diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10856 @@ +{ + "best_metric": 0.812837658080022, + "best_model_checkpoint": "/data/hungnm/unisentiment/modernBERT-large-sentiment/checkpoint-4611", + "epoch": 5.0, + "eval_steps": 500, + "global_step": 7685, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032530904359141183, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 2.3733, + "step": 5 + }, + { + "epoch": 0.006506180871828237, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 2.3771, + "step": 10 + }, + { + "epoch": 0.009759271307742356, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 2.3846, + "step": 15 + }, + { + "epoch": 0.013012361743656473, + "grad_norm": 6.415463447570801, + "learning_rate": 1.2987012987012988e-06, + "loss": 2.3719, + "step": 20 + }, + { + "epoch": 0.01626545217957059, + "grad_norm": 6.333994388580322, + "learning_rate": 4.5454545454545455e-06, + "loss": 2.3437, + "step": 25 + }, + { + "epoch": 0.01951854261548471, + "grad_norm": 8.240402221679688, + "learning_rate": 7.792207792207792e-06, + "loss": 2.2279, + "step": 30 + }, + { + "epoch": 0.02277163305139883, + "grad_norm": 3.9838759899139404, + "learning_rate": 1.103896103896104e-05, + "loss": 2.079, + "step": 35 + }, + { + "epoch": 0.026024723487312947, + "grad_norm": 4.472020626068115, + "learning_rate": 1.4285714285714285e-05, + "loss": 1.997, + "step": 40 + }, + { + "epoch": 0.029277813923227064, + "grad_norm": 3.7005715370178223, + "learning_rate": 1.7532467532467535e-05, + "loss": 1.9341, + "step": 45 + }, + { + "epoch": 0.03253090435914118, + "grad_norm": 13.776549339294434, + "learning_rate": 2.012987012987013e-05, + "loss": 1.8824, + "step": 50 + }, + { + "epoch": 0.035783994795055306, + "grad_norm": 21.78818130493164, + "learning_rate": 2.272727272727273e-05, + "loss": 1.8142, + "step": 55 + }, + { + "epoch": 0.03903708523096942, + "grad_norm": 5.216381549835205, + "learning_rate": 2.5324675324675325e-05, + "loss": 1.7531, + "step": 60 + }, + { + "epoch": 0.04229017566688354, + "grad_norm": 1.8160972595214844, + "learning_rate": 2.857142857142857e-05, + "loss": 1.7342, + "step": 65 + }, + { + "epoch": 0.04554326610279766, + "grad_norm": 6.6145339012146, + "learning_rate": 3.181818181818182e-05, + "loss": 1.719, + "step": 70 + }, + { + "epoch": 0.048796356538711776, + "grad_norm": 2.8823325634002686, + "learning_rate": 3.506493506493507e-05, + "loss": 1.6266, + "step": 75 + }, + { + "epoch": 0.05204944697462589, + "grad_norm": 7.7064433097839355, + "learning_rate": 3.831168831168831e-05, + "loss": 1.5322, + "step": 80 + }, + { + "epoch": 0.05530253741054001, + "grad_norm": 2.749300241470337, + "learning_rate": 4.155844155844156e-05, + "loss": 1.5303, + "step": 85 + }, + { + "epoch": 0.05855562784645413, + "grad_norm": 2.898153305053711, + "learning_rate": 4.4805194805194805e-05, + "loss": 1.4395, + "step": 90 + }, + { + "epoch": 0.06180871828236825, + "grad_norm": 2.754175901412964, + "learning_rate": 4.8051948051948054e-05, + "loss": 1.3824, + "step": 95 + }, + { + "epoch": 0.06506180871828236, + "grad_norm": 6.261497974395752, + "learning_rate": 4.99999914743261e-05, + "loss": 1.3976, + "step": 100 + }, + { + "epoch": 0.06831489915419649, + "grad_norm": 1.882625699043274, + "learning_rate": 4.9999895560561514e-05, + "loss": 1.3925, + "step": 105 + }, + { + "epoch": 0.07156798959011061, + "grad_norm": 1.972509503364563, + "learning_rate": 4.9999693076350204e-05, + "loss": 1.3423, + "step": 110 + }, + { + "epoch": 0.07482108002602472, + "grad_norm": 3.2671749591827393, + "learning_rate": 4.999938402255531e-05, + "loss": 1.2724, + "step": 115 + }, + { + "epoch": 0.07807417046193885, + "grad_norm": 2.316319465637207, + "learning_rate": 4.9998968400494294e-05, + "loss": 1.2621, + "step": 120 + }, + { + "epoch": 0.08132726089785296, + "grad_norm": 2.291067361831665, + "learning_rate": 4.9998446211938876e-05, + "loss": 1.2517, + "step": 125 + }, + { + "epoch": 0.08458035133376708, + "grad_norm": 2.1083593368530273, + "learning_rate": 4.999781745911506e-05, + "loss": 1.2589, + "step": 130 + }, + { + "epoch": 0.08783344176968119, + "grad_norm": 2.5555965900421143, + "learning_rate": 4.9997082144703124e-05, + "loss": 1.2316, + "step": 135 + }, + { + "epoch": 0.09108653220559532, + "grad_norm": 1.8004491329193115, + "learning_rate": 4.999624027183758e-05, + "loss": 1.2197, + "step": 140 + }, + { + "epoch": 0.09433962264150944, + "grad_norm": 1.7682017087936401, + "learning_rate": 4.999529184410721e-05, + "loss": 1.1816, + "step": 145 + }, + { + "epoch": 0.09759271307742355, + "grad_norm": 2.857111692428589, + "learning_rate": 4.999423686555498e-05, + "loss": 1.1797, + "step": 150 + }, + { + "epoch": 0.10084580351333768, + "grad_norm": 3.4952378273010254, + "learning_rate": 4.999307534067812e-05, + "loss": 1.1883, + "step": 155 + }, + { + "epoch": 0.10409889394925179, + "grad_norm": 2.0805113315582275, + "learning_rate": 4.999180727442799e-05, + "loss": 1.2026, + "step": 160 + }, + { + "epoch": 0.10735198438516591, + "grad_norm": 2.541172504425049, + "learning_rate": 4.9990432672210174e-05, + "loss": 1.1911, + "step": 165 + }, + { + "epoch": 0.11060507482108002, + "grad_norm": 2.9252259731292725, + "learning_rate": 4.9988951539884365e-05, + "loss": 1.2091, + "step": 170 + }, + { + "epoch": 0.11385816525699415, + "grad_norm": 1.487844705581665, + "learning_rate": 4.9987363883764396e-05, + "loss": 1.1773, + "step": 175 + }, + { + "epoch": 0.11711125569290826, + "grad_norm": 1.4860082864761353, + "learning_rate": 4.9985669710618156e-05, + "loss": 1.1507, + "step": 180 + }, + { + "epoch": 0.12036434612882238, + "grad_norm": 1.6352834701538086, + "learning_rate": 4.9983869027667656e-05, + "loss": 1.111, + "step": 185 + }, + { + "epoch": 0.1236174365647365, + "grad_norm": 1.2237991094589233, + "learning_rate": 4.99819618425889e-05, + "loss": 1.1241, + "step": 190 + }, + { + "epoch": 0.12687052700065063, + "grad_norm": 1.7817842960357666, + "learning_rate": 4.997994816351191e-05, + "loss": 1.1474, + "step": 195 + }, + { + "epoch": 0.13012361743656473, + "grad_norm": 1.440572738647461, + "learning_rate": 4.997782799902065e-05, + "loss": 1.1401, + "step": 200 + }, + { + "epoch": 0.13337670787247885, + "grad_norm": 1.1736341714859009, + "learning_rate": 4.997560135815307e-05, + "loss": 1.1317, + "step": 205 + }, + { + "epoch": 0.13662979830839297, + "grad_norm": 1.5103353261947632, + "learning_rate": 4.997326825040094e-05, + "loss": 1.1289, + "step": 210 + }, + { + "epoch": 0.1398828887443071, + "grad_norm": 2.190202236175537, + "learning_rate": 4.997082868570993e-05, + "loss": 1.1173, + "step": 215 + }, + { + "epoch": 0.14313597918022122, + "grad_norm": 1.1619236469268799, + "learning_rate": 4.9968282674479486e-05, + "loss": 1.1033, + "step": 220 + }, + { + "epoch": 0.14638906961613532, + "grad_norm": 1.4644910097122192, + "learning_rate": 4.9965630227562866e-05, + "loss": 1.0935, + "step": 225 + }, + { + "epoch": 0.14964216005204944, + "grad_norm": 1.8269380331039429, + "learning_rate": 4.9962871356266994e-05, + "loss": 1.0968, + "step": 230 + }, + { + "epoch": 0.15289525048796357, + "grad_norm": 1.0320109128952026, + "learning_rate": 4.996000607235248e-05, + "loss": 1.0718, + "step": 235 + }, + { + "epoch": 0.1561483409238777, + "grad_norm": 2.302537202835083, + "learning_rate": 4.995703438803359e-05, + "loss": 1.1044, + "step": 240 + }, + { + "epoch": 0.1594014313597918, + "grad_norm": 2.2435052394866943, + "learning_rate": 4.995395631597809e-05, + "loss": 1.0936, + "step": 245 + }, + { + "epoch": 0.16265452179570591, + "grad_norm": 2.0816986560821533, + "learning_rate": 4.995077186930731e-05, + "loss": 1.1036, + "step": 250 + }, + { + "epoch": 0.16590761223162004, + "grad_norm": 1.0913221836090088, + "learning_rate": 4.994748106159602e-05, + "loss": 1.0906, + "step": 255 + }, + { + "epoch": 0.16916070266753416, + "grad_norm": 1.5220658779144287, + "learning_rate": 4.9944083906872405e-05, + "loss": 1.0789, + "step": 260 + }, + { + "epoch": 0.1724137931034483, + "grad_norm": 2.121184825897217, + "learning_rate": 4.994058041961796e-05, + "loss": 1.0718, + "step": 265 + }, + { + "epoch": 0.17566688353936238, + "grad_norm": 1.4716627597808838, + "learning_rate": 4.9936970614767485e-05, + "loss": 1.0378, + "step": 270 + }, + { + "epoch": 0.1789199739752765, + "grad_norm": 1.700437068939209, + "learning_rate": 4.993325450770898e-05, + "loss": 1.0586, + "step": 275 + }, + { + "epoch": 0.18217306441119063, + "grad_norm": 1.87433660030365, + "learning_rate": 4.9929432114283614e-05, + "loss": 1.0486, + "step": 280 + }, + { + "epoch": 0.18542615484710476, + "grad_norm": 1.4089707136154175, + "learning_rate": 4.992550345078559e-05, + "loss": 1.0547, + "step": 285 + }, + { + "epoch": 0.18867924528301888, + "grad_norm": 1.6576772928237915, + "learning_rate": 4.992146853396219e-05, + "loss": 1.033, + "step": 290 + }, + { + "epoch": 0.19193233571893298, + "grad_norm": 1.2037906646728516, + "learning_rate": 4.9917327381013585e-05, + "loss": 1.0335, + "step": 295 + }, + { + "epoch": 0.1951854261548471, + "grad_norm": 1.2297475337982178, + "learning_rate": 4.9913080009592824e-05, + "loss": 1.0508, + "step": 300 + }, + { + "epoch": 0.19843851659076123, + "grad_norm": 1.6239577531814575, + "learning_rate": 4.990872643780577e-05, + "loss": 1.0528, + "step": 305 + }, + { + "epoch": 0.20169160702667535, + "grad_norm": 1.4611440896987915, + "learning_rate": 4.9904266684210964e-05, + "loss": 1.0437, + "step": 310 + }, + { + "epoch": 0.20494469746258945, + "grad_norm": 1.5891550779342651, + "learning_rate": 4.989970076781961e-05, + "loss": 1.0262, + "step": 315 + }, + { + "epoch": 0.20819778789850357, + "grad_norm": 1.6164276599884033, + "learning_rate": 4.9895028708095474e-05, + "loss": 1.0388, + "step": 320 + }, + { + "epoch": 0.2114508783344177, + "grad_norm": 1.509124517440796, + "learning_rate": 4.989025052495476e-05, + "loss": 1.0332, + "step": 325 + }, + { + "epoch": 0.21470396877033182, + "grad_norm": 1.2852742671966553, + "learning_rate": 4.988536623876609e-05, + "loss": 1.0306, + "step": 330 + }, + { + "epoch": 0.21795705920624595, + "grad_norm": 1.3699005842208862, + "learning_rate": 4.988037587035036e-05, + "loss": 1.0266, + "step": 335 + }, + { + "epoch": 0.22121014964216004, + "grad_norm": 1.5352535247802734, + "learning_rate": 4.98752794409807e-05, + "loss": 1.0281, + "step": 340 + }, + { + "epoch": 0.22446324007807417, + "grad_norm": 1.1084707975387573, + "learning_rate": 4.9870076972382354e-05, + "loss": 0.9941, + "step": 345 + }, + { + "epoch": 0.2277163305139883, + "grad_norm": 2.580686330795288, + "learning_rate": 4.9864768486732585e-05, + "loss": 1.022, + "step": 350 + }, + { + "epoch": 0.23096942094990242, + "grad_norm": 22.110929489135742, + "learning_rate": 4.98593540066606e-05, + "loss": 1.0305, + "step": 355 + }, + { + "epoch": 0.2342225113858165, + "grad_norm": 1.32615065574646, + "learning_rate": 4.985383355524743e-05, + "loss": 1.0218, + "step": 360 + }, + { + "epoch": 0.23747560182173064, + "grad_norm": 1.7118041515350342, + "learning_rate": 4.984820715602585e-05, + "loss": 1.0216, + "step": 365 + }, + { + "epoch": 0.24072869225764476, + "grad_norm": 0.9846453070640564, + "learning_rate": 4.984247483298029e-05, + "loss": 1.0244, + "step": 370 + }, + { + "epoch": 0.24398178269355889, + "grad_norm": 1.1930924654006958, + "learning_rate": 4.9836636610546697e-05, + "loss": 1.0333, + "step": 375 + }, + { + "epoch": 0.247234873129473, + "grad_norm": 1.5189563035964966, + "learning_rate": 4.983069251361244e-05, + "loss": 1.0261, + "step": 380 + }, + { + "epoch": 0.2504879635653871, + "grad_norm": 1.648707389831543, + "learning_rate": 4.982464256751624e-05, + "loss": 1.032, + "step": 385 + }, + { + "epoch": 0.25374105400130126, + "grad_norm": 1.6429810523986816, + "learning_rate": 4.981848679804803e-05, + "loss": 1.0325, + "step": 390 + }, + { + "epoch": 0.25699414443721535, + "grad_norm": 1.6516709327697754, + "learning_rate": 4.981222523144882e-05, + "loss": 1.0036, + "step": 395 + }, + { + "epoch": 0.26024723487312945, + "grad_norm": 1.458806037902832, + "learning_rate": 4.980585789441066e-05, + "loss": 1.0104, + "step": 400 + }, + { + "epoch": 0.2635003253090436, + "grad_norm": 1.695860505104065, + "learning_rate": 4.979938481407645e-05, + "loss": 1.0072, + "step": 405 + }, + { + "epoch": 0.2667534157449577, + "grad_norm": 1.897962212562561, + "learning_rate": 4.9792806018039876e-05, + "loss": 1.0102, + "step": 410 + }, + { + "epoch": 0.27000650618087185, + "grad_norm": 1.5890872478485107, + "learning_rate": 4.9786121534345265e-05, + "loss": 0.9945, + "step": 415 + }, + { + "epoch": 0.27325959661678595, + "grad_norm": 1.090980887413025, + "learning_rate": 4.977933139148746e-05, + "loss": 0.9897, + "step": 420 + }, + { + "epoch": 0.27651268705270005, + "grad_norm": 1.3130533695220947, + "learning_rate": 4.977243561841174e-05, + "loss": 1.0161, + "step": 425 + }, + { + "epoch": 0.2797657774886142, + "grad_norm": 1.4305720329284668, + "learning_rate": 4.976543424451365e-05, + "loss": 0.9802, + "step": 430 + }, + { + "epoch": 0.2830188679245283, + "grad_norm": 1.3603191375732422, + "learning_rate": 4.975832729963888e-05, + "loss": 0.9904, + "step": 435 + }, + { + "epoch": 0.28627195836044245, + "grad_norm": 4.940343856811523, + "learning_rate": 4.9751114814083186e-05, + "loss": 0.9873, + "step": 440 + }, + { + "epoch": 0.28952504879635654, + "grad_norm": 2.8304941654205322, + "learning_rate": 4.974379681859221e-05, + "loss": 0.9855, + "step": 445 + }, + { + "epoch": 0.29277813923227064, + "grad_norm": 1.6684479713439941, + "learning_rate": 4.973637334436135e-05, + "loss": 1.0027, + "step": 450 + }, + { + "epoch": 0.2960312296681848, + "grad_norm": 1.036837100982666, + "learning_rate": 4.972884442303566e-05, + "loss": 0.999, + "step": 455 + }, + { + "epoch": 0.2992843201040989, + "grad_norm": 1.512636423110962, + "learning_rate": 4.972121008670971e-05, + "loss": 0.9988, + "step": 460 + }, + { + "epoch": 0.302537410540013, + "grad_norm": 1.7532439231872559, + "learning_rate": 4.97134703679274e-05, + "loss": 0.984, + "step": 465 + }, + { + "epoch": 0.30579050097592714, + "grad_norm": 1.0935814380645752, + "learning_rate": 4.970562529968189e-05, + "loss": 0.9725, + "step": 470 + }, + { + "epoch": 0.30904359141184123, + "grad_norm": 0.9008370041847229, + "learning_rate": 4.969767491541543e-05, + "loss": 0.9788, + "step": 475 + }, + { + "epoch": 0.3122966818477554, + "grad_norm": 0.9959591627120972, + "learning_rate": 4.9689619249019174e-05, + "loss": 0.9821, + "step": 480 + }, + { + "epoch": 0.3155497722836695, + "grad_norm": 1.1844329833984375, + "learning_rate": 4.9681458334833114e-05, + "loss": 0.9588, + "step": 485 + }, + { + "epoch": 0.3188028627195836, + "grad_norm": 1.136703372001648, + "learning_rate": 4.9673192207645894e-05, + "loss": 0.9948, + "step": 490 + }, + { + "epoch": 0.32205595315549773, + "grad_norm": 1.3169392347335815, + "learning_rate": 4.9664820902694654e-05, + "loss": 0.9924, + "step": 495 + }, + { + "epoch": 0.32530904359141183, + "grad_norm": 0.863590657711029, + "learning_rate": 4.9656344455664885e-05, + "loss": 0.9875, + "step": 500 + }, + { + "epoch": 0.328562134027326, + "grad_norm": 1.396112084388733, + "learning_rate": 4.9647762902690295e-05, + "loss": 0.9765, + "step": 505 + }, + { + "epoch": 0.3318152244632401, + "grad_norm": 2.4344260692596436, + "learning_rate": 4.963907628035264e-05, + "loss": 0.9848, + "step": 510 + }, + { + "epoch": 0.3350683148991542, + "grad_norm": 2.1281023025512695, + "learning_rate": 4.963028462568154e-05, + "loss": 0.9926, + "step": 515 + }, + { + "epoch": 0.3383214053350683, + "grad_norm": 1.2993323802947998, + "learning_rate": 4.9621387976154396e-05, + "loss": 0.9782, + "step": 520 + }, + { + "epoch": 0.3415744957709824, + "grad_norm": 1.3118032217025757, + "learning_rate": 4.961238636969616e-05, + "loss": 0.9885, + "step": 525 + }, + { + "epoch": 0.3448275862068966, + "grad_norm": 3.1309878826141357, + "learning_rate": 4.960327984467919e-05, + "loss": 0.9786, + "step": 530 + }, + { + "epoch": 0.34808067664281067, + "grad_norm": 0.9523271918296814, + "learning_rate": 4.9594068439923115e-05, + "loss": 0.9742, + "step": 535 + }, + { + "epoch": 0.35133376707872477, + "grad_norm": 1.010175347328186, + "learning_rate": 4.958475219469464e-05, + "loss": 0.9569, + "step": 540 + }, + { + "epoch": 0.3545868575146389, + "grad_norm": 0.8311709761619568, + "learning_rate": 4.9575331148707385e-05, + "loss": 0.9638, + "step": 545 + }, + { + "epoch": 0.357839947950553, + "grad_norm": 2.4854001998901367, + "learning_rate": 4.9565805342121716e-05, + "loss": 0.9537, + "step": 550 + }, + { + "epoch": 0.36109303838646717, + "grad_norm": 11.345256805419922, + "learning_rate": 4.955617481554459e-05, + "loss": 0.9738, + "step": 555 + }, + { + "epoch": 0.36434612882238127, + "grad_norm": 1.5204081535339355, + "learning_rate": 4.954643961002936e-05, + "loss": 0.9907, + "step": 560 + }, + { + "epoch": 0.36759921925829536, + "grad_norm": 1.0884438753128052, + "learning_rate": 4.95365997670756e-05, + "loss": 0.9653, + "step": 565 + }, + { + "epoch": 0.3708523096942095, + "grad_norm": 0.7718302011489868, + "learning_rate": 4.952665532862895e-05, + "loss": 0.9573, + "step": 570 + }, + { + "epoch": 0.3741054001301236, + "grad_norm": 1.0595511198043823, + "learning_rate": 4.9516606337080904e-05, + "loss": 0.9535, + "step": 575 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 0.9665298461914062, + "learning_rate": 4.950645283526868e-05, + "loss": 0.9614, + "step": 580 + }, + { + "epoch": 0.38061158100195186, + "grad_norm": 2.5156960487365723, + "learning_rate": 4.949619486647497e-05, + "loss": 0.9699, + "step": 585 + }, + { + "epoch": 0.38386467143786596, + "grad_norm": 0.983572244644165, + "learning_rate": 4.948583247442783e-05, + "loss": 0.9419, + "step": 590 + }, + { + "epoch": 0.3871177618737801, + "grad_norm": 1.5895793437957764, + "learning_rate": 4.9475365703300416e-05, + "loss": 0.9431, + "step": 595 + }, + { + "epoch": 0.3903708523096942, + "grad_norm": 0.7313794493675232, + "learning_rate": 4.9464794597710864e-05, + "loss": 0.9517, + "step": 600 + }, + { + "epoch": 0.3936239427456083, + "grad_norm": 0.8579323887825012, + "learning_rate": 4.945411920272205e-05, + "loss": 0.9623, + "step": 605 + }, + { + "epoch": 0.39687703318152245, + "grad_norm": 1.7319488525390625, + "learning_rate": 4.944333956384144e-05, + "loss": 0.9652, + "step": 610 + }, + { + "epoch": 0.40013012361743655, + "grad_norm": 0.8903234601020813, + "learning_rate": 4.943245572702086e-05, + "loss": 0.9601, + "step": 615 + }, + { + "epoch": 0.4033832140533507, + "grad_norm": 0.8647845387458801, + "learning_rate": 4.942146773865631e-05, + "loss": 0.9503, + "step": 620 + }, + { + "epoch": 0.4066363044892648, + "grad_norm": 1.0848610401153564, + "learning_rate": 4.941037564558779e-05, + "loss": 0.9482, + "step": 625 + }, + { + "epoch": 0.4098893949251789, + "grad_norm": 1.0989733934402466, + "learning_rate": 4.939917949509907e-05, + "loss": 0.9418, + "step": 630 + }, + { + "epoch": 0.41314248536109305, + "grad_norm": 0.8608380556106567, + "learning_rate": 4.938787933491749e-05, + "loss": 0.943, + "step": 635 + }, + { + "epoch": 0.41639557579700714, + "grad_norm": 2.864649772644043, + "learning_rate": 4.937647521321378e-05, + "loss": 0.948, + "step": 640 + }, + { + "epoch": 0.4196486662329213, + "grad_norm": 0.8896744847297668, + "learning_rate": 4.936496717860184e-05, + "loss": 0.9573, + "step": 645 + }, + { + "epoch": 0.4229017566688354, + "grad_norm": 1.1625382900238037, + "learning_rate": 4.935335528013853e-05, + "loss": 0.9497, + "step": 650 + }, + { + "epoch": 0.4261548471047495, + "grad_norm": 0.8717337846755981, + "learning_rate": 4.934163956732345e-05, + "loss": 0.9442, + "step": 655 + }, + { + "epoch": 0.42940793754066364, + "grad_norm": 0.9472817778587341, + "learning_rate": 4.932982009009879e-05, + "loss": 0.941, + "step": 660 + }, + { + "epoch": 0.43266102797657774, + "grad_norm": 1.1837284564971924, + "learning_rate": 4.931789689884901e-05, + "loss": 0.9521, + "step": 665 + }, + { + "epoch": 0.4359141184124919, + "grad_norm": 0.7896617650985718, + "learning_rate": 4.9305870044400725e-05, + "loss": 0.9407, + "step": 670 + }, + { + "epoch": 0.439167208848406, + "grad_norm": 1.0529261827468872, + "learning_rate": 4.9293739578022444e-05, + "loss": 0.9483, + "step": 675 + }, + { + "epoch": 0.4424202992843201, + "grad_norm": 1.018526315689087, + "learning_rate": 4.928150555142436e-05, + "loss": 0.9474, + "step": 680 + }, + { + "epoch": 0.44567338972023424, + "grad_norm": 1.2944765090942383, + "learning_rate": 4.926916801675809e-05, + "loss": 0.9548, + "step": 685 + }, + { + "epoch": 0.44892648015614833, + "grad_norm": 1.1999421119689941, + "learning_rate": 4.925672702661653e-05, + "loss": 0.9313, + "step": 690 + }, + { + "epoch": 0.4521795705920625, + "grad_norm": 1.117092490196228, + "learning_rate": 4.92441826340336e-05, + "loss": 0.9212, + "step": 695 + }, + { + "epoch": 0.4554326610279766, + "grad_norm": 0.976388692855835, + "learning_rate": 4.923153489248395e-05, + "loss": 0.9258, + "step": 700 + }, + { + "epoch": 0.4586857514638907, + "grad_norm": 1.206737756729126, + "learning_rate": 4.921878385588284e-05, + "loss": 0.897, + "step": 705 + }, + { + "epoch": 0.46193884189980483, + "grad_norm": 0.9215301871299744, + "learning_rate": 4.920592957858584e-05, + "loss": 0.9646, + "step": 710 + }, + { + "epoch": 0.4651919323357189, + "grad_norm": 2.4111385345458984, + "learning_rate": 4.9192972115388634e-05, + "loss": 0.9249, + "step": 715 + }, + { + "epoch": 0.468445022771633, + "grad_norm": 1.1773241758346558, + "learning_rate": 4.9179911521526734e-05, + "loss": 0.9289, + "step": 720 + }, + { + "epoch": 0.4716981132075472, + "grad_norm": 1.2238733768463135, + "learning_rate": 4.9166747852675325e-05, + "loss": 0.9672, + "step": 725 + }, + { + "epoch": 0.4749512036434613, + "grad_norm": 0.8060822486877441, + "learning_rate": 4.9153481164948964e-05, + "loss": 0.9571, + "step": 730 + }, + { + "epoch": 0.4782042940793754, + "grad_norm": 1.0834110975265503, + "learning_rate": 4.914011151490135e-05, + "loss": 0.925, + "step": 735 + }, + { + "epoch": 0.4814573845152895, + "grad_norm": 2.0932958126068115, + "learning_rate": 4.912663895952511e-05, + "loss": 0.9022, + "step": 740 + }, + { + "epoch": 0.4847104749512036, + "grad_norm": 0.9362598061561584, + "learning_rate": 4.911306355625154e-05, + "loss": 0.9569, + "step": 745 + }, + { + "epoch": 0.48796356538711777, + "grad_norm": 0.8350614905357361, + "learning_rate": 4.909938536295034e-05, + "loss": 0.9283, + "step": 750 + }, + { + "epoch": 0.49121665582303187, + "grad_norm": 0.7186315059661865, + "learning_rate": 4.908560443792941e-05, + "loss": 0.916, + "step": 755 + }, + { + "epoch": 0.494469746258946, + "grad_norm": 0.8941408395767212, + "learning_rate": 4.907172083993457e-05, + "loss": 0.9132, + "step": 760 + }, + { + "epoch": 0.4977228366948601, + "grad_norm": 0.6881601214408875, + "learning_rate": 4.9057734628149296e-05, + "loss": 0.9401, + "step": 765 + }, + { + "epoch": 0.5009759271307742, + "grad_norm": 0.7907692790031433, + "learning_rate": 4.904364586219454e-05, + "loss": 0.9453, + "step": 770 + }, + { + "epoch": 0.5042290175666884, + "grad_norm": 0.6884419322013855, + "learning_rate": 4.902945460212839e-05, + "loss": 0.9213, + "step": 775 + }, + { + "epoch": 0.5074821080026025, + "grad_norm": 0.9783698916435242, + "learning_rate": 4.9015160908445846e-05, + "loss": 0.9324, + "step": 780 + }, + { + "epoch": 0.5107351984385166, + "grad_norm": 0.670250415802002, + "learning_rate": 4.900076484207857e-05, + "loss": 0.9186, + "step": 785 + }, + { + "epoch": 0.5139882888744307, + "grad_norm": 0.6589803695678711, + "learning_rate": 4.8986266464394645e-05, + "loss": 0.9245, + "step": 790 + }, + { + "epoch": 0.5172413793103449, + "grad_norm": 0.8181180953979492, + "learning_rate": 4.8971665837198266e-05, + "loss": 0.9117, + "step": 795 + }, + { + "epoch": 0.5204944697462589, + "grad_norm": 0.9122630953788757, + "learning_rate": 4.89569630227295e-05, + "loss": 0.916, + "step": 800 + }, + { + "epoch": 0.523747560182173, + "grad_norm": 1.7749128341674805, + "learning_rate": 4.894215808366404e-05, + "loss": 0.927, + "step": 805 + }, + { + "epoch": 0.5270006506180872, + "grad_norm": 0.708722710609436, + "learning_rate": 4.892725108311289e-05, + "loss": 0.9378, + "step": 810 + }, + { + "epoch": 0.5302537410540012, + "grad_norm": 0.7337002754211426, + "learning_rate": 4.891224208462217e-05, + "loss": 0.9042, + "step": 815 + }, + { + "epoch": 0.5335068314899154, + "grad_norm": 0.6495442986488342, + "learning_rate": 4.889713115217276e-05, + "loss": 0.9271, + "step": 820 + }, + { + "epoch": 0.5367599219258296, + "grad_norm": 0.6206863522529602, + "learning_rate": 4.8881918350180076e-05, + "loss": 0.9292, + "step": 825 + }, + { + "epoch": 0.5400130123617437, + "grad_norm": 1.1107088327407837, + "learning_rate": 4.886660374349381e-05, + "loss": 0.9002, + "step": 830 + }, + { + "epoch": 0.5432661027976577, + "grad_norm": 0.7245462536811829, + "learning_rate": 4.885118739739759e-05, + "loss": 0.898, + "step": 835 + }, + { + "epoch": 0.5465191932335719, + "grad_norm": 0.7149853110313416, + "learning_rate": 4.883566937760879e-05, + "loss": 0.9105, + "step": 840 + }, + { + "epoch": 0.549772283669486, + "grad_norm": 0.8408038020133972, + "learning_rate": 4.882004975027816e-05, + "loss": 0.9059, + "step": 845 + }, + { + "epoch": 0.5530253741054001, + "grad_norm": 0.8386579155921936, + "learning_rate": 4.880432858198962e-05, + "loss": 0.9148, + "step": 850 + }, + { + "epoch": 0.5562784645413142, + "grad_norm": 0.6639278531074524, + "learning_rate": 4.878850593975992e-05, + "loss": 0.8995, + "step": 855 + }, + { + "epoch": 0.5595315549772284, + "grad_norm": 0.8974155187606812, + "learning_rate": 4.8772581891038385e-05, + "loss": 0.9406, + "step": 860 + }, + { + "epoch": 0.5627846454131424, + "grad_norm": 1.0255565643310547, + "learning_rate": 4.875655650370662e-05, + "loss": 0.9067, + "step": 865 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 1.0870089530944824, + "learning_rate": 4.87404298460782e-05, + "loss": 0.9239, + "step": 870 + }, + { + "epoch": 0.5692908262849707, + "grad_norm": 0.7773581743240356, + "learning_rate": 4.872420198689845e-05, + "loss": 0.9284, + "step": 875 + }, + { + "epoch": 0.5725439167208849, + "grad_norm": 0.6731524467468262, + "learning_rate": 4.870787299534404e-05, + "loss": 0.9122, + "step": 880 + }, + { + "epoch": 0.5757970071567989, + "grad_norm": 0.8352004885673523, + "learning_rate": 4.869144294102279e-05, + "loss": 0.906, + "step": 885 + }, + { + "epoch": 0.5790500975927131, + "grad_norm": 0.7254254817962646, + "learning_rate": 4.8674911893973305e-05, + "loss": 0.899, + "step": 890 + }, + { + "epoch": 0.5823031880286272, + "grad_norm": 0.6976432204246521, + "learning_rate": 4.8658279924664754e-05, + "loss": 0.9104, + "step": 895 + }, + { + "epoch": 0.5855562784645413, + "grad_norm": 0.7411409616470337, + "learning_rate": 4.8641547103996456e-05, + "loss": 0.9216, + "step": 900 + }, + { + "epoch": 0.5888093689004554, + "grad_norm": 0.8754427433013916, + "learning_rate": 4.862471350329769e-05, + "loss": 0.9156, + "step": 905 + }, + { + "epoch": 0.5920624593363696, + "grad_norm": 0.7311625480651855, + "learning_rate": 4.8607779194327344e-05, + "loss": 0.9125, + "step": 910 + }, + { + "epoch": 0.5953155497722836, + "grad_norm": 0.7393909096717834, + "learning_rate": 4.8590744249273566e-05, + "loss": 0.9011, + "step": 915 + }, + { + "epoch": 0.5985686402081978, + "grad_norm": 0.7025457620620728, + "learning_rate": 4.857360874075355e-05, + "loss": 0.9198, + "step": 920 + }, + { + "epoch": 0.6018217306441119, + "grad_norm": 0.7867306470870972, + "learning_rate": 4.855637274181314e-05, + "loss": 0.8803, + "step": 925 + }, + { + "epoch": 0.605074821080026, + "grad_norm": 0.6452615857124329, + "learning_rate": 4.853903632592657e-05, + "loss": 0.9104, + "step": 930 + }, + { + "epoch": 0.6083279115159401, + "grad_norm": 0.8884091377258301, + "learning_rate": 4.852159956699614e-05, + "loss": 0.9119, + "step": 935 + }, + { + "epoch": 0.6115810019518543, + "grad_norm": 0.5305378437042236, + "learning_rate": 4.850406253935188e-05, + "loss": 0.9296, + "step": 940 + }, + { + "epoch": 0.6148340923877684, + "grad_norm": 3.5500142574310303, + "learning_rate": 4.848642531775126e-05, + "loss": 0.8996, + "step": 945 + }, + { + "epoch": 0.6180871828236825, + "grad_norm": 0.7085503339767456, + "learning_rate": 4.846868797737886e-05, + "loss": 0.9045, + "step": 950 + }, + { + "epoch": 0.6213402732595966, + "grad_norm": 0.9540202021598816, + "learning_rate": 4.8450850593846035e-05, + "loss": 0.9041, + "step": 955 + }, + { + "epoch": 0.6245933636955108, + "grad_norm": 0.7558302879333496, + "learning_rate": 4.843291324319064e-05, + "loss": 0.8998, + "step": 960 + }, + { + "epoch": 0.6278464541314248, + "grad_norm": 1.2235993146896362, + "learning_rate": 4.8414876001876636e-05, + "loss": 0.8977, + "step": 965 + }, + { + "epoch": 0.631099544567339, + "grad_norm": 0.9432989954948425, + "learning_rate": 4.839673894679383e-05, + "loss": 0.9013, + "step": 970 + }, + { + "epoch": 0.6343526350032531, + "grad_norm": 0.8516787886619568, + "learning_rate": 4.83785021552575e-05, + "loss": 0.9195, + "step": 975 + }, + { + "epoch": 0.6376057254391672, + "grad_norm": 0.9901612401008606, + "learning_rate": 4.836016570500809e-05, + "loss": 0.8917, + "step": 980 + }, + { + "epoch": 0.6408588158750813, + "grad_norm": 0.7319976687431335, + "learning_rate": 4.834172967421088e-05, + "loss": 0.8961, + "step": 985 + }, + { + "epoch": 0.6441119063109955, + "grad_norm": 0.8301442265510559, + "learning_rate": 4.832319414145565e-05, + "loss": 0.9025, + "step": 990 + }, + { + "epoch": 0.6473649967469096, + "grad_norm": 0.7469923496246338, + "learning_rate": 4.8304559185756303e-05, + "loss": 0.8908, + "step": 995 + }, + { + "epoch": 0.6506180871828237, + "grad_norm": 0.7159551382064819, + "learning_rate": 4.828582488655062e-05, + "loss": 0.8948, + "step": 1000 + }, + { + "epoch": 0.6538711776187378, + "grad_norm": 1.0059654712677002, + "learning_rate": 4.826699132369983e-05, + "loss": 0.9101, + "step": 1005 + }, + { + "epoch": 0.657124268054652, + "grad_norm": 0.9471872448921204, + "learning_rate": 4.824805857748831e-05, + "loss": 0.9172, + "step": 1010 + }, + { + "epoch": 0.660377358490566, + "grad_norm": 0.7876858711242676, + "learning_rate": 4.822902672862325e-05, + "loss": 0.8933, + "step": 1015 + }, + { + "epoch": 0.6636304489264802, + "grad_norm": 0.7322363257408142, + "learning_rate": 4.82098958582343e-05, + "loss": 0.897, + "step": 1020 + }, + { + "epoch": 0.6668835393623943, + "grad_norm": 0.7154924273490906, + "learning_rate": 4.819066604787321e-05, + "loss": 0.8931, + "step": 1025 + }, + { + "epoch": 0.6701366297983083, + "grad_norm": 0.759896457195282, + "learning_rate": 4.817133737951352e-05, + "loss": 0.894, + "step": 1030 + }, + { + "epoch": 0.6733897202342225, + "grad_norm": 0.9226410984992981, + "learning_rate": 4.815190993555013e-05, + "loss": 0.884, + "step": 1035 + }, + { + "epoch": 0.6766428106701367, + "grad_norm": 0.7603817582130432, + "learning_rate": 4.8132383798799077e-05, + "loss": 0.901, + "step": 1040 + }, + { + "epoch": 0.6798959011060507, + "grad_norm": 0.5785139203071594, + "learning_rate": 4.811275905249705e-05, + "loss": 0.9105, + "step": 1045 + }, + { + "epoch": 0.6831489915419648, + "grad_norm": 0.9068583846092224, + "learning_rate": 4.8093035780301135e-05, + "loss": 0.8941, + "step": 1050 + }, + { + "epoch": 0.686402081977879, + "grad_norm": 0.6756225228309631, + "learning_rate": 4.807321406628838e-05, + "loss": 0.9318, + "step": 1055 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.5247710943222046, + "learning_rate": 4.805329399495552e-05, + "loss": 0.8878, + "step": 1060 + }, + { + "epoch": 0.6929082628497072, + "grad_norm": 0.7933164238929749, + "learning_rate": 4.8033275651218525e-05, + "loss": 0.8926, + "step": 1065 + }, + { + "epoch": 0.6961613532856213, + "grad_norm": 0.7385444641113281, + "learning_rate": 4.8013159120412324e-05, + "loss": 0.9179, + "step": 1070 + }, + { + "epoch": 0.6994144437215355, + "grad_norm": 0.5647626519203186, + "learning_rate": 4.7992944488290357e-05, + "loss": 0.8982, + "step": 1075 + }, + { + "epoch": 0.7026675341574495, + "grad_norm": 0.7482002973556519, + "learning_rate": 4.79726318410243e-05, + "loss": 0.8903, + "step": 1080 + }, + { + "epoch": 0.7059206245933637, + "grad_norm": 0.7001275420188904, + "learning_rate": 4.7952221265203626e-05, + "loss": 0.9044, + "step": 1085 + }, + { + "epoch": 0.7091737150292778, + "grad_norm": 0.7466359734535217, + "learning_rate": 4.793171284783525e-05, + "loss": 0.8829, + "step": 1090 + }, + { + "epoch": 0.7124268054651919, + "grad_norm": 0.938048779964447, + "learning_rate": 4.791110667634321e-05, + "loss": 0.9097, + "step": 1095 + }, + { + "epoch": 0.715679895901106, + "grad_norm": 1.033058762550354, + "learning_rate": 4.789040283856822e-05, + "loss": 0.8829, + "step": 1100 + }, + { + "epoch": 0.7189329863370202, + "grad_norm": 0.7184875011444092, + "learning_rate": 4.7869601422767326e-05, + "loss": 0.9007, + "step": 1105 + }, + { + "epoch": 0.7221860767729343, + "grad_norm": 0.6718536615371704, + "learning_rate": 4.784870251761357e-05, + "loss": 0.8909, + "step": 1110 + }, + { + "epoch": 0.7254391672088484, + "grad_norm": 0.854129433631897, + "learning_rate": 4.782770621219552e-05, + "loss": 0.9017, + "step": 1115 + }, + { + "epoch": 0.7286922576447625, + "grad_norm": 0.6812130212783813, + "learning_rate": 4.7806612596017e-05, + "loss": 0.8995, + "step": 1120 + }, + { + "epoch": 0.7319453480806767, + "grad_norm": 0.6341739892959595, + "learning_rate": 4.778542175899662e-05, + "loss": 0.865, + "step": 1125 + }, + { + "epoch": 0.7351984385165907, + "grad_norm": 0.7255128026008606, + "learning_rate": 4.776413379146743e-05, + "loss": 0.8991, + "step": 1130 + }, + { + "epoch": 0.7384515289525049, + "grad_norm": 0.6501480340957642, + "learning_rate": 4.7742748784176554e-05, + "loss": 0.896, + "step": 1135 + }, + { + "epoch": 0.741704619388419, + "grad_norm": 1.5047345161437988, + "learning_rate": 4.7721266828284754e-05, + "loss": 0.9003, + "step": 1140 + }, + { + "epoch": 0.7449577098243331, + "grad_norm": 0.9915900230407715, + "learning_rate": 4.769968801536608e-05, + "loss": 0.887, + "step": 1145 + }, + { + "epoch": 0.7482108002602472, + "grad_norm": 1.0044691562652588, + "learning_rate": 4.767801243740746e-05, + "loss": 0.8908, + "step": 1150 + }, + { + "epoch": 0.7514638906961614, + "grad_norm": 0.5450899004936218, + "learning_rate": 4.765624018680833e-05, + "loss": 0.9114, + "step": 1155 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 0.6787106394767761, + "learning_rate": 4.763437135638021e-05, + "loss": 0.9027, + "step": 1160 + }, + { + "epoch": 0.7579700715679896, + "grad_norm": 0.8962056636810303, + "learning_rate": 4.761240603934633e-05, + "loss": 0.9099, + "step": 1165 + }, + { + "epoch": 0.7612231620039037, + "grad_norm": 0.7145211100578308, + "learning_rate": 4.759034432934123e-05, + "loss": 0.909, + "step": 1170 + }, + { + "epoch": 0.7644762524398179, + "grad_norm": 0.5293973684310913, + "learning_rate": 4.7568186320410356e-05, + "loss": 0.8734, + "step": 1175 + }, + { + "epoch": 0.7677293428757319, + "grad_norm": 0.741165041923523, + "learning_rate": 4.754593210700966e-05, + "loss": 0.866, + "step": 1180 + }, + { + "epoch": 0.7709824333116461, + "grad_norm": 0.7917879819869995, + "learning_rate": 4.7523581784005187e-05, + "loss": 0.8871, + "step": 1185 + }, + { + "epoch": 0.7742355237475602, + "grad_norm": 0.7449229955673218, + "learning_rate": 4.750113544667271e-05, + "loss": 0.8966, + "step": 1190 + }, + { + "epoch": 0.7774886141834743, + "grad_norm": 0.6702096462249756, + "learning_rate": 4.7478593190697254e-05, + "loss": 0.8784, + "step": 1195 + }, + { + "epoch": 0.7807417046193884, + "grad_norm": 0.6181638240814209, + "learning_rate": 4.745595511217277e-05, + "loss": 0.9148, + "step": 1200 + }, + { + "epoch": 0.7839947950553026, + "grad_norm": 0.6590509414672852, + "learning_rate": 4.743322130760166e-05, + "loss": 0.8738, + "step": 1205 + }, + { + "epoch": 0.7872478854912166, + "grad_norm": 0.6860212087631226, + "learning_rate": 4.7410391873894386e-05, + "loss": 0.8986, + "step": 1210 + }, + { + "epoch": 0.7905009759271308, + "grad_norm": 0.8018529415130615, + "learning_rate": 4.73874669083691e-05, + "loss": 0.8894, + "step": 1215 + }, + { + "epoch": 0.7937540663630449, + "grad_norm": 0.49539148807525635, + "learning_rate": 4.736444650875114e-05, + "loss": 0.8812, + "step": 1220 + }, + { + "epoch": 0.7970071567989591, + "grad_norm": 0.5640125870704651, + "learning_rate": 4.7341330773172686e-05, + "loss": 0.8681, + "step": 1225 + }, + { + "epoch": 0.8002602472348731, + "grad_norm": 0.7945250272750854, + "learning_rate": 4.731811980017234e-05, + "loss": 0.8608, + "step": 1230 + }, + { + "epoch": 0.8035133376707873, + "grad_norm": 1.0091153383255005, + "learning_rate": 4.729481368869465e-05, + "loss": 0.8853, + "step": 1235 + }, + { + "epoch": 0.8067664281067014, + "grad_norm": 0.6991880536079407, + "learning_rate": 4.727141253808974e-05, + "loss": 0.8783, + "step": 1240 + }, + { + "epoch": 0.8100195185426154, + "grad_norm": 1.1925946474075317, + "learning_rate": 4.724791644811287e-05, + "loss": 0.8685, + "step": 1245 + }, + { + "epoch": 0.8132726089785296, + "grad_norm": 0.5790075659751892, + "learning_rate": 4.722432551892402e-05, + "loss": 0.8715, + "step": 1250 + }, + { + "epoch": 0.8165256994144438, + "grad_norm": 0.6202079653739929, + "learning_rate": 4.720063985108743e-05, + "loss": 0.8673, + "step": 1255 + }, + { + "epoch": 0.8197787898503578, + "grad_norm": 0.6250977516174316, + "learning_rate": 4.717685954557123e-05, + "loss": 0.8636, + "step": 1260 + }, + { + "epoch": 0.8230318802862719, + "grad_norm": 0.5988998413085938, + "learning_rate": 4.715298470374694e-05, + "loss": 0.8929, + "step": 1265 + }, + { + "epoch": 0.8262849707221861, + "grad_norm": 0.6314589977264404, + "learning_rate": 4.712901542738908e-05, + "loss": 0.8759, + "step": 1270 + }, + { + "epoch": 0.8295380611581002, + "grad_norm": 2.416761875152588, + "learning_rate": 4.7104951818674755e-05, + "loss": 0.8854, + "step": 1275 + }, + { + "epoch": 0.8327911515940143, + "grad_norm": 0.6815042495727539, + "learning_rate": 4.7080793980183165e-05, + "loss": 0.8801, + "step": 1280 + }, + { + "epoch": 0.8360442420299284, + "grad_norm": 0.763461709022522, + "learning_rate": 4.7056542014895204e-05, + "loss": 0.8805, + "step": 1285 + }, + { + "epoch": 0.8392973324658426, + "grad_norm": 0.6846510767936707, + "learning_rate": 4.703219602619302e-05, + "loss": 0.8847, + "step": 1290 + }, + { + "epoch": 0.8425504229017566, + "grad_norm": 0.7311460375785828, + "learning_rate": 4.7007756117859566e-05, + "loss": 0.8802, + "step": 1295 + }, + { + "epoch": 0.8458035133376708, + "grad_norm": 0.6256663203239441, + "learning_rate": 4.698322239407814e-05, + "loss": 0.872, + "step": 1300 + }, + { + "epoch": 0.8490566037735849, + "grad_norm": 0.7773202061653137, + "learning_rate": 4.695859495943199e-05, + "loss": 0.8902, + "step": 1305 + }, + { + "epoch": 0.852309694209499, + "grad_norm": 0.6700133085250854, + "learning_rate": 4.6933873918903816e-05, + "loss": 0.8713, + "step": 1310 + }, + { + "epoch": 0.8555627846454131, + "grad_norm": 1.7503775358200073, + "learning_rate": 4.690905937787536e-05, + "loss": 0.8763, + "step": 1315 + }, + { + "epoch": 0.8588158750813273, + "grad_norm": 0.7265576720237732, + "learning_rate": 4.688415144212692e-05, + "loss": 0.8808, + "step": 1320 + }, + { + "epoch": 0.8620689655172413, + "grad_norm": 0.9674301147460938, + "learning_rate": 4.685915021783694e-05, + "loss": 0.8953, + "step": 1325 + }, + { + "epoch": 0.8653220559531555, + "grad_norm": 0.7183430194854736, + "learning_rate": 4.683405581158153e-05, + "loss": 0.8679, + "step": 1330 + }, + { + "epoch": 0.8685751463890696, + "grad_norm": 0.6251899003982544, + "learning_rate": 4.6808868330334024e-05, + "loss": 0.9087, + "step": 1335 + }, + { + "epoch": 0.8718282368249838, + "grad_norm": 0.6841866970062256, + "learning_rate": 4.67835878814645e-05, + "loss": 0.8935, + "step": 1340 + }, + { + "epoch": 0.8750813272608978, + "grad_norm": 0.7957316637039185, + "learning_rate": 4.675821457273938e-05, + "loss": 0.8677, + "step": 1345 + }, + { + "epoch": 0.878334417696812, + "grad_norm": 0.6032708287239075, + "learning_rate": 4.67327485123209e-05, + "loss": 0.8975, + "step": 1350 + }, + { + "epoch": 0.8815875081327261, + "grad_norm": 0.7431797981262207, + "learning_rate": 4.6707189808766684e-05, + "loss": 0.8919, + "step": 1355 + }, + { + "epoch": 0.8848405985686402, + "grad_norm": 0.6148476004600525, + "learning_rate": 4.6681538571029295e-05, + "loss": 0.8548, + "step": 1360 + }, + { + "epoch": 0.8880936890045543, + "grad_norm": 0.6982467770576477, + "learning_rate": 4.665579490845574e-05, + "loss": 0.8858, + "step": 1365 + }, + { + "epoch": 0.8913467794404685, + "grad_norm": 0.6519801616668701, + "learning_rate": 4.662995893078702e-05, + "loss": 0.8957, + "step": 1370 + }, + { + "epoch": 0.8945998698763825, + "grad_norm": 0.6641463041305542, + "learning_rate": 4.660403074815767e-05, + "loss": 0.86, + "step": 1375 + }, + { + "epoch": 0.8978529603122967, + "grad_norm": 0.7551462650299072, + "learning_rate": 4.657801047109527e-05, + "loss": 0.8709, + "step": 1380 + }, + { + "epoch": 0.9011060507482108, + "grad_norm": 0.7857052087783813, + "learning_rate": 4.655189821051998e-05, + "loss": 0.8539, + "step": 1385 + }, + { + "epoch": 0.904359141184125, + "grad_norm": 1.1861300468444824, + "learning_rate": 4.6525694077744076e-05, + "loss": 0.8855, + "step": 1390 + }, + { + "epoch": 0.907612231620039, + "grad_norm": 0.8817470073699951, + "learning_rate": 4.6499398184471476e-05, + "loss": 0.8734, + "step": 1395 + }, + { + "epoch": 0.9108653220559532, + "grad_norm": 0.6167863011360168, + "learning_rate": 4.647301064279725e-05, + "loss": 0.8765, + "step": 1400 + }, + { + "epoch": 0.9141184124918673, + "grad_norm": 0.8413434624671936, + "learning_rate": 4.644653156520715e-05, + "loss": 0.8889, + "step": 1405 + }, + { + "epoch": 0.9173715029277814, + "grad_norm": 0.5738908052444458, + "learning_rate": 4.6419961064577134e-05, + "loss": 0.8479, + "step": 1410 + }, + { + "epoch": 0.9206245933636955, + "grad_norm": 0.9078507423400879, + "learning_rate": 4.6393299254172875e-05, + "loss": 0.881, + "step": 1415 + }, + { + "epoch": 0.9238776837996097, + "grad_norm": 1.190238356590271, + "learning_rate": 4.63665462476493e-05, + "loss": 0.8692, + "step": 1420 + }, + { + "epoch": 0.9271307742355237, + "grad_norm": 0.5501294136047363, + "learning_rate": 4.633970215905007e-05, + "loss": 0.8792, + "step": 1425 + }, + { + "epoch": 0.9303838646714379, + "grad_norm": 0.6713528633117676, + "learning_rate": 4.631276710280713e-05, + "loss": 0.861, + "step": 1430 + }, + { + "epoch": 0.933636955107352, + "grad_norm": 0.600857675075531, + "learning_rate": 4.6285741193740194e-05, + "loss": 0.8657, + "step": 1435 + }, + { + "epoch": 0.936890045543266, + "grad_norm": 1.3047159910202026, + "learning_rate": 4.625862454705629e-05, + "loss": 0.8716, + "step": 1440 + }, + { + "epoch": 0.9401431359791802, + "grad_norm": 0.7485547065734863, + "learning_rate": 4.623141727834919e-05, + "loss": 0.8742, + "step": 1445 + }, + { + "epoch": 0.9433962264150944, + "grad_norm": 0.7072353959083557, + "learning_rate": 4.620411950359903e-05, + "loss": 0.8659, + "step": 1450 + }, + { + "epoch": 0.9466493168510085, + "grad_norm": 0.5867493748664856, + "learning_rate": 4.617673133917175e-05, + "loss": 0.8864, + "step": 1455 + }, + { + "epoch": 0.9499024072869225, + "grad_norm": 0.6515786647796631, + "learning_rate": 4.614925290181858e-05, + "loss": 0.8841, + "step": 1460 + }, + { + "epoch": 0.9531554977228367, + "grad_norm": 0.7220116853713989, + "learning_rate": 4.612168430867559e-05, + "loss": 0.88, + "step": 1465 + }, + { + "epoch": 0.9564085881587508, + "grad_norm": 0.5353178381919861, + "learning_rate": 4.6094025677263155e-05, + "loss": 0.8578, + "step": 1470 + }, + { + "epoch": 0.9596616785946649, + "grad_norm": 1.0675499439239502, + "learning_rate": 4.606627712548548e-05, + "loss": 0.8705, + "step": 1475 + }, + { + "epoch": 0.962914769030579, + "grad_norm": 0.6946088671684265, + "learning_rate": 4.6038438771630074e-05, + "loss": 0.8707, + "step": 1480 + }, + { + "epoch": 0.9661678594664932, + "grad_norm": 0.6132957339286804, + "learning_rate": 4.601051073436728e-05, + "loss": 0.872, + "step": 1485 + }, + { + "epoch": 0.9694209499024072, + "grad_norm": 2.741361379623413, + "learning_rate": 4.5982493132749724e-05, + "loss": 0.8711, + "step": 1490 + }, + { + "epoch": 0.9726740403383214, + "grad_norm": 0.6481953859329224, + "learning_rate": 4.595438608621183e-05, + "loss": 0.8804, + "step": 1495 + }, + { + "epoch": 0.9759271307742355, + "grad_norm": 0.8871548771858215, + "learning_rate": 4.592618971456933e-05, + "loss": 0.863, + "step": 1500 + }, + { + "epoch": 0.9791802212101497, + "grad_norm": 1.2673571109771729, + "learning_rate": 4.5897904138018724e-05, + "loss": 0.8781, + "step": 1505 + }, + { + "epoch": 0.9824333116460637, + "grad_norm": 0.5219647288322449, + "learning_rate": 4.586952947713677e-05, + "loss": 0.8738, + "step": 1510 + }, + { + "epoch": 0.9856864020819779, + "grad_norm": 0.7620292901992798, + "learning_rate": 4.584106585287998e-05, + "loss": 0.8602, + "step": 1515 + }, + { + "epoch": 0.988939492517892, + "grad_norm": 5.0167717933654785, + "learning_rate": 4.581251338658412e-05, + "loss": 0.879, + "step": 1520 + }, + { + "epoch": 0.9921925829538061, + "grad_norm": 0.6157656311988831, + "learning_rate": 4.578387219996366e-05, + "loss": 0.8645, + "step": 1525 + }, + { + "epoch": 0.9954456733897202, + "grad_norm": 0.6330501437187195, + "learning_rate": 4.5755142415111264e-05, + "loss": 0.8549, + "step": 1530 + }, + { + "epoch": 0.9986987638256344, + "grad_norm": 0.7185651063919067, + "learning_rate": 4.572632415449729e-05, + "loss": 0.8799, + "step": 1535 + }, + { + "epoch": 1.0, + "eval_f1": 0.8050111210499576, + "eval_loss": 0.432861328125, + "eval_precision": 0.8078559249569004, + "eval_recall": 0.8036506251146921, + "eval_runtime": 475.6379, + "eval_samples_per_second": 827.173, + "eval_steps_per_second": 0.809, + "step": 1537 + }, + { + "epoch": 1.0019518542615484, + "grad_norm": 1.2476260662078857, + "learning_rate": 4.5697417540969234e-05, + "loss": 0.8628, + "step": 1540 + }, + { + "epoch": 1.0052049446974627, + "grad_norm": 1.6661646366119385, + "learning_rate": 4.566842269775126e-05, + "loss": 0.8106, + "step": 1545 + }, + { + "epoch": 1.0084580351333767, + "grad_norm": 7.054599285125732, + "learning_rate": 4.563933974844361e-05, + "loss": 0.7696, + "step": 1550 + }, + { + "epoch": 1.0117111255692908, + "grad_norm": 0.7829424142837524, + "learning_rate": 4.561016881702212e-05, + "loss": 0.8057, + "step": 1555 + }, + { + "epoch": 1.014964216005205, + "grad_norm": 0.5904113054275513, + "learning_rate": 4.5580910027837673e-05, + "loss": 0.8178, + "step": 1560 + }, + { + "epoch": 1.018217306441119, + "grad_norm": 1.8633893728256226, + "learning_rate": 4.555156350561569e-05, + "loss": 0.8021, + "step": 1565 + }, + { + "epoch": 1.0214703968770331, + "grad_norm": 0.935964047908783, + "learning_rate": 4.5522129375455555e-05, + "loss": 0.7791, + "step": 1570 + }, + { + "epoch": 1.0247234873129474, + "grad_norm": 1.3689883947372437, + "learning_rate": 4.5492607762830145e-05, + "loss": 0.814, + "step": 1575 + }, + { + "epoch": 1.0279765777488614, + "grad_norm": 0.8765047788619995, + "learning_rate": 4.546299879358523e-05, + "loss": 0.8149, + "step": 1580 + }, + { + "epoch": 1.0312296681847755, + "grad_norm": 1.1618647575378418, + "learning_rate": 4.5433302593939e-05, + "loss": 0.7935, + "step": 1585 + }, + { + "epoch": 1.0344827586206897, + "grad_norm": 0.7140945196151733, + "learning_rate": 4.540351929048146e-05, + "loss": 0.7859, + "step": 1590 + }, + { + "epoch": 1.0377358490566038, + "grad_norm": 0.9278448820114136, + "learning_rate": 4.537364901017393e-05, + "loss": 0.8269, + "step": 1595 + }, + { + "epoch": 1.0409889394925178, + "grad_norm": 0.7428409457206726, + "learning_rate": 4.534369188034853e-05, + "loss": 0.806, + "step": 1600 + }, + { + "epoch": 1.044242029928432, + "grad_norm": 0.7308477759361267, + "learning_rate": 4.5313648028707557e-05, + "loss": 0.7991, + "step": 1605 + }, + { + "epoch": 1.047495120364346, + "grad_norm": 0.7885825037956238, + "learning_rate": 4.528351758332303e-05, + "loss": 0.7896, + "step": 1610 + }, + { + "epoch": 1.0507482108002602, + "grad_norm": 0.8900930285453796, + "learning_rate": 4.525330067263608e-05, + "loss": 0.791, + "step": 1615 + }, + { + "epoch": 1.0540013012361744, + "grad_norm": 0.7243936061859131, + "learning_rate": 4.5222997425456446e-05, + "loss": 0.8118, + "step": 1620 + }, + { + "epoch": 1.0572543916720885, + "grad_norm": 0.7627750039100647, + "learning_rate": 4.519260797096187e-05, + "loss": 0.7967, + "step": 1625 + }, + { + "epoch": 1.0605074821080025, + "grad_norm": 0.7201557755470276, + "learning_rate": 4.5162132438697615e-05, + "loss": 0.8087, + "step": 1630 + }, + { + "epoch": 1.0637605725439168, + "grad_norm": 1.3476982116699219, + "learning_rate": 4.513157095857586e-05, + "loss": 0.8152, + "step": 1635 + }, + { + "epoch": 1.0670136629798308, + "grad_norm": 2.0568456649780273, + "learning_rate": 4.510092366087518e-05, + "loss": 0.7879, + "step": 1640 + }, + { + "epoch": 1.070266753415745, + "grad_norm": 0.805178701877594, + "learning_rate": 4.507019067623997e-05, + "loss": 0.8083, + "step": 1645 + }, + { + "epoch": 1.073519843851659, + "grad_norm": 0.8525136709213257, + "learning_rate": 4.5039372135679883e-05, + "loss": 0.8044, + "step": 1650 + }, + { + "epoch": 1.0767729342875731, + "grad_norm": 0.7201101183891296, + "learning_rate": 4.5008468170569295e-05, + "loss": 0.798, + "step": 1655 + }, + { + "epoch": 1.0800260247234874, + "grad_norm": 0.8228124976158142, + "learning_rate": 4.497747891264675e-05, + "loss": 0.7921, + "step": 1660 + }, + { + "epoch": 1.0832791151594015, + "grad_norm": 0.8848757147789001, + "learning_rate": 4.494640449401434e-05, + "loss": 0.789, + "step": 1665 + }, + { + "epoch": 1.0865322055953155, + "grad_norm": 0.7168120741844177, + "learning_rate": 4.491524504713722e-05, + "loss": 0.8081, + "step": 1670 + }, + { + "epoch": 1.0897852960312298, + "grad_norm": 0.7164594531059265, + "learning_rate": 4.4884000704842976e-05, + "loss": 0.8004, + "step": 1675 + }, + { + "epoch": 1.0930383864671438, + "grad_norm": 0.7822607159614563, + "learning_rate": 4.485267160032112e-05, + "loss": 0.811, + "step": 1680 + }, + { + "epoch": 1.0962914769030578, + "grad_norm": 1.0780830383300781, + "learning_rate": 4.4821257867122475e-05, + "loss": 0.8068, + "step": 1685 + }, + { + "epoch": 1.099544567338972, + "grad_norm": 0.9002332091331482, + "learning_rate": 4.478975963915861e-05, + "loss": 0.7883, + "step": 1690 + }, + { + "epoch": 1.1027976577748861, + "grad_norm": 0.8772884011268616, + "learning_rate": 4.475817705070132e-05, + "loss": 0.8103, + "step": 1695 + }, + { + "epoch": 1.1060507482108002, + "grad_norm": 0.7703087329864502, + "learning_rate": 4.472651023638196e-05, + "loss": 0.7852, + "step": 1700 + }, + { + "epoch": 1.1093038386467144, + "grad_norm": 0.6608516573905945, + "learning_rate": 4.469475933119098e-05, + "loss": 0.8177, + "step": 1705 + }, + { + "epoch": 1.1125569290826285, + "grad_norm": 0.6942402720451355, + "learning_rate": 4.4662924470477255e-05, + "loss": 0.7958, + "step": 1710 + }, + { + "epoch": 1.1158100195185425, + "grad_norm": 1.1164368391036987, + "learning_rate": 4.4631005789947576e-05, + "loss": 0.8265, + "step": 1715 + }, + { + "epoch": 1.1190631099544568, + "grad_norm": 0.9671571850776672, + "learning_rate": 4.4599003425666026e-05, + "loss": 0.828, + "step": 1720 + }, + { + "epoch": 1.1223162003903708, + "grad_norm": 0.7158306837081909, + "learning_rate": 4.456691751405343e-05, + "loss": 0.8068, + "step": 1725 + }, + { + "epoch": 1.1255692908262849, + "grad_norm": 0.7458908557891846, + "learning_rate": 4.453474819188675e-05, + "loss": 0.8044, + "step": 1730 + }, + { + "epoch": 1.1288223812621991, + "grad_norm": 0.832358181476593, + "learning_rate": 4.450249559629853e-05, + "loss": 0.8041, + "step": 1735 + }, + { + "epoch": 1.1320754716981132, + "grad_norm": 1.0899025201797485, + "learning_rate": 4.447015986477628e-05, + "loss": 0.8171, + "step": 1740 + }, + { + "epoch": 1.1353285621340272, + "grad_norm": 0.9264464378356934, + "learning_rate": 4.443774113516192e-05, + "loss": 0.7795, + "step": 1745 + }, + { + "epoch": 1.1385816525699415, + "grad_norm": 1.837517261505127, + "learning_rate": 4.440523954565114e-05, + "loss": 0.8011, + "step": 1750 + }, + { + "epoch": 1.1418347430058555, + "grad_norm": 0.7183268070220947, + "learning_rate": 4.437265523479291e-05, + "loss": 0.8071, + "step": 1755 + }, + { + "epoch": 1.1450878334417696, + "grad_norm": 1.3461638689041138, + "learning_rate": 4.433998834148877e-05, + "loss": 0.8142, + "step": 1760 + }, + { + "epoch": 1.1483409238776838, + "grad_norm": 1.060806155204773, + "learning_rate": 4.430723900499232e-05, + "loss": 0.7857, + "step": 1765 + }, + { + "epoch": 1.1515940143135979, + "grad_norm": 0.9301265478134155, + "learning_rate": 4.427440736490861e-05, + "loss": 0.8009, + "step": 1770 + }, + { + "epoch": 1.1548471047495121, + "grad_norm": 1.4558310508728027, + "learning_rate": 4.4241493561193515e-05, + "loss": 0.8203, + "step": 1775 + }, + { + "epoch": 1.1581001951854262, + "grad_norm": 0.7694815993309021, + "learning_rate": 4.4208497734153177e-05, + "loss": 0.7825, + "step": 1780 + }, + { + "epoch": 1.1613532856213402, + "grad_norm": 0.7246577739715576, + "learning_rate": 4.417542002444339e-05, + "loss": 0.8157, + "step": 1785 + }, + { + "epoch": 1.1646063760572545, + "grad_norm": 1.0033169984817505, + "learning_rate": 4.4142260573068993e-05, + "loss": 0.8013, + "step": 1790 + }, + { + "epoch": 1.1678594664931685, + "grad_norm": 1.7446528673171997, + "learning_rate": 4.410901952138326e-05, + "loss": 0.8004, + "step": 1795 + }, + { + "epoch": 1.1711125569290826, + "grad_norm": 1.0588308572769165, + "learning_rate": 4.407569701108737e-05, + "loss": 0.8055, + "step": 1800 + }, + { + "epoch": 1.1743656473649968, + "grad_norm": 0.7162896990776062, + "learning_rate": 4.404229318422968e-05, + "loss": 0.8091, + "step": 1805 + }, + { + "epoch": 1.1776187378009109, + "grad_norm": 1.3299797773361206, + "learning_rate": 4.400880818320521e-05, + "loss": 0.8068, + "step": 1810 + }, + { + "epoch": 1.180871828236825, + "grad_norm": 0.7723076939582825, + "learning_rate": 4.397524215075504e-05, + "loss": 0.8065, + "step": 1815 + }, + { + "epoch": 1.1841249186727392, + "grad_norm": 0.700183629989624, + "learning_rate": 4.3941595229965636e-05, + "loss": 0.8006, + "step": 1820 + }, + { + "epoch": 1.1873780091086532, + "grad_norm": 0.7281947135925293, + "learning_rate": 4.390786756426829e-05, + "loss": 0.8026, + "step": 1825 + }, + { + "epoch": 1.1906310995445673, + "grad_norm": 0.8081494569778442, + "learning_rate": 4.3874059297438515e-05, + "loss": 0.7887, + "step": 1830 + }, + { + "epoch": 1.1938841899804815, + "grad_norm": 0.749001681804657, + "learning_rate": 4.384017057359538e-05, + "loss": 0.8007, + "step": 1835 + }, + { + "epoch": 1.1971372804163956, + "grad_norm": 0.8392378091812134, + "learning_rate": 4.380620153720095e-05, + "loss": 0.8228, + "step": 1840 + }, + { + "epoch": 1.2003903708523098, + "grad_norm": 0.9195884466171265, + "learning_rate": 4.377215233305966e-05, + "loss": 0.8009, + "step": 1845 + }, + { + "epoch": 1.2036434612882239, + "grad_norm": 0.9343377947807312, + "learning_rate": 4.373802310631765e-05, + "loss": 0.7785, + "step": 1850 + }, + { + "epoch": 1.206896551724138, + "grad_norm": 0.9939282536506653, + "learning_rate": 4.370381400246221e-05, + "loss": 0.8228, + "step": 1855 + }, + { + "epoch": 1.2101496421600522, + "grad_norm": 4.1950907707214355, + "learning_rate": 4.366952516732114e-05, + "loss": 0.8051, + "step": 1860 + }, + { + "epoch": 1.2134027325959662, + "grad_norm": 1.1269813776016235, + "learning_rate": 4.3635156747062105e-05, + "loss": 0.8059, + "step": 1865 + }, + { + "epoch": 1.2166558230318802, + "grad_norm": 1.8155053853988647, + "learning_rate": 4.360070888819203e-05, + "loss": 0.8157, + "step": 1870 + }, + { + "epoch": 1.2199089134677945, + "grad_norm": 0.7301884889602661, + "learning_rate": 4.356618173755648e-05, + "loss": 0.786, + "step": 1875 + }, + { + "epoch": 1.2231620039037086, + "grad_norm": 0.6721681952476501, + "learning_rate": 4.353157544233902e-05, + "loss": 0.818, + "step": 1880 + }, + { + "epoch": 1.2264150943396226, + "grad_norm": 0.9088836312294006, + "learning_rate": 4.349689015006061e-05, + "loss": 0.7883, + "step": 1885 + }, + { + "epoch": 1.2296681847755369, + "grad_norm": 0.7416070103645325, + "learning_rate": 4.3462126008578936e-05, + "loss": 0.8033, + "step": 1890 + }, + { + "epoch": 1.232921275211451, + "grad_norm": 0.6465050578117371, + "learning_rate": 4.342728316608783e-05, + "loss": 0.8111, + "step": 1895 + }, + { + "epoch": 1.236174365647365, + "grad_norm": 0.870087206363678, + "learning_rate": 4.3392361771116604e-05, + "loss": 0.8307, + "step": 1900 + }, + { + "epoch": 1.2394274560832792, + "grad_norm": 0.7588717937469482, + "learning_rate": 4.335736197252942e-05, + "loss": 0.8024, + "step": 1905 + }, + { + "epoch": 1.2426805465191932, + "grad_norm": 24.7558650970459, + "learning_rate": 4.332228391952469e-05, + "loss": 0.8089, + "step": 1910 + }, + { + "epoch": 1.2459336369551073, + "grad_norm": 0.7845138907432556, + "learning_rate": 4.328712776163436e-05, + "loss": 0.8092, + "step": 1915 + }, + { + "epoch": 1.2491867273910215, + "grad_norm": 0.5970214605331421, + "learning_rate": 4.325189364872337e-05, + "loss": 0.768, + "step": 1920 + }, + { + "epoch": 1.2524398178269356, + "grad_norm": 0.6692010164260864, + "learning_rate": 4.321658173098895e-05, + "loss": 0.7974, + "step": 1925 + }, + { + "epoch": 1.2556929082628496, + "grad_norm": 0.6391417384147644, + "learning_rate": 4.318119215896001e-05, + "loss": 0.8151, + "step": 1930 + }, + { + "epoch": 1.258945998698764, + "grad_norm": 1.4557222127914429, + "learning_rate": 4.314572508349646e-05, + "loss": 0.817, + "step": 1935 + }, + { + "epoch": 1.262199089134678, + "grad_norm": 0.6106812357902527, + "learning_rate": 4.311018065578864e-05, + "loss": 0.8154, + "step": 1940 + }, + { + "epoch": 1.265452179570592, + "grad_norm": 0.7062475681304932, + "learning_rate": 4.307455902735659e-05, + "loss": 0.8106, + "step": 1945 + }, + { + "epoch": 1.2687052700065062, + "grad_norm": 0.6937923431396484, + "learning_rate": 4.303886035004947e-05, + "loss": 0.8193, + "step": 1950 + }, + { + "epoch": 1.2719583604424203, + "grad_norm": 0.5820494294166565, + "learning_rate": 4.3003084776044855e-05, + "loss": 0.8166, + "step": 1955 + }, + { + "epoch": 1.2752114508783343, + "grad_norm": 0.9809524416923523, + "learning_rate": 4.2967232457848154e-05, + "loss": 0.7983, + "step": 1960 + }, + { + "epoch": 1.2784645413142486, + "grad_norm": 0.6147664785385132, + "learning_rate": 4.293130354829191e-05, + "loss": 0.8195, + "step": 1965 + }, + { + "epoch": 1.2817176317501626, + "grad_norm": 0.632622241973877, + "learning_rate": 4.289529820053515e-05, + "loss": 0.7964, + "step": 1970 + }, + { + "epoch": 1.2849707221860767, + "grad_norm": 0.5672839283943176, + "learning_rate": 4.285921656806276e-05, + "loss": 0.7854, + "step": 1975 + }, + { + "epoch": 1.288223812621991, + "grad_norm": 0.7861791849136353, + "learning_rate": 4.2823058804684815e-05, + "loss": 0.7772, + "step": 1980 + }, + { + "epoch": 1.291476903057905, + "grad_norm": 0.627170741558075, + "learning_rate": 4.2786825064535905e-05, + "loss": 0.8033, + "step": 1985 + }, + { + "epoch": 1.294729993493819, + "grad_norm": 0.7018983960151672, + "learning_rate": 4.275051550207453e-05, + "loss": 0.8067, + "step": 1990 + }, + { + "epoch": 1.2979830839297333, + "grad_norm": 0.6708381772041321, + "learning_rate": 4.2714130272082365e-05, + "loss": 0.8019, + "step": 1995 + }, + { + "epoch": 1.3012361743656473, + "grad_norm": 1.4676740169525146, + "learning_rate": 4.267766952966369e-05, + "loss": 0.8366, + "step": 2000 + }, + { + "epoch": 1.3044892648015614, + "grad_norm": 0.6410109996795654, + "learning_rate": 4.2641133430244644e-05, + "loss": 0.7767, + "step": 2005 + }, + { + "epoch": 1.3077423552374756, + "grad_norm": 0.6521077752113342, + "learning_rate": 4.2604522129572624e-05, + "loss": 0.8134, + "step": 2010 + }, + { + "epoch": 1.3109954456733897, + "grad_norm": 0.605689525604248, + "learning_rate": 4.256783578371557e-05, + "loss": 0.7803, + "step": 2015 + }, + { + "epoch": 1.3142485361093037, + "grad_norm": 1.1350992918014526, + "learning_rate": 4.253107454906137e-05, + "loss": 0.787, + "step": 2020 + }, + { + "epoch": 1.317501626545218, + "grad_norm": 0.7411300539970398, + "learning_rate": 4.2494238582317114e-05, + "loss": 0.8047, + "step": 2025 + }, + { + "epoch": 1.320754716981132, + "grad_norm": 0.8492873311042786, + "learning_rate": 4.2457328040508484e-05, + "loss": 0.788, + "step": 2030 + }, + { + "epoch": 1.3240078074170463, + "grad_norm": 0.7676826119422913, + "learning_rate": 4.2420343080979035e-05, + "loss": 0.808, + "step": 2035 + }, + { + "epoch": 1.3272608978529603, + "grad_norm": 0.6066752076148987, + "learning_rate": 4.238328386138959e-05, + "loss": 0.7894, + "step": 2040 + }, + { + "epoch": 1.3305139882888743, + "grad_norm": 0.9500682353973389, + "learning_rate": 4.234615053971751e-05, + "loss": 0.7933, + "step": 2045 + }, + { + "epoch": 1.3337670787247886, + "grad_norm": 0.6935995221138, + "learning_rate": 4.230894327425604e-05, + "loss": 0.7949, + "step": 2050 + }, + { + "epoch": 1.3370201691607027, + "grad_norm": 0.701280415058136, + "learning_rate": 4.227166222361364e-05, + "loss": 0.7879, + "step": 2055 + }, + { + "epoch": 1.340273259596617, + "grad_norm": 0.6840693354606628, + "learning_rate": 4.2234307546713305e-05, + "loss": 0.8095, + "step": 2060 + }, + { + "epoch": 1.343526350032531, + "grad_norm": 0.6785464286804199, + "learning_rate": 4.219687940279188e-05, + "loss": 0.7931, + "step": 2065 + }, + { + "epoch": 1.346779440468445, + "grad_norm": 1.4129618406295776, + "learning_rate": 4.2159377951399385e-05, + "loss": 0.8222, + "step": 2070 + }, + { + "epoch": 1.3500325309043593, + "grad_norm": 0.9987890124320984, + "learning_rate": 4.212180335239836e-05, + "loss": 0.8086, + "step": 2075 + }, + { + "epoch": 1.3532856213402733, + "grad_norm": 0.7902525067329407, + "learning_rate": 4.208415576596315e-05, + "loss": 0.808, + "step": 2080 + }, + { + "epoch": 1.3565387117761873, + "grad_norm": 1.0341746807098389, + "learning_rate": 4.2046435352579206e-05, + "loss": 0.8218, + "step": 2085 + }, + { + "epoch": 1.3597918022121016, + "grad_norm": 0.6914475560188293, + "learning_rate": 4.200864227304247e-05, + "loss": 0.8033, + "step": 2090 + }, + { + "epoch": 1.3630448926480156, + "grad_norm": 0.8766419291496277, + "learning_rate": 4.1970776688458624e-05, + "loss": 0.7885, + "step": 2095 + }, + { + "epoch": 1.3662979830839297, + "grad_norm": 0.7936528325080872, + "learning_rate": 4.1932838760242445e-05, + "loss": 0.8163, + "step": 2100 + }, + { + "epoch": 1.369551073519844, + "grad_norm": 0.6358763575553894, + "learning_rate": 4.189482865011706e-05, + "loss": 0.8042, + "step": 2105 + }, + { + "epoch": 1.372804163955758, + "grad_norm": 0.6236315965652466, + "learning_rate": 4.1856746520113345e-05, + "loss": 0.7926, + "step": 2110 + }, + { + "epoch": 1.376057254391672, + "grad_norm": 0.6494086980819702, + "learning_rate": 4.181859253256916e-05, + "loss": 0.8001, + "step": 2115 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.5840818881988525, + "learning_rate": 4.178036685012868e-05, + "loss": 0.7936, + "step": 2120 + }, + { + "epoch": 1.3825634352635003, + "grad_norm": 0.6631473898887634, + "learning_rate": 4.174206963574171e-05, + "loss": 0.8022, + "step": 2125 + }, + { + "epoch": 1.3858165256994144, + "grad_norm": 0.6331456303596497, + "learning_rate": 4.1703701052662974e-05, + "loss": 0.8023, + "step": 2130 + }, + { + "epoch": 1.3890696161353286, + "grad_norm": 0.9553548693656921, + "learning_rate": 4.166526126445145e-05, + "loss": 0.7927, + "step": 2135 + }, + { + "epoch": 1.3923227065712427, + "grad_norm": 0.5648055076599121, + "learning_rate": 4.162675043496963e-05, + "loss": 0.8094, + "step": 2140 + }, + { + "epoch": 1.3955757970071567, + "grad_norm": 0.7545254826545715, + "learning_rate": 4.158816872838285e-05, + "loss": 0.7898, + "step": 2145 + }, + { + "epoch": 1.398828887443071, + "grad_norm": 0.6327040791511536, + "learning_rate": 4.1549516309158586e-05, + "loss": 0.7933, + "step": 2150 + }, + { + "epoch": 1.402081977878985, + "grad_norm": 0.897237241268158, + "learning_rate": 4.151079334206577e-05, + "loss": 0.8026, + "step": 2155 + }, + { + "epoch": 1.405335068314899, + "grad_norm": 0.6637735962867737, + "learning_rate": 4.147199999217402e-05, + "loss": 0.8089, + "step": 2160 + }, + { + "epoch": 1.4085881587508133, + "grad_norm": 0.7697905898094177, + "learning_rate": 4.143313642485302e-05, + "loss": 0.7986, + "step": 2165 + }, + { + "epoch": 1.4118412491867274, + "grad_norm": 0.6807245016098022, + "learning_rate": 4.139420280577177e-05, + "loss": 0.8095, + "step": 2170 + }, + { + "epoch": 1.4150943396226414, + "grad_norm": 0.6346644163131714, + "learning_rate": 4.1355199300897894e-05, + "loss": 0.7874, + "step": 2175 + }, + { + "epoch": 1.4183474300585557, + "grad_norm": 1.3532865047454834, + "learning_rate": 4.1316126076496935e-05, + "loss": 0.8077, + "step": 2180 + }, + { + "epoch": 1.4216005204944697, + "grad_norm": 0.7018231749534607, + "learning_rate": 4.127698329913161e-05, + "loss": 0.7826, + "step": 2185 + }, + { + "epoch": 1.4248536109303838, + "grad_norm": 0.6970395445823669, + "learning_rate": 4.1237771135661164e-05, + "loss": 0.7959, + "step": 2190 + }, + { + "epoch": 1.428106701366298, + "grad_norm": 0.802807092666626, + "learning_rate": 4.119848975324059e-05, + "loss": 0.8093, + "step": 2195 + }, + { + "epoch": 1.431359791802212, + "grad_norm": 0.6809616684913635, + "learning_rate": 4.115913931931997e-05, + "loss": 0.7949, + "step": 2200 + }, + { + "epoch": 1.434612882238126, + "grad_norm": 0.6927191615104675, + "learning_rate": 4.1119720001643745e-05, + "loss": 0.8072, + "step": 2205 + }, + { + "epoch": 1.4378659726740404, + "grad_norm": 0.8336883187294006, + "learning_rate": 4.108023196824998e-05, + "loss": 0.8058, + "step": 2210 + }, + { + "epoch": 1.4411190631099544, + "grad_norm": 0.9360595345497131, + "learning_rate": 4.1040675387469685e-05, + "loss": 0.796, + "step": 2215 + }, + { + "epoch": 1.4443721535458685, + "grad_norm": 0.7317126989364624, + "learning_rate": 4.1001050427926045e-05, + "loss": 0.789, + "step": 2220 + }, + { + "epoch": 1.4476252439817827, + "grad_norm": 0.7633559703826904, + "learning_rate": 4.0961357258533774e-05, + "loss": 0.7885, + "step": 2225 + }, + { + "epoch": 1.4508783344176968, + "grad_norm": 0.6734256744384766, + "learning_rate": 4.0921596048498315e-05, + "loss": 0.7852, + "step": 2230 + }, + { + "epoch": 1.4541314248536108, + "grad_norm": 1.5330891609191895, + "learning_rate": 4.088176696731517e-05, + "loss": 0.8067, + "step": 2235 + }, + { + "epoch": 1.457384515289525, + "grad_norm": 1.4686920642852783, + "learning_rate": 4.084187018476918e-05, + "loss": 0.8, + "step": 2240 + }, + { + "epoch": 1.460637605725439, + "grad_norm": 0.8620485067367554, + "learning_rate": 4.0801905870933764e-05, + "loss": 0.7865, + "step": 2245 + }, + { + "epoch": 1.4638906961613534, + "grad_norm": 0.8238881826400757, + "learning_rate": 4.076187419617024e-05, + "loss": 0.8486, + "step": 2250 + }, + { + "epoch": 1.4671437865972674, + "grad_norm": 0.5753189921379089, + "learning_rate": 4.072177533112703e-05, + "loss": 0.7975, + "step": 2255 + }, + { + "epoch": 1.4703968770331814, + "grad_norm": 0.707665741443634, + "learning_rate": 4.068160944673903e-05, + "loss": 0.8067, + "step": 2260 + }, + { + "epoch": 1.4736499674690957, + "grad_norm": 0.702339768409729, + "learning_rate": 4.0641376714226795e-05, + "loss": 0.7823, + "step": 2265 + }, + { + "epoch": 1.4769030579050098, + "grad_norm": 1.500267505645752, + "learning_rate": 4.060107730509587e-05, + "loss": 0.8159, + "step": 2270 + }, + { + "epoch": 1.480156148340924, + "grad_norm": 0.8150926232337952, + "learning_rate": 4.0560711391135986e-05, + "loss": 0.8227, + "step": 2275 + }, + { + "epoch": 1.483409238776838, + "grad_norm": 0.8220822215080261, + "learning_rate": 4.052027914442043e-05, + "loss": 0.8067, + "step": 2280 + }, + { + "epoch": 1.486662329212752, + "grad_norm": 0.6167823672294617, + "learning_rate": 4.047978073730519e-05, + "loss": 0.8158, + "step": 2285 + }, + { + "epoch": 1.4899154196486664, + "grad_norm": 36.838592529296875, + "learning_rate": 4.043921634242836e-05, + "loss": 0.8178, + "step": 2290 + }, + { + "epoch": 1.4931685100845804, + "grad_norm": 0.6391186714172363, + "learning_rate": 4.039858613270927e-05, + "loss": 0.7954, + "step": 2295 + }, + { + "epoch": 1.4964216005204944, + "grad_norm": 0.7283660769462585, + "learning_rate": 4.035789028134782e-05, + "loss": 0.8045, + "step": 2300 + }, + { + "epoch": 1.4996746909564087, + "grad_norm": 1.2124541997909546, + "learning_rate": 4.031712896182376e-05, + "loss": 0.7941, + "step": 2305 + }, + { + "epoch": 1.5029277813923227, + "grad_norm": 0.6527173519134521, + "learning_rate": 4.0276302347895864e-05, + "loss": 0.808, + "step": 2310 + }, + { + "epoch": 1.5061808718282368, + "grad_norm": 0.7583323121070862, + "learning_rate": 4.023541061360131e-05, + "loss": 0.8228, + "step": 2315 + }, + { + "epoch": 1.509433962264151, + "grad_norm": 2.344557762145996, + "learning_rate": 4.019445393325483e-05, + "loss": 0.798, + "step": 2320 + }, + { + "epoch": 1.512687052700065, + "grad_norm": 0.8352357149124146, + "learning_rate": 4.0153432481448027e-05, + "loss": 0.7912, + "step": 2325 + }, + { + "epoch": 1.5159401431359791, + "grad_norm": 0.8066681623458862, + "learning_rate": 4.01123464330486e-05, + "loss": 0.8125, + "step": 2330 + }, + { + "epoch": 1.5191932335718934, + "grad_norm": 0.7398434281349182, + "learning_rate": 4.007119596319962e-05, + "loss": 0.7997, + "step": 2335 + }, + { + "epoch": 1.5224463240078074, + "grad_norm": 0.9586179852485657, + "learning_rate": 4.002998124731879e-05, + "loss": 0.7994, + "step": 2340 + }, + { + "epoch": 1.5256994144437215, + "grad_norm": 0.6949347853660583, + "learning_rate": 3.998870246109767e-05, + "loss": 0.8192, + "step": 2345 + }, + { + "epoch": 1.5289525048796357, + "grad_norm": 1.5228863954544067, + "learning_rate": 3.994735978050094e-05, + "loss": 0.7902, + "step": 2350 + }, + { + "epoch": 1.5322055953155498, + "grad_norm": 0.6021274328231812, + "learning_rate": 3.990595338176564e-05, + "loss": 0.7995, + "step": 2355 + }, + { + "epoch": 1.5354586857514638, + "grad_norm": 0.6251325607299805, + "learning_rate": 3.986448344140047e-05, + "loss": 0.7943, + "step": 2360 + }, + { + "epoch": 1.538711776187378, + "grad_norm": 0.6837208867073059, + "learning_rate": 3.9822950136184946e-05, + "loss": 0.8093, + "step": 2365 + }, + { + "epoch": 1.5419648666232921, + "grad_norm": 1.162960171699524, + "learning_rate": 3.978135364316874e-05, + "loss": 0.8309, + "step": 2370 + }, + { + "epoch": 1.5452179570592062, + "grad_norm": 0.6852900385856628, + "learning_rate": 3.973969413967086e-05, + "loss": 0.803, + "step": 2375 + }, + { + "epoch": 1.5484710474951204, + "grad_norm": 0.8108793497085571, + "learning_rate": 3.9697971803278924e-05, + "loss": 0.8035, + "step": 2380 + }, + { + "epoch": 1.5517241379310345, + "grad_norm": 0.7817934155464172, + "learning_rate": 3.9656186811848395e-05, + "loss": 0.7872, + "step": 2385 + }, + { + "epoch": 1.5549772283669485, + "grad_norm": 0.6819909811019897, + "learning_rate": 3.9614339343501836e-05, + "loss": 0.7978, + "step": 2390 + }, + { + "epoch": 1.5582303188028628, + "grad_norm": 1.547499179840088, + "learning_rate": 3.9572429576628114e-05, + "loss": 0.8042, + "step": 2395 + }, + { + "epoch": 1.5614834092387768, + "grad_norm": 0.6589548587799072, + "learning_rate": 3.9530457689881684e-05, + "loss": 0.8112, + "step": 2400 + }, + { + "epoch": 1.5647364996746909, + "grad_norm": 1.0360349416732788, + "learning_rate": 3.94884238621818e-05, + "loss": 0.7949, + "step": 2405 + }, + { + "epoch": 1.5679895901106051, + "grad_norm": 0.8563544154167175, + "learning_rate": 3.944632827271176e-05, + "loss": 0.8193, + "step": 2410 + }, + { + "epoch": 1.5712426805465192, + "grad_norm": 0.8043304681777954, + "learning_rate": 3.940417110091816e-05, + "loss": 0.7986, + "step": 2415 + }, + { + "epoch": 1.5744957709824332, + "grad_norm": 0.7991480827331543, + "learning_rate": 3.9361952526510085e-05, + "loss": 0.791, + "step": 2420 + }, + { + "epoch": 1.5777488614183475, + "grad_norm": 0.8276852369308472, + "learning_rate": 3.9319672729458376e-05, + "loss": 0.8065, + "step": 2425 + }, + { + "epoch": 1.5810019518542615, + "grad_norm": 0.601204514503479, + "learning_rate": 3.927733188999486e-05, + "loss": 0.803, + "step": 2430 + }, + { + "epoch": 1.5842550422901756, + "grad_norm": 0.77410888671875, + "learning_rate": 3.92349301886116e-05, + "loss": 0.8056, + "step": 2435 + }, + { + "epoch": 1.5875081327260898, + "grad_norm": 1.0822545289993286, + "learning_rate": 3.9192467806060044e-05, + "loss": 0.7781, + "step": 2440 + }, + { + "epoch": 1.5907612231620039, + "grad_norm": 0.602203905582428, + "learning_rate": 3.914994492335038e-05, + "loss": 0.7805, + "step": 2445 + }, + { + "epoch": 1.594014313597918, + "grad_norm": 0.6420490741729736, + "learning_rate": 3.910736172175066e-05, + "loss": 0.8081, + "step": 2450 + }, + { + "epoch": 1.5972674040338322, + "grad_norm": 1.321158766746521, + "learning_rate": 3.9064718382786076e-05, + "loss": 0.8251, + "step": 2455 + }, + { + "epoch": 1.6005204944697464, + "grad_norm": 1.2666637897491455, + "learning_rate": 3.9022015088238174e-05, + "loss": 0.8017, + "step": 2460 + }, + { + "epoch": 1.6037735849056602, + "grad_norm": 0.7312076687812805, + "learning_rate": 3.897925202014409e-05, + "loss": 0.8193, + "step": 2465 + }, + { + "epoch": 1.6070266753415745, + "grad_norm": 0.6371551156044006, + "learning_rate": 3.8936429360795745e-05, + "loss": 0.8028, + "step": 2470 + }, + { + "epoch": 1.6102797657774888, + "grad_norm": 0.7501451373100281, + "learning_rate": 3.88935472927391e-05, + "loss": 0.809, + "step": 2475 + }, + { + "epoch": 1.6135328562134026, + "grad_norm": 0.6462128162384033, + "learning_rate": 3.885060599877337e-05, + "loss": 0.7898, + "step": 2480 + }, + { + "epoch": 1.6167859466493169, + "grad_norm": 0.6399087905883789, + "learning_rate": 3.880760566195023e-05, + "loss": 0.7848, + "step": 2485 + }, + { + "epoch": 1.6200390370852311, + "grad_norm": 0.6313064694404602, + "learning_rate": 3.876454646557305e-05, + "loss": 0.7907, + "step": 2490 + }, + { + "epoch": 1.623292127521145, + "grad_norm": 0.5535691976547241, + "learning_rate": 3.872142859319612e-05, + "loss": 0.8071, + "step": 2495 + }, + { + "epoch": 1.6265452179570592, + "grad_norm": 0.7361935377120972, + "learning_rate": 3.867825222862383e-05, + "loss": 0.8034, + "step": 2500 + }, + { + "epoch": 1.6297983083929735, + "grad_norm": 0.7017170190811157, + "learning_rate": 3.863501755590994e-05, + "loss": 0.8202, + "step": 2505 + }, + { + "epoch": 1.6330513988288873, + "grad_norm": 2.15995717048645, + "learning_rate": 3.8591724759356734e-05, + "loss": 0.8064, + "step": 2510 + }, + { + "epoch": 1.6363044892648015, + "grad_norm": 0.6328789591789246, + "learning_rate": 3.854837402351431e-05, + "loss": 0.7959, + "step": 2515 + }, + { + "epoch": 1.6395575797007158, + "grad_norm": 0.7377115488052368, + "learning_rate": 3.8504965533179724e-05, + "loss": 0.7826, + "step": 2520 + }, + { + "epoch": 1.6428106701366298, + "grad_norm": 0.7871087193489075, + "learning_rate": 3.8461499473396246e-05, + "loss": 0.7836, + "step": 2525 + }, + { + "epoch": 1.6460637605725439, + "grad_norm": 0.7310390472412109, + "learning_rate": 3.841797602945254e-05, + "loss": 0.82, + "step": 2530 + }, + { + "epoch": 1.6493168510084582, + "grad_norm": 0.6992024779319763, + "learning_rate": 3.837439538688189e-05, + "loss": 0.7865, + "step": 2535 + }, + { + "epoch": 1.6525699414443722, + "grad_norm": 0.6299036741256714, + "learning_rate": 3.833075773146142e-05, + "loss": 0.8071, + "step": 2540 + }, + { + "epoch": 1.6558230318802862, + "grad_norm": 0.7134023904800415, + "learning_rate": 3.828706324921128e-05, + "loss": 0.7931, + "step": 2545 + }, + { + "epoch": 1.6590761223162005, + "grad_norm": 0.9033751487731934, + "learning_rate": 3.824331212639388e-05, + "loss": 0.8144, + "step": 2550 + }, + { + "epoch": 1.6623292127521145, + "grad_norm": 0.6907141208648682, + "learning_rate": 3.8199504549513055e-05, + "loss": 0.7866, + "step": 2555 + }, + { + "epoch": 1.6655823031880286, + "grad_norm": 0.9282029271125793, + "learning_rate": 3.81556407053133e-05, + "loss": 0.7935, + "step": 2560 + }, + { + "epoch": 1.6688353936239428, + "grad_norm": 0.8306324481964111, + "learning_rate": 3.811172078077899e-05, + "loss": 0.7896, + "step": 2565 + }, + { + "epoch": 1.6720884840598569, + "grad_norm": 0.8237498998641968, + "learning_rate": 3.806774496313355e-05, + "loss": 0.7894, + "step": 2570 + }, + { + "epoch": 1.675341574495771, + "grad_norm": 1.046177625656128, + "learning_rate": 3.802371343983865e-05, + "loss": 0.7965, + "step": 2575 + }, + { + "epoch": 1.6785946649316852, + "grad_norm": 0.8086977601051331, + "learning_rate": 3.797962639859344e-05, + "loss": 0.7962, + "step": 2580 + }, + { + "epoch": 1.6818477553675992, + "grad_norm": 0.8723990321159363, + "learning_rate": 3.7935484027333746e-05, + "loss": 0.8, + "step": 2585 + }, + { + "epoch": 1.6851008458035133, + "grad_norm": 0.6789833903312683, + "learning_rate": 3.7891286514231225e-05, + "loss": 0.801, + "step": 2590 + }, + { + "epoch": 1.6883539362394275, + "grad_norm": 0.7277116179466248, + "learning_rate": 3.784703404769263e-05, + "loss": 0.7984, + "step": 2595 + }, + { + "epoch": 1.6916070266753416, + "grad_norm": 1.03284752368927, + "learning_rate": 3.780272681635894e-05, + "loss": 0.8021, + "step": 2600 + }, + { + "epoch": 1.6948601171112556, + "grad_norm": 0.9302812218666077, + "learning_rate": 3.77583650091046e-05, + "loss": 0.8046, + "step": 2605 + }, + { + "epoch": 1.6981132075471699, + "grad_norm": 1.2493805885314941, + "learning_rate": 3.771394881503673e-05, + "loss": 0.805, + "step": 2610 + }, + { + "epoch": 1.701366297983084, + "grad_norm": 1.7447010278701782, + "learning_rate": 3.766947842349423e-05, + "loss": 0.7968, + "step": 2615 + }, + { + "epoch": 1.704619388418998, + "grad_norm": 0.5957579016685486, + "learning_rate": 3.76249540240471e-05, + "loss": 0.7958, + "step": 2620 + }, + { + "epoch": 1.7078724788549122, + "grad_norm": 0.8952409625053406, + "learning_rate": 3.7580375806495524e-05, + "loss": 0.7974, + "step": 2625 + }, + { + "epoch": 1.7111255692908263, + "grad_norm": 0.5696317553520203, + "learning_rate": 3.753574396086913e-05, + "loss": 0.8037, + "step": 2630 + }, + { + "epoch": 1.7143786597267403, + "grad_norm": 1.1165753602981567, + "learning_rate": 3.7491058677426135e-05, + "loss": 0.8038, + "step": 2635 + }, + { + "epoch": 1.7176317501626546, + "grad_norm": 0.7049902081489563, + "learning_rate": 3.7446320146652556e-05, + "loss": 0.7678, + "step": 2640 + }, + { + "epoch": 1.7208848405985686, + "grad_norm": 0.6802716851234436, + "learning_rate": 3.740152855926139e-05, + "loss": 0.8163, + "step": 2645 + }, + { + "epoch": 1.7241379310344827, + "grad_norm": 0.808645486831665, + "learning_rate": 3.735668410619183e-05, + "loss": 0.8045, + "step": 2650 + }, + { + "epoch": 1.727391021470397, + "grad_norm": 0.6994810104370117, + "learning_rate": 3.7311786978608415e-05, + "loss": 0.794, + "step": 2655 + }, + { + "epoch": 1.730644111906311, + "grad_norm": 0.9750849604606628, + "learning_rate": 3.726683736790022e-05, + "loss": 0.8192, + "step": 2660 + }, + { + "epoch": 1.733897202342225, + "grad_norm": 0.8701574206352234, + "learning_rate": 3.7221835465680024e-05, + "loss": 0.8072, + "step": 2665 + }, + { + "epoch": 1.7371502927781393, + "grad_norm": 0.7430034875869751, + "learning_rate": 3.717678146378357e-05, + "loss": 0.8147, + "step": 2670 + }, + { + "epoch": 1.7404033832140533, + "grad_norm": 0.7101485729217529, + "learning_rate": 3.7131675554268654e-05, + "loss": 0.8174, + "step": 2675 + }, + { + "epoch": 1.7436564736499673, + "grad_norm": 0.6719719767570496, + "learning_rate": 3.7086517929414346e-05, + "loss": 0.7912, + "step": 2680 + }, + { + "epoch": 1.7469095640858816, + "grad_norm": 0.8749985098838806, + "learning_rate": 3.70413087817202e-05, + "loss": 0.7914, + "step": 2685 + }, + { + "epoch": 1.7501626545217959, + "grad_norm": 0.6824137568473816, + "learning_rate": 3.699604830390537e-05, + "loss": 0.7974, + "step": 2690 + }, + { + "epoch": 1.7534157449577097, + "grad_norm": 0.6715524196624756, + "learning_rate": 3.695073668890785e-05, + "loss": 0.8134, + "step": 2695 + }, + { + "epoch": 1.756668835393624, + "grad_norm": 1.2192811965942383, + "learning_rate": 3.690537412988359e-05, + "loss": 0.8042, + "step": 2700 + }, + { + "epoch": 1.7599219258295382, + "grad_norm": 0.5974992513656616, + "learning_rate": 3.685996082020574e-05, + "loss": 0.7871, + "step": 2705 + }, + { + "epoch": 1.763175016265452, + "grad_norm": 0.7278191447257996, + "learning_rate": 3.681449695346376e-05, + "loss": 0.8104, + "step": 2710 + }, + { + "epoch": 1.7664281067013663, + "grad_norm": 0.662208080291748, + "learning_rate": 3.676898272346266e-05, + "loss": 0.7831, + "step": 2715 + }, + { + "epoch": 1.7696811971372806, + "grad_norm": 0.891823410987854, + "learning_rate": 3.6723418324222126e-05, + "loss": 0.8045, + "step": 2720 + }, + { + "epoch": 1.7729342875731944, + "grad_norm": 0.6170246005058289, + "learning_rate": 3.667780394997569e-05, + "loss": 0.7881, + "step": 2725 + }, + { + "epoch": 1.7761873780091086, + "grad_norm": 0.6385141015052795, + "learning_rate": 3.663213979516994e-05, + "loss": 0.7926, + "step": 2730 + }, + { + "epoch": 1.779440468445023, + "grad_norm": 0.827928900718689, + "learning_rate": 3.658642605446367e-05, + "loss": 0.7865, + "step": 2735 + }, + { + "epoch": 1.7826935588809367, + "grad_norm": 0.8710260987281799, + "learning_rate": 3.6540662922727034e-05, + "loss": 0.8045, + "step": 2740 + }, + { + "epoch": 1.785946649316851, + "grad_norm": 0.7606410980224609, + "learning_rate": 3.6494850595040745e-05, + "loss": 0.8041, + "step": 2745 + }, + { + "epoch": 1.7891997397527653, + "grad_norm": 0.598753809928894, + "learning_rate": 3.644898926669524e-05, + "loss": 0.8023, + "step": 2750 + }, + { + "epoch": 1.7924528301886793, + "grad_norm": 0.6432173848152161, + "learning_rate": 3.640307913318982e-05, + "loss": 0.8165, + "step": 2755 + }, + { + "epoch": 1.7957059206245933, + "grad_norm": 0.6266681551933289, + "learning_rate": 3.6357120390231825e-05, + "loss": 0.8133, + "step": 2760 + }, + { + "epoch": 1.7989590110605076, + "grad_norm": 1.0969923734664917, + "learning_rate": 3.6311113233735836e-05, + "loss": 0.8016, + "step": 2765 + }, + { + "epoch": 1.8022121014964216, + "grad_norm": 1.2453726530075073, + "learning_rate": 3.626505785982281e-05, + "loss": 0.7391, + "step": 2770 + }, + { + "epoch": 1.8054651919323357, + "grad_norm": 0.8763654828071594, + "learning_rate": 3.6218954464819224e-05, + "loss": 0.7857, + "step": 2775 + }, + { + "epoch": 1.80871828236825, + "grad_norm": 0.7538381218910217, + "learning_rate": 3.6172803245256284e-05, + "loss": 0.7857, + "step": 2780 + }, + { + "epoch": 1.811971372804164, + "grad_norm": 0.6540011763572693, + "learning_rate": 3.612660439786904e-05, + "loss": 0.7982, + "step": 2785 + }, + { + "epoch": 1.815224463240078, + "grad_norm": 0.7887677550315857, + "learning_rate": 3.608035811959561e-05, + "loss": 0.8012, + "step": 2790 + }, + { + "epoch": 1.8184775536759923, + "grad_norm": 0.6847140192985535, + "learning_rate": 3.603406460757627e-05, + "loss": 0.8011, + "step": 2795 + }, + { + "epoch": 1.8217306441119063, + "grad_norm": 0.5890961289405823, + "learning_rate": 3.598772405915264e-05, + "loss": 0.7934, + "step": 2800 + }, + { + "epoch": 1.8249837345478204, + "grad_norm": 0.5724996328353882, + "learning_rate": 3.594133667186688e-05, + "loss": 0.7992, + "step": 2805 + }, + { + "epoch": 1.8282368249837346, + "grad_norm": 0.6108303070068359, + "learning_rate": 3.58949026434608e-05, + "loss": 0.7798, + "step": 2810 + }, + { + "epoch": 1.8314899154196487, + "grad_norm": 1.2907392978668213, + "learning_rate": 3.584842217187503e-05, + "loss": 0.7767, + "step": 2815 + }, + { + "epoch": 1.8347430058555627, + "grad_norm": 0.6735864877700806, + "learning_rate": 3.580189545524818e-05, + "loss": 0.7884, + "step": 2820 + }, + { + "epoch": 1.837996096291477, + "grad_norm": 1.0391428470611572, + "learning_rate": 3.575532269191599e-05, + "loss": 0.7944, + "step": 2825 + }, + { + "epoch": 1.841249186727391, + "grad_norm": 0.7969436645507812, + "learning_rate": 3.57087040804105e-05, + "loss": 0.7756, + "step": 2830 + }, + { + "epoch": 1.844502277163305, + "grad_norm": 0.6109620332717896, + "learning_rate": 3.566203981945921e-05, + "loss": 0.802, + "step": 2835 + }, + { + "epoch": 1.8477553675992193, + "grad_norm": 0.6684165000915527, + "learning_rate": 3.561533010798418e-05, + "loss": 0.7942, + "step": 2840 + }, + { + "epoch": 1.8510084580351334, + "grad_norm": 0.6041696071624756, + "learning_rate": 3.556857514510123e-05, + "loss": 0.8121, + "step": 2845 + }, + { + "epoch": 1.8542615484710474, + "grad_norm": 0.844451904296875, + "learning_rate": 3.5521775130119095e-05, + "loss": 0.7941, + "step": 2850 + }, + { + "epoch": 1.8575146389069617, + "grad_norm": 0.8286609649658203, + "learning_rate": 3.547493026253854e-05, + "loss": 0.7955, + "step": 2855 + }, + { + "epoch": 1.8607677293428757, + "grad_norm": 0.5692376494407654, + "learning_rate": 3.542804074205155e-05, + "loss": 0.7736, + "step": 2860 + }, + { + "epoch": 1.8640208197787898, + "grad_norm": 0.80513995885849, + "learning_rate": 3.5381106768540426e-05, + "loss": 0.819, + "step": 2865 + }, + { + "epoch": 1.867273910214704, + "grad_norm": 0.748439371585846, + "learning_rate": 3.5334128542077004e-05, + "loss": 0.8042, + "step": 2870 + }, + { + "epoch": 1.870527000650618, + "grad_norm": 0.5762147903442383, + "learning_rate": 3.528710626292174e-05, + "loss": 0.7935, + "step": 2875 + }, + { + "epoch": 1.873780091086532, + "grad_norm": 0.8298909664154053, + "learning_rate": 3.5240040131522876e-05, + "loss": 0.796, + "step": 2880 + }, + { + "epoch": 1.8770331815224464, + "grad_norm": 0.805218517780304, + "learning_rate": 3.519293034851559e-05, + "loss": 0.777, + "step": 2885 + }, + { + "epoch": 1.8802862719583604, + "grad_norm": 0.617969810962677, + "learning_rate": 3.514577711472117e-05, + "loss": 0.7925, + "step": 2890 + }, + { + "epoch": 1.8835393623942744, + "grad_norm": 0.6062343120574951, + "learning_rate": 3.509858063114608e-05, + "loss": 0.7853, + "step": 2895 + }, + { + "epoch": 1.8867924528301887, + "grad_norm": 0.8101537823677063, + "learning_rate": 3.505134109898118e-05, + "loss": 0.7601, + "step": 2900 + }, + { + "epoch": 1.8900455432661027, + "grad_norm": 0.8260220885276794, + "learning_rate": 3.500405871960085e-05, + "loss": 0.7946, + "step": 2905 + }, + { + "epoch": 1.8932986337020168, + "grad_norm": 0.8925743699073792, + "learning_rate": 3.495673369456207e-05, + "loss": 0.799, + "step": 2910 + }, + { + "epoch": 1.896551724137931, + "grad_norm": 0.5829617977142334, + "learning_rate": 3.490936622560368e-05, + "loss": 0.8058, + "step": 2915 + }, + { + "epoch": 1.8998048145738453, + "grad_norm": 0.6162145137786865, + "learning_rate": 3.4861956514645386e-05, + "loss": 0.7715, + "step": 2920 + }, + { + "epoch": 1.9030579050097591, + "grad_norm": 0.641473114490509, + "learning_rate": 3.481450476378703e-05, + "loss": 0.7854, + "step": 2925 + }, + { + "epoch": 1.9063109954456734, + "grad_norm": 0.6128933429718018, + "learning_rate": 3.4767011175307595e-05, + "loss": 0.8211, + "step": 2930 + }, + { + "epoch": 1.9095640858815877, + "grad_norm": 0.6579800248146057, + "learning_rate": 3.4719475951664464e-05, + "loss": 0.793, + "step": 2935 + }, + { + "epoch": 1.9128171763175015, + "grad_norm": 0.7673735022544861, + "learning_rate": 3.4671899295492485e-05, + "loss": 0.7965, + "step": 2940 + }, + { + "epoch": 1.9160702667534157, + "grad_norm": 0.6426681876182556, + "learning_rate": 3.462428140960311e-05, + "loss": 0.7817, + "step": 2945 + }, + { + "epoch": 1.91932335718933, + "grad_norm": 1.0133545398712158, + "learning_rate": 3.4576622496983575e-05, + "loss": 0.806, + "step": 2950 + }, + { + "epoch": 1.9225764476252438, + "grad_norm": 0.5959199666976929, + "learning_rate": 3.452892276079599e-05, + "loss": 0.7939, + "step": 2955 + }, + { + "epoch": 1.925829538061158, + "grad_norm": 0.6069105267524719, + "learning_rate": 3.4481182404376485e-05, + "loss": 0.8011, + "step": 2960 + }, + { + "epoch": 1.9290826284970723, + "grad_norm": 0.6653848886489868, + "learning_rate": 3.443340163123437e-05, + "loss": 0.7864, + "step": 2965 + }, + { + "epoch": 1.9323357189329864, + "grad_norm": 0.8323044180870056, + "learning_rate": 3.4385580645051216e-05, + "loss": 0.8096, + "step": 2970 + }, + { + "epoch": 1.9355888093689004, + "grad_norm": 0.7751704454421997, + "learning_rate": 3.433771964968004e-05, + "loss": 0.8101, + "step": 2975 + }, + { + "epoch": 1.9388418998048147, + "grad_norm": 0.7757251262664795, + "learning_rate": 3.4289818849144384e-05, + "loss": 0.7871, + "step": 2980 + }, + { + "epoch": 1.9420949902407287, + "grad_norm": 0.6133014559745789, + "learning_rate": 3.424187844763751e-05, + "loss": 0.7759, + "step": 2985 + }, + { + "epoch": 1.9453480806766428, + "grad_norm": 0.6651121973991394, + "learning_rate": 3.419389864952145e-05, + "loss": 0.8079, + "step": 2990 + }, + { + "epoch": 1.948601171112557, + "grad_norm": 0.6148653626441956, + "learning_rate": 3.414587965932622e-05, + "loss": 0.8075, + "step": 2995 + }, + { + "epoch": 1.951854261548471, + "grad_norm": 0.7674732208251953, + "learning_rate": 3.409782168174887e-05, + "loss": 0.7836, + "step": 3000 + }, + { + "epoch": 1.9551073519843851, + "grad_norm": 0.7626878619194031, + "learning_rate": 3.404972492165267e-05, + "loss": 0.787, + "step": 3005 + }, + { + "epoch": 1.9583604424202994, + "grad_norm": 0.6008310914039612, + "learning_rate": 3.40015895840662e-05, + "loss": 0.8043, + "step": 3010 + }, + { + "epoch": 1.9616135328562134, + "grad_norm": 0.6407262682914734, + "learning_rate": 3.3953415874182495e-05, + "loss": 0.7847, + "step": 3015 + }, + { + "epoch": 1.9648666232921275, + "grad_norm": 0.8387401103973389, + "learning_rate": 3.390520399735818e-05, + "loss": 0.7885, + "step": 3020 + }, + { + "epoch": 1.9681197137280417, + "grad_norm": 0.9059749245643616, + "learning_rate": 3.385695415911253e-05, + "loss": 0.7885, + "step": 3025 + }, + { + "epoch": 1.9713728041639558, + "grad_norm": 0.7701427340507507, + "learning_rate": 3.38086665651267e-05, + "loss": 0.7951, + "step": 3030 + }, + { + "epoch": 1.9746258945998698, + "grad_norm": 0.5787619352340698, + "learning_rate": 3.376034142124277e-05, + "loss": 0.7692, + "step": 3035 + }, + { + "epoch": 1.977878985035784, + "grad_norm": 0.5660680532455444, + "learning_rate": 3.371197893346288e-05, + "loss": 0.7935, + "step": 3040 + }, + { + "epoch": 1.9811320754716981, + "grad_norm": 0.6145839095115662, + "learning_rate": 3.3663579307948365e-05, + "loss": 0.774, + "step": 3045 + }, + { + "epoch": 1.9843851659076122, + "grad_norm": 0.9014440178871155, + "learning_rate": 3.3615142751018894e-05, + "loss": 0.795, + "step": 3050 + }, + { + "epoch": 1.9876382563435264, + "grad_norm": 1.5400235652923584, + "learning_rate": 3.356666946915152e-05, + "loss": 0.8015, + "step": 3055 + }, + { + "epoch": 1.9908913467794405, + "grad_norm": 0.7585124373435974, + "learning_rate": 3.35181596689799e-05, + "loss": 0.7751, + "step": 3060 + }, + { + "epoch": 1.9941444372153545, + "grad_norm": 0.675399124622345, + "learning_rate": 3.3469613557293345e-05, + "loss": 0.7552, + "step": 3065 + }, + { + "epoch": 1.9973975276512688, + "grad_norm": 0.7362255454063416, + "learning_rate": 3.342103134103593e-05, + "loss": 0.7674, + "step": 3070 + }, + { + "epoch": 2.0, + "eval_f1": 0.8125089732634697, + "eval_loss": 0.419677734375, + "eval_precision": 0.812527609369082, + "eval_recall": 0.8124974017274719, + "eval_runtime": 298.2094, + "eval_samples_per_second": 1319.325, + "eval_steps_per_second": 1.291, + "step": 3074 + }, + { + "epoch": 2.000650618087183, + "grad_norm": 0.6836552023887634, + "learning_rate": 3.3372413227305684e-05, + "loss": 0.7623, + "step": 3075 + }, + { + "epoch": 2.003903708523097, + "grad_norm": 0.7960084676742554, + "learning_rate": 3.3323759423353615e-05, + "loss": 0.6671, + "step": 3080 + }, + { + "epoch": 2.007156798959011, + "grad_norm": 0.7370645999908447, + "learning_rate": 3.327507013658291e-05, + "loss": 0.6733, + "step": 3085 + }, + { + "epoch": 2.0104098893949254, + "grad_norm": 1.088987112045288, + "learning_rate": 3.3226345574548e-05, + "loss": 0.6668, + "step": 3090 + }, + { + "epoch": 2.013662979830839, + "grad_norm": 0.9499191045761108, + "learning_rate": 3.317758594495367e-05, + "loss": 0.6749, + "step": 3095 + }, + { + "epoch": 2.0169160702667535, + "grad_norm": 0.8411104083061218, + "learning_rate": 3.312879145565422e-05, + "loss": 0.6643, + "step": 3100 + }, + { + "epoch": 2.0201691607026677, + "grad_norm": 0.8027604818344116, + "learning_rate": 3.307996231465254e-05, + "loss": 0.6604, + "step": 3105 + }, + { + "epoch": 2.0234222511385815, + "grad_norm": 0.8328303694725037, + "learning_rate": 3.303109873009922e-05, + "loss": 0.6921, + "step": 3110 + }, + { + "epoch": 2.026675341574496, + "grad_norm": 0.8239777088165283, + "learning_rate": 3.298220091029171e-05, + "loss": 0.6619, + "step": 3115 + }, + { + "epoch": 2.02992843201041, + "grad_norm": 0.8349932432174683, + "learning_rate": 3.293326906367338e-05, + "loss": 0.6289, + "step": 3120 + }, + { + "epoch": 2.033181522446324, + "grad_norm": 1.271503210067749, + "learning_rate": 3.2884303398832634e-05, + "loss": 0.6643, + "step": 3125 + }, + { + "epoch": 2.036434612882238, + "grad_norm": 0.8596307039260864, + "learning_rate": 3.283530412450207e-05, + "loss": 0.6434, + "step": 3130 + }, + { + "epoch": 2.0396877033181524, + "grad_norm": 0.8263525366783142, + "learning_rate": 3.278627144955754e-05, + "loss": 0.6485, + "step": 3135 + }, + { + "epoch": 2.0429407937540662, + "grad_norm": 0.8425394892692566, + "learning_rate": 3.2737205583017286e-05, + "loss": 0.6428, + "step": 3140 + }, + { + "epoch": 2.0461938841899805, + "grad_norm": 1.253609299659729, + "learning_rate": 3.268810673404102e-05, + "loss": 0.6427, + "step": 3145 + }, + { + "epoch": 2.0494469746258948, + "grad_norm": 0.8926024436950684, + "learning_rate": 3.2638975111929084e-05, + "loss": 0.6748, + "step": 3150 + }, + { + "epoch": 2.0527000650618086, + "grad_norm": 0.959561288356781, + "learning_rate": 3.25898109261215e-05, + "loss": 0.6542, + "step": 3155 + }, + { + "epoch": 2.055953155497723, + "grad_norm": 1.0102553367614746, + "learning_rate": 3.254061438619711e-05, + "loss": 0.6661, + "step": 3160 + }, + { + "epoch": 2.059206245933637, + "grad_norm": 1.3455661535263062, + "learning_rate": 3.249138570187268e-05, + "loss": 0.6656, + "step": 3165 + }, + { + "epoch": 2.062459336369551, + "grad_norm": 1.2887942790985107, + "learning_rate": 3.244212508300201e-05, + "loss": 0.6641, + "step": 3170 + }, + { + "epoch": 2.065712426805465, + "grad_norm": 0.8388302326202393, + "learning_rate": 3.239283273957502e-05, + "loss": 0.6493, + "step": 3175 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 1.096605658531189, + "learning_rate": 3.2343508881716874e-05, + "loss": 0.6664, + "step": 3180 + }, + { + "epoch": 2.0722186076772933, + "grad_norm": 0.8951081037521362, + "learning_rate": 3.229415371968706e-05, + "loss": 0.6772, + "step": 3185 + }, + { + "epoch": 2.0754716981132075, + "grad_norm": 0.8592993021011353, + "learning_rate": 3.2244767463878525e-05, + "loss": 0.6688, + "step": 3190 + }, + { + "epoch": 2.078724788549122, + "grad_norm": 0.8971413969993591, + "learning_rate": 3.219535032481679e-05, + "loss": 0.6739, + "step": 3195 + }, + { + "epoch": 2.0819778789850356, + "grad_norm": 2.3976173400878906, + "learning_rate": 3.214590251315896e-05, + "loss": 0.663, + "step": 3200 + }, + { + "epoch": 2.08523096942095, + "grad_norm": 1.024121642112732, + "learning_rate": 3.209642423969296e-05, + "loss": 0.6618, + "step": 3205 + }, + { + "epoch": 2.088484059856864, + "grad_norm": 0.8724255561828613, + "learning_rate": 3.204691571533652e-05, + "loss": 0.6459, + "step": 3210 + }, + { + "epoch": 2.091737150292778, + "grad_norm": 0.9145408272743225, + "learning_rate": 3.1997377151136356e-05, + "loss": 0.6759, + "step": 3215 + }, + { + "epoch": 2.094990240728692, + "grad_norm": 1.3194828033447266, + "learning_rate": 3.194780875826723e-05, + "loss": 0.6687, + "step": 3220 + }, + { + "epoch": 2.0982433311646065, + "grad_norm": 1.0560972690582275, + "learning_rate": 3.189821074803103e-05, + "loss": 0.6554, + "step": 3225 + }, + { + "epoch": 2.1014964216005203, + "grad_norm": 0.8973615765571594, + "learning_rate": 3.1848583331855954e-05, + "loss": 0.6554, + "step": 3230 + }, + { + "epoch": 2.1047495120364346, + "grad_norm": 1.0747287273406982, + "learning_rate": 3.1808860368799674e-05, + "loss": 0.6729, + "step": 3235 + }, + { + "epoch": 2.108002602472349, + "grad_norm": 1.3772774934768677, + "learning_rate": 3.1759180555133126e-05, + "loss": 0.6449, + "step": 3240 + }, + { + "epoch": 2.1112556929082626, + "grad_norm": 0.8960999250411987, + "learning_rate": 3.170947192819057e-05, + "loss": 0.6678, + "step": 3245 + }, + { + "epoch": 2.114508783344177, + "grad_norm": 0.891995906829834, + "learning_rate": 3.165973469987168e-05, + "loss": 0.6542, + "step": 3250 + }, + { + "epoch": 2.117761873780091, + "grad_norm": 0.8801698684692383, + "learning_rate": 3.160996908219812e-05, + "loss": 0.6588, + "step": 3255 + }, + { + "epoch": 2.121014964216005, + "grad_norm": 1.0145418643951416, + "learning_rate": 3.1560175287312534e-05, + "loss": 0.6561, + "step": 3260 + }, + { + "epoch": 2.1242680546519193, + "grad_norm": 1.2028182744979858, + "learning_rate": 3.151035352747767e-05, + "loss": 0.6689, + "step": 3265 + }, + { + "epoch": 2.1275211450878335, + "grad_norm": 0.9466059803962708, + "learning_rate": 3.1460504015075525e-05, + "loss": 0.6518, + "step": 3270 + }, + { + "epoch": 2.130774235523748, + "grad_norm": 0.938401460647583, + "learning_rate": 3.141062696260636e-05, + "loss": 0.6649, + "step": 3275 + }, + { + "epoch": 2.1340273259596616, + "grad_norm": 2.2599849700927734, + "learning_rate": 3.1360722582687876e-05, + "loss": 0.6607, + "step": 3280 + }, + { + "epoch": 2.137280416395576, + "grad_norm": 0.8594357371330261, + "learning_rate": 3.1310791088054225e-05, + "loss": 0.6572, + "step": 3285 + }, + { + "epoch": 2.14053350683149, + "grad_norm": 1.1824400424957275, + "learning_rate": 3.126083269155517e-05, + "loss": 0.6812, + "step": 3290 + }, + { + "epoch": 2.143786597267404, + "grad_norm": 1.1186939477920532, + "learning_rate": 3.121084760615515e-05, + "loss": 0.6653, + "step": 3295 + }, + { + "epoch": 2.147039687703318, + "grad_norm": 0.9989317059516907, + "learning_rate": 3.116083604493236e-05, + "loss": 0.6934, + "step": 3300 + }, + { + "epoch": 2.1502927781392325, + "grad_norm": 0.8350909948348999, + "learning_rate": 3.111079822107788e-05, + "loss": 0.6447, + "step": 3305 + }, + { + "epoch": 2.1535458685751463, + "grad_norm": 0.9244917035102844, + "learning_rate": 3.106073434789472e-05, + "loss": 0.6492, + "step": 3310 + }, + { + "epoch": 2.1567989590110606, + "grad_norm": 0.9779621362686157, + "learning_rate": 3.1010644638796956e-05, + "loss": 0.6679, + "step": 3315 + }, + { + "epoch": 2.160052049446975, + "grad_norm": 1.2001359462738037, + "learning_rate": 3.096052930730877e-05, + "loss": 0.6752, + "step": 3320 + }, + { + "epoch": 2.1633051398828886, + "grad_norm": 0.8512766361236572, + "learning_rate": 3.091038856706361e-05, + "loss": 0.6601, + "step": 3325 + }, + { + "epoch": 2.166558230318803, + "grad_norm": 3.723388433456421, + "learning_rate": 3.086022263180318e-05, + "loss": 0.6976, + "step": 3330 + }, + { + "epoch": 2.169811320754717, + "grad_norm": 0.9627018570899963, + "learning_rate": 3.081003171537665e-05, + "loss": 0.666, + "step": 3335 + }, + { + "epoch": 2.173064411190631, + "grad_norm": 0.9980655312538147, + "learning_rate": 3.075981603173963e-05, + "loss": 0.6729, + "step": 3340 + }, + { + "epoch": 2.1763175016265452, + "grad_norm": 1.0389659404754639, + "learning_rate": 3.070957579495333e-05, + "loss": 0.6772, + "step": 3345 + }, + { + "epoch": 2.1795705920624595, + "grad_norm": 1.0569038391113281, + "learning_rate": 3.065931121918364e-05, + "loss": 0.6711, + "step": 3350 + }, + { + "epoch": 2.1828236824983733, + "grad_norm": 1.2096091508865356, + "learning_rate": 3.060902251870017e-05, + "loss": 0.6699, + "step": 3355 + }, + { + "epoch": 2.1860767729342876, + "grad_norm": 1.1020735502243042, + "learning_rate": 3.0558709907875385e-05, + "loss": 0.6426, + "step": 3360 + }, + { + "epoch": 2.189329863370202, + "grad_norm": 0.9492762088775635, + "learning_rate": 3.0508373601183695e-05, + "loss": 0.6712, + "step": 3365 + }, + { + "epoch": 2.1925829538061157, + "grad_norm": 0.9105232954025269, + "learning_rate": 3.045801381320048e-05, + "loss": 0.6722, + "step": 3370 + }, + { + "epoch": 2.19583604424203, + "grad_norm": 0.8517335653305054, + "learning_rate": 3.0407630758601256e-05, + "loss": 0.6727, + "step": 3375 + }, + { + "epoch": 2.199089134677944, + "grad_norm": 0.9834687113761902, + "learning_rate": 3.035722465216071e-05, + "loss": 0.6581, + "step": 3380 + }, + { + "epoch": 2.202342225113858, + "grad_norm": 1.1220489740371704, + "learning_rate": 3.030679570875177e-05, + "loss": 0.6648, + "step": 3385 + }, + { + "epoch": 2.2055953155497723, + "grad_norm": 1.169081211090088, + "learning_rate": 3.0256344143344765e-05, + "loss": 0.6567, + "step": 3390 + }, + { + "epoch": 2.2088484059856865, + "grad_norm": 1.303977131843567, + "learning_rate": 3.02058701710064e-05, + "loss": 0.6429, + "step": 3395 + }, + { + "epoch": 2.2121014964216004, + "grad_norm": 1.1148353815078735, + "learning_rate": 3.0155374006898946e-05, + "loss": 0.686, + "step": 3400 + }, + { + "epoch": 2.2153545868575146, + "grad_norm": 1.0681456327438354, + "learning_rate": 3.010485586627924e-05, + "loss": 0.6585, + "step": 3405 + }, + { + "epoch": 2.218607677293429, + "grad_norm": 1.259251356124878, + "learning_rate": 3.005431596449782e-05, + "loss": 0.6525, + "step": 3410 + }, + { + "epoch": 2.2218607677293427, + "grad_norm": 1.041143774986267, + "learning_rate": 3.0003754516997984e-05, + "loss": 0.6983, + "step": 3415 + }, + { + "epoch": 2.225113858165257, + "grad_norm": 0.8610508441925049, + "learning_rate": 2.9953171739314867e-05, + "loss": 0.6491, + "step": 3420 + }, + { + "epoch": 2.2283669486011712, + "grad_norm": 1.259366750717163, + "learning_rate": 2.9902567847074537e-05, + "loss": 0.6771, + "step": 3425 + }, + { + "epoch": 2.231620039037085, + "grad_norm": 0.8394737243652344, + "learning_rate": 2.9851943055993088e-05, + "loss": 0.6659, + "step": 3430 + }, + { + "epoch": 2.2348731294729993, + "grad_norm": 0.8857366442680359, + "learning_rate": 2.980129758187567e-05, + "loss": 0.6525, + "step": 3435 + }, + { + "epoch": 2.2381262199089136, + "grad_norm": 1.0085824728012085, + "learning_rate": 2.9750631640615617e-05, + "loss": 0.6437, + "step": 3440 + }, + { + "epoch": 2.2413793103448274, + "grad_norm": 0.8579273819923401, + "learning_rate": 2.969994544819352e-05, + "loss": 0.6583, + "step": 3445 + }, + { + "epoch": 2.2446324007807417, + "grad_norm": 1.7075871229171753, + "learning_rate": 2.9649239220676285e-05, + "loss": 0.6572, + "step": 3450 + }, + { + "epoch": 2.247885491216656, + "grad_norm": 1.0663272142410278, + "learning_rate": 2.959851317421622e-05, + "loss": 0.656, + "step": 3455 + }, + { + "epoch": 2.2511385816525697, + "grad_norm": 1.072970986366272, + "learning_rate": 2.9547767525050142e-05, + "loss": 0.6809, + "step": 3460 + }, + { + "epoch": 2.254391672088484, + "grad_norm": 0.9220450520515442, + "learning_rate": 2.9497002489498393e-05, + "loss": 0.6804, + "step": 3465 + }, + { + "epoch": 2.2576447625243983, + "grad_norm": 1.0572975873947144, + "learning_rate": 2.9446218283964e-05, + "loss": 0.6561, + "step": 3470 + }, + { + "epoch": 2.260897852960312, + "grad_norm": 1.0907764434814453, + "learning_rate": 2.939541512493167e-05, + "loss": 0.6555, + "step": 3475 + }, + { + "epoch": 2.2641509433962264, + "grad_norm": 0.9391184449195862, + "learning_rate": 2.9344593228966925e-05, + "loss": 0.6512, + "step": 3480 + }, + { + "epoch": 2.2674040338321406, + "grad_norm": 0.9960388541221619, + "learning_rate": 2.929375281271517e-05, + "loss": 0.6694, + "step": 3485 + }, + { + "epoch": 2.2706571242680544, + "grad_norm": 0.9363919496536255, + "learning_rate": 2.9242894092900725e-05, + "loss": 0.6748, + "step": 3490 + }, + { + "epoch": 2.2739102147039687, + "grad_norm": 0.9546090364456177, + "learning_rate": 2.9192017286325973e-05, + "loss": 0.6509, + "step": 3495 + }, + { + "epoch": 2.277163305139883, + "grad_norm": 1.040223240852356, + "learning_rate": 2.9141122609870364e-05, + "loss": 0.6519, + "step": 3500 + }, + { + "epoch": 2.280416395575797, + "grad_norm": 0.921008288860321, + "learning_rate": 2.909021028048955e-05, + "loss": 0.6645, + "step": 3505 + }, + { + "epoch": 2.283669486011711, + "grad_norm": 1.0411720275878906, + "learning_rate": 2.9039280515214428e-05, + "loss": 0.6607, + "step": 3510 + }, + { + "epoch": 2.2869225764476253, + "grad_norm": 0.9046562910079956, + "learning_rate": 2.898833353115021e-05, + "loss": 0.6588, + "step": 3515 + }, + { + "epoch": 2.290175666883539, + "grad_norm": 0.932158350944519, + "learning_rate": 2.8937369545475517e-05, + "loss": 0.6718, + "step": 3520 + }, + { + "epoch": 2.2934287573194534, + "grad_norm": 1.0239461660385132, + "learning_rate": 2.8886388775441457e-05, + "loss": 0.6896, + "step": 3525 + }, + { + "epoch": 2.2966818477553677, + "grad_norm": 2.0023818016052246, + "learning_rate": 2.8835391438370664e-05, + "loss": 0.6653, + "step": 3530 + }, + { + "epoch": 2.2999349381912815, + "grad_norm": 1.012035608291626, + "learning_rate": 2.8784377751656416e-05, + "loss": 0.657, + "step": 3535 + }, + { + "epoch": 2.3031880286271957, + "grad_norm": 0.922524631023407, + "learning_rate": 2.873334793276166e-05, + "loss": 0.6805, + "step": 3540 + }, + { + "epoch": 2.30644111906311, + "grad_norm": 1.0384365320205688, + "learning_rate": 2.8682302199218148e-05, + "loss": 0.6643, + "step": 3545 + }, + { + "epoch": 2.3096942094990243, + "grad_norm": 2.3190078735351562, + "learning_rate": 2.8631240768625446e-05, + "loss": 0.6761, + "step": 3550 + }, + { + "epoch": 2.312947299934938, + "grad_norm": 1.3237860202789307, + "learning_rate": 2.8580163858650038e-05, + "loss": 0.6532, + "step": 3555 + }, + { + "epoch": 2.3162003903708523, + "grad_norm": 1.0602989196777344, + "learning_rate": 2.85290716870244e-05, + "loss": 0.6546, + "step": 3560 + }, + { + "epoch": 2.3194534808067666, + "grad_norm": 1.1202350854873657, + "learning_rate": 2.8477964471546077e-05, + "loss": 0.6703, + "step": 3565 + }, + { + "epoch": 2.3227065712426804, + "grad_norm": 0.7759214043617249, + "learning_rate": 2.8426842430076712e-05, + "loss": 0.6569, + "step": 3570 + }, + { + "epoch": 2.3259596616785947, + "grad_norm": 1.013677716255188, + "learning_rate": 2.8375705780541173e-05, + "loss": 0.6719, + "step": 3575 + }, + { + "epoch": 2.329212752114509, + "grad_norm": 0.9612709283828735, + "learning_rate": 2.8324554740926594e-05, + "loss": 0.6685, + "step": 3580 + }, + { + "epoch": 2.3324658425504228, + "grad_norm": 0.9653801321983337, + "learning_rate": 2.827338952928146e-05, + "loss": 0.6578, + "step": 3585 + }, + { + "epoch": 2.335718932986337, + "grad_norm": 0.9308706521987915, + "learning_rate": 2.8222210363714653e-05, + "loss": 0.6446, + "step": 3590 + }, + { + "epoch": 2.3389720234222513, + "grad_norm": 0.9803590774536133, + "learning_rate": 2.8171017462394546e-05, + "loss": 0.6395, + "step": 3595 + }, + { + "epoch": 2.342225113858165, + "grad_norm": 1.5392948389053345, + "learning_rate": 2.8119811043548063e-05, + "loss": 0.6452, + "step": 3600 + }, + { + "epoch": 2.3454782042940794, + "grad_norm": 1.1688051223754883, + "learning_rate": 2.806859132545975e-05, + "loss": 0.6619, + "step": 3605 + }, + { + "epoch": 2.3487312947299936, + "grad_norm": 2.6195671558380127, + "learning_rate": 2.801735852647086e-05, + "loss": 0.6603, + "step": 3610 + }, + { + "epoch": 2.3519843851659075, + "grad_norm": 0.9085677266120911, + "learning_rate": 2.79661128649784e-05, + "loss": 0.6562, + "step": 3615 + }, + { + "epoch": 2.3552374756018217, + "grad_norm": 1.1864854097366333, + "learning_rate": 2.791485455943419e-05, + "loss": 0.6566, + "step": 3620 + }, + { + "epoch": 2.358490566037736, + "grad_norm": 1.0113550424575806, + "learning_rate": 2.7863583828343964e-05, + "loss": 0.6555, + "step": 3625 + }, + { + "epoch": 2.36174365647365, + "grad_norm": 0.8967930674552917, + "learning_rate": 2.7812300890266442e-05, + "loss": 0.6351, + "step": 3630 + }, + { + "epoch": 2.364996746909564, + "grad_norm": 0.9505155682563782, + "learning_rate": 2.7761005963812337e-05, + "loss": 0.6717, + "step": 3635 + }, + { + "epoch": 2.3682498373454783, + "grad_norm": 17.054515838623047, + "learning_rate": 2.7709699267643503e-05, + "loss": 0.6866, + "step": 3640 + }, + { + "epoch": 2.371502927781392, + "grad_norm": 1.044188141822815, + "learning_rate": 2.7658381020471964e-05, + "loss": 0.6564, + "step": 3645 + }, + { + "epoch": 2.3747560182173064, + "grad_norm": 1.0670133829116821, + "learning_rate": 2.7607051441058958e-05, + "loss": 0.677, + "step": 3650 + }, + { + "epoch": 2.3780091086532207, + "grad_norm": 0.9037075638771057, + "learning_rate": 2.7555710748214064e-05, + "loss": 0.6675, + "step": 3655 + }, + { + "epoch": 2.3812621990891345, + "grad_norm": 1.2479665279388428, + "learning_rate": 2.75043591607942e-05, + "loss": 0.6359, + "step": 3660 + }, + { + "epoch": 2.3845152895250488, + "grad_norm": 1.097076654434204, + "learning_rate": 2.7452996897702765e-05, + "loss": 0.6477, + "step": 3665 + }, + { + "epoch": 2.387768379960963, + "grad_norm": 0.9353396892547607, + "learning_rate": 2.7401624177888636e-05, + "loss": 0.6452, + "step": 3670 + }, + { + "epoch": 2.391021470396877, + "grad_norm": 1.2035560607910156, + "learning_rate": 2.7350241220345274e-05, + "loss": 0.6522, + "step": 3675 + }, + { + "epoch": 2.394274560832791, + "grad_norm": 1.0560545921325684, + "learning_rate": 2.729884824410979e-05, + "loss": 0.6655, + "step": 3680 + }, + { + "epoch": 2.3975276512687054, + "grad_norm": 1.2608094215393066, + "learning_rate": 2.724744546826199e-05, + "loss": 0.6566, + "step": 3685 + }, + { + "epoch": 2.4007807417046196, + "grad_norm": 1.0126628875732422, + "learning_rate": 2.719603311192347e-05, + "loss": 0.6596, + "step": 3690 + }, + { + "epoch": 2.4040338321405335, + "grad_norm": 0.8379278182983398, + "learning_rate": 2.7144611394256653e-05, + "loss": 0.6581, + "step": 3695 + }, + { + "epoch": 2.4072869225764477, + "grad_norm": 1.0668292045593262, + "learning_rate": 2.7093180534463863e-05, + "loss": 0.6623, + "step": 3700 + }, + { + "epoch": 2.410540013012362, + "grad_norm": 2.791311264038086, + "learning_rate": 2.7041740751786408e-05, + "loss": 0.6364, + "step": 3705 + }, + { + "epoch": 2.413793103448276, + "grad_norm": 1.5181926488876343, + "learning_rate": 2.6990292265503646e-05, + "loss": 0.6522, + "step": 3710 + }, + { + "epoch": 2.41704619388419, + "grad_norm": 1.028696894645691, + "learning_rate": 2.6938835294931996e-05, + "loss": 0.6755, + "step": 3715 + }, + { + "epoch": 2.4202992843201043, + "grad_norm": 2.595792531967163, + "learning_rate": 2.6887370059424078e-05, + "loss": 0.6704, + "step": 3720 + }, + { + "epoch": 2.423552374756018, + "grad_norm": 1.041309118270874, + "learning_rate": 2.6835896778367738e-05, + "loss": 0.6489, + "step": 3725 + }, + { + "epoch": 2.4268054651919324, + "grad_norm": 1.087664246559143, + "learning_rate": 2.6784415671185104e-05, + "loss": 0.6521, + "step": 3730 + }, + { + "epoch": 2.4300585556278467, + "grad_norm": 1.149911880493164, + "learning_rate": 2.6732926957331688e-05, + "loss": 0.6461, + "step": 3735 + }, + { + "epoch": 2.4333116460637605, + "grad_norm": 0.9765056371688843, + "learning_rate": 2.668143085629541e-05, + "loss": 0.6408, + "step": 3740 + }, + { + "epoch": 2.4365647364996748, + "grad_norm": 1.053946614265442, + "learning_rate": 2.6629927587595688e-05, + "loss": 0.658, + "step": 3745 + }, + { + "epoch": 2.439817826935589, + "grad_norm": 1.1712669134140015, + "learning_rate": 2.65784173707825e-05, + "loss": 0.6504, + "step": 3750 + }, + { + "epoch": 2.443070917371503, + "grad_norm": 0.9881893396377563, + "learning_rate": 2.6526900425435425e-05, + "loss": 0.6709, + "step": 3755 + }, + { + "epoch": 2.446324007807417, + "grad_norm": 0.8471539616584778, + "learning_rate": 2.6475376971162734e-05, + "loss": 0.6754, + "step": 3760 + }, + { + "epoch": 2.4495770982433314, + "grad_norm": 0.9622436165809631, + "learning_rate": 2.642384722760046e-05, + "loss": 0.6597, + "step": 3765 + }, + { + "epoch": 2.452830188679245, + "grad_norm": 0.7975105047225952, + "learning_rate": 2.6372311414411427e-05, + "loss": 0.662, + "step": 3770 + }, + { + "epoch": 2.4560832791151594, + "grad_norm": 1.1026084423065186, + "learning_rate": 2.6320769751284335e-05, + "loss": 0.6651, + "step": 3775 + }, + { + "epoch": 2.4593363695510737, + "grad_norm": 1.0752489566802979, + "learning_rate": 2.6269222457932824e-05, + "loss": 0.6366, + "step": 3780 + }, + { + "epoch": 2.4625894599869875, + "grad_norm": 0.8716150522232056, + "learning_rate": 2.621766975409453e-05, + "loss": 0.6653, + "step": 3785 + }, + { + "epoch": 2.465842550422902, + "grad_norm": 1.0649757385253906, + "learning_rate": 2.616611185953018e-05, + "loss": 0.6869, + "step": 3790 + }, + { + "epoch": 2.469095640858816, + "grad_norm": 1.258510947227478, + "learning_rate": 2.6114548994022576e-05, + "loss": 0.6564, + "step": 3795 + }, + { + "epoch": 2.47234873129473, + "grad_norm": 0.9622224569320679, + "learning_rate": 2.6062981377375762e-05, + "loss": 0.6615, + "step": 3800 + }, + { + "epoch": 2.475601821730644, + "grad_norm": 2.6835622787475586, + "learning_rate": 2.6011409229414003e-05, + "loss": 0.6793, + "step": 3805 + }, + { + "epoch": 2.4788549121665584, + "grad_norm": 1.3989510536193848, + "learning_rate": 2.59598327699809e-05, + "loss": 0.6462, + "step": 3810 + }, + { + "epoch": 2.482108002602472, + "grad_norm": 0.9427128434181213, + "learning_rate": 2.5908252218938423e-05, + "loss": 0.6489, + "step": 3815 + }, + { + "epoch": 2.4853610930383865, + "grad_norm": 1.0668915510177612, + "learning_rate": 2.585666779616598e-05, + "loss": 0.6633, + "step": 3820 + }, + { + "epoch": 2.4886141834743007, + "grad_norm": 0.9310431480407715, + "learning_rate": 2.5805079721559494e-05, + "loss": 0.6798, + "step": 3825 + }, + { + "epoch": 2.4918672739102146, + "grad_norm": 1.0935829877853394, + "learning_rate": 2.5753488215030448e-05, + "loss": 0.669, + "step": 3830 + }, + { + "epoch": 2.495120364346129, + "grad_norm": 0.8651193380355835, + "learning_rate": 2.5701893496504953e-05, + "loss": 0.6789, + "step": 3835 + }, + { + "epoch": 2.498373454782043, + "grad_norm": 1.0802959203720093, + "learning_rate": 2.5650295785922817e-05, + "loss": 0.6656, + "step": 3840 + }, + { + "epoch": 2.501626545217957, + "grad_norm": 1.0248281955718994, + "learning_rate": 2.5598695303236615e-05, + "loss": 0.6506, + "step": 3845 + }, + { + "epoch": 2.504879635653871, + "grad_norm": 1.0692211389541626, + "learning_rate": 2.5547092268410703e-05, + "loss": 0.6667, + "step": 3850 + }, + { + "epoch": 2.5081327260897854, + "grad_norm": 1.2791413068771362, + "learning_rate": 2.5495486901420362e-05, + "loss": 0.6506, + "step": 3855 + }, + { + "epoch": 2.5113858165256993, + "grad_norm": 1.1521552801132202, + "learning_rate": 2.5443879422250767e-05, + "loss": 0.6698, + "step": 3860 + }, + { + "epoch": 2.5146389069616135, + "grad_norm": 0.9791963696479797, + "learning_rate": 2.539227005089614e-05, + "loss": 0.6732, + "step": 3865 + }, + { + "epoch": 2.517891997397528, + "grad_norm": 0.9954161643981934, + "learning_rate": 2.5340659007358742e-05, + "loss": 0.6599, + "step": 3870 + }, + { + "epoch": 2.5211450878334416, + "grad_norm": 1.1552543640136719, + "learning_rate": 2.5289046511647972e-05, + "loss": 0.6849, + "step": 3875 + }, + { + "epoch": 2.524398178269356, + "grad_norm": 0.9102405905723572, + "learning_rate": 2.523743278377943e-05, + "loss": 0.6375, + "step": 3880 + }, + { + "epoch": 2.52765126870527, + "grad_norm": 1.0647556781768799, + "learning_rate": 2.518581804377394e-05, + "loss": 0.6585, + "step": 3885 + }, + { + "epoch": 2.530904359141184, + "grad_norm": 1.022453784942627, + "learning_rate": 2.5134202511656658e-05, + "loss": 0.6667, + "step": 3890 + }, + { + "epoch": 2.534157449577098, + "grad_norm": 0.9109323024749756, + "learning_rate": 2.5082586407456134e-05, + "loss": 0.6754, + "step": 3895 + }, + { + "epoch": 2.5374105400130125, + "grad_norm": 0.8672389388084412, + "learning_rate": 2.5030969951203316e-05, + "loss": 0.6432, + "step": 3900 + }, + { + "epoch": 2.5406636304489263, + "grad_norm": 1.022900938987732, + "learning_rate": 2.4979353362930685e-05, + "loss": 0.6512, + "step": 3905 + }, + { + "epoch": 2.5439167208848406, + "grad_norm": 0.9560525417327881, + "learning_rate": 2.492773686267128e-05, + "loss": 0.6528, + "step": 3910 + }, + { + "epoch": 2.547169811320755, + "grad_norm": 1.1724059581756592, + "learning_rate": 2.4876120670457754e-05, + "loss": 0.7026, + "step": 3915 + }, + { + "epoch": 2.5504229017566686, + "grad_norm": 0.9830509424209595, + "learning_rate": 2.482450500632145e-05, + "loss": 0.6411, + "step": 3920 + }, + { + "epoch": 2.553675992192583, + "grad_norm": 0.8978729844093323, + "learning_rate": 2.477289009029147e-05, + "loss": 0.6656, + "step": 3925 + }, + { + "epoch": 2.556929082628497, + "grad_norm": 2.0030722618103027, + "learning_rate": 2.4721276142393714e-05, + "loss": 0.6554, + "step": 3930 + }, + { + "epoch": 2.560182173064411, + "grad_norm": 14.189367294311523, + "learning_rate": 2.4669663382649967e-05, + "loss": 0.6196, + "step": 3935 + }, + { + "epoch": 2.5634352635003252, + "grad_norm": 1.2099864482879639, + "learning_rate": 2.4618052031076933e-05, + "loss": 0.651, + "step": 3940 + }, + { + "epoch": 2.5666883539362395, + "grad_norm": 0.9091722965240479, + "learning_rate": 2.4566442307685325e-05, + "loss": 0.6533, + "step": 3945 + }, + { + "epoch": 2.5699414443721533, + "grad_norm": 1.1715190410614014, + "learning_rate": 2.4514834432478927e-05, + "loss": 0.6578, + "step": 3950 + }, + { + "epoch": 2.5731945348080676, + "grad_norm": 0.9423794150352478, + "learning_rate": 2.4463228625453607e-05, + "loss": 0.665, + "step": 3955 + }, + { + "epoch": 2.576447625243982, + "grad_norm": 0.8811748027801514, + "learning_rate": 2.4411625106596457e-05, + "loss": 0.6589, + "step": 3960 + }, + { + "epoch": 2.5797007156798957, + "grad_norm": 0.9299075603485107, + "learning_rate": 2.43600240958848e-05, + "loss": 0.6568, + "step": 3965 + }, + { + "epoch": 2.58295380611581, + "grad_norm": 0.9680789709091187, + "learning_rate": 2.4308425813285255e-05, + "loss": 0.654, + "step": 3970 + }, + { + "epoch": 2.586206896551724, + "grad_norm": 1.6087170839309692, + "learning_rate": 2.425683047875282e-05, + "loss": 0.6812, + "step": 3975 + }, + { + "epoch": 2.589459986987638, + "grad_norm": 0.9408166408538818, + "learning_rate": 2.420523831222994e-05, + "loss": 0.6292, + "step": 3980 + }, + { + "epoch": 2.5927130774235523, + "grad_norm": 0.9801003336906433, + "learning_rate": 2.4153649533645545e-05, + "loss": 0.6536, + "step": 3985 + }, + { + "epoch": 2.5959661678594665, + "grad_norm": 0.908278226852417, + "learning_rate": 2.4102064362914108e-05, + "loss": 0.6494, + "step": 3990 + }, + { + "epoch": 2.5992192582953804, + "grad_norm": 2.5500690937042236, + "learning_rate": 2.4050483019934737e-05, + "loss": 0.6338, + "step": 3995 + }, + { + "epoch": 2.6024723487312946, + "grad_norm": 1.1047178506851196, + "learning_rate": 2.3998905724590237e-05, + "loss": 0.656, + "step": 4000 + }, + { + "epoch": 2.605725439167209, + "grad_norm": 1.0765790939331055, + "learning_rate": 2.3947332696746122e-05, + "loss": 0.6445, + "step": 4005 + }, + { + "epoch": 2.6089785296031227, + "grad_norm": 0.9311931133270264, + "learning_rate": 2.3895764156249746e-05, + "loss": 0.6472, + "step": 4010 + }, + { + "epoch": 2.612231620039037, + "grad_norm": 1.5003751516342163, + "learning_rate": 2.3844200322929323e-05, + "loss": 0.6713, + "step": 4015 + }, + { + "epoch": 2.6154847104749512, + "grad_norm": 0.8282221555709839, + "learning_rate": 2.3792641416592994e-05, + "loss": 0.6709, + "step": 4020 + }, + { + "epoch": 2.618737800910865, + "grad_norm": 1.0039994716644287, + "learning_rate": 2.3741087657027912e-05, + "loss": 0.6723, + "step": 4025 + }, + { + "epoch": 2.6219908913467793, + "grad_norm": 1.1094609498977661, + "learning_rate": 2.3689539263999286e-05, + "loss": 0.6519, + "step": 4030 + }, + { + "epoch": 2.6252439817826936, + "grad_norm": 0.9982606768608093, + "learning_rate": 2.3637996457249434e-05, + "loss": 0.6444, + "step": 4035 + }, + { + "epoch": 2.6284970722186074, + "grad_norm": 0.9692511558532715, + "learning_rate": 2.3586459456496877e-05, + "loss": 0.6525, + "step": 4040 + }, + { + "epoch": 2.6317501626545217, + "grad_norm": 1.5806814432144165, + "learning_rate": 2.3534928481435388e-05, + "loss": 0.6756, + "step": 4045 + }, + { + "epoch": 2.635003253090436, + "grad_norm": 0.9672693014144897, + "learning_rate": 2.348340375173303e-05, + "loss": 0.635, + "step": 4050 + }, + { + "epoch": 2.63825634352635, + "grad_norm": 0.9741607904434204, + "learning_rate": 2.3442188612229703e-05, + "loss": 0.6786, + "step": 4055 + }, + { + "epoch": 2.641509433962264, + "grad_norm": 1.4344401359558105, + "learning_rate": 2.3390675677651777e-05, + "loss": 0.6695, + "step": 4060 + }, + { + "epoch": 2.6447625243981783, + "grad_norm": 0.9737870693206787, + "learning_rate": 2.3339169603358997e-05, + "loss": 0.6562, + "step": 4065 + }, + { + "epoch": 2.6480156148340925, + "grad_norm": 1.122745156288147, + "learning_rate": 2.328767060891328e-05, + "loss": 0.6428, + "step": 4070 + }, + { + "epoch": 2.6512687052700064, + "grad_norm": 1.1388943195343018, + "learning_rate": 2.323617891384638e-05, + "loss": 0.6592, + "step": 4075 + }, + { + "epoch": 2.6545217957059206, + "grad_norm": 1.3511914014816284, + "learning_rate": 2.3184694737658942e-05, + "loss": 0.6811, + "step": 4080 + }, + { + "epoch": 2.657774886141835, + "grad_norm": 0.9977470636367798, + "learning_rate": 2.3133218299819536e-05, + "loss": 0.6489, + "step": 4085 + }, + { + "epoch": 2.6610279765777487, + "grad_norm": 6.271300792694092, + "learning_rate": 2.308174981976377e-05, + "loss": 0.6539, + "step": 4090 + }, + { + "epoch": 2.664281067013663, + "grad_norm": 1.12664794921875, + "learning_rate": 2.3030289516893306e-05, + "loss": 0.6874, + "step": 4095 + }, + { + "epoch": 2.6675341574495772, + "grad_norm": 1.1613657474517822, + "learning_rate": 2.2978837610574964e-05, + "loss": 0.649, + "step": 4100 + }, + { + "epoch": 2.6707872478854915, + "grad_norm": 0.9490109086036682, + "learning_rate": 2.2927394320139765e-05, + "loss": 0.6316, + "step": 4105 + }, + { + "epoch": 2.6740403383214053, + "grad_norm": 1.0231504440307617, + "learning_rate": 2.2875959864882002e-05, + "loss": 0.6496, + "step": 4110 + }, + { + "epoch": 2.6772934287573196, + "grad_norm": 1.2208635807037354, + "learning_rate": 2.2824534464058314e-05, + "loss": 0.6763, + "step": 4115 + }, + { + "epoch": 2.680546519193234, + "grad_norm": 1.1819181442260742, + "learning_rate": 2.2773118336886724e-05, + "loss": 0.6631, + "step": 4120 + }, + { + "epoch": 2.6837996096291477, + "grad_norm": 0.9770584106445312, + "learning_rate": 2.2721711702545735e-05, + "loss": 0.6727, + "step": 4125 + }, + { + "epoch": 2.687052700065062, + "grad_norm": 0.9494980573654175, + "learning_rate": 2.267031478017339e-05, + "loss": 0.6555, + "step": 4130 + }, + { + "epoch": 2.690305790500976, + "grad_norm": 1.0622323751449585, + "learning_rate": 2.2618927788866316e-05, + "loss": 0.6688, + "step": 4135 + }, + { + "epoch": 2.69355888093689, + "grad_norm": 1.1738066673278809, + "learning_rate": 2.2567550947678812e-05, + "loss": 0.6665, + "step": 4140 + }, + { + "epoch": 2.6968119713728043, + "grad_norm": 0.9498983025550842, + "learning_rate": 2.2516184475621915e-05, + "loss": 0.6593, + "step": 4145 + }, + { + "epoch": 2.7000650618087185, + "grad_norm": 0.9943968057632446, + "learning_rate": 2.246482859166245e-05, + "loss": 0.6394, + "step": 4150 + }, + { + "epoch": 2.7033181522446323, + "grad_norm": 0.9715344905853271, + "learning_rate": 2.2413483514722117e-05, + "loss": 0.6451, + "step": 4155 + }, + { + "epoch": 2.7065712426805466, + "grad_norm": 0.9957199692726135, + "learning_rate": 2.2362149463676536e-05, + "loss": 0.654, + "step": 4160 + }, + { + "epoch": 2.709824333116461, + "grad_norm": 1.3196548223495483, + "learning_rate": 2.231082665735433e-05, + "loss": 0.6377, + "step": 4165 + }, + { + "epoch": 2.7130774235523747, + "grad_norm": 1.0311577320098877, + "learning_rate": 2.22595153145362e-05, + "loss": 0.6532, + "step": 4170 + }, + { + "epoch": 2.716330513988289, + "grad_norm": 0.967720627784729, + "learning_rate": 2.220821565395395e-05, + "loss": 0.6723, + "step": 4175 + }, + { + "epoch": 2.719583604424203, + "grad_norm": 0.9087415933609009, + "learning_rate": 2.215692789428962e-05, + "loss": 0.6738, + "step": 4180 + }, + { + "epoch": 2.722836694860117, + "grad_norm": 0.9315296411514282, + "learning_rate": 2.21056522541745e-05, + "loss": 0.6727, + "step": 4185 + }, + { + "epoch": 2.7260897852960313, + "grad_norm": 1.8225336074829102, + "learning_rate": 2.2054388952188205e-05, + "loss": 0.6787, + "step": 4190 + }, + { + "epoch": 2.7293428757319456, + "grad_norm": 1.4423229694366455, + "learning_rate": 2.2003138206857782e-05, + "loss": 0.6549, + "step": 4195 + }, + { + "epoch": 2.7325959661678594, + "grad_norm": 0.9800527691841125, + "learning_rate": 2.1951900236656732e-05, + "loss": 0.6622, + "step": 4200 + }, + { + "epoch": 2.7358490566037736, + "grad_norm": 1.0587884187698364, + "learning_rate": 2.1900675260004102e-05, + "loss": 0.6575, + "step": 4205 + }, + { + "epoch": 2.739102147039688, + "grad_norm": 0.9304030537605286, + "learning_rate": 2.1849463495263546e-05, + "loss": 0.6374, + "step": 4210 + }, + { + "epoch": 2.7423552374756017, + "grad_norm": 1.906974196434021, + "learning_rate": 2.1798265160742413e-05, + "loss": 0.6516, + "step": 4215 + }, + { + "epoch": 2.745608327911516, + "grad_norm": 1.2081124782562256, + "learning_rate": 2.1747080474690778e-05, + "loss": 0.6872, + "step": 4220 + }, + { + "epoch": 2.7488614183474303, + "grad_norm": 0.8841726779937744, + "learning_rate": 2.169590965530056e-05, + "loss": 0.6585, + "step": 4225 + }, + { + "epoch": 2.752114508783344, + "grad_norm": 1.3173036575317383, + "learning_rate": 2.1644752920704534e-05, + "loss": 0.6584, + "step": 4230 + }, + { + "epoch": 2.7553675992192583, + "grad_norm": 1.285091519355774, + "learning_rate": 2.1593610488975468e-05, + "loss": 0.6578, + "step": 4235 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 1.0926772356033325, + "learning_rate": 2.1542482578125143e-05, + "loss": 0.6489, + "step": 4240 + }, + { + "epoch": 2.7618737800910864, + "grad_norm": 1.1347663402557373, + "learning_rate": 2.149136940610343e-05, + "loss": 0.6455, + "step": 4245 + }, + { + "epoch": 2.7651268705270007, + "grad_norm": 1.2697584629058838, + "learning_rate": 2.1440271190797403e-05, + "loss": 0.6708, + "step": 4250 + }, + { + "epoch": 2.768379960962915, + "grad_norm": 0.9204789996147156, + "learning_rate": 2.1389188150030344e-05, + "loss": 0.6889, + "step": 4255 + }, + { + "epoch": 2.7716330513988288, + "grad_norm": 1.154973030090332, + "learning_rate": 2.1338120501560862e-05, + "loss": 0.6653, + "step": 4260 + }, + { + "epoch": 2.774886141834743, + "grad_norm": 0.9574093222618103, + "learning_rate": 2.128706846308196e-05, + "loss": 0.6548, + "step": 4265 + }, + { + "epoch": 2.7781392322706573, + "grad_norm": 0.9033051133155823, + "learning_rate": 2.123603225222007e-05, + "loss": 0.6521, + "step": 4270 + }, + { + "epoch": 2.781392322706571, + "grad_norm": 0.9509397745132446, + "learning_rate": 2.11850120865342e-05, + "loss": 0.6542, + "step": 4275 + }, + { + "epoch": 2.7846454131424854, + "grad_norm": 0.9007295370101929, + "learning_rate": 2.1134008183514906e-05, + "loss": 0.6354, + "step": 4280 + }, + { + "epoch": 2.7878985035783996, + "grad_norm": 1.7208142280578613, + "learning_rate": 2.108302076058346e-05, + "loss": 0.6691, + "step": 4285 + }, + { + "epoch": 2.7911515940143135, + "grad_norm": 0.8821661472320557, + "learning_rate": 2.1032050035090865e-05, + "loss": 0.6784, + "step": 4290 + }, + { + "epoch": 2.7944046844502277, + "grad_norm": 0.9296684861183167, + "learning_rate": 2.0981096224316944e-05, + "loss": 0.6635, + "step": 4295 + }, + { + "epoch": 2.797657774886142, + "grad_norm": 1.5931028127670288, + "learning_rate": 2.093015954546942e-05, + "loss": 0.6763, + "step": 4300 + }, + { + "epoch": 2.800910865322056, + "grad_norm": 3.0784621238708496, + "learning_rate": 2.0879240215683e-05, + "loss": 0.6472, + "step": 4305 + }, + { + "epoch": 2.80416395575797, + "grad_norm": 1.1016740798950195, + "learning_rate": 2.0828338452018396e-05, + "loss": 0.6822, + "step": 4310 + }, + { + "epoch": 2.8074170461938843, + "grad_norm": 1.020324945449829, + "learning_rate": 2.0777454471461476e-05, + "loss": 0.6374, + "step": 4315 + }, + { + "epoch": 2.810670136629798, + "grad_norm": 1.3198879957199097, + "learning_rate": 2.0726588490922288e-05, + "loss": 0.6525, + "step": 4320 + }, + { + "epoch": 2.8139232270657124, + "grad_norm": 1.068981647491455, + "learning_rate": 2.0675740727234142e-05, + "loss": 0.6243, + "step": 4325 + }, + { + "epoch": 2.8171763175016267, + "grad_norm": 1.2015876770019531, + "learning_rate": 2.062491139715271e-05, + "loss": 0.6779, + "step": 4330 + }, + { + "epoch": 2.8204294079375405, + "grad_norm": 0.9587366580963135, + "learning_rate": 2.057410071735506e-05, + "loss": 0.6594, + "step": 4335 + }, + { + "epoch": 2.8236824983734548, + "grad_norm": 0.9465182423591614, + "learning_rate": 2.0523308904438775e-05, + "loss": 0.6545, + "step": 4340 + }, + { + "epoch": 2.826935588809369, + "grad_norm": 0.9994252324104309, + "learning_rate": 2.0472536174921017e-05, + "loss": 0.651, + "step": 4345 + }, + { + "epoch": 2.830188679245283, + "grad_norm": 0.9640632271766663, + "learning_rate": 2.0421782745237574e-05, + "loss": 0.6617, + "step": 4350 + }, + { + "epoch": 2.833441769681197, + "grad_norm": 0.8937436938285828, + "learning_rate": 2.0371048831741987e-05, + "loss": 0.6459, + "step": 4355 + }, + { + "epoch": 2.8366948601171114, + "grad_norm": 1.2173950672149658, + "learning_rate": 2.0320334650704594e-05, + "loss": 0.658, + "step": 4360 + }, + { + "epoch": 2.839947950553025, + "grad_norm": 1.0267751216888428, + "learning_rate": 2.0269640418311608e-05, + "loss": 0.6494, + "step": 4365 + }, + { + "epoch": 2.8432010409889394, + "grad_norm": 3.8074207305908203, + "learning_rate": 2.021896635066421e-05, + "loss": 0.6454, + "step": 4370 + }, + { + "epoch": 2.8464541314248537, + "grad_norm": 1.9503079652786255, + "learning_rate": 2.0168312663777638e-05, + "loss": 0.6446, + "step": 4375 + }, + { + "epoch": 2.8497072218607675, + "grad_norm": 0.942142903804779, + "learning_rate": 2.011767957358021e-05, + "loss": 0.6374, + "step": 4380 + }, + { + "epoch": 2.852960312296682, + "grad_norm": 1.3679735660552979, + "learning_rate": 2.0067067295912494e-05, + "loss": 0.6554, + "step": 4385 + }, + { + "epoch": 2.856213402732596, + "grad_norm": 1.217850923538208, + "learning_rate": 2.0016476046526305e-05, + "loss": 0.6332, + "step": 4390 + }, + { + "epoch": 2.85946649316851, + "grad_norm": 0.9636296033859253, + "learning_rate": 1.996590604108383e-05, + "loss": 0.6601, + "step": 4395 + }, + { + "epoch": 2.862719583604424, + "grad_norm": 1.0870169401168823, + "learning_rate": 1.991535749515668e-05, + "loss": 0.6488, + "step": 4400 + }, + { + "epoch": 2.8659726740403384, + "grad_norm": 0.9022223353385925, + "learning_rate": 1.9864830624225005e-05, + "loss": 0.6379, + "step": 4405 + }, + { + "epoch": 2.869225764476252, + "grad_norm": 0.9370593428611755, + "learning_rate": 1.981432564367657e-05, + "loss": 0.6288, + "step": 4410 + }, + { + "epoch": 2.8724788549121665, + "grad_norm": 1.368965744972229, + "learning_rate": 1.976384276880578e-05, + "loss": 0.6723, + "step": 4415 + }, + { + "epoch": 2.8757319453480807, + "grad_norm": 1.0498238801956177, + "learning_rate": 1.971338221481285e-05, + "loss": 0.6348, + "step": 4420 + }, + { + "epoch": 2.8789850357839946, + "grad_norm": 11.295182228088379, + "learning_rate": 1.966294419680283e-05, + "loss": 0.6694, + "step": 4425 + }, + { + "epoch": 2.882238126219909, + "grad_norm": 1.0129636526107788, + "learning_rate": 1.96125289297847e-05, + "loss": 0.6533, + "step": 4430 + }, + { + "epoch": 2.885491216655823, + "grad_norm": 3.2188525199890137, + "learning_rate": 1.9562136628670464e-05, + "loss": 0.6661, + "step": 4435 + }, + { + "epoch": 2.888744307091737, + "grad_norm": 1.130278468132019, + "learning_rate": 1.9511767508274214e-05, + "loss": 0.6489, + "step": 4440 + }, + { + "epoch": 2.891997397527651, + "grad_norm": 1.2190179824829102, + "learning_rate": 1.946142178331124e-05, + "loss": 0.6323, + "step": 4445 + }, + { + "epoch": 2.8952504879635654, + "grad_norm": 0.9841941595077515, + "learning_rate": 1.9411099668397085e-05, + "loss": 0.6623, + "step": 4450 + }, + { + "epoch": 2.8985035783994793, + "grad_norm": 0.9542006850242615, + "learning_rate": 1.9360801378046666e-05, + "loss": 0.6389, + "step": 4455 + }, + { + "epoch": 2.9017566688353935, + "grad_norm": 1.037227988243103, + "learning_rate": 1.931052712667332e-05, + "loss": 0.6504, + "step": 4460 + }, + { + "epoch": 2.905009759271308, + "grad_norm": 1.146600365638733, + "learning_rate": 1.9260277128587936e-05, + "loss": 0.6558, + "step": 4465 + }, + { + "epoch": 2.9082628497072216, + "grad_norm": 0.8806731104850769, + "learning_rate": 1.921005159799798e-05, + "loss": 0.6534, + "step": 4470 + }, + { + "epoch": 2.911515940143136, + "grad_norm": 0.926228404045105, + "learning_rate": 1.915985074900664e-05, + "loss": 0.6416, + "step": 4475 + }, + { + "epoch": 2.91476903057905, + "grad_norm": 3.157443046569824, + "learning_rate": 1.9109674795611898e-05, + "loss": 0.6614, + "step": 4480 + }, + { + "epoch": 2.918022121014964, + "grad_norm": 0.9215693473815918, + "learning_rate": 1.9059523951705585e-05, + "loss": 0.6738, + "step": 4485 + }, + { + "epoch": 2.921275211450878, + "grad_norm": 1.0931910276412964, + "learning_rate": 1.900939843107251e-05, + "loss": 0.6587, + "step": 4490 + }, + { + "epoch": 2.9245283018867925, + "grad_norm": 0.8158690333366394, + "learning_rate": 1.895929844738954e-05, + "loss": 0.6562, + "step": 4495 + }, + { + "epoch": 2.9277813923227067, + "grad_norm": 0.8992384672164917, + "learning_rate": 1.8909224214224662e-05, + "loss": 0.6552, + "step": 4500 + }, + { + "epoch": 2.9310344827586206, + "grad_norm": 0.9259112477302551, + "learning_rate": 1.885917594503611e-05, + "loss": 0.6391, + "step": 4505 + }, + { + "epoch": 2.934287573194535, + "grad_norm": 1.9078396558761597, + "learning_rate": 1.8809153853171426e-05, + "loss": 0.6419, + "step": 4510 + }, + { + "epoch": 2.937540663630449, + "grad_norm": 1.015743374824524, + "learning_rate": 1.875915815186657e-05, + "loss": 0.6437, + "step": 4515 + }, + { + "epoch": 2.940793754066363, + "grad_norm": 0.8620851039886475, + "learning_rate": 1.8709189054244996e-05, + "loss": 0.6496, + "step": 4520 + }, + { + "epoch": 2.944046844502277, + "grad_norm": 0.9002664089202881, + "learning_rate": 1.865924677331677e-05, + "loss": 0.6468, + "step": 4525 + }, + { + "epoch": 2.9472999349381914, + "grad_norm": 1.9784495830535889, + "learning_rate": 1.8609331521977623e-05, + "loss": 0.6707, + "step": 4530 + }, + { + "epoch": 2.9505530253741052, + "grad_norm": 0.902123749256134, + "learning_rate": 1.8559443513008067e-05, + "loss": 0.6545, + "step": 4535 + }, + { + "epoch": 2.9538061158100195, + "grad_norm": 1.239797592163086, + "learning_rate": 1.8509582959072486e-05, + "loss": 0.641, + "step": 4540 + }, + { + "epoch": 2.9570592062459338, + "grad_norm": 1.3703653812408447, + "learning_rate": 1.8459750072718235e-05, + "loss": 0.6373, + "step": 4545 + }, + { + "epoch": 2.960312296681848, + "grad_norm": 3.020883083343506, + "learning_rate": 1.8409945066374706e-05, + "loss": 0.6318, + "step": 4550 + }, + { + "epoch": 2.963565387117762, + "grad_norm": 1.026063084602356, + "learning_rate": 1.8360168152352472e-05, + "loss": 0.6294, + "step": 4555 + }, + { + "epoch": 2.966818477553676, + "grad_norm": 0.9441201090812683, + "learning_rate": 1.8310419542842327e-05, + "loss": 0.6543, + "step": 4560 + }, + { + "epoch": 2.9700715679895904, + "grad_norm": 1.139441728591919, + "learning_rate": 1.826069944991442e-05, + "loss": 0.6433, + "step": 4565 + }, + { + "epoch": 2.973324658425504, + "grad_norm": 1.0269153118133545, + "learning_rate": 1.821100808551735e-05, + "loss": 0.6669, + "step": 4570 + }, + { + "epoch": 2.9765777488614185, + "grad_norm": 0.9721339344978333, + "learning_rate": 1.8161345661477215e-05, + "loss": 0.6481, + "step": 4575 + }, + { + "epoch": 2.9798308392973327, + "grad_norm": 1.0770370960235596, + "learning_rate": 1.811171238949679e-05, + "loss": 0.6532, + "step": 4580 + }, + { + "epoch": 2.9830839297332465, + "grad_norm": 0.8994714021682739, + "learning_rate": 1.8062108481154545e-05, + "loss": 0.6503, + "step": 4585 + }, + { + "epoch": 2.986337020169161, + "grad_norm": 1.0329276323318481, + "learning_rate": 1.801253414790379e-05, + "loss": 0.6295, + "step": 4590 + }, + { + "epoch": 2.989590110605075, + "grad_norm": 5.5768232345581055, + "learning_rate": 1.796298960107177e-05, + "loss": 0.6711, + "step": 4595 + }, + { + "epoch": 2.992843201040989, + "grad_norm": 1.2496637105941772, + "learning_rate": 1.7913475051858744e-05, + "loss": 0.6501, + "step": 4600 + }, + { + "epoch": 2.996096291476903, + "grad_norm": 1.1241545677185059, + "learning_rate": 1.7863990711337093e-05, + "loss": 0.6334, + "step": 4605 + }, + { + "epoch": 2.9993493819128174, + "grad_norm": 1.352886438369751, + "learning_rate": 1.7814536790450437e-05, + "loss": 0.6427, + "step": 4610 + }, + { + "epoch": 3.0, + "eval_f1": 0.812837658080022, + "eval_loss": 0.43994140625, + "eval_precision": 0.8124680334875379, + "eval_recall": 0.8133431620386601, + "eval_runtime": 406.4496, + "eval_samples_per_second": 967.98, + "eval_steps_per_second": 0.947, + "step": 4611 + }, + { + "epoch": 3.0026024723487312, + "grad_norm": 1.0044734477996826, + "learning_rate": 1.7765113500012706e-05, + "loss": 0.5134, + "step": 4615 + }, + { + "epoch": 3.0058555627846455, + "grad_norm": 1.0504697561264038, + "learning_rate": 1.771572105070727e-05, + "loss": 0.4684, + "step": 4620 + }, + { + "epoch": 3.0091086532205593, + "grad_norm": 1.143985390663147, + "learning_rate": 1.766635965308603e-05, + "loss": 0.4485, + "step": 4625 + }, + { + "epoch": 3.0123617436564736, + "grad_norm": 1.933708667755127, + "learning_rate": 1.7617029517568502e-05, + "loss": 0.469, + "step": 4630 + }, + { + "epoch": 3.015614834092388, + "grad_norm": 1.3680862188339233, + "learning_rate": 1.756773085444095e-05, + "loss": 0.4556, + "step": 4635 + }, + { + "epoch": 3.018867924528302, + "grad_norm": 1.3127816915512085, + "learning_rate": 1.7518463873855486e-05, + "loss": 0.4536, + "step": 4640 + }, + { + "epoch": 3.022121014964216, + "grad_norm": 1.4003212451934814, + "learning_rate": 1.746922878582914e-05, + "loss": 0.4553, + "step": 4645 + }, + { + "epoch": 3.02537410540013, + "grad_norm": 1.3466031551361084, + "learning_rate": 1.7420025800243e-05, + "loss": 0.4634, + "step": 4650 + }, + { + "epoch": 3.0286271958360445, + "grad_norm": 1.3510533571243286, + "learning_rate": 1.7370855126841314e-05, + "loss": 0.4385, + "step": 4655 + }, + { + "epoch": 3.0318802862719583, + "grad_norm": 1.4002180099487305, + "learning_rate": 1.732171697523059e-05, + "loss": 0.4455, + "step": 4660 + }, + { + "epoch": 3.0351333767078725, + "grad_norm": 1.432400107383728, + "learning_rate": 1.7272611554878678e-05, + "loss": 0.443, + "step": 4665 + }, + { + "epoch": 3.038386467143787, + "grad_norm": 3.641195774078369, + "learning_rate": 1.722353907511393e-05, + "loss": 0.4351, + "step": 4670 + }, + { + "epoch": 3.0416395575797006, + "grad_norm": 1.3562887907028198, + "learning_rate": 1.717449974512426e-05, + "loss": 0.4452, + "step": 4675 + }, + { + "epoch": 3.044892648015615, + "grad_norm": 1.4275323152542114, + "learning_rate": 1.7125493773956265e-05, + "loss": 0.4424, + "step": 4680 + }, + { + "epoch": 3.048145738451529, + "grad_norm": 1.6079832315444946, + "learning_rate": 1.7076521370514355e-05, + "loss": 0.4532, + "step": 4685 + }, + { + "epoch": 3.051398828887443, + "grad_norm": 1.3562263250350952, + "learning_rate": 1.7027582743559843e-05, + "loss": 0.4417, + "step": 4690 + }, + { + "epoch": 3.0546519193233572, + "grad_norm": 1.7932298183441162, + "learning_rate": 1.6978678101710043e-05, + "loss": 0.4375, + "step": 4695 + }, + { + "epoch": 3.0579050097592715, + "grad_norm": 1.6265188455581665, + "learning_rate": 1.6929807653437412e-05, + "loss": 0.4307, + "step": 4700 + }, + { + "epoch": 3.0611581001951853, + "grad_norm": 1.4614055156707764, + "learning_rate": 1.6880971607068646e-05, + "loss": 0.4275, + "step": 4705 + }, + { + "epoch": 3.0644111906310996, + "grad_norm": 2.0057411193847656, + "learning_rate": 1.6832170170783776e-05, + "loss": 0.4359, + "step": 4710 + }, + { + "epoch": 3.067664281067014, + "grad_norm": 1.3620643615722656, + "learning_rate": 1.6783403552615314e-05, + "loss": 0.4267, + "step": 4715 + }, + { + "epoch": 3.0709173715029277, + "grad_norm": 1.4945169687271118, + "learning_rate": 1.6734671960447333e-05, + "loss": 0.4401, + "step": 4720 + }, + { + "epoch": 3.074170461938842, + "grad_norm": 1.7050446271896362, + "learning_rate": 1.6685975602014604e-05, + "loss": 0.4553, + "step": 4725 + }, + { + "epoch": 3.077423552374756, + "grad_norm": 2.7887024879455566, + "learning_rate": 1.6637314684901713e-05, + "loss": 0.4312, + "step": 4730 + }, + { + "epoch": 3.08067664281067, + "grad_norm": 1.5501539707183838, + "learning_rate": 1.658868941654213e-05, + "loss": 0.4495, + "step": 4735 + }, + { + "epoch": 3.0839297332465843, + "grad_norm": 2.2847344875335693, + "learning_rate": 1.6540100004217402e-05, + "loss": 0.4535, + "step": 4740 + }, + { + "epoch": 3.0871828236824985, + "grad_norm": 1.3970599174499512, + "learning_rate": 1.6491546655056208e-05, + "loss": 0.4329, + "step": 4745 + }, + { + "epoch": 3.0904359141184123, + "grad_norm": 1.7147552967071533, + "learning_rate": 1.644302957603349e-05, + "loss": 0.4465, + "step": 4750 + }, + { + "epoch": 3.0936890045543266, + "grad_norm": 1.4633418321609497, + "learning_rate": 1.6394548973969588e-05, + "loss": 0.4163, + "step": 4755 + }, + { + "epoch": 3.096942094990241, + "grad_norm": 2.099158763885498, + "learning_rate": 1.634610505552934e-05, + "loss": 0.4554, + "step": 4760 + }, + { + "epoch": 3.1001951854261547, + "grad_norm": 1.6046409606933594, + "learning_rate": 1.6297698027221216e-05, + "loss": 0.4331, + "step": 4765 + }, + { + "epoch": 3.103448275862069, + "grad_norm": 1.6139500141143799, + "learning_rate": 1.6249328095396415e-05, + "loss": 0.451, + "step": 4770 + }, + { + "epoch": 3.106701366297983, + "grad_norm": 2.9759113788604736, + "learning_rate": 1.6200995466248014e-05, + "loss": 0.4275, + "step": 4775 + }, + { + "epoch": 3.109954456733897, + "grad_norm": 1.3965107202529907, + "learning_rate": 1.6152700345810063e-05, + "loss": 0.4394, + "step": 4780 + }, + { + "epoch": 3.1132075471698113, + "grad_norm": 1.3388948440551758, + "learning_rate": 1.6104442939956733e-05, + "loss": 0.4496, + "step": 4785 + }, + { + "epoch": 3.1164606376057256, + "grad_norm": 1.3315989971160889, + "learning_rate": 1.6056223454401396e-05, + "loss": 0.4359, + "step": 4790 + }, + { + "epoch": 3.1197137280416394, + "grad_norm": 1.7022862434387207, + "learning_rate": 1.6008042094695825e-05, + "loss": 0.4239, + "step": 4795 + }, + { + "epoch": 3.1229668184775536, + "grad_norm": 4.944558620452881, + "learning_rate": 1.5959899066229218e-05, + "loss": 0.4322, + "step": 4800 + }, + { + "epoch": 3.126219908913468, + "grad_norm": 2.622772693634033, + "learning_rate": 1.5911794574227402e-05, + "loss": 0.4512, + "step": 4805 + }, + { + "epoch": 3.1294729993493817, + "grad_norm": 1.4150971174240112, + "learning_rate": 1.5863728823751923e-05, + "loss": 0.4152, + "step": 4810 + }, + { + "epoch": 3.132726089785296, + "grad_norm": 1.6928874254226685, + "learning_rate": 1.5815702019699168e-05, + "loss": 0.4388, + "step": 4815 + }, + { + "epoch": 3.1359791802212102, + "grad_norm": 1.9819401502609253, + "learning_rate": 1.576771436679952e-05, + "loss": 0.4358, + "step": 4820 + }, + { + "epoch": 3.139232270657124, + "grad_norm": 4.905421733856201, + "learning_rate": 1.5719766069616457e-05, + "loss": 0.4296, + "step": 4825 + }, + { + "epoch": 3.1424853610930383, + "grad_norm": 1.4010788202285767, + "learning_rate": 1.5671857332545685e-05, + "loss": 0.4252, + "step": 4830 + }, + { + "epoch": 3.1457384515289526, + "grad_norm": 1.8857386112213135, + "learning_rate": 1.5623988359814285e-05, + "loss": 0.4344, + "step": 4835 + }, + { + "epoch": 3.1489915419648664, + "grad_norm": 1.8753392696380615, + "learning_rate": 1.5576159355479812e-05, + "loss": 0.4331, + "step": 4840 + }, + { + "epoch": 3.1522446324007807, + "grad_norm": 1.5203431844711304, + "learning_rate": 1.5528370523429465e-05, + "loss": 0.4366, + "step": 4845 + }, + { + "epoch": 3.155497722836695, + "grad_norm": 1.7210590839385986, + "learning_rate": 1.5480622067379176e-05, + "loss": 0.4161, + "step": 4850 + }, + { + "epoch": 3.1587508132726088, + "grad_norm": 1.7494614124298096, + "learning_rate": 1.5432914190872757e-05, + "loss": 0.428, + "step": 4855 + }, + { + "epoch": 3.162003903708523, + "grad_norm": 1.7109274864196777, + "learning_rate": 1.538524709728106e-05, + "loss": 0.423, + "step": 4860 + }, + { + "epoch": 3.1652569941444373, + "grad_norm": 2.6436519622802734, + "learning_rate": 1.533762098980107e-05, + "loss": 0.4308, + "step": 4865 + }, + { + "epoch": 3.168510084580351, + "grad_norm": 6.52752161026001, + "learning_rate": 1.5290036071455055e-05, + "loss": 0.4425, + "step": 4870 + }, + { + "epoch": 3.1717631750162654, + "grad_norm": 1.7809419631958008, + "learning_rate": 1.5242492545089698e-05, + "loss": 0.4444, + "step": 4875 + }, + { + "epoch": 3.1750162654521796, + "grad_norm": 4.83860969543457, + "learning_rate": 1.5194990613375253e-05, + "loss": 0.4315, + "step": 4880 + }, + { + "epoch": 3.178269355888094, + "grad_norm": 1.6642948389053345, + "learning_rate": 1.5147530478804634e-05, + "loss": 0.4348, + "step": 4885 + }, + { + "epoch": 3.1815224463240077, + "grad_norm": 1.948080062866211, + "learning_rate": 1.5100112343692604e-05, + "loss": 0.4334, + "step": 4890 + }, + { + "epoch": 3.184775536759922, + "grad_norm": 1.5025594234466553, + "learning_rate": 1.5052736410174877e-05, + "loss": 0.4436, + "step": 4895 + }, + { + "epoch": 3.1880286271958362, + "grad_norm": 1.6708216667175293, + "learning_rate": 1.5005402880207273e-05, + "loss": 0.446, + "step": 4900 + }, + { + "epoch": 3.19128171763175, + "grad_norm": 3.007356643676758, + "learning_rate": 1.495811195556486e-05, + "loss": 0.42, + "step": 4905 + }, + { + "epoch": 3.1945348080676643, + "grad_norm": 1.6949506998062134, + "learning_rate": 1.4910863837841068e-05, + "loss": 0.4226, + "step": 4910 + }, + { + "epoch": 3.1977878985035786, + "grad_norm": 2.3278913497924805, + "learning_rate": 1.4863658728446864e-05, + "loss": 0.4101, + "step": 4915 + }, + { + "epoch": 3.2010409889394924, + "grad_norm": 1.6102315187454224, + "learning_rate": 1.4816496828609878e-05, + "loss": 0.4139, + "step": 4920 + }, + { + "epoch": 3.2042940793754067, + "grad_norm": 1.5101537704467773, + "learning_rate": 1.476937833937352e-05, + "loss": 0.4361, + "step": 4925 + }, + { + "epoch": 3.207547169811321, + "grad_norm": 1.7118359804153442, + "learning_rate": 1.472230346159619e-05, + "loss": 0.4197, + "step": 4930 + }, + { + "epoch": 3.2108002602472347, + "grad_norm": 2.325700044631958, + "learning_rate": 1.4675272395950345e-05, + "loss": 0.4322, + "step": 4935 + }, + { + "epoch": 3.214053350683149, + "grad_norm": 2.856618642807007, + "learning_rate": 1.46282853429217e-05, + "loss": 0.439, + "step": 4940 + }, + { + "epoch": 3.2173064411190633, + "grad_norm": 1.6420326232910156, + "learning_rate": 1.4581342502808321e-05, + "loss": 0.4078, + "step": 4945 + }, + { + "epoch": 3.220559531554977, + "grad_norm": 1.6206941604614258, + "learning_rate": 1.4534444075719839e-05, + "loss": 0.4031, + "step": 4950 + }, + { + "epoch": 3.2238126219908914, + "grad_norm": 2.0076100826263428, + "learning_rate": 1.4487590261576542e-05, + "loss": 0.428, + "step": 4955 + }, + { + "epoch": 3.2270657124268056, + "grad_norm": 1.8135625123977661, + "learning_rate": 1.4440781260108521e-05, + "loss": 0.4177, + "step": 4960 + }, + { + "epoch": 3.2303188028627194, + "grad_norm": 1.494215488433838, + "learning_rate": 1.4394017270854887e-05, + "loss": 0.4233, + "step": 4965 + }, + { + "epoch": 3.2335718932986337, + "grad_norm": 1.5636515617370605, + "learning_rate": 1.4347298493162823e-05, + "loss": 0.4354, + "step": 4970 + }, + { + "epoch": 3.236824983734548, + "grad_norm": 1.34303879737854, + "learning_rate": 1.4300625126186806e-05, + "loss": 0.4313, + "step": 4975 + }, + { + "epoch": 3.240078074170462, + "grad_norm": 1.4629467725753784, + "learning_rate": 1.4253997368887717e-05, + "loss": 0.4272, + "step": 4980 + }, + { + "epoch": 3.243331164606376, + "grad_norm": 1.7214086055755615, + "learning_rate": 1.4207415420032044e-05, + "loss": 0.435, + "step": 4985 + }, + { + "epoch": 3.2465842550422903, + "grad_norm": 1.5725369453430176, + "learning_rate": 1.4160879478190974e-05, + "loss": 0.4239, + "step": 4990 + }, + { + "epoch": 3.249837345478204, + "grad_norm": 2.499541997909546, + "learning_rate": 1.411438974173957e-05, + "loss": 0.438, + "step": 4995 + }, + { + "epoch": 3.2530904359141184, + "grad_norm": 1.4437288045883179, + "learning_rate": 1.4067946408855953e-05, + "loss": 0.427, + "step": 5000 + }, + { + "epoch": 3.2563435263500327, + "grad_norm": 1.851952075958252, + "learning_rate": 1.4021549677520415e-05, + "loss": 0.4419, + "step": 5005 + }, + { + "epoch": 3.2595966167859465, + "grad_norm": 1.586517572402954, + "learning_rate": 1.3975199745514587e-05, + "loss": 0.4193, + "step": 5010 + }, + { + "epoch": 3.2628497072218607, + "grad_norm": 1.516689419746399, + "learning_rate": 1.392889681042063e-05, + "loss": 0.4273, + "step": 5015 + }, + { + "epoch": 3.266102797657775, + "grad_norm": 1.5535813570022583, + "learning_rate": 1.3882641069620339e-05, + "loss": 0.4326, + "step": 5020 + }, + { + "epoch": 3.269355888093689, + "grad_norm": 2.197828531265259, + "learning_rate": 1.3836432720294329e-05, + "loss": 0.4381, + "step": 5025 + }, + { + "epoch": 3.272608978529603, + "grad_norm": 1.452176570892334, + "learning_rate": 1.3790271959421219e-05, + "loss": 0.426, + "step": 5030 + }, + { + "epoch": 3.2758620689655173, + "grad_norm": 1.550866723060608, + "learning_rate": 1.3744158983776733e-05, + "loss": 0.4284, + "step": 5035 + }, + { + "epoch": 3.279115159401431, + "grad_norm": 1.6225146055221558, + "learning_rate": 1.3698093989932904e-05, + "loss": 0.4319, + "step": 5040 + }, + { + "epoch": 3.2823682498373454, + "grad_norm": 1.6624497175216675, + "learning_rate": 1.3652077174257249e-05, + "loss": 0.4291, + "step": 5045 + }, + { + "epoch": 3.2856213402732597, + "grad_norm": 2.7444093227386475, + "learning_rate": 1.3606108732911882e-05, + "loss": 0.4292, + "step": 5050 + }, + { + "epoch": 3.288874430709174, + "grad_norm": 1.5146102905273438, + "learning_rate": 1.3560188861852702e-05, + "loss": 0.4466, + "step": 5055 + }, + { + "epoch": 3.2921275211450878, + "grad_norm": 1.6835765838623047, + "learning_rate": 1.3514317756828587e-05, + "loss": 0.4188, + "step": 5060 + }, + { + "epoch": 3.295380611581002, + "grad_norm": 1.7336374521255493, + "learning_rate": 1.3468495613380533e-05, + "loss": 0.4331, + "step": 5065 + }, + { + "epoch": 3.2986337020169163, + "grad_norm": 1.7082960605621338, + "learning_rate": 1.3422722626840791e-05, + "loss": 0.4161, + "step": 5070 + }, + { + "epoch": 3.30188679245283, + "grad_norm": 2.2675247192382812, + "learning_rate": 1.3376998992332076e-05, + "loss": 0.4111, + "step": 5075 + }, + { + "epoch": 3.3051398828887444, + "grad_norm": 1.4138269424438477, + "learning_rate": 1.3331324904766745e-05, + "loss": 0.4244, + "step": 5080 + }, + { + "epoch": 3.3083929733246586, + "grad_norm": 2.2490038871765137, + "learning_rate": 1.328570055884592e-05, + "loss": 0.4208, + "step": 5085 + }, + { + "epoch": 3.3116460637605725, + "grad_norm": 1.6177328824996948, + "learning_rate": 1.3240126149058685e-05, + "loss": 0.4227, + "step": 5090 + }, + { + "epoch": 3.3148991541964867, + "grad_norm": 1.7124465703964233, + "learning_rate": 1.3194601869681272e-05, + "loss": 0.4202, + "step": 5095 + }, + { + "epoch": 3.318152244632401, + "grad_norm": 1.7719941139221191, + "learning_rate": 1.3149127914776196e-05, + "loss": 0.417, + "step": 5100 + }, + { + "epoch": 3.321405335068315, + "grad_norm": 1.6123124361038208, + "learning_rate": 1.3103704478191448e-05, + "loss": 0.4398, + "step": 5105 + }, + { + "epoch": 3.324658425504229, + "grad_norm": 1.724311113357544, + "learning_rate": 1.3058331753559688e-05, + "loss": 0.4456, + "step": 5110 + }, + { + "epoch": 3.3279115159401433, + "grad_norm": 1.976486086845398, + "learning_rate": 1.301300993429738e-05, + "loss": 0.4207, + "step": 5115 + }, + { + "epoch": 3.331164606376057, + "grad_norm": 1.5178265571594238, + "learning_rate": 1.296773921360398e-05, + "loss": 0.4138, + "step": 5120 + }, + { + "epoch": 3.3344176968119714, + "grad_norm": 2.115887403488159, + "learning_rate": 1.2922519784461154e-05, + "loss": 0.4237, + "step": 5125 + }, + { + "epoch": 3.3376707872478857, + "grad_norm": 1.584922194480896, + "learning_rate": 1.2877351839631884e-05, + "loss": 0.4331, + "step": 5130 + }, + { + "epoch": 3.3409238776837995, + "grad_norm": 1.9146180152893066, + "learning_rate": 1.283223557165969e-05, + "loss": 0.4213, + "step": 5135 + }, + { + "epoch": 3.3441769681197138, + "grad_norm": 2.934779644012451, + "learning_rate": 1.2787171172867826e-05, + "loss": 0.4304, + "step": 5140 + }, + { + "epoch": 3.347430058555628, + "grad_norm": 1.7485958337783813, + "learning_rate": 1.2742158835358412e-05, + "loss": 0.4402, + "step": 5145 + }, + { + "epoch": 3.350683148991542, + "grad_norm": 1.497227668762207, + "learning_rate": 1.2697198751011641e-05, + "loss": 0.4235, + "step": 5150 + }, + { + "epoch": 3.353936239427456, + "grad_norm": 1.3970587253570557, + "learning_rate": 1.2652291111484962e-05, + "loss": 0.41, + "step": 5155 + }, + { + "epoch": 3.3571893298633704, + "grad_norm": 2.238862991333008, + "learning_rate": 1.2607436108212278e-05, + "loss": 0.4398, + "step": 5160 + }, + { + "epoch": 3.360442420299284, + "grad_norm": 4.999053478240967, + "learning_rate": 1.256263393240309e-05, + "loss": 0.4316, + "step": 5165 + }, + { + "epoch": 3.3636955107351985, + "grad_norm": 2.9387760162353516, + "learning_rate": 1.25178847750417e-05, + "loss": 0.4391, + "step": 5170 + }, + { + "epoch": 3.3669486011711127, + "grad_norm": 7.236008167266846, + "learning_rate": 1.2473188826886428e-05, + "loss": 0.4247, + "step": 5175 + }, + { + "epoch": 3.3702016916070265, + "grad_norm": 1.8067480325698853, + "learning_rate": 1.2428546278468753e-05, + "loss": 0.4363, + "step": 5180 + }, + { + "epoch": 3.373454782042941, + "grad_norm": 2.2515077590942383, + "learning_rate": 1.2383957320092512e-05, + "loss": 0.4218, + "step": 5185 + }, + { + "epoch": 3.376707872478855, + "grad_norm": 2.5399014949798584, + "learning_rate": 1.2339422141833127e-05, + "loss": 0.4344, + "step": 5190 + }, + { + "epoch": 3.379960962914769, + "grad_norm": 3.233678102493286, + "learning_rate": 1.2294940933536725e-05, + "loss": 0.4214, + "step": 5195 + }, + { + "epoch": 3.383214053350683, + "grad_norm": 1.5409492254257202, + "learning_rate": 1.2250513884819403e-05, + "loss": 0.4225, + "step": 5200 + }, + { + "epoch": 3.3864671437865974, + "grad_norm": 1.2861021757125854, + "learning_rate": 1.2206141185066359e-05, + "loss": 0.4268, + "step": 5205 + }, + { + "epoch": 3.3897202342225112, + "grad_norm": 1.417588472366333, + "learning_rate": 1.2161823023431133e-05, + "loss": 0.4188, + "step": 5210 + }, + { + "epoch": 3.3929733246584255, + "grad_norm": 3.5109989643096924, + "learning_rate": 1.2117559588834757e-05, + "loss": 0.4243, + "step": 5215 + }, + { + "epoch": 3.3962264150943398, + "grad_norm": 6.47197151184082, + "learning_rate": 1.207335106996497e-05, + "loss": 0.4229, + "step": 5220 + }, + { + "epoch": 3.3994795055302536, + "grad_norm": 1.6867964267730713, + "learning_rate": 1.2029197655275442e-05, + "loss": 0.4287, + "step": 5225 + }, + { + "epoch": 3.402732595966168, + "grad_norm": 1.575161099433899, + "learning_rate": 1.1985099532984917e-05, + "loss": 0.4304, + "step": 5230 + }, + { + "epoch": 3.405985686402082, + "grad_norm": 1.444969654083252, + "learning_rate": 1.1941056891076432e-05, + "loss": 0.4325, + "step": 5235 + }, + { + "epoch": 3.409238776837996, + "grad_norm": 1.6448862552642822, + "learning_rate": 1.1897069917296555e-05, + "loss": 0.4358, + "step": 5240 + }, + { + "epoch": 3.41249186727391, + "grad_norm": 1.5217825174331665, + "learning_rate": 1.1853138799154514e-05, + "loss": 0.4404, + "step": 5245 + }, + { + "epoch": 3.4157449577098244, + "grad_norm": 10.740922927856445, + "learning_rate": 1.1809263723921438e-05, + "loss": 0.4421, + "step": 5250 + }, + { + "epoch": 3.4189980481457383, + "grad_norm": 2.3015806674957275, + "learning_rate": 1.1765444878629583e-05, + "loss": 0.4376, + "step": 5255 + }, + { + "epoch": 3.4222511385816525, + "grad_norm": 1.4035224914550781, + "learning_rate": 1.1721682450071476e-05, + "loss": 0.453, + "step": 5260 + }, + { + "epoch": 3.425504229017567, + "grad_norm": 1.4976873397827148, + "learning_rate": 1.167797662479915e-05, + "loss": 0.4411, + "step": 5265 + }, + { + "epoch": 3.4287573194534806, + "grad_norm": 1.4919233322143555, + "learning_rate": 1.1634327589123373e-05, + "loss": 0.4357, + "step": 5270 + }, + { + "epoch": 3.432010409889395, + "grad_norm": 1.5440754890441895, + "learning_rate": 1.1590735529112806e-05, + "loss": 0.4288, + "step": 5275 + }, + { + "epoch": 3.435263500325309, + "grad_norm": 1.637131690979004, + "learning_rate": 1.1547200630593224e-05, + "loss": 0.4379, + "step": 5280 + }, + { + "epoch": 3.438516590761223, + "grad_norm": 1.525527834892273, + "learning_rate": 1.1503723079146766e-05, + "loss": 0.4158, + "step": 5285 + }, + { + "epoch": 3.441769681197137, + "grad_norm": 1.5343090295791626, + "learning_rate": 1.1460303060111083e-05, + "loss": 0.4128, + "step": 5290 + }, + { + "epoch": 3.4450227716330515, + "grad_norm": 1.7940443754196167, + "learning_rate": 1.1416940758578567e-05, + "loss": 0.4389, + "step": 5295 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 1.8386902809143066, + "learning_rate": 1.137363635939561e-05, + "loss": 0.4537, + "step": 5300 + }, + { + "epoch": 3.4515289525048796, + "grad_norm": 1.498874306678772, + "learning_rate": 1.1330390047161729e-05, + "loss": 0.4297, + "step": 5305 + }, + { + "epoch": 3.454782042940794, + "grad_norm": 1.4000130891799927, + "learning_rate": 1.1287202006228858e-05, + "loss": 0.4235, + "step": 5310 + }, + { + "epoch": 3.4580351333767076, + "grad_norm": 6.055357933044434, + "learning_rate": 1.1244072420700502e-05, + "loss": 0.4282, + "step": 5315 + }, + { + "epoch": 3.461288223812622, + "grad_norm": 1.7397581338882446, + "learning_rate": 1.1201001474431022e-05, + "loss": 0.4053, + "step": 5320 + }, + { + "epoch": 3.464541314248536, + "grad_norm": 1.7397915124893188, + "learning_rate": 1.1157989351024767e-05, + "loss": 0.4213, + "step": 5325 + }, + { + "epoch": 3.46779440468445, + "grad_norm": 1.6159850358963013, + "learning_rate": 1.1115036233835349e-05, + "loss": 0.4241, + "step": 5330 + }, + { + "epoch": 3.4710474951203643, + "grad_norm": 1.4938251972198486, + "learning_rate": 1.1072142305964855e-05, + "loss": 0.425, + "step": 5335 + }, + { + "epoch": 3.4743005855562785, + "grad_norm": 2.087301015853882, + "learning_rate": 1.102930775026306e-05, + "loss": 0.4224, + "step": 5340 + }, + { + "epoch": 3.4775536759921923, + "grad_norm": 1.5332581996917725, + "learning_rate": 1.098653274932662e-05, + "loss": 0.4281, + "step": 5345 + }, + { + "epoch": 3.4808067664281066, + "grad_norm": 1.481587529182434, + "learning_rate": 1.094381748549835e-05, + "loss": 0.4083, + "step": 5350 + }, + { + "epoch": 3.484059856864021, + "grad_norm": 1.8097538948059082, + "learning_rate": 1.0901162140866395e-05, + "loss": 0.4231, + "step": 5355 + }, + { + "epoch": 3.487312947299935, + "grad_norm": 1.6291629076004028, + "learning_rate": 1.0858566897263475e-05, + "loss": 0.4298, + "step": 5360 + }, + { + "epoch": 3.490566037735849, + "grad_norm": 1.3684316873550415, + "learning_rate": 1.081603193626611e-05, + "loss": 0.4045, + "step": 5365 + }, + { + "epoch": 3.493819128171763, + "grad_norm": 1.9993515014648438, + "learning_rate": 1.0773557439193865e-05, + "loss": 0.4223, + "step": 5370 + }, + { + "epoch": 3.4970722186076775, + "grad_norm": 1.364740014076233, + "learning_rate": 1.0731143587108533e-05, + "loss": 0.4472, + "step": 5375 + }, + { + "epoch": 3.5003253090435913, + "grad_norm": 1.4979236125946045, + "learning_rate": 1.0688790560813388e-05, + "loss": 0.4232, + "step": 5380 + }, + { + "epoch": 3.5035783994795056, + "grad_norm": 1.574204921722412, + "learning_rate": 1.064649854085244e-05, + "loss": 0.4228, + "step": 5385 + }, + { + "epoch": 3.5068314899154194, + "grad_norm": 2.251814842224121, + "learning_rate": 1.0604267707509608e-05, + "loss": 0.4091, + "step": 5390 + }, + { + "epoch": 3.5100845803513336, + "grad_norm": 2.445500135421753, + "learning_rate": 1.0562098240807989e-05, + "loss": 0.4257, + "step": 5395 + }, + { + "epoch": 3.513337670787248, + "grad_norm": 1.3687506914138794, + "learning_rate": 1.0519990320509104e-05, + "loss": 0.4132, + "step": 5400 + }, + { + "epoch": 3.516590761223162, + "grad_norm": 1.6088290214538574, + "learning_rate": 1.0477944126112097e-05, + "loss": 0.4151, + "step": 5405 + }, + { + "epoch": 3.519843851659076, + "grad_norm": 1.3225458860397339, + "learning_rate": 1.0435959836852967e-05, + "loss": 0.4173, + "step": 5410 + }, + { + "epoch": 3.5230969420949902, + "grad_norm": 2.5459144115448, + "learning_rate": 1.0394037631703867e-05, + "loss": 0.4344, + "step": 5415 + }, + { + "epoch": 3.5263500325309045, + "grad_norm": 1.7091197967529297, + "learning_rate": 1.0352177689372256e-05, + "loss": 0.4328, + "step": 5420 + }, + { + "epoch": 3.5296031229668183, + "grad_norm": 1.490488052368164, + "learning_rate": 1.0310380188300178e-05, + "loss": 0.4153, + "step": 5425 + }, + { + "epoch": 3.5328562134027326, + "grad_norm": 8.786995887756348, + "learning_rate": 1.0268645306663532e-05, + "loss": 0.4466, + "step": 5430 + }, + { + "epoch": 3.536109303838647, + "grad_norm": 2.5409798622131348, + "learning_rate": 1.0226973222371253e-05, + "loss": 0.4174, + "step": 5435 + }, + { + "epoch": 3.5393623942745607, + "grad_norm": 2.2137346267700195, + "learning_rate": 1.0185364113064577e-05, + "loss": 0.4296, + "step": 5440 + }, + { + "epoch": 3.542615484710475, + "grad_norm": 1.8226664066314697, + "learning_rate": 1.0143818156116323e-05, + "loss": 0.4076, + "step": 5445 + }, + { + "epoch": 3.545868575146389, + "grad_norm": 1.7435983419418335, + "learning_rate": 1.0102335528630061e-05, + "loss": 0.4241, + "step": 5450 + }, + { + "epoch": 3.5491216655823035, + "grad_norm": 2.1171414852142334, + "learning_rate": 1.0060916407439413e-05, + "loss": 0.4094, + "step": 5455 + }, + { + "epoch": 3.5523747560182173, + "grad_norm": 1.3854146003723145, + "learning_rate": 1.0019560969107302e-05, + "loss": 0.4165, + "step": 5460 + }, + { + "epoch": 3.5556278464541315, + "grad_norm": 1.7370872497558594, + "learning_rate": 9.978269389925157e-06, + "loss": 0.4179, + "step": 5465 + }, + { + "epoch": 3.558880936890046, + "grad_norm": 2.7952463626861572, + "learning_rate": 9.937041845912188e-06, + "loss": 0.4366, + "step": 5470 + }, + { + "epoch": 3.5621340273259596, + "grad_norm": 2.453916311264038, + "learning_rate": 9.895878512814647e-06, + "loss": 0.4105, + "step": 5475 + }, + { + "epoch": 3.565387117761874, + "grad_norm": 3.428210973739624, + "learning_rate": 9.854779566105068e-06, + "loss": 0.4235, + "step": 5480 + }, + { + "epoch": 3.568640208197788, + "grad_norm": 1.5725935697555542, + "learning_rate": 9.813745180981502e-06, + "loss": 0.4165, + "step": 5485 + }, + { + "epoch": 3.571893298633702, + "grad_norm": 1.8538109064102173, + "learning_rate": 9.772775532366774e-06, + "loss": 0.4427, + "step": 5490 + }, + { + "epoch": 3.5751463890696162, + "grad_norm": 2.118178606033325, + "learning_rate": 9.731870794907789e-06, + "loss": 0.4299, + "step": 5495 + }, + { + "epoch": 3.5783994795055305, + "grad_norm": 6.738338470458984, + "learning_rate": 9.691031142974707e-06, + "loss": 0.433, + "step": 5500 + }, + { + "epoch": 3.5816525699414443, + "grad_norm": 2.768328905105591, + "learning_rate": 9.65025675066025e-06, + "loss": 0.4251, + "step": 5505 + }, + { + "epoch": 3.5849056603773586, + "grad_norm": 1.6791824102401733, + "learning_rate": 9.609547791778964e-06, + "loss": 0.4195, + "step": 5510 + }, + { + "epoch": 3.588158750813273, + "grad_norm": 1.43679940700531, + "learning_rate": 9.568904439866444e-06, + "loss": 0.4125, + "step": 5515 + }, + { + "epoch": 3.5914118412491867, + "grad_norm": 1.60779869556427, + "learning_rate": 9.528326868178616e-06, + "loss": 0.4309, + "step": 5520 + }, + { + "epoch": 3.594664931685101, + "grad_norm": 2.161658763885498, + "learning_rate": 9.487815249691012e-06, + "loss": 0.4391, + "step": 5525 + }, + { + "epoch": 3.597918022121015, + "grad_norm": 1.7440755367279053, + "learning_rate": 9.447369757098002e-06, + "loss": 0.4046, + "step": 5530 + }, + { + "epoch": 3.601171112556929, + "grad_norm": 1.9223883152008057, + "learning_rate": 9.406990562812068e-06, + "loss": 0.3904, + "step": 5535 + }, + { + "epoch": 3.6044242029928433, + "grad_norm": 1.7541393041610718, + "learning_rate": 9.366677838963078e-06, + "loss": 0.4238, + "step": 5540 + }, + { + "epoch": 3.6076772934287575, + "grad_norm": 1.4891860485076904, + "learning_rate": 9.32643175739756e-06, + "loss": 0.4173, + "step": 5545 + }, + { + "epoch": 3.6109303838646714, + "grad_norm": 2.7674455642700195, + "learning_rate": 9.286252489677944e-06, + "loss": 0.4033, + "step": 5550 + }, + { + "epoch": 3.6141834743005856, + "grad_norm": 2.088984727859497, + "learning_rate": 9.246140207081833e-06, + "loss": 0.4468, + "step": 5555 + }, + { + "epoch": 3.6174365647365, + "grad_norm": 1.516174554824829, + "learning_rate": 9.206095080601319e-06, + "loss": 0.4194, + "step": 5560 + }, + { + "epoch": 3.6206896551724137, + "grad_norm": 2.387362241744995, + "learning_rate": 9.16611728094218e-06, + "loss": 0.4142, + "step": 5565 + }, + { + "epoch": 3.623942745608328, + "grad_norm": 1.8257601261138916, + "learning_rate": 9.126206978523202e-06, + "loss": 0.4221, + "step": 5570 + }, + { + "epoch": 3.6271958360442422, + "grad_norm": 1.815144419670105, + "learning_rate": 9.086364343475461e-06, + "loss": 0.4356, + "step": 5575 + }, + { + "epoch": 3.630448926480156, + "grad_norm": 1.5288091897964478, + "learning_rate": 9.04658954564156e-06, + "loss": 0.4077, + "step": 5580 + }, + { + "epoch": 3.6337020169160703, + "grad_norm": 1.570156455039978, + "learning_rate": 9.006882754574914e-06, + "loss": 0.4136, + "step": 5585 + }, + { + "epoch": 3.6369551073519846, + "grad_norm": 2.161524772644043, + "learning_rate": 8.967244139539064e-06, + "loss": 0.4204, + "step": 5590 + }, + { + "epoch": 3.6402081977878984, + "grad_norm": 1.590711236000061, + "learning_rate": 8.927673869506905e-06, + "loss": 0.4133, + "step": 5595 + }, + { + "epoch": 3.6434612882238127, + "grad_norm": 1.5367971658706665, + "learning_rate": 8.888172113159989e-06, + "loss": 0.4152, + "step": 5600 + }, + { + "epoch": 3.646714378659727, + "grad_norm": 1.5242828130722046, + "learning_rate": 8.848739038887822e-06, + "loss": 0.4128, + "step": 5605 + }, + { + "epoch": 3.6499674690956407, + "grad_norm": 1.4691710472106934, + "learning_rate": 8.809374814787124e-06, + "loss": 0.4247, + "step": 5610 + }, + { + "epoch": 3.653220559531555, + "grad_norm": 3.0192527770996094, + "learning_rate": 8.770079608661108e-06, + "loss": 0.4279, + "step": 5615 + }, + { + "epoch": 3.6564736499674693, + "grad_norm": 1.5646493434906006, + "learning_rate": 8.730853588018772e-06, + "loss": 0.4055, + "step": 5620 + }, + { + "epoch": 3.659726740403383, + "grad_norm": 1.502172589302063, + "learning_rate": 8.691696920074214e-06, + "loss": 0.4314, + "step": 5625 + }, + { + "epoch": 3.6629798308392973, + "grad_norm": 1.4211641550064087, + "learning_rate": 8.652609771745862e-06, + "loss": 0.4218, + "step": 5630 + }, + { + "epoch": 3.6662329212752116, + "grad_norm": 1.905414342880249, + "learning_rate": 8.613592309655804e-06, + "loss": 0.4178, + "step": 5635 + }, + { + "epoch": 3.6694860117111254, + "grad_norm": 2.1291544437408447, + "learning_rate": 8.574644700129087e-06, + "loss": 0.4224, + "step": 5640 + }, + { + "epoch": 3.6727391021470397, + "grad_norm": 1.6421247720718384, + "learning_rate": 8.535767109192955e-06, + "loss": 0.4378, + "step": 5645 + }, + { + "epoch": 3.675992192582954, + "grad_norm": 1.5527359247207642, + "learning_rate": 8.496959702576187e-06, + "loss": 0.4242, + "step": 5650 + }, + { + "epoch": 3.6792452830188678, + "grad_norm": 2.3626434803009033, + "learning_rate": 8.458222645708394e-06, + "loss": 0.4187, + "step": 5655 + }, + { + "epoch": 3.682498373454782, + "grad_norm": 1.3949254751205444, + "learning_rate": 8.419556103719279e-06, + "loss": 0.4094, + "step": 5660 + }, + { + "epoch": 3.6857514638906963, + "grad_norm": 1.7462977170944214, + "learning_rate": 8.380960241437947e-06, + "loss": 0.4228, + "step": 5665 + }, + { + "epoch": 3.68900455432661, + "grad_norm": 1.5994423627853394, + "learning_rate": 8.342435223392232e-06, + "loss": 0.4216, + "step": 5670 + }, + { + "epoch": 3.6922576447625244, + "grad_norm": 2.040391206741333, + "learning_rate": 8.303981213807947e-06, + "loss": 0.419, + "step": 5675 + }, + { + "epoch": 3.6955107351984386, + "grad_norm": 1.5465586185455322, + "learning_rate": 8.265598376608211e-06, + "loss": 0.4105, + "step": 5680 + }, + { + "epoch": 3.6987638256343525, + "grad_norm": 1.965409278869629, + "learning_rate": 8.227286875412766e-06, + "loss": 0.4056, + "step": 5685 + }, + { + "epoch": 3.7020169160702667, + "grad_norm": 1.9103503227233887, + "learning_rate": 8.189046873537237e-06, + "loss": 0.4142, + "step": 5690 + }, + { + "epoch": 3.705270006506181, + "grad_norm": 1.6853419542312622, + "learning_rate": 8.150878533992458e-06, + "loss": 0.4421, + "step": 5695 + }, + { + "epoch": 3.708523096942095, + "grad_norm": 1.7083536386489868, + "learning_rate": 8.112782019483813e-06, + "loss": 0.4175, + "step": 5700 + }, + { + "epoch": 3.711776187378009, + "grad_norm": 1.8322556018829346, + "learning_rate": 8.074757492410471e-06, + "loss": 0.4202, + "step": 5705 + }, + { + "epoch": 3.7150292778139233, + "grad_norm": 1.8045029640197754, + "learning_rate": 8.036805114864736e-06, + "loss": 0.4232, + "step": 5710 + }, + { + "epoch": 3.718282368249837, + "grad_norm": 1.9806746244430542, + "learning_rate": 7.998925048631362e-06, + "loss": 0.4176, + "step": 5715 + }, + { + "epoch": 3.7215354586857514, + "grad_norm": 1.4905263185501099, + "learning_rate": 7.96111745518685e-06, + "loss": 0.422, + "step": 5720 + }, + { + "epoch": 3.7247885491216657, + "grad_norm": 1.4577668905258179, + "learning_rate": 7.923382495698758e-06, + "loss": 0.406, + "step": 5725 + }, + { + "epoch": 3.7280416395575795, + "grad_norm": 2.971972942352295, + "learning_rate": 7.88572033102501e-06, + "loss": 0.4168, + "step": 5730 + }, + { + "epoch": 3.7312947299934938, + "grad_norm": 1.7888929843902588, + "learning_rate": 7.848131121713234e-06, + "loss": 0.4397, + "step": 5735 + }, + { + "epoch": 3.734547820429408, + "grad_norm": 1.5038715600967407, + "learning_rate": 7.810615028000045e-06, + "loss": 0.4017, + "step": 5740 + }, + { + "epoch": 3.737800910865322, + "grad_norm": 1.679883599281311, + "learning_rate": 7.773172209810397e-06, + "loss": 0.4221, + "step": 5745 + }, + { + "epoch": 3.741054001301236, + "grad_norm": 1.669687271118164, + "learning_rate": 7.735802826756856e-06, + "loss": 0.408, + "step": 5750 + }, + { + "epoch": 3.7443070917371504, + "grad_norm": 2.8292086124420166, + "learning_rate": 7.698507038138978e-06, + "loss": 0.4408, + "step": 5755 + }, + { + "epoch": 3.747560182173064, + "grad_norm": 1.6891485452651978, + "learning_rate": 7.661285002942572e-06, + "loss": 0.4202, + "step": 5760 + }, + { + "epoch": 3.7508132726089785, + "grad_norm": 18.631166458129883, + "learning_rate": 7.624136879839053e-06, + "loss": 0.4368, + "step": 5765 + }, + { + "epoch": 3.7540663630448927, + "grad_norm": 2.31217622756958, + "learning_rate": 7.5870628271847765e-06, + "loss": 0.4099, + "step": 5770 + }, + { + "epoch": 3.7573194534808065, + "grad_norm": 1.7663567066192627, + "learning_rate": 7.550063003020333e-06, + "loss": 0.4242, + "step": 5775 + }, + { + "epoch": 3.760572543916721, + "grad_norm": 1.651605486869812, + "learning_rate": 7.5131375650698835e-06, + "loss": 0.428, + "step": 5780 + }, + { + "epoch": 3.763825634352635, + "grad_norm": 1.542211890220642, + "learning_rate": 7.476286670740518e-06, + "loss": 0.4163, + "step": 5785 + }, + { + "epoch": 3.767078724788549, + "grad_norm": 1.7693531513214111, + "learning_rate": 7.439510477121536e-06, + "loss": 0.4192, + "step": 5790 + }, + { + "epoch": 3.770331815224463, + "grad_norm": 2.135603666305542, + "learning_rate": 7.402809140983799e-06, + "loss": 0.4155, + "step": 5795 + }, + { + "epoch": 3.7735849056603774, + "grad_norm": 2.1200711727142334, + "learning_rate": 7.366182818779088e-06, + "loss": 0.4056, + "step": 5800 + }, + { + "epoch": 3.7768379960962912, + "grad_norm": 2.6680068969726562, + "learning_rate": 7.329631666639392e-06, + "loss": 0.4124, + "step": 5805 + }, + { + "epoch": 3.7800910865322055, + "grad_norm": 2.2767679691314697, + "learning_rate": 7.2931558403762535e-06, + "loss": 0.3993, + "step": 5810 + }, + { + "epoch": 3.7833441769681198, + "grad_norm": 1.7897844314575195, + "learning_rate": 7.256755495480141e-06, + "loss": 0.3827, + "step": 5815 + }, + { + "epoch": 3.7865972674040336, + "grad_norm": 1.7870675325393677, + "learning_rate": 7.220430787119742e-06, + "loss": 0.4199, + "step": 5820 + }, + { + "epoch": 3.789850357839948, + "grad_norm": 1.4809794425964355, + "learning_rate": 7.184181870141307e-06, + "loss": 0.4056, + "step": 5825 + }, + { + "epoch": 3.793103448275862, + "grad_norm": 2.3490641117095947, + "learning_rate": 7.148008899068029e-06, + "loss": 0.4084, + "step": 5830 + }, + { + "epoch": 3.796356538711776, + "grad_norm": 2.2586324214935303, + "learning_rate": 7.1119120280993295e-06, + "loss": 0.4125, + "step": 5835 + }, + { + "epoch": 3.79960962914769, + "grad_norm": 3.2746353149414062, + "learning_rate": 7.0758914111102335e-06, + "loss": 0.3964, + "step": 5840 + }, + { + "epoch": 3.8028627195836044, + "grad_norm": 1.9759531021118164, + "learning_rate": 7.039947201650726e-06, + "loss": 0.4151, + "step": 5845 + }, + { + "epoch": 3.8061158100195187, + "grad_norm": 1.395560622215271, + "learning_rate": 7.004079552945062e-06, + "loss": 0.415, + "step": 5850 + }, + { + "epoch": 3.8093689004554325, + "grad_norm": 1.6155577898025513, + "learning_rate": 6.968288617891116e-06, + "loss": 0.4093, + "step": 5855 + }, + { + "epoch": 3.812621990891347, + "grad_norm": 1.647705078125, + "learning_rate": 6.932574549059789e-06, + "loss": 0.4033, + "step": 5860 + }, + { + "epoch": 3.815875081327261, + "grad_norm": 1.5718135833740234, + "learning_rate": 6.8969374986942784e-06, + "loss": 0.4281, + "step": 5865 + }, + { + "epoch": 3.819128171763175, + "grad_norm": 6.746004581451416, + "learning_rate": 6.861377618709466e-06, + "loss": 0.4199, + "step": 5870 + }, + { + "epoch": 3.822381262199089, + "grad_norm": 1.6057871580123901, + "learning_rate": 6.825895060691273e-06, + "loss": 0.4059, + "step": 5875 + }, + { + "epoch": 3.8256343526350034, + "grad_norm": 1.5333271026611328, + "learning_rate": 6.790489975896033e-06, + "loss": 0.4311, + "step": 5880 + }, + { + "epoch": 3.828887443070917, + "grad_norm": 4.261138439178467, + "learning_rate": 6.755162515249799e-06, + "loss": 0.3987, + "step": 5885 + }, + { + "epoch": 3.8321405335068315, + "grad_norm": 1.475321650505066, + "learning_rate": 6.719912829347716e-06, + "loss": 0.424, + "step": 5890 + }, + { + "epoch": 3.8353936239427457, + "grad_norm": 1.6494297981262207, + "learning_rate": 6.6847410684534235e-06, + "loss": 0.4367, + "step": 5895 + }, + { + "epoch": 3.8386467143786596, + "grad_norm": 1.8216992616653442, + "learning_rate": 6.649647382498345e-06, + "loss": 0.4201, + "step": 5900 + }, + { + "epoch": 3.841899804814574, + "grad_norm": 1.7172224521636963, + "learning_rate": 6.6146319210810935e-06, + "loss": 0.4279, + "step": 5905 + }, + { + "epoch": 3.845152895250488, + "grad_norm": 2.8617303371429443, + "learning_rate": 6.579694833466843e-06, + "loss": 0.4219, + "step": 5910 + }, + { + "epoch": 3.8484059856864024, + "grad_norm": 1.705623984336853, + "learning_rate": 6.5448362685866485e-06, + "loss": 0.4085, + "step": 5915 + }, + { + "epoch": 3.851659076122316, + "grad_norm": 1.7146087884902954, + "learning_rate": 6.510056375036841e-06, + "loss": 0.4089, + "step": 5920 + }, + { + "epoch": 3.8549121665582304, + "grad_norm": 1.4763227701187134, + "learning_rate": 6.47535530107839e-06, + "loss": 0.408, + "step": 5925 + }, + { + "epoch": 3.8581652569941447, + "grad_norm": 2.6685636043548584, + "learning_rate": 6.440733194636281e-06, + "loss": 0.4216, + "step": 5930 + }, + { + "epoch": 3.8614183474300585, + "grad_norm": 1.5533685684204102, + "learning_rate": 6.406190203298859e-06, + "loss": 0.4182, + "step": 5935 + }, + { + "epoch": 3.864671437865973, + "grad_norm": 1.7280486822128296, + "learning_rate": 6.3717264743172134e-06, + "loss": 0.415, + "step": 5940 + }, + { + "epoch": 3.867924528301887, + "grad_norm": 1.7348867654800415, + "learning_rate": 6.337342154604573e-06, + "loss": 0.4046, + "step": 5945 + }, + { + "epoch": 3.871177618737801, + "grad_norm": 1.730487585067749, + "learning_rate": 6.303037390735634e-06, + "loss": 0.4186, + "step": 5950 + }, + { + "epoch": 3.874430709173715, + "grad_norm": 1.4803285598754883, + "learning_rate": 6.268812328945961e-06, + "loss": 0.4071, + "step": 5955 + }, + { + "epoch": 3.8776837996096294, + "grad_norm": 1.5369083881378174, + "learning_rate": 6.234667115131382e-06, + "loss": 0.4205, + "step": 5960 + }, + { + "epoch": 3.880936890045543, + "grad_norm": 2.9555954933166504, + "learning_rate": 6.200601894847324e-06, + "loss": 0.4054, + "step": 5965 + }, + { + "epoch": 3.8841899804814575, + "grad_norm": 1.5368090867996216, + "learning_rate": 6.166616813308221e-06, + "loss": 0.3972, + "step": 5970 + }, + { + "epoch": 3.8874430709173717, + "grad_norm": 1.577239990234375, + "learning_rate": 6.132712015386902e-06, + "loss": 0.4253, + "step": 5975 + }, + { + "epoch": 3.8906961613532856, + "grad_norm": 1.5454272031784058, + "learning_rate": 6.098887645613943e-06, + "loss": 0.4081, + "step": 5980 + }, + { + "epoch": 3.8939492517892, + "grad_norm": 1.506635069847107, + "learning_rate": 6.065143848177066e-06, + "loss": 0.4041, + "step": 5985 + }, + { + "epoch": 3.897202342225114, + "grad_norm": 5.971263408660889, + "learning_rate": 6.03148076692055e-06, + "loss": 0.4031, + "step": 5990 + }, + { + "epoch": 3.900455432661028, + "grad_norm": 2.4734601974487305, + "learning_rate": 5.997898545344571e-06, + "loss": 0.43, + "step": 5995 + }, + { + "epoch": 3.903708523096942, + "grad_norm": 2.6702654361724854, + "learning_rate": 5.9643973266046145e-06, + "loss": 0.416, + "step": 6000 + }, + { + "epoch": 3.9069616135328564, + "grad_norm": 1.439784288406372, + "learning_rate": 5.930977253510886e-06, + "loss": 0.4059, + "step": 6005 + }, + { + "epoch": 3.9102147039687702, + "grad_norm": 1.8066974878311157, + "learning_rate": 5.897638468527653e-06, + "loss": 0.4147, + "step": 6010 + }, + { + "epoch": 3.9134677944046845, + "grad_norm": 1.603287935256958, + "learning_rate": 5.864381113772685e-06, + "loss": 0.4168, + "step": 6015 + }, + { + "epoch": 3.9167208848405988, + "grad_norm": 1.6566202640533447, + "learning_rate": 5.831205331016612e-06, + "loss": 0.4095, + "step": 6020 + }, + { + "epoch": 3.9199739752765126, + "grad_norm": 2.173736095428467, + "learning_rate": 5.798111261682357e-06, + "loss": 0.3955, + "step": 6025 + }, + { + "epoch": 3.923227065712427, + "grad_norm": 1.4487274885177612, + "learning_rate": 5.765099046844491e-06, + "loss": 0.4048, + "step": 6030 + }, + { + "epoch": 3.926480156148341, + "grad_norm": 1.4893627166748047, + "learning_rate": 5.7321688272286596e-06, + "loss": 0.3908, + "step": 6035 + }, + { + "epoch": 3.929733246584255, + "grad_norm": 1.9365359544754028, + "learning_rate": 5.699320743210984e-06, + "loss": 0.4141, + "step": 6040 + }, + { + "epoch": 3.932986337020169, + "grad_norm": 1.6186310052871704, + "learning_rate": 5.666554934817447e-06, + "loss": 0.3887, + "step": 6045 + }, + { + "epoch": 3.9362394274560835, + "grad_norm": 7.201216697692871, + "learning_rate": 5.633871541723295e-06, + "loss": 0.4115, + "step": 6050 + }, + { + "epoch": 3.9394925178919973, + "grad_norm": 1.8057231903076172, + "learning_rate": 5.601270703252481e-06, + "loss": 0.3989, + "step": 6055 + }, + { + "epoch": 3.9427456083279115, + "grad_norm": 2.119253158569336, + "learning_rate": 5.5687525583770135e-06, + "loss": 0.4295, + "step": 6060 + }, + { + "epoch": 3.945998698763826, + "grad_norm": 1.8245245218276978, + "learning_rate": 5.536317245716391e-06, + "loss": 0.4213, + "step": 6065 + }, + { + "epoch": 3.9492517891997396, + "grad_norm": 1.69149649143219, + "learning_rate": 5.503964903537037e-06, + "loss": 0.4128, + "step": 6070 + }, + { + "epoch": 3.952504879635654, + "grad_norm": 1.5030890703201294, + "learning_rate": 5.471695669751664e-06, + "loss": 0.3899, + "step": 6075 + }, + { + "epoch": 3.955757970071568, + "grad_norm": 1.764691710472107, + "learning_rate": 5.439509681918703e-06, + "loss": 0.4024, + "step": 6080 + }, + { + "epoch": 3.959011060507482, + "grad_norm": 1.7484804391860962, + "learning_rate": 5.407407077241749e-06, + "loss": 0.4088, + "step": 6085 + }, + { + "epoch": 3.9622641509433962, + "grad_norm": 1.4906771183013916, + "learning_rate": 5.381785121346411e-06, + "loss": 0.4077, + "step": 6090 + }, + { + "epoch": 3.9655172413793105, + "grad_norm": 2.1942009925842285, + "learning_rate": 5.349832950968298e-06, + "loss": 0.4025, + "step": 6095 + }, + { + "epoch": 3.9687703318152243, + "grad_norm": 1.9535316228866577, + "learning_rate": 5.3179645460233574e-06, + "loss": 0.4079, + "step": 6100 + }, + { + "epoch": 3.9720234222511386, + "grad_norm": 1.6382977962493896, + "learning_rate": 5.286180042361361e-06, + "loss": 0.4072, + "step": 6105 + }, + { + "epoch": 3.975276512687053, + "grad_norm": 2.194603204727173, + "learning_rate": 5.254479575474411e-06, + "loss": 0.4147, + "step": 6110 + }, + { + "epoch": 3.9785296031229667, + "grad_norm": 2.237362861633301, + "learning_rate": 5.222863280496406e-06, + "loss": 0.4174, + "step": 6115 + }, + { + "epoch": 3.981782693558881, + "grad_norm": 1.5425729751586914, + "learning_rate": 5.191331292202409e-06, + "loss": 0.3867, + "step": 6120 + }, + { + "epoch": 3.985035783994795, + "grad_norm": 3.286719799041748, + "learning_rate": 5.159883745008099e-06, + "loss": 0.4134, + "step": 6125 + }, + { + "epoch": 3.988288874430709, + "grad_norm": 1.8072408437728882, + "learning_rate": 5.1285207729692146e-06, + "loss": 0.409, + "step": 6130 + }, + { + "epoch": 3.9915419648666233, + "grad_norm": 2.3535733222961426, + "learning_rate": 5.097242509780945e-06, + "loss": 0.3986, + "step": 6135 + }, + { + "epoch": 3.9947950553025375, + "grad_norm": 2.018533706665039, + "learning_rate": 5.06604908877738e-06, + "loss": 0.4115, + "step": 6140 + }, + { + "epoch": 3.9980481457384514, + "grad_norm": 1.9201387166976929, + "learning_rate": 5.03494064293096e-06, + "loss": 0.4206, + "step": 6145 + }, + { + "epoch": 4.0, + "eval_f1": 0.8072647932831747, + "eval_loss": 0.55810546875, + "eval_precision": 0.8068495156761535, + "eval_recall": 0.8078495084242743, + "eval_runtime": 352.5087, + "eval_samples_per_second": 1116.1, + "eval_steps_per_second": 1.092, + "step": 6148 + }, + { + "epoch": 4.001301236174366, + "grad_norm": 1.1841827630996704, + "learning_rate": 5.003917304851868e-06, + "loss": 0.3531, + "step": 6150 + }, + { + "epoch": 4.00455432661028, + "grad_norm": 1.870800495147705, + "learning_rate": 4.972979206787503e-06, + "loss": 0.2658, + "step": 6155 + }, + { + "epoch": 4.007807417046194, + "grad_norm": 6.779404163360596, + "learning_rate": 4.9421264806218865e-06, + "loss": 0.2545, + "step": 6160 + }, + { + "epoch": 4.011060507482108, + "grad_norm": 1.2459709644317627, + "learning_rate": 4.911359257875131e-06, + "loss": 0.2436, + "step": 6165 + }, + { + "epoch": 4.014313597918022, + "grad_norm": 1.5694559812545776, + "learning_rate": 4.880677669702846e-06, + "loss": 0.2433, + "step": 6170 + }, + { + "epoch": 4.017566688353936, + "grad_norm": 1.46892249584198, + "learning_rate": 4.850081846895591e-06, + "loss": 0.2388, + "step": 6175 + }, + { + "epoch": 4.020819778789851, + "grad_norm": 32.99214553833008, + "learning_rate": 4.819571919878346e-06, + "loss": 0.2353, + "step": 6180 + }, + { + "epoch": 4.024072869225765, + "grad_norm": 1.623369574546814, + "learning_rate": 4.78914801870991e-06, + "loss": 0.2389, + "step": 6185 + }, + { + "epoch": 4.027325959661678, + "grad_norm": 1.8111308813095093, + "learning_rate": 4.7588102730823676e-06, + "loss": 0.2362, + "step": 6190 + }, + { + "epoch": 4.030579050097593, + "grad_norm": 1.494247555732727, + "learning_rate": 4.7285588123205546e-06, + "loss": 0.2411, + "step": 6195 + }, + { + "epoch": 4.033832140533507, + "grad_norm": 1.8853955268859863, + "learning_rate": 4.698393765381473e-06, + "loss": 0.2434, + "step": 6200 + }, + { + "epoch": 4.037085230969421, + "grad_norm": 2.163872241973877, + "learning_rate": 4.668315260853753e-06, + "loss": 0.2634, + "step": 6205 + }, + { + "epoch": 4.040338321405335, + "grad_norm": 2.3891079425811768, + "learning_rate": 4.6383234269571305e-06, + "loss": 0.2346, + "step": 6210 + }, + { + "epoch": 4.043591411841249, + "grad_norm": 3.2441794872283936, + "learning_rate": 4.608418391541861e-06, + "loss": 0.25, + "step": 6215 + }, + { + "epoch": 4.046844502277163, + "grad_norm": 2.0283098220825195, + "learning_rate": 4.578600282088186e-06, + "loss": 0.241, + "step": 6220 + }, + { + "epoch": 4.050097592713078, + "grad_norm": 1.6591633558273315, + "learning_rate": 4.548869225705821e-06, + "loss": 0.2366, + "step": 6225 + }, + { + "epoch": 4.053350683148992, + "grad_norm": 1.9975348711013794, + "learning_rate": 4.5192253491333656e-06, + "loss": 0.2287, + "step": 6230 + }, + { + "epoch": 4.056603773584905, + "grad_norm": 1.8785626888275146, + "learning_rate": 4.489668778737793e-06, + "loss": 0.2329, + "step": 6235 + }, + { + "epoch": 4.05985686402082, + "grad_norm": 1.7227510213851929, + "learning_rate": 4.460199640513912e-06, + "loss": 0.2456, + "step": 6240 + }, + { + "epoch": 4.063109954456734, + "grad_norm": 2.2654736042022705, + "learning_rate": 4.430818060083816e-06, + "loss": 0.2271, + "step": 6245 + }, + { + "epoch": 4.066363044892648, + "grad_norm": 1.5479950904846191, + "learning_rate": 4.4015241626963436e-06, + "loss": 0.2404, + "step": 6250 + }, + { + "epoch": 4.0696161353285625, + "grad_norm": 2.179954767227173, + "learning_rate": 4.372318073226583e-06, + "loss": 0.2358, + "step": 6255 + }, + { + "epoch": 4.072869225764476, + "grad_norm": 1.5730780363082886, + "learning_rate": 4.343199916175284e-06, + "loss": 0.2256, + "step": 6260 + }, + { + "epoch": 4.07612231620039, + "grad_norm": 1.6954352855682373, + "learning_rate": 4.3141698156683645e-06, + "loss": 0.2302, + "step": 6265 + }, + { + "epoch": 4.079375406636305, + "grad_norm": 2.26914644241333, + "learning_rate": 4.285227895456373e-06, + "loss": 0.2262, + "step": 6270 + }, + { + "epoch": 4.082628497072219, + "grad_norm": 1.8111881017684937, + "learning_rate": 4.2563742789139635e-06, + "loss": 0.2368, + "step": 6275 + }, + { + "epoch": 4.0858815875081325, + "grad_norm": 2.6787023544311523, + "learning_rate": 4.227609089039361e-06, + "loss": 0.2299, + "step": 6280 + }, + { + "epoch": 4.089134677944047, + "grad_norm": 1.6558953523635864, + "learning_rate": 4.198932448453832e-06, + "loss": 0.2502, + "step": 6285 + }, + { + "epoch": 4.092387768379961, + "grad_norm": 2.19753360748291, + "learning_rate": 4.170344479401203e-06, + "loss": 0.2229, + "step": 6290 + }, + { + "epoch": 4.095640858815875, + "grad_norm": 1.825623631477356, + "learning_rate": 4.14184530374728e-06, + "loss": 0.2334, + "step": 6295 + }, + { + "epoch": 4.0988939492517895, + "grad_norm": 5.066878795623779, + "learning_rate": 4.113435042979357e-06, + "loss": 0.2323, + "step": 6300 + }, + { + "epoch": 4.102147039687703, + "grad_norm": 1.8815892934799194, + "learning_rate": 4.085113818205724e-06, + "loss": 0.2439, + "step": 6305 + }, + { + "epoch": 4.105400130123617, + "grad_norm": 1.726470708847046, + "learning_rate": 4.056881750155095e-06, + "loss": 0.2469, + "step": 6310 + }, + { + "epoch": 4.108653220559532, + "grad_norm": 2.11625075340271, + "learning_rate": 4.028738959176143e-06, + "loss": 0.234, + "step": 6315 + }, + { + "epoch": 4.111906310995446, + "grad_norm": 1.8043195009231567, + "learning_rate": 4.000685565236953e-06, + "loss": 0.235, + "step": 6320 + }, + { + "epoch": 4.1151594014313595, + "grad_norm": 1.5731197595596313, + "learning_rate": 3.972721687924546e-06, + "loss": 0.2369, + "step": 6325 + }, + { + "epoch": 4.118412491867274, + "grad_norm": 2.3870668411254883, + "learning_rate": 3.94484744644433e-06, + "loss": 0.2354, + "step": 6330 + }, + { + "epoch": 4.121665582303188, + "grad_norm": 2.9168872833251953, + "learning_rate": 3.917062959619611e-06, + "loss": 0.2391, + "step": 6335 + }, + { + "epoch": 4.124918672739102, + "grad_norm": 2.253788709640503, + "learning_rate": 3.889368345891101e-06, + "loss": 0.2414, + "step": 6340 + }, + { + "epoch": 4.1281717631750166, + "grad_norm": 1.9010400772094727, + "learning_rate": 3.86176372331638e-06, + "loss": 0.2385, + "step": 6345 + }, + { + "epoch": 4.13142485361093, + "grad_norm": 2.5451037883758545, + "learning_rate": 3.834249209569415e-06, + "loss": 0.2434, + "step": 6350 + }, + { + "epoch": 4.134677944046844, + "grad_norm": 1.9200527667999268, + "learning_rate": 3.806824921940069e-06, + "loss": 0.2327, + "step": 6355 + }, + { + "epoch": 4.137931034482759, + "grad_norm": 1.741262435913086, + "learning_rate": 3.7794909773335664e-06, + "loss": 0.2218, + "step": 6360 + }, + { + "epoch": 4.141184124918673, + "grad_norm": 2.2827839851379395, + "learning_rate": 3.752247492270017e-06, + "loss": 0.238, + "step": 6365 + }, + { + "epoch": 4.1444372153545865, + "grad_norm": 2.077956199645996, + "learning_rate": 3.7250945828839286e-06, + "loss": 0.2371, + "step": 6370 + }, + { + "epoch": 4.147690305790501, + "grad_norm": 1.9408165216445923, + "learning_rate": 3.6980323649236925e-06, + "loss": 0.2241, + "step": 6375 + }, + { + "epoch": 4.150943396226415, + "grad_norm": 2.407376766204834, + "learning_rate": 3.671060953751085e-06, + "loss": 0.2444, + "step": 6380 + }, + { + "epoch": 4.154196486662329, + "grad_norm": 1.6587666273117065, + "learning_rate": 3.6441804643408156e-06, + "loss": 0.2388, + "step": 6385 + }, + { + "epoch": 4.157449577098244, + "grad_norm": 1.826478362083435, + "learning_rate": 3.617391011279986e-06, + "loss": 0.2361, + "step": 6390 + }, + { + "epoch": 4.160702667534157, + "grad_norm": 1.628882646560669, + "learning_rate": 3.590692708767626e-06, + "loss": 0.2246, + "step": 6395 + }, + { + "epoch": 4.163955757970071, + "grad_norm": 1.8005129098892212, + "learning_rate": 3.5640856706142283e-06, + "loss": 0.2435, + "step": 6400 + }, + { + "epoch": 4.167208848405986, + "grad_norm": 1.8075467348098755, + "learning_rate": 3.5375700102412118e-06, + "loss": 0.2283, + "step": 6405 + }, + { + "epoch": 4.1704619388419, + "grad_norm": 1.7112232446670532, + "learning_rate": 3.51114584068048e-06, + "loss": 0.2428, + "step": 6410 + }, + { + "epoch": 4.173715029277814, + "grad_norm": 3.4661080837249756, + "learning_rate": 3.484813274573931e-06, + "loss": 0.2221, + "step": 6415 + }, + { + "epoch": 4.176968119713728, + "grad_norm": 1.970160961151123, + "learning_rate": 3.458572424172962e-06, + "loss": 0.2409, + "step": 6420 + }, + { + "epoch": 4.180221210149642, + "grad_norm": 2.032346248626709, + "learning_rate": 3.432423401338014e-06, + "loss": 0.2259, + "step": 6425 + }, + { + "epoch": 4.183474300585556, + "grad_norm": 2.329317808151245, + "learning_rate": 3.4063663175380622e-06, + "loss": 0.2277, + "step": 6430 + }, + { + "epoch": 4.186727391021471, + "grad_norm": 2.001601457595825, + "learning_rate": 3.3804012838501877e-06, + "loss": 0.2392, + "step": 6435 + }, + { + "epoch": 4.189980481457384, + "grad_norm": 2.017449140548706, + "learning_rate": 3.354528410959054e-06, + "loss": 0.2223, + "step": 6440 + }, + { + "epoch": 4.193233571893298, + "grad_norm": 2.0057036876678467, + "learning_rate": 3.3287478091564628e-06, + "loss": 0.2282, + "step": 6445 + }, + { + "epoch": 4.196486662329213, + "grad_norm": 1.7296228408813477, + "learning_rate": 3.3030595883408953e-06, + "loss": 0.2234, + "step": 6450 + }, + { + "epoch": 4.199739752765127, + "grad_norm": 1.9821724891662598, + "learning_rate": 3.2774638580170075e-06, + "loss": 0.2491, + "step": 6455 + }, + { + "epoch": 4.202992843201041, + "grad_norm": 1.769161581993103, + "learning_rate": 3.2519607272951862e-06, + "loss": 0.2342, + "step": 6460 + }, + { + "epoch": 4.206245933636955, + "grad_norm": 1.7646567821502686, + "learning_rate": 3.226550304891099e-06, + "loss": 0.231, + "step": 6465 + }, + { + "epoch": 4.209499024072869, + "grad_norm": 1.8771380186080933, + "learning_rate": 3.201232699125198e-06, + "loss": 0.2246, + "step": 6470 + }, + { + "epoch": 4.212752114508783, + "grad_norm": 7.158396244049072, + "learning_rate": 3.1760080179222663e-06, + "loss": 0.2281, + "step": 6475 + }, + { + "epoch": 4.216005204944698, + "grad_norm": 2.6318135261535645, + "learning_rate": 3.15087636881099e-06, + "loss": 0.2326, + "step": 6480 + }, + { + "epoch": 4.2192582953806115, + "grad_norm": 1.8173600435256958, + "learning_rate": 3.125837858923453e-06, + "loss": 0.2236, + "step": 6485 + }, + { + "epoch": 4.222511385816525, + "grad_norm": 1.9394645690917969, + "learning_rate": 3.100892594994706e-06, + "loss": 0.2356, + "step": 6490 + }, + { + "epoch": 4.22576447625244, + "grad_norm": 2.29449725151062, + "learning_rate": 3.076040683362308e-06, + "loss": 0.2336, + "step": 6495 + }, + { + "epoch": 4.229017566688354, + "grad_norm": 1.8967347145080566, + "learning_rate": 3.0512822299658824e-06, + "loss": 0.2251, + "step": 6500 + }, + { + "epoch": 4.232270657124268, + "grad_norm": 1.6016850471496582, + "learning_rate": 3.0266173403466438e-06, + "loss": 0.2119, + "step": 6505 + }, + { + "epoch": 4.235523747560182, + "grad_norm": 1.7070870399475098, + "learning_rate": 3.002046119646959e-06, + "loss": 0.2228, + "step": 6510 + }, + { + "epoch": 4.238776837996096, + "grad_norm": 2.9714415073394775, + "learning_rate": 2.977568672609915e-06, + "loss": 0.2317, + "step": 6515 + }, + { + "epoch": 4.24202992843201, + "grad_norm": 1.7705700397491455, + "learning_rate": 2.95318510357884e-06, + "loss": 0.2135, + "step": 6520 + }, + { + "epoch": 4.245283018867925, + "grad_norm": 1.5718425512313843, + "learning_rate": 2.9288955164968766e-06, + "loss": 0.2309, + "step": 6525 + }, + { + "epoch": 4.2485361093038385, + "grad_norm": 6.268550872802734, + "learning_rate": 2.904700014906553e-06, + "loss": 0.2259, + "step": 6530 + }, + { + "epoch": 4.251789199739752, + "grad_norm": 10.964041709899902, + "learning_rate": 2.8805987019493137e-06, + "loss": 0.2255, + "step": 6535 + }, + { + "epoch": 4.255042290175667, + "grad_norm": 6.053678512573242, + "learning_rate": 2.8565916803650866e-06, + "loss": 0.2171, + "step": 6540 + }, + { + "epoch": 4.258295380611581, + "grad_norm": 1.9141877889633179, + "learning_rate": 2.8326790524918765e-06, + "loss": 0.2229, + "step": 6545 + }, + { + "epoch": 4.261548471047496, + "grad_norm": 1.7668460607528687, + "learning_rate": 2.8088609202652742e-06, + "loss": 0.2281, + "step": 6550 + }, + { + "epoch": 4.264801561483409, + "grad_norm": 4.9379048347473145, + "learning_rate": 2.7851373852180617e-06, + "loss": 0.228, + "step": 6555 + }, + { + "epoch": 4.268054651919323, + "grad_norm": 1.8191304206848145, + "learning_rate": 2.761508548479777e-06, + "loss": 0.2223, + "step": 6560 + }, + { + "epoch": 4.271307742355237, + "grad_norm": 3.2812600135803223, + "learning_rate": 2.7379745107762726e-06, + "loss": 0.2254, + "step": 6565 + }, + { + "epoch": 4.274560832791152, + "grad_norm": 1.9550873041152954, + "learning_rate": 2.7145353724292776e-06, + "loss": 0.2245, + "step": 6570 + }, + { + "epoch": 4.2778139232270656, + "grad_norm": 1.6830195188522339, + "learning_rate": 2.691191233355986e-06, + "loss": 0.2343, + "step": 6575 + }, + { + "epoch": 4.28106701366298, + "grad_norm": 1.9676400423049927, + "learning_rate": 2.6679421930686317e-06, + "loss": 0.2418, + "step": 6580 + }, + { + "epoch": 4.284320104098894, + "grad_norm": 2.293408155441284, + "learning_rate": 2.644788350674049e-06, + "loss": 0.2352, + "step": 6585 + }, + { + "epoch": 4.287573194534808, + "grad_norm": 2.1475830078125, + "learning_rate": 2.6217298048732604e-06, + "loss": 0.2263, + "step": 6590 + }, + { + "epoch": 4.290826284970722, + "grad_norm": 1.7658709287643433, + "learning_rate": 2.598766653961068e-06, + "loss": 0.2369, + "step": 6595 + }, + { + "epoch": 4.294079375406636, + "grad_norm": 1.8443782329559326, + "learning_rate": 2.5758989958256043e-06, + "loss": 0.2362, + "step": 6600 + }, + { + "epoch": 4.29733246584255, + "grad_norm": 1.6136929988861084, + "learning_rate": 2.5531269279479325e-06, + "loss": 0.2475, + "step": 6605 + }, + { + "epoch": 4.300585556278465, + "grad_norm": 1.7001075744628906, + "learning_rate": 2.530450547401647e-06, + "loss": 0.2257, + "step": 6610 + }, + { + "epoch": 4.303838646714379, + "grad_norm": 1.7640526294708252, + "learning_rate": 2.5078699508524288e-06, + "loss": 0.2359, + "step": 6615 + }, + { + "epoch": 4.307091737150293, + "grad_norm": 1.509757161140442, + "learning_rate": 2.485385234557641e-06, + "loss": 0.2163, + "step": 6620 + }, + { + "epoch": 4.310344827586207, + "grad_norm": 1.9168065786361694, + "learning_rate": 2.462996494365949e-06, + "loss": 0.2209, + "step": 6625 + }, + { + "epoch": 4.313597918022121, + "grad_norm": 1.701675295829773, + "learning_rate": 2.440703825716867e-06, + "loss": 0.2263, + "step": 6630 + }, + { + "epoch": 4.316851008458035, + "grad_norm": 3.188270330429077, + "learning_rate": 2.4185073236403707e-06, + "loss": 0.2286, + "step": 6635 + }, + { + "epoch": 4.32010409889395, + "grad_norm": 2.1177477836608887, + "learning_rate": 2.396407082756513e-06, + "loss": 0.2153, + "step": 6640 + }, + { + "epoch": 4.3233571893298635, + "grad_norm": 1.7593662738800049, + "learning_rate": 2.3744031972749826e-06, + "loss": 0.2378, + "step": 6645 + }, + { + "epoch": 4.326610279765777, + "grad_norm": 1.7856861352920532, + "learning_rate": 2.352495760994733e-06, + "loss": 0.2222, + "step": 6650 + }, + { + "epoch": 4.329863370201692, + "grad_norm": 2.1246399879455566, + "learning_rate": 2.3306848673035536e-06, + "loss": 0.2222, + "step": 6655 + }, + { + "epoch": 4.333116460637606, + "grad_norm": 1.8715474605560303, + "learning_rate": 2.308970609177713e-06, + "loss": 0.2283, + "step": 6660 + }, + { + "epoch": 4.33636955107352, + "grad_norm": 1.6002229452133179, + "learning_rate": 2.28735307918152e-06, + "loss": 0.2311, + "step": 6665 + }, + { + "epoch": 4.339622641509434, + "grad_norm": 1.71176016330719, + "learning_rate": 2.2658323694669498e-06, + "loss": 0.2261, + "step": 6670 + }, + { + "epoch": 4.342875731945348, + "grad_norm": 4.760283946990967, + "learning_rate": 2.24440857177326e-06, + "loss": 0.2174, + "step": 6675 + }, + { + "epoch": 4.346128822381262, + "grad_norm": 1.7307775020599365, + "learning_rate": 2.2230817774265724e-06, + "loss": 0.2235, + "step": 6680 + }, + { + "epoch": 4.349381912817177, + "grad_norm": 2.089346170425415, + "learning_rate": 2.201852077339506e-06, + "loss": 0.2387, + "step": 6685 + }, + { + "epoch": 4.3526350032530905, + "grad_norm": 1.815763235092163, + "learning_rate": 2.1807195620107914e-06, + "loss": 0.2406, + "step": 6690 + }, + { + "epoch": 4.355888093689004, + "grad_norm": 1.9279545545578003, + "learning_rate": 2.15968432152486e-06, + "loss": 0.2385, + "step": 6695 + }, + { + "epoch": 4.359141184124919, + "grad_norm": 1.6756705045700073, + "learning_rate": 2.1387464455514928e-06, + "loss": 0.2232, + "step": 6700 + }, + { + "epoch": 4.362394274560833, + "grad_norm": 1.606360912322998, + "learning_rate": 2.117906023345406e-06, + "loss": 0.2275, + "step": 6705 + }, + { + "epoch": 4.365647364996747, + "grad_norm": 1.7262459993362427, + "learning_rate": 2.097163143745909e-06, + "loss": 0.2232, + "step": 6710 + }, + { + "epoch": 4.368900455432661, + "grad_norm": 1.5949680805206299, + "learning_rate": 2.0765178951764774e-06, + "loss": 0.2306, + "step": 6715 + }, + { + "epoch": 4.372153545868575, + "grad_norm": 1.6405928134918213, + "learning_rate": 2.0559703656444107e-06, + "loss": 0.2286, + "step": 6720 + }, + { + "epoch": 4.375406636304489, + "grad_norm": 1.8739897012710571, + "learning_rate": 2.0355206427404626e-06, + "loss": 0.2128, + "step": 6725 + }, + { + "epoch": 4.378659726740404, + "grad_norm": 1.7278401851654053, + "learning_rate": 2.015168813638435e-06, + "loss": 0.2217, + "step": 6730 + }, + { + "epoch": 4.3819128171763175, + "grad_norm": 1.776465654373169, + "learning_rate": 1.9949149650948267e-06, + "loss": 0.2492, + "step": 6735 + }, + { + "epoch": 4.385165907612231, + "grad_norm": 3.8171842098236084, + "learning_rate": 1.974759183448477e-06, + "loss": 0.2262, + "step": 6740 + }, + { + "epoch": 4.388418998048146, + "grad_norm": 1.7197240591049194, + "learning_rate": 1.954701554620164e-06, + "loss": 0.2287, + "step": 6745 + }, + { + "epoch": 4.39167208848406, + "grad_norm": 3.6815900802612305, + "learning_rate": 1.9347421641122576e-06, + "loss": 0.2285, + "step": 6750 + }, + { + "epoch": 4.394925178919974, + "grad_norm": 2.5846571922302246, + "learning_rate": 1.9148810970083725e-06, + "loss": 0.2228, + "step": 6755 + }, + { + "epoch": 4.398178269355888, + "grad_norm": 1.7526190280914307, + "learning_rate": 1.8951184379729674e-06, + "loss": 0.2268, + "step": 6760 + }, + { + "epoch": 4.401431359791802, + "grad_norm": 3.6665701866149902, + "learning_rate": 1.8754542712510065e-06, + "loss": 0.2295, + "step": 6765 + }, + { + "epoch": 4.404684450227716, + "grad_norm": 2.0210907459259033, + "learning_rate": 1.8558886806676112e-06, + "loss": 0.2212, + "step": 6770 + }, + { + "epoch": 4.407937540663631, + "grad_norm": 1.6685539484024048, + "learning_rate": 1.8364217496276731e-06, + "loss": 0.2336, + "step": 6775 + }, + { + "epoch": 4.411190631099545, + "grad_norm": 1.7345695495605469, + "learning_rate": 1.8170535611155143e-06, + "loss": 0.2349, + "step": 6780 + }, + { + "epoch": 4.414443721535458, + "grad_norm": 1.5429155826568604, + "learning_rate": 1.797784197694552e-06, + "loss": 0.2206, + "step": 6785 + }, + { + "epoch": 4.417696811971373, + "grad_norm": 1.8358302116394043, + "learning_rate": 1.7786137415069126e-06, + "loss": 0.225, + "step": 6790 + }, + { + "epoch": 4.420949902407287, + "grad_norm": 1.858405590057373, + "learning_rate": 1.7595422742730905e-06, + "loss": 0.2049, + "step": 6795 + }, + { + "epoch": 4.424202992843201, + "grad_norm": 1.6918565034866333, + "learning_rate": 1.7405698772916313e-06, + "loss": 0.2224, + "step": 6800 + }, + { + "epoch": 4.427456083279115, + "grad_norm": 1.7888590097427368, + "learning_rate": 1.7216966314387378e-06, + "loss": 0.2359, + "step": 6805 + }, + { + "epoch": 4.430709173715029, + "grad_norm": 4.2224273681640625, + "learning_rate": 1.7029226171679542e-06, + "loss": 0.2278, + "step": 6810 + }, + { + "epoch": 4.433962264150943, + "grad_norm": 3.129756450653076, + "learning_rate": 1.684247914509826e-06, + "loss": 0.2317, + "step": 6815 + }, + { + "epoch": 4.437215354586858, + "grad_norm": 1.7897382974624634, + "learning_rate": 1.6656726030715358e-06, + "loss": 0.2305, + "step": 6820 + }, + { + "epoch": 4.440468445022772, + "grad_norm": 2.1024298667907715, + "learning_rate": 1.6471967620365846e-06, + "loss": 0.2433, + "step": 6825 + }, + { + "epoch": 4.443721535458685, + "grad_norm": 1.8943312168121338, + "learning_rate": 1.6288204701644382e-06, + "loss": 0.2301, + "step": 6830 + }, + { + "epoch": 4.4469746258946, + "grad_norm": 1.8938087224960327, + "learning_rate": 1.6105438057902295e-06, + "loss": 0.2245, + "step": 6835 + }, + { + "epoch": 4.450227716330514, + "grad_norm": 1.9831109046936035, + "learning_rate": 1.592366846824364e-06, + "loss": 0.2471, + "step": 6840 + }, + { + "epoch": 4.453480806766428, + "grad_norm": 4.050066947937012, + "learning_rate": 1.5742896707522242e-06, + "loss": 0.2437, + "step": 6845 + }, + { + "epoch": 4.4567338972023425, + "grad_norm": 1.9161465167999268, + "learning_rate": 1.5563123546338572e-06, + "loss": 0.2232, + "step": 6850 + }, + { + "epoch": 4.459986987638256, + "grad_norm": 2.1633474826812744, + "learning_rate": 1.5384349751035948e-06, + "loss": 0.2387, + "step": 6855 + }, + { + "epoch": 4.46324007807417, + "grad_norm": 1.729988694190979, + "learning_rate": 1.5206576083697687e-06, + "loss": 0.2387, + "step": 6860 + }, + { + "epoch": 4.466493168510085, + "grad_norm": 3.217106342315674, + "learning_rate": 1.502980330214379e-06, + "loss": 0.222, + "step": 6865 + }, + { + "epoch": 4.469746258945999, + "grad_norm": 3.8715853691101074, + "learning_rate": 1.4854032159927562e-06, + "loss": 0.2256, + "step": 6870 + }, + { + "epoch": 4.4729993493819125, + "grad_norm": 1.8818020820617676, + "learning_rate": 1.4679263406332467e-06, + "loss": 0.2282, + "step": 6875 + }, + { + "epoch": 4.476252439817827, + "grad_norm": 1.7817474603652954, + "learning_rate": 1.450549778636895e-06, + "loss": 0.2336, + "step": 6880 + }, + { + "epoch": 4.479505530253741, + "grad_norm": 1.9050745964050293, + "learning_rate": 1.43327360407714e-06, + "loss": 0.2239, + "step": 6885 + }, + { + "epoch": 4.482758620689655, + "grad_norm": 1.8388644456863403, + "learning_rate": 1.416097890599466e-06, + "loss": 0.2231, + "step": 6890 + }, + { + "epoch": 4.4860117111255695, + "grad_norm": 1.8944287300109863, + "learning_rate": 1.3990227114211191e-06, + "loss": 0.2252, + "step": 6895 + }, + { + "epoch": 4.489264801561483, + "grad_norm": 1.994055986404419, + "learning_rate": 1.3820481393307855e-06, + "loss": 0.2351, + "step": 6900 + }, + { + "epoch": 4.492517891997397, + "grad_norm": 1.8318955898284912, + "learning_rate": 1.365174246688275e-06, + "loss": 0.2328, + "step": 6905 + }, + { + "epoch": 4.495770982433312, + "grad_norm": 1.8265342712402344, + "learning_rate": 1.3484011054242157e-06, + "loss": 0.2202, + "step": 6910 + }, + { + "epoch": 4.499024072869226, + "grad_norm": 2.5501973628997803, + "learning_rate": 1.3317287870397572e-06, + "loss": 0.2283, + "step": 6915 + }, + { + "epoch": 4.5022771633051395, + "grad_norm": 2.1136600971221924, + "learning_rate": 1.3151573626062535e-06, + "loss": 0.2362, + "step": 6920 + }, + { + "epoch": 4.505530253741054, + "grad_norm": 2.1394035816192627, + "learning_rate": 1.298686902764959e-06, + "loss": 0.2262, + "step": 6925 + }, + { + "epoch": 4.508783344176968, + "grad_norm": 2.5853805541992188, + "learning_rate": 1.2823174777267439e-06, + "loss": 0.2235, + "step": 6930 + }, + { + "epoch": 4.512036434612883, + "grad_norm": 11.947797775268555, + "learning_rate": 1.266049157271773e-06, + "loss": 0.2135, + "step": 6935 + }, + { + "epoch": 4.5152895250487965, + "grad_norm": 1.589674472808838, + "learning_rate": 1.2498820107492204e-06, + "loss": 0.232, + "step": 6940 + }, + { + "epoch": 4.51854261548471, + "grad_norm": 2.020092248916626, + "learning_rate": 1.233816107076985e-06, + "loss": 0.2373, + "step": 6945 + }, + { + "epoch": 4.521795705920624, + "grad_norm": 1.9216246604919434, + "learning_rate": 1.2178515147413665e-06, + "loss": 0.2275, + "step": 6950 + }, + { + "epoch": 4.525048796356539, + "grad_norm": 2.0308332443237305, + "learning_rate": 1.2019883017967943e-06, + "loss": 0.225, + "step": 6955 + }, + { + "epoch": 4.528301886792453, + "grad_norm": 1.7747068405151367, + "learning_rate": 1.1862265358655505e-06, + "loss": 0.2252, + "step": 6960 + }, + { + "epoch": 4.531554977228367, + "grad_norm": 3.725299119949341, + "learning_rate": 1.170566284137442e-06, + "loss": 0.2343, + "step": 6965 + }, + { + "epoch": 4.534808067664281, + "grad_norm": 2.894679069519043, + "learning_rate": 1.1550076133695604e-06, + "loss": 0.2436, + "step": 6970 + }, + { + "epoch": 4.538061158100195, + "grad_norm": 1.8062316179275513, + "learning_rate": 1.1395505898859487e-06, + "loss": 0.237, + "step": 6975 + }, + { + "epoch": 4.541314248536109, + "grad_norm": 1.745668888092041, + "learning_rate": 1.1241952795773697e-06, + "loss": 0.2324, + "step": 6980 + }, + { + "epoch": 4.544567338972024, + "grad_norm": 1.908809781074524, + "learning_rate": 1.108941747900985e-06, + "loss": 0.2356, + "step": 6985 + }, + { + "epoch": 4.547820429407937, + "grad_norm": 6.190516948699951, + "learning_rate": 1.0937900598800872e-06, + "loss": 0.23, + "step": 6990 + }, + { + "epoch": 4.551073519843852, + "grad_norm": 2.0687854290008545, + "learning_rate": 1.0787402801038405e-06, + "loss": 0.2376, + "step": 6995 + }, + { + "epoch": 4.554326610279766, + "grad_norm": 2.2060608863830566, + "learning_rate": 1.0637924727269822e-06, + "loss": 0.2179, + "step": 7000 + }, + { + "epoch": 4.55757970071568, + "grad_norm": 1.8114817142486572, + "learning_rate": 1.0489467014695526e-06, + "loss": 0.2445, + "step": 7005 + }, + { + "epoch": 4.560832791151594, + "grad_norm": 1.8267011642456055, + "learning_rate": 1.0342030296166428e-06, + "loss": 0.2249, + "step": 7010 + }, + { + "epoch": 4.564085881587508, + "grad_norm": 1.8274999856948853, + "learning_rate": 1.0195615200180974e-06, + "loss": 0.2198, + "step": 7015 + }, + { + "epoch": 4.567338972023422, + "grad_norm": 4.727597713470459, + "learning_rate": 1.0050222350882682e-06, + "loss": 0.23, + "step": 7020 + }, + { + "epoch": 4.570592062459337, + "grad_norm": 17.39756965637207, + "learning_rate": 9.905852368057383e-07, + "loss": 0.2217, + "step": 7025 + }, + { + "epoch": 4.573845152895251, + "grad_norm": 1.8855489492416382, + "learning_rate": 9.762505867130594e-07, + "loss": 0.2233, + "step": 7030 + }, + { + "epoch": 4.577098243331164, + "grad_norm": 1.946487545967102, + "learning_rate": 9.620183459164878e-07, + "loss": 0.2398, + "step": 7035 + }, + { + "epoch": 4.580351333767078, + "grad_norm": 5.050466060638428, + "learning_rate": 9.478885750857285e-07, + "loss": 0.2437, + "step": 7040 + }, + { + "epoch": 4.583604424202993, + "grad_norm": 2.065145254135132, + "learning_rate": 9.338613344536701e-07, + "loss": 0.235, + "step": 7045 + }, + { + "epoch": 4.586857514638907, + "grad_norm": 1.8699109554290771, + "learning_rate": 9.199366838161389e-07, + "loss": 0.2314, + "step": 7050 + }, + { + "epoch": 4.5901106050748215, + "grad_norm": 1.8447792530059814, + "learning_rate": 9.06114682531628e-07, + "loss": 0.2261, + "step": 7055 + }, + { + "epoch": 4.593363695510735, + "grad_norm": 7.244560241699219, + "learning_rate": 8.923953895210612e-07, + "loss": 0.2094, + "step": 7060 + }, + { + "epoch": 4.596616785946649, + "grad_norm": 1.6492129564285278, + "learning_rate": 8.787788632675293e-07, + "loss": 0.2185, + "step": 7065 + }, + { + "epoch": 4.599869876382563, + "grad_norm": 1.8190096616744995, + "learning_rate": 8.652651618160424e-07, + "loss": 0.2368, + "step": 7070 + }, + { + "epoch": 4.603122966818478, + "grad_norm": 1.6560496091842651, + "learning_rate": 8.51854342773295e-07, + "loss": 0.2297, + "step": 7075 + }, + { + "epoch": 4.6063760572543915, + "grad_norm": 1.6347981691360474, + "learning_rate": 8.385464633074019e-07, + "loss": 0.2228, + "step": 7080 + }, + { + "epoch": 4.609629147690306, + "grad_norm": 2.087787628173828, + "learning_rate": 8.25341580147665e-07, + "loss": 0.2277, + "step": 7085 + }, + { + "epoch": 4.61288223812622, + "grad_norm": 1.9914604425430298, + "learning_rate": 8.122397495843343e-07, + "loss": 0.2332, + "step": 7090 + }, + { + "epoch": 4.616135328562134, + "grad_norm": 2.1197216510772705, + "learning_rate": 7.992410274683615e-07, + "loss": 0.2218, + "step": 7095 + }, + { + "epoch": 4.6193884189980485, + "grad_norm": 3.4777309894561768, + "learning_rate": 7.863454692111583e-07, + "loss": 0.224, + "step": 7100 + }, + { + "epoch": 4.622641509433962, + "grad_norm": 1.6859432458877563, + "learning_rate": 7.735531297843713e-07, + "loss": 0.2434, + "step": 7105 + }, + { + "epoch": 4.625894599869876, + "grad_norm": 3.000272035598755, + "learning_rate": 7.60864063719649e-07, + "loss": 0.2296, + "step": 7110 + }, + { + "epoch": 4.629147690305791, + "grad_norm": 1.8207471370697021, + "learning_rate": 7.482783251083869e-07, + "loss": 0.2252, + "step": 7115 + }, + { + "epoch": 4.632400780741705, + "grad_norm": 2.1532914638519287, + "learning_rate": 7.357959676015214e-07, + "loss": 0.2275, + "step": 7120 + }, + { + "epoch": 4.6356538711776185, + "grad_norm": 1.644968867301941, + "learning_rate": 7.234170444092942e-07, + "loss": 0.2384, + "step": 7125 + }, + { + "epoch": 4.638906961613533, + "grad_norm": 1.7306429147720337, + "learning_rate": 7.11141608301022e-07, + "loss": 0.2202, + "step": 7130 + }, + { + "epoch": 4.642160052049447, + "grad_norm": 2.0186638832092285, + "learning_rate": 6.989697116048633e-07, + "loss": 0.2242, + "step": 7135 + }, + { + "epoch": 4.645413142485361, + "grad_norm": 1.7209802865982056, + "learning_rate": 6.86901406207624e-07, + "loss": 0.2152, + "step": 7140 + }, + { + "epoch": 4.648666232921276, + "grad_norm": 1.6327604055404663, + "learning_rate": 6.749367435545024e-07, + "loss": 0.2384, + "step": 7145 + }, + { + "epoch": 4.651919323357189, + "grad_norm": 2.552006721496582, + "learning_rate": 6.630757746488886e-07, + "loss": 0.2287, + "step": 7150 + }, + { + "epoch": 4.655172413793103, + "grad_norm": 1.8012171983718872, + "learning_rate": 6.513185500521463e-07, + "loss": 0.2148, + "step": 7155 + }, + { + "epoch": 4.658425504229018, + "grad_norm": 1.9249675273895264, + "learning_rate": 6.396651198833897e-07, + "loss": 0.2249, + "step": 7160 + }, + { + "epoch": 4.661678594664932, + "grad_norm": 1.640886664390564, + "learning_rate": 6.281155338192762e-07, + "loss": 0.2167, + "step": 7165 + }, + { + "epoch": 4.6649316851008455, + "grad_norm": 1.9455459117889404, + "learning_rate": 6.166698410937949e-07, + "loss": 0.2264, + "step": 7170 + }, + { + "epoch": 4.66818477553676, + "grad_norm": 2.079929828643799, + "learning_rate": 6.053280904980557e-07, + "loss": 0.2249, + "step": 7175 + }, + { + "epoch": 4.671437865972674, + "grad_norm": 1.5457408428192139, + "learning_rate": 5.940903303800705e-07, + "loss": 0.218, + "step": 7180 + }, + { + "epoch": 4.674690956408588, + "grad_norm": 2.4226558208465576, + "learning_rate": 5.829566086445721e-07, + "loss": 0.2229, + "step": 7185 + }, + { + "epoch": 4.677944046844503, + "grad_norm": 2.8459372520446777, + "learning_rate": 5.719269727527843e-07, + "loss": 0.2206, + "step": 7190 + }, + { + "epoch": 4.681197137280416, + "grad_norm": 1.8545024394989014, + "learning_rate": 5.610014697222249e-07, + "loss": 0.2157, + "step": 7195 + }, + { + "epoch": 4.68445022771633, + "grad_norm": 2.315584421157837, + "learning_rate": 5.501801461265304e-07, + "loss": 0.2179, + "step": 7200 + }, + { + "epoch": 4.687703318152245, + "grad_norm": 1.7075843811035156, + "learning_rate": 5.394630480952178e-07, + "loss": 0.2147, + "step": 7205 + }, + { + "epoch": 4.690956408588159, + "grad_norm": 1.7120945453643799, + "learning_rate": 5.288502213135149e-07, + "loss": 0.2329, + "step": 7210 + }, + { + "epoch": 4.694209499024073, + "grad_norm": 1.6173934936523438, + "learning_rate": 5.183417110221606e-07, + "loss": 0.2338, + "step": 7215 + }, + { + "epoch": 4.697462589459987, + "grad_norm": 1.8131818771362305, + "learning_rate": 5.07937562017205e-07, + "loss": 0.231, + "step": 7220 + }, + { + "epoch": 4.700715679895901, + "grad_norm": 2.225557327270508, + "learning_rate": 4.976378186498293e-07, + "loss": 0.2303, + "step": 7225 + }, + { + "epoch": 4.703968770331815, + "grad_norm": 1.9588440656661987, + "learning_rate": 4.874425248261428e-07, + "loss": 0.2254, + "step": 7230 + }, + { + "epoch": 4.70722186076773, + "grad_norm": 1.5929067134857178, + "learning_rate": 4.773517240070108e-07, + "loss": 0.2177, + "step": 7235 + }, + { + "epoch": 4.7104749512036435, + "grad_norm": 1.794873833656311, + "learning_rate": 4.67365459207858e-07, + "loss": 0.2268, + "step": 7240 + }, + { + "epoch": 4.713728041639557, + "grad_norm": 2.0999152660369873, + "learning_rate": 4.5748377299849045e-07, + "loss": 0.2201, + "step": 7245 + }, + { + "epoch": 4.716981132075472, + "grad_norm": 1.7107945680618286, + "learning_rate": 4.477067075029123e-07, + "loss": 0.2272, + "step": 7250 + }, + { + "epoch": 4.720234222511386, + "grad_norm": 1.8921961784362793, + "learning_rate": 4.3803430439915137e-07, + "loss": 0.2176, + "step": 7255 + }, + { + "epoch": 4.7234873129473, + "grad_norm": 2.4301459789276123, + "learning_rate": 4.284666049190644e-07, + "loss": 0.2146, + "step": 7260 + }, + { + "epoch": 4.726740403383214, + "grad_norm": 1.8662853240966797, + "learning_rate": 4.1900364984818754e-07, + "loss": 0.2483, + "step": 7265 + }, + { + "epoch": 4.729993493819128, + "grad_norm": 1.5476515293121338, + "learning_rate": 4.0964547952554443e-07, + "loss": 0.2265, + "step": 7270 + }, + { + "epoch": 4.733246584255042, + "grad_norm": 2.1670968532562256, + "learning_rate": 4.0039213384347187e-07, + "loss": 0.2384, + "step": 7275 + }, + { + "epoch": 4.736499674690957, + "grad_norm": 1.7863905429840088, + "learning_rate": 3.912436522474666e-07, + "loss": 0.2287, + "step": 7280 + }, + { + "epoch": 4.7397527651268705, + "grad_norm": 2.369838237762451, + "learning_rate": 3.822000737360026e-07, + "loss": 0.2362, + "step": 7285 + }, + { + "epoch": 4.743005855562784, + "grad_norm": 2.5364902019500732, + "learning_rate": 3.7326143686036706e-07, + "loss": 0.2211, + "step": 7290 + }, + { + "epoch": 4.746258945998699, + "grad_norm": 2.2473394870758057, + "learning_rate": 3.644277797244966e-07, + "loss": 0.2349, + "step": 7295 + }, + { + "epoch": 4.749512036434613, + "grad_norm": 2.3497040271759033, + "learning_rate": 3.556991399848275e-07, + "loss": 0.2254, + "step": 7300 + }, + { + "epoch": 4.752765126870527, + "grad_norm": 1.744187831878662, + "learning_rate": 3.4707555485011533e-07, + "loss": 0.2231, + "step": 7305 + }, + { + "epoch": 4.756018217306441, + "grad_norm": 2.6059043407440186, + "learning_rate": 3.385570610812794e-07, + "loss": 0.2141, + "step": 7310 + }, + { + "epoch": 4.759271307742355, + "grad_norm": 2.001283645629883, + "learning_rate": 3.3014369499126675e-07, + "loss": 0.2219, + "step": 7315 + }, + { + "epoch": 4.762524398178269, + "grad_norm": 2.1158323287963867, + "learning_rate": 3.218354924448719e-07, + "loss": 0.237, + "step": 7320 + }, + { + "epoch": 4.765777488614184, + "grad_norm": 1.978641152381897, + "learning_rate": 3.1363248885859506e-07, + "loss": 0.2225, + "step": 7325 + }, + { + "epoch": 4.7690305790500975, + "grad_norm": 2.592278003692627, + "learning_rate": 3.055347192004954e-07, + "loss": 0.2235, + "step": 7330 + }, + { + "epoch": 4.772283669486011, + "grad_norm": 3.1187829971313477, + "learning_rate": 2.9754221799003503e-07, + "loss": 0.2324, + "step": 7335 + }, + { + "epoch": 4.775536759921926, + "grad_norm": 1.6826578378677368, + "learning_rate": 2.8965501929792695e-07, + "loss": 0.2309, + "step": 7340 + }, + { + "epoch": 4.77878985035784, + "grad_norm": 1.6795811653137207, + "learning_rate": 2.818731567460098e-07, + "loss": 0.2087, + "step": 7345 + }, + { + "epoch": 4.782042940793754, + "grad_norm": 1.9984930753707886, + "learning_rate": 2.741966635070842e-07, + "loss": 0.22, + "step": 7350 + }, + { + "epoch": 4.785296031229668, + "grad_norm": 2.397368907928467, + "learning_rate": 2.6662557230477667e-07, + "loss": 0.227, + "step": 7355 + }, + { + "epoch": 4.788549121665582, + "grad_norm": 1.748075008392334, + "learning_rate": 2.5915991541340667e-07, + "loss": 0.2206, + "step": 7360 + }, + { + "epoch": 4.791802212101496, + "grad_norm": 2.152040481567383, + "learning_rate": 2.5179972465784186e-07, + "loss": 0.2214, + "step": 7365 + }, + { + "epoch": 4.795055302537411, + "grad_norm": 1.829852819442749, + "learning_rate": 2.4454503141336513e-07, + "loss": 0.213, + "step": 7370 + }, + { + "epoch": 4.798308392973325, + "grad_norm": 2.7500312328338623, + "learning_rate": 2.3739586660554148e-07, + "loss": 0.2188, + "step": 7375 + }, + { + "epoch": 4.801561483409239, + "grad_norm": 2.0260448455810547, + "learning_rate": 2.3035226071008997e-07, + "loss": 0.2264, + "step": 7380 + }, + { + "epoch": 4.804814573845153, + "grad_norm": 2.0971879959106445, + "learning_rate": 2.2341424375274256e-07, + "loss": 0.2382, + "step": 7385 + }, + { + "epoch": 4.808067664281067, + "grad_norm": 1.875805139541626, + "learning_rate": 2.165818453091245e-07, + "loss": 0.2181, + "step": 7390 + }, + { + "epoch": 4.811320754716981, + "grad_norm": 2.7413530349731445, + "learning_rate": 2.098550945046268e-07, + "loss": 0.2342, + "step": 7395 + }, + { + "epoch": 4.814573845152895, + "grad_norm": 1.8814730644226074, + "learning_rate": 2.0323402001428682e-07, + "loss": 0.2251, + "step": 7400 + }, + { + "epoch": 4.817826935588809, + "grad_norm": 2.818944215774536, + "learning_rate": 1.9671865006265223e-07, + "loss": 0.2247, + "step": 7405 + }, + { + "epoch": 4.821080026024724, + "grad_norm": 1.7889087200164795, + "learning_rate": 1.9030901242367837e-07, + "loss": 0.2401, + "step": 7410 + }, + { + "epoch": 4.824333116460638, + "grad_norm": 2.2675249576568604, + "learning_rate": 1.8400513442059786e-07, + "loss": 0.2343, + "step": 7415 + }, + { + "epoch": 4.827586206896552, + "grad_norm": 2.051952362060547, + "learning_rate": 1.7780704292580107e-07, + "loss": 0.2222, + "step": 7420 + }, + { + "epoch": 4.830839297332465, + "grad_norm": 1.9150989055633545, + "learning_rate": 1.7292475378629936e-07, + "loss": 0.2197, + "step": 7425 + }, + { + "epoch": 4.83409238776838, + "grad_norm": 2.003403902053833, + "learning_rate": 1.6691714428535288e-07, + "loss": 0.2274, + "step": 7430 + }, + { + "epoch": 4.837345478204294, + "grad_norm": 1.729634165763855, + "learning_rate": 1.6101539413598822e-07, + "loss": 0.2313, + "step": 7435 + }, + { + "epoch": 4.840598568640209, + "grad_norm": 2.2372894287109375, + "learning_rate": 1.5521952849639476e-07, + "loss": 0.2227, + "step": 7440 + }, + { + "epoch": 4.8438516590761225, + "grad_norm": 1.8825215101242065, + "learning_rate": 1.4952957207339802e-07, + "loss": 0.2182, + "step": 7445 + }, + { + "epoch": 4.847104749512036, + "grad_norm": 1.7914789915084839, + "learning_rate": 1.439455491223457e-07, + "loss": 0.2104, + "step": 7450 + }, + { + "epoch": 4.85035783994795, + "grad_norm": 1.8011417388916016, + "learning_rate": 1.3846748344701065e-07, + "loss": 0.2243, + "step": 7455 + }, + { + "epoch": 4.853610930383865, + "grad_norm": 1.737289547920227, + "learning_rate": 1.3309539839948538e-07, + "loss": 0.2272, + "step": 7460 + }, + { + "epoch": 4.856864020819779, + "grad_norm": 2.1911211013793945, + "learning_rate": 1.2782931688008482e-07, + "loss": 0.2273, + "step": 7465 + }, + { + "epoch": 4.860117111255693, + "grad_norm": 6.419521331787109, + "learning_rate": 1.2266926133725487e-07, + "loss": 0.2332, + "step": 7470 + }, + { + "epoch": 4.863370201691607, + "grad_norm": 7.40968656539917, + "learning_rate": 1.1761525376745575e-07, + "loss": 0.2092, + "step": 7475 + }, + { + "epoch": 4.866623292127521, + "grad_norm": 1.6772918701171875, + "learning_rate": 1.1266731571509815e-07, + "loss": 0.222, + "step": 7480 + }, + { + "epoch": 4.869876382563435, + "grad_norm": 1.780863881111145, + "learning_rate": 1.0782546827242667e-07, + "loss": 0.2239, + "step": 7485 + }, + { + "epoch": 4.8731294729993495, + "grad_norm": 1.984481930732727, + "learning_rate": 1.0308973207944217e-07, + "loss": 0.2212, + "step": 7490 + }, + { + "epoch": 4.876382563435263, + "grad_norm": 1.720576286315918, + "learning_rate": 9.846012732380727e-08, + "loss": 0.2308, + "step": 7495 + }, + { + "epoch": 4.879635653871178, + "grad_norm": 8.550189971923828, + "learning_rate": 9.483287143148001e-08, + "loss": 0.2275, + "step": 7500 + }, + { + "epoch": 4.882888744307092, + "grad_norm": 2.3628833293914795, + "learning_rate": 9.039435269181384e-08, + "loss": 0.2137, + "step": 7505 + }, + { + "epoch": 4.886141834743006, + "grad_norm": 2.029315233230591, + "learning_rate": 8.606201950781267e-08, + "loss": 0.2229, + "step": 7510 + }, + { + "epoch": 4.8893949251789195, + "grad_norm": 4.765905857086182, + "learning_rate": 8.183589034750639e-08, + "loss": 0.2361, + "step": 7515 + }, + { + "epoch": 4.892648015614834, + "grad_norm": 1.9303253889083862, + "learning_rate": 7.771598322618422e-08, + "loss": 0.2198, + "step": 7520 + }, + { + "epoch": 4.895901106050748, + "grad_norm": 2.502507209777832, + "learning_rate": 7.370231570633656e-08, + "loss": 0.2263, + "step": 7525 + }, + { + "epoch": 4.899154196486663, + "grad_norm": 2.3058369159698486, + "learning_rate": 6.979490489756601e-08, + "loss": 0.2224, + "step": 7530 + }, + { + "epoch": 4.9024072869225765, + "grad_norm": 1.8423314094543457, + "learning_rate": 6.599376745652641e-08, + "loss": 0.2237, + "step": 7535 + }, + { + "epoch": 4.90566037735849, + "grad_norm": 1.7330564260482788, + "learning_rate": 6.229891958683675e-08, + "loss": 0.2248, + "step": 7540 + }, + { + "epoch": 4.908913467794405, + "grad_norm": 1.9083019495010376, + "learning_rate": 5.8710377039031264e-08, + "loss": 0.2174, + "step": 7545 + }, + { + "epoch": 4.912166558230319, + "grad_norm": 4.153858661651611, + "learning_rate": 5.52281551104733e-08, + "loss": 0.2279, + "step": 7550 + }, + { + "epoch": 4.915419648666233, + "grad_norm": 1.6192256212234497, + "learning_rate": 5.185226864530546e-08, + "loss": 0.2263, + "step": 7555 + }, + { + "epoch": 4.918672739102147, + "grad_norm": 1.8693500757217407, + "learning_rate": 4.8582732034374575e-08, + "loss": 0.2234, + "step": 7560 + }, + { + "epoch": 4.921925829538061, + "grad_norm": 1.9380360841751099, + "learning_rate": 4.541955921518182e-08, + "loss": 0.2182, + "step": 7565 + }, + { + "epoch": 4.925178919973975, + "grad_norm": 1.9134066104888916, + "learning_rate": 4.236276367180769e-08, + "loss": 0.2283, + "step": 7570 + }, + { + "epoch": 4.92843201040989, + "grad_norm": 1.6409050226211548, + "learning_rate": 3.9412358434876003e-08, + "loss": 0.2325, + "step": 7575 + }, + { + "epoch": 4.931685100845804, + "grad_norm": 1.7989153861999512, + "learning_rate": 3.6568356081473354e-08, + "loss": 0.2357, + "step": 7580 + }, + { + "epoch": 4.934938191281717, + "grad_norm": 2.0496795177459717, + "learning_rate": 3.383076873511859e-08, + "loss": 0.2218, + "step": 7585 + }, + { + "epoch": 4.938191281717632, + "grad_norm": 1.7194660902023315, + "learning_rate": 3.119960806569344e-08, + "loss": 0.2324, + "step": 7590 + }, + { + "epoch": 4.941444372153546, + "grad_norm": 2.1285674571990967, + "learning_rate": 2.867488528940643e-08, + "loss": 0.2339, + "step": 7595 + }, + { + "epoch": 4.94469746258946, + "grad_norm": 2.114654779434204, + "learning_rate": 2.6256611168734568e-08, + "loss": 0.2403, + "step": 7600 + }, + { + "epoch": 4.9479505530253745, + "grad_norm": 7.625702857971191, + "learning_rate": 2.3944796012381754e-08, + "loss": 0.2255, + "step": 7605 + }, + { + "epoch": 4.951203643461288, + "grad_norm": 1.6961826086044312, + "learning_rate": 2.173944967523711e-08, + "loss": 0.2197, + "step": 7610 + }, + { + "epoch": 4.954456733897202, + "grad_norm": 3.5485808849334717, + "learning_rate": 1.9640581558330594e-08, + "loss": 0.2309, + "step": 7615 + }, + { + "epoch": 4.957709824333117, + "grad_norm": 1.928830862045288, + "learning_rate": 1.7648200608791353e-08, + "loss": 0.2205, + "step": 7620 + }, + { + "epoch": 4.960962914769031, + "grad_norm": 2.2189247608184814, + "learning_rate": 1.5762315319814425e-08, + "loss": 0.2159, + "step": 7625 + }, + { + "epoch": 4.964216005204944, + "grad_norm": 1.728050947189331, + "learning_rate": 1.3982933730613545e-08, + "loss": 0.223, + "step": 7630 + }, + { + "epoch": 4.967469095640859, + "grad_norm": 7.718484878540039, + "learning_rate": 1.2310063426404506e-08, + "loss": 0.2345, + "step": 7635 + }, + { + "epoch": 4.970722186076773, + "grad_norm": 1.8739633560180664, + "learning_rate": 1.0743711538357959e-08, + "loss": 0.2354, + "step": 7640 + }, + { + "epoch": 4.973975276512687, + "grad_norm": 2.505018711090088, + "learning_rate": 9.28388474357167e-09, + "loss": 0.2341, + "step": 7645 + }, + { + "epoch": 4.9772283669486015, + "grad_norm": 2.42279314994812, + "learning_rate": 7.930589265051081e-09, + "loss": 0.2242, + "step": 7650 + }, + { + "epoch": 4.980481457384515, + "grad_norm": 1.8265007734298706, + "learning_rate": 6.683830871667685e-09, + "loss": 0.2052, + "step": 7655 + }, + { + "epoch": 4.983734547820429, + "grad_norm": 1.9440053701400757, + "learning_rate": 5.543614878153469e-09, + "loss": 0.2132, + "step": 7660 + }, + { + "epoch": 4.986987638256344, + "grad_norm": 3.599294900894165, + "learning_rate": 4.509946145059285e-09, + "loss": 0.2236, + "step": 7665 + }, + { + "epoch": 4.990240728692258, + "grad_norm": 6.0209126472473145, + "learning_rate": 3.5828290787437436e-09, + "loss": 0.2376, + "step": 7670 + }, + { + "epoch": 4.9934938191281715, + "grad_norm": 1.537784218788147, + "learning_rate": 2.762267631356563e-09, + "loss": 0.2264, + "step": 7675 + }, + { + "epoch": 4.996746909564086, + "grad_norm": 2.2779605388641357, + "learning_rate": 2.0482653008163654e-09, + "loss": 0.2452, + "step": 7680 + }, + { + "epoch": 5.0, + "grad_norm": 3.292484998703003, + "learning_rate": 1.440825130796797e-09, + "loss": 0.2322, + "step": 7685 + }, + { + "epoch": 5.0, + "eval_f1": 0.801246098825816, + "eval_loss": 0.7490234375, + "eval_precision": 0.8011274883507161, + "eval_recall": 0.8013850778258188, + "eval_runtime": 255.3842, + "eval_samples_per_second": 1540.561, + "eval_steps_per_second": 1.508, + "step": 7685 + }, + { + "epoch": 5.0, + "step": 7685, + "total_flos": 1.664464427634026e+19, + "train_loss": 0.6255086388770993, + "train_runtime": 65052.9958, + "train_samples_per_second": 241.916, + "train_steps_per_second": 0.118 + } + ], + "logging_steps": 5, + "max_steps": 7685, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 5.0, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.664464427634026e+19, + "train_batch_size": 512, + "trial_name": null, + "trial_params": null +}