{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 105393, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014232444279980643, "grad_norm": 125.48007202148438, "learning_rate": 4.976279259533366e-05, "loss": 1.262, "step": 500 }, { "epoch": 0.028464888559961286, "grad_norm": 3.935548782348633, "learning_rate": 4.952558519066731e-05, "loss": 1.3135, "step": 1000 }, { "epoch": 0.04269733283994193, "grad_norm": 432.8229675292969, "learning_rate": 4.9288377786000975e-05, "loss": 1.3926, "step": 1500 }, { "epoch": 0.05692977711992257, "grad_norm": 44.97547149658203, "learning_rate": 4.9051170381334626e-05, "loss": 1.5597, "step": 2000 }, { "epoch": 0.07116222139990322, "grad_norm": 26.80133056640625, "learning_rate": 4.8813962976668284e-05, "loss": 1.4146, "step": 2500 }, { "epoch": 0.08539466567988387, "grad_norm": 86.9864273071289, "learning_rate": 4.8576755572001934e-05, "loss": 1.4553, "step": 3000 }, { "epoch": 0.09962710995986451, "grad_norm": 1.6426622867584229, "learning_rate": 4.83395481673356e-05, "loss": 1.2304, "step": 3500 }, { "epoch": 0.11385955423984515, "grad_norm": 280.5526428222656, "learning_rate": 4.810234076266925e-05, "loss": 1.2387, "step": 4000 }, { "epoch": 0.1280919985198258, "grad_norm": 19.09530258178711, "learning_rate": 4.786513335800291e-05, "loss": 1.5957, "step": 4500 }, { "epoch": 0.14232444279980644, "grad_norm": 8.927694320678711, "learning_rate": 4.762792595333656e-05, "loss": 1.3537, "step": 5000 }, { "epoch": 0.15655688707978707, "grad_norm": 214.30294799804688, "learning_rate": 4.7390718548670216e-05, "loss": 1.4233, "step": 5500 }, { "epoch": 0.17078933135976773, "grad_norm": 7.38736629486084, "learning_rate": 4.715351114400387e-05, "loss": 1.3794, "step": 6000 }, { "epoch": 0.18502177563974836, "grad_norm": 153.38523864746094, "learning_rate": 4.691630373933753e-05, "loss": 1.6875, "step": 6500 }, { "epoch": 0.19925421991972903, "grad_norm": 0.02275083400309086, "learning_rate": 4.667909633467118e-05, "loss": 1.4558, "step": 7000 }, { "epoch": 0.21348666419970966, "grad_norm": 320.4385070800781, "learning_rate": 4.644188893000484e-05, "loss": 1.3569, "step": 7500 }, { "epoch": 0.2277191084796903, "grad_norm": 3.4960036277770996, "learning_rate": 4.62046815253385e-05, "loss": 1.5315, "step": 8000 }, { "epoch": 0.24195155275967095, "grad_norm": 37.32798767089844, "learning_rate": 4.5967474120672155e-05, "loss": 1.4474, "step": 8500 }, { "epoch": 0.2561839970396516, "grad_norm": 46.48209762573242, "learning_rate": 4.573026671600581e-05, "loss": 1.3646, "step": 9000 }, { "epoch": 0.2704164413196322, "grad_norm": 0.00015533728583250195, "learning_rate": 4.549305931133946e-05, "loss": 1.1472, "step": 9500 }, { "epoch": 0.2846488855996129, "grad_norm": 0.002173429122194648, "learning_rate": 4.525585190667312e-05, "loss": 1.5633, "step": 10000 }, { "epoch": 0.29888132987959354, "grad_norm": 186.76336669921875, "learning_rate": 4.501864450200678e-05, "loss": 1.3533, "step": 10500 }, { "epoch": 0.31311377415957414, "grad_norm": 10.35462474822998, "learning_rate": 4.4781437097340436e-05, "loss": 1.4215, "step": 11000 }, { "epoch": 0.3273462184395548, "grad_norm": 0.01028844341635704, "learning_rate": 4.454422969267409e-05, "loss": 1.3768, "step": 11500 }, { "epoch": 0.34157866271953546, "grad_norm": 0.011700475588440895, "learning_rate": 4.4307022288007744e-05, "loss": 1.3978, "step": 12000 }, { "epoch": 0.3558111069995161, "grad_norm": 487.6321105957031, "learning_rate": 4.4069814883341395e-05, "loss": 1.3882, "step": 12500 }, { "epoch": 0.37004355127949673, "grad_norm": 7.461075782775879, "learning_rate": 4.383260747867506e-05, "loss": 1.4489, "step": 13000 }, { "epoch": 0.3842759955594774, "grad_norm": 0.00012495643750298768, "learning_rate": 4.359540007400871e-05, "loss": 1.413, "step": 13500 }, { "epoch": 0.39850843983945805, "grad_norm": 70.00542449951172, "learning_rate": 4.335819266934237e-05, "loss": 1.3233, "step": 14000 }, { "epoch": 0.41274088411943866, "grad_norm": 167.84930419921875, "learning_rate": 4.3120985264676026e-05, "loss": 1.425, "step": 14500 }, { "epoch": 0.4269733283994193, "grad_norm": 285.5869445800781, "learning_rate": 4.2883777860009676e-05, "loss": 1.4104, "step": 15000 }, { "epoch": 0.4412057726794, "grad_norm": 99.11154174804688, "learning_rate": 4.264657045534334e-05, "loss": 1.3, "step": 15500 }, { "epoch": 0.4554382169593806, "grad_norm": 9.479164145886898e-05, "learning_rate": 4.240936305067699e-05, "loss": 1.2376, "step": 16000 }, { "epoch": 0.46967066123936124, "grad_norm": 33.981483459472656, "learning_rate": 4.217215564601065e-05, "loss": 1.5736, "step": 16500 }, { "epoch": 0.4839031055193419, "grad_norm": 2.699108839035034, "learning_rate": 4.19349482413443e-05, "loss": 1.4016, "step": 17000 }, { "epoch": 0.49813554979932256, "grad_norm": 0.3147715628147125, "learning_rate": 4.1697740836677964e-05, "loss": 1.3702, "step": 17500 }, { "epoch": 0.5123679940793032, "grad_norm": 0.14605668187141418, "learning_rate": 4.1460533432011615e-05, "loss": 1.3285, "step": 18000 }, { "epoch": 0.5266004383592838, "grad_norm": 0.1937469094991684, "learning_rate": 4.122332602734527e-05, "loss": 1.348, "step": 18500 }, { "epoch": 0.5408328826392644, "grad_norm": 89.56853485107422, "learning_rate": 4.0986118622678924e-05, "loss": 1.482, "step": 19000 }, { "epoch": 0.5550653269192451, "grad_norm": 0.07370008528232574, "learning_rate": 4.074891121801258e-05, "loss": 1.2254, "step": 19500 }, { "epoch": 0.5692977711992258, "grad_norm": 199.52772521972656, "learning_rate": 4.051170381334624e-05, "loss": 1.4102, "step": 20000 }, { "epoch": 0.5835302154792064, "grad_norm": 45.664859771728516, "learning_rate": 4.0274496408679896e-05, "loss": 1.3631, "step": 20500 }, { "epoch": 0.5977626597591871, "grad_norm": 3.712595798788243e-06, "learning_rate": 4.003728900401355e-05, "loss": 1.3419, "step": 21000 }, { "epoch": 0.6119951040391677, "grad_norm": 8.145997708197683e-05, "learning_rate": 3.9800081599347205e-05, "loss": 1.3845, "step": 21500 }, { "epoch": 0.6262275483191483, "grad_norm": 248.4174346923828, "learning_rate": 3.956287419468086e-05, "loss": 1.1569, "step": 22000 }, { "epoch": 0.640459992599129, "grad_norm": 3.249052679166198e-05, "learning_rate": 3.932566679001452e-05, "loss": 1.306, "step": 22500 }, { "epoch": 0.6546924368791096, "grad_norm": 3.5558118725020904e-06, "learning_rate": 3.908845938534818e-05, "loss": 1.4762, "step": 23000 }, { "epoch": 0.6689248811590902, "grad_norm": 4.918412208557129, "learning_rate": 3.885125198068183e-05, "loss": 1.2786, "step": 23500 }, { "epoch": 0.6831573254390709, "grad_norm": 4.046954154968262, "learning_rate": 3.8614044576015486e-05, "loss": 1.4232, "step": 24000 }, { "epoch": 0.6973897697190515, "grad_norm": 139.35830688476562, "learning_rate": 3.8376837171349144e-05, "loss": 1.2306, "step": 24500 }, { "epoch": 0.7116222139990322, "grad_norm": 0.0009823115542531013, "learning_rate": 3.81396297666828e-05, "loss": 1.2463, "step": 25000 }, { "epoch": 0.7258546582790129, "grad_norm": 2.3099827766418457, "learning_rate": 3.790242236201645e-05, "loss": 1.4433, "step": 25500 }, { "epoch": 0.7400871025589935, "grad_norm": 137.370361328125, "learning_rate": 3.766521495735011e-05, "loss": 1.3623, "step": 26000 }, { "epoch": 0.7543195468389742, "grad_norm": 0.5372253656387329, "learning_rate": 3.742800755268376e-05, "loss": 1.432, "step": 26500 }, { "epoch": 0.7685519911189548, "grad_norm": 3.4454299679964606e-07, "learning_rate": 3.7190800148017425e-05, "loss": 1.4002, "step": 27000 }, { "epoch": 0.7827844353989354, "grad_norm": 0.3542407155036926, "learning_rate": 3.6953592743351076e-05, "loss": 1.3797, "step": 27500 }, { "epoch": 0.7970168796789161, "grad_norm": 0.013401851058006287, "learning_rate": 3.6716385338684733e-05, "loss": 1.2473, "step": 28000 }, { "epoch": 0.8112493239588967, "grad_norm": 251.7378692626953, "learning_rate": 3.647917793401839e-05, "loss": 1.2836, "step": 28500 }, { "epoch": 0.8254817682388773, "grad_norm": 0.6462361216545105, "learning_rate": 3.624197052935205e-05, "loss": 1.3998, "step": 29000 }, { "epoch": 0.839714212518858, "grad_norm": 0.006176416762173176, "learning_rate": 3.6004763124685706e-05, "loss": 1.2388, "step": 29500 }, { "epoch": 0.8539466567988386, "grad_norm": 0.6315366625785828, "learning_rate": 3.576755572001936e-05, "loss": 1.3803, "step": 30000 }, { "epoch": 0.8681791010788192, "grad_norm": 0.3394842743873596, "learning_rate": 3.5530348315353015e-05, "loss": 1.3189, "step": 30500 }, { "epoch": 0.8824115453588, "grad_norm": 64.10663604736328, "learning_rate": 3.5293140910686666e-05, "loss": 1.3589, "step": 31000 }, { "epoch": 0.8966439896387806, "grad_norm": 0.08269359171390533, "learning_rate": 3.505593350602033e-05, "loss": 1.2997, "step": 31500 }, { "epoch": 0.9108764339187612, "grad_norm": 101.3696060180664, "learning_rate": 3.481872610135398e-05, "loss": 1.3141, "step": 32000 }, { "epoch": 0.9251088781987419, "grad_norm": 84.51553344726562, "learning_rate": 3.458151869668764e-05, "loss": 1.2003, "step": 32500 }, { "epoch": 0.9393413224787225, "grad_norm": 17.5670108795166, "learning_rate": 3.434431129202129e-05, "loss": 1.2641, "step": 33000 }, { "epoch": 0.9535737667587031, "grad_norm": 1.7186918258666992, "learning_rate": 3.4107103887354954e-05, "loss": 1.3127, "step": 33500 }, { "epoch": 0.9678062110386838, "grad_norm": 173.8558807373047, "learning_rate": 3.3869896482688604e-05, "loss": 1.2033, "step": 34000 }, { "epoch": 0.9820386553186644, "grad_norm": 0.0010143565014004707, "learning_rate": 3.363268907802226e-05, "loss": 1.3354, "step": 34500 }, { "epoch": 0.9962710995986451, "grad_norm": 0.00039917067624628544, "learning_rate": 3.339548167335591e-05, "loss": 1.2617, "step": 35000 }, { "epoch": 1.0105035438786256, "grad_norm": 327.86859130859375, "learning_rate": 3.315827426868957e-05, "loss": 1.1885, "step": 35500 }, { "epoch": 1.0247359881586064, "grad_norm": 244.52615356445312, "learning_rate": 3.292106686402323e-05, "loss": 1.2093, "step": 36000 }, { "epoch": 1.038968432438587, "grad_norm": 7.460901997546898e-06, "learning_rate": 3.2683859459356886e-05, "loss": 1.5839, "step": 36500 }, { "epoch": 1.0532008767185677, "grad_norm": 1.3049445152282715, "learning_rate": 3.244665205469054e-05, "loss": 1.2053, "step": 37000 }, { "epoch": 1.0674333209985483, "grad_norm": 3.011463718394225e-07, "learning_rate": 3.2209444650024194e-05, "loss": 1.2656, "step": 37500 }, { "epoch": 1.0816657652785289, "grad_norm": 3.12375807762146, "learning_rate": 3.197223724535785e-05, "loss": 1.281, "step": 38000 }, { "epoch": 1.0958982095585097, "grad_norm": 0.007164264563471079, "learning_rate": 3.173502984069151e-05, "loss": 1.2791, "step": 38500 }, { "epoch": 1.1101306538384903, "grad_norm": 1.040968345478177e-05, "learning_rate": 3.149782243602517e-05, "loss": 1.3826, "step": 39000 }, { "epoch": 1.124363098118471, "grad_norm": 0.4420815110206604, "learning_rate": 3.126061503135882e-05, "loss": 1.3168, "step": 39500 }, { "epoch": 1.1385955423984515, "grad_norm": 212.03660583496094, "learning_rate": 3.1023407626692475e-05, "loss": 1.2582, "step": 40000 }, { "epoch": 1.1528279866784321, "grad_norm": 90.19596099853516, "learning_rate": 3.078620022202613e-05, "loss": 1.301, "step": 40500 }, { "epoch": 1.1670604309584127, "grad_norm": 2.8100829124450684, "learning_rate": 3.054899281735979e-05, "loss": 1.1346, "step": 41000 }, { "epoch": 1.1812928752383933, "grad_norm": 218.48757934570312, "learning_rate": 3.0311785412693445e-05, "loss": 1.2074, "step": 41500 }, { "epoch": 1.1955253195183742, "grad_norm": 157.1456298828125, "learning_rate": 3.00745780080271e-05, "loss": 1.1437, "step": 42000 }, { "epoch": 1.2097577637983548, "grad_norm": 0.2799847722053528, "learning_rate": 2.9837370603360753e-05, "loss": 1.2164, "step": 42500 }, { "epoch": 1.2239902080783354, "grad_norm": 4.315948963165283, "learning_rate": 2.9600163198694414e-05, "loss": 1.1489, "step": 43000 }, { "epoch": 1.238222652358316, "grad_norm": 8.09383487701416, "learning_rate": 2.936295579402807e-05, "loss": 1.2479, "step": 43500 }, { "epoch": 1.2524550966382968, "grad_norm": 357.42626953125, "learning_rate": 2.9125748389361723e-05, "loss": 1.1489, "step": 44000 }, { "epoch": 1.2666875409182774, "grad_norm": 0.011236801743507385, "learning_rate": 2.8888540984695377e-05, "loss": 1.2963, "step": 44500 }, { "epoch": 1.280919985198258, "grad_norm": 356.8849792480469, "learning_rate": 2.8651333580029034e-05, "loss": 1.2021, "step": 45000 }, { "epoch": 1.2951524294782386, "grad_norm": 1.403478741645813, "learning_rate": 2.8414126175362692e-05, "loss": 1.2875, "step": 45500 }, { "epoch": 1.3093848737582192, "grad_norm": 0.10168521851301193, "learning_rate": 2.8176918770696346e-05, "loss": 1.1594, "step": 46000 }, { "epoch": 1.3236173180381998, "grad_norm": 0.0002031529729720205, "learning_rate": 2.7939711366030004e-05, "loss": 1.1856, "step": 46500 }, { "epoch": 1.3378497623181804, "grad_norm": 0.19232670962810516, "learning_rate": 2.7702503961363658e-05, "loss": 1.3044, "step": 47000 }, { "epoch": 1.3520822065981613, "grad_norm": 3.5715222358703613, "learning_rate": 2.746529655669732e-05, "loss": 1.1655, "step": 47500 }, { "epoch": 1.3663146508781419, "grad_norm": 187.40589904785156, "learning_rate": 2.7228089152030973e-05, "loss": 1.2214, "step": 48000 }, { "epoch": 1.3805470951581225, "grad_norm": 0.8012977242469788, "learning_rate": 2.6990881747364628e-05, "loss": 1.2209, "step": 48500 }, { "epoch": 1.394779539438103, "grad_norm": 57.39756774902344, "learning_rate": 2.6753674342698282e-05, "loss": 1.3786, "step": 49000 }, { "epoch": 1.4090119837180837, "grad_norm": 7.3047776222229, "learning_rate": 2.6516466938031936e-05, "loss": 1.3933, "step": 49500 }, { "epoch": 1.4232444279980645, "grad_norm": 510.1078796386719, "learning_rate": 2.6279259533365597e-05, "loss": 1.3421, "step": 50000 }, { "epoch": 1.437476872278045, "grad_norm": 0.0005555571406148374, "learning_rate": 2.604205212869925e-05, "loss": 1.4459, "step": 50500 }, { "epoch": 1.4517093165580257, "grad_norm": 357.9839172363281, "learning_rate": 2.5804844724032905e-05, "loss": 1.5949, "step": 51000 }, { "epoch": 1.4659417608380063, "grad_norm": 10.876310348510742, "learning_rate": 2.556763731936656e-05, "loss": 1.5693, "step": 51500 }, { "epoch": 1.480174205117987, "grad_norm": 0.9166773557662964, "learning_rate": 2.533042991470022e-05, "loss": 1.5855, "step": 52000 }, { "epoch": 1.4944066493979675, "grad_norm": 0.0008015409694053233, "learning_rate": 2.5093222510033875e-05, "loss": 1.5393, "step": 52500 }, { "epoch": 1.5086390936779481, "grad_norm": 565.9046630859375, "learning_rate": 2.485601510536753e-05, "loss": 1.4058, "step": 53000 }, { "epoch": 1.5228715379579287, "grad_norm": 0.002393967006355524, "learning_rate": 2.4618807700701187e-05, "loss": 1.5389, "step": 53500 }, { "epoch": 1.5371039822379096, "grad_norm": 396.0699157714844, "learning_rate": 2.4381600296034844e-05, "loss": 1.6004, "step": 54000 }, { "epoch": 1.5513364265178902, "grad_norm": 0.6404986381530762, "learning_rate": 2.41443928913685e-05, "loss": 1.5296, "step": 54500 }, { "epoch": 1.565568870797871, "grad_norm": 335.4157409667969, "learning_rate": 2.3907185486702156e-05, "loss": 1.4802, "step": 55000 }, { "epoch": 1.5798013150778516, "grad_norm": 210.5367431640625, "learning_rate": 2.366997808203581e-05, "loss": 1.2892, "step": 55500 }, { "epoch": 1.5940337593578322, "grad_norm": 0.0008832117891870439, "learning_rate": 2.3432770677369468e-05, "loss": 1.305, "step": 56000 }, { "epoch": 1.6082662036378128, "grad_norm": 157.38621520996094, "learning_rate": 2.3195563272703122e-05, "loss": 1.3269, "step": 56500 }, { "epoch": 1.6224986479177934, "grad_norm": 0.49598127603530884, "learning_rate": 2.2958355868036776e-05, "loss": 0.9746, "step": 57000 }, { "epoch": 1.636731092197774, "grad_norm": 5.005952630199317e-07, "learning_rate": 2.2721148463370434e-05, "loss": 1.3408, "step": 57500 }, { "epoch": 1.6509635364777546, "grad_norm": 0.018824387341737747, "learning_rate": 2.2483941058704088e-05, "loss": 1.2307, "step": 58000 }, { "epoch": 1.6651959807577352, "grad_norm": 0.004480020143091679, "learning_rate": 2.2246733654037746e-05, "loss": 1.2367, "step": 58500 }, { "epoch": 1.6794284250377158, "grad_norm": 11.437743186950684, "learning_rate": 2.20095262493714e-05, "loss": 1.3383, "step": 59000 }, { "epoch": 1.6936608693176967, "grad_norm": 0.06036483868956566, "learning_rate": 2.1772318844705058e-05, "loss": 1.3084, "step": 59500 }, { "epoch": 1.7078933135976773, "grad_norm": 1.3579503297805786, "learning_rate": 2.1535111440038712e-05, "loss": 1.4009, "step": 60000 }, { "epoch": 1.7221257578776579, "grad_norm": 21.6284236907959, "learning_rate": 2.129790403537237e-05, "loss": 1.3342, "step": 60500 }, { "epoch": 1.7363582021576387, "grad_norm": 29.55640983581543, "learning_rate": 2.1060696630706027e-05, "loss": 1.0924, "step": 61000 }, { "epoch": 1.7505906464376193, "grad_norm": 173.9644012451172, "learning_rate": 2.082348922603968e-05, "loss": 1.2447, "step": 61500 }, { "epoch": 1.7648230907176, "grad_norm": 6.281250476837158, "learning_rate": 2.058628182137334e-05, "loss": 1.3199, "step": 62000 }, { "epoch": 1.7790555349975805, "grad_norm": 18.333364486694336, "learning_rate": 2.0349074416706993e-05, "loss": 1.2088, "step": 62500 }, { "epoch": 1.7932879792775611, "grad_norm": 0.01794009655714035, "learning_rate": 2.011186701204065e-05, "loss": 1.2375, "step": 63000 }, { "epoch": 1.8075204235575417, "grad_norm": 2.168732166290283, "learning_rate": 1.9874659607374305e-05, "loss": 1.2492, "step": 63500 }, { "epoch": 1.8217528678375223, "grad_norm": 8.638180588604882e-05, "learning_rate": 1.963745220270796e-05, "loss": 1.3141, "step": 64000 }, { "epoch": 1.835985312117503, "grad_norm": 19.142189025878906, "learning_rate": 1.9400244798041617e-05, "loss": 1.2977, "step": 64500 }, { "epoch": 1.8502177563974838, "grad_norm": 308.3004455566406, "learning_rate": 1.916303739337527e-05, "loss": 1.1852, "step": 65000 }, { "epoch": 1.8644502006774644, "grad_norm": 1.4798052783149274e-09, "learning_rate": 1.892582998870893e-05, "loss": 1.2323, "step": 65500 }, { "epoch": 1.878682644957445, "grad_norm": 5.560269832611084, "learning_rate": 1.8688622584042583e-05, "loss": 1.4425, "step": 66000 }, { "epoch": 1.8929150892374258, "grad_norm": 0.2333667278289795, "learning_rate": 1.845141517937624e-05, "loss": 1.2208, "step": 66500 }, { "epoch": 1.9071475335174064, "grad_norm": 0.011270904913544655, "learning_rate": 1.8214207774709895e-05, "loss": 1.1584, "step": 67000 }, { "epoch": 1.921379977797387, "grad_norm": 291.5809020996094, "learning_rate": 1.7977000370043552e-05, "loss": 1.4176, "step": 67500 }, { "epoch": 1.9356124220773676, "grad_norm": 1.7415293455123901, "learning_rate": 1.773979296537721e-05, "loss": 1.1936, "step": 68000 }, { "epoch": 1.9498448663573482, "grad_norm": 85.77689361572266, "learning_rate": 1.7502585560710864e-05, "loss": 0.9786, "step": 68500 }, { "epoch": 1.9640773106373288, "grad_norm": 208.44168090820312, "learning_rate": 1.726537815604452e-05, "loss": 1.3196, "step": 69000 }, { "epoch": 1.9783097549173094, "grad_norm": 5.191640853881836, "learning_rate": 1.7028170751378176e-05, "loss": 1.2343, "step": 69500 }, { "epoch": 1.99254219919729, "grad_norm": 0.000520756293553859, "learning_rate": 1.6790963346711833e-05, "loss": 1.2603, "step": 70000 }, { "epoch": 2.0067746434772706, "grad_norm": 0.48437151312828064, "learning_rate": 1.6553755942045488e-05, "loss": 1.1047, "step": 70500 }, { "epoch": 2.0210070877572512, "grad_norm": 66.11140441894531, "learning_rate": 1.6316548537379145e-05, "loss": 1.2467, "step": 71000 }, { "epoch": 2.0352395320372323, "grad_norm": 6.315969949355349e-05, "learning_rate": 1.60793411327128e-05, "loss": 1.1051, "step": 71500 }, { "epoch": 2.049471976317213, "grad_norm": 0.015222056768834591, "learning_rate": 1.5842133728046454e-05, "loss": 1.1647, "step": 72000 }, { "epoch": 2.0637044205971935, "grad_norm": 0.0003228056593798101, "learning_rate": 1.560492632338011e-05, "loss": 1.1471, "step": 72500 }, { "epoch": 2.077936864877174, "grad_norm": 0.00011044983693864197, "learning_rate": 1.5367718918713766e-05, "loss": 1.0928, "step": 73000 }, { "epoch": 2.0921693091571547, "grad_norm": 0.002029678551480174, "learning_rate": 1.5130511514047425e-05, "loss": 1.2604, "step": 73500 }, { "epoch": 2.1064017534371353, "grad_norm": 437.6989440917969, "learning_rate": 1.4893304109381079e-05, "loss": 1.1682, "step": 74000 }, { "epoch": 2.120634197717116, "grad_norm": 0.008761188015341759, "learning_rate": 1.4656096704714737e-05, "loss": 1.2494, "step": 74500 }, { "epoch": 2.1348666419970965, "grad_norm": 22.72408676147461, "learning_rate": 1.4418889300048391e-05, "loss": 1.2037, "step": 75000 }, { "epoch": 2.149099086277077, "grad_norm": 0.01661345176398754, "learning_rate": 1.4181681895382045e-05, "loss": 1.1032, "step": 75500 }, { "epoch": 2.1633315305570577, "grad_norm": 194.62342834472656, "learning_rate": 1.3944474490715703e-05, "loss": 1.4635, "step": 76000 }, { "epoch": 2.1775639748370383, "grad_norm": 1.419067621231079, "learning_rate": 1.3707267086049359e-05, "loss": 1.1742, "step": 76500 }, { "epoch": 2.1917964191170194, "grad_norm": 0.3263187110424042, "learning_rate": 1.3470059681383016e-05, "loss": 1.2213, "step": 77000 }, { "epoch": 2.206028863397, "grad_norm": 341.4047546386719, "learning_rate": 1.323285227671667e-05, "loss": 1.3171, "step": 77500 }, { "epoch": 2.2202613076769806, "grad_norm": 0.010969799011945724, "learning_rate": 1.2995644872050328e-05, "loss": 1.1598, "step": 78000 }, { "epoch": 2.234493751956961, "grad_norm": 1.907973289489746, "learning_rate": 1.2758437467383982e-05, "loss": 1.233, "step": 78500 }, { "epoch": 2.248726196236942, "grad_norm": 0.5124784708023071, "learning_rate": 1.2521230062717636e-05, "loss": 1.034, "step": 79000 }, { "epoch": 2.2629586405169224, "grad_norm": 3.334690177325683e-07, "learning_rate": 1.2284022658051294e-05, "loss": 1.1803, "step": 79500 }, { "epoch": 2.277191084796903, "grad_norm": 147.6239471435547, "learning_rate": 1.204681525338495e-05, "loss": 1.1318, "step": 80000 }, { "epoch": 2.2914235290768836, "grad_norm": 0.00035827644751407206, "learning_rate": 1.1809607848718608e-05, "loss": 1.1722, "step": 80500 }, { "epoch": 2.3056559733568642, "grad_norm": 4.226629810033522e-11, "learning_rate": 1.1572400444052262e-05, "loss": 1.102, "step": 81000 }, { "epoch": 2.319888417636845, "grad_norm": 0.029256457462906837, "learning_rate": 1.1335193039385918e-05, "loss": 1.3087, "step": 81500 }, { "epoch": 2.3341208619168254, "grad_norm": 0.40314897894859314, "learning_rate": 1.1097985634719574e-05, "loss": 1.2656, "step": 82000 }, { "epoch": 2.348353306196806, "grad_norm": 0.001224256120622158, "learning_rate": 1.086077823005323e-05, "loss": 1.1994, "step": 82500 }, { "epoch": 2.3625857504767867, "grad_norm": 0.288789302110672, "learning_rate": 1.0623570825386885e-05, "loss": 1.4443, "step": 83000 }, { "epoch": 2.3768181947567677, "grad_norm": 329.513671875, "learning_rate": 1.0386363420720541e-05, "loss": 1.1175, "step": 83500 }, { "epoch": 2.3910506390367483, "grad_norm": 0.004081339109688997, "learning_rate": 1.0149156016054199e-05, "loss": 1.0974, "step": 84000 }, { "epoch": 2.405283083316729, "grad_norm": 30.36007308959961, "learning_rate": 9.911948611387853e-06, "loss": 1.1901, "step": 84500 }, { "epoch": 2.4195155275967095, "grad_norm": 235.61441040039062, "learning_rate": 9.674741206721509e-06, "loss": 1.2007, "step": 85000 }, { "epoch": 2.43374797187669, "grad_norm": 312.31610107421875, "learning_rate": 9.437533802055165e-06, "loss": 1.1889, "step": 85500 }, { "epoch": 2.4479804161566707, "grad_norm": 17.370275497436523, "learning_rate": 9.200326397388821e-06, "loss": 1.254, "step": 86000 }, { "epoch": 2.4622128604366513, "grad_norm": 199.55438232421875, "learning_rate": 8.963118992722477e-06, "loss": 1.1899, "step": 86500 }, { "epoch": 2.476445304716632, "grad_norm": 0.0002413564798189327, "learning_rate": 8.725911588056133e-06, "loss": 1.248, "step": 87000 }, { "epoch": 2.4906777489966125, "grad_norm": 230.59945678710938, "learning_rate": 8.48870418338979e-06, "loss": 1.0638, "step": 87500 }, { "epoch": 2.5049101932765936, "grad_norm": 9.951874879732259e-09, "learning_rate": 8.251496778723446e-06, "loss": 1.4321, "step": 88000 }, { "epoch": 2.519142637556574, "grad_norm": 21.62441062927246, "learning_rate": 8.0142893740571e-06, "loss": 1.101, "step": 88500 }, { "epoch": 2.533375081836555, "grad_norm": 2.4028645384532865e-06, "learning_rate": 7.777081969390756e-06, "loss": 1.2014, "step": 89000 }, { "epoch": 2.5476075261165354, "grad_norm": 164.88087463378906, "learning_rate": 7.539874564724412e-06, "loss": 1.0345, "step": 89500 }, { "epoch": 2.561839970396516, "grad_norm": 6.1404312745594325e-09, "learning_rate": 7.302667160058069e-06, "loss": 1.3437, "step": 90000 }, { "epoch": 2.5760724146764966, "grad_norm": 0.0006744582788087428, "learning_rate": 7.065459755391725e-06, "loss": 1.2536, "step": 90500 }, { "epoch": 2.590304858956477, "grad_norm": 213.77198791503906, "learning_rate": 6.828252350725381e-06, "loss": 1.2701, "step": 91000 }, { "epoch": 2.604537303236458, "grad_norm": 323.82061767578125, "learning_rate": 6.591044946059037e-06, "loss": 1.2165, "step": 91500 }, { "epoch": 2.6187697475164384, "grad_norm": 0.0007768824580125511, "learning_rate": 6.353837541392692e-06, "loss": 0.9871, "step": 92000 }, { "epoch": 2.633002191796419, "grad_norm": 1.1964664814456683e-08, "learning_rate": 6.116630136726349e-06, "loss": 1.1809, "step": 92500 }, { "epoch": 2.6472346360763996, "grad_norm": 0.01886417344212532, "learning_rate": 5.879422732060004e-06, "loss": 1.2455, "step": 93000 }, { "epoch": 2.6614670803563802, "grad_norm": 1.3518683910369873, "learning_rate": 5.6422153273936604e-06, "loss": 1.0477, "step": 93500 }, { "epoch": 2.675699524636361, "grad_norm": 89.34034729003906, "learning_rate": 5.405007922727316e-06, "loss": 1.3728, "step": 94000 }, { "epoch": 2.6899319689163415, "grad_norm": 24.910266876220703, "learning_rate": 5.167800518060972e-06, "loss": 1.2636, "step": 94500 }, { "epoch": 2.7041644131963225, "grad_norm": 50.8535270690918, "learning_rate": 4.930593113394627e-06, "loss": 1.396, "step": 95000 }, { "epoch": 2.718396857476303, "grad_norm": 0.004643031395971775, "learning_rate": 4.693385708728284e-06, "loss": 1.1522, "step": 95500 }, { "epoch": 2.7326293017562837, "grad_norm": 0.933087944984436, "learning_rate": 4.45617830406194e-06, "loss": 1.3026, "step": 96000 }, { "epoch": 2.7468617460362643, "grad_norm": 3.416786000798311e-07, "learning_rate": 4.218970899395596e-06, "loss": 1.0283, "step": 96500 }, { "epoch": 2.761094190316245, "grad_norm": 130.11692810058594, "learning_rate": 3.981763494729252e-06, "loss": 1.3739, "step": 97000 }, { "epoch": 2.7753266345962255, "grad_norm": 2.1324740373529494e-05, "learning_rate": 3.7445560900629077e-06, "loss": 1.3277, "step": 97500 }, { "epoch": 2.789559078876206, "grad_norm": 0.00010976188787026331, "learning_rate": 3.5073486853965636e-06, "loss": 1.2325, "step": 98000 }, { "epoch": 2.8037915231561867, "grad_norm": 238.07444763183594, "learning_rate": 3.270141280730219e-06, "loss": 1.1932, "step": 98500 }, { "epoch": 2.8180239674361673, "grad_norm": 0.0008449407760053873, "learning_rate": 3.032933876063875e-06, "loss": 1.2113, "step": 99000 }, { "epoch": 2.8322564117161484, "grad_norm": 2.691925048828125, "learning_rate": 2.7957264713975314e-06, "loss": 1.0264, "step": 99500 }, { "epoch": 2.846488855996129, "grad_norm": 1.2127447128295898, "learning_rate": 2.5585190667311873e-06, "loss": 1.2739, "step": 100000 }, { "epoch": 2.8607213002761096, "grad_norm": 2.0609796047210693, "learning_rate": 2.321311662064843e-06, "loss": 1.0268, "step": 100500 }, { "epoch": 2.87495374455609, "grad_norm": 0.4568251669406891, "learning_rate": 2.084104257398499e-06, "loss": 1.3471, "step": 101000 }, { "epoch": 2.889186188836071, "grad_norm": 239.36712646484375, "learning_rate": 1.846896852732155e-06, "loss": 1.1087, "step": 101500 }, { "epoch": 2.9034186331160514, "grad_norm": 0.14541205763816833, "learning_rate": 1.609689448065811e-06, "loss": 1.0582, "step": 102000 }, { "epoch": 2.917651077396032, "grad_norm": 247.89581298828125, "learning_rate": 1.3724820433994668e-06, "loss": 1.1325, "step": 102500 }, { "epoch": 2.9318835216760126, "grad_norm": 0.002974768402054906, "learning_rate": 1.1352746387331228e-06, "loss": 1.3426, "step": 103000 }, { "epoch": 2.9461159659559932, "grad_norm": 0.0011548080947250128, "learning_rate": 8.980672340667787e-07, "loss": 1.1403, "step": 103500 }, { "epoch": 2.960348410235974, "grad_norm": 0.3729552924633026, "learning_rate": 6.608598294004346e-07, "loss": 1.1825, "step": 104000 }, { "epoch": 2.9745808545159544, "grad_norm": 268.66912841796875, "learning_rate": 4.236524247340905e-07, "loss": 1.3315, "step": 104500 }, { "epoch": 2.988813298795935, "grad_norm": 43.98519515991211, "learning_rate": 1.8644502006774646e-07, "loss": 1.2468, "step": 105000 } ], "logging_steps": 500, "max_steps": 105393, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }