{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.997953488372093, "eval_steps": 500, "global_step": 2013, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014883720930232559, "grad_norm": 8.902280630176788, "learning_rate": 4.950495049504951e-07, "loss": 0.8797, "step": 10 }, { "epoch": 0.029767441860465118, "grad_norm": 3.3029491558899418, "learning_rate": 9.900990099009902e-07, "loss": 0.774, "step": 20 }, { "epoch": 0.044651162790697675, "grad_norm": 1.9413977734691883, "learning_rate": 1.4851485148514852e-06, "loss": 0.6921, "step": 30 }, { "epoch": 0.059534883720930236, "grad_norm": 1.5937915147958766, "learning_rate": 1.9801980198019803e-06, "loss": 0.6479, "step": 40 }, { "epoch": 0.07441860465116279, "grad_norm": 1.6800049101706125, "learning_rate": 2.4752475247524753e-06, "loss": 0.6181, "step": 50 }, { "epoch": 0.08930232558139535, "grad_norm": 2.31629043986904, "learning_rate": 2.9702970297029703e-06, "loss": 0.598, "step": 60 }, { "epoch": 0.10418604651162791, "grad_norm": 1.813171169001016, "learning_rate": 3.4653465346534653e-06, "loss": 0.5869, "step": 70 }, { "epoch": 0.11906976744186047, "grad_norm": 2.4807755593084577, "learning_rate": 3.960396039603961e-06, "loss": 0.5731, "step": 80 }, { "epoch": 0.13395348837209303, "grad_norm": 2.627244335259517, "learning_rate": 4.455445544554456e-06, "loss": 0.5715, "step": 90 }, { "epoch": 0.14883720930232558, "grad_norm": 2.1854899385391042, "learning_rate": 4.950495049504951e-06, "loss": 0.5589, "step": 100 }, { "epoch": 0.16372093023255813, "grad_norm": 2.1643320968352504, "learning_rate": 4.999753989526703e-06, "loss": 0.5591, "step": 110 }, { "epoch": 0.1786046511627907, "grad_norm": 1.6906833870682114, "learning_rate": 4.998903652018798e-06, "loss": 0.5473, "step": 120 }, { "epoch": 0.19348837209302325, "grad_norm": 1.6954868322135426, "learning_rate": 4.997446179820209e-06, "loss": 0.5418, "step": 130 }, { "epoch": 0.20837209302325582, "grad_norm": 3.1018130514636666, "learning_rate": 4.995381966403521e-06, "loss": 0.5392, "step": 140 }, { "epoch": 0.22325581395348837, "grad_norm": 2.249001753991247, "learning_rate": 4.9927115690427536e-06, "loss": 0.538, "step": 150 }, { "epoch": 0.23813953488372094, "grad_norm": 2.5823816443281173, "learning_rate": 4.989435708662909e-06, "loss": 0.5337, "step": 160 }, { "epoch": 0.25302325581395346, "grad_norm": 2.271030426623902, "learning_rate": 4.985555269645351e-06, "loss": 0.5236, "step": 170 }, { "epoch": 0.26790697674418606, "grad_norm": 2.318489677318734, "learning_rate": 4.981071299589047e-06, "loss": 0.5245, "step": 180 }, { "epoch": 0.2827906976744186, "grad_norm": 1.9206844645280905, "learning_rate": 4.975985009027748e-06, "loss": 0.5218, "step": 190 }, { "epoch": 0.29767441860465116, "grad_norm": 2.9110235751406983, "learning_rate": 4.970297771103183e-06, "loss": 0.5225, "step": 200 }, { "epoch": 0.3125581395348837, "grad_norm": 2.4648347140615376, "learning_rate": 4.964011121194349e-06, "loss": 0.5125, "step": 210 }, { "epoch": 0.32744186046511625, "grad_norm": 1.8762938609852826, "learning_rate": 4.957126756503014e-06, "loss": 0.5119, "step": 220 }, { "epoch": 0.34232558139534885, "grad_norm": 2.634519062843505, "learning_rate": 4.949646535595514e-06, "loss": 0.5089, "step": 230 }, { "epoch": 0.3572093023255814, "grad_norm": 2.7133170705876712, "learning_rate": 4.941572477901008e-06, "loss": 0.5028, "step": 240 }, { "epoch": 0.37209302325581395, "grad_norm": 1.772078056634877, "learning_rate": 4.932906763166286e-06, "loss": 0.5036, "step": 250 }, { "epoch": 0.3869767441860465, "grad_norm": 2.3772772619843856, "learning_rate": 4.9236517308673135e-06, "loss": 0.5051, "step": 260 }, { "epoch": 0.4018604651162791, "grad_norm": 1.7586683386687694, "learning_rate": 4.9138098795776335e-06, "loss": 0.4996, "step": 270 }, { "epoch": 0.41674418604651164, "grad_norm": 2.441721431571048, "learning_rate": 4.903383866293839e-06, "loss": 0.5003, "step": 280 }, { "epoch": 0.4316279069767442, "grad_norm": 1.6335890264070398, "learning_rate": 4.89237650571826e-06, "loss": 0.5009, "step": 290 }, { "epoch": 0.44651162790697674, "grad_norm": 1.8052385644508027, "learning_rate": 4.880790769499083e-06, "loss": 0.4983, "step": 300 }, { "epoch": 0.4613953488372093, "grad_norm": 1.800037297627897, "learning_rate": 4.868629785428096e-06, "loss": 0.497, "step": 310 }, { "epoch": 0.4762790697674419, "grad_norm": 1.605966469734369, "learning_rate": 4.855896836596282e-06, "loss": 0.4929, "step": 320 }, { "epoch": 0.49116279069767443, "grad_norm": 1.994957648259566, "learning_rate": 4.842595360507486e-06, "loss": 0.4966, "step": 330 }, { "epoch": 0.5060465116279069, "grad_norm": 1.7903983897538154, "learning_rate": 4.828728948150395e-06, "loss": 0.4948, "step": 340 }, { "epoch": 0.5209302325581395, "grad_norm": 1.5566561390721225, "learning_rate": 4.8143013430290805e-06, "loss": 0.4918, "step": 350 }, { "epoch": 0.5358139534883721, "grad_norm": 1.449953579251504, "learning_rate": 4.799316440152367e-06, "loss": 0.4899, "step": 360 }, { "epoch": 0.5506976744186046, "grad_norm": 1.945851570880214, "learning_rate": 4.783778284982303e-06, "loss": 0.4859, "step": 370 }, { "epoch": 0.5655813953488372, "grad_norm": 1.6223980803827658, "learning_rate": 4.767691072342006e-06, "loss": 0.4884, "step": 380 }, { "epoch": 0.5804651162790697, "grad_norm": 1.8375917251155103, "learning_rate": 4.7510591452831975e-06, "loss": 0.4809, "step": 390 }, { "epoch": 0.5953488372093023, "grad_norm": 1.742322917798526, "learning_rate": 4.733886993913704e-06, "loss": 0.4857, "step": 400 }, { "epoch": 0.6102325581395349, "grad_norm": 1.6356424508667176, "learning_rate": 4.7161792541852675e-06, "loss": 0.4872, "step": 410 }, { "epoch": 0.6251162790697674, "grad_norm": 1.5526421294000088, "learning_rate": 4.69794070664199e-06, "loss": 0.4829, "step": 420 }, { "epoch": 0.64, "grad_norm": 1.4232942412821412, "learning_rate": 4.6791762751297236e-06, "loss": 0.481, "step": 430 }, { "epoch": 0.6548837209302325, "grad_norm": 1.483668935893611, "learning_rate": 4.65989102546679e-06, "loss": 0.4821, "step": 440 }, { "epoch": 0.6697674418604651, "grad_norm": 2.1539928952738405, "learning_rate": 4.640090164076361e-06, "loss": 0.4749, "step": 450 }, { "epoch": 0.6846511627906977, "grad_norm": 2.1025050973956767, "learning_rate": 4.61977903658089e-06, "loss": 0.4804, "step": 460 }, { "epoch": 0.6995348837209302, "grad_norm": 1.7519293478931726, "learning_rate": 4.5989631263589546e-06, "loss": 0.4743, "step": 470 }, { "epoch": 0.7144186046511628, "grad_norm": 2.0423876723189243, "learning_rate": 4.5776480530649155e-06, "loss": 0.4726, "step": 480 }, { "epoch": 0.7293023255813953, "grad_norm": 1.771995353632526, "learning_rate": 4.555839571111782e-06, "loss": 0.4728, "step": 490 }, { "epoch": 0.7441860465116279, "grad_norm": 1.7344799986510258, "learning_rate": 4.533543568117697e-06, "loss": 0.4725, "step": 500 }, { "epoch": 0.7590697674418605, "grad_norm": 1.3702958835212744, "learning_rate": 4.5107660633164645e-06, "loss": 0.475, "step": 510 }, { "epoch": 0.773953488372093, "grad_norm": 1.5331038199927272, "learning_rate": 4.487513205932537e-06, "loss": 0.4758, "step": 520 }, { "epoch": 0.7888372093023256, "grad_norm": 1.4415751132180146, "learning_rate": 4.46379127352092e-06, "loss": 0.4651, "step": 530 }, { "epoch": 0.8037209302325582, "grad_norm": 1.5387474607067084, "learning_rate": 4.439606670272421e-06, "loss": 0.4717, "step": 540 }, { "epoch": 0.8186046511627907, "grad_norm": 1.3930637138971305, "learning_rate": 4.414965925284719e-06, "loss": 0.4683, "step": 550 }, { "epoch": 0.8334883720930233, "grad_norm": 1.4357407382473695, "learning_rate": 4.389875690799706e-06, "loss": 0.4705, "step": 560 }, { "epoch": 0.8483720930232558, "grad_norm": 1.867160837622047, "learning_rate": 4.364342740407589e-06, "loss": 0.4684, "step": 570 }, { "epoch": 0.8632558139534884, "grad_norm": 2.079586326625279, "learning_rate": 4.338373967218229e-06, "loss": 0.4629, "step": 580 }, { "epoch": 0.878139534883721, "grad_norm": 1.6049505438520633, "learning_rate": 4.3119763820002105e-06, "loss": 0.4643, "step": 590 }, { "epoch": 0.8930232558139535, "grad_norm": 1.2874061526414027, "learning_rate": 4.285157111288156e-06, "loss": 0.4642, "step": 600 }, { "epoch": 0.9079069767441861, "grad_norm": 1.545108445973155, "learning_rate": 4.257923395458778e-06, "loss": 0.4606, "step": 610 }, { "epoch": 0.9227906976744186, "grad_norm": 1.6451186302266136, "learning_rate": 4.230282586776198e-06, "loss": 0.4584, "step": 620 }, { "epoch": 0.9376744186046512, "grad_norm": 2.1325124535031263, "learning_rate": 4.202242147407065e-06, "loss": 0.4621, "step": 630 }, { "epoch": 0.9525581395348838, "grad_norm": 2.0011178228576862, "learning_rate": 4.173809647406001e-06, "loss": 0.4601, "step": 640 }, { "epoch": 0.9674418604651163, "grad_norm": 1.5742244911614134, "learning_rate": 4.1449927626719164e-06, "loss": 0.456, "step": 650 }, { "epoch": 0.9823255813953489, "grad_norm": 1.7386928439484917, "learning_rate": 4.115799272875756e-06, "loss": 0.4548, "step": 660 }, { "epoch": 0.9972093023255814, "grad_norm": 1.5311577866040798, "learning_rate": 4.086237059360228e-06, "loss": 0.4624, "step": 670 }, { "epoch": 0.9986976744186047, "eval_loss": 0.05731714889407158, "eval_runtime": 455.161, "eval_samples_per_second": 39.773, "eval_steps_per_second": 0.622, "step": 671 }, { "epoch": 1.0130232558139536, "grad_norm": 1.9657672125173562, "learning_rate": 4.056314103012081e-06, "loss": 0.3827, "step": 680 }, { "epoch": 1.027906976744186, "grad_norm": 1.6977560049208136, "learning_rate": 4.026038482107515e-06, "loss": 0.3642, "step": 690 }, { "epoch": 1.0427906976744186, "grad_norm": 1.7125394386865773, "learning_rate": 3.995418370131294e-06, "loss": 0.3649, "step": 700 }, { "epoch": 1.0576744186046512, "grad_norm": 1.8561895728990827, "learning_rate": 3.964462033570154e-06, "loss": 0.3662, "step": 710 }, { "epoch": 1.0725581395348838, "grad_norm": 1.5618111186332968, "learning_rate": 3.9331778296811126e-06, "loss": 0.3658, "step": 720 }, { "epoch": 1.0874418604651164, "grad_norm": 1.6589684668519427, "learning_rate": 3.9015742042352575e-06, "loss": 0.3633, "step": 730 }, { "epoch": 1.1023255813953488, "grad_norm": 1.6737692269771278, "learning_rate": 3.8696596892376615e-06, "loss": 0.3683, "step": 740 }, { "epoch": 1.1172093023255814, "grad_norm": 1.9826343984663213, "learning_rate": 3.8374429006239915e-06, "loss": 0.366, "step": 750 }, { "epoch": 1.132093023255814, "grad_norm": 1.7884778638246734, "learning_rate": 3.8049325359344804e-06, "loss": 0.3692, "step": 760 }, { "epoch": 1.1469767441860466, "grad_norm": 2.611566598798586, "learning_rate": 3.7721373719658526e-06, "loss": 0.3712, "step": 770 }, { "epoch": 1.1618604651162792, "grad_norm": 1.7457709366563403, "learning_rate": 3.7390662624018648e-06, "loss": 0.3693, "step": 780 }, { "epoch": 1.1767441860465115, "grad_norm": 1.9569418242995398, "learning_rate": 3.7057281354230794e-06, "loss": 0.3653, "step": 790 }, { "epoch": 1.1916279069767441, "grad_norm": 2.173634628655407, "learning_rate": 3.6721319912965366e-06, "loss": 0.3649, "step": 800 }, { "epoch": 1.2065116279069767, "grad_norm": 2.2801578585083324, "learning_rate": 3.6382868999459524e-06, "loss": 0.3685, "step": 810 }, { "epoch": 1.2213953488372093, "grad_norm": 2.3692062383608197, "learning_rate": 3.6042019985031244e-06, "loss": 0.3712, "step": 820 }, { "epoch": 1.236279069767442, "grad_norm": 2.3703941397431945, "learning_rate": 3.569886488841187e-06, "loss": 0.3659, "step": 830 }, { "epoch": 1.2511627906976743, "grad_norm": 1.8705318246188423, "learning_rate": 3.535349635090386e-06, "loss": 0.3682, "step": 840 }, { "epoch": 1.266046511627907, "grad_norm": 1.404469365060906, "learning_rate": 3.5006007611370513e-06, "loss": 0.3662, "step": 850 }, { "epoch": 1.2809302325581395, "grad_norm": 1.797601704761619, "learning_rate": 3.465649248106435e-06, "loss": 0.3661, "step": 860 }, { "epoch": 1.2958139534883721, "grad_norm": 1.8585619220995928, "learning_rate": 3.4305045318300974e-06, "loss": 0.3647, "step": 870 }, { "epoch": 1.3106976744186047, "grad_norm": 1.4765933716397583, "learning_rate": 3.3951761002985184e-06, "loss": 0.3673, "step": 880 }, { "epoch": 1.3255813953488373, "grad_norm": 1.4125161732202964, "learning_rate": 3.3596734910996397e-06, "loss": 0.3642, "step": 890 }, { "epoch": 1.3404651162790697, "grad_norm": 1.4190852276081691, "learning_rate": 3.3240062888440046e-06, "loss": 0.3688, "step": 900 }, { "epoch": 1.3553488372093023, "grad_norm": 1.7022894617337727, "learning_rate": 3.2881841225772097e-06, "loss": 0.3697, "step": 910 }, { "epoch": 1.370232558139535, "grad_norm": 1.4899520481456272, "learning_rate": 3.2522166631803616e-06, "loss": 0.371, "step": 920 }, { "epoch": 1.3851162790697673, "grad_norm": 1.5320087948001035, "learning_rate": 3.2161136207592323e-06, "loss": 0.3661, "step": 930 }, { "epoch": 1.4, "grad_norm": 1.5107565046345355, "learning_rate": 3.1798847420228358e-06, "loss": 0.3716, "step": 940 }, { "epoch": 1.4148837209302325, "grad_norm": 1.4964498805556543, "learning_rate": 3.14353980765211e-06, "loss": 0.3681, "step": 950 }, { "epoch": 1.4297674418604651, "grad_norm": 1.4602567799649149, "learning_rate": 3.1070886296594427e-06, "loss": 0.367, "step": 960 }, { "epoch": 1.4446511627906977, "grad_norm": 1.5151588768049005, "learning_rate": 3.0705410487397214e-06, "loss": 0.3634, "step": 970 }, { "epoch": 1.4595348837209303, "grad_norm": 1.4429762741417749, "learning_rate": 3.0339069316136573e-06, "loss": 0.3692, "step": 980 }, { "epoch": 1.474418604651163, "grad_norm": 1.795013045267996, "learning_rate": 2.9971961683640683e-06, "loss": 0.3677, "step": 990 }, { "epoch": 1.4893023255813953, "grad_norm": 1.772230394205441, "learning_rate": 2.9604186697658642e-06, "loss": 0.3653, "step": 1000 }, { "epoch": 1.504186046511628, "grad_norm": 1.517966913164345, "learning_rate": 2.923584364610444e-06, "loss": 0.3674, "step": 1010 }, { "epoch": 1.5190697674418605, "grad_norm": 1.3320746245185962, "learning_rate": 2.8867031970252262e-06, "loss": 0.3654, "step": 1020 }, { "epoch": 1.5339534883720929, "grad_norm": 1.4422364510340309, "learning_rate": 2.84978512378904e-06, "loss": 0.3649, "step": 1030 }, { "epoch": 1.5488372093023255, "grad_norm": 1.422279942394409, "learning_rate": 2.8128401116441058e-06, "loss": 0.3637, "step": 1040 }, { "epoch": 1.563720930232558, "grad_norm": 1.325658675407642, "learning_rate": 2.7758781346053165e-06, "loss": 0.365, "step": 1050 }, { "epoch": 1.5786046511627907, "grad_norm": 1.3384490232214399, "learning_rate": 2.738909171267566e-06, "loss": 0.3673, "step": 1060 }, { "epoch": 1.5934883720930233, "grad_norm": 1.4369227508461269, "learning_rate": 2.7019432021118314e-06, "loss": 0.3667, "step": 1070 }, { "epoch": 1.608372093023256, "grad_norm": 1.550458179933586, "learning_rate": 2.664990206810755e-06, "loss": 0.3609, "step": 1080 }, { "epoch": 1.6232558139534885, "grad_norm": 1.4665966921755653, "learning_rate": 2.628060161534437e-06, "loss": 0.3677, "step": 1090 }, { "epoch": 1.6381395348837209, "grad_norm": 1.4478720169649395, "learning_rate": 2.5911630362571787e-06, "loss": 0.3663, "step": 1100 }, { "epoch": 1.6530232558139535, "grad_norm": 1.427086254069164, "learning_rate": 2.5543087920658945e-06, "loss": 0.3639, "step": 1110 }, { "epoch": 1.667906976744186, "grad_norm": 1.354522173953887, "learning_rate": 2.517507378470929e-06, "loss": 0.3611, "step": 1120 }, { "epoch": 1.6827906976744185, "grad_norm": 1.3693977150384091, "learning_rate": 2.480768730719992e-06, "loss": 0.3652, "step": 1130 }, { "epoch": 1.697674418604651, "grad_norm": 1.2855988884013045, "learning_rate": 2.4441027671159503e-06, "loss": 0.3639, "step": 1140 }, { "epoch": 1.7125581395348837, "grad_norm": 1.403080844114371, "learning_rate": 2.4075193863391906e-06, "loss": 0.3647, "step": 1150 }, { "epoch": 1.7274418604651163, "grad_norm": 1.4103414505961822, "learning_rate": 2.3710284647752805e-06, "loss": 0.3656, "step": 1160 }, { "epoch": 1.7423255813953489, "grad_norm": 1.328279125442394, "learning_rate": 2.3346398538486488e-06, "loss": 0.3601, "step": 1170 }, { "epoch": 1.7572093023255815, "grad_norm": 1.4237569279155826, "learning_rate": 2.2983633773630056e-06, "loss": 0.3648, "step": 1180 }, { "epoch": 1.772093023255814, "grad_norm": 1.3716090849409592, "learning_rate": 2.2622088288492166e-06, "loss": 0.3608, "step": 1190 }, { "epoch": 1.7869767441860465, "grad_norm": 1.3654603526464721, "learning_rate": 2.2261859689213523e-06, "loss": 0.3597, "step": 1200 }, { "epoch": 1.801860465116279, "grad_norm": 1.2691881731891166, "learning_rate": 2.1903045226416216e-06, "loss": 0.362, "step": 1210 }, { "epoch": 1.8167441860465117, "grad_norm": 1.3157985277229747, "learning_rate": 2.1545741768949085e-06, "loss": 0.3611, "step": 1220 }, { "epoch": 1.831627906976744, "grad_norm": 1.3583216702788425, "learning_rate": 2.1190045777736057e-06, "loss": 0.3613, "step": 1230 }, { "epoch": 1.8465116279069766, "grad_norm": 1.2745346937557744, "learning_rate": 2.0836053279734723e-06, "loss": 0.3569, "step": 1240 }, { "epoch": 1.8613953488372093, "grad_norm": 1.3036190956031755, "learning_rate": 2.0483859842011976e-06, "loss": 0.3597, "step": 1250 }, { "epoch": 1.8762790697674419, "grad_norm": 1.4119443086776595, "learning_rate": 2.0133560545943902e-06, "loss": 0.3636, "step": 1260 }, { "epoch": 1.8911627906976745, "grad_norm": 1.2897404474298366, "learning_rate": 1.9785249961546668e-06, "loss": 0.3575, "step": 1270 }, { "epoch": 1.906046511627907, "grad_norm": 1.3639990026448434, "learning_rate": 1.94390221219456e-06, "loss": 0.3592, "step": 1280 }, { "epoch": 1.9209302325581397, "grad_norm": 1.2546167591452697, "learning_rate": 1.909497049798906e-06, "loss": 0.3584, "step": 1290 }, { "epoch": 1.935813953488372, "grad_norm": 1.3438380702724106, "learning_rate": 1.8753187973014302e-06, "loss": 0.3623, "step": 1300 }, { "epoch": 1.9506976744186046, "grad_norm": 1.3452056759551596, "learning_rate": 1.8413766817771716e-06, "loss": 0.3597, "step": 1310 }, { "epoch": 1.9655813953488372, "grad_norm": 1.3834284182053571, "learning_rate": 1.8076798665514672e-06, "loss": 0.3586, "step": 1320 }, { "epoch": 1.9804651162790696, "grad_norm": 1.275093148045728, "learning_rate": 1.7742374487261275e-06, "loss": 0.3556, "step": 1330 }, { "epoch": 1.9953488372093022, "grad_norm": 1.3336419955281094, "learning_rate": 1.7410584567235063e-06, "loss": 0.3593, "step": 1340 }, { "epoch": 1.9983255813953489, "eval_loss": 0.05562544986605644, "eval_runtime": 454.3217, "eval_samples_per_second": 39.846, "eval_steps_per_second": 0.623, "step": 1342 }, { "epoch": 2.0111627906976746, "grad_norm": 1.8801844172458124, "learning_rate": 1.7081518478491024e-06, "loss": 0.2974, "step": 1350 }, { "epoch": 2.026046511627907, "grad_norm": 1.5466963455442697, "learning_rate": 1.6755265058733625e-06, "loss": 0.2705, "step": 1360 }, { "epoch": 2.0409302325581393, "grad_norm": 1.6491531999787248, "learning_rate": 1.6431912386333337e-06, "loss": 0.2715, "step": 1370 }, { "epoch": 2.055813953488372, "grad_norm": 1.3549718677985036, "learning_rate": 1.61115477565483e-06, "loss": 0.2711, "step": 1380 }, { "epoch": 2.0706976744186045, "grad_norm": 1.4210689732513506, "learning_rate": 1.5794257657957149e-06, "loss": 0.2678, "step": 1390 }, { "epoch": 2.085581395348837, "grad_norm": 1.5199527949561047, "learning_rate": 1.5480127749109867e-06, "loss": 0.2715, "step": 1400 }, { "epoch": 2.1004651162790697, "grad_norm": 1.3921480720538892, "learning_rate": 1.516924283540257e-06, "loss": 0.268, "step": 1410 }, { "epoch": 2.1153488372093023, "grad_norm": 1.5217951800344998, "learning_rate": 1.486168684618268e-06, "loss": 0.2666, "step": 1420 }, { "epoch": 2.130232558139535, "grad_norm": 1.4410321423758377, "learning_rate": 1.4557542812090574e-06, "loss": 0.2698, "step": 1430 }, { "epoch": 2.1451162790697675, "grad_norm": 1.4576711215393965, "learning_rate": 1.4256892842643893e-06, "loss": 0.2675, "step": 1440 }, { "epoch": 2.16, "grad_norm": 1.40927027905477, "learning_rate": 1.3959818104070452e-06, "loss": 0.2687, "step": 1450 }, { "epoch": 2.1748837209302327, "grad_norm": 1.4616727679236827, "learning_rate": 1.3666398797395948e-06, "loss": 0.2707, "step": 1460 }, { "epoch": 2.1897674418604653, "grad_norm": 1.3795657326148565, "learning_rate": 1.3376714136792034e-06, "loss": 0.2665, "step": 1470 }, { "epoch": 2.2046511627906975, "grad_norm": 1.5330397219497653, "learning_rate": 1.3090842328191053e-06, "loss": 0.2699, "step": 1480 }, { "epoch": 2.21953488372093, "grad_norm": 1.4083642956808657, "learning_rate": 1.280886054817277e-06, "loss": 0.2709, "step": 1490 }, { "epoch": 2.2344186046511627, "grad_norm": 1.4908783738210238, "learning_rate": 1.2530844923129096e-06, "loss": 0.2712, "step": 1500 }, { "epoch": 2.2493023255813953, "grad_norm": 1.3954198116255683, "learning_rate": 1.225687050871231e-06, "loss": 0.2705, "step": 1510 }, { "epoch": 2.264186046511628, "grad_norm": 1.38515024685356, "learning_rate": 1.1987011269572357e-06, "loss": 0.2701, "step": 1520 }, { "epoch": 2.2790697674418605, "grad_norm": 1.3876830410349839, "learning_rate": 1.1721340059388617e-06, "loss": 0.2672, "step": 1530 }, { "epoch": 2.293953488372093, "grad_norm": 1.5349649821163696, "learning_rate": 1.1459928601201756e-06, "loss": 0.2696, "step": 1540 }, { "epoch": 2.3088372093023257, "grad_norm": 1.6222404726063406, "learning_rate": 1.1202847468050597e-06, "loss": 0.2689, "step": 1550 }, { "epoch": 2.3237209302325583, "grad_norm": 1.4018802032895803, "learning_rate": 1.0950166063919694e-06, "loss": 0.2701, "step": 1560 }, { "epoch": 2.3386046511627905, "grad_norm": 1.3526044361279066, "learning_rate": 1.0701952605002275e-06, "loss": 0.2676, "step": 1570 }, { "epoch": 2.353488372093023, "grad_norm": 1.4442038534402912, "learning_rate": 1.045827410128407e-06, "loss": 0.2661, "step": 1580 }, { "epoch": 2.3683720930232557, "grad_norm": 1.4163720971352327, "learning_rate": 1.0219196338452623e-06, "loss": 0.2689, "step": 1590 }, { "epoch": 2.3832558139534883, "grad_norm": 1.356177063119025, "learning_rate": 9.984783860137213e-07, "loss": 0.2676, "step": 1600 }, { "epoch": 2.398139534883721, "grad_norm": 1.3502869776338615, "learning_rate": 9.75509995048404e-07, "loss": 0.2681, "step": 1610 }, { "epoch": 2.4130232558139535, "grad_norm": 1.337863106664254, "learning_rate": 9.53020661707148e-07, "loss": 0.2695, "step": 1620 }, { "epoch": 2.427906976744186, "grad_norm": 1.3624966178122666, "learning_rate": 9.310164574169911e-07, "loss": 0.2661, "step": 1630 }, { "epoch": 2.4427906976744187, "grad_norm": 1.4427762559619615, "learning_rate": 9.095033226350787e-07, "loss": 0.2682, "step": 1640 }, { "epoch": 2.4576744186046513, "grad_norm": 1.3824477379757505, "learning_rate": 8.884870652449176e-07, "loss": 0.2683, "step": 1650 }, { "epoch": 2.472558139534884, "grad_norm": 1.3968898578295752, "learning_rate": 8.679733589884308e-07, "loss": 0.2676, "step": 1660 }, { "epoch": 2.4874418604651165, "grad_norm": 1.403396644885307, "learning_rate": 8.479677419342195e-07, "loss": 0.2675, "step": 1670 }, { "epoch": 2.5023255813953487, "grad_norm": 1.4635100367305935, "learning_rate": 8.284756149824561e-07, "loss": 0.2691, "step": 1680 }, { "epoch": 2.5172093023255813, "grad_norm": 1.4243622651693753, "learning_rate": 8.095022404068078e-07, "loss": 0.269, "step": 1690 }, { "epoch": 2.532093023255814, "grad_norm": 1.379314761648038, "learning_rate": 7.910527404337846e-07, "loss": 0.2687, "step": 1700 }, { "epoch": 2.5469767441860465, "grad_norm": 1.3800285755689508, "learning_rate": 7.731320958598944e-07, "loss": 0.2687, "step": 1710 }, { "epoch": 2.561860465116279, "grad_norm": 1.4168008157356147, "learning_rate": 7.557451447069862e-07, "loss": 0.2686, "step": 1720 }, { "epoch": 2.5767441860465117, "grad_norm": 1.405042195482243, "learning_rate": 7.388965809161264e-07, "loss": 0.273, "step": 1730 }, { "epoch": 2.5916279069767443, "grad_norm": 1.339874655650546, "learning_rate": 7.225909530803849e-07, "loss": 0.2702, "step": 1740 }, { "epoch": 2.606511627906977, "grad_norm": 1.376936805811691, "learning_rate": 7.068326632168529e-07, "loss": 0.2682, "step": 1750 }, { "epoch": 2.6213953488372095, "grad_norm": 1.370179020476883, "learning_rate": 6.91625965578234e-07, "loss": 0.268, "step": 1760 }, { "epoch": 2.6362790697674416, "grad_norm": 1.3701048711478194, "learning_rate": 6.769749655043278e-07, "loss": 0.2678, "step": 1770 }, { "epoch": 2.6511627906976747, "grad_norm": 1.4206830049678218, "learning_rate": 6.628836183137136e-07, "loss": 0.2701, "step": 1780 }, { "epoch": 2.666046511627907, "grad_norm": 1.393374140302477, "learning_rate": 6.493557282359362e-07, "loss": 0.2687, "step": 1790 }, { "epoch": 2.6809302325581394, "grad_norm": 1.461263759783157, "learning_rate": 6.363949473844831e-07, "loss": 0.268, "step": 1800 }, { "epoch": 2.695813953488372, "grad_norm": 1.4734006960029564, "learning_rate": 6.240047747708234e-07, "loss": 0.2677, "step": 1810 }, { "epoch": 2.7106976744186047, "grad_norm": 1.3366661643587023, "learning_rate": 6.121885553597864e-07, "loss": 0.2681, "step": 1820 }, { "epoch": 2.7255813953488373, "grad_norm": 1.422096088237659, "learning_rate": 6.009494791665193e-07, "loss": 0.2696, "step": 1830 }, { "epoch": 2.74046511627907, "grad_norm": 1.34042220211478, "learning_rate": 5.902905803952853e-07, "loss": 0.2707, "step": 1840 }, { "epoch": 2.7553488372093025, "grad_norm": 1.43033974764486, "learning_rate": 5.802147366203209e-07, "loss": 0.2682, "step": 1850 }, { "epoch": 2.7702325581395346, "grad_norm": 1.3377632425473605, "learning_rate": 5.707246680089786e-07, "loss": 0.2682, "step": 1860 }, { "epoch": 2.7851162790697677, "grad_norm": 1.3552116715763636, "learning_rate": 5.618229365873664e-07, "loss": 0.2679, "step": 1870 }, { "epoch": 2.8, "grad_norm": 1.407612240704182, "learning_rate": 5.535119455486798e-07, "loss": 0.2671, "step": 1880 }, { "epoch": 2.8148837209302324, "grad_norm": 1.3307680080669904, "learning_rate": 5.457939386044124e-07, "loss": 0.2691, "step": 1890 }, { "epoch": 2.829767441860465, "grad_norm": 1.33966014097135, "learning_rate": 5.386709993786254e-07, "loss": 0.2691, "step": 1900 }, { "epoch": 2.8446511627906976, "grad_norm": 1.3388112782785693, "learning_rate": 5.321450508454304e-07, "loss": 0.2678, "step": 1910 }, { "epoch": 2.8595348837209302, "grad_norm": 1.3186034865675909, "learning_rate": 5.262178548098479e-07, "loss": 0.2668, "step": 1920 }, { "epoch": 2.874418604651163, "grad_norm": 1.3351972694593512, "learning_rate": 5.208910114321729e-07, "loss": 0.2662, "step": 1930 }, { "epoch": 2.8893023255813954, "grad_norm": 1.373685192971363, "learning_rate": 5.161659587959818e-07, "loss": 0.2669, "step": 1940 }, { "epoch": 2.904186046511628, "grad_norm": 1.3931156997045835, "learning_rate": 5.120439725198932e-07, "loss": 0.2663, "step": 1950 }, { "epoch": 2.9190697674418606, "grad_norm": 1.2915260316130637, "learning_rate": 5.085261654131918e-07, "loss": 0.2632, "step": 1960 }, { "epoch": 2.933953488372093, "grad_norm": 1.3421708919414022, "learning_rate": 5.056134871754014e-07, "loss": 0.2705, "step": 1970 }, { "epoch": 2.948837209302326, "grad_norm": 1.3219790823176543, "learning_rate": 5.03306724139899e-07, "loss": 0.2675, "step": 1980 }, { "epoch": 2.963720930232558, "grad_norm": 1.3110383511595092, "learning_rate": 5.016064990616251e-07, "loss": 0.2652, "step": 1990 }, { "epoch": 2.9786046511627906, "grad_norm": 1.312152439222843, "learning_rate": 5.005132709489625e-07, "loss": 0.2666, "step": 2000 }, { "epoch": 2.993488372093023, "grad_norm": 1.385415982308588, "learning_rate": 5.000273349398159e-07, "loss": 0.2664, "step": 2010 }, { "epoch": 2.997953488372093, "eval_loss": 0.05878664180636406, "eval_runtime": 453.0631, "eval_samples_per_second": 39.957, "eval_steps_per_second": 0.625, "step": 2013 }, { "epoch": 2.997953488372093, "step": 2013, "total_flos": 3371640595415040.0, "train_loss": 0.382664002355982, "train_runtime": 66379.5775, "train_samples_per_second": 15.545, "train_steps_per_second": 0.03 } ], "logging_steps": 10, "max_steps": 2013, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3371640595415040.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }