tFINE-850m-24x24-instruct-L2 / trainer_state.json
pszemraj's picture
End of training
59876ca verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 2000,
"global_step": 7916,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012632642748863063,
"grad_norm": 0.3052225708961487,
"learning_rate": 1.4705882352941177e-06,
"loss": 1.3368,
"num_input_tokens_seen": 1013408,
"step": 10
},
{
"epoch": 0.0025265285497726125,
"grad_norm": 0.2797602713108063,
"learning_rate": 2.9411764705882355e-06,
"loss": 1.3885,
"num_input_tokens_seen": 1928736,
"step": 20
},
{
"epoch": 0.0037897928246589186,
"grad_norm": 0.2704920470714569,
"learning_rate": 4.4117647058823526e-06,
"loss": 1.4081,
"num_input_tokens_seen": 2911616,
"step": 30
},
{
"epoch": 0.005053057099545225,
"grad_norm": 0.26865851879119873,
"learning_rate": 5.882352941176471e-06,
"loss": 1.4143,
"num_input_tokens_seen": 3775872,
"step": 40
},
{
"epoch": 0.006316321374431531,
"grad_norm": 0.29596275091171265,
"learning_rate": 7.352941176470588e-06,
"loss": 1.4061,
"num_input_tokens_seen": 4755264,
"step": 50
},
{
"epoch": 0.007579585649317837,
"grad_norm": 0.26764747500419617,
"learning_rate": 8.823529411764705e-06,
"loss": 1.3282,
"num_input_tokens_seen": 5670016,
"step": 60
},
{
"epoch": 0.008842849924204144,
"grad_norm": 0.2855396866798401,
"learning_rate": 1.0294117647058823e-05,
"loss": 1.3441,
"num_input_tokens_seen": 6541472,
"step": 70
},
{
"epoch": 0.01010611419909045,
"grad_norm": 0.27064523100852966,
"learning_rate": 1.1764705882352942e-05,
"loss": 1.3611,
"num_input_tokens_seen": 7502848,
"step": 80
},
{
"epoch": 0.011369378473976757,
"grad_norm": 0.26372382044792175,
"learning_rate": 1.3235294117647058e-05,
"loss": 1.3355,
"num_input_tokens_seen": 8450880,
"step": 90
},
{
"epoch": 0.012632642748863061,
"grad_norm": 0.2819940149784088,
"learning_rate": 1.4705882352941175e-05,
"loss": 1.4074,
"num_input_tokens_seen": 9363776,
"step": 100
},
{
"epoch": 0.013895907023749368,
"grad_norm": 0.27858778834342957,
"learning_rate": 1.6176470588235293e-05,
"loss": 1.3922,
"num_input_tokens_seen": 10362848,
"step": 110
},
{
"epoch": 0.015159171298635674,
"grad_norm": 0.26853179931640625,
"learning_rate": 1.764705882352941e-05,
"loss": 1.3606,
"num_input_tokens_seen": 11337152,
"step": 120
},
{
"epoch": 0.016422435573521982,
"grad_norm": 0.29751917719841003,
"learning_rate": 1.9117647058823524e-05,
"loss": 1.3469,
"num_input_tokens_seen": 12268864,
"step": 130
},
{
"epoch": 0.017685699848408287,
"grad_norm": 0.29996374249458313,
"learning_rate": 2.0588235294117645e-05,
"loss": 1.3455,
"num_input_tokens_seen": 13304544,
"step": 140
},
{
"epoch": 0.018948964123294592,
"grad_norm": 0.26638367772102356,
"learning_rate": 2.2058823529411763e-05,
"loss": 1.3529,
"num_input_tokens_seen": 14245088,
"step": 150
},
{
"epoch": 0.0202122283981809,
"grad_norm": 0.2829771041870117,
"learning_rate": 2.3529411764705884e-05,
"loss": 1.3517,
"num_input_tokens_seen": 15277408,
"step": 160
},
{
"epoch": 0.021475492673067205,
"grad_norm": 0.28468722105026245,
"learning_rate": 2.4999999999999998e-05,
"loss": 1.3756,
"num_input_tokens_seen": 16296480,
"step": 170
},
{
"epoch": 0.022738756947953513,
"grad_norm": 0.2717965841293335,
"learning_rate": 2.6470588235294115e-05,
"loss": 1.3094,
"num_input_tokens_seen": 17249088,
"step": 180
},
{
"epoch": 0.024002021222839818,
"grad_norm": 0.2902025878429413,
"learning_rate": 2.7941176470588236e-05,
"loss": 1.3894,
"num_input_tokens_seen": 18267872,
"step": 190
},
{
"epoch": 0.025265285497726123,
"grad_norm": 0.27164924144744873,
"learning_rate": 2.941176470588235e-05,
"loss": 1.3471,
"num_input_tokens_seen": 19228288,
"step": 200
},
{
"epoch": 0.02652854977261243,
"grad_norm": 0.2791699767112732,
"learning_rate": 3.088235294117647e-05,
"loss": 1.3676,
"num_input_tokens_seen": 20112768,
"step": 210
},
{
"epoch": 0.027791814047498736,
"grad_norm": 0.27457180619239807,
"learning_rate": 3.2352941176470585e-05,
"loss": 1.3667,
"num_input_tokens_seen": 21080384,
"step": 220
},
{
"epoch": 0.029055078322385044,
"grad_norm": 0.2744538486003876,
"learning_rate": 3.38235294117647e-05,
"loss": 1.3791,
"num_input_tokens_seen": 21978464,
"step": 230
},
{
"epoch": 0.03031834259727135,
"grad_norm": 0.27631092071533203,
"learning_rate": 3.49999941403517e-05,
"loss": 1.3032,
"num_input_tokens_seen": 22891136,
"step": 240
},
{
"epoch": 0.03158160687215766,
"grad_norm": 0.25807875394821167,
"learning_rate": 3.499978905307333e-05,
"loss": 1.3203,
"num_input_tokens_seen": 23788384,
"step": 250
},
{
"epoch": 0.032844871147043965,
"grad_norm": 0.282926470041275,
"learning_rate": 3.499929098730414e-05,
"loss": 1.3487,
"num_input_tokens_seen": 24732448,
"step": 260
},
{
"epoch": 0.034108135421930266,
"grad_norm": 0.3243197500705719,
"learning_rate": 3.499849995138268e-05,
"loss": 1.3335,
"num_input_tokens_seen": 25651072,
"step": 270
},
{
"epoch": 0.035371399696816574,
"grad_norm": 0.28631719946861267,
"learning_rate": 3.499741595855233e-05,
"loss": 1.3104,
"num_input_tokens_seen": 26588256,
"step": 280
},
{
"epoch": 0.03663466397170288,
"grad_norm": 0.2739802598953247,
"learning_rate": 3.499603902696111e-05,
"loss": 1.3294,
"num_input_tokens_seen": 27506400,
"step": 290
},
{
"epoch": 0.037897928246589184,
"grad_norm": 0.25884002447128296,
"learning_rate": 3.499436917966138e-05,
"loss": 1.3253,
"num_input_tokens_seen": 28436096,
"step": 300
},
{
"epoch": 0.03916119252147549,
"grad_norm": 0.3526857793331146,
"learning_rate": 3.4992406444609434e-05,
"loss": 1.3731,
"num_input_tokens_seen": 29415744,
"step": 310
},
{
"epoch": 0.0404244567963618,
"grad_norm": 0.3010634183883667,
"learning_rate": 3.499015085466505e-05,
"loss": 1.3604,
"num_input_tokens_seen": 30396288,
"step": 320
},
{
"epoch": 0.0416877210712481,
"grad_norm": 0.30412164330482483,
"learning_rate": 3.498760244759094e-05,
"loss": 1.3192,
"num_input_tokens_seen": 31281632,
"step": 330
},
{
"epoch": 0.04295098534613441,
"grad_norm": 0.28709614276885986,
"learning_rate": 3.498476126605209e-05,
"loss": 1.3405,
"num_input_tokens_seen": 32139296,
"step": 340
},
{
"epoch": 0.04421424962102072,
"grad_norm": 0.2636132836341858,
"learning_rate": 3.4981627357615085e-05,
"loss": 1.3796,
"num_input_tokens_seen": 33140544,
"step": 350
},
{
"epoch": 0.045477513895907026,
"grad_norm": 0.27414971590042114,
"learning_rate": 3.497820077474728e-05,
"loss": 1.3502,
"num_input_tokens_seen": 34072480,
"step": 360
},
{
"epoch": 0.04674077817079333,
"grad_norm": 0.29717832803726196,
"learning_rate": 3.4974481574815955e-05,
"loss": 1.3218,
"num_input_tokens_seen": 35043552,
"step": 370
},
{
"epoch": 0.048004042445679636,
"grad_norm": 0.274935781955719,
"learning_rate": 3.49704698200873e-05,
"loss": 1.3101,
"num_input_tokens_seen": 36057536,
"step": 380
},
{
"epoch": 0.049267306720565944,
"grad_norm": 0.2995646893978119,
"learning_rate": 3.496616557772545e-05,
"loss": 1.3231,
"num_input_tokens_seen": 37053280,
"step": 390
},
{
"epoch": 0.050530570995452245,
"grad_norm": 0.2813841998577118,
"learning_rate": 3.4961568919791295e-05,
"loss": 1.3073,
"num_input_tokens_seen": 37949760,
"step": 400
},
{
"epoch": 0.051793835270338554,
"grad_norm": 0.25323453545570374,
"learning_rate": 3.49566799232413e-05,
"loss": 1.4188,
"num_input_tokens_seen": 38825888,
"step": 410
},
{
"epoch": 0.05305709954522486,
"grad_norm": 0.3185766339302063,
"learning_rate": 3.4951498669926205e-05,
"loss": 1.2551,
"num_input_tokens_seen": 39816832,
"step": 420
},
{
"epoch": 0.05432036382011117,
"grad_norm": 0.282988041639328,
"learning_rate": 3.494602524658968e-05,
"loss": 1.3429,
"num_input_tokens_seen": 40746208,
"step": 430
},
{
"epoch": 0.05558362809499747,
"grad_norm": 0.29383236169815063,
"learning_rate": 3.494025974486684e-05,
"loss": 1.2908,
"num_input_tokens_seen": 41732576,
"step": 440
},
{
"epoch": 0.05684689236988378,
"grad_norm": 0.2495247721672058,
"learning_rate": 3.4934202261282736e-05,
"loss": 1.3379,
"num_input_tokens_seen": 42725664,
"step": 450
},
{
"epoch": 0.05811015664477009,
"grad_norm": 0.27226462960243225,
"learning_rate": 3.4927852897250736e-05,
"loss": 1.2906,
"num_input_tokens_seen": 43636000,
"step": 460
},
{
"epoch": 0.05937342091965639,
"grad_norm": 0.2738124430179596,
"learning_rate": 3.49212117590708e-05,
"loss": 1.3382,
"num_input_tokens_seen": 44584384,
"step": 470
},
{
"epoch": 0.0606366851945427,
"grad_norm": 0.2823927700519562,
"learning_rate": 3.4914278957927746e-05,
"loss": 1.3572,
"num_input_tokens_seen": 45563296,
"step": 480
},
{
"epoch": 0.061899949469429005,
"grad_norm": 0.3090139329433441,
"learning_rate": 3.490705460988934e-05,
"loss": 1.3633,
"num_input_tokens_seen": 46504000,
"step": 490
},
{
"epoch": 0.06316321374431531,
"grad_norm": 0.2648494839668274,
"learning_rate": 3.4899538835904395e-05,
"loss": 1.296,
"num_input_tokens_seen": 47469568,
"step": 500
},
{
"epoch": 0.06442647801920162,
"grad_norm": 0.26772260665893555,
"learning_rate": 3.489173176180072e-05,
"loss": 1.3468,
"num_input_tokens_seen": 48428992,
"step": 510
},
{
"epoch": 0.06568974229408793,
"grad_norm": 0.2722509503364563,
"learning_rate": 3.488363351828301e-05,
"loss": 1.3298,
"num_input_tokens_seen": 49435616,
"step": 520
},
{
"epoch": 0.06695300656897422,
"grad_norm": 0.33240431547164917,
"learning_rate": 3.48752442409307e-05,
"loss": 1.3395,
"num_input_tokens_seen": 50444960,
"step": 530
},
{
"epoch": 0.06821627084386053,
"grad_norm": 0.33877724409103394,
"learning_rate": 3.4866564070195623e-05,
"loss": 1.3627,
"num_input_tokens_seen": 51354144,
"step": 540
},
{
"epoch": 0.06947953511874684,
"grad_norm": 0.25358885526657104,
"learning_rate": 3.485759315139974e-05,
"loss": 1.3665,
"num_input_tokens_seen": 52353568,
"step": 550
},
{
"epoch": 0.07074279939363315,
"grad_norm": 0.3228625953197479,
"learning_rate": 3.484833163473263e-05,
"loss": 1.3603,
"num_input_tokens_seen": 53330208,
"step": 560
},
{
"epoch": 0.07200606366851946,
"grad_norm": 0.27047306299209595,
"learning_rate": 3.483877967524903e-05,
"loss": 1.3918,
"num_input_tokens_seen": 54292704,
"step": 570
},
{
"epoch": 0.07326932794340577,
"grad_norm": 0.23836977779865265,
"learning_rate": 3.482893743286624e-05,
"loss": 1.3265,
"num_input_tokens_seen": 55289088,
"step": 580
},
{
"epoch": 0.07453259221829207,
"grad_norm": 0.2790107727050781,
"learning_rate": 3.4818805072361394e-05,
"loss": 1.34,
"num_input_tokens_seen": 56191520,
"step": 590
},
{
"epoch": 0.07579585649317837,
"grad_norm": 0.2909539043903351,
"learning_rate": 3.4808382763368746e-05,
"loss": 1.3827,
"num_input_tokens_seen": 57130144,
"step": 600
},
{
"epoch": 0.07705912076806468,
"grad_norm": 0.2930690050125122,
"learning_rate": 3.479767068037682e-05,
"loss": 1.2993,
"num_input_tokens_seen": 58166976,
"step": 610
},
{
"epoch": 0.07832238504295098,
"grad_norm": 0.2910405993461609,
"learning_rate": 3.4786669002725486e-05,
"loss": 1.4025,
"num_input_tokens_seen": 59115968,
"step": 620
},
{
"epoch": 0.07958564931783729,
"grad_norm": 0.2609618008136749,
"learning_rate": 3.477537791460297e-05,
"loss": 1.3454,
"num_input_tokens_seen": 60097152,
"step": 630
},
{
"epoch": 0.0808489135927236,
"grad_norm": 0.2621832489967346,
"learning_rate": 3.4763797605042735e-05,
"loss": 1.3193,
"num_input_tokens_seen": 61038400,
"step": 640
},
{
"epoch": 0.08211217786760991,
"grad_norm": 0.2869206666946411,
"learning_rate": 3.475192826792036e-05,
"loss": 1.3755,
"num_input_tokens_seen": 62005408,
"step": 650
},
{
"epoch": 0.0833754421424962,
"grad_norm": 0.2955986261367798,
"learning_rate": 3.473977010195027e-05,
"loss": 1.3446,
"num_input_tokens_seen": 62938944,
"step": 660
},
{
"epoch": 0.08463870641738251,
"grad_norm": 0.27759358286857605,
"learning_rate": 3.47273233106824e-05,
"loss": 1.3243,
"num_input_tokens_seen": 63825280,
"step": 670
},
{
"epoch": 0.08590197069226882,
"grad_norm": 0.2854154706001282,
"learning_rate": 3.471458810249883e-05,
"loss": 1.3274,
"num_input_tokens_seen": 64772224,
"step": 680
},
{
"epoch": 0.08716523496715513,
"grad_norm": 0.26865917444229126,
"learning_rate": 3.470156469061023e-05,
"loss": 1.3368,
"num_input_tokens_seen": 65757408,
"step": 690
},
{
"epoch": 0.08842849924204144,
"grad_norm": 0.3124206066131592,
"learning_rate": 3.468825329305235e-05,
"loss": 1.3619,
"num_input_tokens_seen": 66653856,
"step": 700
},
{
"epoch": 0.08969176351692774,
"grad_norm": 0.257878839969635,
"learning_rate": 3.467465413268235e-05,
"loss": 1.3705,
"num_input_tokens_seen": 67551136,
"step": 710
},
{
"epoch": 0.09095502779181405,
"grad_norm": 0.3039745092391968,
"learning_rate": 3.466076743717506e-05,
"loss": 1.3407,
"num_input_tokens_seen": 68461888,
"step": 720
},
{
"epoch": 0.09221829206670035,
"grad_norm": 0.297577828168869,
"learning_rate": 3.4646593439019164e-05,
"loss": 1.3068,
"num_input_tokens_seen": 69439936,
"step": 730
},
{
"epoch": 0.09348155634158666,
"grad_norm": 0.26858824491500854,
"learning_rate": 3.463213237551333e-05,
"loss": 1.3362,
"num_input_tokens_seen": 70315520,
"step": 740
},
{
"epoch": 0.09474482061647296,
"grad_norm": 0.32382968068122864,
"learning_rate": 3.461738448876223e-05,
"loss": 1.2972,
"num_input_tokens_seen": 71249088,
"step": 750
},
{
"epoch": 0.09600808489135927,
"grad_norm": 0.2890531122684479,
"learning_rate": 3.460235002567247e-05,
"loss": 1.2899,
"num_input_tokens_seen": 72123200,
"step": 760
},
{
"epoch": 0.09727134916624558,
"grad_norm": 0.2724192440509796,
"learning_rate": 3.458702923794847e-05,
"loss": 1.3435,
"num_input_tokens_seen": 73014048,
"step": 770
},
{
"epoch": 0.09853461344113189,
"grad_norm": 0.2698012888431549,
"learning_rate": 3.457142238208826e-05,
"loss": 1.3823,
"num_input_tokens_seen": 73970912,
"step": 780
},
{
"epoch": 0.0997978777160182,
"grad_norm": 0.25855422019958496,
"learning_rate": 3.455552971937915e-05,
"loss": 1.3545,
"num_input_tokens_seen": 74960032,
"step": 790
},
{
"epoch": 0.10106114199090449,
"grad_norm": 0.3183737099170685,
"learning_rate": 3.453935151589341e-05,
"loss": 1.3597,
"num_input_tokens_seen": 75886048,
"step": 800
},
{
"epoch": 0.1023244062657908,
"grad_norm": 0.2935165464878082,
"learning_rate": 3.4522888042483766e-05,
"loss": 1.3745,
"num_input_tokens_seen": 76882752,
"step": 810
},
{
"epoch": 0.10358767054067711,
"grad_norm": 0.2568333148956299,
"learning_rate": 3.450613957477889e-05,
"loss": 1.3502,
"num_input_tokens_seen": 77780736,
"step": 820
},
{
"epoch": 0.10485093481556342,
"grad_norm": 0.29373618960380554,
"learning_rate": 3.4489106393178774e-05,
"loss": 1.33,
"num_input_tokens_seen": 78738272,
"step": 830
},
{
"epoch": 0.10611419909044972,
"grad_norm": 0.2722548246383667,
"learning_rate": 3.447178878285004e-05,
"loss": 1.3533,
"num_input_tokens_seen": 79636736,
"step": 840
},
{
"epoch": 0.10737746336533603,
"grad_norm": 0.29016321897506714,
"learning_rate": 3.445418703372119e-05,
"loss": 1.365,
"num_input_tokens_seen": 80603008,
"step": 850
},
{
"epoch": 0.10864072764022234,
"grad_norm": 0.2636987268924713,
"learning_rate": 3.443630144047771e-05,
"loss": 1.3284,
"num_input_tokens_seen": 81556992,
"step": 860
},
{
"epoch": 0.10990399191510863,
"grad_norm": 0.2925853133201599,
"learning_rate": 3.441813230255714e-05,
"loss": 1.306,
"num_input_tokens_seen": 82544128,
"step": 870
},
{
"epoch": 0.11116725618999494,
"grad_norm": 0.32026803493499756,
"learning_rate": 3.439967992414412e-05,
"loss": 1.2703,
"num_input_tokens_seen": 83488864,
"step": 880
},
{
"epoch": 0.11243052046488125,
"grad_norm": 0.2739593982696533,
"learning_rate": 3.438094461416522e-05,
"loss": 1.3276,
"num_input_tokens_seen": 84447232,
"step": 890
},
{
"epoch": 0.11369378473976756,
"grad_norm": 0.26780998706817627,
"learning_rate": 3.4361926686283805e-05,
"loss": 1.3311,
"num_input_tokens_seen": 85353344,
"step": 900
},
{
"epoch": 0.11495704901465387,
"grad_norm": 0.3547651469707489,
"learning_rate": 3.43426264588948e-05,
"loss": 1.3696,
"num_input_tokens_seen": 86331744,
"step": 910
},
{
"epoch": 0.11622031328954018,
"grad_norm": 0.2572576105594635,
"learning_rate": 3.4323044255119314e-05,
"loss": 1.3226,
"num_input_tokens_seen": 87350592,
"step": 920
},
{
"epoch": 0.11748357756442648,
"grad_norm": 0.26348087191581726,
"learning_rate": 3.430318040279929e-05,
"loss": 1.339,
"num_input_tokens_seen": 88312000,
"step": 930
},
{
"epoch": 0.11874684183931278,
"grad_norm": 0.2919277846813202,
"learning_rate": 3.428303523449194e-05,
"loss": 1.3158,
"num_input_tokens_seen": 89257856,
"step": 940
},
{
"epoch": 0.12001010611419909,
"grad_norm": 0.2658417820930481,
"learning_rate": 3.426260908746427e-05,
"loss": 1.3073,
"num_input_tokens_seen": 90244352,
"step": 950
},
{
"epoch": 0.1212733703890854,
"grad_norm": 0.28189846873283386,
"learning_rate": 3.424190230368733e-05,
"loss": 1.3125,
"num_input_tokens_seen": 91129440,
"step": 960
},
{
"epoch": 0.1225366346639717,
"grad_norm": 0.279550701379776,
"learning_rate": 3.422091522983059e-05,
"loss": 1.2755,
"num_input_tokens_seen": 92033408,
"step": 970
},
{
"epoch": 0.12379989893885801,
"grad_norm": 0.28984683752059937,
"learning_rate": 3.419964821725607e-05,
"loss": 1.3188,
"num_input_tokens_seen": 92960864,
"step": 980
},
{
"epoch": 0.1250631632137443,
"grad_norm": 0.2627594769001007,
"learning_rate": 3.417810162201247e-05,
"loss": 1.3248,
"num_input_tokens_seen": 93996960,
"step": 990
},
{
"epoch": 0.12632642748863063,
"grad_norm": 0.2966674864292145,
"learning_rate": 3.415627580482923e-05,
"loss": 1.3486,
"num_input_tokens_seen": 94925600,
"step": 1000
},
{
"epoch": 0.12758969176351692,
"grad_norm": 0.2634032666683197,
"learning_rate": 3.413417113111045e-05,
"loss": 1.3315,
"num_input_tokens_seen": 95851200,
"step": 1010
},
{
"epoch": 0.12885295603840324,
"grad_norm": 0.29642611742019653,
"learning_rate": 3.4111787970928835e-05,
"loss": 1.2694,
"num_input_tokens_seen": 96800640,
"step": 1020
},
{
"epoch": 0.13011622031328954,
"grad_norm": 0.25690603256225586,
"learning_rate": 3.408912669901943e-05,
"loss": 1.3334,
"num_input_tokens_seen": 97827232,
"step": 1030
},
{
"epoch": 0.13137948458817586,
"grad_norm": 0.2836136817932129,
"learning_rate": 3.40661876947734e-05,
"loss": 1.3122,
"num_input_tokens_seen": 98797088,
"step": 1040
},
{
"epoch": 0.13264274886306215,
"grad_norm": 0.2613033354282379,
"learning_rate": 3.4042971342231655e-05,
"loss": 1.3665,
"num_input_tokens_seen": 99772384,
"step": 1050
},
{
"epoch": 0.13390601313794845,
"grad_norm": 0.2632371485233307,
"learning_rate": 3.401947803007841e-05,
"loss": 1.342,
"num_input_tokens_seen": 100704544,
"step": 1060
},
{
"epoch": 0.13516927741283477,
"grad_norm": 0.25628045201301575,
"learning_rate": 3.399570815163471e-05,
"loss": 1.3686,
"num_input_tokens_seen": 101608800,
"step": 1070
},
{
"epoch": 0.13643254168772107,
"grad_norm": 0.23973917961120605,
"learning_rate": 3.397166210485182e-05,
"loss": 1.393,
"num_input_tokens_seen": 102571712,
"step": 1080
},
{
"epoch": 0.1376958059626074,
"grad_norm": 0.32102668285369873,
"learning_rate": 3.394734029230454e-05,
"loss": 1.2795,
"num_input_tokens_seen": 103472640,
"step": 1090
},
{
"epoch": 0.13895907023749368,
"grad_norm": 0.2778148651123047,
"learning_rate": 3.3922743121184533e-05,
"loss": 1.2751,
"num_input_tokens_seen": 104464224,
"step": 1100
},
{
"epoch": 0.14022233451238,
"grad_norm": 0.2992386221885681,
"learning_rate": 3.3897871003293454e-05,
"loss": 1.2715,
"num_input_tokens_seen": 105472736,
"step": 1110
},
{
"epoch": 0.1414855987872663,
"grad_norm": 0.2530061900615692,
"learning_rate": 3.3872724355036066e-05,
"loss": 1.3162,
"num_input_tokens_seen": 106384480,
"step": 1120
},
{
"epoch": 0.1427488630621526,
"grad_norm": 0.2719084918498993,
"learning_rate": 3.384730359741327e-05,
"loss": 1.2827,
"num_input_tokens_seen": 107319712,
"step": 1130
},
{
"epoch": 0.14401212733703891,
"grad_norm": 0.26223063468933105,
"learning_rate": 3.3821609156015086e-05,
"loss": 1.3352,
"num_input_tokens_seen": 108260576,
"step": 1140
},
{
"epoch": 0.1452753916119252,
"grad_norm": 0.28642159700393677,
"learning_rate": 3.3795641461013454e-05,
"loss": 1.3423,
"num_input_tokens_seen": 109234720,
"step": 1150
},
{
"epoch": 0.14653865588681153,
"grad_norm": 0.3532911539077759,
"learning_rate": 3.376940094715512e-05,
"loss": 1.3319,
"num_input_tokens_seen": 110154176,
"step": 1160
},
{
"epoch": 0.14780192016169783,
"grad_norm": 0.2519535720348358,
"learning_rate": 3.3742888053754295e-05,
"loss": 1.3348,
"num_input_tokens_seen": 111066432,
"step": 1170
},
{
"epoch": 0.14906518443658415,
"grad_norm": 0.28797778487205505,
"learning_rate": 3.371610322468534e-05,
"loss": 1.3478,
"num_input_tokens_seen": 112032064,
"step": 1180
},
{
"epoch": 0.15032844871147044,
"grad_norm": 0.2780948281288147,
"learning_rate": 3.368904690837529e-05,
"loss": 1.3099,
"num_input_tokens_seen": 113065184,
"step": 1190
},
{
"epoch": 0.15159171298635674,
"grad_norm": 0.3206534683704376,
"learning_rate": 3.3661719557796405e-05,
"loss": 1.3218,
"num_input_tokens_seen": 114056096,
"step": 1200
},
{
"epoch": 0.15285497726124306,
"grad_norm": 0.30456361174583435,
"learning_rate": 3.363412163045853e-05,
"loss": 1.3439,
"num_input_tokens_seen": 115039808,
"step": 1210
},
{
"epoch": 0.15411824153612935,
"grad_norm": 0.27767330408096313,
"learning_rate": 3.3606253588401474e-05,
"loss": 1.2642,
"num_input_tokens_seen": 115943872,
"step": 1220
},
{
"epoch": 0.15538150581101567,
"grad_norm": 0.25447219610214233,
"learning_rate": 3.357811589818724e-05,
"loss": 1.3209,
"num_input_tokens_seen": 116934144,
"step": 1230
},
{
"epoch": 0.15664477008590197,
"grad_norm": 0.28984275460243225,
"learning_rate": 3.354970903089228e-05,
"loss": 1.2694,
"num_input_tokens_seen": 117866592,
"step": 1240
},
{
"epoch": 0.15790803436078826,
"grad_norm": 0.2603750228881836,
"learning_rate": 3.3521033462099505e-05,
"loss": 1.3538,
"num_input_tokens_seen": 118792000,
"step": 1250
},
{
"epoch": 0.15917129863567459,
"grad_norm": 0.2679465413093567,
"learning_rate": 3.3492089671890414e-05,
"loss": 1.3708,
"num_input_tokens_seen": 119700608,
"step": 1260
},
{
"epoch": 0.16043456291056088,
"grad_norm": 0.2753802537918091,
"learning_rate": 3.346287814483703e-05,
"loss": 1.2785,
"num_input_tokens_seen": 120664544,
"step": 1270
},
{
"epoch": 0.1616978271854472,
"grad_norm": 0.2532285153865814,
"learning_rate": 3.3433399369993764e-05,
"loss": 1.3176,
"num_input_tokens_seen": 121630656,
"step": 1280
},
{
"epoch": 0.1629610914603335,
"grad_norm": 0.2713632583618164,
"learning_rate": 3.340365384088924e-05,
"loss": 1.2721,
"num_input_tokens_seen": 122593728,
"step": 1290
},
{
"epoch": 0.16422435573521982,
"grad_norm": 0.31818637251853943,
"learning_rate": 3.337364205551805e-05,
"loss": 1.3474,
"num_input_tokens_seen": 123604064,
"step": 1300
},
{
"epoch": 0.1654876200101061,
"grad_norm": 0.28953075408935547,
"learning_rate": 3.3343364516332404e-05,
"loss": 1.3117,
"num_input_tokens_seen": 124606080,
"step": 1310
},
{
"epoch": 0.1667508842849924,
"grad_norm": 0.32029005885124207,
"learning_rate": 3.331282173023371e-05,
"loss": 1.3281,
"num_input_tokens_seen": 125569664,
"step": 1320
},
{
"epoch": 0.16801414855987873,
"grad_norm": 0.2608253061771393,
"learning_rate": 3.328201420856409e-05,
"loss": 1.2915,
"num_input_tokens_seen": 126460768,
"step": 1330
},
{
"epoch": 0.16927741283476502,
"grad_norm": 0.2563798725605011,
"learning_rate": 3.3250942467097835e-05,
"loss": 1.3308,
"num_input_tokens_seen": 127405408,
"step": 1340
},
{
"epoch": 0.17054067710965135,
"grad_norm": 0.26563408970832825,
"learning_rate": 3.3219607026032747e-05,
"loss": 1.294,
"num_input_tokens_seen": 128331968,
"step": 1350
},
{
"epoch": 0.17180394138453764,
"grad_norm": 0.2531772553920746,
"learning_rate": 3.318800840998146e-05,
"loss": 1.3276,
"num_input_tokens_seen": 129301248,
"step": 1360
},
{
"epoch": 0.17306720565942396,
"grad_norm": 0.2774362862110138,
"learning_rate": 3.3156147147962623e-05,
"loss": 1.2639,
"num_input_tokens_seen": 130282336,
"step": 1370
},
{
"epoch": 0.17433046993431026,
"grad_norm": 0.284277081489563,
"learning_rate": 3.312402377339206e-05,
"loss": 1.3216,
"num_input_tokens_seen": 131225056,
"step": 1380
},
{
"epoch": 0.17559373420919655,
"grad_norm": 0.2917383015155792,
"learning_rate": 3.309163882407384e-05,
"loss": 1.2568,
"num_input_tokens_seen": 132157504,
"step": 1390
},
{
"epoch": 0.17685699848408287,
"grad_norm": 0.2731410264968872,
"learning_rate": 3.305899284219128e-05,
"loss": 1.3375,
"num_input_tokens_seen": 133115200,
"step": 1400
},
{
"epoch": 0.17812026275896917,
"grad_norm": 0.28233301639556885,
"learning_rate": 3.302608637429786e-05,
"loss": 1.2466,
"num_input_tokens_seen": 134032192,
"step": 1410
},
{
"epoch": 0.1793835270338555,
"grad_norm": 0.2799434959888458,
"learning_rate": 3.2992919971308055e-05,
"loss": 1.2824,
"num_input_tokens_seen": 134994208,
"step": 1420
},
{
"epoch": 0.18064679130874178,
"grad_norm": 0.29594945907592773,
"learning_rate": 3.295949418848814e-05,
"loss": 1.3309,
"num_input_tokens_seen": 135938144,
"step": 1430
},
{
"epoch": 0.1819100555836281,
"grad_norm": 0.318526953458786,
"learning_rate": 3.29258095854469e-05,
"loss": 1.2905,
"num_input_tokens_seen": 136866336,
"step": 1440
},
{
"epoch": 0.1831733198585144,
"grad_norm": 0.2683306634426117,
"learning_rate": 3.289186672612621e-05,
"loss": 1.2648,
"num_input_tokens_seen": 137815456,
"step": 1450
},
{
"epoch": 0.1844365841334007,
"grad_norm": 0.27116644382476807,
"learning_rate": 3.2857666178791656e-05,
"loss": 1.2829,
"num_input_tokens_seen": 138780256,
"step": 1460
},
{
"epoch": 0.18569984840828702,
"grad_norm": 0.28254273533821106,
"learning_rate": 3.282320851602298e-05,
"loss": 1.3141,
"num_input_tokens_seen": 139750496,
"step": 1470
},
{
"epoch": 0.1869631126831733,
"grad_norm": 0.26385799050331116,
"learning_rate": 3.2788494314704503e-05,
"loss": 1.329,
"num_input_tokens_seen": 140654176,
"step": 1480
},
{
"epoch": 0.18822637695805963,
"grad_norm": 0.273930162191391,
"learning_rate": 3.275352415601548e-05,
"loss": 1.3267,
"num_input_tokens_seen": 141615424,
"step": 1490
},
{
"epoch": 0.18948964123294593,
"grad_norm": 0.2711365520954132,
"learning_rate": 3.2718298625420366e-05,
"loss": 1.2756,
"num_input_tokens_seen": 142543328,
"step": 1500
},
{
"epoch": 0.19075290550783225,
"grad_norm": 0.27136221528053284,
"learning_rate": 3.268281831265899e-05,
"loss": 1.3284,
"num_input_tokens_seen": 143524416,
"step": 1510
},
{
"epoch": 0.19201616978271854,
"grad_norm": 0.31618639826774597,
"learning_rate": 3.264708381173672e-05,
"loss": 1.3199,
"num_input_tokens_seen": 144454016,
"step": 1520
},
{
"epoch": 0.19327943405760484,
"grad_norm": 0.4721730053424835,
"learning_rate": 3.261109572091448e-05,
"loss": 1.3317,
"num_input_tokens_seen": 145434336,
"step": 1530
},
{
"epoch": 0.19454269833249116,
"grad_norm": 0.2652052342891693,
"learning_rate": 3.257485464269878e-05,
"loss": 1.3733,
"num_input_tokens_seen": 146342112,
"step": 1540
},
{
"epoch": 0.19580596260737745,
"grad_norm": 0.25424447655677795,
"learning_rate": 3.253836118383157e-05,
"loss": 1.2725,
"num_input_tokens_seen": 147287264,
"step": 1550
},
{
"epoch": 0.19706922688226378,
"grad_norm": 0.2884797751903534,
"learning_rate": 3.2501615955280134e-05,
"loss": 1.3223,
"num_input_tokens_seen": 148183456,
"step": 1560
},
{
"epoch": 0.19833249115715007,
"grad_norm": 0.2777753174304962,
"learning_rate": 3.2464619572226836e-05,
"loss": 1.3182,
"num_input_tokens_seen": 149094624,
"step": 1570
},
{
"epoch": 0.1995957554320364,
"grad_norm": 0.27247852087020874,
"learning_rate": 3.242737265405882e-05,
"loss": 1.3171,
"num_input_tokens_seen": 149997920,
"step": 1580
},
{
"epoch": 0.2008590197069227,
"grad_norm": 0.2738061249256134,
"learning_rate": 3.238987582435767e-05,
"loss": 1.2938,
"num_input_tokens_seen": 150960064,
"step": 1590
},
{
"epoch": 0.20212228398180898,
"grad_norm": 0.2913673520088196,
"learning_rate": 3.235212971088891e-05,
"loss": 1.3214,
"num_input_tokens_seen": 151918208,
"step": 1600
},
{
"epoch": 0.2033855482566953,
"grad_norm": 0.279725044965744,
"learning_rate": 3.231413494559156e-05,
"loss": 1.2746,
"num_input_tokens_seen": 152856864,
"step": 1610
},
{
"epoch": 0.2046488125315816,
"grad_norm": 0.27453747391700745,
"learning_rate": 3.227589216456752e-05,
"loss": 1.3174,
"num_input_tokens_seen": 153804192,
"step": 1620
},
{
"epoch": 0.20591207680646792,
"grad_norm": 0.22528155148029327,
"learning_rate": 3.223740200807091e-05,
"loss": 1.2745,
"num_input_tokens_seen": 154817632,
"step": 1630
},
{
"epoch": 0.20717534108135421,
"grad_norm": 0.27404505014419556,
"learning_rate": 3.2198665120497394e-05,
"loss": 1.3032,
"num_input_tokens_seen": 155756448,
"step": 1640
},
{
"epoch": 0.20843860535624054,
"grad_norm": 0.32085704803466797,
"learning_rate": 3.215968215037334e-05,
"loss": 1.3325,
"num_input_tokens_seen": 156763232,
"step": 1650
},
{
"epoch": 0.20970186963112683,
"grad_norm": 0.27827686071395874,
"learning_rate": 3.212045375034501e-05,
"loss": 1.2955,
"num_input_tokens_seen": 157709600,
"step": 1660
},
{
"epoch": 0.21096513390601312,
"grad_norm": 0.2595587968826294,
"learning_rate": 3.20809805771676e-05,
"loss": 1.2932,
"num_input_tokens_seen": 158695680,
"step": 1670
},
{
"epoch": 0.21222839818089945,
"grad_norm": 0.26113271713256836,
"learning_rate": 3.204126329169426e-05,
"loss": 1.2886,
"num_input_tokens_seen": 159651584,
"step": 1680
},
{
"epoch": 0.21349166245578574,
"grad_norm": 0.3666292428970337,
"learning_rate": 3.200130255886503e-05,
"loss": 1.3232,
"num_input_tokens_seen": 160621120,
"step": 1690
},
{
"epoch": 0.21475492673067206,
"grad_norm": 0.30534592270851135,
"learning_rate": 3.196109904769568e-05,
"loss": 1.3539,
"num_input_tokens_seen": 161585024,
"step": 1700
},
{
"epoch": 0.21601819100555836,
"grad_norm": 0.2684236466884613,
"learning_rate": 3.192065343126658e-05,
"loss": 1.2818,
"num_input_tokens_seen": 162539520,
"step": 1710
},
{
"epoch": 0.21728145528044468,
"grad_norm": 0.26715096831321716,
"learning_rate": 3.187996638671134e-05,
"loss": 1.2616,
"num_input_tokens_seen": 163462688,
"step": 1720
},
{
"epoch": 0.21854471955533097,
"grad_norm": 0.26400476694107056,
"learning_rate": 3.1839038595205555e-05,
"loss": 1.3017,
"num_input_tokens_seen": 164408768,
"step": 1730
},
{
"epoch": 0.21980798383021727,
"grad_norm": 0.2887386381626129,
"learning_rate": 3.1797870741955326e-05,
"loss": 1.2897,
"num_input_tokens_seen": 165382816,
"step": 1740
},
{
"epoch": 0.2210712481051036,
"grad_norm": 0.26668059825897217,
"learning_rate": 3.175646351618586e-05,
"loss": 1.3151,
"num_input_tokens_seen": 166320832,
"step": 1750
},
{
"epoch": 0.22233451237998988,
"grad_norm": 0.2531121075153351,
"learning_rate": 3.171481761112989e-05,
"loss": 1.3027,
"num_input_tokens_seen": 167349856,
"step": 1760
},
{
"epoch": 0.2235977766548762,
"grad_norm": 0.24423161149024963,
"learning_rate": 3.167293372401606e-05,
"loss": 1.3245,
"num_input_tokens_seen": 168295712,
"step": 1770
},
{
"epoch": 0.2248610409297625,
"grad_norm": 0.31519579887390137,
"learning_rate": 3.163081255605729e-05,
"loss": 1.2645,
"num_input_tokens_seen": 169282112,
"step": 1780
},
{
"epoch": 0.22612430520464882,
"grad_norm": 0.26210370659828186,
"learning_rate": 3.1588454812439e-05,
"loss": 1.3267,
"num_input_tokens_seen": 170222336,
"step": 1790
},
{
"epoch": 0.22738756947953512,
"grad_norm": 0.27912288904190063,
"learning_rate": 3.154586120230734e-05,
"loss": 1.277,
"num_input_tokens_seen": 171119488,
"step": 1800
},
{
"epoch": 0.2286508337544214,
"grad_norm": 0.26281440258026123,
"learning_rate": 3.150303243875727e-05,
"loss": 1.2892,
"num_input_tokens_seen": 172093984,
"step": 1810
},
{
"epoch": 0.22991409802930773,
"grad_norm": 0.2663213908672333,
"learning_rate": 3.1459969238820664e-05,
"loss": 1.3388,
"num_input_tokens_seen": 172993696,
"step": 1820
},
{
"epoch": 0.23117736230419403,
"grad_norm": 0.27080100774765015,
"learning_rate": 3.141667232345429e-05,
"loss": 1.3374,
"num_input_tokens_seen": 173906304,
"step": 1830
},
{
"epoch": 0.23244062657908035,
"grad_norm": 0.2679150104522705,
"learning_rate": 3.137314241752775e-05,
"loss": 1.288,
"num_input_tokens_seen": 174847680,
"step": 1840
},
{
"epoch": 0.23370389085396664,
"grad_norm": 0.2680162489414215,
"learning_rate": 3.1329380249811304e-05,
"loss": 1.3088,
"num_input_tokens_seen": 175814240,
"step": 1850
},
{
"epoch": 0.23496715512885297,
"grad_norm": 0.27686336636543274,
"learning_rate": 3.128538655296373e-05,
"loss": 1.2868,
"num_input_tokens_seen": 176805408,
"step": 1860
},
{
"epoch": 0.23623041940373926,
"grad_norm": 0.2732996344566345,
"learning_rate": 3.1241162063520015e-05,
"loss": 1.3692,
"num_input_tokens_seen": 177763168,
"step": 1870
},
{
"epoch": 0.23749368367862556,
"grad_norm": 0.25114187598228455,
"learning_rate": 3.1196707521879027e-05,
"loss": 1.3054,
"num_input_tokens_seen": 178689312,
"step": 1880
},
{
"epoch": 0.23875694795351188,
"grad_norm": 0.29648059606552124,
"learning_rate": 3.115202367229115e-05,
"loss": 1.3289,
"num_input_tokens_seen": 179578144,
"step": 1890
},
{
"epoch": 0.24002021222839817,
"grad_norm": 0.25034409761428833,
"learning_rate": 3.110711126284578e-05,
"loss": 1.305,
"num_input_tokens_seen": 180480192,
"step": 1900
},
{
"epoch": 0.2412834765032845,
"grad_norm": 0.26325249671936035,
"learning_rate": 3.106197104545884e-05,
"loss": 1.2645,
"num_input_tokens_seen": 181482336,
"step": 1910
},
{
"epoch": 0.2425467407781708,
"grad_norm": 0.279535710811615,
"learning_rate": 3.101660377586017e-05,
"loss": 1.2723,
"num_input_tokens_seen": 182353792,
"step": 1920
},
{
"epoch": 0.2438100050530571,
"grad_norm": 0.27417901158332825,
"learning_rate": 3.097101021358088e-05,
"loss": 1.2933,
"num_input_tokens_seen": 183284000,
"step": 1930
},
{
"epoch": 0.2450732693279434,
"grad_norm": 0.2854447066783905,
"learning_rate": 3.092519112194063e-05,
"loss": 1.2642,
"num_input_tokens_seen": 184244640,
"step": 1940
},
{
"epoch": 0.2463365336028297,
"grad_norm": 0.2935086190700531,
"learning_rate": 3.087914726803486e-05,
"loss": 1.3183,
"num_input_tokens_seen": 185157728,
"step": 1950
},
{
"epoch": 0.24759979787771602,
"grad_norm": 0.255464643239975,
"learning_rate": 3.0832879422721926e-05,
"loss": 1.2957,
"num_input_tokens_seen": 186099200,
"step": 1960
},
{
"epoch": 0.24886306215260232,
"grad_norm": 0.2608180642127991,
"learning_rate": 3.078638836061023e-05,
"loss": 1.3333,
"num_input_tokens_seen": 187017280,
"step": 1970
},
{
"epoch": 0.2501263264274886,
"grad_norm": 0.3294975459575653,
"learning_rate": 3.073967486004523e-05,
"loss": 1.332,
"num_input_tokens_seen": 187879360,
"step": 1980
},
{
"epoch": 0.25138959070237493,
"grad_norm": 0.2539006769657135,
"learning_rate": 3.069273970309639e-05,
"loss": 1.2726,
"num_input_tokens_seen": 188825632,
"step": 1990
},
{
"epoch": 0.25265285497726125,
"grad_norm": 0.282306969165802,
"learning_rate": 3.064558367554414e-05,
"loss": 1.32,
"num_input_tokens_seen": 189801824,
"step": 2000
},
{
"epoch": 0.25265285497726125,
"eval_loss": 1.321367859840393,
"eval_runtime": 11.9892,
"eval_samples_per_second": 12.511,
"eval_steps_per_second": 0.834,
"num_input_tokens_seen": 189801824,
"step": 2000
},
{
"epoch": 0.2539161192521476,
"grad_norm": 0.30715829133987427,
"learning_rate": 3.0598207566866656e-05,
"loss": 1.2423,
"num_input_tokens_seen": 190754304,
"step": 2010
},
{
"epoch": 0.25517938352703384,
"grad_norm": 0.2773028016090393,
"learning_rate": 3.055061217022669e-05,
"loss": 1.2411,
"num_input_tokens_seen": 191695456,
"step": 2020
},
{
"epoch": 0.25644264780192016,
"grad_norm": 0.267785906791687,
"learning_rate": 3.0502798282458278e-05,
"loss": 1.2461,
"num_input_tokens_seen": 192625312,
"step": 2030
},
{
"epoch": 0.2577059120768065,
"grad_norm": 0.2458842545747757,
"learning_rate": 3.0454766704053395e-05,
"loss": 1.2419,
"num_input_tokens_seen": 193574848,
"step": 2040
},
{
"epoch": 0.25896917635169275,
"grad_norm": 0.27695903182029724,
"learning_rate": 3.040651823914855e-05,
"loss": 1.3366,
"num_input_tokens_seen": 194470688,
"step": 2050
},
{
"epoch": 0.2602324406265791,
"grad_norm": 0.3028598725795746,
"learning_rate": 3.0358053695511335e-05,
"loss": 1.3199,
"num_input_tokens_seen": 195437280,
"step": 2060
},
{
"epoch": 0.2614957049014654,
"grad_norm": 0.2882876396179199,
"learning_rate": 3.030937388452689e-05,
"loss": 1.3221,
"num_input_tokens_seen": 196396320,
"step": 2070
},
{
"epoch": 0.2627589691763517,
"grad_norm": 0.29042840003967285,
"learning_rate": 3.026047962118433e-05,
"loss": 1.2693,
"num_input_tokens_seen": 197314176,
"step": 2080
},
{
"epoch": 0.264022233451238,
"grad_norm": 0.3192022740840912,
"learning_rate": 3.0211371724063097e-05,
"loss": 1.2668,
"num_input_tokens_seen": 198295456,
"step": 2090
},
{
"epoch": 0.2652854977261243,
"grad_norm": 0.250468373298645,
"learning_rate": 3.016205101531925e-05,
"loss": 1.2951,
"num_input_tokens_seen": 199239264,
"step": 2100
},
{
"epoch": 0.26654876200101063,
"grad_norm": 0.2620362639427185,
"learning_rate": 3.0112518320671694e-05,
"loss": 1.2826,
"num_input_tokens_seen": 200166720,
"step": 2110
},
{
"epoch": 0.2678120262758969,
"grad_norm": 0.2919938862323761,
"learning_rate": 3.0062774469388378e-05,
"loss": 1.3001,
"num_input_tokens_seen": 201163456,
"step": 2120
},
{
"epoch": 0.2690752905507832,
"grad_norm": 0.26850852370262146,
"learning_rate": 3.0012820294272402e-05,
"loss": 1.3118,
"num_input_tokens_seen": 202055360,
"step": 2130
},
{
"epoch": 0.27033855482566954,
"grad_norm": 0.2463986724615097,
"learning_rate": 2.9962656631648068e-05,
"loss": 1.2797,
"num_input_tokens_seen": 202973376,
"step": 2140
},
{
"epoch": 0.27160181910055586,
"grad_norm": 0.3001090884208679,
"learning_rate": 2.991228432134687e-05,
"loss": 1.2917,
"num_input_tokens_seen": 203918208,
"step": 2150
},
{
"epoch": 0.27286508337544213,
"grad_norm": 0.2551255524158478,
"learning_rate": 2.9861704206693464e-05,
"loss": 1.299,
"num_input_tokens_seen": 204934080,
"step": 2160
},
{
"epoch": 0.27412834765032845,
"grad_norm": 0.26097556948661804,
"learning_rate": 2.9810917134491515e-05,
"loss": 1.2935,
"num_input_tokens_seen": 205865376,
"step": 2170
},
{
"epoch": 0.2753916119252148,
"grad_norm": 0.2827478051185608,
"learning_rate": 2.975992395500956e-05,
"loss": 1.3006,
"num_input_tokens_seen": 206770144,
"step": 2180
},
{
"epoch": 0.27665487620010104,
"grad_norm": 0.28954237699508667,
"learning_rate": 2.9708725521966717e-05,
"loss": 1.3424,
"num_input_tokens_seen": 207706784,
"step": 2190
},
{
"epoch": 0.27791814047498736,
"grad_norm": 0.2639777660369873,
"learning_rate": 2.9657322692518452e-05,
"loss": 1.231,
"num_input_tokens_seen": 208641184,
"step": 2200
},
{
"epoch": 0.2791814047498737,
"grad_norm": 0.24287603795528412,
"learning_rate": 2.9605716327242188e-05,
"loss": 1.297,
"num_input_tokens_seen": 209596512,
"step": 2210
},
{
"epoch": 0.28044466902476,
"grad_norm": 0.2651768624782562,
"learning_rate": 2.9553907290122907e-05,
"loss": 1.3049,
"num_input_tokens_seen": 210586464,
"step": 2220
},
{
"epoch": 0.2817079332996463,
"grad_norm": 0.2656504809856415,
"learning_rate": 2.9501896448538696e-05,
"loss": 1.3497,
"num_input_tokens_seen": 211556992,
"step": 2230
},
{
"epoch": 0.2829711975745326,
"grad_norm": 0.26418015360832214,
"learning_rate": 2.9449684673246218e-05,
"loss": 1.2702,
"num_input_tokens_seen": 212522560,
"step": 2240
},
{
"epoch": 0.2842344618494189,
"grad_norm": 0.2586632966995239,
"learning_rate": 2.9397272838366127e-05,
"loss": 1.3232,
"num_input_tokens_seen": 213488448,
"step": 2250
},
{
"epoch": 0.2854977261243052,
"grad_norm": 0.28703370690345764,
"learning_rate": 2.934466182136845e-05,
"loss": 1.3158,
"num_input_tokens_seen": 214453408,
"step": 2260
},
{
"epoch": 0.2867609903991915,
"grad_norm": 0.2626774311065674,
"learning_rate": 2.9291852503057874e-05,
"loss": 1.3394,
"num_input_tokens_seen": 215412832,
"step": 2270
},
{
"epoch": 0.28802425467407783,
"grad_norm": 0.256173312664032,
"learning_rate": 2.923884576755903e-05,
"loss": 1.3325,
"num_input_tokens_seen": 216335968,
"step": 2280
},
{
"epoch": 0.28928751894896415,
"grad_norm": 0.26622363924980164,
"learning_rate": 2.9185642502301656e-05,
"loss": 1.2535,
"num_input_tokens_seen": 217269728,
"step": 2290
},
{
"epoch": 0.2905507832238504,
"grad_norm": 0.3084118068218231,
"learning_rate": 2.9132243598005775e-05,
"loss": 1.2808,
"num_input_tokens_seen": 218189440,
"step": 2300
},
{
"epoch": 0.29181404749873674,
"grad_norm": 0.32699644565582275,
"learning_rate": 2.9078649948666754e-05,
"loss": 1.3637,
"num_input_tokens_seen": 219151008,
"step": 2310
},
{
"epoch": 0.29307731177362306,
"grad_norm": 0.2988159954547882,
"learning_rate": 2.902486245154035e-05,
"loss": 1.2898,
"num_input_tokens_seen": 220065312,
"step": 2320
},
{
"epoch": 0.29434057604850933,
"grad_norm": 0.27708715200424194,
"learning_rate": 2.897088200712769e-05,
"loss": 1.2583,
"num_input_tokens_seen": 220958560,
"step": 2330
},
{
"epoch": 0.29560384032339565,
"grad_norm": 0.2532431185245514,
"learning_rate": 2.8916709519160187e-05,
"loss": 1.2647,
"num_input_tokens_seen": 221960800,
"step": 2340
},
{
"epoch": 0.296867104598282,
"grad_norm": 0.2507975101470947,
"learning_rate": 2.8862345894584418e-05,
"loss": 1.2569,
"num_input_tokens_seen": 222927616,
"step": 2350
},
{
"epoch": 0.2981303688731683,
"grad_norm": 0.30082589387893677,
"learning_rate": 2.880779204354694e-05,
"loss": 1.2582,
"num_input_tokens_seen": 223897536,
"step": 2360
},
{
"epoch": 0.29939363314805456,
"grad_norm": 0.25084131956100464,
"learning_rate": 2.875304887937904e-05,
"loss": 1.2445,
"num_input_tokens_seen": 224856256,
"step": 2370
},
{
"epoch": 0.3006568974229409,
"grad_norm": 0.27553117275238037,
"learning_rate": 2.869811731858146e-05,
"loss": 1.2693,
"num_input_tokens_seen": 225789760,
"step": 2380
},
{
"epoch": 0.3019201616978272,
"grad_norm": 0.31296080350875854,
"learning_rate": 2.864299828080905e-05,
"loss": 1.3125,
"num_input_tokens_seen": 226730144,
"step": 2390
},
{
"epoch": 0.30318342597271347,
"grad_norm": 0.2597751021385193,
"learning_rate": 2.858769268885535e-05,
"loss": 1.2959,
"num_input_tokens_seen": 227688608,
"step": 2400
},
{
"epoch": 0.3044466902475998,
"grad_norm": 0.27299267053604126,
"learning_rate": 2.8532201468637184e-05,
"loss": 1.2932,
"num_input_tokens_seen": 228590528,
"step": 2410
},
{
"epoch": 0.3057099545224861,
"grad_norm": 0.2804098129272461,
"learning_rate": 2.8476525549179103e-05,
"loss": 1.3001,
"num_input_tokens_seen": 229560000,
"step": 2420
},
{
"epoch": 0.30697321879737244,
"grad_norm": 0.30946534872055054,
"learning_rate": 2.8420665862597894e-05,
"loss": 1.2657,
"num_input_tokens_seen": 230542208,
"step": 2430
},
{
"epoch": 0.3082364830722587,
"grad_norm": 0.2868455648422241,
"learning_rate": 2.8364623344086917e-05,
"loss": 1.3603,
"num_input_tokens_seen": 231454912,
"step": 2440
},
{
"epoch": 0.309499747347145,
"grad_norm": 0.27222952246665955,
"learning_rate": 2.8308398931900488e-05,
"loss": 1.2796,
"num_input_tokens_seen": 232387808,
"step": 2450
},
{
"epoch": 0.31076301162203135,
"grad_norm": 0.29506227374076843,
"learning_rate": 2.825199356733814e-05,
"loss": 1.2863,
"num_input_tokens_seen": 233295584,
"step": 2460
},
{
"epoch": 0.3120262758969176,
"grad_norm": 0.25060921907424927,
"learning_rate": 2.8195408194728893e-05,
"loss": 1.2725,
"num_input_tokens_seen": 234308960,
"step": 2470
},
{
"epoch": 0.31328954017180394,
"grad_norm": 0.29915860295295715,
"learning_rate": 2.8138643761415432e-05,
"loss": 1.2656,
"num_input_tokens_seen": 235218880,
"step": 2480
},
{
"epoch": 0.31455280444669026,
"grad_norm": 0.30492904782295227,
"learning_rate": 2.8081701217738234e-05,
"loss": 1.2962,
"num_input_tokens_seen": 236173888,
"step": 2490
},
{
"epoch": 0.3158160687215765,
"grad_norm": 0.2989721894264221,
"learning_rate": 2.8024581517019686e-05,
"loss": 1.272,
"num_input_tokens_seen": 237219584,
"step": 2500
},
{
"epoch": 0.31707933299646285,
"grad_norm": 0.2604142725467682,
"learning_rate": 2.7967285615548084e-05,
"loss": 1.2846,
"num_input_tokens_seen": 238150432,
"step": 2510
},
{
"epoch": 0.31834259727134917,
"grad_norm": 0.2856138050556183,
"learning_rate": 2.790981447256168e-05,
"loss": 1.309,
"num_input_tokens_seen": 239091040,
"step": 2520
},
{
"epoch": 0.3196058615462355,
"grad_norm": 0.26201140880584717,
"learning_rate": 2.785216905023256e-05,
"loss": 1.3273,
"num_input_tokens_seen": 240005152,
"step": 2530
},
{
"epoch": 0.32086912582112176,
"grad_norm": 0.2805967628955841,
"learning_rate": 2.7794350313650574e-05,
"loss": 1.3044,
"num_input_tokens_seen": 240957856,
"step": 2540
},
{
"epoch": 0.3221323900960081,
"grad_norm": 0.25588178634643555,
"learning_rate": 2.7736359230807183e-05,
"loss": 1.4082,
"num_input_tokens_seen": 241939904,
"step": 2550
},
{
"epoch": 0.3233956543708944,
"grad_norm": 0.25974375009536743,
"learning_rate": 2.767819677257922e-05,
"loss": 1.3256,
"num_input_tokens_seen": 242886176,
"step": 2560
},
{
"epoch": 0.32465891864578067,
"grad_norm": 0.2552843689918518,
"learning_rate": 2.761986391271267e-05,
"loss": 1.3003,
"num_input_tokens_seen": 243769600,
"step": 2570
},
{
"epoch": 0.325922182920667,
"grad_norm": 0.2774961590766907,
"learning_rate": 2.7561361627806343e-05,
"loss": 1.3239,
"num_input_tokens_seen": 244675136,
"step": 2580
},
{
"epoch": 0.3271854471955533,
"grad_norm": 0.27106648683547974,
"learning_rate": 2.7502690897295546e-05,
"loss": 1.3087,
"num_input_tokens_seen": 245566400,
"step": 2590
},
{
"epoch": 0.32844871147043964,
"grad_norm": 0.253461629152298,
"learning_rate": 2.7443852703435657e-05,
"loss": 1.2503,
"num_input_tokens_seen": 246513216,
"step": 2600
},
{
"epoch": 0.3297119757453259,
"grad_norm": 0.290099173784256,
"learning_rate": 2.738484803128571e-05,
"loss": 1.3034,
"num_input_tokens_seen": 247488992,
"step": 2610
},
{
"epoch": 0.3309752400202122,
"grad_norm": 0.2331458479166031,
"learning_rate": 2.7325677868691897e-05,
"loss": 1.2443,
"num_input_tokens_seen": 248404800,
"step": 2620
},
{
"epoch": 0.33223850429509855,
"grad_norm": 0.2953519821166992,
"learning_rate": 2.7266343206271e-05,
"loss": 1.2703,
"num_input_tokens_seen": 249396800,
"step": 2630
},
{
"epoch": 0.3335017685699848,
"grad_norm": 0.2447034865617752,
"learning_rate": 2.7206845037393847e-05,
"loss": 1.2079,
"num_input_tokens_seen": 250344864,
"step": 2640
},
{
"epoch": 0.33476503284487114,
"grad_norm": 0.2688887417316437,
"learning_rate": 2.7147184358168654e-05,
"loss": 1.2866,
"num_input_tokens_seen": 251205088,
"step": 2650
},
{
"epoch": 0.33602829711975746,
"grad_norm": 0.284983366727829,
"learning_rate": 2.7087362167424363e-05,
"loss": 1.2328,
"num_input_tokens_seen": 252125664,
"step": 2660
},
{
"epoch": 0.3372915613946438,
"grad_norm": 0.26568886637687683,
"learning_rate": 2.7027379466693918e-05,
"loss": 1.3343,
"num_input_tokens_seen": 253090112,
"step": 2670
},
{
"epoch": 0.33855482566953005,
"grad_norm": 0.2735290229320526,
"learning_rate": 2.6967237260197486e-05,
"loss": 1.3117,
"num_input_tokens_seen": 254002816,
"step": 2680
},
{
"epoch": 0.33981808994441637,
"grad_norm": 0.2602190673351288,
"learning_rate": 2.6906936554825652e-05,
"loss": 1.2729,
"num_input_tokens_seen": 254977856,
"step": 2690
},
{
"epoch": 0.3410813542193027,
"grad_norm": 0.279680997133255,
"learning_rate": 2.6846478360122567e-05,
"loss": 1.2494,
"num_input_tokens_seen": 255872864,
"step": 2700
},
{
"epoch": 0.34234461849418896,
"grad_norm": 0.29687556624412537,
"learning_rate": 2.6785863688269038e-05,
"loss": 1.3039,
"num_input_tokens_seen": 256788352,
"step": 2710
},
{
"epoch": 0.3436078827690753,
"grad_norm": 0.24734219908714294,
"learning_rate": 2.6725093554065596e-05,
"loss": 1.2728,
"num_input_tokens_seen": 257691904,
"step": 2720
},
{
"epoch": 0.3448711470439616,
"grad_norm": 0.2798856496810913,
"learning_rate": 2.666416897491548e-05,
"loss": 1.2519,
"num_input_tokens_seen": 258613408,
"step": 2730
},
{
"epoch": 0.3461344113188479,
"grad_norm": 0.3039948046207428,
"learning_rate": 2.660309097080763e-05,
"loss": 1.354,
"num_input_tokens_seen": 259569248,
"step": 2740
},
{
"epoch": 0.3473976755937342,
"grad_norm": 0.25825923681259155,
"learning_rate": 2.6541860564299605e-05,
"loss": 1.265,
"num_input_tokens_seen": 260534624,
"step": 2750
},
{
"epoch": 0.3486609398686205,
"grad_norm": 0.2977043390274048,
"learning_rate": 2.6480478780500435e-05,
"loss": 1.3044,
"num_input_tokens_seen": 261467520,
"step": 2760
},
{
"epoch": 0.34992420414350683,
"grad_norm": 0.2831237018108368,
"learning_rate": 2.6418946647053525e-05,
"loss": 1.2419,
"num_input_tokens_seen": 262404128,
"step": 2770
},
{
"epoch": 0.3511874684183931,
"grad_norm": 0.27858638763427734,
"learning_rate": 2.635726519411936e-05,
"loss": 1.2902,
"num_input_tokens_seen": 263348320,
"step": 2780
},
{
"epoch": 0.3524507326932794,
"grad_norm": 0.2645137310028076,
"learning_rate": 2.629543545435835e-05,
"loss": 1.2151,
"num_input_tokens_seen": 264335616,
"step": 2790
},
{
"epoch": 0.35371399696816574,
"grad_norm": 0.2533610165119171,
"learning_rate": 2.623345846291347e-05,
"loss": 1.2592,
"num_input_tokens_seen": 265353120,
"step": 2800
},
{
"epoch": 0.35497726124305207,
"grad_norm": 0.25733280181884766,
"learning_rate": 2.6171335257392957e-05,
"loss": 1.3101,
"num_input_tokens_seen": 266300480,
"step": 2810
},
{
"epoch": 0.35624052551793833,
"grad_norm": 0.2579527199268341,
"learning_rate": 2.610906687785296e-05,
"loss": 1.3144,
"num_input_tokens_seen": 267223328,
"step": 2820
},
{
"epoch": 0.35750378979282466,
"grad_norm": 0.2560044527053833,
"learning_rate": 2.6046654366780096e-05,
"loss": 1.2442,
"num_input_tokens_seen": 268154112,
"step": 2830
},
{
"epoch": 0.358767054067711,
"grad_norm": 0.24506497383117676,
"learning_rate": 2.5984098769073995e-05,
"loss": 1.3063,
"num_input_tokens_seen": 269044736,
"step": 2840
},
{
"epoch": 0.36003031834259724,
"grad_norm": 0.27899622917175293,
"learning_rate": 2.592140113202984e-05,
"loss": 1.2877,
"num_input_tokens_seen": 270024064,
"step": 2850
},
{
"epoch": 0.36129358261748357,
"grad_norm": 0.2520020604133606,
"learning_rate": 2.5858562505320787e-05,
"loss": 1.2984,
"num_input_tokens_seen": 270993600,
"step": 2860
},
{
"epoch": 0.3625568468923699,
"grad_norm": 0.24186141788959503,
"learning_rate": 2.5795583940980456e-05,
"loss": 1.2663,
"num_input_tokens_seen": 271930176,
"step": 2870
},
{
"epoch": 0.3638201111672562,
"grad_norm": 0.28816744685173035,
"learning_rate": 2.5732466493385238e-05,
"loss": 1.281,
"num_input_tokens_seen": 272857216,
"step": 2880
},
{
"epoch": 0.3650833754421425,
"grad_norm": 0.29359421133995056,
"learning_rate": 2.566921121923671e-05,
"loss": 1.2804,
"num_input_tokens_seen": 273869376,
"step": 2890
},
{
"epoch": 0.3663466397170288,
"grad_norm": 0.2661145329475403,
"learning_rate": 2.5605819177543906e-05,
"loss": 1.3292,
"num_input_tokens_seen": 274802592,
"step": 2900
},
{
"epoch": 0.3676099039919151,
"grad_norm": 0.26722949743270874,
"learning_rate": 2.55422914296056e-05,
"loss": 1.3162,
"num_input_tokens_seen": 275777312,
"step": 2910
},
{
"epoch": 0.3688731682668014,
"grad_norm": 0.2770121991634369,
"learning_rate": 2.5478629038992545e-05,
"loss": 1.2678,
"num_input_tokens_seen": 276772352,
"step": 2920
},
{
"epoch": 0.3701364325416877,
"grad_norm": 0.24549973011016846,
"learning_rate": 2.5414833071529645e-05,
"loss": 1.2787,
"num_input_tokens_seen": 277728896,
"step": 2930
},
{
"epoch": 0.37139969681657403,
"grad_norm": 0.25942620635032654,
"learning_rate": 2.5350904595278142e-05,
"loss": 1.2834,
"num_input_tokens_seen": 278658272,
"step": 2940
},
{
"epoch": 0.37266296109146035,
"grad_norm": 0.25496846437454224,
"learning_rate": 2.52868446805177e-05,
"loss": 1.2753,
"num_input_tokens_seen": 279635456,
"step": 2950
},
{
"epoch": 0.3739262253663466,
"grad_norm": 0.26107245683670044,
"learning_rate": 2.5222654399728518e-05,
"loss": 1.2995,
"num_input_tokens_seen": 280610176,
"step": 2960
},
{
"epoch": 0.37518948964123294,
"grad_norm": 0.29526421427726746,
"learning_rate": 2.515833482757335e-05,
"loss": 1.2749,
"num_input_tokens_seen": 281500224,
"step": 2970
},
{
"epoch": 0.37645275391611926,
"grad_norm": 0.2750958204269409,
"learning_rate": 2.5093887040879536e-05,
"loss": 1.2654,
"num_input_tokens_seen": 282466240,
"step": 2980
},
{
"epoch": 0.37771601819100553,
"grad_norm": 0.26100271940231323,
"learning_rate": 2.502931211862095e-05,
"loss": 1.2777,
"num_input_tokens_seen": 283435136,
"step": 2990
},
{
"epoch": 0.37897928246589185,
"grad_norm": 0.29179760813713074,
"learning_rate": 2.4964611141899948e-05,
"loss": 1.258,
"num_input_tokens_seen": 284388960,
"step": 3000
},
{
"epoch": 0.3802425467407782,
"grad_norm": 0.2875267565250397,
"learning_rate": 2.489978519392929e-05,
"loss": 1.277,
"num_input_tokens_seen": 285277344,
"step": 3010
},
{
"epoch": 0.3815058110156645,
"grad_norm": 0.28722459077835083,
"learning_rate": 2.4834835360013953e-05,
"loss": 1.2274,
"num_input_tokens_seen": 286206112,
"step": 3020
},
{
"epoch": 0.38276907529055076,
"grad_norm": 0.2907884418964386,
"learning_rate": 2.476976272753301e-05,
"loss": 1.26,
"num_input_tokens_seen": 287188160,
"step": 3030
},
{
"epoch": 0.3840323395654371,
"grad_norm": 0.2554284334182739,
"learning_rate": 2.4704568385921404e-05,
"loss": 1.2949,
"num_input_tokens_seen": 288111200,
"step": 3040
},
{
"epoch": 0.3852956038403234,
"grad_norm": 0.24661648273468018,
"learning_rate": 2.4639253426651703e-05,
"loss": 1.2442,
"num_input_tokens_seen": 289071840,
"step": 3050
},
{
"epoch": 0.3865588681152097,
"grad_norm": 0.2564159035682678,
"learning_rate": 2.457381894321585e-05,
"loss": 1.2549,
"num_input_tokens_seen": 290037344,
"step": 3060
},
{
"epoch": 0.387822132390096,
"grad_norm": 0.24792881309986115,
"learning_rate": 2.4508266031106835e-05,
"loss": 1.2534,
"num_input_tokens_seen": 290963680,
"step": 3070
},
{
"epoch": 0.3890853966649823,
"grad_norm": 0.29164549708366394,
"learning_rate": 2.4442595787800345e-05,
"loss": 1.2799,
"num_input_tokens_seen": 291992224,
"step": 3080
},
{
"epoch": 0.39034866093986864,
"grad_norm": 0.24966460466384888,
"learning_rate": 2.4376809312736438e-05,
"loss": 1.2712,
"num_input_tokens_seen": 292976480,
"step": 3090
},
{
"epoch": 0.3916119252147549,
"grad_norm": 0.28835946321487427,
"learning_rate": 2.431090770730107e-05,
"loss": 1.3135,
"num_input_tokens_seen": 293943776,
"step": 3100
},
{
"epoch": 0.39287518948964123,
"grad_norm": 0.25582680106163025,
"learning_rate": 2.4244892074807714e-05,
"loss": 1.1963,
"num_input_tokens_seen": 294860864,
"step": 3110
},
{
"epoch": 0.39413845376452755,
"grad_norm": 0.24214211106300354,
"learning_rate": 2.4178763520478864e-05,
"loss": 1.225,
"num_input_tokens_seen": 295732256,
"step": 3120
},
{
"epoch": 0.3954017180394138,
"grad_norm": 0.30721724033355713,
"learning_rate": 2.4112523151427515e-05,
"loss": 1.2633,
"num_input_tokens_seen": 296664736,
"step": 3130
},
{
"epoch": 0.39666498231430014,
"grad_norm": 0.30337947607040405,
"learning_rate": 2.4046172076638657e-05,
"loss": 1.2676,
"num_input_tokens_seen": 297635488,
"step": 3140
},
{
"epoch": 0.39792824658918646,
"grad_norm": 0.28588712215423584,
"learning_rate": 2.3979711406950688e-05,
"loss": 1.2635,
"num_input_tokens_seen": 298546208,
"step": 3150
},
{
"epoch": 0.3991915108640728,
"grad_norm": 0.27065521478652954,
"learning_rate": 2.3913142255036848e-05,
"loss": 1.3024,
"num_input_tokens_seen": 299442720,
"step": 3160
},
{
"epoch": 0.40045477513895905,
"grad_norm": 0.2623492181301117,
"learning_rate": 2.384646573538654e-05,
"loss": 1.2968,
"num_input_tokens_seen": 300421664,
"step": 3170
},
{
"epoch": 0.4017180394138454,
"grad_norm": 0.27391478419303894,
"learning_rate": 2.3779682964286715e-05,
"loss": 1.2181,
"num_input_tokens_seen": 301369824,
"step": 3180
},
{
"epoch": 0.4029813036887317,
"grad_norm": 0.2633381187915802,
"learning_rate": 2.3712795059803166e-05,
"loss": 1.2459,
"num_input_tokens_seen": 302411648,
"step": 3190
},
{
"epoch": 0.40424456796361796,
"grad_norm": 0.2716757655143738,
"learning_rate": 2.36458031417618e-05,
"loss": 1.2883,
"num_input_tokens_seen": 303342464,
"step": 3200
},
{
"epoch": 0.4055078322385043,
"grad_norm": 0.26981112360954285,
"learning_rate": 2.3578708331729927e-05,
"loss": 1.2978,
"num_input_tokens_seen": 304307424,
"step": 3210
},
{
"epoch": 0.4067710965133906,
"grad_norm": 0.24773098528385162,
"learning_rate": 2.3511511752997423e-05,
"loss": 1.3291,
"num_input_tokens_seen": 305311648,
"step": 3220
},
{
"epoch": 0.40803436078827693,
"grad_norm": 0.2609155774116516,
"learning_rate": 2.3444214530557985e-05,
"loss": 1.2416,
"num_input_tokens_seen": 306299200,
"step": 3230
},
{
"epoch": 0.4092976250631632,
"grad_norm": 0.258277028799057,
"learning_rate": 2.3376817791090263e-05,
"loss": 1.2476,
"num_input_tokens_seen": 307199776,
"step": 3240
},
{
"epoch": 0.4105608893380495,
"grad_norm": 0.3055669963359833,
"learning_rate": 2.3309322662938994e-05,
"loss": 1.2846,
"num_input_tokens_seen": 308118080,
"step": 3250
},
{
"epoch": 0.41182415361293584,
"grad_norm": 0.28719931840896606,
"learning_rate": 2.3241730276096136e-05,
"loss": 1.2432,
"num_input_tokens_seen": 309095584,
"step": 3260
},
{
"epoch": 0.4130874178878221,
"grad_norm": 0.2620775103569031,
"learning_rate": 2.3174041762181924e-05,
"loss": 1.3018,
"num_input_tokens_seen": 310052032,
"step": 3270
},
{
"epoch": 0.41435068216270843,
"grad_norm": 0.2525536119937897,
"learning_rate": 2.310625825442595e-05,
"loss": 1.2721,
"num_input_tokens_seen": 311011040,
"step": 3280
},
{
"epoch": 0.41561394643759475,
"grad_norm": 0.24205638468265533,
"learning_rate": 2.3038380887648158e-05,
"loss": 1.283,
"num_input_tokens_seen": 311953920,
"step": 3290
},
{
"epoch": 0.41687721071248107,
"grad_norm": 0.2821497321128845,
"learning_rate": 2.2970410798239875e-05,
"loss": 1.2184,
"num_input_tokens_seen": 312900064,
"step": 3300
},
{
"epoch": 0.41814047498736734,
"grad_norm": 0.26797381043434143,
"learning_rate": 2.290234912414478e-05,
"loss": 1.2682,
"num_input_tokens_seen": 313856160,
"step": 3310
},
{
"epoch": 0.41940373926225366,
"grad_norm": 0.26029297709465027,
"learning_rate": 2.2834197004839832e-05,
"loss": 1.2241,
"num_input_tokens_seen": 314758112,
"step": 3320
},
{
"epoch": 0.42066700353714,
"grad_norm": 0.2785716950893402,
"learning_rate": 2.276595558131622e-05,
"loss": 1.1807,
"num_input_tokens_seen": 315687232,
"step": 3330
},
{
"epoch": 0.42193026781202625,
"grad_norm": 0.282991886138916,
"learning_rate": 2.2697625996060242e-05,
"loss": 1.2337,
"num_input_tokens_seen": 316675552,
"step": 3340
},
{
"epoch": 0.42319353208691257,
"grad_norm": 0.26791542768478394,
"learning_rate": 2.2629209393034202e-05,
"loss": 1.277,
"num_input_tokens_seen": 317594112,
"step": 3350
},
{
"epoch": 0.4244567963617989,
"grad_norm": 0.2645999789237976,
"learning_rate": 2.256070691765721e-05,
"loss": 1.2995,
"num_input_tokens_seen": 318542656,
"step": 3360
},
{
"epoch": 0.4257200606366852,
"grad_norm": 0.2621070146560669,
"learning_rate": 2.249211971678606e-05,
"loss": 1.2712,
"num_input_tokens_seen": 319529632,
"step": 3370
},
{
"epoch": 0.4269833249115715,
"grad_norm": 0.292126327753067,
"learning_rate": 2.2423448938696008e-05,
"loss": 1.281,
"num_input_tokens_seen": 320495008,
"step": 3380
},
{
"epoch": 0.4282465891864578,
"grad_norm": 0.26194462180137634,
"learning_rate": 2.235469573306152e-05,
"loss": 1.2705,
"num_input_tokens_seen": 321386944,
"step": 3390
},
{
"epoch": 0.4295098534613441,
"grad_norm": 0.26072680950164795,
"learning_rate": 2.2285861250937078e-05,
"loss": 1.3382,
"num_input_tokens_seen": 322285280,
"step": 3400
},
{
"epoch": 0.4307731177362304,
"grad_norm": 0.308788001537323,
"learning_rate": 2.2216946644737867e-05,
"loss": 1.3189,
"num_input_tokens_seen": 323297568,
"step": 3410
},
{
"epoch": 0.4320363820111167,
"grad_norm": 0.26922985911369324,
"learning_rate": 2.2147953068220498e-05,
"loss": 1.2132,
"num_input_tokens_seen": 324283360,
"step": 3420
},
{
"epoch": 0.43329964628600304,
"grad_norm": 0.27006080746650696,
"learning_rate": 2.207888167646369e-05,
"loss": 1.2268,
"num_input_tokens_seen": 325189760,
"step": 3430
},
{
"epoch": 0.43456291056088936,
"grad_norm": 0.26316067576408386,
"learning_rate": 2.2009733625848932e-05,
"loss": 1.2945,
"num_input_tokens_seen": 326144000,
"step": 3440
},
{
"epoch": 0.4358261748357756,
"grad_norm": 0.2620113790035248,
"learning_rate": 2.1940510074041124e-05,
"loss": 1.2857,
"num_input_tokens_seen": 327078432,
"step": 3450
},
{
"epoch": 0.43708943911066195,
"grad_norm": 0.3018427789211273,
"learning_rate": 2.1871212179969193e-05,
"loss": 1.2732,
"num_input_tokens_seen": 327975328,
"step": 3460
},
{
"epoch": 0.43835270338554827,
"grad_norm": 0.3014253079891205,
"learning_rate": 2.180184110380668e-05,
"loss": 1.2944,
"num_input_tokens_seen": 328923296,
"step": 3470
},
{
"epoch": 0.43961596766043454,
"grad_norm": 0.26709380745887756,
"learning_rate": 2.173239800695235e-05,
"loss": 1.2801,
"num_input_tokens_seen": 329852576,
"step": 3480
},
{
"epoch": 0.44087923193532086,
"grad_norm": 0.26904571056365967,
"learning_rate": 2.1662884052010715e-05,
"loss": 1.3081,
"num_input_tokens_seen": 330887712,
"step": 3490
},
{
"epoch": 0.4421424962102072,
"grad_norm": 0.2532831132411957,
"learning_rate": 2.1593300402772578e-05,
"loss": 1.2399,
"num_input_tokens_seen": 331852448,
"step": 3500
},
{
"epoch": 0.4434057604850935,
"grad_norm": 0.2727656364440918,
"learning_rate": 2.1523648224195553e-05,
"loss": 1.3334,
"num_input_tokens_seen": 332849824,
"step": 3510
},
{
"epoch": 0.44466902475997977,
"grad_norm": 0.2567518353462219,
"learning_rate": 2.1453928682384567e-05,
"loss": 1.2469,
"num_input_tokens_seen": 333796544,
"step": 3520
},
{
"epoch": 0.4459322890348661,
"grad_norm": 0.27944666147232056,
"learning_rate": 2.1384142944572327e-05,
"loss": 1.2182,
"num_input_tokens_seen": 334769728,
"step": 3530
},
{
"epoch": 0.4471955533097524,
"grad_norm": 0.26202327013015747,
"learning_rate": 2.131429217909978e-05,
"loss": 1.2556,
"num_input_tokens_seen": 335697824,
"step": 3540
},
{
"epoch": 0.4484588175846387,
"grad_norm": 0.2528652250766754,
"learning_rate": 2.1244377555396552e-05,
"loss": 1.2889,
"num_input_tokens_seen": 336718816,
"step": 3550
},
{
"epoch": 0.449722081859525,
"grad_norm": 0.27603092789649963,
"learning_rate": 2.1174400243961384e-05,
"loss": 1.2786,
"num_input_tokens_seen": 337621120,
"step": 3560
},
{
"epoch": 0.4509853461344113,
"grad_norm": 0.2740069627761841,
"learning_rate": 2.1104361416342515e-05,
"loss": 1.2048,
"num_input_tokens_seen": 338654368,
"step": 3570
},
{
"epoch": 0.45224861040929765,
"grad_norm": 0.2614036798477173,
"learning_rate": 2.1034262245118083e-05,
"loss": 1.299,
"num_input_tokens_seen": 339635072,
"step": 3580
},
{
"epoch": 0.4535118746841839,
"grad_norm": 0.2862122058868408,
"learning_rate": 2.0964103903876478e-05,
"loss": 1.2675,
"num_input_tokens_seen": 340587008,
"step": 3590
},
{
"epoch": 0.45477513895907024,
"grad_norm": 0.2503550946712494,
"learning_rate": 2.089388756719672e-05,
"loss": 1.3265,
"num_input_tokens_seen": 341507104,
"step": 3600
},
{
"epoch": 0.45603840323395656,
"grad_norm": 0.2760883867740631,
"learning_rate": 2.0823614410628762e-05,
"loss": 1.2568,
"num_input_tokens_seen": 342452832,
"step": 3610
},
{
"epoch": 0.4573016675088428,
"grad_norm": 0.25591230392456055,
"learning_rate": 2.075328561067385e-05,
"loss": 1.2854,
"num_input_tokens_seen": 343443968,
"step": 3620
},
{
"epoch": 0.45856493178372915,
"grad_norm": 0.247548446059227,
"learning_rate": 2.0682902344764768e-05,
"loss": 1.2427,
"num_input_tokens_seen": 344422112,
"step": 3630
},
{
"epoch": 0.45982819605861547,
"grad_norm": 0.2951701879501343,
"learning_rate": 2.0612465791246192e-05,
"loss": 1.2824,
"num_input_tokens_seen": 345312448,
"step": 3640
},
{
"epoch": 0.4610914603335018,
"grad_norm": 0.2961169481277466,
"learning_rate": 2.0541977129354912e-05,
"loss": 1.266,
"num_input_tokens_seen": 346277152,
"step": 3650
},
{
"epoch": 0.46235472460838806,
"grad_norm": 0.27115508913993835,
"learning_rate": 2.0471437539200107e-05,
"loss": 1.3118,
"num_input_tokens_seen": 347211840,
"step": 3660
},
{
"epoch": 0.4636179888832744,
"grad_norm": 0.27469298243522644,
"learning_rate": 2.0400848201743608e-05,
"loss": 1.1801,
"num_input_tokens_seen": 348124992,
"step": 3670
},
{
"epoch": 0.4648812531581607,
"grad_norm": 0.26864269375801086,
"learning_rate": 2.033021029878008e-05,
"loss": 1.2319,
"num_input_tokens_seen": 349074176,
"step": 3680
},
{
"epoch": 0.46614451743304697,
"grad_norm": 0.2966035008430481,
"learning_rate": 2.0259525012917273e-05,
"loss": 1.3158,
"num_input_tokens_seen": 350022112,
"step": 3690
},
{
"epoch": 0.4674077817079333,
"grad_norm": 0.24909211695194244,
"learning_rate": 2.0188793527556226e-05,
"loss": 1.2902,
"num_input_tokens_seen": 350974272,
"step": 3700
},
{
"epoch": 0.4686710459828196,
"grad_norm": 0.256197065114975,
"learning_rate": 2.011801702687142e-05,
"loss": 1.2275,
"num_input_tokens_seen": 351958848,
"step": 3710
},
{
"epoch": 0.46993431025770593,
"grad_norm": 0.2664201259613037,
"learning_rate": 2.0047196695791006e-05,
"loss": 1.2488,
"num_input_tokens_seen": 352921472,
"step": 3720
},
{
"epoch": 0.4711975745325922,
"grad_norm": 0.2655077278614044,
"learning_rate": 1.997633371997689e-05,
"loss": 1.2214,
"num_input_tokens_seen": 353841344,
"step": 3730
},
{
"epoch": 0.4724608388074785,
"grad_norm": 0.2981346845626831,
"learning_rate": 1.9905429285804987e-05,
"loss": 1.2257,
"num_input_tokens_seen": 354788480,
"step": 3740
},
{
"epoch": 0.47372410308236484,
"grad_norm": 0.3032223880290985,
"learning_rate": 1.9834484580345248e-05,
"loss": 1.2228,
"num_input_tokens_seen": 355683616,
"step": 3750
},
{
"epoch": 0.4749873673572511,
"grad_norm": 0.2835098206996918,
"learning_rate": 1.976350079134187e-05,
"loss": 1.2498,
"num_input_tokens_seen": 356653312,
"step": 3760
},
{
"epoch": 0.47625063163213743,
"grad_norm": 0.2348804771900177,
"learning_rate": 1.9692479107193365e-05,
"loss": 1.2461,
"num_input_tokens_seen": 357609024,
"step": 3770
},
{
"epoch": 0.47751389590702376,
"grad_norm": 0.28105470538139343,
"learning_rate": 1.962142071693269e-05,
"loss": 1.2909,
"num_input_tokens_seen": 358542368,
"step": 3780
},
{
"epoch": 0.4787771601819101,
"grad_norm": 0.27118179202079773,
"learning_rate": 1.9550326810207325e-05,
"loss": 1.2809,
"num_input_tokens_seen": 359444576,
"step": 3790
},
{
"epoch": 0.48004042445679634,
"grad_norm": 0.2707975506782532,
"learning_rate": 1.9479198577259356e-05,
"loss": 1.2116,
"num_input_tokens_seen": 360334912,
"step": 3800
},
{
"epoch": 0.48130368873168267,
"grad_norm": 0.2806662619113922,
"learning_rate": 1.9408037208905558e-05,
"loss": 1.2828,
"num_input_tokens_seen": 361304576,
"step": 3810
},
{
"epoch": 0.482566953006569,
"grad_norm": 0.2591959834098816,
"learning_rate": 1.9336843896517458e-05,
"loss": 1.1958,
"num_input_tokens_seen": 362211520,
"step": 3820
},
{
"epoch": 0.48383021728145525,
"grad_norm": 0.2818770706653595,
"learning_rate": 1.926561983200137e-05,
"loss": 1.3481,
"num_input_tokens_seen": 363114336,
"step": 3830
},
{
"epoch": 0.4850934815563416,
"grad_norm": 0.25823378562927246,
"learning_rate": 1.919436620777847e-05,
"loss": 1.2547,
"num_input_tokens_seen": 364014272,
"step": 3840
},
{
"epoch": 0.4863567458312279,
"grad_norm": 0.254759818315506,
"learning_rate": 1.9123084216764807e-05,
"loss": 1.2323,
"num_input_tokens_seen": 364978528,
"step": 3850
},
{
"epoch": 0.4876200101061142,
"grad_norm": 0.26032665371894836,
"learning_rate": 1.9051775052351343e-05,
"loss": 1.3204,
"num_input_tokens_seen": 365890720,
"step": 3860
},
{
"epoch": 0.4888832743810005,
"grad_norm": 0.26584163308143616,
"learning_rate": 1.8980439908383986e-05,
"loss": 1.2814,
"num_input_tokens_seen": 366818304,
"step": 3870
},
{
"epoch": 0.4901465386558868,
"grad_norm": 0.2640645205974579,
"learning_rate": 1.890907997914357e-05,
"loss": 1.2683,
"num_input_tokens_seen": 367770048,
"step": 3880
},
{
"epoch": 0.49140980293077313,
"grad_norm": 0.27595484256744385,
"learning_rate": 1.8837696459325896e-05,
"loss": 1.3023,
"num_input_tokens_seen": 368716352,
"step": 3890
},
{
"epoch": 0.4926730672056594,
"grad_norm": 0.2723195552825928,
"learning_rate": 1.8766290544021696e-05,
"loss": 1.2429,
"num_input_tokens_seen": 369700736,
"step": 3900
},
{
"epoch": 0.4939363314805457,
"grad_norm": 0.2871018052101135,
"learning_rate": 1.869486342869667e-05,
"loss": 1.3019,
"num_input_tokens_seen": 370702016,
"step": 3910
},
{
"epoch": 0.49519959575543204,
"grad_norm": 0.299991250038147,
"learning_rate": 1.8623416309171423e-05,
"loss": 1.2597,
"num_input_tokens_seen": 371647904,
"step": 3920
},
{
"epoch": 0.49646286003031836,
"grad_norm": 0.29281744360923767,
"learning_rate": 1.8551950381601466e-05,
"loss": 1.2109,
"num_input_tokens_seen": 372649376,
"step": 3930
},
{
"epoch": 0.49772612430520463,
"grad_norm": 0.2941571772098541,
"learning_rate": 1.8480466842457208e-05,
"loss": 1.2597,
"num_input_tokens_seen": 373577504,
"step": 3940
},
{
"epoch": 0.49898938858009095,
"grad_norm": 0.25515016913414,
"learning_rate": 1.8408966888503894e-05,
"loss": 1.2588,
"num_input_tokens_seen": 374508256,
"step": 3950
},
{
"epoch": 0.5002526528549772,
"grad_norm": 0.2905372083187103,
"learning_rate": 1.8337451716781592e-05,
"loss": 1.2734,
"num_input_tokens_seen": 375425088,
"step": 3960
},
{
"epoch": 0.5015159171298635,
"grad_norm": 0.27142760157585144,
"learning_rate": 1.8265922524585137e-05,
"loss": 1.2444,
"num_input_tokens_seen": 376367264,
"step": 3970
},
{
"epoch": 0.5027791814047499,
"grad_norm": 0.26266419887542725,
"learning_rate": 1.8194380509444095e-05,
"loss": 1.2504,
"num_input_tokens_seen": 377307360,
"step": 3980
},
{
"epoch": 0.5040424456796362,
"grad_norm": 0.24885958433151245,
"learning_rate": 1.8122826869102706e-05,
"loss": 1.2403,
"num_input_tokens_seen": 378238624,
"step": 3990
},
{
"epoch": 0.5053057099545225,
"grad_norm": 0.2766496241092682,
"learning_rate": 1.8051262801499845e-05,
"loss": 1.2614,
"num_input_tokens_seen": 379241088,
"step": 4000
},
{
"epoch": 0.5053057099545225,
"eval_loss": 1.2814823389053345,
"eval_runtime": 12.3847,
"eval_samples_per_second": 12.112,
"eval_steps_per_second": 0.807,
"num_input_tokens_seen": 379241088,
"step": 4000
},
{
"epoch": 0.5065689742294088,
"grad_norm": 0.2559678256511688,
"learning_rate": 1.7979689504748963e-05,
"loss": 1.2359,
"num_input_tokens_seen": 380145024,
"step": 4010
},
{
"epoch": 0.5078322385042952,
"grad_norm": 0.276067852973938,
"learning_rate": 1.7908108177118005e-05,
"loss": 1.2247,
"num_input_tokens_seen": 381154496,
"step": 4020
},
{
"epoch": 0.5090955027791814,
"grad_norm": 0.26673588156700134,
"learning_rate": 1.7836520017009383e-05,
"loss": 1.2377,
"num_input_tokens_seen": 382081728,
"step": 4030
},
{
"epoch": 0.5103587670540677,
"grad_norm": 0.2775169014930725,
"learning_rate": 1.7764926222939893e-05,
"loss": 1.2305,
"num_input_tokens_seen": 383040896,
"step": 4040
},
{
"epoch": 0.511622031328954,
"grad_norm": 0.2704101502895355,
"learning_rate": 1.7693327993520654e-05,
"loss": 1.2809,
"num_input_tokens_seen": 383997344,
"step": 4050
},
{
"epoch": 0.5128852956038403,
"grad_norm": 0.2597109079360962,
"learning_rate": 1.7621726527437044e-05,
"loss": 1.2637,
"num_input_tokens_seen": 384951744,
"step": 4060
},
{
"epoch": 0.5141485598787267,
"grad_norm": 0.265578955411911,
"learning_rate": 1.7550123023428622e-05,
"loss": 1.306,
"num_input_tokens_seen": 385818784,
"step": 4070
},
{
"epoch": 0.515411824153613,
"grad_norm": 0.2557640075683594,
"learning_rate": 1.7478518680269075e-05,
"loss": 1.2842,
"num_input_tokens_seen": 386759680,
"step": 4080
},
{
"epoch": 0.5166750884284993,
"grad_norm": 0.25985798239707947,
"learning_rate": 1.740691469674612e-05,
"loss": 1.2464,
"num_input_tokens_seen": 387730016,
"step": 4090
},
{
"epoch": 0.5179383527033855,
"grad_norm": 0.25625666975975037,
"learning_rate": 1.733531227164148e-05,
"loss": 1.2265,
"num_input_tokens_seen": 388693952,
"step": 4100
},
{
"epoch": 0.5192016169782718,
"grad_norm": 0.2758398950099945,
"learning_rate": 1.726371260371076e-05,
"loss": 1.2007,
"num_input_tokens_seen": 389669216,
"step": 4110
},
{
"epoch": 0.5204648812531582,
"grad_norm": 0.27401378750801086,
"learning_rate": 1.7192116891663433e-05,
"loss": 1.2657,
"num_input_tokens_seen": 390647360,
"step": 4120
},
{
"epoch": 0.5217281455280445,
"grad_norm": 0.29113706946372986,
"learning_rate": 1.712052633414272e-05,
"loss": 1.2834,
"num_input_tokens_seen": 391549504,
"step": 4130
},
{
"epoch": 0.5229914098029308,
"grad_norm": 0.2795151472091675,
"learning_rate": 1.7048942129705552e-05,
"loss": 1.2343,
"num_input_tokens_seen": 392518208,
"step": 4140
},
{
"epoch": 0.5242546740778171,
"grad_norm": 0.3003349006175995,
"learning_rate": 1.6977365476802505e-05,
"loss": 1.28,
"num_input_tokens_seen": 393502048,
"step": 4150
},
{
"epoch": 0.5255179383527034,
"grad_norm": 0.28123393654823303,
"learning_rate": 1.690579757375772e-05,
"loss": 1.2696,
"num_input_tokens_seen": 394482816,
"step": 4160
},
{
"epoch": 0.5267812026275897,
"grad_norm": 0.25133296847343445,
"learning_rate": 1.6834239618748856e-05,
"loss": 1.2744,
"num_input_tokens_seen": 395421792,
"step": 4170
},
{
"epoch": 0.528044466902476,
"grad_norm": 0.2568908631801605,
"learning_rate": 1.6762692809787007e-05,
"loss": 1.2162,
"num_input_tokens_seen": 396370464,
"step": 4180
},
{
"epoch": 0.5293077311773623,
"grad_norm": 0.24872644245624542,
"learning_rate": 1.66911583446967e-05,
"loss": 1.2291,
"num_input_tokens_seen": 397275616,
"step": 4190
},
{
"epoch": 0.5305709954522486,
"grad_norm": 0.2645767033100128,
"learning_rate": 1.6619637421095762e-05,
"loss": 1.2803,
"num_input_tokens_seen": 398260032,
"step": 4200
},
{
"epoch": 0.5318342597271349,
"grad_norm": 0.2733348608016968,
"learning_rate": 1.654813123637533e-05,
"loss": 1.2447,
"num_input_tokens_seen": 399281952,
"step": 4210
},
{
"epoch": 0.5330975240020213,
"grad_norm": 0.27618396282196045,
"learning_rate": 1.6476640987679787e-05,
"loss": 1.2296,
"num_input_tokens_seen": 400197792,
"step": 4220
},
{
"epoch": 0.5343607882769076,
"grad_norm": 0.2598818242549896,
"learning_rate": 1.64051678718867e-05,
"loss": 1.258,
"num_input_tokens_seen": 401102336,
"step": 4230
},
{
"epoch": 0.5356240525517938,
"grad_norm": 0.254782497882843,
"learning_rate": 1.6333713085586823e-05,
"loss": 1.2465,
"num_input_tokens_seen": 402011040,
"step": 4240
},
{
"epoch": 0.5368873168266801,
"grad_norm": 0.26978209614753723,
"learning_rate": 1.6262277825064032e-05,
"loss": 1.279,
"num_input_tokens_seen": 402950816,
"step": 4250
},
{
"epoch": 0.5381505811015664,
"grad_norm": 0.2889060378074646,
"learning_rate": 1.6190863286275296e-05,
"loss": 1.3152,
"num_input_tokens_seen": 403935136,
"step": 4260
},
{
"epoch": 0.5394138453764528,
"grad_norm": 0.3075631856918335,
"learning_rate": 1.611947066483068e-05,
"loss": 1.2845,
"num_input_tokens_seen": 404952864,
"step": 4270
},
{
"epoch": 0.5406771096513391,
"grad_norm": 0.27360478043556213,
"learning_rate": 1.6048101155973297e-05,
"loss": 1.2516,
"num_input_tokens_seen": 405957920,
"step": 4280
},
{
"epoch": 0.5419403739262254,
"grad_norm": 0.24361246824264526,
"learning_rate": 1.597675595455933e-05,
"loss": 1.2319,
"num_input_tokens_seen": 406898048,
"step": 4290
},
{
"epoch": 0.5432036382011117,
"grad_norm": 0.25894516706466675,
"learning_rate": 1.5905436255038e-05,
"loss": 1.3278,
"num_input_tokens_seen": 407848352,
"step": 4300
},
{
"epoch": 0.5444669024759979,
"grad_norm": 0.2489163875579834,
"learning_rate": 1.583414325143158e-05,
"loss": 1.2478,
"num_input_tokens_seen": 408813152,
"step": 4310
},
{
"epoch": 0.5457301667508843,
"grad_norm": 0.2795446217060089,
"learning_rate": 1.5762878137315406e-05,
"loss": 1.1847,
"num_input_tokens_seen": 409756608,
"step": 4320
},
{
"epoch": 0.5469934310257706,
"grad_norm": 0.2824794352054596,
"learning_rate": 1.5691642105797883e-05,
"loss": 1.2562,
"num_input_tokens_seen": 410623968,
"step": 4330
},
{
"epoch": 0.5482566953006569,
"grad_norm": 0.2690293788909912,
"learning_rate": 1.5620436349500548e-05,
"loss": 1.2486,
"num_input_tokens_seen": 411572768,
"step": 4340
},
{
"epoch": 0.5495199595755432,
"grad_norm": 0.3064996302127838,
"learning_rate": 1.5549262060538054e-05,
"loss": 1.2568,
"num_input_tokens_seen": 412493568,
"step": 4350
},
{
"epoch": 0.5507832238504295,
"grad_norm": 0.2691975235939026,
"learning_rate": 1.547812043049823e-05,
"loss": 1.275,
"num_input_tokens_seen": 413427264,
"step": 4360
},
{
"epoch": 0.5520464881253159,
"grad_norm": 0.27678680419921875,
"learning_rate": 1.5407012650422146e-05,
"loss": 1.2137,
"num_input_tokens_seen": 414404288,
"step": 4370
},
{
"epoch": 0.5533097524002021,
"grad_norm": 0.2862233519554138,
"learning_rate": 1.533593991078415e-05,
"loss": 1.2782,
"num_input_tokens_seen": 415391456,
"step": 4380
},
{
"epoch": 0.5545730166750884,
"grad_norm": 0.2569049298763275,
"learning_rate": 1.5264903401471965e-05,
"loss": 1.2294,
"num_input_tokens_seen": 416316512,
"step": 4390
},
{
"epoch": 0.5558362809499747,
"grad_norm": 0.291337788105011,
"learning_rate": 1.519390431176674e-05,
"loss": 1.1881,
"num_input_tokens_seen": 417250912,
"step": 4400
},
{
"epoch": 0.557099545224861,
"grad_norm": 0.28458911180496216,
"learning_rate": 1.5122943830323157e-05,
"loss": 1.2479,
"num_input_tokens_seen": 418203936,
"step": 4410
},
{
"epoch": 0.5583628094997474,
"grad_norm": 0.2543714642524719,
"learning_rate": 1.505202314514952e-05,
"loss": 1.2394,
"num_input_tokens_seen": 419118304,
"step": 4420
},
{
"epoch": 0.5596260737746337,
"grad_norm": 0.2531825304031372,
"learning_rate": 1.4981143443587867e-05,
"loss": 1.259,
"num_input_tokens_seen": 420057056,
"step": 4430
},
{
"epoch": 0.56088933804952,
"grad_norm": 0.2655525207519531,
"learning_rate": 1.4910305912294114e-05,
"loss": 1.2547,
"num_input_tokens_seen": 421040064,
"step": 4440
},
{
"epoch": 0.5621526023244062,
"grad_norm": 0.2566235363483429,
"learning_rate": 1.4839511737218156e-05,
"loss": 1.2314,
"num_input_tokens_seen": 421967616,
"step": 4450
},
{
"epoch": 0.5634158665992925,
"grad_norm": 0.2777341306209564,
"learning_rate": 1.476876210358402e-05,
"loss": 1.2543,
"num_input_tokens_seen": 422913952,
"step": 4460
},
{
"epoch": 0.5646791308741789,
"grad_norm": 0.26129183173179626,
"learning_rate": 1.4698058195870038e-05,
"loss": 1.247,
"num_input_tokens_seen": 423912288,
"step": 4470
},
{
"epoch": 0.5659423951490652,
"grad_norm": 0.2949627637863159,
"learning_rate": 1.462740119778899e-05,
"loss": 1.2653,
"num_input_tokens_seen": 424904672,
"step": 4480
},
{
"epoch": 0.5672056594239515,
"grad_norm": 0.2683241367340088,
"learning_rate": 1.4556792292268341e-05,
"loss": 1.2303,
"num_input_tokens_seen": 425895936,
"step": 4490
},
{
"epoch": 0.5684689236988378,
"grad_norm": 0.26744595170021057,
"learning_rate": 1.4486232661430359e-05,
"loss": 1.193,
"num_input_tokens_seen": 426778336,
"step": 4500
},
{
"epoch": 0.5697321879737242,
"grad_norm": 0.28104472160339355,
"learning_rate": 1.4415723486572379e-05,
"loss": 1.2065,
"num_input_tokens_seen": 427702848,
"step": 4510
},
{
"epoch": 0.5709954522486104,
"grad_norm": 0.2564327120780945,
"learning_rate": 1.434526594814701e-05,
"loss": 1.2315,
"num_input_tokens_seen": 428663616,
"step": 4520
},
{
"epoch": 0.5722587165234967,
"grad_norm": 0.246286079287529,
"learning_rate": 1.4274861225742369e-05,
"loss": 1.2768,
"num_input_tokens_seen": 429622080,
"step": 4530
},
{
"epoch": 0.573521980798383,
"grad_norm": 0.2924240529537201,
"learning_rate": 1.4204510498062347e-05,
"loss": 1.2405,
"num_input_tokens_seen": 430489344,
"step": 4540
},
{
"epoch": 0.5747852450732693,
"grad_norm": 0.26321151852607727,
"learning_rate": 1.4134214942906854e-05,
"loss": 1.2082,
"num_input_tokens_seen": 431465248,
"step": 4550
},
{
"epoch": 0.5760485093481557,
"grad_norm": 0.2737989127635956,
"learning_rate": 1.4063975737152111e-05,
"loss": 1.2378,
"num_input_tokens_seen": 432344320,
"step": 4560
},
{
"epoch": 0.577311773623042,
"grad_norm": 0.23963995277881622,
"learning_rate": 1.3993794056730945e-05,
"loss": 1.2195,
"num_input_tokens_seen": 433296800,
"step": 4570
},
{
"epoch": 0.5785750378979283,
"grad_norm": 0.25392717123031616,
"learning_rate": 1.3923671076613121e-05,
"loss": 1.2768,
"num_input_tokens_seen": 434228672,
"step": 4580
},
{
"epoch": 0.5798383021728145,
"grad_norm": 0.2499849945306778,
"learning_rate": 1.3853607970785636e-05,
"loss": 1.2608,
"num_input_tokens_seen": 435125376,
"step": 4590
},
{
"epoch": 0.5811015664477008,
"grad_norm": 0.2485542893409729,
"learning_rate": 1.3783605912233086e-05,
"loss": 1.3271,
"num_input_tokens_seen": 436060128,
"step": 4600
},
{
"epoch": 0.5823648307225872,
"grad_norm": 0.26257503032684326,
"learning_rate": 1.3713666072918025e-05,
"loss": 1.2772,
"num_input_tokens_seen": 437054208,
"step": 4610
},
{
"epoch": 0.5836280949974735,
"grad_norm": 0.27504444122314453,
"learning_rate": 1.3643789623761335e-05,
"loss": 1.2807,
"num_input_tokens_seen": 437972832,
"step": 4620
},
{
"epoch": 0.5848913592723598,
"grad_norm": 0.2476516216993332,
"learning_rate": 1.3573977734622654e-05,
"loss": 1.2403,
"num_input_tokens_seen": 438912832,
"step": 4630
},
{
"epoch": 0.5861546235472461,
"grad_norm": 0.26506373286247253,
"learning_rate": 1.3504231574280742e-05,
"loss": 1.2203,
"num_input_tokens_seen": 439899168,
"step": 4640
},
{
"epoch": 0.5874178878221324,
"grad_norm": 0.29639938473701477,
"learning_rate": 1.3434552310413948e-05,
"loss": 1.314,
"num_input_tokens_seen": 440917152,
"step": 4650
},
{
"epoch": 0.5886811520970187,
"grad_norm": 0.26634323596954346,
"learning_rate": 1.336494110958066e-05,
"loss": 1.2586,
"num_input_tokens_seen": 441860704,
"step": 4660
},
{
"epoch": 0.589944416371905,
"grad_norm": 0.26301464438438416,
"learning_rate": 1.3295399137199744e-05,
"loss": 1.2541,
"num_input_tokens_seen": 442838240,
"step": 4670
},
{
"epoch": 0.5912076806467913,
"grad_norm": 0.26125144958496094,
"learning_rate": 1.3225927557531086e-05,
"loss": 1.2743,
"num_input_tokens_seen": 443835552,
"step": 4680
},
{
"epoch": 0.5924709449216776,
"grad_norm": 0.2652340829372406,
"learning_rate": 1.3156527533656041e-05,
"loss": 1.2308,
"num_input_tokens_seen": 444788896,
"step": 4690
},
{
"epoch": 0.593734209196564,
"grad_norm": 0.2752208411693573,
"learning_rate": 1.3087200227458005e-05,
"loss": 1.2548,
"num_input_tokens_seen": 445779392,
"step": 4700
},
{
"epoch": 0.5949974734714503,
"grad_norm": 0.28993070125579834,
"learning_rate": 1.3017946799602943e-05,
"loss": 1.2103,
"num_input_tokens_seen": 446716864,
"step": 4710
},
{
"epoch": 0.5962607377463366,
"grad_norm": 0.248098686337471,
"learning_rate": 1.294876840951995e-05,
"loss": 1.2628,
"num_input_tokens_seen": 447604192,
"step": 4720
},
{
"epoch": 0.5975240020212228,
"grad_norm": 0.26949024200439453,
"learning_rate": 1.2879666215381881e-05,
"loss": 1.219,
"num_input_tokens_seen": 448549600,
"step": 4730
},
{
"epoch": 0.5987872662961091,
"grad_norm": 0.2639176547527313,
"learning_rate": 1.2810641374085904e-05,
"loss": 1.194,
"num_input_tokens_seen": 449481280,
"step": 4740
},
{
"epoch": 0.6000505305709954,
"grad_norm": 0.2593153417110443,
"learning_rate": 1.2741695041234165e-05,
"loss": 1.2001,
"num_input_tokens_seen": 450464096,
"step": 4750
},
{
"epoch": 0.6013137948458818,
"grad_norm": 0.2578306794166565,
"learning_rate": 1.2672828371114441e-05,
"loss": 1.1945,
"num_input_tokens_seen": 451387360,
"step": 4760
},
{
"epoch": 0.6025770591207681,
"grad_norm": 0.2578235864639282,
"learning_rate": 1.2604042516680797e-05,
"loss": 1.2215,
"num_input_tokens_seen": 452345664,
"step": 4770
},
{
"epoch": 0.6038403233956544,
"grad_norm": 0.2732868790626526,
"learning_rate": 1.2535338629534321e-05,
"loss": 1.2748,
"num_input_tokens_seen": 453247008,
"step": 4780
},
{
"epoch": 0.6051035876705407,
"grad_norm": 0.24936838448047638,
"learning_rate": 1.2466717859903794e-05,
"loss": 1.2132,
"num_input_tokens_seen": 454143616,
"step": 4790
},
{
"epoch": 0.6063668519454269,
"grad_norm": 0.2849110960960388,
"learning_rate": 1.2398181356626464e-05,
"loss": 1.2112,
"num_input_tokens_seen": 455058880,
"step": 4800
},
{
"epoch": 0.6076301162203133,
"grad_norm": 0.2991189956665039,
"learning_rate": 1.2329730267128808e-05,
"loss": 1.2349,
"num_input_tokens_seen": 456022464,
"step": 4810
},
{
"epoch": 0.6088933804951996,
"grad_norm": 0.262685626745224,
"learning_rate": 1.2261365737407316e-05,
"loss": 1.2596,
"num_input_tokens_seen": 457002592,
"step": 4820
},
{
"epoch": 0.6101566447700859,
"grad_norm": 0.25802651047706604,
"learning_rate": 1.2193088912009321e-05,
"loss": 1.1975,
"num_input_tokens_seen": 457977152,
"step": 4830
},
{
"epoch": 0.6114199090449722,
"grad_norm": 0.25570937991142273,
"learning_rate": 1.2124900934013812e-05,
"loss": 1.2774,
"num_input_tokens_seen": 458946368,
"step": 4840
},
{
"epoch": 0.6126831733198586,
"grad_norm": 0.2608765959739685,
"learning_rate": 1.2056802945012316e-05,
"loss": 1.2298,
"num_input_tokens_seen": 459789536,
"step": 4850
},
{
"epoch": 0.6139464375947449,
"grad_norm": 0.27471068501472473,
"learning_rate": 1.1988796085089777e-05,
"loss": 1.2663,
"num_input_tokens_seen": 460781856,
"step": 4860
},
{
"epoch": 0.6152097018696311,
"grad_norm": 0.30232349038124084,
"learning_rate": 1.1920881492805467e-05,
"loss": 1.2709,
"num_input_tokens_seen": 461735360,
"step": 4870
},
{
"epoch": 0.6164729661445174,
"grad_norm": 0.2713924050331116,
"learning_rate": 1.1853060305173947e-05,
"loss": 1.2925,
"num_input_tokens_seen": 462762272,
"step": 4880
},
{
"epoch": 0.6177362304194037,
"grad_norm": 0.2612393796443939,
"learning_rate": 1.1785333657645997e-05,
"loss": 1.2671,
"num_input_tokens_seen": 463701440,
"step": 4890
},
{
"epoch": 0.61899949469429,
"grad_norm": 0.2994194030761719,
"learning_rate": 1.1717702684089622e-05,
"loss": 1.2685,
"num_input_tokens_seen": 464628288,
"step": 4900
},
{
"epoch": 0.6202627589691764,
"grad_norm": 0.27403557300567627,
"learning_rate": 1.1650168516771077e-05,
"loss": 1.2313,
"num_input_tokens_seen": 465563264,
"step": 4910
},
{
"epoch": 0.6215260232440627,
"grad_norm": 0.2665519118309021,
"learning_rate": 1.1582732286335892e-05,
"loss": 1.2608,
"num_input_tokens_seen": 466527296,
"step": 4920
},
{
"epoch": 0.622789287518949,
"grad_norm": 0.2931445837020874,
"learning_rate": 1.151539512178998e-05,
"loss": 1.1978,
"num_input_tokens_seen": 467422144,
"step": 4930
},
{
"epoch": 0.6240525517938352,
"grad_norm": 0.243869349360466,
"learning_rate": 1.1448158150480684e-05,
"loss": 1.2584,
"num_input_tokens_seen": 468346080,
"step": 4940
},
{
"epoch": 0.6253158160687216,
"grad_norm": 0.24073927104473114,
"learning_rate": 1.1381022498077936e-05,
"loss": 1.2786,
"num_input_tokens_seen": 469268160,
"step": 4950
},
{
"epoch": 0.6265790803436079,
"grad_norm": 0.2580939531326294,
"learning_rate": 1.1313989288555403e-05,
"loss": 1.3028,
"num_input_tokens_seen": 470217248,
"step": 4960
},
{
"epoch": 0.6278423446184942,
"grad_norm": 0.27437812089920044,
"learning_rate": 1.1247059644171683e-05,
"loss": 1.1893,
"num_input_tokens_seen": 471134528,
"step": 4970
},
{
"epoch": 0.6291056088933805,
"grad_norm": 0.27005961537361145,
"learning_rate": 1.1180234685451485e-05,
"loss": 1.2873,
"num_input_tokens_seen": 472091616,
"step": 4980
},
{
"epoch": 0.6303688731682668,
"grad_norm": 0.2728407680988312,
"learning_rate": 1.1113515531166905e-05,
"loss": 1.2812,
"num_input_tokens_seen": 473036928,
"step": 4990
},
{
"epoch": 0.631632137443153,
"grad_norm": 0.2591012716293335,
"learning_rate": 1.1046903298318667e-05,
"loss": 1.2289,
"num_input_tokens_seen": 474006976,
"step": 5000
},
{
"epoch": 0.6328954017180394,
"grad_norm": 0.23528583347797394,
"learning_rate": 1.0980399102117435e-05,
"loss": 1.2315,
"num_input_tokens_seen": 474996096,
"step": 5010
},
{
"epoch": 0.6341586659929257,
"grad_norm": 0.27465859055519104,
"learning_rate": 1.0914004055965161e-05,
"loss": 1.3264,
"num_input_tokens_seen": 475933248,
"step": 5020
},
{
"epoch": 0.635421930267812,
"grad_norm": 0.27259302139282227,
"learning_rate": 1.08477192714364e-05,
"loss": 1.2479,
"num_input_tokens_seen": 476921888,
"step": 5030
},
{
"epoch": 0.6366851945426983,
"grad_norm": 0.2752089202404022,
"learning_rate": 1.078154585825974e-05,
"loss": 1.1889,
"num_input_tokens_seen": 477911648,
"step": 5040
},
{
"epoch": 0.6379484588175847,
"grad_norm": 0.2641167938709259,
"learning_rate": 1.0715484924299207e-05,
"loss": 1.1821,
"num_input_tokens_seen": 478897216,
"step": 5050
},
{
"epoch": 0.639211723092471,
"grad_norm": 0.24626615643501282,
"learning_rate": 1.0649537575535706e-05,
"loss": 1.3228,
"num_input_tokens_seen": 479897216,
"step": 5060
},
{
"epoch": 0.6404749873673572,
"grad_norm": 0.25866448879241943,
"learning_rate": 1.0583704916048546e-05,
"loss": 1.2286,
"num_input_tokens_seen": 480879104,
"step": 5070
},
{
"epoch": 0.6417382516422435,
"grad_norm": 0.2469986230134964,
"learning_rate": 1.05179880479969e-05,
"loss": 1.2382,
"num_input_tokens_seen": 481884800,
"step": 5080
},
{
"epoch": 0.6430015159171298,
"grad_norm": 0.26307523250579834,
"learning_rate": 1.0452388071601396e-05,
"loss": 1.2541,
"num_input_tokens_seen": 482806624,
"step": 5090
},
{
"epoch": 0.6442647801920162,
"grad_norm": 0.2624097168445587,
"learning_rate": 1.0386906085125676e-05,
"loss": 1.2405,
"num_input_tokens_seen": 483727232,
"step": 5100
},
{
"epoch": 0.6455280444669025,
"grad_norm": 0.25804755091667175,
"learning_rate": 1.0321543184858012e-05,
"loss": 1.2258,
"num_input_tokens_seen": 484757024,
"step": 5110
},
{
"epoch": 0.6467913087417888,
"grad_norm": 0.26082345843315125,
"learning_rate": 1.0256300465092968e-05,
"loss": 1.2453,
"num_input_tokens_seen": 485694944,
"step": 5120
},
{
"epoch": 0.6480545730166751,
"grad_norm": 0.26765161752700806,
"learning_rate": 1.0191179018113052e-05,
"loss": 1.2447,
"num_input_tokens_seen": 486613664,
"step": 5130
},
{
"epoch": 0.6493178372915613,
"grad_norm": 0.2676701545715332,
"learning_rate": 1.0126179934170446e-05,
"loss": 1.3095,
"num_input_tokens_seen": 487574816,
"step": 5140
},
{
"epoch": 0.6505811015664477,
"grad_norm": 0.2636936604976654,
"learning_rate": 1.0061304301468766e-05,
"loss": 1.2053,
"num_input_tokens_seen": 488516544,
"step": 5150
},
{
"epoch": 0.651844365841334,
"grad_norm": 0.2662390172481537,
"learning_rate": 9.996553206144797e-06,
"loss": 1.2751,
"num_input_tokens_seen": 489412608,
"step": 5160
},
{
"epoch": 0.6531076301162203,
"grad_norm": 0.26386016607284546,
"learning_rate": 9.931927732250374e-06,
"loss": 1.2631,
"num_input_tokens_seen": 490374624,
"step": 5170
},
{
"epoch": 0.6543708943911066,
"grad_norm": 0.27195560932159424,
"learning_rate": 9.867428961734188e-06,
"loss": 1.2587,
"num_input_tokens_seen": 491366592,
"step": 5180
},
{
"epoch": 0.655634158665993,
"grad_norm": 0.2867816686630249,
"learning_rate": 9.803057974423667e-06,
"loss": 1.2609,
"num_input_tokens_seen": 492314912,
"step": 5190
},
{
"epoch": 0.6568974229408793,
"grad_norm": 0.28000280261039734,
"learning_rate": 9.738815848006945e-06,
"loss": 1.2562,
"num_input_tokens_seen": 493215136,
"step": 5200
},
{
"epoch": 0.6581606872157655,
"grad_norm": 0.27017146348953247,
"learning_rate": 9.674703658014749e-06,
"loss": 1.2261,
"num_input_tokens_seen": 494146080,
"step": 5210
},
{
"epoch": 0.6594239514906518,
"grad_norm": 0.2675604522228241,
"learning_rate": 9.610722477802483e-06,
"loss": 1.292,
"num_input_tokens_seen": 495103840,
"step": 5220
},
{
"epoch": 0.6606872157655381,
"grad_norm": 0.2377164214849472,
"learning_rate": 9.546873378532158e-06,
"loss": 1.2278,
"num_input_tokens_seen": 496014752,
"step": 5230
},
{
"epoch": 0.6619504800404244,
"grad_norm": 0.2551622688770294,
"learning_rate": 9.483157429154547e-06,
"loss": 1.247,
"num_input_tokens_seen": 496955936,
"step": 5240
},
{
"epoch": 0.6632137443153108,
"grad_norm": 0.2615555226802826,
"learning_rate": 9.419575696391218e-06,
"loss": 1.2705,
"num_input_tokens_seen": 497881920,
"step": 5250
},
{
"epoch": 0.6644770085901971,
"grad_norm": 0.2722395956516266,
"learning_rate": 9.356129244716729e-06,
"loss": 1.2736,
"num_input_tokens_seen": 498859040,
"step": 5260
},
{
"epoch": 0.6657402728650834,
"grad_norm": 0.2843475639820099,
"learning_rate": 9.29281913634078e-06,
"loss": 1.2112,
"num_input_tokens_seen": 499848032,
"step": 5270
},
{
"epoch": 0.6670035371399696,
"grad_norm": 0.260781466960907,
"learning_rate": 9.22964643119044e-06,
"loss": 1.2301,
"num_input_tokens_seen": 500782656,
"step": 5280
},
{
"epoch": 0.668266801414856,
"grad_norm": 0.28937065601348877,
"learning_rate": 9.166612186892376e-06,
"loss": 1.2573,
"num_input_tokens_seen": 501775328,
"step": 5290
},
{
"epoch": 0.6695300656897423,
"grad_norm": 0.24364541471004486,
"learning_rate": 9.103717458755188e-06,
"loss": 1.2888,
"num_input_tokens_seen": 502721632,
"step": 5300
},
{
"epoch": 0.6707933299646286,
"grad_norm": 0.32249847054481506,
"learning_rate": 9.040963299751722e-06,
"loss": 1.2103,
"num_input_tokens_seen": 503649088,
"step": 5310
},
{
"epoch": 0.6720565942395149,
"grad_norm": 0.274586945772171,
"learning_rate": 8.978350760501413e-06,
"loss": 1.2604,
"num_input_tokens_seen": 504589696,
"step": 5320
},
{
"epoch": 0.6733198585144012,
"grad_norm": 0.25306662917137146,
"learning_rate": 8.915880889252758e-06,
"loss": 1.212,
"num_input_tokens_seen": 505495648,
"step": 5330
},
{
"epoch": 0.6745831227892876,
"grad_norm": 0.2675648629665375,
"learning_rate": 8.853554731865696e-06,
"loss": 1.2735,
"num_input_tokens_seen": 506399776,
"step": 5340
},
{
"epoch": 0.6758463870641738,
"grad_norm": 0.25868740677833557,
"learning_rate": 8.791373331794155e-06,
"loss": 1.2346,
"num_input_tokens_seen": 507369920,
"step": 5350
},
{
"epoch": 0.6771096513390601,
"grad_norm": 0.26915502548217773,
"learning_rate": 8.729337730068559e-06,
"loss": 1.2514,
"num_input_tokens_seen": 508312480,
"step": 5360
},
{
"epoch": 0.6783729156139464,
"grad_norm": 0.27946212887763977,
"learning_rate": 8.667448965278404e-06,
"loss": 1.2084,
"num_input_tokens_seen": 509257024,
"step": 5370
},
{
"epoch": 0.6796361798888327,
"grad_norm": 0.2765122950077057,
"learning_rate": 8.60570807355484e-06,
"loss": 1.2396,
"num_input_tokens_seen": 510240480,
"step": 5380
},
{
"epoch": 0.6808994441637191,
"grad_norm": 0.24776999652385712,
"learning_rate": 8.54411608855339e-06,
"loss": 1.1789,
"num_input_tokens_seen": 511188832,
"step": 5390
},
{
"epoch": 0.6821627084386054,
"grad_norm": 0.2991964519023895,
"learning_rate": 8.482674041436567e-06,
"loss": 1.2665,
"num_input_tokens_seen": 512158368,
"step": 5400
},
{
"epoch": 0.6834259727134917,
"grad_norm": 0.28031983971595764,
"learning_rate": 8.421382960856695e-06,
"loss": 1.2297,
"num_input_tokens_seen": 513132704,
"step": 5410
},
{
"epoch": 0.6846892369883779,
"grad_norm": 0.2627319395542145,
"learning_rate": 8.360243872938599e-06,
"loss": 1.2734,
"num_input_tokens_seen": 514124160,
"step": 5420
},
{
"epoch": 0.6859525012632642,
"grad_norm": 0.2459687888622284,
"learning_rate": 8.299257801262496e-06,
"loss": 1.2091,
"num_input_tokens_seen": 515011840,
"step": 5430
},
{
"epoch": 0.6872157655381506,
"grad_norm": 0.26756593585014343,
"learning_rate": 8.238425766846812e-06,
"loss": 1.2104,
"num_input_tokens_seen": 515957856,
"step": 5440
},
{
"epoch": 0.6884790298130369,
"grad_norm": 0.293277382850647,
"learning_rate": 8.177748788131119e-06,
"loss": 1.2523,
"num_input_tokens_seen": 516907040,
"step": 5450
},
{
"epoch": 0.6897422940879232,
"grad_norm": 0.2430182844400406,
"learning_rate": 8.117227880959081e-06,
"loss": 1.2209,
"num_input_tokens_seen": 517874624,
"step": 5460
},
{
"epoch": 0.6910055583628095,
"grad_norm": 0.26824715733528137,
"learning_rate": 8.056864058561416e-06,
"loss": 1.2237,
"num_input_tokens_seen": 518780064,
"step": 5470
},
{
"epoch": 0.6922688226376958,
"grad_norm": 0.2571701407432556,
"learning_rate": 7.996658331538978e-06,
"loss": 1.2251,
"num_input_tokens_seen": 519746560,
"step": 5480
},
{
"epoch": 0.6935320869125821,
"grad_norm": 0.25399723649024963,
"learning_rate": 7.936611707845793e-06,
"loss": 1.2448,
"num_input_tokens_seen": 520710432,
"step": 5490
},
{
"epoch": 0.6947953511874684,
"grad_norm": 0.24103257060050964,
"learning_rate": 7.876725192772224e-06,
"loss": 1.1599,
"num_input_tokens_seen": 521672128,
"step": 5500
},
{
"epoch": 0.6960586154623547,
"grad_norm": 0.2598767876625061,
"learning_rate": 7.816999788928119e-06,
"loss": 1.2595,
"num_input_tokens_seen": 522644576,
"step": 5510
},
{
"epoch": 0.697321879737241,
"grad_norm": 0.28568968176841736,
"learning_rate": 7.757436496226034e-06,
"loss": 1.2672,
"num_input_tokens_seen": 523695168,
"step": 5520
},
{
"epoch": 0.6985851440121273,
"grad_norm": 0.264839768409729,
"learning_rate": 7.698036311864467e-06,
"loss": 1.2521,
"num_input_tokens_seen": 524620992,
"step": 5530
},
{
"epoch": 0.6998484082870137,
"grad_norm": 0.27619093656539917,
"learning_rate": 7.638800230311206e-06,
"loss": 1.1977,
"num_input_tokens_seen": 525573280,
"step": 5540
},
{
"epoch": 0.7011116725619,
"grad_norm": 0.2585349380970001,
"learning_rate": 7.579729243286638e-06,
"loss": 1.2956,
"num_input_tokens_seen": 526491552,
"step": 5550
},
{
"epoch": 0.7023749368367862,
"grad_norm": 0.26802536845207214,
"learning_rate": 7.5208243397471995e-06,
"loss": 1.2719,
"num_input_tokens_seen": 527423648,
"step": 5560
},
{
"epoch": 0.7036382011116725,
"grad_norm": 0.2632644474506378,
"learning_rate": 7.462086505868744e-06,
"loss": 1.208,
"num_input_tokens_seen": 528368960,
"step": 5570
},
{
"epoch": 0.7049014653865588,
"grad_norm": 0.25977852940559387,
"learning_rate": 7.4035167250301035e-06,
"loss": 1.1928,
"num_input_tokens_seen": 529333984,
"step": 5580
},
{
"epoch": 0.7061647296614452,
"grad_norm": 0.2557479739189148,
"learning_rate": 7.345115977796573e-06,
"loss": 1.1766,
"num_input_tokens_seen": 530305760,
"step": 5590
},
{
"epoch": 0.7074279939363315,
"grad_norm": 0.2768225073814392,
"learning_rate": 7.286885241903531e-06,
"loss": 1.2209,
"num_input_tokens_seen": 531239232,
"step": 5600
},
{
"epoch": 0.7086912582112178,
"grad_norm": 0.27175867557525635,
"learning_rate": 7.2288254922400575e-06,
"loss": 1.2839,
"num_input_tokens_seen": 532124640,
"step": 5610
},
{
"epoch": 0.7099545224861041,
"grad_norm": 0.28098565340042114,
"learning_rate": 7.1709377008325895e-06,
"loss": 1.2523,
"num_input_tokens_seen": 533148320,
"step": 5620
},
{
"epoch": 0.7112177867609903,
"grad_norm": 0.2613276541233063,
"learning_rate": 7.113222836828695e-06,
"loss": 1.1796,
"num_input_tokens_seen": 534125856,
"step": 5630
},
{
"epoch": 0.7124810510358767,
"grad_norm": 0.24941375851631165,
"learning_rate": 7.055681866480792e-06,
"loss": 1.2102,
"num_input_tokens_seen": 535057408,
"step": 5640
},
{
"epoch": 0.713744315310763,
"grad_norm": 0.28444018959999084,
"learning_rate": 6.998315753130024e-06,
"loss": 1.1713,
"num_input_tokens_seen": 536041280,
"step": 5650
},
{
"epoch": 0.7150075795856493,
"grad_norm": 0.2781004309654236,
"learning_rate": 6.9411254571901e-06,
"loss": 1.2121,
"num_input_tokens_seen": 536970048,
"step": 5660
},
{
"epoch": 0.7162708438605356,
"grad_norm": 0.2684124708175659,
"learning_rate": 6.884111936131231e-06,
"loss": 1.2733,
"num_input_tokens_seen": 537863008,
"step": 5670
},
{
"epoch": 0.717534108135422,
"grad_norm": 0.27960875630378723,
"learning_rate": 6.82727614446407e-06,
"loss": 1.1975,
"num_input_tokens_seen": 538773152,
"step": 5680
},
{
"epoch": 0.7187973724103083,
"grad_norm": 0.24374781548976898,
"learning_rate": 6.770619033723783e-06,
"loss": 1.2273,
"num_input_tokens_seen": 539793088,
"step": 5690
},
{
"epoch": 0.7200606366851945,
"grad_norm": 0.2838081121444702,
"learning_rate": 6.714141552454072e-06,
"loss": 1.2066,
"num_input_tokens_seen": 540656768,
"step": 5700
},
{
"epoch": 0.7213239009600808,
"grad_norm": 0.24478621780872345,
"learning_rate": 6.657844646191328e-06,
"loss": 1.2102,
"num_input_tokens_seen": 541561248,
"step": 5710
},
{
"epoch": 0.7225871652349671,
"grad_norm": 0.2654918432235718,
"learning_rate": 6.6017292574487635e-06,
"loss": 1.2756,
"num_input_tokens_seen": 542457408,
"step": 5720
},
{
"epoch": 0.7238504295098535,
"grad_norm": 0.24361199140548706,
"learning_rate": 6.545796325700683e-06,
"loss": 1.1843,
"num_input_tokens_seen": 543394112,
"step": 5730
},
{
"epoch": 0.7251136937847398,
"grad_norm": 0.27256685495376587,
"learning_rate": 6.4900467873667e-06,
"loss": 1.2305,
"num_input_tokens_seen": 544360768,
"step": 5740
},
{
"epoch": 0.7263769580596261,
"grad_norm": 0.24635472893714905,
"learning_rate": 6.434481575796107e-06,
"loss": 1.243,
"num_input_tokens_seen": 545282080,
"step": 5750
},
{
"epoch": 0.7276402223345124,
"grad_norm": 0.306068480014801,
"learning_rate": 6.3791016212522256e-06,
"loss": 1.2045,
"num_input_tokens_seen": 546234848,
"step": 5760
},
{
"epoch": 0.7289034866093986,
"grad_norm": 0.26721495389938354,
"learning_rate": 6.32390785089682e-06,
"loss": 1.2897,
"num_input_tokens_seen": 547182400,
"step": 5770
},
{
"epoch": 0.730166750884285,
"grad_norm": 0.25117790699005127,
"learning_rate": 6.268901188774617e-06,
"loss": 1.2824,
"num_input_tokens_seen": 548096000,
"step": 5780
},
{
"epoch": 0.7314300151591713,
"grad_norm": 0.2862393260002136,
"learning_rate": 6.2140825557977745e-06,
"loss": 1.2498,
"num_input_tokens_seen": 549029216,
"step": 5790
},
{
"epoch": 0.7326932794340576,
"grad_norm": 0.25375497341156006,
"learning_rate": 6.159452869730546e-06,
"loss": 1.2498,
"num_input_tokens_seen": 550029152,
"step": 5800
},
{
"epoch": 0.7339565437089439,
"grad_norm": 0.2733435034751892,
"learning_rate": 6.1050130451738186e-06,
"loss": 1.1756,
"num_input_tokens_seen": 551018848,
"step": 5810
},
{
"epoch": 0.7352198079838302,
"grad_norm": 0.25357958674430847,
"learning_rate": 6.050763993549884e-06,
"loss": 1.1967,
"num_input_tokens_seen": 551936608,
"step": 5820
},
{
"epoch": 0.7364830722587166,
"grad_norm": 0.2535962760448456,
"learning_rate": 5.996706623087126e-06,
"loss": 1.251,
"num_input_tokens_seen": 552928192,
"step": 5830
},
{
"epoch": 0.7377463365336028,
"grad_norm": 0.26090991497039795,
"learning_rate": 5.942841838804848e-06,
"loss": 1.2385,
"num_input_tokens_seen": 553912960,
"step": 5840
},
{
"epoch": 0.7390096008084891,
"grad_norm": 0.2640230357646942,
"learning_rate": 5.889170542498102e-06,
"loss": 1.2426,
"num_input_tokens_seen": 554837248,
"step": 5850
},
{
"epoch": 0.7402728650833754,
"grad_norm": 0.24669994413852692,
"learning_rate": 5.835693632722607e-06,
"loss": 1.1978,
"num_input_tokens_seen": 555733696,
"step": 5860
},
{
"epoch": 0.7415361293582617,
"grad_norm": 0.2583445608615875,
"learning_rate": 5.7824120047796725e-06,
"loss": 1.2602,
"num_input_tokens_seen": 556739392,
"step": 5870
},
{
"epoch": 0.7427993936331481,
"grad_norm": 0.24428869783878326,
"learning_rate": 5.729326550701263e-06,
"loss": 1.2476,
"num_input_tokens_seen": 557767840,
"step": 5880
},
{
"epoch": 0.7440626579080344,
"grad_norm": 0.26555436849594116,
"learning_rate": 5.676438159235005e-06,
"loss": 1.265,
"num_input_tokens_seen": 558685312,
"step": 5890
},
{
"epoch": 0.7453259221829207,
"grad_norm": 0.29612812399864197,
"learning_rate": 5.623747715829356e-06,
"loss": 1.2436,
"num_input_tokens_seen": 559607904,
"step": 5900
},
{
"epoch": 0.7465891864578069,
"grad_norm": 0.26325854659080505,
"learning_rate": 5.571256102618758e-06,
"loss": 1.2447,
"num_input_tokens_seen": 560536256,
"step": 5910
},
{
"epoch": 0.7478524507326932,
"grad_norm": 0.2596051096916199,
"learning_rate": 5.518964198408862e-06,
"loss": 1.2401,
"num_input_tokens_seen": 561426784,
"step": 5920
},
{
"epoch": 0.7491157150075796,
"grad_norm": 0.28517597913742065,
"learning_rate": 5.466872878661839e-06,
"loss": 1.2213,
"num_input_tokens_seen": 562311360,
"step": 5930
},
{
"epoch": 0.7503789792824659,
"grad_norm": 0.24300004541873932,
"learning_rate": 5.414983015481682e-06,
"loss": 1.2828,
"num_input_tokens_seen": 563216640,
"step": 5940
},
{
"epoch": 0.7516422435573522,
"grad_norm": 0.26081758737564087,
"learning_rate": 5.363295477599677e-06,
"loss": 1.2356,
"num_input_tokens_seen": 564140992,
"step": 5950
},
{
"epoch": 0.7529055078322385,
"grad_norm": 0.30684077739715576,
"learning_rate": 5.311811130359772e-06,
"loss": 1.2487,
"num_input_tokens_seen": 565051296,
"step": 5960
},
{
"epoch": 0.7541687721071249,
"grad_norm": 0.243248850107193,
"learning_rate": 5.260530835704159e-06,
"loss": 1.2313,
"num_input_tokens_seen": 566038848,
"step": 5970
},
{
"epoch": 0.7554320363820111,
"grad_norm": 0.2502289116382599,
"learning_rate": 5.209455452158796e-06,
"loss": 1.2092,
"num_input_tokens_seen": 567044608,
"step": 5980
},
{
"epoch": 0.7566953006568974,
"grad_norm": 0.26396942138671875,
"learning_rate": 5.1585858348190666e-06,
"loss": 1.2309,
"num_input_tokens_seen": 567994848,
"step": 5990
},
{
"epoch": 0.7579585649317837,
"grad_norm": 0.2504906952381134,
"learning_rate": 5.107922835335452e-06,
"loss": 1.2367,
"num_input_tokens_seen": 568955808,
"step": 6000
},
{
"epoch": 0.7579585649317837,
"eval_loss": 1.2595031261444092,
"eval_runtime": 13.0677,
"eval_samples_per_second": 11.479,
"eval_steps_per_second": 0.765,
"num_input_tokens_seen": 568955808,
"step": 6000
},
{
"epoch": 0.75922182920667,
"grad_norm": 0.2684820592403412,
"learning_rate": 5.057467301899274e-06,
"loss": 1.1746,
"num_input_tokens_seen": 569895776,
"step": 6010
},
{
"epoch": 0.7604850934815564,
"grad_norm": 0.2721717655658722,
"learning_rate": 5.007220079228478e-06,
"loss": 1.2066,
"num_input_tokens_seen": 570859552,
"step": 6020
},
{
"epoch": 0.7617483577564427,
"grad_norm": 0.25938835740089417,
"learning_rate": 4.957182008553527e-06,
"loss": 1.2192,
"num_input_tokens_seen": 571787136,
"step": 6030
},
{
"epoch": 0.763011622031329,
"grad_norm": 0.25528407096862793,
"learning_rate": 4.9073539276032756e-06,
"loss": 1.2433,
"num_input_tokens_seen": 572685056,
"step": 6040
},
{
"epoch": 0.7642748863062152,
"grad_norm": 0.22747959196567535,
"learning_rate": 4.857736670590982e-06,
"loss": 1.2425,
"num_input_tokens_seen": 573630944,
"step": 6050
},
{
"epoch": 0.7655381505811015,
"grad_norm": 0.23501618206501007,
"learning_rate": 4.808331068200329e-06,
"loss": 1.3179,
"num_input_tokens_seen": 574504000,
"step": 6060
},
{
"epoch": 0.7668014148559879,
"grad_norm": 0.2590336203575134,
"learning_rate": 4.759137947571491e-06,
"loss": 1.2479,
"num_input_tokens_seen": 575465184,
"step": 6070
},
{
"epoch": 0.7680646791308742,
"grad_norm": 0.2563855051994324,
"learning_rate": 4.710158132287332e-06,
"loss": 1.2028,
"num_input_tokens_seen": 576397088,
"step": 6080
},
{
"epoch": 0.7693279434057605,
"grad_norm": 0.29565200209617615,
"learning_rate": 4.661392442359582e-06,
"loss": 1.2799,
"num_input_tokens_seen": 577387744,
"step": 6090
},
{
"epoch": 0.7705912076806468,
"grad_norm": 0.26293325424194336,
"learning_rate": 4.612841694215136e-06,
"loss": 1.2272,
"num_input_tokens_seen": 578310496,
"step": 6100
},
{
"epoch": 0.7718544719555331,
"grad_norm": 0.2616961598396301,
"learning_rate": 4.56450670068234e-06,
"loss": 1.2489,
"num_input_tokens_seen": 579258496,
"step": 6110
},
{
"epoch": 0.7731177362304194,
"grad_norm": 0.24685987830162048,
"learning_rate": 4.51638827097745e-06,
"loss": 1.2588,
"num_input_tokens_seen": 580197760,
"step": 6120
},
{
"epoch": 0.7743810005053057,
"grad_norm": 0.2490658164024353,
"learning_rate": 4.46848721069101e-06,
"loss": 1.293,
"num_input_tokens_seen": 581108448,
"step": 6130
},
{
"epoch": 0.775644264780192,
"grad_norm": 0.24475279450416565,
"learning_rate": 4.420804321774441e-06,
"loss": 1.287,
"num_input_tokens_seen": 582039072,
"step": 6140
},
{
"epoch": 0.7769075290550783,
"grad_norm": 0.2623221278190613,
"learning_rate": 4.373340402526543e-06,
"loss": 1.2117,
"num_input_tokens_seen": 582932992,
"step": 6150
},
{
"epoch": 0.7781707933299646,
"grad_norm": 0.27465909719467163,
"learning_rate": 4.326096247580186e-06,
"loss": 1.2135,
"num_input_tokens_seen": 583861568,
"step": 6160
},
{
"epoch": 0.779434057604851,
"grad_norm": 0.28181222081184387,
"learning_rate": 4.27907264788896e-06,
"loss": 1.2537,
"num_input_tokens_seen": 584843136,
"step": 6170
},
{
"epoch": 0.7806973218797373,
"grad_norm": 0.2493135631084442,
"learning_rate": 4.23227039071398e-06,
"loss": 1.2263,
"num_input_tokens_seen": 585837664,
"step": 6180
},
{
"epoch": 0.7819605861546235,
"grad_norm": 0.26791173219680786,
"learning_rate": 4.1856902596106726e-06,
"loss": 1.2273,
"num_input_tokens_seen": 586797536,
"step": 6190
},
{
"epoch": 0.7832238504295098,
"grad_norm": 0.26550182700157166,
"learning_rate": 4.139333034415663e-06,
"loss": 1.2031,
"num_input_tokens_seen": 587734880,
"step": 6200
},
{
"epoch": 0.7844871147043961,
"grad_norm": 0.27607518434524536,
"learning_rate": 4.0931994912337345e-06,
"loss": 1.2426,
"num_input_tokens_seen": 588659360,
"step": 6210
},
{
"epoch": 0.7857503789792825,
"grad_norm": 0.2891901433467865,
"learning_rate": 4.047290402424806e-06,
"loss": 1.2864,
"num_input_tokens_seen": 589628256,
"step": 6220
},
{
"epoch": 0.7870136432541688,
"grad_norm": 0.2835799753665924,
"learning_rate": 4.001606536591042e-06,
"loss": 1.2634,
"num_input_tokens_seen": 590567904,
"step": 6230
},
{
"epoch": 0.7882769075290551,
"grad_norm": 0.2466340959072113,
"learning_rate": 3.956148658563945e-06,
"loss": 1.1893,
"num_input_tokens_seen": 591514912,
"step": 6240
},
{
"epoch": 0.7895401718039414,
"grad_norm": 0.2408566027879715,
"learning_rate": 3.910917529391582e-06,
"loss": 1.1672,
"num_input_tokens_seen": 592500416,
"step": 6250
},
{
"epoch": 0.7908034360788276,
"grad_norm": 0.28810036182403564,
"learning_rate": 3.8659139063258146e-06,
"loss": 1.2376,
"num_input_tokens_seen": 593538144,
"step": 6260
},
{
"epoch": 0.792066700353714,
"grad_norm": 0.26853030920028687,
"learning_rate": 3.8211385428096474e-06,
"loss": 1.2726,
"num_input_tokens_seen": 594506272,
"step": 6270
},
{
"epoch": 0.7933299646286003,
"grad_norm": 0.2816145122051239,
"learning_rate": 3.7765921884645917e-06,
"loss": 1.3003,
"num_input_tokens_seen": 595431904,
"step": 6280
},
{
"epoch": 0.7945932289034866,
"grad_norm": 0.26149782538414,
"learning_rate": 3.7322755890781368e-06,
"loss": 1.2477,
"num_input_tokens_seen": 596461440,
"step": 6290
},
{
"epoch": 0.7958564931783729,
"grad_norm": 0.260708749294281,
"learning_rate": 3.68818948659125e-06,
"loss": 1.256,
"num_input_tokens_seen": 597473312,
"step": 6300
},
{
"epoch": 0.7971197574532592,
"grad_norm": 0.27105608582496643,
"learning_rate": 3.6443346190859598e-06,
"loss": 1.2488,
"num_input_tokens_seen": 598412000,
"step": 6310
},
{
"epoch": 0.7983830217281456,
"grad_norm": 0.24419113993644714,
"learning_rate": 3.600711720772991e-06,
"loss": 1.2774,
"num_input_tokens_seen": 599430656,
"step": 6320
},
{
"epoch": 0.7996462860030318,
"grad_norm": 0.25261548161506653,
"learning_rate": 3.557321521979489e-06,
"loss": 1.2279,
"num_input_tokens_seen": 600412224,
"step": 6330
},
{
"epoch": 0.8009095502779181,
"grad_norm": 0.25508007407188416,
"learning_rate": 3.51416474913678e-06,
"loss": 1.251,
"num_input_tokens_seen": 601375968,
"step": 6340
},
{
"epoch": 0.8021728145528044,
"grad_norm": 0.2806225121021271,
"learning_rate": 3.471242124768207e-06,
"loss": 1.2055,
"num_input_tokens_seen": 602286496,
"step": 6350
},
{
"epoch": 0.8034360788276907,
"grad_norm": 0.32982784509658813,
"learning_rate": 3.42855436747705e-06,
"loss": 1.2309,
"num_input_tokens_seen": 603281216,
"step": 6360
},
{
"epoch": 0.8046993431025771,
"grad_norm": 0.27231696248054504,
"learning_rate": 3.3861021919344735e-06,
"loss": 1.1807,
"num_input_tokens_seen": 604231360,
"step": 6370
},
{
"epoch": 0.8059626073774634,
"grad_norm": 0.2853865325450897,
"learning_rate": 3.3438863088675783e-06,
"loss": 1.2638,
"num_input_tokens_seen": 605138944,
"step": 6380
},
{
"epoch": 0.8072258716523497,
"grad_norm": 0.2520991563796997,
"learning_rate": 3.301907425047496e-06,
"loss": 1.2291,
"num_input_tokens_seen": 606092896,
"step": 6390
},
{
"epoch": 0.8084891359272359,
"grad_norm": 0.2628813683986664,
"learning_rate": 3.260166243277564e-06,
"loss": 1.2588,
"num_input_tokens_seen": 607004512,
"step": 6400
},
{
"epoch": 0.8097524002021222,
"grad_norm": 0.24886657297611237,
"learning_rate": 3.2186634623815337e-06,
"loss": 1.2636,
"num_input_tokens_seen": 607919360,
"step": 6410
},
{
"epoch": 0.8110156644770086,
"grad_norm": 0.2556428909301758,
"learning_rate": 3.177399777191912e-06,
"loss": 1.2427,
"num_input_tokens_seen": 608921984,
"step": 6420
},
{
"epoch": 0.8122789287518949,
"grad_norm": 0.24436554312705994,
"learning_rate": 3.1363758785382866e-06,
"loss": 1.2667,
"num_input_tokens_seen": 609854816,
"step": 6430
},
{
"epoch": 0.8135421930267812,
"grad_norm": 0.26374685764312744,
"learning_rate": 3.0955924532357908e-06,
"loss": 1.2398,
"num_input_tokens_seen": 610815712,
"step": 6440
},
{
"epoch": 0.8148054573016675,
"grad_norm": 0.28322839736938477,
"learning_rate": 3.055050184073599e-06,
"loss": 1.2552,
"num_input_tokens_seen": 611770144,
"step": 6450
},
{
"epoch": 0.8160687215765539,
"grad_norm": 0.2539218068122864,
"learning_rate": 3.0147497498034735e-06,
"loss": 1.202,
"num_input_tokens_seen": 612729024,
"step": 6460
},
{
"epoch": 0.8173319858514401,
"grad_norm": 0.27928316593170166,
"learning_rate": 2.974691825128433e-06,
"loss": 1.2777,
"num_input_tokens_seen": 613643488,
"step": 6470
},
{
"epoch": 0.8185952501263264,
"grad_norm": 0.26042285561561584,
"learning_rate": 2.934877080691438e-06,
"loss": 1.2077,
"num_input_tokens_seen": 614610560,
"step": 6480
},
{
"epoch": 0.8198585144012127,
"grad_norm": 0.24354539811611176,
"learning_rate": 2.8953061830641663e-06,
"loss": 1.191,
"num_input_tokens_seen": 615577216,
"step": 6490
},
{
"epoch": 0.821121778676099,
"grad_norm": 0.2690410912036896,
"learning_rate": 2.8559797947358463e-06,
"loss": 1.1872,
"num_input_tokens_seen": 616548384,
"step": 6500
},
{
"epoch": 0.8223850429509854,
"grad_norm": 0.2414551079273224,
"learning_rate": 2.8168985741021875e-06,
"loss": 1.2318,
"num_input_tokens_seen": 617543904,
"step": 6510
},
{
"epoch": 0.8236483072258717,
"grad_norm": 0.23589564859867096,
"learning_rate": 2.7780631754543265e-06,
"loss": 1.2087,
"num_input_tokens_seen": 618540128,
"step": 6520
},
{
"epoch": 0.824911571500758,
"grad_norm": 0.25712019205093384,
"learning_rate": 2.739474248967916e-06,
"loss": 1.1912,
"num_input_tokens_seen": 619500352,
"step": 6530
},
{
"epoch": 0.8261748357756442,
"grad_norm": 0.26267293095588684,
"learning_rate": 2.7011324406921816e-06,
"loss": 1.2882,
"num_input_tokens_seen": 620453920,
"step": 6540
},
{
"epoch": 0.8274381000505305,
"grad_norm": 0.2525344789028168,
"learning_rate": 2.6630383925391654e-06,
"loss": 1.2602,
"num_input_tokens_seen": 621427552,
"step": 6550
},
{
"epoch": 0.8287013643254169,
"grad_norm": 0.25016433000564575,
"learning_rate": 2.6251927422729305e-06,
"loss": 1.2071,
"num_input_tokens_seen": 622454432,
"step": 6560
},
{
"epoch": 0.8299646286003032,
"grad_norm": 0.24579358100891113,
"learning_rate": 2.5875961234989185e-06,
"loss": 1.2262,
"num_input_tokens_seen": 623389792,
"step": 6570
},
{
"epoch": 0.8312278928751895,
"grad_norm": 0.24960210919380188,
"learning_rate": 2.5502491656533293e-06,
"loss": 1.1894,
"num_input_tokens_seen": 624352928,
"step": 6580
},
{
"epoch": 0.8324911571500758,
"grad_norm": 0.2529809772968292,
"learning_rate": 2.513152493992568e-06,
"loss": 1.2355,
"num_input_tokens_seen": 625237472,
"step": 6590
},
{
"epoch": 0.8337544214249621,
"grad_norm": 0.2756924331188202,
"learning_rate": 2.4763067295828053e-06,
"loss": 1.1959,
"num_input_tokens_seen": 626200416,
"step": 6600
},
{
"epoch": 0.8350176856998484,
"grad_norm": 0.2560481131076813,
"learning_rate": 2.439712489289555e-06,
"loss": 1.1686,
"num_input_tokens_seen": 627085760,
"step": 6610
},
{
"epoch": 0.8362809499747347,
"grad_norm": 0.2564622461795807,
"learning_rate": 2.403370385767364e-06,
"loss": 1.2475,
"num_input_tokens_seen": 628078240,
"step": 6620
},
{
"epoch": 0.837544214249621,
"grad_norm": 0.2827485203742981,
"learning_rate": 2.367281027449548e-06,
"loss": 1.1958,
"num_input_tokens_seen": 629016384,
"step": 6630
},
{
"epoch": 0.8388074785245073,
"grad_norm": 0.2654615342617035,
"learning_rate": 2.3314450185380047e-06,
"loss": 1.278,
"num_input_tokens_seen": 629963040,
"step": 6640
},
{
"epoch": 0.8400707427993936,
"grad_norm": 0.26686492562294006,
"learning_rate": 2.295862958993091e-06,
"loss": 1.2544,
"num_input_tokens_seen": 630921504,
"step": 6650
},
{
"epoch": 0.84133400707428,
"grad_norm": 0.2568102180957794,
"learning_rate": 2.2605354445236036e-06,
"loss": 1.1788,
"num_input_tokens_seen": 631837184,
"step": 6660
},
{
"epoch": 0.8425972713491663,
"grad_norm": 0.2527879476547241,
"learning_rate": 2.2254630665767636e-06,
"loss": 1.2889,
"num_input_tokens_seen": 632828288,
"step": 6670
},
{
"epoch": 0.8438605356240525,
"grad_norm": 0.26815953850746155,
"learning_rate": 2.1906464123283744e-06,
"loss": 1.2576,
"num_input_tokens_seen": 633815520,
"step": 6680
},
{
"epoch": 0.8451237998989388,
"grad_norm": 0.2878230810165405,
"learning_rate": 2.156086064672924e-06,
"loss": 1.2808,
"num_input_tokens_seen": 634722208,
"step": 6690
},
{
"epoch": 0.8463870641738251,
"grad_norm": 0.2378537356853485,
"learning_rate": 2.1217826022138783e-06,
"loss": 1.1683,
"num_input_tokens_seen": 635706144,
"step": 6700
},
{
"epoch": 0.8476503284487115,
"grad_norm": 0.25701719522476196,
"learning_rate": 2.0877365992539653e-06,
"loss": 1.2215,
"num_input_tokens_seen": 636619104,
"step": 6710
},
{
"epoch": 0.8489135927235978,
"grad_norm": 0.24454209208488464,
"learning_rate": 2.0539486257855774e-06,
"loss": 1.262,
"num_input_tokens_seen": 637517568,
"step": 6720
},
{
"epoch": 0.8501768569984841,
"grad_norm": 0.2640119791030884,
"learning_rate": 2.0204192474812166e-06,
"loss": 1.2826,
"num_input_tokens_seen": 638479936,
"step": 6730
},
{
"epoch": 0.8514401212733704,
"grad_norm": 0.2534317076206207,
"learning_rate": 1.987149025684028e-06,
"loss": 1.2236,
"num_input_tokens_seen": 639357088,
"step": 6740
},
{
"epoch": 0.8527033855482566,
"grad_norm": 0.2551516890525818,
"learning_rate": 1.9541385173984074e-06,
"loss": 1.1855,
"num_input_tokens_seen": 640362912,
"step": 6750
},
{
"epoch": 0.853966649823143,
"grad_norm": 0.257917582988739,
"learning_rate": 1.921388275280664e-06,
"loss": 1.2111,
"num_input_tokens_seen": 641336448,
"step": 6760
},
{
"epoch": 0.8552299140980293,
"grad_norm": 0.2687523663043976,
"learning_rate": 1.888898847629779e-06,
"loss": 1.2092,
"num_input_tokens_seen": 642348704,
"step": 6770
},
{
"epoch": 0.8564931783729156,
"grad_norm": 0.27500104904174805,
"learning_rate": 1.8566707783782231e-06,
"loss": 1.2022,
"num_input_tokens_seen": 643290272,
"step": 6780
},
{
"epoch": 0.8577564426478019,
"grad_norm": 0.27554988861083984,
"learning_rate": 1.8247046070828535e-06,
"loss": 1.1901,
"num_input_tokens_seen": 644221792,
"step": 6790
},
{
"epoch": 0.8590197069226883,
"grad_norm": 0.2787459194660187,
"learning_rate": 1.7930008689158637e-06,
"loss": 1.2127,
"num_input_tokens_seen": 645176224,
"step": 6800
},
{
"epoch": 0.8602829711975746,
"grad_norm": 0.23403003811836243,
"learning_rate": 1.761560094655851e-06,
"loss": 1.2688,
"num_input_tokens_seen": 646193152,
"step": 6810
},
{
"epoch": 0.8615462354724608,
"grad_norm": 0.2776746451854706,
"learning_rate": 1.730382810678895e-06,
"loss": 1.2174,
"num_input_tokens_seen": 647194528,
"step": 6820
},
{
"epoch": 0.8628094997473471,
"grad_norm": 0.2932538092136383,
"learning_rate": 1.6994695389497982e-06,
"loss": 1.1361,
"num_input_tokens_seen": 648208224,
"step": 6830
},
{
"epoch": 0.8640727640222334,
"grad_norm": 0.26842474937438965,
"learning_rate": 1.6688207970132808e-06,
"loss": 1.2041,
"num_input_tokens_seen": 649171072,
"step": 6840
},
{
"epoch": 0.8653360282971198,
"grad_norm": 0.2833315134048462,
"learning_rate": 1.6384370979853776e-06,
"loss": 1.27,
"num_input_tokens_seen": 650172224,
"step": 6850
},
{
"epoch": 0.8665992925720061,
"grad_norm": 0.26029422879219055,
"learning_rate": 1.6083189505447964e-06,
"loss": 1.2732,
"num_input_tokens_seen": 651096864,
"step": 6860
},
{
"epoch": 0.8678625568468924,
"grad_norm": 0.2853679060935974,
"learning_rate": 1.578466858924442e-06,
"loss": 1.1936,
"num_input_tokens_seen": 652020192,
"step": 6870
},
{
"epoch": 0.8691258211217787,
"grad_norm": 0.28354784846305847,
"learning_rate": 1.548881322902959e-06,
"loss": 1.2461,
"num_input_tokens_seen": 652919488,
"step": 6880
},
{
"epoch": 0.8703890853966649,
"grad_norm": 0.2513621747493744,
"learning_rate": 1.5195628377963493e-06,
"loss": 1.2352,
"num_input_tokens_seen": 653868192,
"step": 6890
},
{
"epoch": 0.8716523496715513,
"grad_norm": 0.2537190616130829,
"learning_rate": 1.4905118944497058e-06,
"loss": 1.1954,
"num_input_tokens_seen": 654866304,
"step": 6900
},
{
"epoch": 0.8729156139464376,
"grad_norm": 0.26647478342056274,
"learning_rate": 1.4617289792289743e-06,
"loss": 1.2386,
"num_input_tokens_seen": 655850752,
"step": 6910
},
{
"epoch": 0.8741788782213239,
"grad_norm": 0.2586477994918823,
"learning_rate": 1.4332145740128345e-06,
"loss": 1.256,
"num_input_tokens_seen": 656778176,
"step": 6920
},
{
"epoch": 0.8754421424962102,
"grad_norm": 0.2705184817314148,
"learning_rate": 1.4049691561845975e-06,
"loss": 1.2329,
"num_input_tokens_seen": 657784128,
"step": 6930
},
{
"epoch": 0.8767054067710965,
"grad_norm": 0.2453477680683136,
"learning_rate": 1.376993198624248e-06,
"loss": 1.1833,
"num_input_tokens_seen": 658703168,
"step": 6940
},
{
"epoch": 0.8779686710459829,
"grad_norm": 0.25567731261253357,
"learning_rate": 1.3492871697005042e-06,
"loss": 1.2284,
"num_input_tokens_seen": 659688864,
"step": 6950
},
{
"epoch": 0.8792319353208691,
"grad_norm": 0.29871034622192383,
"learning_rate": 1.3218515332629892e-06,
"loss": 1.2664,
"num_input_tokens_seen": 660603104,
"step": 6960
},
{
"epoch": 0.8804951995957554,
"grad_norm": 0.25376957654953003,
"learning_rate": 1.2946867486344597e-06,
"loss": 1.2197,
"num_input_tokens_seen": 661552704,
"step": 6970
},
{
"epoch": 0.8817584638706417,
"grad_norm": 0.3075960874557495,
"learning_rate": 1.267793270603122e-06,
"loss": 1.1982,
"num_input_tokens_seen": 662524096,
"step": 6980
},
{
"epoch": 0.883021728145528,
"grad_norm": 0.2471645623445511,
"learning_rate": 1.2411715494150024e-06,
"loss": 1.1913,
"num_input_tokens_seen": 663442336,
"step": 6990
},
{
"epoch": 0.8842849924204144,
"grad_norm": 0.2692629098892212,
"learning_rate": 1.214822030766437e-06,
"loss": 1.2643,
"num_input_tokens_seen": 664365344,
"step": 7000
},
{
"epoch": 0.8855482566953007,
"grad_norm": 0.2840708792209625,
"learning_rate": 1.1887451557965732e-06,
"loss": 1.1826,
"num_input_tokens_seen": 665290880,
"step": 7010
},
{
"epoch": 0.886811520970187,
"grad_norm": 0.2730172574520111,
"learning_rate": 1.1629413610800198e-06,
"loss": 1.2738,
"num_input_tokens_seen": 666231392,
"step": 7020
},
{
"epoch": 0.8880747852450732,
"grad_norm": 0.28216251730918884,
"learning_rate": 1.1374110786195212e-06,
"loss": 1.1925,
"num_input_tokens_seen": 667211072,
"step": 7030
},
{
"epoch": 0.8893380495199595,
"grad_norm": 0.25766119360923767,
"learning_rate": 1.1121547358387154e-06,
"loss": 1.2013,
"num_input_tokens_seen": 668144320,
"step": 7040
},
{
"epoch": 0.8906013137948459,
"grad_norm": 0.24992607533931732,
"learning_rate": 1.087172755575001e-06,
"loss": 1.1939,
"num_input_tokens_seen": 669092064,
"step": 7050
},
{
"epoch": 0.8918645780697322,
"grad_norm": 0.26488760113716125,
"learning_rate": 1.0624655560724363e-06,
"loss": 1.2276,
"num_input_tokens_seen": 670011840,
"step": 7060
},
{
"epoch": 0.8931278423446185,
"grad_norm": 0.25586891174316406,
"learning_rate": 1.0380335509747583e-06,
"loss": 1.2528,
"num_input_tokens_seen": 670906560,
"step": 7070
},
{
"epoch": 0.8943911066195048,
"grad_norm": 0.2638219892978668,
"learning_rate": 1.0138771493184352e-06,
"loss": 1.2721,
"num_input_tokens_seen": 671885760,
"step": 7080
},
{
"epoch": 0.8956543708943911,
"grad_norm": 0.25774410367012024,
"learning_rate": 9.899967555258347e-07,
"loss": 1.2788,
"num_input_tokens_seen": 672838336,
"step": 7090
},
{
"epoch": 0.8969176351692774,
"grad_norm": 0.24537810683250427,
"learning_rate": 9.663927693984438e-07,
"loss": 1.2218,
"num_input_tokens_seen": 673773728,
"step": 7100
},
{
"epoch": 0.8981808994441637,
"grad_norm": 0.269209623336792,
"learning_rate": 9.430655861101829e-07,
"loss": 1.1914,
"num_input_tokens_seen": 674686496,
"step": 7110
},
{
"epoch": 0.89944416371905,
"grad_norm": 0.2713133692741394,
"learning_rate": 9.200155962007868e-07,
"loss": 1.221,
"num_input_tokens_seen": 675659040,
"step": 7120
},
{
"epoch": 0.9007074279939363,
"grad_norm": 0.2782800793647766,
"learning_rate": 8.972431855692685e-07,
"loss": 1.2197,
"num_input_tokens_seen": 676523936,
"step": 7130
},
{
"epoch": 0.9019706922688226,
"grad_norm": 0.28656941652297974,
"learning_rate": 8.747487354674457e-07,
"loss": 1.2924,
"num_input_tokens_seen": 677481408,
"step": 7140
},
{
"epoch": 0.903233956543709,
"grad_norm": 0.2603612542152405,
"learning_rate": 8.525326224935794e-07,
"loss": 1.2418,
"num_input_tokens_seen": 678461056,
"step": 7150
},
{
"epoch": 0.9044972208185953,
"grad_norm": 0.2789015471935272,
"learning_rate": 8.305952185860484e-07,
"loss": 1.1934,
"num_input_tokens_seen": 679452256,
"step": 7160
},
{
"epoch": 0.9057604850934815,
"grad_norm": 0.29948341846466064,
"learning_rate": 8.089368910171396e-07,
"loss": 1.2467,
"num_input_tokens_seen": 680371648,
"step": 7170
},
{
"epoch": 0.9070237493683678,
"grad_norm": 0.26572108268737793,
"learning_rate": 7.875580023868885e-07,
"loss": 1.1925,
"num_input_tokens_seen": 681355648,
"step": 7180
},
{
"epoch": 0.9082870136432541,
"grad_norm": 0.24899084866046906,
"learning_rate": 7.664589106170069e-07,
"loss": 1.252,
"num_input_tokens_seen": 682361344,
"step": 7190
},
{
"epoch": 0.9095502779181405,
"grad_norm": 0.24572855234146118,
"learning_rate": 7.456399689449052e-07,
"loss": 1.2339,
"num_input_tokens_seen": 683316896,
"step": 7200
},
{
"epoch": 0.9108135421930268,
"grad_norm": 0.2785273492336273,
"learning_rate": 7.251015259177561e-07,
"loss": 1.2259,
"num_input_tokens_seen": 684286528,
"step": 7210
},
{
"epoch": 0.9120768064679131,
"grad_norm": 0.24116089940071106,
"learning_rate": 7.048439253866866e-07,
"loss": 1.1971,
"num_input_tokens_seen": 685241440,
"step": 7220
},
{
"epoch": 0.9133400707427994,
"grad_norm": 0.25249651074409485,
"learning_rate": 6.848675065009904e-07,
"loss": 1.1883,
"num_input_tokens_seen": 686179008,
"step": 7230
},
{
"epoch": 0.9146033350176856,
"grad_norm": 0.24898767471313477,
"learning_rate": 6.651726037024796e-07,
"loss": 1.2214,
"num_input_tokens_seen": 687148992,
"step": 7240
},
{
"epoch": 0.915866599292572,
"grad_norm": 0.2656947672367096,
"learning_rate": 6.457595467198567e-07,
"loss": 1.1936,
"num_input_tokens_seen": 688136000,
"step": 7250
},
{
"epoch": 0.9171298635674583,
"grad_norm": 0.2621888816356659,
"learning_rate": 6.266286605632295e-07,
"loss": 1.2068,
"num_input_tokens_seen": 689067328,
"step": 7260
},
{
"epoch": 0.9183931278423446,
"grad_norm": 0.2367779016494751,
"learning_rate": 6.07780265518632e-07,
"loss": 1.2581,
"num_input_tokens_seen": 690001664,
"step": 7270
},
{
"epoch": 0.9196563921172309,
"grad_norm": 0.24973830580711365,
"learning_rate": 5.892146771426915e-07,
"loss": 1.2381,
"num_input_tokens_seen": 690943648,
"step": 7280
},
{
"epoch": 0.9209196563921173,
"grad_norm": 0.2687539756298065,
"learning_rate": 5.70932206257326e-07,
"loss": 1.2386,
"num_input_tokens_seen": 691864224,
"step": 7290
},
{
"epoch": 0.9221829206670036,
"grad_norm": 0.25320330262184143,
"learning_rate": 5.529331589445516e-07,
"loss": 1.2678,
"num_input_tokens_seen": 692833472,
"step": 7300
},
{
"epoch": 0.9234461849418898,
"grad_norm": 0.2584136426448822,
"learning_rate": 5.35217836541362e-07,
"loss": 1.2621,
"num_input_tokens_seen": 693706112,
"step": 7310
},
{
"epoch": 0.9247094492167761,
"grad_norm": 0.2527817487716675,
"learning_rate": 5.177865356346644e-07,
"loss": 1.2521,
"num_input_tokens_seen": 694636736,
"step": 7320
},
{
"epoch": 0.9259727134916624,
"grad_norm": 0.24299506843090057,
"learning_rate": 5.00639548056338e-07,
"loss": 1.2517,
"num_input_tokens_seen": 695631264,
"step": 7330
},
{
"epoch": 0.9272359777665488,
"grad_norm": 0.24970118701457977,
"learning_rate": 4.837771608783264e-07,
"loss": 1.2364,
"num_input_tokens_seen": 696587872,
"step": 7340
},
{
"epoch": 0.9284992420414351,
"grad_norm": 0.2587854564189911,
"learning_rate": 4.6719965640784676e-07,
"loss": 1.2376,
"num_input_tokens_seen": 697601376,
"step": 7350
},
{
"epoch": 0.9297625063163214,
"grad_norm": 0.26746806502342224,
"learning_rate": 4.509073121826623e-07,
"loss": 1.2466,
"num_input_tokens_seen": 698550432,
"step": 7360
},
{
"epoch": 0.9310257705912077,
"grad_norm": 0.269715815782547,
"learning_rate": 4.349004009664275e-07,
"loss": 1.2421,
"num_input_tokens_seen": 699511744,
"step": 7370
},
{
"epoch": 0.9322890348660939,
"grad_norm": 0.24946600198745728,
"learning_rate": 4.1917919074412416e-07,
"loss": 1.1982,
"num_input_tokens_seen": 700446176,
"step": 7380
},
{
"epoch": 0.9335522991409803,
"grad_norm": 0.281342089176178,
"learning_rate": 4.037439447175789e-07,
"loss": 1.2408,
"num_input_tokens_seen": 701373568,
"step": 7390
},
{
"epoch": 0.9348155634158666,
"grad_norm": 0.2512856125831604,
"learning_rate": 3.88594921301055e-07,
"loss": 1.2414,
"num_input_tokens_seen": 702294016,
"step": 7400
},
{
"epoch": 0.9360788276907529,
"grad_norm": 0.2601119577884674,
"learning_rate": 3.737323741169257e-07,
"loss": 1.2491,
"num_input_tokens_seen": 703232672,
"step": 7410
},
{
"epoch": 0.9373420919656392,
"grad_norm": 0.270298033952713,
"learning_rate": 3.5915655199142663e-07,
"loss": 1.2174,
"num_input_tokens_seen": 704175744,
"step": 7420
},
{
"epoch": 0.9386053562405255,
"grad_norm": 0.23530983924865723,
"learning_rate": 3.448676989504925e-07,
"loss": 1.2368,
"num_input_tokens_seen": 705141664,
"step": 7430
},
{
"epoch": 0.9398686205154119,
"grad_norm": 0.2633696496486664,
"learning_rate": 3.308660542156694e-07,
"loss": 1.2018,
"num_input_tokens_seen": 706067200,
"step": 7440
},
{
"epoch": 0.9411318847902981,
"grad_norm": 0.26215797662734985,
"learning_rate": 3.1715185220010984e-07,
"loss": 1.2193,
"num_input_tokens_seen": 706966304,
"step": 7450
},
{
"epoch": 0.9423951490651844,
"grad_norm": 0.27117466926574707,
"learning_rate": 3.037253225046529e-07,
"loss": 1.2907,
"num_input_tokens_seen": 707921440,
"step": 7460
},
{
"epoch": 0.9436584133400707,
"grad_norm": 0.27227288484573364,
"learning_rate": 2.905866899139708e-07,
"loss": 1.251,
"num_input_tokens_seen": 708838784,
"step": 7470
},
{
"epoch": 0.944921677614957,
"grad_norm": 0.26309284567832947,
"learning_rate": 2.777361743928194e-07,
"loss": 1.2574,
"num_input_tokens_seen": 709754176,
"step": 7480
},
{
"epoch": 0.9461849418898434,
"grad_norm": 0.24601784348487854,
"learning_rate": 2.6517399108233886e-07,
"loss": 1.1808,
"num_input_tokens_seen": 710722944,
"step": 7490
},
{
"epoch": 0.9474482061647297,
"grad_norm": 0.28660014271736145,
"learning_rate": 2.5290035029646523e-07,
"loss": 1.2572,
"num_input_tokens_seen": 711716256,
"step": 7500
},
{
"epoch": 0.948711470439616,
"grad_norm": 0.2446954995393753,
"learning_rate": 2.409154575184077e-07,
"loss": 1.1996,
"num_input_tokens_seen": 712625856,
"step": 7510
},
{
"epoch": 0.9499747347145022,
"grad_norm": 0.2447938770055771,
"learning_rate": 2.2921951339720053e-07,
"loss": 1.2414,
"num_input_tokens_seen": 713581728,
"step": 7520
},
{
"epoch": 0.9512379989893885,
"grad_norm": 0.2409149706363678,
"learning_rate": 2.178127137443489e-07,
"loss": 1.1916,
"num_input_tokens_seen": 714471360,
"step": 7530
},
{
"epoch": 0.9525012632642749,
"grad_norm": 0.25430941581726074,
"learning_rate": 2.0669524953055377e-07,
"loss": 1.2343,
"num_input_tokens_seen": 715391488,
"step": 7540
},
{
"epoch": 0.9537645275391612,
"grad_norm": 0.27573850750923157,
"learning_rate": 1.9586730688250395e-07,
"loss": 1.2559,
"num_input_tokens_seen": 716352896,
"step": 7550
},
{
"epoch": 0.9550277918140475,
"grad_norm": 0.2683832347393036,
"learning_rate": 1.8532906707978106e-07,
"loss": 1.2169,
"num_input_tokens_seen": 717298784,
"step": 7560
},
{
"epoch": 0.9562910560889338,
"grad_norm": 0.28321197628974915,
"learning_rate": 1.7508070655179757e-07,
"loss": 1.2796,
"num_input_tokens_seen": 718316000,
"step": 7570
},
{
"epoch": 0.9575543203638202,
"grad_norm": 0.25757691264152527,
"learning_rate": 1.65122396874863e-07,
"loss": 1.2222,
"num_input_tokens_seen": 719217248,
"step": 7580
},
{
"epoch": 0.9588175846387064,
"grad_norm": 0.2687084972858429,
"learning_rate": 1.5545430476930465e-07,
"loss": 1.1853,
"num_input_tokens_seen": 720198464,
"step": 7590
},
{
"epoch": 0.9600808489135927,
"grad_norm": 0.2586497664451599,
"learning_rate": 1.4607659209667165e-07,
"loss": 1.2438,
"num_input_tokens_seen": 721068160,
"step": 7600
},
{
"epoch": 0.961344113188479,
"grad_norm": 0.24861587584018707,
"learning_rate": 1.3698941585704033e-07,
"loss": 1.2712,
"num_input_tokens_seen": 722061472,
"step": 7610
},
{
"epoch": 0.9626073774633653,
"grad_norm": 0.244459331035614,
"learning_rate": 1.281929281863639e-07,
"loss": 1.1897,
"num_input_tokens_seen": 723015232,
"step": 7620
},
{
"epoch": 0.9638706417382517,
"grad_norm": 0.225861594080925,
"learning_rate": 1.1968727635394497e-07,
"loss": 1.2689,
"num_input_tokens_seen": 724000384,
"step": 7630
},
{
"epoch": 0.965133906013138,
"grad_norm": 0.246552512049675,
"learning_rate": 1.1147260275995634e-07,
"loss": 1.1784,
"num_input_tokens_seen": 724964992,
"step": 7640
},
{
"epoch": 0.9663971702880243,
"grad_norm": 0.2584232687950134,
"learning_rate": 1.0354904493306865e-07,
"loss": 1.2263,
"num_input_tokens_seen": 725923104,
"step": 7650
},
{
"epoch": 0.9676604345629105,
"grad_norm": 0.25840452313423157,
"learning_rate": 9.591673552813844e-08,
"loss": 1.2081,
"num_input_tokens_seen": 726876224,
"step": 7660
},
{
"epoch": 0.9689236988377968,
"grad_norm": 0.28871768712997437,
"learning_rate": 8.85758023239913e-08,
"loss": 1.2545,
"num_input_tokens_seen": 727721568,
"step": 7670
},
{
"epoch": 0.9701869631126832,
"grad_norm": 0.29037731885910034,
"learning_rate": 8.152636822127883e-08,
"loss": 1.2221,
"num_input_tokens_seen": 728634912,
"step": 7680
},
{
"epoch": 0.9714502273875695,
"grad_norm": 0.2691645324230194,
"learning_rate": 7.476855124043086e-08,
"loss": 1.2158,
"num_input_tokens_seen": 729574464,
"step": 7690
},
{
"epoch": 0.9727134916624558,
"grad_norm": 0.2742849290370941,
"learning_rate": 6.830246451966975e-08,
"loss": 1.2089,
"num_input_tokens_seen": 730499136,
"step": 7700
},
{
"epoch": 0.9739767559373421,
"grad_norm": 0.26165613532066345,
"learning_rate": 6.212821631311621e-08,
"loss": 1.2314,
"num_input_tokens_seen": 731461280,
"step": 7710
},
{
"epoch": 0.9752400202122284,
"grad_norm": 0.24117015302181244,
"learning_rate": 5.624590998898615e-08,
"loss": 1.2055,
"num_input_tokens_seen": 732374848,
"step": 7720
},
{
"epoch": 0.9765032844871147,
"grad_norm": 0.2643440365791321,
"learning_rate": 5.0655644027847994e-08,
"loss": 1.2044,
"num_input_tokens_seen": 733271648,
"step": 7730
},
{
"epoch": 0.977766548762001,
"grad_norm": 0.24681268632411957,
"learning_rate": 4.5357512020986755e-08,
"loss": 1.1749,
"num_input_tokens_seen": 734233312,
"step": 7740
},
{
"epoch": 0.9790298130368873,
"grad_norm": 0.28687500953674316,
"learning_rate": 4.0351602668824423e-08,
"loss": 1.2237,
"num_input_tokens_seen": 735189120,
"step": 7750
},
{
"epoch": 0.9802930773117736,
"grad_norm": 0.2667155861854553,
"learning_rate": 3.563799977944537e-08,
"loss": 1.2138,
"num_input_tokens_seen": 736120128,
"step": 7760
},
{
"epoch": 0.9815563415866599,
"grad_norm": 0.25432640314102173,
"learning_rate": 3.121678226718577e-08,
"loss": 1.1976,
"num_input_tokens_seen": 737063456,
"step": 7770
},
{
"epoch": 0.9828196058615463,
"grad_norm": 0.2468518167734146,
"learning_rate": 2.708802415131828e-08,
"loss": 1.2268,
"num_input_tokens_seen": 738004096,
"step": 7780
},
{
"epoch": 0.9840828701364326,
"grad_norm": 0.27853333950042725,
"learning_rate": 2.3251794554806636e-08,
"loss": 1.2074,
"num_input_tokens_seen": 739017440,
"step": 7790
},
{
"epoch": 0.9853461344113188,
"grad_norm": 0.26621630787849426,
"learning_rate": 1.9708157703157424e-08,
"loss": 1.213,
"num_input_tokens_seen": 740034656,
"step": 7800
},
{
"epoch": 0.9866093986862051,
"grad_norm": 0.2626071572303772,
"learning_rate": 1.645717292333204e-08,
"loss": 1.2604,
"num_input_tokens_seen": 741063104,
"step": 7810
},
{
"epoch": 0.9878726629610914,
"grad_norm": 0.26386693120002747,
"learning_rate": 1.3498894642769432e-08,
"loss": 1.2779,
"num_input_tokens_seen": 742014688,
"step": 7820
},
{
"epoch": 0.9891359272359778,
"grad_norm": 0.2615217864513397,
"learning_rate": 1.0833372388455442e-08,
"loss": 1.2108,
"num_input_tokens_seen": 742960160,
"step": 7830
},
{
"epoch": 0.9903991915108641,
"grad_norm": 0.2661604881286621,
"learning_rate": 8.460650786114576e-09,
"loss": 1.1899,
"num_input_tokens_seen": 743845760,
"step": 7840
},
{
"epoch": 0.9916624557857504,
"grad_norm": 0.26591452956199646,
"learning_rate": 6.380769559444499e-09,
"loss": 1.2474,
"num_input_tokens_seen": 744760672,
"step": 7850
},
{
"epoch": 0.9929257200606367,
"grad_norm": 0.27036914229393005,
"learning_rate": 4.5937635294671094e-09,
"loss": 1.2709,
"num_input_tokens_seen": 745728352,
"step": 7860
},
{
"epoch": 0.9941889843355229,
"grad_norm": 0.24849487841129303,
"learning_rate": 3.099662613930132e-09,
"loss": 1.2096,
"num_input_tokens_seen": 746640928,
"step": 7870
},
{
"epoch": 0.9954522486104093,
"grad_norm": 0.2538692057132721,
"learning_rate": 1.8984918268175055e-09,
"loss": 1.2464,
"num_input_tokens_seen": 747588896,
"step": 7880
},
{
"epoch": 0.9967155128852956,
"grad_norm": 0.26595503091812134,
"learning_rate": 9.902712779277788e-10,
"loss": 1.2883,
"num_input_tokens_seen": 748464864,
"step": 7890
},
{
"epoch": 0.9979787771601819,
"grad_norm": 0.27239322662353516,
"learning_rate": 3.7501617253216096e-10,
"loss": 1.1961,
"num_input_tokens_seen": 749490752,
"step": 7900
},
{
"epoch": 0.9992420414350682,
"grad_norm": 0.2784164249897003,
"learning_rate": 5.2736811129716613e-11,
"loss": 1.2785,
"num_input_tokens_seen": 750395392,
"step": 7910
},
{
"epoch": 1.0,
"num_input_tokens_seen": 750938410,
"step": 7916,
"total_flos": 3.6248418467253043e+18,
"train_loss": 1.2696220230851407,
"train_runtime": 79988.0702,
"train_samples_per_second": 12.667,
"train_steps_per_second": 0.099
}
],
"logging_steps": 10,
"max_steps": 7916,
"num_input_tokens_seen": 750938410,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.6248418467253043e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}