root
add ckpt27
070e573
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.6877706435717483,
"eval_steps": 1000,
"global_step": 4500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005972823652381664,
"grad_norm": 0.5743309259414673,
"learning_rate": 1.5904572564612327e-06,
"loss": 2.7537,
"step": 10
},
{
"epoch": 0.011945647304763328,
"grad_norm": 0.5460094809532166,
"learning_rate": 3.1809145129224655e-06,
"loss": 2.7612,
"step": 20
},
{
"epoch": 0.01791847095714499,
"grad_norm": 0.5363145470619202,
"learning_rate": 4.7713717693836985e-06,
"loss": 2.7609,
"step": 30
},
{
"epoch": 0.023891294609526655,
"grad_norm": 0.5279455184936523,
"learning_rate": 6.361829025844931e-06,
"loss": 2.7607,
"step": 40
},
{
"epoch": 0.029864118261908316,
"grad_norm": 0.5061234831809998,
"learning_rate": 7.952286282306164e-06,
"loss": 2.784,
"step": 50
},
{
"epoch": 0.03583694191428998,
"grad_norm": 0.476898729801178,
"learning_rate": 9.542743538767397e-06,
"loss": 2.762,
"step": 60
},
{
"epoch": 0.041809765566671646,
"grad_norm": 0.4454072415828705,
"learning_rate": 1.113320079522863e-05,
"loss": 2.7716,
"step": 70
},
{
"epoch": 0.04778258921905331,
"grad_norm": 3.1541287899017334,
"learning_rate": 1.2723658051689862e-05,
"loss": 2.8849,
"step": 80
},
{
"epoch": 0.05375541287143497,
"grad_norm": 0.19107532501220703,
"learning_rate": 1.4314115308151095e-05,
"loss": 3.1147,
"step": 90
},
{
"epoch": 0.05972823652381663,
"grad_norm": 0.13281038403511047,
"learning_rate": 1.590457256461233e-05,
"loss": 2.5574,
"step": 100
},
{
"epoch": 0.0657010601761983,
"grad_norm": 0.08191326260566711,
"learning_rate": 1.749502982107356e-05,
"loss": 2.4446,
"step": 110
},
{
"epoch": 0.07167388382857996,
"grad_norm": 0.08300579339265823,
"learning_rate": 1.9085487077534794e-05,
"loss": 2.3524,
"step": 120
},
{
"epoch": 0.07764670748096163,
"grad_norm": 0.0590679906308651,
"learning_rate": 2.0675944333996028e-05,
"loss": 2.2819,
"step": 130
},
{
"epoch": 0.08361953113334329,
"grad_norm": 0.052923623472452164,
"learning_rate": 2.226640159045726e-05,
"loss": 2.2261,
"step": 140
},
{
"epoch": 0.08959235478572496,
"grad_norm": 0.05208205804228783,
"learning_rate": 2.385685884691849e-05,
"loss": 2.1889,
"step": 150
},
{
"epoch": 0.09556517843810662,
"grad_norm": 0.0485885925590992,
"learning_rate": 2.5447316103379724e-05,
"loss": 2.1694,
"step": 160
},
{
"epoch": 0.10153800209048827,
"grad_norm": 0.04901551082730293,
"learning_rate": 2.7037773359840955e-05,
"loss": 2.1272,
"step": 170
},
{
"epoch": 0.10751082574286994,
"grad_norm": 0.04524153470993042,
"learning_rate": 2.862823061630219e-05,
"loss": 2.1085,
"step": 180
},
{
"epoch": 0.1134836493952516,
"grad_norm": 0.04201298579573631,
"learning_rate": 3.021868787276342e-05,
"loss": 2.0902,
"step": 190
},
{
"epoch": 0.11945647304763327,
"grad_norm": 0.053612083196640015,
"learning_rate": 3.180914512922466e-05,
"loss": 2.0855,
"step": 200
},
{
"epoch": 0.12542929670001493,
"grad_norm": 0.04812688007950783,
"learning_rate": 3.3399602385685885e-05,
"loss": 2.0469,
"step": 210
},
{
"epoch": 0.1314021203523966,
"grad_norm": 0.0483262836933136,
"learning_rate": 3.499005964214712e-05,
"loss": 2.0264,
"step": 220
},
{
"epoch": 0.13737494400477826,
"grad_norm": 0.05456310138106346,
"learning_rate": 3.6580516898608353e-05,
"loss": 2.0201,
"step": 230
},
{
"epoch": 0.14334776765715992,
"grad_norm": 0.06978671252727509,
"learning_rate": 3.817097415506959e-05,
"loss": 1.9967,
"step": 240
},
{
"epoch": 0.1493205913095416,
"grad_norm": 0.049219317734241486,
"learning_rate": 3.976143141153082e-05,
"loss": 1.9909,
"step": 250
},
{
"epoch": 0.15529341496192325,
"grad_norm": 0.04814588651061058,
"learning_rate": 4.1351888667992056e-05,
"loss": 1.9793,
"step": 260
},
{
"epoch": 0.16126623861430492,
"grad_norm": 0.06128086522221565,
"learning_rate": 4.2942345924453284e-05,
"loss": 1.9703,
"step": 270
},
{
"epoch": 0.16723906226668658,
"grad_norm": 0.06803273409605026,
"learning_rate": 4.453280318091452e-05,
"loss": 1.9484,
"step": 280
},
{
"epoch": 0.17321188591906825,
"grad_norm": 0.06598497182130814,
"learning_rate": 4.612326043737575e-05,
"loss": 1.9251,
"step": 290
},
{
"epoch": 0.1791847095714499,
"grad_norm": 0.05581754818558693,
"learning_rate": 4.771371769383698e-05,
"loss": 1.9211,
"step": 300
},
{
"epoch": 0.18515753322383158,
"grad_norm": 0.06264442205429077,
"learning_rate": 4.9304174950298214e-05,
"loss": 1.9047,
"step": 310
},
{
"epoch": 0.19113035687621324,
"grad_norm": 0.05809122323989868,
"learning_rate": 5.089463220675945e-05,
"loss": 1.8948,
"step": 320
},
{
"epoch": 0.1971031805285949,
"grad_norm": 0.05478562042117119,
"learning_rate": 5.248508946322068e-05,
"loss": 1.8924,
"step": 330
},
{
"epoch": 0.20307600418097654,
"grad_norm": 0.060149796307086945,
"learning_rate": 5.407554671968191e-05,
"loss": 1.8776,
"step": 340
},
{
"epoch": 0.2090488278333582,
"grad_norm": 0.06282585859298706,
"learning_rate": 5.5666003976143144e-05,
"loss": 1.8752,
"step": 350
},
{
"epoch": 0.21502165148573987,
"grad_norm": 0.06441989541053772,
"learning_rate": 5.725646123260438e-05,
"loss": 1.8632,
"step": 360
},
{
"epoch": 0.22099447513812154,
"grad_norm": 0.05681062117218971,
"learning_rate": 5.8846918489065606e-05,
"loss": 1.8475,
"step": 370
},
{
"epoch": 0.2269672987905032,
"grad_norm": 0.05155131593346596,
"learning_rate": 6.043737574552684e-05,
"loss": 1.8431,
"step": 380
},
{
"epoch": 0.23294012244288487,
"grad_norm": 0.05347074940800667,
"learning_rate": 6.202783300198807e-05,
"loss": 1.8416,
"step": 390
},
{
"epoch": 0.23891294609526653,
"grad_norm": 0.06694310158491135,
"learning_rate": 6.361829025844931e-05,
"loss": 1.8344,
"step": 400
},
{
"epoch": 0.2448857697476482,
"grad_norm": 0.06079185754060745,
"learning_rate": 6.520874751491054e-05,
"loss": 1.8297,
"step": 410
},
{
"epoch": 0.25085859340002986,
"grad_norm": 0.05415233224630356,
"learning_rate": 6.679920477137177e-05,
"loss": 1.82,
"step": 420
},
{
"epoch": 0.2568314170524115,
"grad_norm": 0.0645110234618187,
"learning_rate": 6.838966202783301e-05,
"loss": 1.8137,
"step": 430
},
{
"epoch": 0.2628042407047932,
"grad_norm": 0.06045007333159447,
"learning_rate": 6.998011928429424e-05,
"loss": 1.8048,
"step": 440
},
{
"epoch": 0.26877706435717486,
"grad_norm": 0.05600131303071976,
"learning_rate": 7.157057654075547e-05,
"loss": 1.7854,
"step": 450
},
{
"epoch": 0.2747498880095565,
"grad_norm": 0.06498062610626221,
"learning_rate": 7.316103379721671e-05,
"loss": 1.798,
"step": 460
},
{
"epoch": 0.2807227116619382,
"grad_norm": 0.053577929735183716,
"learning_rate": 7.475149105367795e-05,
"loss": 1.7883,
"step": 470
},
{
"epoch": 0.28669553531431985,
"grad_norm": 0.09097382426261902,
"learning_rate": 7.634194831013918e-05,
"loss": 1.78,
"step": 480
},
{
"epoch": 0.2926683589667015,
"grad_norm": 0.057212598621845245,
"learning_rate": 7.79324055666004e-05,
"loss": 1.7705,
"step": 490
},
{
"epoch": 0.2986411826190832,
"grad_norm": 0.055311623960733414,
"learning_rate": 7.952286282306164e-05,
"loss": 1.7739,
"step": 500
},
{
"epoch": 0.30461400627146484,
"grad_norm": 0.07679615169763565,
"learning_rate": 7.999952636882403e-05,
"loss": 1.7705,
"step": 510
},
{
"epoch": 0.3105868299238465,
"grad_norm": 0.10281822085380554,
"learning_rate": 7.999720656965739e-05,
"loss": 1.7639,
"step": 520
},
{
"epoch": 0.3165596535762282,
"grad_norm": 0.07636060565710068,
"learning_rate": 7.999295372099362e-05,
"loss": 1.7539,
"step": 530
},
{
"epoch": 0.32253247722860984,
"grad_norm": 0.057714689522981644,
"learning_rate": 7.998676802837124e-05,
"loss": 1.7541,
"step": 540
},
{
"epoch": 0.3285053008809915,
"grad_norm": 0.06505981832742691,
"learning_rate": 7.997864979074237e-05,
"loss": 1.7487,
"step": 550
},
{
"epoch": 0.33447812453337317,
"grad_norm": 0.05842842161655426,
"learning_rate": 7.996859940045832e-05,
"loss": 1.739,
"step": 560
},
{
"epoch": 0.34045094818575483,
"grad_norm": 0.051559966057538986,
"learning_rate": 7.995661734325054e-05,
"loss": 1.7443,
"step": 570
},
{
"epoch": 0.3464237718381365,
"grad_norm": 0.20853149890899658,
"learning_rate": 7.994270419820721e-05,
"loss": 1.7719,
"step": 580
},
{
"epoch": 0.35239659549051816,
"grad_norm": 0.09151974320411682,
"learning_rate": 7.992686063774525e-05,
"loss": 1.7817,
"step": 590
},
{
"epoch": 0.3583694191428998,
"grad_norm": 0.05926055088639259,
"learning_rate": 7.99090874275778e-05,
"loss": 1.7469,
"step": 600
},
{
"epoch": 0.3643422427952815,
"grad_norm": 0.044228848069906235,
"learning_rate": 7.988938542667721e-05,
"loss": 1.7393,
"step": 610
},
{
"epoch": 0.37031506644766315,
"grad_norm": 0.0427553653717041,
"learning_rate": 7.986775558723355e-05,
"loss": 1.7307,
"step": 620
},
{
"epoch": 0.3762878901000448,
"grad_norm": 0.0548509880900383,
"learning_rate": 7.984419895460858e-05,
"loss": 1.7205,
"step": 630
},
{
"epoch": 0.3822607137524265,
"grad_norm": 0.057041749358177185,
"learning_rate": 7.981871666728525e-05,
"loss": 1.7225,
"step": 640
},
{
"epoch": 0.38823353740480815,
"grad_norm": 0.056601762771606445,
"learning_rate": 7.979130995681263e-05,
"loss": 1.7088,
"step": 650
},
{
"epoch": 0.3942063610571898,
"grad_norm": 0.06844093650579453,
"learning_rate": 7.976198014774637e-05,
"loss": 1.7073,
"step": 660
},
{
"epoch": 0.4001791847095714,
"grad_norm": 0.0546780526638031,
"learning_rate": 7.973072865758483e-05,
"loss": 1.7121,
"step": 670
},
{
"epoch": 0.4061520083619531,
"grad_norm": 0.04654558375477791,
"learning_rate": 7.969755699670041e-05,
"loss": 1.6951,
"step": 680
},
{
"epoch": 0.41212483201433475,
"grad_norm": 0.06478898227214813,
"learning_rate": 7.966246676826661e-05,
"loss": 1.7055,
"step": 690
},
{
"epoch": 0.4180976556667164,
"grad_norm": 0.06878198683261871,
"learning_rate": 7.962545966818062e-05,
"loss": 1.6987,
"step": 700
},
{
"epoch": 0.4240704793190981,
"grad_norm": 0.05675249919295311,
"learning_rate": 7.95865374849812e-05,
"loss": 1.6998,
"step": 710
},
{
"epoch": 0.43004330297147975,
"grad_norm": 0.05516457185149193,
"learning_rate": 7.954570209976239e-05,
"loss": 1.6852,
"step": 720
},
{
"epoch": 0.4360161266238614,
"grad_norm": 0.05688585340976715,
"learning_rate": 7.950295548608256e-05,
"loss": 1.6901,
"step": 730
},
{
"epoch": 0.4419889502762431,
"grad_norm": 0.07187242805957794,
"learning_rate": 7.945829970986898e-05,
"loss": 1.6894,
"step": 740
},
{
"epoch": 0.44796177392862474,
"grad_norm": 0.0548662506043911,
"learning_rate": 7.941173692931801e-05,
"loss": 1.6819,
"step": 750
},
{
"epoch": 0.4539345975810064,
"grad_norm": 0.0926741436123848,
"learning_rate": 7.93632693947908e-05,
"loss": 1.6797,
"step": 760
},
{
"epoch": 0.45990742123338807,
"grad_norm": 0.04921697825193405,
"learning_rate": 7.931289944870448e-05,
"loss": 1.6629,
"step": 770
},
{
"epoch": 0.46588024488576973,
"grad_norm": 0.07487112283706665,
"learning_rate": 7.92606295254191e-05,
"loss": 1.6737,
"step": 780
},
{
"epoch": 0.4718530685381514,
"grad_norm": 0.07180643826723099,
"learning_rate": 7.920646215111973e-05,
"loss": 1.6716,
"step": 790
},
{
"epoch": 0.47782589219053306,
"grad_norm": 0.050522662699222565,
"learning_rate": 7.915039994369462e-05,
"loss": 1.6597,
"step": 800
},
{
"epoch": 0.48379871584291473,
"grad_norm": 0.0628654807806015,
"learning_rate": 7.909244561260855e-05,
"loss": 1.6722,
"step": 810
},
{
"epoch": 0.4897715394952964,
"grad_norm": 0.07348821312189102,
"learning_rate": 7.903260195877184e-05,
"loss": 1.6718,
"step": 820
},
{
"epoch": 0.49574436314767806,
"grad_norm": 0.0689951702952385,
"learning_rate": 7.897087187440512e-05,
"loss": 1.6658,
"step": 830
},
{
"epoch": 0.5017171868000597,
"grad_norm": 0.05663711205124855,
"learning_rate": 7.890725834289946e-05,
"loss": 1.6636,
"step": 840
},
{
"epoch": 0.5076900104524414,
"grad_norm": 0.050597622990608215,
"learning_rate": 7.884176443867219e-05,
"loss": 1.6648,
"step": 850
},
{
"epoch": 0.513662834104823,
"grad_norm": 0.05792626738548279,
"learning_rate": 7.87743933270183e-05,
"loss": 1.6582,
"step": 860
},
{
"epoch": 0.5196356577572048,
"grad_norm": 0.05193015933036804,
"learning_rate": 7.870514826395755e-05,
"loss": 1.664,
"step": 870
},
{
"epoch": 0.5256084814095864,
"grad_norm": 0.05836218595504761,
"learning_rate": 7.863403259607698e-05,
"loss": 1.6535,
"step": 880
},
{
"epoch": 0.531581305061968,
"grad_norm": 0.08420410752296448,
"learning_rate": 7.856104976036928e-05,
"loss": 1.6463,
"step": 890
},
{
"epoch": 0.5375541287143497,
"grad_norm": 0.06460799276828766,
"learning_rate": 7.848620328406663e-05,
"loss": 1.6615,
"step": 900
},
{
"epoch": 0.5435269523667313,
"grad_norm": 0.08191855251789093,
"learning_rate": 7.840949678447022e-05,
"loss": 1.6529,
"step": 910
},
{
"epoch": 0.549499776019113,
"grad_norm": 0.04835124313831329,
"learning_rate": 7.833093396877546e-05,
"loss": 1.6508,
"step": 920
},
{
"epoch": 0.5554725996714946,
"grad_norm": 0.047752317041158676,
"learning_rate": 7.82505186338928e-05,
"loss": 1.6484,
"step": 930
},
{
"epoch": 0.5614454233238764,
"grad_norm": 0.054417744278907776,
"learning_rate": 7.816825466626419e-05,
"loss": 1.6443,
"step": 940
},
{
"epoch": 0.567418246976258,
"grad_norm": 0.0538078136742115,
"learning_rate": 7.808414604167537e-05,
"loss": 1.6422,
"step": 950
},
{
"epoch": 0.5733910706286397,
"grad_norm": 0.04438367858529091,
"learning_rate": 7.799819682506353e-05,
"loss": 1.6443,
"step": 960
},
{
"epoch": 0.5793638942810213,
"grad_norm": 0.056033167988061905,
"learning_rate": 7.791041117032102e-05,
"loss": 1.6428,
"step": 970
},
{
"epoch": 0.585336717933403,
"grad_norm": 0.07095460593700409,
"learning_rate": 7.782079332009454e-05,
"loss": 1.6425,
"step": 980
},
{
"epoch": 0.5913095415857846,
"grad_norm": 0.05874691903591156,
"learning_rate": 7.772934760558005e-05,
"loss": 1.6346,
"step": 990
},
{
"epoch": 0.5972823652381664,
"grad_norm": 0.0521966814994812,
"learning_rate": 7.76360784463135e-05,
"loss": 1.6359,
"step": 1000
},
{
"epoch": 0.5972823652381664,
"eval_loss": 1.634853482246399,
"eval_runtime": 28.9256,
"eval_samples_per_second": 1197.311,
"eval_steps_per_second": 9.369,
"step": 1000
},
{
"epoch": 0.603255188890548,
"grad_norm": 0.052664998918771744,
"learning_rate": 7.754099034995727e-05,
"loss": 1.6383,
"step": 1010
},
{
"epoch": 0.6092280125429297,
"grad_norm": 0.08000710606575012,
"learning_rate": 7.744408791208214e-05,
"loss": 1.639,
"step": 1020
},
{
"epoch": 0.6152008361953113,
"grad_norm": 0.05873206630349159,
"learning_rate": 7.734537581594545e-05,
"loss": 1.632,
"step": 1030
},
{
"epoch": 0.621173659847693,
"grad_norm": 0.06116827204823494,
"learning_rate": 7.724485883226454e-05,
"loss": 1.6351,
"step": 1040
},
{
"epoch": 0.6271464835000746,
"grad_norm": 0.057659681886434555,
"learning_rate": 7.714254181898627e-05,
"loss": 1.637,
"step": 1050
},
{
"epoch": 0.6331193071524563,
"grad_norm": 0.05905848369002342,
"learning_rate": 7.703842972105228e-05,
"loss": 1.626,
"step": 1060
},
{
"epoch": 0.639092130804838,
"grad_norm": 0.0539986751973629,
"learning_rate": 7.693252757015991e-05,
"loss": 1.6278,
"step": 1070
},
{
"epoch": 0.6450649544572197,
"grad_norm": 0.062365371733903885,
"learning_rate": 7.682484048451908e-05,
"loss": 1.6187,
"step": 1080
},
{
"epoch": 0.6510377781096013,
"grad_norm": 0.0486634224653244,
"learning_rate": 7.671537366860494e-05,
"loss": 1.6223,
"step": 1090
},
{
"epoch": 0.657010601761983,
"grad_norm": 0.04700983688235283,
"learning_rate": 7.660413241290626e-05,
"loss": 1.6237,
"step": 1100
},
{
"epoch": 0.6629834254143646,
"grad_norm": 0.06423746794462204,
"learning_rate": 7.649112209366985e-05,
"loss": 1.6349,
"step": 1110
},
{
"epoch": 0.6689562490667463,
"grad_norm": 0.05183717608451843,
"learning_rate": 7.637634817264064e-05,
"loss": 1.6203,
"step": 1120
},
{
"epoch": 0.6749290727191279,
"grad_norm": 0.05448286980390549,
"learning_rate": 7.625981619679777e-05,
"loss": 1.6159,
"step": 1130
},
{
"epoch": 0.6809018963715097,
"grad_norm": 0.06012860685586929,
"learning_rate": 7.61415317980865e-05,
"loss": 1.6106,
"step": 1140
},
{
"epoch": 0.6868747200238913,
"grad_norm": 0.0491897277534008,
"learning_rate": 7.602150069314598e-05,
"loss": 1.613,
"step": 1150
},
{
"epoch": 0.692847543676273,
"grad_norm": 0.05050448700785637,
"learning_rate": 7.589972868303301e-05,
"loss": 1.6158,
"step": 1160
},
{
"epoch": 0.6988203673286546,
"grad_norm": 0.05027921870350838,
"learning_rate": 7.577622165294165e-05,
"loss": 1.6166,
"step": 1170
},
{
"epoch": 0.7047931909810363,
"grad_norm": 0.061239466071128845,
"learning_rate": 7.565098557191882e-05,
"loss": 1.607,
"step": 1180
},
{
"epoch": 0.7107660146334179,
"grad_norm": 0.04995877295732498,
"learning_rate": 7.552402649257578e-05,
"loss": 1.6152,
"step": 1190
},
{
"epoch": 0.7167388382857997,
"grad_norm": 0.04830503091216087,
"learning_rate": 7.539535055079569e-05,
"loss": 1.613,
"step": 1200
},
{
"epoch": 0.7227116619381813,
"grad_norm": 0.05787483602762222,
"learning_rate": 7.526496396543691e-05,
"loss": 1.614,
"step": 1210
},
{
"epoch": 0.728684485590563,
"grad_norm": 0.07437578588724136,
"learning_rate": 7.513287303803263e-05,
"loss": 1.6127,
"step": 1220
},
{
"epoch": 0.7346573092429446,
"grad_norm": 0.06587845832109451,
"learning_rate": 7.499908415248616e-05,
"loss": 1.6015,
"step": 1230
},
{
"epoch": 0.7406301328953263,
"grad_norm": 0.0692521184682846,
"learning_rate": 7.486360377476255e-05,
"loss": 1.6026,
"step": 1240
},
{
"epoch": 0.7466029565477079,
"grad_norm": 0.061289019882678986,
"learning_rate": 7.472643845257592e-05,
"loss": 1.6108,
"step": 1250
},
{
"epoch": 0.7525757802000896,
"grad_norm": 0.056076616048812866,
"learning_rate": 7.458759481507318e-05,
"loss": 1.6018,
"step": 1260
},
{
"epoch": 0.7585486038524712,
"grad_norm": 0.06620051711797714,
"learning_rate": 7.444707957251354e-05,
"loss": 1.6048,
"step": 1270
},
{
"epoch": 0.764521427504853,
"grad_norm": 0.05557152256369591,
"learning_rate": 7.430489951594422e-05,
"loss": 1.6091,
"step": 1280
},
{
"epoch": 0.7704942511572346,
"grad_norm": 0.04953812435269356,
"learning_rate": 7.416106151687224e-05,
"loss": 1.6026,
"step": 1290
},
{
"epoch": 0.7764670748096163,
"grad_norm": 0.042427971959114075,
"learning_rate": 7.40155725269324e-05,
"loss": 1.5983,
"step": 1300
},
{
"epoch": 0.7824398984619979,
"grad_norm": 0.05906856432557106,
"learning_rate": 7.386843957755123e-05,
"loss": 1.6008,
"step": 1310
},
{
"epoch": 0.7884127221143796,
"grad_norm": 0.04983474314212799,
"learning_rate": 7.371966977960713e-05,
"loss": 1.5973,
"step": 1320
},
{
"epoch": 0.7943855457667612,
"grad_norm": 0.0590224526822567,
"learning_rate": 7.356927032308682e-05,
"loss": 1.6011,
"step": 1330
},
{
"epoch": 0.8003583694191428,
"grad_norm": 0.057693641632795334,
"learning_rate": 7.341724847673775e-05,
"loss": 1.5942,
"step": 1340
},
{
"epoch": 0.8063311930715246,
"grad_norm": 0.040723856538534164,
"learning_rate": 7.326361158771688e-05,
"loss": 1.6011,
"step": 1350
},
{
"epoch": 0.8123040167239062,
"grad_norm": 0.05768086016178131,
"learning_rate": 7.31083670812355e-05,
"loss": 1.5999,
"step": 1360
},
{
"epoch": 0.8182768403762879,
"grad_norm": 0.06345749646425247,
"learning_rate": 7.29515224602005e-05,
"loss": 1.5985,
"step": 1370
},
{
"epoch": 0.8242496640286695,
"grad_norm": 0.06176001578569412,
"learning_rate": 7.27930853048516e-05,
"loss": 1.5971,
"step": 1380
},
{
"epoch": 0.8302224876810512,
"grad_norm": 0.05247745290398598,
"learning_rate": 7.263306327239516e-05,
"loss": 1.5958,
"step": 1390
},
{
"epoch": 0.8361953113334328,
"grad_norm": 0.05218351632356644,
"learning_rate": 7.247146409663401e-05,
"loss": 1.5981,
"step": 1400
},
{
"epoch": 0.8421681349858146,
"grad_norm": 0.0629679337143898,
"learning_rate": 7.23082955875937e-05,
"loss": 1.5949,
"step": 1410
},
{
"epoch": 0.8481409586381962,
"grad_norm": 0.061205677688121796,
"learning_rate": 7.214356563114505e-05,
"loss": 1.5957,
"step": 1420
},
{
"epoch": 0.8541137822905779,
"grad_norm": 0.06122026965022087,
"learning_rate": 7.197728218862306e-05,
"loss": 1.5911,
"step": 1430
},
{
"epoch": 0.8600866059429595,
"grad_norm": 0.054293327033519745,
"learning_rate": 7.180945329644204e-05,
"loss": 1.5885,
"step": 1440
},
{
"epoch": 0.8660594295953412,
"grad_norm": 0.04569542035460472,
"learning_rate": 7.164008706570736e-05,
"loss": 1.5893,
"step": 1450
},
{
"epoch": 0.8720322532477228,
"grad_norm": 0.04415179416537285,
"learning_rate": 7.146919168182333e-05,
"loss": 1.5951,
"step": 1460
},
{
"epoch": 0.8780050769001045,
"grad_norm": 0.052418701350688934,
"learning_rate": 7.129677540409762e-05,
"loss": 1.5999,
"step": 1470
},
{
"epoch": 0.8839779005524862,
"grad_norm": 0.053583066910505295,
"learning_rate": 7.112284656534215e-05,
"loss": 1.5979,
"step": 1480
},
{
"epoch": 0.8899507242048679,
"grad_norm": 0.06733547151088715,
"learning_rate": 7.09474135714703e-05,
"loss": 1.5871,
"step": 1490
},
{
"epoch": 0.8959235478572495,
"grad_norm": 0.05455510690808296,
"learning_rate": 7.07704849010907e-05,
"loss": 1.5912,
"step": 1500
},
{
"epoch": 0.9018963715096312,
"grad_norm": 0.05950945243239403,
"learning_rate": 7.059206910509745e-05,
"loss": 1.5958,
"step": 1510
},
{
"epoch": 0.9078691951620128,
"grad_norm": 0.0513860359787941,
"learning_rate": 7.041217480625683e-05,
"loss": 1.5856,
"step": 1520
},
{
"epoch": 0.9138420188143945,
"grad_norm": 0.05268612131476402,
"learning_rate": 7.023081069879062e-05,
"loss": 1.5846,
"step": 1530
},
{
"epoch": 0.9198148424667761,
"grad_norm": 0.05923028290271759,
"learning_rate": 7.004798554795586e-05,
"loss": 1.5739,
"step": 1540
},
{
"epoch": 0.9257876661191579,
"grad_norm": 0.04859180748462677,
"learning_rate": 6.986370818962125e-05,
"loss": 1.5927,
"step": 1550
},
{
"epoch": 0.9317604897715395,
"grad_norm": 0.060852836817502975,
"learning_rate": 6.967798752984012e-05,
"loss": 1.5769,
"step": 1560
},
{
"epoch": 0.9377333134239212,
"grad_norm": 0.053088609129190445,
"learning_rate": 6.949083254442001e-05,
"loss": 1.5845,
"step": 1570
},
{
"epoch": 0.9437061370763028,
"grad_norm": 0.06042907387018204,
"learning_rate": 6.930225227848887e-05,
"loss": 1.5808,
"step": 1580
},
{
"epoch": 0.9496789607286845,
"grad_norm": 0.05746331810951233,
"learning_rate": 6.911225584605787e-05,
"loss": 1.5821,
"step": 1590
},
{
"epoch": 0.9556517843810661,
"grad_norm": 0.04398033022880554,
"learning_rate": 6.892085242958098e-05,
"loss": 1.5775,
"step": 1600
},
{
"epoch": 0.9616246080334478,
"grad_norm": 0.050728365778923035,
"learning_rate": 6.872805127951115e-05,
"loss": 1.5749,
"step": 1610
},
{
"epoch": 0.9675974316858295,
"grad_norm": 0.0519120879471302,
"learning_rate": 6.85338617138533e-05,
"loss": 1.5726,
"step": 1620
},
{
"epoch": 0.9735702553382112,
"grad_norm": 0.052526745945215225,
"learning_rate": 6.833829311771388e-05,
"loss": 1.5793,
"step": 1630
},
{
"epoch": 0.9795430789905928,
"grad_norm": 0.050527602434158325,
"learning_rate": 6.814135494284735e-05,
"loss": 1.5694,
"step": 1640
},
{
"epoch": 0.9855159026429745,
"grad_norm": 0.08685663342475891,
"learning_rate": 6.794305670719945e-05,
"loss": 1.5803,
"step": 1650
},
{
"epoch": 0.9914887262953561,
"grad_norm": 0.054428499191999435,
"learning_rate": 6.774340799444703e-05,
"loss": 1.5757,
"step": 1660
},
{
"epoch": 0.9974615499477378,
"grad_norm": 0.05870772898197174,
"learning_rate": 6.754241845353506e-05,
"loss": 1.571,
"step": 1670
},
{
"epoch": 1.0034343736001194,
"grad_norm": 0.05581633001565933,
"learning_rate": 6.734009779821018e-05,
"loss": 1.5659,
"step": 1680
},
{
"epoch": 1.0094071972525012,
"grad_norm": 0.05493481829762459,
"learning_rate": 6.713645580655125e-05,
"loss": 1.5686,
"step": 1690
},
{
"epoch": 1.0153800209048829,
"grad_norm": 0.05471092462539673,
"learning_rate": 6.693150232049686e-05,
"loss": 1.5649,
"step": 1700
},
{
"epoch": 1.0213528445572644,
"grad_norm": 0.053526680916547775,
"learning_rate": 6.672524724536956e-05,
"loss": 1.5671,
"step": 1710
},
{
"epoch": 1.027325668209646,
"grad_norm": 0.06532900780439377,
"learning_rate": 6.651770054939722e-05,
"loss": 1.5614,
"step": 1720
},
{
"epoch": 1.0332984918620278,
"grad_norm": 0.051929574459791183,
"learning_rate": 6.630887226323128e-05,
"loss": 1.556,
"step": 1730
},
{
"epoch": 1.0392713155144095,
"grad_norm": 0.06289497762918472,
"learning_rate": 6.609877247946186e-05,
"loss": 1.5634,
"step": 1740
},
{
"epoch": 1.045244139166791,
"grad_norm": 0.05371445044875145,
"learning_rate": 6.588741135213012e-05,
"loss": 1.5645,
"step": 1750
},
{
"epoch": 1.0512169628191728,
"grad_norm": 0.04851632937788963,
"learning_rate": 6.567479909623746e-05,
"loss": 1.5648,
"step": 1760
},
{
"epoch": 1.0571897864715545,
"grad_norm": 0.06357111036777496,
"learning_rate": 6.546094598725186e-05,
"loss": 1.5568,
"step": 1770
},
{
"epoch": 1.063162610123936,
"grad_norm": 0.07035905867815018,
"learning_rate": 6.524586236061117e-05,
"loss": 1.5519,
"step": 1780
},
{
"epoch": 1.0691354337763177,
"grad_norm": 0.05517163127660751,
"learning_rate": 6.502955861122377e-05,
"loss": 1.5566,
"step": 1790
},
{
"epoch": 1.0751082574286994,
"grad_norm": 0.0504322424530983,
"learning_rate": 6.481204519296606e-05,
"loss": 1.5668,
"step": 1800
},
{
"epoch": 1.0810810810810811,
"grad_norm": 0.051910221576690674,
"learning_rate": 6.459333261817726e-05,
"loss": 1.5585,
"step": 1810
},
{
"epoch": 1.0870539047334629,
"grad_norm": 0.07319536805152893,
"learning_rate": 6.43734314571514e-05,
"loss": 1.5599,
"step": 1820
},
{
"epoch": 1.0930267283858444,
"grad_norm": 0.05212223529815674,
"learning_rate": 6.415235233762635e-05,
"loss": 1.5597,
"step": 1830
},
{
"epoch": 1.098999552038226,
"grad_norm": 0.05524059012532234,
"learning_rate": 6.393010594427034e-05,
"loss": 1.5449,
"step": 1840
},
{
"epoch": 1.1049723756906078,
"grad_norm": 0.044485364109277725,
"learning_rate": 6.370670301816544e-05,
"loss": 1.5584,
"step": 1850
},
{
"epoch": 1.1109451993429893,
"grad_norm": 0.04716966673731804,
"learning_rate": 6.348215435628852e-05,
"loss": 1.5577,
"step": 1860
},
{
"epoch": 1.116918022995371,
"grad_norm": 0.04776601493358612,
"learning_rate": 6.32564708109894e-05,
"loss": 1.5597,
"step": 1870
},
{
"epoch": 1.1228908466477527,
"grad_norm": 0.05379948392510414,
"learning_rate": 6.302966328946638e-05,
"loss": 1.5542,
"step": 1880
},
{
"epoch": 1.1288636703001345,
"grad_norm": 0.05076327919960022,
"learning_rate": 6.280174275323915e-05,
"loss": 1.5564,
"step": 1890
},
{
"epoch": 1.134836493952516,
"grad_norm": 0.0562434047460556,
"learning_rate": 6.257272021761884e-05,
"loss": 1.5597,
"step": 1900
},
{
"epoch": 1.1408093176048977,
"grad_norm": 0.045845337212085724,
"learning_rate": 6.234260675117595e-05,
"loss": 1.5535,
"step": 1910
},
{
"epoch": 1.1467821412572794,
"grad_norm": 0.04580407217144966,
"learning_rate": 6.21114134752051e-05,
"loss": 1.5486,
"step": 1920
},
{
"epoch": 1.1527549649096611,
"grad_norm": 0.05752042680978775,
"learning_rate": 6.187915156318775e-05,
"loss": 1.5454,
"step": 1930
},
{
"epoch": 1.1587277885620426,
"grad_norm": 0.05608632043004036,
"learning_rate": 6.164583224025215e-05,
"loss": 1.5545,
"step": 1940
},
{
"epoch": 1.1647006122144243,
"grad_norm": 0.047604430466890335,
"learning_rate": 6.141146678263076e-05,
"loss": 1.5531,
"step": 1950
},
{
"epoch": 1.170673435866806,
"grad_norm": 0.04514037445187569,
"learning_rate": 6.117606651711537e-05,
"loss": 1.5547,
"step": 1960
},
{
"epoch": 1.1766462595191878,
"grad_norm": 0.05768571048974991,
"learning_rate": 6.0939642820509564e-05,
"loss": 1.5496,
"step": 1970
},
{
"epoch": 1.1826190831715693,
"grad_norm": 0.04222779721021652,
"learning_rate": 6.070220711907903e-05,
"loss": 1.5469,
"step": 1980
},
{
"epoch": 1.188591906823951,
"grad_norm": 0.05183190852403641,
"learning_rate": 6.046377088799923e-05,
"loss": 1.5526,
"step": 1990
},
{
"epoch": 1.1945647304763327,
"grad_norm": 0.04888539016246796,
"learning_rate": 6.0224345650800826e-05,
"loss": 1.5579,
"step": 2000
},
{
"epoch": 1.1945647304763327,
"eval_loss": 1.5546131134033203,
"eval_runtime": 20.1679,
"eval_samples_per_second": 1717.237,
"eval_steps_per_second": 13.437,
"step": 2000
},
{
"epoch": 1.2005375541287144,
"grad_norm": 0.049841009080410004,
"learning_rate": 5.998394297881277e-05,
"loss": 1.5531,
"step": 2010
},
{
"epoch": 1.206510377781096,
"grad_norm": 0.04911394044756889,
"learning_rate": 5.974257449060306e-05,
"loss": 1.5512,
"step": 2020
},
{
"epoch": 1.2124832014334777,
"grad_norm": 0.05170886963605881,
"learning_rate": 5.9500251851417206e-05,
"loss": 1.5439,
"step": 2030
},
{
"epoch": 1.2184560250858594,
"grad_norm": 0.04615171626210213,
"learning_rate": 5.925698677261449e-05,
"loss": 1.5453,
"step": 2040
},
{
"epoch": 1.224428848738241,
"grad_norm": 0.04724368825554848,
"learning_rate": 5.901279101110191e-05,
"loss": 1.5434,
"step": 2050
},
{
"epoch": 1.2304016723906226,
"grad_norm": 0.06991260498762131,
"learning_rate": 5.8767676368766016e-05,
"loss": 1.5489,
"step": 2060
},
{
"epoch": 1.2363744960430043,
"grad_norm": 0.055575910955667496,
"learning_rate": 5.852165469190251e-05,
"loss": 1.5514,
"step": 2070
},
{
"epoch": 1.242347319695386,
"grad_norm": 0.04874608293175697,
"learning_rate": 5.82747378706437e-05,
"loss": 1.5523,
"step": 2080
},
{
"epoch": 1.2483201433477678,
"grad_norm": 0.05960864573717117,
"learning_rate": 5.8026937838383914e-05,
"loss": 1.5469,
"step": 2090
},
{
"epoch": 1.2542929670001493,
"grad_norm": 0.07086056470870972,
"learning_rate": 5.77782665712027e-05,
"loss": 1.5497,
"step": 2100
},
{
"epoch": 1.260265790652531,
"grad_norm": 0.0472436398267746,
"learning_rate": 5.752873608728603e-05,
"loss": 1.5425,
"step": 2110
},
{
"epoch": 1.2662386143049127,
"grad_norm": 0.06843575835227966,
"learning_rate": 5.7278358446345545e-05,
"loss": 1.542,
"step": 2120
},
{
"epoch": 1.2722114379572944,
"grad_norm": 0.04991114139556885,
"learning_rate": 5.702714574903561e-05,
"loss": 1.5423,
"step": 2130
},
{
"epoch": 1.278184261609676,
"grad_norm": 0.04601559415459633,
"learning_rate": 5.6775110136368576e-05,
"loss": 1.5357,
"step": 2140
},
{
"epoch": 1.2841570852620576,
"grad_norm": 0.042647868394851685,
"learning_rate": 5.6522263789127937e-05,
"loss": 1.5386,
"step": 2150
},
{
"epoch": 1.2901299089144393,
"grad_norm": 0.06261768937110901,
"learning_rate": 5.626861892727969e-05,
"loss": 1.5428,
"step": 2160
},
{
"epoch": 1.2961027325668208,
"grad_norm": 0.04735434427857399,
"learning_rate": 5.601418780938175e-05,
"loss": 1.5395,
"step": 2170
},
{
"epoch": 1.3020755562192026,
"grad_norm": 0.048824459314346313,
"learning_rate": 5.575898273199146e-05,
"loss": 1.5418,
"step": 2180
},
{
"epoch": 1.3080483798715843,
"grad_norm": 0.04974917694926262,
"learning_rate": 5.5503016029071354e-05,
"loss": 1.5371,
"step": 2190
},
{
"epoch": 1.314021203523966,
"grad_norm": 0.05275791883468628,
"learning_rate": 5.5246300071392985e-05,
"loss": 1.5364,
"step": 2200
},
{
"epoch": 1.3199940271763477,
"grad_norm": 0.0487825907766819,
"learning_rate": 5.4988847265939146e-05,
"loss": 1.5436,
"step": 2210
},
{
"epoch": 1.3259668508287292,
"grad_norm": 0.06100558117032051,
"learning_rate": 5.473067005530416e-05,
"loss": 1.5351,
"step": 2220
},
{
"epoch": 1.331939674481111,
"grad_norm": 0.07098929584026337,
"learning_rate": 5.447178091709262e-05,
"loss": 1.5463,
"step": 2230
},
{
"epoch": 1.3379124981334927,
"grad_norm": 0.06729080528020859,
"learning_rate": 5.421219236331624e-05,
"loss": 1.5382,
"step": 2240
},
{
"epoch": 1.3438853217858742,
"grad_norm": 0.05485675856471062,
"learning_rate": 5.395191693978927e-05,
"loss": 1.5349,
"step": 2250
},
{
"epoch": 1.3498581454382559,
"grad_norm": 0.05816954746842384,
"learning_rate": 5.3690967225522076e-05,
"loss": 1.5406,
"step": 2260
},
{
"epoch": 1.3558309690906376,
"grad_norm": 0.044427741318941116,
"learning_rate": 5.342935583211327e-05,
"loss": 1.5309,
"step": 2270
},
{
"epoch": 1.3618037927430193,
"grad_norm": 0.05544894561171532,
"learning_rate": 5.31670954031401e-05,
"loss": 1.5365,
"step": 2280
},
{
"epoch": 1.367776616395401,
"grad_norm": 0.04774465411901474,
"learning_rate": 5.290419861354753e-05,
"loss": 1.5303,
"step": 2290
},
{
"epoch": 1.3737494400477825,
"grad_norm": 0.050910986959934235,
"learning_rate": 5.264067816903552e-05,
"loss": 1.5384,
"step": 2300
},
{
"epoch": 1.3797222637001643,
"grad_norm": 0.05830187723040581,
"learning_rate": 5.2376546805445054e-05,
"loss": 1.535,
"step": 2310
},
{
"epoch": 1.385695087352546,
"grad_norm": 0.0521889254450798,
"learning_rate": 5.211181728814262e-05,
"loss": 1.5348,
"step": 2320
},
{
"epoch": 1.3916679110049275,
"grad_norm": 0.04742933064699173,
"learning_rate": 5.18465024114032e-05,
"loss": 1.5421,
"step": 2330
},
{
"epoch": 1.3976407346573092,
"grad_norm": 0.05169609189033508,
"learning_rate": 5.158061499779201e-05,
"loss": 1.5322,
"step": 2340
},
{
"epoch": 1.403613558309691,
"grad_norm": 0.05307742580771446,
"learning_rate": 5.131416789754472e-05,
"loss": 1.538,
"step": 2350
},
{
"epoch": 1.4095863819620726,
"grad_norm": 0.04581635445356369,
"learning_rate": 5.1047173987946474e-05,
"loss": 1.5313,
"step": 2360
},
{
"epoch": 1.4155592056144544,
"grad_norm": 0.04794102534651756,
"learning_rate": 5.077964617270947e-05,
"loss": 1.5357,
"step": 2370
},
{
"epoch": 1.4215320292668359,
"grad_norm": 0.043038323521614075,
"learning_rate": 5.051159738134937e-05,
"loss": 1.5362,
"step": 2380
},
{
"epoch": 1.4275048529192176,
"grad_norm": 0.052804794162511826,
"learning_rate": 5.024304056856039e-05,
"loss": 1.5299,
"step": 2390
},
{
"epoch": 1.4334776765715993,
"grad_norm": 0.051046222448349,
"learning_rate": 4.997398871358928e-05,
"loss": 1.529,
"step": 2400
},
{
"epoch": 1.4394505002239808,
"grad_norm": 0.056139182299375534,
"learning_rate": 4.970445481960793e-05,
"loss": 1.5368,
"step": 2410
},
{
"epoch": 1.4454233238763625,
"grad_norm": 0.04890932887792587,
"learning_rate": 4.9434451913085e-05,
"loss": 1.5308,
"step": 2420
},
{
"epoch": 1.4513961475287442,
"grad_norm": 0.04679281637072563,
"learning_rate": 4.916399304315636e-05,
"loss": 1.5353,
"step": 2430
},
{
"epoch": 1.457368971181126,
"grad_norm": 0.05536729097366333,
"learning_rate": 4.8893091280994415e-05,
"loss": 1.5314,
"step": 2440
},
{
"epoch": 1.4633417948335075,
"grad_norm": 0.04933058097958565,
"learning_rate": 4.862175971917637e-05,
"loss": 1.5301,
"step": 2450
},
{
"epoch": 1.4693146184858892,
"grad_norm": 0.05884556844830513,
"learning_rate": 4.835001147105148e-05,
"loss": 1.5213,
"step": 2460
},
{
"epoch": 1.475287442138271,
"grad_norm": 0.04465237259864807,
"learning_rate": 4.807785967010729e-05,
"loss": 1.5288,
"step": 2470
},
{
"epoch": 1.4812602657906524,
"grad_norm": 0.04548431187868118,
"learning_rate": 4.780531746933491e-05,
"loss": 1.5353,
"step": 2480
},
{
"epoch": 1.4872330894430341,
"grad_norm": 0.047798071056604385,
"learning_rate": 4.7532398040593295e-05,
"loss": 1.5261,
"step": 2490
},
{
"epoch": 1.4932059130954158,
"grad_norm": 0.05616561323404312,
"learning_rate": 4.7259114573972715e-05,
"loss": 1.5343,
"step": 2500
},
{
"epoch": 1.4991787367477976,
"grad_norm": 0.053861986845731735,
"learning_rate": 4.6985480277157215e-05,
"loss": 1.5249,
"step": 2510
},
{
"epoch": 1.5051515604001793,
"grad_norm": 0.05890486761927605,
"learning_rate": 4.671150837478634e-05,
"loss": 1.5357,
"step": 2520
},
{
"epoch": 1.511124384052561,
"grad_norm": 0.056382015347480774,
"learning_rate": 4.643721210781601e-05,
"loss": 1.5159,
"step": 2530
},
{
"epoch": 1.5170972077049425,
"grad_norm": 0.051396943628787994,
"learning_rate": 4.6162604732878515e-05,
"loss": 1.5301,
"step": 2540
},
{
"epoch": 1.5230700313573242,
"grad_norm": 0.04754629358649254,
"learning_rate": 4.588769952164191e-05,
"loss": 1.5277,
"step": 2550
},
{
"epoch": 1.5290428550097057,
"grad_norm": 0.0532587394118309,
"learning_rate": 4.561250976016851e-05,
"loss": 1.5201,
"step": 2560
},
{
"epoch": 1.5350156786620874,
"grad_norm": 0.059257134795188904,
"learning_rate": 4.5337048748272905e-05,
"loss": 1.5265,
"step": 2570
},
{
"epoch": 1.5409885023144692,
"grad_norm": 0.05495699495077133,
"learning_rate": 4.5061329798879064e-05,
"loss": 1.5247,
"step": 2580
},
{
"epoch": 1.5469613259668509,
"grad_norm": 0.04833153635263443,
"learning_rate": 4.478536623737699e-05,
"loss": 1.5291,
"step": 2590
},
{
"epoch": 1.5529341496192326,
"grad_norm": 0.048605091869831085,
"learning_rate": 4.450917140097869e-05,
"loss": 1.5277,
"step": 2600
},
{
"epoch": 1.5589069732716143,
"grad_norm": 0.06368768960237503,
"learning_rate": 4.4232758638073585e-05,
"loss": 1.5306,
"step": 2610
},
{
"epoch": 1.5648797969239958,
"grad_norm": 0.04569351673126221,
"learning_rate": 4.395614130758344e-05,
"loss": 1.5208,
"step": 2620
},
{
"epoch": 1.5708526205763775,
"grad_norm": 0.07877717167139053,
"learning_rate": 4.367933277831666e-05,
"loss": 1.5152,
"step": 2630
},
{
"epoch": 1.576825444228759,
"grad_norm": 0.05059320852160454,
"learning_rate": 4.34023464283222e-05,
"loss": 1.5199,
"step": 2640
},
{
"epoch": 1.5827982678811408,
"grad_norm": 0.05248813331127167,
"learning_rate": 4.312519564424306e-05,
"loss": 1.5236,
"step": 2650
},
{
"epoch": 1.5887710915335225,
"grad_norm": 0.051895346492528915,
"learning_rate": 4.2847893820669244e-05,
"loss": 1.5225,
"step": 2660
},
{
"epoch": 1.5947439151859042,
"grad_norm": 0.048129428178071976,
"learning_rate": 4.2570454359490455e-05,
"loss": 1.5259,
"step": 2670
},
{
"epoch": 1.600716738838286,
"grad_norm": 0.049009375274181366,
"learning_rate": 4.2292890669248364e-05,
"loss": 1.533,
"step": 2680
},
{
"epoch": 1.6066895624906674,
"grad_norm": 0.05925741046667099,
"learning_rate": 4.2015216164488575e-05,
"loss": 1.5242,
"step": 2690
},
{
"epoch": 1.6126623861430491,
"grad_norm": 0.051209457218647,
"learning_rate": 4.173744426511231e-05,
"loss": 1.5348,
"step": 2700
},
{
"epoch": 1.6186352097954306,
"grad_norm": 0.04731997102499008,
"learning_rate": 4.1459588395727876e-05,
"loss": 1.5179,
"step": 2710
},
{
"epoch": 1.6246080334478123,
"grad_norm": 0.04640951007604599,
"learning_rate": 4.118166198500178e-05,
"loss": 1.5218,
"step": 2720
},
{
"epoch": 1.630580857100194,
"grad_norm": 0.05060356855392456,
"learning_rate": 4.090367846500976e-05,
"loss": 1.5184,
"step": 2730
},
{
"epoch": 1.6365536807525758,
"grad_norm": 0.04525948315858841,
"learning_rate": 4.062565127058764e-05,
"loss": 1.5207,
"step": 2740
},
{
"epoch": 1.6425265044049575,
"grad_norm": 0.0447864904999733,
"learning_rate": 4.0347593838682016e-05,
"loss": 1.5265,
"step": 2750
},
{
"epoch": 1.6484993280573392,
"grad_norm": 0.06339412927627563,
"learning_rate": 4.006951960770084e-05,
"loss": 1.5296,
"step": 2760
},
{
"epoch": 1.6544721517097207,
"grad_norm": 0.05479173734784126,
"learning_rate": 3.979144201686396e-05,
"loss": 1.5167,
"step": 2770
},
{
"epoch": 1.6604449753621024,
"grad_norm": 0.05605393648147583,
"learning_rate": 3.951337450555361e-05,
"loss": 1.5208,
"step": 2780
},
{
"epoch": 1.666417799014484,
"grad_norm": 0.04500933736562729,
"learning_rate": 3.923533051266486e-05,
"loss": 1.5199,
"step": 2790
},
{
"epoch": 1.6723906226668657,
"grad_norm": 0.044439464807510376,
"learning_rate": 3.8957323475956165e-05,
"loss": 1.5254,
"step": 2800
},
{
"epoch": 1.6783634463192474,
"grad_norm": 0.051942795515060425,
"learning_rate": 3.867936683139991e-05,
"loss": 1.5168,
"step": 2810
},
{
"epoch": 1.684336269971629,
"grad_norm": 0.05696643143892288,
"learning_rate": 3.840147401253305e-05,
"loss": 1.5261,
"step": 2820
},
{
"epoch": 1.6903090936240108,
"grad_norm": 0.0423273928463459,
"learning_rate": 3.812365844980782e-05,
"loss": 1.5166,
"step": 2830
},
{
"epoch": 1.6962819172763925,
"grad_norm": 0.04251600056886673,
"learning_rate": 3.784593356994275e-05,
"loss": 1.514,
"step": 2840
},
{
"epoch": 1.702254740928774,
"grad_norm": 0.06778108328580856,
"learning_rate": 3.7568312795273675e-05,
"loss": 1.5161,
"step": 2850
},
{
"epoch": 1.7082275645811558,
"grad_norm": 0.046843383461236954,
"learning_rate": 3.729080954310509e-05,
"loss": 1.5215,
"step": 2860
},
{
"epoch": 1.7142003882335373,
"grad_norm": 0.04683705046772957,
"learning_rate": 3.701343722506164e-05,
"loss": 1.5191,
"step": 2870
},
{
"epoch": 1.720173211885919,
"grad_norm": 0.04883548244833946,
"learning_rate": 3.673620924644e-05,
"loss": 1.5175,
"step": 2880
},
{
"epoch": 1.7261460355383007,
"grad_norm": 0.047556836158037186,
"learning_rate": 3.6459139005560966e-05,
"loss": 1.5191,
"step": 2890
},
{
"epoch": 1.7321188591906824,
"grad_norm": 0.04096701368689537,
"learning_rate": 3.618223989312195e-05,
"loss": 1.5195,
"step": 2900
},
{
"epoch": 1.7380916828430641,
"grad_norm": 0.043791547417640686,
"learning_rate": 3.590552529154974e-05,
"loss": 1.5149,
"step": 2910
},
{
"epoch": 1.7440645064954459,
"grad_norm": 0.06429862976074219,
"learning_rate": 3.562900857435384e-05,
"loss": 1.5136,
"step": 2920
},
{
"epoch": 1.7500373301478274,
"grad_norm": 0.04811246693134308,
"learning_rate": 3.535270310548007e-05,
"loss": 1.5178,
"step": 2930
},
{
"epoch": 1.756010153800209,
"grad_norm": 0.05720449239015579,
"learning_rate": 3.5076622238664675e-05,
"loss": 1.5112,
"step": 2940
},
{
"epoch": 1.7619829774525906,
"grad_norm": 0.04717197269201279,
"learning_rate": 3.480077931678899e-05,
"loss": 1.5147,
"step": 2950
},
{
"epoch": 1.7679558011049723,
"grad_norm": 0.04889809712767601,
"learning_rate": 3.452518767123456e-05,
"loss": 1.5186,
"step": 2960
},
{
"epoch": 1.773928624757354,
"grad_norm": 0.055686600506305695,
"learning_rate": 3.424986062123883e-05,
"loss": 1.5105,
"step": 2970
},
{
"epoch": 1.7799014484097357,
"grad_norm": 0.045671623200178146,
"learning_rate": 3.397481147325146e-05,
"loss": 1.5236,
"step": 2980
},
{
"epoch": 1.7858742720621175,
"grad_norm": 0.0518915057182312,
"learning_rate": 3.370005352029122e-05,
"loss": 1.5082,
"step": 2990
},
{
"epoch": 1.7918470957144992,
"grad_norm": 0.0466337613761425,
"learning_rate": 3.342560004130351e-05,
"loss": 1.5246,
"step": 3000
},
{
"epoch": 1.7918470957144992,
"eval_loss": 1.5170252323150635,
"eval_runtime": 20.1093,
"eval_samples_per_second": 1722.235,
"eval_steps_per_second": 13.476,
"step": 3000
},
{
"epoch": 1.7978199193668807,
"grad_norm": 0.04238193854689598,
"learning_rate": 3.3151464300518634e-05,
"loss": 1.5097,
"step": 3010
},
{
"epoch": 1.8037927430192624,
"grad_norm": 0.050784409046173096,
"learning_rate": 3.2877659546810745e-05,
"loss": 1.5195,
"step": 3020
},
{
"epoch": 1.809765566671644,
"grad_norm": 0.04055749997496605,
"learning_rate": 3.260419901305751e-05,
"loss": 1.5171,
"step": 3030
},
{
"epoch": 1.8157383903240256,
"grad_norm": 0.05311364307999611,
"learning_rate": 3.2331095915500564e-05,
"loss": 1.5136,
"step": 3040
},
{
"epoch": 1.8217112139764073,
"grad_norm": 0.0499190054833889,
"learning_rate": 3.205836345310681e-05,
"loss": 1.5081,
"step": 3050
},
{
"epoch": 1.827684037628789,
"grad_norm": 0.056762441992759705,
"learning_rate": 3.178601480693048e-05,
"loss": 1.5243,
"step": 3060
},
{
"epoch": 1.8336568612811708,
"grad_norm": 0.04753740131855011,
"learning_rate": 3.151406313947615e-05,
"loss": 1.5069,
"step": 3070
},
{
"epoch": 1.8396296849335525,
"grad_norm": 0.054608915001153946,
"learning_rate": 3.124252159406251e-05,
"loss": 1.5172,
"step": 3080
},
{
"epoch": 1.845602508585934,
"grad_norm": 0.04840042069554329,
"learning_rate": 3.097140329418726e-05,
"loss": 1.5126,
"step": 3090
},
{
"epoch": 1.8515753322383157,
"grad_norm": 0.05584624037146568,
"learning_rate": 3.07007213428928e-05,
"loss": 1.5091,
"step": 3100
},
{
"epoch": 1.8575481558906972,
"grad_norm": 0.0425049252808094,
"learning_rate": 3.0430488822132957e-05,
"loss": 1.5155,
"step": 3110
},
{
"epoch": 1.863520979543079,
"grad_norm": 0.043588876724243164,
"learning_rate": 3.016071879214077e-05,
"loss": 1.5099,
"step": 3120
},
{
"epoch": 1.8694938031954607,
"grad_norm": 0.041503310203552246,
"learning_rate": 2.989142429079725e-05,
"loss": 1.509,
"step": 3130
},
{
"epoch": 1.8754666268478424,
"grad_norm": 0.04797055944800377,
"learning_rate": 2.962261833300133e-05,
"loss": 1.507,
"step": 3140
},
{
"epoch": 1.881439450500224,
"grad_norm": 0.05003626272082329,
"learning_rate": 2.935431391004081e-05,
"loss": 1.5177,
"step": 3150
},
{
"epoch": 1.8874122741526056,
"grad_norm": 0.04475341737270355,
"learning_rate": 2.9086523988964478e-05,
"loss": 1.5077,
"step": 3160
},
{
"epoch": 1.8933850978049873,
"grad_norm": 0.04602671042084694,
"learning_rate": 2.881926151195547e-05,
"loss": 1.5037,
"step": 3170
},
{
"epoch": 1.8993579214573688,
"grad_norm": 0.04945210739970207,
"learning_rate": 2.855253939570578e-05,
"loss": 1.503,
"step": 3180
},
{
"epoch": 1.9053307451097505,
"grad_norm": 0.04730582609772682,
"learning_rate": 2.8286370530791914e-05,
"loss": 1.5064,
"step": 3190
},
{
"epoch": 1.9113035687621323,
"grad_norm": 0.05128956586122513,
"learning_rate": 2.8020767781052016e-05,
"loss": 1.5126,
"step": 3200
},
{
"epoch": 1.917276392414514,
"grad_norm": 0.055559854954481125,
"learning_rate": 2.7755743982964066e-05,
"loss": 1.5052,
"step": 3210
},
{
"epoch": 1.9232492160668957,
"grad_norm": 0.036298781633377075,
"learning_rate": 2.749131194502555e-05,
"loss": 1.5092,
"step": 3220
},
{
"epoch": 1.9292220397192774,
"grad_norm": 0.042619943618774414,
"learning_rate": 2.7227484447134398e-05,
"loss": 1.5044,
"step": 3230
},
{
"epoch": 1.935194863371659,
"grad_norm": 0.052806805819272995,
"learning_rate": 2.696427423997138e-05,
"loss": 1.5056,
"step": 3240
},
{
"epoch": 1.9411676870240406,
"grad_norm": 0.044467948377132416,
"learning_rate": 2.670169404438383e-05,
"loss": 1.5114,
"step": 3250
},
{
"epoch": 1.9471405106764221,
"grad_norm": 0.038638997822999954,
"learning_rate": 2.6439756550770872e-05,
"loss": 1.5154,
"step": 3260
},
{
"epoch": 1.9531133343288039,
"grad_norm": 0.04845379292964935,
"learning_rate": 2.617847441847007e-05,
"loss": 1.51,
"step": 3270
},
{
"epoch": 1.9590861579811856,
"grad_norm": 0.0445607528090477,
"learning_rate": 2.5917860275145658e-05,
"loss": 1.5047,
"step": 3280
},
{
"epoch": 1.9650589816335673,
"grad_norm": 0.045905206352472305,
"learning_rate": 2.5657926716178217e-05,
"loss": 1.5118,
"step": 3290
},
{
"epoch": 1.971031805285949,
"grad_norm": 0.04530317336320877,
"learning_rate": 2.539868630405594e-05,
"loss": 1.5099,
"step": 3300
},
{
"epoch": 1.9770046289383307,
"grad_norm": 0.04195258021354675,
"learning_rate": 2.5140151567767505e-05,
"loss": 1.5075,
"step": 3310
},
{
"epoch": 1.9829774525907122,
"grad_norm": 0.043815840035676956,
"learning_rate": 2.4882335002196553e-05,
"loss": 1.5096,
"step": 3320
},
{
"epoch": 1.988950276243094,
"grad_norm": 0.04683714732527733,
"learning_rate": 2.4625249067517803e-05,
"loss": 1.5057,
"step": 3330
},
{
"epoch": 1.9949230998954754,
"grad_norm": 0.049690209329128265,
"learning_rate": 2.4368906188594877e-05,
"loss": 1.5106,
"step": 3340
},
{
"epoch": 2.000895923547857,
"grad_norm": 0.048324376344680786,
"learning_rate": 2.4113318754379816e-05,
"loss": 1.5042,
"step": 3350
},
{
"epoch": 2.006868747200239,
"grad_norm": 0.05503029376268387,
"learning_rate": 2.385849911731426e-05,
"loss": 1.4922,
"step": 3360
},
{
"epoch": 2.0128415708526206,
"grad_norm": 0.049435921013355255,
"learning_rate": 2.360445959273255e-05,
"loss": 1.4962,
"step": 3370
},
{
"epoch": 2.0188143945050023,
"grad_norm": 0.05086649954319,
"learning_rate": 2.3351212458266512e-05,
"loss": 1.4918,
"step": 3380
},
{
"epoch": 2.024787218157384,
"grad_norm": 0.045887332409620285,
"learning_rate": 2.3098769953252002e-05,
"loss": 1.4868,
"step": 3390
},
{
"epoch": 2.0307600418097658,
"grad_norm": 0.04303443059325218,
"learning_rate": 2.2847144278137502e-05,
"loss": 1.4982,
"step": 3400
},
{
"epoch": 2.036732865462147,
"grad_norm": 0.043649692088365555,
"learning_rate": 2.2596347593894387e-05,
"loss": 1.5,
"step": 3410
},
{
"epoch": 2.0427056891145288,
"grad_norm": 0.04276139661669731,
"learning_rate": 2.2346392021429254e-05,
"loss": 1.4903,
"step": 3420
},
{
"epoch": 2.0486785127669105,
"grad_norm": 0.04298582300543785,
"learning_rate": 2.2097289640998074e-05,
"loss": 1.5032,
"step": 3430
},
{
"epoch": 2.054651336419292,
"grad_norm": 0.053750213235616684,
"learning_rate": 2.1849052491622374e-05,
"loss": 1.4942,
"step": 3440
},
{
"epoch": 2.060624160071674,
"grad_norm": 0.042636483907699585,
"learning_rate": 2.160169257050742e-05,
"loss": 1.4976,
"step": 3450
},
{
"epoch": 2.0665969837240556,
"grad_norm": 0.05124128982424736,
"learning_rate": 2.135522183246237e-05,
"loss": 1.4981,
"step": 3460
},
{
"epoch": 2.0725698073764374,
"grad_norm": 0.047978244721889496,
"learning_rate": 2.110965218932247e-05,
"loss": 1.4975,
"step": 3470
},
{
"epoch": 2.078542631028819,
"grad_norm": 0.045476969331502914,
"learning_rate": 2.0864995509373448e-05,
"loss": 1.4958,
"step": 3480
},
{
"epoch": 2.0845154546812004,
"grad_norm": 0.05264231190085411,
"learning_rate": 2.062126361677786e-05,
"loss": 1.4996,
"step": 3490
},
{
"epoch": 2.090488278333582,
"grad_norm": 0.05144358426332474,
"learning_rate": 2.037846829100364e-05,
"loss": 1.5077,
"step": 3500
},
{
"epoch": 2.096461101985964,
"grad_norm": 0.048265036195516586,
"learning_rate": 2.013662126625482e-05,
"loss": 1.4987,
"step": 3510
},
{
"epoch": 2.1024339256383455,
"grad_norm": 0.04586884751915932,
"learning_rate": 1.9895734230904396e-05,
"loss": 1.5044,
"step": 3520
},
{
"epoch": 2.1084067492907272,
"grad_norm": 0.03930211812257767,
"learning_rate": 1.965581882692949e-05,
"loss": 1.4951,
"step": 3530
},
{
"epoch": 2.114379572943109,
"grad_norm": 0.051928870379924774,
"learning_rate": 1.9416886649348575e-05,
"loss": 1.4962,
"step": 3540
},
{
"epoch": 2.1203523965954907,
"grad_norm": 0.04466070607304573,
"learning_rate": 1.917894924566125e-05,
"loss": 1.4874,
"step": 3550
},
{
"epoch": 2.126325220247872,
"grad_norm": 0.044879212975502014,
"learning_rate": 1.8942018115290063e-05,
"loss": 1.4896,
"step": 3560
},
{
"epoch": 2.1322980439002537,
"grad_norm": 0.04508794844150543,
"learning_rate": 1.8706104709024715e-05,
"loss": 1.4915,
"step": 3570
},
{
"epoch": 2.1382708675526354,
"grad_norm": 0.06577686965465546,
"learning_rate": 1.8471220428468745e-05,
"loss": 1.4981,
"step": 3580
},
{
"epoch": 2.144243691205017,
"grad_norm": 0.03995177894830704,
"learning_rate": 1.823737662548843e-05,
"loss": 1.4973,
"step": 3590
},
{
"epoch": 2.150216514857399,
"grad_norm": 0.06114717572927475,
"learning_rate": 1.800458460166417e-05,
"loss": 1.4942,
"step": 3600
},
{
"epoch": 2.1561893385097806,
"grad_norm": 0.04745366424322128,
"learning_rate": 1.7772855607744284e-05,
"loss": 1.5004,
"step": 3610
},
{
"epoch": 2.1621621621621623,
"grad_norm": 0.045220714062452316,
"learning_rate": 1.7542200843101267e-05,
"loss": 1.494,
"step": 3620
},
{
"epoch": 2.168134985814544,
"grad_norm": 0.04914199188351631,
"learning_rate": 1.7312631455190528e-05,
"loss": 1.491,
"step": 3630
},
{
"epoch": 2.1741078094669257,
"grad_norm": 0.044854309409856796,
"learning_rate": 1.708415853901166e-05,
"loss": 1.4974,
"step": 3640
},
{
"epoch": 2.180080633119307,
"grad_norm": 0.0511915348470211,
"learning_rate": 1.6856793136572155e-05,
"loss": 1.4978,
"step": 3650
},
{
"epoch": 2.1860534567716887,
"grad_norm": 0.052235160022974014,
"learning_rate": 1.6630546236353833e-05,
"loss": 1.4884,
"step": 3660
},
{
"epoch": 2.1920262804240704,
"grad_norm": 0.03959416225552559,
"learning_rate": 1.6405428772781724e-05,
"loss": 1.4897,
"step": 3670
},
{
"epoch": 2.197999104076452,
"grad_norm": 0.04642707481980324,
"learning_rate": 1.618145162569563e-05,
"loss": 1.489,
"step": 3680
},
{
"epoch": 2.203971927728834,
"grad_norm": 0.05590491741895676,
"learning_rate": 1.5958625619824286e-05,
"loss": 1.4946,
"step": 3690
},
{
"epoch": 2.2099447513812156,
"grad_norm": 0.050484009087085724,
"learning_rate": 1.5736961524262232e-05,
"loss": 1.5011,
"step": 3700
},
{
"epoch": 2.2159175750335973,
"grad_norm": 0.04109204187989235,
"learning_rate": 1.551647005194932e-05,
"loss": 1.4993,
"step": 3710
},
{
"epoch": 2.2218903986859786,
"grad_norm": 0.04570942744612694,
"learning_rate": 1.5297161859152986e-05,
"loss": 1.491,
"step": 3720
},
{
"epoch": 2.2278632223383603,
"grad_norm": 0.041420578956604004,
"learning_rate": 1.5079047544953227e-05,
"loss": 1.4874,
"step": 3730
},
{
"epoch": 2.233836045990742,
"grad_norm": 0.04918381944298744,
"learning_rate": 1.486213765073032e-05,
"loss": 1.4939,
"step": 3740
},
{
"epoch": 2.2398088696431238,
"grad_norm": 0.05086056888103485,
"learning_rate": 1.4646442659655425e-05,
"loss": 1.4992,
"step": 3750
},
{
"epoch": 2.2457816932955055,
"grad_norm": 0.061345502734184265,
"learning_rate": 1.4431972996183894e-05,
"loss": 1.4935,
"step": 3760
},
{
"epoch": 2.251754516947887,
"grad_norm": 0.03802775219082832,
"learning_rate": 1.4218739025551469e-05,
"loss": 1.487,
"step": 3770
},
{
"epoch": 2.257727340600269,
"grad_norm": 0.039830368012189865,
"learning_rate": 1.4006751053273338e-05,
"loss": 1.4943,
"step": 3780
},
{
"epoch": 2.2637001642526506,
"grad_norm": 0.04441362991929054,
"learning_rate": 1.3796019324646062e-05,
"loss": 1.4907,
"step": 3790
},
{
"epoch": 2.269672987905032,
"grad_norm": 0.04267200455069542,
"learning_rate": 1.358655402425245e-05,
"loss": 1.4905,
"step": 3800
},
{
"epoch": 2.2756458115574136,
"grad_norm": 0.04467471316456795,
"learning_rate": 1.3378365275469322e-05,
"loss": 1.4865,
"step": 3810
},
{
"epoch": 2.2816186352097954,
"grad_norm": 0.04877958446741104,
"learning_rate": 1.3171463139978222e-05,
"loss": 1.4978,
"step": 3820
},
{
"epoch": 2.287591458862177,
"grad_norm": 0.04458734765648842,
"learning_rate": 1.2965857617279216e-05,
"loss": 1.4931,
"step": 3830
},
{
"epoch": 2.293564282514559,
"grad_norm": 0.043027278035879135,
"learning_rate": 1.2761558644207547e-05,
"loss": 1.495,
"step": 3840
},
{
"epoch": 2.2995371061669405,
"grad_norm": 0.03808119520545006,
"learning_rate": 1.2558576094453435e-05,
"loss": 1.4922,
"step": 3850
},
{
"epoch": 2.3055099298193222,
"grad_norm": 0.038997333496809006,
"learning_rate": 1.2356919778084867e-05,
"loss": 1.4915,
"step": 3860
},
{
"epoch": 2.3114827534717035,
"grad_norm": 0.04020654410123825,
"learning_rate": 1.2156599441073488e-05,
"loss": 1.4874,
"step": 3870
},
{
"epoch": 2.3174555771240852,
"grad_norm": 0.04891055077314377,
"learning_rate": 1.1957624764823566e-05,
"loss": 1.5016,
"step": 3880
},
{
"epoch": 2.323428400776467,
"grad_norm": 0.046524520963430405,
"learning_rate": 1.176000536570412e-05,
"loss": 1.4928,
"step": 3890
},
{
"epoch": 2.3294012244288487,
"grad_norm": 0.04302162304520607,
"learning_rate": 1.1563750794584156e-05,
"loss": 1.4905,
"step": 3900
},
{
"epoch": 2.3353740480812304,
"grad_norm": 0.046545591205358505,
"learning_rate": 1.1368870536371036e-05,
"loss": 1.4911,
"step": 3910
},
{
"epoch": 2.341346871733612,
"grad_norm": 0.04680660367012024,
"learning_rate": 1.1175374009552159e-05,
"loss": 1.4832,
"step": 3920
},
{
"epoch": 2.347319695385994,
"grad_norm": 0.04679818078875542,
"learning_rate": 1.0983270565739668e-05,
"loss": 1.4892,
"step": 3930
},
{
"epoch": 2.3532925190383756,
"grad_norm": 0.04409361630678177,
"learning_rate": 1.0792569489218598e-05,
"loss": 1.4907,
"step": 3940
},
{
"epoch": 2.3592653426907573,
"grad_norm": 0.04122375324368477,
"learning_rate": 1.0603279996498089e-05,
"loss": 1.4936,
"step": 3950
},
{
"epoch": 2.3652381663431385,
"grad_norm": 0.045084912329912186,
"learning_rate": 1.0415411235865979e-05,
"loss": 1.4852,
"step": 3960
},
{
"epoch": 2.3712109899955203,
"grad_norm": 0.04110685735940933,
"learning_rate": 1.0228972286946695e-05,
"loss": 1.494,
"step": 3970
},
{
"epoch": 2.377183813647902,
"grad_norm": 0.04527169466018677,
"learning_rate": 1.0043972160262392e-05,
"loss": 1.4955,
"step": 3980
},
{
"epoch": 2.3831566373002837,
"grad_norm": 0.04808187112212181,
"learning_rate": 9.860419796797527e-06,
"loss": 1.4858,
"step": 3990
},
{
"epoch": 2.3891294609526654,
"grad_norm": 0.03969137370586395,
"learning_rate": 9.678324067566716e-06,
"loss": 1.497,
"step": 4000
},
{
"epoch": 2.3891294609526654,
"eval_loss": 1.4980565309524536,
"eval_runtime": 20.0226,
"eval_samples_per_second": 1729.697,
"eval_steps_per_second": 13.535,
"step": 4000
},
{
"epoch": 2.395102284605047,
"grad_norm": 0.039191678166389465,
"learning_rate": 9.497693773185985e-06,
"loss": 1.491,
"step": 4010
},
{
"epoch": 2.401075108257429,
"grad_norm": 0.04326602816581726,
"learning_rate": 9.318537643447488e-06,
"loss": 1.4897,
"step": 4020
},
{
"epoch": 2.40704793190981,
"grad_norm": 0.04062432423233986,
"learning_rate": 9.140864336897559e-06,
"loss": 1.4834,
"step": 4030
},
{
"epoch": 2.413020755562192,
"grad_norm": 0.043511949479579926,
"learning_rate": 8.964682440418272e-06,
"loss": 1.4899,
"step": 4040
},
{
"epoch": 2.4189935792145736,
"grad_norm": 0.041364822536706924,
"learning_rate": 8.79000046881242e-06,
"loss": 1.4876,
"step": 4050
},
{
"epoch": 2.4249664028669553,
"grad_norm": 0.03720170632004738,
"learning_rate": 8.61682686439202e-06,
"loss": 1.4926,
"step": 4060
},
{
"epoch": 2.430939226519337,
"grad_norm": 0.04620780423283577,
"learning_rate": 8.44516999657027e-06,
"loss": 1.4929,
"step": 4070
},
{
"epoch": 2.4369120501717187,
"grad_norm": 0.03785783797502518,
"learning_rate": 8.275038161457094e-06,
"loss": 1.4917,
"step": 4080
},
{
"epoch": 2.4428848738241005,
"grad_norm": 0.047655072063207626,
"learning_rate": 8.106439581458177e-06,
"loss": 1.4923,
"step": 4090
},
{
"epoch": 2.448857697476482,
"grad_norm": 0.04838723689317703,
"learning_rate": 7.939382404877545e-06,
"loss": 1.4902,
"step": 4100
},
{
"epoch": 2.454830521128864,
"grad_norm": 0.0498916357755661,
"learning_rate": 7.773874705523826e-06,
"loss": 1.4846,
"step": 4110
},
{
"epoch": 2.460803344781245,
"grad_norm": 0.044865112751722336,
"learning_rate": 7.609924482320013e-06,
"loss": 1.4867,
"step": 4120
},
{
"epoch": 2.466776168433627,
"grad_norm": 0.041775912046432495,
"learning_rate": 7.447539658916869e-06,
"loss": 1.4869,
"step": 4130
},
{
"epoch": 2.4727489920860086,
"grad_norm": 0.03888450190424919,
"learning_rate": 7.286728083309995e-06,
"loss": 1.4824,
"step": 4140
},
{
"epoch": 2.4787218157383903,
"grad_norm": 0.05169163644313812,
"learning_rate": 7.127497527460541e-06,
"loss": 1.4856,
"step": 4150
},
{
"epoch": 2.484694639390772,
"grad_norm": 0.04095705598592758,
"learning_rate": 6.969855686919573e-06,
"loss": 1.4899,
"step": 4160
},
{
"epoch": 2.490667463043154,
"grad_norm": 0.0429367758333683,
"learning_rate": 6.81381018045618e-06,
"loss": 1.4848,
"step": 4170
},
{
"epoch": 2.4966402866955355,
"grad_norm": 0.04392432048916817,
"learning_rate": 6.659368549689209e-06,
"loss": 1.4832,
"step": 4180
},
{
"epoch": 2.502613110347917,
"grad_norm": 0.04673699662089348,
"learning_rate": 6.506538258722859e-06,
"loss": 1.4855,
"step": 4190
},
{
"epoch": 2.5085859340002985,
"grad_norm": 0.04074994474649429,
"learning_rate": 6.355326693785868e-06,
"loss": 1.4789,
"step": 4200
},
{
"epoch": 2.51455875765268,
"grad_norm": 0.035382091999053955,
"learning_rate": 6.2057411628745875e-06,
"loss": 1.4862,
"step": 4210
},
{
"epoch": 2.520531581305062,
"grad_norm": 0.03829929605126381,
"learning_rate": 6.057788895399781e-06,
"loss": 1.4852,
"step": 4220
},
{
"epoch": 2.5265044049574437,
"grad_norm": 0.04219154641032219,
"learning_rate": 5.9114770418372015e-06,
"loss": 1.4865,
"step": 4230
},
{
"epoch": 2.5324772286098254,
"grad_norm": 0.04591584950685501,
"learning_rate": 5.7668126733820476e-06,
"loss": 1.4737,
"step": 4240
},
{
"epoch": 2.538450052262207,
"grad_norm": 0.045854389667510986,
"learning_rate": 5.623802781607204e-06,
"loss": 1.4872,
"step": 4250
},
{
"epoch": 2.544422875914589,
"grad_norm": 0.04153481870889664,
"learning_rate": 5.48245427812534e-06,
"loss": 1.4806,
"step": 4260
},
{
"epoch": 2.5503956995669705,
"grad_norm": 0.03822470083832741,
"learning_rate": 5.342773994254842e-06,
"loss": 1.4792,
"step": 4270
},
{
"epoch": 2.556368523219352,
"grad_norm": 0.03870686888694763,
"learning_rate": 5.204768680689727e-06,
"loss": 1.4771,
"step": 4280
},
{
"epoch": 2.5623413468717335,
"grad_norm": 0.05567542836070061,
"learning_rate": 5.068445007173331e-06,
"loss": 1.4812,
"step": 4290
},
{
"epoch": 2.5683141705241153,
"grad_norm": 0.03914303705096245,
"learning_rate": 4.933809562175982e-06,
"loss": 1.4952,
"step": 4300
},
{
"epoch": 2.574286994176497,
"grad_norm": 0.04728810861706734,
"learning_rate": 4.800868852576561e-06,
"loss": 1.4813,
"step": 4310
},
{
"epoch": 2.5802598178288787,
"grad_norm": 0.04394581541419029,
"learning_rate": 4.669629303348066e-06,
"loss": 1.4779,
"step": 4320
},
{
"epoch": 2.5862326414812604,
"grad_norm": 0.042139682918787,
"learning_rate": 4.540097257247062e-06,
"loss": 1.4847,
"step": 4330
},
{
"epoch": 2.5922054651336417,
"grad_norm": 0.04580564424395561,
"learning_rate": 4.412278974507151e-06,
"loss": 1.4767,
"step": 4340
},
{
"epoch": 2.5981782887860234,
"grad_norm": 0.03395635262131691,
"learning_rate": 4.286180632536421e-06,
"loss": 1.4871,
"step": 4350
},
{
"epoch": 2.604151112438405,
"grad_norm": 0.04606311395764351,
"learning_rate": 4.161808325618886e-06,
"loss": 1.4865,
"step": 4360
},
{
"epoch": 2.610123936090787,
"grad_norm": 0.046741172671318054,
"learning_rate": 4.039168064619938e-06,
"loss": 1.4896,
"step": 4370
},
{
"epoch": 2.6160967597431686,
"grad_norm": 0.04130960628390312,
"learning_rate": 3.918265776695891e-06,
"loss": 1.4837,
"step": 4380
},
{
"epoch": 2.6220695833955503,
"grad_norm": 0.043055951595306396,
"learning_rate": 3.7991073050074678e-06,
"loss": 1.4841,
"step": 4390
},
{
"epoch": 2.628042407047932,
"grad_norm": 0.04418269917368889,
"learning_rate": 3.6816984084374485e-06,
"loss": 1.4831,
"step": 4400
},
{
"epoch": 2.6340152307003137,
"grad_norm": 0.036886971443891525,
"learning_rate": 3.5660447613123086e-06,
"loss": 1.4892,
"step": 4410
},
{
"epoch": 2.6399880543526955,
"grad_norm": 0.04421091824769974,
"learning_rate": 3.452151953128007e-06,
"loss": 1.4848,
"step": 4420
},
{
"epoch": 2.645960878005077,
"grad_norm": 0.042877208441495895,
"learning_rate": 3.3400254882798435e-06,
"loss": 1.4888,
"step": 4430
},
{
"epoch": 2.6519337016574585,
"grad_norm": 0.04234934598207474,
"learning_rate": 3.2296707857964125e-06,
"loss": 1.4796,
"step": 4440
},
{
"epoch": 2.65790652530984,
"grad_norm": 0.035217370837926865,
"learning_rate": 3.121093179077739e-06,
"loss": 1.481,
"step": 4450
},
{
"epoch": 2.663879348962222,
"grad_norm": 0.040508221834897995,
"learning_rate": 3.0142979156374806e-06,
"loss": 1.4819,
"step": 4460
},
{
"epoch": 2.6698521726146036,
"grad_norm": 0.041981033980846405,
"learning_rate": 2.9092901568493446e-06,
"loss": 1.4804,
"step": 4470
},
{
"epoch": 2.6758249962669853,
"grad_norm": 0.03790983185172081,
"learning_rate": 2.80607497769763e-06,
"loss": 1.4894,
"step": 4480
},
{
"epoch": 2.6817978199193666,
"grad_norm": 0.038940299302339554,
"learning_rate": 2.70465736653196e-06,
"loss": 1.4827,
"step": 4490
},
{
"epoch": 2.6877706435717483,
"grad_norm": 0.04031272605061531,
"learning_rate": 2.605042224826182e-06,
"loss": 1.4845,
"step": 4500
}
],
"logging_steps": 10,
"max_steps": 5022,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.9327446823064306e+19,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}