{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.6877706435717483, "eval_steps": 1000, "global_step": 4500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005972823652381664, "grad_norm": 0.5743309259414673, "learning_rate": 1.5904572564612327e-06, "loss": 2.7537, "step": 10 }, { "epoch": 0.011945647304763328, "grad_norm": 0.5460094809532166, "learning_rate": 3.1809145129224655e-06, "loss": 2.7612, "step": 20 }, { "epoch": 0.01791847095714499, "grad_norm": 0.5363145470619202, "learning_rate": 4.7713717693836985e-06, "loss": 2.7609, "step": 30 }, { "epoch": 0.023891294609526655, "grad_norm": 0.5279455184936523, "learning_rate": 6.361829025844931e-06, "loss": 2.7607, "step": 40 }, { "epoch": 0.029864118261908316, "grad_norm": 0.5061234831809998, "learning_rate": 7.952286282306164e-06, "loss": 2.784, "step": 50 }, { "epoch": 0.03583694191428998, "grad_norm": 0.476898729801178, "learning_rate": 9.542743538767397e-06, "loss": 2.762, "step": 60 }, { "epoch": 0.041809765566671646, "grad_norm": 0.4454072415828705, "learning_rate": 1.113320079522863e-05, "loss": 2.7716, "step": 70 }, { "epoch": 0.04778258921905331, "grad_norm": 3.1541287899017334, "learning_rate": 1.2723658051689862e-05, "loss": 2.8849, "step": 80 }, { "epoch": 0.05375541287143497, "grad_norm": 0.19107532501220703, "learning_rate": 1.4314115308151095e-05, "loss": 3.1147, "step": 90 }, { "epoch": 0.05972823652381663, "grad_norm": 0.13281038403511047, "learning_rate": 1.590457256461233e-05, "loss": 2.5574, "step": 100 }, { "epoch": 0.0657010601761983, "grad_norm": 0.08191326260566711, "learning_rate": 1.749502982107356e-05, "loss": 2.4446, "step": 110 }, { "epoch": 0.07167388382857996, "grad_norm": 0.08300579339265823, "learning_rate": 1.9085487077534794e-05, "loss": 2.3524, "step": 120 }, { "epoch": 0.07764670748096163, "grad_norm": 0.0590679906308651, "learning_rate": 2.0675944333996028e-05, "loss": 2.2819, "step": 130 }, { "epoch": 0.08361953113334329, "grad_norm": 0.052923623472452164, "learning_rate": 2.226640159045726e-05, "loss": 2.2261, "step": 140 }, { "epoch": 0.08959235478572496, "grad_norm": 0.05208205804228783, "learning_rate": 2.385685884691849e-05, "loss": 2.1889, "step": 150 }, { "epoch": 0.09556517843810662, "grad_norm": 0.0485885925590992, "learning_rate": 2.5447316103379724e-05, "loss": 2.1694, "step": 160 }, { "epoch": 0.10153800209048827, "grad_norm": 0.04901551082730293, "learning_rate": 2.7037773359840955e-05, "loss": 2.1272, "step": 170 }, { "epoch": 0.10751082574286994, "grad_norm": 0.04524153470993042, "learning_rate": 2.862823061630219e-05, "loss": 2.1085, "step": 180 }, { "epoch": 0.1134836493952516, "grad_norm": 0.04201298579573631, "learning_rate": 3.021868787276342e-05, "loss": 2.0902, "step": 190 }, { "epoch": 0.11945647304763327, "grad_norm": 0.053612083196640015, "learning_rate": 3.180914512922466e-05, "loss": 2.0855, "step": 200 }, { "epoch": 0.12542929670001493, "grad_norm": 0.04812688007950783, "learning_rate": 3.3399602385685885e-05, "loss": 2.0469, "step": 210 }, { "epoch": 0.1314021203523966, "grad_norm": 0.0483262836933136, "learning_rate": 3.499005964214712e-05, "loss": 2.0264, "step": 220 }, { "epoch": 0.13737494400477826, "grad_norm": 0.05456310138106346, "learning_rate": 3.6580516898608353e-05, "loss": 2.0201, "step": 230 }, { "epoch": 0.14334776765715992, "grad_norm": 0.06978671252727509, "learning_rate": 3.817097415506959e-05, "loss": 1.9967, "step": 240 }, { "epoch": 0.1493205913095416, "grad_norm": 0.049219317734241486, "learning_rate": 3.976143141153082e-05, "loss": 1.9909, "step": 250 }, { "epoch": 0.15529341496192325, "grad_norm": 0.04814588651061058, "learning_rate": 4.1351888667992056e-05, "loss": 1.9793, "step": 260 }, { "epoch": 0.16126623861430492, "grad_norm": 0.06128086522221565, "learning_rate": 4.2942345924453284e-05, "loss": 1.9703, "step": 270 }, { "epoch": 0.16723906226668658, "grad_norm": 0.06803273409605026, "learning_rate": 4.453280318091452e-05, "loss": 1.9484, "step": 280 }, { "epoch": 0.17321188591906825, "grad_norm": 0.06598497182130814, "learning_rate": 4.612326043737575e-05, "loss": 1.9251, "step": 290 }, { "epoch": 0.1791847095714499, "grad_norm": 0.05581754818558693, "learning_rate": 4.771371769383698e-05, "loss": 1.9211, "step": 300 }, { "epoch": 0.18515753322383158, "grad_norm": 0.06264442205429077, "learning_rate": 4.9304174950298214e-05, "loss": 1.9047, "step": 310 }, { "epoch": 0.19113035687621324, "grad_norm": 0.05809122323989868, "learning_rate": 5.089463220675945e-05, "loss": 1.8948, "step": 320 }, { "epoch": 0.1971031805285949, "grad_norm": 0.05478562042117119, "learning_rate": 5.248508946322068e-05, "loss": 1.8924, "step": 330 }, { "epoch": 0.20307600418097654, "grad_norm": 0.060149796307086945, "learning_rate": 5.407554671968191e-05, "loss": 1.8776, "step": 340 }, { "epoch": 0.2090488278333582, "grad_norm": 0.06282585859298706, "learning_rate": 5.5666003976143144e-05, "loss": 1.8752, "step": 350 }, { "epoch": 0.21502165148573987, "grad_norm": 0.06441989541053772, "learning_rate": 5.725646123260438e-05, "loss": 1.8632, "step": 360 }, { "epoch": 0.22099447513812154, "grad_norm": 0.05681062117218971, "learning_rate": 5.8846918489065606e-05, "loss": 1.8475, "step": 370 }, { "epoch": 0.2269672987905032, "grad_norm": 0.05155131593346596, "learning_rate": 6.043737574552684e-05, "loss": 1.8431, "step": 380 }, { "epoch": 0.23294012244288487, "grad_norm": 0.05347074940800667, "learning_rate": 6.202783300198807e-05, "loss": 1.8416, "step": 390 }, { "epoch": 0.23891294609526653, "grad_norm": 0.06694310158491135, "learning_rate": 6.361829025844931e-05, "loss": 1.8344, "step": 400 }, { "epoch": 0.2448857697476482, "grad_norm": 0.06079185754060745, "learning_rate": 6.520874751491054e-05, "loss": 1.8297, "step": 410 }, { "epoch": 0.25085859340002986, "grad_norm": 0.05415233224630356, "learning_rate": 6.679920477137177e-05, "loss": 1.82, "step": 420 }, { "epoch": 0.2568314170524115, "grad_norm": 0.0645110234618187, "learning_rate": 6.838966202783301e-05, "loss": 1.8137, "step": 430 }, { "epoch": 0.2628042407047932, "grad_norm": 0.06045007333159447, "learning_rate": 6.998011928429424e-05, "loss": 1.8048, "step": 440 }, { "epoch": 0.26877706435717486, "grad_norm": 0.05600131303071976, "learning_rate": 7.157057654075547e-05, "loss": 1.7854, "step": 450 }, { "epoch": 0.2747498880095565, "grad_norm": 0.06498062610626221, "learning_rate": 7.316103379721671e-05, "loss": 1.798, "step": 460 }, { "epoch": 0.2807227116619382, "grad_norm": 0.053577929735183716, "learning_rate": 7.475149105367795e-05, "loss": 1.7883, "step": 470 }, { "epoch": 0.28669553531431985, "grad_norm": 0.09097382426261902, "learning_rate": 7.634194831013918e-05, "loss": 1.78, "step": 480 }, { "epoch": 0.2926683589667015, "grad_norm": 0.057212598621845245, "learning_rate": 7.79324055666004e-05, "loss": 1.7705, "step": 490 }, { "epoch": 0.2986411826190832, "grad_norm": 0.055311623960733414, "learning_rate": 7.952286282306164e-05, "loss": 1.7739, "step": 500 }, { "epoch": 0.30461400627146484, "grad_norm": 0.07679615169763565, "learning_rate": 7.999952636882403e-05, "loss": 1.7705, "step": 510 }, { "epoch": 0.3105868299238465, "grad_norm": 0.10281822085380554, "learning_rate": 7.999720656965739e-05, "loss": 1.7639, "step": 520 }, { "epoch": 0.3165596535762282, "grad_norm": 0.07636060565710068, "learning_rate": 7.999295372099362e-05, "loss": 1.7539, "step": 530 }, { "epoch": 0.32253247722860984, "grad_norm": 0.057714689522981644, "learning_rate": 7.998676802837124e-05, "loss": 1.7541, "step": 540 }, { "epoch": 0.3285053008809915, "grad_norm": 0.06505981832742691, "learning_rate": 7.997864979074237e-05, "loss": 1.7487, "step": 550 }, { "epoch": 0.33447812453337317, "grad_norm": 0.05842842161655426, "learning_rate": 7.996859940045832e-05, "loss": 1.739, "step": 560 }, { "epoch": 0.34045094818575483, "grad_norm": 0.051559966057538986, "learning_rate": 7.995661734325054e-05, "loss": 1.7443, "step": 570 }, { "epoch": 0.3464237718381365, "grad_norm": 0.20853149890899658, "learning_rate": 7.994270419820721e-05, "loss": 1.7719, "step": 580 }, { "epoch": 0.35239659549051816, "grad_norm": 0.09151974320411682, "learning_rate": 7.992686063774525e-05, "loss": 1.7817, "step": 590 }, { "epoch": 0.3583694191428998, "grad_norm": 0.05926055088639259, "learning_rate": 7.99090874275778e-05, "loss": 1.7469, "step": 600 }, { "epoch": 0.3643422427952815, "grad_norm": 0.044228848069906235, "learning_rate": 7.988938542667721e-05, "loss": 1.7393, "step": 610 }, { "epoch": 0.37031506644766315, "grad_norm": 0.0427553653717041, "learning_rate": 7.986775558723355e-05, "loss": 1.7307, "step": 620 }, { "epoch": 0.3762878901000448, "grad_norm": 0.0548509880900383, "learning_rate": 7.984419895460858e-05, "loss": 1.7205, "step": 630 }, { "epoch": 0.3822607137524265, "grad_norm": 0.057041749358177185, "learning_rate": 7.981871666728525e-05, "loss": 1.7225, "step": 640 }, { "epoch": 0.38823353740480815, "grad_norm": 0.056601762771606445, "learning_rate": 7.979130995681263e-05, "loss": 1.7088, "step": 650 }, { "epoch": 0.3942063610571898, "grad_norm": 0.06844093650579453, "learning_rate": 7.976198014774637e-05, "loss": 1.7073, "step": 660 }, { "epoch": 0.4001791847095714, "grad_norm": 0.0546780526638031, "learning_rate": 7.973072865758483e-05, "loss": 1.7121, "step": 670 }, { "epoch": 0.4061520083619531, "grad_norm": 0.04654558375477791, "learning_rate": 7.969755699670041e-05, "loss": 1.6951, "step": 680 }, { "epoch": 0.41212483201433475, "grad_norm": 0.06478898227214813, "learning_rate": 7.966246676826661e-05, "loss": 1.7055, "step": 690 }, { "epoch": 0.4180976556667164, "grad_norm": 0.06878198683261871, "learning_rate": 7.962545966818062e-05, "loss": 1.6987, "step": 700 }, { "epoch": 0.4240704793190981, "grad_norm": 0.05675249919295311, "learning_rate": 7.95865374849812e-05, "loss": 1.6998, "step": 710 }, { "epoch": 0.43004330297147975, "grad_norm": 0.05516457185149193, "learning_rate": 7.954570209976239e-05, "loss": 1.6852, "step": 720 }, { "epoch": 0.4360161266238614, "grad_norm": 0.05688585340976715, "learning_rate": 7.950295548608256e-05, "loss": 1.6901, "step": 730 }, { "epoch": 0.4419889502762431, "grad_norm": 0.07187242805957794, "learning_rate": 7.945829970986898e-05, "loss": 1.6894, "step": 740 }, { "epoch": 0.44796177392862474, "grad_norm": 0.0548662506043911, "learning_rate": 7.941173692931801e-05, "loss": 1.6819, "step": 750 }, { "epoch": 0.4539345975810064, "grad_norm": 0.0926741436123848, "learning_rate": 7.93632693947908e-05, "loss": 1.6797, "step": 760 }, { "epoch": 0.45990742123338807, "grad_norm": 0.04921697825193405, "learning_rate": 7.931289944870448e-05, "loss": 1.6629, "step": 770 }, { "epoch": 0.46588024488576973, "grad_norm": 0.07487112283706665, "learning_rate": 7.92606295254191e-05, "loss": 1.6737, "step": 780 }, { "epoch": 0.4718530685381514, "grad_norm": 0.07180643826723099, "learning_rate": 7.920646215111973e-05, "loss": 1.6716, "step": 790 }, { "epoch": 0.47782589219053306, "grad_norm": 0.050522662699222565, "learning_rate": 7.915039994369462e-05, "loss": 1.6597, "step": 800 }, { "epoch": 0.48379871584291473, "grad_norm": 0.0628654807806015, "learning_rate": 7.909244561260855e-05, "loss": 1.6722, "step": 810 }, { "epoch": 0.4897715394952964, "grad_norm": 0.07348821312189102, "learning_rate": 7.903260195877184e-05, "loss": 1.6718, "step": 820 }, { "epoch": 0.49574436314767806, "grad_norm": 0.0689951702952385, "learning_rate": 7.897087187440512e-05, "loss": 1.6658, "step": 830 }, { "epoch": 0.5017171868000597, "grad_norm": 0.05663711205124855, "learning_rate": 7.890725834289946e-05, "loss": 1.6636, "step": 840 }, { "epoch": 0.5076900104524414, "grad_norm": 0.050597622990608215, "learning_rate": 7.884176443867219e-05, "loss": 1.6648, "step": 850 }, { "epoch": 0.513662834104823, "grad_norm": 0.05792626738548279, "learning_rate": 7.87743933270183e-05, "loss": 1.6582, "step": 860 }, { "epoch": 0.5196356577572048, "grad_norm": 0.05193015933036804, "learning_rate": 7.870514826395755e-05, "loss": 1.664, "step": 870 }, { "epoch": 0.5256084814095864, "grad_norm": 0.05836218595504761, "learning_rate": 7.863403259607698e-05, "loss": 1.6535, "step": 880 }, { "epoch": 0.531581305061968, "grad_norm": 0.08420410752296448, "learning_rate": 7.856104976036928e-05, "loss": 1.6463, "step": 890 }, { "epoch": 0.5375541287143497, "grad_norm": 0.06460799276828766, "learning_rate": 7.848620328406663e-05, "loss": 1.6615, "step": 900 }, { "epoch": 0.5435269523667313, "grad_norm": 0.08191855251789093, "learning_rate": 7.840949678447022e-05, "loss": 1.6529, "step": 910 }, { "epoch": 0.549499776019113, "grad_norm": 0.04835124313831329, "learning_rate": 7.833093396877546e-05, "loss": 1.6508, "step": 920 }, { "epoch": 0.5554725996714946, "grad_norm": 0.047752317041158676, "learning_rate": 7.82505186338928e-05, "loss": 1.6484, "step": 930 }, { "epoch": 0.5614454233238764, "grad_norm": 0.054417744278907776, "learning_rate": 7.816825466626419e-05, "loss": 1.6443, "step": 940 }, { "epoch": 0.567418246976258, "grad_norm": 0.0538078136742115, "learning_rate": 7.808414604167537e-05, "loss": 1.6422, "step": 950 }, { "epoch": 0.5733910706286397, "grad_norm": 0.04438367858529091, "learning_rate": 7.799819682506353e-05, "loss": 1.6443, "step": 960 }, { "epoch": 0.5793638942810213, "grad_norm": 0.056033167988061905, "learning_rate": 7.791041117032102e-05, "loss": 1.6428, "step": 970 }, { "epoch": 0.585336717933403, "grad_norm": 0.07095460593700409, "learning_rate": 7.782079332009454e-05, "loss": 1.6425, "step": 980 }, { "epoch": 0.5913095415857846, "grad_norm": 0.05874691903591156, "learning_rate": 7.772934760558005e-05, "loss": 1.6346, "step": 990 }, { "epoch": 0.5972823652381664, "grad_norm": 0.0521966814994812, "learning_rate": 7.76360784463135e-05, "loss": 1.6359, "step": 1000 }, { "epoch": 0.5972823652381664, "eval_loss": 1.634853482246399, "eval_runtime": 28.9256, "eval_samples_per_second": 1197.311, "eval_steps_per_second": 9.369, "step": 1000 }, { "epoch": 0.603255188890548, "grad_norm": 0.052664998918771744, "learning_rate": 7.754099034995727e-05, "loss": 1.6383, "step": 1010 }, { "epoch": 0.6092280125429297, "grad_norm": 0.08000710606575012, "learning_rate": 7.744408791208214e-05, "loss": 1.639, "step": 1020 }, { "epoch": 0.6152008361953113, "grad_norm": 0.05873206630349159, "learning_rate": 7.734537581594545e-05, "loss": 1.632, "step": 1030 }, { "epoch": 0.621173659847693, "grad_norm": 0.06116827204823494, "learning_rate": 7.724485883226454e-05, "loss": 1.6351, "step": 1040 }, { "epoch": 0.6271464835000746, "grad_norm": 0.057659681886434555, "learning_rate": 7.714254181898627e-05, "loss": 1.637, "step": 1050 }, { "epoch": 0.6331193071524563, "grad_norm": 0.05905848369002342, "learning_rate": 7.703842972105228e-05, "loss": 1.626, "step": 1060 }, { "epoch": 0.639092130804838, "grad_norm": 0.0539986751973629, "learning_rate": 7.693252757015991e-05, "loss": 1.6278, "step": 1070 }, { "epoch": 0.6450649544572197, "grad_norm": 0.062365371733903885, "learning_rate": 7.682484048451908e-05, "loss": 1.6187, "step": 1080 }, { "epoch": 0.6510377781096013, "grad_norm": 0.0486634224653244, "learning_rate": 7.671537366860494e-05, "loss": 1.6223, "step": 1090 }, { "epoch": 0.657010601761983, "grad_norm": 0.04700983688235283, "learning_rate": 7.660413241290626e-05, "loss": 1.6237, "step": 1100 }, { "epoch": 0.6629834254143646, "grad_norm": 0.06423746794462204, "learning_rate": 7.649112209366985e-05, "loss": 1.6349, "step": 1110 }, { "epoch": 0.6689562490667463, "grad_norm": 0.05183717608451843, "learning_rate": 7.637634817264064e-05, "loss": 1.6203, "step": 1120 }, { "epoch": 0.6749290727191279, "grad_norm": 0.05448286980390549, "learning_rate": 7.625981619679777e-05, "loss": 1.6159, "step": 1130 }, { "epoch": 0.6809018963715097, "grad_norm": 0.06012860685586929, "learning_rate": 7.61415317980865e-05, "loss": 1.6106, "step": 1140 }, { "epoch": 0.6868747200238913, "grad_norm": 0.0491897277534008, "learning_rate": 7.602150069314598e-05, "loss": 1.613, "step": 1150 }, { "epoch": 0.692847543676273, "grad_norm": 0.05050448700785637, "learning_rate": 7.589972868303301e-05, "loss": 1.6158, "step": 1160 }, { "epoch": 0.6988203673286546, "grad_norm": 0.05027921870350838, "learning_rate": 7.577622165294165e-05, "loss": 1.6166, "step": 1170 }, { "epoch": 0.7047931909810363, "grad_norm": 0.061239466071128845, "learning_rate": 7.565098557191882e-05, "loss": 1.607, "step": 1180 }, { "epoch": 0.7107660146334179, "grad_norm": 0.04995877295732498, "learning_rate": 7.552402649257578e-05, "loss": 1.6152, "step": 1190 }, { "epoch": 0.7167388382857997, "grad_norm": 0.04830503091216087, "learning_rate": 7.539535055079569e-05, "loss": 1.613, "step": 1200 }, { "epoch": 0.7227116619381813, "grad_norm": 0.05787483602762222, "learning_rate": 7.526496396543691e-05, "loss": 1.614, "step": 1210 }, { "epoch": 0.728684485590563, "grad_norm": 0.07437578588724136, "learning_rate": 7.513287303803263e-05, "loss": 1.6127, "step": 1220 }, { "epoch": 0.7346573092429446, "grad_norm": 0.06587845832109451, "learning_rate": 7.499908415248616e-05, "loss": 1.6015, "step": 1230 }, { "epoch": 0.7406301328953263, "grad_norm": 0.0692521184682846, "learning_rate": 7.486360377476255e-05, "loss": 1.6026, "step": 1240 }, { "epoch": 0.7466029565477079, "grad_norm": 0.061289019882678986, "learning_rate": 7.472643845257592e-05, "loss": 1.6108, "step": 1250 }, { "epoch": 0.7525757802000896, "grad_norm": 0.056076616048812866, "learning_rate": 7.458759481507318e-05, "loss": 1.6018, "step": 1260 }, { "epoch": 0.7585486038524712, "grad_norm": 0.06620051711797714, "learning_rate": 7.444707957251354e-05, "loss": 1.6048, "step": 1270 }, { "epoch": 0.764521427504853, "grad_norm": 0.05557152256369591, "learning_rate": 7.430489951594422e-05, "loss": 1.6091, "step": 1280 }, { "epoch": 0.7704942511572346, "grad_norm": 0.04953812435269356, "learning_rate": 7.416106151687224e-05, "loss": 1.6026, "step": 1290 }, { "epoch": 0.7764670748096163, "grad_norm": 0.042427971959114075, "learning_rate": 7.40155725269324e-05, "loss": 1.5983, "step": 1300 }, { "epoch": 0.7824398984619979, "grad_norm": 0.05906856432557106, "learning_rate": 7.386843957755123e-05, "loss": 1.6008, "step": 1310 }, { "epoch": 0.7884127221143796, "grad_norm": 0.04983474314212799, "learning_rate": 7.371966977960713e-05, "loss": 1.5973, "step": 1320 }, { "epoch": 0.7943855457667612, "grad_norm": 0.0590224526822567, "learning_rate": 7.356927032308682e-05, "loss": 1.6011, "step": 1330 }, { "epoch": 0.8003583694191428, "grad_norm": 0.057693641632795334, "learning_rate": 7.341724847673775e-05, "loss": 1.5942, "step": 1340 }, { "epoch": 0.8063311930715246, "grad_norm": 0.040723856538534164, "learning_rate": 7.326361158771688e-05, "loss": 1.6011, "step": 1350 }, { "epoch": 0.8123040167239062, "grad_norm": 0.05768086016178131, "learning_rate": 7.31083670812355e-05, "loss": 1.5999, "step": 1360 }, { "epoch": 0.8182768403762879, "grad_norm": 0.06345749646425247, "learning_rate": 7.29515224602005e-05, "loss": 1.5985, "step": 1370 }, { "epoch": 0.8242496640286695, "grad_norm": 0.06176001578569412, "learning_rate": 7.27930853048516e-05, "loss": 1.5971, "step": 1380 }, { "epoch": 0.8302224876810512, "grad_norm": 0.05247745290398598, "learning_rate": 7.263306327239516e-05, "loss": 1.5958, "step": 1390 }, { "epoch": 0.8361953113334328, "grad_norm": 0.05218351632356644, "learning_rate": 7.247146409663401e-05, "loss": 1.5981, "step": 1400 }, { "epoch": 0.8421681349858146, "grad_norm": 0.0629679337143898, "learning_rate": 7.23082955875937e-05, "loss": 1.5949, "step": 1410 }, { "epoch": 0.8481409586381962, "grad_norm": 0.061205677688121796, "learning_rate": 7.214356563114505e-05, "loss": 1.5957, "step": 1420 }, { "epoch": 0.8541137822905779, "grad_norm": 0.06122026965022087, "learning_rate": 7.197728218862306e-05, "loss": 1.5911, "step": 1430 }, { "epoch": 0.8600866059429595, "grad_norm": 0.054293327033519745, "learning_rate": 7.180945329644204e-05, "loss": 1.5885, "step": 1440 }, { "epoch": 0.8660594295953412, "grad_norm": 0.04569542035460472, "learning_rate": 7.164008706570736e-05, "loss": 1.5893, "step": 1450 }, { "epoch": 0.8720322532477228, "grad_norm": 0.04415179416537285, "learning_rate": 7.146919168182333e-05, "loss": 1.5951, "step": 1460 }, { "epoch": 0.8780050769001045, "grad_norm": 0.052418701350688934, "learning_rate": 7.129677540409762e-05, "loss": 1.5999, "step": 1470 }, { "epoch": 0.8839779005524862, "grad_norm": 0.053583066910505295, "learning_rate": 7.112284656534215e-05, "loss": 1.5979, "step": 1480 }, { "epoch": 0.8899507242048679, "grad_norm": 0.06733547151088715, "learning_rate": 7.09474135714703e-05, "loss": 1.5871, "step": 1490 }, { "epoch": 0.8959235478572495, "grad_norm": 0.05455510690808296, "learning_rate": 7.07704849010907e-05, "loss": 1.5912, "step": 1500 }, { "epoch": 0.9018963715096312, "grad_norm": 0.05950945243239403, "learning_rate": 7.059206910509745e-05, "loss": 1.5958, "step": 1510 }, { "epoch": 0.9078691951620128, "grad_norm": 0.0513860359787941, "learning_rate": 7.041217480625683e-05, "loss": 1.5856, "step": 1520 }, { "epoch": 0.9138420188143945, "grad_norm": 0.05268612131476402, "learning_rate": 7.023081069879062e-05, "loss": 1.5846, "step": 1530 }, { "epoch": 0.9198148424667761, "grad_norm": 0.05923028290271759, "learning_rate": 7.004798554795586e-05, "loss": 1.5739, "step": 1540 }, { "epoch": 0.9257876661191579, "grad_norm": 0.04859180748462677, "learning_rate": 6.986370818962125e-05, "loss": 1.5927, "step": 1550 }, { "epoch": 0.9317604897715395, "grad_norm": 0.060852836817502975, "learning_rate": 6.967798752984012e-05, "loss": 1.5769, "step": 1560 }, { "epoch": 0.9377333134239212, "grad_norm": 0.053088609129190445, "learning_rate": 6.949083254442001e-05, "loss": 1.5845, "step": 1570 }, { "epoch": 0.9437061370763028, "grad_norm": 0.06042907387018204, "learning_rate": 6.930225227848887e-05, "loss": 1.5808, "step": 1580 }, { "epoch": 0.9496789607286845, "grad_norm": 0.05746331810951233, "learning_rate": 6.911225584605787e-05, "loss": 1.5821, "step": 1590 }, { "epoch": 0.9556517843810661, "grad_norm": 0.04398033022880554, "learning_rate": 6.892085242958098e-05, "loss": 1.5775, "step": 1600 }, { "epoch": 0.9616246080334478, "grad_norm": 0.050728365778923035, "learning_rate": 6.872805127951115e-05, "loss": 1.5749, "step": 1610 }, { "epoch": 0.9675974316858295, "grad_norm": 0.0519120879471302, "learning_rate": 6.85338617138533e-05, "loss": 1.5726, "step": 1620 }, { "epoch": 0.9735702553382112, "grad_norm": 0.052526745945215225, "learning_rate": 6.833829311771388e-05, "loss": 1.5793, "step": 1630 }, { "epoch": 0.9795430789905928, "grad_norm": 0.050527602434158325, "learning_rate": 6.814135494284735e-05, "loss": 1.5694, "step": 1640 }, { "epoch": 0.9855159026429745, "grad_norm": 0.08685663342475891, "learning_rate": 6.794305670719945e-05, "loss": 1.5803, "step": 1650 }, { "epoch": 0.9914887262953561, "grad_norm": 0.054428499191999435, "learning_rate": 6.774340799444703e-05, "loss": 1.5757, "step": 1660 }, { "epoch": 0.9974615499477378, "grad_norm": 0.05870772898197174, "learning_rate": 6.754241845353506e-05, "loss": 1.571, "step": 1670 }, { "epoch": 1.0034343736001194, "grad_norm": 0.05581633001565933, "learning_rate": 6.734009779821018e-05, "loss": 1.5659, "step": 1680 }, { "epoch": 1.0094071972525012, "grad_norm": 0.05493481829762459, "learning_rate": 6.713645580655125e-05, "loss": 1.5686, "step": 1690 }, { "epoch": 1.0153800209048829, "grad_norm": 0.05471092462539673, "learning_rate": 6.693150232049686e-05, "loss": 1.5649, "step": 1700 }, { "epoch": 1.0213528445572644, "grad_norm": 0.053526680916547775, "learning_rate": 6.672524724536956e-05, "loss": 1.5671, "step": 1710 }, { "epoch": 1.027325668209646, "grad_norm": 0.06532900780439377, "learning_rate": 6.651770054939722e-05, "loss": 1.5614, "step": 1720 }, { "epoch": 1.0332984918620278, "grad_norm": 0.051929574459791183, "learning_rate": 6.630887226323128e-05, "loss": 1.556, "step": 1730 }, { "epoch": 1.0392713155144095, "grad_norm": 0.06289497762918472, "learning_rate": 6.609877247946186e-05, "loss": 1.5634, "step": 1740 }, { "epoch": 1.045244139166791, "grad_norm": 0.05371445044875145, "learning_rate": 6.588741135213012e-05, "loss": 1.5645, "step": 1750 }, { "epoch": 1.0512169628191728, "grad_norm": 0.04851632937788963, "learning_rate": 6.567479909623746e-05, "loss": 1.5648, "step": 1760 }, { "epoch": 1.0571897864715545, "grad_norm": 0.06357111036777496, "learning_rate": 6.546094598725186e-05, "loss": 1.5568, "step": 1770 }, { "epoch": 1.063162610123936, "grad_norm": 0.07035905867815018, "learning_rate": 6.524586236061117e-05, "loss": 1.5519, "step": 1780 }, { "epoch": 1.0691354337763177, "grad_norm": 0.05517163127660751, "learning_rate": 6.502955861122377e-05, "loss": 1.5566, "step": 1790 }, { "epoch": 1.0751082574286994, "grad_norm": 0.0504322424530983, "learning_rate": 6.481204519296606e-05, "loss": 1.5668, "step": 1800 }, { "epoch": 1.0810810810810811, "grad_norm": 0.051910221576690674, "learning_rate": 6.459333261817726e-05, "loss": 1.5585, "step": 1810 }, { "epoch": 1.0870539047334629, "grad_norm": 0.07319536805152893, "learning_rate": 6.43734314571514e-05, "loss": 1.5599, "step": 1820 }, { "epoch": 1.0930267283858444, "grad_norm": 0.05212223529815674, "learning_rate": 6.415235233762635e-05, "loss": 1.5597, "step": 1830 }, { "epoch": 1.098999552038226, "grad_norm": 0.05524059012532234, "learning_rate": 6.393010594427034e-05, "loss": 1.5449, "step": 1840 }, { "epoch": 1.1049723756906078, "grad_norm": 0.044485364109277725, "learning_rate": 6.370670301816544e-05, "loss": 1.5584, "step": 1850 }, { "epoch": 1.1109451993429893, "grad_norm": 0.04716966673731804, "learning_rate": 6.348215435628852e-05, "loss": 1.5577, "step": 1860 }, { "epoch": 1.116918022995371, "grad_norm": 0.04776601493358612, "learning_rate": 6.32564708109894e-05, "loss": 1.5597, "step": 1870 }, { "epoch": 1.1228908466477527, "grad_norm": 0.05379948392510414, "learning_rate": 6.302966328946638e-05, "loss": 1.5542, "step": 1880 }, { "epoch": 1.1288636703001345, "grad_norm": 0.05076327919960022, "learning_rate": 6.280174275323915e-05, "loss": 1.5564, "step": 1890 }, { "epoch": 1.134836493952516, "grad_norm": 0.0562434047460556, "learning_rate": 6.257272021761884e-05, "loss": 1.5597, "step": 1900 }, { "epoch": 1.1408093176048977, "grad_norm": 0.045845337212085724, "learning_rate": 6.234260675117595e-05, "loss": 1.5535, "step": 1910 }, { "epoch": 1.1467821412572794, "grad_norm": 0.04580407217144966, "learning_rate": 6.21114134752051e-05, "loss": 1.5486, "step": 1920 }, { "epoch": 1.1527549649096611, "grad_norm": 0.05752042680978775, "learning_rate": 6.187915156318775e-05, "loss": 1.5454, "step": 1930 }, { "epoch": 1.1587277885620426, "grad_norm": 0.05608632043004036, "learning_rate": 6.164583224025215e-05, "loss": 1.5545, "step": 1940 }, { "epoch": 1.1647006122144243, "grad_norm": 0.047604430466890335, "learning_rate": 6.141146678263076e-05, "loss": 1.5531, "step": 1950 }, { "epoch": 1.170673435866806, "grad_norm": 0.04514037445187569, "learning_rate": 6.117606651711537e-05, "loss": 1.5547, "step": 1960 }, { "epoch": 1.1766462595191878, "grad_norm": 0.05768571048974991, "learning_rate": 6.0939642820509564e-05, "loss": 1.5496, "step": 1970 }, { "epoch": 1.1826190831715693, "grad_norm": 0.04222779721021652, "learning_rate": 6.070220711907903e-05, "loss": 1.5469, "step": 1980 }, { "epoch": 1.188591906823951, "grad_norm": 0.05183190852403641, "learning_rate": 6.046377088799923e-05, "loss": 1.5526, "step": 1990 }, { "epoch": 1.1945647304763327, "grad_norm": 0.04888539016246796, "learning_rate": 6.0224345650800826e-05, "loss": 1.5579, "step": 2000 }, { "epoch": 1.1945647304763327, "eval_loss": 1.5546131134033203, "eval_runtime": 20.1679, "eval_samples_per_second": 1717.237, "eval_steps_per_second": 13.437, "step": 2000 }, { "epoch": 1.2005375541287144, "grad_norm": 0.049841009080410004, "learning_rate": 5.998394297881277e-05, "loss": 1.5531, "step": 2010 }, { "epoch": 1.206510377781096, "grad_norm": 0.04911394044756889, "learning_rate": 5.974257449060306e-05, "loss": 1.5512, "step": 2020 }, { "epoch": 1.2124832014334777, "grad_norm": 0.05170886963605881, "learning_rate": 5.9500251851417206e-05, "loss": 1.5439, "step": 2030 }, { "epoch": 1.2184560250858594, "grad_norm": 0.04615171626210213, "learning_rate": 5.925698677261449e-05, "loss": 1.5453, "step": 2040 }, { "epoch": 1.224428848738241, "grad_norm": 0.04724368825554848, "learning_rate": 5.901279101110191e-05, "loss": 1.5434, "step": 2050 }, { "epoch": 1.2304016723906226, "grad_norm": 0.06991260498762131, "learning_rate": 5.8767676368766016e-05, "loss": 1.5489, "step": 2060 }, { "epoch": 1.2363744960430043, "grad_norm": 0.055575910955667496, "learning_rate": 5.852165469190251e-05, "loss": 1.5514, "step": 2070 }, { "epoch": 1.242347319695386, "grad_norm": 0.04874608293175697, "learning_rate": 5.82747378706437e-05, "loss": 1.5523, "step": 2080 }, { "epoch": 1.2483201433477678, "grad_norm": 0.05960864573717117, "learning_rate": 5.8026937838383914e-05, "loss": 1.5469, "step": 2090 }, { "epoch": 1.2542929670001493, "grad_norm": 0.07086056470870972, "learning_rate": 5.77782665712027e-05, "loss": 1.5497, "step": 2100 }, { "epoch": 1.260265790652531, "grad_norm": 0.0472436398267746, "learning_rate": 5.752873608728603e-05, "loss": 1.5425, "step": 2110 }, { "epoch": 1.2662386143049127, "grad_norm": 0.06843575835227966, "learning_rate": 5.7278358446345545e-05, "loss": 1.542, "step": 2120 }, { "epoch": 1.2722114379572944, "grad_norm": 0.04991114139556885, "learning_rate": 5.702714574903561e-05, "loss": 1.5423, "step": 2130 }, { "epoch": 1.278184261609676, "grad_norm": 0.04601559415459633, "learning_rate": 5.6775110136368576e-05, "loss": 1.5357, "step": 2140 }, { "epoch": 1.2841570852620576, "grad_norm": 0.042647868394851685, "learning_rate": 5.6522263789127937e-05, "loss": 1.5386, "step": 2150 }, { "epoch": 1.2901299089144393, "grad_norm": 0.06261768937110901, "learning_rate": 5.626861892727969e-05, "loss": 1.5428, "step": 2160 }, { "epoch": 1.2961027325668208, "grad_norm": 0.04735434427857399, "learning_rate": 5.601418780938175e-05, "loss": 1.5395, "step": 2170 }, { "epoch": 1.3020755562192026, "grad_norm": 0.048824459314346313, "learning_rate": 5.575898273199146e-05, "loss": 1.5418, "step": 2180 }, { "epoch": 1.3080483798715843, "grad_norm": 0.04974917694926262, "learning_rate": 5.5503016029071354e-05, "loss": 1.5371, "step": 2190 }, { "epoch": 1.314021203523966, "grad_norm": 0.05275791883468628, "learning_rate": 5.5246300071392985e-05, "loss": 1.5364, "step": 2200 }, { "epoch": 1.3199940271763477, "grad_norm": 0.0487825907766819, "learning_rate": 5.4988847265939146e-05, "loss": 1.5436, "step": 2210 }, { "epoch": 1.3259668508287292, "grad_norm": 0.06100558117032051, "learning_rate": 5.473067005530416e-05, "loss": 1.5351, "step": 2220 }, { "epoch": 1.331939674481111, "grad_norm": 0.07098929584026337, "learning_rate": 5.447178091709262e-05, "loss": 1.5463, "step": 2230 }, { "epoch": 1.3379124981334927, "grad_norm": 0.06729080528020859, "learning_rate": 5.421219236331624e-05, "loss": 1.5382, "step": 2240 }, { "epoch": 1.3438853217858742, "grad_norm": 0.05485675856471062, "learning_rate": 5.395191693978927e-05, "loss": 1.5349, "step": 2250 }, { "epoch": 1.3498581454382559, "grad_norm": 0.05816954746842384, "learning_rate": 5.3690967225522076e-05, "loss": 1.5406, "step": 2260 }, { "epoch": 1.3558309690906376, "grad_norm": 0.044427741318941116, "learning_rate": 5.342935583211327e-05, "loss": 1.5309, "step": 2270 }, { "epoch": 1.3618037927430193, "grad_norm": 0.05544894561171532, "learning_rate": 5.31670954031401e-05, "loss": 1.5365, "step": 2280 }, { "epoch": 1.367776616395401, "grad_norm": 0.04774465411901474, "learning_rate": 5.290419861354753e-05, "loss": 1.5303, "step": 2290 }, { "epoch": 1.3737494400477825, "grad_norm": 0.050910986959934235, "learning_rate": 5.264067816903552e-05, "loss": 1.5384, "step": 2300 }, { "epoch": 1.3797222637001643, "grad_norm": 0.05830187723040581, "learning_rate": 5.2376546805445054e-05, "loss": 1.535, "step": 2310 }, { "epoch": 1.385695087352546, "grad_norm": 0.0521889254450798, "learning_rate": 5.211181728814262e-05, "loss": 1.5348, "step": 2320 }, { "epoch": 1.3916679110049275, "grad_norm": 0.04742933064699173, "learning_rate": 5.18465024114032e-05, "loss": 1.5421, "step": 2330 }, { "epoch": 1.3976407346573092, "grad_norm": 0.05169609189033508, "learning_rate": 5.158061499779201e-05, "loss": 1.5322, "step": 2340 }, { "epoch": 1.403613558309691, "grad_norm": 0.05307742580771446, "learning_rate": 5.131416789754472e-05, "loss": 1.538, "step": 2350 }, { "epoch": 1.4095863819620726, "grad_norm": 0.04581635445356369, "learning_rate": 5.1047173987946474e-05, "loss": 1.5313, "step": 2360 }, { "epoch": 1.4155592056144544, "grad_norm": 0.04794102534651756, "learning_rate": 5.077964617270947e-05, "loss": 1.5357, "step": 2370 }, { "epoch": 1.4215320292668359, "grad_norm": 0.043038323521614075, "learning_rate": 5.051159738134937e-05, "loss": 1.5362, "step": 2380 }, { "epoch": 1.4275048529192176, "grad_norm": 0.052804794162511826, "learning_rate": 5.024304056856039e-05, "loss": 1.5299, "step": 2390 }, { "epoch": 1.4334776765715993, "grad_norm": 0.051046222448349, "learning_rate": 4.997398871358928e-05, "loss": 1.529, "step": 2400 }, { "epoch": 1.4394505002239808, "grad_norm": 0.056139182299375534, "learning_rate": 4.970445481960793e-05, "loss": 1.5368, "step": 2410 }, { "epoch": 1.4454233238763625, "grad_norm": 0.04890932887792587, "learning_rate": 4.9434451913085e-05, "loss": 1.5308, "step": 2420 }, { "epoch": 1.4513961475287442, "grad_norm": 0.04679281637072563, "learning_rate": 4.916399304315636e-05, "loss": 1.5353, "step": 2430 }, { "epoch": 1.457368971181126, "grad_norm": 0.05536729097366333, "learning_rate": 4.8893091280994415e-05, "loss": 1.5314, "step": 2440 }, { "epoch": 1.4633417948335075, "grad_norm": 0.04933058097958565, "learning_rate": 4.862175971917637e-05, "loss": 1.5301, "step": 2450 }, { "epoch": 1.4693146184858892, "grad_norm": 0.05884556844830513, "learning_rate": 4.835001147105148e-05, "loss": 1.5213, "step": 2460 }, { "epoch": 1.475287442138271, "grad_norm": 0.04465237259864807, "learning_rate": 4.807785967010729e-05, "loss": 1.5288, "step": 2470 }, { "epoch": 1.4812602657906524, "grad_norm": 0.04548431187868118, "learning_rate": 4.780531746933491e-05, "loss": 1.5353, "step": 2480 }, { "epoch": 1.4872330894430341, "grad_norm": 0.047798071056604385, "learning_rate": 4.7532398040593295e-05, "loss": 1.5261, "step": 2490 }, { "epoch": 1.4932059130954158, "grad_norm": 0.05616561323404312, "learning_rate": 4.7259114573972715e-05, "loss": 1.5343, "step": 2500 }, { "epoch": 1.4991787367477976, "grad_norm": 0.053861986845731735, "learning_rate": 4.6985480277157215e-05, "loss": 1.5249, "step": 2510 }, { "epoch": 1.5051515604001793, "grad_norm": 0.05890486761927605, "learning_rate": 4.671150837478634e-05, "loss": 1.5357, "step": 2520 }, { "epoch": 1.511124384052561, "grad_norm": 0.056382015347480774, "learning_rate": 4.643721210781601e-05, "loss": 1.5159, "step": 2530 }, { "epoch": 1.5170972077049425, "grad_norm": 0.051396943628787994, "learning_rate": 4.6162604732878515e-05, "loss": 1.5301, "step": 2540 }, { "epoch": 1.5230700313573242, "grad_norm": 0.04754629358649254, "learning_rate": 4.588769952164191e-05, "loss": 1.5277, "step": 2550 }, { "epoch": 1.5290428550097057, "grad_norm": 0.0532587394118309, "learning_rate": 4.561250976016851e-05, "loss": 1.5201, "step": 2560 }, { "epoch": 1.5350156786620874, "grad_norm": 0.059257134795188904, "learning_rate": 4.5337048748272905e-05, "loss": 1.5265, "step": 2570 }, { "epoch": 1.5409885023144692, "grad_norm": 0.05495699495077133, "learning_rate": 4.5061329798879064e-05, "loss": 1.5247, "step": 2580 }, { "epoch": 1.5469613259668509, "grad_norm": 0.04833153635263443, "learning_rate": 4.478536623737699e-05, "loss": 1.5291, "step": 2590 }, { "epoch": 1.5529341496192326, "grad_norm": 0.048605091869831085, "learning_rate": 4.450917140097869e-05, "loss": 1.5277, "step": 2600 }, { "epoch": 1.5589069732716143, "grad_norm": 0.06368768960237503, "learning_rate": 4.4232758638073585e-05, "loss": 1.5306, "step": 2610 }, { "epoch": 1.5648797969239958, "grad_norm": 0.04569351673126221, "learning_rate": 4.395614130758344e-05, "loss": 1.5208, "step": 2620 }, { "epoch": 1.5708526205763775, "grad_norm": 0.07877717167139053, "learning_rate": 4.367933277831666e-05, "loss": 1.5152, "step": 2630 }, { "epoch": 1.576825444228759, "grad_norm": 0.05059320852160454, "learning_rate": 4.34023464283222e-05, "loss": 1.5199, "step": 2640 }, { "epoch": 1.5827982678811408, "grad_norm": 0.05248813331127167, "learning_rate": 4.312519564424306e-05, "loss": 1.5236, "step": 2650 }, { "epoch": 1.5887710915335225, "grad_norm": 0.051895346492528915, "learning_rate": 4.2847893820669244e-05, "loss": 1.5225, "step": 2660 }, { "epoch": 1.5947439151859042, "grad_norm": 0.048129428178071976, "learning_rate": 4.2570454359490455e-05, "loss": 1.5259, "step": 2670 }, { "epoch": 1.600716738838286, "grad_norm": 0.049009375274181366, "learning_rate": 4.2292890669248364e-05, "loss": 1.533, "step": 2680 }, { "epoch": 1.6066895624906674, "grad_norm": 0.05925741046667099, "learning_rate": 4.2015216164488575e-05, "loss": 1.5242, "step": 2690 }, { "epoch": 1.6126623861430491, "grad_norm": 0.051209457218647, "learning_rate": 4.173744426511231e-05, "loss": 1.5348, "step": 2700 }, { "epoch": 1.6186352097954306, "grad_norm": 0.04731997102499008, "learning_rate": 4.1459588395727876e-05, "loss": 1.5179, "step": 2710 }, { "epoch": 1.6246080334478123, "grad_norm": 0.04640951007604599, "learning_rate": 4.118166198500178e-05, "loss": 1.5218, "step": 2720 }, { "epoch": 1.630580857100194, "grad_norm": 0.05060356855392456, "learning_rate": 4.090367846500976e-05, "loss": 1.5184, "step": 2730 }, { "epoch": 1.6365536807525758, "grad_norm": 0.04525948315858841, "learning_rate": 4.062565127058764e-05, "loss": 1.5207, "step": 2740 }, { "epoch": 1.6425265044049575, "grad_norm": 0.0447864904999733, "learning_rate": 4.0347593838682016e-05, "loss": 1.5265, "step": 2750 }, { "epoch": 1.6484993280573392, "grad_norm": 0.06339412927627563, "learning_rate": 4.006951960770084e-05, "loss": 1.5296, "step": 2760 }, { "epoch": 1.6544721517097207, "grad_norm": 0.05479173734784126, "learning_rate": 3.979144201686396e-05, "loss": 1.5167, "step": 2770 }, { "epoch": 1.6604449753621024, "grad_norm": 0.05605393648147583, "learning_rate": 3.951337450555361e-05, "loss": 1.5208, "step": 2780 }, { "epoch": 1.666417799014484, "grad_norm": 0.04500933736562729, "learning_rate": 3.923533051266486e-05, "loss": 1.5199, "step": 2790 }, { "epoch": 1.6723906226668657, "grad_norm": 0.044439464807510376, "learning_rate": 3.8957323475956165e-05, "loss": 1.5254, "step": 2800 }, { "epoch": 1.6783634463192474, "grad_norm": 0.051942795515060425, "learning_rate": 3.867936683139991e-05, "loss": 1.5168, "step": 2810 }, { "epoch": 1.684336269971629, "grad_norm": 0.05696643143892288, "learning_rate": 3.840147401253305e-05, "loss": 1.5261, "step": 2820 }, { "epoch": 1.6903090936240108, "grad_norm": 0.0423273928463459, "learning_rate": 3.812365844980782e-05, "loss": 1.5166, "step": 2830 }, { "epoch": 1.6962819172763925, "grad_norm": 0.04251600056886673, "learning_rate": 3.784593356994275e-05, "loss": 1.514, "step": 2840 }, { "epoch": 1.702254740928774, "grad_norm": 0.06778108328580856, "learning_rate": 3.7568312795273675e-05, "loss": 1.5161, "step": 2850 }, { "epoch": 1.7082275645811558, "grad_norm": 0.046843383461236954, "learning_rate": 3.729080954310509e-05, "loss": 1.5215, "step": 2860 }, { "epoch": 1.7142003882335373, "grad_norm": 0.04683705046772957, "learning_rate": 3.701343722506164e-05, "loss": 1.5191, "step": 2870 }, { "epoch": 1.720173211885919, "grad_norm": 0.04883548244833946, "learning_rate": 3.673620924644e-05, "loss": 1.5175, "step": 2880 }, { "epoch": 1.7261460355383007, "grad_norm": 0.047556836158037186, "learning_rate": 3.6459139005560966e-05, "loss": 1.5191, "step": 2890 }, { "epoch": 1.7321188591906824, "grad_norm": 0.04096701368689537, "learning_rate": 3.618223989312195e-05, "loss": 1.5195, "step": 2900 }, { "epoch": 1.7380916828430641, "grad_norm": 0.043791547417640686, "learning_rate": 3.590552529154974e-05, "loss": 1.5149, "step": 2910 }, { "epoch": 1.7440645064954459, "grad_norm": 0.06429862976074219, "learning_rate": 3.562900857435384e-05, "loss": 1.5136, "step": 2920 }, { "epoch": 1.7500373301478274, "grad_norm": 0.04811246693134308, "learning_rate": 3.535270310548007e-05, "loss": 1.5178, "step": 2930 }, { "epoch": 1.756010153800209, "grad_norm": 0.05720449239015579, "learning_rate": 3.5076622238664675e-05, "loss": 1.5112, "step": 2940 }, { "epoch": 1.7619829774525906, "grad_norm": 0.04717197269201279, "learning_rate": 3.480077931678899e-05, "loss": 1.5147, "step": 2950 }, { "epoch": 1.7679558011049723, "grad_norm": 0.04889809712767601, "learning_rate": 3.452518767123456e-05, "loss": 1.5186, "step": 2960 }, { "epoch": 1.773928624757354, "grad_norm": 0.055686600506305695, "learning_rate": 3.424986062123883e-05, "loss": 1.5105, "step": 2970 }, { "epoch": 1.7799014484097357, "grad_norm": 0.045671623200178146, "learning_rate": 3.397481147325146e-05, "loss": 1.5236, "step": 2980 }, { "epoch": 1.7858742720621175, "grad_norm": 0.0518915057182312, "learning_rate": 3.370005352029122e-05, "loss": 1.5082, "step": 2990 }, { "epoch": 1.7918470957144992, "grad_norm": 0.0466337613761425, "learning_rate": 3.342560004130351e-05, "loss": 1.5246, "step": 3000 }, { "epoch": 1.7918470957144992, "eval_loss": 1.5170252323150635, "eval_runtime": 20.1093, "eval_samples_per_second": 1722.235, "eval_steps_per_second": 13.476, "step": 3000 }, { "epoch": 1.7978199193668807, "grad_norm": 0.04238193854689598, "learning_rate": 3.3151464300518634e-05, "loss": 1.5097, "step": 3010 }, { "epoch": 1.8037927430192624, "grad_norm": 0.050784409046173096, "learning_rate": 3.2877659546810745e-05, "loss": 1.5195, "step": 3020 }, { "epoch": 1.809765566671644, "grad_norm": 0.04055749997496605, "learning_rate": 3.260419901305751e-05, "loss": 1.5171, "step": 3030 }, { "epoch": 1.8157383903240256, "grad_norm": 0.05311364307999611, "learning_rate": 3.2331095915500564e-05, "loss": 1.5136, "step": 3040 }, { "epoch": 1.8217112139764073, "grad_norm": 0.0499190054833889, "learning_rate": 3.205836345310681e-05, "loss": 1.5081, "step": 3050 }, { "epoch": 1.827684037628789, "grad_norm": 0.056762441992759705, "learning_rate": 3.178601480693048e-05, "loss": 1.5243, "step": 3060 }, { "epoch": 1.8336568612811708, "grad_norm": 0.04753740131855011, "learning_rate": 3.151406313947615e-05, "loss": 1.5069, "step": 3070 }, { "epoch": 1.8396296849335525, "grad_norm": 0.054608915001153946, "learning_rate": 3.124252159406251e-05, "loss": 1.5172, "step": 3080 }, { "epoch": 1.845602508585934, "grad_norm": 0.04840042069554329, "learning_rate": 3.097140329418726e-05, "loss": 1.5126, "step": 3090 }, { "epoch": 1.8515753322383157, "grad_norm": 0.05584624037146568, "learning_rate": 3.07007213428928e-05, "loss": 1.5091, "step": 3100 }, { "epoch": 1.8575481558906972, "grad_norm": 0.0425049252808094, "learning_rate": 3.0430488822132957e-05, "loss": 1.5155, "step": 3110 }, { "epoch": 1.863520979543079, "grad_norm": 0.043588876724243164, "learning_rate": 3.016071879214077e-05, "loss": 1.5099, "step": 3120 }, { "epoch": 1.8694938031954607, "grad_norm": 0.041503310203552246, "learning_rate": 2.989142429079725e-05, "loss": 1.509, "step": 3130 }, { "epoch": 1.8754666268478424, "grad_norm": 0.04797055944800377, "learning_rate": 2.962261833300133e-05, "loss": 1.507, "step": 3140 }, { "epoch": 1.881439450500224, "grad_norm": 0.05003626272082329, "learning_rate": 2.935431391004081e-05, "loss": 1.5177, "step": 3150 }, { "epoch": 1.8874122741526056, "grad_norm": 0.04475341737270355, "learning_rate": 2.9086523988964478e-05, "loss": 1.5077, "step": 3160 }, { "epoch": 1.8933850978049873, "grad_norm": 0.04602671042084694, "learning_rate": 2.881926151195547e-05, "loss": 1.5037, "step": 3170 }, { "epoch": 1.8993579214573688, "grad_norm": 0.04945210739970207, "learning_rate": 2.855253939570578e-05, "loss": 1.503, "step": 3180 }, { "epoch": 1.9053307451097505, "grad_norm": 0.04730582609772682, "learning_rate": 2.8286370530791914e-05, "loss": 1.5064, "step": 3190 }, { "epoch": 1.9113035687621323, "grad_norm": 0.05128956586122513, "learning_rate": 2.8020767781052016e-05, "loss": 1.5126, "step": 3200 }, { "epoch": 1.917276392414514, "grad_norm": 0.055559854954481125, "learning_rate": 2.7755743982964066e-05, "loss": 1.5052, "step": 3210 }, { "epoch": 1.9232492160668957, "grad_norm": 0.036298781633377075, "learning_rate": 2.749131194502555e-05, "loss": 1.5092, "step": 3220 }, { "epoch": 1.9292220397192774, "grad_norm": 0.042619943618774414, "learning_rate": 2.7227484447134398e-05, "loss": 1.5044, "step": 3230 }, { "epoch": 1.935194863371659, "grad_norm": 0.052806805819272995, "learning_rate": 2.696427423997138e-05, "loss": 1.5056, "step": 3240 }, { "epoch": 1.9411676870240406, "grad_norm": 0.044467948377132416, "learning_rate": 2.670169404438383e-05, "loss": 1.5114, "step": 3250 }, { "epoch": 1.9471405106764221, "grad_norm": 0.038638997822999954, "learning_rate": 2.6439756550770872e-05, "loss": 1.5154, "step": 3260 }, { "epoch": 1.9531133343288039, "grad_norm": 0.04845379292964935, "learning_rate": 2.617847441847007e-05, "loss": 1.51, "step": 3270 }, { "epoch": 1.9590861579811856, "grad_norm": 0.0445607528090477, "learning_rate": 2.5917860275145658e-05, "loss": 1.5047, "step": 3280 }, { "epoch": 1.9650589816335673, "grad_norm": 0.045905206352472305, "learning_rate": 2.5657926716178217e-05, "loss": 1.5118, "step": 3290 }, { "epoch": 1.971031805285949, "grad_norm": 0.04530317336320877, "learning_rate": 2.539868630405594e-05, "loss": 1.5099, "step": 3300 }, { "epoch": 1.9770046289383307, "grad_norm": 0.04195258021354675, "learning_rate": 2.5140151567767505e-05, "loss": 1.5075, "step": 3310 }, { "epoch": 1.9829774525907122, "grad_norm": 0.043815840035676956, "learning_rate": 2.4882335002196553e-05, "loss": 1.5096, "step": 3320 }, { "epoch": 1.988950276243094, "grad_norm": 0.04683714732527733, "learning_rate": 2.4625249067517803e-05, "loss": 1.5057, "step": 3330 }, { "epoch": 1.9949230998954754, "grad_norm": 0.049690209329128265, "learning_rate": 2.4368906188594877e-05, "loss": 1.5106, "step": 3340 }, { "epoch": 2.000895923547857, "grad_norm": 0.048324376344680786, "learning_rate": 2.4113318754379816e-05, "loss": 1.5042, "step": 3350 }, { "epoch": 2.006868747200239, "grad_norm": 0.05503029376268387, "learning_rate": 2.385849911731426e-05, "loss": 1.4922, "step": 3360 }, { "epoch": 2.0128415708526206, "grad_norm": 0.049435921013355255, "learning_rate": 2.360445959273255e-05, "loss": 1.4962, "step": 3370 }, { "epoch": 2.0188143945050023, "grad_norm": 0.05086649954319, "learning_rate": 2.3351212458266512e-05, "loss": 1.4918, "step": 3380 }, { "epoch": 2.024787218157384, "grad_norm": 0.045887332409620285, "learning_rate": 2.3098769953252002e-05, "loss": 1.4868, "step": 3390 }, { "epoch": 2.0307600418097658, "grad_norm": 0.04303443059325218, "learning_rate": 2.2847144278137502e-05, "loss": 1.4982, "step": 3400 }, { "epoch": 2.036732865462147, "grad_norm": 0.043649692088365555, "learning_rate": 2.2596347593894387e-05, "loss": 1.5, "step": 3410 }, { "epoch": 2.0427056891145288, "grad_norm": 0.04276139661669731, "learning_rate": 2.2346392021429254e-05, "loss": 1.4903, "step": 3420 }, { "epoch": 2.0486785127669105, "grad_norm": 0.04298582300543785, "learning_rate": 2.2097289640998074e-05, "loss": 1.5032, "step": 3430 }, { "epoch": 2.054651336419292, "grad_norm": 0.053750213235616684, "learning_rate": 2.1849052491622374e-05, "loss": 1.4942, "step": 3440 }, { "epoch": 2.060624160071674, "grad_norm": 0.042636483907699585, "learning_rate": 2.160169257050742e-05, "loss": 1.4976, "step": 3450 }, { "epoch": 2.0665969837240556, "grad_norm": 0.05124128982424736, "learning_rate": 2.135522183246237e-05, "loss": 1.4981, "step": 3460 }, { "epoch": 2.0725698073764374, "grad_norm": 0.047978244721889496, "learning_rate": 2.110965218932247e-05, "loss": 1.4975, "step": 3470 }, { "epoch": 2.078542631028819, "grad_norm": 0.045476969331502914, "learning_rate": 2.0864995509373448e-05, "loss": 1.4958, "step": 3480 }, { "epoch": 2.0845154546812004, "grad_norm": 0.05264231190085411, "learning_rate": 2.062126361677786e-05, "loss": 1.4996, "step": 3490 }, { "epoch": 2.090488278333582, "grad_norm": 0.05144358426332474, "learning_rate": 2.037846829100364e-05, "loss": 1.5077, "step": 3500 }, { "epoch": 2.096461101985964, "grad_norm": 0.048265036195516586, "learning_rate": 2.013662126625482e-05, "loss": 1.4987, "step": 3510 }, { "epoch": 2.1024339256383455, "grad_norm": 0.04586884751915932, "learning_rate": 1.9895734230904396e-05, "loss": 1.5044, "step": 3520 }, { "epoch": 2.1084067492907272, "grad_norm": 0.03930211812257767, "learning_rate": 1.965581882692949e-05, "loss": 1.4951, "step": 3530 }, { "epoch": 2.114379572943109, "grad_norm": 0.051928870379924774, "learning_rate": 1.9416886649348575e-05, "loss": 1.4962, "step": 3540 }, { "epoch": 2.1203523965954907, "grad_norm": 0.04466070607304573, "learning_rate": 1.917894924566125e-05, "loss": 1.4874, "step": 3550 }, { "epoch": 2.126325220247872, "grad_norm": 0.044879212975502014, "learning_rate": 1.8942018115290063e-05, "loss": 1.4896, "step": 3560 }, { "epoch": 2.1322980439002537, "grad_norm": 0.04508794844150543, "learning_rate": 1.8706104709024715e-05, "loss": 1.4915, "step": 3570 }, { "epoch": 2.1382708675526354, "grad_norm": 0.06577686965465546, "learning_rate": 1.8471220428468745e-05, "loss": 1.4981, "step": 3580 }, { "epoch": 2.144243691205017, "grad_norm": 0.03995177894830704, "learning_rate": 1.823737662548843e-05, "loss": 1.4973, "step": 3590 }, { "epoch": 2.150216514857399, "grad_norm": 0.06114717572927475, "learning_rate": 1.800458460166417e-05, "loss": 1.4942, "step": 3600 }, { "epoch": 2.1561893385097806, "grad_norm": 0.04745366424322128, "learning_rate": 1.7772855607744284e-05, "loss": 1.5004, "step": 3610 }, { "epoch": 2.1621621621621623, "grad_norm": 0.045220714062452316, "learning_rate": 1.7542200843101267e-05, "loss": 1.494, "step": 3620 }, { "epoch": 2.168134985814544, "grad_norm": 0.04914199188351631, "learning_rate": 1.7312631455190528e-05, "loss": 1.491, "step": 3630 }, { "epoch": 2.1741078094669257, "grad_norm": 0.044854309409856796, "learning_rate": 1.708415853901166e-05, "loss": 1.4974, "step": 3640 }, { "epoch": 2.180080633119307, "grad_norm": 0.0511915348470211, "learning_rate": 1.6856793136572155e-05, "loss": 1.4978, "step": 3650 }, { "epoch": 2.1860534567716887, "grad_norm": 0.052235160022974014, "learning_rate": 1.6630546236353833e-05, "loss": 1.4884, "step": 3660 }, { "epoch": 2.1920262804240704, "grad_norm": 0.03959416225552559, "learning_rate": 1.6405428772781724e-05, "loss": 1.4897, "step": 3670 }, { "epoch": 2.197999104076452, "grad_norm": 0.04642707481980324, "learning_rate": 1.618145162569563e-05, "loss": 1.489, "step": 3680 }, { "epoch": 2.203971927728834, "grad_norm": 0.05590491741895676, "learning_rate": 1.5958625619824286e-05, "loss": 1.4946, "step": 3690 }, { "epoch": 2.2099447513812156, "grad_norm": 0.050484009087085724, "learning_rate": 1.5736961524262232e-05, "loss": 1.5011, "step": 3700 }, { "epoch": 2.2159175750335973, "grad_norm": 0.04109204187989235, "learning_rate": 1.551647005194932e-05, "loss": 1.4993, "step": 3710 }, { "epoch": 2.2218903986859786, "grad_norm": 0.04570942744612694, "learning_rate": 1.5297161859152986e-05, "loss": 1.491, "step": 3720 }, { "epoch": 2.2278632223383603, "grad_norm": 0.041420578956604004, "learning_rate": 1.5079047544953227e-05, "loss": 1.4874, "step": 3730 }, { "epoch": 2.233836045990742, "grad_norm": 0.04918381944298744, "learning_rate": 1.486213765073032e-05, "loss": 1.4939, "step": 3740 }, { "epoch": 2.2398088696431238, "grad_norm": 0.05086056888103485, "learning_rate": 1.4646442659655425e-05, "loss": 1.4992, "step": 3750 }, { "epoch": 2.2457816932955055, "grad_norm": 0.061345502734184265, "learning_rate": 1.4431972996183894e-05, "loss": 1.4935, "step": 3760 }, { "epoch": 2.251754516947887, "grad_norm": 0.03802775219082832, "learning_rate": 1.4218739025551469e-05, "loss": 1.487, "step": 3770 }, { "epoch": 2.257727340600269, "grad_norm": 0.039830368012189865, "learning_rate": 1.4006751053273338e-05, "loss": 1.4943, "step": 3780 }, { "epoch": 2.2637001642526506, "grad_norm": 0.04441362991929054, "learning_rate": 1.3796019324646062e-05, "loss": 1.4907, "step": 3790 }, { "epoch": 2.269672987905032, "grad_norm": 0.04267200455069542, "learning_rate": 1.358655402425245e-05, "loss": 1.4905, "step": 3800 }, { "epoch": 2.2756458115574136, "grad_norm": 0.04467471316456795, "learning_rate": 1.3378365275469322e-05, "loss": 1.4865, "step": 3810 }, { "epoch": 2.2816186352097954, "grad_norm": 0.04877958446741104, "learning_rate": 1.3171463139978222e-05, "loss": 1.4978, "step": 3820 }, { "epoch": 2.287591458862177, "grad_norm": 0.04458734765648842, "learning_rate": 1.2965857617279216e-05, "loss": 1.4931, "step": 3830 }, { "epoch": 2.293564282514559, "grad_norm": 0.043027278035879135, "learning_rate": 1.2761558644207547e-05, "loss": 1.495, "step": 3840 }, { "epoch": 2.2995371061669405, "grad_norm": 0.03808119520545006, "learning_rate": 1.2558576094453435e-05, "loss": 1.4922, "step": 3850 }, { "epoch": 2.3055099298193222, "grad_norm": 0.038997333496809006, "learning_rate": 1.2356919778084867e-05, "loss": 1.4915, "step": 3860 }, { "epoch": 2.3114827534717035, "grad_norm": 0.04020654410123825, "learning_rate": 1.2156599441073488e-05, "loss": 1.4874, "step": 3870 }, { "epoch": 2.3174555771240852, "grad_norm": 0.04891055077314377, "learning_rate": 1.1957624764823566e-05, "loss": 1.5016, "step": 3880 }, { "epoch": 2.323428400776467, "grad_norm": 0.046524520963430405, "learning_rate": 1.176000536570412e-05, "loss": 1.4928, "step": 3890 }, { "epoch": 2.3294012244288487, "grad_norm": 0.04302162304520607, "learning_rate": 1.1563750794584156e-05, "loss": 1.4905, "step": 3900 }, { "epoch": 2.3353740480812304, "grad_norm": 0.046545591205358505, "learning_rate": 1.1368870536371036e-05, "loss": 1.4911, "step": 3910 }, { "epoch": 2.341346871733612, "grad_norm": 0.04680660367012024, "learning_rate": 1.1175374009552159e-05, "loss": 1.4832, "step": 3920 }, { "epoch": 2.347319695385994, "grad_norm": 0.04679818078875542, "learning_rate": 1.0983270565739668e-05, "loss": 1.4892, "step": 3930 }, { "epoch": 2.3532925190383756, "grad_norm": 0.04409361630678177, "learning_rate": 1.0792569489218598e-05, "loss": 1.4907, "step": 3940 }, { "epoch": 2.3592653426907573, "grad_norm": 0.04122375324368477, "learning_rate": 1.0603279996498089e-05, "loss": 1.4936, "step": 3950 }, { "epoch": 2.3652381663431385, "grad_norm": 0.045084912329912186, "learning_rate": 1.0415411235865979e-05, "loss": 1.4852, "step": 3960 }, { "epoch": 2.3712109899955203, "grad_norm": 0.04110685735940933, "learning_rate": 1.0228972286946695e-05, "loss": 1.494, "step": 3970 }, { "epoch": 2.377183813647902, "grad_norm": 0.04527169466018677, "learning_rate": 1.0043972160262392e-05, "loss": 1.4955, "step": 3980 }, { "epoch": 2.3831566373002837, "grad_norm": 0.04808187112212181, "learning_rate": 9.860419796797527e-06, "loss": 1.4858, "step": 3990 }, { "epoch": 2.3891294609526654, "grad_norm": 0.03969137370586395, "learning_rate": 9.678324067566716e-06, "loss": 1.497, "step": 4000 }, { "epoch": 2.3891294609526654, "eval_loss": 1.4980565309524536, "eval_runtime": 20.0226, "eval_samples_per_second": 1729.697, "eval_steps_per_second": 13.535, "step": 4000 }, { "epoch": 2.395102284605047, "grad_norm": 0.039191678166389465, "learning_rate": 9.497693773185985e-06, "loss": 1.491, "step": 4010 }, { "epoch": 2.401075108257429, "grad_norm": 0.04326602816581726, "learning_rate": 9.318537643447488e-06, "loss": 1.4897, "step": 4020 }, { "epoch": 2.40704793190981, "grad_norm": 0.04062432423233986, "learning_rate": 9.140864336897559e-06, "loss": 1.4834, "step": 4030 }, { "epoch": 2.413020755562192, "grad_norm": 0.043511949479579926, "learning_rate": 8.964682440418272e-06, "loss": 1.4899, "step": 4040 }, { "epoch": 2.4189935792145736, "grad_norm": 0.041364822536706924, "learning_rate": 8.79000046881242e-06, "loss": 1.4876, "step": 4050 }, { "epoch": 2.4249664028669553, "grad_norm": 0.03720170632004738, "learning_rate": 8.61682686439202e-06, "loss": 1.4926, "step": 4060 }, { "epoch": 2.430939226519337, "grad_norm": 0.04620780423283577, "learning_rate": 8.44516999657027e-06, "loss": 1.4929, "step": 4070 }, { "epoch": 2.4369120501717187, "grad_norm": 0.03785783797502518, "learning_rate": 8.275038161457094e-06, "loss": 1.4917, "step": 4080 }, { "epoch": 2.4428848738241005, "grad_norm": 0.047655072063207626, "learning_rate": 8.106439581458177e-06, "loss": 1.4923, "step": 4090 }, { "epoch": 2.448857697476482, "grad_norm": 0.04838723689317703, "learning_rate": 7.939382404877545e-06, "loss": 1.4902, "step": 4100 }, { "epoch": 2.454830521128864, "grad_norm": 0.0498916357755661, "learning_rate": 7.773874705523826e-06, "loss": 1.4846, "step": 4110 }, { "epoch": 2.460803344781245, "grad_norm": 0.044865112751722336, "learning_rate": 7.609924482320013e-06, "loss": 1.4867, "step": 4120 }, { "epoch": 2.466776168433627, "grad_norm": 0.041775912046432495, "learning_rate": 7.447539658916869e-06, "loss": 1.4869, "step": 4130 }, { "epoch": 2.4727489920860086, "grad_norm": 0.03888450190424919, "learning_rate": 7.286728083309995e-06, "loss": 1.4824, "step": 4140 }, { "epoch": 2.4787218157383903, "grad_norm": 0.05169163644313812, "learning_rate": 7.127497527460541e-06, "loss": 1.4856, "step": 4150 }, { "epoch": 2.484694639390772, "grad_norm": 0.04095705598592758, "learning_rate": 6.969855686919573e-06, "loss": 1.4899, "step": 4160 }, { "epoch": 2.490667463043154, "grad_norm": 0.0429367758333683, "learning_rate": 6.81381018045618e-06, "loss": 1.4848, "step": 4170 }, { "epoch": 2.4966402866955355, "grad_norm": 0.04392432048916817, "learning_rate": 6.659368549689209e-06, "loss": 1.4832, "step": 4180 }, { "epoch": 2.502613110347917, "grad_norm": 0.04673699662089348, "learning_rate": 6.506538258722859e-06, "loss": 1.4855, "step": 4190 }, { "epoch": 2.5085859340002985, "grad_norm": 0.04074994474649429, "learning_rate": 6.355326693785868e-06, "loss": 1.4789, "step": 4200 }, { "epoch": 2.51455875765268, "grad_norm": 0.035382091999053955, "learning_rate": 6.2057411628745875e-06, "loss": 1.4862, "step": 4210 }, { "epoch": 2.520531581305062, "grad_norm": 0.03829929605126381, "learning_rate": 6.057788895399781e-06, "loss": 1.4852, "step": 4220 }, { "epoch": 2.5265044049574437, "grad_norm": 0.04219154641032219, "learning_rate": 5.9114770418372015e-06, "loss": 1.4865, "step": 4230 }, { "epoch": 2.5324772286098254, "grad_norm": 0.04591584950685501, "learning_rate": 5.7668126733820476e-06, "loss": 1.4737, "step": 4240 }, { "epoch": 2.538450052262207, "grad_norm": 0.045854389667510986, "learning_rate": 5.623802781607204e-06, "loss": 1.4872, "step": 4250 }, { "epoch": 2.544422875914589, "grad_norm": 0.04153481870889664, "learning_rate": 5.48245427812534e-06, "loss": 1.4806, "step": 4260 }, { "epoch": 2.5503956995669705, "grad_norm": 0.03822470083832741, "learning_rate": 5.342773994254842e-06, "loss": 1.4792, "step": 4270 }, { "epoch": 2.556368523219352, "grad_norm": 0.03870686888694763, "learning_rate": 5.204768680689727e-06, "loss": 1.4771, "step": 4280 }, { "epoch": 2.5623413468717335, "grad_norm": 0.05567542836070061, "learning_rate": 5.068445007173331e-06, "loss": 1.4812, "step": 4290 }, { "epoch": 2.5683141705241153, "grad_norm": 0.03914303705096245, "learning_rate": 4.933809562175982e-06, "loss": 1.4952, "step": 4300 }, { "epoch": 2.574286994176497, "grad_norm": 0.04728810861706734, "learning_rate": 4.800868852576561e-06, "loss": 1.4813, "step": 4310 }, { "epoch": 2.5802598178288787, "grad_norm": 0.04394581541419029, "learning_rate": 4.669629303348066e-06, "loss": 1.4779, "step": 4320 }, { "epoch": 2.5862326414812604, "grad_norm": 0.042139682918787, "learning_rate": 4.540097257247062e-06, "loss": 1.4847, "step": 4330 }, { "epoch": 2.5922054651336417, "grad_norm": 0.04580564424395561, "learning_rate": 4.412278974507151e-06, "loss": 1.4767, "step": 4340 }, { "epoch": 2.5981782887860234, "grad_norm": 0.03395635262131691, "learning_rate": 4.286180632536421e-06, "loss": 1.4871, "step": 4350 }, { "epoch": 2.604151112438405, "grad_norm": 0.04606311395764351, "learning_rate": 4.161808325618886e-06, "loss": 1.4865, "step": 4360 }, { "epoch": 2.610123936090787, "grad_norm": 0.046741172671318054, "learning_rate": 4.039168064619938e-06, "loss": 1.4896, "step": 4370 }, { "epoch": 2.6160967597431686, "grad_norm": 0.04130960628390312, "learning_rate": 3.918265776695891e-06, "loss": 1.4837, "step": 4380 }, { "epoch": 2.6220695833955503, "grad_norm": 0.043055951595306396, "learning_rate": 3.7991073050074678e-06, "loss": 1.4841, "step": 4390 }, { "epoch": 2.628042407047932, "grad_norm": 0.04418269917368889, "learning_rate": 3.6816984084374485e-06, "loss": 1.4831, "step": 4400 }, { "epoch": 2.6340152307003137, "grad_norm": 0.036886971443891525, "learning_rate": 3.5660447613123086e-06, "loss": 1.4892, "step": 4410 }, { "epoch": 2.6399880543526955, "grad_norm": 0.04421091824769974, "learning_rate": 3.452151953128007e-06, "loss": 1.4848, "step": 4420 }, { "epoch": 2.645960878005077, "grad_norm": 0.042877208441495895, "learning_rate": 3.3400254882798435e-06, "loss": 1.4888, "step": 4430 }, { "epoch": 2.6519337016574585, "grad_norm": 0.04234934598207474, "learning_rate": 3.2296707857964125e-06, "loss": 1.4796, "step": 4440 }, { "epoch": 2.65790652530984, "grad_norm": 0.035217370837926865, "learning_rate": 3.121093179077739e-06, "loss": 1.481, "step": 4450 }, { "epoch": 2.663879348962222, "grad_norm": 0.040508221834897995, "learning_rate": 3.0142979156374806e-06, "loss": 1.4819, "step": 4460 }, { "epoch": 2.6698521726146036, "grad_norm": 0.041981033980846405, "learning_rate": 2.9092901568493446e-06, "loss": 1.4804, "step": 4470 }, { "epoch": 2.6758249962669853, "grad_norm": 0.03790983185172081, "learning_rate": 2.80607497769763e-06, "loss": 1.4894, "step": 4480 }, { "epoch": 2.6817978199193666, "grad_norm": 0.038940299302339554, "learning_rate": 2.70465736653196e-06, "loss": 1.4827, "step": 4490 }, { "epoch": 2.6877706435717483, "grad_norm": 0.04031272605061531, "learning_rate": 2.605042224826182e-06, "loss": 1.4845, "step": 4500 } ], "logging_steps": 10, "max_steps": 5022, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.9327446823064306e+19, "train_batch_size": 64, "trial_name": null, "trial_params": null }