{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.6877706435717483,
  "eval_steps": 1000,
  "global_step": 4500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.005972823652381664, "grad_norm": 0.5743309259414673, "learning_rate": 1.5904572564612327e-06, "loss": 2.7537, "step": 10 },
    { "epoch": 0.011945647304763328, "grad_norm": 0.5460094809532166, "learning_rate": 3.1809145129224655e-06, "loss": 2.7612, "step": 20 },
    { "epoch": 0.01791847095714499, "grad_norm": 0.5363145470619202, "learning_rate": 4.7713717693836985e-06, "loss": 2.7609, "step": 30 },
    { "epoch": 0.023891294609526655, "grad_norm": 0.5279455184936523, "learning_rate": 6.361829025844931e-06, "loss": 2.7607, "step": 40 },
    { "epoch": 0.029864118261908316, "grad_norm": 0.5061234831809998, "learning_rate": 7.952286282306164e-06, "loss": 2.784, "step": 50 },
    { "epoch": 0.03583694191428998, "grad_norm": 0.476898729801178, "learning_rate": 9.542743538767397e-06, "loss": 2.762, "step": 60 },
    { "epoch": 0.041809765566671646, "grad_norm": 0.4454072415828705, "learning_rate": 1.113320079522863e-05, "loss": 2.7716, "step": 70 },
    { "epoch": 0.04778258921905331, "grad_norm": 3.1541287899017334, "learning_rate": 1.2723658051689862e-05, "loss": 2.8849, "step": 80 },
    { "epoch": 0.05375541287143497, "grad_norm": 0.19107532501220703, "learning_rate": 1.4314115308151095e-05, "loss": 3.1147, "step": 90 },
    { "epoch": 0.05972823652381663, "grad_norm": 0.13281038403511047, "learning_rate": 1.590457256461233e-05, "loss": 2.5574, "step": 100 },
    { "epoch": 0.0657010601761983, "grad_norm": 0.08191326260566711, "learning_rate": 1.749502982107356e-05, "loss": 2.4446, "step": 110 },
    { "epoch": 0.07167388382857996, "grad_norm": 0.08300579339265823, "learning_rate": 1.9085487077534794e-05, "loss": 2.3524, "step": 120 },
    { "epoch": 0.07764670748096163, "grad_norm": 0.0590679906308651, "learning_rate": 2.0675944333996028e-05, "loss": 2.2819, "step": 130 },
    { "epoch": 0.08361953113334329, "grad_norm": 0.052923623472452164, "learning_rate": 2.226640159045726e-05, "loss": 2.2261, "step": 140 },
    { "epoch": 0.08959235478572496, "grad_norm": 0.05208205804228783, "learning_rate": 2.385685884691849e-05, "loss": 2.1889, "step": 150 },
    { "epoch": 0.09556517843810662, "grad_norm": 0.0485885925590992, "learning_rate": 2.5447316103379724e-05, "loss": 2.1694, "step": 160 },
    { "epoch": 0.10153800209048827, "grad_norm": 0.04901551082730293, "learning_rate": 2.7037773359840955e-05, "loss": 2.1272, "step": 170 },
    { "epoch": 0.10751082574286994, "grad_norm": 0.04524153470993042, "learning_rate": 2.862823061630219e-05, "loss": 2.1085, "step": 180 },
    { "epoch": 0.1134836493952516, "grad_norm": 0.04201298579573631, "learning_rate": 3.021868787276342e-05, "loss": 2.0902, "step": 190 },
    { "epoch": 0.11945647304763327, "grad_norm": 0.053612083196640015, "learning_rate": 3.180914512922466e-05, "loss": 2.0855, "step": 200 },
    { "epoch": 0.12542929670001493, "grad_norm": 0.04812688007950783, "learning_rate": 3.3399602385685885e-05, "loss": 2.0469, "step": 210 },
    { "epoch": 0.1314021203523966, "grad_norm": 0.0483262836933136, "learning_rate": 3.499005964214712e-05, "loss": 2.0264, "step": 220 },
    { "epoch": 0.13737494400477826, "grad_norm": 0.05456310138106346, "learning_rate": 3.6580516898608353e-05, "loss": 2.0201, "step": 230 },
    { "epoch": 0.14334776765715992, "grad_norm": 0.06978671252727509, "learning_rate": 3.817097415506959e-05, "loss": 1.9967, "step": 240 },
    { "epoch": 0.1493205913095416, "grad_norm": 0.049219317734241486, "learning_rate": 3.976143141153082e-05, "loss": 1.9909, "step": 250 },
    { "epoch": 0.15529341496192325, "grad_norm": 0.04814588651061058, "learning_rate": 4.1351888667992056e-05, "loss": 1.9793, "step": 260 },
    { "epoch": 0.16126623861430492, "grad_norm": 0.06128086522221565, "learning_rate": 4.2942345924453284e-05, "loss": 1.9703, "step": 270 },
    { "epoch": 0.16723906226668658, "grad_norm": 0.06803273409605026, "learning_rate": 4.453280318091452e-05, "loss": 1.9484, "step": 280 },
    { "epoch": 0.17321188591906825, "grad_norm": 0.06598497182130814, "learning_rate": 4.612326043737575e-05, "loss": 1.9251, "step": 290 },
    { "epoch": 0.1791847095714499, "grad_norm": 0.05581754818558693, "learning_rate": 4.771371769383698e-05, "loss": 1.9211, "step": 300 },
    { "epoch": 0.18515753322383158, "grad_norm": 0.06264442205429077, "learning_rate": 4.9304174950298214e-05, "loss": 1.9047, "step": 310 },
    { "epoch": 0.19113035687621324, "grad_norm": 0.05809122323989868, "learning_rate": 5.089463220675945e-05, "loss": 1.8948, "step": 320 },
    { "epoch": 0.1971031805285949, "grad_norm": 0.05478562042117119, "learning_rate": 5.248508946322068e-05, "loss": 1.8924, "step": 330 },
    { "epoch": 0.20307600418097654, "grad_norm": 0.060149796307086945, "learning_rate": 5.407554671968191e-05, "loss": 1.8776, "step": 340 },
    { "epoch": 0.2090488278333582, "grad_norm": 0.06282585859298706, "learning_rate": 5.5666003976143144e-05, "loss": 1.8752, "step": 350 },
    { "epoch": 0.21502165148573987, "grad_norm": 0.06441989541053772, "learning_rate": 5.725646123260438e-05, "loss": 1.8632, "step": 360 },
    { "epoch": 0.22099447513812154, "grad_norm": 0.05681062117218971, "learning_rate": 5.8846918489065606e-05, "loss": 1.8475, "step": 370 },
    { "epoch": 0.2269672987905032, "grad_norm": 0.05155131593346596, "learning_rate": 6.043737574552684e-05, "loss": 1.8431, "step": 380 },
    { "epoch": 0.23294012244288487, "grad_norm": 0.05347074940800667, "learning_rate": 6.202783300198807e-05, "loss": 1.8416, "step": 390 },
    { "epoch": 0.23891294609526653, "grad_norm": 0.06694310158491135, "learning_rate": 6.361829025844931e-05, "loss": 1.8344, "step": 400 },
    { "epoch": 0.2448857697476482, "grad_norm": 0.06079185754060745, "learning_rate": 6.520874751491054e-05, "loss": 1.8297, "step": 410 },
    { "epoch": 0.25085859340002986, "grad_norm": 0.05415233224630356, "learning_rate": 6.679920477137177e-05, "loss": 1.82, "step": 420 },
    { "epoch": 0.2568314170524115, "grad_norm": 0.0645110234618187, "learning_rate": 6.838966202783301e-05, "loss": 1.8137, "step": 430 },
    { "epoch": 0.2628042407047932, "grad_norm": 0.06045007333159447, "learning_rate": 6.998011928429424e-05, "loss": 1.8048, "step": 440 },
    { "epoch": 0.26877706435717486, "grad_norm": 0.05600131303071976, "learning_rate": 7.157057654075547e-05, "loss": 1.7854, "step": 450 },
    { "epoch": 0.2747498880095565, "grad_norm": 0.06498062610626221, "learning_rate": 7.316103379721671e-05, "loss": 1.798, "step": 460 },
    { "epoch": 0.2807227116619382, "grad_norm": 0.053577929735183716, "learning_rate": 7.475149105367795e-05, "loss": 1.7883, "step": 470 },
    { "epoch": 0.28669553531431985, "grad_norm": 0.09097382426261902, "learning_rate": 7.634194831013918e-05, "loss": 1.78, "step": 480 },
    { "epoch": 0.2926683589667015, "grad_norm": 0.057212598621845245, "learning_rate": 7.79324055666004e-05, "loss": 1.7705, "step": 490 },
    { "epoch": 0.2986411826190832, "grad_norm": 0.055311623960733414, "learning_rate": 7.952286282306164e-05, "loss": 1.7739, "step": 500 },
    { "epoch": 0.30461400627146484, "grad_norm": 0.07679615169763565, "learning_rate": 7.999952636882403e-05, "loss": 1.7705, "step": 510 },
    { "epoch": 0.3105868299238465, "grad_norm": 0.10281822085380554, "learning_rate": 7.999720656965739e-05, "loss": 1.7639, "step": 520 },
    { "epoch": 0.3165596535762282, "grad_norm": 0.07636060565710068, "learning_rate": 7.999295372099362e-05, "loss": 1.7539, "step": 530 },
    { "epoch": 0.32253247722860984, "grad_norm": 0.057714689522981644, "learning_rate": 7.998676802837124e-05, "loss": 1.7541, "step": 540 },
    { "epoch": 0.3285053008809915, "grad_norm": 0.06505981832742691, "learning_rate": 7.997864979074237e-05, "loss": 1.7487, "step": 550 },
    { "epoch": 0.33447812453337317, "grad_norm": 0.05842842161655426, "learning_rate": 7.996859940045832e-05, "loss": 1.739, "step": 560 },
    { "epoch": 0.34045094818575483, "grad_norm": 0.051559966057538986, "learning_rate": 7.995661734325054e-05, "loss": 1.7443, "step": 570 },
    { "epoch": 0.3464237718381365, "grad_norm": 0.20853149890899658, "learning_rate": 7.994270419820721e-05, "loss": 1.7719, "step": 580 },
    { "epoch": 0.35239659549051816, "grad_norm": 0.09151974320411682, "learning_rate": 7.992686063774525e-05, "loss": 1.7817, "step": 590 },
    { "epoch": 0.3583694191428998, "grad_norm": 0.05926055088639259, "learning_rate": 7.99090874275778e-05, "loss": 1.7469, "step": 600 },
    { "epoch": 0.3643422427952815, "grad_norm": 0.044228848069906235, "learning_rate": 7.988938542667721e-05, "loss": 1.7393, "step": 610 },
    { "epoch": 0.37031506644766315, "grad_norm": 0.0427553653717041, "learning_rate": 7.986775558723355e-05, "loss": 1.7307, "step": 620 },
    { "epoch": 0.3762878901000448, "grad_norm": 0.0548509880900383, "learning_rate": 7.984419895460858e-05, "loss": 1.7205, "step": 630 },
    { "epoch": 0.3822607137524265, "grad_norm": 0.057041749358177185, "learning_rate": 7.981871666728525e-05, "loss": 1.7225, "step": 640 },
    { "epoch": 0.38823353740480815, "grad_norm": 0.056601762771606445, "learning_rate": 7.979130995681263e-05, "loss": 1.7088, "step": 650 },
    { "epoch": 0.3942063610571898, "grad_norm": 0.06844093650579453, "learning_rate": 7.976198014774637e-05, "loss": 1.7073, "step": 660 },
    { "epoch": 0.4001791847095714, "grad_norm": 0.0546780526638031, "learning_rate": 7.973072865758483e-05, "loss": 1.7121, "step": 670 },
    { "epoch": 0.4061520083619531, "grad_norm": 0.04654558375477791, "learning_rate": 7.969755699670041e-05, "loss": 1.6951, "step": 680 },
    { "epoch": 0.41212483201433475, "grad_norm": 0.06478898227214813, "learning_rate": 7.966246676826661e-05, "loss": 1.7055, "step": 690 },
    { "epoch": 0.4180976556667164, "grad_norm": 0.06878198683261871, "learning_rate": 7.962545966818062e-05, "loss": 1.6987, "step": 700 },
    { "epoch": 0.4240704793190981, "grad_norm": 0.05675249919295311, "learning_rate": 7.95865374849812e-05, "loss": 1.6998, "step": 710 },
    { "epoch": 0.43004330297147975, "grad_norm": 0.05516457185149193, "learning_rate": 7.954570209976239e-05, "loss": 1.6852, "step": 720 },
    { "epoch": 0.4360161266238614, "grad_norm": 0.05688585340976715, "learning_rate": 7.950295548608256e-05, "loss": 1.6901, "step": 730 },
    { "epoch": 0.4419889502762431, "grad_norm": 0.07187242805957794, "learning_rate": 7.945829970986898e-05, "loss": 1.6894, "step": 740 },
    { "epoch": 0.44796177392862474, "grad_norm": 0.0548662506043911, "learning_rate": 7.941173692931801e-05, "loss": 1.6819, "step": 750 },
    { "epoch": 0.4539345975810064, "grad_norm": 0.0926741436123848, "learning_rate": 7.93632693947908e-05, "loss": 1.6797, "step": 760 },
    { "epoch": 0.45990742123338807, "grad_norm": 0.04921697825193405, "learning_rate": 7.931289944870448e-05, "loss": 1.6629, "step": 770 },
    { "epoch": 0.46588024488576973, "grad_norm": 0.07487112283706665, "learning_rate": 7.92606295254191e-05, "loss": 1.6737, "step": 780 },
    { "epoch": 0.4718530685381514, "grad_norm": 0.07180643826723099, "learning_rate": 7.920646215111973e-05, "loss": 1.6716, "step": 790 },
    { "epoch": 0.47782589219053306, "grad_norm": 0.050522662699222565, "learning_rate": 7.915039994369462e-05, "loss": 1.6597, "step": 800 },
    { "epoch": 0.48379871584291473, "grad_norm": 0.0628654807806015, "learning_rate": 7.909244561260855e-05, "loss": 1.6722, "step": 810 },
    { "epoch": 0.4897715394952964, "grad_norm": 0.07348821312189102, "learning_rate": 7.903260195877184e-05, "loss": 1.6718, "step": 820 },
    { "epoch": 0.49574436314767806, "grad_norm": 0.0689951702952385, "learning_rate": 7.897087187440512e-05, "loss": 1.6658, "step": 830 },
    { "epoch": 0.5017171868000597, "grad_norm": 0.05663711205124855, "learning_rate": 7.890725834289946e-05, "loss": 1.6636, "step": 840 },
    { "epoch": 0.5076900104524414, "grad_norm": 0.050597622990608215, "learning_rate": 7.884176443867219e-05, "loss": 1.6648, "step": 850 },
    { "epoch": 0.513662834104823, "grad_norm": 0.05792626738548279, "learning_rate": 7.87743933270183e-05, "loss": 1.6582, "step": 860 },
    { "epoch": 0.5196356577572048, "grad_norm": 0.05193015933036804, "learning_rate": 7.870514826395755e-05, "loss": 1.664, "step": 870 },
    { "epoch": 0.5256084814095864, "grad_norm": 0.05836218595504761, "learning_rate": 7.863403259607698e-05, "loss": 1.6535, "step": 880 },
    { "epoch": 0.531581305061968, "grad_norm": 0.08420410752296448, "learning_rate": 7.856104976036928e-05, "loss": 1.6463, "step": 890 },
    { "epoch": 0.5375541287143497, "grad_norm": 0.06460799276828766, "learning_rate": 7.848620328406663e-05, "loss": 1.6615, "step": 900 },
    { "epoch": 0.5435269523667313, "grad_norm": 0.08191855251789093, "learning_rate": 7.840949678447022e-05, "loss": 1.6529, "step": 910 },
    { "epoch": 0.549499776019113, "grad_norm": 0.04835124313831329, "learning_rate": 7.833093396877546e-05, "loss": 1.6508, "step": 920 },
    { "epoch": 0.5554725996714946, "grad_norm": 0.047752317041158676, "learning_rate": 7.82505186338928e-05, "loss": 1.6484, "step": 930 },
    { "epoch": 0.5614454233238764, "grad_norm": 0.054417744278907776, "learning_rate": 7.816825466626419e-05, "loss": 1.6443, "step": 940 },
    { "epoch": 0.567418246976258, "grad_norm": 0.0538078136742115, "learning_rate": 7.808414604167537e-05, "loss": 1.6422, "step": 950 },
    { "epoch": 0.5733910706286397, "grad_norm": 0.04438367858529091, "learning_rate": 7.799819682506353e-05, "loss": 1.6443, "step": 960 },
    { "epoch": 0.5793638942810213, "grad_norm": 0.056033167988061905, "learning_rate": 7.791041117032102e-05, "loss": 1.6428, "step": 970 },
    { "epoch": 0.585336717933403, "grad_norm": 0.07095460593700409, "learning_rate": 7.782079332009454e-05, "loss": 1.6425, "step": 980 },
    { "epoch": 0.5913095415857846, "grad_norm": 0.05874691903591156, "learning_rate": 7.772934760558005e-05, "loss": 1.6346, "step": 990 },
    { "epoch": 0.5972823652381664, "grad_norm": 0.0521966814994812, "learning_rate": 7.76360784463135e-05, "loss": 1.6359, "step": 1000 },
    { "epoch": 0.5972823652381664, "eval_loss": 1.634853482246399, "eval_runtime": 28.9256, "eval_samples_per_second": 1197.311, "eval_steps_per_second": 9.369, "step": 1000 },
    { "epoch": 0.603255188890548, "grad_norm": 0.052664998918771744, "learning_rate": 7.754099034995727e-05, "loss": 1.6383, "step": 1010 },
    { "epoch": 0.6092280125429297, "grad_norm": 0.08000710606575012, "learning_rate": 7.744408791208214e-05, "loss": 1.639, "step": 1020 },
    { "epoch": 0.6152008361953113, "grad_norm": 0.05873206630349159, "learning_rate": 7.734537581594545e-05, "loss": 1.632, "step": 1030 },
    { "epoch": 0.621173659847693, "grad_norm": 0.06116827204823494, "learning_rate": 7.724485883226454e-05, "loss": 1.6351, "step": 1040 },
    { "epoch": 0.6271464835000746, "grad_norm": 0.057659681886434555, "learning_rate": 7.714254181898627e-05, "loss": 1.637, "step": 1050 },
    { "epoch": 0.6331193071524563, "grad_norm": 0.05905848369002342, "learning_rate": 7.703842972105228e-05, "loss": 1.626, "step": 1060 },
    { "epoch": 0.639092130804838, "grad_norm": 0.0539986751973629, "learning_rate": 7.693252757015991e-05, "loss": 1.6278, "step": 1070 },
    { "epoch": 0.6450649544572197, "grad_norm": 0.062365371733903885, "learning_rate": 7.682484048451908e-05, "loss": 1.6187, "step": 1080 },
    { "epoch": 0.6510377781096013, "grad_norm": 0.0486634224653244, "learning_rate": 7.671537366860494e-05, "loss": 1.6223, "step": 1090 },
    { "epoch": 0.657010601761983, "grad_norm": 0.04700983688235283, "learning_rate": 7.660413241290626e-05, "loss": 1.6237, "step": 1100 },
    { "epoch": 0.6629834254143646, "grad_norm": 0.06423746794462204, "learning_rate": 7.649112209366985e-05, "loss": 1.6349, "step": 1110 },
    { "epoch": 0.6689562490667463, "grad_norm": 0.05183717608451843, "learning_rate": 7.637634817264064e-05, "loss": 1.6203, "step": 1120 },
    { "epoch": 0.6749290727191279, "grad_norm": 0.05448286980390549, "learning_rate": 7.625981619679777e-05, "loss": 1.6159, "step": 1130 },
    { "epoch": 0.6809018963715097, "grad_norm": 0.06012860685586929, "learning_rate": 7.61415317980865e-05, "loss": 1.6106, "step": 1140 },
    { "epoch": 0.6868747200238913, "grad_norm": 0.0491897277534008, "learning_rate": 7.602150069314598e-05, "loss": 1.613, "step": 1150 },
    { "epoch": 0.692847543676273, "grad_norm": 0.05050448700785637, "learning_rate": 7.589972868303301e-05, "loss": 1.6158, "step": 1160 },
    { "epoch": 0.6988203673286546, "grad_norm": 0.05027921870350838, "learning_rate": 7.577622165294165e-05, "loss": 1.6166, "step": 1170 },
    { "epoch": 0.7047931909810363, "grad_norm": 0.061239466071128845, "learning_rate": 7.565098557191882e-05, "loss": 1.607, "step": 1180 },
    { "epoch": 0.7107660146334179, "grad_norm": 0.04995877295732498, "learning_rate": 7.552402649257578e-05, "loss": 1.6152, "step": 1190 },
    { "epoch": 0.7167388382857997, "grad_norm": 0.04830503091216087, "learning_rate": 7.539535055079569e-05, "loss": 1.613, "step": 1200 },
    { "epoch": 0.7227116619381813, "grad_norm": 0.05787483602762222, "learning_rate": 7.526496396543691e-05, "loss": 1.614, "step": 1210 },
    { "epoch": 0.728684485590563, "grad_norm": 0.07437578588724136, "learning_rate": 7.513287303803263e-05, "loss": 1.6127, "step": 1220 },
    { "epoch": 0.7346573092429446, "grad_norm": 0.06587845832109451, "learning_rate": 7.499908415248616e-05, "loss": 1.6015, "step": 1230 },
    { "epoch": 0.7406301328953263, "grad_norm": 0.0692521184682846, "learning_rate": 7.486360377476255e-05, "loss": 1.6026, "step": 1240 },
    { "epoch": 0.7466029565477079, "grad_norm": 0.061289019882678986, "learning_rate": 7.472643845257592e-05, "loss": 1.6108, "step": 1250 },
    { "epoch": 0.7525757802000896, "grad_norm": 0.056076616048812866, "learning_rate": 7.458759481507318e-05, "loss": 1.6018, "step": 1260 },
    { "epoch": 0.7585486038524712, "grad_norm": 0.06620051711797714, "learning_rate": 7.444707957251354e-05, "loss": 1.6048, "step": 1270 },
    { "epoch": 0.764521427504853, "grad_norm": 0.05557152256369591, "learning_rate": 7.430489951594422e-05, "loss": 1.6091, "step": 1280 },
    { "epoch": 0.7704942511572346, "grad_norm": 0.04953812435269356, "learning_rate": 7.416106151687224e-05, "loss": 1.6026, "step": 1290 },
    { "epoch": 0.7764670748096163, "grad_norm": 0.042427971959114075, "learning_rate": 7.40155725269324e-05, "loss": 1.5983, "step": 1300 },
    { "epoch": 0.7824398984619979, "grad_norm": 0.05906856432557106, "learning_rate": 7.386843957755123e-05, "loss": 1.6008, "step": 1310 },
    { "epoch": 0.7884127221143796, "grad_norm": 0.04983474314212799, "learning_rate": 7.371966977960713e-05, "loss": 1.5973, "step": 1320 },
    { "epoch": 0.7943855457667612, "grad_norm": 0.0590224526822567, "learning_rate": 7.356927032308682e-05, "loss": 1.6011, "step": 1330 },
    { "epoch": 0.8003583694191428, "grad_norm": 0.057693641632795334, "learning_rate": 7.341724847673775e-05, "loss": 1.5942, "step": 1340 },
    { "epoch": 0.8063311930715246, "grad_norm": 0.040723856538534164, "learning_rate": 7.326361158771688e-05, "loss": 1.6011, "step": 1350 },
    { "epoch": 0.8123040167239062, "grad_norm": 0.05768086016178131, "learning_rate": 7.31083670812355e-05, "loss": 1.5999, "step": 1360 },
    { "epoch": 0.8182768403762879, "grad_norm": 0.06345749646425247, "learning_rate": 7.29515224602005e-05, "loss": 1.5985, "step": 1370 },
    { "epoch": 0.8242496640286695, "grad_norm": 0.06176001578569412, "learning_rate": 7.27930853048516e-05, "loss": 1.5971, "step": 1380 },
    { "epoch": 0.8302224876810512, "grad_norm": 0.05247745290398598, "learning_rate": 7.263306327239516e-05, "loss": 1.5958, "step": 1390 },
    { "epoch": 0.8361953113334328, "grad_norm": 0.05218351632356644, "learning_rate": 7.247146409663401e-05, "loss": 1.5981, "step": 1400 },
    { "epoch": 0.8421681349858146, "grad_norm": 0.0629679337143898, "learning_rate": 7.23082955875937e-05, "loss": 1.5949, "step": 1410 },
    { "epoch": 0.8481409586381962, "grad_norm": 0.061205677688121796, "learning_rate": 7.214356563114505e-05, "loss": 1.5957, "step": 1420 },
    { "epoch": 0.8541137822905779, "grad_norm": 0.06122026965022087, "learning_rate": 7.197728218862306e-05, "loss": 1.5911, "step": 1430 },
    { "epoch": 0.8600866059429595, "grad_norm": 0.054293327033519745, "learning_rate": 7.180945329644204e-05, "loss": 1.5885, "step": 1440 },
    { "epoch": 0.8660594295953412, "grad_norm": 0.04569542035460472, "learning_rate": 7.164008706570736e-05, "loss": 1.5893, "step": 1450 },
    { "epoch": 0.8720322532477228, "grad_norm": 0.04415179416537285, "learning_rate": 7.146919168182333e-05, "loss": 1.5951, "step": 1460 },
    { "epoch": 0.8780050769001045, "grad_norm": 0.052418701350688934, "learning_rate": 7.129677540409762e-05, "loss": 1.5999, "step": 1470 },
    { "epoch": 0.8839779005524862, "grad_norm": 0.053583066910505295, "learning_rate": 7.112284656534215e-05, "loss": 1.5979, "step": 1480 },
    { "epoch": 0.8899507242048679, "grad_norm": 0.06733547151088715, "learning_rate": 7.09474135714703e-05, "loss": 1.5871, "step": 1490 },
    { "epoch": 0.8959235478572495, "grad_norm": 0.05455510690808296, "learning_rate": 7.07704849010907e-05, "loss": 1.5912, "step": 1500 },
    { "epoch": 0.9018963715096312, "grad_norm": 0.05950945243239403, "learning_rate": 7.059206910509745e-05, "loss": 1.5958, "step": 1510 },
    { "epoch": 0.9078691951620128, "grad_norm": 0.0513860359787941, "learning_rate": 7.041217480625683e-05, "loss": 1.5856, "step": 1520 },
    { "epoch": 0.9138420188143945, "grad_norm": 0.05268612131476402, "learning_rate": 7.023081069879062e-05, "loss": 1.5846, "step": 1530 },
    { "epoch": 0.9198148424667761, "grad_norm": 0.05923028290271759, "learning_rate": 7.004798554795586e-05, "loss": 1.5739, "step": 1540 },
    { "epoch": 0.9257876661191579, "grad_norm": 0.04859180748462677, "learning_rate": 6.986370818962125e-05, "loss": 1.5927, "step": 1550 },
    { "epoch": 0.9317604897715395, "grad_norm": 0.060852836817502975, "learning_rate": 6.967798752984012e-05, "loss": 1.5769, "step": 1560 },
    { "epoch": 0.9377333134239212, "grad_norm": 0.053088609129190445, "learning_rate": 6.949083254442001e-05, "loss": 1.5845, "step": 1570 },
    { "epoch": 0.9437061370763028, "grad_norm": 0.06042907387018204, "learning_rate": 6.930225227848887e-05, "loss": 1.5808, "step": 1580 },
    { "epoch": 0.9496789607286845, "grad_norm": 0.05746331810951233, "learning_rate": 6.911225584605787e-05, "loss": 1.5821, "step": 1590 },
    { "epoch": 0.9556517843810661, "grad_norm": 0.04398033022880554, "learning_rate": 6.892085242958098e-05, "loss": 1.5775, "step": 1600 },
    { "epoch": 0.9616246080334478, "grad_norm": 0.050728365778923035, "learning_rate": 6.872805127951115e-05, "loss": 1.5749, "step": 1610 },
    { "epoch": 0.9675974316858295, "grad_norm": 0.0519120879471302, "learning_rate": 6.85338617138533e-05, "loss": 1.5726, "step": 1620 },
    { "epoch": 0.9735702553382112, "grad_norm": 0.052526745945215225, "learning_rate": 6.833829311771388e-05, "loss": 1.5793, "step": 1630 },
    { "epoch": 0.9795430789905928, "grad_norm": 0.050527602434158325, "learning_rate": 6.814135494284735e-05, "loss": 1.5694, "step": 1640 },
    { "epoch": 0.9855159026429745, "grad_norm": 0.08685663342475891, "learning_rate": 6.794305670719945e-05, "loss": 1.5803, "step": 1650 },
    { "epoch": 0.9914887262953561, "grad_norm": 0.054428499191999435, "learning_rate": 6.774340799444703e-05, "loss": 1.5757, "step": 1660 },
    { "epoch": 0.9974615499477378, "grad_norm": 0.05870772898197174, "learning_rate": 6.754241845353506e-05, "loss": 1.571, "step": 1670 },
    { "epoch": 1.0034343736001194, "grad_norm": 0.05581633001565933, "learning_rate": 6.734009779821018e-05, "loss": 1.5659, "step": 1680 },
    { "epoch": 1.0094071972525012, "grad_norm": 0.05493481829762459, "learning_rate": 6.713645580655125e-05, "loss": 1.5686, "step": 1690 },
    { "epoch": 1.0153800209048829, "grad_norm": 0.05471092462539673, "learning_rate": 6.693150232049686e-05, "loss": 1.5649, "step": 1700 },
    { "epoch": 1.0213528445572644, "grad_norm": 0.053526680916547775, "learning_rate": 6.672524724536956e-05, "loss": 1.5671, "step": 1710 },
    { "epoch": 1.027325668209646, "grad_norm": 0.06532900780439377, "learning_rate": 6.651770054939722e-05, "loss": 1.5614, "step": 1720 },
    { "epoch": 1.0332984918620278, "grad_norm": 0.051929574459791183, "learning_rate": 6.630887226323128e-05, "loss": 1.556, "step": 1730 },
    { "epoch": 1.0392713155144095, "grad_norm": 0.06289497762918472, "learning_rate": 6.609877247946186e-05, "loss": 1.5634, "step": 1740 },
    { "epoch": 1.045244139166791, "grad_norm": 0.05371445044875145, "learning_rate": 6.588741135213012e-05, "loss": 1.5645, "step": 1750 },
    { "epoch": 1.0512169628191728, "grad_norm": 0.04851632937788963, "learning_rate": 6.567479909623746e-05, "loss": 1.5648, "step": 1760 },
    { "epoch": 1.0571897864715545, "grad_norm": 0.06357111036777496, "learning_rate": 6.546094598725186e-05, "loss": 1.5568, "step": 1770 },
    { "epoch": 1.063162610123936, "grad_norm": 0.07035905867815018, "learning_rate": 6.524586236061117e-05, "loss": 1.5519, "step": 1780 },
    { "epoch": 1.0691354337763177, "grad_norm": 0.05517163127660751, "learning_rate": 6.502955861122377e-05, "loss": 1.5566, "step": 1790 },
    { "epoch": 1.0751082574286994, "grad_norm": 0.0504322424530983, "learning_rate": 6.481204519296606e-05, "loss": 1.5668, "step": 1800 },
    { "epoch": 1.0810810810810811, "grad_norm": 0.051910221576690674, "learning_rate": 6.459333261817726e-05, "loss": 1.5585, "step": 1810 },
    { "epoch": 1.0870539047334629, "grad_norm": 0.07319536805152893, "learning_rate": 6.43734314571514e-05, "loss": 1.5599, "step": 1820 },
    { "epoch": 1.0930267283858444, "grad_norm": 0.05212223529815674, "learning_rate": 6.415235233762635e-05, "loss": 1.5597, "step": 1830 },
    { "epoch": 1.098999552038226, "grad_norm": 0.05524059012532234, "learning_rate": 6.393010594427034e-05, "loss": 1.5449, "step": 1840 },
    { "epoch": 1.1049723756906078, "grad_norm": 0.044485364109277725, "learning_rate": 6.370670301816544e-05, "loss": 1.5584, "step": 1850 },
    { "epoch": 1.1109451993429893, "grad_norm": 0.04716966673731804, "learning_rate": 6.348215435628852e-05, "loss": 1.5577, "step": 1860 },
    { "epoch": 1.116918022995371, "grad_norm": 0.04776601493358612, "learning_rate": 6.32564708109894e-05, "loss": 1.5597, "step": 1870 },
    { "epoch": 1.1228908466477527, "grad_norm": 0.05379948392510414, "learning_rate": 6.302966328946638e-05, "loss": 1.5542, "step": 1880 },
    { "epoch": 1.1288636703001345, "grad_norm": 0.05076327919960022, "learning_rate": 6.280174275323915e-05, "loss": 1.5564, "step": 1890 },
    { "epoch": 1.134836493952516, "grad_norm": 0.0562434047460556, "learning_rate": 6.257272021761884e-05, "loss": 1.5597, "step": 1900 },
    { "epoch": 1.1408093176048977, "grad_norm": 0.045845337212085724, "learning_rate": 6.234260675117595e-05, "loss": 1.5535, "step": 1910 },
    { "epoch": 1.1467821412572794, "grad_norm": 0.04580407217144966, "learning_rate": 6.21114134752051e-05, "loss": 1.5486, "step": 1920 },
    { "epoch": 1.1527549649096611, "grad_norm": 0.05752042680978775, "learning_rate": 6.187915156318775e-05, "loss": 1.5454, "step": 1930 },
    { "epoch": 1.1587277885620426, "grad_norm": 0.05608632043004036, "learning_rate": 6.164583224025215e-05, "loss": 1.5545, "step": 1940 },
    { "epoch": 1.1647006122144243, "grad_norm": 0.047604430466890335, "learning_rate": 6.141146678263076e-05, "loss": 1.5531, "step": 1950 },
    { "epoch": 1.170673435866806, "grad_norm": 0.04514037445187569, "learning_rate": 6.117606651711537e-05, "loss": 1.5547, "step": 1960 },
    { "epoch": 1.1766462595191878, "grad_norm": 0.05768571048974991, "learning_rate": 6.0939642820509564e-05, "loss": 1.5496, "step": 1970 },
    { "epoch": 1.1826190831715693, "grad_norm": 0.04222779721021652, "learning_rate": 6.070220711907903e-05, "loss": 1.5469, "step": 1980 },
    { "epoch": 1.188591906823951, "grad_norm": 0.05183190852403641, "learning_rate": 6.046377088799923e-05, "loss": 1.5526, "step": 1990 },
    { "epoch": 1.1945647304763327, "grad_norm": 0.04888539016246796, "learning_rate": 6.0224345650800826e-05, "loss": 1.5579, "step": 2000 },
    { "epoch": 1.1945647304763327, "eval_loss": 1.5546131134033203, "eval_runtime": 20.1679, "eval_samples_per_second": 1717.237, "eval_steps_per_second": 13.437, "step": 2000 },
    { "epoch": 1.2005375541287144, "grad_norm": 0.049841009080410004, "learning_rate": 5.998394297881277e-05, "loss": 1.5531, "step": 2010 },
    { "epoch": 1.206510377781096, "grad_norm": 0.04911394044756889, "learning_rate": 5.974257449060306e-05, "loss": 1.5512, "step": 2020 },
    { "epoch": 1.2124832014334777, "grad_norm": 0.05170886963605881, "learning_rate": 5.9500251851417206e-05, "loss": 1.5439, "step": 2030 },
    { "epoch": 1.2184560250858594, "grad_norm": 0.04615171626210213, "learning_rate": 5.925698677261449e-05, "loss": 1.5453, "step": 2040 },
    { "epoch": 1.224428848738241, "grad_norm": 0.04724368825554848, "learning_rate": 5.901279101110191e-05, "loss": 1.5434, "step": 2050 },
    { "epoch": 1.2304016723906226, "grad_norm": 0.06991260498762131, "learning_rate": 5.8767676368766016e-05, "loss": 1.5489, "step": 2060 },
    { "epoch": 1.2363744960430043, "grad_norm": 0.055575910955667496, "learning_rate": 5.852165469190251e-05, "loss": 1.5514, "step": 2070 },
    { "epoch": 1.242347319695386, "grad_norm": 0.04874608293175697, "learning_rate": 5.82747378706437e-05, "loss": 1.5523, "step": 2080 },
    { "epoch": 1.2483201433477678, "grad_norm": 0.05960864573717117, "learning_rate": 5.8026937838383914e-05, "loss": 1.5469, "step": 2090 },
    { "epoch": 1.2542929670001493, "grad_norm": 0.07086056470870972, "learning_rate": 5.77782665712027e-05, "loss": 1.5497, "step": 2100 },
    { "epoch": 1.260265790652531, "grad_norm": 0.0472436398267746, "learning_rate": 5.752873608728603e-05, "loss": 1.5425, "step": 2110 },
    { "epoch": 1.2662386143049127, "grad_norm": 0.06843575835227966, "learning_rate": 5.7278358446345545e-05, "loss": 1.542, "step": 2120 },
    { "epoch": 1.2722114379572944, "grad_norm": 0.04991114139556885, "learning_rate": 5.702714574903561e-05, "loss": 1.5423, "step": 2130 },
    { "epoch": 1.278184261609676, "grad_norm": 0.04601559415459633, "learning_rate": 5.6775110136368576e-05, "loss": 1.5357, "step": 2140 },
    { "epoch": 1.2841570852620576, "grad_norm": 0.042647868394851685, "learning_rate": 5.6522263789127937e-05, "loss": 1.5386, "step": 2150 },
    { "epoch": 1.2901299089144393, "grad_norm": 0.06261768937110901, "learning_rate": 5.626861892727969e-05, "loss": 1.5428, "step": 2160 },
    { "epoch": 1.2961027325668208, "grad_norm": 0.04735434427857399, "learning_rate": 5.601418780938175e-05, "loss": 1.5395, "step": 2170 },
    { "epoch": 1.3020755562192026, "grad_norm": 0.048824459314346313, "learning_rate": 5.575898273199146e-05, "loss": 1.5418, "step": 2180 },
    { "epoch": 1.3080483798715843, "grad_norm": 0.04974917694926262, "learning_rate": 5.5503016029071354e-05, "loss": 1.5371, "step": 2190 },
    { "epoch": 1.314021203523966, "grad_norm": 0.05275791883468628, "learning_rate": 5.5246300071392985e-05, "loss": 1.5364, "step": 2200 },
    { "epoch": 1.3199940271763477, "grad_norm": 0.0487825907766819, "learning_rate": 5.4988847265939146e-05, "loss": 1.5436, "step": 2210 },
    { "epoch": 1.3259668508287292, "grad_norm": 0.06100558117032051, "learning_rate": 5.473067005530416e-05, "loss": 1.5351, "step": 2220 },
    { "epoch": 1.331939674481111, "grad_norm": 0.07098929584026337, "learning_rate": 5.447178091709262e-05, "loss": 1.5463, "step": 2230 },
    { "epoch": 1.3379124981334927, "grad_norm": 0.06729080528020859, "learning_rate": 5.421219236331624e-05, "loss": 1.5382, "step": 2240 },
    { "epoch": 1.3438853217858742, "grad_norm": 0.05485675856471062, "learning_rate": 5.395191693978927e-05, "loss": 1.5349, "step": 2250 },
    { "epoch": 1.3498581454382559, "grad_norm": 0.05816954746842384, "learning_rate": 5.3690967225522076e-05, "loss": 1.5406, "step": 2260 },
    { "epoch": 1.3558309690906376, "grad_norm": 0.044427741318941116, "learning_rate": 5.342935583211327e-05, "loss": 1.5309, "step": 2270 },
    { "epoch": 1.3618037927430193, "grad_norm": 0.05544894561171532, "learning_rate": 5.31670954031401e-05, "loss": 1.5365, "step": 2280 },
    { "epoch": 1.367776616395401, "grad_norm": 0.04774465411901474, "learning_rate": 5.290419861354753e-05, "loss": 1.5303, "step": 2290 },
    { "epoch": 1.3737494400477825, "grad_norm": 0.050910986959934235, "learning_rate": 5.264067816903552e-05, "loss": 1.5384, "step": 2300 },
    { "epoch": 1.3797222637001643, "grad_norm": 0.05830187723040581, "learning_rate": 5.2376546805445054e-05, "loss": 1.535, "step": 2310 },
    { "epoch": 1.385695087352546, "grad_norm": 0.0521889254450798, "learning_rate": 5.211181728814262e-05, "loss": 1.5348, "step": 2320 },
    { "epoch": 1.3916679110049275, "grad_norm": 0.04742933064699173, "learning_rate": 5.18465024114032e-05, "loss": 1.5421, "step": 2330 },
    { "epoch": 1.3976407346573092, "grad_norm": 0.05169609189033508, "learning_rate": 5.158061499779201e-05, "loss": 1.5322, "step": 2340 },
    { "epoch": 1.403613558309691, "grad_norm": 0.05307742580771446, "learning_rate": 5.131416789754472e-05, "loss": 1.538, "step": 2350 },
    { "epoch": 1.4095863819620726, "grad_norm": 0.04581635445356369, "learning_rate": 5.1047173987946474e-05, "loss": 1.5313, "step": 2360 },
    { "epoch": 1.4155592056144544, "grad_norm": 0.04794102534651756, "learning_rate": 5.077964617270947e-05, "loss": 1.5357, "step": 2370 },
    { "epoch": 1.4215320292668359, "grad_norm": 0.043038323521614075, "learning_rate": 5.051159738134937e-05, "loss": 1.5362, "step": 2380 },
    { "epoch": 1.4275048529192176, "grad_norm": 0.052804794162511826, "learning_rate": 5.024304056856039e-05, "loss": 1.5299, "step": 2390 },
    { "epoch": 1.4334776765715993, "grad_norm": 0.051046222448349, "learning_rate": 4.997398871358928e-05, "loss": 1.529, "step": 2400 },
    { "epoch": 1.4394505002239808, "grad_norm": 0.056139182299375534, "learning_rate": 4.970445481960793e-05, "loss": 1.5368, "step": 2410 },
    { "epoch": 1.4454233238763625, "grad_norm": 0.04890932887792587, "learning_rate": 4.9434451913085e-05, "loss": 1.5308, "step": 2420 },
    { "epoch": 1.4513961475287442, "grad_norm": 0.04679281637072563, "learning_rate": 4.916399304315636e-05, "loss": 1.5353, "step": 2430 },
    { "epoch": 1.457368971181126, "grad_norm": 0.05536729097366333, "learning_rate": 4.8893091280994415e-05, "loss": 1.5314, "step": 2440 },
    { "epoch": 1.4633417948335075, "grad_norm": 0.04933058097958565, "learning_rate": 4.862175971917637e-05, "loss": 1.5301, "step": 2450 },
    { "epoch": 1.4693146184858892, "grad_norm": 0.05884556844830513, "learning_rate": 4.835001147105148e-05, "loss": 1.5213, "step": 2460 },
    { "epoch": 1.475287442138271, "grad_norm": 0.04465237259864807, "learning_rate": 4.807785967010729e-05, "loss": 1.5288, "step": 2470 },
    { "epoch": 1.4812602657906524, "grad_norm": 0.04548431187868118, "learning_rate": 4.780531746933491e-05, "loss": 1.5353, "step": 2480 },
    { "epoch": 1.4872330894430341, "grad_norm": 0.047798071056604385, "learning_rate": 4.7532398040593295e-05, "loss": 1.5261, "step": 2490 },
    { "epoch": 1.4932059130954158, "grad_norm": 0.05616561323404312, "learning_rate": 4.7259114573972715e-05, "loss": 1.5343, "step": 2500 },
    { "epoch": 1.4991787367477976, "grad_norm": 0.053861986845731735, "learning_rate": 4.6985480277157215e-05, "loss": 1.5249, "step": 2510 },
    { "epoch": 1.5051515604001793, "grad_norm": 0.05890486761927605, "learning_rate": 4.671150837478634e-05, "loss": 1.5357, "step": 2520 },
    { "epoch": 1.511124384052561, "grad_norm": 0.056382015347480774, "learning_rate": 4.643721210781601e-05, "loss": 1.5159, "step": 2530 },
    { "epoch": 1.5170972077049425, "grad_norm": 0.051396943628787994, "learning_rate": 4.6162604732878515e-05, "loss": 1.5301, "step": 2540 },
    { "epoch": 1.5230700313573242, "grad_norm": 0.04754629358649254, "learning_rate": 4.588769952164191e-05, "loss": 1.5277, "step": 2550 },
    { "epoch": 1.5290428550097057, "grad_norm": 0.0532587394118309, "learning_rate": 4.561250976016851e-05, "loss": 1.5201, "step": 2560 },
    { "epoch": 1.5350156786620874, "grad_norm": 0.059257134795188904, "learning_rate": 4.5337048748272905e-05, "loss": 1.5265, "step": 2570 },
    { "epoch": 1.5409885023144692, "grad_norm": 0.05495699495077133, "learning_rate": 4.5061329798879064e-05, "loss": 1.5247, "step": 2580 },
    { "epoch": 1.5469613259668509, "grad_norm": 0.04833153635263443, "learning_rate": 4.478536623737699e-05, "loss": 1.5291, "step": 2590 },
    { "epoch": 1.5529341496192326, "grad_norm": 0.048605091869831085, "learning_rate": 4.450917140097869e-05, "loss": 1.5277, "step": 2600 },
    { "epoch": 1.5589069732716143, "grad_norm": 0.06368768960237503, "learning_rate": 4.4232758638073585e-05, "loss": 1.5306, "step": 2610 },
    { "epoch": 1.5648797969239958, "grad_norm": 0.04569351673126221, "learning_rate": 4.395614130758344e-05, "loss": 1.5208, "step": 2620 },
    { "epoch": 1.5708526205763775, "grad_norm": 0.07877717167139053, "learning_rate": 4.367933277831666e-05, "loss": 1.5152, "step": 2630 },
    { "epoch": 1.576825444228759, "grad_norm": 0.05059320852160454, "learning_rate": 4.34023464283222e-05, "loss": 1.5199, "step": 2640 },
    { "epoch": 1.5827982678811408, "grad_norm": 0.05248813331127167, "learning_rate": 4.312519564424306e-05, "loss": 1.5236, "step": 2650 },
    { "epoch": 1.5887710915335225, "grad_norm": 0.051895346492528915, "learning_rate": 4.2847893820669244e-05, "loss": 1.5225, "step": 2660 },
    { "epoch": 1.5947439151859042, "grad_norm": 0.048129428178071976, "learning_rate": 4.2570454359490455e-05, "loss": 1.5259, "step": 2670 },
    { "epoch": 1.600716738838286, "grad_norm": 0.049009375274181366, "learning_rate": 4.2292890669248364e-05, "loss": 1.533, "step": 2680 },
    { "epoch": 1.6066895624906674, "grad_norm": 0.05925741046667099, "learning_rate": 4.2015216164488575e-05, "loss": 1.5242, "step": 2690 },
    { "epoch": 1.6126623861430491, "grad_norm": 0.051209457218647, "learning_rate": 4.173744426511231e-05, "loss": 1.5348, "step": 2700 },
    { "epoch": 1.6186352097954306, "grad_norm": 0.04731997102499008, "learning_rate": 4.1459588395727876e-05, "loss": 1.5179, "step": 2710 },
    { "epoch": 1.6246080334478123, "grad_norm": 0.04640951007604599, "learning_rate": 4.118166198500178e-05, "loss": 1.5218, "step": 2720 },
    { "epoch": 1.630580857100194, "grad_norm": 0.05060356855392456, "learning_rate": 4.090367846500976e-05, "loss": 1.5184, "step": 2730 },
    { "epoch": 1.6365536807525758, "grad_norm": 0.04525948315858841, "learning_rate": 4.062565127058764e-05, "loss": 1.5207, "step": 2740 },
    { "epoch": 1.6425265044049575, "grad_norm": 0.0447864904999733, "learning_rate": 4.0347593838682016e-05, "loss": 1.5265, "step": 2750 },
    { "epoch": 1.6484993280573392, "grad_norm": 0.06339412927627563, "learning_rate": 4.006951960770084e-05, "loss": 1.5296, "step": 2760 },
    { "epoch": 1.6544721517097207, "grad_norm": 0.05479173734784126, "learning_rate": 3.979144201686396e-05, "loss": 1.5167, "step": 2770 },
    { "epoch": 1.6604449753621024, "grad_norm": 0.05605393648147583, "learning_rate": 3.951337450555361e-05, "loss": 1.5208, "step": 2780 },
    { "epoch": 1.666417799014484, "grad_norm": 0.04500933736562729, "learning_rate": 3.923533051266486e-05, "loss": 1.5199, "step": 2790 },
    { "epoch": 1.6723906226668657, "grad_norm": 0.044439464807510376, "learning_rate": 3.8957323475956165e-05, "loss": 1.5254, "step": 2800 },
    { "epoch": 1.6783634463192474, "grad_norm": 0.051942795515060425, "learning_rate": 3.867936683139991e-05, "loss": 1.5168, "step": 2810 },
    { "epoch": 1.684336269971629, "grad_norm": 0.05696643143892288, "learning_rate": 3.840147401253305e-05, "loss": 1.5261, "step": 2820 },
    { "epoch": 1.6903090936240108, "grad_norm": 0.0423273928463459, "learning_rate": 3.812365844980782e-05, "loss": 1.5166, "step": 2830 },
    { "epoch": 1.6962819172763925, "grad_norm": 0.04251600056886673, "learning_rate": 3.784593356994275e-05, "loss": 1.514, "step": 2840 },
    { "epoch": 1.702254740928774, "grad_norm": 0.06778108328580856, "learning_rate": 3.7568312795273675e-05, "loss": 1.5161, "step": 2850 },
    { "epoch": 1.7082275645811558, "grad_norm": 0.046843383461236954, "learning_rate": 3.729080954310509e-05, "loss": 1.5215, "step": 2860 },
    { "epoch": 1.7142003882335373, "grad_norm": 0.04683705046772957, "learning_rate": 3.701343722506164e-05, "loss": 1.5191, "step": 2870 },
    { "epoch": 1.720173211885919, "grad_norm": 0.04883548244833946, "learning_rate": 3.673620924644e-05, "loss": 1.5175, "step": 2880 },
    { "epoch": 1.7261460355383007, "grad_norm": 0.047556836158037186, "learning_rate": 3.6459139005560966e-05, "loss": 1.5191, "step": 2890 },
    { "epoch": 1.7321188591906824, "grad_norm": 0.04096701368689537, "learning_rate": 3.618223989312195e-05, "loss": 1.5195, "step": 2900 },
    { "epoch": 1.7380916828430641, "grad_norm": 0.043791547417640686, "learning_rate": 3.590552529154974e-05, "loss": 1.5149, "step": 2910 },
    { "epoch": 1.7440645064954459, "grad_norm": 0.06429862976074219, "learning_rate": 3.562900857435384e-05, "loss": 1.5136, "step": 2920 },
    { "epoch": 1.7500373301478274, "grad_norm": 0.04811246693134308, "learning_rate": 3.535270310548007e-05, "loss": 1.5178, "step": 2930 },
    { "epoch": 1.756010153800209, "grad_norm": 0.05720449239015579, "learning_rate": 3.5076622238664675e-05, "loss": 1.5112, "step": 2940 },
    { "epoch": 1.7619829774525906, "grad_norm": 0.04717197269201279, "learning_rate": 3.480077931678899e-05, "loss": 1.5147, "step": 2950 },
    { "epoch": 1.7679558011049723, "grad_norm": 0.04889809712767601, "learning_rate": 3.452518767123456e-05, "loss": 1.5186, "step": 2960 },
    { "epoch": 1.773928624757354, "grad_norm": 0.055686600506305695, "learning_rate": 3.424986062123883e-05, "loss": 1.5105, "step": 2970 },
    { "epoch": 1.7799014484097357, "grad_norm": 0.045671623200178146, "learning_rate": 3.397481147325146e-05, "loss": 1.5236, "step": 2980 },
    { "epoch": 1.7858742720621175, "grad_norm": 0.0518915057182312, "learning_rate": 3.370005352029122e-05, "loss": 1.5082, "step": 2990 },
    { "epoch": 1.7918470957144992, "grad_norm": 0.0466337613761425, "learning_rate": 3.342560004130351e-05, "loss": 1.5246, "step": 3000 },
    { "epoch": 1.7918470957144992, "eval_loss": 1.5170252323150635, "eval_runtime": 20.1093, "eval_samples_per_second": 1722.235, "eval_steps_per_second": 13.476, "step": 3000 },
    { "epoch": 1.7978199193668807, "grad_norm": 0.04238193854689598, "learning_rate": 3.3151464300518634e-05, "loss": 1.5097, "step": 3010 },
    { "epoch": 1.8037927430192624, "grad_norm": 0.050784409046173096, "learning_rate": 3.2877659546810745e-05, "loss": 1.5195, "step": 3020 },
    { "epoch": 1.809765566671644, "grad_norm": 0.04055749997496605, "learning_rate": 3.260419901305751e-05, "loss": 1.5171, "step": 3030 },
    { "epoch": 1.8157383903240256, "grad_norm": 0.05311364307999611, "learning_rate": 3.2331095915500564e-05, "loss": 1.5136, "step": 3040 },
    { "epoch": 1.8217112139764073, "grad_norm": 0.0499190054833889, "learning_rate": 3.205836345310681e-05, "loss": 1.5081, "step": 3050 },
    { "epoch": 1.827684037628789, "grad_norm": 0.056762441992759705, "learning_rate": 3.178601480693048e-05, "loss": 1.5243, "step": 3060 },
    { "epoch": 1.8336568612811708, "grad_norm": 0.04753740131855011, "learning_rate": 3.151406313947615e-05, "loss": 1.5069, "step": 3070 },
    { "epoch": 1.8396296849335525, "grad_norm": 0.054608915001153946, "learning_rate": 3.124252159406251e-05, "loss": 1.5172, "step": 3080 },
    { "epoch": 1.845602508585934, "grad_norm": 0.04840042069554329, "learning_rate": 3.097140329418726e-05, "loss": 1.5126, "step": 3090 },
    { "epoch": 1.8515753322383157, "grad_norm": 0.05584624037146568, "learning_rate": 3.07007213428928e-05, "loss": 1.5091, "step": 3100 },
    { "epoch": 1.8575481558906972, "grad_norm": 0.0425049252808094, "learning_rate": 3.0430488822132957e-05, "loss": 1.5155, "step": 3110 },
    { "epoch": 1.863520979543079, "grad_norm": 0.043588876724243164, "learning_rate": 3.016071879214077e-05, "loss": 1.5099, "step": 3120 },
    { "epoch": 1.8694938031954607, "grad_norm": 0.041503310203552246, "learning_rate": 2.989142429079725e-05, "loss": 1.509, "step": 3130 },
    { "epoch": 1.8754666268478424, "grad_norm": 0.04797055944800377, "learning_rate": 2.962261833300133e-05, "loss": 1.507, "step": 3140 },
    { "epoch": 1.881439450500224, "grad_norm": 0.05003626272082329, "learning_rate": 2.935431391004081e-05, "loss": 1.5177, "step": 3150 },
    { "epoch": 1.8874122741526056, "grad_norm": 0.04475341737270355, "learning_rate": 2.9086523988964478e-05, "loss": 1.5077, "step": 3160 },
    { "epoch": 1.8933850978049873, "grad_norm": 0.04602671042084694, "learning_rate": 2.881926151195547e-05, "loss": 1.5037, "step": 3170 },
    { "epoch": 1.8993579214573688, "grad_norm": 0.04945210739970207, "learning_rate": 2.855253939570578e-05, "loss": 1.503, "step": 3180 },
    { "epoch": 1.9053307451097505, "grad_norm": 0.04730582609772682, "learning_rate": 2.8286370530791914e-05, "loss": 1.5064, "step": 3190 },
    { "epoch": 1.9113035687621323, "grad_norm": 0.05128956586122513, "learning_rate": 2.8020767781052016e-05, "loss": 1.5126, "step": 3200 },
    { "epoch": 1.917276392414514, "grad_norm": 0.055559854954481125, "learning_rate": 2.7755743982964066e-05, "loss": 1.5052, "step": 3210 },
    { "epoch": 1.9232492160668957, "grad_norm": 0.036298781633377075, "learning_rate": 2.749131194502555e-05, "loss": 1.5092, "step": 3220 },
    { "epoch": 1.9292220397192774, "grad_norm": 0.042619943618774414, "learning_rate": 2.7227484447134398e-05, "loss": 1.5044, "step": 3230 },
    { "epoch": 1.935194863371659, "grad_norm": 0.052806805819272995, "learning_rate": 2.696427423997138e-05, "loss": 1.5056, "step": 3240 },
    { "epoch": 1.9411676870240406, "grad_norm": 0.044467948377132416, "learning_rate": 2.670169404438383e-05, "loss": 1.5114, "step": 3250 },
    { "epoch": 1.9471405106764221, "grad_norm": 0.038638997822999954, "learning_rate": 2.6439756550770872e-05, "loss": 1.5154, "step": 3260 },
    { "epoch": 1.9531133343288039, "grad_norm": 0.04845379292964935, "learning_rate": 2.617847441847007e-05, "loss": 1.51, "step": 3270 },
    { "epoch": 1.9590861579811856, "grad_norm": 0.0445607528090477, "learning_rate": 2.5917860275145658e-05, "loss": 1.5047, "step": 3280 },
    { "epoch": 1.9650589816335673, "grad_norm": 0.045905206352472305, "learning_rate": 2.5657926716178217e-05, "loss": 1.5118, "step": 3290 },
    { "epoch": 1.971031805285949, "grad_norm": 0.04530317336320877, "learning_rate": 2.539868630405594e-05, "loss": 1.5099, "step": 3300 },
    { "epoch": 1.9770046289383307, "grad_norm": 0.04195258021354675, "learning_rate": 2.5140151567767505e-05, "loss": 1.5075, "step": 3310 },
    { "epoch": 1.9829774525907122, "grad_norm": 0.043815840035676956, "learning_rate": 2.4882335002196553e-05, "loss": 1.5096, "step": 3320 },
    { "epoch": 1.988950276243094, "grad_norm": 0.04683714732527733, "learning_rate": 2.4625249067517803e-05, "loss": 1.5057, "step": 3330 },
    { "epoch": 1.9949230998954754, "grad_norm": 0.049690209329128265, "learning_rate": 2.4368906188594877e-05, "loss": 1.5106, "step": 3340 },
    { "epoch": 2.000895923547857, "grad_norm": 0.048324376344680786, "learning_rate": 2.4113318754379816e-05, "loss": 1.5042, "step": 3350 },
    { "epoch": 2.006868747200239, "grad_norm": 0.05503029376268387, "learning_rate": 2.385849911731426e-05, "loss": 1.4922, "step": 3360 },
    { "epoch": 2.0128415708526206, "grad_norm": 0.049435921013355255, "learning_rate": 2.360445959273255e-05, "loss": 1.4962, "step": 3370 },
    { "epoch": 2.0188143945050023, "grad_norm": 0.05086649954319, "learning_rate": 2.3351212458266512e-05, "loss": 1.4918, "step": 3380 },
    { "epoch": 2.024787218157384, "grad_norm": 0.045887332409620285, "learning_rate": 2.3098769953252002e-05, "loss": 1.4868, "step": 3390 },
    { "epoch": 2.0307600418097658, "grad_norm": 0.04303443059325218, "learning_rate": 2.2847144278137502e-05, "loss": 1.4982, "step": 3400 },
    { "epoch": 2.036732865462147, "grad_norm": 0.043649692088365555, "learning_rate": 2.2596347593894387e-05, "loss": 1.5, "step": 3410 },
    { "epoch": 2.0427056891145288, "grad_norm": 0.04276139661669731, "learning_rate": 2.2346392021429254e-05, "loss": 1.4903, "step": 3420 },
    { "epoch": 2.0486785127669105, "grad_norm": 0.04298582300543785, "learning_rate": 2.2097289640998074e-05, "loss": 1.5032, "step": 3430 },
    { "epoch": 2.054651336419292, "grad_norm": 0.053750213235616684, "learning_rate": 2.1849052491622374e-05, "loss": 1.4942, "step": 3440 },
    { "epoch": 2.060624160071674, "grad_norm": 0.042636483907699585, "learning_rate": 2.160169257050742e-05, "loss": 1.4976, "step": 3450 },
    { "epoch": 2.0665969837240556, "grad_norm": 0.05124128982424736, "learning_rate": 2.135522183246237e-05, "loss": 1.4981, "step": 3460 },
    { "epoch": 2.0725698073764374, "grad_norm": 0.047978244721889496, "learning_rate": 2.110965218932247e-05, "loss": 1.4975, "step": 3470 },
    { "epoch": 2.078542631028819, "grad_norm": 0.045476969331502914, "learning_rate": 2.0864995509373448e-05, "loss": 1.4958, "step": 3480 },
    { "epoch": 2.0845154546812004, "grad_norm": 0.05264231190085411, "learning_rate": 2.062126361677786e-05, "loss": 1.4996, "step": 3490 },
    { "epoch": 2.090488278333582, "grad_norm": 0.05144358426332474, "learning_rate": 2.037846829100364e-05, "loss": 1.5077, "step": 3500 },
    { "epoch": 2.096461101985964, "grad_norm": 0.048265036195516586, "learning_rate": 2.013662126625482e-05, "loss": 1.4987, "step": 3510 },
    { "epoch": 2.1024339256383455, "grad_norm": 0.04586884751915932, "learning_rate": 1.9895734230904396e-05, "loss": 1.5044, "step": 3520 },
    { "epoch": 2.1084067492907272, "grad_norm": 0.03930211812257767, "learning_rate": 1.965581882692949e-05, "loss": 1.4951, "step": 3530 },
    { "epoch": 2.114379572943109, "grad_norm": 0.051928870379924774, "learning_rate": 1.9416886649348575e-05, "loss": 1.4962, "step": 3540 },
    { "epoch": 2.1203523965954907, "grad_norm": 0.04466070607304573, "learning_rate": 1.917894924566125e-05, "loss": 1.4874, "step": 3550 },
    { "epoch": 2.126325220247872, "grad_norm": 0.044879212975502014, "learning_rate": 1.8942018115290063e-05, "loss": 1.4896, "step": 3560 },
    { "epoch": 2.1322980439002537, "grad_norm": 0.04508794844150543, "learning_rate": 1.8706104709024715e-05, "loss": 1.4915, "step": 3570 },
    { "epoch": 2.1382708675526354, "grad_norm": 0.06577686965465546, "learning_rate": 1.8471220428468745e-05, "loss": 1.4981, "step": 3580 },
    { "epoch": 2.144243691205017, "grad_norm": 0.03995177894830704, "learning_rate": 1.823737662548843e-05, "loss": 1.4973, "step": 3590 },
    { "epoch": 2.150216514857399, "grad_norm": 0.06114717572927475, "learning_rate": 1.800458460166417e-05, "loss": 1.4942, "step": 3600 },
    { "epoch": 2.1561893385097806, "grad_norm": 0.04745366424322128, "learning_rate": 1.7772855607744284e-05, "loss": 1.5004, "step": 3610 },
    { "epoch": 2.1621621621621623, "grad_norm": 0.045220714062452316,
|
"learning_rate": 1.7542200843101267e-05, |
|
"loss": 1.494, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 2.168134985814544, |
|
"grad_norm": 0.04914199188351631, |
|
"learning_rate": 1.7312631455190528e-05, |
|
"loss": 1.491, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 2.1741078094669257, |
|
"grad_norm": 0.044854309409856796, |
|
"learning_rate": 1.708415853901166e-05, |
|
"loss": 1.4974, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 2.180080633119307, |
|
"grad_norm": 0.0511915348470211, |
|
"learning_rate": 1.6856793136572155e-05, |
|
"loss": 1.4978, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 2.1860534567716887, |
|
"grad_norm": 0.052235160022974014, |
|
"learning_rate": 1.6630546236353833e-05, |
|
"loss": 1.4884, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 2.1920262804240704, |
|
"grad_norm": 0.03959416225552559, |
|
"learning_rate": 1.6405428772781724e-05, |
|
"loss": 1.4897, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 2.197999104076452, |
|
"grad_norm": 0.04642707481980324, |
|
"learning_rate": 1.618145162569563e-05, |
|
"loss": 1.489, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 2.203971927728834, |
|
"grad_norm": 0.05590491741895676, |
|
"learning_rate": 1.5958625619824286e-05, |
|
"loss": 1.4946, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 2.2099447513812156, |
|
"grad_norm": 0.050484009087085724, |
|
"learning_rate": 1.5736961524262232e-05, |
|
"loss": 1.5011, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.2159175750335973, |
|
"grad_norm": 0.04109204187989235, |
|
"learning_rate": 1.551647005194932e-05, |
|
"loss": 1.4993, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 2.2218903986859786, |
|
"grad_norm": 0.04570942744612694, |
|
"learning_rate": 1.5297161859152986e-05, |
|
"loss": 1.491, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 2.2278632223383603, |
|
"grad_norm": 0.041420578956604004, |
|
"learning_rate": 1.5079047544953227e-05, |
|
"loss": 1.4874, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 2.233836045990742, |
|
"grad_norm": 0.04918381944298744, |
|
"learning_rate": 1.486213765073032e-05, |
|
"loss": 1.4939, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 2.2398088696431238, |
|
"grad_norm": 0.05086056888103485, |
|
"learning_rate": 1.4646442659655425e-05, |
|
"loss": 1.4992, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 2.2457816932955055, |
|
"grad_norm": 0.061345502734184265, |
|
"learning_rate": 1.4431972996183894e-05, |
|
"loss": 1.4935, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 2.251754516947887, |
|
"grad_norm": 0.03802775219082832, |
|
"learning_rate": 1.4218739025551469e-05, |
|
"loss": 1.487, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 2.257727340600269, |
|
"grad_norm": 0.039830368012189865, |
|
"learning_rate": 1.4006751053273338e-05, |
|
"loss": 1.4943, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 2.2637001642526506, |
|
"grad_norm": 0.04441362991929054, |
|
"learning_rate": 1.3796019324646062e-05, |
|
"loss": 1.4907, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 2.269672987905032, |
|
"grad_norm": 0.04267200455069542, |
|
"learning_rate": 1.358655402425245e-05, |
|
"loss": 1.4905, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.2756458115574136, |
|
"grad_norm": 0.04467471316456795, |
|
"learning_rate": 1.3378365275469322e-05, |
|
"loss": 1.4865, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 2.2816186352097954, |
|
"grad_norm": 0.04877958446741104, |
|
"learning_rate": 1.3171463139978222e-05, |
|
"loss": 1.4978, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 2.287591458862177, |
|
"grad_norm": 0.04458734765648842, |
|
"learning_rate": 1.2965857617279216e-05, |
|
"loss": 1.4931, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 2.293564282514559, |
|
"grad_norm": 0.043027278035879135, |
|
"learning_rate": 1.2761558644207547e-05, |
|
"loss": 1.495, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 2.2995371061669405, |
|
"grad_norm": 0.03808119520545006, |
|
"learning_rate": 1.2558576094453435e-05, |
|
"loss": 1.4922, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 2.3055099298193222, |
|
"grad_norm": 0.038997333496809006, |
|
"learning_rate": 1.2356919778084867e-05, |
|
"loss": 1.4915, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 2.3114827534717035, |
|
"grad_norm": 0.04020654410123825, |
|
"learning_rate": 1.2156599441073488e-05, |
|
"loss": 1.4874, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 2.3174555771240852, |
|
"grad_norm": 0.04891055077314377, |
|
"learning_rate": 1.1957624764823566e-05, |
|
"loss": 1.5016, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 2.323428400776467, |
|
"grad_norm": 0.046524520963430405, |
|
"learning_rate": 1.176000536570412e-05, |
|
"loss": 1.4928, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 2.3294012244288487, |
|
"grad_norm": 0.04302162304520607, |
|
"learning_rate": 1.1563750794584156e-05, |
|
"loss": 1.4905, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.3353740480812304, |
|
"grad_norm": 0.046545591205358505, |
|
"learning_rate": 1.1368870536371036e-05, |
|
"loss": 1.4911, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 2.341346871733612, |
|
"grad_norm": 0.04680660367012024, |
|
"learning_rate": 1.1175374009552159e-05, |
|
"loss": 1.4832, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 2.347319695385994, |
|
"grad_norm": 0.04679818078875542, |
|
"learning_rate": 1.0983270565739668e-05, |
|
"loss": 1.4892, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 2.3532925190383756, |
|
"grad_norm": 0.04409361630678177, |
|
"learning_rate": 1.0792569489218598e-05, |
|
"loss": 1.4907, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 2.3592653426907573, |
|
"grad_norm": 0.04122375324368477, |
|
"learning_rate": 1.0603279996498089e-05, |
|
"loss": 1.4936, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 2.3652381663431385, |
|
"grad_norm": 0.045084912329912186, |
|
"learning_rate": 1.0415411235865979e-05, |
|
"loss": 1.4852, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 2.3712109899955203, |
|
"grad_norm": 0.04110685735940933, |
|
"learning_rate": 1.0228972286946695e-05, |
|
"loss": 1.494, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 2.377183813647902, |
|
"grad_norm": 0.04527169466018677, |
|
"learning_rate": 1.0043972160262392e-05, |
|
"loss": 1.4955, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 2.3831566373002837, |
|
"grad_norm": 0.04808187112212181, |
|
"learning_rate": 9.860419796797527e-06, |
|
"loss": 1.4858, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 2.3891294609526654, |
|
"grad_norm": 0.03969137370586395, |
|
"learning_rate": 9.678324067566716e-06, |
|
"loss": 1.497, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.3891294609526654, |
|
"eval_loss": 1.4980565309524536, |
|
"eval_runtime": 20.0226, |
|
"eval_samples_per_second": 1729.697, |
|
"eval_steps_per_second": 13.535, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.395102284605047, |
|
"grad_norm": 0.039191678166389465, |
|
"learning_rate": 9.497693773185985e-06, |
|
"loss": 1.491, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 2.401075108257429, |
|
"grad_norm": 0.04326602816581726, |
|
"learning_rate": 9.318537643447488e-06, |
|
"loss": 1.4897, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 2.40704793190981, |
|
"grad_norm": 0.04062432423233986, |
|
"learning_rate": 9.140864336897559e-06, |
|
"loss": 1.4834, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 2.413020755562192, |
|
"grad_norm": 0.043511949479579926, |
|
"learning_rate": 8.964682440418272e-06, |
|
"loss": 1.4899, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 2.4189935792145736, |
|
"grad_norm": 0.041364822536706924, |
|
"learning_rate": 8.79000046881242e-06, |
|
"loss": 1.4876, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 2.4249664028669553, |
|
"grad_norm": 0.03720170632004738, |
|
"learning_rate": 8.61682686439202e-06, |
|
"loss": 1.4926, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 2.430939226519337, |
|
"grad_norm": 0.04620780423283577, |
|
"learning_rate": 8.44516999657027e-06, |
|
"loss": 1.4929, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 2.4369120501717187, |
|
"grad_norm": 0.03785783797502518, |
|
"learning_rate": 8.275038161457094e-06, |
|
"loss": 1.4917, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 2.4428848738241005, |
|
"grad_norm": 0.047655072063207626, |
|
"learning_rate": 8.106439581458177e-06, |
|
"loss": 1.4923, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 2.448857697476482, |
|
"grad_norm": 0.04838723689317703, |
|
"learning_rate": 7.939382404877545e-06, |
|
"loss": 1.4902, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 2.454830521128864, |
|
"grad_norm": 0.0498916357755661, |
|
"learning_rate": 7.773874705523826e-06, |
|
"loss": 1.4846, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 2.460803344781245, |
|
"grad_norm": 0.044865112751722336, |
|
"learning_rate": 7.609924482320013e-06, |
|
"loss": 1.4867, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 2.466776168433627, |
|
"grad_norm": 0.041775912046432495, |
|
"learning_rate": 7.447539658916869e-06, |
|
"loss": 1.4869, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 2.4727489920860086, |
|
"grad_norm": 0.03888450190424919, |
|
"learning_rate": 7.286728083309995e-06, |
|
"loss": 1.4824, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 2.4787218157383903, |
|
"grad_norm": 0.05169163644313812, |
|
"learning_rate": 7.127497527460541e-06, |
|
"loss": 1.4856, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 2.484694639390772, |
|
"grad_norm": 0.04095705598592758, |
|
"learning_rate": 6.969855686919573e-06, |
|
"loss": 1.4899, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 2.490667463043154, |
|
"grad_norm": 0.0429367758333683, |
|
"learning_rate": 6.81381018045618e-06, |
|
"loss": 1.4848, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 2.4966402866955355, |
|
"grad_norm": 0.04392432048916817, |
|
"learning_rate": 6.659368549689209e-06, |
|
"loss": 1.4832, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 2.502613110347917, |
|
"grad_norm": 0.04673699662089348, |
|
"learning_rate": 6.506538258722859e-06, |
|
"loss": 1.4855, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 2.5085859340002985, |
|
"grad_norm": 0.04074994474649429, |
|
"learning_rate": 6.355326693785868e-06, |
|
"loss": 1.4789, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.51455875765268, |
|
"grad_norm": 0.035382091999053955, |
|
"learning_rate": 6.2057411628745875e-06, |
|
"loss": 1.4862, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 2.520531581305062, |
|
"grad_norm": 0.03829929605126381, |
|
"learning_rate": 6.057788895399781e-06, |
|
"loss": 1.4852, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 2.5265044049574437, |
|
"grad_norm": 0.04219154641032219, |
|
"learning_rate": 5.9114770418372015e-06, |
|
"loss": 1.4865, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 2.5324772286098254, |
|
"grad_norm": 0.04591584950685501, |
|
"learning_rate": 5.7668126733820476e-06, |
|
"loss": 1.4737, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 2.538450052262207, |
|
"grad_norm": 0.045854389667510986, |
|
"learning_rate": 5.623802781607204e-06, |
|
"loss": 1.4872, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 2.544422875914589, |
|
"grad_norm": 0.04153481870889664, |
|
"learning_rate": 5.48245427812534e-06, |
|
"loss": 1.4806, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 2.5503956995669705, |
|
"grad_norm": 0.03822470083832741, |
|
"learning_rate": 5.342773994254842e-06, |
|
"loss": 1.4792, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 2.556368523219352, |
|
"grad_norm": 0.03870686888694763, |
|
"learning_rate": 5.204768680689727e-06, |
|
"loss": 1.4771, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 2.5623413468717335, |
|
"grad_norm": 0.05567542836070061, |
|
"learning_rate": 5.068445007173331e-06, |
|
"loss": 1.4812, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 2.5683141705241153, |
|
"grad_norm": 0.03914303705096245, |
|
"learning_rate": 4.933809562175982e-06, |
|
"loss": 1.4952, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.574286994176497, |
|
"grad_norm": 0.04728810861706734, |
|
"learning_rate": 4.800868852576561e-06, |
|
"loss": 1.4813, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 2.5802598178288787, |
|
"grad_norm": 0.04394581541419029, |
|
"learning_rate": 4.669629303348066e-06, |
|
"loss": 1.4779, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 2.5862326414812604, |
|
"grad_norm": 0.042139682918787, |
|
"learning_rate": 4.540097257247062e-06, |
|
"loss": 1.4847, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 2.5922054651336417, |
|
"grad_norm": 0.04580564424395561, |
|
"learning_rate": 4.412278974507151e-06, |
|
"loss": 1.4767, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 2.5981782887860234, |
|
"grad_norm": 0.03395635262131691, |
|
"learning_rate": 4.286180632536421e-06, |
|
"loss": 1.4871, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 2.604151112438405, |
|
"grad_norm": 0.04606311395764351, |
|
"learning_rate": 4.161808325618886e-06, |
|
"loss": 1.4865, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 2.610123936090787, |
|
"grad_norm": 0.046741172671318054, |
|
"learning_rate": 4.039168064619938e-06, |
|
"loss": 1.4896, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 2.6160967597431686, |
|
"grad_norm": 0.04130960628390312, |
|
"learning_rate": 3.918265776695891e-06, |
|
"loss": 1.4837, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 2.6220695833955503, |
|
"grad_norm": 0.043055951595306396, |
|
"learning_rate": 3.7991073050074678e-06, |
|
"loss": 1.4841, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 2.628042407047932, |
|
"grad_norm": 0.04418269917368889, |
|
"learning_rate": 3.6816984084374485e-06, |
|
"loss": 1.4831, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.6340152307003137, |
|
"grad_norm": 0.036886971443891525, |
|
"learning_rate": 3.5660447613123086e-06, |
|
"loss": 1.4892, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 2.6399880543526955, |
|
"grad_norm": 0.04421091824769974, |
|
"learning_rate": 3.452151953128007e-06, |
|
"loss": 1.4848, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 2.645960878005077, |
|
"grad_norm": 0.042877208441495895, |
|
"learning_rate": 3.3400254882798435e-06, |
|
"loss": 1.4888, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 2.6519337016574585, |
|
"grad_norm": 0.04234934598207474, |
|
"learning_rate": 3.2296707857964125e-06, |
|
"loss": 1.4796, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 2.65790652530984, |
|
"grad_norm": 0.035217370837926865, |
|
"learning_rate": 3.121093179077739e-06, |
|
"loss": 1.481, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 2.663879348962222, |
|
"grad_norm": 0.040508221834897995, |
|
"learning_rate": 3.0142979156374806e-06, |
|
"loss": 1.4819, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 2.6698521726146036, |
|
"grad_norm": 0.041981033980846405, |
|
"learning_rate": 2.9092901568493446e-06, |
|
"loss": 1.4804, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 2.6758249962669853, |
|
"grad_norm": 0.03790983185172081, |
|
"learning_rate": 2.80607497769763e-06, |
|
"loss": 1.4894, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 2.6817978199193666, |
|
"grad_norm": 0.038940299302339554, |
|
"learning_rate": 2.70465736653196e-06, |
|
"loss": 1.4827, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 2.6877706435717483, |
|
"grad_norm": 0.04031272605061531, |
|
"learning_rate": 2.605042224826182e-06, |
|
"loss": 1.4845, |
|
"step": 4500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5022, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.9327446823064306e+19, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |