TCS_MLM_50 / last-checkpoint /trainer_state.json
mgh6's picture
Training in progress, step 9400, checkpoint
87c0f4b verified
raw
history blame
37.7 kB
{
"best_metric": 1.0438764095306396,
"best_model_checkpoint": "mgh6/TCS_MLM_50/checkpoint-8900",
"epoch": 2.7285921625544267,
"eval_steps": 100,
"global_step": 9400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02902757619738752,
"grad_norm": 1.131932258605957,
"learning_rate": 9.970972423802612e-05,
"loss": 2.8244,
"step": 100
},
{
"epoch": 0.02902757619738752,
"eval_loss": 1.2662084102630615,
"eval_runtime": 213.5614,
"eval_samples_per_second": 213.049,
"eval_steps_per_second": 3.329,
"step": 100
},
{
"epoch": 0.05805515239477504,
"grad_norm": 1.0239707231521606,
"learning_rate": 9.941944847605225e-05,
"loss": 2.7081,
"step": 200
},
{
"epoch": 0.05805515239477504,
"eval_loss": 1.2453378438949585,
"eval_runtime": 212.9056,
"eval_samples_per_second": 213.705,
"eval_steps_per_second": 3.34,
"step": 200
},
{
"epoch": 0.08708272859216255,
"grad_norm": 1.1205116510391235,
"learning_rate": 9.912917271407838e-05,
"loss": 2.642,
"step": 300
},
{
"epoch": 0.08708272859216255,
"eval_loss": 1.2237757444381714,
"eval_runtime": 214.4447,
"eval_samples_per_second": 212.171,
"eval_steps_per_second": 3.316,
"step": 300
},
{
"epoch": 0.11611030478955008,
"grad_norm": 1.0193355083465576,
"learning_rate": 9.883889695210451e-05,
"loss": 2.6037,
"step": 400
},
{
"epoch": 0.11611030478955008,
"eval_loss": 1.2148627042770386,
"eval_runtime": 213.5123,
"eval_samples_per_second": 213.098,
"eval_steps_per_second": 3.33,
"step": 400
},
{
"epoch": 0.14513788098693758,
"grad_norm": 1.05299711227417,
"learning_rate": 9.854862119013063e-05,
"loss": 2.5791,
"step": 500
},
{
"epoch": 0.14513788098693758,
"eval_loss": 1.2020208835601807,
"eval_runtime": 213.769,
"eval_samples_per_second": 212.842,
"eval_steps_per_second": 3.326,
"step": 500
},
{
"epoch": 0.1741654571843251,
"grad_norm": 1.0508314371109009,
"learning_rate": 9.825834542815675e-05,
"loss": 2.5464,
"step": 600
},
{
"epoch": 0.1741654571843251,
"eval_loss": 1.1960116624832153,
"eval_runtime": 214.1083,
"eval_samples_per_second": 212.505,
"eval_steps_per_second": 3.321,
"step": 600
},
{
"epoch": 0.20319303338171263,
"grad_norm": 1.158460021018982,
"learning_rate": 9.796806966618288e-05,
"loss": 2.5391,
"step": 700
},
{
"epoch": 0.20319303338171263,
"eval_loss": 1.186664342880249,
"eval_runtime": 213.4364,
"eval_samples_per_second": 213.174,
"eval_steps_per_second": 3.331,
"step": 700
},
{
"epoch": 0.23222060957910015,
"grad_norm": 1.0704821348190308,
"learning_rate": 9.767779390420901e-05,
"loss": 2.4944,
"step": 800
},
{
"epoch": 0.23222060957910015,
"eval_loss": 1.1850290298461914,
"eval_runtime": 213.63,
"eval_samples_per_second": 212.98,
"eval_steps_per_second": 3.328,
"step": 800
},
{
"epoch": 0.2612481857764877,
"grad_norm": 1.0562227964401245,
"learning_rate": 9.738751814223513e-05,
"loss": 2.4879,
"step": 900
},
{
"epoch": 0.2612481857764877,
"eval_loss": 1.1725127696990967,
"eval_runtime": 213.7307,
"eval_samples_per_second": 212.88,
"eval_steps_per_second": 3.327,
"step": 900
},
{
"epoch": 0.29027576197387517,
"grad_norm": 1.136777639389038,
"learning_rate": 9.709724238026126e-05,
"loss": 2.4647,
"step": 1000
},
{
"epoch": 0.29027576197387517,
"eval_loss": 1.1709253787994385,
"eval_runtime": 213.2147,
"eval_samples_per_second": 213.395,
"eval_steps_per_second": 3.335,
"step": 1000
},
{
"epoch": 0.3193033381712627,
"grad_norm": 1.0949931144714355,
"learning_rate": 9.680696661828737e-05,
"loss": 2.4441,
"step": 1100
},
{
"epoch": 0.3193033381712627,
"eval_loss": 1.1647560596466064,
"eval_runtime": 213.5056,
"eval_samples_per_second": 213.104,
"eval_steps_per_second": 3.33,
"step": 1100
},
{
"epoch": 0.3483309143686502,
"grad_norm": 1.2719751596450806,
"learning_rate": 9.65166908563135e-05,
"loss": 2.432,
"step": 1200
},
{
"epoch": 0.3483309143686502,
"eval_loss": 1.1668621301651,
"eval_runtime": 213.8017,
"eval_samples_per_second": 212.809,
"eval_steps_per_second": 3.326,
"step": 1200
},
{
"epoch": 0.37735849056603776,
"grad_norm": 1.1357173919677734,
"learning_rate": 9.622641509433963e-05,
"loss": 2.4173,
"step": 1300
},
{
"epoch": 0.37735849056603776,
"eval_loss": 1.1585583686828613,
"eval_runtime": 212.8448,
"eval_samples_per_second": 213.766,
"eval_steps_per_second": 3.34,
"step": 1300
},
{
"epoch": 0.40638606676342526,
"grad_norm": 1.1240577697753906,
"learning_rate": 9.593613933236575e-05,
"loss": 2.4029,
"step": 1400
},
{
"epoch": 0.40638606676342526,
"eval_loss": 1.1513617038726807,
"eval_runtime": 214.5547,
"eval_samples_per_second": 212.063,
"eval_steps_per_second": 3.314,
"step": 1400
},
{
"epoch": 0.43541364296081275,
"grad_norm": 1.074048399925232,
"learning_rate": 9.564586357039188e-05,
"loss": 2.3964,
"step": 1500
},
{
"epoch": 0.43541364296081275,
"eval_loss": 1.1514214277267456,
"eval_runtime": 213.8115,
"eval_samples_per_second": 212.8,
"eval_steps_per_second": 3.325,
"step": 1500
},
{
"epoch": 0.4644412191582003,
"grad_norm": 1.2565686702728271,
"learning_rate": 9.5355587808418e-05,
"loss": 2.3548,
"step": 1600
},
{
"epoch": 0.4644412191582003,
"eval_loss": 1.1476994752883911,
"eval_runtime": 214.3759,
"eval_samples_per_second": 212.239,
"eval_steps_per_second": 3.317,
"step": 1600
},
{
"epoch": 0.4934687953555878,
"grad_norm": 1.1474090814590454,
"learning_rate": 9.506531204644412e-05,
"loss": 2.36,
"step": 1700
},
{
"epoch": 0.4934687953555878,
"eval_loss": 1.1446571350097656,
"eval_runtime": 213.458,
"eval_samples_per_second": 213.152,
"eval_steps_per_second": 3.331,
"step": 1700
},
{
"epoch": 0.5224963715529753,
"grad_norm": 1.2290916442871094,
"learning_rate": 9.477503628447025e-05,
"loss": 2.3438,
"step": 1800
},
{
"epoch": 0.5224963715529753,
"eval_loss": 1.1393438577651978,
"eval_runtime": 213.014,
"eval_samples_per_second": 213.596,
"eval_steps_per_second": 3.338,
"step": 1800
},
{
"epoch": 0.5515239477503628,
"grad_norm": 1.1700950860977173,
"learning_rate": 9.448476052249638e-05,
"loss": 2.3416,
"step": 1900
},
{
"epoch": 0.5515239477503628,
"eval_loss": 1.1348192691802979,
"eval_runtime": 213.2252,
"eval_samples_per_second": 213.385,
"eval_steps_per_second": 3.335,
"step": 1900
},
{
"epoch": 0.5805515239477503,
"grad_norm": 1.1090705394744873,
"learning_rate": 9.419448476052251e-05,
"loss": 2.3289,
"step": 2000
},
{
"epoch": 0.5805515239477503,
"eval_loss": 1.130873203277588,
"eval_runtime": 212.7564,
"eval_samples_per_second": 213.855,
"eval_steps_per_second": 3.342,
"step": 2000
},
{
"epoch": 0.6095791001451378,
"grad_norm": 1.17753267288208,
"learning_rate": 9.390420899854863e-05,
"loss": 2.3218,
"step": 2100
},
{
"epoch": 0.6095791001451378,
"eval_loss": 1.1335190534591675,
"eval_runtime": 212.7619,
"eval_samples_per_second": 213.849,
"eval_steps_per_second": 3.342,
"step": 2100
},
{
"epoch": 0.6386066763425254,
"grad_norm": 1.087358832359314,
"learning_rate": 9.361393323657474e-05,
"loss": 2.3072,
"step": 2200
},
{
"epoch": 0.6386066763425254,
"eval_loss": 1.1303313970565796,
"eval_runtime": 213.3449,
"eval_samples_per_second": 213.265,
"eval_steps_per_second": 3.333,
"step": 2200
},
{
"epoch": 0.6676342525399129,
"grad_norm": 1.1286981105804443,
"learning_rate": 9.332365747460087e-05,
"loss": 2.2881,
"step": 2300
},
{
"epoch": 0.6676342525399129,
"eval_loss": 1.1234804391860962,
"eval_runtime": 213.3465,
"eval_samples_per_second": 213.263,
"eval_steps_per_second": 3.333,
"step": 2300
},
{
"epoch": 0.6966618287373004,
"grad_norm": 1.1590163707733154,
"learning_rate": 9.3033381712627e-05,
"loss": 2.2751,
"step": 2400
},
{
"epoch": 0.6966618287373004,
"eval_loss": 1.120328664779663,
"eval_runtime": 213.9246,
"eval_samples_per_second": 212.687,
"eval_steps_per_second": 3.324,
"step": 2400
},
{
"epoch": 0.7256894049346879,
"grad_norm": 1.3988169431686401,
"learning_rate": 9.274310595065312e-05,
"loss": 2.2666,
"step": 2500
},
{
"epoch": 0.7256894049346879,
"eval_loss": 1.1266223192214966,
"eval_runtime": 214.3634,
"eval_samples_per_second": 212.252,
"eval_steps_per_second": 3.317,
"step": 2500
},
{
"epoch": 0.7547169811320755,
"grad_norm": 1.239560842514038,
"learning_rate": 9.245283018867925e-05,
"loss": 2.2702,
"step": 2600
},
{
"epoch": 0.7547169811320755,
"eval_loss": 1.1224210262298584,
"eval_runtime": 213.2424,
"eval_samples_per_second": 213.367,
"eval_steps_per_second": 3.334,
"step": 2600
},
{
"epoch": 0.783744557329463,
"grad_norm": 1.1289948225021362,
"learning_rate": 9.216255442670537e-05,
"loss": 2.256,
"step": 2700
},
{
"epoch": 0.783744557329463,
"eval_loss": 1.1150513887405396,
"eval_runtime": 213.4486,
"eval_samples_per_second": 213.161,
"eval_steps_per_second": 3.331,
"step": 2700
},
{
"epoch": 0.8127721335268505,
"grad_norm": 1.1463016271591187,
"learning_rate": 9.18722786647315e-05,
"loss": 2.2483,
"step": 2800
},
{
"epoch": 0.8127721335268505,
"eval_loss": 1.1185483932495117,
"eval_runtime": 212.704,
"eval_samples_per_second": 213.908,
"eval_steps_per_second": 3.343,
"step": 2800
},
{
"epoch": 0.841799709724238,
"grad_norm": 1.1233168840408325,
"learning_rate": 9.158200290275763e-05,
"loss": 2.2328,
"step": 2900
},
{
"epoch": 0.841799709724238,
"eval_loss": 1.1085420846939087,
"eval_runtime": 213.7255,
"eval_samples_per_second": 212.885,
"eval_steps_per_second": 3.327,
"step": 2900
},
{
"epoch": 0.8708272859216255,
"grad_norm": 1.1887527704238892,
"learning_rate": 9.129172714078375e-05,
"loss": 2.235,
"step": 3000
},
{
"epoch": 0.8708272859216255,
"eval_loss": 1.1104073524475098,
"eval_runtime": 213.9252,
"eval_samples_per_second": 212.687,
"eval_steps_per_second": 3.324,
"step": 3000
},
{
"epoch": 0.8998548621190131,
"grad_norm": 1.2834577560424805,
"learning_rate": 9.100145137880988e-05,
"loss": 2.2209,
"step": 3100
},
{
"epoch": 0.8998548621190131,
"eval_loss": 1.1137757301330566,
"eval_runtime": 213.6201,
"eval_samples_per_second": 212.99,
"eval_steps_per_second": 3.328,
"step": 3100
},
{
"epoch": 0.9288824383164006,
"grad_norm": 1.3034873008728027,
"learning_rate": 9.0711175616836e-05,
"loss": 2.2185,
"step": 3200
},
{
"epoch": 0.9288824383164006,
"eval_loss": 1.107863187789917,
"eval_runtime": 213.1098,
"eval_samples_per_second": 213.5,
"eval_steps_per_second": 3.336,
"step": 3200
},
{
"epoch": 0.9579100145137881,
"grad_norm": 1.1802492141723633,
"learning_rate": 9.042089985486212e-05,
"loss": 2.2147,
"step": 3300
},
{
"epoch": 0.9579100145137881,
"eval_loss": 1.1041762828826904,
"eval_runtime": 213.2962,
"eval_samples_per_second": 213.314,
"eval_steps_per_second": 3.333,
"step": 3300
},
{
"epoch": 0.9869375907111756,
"grad_norm": 1.2992894649505615,
"learning_rate": 9.013062409288826e-05,
"loss": 2.216,
"step": 3400
},
{
"epoch": 0.9869375907111756,
"eval_loss": 1.1009138822555542,
"eval_runtime": 213.7998,
"eval_samples_per_second": 212.811,
"eval_steps_per_second": 3.326,
"step": 3400
},
{
"epoch": 1.0159651669085632,
"grad_norm": 1.1432065963745117,
"learning_rate": 8.984034833091437e-05,
"loss": 2.1952,
"step": 3500
},
{
"epoch": 1.0159651669085632,
"eval_loss": 1.106726884841919,
"eval_runtime": 213.7054,
"eval_samples_per_second": 212.905,
"eval_steps_per_second": 3.327,
"step": 3500
},
{
"epoch": 1.0449927431059507,
"grad_norm": 1.1603158712387085,
"learning_rate": 8.95500725689405e-05,
"loss": 2.2019,
"step": 3600
},
{
"epoch": 1.0449927431059507,
"eval_loss": 1.1014330387115479,
"eval_runtime": 213.1977,
"eval_samples_per_second": 213.412,
"eval_steps_per_second": 3.335,
"step": 3600
},
{
"epoch": 1.0740203193033382,
"grad_norm": 1.2428488731384277,
"learning_rate": 8.925979680696662e-05,
"loss": 2.1959,
"step": 3700
},
{
"epoch": 1.0740203193033382,
"eval_loss": 1.1004406213760376,
"eval_runtime": 213.3658,
"eval_samples_per_second": 213.244,
"eval_steps_per_second": 3.332,
"step": 3700
},
{
"epoch": 1.1030478955007257,
"grad_norm": 1.1615545749664307,
"learning_rate": 8.896952104499274e-05,
"loss": 2.1776,
"step": 3800
},
{
"epoch": 1.1030478955007257,
"eval_loss": 1.0938160419464111,
"eval_runtime": 213.3987,
"eval_samples_per_second": 213.211,
"eval_steps_per_second": 3.332,
"step": 3800
},
{
"epoch": 1.1320754716981132,
"grad_norm": 1.1921610832214355,
"learning_rate": 8.867924528301888e-05,
"loss": 2.1762,
"step": 3900
},
{
"epoch": 1.1320754716981132,
"eval_loss": 1.0960694551467896,
"eval_runtime": 213.1832,
"eval_samples_per_second": 213.427,
"eval_steps_per_second": 3.335,
"step": 3900
},
{
"epoch": 1.1611030478955007,
"grad_norm": 1.1980363130569458,
"learning_rate": 8.8388969521045e-05,
"loss": 2.1717,
"step": 4000
},
{
"epoch": 1.1611030478955007,
"eval_loss": 1.0951919555664062,
"eval_runtime": 213.4024,
"eval_samples_per_second": 213.207,
"eval_steps_per_second": 3.332,
"step": 4000
},
{
"epoch": 1.1901306240928882,
"grad_norm": 1.217236042022705,
"learning_rate": 8.809869375907113e-05,
"loss": 2.1534,
"step": 4100
},
{
"epoch": 1.1901306240928882,
"eval_loss": 1.0937577486038208,
"eval_runtime": 213.8113,
"eval_samples_per_second": 212.8,
"eval_steps_per_second": 3.325,
"step": 4100
},
{
"epoch": 1.2191582002902757,
"grad_norm": 1.2121118307113647,
"learning_rate": 8.780841799709725e-05,
"loss": 2.1639,
"step": 4200
},
{
"epoch": 1.2191582002902757,
"eval_loss": 1.0909945964813232,
"eval_runtime": 212.8308,
"eval_samples_per_second": 213.78,
"eval_steps_per_second": 3.341,
"step": 4200
},
{
"epoch": 1.2481857764876634,
"grad_norm": 1.17587411403656,
"learning_rate": 8.751814223512336e-05,
"loss": 2.146,
"step": 4300
},
{
"epoch": 1.2481857764876634,
"eval_loss": 1.0888868570327759,
"eval_runtime": 213.8752,
"eval_samples_per_second": 212.736,
"eval_steps_per_second": 3.324,
"step": 4300
},
{
"epoch": 1.2772133526850509,
"grad_norm": 1.2848412990570068,
"learning_rate": 8.722786647314949e-05,
"loss": 2.1357,
"step": 4400
},
{
"epoch": 1.2772133526850509,
"eval_loss": 1.091068983078003,
"eval_runtime": 213.4081,
"eval_samples_per_second": 213.202,
"eval_steps_per_second": 3.332,
"step": 4400
},
{
"epoch": 1.3062409288824384,
"grad_norm": 1.2059731483459473,
"learning_rate": 8.693759071117562e-05,
"loss": 2.1456,
"step": 4500
},
{
"epoch": 1.3062409288824384,
"eval_loss": 1.0857021808624268,
"eval_runtime": 213.7314,
"eval_samples_per_second": 212.879,
"eval_steps_per_second": 3.327,
"step": 4500
},
{
"epoch": 1.3352685050798259,
"grad_norm": 1.226241946220398,
"learning_rate": 8.664731494920174e-05,
"loss": 2.1453,
"step": 4600
},
{
"epoch": 1.3352685050798259,
"eval_loss": 1.0845140218734741,
"eval_runtime": 213.4698,
"eval_samples_per_second": 213.14,
"eval_steps_per_second": 3.331,
"step": 4600
},
{
"epoch": 1.3642960812772134,
"grad_norm": 1.1810499429702759,
"learning_rate": 8.635703918722787e-05,
"loss": 2.1425,
"step": 4700
},
{
"epoch": 1.3642960812772134,
"eval_loss": 1.0831544399261475,
"eval_runtime": 214.2077,
"eval_samples_per_second": 212.406,
"eval_steps_per_second": 3.319,
"step": 4700
},
{
"epoch": 1.3933236574746009,
"grad_norm": 1.155281662940979,
"learning_rate": 8.606676342525399e-05,
"loss": 2.1173,
"step": 4800
},
{
"epoch": 1.3933236574746009,
"eval_loss": 1.0785441398620605,
"eval_runtime": 213.6973,
"eval_samples_per_second": 212.913,
"eval_steps_per_second": 3.327,
"step": 4800
},
{
"epoch": 1.4223512336719883,
"grad_norm": 1.2070744037628174,
"learning_rate": 8.577648766328012e-05,
"loss": 2.1183,
"step": 4900
},
{
"epoch": 1.4223512336719883,
"eval_loss": 1.0808286666870117,
"eval_runtime": 213.4564,
"eval_samples_per_second": 213.154,
"eval_steps_per_second": 3.331,
"step": 4900
},
{
"epoch": 1.4513788098693758,
"grad_norm": 1.1901525259017944,
"learning_rate": 8.548621190130625e-05,
"loss": 2.1274,
"step": 5000
},
{
"epoch": 1.4513788098693758,
"eval_loss": 1.0827044248580933,
"eval_runtime": 212.5592,
"eval_samples_per_second": 214.053,
"eval_steps_per_second": 3.345,
"step": 5000
},
{
"epoch": 1.4804063860667633,
"grad_norm": 1.1999766826629639,
"learning_rate": 8.519593613933237e-05,
"loss": 2.1145,
"step": 5100
},
{
"epoch": 1.4804063860667633,
"eval_loss": 1.078644037246704,
"eval_runtime": 213.0532,
"eval_samples_per_second": 213.557,
"eval_steps_per_second": 3.337,
"step": 5100
},
{
"epoch": 1.509433962264151,
"grad_norm": 1.2294871807098389,
"learning_rate": 8.49056603773585e-05,
"loss": 2.1067,
"step": 5200
},
{
"epoch": 1.509433962264151,
"eval_loss": 1.0794402360916138,
"eval_runtime": 212.9617,
"eval_samples_per_second": 213.649,
"eval_steps_per_second": 3.339,
"step": 5200
},
{
"epoch": 1.5384615384615383,
"grad_norm": 1.2571580410003662,
"learning_rate": 8.461538461538461e-05,
"loss": 2.1032,
"step": 5300
},
{
"epoch": 1.5384615384615383,
"eval_loss": 1.0783346891403198,
"eval_runtime": 213.4656,
"eval_samples_per_second": 213.144,
"eval_steps_per_second": 3.331,
"step": 5300
},
{
"epoch": 1.567489114658926,
"grad_norm": 1.2078722715377808,
"learning_rate": 8.432510885341074e-05,
"loss": 2.0912,
"step": 5400
},
{
"epoch": 1.567489114658926,
"eval_loss": 1.0764219760894775,
"eval_runtime": 213.826,
"eval_samples_per_second": 212.785,
"eval_steps_per_second": 3.325,
"step": 5400
},
{
"epoch": 1.5965166908563135,
"grad_norm": 1.272294521331787,
"learning_rate": 8.403483309143688e-05,
"loss": 2.0784,
"step": 5500
},
{
"epoch": 1.5965166908563135,
"eval_loss": 1.0817687511444092,
"eval_runtime": 213.443,
"eval_samples_per_second": 213.167,
"eval_steps_per_second": 3.331,
"step": 5500
},
{
"epoch": 1.625544267053701,
"grad_norm": 1.2367442846298218,
"learning_rate": 8.374455732946299e-05,
"loss": 2.0997,
"step": 5600
},
{
"epoch": 1.625544267053701,
"eval_loss": 1.079858660697937,
"eval_runtime": 213.7339,
"eval_samples_per_second": 212.877,
"eval_steps_per_second": 3.327,
"step": 5600
},
{
"epoch": 1.6545718432510885,
"grad_norm": 1.2720229625701904,
"learning_rate": 8.345428156748912e-05,
"loss": 2.093,
"step": 5700
},
{
"epoch": 1.6545718432510885,
"eval_loss": 1.0779507160186768,
"eval_runtime": 213.2034,
"eval_samples_per_second": 213.407,
"eval_steps_per_second": 3.335,
"step": 5700
},
{
"epoch": 1.683599419448476,
"grad_norm": 1.1694726943969727,
"learning_rate": 8.316400580551524e-05,
"loss": 2.0822,
"step": 5800
},
{
"epoch": 1.683599419448476,
"eval_loss": 1.068250060081482,
"eval_runtime": 213.1022,
"eval_samples_per_second": 213.508,
"eval_steps_per_second": 3.336,
"step": 5800
},
{
"epoch": 1.7126269956458637,
"grad_norm": 1.2155323028564453,
"learning_rate": 8.287373004354137e-05,
"loss": 2.0792,
"step": 5900
},
{
"epoch": 1.7126269956458637,
"eval_loss": 1.0666776895523071,
"eval_runtime": 213.4935,
"eval_samples_per_second": 213.117,
"eval_steps_per_second": 3.33,
"step": 5900
},
{
"epoch": 1.741654571843251,
"grad_norm": 1.3163602352142334,
"learning_rate": 8.25834542815675e-05,
"loss": 2.0712,
"step": 6000
},
{
"epoch": 1.741654571843251,
"eval_loss": 1.0677340030670166,
"eval_runtime": 213.751,
"eval_samples_per_second": 212.86,
"eval_steps_per_second": 3.326,
"step": 6000
},
{
"epoch": 1.7706821480406387,
"grad_norm": 1.1972286701202393,
"learning_rate": 8.229317851959362e-05,
"loss": 2.0679,
"step": 6100
},
{
"epoch": 1.7706821480406387,
"eval_loss": 1.0662775039672852,
"eval_runtime": 213.7705,
"eval_samples_per_second": 212.84,
"eval_steps_per_second": 3.326,
"step": 6100
},
{
"epoch": 1.799709724238026,
"grad_norm": 1.189395546913147,
"learning_rate": 8.200290275761974e-05,
"loss": 2.0753,
"step": 6200
},
{
"epoch": 1.799709724238026,
"eval_loss": 1.0646038055419922,
"eval_runtime": 213.3945,
"eval_samples_per_second": 213.215,
"eval_steps_per_second": 3.332,
"step": 6200
},
{
"epoch": 1.8287373004354137,
"grad_norm": 1.2696415185928345,
"learning_rate": 8.171262699564587e-05,
"loss": 2.063,
"step": 6300
},
{
"epoch": 1.8287373004354137,
"eval_loss": 1.0669814348220825,
"eval_runtime": 213.7587,
"eval_samples_per_second": 212.852,
"eval_steps_per_second": 3.326,
"step": 6300
},
{
"epoch": 1.8577648766328012,
"grad_norm": 1.241452693939209,
"learning_rate": 8.142235123367198e-05,
"loss": 2.0508,
"step": 6400
},
{
"epoch": 1.8577648766328012,
"eval_loss": 1.072275996208191,
"eval_runtime": 213.3197,
"eval_samples_per_second": 213.29,
"eval_steps_per_second": 3.333,
"step": 6400
},
{
"epoch": 1.8867924528301887,
"grad_norm": 1.22267484664917,
"learning_rate": 8.113207547169813e-05,
"loss": 2.07,
"step": 6500
},
{
"epoch": 1.8867924528301887,
"eval_loss": 1.0654535293579102,
"eval_runtime": 214.0386,
"eval_samples_per_second": 212.574,
"eval_steps_per_second": 3.322,
"step": 6500
},
{
"epoch": 1.9158200290275762,
"grad_norm": 1.2704839706420898,
"learning_rate": 8.084179970972424e-05,
"loss": 2.0646,
"step": 6600
},
{
"epoch": 1.9158200290275762,
"eval_loss": 1.0614382028579712,
"eval_runtime": 213.4971,
"eval_samples_per_second": 213.113,
"eval_steps_per_second": 3.33,
"step": 6600
},
{
"epoch": 1.9448476052249637,
"grad_norm": 1.3870867490768433,
"learning_rate": 8.055152394775036e-05,
"loss": 2.0598,
"step": 6700
},
{
"epoch": 1.9448476052249637,
"eval_loss": 1.067047357559204,
"eval_runtime": 214.0952,
"eval_samples_per_second": 212.518,
"eval_steps_per_second": 3.321,
"step": 6700
},
{
"epoch": 1.9738751814223512,
"grad_norm": 1.3581643104553223,
"learning_rate": 8.026124818577649e-05,
"loss": 2.0501,
"step": 6800
},
{
"epoch": 1.9738751814223512,
"eval_loss": 1.0663081407546997,
"eval_runtime": 213.8995,
"eval_samples_per_second": 212.712,
"eval_steps_per_second": 3.324,
"step": 6800
},
{
"epoch": 2.0029027576197387,
"grad_norm": 1.3438752889633179,
"learning_rate": 7.997097242380261e-05,
"loss": 2.0332,
"step": 6900
},
{
"epoch": 2.0029027576197387,
"eval_loss": 1.059921383857727,
"eval_runtime": 213.0183,
"eval_samples_per_second": 213.592,
"eval_steps_per_second": 3.338,
"step": 6900
},
{
"epoch": 2.0319303338171264,
"grad_norm": 1.3646849393844604,
"learning_rate": 7.968069666182875e-05,
"loss": 2.0463,
"step": 7000
},
{
"epoch": 2.0319303338171264,
"eval_loss": 1.0679893493652344,
"eval_runtime": 213.3912,
"eval_samples_per_second": 213.219,
"eval_steps_per_second": 3.332,
"step": 7000
},
{
"epoch": 2.0609579100145137,
"grad_norm": 1.2047359943389893,
"learning_rate": 7.939042089985487e-05,
"loss": 2.0376,
"step": 7100
},
{
"epoch": 2.0609579100145137,
"eval_loss": 1.0566322803497314,
"eval_runtime": 213.6266,
"eval_samples_per_second": 212.984,
"eval_steps_per_second": 3.328,
"step": 7100
},
{
"epoch": 2.0899854862119014,
"grad_norm": 1.2285219430923462,
"learning_rate": 7.910014513788099e-05,
"loss": 2.0327,
"step": 7200
},
{
"epoch": 2.0899854862119014,
"eval_loss": 1.058618426322937,
"eval_runtime": 213.6922,
"eval_samples_per_second": 212.918,
"eval_steps_per_second": 3.327,
"step": 7200
},
{
"epoch": 2.1190130624092887,
"grad_norm": 1.2674715518951416,
"learning_rate": 7.880986937590712e-05,
"loss": 2.0347,
"step": 7300
},
{
"epoch": 2.1190130624092887,
"eval_loss": 1.0599507093429565,
"eval_runtime": 213.5256,
"eval_samples_per_second": 213.085,
"eval_steps_per_second": 3.33,
"step": 7300
},
{
"epoch": 2.1480406386066764,
"grad_norm": 1.3713229894638062,
"learning_rate": 7.851959361393323e-05,
"loss": 2.0321,
"step": 7400
},
{
"epoch": 2.1480406386066764,
"eval_loss": 1.0617178678512573,
"eval_runtime": 213.0273,
"eval_samples_per_second": 213.583,
"eval_steps_per_second": 3.338,
"step": 7400
},
{
"epoch": 2.1770682148040637,
"grad_norm": 1.292090654373169,
"learning_rate": 7.822931785195937e-05,
"loss": 2.01,
"step": 7500
},
{
"epoch": 2.1770682148040637,
"eval_loss": 1.0593364238739014,
"eval_runtime": 213.421,
"eval_samples_per_second": 213.189,
"eval_steps_per_second": 3.331,
"step": 7500
},
{
"epoch": 2.2060957910014514,
"grad_norm": 1.1819452047348022,
"learning_rate": 7.79390420899855e-05,
"loss": 2.0209,
"step": 7600
},
{
"epoch": 2.2060957910014514,
"eval_loss": 1.0524711608886719,
"eval_runtime": 214.0149,
"eval_samples_per_second": 212.597,
"eval_steps_per_second": 3.322,
"step": 7600
},
{
"epoch": 2.235123367198839,
"grad_norm": 1.2881128787994385,
"learning_rate": 7.764876632801161e-05,
"loss": 2.0085,
"step": 7700
},
{
"epoch": 2.235123367198839,
"eval_loss": 1.0567752122879028,
"eval_runtime": 213.6228,
"eval_samples_per_second": 212.988,
"eval_steps_per_second": 3.328,
"step": 7700
},
{
"epoch": 2.2641509433962264,
"grad_norm": 1.2962584495544434,
"learning_rate": 7.735849056603774e-05,
"loss": 2.0204,
"step": 7800
},
{
"epoch": 2.2641509433962264,
"eval_loss": 1.0586293935775757,
"eval_runtime": 213.3516,
"eval_samples_per_second": 213.258,
"eval_steps_per_second": 3.333,
"step": 7800
},
{
"epoch": 2.293178519593614,
"grad_norm": 1.2214884757995605,
"learning_rate": 7.706821480406386e-05,
"loss": 2.0184,
"step": 7900
},
{
"epoch": 2.293178519593614,
"eval_loss": 1.0525050163269043,
"eval_runtime": 212.5483,
"eval_samples_per_second": 214.064,
"eval_steps_per_second": 3.345,
"step": 7900
},
{
"epoch": 2.3222060957910013,
"grad_norm": 1.2622853517532349,
"learning_rate": 7.677793904208999e-05,
"loss": 2.0162,
"step": 8000
},
{
"epoch": 2.3222060957910013,
"eval_loss": 1.0512940883636475,
"eval_runtime": 212.6462,
"eval_samples_per_second": 213.966,
"eval_steps_per_second": 3.344,
"step": 8000
},
{
"epoch": 2.351233671988389,
"grad_norm": 1.2338088750839233,
"learning_rate": 7.648766328011612e-05,
"loss": 2.0029,
"step": 8100
},
{
"epoch": 2.351233671988389,
"eval_loss": 1.0521414279937744,
"eval_runtime": 213.5358,
"eval_samples_per_second": 213.074,
"eval_steps_per_second": 3.33,
"step": 8100
},
{
"epoch": 2.3802612481857763,
"grad_norm": 1.2111109495162964,
"learning_rate": 7.619738751814224e-05,
"loss": 2.0101,
"step": 8200
},
{
"epoch": 2.3802612481857763,
"eval_loss": 1.0501890182495117,
"eval_runtime": 213.0351,
"eval_samples_per_second": 213.575,
"eval_steps_per_second": 3.337,
"step": 8200
},
{
"epoch": 2.409288824383164,
"grad_norm": 1.2333025932312012,
"learning_rate": 7.590711175616836e-05,
"loss": 2.0,
"step": 8300
},
{
"epoch": 2.409288824383164,
"eval_loss": 1.051579236984253,
"eval_runtime": 213.5529,
"eval_samples_per_second": 213.057,
"eval_steps_per_second": 3.329,
"step": 8300
},
{
"epoch": 2.4383164005805513,
"grad_norm": 1.3394699096679688,
"learning_rate": 7.561683599419449e-05,
"loss": 1.9986,
"step": 8400
},
{
"epoch": 2.4383164005805513,
"eval_loss": 1.0520364046096802,
"eval_runtime": 212.3818,
"eval_samples_per_second": 214.232,
"eval_steps_per_second": 3.348,
"step": 8400
},
{
"epoch": 2.467343976777939,
"grad_norm": 1.334936261177063,
"learning_rate": 7.532656023222062e-05,
"loss": 1.993,
"step": 8500
},
{
"epoch": 2.467343976777939,
"eval_loss": 1.0490361452102661,
"eval_runtime": 213.9415,
"eval_samples_per_second": 212.67,
"eval_steps_per_second": 3.323,
"step": 8500
},
{
"epoch": 2.4963715529753268,
"grad_norm": 1.3085263967514038,
"learning_rate": 7.503628447024675e-05,
"loss": 1.9771,
"step": 8600
},
{
"epoch": 2.4963715529753268,
"eval_loss": 1.0522186756134033,
"eval_runtime": 212.3302,
"eval_samples_per_second": 214.284,
"eval_steps_per_second": 3.349,
"step": 8600
},
{
"epoch": 2.525399129172714,
"grad_norm": 1.4204107522964478,
"learning_rate": 7.474600870827286e-05,
"loss": 1.9848,
"step": 8700
},
{
"epoch": 2.525399129172714,
"eval_loss": 1.0486035346984863,
"eval_runtime": 213.5477,
"eval_samples_per_second": 213.062,
"eval_steps_per_second": 3.329,
"step": 8700
},
{
"epoch": 2.5544267053701017,
"grad_norm": 1.2411503791809082,
"learning_rate": 7.445573294629898e-05,
"loss": 2.0016,
"step": 8800
},
{
"epoch": 2.5544267053701017,
"eval_loss": 1.0516774654388428,
"eval_runtime": 213.2425,
"eval_samples_per_second": 213.367,
"eval_steps_per_second": 3.334,
"step": 8800
},
{
"epoch": 2.583454281567489,
"grad_norm": 1.2166720628738403,
"learning_rate": 7.416545718432511e-05,
"loss": 1.9761,
"step": 8900
},
{
"epoch": 2.583454281567489,
"eval_loss": 1.0438764095306396,
"eval_runtime": 213.2447,
"eval_samples_per_second": 213.365,
"eval_steps_per_second": 3.334,
"step": 8900
},
{
"epoch": 2.6124818577648767,
"grad_norm": 1.307707667350769,
"learning_rate": 7.387518142235124e-05,
"loss": 1.9753,
"step": 9000
},
{
"epoch": 2.6124818577648767,
"eval_loss": 1.0445740222930908,
"eval_runtime": 212.5813,
"eval_samples_per_second": 214.031,
"eval_steps_per_second": 3.345,
"step": 9000
},
{
"epoch": 2.641509433962264,
"grad_norm": 1.3446862697601318,
"learning_rate": 7.358490566037736e-05,
"loss": 1.9795,
"step": 9100
},
{
"epoch": 2.641509433962264,
"eval_loss": 1.0461750030517578,
"eval_runtime": 213.2022,
"eval_samples_per_second": 213.408,
"eval_steps_per_second": 3.335,
"step": 9100
},
{
"epoch": 2.6705370101596517,
"grad_norm": 1.25364351272583,
"learning_rate": 7.329462989840349e-05,
"loss": 1.966,
"step": 9200
},
{
"epoch": 2.6705370101596517,
"eval_loss": 1.0489540100097656,
"eval_runtime": 213.3373,
"eval_samples_per_second": 213.273,
"eval_steps_per_second": 3.333,
"step": 9200
},
{
"epoch": 2.699564586357039,
"grad_norm": 1.317325472831726,
"learning_rate": 7.300435413642961e-05,
"loss": 1.9853,
"step": 9300
},
{
"epoch": 2.699564586357039,
"eval_loss": 1.04426109790802,
"eval_runtime": 212.5953,
"eval_samples_per_second": 214.017,
"eval_steps_per_second": 3.344,
"step": 9300
},
{
"epoch": 2.7285921625544267,
"grad_norm": 1.2580476999282837,
"learning_rate": 7.271407837445574e-05,
"loss": 1.9873,
"step": 9400
},
{
"epoch": 2.7285921625544267,
"eval_loss": 1.0441796779632568,
"eval_runtime": 213.1744,
"eval_samples_per_second": 213.436,
"eval_steps_per_second": 3.335,
"step": 9400
}
],
"logging_steps": 100,
"max_steps": 34450,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 5
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.403409048272896e+16,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}