TCS_MLM_50 / last-checkpoint /trainer_state.json
mgh6's picture
Training in progress, step 13000, checkpoint
0a75514 verified
{
"best_metric": 1.0211207866668701,
"best_model_checkpoint": "mgh6/TCS_MLM_50/checkpoint-12500",
"epoch": 3.7735849056603774,
"eval_steps": 100,
"global_step": 13000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02902757619738752,
"grad_norm": 1.1695395708084106,
"learning_rate": 9.970972423802612e-05,
"loss": 2.8263,
"step": 100
},
{
"epoch": 0.02902757619738752,
"eval_loss": 1.2625532150268555,
"eval_runtime": 213.9369,
"eval_samples_per_second": 212.651,
"eval_steps_per_second": 3.323,
"step": 100
},
{
"epoch": 0.05805515239477504,
"grad_norm": 1.1860003471374512,
"learning_rate": 9.941944847605225e-05,
"loss": 2.7152,
"step": 200
},
{
"epoch": 0.05805515239477504,
"eval_loss": 1.2428085803985596,
"eval_runtime": 214.2707,
"eval_samples_per_second": 212.32,
"eval_steps_per_second": 3.318,
"step": 200
},
{
"epoch": 0.08708272859216255,
"grad_norm": 1.138083577156067,
"learning_rate": 9.912917271407838e-05,
"loss": 2.6496,
"step": 300
},
{
"epoch": 0.08708272859216255,
"eval_loss": 1.2260128259658813,
"eval_runtime": 214.1595,
"eval_samples_per_second": 212.43,
"eval_steps_per_second": 3.32,
"step": 300
},
{
"epoch": 0.11611030478955008,
"grad_norm": 1.118958592414856,
"learning_rate": 9.883889695210451e-05,
"loss": 2.6016,
"step": 400
},
{
"epoch": 0.11611030478955008,
"eval_loss": 1.2176499366760254,
"eval_runtime": 214.7144,
"eval_samples_per_second": 211.881,
"eval_steps_per_second": 3.311,
"step": 400
},
{
"epoch": 0.14513788098693758,
"grad_norm": 1.0901985168457031,
"learning_rate": 9.854862119013063e-05,
"loss": 2.5744,
"step": 500
},
{
"epoch": 0.14513788098693758,
"eval_loss": 1.200432300567627,
"eval_runtime": 214.0554,
"eval_samples_per_second": 212.534,
"eval_steps_per_second": 3.322,
"step": 500
},
{
"epoch": 0.1741654571843251,
"grad_norm": 1.1364562511444092,
"learning_rate": 9.825834542815675e-05,
"loss": 2.5412,
"step": 600
},
{
"epoch": 0.1741654571843251,
"eval_loss": 1.19223153591156,
"eval_runtime": 214.4792,
"eval_samples_per_second": 212.114,
"eval_steps_per_second": 3.315,
"step": 600
},
{
"epoch": 0.20319303338171263,
"grad_norm": 1.1283212900161743,
"learning_rate": 9.796806966618288e-05,
"loss": 2.5318,
"step": 700
},
{
"epoch": 0.20319303338171263,
"eval_loss": 1.1891732215881348,
"eval_runtime": 214.0534,
"eval_samples_per_second": 212.536,
"eval_steps_per_second": 3.322,
"step": 700
},
{
"epoch": 0.23222060957910015,
"grad_norm": 1.123288631439209,
"learning_rate": 9.767779390420901e-05,
"loss": 2.4897,
"step": 800
},
{
"epoch": 0.23222060957910015,
"eval_loss": 1.1818548440933228,
"eval_runtime": 214.4047,
"eval_samples_per_second": 212.187,
"eval_steps_per_second": 3.316,
"step": 800
},
{
"epoch": 0.2612481857764877,
"grad_norm": 1.2019628286361694,
"learning_rate": 9.738751814223513e-05,
"loss": 2.4833,
"step": 900
},
{
"epoch": 0.2612481857764877,
"eval_loss": 1.169024109840393,
"eval_runtime": 214.4504,
"eval_samples_per_second": 212.142,
"eval_steps_per_second": 3.315,
"step": 900
},
{
"epoch": 0.29027576197387517,
"grad_norm": 1.1262823343276978,
"learning_rate": 9.709724238026126e-05,
"loss": 2.4637,
"step": 1000
},
{
"epoch": 0.29027576197387517,
"eval_loss": 1.1714328527450562,
"eval_runtime": 213.9877,
"eval_samples_per_second": 212.601,
"eval_steps_per_second": 3.323,
"step": 1000
},
{
"epoch": 0.3193033381712627,
"grad_norm": 1.1507551670074463,
"learning_rate": 9.680696661828737e-05,
"loss": 2.4408,
"step": 1100
},
{
"epoch": 0.3193033381712627,
"eval_loss": 1.165282130241394,
"eval_runtime": 214.1235,
"eval_samples_per_second": 212.466,
"eval_steps_per_second": 3.321,
"step": 1100
},
{
"epoch": 0.3483309143686502,
"grad_norm": 1.1482508182525635,
"learning_rate": 9.65166908563135e-05,
"loss": 2.4353,
"step": 1200
},
{
"epoch": 0.3483309143686502,
"eval_loss": 1.162631630897522,
"eval_runtime": 214.0787,
"eval_samples_per_second": 212.511,
"eval_steps_per_second": 3.321,
"step": 1200
},
{
"epoch": 0.37735849056603776,
"grad_norm": 1.1512328386306763,
"learning_rate": 9.622641509433963e-05,
"loss": 2.404,
"step": 1300
},
{
"epoch": 0.37735849056603776,
"eval_loss": 1.1568048000335693,
"eval_runtime": 214.1359,
"eval_samples_per_second": 212.454,
"eval_steps_per_second": 3.32,
"step": 1300
},
{
"epoch": 0.40638606676342526,
"grad_norm": 1.1897668838500977,
"learning_rate": 9.593613933236575e-05,
"loss": 2.3905,
"step": 1400
},
{
"epoch": 0.40638606676342526,
"eval_loss": 1.1544520854949951,
"eval_runtime": 214.4841,
"eval_samples_per_second": 212.109,
"eval_steps_per_second": 3.315,
"step": 1400
},
{
"epoch": 0.43541364296081275,
"grad_norm": 1.1775190830230713,
"learning_rate": 9.564586357039188e-05,
"loss": 2.3792,
"step": 1500
},
{
"epoch": 0.43541364296081275,
"eval_loss": 1.1439248323440552,
"eval_runtime": 213.8637,
"eval_samples_per_second": 212.724,
"eval_steps_per_second": 3.325,
"step": 1500
},
{
"epoch": 0.4644412191582003,
"grad_norm": 1.1436594724655151,
"learning_rate": 9.5355587808418e-05,
"loss": 2.3592,
"step": 1600
},
{
"epoch": 0.4644412191582003,
"eval_loss": 1.1428674459457397,
"eval_runtime": 214.3984,
"eval_samples_per_second": 212.194,
"eval_steps_per_second": 3.316,
"step": 1600
},
{
"epoch": 0.4934687953555878,
"grad_norm": 1.1852160692214966,
"learning_rate": 9.506531204644412e-05,
"loss": 2.3539,
"step": 1700
},
{
"epoch": 0.4934687953555878,
"eval_loss": 1.1439515352249146,
"eval_runtime": 214.0153,
"eval_samples_per_second": 212.574,
"eval_steps_per_second": 3.322,
"step": 1700
},
{
"epoch": 0.5224963715529753,
"grad_norm": 1.2375448942184448,
"learning_rate": 9.477503628447025e-05,
"loss": 2.3489,
"step": 1800
},
{
"epoch": 0.5224963715529753,
"eval_loss": 1.1391005516052246,
"eval_runtime": 214.5494,
"eval_samples_per_second": 212.044,
"eval_steps_per_second": 3.314,
"step": 1800
},
{
"epoch": 0.5515239477503628,
"grad_norm": 1.1505770683288574,
"learning_rate": 9.448476052249638e-05,
"loss": 2.3336,
"step": 1900
},
{
"epoch": 0.5515239477503628,
"eval_loss": 1.1322171688079834,
"eval_runtime": 213.2802,
"eval_samples_per_second": 213.306,
"eval_steps_per_second": 3.334,
"step": 1900
},
{
"epoch": 0.5805515239477503,
"grad_norm": 1.1152174472808838,
"learning_rate": 9.419448476052251e-05,
"loss": 2.3321,
"step": 2000
},
{
"epoch": 0.5805515239477503,
"eval_loss": 1.1339818239212036,
"eval_runtime": 213.6442,
"eval_samples_per_second": 212.943,
"eval_steps_per_second": 3.328,
"step": 2000
},
{
"epoch": 0.6095791001451378,
"grad_norm": 1.1027612686157227,
"learning_rate": 9.390420899854863e-05,
"loss": 2.3039,
"step": 2100
},
{
"epoch": 0.6095791001451378,
"eval_loss": 1.1304194927215576,
"eval_runtime": 213.0147,
"eval_samples_per_second": 213.572,
"eval_steps_per_second": 3.338,
"step": 2100
},
{
"epoch": 0.6386066763425254,
"grad_norm": 1.1585232019424438,
"learning_rate": 9.361393323657474e-05,
"loss": 2.3101,
"step": 2200
},
{
"epoch": 0.6386066763425254,
"eval_loss": 1.1316287517547607,
"eval_runtime": 214.3522,
"eval_samples_per_second": 212.239,
"eval_steps_per_second": 3.317,
"step": 2200
},
{
"epoch": 0.6676342525399129,
"grad_norm": 1.1749528646469116,
"learning_rate": 9.332365747460087e-05,
"loss": 2.3048,
"step": 2300
},
{
"epoch": 0.6676342525399129,
"eval_loss": 1.1262996196746826,
"eval_runtime": 214.6823,
"eval_samples_per_second": 211.913,
"eval_steps_per_second": 3.312,
"step": 2300
},
{
"epoch": 0.6966618287373004,
"grad_norm": 1.1533962488174438,
"learning_rate": 9.3033381712627e-05,
"loss": 2.2808,
"step": 2400
},
{
"epoch": 0.6966618287373004,
"eval_loss": 1.1249016523361206,
"eval_runtime": 214.0327,
"eval_samples_per_second": 212.556,
"eval_steps_per_second": 3.322,
"step": 2400
},
{
"epoch": 0.7256894049346879,
"grad_norm": 1.1524910926818848,
"learning_rate": 9.274310595065312e-05,
"loss": 2.2865,
"step": 2500
},
{
"epoch": 0.7256894049346879,
"eval_loss": 1.1257150173187256,
"eval_runtime": 213.721,
"eval_samples_per_second": 212.866,
"eval_steps_per_second": 3.327,
"step": 2500
},
{
"epoch": 0.7547169811320755,
"grad_norm": 1.1282308101654053,
"learning_rate": 9.245283018867925e-05,
"loss": 2.2654,
"step": 2600
},
{
"epoch": 0.7547169811320755,
"eval_loss": 1.1186834573745728,
"eval_runtime": 214.3515,
"eval_samples_per_second": 212.24,
"eval_steps_per_second": 3.317,
"step": 2600
},
{
"epoch": 0.783744557329463,
"grad_norm": 1.239816427230835,
"learning_rate": 9.216255442670537e-05,
"loss": 2.2564,
"step": 2700
},
{
"epoch": 0.783744557329463,
"eval_loss": 1.1156889200210571,
"eval_runtime": 213.6092,
"eval_samples_per_second": 212.978,
"eval_steps_per_second": 3.329,
"step": 2700
},
{
"epoch": 0.8127721335268505,
"grad_norm": 1.2036716938018799,
"learning_rate": 9.18722786647315e-05,
"loss": 2.2453,
"step": 2800
},
{
"epoch": 0.8127721335268505,
"eval_loss": 1.118744134902954,
"eval_runtime": 214.1658,
"eval_samples_per_second": 212.424,
"eval_steps_per_second": 3.32,
"step": 2800
},
{
"epoch": 0.841799709724238,
"grad_norm": 1.2474415302276611,
"learning_rate": 9.158200290275763e-05,
"loss": 2.2402,
"step": 2900
},
{
"epoch": 0.841799709724238,
"eval_loss": 1.1129833459854126,
"eval_runtime": 214.2278,
"eval_samples_per_second": 212.363,
"eval_steps_per_second": 3.319,
"step": 2900
},
{
"epoch": 0.8708272859216255,
"grad_norm": 1.2137649059295654,
"learning_rate": 9.129172714078375e-05,
"loss": 2.2243,
"step": 3000
},
{
"epoch": 0.8708272859216255,
"eval_loss": 1.1113933324813843,
"eval_runtime": 213.6564,
"eval_samples_per_second": 212.931,
"eval_steps_per_second": 3.328,
"step": 3000
},
{
"epoch": 0.8998548621190131,
"grad_norm": 1.2188935279846191,
"learning_rate": 9.100145137880988e-05,
"loss": 2.2324,
"step": 3100
},
{
"epoch": 0.8998548621190131,
"eval_loss": 1.1111185550689697,
"eval_runtime": 214.2911,
"eval_samples_per_second": 212.3,
"eval_steps_per_second": 3.318,
"step": 3100
},
{
"epoch": 0.9288824383164006,
"grad_norm": 1.2475199699401855,
"learning_rate": 9.0711175616836e-05,
"loss": 2.2329,
"step": 3200
},
{
"epoch": 0.9288824383164006,
"eval_loss": 1.1126782894134521,
"eval_runtime": 214.309,
"eval_samples_per_second": 212.282,
"eval_steps_per_second": 3.318,
"step": 3200
},
{
"epoch": 0.9579100145137881,
"grad_norm": 1.1850870847702026,
"learning_rate": 9.042089985486212e-05,
"loss": 2.2292,
"step": 3300
},
{
"epoch": 0.9579100145137881,
"eval_loss": 1.1046797037124634,
"eval_runtime": 214.0726,
"eval_samples_per_second": 212.517,
"eval_steps_per_second": 3.321,
"step": 3300
},
{
"epoch": 0.9869375907111756,
"grad_norm": 1.1915068626403809,
"learning_rate": 9.013062409288826e-05,
"loss": 2.2169,
"step": 3400
},
{
"epoch": 0.9869375907111756,
"eval_loss": 1.1029560565948486,
"eval_runtime": 214.518,
"eval_samples_per_second": 212.075,
"eval_steps_per_second": 3.314,
"step": 3400
},
{
"epoch": 1.0159651669085632,
"grad_norm": 1.3059227466583252,
"learning_rate": 8.984034833091437e-05,
"loss": 2.2112,
"step": 3500
},
{
"epoch": 1.0159651669085632,
"eval_loss": 1.1044234037399292,
"eval_runtime": 214.4346,
"eval_samples_per_second": 212.158,
"eval_steps_per_second": 3.316,
"step": 3500
},
{
"epoch": 1.0449927431059507,
"grad_norm": 1.3193408250808716,
"learning_rate": 8.95500725689405e-05,
"loss": 2.186,
"step": 3600
},
{
"epoch": 1.0449927431059507,
"eval_loss": 1.1042026281356812,
"eval_runtime": 214.3105,
"eval_samples_per_second": 212.281,
"eval_steps_per_second": 3.318,
"step": 3600
},
{
"epoch": 1.0740203193033382,
"grad_norm": 1.1710057258605957,
"learning_rate": 8.925979680696662e-05,
"loss": 2.1882,
"step": 3700
},
{
"epoch": 1.0740203193033382,
"eval_loss": 1.0975611209869385,
"eval_runtime": 214.3954,
"eval_samples_per_second": 212.197,
"eval_steps_per_second": 3.316,
"step": 3700
},
{
"epoch": 1.1030478955007257,
"grad_norm": 1.1426420211791992,
"learning_rate": 8.896952104499274e-05,
"loss": 2.1697,
"step": 3800
},
{
"epoch": 1.1030478955007257,
"eval_loss": 1.0976998805999756,
"eval_runtime": 212.987,
"eval_samples_per_second": 213.6,
"eval_steps_per_second": 3.338,
"step": 3800
},
{
"epoch": 1.1320754716981132,
"grad_norm": 1.1272858381271362,
"learning_rate": 8.867924528301888e-05,
"loss": 2.1836,
"step": 3900
},
{
"epoch": 1.1320754716981132,
"eval_loss": 1.0982595682144165,
"eval_runtime": 214.3281,
"eval_samples_per_second": 212.263,
"eval_steps_per_second": 3.317,
"step": 3900
},
{
"epoch": 1.1611030478955007,
"grad_norm": 1.141606330871582,
"learning_rate": 8.8388969521045e-05,
"loss": 2.1668,
"step": 4000
},
{
"epoch": 1.1611030478955007,
"eval_loss": 1.0947861671447754,
"eval_runtime": 214.7163,
"eval_samples_per_second": 211.88,
"eval_steps_per_second": 3.311,
"step": 4000
},
{
"epoch": 1.1901306240928882,
"grad_norm": 1.197513222694397,
"learning_rate": 8.809869375907113e-05,
"loss": 2.1537,
"step": 4100
},
{
"epoch": 1.1901306240928882,
"eval_loss": 1.0914397239685059,
"eval_runtime": 214.3796,
"eval_samples_per_second": 212.212,
"eval_steps_per_second": 3.317,
"step": 4100
},
{
"epoch": 1.2191582002902757,
"grad_norm": 1.2622817754745483,
"learning_rate": 8.780841799709725e-05,
"loss": 2.1609,
"step": 4200
},
{
"epoch": 1.2191582002902757,
"eval_loss": 1.0932444334030151,
"eval_runtime": 214.4603,
"eval_samples_per_second": 212.133,
"eval_steps_per_second": 3.315,
"step": 4200
},
{
"epoch": 1.2481857764876634,
"grad_norm": 1.1745682954788208,
"learning_rate": 8.751814223512336e-05,
"loss": 2.1448,
"step": 4300
},
{
"epoch": 1.2481857764876634,
"eval_loss": 1.0895456075668335,
"eval_runtime": 214.0416,
"eval_samples_per_second": 212.548,
"eval_steps_per_second": 3.322,
"step": 4300
},
{
"epoch": 1.2772133526850509,
"grad_norm": 1.1918201446533203,
"learning_rate": 8.722786647314949e-05,
"loss": 2.1552,
"step": 4400
},
{
"epoch": 1.2772133526850509,
"eval_loss": 1.089804768562317,
"eval_runtime": 214.182,
"eval_samples_per_second": 212.408,
"eval_steps_per_second": 3.32,
"step": 4400
},
{
"epoch": 1.3062409288824384,
"grad_norm": 1.2561489343643188,
"learning_rate": 8.693759071117562e-05,
"loss": 2.1421,
"step": 4500
},
{
"epoch": 1.3062409288824384,
"eval_loss": 1.0928661823272705,
"eval_runtime": 214.3073,
"eval_samples_per_second": 212.284,
"eval_steps_per_second": 3.318,
"step": 4500
},
{
"epoch": 1.3352685050798259,
"grad_norm": 1.1966407299041748,
"learning_rate": 8.664731494920174e-05,
"loss": 2.1426,
"step": 4600
},
{
"epoch": 1.3352685050798259,
"eval_loss": 1.0840479135513306,
"eval_runtime": 214.538,
"eval_samples_per_second": 212.056,
"eval_steps_per_second": 3.314,
"step": 4600
},
{
"epoch": 1.3642960812772134,
"grad_norm": 1.20412278175354,
"learning_rate": 8.635703918722787e-05,
"loss": 2.1256,
"step": 4700
},
{
"epoch": 1.3642960812772134,
"eval_loss": 1.085578441619873,
"eval_runtime": 214.0855,
"eval_samples_per_second": 212.504,
"eval_steps_per_second": 3.321,
"step": 4700
},
{
"epoch": 1.3933236574746009,
"grad_norm": 1.1835148334503174,
"learning_rate": 8.606676342525399e-05,
"loss": 2.1398,
"step": 4800
},
{
"epoch": 1.3933236574746009,
"eval_loss": 1.0841166973114014,
"eval_runtime": 213.7271,
"eval_samples_per_second": 212.86,
"eval_steps_per_second": 3.327,
"step": 4800
},
{
"epoch": 1.4223512336719883,
"grad_norm": 1.1613247394561768,
"learning_rate": 8.577648766328012e-05,
"loss": 2.1202,
"step": 4900
},
{
"epoch": 1.4223512336719883,
"eval_loss": 1.085669994354248,
"eval_runtime": 215.8008,
"eval_samples_per_second": 210.815,
"eval_steps_per_second": 3.295,
"step": 4900
},
{
"epoch": 1.4513788098693758,
"grad_norm": 1.1468629837036133,
"learning_rate": 8.548621190130625e-05,
"loss": 2.1123,
"step": 5000
},
{
"epoch": 1.4513788098693758,
"eval_loss": 1.0774667263031006,
"eval_runtime": 214.0808,
"eval_samples_per_second": 212.509,
"eval_steps_per_second": 3.321,
"step": 5000
},
{
"epoch": 1.4804063860667633,
"grad_norm": 1.2450999021530151,
"learning_rate": 8.519593613933237e-05,
"loss": 2.1104,
"step": 5100
},
{
"epoch": 1.4804063860667633,
"eval_loss": 1.0789214372634888,
"eval_runtime": 214.4581,
"eval_samples_per_second": 212.135,
"eval_steps_per_second": 3.315,
"step": 5100
},
{
"epoch": 1.509433962264151,
"grad_norm": 1.1406731605529785,
"learning_rate": 8.49056603773585e-05,
"loss": 2.1076,
"step": 5200
},
{
"epoch": 1.509433962264151,
"eval_loss": 1.0758976936340332,
"eval_runtime": 214.3317,
"eval_samples_per_second": 212.26,
"eval_steps_per_second": 3.317,
"step": 5200
},
{
"epoch": 1.5384615384615383,
"grad_norm": 1.2358899116516113,
"learning_rate": 8.461538461538461e-05,
"loss": 2.1149,
"step": 5300
},
{
"epoch": 1.5384615384615383,
"eval_loss": 1.0744006633758545,
"eval_runtime": 214.6001,
"eval_samples_per_second": 211.994,
"eval_steps_per_second": 3.313,
"step": 5300
},
{
"epoch": 1.567489114658926,
"grad_norm": 1.3809137344360352,
"learning_rate": 8.432510885341074e-05,
"loss": 2.1052,
"step": 5400
},
{
"epoch": 1.567489114658926,
"eval_loss": 1.0817296504974365,
"eval_runtime": 214.5107,
"eval_samples_per_second": 212.083,
"eval_steps_per_second": 3.315,
"step": 5400
},
{
"epoch": 1.5965166908563135,
"grad_norm": 1.1924511194229126,
"learning_rate": 8.403483309143688e-05,
"loss": 2.093,
"step": 5500
},
{
"epoch": 1.5965166908563135,
"eval_loss": 1.072150707244873,
"eval_runtime": 214.3047,
"eval_samples_per_second": 212.286,
"eval_steps_per_second": 3.318,
"step": 5500
},
{
"epoch": 1.625544267053701,
"grad_norm": 1.1757687330245972,
"learning_rate": 8.374455732946299e-05,
"loss": 2.0911,
"step": 5600
},
{
"epoch": 1.625544267053701,
"eval_loss": 1.0715994834899902,
"eval_runtime": 214.1638,
"eval_samples_per_second": 212.426,
"eval_steps_per_second": 3.32,
"step": 5600
},
{
"epoch": 1.6545718432510885,
"grad_norm": 1.263269066810608,
"learning_rate": 8.345428156748912e-05,
"loss": 2.0912,
"step": 5700
},
{
"epoch": 1.6545718432510885,
"eval_loss": 1.0753726959228516,
"eval_runtime": 214.272,
"eval_samples_per_second": 212.319,
"eval_steps_per_second": 3.318,
"step": 5700
},
{
"epoch": 1.683599419448476,
"grad_norm": 1.1598068475723267,
"learning_rate": 8.316400580551524e-05,
"loss": 2.085,
"step": 5800
},
{
"epoch": 1.683599419448476,
"eval_loss": 1.0767669677734375,
"eval_runtime": 214.1942,
"eval_samples_per_second": 212.396,
"eval_steps_per_second": 3.319,
"step": 5800
},
{
"epoch": 1.7126269956458637,
"grad_norm": 1.2049143314361572,
"learning_rate": 8.287373004354137e-05,
"loss": 2.091,
"step": 5900
},
{
"epoch": 1.7126269956458637,
"eval_loss": 1.0710922479629517,
"eval_runtime": 214.6737,
"eval_samples_per_second": 211.922,
"eval_steps_per_second": 3.312,
"step": 5900
},
{
"epoch": 1.741654571843251,
"grad_norm": 1.2534894943237305,
"learning_rate": 8.25834542815675e-05,
"loss": 2.0744,
"step": 6000
},
{
"epoch": 1.741654571843251,
"eval_loss": 1.07111656665802,
"eval_runtime": 214.3274,
"eval_samples_per_second": 212.264,
"eval_steps_per_second": 3.317,
"step": 6000
},
{
"epoch": 1.7706821480406387,
"grad_norm": 1.260311245918274,
"learning_rate": 8.229317851959362e-05,
"loss": 2.082,
"step": 6100
},
{
"epoch": 1.7706821480406387,
"eval_loss": 1.0758848190307617,
"eval_runtime": 214.3528,
"eval_samples_per_second": 212.239,
"eval_steps_per_second": 3.317,
"step": 6100
},
{
"epoch": 1.799709724238026,
"grad_norm": 1.192262887954712,
"learning_rate": 8.200290275761974e-05,
"loss": 2.0647,
"step": 6200
},
{
"epoch": 1.799709724238026,
"eval_loss": 1.0665150880813599,
"eval_runtime": 214.1927,
"eval_samples_per_second": 212.398,
"eval_steps_per_second": 3.319,
"step": 6200
},
{
"epoch": 1.8287373004354137,
"grad_norm": 1.2158530950546265,
"learning_rate": 8.171262699564587e-05,
"loss": 2.0524,
"step": 6300
},
{
"epoch": 1.8287373004354137,
"eval_loss": 1.0663120746612549,
"eval_runtime": 214.6232,
"eval_samples_per_second": 211.971,
"eval_steps_per_second": 3.313,
"step": 6300
},
{
"epoch": 1.8577648766328012,
"grad_norm": 1.1896952390670776,
"learning_rate": 8.142235123367198e-05,
"loss": 2.0654,
"step": 6400
},
{
"epoch": 1.8577648766328012,
"eval_loss": 1.064207911491394,
"eval_runtime": 213.7354,
"eval_samples_per_second": 212.852,
"eval_steps_per_second": 3.327,
"step": 6400
},
{
"epoch": 1.8867924528301887,
"grad_norm": 1.1889102458953857,
"learning_rate": 8.113207547169813e-05,
"loss": 2.0549,
"step": 6500
},
{
"epoch": 1.8867924528301887,
"eval_loss": 1.0605015754699707,
"eval_runtime": 214.211,
"eval_samples_per_second": 212.379,
"eval_steps_per_second": 3.319,
"step": 6500
},
{
"epoch": 1.9158200290275762,
"grad_norm": 1.2628164291381836,
"learning_rate": 8.084179970972424e-05,
"loss": 2.056,
"step": 6600
},
{
"epoch": 1.9158200290275762,
"eval_loss": 1.060520052909851,
"eval_runtime": 214.2715,
"eval_samples_per_second": 212.319,
"eval_steps_per_second": 3.318,
"step": 6600
},
{
"epoch": 1.9448476052249637,
"grad_norm": 1.203740119934082,
"learning_rate": 8.055152394775036e-05,
"loss": 2.0645,
"step": 6700
},
{
"epoch": 1.9448476052249637,
"eval_loss": 1.0628570318222046,
"eval_runtime": 213.9992,
"eval_samples_per_second": 212.59,
"eval_steps_per_second": 3.322,
"step": 6700
},
{
"epoch": 1.9738751814223512,
"grad_norm": 1.228109359741211,
"learning_rate": 8.026124818577649e-05,
"loss": 2.0528,
"step": 6800
},
{
"epoch": 1.9738751814223512,
"eval_loss": 1.0618752241134644,
"eval_runtime": 214.3127,
"eval_samples_per_second": 212.279,
"eval_steps_per_second": 3.318,
"step": 6800
},
{
"epoch": 2.0029027576197387,
"grad_norm": 1.2511652708053589,
"learning_rate": 7.997097242380261e-05,
"loss": 2.0643,
"step": 6900
},
{
"epoch": 2.0029027576197387,
"eval_loss": 1.0580365657806396,
"eval_runtime": 214.5925,
"eval_samples_per_second": 212.002,
"eval_steps_per_second": 3.313,
"step": 6900
},
{
"epoch": 2.0319303338171264,
"grad_norm": 1.2169101238250732,
"learning_rate": 7.968069666182875e-05,
"loss": 2.0342,
"step": 7000
},
{
"epoch": 2.0319303338171264,
"eval_loss": 1.0617866516113281,
"eval_runtime": 214.3766,
"eval_samples_per_second": 212.215,
"eval_steps_per_second": 3.317,
"step": 7000
},
{
"epoch": 2.0609579100145137,
"grad_norm": 1.1878671646118164,
"learning_rate": 7.939042089985487e-05,
"loss": 2.0391,
"step": 7100
},
{
"epoch": 2.0609579100145137,
"eval_loss": 1.0561269521713257,
"eval_runtime": 213.6515,
"eval_samples_per_second": 212.936,
"eval_steps_per_second": 3.328,
"step": 7100
},
{
"epoch": 2.0899854862119014,
"grad_norm": 1.2561451196670532,
"learning_rate": 7.910014513788099e-05,
"loss": 2.0368,
"step": 7200
},
{
"epoch": 2.0899854862119014,
"eval_loss": 1.0582093000411987,
"eval_runtime": 214.1893,
"eval_samples_per_second": 212.401,
"eval_steps_per_second": 3.319,
"step": 7200
},
{
"epoch": 2.1190130624092887,
"grad_norm": 1.3752440214157104,
"learning_rate": 7.880986937590712e-05,
"loss": 2.0223,
"step": 7300
},
{
"epoch": 2.1190130624092887,
"eval_loss": 1.0552905797958374,
"eval_runtime": 214.1025,
"eval_samples_per_second": 212.487,
"eval_steps_per_second": 3.321,
"step": 7300
},
{
"epoch": 2.1480406386066764,
"grad_norm": 1.2082586288452148,
"learning_rate": 7.851959361393323e-05,
"loss": 2.0219,
"step": 7400
},
{
"epoch": 2.1480406386066764,
"eval_loss": 1.056668996810913,
"eval_runtime": 214.2573,
"eval_samples_per_second": 212.333,
"eval_steps_per_second": 3.318,
"step": 7400
},
{
"epoch": 2.1770682148040637,
"grad_norm": 1.335627555847168,
"learning_rate": 7.822931785195937e-05,
"loss": 2.0191,
"step": 7500
},
{
"epoch": 2.1770682148040637,
"eval_loss": 1.0617352724075317,
"eval_runtime": 214.1612,
"eval_samples_per_second": 212.429,
"eval_steps_per_second": 3.32,
"step": 7500
},
{
"epoch": 2.2060957910014514,
"grad_norm": 1.3789772987365723,
"learning_rate": 7.79390420899855e-05,
"loss": 2.0163,
"step": 7600
},
{
"epoch": 2.2060957910014514,
"eval_loss": 1.0667177438735962,
"eval_runtime": 214.4271,
"eval_samples_per_second": 212.165,
"eval_steps_per_second": 3.316,
"step": 7600
},
{
"epoch": 2.235123367198839,
"grad_norm": 1.2630983591079712,
"learning_rate": 7.764876632801161e-05,
"loss": 2.0075,
"step": 7700
},
{
"epoch": 2.235123367198839,
"eval_loss": 1.053751826286316,
"eval_runtime": 214.4933,
"eval_samples_per_second": 212.1,
"eval_steps_per_second": 3.315,
"step": 7700
},
{
"epoch": 2.2641509433962264,
"grad_norm": 1.3576209545135498,
"learning_rate": 7.735849056603774e-05,
"loss": 2.018,
"step": 7800
},
{
"epoch": 2.2641509433962264,
"eval_loss": 1.0545238256454468,
"eval_runtime": 214.4112,
"eval_samples_per_second": 212.181,
"eval_steps_per_second": 3.316,
"step": 7800
},
{
"epoch": 2.293178519593614,
"grad_norm": 1.2727316617965698,
"learning_rate": 7.706821480406386e-05,
"loss": 2.0123,
"step": 7900
},
{
"epoch": 2.293178519593614,
"eval_loss": 1.0540556907653809,
"eval_runtime": 214.501,
"eval_samples_per_second": 212.092,
"eval_steps_per_second": 3.315,
"step": 7900
},
{
"epoch": 2.3222060957910013,
"grad_norm": 1.2817336320877075,
"learning_rate": 7.677793904208999e-05,
"loss": 2.0129,
"step": 8000
},
{
"epoch": 2.3222060957910013,
"eval_loss": 1.053106427192688,
"eval_runtime": 214.7491,
"eval_samples_per_second": 211.847,
"eval_steps_per_second": 3.311,
"step": 8000
},
{
"epoch": 2.351233671988389,
"grad_norm": 1.1629624366760254,
"learning_rate": 7.648766328011612e-05,
"loss": 1.9998,
"step": 8100
},
{
"epoch": 2.351233671988389,
"eval_loss": 1.0525621175765991,
"eval_runtime": 214.4605,
"eval_samples_per_second": 212.132,
"eval_steps_per_second": 3.315,
"step": 8100
},
{
"epoch": 2.3802612481857763,
"grad_norm": 1.225195050239563,
"learning_rate": 7.619738751814224e-05,
"loss": 1.9998,
"step": 8200
},
{
"epoch": 2.3802612481857763,
"eval_loss": 1.0501279830932617,
"eval_runtime": 214.3635,
"eval_samples_per_second": 212.228,
"eval_steps_per_second": 3.317,
"step": 8200
},
{
"epoch": 2.409288824383164,
"grad_norm": 1.167968988418579,
"learning_rate": 7.590711175616836e-05,
"loss": 2.0127,
"step": 8300
},
{
"epoch": 2.409288824383164,
"eval_loss": 1.0495474338531494,
"eval_runtime": 214.3385,
"eval_samples_per_second": 212.253,
"eval_steps_per_second": 3.317,
"step": 8300
},
{
"epoch": 2.4383164005805513,
"grad_norm": 1.2802715301513672,
"learning_rate": 7.561683599419449e-05,
"loss": 2.0046,
"step": 8400
},
{
"epoch": 2.4383164005805513,
"eval_loss": 1.0517114400863647,
"eval_runtime": 213.9947,
"eval_samples_per_second": 212.594,
"eval_steps_per_second": 3.323,
"step": 8400
},
{
"epoch": 2.467343976777939,
"grad_norm": 1.2801434993743896,
"learning_rate": 7.532656023222062e-05,
"loss": 1.9913,
"step": 8500
},
{
"epoch": 2.467343976777939,
"eval_loss": 1.0506008863449097,
"eval_runtime": 214.0451,
"eval_samples_per_second": 212.544,
"eval_steps_per_second": 3.322,
"step": 8500
},
{
"epoch": 2.4963715529753268,
"grad_norm": 1.3369925022125244,
"learning_rate": 7.503628447024675e-05,
"loss": 1.9895,
"step": 8600
},
{
"epoch": 2.4963715529753268,
"eval_loss": 1.0505975484848022,
"eval_runtime": 214.2989,
"eval_samples_per_second": 212.292,
"eval_steps_per_second": 3.318,
"step": 8600
},
{
"epoch": 2.525399129172714,
"grad_norm": 1.2676314115524292,
"learning_rate": 7.474600870827286e-05,
"loss": 1.9963,
"step": 8700
},
{
"epoch": 2.525399129172714,
"eval_loss": 1.0470978021621704,
"eval_runtime": 214.1975,
"eval_samples_per_second": 212.393,
"eval_steps_per_second": 3.319,
"step": 8700
},
{
"epoch": 2.5544267053701017,
"grad_norm": 1.2529655694961548,
"learning_rate": 7.445573294629898e-05,
"loss": 1.9858,
"step": 8800
},
{
"epoch": 2.5544267053701017,
"eval_loss": 1.045462965965271,
"eval_runtime": 213.8996,
"eval_samples_per_second": 212.689,
"eval_steps_per_second": 3.324,
"step": 8800
},
{
"epoch": 2.583454281567489,
"grad_norm": 1.227094054222107,
"learning_rate": 7.416545718432511e-05,
"loss": 1.9877,
"step": 8900
},
{
"epoch": 2.583454281567489,
"eval_loss": 1.0446746349334717,
"eval_runtime": 214.2422,
"eval_samples_per_second": 212.348,
"eval_steps_per_second": 3.319,
"step": 8900
},
{
"epoch": 2.6124818577648767,
"grad_norm": 1.22869074344635,
"learning_rate": 7.387518142235124e-05,
"loss": 1.9914,
"step": 9000
},
{
"epoch": 2.6124818577648767,
"eval_loss": 1.045985460281372,
"eval_runtime": 213.626,
"eval_samples_per_second": 212.961,
"eval_steps_per_second": 3.328,
"step": 9000
},
{
"epoch": 2.641509433962264,
"grad_norm": 1.3192973136901855,
"learning_rate": 7.358490566037736e-05,
"loss": 1.9686,
"step": 9100
},
{
"epoch": 2.641509433962264,
"eval_loss": 1.0464129447937012,
"eval_runtime": 214.0751,
"eval_samples_per_second": 212.514,
"eval_steps_per_second": 3.321,
"step": 9100
},
{
"epoch": 2.6705370101596517,
"grad_norm": 1.3081276416778564,
"learning_rate": 7.329462989840349e-05,
"loss": 1.9731,
"step": 9200
},
{
"epoch": 2.6705370101596517,
"eval_loss": 1.047652006149292,
"eval_runtime": 214.1138,
"eval_samples_per_second": 212.476,
"eval_steps_per_second": 3.321,
"step": 9200
},
{
"epoch": 2.699564586357039,
"grad_norm": 1.309837818145752,
"learning_rate": 7.300435413642961e-05,
"loss": 1.9722,
"step": 9300
},
{
"epoch": 2.699564586357039,
"eval_loss": 1.0429437160491943,
"eval_runtime": 213.1703,
"eval_samples_per_second": 213.416,
"eval_steps_per_second": 3.335,
"step": 9300
},
{
"epoch": 2.7285921625544267,
"grad_norm": 1.3633908033370972,
"learning_rate": 7.271407837445574e-05,
"loss": 1.9837,
"step": 9400
},
{
"epoch": 2.7285921625544267,
"eval_loss": 1.041870355606079,
"eval_runtime": 214.2854,
"eval_samples_per_second": 212.306,
"eval_steps_per_second": 3.318,
"step": 9400
},
{
"epoch": 2.7576197387518144,
"grad_norm": 1.195707082748413,
"learning_rate": 7.242380261248185e-05,
"loss": 1.9657,
"step": 9500
},
{
"epoch": 2.7576197387518144,
"eval_loss": 1.0397106409072876,
"eval_runtime": 214.0497,
"eval_samples_per_second": 212.539,
"eval_steps_per_second": 3.322,
"step": 9500
},
{
"epoch": 2.7866473149492017,
"grad_norm": 1.2074401378631592,
"learning_rate": 7.213352685050799e-05,
"loss": 1.9782,
"step": 9600
},
{
"epoch": 2.7866473149492017,
"eval_loss": 1.0388689041137695,
"eval_runtime": 213.9256,
"eval_samples_per_second": 212.663,
"eval_steps_per_second": 3.324,
"step": 9600
},
{
"epoch": 2.8156748911465894,
"grad_norm": 1.42034113407135,
"learning_rate": 7.184325108853412e-05,
"loss": 1.9678,
"step": 9700
},
{
"epoch": 2.8156748911465894,
"eval_loss": 1.0477054119110107,
"eval_runtime": 214.1129,
"eval_samples_per_second": 212.477,
"eval_steps_per_second": 3.321,
"step": 9700
},
{
"epoch": 2.8447024673439767,
"grad_norm": 1.2497634887695312,
"learning_rate": 7.155297532656023e-05,
"loss": 1.9499,
"step": 9800
},
{
"epoch": 2.8447024673439767,
"eval_loss": 1.0382879972457886,
"eval_runtime": 214.4692,
"eval_samples_per_second": 212.124,
"eval_steps_per_second": 3.315,
"step": 9800
},
{
"epoch": 2.8737300435413644,
"grad_norm": 1.2587764263153076,
"learning_rate": 7.126269956458636e-05,
"loss": 1.9596,
"step": 9900
},
{
"epoch": 2.8737300435413644,
"eval_loss": 1.0374723672866821,
"eval_runtime": 214.3582,
"eval_samples_per_second": 212.234,
"eval_steps_per_second": 3.317,
"step": 9900
},
{
"epoch": 2.9027576197387517,
"grad_norm": 1.2650773525238037,
"learning_rate": 7.097242380261248e-05,
"loss": 1.9632,
"step": 10000
},
{
"epoch": 2.9027576197387517,
"eval_loss": 1.0395891666412354,
"eval_runtime": 214.2184,
"eval_samples_per_second": 212.372,
"eval_steps_per_second": 3.319,
"step": 10000
},
{
"epoch": 2.9317851959361394,
"grad_norm": 1.237382411956787,
"learning_rate": 7.068214804063861e-05,
"loss": 1.9448,
"step": 10100
},
{
"epoch": 2.9317851959361394,
"eval_loss": 1.0347273349761963,
"eval_runtime": 214.6802,
"eval_samples_per_second": 211.915,
"eval_steps_per_second": 3.312,
"step": 10100
},
{
"epoch": 2.9608127721335267,
"grad_norm": 1.2535216808319092,
"learning_rate": 7.039187227866474e-05,
"loss": 1.9633,
"step": 10200
},
{
"epoch": 2.9608127721335267,
"eval_loss": 1.0382635593414307,
"eval_runtime": 214.2729,
"eval_samples_per_second": 212.318,
"eval_steps_per_second": 3.318,
"step": 10200
},
{
"epoch": 2.9898403483309144,
"grad_norm": 1.2122920751571655,
"learning_rate": 7.010159651669086e-05,
"loss": 1.9531,
"step": 10300
},
{
"epoch": 2.9898403483309144,
"eval_loss": 1.0362297296524048,
"eval_runtime": 214.3174,
"eval_samples_per_second": 212.274,
"eval_steps_per_second": 3.318,
"step": 10300
},
{
"epoch": 3.018867924528302,
"grad_norm": 1.207924723625183,
"learning_rate": 6.981132075471698e-05,
"loss": 1.9597,
"step": 10400
},
{
"epoch": 3.018867924528302,
"eval_loss": 1.0346544981002808,
"eval_runtime": 214.0838,
"eval_samples_per_second": 212.506,
"eval_steps_per_second": 3.321,
"step": 10400
},
{
"epoch": 3.0478955007256894,
"grad_norm": 1.3156700134277344,
"learning_rate": 6.95210449927431e-05,
"loss": 1.9284,
"step": 10500
},
{
"epoch": 3.0478955007256894,
"eval_loss": 1.0392136573791504,
"eval_runtime": 214.2728,
"eval_samples_per_second": 212.318,
"eval_steps_per_second": 3.318,
"step": 10500
},
{
"epoch": 3.076923076923077,
"grad_norm": 1.2844287157058716,
"learning_rate": 6.923076923076924e-05,
"loss": 1.9524,
"step": 10600
},
{
"epoch": 3.076923076923077,
"eval_loss": 1.0422698259353638,
"eval_runtime": 214.2459,
"eval_samples_per_second": 212.345,
"eval_steps_per_second": 3.319,
"step": 10600
},
{
"epoch": 3.1059506531204644,
"grad_norm": 1.3154046535491943,
"learning_rate": 6.894049346879537e-05,
"loss": 1.9321,
"step": 10700
},
{
"epoch": 3.1059506531204644,
"eval_loss": 1.0372092723846436,
"eval_runtime": 214.4246,
"eval_samples_per_second": 212.168,
"eval_steps_per_second": 3.316,
"step": 10700
},
{
"epoch": 3.134978229317852,
"grad_norm": 1.30637788772583,
"learning_rate": 6.865021770682148e-05,
"loss": 1.9414,
"step": 10800
},
{
"epoch": 3.134978229317852,
"eval_loss": 1.0316834449768066,
"eval_runtime": 214.2895,
"eval_samples_per_second": 212.302,
"eval_steps_per_second": 3.318,
"step": 10800
},
{
"epoch": 3.1640058055152394,
"grad_norm": 1.375622272491455,
"learning_rate": 6.83599419448476e-05,
"loss": 1.9255,
"step": 10900
},
{
"epoch": 3.1640058055152394,
"eval_loss": 1.0339484214782715,
"eval_runtime": 214.0141,
"eval_samples_per_second": 212.575,
"eval_steps_per_second": 3.322,
"step": 10900
},
{
"epoch": 3.193033381712627,
"grad_norm": 1.2978899478912354,
"learning_rate": 6.806966618287373e-05,
"loss": 1.9384,
"step": 11000
},
{
"epoch": 3.193033381712627,
"eval_loss": 1.033180832862854,
"eval_runtime": 214.382,
"eval_samples_per_second": 212.21,
"eval_steps_per_second": 3.317,
"step": 11000
},
{
"epoch": 3.2220609579100143,
"grad_norm": 1.233608603477478,
"learning_rate": 6.777939042089986e-05,
"loss": 1.9297,
"step": 11100
},
{
"epoch": 3.2220609579100143,
"eval_loss": 1.0305285453796387,
"eval_runtime": 214.0802,
"eval_samples_per_second": 212.509,
"eval_steps_per_second": 3.321,
"step": 11100
},
{
"epoch": 3.251088534107402,
"grad_norm": 1.2634618282318115,
"learning_rate": 6.748911465892598e-05,
"loss": 1.9315,
"step": 11200
},
{
"epoch": 3.251088534107402,
"eval_loss": 1.0329853296279907,
"eval_runtime": 214.8185,
"eval_samples_per_second": 211.779,
"eval_steps_per_second": 3.31,
"step": 11200
},
{
"epoch": 3.28011611030479,
"grad_norm": 1.3260959386825562,
"learning_rate": 6.719883889695211e-05,
"loss": 1.9331,
"step": 11300
},
{
"epoch": 3.28011611030479,
"eval_loss": 1.0363577604293823,
"eval_runtime": 214.1897,
"eval_samples_per_second": 212.401,
"eval_steps_per_second": 3.319,
"step": 11300
},
{
"epoch": 3.309143686502177,
"grad_norm": 1.330241322517395,
"learning_rate": 6.690856313497823e-05,
"loss": 1.9355,
"step": 11400
},
{
"epoch": 3.309143686502177,
"eval_loss": 1.0366979837417603,
"eval_runtime": 214.2627,
"eval_samples_per_second": 212.328,
"eval_steps_per_second": 3.318,
"step": 11400
},
{
"epoch": 3.3381712626995648,
"grad_norm": 1.3124949932098389,
"learning_rate": 6.661828737300436e-05,
"loss": 1.9141,
"step": 11500
},
{
"epoch": 3.3381712626995648,
"eval_loss": 1.0314677953720093,
"eval_runtime": 214.3893,
"eval_samples_per_second": 212.203,
"eval_steps_per_second": 3.316,
"step": 11500
},
{
"epoch": 3.367198838896952,
"grad_norm": 1.2886366844177246,
"learning_rate": 6.632801161103049e-05,
"loss": 1.918,
"step": 11600
},
{
"epoch": 3.367198838896952,
"eval_loss": 1.029552698135376,
"eval_runtime": 214.4197,
"eval_samples_per_second": 212.173,
"eval_steps_per_second": 3.316,
"step": 11600
},
{
"epoch": 3.3962264150943398,
"grad_norm": 1.4406765699386597,
"learning_rate": 6.60377358490566e-05,
"loss": 1.9192,
"step": 11700
},
{
"epoch": 3.3962264150943398,
"eval_loss": 1.0297138690948486,
"eval_runtime": 214.4728,
"eval_samples_per_second": 212.12,
"eval_steps_per_second": 3.315,
"step": 11700
},
{
"epoch": 3.425253991291727,
"grad_norm": 1.3517920970916748,
"learning_rate": 6.574746008708274e-05,
"loss": 1.9146,
"step": 11800
},
{
"epoch": 3.425253991291727,
"eval_loss": 1.0310994386672974,
"eval_runtime": 214.5121,
"eval_samples_per_second": 212.081,
"eval_steps_per_second": 3.314,
"step": 11800
},
{
"epoch": 3.4542815674891147,
"grad_norm": 1.31048583984375,
"learning_rate": 6.545718432510885e-05,
"loss": 1.9235,
"step": 11900
},
{
"epoch": 3.4542815674891147,
"eval_loss": 1.029317021369934,
"eval_runtime": 214.205,
"eval_samples_per_second": 212.385,
"eval_steps_per_second": 3.319,
"step": 11900
},
{
"epoch": 3.483309143686502,
"grad_norm": 1.2714518308639526,
"learning_rate": 6.516690856313497e-05,
"loss": 1.9161,
"step": 12000
},
{
"epoch": 3.483309143686502,
"eval_loss": 1.0265744924545288,
"eval_runtime": 214.3435,
"eval_samples_per_second": 212.248,
"eval_steps_per_second": 3.317,
"step": 12000
},
{
"epoch": 3.5123367198838897,
"grad_norm": 1.274511456489563,
"learning_rate": 6.487663280116111e-05,
"loss": 1.9295,
"step": 12100
},
{
"epoch": 3.5123367198838897,
"eval_loss": 1.026885747909546,
"eval_runtime": 214.622,
"eval_samples_per_second": 211.973,
"eval_steps_per_second": 3.313,
"step": 12100
},
{
"epoch": 3.5413642960812775,
"grad_norm": 1.4020469188690186,
"learning_rate": 6.458635703918723e-05,
"loss": 1.9214,
"step": 12200
},
{
"epoch": 3.5413642960812775,
"eval_loss": 1.0313502550125122,
"eval_runtime": 214.5336,
"eval_samples_per_second": 212.06,
"eval_steps_per_second": 3.314,
"step": 12200
},
{
"epoch": 3.5703918722786647,
"grad_norm": 1.329451322555542,
"learning_rate": 6.429608127721336e-05,
"loss": 1.8986,
"step": 12300
},
{
"epoch": 3.5703918722786647,
"eval_loss": 1.027103304862976,
"eval_runtime": 215.1871,
"eval_samples_per_second": 211.416,
"eval_steps_per_second": 3.304,
"step": 12300
},
{
"epoch": 3.599419448476052,
"grad_norm": 1.2377736568450928,
"learning_rate": 6.400580551523948e-05,
"loss": 1.8982,
"step": 12400
},
{
"epoch": 3.599419448476052,
"eval_loss": 1.0257542133331299,
"eval_runtime": 214.3661,
"eval_samples_per_second": 212.226,
"eval_steps_per_second": 3.317,
"step": 12400
},
{
"epoch": 3.6284470246734397,
"grad_norm": 1.2443993091583252,
"learning_rate": 6.37155297532656e-05,
"loss": 1.909,
"step": 12500
},
{
"epoch": 3.6284470246734397,
"eval_loss": 1.0211207866668701,
"eval_runtime": 214.1724,
"eval_samples_per_second": 212.418,
"eval_steps_per_second": 3.32,
"step": 12500
},
{
"epoch": 3.6574746008708274,
"grad_norm": 1.3550719022750854,
"learning_rate": 6.342525399129173e-05,
"loss": 1.8973,
"step": 12600
},
{
"epoch": 3.6574746008708274,
"eval_loss": 1.0253050327301025,
"eval_runtime": 214.3603,
"eval_samples_per_second": 212.231,
"eval_steps_per_second": 3.317,
"step": 12600
},
{
"epoch": 3.6865021770682147,
"grad_norm": 1.2715822458267212,
"learning_rate": 6.313497822931786e-05,
"loss": 1.8928,
"step": 12700
},
{
"epoch": 3.6865021770682147,
"eval_loss": 1.0215857028961182,
"eval_runtime": 214.2543,
"eval_samples_per_second": 212.336,
"eval_steps_per_second": 3.318,
"step": 12700
},
{
"epoch": 3.7155297532656024,
"grad_norm": 1.230591893196106,
"learning_rate": 6.284470246734397e-05,
"loss": 1.8998,
"step": 12800
},
{
"epoch": 3.7155297532656024,
"eval_loss": 1.0226044654846191,
"eval_runtime": 214.8109,
"eval_samples_per_second": 211.786,
"eval_steps_per_second": 3.31,
"step": 12800
},
{
"epoch": 3.7445573294629897,
"grad_norm": 1.2558367252349854,
"learning_rate": 6.25544267053701e-05,
"loss": 1.9083,
"step": 12900
},
{
"epoch": 3.7445573294629897,
"eval_loss": 1.0279453992843628,
"eval_runtime": 214.5403,
"eval_samples_per_second": 212.053,
"eval_steps_per_second": 3.314,
"step": 12900
},
{
"epoch": 3.7735849056603774,
"grad_norm": 1.3605984449386597,
"learning_rate": 6.226415094339622e-05,
"loss": 1.8947,
"step": 13000
},
{
"epoch": 3.7735849056603774,
"eval_loss": 1.0254005193710327,
"eval_runtime": 214.5443,
"eval_samples_per_second": 212.049,
"eval_steps_per_second": 3.314,
"step": 13000
}
],
"logging_steps": 100,
"max_steps": 34450,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 5
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3003632111570125e+17,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}