|
{ |
|
"best_metric": 1.0438764095306396, |
|
"best_model_checkpoint": "mgh6/TCS_MLM_50/checkpoint-8900", |
|
"epoch": 2.7285921625544267, |
|
"eval_steps": 100, |
|
"global_step": 9400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02902757619738752, |
|
"grad_norm": 1.131932258605957, |
|
"learning_rate": 9.970972423802612e-05, |
|
"loss": 2.8244, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02902757619738752, |
|
"eval_loss": 1.2662084102630615, |
|
"eval_runtime": 213.5614, |
|
"eval_samples_per_second": 213.049, |
|
"eval_steps_per_second": 3.329, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05805515239477504, |
|
"grad_norm": 1.0239707231521606, |
|
"learning_rate": 9.941944847605225e-05, |
|
"loss": 2.7081, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05805515239477504, |
|
"eval_loss": 1.2453378438949585, |
|
"eval_runtime": 212.9056, |
|
"eval_samples_per_second": 213.705, |
|
"eval_steps_per_second": 3.34, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08708272859216255, |
|
"grad_norm": 1.1205116510391235, |
|
"learning_rate": 9.912917271407838e-05, |
|
"loss": 2.642, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.08708272859216255, |
|
"eval_loss": 1.2237757444381714, |
|
"eval_runtime": 214.4447, |
|
"eval_samples_per_second": 212.171, |
|
"eval_steps_per_second": 3.316, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.11611030478955008, |
|
"grad_norm": 1.0193355083465576, |
|
"learning_rate": 9.883889695210451e-05, |
|
"loss": 2.6037, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.11611030478955008, |
|
"eval_loss": 1.2148627042770386, |
|
"eval_runtime": 213.5123, |
|
"eval_samples_per_second": 213.098, |
|
"eval_steps_per_second": 3.33, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.14513788098693758, |
|
"grad_norm": 1.05299711227417, |
|
"learning_rate": 9.854862119013063e-05, |
|
"loss": 2.5791, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.14513788098693758, |
|
"eval_loss": 1.2020208835601807, |
|
"eval_runtime": 213.769, |
|
"eval_samples_per_second": 212.842, |
|
"eval_steps_per_second": 3.326, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1741654571843251, |
|
"grad_norm": 1.0508314371109009, |
|
"learning_rate": 9.825834542815675e-05, |
|
"loss": 2.5464, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.1741654571843251, |
|
"eval_loss": 1.1960116624832153, |
|
"eval_runtime": 214.1083, |
|
"eval_samples_per_second": 212.505, |
|
"eval_steps_per_second": 3.321, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.20319303338171263, |
|
"grad_norm": 1.158460021018982, |
|
"learning_rate": 9.796806966618288e-05, |
|
"loss": 2.5391, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.20319303338171263, |
|
"eval_loss": 1.186664342880249, |
|
"eval_runtime": 213.4364, |
|
"eval_samples_per_second": 213.174, |
|
"eval_steps_per_second": 3.331, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.23222060957910015, |
|
"grad_norm": 1.0704821348190308, |
|
"learning_rate": 9.767779390420901e-05, |
|
"loss": 2.4944, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.23222060957910015, |
|
"eval_loss": 1.1850290298461914, |
|
"eval_runtime": 213.63, |
|
"eval_samples_per_second": 212.98, |
|
"eval_steps_per_second": 3.328, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.2612481857764877, |
|
"grad_norm": 1.0562227964401245, |
|
"learning_rate": 9.738751814223513e-05, |
|
"loss": 2.4879, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2612481857764877, |
|
"eval_loss": 1.1725127696990967, |
|
"eval_runtime": 213.7307, |
|
"eval_samples_per_second": 212.88, |
|
"eval_steps_per_second": 3.327, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.29027576197387517, |
|
"grad_norm": 1.136777639389038, |
|
"learning_rate": 9.709724238026126e-05, |
|
"loss": 2.4647, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.29027576197387517, |
|
"eval_loss": 1.1709253787994385, |
|
"eval_runtime": 213.2147, |
|
"eval_samples_per_second": 213.395, |
|
"eval_steps_per_second": 3.335, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3193033381712627, |
|
"grad_norm": 1.0949931144714355, |
|
"learning_rate": 9.680696661828737e-05, |
|
"loss": 2.4441, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.3193033381712627, |
|
"eval_loss": 1.1647560596466064, |
|
"eval_runtime": 213.5056, |
|
"eval_samples_per_second": 213.104, |
|
"eval_steps_per_second": 3.33, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.3483309143686502, |
|
"grad_norm": 1.2719751596450806, |
|
"learning_rate": 9.65166908563135e-05, |
|
"loss": 2.432, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3483309143686502, |
|
"eval_loss": 1.1668621301651, |
|
"eval_runtime": 213.8017, |
|
"eval_samples_per_second": 212.809, |
|
"eval_steps_per_second": 3.326, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.37735849056603776, |
|
"grad_norm": 1.1357173919677734, |
|
"learning_rate": 9.622641509433963e-05, |
|
"loss": 2.4173, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.37735849056603776, |
|
"eval_loss": 1.1585583686828613, |
|
"eval_runtime": 212.8448, |
|
"eval_samples_per_second": 213.766, |
|
"eval_steps_per_second": 3.34, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.40638606676342526, |
|
"grad_norm": 1.1240577697753906, |
|
"learning_rate": 9.593613933236575e-05, |
|
"loss": 2.4029, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.40638606676342526, |
|
"eval_loss": 1.1513617038726807, |
|
"eval_runtime": 214.5547, |
|
"eval_samples_per_second": 212.063, |
|
"eval_steps_per_second": 3.314, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.43541364296081275, |
|
"grad_norm": 1.074048399925232, |
|
"learning_rate": 9.564586357039188e-05, |
|
"loss": 2.3964, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.43541364296081275, |
|
"eval_loss": 1.1514214277267456, |
|
"eval_runtime": 213.8115, |
|
"eval_samples_per_second": 212.8, |
|
"eval_steps_per_second": 3.325, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.4644412191582003, |
|
"grad_norm": 1.2565686702728271, |
|
"learning_rate": 9.5355587808418e-05, |
|
"loss": 2.3548, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.4644412191582003, |
|
"eval_loss": 1.1476994752883911, |
|
"eval_runtime": 214.3759, |
|
"eval_samples_per_second": 212.239, |
|
"eval_steps_per_second": 3.317, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.4934687953555878, |
|
"grad_norm": 1.1474090814590454, |
|
"learning_rate": 9.506531204644412e-05, |
|
"loss": 2.36, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.4934687953555878, |
|
"eval_loss": 1.1446571350097656, |
|
"eval_runtime": 213.458, |
|
"eval_samples_per_second": 213.152, |
|
"eval_steps_per_second": 3.331, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.5224963715529753, |
|
"grad_norm": 1.2290916442871094, |
|
"learning_rate": 9.477503628447025e-05, |
|
"loss": 2.3438, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5224963715529753, |
|
"eval_loss": 1.1393438577651978, |
|
"eval_runtime": 213.014, |
|
"eval_samples_per_second": 213.596, |
|
"eval_steps_per_second": 3.338, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5515239477503628, |
|
"grad_norm": 1.1700950860977173, |
|
"learning_rate": 9.448476052249638e-05, |
|
"loss": 2.3416, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.5515239477503628, |
|
"eval_loss": 1.1348192691802979, |
|
"eval_runtime": 213.2252, |
|
"eval_samples_per_second": 213.385, |
|
"eval_steps_per_second": 3.335, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.5805515239477503, |
|
"grad_norm": 1.1090705394744873, |
|
"learning_rate": 9.419448476052251e-05, |
|
"loss": 2.3289, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5805515239477503, |
|
"eval_loss": 1.130873203277588, |
|
"eval_runtime": 212.7564, |
|
"eval_samples_per_second": 213.855, |
|
"eval_steps_per_second": 3.342, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.6095791001451378, |
|
"grad_norm": 1.17753267288208, |
|
"learning_rate": 9.390420899854863e-05, |
|
"loss": 2.3218, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.6095791001451378, |
|
"eval_loss": 1.1335190534591675, |
|
"eval_runtime": 212.7619, |
|
"eval_samples_per_second": 213.849, |
|
"eval_steps_per_second": 3.342, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.6386066763425254, |
|
"grad_norm": 1.087358832359314, |
|
"learning_rate": 9.361393323657474e-05, |
|
"loss": 2.3072, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.6386066763425254, |
|
"eval_loss": 1.1303313970565796, |
|
"eval_runtime": 213.3449, |
|
"eval_samples_per_second": 213.265, |
|
"eval_steps_per_second": 3.333, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.6676342525399129, |
|
"grad_norm": 1.1286981105804443, |
|
"learning_rate": 9.332365747460087e-05, |
|
"loss": 2.2881, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.6676342525399129, |
|
"eval_loss": 1.1234804391860962, |
|
"eval_runtime": 213.3465, |
|
"eval_samples_per_second": 213.263, |
|
"eval_steps_per_second": 3.333, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.6966618287373004, |
|
"grad_norm": 1.1590163707733154, |
|
"learning_rate": 9.3033381712627e-05, |
|
"loss": 2.2751, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.6966618287373004, |
|
"eval_loss": 1.120328664779663, |
|
"eval_runtime": 213.9246, |
|
"eval_samples_per_second": 212.687, |
|
"eval_steps_per_second": 3.324, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.7256894049346879, |
|
"grad_norm": 1.3988169431686401, |
|
"learning_rate": 9.274310595065312e-05, |
|
"loss": 2.2666, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.7256894049346879, |
|
"eval_loss": 1.1266223192214966, |
|
"eval_runtime": 214.3634, |
|
"eval_samples_per_second": 212.252, |
|
"eval_steps_per_second": 3.317, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.7547169811320755, |
|
"grad_norm": 1.239560842514038, |
|
"learning_rate": 9.245283018867925e-05, |
|
"loss": 2.2702, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.7547169811320755, |
|
"eval_loss": 1.1224210262298584, |
|
"eval_runtime": 213.2424, |
|
"eval_samples_per_second": 213.367, |
|
"eval_steps_per_second": 3.334, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.783744557329463, |
|
"grad_norm": 1.1289948225021362, |
|
"learning_rate": 9.216255442670537e-05, |
|
"loss": 2.256, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.783744557329463, |
|
"eval_loss": 1.1150513887405396, |
|
"eval_runtime": 213.4486, |
|
"eval_samples_per_second": 213.161, |
|
"eval_steps_per_second": 3.331, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.8127721335268505, |
|
"grad_norm": 1.1463016271591187, |
|
"learning_rate": 9.18722786647315e-05, |
|
"loss": 2.2483, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.8127721335268505, |
|
"eval_loss": 1.1185483932495117, |
|
"eval_runtime": 212.704, |
|
"eval_samples_per_second": 213.908, |
|
"eval_steps_per_second": 3.343, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.841799709724238, |
|
"grad_norm": 1.1233168840408325, |
|
"learning_rate": 9.158200290275763e-05, |
|
"loss": 2.2328, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.841799709724238, |
|
"eval_loss": 1.1085420846939087, |
|
"eval_runtime": 213.7255, |
|
"eval_samples_per_second": 212.885, |
|
"eval_steps_per_second": 3.327, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.8708272859216255, |
|
"grad_norm": 1.1887527704238892, |
|
"learning_rate": 9.129172714078375e-05, |
|
"loss": 2.235, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.8708272859216255, |
|
"eval_loss": 1.1104073524475098, |
|
"eval_runtime": 213.9252, |
|
"eval_samples_per_second": 212.687, |
|
"eval_steps_per_second": 3.324, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.8998548621190131, |
|
"grad_norm": 1.2834577560424805, |
|
"learning_rate": 9.100145137880988e-05, |
|
"loss": 2.2209, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.8998548621190131, |
|
"eval_loss": 1.1137757301330566, |
|
"eval_runtime": 213.6201, |
|
"eval_samples_per_second": 212.99, |
|
"eval_steps_per_second": 3.328, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.9288824383164006, |
|
"grad_norm": 1.3034873008728027, |
|
"learning_rate": 9.0711175616836e-05, |
|
"loss": 2.2185, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.9288824383164006, |
|
"eval_loss": 1.107863187789917, |
|
"eval_runtime": 213.1098, |
|
"eval_samples_per_second": 213.5, |
|
"eval_steps_per_second": 3.336, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.9579100145137881, |
|
"grad_norm": 1.1802492141723633, |
|
"learning_rate": 9.042089985486212e-05, |
|
"loss": 2.2147, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.9579100145137881, |
|
"eval_loss": 1.1041762828826904, |
|
"eval_runtime": 213.2962, |
|
"eval_samples_per_second": 213.314, |
|
"eval_steps_per_second": 3.333, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.9869375907111756, |
|
"grad_norm": 1.2992894649505615, |
|
"learning_rate": 9.013062409288826e-05, |
|
"loss": 2.216, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.9869375907111756, |
|
"eval_loss": 1.1009138822555542, |
|
"eval_runtime": 213.7998, |
|
"eval_samples_per_second": 212.811, |
|
"eval_steps_per_second": 3.326, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.0159651669085632, |
|
"grad_norm": 1.1432065963745117, |
|
"learning_rate": 8.984034833091437e-05, |
|
"loss": 2.1952, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.0159651669085632, |
|
"eval_loss": 1.106726884841919, |
|
"eval_runtime": 213.7054, |
|
"eval_samples_per_second": 212.905, |
|
"eval_steps_per_second": 3.327, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.0449927431059507, |
|
"grad_norm": 1.1603158712387085, |
|
"learning_rate": 8.95500725689405e-05, |
|
"loss": 2.2019, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.0449927431059507, |
|
"eval_loss": 1.1014330387115479, |
|
"eval_runtime": 213.1977, |
|
"eval_samples_per_second": 213.412, |
|
"eval_steps_per_second": 3.335, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.0740203193033382, |
|
"grad_norm": 1.2428488731384277, |
|
"learning_rate": 8.925979680696662e-05, |
|
"loss": 2.1959, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.0740203193033382, |
|
"eval_loss": 1.1004406213760376, |
|
"eval_runtime": 213.3658, |
|
"eval_samples_per_second": 213.244, |
|
"eval_steps_per_second": 3.332, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.1030478955007257, |
|
"grad_norm": 1.1615545749664307, |
|
"learning_rate": 8.896952104499274e-05, |
|
"loss": 2.1776, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.1030478955007257, |
|
"eval_loss": 1.0938160419464111, |
|
"eval_runtime": 213.3987, |
|
"eval_samples_per_second": 213.211, |
|
"eval_steps_per_second": 3.332, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.1320754716981132, |
|
"grad_norm": 1.1921610832214355, |
|
"learning_rate": 8.867924528301888e-05, |
|
"loss": 2.1762, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.1320754716981132, |
|
"eval_loss": 1.0960694551467896, |
|
"eval_runtime": 213.1832, |
|
"eval_samples_per_second": 213.427, |
|
"eval_steps_per_second": 3.335, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.1611030478955007, |
|
"grad_norm": 1.1980363130569458, |
|
"learning_rate": 8.8388969521045e-05, |
|
"loss": 2.1717, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.1611030478955007, |
|
"eval_loss": 1.0951919555664062, |
|
"eval_runtime": 213.4024, |
|
"eval_samples_per_second": 213.207, |
|
"eval_steps_per_second": 3.332, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.1901306240928882, |
|
"grad_norm": 1.217236042022705, |
|
"learning_rate": 8.809869375907113e-05, |
|
"loss": 2.1534, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.1901306240928882, |
|
"eval_loss": 1.0937577486038208, |
|
"eval_runtime": 213.8113, |
|
"eval_samples_per_second": 212.8, |
|
"eval_steps_per_second": 3.325, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.2191582002902757, |
|
"grad_norm": 1.2121118307113647, |
|
"learning_rate": 8.780841799709725e-05, |
|
"loss": 2.1639, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.2191582002902757, |
|
"eval_loss": 1.0909945964813232, |
|
"eval_runtime": 212.8308, |
|
"eval_samples_per_second": 213.78, |
|
"eval_steps_per_second": 3.341, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.2481857764876634, |
|
"grad_norm": 1.17587411403656, |
|
"learning_rate": 8.751814223512336e-05, |
|
"loss": 2.146, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.2481857764876634, |
|
"eval_loss": 1.0888868570327759, |
|
"eval_runtime": 213.8752, |
|
"eval_samples_per_second": 212.736, |
|
"eval_steps_per_second": 3.324, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.2772133526850509, |
|
"grad_norm": 1.2848412990570068, |
|
"learning_rate": 8.722786647314949e-05, |
|
"loss": 2.1357, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.2772133526850509, |
|
"eval_loss": 1.091068983078003, |
|
"eval_runtime": 213.4081, |
|
"eval_samples_per_second": 213.202, |
|
"eval_steps_per_second": 3.332, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.3062409288824384, |
|
"grad_norm": 1.2059731483459473, |
|
"learning_rate": 8.693759071117562e-05, |
|
"loss": 2.1456, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.3062409288824384, |
|
"eval_loss": 1.0857021808624268, |
|
"eval_runtime": 213.7314, |
|
"eval_samples_per_second": 212.879, |
|
"eval_steps_per_second": 3.327, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.3352685050798259, |
|
"grad_norm": 1.226241946220398, |
|
"learning_rate": 8.664731494920174e-05, |
|
"loss": 2.1453, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.3352685050798259, |
|
"eval_loss": 1.0845140218734741, |
|
"eval_runtime": 213.4698, |
|
"eval_samples_per_second": 213.14, |
|
"eval_steps_per_second": 3.331, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.3642960812772134, |
|
"grad_norm": 1.1810499429702759, |
|
"learning_rate": 8.635703918722787e-05, |
|
"loss": 2.1425, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.3642960812772134, |
|
"eval_loss": 1.0831544399261475, |
|
"eval_runtime": 214.2077, |
|
"eval_samples_per_second": 212.406, |
|
"eval_steps_per_second": 3.319, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.3933236574746009, |
|
"grad_norm": 1.155281662940979, |
|
"learning_rate": 8.606676342525399e-05, |
|
"loss": 2.1173, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.3933236574746009, |
|
"eval_loss": 1.0785441398620605, |
|
"eval_runtime": 213.6973, |
|
"eval_samples_per_second": 212.913, |
|
"eval_steps_per_second": 3.327, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.4223512336719883, |
|
"grad_norm": 1.2070744037628174, |
|
"learning_rate": 8.577648766328012e-05, |
|
"loss": 2.1183, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.4223512336719883, |
|
"eval_loss": 1.0808286666870117, |
|
"eval_runtime": 213.4564, |
|
"eval_samples_per_second": 213.154, |
|
"eval_steps_per_second": 3.331, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.4513788098693758, |
|
"grad_norm": 1.1901525259017944, |
|
"learning_rate": 8.548621190130625e-05, |
|
"loss": 2.1274, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.4513788098693758, |
|
"eval_loss": 1.0827044248580933, |
|
"eval_runtime": 212.5592, |
|
"eval_samples_per_second": 214.053, |
|
"eval_steps_per_second": 3.345, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.4804063860667633, |
|
"grad_norm": 1.1999766826629639, |
|
"learning_rate": 8.519593613933237e-05, |
|
"loss": 2.1145, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.4804063860667633, |
|
"eval_loss": 1.078644037246704, |
|
"eval_runtime": 213.0532, |
|
"eval_samples_per_second": 213.557, |
|
"eval_steps_per_second": 3.337, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.509433962264151, |
|
"grad_norm": 1.2294871807098389, |
|
"learning_rate": 8.49056603773585e-05, |
|
"loss": 2.1067, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.509433962264151, |
|
"eval_loss": 1.0794402360916138, |
|
"eval_runtime": 212.9617, |
|
"eval_samples_per_second": 213.649, |
|
"eval_steps_per_second": 3.339, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 1.2571580410003662, |
|
"learning_rate": 8.461538461538461e-05, |
|
"loss": 2.1032, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"eval_loss": 1.0783346891403198, |
|
"eval_runtime": 213.4656, |
|
"eval_samples_per_second": 213.144, |
|
"eval_steps_per_second": 3.331, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.567489114658926, |
|
"grad_norm": 1.2078722715377808, |
|
"learning_rate": 8.432510885341074e-05, |
|
"loss": 2.0912, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.567489114658926, |
|
"eval_loss": 1.0764219760894775, |
|
"eval_runtime": 213.826, |
|
"eval_samples_per_second": 212.785, |
|
"eval_steps_per_second": 3.325, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.5965166908563135, |
|
"grad_norm": 1.272294521331787, |
|
"learning_rate": 8.403483309143688e-05, |
|
"loss": 2.0784, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.5965166908563135, |
|
"eval_loss": 1.0817687511444092, |
|
"eval_runtime": 213.443, |
|
"eval_samples_per_second": 213.167, |
|
"eval_steps_per_second": 3.331, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.625544267053701, |
|
"grad_norm": 1.2367442846298218, |
|
"learning_rate": 8.374455732946299e-05, |
|
"loss": 2.0997, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.625544267053701, |
|
"eval_loss": 1.079858660697937, |
|
"eval_runtime": 213.7339, |
|
"eval_samples_per_second": 212.877, |
|
"eval_steps_per_second": 3.327, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.6545718432510885, |
|
"grad_norm": 1.2720229625701904, |
|
"learning_rate": 8.345428156748912e-05, |
|
"loss": 2.093, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.6545718432510885, |
|
"eval_loss": 1.0779507160186768, |
|
"eval_runtime": 213.2034, |
|
"eval_samples_per_second": 213.407, |
|
"eval_steps_per_second": 3.335, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.683599419448476, |
|
"grad_norm": 1.1694726943969727, |
|
"learning_rate": 8.316400580551524e-05, |
|
"loss": 2.0822, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.683599419448476, |
|
"eval_loss": 1.068250060081482, |
|
"eval_runtime": 213.1022, |
|
"eval_samples_per_second": 213.508, |
|
"eval_steps_per_second": 3.336, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.7126269956458637, |
|
"grad_norm": 1.2155323028564453, |
|
"learning_rate": 8.287373004354137e-05, |
|
"loss": 2.0792, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.7126269956458637, |
|
"eval_loss": 1.0666776895523071, |
|
"eval_runtime": 213.4935, |
|
"eval_samples_per_second": 213.117, |
|
"eval_steps_per_second": 3.33, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.741654571843251, |
|
"grad_norm": 1.3163602352142334, |
|
"learning_rate": 8.25834542815675e-05, |
|
"loss": 2.0712, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.741654571843251, |
|
"eval_loss": 1.0677340030670166, |
|
"eval_runtime": 213.751, |
|
"eval_samples_per_second": 212.86, |
|
"eval_steps_per_second": 3.326, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.7706821480406387, |
|
"grad_norm": 1.1972286701202393, |
|
"learning_rate": 8.229317851959362e-05, |
|
"loss": 2.0679, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.7706821480406387, |
|
"eval_loss": 1.0662775039672852, |
|
"eval_runtime": 213.7705, |
|
"eval_samples_per_second": 212.84, |
|
"eval_steps_per_second": 3.326, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.799709724238026, |
|
"grad_norm": 1.189395546913147, |
|
"learning_rate": 8.200290275761974e-05, |
|
"loss": 2.0753, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.799709724238026, |
|
"eval_loss": 1.0646038055419922, |
|
"eval_runtime": 213.3945, |
|
"eval_samples_per_second": 213.215, |
|
"eval_steps_per_second": 3.332, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.8287373004354137, |
|
"grad_norm": 1.2696415185928345, |
|
"learning_rate": 8.171262699564587e-05, |
|
"loss": 2.063, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.8287373004354137, |
|
"eval_loss": 1.0669814348220825, |
|
"eval_runtime": 213.7587, |
|
"eval_samples_per_second": 212.852, |
|
"eval_steps_per_second": 3.326, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.8577648766328012, |
|
"grad_norm": 1.241452693939209, |
|
"learning_rate": 8.142235123367198e-05, |
|
"loss": 2.0508, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.8577648766328012, |
|
"eval_loss": 1.072275996208191, |
|
"eval_runtime": 213.3197, |
|
"eval_samples_per_second": 213.29, |
|
"eval_steps_per_second": 3.333, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.8867924528301887, |
|
"grad_norm": 1.22267484664917, |
|
"learning_rate": 8.113207547169813e-05, |
|
"loss": 2.07, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.8867924528301887, |
|
"eval_loss": 1.0654535293579102, |
|
"eval_runtime": 214.0386, |
|
"eval_samples_per_second": 212.574, |
|
"eval_steps_per_second": 3.322, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.9158200290275762, |
|
"grad_norm": 1.2704839706420898, |
|
"learning_rate": 8.084179970972424e-05, |
|
"loss": 2.0646, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.9158200290275762, |
|
"eval_loss": 1.0614382028579712, |
|
"eval_runtime": 213.4971, |
|
"eval_samples_per_second": 213.113, |
|
"eval_steps_per_second": 3.33, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.9448476052249637, |
|
"grad_norm": 1.3870867490768433, |
|
"learning_rate": 8.055152394775036e-05, |
|
"loss": 2.0598, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.9448476052249637, |
|
"eval_loss": 1.067047357559204, |
|
"eval_runtime": 214.0952, |
|
"eval_samples_per_second": 212.518, |
|
"eval_steps_per_second": 3.321, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.9738751814223512, |
|
"grad_norm": 1.3581643104553223, |
|
"learning_rate": 8.026124818577649e-05, |
|
"loss": 2.0501, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.9738751814223512, |
|
"eval_loss": 1.0663081407546997, |
|
"eval_runtime": 213.8995, |
|
"eval_samples_per_second": 212.712, |
|
"eval_steps_per_second": 3.324, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 2.0029027576197387, |
|
"grad_norm": 1.3438752889633179, |
|
"learning_rate": 7.997097242380261e-05, |
|
"loss": 2.0332, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 2.0029027576197387, |
|
"eval_loss": 1.059921383857727, |
|
"eval_runtime": 213.0183, |
|
"eval_samples_per_second": 213.592, |
|
"eval_steps_per_second": 3.338, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 2.0319303338171264, |
|
"grad_norm": 1.3646849393844604, |
|
"learning_rate": 7.968069666182875e-05, |
|
"loss": 2.0463, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.0319303338171264, |
|
"eval_loss": 1.0679893493652344, |
|
"eval_runtime": 213.3912, |
|
"eval_samples_per_second": 213.219, |
|
"eval_steps_per_second": 3.332, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.0609579100145137, |
|
"grad_norm": 1.2047359943389893, |
|
"learning_rate": 7.939042089985487e-05, |
|
"loss": 2.0376, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 2.0609579100145137, |
|
"eval_loss": 1.0566322803497314, |
|
"eval_runtime": 213.6266, |
|
"eval_samples_per_second": 212.984, |
|
"eval_steps_per_second": 3.328, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 2.0899854862119014, |
|
"grad_norm": 1.2285219430923462, |
|
"learning_rate": 7.910014513788099e-05, |
|
"loss": 2.0327, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 2.0899854862119014, |
|
"eval_loss": 1.058618426322937, |
|
"eval_runtime": 213.6922, |
|
"eval_samples_per_second": 212.918, |
|
"eval_steps_per_second": 3.327, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 2.1190130624092887, |
|
"grad_norm": 1.2674715518951416, |
|
"learning_rate": 7.880986937590712e-05, |
|
"loss": 2.0347, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 2.1190130624092887, |
|
"eval_loss": 1.0599507093429565, |
|
"eval_runtime": 213.5256, |
|
"eval_samples_per_second": 213.085, |
|
"eval_steps_per_second": 3.33, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 2.1480406386066764, |
|
"grad_norm": 1.3713229894638062, |
|
"learning_rate": 7.851959361393323e-05, |
|
"loss": 2.0321, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 2.1480406386066764, |
|
"eval_loss": 1.0617178678512573, |
|
"eval_runtime": 213.0273, |
|
"eval_samples_per_second": 213.583, |
|
"eval_steps_per_second": 3.338, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 2.1770682148040637, |
|
"grad_norm": 1.292090654373169, |
|
"learning_rate": 7.822931785195937e-05, |
|
"loss": 2.01, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.1770682148040637, |
|
"eval_loss": 1.0593364238739014, |
|
"eval_runtime": 213.421, |
|
"eval_samples_per_second": 213.189, |
|
"eval_steps_per_second": 3.331, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.2060957910014514, |
|
"grad_norm": 1.1819452047348022, |
|
"learning_rate": 7.79390420899855e-05, |
|
"loss": 2.0209, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 2.2060957910014514, |
|
"eval_loss": 1.0524711608886719, |
|
"eval_runtime": 214.0149, |
|
"eval_samples_per_second": 212.597, |
|
"eval_steps_per_second": 3.322, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 2.235123367198839, |
|
"grad_norm": 1.2881128787994385, |
|
"learning_rate": 7.764876632801161e-05, |
|
"loss": 2.0085, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 2.235123367198839, |
|
"eval_loss": 1.0567752122879028, |
|
"eval_runtime": 213.6228, |
|
"eval_samples_per_second": 212.988, |
|
"eval_steps_per_second": 3.328, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 2.2641509433962264, |
|
"grad_norm": 1.2962584495544434, |
|
"learning_rate": 7.735849056603774e-05, |
|
"loss": 2.0204, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 2.2641509433962264, |
|
"eval_loss": 1.0586293935775757, |
|
"eval_runtime": 213.3516, |
|
"eval_samples_per_second": 213.258, |
|
"eval_steps_per_second": 3.333, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 2.293178519593614, |
|
"grad_norm": 1.2214884757995605, |
|
"learning_rate": 7.706821480406386e-05, |
|
"loss": 2.0184, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 2.293178519593614, |
|
"eval_loss": 1.0525050163269043, |
|
"eval_runtime": 212.5483, |
|
"eval_samples_per_second": 214.064, |
|
"eval_steps_per_second": 3.345, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 2.3222060957910013, |
|
"grad_norm": 1.2622853517532349, |
|
"learning_rate": 7.677793904208999e-05, |
|
"loss": 2.0162, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.3222060957910013, |
|
"eval_loss": 1.0512940883636475, |
|
"eval_runtime": 212.6462, |
|
"eval_samples_per_second": 213.966, |
|
"eval_steps_per_second": 3.344, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.351233671988389, |
|
"grad_norm": 1.2338088750839233, |
|
"learning_rate": 7.648766328011612e-05, |
|
"loss": 2.0029, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 2.351233671988389, |
|
"eval_loss": 1.0521414279937744, |
|
"eval_runtime": 213.5358, |
|
"eval_samples_per_second": 213.074, |
|
"eval_steps_per_second": 3.33, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 2.3802612481857763, |
|
"grad_norm": 1.2111109495162964, |
|
"learning_rate": 7.619738751814224e-05, |
|
"loss": 2.0101, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 2.3802612481857763, |
|
"eval_loss": 1.0501890182495117, |
|
"eval_runtime": 213.0351, |
|
"eval_samples_per_second": 213.575, |
|
"eval_steps_per_second": 3.337, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 2.409288824383164, |
|
"grad_norm": 1.2333025932312012, |
|
"learning_rate": 7.590711175616836e-05, |
|
"loss": 2.0, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 2.409288824383164, |
|
"eval_loss": 1.051579236984253, |
|
"eval_runtime": 213.5529, |
|
"eval_samples_per_second": 213.057, |
|
"eval_steps_per_second": 3.329, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 2.4383164005805513, |
|
"grad_norm": 1.3394699096679688, |
|
"learning_rate": 7.561683599419449e-05, |
|
"loss": 1.9986, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 2.4383164005805513, |
|
"eval_loss": 1.0520364046096802, |
|
"eval_runtime": 212.3818, |
|
"eval_samples_per_second": 214.232, |
|
"eval_steps_per_second": 3.348, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 2.467343976777939, |
|
"grad_norm": 1.334936261177063, |
|
"learning_rate": 7.532656023222062e-05, |
|
"loss": 1.993, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.467343976777939, |
|
"eval_loss": 1.0490361452102661, |
|
"eval_runtime": 213.9415, |
|
"eval_samples_per_second": 212.67, |
|
"eval_steps_per_second": 3.323, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.4963715529753268, |
|
"grad_norm": 1.3085263967514038, |
|
"learning_rate": 7.503628447024675e-05, |
|
"loss": 1.9771, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 2.4963715529753268, |
|
"eval_loss": 1.0522186756134033, |
|
"eval_runtime": 212.3302, |
|
"eval_samples_per_second": 214.284, |
|
"eval_steps_per_second": 3.349, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 2.525399129172714, |
|
"grad_norm": 1.4204107522964478, |
|
"learning_rate": 7.474600870827286e-05, |
|
"loss": 1.9848, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 2.525399129172714, |
|
"eval_loss": 1.0486035346984863, |
|
"eval_runtime": 213.5477, |
|
"eval_samples_per_second": 213.062, |
|
"eval_steps_per_second": 3.329, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 2.5544267053701017, |
|
"grad_norm": 1.2411503791809082, |
|
"learning_rate": 7.445573294629898e-05, |
|
"loss": 2.0016, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 2.5544267053701017, |
|
"eval_loss": 1.0516774654388428, |
|
"eval_runtime": 213.2425, |
|
"eval_samples_per_second": 213.367, |
|
"eval_steps_per_second": 3.334, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 2.583454281567489, |
|
"grad_norm": 1.2166720628738403, |
|
"learning_rate": 7.416545718432511e-05, |
|
"loss": 1.9761, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 2.583454281567489, |
|
"eval_loss": 1.0438764095306396, |
|
"eval_runtime": 213.2447, |
|
"eval_samples_per_second": 213.365, |
|
"eval_steps_per_second": 3.334, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 2.6124818577648767, |
|
"grad_norm": 1.307707667350769, |
|
"learning_rate": 7.387518142235124e-05, |
|
"loss": 1.9753, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.6124818577648767, |
|
"eval_loss": 1.0445740222930908, |
|
"eval_runtime": 212.5813, |
|
"eval_samples_per_second": 214.031, |
|
"eval_steps_per_second": 3.345, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.641509433962264, |
|
"grad_norm": 1.3446862697601318, |
|
"learning_rate": 7.358490566037736e-05, |
|
"loss": 1.9795, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 2.641509433962264, |
|
"eval_loss": 1.0461750030517578, |
|
"eval_runtime": 213.2022, |
|
"eval_samples_per_second": 213.408, |
|
"eval_steps_per_second": 3.335, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 2.6705370101596517, |
|
"grad_norm": 1.25364351272583, |
|
"learning_rate": 7.329462989840349e-05, |
|
"loss": 1.966, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 2.6705370101596517, |
|
"eval_loss": 1.0489540100097656, |
|
"eval_runtime": 213.3373, |
|
"eval_samples_per_second": 213.273, |
|
"eval_steps_per_second": 3.333, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 2.699564586357039, |
|
"grad_norm": 1.317325472831726, |
|
"learning_rate": 7.300435413642961e-05, |
|
"loss": 1.9853, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 2.699564586357039, |
|
"eval_loss": 1.04426109790802, |
|
"eval_runtime": 212.5953, |
|
"eval_samples_per_second": 214.017, |
|
"eval_steps_per_second": 3.344, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 2.7285921625544267, |
|
"grad_norm": 1.2580476999282837, |
|
"learning_rate": 7.271407837445574e-05, |
|
"loss": 1.9873, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 2.7285921625544267, |
|
"eval_loss": 1.0441796779632568, |
|
"eval_runtime": 213.1744, |
|
"eval_samples_per_second": 213.436, |
|
"eval_steps_per_second": 3.335, |
|
"step": 9400 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 34450, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 5 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.403409048272896e+16, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|