|
{ |
|
"best_metric": 1.0211207866668701, |
|
"best_model_checkpoint": "mgh6/TCS_MLM_50/checkpoint-12500", |
|
"epoch": 3.7735849056603774, |
|
"eval_steps": 100, |
|
"global_step": 13000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02902757619738752, |
|
"grad_norm": 1.1695395708084106, |
|
"learning_rate": 9.970972423802612e-05, |
|
"loss": 2.8263, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02902757619738752, |
|
"eval_loss": 1.2625532150268555, |
|
"eval_runtime": 213.9369, |
|
"eval_samples_per_second": 212.651, |
|
"eval_steps_per_second": 3.323, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05805515239477504, |
|
"grad_norm": 1.1860003471374512, |
|
"learning_rate": 9.941944847605225e-05, |
|
"loss": 2.7152, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05805515239477504, |
|
"eval_loss": 1.2428085803985596, |
|
"eval_runtime": 214.2707, |
|
"eval_samples_per_second": 212.32, |
|
"eval_steps_per_second": 3.318, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08708272859216255, |
|
"grad_norm": 1.138083577156067, |
|
"learning_rate": 9.912917271407838e-05, |
|
"loss": 2.6496, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.08708272859216255, |
|
"eval_loss": 1.2260128259658813, |
|
"eval_runtime": 214.1595, |
|
"eval_samples_per_second": 212.43, |
|
"eval_steps_per_second": 3.32, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.11611030478955008, |
|
"grad_norm": 1.118958592414856, |
|
"learning_rate": 9.883889695210451e-05, |
|
"loss": 2.6016, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.11611030478955008, |
|
"eval_loss": 1.2176499366760254, |
|
"eval_runtime": 214.7144, |
|
"eval_samples_per_second": 211.881, |
|
"eval_steps_per_second": 3.311, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.14513788098693758, |
|
"grad_norm": 1.0901985168457031, |
|
"learning_rate": 9.854862119013063e-05, |
|
"loss": 2.5744, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.14513788098693758, |
|
"eval_loss": 1.200432300567627, |
|
"eval_runtime": 214.0554, |
|
"eval_samples_per_second": 212.534, |
|
"eval_steps_per_second": 3.322, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1741654571843251, |
|
"grad_norm": 1.1364562511444092, |
|
"learning_rate": 9.825834542815675e-05, |
|
"loss": 2.5412, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.1741654571843251, |
|
"eval_loss": 1.19223153591156, |
|
"eval_runtime": 214.4792, |
|
"eval_samples_per_second": 212.114, |
|
"eval_steps_per_second": 3.315, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.20319303338171263, |
|
"grad_norm": 1.1283212900161743, |
|
"learning_rate": 9.796806966618288e-05, |
|
"loss": 2.5318, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.20319303338171263, |
|
"eval_loss": 1.1891732215881348, |
|
"eval_runtime": 214.0534, |
|
"eval_samples_per_second": 212.536, |
|
"eval_steps_per_second": 3.322, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.23222060957910015, |
|
"grad_norm": 1.123288631439209, |
|
"learning_rate": 9.767779390420901e-05, |
|
"loss": 2.4897, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.23222060957910015, |
|
"eval_loss": 1.1818548440933228, |
|
"eval_runtime": 214.4047, |
|
"eval_samples_per_second": 212.187, |
|
"eval_steps_per_second": 3.316, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.2612481857764877, |
|
"grad_norm": 1.2019628286361694, |
|
"learning_rate": 9.738751814223513e-05, |
|
"loss": 2.4833, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2612481857764877, |
|
"eval_loss": 1.169024109840393, |
|
"eval_runtime": 214.4504, |
|
"eval_samples_per_second": 212.142, |
|
"eval_steps_per_second": 3.315, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.29027576197387517, |
|
"grad_norm": 1.1262823343276978, |
|
"learning_rate": 9.709724238026126e-05, |
|
"loss": 2.4637, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.29027576197387517, |
|
"eval_loss": 1.1714328527450562, |
|
"eval_runtime": 213.9877, |
|
"eval_samples_per_second": 212.601, |
|
"eval_steps_per_second": 3.323, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3193033381712627, |
|
"grad_norm": 1.1507551670074463, |
|
"learning_rate": 9.680696661828737e-05, |
|
"loss": 2.4408, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.3193033381712627, |
|
"eval_loss": 1.165282130241394, |
|
"eval_runtime": 214.1235, |
|
"eval_samples_per_second": 212.466, |
|
"eval_steps_per_second": 3.321, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.3483309143686502, |
|
"grad_norm": 1.1482508182525635, |
|
"learning_rate": 9.65166908563135e-05, |
|
"loss": 2.4353, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3483309143686502, |
|
"eval_loss": 1.162631630897522, |
|
"eval_runtime": 214.0787, |
|
"eval_samples_per_second": 212.511, |
|
"eval_steps_per_second": 3.321, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.37735849056603776, |
|
"grad_norm": 1.1512328386306763, |
|
"learning_rate": 9.622641509433963e-05, |
|
"loss": 2.404, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.37735849056603776, |
|
"eval_loss": 1.1568048000335693, |
|
"eval_runtime": 214.1359, |
|
"eval_samples_per_second": 212.454, |
|
"eval_steps_per_second": 3.32, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.40638606676342526, |
|
"grad_norm": 1.1897668838500977, |
|
"learning_rate": 9.593613933236575e-05, |
|
"loss": 2.3905, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.40638606676342526, |
|
"eval_loss": 1.1544520854949951, |
|
"eval_runtime": 214.4841, |
|
"eval_samples_per_second": 212.109, |
|
"eval_steps_per_second": 3.315, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.43541364296081275, |
|
"grad_norm": 1.1775190830230713, |
|
"learning_rate": 9.564586357039188e-05, |
|
"loss": 2.3792, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.43541364296081275, |
|
"eval_loss": 1.1439248323440552, |
|
"eval_runtime": 213.8637, |
|
"eval_samples_per_second": 212.724, |
|
"eval_steps_per_second": 3.325, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.4644412191582003, |
|
"grad_norm": 1.1436594724655151, |
|
"learning_rate": 9.5355587808418e-05, |
|
"loss": 2.3592, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.4644412191582003, |
|
"eval_loss": 1.1428674459457397, |
|
"eval_runtime": 214.3984, |
|
"eval_samples_per_second": 212.194, |
|
"eval_steps_per_second": 3.316, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.4934687953555878, |
|
"grad_norm": 1.1852160692214966, |
|
"learning_rate": 9.506531204644412e-05, |
|
"loss": 2.3539, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.4934687953555878, |
|
"eval_loss": 1.1439515352249146, |
|
"eval_runtime": 214.0153, |
|
"eval_samples_per_second": 212.574, |
|
"eval_steps_per_second": 3.322, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.5224963715529753, |
|
"grad_norm": 1.2375448942184448, |
|
"learning_rate": 9.477503628447025e-05, |
|
"loss": 2.3489, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5224963715529753, |
|
"eval_loss": 1.1391005516052246, |
|
"eval_runtime": 214.5494, |
|
"eval_samples_per_second": 212.044, |
|
"eval_steps_per_second": 3.314, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5515239477503628, |
|
"grad_norm": 1.1505770683288574, |
|
"learning_rate": 9.448476052249638e-05, |
|
"loss": 2.3336, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.5515239477503628, |
|
"eval_loss": 1.1322171688079834, |
|
"eval_runtime": 213.2802, |
|
"eval_samples_per_second": 213.306, |
|
"eval_steps_per_second": 3.334, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.5805515239477503, |
|
"grad_norm": 1.1152174472808838, |
|
"learning_rate": 9.419448476052251e-05, |
|
"loss": 2.3321, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5805515239477503, |
|
"eval_loss": 1.1339818239212036, |
|
"eval_runtime": 213.6442, |
|
"eval_samples_per_second": 212.943, |
|
"eval_steps_per_second": 3.328, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.6095791001451378, |
|
"grad_norm": 1.1027612686157227, |
|
"learning_rate": 9.390420899854863e-05, |
|
"loss": 2.3039, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.6095791001451378, |
|
"eval_loss": 1.1304194927215576, |
|
"eval_runtime": 213.0147, |
|
"eval_samples_per_second": 213.572, |
|
"eval_steps_per_second": 3.338, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.6386066763425254, |
|
"grad_norm": 1.1585232019424438, |
|
"learning_rate": 9.361393323657474e-05, |
|
"loss": 2.3101, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.6386066763425254, |
|
"eval_loss": 1.1316287517547607, |
|
"eval_runtime": 214.3522, |
|
"eval_samples_per_second": 212.239, |
|
"eval_steps_per_second": 3.317, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.6676342525399129, |
|
"grad_norm": 1.1749528646469116, |
|
"learning_rate": 9.332365747460087e-05, |
|
"loss": 2.3048, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.6676342525399129, |
|
"eval_loss": 1.1262996196746826, |
|
"eval_runtime": 214.6823, |
|
"eval_samples_per_second": 211.913, |
|
"eval_steps_per_second": 3.312, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.6966618287373004, |
|
"grad_norm": 1.1533962488174438, |
|
"learning_rate": 9.3033381712627e-05, |
|
"loss": 2.2808, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.6966618287373004, |
|
"eval_loss": 1.1249016523361206, |
|
"eval_runtime": 214.0327, |
|
"eval_samples_per_second": 212.556, |
|
"eval_steps_per_second": 3.322, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.7256894049346879, |
|
"grad_norm": 1.1524910926818848, |
|
"learning_rate": 9.274310595065312e-05, |
|
"loss": 2.2865, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.7256894049346879, |
|
"eval_loss": 1.1257150173187256, |
|
"eval_runtime": 213.721, |
|
"eval_samples_per_second": 212.866, |
|
"eval_steps_per_second": 3.327, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.7547169811320755, |
|
"grad_norm": 1.1282308101654053, |
|
"learning_rate": 9.245283018867925e-05, |
|
"loss": 2.2654, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.7547169811320755, |
|
"eval_loss": 1.1186834573745728, |
|
"eval_runtime": 214.3515, |
|
"eval_samples_per_second": 212.24, |
|
"eval_steps_per_second": 3.317, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.783744557329463, |
|
"grad_norm": 1.239816427230835, |
|
"learning_rate": 9.216255442670537e-05, |
|
"loss": 2.2564, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.783744557329463, |
|
"eval_loss": 1.1156889200210571, |
|
"eval_runtime": 213.6092, |
|
"eval_samples_per_second": 212.978, |
|
"eval_steps_per_second": 3.329, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.8127721335268505, |
|
"grad_norm": 1.2036716938018799, |
|
"learning_rate": 9.18722786647315e-05, |
|
"loss": 2.2453, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.8127721335268505, |
|
"eval_loss": 1.118744134902954, |
|
"eval_runtime": 214.1658, |
|
"eval_samples_per_second": 212.424, |
|
"eval_steps_per_second": 3.32, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.841799709724238, |
|
"grad_norm": 1.2474415302276611, |
|
"learning_rate": 9.158200290275763e-05, |
|
"loss": 2.2402, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.841799709724238, |
|
"eval_loss": 1.1129833459854126, |
|
"eval_runtime": 214.2278, |
|
"eval_samples_per_second": 212.363, |
|
"eval_steps_per_second": 3.319, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.8708272859216255, |
|
"grad_norm": 1.2137649059295654, |
|
"learning_rate": 9.129172714078375e-05, |
|
"loss": 2.2243, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.8708272859216255, |
|
"eval_loss": 1.1113933324813843, |
|
"eval_runtime": 213.6564, |
|
"eval_samples_per_second": 212.931, |
|
"eval_steps_per_second": 3.328, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.8998548621190131, |
|
"grad_norm": 1.2188935279846191, |
|
"learning_rate": 9.100145137880988e-05, |
|
"loss": 2.2324, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.8998548621190131, |
|
"eval_loss": 1.1111185550689697, |
|
"eval_runtime": 214.2911, |
|
"eval_samples_per_second": 212.3, |
|
"eval_steps_per_second": 3.318, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.9288824383164006, |
|
"grad_norm": 1.2475199699401855, |
|
"learning_rate": 9.0711175616836e-05, |
|
"loss": 2.2329, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.9288824383164006, |
|
"eval_loss": 1.1126782894134521, |
|
"eval_runtime": 214.309, |
|
"eval_samples_per_second": 212.282, |
|
"eval_steps_per_second": 3.318, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.9579100145137881, |
|
"grad_norm": 1.1850870847702026, |
|
"learning_rate": 9.042089985486212e-05, |
|
"loss": 2.2292, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.9579100145137881, |
|
"eval_loss": 1.1046797037124634, |
|
"eval_runtime": 214.0726, |
|
"eval_samples_per_second": 212.517, |
|
"eval_steps_per_second": 3.321, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.9869375907111756, |
|
"grad_norm": 1.1915068626403809, |
|
"learning_rate": 9.013062409288826e-05, |
|
"loss": 2.2169, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.9869375907111756, |
|
"eval_loss": 1.1029560565948486, |
|
"eval_runtime": 214.518, |
|
"eval_samples_per_second": 212.075, |
|
"eval_steps_per_second": 3.314, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.0159651669085632, |
|
"grad_norm": 1.3059227466583252, |
|
"learning_rate": 8.984034833091437e-05, |
|
"loss": 2.2112, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.0159651669085632, |
|
"eval_loss": 1.1044234037399292, |
|
"eval_runtime": 214.4346, |
|
"eval_samples_per_second": 212.158, |
|
"eval_steps_per_second": 3.316, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.0449927431059507, |
|
"grad_norm": 1.3193408250808716, |
|
"learning_rate": 8.95500725689405e-05, |
|
"loss": 2.186, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.0449927431059507, |
|
"eval_loss": 1.1042026281356812, |
|
"eval_runtime": 214.3105, |
|
"eval_samples_per_second": 212.281, |
|
"eval_steps_per_second": 3.318, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.0740203193033382, |
|
"grad_norm": 1.1710057258605957, |
|
"learning_rate": 8.925979680696662e-05, |
|
"loss": 2.1882, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.0740203193033382, |
|
"eval_loss": 1.0975611209869385, |
|
"eval_runtime": 214.3954, |
|
"eval_samples_per_second": 212.197, |
|
"eval_steps_per_second": 3.316, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.1030478955007257, |
|
"grad_norm": 1.1426420211791992, |
|
"learning_rate": 8.896952104499274e-05, |
|
"loss": 2.1697, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.1030478955007257, |
|
"eval_loss": 1.0976998805999756, |
|
"eval_runtime": 212.987, |
|
"eval_samples_per_second": 213.6, |
|
"eval_steps_per_second": 3.338, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.1320754716981132, |
|
"grad_norm": 1.1272858381271362, |
|
"learning_rate": 8.867924528301888e-05, |
|
"loss": 2.1836, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.1320754716981132, |
|
"eval_loss": 1.0982595682144165, |
|
"eval_runtime": 214.3281, |
|
"eval_samples_per_second": 212.263, |
|
"eval_steps_per_second": 3.317, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.1611030478955007, |
|
"grad_norm": 1.141606330871582, |
|
"learning_rate": 8.8388969521045e-05, |
|
"loss": 2.1668, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.1611030478955007, |
|
"eval_loss": 1.0947861671447754, |
|
"eval_runtime": 214.7163, |
|
"eval_samples_per_second": 211.88, |
|
"eval_steps_per_second": 3.311, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.1901306240928882, |
|
"grad_norm": 1.197513222694397, |
|
"learning_rate": 8.809869375907113e-05, |
|
"loss": 2.1537, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.1901306240928882, |
|
"eval_loss": 1.0914397239685059, |
|
"eval_runtime": 214.3796, |
|
"eval_samples_per_second": 212.212, |
|
"eval_steps_per_second": 3.317, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.2191582002902757, |
|
"grad_norm": 1.2622817754745483, |
|
"learning_rate": 8.780841799709725e-05, |
|
"loss": 2.1609, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.2191582002902757, |
|
"eval_loss": 1.0932444334030151, |
|
"eval_runtime": 214.4603, |
|
"eval_samples_per_second": 212.133, |
|
"eval_steps_per_second": 3.315, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.2481857764876634, |
|
"grad_norm": 1.1745682954788208, |
|
"learning_rate": 8.751814223512336e-05, |
|
"loss": 2.1448, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.2481857764876634, |
|
"eval_loss": 1.0895456075668335, |
|
"eval_runtime": 214.0416, |
|
"eval_samples_per_second": 212.548, |
|
"eval_steps_per_second": 3.322, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.2772133526850509, |
|
"grad_norm": 1.1918201446533203, |
|
"learning_rate": 8.722786647314949e-05, |
|
"loss": 2.1552, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.2772133526850509, |
|
"eval_loss": 1.089804768562317, |
|
"eval_runtime": 214.182, |
|
"eval_samples_per_second": 212.408, |
|
"eval_steps_per_second": 3.32, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.3062409288824384, |
|
"grad_norm": 1.2561489343643188, |
|
"learning_rate": 8.693759071117562e-05, |
|
"loss": 2.1421, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.3062409288824384, |
|
"eval_loss": 1.0928661823272705, |
|
"eval_runtime": 214.3073, |
|
"eval_samples_per_second": 212.284, |
|
"eval_steps_per_second": 3.318, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.3352685050798259, |
|
"grad_norm": 1.1966407299041748, |
|
"learning_rate": 8.664731494920174e-05, |
|
"loss": 2.1426, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.3352685050798259, |
|
"eval_loss": 1.0840479135513306, |
|
"eval_runtime": 214.538, |
|
"eval_samples_per_second": 212.056, |
|
"eval_steps_per_second": 3.314, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.3642960812772134, |
|
"grad_norm": 1.20412278175354, |
|
"learning_rate": 8.635703918722787e-05, |
|
"loss": 2.1256, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.3642960812772134, |
|
"eval_loss": 1.085578441619873, |
|
"eval_runtime": 214.0855, |
|
"eval_samples_per_second": 212.504, |
|
"eval_steps_per_second": 3.321, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.3933236574746009, |
|
"grad_norm": 1.1835148334503174, |
|
"learning_rate": 8.606676342525399e-05, |
|
"loss": 2.1398, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.3933236574746009, |
|
"eval_loss": 1.0841166973114014, |
|
"eval_runtime": 213.7271, |
|
"eval_samples_per_second": 212.86, |
|
"eval_steps_per_second": 3.327, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.4223512336719883, |
|
"grad_norm": 1.1613247394561768, |
|
"learning_rate": 8.577648766328012e-05, |
|
"loss": 2.1202, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.4223512336719883, |
|
"eval_loss": 1.085669994354248, |
|
"eval_runtime": 215.8008, |
|
"eval_samples_per_second": 210.815, |
|
"eval_steps_per_second": 3.295, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.4513788098693758, |
|
"grad_norm": 1.1468629837036133, |
|
"learning_rate": 8.548621190130625e-05, |
|
"loss": 2.1123, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.4513788098693758, |
|
"eval_loss": 1.0774667263031006, |
|
"eval_runtime": 214.0808, |
|
"eval_samples_per_second": 212.509, |
|
"eval_steps_per_second": 3.321, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.4804063860667633, |
|
"grad_norm": 1.2450999021530151, |
|
"learning_rate": 8.519593613933237e-05, |
|
"loss": 2.1104, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.4804063860667633, |
|
"eval_loss": 1.0789214372634888, |
|
"eval_runtime": 214.4581, |
|
"eval_samples_per_second": 212.135, |
|
"eval_steps_per_second": 3.315, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.509433962264151, |
|
"grad_norm": 1.1406731605529785, |
|
"learning_rate": 8.49056603773585e-05, |
|
"loss": 2.1076, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.509433962264151, |
|
"eval_loss": 1.0758976936340332, |
|
"eval_runtime": 214.3317, |
|
"eval_samples_per_second": 212.26, |
|
"eval_steps_per_second": 3.317, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 1.2358899116516113, |
|
"learning_rate": 8.461538461538461e-05, |
|
"loss": 2.1149, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"eval_loss": 1.0744006633758545, |
|
"eval_runtime": 214.6001, |
|
"eval_samples_per_second": 211.994, |
|
"eval_steps_per_second": 3.313, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.567489114658926, |
|
"grad_norm": 1.3809137344360352, |
|
"learning_rate": 8.432510885341074e-05, |
|
"loss": 2.1052, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.567489114658926, |
|
"eval_loss": 1.0817296504974365, |
|
"eval_runtime": 214.5107, |
|
"eval_samples_per_second": 212.083, |
|
"eval_steps_per_second": 3.315, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.5965166908563135, |
|
"grad_norm": 1.1924511194229126, |
|
"learning_rate": 8.403483309143688e-05, |
|
"loss": 2.093, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.5965166908563135, |
|
"eval_loss": 1.072150707244873, |
|
"eval_runtime": 214.3047, |
|
"eval_samples_per_second": 212.286, |
|
"eval_steps_per_second": 3.318, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.625544267053701, |
|
"grad_norm": 1.1757687330245972, |
|
"learning_rate": 8.374455732946299e-05, |
|
"loss": 2.0911, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.625544267053701, |
|
"eval_loss": 1.0715994834899902, |
|
"eval_runtime": 214.1638, |
|
"eval_samples_per_second": 212.426, |
|
"eval_steps_per_second": 3.32, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.6545718432510885, |
|
"grad_norm": 1.263269066810608, |
|
"learning_rate": 8.345428156748912e-05, |
|
"loss": 2.0912, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.6545718432510885, |
|
"eval_loss": 1.0753726959228516, |
|
"eval_runtime": 214.272, |
|
"eval_samples_per_second": 212.319, |
|
"eval_steps_per_second": 3.318, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.683599419448476, |
|
"grad_norm": 1.1598068475723267, |
|
"learning_rate": 8.316400580551524e-05, |
|
"loss": 2.085, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.683599419448476, |
|
"eval_loss": 1.0767669677734375, |
|
"eval_runtime": 214.1942, |
|
"eval_samples_per_second": 212.396, |
|
"eval_steps_per_second": 3.319, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.7126269956458637, |
|
"grad_norm": 1.2049143314361572, |
|
"learning_rate": 8.287373004354137e-05, |
|
"loss": 2.091, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.7126269956458637, |
|
"eval_loss": 1.0710922479629517, |
|
"eval_runtime": 214.6737, |
|
"eval_samples_per_second": 211.922, |
|
"eval_steps_per_second": 3.312, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.741654571843251, |
|
"grad_norm": 1.2534894943237305, |
|
"learning_rate": 8.25834542815675e-05, |
|
"loss": 2.0744, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.741654571843251, |
|
"eval_loss": 1.07111656665802, |
|
"eval_runtime": 214.3274, |
|
"eval_samples_per_second": 212.264, |
|
"eval_steps_per_second": 3.317, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.7706821480406387, |
|
"grad_norm": 1.260311245918274, |
|
"learning_rate": 8.229317851959362e-05, |
|
"loss": 2.082, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.7706821480406387, |
|
"eval_loss": 1.0758848190307617, |
|
"eval_runtime": 214.3528, |
|
"eval_samples_per_second": 212.239, |
|
"eval_steps_per_second": 3.317, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.799709724238026, |
|
"grad_norm": 1.192262887954712, |
|
"learning_rate": 8.200290275761974e-05, |
|
"loss": 2.0647, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.799709724238026, |
|
"eval_loss": 1.0665150880813599, |
|
"eval_runtime": 214.1927, |
|
"eval_samples_per_second": 212.398, |
|
"eval_steps_per_second": 3.319, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.8287373004354137, |
|
"grad_norm": 1.2158530950546265, |
|
"learning_rate": 8.171262699564587e-05, |
|
"loss": 2.0524, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.8287373004354137, |
|
"eval_loss": 1.0663120746612549, |
|
"eval_runtime": 214.6232, |
|
"eval_samples_per_second": 211.971, |
|
"eval_steps_per_second": 3.313, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.8577648766328012, |
|
"grad_norm": 1.1896952390670776, |
|
"learning_rate": 8.142235123367198e-05, |
|
"loss": 2.0654, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.8577648766328012, |
|
"eval_loss": 1.064207911491394, |
|
"eval_runtime": 213.7354, |
|
"eval_samples_per_second": 212.852, |
|
"eval_steps_per_second": 3.327, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.8867924528301887, |
|
"grad_norm": 1.1889102458953857, |
|
"learning_rate": 8.113207547169813e-05, |
|
"loss": 2.0549, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.8867924528301887, |
|
"eval_loss": 1.0605015754699707, |
|
"eval_runtime": 214.211, |
|
"eval_samples_per_second": 212.379, |
|
"eval_steps_per_second": 3.319, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.9158200290275762, |
|
"grad_norm": 1.2628164291381836, |
|
"learning_rate": 8.084179970972424e-05, |
|
"loss": 2.056, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.9158200290275762, |
|
"eval_loss": 1.060520052909851, |
|
"eval_runtime": 214.2715, |
|
"eval_samples_per_second": 212.319, |
|
"eval_steps_per_second": 3.318, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.9448476052249637, |
|
"grad_norm": 1.203740119934082, |
|
"learning_rate": 8.055152394775036e-05, |
|
"loss": 2.0645, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.9448476052249637, |
|
"eval_loss": 1.0628570318222046, |
|
"eval_runtime": 213.9992, |
|
"eval_samples_per_second": 212.59, |
|
"eval_steps_per_second": 3.322, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.9738751814223512, |
|
"grad_norm": 1.228109359741211, |
|
"learning_rate": 8.026124818577649e-05, |
|
"loss": 2.0528, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.9738751814223512, |
|
"eval_loss": 1.0618752241134644, |
|
"eval_runtime": 214.3127, |
|
"eval_samples_per_second": 212.279, |
|
"eval_steps_per_second": 3.318, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 2.0029027576197387, |
|
"grad_norm": 1.2511652708053589, |
|
"learning_rate": 7.997097242380261e-05, |
|
"loss": 2.0643, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 2.0029027576197387, |
|
"eval_loss": 1.0580365657806396, |
|
"eval_runtime": 214.5925, |
|
"eval_samples_per_second": 212.002, |
|
"eval_steps_per_second": 3.313, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 2.0319303338171264, |
|
"grad_norm": 1.2169101238250732, |
|
"learning_rate": 7.968069666182875e-05, |
|
"loss": 2.0342, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.0319303338171264, |
|
"eval_loss": 1.0617866516113281, |
|
"eval_runtime": 214.3766, |
|
"eval_samples_per_second": 212.215, |
|
"eval_steps_per_second": 3.317, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.0609579100145137, |
|
"grad_norm": 1.1878671646118164, |
|
"learning_rate": 7.939042089985487e-05, |
|
"loss": 2.0391, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 2.0609579100145137, |
|
"eval_loss": 1.0561269521713257, |
|
"eval_runtime": 213.6515, |
|
"eval_samples_per_second": 212.936, |
|
"eval_steps_per_second": 3.328, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 2.0899854862119014, |
|
"grad_norm": 1.2561451196670532, |
|
"learning_rate": 7.910014513788099e-05, |
|
"loss": 2.0368, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 2.0899854862119014, |
|
"eval_loss": 1.0582093000411987, |
|
"eval_runtime": 214.1893, |
|
"eval_samples_per_second": 212.401, |
|
"eval_steps_per_second": 3.319, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 2.1190130624092887, |
|
"grad_norm": 1.3752440214157104, |
|
"learning_rate": 7.880986937590712e-05, |
|
"loss": 2.0223, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 2.1190130624092887, |
|
"eval_loss": 1.0552905797958374, |
|
"eval_runtime": 214.1025, |
|
"eval_samples_per_second": 212.487, |
|
"eval_steps_per_second": 3.321, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 2.1480406386066764, |
|
"grad_norm": 1.2082586288452148, |
|
"learning_rate": 7.851959361393323e-05, |
|
"loss": 2.0219, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 2.1480406386066764, |
|
"eval_loss": 1.056668996810913, |
|
"eval_runtime": 214.2573, |
|
"eval_samples_per_second": 212.333, |
|
"eval_steps_per_second": 3.318, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 2.1770682148040637, |
|
"grad_norm": 1.335627555847168, |
|
"learning_rate": 7.822931785195937e-05, |
|
"loss": 2.0191, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.1770682148040637, |
|
"eval_loss": 1.0617352724075317, |
|
"eval_runtime": 214.1612, |
|
"eval_samples_per_second": 212.429, |
|
"eval_steps_per_second": 3.32, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.2060957910014514, |
|
"grad_norm": 1.3789772987365723, |
|
"learning_rate": 7.79390420899855e-05, |
|
"loss": 2.0163, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 2.2060957910014514, |
|
"eval_loss": 1.0667177438735962, |
|
"eval_runtime": 214.4271, |
|
"eval_samples_per_second": 212.165, |
|
"eval_steps_per_second": 3.316, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 2.235123367198839, |
|
"grad_norm": 1.2630983591079712, |
|
"learning_rate": 7.764876632801161e-05, |
|
"loss": 2.0075, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 2.235123367198839, |
|
"eval_loss": 1.053751826286316, |
|
"eval_runtime": 214.4933, |
|
"eval_samples_per_second": 212.1, |
|
"eval_steps_per_second": 3.315, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 2.2641509433962264, |
|
"grad_norm": 1.3576209545135498, |
|
"learning_rate": 7.735849056603774e-05, |
|
"loss": 2.018, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 2.2641509433962264, |
|
"eval_loss": 1.0545238256454468, |
|
"eval_runtime": 214.4112, |
|
"eval_samples_per_second": 212.181, |
|
"eval_steps_per_second": 3.316, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 2.293178519593614, |
|
"grad_norm": 1.2727316617965698, |
|
"learning_rate": 7.706821480406386e-05, |
|
"loss": 2.0123, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 2.293178519593614, |
|
"eval_loss": 1.0540556907653809, |
|
"eval_runtime": 214.501, |
|
"eval_samples_per_second": 212.092, |
|
"eval_steps_per_second": 3.315, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 2.3222060957910013, |
|
"grad_norm": 1.2817336320877075, |
|
"learning_rate": 7.677793904208999e-05, |
|
"loss": 2.0129, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.3222060957910013, |
|
"eval_loss": 1.053106427192688, |
|
"eval_runtime": 214.7491, |
|
"eval_samples_per_second": 211.847, |
|
"eval_steps_per_second": 3.311, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.351233671988389, |
|
"grad_norm": 1.1629624366760254, |
|
"learning_rate": 7.648766328011612e-05, |
|
"loss": 1.9998, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 2.351233671988389, |
|
"eval_loss": 1.0525621175765991, |
|
"eval_runtime": 214.4605, |
|
"eval_samples_per_second": 212.132, |
|
"eval_steps_per_second": 3.315, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 2.3802612481857763, |
|
"grad_norm": 1.225195050239563, |
|
"learning_rate": 7.619738751814224e-05, |
|
"loss": 1.9998, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 2.3802612481857763, |
|
"eval_loss": 1.0501279830932617, |
|
"eval_runtime": 214.3635, |
|
"eval_samples_per_second": 212.228, |
|
"eval_steps_per_second": 3.317, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 2.409288824383164, |
|
"grad_norm": 1.167968988418579, |
|
"learning_rate": 7.590711175616836e-05, |
|
"loss": 2.0127, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 2.409288824383164, |
|
"eval_loss": 1.0495474338531494, |
|
"eval_runtime": 214.3385, |
|
"eval_samples_per_second": 212.253, |
|
"eval_steps_per_second": 3.317, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 2.4383164005805513, |
|
"grad_norm": 1.2802715301513672, |
|
"learning_rate": 7.561683599419449e-05, |
|
"loss": 2.0046, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 2.4383164005805513, |
|
"eval_loss": 1.0517114400863647, |
|
"eval_runtime": 213.9947, |
|
"eval_samples_per_second": 212.594, |
|
"eval_steps_per_second": 3.323, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 2.467343976777939, |
|
"grad_norm": 1.2801434993743896, |
|
"learning_rate": 7.532656023222062e-05, |
|
"loss": 1.9913, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.467343976777939, |
|
"eval_loss": 1.0506008863449097, |
|
"eval_runtime": 214.0451, |
|
"eval_samples_per_second": 212.544, |
|
"eval_steps_per_second": 3.322, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.4963715529753268, |
|
"grad_norm": 1.3369925022125244, |
|
"learning_rate": 7.503628447024675e-05, |
|
"loss": 1.9895, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 2.4963715529753268, |
|
"eval_loss": 1.0505975484848022, |
|
"eval_runtime": 214.2989, |
|
"eval_samples_per_second": 212.292, |
|
"eval_steps_per_second": 3.318, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 2.525399129172714, |
|
"grad_norm": 1.2676314115524292, |
|
"learning_rate": 7.474600870827286e-05, |
|
"loss": 1.9963, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 2.525399129172714, |
|
"eval_loss": 1.0470978021621704, |
|
"eval_runtime": 214.1975, |
|
"eval_samples_per_second": 212.393, |
|
"eval_steps_per_second": 3.319, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 2.5544267053701017, |
|
"grad_norm": 1.2529655694961548, |
|
"learning_rate": 7.445573294629898e-05, |
|
"loss": 1.9858, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 2.5544267053701017, |
|
"eval_loss": 1.045462965965271, |
|
"eval_runtime": 213.8996, |
|
"eval_samples_per_second": 212.689, |
|
"eval_steps_per_second": 3.324, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 2.583454281567489, |
|
"grad_norm": 1.227094054222107, |
|
"learning_rate": 7.416545718432511e-05, |
|
"loss": 1.9877, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 2.583454281567489, |
|
"eval_loss": 1.0446746349334717, |
|
"eval_runtime": 214.2422, |
|
"eval_samples_per_second": 212.348, |
|
"eval_steps_per_second": 3.319, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 2.6124818577648767, |
|
"grad_norm": 1.22869074344635, |
|
"learning_rate": 7.387518142235124e-05, |
|
"loss": 1.9914, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.6124818577648767, |
|
"eval_loss": 1.045985460281372, |
|
"eval_runtime": 213.626, |
|
"eval_samples_per_second": 212.961, |
|
"eval_steps_per_second": 3.328, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.641509433962264, |
|
"grad_norm": 1.3192973136901855, |
|
"learning_rate": 7.358490566037736e-05, |
|
"loss": 1.9686, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 2.641509433962264, |
|
"eval_loss": 1.0464129447937012, |
|
"eval_runtime": 214.0751, |
|
"eval_samples_per_second": 212.514, |
|
"eval_steps_per_second": 3.321, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 2.6705370101596517, |
|
"grad_norm": 1.3081276416778564, |
|
"learning_rate": 7.329462989840349e-05, |
|
"loss": 1.9731, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 2.6705370101596517, |
|
"eval_loss": 1.047652006149292, |
|
"eval_runtime": 214.1138, |
|
"eval_samples_per_second": 212.476, |
|
"eval_steps_per_second": 3.321, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 2.699564586357039, |
|
"grad_norm": 1.309837818145752, |
|
"learning_rate": 7.300435413642961e-05, |
|
"loss": 1.9722, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 2.699564586357039, |
|
"eval_loss": 1.0429437160491943, |
|
"eval_runtime": 213.1703, |
|
"eval_samples_per_second": 213.416, |
|
"eval_steps_per_second": 3.335, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 2.7285921625544267, |
|
"grad_norm": 1.3633908033370972, |
|
"learning_rate": 7.271407837445574e-05, |
|
"loss": 1.9837, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 2.7285921625544267, |
|
"eval_loss": 1.041870355606079, |
|
"eval_runtime": 214.2854, |
|
"eval_samples_per_second": 212.306, |
|
"eval_steps_per_second": 3.318, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 2.7576197387518144, |
|
"grad_norm": 1.195707082748413, |
|
"learning_rate": 7.242380261248185e-05, |
|
"loss": 1.9657, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 2.7576197387518144, |
|
"eval_loss": 1.0397106409072876, |
|
"eval_runtime": 214.0497, |
|
"eval_samples_per_second": 212.539, |
|
"eval_steps_per_second": 3.322, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 2.7866473149492017, |
|
"grad_norm": 1.2074401378631592, |
|
"learning_rate": 7.213352685050799e-05, |
|
"loss": 1.9782, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 2.7866473149492017, |
|
"eval_loss": 1.0388689041137695, |
|
"eval_runtime": 213.9256, |
|
"eval_samples_per_second": 212.663, |
|
"eval_steps_per_second": 3.324, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 2.8156748911465894, |
|
"grad_norm": 1.42034113407135, |
|
"learning_rate": 7.184325108853412e-05, |
|
"loss": 1.9678, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 2.8156748911465894, |
|
"eval_loss": 1.0477054119110107, |
|
"eval_runtime": 214.1129, |
|
"eval_samples_per_second": 212.477, |
|
"eval_steps_per_second": 3.321, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 2.8447024673439767, |
|
"grad_norm": 1.2497634887695312, |
|
"learning_rate": 7.155297532656023e-05, |
|
"loss": 1.9499, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 2.8447024673439767, |
|
"eval_loss": 1.0382879972457886, |
|
"eval_runtime": 214.4692, |
|
"eval_samples_per_second": 212.124, |
|
"eval_steps_per_second": 3.315, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 2.8737300435413644, |
|
"grad_norm": 1.2587764263153076, |
|
"learning_rate": 7.126269956458636e-05, |
|
"loss": 1.9596, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 2.8737300435413644, |
|
"eval_loss": 1.0374723672866821, |
|
"eval_runtime": 214.3582, |
|
"eval_samples_per_second": 212.234, |
|
"eval_steps_per_second": 3.317, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 2.9027576197387517, |
|
"grad_norm": 1.2650773525238037, |
|
"learning_rate": 7.097242380261248e-05, |
|
"loss": 1.9632, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 2.9027576197387517, |
|
"eval_loss": 1.0395891666412354, |
|
"eval_runtime": 214.2184, |
|
"eval_samples_per_second": 212.372, |
|
"eval_steps_per_second": 3.319, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 2.9317851959361394, |
|
"grad_norm": 1.237382411956787, |
|
"learning_rate": 7.068214804063861e-05, |
|
"loss": 1.9448, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 2.9317851959361394, |
|
"eval_loss": 1.0347273349761963, |
|
"eval_runtime": 214.6802, |
|
"eval_samples_per_second": 211.915, |
|
"eval_steps_per_second": 3.312, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 2.9608127721335267, |
|
"grad_norm": 1.2535216808319092, |
|
"learning_rate": 7.039187227866474e-05, |
|
"loss": 1.9633, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 2.9608127721335267, |
|
"eval_loss": 1.0382635593414307, |
|
"eval_runtime": 214.2729, |
|
"eval_samples_per_second": 212.318, |
|
"eval_steps_per_second": 3.318, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 2.9898403483309144, |
|
"grad_norm": 1.2122920751571655, |
|
"learning_rate": 7.010159651669086e-05, |
|
"loss": 1.9531, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 2.9898403483309144, |
|
"eval_loss": 1.0362297296524048, |
|
"eval_runtime": 214.3174, |
|
"eval_samples_per_second": 212.274, |
|
"eval_steps_per_second": 3.318, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 3.018867924528302, |
|
"grad_norm": 1.207924723625183, |
|
"learning_rate": 6.981132075471698e-05, |
|
"loss": 1.9597, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 3.018867924528302, |
|
"eval_loss": 1.0346544981002808, |
|
"eval_runtime": 214.0838, |
|
"eval_samples_per_second": 212.506, |
|
"eval_steps_per_second": 3.321, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 3.0478955007256894, |
|
"grad_norm": 1.3156700134277344, |
|
"learning_rate": 6.95210449927431e-05, |
|
"loss": 1.9284, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 3.0478955007256894, |
|
"eval_loss": 1.0392136573791504, |
|
"eval_runtime": 214.2728, |
|
"eval_samples_per_second": 212.318, |
|
"eval_steps_per_second": 3.318, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 1.2844287157058716, |
|
"learning_rate": 6.923076923076924e-05, |
|
"loss": 1.9524, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"eval_loss": 1.0422698259353638, |
|
"eval_runtime": 214.2459, |
|
"eval_samples_per_second": 212.345, |
|
"eval_steps_per_second": 3.319, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 3.1059506531204644, |
|
"grad_norm": 1.3154046535491943, |
|
"learning_rate": 6.894049346879537e-05, |
|
"loss": 1.9321, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 3.1059506531204644, |
|
"eval_loss": 1.0372092723846436, |
|
"eval_runtime": 214.4246, |
|
"eval_samples_per_second": 212.168, |
|
"eval_steps_per_second": 3.316, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 3.134978229317852, |
|
"grad_norm": 1.30637788772583, |
|
"learning_rate": 6.865021770682148e-05, |
|
"loss": 1.9414, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 3.134978229317852, |
|
"eval_loss": 1.0316834449768066, |
|
"eval_runtime": 214.2895, |
|
"eval_samples_per_second": 212.302, |
|
"eval_steps_per_second": 3.318, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 3.1640058055152394, |
|
"grad_norm": 1.375622272491455, |
|
"learning_rate": 6.83599419448476e-05, |
|
"loss": 1.9255, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 3.1640058055152394, |
|
"eval_loss": 1.0339484214782715, |
|
"eval_runtime": 214.0141, |
|
"eval_samples_per_second": 212.575, |
|
"eval_steps_per_second": 3.322, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 3.193033381712627, |
|
"grad_norm": 1.2978899478912354, |
|
"learning_rate": 6.806966618287373e-05, |
|
"loss": 1.9384, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 3.193033381712627, |
|
"eval_loss": 1.033180832862854, |
|
"eval_runtime": 214.382, |
|
"eval_samples_per_second": 212.21, |
|
"eval_steps_per_second": 3.317, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 3.2220609579100143, |
|
"grad_norm": 1.233608603477478, |
|
"learning_rate": 6.777939042089986e-05, |
|
"loss": 1.9297, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 3.2220609579100143, |
|
"eval_loss": 1.0305285453796387, |
|
"eval_runtime": 214.0802, |
|
"eval_samples_per_second": 212.509, |
|
"eval_steps_per_second": 3.321, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 3.251088534107402, |
|
"grad_norm": 1.2634618282318115, |
|
"learning_rate": 6.748911465892598e-05, |
|
"loss": 1.9315, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 3.251088534107402, |
|
"eval_loss": 1.0329853296279907, |
|
"eval_runtime": 214.8185, |
|
"eval_samples_per_second": 211.779, |
|
"eval_steps_per_second": 3.31, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 3.28011611030479, |
|
"grad_norm": 1.3260959386825562, |
|
"learning_rate": 6.719883889695211e-05, |
|
"loss": 1.9331, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 3.28011611030479, |
|
"eval_loss": 1.0363577604293823, |
|
"eval_runtime": 214.1897, |
|
"eval_samples_per_second": 212.401, |
|
"eval_steps_per_second": 3.319, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 3.309143686502177, |
|
"grad_norm": 1.330241322517395, |
|
"learning_rate": 6.690856313497823e-05, |
|
"loss": 1.9355, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 3.309143686502177, |
|
"eval_loss": 1.0366979837417603, |
|
"eval_runtime": 214.2627, |
|
"eval_samples_per_second": 212.328, |
|
"eval_steps_per_second": 3.318, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 3.3381712626995648, |
|
"grad_norm": 1.3124949932098389, |
|
"learning_rate": 6.661828737300436e-05, |
|
"loss": 1.9141, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 3.3381712626995648, |
|
"eval_loss": 1.0314677953720093, |
|
"eval_runtime": 214.3893, |
|
"eval_samples_per_second": 212.203, |
|
"eval_steps_per_second": 3.316, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 3.367198838896952, |
|
"grad_norm": 1.2886366844177246, |
|
"learning_rate": 6.632801161103049e-05, |
|
"loss": 1.918, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 3.367198838896952, |
|
"eval_loss": 1.029552698135376, |
|
"eval_runtime": 214.4197, |
|
"eval_samples_per_second": 212.173, |
|
"eval_steps_per_second": 3.316, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 3.3962264150943398, |
|
"grad_norm": 1.4406765699386597, |
|
"learning_rate": 6.60377358490566e-05, |
|
"loss": 1.9192, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 3.3962264150943398, |
|
"eval_loss": 1.0297138690948486, |
|
"eval_runtime": 214.4728, |
|
"eval_samples_per_second": 212.12, |
|
"eval_steps_per_second": 3.315, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 3.425253991291727, |
|
"grad_norm": 1.3517920970916748, |
|
"learning_rate": 6.574746008708274e-05, |
|
"loss": 1.9146, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 3.425253991291727, |
|
"eval_loss": 1.0310994386672974, |
|
"eval_runtime": 214.5121, |
|
"eval_samples_per_second": 212.081, |
|
"eval_steps_per_second": 3.314, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 3.4542815674891147, |
|
"grad_norm": 1.31048583984375, |
|
"learning_rate": 6.545718432510885e-05, |
|
"loss": 1.9235, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 3.4542815674891147, |
|
"eval_loss": 1.029317021369934, |
|
"eval_runtime": 214.205, |
|
"eval_samples_per_second": 212.385, |
|
"eval_steps_per_second": 3.319, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 3.483309143686502, |
|
"grad_norm": 1.2714518308639526, |
|
"learning_rate": 6.516690856313497e-05, |
|
"loss": 1.9161, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 3.483309143686502, |
|
"eval_loss": 1.0265744924545288, |
|
"eval_runtime": 214.3435, |
|
"eval_samples_per_second": 212.248, |
|
"eval_steps_per_second": 3.317, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 3.5123367198838897, |
|
"grad_norm": 1.274511456489563, |
|
"learning_rate": 6.487663280116111e-05, |
|
"loss": 1.9295, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 3.5123367198838897, |
|
"eval_loss": 1.026885747909546, |
|
"eval_runtime": 214.622, |
|
"eval_samples_per_second": 211.973, |
|
"eval_steps_per_second": 3.313, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 3.5413642960812775, |
|
"grad_norm": 1.4020469188690186, |
|
"learning_rate": 6.458635703918723e-05, |
|
"loss": 1.9214, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 3.5413642960812775, |
|
"eval_loss": 1.0313502550125122, |
|
"eval_runtime": 214.5336, |
|
"eval_samples_per_second": 212.06, |
|
"eval_steps_per_second": 3.314, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 3.5703918722786647, |
|
"grad_norm": 1.329451322555542, |
|
"learning_rate": 6.429608127721336e-05, |
|
"loss": 1.8986, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 3.5703918722786647, |
|
"eval_loss": 1.027103304862976, |
|
"eval_runtime": 215.1871, |
|
"eval_samples_per_second": 211.416, |
|
"eval_steps_per_second": 3.304, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 3.599419448476052, |
|
"grad_norm": 1.2377736568450928, |
|
"learning_rate": 6.400580551523948e-05, |
|
"loss": 1.8982, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 3.599419448476052, |
|
"eval_loss": 1.0257542133331299, |
|
"eval_runtime": 214.3661, |
|
"eval_samples_per_second": 212.226, |
|
"eval_steps_per_second": 3.317, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 3.6284470246734397, |
|
"grad_norm": 1.2443993091583252, |
|
"learning_rate": 6.37155297532656e-05, |
|
"loss": 1.909, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 3.6284470246734397, |
|
"eval_loss": 1.0211207866668701, |
|
"eval_runtime": 214.1724, |
|
"eval_samples_per_second": 212.418, |
|
"eval_steps_per_second": 3.32, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 3.6574746008708274, |
|
"grad_norm": 1.3550719022750854, |
|
"learning_rate": 6.342525399129173e-05, |
|
"loss": 1.8973, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 3.6574746008708274, |
|
"eval_loss": 1.0253050327301025, |
|
"eval_runtime": 214.3603, |
|
"eval_samples_per_second": 212.231, |
|
"eval_steps_per_second": 3.317, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 3.6865021770682147, |
|
"grad_norm": 1.2715822458267212, |
|
"learning_rate": 6.313497822931786e-05, |
|
"loss": 1.8928, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 3.6865021770682147, |
|
"eval_loss": 1.0215857028961182, |
|
"eval_runtime": 214.2543, |
|
"eval_samples_per_second": 212.336, |
|
"eval_steps_per_second": 3.318, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 3.7155297532656024, |
|
"grad_norm": 1.230591893196106, |
|
"learning_rate": 6.284470246734397e-05, |
|
"loss": 1.8998, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 3.7155297532656024, |
|
"eval_loss": 1.0226044654846191, |
|
"eval_runtime": 214.8109, |
|
"eval_samples_per_second": 211.786, |
|
"eval_steps_per_second": 3.31, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 3.7445573294629897, |
|
"grad_norm": 1.2558367252349854, |
|
"learning_rate": 6.25544267053701e-05, |
|
"loss": 1.9083, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 3.7445573294629897, |
|
"eval_loss": 1.0279453992843628, |
|
"eval_runtime": 214.5403, |
|
"eval_samples_per_second": 212.053, |
|
"eval_steps_per_second": 3.314, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 3.7735849056603774, |
|
"grad_norm": 1.3605984449386597, |
|
"learning_rate": 6.226415094339622e-05, |
|
"loss": 1.8947, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 3.7735849056603774, |
|
"eval_loss": 1.0254005193710327, |
|
"eval_runtime": 214.5443, |
|
"eval_samples_per_second": 212.049, |
|
"eval_steps_per_second": 3.314, |
|
"step": 13000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 34450, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 5 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3003632111570125e+17, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|