|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 620, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0032258064516129032, |
|
"grad_norm": 3.971249580383301, |
|
"learning_rate": 1.6129032258064518e-07, |
|
"loss": 0.6284, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.016129032258064516, |
|
"grad_norm": 5.593815326690674, |
|
"learning_rate": 8.064516129032258e-07, |
|
"loss": 0.6406, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.03225806451612903, |
|
"grad_norm": 3.338022232055664, |
|
"learning_rate": 1.6129032258064516e-06, |
|
"loss": 0.6179, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04838709677419355, |
|
"grad_norm": 2.572110891342163, |
|
"learning_rate": 2.4193548387096776e-06, |
|
"loss": 0.5695, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.06451612903225806, |
|
"grad_norm": 2.096151113510132, |
|
"learning_rate": 3.225806451612903e-06, |
|
"loss": 0.5072, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08064516129032258, |
|
"grad_norm": 2.135874032974243, |
|
"learning_rate": 4.032258064516129e-06, |
|
"loss": 0.4539, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0967741935483871, |
|
"grad_norm": 1.8665850162506104, |
|
"learning_rate": 4.838709677419355e-06, |
|
"loss": 0.394, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.11290322580645161, |
|
"grad_norm": 1.8506455421447754, |
|
"learning_rate": 5.645161290322582e-06, |
|
"loss": 0.3099, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.12903225806451613, |
|
"grad_norm": 1.882623314857483, |
|
"learning_rate": 6.451612903225806e-06, |
|
"loss": 0.2164, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.14516129032258066, |
|
"grad_norm": 2.4444594383239746, |
|
"learning_rate": 7.258064516129033e-06, |
|
"loss": 0.1262, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.16129032258064516, |
|
"grad_norm": 1.5979326963424683, |
|
"learning_rate": 8.064516129032258e-06, |
|
"loss": 0.068, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1774193548387097, |
|
"grad_norm": 3.323439359664917, |
|
"learning_rate": 8.870967741935484e-06, |
|
"loss": 0.0337, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.1935483870967742, |
|
"grad_norm": 1.5633885860443115, |
|
"learning_rate": 9.67741935483871e-06, |
|
"loss": 0.0272, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.20967741935483872, |
|
"grad_norm": 1.1317998170852661, |
|
"learning_rate": 9.99928681279855e-06, |
|
"loss": 0.0208, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.22580645161290322, |
|
"grad_norm": 0.4592137932777405, |
|
"learning_rate": 9.994929183335237e-06, |
|
"loss": 0.0158, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.24193548387096775, |
|
"grad_norm": 0.7832735180854797, |
|
"learning_rate": 9.986613588305435e-06, |
|
"loss": 0.0145, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.25806451612903225, |
|
"grad_norm": 0.8761097192764282, |
|
"learning_rate": 9.974346616959476e-06, |
|
"loss": 0.0104, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.27419354838709675, |
|
"grad_norm": 0.9911714196205139, |
|
"learning_rate": 9.95813798960538e-06, |
|
"loss": 0.0115, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.2903225806451613, |
|
"grad_norm": 0.48290345072746277, |
|
"learning_rate": 9.938000549906509e-06, |
|
"loss": 0.0092, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3064516129032258, |
|
"grad_norm": 0.8782544136047363, |
|
"learning_rate": 9.913950254704291e-06, |
|
"loss": 0.0167, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.3225806451612903, |
|
"grad_norm": 0.827684223651886, |
|
"learning_rate": 9.88600616137407e-06, |
|
"loss": 0.0087, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3387096774193548, |
|
"grad_norm": 0.4351416826248169, |
|
"learning_rate": 9.854190412724114e-06, |
|
"loss": 0.0092, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.3548387096774194, |
|
"grad_norm": 0.617416262626648, |
|
"learning_rate": 9.818528219449705e-06, |
|
"loss": 0.0106, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3709677419354839, |
|
"grad_norm": 0.3410218060016632, |
|
"learning_rate": 9.779047840156288e-06, |
|
"loss": 0.0092, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.3870967741935484, |
|
"grad_norm": 0.6016420722007751, |
|
"learning_rate": 9.735780558967434e-06, |
|
"loss": 0.0081, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4032258064516129, |
|
"grad_norm": 0.34005704522132874, |
|
"learning_rate": 9.688760660735403e-06, |
|
"loss": 0.0075, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.41935483870967744, |
|
"grad_norm": 1.3894850015640259, |
|
"learning_rate": 9.638025403873939e-06, |
|
"loss": 0.0074, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.43548387096774194, |
|
"grad_norm": 0.3171682059764862, |
|
"learning_rate": 9.58361499083483e-06, |
|
"loss": 0.007, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.45161290322580644, |
|
"grad_norm": 0.5876194834709167, |
|
"learning_rate": 9.525572536251608e-06, |
|
"loss": 0.0085, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.46774193548387094, |
|
"grad_norm": 0.47260814905166626, |
|
"learning_rate": 9.46394403277566e-06, |
|
"loss": 0.0067, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.4838709677419355, |
|
"grad_norm": 0.6115548014640808, |
|
"learning_rate": 9.398778314631801e-06, |
|
"loss": 0.0084, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.44270601868629456, |
|
"learning_rate": 9.330127018922195e-06, |
|
"loss": 0.0063, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.5161290322580645, |
|
"grad_norm": 0.5065975785255432, |
|
"learning_rate": 9.258044544709276e-06, |
|
"loss": 0.0079, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.532258064516129, |
|
"grad_norm": 0.9441617131233215, |
|
"learning_rate": 9.182588009910119e-06, |
|
"loss": 0.0075, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.5483870967741935, |
|
"grad_norm": 0.7031365036964417, |
|
"learning_rate": 9.103817206036383e-06, |
|
"loss": 0.0067, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5645161290322581, |
|
"grad_norm": 0.6422185301780701, |
|
"learning_rate": 9.021794550815713e-06, |
|
"loss": 0.0088, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.5806451612903226, |
|
"grad_norm": 0.547900378704071, |
|
"learning_rate": 8.936585038732143e-06, |
|
"loss": 0.006, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5967741935483871, |
|
"grad_norm": 0.6972706913948059, |
|
"learning_rate": 8.848256189524661e-06, |
|
"loss": 0.0057, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.6129032258064516, |
|
"grad_norm": 0.37197422981262207, |
|
"learning_rate": 8.756877994684818e-06, |
|
"loss": 0.0049, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6290322580645161, |
|
"grad_norm": 0.2147480994462967, |
|
"learning_rate": 8.66252286199567e-06, |
|
"loss": 0.0081, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.6451612903225806, |
|
"grad_norm": 0.44903531670570374, |
|
"learning_rate": 8.565265558156101e-06, |
|
"loss": 0.0055, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6612903225806451, |
|
"grad_norm": 1.0128227472305298, |
|
"learning_rate": 8.465183149535939e-06, |
|
"loss": 0.0054, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.6774193548387096, |
|
"grad_norm": 0.4877210557460785, |
|
"learning_rate": 8.362354941108803e-06, |
|
"loss": 0.0057, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.6935483870967742, |
|
"grad_norm": 0.2644139230251312, |
|
"learning_rate": 8.256862413611113e-06, |
|
"loss": 0.005, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.7096774193548387, |
|
"grad_norm": 0.6257811784744263, |
|
"learning_rate": 8.148789158977012e-06, |
|
"loss": 0.0055, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.7258064516129032, |
|
"grad_norm": 0.19504040479660034, |
|
"learning_rate": 8.038220814100403e-06, |
|
"loss": 0.005, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.7419354838709677, |
|
"grad_norm": 0.5125346779823303, |
|
"learning_rate": 7.925244992976538e-06, |
|
"loss": 0.0055, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7580645161290323, |
|
"grad_norm": 0.46777665615081787, |
|
"learning_rate": 7.809951217276986e-06, |
|
"loss": 0.0052, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.7741935483870968, |
|
"grad_norm": 0.5273892283439636, |
|
"learning_rate": 7.692430845412946e-06, |
|
"loss": 0.006, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7903225806451613, |
|
"grad_norm": 0.5478794574737549, |
|
"learning_rate": 7.572777000143145e-06, |
|
"loss": 0.0078, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.8064516129032258, |
|
"grad_norm": 0.35614484548568726, |
|
"learning_rate": 7.451084494783668e-06, |
|
"loss": 0.0051, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8225806451612904, |
|
"grad_norm": 0.42776939272880554, |
|
"learning_rate": 7.327449758078194e-06, |
|
"loss": 0.0057, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.8387096774193549, |
|
"grad_norm": 0.19282685220241547, |
|
"learning_rate": 7.201970757788172e-06, |
|
"loss": 0.0039, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8548387096774194, |
|
"grad_norm": 0.7167965173721313, |
|
"learning_rate": 7.074746923063497e-06, |
|
"loss": 0.0042, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.8709677419354839, |
|
"grad_norm": 3.5436365604400635, |
|
"learning_rate": 6.945879065655164e-06, |
|
"loss": 0.0042, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8870967741935484, |
|
"grad_norm": 0.3570103347301483, |
|
"learning_rate": 6.815469300032374e-06, |
|
"loss": 0.0043, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.9032258064516129, |
|
"grad_norm": 0.252421110868454, |
|
"learning_rate": 6.6836209624673575e-06, |
|
"loss": 0.0056, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.9193548387096774, |
|
"grad_norm": 0.2856729030609131, |
|
"learning_rate": 6.5504385291520554e-06, |
|
"loss": 0.0052, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.9354838709677419, |
|
"grad_norm": 0.25228211283683777, |
|
"learning_rate": 6.41602753341152e-06, |
|
"loss": 0.0055, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9516129032258065, |
|
"grad_norm": 0.29258614778518677, |
|
"learning_rate": 6.2804944820796596e-06, |
|
"loss": 0.0037, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.967741935483871, |
|
"grad_norm": 0.1690083146095276, |
|
"learning_rate": 6.143946771103561e-06, |
|
"loss": 0.0032, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9838709677419355, |
|
"grad_norm": 0.45113542675971985, |
|
"learning_rate": 6.006492600443301e-06, |
|
"loss": 0.0036, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.34520477056503296, |
|
"learning_rate": 5.8682408883346535e-06, |
|
"loss": 0.0049, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.004980116616934538, |
|
"eval_runtime": 3.8044, |
|
"eval_samples_per_second": 0.789, |
|
"eval_steps_per_second": 0.789, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.0161290322580645, |
|
"grad_norm": 0.19288980960845947, |
|
"learning_rate": 5.729301184982622e-06, |
|
"loss": 0.0034, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.032258064516129, |
|
"grad_norm": 0.19823557138442993, |
|
"learning_rate": 5.5897835857542315e-06, |
|
"loss": 0.0026, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.0483870967741935, |
|
"grad_norm": 0.09697025269269943, |
|
"learning_rate": 5.449798643939305e-06, |
|
"loss": 0.0024, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.064516129032258, |
|
"grad_norm": 0.23041202127933502, |
|
"learning_rate": 5.30945728314841e-06, |
|
"loss": 0.0028, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.0806451612903225, |
|
"grad_norm": 0.27025359869003296, |
|
"learning_rate": 5.168870709417342e-06, |
|
"loss": 0.0034, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.096774193548387, |
|
"grad_norm": 0.3000248968601227, |
|
"learning_rate": 5.0281503230878304e-06, |
|
"loss": 0.0033, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.1129032258064515, |
|
"grad_norm": 0.22583173215389252, |
|
"learning_rate": 4.887407630534271e-06, |
|
"loss": 0.0027, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.129032258064516, |
|
"grad_norm": 0.21011537313461304, |
|
"learning_rate": 4.746754155806437e-06, |
|
"loss": 0.0026, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.1451612903225807, |
|
"grad_norm": 0.13884232938289642, |
|
"learning_rate": 4.606301352258192e-06, |
|
"loss": 0.0028, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.1612903225806452, |
|
"grad_norm": 0.25201186537742615, |
|
"learning_rate": 4.466160514232206e-06, |
|
"loss": 0.0027, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.1774193548387097, |
|
"grad_norm": 0.22268442809581757, |
|
"learning_rate": 4.326442688870697e-06, |
|
"loss": 0.0028, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.1935483870967742, |
|
"grad_norm": 0.21970783174037933, |
|
"learning_rate": 4.187258588122019e-06, |
|
"loss": 0.0031, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.2096774193548387, |
|
"grad_norm": 0.26229333877563477, |
|
"learning_rate": 4.048718501012895e-06, |
|
"loss": 0.0026, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.2258064516129032, |
|
"grad_norm": 0.25772520899772644, |
|
"learning_rate": 3.910932206255742e-06, |
|
"loss": 0.0022, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.2419354838709677, |
|
"grad_norm": 0.4050711691379547, |
|
"learning_rate": 3.77400888526038e-06, |
|
"loss": 0.0023, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.2580645161290323, |
|
"grad_norm": 0.2486872673034668, |
|
"learning_rate": 3.6380570356190346e-06, |
|
"loss": 0.0036, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.2741935483870968, |
|
"grad_norm": 0.20115166902542114, |
|
"learning_rate": 3.5031843851332105e-06, |
|
"loss": 0.0017, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.2903225806451613, |
|
"grad_norm": 0.19394518435001373, |
|
"learning_rate": 3.3694978064505258e-06, |
|
"loss": 0.0029, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.3064516129032258, |
|
"grad_norm": 0.25958162546157837, |
|
"learning_rate": 3.2371032323791757e-06, |
|
"loss": 0.003, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.3225806451612903, |
|
"grad_norm": 0.27680277824401855, |
|
"learning_rate": 3.10610557194712e-06, |
|
"loss": 0.002, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.3387096774193548, |
|
"grad_norm": 0.31953132152557373, |
|
"learning_rate": 2.97660862727252e-06, |
|
"loss": 0.0039, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.3548387096774195, |
|
"grad_norm": 0.3594481647014618, |
|
"learning_rate": 2.848715011311271e-06, |
|
"loss": 0.0031, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.370967741935484, |
|
"grad_norm": 0.15407758951187134, |
|
"learning_rate": 2.72252606654683e-06, |
|
"loss": 0.0032, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.3870967741935485, |
|
"grad_norm": 0.25268280506134033, |
|
"learning_rate": 2.5981417846867753e-06, |
|
"loss": 0.0029, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.403225806451613, |
|
"grad_norm": 0.19317582249641418, |
|
"learning_rate": 2.4756607274296844e-06, |
|
"loss": 0.0035, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.4193548387096775, |
|
"grad_norm": 0.15693552792072296, |
|
"learning_rate": 2.3551799483651894e-06, |
|
"loss": 0.0026, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.435483870967742, |
|
"grad_norm": 0.19608916342258453, |
|
"learning_rate": 2.236794916069007e-06, |
|
"loss": 0.0028, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.4516129032258065, |
|
"grad_norm": 0.403870552778244, |
|
"learning_rate": 2.120599438453968e-06, |
|
"loss": 0.0032, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.467741935483871, |
|
"grad_norm": 0.16400307416915894, |
|
"learning_rate": 2.0066855884369246e-06, |
|
"loss": 0.0025, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.4838709677419355, |
|
"grad_norm": 0.954981803894043, |
|
"learning_rate": 1.8951436309804766e-06, |
|
"loss": 0.0024, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.23671939969062805, |
|
"learning_rate": 1.7860619515673034e-06, |
|
"loss": 0.0029, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.5161290322580645, |
|
"grad_norm": 0.1934683918952942, |
|
"learning_rate": 1.6795269861638041e-06, |
|
"loss": 0.0033, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.532258064516129, |
|
"grad_norm": 0.12341190874576569, |
|
"learning_rate": 1.5756231527285181e-06, |
|
"loss": 0.0027, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.5483870967741935, |
|
"grad_norm": 0.44817498326301575, |
|
"learning_rate": 1.4744327843196043e-06, |
|
"loss": 0.0034, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.564516129032258, |
|
"grad_norm": 0.29806482791900635, |
|
"learning_rate": 1.3760360638544012e-06, |
|
"loss": 0.0031, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.5806451612903225, |
|
"grad_norm": 0.19129404425621033, |
|
"learning_rate": 1.280510960572745e-06, |
|
"loss": 0.0017, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.596774193548387, |
|
"grad_norm": 0.21360230445861816, |
|
"learning_rate": 1.1879331682543972e-06, |
|
"loss": 0.0034, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.6129032258064515, |
|
"grad_norm": 0.2776956260204315, |
|
"learning_rate": 1.0983760452395415e-06, |
|
"loss": 0.0021, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.629032258064516, |
|
"grad_norm": 0.15755566954612732, |
|
"learning_rate": 1.01191055629987e-06, |
|
"loss": 0.0019, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.6451612903225805, |
|
"grad_norm": 0.3595326840877533, |
|
"learning_rate": 9.286052164063369e-07, |
|
"loss": 0.0023, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.661290322580645, |
|
"grad_norm": 0.19125986099243164, |
|
"learning_rate": 8.485260364381187e-07, |
|
"loss": 0.0041, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.6774193548387095, |
|
"grad_norm": 0.3362486958503723, |
|
"learning_rate": 7.717364708758024e-07, |
|
"loss": 0.002, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.6935483870967742, |
|
"grad_norm": 0.23303864896297455, |
|
"learning_rate": 6.982973675202676e-07, |
|
"loss": 0.0017, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.7096774193548387, |
|
"grad_norm": 0.13133780658245087, |
|
"learning_rate": 6.282669192770896e-07, |
|
"loss": 0.0024, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.7258064516129032, |
|
"grad_norm": 0.35981041193008423, |
|
"learning_rate": 5.617006180446688e-07, |
|
"loss": 0.0033, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.7419354838709677, |
|
"grad_norm": 0.41394540667533875, |
|
"learning_rate": 4.986512107426283e-07, |
|
"loss": 0.003, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.7580645161290323, |
|
"grad_norm": 0.24306242167949677, |
|
"learning_rate": 4.3916865751533313e-07, |
|
"loss": 0.003, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.7741935483870968, |
|
"grad_norm": 0.1294822096824646, |
|
"learning_rate": 3.8330009214363197e-07, |
|
"loss": 0.0028, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.7903225806451613, |
|
"grad_norm": 0.12696751952171326, |
|
"learning_rate": 3.310897846962041e-07, |
|
"loss": 0.0027, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.8064516129032258, |
|
"grad_norm": 0.27052465081214905, |
|
"learning_rate": 2.8257910645009935e-07, |
|
"loss": 0.0034, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.8225806451612905, |
|
"grad_norm": 0.21007302403450012, |
|
"learning_rate": 2.3780649710827552e-07, |
|
"loss": 0.0029, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.838709677419355, |
|
"grad_norm": 0.20920442044734955, |
|
"learning_rate": 1.9680743434010385e-07, |
|
"loss": 0.0026, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.8548387096774195, |
|
"grad_norm": 0.2021329551935196, |
|
"learning_rate": 1.5961440566897913e-07, |
|
"loss": 0.0026, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.870967741935484, |
|
"grad_norm": 0.23637109994888306, |
|
"learning_rate": 1.2625688272930925e-07, |
|
"loss": 0.0036, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.8870967741935485, |
|
"grad_norm": 0.17650996148586273, |
|
"learning_rate": 9.676129791329481e-08, |
|
"loss": 0.0024, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.903225806451613, |
|
"grad_norm": 0.19747471809387207, |
|
"learning_rate": 7.115102342598101e-08, |
|
"loss": 0.0035, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.9193548387096775, |
|
"grad_norm": 0.6039708852767944, |
|
"learning_rate": 4.944635276520393e-08, |
|
"loss": 0.0029, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.935483870967742, |
|
"grad_norm": 0.3589779734611511, |
|
"learning_rate": 3.166448464108629e-08, |
|
"loss": 0.0049, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.9516129032258065, |
|
"grad_norm": 0.12553617358207703, |
|
"learning_rate": 1.781950934783505e-08, |
|
"loss": 0.0027, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.967741935483871, |
|
"grad_norm": 0.1768651306629181, |
|
"learning_rate": 7.922397598642551e-09, |
|
"loss": 0.0018, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.9838709677419355, |
|
"grad_norm": 0.24496199190616608, |
|
"learning_rate": 1.980991832524759e-09, |
|
"loss": 0.0021, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.2647761106491089, |
|
"learning_rate": 0.0, |
|
"loss": 0.0026, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.004049910232424736, |
|
"eval_runtime": 3.8097, |
|
"eval_samples_per_second": 0.787, |
|
"eval_steps_per_second": 0.787, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 620, |
|
"total_flos": 1.1181005894949274e+17, |
|
"train_loss": 0.03632537112270873, |
|
"train_runtime": 2624.7531, |
|
"train_samples_per_second": 0.236, |
|
"train_steps_per_second": 0.236 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 620, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1181005894949274e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|