{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.9934640522875817,
  "global_step": 304,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.04, "learning_rate": 0.0001125, "loss": 3.8539, "step": 3 },
    { "epoch": 0.08, "learning_rate": 0.000225, "loss": 3.5808, "step": 6 },
    { "epoch": 0.12, "learning_rate": 0.00033749999999999996, "loss": 3.335, "step": 9 },
    { "epoch": 0.16, "learning_rate": 0.00045, "loss": 3.2031, "step": 12 },
    { "epoch": 0.2, "learning_rate": 0.0005625, "loss": 3.0706, "step": 15 },
    { "epoch": 0.24, "learning_rate": 0.0005999286081239726, "loss": 3.084, "step": 18 },
    { "epoch": 0.27, "learning_rate": 0.0005995538936819289, "loss": 2.9744, "step": 21 },
    { "epoch": 0.31, "learning_rate": 0.0005988584094275236, "loss": 2.8987, "step": 24 },
    { "epoch": 0.35, "learning_rate": 0.0005978429001027164, "loss": 2.8293, "step": 27 },
    { "epoch": 0.39, "learning_rate": 0.0005965084531403281, "loss": 2.7609, "step": 30 },
    { "epoch": 0.43, "learning_rate": 0.0005948564974995903, "loss": 2.7155, "step": 33 },
    { "epoch": 0.47, "learning_rate": 0.00059288880213598, "loss": 2.7184, "step": 36 },
    { "epoch": 0.51, "learning_rate": 0.0005906074741069779, "loss": 2.6533, "step": 39 },
    { "epoch": 0.55, "learning_rate": 0.0005880149563157786, "loss": 2.6509, "step": 42 },
    { "epoch": 0.59, "learning_rate": 0.0005851140248953683, "loss": 2.5859, "step": 45 },
    { "epoch": 0.63, "learning_rate": 0.0005819077862357724, "loss": 2.5516, "step": 48 },
    { "epoch": 0.67, "learning_rate": 0.0005783996736576553, "loss": 2.5855, "step": 51 },
    { "epoch": 0.71, "learning_rate": 0.0005745934437358341, "loss": 2.5306, "step": 54 },
    { "epoch": 0.75, "learning_rate": 0.0005704931722766448, "loss": 2.5159, "step": 57 },
    { "epoch": 0.78, "learning_rate": 0.0005661032499534664, "loss": 2.5304, "step": 60 },
    { "epoch": 0.82, "learning_rate": 0.0005614283776050784, "loss": 2.5005, "step": 63 },
    { "epoch": 0.86, "learning_rate": 0.0005564735612018839, "loss": 2.4884, "step": 66 },
    { "epoch": 0.9, "learning_rate": 0.0005512441064853923, "loss": 2.4729, "step": 69 },
    { "epoch": 0.94, "learning_rate": 0.0005457456132866975, "loss": 2.4226, "step": 72 },
    { "epoch": 0.98, "learning_rate": 0.0005399839695300389, "loss": 2.412, "step": 75 },
    { "epoch": 0.99, "eval_accuracy": 0.5457690353760842, "eval_loss": 2.502744436264038, "eval_runtime": 56.9596, "eval_samples_per_second": 19.154, "eval_steps_per_second": 19.154, "step": 76 },
    { "epoch": 1.03, "learning_rate": 0.0005339653449278644, "loss": 2.6265, "step": 78 },
    { "epoch": 1.07, "learning_rate": 0.0005276961843741485, "loss": 2.148, "step": 81 },
    { "epoch": 1.1, "learning_rate": 0.0005211832010430372, "loss": 2.1056, "step": 84 },
    { "epoch": 1.14, "learning_rate": 0.0005144333692002139, "loss": 2.1074, "step": 87 },
    { "epoch": 1.18, "learning_rate": 0.0005074539167346808, "loss": 2.118, "step": 90 },
    { "epoch": 1.22, "learning_rate": 0.0005002523174189542, "loss": 2.1045, "step": 93 },
    { "epoch": 1.26, "learning_rate": 0.0004928362829059618, "loss": 2.061, "step": 96 },
    { "epoch": 1.3, "learning_rate": 0.0004852137544712115, "loss": 2.0959, "step": 99 },
    { "epoch": 1.34, "learning_rate": 0.0004773928945090747, "loss": 2.0637, "step": 102 },
    { "epoch": 1.38, "learning_rate": 0.0004693820777922901, "loss": 2.004, "step": 105 },
    { "epoch": 1.42, "learning_rate": 0.00046118988250404714, "loss": 2.0516, "step": 108 },
    { "epoch": 1.46, "learning_rate": 0.00045282508105225254, "loss": 2.1182, "step": 111 },
    { "epoch": 1.5, "learning_rate": 0.00044429663067581626, "loss": 2.0648, "step": 114 },
    { "epoch": 1.54, "learning_rate": 0.0004356136638530159, "loss": 2.058, "step": 117 },
    { "epoch": 1.58, "learning_rate": 0.0004267854785222098, "loss": 2.012, "step": 120 },
    { "epoch": 1.61, "learning_rate": 0.00041782152812537223, "loss": 2.0105, "step": 123 },
    { "epoch": 1.65, "learning_rate": 0.00040873141148511043, "loss": 1.9976, "step": 126 },
    { "epoch": 1.69, "learning_rate": 0.00039952486252600565, "loss": 2.0034, "step": 129 },
    { "epoch": 1.73, "learning_rate": 0.00039021173985128186, "loss": 2.0171, "step": 132 },
    { "epoch": 1.77, "learning_rate": 0.00038080201618596784, "loss": 2.0163, "step": 135 },
    { "epoch": 1.81, "learning_rate": 0.0003713057676978519, "loss": 1.9683, "step": 138 },
    { "epoch": 1.85, "learning_rate": 0.00036173316320767046, "loss": 2.0209, "step": 141 },
    { "epoch": 1.89, "learning_rate": 0.0003520944533000791, "loss": 1.9655, "step": 144 },
    { "epoch": 1.93, "learning_rate": 0.0003423999593470703, "loss": 2.0127, "step": 147 },
    { "epoch": 1.97, "learning_rate": 0.00033266006245558934, "loss": 1.9702, "step": 150 },
    { "epoch": 1.99, "eval_accuracy": 0.5849799255079998, "eval_loss": 2.2756919860839844, "eval_runtime": 56.7417, "eval_samples_per_second": 19.227, "eval_steps_per_second": 19.227, "step": 152 },
    { "epoch": 2.01, "learning_rate": 0.00032288519235118573, "loss": 2.1744, "step": 153 },
    { "epoch": 2.05, "learning_rate": 0.00031308581620960083, "loss": 1.6022, "step": 156 },
    { "epoch": 2.09, "learning_rate": 0.0003032724274482547, "loss": 1.6168, "step": 159 },
    { "epoch": 2.13, "learning_rate": 0.0002934555344896317, "loss": 1.5467, "step": 162 },
    { "epoch": 2.17, "learning_rate": 0.00028364564950859807, "loss": 1.6023, "step": 165 },
    { "epoch": 2.21, "learning_rate": 0.0002738532771757025, "loss": 1.4977, "step": 168 },
    { "epoch": 2.25, "learning_rate": 0.0002640889034085113, "loss": 1.5448, "step": 171 },
    { "epoch": 2.29, "learning_rate": 0.00025436298414302494, "loss": 1.5738, "step": 174 },
    { "epoch": 2.33, "learning_rate": 0.000244685934137201, "loss": 1.559, "step": 177 },
    { "epoch": 2.37, "learning_rate": 0.00023506811581856912, "loss": 1.5734, "step": 180 },
    { "epoch": 2.41, "learning_rate": 0.00022551982818788506, "loss": 1.4986, "step": 183 },
    { "epoch": 2.44, "learning_rate": 0.00021605129579070238, "loss": 1.545, "step": 186 },
    { "epoch": 2.48, "learning_rate": 0.00020667265776867276, "loss": 1.5496, "step": 189 },
    { "epoch": 2.52, "learning_rate": 0.00019739395700229937, "loss": 1.5426, "step": 192 },
    { "epoch": 2.56, "learning_rate": 0.0001882251293567691, "loss": 1.4687, "step": 195 },
    { "epoch": 2.6, "learning_rate": 0.00017917599304237886, "loss": 1.5415, "step": 198 },
    { "epoch": 2.64, "learning_rate": 0.0001702562381009501, "loss": 1.5289, "step": 201 },
    { "epoch": 2.68, "learning_rate": 0.0001614754160294899, "loss": 1.5449, "step": 204 },
    { "epoch": 2.72, "learning_rate": 0.0001528429295522076, "loss": 1.5273, "step": 207 },
    { "epoch": 2.76, "learning_rate": 0.0001443680225518435, "loss": 1.5146, "step": 210 },
    { "epoch": 2.8, "learning_rate": 0.000136059770171087, "loss": 1.5164, "step": 213 },
    { "epoch": 2.84, "learning_rate": 0.00012792706909468623, "loss": 1.5239, "step": 216 },
    { "epoch": 2.88, "learning_rate": 0.00011997862802265573, "loss": 1.4905, "step": 219 },
    { "epoch": 2.92, "learning_rate": 0.00011222295834478227, "loss": 1.4968, "step": 222 },
    { "epoch": 2.95, "learning_rate": 0.0001046683650264153, "loss": 1.4934, "step": 225 },
    { "epoch": 2.99, "learning_rate": 9.732293771530192e-05, "loss": 1.4628, "step": 228 },
    { "epoch": 2.99, "eval_accuracy": 0.6081706452777681, "eval_loss": 2.2162108421325684, "eval_runtime": 57.053, "eval_samples_per_second": 19.123, "eval_steps_per_second": 19.123, "step": 228 },
    { "epoch": 3.04, "learning_rate": 9.019454207898983e-05, "loss": 1.4385, "step": 231 },
    { "epoch": 3.08, "learning_rate": 8.329081138207334e-05, "loss": 1.2032, "step": 234 },
    { "epoch": 3.12, "learning_rate": 7.661913831230212e-05, "loss": 1.1659, "step": 237 },
    { "epoch": 3.16, "learning_rate": 7.018666706430662e-05, "loss": 1.1521, "step": 240 },
    { "epoch": 3.2, "learning_rate": 6.40002856894149e-05, "loss": 1.1916, "step": 243 },
    { "epoch": 3.24, "learning_rate": 5.8066618719755195e-05, "loss": 1.163, "step": 246 },
    { "epoch": 3.27, "learning_rate": 5.239202007454086e-05, "loss": 1.1624, "step": 249 },
    { "epoch": 3.31, "learning_rate": 4.698256625613435e-05, "loss": 1.1203, "step": 252 },
    { "epoch": 3.35, "learning_rate": 4.1844049843176334e-05, "loss": 1.1709, "step": 255 },
    { "epoch": 3.39, "learning_rate": 3.698197328774769e-05, "loss": 1.1263, "step": 258 },
    { "epoch": 3.43, "learning_rate": 3.2401543023205764e-05, "loss": 1.1277, "step": 261 },
    { "epoch": 3.47, "learning_rate": 2.8107663889005016e-05, "loss": 1.1759, "step": 264 },
    { "epoch": 3.51, "learning_rate": 2.410493387847232e-05, "loss": 1.157, "step": 267 },
    { "epoch": 3.55, "learning_rate": 2.0397639215160466e-05, "loss": 1.1721, "step": 270 },
    { "epoch": 3.59, "learning_rate": 1.698974976305243e-05, "loss": 1.1577, "step": 273 },
    { "epoch": 3.63, "learning_rate": 1.3884914775531952e-05, "loss": 1.1546, "step": 276 },
    { "epoch": 3.67, "learning_rate": 1.1086458987671187e-05, "loss": 1.1564, "step": 279 },
    { "epoch": 3.71, "learning_rate": 8.59737905602157e-06, "loss": 1.1241, "step": 282 },
    { "epoch": 3.75, "learning_rate": 6.4203403497185e-06, "loss": 1.1605, "step": 285 },
    { "epoch": 3.78, "learning_rate": 4.557674096337593e-06, "loss": 1.1114, "step": 288 },
    { "epoch": 3.82, "learning_rate": 3.011374885557638e-06, "loss": 1.1377, "step": 291 },
    { "epoch": 3.86, "learning_rate": 1.783098533304106e-06, "loss": 1.1541, "step": 294 },
    { "epoch": 3.9, "learning_rate": 8.741603086600102e-07, "loss": 1.174, "step": 297 },
    { "epoch": 3.94, "learning_rate": 2.855335254426605e-07, "loss": 1.1415, "step": 300 },
    { "epoch": 3.98, "learning_rate": 1.7848499955075423e-08, "loss": 1.1662, "step": 303 },
    { "epoch": 3.99, "eval_accuracy": 0.611281497151223, "eval_loss": 2.2855756282806396, "eval_runtime": 56.9619, "eval_samples_per_second": 19.153, "eval_steps_per_second": 19.153, "step": 304 },
    { "epoch": 3.99, "step": 304, "total_flos": 2.9714731304484864e+16, "train_loss": 1.8919498779271777, "train_runtime": 5682.4938, "train_samples_per_second": 6.892, "train_steps_per_second": 0.053 }
  ],
  "max_steps": 304,
  "num_train_epochs": 4,
  "total_flos": 2.9714731304484864e+16,
  "trial_name": null,
  "trial_params": null
}