|
{ |
|
"best_metric": 0.9182948490230906, |
|
"best_model_checkpoint": "./results/checkpoint-29910", |
|
"epoch": 5.0, |
|
"eval_steps": 500, |
|
"global_step": 29910, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.954967021942139, |
|
"learning_rate": 4.9832831828819794e-05, |
|
"loss": 4.7403, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 9.918214797973633, |
|
"learning_rate": 4.9665663657639585e-05, |
|
"loss": 4.7281, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 6.391179084777832, |
|
"learning_rate": 4.949849548645938e-05, |
|
"loss": 4.6786, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.756315231323242, |
|
"learning_rate": 4.9331327315279175e-05, |
|
"loss": 4.6128, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 8.407713890075684, |
|
"learning_rate": 4.916415914409897e-05, |
|
"loss": 4.4836, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 8.354033470153809, |
|
"learning_rate": 4.899699097291876e-05, |
|
"loss": 4.3776, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 7.996518611907959, |
|
"learning_rate": 4.882982280173855e-05, |
|
"loss": 4.2701, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 14.100532531738281, |
|
"learning_rate": 4.866265463055835e-05, |
|
"loss": 4.1032, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 10.907315254211426, |
|
"learning_rate": 4.849548645937814e-05, |
|
"loss": 3.952, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 9.731605529785156, |
|
"learning_rate": 4.8328318288197924e-05, |
|
"loss": 3.732, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 9.989665985107422, |
|
"learning_rate": 4.816115011701772e-05, |
|
"loss": 3.5489, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 9.542133331298828, |
|
"learning_rate": 4.7993981945837514e-05, |
|
"loss": 3.3949, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 11.988595008850098, |
|
"learning_rate": 4.7826813774657305e-05, |
|
"loss": 3.216, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 13.553967475891113, |
|
"learning_rate": 4.76596456034771e-05, |
|
"loss": 2.9855, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 42.20621109008789, |
|
"learning_rate": 4.7492477432296895e-05, |
|
"loss": 2.7659, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 18.790130615234375, |
|
"learning_rate": 4.732530926111669e-05, |
|
"loss": 2.5604, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 20.554113388061523, |
|
"learning_rate": 4.715814108993648e-05, |
|
"loss": 2.4376, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 18.882707595825195, |
|
"learning_rate": 4.699097291875627e-05, |
|
"loss": 2.3501, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 14.733109474182129, |
|
"learning_rate": 4.682380474757606e-05, |
|
"loss": 2.168, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 17.430740356445312, |
|
"learning_rate": 4.665663657639586e-05, |
|
"loss": 2.0081, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 21.797836303710938, |
|
"learning_rate": 4.6489468405215644e-05, |
|
"loss": 1.9554, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 13.148958206176758, |
|
"learning_rate": 4.632230023403544e-05, |
|
"loss": 1.8524, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 14.161394119262695, |
|
"learning_rate": 4.6155132062855234e-05, |
|
"loss": 1.793, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 20.908519744873047, |
|
"learning_rate": 4.5987963891675026e-05, |
|
"loss": 1.6493, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 15.107952117919922, |
|
"learning_rate": 4.582079572049482e-05, |
|
"loss": 1.5724, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 18.561201095581055, |
|
"learning_rate": 4.5653627549314615e-05, |
|
"loss": 1.4915, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 15.365275382995605, |
|
"learning_rate": 4.548645937813441e-05, |
|
"loss": 1.488, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 16.04875946044922, |
|
"learning_rate": 4.53192912069542e-05, |
|
"loss": 1.4316, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 13.593673706054688, |
|
"learning_rate": 4.515212303577399e-05, |
|
"loss": 1.456, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 16.379798889160156, |
|
"learning_rate": 4.498495486459378e-05, |
|
"loss": 1.3006, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 13.564205169677734, |
|
"learning_rate": 4.481778669341358e-05, |
|
"loss": 1.2661, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 15.44586181640625, |
|
"learning_rate": 4.4650618522233364e-05, |
|
"loss": 1.2917, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 12.80644416809082, |
|
"learning_rate": 4.448345035105316e-05, |
|
"loss": 1.1765, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 19.545106887817383, |
|
"learning_rate": 4.4316282179872954e-05, |
|
"loss": 1.1622, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 14.377379417419434, |
|
"learning_rate": 4.414911400869275e-05, |
|
"loss": 1.1047, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 21.595245361328125, |
|
"learning_rate": 4.398194583751254e-05, |
|
"loss": 1.1384, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 14.641448020935059, |
|
"learning_rate": 4.3814777666332335e-05, |
|
"loss": 1.0872, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 13.082781791687012, |
|
"learning_rate": 4.364760949515213e-05, |
|
"loss": 1.0366, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 18.576641082763672, |
|
"learning_rate": 4.348044132397192e-05, |
|
"loss": 1.0953, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 9.915220260620117, |
|
"learning_rate": 4.331327315279171e-05, |
|
"loss": 1.001, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 12.059024810791016, |
|
"learning_rate": 4.31461049816115e-05, |
|
"loss": 1.0585, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 17.607337951660156, |
|
"learning_rate": 4.29789368104313e-05, |
|
"loss": 1.0179, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 16.324430465698242, |
|
"learning_rate": 4.2811768639251084e-05, |
|
"loss": 0.9491, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 19.5161075592041, |
|
"learning_rate": 4.264460046807088e-05, |
|
"loss": 0.9374, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 20.448488235473633, |
|
"learning_rate": 4.2477432296890674e-05, |
|
"loss": 0.9146, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 10.544804573059082, |
|
"learning_rate": 4.231026412571047e-05, |
|
"loss": 0.9187, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 17.095731735229492, |
|
"learning_rate": 4.214309595453026e-05, |
|
"loss": 0.8732, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 18.1314754486084, |
|
"learning_rate": 4.197592778335005e-05, |
|
"loss": 0.9072, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 8.516233444213867, |
|
"learning_rate": 4.180875961216985e-05, |
|
"loss": 0.8264, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 12.620676040649414, |
|
"learning_rate": 4.164159144098964e-05, |
|
"loss": 0.8425, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 23.544219970703125, |
|
"learning_rate": 4.147442326980943e-05, |
|
"loss": 0.8371, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 15.980536460876465, |
|
"learning_rate": 4.130725509862922e-05, |
|
"loss": 0.8257, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 16.621524810791016, |
|
"learning_rate": 4.114008692744902e-05, |
|
"loss": 0.7705, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 25.2496280670166, |
|
"learning_rate": 4.0972918756268804e-05, |
|
"loss": 0.7741, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 12.541385650634766, |
|
"learning_rate": 4.08057505850886e-05, |
|
"loss": 0.7408, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 30.975236892700195, |
|
"learning_rate": 4.0638582413908394e-05, |
|
"loss": 0.7417, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 16.33625030517578, |
|
"learning_rate": 4.0471414242728186e-05, |
|
"loss": 0.766, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 17.48399543762207, |
|
"learning_rate": 4.030424607154798e-05, |
|
"loss": 0.8336, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 19.421096801757812, |
|
"learning_rate": 4.013707790036777e-05, |
|
"loss": 0.7135, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.7995263469508584, |
|
"eval_f1": 0.7955612032049123, |
|
"eval_loss": 0.7165877223014832, |
|
"eval_precision": 0.805591523931921, |
|
"eval_recall": 0.7995263469508584, |
|
"eval_runtime": 64.1068, |
|
"eval_samples_per_second": 131.733, |
|
"eval_steps_per_second": 8.236, |
|
"step": 5982 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 10.486939430236816, |
|
"learning_rate": 3.996990972918757e-05, |
|
"loss": 0.685, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 19.489837646484375, |
|
"learning_rate": 3.980274155800736e-05, |
|
"loss": 0.6431, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 8.935369491577148, |
|
"learning_rate": 3.963557338682715e-05, |
|
"loss": 0.6402, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 10.298083305358887, |
|
"learning_rate": 3.946840521564694e-05, |
|
"loss": 0.6261, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 18.606569290161133, |
|
"learning_rate": 3.930123704446674e-05, |
|
"loss": 0.5874, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 12.412484169006348, |
|
"learning_rate": 3.913406887328653e-05, |
|
"loss": 0.5923, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 9.3939847946167, |
|
"learning_rate": 3.8966900702106316e-05, |
|
"loss": 0.6091, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 14.168825149536133, |
|
"learning_rate": 3.8799732530926114e-05, |
|
"loss": 0.6259, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 18.846487045288086, |
|
"learning_rate": 3.8632564359745906e-05, |
|
"loss": 0.5543, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 7.268430709838867, |
|
"learning_rate": 3.84653961885657e-05, |
|
"loss": 0.5615, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 6.565930366516113, |
|
"learning_rate": 3.829822801738549e-05, |
|
"loss": 0.5725, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 11.122172355651855, |
|
"learning_rate": 3.813105984620529e-05, |
|
"loss": 0.543, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 15.909794807434082, |
|
"learning_rate": 3.796389167502508e-05, |
|
"loss": 0.5053, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 17.935998916625977, |
|
"learning_rate": 3.779672350384487e-05, |
|
"loss": 0.5866, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 7.46903657913208, |
|
"learning_rate": 3.762955533266466e-05, |
|
"loss": 0.5573, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 10.208723068237305, |
|
"learning_rate": 3.746238716148446e-05, |
|
"loss": 0.511, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 15.062224388122559, |
|
"learning_rate": 3.729521899030425e-05, |
|
"loss": 0.5211, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 11.787239074707031, |
|
"learning_rate": 3.7128050819124036e-05, |
|
"loss": 0.5687, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 20.22210693359375, |
|
"learning_rate": 3.6960882647943834e-05, |
|
"loss": 0.544, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 22.17251205444336, |
|
"learning_rate": 3.6793714476763626e-05, |
|
"loss": 0.5223, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 16.83318519592285, |
|
"learning_rate": 3.662654630558342e-05, |
|
"loss": 0.5043, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 10.143548965454102, |
|
"learning_rate": 3.645937813440321e-05, |
|
"loss": 0.5181, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 20.629831314086914, |
|
"learning_rate": 3.629220996322301e-05, |
|
"loss": 0.4886, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 12.14686107635498, |
|
"learning_rate": 3.61250417920428e-05, |
|
"loss": 0.5667, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 17.1881160736084, |
|
"learning_rate": 3.595787362086259e-05, |
|
"loss": 0.5211, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 7.506267070770264, |
|
"learning_rate": 3.579070544968238e-05, |
|
"loss": 0.5356, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 23.122560501098633, |
|
"learning_rate": 3.562353727850217e-05, |
|
"loss": 0.5044, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 21.808191299438477, |
|
"learning_rate": 3.545636910732197e-05, |
|
"loss": 0.5059, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 12.899435997009277, |
|
"learning_rate": 3.5289200936141756e-05, |
|
"loss": 0.5082, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 11.228046417236328, |
|
"learning_rate": 3.5122032764961554e-05, |
|
"loss": 0.4466, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 15.656624794006348, |
|
"learning_rate": 3.4954864593781346e-05, |
|
"loss": 0.4877, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 14.958187103271484, |
|
"learning_rate": 3.478769642260114e-05, |
|
"loss": 0.4283, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 27.727924346923828, |
|
"learning_rate": 3.462052825142093e-05, |
|
"loss": 0.504, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 21.103147506713867, |
|
"learning_rate": 3.445336008024073e-05, |
|
"loss": 0.5081, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 14.884688377380371, |
|
"learning_rate": 3.428619190906052e-05, |
|
"loss": 0.47, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 26.825908660888672, |
|
"learning_rate": 3.411902373788031e-05, |
|
"loss": 0.4587, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 23.39227867126465, |
|
"learning_rate": 3.39518555667001e-05, |
|
"loss": 0.4621, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 15.503640174865723, |
|
"learning_rate": 3.378468739551989e-05, |
|
"loss": 0.5122, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 13.298539161682129, |
|
"learning_rate": 3.361751922433969e-05, |
|
"loss": 0.4846, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 17.961261749267578, |
|
"learning_rate": 3.3450351053159476e-05, |
|
"loss": 0.4576, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 15.622933387756348, |
|
"learning_rate": 3.3283182881979274e-05, |
|
"loss": 0.4239, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 15.286486625671387, |
|
"learning_rate": 3.3116014710799066e-05, |
|
"loss": 0.4478, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 28.045799255371094, |
|
"learning_rate": 3.294884653961886e-05, |
|
"loss": 0.4457, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 23.578136444091797, |
|
"learning_rate": 3.278167836843865e-05, |
|
"loss": 0.464, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 12.858305931091309, |
|
"learning_rate": 3.261451019725844e-05, |
|
"loss": 0.4507, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 18.197952270507812, |
|
"learning_rate": 3.244734202607824e-05, |
|
"loss": 0.4158, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 5.134513854980469, |
|
"learning_rate": 3.228017385489803e-05, |
|
"loss": 0.4088, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 2.1014363765716553, |
|
"learning_rate": 3.211300568371782e-05, |
|
"loss": 0.4524, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 14.459040641784668, |
|
"learning_rate": 3.194583751253761e-05, |
|
"loss": 0.4637, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 29.922468185424805, |
|
"learning_rate": 3.177866934135741e-05, |
|
"loss": 0.4302, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 23.523460388183594, |
|
"learning_rate": 3.1611501170177196e-05, |
|
"loss": 0.4155, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 11.668371200561523, |
|
"learning_rate": 3.1444332998996994e-05, |
|
"loss": 0.4238, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 15.930005073547363, |
|
"learning_rate": 3.1277164827816786e-05, |
|
"loss": 0.4072, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 18.61160659790039, |
|
"learning_rate": 3.110999665663658e-05, |
|
"loss": 0.4348, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 27.475053787231445, |
|
"learning_rate": 3.094282848545637e-05, |
|
"loss": 0.4648, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 6.477468013763428, |
|
"learning_rate": 3.077566031427616e-05, |
|
"loss": 0.4241, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 26.99014663696289, |
|
"learning_rate": 3.060849214309596e-05, |
|
"loss": 0.4243, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 16.152755737304688, |
|
"learning_rate": 3.0441323971915747e-05, |
|
"loss": 0.4186, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 15.536150932312012, |
|
"learning_rate": 3.0274155800735542e-05, |
|
"loss": 0.3808, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 23.708145141601562, |
|
"learning_rate": 3.0106987629555333e-05, |
|
"loss": 0.4365, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.8680876258140912, |
|
"eval_f1": 0.8628914936078326, |
|
"eval_loss": 0.4633374810218811, |
|
"eval_precision": 0.8684864554322808, |
|
"eval_recall": 0.8680876258140912, |
|
"eval_runtime": 64.0052, |
|
"eval_samples_per_second": 131.942, |
|
"eval_steps_per_second": 8.249, |
|
"step": 11964 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 10.474257469177246, |
|
"learning_rate": 2.9939819458375128e-05, |
|
"loss": 0.3853, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 15.668170928955078, |
|
"learning_rate": 2.977265128719492e-05, |
|
"loss": 0.2858, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 10.29902172088623, |
|
"learning_rate": 2.960548311601471e-05, |
|
"loss": 0.2803, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 33.27579116821289, |
|
"learning_rate": 2.9438314944834506e-05, |
|
"loss": 0.2858, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 13.799466133117676, |
|
"learning_rate": 2.9271146773654294e-05, |
|
"loss": 0.2793, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 4.722692489624023, |
|
"learning_rate": 2.9103978602474092e-05, |
|
"loss": 0.2935, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 8.643231391906738, |
|
"learning_rate": 2.893681043129388e-05, |
|
"loss": 0.2825, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 10.378469467163086, |
|
"learning_rate": 2.876964226011368e-05, |
|
"loss": 0.2845, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 9.1376953125, |
|
"learning_rate": 2.8602474088933467e-05, |
|
"loss": 0.2725, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 10.372312545776367, |
|
"learning_rate": 2.8435305917753262e-05, |
|
"loss": 0.3067, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 23.952699661254883, |
|
"learning_rate": 2.8268137746573053e-05, |
|
"loss": 0.2934, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 2.125562906265259, |
|
"learning_rate": 2.8100969575392848e-05, |
|
"loss": 0.2535, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 8.090828895568848, |
|
"learning_rate": 2.793380140421264e-05, |
|
"loss": 0.295, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 13.274210929870605, |
|
"learning_rate": 2.776663323303243e-05, |
|
"loss": 0.2851, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 5.6807732582092285, |
|
"learning_rate": 2.7599465061852226e-05, |
|
"loss": 0.2662, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 11.885269165039062, |
|
"learning_rate": 2.7432296890672014e-05, |
|
"loss": 0.2969, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 21.52318000793457, |
|
"learning_rate": 2.7265128719491812e-05, |
|
"loss": 0.2706, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 21.661279678344727, |
|
"learning_rate": 2.70979605483116e-05, |
|
"loss": 0.2715, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 27.985078811645508, |
|
"learning_rate": 2.69307923771314e-05, |
|
"loss": 0.3016, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 11.431729316711426, |
|
"learning_rate": 2.6763624205951187e-05, |
|
"loss": 0.2501, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 5.3406901359558105, |
|
"learning_rate": 2.6596456034770982e-05, |
|
"loss": 0.2762, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 11.077746391296387, |
|
"learning_rate": 2.6429287863590773e-05, |
|
"loss": 0.2819, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 17.451330184936523, |
|
"learning_rate": 2.6262119692410565e-05, |
|
"loss": 0.3074, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 7.353370189666748, |
|
"learning_rate": 2.609495152123036e-05, |
|
"loss": 0.3068, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 12.055102348327637, |
|
"learning_rate": 2.592778335005015e-05, |
|
"loss": 0.2779, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 17.555917739868164, |
|
"learning_rate": 2.5760615178869946e-05, |
|
"loss": 0.2421, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 22.887771606445312, |
|
"learning_rate": 2.5593447007689734e-05, |
|
"loss": 0.3016, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 1.915899395942688, |
|
"learning_rate": 2.5426278836509533e-05, |
|
"loss": 0.2638, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 13.446496963500977, |
|
"learning_rate": 2.525911066532932e-05, |
|
"loss": 0.293, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 12.734638214111328, |
|
"learning_rate": 2.509194249414912e-05, |
|
"loss": 0.2668, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 15.557112693786621, |
|
"learning_rate": 2.4924774322968907e-05, |
|
"loss": 0.2691, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 10.383445739746094, |
|
"learning_rate": 2.4757606151788702e-05, |
|
"loss": 0.2204, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 7.19666862487793, |
|
"learning_rate": 2.4590437980608493e-05, |
|
"loss": 0.2447, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 17.903339385986328, |
|
"learning_rate": 2.442326980942829e-05, |
|
"loss": 0.2504, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 10.492616653442383, |
|
"learning_rate": 2.425610163824808e-05, |
|
"loss": 0.2256, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 11.051074028015137, |
|
"learning_rate": 2.408893346706787e-05, |
|
"loss": 0.259, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 23.400402069091797, |
|
"learning_rate": 2.3921765295887663e-05, |
|
"loss": 0.2487, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 20.601686477661133, |
|
"learning_rate": 2.3754597124707458e-05, |
|
"loss": 0.2338, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 12.519159317016602, |
|
"learning_rate": 2.358742895352725e-05, |
|
"loss": 0.2652, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 21.95683479309082, |
|
"learning_rate": 2.342026078234704e-05, |
|
"loss": 0.2306, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 24.98236656188965, |
|
"learning_rate": 2.3253092611166836e-05, |
|
"loss": 0.2475, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 6.362200736999512, |
|
"learning_rate": 2.3085924439986627e-05, |
|
"loss": 0.2646, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 14.293391227722168, |
|
"learning_rate": 2.2918756268806422e-05, |
|
"loss": 0.2404, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 11.405878067016602, |
|
"learning_rate": 2.2751588097626213e-05, |
|
"loss": 0.2651, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 15.082180976867676, |
|
"learning_rate": 2.258441992644601e-05, |
|
"loss": 0.281, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 27.33397674560547, |
|
"learning_rate": 2.2417251755265796e-05, |
|
"loss": 0.2492, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 10.052102088928223, |
|
"learning_rate": 2.225008358408559e-05, |
|
"loss": 0.2382, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 15.405964851379395, |
|
"learning_rate": 2.2082915412905383e-05, |
|
"loss": 0.2496, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 7.162382125854492, |
|
"learning_rate": 2.1915747241725178e-05, |
|
"loss": 0.2343, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 11.130888938903809, |
|
"learning_rate": 2.174857907054497e-05, |
|
"loss": 0.2474, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 8.277360916137695, |
|
"learning_rate": 2.158141089936476e-05, |
|
"loss": 0.2687, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 31.100744247436523, |
|
"learning_rate": 2.1414242728184556e-05, |
|
"loss": 0.2422, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 12.757442474365234, |
|
"learning_rate": 2.1247074557004347e-05, |
|
"loss": 0.2275, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 4.860738277435303, |
|
"learning_rate": 2.1079906385824142e-05, |
|
"loss": 0.2252, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 10.574835777282715, |
|
"learning_rate": 2.091273821464393e-05, |
|
"loss": 0.2114, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 13.01117992401123, |
|
"learning_rate": 2.0745570043463725e-05, |
|
"loss": 0.2407, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 4.970390319824219, |
|
"learning_rate": 2.0578401872283517e-05, |
|
"loss": 0.2509, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 18.95350456237793, |
|
"learning_rate": 2.041123370110331e-05, |
|
"loss": 0.2814, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 1.5296308994293213, |
|
"learning_rate": 2.0244065529923103e-05, |
|
"loss": 0.235, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 12.501904487609863, |
|
"learning_rate": 2.0076897358742898e-05, |
|
"loss": 0.2479, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.8965068087625814, |
|
"eval_f1": 0.8930257247589533, |
|
"eval_loss": 0.36622655391693115, |
|
"eval_precision": 0.8950199629292306, |
|
"eval_recall": 0.8965068087625814, |
|
"eval_runtime": 64.0862, |
|
"eval_samples_per_second": 131.776, |
|
"eval_steps_per_second": 8.239, |
|
"step": 17946 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 19.13836097717285, |
|
"learning_rate": 1.990972918756269e-05, |
|
"loss": 0.2272, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 8.622084617614746, |
|
"learning_rate": 1.9742561016382484e-05, |
|
"loss": 0.131, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 32.99411392211914, |
|
"learning_rate": 1.9575392845202276e-05, |
|
"loss": 0.1477, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 5.467390060424805, |
|
"learning_rate": 1.9408224674022067e-05, |
|
"loss": 0.1439, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 2.5153982639312744, |
|
"learning_rate": 1.924105650284186e-05, |
|
"loss": 0.1405, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 20.424579620361328, |
|
"learning_rate": 1.907388833166165e-05, |
|
"loss": 0.1594, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 5.207544803619385, |
|
"learning_rate": 1.8906720160481445e-05, |
|
"loss": 0.1323, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 8.750362396240234, |
|
"learning_rate": 1.8739551989301237e-05, |
|
"loss": 0.1683, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 2.464329481124878, |
|
"learning_rate": 1.857238381812103e-05, |
|
"loss": 0.1388, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 3.784031867980957, |
|
"learning_rate": 1.8405215646940823e-05, |
|
"loss": 0.149, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 2.632542610168457, |
|
"learning_rate": 1.8238047475760618e-05, |
|
"loss": 0.1284, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 11.050533294677734, |
|
"learning_rate": 1.807087930458041e-05, |
|
"loss": 0.1525, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 7.363661766052246, |
|
"learning_rate": 1.7903711133400204e-05, |
|
"loss": 0.1481, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 9.882287979125977, |
|
"learning_rate": 1.7736542962219992e-05, |
|
"loss": 0.1231, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 24.93657684326172, |
|
"learning_rate": 1.7569374791039787e-05, |
|
"loss": 0.1332, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 2.2802133560180664, |
|
"learning_rate": 1.740220661985958e-05, |
|
"loss": 0.1425, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 1.5991661548614502, |
|
"learning_rate": 1.7235038448679374e-05, |
|
"loss": 0.1283, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 8.344457626342773, |
|
"learning_rate": 1.7067870277499165e-05, |
|
"loss": 0.1502, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 12.95904541015625, |
|
"learning_rate": 1.6900702106318957e-05, |
|
"loss": 0.1287, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 20.562625885009766, |
|
"learning_rate": 1.673353393513875e-05, |
|
"loss": 0.1422, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 4.20346736907959, |
|
"learning_rate": 1.6566365763958543e-05, |
|
"loss": 0.1082, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 25.636775970458984, |
|
"learning_rate": 1.6399197592778338e-05, |
|
"loss": 0.1416, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 23.23301887512207, |
|
"learning_rate": 1.6232029421598126e-05, |
|
"loss": 0.1497, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 22.21303939819336, |
|
"learning_rate": 1.606486125041792e-05, |
|
"loss": 0.1568, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 21.14128303527832, |
|
"learning_rate": 1.5897693079237712e-05, |
|
"loss": 0.139, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 22.63404083251953, |
|
"learning_rate": 1.5730524908057507e-05, |
|
"loss": 0.1518, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 13.030010223388672, |
|
"learning_rate": 1.55633567368773e-05, |
|
"loss": 0.1319, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 18.308670043945312, |
|
"learning_rate": 1.5396188565697094e-05, |
|
"loss": 0.1494, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 24.907419204711914, |
|
"learning_rate": 1.5229020394516885e-05, |
|
"loss": 0.1425, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 19.32282066345215, |
|
"learning_rate": 1.5061852223336678e-05, |
|
"loss": 0.1264, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 17.444271087646484, |
|
"learning_rate": 1.4894684052156472e-05, |
|
"loss": 0.14, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 1.832461953163147, |
|
"learning_rate": 1.4727515880976261e-05, |
|
"loss": 0.1438, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 10.410861015319824, |
|
"learning_rate": 1.4560347709796055e-05, |
|
"loss": 0.1393, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 3.6459202766418457, |
|
"learning_rate": 1.4393179538615848e-05, |
|
"loss": 0.1077, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 3.216399669647217, |
|
"learning_rate": 1.4226011367435641e-05, |
|
"loss": 0.1154, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 5.621729373931885, |
|
"learning_rate": 1.4058843196255434e-05, |
|
"loss": 0.1208, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 5.559453010559082, |
|
"learning_rate": 1.3891675025075226e-05, |
|
"loss": 0.1441, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 22.32745933532715, |
|
"learning_rate": 1.3724506853895019e-05, |
|
"loss": 0.1176, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 4.509443759918213, |
|
"learning_rate": 1.3557338682714812e-05, |
|
"loss": 0.1382, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 15.154895782470703, |
|
"learning_rate": 1.3390170511534605e-05, |
|
"loss": 0.1475, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 0.8804099559783936, |
|
"learning_rate": 1.3223002340354398e-05, |
|
"loss": 0.1325, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 1.9917913675308228, |
|
"learning_rate": 1.3055834169174188e-05, |
|
"loss": 0.1255, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 16.314374923706055, |
|
"learning_rate": 1.2888665997993981e-05, |
|
"loss": 0.1275, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 5.355242729187012, |
|
"learning_rate": 1.2721497826813775e-05, |
|
"loss": 0.1185, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 20.218473434448242, |
|
"learning_rate": 1.2554329655633568e-05, |
|
"loss": 0.1203, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 1.39955735206604, |
|
"learning_rate": 1.2387161484453361e-05, |
|
"loss": 0.1636, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 17.855899810791016, |
|
"learning_rate": 1.2219993313273154e-05, |
|
"loss": 0.1369, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 14.41054630279541, |
|
"learning_rate": 1.2052825142092947e-05, |
|
"loss": 0.1245, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 11.451350212097168, |
|
"learning_rate": 1.1885656970912739e-05, |
|
"loss": 0.1508, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 9.41112995147705, |
|
"learning_rate": 1.171848879973253e-05, |
|
"loss": 0.125, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 29.826963424682617, |
|
"learning_rate": 1.1551320628552324e-05, |
|
"loss": 0.1545, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 11.454690933227539, |
|
"learning_rate": 1.1384152457372117e-05, |
|
"loss": 0.1353, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 12.364923477172852, |
|
"learning_rate": 1.121698428619191e-05, |
|
"loss": 0.1346, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 1.8181456327438354, |
|
"learning_rate": 1.1049816115011702e-05, |
|
"loss": 0.1092, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 30.87436866760254, |
|
"learning_rate": 1.0882647943831495e-05, |
|
"loss": 0.1059, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 16.423452377319336, |
|
"learning_rate": 1.0715479772651288e-05, |
|
"loss": 0.1157, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"grad_norm": 27.86665153503418, |
|
"learning_rate": 1.0548311601471081e-05, |
|
"loss": 0.1317, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 24.479764938354492, |
|
"learning_rate": 1.0381143430290873e-05, |
|
"loss": 0.1184, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 1.4079170227050781, |
|
"learning_rate": 1.0213975259110666e-05, |
|
"loss": 0.1303, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 4.259897232055664, |
|
"learning_rate": 1.0046807087930459e-05, |
|
"loss": 0.1322, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.9113084665482534, |
|
"eval_f1": 0.9092055511030135, |
|
"eval_loss": 0.3260073661804199, |
|
"eval_precision": 0.9099757491171729, |
|
"eval_recall": 0.9113084665482534, |
|
"eval_runtime": 64.1166, |
|
"eval_samples_per_second": 131.713, |
|
"eval_steps_per_second": 8.235, |
|
"step": 23928 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 13.925552368164062, |
|
"learning_rate": 9.879638916750252e-06, |
|
"loss": 0.0687, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.18495211005210876, |
|
"learning_rate": 9.712470745570044e-06, |
|
"loss": 0.066, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"grad_norm": 1.0808857679367065, |
|
"learning_rate": 9.545302574389837e-06, |
|
"loss": 0.0648, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 1.0073552131652832, |
|
"learning_rate": 9.378134403209628e-06, |
|
"loss": 0.071, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 15.166232109069824, |
|
"learning_rate": 9.210966232029422e-06, |
|
"loss": 0.0666, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 18.000640869140625, |
|
"learning_rate": 9.043798060849215e-06, |
|
"loss": 0.0778, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 4.11, |
|
"grad_norm": 1.214728593826294, |
|
"learning_rate": 8.876629889669008e-06, |
|
"loss": 0.07, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"grad_norm": 1.982407808303833, |
|
"learning_rate": 8.7094617184888e-06, |
|
"loss": 0.0752, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"grad_norm": 20.929153442382812, |
|
"learning_rate": 8.542293547308593e-06, |
|
"loss": 0.0785, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 0.8963820934295654, |
|
"learning_rate": 8.375125376128386e-06, |
|
"loss": 0.0524, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 4.18, |
|
"grad_norm": 3.5774483680725098, |
|
"learning_rate": 8.207957204948179e-06, |
|
"loss": 0.0692, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 3.7253074645996094, |
|
"learning_rate": 8.04078903376797e-06, |
|
"loss": 0.0641, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 4.21, |
|
"grad_norm": 1.2855291366577148, |
|
"learning_rate": 7.873620862587764e-06, |
|
"loss": 0.0699, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 4.23, |
|
"grad_norm": 1.9972455501556396, |
|
"learning_rate": 7.706452691407557e-06, |
|
"loss": 0.062, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 1.0809322595596313, |
|
"learning_rate": 7.539284520227349e-06, |
|
"loss": 0.058, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 3.876232862472534, |
|
"learning_rate": 7.3721163490471425e-06, |
|
"loss": 0.0693, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 6.069151878356934, |
|
"learning_rate": 7.204948177866934e-06, |
|
"loss": 0.0617, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 0.895815372467041, |
|
"learning_rate": 7.037780006686727e-06, |
|
"loss": 0.0623, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 0.4176822602748871, |
|
"learning_rate": 6.8706118355065195e-06, |
|
"loss": 0.0833, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 0.6760619878768921, |
|
"learning_rate": 6.703443664326313e-06, |
|
"loss": 0.0567, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 14.889734268188477, |
|
"learning_rate": 6.536275493146106e-06, |
|
"loss": 0.053, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 0.5385121703147888, |
|
"learning_rate": 6.369107321965897e-06, |
|
"loss": 0.0703, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 6.336006164550781, |
|
"learning_rate": 6.201939150785691e-06, |
|
"loss": 0.063, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 0.20758749544620514, |
|
"learning_rate": 6.034770979605484e-06, |
|
"loss": 0.0753, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"grad_norm": 11.717066764831543, |
|
"learning_rate": 5.867602808425276e-06, |
|
"loss": 0.0598, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 4.43, |
|
"grad_norm": 26.475128173828125, |
|
"learning_rate": 5.7004346372450685e-06, |
|
"loss": 0.064, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 4.45, |
|
"grad_norm": 20.872194290161133, |
|
"learning_rate": 5.533266466064862e-06, |
|
"loss": 0.0708, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 1.2749828100204468, |
|
"learning_rate": 5.366098294884654e-06, |
|
"loss": 0.0705, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 6.7912702560424805, |
|
"learning_rate": 5.198930123704447e-06, |
|
"loss": 0.0742, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 10.904654502868652, |
|
"learning_rate": 5.03176195252424e-06, |
|
"loss": 0.0665, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 6.191511154174805, |
|
"learning_rate": 4.864593781344033e-06, |
|
"loss": 0.0549, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"grad_norm": 2.479524850845337, |
|
"learning_rate": 4.697425610163825e-06, |
|
"loss": 0.0539, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 0.7285805940628052, |
|
"learning_rate": 4.5302574389836175e-06, |
|
"loss": 0.0662, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"grad_norm": 4.313304901123047, |
|
"learning_rate": 4.363089267803411e-06, |
|
"loss": 0.0571, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 17.61699867248535, |
|
"learning_rate": 4.195921096623203e-06, |
|
"loss": 0.0634, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 1.3776081800460815, |
|
"learning_rate": 4.028752925442996e-06, |
|
"loss": 0.0526, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 4.61, |
|
"grad_norm": 0.36369597911834717, |
|
"learning_rate": 3.8615847542627886e-06, |
|
"loss": 0.0669, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 4.591643333435059, |
|
"learning_rate": 3.6944165830825813e-06, |
|
"loss": 0.0578, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"grad_norm": 0.930225670337677, |
|
"learning_rate": 3.5272484119023737e-06, |
|
"loss": 0.0456, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"grad_norm": 1.136043906211853, |
|
"learning_rate": 3.360080240722167e-06, |
|
"loss": 0.0617, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 0.6426201462745667, |
|
"learning_rate": 3.1929120695419596e-06, |
|
"loss": 0.0568, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 2.6884241104125977, |
|
"learning_rate": 3.025743898361752e-06, |
|
"loss": 0.0606, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 4.71, |
|
"grad_norm": 0.4525424838066101, |
|
"learning_rate": 2.8585757271815448e-06, |
|
"loss": 0.066, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 1.0276681184768677, |
|
"learning_rate": 2.6914075560013375e-06, |
|
"loss": 0.0444, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 7.886939525604248, |
|
"learning_rate": 2.5242393848211303e-06, |
|
"loss": 0.065, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 0.37203583121299744, |
|
"learning_rate": 2.357071213640923e-06, |
|
"loss": 0.0559, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 6.219501495361328, |
|
"learning_rate": 2.1899030424607154e-06, |
|
"loss": 0.07, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 8.10631275177002, |
|
"learning_rate": 2.022734871280508e-06, |
|
"loss": 0.0623, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 24.999059677124023, |
|
"learning_rate": 1.855566700100301e-06, |
|
"loss": 0.0701, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 3.5445597171783447, |
|
"learning_rate": 1.6883985289200935e-06, |
|
"loss": 0.0561, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"grad_norm": 11.693018913269043, |
|
"learning_rate": 1.5212303577398863e-06, |
|
"loss": 0.062, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"grad_norm": 17.059640884399414, |
|
"learning_rate": 1.354062186559679e-06, |
|
"loss": 0.0663, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 3.2128794193267822, |
|
"learning_rate": 1.1868940153794718e-06, |
|
"loss": 0.0541, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 1.6803439855575562, |
|
"learning_rate": 1.0197258441992646e-06, |
|
"loss": 0.0619, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"grad_norm": 7.980160236358643, |
|
"learning_rate": 8.525576730190572e-07, |
|
"loss": 0.0649, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 4.93, |
|
"grad_norm": 0.3919593393802643, |
|
"learning_rate": 6.853895018388499e-07, |
|
"loss": 0.0753, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 2.870180368423462, |
|
"learning_rate": 5.182213306586426e-07, |
|
"loss": 0.0461, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 0.5204899907112122, |
|
"learning_rate": 3.510531594784353e-07, |
|
"loss": 0.0446, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"grad_norm": 2.318403482437134, |
|
"learning_rate": 1.8388498829822804e-07, |
|
"loss": 0.0588, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 1.1591626405715942, |
|
"learning_rate": 1.6716817118020728e-08, |
|
"loss": 0.0589, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.9182948490230906, |
|
"eval_f1": 0.9165254517429693, |
|
"eval_loss": 0.3342040479183197, |
|
"eval_precision": 0.9170562701684628, |
|
"eval_recall": 0.9182948490230906, |
|
"eval_runtime": 63.9141, |
|
"eval_samples_per_second": 132.131, |
|
"eval_steps_per_second": 8.261, |
|
"step": 29910 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 29910, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"total_flos": 1.15579279766016e+16, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|