{ "best_metric": 0.728, "best_model_checkpoint": "VT_15/checkpoint-7279", "epoch": 29.0, "eval_steps": 500, "global_step": 7279, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.398406374501992, "grad_norm": 11.727704048156738, "learning_rate": 9.867197875166003e-05, "loss": 1.0157, "step": 100 }, { "epoch": 0.796812749003984, "grad_norm": 11.50269603729248, "learning_rate": 9.734395750332006e-05, "loss": 0.8938, "step": 200 }, { "epoch": 1.0, "eval_accuracy": 0.548, "eval_f1": 0.4882155949880584, "eval_loss": 0.9423586130142212, "eval_precision": 0.5733499456903712, "eval_recall": 0.5448468191272983, "eval_runtime": 16.5911, "eval_samples_per_second": 30.137, "eval_steps_per_second": 3.797, "step": 251 }, { "epoch": 1.1952191235059761, "grad_norm": 10.47740650177002, "learning_rate": 9.601593625498009e-05, "loss": 0.8485, "step": 300 }, { "epoch": 1.593625498007968, "grad_norm": 12.222431182861328, "learning_rate": 9.468791500664011e-05, "loss": 0.7978, "step": 400 }, { "epoch": 1.9920318725099602, "grad_norm": 13.02956771850586, "learning_rate": 9.335989375830013e-05, "loss": 0.8093, "step": 500 }, { "epoch": 2.0, "eval_accuracy": 0.61, "eval_f1": 0.584707628587346, "eval_loss": 0.8303987979888916, "eval_precision": 0.6022599150943656, "eval_recall": 0.6097290164964017, "eval_runtime": 15.8756, "eval_samples_per_second": 31.495, "eval_steps_per_second": 3.968, "step": 502 }, { "epoch": 2.3904382470119523, "grad_norm": 11.920299530029297, "learning_rate": 9.203187250996016e-05, "loss": 0.7275, "step": 600 }, { "epoch": 2.7888446215139444, "grad_norm": 12.057920455932617, "learning_rate": 9.070385126162018e-05, "loss": 0.7444, "step": 700 }, { "epoch": 3.0, "eval_accuracy": 0.578, "eval_f1": 0.5180026990553307, "eval_loss": 0.8966869115829468, "eval_precision": 0.6246163183733936, "eval_recall": 0.5751176873923239, "eval_runtime": 16.5617, "eval_samples_per_second": 30.19, "eval_steps_per_second": 3.804, "step": 753 }, { "epoch": 3.187250996015936, "grad_norm": 12.730193138122559, "learning_rate": 8.937583001328021e-05, "loss": 0.7091, "step": 800 }, { "epoch": 3.585657370517928, "grad_norm": 11.721458435058594, "learning_rate": 8.804780876494024e-05, "loss": 0.6502, "step": 900 }, { "epoch": 3.9840637450199203, "grad_norm": 11.902241706848145, "learning_rate": 8.671978751660027e-05, "loss": 0.6391, "step": 1000 }, { "epoch": 4.0, "eval_accuracy": 0.624, "eval_f1": 0.6176260916415671, "eval_loss": 0.8131240010261536, "eval_precision": 0.6213088498802785, "eval_recall": 0.6225291409540601, "eval_runtime": 16.9186, "eval_samples_per_second": 29.553, "eval_steps_per_second": 3.724, "step": 1004 }, { "epoch": 4.382470119521912, "grad_norm": 11.794739723205566, "learning_rate": 8.539176626826029e-05, "loss": 0.5683, "step": 1100 }, { "epoch": 4.780876494023905, "grad_norm": 11.683808326721191, "learning_rate": 8.406374501992032e-05, "loss": 0.5691, "step": 1200 }, { "epoch": 5.0, "eval_accuracy": 0.622, "eval_f1": 0.5716700610476999, "eval_loss": 0.8928351998329163, "eval_precision": 0.6410496659546192, "eval_recall": 0.6199959090306707, "eval_runtime": 16.6354, "eval_samples_per_second": 30.056, "eval_steps_per_second": 3.787, "step": 1255 }, { "epoch": 5.179282868525896, "grad_norm": 13.705164909362793, "learning_rate": 8.273572377158035e-05, "loss": 0.5855, "step": 1300 }, { "epoch": 5.577689243027889, "grad_norm": 11.817400932312012, "learning_rate": 8.140770252324038e-05, "loss": 0.513, "step": 1400 }, { "epoch": 5.9760956175298805, "grad_norm": 11.755231857299805, "learning_rate": 8.00796812749004e-05, "loss": 0.5009, "step": 1500 }, { "epoch": 6.0, "eval_accuracy": 0.656, "eval_f1": 0.6379970132206839, "eval_loss": 0.8215978741645813, "eval_precision": 0.6485419139105967, "eval_recall": 0.6545787165880164, "eval_runtime": 17.7464, "eval_samples_per_second": 28.175, "eval_steps_per_second": 3.55, "step": 1506 }, { "epoch": 6.374501992031872, "grad_norm": 10.913691520690918, "learning_rate": 7.875166002656043e-05, "loss": 0.4488, "step": 1600 }, { "epoch": 6.772908366533865, "grad_norm": 10.615025520324707, "learning_rate": 7.742363877822046e-05, "loss": 0.4855, "step": 1700 }, { "epoch": 7.0, "eval_accuracy": 0.654, "eval_f1": 0.6439337408377656, "eval_loss": 0.8311923742294312, "eval_precision": 0.6495670995670996, "eval_recall": 0.6520354012895196, "eval_runtime": 17.519, "eval_samples_per_second": 28.54, "eval_steps_per_second": 3.596, "step": 1757 }, { "epoch": 7.171314741035856, "grad_norm": 12.026023864746094, "learning_rate": 7.609561752988048e-05, "loss": 0.4177, "step": 1800 }, { "epoch": 7.569721115537849, "grad_norm": 10.010376930236816, "learning_rate": 7.476759628154051e-05, "loss": 0.409, "step": 1900 }, { "epoch": 7.968127490039841, "grad_norm": 12.596341133117676, "learning_rate": 7.343957503320054e-05, "loss": 0.39, "step": 2000 }, { "epoch": 8.0, "eval_accuracy": 0.682, "eval_f1": 0.6538606492353024, "eval_loss": 0.9214051365852356, "eval_precision": 0.7101278814728985, "eval_recall": 0.6804800262743945, "eval_runtime": 17.2651, "eval_samples_per_second": 28.96, "eval_steps_per_second": 3.649, "step": 2008 }, { "epoch": 8.366533864541832, "grad_norm": 14.169309616088867, "learning_rate": 7.211155378486057e-05, "loss": 0.3646, "step": 2100 }, { "epoch": 8.764940239043824, "grad_norm": 12.020890235900879, "learning_rate": 7.07835325365206e-05, "loss": 0.3708, "step": 2200 }, { "epoch": 9.0, "eval_accuracy": 0.598, "eval_f1": 0.5725541685903895, "eval_loss": 1.0422428846359253, "eval_precision": 0.6090800979488745, "eval_recall": 0.5948744763847355, "eval_runtime": 17.2984, "eval_samples_per_second": 28.904, "eval_steps_per_second": 3.642, "step": 2259 }, { "epoch": 9.163346613545817, "grad_norm": 10.583425521850586, "learning_rate": 6.945551128818062e-05, "loss": 0.3533, "step": 2300 }, { "epoch": 9.56175298804781, "grad_norm": 13.7178316116333, "learning_rate": 6.812749003984064e-05, "loss": 0.2984, "step": 2400 }, { "epoch": 9.9601593625498, "grad_norm": 10.42063045501709, "learning_rate": 6.679946879150066e-05, "loss": 0.3328, "step": 2500 }, { "epoch": 10.0, "eval_accuracy": 0.718, "eval_f1": 0.715429962270956, "eval_loss": 0.7483692765235901, "eval_precision": 0.7195966559320596, "eval_recall": 0.7178075285359515, "eval_runtime": 17.492, "eval_samples_per_second": 28.584, "eval_steps_per_second": 3.602, "step": 2510 }, { "epoch": 10.358565737051793, "grad_norm": 10.752534866333008, "learning_rate": 6.547144754316069e-05, "loss": 0.2561, "step": 2600 }, { "epoch": 10.756972111553784, "grad_norm": 10.988365173339844, "learning_rate": 6.414342629482072e-05, "loss": 0.3092, "step": 2700 }, { "epoch": 11.0, "eval_accuracy": 0.72, "eval_f1": 0.7160176967190494, "eval_loss": 0.8250208497047424, "eval_precision": 0.7184132303947973, "eval_recall": 0.7186398391269527, "eval_runtime": 17.2463, "eval_samples_per_second": 28.992, "eval_steps_per_second": 3.653, "step": 2761 }, { "epoch": 11.155378486055778, "grad_norm": 10.967025756835938, "learning_rate": 6.281540504648075e-05, "loss": 0.28, "step": 2800 }, { "epoch": 11.55378486055777, "grad_norm": 11.933313369750977, "learning_rate": 6.148738379814077e-05, "loss": 0.2747, "step": 2900 }, { "epoch": 11.952191235059761, "grad_norm": 12.90857219696045, "learning_rate": 6.01593625498008e-05, "loss": 0.281, "step": 3000 }, { "epoch": 12.0, "eval_accuracy": 0.694, "eval_f1": 0.6781251589992235, "eval_loss": 0.9806899428367615, "eval_precision": 0.6976797604396068, "eval_recall": 0.6921398650556313, "eval_runtime": 17.2981, "eval_samples_per_second": 28.905, "eval_steps_per_second": 3.642, "step": 3012 }, { "epoch": 12.350597609561753, "grad_norm": 11.363082885742188, "learning_rate": 5.883134130146083e-05, "loss": 0.2596, "step": 3100 }, { "epoch": 12.749003984063744, "grad_norm": 12.736093521118164, "learning_rate": 5.7503320053120855e-05, "loss": 0.2162, "step": 3200 }, { "epoch": 13.0, "eval_accuracy": 0.708, "eval_f1": 0.7021505447248022, "eval_loss": 0.9850034117698669, "eval_precision": 0.7101112865680695, "eval_recall": 0.7063781669000248, "eval_runtime": 17.6848, "eval_samples_per_second": 28.273, "eval_steps_per_second": 3.562, "step": 3263 }, { "epoch": 13.147410358565738, "grad_norm": 11.957535743713379, "learning_rate": 5.6175298804780876e-05, "loss": 0.2271, "step": 3300 }, { "epoch": 13.54581673306773, "grad_norm": 13.91019058227539, "learning_rate": 5.48472775564409e-05, "loss": 0.2284, "step": 3400 }, { "epoch": 13.944223107569721, "grad_norm": 10.606439590454102, "learning_rate": 5.351925630810093e-05, "loss": 0.2352, "step": 3500 }, { "epoch": 14.0, "eval_accuracy": 0.698, "eval_f1": 0.6875106838790609, "eval_loss": 0.9122900366783142, "eval_precision": 0.6937544840437923, "eval_recall": 0.6964621758194903, "eval_runtime": 17.3918, "eval_samples_per_second": 28.749, "eval_steps_per_second": 3.622, "step": 3514 }, { "epoch": 14.342629482071713, "grad_norm": 10.729408264160156, "learning_rate": 5.219123505976096e-05, "loss": 0.2339, "step": 3600 }, { "epoch": 14.741035856573705, "grad_norm": 10.717667579650879, "learning_rate": 5.0863213811420985e-05, "loss": 0.1947, "step": 3700 }, { "epoch": 15.0, "eval_accuracy": 0.694, "eval_f1": 0.6847697638967624, "eval_loss": 1.0269464254379272, "eval_precision": 0.6984319398216817, "eval_recall": 0.6918425495381815, "eval_runtime": 17.3612, "eval_samples_per_second": 28.8, "eval_steps_per_second": 3.629, "step": 3765 }, { "epoch": 15.139442231075698, "grad_norm": 15.637863159179688, "learning_rate": 4.953519256308101e-05, "loss": 0.2033, "step": 3800 }, { "epoch": 15.53784860557769, "grad_norm": 11.52315902709961, "learning_rate": 4.820717131474104e-05, "loss": 0.2073, "step": 3900 }, { "epoch": 15.936254980079681, "grad_norm": 10.585031509399414, "learning_rate": 4.687915006640107e-05, "loss": 0.1902, "step": 4000 }, { "epoch": 16.0, "eval_accuracy": 0.702, "eval_f1": 0.6936988146223305, "eval_loss": 1.0092582702636719, "eval_precision": 0.704450992084371, "eval_recall": 0.6998746780522377, "eval_runtime": 17.2437, "eval_samples_per_second": 28.996, "eval_steps_per_second": 3.654, "step": 4016 }, { "epoch": 16.334661354581673, "grad_norm": 13.553791046142578, "learning_rate": 4.555112881806109e-05, "loss": 0.1961, "step": 4100 }, { "epoch": 16.733067729083665, "grad_norm": 10.33850383758545, "learning_rate": 4.4223107569721116e-05, "loss": 0.1912, "step": 4200 }, { "epoch": 17.0, "eval_accuracy": 0.712, "eval_f1": 0.7037256290589013, "eval_loss": 0.9451501369476318, "eval_precision": 0.7113280708081392, "eval_recall": 0.710350440499444, "eval_runtime": 17.5777, "eval_samples_per_second": 28.445, "eval_steps_per_second": 3.584, "step": 4267 }, { "epoch": 17.131474103585656, "grad_norm": 10.550813674926758, "learning_rate": 4.289508632138114e-05, "loss": 0.1724, "step": 4300 }, { "epoch": 17.529880478087648, "grad_norm": 11.269770622253418, "learning_rate": 4.156706507304117e-05, "loss": 0.166, "step": 4400 }, { "epoch": 17.92828685258964, "grad_norm": 11.22702693939209, "learning_rate": 4.02390438247012e-05, "loss": 0.1626, "step": 4500 }, { "epoch": 18.0, "eval_accuracy": 0.71, "eval_f1": 0.6965991557822268, "eval_loss": 1.0229520797729492, "eval_precision": 0.7119117791531585, "eval_recall": 0.7080606500607883, "eval_runtime": 17.3563, "eval_samples_per_second": 28.808, "eval_steps_per_second": 3.63, "step": 4518 }, { "epoch": 18.326693227091635, "grad_norm": 11.750710487365723, "learning_rate": 3.8911022576361225e-05, "loss": 0.146, "step": 4600 }, { "epoch": 18.725099601593627, "grad_norm": 10.37628173828125, "learning_rate": 3.758300132802125e-05, "loss": 0.1524, "step": 4700 }, { "epoch": 19.0, "eval_accuracy": 0.716, "eval_f1": 0.7120852228214192, "eval_loss": 0.9977978467941284, "eval_precision": 0.7210033022533023, "eval_recall": 0.7144085668354911, "eval_runtime": 17.5249, "eval_samples_per_second": 28.531, "eval_steps_per_second": 3.595, "step": 4769 }, { "epoch": 19.12350597609562, "grad_norm": 12.99516487121582, "learning_rate": 3.625498007968128e-05, "loss": 0.1321, "step": 4800 }, { "epoch": 19.52191235059761, "grad_norm": 11.697456359863281, "learning_rate": 3.492695883134131e-05, "loss": 0.1508, "step": 4900 }, { "epoch": 19.9203187250996, "grad_norm": 11.452008247375488, "learning_rate": 3.359893758300133e-05, "loss": 0.1258, "step": 5000 }, { "epoch": 20.0, "eval_accuracy": 0.71, "eval_f1": 0.7074786456937486, "eval_loss": 1.050653338432312, "eval_precision": 0.7215994120996662, "eval_recall": 0.7083547965174904, "eval_runtime": 29.4041, "eval_samples_per_second": 17.004, "eval_steps_per_second": 2.143, "step": 5020 }, { "epoch": 20.318725099601593, "grad_norm": 12.027978897094727, "learning_rate": 3.2270916334661356e-05, "loss": 0.1387, "step": 5100 }, { "epoch": 20.717131474103585, "grad_norm": 13.427599906921387, "learning_rate": 3.094289508632138e-05, "loss": 0.1116, "step": 5200 }, { "epoch": 21.0, "eval_accuracy": 0.724, "eval_f1": 0.7153409174915838, "eval_loss": 1.0689764022827148, "eval_precision": 0.7231692880094706, "eval_recall": 0.722209642011374, "eval_runtime": 17.4789, "eval_samples_per_second": 28.606, "eval_steps_per_second": 3.604, "step": 5271 }, { "epoch": 21.115537848605577, "grad_norm": 12.410263061523438, "learning_rate": 2.961487383798141e-05, "loss": 0.1378, "step": 5300 }, { "epoch": 21.51394422310757, "grad_norm": 12.236252784729004, "learning_rate": 2.8286852589641438e-05, "loss": 0.1181, "step": 5400 }, { "epoch": 21.91235059760956, "grad_norm": 11.362260818481445, "learning_rate": 2.6958831341301462e-05, "loss": 0.1158, "step": 5500 }, { "epoch": 22.0, "eval_accuracy": 0.702, "eval_f1": 0.6967439243675191, "eval_loss": 1.1378962993621826, "eval_precision": 0.7034445997704206, "eval_recall": 0.7004485661440597, "eval_runtime": 17.752, "eval_samples_per_second": 28.166, "eval_steps_per_second": 3.549, "step": 5522 }, { "epoch": 22.310756972111555, "grad_norm": 12.567873001098633, "learning_rate": 2.563081009296149e-05, "loss": 0.1089, "step": 5600 }, { "epoch": 22.709163346613547, "grad_norm": 10.872307777404785, "learning_rate": 2.4302788844621517e-05, "loss": 0.1069, "step": 5700 }, { "epoch": 23.0, "eval_accuracy": 0.722, "eval_f1": 0.7172758119553166, "eval_loss": 1.157360553741455, "eval_precision": 0.727205590108816, "eval_recall": 0.7205813209797584, "eval_runtime": 17.8753, "eval_samples_per_second": 27.972, "eval_steps_per_second": 3.524, "step": 5773 }, { "epoch": 23.10756972111554, "grad_norm": 10.918773651123047, "learning_rate": 2.297476759628154e-05, "loss": 0.1112, "step": 5800 }, { "epoch": 23.50597609561753, "grad_norm": 11.303016662597656, "learning_rate": 2.1646746347941568e-05, "loss": 0.0954, "step": 5900 }, { "epoch": 23.904382470119522, "grad_norm": 10.82700252532959, "learning_rate": 2.0318725099601595e-05, "loss": 0.1089, "step": 6000 }, { "epoch": 24.0, "eval_accuracy": 0.712, "eval_f1": 0.7075426800060708, "eval_loss": 1.1160012483596802, "eval_precision": 0.7194793034050283, "eval_recall": 0.7103697429603638, "eval_runtime": 17.7441, "eval_samples_per_second": 28.178, "eval_steps_per_second": 3.55, "step": 6024 }, { "epoch": 24.302788844621514, "grad_norm": 10.5631742477417, "learning_rate": 1.899070385126162e-05, "loss": 0.0864, "step": 6100 }, { "epoch": 24.701195219123505, "grad_norm": 10.918201446533203, "learning_rate": 1.7662682602921647e-05, "loss": 0.0999, "step": 6200 }, { "epoch": 25.0, "eval_accuracy": 0.716, "eval_f1": 0.7090935362771184, "eval_loss": 1.0727450847625732, "eval_precision": 0.7106284520077623, "eval_recall": 0.7145099767794276, "eval_runtime": 17.8845, "eval_samples_per_second": 27.957, "eval_steps_per_second": 3.523, "step": 6275 }, { "epoch": 25.099601593625497, "grad_norm": 11.402228355407715, "learning_rate": 1.6334661354581674e-05, "loss": 0.1042, "step": 6300 }, { "epoch": 25.49800796812749, "grad_norm": 11.740915298461914, "learning_rate": 1.5006640106241702e-05, "loss": 0.089, "step": 6400 }, { "epoch": 25.89641434262948, "grad_norm": 11.218791961669922, "learning_rate": 1.3678618857901726e-05, "loss": 0.0738, "step": 6500 }, { "epoch": 26.0, "eval_accuracy": 0.706, "eval_f1": 0.699550751079995, "eval_loss": 1.2584666013717651, "eval_precision": 0.7133200179296525, "eval_recall": 0.704105950343699, "eval_runtime": 18.0765, "eval_samples_per_second": 27.66, "eval_steps_per_second": 3.485, "step": 6526 }, { "epoch": 26.294820717131476, "grad_norm": 9.9302339553833, "learning_rate": 1.2350597609561753e-05, "loss": 0.0914, "step": 6600 }, { "epoch": 26.693227091633467, "grad_norm": 11.051177024841309, "learning_rate": 1.102257636122178e-05, "loss": 0.0836, "step": 6700 }, { "epoch": 27.0, "eval_accuracy": 0.718, "eval_f1": 0.7104355302219595, "eval_loss": 1.1709084510803223, "eval_precision": 0.7172922964310544, "eval_recall": 0.7163028008735083, "eval_runtime": 17.5336, "eval_samples_per_second": 28.517, "eval_steps_per_second": 3.593, "step": 6777 }, { "epoch": 27.09163346613546, "grad_norm": 10.727697372436523, "learning_rate": 9.694555112881806e-06, "loss": 0.0986, "step": 6800 }, { "epoch": 27.49003984063745, "grad_norm": 11.817888259887695, "learning_rate": 8.366533864541832e-06, "loss": 0.07, "step": 6900 }, { "epoch": 27.888446215139442, "grad_norm": 10.357769966125488, "learning_rate": 7.03851261620186e-06, "loss": 0.0775, "step": 7000 }, { "epoch": 28.0, "eval_accuracy": 0.722, "eval_f1": 0.7145836341124611, "eval_loss": 1.2422434091567993, "eval_precision": 0.7256025662918439, "eval_recall": 0.720296105512437, "eval_runtime": 17.5709, "eval_samples_per_second": 28.456, "eval_steps_per_second": 3.585, "step": 7028 }, { "epoch": 28.286852589641434, "grad_norm": 10.4796142578125, "learning_rate": 5.710491367861886e-06, "loss": 0.0713, "step": 7100 }, { "epoch": 28.685258964143426, "grad_norm": 9.803996086120605, "learning_rate": 4.382470119521913e-06, "loss": 0.0752, "step": 7200 }, { "epoch": 29.0, "eval_accuracy": 0.728, "eval_f1": 0.7205734767025089, "eval_loss": 1.2145317792892456, "eval_precision": 0.730059540405073, "eval_recall": 0.7263936664880468, "eval_runtime": 17.625, "eval_samples_per_second": 28.369, "eval_steps_per_second": 3.574, "step": 7279 } ], "logging_steps": 100, "max_steps": 7530, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.995893225012062e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }