{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.984984984984985, "global_step": 246, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "learning_rate": 0.00015384615384615385, "loss": 2.9074, "step": 2 }, { "epoch": 0.1, "learning_rate": 0.0003076923076923077, "loss": 2.7922, "step": 4 }, { "epoch": 0.14, "learning_rate": 0.0004615384615384616, "loss": 2.6824, "step": 6 }, { "epoch": 0.19, "learning_rate": 0.0006153846153846154, "loss": 2.6036, "step": 8 }, { "epoch": 0.24, "learning_rate": 0.0007692307692307693, "loss": 2.5524, "step": 10 }, { "epoch": 0.29, "learning_rate": 0.0009230769230769232, "loss": 2.5342, "step": 12 }, { "epoch": 0.34, "learning_rate": 0.0009999545513138964, "loss": 2.5052, "step": 14 }, { "epoch": 0.38, "learning_rate": 0.000999591011397558, "loss": 2.4689, "step": 16 }, { "epoch": 0.43, "learning_rate": 0.000998864195911451, "loss": 2.4487, "step": 18 }, { "epoch": 0.48, "learning_rate": 0.000997774633356497, "loss": 2.4068, "step": 20 }, { "epoch": 0.53, "learning_rate": 0.0009963231160036714, "loss": 2.3813, "step": 22 }, { "epoch": 0.58, "learning_rate": 0.0009945106993179073, "loss": 2.3558, "step": 24 }, { "epoch": 0.62, "learning_rate": 0.000992338701190618, "loss": 2.3425, "step": 26 }, { "epoch": 0.67, "learning_rate": 0.0009898087009813986, "loss": 2.3254, "step": 28 }, { "epoch": 0.72, "learning_rate": 0.000986922538369599, "loss": 2.3128, "step": 30 }, { "epoch": 0.77, "learning_rate": 0.0009836823120166116, "loss": 2.3026, "step": 32 }, { "epoch": 0.82, "learning_rate": 0.0009800903780398356, "loss": 2.3005, "step": 34 }, { "epoch": 0.86, "learning_rate": 0.0009761493482994372, "loss": 2.2828, "step": 36 }, { "epoch": 0.91, "learning_rate": 0.0009718620884991454, "loss": 2.2779, "step": 38 }, { "epoch": 0.96, "learning_rate": 0.0009672317161024677, "loss": 2.2485, "step": 40 }, { "epoch": 0.98, "eval_accuracy": 0.5157865687819605, "eval_loss": 2.1457464694976807, "eval_runtime": 496.7153, "eval_samples_per_second": 1.184, "eval_steps_per_second": 0.296, "step": 41 }, { "epoch": 1.02, "learning_rate": 0.000962261598065839, "loss": 2.9045, "step": 42 }, { "epoch": 1.07, "learning_rate": 0.000956955348390353, "loss": 2.16, "step": 44 }, { "epoch": 1.12, "learning_rate": 0.000951316825493856, "loss": 2.1395, "step": 46 }, { "epoch": 1.17, "learning_rate": 0.0009453501294053137, "loss": 2.1327, "step": 48 }, { "epoch": 1.22, "learning_rate": 0.0009390595987834929, "loss": 2.128, "step": 50 }, { "epoch": 1.26, "learning_rate": 0.000932449807762122, "loss": 2.1251, "step": 52 }, { "epoch": 1.31, "learning_rate": 0.0009255255626238293, "loss": 2.1138, "step": 54 }, { "epoch": 1.36, "learning_rate": 0.0009182918983052741, "loss": 2.118, "step": 56 }, { "epoch": 1.41, "learning_rate": 0.0009107540747360124, "loss": 2.1034, "step": 58 }, { "epoch": 1.46, "learning_rate": 0.0009029175730137609, "loss": 2.1016, "step": 60 }, { "epoch": 1.5, "learning_rate": 0.0008947880914188397, "loss": 2.1027, "step": 62 }, { "epoch": 1.55, "learning_rate": 0.0008863715412706897, "loss": 2.0969, "step": 64 }, { "epoch": 1.6, "learning_rate": 0.0008776740426294817, "loss": 2.0926, "step": 66 }, { "epoch": 1.65, "learning_rate": 0.0008687019198459394, "loss": 2.0946, "step": 68 }, { "epoch": 1.7, "learning_rate": 0.0008594616969626134, "loss": 2.0968, "step": 70 }, { "epoch": 1.74, "learning_rate": 0.0008499600929699501, "loss": 2.0932, "step": 72 }, { "epoch": 1.79, "learning_rate": 0.0008402040169206053, "loss": 2.0813, "step": 74 }, { "epoch": 1.84, "learning_rate": 0.0008302005629055549, "loss": 2.0815, "step": 76 }, { "epoch": 1.89, "learning_rate": 0.0008199570048956552, "loss": 2.0809, "step": 78 }, { "epoch": 1.94, "learning_rate": 0.0008094807914524047, "loss": 2.0704, "step": 80 }, { "epoch": 1.98, "learning_rate": 0.0007987795403117528, "loss": 2.0757, "step": 82 }, { "epoch": 1.98, "eval_accuracy": 0.530352903624793, "eval_loss": 2.058377742767334, "eval_runtime": 493.7321, "eval_samples_per_second": 1.191, "eval_steps_per_second": 0.298, "step": 82 }, { "epoch": 2.05, "learning_rate": 0.0007878610328448947, "loss": 2.615, "step": 84 }, { "epoch": 2.1, "learning_rate": 0.0007767332084000785, "loss": 1.9638, "step": 86 }, { "epoch": 2.14, "learning_rate": 0.0007654041585295399, "loss": 1.979, "step": 88 }, { "epoch": 2.19, "learning_rate": 0.0007538821211057647, "loss": 1.9628, "step": 90 }, { "epoch": 2.24, "learning_rate": 0.0007421754743313514, "loss": 1.9717, "step": 92 }, { "epoch": 2.29, "learning_rate": 0.0007302927306468364, "loss": 1.9625, "step": 94 }, { "epoch": 2.34, "learning_rate": 0.0007182425305409072, "loss": 1.9683, "step": 96 }, { "epoch": 2.38, "learning_rate": 0.0007060336362675068, "loss": 1.9556, "step": 98 }, { "epoch": 2.43, "learning_rate": 0.0006936749254743951, "loss": 1.9565, "step": 100 }, { "epoch": 2.48, "learning_rate": 0.000681175384747805, "loss": 1.9601, "step": 102 }, { "epoch": 2.53, "learning_rate": 0.0006685441030778817, "loss": 1.9612, "step": 104 }, { "epoch": 2.58, "learning_rate": 0.0006557902652496611, "loss": 1.9539, "step": 106 }, { "epoch": 2.62, "learning_rate": 0.0006429231451643907, "loss": 1.9563, "step": 108 }, { "epoch": 2.67, "learning_rate": 0.0006299520990960496, "loss": 1.9597, "step": 110 }, { "epoch": 2.72, "learning_rate": 0.000616886558887973, "loss": 1.9625, "step": 112 }, { "epoch": 2.77, "learning_rate": 0.0006037360250945242, "loss": 1.965, "step": 114 }, { "epoch": 2.82, "learning_rate": 0.0005905100600728067, "loss": 1.9588, "step": 116 }, { "epoch": 2.86, "learning_rate": 0.0005772182810294344, "loss": 1.965, "step": 118 }, { "epoch": 2.91, "learning_rate": 0.0005638703530274187, "loss": 1.9595, "step": 120 }, { "epoch": 2.96, "learning_rate": 0.0005504759819582581, "loss": 1.966, "step": 122 }, { "epoch": 2.98, "eval_accuracy": 0.5375845352803879, "eval_loss": 2.0210342407226562, "eval_runtime": 475.4248, "eval_samples_per_second": 1.237, "eval_steps_per_second": 0.309, "step": 123 }, { "epoch": 3.02, "learning_rate": 0.000537044907484338, "loss": 2.5101, "step": 124 }, { "epoch": 3.07, "learning_rate": 0.0005235868959567755, "loss": 1.8782, "step": 126 }, { "epoch": 3.12, "learning_rate": 0.0005101117333138558, "loss": 1.8556, "step": 128 }, { "epoch": 3.17, "learning_rate": 0.0004966292179652285, "loss": 1.8622, "step": 130 }, { "epoch": 3.22, "learning_rate": 0.00048314915366703335, "loss": 1.8668, "step": 132 }, { "epoch": 3.26, "learning_rate": 0.0004696813423931381, "loss": 1.873, "step": 134 }, { "epoch": 3.31, "learning_rate": 0.000456235577207674, "loss": 1.8694, "step": 136 }, { "epoch": 3.36, "learning_rate": 0.00044282163514404915, "loss": 1.8742, "step": 138 }, { "epoch": 3.41, "learning_rate": 0.00042944927009561784, "loss": 1.8694, "step": 140 }, { "epoch": 3.46, "learning_rate": 0.00041612820572317757, "loss": 1.8569, "step": 142 }, { "epoch": 3.5, "learning_rate": 0.0004028681283844471, "loss": 1.8619, "step": 144 }, { "epoch": 3.55, "learning_rate": 0.0003896786800906733, "loss": 1.8734, "step": 146 }, { "epoch": 3.6, "learning_rate": 0.00037656945149547956, "loss": 1.8677, "step": 148 }, { "epoch": 3.65, "learning_rate": 0.00036354997492106257, "loss": 1.872, "step": 150 }, { "epoch": 3.7, "learning_rate": 0.00035062971742680246, "loss": 1.847, "step": 152 }, { "epoch": 3.74, "learning_rate": 0.00033781807392532895, "loss": 1.8733, "step": 154 }, { "epoch": 3.79, "learning_rate": 0.00032512436035104966, "loss": 1.8657, "step": 156 }, { "epoch": 3.84, "learning_rate": 0.00031255780688610506, "loss": 1.8581, "step": 158 }, { "epoch": 3.89, "learning_rate": 0.00030012755124868095, "loss": 1.8672, "step": 160 }, { "epoch": 3.94, "learning_rate": 0.00028784263204855175, "loss": 1.8585, "step": 162 }, { "epoch": 3.98, "learning_rate": 0.00027571198221469395, "loss": 1.8602, "step": 164 }, { "epoch": 3.98, "eval_accuracy": 0.5422011424315571, "eval_loss": 2.0011844635009766, "eval_runtime": 493.1263, "eval_samples_per_second": 1.192, "eval_steps_per_second": 0.298, "step": 164 }, { "epoch": 4.05, "learning_rate": 0.0002637444224997421, "loss": 2.4006, "step": 166 }, { "epoch": 4.1, "learning_rate": 0.00025194865506601504, "loss": 1.8118, "step": 168 }, { "epoch": 4.14, "learning_rate": 0.00024033325715777377, "loss": 1.7888, "step": 170 }, { "epoch": 4.19, "learning_rate": 0.00022890667486431295, "loss": 1.798, "step": 172 }, { "epoch": 4.24, "learning_rate": 0.00021767721697842242, "loss": 1.8052, "step": 174 }, { "epoch": 4.29, "learning_rate": 0.00020665304895468113, "loss": 1.8054, "step": 176 }, { "epoch": 4.34, "learning_rate": 0.0001958421869719807, "loss": 1.7966, "step": 178 }, { "epoch": 4.38, "learning_rate": 0.00018525249210459344, "loss": 1.8049, "step": 180 }, { "epoch": 4.43, "learning_rate": 0.00017489166460602496, "loss": 1.7955, "step": 182 }, { "epoch": 4.48, "learning_rate": 0.0001647672383098045, "loss": 1.8062, "step": 184 }, { "epoch": 4.53, "learning_rate": 0.00015488657515129006, "loss": 1.8132, "step": 186 }, { "epoch": 4.58, "learning_rate": 0.0001452568598144668, "loss": 1.8027, "step": 188 }, { "epoch": 4.62, "learning_rate": 0.00013588509450763282, "loss": 1.8066, "step": 190 }, { "epoch": 4.67, "learning_rate": 0.0001267780938717722, "loss": 1.7989, "step": 192 }, { "epoch": 4.72, "learning_rate": 0.00011794248002531643, "loss": 1.7966, "step": 194 }, { "epoch": 4.77, "learning_rate": 0.00010938467774889881, "loss": 1.8074, "step": 196 }, { "epoch": 4.82, "learning_rate": 0.00010111090981359961, "loss": 1.8073, "step": 198 }, { "epoch": 4.86, "learning_rate": 9.312719245608486e-05, "loss": 1.8022, "step": 200 }, { "epoch": 4.91, "learning_rate": 8.54393310039246e-05, "loss": 1.8014, "step": 202 }, { "epoch": 4.96, "learning_rate": 7.805291565427064e-05, "loss": 1.8089, "step": 204 }, { "epoch": 4.98, "eval_accuracy": 0.5435859583325021, "eval_loss": 1.9977257251739502, "eval_runtime": 497.2866, "eval_samples_per_second": 1.182, "eval_steps_per_second": 0.296, "step": 205 }, { "epoch": 5.02, "learning_rate": 7.097331740896996e-05, "loss": 2.3557, "step": 206 }, { "epoch": 5.07, "learning_rate": 6.420568416906064e-05, "loss": 1.7835, "step": 208 }, { "epoch": 5.12, "learning_rate": 5.775493699149753e-05, "loss": 1.7641, "step": 210 }, { "epoch": 5.17, "learning_rate": 5.16257665108254e-05, "loss": 1.7754, "step": 212 }, { "epoch": 5.22, "learning_rate": 4.582262952840355e-05, "loss": 1.7765, "step": 214 }, { "epoch": 5.26, "learning_rate": 4.034974577166023e-05, "loss": 1.7732, "step": 216 }, { "epoch": 5.31, "learning_rate": 3.521109482573515e-05, "loss": 1.7663, "step": 218 }, { "epoch": 5.36, "learning_rate": 3.0410413239740975e-05, "loss": 1.7779, "step": 220 }, { "epoch": 5.41, "learning_rate": 2.5951191809746145e-05, "loss": 1.7739, "step": 222 }, { "epoch": 5.46, "learning_rate": 2.1836673040456943e-05, "loss": 1.7602, "step": 224 }, { "epoch": 5.5, "learning_rate": 1.8069848787443555e-05, "loss": 1.7773, "step": 226 }, { "epoch": 5.55, "learning_rate": 1.4653458081624272e-05, "loss": 1.7841, "step": 228 }, { "epoch": 5.6, "learning_rate": 1.1589985137590519e-05, "loss": 1.7746, "step": 230 }, { "epoch": 5.65, "learning_rate": 8.881657547219868e-06, "loss": 1.7801, "step": 232 }, { "epoch": 5.7, "learning_rate": 6.530444659892443e-06, "loss": 1.7639, "step": 234 }, { "epoch": 5.74, "learning_rate": 4.538056150485858e-06, "loss": 1.7771, "step": 236 }, { "epoch": 5.79, "learning_rate": 2.905940776192384e-06, "loss": 1.7724, "step": 238 }, { "epoch": 5.84, "learning_rate": 1.6352853230609532e-06, "loss": 1.7876, "step": 240 }, { "epoch": 5.89, "learning_rate": 7.270137430306356e-07, "loss": 1.7718, "step": 242 }, { "epoch": 5.94, "learning_rate": 1.817864820827242e-07, "loss": 1.779, "step": 244 }, { "epoch": 5.98, "learning_rate": 0.0, "loss": 1.7698, "step": 246 }, { "epoch": 5.98, "eval_accuracy": 0.544073054441718, "eval_loss": 1.9982993602752686, "eval_runtime": 495.216, "eval_samples_per_second": 1.187, "eval_steps_per_second": 0.297, "step": 246 }, { "before_init_mem_cpu": 699817984, "before_init_mem_gpu": 0, "epoch": 5.98, "init_mem_cpu_alloc_delta": -283938816, "init_mem_cpu_peaked_delta": 332918784, "init_mem_gpu_alloc_delta": 334744576, "init_mem_gpu_peaked_delta": 0, "step": 246, "total_flos": 8328311331618816.0, "train_loss": 2.0183091745143984, "train_mem_cpu_alloc_delta": 1155149824, "train_mem_cpu_peaked_delta": 308719616, "train_mem_gpu_alloc_delta": 988995584, "train_mem_gpu_peaked_delta": 4500811776, "train_runtime": 35067.5673, "train_samples_per_second": 0.911, "train_steps_per_second": 0.007 } ], "max_steps": 246, "num_train_epochs": 6, "total_flos": 8328311331618816.0, "trial_name": null, "trial_params": null }