{ "best_metric": 0.6654545454545454, "best_model_checkpoint": "BEiT-RD-DA\\checkpoint-2112", "epoch": 40.0, "eval_steps": 500, "global_step": 3840, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1, "learning_rate": 1.3020833333333335e-06, "loss": 1.6093, "step": 10 }, { "epoch": 0.21, "learning_rate": 2.604166666666667e-06, "loss": 1.6082, "step": 20 }, { "epoch": 0.31, "learning_rate": 3.90625e-06, "loss": 1.6045, "step": 30 }, { "epoch": 0.42, "learning_rate": 5.208333333333334e-06, "loss": 1.5939, "step": 40 }, { "epoch": 0.52, "learning_rate": 6.510416666666667e-06, "loss": 1.576, "step": 50 }, { "epoch": 0.62, "learning_rate": 7.8125e-06, "loss": 1.5501, "step": 60 }, { "epoch": 0.73, "learning_rate": 9.114583333333334e-06, "loss": 1.5116, "step": 70 }, { "epoch": 0.83, "learning_rate": 1.0416666666666668e-05, "loss": 1.462, "step": 80 }, { "epoch": 0.94, "learning_rate": 1.171875e-05, "loss": 1.4123, "step": 90 }, { "epoch": 1.0, "eval_accuracy": 0.49272727272727274, "eval_loss": 1.4099408388137817, "eval_runtime": 7.0436, "eval_samples_per_second": 78.085, "eval_steps_per_second": 4.969, "step": 96 }, { "epoch": 1.04, "learning_rate": 1.3020833333333334e-05, "loss": 1.3572, "step": 100 }, { "epoch": 1.15, "learning_rate": 1.4322916666666666e-05, "loss": 1.2997, "step": 110 }, { "epoch": 1.25, "learning_rate": 1.5625e-05, "loss": 1.2359, "step": 120 }, { "epoch": 1.35, "learning_rate": 1.6927083333333336e-05, "loss": 1.1928, "step": 130 }, { "epoch": 1.46, "learning_rate": 1.8229166666666668e-05, "loss": 1.1188, "step": 140 }, { "epoch": 1.56, "learning_rate": 1.953125e-05, "loss": 1.0731, "step": 150 }, { "epoch": 1.67, "learning_rate": 2.0833333333333336e-05, "loss": 1.0396, "step": 160 }, { "epoch": 1.77, "learning_rate": 2.2135416666666668e-05, "loss": 0.9933, "step": 170 }, { "epoch": 1.88, "learning_rate": 2.34375e-05, "loss": 0.9498, "step": 180 }, { "epoch": 1.98, "learning_rate": 2.4739583333333336e-05, "loss": 0.9503, "step": 190 }, { "epoch": 2.0, "eval_accuracy": 0.49272727272727274, "eval_loss": 1.8851675987243652, "eval_runtime": 3.6814, "eval_samples_per_second": 149.401, "eval_steps_per_second": 9.507, "step": 192 }, { "epoch": 2.08, "learning_rate": 2.604166666666667e-05, "loss": 0.9379, "step": 200 }, { "epoch": 2.19, "learning_rate": 2.734375e-05, "loss": 0.8963, "step": 210 }, { "epoch": 2.29, "learning_rate": 2.8645833333333333e-05, "loss": 0.9422, "step": 220 }, { "epoch": 2.4, "learning_rate": 2.994791666666667e-05, "loss": 0.904, "step": 230 }, { "epoch": 2.5, "learning_rate": 3.125e-05, "loss": 0.9131, "step": 240 }, { "epoch": 2.6, "learning_rate": 3.255208333333333e-05, "loss": 0.9078, "step": 250 }, { "epoch": 2.71, "learning_rate": 3.385416666666667e-05, "loss": 0.851, "step": 260 }, { "epoch": 2.81, "learning_rate": 3.5156250000000004e-05, "loss": 0.9054, "step": 270 }, { "epoch": 2.92, "learning_rate": 3.6458333333333336e-05, "loss": 0.8284, "step": 280 }, { "epoch": 3.0, "eval_accuracy": 0.5072727272727273, "eval_loss": 2.1701955795288086, "eval_runtime": 3.6684, "eval_samples_per_second": 149.93, "eval_steps_per_second": 9.541, "step": 288 }, { "epoch": 3.02, "learning_rate": 3.776041666666667e-05, "loss": 0.7817, "step": 290 }, { "epoch": 3.12, "learning_rate": 3.90625e-05, "loss": 0.8264, "step": 300 }, { "epoch": 3.23, "learning_rate": 4.036458333333333e-05, "loss": 0.8271, "step": 310 }, { "epoch": 3.33, "learning_rate": 4.166666666666667e-05, "loss": 0.8185, "step": 320 }, { "epoch": 3.44, "learning_rate": 4.2968750000000004e-05, "loss": 0.7828, "step": 330 }, { "epoch": 3.54, "learning_rate": 4.4270833333333337e-05, "loss": 0.7351, "step": 340 }, { "epoch": 3.65, "learning_rate": 4.557291666666667e-05, "loss": 0.7248, "step": 350 }, { "epoch": 3.75, "learning_rate": 4.6875e-05, "loss": 0.7981, "step": 360 }, { "epoch": 3.85, "learning_rate": 4.817708333333333e-05, "loss": 0.8329, "step": 370 }, { "epoch": 3.96, "learning_rate": 4.947916666666667e-05, "loss": 0.7677, "step": 380 }, { "epoch": 4.0, "eval_accuracy": 0.5345454545454545, "eval_loss": 2.0407912731170654, "eval_runtime": 4.124, "eval_samples_per_second": 133.366, "eval_steps_per_second": 8.487, "step": 384 }, { "epoch": 4.06, "learning_rate": 4.991319444444445e-05, "loss": 0.7784, "step": 390 }, { "epoch": 4.17, "learning_rate": 4.976851851851852e-05, "loss": 0.7742, "step": 400 }, { "epoch": 4.27, "learning_rate": 4.96238425925926e-05, "loss": 0.7786, "step": 410 }, { "epoch": 4.38, "learning_rate": 4.947916666666667e-05, "loss": 0.7264, "step": 420 }, { "epoch": 4.48, "learning_rate": 4.933449074074074e-05, "loss": 0.752, "step": 430 }, { "epoch": 4.58, "learning_rate": 4.9189814814814815e-05, "loss": 0.7815, "step": 440 }, { "epoch": 4.69, "learning_rate": 4.904513888888889e-05, "loss": 0.7081, "step": 450 }, { "epoch": 4.79, "learning_rate": 4.8900462962962965e-05, "loss": 0.7476, "step": 460 }, { "epoch": 4.9, "learning_rate": 4.875578703703704e-05, "loss": 0.6773, "step": 470 }, { "epoch": 5.0, "learning_rate": 4.8611111111111115e-05, "loss": 0.788, "step": 480 }, { "epoch": 5.0, "eval_accuracy": 0.5127272727272727, "eval_loss": 2.7991154193878174, "eval_runtime": 3.8459, "eval_samples_per_second": 143.009, "eval_steps_per_second": 9.101, "step": 480 }, { "epoch": 5.1, "learning_rate": 4.846643518518519e-05, "loss": 0.7058, "step": 490 }, { "epoch": 5.21, "learning_rate": 4.8321759259259265e-05, "loss": 0.7083, "step": 500 }, { "epoch": 5.31, "learning_rate": 4.817708333333333e-05, "loss": 0.6479, "step": 510 }, { "epoch": 5.42, "learning_rate": 4.803240740740741e-05, "loss": 0.658, "step": 520 }, { "epoch": 5.52, "learning_rate": 4.788773148148148e-05, "loss": 0.7756, "step": 530 }, { "epoch": 5.62, "learning_rate": 4.774305555555556e-05, "loss": 0.6294, "step": 540 }, { "epoch": 5.73, "learning_rate": 4.759837962962963e-05, "loss": 0.6727, "step": 550 }, { "epoch": 5.83, "learning_rate": 4.745370370370371e-05, "loss": 0.6476, "step": 560 }, { "epoch": 5.94, "learning_rate": 4.730902777777778e-05, "loss": 0.5822, "step": 570 }, { "epoch": 6.0, "eval_accuracy": 0.5636363636363636, "eval_loss": 2.0950703620910645, "eval_runtime": 3.9745, "eval_samples_per_second": 138.384, "eval_steps_per_second": 8.806, "step": 576 }, { "epoch": 6.04, "learning_rate": 4.716435185185186e-05, "loss": 0.5656, "step": 580 }, { "epoch": 6.15, "learning_rate": 4.7019675925925926e-05, "loss": 0.6338, "step": 590 }, { "epoch": 6.25, "learning_rate": 4.6875e-05, "loss": 0.62, "step": 600 }, { "epoch": 6.35, "learning_rate": 4.6730324074074076e-05, "loss": 0.5603, "step": 610 }, { "epoch": 6.46, "learning_rate": 4.658564814814815e-05, "loss": 0.5726, "step": 620 }, { "epoch": 6.56, "learning_rate": 4.6440972222222226e-05, "loss": 0.5767, "step": 630 }, { "epoch": 6.67, "learning_rate": 4.62962962962963e-05, "loss": 0.5546, "step": 640 }, { "epoch": 6.77, "learning_rate": 4.6151620370370376e-05, "loss": 0.5003, "step": 650 }, { "epoch": 6.88, "learning_rate": 4.6006944444444444e-05, "loss": 0.564, "step": 660 }, { "epoch": 6.98, "learning_rate": 4.586226851851852e-05, "loss": 0.5172, "step": 670 }, { "epoch": 7.0, "eval_accuracy": 0.5363636363636364, "eval_loss": 2.5976808071136475, "eval_runtime": 4.1525, "eval_samples_per_second": 132.451, "eval_steps_per_second": 8.429, "step": 672 }, { "epoch": 7.08, "learning_rate": 4.5717592592592594e-05, "loss": 0.4963, "step": 680 }, { "epoch": 7.19, "learning_rate": 4.557291666666667e-05, "loss": 0.5223, "step": 690 }, { "epoch": 7.29, "learning_rate": 4.5428240740740744e-05, "loss": 0.5279, "step": 700 }, { "epoch": 7.4, "learning_rate": 4.528356481481482e-05, "loss": 0.4685, "step": 710 }, { "epoch": 7.5, "learning_rate": 4.5138888888888894e-05, "loss": 0.4652, "step": 720 }, { "epoch": 7.6, "learning_rate": 4.499421296296297e-05, "loss": 0.3945, "step": 730 }, { "epoch": 7.71, "learning_rate": 4.484953703703704e-05, "loss": 0.4617, "step": 740 }, { "epoch": 7.81, "learning_rate": 4.470486111111111e-05, "loss": 0.471, "step": 750 }, { "epoch": 7.92, "learning_rate": 4.456018518518519e-05, "loss": 0.4615, "step": 760 }, { "epoch": 8.0, "eval_accuracy": 0.58, "eval_loss": 2.096754789352417, "eval_runtime": 4.078, "eval_samples_per_second": 134.871, "eval_steps_per_second": 8.583, "step": 768 }, { "epoch": 8.02, "learning_rate": 4.441550925925926e-05, "loss": 0.4487, "step": 770 }, { "epoch": 8.12, "learning_rate": 4.4270833333333337e-05, "loss": 0.4692, "step": 780 }, { "epoch": 8.23, "learning_rate": 4.412615740740741e-05, "loss": 0.4161, "step": 790 }, { "epoch": 8.33, "learning_rate": 4.3981481481481486e-05, "loss": 0.4709, "step": 800 }, { "epoch": 8.44, "learning_rate": 4.383680555555556e-05, "loss": 0.4199, "step": 810 }, { "epoch": 8.54, "learning_rate": 4.369212962962963e-05, "loss": 0.4416, "step": 820 }, { "epoch": 8.65, "learning_rate": 4.3547453703703704e-05, "loss": 0.4253, "step": 830 }, { "epoch": 8.75, "learning_rate": 4.340277777777778e-05, "loss": 0.3953, "step": 840 }, { "epoch": 8.85, "learning_rate": 4.3258101851851854e-05, "loss": 0.4495, "step": 850 }, { "epoch": 8.96, "learning_rate": 4.311342592592593e-05, "loss": 0.3672, "step": 860 }, { "epoch": 9.0, "eval_accuracy": 0.5436363636363636, "eval_loss": 2.8535282611846924, "eval_runtime": 3.6989, "eval_samples_per_second": 148.694, "eval_steps_per_second": 9.462, "step": 864 }, { "epoch": 9.06, "learning_rate": 4.2968750000000004e-05, "loss": 0.3812, "step": 870 }, { "epoch": 9.17, "learning_rate": 4.282407407407408e-05, "loss": 0.3683, "step": 880 }, { "epoch": 9.27, "learning_rate": 4.267939814814815e-05, "loss": 0.4462, "step": 890 }, { "epoch": 9.38, "learning_rate": 4.253472222222222e-05, "loss": 0.31, "step": 900 }, { "epoch": 9.48, "learning_rate": 4.23900462962963e-05, "loss": 0.3963, "step": 910 }, { "epoch": 9.58, "learning_rate": 4.224537037037037e-05, "loss": 0.3393, "step": 920 }, { "epoch": 9.69, "learning_rate": 4.210069444444445e-05, "loss": 0.3853, "step": 930 }, { "epoch": 9.79, "learning_rate": 4.195601851851852e-05, "loss": 0.4014, "step": 940 }, { "epoch": 9.9, "learning_rate": 4.18113425925926e-05, "loss": 0.3931, "step": 950 }, { "epoch": 10.0, "learning_rate": 4.166666666666667e-05, "loss": 0.379, "step": 960 }, { "epoch": 10.0, "eval_accuracy": 0.5381818181818182, "eval_loss": 2.9514999389648438, "eval_runtime": 3.6769, "eval_samples_per_second": 149.583, "eval_steps_per_second": 9.519, "step": 960 }, { "epoch": 10.1, "learning_rate": 4.152199074074074e-05, "loss": 0.3544, "step": 970 }, { "epoch": 10.21, "learning_rate": 4.1377314814814815e-05, "loss": 0.3334, "step": 980 }, { "epoch": 10.31, "learning_rate": 4.123263888888889e-05, "loss": 0.2906, "step": 990 }, { "epoch": 10.42, "learning_rate": 4.1087962962962965e-05, "loss": 0.3518, "step": 1000 }, { "epoch": 10.52, "learning_rate": 4.094328703703704e-05, "loss": 0.3341, "step": 1010 }, { "epoch": 10.62, "learning_rate": 4.0798611111111115e-05, "loss": 0.3725, "step": 1020 }, { "epoch": 10.73, "learning_rate": 4.065393518518519e-05, "loss": 0.3435, "step": 1030 }, { "epoch": 10.83, "learning_rate": 4.0509259259259265e-05, "loss": 0.3219, "step": 1040 }, { "epoch": 10.94, "learning_rate": 4.036458333333333e-05, "loss": 0.3301, "step": 1050 }, { "epoch": 11.0, "eval_accuracy": 0.5581818181818182, "eval_loss": 2.7200424671173096, "eval_runtime": 3.7269, "eval_samples_per_second": 147.576, "eval_steps_per_second": 9.391, "step": 1056 }, { "epoch": 11.04, "learning_rate": 4.021990740740741e-05, "loss": 0.3121, "step": 1060 }, { "epoch": 11.15, "learning_rate": 4.007523148148148e-05, "loss": 0.3416, "step": 1070 }, { "epoch": 11.25, "learning_rate": 3.993055555555556e-05, "loss": 0.2896, "step": 1080 }, { "epoch": 11.35, "learning_rate": 3.978587962962963e-05, "loss": 0.3542, "step": 1090 }, { "epoch": 11.46, "learning_rate": 3.964120370370371e-05, "loss": 0.3516, "step": 1100 }, { "epoch": 11.56, "learning_rate": 3.949652777777778e-05, "loss": 0.295, "step": 1110 }, { "epoch": 11.67, "learning_rate": 3.935185185185186e-05, "loss": 0.3252, "step": 1120 }, { "epoch": 11.77, "learning_rate": 3.9207175925925926e-05, "loss": 0.2624, "step": 1130 }, { "epoch": 11.88, "learning_rate": 3.90625e-05, "loss": 0.329, "step": 1140 }, { "epoch": 11.98, "learning_rate": 3.8917824074074076e-05, "loss": 0.2786, "step": 1150 }, { "epoch": 12.0, "eval_accuracy": 0.6272727272727273, "eval_loss": 1.8999561071395874, "eval_runtime": 3.6939, "eval_samples_per_second": 148.893, "eval_steps_per_second": 9.475, "step": 1152 }, { "epoch": 12.08, "learning_rate": 3.877314814814815e-05, "loss": 0.2833, "step": 1160 }, { "epoch": 12.19, "learning_rate": 3.8628472222222226e-05, "loss": 0.2536, "step": 1170 }, { "epoch": 12.29, "learning_rate": 3.84837962962963e-05, "loss": 0.3003, "step": 1180 }, { "epoch": 12.4, "learning_rate": 3.8339120370370376e-05, "loss": 0.2713, "step": 1190 }, { "epoch": 12.5, "learning_rate": 3.8194444444444444e-05, "loss": 0.2854, "step": 1200 }, { "epoch": 12.6, "learning_rate": 3.804976851851852e-05, "loss": 0.2695, "step": 1210 }, { "epoch": 12.71, "learning_rate": 3.7905092592592594e-05, "loss": 0.2702, "step": 1220 }, { "epoch": 12.81, "learning_rate": 3.776041666666667e-05, "loss": 0.3312, "step": 1230 }, { "epoch": 12.92, "learning_rate": 3.7615740740740744e-05, "loss": 0.2746, "step": 1240 }, { "epoch": 13.0, "eval_accuracy": 0.5363636363636364, "eval_loss": 3.1768321990966797, "eval_runtime": 3.7274, "eval_samples_per_second": 147.555, "eval_steps_per_second": 9.39, "step": 1248 }, { "epoch": 13.02, "learning_rate": 3.747106481481482e-05, "loss": 0.2632, "step": 1250 }, { "epoch": 13.12, "learning_rate": 3.7326388888888893e-05, "loss": 0.2495, "step": 1260 }, { "epoch": 13.23, "learning_rate": 3.718171296296297e-05, "loss": 0.2761, "step": 1270 }, { "epoch": 13.33, "learning_rate": 3.7037037037037037e-05, "loss": 0.2879, "step": 1280 }, { "epoch": 13.44, "learning_rate": 3.689236111111111e-05, "loss": 0.2658, "step": 1290 }, { "epoch": 13.54, "learning_rate": 3.6747685185185186e-05, "loss": 0.2652, "step": 1300 }, { "epoch": 13.65, "learning_rate": 3.660300925925926e-05, "loss": 0.2604, "step": 1310 }, { "epoch": 13.75, "learning_rate": 3.6458333333333336e-05, "loss": 0.2468, "step": 1320 }, { "epoch": 13.85, "learning_rate": 3.631365740740741e-05, "loss": 0.2552, "step": 1330 }, { "epoch": 13.96, "learning_rate": 3.6168981481481486e-05, "loss": 0.2298, "step": 1340 }, { "epoch": 14.0, "eval_accuracy": 0.5527272727272727, "eval_loss": 3.100266218185425, "eval_runtime": 3.7819, "eval_samples_per_second": 145.429, "eval_steps_per_second": 9.255, "step": 1344 }, { "epoch": 14.06, "learning_rate": 3.602430555555556e-05, "loss": 0.2423, "step": 1350 }, { "epoch": 14.17, "learning_rate": 3.587962962962963e-05, "loss": 0.2102, "step": 1360 }, { "epoch": 14.27, "learning_rate": 3.5734953703703704e-05, "loss": 0.207, "step": 1370 }, { "epoch": 14.38, "learning_rate": 3.559027777777778e-05, "loss": 0.2548, "step": 1380 }, { "epoch": 14.48, "learning_rate": 3.5445601851851854e-05, "loss": 0.2428, "step": 1390 }, { "epoch": 14.58, "learning_rate": 3.530092592592593e-05, "loss": 0.2344, "step": 1400 }, { "epoch": 14.69, "learning_rate": 3.5156250000000004e-05, "loss": 0.2847, "step": 1410 }, { "epoch": 14.79, "learning_rate": 3.501157407407408e-05, "loss": 0.2296, "step": 1420 }, { "epoch": 14.9, "learning_rate": 3.486689814814815e-05, "loss": 0.2894, "step": 1430 }, { "epoch": 15.0, "learning_rate": 3.472222222222222e-05, "loss": 0.2013, "step": 1440 }, { "epoch": 15.0, "eval_accuracy": 0.6181818181818182, "eval_loss": 2.344068765640259, "eval_runtime": 3.8014, "eval_samples_per_second": 144.683, "eval_steps_per_second": 9.207, "step": 1440 }, { "epoch": 15.1, "learning_rate": 3.45775462962963e-05, "loss": 0.2093, "step": 1450 }, { "epoch": 15.21, "learning_rate": 3.443287037037037e-05, "loss": 0.2092, "step": 1460 }, { "epoch": 15.31, "learning_rate": 3.428819444444444e-05, "loss": 0.1995, "step": 1470 }, { "epoch": 15.42, "learning_rate": 3.414351851851852e-05, "loss": 0.2162, "step": 1480 }, { "epoch": 15.52, "learning_rate": 3.39988425925926e-05, "loss": 0.2444, "step": 1490 }, { "epoch": 15.62, "learning_rate": 3.385416666666667e-05, "loss": 0.1702, "step": 1500 }, { "epoch": 15.73, "learning_rate": 3.370949074074074e-05, "loss": 0.28, "step": 1510 }, { "epoch": 15.83, "learning_rate": 3.3564814814814815e-05, "loss": 0.2303, "step": 1520 }, { "epoch": 15.94, "learning_rate": 3.342013888888889e-05, "loss": 0.2225, "step": 1530 }, { "epoch": 16.0, "eval_accuracy": 0.5709090909090909, "eval_loss": 3.0213537216186523, "eval_runtime": 3.7244, "eval_samples_per_second": 147.674, "eval_steps_per_second": 9.397, "step": 1536 }, { "epoch": 16.04, "learning_rate": 3.3275462962962965e-05, "loss": 0.2012, "step": 1540 }, { "epoch": 16.15, "learning_rate": 3.313078703703704e-05, "loss": 0.1761, "step": 1550 }, { "epoch": 16.25, "learning_rate": 3.2986111111111115e-05, "loss": 0.229, "step": 1560 }, { "epoch": 16.35, "learning_rate": 3.284143518518519e-05, "loss": 0.2114, "step": 1570 }, { "epoch": 16.46, "learning_rate": 3.2696759259259265e-05, "loss": 0.2284, "step": 1580 }, { "epoch": 16.56, "learning_rate": 3.255208333333333e-05, "loss": 0.2137, "step": 1590 }, { "epoch": 16.67, "learning_rate": 3.240740740740741e-05, "loss": 0.2902, "step": 1600 }, { "epoch": 16.77, "learning_rate": 3.226273148148148e-05, "loss": 0.2273, "step": 1610 }, { "epoch": 16.88, "learning_rate": 3.211805555555556e-05, "loss": 0.182, "step": 1620 }, { "epoch": 16.98, "learning_rate": 3.197337962962963e-05, "loss": 0.2229, "step": 1630 }, { "epoch": 17.0, "eval_accuracy": 0.6163636363636363, "eval_loss": 2.067636728286743, "eval_runtime": 3.8554, "eval_samples_per_second": 142.656, "eval_steps_per_second": 9.078, "step": 1632 }, { "epoch": 17.08, "learning_rate": 3.182870370370371e-05, "loss": 0.1704, "step": 1640 }, { "epoch": 17.19, "learning_rate": 3.168402777777778e-05, "loss": 0.2184, "step": 1650 }, { "epoch": 17.29, "learning_rate": 3.153935185185186e-05, "loss": 0.1662, "step": 1660 }, { "epoch": 17.4, "learning_rate": 3.1394675925925926e-05, "loss": 0.1965, "step": 1670 }, { "epoch": 17.5, "learning_rate": 3.125e-05, "loss": 0.1384, "step": 1680 }, { "epoch": 17.6, "learning_rate": 3.1105324074074076e-05, "loss": 0.213, "step": 1690 }, { "epoch": 17.71, "learning_rate": 3.0960648148148144e-05, "loss": 0.2053, "step": 1700 }, { "epoch": 17.81, "learning_rate": 3.0815972222222225e-05, "loss": 0.1938, "step": 1710 }, { "epoch": 17.92, "learning_rate": 3.06712962962963e-05, "loss": 0.2024, "step": 1720 }, { "epoch": 18.0, "eval_accuracy": 0.5672727272727273, "eval_loss": 2.6478159427642822, "eval_runtime": 3.8359, "eval_samples_per_second": 143.382, "eval_steps_per_second": 9.124, "step": 1728 }, { "epoch": 18.02, "learning_rate": 3.0526620370370375e-05, "loss": 0.1643, "step": 1730 }, { "epoch": 18.12, "learning_rate": 3.0381944444444444e-05, "loss": 0.2376, "step": 1740 }, { "epoch": 18.23, "learning_rate": 3.023726851851852e-05, "loss": 0.2088, "step": 1750 }, { "epoch": 18.33, "learning_rate": 3.0092592592592593e-05, "loss": 0.181, "step": 1760 }, { "epoch": 18.44, "learning_rate": 2.994791666666667e-05, "loss": 0.19, "step": 1770 }, { "epoch": 18.54, "learning_rate": 2.980324074074074e-05, "loss": 0.2008, "step": 1780 }, { "epoch": 18.65, "learning_rate": 2.9658564814814815e-05, "loss": 0.2109, "step": 1790 }, { "epoch": 18.75, "learning_rate": 2.951388888888889e-05, "loss": 0.1904, "step": 1800 }, { "epoch": 18.85, "learning_rate": 2.9369212962962965e-05, "loss": 0.2136, "step": 1810 }, { "epoch": 18.96, "learning_rate": 2.9224537037037036e-05, "loss": 0.1401, "step": 1820 }, { "epoch": 19.0, "eval_accuracy": 0.5636363636363636, "eval_loss": 2.8951961994171143, "eval_runtime": 3.8754, "eval_samples_per_second": 141.92, "eval_steps_per_second": 9.031, "step": 1824 }, { "epoch": 19.06, "learning_rate": 2.907986111111111e-05, "loss": 0.1974, "step": 1830 }, { "epoch": 19.17, "learning_rate": 2.8935185185185186e-05, "loss": 0.1752, "step": 1840 }, { "epoch": 19.27, "learning_rate": 2.879050925925926e-05, "loss": 0.1814, "step": 1850 }, { "epoch": 19.38, "learning_rate": 2.8645833333333333e-05, "loss": 0.1853, "step": 1860 }, { "epoch": 19.48, "learning_rate": 2.8501157407407408e-05, "loss": 0.1992, "step": 1870 }, { "epoch": 19.58, "learning_rate": 2.8356481481481483e-05, "loss": 0.1774, "step": 1880 }, { "epoch": 19.69, "learning_rate": 2.821180555555556e-05, "loss": 0.1836, "step": 1890 }, { "epoch": 19.79, "learning_rate": 2.806712962962963e-05, "loss": 0.1556, "step": 1900 }, { "epoch": 19.9, "learning_rate": 2.7922453703703704e-05, "loss": 0.175, "step": 1910 }, { "epoch": 20.0, "learning_rate": 2.777777777777778e-05, "loss": 0.1984, "step": 1920 }, { "epoch": 20.0, "eval_accuracy": 0.6145454545454545, "eval_loss": 2.308309555053711, "eval_runtime": 3.9255, "eval_samples_per_second": 140.111, "eval_steps_per_second": 8.916, "step": 1920 }, { "epoch": 20.1, "learning_rate": 2.7633101851851857e-05, "loss": 0.1598, "step": 1930 }, { "epoch": 20.21, "learning_rate": 2.7488425925925926e-05, "loss": 0.1761, "step": 1940 }, { "epoch": 20.31, "learning_rate": 2.734375e-05, "loss": 0.1647, "step": 1950 }, { "epoch": 20.42, "learning_rate": 2.7199074074074076e-05, "loss": 0.1553, "step": 1960 }, { "epoch": 20.52, "learning_rate": 2.7054398148148147e-05, "loss": 0.1591, "step": 1970 }, { "epoch": 20.62, "learning_rate": 2.6909722222222222e-05, "loss": 0.1736, "step": 1980 }, { "epoch": 20.73, "learning_rate": 2.6765046296296297e-05, "loss": 0.1892, "step": 1990 }, { "epoch": 20.83, "learning_rate": 2.6620370370370372e-05, "loss": 0.1596, "step": 2000 }, { "epoch": 20.94, "learning_rate": 2.6475694444444443e-05, "loss": 0.1788, "step": 2010 }, { "epoch": 21.0, "eval_accuracy": 0.52, "eval_loss": 3.7701735496520996, "eval_runtime": 3.9464, "eval_samples_per_second": 139.366, "eval_steps_per_second": 8.869, "step": 2016 }, { "epoch": 21.04, "learning_rate": 2.633101851851852e-05, "loss": 0.1783, "step": 2020 }, { "epoch": 21.15, "learning_rate": 2.6186342592592593e-05, "loss": 0.1855, "step": 2030 }, { "epoch": 21.25, "learning_rate": 2.604166666666667e-05, "loss": 0.2003, "step": 2040 }, { "epoch": 21.35, "learning_rate": 2.589699074074074e-05, "loss": 0.1726, "step": 2050 }, { "epoch": 21.46, "learning_rate": 2.5752314814814815e-05, "loss": 0.153, "step": 2060 }, { "epoch": 21.56, "learning_rate": 2.560763888888889e-05, "loss": 0.1474, "step": 2070 }, { "epoch": 21.67, "learning_rate": 2.5462962962962965e-05, "loss": 0.1354, "step": 2080 }, { "epoch": 21.77, "learning_rate": 2.5318287037037036e-05, "loss": 0.1553, "step": 2090 }, { "epoch": 21.88, "learning_rate": 2.517361111111111e-05, "loss": 0.1832, "step": 2100 }, { "epoch": 21.98, "learning_rate": 2.5028935185185186e-05, "loss": 0.1907, "step": 2110 }, { "epoch": 22.0, "eval_accuracy": 0.6654545454545454, "eval_loss": 1.9616814851760864, "eval_runtime": 3.8632, "eval_samples_per_second": 142.368, "eval_steps_per_second": 9.06, "step": 2112 }, { "epoch": 22.08, "learning_rate": 2.488425925925926e-05, "loss": 0.159, "step": 2120 }, { "epoch": 22.19, "learning_rate": 2.4739583333333336e-05, "loss": 0.1397, "step": 2130 }, { "epoch": 22.29, "learning_rate": 2.4594907407407408e-05, "loss": 0.1511, "step": 2140 }, { "epoch": 22.4, "learning_rate": 2.4450231481481483e-05, "loss": 0.1704, "step": 2150 }, { "epoch": 22.5, "learning_rate": 2.4305555555555558e-05, "loss": 0.1475, "step": 2160 }, { "epoch": 22.6, "learning_rate": 2.4160879629629633e-05, "loss": 0.1568, "step": 2170 }, { "epoch": 22.71, "learning_rate": 2.4016203703703704e-05, "loss": 0.1394, "step": 2180 }, { "epoch": 22.81, "learning_rate": 2.387152777777778e-05, "loss": 0.1274, "step": 2190 }, { "epoch": 22.92, "learning_rate": 2.3726851851851854e-05, "loss": 0.1113, "step": 2200 }, { "epoch": 23.0, "eval_accuracy": 0.5963636363636363, "eval_loss": 2.654604434967041, "eval_runtime": 3.8399, "eval_samples_per_second": 143.233, "eval_steps_per_second": 9.115, "step": 2208 }, { "epoch": 23.02, "learning_rate": 2.358217592592593e-05, "loss": 0.1994, "step": 2210 }, { "epoch": 23.12, "learning_rate": 2.34375e-05, "loss": 0.1513, "step": 2220 }, { "epoch": 23.23, "learning_rate": 2.3292824074074075e-05, "loss": 0.1596, "step": 2230 }, { "epoch": 23.33, "learning_rate": 2.314814814814815e-05, "loss": 0.1276, "step": 2240 }, { "epoch": 23.44, "learning_rate": 2.3003472222222222e-05, "loss": 0.1552, "step": 2250 }, { "epoch": 23.54, "learning_rate": 2.2858796296296297e-05, "loss": 0.1419, "step": 2260 }, { "epoch": 23.65, "learning_rate": 2.2714120370370372e-05, "loss": 0.1798, "step": 2270 }, { "epoch": 23.75, "learning_rate": 2.2569444444444447e-05, "loss": 0.1832, "step": 2280 }, { "epoch": 23.85, "learning_rate": 2.242476851851852e-05, "loss": 0.1656, "step": 2290 }, { "epoch": 23.96, "learning_rate": 2.2280092592592593e-05, "loss": 0.1293, "step": 2300 }, { "epoch": 24.0, "eval_accuracy": 0.6036363636363636, "eval_loss": 2.6427228450775146, "eval_runtime": 3.9185, "eval_samples_per_second": 140.361, "eval_steps_per_second": 8.932, "step": 2304 }, { "epoch": 24.06, "learning_rate": 2.2135416666666668e-05, "loss": 0.1278, "step": 2310 }, { "epoch": 24.17, "learning_rate": 2.1990740740740743e-05, "loss": 0.1154, "step": 2320 }, { "epoch": 24.27, "learning_rate": 2.1846064814814815e-05, "loss": 0.1542, "step": 2330 }, { "epoch": 24.38, "learning_rate": 2.170138888888889e-05, "loss": 0.2238, "step": 2340 }, { "epoch": 24.48, "learning_rate": 2.1556712962962965e-05, "loss": 0.16, "step": 2350 }, { "epoch": 24.58, "learning_rate": 2.141203703703704e-05, "loss": 0.1713, "step": 2360 }, { "epoch": 24.69, "learning_rate": 2.126736111111111e-05, "loss": 0.1392, "step": 2370 }, { "epoch": 24.79, "learning_rate": 2.1122685185185186e-05, "loss": 0.1586, "step": 2380 }, { "epoch": 24.9, "learning_rate": 2.097800925925926e-05, "loss": 0.0984, "step": 2390 }, { "epoch": 25.0, "learning_rate": 2.0833333333333336e-05, "loss": 0.1354, "step": 2400 }, { "epoch": 25.0, "eval_accuracy": 0.5527272727272727, "eval_loss": 3.410457134246826, "eval_runtime": 3.9002, "eval_samples_per_second": 141.019, "eval_steps_per_second": 8.974, "step": 2400 }, { "epoch": 25.1, "learning_rate": 2.0688657407407408e-05, "loss": 0.1183, "step": 2410 }, { "epoch": 25.21, "learning_rate": 2.0543981481481483e-05, "loss": 0.1221, "step": 2420 }, { "epoch": 25.31, "learning_rate": 2.0399305555555557e-05, "loss": 0.1588, "step": 2430 }, { "epoch": 25.42, "learning_rate": 2.0254629629629632e-05, "loss": 0.1581, "step": 2440 }, { "epoch": 25.52, "learning_rate": 2.0109953703703704e-05, "loss": 0.1372, "step": 2450 }, { "epoch": 25.62, "learning_rate": 1.996527777777778e-05, "loss": 0.1165, "step": 2460 }, { "epoch": 25.73, "learning_rate": 1.9820601851851854e-05, "loss": 0.124, "step": 2470 }, { "epoch": 25.83, "learning_rate": 1.967592592592593e-05, "loss": 0.1127, "step": 2480 }, { "epoch": 25.94, "learning_rate": 1.953125e-05, "loss": 0.1447, "step": 2490 }, { "epoch": 26.0, "eval_accuracy": 0.6127272727272727, "eval_loss": 2.545973062515259, "eval_runtime": 4.0556, "eval_samples_per_second": 135.614, "eval_steps_per_second": 8.63, "step": 2496 }, { "epoch": 26.04, "learning_rate": 1.9386574074074075e-05, "loss": 0.1788, "step": 2500 }, { "epoch": 26.15, "learning_rate": 1.924189814814815e-05, "loss": 0.1482, "step": 2510 }, { "epoch": 26.25, "learning_rate": 1.9097222222222222e-05, "loss": 0.126, "step": 2520 }, { "epoch": 26.35, "learning_rate": 1.8952546296296297e-05, "loss": 0.1427, "step": 2530 }, { "epoch": 26.46, "learning_rate": 1.8807870370370372e-05, "loss": 0.1603, "step": 2540 }, { "epoch": 26.56, "learning_rate": 1.8663194444444447e-05, "loss": 0.1391, "step": 2550 }, { "epoch": 26.67, "learning_rate": 1.8518518518518518e-05, "loss": 0.1036, "step": 2560 }, { "epoch": 26.77, "learning_rate": 1.8373842592592593e-05, "loss": 0.1336, "step": 2570 }, { "epoch": 26.88, "learning_rate": 1.8229166666666668e-05, "loss": 0.1348, "step": 2580 }, { "epoch": 26.98, "learning_rate": 1.8084490740740743e-05, "loss": 0.0995, "step": 2590 }, { "epoch": 27.0, "eval_accuracy": 0.5854545454545454, "eval_loss": 2.9865081310272217, "eval_runtime": 3.9958, "eval_samples_per_second": 137.646, "eval_steps_per_second": 8.759, "step": 2592 }, { "epoch": 27.08, "learning_rate": 1.7939814814814815e-05, "loss": 0.1093, "step": 2600 }, { "epoch": 27.19, "learning_rate": 1.779513888888889e-05, "loss": 0.1454, "step": 2610 }, { "epoch": 27.29, "learning_rate": 1.7650462962962965e-05, "loss": 0.1408, "step": 2620 }, { "epoch": 27.4, "learning_rate": 1.750578703703704e-05, "loss": 0.1415, "step": 2630 }, { "epoch": 27.5, "learning_rate": 1.736111111111111e-05, "loss": 0.1032, "step": 2640 }, { "epoch": 27.6, "learning_rate": 1.7216435185185186e-05, "loss": 0.1551, "step": 2650 }, { "epoch": 27.71, "learning_rate": 1.707175925925926e-05, "loss": 0.084, "step": 2660 }, { "epoch": 27.81, "learning_rate": 1.6927083333333336e-05, "loss": 0.1088, "step": 2670 }, { "epoch": 27.92, "learning_rate": 1.6782407407407408e-05, "loss": 0.1369, "step": 2680 }, { "epoch": 28.0, "eval_accuracy": 0.5545454545454546, "eval_loss": 3.5280685424804688, "eval_runtime": 3.88, "eval_samples_per_second": 141.752, "eval_steps_per_second": 9.021, "step": 2688 }, { "epoch": 28.02, "learning_rate": 1.6637731481481482e-05, "loss": 0.14, "step": 2690 }, { "epoch": 28.12, "learning_rate": 1.6493055555555557e-05, "loss": 0.1198, "step": 2700 }, { "epoch": 28.23, "learning_rate": 1.6348379629629632e-05, "loss": 0.1132, "step": 2710 }, { "epoch": 28.33, "learning_rate": 1.6203703703703704e-05, "loss": 0.1526, "step": 2720 }, { "epoch": 28.44, "learning_rate": 1.605902777777778e-05, "loss": 0.1114, "step": 2730 }, { "epoch": 28.54, "learning_rate": 1.5914351851851854e-05, "loss": 0.1157, "step": 2740 }, { "epoch": 28.65, "learning_rate": 1.576967592592593e-05, "loss": 0.1322, "step": 2750 }, { "epoch": 28.75, "learning_rate": 1.5625e-05, "loss": 0.0794, "step": 2760 }, { "epoch": 28.85, "learning_rate": 1.5480324074074072e-05, "loss": 0.1354, "step": 2770 }, { "epoch": 28.96, "learning_rate": 1.533564814814815e-05, "loss": 0.1238, "step": 2780 }, { "epoch": 29.0, "eval_accuracy": 0.6018181818181818, "eval_loss": 2.816072702407837, "eval_runtime": 3.8975, "eval_samples_per_second": 141.115, "eval_steps_per_second": 8.98, "step": 2784 }, { "epoch": 29.06, "learning_rate": 1.5190972222222222e-05, "loss": 0.1339, "step": 2790 }, { "epoch": 29.17, "learning_rate": 1.5046296296296297e-05, "loss": 0.098, "step": 2800 }, { "epoch": 29.27, "learning_rate": 1.490162037037037e-05, "loss": 0.1411, "step": 2810 }, { "epoch": 29.38, "learning_rate": 1.4756944444444445e-05, "loss": 0.0686, "step": 2820 }, { "epoch": 29.48, "learning_rate": 1.4612268518518518e-05, "loss": 0.1029, "step": 2830 }, { "epoch": 29.58, "learning_rate": 1.4467592592592593e-05, "loss": 0.1361, "step": 2840 }, { "epoch": 29.69, "learning_rate": 1.4322916666666666e-05, "loss": 0.0927, "step": 2850 }, { "epoch": 29.79, "learning_rate": 1.4178240740740741e-05, "loss": 0.1329, "step": 2860 }, { "epoch": 29.9, "learning_rate": 1.4033564814814815e-05, "loss": 0.1146, "step": 2870 }, { "epoch": 30.0, "learning_rate": 1.388888888888889e-05, "loss": 0.1256, "step": 2880 }, { "epoch": 30.0, "eval_accuracy": 0.5490909090909091, "eval_loss": 3.491748332977295, "eval_runtime": 3.8988, "eval_samples_per_second": 141.071, "eval_steps_per_second": 8.977, "step": 2880 }, { "epoch": 30.1, "learning_rate": 1.3744212962962963e-05, "loss": 0.1046, "step": 2890 }, { "epoch": 30.21, "learning_rate": 1.3599537037037038e-05, "loss": 0.1199, "step": 2900 }, { "epoch": 30.31, "learning_rate": 1.3454861111111111e-05, "loss": 0.1524, "step": 2910 }, { "epoch": 30.42, "learning_rate": 1.3310185185185186e-05, "loss": 0.1064, "step": 2920 }, { "epoch": 30.52, "learning_rate": 1.316550925925926e-05, "loss": 0.1227, "step": 2930 }, { "epoch": 30.62, "learning_rate": 1.3020833333333334e-05, "loss": 0.1043, "step": 2940 }, { "epoch": 30.73, "learning_rate": 1.2876157407407407e-05, "loss": 0.1002, "step": 2950 }, { "epoch": 30.83, "learning_rate": 1.2731481481481482e-05, "loss": 0.1153, "step": 2960 }, { "epoch": 30.94, "learning_rate": 1.2586805555555556e-05, "loss": 0.1064, "step": 2970 }, { "epoch": 31.0, "eval_accuracy": 0.58, "eval_loss": 3.065882682800293, "eval_runtime": 3.8866, "eval_samples_per_second": 141.514, "eval_steps_per_second": 9.005, "step": 2976 }, { "epoch": 31.04, "learning_rate": 1.244212962962963e-05, "loss": 0.1036, "step": 2980 }, { "epoch": 31.15, "learning_rate": 1.2297453703703704e-05, "loss": 0.1149, "step": 2990 }, { "epoch": 31.25, "learning_rate": 1.2152777777777779e-05, "loss": 0.1135, "step": 3000 }, { "epoch": 31.35, "learning_rate": 1.2008101851851852e-05, "loss": 0.0999, "step": 3010 }, { "epoch": 31.46, "learning_rate": 1.1863425925925927e-05, "loss": 0.1174, "step": 3020 }, { "epoch": 31.56, "learning_rate": 1.171875e-05, "loss": 0.1161, "step": 3030 }, { "epoch": 31.67, "learning_rate": 1.1574074074074075e-05, "loss": 0.1123, "step": 3040 }, { "epoch": 31.77, "learning_rate": 1.1429398148148148e-05, "loss": 0.1167, "step": 3050 }, { "epoch": 31.88, "learning_rate": 1.1284722222222223e-05, "loss": 0.0939, "step": 3060 }, { "epoch": 31.98, "learning_rate": 1.1140046296296297e-05, "loss": 0.1333, "step": 3070 }, { "epoch": 32.0, "eval_accuracy": 0.5472727272727272, "eval_loss": 3.5971763134002686, "eval_runtime": 3.9815, "eval_samples_per_second": 138.137, "eval_steps_per_second": 8.791, "step": 3072 }, { "epoch": 32.08, "learning_rate": 1.0995370370370372e-05, "loss": 0.1048, "step": 3080 }, { "epoch": 32.19, "learning_rate": 1.0850694444444445e-05, "loss": 0.118, "step": 3090 }, { "epoch": 32.29, "learning_rate": 1.070601851851852e-05, "loss": 0.1366, "step": 3100 }, { "epoch": 32.4, "learning_rate": 1.0561342592592593e-05, "loss": 0.0787, "step": 3110 }, { "epoch": 32.5, "learning_rate": 1.0416666666666668e-05, "loss": 0.1368, "step": 3120 }, { "epoch": 32.6, "learning_rate": 1.0271990740740741e-05, "loss": 0.1137, "step": 3130 }, { "epoch": 32.71, "learning_rate": 1.0127314814814816e-05, "loss": 0.1161, "step": 3140 }, { "epoch": 32.81, "learning_rate": 9.98263888888889e-06, "loss": 0.1096, "step": 3150 }, { "epoch": 32.92, "learning_rate": 9.837962962962964e-06, "loss": 0.1134, "step": 3160 }, { "epoch": 33.0, "eval_accuracy": 0.54, "eval_loss": 3.611628293991089, "eval_runtime": 3.8603, "eval_samples_per_second": 142.477, "eval_steps_per_second": 9.067, "step": 3168 }, { "epoch": 33.02, "learning_rate": 9.693287037037038e-06, "loss": 0.0905, "step": 3170 }, { "epoch": 33.12, "learning_rate": 9.548611111111111e-06, "loss": 0.1007, "step": 3180 }, { "epoch": 33.23, "learning_rate": 9.403935185185186e-06, "loss": 0.1342, "step": 3190 }, { "epoch": 33.33, "learning_rate": 9.259259259259259e-06, "loss": 0.1072, "step": 3200 }, { "epoch": 33.44, "learning_rate": 9.114583333333334e-06, "loss": 0.0824, "step": 3210 }, { "epoch": 33.54, "learning_rate": 8.969907407407407e-06, "loss": 0.123, "step": 3220 }, { "epoch": 33.65, "learning_rate": 8.825231481481482e-06, "loss": 0.1083, "step": 3230 }, { "epoch": 33.75, "learning_rate": 8.680555555555556e-06, "loss": 0.1073, "step": 3240 }, { "epoch": 33.85, "learning_rate": 8.53587962962963e-06, "loss": 0.0854, "step": 3250 }, { "epoch": 33.96, "learning_rate": 8.391203703703704e-06, "loss": 0.0831, "step": 3260 }, { "epoch": 34.0, "eval_accuracy": 0.5509090909090909, "eval_loss": 3.530831813812256, "eval_runtime": 3.9226, "eval_samples_per_second": 140.214, "eval_steps_per_second": 8.923, "step": 3264 }, { "epoch": 34.06, "learning_rate": 8.246527777777779e-06, "loss": 0.1373, "step": 3270 }, { "epoch": 34.17, "learning_rate": 8.101851851851852e-06, "loss": 0.0997, "step": 3280 }, { "epoch": 34.27, "learning_rate": 7.957175925925927e-06, "loss": 0.0959, "step": 3290 }, { "epoch": 34.38, "learning_rate": 7.8125e-06, "loss": 0.0999, "step": 3300 }, { "epoch": 34.48, "learning_rate": 7.667824074074075e-06, "loss": 0.0585, "step": 3310 }, { "epoch": 34.58, "learning_rate": 7.523148148148148e-06, "loss": 0.0815, "step": 3320 }, { "epoch": 34.69, "learning_rate": 7.3784722222222225e-06, "loss": 0.073, "step": 3330 }, { "epoch": 34.79, "learning_rate": 7.2337962962962966e-06, "loss": 0.1152, "step": 3340 }, { "epoch": 34.9, "learning_rate": 7.089120370370371e-06, "loss": 0.0926, "step": 3350 }, { "epoch": 35.0, "learning_rate": 6.944444444444445e-06, "loss": 0.1035, "step": 3360 }, { "epoch": 35.0, "eval_accuracy": 0.5581818181818182, "eval_loss": 3.4789416790008545, "eval_runtime": 3.9255, "eval_samples_per_second": 140.108, "eval_steps_per_second": 8.916, "step": 3360 }, { "epoch": 35.1, "learning_rate": 6.799768518518519e-06, "loss": 0.1094, "step": 3370 }, { "epoch": 35.21, "learning_rate": 6.655092592592593e-06, "loss": 0.1014, "step": 3380 }, { "epoch": 35.31, "learning_rate": 6.510416666666667e-06, "loss": 0.0975, "step": 3390 }, { "epoch": 35.42, "learning_rate": 6.365740740740741e-06, "loss": 0.0769, "step": 3400 }, { "epoch": 35.52, "learning_rate": 6.221064814814815e-06, "loss": 0.0742, "step": 3410 }, { "epoch": 35.62, "learning_rate": 6.076388888888889e-06, "loss": 0.0994, "step": 3420 }, { "epoch": 35.73, "learning_rate": 5.9317129629629635e-06, "loss": 0.0755, "step": 3430 }, { "epoch": 35.83, "learning_rate": 5.787037037037038e-06, "loss": 0.0833, "step": 3440 }, { "epoch": 35.94, "learning_rate": 5.642361111111112e-06, "loss": 0.0957, "step": 3450 }, { "epoch": 36.0, "eval_accuracy": 0.5509090909090909, "eval_loss": 3.6358087062835693, "eval_runtime": 3.9255, "eval_samples_per_second": 140.108, "eval_steps_per_second": 8.916, "step": 3456 }, { "epoch": 36.04, "learning_rate": 5.497685185185186e-06, "loss": 0.1083, "step": 3460 }, { "epoch": 36.15, "learning_rate": 5.35300925925926e-06, "loss": 0.0927, "step": 3470 }, { "epoch": 36.25, "learning_rate": 5.208333333333334e-06, "loss": 0.0876, "step": 3480 }, { "epoch": 36.35, "learning_rate": 5.063657407407408e-06, "loss": 0.0921, "step": 3490 }, { "epoch": 36.46, "learning_rate": 4.918981481481482e-06, "loss": 0.0909, "step": 3500 }, { "epoch": 36.56, "learning_rate": 4.7743055555555555e-06, "loss": 0.0927, "step": 3510 }, { "epoch": 36.67, "learning_rate": 4.6296296296296296e-06, "loss": 0.1031, "step": 3520 }, { "epoch": 36.77, "learning_rate": 4.484953703703704e-06, "loss": 0.1017, "step": 3530 }, { "epoch": 36.88, "learning_rate": 4.340277777777778e-06, "loss": 0.064, "step": 3540 }, { "epoch": 36.98, "learning_rate": 4.195601851851852e-06, "loss": 0.0764, "step": 3550 }, { "epoch": 37.0, "eval_accuracy": 0.5709090909090909, "eval_loss": 3.363851547241211, "eval_runtime": 3.9482, "eval_samples_per_second": 139.304, "eval_steps_per_second": 8.865, "step": 3552 }, { "epoch": 37.08, "learning_rate": 4.050925925925926e-06, "loss": 0.1104, "step": 3560 }, { "epoch": 37.19, "learning_rate": 3.90625e-06, "loss": 0.0785, "step": 3570 }, { "epoch": 37.29, "learning_rate": 3.761574074074074e-06, "loss": 0.0777, "step": 3580 }, { "epoch": 37.4, "learning_rate": 3.6168981481481483e-06, "loss": 0.0696, "step": 3590 }, { "epoch": 37.5, "learning_rate": 3.4722222222222224e-06, "loss": 0.0789, "step": 3600 }, { "epoch": 37.6, "learning_rate": 3.3275462962962965e-06, "loss": 0.0921, "step": 3610 }, { "epoch": 37.71, "learning_rate": 3.1828703703703706e-06, "loss": 0.0932, "step": 3620 }, { "epoch": 37.81, "learning_rate": 3.0381944444444447e-06, "loss": 0.0614, "step": 3630 }, { "epoch": 37.92, "learning_rate": 2.893518518518519e-06, "loss": 0.072, "step": 3640 }, { "epoch": 38.0, "eval_accuracy": 0.5563636363636364, "eval_loss": 3.5639150142669678, "eval_runtime": 3.9387, "eval_samples_per_second": 139.642, "eval_steps_per_second": 8.886, "step": 3648 }, { "epoch": 38.02, "learning_rate": 2.748842592592593e-06, "loss": 0.1215, "step": 3650 }, { "epoch": 38.12, "learning_rate": 2.604166666666667e-06, "loss": 0.0942, "step": 3660 }, { "epoch": 38.23, "learning_rate": 2.459490740740741e-06, "loss": 0.0525, "step": 3670 }, { "epoch": 38.33, "learning_rate": 2.3148148148148148e-06, "loss": 0.0871, "step": 3680 }, { "epoch": 38.44, "learning_rate": 2.170138888888889e-06, "loss": 0.0973, "step": 3690 }, { "epoch": 38.54, "learning_rate": 2.025462962962963e-06, "loss": 0.1067, "step": 3700 }, { "epoch": 38.65, "learning_rate": 1.880787037037037e-06, "loss": 0.1219, "step": 3710 }, { "epoch": 38.75, "learning_rate": 1.7361111111111112e-06, "loss": 0.0806, "step": 3720 }, { "epoch": 38.85, "learning_rate": 1.5914351851851853e-06, "loss": 0.0646, "step": 3730 }, { "epoch": 38.96, "learning_rate": 1.4467592592592594e-06, "loss": 0.0727, "step": 3740 }, { "epoch": 39.0, "eval_accuracy": 0.5581818181818182, "eval_loss": 3.519263744354248, "eval_runtime": 3.9518, "eval_samples_per_second": 139.177, "eval_steps_per_second": 8.857, "step": 3744 }, { "epoch": 39.06, "learning_rate": 1.3020833333333335e-06, "loss": 0.1025, "step": 3750 }, { "epoch": 39.17, "learning_rate": 1.1574074074074074e-06, "loss": 0.1064, "step": 3760 }, { "epoch": 39.27, "learning_rate": 1.0127314814814815e-06, "loss": 0.0921, "step": 3770 }, { "epoch": 39.38, "learning_rate": 8.680555555555556e-07, "loss": 0.0786, "step": 3780 }, { "epoch": 39.48, "learning_rate": 7.233796296296297e-07, "loss": 0.0767, "step": 3790 }, { "epoch": 39.58, "learning_rate": 5.787037037037037e-07, "loss": 0.0661, "step": 3800 }, { "epoch": 39.69, "learning_rate": 4.340277777777778e-07, "loss": 0.0873, "step": 3810 }, { "epoch": 39.79, "learning_rate": 2.8935185185185185e-07, "loss": 0.0692, "step": 3820 }, { "epoch": 39.9, "learning_rate": 1.4467592592592592e-07, "loss": 0.1063, "step": 3830 }, { "epoch": 40.0, "learning_rate": 0.0, "loss": 0.0619, "step": 3840 }, { "epoch": 40.0, "eval_accuracy": 0.5581818181818182, "eval_loss": 3.5836477279663086, "eval_runtime": 3.9696, "eval_samples_per_second": 138.553, "eval_steps_per_second": 8.817, "step": 3840 }, { "epoch": 40.0, "step": 3840, "total_flos": 1.9002720805055447e+19, "train_loss": 0.31003439707371094, "train_runtime": 4246.6951, "train_samples_per_second": 57.767, "train_steps_per_second": 0.904 } ], "logging_steps": 10, "max_steps": 3840, "num_input_tokens_seen": 0, "num_train_epochs": 40, "save_steps": 500, "total_flos": 1.9002720805055447e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }