{ "best_metric": 0.6874601244926453, "best_model_checkpoint": "./beans_outputs/checkpoint-2600", "epoch": 20.0, "eval_steps": 500, "global_step": 2600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07692307692307693, "grad_norm": 2.136049747467041, "learning_rate": 1.98974358974359e-05, "loss": 1.1239, "step": 10 }, { "epoch": 0.15384615384615385, "grad_norm": 1.8187670707702637, "learning_rate": 1.9794871794871798e-05, "loss": 1.1221, "step": 20 }, { "epoch": 0.23076923076923078, "grad_norm": 2.0219993591308594, "learning_rate": 1.9692307692307696e-05, "loss": 1.1164, "step": 30 }, { "epoch": 0.3076923076923077, "grad_norm": 2.4619803428649902, "learning_rate": 1.958974358974359e-05, "loss": 1.1044, "step": 40 }, { "epoch": 0.38461538461538464, "grad_norm": 1.6733014583587646, "learning_rate": 1.9487179487179488e-05, "loss": 1.1082, "step": 50 }, { "epoch": 0.46153846153846156, "grad_norm": 1.4969494342803955, "learning_rate": 1.9384615384615386e-05, "loss": 1.1043, "step": 60 }, { "epoch": 0.5384615384615384, "grad_norm": 2.6347556114196777, "learning_rate": 1.9282051282051284e-05, "loss": 1.1028, "step": 70 }, { "epoch": 0.6153846153846154, "grad_norm": 2.5843420028686523, "learning_rate": 1.9179487179487182e-05, "loss": 1.0908, "step": 80 }, { "epoch": 0.6923076923076923, "grad_norm": 2.0522916316986084, "learning_rate": 1.907692307692308e-05, "loss": 1.094, "step": 90 }, { "epoch": 0.7692307692307693, "grad_norm": 2.4885082244873047, "learning_rate": 1.8974358974358975e-05, "loss": 1.0912, "step": 100 }, { "epoch": 0.8461538461538461, "grad_norm": 1.7014166116714478, "learning_rate": 1.8871794871794873e-05, "loss": 1.0949, "step": 110 }, { "epoch": 0.9230769230769231, "grad_norm": 2.294283866882324, "learning_rate": 1.876923076923077e-05, "loss": 1.0992, "step": 120 }, { "epoch": 1.0, "grad_norm": 4.129885673522949, "learning_rate": 1.866666666666667e-05, "loss": 1.0864, "step": 130 }, { "epoch": 1.0, "eval_accuracy": 0.42857142857142855, "eval_loss": 1.0877832174301147, "eval_runtime": 0.7833, "eval_samples_per_second": 169.792, "eval_steps_per_second": 21.703, "step": 130 }, { "epoch": 1.0769230769230769, "grad_norm": 2.331717014312744, "learning_rate": 1.8564102564102567e-05, "loss": 1.0774, "step": 140 }, { "epoch": 1.1538461538461537, "grad_norm": 2.5262138843536377, "learning_rate": 1.8461538461538465e-05, "loss": 1.0719, "step": 150 }, { "epoch": 1.2307692307692308, "grad_norm": 1.5971320867538452, "learning_rate": 1.835897435897436e-05, "loss": 1.0781, "step": 160 }, { "epoch": 1.3076923076923077, "grad_norm": 2.383288860321045, "learning_rate": 1.8256410256410257e-05, "loss": 1.0929, "step": 170 }, { "epoch": 1.3846153846153846, "grad_norm": 2.169706106185913, "learning_rate": 1.8153846153846155e-05, "loss": 1.0805, "step": 180 }, { "epoch": 1.4615384615384617, "grad_norm": 2.1174418926239014, "learning_rate": 1.8051282051282053e-05, "loss": 1.08, "step": 190 }, { "epoch": 1.5384615384615383, "grad_norm": 1.7236179113388062, "learning_rate": 1.794871794871795e-05, "loss": 1.0766, "step": 200 }, { "epoch": 1.6153846153846154, "grad_norm": 1.7772722244262695, "learning_rate": 1.784615384615385e-05, "loss": 1.0676, "step": 210 }, { "epoch": 1.6923076923076923, "grad_norm": 3.53834867477417, "learning_rate": 1.7743589743589744e-05, "loss": 1.0695, "step": 220 }, { "epoch": 1.7692307692307692, "grad_norm": 2.0417070388793945, "learning_rate": 1.7641025641025642e-05, "loss": 1.0706, "step": 230 }, { "epoch": 1.8461538461538463, "grad_norm": 1.9734611511230469, "learning_rate": 1.753846153846154e-05, "loss": 1.0863, "step": 240 }, { "epoch": 1.9230769230769231, "grad_norm": 1.9997600317001343, "learning_rate": 1.7435897435897438e-05, "loss": 1.068, "step": 250 }, { "epoch": 2.0, "grad_norm": 3.0024373531341553, "learning_rate": 1.7333333333333336e-05, "loss": 1.0629, "step": 260 }, { "epoch": 2.0, "eval_accuracy": 0.5488721804511278, "eval_loss": 1.0593525171279907, "eval_runtime": 0.7442, "eval_samples_per_second": 178.706, "eval_steps_per_second": 22.842, "step": 260 }, { "epoch": 2.076923076923077, "grad_norm": 1.977807641029358, "learning_rate": 1.7230769230769234e-05, "loss": 1.0711, "step": 270 }, { "epoch": 2.1538461538461537, "grad_norm": 2.3906102180480957, "learning_rate": 1.7128205128205128e-05, "loss": 1.0597, "step": 280 }, { "epoch": 2.230769230769231, "grad_norm": 2.3670897483825684, "learning_rate": 1.7025641025641026e-05, "loss": 1.0576, "step": 290 }, { "epoch": 2.3076923076923075, "grad_norm": 3.026155948638916, "learning_rate": 1.6923076923076924e-05, "loss": 1.0434, "step": 300 }, { "epoch": 2.3846153846153846, "grad_norm": 1.9082350730895996, "learning_rate": 1.6820512820512822e-05, "loss": 1.0566, "step": 310 }, { "epoch": 2.4615384615384617, "grad_norm": 1.9187153577804565, "learning_rate": 1.671794871794872e-05, "loss": 1.0476, "step": 320 }, { "epoch": 2.5384615384615383, "grad_norm": 1.4435549974441528, "learning_rate": 1.6615384615384618e-05, "loss": 1.032, "step": 330 }, { "epoch": 2.6153846153846154, "grad_norm": 2.1457245349884033, "learning_rate": 1.6512820512820513e-05, "loss": 1.0475, "step": 340 }, { "epoch": 2.6923076923076925, "grad_norm": 1.9391709566116333, "learning_rate": 1.641025641025641e-05, "loss": 1.0486, "step": 350 }, { "epoch": 2.769230769230769, "grad_norm": 1.8148127794265747, "learning_rate": 1.630769230769231e-05, "loss": 1.0407, "step": 360 }, { "epoch": 2.8461538461538463, "grad_norm": 2.444157123565674, "learning_rate": 1.6205128205128207e-05, "loss": 1.0356, "step": 370 }, { "epoch": 2.9230769230769234, "grad_norm": 1.9061695337295532, "learning_rate": 1.6102564102564105e-05, "loss": 1.0239, "step": 380 }, { "epoch": 3.0, "grad_norm": 4.859686851501465, "learning_rate": 1.6000000000000003e-05, "loss": 1.0434, "step": 390 }, { "epoch": 3.0, "eval_accuracy": 0.6766917293233082, "eval_loss": 1.0230107307434082, "eval_runtime": 0.7471, "eval_samples_per_second": 178.027, "eval_steps_per_second": 22.755, "step": 390 }, { "epoch": 3.076923076923077, "grad_norm": 2.2021689414978027, "learning_rate": 1.5897435897435897e-05, "loss": 1.0424, "step": 400 }, { "epoch": 3.1538461538461537, "grad_norm": 1.8670283555984497, "learning_rate": 1.5794871794871795e-05, "loss": 1.0299, "step": 410 }, { "epoch": 3.230769230769231, "grad_norm": 2.193986654281616, "learning_rate": 1.5692307692307693e-05, "loss": 1.0369, "step": 420 }, { "epoch": 3.3076923076923075, "grad_norm": 2.26470685005188, "learning_rate": 1.558974358974359e-05, "loss": 1.0159, "step": 430 }, { "epoch": 3.3846153846153846, "grad_norm": 2.18507981300354, "learning_rate": 1.548717948717949e-05, "loss": 1.0282, "step": 440 }, { "epoch": 3.4615384615384617, "grad_norm": 1.8047341108322144, "learning_rate": 1.5384615384615387e-05, "loss": 1.0381, "step": 450 }, { "epoch": 3.5384615384615383, "grad_norm": 2.6463704109191895, "learning_rate": 1.5282051282051282e-05, "loss": 1.0322, "step": 460 }, { "epoch": 3.6153846153846154, "grad_norm": 1.6456215381622314, "learning_rate": 1.517948717948718e-05, "loss": 1.0049, "step": 470 }, { "epoch": 3.6923076923076925, "grad_norm": 2.774256706237793, "learning_rate": 1.5076923076923078e-05, "loss": 1.0091, "step": 480 }, { "epoch": 3.769230769230769, "grad_norm": 1.572251319885254, "learning_rate": 1.4974358974358976e-05, "loss": 0.998, "step": 490 }, { "epoch": 3.8461538461538463, "grad_norm": 1.5640805959701538, "learning_rate": 1.4871794871794874e-05, "loss": 1.0222, "step": 500 }, { "epoch": 3.9230769230769234, "grad_norm": 1.9231537580490112, "learning_rate": 1.4769230769230772e-05, "loss": 0.9979, "step": 510 }, { "epoch": 4.0, "grad_norm": 5.481942176818848, "learning_rate": 1.4666666666666666e-05, "loss": 1.0214, "step": 520 }, { "epoch": 4.0, "eval_accuracy": 0.6766917293233082, "eval_loss": 0.9964542388916016, "eval_runtime": 0.7616, "eval_samples_per_second": 174.638, "eval_steps_per_second": 22.322, "step": 520 }, { "epoch": 4.076923076923077, "grad_norm": 2.7514402866363525, "learning_rate": 1.4564102564102564e-05, "loss": 1.0128, "step": 530 }, { "epoch": 4.153846153846154, "grad_norm": 1.8411396741867065, "learning_rate": 1.4461538461538462e-05, "loss": 1.0145, "step": 540 }, { "epoch": 4.230769230769231, "grad_norm": 2.670154571533203, "learning_rate": 1.435897435897436e-05, "loss": 1.0227, "step": 550 }, { "epoch": 4.3076923076923075, "grad_norm": 2.1951498985290527, "learning_rate": 1.4256410256410258e-05, "loss": 1.0321, "step": 560 }, { "epoch": 4.384615384615385, "grad_norm": 1.9692825078964233, "learning_rate": 1.4153846153846156e-05, "loss": 0.9829, "step": 570 }, { "epoch": 4.461538461538462, "grad_norm": 2.611340284347534, "learning_rate": 1.405128205128205e-05, "loss": 0.9918, "step": 580 }, { "epoch": 4.538461538461538, "grad_norm": 2.4288899898529053, "learning_rate": 1.3948717948717949e-05, "loss": 0.9879, "step": 590 }, { "epoch": 4.615384615384615, "grad_norm": 1.7537823915481567, "learning_rate": 1.3846153846153847e-05, "loss": 0.9793, "step": 600 }, { "epoch": 4.6923076923076925, "grad_norm": 3.1588003635406494, "learning_rate": 1.3743589743589745e-05, "loss": 1.0002, "step": 610 }, { "epoch": 4.769230769230769, "grad_norm": 2.2472622394561768, "learning_rate": 1.3641025641025643e-05, "loss": 1.0094, "step": 620 }, { "epoch": 4.846153846153846, "grad_norm": 1.7958937883377075, "learning_rate": 1.353846153846154e-05, "loss": 0.9703, "step": 630 }, { "epoch": 4.923076923076923, "grad_norm": 2.415766477584839, "learning_rate": 1.3435897435897435e-05, "loss": 0.9703, "step": 640 }, { "epoch": 5.0, "grad_norm": 4.948933124542236, "learning_rate": 1.3333333333333333e-05, "loss": 1.0026, "step": 650 }, { "epoch": 5.0, "eval_accuracy": 0.7443609022556391, "eval_loss": 0.9569369554519653, "eval_runtime": 0.7647, "eval_samples_per_second": 173.928, "eval_steps_per_second": 22.231, "step": 650 }, { "epoch": 5.076923076923077, "grad_norm": 2.1397032737731934, "learning_rate": 1.3230769230769231e-05, "loss": 0.9645, "step": 660 }, { "epoch": 5.153846153846154, "grad_norm": 2.7277321815490723, "learning_rate": 1.312820512820513e-05, "loss": 1.0063, "step": 670 }, { "epoch": 5.230769230769231, "grad_norm": 2.391350030899048, "learning_rate": 1.3025641025641027e-05, "loss": 0.9918, "step": 680 }, { "epoch": 5.3076923076923075, "grad_norm": 2.751174211502075, "learning_rate": 1.2923076923076925e-05, "loss": 0.9849, "step": 690 }, { "epoch": 5.384615384615385, "grad_norm": 2.77424693107605, "learning_rate": 1.2820512820512823e-05, "loss": 0.9745, "step": 700 }, { "epoch": 5.461538461538462, "grad_norm": 1.9156702756881714, "learning_rate": 1.2717948717948718e-05, "loss": 0.9684, "step": 710 }, { "epoch": 5.538461538461538, "grad_norm": 1.9521454572677612, "learning_rate": 1.2615384615384616e-05, "loss": 0.9503, "step": 720 }, { "epoch": 5.615384615384615, "grad_norm": 2.468419313430786, "learning_rate": 1.2512820512820514e-05, "loss": 0.9641, "step": 730 }, { "epoch": 5.6923076923076925, "grad_norm": 2.520923614501953, "learning_rate": 1.2410256410256412e-05, "loss": 0.9471, "step": 740 }, { "epoch": 5.769230769230769, "grad_norm": 2.1003758907318115, "learning_rate": 1.230769230769231e-05, "loss": 0.9513, "step": 750 }, { "epoch": 5.846153846153846, "grad_norm": 2.192279100418091, "learning_rate": 1.2205128205128208e-05, "loss": 0.9527, "step": 760 }, { "epoch": 5.923076923076923, "grad_norm": 3.8428618907928467, "learning_rate": 1.2102564102564102e-05, "loss": 0.938, "step": 770 }, { "epoch": 6.0, "grad_norm": 4.9151530265808105, "learning_rate": 1.2e-05, "loss": 0.9753, "step": 780 }, { "epoch": 6.0, "eval_accuracy": 0.7819548872180451, "eval_loss": 0.9288201332092285, "eval_runtime": 0.7499, "eval_samples_per_second": 177.349, "eval_steps_per_second": 22.669, "step": 780 }, { "epoch": 6.076923076923077, "grad_norm": 2.7967398166656494, "learning_rate": 1.1897435897435898e-05, "loss": 0.9428, "step": 790 }, { "epoch": 6.153846153846154, "grad_norm": 2.5342345237731934, "learning_rate": 1.1794871794871796e-05, "loss": 0.9406, "step": 800 }, { "epoch": 6.230769230769231, "grad_norm": 1.877543330192566, "learning_rate": 1.1692307692307694e-05, "loss": 0.9319, "step": 810 }, { "epoch": 6.3076923076923075, "grad_norm": 2.4524621963500977, "learning_rate": 1.1589743589743592e-05, "loss": 0.9332, "step": 820 }, { "epoch": 6.384615384615385, "grad_norm": 2.4967362880706787, "learning_rate": 1.1487179487179487e-05, "loss": 0.9367, "step": 830 }, { "epoch": 6.461538461538462, "grad_norm": 3.2078776359558105, "learning_rate": 1.1384615384615385e-05, "loss": 0.9339, "step": 840 }, { "epoch": 6.538461538461538, "grad_norm": 2.926706075668335, "learning_rate": 1.1282051282051283e-05, "loss": 0.9416, "step": 850 }, { "epoch": 6.615384615384615, "grad_norm": 1.8625017404556274, "learning_rate": 1.117948717948718e-05, "loss": 0.9111, "step": 860 }, { "epoch": 6.6923076923076925, "grad_norm": 2.7141189575195312, "learning_rate": 1.1076923076923079e-05, "loss": 0.9574, "step": 870 }, { "epoch": 6.769230769230769, "grad_norm": 2.307347536087036, "learning_rate": 1.0974358974358977e-05, "loss": 0.9259, "step": 880 }, { "epoch": 6.846153846153846, "grad_norm": 2.3937132358551025, "learning_rate": 1.0871794871794871e-05, "loss": 0.9207, "step": 890 }, { "epoch": 6.923076923076923, "grad_norm": 3.0794668197631836, "learning_rate": 1.076923076923077e-05, "loss": 0.9418, "step": 900 }, { "epoch": 7.0, "grad_norm": 4.111669063568115, "learning_rate": 1.0666666666666667e-05, "loss": 0.9252, "step": 910 }, { "epoch": 7.0, "eval_accuracy": 0.7969924812030075, "eval_loss": 0.8874692916870117, "eval_runtime": 0.7823, "eval_samples_per_second": 170.013, "eval_steps_per_second": 21.731, "step": 910 }, { "epoch": 7.076923076923077, "grad_norm": 2.7561662197113037, "learning_rate": 1.0564102564102565e-05, "loss": 0.911, "step": 920 }, { "epoch": 7.153846153846154, "grad_norm": 3.2020223140716553, "learning_rate": 1.0461538461538463e-05, "loss": 0.912, "step": 930 }, { "epoch": 7.230769230769231, "grad_norm": 3.459304094314575, "learning_rate": 1.0358974358974361e-05, "loss": 0.8994, "step": 940 }, { "epoch": 7.3076923076923075, "grad_norm": 2.774078369140625, "learning_rate": 1.0256410256410256e-05, "loss": 0.9079, "step": 950 }, { "epoch": 7.384615384615385, "grad_norm": 2.7169668674468994, "learning_rate": 1.0153846153846154e-05, "loss": 0.9256, "step": 960 }, { "epoch": 7.461538461538462, "grad_norm": 2.171323299407959, "learning_rate": 1.0051282051282052e-05, "loss": 0.8898, "step": 970 }, { "epoch": 7.538461538461538, "grad_norm": 2.7350351810455322, "learning_rate": 9.94871794871795e-06, "loss": 0.9243, "step": 980 }, { "epoch": 7.615384615384615, "grad_norm": 2.3926539421081543, "learning_rate": 9.846153846153848e-06, "loss": 0.8868, "step": 990 }, { "epoch": 7.6923076923076925, "grad_norm": 2.0602715015411377, "learning_rate": 9.743589743589744e-06, "loss": 0.8837, "step": 1000 }, { "epoch": 7.769230769230769, "grad_norm": 2.885303497314453, "learning_rate": 9.641025641025642e-06, "loss": 0.8827, "step": 1010 }, { "epoch": 7.846153846153846, "grad_norm": 2.261361837387085, "learning_rate": 9.53846153846154e-06, "loss": 0.9047, "step": 1020 }, { "epoch": 7.923076923076923, "grad_norm": 2.6180179119110107, "learning_rate": 9.435897435897436e-06, "loss": 0.861, "step": 1030 }, { "epoch": 8.0, "grad_norm": 4.225304126739502, "learning_rate": 9.333333333333334e-06, "loss": 0.9192, "step": 1040 }, { "epoch": 8.0, "eval_accuracy": 0.8120300751879699, "eval_loss": 0.850643515586853, "eval_runtime": 0.756, "eval_samples_per_second": 175.92, "eval_steps_per_second": 22.486, "step": 1040 }, { "epoch": 8.076923076923077, "grad_norm": 2.1875813007354736, "learning_rate": 9.230769230769232e-06, "loss": 0.8953, "step": 1050 }, { "epoch": 8.153846153846153, "grad_norm": 2.1640567779541016, "learning_rate": 9.128205128205129e-06, "loss": 0.8658, "step": 1060 }, { "epoch": 8.23076923076923, "grad_norm": 2.660614490509033, "learning_rate": 9.025641025641027e-06, "loss": 0.8995, "step": 1070 }, { "epoch": 8.307692307692308, "grad_norm": 2.104029417037964, "learning_rate": 8.923076923076925e-06, "loss": 0.8569, "step": 1080 }, { "epoch": 8.384615384615385, "grad_norm": 2.2643303871154785, "learning_rate": 8.820512820512821e-06, "loss": 0.8972, "step": 1090 }, { "epoch": 8.461538461538462, "grad_norm": 2.632410764694214, "learning_rate": 8.717948717948719e-06, "loss": 0.8715, "step": 1100 }, { "epoch": 8.538461538461538, "grad_norm": 1.6500084400177002, "learning_rate": 8.615384615384617e-06, "loss": 0.8716, "step": 1110 }, { "epoch": 8.615384615384615, "grad_norm": 6.204855442047119, "learning_rate": 8.512820512820513e-06, "loss": 0.8985, "step": 1120 }, { "epoch": 8.692307692307692, "grad_norm": 3.729611873626709, "learning_rate": 8.410256410256411e-06, "loss": 0.8837, "step": 1130 }, { "epoch": 8.76923076923077, "grad_norm": 3.685739278793335, "learning_rate": 8.307692307692309e-06, "loss": 0.8865, "step": 1140 }, { "epoch": 8.846153846153847, "grad_norm": 2.7028560638427734, "learning_rate": 8.205128205128205e-06, "loss": 0.875, "step": 1150 }, { "epoch": 8.923076923076923, "grad_norm": 2.7692482471466064, "learning_rate": 8.102564102564103e-06, "loss": 0.8867, "step": 1160 }, { "epoch": 9.0, "grad_norm": 3.9854462146759033, "learning_rate": 8.000000000000001e-06, "loss": 0.9008, "step": 1170 }, { "epoch": 9.0, "eval_accuracy": 0.8045112781954887, "eval_loss": 0.8337866067886353, "eval_runtime": 0.7963, "eval_samples_per_second": 167.03, "eval_steps_per_second": 21.35, "step": 1170 }, { "epoch": 9.076923076923077, "grad_norm": 1.9381572008132935, "learning_rate": 7.897435897435898e-06, "loss": 0.8969, "step": 1180 }, { "epoch": 9.153846153846153, "grad_norm": 2.219219446182251, "learning_rate": 7.794871794871796e-06, "loss": 0.8412, "step": 1190 }, { "epoch": 9.23076923076923, "grad_norm": 2.1302294731140137, "learning_rate": 7.692307692307694e-06, "loss": 0.8483, "step": 1200 }, { "epoch": 9.307692307692308, "grad_norm": 2.541210174560547, "learning_rate": 7.58974358974359e-06, "loss": 0.8536, "step": 1210 }, { "epoch": 9.384615384615385, "grad_norm": 1.952871322631836, "learning_rate": 7.487179487179488e-06, "loss": 0.8707, "step": 1220 }, { "epoch": 9.461538461538462, "grad_norm": 3.273028612136841, "learning_rate": 7.384615384615386e-06, "loss": 0.8547, "step": 1230 }, { "epoch": 9.538461538461538, "grad_norm": 2.6495628356933594, "learning_rate": 7.282051282051282e-06, "loss": 0.8709, "step": 1240 }, { "epoch": 9.615384615384615, "grad_norm": 1.998024582862854, "learning_rate": 7.17948717948718e-06, "loss": 0.8278, "step": 1250 }, { "epoch": 9.692307692307692, "grad_norm": 2.7621707916259766, "learning_rate": 7.076923076923078e-06, "loss": 0.8544, "step": 1260 }, { "epoch": 9.76923076923077, "grad_norm": 1.844375491142273, "learning_rate": 6.974358974358974e-06, "loss": 0.8324, "step": 1270 }, { "epoch": 9.846153846153847, "grad_norm": 2.149479866027832, "learning_rate": 6.871794871794872e-06, "loss": 0.8146, "step": 1280 }, { "epoch": 9.923076923076923, "grad_norm": 2.2224795818328857, "learning_rate": 6.76923076923077e-06, "loss": 0.8367, "step": 1290 }, { "epoch": 10.0, "grad_norm": 3.8497843742370605, "learning_rate": 6.666666666666667e-06, "loss": 0.8079, "step": 1300 }, { "epoch": 10.0, "eval_accuracy": 0.8421052631578947, "eval_loss": 0.8103837370872498, "eval_runtime": 0.7593, "eval_samples_per_second": 175.164, "eval_steps_per_second": 22.389, "step": 1300 }, { "epoch": 10.076923076923077, "grad_norm": 2.0343823432922363, "learning_rate": 6.564102564102565e-06, "loss": 0.8408, "step": 1310 }, { "epoch": 10.153846153846153, "grad_norm": 2.4245193004608154, "learning_rate": 6.461538461538463e-06, "loss": 0.899, "step": 1320 }, { "epoch": 10.23076923076923, "grad_norm": 2.3912925720214844, "learning_rate": 6.358974358974359e-06, "loss": 0.8758, "step": 1330 }, { "epoch": 10.307692307692308, "grad_norm": 2.1387076377868652, "learning_rate": 6.256410256410257e-06, "loss": 0.8295, "step": 1340 }, { "epoch": 10.384615384615385, "grad_norm": 2.142160415649414, "learning_rate": 6.153846153846155e-06, "loss": 0.8075, "step": 1350 }, { "epoch": 10.461538461538462, "grad_norm": 2.6838831901550293, "learning_rate": 6.051282051282051e-06, "loss": 0.8448, "step": 1360 }, { "epoch": 10.538461538461538, "grad_norm": 2.476369857788086, "learning_rate": 5.948717948717949e-06, "loss": 0.817, "step": 1370 }, { "epoch": 10.615384615384615, "grad_norm": 3.031463861465454, "learning_rate": 5.846153846153847e-06, "loss": 0.8177, "step": 1380 }, { "epoch": 10.692307692307692, "grad_norm": 2.2818636894226074, "learning_rate": 5.743589743589743e-06, "loss": 0.8124, "step": 1390 }, { "epoch": 10.76923076923077, "grad_norm": 3.245805263519287, "learning_rate": 5.641025641025641e-06, "loss": 0.8674, "step": 1400 }, { "epoch": 10.846153846153847, "grad_norm": 2.194627046585083, "learning_rate": 5.538461538461539e-06, "loss": 0.831, "step": 1410 }, { "epoch": 10.923076923076923, "grad_norm": 1.8149436712265015, "learning_rate": 5.435897435897436e-06, "loss": 0.8391, "step": 1420 }, { "epoch": 11.0, "grad_norm": 4.0584821701049805, "learning_rate": 5.333333333333334e-06, "loss": 0.8332, "step": 1430 }, { "epoch": 11.0, "eval_accuracy": 0.8345864661654135, "eval_loss": 0.7806060314178467, "eval_runtime": 0.742, "eval_samples_per_second": 179.256, "eval_steps_per_second": 22.912, "step": 1430 }, { "epoch": 11.076923076923077, "grad_norm": 1.9833248853683472, "learning_rate": 5.230769230769232e-06, "loss": 0.8484, "step": 1440 }, { "epoch": 11.153846153846153, "grad_norm": 5.478232383728027, "learning_rate": 5.128205128205128e-06, "loss": 0.8308, "step": 1450 }, { "epoch": 11.23076923076923, "grad_norm": 2.5792922973632812, "learning_rate": 5.025641025641026e-06, "loss": 0.802, "step": 1460 }, { "epoch": 11.307692307692308, "grad_norm": 2.730989694595337, "learning_rate": 4.923076923076924e-06, "loss": 0.8225, "step": 1470 }, { "epoch": 11.384615384615385, "grad_norm": 2.7447853088378906, "learning_rate": 4.820512820512821e-06, "loss": 0.8176, "step": 1480 }, { "epoch": 11.461538461538462, "grad_norm": 2.6465837955474854, "learning_rate": 4.717948717948718e-06, "loss": 0.8471, "step": 1490 }, { "epoch": 11.538461538461538, "grad_norm": 2.4876015186309814, "learning_rate": 4.615384615384616e-06, "loss": 0.8349, "step": 1500 }, { "epoch": 11.615384615384615, "grad_norm": 3.2605788707733154, "learning_rate": 4.512820512820513e-06, "loss": 0.8285, "step": 1510 }, { "epoch": 11.692307692307692, "grad_norm": 3.278341293334961, "learning_rate": 4.4102564102564104e-06, "loss": 0.8546, "step": 1520 }, { "epoch": 11.76923076923077, "grad_norm": 2.0945637226104736, "learning_rate": 4.307692307692308e-06, "loss": 0.8096, "step": 1530 }, { "epoch": 11.846153846153847, "grad_norm": 2.161726474761963, "learning_rate": 4.2051282051282055e-06, "loss": 0.7938, "step": 1540 }, { "epoch": 11.923076923076923, "grad_norm": 2.1052703857421875, "learning_rate": 4.102564102564103e-06, "loss": 0.8295, "step": 1550 }, { "epoch": 12.0, "grad_norm": 3.460094451904297, "learning_rate": 4.000000000000001e-06, "loss": 0.8103, "step": 1560 }, { "epoch": 12.0, "eval_accuracy": 0.8345864661654135, "eval_loss": 0.7585543990135193, "eval_runtime": 0.7508, "eval_samples_per_second": 177.133, "eval_steps_per_second": 22.641, "step": 1560 }, { "epoch": 12.076923076923077, "grad_norm": 2.943866014480591, "learning_rate": 3.897435897435898e-06, "loss": 0.7903, "step": 1570 }, { "epoch": 12.153846153846153, "grad_norm": 2.6185402870178223, "learning_rate": 3.794871794871795e-06, "loss": 0.8229, "step": 1580 }, { "epoch": 12.23076923076923, "grad_norm": 1.6378310918807983, "learning_rate": 3.692307692307693e-06, "loss": 0.8246, "step": 1590 }, { "epoch": 12.307692307692308, "grad_norm": 2.3109569549560547, "learning_rate": 3.58974358974359e-06, "loss": 0.8363, "step": 1600 }, { "epoch": 12.384615384615385, "grad_norm": 2.3602941036224365, "learning_rate": 3.487179487179487e-06, "loss": 0.8078, "step": 1610 }, { "epoch": 12.461538461538462, "grad_norm": 3.0623390674591064, "learning_rate": 3.384615384615385e-06, "loss": 0.794, "step": 1620 }, { "epoch": 12.538461538461538, "grad_norm": 2.947983741760254, "learning_rate": 3.2820512820512823e-06, "loss": 0.8033, "step": 1630 }, { "epoch": 12.615384615384615, "grad_norm": 1.8083330392837524, "learning_rate": 3.1794871794871795e-06, "loss": 0.8158, "step": 1640 }, { "epoch": 12.692307692307692, "grad_norm": 3.2873637676239014, "learning_rate": 3.0769230769230774e-06, "loss": 0.7651, "step": 1650 }, { "epoch": 12.76923076923077, "grad_norm": 2.3777670860290527, "learning_rate": 2.9743589743589746e-06, "loss": 0.8566, "step": 1660 }, { "epoch": 12.846153846153847, "grad_norm": 1.8692084550857544, "learning_rate": 2.8717948717948717e-06, "loss": 0.8218, "step": 1670 }, { "epoch": 12.923076923076923, "grad_norm": 2.2379138469696045, "learning_rate": 2.7692307692307697e-06, "loss": 0.7984, "step": 1680 }, { "epoch": 13.0, "grad_norm": 4.131476879119873, "learning_rate": 2.666666666666667e-06, "loss": 0.8149, "step": 1690 }, { "epoch": 13.0, "eval_accuracy": 0.8421052631578947, "eval_loss": 0.757113516330719, "eval_runtime": 0.7762, "eval_samples_per_second": 171.337, "eval_steps_per_second": 21.9, "step": 1690 }, { "epoch": 13.076923076923077, "grad_norm": 2.9936656951904297, "learning_rate": 2.564102564102564e-06, "loss": 0.7917, "step": 1700 }, { "epoch": 13.153846153846153, "grad_norm": 2.5392699241638184, "learning_rate": 2.461538461538462e-06, "loss": 0.8241, "step": 1710 }, { "epoch": 13.23076923076923, "grad_norm": 3.0166265964508057, "learning_rate": 2.358974358974359e-06, "loss": 0.8117, "step": 1720 }, { "epoch": 13.307692307692308, "grad_norm": 1.8728867769241333, "learning_rate": 2.2564102564102566e-06, "loss": 0.8155, "step": 1730 }, { "epoch": 13.384615384615385, "grad_norm": 2.50715708732605, "learning_rate": 2.153846153846154e-06, "loss": 0.7814, "step": 1740 }, { "epoch": 13.461538461538462, "grad_norm": 5.447348594665527, "learning_rate": 2.0512820512820513e-06, "loss": 0.8253, "step": 1750 }, { "epoch": 13.538461538461538, "grad_norm": 2.6522035598754883, "learning_rate": 1.948717948717949e-06, "loss": 0.8486, "step": 1760 }, { "epoch": 13.615384615384615, "grad_norm": 2.1300199031829834, "learning_rate": 1.8461538461538465e-06, "loss": 0.8027, "step": 1770 }, { "epoch": 13.692307692307692, "grad_norm": 2.1135923862457275, "learning_rate": 1.7435897435897436e-06, "loss": 0.7852, "step": 1780 }, { "epoch": 13.76923076923077, "grad_norm": 1.871300220489502, "learning_rate": 1.6410256410256412e-06, "loss": 0.8224, "step": 1790 }, { "epoch": 13.846153846153847, "grad_norm": 3.240356206893921, "learning_rate": 1.5384615384615387e-06, "loss": 0.7895, "step": 1800 }, { "epoch": 13.923076923076923, "grad_norm": 2.5182340145111084, "learning_rate": 1.4358974358974359e-06, "loss": 0.7316, "step": 1810 }, { "epoch": 14.0, "grad_norm": 4.281803607940674, "learning_rate": 1.3333333333333334e-06, "loss": 0.8186, "step": 1820 }, { "epoch": 14.0, "eval_accuracy": 0.8270676691729323, "eval_loss": 0.7540305852890015, "eval_runtime": 0.7703, "eval_samples_per_second": 172.654, "eval_steps_per_second": 22.069, "step": 1820 }, { "epoch": 14.076923076923077, "grad_norm": 2.050518751144409, "learning_rate": 1.230769230769231e-06, "loss": 0.8222, "step": 1830 }, { "epoch": 14.153846153846153, "grad_norm": 2.051259994506836, "learning_rate": 1.1282051282051283e-06, "loss": 0.7878, "step": 1840 }, { "epoch": 14.23076923076923, "grad_norm": 2.8861193656921387, "learning_rate": 1.0256410256410257e-06, "loss": 0.78, "step": 1850 }, { "epoch": 14.307692307692308, "grad_norm": 4.159270763397217, "learning_rate": 9.230769230769232e-07, "loss": 0.774, "step": 1860 }, { "epoch": 14.384615384615385, "grad_norm": 2.8624985218048096, "learning_rate": 8.205128205128206e-07, "loss": 0.7882, "step": 1870 }, { "epoch": 14.461538461538462, "grad_norm": 2.5051703453063965, "learning_rate": 7.179487179487179e-07, "loss": 0.7883, "step": 1880 }, { "epoch": 14.538461538461538, "grad_norm": 3.003545045852661, "learning_rate": 6.153846153846155e-07, "loss": 0.7817, "step": 1890 }, { "epoch": 14.615384615384615, "grad_norm": 2.8403878211975098, "learning_rate": 5.128205128205128e-07, "loss": 0.8294, "step": 1900 }, { "epoch": 14.692307692307692, "grad_norm": 2.124030590057373, "learning_rate": 4.102564102564103e-07, "loss": 0.7978, "step": 1910 }, { "epoch": 14.76923076923077, "grad_norm": 4.762181758880615, "learning_rate": 3.0769230769230774e-07, "loss": 0.8038, "step": 1920 }, { "epoch": 14.846153846153847, "grad_norm": 3.256133794784546, "learning_rate": 2.0512820512820514e-07, "loss": 0.8535, "step": 1930 }, { "epoch": 14.923076923076923, "grad_norm": 2.355344772338867, "learning_rate": 1.0256410256410257e-07, "loss": 0.7587, "step": 1940 }, { "epoch": 15.0, "grad_norm": 4.202574729919434, "learning_rate": 0.0, "loss": 0.7929, "step": 1950 }, { "epoch": 15.0, "eval_accuracy": 0.8120300751879699, "eval_loss": 0.7412300109863281, "eval_runtime": 0.8087, "eval_samples_per_second": 164.47, "eval_steps_per_second": 21.022, "step": 1950 }, { "epoch": 15.076923076923077, "grad_norm": 3.3559834957122803, "learning_rate": 4.923076923076924e-06, "loss": 0.7535, "step": 1960 }, { "epoch": 15.153846153846153, "grad_norm": 1.8613739013671875, "learning_rate": 4.8461538461538465e-06, "loss": 0.7581, "step": 1970 }, { "epoch": 15.23076923076923, "grad_norm": 2.3707966804504395, "learning_rate": 4.76923076923077e-06, "loss": 0.7836, "step": 1980 }, { "epoch": 15.307692307692308, "grad_norm": 2.6265199184417725, "learning_rate": 4.692307692307693e-06, "loss": 0.8334, "step": 1990 }, { "epoch": 15.384615384615385, "grad_norm": 2.078848123550415, "learning_rate": 4.615384615384616e-06, "loss": 0.7772, "step": 2000 }, { "epoch": 15.461538461538462, "grad_norm": 2.6433162689208984, "learning_rate": 4.538461538461539e-06, "loss": 0.7955, "step": 2010 }, { "epoch": 15.538461538461538, "grad_norm": 3.458962917327881, "learning_rate": 4.461538461538462e-06, "loss": 0.787, "step": 2020 }, { "epoch": 15.615384615384615, "grad_norm": 5.090147495269775, "learning_rate": 4.384615384615385e-06, "loss": 0.7875, "step": 2030 }, { "epoch": 15.692307692307692, "grad_norm": 1.9066407680511475, "learning_rate": 4.307692307692308e-06, "loss": 0.7764, "step": 2040 }, { "epoch": 15.76923076923077, "grad_norm": 3.097341299057007, "learning_rate": 4.230769230769231e-06, "loss": 0.7335, "step": 2050 }, { "epoch": 15.846153846153847, "grad_norm": 2.7201600074768066, "learning_rate": 4.1538461538461545e-06, "loss": 0.7747, "step": 2060 }, { "epoch": 15.923076923076923, "grad_norm": 2.303032398223877, "learning_rate": 4.076923076923077e-06, "loss": 0.7738, "step": 2070 }, { "epoch": 16.0, "grad_norm": 4.420492172241211, "learning_rate": 4.000000000000001e-06, "loss": 0.774, "step": 2080 }, { "epoch": 16.0, "eval_accuracy": 0.849624060150376, "eval_loss": 0.7370420694351196, "eval_runtime": 0.776, "eval_samples_per_second": 171.384, "eval_steps_per_second": 21.906, "step": 2080 }, { "epoch": 16.076923076923077, "grad_norm": 3.9969003200531006, "learning_rate": 3.923076923076923e-06, "loss": 0.8316, "step": 2090 }, { "epoch": 16.153846153846153, "grad_norm": 2.3731822967529297, "learning_rate": 3.846153846153847e-06, "loss": 0.8162, "step": 2100 }, { "epoch": 16.23076923076923, "grad_norm": 2.232074737548828, "learning_rate": 3.7692307692307694e-06, "loss": 0.8138, "step": 2110 }, { "epoch": 16.307692307692307, "grad_norm": 2.8799118995666504, "learning_rate": 3.692307692307693e-06, "loss": 0.8434, "step": 2120 }, { "epoch": 16.384615384615383, "grad_norm": 2.2093818187713623, "learning_rate": 3.6153846153846156e-06, "loss": 0.7886, "step": 2130 }, { "epoch": 16.46153846153846, "grad_norm": 1.984840750694275, "learning_rate": 3.538461538461539e-06, "loss": 0.7682, "step": 2140 }, { "epoch": 16.53846153846154, "grad_norm": 2.711601495742798, "learning_rate": 3.4615384615384617e-06, "loss": 0.7471, "step": 2150 }, { "epoch": 16.615384615384617, "grad_norm": 2.130311965942383, "learning_rate": 3.384615384615385e-06, "loss": 0.7535, "step": 2160 }, { "epoch": 16.692307692307693, "grad_norm": 2.327207565307617, "learning_rate": 3.307692307692308e-06, "loss": 0.718, "step": 2170 }, { "epoch": 16.76923076923077, "grad_norm": 2.198944091796875, "learning_rate": 3.2307692307692313e-06, "loss": 0.8146, "step": 2180 }, { "epoch": 16.846153846153847, "grad_norm": 2.388453483581543, "learning_rate": 3.153846153846154e-06, "loss": 0.8368, "step": 2190 }, { "epoch": 16.923076923076923, "grad_norm": 2.2575690746307373, "learning_rate": 3.0769230769230774e-06, "loss": 0.749, "step": 2200 }, { "epoch": 17.0, "grad_norm": 6.020498275756836, "learning_rate": 3e-06, "loss": 0.7613, "step": 2210 }, { "epoch": 17.0, "eval_accuracy": 0.849624060150376, "eval_loss": 0.7059224247932434, "eval_runtime": 0.7496, "eval_samples_per_second": 177.439, "eval_steps_per_second": 22.68, "step": 2210 }, { "epoch": 17.076923076923077, "grad_norm": 3.134481430053711, "learning_rate": 2.9230769230769236e-06, "loss": 0.7609, "step": 2220 }, { "epoch": 17.153846153846153, "grad_norm": 2.0070559978485107, "learning_rate": 2.846153846153846e-06, "loss": 0.7483, "step": 2230 }, { "epoch": 17.23076923076923, "grad_norm": 3.491682291030884, "learning_rate": 2.7692307692307697e-06, "loss": 0.7696, "step": 2240 }, { "epoch": 17.307692307692307, "grad_norm": 1.9866397380828857, "learning_rate": 2.6923076923076923e-06, "loss": 0.7609, "step": 2250 }, { "epoch": 17.384615384615383, "grad_norm": 3.458582878112793, "learning_rate": 2.615384615384616e-06, "loss": 0.7813, "step": 2260 }, { "epoch": 17.46153846153846, "grad_norm": 2.1126835346221924, "learning_rate": 2.5384615384615385e-06, "loss": 0.7003, "step": 2270 }, { "epoch": 17.53846153846154, "grad_norm": 3.5276880264282227, "learning_rate": 2.461538461538462e-06, "loss": 0.8305, "step": 2280 }, { "epoch": 17.615384615384617, "grad_norm": 2.3967173099517822, "learning_rate": 2.384615384615385e-06, "loss": 0.7627, "step": 2290 }, { "epoch": 17.692307692307693, "grad_norm": 4.473978042602539, "learning_rate": 2.307692307692308e-06, "loss": 0.7332, "step": 2300 }, { "epoch": 17.76923076923077, "grad_norm": 2.1642568111419678, "learning_rate": 2.230769230769231e-06, "loss": 0.7678, "step": 2310 }, { "epoch": 17.846153846153847, "grad_norm": 3.03192138671875, "learning_rate": 2.153846153846154e-06, "loss": 0.7565, "step": 2320 }, { "epoch": 17.923076923076923, "grad_norm": 2.9610419273376465, "learning_rate": 2.0769230769230773e-06, "loss": 0.7651, "step": 2330 }, { "epoch": 18.0, "grad_norm": 4.160178184509277, "learning_rate": 2.0000000000000003e-06, "loss": 0.7778, "step": 2340 }, { "epoch": 18.0, "eval_accuracy": 0.8270676691729323, "eval_loss": 0.6930322647094727, "eval_runtime": 0.7854, "eval_samples_per_second": 169.332, "eval_steps_per_second": 21.644, "step": 2340 }, { "epoch": 18.076923076923077, "grad_norm": 2.168921947479248, "learning_rate": 1.9230769230769234e-06, "loss": 0.7234, "step": 2350 }, { "epoch": 18.153846153846153, "grad_norm": 3.935608386993408, "learning_rate": 1.8461538461538465e-06, "loss": 0.8192, "step": 2360 }, { "epoch": 18.23076923076923, "grad_norm": 1.8215328454971313, "learning_rate": 1.7692307692307695e-06, "loss": 0.7271, "step": 2370 }, { "epoch": 18.307692307692307, "grad_norm": 2.687016010284424, "learning_rate": 1.6923076923076926e-06, "loss": 0.8063, "step": 2380 }, { "epoch": 18.384615384615383, "grad_norm": 2.3364577293395996, "learning_rate": 1.6153846153846157e-06, "loss": 0.7699, "step": 2390 }, { "epoch": 18.46153846153846, "grad_norm": 2.7465319633483887, "learning_rate": 1.5384615384615387e-06, "loss": 0.8214, "step": 2400 }, { "epoch": 18.53846153846154, "grad_norm": 3.3499436378479004, "learning_rate": 1.4615384615384618e-06, "loss": 0.7432, "step": 2410 }, { "epoch": 18.615384615384617, "grad_norm": 3.7266149520874023, "learning_rate": 1.3846153846153848e-06, "loss": 0.797, "step": 2420 }, { "epoch": 18.692307692307693, "grad_norm": 2.661741256713867, "learning_rate": 1.307692307692308e-06, "loss": 0.7404, "step": 2430 }, { "epoch": 18.76923076923077, "grad_norm": 3.166747808456421, "learning_rate": 1.230769230769231e-06, "loss": 0.8197, "step": 2440 }, { "epoch": 18.846153846153847, "grad_norm": 3.200448989868164, "learning_rate": 1.153846153846154e-06, "loss": 0.8068, "step": 2450 }, { "epoch": 18.923076923076923, "grad_norm": 2.4404191970825195, "learning_rate": 1.076923076923077e-06, "loss": 0.788, "step": 2460 }, { "epoch": 19.0, "grad_norm": 3.8639049530029297, "learning_rate": 1.0000000000000002e-06, "loss": 0.8081, "step": 2470 }, { "epoch": 19.0, "eval_accuracy": 0.8646616541353384, "eval_loss": 0.6890266537666321, "eval_runtime": 0.7797, "eval_samples_per_second": 170.576, "eval_steps_per_second": 21.803, "step": 2470 }, { "epoch": 19.076923076923077, "grad_norm": 1.7245137691497803, "learning_rate": 9.230769230769232e-07, "loss": 0.7929, "step": 2480 }, { "epoch": 19.153846153846153, "grad_norm": 3.7959182262420654, "learning_rate": 8.461538461538463e-07, "loss": 0.7397, "step": 2490 }, { "epoch": 19.23076923076923, "grad_norm": 2.798788070678711, "learning_rate": 7.692307692307694e-07, "loss": 0.7928, "step": 2500 }, { "epoch": 19.307692307692307, "grad_norm": 2.1275336742401123, "learning_rate": 6.923076923076924e-07, "loss": 0.7672, "step": 2510 }, { "epoch": 19.384615384615383, "grad_norm": 2.9216866493225098, "learning_rate": 6.153846153846155e-07, "loss": 0.7918, "step": 2520 }, { "epoch": 19.46153846153846, "grad_norm": 2.3012797832489014, "learning_rate": 5.384615384615386e-07, "loss": 0.7418, "step": 2530 }, { "epoch": 19.53846153846154, "grad_norm": 2.5353312492370605, "learning_rate": 4.615384615384616e-07, "loss": 0.8115, "step": 2540 }, { "epoch": 19.615384615384617, "grad_norm": 3.469372510910034, "learning_rate": 3.846153846153847e-07, "loss": 0.7698, "step": 2550 }, { "epoch": 19.692307692307693, "grad_norm": 2.3621013164520264, "learning_rate": 3.0769230769230774e-07, "loss": 0.6997, "step": 2560 }, { "epoch": 19.76923076923077, "grad_norm": 1.7231149673461914, "learning_rate": 2.307692307692308e-07, "loss": 0.7207, "step": 2570 }, { "epoch": 19.846153846153847, "grad_norm": 5.3792924880981445, "learning_rate": 1.5384615384615387e-07, "loss": 0.7656, "step": 2580 }, { "epoch": 19.923076923076923, "grad_norm": 1.9618691205978394, "learning_rate": 7.692307692307694e-08, "loss": 0.6919, "step": 2590 }, { "epoch": 20.0, "grad_norm": 4.051193714141846, "learning_rate": 0.0, "loss": 0.7916, "step": 2600 }, { "epoch": 20.0, "eval_accuracy": 0.8646616541353384, "eval_loss": 0.6874601244926453, "eval_runtime": 0.8096, "eval_samples_per_second": 164.271, "eval_steps_per_second": 20.997, "step": 2600 }, { "epoch": 20.0, "step": 2600, "total_flos": 2.0877820672794624e+17, "train_loss": 0.19350949709232038, "train_runtime": 49.806, "train_samples_per_second": 415.211, "train_steps_per_second": 52.203 } ], "logging_steps": 10, "max_steps": 2600, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0877820672794624e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }