|
{ |
|
"best_metric": 0.6874601244926453, |
|
"best_model_checkpoint": "./beans_outputs/checkpoint-2600", |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 2600, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.07692307692307693, |
|
"grad_norm": 2.136049747467041, |
|
"learning_rate": 1.98974358974359e-05, |
|
"loss": 1.1239, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"grad_norm": 1.8187670707702637, |
|
"learning_rate": 1.9794871794871798e-05, |
|
"loss": 1.1221, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.23076923076923078, |
|
"grad_norm": 2.0219993591308594, |
|
"learning_rate": 1.9692307692307696e-05, |
|
"loss": 1.1164, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 2.4619803428649902, |
|
"learning_rate": 1.958974358974359e-05, |
|
"loss": 1.1044, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.38461538461538464, |
|
"grad_norm": 1.6733014583587646, |
|
"learning_rate": 1.9487179487179488e-05, |
|
"loss": 1.1082, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"grad_norm": 1.4969494342803955, |
|
"learning_rate": 1.9384615384615386e-05, |
|
"loss": 1.1043, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5384615384615384, |
|
"grad_norm": 2.6347556114196777, |
|
"learning_rate": 1.9282051282051284e-05, |
|
"loss": 1.1028, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 2.5843420028686523, |
|
"learning_rate": 1.9179487179487182e-05, |
|
"loss": 1.0908, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.6923076923076923, |
|
"grad_norm": 2.0522916316986084, |
|
"learning_rate": 1.907692307692308e-05, |
|
"loss": 1.094, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 2.4885082244873047, |
|
"learning_rate": 1.8974358974358975e-05, |
|
"loss": 1.0912, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8461538461538461, |
|
"grad_norm": 1.7014166116714478, |
|
"learning_rate": 1.8871794871794873e-05, |
|
"loss": 1.0949, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 2.294283866882324, |
|
"learning_rate": 1.876923076923077e-05, |
|
"loss": 1.0992, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 4.129885673522949, |
|
"learning_rate": 1.866666666666667e-05, |
|
"loss": 1.0864, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.42857142857142855, |
|
"eval_loss": 1.0877832174301147, |
|
"eval_runtime": 0.7833, |
|
"eval_samples_per_second": 169.792, |
|
"eval_steps_per_second": 21.703, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.0769230769230769, |
|
"grad_norm": 2.331717014312744, |
|
"learning_rate": 1.8564102564102567e-05, |
|
"loss": 1.0774, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.1538461538461537, |
|
"grad_norm": 2.5262138843536377, |
|
"learning_rate": 1.8461538461538465e-05, |
|
"loss": 1.0719, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"grad_norm": 1.5971320867538452, |
|
"learning_rate": 1.835897435897436e-05, |
|
"loss": 1.0781, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.3076923076923077, |
|
"grad_norm": 2.383288860321045, |
|
"learning_rate": 1.8256410256410257e-05, |
|
"loss": 1.0929, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"grad_norm": 2.169706106185913, |
|
"learning_rate": 1.8153846153846155e-05, |
|
"loss": 1.0805, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.4615384615384617, |
|
"grad_norm": 2.1174418926239014, |
|
"learning_rate": 1.8051282051282053e-05, |
|
"loss": 1.08, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 1.7236179113388062, |
|
"learning_rate": 1.794871794871795e-05, |
|
"loss": 1.0766, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.6153846153846154, |
|
"grad_norm": 1.7772722244262695, |
|
"learning_rate": 1.784615384615385e-05, |
|
"loss": 1.0676, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.6923076923076923, |
|
"grad_norm": 3.53834867477417, |
|
"learning_rate": 1.7743589743589744e-05, |
|
"loss": 1.0695, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.7692307692307692, |
|
"grad_norm": 2.0417070388793945, |
|
"learning_rate": 1.7641025641025642e-05, |
|
"loss": 1.0706, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.8461538461538463, |
|
"grad_norm": 1.9734611511230469, |
|
"learning_rate": 1.753846153846154e-05, |
|
"loss": 1.0863, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"grad_norm": 1.9997600317001343, |
|
"learning_rate": 1.7435897435897438e-05, |
|
"loss": 1.068, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 3.0024373531341553, |
|
"learning_rate": 1.7333333333333336e-05, |
|
"loss": 1.0629, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.5488721804511278, |
|
"eval_loss": 1.0593525171279907, |
|
"eval_runtime": 0.7442, |
|
"eval_samples_per_second": 178.706, |
|
"eval_steps_per_second": 22.842, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.076923076923077, |
|
"grad_norm": 1.977807641029358, |
|
"learning_rate": 1.7230769230769234e-05, |
|
"loss": 1.0711, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.1538461538461537, |
|
"grad_norm": 2.3906102180480957, |
|
"learning_rate": 1.7128205128205128e-05, |
|
"loss": 1.0597, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.230769230769231, |
|
"grad_norm": 2.3670897483825684, |
|
"learning_rate": 1.7025641025641026e-05, |
|
"loss": 1.0576, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"grad_norm": 3.026155948638916, |
|
"learning_rate": 1.6923076923076924e-05, |
|
"loss": 1.0434, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.3846153846153846, |
|
"grad_norm": 1.9082350730895996, |
|
"learning_rate": 1.6820512820512822e-05, |
|
"loss": 1.0566, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.4615384615384617, |
|
"grad_norm": 1.9187153577804565, |
|
"learning_rate": 1.671794871794872e-05, |
|
"loss": 1.0476, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.5384615384615383, |
|
"grad_norm": 1.4435549974441528, |
|
"learning_rate": 1.6615384615384618e-05, |
|
"loss": 1.032, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.6153846153846154, |
|
"grad_norm": 2.1457245349884033, |
|
"learning_rate": 1.6512820512820513e-05, |
|
"loss": 1.0475, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.6923076923076925, |
|
"grad_norm": 1.9391709566116333, |
|
"learning_rate": 1.641025641025641e-05, |
|
"loss": 1.0486, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.769230769230769, |
|
"grad_norm": 1.8148127794265747, |
|
"learning_rate": 1.630769230769231e-05, |
|
"loss": 1.0407, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.8461538461538463, |
|
"grad_norm": 2.444157123565674, |
|
"learning_rate": 1.6205128205128207e-05, |
|
"loss": 1.0356, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.9230769230769234, |
|
"grad_norm": 1.9061695337295532, |
|
"learning_rate": 1.6102564102564105e-05, |
|
"loss": 1.0239, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 4.859686851501465, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 1.0434, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.6766917293233082, |
|
"eval_loss": 1.0230107307434082, |
|
"eval_runtime": 0.7471, |
|
"eval_samples_per_second": 178.027, |
|
"eval_steps_per_second": 22.755, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 2.2021689414978027, |
|
"learning_rate": 1.5897435897435897e-05, |
|
"loss": 1.0424, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.1538461538461537, |
|
"grad_norm": 1.8670283555984497, |
|
"learning_rate": 1.5794871794871795e-05, |
|
"loss": 1.0299, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.230769230769231, |
|
"grad_norm": 2.193986654281616, |
|
"learning_rate": 1.5692307692307693e-05, |
|
"loss": 1.0369, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.3076923076923075, |
|
"grad_norm": 2.26470685005188, |
|
"learning_rate": 1.558974358974359e-05, |
|
"loss": 1.0159, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.3846153846153846, |
|
"grad_norm": 2.18507981300354, |
|
"learning_rate": 1.548717948717949e-05, |
|
"loss": 1.0282, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.4615384615384617, |
|
"grad_norm": 1.8047341108322144, |
|
"learning_rate": 1.5384615384615387e-05, |
|
"loss": 1.0381, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.5384615384615383, |
|
"grad_norm": 2.6463704109191895, |
|
"learning_rate": 1.5282051282051282e-05, |
|
"loss": 1.0322, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.6153846153846154, |
|
"grad_norm": 1.6456215381622314, |
|
"learning_rate": 1.517948717948718e-05, |
|
"loss": 1.0049, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.6923076923076925, |
|
"grad_norm": 2.774256706237793, |
|
"learning_rate": 1.5076923076923078e-05, |
|
"loss": 1.0091, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.769230769230769, |
|
"grad_norm": 1.572251319885254, |
|
"learning_rate": 1.4974358974358976e-05, |
|
"loss": 0.998, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.8461538461538463, |
|
"grad_norm": 1.5640805959701538, |
|
"learning_rate": 1.4871794871794874e-05, |
|
"loss": 1.0222, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.9230769230769234, |
|
"grad_norm": 1.9231537580490112, |
|
"learning_rate": 1.4769230769230772e-05, |
|
"loss": 0.9979, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 5.481942176818848, |
|
"learning_rate": 1.4666666666666666e-05, |
|
"loss": 1.0214, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.6766917293233082, |
|
"eval_loss": 0.9964542388916016, |
|
"eval_runtime": 0.7616, |
|
"eval_samples_per_second": 174.638, |
|
"eval_steps_per_second": 22.322, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 4.076923076923077, |
|
"grad_norm": 2.7514402866363525, |
|
"learning_rate": 1.4564102564102564e-05, |
|
"loss": 1.0128, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 4.153846153846154, |
|
"grad_norm": 1.8411396741867065, |
|
"learning_rate": 1.4461538461538462e-05, |
|
"loss": 1.0145, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 4.230769230769231, |
|
"grad_norm": 2.670154571533203, |
|
"learning_rate": 1.435897435897436e-05, |
|
"loss": 1.0227, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 4.3076923076923075, |
|
"grad_norm": 2.1951498985290527, |
|
"learning_rate": 1.4256410256410258e-05, |
|
"loss": 1.0321, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 4.384615384615385, |
|
"grad_norm": 1.9692825078964233, |
|
"learning_rate": 1.4153846153846156e-05, |
|
"loss": 0.9829, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 4.461538461538462, |
|
"grad_norm": 2.611340284347534, |
|
"learning_rate": 1.405128205128205e-05, |
|
"loss": 0.9918, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 4.538461538461538, |
|
"grad_norm": 2.4288899898529053, |
|
"learning_rate": 1.3948717948717949e-05, |
|
"loss": 0.9879, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"grad_norm": 1.7537823915481567, |
|
"learning_rate": 1.3846153846153847e-05, |
|
"loss": 0.9793, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.6923076923076925, |
|
"grad_norm": 3.1588003635406494, |
|
"learning_rate": 1.3743589743589745e-05, |
|
"loss": 1.0002, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 4.769230769230769, |
|
"grad_norm": 2.2472622394561768, |
|
"learning_rate": 1.3641025641025643e-05, |
|
"loss": 1.0094, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 4.846153846153846, |
|
"grad_norm": 1.7958937883377075, |
|
"learning_rate": 1.353846153846154e-05, |
|
"loss": 0.9703, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 4.923076923076923, |
|
"grad_norm": 2.415766477584839, |
|
"learning_rate": 1.3435897435897435e-05, |
|
"loss": 0.9703, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 4.948933124542236, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 1.0026, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.7443609022556391, |
|
"eval_loss": 0.9569369554519653, |
|
"eval_runtime": 0.7647, |
|
"eval_samples_per_second": 173.928, |
|
"eval_steps_per_second": 22.231, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 5.076923076923077, |
|
"grad_norm": 2.1397032737731934, |
|
"learning_rate": 1.3230769230769231e-05, |
|
"loss": 0.9645, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 5.153846153846154, |
|
"grad_norm": 2.7277321815490723, |
|
"learning_rate": 1.312820512820513e-05, |
|
"loss": 1.0063, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 5.230769230769231, |
|
"grad_norm": 2.391350030899048, |
|
"learning_rate": 1.3025641025641027e-05, |
|
"loss": 0.9918, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 5.3076923076923075, |
|
"grad_norm": 2.751174211502075, |
|
"learning_rate": 1.2923076923076925e-05, |
|
"loss": 0.9849, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 5.384615384615385, |
|
"grad_norm": 2.77424693107605, |
|
"learning_rate": 1.2820512820512823e-05, |
|
"loss": 0.9745, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 5.461538461538462, |
|
"grad_norm": 1.9156702756881714, |
|
"learning_rate": 1.2717948717948718e-05, |
|
"loss": 0.9684, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 5.538461538461538, |
|
"grad_norm": 1.9521454572677612, |
|
"learning_rate": 1.2615384615384616e-05, |
|
"loss": 0.9503, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 5.615384615384615, |
|
"grad_norm": 2.468419313430786, |
|
"learning_rate": 1.2512820512820514e-05, |
|
"loss": 0.9641, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 5.6923076923076925, |
|
"grad_norm": 2.520923614501953, |
|
"learning_rate": 1.2410256410256412e-05, |
|
"loss": 0.9471, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 5.769230769230769, |
|
"grad_norm": 2.1003758907318115, |
|
"learning_rate": 1.230769230769231e-05, |
|
"loss": 0.9513, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 5.846153846153846, |
|
"grad_norm": 2.192279100418091, |
|
"learning_rate": 1.2205128205128208e-05, |
|
"loss": 0.9527, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 5.923076923076923, |
|
"grad_norm": 3.8428618907928467, |
|
"learning_rate": 1.2102564102564102e-05, |
|
"loss": 0.938, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 4.9151530265808105, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.9753, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.7819548872180451, |
|
"eval_loss": 0.9288201332092285, |
|
"eval_runtime": 0.7499, |
|
"eval_samples_per_second": 177.349, |
|
"eval_steps_per_second": 22.669, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 6.076923076923077, |
|
"grad_norm": 2.7967398166656494, |
|
"learning_rate": 1.1897435897435898e-05, |
|
"loss": 0.9428, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 6.153846153846154, |
|
"grad_norm": 2.5342345237731934, |
|
"learning_rate": 1.1794871794871796e-05, |
|
"loss": 0.9406, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 6.230769230769231, |
|
"grad_norm": 1.877543330192566, |
|
"learning_rate": 1.1692307692307694e-05, |
|
"loss": 0.9319, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 6.3076923076923075, |
|
"grad_norm": 2.4524621963500977, |
|
"learning_rate": 1.1589743589743592e-05, |
|
"loss": 0.9332, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 6.384615384615385, |
|
"grad_norm": 2.4967362880706787, |
|
"learning_rate": 1.1487179487179487e-05, |
|
"loss": 0.9367, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 6.461538461538462, |
|
"grad_norm": 3.2078776359558105, |
|
"learning_rate": 1.1384615384615385e-05, |
|
"loss": 0.9339, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 6.538461538461538, |
|
"grad_norm": 2.926706075668335, |
|
"learning_rate": 1.1282051282051283e-05, |
|
"loss": 0.9416, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 6.615384615384615, |
|
"grad_norm": 1.8625017404556274, |
|
"learning_rate": 1.117948717948718e-05, |
|
"loss": 0.9111, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 6.6923076923076925, |
|
"grad_norm": 2.7141189575195312, |
|
"learning_rate": 1.1076923076923079e-05, |
|
"loss": 0.9574, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 6.769230769230769, |
|
"grad_norm": 2.307347536087036, |
|
"learning_rate": 1.0974358974358977e-05, |
|
"loss": 0.9259, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 6.846153846153846, |
|
"grad_norm": 2.3937132358551025, |
|
"learning_rate": 1.0871794871794871e-05, |
|
"loss": 0.9207, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 6.923076923076923, |
|
"grad_norm": 3.0794668197631836, |
|
"learning_rate": 1.076923076923077e-05, |
|
"loss": 0.9418, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 4.111669063568115, |
|
"learning_rate": 1.0666666666666667e-05, |
|
"loss": 0.9252, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.7969924812030075, |
|
"eval_loss": 0.8874692916870117, |
|
"eval_runtime": 0.7823, |
|
"eval_samples_per_second": 170.013, |
|
"eval_steps_per_second": 21.731, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 7.076923076923077, |
|
"grad_norm": 2.7561662197113037, |
|
"learning_rate": 1.0564102564102565e-05, |
|
"loss": 0.911, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 7.153846153846154, |
|
"grad_norm": 3.2020223140716553, |
|
"learning_rate": 1.0461538461538463e-05, |
|
"loss": 0.912, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 7.230769230769231, |
|
"grad_norm": 3.459304094314575, |
|
"learning_rate": 1.0358974358974361e-05, |
|
"loss": 0.8994, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 7.3076923076923075, |
|
"grad_norm": 2.774078369140625, |
|
"learning_rate": 1.0256410256410256e-05, |
|
"loss": 0.9079, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 7.384615384615385, |
|
"grad_norm": 2.7169668674468994, |
|
"learning_rate": 1.0153846153846154e-05, |
|
"loss": 0.9256, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 7.461538461538462, |
|
"grad_norm": 2.171323299407959, |
|
"learning_rate": 1.0051282051282052e-05, |
|
"loss": 0.8898, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 7.538461538461538, |
|
"grad_norm": 2.7350351810455322, |
|
"learning_rate": 9.94871794871795e-06, |
|
"loss": 0.9243, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 7.615384615384615, |
|
"grad_norm": 2.3926539421081543, |
|
"learning_rate": 9.846153846153848e-06, |
|
"loss": 0.8868, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 7.6923076923076925, |
|
"grad_norm": 2.0602715015411377, |
|
"learning_rate": 9.743589743589744e-06, |
|
"loss": 0.8837, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 7.769230769230769, |
|
"grad_norm": 2.885303497314453, |
|
"learning_rate": 9.641025641025642e-06, |
|
"loss": 0.8827, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 7.846153846153846, |
|
"grad_norm": 2.261361837387085, |
|
"learning_rate": 9.53846153846154e-06, |
|
"loss": 0.9047, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 7.923076923076923, |
|
"grad_norm": 2.6180179119110107, |
|
"learning_rate": 9.435897435897436e-06, |
|
"loss": 0.861, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 4.225304126739502, |
|
"learning_rate": 9.333333333333334e-06, |
|
"loss": 0.9192, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.8120300751879699, |
|
"eval_loss": 0.850643515586853, |
|
"eval_runtime": 0.756, |
|
"eval_samples_per_second": 175.92, |
|
"eval_steps_per_second": 22.486, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 8.076923076923077, |
|
"grad_norm": 2.1875813007354736, |
|
"learning_rate": 9.230769230769232e-06, |
|
"loss": 0.8953, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 8.153846153846153, |
|
"grad_norm": 2.1640567779541016, |
|
"learning_rate": 9.128205128205129e-06, |
|
"loss": 0.8658, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 8.23076923076923, |
|
"grad_norm": 2.660614490509033, |
|
"learning_rate": 9.025641025641027e-06, |
|
"loss": 0.8995, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 8.307692307692308, |
|
"grad_norm": 2.104029417037964, |
|
"learning_rate": 8.923076923076925e-06, |
|
"loss": 0.8569, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 8.384615384615385, |
|
"grad_norm": 2.2643303871154785, |
|
"learning_rate": 8.820512820512821e-06, |
|
"loss": 0.8972, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 8.461538461538462, |
|
"grad_norm": 2.632410764694214, |
|
"learning_rate": 8.717948717948719e-06, |
|
"loss": 0.8715, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 8.538461538461538, |
|
"grad_norm": 1.6500084400177002, |
|
"learning_rate": 8.615384615384617e-06, |
|
"loss": 0.8716, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 8.615384615384615, |
|
"grad_norm": 6.204855442047119, |
|
"learning_rate": 8.512820512820513e-06, |
|
"loss": 0.8985, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 8.692307692307692, |
|
"grad_norm": 3.729611873626709, |
|
"learning_rate": 8.410256410256411e-06, |
|
"loss": 0.8837, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 8.76923076923077, |
|
"grad_norm": 3.685739278793335, |
|
"learning_rate": 8.307692307692309e-06, |
|
"loss": 0.8865, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 8.846153846153847, |
|
"grad_norm": 2.7028560638427734, |
|
"learning_rate": 8.205128205128205e-06, |
|
"loss": 0.875, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 8.923076923076923, |
|
"grad_norm": 2.7692482471466064, |
|
"learning_rate": 8.102564102564103e-06, |
|
"loss": 0.8867, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 3.9854462146759033, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.9008, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.8045112781954887, |
|
"eval_loss": 0.8337866067886353, |
|
"eval_runtime": 0.7963, |
|
"eval_samples_per_second": 167.03, |
|
"eval_steps_per_second": 21.35, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 9.076923076923077, |
|
"grad_norm": 1.9381572008132935, |
|
"learning_rate": 7.897435897435898e-06, |
|
"loss": 0.8969, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 9.153846153846153, |
|
"grad_norm": 2.219219446182251, |
|
"learning_rate": 7.794871794871796e-06, |
|
"loss": 0.8412, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 9.23076923076923, |
|
"grad_norm": 2.1302294731140137, |
|
"learning_rate": 7.692307692307694e-06, |
|
"loss": 0.8483, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 9.307692307692308, |
|
"grad_norm": 2.541210174560547, |
|
"learning_rate": 7.58974358974359e-06, |
|
"loss": 0.8536, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 9.384615384615385, |
|
"grad_norm": 1.952871322631836, |
|
"learning_rate": 7.487179487179488e-06, |
|
"loss": 0.8707, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 9.461538461538462, |
|
"grad_norm": 3.273028612136841, |
|
"learning_rate": 7.384615384615386e-06, |
|
"loss": 0.8547, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 9.538461538461538, |
|
"grad_norm": 2.6495628356933594, |
|
"learning_rate": 7.282051282051282e-06, |
|
"loss": 0.8709, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 9.615384615384615, |
|
"grad_norm": 1.998024582862854, |
|
"learning_rate": 7.17948717948718e-06, |
|
"loss": 0.8278, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 9.692307692307692, |
|
"grad_norm": 2.7621707916259766, |
|
"learning_rate": 7.076923076923078e-06, |
|
"loss": 0.8544, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 9.76923076923077, |
|
"grad_norm": 1.844375491142273, |
|
"learning_rate": 6.974358974358974e-06, |
|
"loss": 0.8324, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 9.846153846153847, |
|
"grad_norm": 2.149479866027832, |
|
"learning_rate": 6.871794871794872e-06, |
|
"loss": 0.8146, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 9.923076923076923, |
|
"grad_norm": 2.2224795818328857, |
|
"learning_rate": 6.76923076923077e-06, |
|
"loss": 0.8367, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 3.8497843742370605, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.8079, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.8421052631578947, |
|
"eval_loss": 0.8103837370872498, |
|
"eval_runtime": 0.7593, |
|
"eval_samples_per_second": 175.164, |
|
"eval_steps_per_second": 22.389, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 10.076923076923077, |
|
"grad_norm": 2.0343823432922363, |
|
"learning_rate": 6.564102564102565e-06, |
|
"loss": 0.8408, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 10.153846153846153, |
|
"grad_norm": 2.4245193004608154, |
|
"learning_rate": 6.461538461538463e-06, |
|
"loss": 0.899, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 10.23076923076923, |
|
"grad_norm": 2.3912925720214844, |
|
"learning_rate": 6.358974358974359e-06, |
|
"loss": 0.8758, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 10.307692307692308, |
|
"grad_norm": 2.1387076377868652, |
|
"learning_rate": 6.256410256410257e-06, |
|
"loss": 0.8295, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 10.384615384615385, |
|
"grad_norm": 2.142160415649414, |
|
"learning_rate": 6.153846153846155e-06, |
|
"loss": 0.8075, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 10.461538461538462, |
|
"grad_norm": 2.6838831901550293, |
|
"learning_rate": 6.051282051282051e-06, |
|
"loss": 0.8448, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 10.538461538461538, |
|
"grad_norm": 2.476369857788086, |
|
"learning_rate": 5.948717948717949e-06, |
|
"loss": 0.817, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 10.615384615384615, |
|
"grad_norm": 3.031463861465454, |
|
"learning_rate": 5.846153846153847e-06, |
|
"loss": 0.8177, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 10.692307692307692, |
|
"grad_norm": 2.2818636894226074, |
|
"learning_rate": 5.743589743589743e-06, |
|
"loss": 0.8124, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 10.76923076923077, |
|
"grad_norm": 3.245805263519287, |
|
"learning_rate": 5.641025641025641e-06, |
|
"loss": 0.8674, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 10.846153846153847, |
|
"grad_norm": 2.194627046585083, |
|
"learning_rate": 5.538461538461539e-06, |
|
"loss": 0.831, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 10.923076923076923, |
|
"grad_norm": 1.8149436712265015, |
|
"learning_rate": 5.435897435897436e-06, |
|
"loss": 0.8391, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 4.0584821701049805, |
|
"learning_rate": 5.333333333333334e-06, |
|
"loss": 0.8332, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.8345864661654135, |
|
"eval_loss": 0.7806060314178467, |
|
"eval_runtime": 0.742, |
|
"eval_samples_per_second": 179.256, |
|
"eval_steps_per_second": 22.912, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 11.076923076923077, |
|
"grad_norm": 1.9833248853683472, |
|
"learning_rate": 5.230769230769232e-06, |
|
"loss": 0.8484, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 11.153846153846153, |
|
"grad_norm": 5.478232383728027, |
|
"learning_rate": 5.128205128205128e-06, |
|
"loss": 0.8308, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 11.23076923076923, |
|
"grad_norm": 2.5792922973632812, |
|
"learning_rate": 5.025641025641026e-06, |
|
"loss": 0.802, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 11.307692307692308, |
|
"grad_norm": 2.730989694595337, |
|
"learning_rate": 4.923076923076924e-06, |
|
"loss": 0.8225, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 11.384615384615385, |
|
"grad_norm": 2.7447853088378906, |
|
"learning_rate": 4.820512820512821e-06, |
|
"loss": 0.8176, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 11.461538461538462, |
|
"grad_norm": 2.6465837955474854, |
|
"learning_rate": 4.717948717948718e-06, |
|
"loss": 0.8471, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 11.538461538461538, |
|
"grad_norm": 2.4876015186309814, |
|
"learning_rate": 4.615384615384616e-06, |
|
"loss": 0.8349, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 11.615384615384615, |
|
"grad_norm": 3.2605788707733154, |
|
"learning_rate": 4.512820512820513e-06, |
|
"loss": 0.8285, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 11.692307692307692, |
|
"grad_norm": 3.278341293334961, |
|
"learning_rate": 4.4102564102564104e-06, |
|
"loss": 0.8546, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 11.76923076923077, |
|
"grad_norm": 2.0945637226104736, |
|
"learning_rate": 4.307692307692308e-06, |
|
"loss": 0.8096, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 11.846153846153847, |
|
"grad_norm": 2.161726474761963, |
|
"learning_rate": 4.2051282051282055e-06, |
|
"loss": 0.7938, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 11.923076923076923, |
|
"grad_norm": 2.1052703857421875, |
|
"learning_rate": 4.102564102564103e-06, |
|
"loss": 0.8295, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 3.460094451904297, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.8103, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.8345864661654135, |
|
"eval_loss": 0.7585543990135193, |
|
"eval_runtime": 0.7508, |
|
"eval_samples_per_second": 177.133, |
|
"eval_steps_per_second": 22.641, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 12.076923076923077, |
|
"grad_norm": 2.943866014480591, |
|
"learning_rate": 3.897435897435898e-06, |
|
"loss": 0.7903, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 12.153846153846153, |
|
"grad_norm": 2.6185402870178223, |
|
"learning_rate": 3.794871794871795e-06, |
|
"loss": 0.8229, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 12.23076923076923, |
|
"grad_norm": 1.6378310918807983, |
|
"learning_rate": 3.692307692307693e-06, |
|
"loss": 0.8246, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 12.307692307692308, |
|
"grad_norm": 2.3109569549560547, |
|
"learning_rate": 3.58974358974359e-06, |
|
"loss": 0.8363, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 12.384615384615385, |
|
"grad_norm": 2.3602941036224365, |
|
"learning_rate": 3.487179487179487e-06, |
|
"loss": 0.8078, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 12.461538461538462, |
|
"grad_norm": 3.0623390674591064, |
|
"learning_rate": 3.384615384615385e-06, |
|
"loss": 0.794, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 12.538461538461538, |
|
"grad_norm": 2.947983741760254, |
|
"learning_rate": 3.2820512820512823e-06, |
|
"loss": 0.8033, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 12.615384615384615, |
|
"grad_norm": 1.8083330392837524, |
|
"learning_rate": 3.1794871794871795e-06, |
|
"loss": 0.8158, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 12.692307692307692, |
|
"grad_norm": 3.2873637676239014, |
|
"learning_rate": 3.0769230769230774e-06, |
|
"loss": 0.7651, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 12.76923076923077, |
|
"grad_norm": 2.3777670860290527, |
|
"learning_rate": 2.9743589743589746e-06, |
|
"loss": 0.8566, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 12.846153846153847, |
|
"grad_norm": 1.8692084550857544, |
|
"learning_rate": 2.8717948717948717e-06, |
|
"loss": 0.8218, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 12.923076923076923, |
|
"grad_norm": 2.2379138469696045, |
|
"learning_rate": 2.7692307692307697e-06, |
|
"loss": 0.7984, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"grad_norm": 4.131476879119873, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 0.8149, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.8421052631578947, |
|
"eval_loss": 0.757113516330719, |
|
"eval_runtime": 0.7762, |
|
"eval_samples_per_second": 171.337, |
|
"eval_steps_per_second": 21.9, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 13.076923076923077, |
|
"grad_norm": 2.9936656951904297, |
|
"learning_rate": 2.564102564102564e-06, |
|
"loss": 0.7917, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 13.153846153846153, |
|
"grad_norm": 2.5392699241638184, |
|
"learning_rate": 2.461538461538462e-06, |
|
"loss": 0.8241, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 13.23076923076923, |
|
"grad_norm": 3.0166265964508057, |
|
"learning_rate": 2.358974358974359e-06, |
|
"loss": 0.8117, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 13.307692307692308, |
|
"grad_norm": 1.8728867769241333, |
|
"learning_rate": 2.2564102564102566e-06, |
|
"loss": 0.8155, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 13.384615384615385, |
|
"grad_norm": 2.50715708732605, |
|
"learning_rate": 2.153846153846154e-06, |
|
"loss": 0.7814, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 13.461538461538462, |
|
"grad_norm": 5.447348594665527, |
|
"learning_rate": 2.0512820512820513e-06, |
|
"loss": 0.8253, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 13.538461538461538, |
|
"grad_norm": 2.6522035598754883, |
|
"learning_rate": 1.948717948717949e-06, |
|
"loss": 0.8486, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 13.615384615384615, |
|
"grad_norm": 2.1300199031829834, |
|
"learning_rate": 1.8461538461538465e-06, |
|
"loss": 0.8027, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 13.692307692307692, |
|
"grad_norm": 2.1135923862457275, |
|
"learning_rate": 1.7435897435897436e-06, |
|
"loss": 0.7852, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 13.76923076923077, |
|
"grad_norm": 1.871300220489502, |
|
"learning_rate": 1.6410256410256412e-06, |
|
"loss": 0.8224, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 13.846153846153847, |
|
"grad_norm": 3.240356206893921, |
|
"learning_rate": 1.5384615384615387e-06, |
|
"loss": 0.7895, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 13.923076923076923, |
|
"grad_norm": 2.5182340145111084, |
|
"learning_rate": 1.4358974358974359e-06, |
|
"loss": 0.7316, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 4.281803607940674, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 0.8186, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.8270676691729323, |
|
"eval_loss": 0.7540305852890015, |
|
"eval_runtime": 0.7703, |
|
"eval_samples_per_second": 172.654, |
|
"eval_steps_per_second": 22.069, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 14.076923076923077, |
|
"grad_norm": 2.050518751144409, |
|
"learning_rate": 1.230769230769231e-06, |
|
"loss": 0.8222, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 14.153846153846153, |
|
"grad_norm": 2.051259994506836, |
|
"learning_rate": 1.1282051282051283e-06, |
|
"loss": 0.7878, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 14.23076923076923, |
|
"grad_norm": 2.8861193656921387, |
|
"learning_rate": 1.0256410256410257e-06, |
|
"loss": 0.78, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 14.307692307692308, |
|
"grad_norm": 4.159270763397217, |
|
"learning_rate": 9.230769230769232e-07, |
|
"loss": 0.774, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 14.384615384615385, |
|
"grad_norm": 2.8624985218048096, |
|
"learning_rate": 8.205128205128206e-07, |
|
"loss": 0.7882, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 14.461538461538462, |
|
"grad_norm": 2.5051703453063965, |
|
"learning_rate": 7.179487179487179e-07, |
|
"loss": 0.7883, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 14.538461538461538, |
|
"grad_norm": 3.003545045852661, |
|
"learning_rate": 6.153846153846155e-07, |
|
"loss": 0.7817, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 14.615384615384615, |
|
"grad_norm": 2.8403878211975098, |
|
"learning_rate": 5.128205128205128e-07, |
|
"loss": 0.8294, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 14.692307692307692, |
|
"grad_norm": 2.124030590057373, |
|
"learning_rate": 4.102564102564103e-07, |
|
"loss": 0.7978, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 14.76923076923077, |
|
"grad_norm": 4.762181758880615, |
|
"learning_rate": 3.0769230769230774e-07, |
|
"loss": 0.8038, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 14.846153846153847, |
|
"grad_norm": 3.256133794784546, |
|
"learning_rate": 2.0512820512820514e-07, |
|
"loss": 0.8535, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 14.923076923076923, |
|
"grad_norm": 2.355344772338867, |
|
"learning_rate": 1.0256410256410257e-07, |
|
"loss": 0.7587, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 4.202574729919434, |
|
"learning_rate": 0.0, |
|
"loss": 0.7929, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.8120300751879699, |
|
"eval_loss": 0.7412300109863281, |
|
"eval_runtime": 0.8087, |
|
"eval_samples_per_second": 164.47, |
|
"eval_steps_per_second": 21.022, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 15.076923076923077, |
|
"grad_norm": 3.3559834957122803, |
|
"learning_rate": 4.923076923076924e-06, |
|
"loss": 0.7535, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 15.153846153846153, |
|
"grad_norm": 1.8613739013671875, |
|
"learning_rate": 4.8461538461538465e-06, |
|
"loss": 0.7581, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 15.23076923076923, |
|
"grad_norm": 2.3707966804504395, |
|
"learning_rate": 4.76923076923077e-06, |
|
"loss": 0.7836, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 15.307692307692308, |
|
"grad_norm": 2.6265199184417725, |
|
"learning_rate": 4.692307692307693e-06, |
|
"loss": 0.8334, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 15.384615384615385, |
|
"grad_norm": 2.078848123550415, |
|
"learning_rate": 4.615384615384616e-06, |
|
"loss": 0.7772, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 15.461538461538462, |
|
"grad_norm": 2.6433162689208984, |
|
"learning_rate": 4.538461538461539e-06, |
|
"loss": 0.7955, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 15.538461538461538, |
|
"grad_norm": 3.458962917327881, |
|
"learning_rate": 4.461538461538462e-06, |
|
"loss": 0.787, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 15.615384615384615, |
|
"grad_norm": 5.090147495269775, |
|
"learning_rate": 4.384615384615385e-06, |
|
"loss": 0.7875, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 15.692307692307692, |
|
"grad_norm": 1.9066407680511475, |
|
"learning_rate": 4.307692307692308e-06, |
|
"loss": 0.7764, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 15.76923076923077, |
|
"grad_norm": 3.097341299057007, |
|
"learning_rate": 4.230769230769231e-06, |
|
"loss": 0.7335, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 15.846153846153847, |
|
"grad_norm": 2.7201600074768066, |
|
"learning_rate": 4.1538461538461545e-06, |
|
"loss": 0.7747, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 15.923076923076923, |
|
"grad_norm": 2.303032398223877, |
|
"learning_rate": 4.076923076923077e-06, |
|
"loss": 0.7738, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 4.420492172241211, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.774, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.849624060150376, |
|
"eval_loss": 0.7370420694351196, |
|
"eval_runtime": 0.776, |
|
"eval_samples_per_second": 171.384, |
|
"eval_steps_per_second": 21.906, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 16.076923076923077, |
|
"grad_norm": 3.9969003200531006, |
|
"learning_rate": 3.923076923076923e-06, |
|
"loss": 0.8316, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 16.153846153846153, |
|
"grad_norm": 2.3731822967529297, |
|
"learning_rate": 3.846153846153847e-06, |
|
"loss": 0.8162, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 16.23076923076923, |
|
"grad_norm": 2.232074737548828, |
|
"learning_rate": 3.7692307692307694e-06, |
|
"loss": 0.8138, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 16.307692307692307, |
|
"grad_norm": 2.8799118995666504, |
|
"learning_rate": 3.692307692307693e-06, |
|
"loss": 0.8434, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 16.384615384615383, |
|
"grad_norm": 2.2093818187713623, |
|
"learning_rate": 3.6153846153846156e-06, |
|
"loss": 0.7886, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 16.46153846153846, |
|
"grad_norm": 1.984840750694275, |
|
"learning_rate": 3.538461538461539e-06, |
|
"loss": 0.7682, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 16.53846153846154, |
|
"grad_norm": 2.711601495742798, |
|
"learning_rate": 3.4615384615384617e-06, |
|
"loss": 0.7471, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 16.615384615384617, |
|
"grad_norm": 2.130311965942383, |
|
"learning_rate": 3.384615384615385e-06, |
|
"loss": 0.7535, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 16.692307692307693, |
|
"grad_norm": 2.327207565307617, |
|
"learning_rate": 3.307692307692308e-06, |
|
"loss": 0.718, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 16.76923076923077, |
|
"grad_norm": 2.198944091796875, |
|
"learning_rate": 3.2307692307692313e-06, |
|
"loss": 0.8146, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 16.846153846153847, |
|
"grad_norm": 2.388453483581543, |
|
"learning_rate": 3.153846153846154e-06, |
|
"loss": 0.8368, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 16.923076923076923, |
|
"grad_norm": 2.2575690746307373, |
|
"learning_rate": 3.0769230769230774e-06, |
|
"loss": 0.749, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"grad_norm": 6.020498275756836, |
|
"learning_rate": 3e-06, |
|
"loss": 0.7613, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.849624060150376, |
|
"eval_loss": 0.7059224247932434, |
|
"eval_runtime": 0.7496, |
|
"eval_samples_per_second": 177.439, |
|
"eval_steps_per_second": 22.68, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 17.076923076923077, |
|
"grad_norm": 3.134481430053711, |
|
"learning_rate": 2.9230769230769236e-06, |
|
"loss": 0.7609, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 17.153846153846153, |
|
"grad_norm": 2.0070559978485107, |
|
"learning_rate": 2.846153846153846e-06, |
|
"loss": 0.7483, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 17.23076923076923, |
|
"grad_norm": 3.491682291030884, |
|
"learning_rate": 2.7692307692307697e-06, |
|
"loss": 0.7696, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 17.307692307692307, |
|
"grad_norm": 1.9866397380828857, |
|
"learning_rate": 2.6923076923076923e-06, |
|
"loss": 0.7609, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 17.384615384615383, |
|
"grad_norm": 3.458582878112793, |
|
"learning_rate": 2.615384615384616e-06, |
|
"loss": 0.7813, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 17.46153846153846, |
|
"grad_norm": 2.1126835346221924, |
|
"learning_rate": 2.5384615384615385e-06, |
|
"loss": 0.7003, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 17.53846153846154, |
|
"grad_norm": 3.5276880264282227, |
|
"learning_rate": 2.461538461538462e-06, |
|
"loss": 0.8305, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 17.615384615384617, |
|
"grad_norm": 2.3967173099517822, |
|
"learning_rate": 2.384615384615385e-06, |
|
"loss": 0.7627, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 17.692307692307693, |
|
"grad_norm": 4.473978042602539, |
|
"learning_rate": 2.307692307692308e-06, |
|
"loss": 0.7332, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 17.76923076923077, |
|
"grad_norm": 2.1642568111419678, |
|
"learning_rate": 2.230769230769231e-06, |
|
"loss": 0.7678, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 17.846153846153847, |
|
"grad_norm": 3.03192138671875, |
|
"learning_rate": 2.153846153846154e-06, |
|
"loss": 0.7565, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 17.923076923076923, |
|
"grad_norm": 2.9610419273376465, |
|
"learning_rate": 2.0769230769230773e-06, |
|
"loss": 0.7651, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 4.160178184509277, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.7778, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.8270676691729323, |
|
"eval_loss": 0.6930322647094727, |
|
"eval_runtime": 0.7854, |
|
"eval_samples_per_second": 169.332, |
|
"eval_steps_per_second": 21.644, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 18.076923076923077, |
|
"grad_norm": 2.168921947479248, |
|
"learning_rate": 1.9230769230769234e-06, |
|
"loss": 0.7234, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 18.153846153846153, |
|
"grad_norm": 3.935608386993408, |
|
"learning_rate": 1.8461538461538465e-06, |
|
"loss": 0.8192, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 18.23076923076923, |
|
"grad_norm": 1.8215328454971313, |
|
"learning_rate": 1.7692307692307695e-06, |
|
"loss": 0.7271, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 18.307692307692307, |
|
"grad_norm": 2.687016010284424, |
|
"learning_rate": 1.6923076923076926e-06, |
|
"loss": 0.8063, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 18.384615384615383, |
|
"grad_norm": 2.3364577293395996, |
|
"learning_rate": 1.6153846153846157e-06, |
|
"loss": 0.7699, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 18.46153846153846, |
|
"grad_norm": 2.7465319633483887, |
|
"learning_rate": 1.5384615384615387e-06, |
|
"loss": 0.8214, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 18.53846153846154, |
|
"grad_norm": 3.3499436378479004, |
|
"learning_rate": 1.4615384615384618e-06, |
|
"loss": 0.7432, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 18.615384615384617, |
|
"grad_norm": 3.7266149520874023, |
|
"learning_rate": 1.3846153846153848e-06, |
|
"loss": 0.797, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 18.692307692307693, |
|
"grad_norm": 2.661741256713867, |
|
"learning_rate": 1.307692307692308e-06, |
|
"loss": 0.7404, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 18.76923076923077, |
|
"grad_norm": 3.166747808456421, |
|
"learning_rate": 1.230769230769231e-06, |
|
"loss": 0.8197, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 18.846153846153847, |
|
"grad_norm": 3.200448989868164, |
|
"learning_rate": 1.153846153846154e-06, |
|
"loss": 0.8068, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 18.923076923076923, |
|
"grad_norm": 2.4404191970825195, |
|
"learning_rate": 1.076923076923077e-06, |
|
"loss": 0.788, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"grad_norm": 3.8639049530029297, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.8081, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.8646616541353384, |
|
"eval_loss": 0.6890266537666321, |
|
"eval_runtime": 0.7797, |
|
"eval_samples_per_second": 170.576, |
|
"eval_steps_per_second": 21.803, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 19.076923076923077, |
|
"grad_norm": 1.7245137691497803, |
|
"learning_rate": 9.230769230769232e-07, |
|
"loss": 0.7929, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 19.153846153846153, |
|
"grad_norm": 3.7959182262420654, |
|
"learning_rate": 8.461538461538463e-07, |
|
"loss": 0.7397, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 19.23076923076923, |
|
"grad_norm": 2.798788070678711, |
|
"learning_rate": 7.692307692307694e-07, |
|
"loss": 0.7928, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 19.307692307692307, |
|
"grad_norm": 2.1275336742401123, |
|
"learning_rate": 6.923076923076924e-07, |
|
"loss": 0.7672, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 19.384615384615383, |
|
"grad_norm": 2.9216866493225098, |
|
"learning_rate": 6.153846153846155e-07, |
|
"loss": 0.7918, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 19.46153846153846, |
|
"grad_norm": 2.3012797832489014, |
|
"learning_rate": 5.384615384615386e-07, |
|
"loss": 0.7418, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 19.53846153846154, |
|
"grad_norm": 2.5353312492370605, |
|
"learning_rate": 4.615384615384616e-07, |
|
"loss": 0.8115, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 19.615384615384617, |
|
"grad_norm": 3.469372510910034, |
|
"learning_rate": 3.846153846153847e-07, |
|
"loss": 0.7698, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 19.692307692307693, |
|
"grad_norm": 2.3621013164520264, |
|
"learning_rate": 3.0769230769230774e-07, |
|
"loss": 0.6997, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 19.76923076923077, |
|
"grad_norm": 1.7231149673461914, |
|
"learning_rate": 2.307692307692308e-07, |
|
"loss": 0.7207, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 19.846153846153847, |
|
"grad_norm": 5.3792924880981445, |
|
"learning_rate": 1.5384615384615387e-07, |
|
"loss": 0.7656, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 19.923076923076923, |
|
"grad_norm": 1.9618691205978394, |
|
"learning_rate": 7.692307692307694e-08, |
|
"loss": 0.6919, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 4.051193714141846, |
|
"learning_rate": 0.0, |
|
"loss": 0.7916, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.8646616541353384, |
|
"eval_loss": 0.6874601244926453, |
|
"eval_runtime": 0.8096, |
|
"eval_samples_per_second": 164.271, |
|
"eval_steps_per_second": 20.997, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 2600, |
|
"total_flos": 2.0877820672794624e+17, |
|
"train_loss": 0.19350949709232038, |
|
"train_runtime": 49.806, |
|
"train_samples_per_second": 415.211, |
|
"train_steps_per_second": 52.203 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2600, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.0877820672794624e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|