|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.0, |
|
"eval_steps": 100, |
|
"global_step": 2310, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.021645021645021644, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9442, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04329004329004329, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6729, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06493506493506493, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00015, |
|
"loss": 0.5235, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08658008658008658, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3883, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10822510822510822, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.00025, |
|
"loss": 0.3269, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.12987012987012986, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.0003, |
|
"loss": 0.2779, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.15151515151515152, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00035, |
|
"loss": 0.2607, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.17316017316017315, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.0004, |
|
"loss": 0.246, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.19480519480519481, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.00045000000000000004, |
|
"loss": 0.2469, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.21645021645021645, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.0005, |
|
"loss": 0.2327, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.21645021645021645, |
|
"eval_loss": 0.1517176628112793, |
|
"eval_runtime": 23.1527, |
|
"eval_samples_per_second": 21.596, |
|
"eval_steps_per_second": 0.691, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.23809523809523808, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.000497737556561086, |
|
"loss": 0.231, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2597402597402597, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.000495475113122172, |
|
"loss": 0.214, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2813852813852814, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.000493212669683258, |
|
"loss": 0.2164, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.30303030303030304, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.0004909502262443439, |
|
"loss": 0.2327, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3246753246753247, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.0004886877828054299, |
|
"loss": 0.2104, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3463203463203463, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.00048642533936651587, |
|
"loss": 0.2249, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.36796536796536794, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0004841628959276018, |
|
"loss": 0.2179, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.38961038961038963, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.0004819004524886878, |
|
"loss": 0.2174, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.41125541125541126, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.0004796380090497738, |
|
"loss": 0.2159, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4329004329004329, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.0004773755656108598, |
|
"loss": 0.2166, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4329004329004329, |
|
"eval_loss": 0.13433273136615753, |
|
"eval_runtime": 18.3553, |
|
"eval_samples_per_second": 27.24, |
|
"eval_steps_per_second": 0.872, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.45454545454545453, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 0.00047511312217194567, |
|
"loss": 0.2232, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.47619047619047616, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.00047285067873303167, |
|
"loss": 0.2034, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.49783549783549785, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00047058823529411766, |
|
"loss": 0.2222, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5194805194805194, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.00046832579185520365, |
|
"loss": 0.2043, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5411255411255411, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.0004660633484162896, |
|
"loss": 0.2054, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5627705627705628, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.0004638009049773756, |
|
"loss": 0.1922, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.5844155844155844, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.0004615384615384616, |
|
"loss": 0.196, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6060606060606061, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.0004592760180995475, |
|
"loss": 0.2008, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.6277056277056277, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.00045701357466063346, |
|
"loss": 0.1983, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.6493506493506493, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.00045475113122171945, |
|
"loss": 0.1964, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6493506493506493, |
|
"eval_loss": 0.12437203526496887, |
|
"eval_runtime": 19.6174, |
|
"eval_samples_per_second": 25.488, |
|
"eval_steps_per_second": 0.816, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.670995670995671, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.00045248868778280545, |
|
"loss": 0.1922, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.6926406926406926, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.00045022624434389144, |
|
"loss": 0.1912, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.0004479638009049774, |
|
"loss": 0.1927, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.7359307359307359, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.0004457013574660634, |
|
"loss": 0.1919, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.7575757575757576, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0004434389140271493, |
|
"loss": 0.1941, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.7792207792207793, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.0004411764705882353, |
|
"loss": 0.1939, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.8008658008658008, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 0.00043891402714932125, |
|
"loss": 0.2017, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.8225108225108225, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 0.00043665158371040724, |
|
"loss": 0.2463, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.8441558441558441, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00043438914027149324, |
|
"loss": 0.2453, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.8658008658008658, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00043212669683257923, |
|
"loss": 0.2457, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8658008658008658, |
|
"eval_loss": 0.15143588185310364, |
|
"eval_runtime": 20.4597, |
|
"eval_samples_per_second": 24.438, |
|
"eval_steps_per_second": 0.782, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8874458874458875, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00042986425339366517, |
|
"loss": 0.228, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 0.0004276018099547511, |
|
"loss": 0.2139, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.9307359307359307, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0004253393665158371, |
|
"loss": 0.2224, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.9523809523809523, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.0004230769230769231, |
|
"loss": 0.2108, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.974025974025974, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00042081447963800904, |
|
"loss": 0.2003, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.9956709956709957, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.00041855203619909503, |
|
"loss": 0.1827, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.0173160173160174, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.000416289592760181, |
|
"loss": 0.1791, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.0389610389610389, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 0.00041402714932126697, |
|
"loss": 0.1836, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.0606060606060606, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.0004117647058823529, |
|
"loss": 0.1804, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.0822510822510822, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.0004095022624434389, |
|
"loss": 0.1977, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0822510822510822, |
|
"eval_loss": 0.1261492371559143, |
|
"eval_runtime": 20.0605, |
|
"eval_samples_per_second": 24.925, |
|
"eval_steps_per_second": 0.798, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.103896103896104, |
|
"grad_norm": 32.5, |
|
"learning_rate": 0.0004072398190045249, |
|
"loss": 0.7592, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.1255411255411256, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.0004049773755656109, |
|
"loss": 0.614, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.1471861471861473, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 0.0004027149321266968, |
|
"loss": 0.2444, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.1688311688311688, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.0004004524886877828, |
|
"loss": 0.1821, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.1904761904761905, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.00039819004524886876, |
|
"loss": 0.1811, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.2121212121212122, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.00039592760180995475, |
|
"loss": 0.1717, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.2337662337662338, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00039366515837104075, |
|
"loss": 0.1725, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.2554112554112553, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0003914027149321267, |
|
"loss": 0.1713, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.277056277056277, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.0003891402714932127, |
|
"loss": 0.1655, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.2987012987012987, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.0003868778280542987, |
|
"loss": 0.1835, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.2987012987012987, |
|
"eval_loss": 0.12451652437448502, |
|
"eval_runtime": 20.7534, |
|
"eval_samples_per_second": 24.092, |
|
"eval_steps_per_second": 0.771, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.3203463203463204, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.00038461538461538467, |
|
"loss": 0.1626, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.341991341991342, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 0.00038235294117647055, |
|
"loss": 0.1713, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.3636363636363638, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.00038009049773755655, |
|
"loss": 0.161, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.3852813852813852, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.00037782805429864254, |
|
"loss": 0.1823, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.406926406926407, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.00037556561085972854, |
|
"loss": 0.1822, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.0003733031674208145, |
|
"loss": 0.1675, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.4502164502164503, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.00037104072398190047, |
|
"loss": 0.1736, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.4718614718614718, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.00036877828054298646, |
|
"loss": 0.1614, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.4935064935064934, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.0003665158371040724, |
|
"loss": 0.1795, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.5151515151515151, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00036425339366515834, |
|
"loss": 0.183, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.5151515151515151, |
|
"eval_loss": 0.10722990334033966, |
|
"eval_runtime": 19.3953, |
|
"eval_samples_per_second": 25.779, |
|
"eval_steps_per_second": 0.825, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.5367965367965368, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.00036199095022624434, |
|
"loss": 0.1647, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.5584415584415585, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.00035972850678733033, |
|
"loss": 0.1649, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.5800865800865802, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0003574660633484163, |
|
"loss": 0.1676, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.601731601731602, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00035520361990950226, |
|
"loss": 0.1625, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.6233766233766234, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.00035294117647058826, |
|
"loss": 0.1714, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.645021645021645, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0003506787330316742, |
|
"loss": 0.1612, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.0003484162895927602, |
|
"loss": 0.1623, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.6883116883116882, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.00034615384615384613, |
|
"loss": 0.1508, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.70995670995671, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.0003438914027149321, |
|
"loss": 0.1632, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.7316017316017316, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0003416289592760181, |
|
"loss": 0.1682, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.7316017316017316, |
|
"eval_loss": 0.10919010639190674, |
|
"eval_runtime": 27.8108, |
|
"eval_samples_per_second": 17.979, |
|
"eval_steps_per_second": 0.575, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.7532467532467533, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0003393665158371041, |
|
"loss": 0.1514, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.774891774891775, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.00033710407239819005, |
|
"loss": 0.1536, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.7965367965367967, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 0.000334841628959276, |
|
"loss": 0.1723, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.8181818181818183, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.000332579185520362, |
|
"loss": 0.168, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.8398268398268398, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.000330316742081448, |
|
"loss": 0.1515, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.8614718614718615, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0003280542986425339, |
|
"loss": 0.1694, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.883116883116883, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.0003257918552036199, |
|
"loss": 0.1679, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.9047619047619047, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.0003235294117647059, |
|
"loss": 0.1632, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.9264069264069263, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.0003212669683257919, |
|
"loss": 0.1619, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.948051948051948, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.0003190045248868778, |
|
"loss": 0.1632, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.948051948051948, |
|
"eval_loss": 0.11044134944677353, |
|
"eval_runtime": 18.793, |
|
"eval_samples_per_second": 26.606, |
|
"eval_steps_per_second": 0.851, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.9696969696969697, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0003167420814479638, |
|
"loss": 0.1625, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.9913419913419914, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.0003144796380090498, |
|
"loss": 0.1616, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.012987012987013, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00031221719457013577, |
|
"loss": 0.1433, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.034632034632035, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0003099547511312217, |
|
"loss": 0.1483, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.0562770562770565, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0003076923076923077, |
|
"loss": 0.1398, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.0779220779220777, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.0003054298642533937, |
|
"loss": 0.1488, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.0995670995670994, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00030316742081447964, |
|
"loss": 0.1468, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.121212121212121, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.0003009049773755656, |
|
"loss": 0.1503, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.142857142857143, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 0.00029864253393665157, |
|
"loss": 0.1326, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.1645021645021645, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00029638009049773756, |
|
"loss": 0.1335, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.1645021645021645, |
|
"eval_loss": 0.11199549585580826, |
|
"eval_runtime": 20.4528, |
|
"eval_samples_per_second": 24.447, |
|
"eval_steps_per_second": 0.782, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.186147186147186, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.00029411764705882356, |
|
"loss": 0.1375, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.207792207792208, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00029185520361990955, |
|
"loss": 0.1334, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.2294372294372296, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002895927601809955, |
|
"loss": 0.1332, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.2510822510822512, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.00028733031674208143, |
|
"loss": 0.1327, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.2727272727272725, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0002850678733031674, |
|
"loss": 0.141, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.2943722943722946, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.0002828054298642534, |
|
"loss": 0.1432, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.316017316017316, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00028054298642533936, |
|
"loss": 0.1447, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.3376623376623376, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00027828054298642535, |
|
"loss": 0.1376, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.3593073593073592, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00027601809954751135, |
|
"loss": 0.1287, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.380952380952381, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00027375565610859734, |
|
"loss": 0.1399, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.380952380952381, |
|
"eval_loss": 0.09553591907024384, |
|
"eval_runtime": 20.0139, |
|
"eval_samples_per_second": 24.983, |
|
"eval_steps_per_second": 0.799, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.4025974025974026, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0002714932126696832, |
|
"loss": 0.1417, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.4242424242424243, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002692307692307692, |
|
"loss": 0.1428, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.445887445887446, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0002669683257918552, |
|
"loss": 0.1367, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.4675324675324677, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0002647058823529412, |
|
"loss": 0.1383, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.4891774891774894, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00026244343891402715, |
|
"loss": 0.1438, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.5108225108225106, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00026018099547511314, |
|
"loss": 0.1394, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.5324675324675323, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00025791855203619913, |
|
"loss": 0.1358, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.554112554112554, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0002556561085972851, |
|
"loss": 0.1266, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.5757575757575757, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.000253393665158371, |
|
"loss": 0.1353, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.5974025974025974, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.000251131221719457, |
|
"loss": 0.1483, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.5974025974025974, |
|
"eval_loss": 0.10388709604740143, |
|
"eval_runtime": 19.843, |
|
"eval_samples_per_second": 25.198, |
|
"eval_steps_per_second": 0.806, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.619047619047619, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 0.000248868778280543, |
|
"loss": 0.1364, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.6406926406926408, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 0.000246606334841629, |
|
"loss": 0.1285, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.6623376623376624, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.00024434389140271494, |
|
"loss": 0.1264, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.683982683982684, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002420814479638009, |
|
"loss": 0.1331, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.7056277056277054, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.0002398190045248869, |
|
"loss": 0.15, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.7272727272727275, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00023755656108597284, |
|
"loss": 0.1367, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.7489177489177488, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00023529411764705883, |
|
"loss": 0.13, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.7705627705627704, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.0002330316742081448, |
|
"loss": 0.1324, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.792207792207792, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.0002307692307692308, |
|
"loss": 0.1336, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.813852813852814, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.00022850678733031673, |
|
"loss": 0.1314, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.813852813852814, |
|
"eval_loss": 0.10149528831243515, |
|
"eval_runtime": 19.1293, |
|
"eval_samples_per_second": 26.138, |
|
"eval_steps_per_second": 0.836, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.8354978354978355, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00022624434389140272, |
|
"loss": 0.1309, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0002239819004524887, |
|
"loss": 0.1353, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.878787878787879, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.00022171945701357466, |
|
"loss": 0.1446, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.9004329004329006, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00021945701357466062, |
|
"loss": 0.1333, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.9220779220779223, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 0.00021719457013574662, |
|
"loss": 0.1383, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.9437229437229435, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00021493212669683259, |
|
"loss": 0.1496, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.965367965367965, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00021266968325791855, |
|
"loss": 0.1399, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.987012987012987, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.00021040723981900452, |
|
"loss": 0.139, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 3.0086580086580086, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 0.0002081447963800905, |
|
"loss": 0.127, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 3.0303030303030303, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00020588235294117645, |
|
"loss": 0.114, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.0303030303030303, |
|
"eval_loss": 0.09961362928152084, |
|
"eval_runtime": 20.1151, |
|
"eval_samples_per_second": 24.857, |
|
"eval_steps_per_second": 0.795, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.051948051948052, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00020361990950226245, |
|
"loss": 0.1222, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 3.0735930735930737, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 0.0002013574660633484, |
|
"loss": 0.1174, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 3.0952380952380953, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 0.00019909502262443438, |
|
"loss": 0.121, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 3.116883116883117, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00019683257918552037, |
|
"loss": 0.1179, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 3.1385281385281387, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 0.00019457013574660634, |
|
"loss": 0.1163, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 3.16017316017316, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 0.00019230769230769233, |
|
"loss": 0.1173, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 3.1818181818181817, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00019004524886877827, |
|
"loss": 0.1193, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 3.2034632034632033, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.00018778280542986427, |
|
"loss": 0.1208, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 3.225108225108225, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00018552036199095024, |
|
"loss": 0.1229, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 3.2467532467532467, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0001832579185520362, |
|
"loss": 0.1202, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.2467532467532467, |
|
"eval_loss": 0.10085491091012955, |
|
"eval_runtime": 21.5392, |
|
"eval_samples_per_second": 23.214, |
|
"eval_steps_per_second": 0.743, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.2683982683982684, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 0.00018099547511312217, |
|
"loss": 0.119, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 3.29004329004329, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 0.00017873303167420816, |
|
"loss": 0.124, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 3.311688311688312, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00017647058823529413, |
|
"loss": 0.1214, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0001742081447963801, |
|
"loss": 0.1236, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 3.354978354978355, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.00017194570135746606, |
|
"loss": 0.1175, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 3.3766233766233764, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 0.00016968325791855206, |
|
"loss": 0.1163, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 3.398268398268398, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.000167420814479638, |
|
"loss": 0.1114, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 3.41991341991342, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 0.000165158371040724, |
|
"loss": 0.1164, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 3.4415584415584415, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 0.00016289592760180996, |
|
"loss": 0.1128, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 3.463203463203463, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00016063348416289595, |
|
"loss": 0.1163, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.463203463203463, |
|
"eval_loss": 0.10191706568002701, |
|
"eval_runtime": 23.6847, |
|
"eval_samples_per_second": 21.111, |
|
"eval_steps_per_second": 0.676, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.484848484848485, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0001583710407239819, |
|
"loss": 0.1171, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 3.5064935064935066, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00015610859728506788, |
|
"loss": 0.1159, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 3.5281385281385282, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.00015384615384615385, |
|
"loss": 0.12, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 3.54978354978355, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 0.00015158371040723982, |
|
"loss": 0.1048, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 3.571428571428571, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 0.00014932126696832579, |
|
"loss": 0.1061, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 3.5930735930735933, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.00014705882352941178, |
|
"loss": 0.1283, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 3.6147186147186146, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 0.00014479638009049775, |
|
"loss": 0.1194, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 3.6363636363636362, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 0.0001425339366515837, |
|
"loss": 0.1132, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 3.658008658008658, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00014027149321266968, |
|
"loss": 0.1226, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 3.6796536796536796, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00013800904977375567, |
|
"loss": 0.1245, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.6796536796536796, |
|
"eval_loss": 0.09824839979410172, |
|
"eval_runtime": 19.2671, |
|
"eval_samples_per_second": 25.951, |
|
"eval_steps_per_second": 0.83, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.7012987012987013, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0001357466063348416, |
|
"loss": 0.1219, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 3.722943722943723, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0001334841628959276, |
|
"loss": 0.1184, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 3.7445887445887447, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.00013122171945701357, |
|
"loss": 0.1157, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 3.7662337662337664, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00012895927601809957, |
|
"loss": 0.1075, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 3.787878787878788, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0001266968325791855, |
|
"loss": 0.1102, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 3.8095238095238093, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0001244343891402715, |
|
"loss": 0.1184, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 3.8311688311688314, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00012217194570135747, |
|
"loss": 0.1081, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 3.8528138528138527, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.00011990950226244345, |
|
"loss": 0.1222, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 3.8744588744588744, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00011764705882352942, |
|
"loss": 0.1224, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 3.896103896103896, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 0.0001153846153846154, |
|
"loss": 0.1032, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.896103896103896, |
|
"eval_loss": 0.09573096036911011, |
|
"eval_runtime": 23.7902, |
|
"eval_samples_per_second": 21.017, |
|
"eval_steps_per_second": 0.673, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.9177489177489178, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00011312217194570136, |
|
"loss": 0.1182, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 3.9393939393939394, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.00011085972850678733, |
|
"loss": 0.1191, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 3.961038961038961, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00010859728506787331, |
|
"loss": 0.1125, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 3.982683982683983, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 0.00010633484162895928, |
|
"loss": 0.1188, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 4.004329004329004, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00010407239819004526, |
|
"loss": 0.1203, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 4.025974025974026, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 0.00010180995475113122, |
|
"loss": 0.1077, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 4.0476190476190474, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 9.954751131221719e-05, |
|
"loss": 0.1065, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 4.06926406926407, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 9.728506787330317e-05, |
|
"loss": 0.1112, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 4.090909090909091, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 9.502262443438914e-05, |
|
"loss": 0.1006, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 4.112554112554113, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 9.276018099547512e-05, |
|
"loss": 0.1043, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.112554112554113, |
|
"eval_loss": 0.09408007562160492, |
|
"eval_runtime": 19.9413, |
|
"eval_samples_per_second": 25.074, |
|
"eval_steps_per_second": 0.802, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.134199134199134, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 9.049773755656108e-05, |
|
"loss": 0.1013, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 4.1558441558441555, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 8.823529411764706e-05, |
|
"loss": 0.1058, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 4.177489177489178, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 8.597285067873303e-05, |
|
"loss": 0.1069, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 4.199134199134199, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 8.3710407239819e-05, |
|
"loss": 0.1089, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 4.220779220779221, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 8.144796380090498e-05, |
|
"loss": 0.1026, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 4.242424242424242, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 7.918552036199095e-05, |
|
"loss": 0.1043, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 4.264069264069264, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 7.692307692307693e-05, |
|
"loss": 0.1136, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 4.285714285714286, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 7.466063348416289e-05, |
|
"loss": 0.1045, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 4.307359307359308, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 7.239819004524887e-05, |
|
"loss": 0.1032, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 4.329004329004329, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 7.013574660633484e-05, |
|
"loss": 0.0977, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.329004329004329, |
|
"eval_loss": 0.09685727208852768, |
|
"eval_runtime": 20.4475, |
|
"eval_samples_per_second": 24.453, |
|
"eval_steps_per_second": 0.782, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.35064935064935, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 6.78733031674208e-05, |
|
"loss": 0.1019, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 4.372294372294372, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 6.561085972850679e-05, |
|
"loss": 0.1076, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 4.393939393939394, |
|
"grad_norm": 0.1650390625, |
|
"learning_rate": 6.334841628959275e-05, |
|
"loss": 0.0983, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 4.415584415584416, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 6.108597285067873e-05, |
|
"loss": 0.1023, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 4.437229437229437, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 5.882352941176471e-05, |
|
"loss": 0.0982, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 4.458874458874459, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 5.656108597285068e-05, |
|
"loss": 0.1071, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 4.48051948051948, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 5.4298642533936655e-05, |
|
"loss": 0.1095, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 4.5021645021645025, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 5.203619909502263e-05, |
|
"loss": 0.1105, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 4.523809523809524, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 4.9773755656108595e-05, |
|
"loss": 0.0961, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 4.545454545454545, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 4.751131221719457e-05, |
|
"loss": 0.1016, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.545454545454545, |
|
"eval_loss": 0.09792981296777725, |
|
"eval_runtime": 20.8232, |
|
"eval_samples_per_second": 24.012, |
|
"eval_steps_per_second": 0.768, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.567099567099567, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 4.524886877828054e-05, |
|
"loss": 0.0953, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 4.588744588744589, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 4.2986425339366516e-05, |
|
"loss": 0.105, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 4.6103896103896105, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 4.072398190045249e-05, |
|
"loss": 0.1063, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 4.632034632034632, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 3.846153846153846e-05, |
|
"loss": 0.0982, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 4.653679653679654, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 3.6199095022624436e-05, |
|
"loss": 0.1068, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 4.675324675324675, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 3.39366515837104e-05, |
|
"loss": 0.1111, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 4.696969696969697, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 3.167420814479638e-05, |
|
"loss": 0.1084, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 4.7186147186147185, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 2.9411764705882354e-05, |
|
"loss": 0.103, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 4.740259740259741, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 2.7149321266968327e-05, |
|
"loss": 0.1089, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 4.761904761904762, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 2.4886877828054298e-05, |
|
"loss": 0.097, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.761904761904762, |
|
"eval_loss": 0.09769493341445923, |
|
"eval_runtime": 19.6594, |
|
"eval_samples_per_second": 25.433, |
|
"eval_steps_per_second": 0.814, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.783549783549784, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 2.262443438914027e-05, |
|
"loss": 0.1083, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 4.805194805194805, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 2.0361990950226245e-05, |
|
"loss": 0.1046, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 4.8268398268398265, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 1.8099547511312218e-05, |
|
"loss": 0.1139, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 4.848484848484849, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 1.583710407239819e-05, |
|
"loss": 0.1038, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 4.87012987012987, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 1.3574660633484164e-05, |
|
"loss": 0.1089, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 4.891774891774892, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 1.1312217194570136e-05, |
|
"loss": 0.0922, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 4.913419913419913, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 9.049773755656109e-06, |
|
"loss": 0.1041, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 4.935064935064935, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 6.787330316742082e-06, |
|
"loss": 0.0933, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 4.956709956709957, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 4.5248868778280546e-06, |
|
"loss": 0.1089, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 4.978354978354979, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 2.2624434389140273e-06, |
|
"loss": 0.1094, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.978354978354979, |
|
"eval_loss": 0.09713861346244812, |
|
"eval_runtime": 20.5975, |
|
"eval_samples_per_second": 24.275, |
|
"eval_steps_per_second": 0.777, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0, |
|
"loss": 0.1107, |
|
"step": 2310 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2310, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.556491100242671e+18, |
|
"train_batch_size": 256, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|