{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 100, "global_step": 2310, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.021645021645021644, "grad_norm": 2.59375, "learning_rate": 5e-05, "loss": 0.9442, "step": 10 }, { "epoch": 0.04329004329004329, "grad_norm": 1.109375, "learning_rate": 0.0001, "loss": 0.6729, "step": 20 }, { "epoch": 0.06493506493506493, "grad_norm": 0.765625, "learning_rate": 0.00015, "loss": 0.5235, "step": 30 }, { "epoch": 0.08658008658008658, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.3883, "step": 40 }, { "epoch": 0.10822510822510822, "grad_norm": 0.8203125, "learning_rate": 0.00025, "loss": 0.3269, "step": 50 }, { "epoch": 0.12987012987012986, "grad_norm": 0.5, "learning_rate": 0.0003, "loss": 0.2779, "step": 60 }, { "epoch": 0.15151515151515152, "grad_norm": 0.54296875, "learning_rate": 0.00035, "loss": 0.2607, "step": 70 }, { "epoch": 0.17316017316017315, "grad_norm": 0.78515625, "learning_rate": 0.0004, "loss": 0.246, "step": 80 }, { "epoch": 0.19480519480519481, "grad_norm": 0.474609375, "learning_rate": 0.00045000000000000004, "loss": 0.2469, "step": 90 }, { "epoch": 0.21645021645021645, "grad_norm": 0.55078125, "learning_rate": 0.0005, "loss": 0.2327, "step": 100 }, { "epoch": 0.21645021645021645, "eval_loss": 0.1517176628112793, "eval_runtime": 23.1527, "eval_samples_per_second": 21.596, "eval_steps_per_second": 0.691, "step": 100 }, { "epoch": 0.23809523809523808, "grad_norm": 0.51953125, "learning_rate": 0.000497737556561086, "loss": 0.231, "step": 110 }, { "epoch": 0.2597402597402597, "grad_norm": 0.46875, "learning_rate": 0.000495475113122172, "loss": 0.214, "step": 120 }, { "epoch": 0.2813852813852814, "grad_norm": 0.4453125, "learning_rate": 0.000493212669683258, "loss": 0.2164, "step": 130 }, { "epoch": 0.30303030303030304, "grad_norm": 0.470703125, "learning_rate": 0.0004909502262443439, "loss": 0.2327, "step": 140 }, { "epoch": 0.3246753246753247, "grad_norm": 0.392578125, "learning_rate": 0.0004886877828054299, "loss": 0.2104, "step": 150 }, { "epoch": 0.3463203463203463, "grad_norm": 0.4765625, "learning_rate": 0.00048642533936651587, "loss": 0.2249, "step": 160 }, { "epoch": 0.36796536796536794, "grad_norm": 0.3125, "learning_rate": 0.0004841628959276018, "loss": 0.2179, "step": 170 }, { "epoch": 0.38961038961038963, "grad_norm": 0.453125, "learning_rate": 0.0004819004524886878, "loss": 0.2174, "step": 180 }, { "epoch": 0.41125541125541126, "grad_norm": 0.5390625, "learning_rate": 0.0004796380090497738, "loss": 0.2159, "step": 190 }, { "epoch": 0.4329004329004329, "grad_norm": 0.392578125, "learning_rate": 0.0004773755656108598, "loss": 0.2166, "step": 200 }, { "epoch": 0.4329004329004329, "eval_loss": 0.13433273136615753, "eval_runtime": 18.3553, "eval_samples_per_second": 27.24, "eval_steps_per_second": 0.872, "step": 200 }, { "epoch": 0.45454545454545453, "grad_norm": 0.44921875, "learning_rate": 0.00047511312217194567, "loss": 0.2232, "step": 210 }, { "epoch": 0.47619047619047616, "grad_norm": 0.447265625, "learning_rate": 0.00047285067873303167, "loss": 0.2034, "step": 220 }, { "epoch": 0.49783549783549785, "grad_norm": 0.36328125, "learning_rate": 0.00047058823529411766, "loss": 0.2222, "step": 230 }, { "epoch": 0.5194805194805194, "grad_norm": 0.68359375, "learning_rate": 0.00046832579185520365, "loss": 0.2043, "step": 240 }, { "epoch": 0.5411255411255411, "grad_norm": 0.431640625, "learning_rate": 
0.0004660633484162896, "loss": 0.2054, "step": 250 }, { "epoch": 0.5627705627705628, "grad_norm": 0.38671875, "learning_rate": 0.0004638009049773756, "loss": 0.1922, "step": 260 }, { "epoch": 0.5844155844155844, "grad_norm": 0.349609375, "learning_rate": 0.0004615384615384616, "loss": 0.196, "step": 270 }, { "epoch": 0.6060606060606061, "grad_norm": 0.37109375, "learning_rate": 0.0004592760180995475, "loss": 0.2008, "step": 280 }, { "epoch": 0.6277056277056277, "grad_norm": 0.431640625, "learning_rate": 0.00045701357466063346, "loss": 0.1983, "step": 290 }, { "epoch": 0.6493506493506493, "grad_norm": 0.4609375, "learning_rate": 0.00045475113122171945, "loss": 0.1964, "step": 300 }, { "epoch": 0.6493506493506493, "eval_loss": 0.12437203526496887, "eval_runtime": 19.6174, "eval_samples_per_second": 25.488, "eval_steps_per_second": 0.816, "step": 300 }, { "epoch": 0.670995670995671, "grad_norm": 0.369140625, "learning_rate": 0.00045248868778280545, "loss": 0.1922, "step": 310 }, { "epoch": 0.6926406926406926, "grad_norm": 0.3671875, "learning_rate": 0.00045022624434389144, "loss": 0.1912, "step": 320 }, { "epoch": 0.7142857142857143, "grad_norm": 0.435546875, "learning_rate": 0.0004479638009049774, "loss": 0.1927, "step": 330 }, { "epoch": 0.7359307359307359, "grad_norm": 0.375, "learning_rate": 0.0004457013574660634, "loss": 0.1919, "step": 340 }, { "epoch": 0.7575757575757576, "grad_norm": 0.365234375, "learning_rate": 0.0004434389140271493, "loss": 0.1941, "step": 350 }, { "epoch": 0.7792207792207793, "grad_norm": 0.50390625, "learning_rate": 0.0004411764705882353, "loss": 0.1939, "step": 360 }, { "epoch": 0.8008658008658008, "grad_norm": 0.9765625, "learning_rate": 0.00043891402714932125, "loss": 0.2017, "step": 370 }, { "epoch": 0.8225108225108225, "grad_norm": 0.240234375, "learning_rate": 0.00043665158371040724, "loss": 0.2463, "step": 380 }, { "epoch": 0.8441558441558441, "grad_norm": 0.322265625, "learning_rate": 0.00043438914027149324, "loss": 0.2453, "step": 390 }, { "epoch": 0.8658008658008658, "grad_norm": 0.26953125, "learning_rate": 0.00043212669683257923, "loss": 0.2457, "step": 400 }, { "epoch": 0.8658008658008658, "eval_loss": 0.15143588185310364, "eval_runtime": 20.4597, "eval_samples_per_second": 24.438, "eval_steps_per_second": 0.782, "step": 400 }, { "epoch": 0.8874458874458875, "grad_norm": 0.28515625, "learning_rate": 0.00042986425339366517, "loss": 0.228, "step": 410 }, { "epoch": 0.9090909090909091, "grad_norm": 0.2177734375, "learning_rate": 0.0004276018099547511, "loss": 0.2139, "step": 420 }, { "epoch": 0.9307359307359307, "grad_norm": 0.271484375, "learning_rate": 0.0004253393665158371, "loss": 0.2224, "step": 430 }, { "epoch": 0.9523809523809523, "grad_norm": 0.375, "learning_rate": 0.0004230769230769231, "loss": 0.2108, "step": 440 }, { "epoch": 0.974025974025974, "grad_norm": 0.322265625, "learning_rate": 0.00042081447963800904, "loss": 0.2003, "step": 450 }, { "epoch": 0.9956709956709957, "grad_norm": 0.47265625, "learning_rate": 0.00041855203619909503, "loss": 0.1827, "step": 460 }, { "epoch": 1.0173160173160174, "grad_norm": 0.53515625, "learning_rate": 0.000416289592760181, "loss": 0.1791, "step": 470 }, { "epoch": 1.0389610389610389, "grad_norm": 1.84375, "learning_rate": 0.00041402714932126697, "loss": 0.1836, "step": 480 }, { "epoch": 1.0606060606060606, "grad_norm": 0.36328125, "learning_rate": 0.0004117647058823529, "loss": 0.1804, "step": 490 }, { "epoch": 1.0822510822510822, "grad_norm": 0.375, "learning_rate": 0.0004095022624434389, "loss": 0.1977, 
"step": 500 }, { "epoch": 1.0822510822510822, "eval_loss": 0.1261492371559143, "eval_runtime": 20.0605, "eval_samples_per_second": 24.925, "eval_steps_per_second": 0.798, "step": 500 }, { "epoch": 1.103896103896104, "grad_norm": 32.5, "learning_rate": 0.0004072398190045249, "loss": 0.7592, "step": 510 }, { "epoch": 1.1255411255411256, "grad_norm": 1.0703125, "learning_rate": 0.0004049773755656109, "loss": 0.614, "step": 520 }, { "epoch": 1.1471861471861473, "grad_norm": 3.71875, "learning_rate": 0.0004027149321266968, "loss": 0.2444, "step": 530 }, { "epoch": 1.1688311688311688, "grad_norm": 1.2421875, "learning_rate": 0.0004004524886877828, "loss": 0.1821, "step": 540 }, { "epoch": 1.1904761904761905, "grad_norm": 0.5546875, "learning_rate": 0.00039819004524886876, "loss": 0.1811, "step": 550 }, { "epoch": 1.2121212121212122, "grad_norm": 0.373046875, "learning_rate": 0.00039592760180995475, "loss": 0.1717, "step": 560 }, { "epoch": 1.2337662337662338, "grad_norm": 0.58984375, "learning_rate": 0.00039366515837104075, "loss": 0.1725, "step": 570 }, { "epoch": 1.2554112554112553, "grad_norm": 0.298828125, "learning_rate": 0.0003914027149321267, "loss": 0.1713, "step": 580 }, { "epoch": 1.277056277056277, "grad_norm": 0.58203125, "learning_rate": 0.0003891402714932127, "loss": 0.1655, "step": 590 }, { "epoch": 1.2987012987012987, "grad_norm": 0.8046875, "learning_rate": 0.0003868778280542987, "loss": 0.1835, "step": 600 }, { "epoch": 1.2987012987012987, "eval_loss": 0.12451652437448502, "eval_runtime": 20.7534, "eval_samples_per_second": 24.092, "eval_steps_per_second": 0.771, "step": 600 }, { "epoch": 1.3203463203463204, "grad_norm": 0.421875, "learning_rate": 0.00038461538461538467, "loss": 0.1626, "step": 610 }, { "epoch": 1.341991341991342, "grad_norm": 0.439453125, "learning_rate": 0.00038235294117647055, "loss": 0.1713, "step": 620 }, { "epoch": 1.3636363636363638, "grad_norm": 0.380859375, "learning_rate": 0.00038009049773755655, "loss": 0.161, "step": 630 }, { "epoch": 1.3852813852813852, "grad_norm": 0.478515625, "learning_rate": 0.00037782805429864254, "loss": 0.1823, "step": 640 }, { "epoch": 1.406926406926407, "grad_norm": 0.404296875, "learning_rate": 0.00037556561085972854, "loss": 0.1822, "step": 650 }, { "epoch": 1.4285714285714286, "grad_norm": 0.4296875, "learning_rate": 0.0003733031674208145, "loss": 0.1675, "step": 660 }, { "epoch": 1.4502164502164503, "grad_norm": 0.376953125, "learning_rate": 0.00037104072398190047, "loss": 0.1736, "step": 670 }, { "epoch": 1.4718614718614718, "grad_norm": 0.412109375, "learning_rate": 0.00036877828054298646, "loss": 0.1614, "step": 680 }, { "epoch": 1.4935064935064934, "grad_norm": 0.373046875, "learning_rate": 0.0003665158371040724, "loss": 0.1795, "step": 690 }, { "epoch": 1.5151515151515151, "grad_norm": 0.345703125, "learning_rate": 0.00036425339366515834, "loss": 0.183, "step": 700 }, { "epoch": 1.5151515151515151, "eval_loss": 0.10722990334033966, "eval_runtime": 19.3953, "eval_samples_per_second": 25.779, "eval_steps_per_second": 0.825, "step": 700 }, { "epoch": 1.5367965367965368, "grad_norm": 0.380859375, "learning_rate": 0.00036199095022624434, "loss": 0.1647, "step": 710 }, { "epoch": 1.5584415584415585, "grad_norm": 0.369140625, "learning_rate": 0.00035972850678733033, "loss": 0.1649, "step": 720 }, { "epoch": 1.5800865800865802, "grad_norm": 0.283203125, "learning_rate": 0.0003574660633484163, "loss": 0.1676, "step": 730 }, { "epoch": 1.601731601731602, "grad_norm": 0.41796875, "learning_rate": 0.00035520361990950226, 
"loss": 0.1625, "step": 740 }, { "epoch": 1.6233766233766234, "grad_norm": 0.365234375, "learning_rate": 0.00035294117647058826, "loss": 0.1714, "step": 750 }, { "epoch": 1.645021645021645, "grad_norm": 0.31640625, "learning_rate": 0.0003506787330316742, "loss": 0.1612, "step": 760 }, { "epoch": 1.6666666666666665, "grad_norm": 0.4921875, "learning_rate": 0.0003484162895927602, "loss": 0.1623, "step": 770 }, { "epoch": 1.6883116883116882, "grad_norm": 0.380859375, "learning_rate": 0.00034615384615384613, "loss": 0.1508, "step": 780 }, { "epoch": 1.70995670995671, "grad_norm": 0.34375, "learning_rate": 0.0003438914027149321, "loss": 0.1632, "step": 790 }, { "epoch": 1.7316017316017316, "grad_norm": 0.34765625, "learning_rate": 0.0003416289592760181, "loss": 0.1682, "step": 800 }, { "epoch": 1.7316017316017316, "eval_loss": 0.10919010639190674, "eval_runtime": 27.8108, "eval_samples_per_second": 17.979, "eval_steps_per_second": 0.575, "step": 800 }, { "epoch": 1.7532467532467533, "grad_norm": 0.310546875, "learning_rate": 0.0003393665158371041, "loss": 0.1514, "step": 810 }, { "epoch": 1.774891774891775, "grad_norm": 0.359375, "learning_rate": 0.00033710407239819005, "loss": 0.1536, "step": 820 }, { "epoch": 1.7965367965367967, "grad_norm": 0.248046875, "learning_rate": 0.000334841628959276, "loss": 0.1723, "step": 830 }, { "epoch": 1.8181818181818183, "grad_norm": 0.484375, "learning_rate": 0.000332579185520362, "loss": 0.168, "step": 840 }, { "epoch": 1.8398268398268398, "grad_norm": 0.35546875, "learning_rate": 0.000330316742081448, "loss": 0.1515, "step": 850 }, { "epoch": 1.8614718614718615, "grad_norm": 0.298828125, "learning_rate": 0.0003280542986425339, "loss": 0.1694, "step": 860 }, { "epoch": 1.883116883116883, "grad_norm": 0.3671875, "learning_rate": 0.0003257918552036199, "loss": 0.1679, "step": 870 }, { "epoch": 1.9047619047619047, "grad_norm": 0.357421875, "learning_rate": 0.0003235294117647059, "loss": 0.1632, "step": 880 }, { "epoch": 1.9264069264069263, "grad_norm": 0.3828125, "learning_rate": 0.0003212669683257919, "loss": 0.1619, "step": 890 }, { "epoch": 1.948051948051948, "grad_norm": 0.380859375, "learning_rate": 0.0003190045248868778, "loss": 0.1632, "step": 900 }, { "epoch": 1.948051948051948, "eval_loss": 0.11044134944677353, "eval_runtime": 18.793, "eval_samples_per_second": 26.606, "eval_steps_per_second": 0.851, "step": 900 }, { "epoch": 1.9696969696969697, "grad_norm": 0.306640625, "learning_rate": 0.0003167420814479638, "loss": 0.1625, "step": 910 }, { "epoch": 1.9913419913419914, "grad_norm": 0.384765625, "learning_rate": 0.0003144796380090498, "loss": 0.1616, "step": 920 }, { "epoch": 2.012987012987013, "grad_norm": 0.318359375, "learning_rate": 0.00031221719457013577, "loss": 0.1433, "step": 930 }, { "epoch": 2.034632034632035, "grad_norm": 0.310546875, "learning_rate": 0.0003099547511312217, "loss": 0.1483, "step": 940 }, { "epoch": 2.0562770562770565, "grad_norm": 0.322265625, "learning_rate": 0.0003076923076923077, "loss": 0.1398, "step": 950 }, { "epoch": 2.0779220779220777, "grad_norm": 0.83203125, "learning_rate": 0.0003054298642533937, "loss": 0.1488, "step": 960 }, { "epoch": 2.0995670995670994, "grad_norm": 0.279296875, "learning_rate": 0.00030316742081447964, "loss": 0.1468, "step": 970 }, { "epoch": 2.121212121212121, "grad_norm": 0.337890625, "learning_rate": 0.0003009049773755656, "loss": 0.1503, "step": 980 }, { "epoch": 2.142857142857143, "grad_norm": 0.2373046875, "learning_rate": 0.00029864253393665157, "loss": 0.1326, "step": 990 }, { 
"epoch": 2.1645021645021645, "grad_norm": 0.341796875, "learning_rate": 0.00029638009049773756, "loss": 0.1335, "step": 1000 }, { "epoch": 2.1645021645021645, "eval_loss": 0.11199549585580826, "eval_runtime": 20.4528, "eval_samples_per_second": 24.447, "eval_steps_per_second": 0.782, "step": 1000 }, { "epoch": 2.186147186147186, "grad_norm": 0.369140625, "learning_rate": 0.00029411764705882356, "loss": 0.1375, "step": 1010 }, { "epoch": 2.207792207792208, "grad_norm": 0.271484375, "learning_rate": 0.00029185520361990955, "loss": 0.1334, "step": 1020 }, { "epoch": 2.2294372294372296, "grad_norm": 0.294921875, "learning_rate": 0.0002895927601809955, "loss": 0.1332, "step": 1030 }, { "epoch": 2.2510822510822512, "grad_norm": 0.392578125, "learning_rate": 0.00028733031674208143, "loss": 0.1327, "step": 1040 }, { "epoch": 2.2727272727272725, "grad_norm": 0.271484375, "learning_rate": 0.0002850678733031674, "loss": 0.141, "step": 1050 }, { "epoch": 2.2943722943722946, "grad_norm": 0.357421875, "learning_rate": 0.0002828054298642534, "loss": 0.1432, "step": 1060 }, { "epoch": 2.316017316017316, "grad_norm": 0.2890625, "learning_rate": 0.00028054298642533936, "loss": 0.1447, "step": 1070 }, { "epoch": 2.3376623376623376, "grad_norm": 0.2890625, "learning_rate": 0.00027828054298642535, "loss": 0.1376, "step": 1080 }, { "epoch": 2.3593073593073592, "grad_norm": 0.29296875, "learning_rate": 0.00027601809954751135, "loss": 0.1287, "step": 1090 }, { "epoch": 2.380952380952381, "grad_norm": 0.333984375, "learning_rate": 0.00027375565610859734, "loss": 0.1399, "step": 1100 }, { "epoch": 2.380952380952381, "eval_loss": 0.09553591907024384, "eval_runtime": 20.0139, "eval_samples_per_second": 24.983, "eval_steps_per_second": 0.799, "step": 1100 }, { "epoch": 2.4025974025974026, "grad_norm": 0.318359375, "learning_rate": 0.0002714932126696832, "loss": 0.1417, "step": 1110 }, { "epoch": 2.4242424242424243, "grad_norm": 0.294921875, "learning_rate": 0.0002692307692307692, "loss": 0.1428, "step": 1120 }, { "epoch": 2.445887445887446, "grad_norm": 0.30078125, "learning_rate": 0.0002669683257918552, "loss": 0.1367, "step": 1130 }, { "epoch": 2.4675324675324677, "grad_norm": 0.271484375, "learning_rate": 0.0002647058823529412, "loss": 0.1383, "step": 1140 }, { "epoch": 2.4891774891774894, "grad_norm": 0.2578125, "learning_rate": 0.00026244343891402715, "loss": 0.1438, "step": 1150 }, { "epoch": 2.5108225108225106, "grad_norm": 0.3515625, "learning_rate": 0.00026018099547511314, "loss": 0.1394, "step": 1160 }, { "epoch": 2.5324675324675323, "grad_norm": 0.291015625, "learning_rate": 0.00025791855203619913, "loss": 0.1358, "step": 1170 }, { "epoch": 2.554112554112554, "grad_norm": 0.326171875, "learning_rate": 0.0002556561085972851, "loss": 0.1266, "step": 1180 }, { "epoch": 2.5757575757575757, "grad_norm": 0.3828125, "learning_rate": 0.000253393665158371, "loss": 0.1353, "step": 1190 }, { "epoch": 2.5974025974025974, "grad_norm": 0.30078125, "learning_rate": 0.000251131221719457, "loss": 0.1483, "step": 1200 }, { "epoch": 2.5974025974025974, "eval_loss": 0.10388709604740143, "eval_runtime": 19.843, "eval_samples_per_second": 25.198, "eval_steps_per_second": 0.806, "step": 1200 }, { "epoch": 2.619047619047619, "grad_norm": 0.2490234375, "learning_rate": 0.000248868778280543, "loss": 0.1364, "step": 1210 }, { "epoch": 2.6406926406926408, "grad_norm": 0.2080078125, "learning_rate": 0.000246606334841629, "loss": 0.1285, "step": 1220 }, { "epoch": 2.6623376623376624, "grad_norm": 0.251953125, "learning_rate": 
0.00024434389140271494, "loss": 0.1264, "step": 1230 }, { "epoch": 2.683982683982684, "grad_norm": 0.287109375, "learning_rate": 0.0002420814479638009, "loss": 0.1331, "step": 1240 }, { "epoch": 2.7056277056277054, "grad_norm": 0.396484375, "learning_rate": 0.0002398190045248869, "loss": 0.15, "step": 1250 }, { "epoch": 2.7272727272727275, "grad_norm": 0.275390625, "learning_rate": 0.00023755656108597284, "loss": 0.1367, "step": 1260 }, { "epoch": 2.7489177489177488, "grad_norm": 0.275390625, "learning_rate": 0.00023529411764705883, "loss": 0.13, "step": 1270 }, { "epoch": 2.7705627705627704, "grad_norm": 0.33203125, "learning_rate": 0.0002330316742081448, "loss": 0.1324, "step": 1280 }, { "epoch": 2.792207792207792, "grad_norm": 0.302734375, "learning_rate": 0.0002307692307692308, "loss": 0.1336, "step": 1290 }, { "epoch": 2.813852813852814, "grad_norm": 0.23828125, "learning_rate": 0.00022850678733031673, "loss": 0.1314, "step": 1300 }, { "epoch": 2.813852813852814, "eval_loss": 0.10149528831243515, "eval_runtime": 19.1293, "eval_samples_per_second": 26.138, "eval_steps_per_second": 0.836, "step": 1300 }, { "epoch": 2.8354978354978355, "grad_norm": 0.306640625, "learning_rate": 0.00022624434389140272, "loss": 0.1309, "step": 1310 }, { "epoch": 2.857142857142857, "grad_norm": 0.2734375, "learning_rate": 0.0002239819004524887, "loss": 0.1353, "step": 1320 }, { "epoch": 2.878787878787879, "grad_norm": 0.462890625, "learning_rate": 0.00022171945701357466, "loss": 0.1446, "step": 1330 }, { "epoch": 2.9004329004329006, "grad_norm": 0.26953125, "learning_rate": 0.00021945701357466062, "loss": 0.1333, "step": 1340 }, { "epoch": 2.9220779220779223, "grad_norm": 0.2080078125, "learning_rate": 0.00021719457013574662, "loss": 0.1383, "step": 1350 }, { "epoch": 2.9437229437229435, "grad_norm": 0.267578125, "learning_rate": 0.00021493212669683259, "loss": 0.1496, "step": 1360 }, { "epoch": 2.965367965367965, "grad_norm": 0.283203125, "learning_rate": 0.00021266968325791855, "loss": 0.1399, "step": 1370 }, { "epoch": 2.987012987012987, "grad_norm": 0.380859375, "learning_rate": 0.00021040723981900452, "loss": 0.139, "step": 1380 }, { "epoch": 3.0086580086580086, "grad_norm": 0.201171875, "learning_rate": 0.0002081447963800905, "loss": 0.127, "step": 1390 }, { "epoch": 3.0303030303030303, "grad_norm": 0.2578125, "learning_rate": 0.00020588235294117645, "loss": 0.114, "step": 1400 }, { "epoch": 3.0303030303030303, "eval_loss": 0.09961362928152084, "eval_runtime": 20.1151, "eval_samples_per_second": 24.857, "eval_steps_per_second": 0.795, "step": 1400 }, { "epoch": 3.051948051948052, "grad_norm": 0.298828125, "learning_rate": 0.00020361990950226245, "loss": 0.1222, "step": 1410 }, { "epoch": 3.0735930735930737, "grad_norm": 0.2490234375, "learning_rate": 0.0002013574660633484, "loss": 0.1174, "step": 1420 }, { "epoch": 3.0952380952380953, "grad_norm": 0.2275390625, "learning_rate": 0.00019909502262443438, "loss": 0.121, "step": 1430 }, { "epoch": 3.116883116883117, "grad_norm": 0.33203125, "learning_rate": 0.00019683257918552037, "loss": 0.1179, "step": 1440 }, { "epoch": 3.1385281385281387, "grad_norm": 0.2373046875, "learning_rate": 0.00019457013574660634, "loss": 0.1163, "step": 1450 }, { "epoch": 3.16017316017316, "grad_norm": 0.232421875, "learning_rate": 0.00019230769230769233, "loss": 0.1173, "step": 1460 }, { "epoch": 3.1818181818181817, "grad_norm": 0.2734375, "learning_rate": 0.00019004524886877827, "loss": 0.1193, "step": 1470 }, { "epoch": 3.2034632034632033, "grad_norm": 0.458984375, 
"learning_rate": 0.00018778280542986427, "loss": 0.1208, "step": 1480 }, { "epoch": 3.225108225108225, "grad_norm": 0.279296875, "learning_rate": 0.00018552036199095024, "loss": 0.1229, "step": 1490 }, { "epoch": 3.2467532467532467, "grad_norm": 0.283203125, "learning_rate": 0.0001832579185520362, "loss": 0.1202, "step": 1500 }, { "epoch": 3.2467532467532467, "eval_loss": 0.10085491091012955, "eval_runtime": 21.5392, "eval_samples_per_second": 23.214, "eval_steps_per_second": 0.743, "step": 1500 }, { "epoch": 3.2683982683982684, "grad_norm": 0.1943359375, "learning_rate": 0.00018099547511312217, "loss": 0.119, "step": 1510 }, { "epoch": 3.29004329004329, "grad_norm": 0.45703125, "learning_rate": 0.00017873303167420816, "loss": 0.124, "step": 1520 }, { "epoch": 3.311688311688312, "grad_norm": 0.296875, "learning_rate": 0.00017647058823529413, "loss": 0.1214, "step": 1530 }, { "epoch": 3.3333333333333335, "grad_norm": 0.326171875, "learning_rate": 0.0001742081447963801, "loss": 0.1236, "step": 1540 }, { "epoch": 3.354978354978355, "grad_norm": 0.369140625, "learning_rate": 0.00017194570135746606, "loss": 0.1175, "step": 1550 }, { "epoch": 3.3766233766233764, "grad_norm": 0.2236328125, "learning_rate": 0.00016968325791855206, "loss": 0.1163, "step": 1560 }, { "epoch": 3.398268398268398, "grad_norm": 0.361328125, "learning_rate": 0.000167420814479638, "loss": 0.1114, "step": 1570 }, { "epoch": 3.41991341991342, "grad_norm": 0.1845703125, "learning_rate": 0.000165158371040724, "loss": 0.1164, "step": 1580 }, { "epoch": 3.4415584415584415, "grad_norm": 0.2060546875, "learning_rate": 0.00016289592760180996, "loss": 0.1128, "step": 1590 }, { "epoch": 3.463203463203463, "grad_norm": 0.3125, "learning_rate": 0.00016063348416289595, "loss": 0.1163, "step": 1600 }, { "epoch": 3.463203463203463, "eval_loss": 0.10191706568002701, "eval_runtime": 23.6847, "eval_samples_per_second": 21.111, "eval_steps_per_second": 0.676, "step": 1600 }, { "epoch": 3.484848484848485, "grad_norm": 0.287109375, "learning_rate": 0.0001583710407239819, "loss": 0.1171, "step": 1610 }, { "epoch": 3.5064935064935066, "grad_norm": 0.310546875, "learning_rate": 0.00015610859728506788, "loss": 0.1159, "step": 1620 }, { "epoch": 3.5281385281385282, "grad_norm": 0.2451171875, "learning_rate": 0.00015384615384615385, "loss": 0.12, "step": 1630 }, { "epoch": 3.54978354978355, "grad_norm": 0.208984375, "learning_rate": 0.00015158371040723982, "loss": 0.1048, "step": 1640 }, { "epoch": 3.571428571428571, "grad_norm": 0.1591796875, "learning_rate": 0.00014932126696832579, "loss": 0.1061, "step": 1650 }, { "epoch": 3.5930735930735933, "grad_norm": 0.416015625, "learning_rate": 0.00014705882352941178, "loss": 0.1283, "step": 1660 }, { "epoch": 3.6147186147186146, "grad_norm": 0.248046875, "learning_rate": 0.00014479638009049775, "loss": 0.1194, "step": 1670 }, { "epoch": 3.6363636363636362, "grad_norm": 0.228515625, "learning_rate": 0.0001425339366515837, "loss": 0.1132, "step": 1680 }, { "epoch": 3.658008658008658, "grad_norm": 0.361328125, "learning_rate": 0.00014027149321266968, "loss": 0.1226, "step": 1690 }, { "epoch": 3.6796536796536796, "grad_norm": 0.32421875, "learning_rate": 0.00013800904977375567, "loss": 0.1245, "step": 1700 }, { "epoch": 3.6796536796536796, "eval_loss": 0.09824839979410172, "eval_runtime": 19.2671, "eval_samples_per_second": 25.951, "eval_steps_per_second": 0.83, "step": 1700 }, { "epoch": 3.7012987012987013, "grad_norm": 0.365234375, "learning_rate": 0.0001357466063348416, "loss": 0.1219, "step": 1710 }, { 
"epoch": 3.722943722943723, "grad_norm": 0.365234375, "learning_rate": 0.0001334841628959276, "loss": 0.1184, "step": 1720 }, { "epoch": 3.7445887445887447, "grad_norm": 0.2451171875, "learning_rate": 0.00013122171945701357, "loss": 0.1157, "step": 1730 }, { "epoch": 3.7662337662337664, "grad_norm": 0.263671875, "learning_rate": 0.00012895927601809957, "loss": 0.1075, "step": 1740 }, { "epoch": 3.787878787878788, "grad_norm": 0.265625, "learning_rate": 0.0001266968325791855, "loss": 0.1102, "step": 1750 }, { "epoch": 3.8095238095238093, "grad_norm": 0.28125, "learning_rate": 0.0001244343891402715, "loss": 0.1184, "step": 1760 }, { "epoch": 3.8311688311688314, "grad_norm": 0.27734375, "learning_rate": 0.00012217194570135747, "loss": 0.1081, "step": 1770 }, { "epoch": 3.8528138528138527, "grad_norm": 0.365234375, "learning_rate": 0.00011990950226244345, "loss": 0.1222, "step": 1780 }, { "epoch": 3.8744588744588744, "grad_norm": 0.328125, "learning_rate": 0.00011764705882352942, "loss": 0.1224, "step": 1790 }, { "epoch": 3.896103896103896, "grad_norm": 0.228515625, "learning_rate": 0.0001153846153846154, "loss": 0.1032, "step": 1800 }, { "epoch": 3.896103896103896, "eval_loss": 0.09573096036911011, "eval_runtime": 23.7902, "eval_samples_per_second": 21.017, "eval_steps_per_second": 0.673, "step": 1800 }, { "epoch": 3.9177489177489178, "grad_norm": 0.33984375, "learning_rate": 0.00011312217194570136, "loss": 0.1182, "step": 1810 }, { "epoch": 3.9393939393939394, "grad_norm": 0.3828125, "learning_rate": 0.00011085972850678733, "loss": 0.1191, "step": 1820 }, { "epoch": 3.961038961038961, "grad_norm": 0.3359375, "learning_rate": 0.00010859728506787331, "loss": 0.1125, "step": 1830 }, { "epoch": 3.982683982683983, "grad_norm": 0.2138671875, "learning_rate": 0.00010633484162895928, "loss": 0.1188, "step": 1840 }, { "epoch": 4.004329004329004, "grad_norm": 0.30078125, "learning_rate": 0.00010407239819004526, "loss": 0.1203, "step": 1850 }, { "epoch": 4.025974025974026, "grad_norm": 0.232421875, "learning_rate": 0.00010180995475113122, "loss": 0.1077, "step": 1860 }, { "epoch": 4.0476190476190474, "grad_norm": 0.2080078125, "learning_rate": 9.954751131221719e-05, "loss": 0.1065, "step": 1870 }, { "epoch": 4.06926406926407, "grad_norm": 0.27734375, "learning_rate": 9.728506787330317e-05, "loss": 0.1112, "step": 1880 }, { "epoch": 4.090909090909091, "grad_norm": 0.2333984375, "learning_rate": 9.502262443438914e-05, "loss": 0.1006, "step": 1890 }, { "epoch": 4.112554112554113, "grad_norm": 0.205078125, "learning_rate": 9.276018099547512e-05, "loss": 0.1043, "step": 1900 }, { "epoch": 4.112554112554113, "eval_loss": 0.09408007562160492, "eval_runtime": 19.9413, "eval_samples_per_second": 25.074, "eval_steps_per_second": 0.802, "step": 1900 }, { "epoch": 4.134199134199134, "grad_norm": 0.265625, "learning_rate": 9.049773755656108e-05, "loss": 0.1013, "step": 1910 }, { "epoch": 4.1558441558441555, "grad_norm": 0.2109375, "learning_rate": 8.823529411764706e-05, "loss": 0.1058, "step": 1920 }, { "epoch": 4.177489177489178, "grad_norm": 0.19140625, "learning_rate": 8.597285067873303e-05, "loss": 0.1069, "step": 1930 }, { "epoch": 4.199134199134199, "grad_norm": 0.267578125, "learning_rate": 8.3710407239819e-05, "loss": 0.1089, "step": 1940 }, { "epoch": 4.220779220779221, "grad_norm": 0.283203125, "learning_rate": 8.144796380090498e-05, "loss": 0.1026, "step": 1950 }, { "epoch": 4.242424242424242, "grad_norm": 0.357421875, "learning_rate": 7.918552036199095e-05, "loss": 0.1043, "step": 1960 }, { "epoch": 
4.264069264069264, "grad_norm": 0.21484375, "learning_rate": 7.692307692307693e-05, "loss": 0.1136, "step": 1970 }, { "epoch": 4.285714285714286, "grad_norm": 0.3203125, "learning_rate": 7.466063348416289e-05, "loss": 0.1045, "step": 1980 }, { "epoch": 4.307359307359308, "grad_norm": 0.244140625, "learning_rate": 7.239819004524887e-05, "loss": 0.1032, "step": 1990 }, { "epoch": 4.329004329004329, "grad_norm": 0.19140625, "learning_rate": 7.013574660633484e-05, "loss": 0.0977, "step": 2000 }, { "epoch": 4.329004329004329, "eval_loss": 0.09685727208852768, "eval_runtime": 20.4475, "eval_samples_per_second": 24.453, "eval_steps_per_second": 0.782, "step": 2000 }, { "epoch": 4.35064935064935, "grad_norm": 0.2333984375, "learning_rate": 6.78733031674208e-05, "loss": 0.1019, "step": 2010 }, { "epoch": 4.372294372294372, "grad_norm": 0.2294921875, "learning_rate": 6.561085972850679e-05, "loss": 0.1076, "step": 2020 }, { "epoch": 4.393939393939394, "grad_norm": 0.1650390625, "learning_rate": 6.334841628959275e-05, "loss": 0.0983, "step": 2030 }, { "epoch": 4.415584415584416, "grad_norm": 0.21484375, "learning_rate": 6.108597285067873e-05, "loss": 0.1023, "step": 2040 }, { "epoch": 4.437229437229437, "grad_norm": 0.26953125, "learning_rate": 5.882352941176471e-05, "loss": 0.0982, "step": 2050 }, { "epoch": 4.458874458874459, "grad_norm": 0.306640625, "learning_rate": 5.656108597285068e-05, "loss": 0.1071, "step": 2060 }, { "epoch": 4.48051948051948, "grad_norm": 0.2265625, "learning_rate": 5.4298642533936655e-05, "loss": 0.1095, "step": 2070 }, { "epoch": 4.5021645021645025, "grad_norm": 0.240234375, "learning_rate": 5.203619909502263e-05, "loss": 0.1105, "step": 2080 }, { "epoch": 4.523809523809524, "grad_norm": 0.2734375, "learning_rate": 4.9773755656108595e-05, "loss": 0.0961, "step": 2090 }, { "epoch": 4.545454545454545, "grad_norm": 0.2412109375, "learning_rate": 4.751131221719457e-05, "loss": 0.1016, "step": 2100 }, { "epoch": 4.545454545454545, "eval_loss": 0.09792981296777725, "eval_runtime": 20.8232, "eval_samples_per_second": 24.012, "eval_steps_per_second": 0.768, "step": 2100 }, { "epoch": 4.567099567099567, "grad_norm": 0.3125, "learning_rate": 4.524886877828054e-05, "loss": 0.0953, "step": 2110 }, { "epoch": 4.588744588744589, "grad_norm": 0.2421875, "learning_rate": 4.2986425339366516e-05, "loss": 0.105, "step": 2120 }, { "epoch": 4.6103896103896105, "grad_norm": 0.255859375, "learning_rate": 4.072398190045249e-05, "loss": 0.1063, "step": 2130 }, { "epoch": 4.632034632034632, "grad_norm": 0.33203125, "learning_rate": 3.846153846153846e-05, "loss": 0.0982, "step": 2140 }, { "epoch": 4.653679653679654, "grad_norm": 0.265625, "learning_rate": 3.6199095022624436e-05, "loss": 0.1068, "step": 2150 }, { "epoch": 4.675324675324675, "grad_norm": 0.2421875, "learning_rate": 3.39366515837104e-05, "loss": 0.1111, "step": 2160 }, { "epoch": 4.696969696969697, "grad_norm": 0.189453125, "learning_rate": 3.167420814479638e-05, "loss": 0.1084, "step": 2170 }, { "epoch": 4.7186147186147185, "grad_norm": 0.2734375, "learning_rate": 2.9411764705882354e-05, "loss": 0.103, "step": 2180 }, { "epoch": 4.740259740259741, "grad_norm": 0.337890625, "learning_rate": 2.7149321266968327e-05, "loss": 0.1089, "step": 2190 }, { "epoch": 4.761904761904762, "grad_norm": 0.3046875, "learning_rate": 2.4886877828054298e-05, "loss": 0.097, "step": 2200 }, { "epoch": 4.761904761904762, "eval_loss": 0.09769493341445923, "eval_runtime": 19.6594, "eval_samples_per_second": 25.433, "eval_steps_per_second": 0.814, "step": 2200 
}, { "epoch": 4.783549783549784, "grad_norm": 0.296875, "learning_rate": 2.262443438914027e-05, "loss": 0.1083, "step": 2210 }, { "epoch": 4.805194805194805, "grad_norm": 0.2158203125, "learning_rate": 2.0361990950226245e-05, "loss": 0.1046, "step": 2220 }, { "epoch": 4.8268398268398265, "grad_norm": 0.26953125, "learning_rate": 1.8099547511312218e-05, "loss": 0.1139, "step": 2230 }, { "epoch": 4.848484848484849, "grad_norm": 0.28515625, "learning_rate": 1.583710407239819e-05, "loss": 0.1038, "step": 2240 }, { "epoch": 4.87012987012987, "grad_norm": 0.298828125, "learning_rate": 1.3574660633484164e-05, "loss": 0.1089, "step": 2250 }, { "epoch": 4.891774891774892, "grad_norm": 0.23046875, "learning_rate": 1.1312217194570136e-05, "loss": 0.0922, "step": 2260 }, { "epoch": 4.913419913419913, "grad_norm": 0.2578125, "learning_rate": 9.049773755656109e-06, "loss": 0.1041, "step": 2270 }, { "epoch": 4.935064935064935, "grad_norm": 0.255859375, "learning_rate": 6.787330316742082e-06, "loss": 0.0933, "step": 2280 }, { "epoch": 4.956709956709957, "grad_norm": 0.29296875, "learning_rate": 4.5248868778280546e-06, "loss": 0.1089, "step": 2290 }, { "epoch": 4.978354978354979, "grad_norm": 0.296875, "learning_rate": 2.2624434389140273e-06, "loss": 0.1094, "step": 2300 }, { "epoch": 4.978354978354979, "eval_loss": 0.09713861346244812, "eval_runtime": 20.5975, "eval_samples_per_second": 24.275, "eval_steps_per_second": 0.777, "step": 2300 }, { "epoch": 5.0, "grad_norm": 0.3359375, "learning_rate": 0.0, "loss": 0.1107, "step": 2310 } ], "logging_steps": 10, "max_steps": 2310, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.556491100242671e+18, "train_batch_size": 256, "trial_name": null, "trial_params": null }