diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,2812 +1,5612 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.016160506147052525, + "epoch": 0.03232101229410505, "eval_steps": 500, - "global_step": 4000, + "global_step": 8000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 4.040126536763131e-05, - "grad_norm": 34776792.0, - "learning_rate": 1.6e-08, - "loss": 1112513.0, + "grad_norm": 39615392.0, + "learning_rate": 8e-09, + "loss": 1424006.8, "step": 10 }, { "epoch": 8.080253073526263e-05, - "grad_norm": 8441178.0, - "learning_rate": 3.2e-08, - "loss": 1301709.2, + "grad_norm": 9910228.0, + "learning_rate": 1.6e-08, + "loss": 1709117.6, "step": 20 }, { "epoch": 0.00012120379610289395, - "grad_norm": 12542169.0, - "learning_rate": 4.8e-08, - "loss": 1175913.9, + "grad_norm": 14268973.0, + "learning_rate": 2.4e-08, + "loss": 1516864.0, "step": 30 }, { "epoch": 0.00016160506147052525, - "grad_norm": 42386256.0, - "learning_rate": 6.4e-08, - "loss": 1313873.9, + "grad_norm": 49531332.0, + "learning_rate": 3.2e-08, + "loss": 1759415.6, "step": 40 }, { "epoch": 0.00020200632683815657, - "grad_norm": 33162978.0, - "learning_rate": 8e-08, - "loss": 1142017.2, + "grad_norm": 38448884.0, + "learning_rate": 4e-08, + "loss": 1538786.6, "step": 50 }, { "epoch": 0.0002424075922057879, - "grad_norm": 52537692.0, - "learning_rate": 9.6e-08, - "loss": 1040597.1, + "grad_norm": 63421764.0, + "learning_rate": 4.8e-08, + "loss": 1410260.4, "step": 60 }, { "epoch": 0.0002828088575734192, - "grad_norm": 17375308.0, - "learning_rate": 1.1200000000000001e-07, - "loss": 1494345.4, + "grad_norm": 21479650.0, + "learning_rate": 5.6000000000000005e-08, + "loss": 2083620.0, "step": 70 }, { "epoch": 0.0003232101229410505, - "grad_norm": 95491040.0, - "learning_rate": 1.28e-07, - "loss": 1404674.8, + "grad_norm": 113106224.0, + "learning_rate": 6.4e-08, + "loss": 1987047.6, "step": 80 }, { "epoch": 0.0003636113883086818, - "grad_norm": 13118159.0, - "learning_rate": 1.4400000000000002e-07, - "loss": 1204737.6, + "grad_norm": 15781740.0, + "learning_rate": 7.200000000000001e-08, + "loss": 1750117.0, "step": 90 }, { "epoch": 0.00040401265367631315, - "grad_norm": 18581922.0, - "learning_rate": 1.6e-07, - "loss": 783122.2, + "grad_norm": 24116786.0, + "learning_rate": 8e-08, + "loss": 1215168.9, "step": 100 }, { "epoch": 0.00044441391904394446, - "grad_norm": 15600268.0, - "learning_rate": 1.7600000000000001e-07, - "loss": 1342865.9, + "grad_norm": 21128408.0, + "learning_rate": 8.800000000000001e-08, + "loss": 2103023.0, "step": 110 }, { "epoch": 0.0004848151844115758, - "grad_norm": 5757326.5, - "learning_rate": 1.92e-07, - "loss": 998477.5, + "grad_norm": 8172304.5, + "learning_rate": 9.6e-08, + "loss": 1638824.6, "step": 120 }, { "epoch": 0.000525216449779207, - "grad_norm": 37603152.0, - "learning_rate": 2.08e-07, - "loss": 1113327.2, + "grad_norm": 53589368.0, + "learning_rate": 1.04e-07, + "loss": 1890719.8, "step": 130 }, { "epoch": 0.0005656177151468384, - "grad_norm": 3987340.5, - "learning_rate": 2.2400000000000002e-07, - "loss": 815339.55, + "grad_norm": 5801806.5, + "learning_rate": 1.1200000000000001e-07, + "loss": 1525640.4, "step": 140 }, { "epoch": 0.0006060189805144697, - "grad_norm": 17012036.0, - "learning_rate": 2.4000000000000003e-07, - "loss": 799237.55, + "grad_norm": 27430064.0, + "learning_rate": 1.2000000000000002e-07, + "loss": 1521387.9, "step": 150 }, { "epoch": 0.000646420245882101, - "grad_norm": 15123844.0, - "learning_rate": 2.56e-07, - "loss": 795830.9, + "grad_norm": 26036798.0, + "learning_rate": 1.28e-07, + "loss": 1613663.5, "step": 160 }, { "epoch": 0.0006868215112497323, - "grad_norm": 13699358.0, - "learning_rate": 2.72e-07, - "loss": 675074.05, + "grad_norm": 23205832.0, + "learning_rate": 1.36e-07, + "loss": 1461856.4, "step": 170 }, { "epoch": 0.0007272227766173637, - "grad_norm": 5762779.5, - "learning_rate": 2.8800000000000004e-07, - "loss": 397814.875, + "grad_norm": 11449390.0, + "learning_rate": 1.4400000000000002e-07, + "loss": 926304.4, "step": 180 }, { "epoch": 0.000767624041984995, - "grad_norm": 3545711.25, - "learning_rate": 3.04e-07, - "loss": 278445.225, + "grad_norm": 7191616.0, + "learning_rate": 1.52e-07, + "loss": 729461.4, "step": 190 }, { "epoch": 0.0008080253073526263, - "grad_norm": 22483670.0, - "learning_rate": 3.2e-07, - "loss": 350395.65, + "grad_norm": 56028556.0, + "learning_rate": 1.6e-07, + "loss": 1010549.8, "step": 200 }, { "epoch": 0.0008484265727202576, - "grad_norm": 3111300.75, - "learning_rate": 3.36e-07, - "loss": 215067.95, + "grad_norm": 7823445.0, + "learning_rate": 1.68e-07, + "loss": 670322.55, "step": 210 }, { "epoch": 0.0008888278380878889, - "grad_norm": 4985113.0, - "learning_rate": 3.5200000000000003e-07, - "loss": 160062.15, + "grad_norm": 14022711.0, + "learning_rate": 1.7600000000000001e-07, + "loss": 549696.2, "step": 220 }, { "epoch": 0.0009292291034555202, - "grad_norm": 10360142.0, - "learning_rate": 3.68e-07, - "loss": 177085.5, + "grad_norm": 31492678.0, + "learning_rate": 1.84e-07, + "loss": 666658.65, "step": 230 }, { "epoch": 0.0009696303688231516, - "grad_norm": 4212897.0, - "learning_rate": 3.84e-07, - "loss": 156687.3375, + "grad_norm": 15210909.0, + "learning_rate": 1.92e-07, + "loss": 677519.3, "step": 240 }, { "epoch": 0.0010100316341907828, - "grad_norm": 2492394.75, - "learning_rate": 4.0000000000000003e-07, - "loss": 218777.15, + "grad_norm": 10931707.0, + "learning_rate": 2.0000000000000002e-07, + "loss": 1081470.4, "step": 250 }, { "epoch": 0.001050432899558414, - "grad_norm": 1271087.375, - "learning_rate": 4.16e-07, - "loss": 74811.1875, + "grad_norm": 6009514.5, + "learning_rate": 2.08e-07, + "loss": 442807.65, "step": 260 }, { "epoch": 0.0010908341649260454, - "grad_norm": 11904261.0, - "learning_rate": 4.3200000000000006e-07, - "loss": 78403.6125, + "grad_norm": 61020272.0, + "learning_rate": 2.1600000000000003e-07, + "loss": 536978.65, "step": 270 }, { "epoch": 0.0011312354302936767, - "grad_norm": 1911185.875, - "learning_rate": 4.4800000000000004e-07, - "loss": 112190.75, + "grad_norm": 11584516.0, + "learning_rate": 2.2400000000000002e-07, + "loss": 889609.3, "step": 280 }, { "epoch": 0.001171636695661308, - "grad_norm": 1619918.75, - "learning_rate": 4.64e-07, - "loss": 33613.9688, + "grad_norm": 12812539.0, + "learning_rate": 2.32e-07, + "loss": 307167.85, "step": 290 }, { "epoch": 0.0012120379610289394, - "grad_norm": 1365019.625, - "learning_rate": 4.800000000000001e-07, - "loss": 32836.45, + "grad_norm": 12023638.0, + "learning_rate": 2.4000000000000003e-07, + "loss": 383230.1, "step": 300 }, { "epoch": 0.0012524392263965707, - "grad_norm": 1140846.75, - "learning_rate": 4.96e-07, - "loss": 38892.625, + "grad_norm": 10079877.0, + "learning_rate": 2.48e-07, + "loss": 481844.4, "step": 310 }, { "epoch": 0.001292840491764202, - "grad_norm": 950318.8125, - "learning_rate": 5.12e-07, - "loss": 43860.9844, + "grad_norm": 13915200.0, + "learning_rate": 2.56e-07, + "loss": 669676.55, "step": 320 }, { "epoch": 0.0013332417571318333, - "grad_norm": 191924.40625, - "learning_rate": 5.280000000000001e-07, - "loss": 19675.0312, + "grad_norm": 3249964.0, + "learning_rate": 2.6400000000000003e-07, + "loss": 374595.725, "step": 330 }, { "epoch": 0.0013736430224994647, - "grad_norm": 354602.96875, - "learning_rate": 5.44e-07, - "loss": 17043.5047, + "grad_norm": 6385851.0, + "learning_rate": 2.72e-07, + "loss": 401643.825, "step": 340 }, { "epoch": 0.001414044287867096, - "grad_norm": 64334.99609375, - "learning_rate": 5.6e-07, - "loss": 6989.507, + "grad_norm": 1699793.75, + "learning_rate": 2.8e-07, + "loss": 230613.2, "step": 350 }, { "epoch": 0.0014544455532347273, - "grad_norm": 29138.234375, - "learning_rate": 5.760000000000001e-07, - "loss": 2483.216, + "grad_norm": 1112038.625, + "learning_rate": 2.8800000000000004e-07, + "loss": 100188.5875, "step": 360 }, { "epoch": 0.0014948468186023586, - "grad_norm": 4881.9619140625, - "learning_rate": 5.920000000000001e-07, - "loss": 4704.4551, + "grad_norm": 236119.0625, + "learning_rate": 2.9600000000000006e-07, + "loss": 239948.025, "step": 370 }, { "epoch": 0.00153524808396999, - "grad_norm": 73014.8671875, - "learning_rate": 6.08e-07, - "loss": 1492.4164, + "grad_norm": 5255031.5, + "learning_rate": 3.04e-07, + "loss": 102027.1625, "step": 380 }, { "epoch": 0.0015756493493376213, - "grad_norm": 86664.8203125, - "learning_rate": 6.24e-07, - "loss": 1407.5383, + "grad_norm": 8576087.0, + "learning_rate": 3.12e-07, + "loss": 117053.875, "step": 390 }, { "epoch": 0.0016160506147052526, - "grad_norm": 91074.265625, - "learning_rate": 6.4e-07, - "loss": 932.89, + "grad_norm": 11559298.0, + "learning_rate": 3.2e-07, + "loss": 92698.1, "step": 400 }, { "epoch": 0.001656451880072884, - "grad_norm": 5759.06396484375, - "learning_rate": 6.560000000000002e-07, - "loss": 856.515, + "grad_norm": 1147102.625, + "learning_rate": 3.280000000000001e-07, + "loss": 101246.725, "step": 410 }, { "epoch": 0.0016968531454405152, - "grad_norm": 1454.9195556640625, - "learning_rate": 6.72e-07, - "loss": 419.0969, + "grad_norm": 660331.6875, + "learning_rate": 3.36e-07, + "loss": 40889.6125, "step": 420 }, { "epoch": 0.0017372544108081465, - "grad_norm": 673.9178466796875, - "learning_rate": 6.88e-07, - "loss": 621.3434, + "grad_norm": 357817.46875, + "learning_rate": 3.44e-07, + "loss": 62075.0188, "step": 430 }, { "epoch": 0.0017776556761757779, - "grad_norm": 1015.685546875, - "learning_rate": 7.040000000000001e-07, - "loss": 498.2878, + "grad_norm": 471172.03125, + "learning_rate": 3.5200000000000003e-07, + "loss": 30728.0594, "step": 440 }, { "epoch": 0.0018180569415434092, - "grad_norm": 6716.2080078125, - "learning_rate": 7.2e-07, - "loss": 556.3153, + "grad_norm": 4107177.75, + "learning_rate": 3.6e-07, + "loss": 33375.3938, "step": 450 }, { "epoch": 0.0018584582069110405, - "grad_norm": 1468.1954345703125, - "learning_rate": 7.36e-07, - "loss": 488.196, + "grad_norm": 266172.03125, + "learning_rate": 3.68e-07, + "loss": 16775.7219, "step": 460 }, { "epoch": 0.0018988594722786718, - "grad_norm": 5355.9794921875, - "learning_rate": 7.520000000000001e-07, - "loss": 553.4161, + "grad_norm": 1377274.375, + "learning_rate": 3.7600000000000003e-07, + "loss": 19591.675, "step": 470 }, { "epoch": 0.0019392607376463031, - "grad_norm": 1276.7708740234375, - "learning_rate": 7.68e-07, - "loss": 575.7385, + "grad_norm": 357453.53125, + "learning_rate": 3.84e-07, + "loss": 12265.5023, "step": 480 }, { "epoch": 0.0019796620030139342, - "grad_norm": 1997.7783203125, - "learning_rate": 7.84e-07, - "loss": 449.5601, + "grad_norm": 111214.6640625, + "learning_rate": 3.92e-07, + "loss": 9045.2516, "step": 490 }, { "epoch": 0.0020200632683815656, - "grad_norm": 1234.7044677734375, - "learning_rate": 8.000000000000001e-07, - "loss": 454.7207, + "grad_norm": 56817.890625, + "learning_rate": 4.0000000000000003e-07, + "loss": 4109.9199, "step": 500 }, { "epoch": 0.002060464533749197, - "grad_norm": 1192.5169677734375, - "learning_rate": 8.160000000000001e-07, - "loss": 459.7037, + "grad_norm": 7607.2109375, + "learning_rate": 4.0800000000000005e-07, + "loss": 3645.5437, "step": 510 }, { "epoch": 0.002100865799116828, - "grad_norm": 1192.2779541015625, - "learning_rate": 8.32e-07, - "loss": 406.1772, + "grad_norm": 74048.3828125, + "learning_rate": 4.16e-07, + "loss": 2405.6049, "step": 520 }, { "epoch": 0.0021412670644844595, - "grad_norm": 1168.502685546875, - "learning_rate": 8.480000000000001e-07, - "loss": 520.4339, + "grad_norm": 4505.02197265625, + "learning_rate": 4.2400000000000004e-07, + "loss": 2230.874, "step": 530 }, { "epoch": 0.002181668329852091, - "grad_norm": 1580.9521484375, - "learning_rate": 8.640000000000001e-07, - "loss": 282.541, + "grad_norm": 855.5910034179688, + "learning_rate": 4.3200000000000006e-07, + "loss": 681.2865, "step": 540 }, { "epoch": 0.002222069595219722, - "grad_norm": 12831.9287109375, - "learning_rate": 8.8e-07, - "loss": 531.3956, + "grad_norm": 110704.7734375, + "learning_rate": 4.4e-07, + "loss": 1034.9506, "step": 550 }, { "epoch": 0.0022624708605873535, - "grad_norm": 6975.4140625, - "learning_rate": 8.960000000000001e-07, - "loss": 377.4974, + "grad_norm": 35438.12109375, + "learning_rate": 4.4800000000000004e-07, + "loss": 542.2862, "step": 560 }, { "epoch": 0.002302872125954985, - "grad_norm": 3642.0546875, - "learning_rate": 9.120000000000001e-07, - "loss": 325.7552, + "grad_norm": 12829.02734375, + "learning_rate": 4.5600000000000006e-07, + "loss": 418.173, "step": 570 }, { "epoch": 0.002343273391322616, - "grad_norm": 1394.17822265625, - "learning_rate": 9.28e-07, - "loss": 450.4489, + "grad_norm": 3379.951904296875, + "learning_rate": 4.64e-07, + "loss": 525.2813, "step": 580 }, { "epoch": 0.0023836746566902474, - "grad_norm": 6102.92626953125, - "learning_rate": 9.440000000000001e-07, - "loss": 581.4113, + "grad_norm": 9954.07421875, + "learning_rate": 4.7200000000000004e-07, + "loss": 641.1523, "step": 590 }, { "epoch": 0.0024240759220578788, - "grad_norm": 942.1673583984375, - "learning_rate": 9.600000000000001e-07, - "loss": 327.3541, + "grad_norm": 1068.71875, + "learning_rate": 4.800000000000001e-07, + "loss": 357.9712, "step": 600 }, { "epoch": 0.00246447718742551, - "grad_norm": 7506.9111328125, - "learning_rate": 9.76e-07, - "loss": 562.9349, + "grad_norm": 6292.51611328125, + "learning_rate": 4.88e-07, + "loss": 608.0323, "step": 610 }, { "epoch": 0.0025048784527931414, - "grad_norm": 828.693603515625, - "learning_rate": 9.92e-07, - "loss": 480.7408, + "grad_norm": 989.401611328125, + "learning_rate": 4.96e-07, + "loss": 516.642, "step": 620 }, { "epoch": 0.0025452797181607727, - "grad_norm": 1101.0645751953125, - "learning_rate": 1.0080000000000001e-06, - "loss": 510.9642, + "grad_norm": 1106.1527099609375, + "learning_rate": 5.040000000000001e-07, + "loss": 554.4714, "step": 630 }, { "epoch": 0.002585680983528404, - "grad_norm": 4109.21728515625, - "learning_rate": 1.024e-06, - "loss": 380.4708, + "grad_norm": 2721.04541015625, + "learning_rate": 5.12e-07, + "loss": 402.3794, "step": 640 }, { "epoch": 0.0026260822488960354, - "grad_norm": 1035.9979248046875, - "learning_rate": 1.04e-06, - "loss": 438.6869, + "grad_norm": 1019.2467041015625, + "learning_rate": 5.2e-07, + "loss": 467.6232, "step": 650 }, { "epoch": 0.0026664835142636667, - "grad_norm": 857.4368896484375, - "learning_rate": 1.0560000000000001e-06, - "loss": 333.5929, + "grad_norm": 859.8365478515625, + "learning_rate": 5.280000000000001e-07, + "loss": 351.4235, "step": 660 }, { "epoch": 0.002706884779631298, - "grad_norm": 861.3447875976562, - "learning_rate": 1.072e-06, - "loss": 369.1007, + "grad_norm": 1069.44580078125, + "learning_rate": 5.36e-07, + "loss": 396.704, "step": 670 }, { "epoch": 0.0027472860449989293, - "grad_norm": 1937.781005859375, - "learning_rate": 1.088e-06, - "loss": 467.2207, + "grad_norm": 2269.23828125, + "learning_rate": 5.44e-07, + "loss": 497.2921, "step": 680 }, { "epoch": 0.0027876873103665606, - "grad_norm": 11980.4130859375, - "learning_rate": 1.1040000000000001e-06, - "loss": 461.9273, + "grad_norm": 16940.8125, + "learning_rate": 5.520000000000001e-07, + "loss": 503.0415, "step": 690 }, { "epoch": 0.002828088575734192, - "grad_norm": 1039.8837890625, - "learning_rate": 1.12e-06, - "loss": 376.5904, + "grad_norm": 1267.16650390625, + "learning_rate": 5.6e-07, + "loss": 409.2578, "step": 700 }, { "epoch": 0.0028684898411018233, - "grad_norm": 2001.8681640625, - "learning_rate": 1.1360000000000002e-06, - "loss": 397.0597, + "grad_norm": 1730.9649658203125, + "learning_rate": 5.680000000000001e-07, + "loss": 430.1885, "step": 710 }, { "epoch": 0.0029088911064694546, - "grad_norm": 4143.15087890625, - "learning_rate": 1.1520000000000002e-06, - "loss": 524.1538, + "grad_norm": 4848.76611328125, + "learning_rate": 5.760000000000001e-07, + "loss": 561.9834, "step": 720 }, { "epoch": 0.002949292371837086, - "grad_norm": 1036.8919677734375, - "learning_rate": 1.168e-06, - "loss": 367.9916, + "grad_norm": 1300.886962890625, + "learning_rate": 5.84e-07, + "loss": 396.4536, "step": 730 }, { "epoch": 0.0029896936372047172, - "grad_norm": 8330.5068359375, - "learning_rate": 1.1840000000000002e-06, - "loss": 394.5002, + "grad_norm": 7298.6962890625, + "learning_rate": 5.920000000000001e-07, + "loss": 419.3825, "step": 740 }, { "epoch": 0.0030300949025723486, - "grad_norm": 2307.091064453125, - "learning_rate": 1.2000000000000002e-06, - "loss": 295.3556, + "grad_norm": 1165.7750244140625, + "learning_rate": 6.000000000000001e-07, + "loss": 310.7019, "step": 750 }, { "epoch": 0.00307049616793998, - "grad_norm": 1893.0789794921875, - "learning_rate": 1.216e-06, - "loss": 265.5479, + "grad_norm": 2274.927490234375, + "learning_rate": 6.08e-07, + "loss": 286.0465, "step": 760 }, { "epoch": 0.003110897433307611, - "grad_norm": 2477.446533203125, - "learning_rate": 1.2320000000000002e-06, - "loss": 534.005, + "grad_norm": 2981.280029296875, + "learning_rate": 6.160000000000001e-07, + "loss": 570.9983, "step": 770 }, { "epoch": 0.0031512986986752425, - "grad_norm": 916.7561645507812, - "learning_rate": 1.248e-06, - "loss": 334.4991, + "grad_norm": 797.6749877929688, + "learning_rate": 6.24e-07, + "loss": 359.5495, "step": 780 }, { "epoch": 0.003191699964042874, - "grad_norm": 13106.6064453125, - "learning_rate": 1.2640000000000003e-06, - "loss": 464.4099, + "grad_norm": 13082.837890625, + "learning_rate": 6.320000000000002e-07, + "loss": 491.772, "step": 790 }, { "epoch": 0.003232101229410505, - "grad_norm": 1100.711669921875, - "learning_rate": 1.28e-06, - "loss": 302.2139, + "grad_norm": 1186.2406005859375, + "learning_rate": 6.4e-07, + "loss": 325.208, "step": 800 }, { "epoch": 0.0032725024947781365, - "grad_norm": 2172.443603515625, - "learning_rate": 1.296e-06, - "loss": 389.6274, + "grad_norm": 1491.949951171875, + "learning_rate": 6.48e-07, + "loss": 408.7208, "step": 810 }, { "epoch": 0.003312903760145768, - "grad_norm": 1690.6312255859375, - "learning_rate": 1.3120000000000003e-06, - "loss": 361.3152, + "grad_norm": 2081.479248046875, + "learning_rate": 6.560000000000002e-07, + "loss": 386.7589, "step": 820 }, { "epoch": 0.003353305025513399, - "grad_norm": 924.0753173828125, - "learning_rate": 1.328e-06, - "loss": 247.9785, + "grad_norm": 1019.1475219726562, + "learning_rate": 6.64e-07, + "loss": 262.9931, "step": 830 }, { "epoch": 0.0033937062908810304, - "grad_norm": 1367.6988525390625, - "learning_rate": 1.344e-06, - "loss": 320.1396, + "grad_norm": 1897.841064453125, + "learning_rate": 6.72e-07, + "loss": 340.3138, "step": 840 }, { "epoch": 0.0034341075562486618, - "grad_norm": 731.0768432617188, - "learning_rate": 1.3600000000000001e-06, - "loss": 394.588, + "grad_norm": 702.2278442382812, + "learning_rate": 6.800000000000001e-07, + "loss": 423.125, "step": 850 }, { "epoch": 0.003474508821616293, - "grad_norm": 788.498046875, - "learning_rate": 1.376e-06, - "loss": 430.4721, + "grad_norm": 1053.8526611328125, + "learning_rate": 6.88e-07, + "loss": 453.3081, "step": 860 }, { "epoch": 0.0035149100869839244, - "grad_norm": 989.5845336914062, - "learning_rate": 1.392e-06, - "loss": 343.7414, + "grad_norm": 917.5226440429688, + "learning_rate": 6.96e-07, + "loss": 368.6665, "step": 870 }, { "epoch": 0.0035553113523515557, - "grad_norm": 661.3421020507812, - "learning_rate": 1.4080000000000001e-06, - "loss": 382.3532, + "grad_norm": 694.537353515625, + "learning_rate": 7.040000000000001e-07, + "loss": 412.8042, "step": 880 }, { "epoch": 0.003595712617719187, - "grad_norm": 831.7238159179688, - "learning_rate": 1.424e-06, - "loss": 442.5967, + "grad_norm": 1054.6588134765625, + "learning_rate": 7.12e-07, + "loss": 472.1723, "step": 890 }, { "epoch": 0.0036361138830868184, - "grad_norm": 1258.17236328125, - "learning_rate": 1.44e-06, - "loss": 329.9442, + "grad_norm": 1912.3931884765625, + "learning_rate": 7.2e-07, + "loss": 357.6209, "step": 900 }, { "epoch": 0.0036765151484544497, - "grad_norm": 1166.2576904296875, - "learning_rate": 1.4560000000000001e-06, - "loss": 332.621, + "grad_norm": 1250.499267578125, + "learning_rate": 7.280000000000001e-07, + "loss": 355.1088, "step": 910 }, { "epoch": 0.003716916413822081, - "grad_norm": 877.9427490234375, - "learning_rate": 1.472e-06, - "loss": 464.8878, + "grad_norm": 865.3861694335938, + "learning_rate": 7.36e-07, + "loss": 503.4024, "step": 920 }, { "epoch": 0.0037573176791897123, - "grad_norm": 1044.3133544921875, - "learning_rate": 1.488e-06, - "loss": 292.7577, + "grad_norm": 899.6528930664062, + "learning_rate": 7.44e-07, + "loss": 310.2518, "step": 930 }, { "epoch": 0.0037977189445573436, - "grad_norm": 1089.46728515625, - "learning_rate": 1.5040000000000001e-06, - "loss": 460.0909, + "grad_norm": 1164.725341796875, + "learning_rate": 7.520000000000001e-07, + "loss": 489.243, "step": 940 }, { "epoch": 0.003838120209924975, - "grad_norm": 797.8550415039062, - "learning_rate": 1.52e-06, - "loss": 346.0366, + "grad_norm": 752.870361328125, + "learning_rate": 7.6e-07, + "loss": 368.5173, "step": 950 }, { "epoch": 0.0038785214752926063, - "grad_norm": 798.5408325195312, - "learning_rate": 1.536e-06, - "loss": 310.8797, + "grad_norm": 852.3152465820312, + "learning_rate": 7.68e-07, + "loss": 331.0503, "step": 960 }, { "epoch": 0.003918922740660237, - "grad_norm": 2262.073974609375, - "learning_rate": 1.5520000000000001e-06, - "loss": 370.4966, + "grad_norm": 2589.0009765625, + "learning_rate": 7.760000000000001e-07, + "loss": 392.7393, "step": 970 }, { "epoch": 0.0039593240060278685, - "grad_norm": 2735.94482421875, - "learning_rate": 1.568e-06, - "loss": 323.6035, + "grad_norm": 2873.308837890625, + "learning_rate": 7.84e-07, + "loss": 346.8862, "step": 980 }, { "epoch": 0.0039997252713955, - "grad_norm": 646.0745849609375, - "learning_rate": 1.5840000000000002e-06, - "loss": 339.8437, + "grad_norm": 653.263916015625, + "learning_rate": 7.920000000000001e-07, + "loss": 353.1814, "step": 990 }, { "epoch": 0.004040126536763131, - "grad_norm": 1748.4075927734375, - "learning_rate": 1.6000000000000001e-06, - "loss": 317.4156, + "grad_norm": 1120.7786865234375, + "learning_rate": 8.000000000000001e-07, + "loss": 330.2107, "step": 1000 }, { "epoch": 0.0040805278021307624, - "grad_norm": 780.6202392578125, - "learning_rate": 1.616e-06, - "loss": 296.7254, + "grad_norm": 741.6884155273438, + "learning_rate": 8.08e-07, + "loss": 312.3269, "step": 1010 }, { "epoch": 0.004120929067498394, - "grad_norm": 1253.2996826171875, - "learning_rate": 1.6320000000000002e-06, - "loss": 384.1156, + "grad_norm": 1314.5491943359375, + "learning_rate": 8.160000000000001e-07, + "loss": 410.496, "step": 1020 }, { "epoch": 0.004161330332866025, - "grad_norm": 1055.1898193359375, - "learning_rate": 1.6480000000000001e-06, - "loss": 397.7781, + "grad_norm": 1006.4945068359375, + "learning_rate": 8.240000000000001e-07, + "loss": 418.2414, "step": 1030 }, { "epoch": 0.004201731598233656, - "grad_norm": 803.1446533203125, - "learning_rate": 1.664e-06, - "loss": 359.3899, + "grad_norm": 794.8612670898438, + "learning_rate": 8.32e-07, + "loss": 377.3666, "step": 1040 }, { "epoch": 0.004242132863601288, - "grad_norm": 798.8219604492188, - "learning_rate": 1.6800000000000002e-06, - "loss": 381.0167, + "grad_norm": 659.0382690429688, + "learning_rate": 8.400000000000001e-07, + "loss": 401.3201, "step": 1050 }, { "epoch": 0.004282534128968919, - "grad_norm": 3304.813232421875, - "learning_rate": 1.6960000000000002e-06, - "loss": 391.1282, + "grad_norm": 3303.1611328125, + "learning_rate": 8.480000000000001e-07, + "loss": 408.5207, "step": 1060 }, { "epoch": 0.00432293539433655, - "grad_norm": 2308.608154296875, - "learning_rate": 1.712e-06, - "loss": 453.3599, + "grad_norm": 1630.378662109375, + "learning_rate": 8.56e-07, + "loss": 475.4568, "step": 1070 }, { "epoch": 0.004363336659704182, - "grad_norm": 744.8621215820312, - "learning_rate": 1.7280000000000002e-06, - "loss": 306.6811, + "grad_norm": 733.5653686523438, + "learning_rate": 8.640000000000001e-07, + "loss": 326.4004, "step": 1080 }, { "epoch": 0.004403737925071813, - "grad_norm": 5821.24267578125, - "learning_rate": 1.7440000000000002e-06, - "loss": 608.5389, + "grad_norm": 4627.86181640625, + "learning_rate": 8.720000000000001e-07, + "loss": 645.257, "step": 1090 }, { "epoch": 0.004444139190439444, - "grad_norm": 865.3998413085938, - "learning_rate": 1.76e-06, - "loss": 242.1831, + "grad_norm": 941.9146728515625, + "learning_rate": 8.8e-07, + "loss": 248.9732, "step": 1100 }, { "epoch": 0.004484540455807076, - "grad_norm": 955.5283203125, - "learning_rate": 1.7760000000000002e-06, - "loss": 406.3821, + "grad_norm": 1176.5272216796875, + "learning_rate": 8.880000000000001e-07, + "loss": 428.5027, "step": 1110 }, { "epoch": 0.004524941721174707, - "grad_norm": 1257.974853515625, - "learning_rate": 1.7920000000000002e-06, - "loss": 337.8827, + "grad_norm": 992.4073486328125, + "learning_rate": 8.960000000000001e-07, + "loss": 349.8248, "step": 1120 }, { "epoch": 0.004565342986542338, - "grad_norm": 848.3350830078125, - "learning_rate": 1.808e-06, - "loss": 383.7443, + "grad_norm": 840.597412109375, + "learning_rate": 9.04e-07, + "loss": 402.2111, "step": 1130 }, { "epoch": 0.00460574425190997, - "grad_norm": 2609.029541015625, - "learning_rate": 1.8240000000000002e-06, - "loss": 394.2932, + "grad_norm": 1858.921142578125, + "learning_rate": 9.120000000000001e-07, + "loss": 411.0437, "step": 1140 }, { "epoch": 0.004646145517277601, - "grad_norm": 1440.8753662109375, - "learning_rate": 1.8400000000000002e-06, - "loss": 433.8044, + "grad_norm": 1351.7413330078125, + "learning_rate": 9.200000000000001e-07, + "loss": 454.9886, "step": 1150 }, { "epoch": 0.004686546782645232, - "grad_norm": 1531.273681640625, - "learning_rate": 1.856e-06, - "loss": 424.2002, + "grad_norm": 1666.19189453125, + "learning_rate": 9.28e-07, + "loss": 443.8775, "step": 1160 }, { "epoch": 0.0047269480480128636, - "grad_norm": 2845.63623046875, - "learning_rate": 1.8720000000000002e-06, - "loss": 343.2191, + "grad_norm": 1765.3035888671875, + "learning_rate": 9.360000000000001e-07, + "loss": 353.4751, "step": 1170 }, { "epoch": 0.004767349313380495, - "grad_norm": 679.157470703125, - "learning_rate": 1.8880000000000002e-06, - "loss": 327.4853, + "grad_norm": 708.845458984375, + "learning_rate": 9.440000000000001e-07, + "loss": 336.5612, "step": 1180 }, { "epoch": 0.004807750578748126, - "grad_norm": 1685.8907470703125, - "learning_rate": 1.9040000000000003e-06, - "loss": 343.0018, + "grad_norm": 1365.6417236328125, + "learning_rate": 9.520000000000002e-07, + "loss": 354.25, "step": 1190 }, { "epoch": 0.0048481518441157575, - "grad_norm": 961.8662719726562, - "learning_rate": 1.9200000000000003e-06, - "loss": 345.5244, + "grad_norm": 789.3729858398438, + "learning_rate": 9.600000000000001e-07, + "loss": 359.4335, "step": 1200 }, { "epoch": 0.004888553109483389, - "grad_norm": 1039.9063720703125, - "learning_rate": 1.936e-06, - "loss": 286.9379, + "grad_norm": 1061.9530029296875, + "learning_rate": 9.68e-07, + "loss": 299.4941, "step": 1210 }, { "epoch": 0.00492895437485102, - "grad_norm": 915.8055419921875, - "learning_rate": 1.952e-06, - "loss": 291.1856, + "grad_norm": 998.1884155273438, + "learning_rate": 9.76e-07, + "loss": 304.5467, "step": 1220 }, { "epoch": 0.0049693556402186515, - "grad_norm": 2453.09716796875, - "learning_rate": 1.968e-06, - "loss": 357.5581, + "grad_norm": 1769.4918212890625, + "learning_rate": 9.84e-07, + "loss": 367.9012, "step": 1230 }, { "epoch": 0.005009756905586283, - "grad_norm": 1281.601318359375, - "learning_rate": 1.984e-06, - "loss": 370.0242, + "grad_norm": 1094.5677490234375, + "learning_rate": 9.92e-07, + "loss": 383.8861, "step": 1240 }, { "epoch": 0.005050158170953914, - "grad_norm": 1927.247802734375, - "learning_rate": 2.0000000000000003e-06, - "loss": 420.42, + "grad_norm": 1168.4744873046875, + "learning_rate": 1.0000000000000002e-06, + "loss": 436.8746, "step": 1250 }, { "epoch": 0.0050905594363215454, - "grad_norm": 1159.7850341796875, - "learning_rate": 2.0160000000000003e-06, - "loss": 381.0742, + "grad_norm": 1109.05224609375, + "learning_rate": 1.0080000000000001e-06, + "loss": 398.7812, "step": 1260 }, { "epoch": 0.005130960701689177, - "grad_norm": 1102.625244140625, - "learning_rate": 2.032e-06, - "loss": 441.1049, + "grad_norm": 1094.9896240234375, + "learning_rate": 1.016e-06, + "loss": 455.3362, "step": 1270 }, { "epoch": 0.005171361967056808, - "grad_norm": 1439.484130859375, - "learning_rate": 2.048e-06, - "loss": 277.274, + "grad_norm": 1282.1097412109375, + "learning_rate": 1.024e-06, + "loss": 290.201, "step": 1280 }, { "epoch": 0.005211763232424439, - "grad_norm": 5610.380859375, - "learning_rate": 2.064e-06, - "loss": 421.7488, + "grad_norm": 4188.5, + "learning_rate": 1.032e-06, + "loss": 434.253, "step": 1290 }, { "epoch": 0.005252164497792071, - "grad_norm": 982.2139892578125, - "learning_rate": 2.08e-06, - "loss": 340.8586, + "grad_norm": 947.2262573242188, + "learning_rate": 1.04e-06, + "loss": 351.7767, "step": 1300 }, { "epoch": 0.005292565763159702, - "grad_norm": 791.9103393554688, - "learning_rate": 2.0960000000000003e-06, - "loss": 363.6415, + "grad_norm": 697.3817138671875, + "learning_rate": 1.0480000000000002e-06, + "loss": 369.6858, "step": 1310 }, { "epoch": 0.005332967028527333, "grad_norm": 0.0, - "learning_rate": 2.1120000000000003e-06, - "loss": 443.9375, + "learning_rate": 1.0560000000000001e-06, + "loss": 454.9501, "step": 1320 }, { "epoch": 0.005373368293894965, - "grad_norm": 1269.4005126953125, - "learning_rate": 2.128e-06, - "loss": 352.3543, + "grad_norm": 1082.7498779296875, + "learning_rate": 1.064e-06, + "loss": 365.2526, "step": 1330 }, { "epoch": 0.005413769559262596, - "grad_norm": 1225.1302490234375, - "learning_rate": 2.144e-06, - "loss": 357.3038, + "grad_norm": 1059.6636962890625, + "learning_rate": 1.072e-06, + "loss": 368.3111, "step": 1340 }, { "epoch": 0.005454170824630227, - "grad_norm": 827.63720703125, - "learning_rate": 2.16e-06, - "loss": 303.4535, + "grad_norm": 960.380859375, + "learning_rate": 1.08e-06, + "loss": 310.9944, "step": 1350 }, { "epoch": 0.005494572089997859, - "grad_norm": 848.7166137695312, - "learning_rate": 2.176e-06, - "loss": 268.09, + "grad_norm": 831.2578735351562, + "learning_rate": 1.088e-06, + "loss": 276.7268, "step": 1360 }, { "epoch": 0.00553497335536549, - "grad_norm": 1410.2564697265625, - "learning_rate": 2.1920000000000004e-06, - "loss": 389.759, + "grad_norm": 1476.62451171875, + "learning_rate": 1.0960000000000002e-06, + "loss": 410.6641, "step": 1370 }, { "epoch": 0.005575374620733121, - "grad_norm": 1602.010009765625, - "learning_rate": 2.2080000000000003e-06, - "loss": 412.8606, + "grad_norm": 1813.6104736328125, + "learning_rate": 1.1040000000000001e-06, + "loss": 423.7998, "step": 1380 }, { "epoch": 0.005615775886100753, - "grad_norm": 1337.7384033203125, - "learning_rate": 2.2240000000000002e-06, - "loss": 428.547, + "grad_norm": 1251.9375, + "learning_rate": 1.1120000000000001e-06, + "loss": 443.5602, "step": 1390 }, { "epoch": 0.005656177151468384, - "grad_norm": 867.4197387695312, - "learning_rate": 2.24e-06, - "loss": 378.6157, + "grad_norm": 819.0733032226562, + "learning_rate": 1.12e-06, + "loss": 391.8192, "step": 1400 }, { "epoch": 0.005696578416836015, - "grad_norm": 731.271728515625, - "learning_rate": 2.256e-06, - "loss": 374.0744, + "grad_norm": 737.3008422851562, + "learning_rate": 1.128e-06, + "loss": 382.7008, "step": 1410 }, { "epoch": 0.0057369796822036466, - "grad_norm": 2343.665771484375, - "learning_rate": 2.2720000000000004e-06, - "loss": 379.0267, + "grad_norm": 2222.40625, + "learning_rate": 1.1360000000000002e-06, + "loss": 393.2168, "step": 1420 }, { "epoch": 0.005777380947571278, - "grad_norm": 1540.48388671875, - "learning_rate": 2.2880000000000004e-06, - "loss": 382.6617, + "grad_norm": 1596.0970458984375, + "learning_rate": 1.1440000000000002e-06, + "loss": 395.7532, "step": 1430 }, { "epoch": 0.005817782212938909, - "grad_norm": 1533.5958251953125, - "learning_rate": 2.3040000000000003e-06, - "loss": 396.0985, + "grad_norm": 1783.0640869140625, + "learning_rate": 1.1520000000000002e-06, + "loss": 409.5724, "step": 1440 }, { "epoch": 0.0058581834783065405, - "grad_norm": 723.0481567382812, - "learning_rate": 2.3200000000000002e-06, - "loss": 308.0563, + "grad_norm": 706.705078125, + "learning_rate": 1.1600000000000001e-06, + "loss": 319.6016, "step": 1450 }, { "epoch": 0.005898584743674172, - "grad_norm": 2209.6083984375, - "learning_rate": 2.336e-06, - "loss": 424.7519, + "grad_norm": 1962.7529296875, + "learning_rate": 1.168e-06, + "loss": 441.5875, "step": 1460 }, { "epoch": 0.005938986009041803, - "grad_norm": 566.4544067382812, - "learning_rate": 2.352e-06, - "loss": 248.9047, + "grad_norm": 507.7200012207031, + "learning_rate": 1.176e-06, + "loss": 253.8281, "step": 1470 }, { "epoch": 0.0059793872744094345, - "grad_norm": 868.7781982421875, - "learning_rate": 2.3680000000000005e-06, - "loss": 257.8537, + "grad_norm": 907.0985717773438, + "learning_rate": 1.1840000000000002e-06, + "loss": 273.5823, "step": 1480 }, { "epoch": 0.006019788539777066, - "grad_norm": 732.210205078125, - "learning_rate": 2.3840000000000004e-06, - "loss": 304.146, + "grad_norm": 743.5198364257812, + "learning_rate": 1.1920000000000002e-06, + "loss": 318.0907, "step": 1490 }, { "epoch": 0.006060189805144697, - "grad_norm": 946.9075317382812, - "learning_rate": 2.4000000000000003e-06, - "loss": 239.7318, + "grad_norm": 964.8601684570312, + "learning_rate": 1.2000000000000002e-06, + "loss": 251.7046, "step": 1500 }, { "epoch": 0.0061005910705123284, - "grad_norm": 13990.4541015625, - "learning_rate": 2.4160000000000002e-06, - "loss": 317.3043, + "grad_norm": 10732.2978515625, + "learning_rate": 1.2080000000000001e-06, + "loss": 328.2423, "step": 1510 }, { "epoch": 0.00614099233587996, - "grad_norm": 1053.9326171875, - "learning_rate": 2.432e-06, - "loss": 306.3782, + "grad_norm": 929.490966796875, + "learning_rate": 1.216e-06, + "loss": 318.3024, "step": 1520 }, { "epoch": 0.006181393601247591, - "grad_norm": 1479.0665283203125, - "learning_rate": 2.448e-06, - "loss": 321.1809, + "grad_norm": 1505.0806884765625, + "learning_rate": 1.224e-06, + "loss": 328.7247, "step": 1530 }, { "epoch": 0.006221794866615222, "grad_norm": 0.0, - "learning_rate": 2.4640000000000005e-06, - "loss": 272.3122, + "learning_rate": 1.2320000000000002e-06, + "loss": 287.029, "step": 1540 }, { "epoch": 0.006262196131982854, - "grad_norm": 836.43994140625, - "learning_rate": 2.4800000000000004e-06, - "loss": 276.743, + "grad_norm": 752.0944213867188, + "learning_rate": 1.2400000000000002e-06, + "loss": 292.7336, "step": 1550 }, { "epoch": 0.006302597397350485, - "grad_norm": 1576.191650390625, - "learning_rate": 2.496e-06, - "loss": 293.9756, + "grad_norm": 1774.85498046875, + "learning_rate": 1.248e-06, + "loss": 303.2398, "step": 1560 }, { "epoch": 0.006342998662718116, - "grad_norm": 1241.993408203125, - "learning_rate": 2.512e-06, - "loss": 324.3042, + "grad_norm": 1202.3148193359375, + "learning_rate": 1.256e-06, + "loss": 334.9559, "step": 1570 }, { "epoch": 0.006383399928085748, - "grad_norm": 3129.779052734375, - "learning_rate": 2.5280000000000006e-06, - "loss": 282.3287, + "grad_norm": 2789.060791015625, + "learning_rate": 1.2640000000000003e-06, + "loss": 288.0462, "step": 1580 }, { "epoch": 0.006423801193453379, - "grad_norm": 824.8917846679688, - "learning_rate": 2.5440000000000005e-06, - "loss": 332.7758, + "grad_norm": 893.4109497070312, + "learning_rate": 1.2720000000000003e-06, + "loss": 344.6392, "step": 1590 }, { "epoch": 0.00646420245882101, - "grad_norm": 1429.83154296875, - "learning_rate": 2.56e-06, - "loss": 341.1543, + "grad_norm": 1351.9661865234375, + "learning_rate": 1.28e-06, + "loss": 351.581, "step": 1600 }, { "epoch": 0.006504603724188642, - "grad_norm": 1096.0162353515625, - "learning_rate": 2.576e-06, - "loss": 370.956, + "grad_norm": 1121.8851318359375, + "learning_rate": 1.288e-06, + "loss": 384.2762, "step": 1610 }, { "epoch": 0.006545004989556273, - "grad_norm": 2190.672607421875, - "learning_rate": 2.592e-06, - "loss": 393.7615, + "grad_norm": 1744.908935546875, + "learning_rate": 1.296e-06, + "loss": 401.3335, "step": 1620 }, { "epoch": 0.006585406254923904, - "grad_norm": 659.6456909179688, - "learning_rate": 2.608e-06, - "loss": 371.8131, + "grad_norm": 617.60009765625, + "learning_rate": 1.304e-06, + "loss": 384.25, "step": 1630 }, { "epoch": 0.006625807520291536, - "grad_norm": 1114.4139404296875, - "learning_rate": 2.6240000000000006e-06, - "loss": 387.3432, + "grad_norm": 936.4777221679688, + "learning_rate": 1.3120000000000003e-06, + "loss": 394.3849, "step": 1640 }, { "epoch": 0.006666208785659167, - "grad_norm": 1945.1435546875, - "learning_rate": 2.64e-06, - "loss": 419.9613, + "grad_norm": 1067.474609375, + "learning_rate": 1.32e-06, + "loss": 427.2508, "step": 1650 }, { "epoch": 0.006706610051026798, - "grad_norm": 1134.198486328125, - "learning_rate": 2.656e-06, - "loss": 261.7552, + "grad_norm": 974.2322387695312, + "learning_rate": 1.328e-06, + "loss": 269.7852, "step": 1660 }, { "epoch": 0.0067470113163944296, - "grad_norm": 625.1832885742188, - "learning_rate": 2.672e-06, - "loss": 348.9073, + "grad_norm": 641.1205444335938, + "learning_rate": 1.336e-06, + "loss": 362.7043, "step": 1670 }, { "epoch": 0.006787412581762061, - "grad_norm": 1558.600830078125, - "learning_rate": 2.688e-06, - "loss": 329.5783, + "grad_norm": 2192.059814453125, + "learning_rate": 1.344e-06, + "loss": 331.464, "step": 1680 }, { "epoch": 0.006827813847129692, - "grad_norm": 903.2876586914062, - "learning_rate": 2.704e-06, - "loss": 288.3402, + "grad_norm": 976.3654174804688, + "learning_rate": 1.352e-06, + "loss": 296.1574, "step": 1690 }, { "epoch": 0.0068682151124973235, - "grad_norm": 4166.87158203125, - "learning_rate": 2.7200000000000002e-06, - "loss": 357.7317, + "grad_norm": 1700.8463134765625, + "learning_rate": 1.3600000000000001e-06, + "loss": 372.8524, "step": 1700 }, { "epoch": 0.006908616377864955, - "grad_norm": 784.2251586914062, - "learning_rate": 2.736e-06, - "loss": 318.5063, + "grad_norm": 783.0970458984375, + "learning_rate": 1.368e-06, + "loss": 324.4404, "step": 1710 }, { "epoch": 0.006949017643232586, - "grad_norm": 5029.16552734375, - "learning_rate": 2.752e-06, - "loss": 442.4195, + "grad_norm": 5650.396484375, + "learning_rate": 1.376e-06, + "loss": 458.0947, "step": 1720 }, { "epoch": 0.0069894189086002175, - "grad_norm": 781.5169677734375, - "learning_rate": 2.768e-06, - "loss": 306.0392, + "grad_norm": 728.2642211914062, + "learning_rate": 1.384e-06, + "loss": 317.8467, "step": 1730 }, { "epoch": 0.007029820173967849, - "grad_norm": 1048.294921875, - "learning_rate": 2.784e-06, - "loss": 359.3845, + "grad_norm": 889.5052490234375, + "learning_rate": 1.392e-06, + "loss": 368.6525, "step": 1740 }, { "epoch": 0.00707022143933548, - "grad_norm": 1155.3671875, - "learning_rate": 2.8000000000000003e-06, - "loss": 359.3311, + "grad_norm": 1238.4498291015625, + "learning_rate": 1.4000000000000001e-06, + "loss": 367.1417, "step": 1750 }, { "epoch": 0.0071106227047031114, - "grad_norm": 857.3798217773438, - "learning_rate": 2.8160000000000002e-06, - "loss": 391.5449, + "grad_norm": 753.6063232421875, + "learning_rate": 1.4080000000000001e-06, + "loss": 406.0103, "step": 1760 }, { "epoch": 0.007151023970070743, - "grad_norm": 933.0081787109375, - "learning_rate": 2.832e-06, - "loss": 297.32, + "grad_norm": 981.9603881835938, + "learning_rate": 1.416e-06, + "loss": 301.9476, "step": 1770 }, { "epoch": 0.007191425235438374, - "grad_norm": 1060.47314453125, - "learning_rate": 2.848e-06, - "loss": 411.9166, + "grad_norm": 1961.83203125, + "learning_rate": 1.424e-06, + "loss": 426.6, "step": 1780 }, { "epoch": 0.007231826500806005, - "grad_norm": 707.4050903320312, - "learning_rate": 2.864e-06, - "loss": 238.9362, + "grad_norm": 799.2784423828125, + "learning_rate": 1.432e-06, + "loss": 246.6201, "step": 1790 }, { "epoch": 0.007272227766173637, - "grad_norm": 526.6619873046875, - "learning_rate": 2.88e-06, - "loss": 290.1179, + "grad_norm": 500.31414794921875, + "learning_rate": 1.44e-06, + "loss": 295.6354, "step": 1800 }, { "epoch": 0.007312629031541268, - "grad_norm": 829.2198486328125, - "learning_rate": 2.8960000000000003e-06, - "loss": 347.0842, + "grad_norm": 685.4234619140625, + "learning_rate": 1.4480000000000002e-06, + "loss": 359.7325, "step": 1810 }, { "epoch": 0.007353030296908899, - "grad_norm": 716.2144165039062, - "learning_rate": 2.9120000000000002e-06, - "loss": 295.0978, + "grad_norm": 702.3242797851562, + "learning_rate": 1.4560000000000001e-06, + "loss": 304.3423, "step": 1820 }, { "epoch": 0.007393431562276531, - "grad_norm": 1365.7164306640625, - "learning_rate": 2.928e-06, - "loss": 399.1327, + "grad_norm": 975.6823120117188, + "learning_rate": 1.464e-06, + "loss": 411.416, "step": 1830 }, { "epoch": 0.007433832827644162, - "grad_norm": 1474.3541259765625, - "learning_rate": 2.944e-06, - "loss": 249.1963, + "grad_norm": 1523.72412109375, + "learning_rate": 1.472e-06, + "loss": 261.4839, "step": 1840 }, { "epoch": 0.007474234093011793, - "grad_norm": 958.6668090820312, - "learning_rate": 2.96e-06, - "loss": 405.7022, + "grad_norm": 900.5665283203125, + "learning_rate": 1.48e-06, + "loss": 415.8908, "step": 1850 }, { "epoch": 0.007514635358379425, - "grad_norm": 928.7461547851562, - "learning_rate": 2.976e-06, - "loss": 392.5513, + "grad_norm": 829.6421508789062, + "learning_rate": 1.488e-06, + "loss": 404.1582, "step": 1860 }, { "epoch": 0.007555036623747056, "grad_norm": 0.0, - "learning_rate": 2.9920000000000003e-06, - "loss": 235.2757, + "learning_rate": 1.4960000000000002e-06, + "loss": 243.9216, "step": 1870 }, { "epoch": 0.007595437889114687, - "grad_norm": 13806.5771484375, - "learning_rate": 3.0080000000000003e-06, - "loss": 338.4041, + "grad_norm": 9396.9931640625, + "learning_rate": 1.5040000000000001e-06, + "loss": 342.0865, "step": 1880 }, { "epoch": 0.007635839154482319, - "grad_norm": 2829.76513671875, - "learning_rate": 3.024e-06, - "loss": 365.0503, + "grad_norm": 2817.982177734375, + "learning_rate": 1.512e-06, + "loss": 369.8314, "step": 1890 }, { "epoch": 0.00767624041984995, - "grad_norm": 1206.556640625, - "learning_rate": 3.04e-06, - "loss": 372.7524, + "grad_norm": 1293.2032470703125, + "learning_rate": 1.52e-06, + "loss": 379.3789, "step": 1900 }, { "epoch": 0.007716641685217581, - "grad_norm": 1795.8287353515625, - "learning_rate": 3.056e-06, - "loss": 355.855, + "grad_norm": 2410.448974609375, + "learning_rate": 1.528e-06, + "loss": 370.2495, "step": 1910 }, { "epoch": 0.0077570429505852126, - "grad_norm": 733.5438232421875, - "learning_rate": 3.072e-06, - "loss": 369.4895, + "grad_norm": 667.0952758789062, + "learning_rate": 1.536e-06, + "loss": 386.5071, "step": 1920 }, { "epoch": 0.007797444215952844, - "grad_norm": 1936.3282470703125, - "learning_rate": 3.0880000000000003e-06, - "loss": 331.1103, + "grad_norm": 2129.815673828125, + "learning_rate": 1.5440000000000002e-06, + "loss": 336.1956, "step": 1930 }, { "epoch": 0.007837845481320474, - "grad_norm": 2228.966552734375, - "learning_rate": 3.1040000000000003e-06, - "loss": 441.473, + "grad_norm": 1978.9732666015625, + "learning_rate": 1.5520000000000001e-06, + "loss": 441.1761, "step": 1940 }, { "epoch": 0.007878246746688106, - "grad_norm": 1527.3521728515625, - "learning_rate": 3.12e-06, - "loss": 286.5484, + "grad_norm": 1219.0206298828125, + "learning_rate": 1.56e-06, + "loss": 293.0514, "step": 1950 }, { "epoch": 0.007918648012055737, - "grad_norm": 1155.413330078125, - "learning_rate": 3.136e-06, - "loss": 325.0518, + "grad_norm": 914.8988037109375, + "learning_rate": 1.568e-06, + "loss": 334.6477, "step": 1960 }, { "epoch": 0.007959049277423368, - "grad_norm": 5305.16845703125, - "learning_rate": 3.152e-06, - "loss": 426.8875, + "grad_norm": 4216.171875, + "learning_rate": 1.576e-06, + "loss": 434.4091, "step": 1970 }, { "epoch": 0.007999450542791, - "grad_norm": 497.835693359375, - "learning_rate": 3.1680000000000004e-06, - "loss": 305.8856, + "grad_norm": 489.4588928222656, + "learning_rate": 1.5840000000000002e-06, + "loss": 311.9005, "step": 1980 }, { "epoch": 0.008039851808158631, - "grad_norm": 950.1622924804688, - "learning_rate": 3.1840000000000003e-06, - "loss": 298.3484, + "grad_norm": 1002.6290283203125, + "learning_rate": 1.5920000000000002e-06, + "loss": 308.7559, "step": 1990 }, { "epoch": 0.008080253073526262, - "grad_norm": 977.10546875, - "learning_rate": 3.2000000000000003e-06, - "loss": 385.7309, + "grad_norm": 831.8458251953125, + "learning_rate": 1.6000000000000001e-06, + "loss": 394.3967, "step": 2000 }, { "epoch": 0.008120654338893894, - "grad_norm": 1347.1170654296875, - "learning_rate": 3.216e-06, - "loss": 311.8849, + "grad_norm": 1673.561279296875, + "learning_rate": 1.608e-06, + "loss": 320.009, "step": 2010 }, { "epoch": 0.008161055604261525, - "grad_norm": 7115.32470703125, - "learning_rate": 3.232e-06, - "loss": 310.0182, + "grad_norm": 6472.73779296875, + "learning_rate": 1.616e-06, + "loss": 315.0331, "step": 2020 }, { "epoch": 0.008201456869629156, - "grad_norm": 979.557861328125, - "learning_rate": 3.248e-06, - "loss": 378.8968, + "grad_norm": 995.5790405273438, + "learning_rate": 1.624e-06, + "loss": 384.2618, "step": 2030 }, { "epoch": 0.008241858134996788, - "grad_norm": 713.5515747070312, - "learning_rate": 3.2640000000000004e-06, - "loss": 333.8391, + "grad_norm": 712.1622314453125, + "learning_rate": 1.6320000000000002e-06, + "loss": 340.4509, "step": 2040 }, { "epoch": 0.008282259400364419, - "grad_norm": 1746.4747314453125, - "learning_rate": 3.2800000000000004e-06, - "loss": 211.6272, + "grad_norm": 1826.845458984375, + "learning_rate": 1.6400000000000002e-06, + "loss": 219.4854, "step": 2050 }, { "epoch": 0.00832266066573205, - "grad_norm": 3959.4423828125, - "learning_rate": 3.2960000000000003e-06, - "loss": 418.0941, + "grad_norm": 6947.80615234375, + "learning_rate": 1.6480000000000001e-06, + "loss": 432.1352, "step": 2060 }, { "epoch": 0.008363061931099681, - "grad_norm": 1449.259765625, - "learning_rate": 3.3120000000000002e-06, - "loss": 366.1021, + "grad_norm": 1190.05859375, + "learning_rate": 1.6560000000000001e-06, + "loss": 371.0119, "step": 2070 }, { "epoch": 0.008403463196467313, - "grad_norm": 686.3818359375, - "learning_rate": 3.328e-06, - "loss": 317.5829, + "grad_norm": 649.7431030273438, + "learning_rate": 1.664e-06, + "loss": 327.9862, "step": 2080 }, { "epoch": 0.008443864461834944, - "grad_norm": 1431.3048095703125, - "learning_rate": 3.344e-06, - "loss": 341.1351, + "grad_norm": 1118.436767578125, + "learning_rate": 1.672e-06, + "loss": 363.9542, "step": 2090 }, { "epoch": 0.008484265727202575, - "grad_norm": 1141.1195068359375, - "learning_rate": 3.3600000000000004e-06, - "loss": 284.9502, + "grad_norm": 1624.8919677734375, + "learning_rate": 1.6800000000000002e-06, + "loss": 291.973, "step": 2100 }, { "epoch": 0.008524666992570207, - "grad_norm": 583.6392211914062, - "learning_rate": 3.3760000000000004e-06, - "loss": 246.5204, + "grad_norm": 750.1419677734375, + "learning_rate": 1.6880000000000002e-06, + "loss": 254.5157, "step": 2110 }, { "epoch": 0.008565068257937838, - "grad_norm": 1383.4288330078125, - "learning_rate": 3.3920000000000003e-06, - "loss": 335.3397, + "grad_norm": 1894.7906494140625, + "learning_rate": 1.6960000000000002e-06, + "loss": 337.0174, "step": 2120 }, { "epoch": 0.00860546952330547, - "grad_norm": 720.5582275390625, - "learning_rate": 3.4080000000000002e-06, - "loss": 300.3475, + "grad_norm": 970.1087646484375, + "learning_rate": 1.7040000000000001e-06, + "loss": 307.9366, "step": 2130 }, { "epoch": 0.0086458707886731, - "grad_norm": 1018.2730712890625, - "learning_rate": 3.424e-06, - "loss": 364.7046, + "grad_norm": 1048.277099609375, + "learning_rate": 1.712e-06, + "loss": 375.4305, "step": 2140 }, { "epoch": 0.008686272054040732, - "grad_norm": 836.5111694335938, - "learning_rate": 3.44e-06, - "loss": 215.2347, + "grad_norm": 861.28564453125, + "learning_rate": 1.72e-06, + "loss": 220.679, "step": 2150 }, { "epoch": 0.008726673319408363, - "grad_norm": 1106.7752685546875, - "learning_rate": 3.4560000000000005e-06, - "loss": 392.8156, + "grad_norm": 1113.1458740234375, + "learning_rate": 1.7280000000000002e-06, + "loss": 398.2784, "step": 2160 }, { "epoch": 0.008767074584775995, - "grad_norm": 838.0346069335938, - "learning_rate": 3.4720000000000004e-06, - "loss": 328.1276, + "grad_norm": 876.524658203125, + "learning_rate": 1.7360000000000002e-06, + "loss": 336.294, "step": 2170 }, { "epoch": 0.008807475850143626, - "grad_norm": 870.4414672851562, - "learning_rate": 3.4880000000000003e-06, - "loss": 313.8927, + "grad_norm": 846.8402099609375, + "learning_rate": 1.7440000000000002e-06, + "loss": 321.9455, "step": 2180 }, { "epoch": 0.008847877115511257, - "grad_norm": 2859.623046875, - "learning_rate": 3.5040000000000002e-06, - "loss": 241.9949, + "grad_norm": 3380.1220703125, + "learning_rate": 1.7520000000000001e-06, + "loss": 247.7712, "step": 2190 }, { "epoch": 0.008888278380878889, - "grad_norm": 971.9884033203125, - "learning_rate": 3.52e-06, - "loss": 349.5033, + "grad_norm": 912.8999633789062, + "learning_rate": 1.76e-06, + "loss": 359.8481, "step": 2200 }, { "epoch": 0.00892867964624652, - "grad_norm": 670.4691772460938, - "learning_rate": 3.5360000000000005e-06, - "loss": 239.4784, + "grad_norm": 694.2388305664062, + "learning_rate": 1.7680000000000003e-06, + "loss": 245.2954, "step": 2210 }, { "epoch": 0.008969080911614151, - "grad_norm": 784.5670166015625, - "learning_rate": 3.5520000000000005e-06, - "loss": 310.9449, + "grad_norm": 1017.18115234375, + "learning_rate": 1.7760000000000002e-06, + "loss": 319.9184, "step": 2220 }, { "epoch": 0.009009482176981783, - "grad_norm": 915.0491333007812, - "learning_rate": 3.5680000000000004e-06, - "loss": 413.7835, + "grad_norm": 777.8660888671875, + "learning_rate": 1.7840000000000002e-06, + "loss": 427.6461, "step": 2230 }, { "epoch": 0.009049883442349414, - "grad_norm": 893.2843627929688, - "learning_rate": 3.5840000000000003e-06, - "loss": 365.2546, + "grad_norm": 1069.22265625, + "learning_rate": 1.7920000000000002e-06, + "loss": 374.4266, "step": 2240 }, { "epoch": 0.009090284707717045, - "grad_norm": 1068.404296875, - "learning_rate": 3.6000000000000003e-06, - "loss": 302.3378, + "grad_norm": 863.7400512695312, + "learning_rate": 1.8000000000000001e-06, + "loss": 308.5469, "step": 2250 }, { "epoch": 0.009130685973084677, - "grad_norm": 1454.05859375, - "learning_rate": 3.616e-06, - "loss": 285.7942, + "grad_norm": 1957.5294189453125, + "learning_rate": 1.808e-06, + "loss": 294.6305, "step": 2260 }, { "epoch": 0.009171087238452308, - "grad_norm": 3212.861083984375, - "learning_rate": 3.6320000000000005e-06, - "loss": 296.5446, + "grad_norm": 2914.186767578125, + "learning_rate": 1.8160000000000003e-06, + "loss": 308.1572, "step": 2270 }, { "epoch": 0.00921148850381994, - "grad_norm": 700.9654541015625, - "learning_rate": 3.6480000000000005e-06, - "loss": 277.7706, + "grad_norm": 635.1797485351562, + "learning_rate": 1.8240000000000002e-06, + "loss": 287.1085, "step": 2280 }, { "epoch": 0.00925188976918757, - "grad_norm": 601.0996704101562, - "learning_rate": 3.6640000000000004e-06, - "loss": 299.2936, + "grad_norm": 724.2377319335938, + "learning_rate": 1.8320000000000002e-06, + "loss": 294.9514, "step": 2290 }, { "epoch": 0.009292291034555202, - "grad_norm": 1073.6248779296875, - "learning_rate": 3.6800000000000003e-06, - "loss": 227.1634, + "grad_norm": 1040.5750732421875, + "learning_rate": 1.8400000000000002e-06, + "loss": 236.8611, "step": 2300 }, { "epoch": 0.009332692299922833, - "grad_norm": 803.8406372070312, - "learning_rate": 3.6960000000000003e-06, - "loss": 344.3006, + "grad_norm": 879.510498046875, + "learning_rate": 1.8480000000000001e-06, + "loss": 355.5618, "step": 2310 }, { "epoch": 0.009373093565290464, - "grad_norm": 940.9983520507812, - "learning_rate": 3.712e-06, - "loss": 266.4591, + "grad_norm": 1190.15966796875, + "learning_rate": 1.856e-06, + "loss": 276.8327, "step": 2320 }, { "epoch": 0.009413494830658096, - "grad_norm": 13408.8291015625, - "learning_rate": 3.7280000000000006e-06, - "loss": 295.9835, + "grad_norm": 12589.9306640625, + "learning_rate": 1.8640000000000003e-06, + "loss": 309.8928, "step": 2330 }, { "epoch": 0.009453896096025727, - "grad_norm": 1009.2761840820312, - "learning_rate": 3.7440000000000005e-06, - "loss": 292.3968, + "grad_norm": 836.6668701171875, + "learning_rate": 1.8720000000000002e-06, + "loss": 307.7065, "step": 2340 }, { "epoch": 0.009494297361393358, - "grad_norm": 556.4901733398438, - "learning_rate": 3.7600000000000004e-06, - "loss": 262.7075, + "grad_norm": 535.9210815429688, + "learning_rate": 1.8800000000000002e-06, + "loss": 270.6946, "step": 2350 }, { "epoch": 0.00953469862676099, - "grad_norm": 913.5816040039062, - "learning_rate": 3.7760000000000004e-06, - "loss": 508.4715, + "grad_norm": 955.3150024414062, + "learning_rate": 1.8880000000000002e-06, + "loss": 500.9817, "step": 2360 }, { "epoch": 0.009575099892128621, - "grad_norm": 1582.836181640625, - "learning_rate": 3.7920000000000003e-06, - "loss": 321.3247, + "grad_norm": 1457.6890869140625, + "learning_rate": 1.8960000000000001e-06, + "loss": 332.6355, "step": 2370 }, { "epoch": 0.009615501157496252, - "grad_norm": 1292.945068359375, - "learning_rate": 3.8080000000000006e-06, - "loss": 266.3162, + "grad_norm": 1534.5045166015625, + "learning_rate": 1.9040000000000003e-06, + "loss": 274.5902, "step": 2380 }, { "epoch": 0.009655902422863884, - "grad_norm": 741.0372314453125, - "learning_rate": 3.824e-06, - "loss": 333.5866, + "grad_norm": 789.72216796875, + "learning_rate": 1.912e-06, + "loss": 343.0171, "step": 2390 }, { "epoch": 0.009696303688231515, - "grad_norm": 594.6494750976562, - "learning_rate": 3.8400000000000005e-06, - "loss": 370.6466, + "grad_norm": 629.2387084960938, + "learning_rate": 1.9200000000000003e-06, + "loss": 379.0578, "step": 2400 }, { "epoch": 0.009736704953599146, - "grad_norm": 671.8253173828125, - "learning_rate": 3.856e-06, - "loss": 176.7593, + "grad_norm": 668.2212524414062, + "learning_rate": 1.928e-06, + "loss": 182.6655, "step": 2410 }, { "epoch": 0.009777106218966778, - "grad_norm": 1318.59912109375, - "learning_rate": 3.872e-06, - "loss": 296.8816, + "grad_norm": 791.4715576171875, + "learning_rate": 1.936e-06, + "loss": 302.6184, "step": 2420 }, { "epoch": 0.009817507484334409, - "grad_norm": 4490.33837890625, - "learning_rate": 3.888e-06, - "loss": 353.2209, + "grad_norm": 4107.4833984375, + "learning_rate": 1.944e-06, + "loss": 364.3251, "step": 2430 }, { "epoch": 0.00985790874970204, - "grad_norm": 1145.6590576171875, - "learning_rate": 3.904e-06, - "loss": 232.9775, + "grad_norm": 1162.642822265625, + "learning_rate": 1.952e-06, + "loss": 239.8189, "step": 2440 }, { "epoch": 0.009898310015069672, - "grad_norm": 669.175537109375, - "learning_rate": 3.920000000000001e-06, - "loss": 245.2131, + "grad_norm": 611.2891235351562, + "learning_rate": 1.9600000000000003e-06, + "loss": 253.1135, "step": 2450 }, { "epoch": 0.009938711280437303, - "grad_norm": 945.0543212890625, - "learning_rate": 3.936e-06, - "loss": 189.2121, + "grad_norm": 781.6123046875, + "learning_rate": 1.968e-06, + "loss": 197.4247, "step": 2460 }, { "epoch": 0.009979112545804934, - "grad_norm": 871.2230224609375, - "learning_rate": 3.9520000000000004e-06, - "loss": 354.6668, + "grad_norm": 899.2442626953125, + "learning_rate": 1.9760000000000002e-06, + "loss": 359.1835, "step": 2470 }, { "epoch": 0.010019513811172566, - "grad_norm": 1143.5745849609375, - "learning_rate": 3.968e-06, - "loss": 309.6017, + "grad_norm": 1001.3045654296875, + "learning_rate": 1.984e-06, + "loss": 319.6203, "step": 2480 }, { "epoch": 0.010059915076540197, - "grad_norm": 921.1688842773438, - "learning_rate": 3.984e-06, - "loss": 393.8875, + "grad_norm": 801.998291015625, + "learning_rate": 1.992e-06, + "loss": 406.7989, "step": 2490 }, { "epoch": 0.010100316341907828, - "grad_norm": 670.0267333984375, - "learning_rate": 4.000000000000001e-06, - "loss": 187.9601, + "grad_norm": 564.4352416992188, + "learning_rate": 2.0000000000000003e-06, + "loss": 193.7105, "step": 2500 }, { "epoch": 0.01014071760727546, - "grad_norm": 906.67626953125, - "learning_rate": 4.016e-06, - "loss": 263.1041, + "grad_norm": 876.2869262695312, + "learning_rate": 2.008e-06, + "loss": 271.2863, "step": 2510 }, { "epoch": 0.010181118872643091, - "grad_norm": 501.8714599609375, - "learning_rate": 4.0320000000000005e-06, - "loss": 249.8009, + "grad_norm": 512.7799682617188, + "learning_rate": 2.0160000000000003e-06, + "loss": 256.779, "step": 2520 }, { "epoch": 0.010221520138010722, - "grad_norm": 1012.7957153320312, - "learning_rate": 4.048e-06, - "loss": 215.6589, + "grad_norm": 815.3817138671875, + "learning_rate": 2.024e-06, + "loss": 221.8417, "step": 2530 }, { "epoch": 0.010261921403378354, - "grad_norm": 1067.8087158203125, - "learning_rate": 4.064e-06, - "loss": 277.4538, + "grad_norm": 1036.2320556640625, + "learning_rate": 2.032e-06, + "loss": 288.727, "step": 2540 }, { "epoch": 0.010302322668745985, - "grad_norm": 821.8876342773438, - "learning_rate": 4.08e-06, - "loss": 357.288, + "grad_norm": 1002.4608764648438, + "learning_rate": 2.04e-06, + "loss": 363.3406, "step": 2550 }, { "epoch": 0.010342723934113616, - "grad_norm": 695.7543334960938, - "learning_rate": 4.096e-06, - "loss": 192.3841, + "grad_norm": 602.5241088867188, + "learning_rate": 2.048e-06, + "loss": 199.61, "step": 2560 }, { "epoch": 0.010383125199481247, - "grad_norm": 2217.32080078125, - "learning_rate": 4.112000000000001e-06, - "loss": 311.4946, + "grad_norm": 1869.5498046875, + "learning_rate": 2.0560000000000003e-06, + "loss": 325.036, "step": 2570 }, { "epoch": 0.010423526464848879, - "grad_norm": 1252.262451171875, - "learning_rate": 4.128e-06, - "loss": 299.7565, + "grad_norm": 1140.6495361328125, + "learning_rate": 2.064e-06, + "loss": 305.7905, "step": 2580 }, { "epoch": 0.01046392773021651, - "grad_norm": 1227.412109375, - "learning_rate": 4.1440000000000005e-06, - "loss": 258.2722, + "grad_norm": 1102.019287109375, + "learning_rate": 2.0720000000000002e-06, + "loss": 269.985, "step": 2590 }, { "epoch": 0.010504328995584141, - "grad_norm": 1460.778076171875, - "learning_rate": 4.16e-06, - "loss": 310.2179, + "grad_norm": 1465.7049560546875, + "learning_rate": 2.08e-06, + "loss": 318.5626, "step": 2600 }, { "epoch": 0.010544730260951773, - "grad_norm": 3191.095947265625, - "learning_rate": 4.176e-06, - "loss": 276.1908, + "grad_norm": 3439.4609375, + "learning_rate": 2.088e-06, + "loss": 285.6084, "step": 2610 }, { "epoch": 0.010585131526319404, - "grad_norm": 841.0155029296875, - "learning_rate": 4.192000000000001e-06, - "loss": 409.2921, + "grad_norm": 810.2839965820312, + "learning_rate": 2.0960000000000003e-06, + "loss": 416.4135, "step": 2620 }, { "epoch": 0.010625532791687035, - "grad_norm": 979.8448486328125, - "learning_rate": 4.208e-06, - "loss": 370.0784, + "grad_norm": 931.5498657226562, + "learning_rate": 2.104e-06, + "loss": 379.4601, "step": 2630 }, { "epoch": 0.010665934057054667, - "grad_norm": 779.2421264648438, - "learning_rate": 4.2240000000000006e-06, - "loss": 327.6988, + "grad_norm": 698.158935546875, + "learning_rate": 2.1120000000000003e-06, + "loss": 338.5171, "step": 2640 }, { "epoch": 0.010706335322422298, - "grad_norm": 1958.304443359375, - "learning_rate": 4.24e-06, - "loss": 270.3837, + "grad_norm": 1654.8193359375, + "learning_rate": 2.12e-06, + "loss": 277.3883, "step": 2650 }, { "epoch": 0.01074673658778993, - "grad_norm": 612.7789916992188, - "learning_rate": 4.256e-06, - "loss": 208.5096, + "grad_norm": 541.6326293945312, + "learning_rate": 2.128e-06, + "loss": 212.8906, "step": 2660 }, { "epoch": 0.01078713785315756, - "grad_norm": 913.646484375, - "learning_rate": 4.272000000000001e-06, - "loss": 326.1478, + "grad_norm": 1034.8848876953125, + "learning_rate": 2.1360000000000004e-06, + "loss": 338.241, "step": 2670 }, { "epoch": 0.010827539118525192, - "grad_norm": 609.9149780273438, - "learning_rate": 4.288e-06, - "loss": 323.9391, + "grad_norm": 567.1165771484375, + "learning_rate": 2.144e-06, + "loss": 341.1244, "step": 2680 }, { "epoch": 0.010867940383892823, - "grad_norm": 4146.9677734375, - "learning_rate": 4.304000000000001e-06, - "loss": 331.8204, + "grad_norm": 4211.4228515625, + "learning_rate": 2.1520000000000003e-06, + "loss": 349.5273, "step": 2690 }, { "epoch": 0.010908341649260455, - "grad_norm": 1144.696044921875, - "learning_rate": 4.32e-06, - "loss": 344.9819, + "grad_norm": 1199.0078125, + "learning_rate": 2.16e-06, + "loss": 361.8076, "step": 2700 }, { "epoch": 0.010948742914628086, - "grad_norm": 1466.951904296875, - "learning_rate": 4.3360000000000005e-06, - "loss": 364.1662, + "grad_norm": 1021.7445068359375, + "learning_rate": 2.1680000000000002e-06, + "loss": 373.7587, "step": 2710 }, { "epoch": 0.010989144179995717, - "grad_norm": 955.5458984375, - "learning_rate": 4.352e-06, - "loss": 284.9516, + "grad_norm": 868.0924072265625, + "learning_rate": 2.176e-06, + "loss": 290.991, "step": 2720 }, { "epoch": 0.011029545445363349, - "grad_norm": 591.5704345703125, - "learning_rate": 4.368e-06, - "loss": 279.7713, + "grad_norm": 640.3760986328125, + "learning_rate": 2.184e-06, + "loss": 284.8304, "step": 2730 }, { "epoch": 0.01106994671073098, - "grad_norm": 1922.548095703125, - "learning_rate": 4.384000000000001e-06, - "loss": 269.3951, + "grad_norm": 1700.9189453125, + "learning_rate": 2.1920000000000004e-06, + "loss": 276.7029, "step": 2740 }, { "epoch": 0.011110347976098611, - "grad_norm": 705.4920654296875, - "learning_rate": 4.4e-06, - "loss": 202.7344, + "grad_norm": 653.1834106445312, + "learning_rate": 2.2e-06, + "loss": 208.0135, "step": 2750 }, { "epoch": 0.011150749241466243, - "grad_norm": 6034.74951171875, - "learning_rate": 4.416000000000001e-06, - "loss": 319.9652, + "grad_norm": 3310.527099609375, + "learning_rate": 2.2080000000000003e-06, + "loss": 325.7164, "step": 2760 }, { "epoch": 0.011191150506833874, - "grad_norm": 1474.4591064453125, - "learning_rate": 4.432e-06, - "loss": 239.9669, + "grad_norm": 1914.8753662109375, + "learning_rate": 2.216e-06, + "loss": 248.9018, "step": 2770 }, { "epoch": 0.011231551772201505, - "grad_norm": 998.685302734375, - "learning_rate": 4.4480000000000004e-06, - "loss": 254.0082, + "grad_norm": 980.6690673828125, + "learning_rate": 2.2240000000000002e-06, + "loss": 264.1739, "step": 2780 }, { "epoch": 0.011271953037569137, - "grad_norm": 1730.3958740234375, - "learning_rate": 4.464000000000001e-06, - "loss": 285.7142, + "grad_norm": 839.1393432617188, + "learning_rate": 2.2320000000000004e-06, + "loss": 295.3007, "step": 2790 }, { "epoch": 0.011312354302936768, - "grad_norm": 666.0034790039062, - "learning_rate": 4.48e-06, - "loss": 243.7346, + "grad_norm": 727.7144165039062, + "learning_rate": 2.24e-06, + "loss": 251.4149, "step": 2800 }, { "epoch": 0.0113527555683044, - "grad_norm": 1185.6009521484375, - "learning_rate": 4.496000000000001e-06, - "loss": 243.6461, + "grad_norm": 1180.777099609375, + "learning_rate": 2.2480000000000003e-06, + "loss": 250.4361, "step": 2810 }, { "epoch": 0.01139315683367203, - "grad_norm": 902.0358276367188, - "learning_rate": 4.512e-06, - "loss": 193.6116, + "grad_norm": 733.5826416015625, + "learning_rate": 2.256e-06, + "loss": 199.094, "step": 2820 }, { "epoch": 0.011433558099039662, - "grad_norm": 1310.3072509765625, - "learning_rate": 4.5280000000000005e-06, - "loss": 327.7516, + "grad_norm": 1635.172119140625, + "learning_rate": 2.2640000000000003e-06, + "loss": 334.5896, "step": 2830 }, { "epoch": 0.011473959364407293, - "grad_norm": 693.1176147460938, - "learning_rate": 4.544000000000001e-06, - "loss": 239.3185, + "grad_norm": 630.0571899414062, + "learning_rate": 2.2720000000000004e-06, + "loss": 246.5118, "step": 2840 }, { "epoch": 0.011514360629774924, - "grad_norm": 968.5886840820312, - "learning_rate": 4.56e-06, - "loss": 287.3573, + "grad_norm": 758.105224609375, + "learning_rate": 2.28e-06, + "loss": 294.8159, "step": 2850 }, { "epoch": 0.011554761895142556, - "grad_norm": 705.2077026367188, - "learning_rate": 4.576000000000001e-06, - "loss": 282.05, + "grad_norm": 959.7066650390625, + "learning_rate": 2.2880000000000004e-06, + "loss": 291.1446, "step": 2860 }, { "epoch": 0.011595163160510187, - "grad_norm": 1975.849365234375, - "learning_rate": 4.592e-06, - "loss": 296.7342, + "grad_norm": 1965.349853515625, + "learning_rate": 2.296e-06, + "loss": 307.958, "step": 2870 }, { "epoch": 0.011635564425877818, - "grad_norm": 746.0858764648438, - "learning_rate": 4.608000000000001e-06, - "loss": 204.9099, + "grad_norm": 642.3687133789062, + "learning_rate": 2.3040000000000003e-06, + "loss": 214.3914, "step": 2880 }, { "epoch": 0.01167596569124545, - "grad_norm": 1130.648681640625, - "learning_rate": 4.624e-06, - "loss": 342.3003, + "grad_norm": 1156.0281982421875, + "learning_rate": 2.312e-06, + "loss": 357.4384, "step": 2890 }, { "epoch": 0.011716366956613081, - "grad_norm": 1225.900634765625, - "learning_rate": 4.6400000000000005e-06, - "loss": 302.3731, + "grad_norm": 1323.7642822265625, + "learning_rate": 2.3200000000000002e-06, + "loss": 311.784, "step": 2900 }, { "epoch": 0.011756768221980712, - "grad_norm": 965.9699096679688, - "learning_rate": 4.656000000000001e-06, - "loss": 403.5073, + "grad_norm": 1038.2747802734375, + "learning_rate": 2.3280000000000004e-06, + "loss": 401.4958, "step": 2910 }, { "epoch": 0.011797169487348344, - "grad_norm": 700.615966796875, - "learning_rate": 4.672e-06, - "loss": 220.3576, + "grad_norm": 668.4197998046875, + "learning_rate": 2.336e-06, + "loss": 230.4382, "step": 2920 }, { "epoch": 0.011837570752715975, - "grad_norm": 744.499267578125, - "learning_rate": 4.688000000000001e-06, - "loss": 227.1823, + "grad_norm": 734.0855102539062, + "learning_rate": 2.3440000000000003e-06, + "loss": 235.5078, "step": 2930 }, { "epoch": 0.011877972018083606, - "grad_norm": 588.2945556640625, - "learning_rate": 4.704e-06, - "loss": 294.2805, + "grad_norm": 569.2127685546875, + "learning_rate": 2.352e-06, + "loss": 310.0418, "step": 2940 }, { "epoch": 0.011918373283451238, - "grad_norm": 8179.02490234375, - "learning_rate": 4.7200000000000005e-06, - "loss": 360.6105, + "grad_norm": 6976.2177734375, + "learning_rate": 2.3600000000000003e-06, + "loss": 368.4, "step": 2950 }, { "epoch": 0.011958774548818869, - "grad_norm": 4027.3515625, - "learning_rate": 4.736000000000001e-06, - "loss": 286.0583, + "grad_norm": 4395.798828125, + "learning_rate": 2.3680000000000005e-06, + "loss": 298.4696, "step": 2960 }, { "epoch": 0.0119991758141865, - "grad_norm": 8620.037109375, - "learning_rate": 4.752e-06, - "loss": 338.6114, + "grad_norm": 8097.44677734375, + "learning_rate": 2.376e-06, + "loss": 349.6447, "step": 2970 }, { "epoch": 0.012039577079554132, - "grad_norm": 1030.6036376953125, - "learning_rate": 4.768000000000001e-06, - "loss": 293.7208, + "grad_norm": 789.9635009765625, + "learning_rate": 2.3840000000000004e-06, + "loss": 303.074, "step": 2980 }, { "epoch": 0.012079978344921763, - "grad_norm": 2113.48095703125, - "learning_rate": 4.784e-06, - "loss": 338.7042, + "grad_norm": 1684.7451171875, + "learning_rate": 2.392e-06, + "loss": 346.2279, "step": 2990 }, { "epoch": 0.012120379610289394, - "grad_norm": 664.2764892578125, - "learning_rate": 4.800000000000001e-06, - "loss": 180.6681, + "grad_norm": 575.2348022460938, + "learning_rate": 2.4000000000000003e-06, + "loss": 188.9465, "step": 3000 }, { "epoch": 0.012160780875657026, - "grad_norm": 2052.64111328125, - "learning_rate": 4.816e-06, - "loss": 269.5311, + "grad_norm": 2578.208740234375, + "learning_rate": 2.408e-06, + "loss": 272.512, "step": 3010 }, { "epoch": 0.012201182141024657, - "grad_norm": 646.8975219726562, - "learning_rate": 4.8320000000000005e-06, - "loss": 199.238, + "grad_norm": 664.0204467773438, + "learning_rate": 2.4160000000000002e-06, + "loss": 210.9064, "step": 3020 }, { "epoch": 0.012241583406392288, - "grad_norm": 1087.690673828125, - "learning_rate": 4.848000000000001e-06, - "loss": 300.9158, + "grad_norm": 1122.7061767578125, + "learning_rate": 2.4240000000000004e-06, + "loss": 312.6036, "step": 3030 }, { "epoch": 0.01228198467175992, - "grad_norm": 1012.8238525390625, - "learning_rate": 4.864e-06, - "loss": 405.1533, + "grad_norm": 1009.51416015625, + "learning_rate": 2.432e-06, + "loss": 421.7347, "step": 3040 }, { "epoch": 0.01232238593712755, - "grad_norm": 1519.6015625, - "learning_rate": 4.880000000000001e-06, - "loss": 340.4835, + "grad_norm": 1166.524658203125, + "learning_rate": 2.4400000000000004e-06, + "loss": 354.4147, "step": 3050 }, { "epoch": 0.012362787202495182, - "grad_norm": 787.8037109375, - "learning_rate": 4.896e-06, - "loss": 249.336, + "grad_norm": 864.7195434570312, + "learning_rate": 2.448e-06, + "loss": 260.3325, "step": 3060 }, { "epoch": 0.012403188467862813, - "grad_norm": 1030.4306640625, - "learning_rate": 4.9120000000000006e-06, - "loss": 248.3195, + "grad_norm": 1270.10595703125, + "learning_rate": 2.4560000000000003e-06, + "loss": 258.1562, "step": 3070 }, { "epoch": 0.012443589733230445, - "grad_norm": 1620.253173828125, - "learning_rate": 4.928000000000001e-06, - "loss": 210.1467, + "grad_norm": 1230.795166015625, + "learning_rate": 2.4640000000000005e-06, + "loss": 218.6261, "step": 3080 }, { "epoch": 0.012483990998598076, - "grad_norm": 2037.361328125, - "learning_rate": 4.9440000000000004e-06, - "loss": 331.2714, + "grad_norm": 2445.281494140625, + "learning_rate": 2.4720000000000002e-06, + "loss": 342.231, "step": 3090 }, { "epoch": 0.012524392263965707, - "grad_norm": 4152.18701171875, - "learning_rate": 4.960000000000001e-06, - "loss": 369.5478, + "grad_norm": 3970.042724609375, + "learning_rate": 2.4800000000000004e-06, + "loss": 385.3239, "step": 3100 }, { "epoch": 0.012564793529333339, - "grad_norm": 751.2320556640625, - "learning_rate": 4.976e-06, - "loss": 339.7245, + "grad_norm": 769.3772583007812, + "learning_rate": 2.488e-06, + "loss": 348.2264, "step": 3110 }, { "epoch": 0.01260519479470097, - "grad_norm": 1463.79833984375, - "learning_rate": 4.992e-06, - "loss": 349.4508, + "grad_norm": 1864.233642578125, + "learning_rate": 2.496e-06, + "loss": 360.0176, "step": 3120 }, { "epoch": 0.012645596060068601, - "grad_norm": 931.794189453125, - "learning_rate": 5.008000000000001e-06, - "loss": 219.2512, + "grad_norm": 932.556640625, + "learning_rate": 2.5040000000000005e-06, + "loss": 230.3502, "step": 3130 }, { "epoch": 0.012685997325436233, - "grad_norm": 587.0802001953125, - "learning_rate": 5.024e-06, - "loss": 249.6072, + "grad_norm": 632.7327880859375, + "learning_rate": 2.512e-06, + "loss": 266.6627, "step": 3140 }, { "epoch": 0.012726398590803864, - "grad_norm": 4629.9833984375, - "learning_rate": 5.04e-06, - "loss": 274.248, + "grad_norm": 6750.98486328125, + "learning_rate": 2.52e-06, + "loss": 288.1833, "step": 3150 }, { "epoch": 0.012766799856171495, - "grad_norm": 1912.8997802734375, - "learning_rate": 5.056000000000001e-06, - "loss": 265.1541, + "grad_norm": 1928.933837890625, + "learning_rate": 2.5280000000000006e-06, + "loss": 275.775, "step": 3160 }, { "epoch": 0.012807201121539127, - "grad_norm": 714.0695190429688, - "learning_rate": 5.072e-06, - "loss": 189.4337, + "grad_norm": 771.8600463867188, + "learning_rate": 2.536e-06, + "loss": 193.8478, "step": 3170 }, { "epoch": 0.012847602386906758, - "grad_norm": 1278.3006591796875, - "learning_rate": 5.088000000000001e-06, - "loss": 368.549, + "grad_norm": 895.3870239257812, + "learning_rate": 2.5440000000000005e-06, + "loss": 379.8545, "step": 3180 }, { "epoch": 0.01288800365227439, - "grad_norm": 749.0835571289062, - "learning_rate": 5.104e-06, - "loss": 228.1246, + "grad_norm": 559.06494140625, + "learning_rate": 2.552e-06, + "loss": 236.7381, "step": 3190 }, { "epoch": 0.01292840491764202, "grad_norm": 0.0, - "learning_rate": 5.12e-06, - "loss": 227.3482, + "learning_rate": 2.56e-06, + "loss": 232.504, "step": 3200 }, { "epoch": 0.012968806183009652, - "grad_norm": 888.6599731445312, - "learning_rate": 5.136e-06, - "loss": 310.4809, + "grad_norm": 810.1832885742188, + "learning_rate": 2.568e-06, + "loss": 324.2067, "step": 3210 }, { "epoch": 0.013009207448377283, - "grad_norm": 760.9104614257812, - "learning_rate": 5.152e-06, - "loss": 229.1965, + "grad_norm": 752.4735717773438, + "learning_rate": 2.576e-06, + "loss": 238.8209, "step": 3220 }, { "epoch": 0.013049608713744915, - "grad_norm": 1096.23291015625, - "learning_rate": 5.168000000000001e-06, - "loss": 325.4543, + "grad_norm": 920.9286499023438, + "learning_rate": 2.5840000000000006e-06, + "loss": 336.4419, "step": 3230 }, { "epoch": 0.013090009979112546, - "grad_norm": 2388.175537109375, - "learning_rate": 5.184e-06, - "loss": 270.4536, + "grad_norm": 1401.5660400390625, + "learning_rate": 2.592e-06, + "loss": 282.7134, "step": 3240 }, { "epoch": 0.013130411244480177, - "grad_norm": 7733.79296875, - "learning_rate": 5.2e-06, - "loss": 364.0487, + "grad_norm": 9530.2431640625, + "learning_rate": 2.6e-06, + "loss": 383.3861, "step": 3250 }, { "epoch": 0.013170812509847809, - "grad_norm": 1177.9654541015625, - "learning_rate": 5.216e-06, - "loss": 272.8813, + "grad_norm": 969.5448608398438, + "learning_rate": 2.608e-06, + "loss": 284.8195, "step": 3260 }, { "epoch": 0.01321121377521544, - "grad_norm": 1095.551025390625, - "learning_rate": 5.232e-06, - "loss": 356.0726, + "grad_norm": 1034.160888671875, + "learning_rate": 2.616e-06, + "loss": 367.5712, "step": 3270 }, { "epoch": 0.013251615040583071, - "grad_norm": 1486.8251953125, - "learning_rate": 5.248000000000001e-06, - "loss": 269.6024, + "grad_norm": 1939.5294189453125, + "learning_rate": 2.6240000000000006e-06, + "loss": 283.061, "step": 3280 }, { "epoch": 0.013292016305950703, - "grad_norm": 1649.369384765625, - "learning_rate": 5.264e-06, - "loss": 406.2197, + "grad_norm": 1946.1861572265625, + "learning_rate": 2.632e-06, + "loss": 422.7646, "step": 3290 }, { "epoch": 0.013332417571318334, - "grad_norm": 901.133544921875, - "learning_rate": 5.28e-06, - "loss": 334.5719, + "grad_norm": 873.4537963867188, + "learning_rate": 2.64e-06, + "loss": 344.5549, "step": 3300 }, { "epoch": 0.013372818836685965, - "grad_norm": 1080.5753173828125, - "learning_rate": 5.296e-06, - "loss": 256.0706, + "grad_norm": 1010.3089599609375, + "learning_rate": 2.648e-06, + "loss": 190.8384, "step": 3310 }, { "epoch": 0.013413220102053596, - "grad_norm": 903.6046752929688, - "learning_rate": 5.312e-06, - "loss": 229.897, + "grad_norm": 1023.7975463867188, + "learning_rate": 2.656e-06, + "loss": 239.7801, "step": 3320 }, { "epoch": 0.013453621367421228, - "grad_norm": 984.148193359375, - "learning_rate": 5.328000000000001e-06, - "loss": 268.5443, + "grad_norm": 1019.0778198242188, + "learning_rate": 2.6640000000000007e-06, + "loss": 274.9548, "step": 3330 }, { "epoch": 0.013494022632788859, - "grad_norm": 1167.3309326171875, - "learning_rate": 5.344e-06, - "loss": 339.0956, + "grad_norm": 1070.2169189453125, + "learning_rate": 2.672e-06, + "loss": 352.4845, "step": 3340 }, { "epoch": 0.01353442389815649, - "grad_norm": 2445.7763671875, - "learning_rate": 5.36e-06, - "loss": 237.0434, + "grad_norm": 2437.802978515625, + "learning_rate": 2.68e-06, + "loss": 244.2223, "step": 3350 }, { "epoch": 0.013574825163524122, - "grad_norm": 814.3775634765625, - "learning_rate": 5.376e-06, - "loss": 182.9125, + "grad_norm": 1718.4080810546875, + "learning_rate": 2.688e-06, + "loss": 189.5913, "step": 3360 }, { "epoch": 0.013615226428891753, - "grad_norm": 883.6320190429688, - "learning_rate": 5.392e-06, - "loss": 346.1443, + "grad_norm": 991.4490356445312, + "learning_rate": 2.696e-06, + "loss": 367.0182, "step": 3370 }, { "epoch": 0.013655627694259384, - "grad_norm": 788.3290405273438, - "learning_rate": 5.408e-06, - "loss": 143.6525, + "grad_norm": 1084.2481689453125, + "learning_rate": 2.704e-06, + "loss": 151.7596, "step": 3380 }, { "epoch": 0.013696028959627016, - "grad_norm": 1109.12744140625, - "learning_rate": 5.424e-06, - "loss": 293.3582, + "grad_norm": 955.623779296875, + "learning_rate": 2.712e-06, + "loss": 304.651, "step": 3390 }, { "epoch": 0.013736430224994647, - "grad_norm": 968.6015014648438, - "learning_rate": 5.4400000000000004e-06, - "loss": 228.7989, + "grad_norm": 839.148193359375, + "learning_rate": 2.7200000000000002e-06, + "loss": 240.1972, "step": 3400 }, { "epoch": 0.013776831490362278, - "grad_norm": 1213.0040283203125, - "learning_rate": 5.456e-06, - "loss": 432.3414, + "grad_norm": 1995.0126953125, + "learning_rate": 2.728e-06, + "loss": 459.2235, "step": 3410 }, { "epoch": 0.01381723275572991, - "grad_norm": 1326.287109375, - "learning_rate": 5.472e-06, - "loss": 332.6496, + "grad_norm": 1700.23046875, + "learning_rate": 2.736e-06, + "loss": 351.0001, "step": 3420 }, { "epoch": 0.013857634021097541, - "grad_norm": 6259.54248046875, - "learning_rate": 5.488e-06, - "loss": 271.7205, + "grad_norm": 3849.339599609375, + "learning_rate": 2.744e-06, + "loss": 288.3887, "step": 3430 }, { "epoch": 0.013898035286465172, - "grad_norm": 708.2656860351562, - "learning_rate": 5.504e-06, - "loss": 180.486, + "grad_norm": 906.9827880859375, + "learning_rate": 2.752e-06, + "loss": 190.7232, "step": 3440 }, { "epoch": 0.013938436551832804, - "grad_norm": 685.2205200195312, - "learning_rate": 5.5200000000000005e-06, - "loss": 268.5384, + "grad_norm": 596.560302734375, + "learning_rate": 2.7600000000000003e-06, + "loss": 280.325, "step": 3450 }, { "epoch": 0.013978837817200435, - "grad_norm": 1125.2498779296875, - "learning_rate": 5.536e-06, - "loss": 288.8181, + "grad_norm": 985.322265625, + "learning_rate": 2.768e-06, + "loss": 299.1884, "step": 3460 }, { "epoch": 0.014019239082568066, - "grad_norm": 1228.8125, - "learning_rate": 5.552e-06, - "loss": 244.6971, + "grad_norm": 2226.59326171875, + "learning_rate": 2.776e-06, + "loss": 255.0898, "step": 3470 }, { "epoch": 0.014059640347935698, - "grad_norm": 2762.309814453125, - "learning_rate": 5.568e-06, - "loss": 273.8453, + "grad_norm": 2863.064453125, + "learning_rate": 2.784e-06, + "loss": 294.2987, "step": 3480 }, { "epoch": 0.014100041613303329, - "grad_norm": 945.8040161132812, - "learning_rate": 5.584e-06, - "loss": 240.8805, + "grad_norm": 1138.6005859375, + "learning_rate": 2.792e-06, + "loss": 251.408, "step": 3490 }, { "epoch": 0.01414044287867096, - "grad_norm": 2207.40185546875, - "learning_rate": 5.600000000000001e-06, - "loss": 165.5966, + "grad_norm": 1882.5142822265625, + "learning_rate": 2.8000000000000003e-06, + "loss": 177.0275, "step": 3500 }, { "epoch": 0.014180844144038592, - "grad_norm": 622.099609375, - "learning_rate": 5.616e-06, - "loss": 202.4418, + "grad_norm": 705.8058471679688, + "learning_rate": 2.808e-06, + "loss": 213.4635, "step": 3510 }, { "epoch": 0.014221245409406223, - "grad_norm": 1473.064208984375, - "learning_rate": 5.6320000000000005e-06, - "loss": 229.2896, + "grad_norm": 1090.1929931640625, + "learning_rate": 2.8160000000000002e-06, + "loss": 240.1692, "step": 3520 }, { "epoch": 0.014261646674773854, - "grad_norm": 1057.5018310546875, - "learning_rate": 5.648e-06, - "loss": 211.9801, + "grad_norm": 885.9393310546875, + "learning_rate": 2.824e-06, + "loss": 222.4137, "step": 3530 }, { "epoch": 0.014302047940141486, - "grad_norm": 4774.0947265625, - "learning_rate": 5.664e-06, - "loss": 204.8896, + "grad_norm": 4716.58837890625, + "learning_rate": 2.832e-06, + "loss": 212.752, "step": 3540 }, { "epoch": 0.014342449205509117, - "grad_norm": 676.964111328125, - "learning_rate": 5.68e-06, - "loss": 200.4312, + "grad_norm": 695.3489990234375, + "learning_rate": 2.84e-06, + "loss": 214.0755, "step": 3550 }, { "epoch": 0.014382850470876748, - "grad_norm": 571.7015380859375, - "learning_rate": 5.696e-06, - "loss": 275.7348, + "grad_norm": 521.3193969726562, + "learning_rate": 2.848e-06, + "loss": 288.9722, "step": 3560 }, { "epoch": 0.01442325173624438, - "grad_norm": 1923.3953857421875, - "learning_rate": 5.7120000000000005e-06, - "loss": 224.3793, + "grad_norm": 989.7683715820312, + "learning_rate": 2.8560000000000003e-06, + "loss": 233.4864, "step": 3570 }, { "epoch": 0.01446365300161201, - "grad_norm": 541.0182495117188, - "learning_rate": 5.728e-06, - "loss": 155.7405, + "grad_norm": 485.1424865722656, + "learning_rate": 2.864e-06, + "loss": 164.5155, "step": 3580 }, { "epoch": 0.014504054266979642, - "grad_norm": 1030.9990234375, - "learning_rate": 5.744e-06, - "loss": 225.919, + "grad_norm": 957.7999267578125, + "learning_rate": 2.872e-06, + "loss": 234.5045, "step": 3590 }, { "epoch": 0.014544455532347273, - "grad_norm": 1258.4619140625, - "learning_rate": 5.76e-06, - "loss": 242.7457, + "grad_norm": 767.5082397460938, + "learning_rate": 2.88e-06, + "loss": 255.9814, "step": 3600 }, { "epoch": 0.014584856797714905, - "grad_norm": 1085.32177734375, - "learning_rate": 5.776e-06, - "loss": 251.3692, + "grad_norm": 1104.1214599609375, + "learning_rate": 2.888e-06, + "loss": 258.1813, "step": 3610 }, { "epoch": 0.014625258063082536, - "grad_norm": 880.9969482421875, - "learning_rate": 5.792000000000001e-06, - "loss": 228.1045, + "grad_norm": 834.6510009765625, + "learning_rate": 2.8960000000000003e-06, + "loss": 240.5487, "step": 3620 }, { "epoch": 0.014665659328450167, - "grad_norm": 616.6618041992188, - "learning_rate": 5.808e-06, - "loss": 216.2711, + "grad_norm": 620.6282958984375, + "learning_rate": 2.904e-06, + "loss": 222.8977, "step": 3630 }, { "epoch": 0.014706060593817799, - "grad_norm": 2865.4169921875, - "learning_rate": 5.8240000000000005e-06, - "loss": 253.2795, + "grad_norm": 3170.61474609375, + "learning_rate": 2.9120000000000002e-06, + "loss": 263.4291, "step": 3640 }, { "epoch": 0.01474646185918543, - "grad_norm": 941.0968627929688, - "learning_rate": 5.84e-06, - "loss": 268.1058, + "grad_norm": 900.1008911132812, + "learning_rate": 2.92e-06, + "loss": 284.5013, "step": 3650 }, { "epoch": 0.014786863124553061, - "grad_norm": 1495.3153076171875, - "learning_rate": 5.856e-06, - "loss": 262.6222, + "grad_norm": 1374.6429443359375, + "learning_rate": 2.928e-06, + "loss": 276.6986, "step": 3660 }, { "epoch": 0.014827264389920693, - "grad_norm": 1153.8773193359375, - "learning_rate": 5.872000000000001e-06, - "loss": 314.2995, + "grad_norm": 746.1883544921875, + "learning_rate": 2.9360000000000003e-06, + "loss": 325.1993, "step": 3670 }, { "epoch": 0.014867665655288324, - "grad_norm": 3272.336669921875, - "learning_rate": 5.888e-06, - "loss": 275.6237, + "grad_norm": 2877.278564453125, + "learning_rate": 2.944e-06, + "loss": 285.8458, "step": 3680 }, { "epoch": 0.014908066920655955, - "grad_norm": 1363.574462890625, - "learning_rate": 5.9040000000000006e-06, - "loss": 408.7946, + "grad_norm": 1090.628662109375, + "learning_rate": 2.9520000000000003e-06, + "loss": 432.6214, "step": 3690 }, { "epoch": 0.014948468186023587, - "grad_norm": 1989.05810546875, - "learning_rate": 5.92e-06, - "loss": 347.1884, + "grad_norm": 2480.501953125, + "learning_rate": 2.96e-06, + "loss": 353.5424, "step": 3700 }, { "epoch": 0.014988869451391218, - "grad_norm": 893.3648071289062, - "learning_rate": 5.9360000000000004e-06, - "loss": 231.8197, + "grad_norm": 737.1724853515625, + "learning_rate": 2.9680000000000002e-06, + "loss": 242.5514, "step": 3710 }, { "epoch": 0.01502927071675885, - "grad_norm": 2015.3258056640625, - "learning_rate": 5.952e-06, - "loss": 241.9273, + "grad_norm": 2191.963623046875, + "learning_rate": 2.976e-06, + "loss": 253.1738, "step": 3720 }, { "epoch": 0.01506967198212648, - "grad_norm": 1097.1138916015625, - "learning_rate": 5.968e-06, - "loss": 289.7548, + "grad_norm": 1055.7967529296875, + "learning_rate": 2.984e-06, + "loss": 305.6454, "step": 3730 }, { "epoch": 0.015110073247494112, - "grad_norm": 1198.3504638671875, - "learning_rate": 5.984000000000001e-06, - "loss": 257.8569, + "grad_norm": 1204.630615234375, + "learning_rate": 2.9920000000000003e-06, + "loss": 271.2107, "step": 3740 }, { "epoch": 0.015150474512861743, - "grad_norm": 2764.9599609375, - "learning_rate": 6e-06, - "loss": 394.4565, + "grad_norm": 1987.0604248046875, + "learning_rate": 3e-06, + "loss": 409.8629, "step": 3750 }, { "epoch": 0.015190875778229375, - "grad_norm": 758.348876953125, - "learning_rate": 6.0160000000000005e-06, - "loss": 249.2975, + "grad_norm": 914.4812622070312, + "learning_rate": 3.0080000000000003e-06, + "loss": 257.4287, "step": 3760 }, { "epoch": 0.015231277043597006, - "grad_norm": 2277.13818359375, - "learning_rate": 6.032e-06, - "loss": 294.9143, + "grad_norm": 3268.1220703125, + "learning_rate": 3.016e-06, + "loss": 318.43, "step": 3770 }, { "epoch": 0.015271678308964637, - "grad_norm": 2135.546142578125, - "learning_rate": 6.048e-06, - "loss": 235.1545, + "grad_norm": 1624.833251953125, + "learning_rate": 3.024e-06, + "loss": 245.4421, "step": 3780 }, { "epoch": 0.015312079574332269, - "grad_norm": 1097.388916015625, - "learning_rate": 6.064000000000001e-06, - "loss": 241.006, + "grad_norm": 1148.6253662109375, + "learning_rate": 3.0320000000000004e-06, + "loss": 250.8723, "step": 3790 }, { "epoch": 0.0153524808396999, - "grad_norm": 943.5238647460938, - "learning_rate": 6.08e-06, - "loss": 174.3143, + "grad_norm": 1320.0499267578125, + "learning_rate": 3.04e-06, + "loss": 188.4256, "step": 3800 }, { "epoch": 0.015392882105067531, - "grad_norm": 1239.0521240234375, - "learning_rate": 6.096000000000001e-06, - "loss": 307.3331, + "grad_norm": 591.3594360351562, + "learning_rate": 3.0480000000000003e-06, + "loss": 322.9109, "step": 3810 }, { "epoch": 0.015433283370435162, - "grad_norm": 643.6400146484375, - "learning_rate": 6.112e-06, - "loss": 217.4633, + "grad_norm": 723.1932373046875, + "learning_rate": 3.056e-06, + "loss": 228.9798, "step": 3820 }, { "epoch": 0.015473684635802794, - "grad_norm": 1569.7869873046875, - "learning_rate": 6.1280000000000005e-06, - "loss": 322.4538, + "grad_norm": 1093.16943359375, + "learning_rate": 3.0640000000000002e-06, + "loss": 339.4324, "step": 3830 }, { "epoch": 0.015514085901170425, - "grad_norm": 3763.859130859375, - "learning_rate": 6.144e-06, - "loss": 182.1174, + "grad_norm": 2611.811279296875, + "learning_rate": 3.072e-06, + "loss": 189.2289, "step": 3840 }, { "epoch": 0.015554487166538056, - "grad_norm": 717.2794799804688, - "learning_rate": 6.16e-06, - "loss": 207.8513, + "grad_norm": 781.1378173828125, + "learning_rate": 3.08e-06, + "loss": 217.5148, "step": 3850 }, { "epoch": 0.015594888431905688, - "grad_norm": 1418.3277587890625, - "learning_rate": 6.176000000000001e-06, - "loss": 254.4121, + "grad_norm": 1848.516357421875, + "learning_rate": 3.0880000000000003e-06, + "loss": 268.352, "step": 3860 }, { "epoch": 0.01563528969727332, - "grad_norm": 1528.9119873046875, - "learning_rate": 6.192e-06, - "loss": 223.6123, + "grad_norm": 2161.83349609375, + "learning_rate": 3.096e-06, + "loss": 235.3343, "step": 3870 }, { "epoch": 0.01567569096264095, - "grad_norm": 544.6751708984375, - "learning_rate": 6.2080000000000005e-06, - "loss": 222.1293, + "grad_norm": 457.9054870605469, + "learning_rate": 3.1040000000000003e-06, + "loss": 236.3315, "step": 3880 }, { "epoch": 0.01571609222800858, - "grad_norm": 754.9981079101562, - "learning_rate": 6.224e-06, - "loss": 255.1399, + "grad_norm": 707.6962890625, + "learning_rate": 3.112e-06, + "loss": 269.9745, "step": 3890 }, { "epoch": 0.01575649349337621, - "grad_norm": 1966.5140380859375, - "learning_rate": 6.24e-06, - "loss": 275.0173, + "grad_norm": 1671.724609375, + "learning_rate": 3.12e-06, + "loss": 286.5569, "step": 3900 }, { "epoch": 0.015796894758743844, - "grad_norm": 810.1495361328125, - "learning_rate": 6.256000000000001e-06, - "loss": 219.6732, + "grad_norm": 759.36572265625, + "learning_rate": 3.1280000000000004e-06, + "loss": 232.0514, "step": 3910 }, { "epoch": 0.015837296024111474, - "grad_norm": 3056.527099609375, - "learning_rate": 6.272e-06, - "loss": 184.322, + "grad_norm": 1955.5032958984375, + "learning_rate": 3.136e-06, + "loss": 188.9286, "step": 3920 }, { "epoch": 0.015877697289479107, - "grad_norm": 2820.2568359375, - "learning_rate": 6.288000000000001e-06, - "loss": 235.3666, + "grad_norm": 1340.0684814453125, + "learning_rate": 3.1440000000000003e-06, + "loss": 240.7968, "step": 3930 }, { "epoch": 0.015918098554846737, - "grad_norm": 1320.5225830078125, - "learning_rate": 6.304e-06, - "loss": 220.3165, + "grad_norm": 1421.1473388671875, + "learning_rate": 3.152e-06, + "loss": 233.7573, "step": 3940 }, { "epoch": 0.01595849982021437, - "grad_norm": 933.7466430664062, - "learning_rate": 6.3200000000000005e-06, - "loss": 307.4496, + "grad_norm": 1020.22705078125, + "learning_rate": 3.1600000000000002e-06, + "loss": 322.254, "step": 3950 }, { "epoch": 0.015998901085582, - "grad_norm": 867.0122680664062, - "learning_rate": 6.336000000000001e-06, - "loss": 263.1927, + "grad_norm": 853.1231689453125, + "learning_rate": 3.1680000000000004e-06, + "loss": 272.9022, "step": 3960 }, { "epoch": 0.016039302350949632, - "grad_norm": 747.3133544921875, - "learning_rate": 6.352e-06, - "loss": 306.4962, + "grad_norm": 766.1064453125, + "learning_rate": 3.176e-06, + "loss": 325.4429, "step": 3970 }, { "epoch": 0.016079703616317262, - "grad_norm": 711.7617797851562, - "learning_rate": 6.368000000000001e-06, - "loss": 243.1562, + "grad_norm": 651.106689453125, + "learning_rate": 3.1840000000000003e-06, + "loss": 261.1683, "step": 3980 }, { "epoch": 0.016120104881684895, - "grad_norm": 732.2003173828125, - "learning_rate": 6.384e-06, - "loss": 231.337, + "grad_norm": 917.818603515625, + "learning_rate": 3.192e-06, + "loss": 248.8511, "step": 3990 }, { "epoch": 0.016160506147052525, - "grad_norm": 582.39990234375, - "learning_rate": 6.4000000000000006e-06, - "loss": 273.9559, + "grad_norm": 576.7960815429688, + "learning_rate": 3.2000000000000003e-06, + "loss": 283.6953, "step": 4000 + }, + { + "epoch": 0.016200907412420158, + "grad_norm": 703.4618530273438, + "learning_rate": 3.208e-06, + "loss": 221.6604, + "step": 4010 + }, + { + "epoch": 0.016241308677787787, + "grad_norm": 0.0, + "learning_rate": 3.216e-06, + "loss": 238.4616, + "step": 4020 + }, + { + "epoch": 0.01628170994315542, + "grad_norm": 698.3013305664062, + "learning_rate": 3.2240000000000004e-06, + "loss": 290.2816, + "step": 4030 + }, + { + "epoch": 0.01632211120852305, + "grad_norm": 788.061279296875, + "learning_rate": 3.232e-06, + "loss": 341.1846, + "step": 4040 + }, + { + "epoch": 0.016362512473890683, + "grad_norm": 1486.7274169921875, + "learning_rate": 3.2400000000000003e-06, + "loss": 233.853, + "step": 4050 + }, + { + "epoch": 0.016402913739258312, + "grad_norm": 1192.024169921875, + "learning_rate": 3.248e-06, + "loss": 350.9311, + "step": 4060 + }, + { + "epoch": 0.016443315004625945, + "grad_norm": 1837.5101318359375, + "learning_rate": 3.2560000000000003e-06, + "loss": 254.87, + "step": 4070 + }, + { + "epoch": 0.016483716269993575, + "grad_norm": 837.7913208007812, + "learning_rate": 3.2640000000000004e-06, + "loss": 253.6598, + "step": 4080 + }, + { + "epoch": 0.016524117535361208, + "grad_norm": 716.4096069335938, + "learning_rate": 3.272e-06, + "loss": 354.4232, + "step": 4090 + }, + { + "epoch": 0.016564518800728838, + "grad_norm": 868.5091552734375, + "learning_rate": 3.2800000000000004e-06, + "loss": 268.9362, + "step": 4100 + }, + { + "epoch": 0.01660492006609647, + "grad_norm": 634.5536499023438, + "learning_rate": 3.288e-06, + "loss": 214.6038, + "step": 4110 + }, + { + "epoch": 0.0166453213314641, + "grad_norm": 604.802490234375, + "learning_rate": 3.2960000000000003e-06, + "loss": 341.2742, + "step": 4120 + }, + { + "epoch": 0.016685722596831733, + "grad_norm": 668.8900756835938, + "learning_rate": 3.3040000000000005e-06, + "loss": 229.8222, + "step": 4130 + }, + { + "epoch": 0.016726123862199363, + "grad_norm": 1141.625732421875, + "learning_rate": 3.3120000000000002e-06, + "loss": 279.8577, + "step": 4140 + }, + { + "epoch": 0.016766525127566996, + "grad_norm": 1097.2060546875, + "learning_rate": 3.3200000000000004e-06, + "loss": 335.7476, + "step": 4150 + }, + { + "epoch": 0.016806926392934626, + "grad_norm": 827.9783935546875, + "learning_rate": 3.328e-06, + "loss": 290.7574, + "step": 4160 + }, + { + "epoch": 0.01684732765830226, + "grad_norm": 1198.46142578125, + "learning_rate": 3.3360000000000003e-06, + "loss": 358.6813, + "step": 4170 + }, + { + "epoch": 0.016887728923669888, + "grad_norm": 4645.3603515625, + "learning_rate": 3.344e-06, + "loss": 255.452, + "step": 4180 + }, + { + "epoch": 0.01692813018903752, + "grad_norm": 605.9213256835938, + "learning_rate": 3.3520000000000003e-06, + "loss": 261.5411, + "step": 4190 + }, + { + "epoch": 0.01696853145440515, + "grad_norm": 1752.691162109375, + "learning_rate": 3.3600000000000004e-06, + "loss": 180.5614, + "step": 4200 + }, + { + "epoch": 0.017008932719772784, + "grad_norm": 683.8428344726562, + "learning_rate": 3.368e-06, + "loss": 285.6484, + "step": 4210 + }, + { + "epoch": 0.017049333985140414, + "grad_norm": 565.120361328125, + "learning_rate": 3.3760000000000004e-06, + "loss": 198.7863, + "step": 4220 + }, + { + "epoch": 0.017089735250508047, + "grad_norm": 1145.80322265625, + "learning_rate": 3.384e-06, + "loss": 302.483, + "step": 4230 + }, + { + "epoch": 0.017130136515875676, + "grad_norm": 965.896240234375, + "learning_rate": 3.3920000000000003e-06, + "loss": 330.8129, + "step": 4240 + }, + { + "epoch": 0.01717053778124331, + "grad_norm": 1721.089599609375, + "learning_rate": 3.4000000000000005e-06, + "loss": 330.5561, + "step": 4250 + }, + { + "epoch": 0.01721093904661094, + "grad_norm": 750.7158813476562, + "learning_rate": 3.4080000000000002e-06, + "loss": 274.9737, + "step": 4260 + }, + { + "epoch": 0.017251340311978572, + "grad_norm": 801.703857421875, + "learning_rate": 3.4160000000000004e-06, + "loss": 202.9271, + "step": 4270 + }, + { + "epoch": 0.0172917415773462, + "grad_norm": 1434.4979248046875, + "learning_rate": 3.424e-06, + "loss": 353.9297, + "step": 4280 + }, + { + "epoch": 0.017332142842713835, + "grad_norm": 615.7587280273438, + "learning_rate": 3.4320000000000003e-06, + "loss": 151.3951, + "step": 4290 + }, + { + "epoch": 0.017372544108081464, + "grad_norm": 1309.6622314453125, + "learning_rate": 3.44e-06, + "loss": 250.1997, + "step": 4300 + }, + { + "epoch": 0.017412945373449097, + "grad_norm": 886.3528442382812, + "learning_rate": 3.4480000000000003e-06, + "loss": 356.1532, + "step": 4310 + }, + { + "epoch": 0.017453346638816727, + "grad_norm": 1573.43359375, + "learning_rate": 3.4560000000000005e-06, + "loss": 267.2262, + "step": 4320 + }, + { + "epoch": 0.01749374790418436, + "grad_norm": 1121.71240234375, + "learning_rate": 3.464e-06, + "loss": 176.7969, + "step": 4330 + }, + { + "epoch": 0.01753414916955199, + "grad_norm": 1691.6214599609375, + "learning_rate": 3.4720000000000004e-06, + "loss": 261.0579, + "step": 4340 + }, + { + "epoch": 0.017574550434919622, + "grad_norm": 1357.11572265625, + "learning_rate": 3.48e-06, + "loss": 222.6117, + "step": 4350 + }, + { + "epoch": 0.017614951700287252, + "grad_norm": 1573.6026611328125, + "learning_rate": 3.4880000000000003e-06, + "loss": 209.2924, + "step": 4360 + }, + { + "epoch": 0.017655352965654885, + "grad_norm": 1021.2926025390625, + "learning_rate": 3.4960000000000005e-06, + "loss": 260.6172, + "step": 4370 + }, + { + "epoch": 0.017695754231022515, + "grad_norm": 548.7271118164062, + "learning_rate": 3.5040000000000002e-06, + "loss": 278.5907, + "step": 4380 + }, + { + "epoch": 0.017736155496390148, + "grad_norm": 737.4352416992188, + "learning_rate": 3.5120000000000004e-06, + "loss": 212.2047, + "step": 4390 + }, + { + "epoch": 0.017776556761757777, + "grad_norm": 865.3966064453125, + "learning_rate": 3.52e-06, + "loss": 319.7553, + "step": 4400 + }, + { + "epoch": 0.01781695802712541, + "grad_norm": 569.0396728515625, + "learning_rate": 3.5280000000000004e-06, + "loss": 223.2969, + "step": 4410 + }, + { + "epoch": 0.01785735929249304, + "grad_norm": 1365.351806640625, + "learning_rate": 3.5360000000000005e-06, + "loss": 220.5284, + "step": 4420 + }, + { + "epoch": 0.017897760557860673, + "grad_norm": 1133.9447021484375, + "learning_rate": 3.5440000000000003e-06, + "loss": 236.8231, + "step": 4430 + }, + { + "epoch": 0.017938161823228303, + "grad_norm": 819.81982421875, + "learning_rate": 3.5520000000000005e-06, + "loss": 210.566, + "step": 4440 + }, + { + "epoch": 0.017978563088595936, + "grad_norm": 1337.5098876953125, + "learning_rate": 3.5600000000000002e-06, + "loss": 228.1564, + "step": 4450 + }, + { + "epoch": 0.018018964353963565, + "grad_norm": 964.3289184570312, + "learning_rate": 3.5680000000000004e-06, + "loss": 242.1303, + "step": 4460 + }, + { + "epoch": 0.0180593656193312, + "grad_norm": 1475.19384765625, + "learning_rate": 3.576e-06, + "loss": 229.214, + "step": 4470 + }, + { + "epoch": 0.018099766884698828, + "grad_norm": 1174.5684814453125, + "learning_rate": 3.5840000000000003e-06, + "loss": 250.0778, + "step": 4480 + }, + { + "epoch": 0.01814016815006646, + "grad_norm": 1063.322021484375, + "learning_rate": 3.5920000000000005e-06, + "loss": 215.5135, + "step": 4490 + }, + { + "epoch": 0.01818056941543409, + "grad_norm": 724.023681640625, + "learning_rate": 3.6000000000000003e-06, + "loss": 248.552, + "step": 4500 + }, + { + "epoch": 0.018220970680801724, + "grad_norm": 1222.7664794921875, + "learning_rate": 3.6080000000000004e-06, + "loss": 234.3776, + "step": 4510 + }, + { + "epoch": 0.018261371946169353, + "grad_norm": 677.7725219726562, + "learning_rate": 3.616e-06, + "loss": 212.8068, + "step": 4520 + }, + { + "epoch": 0.018301773211536986, + "grad_norm": 1107.8935546875, + "learning_rate": 3.6240000000000004e-06, + "loss": 239.7066, + "step": 4530 + }, + { + "epoch": 0.018342174476904616, + "grad_norm": 898.6642456054688, + "learning_rate": 3.6320000000000005e-06, + "loss": 201.3611, + "step": 4540 + }, + { + "epoch": 0.01838257574227225, + "grad_norm": 1333.6512451171875, + "learning_rate": 3.6400000000000003e-06, + "loss": 335.3635, + "step": 4550 + }, + { + "epoch": 0.01842297700763988, + "grad_norm": 1404.3516845703125, + "learning_rate": 3.6480000000000005e-06, + "loss": 345.1794, + "step": 4560 + }, + { + "epoch": 0.01846337827300751, + "grad_norm": 1524.8006591796875, + "learning_rate": 3.6560000000000002e-06, + "loss": 240.9258, + "step": 4570 + }, + { + "epoch": 0.01850377953837514, + "grad_norm": 1005.1021118164062, + "learning_rate": 3.6640000000000004e-06, + "loss": 294.3367, + "step": 4580 + }, + { + "epoch": 0.018544180803742774, + "grad_norm": 390.6331481933594, + "learning_rate": 3.6720000000000006e-06, + "loss": 325.6292, + "step": 4590 + }, + { + "epoch": 0.018584582069110404, + "grad_norm": 3332.533935546875, + "learning_rate": 3.6800000000000003e-06, + "loss": 317.098, + "step": 4600 + }, + { + "epoch": 0.018624983334478037, + "grad_norm": 778.4534301757812, + "learning_rate": 3.6880000000000005e-06, + "loss": 216.7282, + "step": 4610 + }, + { + "epoch": 0.018665384599845666, + "grad_norm": 562.4667358398438, + "learning_rate": 3.6960000000000003e-06, + "loss": 192.6008, + "step": 4620 + }, + { + "epoch": 0.0187057858652133, + "grad_norm": 742.5579833984375, + "learning_rate": 3.7040000000000005e-06, + "loss": 196.6221, + "step": 4630 + }, + { + "epoch": 0.01874618713058093, + "grad_norm": 1615.2911376953125, + "learning_rate": 3.712e-06, + "loss": 203.2837, + "step": 4640 + }, + { + "epoch": 0.018786588395948562, + "grad_norm": 2102.44970703125, + "learning_rate": 3.7200000000000004e-06, + "loss": 287.735, + "step": 4650 + }, + { + "epoch": 0.01882698966131619, + "grad_norm": 1209.79296875, + "learning_rate": 3.7280000000000006e-06, + "loss": 305.2592, + "step": 4660 + }, + { + "epoch": 0.018867390926683825, + "grad_norm": 653.5800170898438, + "learning_rate": 3.7360000000000003e-06, + "loss": 294.9557, + "step": 4670 + }, + { + "epoch": 0.018907792192051454, + "grad_norm": 1804.913818359375, + "learning_rate": 3.7440000000000005e-06, + "loss": 215.6078, + "step": 4680 + }, + { + "epoch": 0.018948193457419087, + "grad_norm": 1226.2567138671875, + "learning_rate": 3.7520000000000002e-06, + "loss": 214.5115, + "step": 4690 + }, + { + "epoch": 0.018988594722786717, + "grad_norm": 1538.9205322265625, + "learning_rate": 3.7600000000000004e-06, + "loss": 244.2366, + "step": 4700 + }, + { + "epoch": 0.01902899598815435, + "grad_norm": 1672.07373046875, + "learning_rate": 3.7680000000000006e-06, + "loss": 260.998, + "step": 4710 + }, + { + "epoch": 0.01906939725352198, + "grad_norm": 0.0, + "learning_rate": 3.7760000000000004e-06, + "loss": 200.9737, + "step": 4720 + }, + { + "epoch": 0.019109798518889613, + "grad_norm": 1790.7144775390625, + "learning_rate": 3.7840000000000005e-06, + "loss": 201.2574, + "step": 4730 + }, + { + "epoch": 0.019150199784257242, + "grad_norm": 2101.942626953125, + "learning_rate": 3.7920000000000003e-06, + "loss": 268.5821, + "step": 4740 + }, + { + "epoch": 0.019190601049624875, + "grad_norm": 757.1461181640625, + "learning_rate": 3.8000000000000005e-06, + "loss": 278.9659, + "step": 4750 + }, + { + "epoch": 0.019231002314992505, + "grad_norm": 861.269287109375, + "learning_rate": 3.8080000000000006e-06, + "loss": 223.4364, + "step": 4760 + }, + { + "epoch": 0.019271403580360138, + "grad_norm": 800.5299682617188, + "learning_rate": 3.816e-06, + "loss": 259.876, + "step": 4770 + }, + { + "epoch": 0.019311804845727767, + "grad_norm": 987.7889404296875, + "learning_rate": 3.824e-06, + "loss": 242.2536, + "step": 4780 + }, + { + "epoch": 0.0193522061110954, + "grad_norm": 468.9601135253906, + "learning_rate": 3.832e-06, + "loss": 174.1677, + "step": 4790 + }, + { + "epoch": 0.01939260737646303, + "grad_norm": 789.4224853515625, + "learning_rate": 3.8400000000000005e-06, + "loss": 185.3222, + "step": 4800 + }, + { + "epoch": 0.019433008641830663, + "grad_norm": 733.8356323242188, + "learning_rate": 3.848e-06, + "loss": 233.7156, + "step": 4810 + }, + { + "epoch": 0.019473409907198293, + "grad_norm": 3644.848388671875, + "learning_rate": 3.856e-06, + "loss": 282.5843, + "step": 4820 + }, + { + "epoch": 0.019513811172565926, + "grad_norm": 1515.1463623046875, + "learning_rate": 3.864000000000001e-06, + "loss": 228.3501, + "step": 4830 + }, + { + "epoch": 0.019554212437933555, + "grad_norm": 1994.6700439453125, + "learning_rate": 3.872e-06, + "loss": 179.8708, + "step": 4840 + }, + { + "epoch": 0.01959461370330119, + "grad_norm": 1518.75, + "learning_rate": 3.88e-06, + "loss": 312.2273, + "step": 4850 + }, + { + "epoch": 0.019635014968668818, + "grad_norm": 438.5001525878906, + "learning_rate": 3.888e-06, + "loss": 171.058, + "step": 4860 + }, + { + "epoch": 0.01967541623403645, + "grad_norm": 2248.920166015625, + "learning_rate": 3.8960000000000005e-06, + "loss": 302.2809, + "step": 4870 + }, + { + "epoch": 0.01971581749940408, + "grad_norm": 1396.06640625, + "learning_rate": 3.904e-06, + "loss": 207.1693, + "step": 4880 + }, + { + "epoch": 0.019756218764771714, + "grad_norm": 926.9723510742188, + "learning_rate": 3.912e-06, + "loss": 187.9564, + "step": 4890 + }, + { + "epoch": 0.019796620030139343, + "grad_norm": 992.8926391601562, + "learning_rate": 3.920000000000001e-06, + "loss": 314.0639, + "step": 4900 + }, + { + "epoch": 0.019837021295506976, + "grad_norm": 2172.178466796875, + "learning_rate": 3.928e-06, + "loss": 295.6219, + "step": 4910 + }, + { + "epoch": 0.019877422560874606, + "grad_norm": 896.484375, + "learning_rate": 3.936e-06, + "loss": 163.1548, + "step": 4920 + }, + { + "epoch": 0.01991782382624224, + "grad_norm": 1239.43359375, + "learning_rate": 3.944e-06, + "loss": 295.573, + "step": 4930 + }, + { + "epoch": 0.01995822509160987, + "grad_norm": 1171.8026123046875, + "learning_rate": 3.9520000000000004e-06, + "loss": 280.9915, + "step": 4940 + }, + { + "epoch": 0.0199986263569775, + "grad_norm": 995.73583984375, + "learning_rate": 3.96e-06, + "loss": 233.0034, + "step": 4950 + }, + { + "epoch": 0.02003902762234513, + "grad_norm": 1719.3262939453125, + "learning_rate": 3.968e-06, + "loss": 145.5473, + "step": 4960 + }, + { + "epoch": 0.020079428887712764, + "grad_norm": 14412.759765625, + "learning_rate": 3.9760000000000006e-06, + "loss": 403.25, + "step": 4970 + }, + { + "epoch": 0.020119830153080394, + "grad_norm": 1432.953369140625, + "learning_rate": 3.984e-06, + "loss": 243.9733, + "step": 4980 + }, + { + "epoch": 0.020160231418448027, + "grad_norm": 655.8001098632812, + "learning_rate": 3.992e-06, + "loss": 255.6257, + "step": 4990 + }, + { + "epoch": 0.020200632683815656, + "grad_norm": 1421.9742431640625, + "learning_rate": 4.000000000000001e-06, + "loss": 182.0297, + "step": 5000 + }, + { + "epoch": 0.02024103394918329, + "grad_norm": 1990.9124755859375, + "learning_rate": 4.008e-06, + "loss": 238.356, + "step": 5010 + }, + { + "epoch": 0.02028143521455092, + "grad_norm": 1020.7196655273438, + "learning_rate": 4.016e-06, + "loss": 159.2277, + "step": 5020 + }, + { + "epoch": 0.020321836479918552, + "grad_norm": 1281.9739990234375, + "learning_rate": 4.024e-06, + "loss": 272.8915, + "step": 5030 + }, + { + "epoch": 0.020362237745286182, + "grad_norm": 1122.945556640625, + "learning_rate": 4.0320000000000005e-06, + "loss": 263.444, + "step": 5040 + }, + { + "epoch": 0.020402639010653815, + "grad_norm": 1039.796630859375, + "learning_rate": 4.04e-06, + "loss": 195.0571, + "step": 5050 + }, + { + "epoch": 0.020443040276021444, + "grad_norm": 0.0, + "learning_rate": 4.048e-06, + "loss": 287.9595, + "step": 5060 + }, + { + "epoch": 0.020483441541389077, + "grad_norm": 946.9546508789062, + "learning_rate": 4.056000000000001e-06, + "loss": 222.9819, + "step": 5070 + }, + { + "epoch": 0.020523842806756707, + "grad_norm": 2530.3251953125, + "learning_rate": 4.064e-06, + "loss": 240.0088, + "step": 5080 + }, + { + "epoch": 0.02056424407212434, + "grad_norm": 2582.428466796875, + "learning_rate": 4.072e-06, + "loss": 300.8218, + "step": 5090 + }, + { + "epoch": 0.02060464533749197, + "grad_norm": 1097.255615234375, + "learning_rate": 4.08e-06, + "loss": 227.3757, + "step": 5100 + }, + { + "epoch": 0.020645046602859603, + "grad_norm": 829.824462890625, + "learning_rate": 4.0880000000000005e-06, + "loss": 271.9307, + "step": 5110 + }, + { + "epoch": 0.020685447868227232, + "grad_norm": 1170.4205322265625, + "learning_rate": 4.096e-06, + "loss": 208.7429, + "step": 5120 + }, + { + "epoch": 0.020725849133594865, + "grad_norm": 2347.397705078125, + "learning_rate": 4.104e-06, + "loss": 202.8534, + "step": 5130 + }, + { + "epoch": 0.020766250398962495, + "grad_norm": 747.20263671875, + "learning_rate": 4.112000000000001e-06, + "loss": 188.2006, + "step": 5140 + }, + { + "epoch": 0.020806651664330128, + "grad_norm": 626.6402587890625, + "learning_rate": 4.12e-06, + "loss": 161.1189, + "step": 5150 + }, + { + "epoch": 0.020847052929697758, + "grad_norm": 945.6193237304688, + "learning_rate": 4.128e-06, + "loss": 238.4999, + "step": 5160 + }, + { + "epoch": 0.02088745419506539, + "grad_norm": 815.0576782226562, + "learning_rate": 4.136000000000001e-06, + "loss": 258.373, + "step": 5170 + }, + { + "epoch": 0.02092785546043302, + "grad_norm": 881.4215698242188, + "learning_rate": 4.1440000000000005e-06, + "loss": 222.5087, + "step": 5180 + }, + { + "epoch": 0.020968256725800653, + "grad_norm": 1163.3695068359375, + "learning_rate": 4.152e-06, + "loss": 265.9115, + "step": 5190 + }, + { + "epoch": 0.021008657991168283, + "grad_norm": 715.35791015625, + "learning_rate": 4.16e-06, + "loss": 262.0144, + "step": 5200 + }, + { + "epoch": 0.021049059256535916, + "grad_norm": 915.2119750976562, + "learning_rate": 4.168000000000001e-06, + "loss": 238.4794, + "step": 5210 + }, + { + "epoch": 0.021089460521903546, + "grad_norm": 1859.83544921875, + "learning_rate": 4.176e-06, + "loss": 236.1828, + "step": 5220 + }, + { + "epoch": 0.02112986178727118, + "grad_norm": 1224.433349609375, + "learning_rate": 4.184e-06, + "loss": 262.4781, + "step": 5230 + }, + { + "epoch": 0.021170263052638808, + "grad_norm": 1389.5914306640625, + "learning_rate": 4.192000000000001e-06, + "loss": 317.3082, + "step": 5240 + }, + { + "epoch": 0.02121066431800644, + "grad_norm": 3302.87451171875, + "learning_rate": 4.2000000000000004e-06, + "loss": 270.2042, + "step": 5250 + }, + { + "epoch": 0.02125106558337407, + "grad_norm": 1709.3145751953125, + "learning_rate": 4.208e-06, + "loss": 295.2886, + "step": 5260 + }, + { + "epoch": 0.021291466848741704, + "grad_norm": 3116.50830078125, + "learning_rate": 4.216e-06, + "loss": 226.2704, + "step": 5270 + }, + { + "epoch": 0.021331868114109333, + "grad_norm": 972.5924072265625, + "learning_rate": 4.2240000000000006e-06, + "loss": 270.3954, + "step": 5280 + }, + { + "epoch": 0.021372269379476967, + "grad_norm": 1125.4678955078125, + "learning_rate": 4.232e-06, + "loss": 260.8531, + "step": 5290 + }, + { + "epoch": 0.021412670644844596, + "grad_norm": 817.9036254882812, + "learning_rate": 4.24e-06, + "loss": 289.322, + "step": 5300 + }, + { + "epoch": 0.02145307191021223, + "grad_norm": 1266.4287109375, + "learning_rate": 4.248000000000001e-06, + "loss": 297.7058, + "step": 5310 + }, + { + "epoch": 0.02149347317557986, + "grad_norm": 1158.7506103515625, + "learning_rate": 4.256e-06, + "loss": 218.8427, + "step": 5320 + }, + { + "epoch": 0.021533874440947492, + "grad_norm": 701.3471069335938, + "learning_rate": 4.264e-06, + "loss": 249.6902, + "step": 5330 + }, + { + "epoch": 0.02157427570631512, + "grad_norm": 1382.0621337890625, + "learning_rate": 4.272000000000001e-06, + "loss": 217.1363, + "step": 5340 + }, + { + "epoch": 0.021614676971682754, + "grad_norm": 727.7159423828125, + "learning_rate": 4.2800000000000005e-06, + "loss": 238.6548, + "step": 5350 + }, + { + "epoch": 0.021655078237050384, + "grad_norm": 689.1691284179688, + "learning_rate": 4.288e-06, + "loss": 305.8205, + "step": 5360 + }, + { + "epoch": 0.021695479502418017, + "grad_norm": 704.255859375, + "learning_rate": 4.296e-06, + "loss": 247.3537, + "step": 5370 + }, + { + "epoch": 0.021735880767785647, + "grad_norm": 1821.1497802734375, + "learning_rate": 4.304000000000001e-06, + "loss": 295.0289, + "step": 5380 + }, + { + "epoch": 0.02177628203315328, + "grad_norm": 2477.83056640625, + "learning_rate": 4.312e-06, + "loss": 270.8833, + "step": 5390 + }, + { + "epoch": 0.02181668329852091, + "grad_norm": 1743.1378173828125, + "learning_rate": 4.32e-06, + "loss": 326.5343, + "step": 5400 + }, + { + "epoch": 0.021857084563888542, + "grad_norm": 729.7633056640625, + "learning_rate": 4.328000000000001e-06, + "loss": 266.4839, + "step": 5410 + }, + { + "epoch": 0.021897485829256172, + "grad_norm": 1911.1827392578125, + "learning_rate": 4.3360000000000005e-06, + "loss": 272.9004, + "step": 5420 + }, + { + "epoch": 0.021937887094623805, + "grad_norm": 6040.431640625, + "learning_rate": 4.344e-06, + "loss": 278.508, + "step": 5430 + }, + { + "epoch": 0.021978288359991435, + "grad_norm": 1092.593017578125, + "learning_rate": 4.352e-06, + "loss": 252.7811, + "step": 5440 + }, + { + "epoch": 0.022018689625359068, + "grad_norm": 7559.6689453125, + "learning_rate": 4.360000000000001e-06, + "loss": 355.687, + "step": 5450 + }, + { + "epoch": 0.022059090890726697, + "grad_norm": 960.236572265625, + "learning_rate": 4.368e-06, + "loss": 235.2395, + "step": 5460 + }, + { + "epoch": 0.02209949215609433, + "grad_norm": 1374.5234375, + "learning_rate": 4.376e-06, + "loss": 231.7762, + "step": 5470 + }, + { + "epoch": 0.02213989342146196, + "grad_norm": 2087.810546875, + "learning_rate": 4.384000000000001e-06, + "loss": 258.7059, + "step": 5480 + }, + { + "epoch": 0.022180294686829593, + "grad_norm": 1122.2423095703125, + "learning_rate": 4.3920000000000005e-06, + "loss": 192.6893, + "step": 5490 + }, + { + "epoch": 0.022220695952197222, + "grad_norm": 2292.519287109375, + "learning_rate": 4.4e-06, + "loss": 287.8694, + "step": 5500 + }, + { + "epoch": 0.022261097217564856, + "grad_norm": 1324.64892578125, + "learning_rate": 4.408000000000001e-06, + "loss": 204.9342, + "step": 5510 + }, + { + "epoch": 0.022301498482932485, + "grad_norm": 650.8558959960938, + "learning_rate": 4.416000000000001e-06, + "loss": 225.5367, + "step": 5520 + }, + { + "epoch": 0.022341899748300118, + "grad_norm": 2953.84033203125, + "learning_rate": 4.424e-06, + "loss": 202.2525, + "step": 5530 + }, + { + "epoch": 0.022382301013667748, + "grad_norm": 7420.94482421875, + "learning_rate": 4.432e-06, + "loss": 264.7, + "step": 5540 + }, + { + "epoch": 0.02242270227903538, + "grad_norm": 771.5780029296875, + "learning_rate": 4.440000000000001e-06, + "loss": 260.405, + "step": 5550 + }, + { + "epoch": 0.02246310354440301, + "grad_norm": 1432.8135986328125, + "learning_rate": 4.4480000000000004e-06, + "loss": 208.0314, + "step": 5560 + }, + { + "epoch": 0.022503504809770643, + "grad_norm": 507.216064453125, + "learning_rate": 4.456e-06, + "loss": 129.4772, + "step": 5570 + }, + { + "epoch": 0.022543906075138273, + "grad_norm": 6830.529296875, + "learning_rate": 4.464000000000001e-06, + "loss": 233.2185, + "step": 5580 + }, + { + "epoch": 0.022584307340505906, + "grad_norm": 571.62841796875, + "learning_rate": 4.4720000000000006e-06, + "loss": 258.6644, + "step": 5590 + }, + { + "epoch": 0.022624708605873536, + "grad_norm": 788.0087890625, + "learning_rate": 4.48e-06, + "loss": 202.7467, + "step": 5600 + }, + { + "epoch": 0.02266510987124117, + "grad_norm": 3073.55078125, + "learning_rate": 4.488e-06, + "loss": 231.3413, + "step": 5610 + }, + { + "epoch": 0.0227055111366088, + "grad_norm": 2199.2275390625, + "learning_rate": 4.496000000000001e-06, + "loss": 216.6432, + "step": 5620 + }, + { + "epoch": 0.02274591240197643, + "grad_norm": 1550.0816650390625, + "learning_rate": 4.504e-06, + "loss": 313.3094, + "step": 5630 + }, + { + "epoch": 0.02278631366734406, + "grad_norm": 1045.01123046875, + "learning_rate": 4.512e-06, + "loss": 272.873, + "step": 5640 + }, + { + "epoch": 0.022826714932711694, + "grad_norm": 1931.4056396484375, + "learning_rate": 4.520000000000001e-06, + "loss": 238.2261, + "step": 5650 + }, + { + "epoch": 0.022867116198079324, + "grad_norm": 925.8681640625, + "learning_rate": 4.5280000000000005e-06, + "loss": 266.3857, + "step": 5660 + }, + { + "epoch": 0.022907517463446957, + "grad_norm": 1213.0843505859375, + "learning_rate": 4.536e-06, + "loss": 208.0252, + "step": 5670 + }, + { + "epoch": 0.022947918728814586, + "grad_norm": 491.13604736328125, + "learning_rate": 4.544000000000001e-06, + "loss": 191.799, + "step": 5680 + }, + { + "epoch": 0.02298831999418222, + "grad_norm": 0.0, + "learning_rate": 4.552000000000001e-06, + "loss": 258.5874, + "step": 5690 + }, + { + "epoch": 0.02302872125954985, + "grad_norm": 919.0978393554688, + "learning_rate": 4.56e-06, + "loss": 218.949, + "step": 5700 + }, + { + "epoch": 0.023069122524917482, + "grad_norm": 1192.0655517578125, + "learning_rate": 4.568e-06, + "loss": 233.1014, + "step": 5710 + }, + { + "epoch": 0.02310952379028511, + "grad_norm": 603.2725219726562, + "learning_rate": 4.576000000000001e-06, + "loss": 201.1196, + "step": 5720 + }, + { + "epoch": 0.023149925055652745, + "grad_norm": 678.8341064453125, + "learning_rate": 4.5840000000000005e-06, + "loss": 172.9851, + "step": 5730 + }, + { + "epoch": 0.023190326321020374, + "grad_norm": 725.107177734375, + "learning_rate": 4.592e-06, + "loss": 212.2116, + "step": 5740 + }, + { + "epoch": 0.023230727586388007, + "grad_norm": 1741.0740966796875, + "learning_rate": 4.600000000000001e-06, + "loss": 241.789, + "step": 5750 + }, + { + "epoch": 0.023271128851755637, + "grad_norm": 1773.99658203125, + "learning_rate": 4.608000000000001e-06, + "loss": 329.6526, + "step": 5760 + }, + { + "epoch": 0.02331153011712327, + "grad_norm": 2564.032470703125, + "learning_rate": 4.616e-06, + "loss": 219.3119, + "step": 5770 + }, + { + "epoch": 0.0233519313824909, + "grad_norm": 983.3951416015625, + "learning_rate": 4.624e-06, + "loss": 135.3453, + "step": 5780 + }, + { + "epoch": 0.023392332647858533, + "grad_norm": 683.25927734375, + "learning_rate": 4.632000000000001e-06, + "loss": 286.5372, + "step": 5790 + }, + { + "epoch": 0.023432733913226162, + "grad_norm": 895.6082153320312, + "learning_rate": 4.6400000000000005e-06, + "loss": 237.6086, + "step": 5800 + }, + { + "epoch": 0.023473135178593795, + "grad_norm": 951.7195434570312, + "learning_rate": 4.648e-06, + "loss": 189.3838, + "step": 5810 + }, + { + "epoch": 0.023513536443961425, + "grad_norm": 891.3842163085938, + "learning_rate": 4.656000000000001e-06, + "loss": 228.5827, + "step": 5820 + }, + { + "epoch": 0.023553937709329058, + "grad_norm": 863.3218383789062, + "learning_rate": 4.664000000000001e-06, + "loss": 183.2099, + "step": 5830 + }, + { + "epoch": 0.023594338974696687, + "grad_norm": 1344.14208984375, + "learning_rate": 4.672e-06, + "loss": 302.0065, + "step": 5840 + }, + { + "epoch": 0.02363474024006432, + "grad_norm": 2021.83740234375, + "learning_rate": 4.680000000000001e-06, + "loss": 399.304, + "step": 5850 + }, + { + "epoch": 0.02367514150543195, + "grad_norm": 1086.980224609375, + "learning_rate": 4.688000000000001e-06, + "loss": 283.3496, + "step": 5860 + }, + { + "epoch": 0.023715542770799583, + "grad_norm": 1416.5675048828125, + "learning_rate": 4.6960000000000004e-06, + "loss": 231.4329, + "step": 5870 + }, + { + "epoch": 0.023755944036167213, + "grad_norm": 619.0659790039062, + "learning_rate": 4.704e-06, + "loss": 203.1393, + "step": 5880 + }, + { + "epoch": 0.023796345301534846, + "grad_norm": 1728.76220703125, + "learning_rate": 4.712000000000001e-06, + "loss": 217.0127, + "step": 5890 + }, + { + "epoch": 0.023836746566902475, + "grad_norm": 2863.87841796875, + "learning_rate": 4.7200000000000005e-06, + "loss": 332.1416, + "step": 5900 + }, + { + "epoch": 0.02387714783227011, + "grad_norm": 910.1724243164062, + "learning_rate": 4.728e-06, + "loss": 249.6941, + "step": 5910 + }, + { + "epoch": 0.023917549097637738, + "grad_norm": 1207.9207763671875, + "learning_rate": 4.736000000000001e-06, + "loss": 163.2521, + "step": 5920 + }, + { + "epoch": 0.02395795036300537, + "grad_norm": 2455.398193359375, + "learning_rate": 4.744000000000001e-06, + "loss": 207.0845, + "step": 5930 + }, + { + "epoch": 0.023998351628373, + "grad_norm": 1146.4932861328125, + "learning_rate": 4.752e-06, + "loss": 187.881, + "step": 5940 + }, + { + "epoch": 0.024038752893740634, + "grad_norm": 1110.4263916015625, + "learning_rate": 4.76e-06, + "loss": 224.1446, + "step": 5950 + }, + { + "epoch": 0.024079154159108263, + "grad_norm": 828.0372314453125, + "learning_rate": 4.768000000000001e-06, + "loss": 205.4142, + "step": 5960 + }, + { + "epoch": 0.024119555424475896, + "grad_norm": 1250.98095703125, + "learning_rate": 4.7760000000000005e-06, + "loss": 228.8398, + "step": 5970 + }, + { + "epoch": 0.024159956689843526, + "grad_norm": 1650.892578125, + "learning_rate": 4.784e-06, + "loss": 244.8798, + "step": 5980 + }, + { + "epoch": 0.02420035795521116, + "grad_norm": 2203.7802734375, + "learning_rate": 4.792000000000001e-06, + "loss": 123.9805, + "step": 5990 + }, + { + "epoch": 0.02424075922057879, + "grad_norm": 1320.2685546875, + "learning_rate": 4.800000000000001e-06, + "loss": 224.6934, + "step": 6000 + }, + { + "epoch": 0.02428116048594642, + "grad_norm": 2024.762939453125, + "learning_rate": 4.808e-06, + "loss": 179.326, + "step": 6010 + }, + { + "epoch": 0.02432156175131405, + "grad_norm": 1290.4625244140625, + "learning_rate": 4.816e-06, + "loss": 194.0418, + "step": 6020 + }, + { + "epoch": 0.024361963016681684, + "grad_norm": 1255.84228515625, + "learning_rate": 4.824000000000001e-06, + "loss": 224.2139, + "step": 6030 + }, + { + "epoch": 0.024402364282049314, + "grad_norm": 2267.0771484375, + "learning_rate": 4.8320000000000005e-06, + "loss": 173.2077, + "step": 6040 + }, + { + "epoch": 0.024442765547416947, + "grad_norm": 1085.2239990234375, + "learning_rate": 4.84e-06, + "loss": 231.2666, + "step": 6050 + }, + { + "epoch": 0.024483166812784576, + "grad_norm": 646.0029296875, + "learning_rate": 4.848000000000001e-06, + "loss": 225.9161, + "step": 6060 + }, + { + "epoch": 0.02452356807815221, + "grad_norm": 1238.6492919921875, + "learning_rate": 4.856e-06, + "loss": 241.2469, + "step": 6070 + }, + { + "epoch": 0.02456396934351984, + "grad_norm": 1897.38525390625, + "learning_rate": 4.864e-06, + "loss": 241.3081, + "step": 6080 + }, + { + "epoch": 0.02460437060888747, + "grad_norm": 1054.46533203125, + "learning_rate": 4.872000000000001e-06, + "loss": 239.2661, + "step": 6090 + }, + { + "epoch": 0.0246447718742551, + "grad_norm": 1075.7666015625, + "learning_rate": 4.880000000000001e-06, + "loss": 283.6578, + "step": 6100 + }, + { + "epoch": 0.02468517313962273, + "grad_norm": 1039.7003173828125, + "learning_rate": 4.8880000000000005e-06, + "loss": 240.7815, + "step": 6110 + }, + { + "epoch": 0.024725574404990364, + "grad_norm": 1259.1131591796875, + "learning_rate": 4.896e-06, + "loss": 366.4511, + "step": 6120 + }, + { + "epoch": 0.024765975670357994, + "grad_norm": 1470.3970947265625, + "learning_rate": 4.904000000000001e-06, + "loss": 225.3204, + "step": 6130 + }, + { + "epoch": 0.024806376935725627, + "grad_norm": 1794.458984375, + "learning_rate": 4.9120000000000006e-06, + "loss": 223.0217, + "step": 6140 + }, + { + "epoch": 0.024846778201093257, + "grad_norm": 1118.264892578125, + "learning_rate": 4.92e-06, + "loss": 223.7026, + "step": 6150 + }, + { + "epoch": 0.02488717946646089, + "grad_norm": 727.241943359375, + "learning_rate": 4.928000000000001e-06, + "loss": 222.272, + "step": 6160 + }, + { + "epoch": 0.02492758073182852, + "grad_norm": 945.7498779296875, + "learning_rate": 4.936e-06, + "loss": 218.763, + "step": 6170 + }, + { + "epoch": 0.024967981997196152, + "grad_norm": 787.7138061523438, + "learning_rate": 4.9440000000000004e-06, + "loss": 240.9764, + "step": 6180 + }, + { + "epoch": 0.025008383262563782, + "grad_norm": 766.1471557617188, + "learning_rate": 4.952e-06, + "loss": 217.0796, + "step": 6190 + }, + { + "epoch": 0.025048784527931415, + "grad_norm": 847.5587158203125, + "learning_rate": 4.960000000000001e-06, + "loss": 266.6659, + "step": 6200 + }, + { + "epoch": 0.025089185793299044, + "grad_norm": 5914.6298828125, + "learning_rate": 4.9680000000000005e-06, + "loss": 279.6393, + "step": 6210 + }, + { + "epoch": 0.025129587058666678, + "grad_norm": 677.2847290039062, + "learning_rate": 4.976e-06, + "loss": 227.3291, + "step": 6220 + }, + { + "epoch": 0.025169988324034307, + "grad_norm": 655.4007568359375, + "learning_rate": 4.984000000000001e-06, + "loss": 202.4817, + "step": 6230 + }, + { + "epoch": 0.02521038958940194, + "grad_norm": 1323.789306640625, + "learning_rate": 4.992e-06, + "loss": 191.2438, + "step": 6240 + }, + { + "epoch": 0.02525079085476957, + "grad_norm": 507.458740234375, + "learning_rate": 5e-06, + "loss": 254.478, + "step": 6250 + }, + { + "epoch": 0.025291192120137203, + "grad_norm": 943.15576171875, + "learning_rate": 5.008000000000001e-06, + "loss": 239.349, + "step": 6260 + }, + { + "epoch": 0.025331593385504832, + "grad_norm": 3894.38232421875, + "learning_rate": 5.016000000000001e-06, + "loss": 338.9898, + "step": 6270 + }, + { + "epoch": 0.025371994650872465, + "grad_norm": 1571.9990234375, + "learning_rate": 5.024e-06, + "loss": 219.3787, + "step": 6280 + }, + { + "epoch": 0.025412395916240095, + "grad_norm": 867.1376342773438, + "learning_rate": 5.032e-06, + "loss": 155.8549, + "step": 6290 + }, + { + "epoch": 0.025452797181607728, + "grad_norm": 2691.724365234375, + "learning_rate": 5.04e-06, + "loss": 195.9285, + "step": 6300 + }, + { + "epoch": 0.025493198446975358, + "grad_norm": 679.07470703125, + "learning_rate": 5.048000000000001e-06, + "loss": 152.4618, + "step": 6310 + }, + { + "epoch": 0.02553359971234299, + "grad_norm": 1874.7620849609375, + "learning_rate": 5.056000000000001e-06, + "loss": 239.3215, + "step": 6320 + }, + { + "epoch": 0.02557400097771062, + "grad_norm": 1597.4532470703125, + "learning_rate": 5.064e-06, + "loss": 294.9449, + "step": 6330 + }, + { + "epoch": 0.025614402243078253, + "grad_norm": 835.0941162109375, + "learning_rate": 5.072e-06, + "loss": 260.0973, + "step": 6340 + }, + { + "epoch": 0.025654803508445883, + "grad_norm": 570.6823120117188, + "learning_rate": 5.0800000000000005e-06, + "loss": 275.1141, + "step": 6350 + }, + { + "epoch": 0.025695204773813516, + "grad_norm": 1883.6915283203125, + "learning_rate": 5.088000000000001e-06, + "loss": 241.7582, + "step": 6360 + }, + { + "epoch": 0.025735606039181146, + "grad_norm": 1226.4967041015625, + "learning_rate": 5.096000000000001e-06, + "loss": 208.412, + "step": 6370 + }, + { + "epoch": 0.02577600730454878, + "grad_norm": 979.4700927734375, + "learning_rate": 5.104e-06, + "loss": 274.0542, + "step": 6380 + }, + { + "epoch": 0.025816408569916408, + "grad_norm": 4669.87646484375, + "learning_rate": 5.112e-06, + "loss": 242.256, + "step": 6390 + }, + { + "epoch": 0.02585680983528404, + "grad_norm": 790.1699829101562, + "learning_rate": 5.12e-06, + "loss": 189.5335, + "step": 6400 + }, + { + "epoch": 0.02589721110065167, + "grad_norm": 656.4432983398438, + "learning_rate": 5.128000000000001e-06, + "loss": 199.3494, + "step": 6410 + }, + { + "epoch": 0.025937612366019304, + "grad_norm": 1939.25927734375, + "learning_rate": 5.136e-06, + "loss": 200.8123, + "step": 6420 + }, + { + "epoch": 0.025978013631386934, + "grad_norm": 783.3861083984375, + "learning_rate": 5.144e-06, + "loss": 209.4736, + "step": 6430 + }, + { + "epoch": 0.026018414896754567, + "grad_norm": 7453.63525390625, + "learning_rate": 5.152e-06, + "loss": 273.0079, + "step": 6440 + }, + { + "epoch": 0.026058816162122196, + "grad_norm": 2093.859130859375, + "learning_rate": 5.1600000000000006e-06, + "loss": 272.8426, + "step": 6450 + }, + { + "epoch": 0.02609921742748983, + "grad_norm": 878.1881713867188, + "learning_rate": 5.168000000000001e-06, + "loss": 166.2215, + "step": 6460 + }, + { + "epoch": 0.02613961869285746, + "grad_norm": 778.22998046875, + "learning_rate": 5.176e-06, + "loss": 246.2203, + "step": 6470 + }, + { + "epoch": 0.026180019958225092, + "grad_norm": 872.73779296875, + "learning_rate": 5.184e-06, + "loss": 267.9873, + "step": 6480 + }, + { + "epoch": 0.02622042122359272, + "grad_norm": 1905.6541748046875, + "learning_rate": 5.1920000000000004e-06, + "loss": 220.6847, + "step": 6490 + }, + { + "epoch": 0.026260822488960354, + "grad_norm": 773.5687866210938, + "learning_rate": 5.2e-06, + "loss": 245.812, + "step": 6500 + }, + { + "epoch": 0.026301223754327984, + "grad_norm": 708.81640625, + "learning_rate": 5.208000000000001e-06, + "loss": 223.7549, + "step": 6510 + }, + { + "epoch": 0.026341625019695617, + "grad_norm": 720.0722045898438, + "learning_rate": 5.216e-06, + "loss": 230.0802, + "step": 6520 + }, + { + "epoch": 0.026382026285063247, + "grad_norm": 1879.3905029296875, + "learning_rate": 5.224e-06, + "loss": 253.9856, + "step": 6530 + }, + { + "epoch": 0.02642242755043088, + "grad_norm": 1301.2640380859375, + "learning_rate": 5.232e-06, + "loss": 250.2336, + "step": 6540 + }, + { + "epoch": 0.02646282881579851, + "grad_norm": 3535.994384765625, + "learning_rate": 5.240000000000001e-06, + "loss": 178.8618, + "step": 6550 + }, + { + "epoch": 0.026503230081166142, + "grad_norm": 819.4697875976562, + "learning_rate": 5.248000000000001e-06, + "loss": 215.1948, + "step": 6560 + }, + { + "epoch": 0.026543631346533772, + "grad_norm": 2550.58056640625, + "learning_rate": 5.256e-06, + "loss": 171.359, + "step": 6570 + }, + { + "epoch": 0.026584032611901405, + "grad_norm": 888.9176635742188, + "learning_rate": 5.264e-06, + "loss": 208.8959, + "step": 6580 + }, + { + "epoch": 0.026624433877269035, + "grad_norm": 1065.287109375, + "learning_rate": 5.2720000000000005e-06, + "loss": 130.0779, + "step": 6590 + }, + { + "epoch": 0.026664835142636668, + "grad_norm": 716.4095458984375, + "learning_rate": 5.28e-06, + "loss": 272.3356, + "step": 6600 + }, + { + "epoch": 0.026705236408004297, + "grad_norm": 1295.81689453125, + "learning_rate": 5.288000000000001e-06, + "loss": 260.927, + "step": 6610 + }, + { + "epoch": 0.02674563767337193, + "grad_norm": 1182.1502685546875, + "learning_rate": 5.296e-06, + "loss": 180.9049, + "step": 6620 + }, + { + "epoch": 0.02678603893873956, + "grad_norm": 787.0662231445312, + "learning_rate": 5.304e-06, + "loss": 332.8199, + "step": 6630 + }, + { + "epoch": 0.026826440204107193, + "grad_norm": 909.1768798828125, + "learning_rate": 5.312e-06, + "loss": 175.8336, + "step": 6640 + }, + { + "epoch": 0.026866841469474823, + "grad_norm": 718.7473754882812, + "learning_rate": 5.320000000000001e-06, + "loss": 170.0182, + "step": 6650 + }, + { + "epoch": 0.026907242734842456, + "grad_norm": 1273.45751953125, + "learning_rate": 5.328000000000001e-06, + "loss": 183.7154, + "step": 6660 + }, + { + "epoch": 0.026947644000210085, + "grad_norm": 1967.2197265625, + "learning_rate": 5.336e-06, + "loss": 216.884, + "step": 6670 + }, + { + "epoch": 0.026988045265577718, + "grad_norm": 1119.968994140625, + "learning_rate": 5.344e-06, + "loss": 176.886, + "step": 6680 + }, + { + "epoch": 0.027028446530945348, + "grad_norm": 836.3682861328125, + "learning_rate": 5.352000000000001e-06, + "loss": 227.2072, + "step": 6690 + }, + { + "epoch": 0.02706884779631298, + "grad_norm": 1655.4444580078125, + "learning_rate": 5.36e-06, + "loss": 288.9412, + "step": 6700 + }, + { + "epoch": 0.02710924906168061, + "grad_norm": 892.9385375976562, + "learning_rate": 5.368000000000001e-06, + "loss": 266.4419, + "step": 6710 + }, + { + "epoch": 0.027149650327048244, + "grad_norm": 2061.684814453125, + "learning_rate": 5.376e-06, + "loss": 203.1807, + "step": 6720 + }, + { + "epoch": 0.027190051592415873, + "grad_norm": 4281.24462890625, + "learning_rate": 5.3840000000000005e-06, + "loss": 277.4917, + "step": 6730 + }, + { + "epoch": 0.027230452857783506, + "grad_norm": 1263.8963623046875, + "learning_rate": 5.392e-06, + "loss": 233.3151, + "step": 6740 + }, + { + "epoch": 0.027270854123151136, + "grad_norm": 2914.96240234375, + "learning_rate": 5.400000000000001e-06, + "loss": 309.4876, + "step": 6750 + }, + { + "epoch": 0.02731125538851877, + "grad_norm": 2502.68505859375, + "learning_rate": 5.408e-06, + "loss": 230.7599, + "step": 6760 + }, + { + "epoch": 0.0273516566538864, + "grad_norm": 721.0610961914062, + "learning_rate": 5.416e-06, + "loss": 192.2397, + "step": 6770 + }, + { + "epoch": 0.02739205791925403, + "grad_norm": 1302.8016357421875, + "learning_rate": 5.424e-06, + "loss": 234.8806, + "step": 6780 + }, + { + "epoch": 0.02743245918462166, + "grad_norm": 2844.963134765625, + "learning_rate": 5.432000000000001e-06, + "loss": 223.0276, + "step": 6790 + }, + { + "epoch": 0.027472860449989294, + "grad_norm": 2866.33935546875, + "learning_rate": 5.4400000000000004e-06, + "loss": 270.0705, + "step": 6800 + }, + { + "epoch": 0.027513261715356924, + "grad_norm": 1128.1572265625, + "learning_rate": 5.448e-06, + "loss": 204.3208, + "step": 6810 + }, + { + "epoch": 0.027553662980724557, + "grad_norm": 969.2723999023438, + "learning_rate": 5.456e-06, + "loss": 175.3854, + "step": 6820 + }, + { + "epoch": 0.027594064246092186, + "grad_norm": 2599.377197265625, + "learning_rate": 5.4640000000000005e-06, + "loss": 222.5188, + "step": 6830 + }, + { + "epoch": 0.02763446551145982, + "grad_norm": 867.9594116210938, + "learning_rate": 5.472e-06, + "loss": 156.1774, + "step": 6840 + }, + { + "epoch": 0.02767486677682745, + "grad_norm": 846.9324340820312, + "learning_rate": 5.480000000000001e-06, + "loss": 257.9354, + "step": 6850 + }, + { + "epoch": 0.027715268042195082, + "grad_norm": 1269.0369873046875, + "learning_rate": 5.488e-06, + "loss": 178.2477, + "step": 6860 + }, + { + "epoch": 0.02775566930756271, + "grad_norm": 1494.2698974609375, + "learning_rate": 5.496e-06, + "loss": 236.2311, + "step": 6870 + }, + { + "epoch": 0.027796070572930345, + "grad_norm": 1050.6405029296875, + "learning_rate": 5.504e-06, + "loss": 262.8825, + "step": 6880 + }, + { + "epoch": 0.027836471838297974, + "grad_norm": 988.2347412109375, + "learning_rate": 5.512000000000001e-06, + "loss": 295.5132, + "step": 6890 + }, + { + "epoch": 0.027876873103665607, + "grad_norm": 2504.8466796875, + "learning_rate": 5.5200000000000005e-06, + "loss": 172.9894, + "step": 6900 + }, + { + "epoch": 0.027917274369033237, + "grad_norm": 1700.8922119140625, + "learning_rate": 5.528e-06, + "loss": 216.5585, + "step": 6910 + }, + { + "epoch": 0.02795767563440087, + "grad_norm": 842.3058471679688, + "learning_rate": 5.536e-06, + "loss": 184.2832, + "step": 6920 + }, + { + "epoch": 0.0279980768997685, + "grad_norm": 1680.5631103515625, + "learning_rate": 5.544000000000001e-06, + "loss": 174.7779, + "step": 6930 + }, + { + "epoch": 0.028038478165136133, + "grad_norm": 1502.794921875, + "learning_rate": 5.552e-06, + "loss": 193.01, + "step": 6940 + }, + { + "epoch": 0.028078879430503762, + "grad_norm": 993.77001953125, + "learning_rate": 5.560000000000001e-06, + "loss": 193.0354, + "step": 6950 + }, + { + "epoch": 0.028119280695871395, + "grad_norm": 1127.84619140625, + "learning_rate": 5.568e-06, + "loss": 153.1952, + "step": 6960 + }, + { + "epoch": 0.028159681961239025, + "grad_norm": 0.0, + "learning_rate": 5.5760000000000005e-06, + "loss": 128.1788, + "step": 6970 + }, + { + "epoch": 0.028200083226606658, + "grad_norm": 2749.52490234375, + "learning_rate": 5.584e-06, + "loss": 218.7497, + "step": 6980 + }, + { + "epoch": 0.028240484491974287, + "grad_norm": 1261.0377197265625, + "learning_rate": 5.592000000000001e-06, + "loss": 219.2645, + "step": 6990 + }, + { + "epoch": 0.02828088575734192, + "grad_norm": 897.4447631835938, + "learning_rate": 5.600000000000001e-06, + "loss": 200.2647, + "step": 7000 + }, + { + "epoch": 0.02832128702270955, + "grad_norm": 1057.957763671875, + "learning_rate": 5.608e-06, + "loss": 202.9416, + "step": 7010 + }, + { + "epoch": 0.028361688288077183, + "grad_norm": 1196.8267822265625, + "learning_rate": 5.616e-06, + "loss": 236.2699, + "step": 7020 + }, + { + "epoch": 0.028402089553444813, + "grad_norm": 782.2156372070312, + "learning_rate": 5.624000000000001e-06, + "loss": 140.3201, + "step": 7030 + }, + { + "epoch": 0.028442490818812446, + "grad_norm": 2569.600830078125, + "learning_rate": 5.6320000000000005e-06, + "loss": 192.4108, + "step": 7040 + }, + { + "epoch": 0.028482892084180075, + "grad_norm": 1064.5799560546875, + "learning_rate": 5.64e-06, + "loss": 215.6623, + "step": 7050 + }, + { + "epoch": 0.02852329334954771, + "grad_norm": 1072.6158447265625, + "learning_rate": 5.648e-06, + "loss": 244.8136, + "step": 7060 + }, + { + "epoch": 0.028563694614915338, + "grad_norm": 1529.186767578125, + "learning_rate": 5.6560000000000006e-06, + "loss": 151.7662, + "step": 7070 + }, + { + "epoch": 0.02860409588028297, + "grad_norm": 1600.055908203125, + "learning_rate": 5.664e-06, + "loss": 208.1255, + "step": 7080 + }, + { + "epoch": 0.0286444971456506, + "grad_norm": 2405.045166015625, + "learning_rate": 5.672000000000001e-06, + "loss": 213.3373, + "step": 7090 + }, + { + "epoch": 0.028684898411018234, + "grad_norm": 809.442138671875, + "learning_rate": 5.68e-06, + "loss": 272.1993, + "step": 7100 + }, + { + "epoch": 0.028725299676385863, + "grad_norm": 685.48486328125, + "learning_rate": 5.6880000000000004e-06, + "loss": 249.301, + "step": 7110 + }, + { + "epoch": 0.028765700941753496, + "grad_norm": 953.4014892578125, + "learning_rate": 5.696e-06, + "loss": 229.629, + "step": 7120 + }, + { + "epoch": 0.028806102207121126, + "grad_norm": 1117.8612060546875, + "learning_rate": 5.704000000000001e-06, + "loss": 182.2337, + "step": 7130 + }, + { + "epoch": 0.02884650347248876, + "grad_norm": 1129.2705078125, + "learning_rate": 5.7120000000000005e-06, + "loss": 227.1903, + "step": 7140 + }, + { + "epoch": 0.02888690473785639, + "grad_norm": 1262.8304443359375, + "learning_rate": 5.72e-06, + "loss": 197.0418, + "step": 7150 + }, + { + "epoch": 0.02892730600322402, + "grad_norm": 1090.8460693359375, + "learning_rate": 5.728e-06, + "loss": 258.6746, + "step": 7160 + }, + { + "epoch": 0.02896770726859165, + "grad_norm": 1301.744873046875, + "learning_rate": 5.736000000000001e-06, + "loss": 186.262, + "step": 7170 + }, + { + "epoch": 0.029008108533959284, + "grad_norm": 0.0, + "learning_rate": 5.744e-06, + "loss": 234.6007, + "step": 7180 + }, + { + "epoch": 0.029048509799326914, + "grad_norm": 1715.7432861328125, + "learning_rate": 5.752000000000001e-06, + "loss": 259.1334, + "step": 7190 + }, + { + "epoch": 0.029088911064694547, + "grad_norm": 6556.9111328125, + "learning_rate": 5.76e-06, + "loss": 214.17, + "step": 7200 + }, + { + "epoch": 0.029129312330062176, + "grad_norm": 1113.2286376953125, + "learning_rate": 5.7680000000000005e-06, + "loss": 196.0228, + "step": 7210 + }, + { + "epoch": 0.02916971359542981, + "grad_norm": 517.07421875, + "learning_rate": 5.776e-06, + "loss": 218.068, + "step": 7220 + }, + { + "epoch": 0.02921011486079744, + "grad_norm": 3351.35302734375, + "learning_rate": 5.784000000000001e-06, + "loss": 252.4301, + "step": 7230 + }, + { + "epoch": 0.029250516126165072, + "grad_norm": 766.5056762695312, + "learning_rate": 5.792000000000001e-06, + "loss": 182.3972, + "step": 7240 + }, + { + "epoch": 0.029290917391532702, + "grad_norm": 1673.7806396484375, + "learning_rate": 5.8e-06, + "loss": 216.9672, + "step": 7250 + }, + { + "epoch": 0.029331318656900335, + "grad_norm": 2629.049072265625, + "learning_rate": 5.808e-06, + "loss": 223.4914, + "step": 7260 + }, + { + "epoch": 0.029371719922267964, + "grad_norm": 0.0, + "learning_rate": 5.816000000000001e-06, + "loss": 152.4408, + "step": 7270 + }, + { + "epoch": 0.029412121187635597, + "grad_norm": 642.0879516601562, + "learning_rate": 5.8240000000000005e-06, + "loss": 223.5615, + "step": 7280 + }, + { + "epoch": 0.029452522453003227, + "grad_norm": 1864.696533203125, + "learning_rate": 5.832000000000001e-06, + "loss": 222.317, + "step": 7290 + }, + { + "epoch": 0.02949292371837086, + "grad_norm": 2587.0654296875, + "learning_rate": 5.84e-06, + "loss": 171.8688, + "step": 7300 + }, + { + "epoch": 0.02953332498373849, + "grad_norm": 1596.408203125, + "learning_rate": 5.848000000000001e-06, + "loss": 265.2696, + "step": 7310 + }, + { + "epoch": 0.029573726249106123, + "grad_norm": 1245.2196044921875, + "learning_rate": 5.856e-06, + "loss": 266.6465, + "step": 7320 + }, + { + "epoch": 0.029614127514473752, + "grad_norm": 2699.140625, + "learning_rate": 5.864000000000001e-06, + "loss": 227.3748, + "step": 7330 + }, + { + "epoch": 0.029654528779841385, + "grad_norm": 1752.6739501953125, + "learning_rate": 5.872000000000001e-06, + "loss": 192.5232, + "step": 7340 + }, + { + "epoch": 0.029694930045209015, + "grad_norm": 1439.0364990234375, + "learning_rate": 5.8800000000000005e-06, + "loss": 146.6349, + "step": 7350 + }, + { + "epoch": 0.029735331310576648, + "grad_norm": 1190.3038330078125, + "learning_rate": 5.888e-06, + "loss": 224.5771, + "step": 7360 + }, + { + "epoch": 0.029775732575944278, + "grad_norm": 1894.7503662109375, + "learning_rate": 5.896000000000001e-06, + "loss": 157.2506, + "step": 7370 + }, + { + "epoch": 0.02981613384131191, + "grad_norm": 1797.5352783203125, + "learning_rate": 5.9040000000000006e-06, + "loss": 186.649, + "step": 7380 + }, + { + "epoch": 0.02985653510667954, + "grad_norm": 1430.443115234375, + "learning_rate": 5.912e-06, + "loss": 220.6982, + "step": 7390 + }, + { + "epoch": 0.029896936372047173, + "grad_norm": 1276.9576416015625, + "learning_rate": 5.92e-06, + "loss": 304.9311, + "step": 7400 + }, + { + "epoch": 0.029937337637414803, + "grad_norm": 1519.4019775390625, + "learning_rate": 5.928000000000001e-06, + "loss": 210.0733, + "step": 7410 + }, + { + "epoch": 0.029977738902782436, + "grad_norm": 1359.1260986328125, + "learning_rate": 5.9360000000000004e-06, + "loss": 208.0187, + "step": 7420 + }, + { + "epoch": 0.030018140168150065, + "grad_norm": 1850.1422119140625, + "learning_rate": 5.944000000000001e-06, + "loss": 142.9344, + "step": 7430 + }, + { + "epoch": 0.0300585414335177, + "grad_norm": 3120.883544921875, + "learning_rate": 5.952e-06, + "loss": 170.6609, + "step": 7440 + }, + { + "epoch": 0.030098942698885328, + "grad_norm": 740.1887817382812, + "learning_rate": 5.9600000000000005e-06, + "loss": 219.5085, + "step": 7450 + }, + { + "epoch": 0.03013934396425296, + "grad_norm": 1086.6651611328125, + "learning_rate": 5.968e-06, + "loss": 241.9711, + "step": 7460 + }, + { + "epoch": 0.03017974522962059, + "grad_norm": 1672.157470703125, + "learning_rate": 5.976000000000001e-06, + "loss": 191.8062, + "step": 7470 + }, + { + "epoch": 0.030220146494988224, + "grad_norm": 1653.2451171875, + "learning_rate": 5.984000000000001e-06, + "loss": 205.3574, + "step": 7480 + }, + { + "epoch": 0.030260547760355853, + "grad_norm": 1397.8525390625, + "learning_rate": 5.992e-06, + "loss": 202.0153, + "step": 7490 + }, + { + "epoch": 0.030300949025723486, + "grad_norm": 2837.261474609375, + "learning_rate": 6e-06, + "loss": 221.9876, + "step": 7500 + }, + { + "epoch": 0.030341350291091116, + "grad_norm": 4730.8916015625, + "learning_rate": 6.008000000000001e-06, + "loss": 302.4308, + "step": 7510 + }, + { + "epoch": 0.03038175155645875, + "grad_norm": 1000.0179443359375, + "learning_rate": 6.0160000000000005e-06, + "loss": 196.5067, + "step": 7520 + }, + { + "epoch": 0.03042215282182638, + "grad_norm": 1019.1756591796875, + "learning_rate": 6.024000000000001e-06, + "loss": 184.9854, + "step": 7530 + }, + { + "epoch": 0.030462554087194012, + "grad_norm": 1395.87744140625, + "learning_rate": 6.032e-06, + "loss": 249.2624, + "step": 7540 + }, + { + "epoch": 0.03050295535256164, + "grad_norm": 4622.18798828125, + "learning_rate": 6.040000000000001e-06, + "loss": 160.7419, + "step": 7550 + }, + { + "epoch": 0.030543356617929274, + "grad_norm": 1667.111083984375, + "learning_rate": 6.048e-06, + "loss": 175.9941, + "step": 7560 + }, + { + "epoch": 0.030583757883296904, + "grad_norm": 2928.287841796875, + "learning_rate": 6.056000000000001e-06, + "loss": 141.5608, + "step": 7570 + }, + { + "epoch": 0.030624159148664537, + "grad_norm": 2412.287841796875, + "learning_rate": 6.064000000000001e-06, + "loss": 325.7221, + "step": 7580 + }, + { + "epoch": 0.030664560414032167, + "grad_norm": 707.0420532226562, + "learning_rate": 6.0720000000000005e-06, + "loss": 269.7903, + "step": 7590 + }, + { + "epoch": 0.0307049616793998, + "grad_norm": 2488.442138671875, + "learning_rate": 6.08e-06, + "loss": 277.1691, + "step": 7600 + }, + { + "epoch": 0.03074536294476743, + "grad_norm": 1018.7144775390625, + "learning_rate": 6.088000000000001e-06, + "loss": 266.7885, + "step": 7610 + }, + { + "epoch": 0.030785764210135062, + "grad_norm": 794.6591796875, + "learning_rate": 6.096000000000001e-06, + "loss": 197.3139, + "step": 7620 + }, + { + "epoch": 0.030826165475502692, + "grad_norm": 1089.7852783203125, + "learning_rate": 6.104000000000001e-06, + "loss": 219.666, + "step": 7630 + }, + { + "epoch": 0.030866566740870325, + "grad_norm": 895.769287109375, + "learning_rate": 6.112e-06, + "loss": 161.9822, + "step": 7640 + }, + { + "epoch": 0.030906968006237955, + "grad_norm": 2226.711669921875, + "learning_rate": 6.120000000000001e-06, + "loss": 183.5556, + "step": 7650 + }, + { + "epoch": 0.030947369271605588, + "grad_norm": 915.3427124023438, + "learning_rate": 6.1280000000000005e-06, + "loss": 141.0757, + "step": 7660 + }, + { + "epoch": 0.030987770536973217, + "grad_norm": 1243.2064208984375, + "learning_rate": 6.136000000000001e-06, + "loss": 170.3279, + "step": 7670 + }, + { + "epoch": 0.03102817180234085, + "grad_norm": 2503.118408203125, + "learning_rate": 6.144e-06, + "loss": 263.5051, + "step": 7680 + }, + { + "epoch": 0.03106857306770848, + "grad_norm": 3317.427734375, + "learning_rate": 6.1520000000000006e-06, + "loss": 225.3879, + "step": 7690 + }, + { + "epoch": 0.031108974333076113, + "grad_norm": 1108.700439453125, + "learning_rate": 6.16e-06, + "loss": 188.1277, + "step": 7700 + }, + { + "epoch": 0.031149375598443742, + "grad_norm": 1502.6771240234375, + "learning_rate": 6.168000000000001e-06, + "loss": 210.462, + "step": 7710 + }, + { + "epoch": 0.031189776863811376, + "grad_norm": 1329.4693603515625, + "learning_rate": 6.176000000000001e-06, + "loss": 149.7618, + "step": 7720 + }, + { + "epoch": 0.031230178129179005, + "grad_norm": 598.7182006835938, + "learning_rate": 6.184e-06, + "loss": 198.0457, + "step": 7730 + }, + { + "epoch": 0.03127057939454664, + "grad_norm": 1033.920654296875, + "learning_rate": 6.192e-06, + "loss": 130.0827, + "step": 7740 + }, + { + "epoch": 0.03131098065991427, + "grad_norm": 1291.9276123046875, + "learning_rate": 6.200000000000001e-06, + "loss": 202.4379, + "step": 7750 + }, + { + "epoch": 0.0313513819252819, + "grad_norm": 2443.810791015625, + "learning_rate": 6.2080000000000005e-06, + "loss": 261.256, + "step": 7760 + }, + { + "epoch": 0.03139178319064953, + "grad_norm": 884.2955322265625, + "learning_rate": 6.216000000000001e-06, + "loss": 167.975, + "step": 7770 + }, + { + "epoch": 0.03143218445601716, + "grad_norm": 1252.4766845703125, + "learning_rate": 6.224e-06, + "loss": 215.3594, + "step": 7780 + }, + { + "epoch": 0.031472585721384796, + "grad_norm": 1382.6903076171875, + "learning_rate": 6.232000000000001e-06, + "loss": 194.3709, + "step": 7790 + }, + { + "epoch": 0.03151298698675242, + "grad_norm": 1545.613525390625, + "learning_rate": 6.24e-06, + "loss": 219.0374, + "step": 7800 + }, + { + "epoch": 0.031553388252120056, + "grad_norm": 1265.0899658203125, + "learning_rate": 6.248000000000001e-06, + "loss": 151.8752, + "step": 7810 + }, + { + "epoch": 0.03159378951748769, + "grad_norm": 1623.3619384765625, + "learning_rate": 6.256000000000001e-06, + "loss": 242.5646, + "step": 7820 + }, + { + "epoch": 0.03163419078285532, + "grad_norm": 592.3634033203125, + "learning_rate": 6.264e-06, + "loss": 164.5361, + "step": 7830 + }, + { + "epoch": 0.03167459204822295, + "grad_norm": 5308.93701171875, + "learning_rate": 6.272e-06, + "loss": 208.5307, + "step": 7840 + }, + { + "epoch": 0.03171499331359058, + "grad_norm": 2834.79833984375, + "learning_rate": 6.280000000000001e-06, + "loss": 221.2382, + "step": 7850 + }, + { + "epoch": 0.031755394578958214, + "grad_norm": 1279.97314453125, + "learning_rate": 6.288000000000001e-06, + "loss": 176.2134, + "step": 7860 + }, + { + "epoch": 0.03179579584432585, + "grad_norm": 1794.7216796875, + "learning_rate": 6.296000000000001e-06, + "loss": 181.0496, + "step": 7870 + }, + { + "epoch": 0.03183619710969347, + "grad_norm": 5191.13232421875, + "learning_rate": 6.304e-06, + "loss": 316.8718, + "step": 7880 + }, + { + "epoch": 0.031876598375061106, + "grad_norm": 1317.33447265625, + "learning_rate": 6.312000000000001e-06, + "loss": 152.3099, + "step": 7890 + }, + { + "epoch": 0.03191699964042874, + "grad_norm": 977.89990234375, + "learning_rate": 6.3200000000000005e-06, + "loss": 130.9134, + "step": 7900 + }, + { + "epoch": 0.03195740090579637, + "grad_norm": 651.8931884765625, + "learning_rate": 6.328000000000001e-06, + "loss": 223.651, + "step": 7910 + }, + { + "epoch": 0.031997802171164, + "grad_norm": 767.2454223632812, + "learning_rate": 6.336000000000001e-06, + "loss": 229.737, + "step": 7920 + }, + { + "epoch": 0.03203820343653163, + "grad_norm": 1214.2449951171875, + "learning_rate": 6.344e-06, + "loss": 220.1174, + "step": 7930 + }, + { + "epoch": 0.032078604701899265, + "grad_norm": 0.0, + "learning_rate": 6.352e-06, + "loss": 153.6576, + "step": 7940 + }, + { + "epoch": 0.0321190059672669, + "grad_norm": 1004.9306640625, + "learning_rate": 6.360000000000001e-06, + "loss": 240.569, + "step": 7950 + }, + { + "epoch": 0.032159407232634524, + "grad_norm": 1504.4068603515625, + "learning_rate": 6.368000000000001e-06, + "loss": 274.5606, + "step": 7960 + }, + { + "epoch": 0.03219980849800216, + "grad_norm": 2671.022705078125, + "learning_rate": 6.376e-06, + "loss": 274.2913, + "step": 7970 + }, + { + "epoch": 0.03224020976336979, + "grad_norm": 2131.018798828125, + "learning_rate": 6.384e-06, + "loss": 217.4296, + "step": 7980 + }, + { + "epoch": 0.03228061102873742, + "grad_norm": 1160.2044677734375, + "learning_rate": 6.392000000000001e-06, + "loss": 176.5127, + "step": 7990 + }, + { + "epoch": 0.03232101229410505, + "grad_norm": 740.7154541015625, + "learning_rate": 6.4000000000000006e-06, + "loss": 231.2125, + "step": 8000 } ], "logging_steps": 10,