{ "best_metric": 0.8260869565217391, "best_model_checkpoint": "CTMAE2_CS_V7_6/checkpoint-3432", "epoch": 49.01367741935484, "eval_steps": 500, "global_step": 7750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012903225806451613, "grad_norm": 2.5513670444488525, "learning_rate": 1.2903225806451614e-07, "loss": 0.6817, "step": 10 }, { "epoch": 0.0025806451612903226, "grad_norm": 2.1570885181427, "learning_rate": 2.580645161290323e-07, "loss": 0.6718, "step": 20 }, { "epoch": 0.003870967741935484, "grad_norm": 3.7924509048461914, "learning_rate": 3.870967741935484e-07, "loss": 0.664, "step": 30 }, { "epoch": 0.005161290322580645, "grad_norm": 3.294450044631958, "learning_rate": 5.161290322580646e-07, "loss": 0.6574, "step": 40 }, { "epoch": 0.0064516129032258064, "grad_norm": 4.026410102844238, "learning_rate": 6.451612903225807e-07, "loss": 0.6455, "step": 50 }, { "epoch": 0.007741935483870968, "grad_norm": 2.2546310424804688, "learning_rate": 7.741935483870968e-07, "loss": 0.7021, "step": 60 }, { "epoch": 0.00903225806451613, "grad_norm": 17.691654205322266, "learning_rate": 9.032258064516129e-07, "loss": 0.6599, "step": 70 }, { "epoch": 0.01032258064516129, "grad_norm": 2.6751585006713867, "learning_rate": 1.0322580645161291e-06, "loss": 0.687, "step": 80 }, { "epoch": 0.011612903225806452, "grad_norm": 3.2990963459014893, "learning_rate": 1.1612903225806454e-06, "loss": 0.6226, "step": 90 }, { "epoch": 0.012903225806451613, "grad_norm": 13.228951454162598, "learning_rate": 1.2903225806451614e-06, "loss": 0.6137, "step": 100 }, { "epoch": 0.014193548387096775, "grad_norm": 9.068791389465332, "learning_rate": 1.4193548387096776e-06, "loss": 0.593, "step": 110 }, { "epoch": 0.015483870967741935, "grad_norm": 7.151644706726074, "learning_rate": 1.5483870967741937e-06, "loss": 0.6282, "step": 120 }, { "epoch": 0.016774193548387096, "grad_norm": 5.203014373779297, "learning_rate": 1.67741935483871e-06, "loss": 0.644, "step": 130 }, { "epoch": 0.01806451612903226, "grad_norm": 5.906531810760498, "learning_rate": 1.8064516129032258e-06, "loss": 0.7139, "step": 140 }, { "epoch": 0.01935483870967742, "grad_norm": 5.024239540100098, "learning_rate": 1.935483870967742e-06, "loss": 0.6413, "step": 150 }, { "epoch": 0.020129032258064516, "eval_accuracy": 0.45652173913043476, "eval_loss": 0.7947535514831543, "eval_runtime": 14.8738, "eval_samples_per_second": 3.093, "eval_steps_per_second": 0.672, "step": 156 }, { "epoch": 1.0005161290322582, "grad_norm": 5.875014781951904, "learning_rate": 2.0645161290322582e-06, "loss": 0.5828, "step": 160 }, { "epoch": 1.0018064516129033, "grad_norm": 5.040384292602539, "learning_rate": 2.1935483870967745e-06, "loss": 0.611, "step": 170 }, { "epoch": 1.0030967741935484, "grad_norm": 4.287607669830322, "learning_rate": 2.3225806451612907e-06, "loss": 0.6525, "step": 180 }, { "epoch": 1.0043870967741935, "grad_norm": 5.1059112548828125, "learning_rate": 2.4516129032258066e-06, "loss": 0.6126, "step": 190 }, { "epoch": 1.0056774193548388, "grad_norm": 4.623972415924072, "learning_rate": 2.580645161290323e-06, "loss": 0.6107, "step": 200 }, { "epoch": 1.0069677419354839, "grad_norm": 5.343932151794434, "learning_rate": 2.709677419354839e-06, "loss": 0.6286, "step": 210 }, { "epoch": 1.008258064516129, "grad_norm": 10.037893295288086, "learning_rate": 2.8387096774193553e-06, "loss": 0.6688, "step": 220 }, { "epoch": 1.0095483870967743, "grad_norm": 9.156920433044434, "learning_rate": 2.967741935483871e-06, "loss": 0.6537, "step": 230 }, { "epoch": 1.0108387096774194, "grad_norm": 5.66978645324707, "learning_rate": 3.0967741935483874e-06, "loss": 0.6444, "step": 240 }, { "epoch": 1.0121290322580645, "grad_norm": 5.309883117675781, "learning_rate": 3.225806451612903e-06, "loss": 0.6362, "step": 250 }, { "epoch": 1.0134193548387096, "grad_norm": 10.887988090515137, "learning_rate": 3.35483870967742e-06, "loss": 0.6312, "step": 260 }, { "epoch": 1.014709677419355, "grad_norm": 11.26144027709961, "learning_rate": 3.4838709677419357e-06, "loss": 0.664, "step": 270 }, { "epoch": 1.016, "grad_norm": 9.947885513305664, "learning_rate": 3.6129032258064515e-06, "loss": 0.6263, "step": 280 }, { "epoch": 1.0172903225806451, "grad_norm": 7.827743053436279, "learning_rate": 3.741935483870968e-06, "loss": 0.6035, "step": 290 }, { "epoch": 1.0185806451612902, "grad_norm": 13.62060260772705, "learning_rate": 3.870967741935484e-06, "loss": 0.5709, "step": 300 }, { "epoch": 1.0198709677419355, "grad_norm": 7.442618370056152, "learning_rate": 4.000000000000001e-06, "loss": 0.5022, "step": 310 }, { "epoch": 1.0201290322580645, "eval_accuracy": 0.45652173913043476, "eval_loss": 0.8548817038536072, "eval_runtime": 13.6498, "eval_samples_per_second": 3.37, "eval_steps_per_second": 0.733, "step": 312 }, { "epoch": 2.0010322580645163, "grad_norm": 12.483841896057129, "learning_rate": 4.1290322580645165e-06, "loss": 0.6708, "step": 320 }, { "epoch": 2.002322580645161, "grad_norm": 9.477625846862793, "learning_rate": 4.258064516129032e-06, "loss": 0.5875, "step": 330 }, { "epoch": 2.0036129032258065, "grad_norm": 10.442103385925293, "learning_rate": 4.387096774193549e-06, "loss": 0.4635, "step": 340 }, { "epoch": 2.0049032258064514, "grad_norm": 11.479347229003906, "learning_rate": 4.516129032258065e-06, "loss": 0.5914, "step": 350 }, { "epoch": 2.0061935483870967, "grad_norm": 22.441858291625977, "learning_rate": 4.6451612903225815e-06, "loss": 0.5727, "step": 360 }, { "epoch": 2.007483870967742, "grad_norm": 12.805633544921875, "learning_rate": 4.774193548387097e-06, "loss": 0.5004, "step": 370 }, { "epoch": 2.008774193548387, "grad_norm": 12.923316955566406, "learning_rate": 4.903225806451613e-06, "loss": 0.466, "step": 380 }, { "epoch": 2.0100645161290323, "grad_norm": 11.742849349975586, "learning_rate": 5.032258064516129e-06, "loss": 0.5136, "step": 390 }, { "epoch": 2.0113548387096776, "grad_norm": 12.677641868591309, "learning_rate": 5.161290322580646e-06, "loss": 0.6058, "step": 400 }, { "epoch": 2.0126451612903224, "grad_norm": 11.963651657104492, "learning_rate": 5.290322580645162e-06, "loss": 0.551, "step": 410 }, { "epoch": 2.0139354838709678, "grad_norm": 11.98446273803711, "learning_rate": 5.419354838709678e-06, "loss": 0.5064, "step": 420 }, { "epoch": 2.015225806451613, "grad_norm": 10.235251426696777, "learning_rate": 5.548387096774194e-06, "loss": 0.607, "step": 430 }, { "epoch": 2.016516129032258, "grad_norm": 19.838441848754883, "learning_rate": 5.677419354838711e-06, "loss": 0.5645, "step": 440 }, { "epoch": 2.0178064516129033, "grad_norm": 8.506612777709961, "learning_rate": 5.806451612903226e-06, "loss": 0.5118, "step": 450 }, { "epoch": 2.0190967741935486, "grad_norm": 70.65406799316406, "learning_rate": 5.935483870967742e-06, "loss": 0.6512, "step": 460 }, { "epoch": 2.0201290322580645, "eval_accuracy": 0.5869565217391305, "eval_loss": 0.6670534610748291, "eval_runtime": 12.9029, "eval_samples_per_second": 3.565, "eval_steps_per_second": 0.775, "step": 468 }, { "epoch": 3.000258064516129, "grad_norm": 7.6030683517456055, "learning_rate": 6.064516129032259e-06, "loss": 0.5794, "step": 470 }, { "epoch": 3.0015483870967743, "grad_norm": 7.256181716918945, "learning_rate": 6.193548387096775e-06, "loss": 0.5691, "step": 480 }, { "epoch": 3.002838709677419, "grad_norm": 2.9833614826202393, "learning_rate": 6.3225806451612906e-06, "loss": 0.6358, "step": 490 }, { "epoch": 3.0041290322580645, "grad_norm": 2.7177481651306152, "learning_rate": 6.451612903225806e-06, "loss": 0.6, "step": 500 }, { "epoch": 3.00541935483871, "grad_norm": 5.681326866149902, "learning_rate": 6.580645161290323e-06, "loss": 0.5881, "step": 510 }, { "epoch": 3.0067096774193547, "grad_norm": 8.107717514038086, "learning_rate": 6.70967741935484e-06, "loss": 0.5759, "step": 520 }, { "epoch": 3.008, "grad_norm": 8.868525505065918, "learning_rate": 6.838709677419355e-06, "loss": 0.5422, "step": 530 }, { "epoch": 3.0092903225806453, "grad_norm": 22.951568603515625, "learning_rate": 6.967741935483871e-06, "loss": 0.6157, "step": 540 }, { "epoch": 3.01058064516129, "grad_norm": 14.757305145263672, "learning_rate": 7.096774193548388e-06, "loss": 0.5238, "step": 550 }, { "epoch": 3.0118709677419355, "grad_norm": 21.001012802124023, "learning_rate": 7.225806451612903e-06, "loss": 0.497, "step": 560 }, { "epoch": 3.013161290322581, "grad_norm": 3.7305068969726562, "learning_rate": 7.35483870967742e-06, "loss": 0.4675, "step": 570 }, { "epoch": 3.0144516129032257, "grad_norm": 23.764007568359375, "learning_rate": 7.483870967741936e-06, "loss": 0.5675, "step": 580 }, { "epoch": 3.015741935483871, "grad_norm": 7.207258701324463, "learning_rate": 7.612903225806451e-06, "loss": 0.7603, "step": 590 }, { "epoch": 3.017032258064516, "grad_norm": 4.507648944854736, "learning_rate": 7.741935483870968e-06, "loss": 0.6185, "step": 600 }, { "epoch": 3.0183225806451612, "grad_norm": 11.389603614807129, "learning_rate": 7.870967741935484e-06, "loss": 0.526, "step": 610 }, { "epoch": 3.0196129032258066, "grad_norm": 7.626564979553223, "learning_rate": 8.000000000000001e-06, "loss": 0.5026, "step": 620 }, { "epoch": 3.0201290322580645, "eval_accuracy": 0.6521739130434783, "eval_loss": 0.6542143821716309, "eval_runtime": 13.6189, "eval_samples_per_second": 3.378, "eval_steps_per_second": 0.734, "step": 624 }, { "epoch": 4.000774193548387, "grad_norm": 3.567378044128418, "learning_rate": 8.129032258064517e-06, "loss": 0.4565, "step": 630 }, { "epoch": 4.002064516129033, "grad_norm": 8.64986515045166, "learning_rate": 8.258064516129033e-06, "loss": 0.7222, "step": 640 }, { "epoch": 4.003354838709678, "grad_norm": 7.742122173309326, "learning_rate": 8.387096774193549e-06, "loss": 0.5627, "step": 650 }, { "epoch": 4.004645161290322, "grad_norm": 8.357722282409668, "learning_rate": 8.516129032258065e-06, "loss": 0.4885, "step": 660 }, { "epoch": 4.005935483870967, "grad_norm": 16.816993713378906, "learning_rate": 8.64516129032258e-06, "loss": 0.4283, "step": 670 }, { "epoch": 4.007225806451613, "grad_norm": 13.521672248840332, "learning_rate": 8.774193548387098e-06, "loss": 0.4759, "step": 680 }, { "epoch": 4.008516129032258, "grad_norm": 11.313436508178711, "learning_rate": 8.903225806451614e-06, "loss": 0.6334, "step": 690 }, { "epoch": 4.009806451612903, "grad_norm": 8.696495056152344, "learning_rate": 9.03225806451613e-06, "loss": 0.4834, "step": 700 }, { "epoch": 4.011096774193549, "grad_norm": 7.647945880889893, "learning_rate": 9.161290322580645e-06, "loss": 0.549, "step": 710 }, { "epoch": 4.0123870967741935, "grad_norm": 4.169041633605957, "learning_rate": 9.290322580645163e-06, "loss": 0.3946, "step": 720 }, { "epoch": 4.013677419354838, "grad_norm": 8.994153022766113, "learning_rate": 9.419354838709677e-06, "loss": 0.4346, "step": 730 }, { "epoch": 4.014967741935484, "grad_norm": 25.87916374206543, "learning_rate": 9.548387096774195e-06, "loss": 0.7258, "step": 740 }, { "epoch": 4.016258064516129, "grad_norm": 13.010149002075195, "learning_rate": 9.67741935483871e-06, "loss": 0.5168, "step": 750 }, { "epoch": 4.017548387096774, "grad_norm": 10.839446067810059, "learning_rate": 9.806451612903226e-06, "loss": 0.4877, "step": 760 }, { "epoch": 4.01883870967742, "grad_norm": 6.985745906829834, "learning_rate": 9.935483870967742e-06, "loss": 0.4667, "step": 770 }, { "epoch": 4.0201290322580645, "grad_norm": 6.948948383331299, "learning_rate": 9.992831541218639e-06, "loss": 0.5752, "step": 780 }, { "epoch": 4.0201290322580645, "eval_accuracy": 0.717391304347826, "eval_loss": 0.6095772385597229, "eval_runtime": 12.9584, "eval_samples_per_second": 3.55, "eval_steps_per_second": 0.772, "step": 780 }, { "epoch": 5.001290322580645, "grad_norm": 5.174366474151611, "learning_rate": 9.978494623655915e-06, "loss": 0.4587, "step": 790 }, { "epoch": 5.002580645161291, "grad_norm": 10.11021614074707, "learning_rate": 9.96415770609319e-06, "loss": 0.6132, "step": 800 }, { "epoch": 5.0038709677419355, "grad_norm": 10.487930297851562, "learning_rate": 9.949820788530466e-06, "loss": 0.4209, "step": 810 }, { "epoch": 5.00516129032258, "grad_norm": 19.883081436157227, "learning_rate": 9.935483870967742e-06, "loss": 0.7923, "step": 820 }, { "epoch": 5.006451612903226, "grad_norm": 17.361183166503906, "learning_rate": 9.921146953405018e-06, "loss": 0.4802, "step": 830 }, { "epoch": 5.007741935483871, "grad_norm": 15.686917304992676, "learning_rate": 9.906810035842294e-06, "loss": 0.4867, "step": 840 }, { "epoch": 5.009032258064516, "grad_norm": 24.809295654296875, "learning_rate": 9.89247311827957e-06, "loss": 0.5621, "step": 850 }, { "epoch": 5.010322580645162, "grad_norm": 21.643070220947266, "learning_rate": 9.878136200716847e-06, "loss": 0.5553, "step": 860 }, { "epoch": 5.0116129032258065, "grad_norm": 15.540183067321777, "learning_rate": 9.863799283154123e-06, "loss": 0.5442, "step": 870 }, { "epoch": 5.012903225806451, "grad_norm": 4.480224132537842, "learning_rate": 9.8494623655914e-06, "loss": 0.4327, "step": 880 }, { "epoch": 5.014193548387097, "grad_norm": 3.1469571590423584, "learning_rate": 9.835125448028676e-06, "loss": 0.3235, "step": 890 }, { "epoch": 5.015483870967742, "grad_norm": 22.272279739379883, "learning_rate": 9.820788530465952e-06, "loss": 0.5458, "step": 900 }, { "epoch": 5.016774193548387, "grad_norm": 12.204292297363281, "learning_rate": 9.806451612903226e-06, "loss": 0.3684, "step": 910 }, { "epoch": 5.018064516129032, "grad_norm": 35.64434814453125, "learning_rate": 9.792114695340502e-06, "loss": 0.5122, "step": 920 }, { "epoch": 5.019354838709678, "grad_norm": 28.54721450805664, "learning_rate": 9.777777777777779e-06, "loss": 0.5908, "step": 930 }, { "epoch": 5.0201290322580645, "eval_accuracy": 0.5217391304347826, "eval_loss": 0.7500813603401184, "eval_runtime": 12.9028, "eval_samples_per_second": 3.565, "eval_steps_per_second": 0.775, "step": 936 }, { "epoch": 6.000516129032258, "grad_norm": 5.353494167327881, "learning_rate": 9.763440860215055e-06, "loss": 0.6537, "step": 940 }, { "epoch": 6.001806451612903, "grad_norm": 8.211503982543945, "learning_rate": 9.749103942652331e-06, "loss": 0.4766, "step": 950 }, { "epoch": 6.003096774193549, "grad_norm": 4.954970836639404, "learning_rate": 9.734767025089607e-06, "loss": 0.554, "step": 960 }, { "epoch": 6.0043870967741935, "grad_norm": 7.555273532867432, "learning_rate": 9.720430107526882e-06, "loss": 0.5044, "step": 970 }, { "epoch": 6.005677419354838, "grad_norm": 10.074644088745117, "learning_rate": 9.706093189964158e-06, "loss": 0.3389, "step": 980 }, { "epoch": 6.006967741935484, "grad_norm": 40.93281936645508, "learning_rate": 9.691756272401434e-06, "loss": 0.4695, "step": 990 }, { "epoch": 6.008258064516129, "grad_norm": 22.333370208740234, "learning_rate": 9.67741935483871e-06, "loss": 0.5501, "step": 1000 }, { "epoch": 6.009548387096774, "grad_norm": 10.75607967376709, "learning_rate": 9.663082437275987e-06, "loss": 0.3417, "step": 1010 }, { "epoch": 6.01083870967742, "grad_norm": 11.736294746398926, "learning_rate": 9.648745519713263e-06, "loss": 0.5667, "step": 1020 }, { "epoch": 6.0121290322580645, "grad_norm": 13.930486679077148, "learning_rate": 9.634408602150539e-06, "loss": 0.6093, "step": 1030 }, { "epoch": 6.013419354838709, "grad_norm": 16.155759811401367, "learning_rate": 9.620071684587814e-06, "loss": 0.4456, "step": 1040 }, { "epoch": 6.014709677419355, "grad_norm": 39.24663543701172, "learning_rate": 9.60573476702509e-06, "loss": 0.4708, "step": 1050 }, { "epoch": 6.016, "grad_norm": 16.95737648010254, "learning_rate": 9.591397849462366e-06, "loss": 0.6944, "step": 1060 }, { "epoch": 6.017290322580645, "grad_norm": 5.135772705078125, "learning_rate": 9.577060931899642e-06, "loss": 0.3379, "step": 1070 }, { "epoch": 6.018580645161291, "grad_norm": 21.27039909362793, "learning_rate": 9.562724014336918e-06, "loss": 0.4247, "step": 1080 }, { "epoch": 6.0198709677419355, "grad_norm": 10.397266387939453, "learning_rate": 9.548387096774195e-06, "loss": 0.4882, "step": 1090 }, { "epoch": 6.0201290322580645, "eval_accuracy": 0.6304347826086957, "eval_loss": 0.7651864886283875, "eval_runtime": 13.0824, "eval_samples_per_second": 3.516, "eval_steps_per_second": 0.764, "step": 1092 }, { "epoch": 7.001032258064516, "grad_norm": 1.6547272205352783, "learning_rate": 9.53405017921147e-06, "loss": 0.4434, "step": 1100 }, { "epoch": 7.002322580645162, "grad_norm": 4.0995683670043945, "learning_rate": 9.519713261648747e-06, "loss": 0.7862, "step": 1110 }, { "epoch": 7.0036129032258065, "grad_norm": 11.412741661071777, "learning_rate": 9.505376344086023e-06, "loss": 0.3906, "step": 1120 }, { "epoch": 7.004903225806451, "grad_norm": 21.728618621826172, "learning_rate": 9.491039426523298e-06, "loss": 0.3443, "step": 1130 }, { "epoch": 7.006193548387097, "grad_norm": 12.444160461425781, "learning_rate": 9.476702508960574e-06, "loss": 0.6742, "step": 1140 }, { "epoch": 7.007483870967742, "grad_norm": 7.956286907196045, "learning_rate": 9.46236559139785e-06, "loss": 0.2696, "step": 1150 }, { "epoch": 7.008774193548387, "grad_norm": 3.1120765209198, "learning_rate": 9.448028673835126e-06, "loss": 0.5785, "step": 1160 }, { "epoch": 7.010064516129033, "grad_norm": 14.62903881072998, "learning_rate": 9.433691756272403e-06, "loss": 0.4837, "step": 1170 }, { "epoch": 7.011354838709678, "grad_norm": 7.206798553466797, "learning_rate": 9.419354838709677e-06, "loss": 0.4467, "step": 1180 }, { "epoch": 7.0126451612903224, "grad_norm": 11.134936332702637, "learning_rate": 9.405017921146953e-06, "loss": 0.4307, "step": 1190 }, { "epoch": 7.013935483870967, "grad_norm": 6.921154499053955, "learning_rate": 9.39068100358423e-06, "loss": 0.5121, "step": 1200 }, { "epoch": 7.015225806451613, "grad_norm": 14.086562156677246, "learning_rate": 9.376344086021506e-06, "loss": 0.3154, "step": 1210 }, { "epoch": 7.016516129032258, "grad_norm": 11.563647270202637, "learning_rate": 9.362007168458782e-06, "loss": 0.4912, "step": 1220 }, { "epoch": 7.017806451612903, "grad_norm": 17.971603393554688, "learning_rate": 9.347670250896058e-06, "loss": 0.4507, "step": 1230 }, { "epoch": 7.019096774193549, "grad_norm": 12.179518699645996, "learning_rate": 9.333333333333334e-06, "loss": 0.4128, "step": 1240 }, { "epoch": 7.0201290322580645, "eval_accuracy": 0.6521739130434783, "eval_loss": 0.7746206521987915, "eval_runtime": 12.226, "eval_samples_per_second": 3.762, "eval_steps_per_second": 0.818, "step": 1248 }, { "epoch": 8.00025806451613, "grad_norm": 21.103506088256836, "learning_rate": 9.31899641577061e-06, "loss": 0.4607, "step": 1250 }, { "epoch": 8.001548387096774, "grad_norm": 19.628625869750977, "learning_rate": 9.304659498207887e-06, "loss": 0.5058, "step": 1260 }, { "epoch": 8.00283870967742, "grad_norm": 16.78925323486328, "learning_rate": 9.290322580645163e-06, "loss": 0.5021, "step": 1270 }, { "epoch": 8.004129032258065, "grad_norm": 16.168333053588867, "learning_rate": 9.27598566308244e-06, "loss": 0.6082, "step": 1280 }, { "epoch": 8.00541935483871, "grad_norm": 14.910419464111328, "learning_rate": 9.261648745519714e-06, "loss": 0.5062, "step": 1290 }, { "epoch": 8.006709677419355, "grad_norm": 11.717639923095703, "learning_rate": 9.24731182795699e-06, "loss": 0.5506, "step": 1300 }, { "epoch": 8.008, "grad_norm": 16.124109268188477, "learning_rate": 9.232974910394266e-06, "loss": 0.3168, "step": 1310 }, { "epoch": 8.009290322580645, "grad_norm": 11.513866424560547, "learning_rate": 9.218637992831542e-06, "loss": 0.4799, "step": 1320 }, { "epoch": 8.01058064516129, "grad_norm": 3.4021828174591064, "learning_rate": 9.204301075268819e-06, "loss": 0.3777, "step": 1330 }, { "epoch": 8.011870967741935, "grad_norm": 3.518261194229126, "learning_rate": 9.189964157706093e-06, "loss": 0.4202, "step": 1340 }, { "epoch": 8.01316129032258, "grad_norm": 26.859046936035156, "learning_rate": 9.17562724014337e-06, "loss": 0.405, "step": 1350 }, { "epoch": 8.014451612903226, "grad_norm": 36.80097961425781, "learning_rate": 9.161290322580645e-06, "loss": 0.3284, "step": 1360 }, { "epoch": 8.01574193548387, "grad_norm": 24.820600509643555, "learning_rate": 9.146953405017922e-06, "loss": 0.537, "step": 1370 }, { "epoch": 8.017032258064516, "grad_norm": 10.015302658081055, "learning_rate": 9.132616487455198e-06, "loss": 0.3947, "step": 1380 }, { "epoch": 8.018322580645162, "grad_norm": 34.552711486816406, "learning_rate": 9.118279569892474e-06, "loss": 0.4069, "step": 1390 }, { "epoch": 8.019612903225806, "grad_norm": 10.10966682434082, "learning_rate": 9.10394265232975e-06, "loss": 0.4414, "step": 1400 }, { "epoch": 8.020129032258065, "eval_accuracy": 0.717391304347826, "eval_loss": 0.5973219871520996, "eval_runtime": 11.5068, "eval_samples_per_second": 3.998, "eval_steps_per_second": 0.869, "step": 1404 }, { "epoch": 9.000774193548388, "grad_norm": 23.22496795654297, "learning_rate": 9.089605734767026e-06, "loss": 0.4189, "step": 1410 }, { "epoch": 9.002064516129032, "grad_norm": 5.423180103302002, "learning_rate": 9.075268817204301e-06, "loss": 0.5723, "step": 1420 }, { "epoch": 9.003354838709678, "grad_norm": 24.04204750061035, "learning_rate": 9.060931899641577e-06, "loss": 0.2991, "step": 1430 }, { "epoch": 9.004645161290323, "grad_norm": 7.290071964263916, "learning_rate": 9.046594982078853e-06, "loss": 0.1849, "step": 1440 }, { "epoch": 9.005935483870967, "grad_norm": 24.317068099975586, "learning_rate": 9.03225806451613e-06, "loss": 0.3501, "step": 1450 }, { "epoch": 9.007225806451613, "grad_norm": 2.3210699558258057, "learning_rate": 9.017921146953406e-06, "loss": 0.3368, "step": 1460 }, { "epoch": 9.008516129032259, "grad_norm": 20.747764587402344, "learning_rate": 9.003584229390682e-06, "loss": 0.362, "step": 1470 }, { "epoch": 9.009806451612903, "grad_norm": 18.469575881958008, "learning_rate": 8.989247311827958e-06, "loss": 0.4049, "step": 1480 }, { "epoch": 9.011096774193549, "grad_norm": 28.91852569580078, "learning_rate": 8.974910394265234e-06, "loss": 0.3612, "step": 1490 }, { "epoch": 9.012387096774194, "grad_norm": 25.942922592163086, "learning_rate": 8.96057347670251e-06, "loss": 0.533, "step": 1500 }, { "epoch": 9.013677419354838, "grad_norm": 42.32680130004883, "learning_rate": 8.946236559139785e-06, "loss": 0.6108, "step": 1510 }, { "epoch": 9.014967741935484, "grad_norm": 27.841663360595703, "learning_rate": 8.931899641577061e-06, "loss": 0.5484, "step": 1520 }, { "epoch": 9.01625806451613, "grad_norm": 7.528716087341309, "learning_rate": 8.917562724014338e-06, "loss": 0.5113, "step": 1530 }, { "epoch": 9.017548387096774, "grad_norm": 4.6436052322387695, "learning_rate": 8.903225806451614e-06, "loss": 0.2363, "step": 1540 }, { "epoch": 9.01883870967742, "grad_norm": 12.990522384643555, "learning_rate": 8.888888888888888e-06, "loss": 0.4912, "step": 1550 }, { "epoch": 9.020129032258065, "grad_norm": 8.900224685668945, "learning_rate": 8.874551971326165e-06, "loss": 0.4291, "step": 1560 }, { "epoch": 9.020129032258065, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7594068646430969, "eval_runtime": 11.5243, "eval_samples_per_second": 3.992, "eval_steps_per_second": 0.868, "step": 1560 }, { "epoch": 10.001290322580646, "grad_norm": 14.535553932189941, "learning_rate": 8.86021505376344e-06, "loss": 0.432, "step": 1570 }, { "epoch": 10.00258064516129, "grad_norm": 12.444673538208008, "learning_rate": 8.845878136200717e-06, "loss": 0.4088, "step": 1580 }, { "epoch": 10.003870967741936, "grad_norm": 1.6427336931228638, "learning_rate": 8.831541218637993e-06, "loss": 0.2173, "step": 1590 }, { "epoch": 10.005161290322581, "grad_norm": 24.513877868652344, "learning_rate": 8.81720430107527e-06, "loss": 0.2821, "step": 1600 }, { "epoch": 10.006451612903225, "grad_norm": 8.219161033630371, "learning_rate": 8.802867383512546e-06, "loss": 0.2473, "step": 1610 }, { "epoch": 10.007741935483871, "grad_norm": 12.838802337646484, "learning_rate": 8.788530465949822e-06, "loss": 0.4085, "step": 1620 }, { "epoch": 10.009032258064517, "grad_norm": 26.629636764526367, "learning_rate": 8.774193548387098e-06, "loss": 0.4704, "step": 1630 }, { "epoch": 10.01032258064516, "grad_norm": 19.027122497558594, "learning_rate": 8.759856630824374e-06, "loss": 0.4165, "step": 1640 }, { "epoch": 10.011612903225807, "grad_norm": 7.150402069091797, "learning_rate": 8.74551971326165e-06, "loss": 0.4684, "step": 1650 }, { "epoch": 10.012903225806452, "grad_norm": 12.694477081298828, "learning_rate": 8.731182795698927e-06, "loss": 0.3594, "step": 1660 }, { "epoch": 10.014193548387096, "grad_norm": 8.918449401855469, "learning_rate": 8.716845878136203e-06, "loss": 0.656, "step": 1670 }, { "epoch": 10.015483870967742, "grad_norm": 10.326666831970215, "learning_rate": 8.702508960573477e-06, "loss": 0.4367, "step": 1680 }, { "epoch": 10.016774193548388, "grad_norm": 61.20396423339844, "learning_rate": 8.688172043010754e-06, "loss": 0.5137, "step": 1690 }, { "epoch": 10.018064516129032, "grad_norm": 13.492216110229492, "learning_rate": 8.67383512544803e-06, "loss": 0.3008, "step": 1700 }, { "epoch": 10.019354838709678, "grad_norm": 28.311128616333008, "learning_rate": 8.659498207885306e-06, "loss": 0.2729, "step": 1710 }, { "epoch": 10.020129032258065, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.5485166311264038, "eval_runtime": 11.5007, "eval_samples_per_second": 4.0, "eval_steps_per_second": 0.87, "step": 1716 }, { "epoch": 11.000516129032258, "grad_norm": 1.0564289093017578, "learning_rate": 8.64516129032258e-06, "loss": 0.3413, "step": 1720 }, { "epoch": 11.001806451612904, "grad_norm": 84.91474914550781, "learning_rate": 8.630824372759857e-06, "loss": 0.2871, "step": 1730 }, { "epoch": 11.003096774193548, "grad_norm": 70.9284439086914, "learning_rate": 8.616487455197133e-06, "loss": 0.4256, "step": 1740 }, { "epoch": 11.004387096774193, "grad_norm": 17.388111114501953, "learning_rate": 8.602150537634409e-06, "loss": 0.3764, "step": 1750 }, { "epoch": 11.00567741935484, "grad_norm": 12.29884147644043, "learning_rate": 8.587813620071685e-06, "loss": 0.2914, "step": 1760 }, { "epoch": 11.006967741935483, "grad_norm": 18.944608688354492, "learning_rate": 8.573476702508961e-06, "loss": 0.7564, "step": 1770 }, { "epoch": 11.008258064516129, "grad_norm": 7.012818813323975, "learning_rate": 8.559139784946238e-06, "loss": 0.4211, "step": 1780 }, { "epoch": 11.009548387096775, "grad_norm": 25.96501350402832, "learning_rate": 8.544802867383514e-06, "loss": 0.3174, "step": 1790 }, { "epoch": 11.010838709677419, "grad_norm": 30.386552810668945, "learning_rate": 8.530465949820788e-06, "loss": 0.6142, "step": 1800 }, { "epoch": 11.012129032258064, "grad_norm": 29.219703674316406, "learning_rate": 8.516129032258065e-06, "loss": 0.2829, "step": 1810 }, { "epoch": 11.01341935483871, "grad_norm": 22.719879150390625, "learning_rate": 8.50179211469534e-06, "loss": 0.2425, "step": 1820 }, { "epoch": 11.014709677419354, "grad_norm": 33.46078872680664, "learning_rate": 8.487455197132617e-06, "loss": 0.2741, "step": 1830 }, { "epoch": 11.016, "grad_norm": 2.2795767784118652, "learning_rate": 8.473118279569893e-06, "loss": 0.3184, "step": 1840 }, { "epoch": 11.017290322580646, "grad_norm": 26.029348373413086, "learning_rate": 8.45878136200717e-06, "loss": 0.2652, "step": 1850 }, { "epoch": 11.01858064516129, "grad_norm": 26.775375366210938, "learning_rate": 8.444444444444446e-06, "loss": 0.578, "step": 1860 }, { "epoch": 11.019870967741936, "grad_norm": 56.20985794067383, "learning_rate": 8.430107526881722e-06, "loss": 0.5803, "step": 1870 }, { "epoch": 11.020129032258065, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.7237962484359741, "eval_runtime": 11.7255, "eval_samples_per_second": 3.923, "eval_steps_per_second": 0.853, "step": 1872 }, { "epoch": 12.001032258064516, "grad_norm": 8.385832786560059, "learning_rate": 8.415770609318998e-06, "loss": 0.2432, "step": 1880 }, { "epoch": 12.002322580645162, "grad_norm": 37.530921936035156, "learning_rate": 8.401433691756273e-06, "loss": 0.3082, "step": 1890 }, { "epoch": 12.003612903225806, "grad_norm": 30.538076400756836, "learning_rate": 8.387096774193549e-06, "loss": 0.6239, "step": 1900 }, { "epoch": 12.004903225806451, "grad_norm": 5.471914768218994, "learning_rate": 8.372759856630825e-06, "loss": 0.3223, "step": 1910 }, { "epoch": 12.006193548387097, "grad_norm": 15.895891189575195, "learning_rate": 8.358422939068101e-06, "loss": 0.3024, "step": 1920 }, { "epoch": 12.007483870967741, "grad_norm": 43.042415618896484, "learning_rate": 8.344086021505376e-06, "loss": 0.3618, "step": 1930 }, { "epoch": 12.008774193548387, "grad_norm": 25.53901481628418, "learning_rate": 8.329749103942652e-06, "loss": 0.4449, "step": 1940 }, { "epoch": 12.010064516129033, "grad_norm": 29.640016555786133, "learning_rate": 8.315412186379928e-06, "loss": 0.1822, "step": 1950 }, { "epoch": 12.011354838709677, "grad_norm": 40.57928466796875, "learning_rate": 8.301075268817204e-06, "loss": 0.3976, "step": 1960 }, { "epoch": 12.012645161290322, "grad_norm": 54.96004104614258, "learning_rate": 8.28673835125448e-06, "loss": 0.2721, "step": 1970 }, { "epoch": 12.013935483870968, "grad_norm": 7.112162113189697, "learning_rate": 8.272401433691757e-06, "loss": 0.3598, "step": 1980 }, { "epoch": 12.015225806451612, "grad_norm": 15.882174491882324, "learning_rate": 8.258064516129033e-06, "loss": 0.4285, "step": 1990 }, { "epoch": 12.016516129032258, "grad_norm": 7.165054798126221, "learning_rate": 8.24372759856631e-06, "loss": 0.505, "step": 2000 }, { "epoch": 12.017806451612904, "grad_norm": 14.390588760375977, "learning_rate": 8.229390681003585e-06, "loss": 0.6273, "step": 2010 }, { "epoch": 12.019096774193548, "grad_norm": 18.983789443969727, "learning_rate": 8.215053763440862e-06, "loss": 0.4601, "step": 2020 }, { "epoch": 12.020129032258065, "eval_accuracy": 0.717391304347826, "eval_loss": 0.7227532863616943, "eval_runtime": 11.5641, "eval_samples_per_second": 3.978, "eval_steps_per_second": 0.865, "step": 2028 }, { "epoch": 13.00025806451613, "grad_norm": 30.633363723754883, "learning_rate": 8.200716845878138e-06, "loss": 0.3894, "step": 2030 }, { "epoch": 13.001548387096774, "grad_norm": 22.502687454223633, "learning_rate": 8.186379928315414e-06, "loss": 0.2765, "step": 2040 }, { "epoch": 13.00283870967742, "grad_norm": 14.860006332397461, "learning_rate": 8.172043010752689e-06, "loss": 0.3795, "step": 2050 }, { "epoch": 13.004129032258065, "grad_norm": 1.1069475412368774, "learning_rate": 8.157706093189965e-06, "loss": 0.3691, "step": 2060 }, { "epoch": 13.00541935483871, "grad_norm": 12.334137916564941, "learning_rate": 8.143369175627241e-06, "loss": 0.566, "step": 2070 }, { "epoch": 13.006709677419355, "grad_norm": 25.805068969726562, "learning_rate": 8.129032258064517e-06, "loss": 0.4035, "step": 2080 }, { "epoch": 13.008, "grad_norm": 17.6246337890625, "learning_rate": 8.114695340501793e-06, "loss": 0.5375, "step": 2090 }, { "epoch": 13.009290322580645, "grad_norm": 17.924428939819336, "learning_rate": 8.100358422939068e-06, "loss": 0.2386, "step": 2100 }, { "epoch": 13.01058064516129, "grad_norm": 33.194332122802734, "learning_rate": 8.086021505376344e-06, "loss": 0.2554, "step": 2110 }, { "epoch": 13.011870967741935, "grad_norm": 27.832996368408203, "learning_rate": 8.07168458781362e-06, "loss": 0.4041, "step": 2120 }, { "epoch": 13.01316129032258, "grad_norm": 26.759201049804688, "learning_rate": 8.057347670250897e-06, "loss": 0.4563, "step": 2130 }, { "epoch": 13.014451612903226, "grad_norm": 55.400516510009766, "learning_rate": 8.043010752688173e-06, "loss": 0.375, "step": 2140 }, { "epoch": 13.01574193548387, "grad_norm": 6.199549674987793, "learning_rate": 8.028673835125449e-06, "loss": 0.1555, "step": 2150 }, { "epoch": 13.017032258064516, "grad_norm": 11.90478801727295, "learning_rate": 8.014336917562725e-06, "loss": 0.2158, "step": 2160 }, { "epoch": 13.018322580645162, "grad_norm": 6.3363776206970215, "learning_rate": 8.000000000000001e-06, "loss": 0.1602, "step": 2170 }, { "epoch": 13.019612903225806, "grad_norm": 0.9281737208366394, "learning_rate": 7.985663082437278e-06, "loss": 0.1306, "step": 2180 }, { "epoch": 13.020129032258065, "eval_accuracy": 0.717391304347826, "eval_loss": 0.9496133923530579, "eval_runtime": 12.9184, "eval_samples_per_second": 3.561, "eval_steps_per_second": 0.774, "step": 2184 }, { "epoch": 14.000774193548388, "grad_norm": 4.7668890953063965, "learning_rate": 7.971326164874552e-06, "loss": 0.1779, "step": 2190 }, { "epoch": 14.002064516129032, "grad_norm": 16.246652603149414, "learning_rate": 7.956989247311828e-06, "loss": 0.3018, "step": 2200 }, { "epoch": 14.003354838709678, "grad_norm": 19.79844856262207, "learning_rate": 7.942652329749104e-06, "loss": 0.2864, "step": 2210 }, { "epoch": 14.004645161290323, "grad_norm": 98.33323669433594, "learning_rate": 7.92831541218638e-06, "loss": 0.1684, "step": 2220 }, { "epoch": 14.005935483870967, "grad_norm": 62.57590103149414, "learning_rate": 7.913978494623657e-06, "loss": 0.4146, "step": 2230 }, { "epoch": 14.007225806451613, "grad_norm": 2.014406681060791, "learning_rate": 7.899641577060933e-06, "loss": 0.2172, "step": 2240 }, { "epoch": 14.008516129032259, "grad_norm": 10.904878616333008, "learning_rate": 7.88530465949821e-06, "loss": 0.4201, "step": 2250 }, { "epoch": 14.009806451612903, "grad_norm": 1.069327712059021, "learning_rate": 7.870967741935484e-06, "loss": 0.2062, "step": 2260 }, { "epoch": 14.011096774193549, "grad_norm": 35.006107330322266, "learning_rate": 7.85663082437276e-06, "loss": 0.2215, "step": 2270 }, { "epoch": 14.012387096774194, "grad_norm": 0.30733057856559753, "learning_rate": 7.842293906810036e-06, "loss": 0.2581, "step": 2280 }, { "epoch": 14.013677419354838, "grad_norm": 66.62032318115234, "learning_rate": 7.827956989247312e-06, "loss": 0.5737, "step": 2290 }, { "epoch": 14.014967741935484, "grad_norm": 0.328438401222229, "learning_rate": 7.813620071684589e-06, "loss": 0.4204, "step": 2300 }, { "epoch": 14.01625806451613, "grad_norm": 19.034343719482422, "learning_rate": 7.799283154121865e-06, "loss": 0.4638, "step": 2310 }, { "epoch": 14.017548387096774, "grad_norm": 0.17759037017822266, "learning_rate": 7.78494623655914e-06, "loss": 0.1976, "step": 2320 }, { "epoch": 14.01883870967742, "grad_norm": 73.92333984375, "learning_rate": 7.770609318996416e-06, "loss": 0.4399, "step": 2330 }, { "epoch": 14.020129032258065, "grad_norm": 51.79098892211914, "learning_rate": 7.756272401433692e-06, "loss": 0.4727, "step": 2340 }, { "epoch": 14.020129032258065, "eval_accuracy": 0.717391304347826, "eval_loss": 0.8971098065376282, "eval_runtime": 11.583, "eval_samples_per_second": 3.971, "eval_steps_per_second": 0.863, "step": 2340 }, { "epoch": 15.001290322580646, "grad_norm": 0.22272948920726776, "learning_rate": 7.741935483870968e-06, "loss": 0.4785, "step": 2350 }, { "epoch": 15.00258064516129, "grad_norm": 52.44696807861328, "learning_rate": 7.727598566308244e-06, "loss": 0.5291, "step": 2360 }, { "epoch": 15.003870967741936, "grad_norm": 57.886680603027344, "learning_rate": 7.71326164874552e-06, "loss": 0.3466, "step": 2370 }, { "epoch": 15.005161290322581, "grad_norm": 27.158994674682617, "learning_rate": 7.698924731182797e-06, "loss": 0.4824, "step": 2380 }, { "epoch": 15.006451612903225, "grad_norm": 39.17644500732422, "learning_rate": 7.684587813620073e-06, "loss": 0.3503, "step": 2390 }, { "epoch": 15.007741935483871, "grad_norm": 66.95225524902344, "learning_rate": 7.670250896057349e-06, "loss": 0.4213, "step": 2400 }, { "epoch": 15.009032258064517, "grad_norm": 1.047979712486267, "learning_rate": 7.655913978494625e-06, "loss": 0.2738, "step": 2410 }, { "epoch": 15.01032258064516, "grad_norm": 11.98534870147705, "learning_rate": 7.641577060931901e-06, "loss": 0.4612, "step": 2420 }, { "epoch": 15.011612903225807, "grad_norm": 4.641979694366455, "learning_rate": 7.627240143369177e-06, "loss": 0.3949, "step": 2430 }, { "epoch": 15.012903225806452, "grad_norm": 22.586994171142578, "learning_rate": 7.612903225806451e-06, "loss": 0.2244, "step": 2440 }, { "epoch": 15.014193548387096, "grad_norm": 20.83822250366211, "learning_rate": 7.5985663082437275e-06, "loss": 0.2711, "step": 2450 }, { "epoch": 15.015483870967742, "grad_norm": 0.08025629073381424, "learning_rate": 7.584229390681004e-06, "loss": 0.1635, "step": 2460 }, { "epoch": 15.016774193548388, "grad_norm": 27.178850173950195, "learning_rate": 7.56989247311828e-06, "loss": 0.4588, "step": 2470 }, { "epoch": 15.018064516129032, "grad_norm": 0.07889118045568466, "learning_rate": 7.555555555555556e-06, "loss": 0.4309, "step": 2480 }, { "epoch": 15.019354838709678, "grad_norm": 26.410064697265625, "learning_rate": 7.541218637992832e-06, "loss": 0.4027, "step": 2490 }, { "epoch": 15.020129032258065, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.6290571093559265, "eval_runtime": 11.544, "eval_samples_per_second": 3.985, "eval_steps_per_second": 0.866, "step": 2496 }, { "epoch": 16.00051612903226, "grad_norm": 46.11518096923828, "learning_rate": 7.526881720430108e-06, "loss": 0.1485, "step": 2500 }, { "epoch": 16.001806451612904, "grad_norm": 1.1168911457061768, "learning_rate": 7.512544802867384e-06, "loss": 0.2854, "step": 2510 }, { "epoch": 16.003096774193548, "grad_norm": 67.67325592041016, "learning_rate": 7.49820788530466e-06, "loss": 0.5283, "step": 2520 }, { "epoch": 16.004387096774195, "grad_norm": 18.026124954223633, "learning_rate": 7.483870967741936e-06, "loss": 0.1939, "step": 2530 }, { "epoch": 16.00567741935484, "grad_norm": 7.835986137390137, "learning_rate": 7.4695340501792126e-06, "loss": 0.1039, "step": 2540 }, { "epoch": 16.006967741935483, "grad_norm": 0.15798869729042053, "learning_rate": 7.455197132616489e-06, "loss": 0.3111, "step": 2550 }, { "epoch": 16.00825806451613, "grad_norm": 2.2462079524993896, "learning_rate": 7.440860215053764e-06, "loss": 0.4044, "step": 2560 }, { "epoch": 16.009548387096775, "grad_norm": 5.170799732208252, "learning_rate": 7.4265232974910395e-06, "loss": 0.2037, "step": 2570 }, { "epoch": 16.01083870967742, "grad_norm": 0.9790383577346802, "learning_rate": 7.412186379928316e-06, "loss": 0.4237, "step": 2580 }, { "epoch": 16.012129032258066, "grad_norm": 0.3645631968975067, "learning_rate": 7.397849462365592e-06, "loss": 0.3833, "step": 2590 }, { "epoch": 16.01341935483871, "grad_norm": 13.735404968261719, "learning_rate": 7.383512544802868e-06, "loss": 0.2175, "step": 2600 }, { "epoch": 16.014709677419354, "grad_norm": 46.35683059692383, "learning_rate": 7.3691756272401435e-06, "loss": 0.5188, "step": 2610 }, { "epoch": 16.016, "grad_norm": 1.6273128986358643, "learning_rate": 7.35483870967742e-06, "loss": 0.3996, "step": 2620 }, { "epoch": 16.017290322580646, "grad_norm": 0.33608105778694153, "learning_rate": 7.340501792114696e-06, "loss": 0.3724, "step": 2630 }, { "epoch": 16.01858064516129, "grad_norm": 6.148233890533447, "learning_rate": 7.326164874551972e-06, "loss": 0.3918, "step": 2640 }, { "epoch": 16.019870967741934, "grad_norm": 49.421302795410156, "learning_rate": 7.311827956989248e-06, "loss": 0.3149, "step": 2650 }, { "epoch": 16.020129032258065, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.8639056086540222, "eval_runtime": 11.6124, "eval_samples_per_second": 3.961, "eval_steps_per_second": 0.861, "step": 2652 }, { "epoch": 17.001032258064516, "grad_norm": 4.524965286254883, "learning_rate": 7.2974910394265245e-06, "loss": 0.2253, "step": 2660 }, { "epoch": 17.00232258064516, "grad_norm": 0.06006554886698723, "learning_rate": 7.2831541218638e-06, "loss": 0.1516, "step": 2670 }, { "epoch": 17.003612903225807, "grad_norm": 0.19946829974651337, "learning_rate": 7.268817204301076e-06, "loss": 0.1861, "step": 2680 }, { "epoch": 17.00490322580645, "grad_norm": 0.011921118944883347, "learning_rate": 7.254480286738352e-06, "loss": 0.1355, "step": 2690 }, { "epoch": 17.006193548387095, "grad_norm": 123.93885040283203, "learning_rate": 7.240143369175628e-06, "loss": 0.5439, "step": 2700 }, { "epoch": 17.007483870967743, "grad_norm": 19.03252410888672, "learning_rate": 7.225806451612903e-06, "loss": 0.1261, "step": 2710 }, { "epoch": 17.008774193548387, "grad_norm": 0.39719194173812866, "learning_rate": 7.211469534050179e-06, "loss": 0.2275, "step": 2720 }, { "epoch": 17.01006451612903, "grad_norm": 6.024928092956543, "learning_rate": 7.1971326164874554e-06, "loss": 0.3176, "step": 2730 }, { "epoch": 17.01135483870968, "grad_norm": 14.307723999023438, "learning_rate": 7.182795698924732e-06, "loss": 0.4357, "step": 2740 }, { "epoch": 17.012645161290322, "grad_norm": 36.195838928222656, "learning_rate": 7.168458781362008e-06, "loss": 0.2396, "step": 2750 }, { "epoch": 17.013935483870966, "grad_norm": 2.867856979370117, "learning_rate": 7.154121863799284e-06, "loss": 0.332, "step": 2760 }, { "epoch": 17.015225806451614, "grad_norm": 0.9910167455673218, "learning_rate": 7.139784946236559e-06, "loss": 0.3526, "step": 2770 }, { "epoch": 17.016516129032258, "grad_norm": 28.9378662109375, "learning_rate": 7.125448028673836e-06, "loss": 0.1507, "step": 2780 }, { "epoch": 17.017806451612902, "grad_norm": 8.577140808105469, "learning_rate": 7.111111111111112e-06, "loss": 0.2594, "step": 2790 }, { "epoch": 17.01909677419355, "grad_norm": 45.68972396850586, "learning_rate": 7.096774193548388e-06, "loss": 0.1737, "step": 2800 }, { "epoch": 17.020129032258065, "eval_accuracy": 0.6956521739130435, "eval_loss": 1.0473228693008423, "eval_runtime": 11.8823, "eval_samples_per_second": 3.871, "eval_steps_per_second": 0.842, "step": 2808 }, { "epoch": 18.000258064516128, "grad_norm": 0.08299116790294647, "learning_rate": 7.082437275985664e-06, "loss": 0.2546, "step": 2810 }, { "epoch": 18.001548387096776, "grad_norm": 0.03872419893741608, "learning_rate": 7.0681003584229404e-06, "loss": 0.0542, "step": 2820 }, { "epoch": 18.00283870967742, "grad_norm": 0.10180419683456421, "learning_rate": 7.053763440860215e-06, "loss": 0.2784, "step": 2830 }, { "epoch": 18.004129032258064, "grad_norm": 1.5735174417495728, "learning_rate": 7.039426523297491e-06, "loss": 0.2167, "step": 2840 }, { "epoch": 18.00541935483871, "grad_norm": 0.6496406197547913, "learning_rate": 7.025089605734767e-06, "loss": 0.186, "step": 2850 }, { "epoch": 18.006709677419355, "grad_norm": 46.05006790161133, "learning_rate": 7.010752688172044e-06, "loss": 0.2636, "step": 2860 }, { "epoch": 18.008, "grad_norm": 68.08049011230469, "learning_rate": 6.99641577060932e-06, "loss": 0.3521, "step": 2870 }, { "epoch": 18.009290322580647, "grad_norm": 0.24645358324050903, "learning_rate": 6.982078853046595e-06, "loss": 0.2698, "step": 2880 }, { "epoch": 18.01058064516129, "grad_norm": 12.3702392578125, "learning_rate": 6.967741935483871e-06, "loss": 0.3664, "step": 2890 }, { "epoch": 18.011870967741935, "grad_norm": 3.917325019836426, "learning_rate": 6.9534050179211476e-06, "loss": 0.3934, "step": 2900 }, { "epoch": 18.013161290322582, "grad_norm": 72.05944061279297, "learning_rate": 6.939068100358424e-06, "loss": 0.5198, "step": 2910 }, { "epoch": 18.014451612903226, "grad_norm": 47.64091873168945, "learning_rate": 6.9247311827957e-06, "loss": 0.4427, "step": 2920 }, { "epoch": 18.01574193548387, "grad_norm": 0.24409973621368408, "learning_rate": 6.910394265232976e-06, "loss": 0.316, "step": 2930 }, { "epoch": 18.017032258064518, "grad_norm": 7.296486854553223, "learning_rate": 6.8960573476702516e-06, "loss": 0.3637, "step": 2940 }, { "epoch": 18.01832258064516, "grad_norm": 40.13751220703125, "learning_rate": 6.881720430107528e-06, "loss": 0.4517, "step": 2950 }, { "epoch": 18.019612903225806, "grad_norm": 0.6599991917610168, "learning_rate": 6.867383512544803e-06, "loss": 0.2368, "step": 2960 }, { "epoch": 18.020129032258065, "eval_accuracy": 0.717391304347826, "eval_loss": 0.8657800555229187, "eval_runtime": 13.041, "eval_samples_per_second": 3.527, "eval_steps_per_second": 0.767, "step": 2964 }, { "epoch": 19.000774193548388, "grad_norm": 0.258963018655777, "learning_rate": 6.853046594982079e-06, "loss": 0.1621, "step": 2970 }, { "epoch": 19.002064516129032, "grad_norm": 0.06587108969688416, "learning_rate": 6.838709677419355e-06, "loss": 0.3614, "step": 2980 }, { "epoch": 19.003354838709676, "grad_norm": 7.753833770751953, "learning_rate": 6.824372759856631e-06, "loss": 0.2746, "step": 2990 }, { "epoch": 19.004645161290323, "grad_norm": 11.172388076782227, "learning_rate": 6.810035842293907e-06, "loss": 0.1721, "step": 3000 }, { "epoch": 19.005935483870967, "grad_norm": 15.600470542907715, "learning_rate": 6.795698924731183e-06, "loss": 0.1007, "step": 3010 }, { "epoch": 19.00722580645161, "grad_norm": 118.02156829833984, "learning_rate": 6.7813620071684595e-06, "loss": 0.3359, "step": 3020 }, { "epoch": 19.00851612903226, "grad_norm": 23.922033309936523, "learning_rate": 6.767025089605736e-06, "loss": 0.5477, "step": 3030 }, { "epoch": 19.009806451612903, "grad_norm": 0.036624372005462646, "learning_rate": 6.752688172043012e-06, "loss": 0.1127, "step": 3040 }, { "epoch": 19.011096774193547, "grad_norm": 37.53927230834961, "learning_rate": 6.738351254480287e-06, "loss": 0.3983, "step": 3050 }, { "epoch": 19.012387096774194, "grad_norm": 65.55288696289062, "learning_rate": 6.7240143369175635e-06, "loss": 0.3273, "step": 3060 }, { "epoch": 19.01367741935484, "grad_norm": 0.6554312705993652, "learning_rate": 6.70967741935484e-06, "loss": 0.3426, "step": 3070 }, { "epoch": 19.014967741935482, "grad_norm": 46.70892333984375, "learning_rate": 6.695340501792115e-06, "loss": 0.6548, "step": 3080 }, { "epoch": 19.01625806451613, "grad_norm": 0.10974624007940292, "learning_rate": 6.6810035842293904e-06, "loss": 0.3601, "step": 3090 }, { "epoch": 19.017548387096774, "grad_norm": 47.921546936035156, "learning_rate": 6.666666666666667e-06, "loss": 0.3866, "step": 3100 }, { "epoch": 19.018838709677418, "grad_norm": 0.6024359464645386, "learning_rate": 6.652329749103943e-06, "loss": 0.1906, "step": 3110 }, { "epoch": 19.020129032258065, "grad_norm": 0.09655216336250305, "learning_rate": 6.637992831541219e-06, "loss": 0.1155, "step": 3120 }, { "epoch": 19.020129032258065, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.7655417323112488, "eval_runtime": 12.8895, "eval_samples_per_second": 3.569, "eval_steps_per_second": 0.776, "step": 3120 }, { "epoch": 20.001290322580644, "grad_norm": 0.12409224361181259, "learning_rate": 6.623655913978495e-06, "loss": 0.0661, "step": 3130 }, { "epoch": 20.00258064516129, "grad_norm": 17.999996185302734, "learning_rate": 6.6093189964157715e-06, "loss": 0.251, "step": 3140 }, { "epoch": 20.003870967741936, "grad_norm": 0.32539618015289307, "learning_rate": 6.594982078853047e-06, "loss": 0.2564, "step": 3150 }, { "epoch": 20.00516129032258, "grad_norm": 0.3205997347831726, "learning_rate": 6.580645161290323e-06, "loss": 0.4519, "step": 3160 }, { "epoch": 20.006451612903227, "grad_norm": 59.489017486572266, "learning_rate": 6.566308243727599e-06, "loss": 0.2653, "step": 3170 }, { "epoch": 20.00774193548387, "grad_norm": 126.20308685302734, "learning_rate": 6.5519713261648755e-06, "loss": 0.5098, "step": 3180 }, { "epoch": 20.009032258064515, "grad_norm": 0.6583748459815979, "learning_rate": 6.537634408602152e-06, "loss": 0.308, "step": 3190 }, { "epoch": 20.010322580645163, "grad_norm": 21.273740768432617, "learning_rate": 6.523297491039428e-06, "loss": 0.6646, "step": 3200 }, { "epoch": 20.011612903225807, "grad_norm": 12.020598411560059, "learning_rate": 6.508960573476702e-06, "loss": 0.2557, "step": 3210 }, { "epoch": 20.01290322580645, "grad_norm": 5.178054332733154, "learning_rate": 6.494623655913979e-06, "loss": 0.0384, "step": 3220 }, { "epoch": 20.014193548387098, "grad_norm": 2.1413395404815674, "learning_rate": 6.480286738351255e-06, "loss": 0.4482, "step": 3230 }, { "epoch": 20.015483870967742, "grad_norm": 7.210319519042969, "learning_rate": 6.465949820788531e-06, "loss": 0.3, "step": 3240 }, { "epoch": 20.016774193548386, "grad_norm": 34.87846374511719, "learning_rate": 6.451612903225806e-06, "loss": 0.3668, "step": 3250 }, { "epoch": 20.018064516129034, "grad_norm": 52.603023529052734, "learning_rate": 6.437275985663083e-06, "loss": 0.2179, "step": 3260 }, { "epoch": 20.019354838709678, "grad_norm": 47.16685104370117, "learning_rate": 6.422939068100359e-06, "loss": 0.156, "step": 3270 }, { "epoch": 20.020129032258065, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7960483431816101, "eval_runtime": 12.9043, "eval_samples_per_second": 3.565, "eval_steps_per_second": 0.775, "step": 3276 }, { "epoch": 21.00051612903226, "grad_norm": 0.21653440594673157, "learning_rate": 6.408602150537635e-06, "loss": 0.276, "step": 3280 }, { "epoch": 21.001806451612904, "grad_norm": 94.91622924804688, "learning_rate": 6.394265232974911e-06, "loss": 0.2205, "step": 3290 }, { "epoch": 21.003096774193548, "grad_norm": 0.20536521077156067, "learning_rate": 6.379928315412187e-06, "loss": 0.1726, "step": 3300 }, { "epoch": 21.004387096774195, "grad_norm": 0.0463717095553875, "learning_rate": 6.365591397849464e-06, "loss": 0.3419, "step": 3310 }, { "epoch": 21.00567741935484, "grad_norm": 26.09324073791504, "learning_rate": 6.351254480286739e-06, "loss": 0.2172, "step": 3320 }, { "epoch": 21.006967741935483, "grad_norm": 0.1585940718650818, "learning_rate": 6.336917562724015e-06, "loss": 0.2854, "step": 3330 }, { "epoch": 21.00825806451613, "grad_norm": 72.09548950195312, "learning_rate": 6.3225806451612906e-06, "loss": 0.1566, "step": 3340 }, { "epoch": 21.009548387096775, "grad_norm": 53.04383850097656, "learning_rate": 6.308243727598567e-06, "loss": 0.22, "step": 3350 }, { "epoch": 21.01083870967742, "grad_norm": 0.2047654092311859, "learning_rate": 6.293906810035842e-06, "loss": 0.1901, "step": 3360 }, { "epoch": 21.012129032258066, "grad_norm": 92.4298324584961, "learning_rate": 6.279569892473118e-06, "loss": 0.1818, "step": 3370 }, { "epoch": 21.01341935483871, "grad_norm": 0.09610098600387573, "learning_rate": 6.2652329749103945e-06, "loss": 0.1439, "step": 3380 }, { "epoch": 21.014709677419354, "grad_norm": 0.6577791571617126, "learning_rate": 6.250896057347671e-06, "loss": 0.5037, "step": 3390 }, { "epoch": 21.016, "grad_norm": 51.65420913696289, "learning_rate": 6.236559139784947e-06, "loss": 0.2934, "step": 3400 }, { "epoch": 21.017290322580646, "grad_norm": 2.744246244430542, "learning_rate": 6.222222222222223e-06, "loss": 0.1764, "step": 3410 }, { "epoch": 21.01858064516129, "grad_norm": 115.11161041259766, "learning_rate": 6.2078853046594985e-06, "loss": 0.4345, "step": 3420 }, { "epoch": 21.019870967741934, "grad_norm": 77.17694091796875, "learning_rate": 6.193548387096775e-06, "loss": 0.2685, "step": 3430 }, { "epoch": 21.020129032258065, "eval_accuracy": 0.8260869565217391, "eval_loss": 0.8259710073471069, "eval_runtime": 12.1557, "eval_samples_per_second": 3.784, "eval_steps_per_second": 0.823, "step": 3432 }, { "epoch": 22.001032258064516, "grad_norm": 0.2587222158908844, "learning_rate": 6.179211469534051e-06, "loss": 0.1121, "step": 3440 }, { "epoch": 22.00232258064516, "grad_norm": 0.03930482640862465, "learning_rate": 6.164874551971327e-06, "loss": 0.2412, "step": 3450 }, { "epoch": 22.003612903225807, "grad_norm": 0.11786483973264694, "learning_rate": 6.150537634408603e-06, "loss": 0.1179, "step": 3460 }, { "epoch": 22.00490322580645, "grad_norm": 0.3767687976360321, "learning_rate": 6.136200716845878e-06, "loss": 0.5422, "step": 3470 }, { "epoch": 22.006193548387095, "grad_norm": 88.04617309570312, "learning_rate": 6.121863799283154e-06, "loss": 0.4517, "step": 3480 }, { "epoch": 22.007483870967743, "grad_norm": 0.580293595790863, "learning_rate": 6.10752688172043e-06, "loss": 0.3507, "step": 3490 }, { "epoch": 22.008774193548387, "grad_norm": 26.64361572265625, "learning_rate": 6.0931899641577065e-06, "loss": 0.282, "step": 3500 }, { "epoch": 22.01006451612903, "grad_norm": 0.7853363156318665, "learning_rate": 6.078853046594983e-06, "loss": 0.1827, "step": 3510 }, { "epoch": 22.01135483870968, "grad_norm": 0.024075984954833984, "learning_rate": 6.064516129032259e-06, "loss": 0.3153, "step": 3520 }, { "epoch": 22.012645161290322, "grad_norm": 86.04522705078125, "learning_rate": 6.050179211469534e-06, "loss": 0.3924, "step": 3530 }, { "epoch": 22.013935483870966, "grad_norm": 0.09735769778490067, "learning_rate": 6.0358422939068105e-06, "loss": 0.2222, "step": 3540 }, { "epoch": 22.015225806451614, "grad_norm": 2.366472005844116, "learning_rate": 6.021505376344087e-06, "loss": 0.1644, "step": 3550 }, { "epoch": 22.016516129032258, "grad_norm": 1.0237826108932495, "learning_rate": 6.007168458781363e-06, "loss": 0.0778, "step": 3560 }, { "epoch": 22.017806451612902, "grad_norm": 30.40985870361328, "learning_rate": 5.992831541218639e-06, "loss": 0.2183, "step": 3570 }, { "epoch": 22.01909677419355, "grad_norm": 28.814910888671875, "learning_rate": 5.978494623655915e-06, "loss": 0.2572, "step": 3580 }, { "epoch": 22.020129032258065, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.8299353122711182, "eval_runtime": 11.5663, "eval_samples_per_second": 3.977, "eval_steps_per_second": 0.865, "step": 3588 }, { "epoch": 23.000258064516128, "grad_norm": 49.0082893371582, "learning_rate": 5.964157706093191e-06, "loss": 0.2307, "step": 3590 }, { "epoch": 23.001548387096776, "grad_norm": 7.303269386291504, "learning_rate": 5.949820788530466e-06, "loss": 0.4955, "step": 3600 }, { "epoch": 23.00283870967742, "grad_norm": 0.36897215247154236, "learning_rate": 5.935483870967742e-06, "loss": 0.0558, "step": 3610 }, { "epoch": 23.004129032258064, "grad_norm": 141.3982391357422, "learning_rate": 5.9211469534050184e-06, "loss": 0.3308, "step": 3620 }, { "epoch": 23.00541935483871, "grad_norm": 141.97036743164062, "learning_rate": 5.906810035842294e-06, "loss": 0.1683, "step": 3630 }, { "epoch": 23.006709677419355, "grad_norm": 76.91606140136719, "learning_rate": 5.89247311827957e-06, "loss": 0.1145, "step": 3640 }, { "epoch": 23.008, "grad_norm": 0.09071186184883118, "learning_rate": 5.878136200716846e-06, "loss": 0.3234, "step": 3650 }, { "epoch": 23.009290322580647, "grad_norm": 0.1430203765630722, "learning_rate": 5.8637992831541224e-06, "loss": 0.0478, "step": 3660 }, { "epoch": 23.01058064516129, "grad_norm": 0.049157168716192245, "learning_rate": 5.849462365591399e-06, "loss": 0.3276, "step": 3670 }, { "epoch": 23.011870967741935, "grad_norm": 110.4677963256836, "learning_rate": 5.835125448028675e-06, "loss": 0.2345, "step": 3680 }, { "epoch": 23.013161290322582, "grad_norm": 0.016318589448928833, "learning_rate": 5.82078853046595e-06, "loss": 0.2085, "step": 3690 }, { "epoch": 23.014451612903226, "grad_norm": 0.13428108394145966, "learning_rate": 5.806451612903226e-06, "loss": 0.056, "step": 3700 }, { "epoch": 23.01574193548387, "grad_norm": 85.2003173828125, "learning_rate": 5.792114695340503e-06, "loss": 0.1992, "step": 3710 }, { "epoch": 23.017032258064518, "grad_norm": 1.063201665878296, "learning_rate": 5.777777777777778e-06, "loss": 0.2594, "step": 3720 }, { "epoch": 23.01832258064516, "grad_norm": 0.17249171435832977, "learning_rate": 5.763440860215054e-06, "loss": 0.0525, "step": 3730 }, { "epoch": 23.019612903225806, "grad_norm": 0.344163715839386, "learning_rate": 5.7491039426523296e-06, "loss": 0.3788, "step": 3740 }, { "epoch": 23.020129032258065, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.8373197317123413, "eval_runtime": 12.8345, "eval_samples_per_second": 3.584, "eval_steps_per_second": 0.779, "step": 3744 }, { "epoch": 24.000774193548388, "grad_norm": 0.026729216799139977, "learning_rate": 5.734767025089606e-06, "loss": 0.1654, "step": 3750 }, { "epoch": 24.002064516129032, "grad_norm": 0.772559642791748, "learning_rate": 5.720430107526882e-06, "loss": 0.2895, "step": 3760 }, { "epoch": 24.003354838709676, "grad_norm": 0.08740129321813583, "learning_rate": 5.706093189964158e-06, "loss": 0.1543, "step": 3770 }, { "epoch": 24.004645161290323, "grad_norm": 51.019981384277344, "learning_rate": 5.691756272401434e-06, "loss": 0.2222, "step": 3780 }, { "epoch": 24.005935483870967, "grad_norm": 0.029490599408745766, "learning_rate": 5.677419354838711e-06, "loss": 0.4127, "step": 3790 }, { "epoch": 24.00722580645161, "grad_norm": 0.36594241857528687, "learning_rate": 5.663082437275986e-06, "loss": 0.0088, "step": 3800 }, { "epoch": 24.00851612903226, "grad_norm": 0.002953264629468322, "learning_rate": 5.648745519713262e-06, "loss": 0.1196, "step": 3810 }, { "epoch": 24.009806451612903, "grad_norm": 0.053151555359363556, "learning_rate": 5.634408602150538e-06, "loss": 0.1123, "step": 3820 }, { "epoch": 24.011096774193547, "grad_norm": 0.01194888073951006, "learning_rate": 5.620071684587815e-06, "loss": 0.6074, "step": 3830 }, { "epoch": 24.012387096774194, "grad_norm": 0.2657994329929352, "learning_rate": 5.605734767025091e-06, "loss": 0.4327, "step": 3840 }, { "epoch": 24.01367741935484, "grad_norm": 0.8833375573158264, "learning_rate": 5.591397849462365e-06, "loss": 0.1067, "step": 3850 }, { "epoch": 24.014967741935482, "grad_norm": 0.7625280022621155, "learning_rate": 5.5770609318996415e-06, "loss": 0.3043, "step": 3860 }, { "epoch": 24.01625806451613, "grad_norm": 1.092976689338684, "learning_rate": 5.562724014336918e-06, "loss": 0.1906, "step": 3870 }, { "epoch": 24.017548387096774, "grad_norm": 0.20581433176994324, "learning_rate": 5.548387096774194e-06, "loss": 0.1236, "step": 3880 }, { "epoch": 24.018838709677418, "grad_norm": 144.0025634765625, "learning_rate": 5.53405017921147e-06, "loss": 0.1985, "step": 3890 }, { "epoch": 24.020129032258065, "grad_norm": 0.20440329611301422, "learning_rate": 5.5197132616487455e-06, "loss": 0.3816, "step": 3900 }, { "epoch": 24.020129032258065, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.9689333438873291, "eval_runtime": 11.5582, "eval_samples_per_second": 3.98, "eval_steps_per_second": 0.865, "step": 3900 }, { "epoch": 25.001290322580644, "grad_norm": 26.94411849975586, "learning_rate": 5.505376344086022e-06, "loss": 0.4264, "step": 3910 }, { "epoch": 25.00258064516129, "grad_norm": 98.3396224975586, "learning_rate": 5.491039426523298e-06, "loss": 0.3294, "step": 3920 }, { "epoch": 25.003870967741936, "grad_norm": 1.056504726409912, "learning_rate": 5.476702508960574e-06, "loss": 0.3583, "step": 3930 }, { "epoch": 25.00516129032258, "grad_norm": 0.3273158669471741, "learning_rate": 5.46236559139785e-06, "loss": 0.1745, "step": 3940 }, { "epoch": 25.006451612903227, "grad_norm": 81.80546569824219, "learning_rate": 5.4480286738351265e-06, "loss": 0.2122, "step": 3950 }, { "epoch": 25.00774193548387, "grad_norm": 0.7988190650939941, "learning_rate": 5.433691756272402e-06, "loss": 0.1459, "step": 3960 }, { "epoch": 25.009032258064515, "grad_norm": 142.90243530273438, "learning_rate": 5.419354838709678e-06, "loss": 0.205, "step": 3970 }, { "epoch": 25.010322580645163, "grad_norm": 0.11823119223117828, "learning_rate": 5.4050179211469535e-06, "loss": 0.3307, "step": 3980 }, { "epoch": 25.011612903225807, "grad_norm": 0.061636436730623245, "learning_rate": 5.39068100358423e-06, "loss": 0.1693, "step": 3990 }, { "epoch": 25.01290322580645, "grad_norm": 0.05063611641526222, "learning_rate": 5.376344086021506e-06, "loss": 0.2545, "step": 4000 }, { "epoch": 25.014193548387098, "grad_norm": 0.05370887741446495, "learning_rate": 5.362007168458781e-06, "loss": 0.0039, "step": 4010 }, { "epoch": 25.015483870967742, "grad_norm": 74.40390014648438, "learning_rate": 5.3476702508960574e-06, "loss": 0.1385, "step": 4020 }, { "epoch": 25.016774193548386, "grad_norm": 142.03109741210938, "learning_rate": 5.333333333333334e-06, "loss": 0.1266, "step": 4030 }, { "epoch": 25.018064516129034, "grad_norm": 104.37118530273438, "learning_rate": 5.31899641577061e-06, "loss": 0.0545, "step": 4040 }, { "epoch": 25.019354838709678, "grad_norm": 68.47547149658203, "learning_rate": 5.304659498207886e-06, "loss": 0.4579, "step": 4050 }, { "epoch": 25.020129032258065, "eval_accuracy": 0.6739130434782609, "eval_loss": 1.2805536985397339, "eval_runtime": 12.161, "eval_samples_per_second": 3.783, "eval_steps_per_second": 0.822, "step": 4056 }, { "epoch": 26.00051612903226, "grad_norm": 40.76604461669922, "learning_rate": 5.290322580645162e-06, "loss": 0.1336, "step": 4060 }, { "epoch": 26.001806451612904, "grad_norm": 78.58018493652344, "learning_rate": 5.275985663082438e-06, "loss": 0.2986, "step": 4070 }, { "epoch": 26.003096774193548, "grad_norm": 14.429988861083984, "learning_rate": 5.261648745519714e-06, "loss": 0.1484, "step": 4080 }, { "epoch": 26.004387096774195, "grad_norm": 24.105003356933594, "learning_rate": 5.24731182795699e-06, "loss": 0.2044, "step": 4090 }, { "epoch": 26.00567741935484, "grad_norm": 8.945204734802246, "learning_rate": 5.232974910394266e-06, "loss": 0.3542, "step": 4100 }, { "epoch": 26.006967741935483, "grad_norm": 0.05342581868171692, "learning_rate": 5.218637992831541e-06, "loss": 0.1549, "step": 4110 }, { "epoch": 26.00825806451613, "grad_norm": 0.19823309779167175, "learning_rate": 5.204301075268817e-06, "loss": 0.2721, "step": 4120 }, { "epoch": 26.009548387096775, "grad_norm": 107.35823822021484, "learning_rate": 5.189964157706093e-06, "loss": 0.1426, "step": 4130 }, { "epoch": 26.01083870967742, "grad_norm": 59.54574203491211, "learning_rate": 5.175627240143369e-06, "loss": 0.237, "step": 4140 }, { "epoch": 26.012129032258066, "grad_norm": 0.09815605729818344, "learning_rate": 5.161290322580646e-06, "loss": 0.1383, "step": 4150 }, { "epoch": 26.01341935483871, "grad_norm": 3.2161612510681152, "learning_rate": 5.146953405017922e-06, "loss": 0.0292, "step": 4160 }, { "epoch": 26.014709677419354, "grad_norm": 106.2547607421875, "learning_rate": 5.132616487455197e-06, "loss": 0.0378, "step": 4170 }, { "epoch": 26.016, "grad_norm": 0.504286527633667, "learning_rate": 5.118279569892473e-06, "loss": 0.2752, "step": 4180 }, { "epoch": 26.017290322580646, "grad_norm": 0.22041937708854675, "learning_rate": 5.10394265232975e-06, "loss": 0.1093, "step": 4190 }, { "epoch": 26.01858064516129, "grad_norm": 30.628664016723633, "learning_rate": 5.089605734767026e-06, "loss": 0.3371, "step": 4200 }, { "epoch": 26.019870967741934, "grad_norm": 0.10209270566701889, "learning_rate": 5.075268817204302e-06, "loss": 0.2543, "step": 4210 }, { "epoch": 26.020129032258065, "eval_accuracy": 0.7608695652173914, "eval_loss": 1.2309492826461792, "eval_runtime": 12.2097, "eval_samples_per_second": 3.768, "eval_steps_per_second": 0.819, "step": 4212 }, { "epoch": 27.001032258064516, "grad_norm": 1.90790593624115, "learning_rate": 5.060931899641578e-06, "loss": 0.0385, "step": 4220 }, { "epoch": 27.00232258064516, "grad_norm": 0.0760672315955162, "learning_rate": 5.0465949820788544e-06, "loss": 0.0684, "step": 4230 }, { "epoch": 27.003612903225807, "grad_norm": 0.034514494240283966, "learning_rate": 5.032258064516129e-06, "loss": 0.1567, "step": 4240 }, { "epoch": 27.00490322580645, "grad_norm": 0.020678775385022163, "learning_rate": 5.017921146953405e-06, "loss": 0.1836, "step": 4250 }, { "epoch": 27.006193548387095, "grad_norm": 0.23241804540157318, "learning_rate": 5.003584229390681e-06, "loss": 0.1184, "step": 4260 }, { "epoch": 27.007483870967743, "grad_norm": 0.6777844429016113, "learning_rate": 4.9892473118279576e-06, "loss": 0.1064, "step": 4270 }, { "epoch": 27.008774193548387, "grad_norm": 80.56316375732422, "learning_rate": 4.974910394265233e-06, "loss": 0.3246, "step": 4280 }, { "epoch": 27.01006451612903, "grad_norm": 96.52352142333984, "learning_rate": 4.960573476702509e-06, "loss": 0.1645, "step": 4290 }, { "epoch": 27.01135483870968, "grad_norm": 0.007455157116055489, "learning_rate": 4.946236559139785e-06, "loss": 0.0491, "step": 4300 }, { "epoch": 27.012645161290322, "grad_norm": 6.093803882598877, "learning_rate": 4.9318996415770615e-06, "loss": 0.0624, "step": 4310 }, { "epoch": 27.013935483870966, "grad_norm": 0.043880682438611984, "learning_rate": 4.917562724014338e-06, "loss": 0.1716, "step": 4320 }, { "epoch": 27.015225806451614, "grad_norm": 154.94888305664062, "learning_rate": 4.903225806451613e-06, "loss": 0.1257, "step": 4330 }, { "epoch": 27.016516129032258, "grad_norm": 0.11380237340927124, "learning_rate": 4.888888888888889e-06, "loss": 0.2291, "step": 4340 }, { "epoch": 27.017806451612902, "grad_norm": 0.013241041451692581, "learning_rate": 4.8745519713261655e-06, "loss": 0.2065, "step": 4350 }, { "epoch": 27.01909677419355, "grad_norm": 0.4187248945236206, "learning_rate": 4.860215053763441e-06, "loss": 0.1227, "step": 4360 }, { "epoch": 27.020129032258065, "eval_accuracy": 0.6521739130434783, "eval_loss": 1.2931348085403442, "eval_runtime": 11.5376, "eval_samples_per_second": 3.987, "eval_steps_per_second": 0.867, "step": 4368 }, { "epoch": 28.000258064516128, "grad_norm": 0.11039324849843979, "learning_rate": 4.845878136200717e-06, "loss": 0.2869, "step": 4370 }, { "epoch": 28.001548387096776, "grad_norm": 0.010043107904493809, "learning_rate": 4.831541218637993e-06, "loss": 0.2055, "step": 4380 }, { "epoch": 28.00283870967742, "grad_norm": 0.07319659739732742, "learning_rate": 4.8172043010752695e-06, "loss": 0.1488, "step": 4390 }, { "epoch": 28.004129032258064, "grad_norm": 0.08317713439464569, "learning_rate": 4.802867383512545e-06, "loss": 0.0447, "step": 4400 }, { "epoch": 28.00541935483871, "grad_norm": 73.7962417602539, "learning_rate": 4.788530465949821e-06, "loss": 0.2861, "step": 4410 }, { "epoch": 28.006709677419355, "grad_norm": 0.011232980526983738, "learning_rate": 4.774193548387097e-06, "loss": 0.4564, "step": 4420 }, { "epoch": 28.008, "grad_norm": 9.163039207458496, "learning_rate": 4.7598566308243735e-06, "loss": 0.0174, "step": 4430 }, { "epoch": 28.009290322580647, "grad_norm": 0.023740319535136223, "learning_rate": 4.745519713261649e-06, "loss": 0.2245, "step": 4440 }, { "epoch": 28.01058064516129, "grad_norm": 0.24346721172332764, "learning_rate": 4.731182795698925e-06, "loss": 0.1731, "step": 4450 }, { "epoch": 28.011870967741935, "grad_norm": 3.2129597663879395, "learning_rate": 4.716845878136201e-06, "loss": 0.3248, "step": 4460 }, { "epoch": 28.013161290322582, "grad_norm": 0.4691598117351532, "learning_rate": 4.702508960573477e-06, "loss": 0.0359, "step": 4470 }, { "epoch": 28.014451612903226, "grad_norm": 117.49690246582031, "learning_rate": 4.688172043010753e-06, "loss": 0.5497, "step": 4480 }, { "epoch": 28.01574193548387, "grad_norm": 0.028058160096406937, "learning_rate": 4.673835125448029e-06, "loss": 0.1482, "step": 4490 }, { "epoch": 28.017032258064518, "grad_norm": 0.20224638283252716, "learning_rate": 4.659498207885305e-06, "loss": 0.2575, "step": 4500 }, { "epoch": 28.01832258064516, "grad_norm": 0.008048119954764843, "learning_rate": 4.6451612903225815e-06, "loss": 0.0804, "step": 4510 }, { "epoch": 28.019612903225806, "grad_norm": 0.04067204147577286, "learning_rate": 4.630824372759857e-06, "loss": 0.3303, "step": 4520 }, { "epoch": 28.020129032258065, "eval_accuracy": 0.8043478260869565, "eval_loss": 1.0449941158294678, "eval_runtime": 11.5459, "eval_samples_per_second": 3.984, "eval_steps_per_second": 0.866, "step": 4524 }, { "epoch": 29.000774193548388, "grad_norm": 1.4166280031204224, "learning_rate": 4.616487455197133e-06, "loss": 0.3155, "step": 4530 }, { "epoch": 29.002064516129032, "grad_norm": 57.51814270019531, "learning_rate": 4.602150537634409e-06, "loss": 0.2734, "step": 4540 }, { "epoch": 29.003354838709676, "grad_norm": 0.024605078622698784, "learning_rate": 4.587813620071685e-06, "loss": 0.0513, "step": 4550 }, { "epoch": 29.004645161290323, "grad_norm": 32.84198760986328, "learning_rate": 4.573476702508961e-06, "loss": 0.2478, "step": 4560 }, { "epoch": 29.005935483870967, "grad_norm": 111.84346771240234, "learning_rate": 4.559139784946237e-06, "loss": 0.2213, "step": 4570 }, { "epoch": 29.00722580645161, "grad_norm": 0.019566096365451813, "learning_rate": 4.544802867383513e-06, "loss": 0.0147, "step": 4580 }, { "epoch": 29.00851612903226, "grad_norm": 0.014302892610430717, "learning_rate": 4.530465949820789e-06, "loss": 0.0245, "step": 4590 }, { "epoch": 29.009806451612903, "grad_norm": 0.013128543272614479, "learning_rate": 4.516129032258065e-06, "loss": 0.1292, "step": 4600 }, { "epoch": 29.011096774193547, "grad_norm": 150.27513122558594, "learning_rate": 4.501792114695341e-06, "loss": 0.2414, "step": 4610 }, { "epoch": 29.012387096774194, "grad_norm": 0.7926704287528992, "learning_rate": 4.487455197132617e-06, "loss": 0.0009, "step": 4620 }, { "epoch": 29.01367741935484, "grad_norm": 0.006922030821442604, "learning_rate": 4.473118279569893e-06, "loss": 0.1366, "step": 4630 }, { "epoch": 29.014967741935482, "grad_norm": 101.47315216064453, "learning_rate": 4.458781362007169e-06, "loss": 0.1944, "step": 4640 }, { "epoch": 29.01625806451613, "grad_norm": 0.6442185044288635, "learning_rate": 4.444444444444444e-06, "loss": 0.0411, "step": 4650 }, { "epoch": 29.017548387096774, "grad_norm": 0.03204209730029106, "learning_rate": 4.43010752688172e-06, "loss": 0.0119, "step": 4660 }, { "epoch": 29.018838709677418, "grad_norm": 0.08610539138317108, "learning_rate": 4.4157706093189966e-06, "loss": 0.268, "step": 4670 }, { "epoch": 29.020129032258065, "grad_norm": 0.0920485258102417, "learning_rate": 4.401433691756273e-06, "loss": 0.0808, "step": 4680 }, { "epoch": 29.020129032258065, "eval_accuracy": 0.7391304347826086, "eval_loss": 1.3096072673797607, "eval_runtime": 12.8777, "eval_samples_per_second": 3.572, "eval_steps_per_second": 0.777, "step": 4680 }, { "epoch": 30.001290322580644, "grad_norm": 0.07281237840652466, "learning_rate": 4.387096774193549e-06, "loss": 0.0004, "step": 4690 }, { "epoch": 30.00258064516129, "grad_norm": 0.028409792110323906, "learning_rate": 4.372759856630825e-06, "loss": 0.0556, "step": 4700 }, { "epoch": 30.003870967741936, "grad_norm": 0.006064075976610184, "learning_rate": 4.358422939068101e-06, "loss": 0.0071, "step": 4710 }, { "epoch": 30.00516129032258, "grad_norm": 0.3672466576099396, "learning_rate": 4.344086021505377e-06, "loss": 0.0768, "step": 4720 }, { "epoch": 30.006451612903227, "grad_norm": 0.004668663255870342, "learning_rate": 4.329749103942653e-06, "loss": 0.2411, "step": 4730 }, { "epoch": 30.00774193548387, "grad_norm": 0.057829201221466064, "learning_rate": 4.315412186379928e-06, "loss": 0.3242, "step": 4740 }, { "epoch": 30.009032258064515, "grad_norm": 0.14374171197414398, "learning_rate": 4.3010752688172045e-06, "loss": 0.0972, "step": 4750 }, { "epoch": 30.010322580645163, "grad_norm": 165.0821990966797, "learning_rate": 4.286738351254481e-06, "loss": 0.4375, "step": 4760 }, { "epoch": 30.011612903225807, "grad_norm": 42.48862075805664, "learning_rate": 4.272401433691757e-06, "loss": 0.0142, "step": 4770 }, { "epoch": 30.01290322580645, "grad_norm": 0.008011335507035255, "learning_rate": 4.258064516129032e-06, "loss": 0.0421, "step": 4780 }, { "epoch": 30.014193548387098, "grad_norm": 0.02149633876979351, "learning_rate": 4.2437275985663085e-06, "loss": 0.1221, "step": 4790 }, { "epoch": 30.015483870967742, "grad_norm": 0.5769780874252319, "learning_rate": 4.229390681003585e-06, "loss": 0.1967, "step": 4800 }, { "epoch": 30.016774193548386, "grad_norm": 0.02931826002895832, "learning_rate": 4.215053763440861e-06, "loss": 0.3096, "step": 4810 }, { "epoch": 30.018064516129034, "grad_norm": 0.023532267659902573, "learning_rate": 4.200716845878136e-06, "loss": 0.2504, "step": 4820 }, { "epoch": 30.019354838709678, "grad_norm": 0.07375481724739075, "learning_rate": 4.1863799283154125e-06, "loss": 0.0987, "step": 4830 }, { "epoch": 30.020129032258065, "eval_accuracy": 0.782608695652174, "eval_loss": 1.1348512172698975, "eval_runtime": 12.8459, "eval_samples_per_second": 3.581, "eval_steps_per_second": 0.778, "step": 4836 }, { "epoch": 31.00051612903226, "grad_norm": 0.017024632543325424, "learning_rate": 4.172043010752688e-06, "loss": 0.0383, "step": 4840 }, { "epoch": 31.001806451612904, "grad_norm": 0.036306753754615784, "learning_rate": 4.157706093189964e-06, "loss": 0.0764, "step": 4850 }, { "epoch": 31.003096774193548, "grad_norm": 111.12344360351562, "learning_rate": 4.14336917562724e-06, "loss": 0.3264, "step": 4860 }, { "epoch": 31.004387096774195, "grad_norm": 139.1377716064453, "learning_rate": 4.1290322580645165e-06, "loss": 0.0829, "step": 4870 }, { "epoch": 31.00567741935484, "grad_norm": 1.2580883502960205, "learning_rate": 4.114695340501793e-06, "loss": 0.0411, "step": 4880 }, { "epoch": 31.006967741935483, "grad_norm": 0.020289059728384018, "learning_rate": 4.100358422939069e-06, "loss": 0.1642, "step": 4890 }, { "epoch": 31.00825806451613, "grad_norm": 0.1317121684551239, "learning_rate": 4.086021505376344e-06, "loss": 0.0291, "step": 4900 }, { "epoch": 31.009548387096775, "grad_norm": 0.01632014475762844, "learning_rate": 4.0716845878136205e-06, "loss": 0.0088, "step": 4910 }, { "epoch": 31.01083870967742, "grad_norm": 0.03345351293683052, "learning_rate": 4.057347670250897e-06, "loss": 0.0832, "step": 4920 }, { "epoch": 31.012129032258066, "grad_norm": 0.012935369275510311, "learning_rate": 4.043010752688172e-06, "loss": 0.1361, "step": 4930 }, { "epoch": 31.01341935483871, "grad_norm": 13.744651794433594, "learning_rate": 4.028673835125448e-06, "loss": 0.232, "step": 4940 }, { "epoch": 31.014709677419354, "grad_norm": 0.06759098917245865, "learning_rate": 4.0143369175627245e-06, "loss": 0.1972, "step": 4950 }, { "epoch": 31.016, "grad_norm": 0.29370027780532837, "learning_rate": 4.000000000000001e-06, "loss": 0.0829, "step": 4960 }, { "epoch": 31.017290322580646, "grad_norm": 0.07303759455680847, "learning_rate": 3.985663082437276e-06, "loss": 0.1494, "step": 4970 }, { "epoch": 31.01858064516129, "grad_norm": 0.07196178287267685, "learning_rate": 3.971326164874552e-06, "loss": 0.1362, "step": 4980 }, { "epoch": 31.019870967741934, "grad_norm": 38.06782531738281, "learning_rate": 3.9569892473118284e-06, "loss": 0.0432, "step": 4990 }, { "epoch": 31.020129032258065, "eval_accuracy": 0.7391304347826086, "eval_loss": 1.044263482093811, "eval_runtime": 12.1262, "eval_samples_per_second": 3.793, "eval_steps_per_second": 0.825, "step": 4992 }, { "epoch": 32.00103225806452, "grad_norm": 0.61575847864151, "learning_rate": 3.942652329749105e-06, "loss": 0.132, "step": 5000 }, { "epoch": 32.00232258064516, "grad_norm": 0.01723526231944561, "learning_rate": 3.92831541218638e-06, "loss": 0.1956, "step": 5010 }, { "epoch": 32.00361290322581, "grad_norm": 0.9134840369224548, "learning_rate": 3.913978494623656e-06, "loss": 0.2458, "step": 5020 }, { "epoch": 32.00490322580645, "grad_norm": 0.019222645089030266, "learning_rate": 3.8996415770609324e-06, "loss": 0.2748, "step": 5030 }, { "epoch": 32.006193548387095, "grad_norm": 67.789306640625, "learning_rate": 3.885304659498208e-06, "loss": 0.242, "step": 5040 }, { "epoch": 32.00748387096774, "grad_norm": 0.060604579746723175, "learning_rate": 3.870967741935484e-06, "loss": 0.1056, "step": 5050 }, { "epoch": 32.00877419354839, "grad_norm": 16.191116333007812, "learning_rate": 3.85663082437276e-06, "loss": 0.1223, "step": 5060 }, { "epoch": 32.010064516129034, "grad_norm": 0.06931120157241821, "learning_rate": 3.842293906810036e-06, "loss": 0.194, "step": 5070 }, { "epoch": 32.01135483870968, "grad_norm": 0.07669204473495483, "learning_rate": 3.827956989247313e-06, "loss": 0.0103, "step": 5080 }, { "epoch": 32.01264516129032, "grad_norm": 0.009800991043448448, "learning_rate": 3.8136200716845884e-06, "loss": 0.0018, "step": 5090 }, { "epoch": 32.013935483870966, "grad_norm": 0.07491903007030487, "learning_rate": 3.7992831541218638e-06, "loss": 0.0099, "step": 5100 }, { "epoch": 32.01522580645161, "grad_norm": 0.03506194427609444, "learning_rate": 3.78494623655914e-06, "loss": 0.0014, "step": 5110 }, { "epoch": 32.01651612903226, "grad_norm": 87.74604797363281, "learning_rate": 3.770609318996416e-06, "loss": 0.2446, "step": 5120 }, { "epoch": 32.017806451612906, "grad_norm": 65.79183197021484, "learning_rate": 3.756272401433692e-06, "loss": 0.1657, "step": 5130 }, { "epoch": 32.01909677419355, "grad_norm": 0.07675101608037949, "learning_rate": 3.741935483870968e-06, "loss": 0.0373, "step": 5140 }, { "epoch": 32.02012903225806, "eval_accuracy": 0.6739130434782609, "eval_loss": 1.8531540632247925, "eval_runtime": 12.2063, "eval_samples_per_second": 3.769, "eval_steps_per_second": 0.819, "step": 5148 }, { "epoch": 33.00025806451613, "grad_norm": 0.047407910227775574, "learning_rate": 3.7275985663082444e-06, "loss": 0.0022, "step": 5150 }, { "epoch": 33.001548387096776, "grad_norm": 0.5225340723991394, "learning_rate": 3.7132616487455197e-06, "loss": 0.1555, "step": 5160 }, { "epoch": 33.00283870967742, "grad_norm": 0.061823975294828415, "learning_rate": 3.698924731182796e-06, "loss": 0.0344, "step": 5170 }, { "epoch": 33.004129032258064, "grad_norm": 67.81706237792969, "learning_rate": 3.6845878136200717e-06, "loss": 0.1243, "step": 5180 }, { "epoch": 33.00541935483871, "grad_norm": 0.7582953572273254, "learning_rate": 3.670250896057348e-06, "loss": 0.102, "step": 5190 }, { "epoch": 33.00670967741935, "grad_norm": 0.00984943751245737, "learning_rate": 3.655913978494624e-06, "loss": 0.1058, "step": 5200 }, { "epoch": 33.008, "grad_norm": 0.02095373533666134, "learning_rate": 3.6415770609319e-06, "loss": 0.3667, "step": 5210 }, { "epoch": 33.00929032258065, "grad_norm": 0.017365047708153725, "learning_rate": 3.627240143369176e-06, "loss": 0.1456, "step": 5220 }, { "epoch": 33.01058064516129, "grad_norm": 0.024257881566882133, "learning_rate": 3.6129032258064515e-06, "loss": 0.1947, "step": 5230 }, { "epoch": 33.011870967741935, "grad_norm": 0.009880566969513893, "learning_rate": 3.5985663082437277e-06, "loss": 0.242, "step": 5240 }, { "epoch": 33.01316129032258, "grad_norm": 50.43818664550781, "learning_rate": 3.584229390681004e-06, "loss": 0.3662, "step": 5250 }, { "epoch": 33.01445161290322, "grad_norm": 0.20178300142288208, "learning_rate": 3.5698924731182797e-06, "loss": 0.2486, "step": 5260 }, { "epoch": 33.015741935483874, "grad_norm": 0.002111848210915923, "learning_rate": 3.555555555555556e-06, "loss": 0.002, "step": 5270 }, { "epoch": 33.01703225806452, "grad_norm": 0.05945451185107231, "learning_rate": 3.541218637992832e-06, "loss": 0.0009, "step": 5280 }, { "epoch": 33.01832258064516, "grad_norm": 0.003861631266772747, "learning_rate": 3.5268817204301075e-06, "loss": 0.005, "step": 5290 }, { "epoch": 33.019612903225806, "grad_norm": 0.021970974281430244, "learning_rate": 3.5125448028673837e-06, "loss": 0.1619, "step": 5300 }, { "epoch": 33.02012903225806, "eval_accuracy": 0.7391304347826086, "eval_loss": 1.048993706703186, "eval_runtime": 13.2378, "eval_samples_per_second": 3.475, "eval_steps_per_second": 0.755, "step": 5304 }, { "epoch": 34.00077419354839, "grad_norm": 0.31462323665618896, "learning_rate": 3.49820788530466e-06, "loss": 0.0017, "step": 5310 }, { "epoch": 34.00206451612903, "grad_norm": 0.018684007227420807, "learning_rate": 3.4838709677419357e-06, "loss": 0.2576, "step": 5320 }, { "epoch": 34.003354838709676, "grad_norm": 0.010775264352560043, "learning_rate": 3.469534050179212e-06, "loss": 0.0006, "step": 5330 }, { "epoch": 34.00464516129032, "grad_norm": 0.04289526119828224, "learning_rate": 3.455197132616488e-06, "loss": 0.2436, "step": 5340 }, { "epoch": 34.00593548387097, "grad_norm": 0.02890205755829811, "learning_rate": 3.440860215053764e-06, "loss": 0.1125, "step": 5350 }, { "epoch": 34.007225806451615, "grad_norm": 104.6270751953125, "learning_rate": 3.4265232974910397e-06, "loss": 0.2731, "step": 5360 }, { "epoch": 34.00851612903226, "grad_norm": 4.66964864730835, "learning_rate": 3.4121863799283155e-06, "loss": 0.1204, "step": 5370 }, { "epoch": 34.0098064516129, "grad_norm": 0.04278410226106644, "learning_rate": 3.3978494623655917e-06, "loss": 0.0011, "step": 5380 }, { "epoch": 34.01109677419355, "grad_norm": 0.012167668901383877, "learning_rate": 3.383512544802868e-06, "loss": 0.1122, "step": 5390 }, { "epoch": 34.01238709677419, "grad_norm": 0.023635732010006905, "learning_rate": 3.3691756272401437e-06, "loss": 0.1885, "step": 5400 }, { "epoch": 34.01367741935484, "grad_norm": 0.023513080552220345, "learning_rate": 3.35483870967742e-06, "loss": 0.0213, "step": 5410 }, { "epoch": 34.014967741935486, "grad_norm": 0.17787271738052368, "learning_rate": 3.3405017921146952e-06, "loss": 0.0103, "step": 5420 }, { "epoch": 34.01625806451613, "grad_norm": 0.028996452689170837, "learning_rate": 3.3261648745519714e-06, "loss": 0.0069, "step": 5430 }, { "epoch": 34.017548387096774, "grad_norm": 0.10073871165513992, "learning_rate": 3.3118279569892476e-06, "loss": 0.0008, "step": 5440 }, { "epoch": 34.01883870967742, "grad_norm": 0.04820786789059639, "learning_rate": 3.2974910394265234e-06, "loss": 0.0544, "step": 5450 }, { "epoch": 34.02012903225806, "grad_norm": 0.030315034091472626, "learning_rate": 3.2831541218637996e-06, "loss": 0.096, "step": 5460 }, { "epoch": 34.02012903225806, "eval_accuracy": 0.782608695652174, "eval_loss": 1.2420079708099365, "eval_runtime": 12.1209, "eval_samples_per_second": 3.795, "eval_steps_per_second": 0.825, "step": 5460 }, { "epoch": 35.001290322580644, "grad_norm": 0.08960109204053879, "learning_rate": 3.268817204301076e-06, "loss": 0.2159, "step": 5470 }, { "epoch": 35.00258064516129, "grad_norm": 0.021017009392380714, "learning_rate": 3.254480286738351e-06, "loss": 0.1267, "step": 5480 }, { "epoch": 35.00387096774193, "grad_norm": 0.059589825570583344, "learning_rate": 3.2401433691756274e-06, "loss": 0.3431, "step": 5490 }, { "epoch": 35.00516129032258, "grad_norm": 0.005972574464976788, "learning_rate": 3.225806451612903e-06, "loss": 0.0854, "step": 5500 }, { "epoch": 35.00645161290323, "grad_norm": 0.008477037772536278, "learning_rate": 3.2114695340501794e-06, "loss": 0.0089, "step": 5510 }, { "epoch": 35.00774193548387, "grad_norm": 0.0019872374832630157, "learning_rate": 3.1971326164874556e-06, "loss": 0.142, "step": 5520 }, { "epoch": 35.009032258064515, "grad_norm": 0.07476571202278137, "learning_rate": 3.182795698924732e-06, "loss": 0.0184, "step": 5530 }, { "epoch": 35.01032258064516, "grad_norm": 0.004031882155686617, "learning_rate": 3.1684587813620076e-06, "loss": 0.0016, "step": 5540 }, { "epoch": 35.0116129032258, "grad_norm": 1.8121750354766846, "learning_rate": 3.1541218637992834e-06, "loss": 0.0074, "step": 5550 }, { "epoch": 35.012903225806454, "grad_norm": 0.6697885990142822, "learning_rate": 3.139784946236559e-06, "loss": 0.3569, "step": 5560 }, { "epoch": 35.0141935483871, "grad_norm": 0.040170762687921524, "learning_rate": 3.1254480286738354e-06, "loss": 0.0671, "step": 5570 }, { "epoch": 35.01548387096774, "grad_norm": 0.015748370438814163, "learning_rate": 3.1111111111111116e-06, "loss": 0.2443, "step": 5580 }, { "epoch": 35.016774193548386, "grad_norm": 0.9828428626060486, "learning_rate": 3.0967741935483874e-06, "loss": 0.012, "step": 5590 }, { "epoch": 35.01806451612903, "grad_norm": 0.06932424753904343, "learning_rate": 3.0824372759856636e-06, "loss": 0.1389, "step": 5600 }, { "epoch": 35.019354838709674, "grad_norm": 0.13489635288715363, "learning_rate": 3.068100358422939e-06, "loss": 0.0112, "step": 5610 }, { "epoch": 35.02012903225806, "eval_accuracy": 0.7608695652173914, "eval_loss": 1.4820091724395752, "eval_runtime": 12.8164, "eval_samples_per_second": 3.589, "eval_steps_per_second": 0.78, "step": 5616 }, { "epoch": 36.000516129032256, "grad_norm": 0.3131411373615265, "learning_rate": 3.053763440860215e-06, "loss": 0.0005, "step": 5620 }, { "epoch": 36.0018064516129, "grad_norm": 0.014214432798326015, "learning_rate": 3.0394265232974914e-06, "loss": 0.1268, "step": 5630 }, { "epoch": 36.00309677419355, "grad_norm": 0.00823433417826891, "learning_rate": 3.025089605734767e-06, "loss": 0.1459, "step": 5640 }, { "epoch": 36.004387096774195, "grad_norm": 0.0030675220768898726, "learning_rate": 3.0107526881720433e-06, "loss": 0.1364, "step": 5650 }, { "epoch": 36.00567741935484, "grad_norm": 2.5035667419433594, "learning_rate": 2.9964157706093196e-06, "loss": 0.034, "step": 5660 }, { "epoch": 36.00696774193548, "grad_norm": 0.0277547724545002, "learning_rate": 2.9820788530465953e-06, "loss": 0.0009, "step": 5670 }, { "epoch": 36.00825806451613, "grad_norm": 0.026789885014295578, "learning_rate": 2.967741935483871e-06, "loss": 0.0138, "step": 5680 }, { "epoch": 36.00954838709677, "grad_norm": 2.1961116790771484, "learning_rate": 2.953405017921147e-06, "loss": 0.0018, "step": 5690 }, { "epoch": 36.01083870967742, "grad_norm": 0.0160366278141737, "learning_rate": 2.939068100358423e-06, "loss": 0.2965, "step": 5700 }, { "epoch": 36.012129032258066, "grad_norm": 4.776129245758057, "learning_rate": 2.9247311827956993e-06, "loss": 0.1323, "step": 5710 }, { "epoch": 36.01341935483871, "grad_norm": 0.012811361812055111, "learning_rate": 2.910394265232975e-06, "loss": 0.0178, "step": 5720 }, { "epoch": 36.014709677419354, "grad_norm": 0.033293869346380234, "learning_rate": 2.8960573476702513e-06, "loss": 0.0002, "step": 5730 }, { "epoch": 36.016, "grad_norm": 0.007899209856987, "learning_rate": 2.881720430107527e-06, "loss": 0.0103, "step": 5740 }, { "epoch": 36.01729032258064, "grad_norm": 1.5940768718719482, "learning_rate": 2.867383512544803e-06, "loss": 0.1847, "step": 5750 }, { "epoch": 36.01858064516129, "grad_norm": 0.01006957795470953, "learning_rate": 2.853046594982079e-06, "loss": 0.0005, "step": 5760 }, { "epoch": 36.01987096774194, "grad_norm": 0.03862006217241287, "learning_rate": 2.8387096774193553e-06, "loss": 0.0282, "step": 5770 }, { "epoch": 36.02012903225806, "eval_accuracy": 0.782608695652174, "eval_loss": 1.3097363710403442, "eval_runtime": 11.4795, "eval_samples_per_second": 4.007, "eval_steps_per_second": 0.871, "step": 5772 }, { "epoch": 37.00103225806452, "grad_norm": 0.20495983958244324, "learning_rate": 2.824372759856631e-06, "loss": 0.205, "step": 5780 }, { "epoch": 37.00232258064516, "grad_norm": 0.004989553242921829, "learning_rate": 2.8100358422939073e-06, "loss": 0.1046, "step": 5790 }, { "epoch": 37.00361290322581, "grad_norm": 0.03228915482759476, "learning_rate": 2.7956989247311827e-06, "loss": 0.1067, "step": 5800 }, { "epoch": 37.00490322580645, "grad_norm": 0.006522230803966522, "learning_rate": 2.781362007168459e-06, "loss": 0.0027, "step": 5810 }, { "epoch": 37.006193548387095, "grad_norm": 0.010635578073561192, "learning_rate": 2.767025089605735e-06, "loss": 0.0004, "step": 5820 }, { "epoch": 37.00748387096774, "grad_norm": 0.016306089237332344, "learning_rate": 2.752688172043011e-06, "loss": 0.0311, "step": 5830 }, { "epoch": 37.00877419354839, "grad_norm": 7.976202964782715, "learning_rate": 2.738351254480287e-06, "loss": 0.0008, "step": 5840 }, { "epoch": 37.010064516129034, "grad_norm": 0.1106325313448906, "learning_rate": 2.7240143369175633e-06, "loss": 0.1163, "step": 5850 }, { "epoch": 37.01135483870968, "grad_norm": 0.01112562045454979, "learning_rate": 2.709677419354839e-06, "loss": 0.0293, "step": 5860 }, { "epoch": 37.01264516129032, "grad_norm": 0.008133800700306892, "learning_rate": 2.695340501792115e-06, "loss": 0.1363, "step": 5870 }, { "epoch": 37.013935483870966, "grad_norm": 0.03337938338518143, "learning_rate": 2.6810035842293906e-06, "loss": 0.0012, "step": 5880 }, { "epoch": 37.01522580645161, "grad_norm": 0.0005771717987954617, "learning_rate": 2.666666666666667e-06, "loss": 0.0445, "step": 5890 }, { "epoch": 37.01651612903226, "grad_norm": 0.021181073039770126, "learning_rate": 2.652329749103943e-06, "loss": 0.0315, "step": 5900 }, { "epoch": 37.017806451612906, "grad_norm": 1.9666517972946167, "learning_rate": 2.637992831541219e-06, "loss": 0.0678, "step": 5910 }, { "epoch": 37.01909677419355, "grad_norm": 0.02967059426009655, "learning_rate": 2.623655913978495e-06, "loss": 0.1689, "step": 5920 }, { "epoch": 37.02012903225806, "eval_accuracy": 0.6739130434782609, "eval_loss": 1.6520096063613892, "eval_runtime": 11.5878, "eval_samples_per_second": 3.97, "eval_steps_per_second": 0.863, "step": 5928 }, { "epoch": 38.00025806451613, "grad_norm": 0.47885993123054504, "learning_rate": 2.6093189964157704e-06, "loss": 0.2587, "step": 5930 }, { "epoch": 38.001548387096776, "grad_norm": 0.7331668734550476, "learning_rate": 2.5949820788530466e-06, "loss": 0.1019, "step": 5940 }, { "epoch": 38.00283870967742, "grad_norm": 0.0029360023327171803, "learning_rate": 2.580645161290323e-06, "loss": 0.0446, "step": 5950 }, { "epoch": 38.004129032258064, "grad_norm": 0.0046666692942380905, "learning_rate": 2.5663082437275986e-06, "loss": 0.022, "step": 5960 }, { "epoch": 38.00541935483871, "grad_norm": 0.3813575506210327, "learning_rate": 2.551971326164875e-06, "loss": 0.0098, "step": 5970 }, { "epoch": 38.00670967741935, "grad_norm": 0.04312867671251297, "learning_rate": 2.537634408602151e-06, "loss": 0.1297, "step": 5980 }, { "epoch": 38.008, "grad_norm": 0.006952712312340736, "learning_rate": 2.5232974910394272e-06, "loss": 0.1184, "step": 5990 }, { "epoch": 38.00929032258065, "grad_norm": 0.002405844395980239, "learning_rate": 2.5089605734767026e-06, "loss": 0.0128, "step": 6000 }, { "epoch": 38.01058064516129, "grad_norm": 0.0011628689244389534, "learning_rate": 2.4946236559139788e-06, "loss": 0.112, "step": 6010 }, { "epoch": 38.011870967741935, "grad_norm": 0.0034226062707602978, "learning_rate": 2.4802867383512546e-06, "loss": 0.0763, "step": 6020 }, { "epoch": 38.01316129032258, "grad_norm": 0.21437561511993408, "learning_rate": 2.4659498207885308e-06, "loss": 0.0003, "step": 6030 }, { "epoch": 38.01445161290322, "grad_norm": 0.007954117842018604, "learning_rate": 2.4516129032258066e-06, "loss": 0.0002, "step": 6040 }, { "epoch": 38.015741935483874, "grad_norm": 1.3664933443069458, "learning_rate": 2.4372759856630828e-06, "loss": 0.0005, "step": 6050 }, { "epoch": 38.01703225806452, "grad_norm": 0.0019887620583176613, "learning_rate": 2.4229390681003586e-06, "loss": 0.0255, "step": 6060 }, { "epoch": 38.01832258064516, "grad_norm": 0.01791042648255825, "learning_rate": 2.4086021505376348e-06, "loss": 0.0183, "step": 6070 }, { "epoch": 38.019612903225806, "grad_norm": 0.009486278519034386, "learning_rate": 2.3942652329749105e-06, "loss": 0.0005, "step": 6080 }, { "epoch": 38.02012903225806, "eval_accuracy": 0.6739130434782609, "eval_loss": 1.9083939790725708, "eval_runtime": 12.3171, "eval_samples_per_second": 3.735, "eval_steps_per_second": 0.812, "step": 6084 }, { "epoch": 39.00077419354839, "grad_norm": 183.52352905273438, "learning_rate": 2.3799283154121868e-06, "loss": 0.1101, "step": 6090 }, { "epoch": 39.00206451612903, "grad_norm": 1.2428672313690186, "learning_rate": 2.3655913978494625e-06, "loss": 0.0029, "step": 6100 }, { "epoch": 39.003354838709676, "grad_norm": 0.02754882536828518, "learning_rate": 2.3512544802867383e-06, "loss": 0.0124, "step": 6110 }, { "epoch": 39.00464516129032, "grad_norm": 0.0068018557503819466, "learning_rate": 2.3369175627240145e-06, "loss": 0.0658, "step": 6120 }, { "epoch": 39.00593548387097, "grad_norm": 0.054412633180618286, "learning_rate": 2.3225806451612907e-06, "loss": 0.1474, "step": 6130 }, { "epoch": 39.007225806451615, "grad_norm": 0.004437610507011414, "learning_rate": 2.3082437275985665e-06, "loss": 0.0002, "step": 6140 }, { "epoch": 39.00851612903226, "grad_norm": 0.014133200980722904, "learning_rate": 2.2939068100358423e-06, "loss": 0.2054, "step": 6150 }, { "epoch": 39.0098064516129, "grad_norm": 11.847437858581543, "learning_rate": 2.2795698924731185e-06, "loss": 0.0033, "step": 6160 }, { "epoch": 39.01109677419355, "grad_norm": 0.002006458817049861, "learning_rate": 2.2652329749103943e-06, "loss": 0.0001, "step": 6170 }, { "epoch": 39.01238709677419, "grad_norm": 0.6751833558082581, "learning_rate": 2.2508960573476705e-06, "loss": 0.1604, "step": 6180 }, { "epoch": 39.01367741935484, "grad_norm": 3.2521636486053467, "learning_rate": 2.2365591397849463e-06, "loss": 0.0005, "step": 6190 }, { "epoch": 39.014967741935486, "grad_norm": 0.006344661116600037, "learning_rate": 2.222222222222222e-06, "loss": 0.1238, "step": 6200 }, { "epoch": 39.01625806451613, "grad_norm": 0.3917979896068573, "learning_rate": 2.2078853046594983e-06, "loss": 0.2527, "step": 6210 }, { "epoch": 39.017548387096774, "grad_norm": 0.008893150836229324, "learning_rate": 2.1935483870967745e-06, "loss": 0.1878, "step": 6220 }, { "epoch": 39.01883870967742, "grad_norm": 2.086853265762329, "learning_rate": 2.1792114695340507e-06, "loss": 0.0088, "step": 6230 }, { "epoch": 39.02012903225806, "grad_norm": 0.04705026000738144, "learning_rate": 2.1648745519713265e-06, "loss": 0.1626, "step": 6240 }, { "epoch": 39.02012903225806, "eval_accuracy": 0.8043478260869565, "eval_loss": 1.3166276216506958, "eval_runtime": 11.478, "eval_samples_per_second": 4.008, "eval_steps_per_second": 0.871, "step": 6240 }, { "epoch": 40.001290322580644, "grad_norm": 0.2208189070224762, "learning_rate": 2.1505376344086023e-06, "loss": 0.0886, "step": 6250 }, { "epoch": 40.00258064516129, "grad_norm": 235.0808868408203, "learning_rate": 2.1362007168458785e-06, "loss": 0.0342, "step": 6260 }, { "epoch": 40.00387096774193, "grad_norm": 0.16445375978946686, "learning_rate": 2.1218637992831543e-06, "loss": 0.0022, "step": 6270 }, { "epoch": 40.00516129032258, "grad_norm": 127.58007049560547, "learning_rate": 2.1075268817204305e-06, "loss": 0.0091, "step": 6280 }, { "epoch": 40.00645161290323, "grad_norm": 91.26319885253906, "learning_rate": 2.0931899641577063e-06, "loss": 0.0105, "step": 6290 }, { "epoch": 40.00774193548387, "grad_norm": 0.012966694310307503, "learning_rate": 2.078853046594982e-06, "loss": 0.0391, "step": 6300 }, { "epoch": 40.009032258064515, "grad_norm": 0.0035923051182180643, "learning_rate": 2.0645161290322582e-06, "loss": 0.0009, "step": 6310 }, { "epoch": 40.01032258064516, "grad_norm": 0.008221358060836792, "learning_rate": 2.0501792114695345e-06, "loss": 0.1235, "step": 6320 }, { "epoch": 40.0116129032258, "grad_norm": 238.31866455078125, "learning_rate": 2.0358422939068102e-06, "loss": 0.1389, "step": 6330 }, { "epoch": 40.012903225806454, "grad_norm": 0.04064839705824852, "learning_rate": 2.021505376344086e-06, "loss": 0.0235, "step": 6340 }, { "epoch": 40.0141935483871, "grad_norm": 0.008291717618703842, "learning_rate": 2.0071684587813622e-06, "loss": 0.0001, "step": 6350 }, { "epoch": 40.01548387096774, "grad_norm": 0.009570412337779999, "learning_rate": 1.992831541218638e-06, "loss": 0.1065, "step": 6360 }, { "epoch": 40.016774193548386, "grad_norm": 0.10300207138061523, "learning_rate": 1.9784946236559142e-06, "loss": 0.182, "step": 6370 }, { "epoch": 40.01806451612903, "grad_norm": 0.8126786351203918, "learning_rate": 1.96415770609319e-06, "loss": 0.1568, "step": 6380 }, { "epoch": 40.019354838709674, "grad_norm": 157.0272216796875, "learning_rate": 1.9498207885304662e-06, "loss": 0.08, "step": 6390 }, { "epoch": 40.02012903225806, "eval_accuracy": 0.8043478260869565, "eval_loss": 1.4161419868469238, "eval_runtime": 12.1488, "eval_samples_per_second": 3.786, "eval_steps_per_second": 0.823, "step": 6396 }, { "epoch": 41.000516129032256, "grad_norm": 0.014858972281217575, "learning_rate": 1.935483870967742e-06, "loss": 0.0648, "step": 6400 }, { "epoch": 41.0018064516129, "grad_norm": 0.01136968843638897, "learning_rate": 1.921146953405018e-06, "loss": 0.1669, "step": 6410 }, { "epoch": 41.00309677419355, "grad_norm": 0.0023584014270454645, "learning_rate": 1.9068100358422942e-06, "loss": 0.0002, "step": 6420 }, { "epoch": 41.004387096774195, "grad_norm": 10.562312126159668, "learning_rate": 1.89247311827957e-06, "loss": 0.1544, "step": 6430 }, { "epoch": 41.00567741935484, "grad_norm": 0.003775796154513955, "learning_rate": 1.878136200716846e-06, "loss": 0.0013, "step": 6440 }, { "epoch": 41.00696774193548, "grad_norm": 0.007018662989139557, "learning_rate": 1.8637992831541222e-06, "loss": 0.2684, "step": 6450 }, { "epoch": 41.00825806451613, "grad_norm": 0.008144672028720379, "learning_rate": 1.849462365591398e-06, "loss": 0.0038, "step": 6460 }, { "epoch": 41.00954838709677, "grad_norm": 0.8660280704498291, "learning_rate": 1.835125448028674e-06, "loss": 0.0005, "step": 6470 }, { "epoch": 41.01083870967742, "grad_norm": 0.00585430720821023, "learning_rate": 1.82078853046595e-06, "loss": 0.0049, "step": 6480 }, { "epoch": 41.012129032258066, "grad_norm": 283.7451477050781, "learning_rate": 1.8064516129032258e-06, "loss": 0.0955, "step": 6490 }, { "epoch": 41.01341935483871, "grad_norm": 0.007923400029540062, "learning_rate": 1.792114695340502e-06, "loss": 0.0135, "step": 6500 }, { "epoch": 41.014709677419354, "grad_norm": 77.85164642333984, "learning_rate": 1.777777777777778e-06, "loss": 0.1103, "step": 6510 }, { "epoch": 41.016, "grad_norm": 0.00663497531786561, "learning_rate": 1.7634408602150537e-06, "loss": 0.0019, "step": 6520 }, { "epoch": 41.01729032258064, "grad_norm": 0.014023227617144585, "learning_rate": 1.74910394265233e-06, "loss": 0.1251, "step": 6530 }, { "epoch": 41.01858064516129, "grad_norm": 0.004419019911438227, "learning_rate": 1.734767025089606e-06, "loss": 0.0001, "step": 6540 }, { "epoch": 41.01987096774194, "grad_norm": 0.004604637157171965, "learning_rate": 1.720430107526882e-06, "loss": 0.062, "step": 6550 }, { "epoch": 41.02012903225806, "eval_accuracy": 0.7391304347826086, "eval_loss": 1.4863042831420898, "eval_runtime": 13.1887, "eval_samples_per_second": 3.488, "eval_steps_per_second": 0.758, "step": 6552 }, { "epoch": 42.00103225806452, "grad_norm": 0.3599920868873596, "learning_rate": 1.7060931899641577e-06, "loss": 0.0281, "step": 6560 }, { "epoch": 42.00232258064516, "grad_norm": 0.02549358271062374, "learning_rate": 1.691756272401434e-06, "loss": 0.0002, "step": 6570 }, { "epoch": 42.00361290322581, "grad_norm": 0.0028553956653922796, "learning_rate": 1.67741935483871e-06, "loss": 0.1049, "step": 6580 }, { "epoch": 42.00490322580645, "grad_norm": 0.0029041580855846405, "learning_rate": 1.6630824372759857e-06, "loss": 0.0182, "step": 6590 }, { "epoch": 42.006193548387095, "grad_norm": 0.0037799286656081676, "learning_rate": 1.6487455197132617e-06, "loss": 0.0333, "step": 6600 }, { "epoch": 42.00748387096774, "grad_norm": 0.006967680528759956, "learning_rate": 1.634408602150538e-06, "loss": 0.0809, "step": 6610 }, { "epoch": 42.00877419354839, "grad_norm": 0.0073914541862905025, "learning_rate": 1.6200716845878137e-06, "loss": 0.0006, "step": 6620 }, { "epoch": 42.010064516129034, "grad_norm": 1.7508246898651123, "learning_rate": 1.6057347670250897e-06, "loss": 0.0753, "step": 6630 }, { "epoch": 42.01135483870968, "grad_norm": 0.030523056164383888, "learning_rate": 1.591397849462366e-06, "loss": 0.156, "step": 6640 }, { "epoch": 42.01264516129032, "grad_norm": 0.00355586432851851, "learning_rate": 1.5770609318996417e-06, "loss": 0.171, "step": 6650 }, { "epoch": 42.013935483870966, "grad_norm": 0.0012914433609694242, "learning_rate": 1.5627240143369177e-06, "loss": 0.0032, "step": 6660 }, { "epoch": 42.01522580645161, "grad_norm": 2.320455551147461, "learning_rate": 1.5483870967741937e-06, "loss": 0.06, "step": 6670 }, { "epoch": 42.01651612903226, "grad_norm": 0.12741488218307495, "learning_rate": 1.5340501792114695e-06, "loss": 0.0008, "step": 6680 }, { "epoch": 42.017806451612906, "grad_norm": 0.21855418384075165, "learning_rate": 1.5197132616487457e-06, "loss": 0.0503, "step": 6690 }, { "epoch": 42.01909677419355, "grad_norm": 0.011004120111465454, "learning_rate": 1.5053763440860217e-06, "loss": 0.1915, "step": 6700 }, { "epoch": 42.02012903225806, "eval_accuracy": 0.6521739130434783, "eval_loss": 1.6604118347167969, "eval_runtime": 12.9102, "eval_samples_per_second": 3.563, "eval_steps_per_second": 0.775, "step": 6708 }, { "epoch": 43.00025806451613, "grad_norm": 0.00805247388780117, "learning_rate": 1.4910394265232977e-06, "loss": 0.0003, "step": 6710 }, { "epoch": 43.001548387096776, "grad_norm": 0.19692690670490265, "learning_rate": 1.4767025089605735e-06, "loss": 0.0692, "step": 6720 }, { "epoch": 43.00283870967742, "grad_norm": 0.4546060562133789, "learning_rate": 1.4623655913978497e-06, "loss": 0.1384, "step": 6730 }, { "epoch": 43.004129032258064, "grad_norm": 0.004471180960536003, "learning_rate": 1.4480286738351257e-06, "loss": 0.0001, "step": 6740 }, { "epoch": 43.00541935483871, "grad_norm": 0.057842936366796494, "learning_rate": 1.4336917562724014e-06, "loss": 0.0005, "step": 6750 }, { "epoch": 43.00670967741935, "grad_norm": 0.020293984562158585, "learning_rate": 1.4193548387096776e-06, "loss": 0.0003, "step": 6760 }, { "epoch": 43.008, "grad_norm": 0.007428640499711037, "learning_rate": 1.4050179211469536e-06, "loss": 0.2642, "step": 6770 }, { "epoch": 43.00929032258065, "grad_norm": 0.006076927296817303, "learning_rate": 1.3906810035842294e-06, "loss": 0.0575, "step": 6780 }, { "epoch": 43.01058064516129, "grad_norm": 0.00406764866784215, "learning_rate": 1.3763440860215054e-06, "loss": 0.2648, "step": 6790 }, { "epoch": 43.011870967741935, "grad_norm": 1.9007959365844727, "learning_rate": 1.3620071684587816e-06, "loss": 0.0142, "step": 6800 }, { "epoch": 43.01316129032258, "grad_norm": 0.014039999805390835, "learning_rate": 1.3476702508960574e-06, "loss": 0.0051, "step": 6810 }, { "epoch": 43.01445161290322, "grad_norm": 0.001595507375895977, "learning_rate": 1.3333333333333334e-06, "loss": 0.1087, "step": 6820 }, { "epoch": 43.015741935483874, "grad_norm": 157.6433563232422, "learning_rate": 1.3189964157706094e-06, "loss": 0.0838, "step": 6830 }, { "epoch": 43.01703225806452, "grad_norm": 98.38020324707031, "learning_rate": 1.3046594982078852e-06, "loss": 0.0305, "step": 6840 }, { "epoch": 43.01832258064516, "grad_norm": 0.012816263362765312, "learning_rate": 1.2903225806451614e-06, "loss": 0.0001, "step": 6850 }, { "epoch": 43.019612903225806, "grad_norm": 0.0019207666628062725, "learning_rate": 1.2759856630824374e-06, "loss": 0.365, "step": 6860 }, { "epoch": 43.02012903225806, "eval_accuracy": 0.8260869565217391, "eval_loss": 1.4168778657913208, "eval_runtime": 12.0728, "eval_samples_per_second": 3.81, "eval_steps_per_second": 0.828, "step": 6864 }, { "epoch": 44.00077419354839, "grad_norm": 0.01464733388274908, "learning_rate": 1.2616487455197136e-06, "loss": 0.0036, "step": 6870 }, { "epoch": 44.00206451612903, "grad_norm": 0.051096364855766296, "learning_rate": 1.2473118279569894e-06, "loss": 0.0629, "step": 6880 }, { "epoch": 44.003354838709676, "grad_norm": 12.162487030029297, "learning_rate": 1.2329749103942654e-06, "loss": 0.052, "step": 6890 }, { "epoch": 44.00464516129032, "grad_norm": 0.00528025534003973, "learning_rate": 1.2186379928315414e-06, "loss": 0.0001, "step": 6900 }, { "epoch": 44.00593548387097, "grad_norm": 0.0016480134800076485, "learning_rate": 1.2043010752688174e-06, "loss": 0.1353, "step": 6910 }, { "epoch": 44.007225806451615, "grad_norm": 0.0034174532629549503, "learning_rate": 1.1899641577060934e-06, "loss": 0.0068, "step": 6920 }, { "epoch": 44.00851612903226, "grad_norm": 0.004657174460589886, "learning_rate": 1.1756272401433692e-06, "loss": 0.0002, "step": 6930 }, { "epoch": 44.0098064516129, "grad_norm": 0.0068703750148415565, "learning_rate": 1.1612903225806454e-06, "loss": 0.0002, "step": 6940 }, { "epoch": 44.01109677419355, "grad_norm": 7.4889936447143555, "learning_rate": 1.1469534050179212e-06, "loss": 0.0021, "step": 6950 }, { "epoch": 44.01238709677419, "grad_norm": 0.3485462963581085, "learning_rate": 1.1326164874551971e-06, "loss": 0.0055, "step": 6960 }, { "epoch": 44.01367741935484, "grad_norm": 0.01862538419663906, "learning_rate": 1.1182795698924731e-06, "loss": 0.0014, "step": 6970 }, { "epoch": 44.014967741935486, "grad_norm": 0.002196903107687831, "learning_rate": 1.1039426523297491e-06, "loss": 0.1788, "step": 6980 }, { "epoch": 44.01625806451613, "grad_norm": 0.0016718535916879773, "learning_rate": 1.0896057347670253e-06, "loss": 0.3165, "step": 6990 }, { "epoch": 44.017548387096774, "grad_norm": 0.0007507894770242274, "learning_rate": 1.0752688172043011e-06, "loss": 0.0037, "step": 7000 }, { "epoch": 44.01883870967742, "grad_norm": 0.8214715719223022, "learning_rate": 1.0609318996415771e-06, "loss": 0.0959, "step": 7010 }, { "epoch": 44.02012903225806, "grad_norm": 0.009730805642902851, "learning_rate": 1.0465949820788531e-06, "loss": 0.0001, "step": 7020 }, { "epoch": 44.02012903225806, "eval_accuracy": 0.8043478260869565, "eval_loss": 1.4883445501327515, "eval_runtime": 13.2102, "eval_samples_per_second": 3.482, "eval_steps_per_second": 0.757, "step": 7020 }, { "epoch": 45.001290322580644, "grad_norm": 0.003847012296319008, "learning_rate": 1.0322580645161291e-06, "loss": 0.0009, "step": 7030 }, { "epoch": 45.00258064516129, "grad_norm": 0.046989716589450836, "learning_rate": 1.0179211469534051e-06, "loss": 0.1967, "step": 7040 }, { "epoch": 45.00387096774193, "grad_norm": 0.0026057204231619835, "learning_rate": 1.0035842293906811e-06, "loss": 0.1732, "step": 7050 }, { "epoch": 45.00516129032258, "grad_norm": 0.010523968376219273, "learning_rate": 9.892473118279571e-07, "loss": 0.0002, "step": 7060 }, { "epoch": 45.00645161290323, "grad_norm": 67.8198471069336, "learning_rate": 9.749103942652331e-07, "loss": 0.0067, "step": 7070 }, { "epoch": 45.00774193548387, "grad_norm": 0.006785286590456963, "learning_rate": 9.60573476702509e-07, "loss": 0.0234, "step": 7080 }, { "epoch": 45.009032258064515, "grad_norm": 223.58937072753906, "learning_rate": 9.46236559139785e-07, "loss": 0.0627, "step": 7090 }, { "epoch": 45.01032258064516, "grad_norm": 3.282024383544922, "learning_rate": 9.318996415770611e-07, "loss": 0.019, "step": 7100 }, { "epoch": 45.0116129032258, "grad_norm": 0.9105473756790161, "learning_rate": 9.17562724014337e-07, "loss": 0.0003, "step": 7110 }, { "epoch": 45.012903225806454, "grad_norm": 0.008867247961461544, "learning_rate": 9.032258064516129e-07, "loss": 0.0001, "step": 7120 }, { "epoch": 45.0141935483871, "grad_norm": 0.04539462924003601, "learning_rate": 8.88888888888889e-07, "loss": 0.2999, "step": 7130 }, { "epoch": 45.01548387096774, "grad_norm": 0.0017706549260765314, "learning_rate": 8.74551971326165e-07, "loss": 0.0021, "step": 7140 }, { "epoch": 45.016774193548386, "grad_norm": 1.5139387845993042, "learning_rate": 8.60215053763441e-07, "loss": 0.0678, "step": 7150 }, { "epoch": 45.01806451612903, "grad_norm": 0.008817179128527641, "learning_rate": 8.45878136200717e-07, "loss": 0.0202, "step": 7160 }, { "epoch": 45.019354838709674, "grad_norm": 0.012126709334552288, "learning_rate": 8.315412186379929e-07, "loss": 0.0014, "step": 7170 }, { "epoch": 45.02012903225806, "eval_accuracy": 0.8043478260869565, "eval_loss": 1.5121612548828125, "eval_runtime": 12.8985, "eval_samples_per_second": 3.566, "eval_steps_per_second": 0.775, "step": 7176 }, { "epoch": 46.000516129032256, "grad_norm": 0.0038337453734129667, "learning_rate": 8.17204301075269e-07, "loss": 0.008, "step": 7180 }, { "epoch": 46.0018064516129, "grad_norm": 0.000946657033637166, "learning_rate": 8.028673835125448e-07, "loss": 0.005, "step": 7190 }, { "epoch": 46.00309677419355, "grad_norm": 0.0015710624866187572, "learning_rate": 7.885304659498208e-07, "loss": 0.0418, "step": 7200 }, { "epoch": 46.004387096774195, "grad_norm": 0.08825144171714783, "learning_rate": 7.741935483870968e-07, "loss": 0.0006, "step": 7210 }, { "epoch": 46.00567741935484, "grad_norm": 0.0015074616530910134, "learning_rate": 7.598566308243728e-07, "loss": 0.0001, "step": 7220 }, { "epoch": 46.00696774193548, "grad_norm": 177.5682373046875, "learning_rate": 7.455197132616488e-07, "loss": 0.3399, "step": 7230 }, { "epoch": 46.00825806451613, "grad_norm": 0.007398999761790037, "learning_rate": 7.311827956989248e-07, "loss": 0.0003, "step": 7240 }, { "epoch": 46.00954838709677, "grad_norm": 0.004576251842081547, "learning_rate": 7.168458781362007e-07, "loss": 0.0001, "step": 7250 }, { "epoch": 46.01083870967742, "grad_norm": 0.012000703252851963, "learning_rate": 7.025089605734768e-07, "loss": 0.0002, "step": 7260 }, { "epoch": 46.012129032258066, "grad_norm": 0.02267647720873356, "learning_rate": 6.881720430107527e-07, "loss": 0.0001, "step": 7270 }, { "epoch": 46.01341935483871, "grad_norm": 0.003983432427048683, "learning_rate": 6.738351254480287e-07, "loss": 0.0001, "step": 7280 }, { "epoch": 46.014709677419354, "grad_norm": 0.0014599552378058434, "learning_rate": 6.594982078853047e-07, "loss": 0.0001, "step": 7290 }, { "epoch": 46.016, "grad_norm": 6.292613506317139, "learning_rate": 6.451612903225807e-07, "loss": 0.018, "step": 7300 }, { "epoch": 46.01729032258064, "grad_norm": 0.006310159340500832, "learning_rate": 6.308243727598568e-07, "loss": 0.0064, "step": 7310 }, { "epoch": 46.01858064516129, "grad_norm": 0.007816086523234844, "learning_rate": 6.164874551971327e-07, "loss": 0.0061, "step": 7320 }, { "epoch": 46.01987096774194, "grad_norm": 34.82604217529297, "learning_rate": 6.021505376344087e-07, "loss": 0.0024, "step": 7330 }, { "epoch": 46.02012903225806, "eval_accuracy": 0.8043478260869565, "eval_loss": 1.4807806015014648, "eval_runtime": 12.8143, "eval_samples_per_second": 3.59, "eval_steps_per_second": 0.78, "step": 7332 }, { "epoch": 47.00103225806452, "grad_norm": 0.002470318228006363, "learning_rate": 5.878136200716846e-07, "loss": 0.1209, "step": 7340 }, { "epoch": 47.00232258064516, "grad_norm": 53.0703010559082, "learning_rate": 5.734767025089606e-07, "loss": 0.0066, "step": 7350 }, { "epoch": 47.00361290322581, "grad_norm": 1.9326198101043701, "learning_rate": 5.591397849462366e-07, "loss": 0.015, "step": 7360 }, { "epoch": 47.00490322580645, "grad_norm": 0.02357548102736473, "learning_rate": 5.448028673835127e-07, "loss": 0.0001, "step": 7370 }, { "epoch": 47.006193548387095, "grad_norm": 0.0012209382839500904, "learning_rate": 5.304659498207886e-07, "loss": 0.2121, "step": 7380 }, { "epoch": 47.00748387096774, "grad_norm": 0.008492130786180496, "learning_rate": 5.161290322580646e-07, "loss": 0.1444, "step": 7390 }, { "epoch": 47.00877419354839, "grad_norm": 0.005266561638563871, "learning_rate": 5.017921146953406e-07, "loss": 0.346, "step": 7400 }, { "epoch": 47.010064516129034, "grad_norm": 0.0006817293469794095, "learning_rate": 4.874551971326166e-07, "loss": 0.0002, "step": 7410 }, { "epoch": 47.01135483870968, "grad_norm": 0.004157207906246185, "learning_rate": 4.731182795698925e-07, "loss": 0.0001, "step": 7420 }, { "epoch": 47.01264516129032, "grad_norm": 0.0016743586165830493, "learning_rate": 4.587813620071685e-07, "loss": 0.0029, "step": 7430 }, { "epoch": 47.013935483870966, "grad_norm": 0.6975991129875183, "learning_rate": 4.444444444444445e-07, "loss": 0.0021, "step": 7440 }, { "epoch": 47.01522580645161, "grad_norm": 0.0034498583991080523, "learning_rate": 4.301075268817205e-07, "loss": 0.0415, "step": 7450 }, { "epoch": 47.01651612903226, "grad_norm": 0.0027017593383789062, "learning_rate": 4.1577060931899643e-07, "loss": 0.0157, "step": 7460 }, { "epoch": 47.017806451612906, "grad_norm": 0.012391701340675354, "learning_rate": 4.014336917562724e-07, "loss": 0.0434, "step": 7470 }, { "epoch": 47.01909677419355, "grad_norm": 0.0006409501074813306, "learning_rate": 3.870967741935484e-07, "loss": 0.0001, "step": 7480 }, { "epoch": 47.02012903225806, "eval_accuracy": 0.782608695652174, "eval_loss": 1.4889546632766724, "eval_runtime": 13.1368, "eval_samples_per_second": 3.502, "eval_steps_per_second": 0.761, "step": 7488 }, { "epoch": 48.00025806451613, "grad_norm": 0.0067320214584469795, "learning_rate": 3.727598566308244e-07, "loss": 0.0002, "step": 7490 }, { "epoch": 48.001548387096776, "grad_norm": 0.020699143409729004, "learning_rate": 3.5842293906810036e-07, "loss": 0.0164, "step": 7500 }, { "epoch": 48.00283870967742, "grad_norm": 0.004197176545858383, "learning_rate": 3.4408602150537636e-07, "loss": 0.0008, "step": 7510 }, { "epoch": 48.004129032258064, "grad_norm": 0.21093904972076416, "learning_rate": 3.2974910394265235e-07, "loss": 0.002, "step": 7520 }, { "epoch": 48.00541935483871, "grad_norm": 0.0005535822710953653, "learning_rate": 3.154121863799284e-07, "loss": 0.0003, "step": 7530 }, { "epoch": 48.00670967741935, "grad_norm": 0.007659395691007376, "learning_rate": 3.0107526881720434e-07, "loss": 0.0027, "step": 7540 }, { "epoch": 48.008, "grad_norm": 0.07470550388097763, "learning_rate": 2.867383512544803e-07, "loss": 0.053, "step": 7550 }, { "epoch": 48.00929032258065, "grad_norm": 0.002228489378467202, "learning_rate": 2.7240143369175634e-07, "loss": 0.0003, "step": 7560 }, { "epoch": 48.01058064516129, "grad_norm": 0.0019506957614794374, "learning_rate": 2.580645161290323e-07, "loss": 0.0004, "step": 7570 }, { "epoch": 48.011870967741935, "grad_norm": 0.0018325488781556487, "learning_rate": 2.437275985663083e-07, "loss": 0.0791, "step": 7580 }, { "epoch": 48.01316129032258, "grad_norm": 0.00621037045493722, "learning_rate": 2.2939068100358425e-07, "loss": 0.0001, "step": 7590 }, { "epoch": 48.01445161290322, "grad_norm": 0.016049824655056, "learning_rate": 2.1505376344086024e-07, "loss": 0.0315, "step": 7600 }, { "epoch": 48.015741935483874, "grad_norm": 2.039865016937256, "learning_rate": 2.007168458781362e-07, "loss": 0.0004, "step": 7610 }, { "epoch": 48.01703225806452, "grad_norm": 0.001883605495095253, "learning_rate": 1.863799283154122e-07, "loss": 0.0001, "step": 7620 }, { "epoch": 48.01832258064516, "grad_norm": 0.0038601632695645094, "learning_rate": 1.7204301075268818e-07, "loss": 0.0001, "step": 7630 }, { "epoch": 48.019612903225806, "grad_norm": 0.006116045173257589, "learning_rate": 1.577060931899642e-07, "loss": 0.0001, "step": 7640 }, { "epoch": 48.02012903225806, "eval_accuracy": 0.7391304347826086, "eval_loss": 1.512891411781311, "eval_runtime": 13.0068, "eval_samples_per_second": 3.537, "eval_steps_per_second": 0.769, "step": 7644 }, { "epoch": 49.00077419354839, "grad_norm": 0.028257016092538834, "learning_rate": 1.4336917562724014e-07, "loss": 0.1925, "step": 7650 }, { "epoch": 49.00206451612903, "grad_norm": 0.008592449128627777, "learning_rate": 1.2903225806451614e-07, "loss": 0.0002, "step": 7660 }, { "epoch": 49.003354838709676, "grad_norm": 136.31967163085938, "learning_rate": 1.1469534050179212e-07, "loss": 0.3131, "step": 7670 }, { "epoch": 49.00464516129032, "grad_norm": 0.002297618892043829, "learning_rate": 1.003584229390681e-07, "loss": 0.0002, "step": 7680 }, { "epoch": 49.00593548387097, "grad_norm": 0.0033956149127334356, "learning_rate": 8.602150537634409e-08, "loss": 0.0001, "step": 7690 }, { "epoch": 49.007225806451615, "grad_norm": 0.021755073219537735, "learning_rate": 7.168458781362007e-08, "loss": 0.033, "step": 7700 }, { "epoch": 49.00851612903226, "grad_norm": 0.0010228661121800542, "learning_rate": 5.734767025089606e-08, "loss": 0.0001, "step": 7710 }, { "epoch": 49.0098064516129, "grad_norm": 0.026721712201833725, "learning_rate": 4.3010752688172045e-08, "loss": 0.0001, "step": 7720 }, { "epoch": 49.01109677419355, "grad_norm": 0.0076351589523255825, "learning_rate": 2.867383512544803e-08, "loss": 0.0003, "step": 7730 }, { "epoch": 49.01238709677419, "grad_norm": 0.006438611075282097, "learning_rate": 1.4336917562724015e-08, "loss": 0.0004, "step": 7740 }, { "epoch": 49.01367741935484, "grad_norm": 0.010015049949288368, "learning_rate": 0.0, "loss": 0.0001, "step": 7750 }, { "epoch": 49.01367741935484, "eval_accuracy": 0.7608695652173914, "eval_loss": 1.5113792419433594, "eval_runtime": 14.1006, "eval_samples_per_second": 3.262, "eval_steps_per_second": 0.709, "step": 7750 }, { "epoch": 49.01367741935484, "step": 7750, "total_flos": 1.6993710942830022e+20, "train_loss": 0.25061025346567734, "train_runtime": 31250.7993, "train_samples_per_second": 1.24, "train_steps_per_second": 0.248 }, { "epoch": 49.01367741935484, "eval_accuracy": 0.8260869565217391, "eval_loss": 0.8259707689285278, "eval_runtime": 12.109, "eval_samples_per_second": 3.799, "eval_steps_per_second": 0.826, "step": 7750 }, { "epoch": 49.01367741935484, "eval_accuracy": 0.8260869565217391, "eval_loss": 0.8259710073471069, "eval_runtime": 12.0738, "eval_samples_per_second": 3.81, "eval_steps_per_second": 0.828, "step": 7750 } ], "logging_steps": 10, "max_steps": 7750, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6993710942830022e+20, "train_batch_size": 5, "trial_name": null, "trial_params": null }