{ "best_metric": 0.80437648, "best_model_checkpoint": "/qlgy0912/llm_sft_output/qwen2_5-7b/v0-20240927-140411/checkpoint-76000", "epoch": 1.994779743070036, "eval_steps": 500, "global_step": 85500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "acc": 0.69601125, "epoch": 2.333075722888931e-05, "grad_norm": 65.0, "learning_rate": 3.888024883359254e-09, "loss": 1.2215333, "memory(GiB)": 101.91, "step": 1, "train_speed(iter/s)": 0.081993 }, { "acc": 0.6643041, "epoch": 0.0002333075722888931, "grad_norm": 52.25, "learning_rate": 3.888024883359254e-08, "loss": 1.42277463, "memory(GiB)": 103.58, "step": 10, "train_speed(iter/s)": 0.306137 }, { "acc": 0.64442787, "epoch": 0.0004666151445777862, "grad_norm": 63.25, "learning_rate": 7.776049766718508e-08, "loss": 1.4146143, "memory(GiB)": 106.04, "step": 20, "train_speed(iter/s)": 0.340883 }, { "acc": 0.6630681, "epoch": 0.0006999227168666793, "grad_norm": 58.25, "learning_rate": 1.1664074650077761e-07, "loss": 1.34018517, "memory(GiB)": 106.04, "step": 30, "train_speed(iter/s)": 0.358391 }, { "acc": 0.65009575, "epoch": 0.0009332302891555724, "grad_norm": 54.5, "learning_rate": 1.5552099533437016e-07, "loss": 1.45347633, "memory(GiB)": 106.17, "step": 40, "train_speed(iter/s)": 0.375122 }, { "acc": 0.6623395, "epoch": 0.0011665378614444655, "grad_norm": 28.25, "learning_rate": 1.944012441679627e-07, "loss": 1.3650135, "memory(GiB)": 107.6, "step": 50, "train_speed(iter/s)": 0.374611 }, { "acc": 0.65479803, "epoch": 0.0013998454337333585, "grad_norm": 192.0, "learning_rate": 2.3328149300155523e-07, "loss": 1.39007854, "memory(GiB)": 107.6, "step": 60, "train_speed(iter/s)": 0.384975 }, { "acc": 0.67412581, "epoch": 0.0016331530060222517, "grad_norm": 42.75, "learning_rate": 2.721617418351478e-07, "loss": 1.3340167, "memory(GiB)": 107.6, "step": 70, "train_speed(iter/s)": 0.392443 }, { "acc": 0.64180751, "epoch": 0.0018664605783111448, "grad_norm": 46.0, "learning_rate": 3.110419906687403e-07, "loss": 1.44007759, "memory(GiB)": 107.6, "step": 80, "train_speed(iter/s)": 0.400513 }, { "acc": 0.65562725, "epoch": 0.0020997681506000378, "grad_norm": 169.0, "learning_rate": 3.4992223950233286e-07, "loss": 1.377672, "memory(GiB)": 107.6, "step": 90, "train_speed(iter/s)": 0.404057 }, { "acc": 0.65258265, "epoch": 0.002333075722888931, "grad_norm": 201.0, "learning_rate": 3.888024883359254e-07, "loss": 1.40249634, "memory(GiB)": 107.73, "step": 100, "train_speed(iter/s)": 0.404893 }, { "acc": 0.66373754, "epoch": 0.0025663832951778242, "grad_norm": 44.5, "learning_rate": 4.2768273716951787e-07, "loss": 1.3577816, "memory(GiB)": 107.73, "step": 110, "train_speed(iter/s)": 0.406581 }, { "acc": 0.66515074, "epoch": 0.002799690867466717, "grad_norm": 69.5, "learning_rate": 4.6656298600311046e-07, "loss": 1.33736687, "memory(GiB)": 107.73, "step": 120, "train_speed(iter/s)": 0.406629 }, { "acc": 0.6434556, "epoch": 0.0030329984397556103, "grad_norm": 39.75, "learning_rate": 5.054432348367029e-07, "loss": 1.45937138, "memory(GiB)": 107.73, "step": 130, "train_speed(iter/s)": 0.409521 }, { "acc": 0.63647475, "epoch": 0.0032663060120445035, "grad_norm": 43.5, "learning_rate": 5.443234836702956e-07, "loss": 1.45315819, "memory(GiB)": 107.73, "step": 140, "train_speed(iter/s)": 0.409846 }, { "acc": 0.66497622, "epoch": 0.0034996135843333967, "grad_norm": 18.875, "learning_rate": 5.832037325038881e-07, "loss": 1.33231993, "memory(GiB)": 107.73, "step": 150, "train_speed(iter/s)": 0.41013 }, { "acc": 0.67984552, "epoch": 0.0037329211566222895, "grad_norm": 64.0, "learning_rate": 6.220839813374806e-07, "loss": 1.25875092, "memory(GiB)": 109.26, "step": 160, "train_speed(iter/s)": 0.409049 }, { "acc": 0.66304803, "epoch": 0.003966228728911183, "grad_norm": 41.25, "learning_rate": 6.609642301710731e-07, "loss": 1.35497437, "memory(GiB)": 109.26, "step": 170, "train_speed(iter/s)": 0.408629 }, { "acc": 0.6695015, "epoch": 0.0041995363012000755, "grad_norm": 24.125, "learning_rate": 6.998444790046657e-07, "loss": 1.32009106, "memory(GiB)": 109.26, "step": 180, "train_speed(iter/s)": 0.408194 }, { "acc": 0.64377413, "epoch": 0.004432843873488969, "grad_norm": 189.0, "learning_rate": 7.387247278382582e-07, "loss": 1.45081196, "memory(GiB)": 109.26, "step": 190, "train_speed(iter/s)": 0.40967 }, { "acc": 0.66846848, "epoch": 0.004666151445777862, "grad_norm": 32.25, "learning_rate": 7.776049766718508e-07, "loss": 1.34669209, "memory(GiB)": 109.26, "step": 200, "train_speed(iter/s)": 0.409516 }, { "acc": 0.66021504, "epoch": 0.004899459018066755, "grad_norm": 22.25, "learning_rate": 8.164852255054432e-07, "loss": 1.3446023, "memory(GiB)": 109.26, "step": 210, "train_speed(iter/s)": 0.40949 }, { "acc": 0.67646599, "epoch": 0.0051327665903556485, "grad_norm": 22.625, "learning_rate": 8.553654743390357e-07, "loss": 1.24624052, "memory(GiB)": 109.26, "step": 220, "train_speed(iter/s)": 0.408686 }, { "acc": 0.67372646, "epoch": 0.005366074162644541, "grad_norm": 187.0, "learning_rate": 8.942457231726284e-07, "loss": 1.31202698, "memory(GiB)": 109.26, "step": 230, "train_speed(iter/s)": 0.408586 }, { "acc": 0.67337198, "epoch": 0.005599381734933434, "grad_norm": 87.0, "learning_rate": 9.331259720062209e-07, "loss": 1.32102585, "memory(GiB)": 109.26, "step": 240, "train_speed(iter/s)": 0.409543 }, { "acc": 0.68346677, "epoch": 0.005832689307222328, "grad_norm": 19.125, "learning_rate": 9.720062208398133e-07, "loss": 1.25821629, "memory(GiB)": 109.26, "step": 250, "train_speed(iter/s)": 0.409961 }, { "acc": 0.6582798, "epoch": 0.0060659968795112205, "grad_norm": 27.0, "learning_rate": 1.0108864696734059e-06, "loss": 1.37746887, "memory(GiB)": 109.26, "step": 260, "train_speed(iter/s)": 0.410468 }, { "acc": 0.65754652, "epoch": 0.006299304451800113, "grad_norm": 30.375, "learning_rate": 1.0497667185069986e-06, "loss": 1.34862213, "memory(GiB)": 109.64, "step": 270, "train_speed(iter/s)": 0.410747 }, { "acc": 0.67330213, "epoch": 0.006532612024089007, "grad_norm": 49.0, "learning_rate": 1.0886469673405912e-06, "loss": 1.3070693, "memory(GiB)": 109.64, "step": 280, "train_speed(iter/s)": 0.411653 }, { "acc": 0.68029757, "epoch": 0.0067659195963779, "grad_norm": 34.5, "learning_rate": 1.1275272161741837e-06, "loss": 1.24388609, "memory(GiB)": 109.64, "step": 290, "train_speed(iter/s)": 0.411959 }, { "acc": 0.69043927, "epoch": 0.006999227168666793, "grad_norm": 22.875, "learning_rate": 1.1664074650077762e-06, "loss": 1.24611559, "memory(GiB)": 109.77, "step": 300, "train_speed(iter/s)": 0.411594 }, { "acc": 0.68904781, "epoch": 0.007232534740955686, "grad_norm": 12.75, "learning_rate": 1.2052877138413686e-06, "loss": 1.20471973, "memory(GiB)": 109.77, "step": 310, "train_speed(iter/s)": 0.410459 }, { "acc": 0.6853941, "epoch": 0.007465842313244579, "grad_norm": 24.875, "learning_rate": 1.2441679626749613e-06, "loss": 1.23733406, "memory(GiB)": 109.77, "step": 320, "train_speed(iter/s)": 0.411169 }, { "acc": 0.69481087, "epoch": 0.007699149885533473, "grad_norm": 85.0, "learning_rate": 1.2830482115085538e-06, "loss": 1.23310585, "memory(GiB)": 109.77, "step": 330, "train_speed(iter/s)": 0.411728 }, { "acc": 0.69631901, "epoch": 0.007932457457822365, "grad_norm": 35.0, "learning_rate": 1.3219284603421462e-06, "loss": 1.18948269, "memory(GiB)": 109.77, "step": 340, "train_speed(iter/s)": 0.412145 }, { "acc": 0.66458092, "epoch": 0.00816576503011126, "grad_norm": 17.625, "learning_rate": 1.360808709175739e-06, "loss": 1.3200798, "memory(GiB)": 109.77, "step": 350, "train_speed(iter/s)": 0.411958 }, { "acc": 0.68819141, "epoch": 0.008399072602400151, "grad_norm": 32.5, "learning_rate": 1.3996889580093314e-06, "loss": 1.22151852, "memory(GiB)": 109.77, "step": 360, "train_speed(iter/s)": 0.412697 }, { "acc": 0.68463111, "epoch": 0.008632380174689045, "grad_norm": 38.0, "learning_rate": 1.4385692068429238e-06, "loss": 1.26127262, "memory(GiB)": 109.77, "step": 370, "train_speed(iter/s)": 0.413014 }, { "acc": 0.66406565, "epoch": 0.008865687746977938, "grad_norm": 21.75, "learning_rate": 1.4774494556765165e-06, "loss": 1.33156033, "memory(GiB)": 109.77, "step": 380, "train_speed(iter/s)": 0.412519 }, { "acc": 0.67032399, "epoch": 0.00909899531926683, "grad_norm": 22.875, "learning_rate": 1.5163297045101088e-06, "loss": 1.27387772, "memory(GiB)": 109.77, "step": 390, "train_speed(iter/s)": 0.412242 }, { "acc": 0.68596916, "epoch": 0.009332302891555724, "grad_norm": 15.0, "learning_rate": 1.5552099533437016e-06, "loss": 1.23029289, "memory(GiB)": 109.77, "step": 400, "train_speed(iter/s)": 0.41303 }, { "acc": 0.68200741, "epoch": 0.009565610463844618, "grad_norm": 19.75, "learning_rate": 1.594090202177294e-06, "loss": 1.23422031, "memory(GiB)": 109.77, "step": 410, "train_speed(iter/s)": 0.413378 }, { "acc": 0.68170118, "epoch": 0.00979891803613351, "grad_norm": 23.875, "learning_rate": 1.6329704510108864e-06, "loss": 1.24763041, "memory(GiB)": 109.77, "step": 420, "train_speed(iter/s)": 0.413317 }, { "acc": 0.68019838, "epoch": 0.010032225608422403, "grad_norm": 12.5625, "learning_rate": 1.6718506998444792e-06, "loss": 1.25324621, "memory(GiB)": 109.77, "step": 430, "train_speed(iter/s)": 0.413645 }, { "acc": 0.68601837, "epoch": 0.010265533180711297, "grad_norm": 22.125, "learning_rate": 1.7107309486780715e-06, "loss": 1.21224289, "memory(GiB)": 109.77, "step": 440, "train_speed(iter/s)": 0.413557 }, { "acc": 0.70243044, "epoch": 0.010498840753000189, "grad_norm": 32.0, "learning_rate": 1.7496111975116642e-06, "loss": 1.15849266, "memory(GiB)": 109.77, "step": 450, "train_speed(iter/s)": 0.412938 }, { "acc": 0.68498225, "epoch": 0.010732148325289083, "grad_norm": 21.0, "learning_rate": 1.7884914463452568e-06, "loss": 1.22883425, "memory(GiB)": 109.77, "step": 460, "train_speed(iter/s)": 0.413629 }, { "acc": 0.71473951, "epoch": 0.010965455897577976, "grad_norm": 27.75, "learning_rate": 1.8273716951788493e-06, "loss": 1.14569626, "memory(GiB)": 109.77, "step": 470, "train_speed(iter/s)": 0.413969 }, { "acc": 0.68733125, "epoch": 0.011198763469866868, "grad_norm": 17.75, "learning_rate": 1.8662519440124418e-06, "loss": 1.21703568, "memory(GiB)": 109.77, "step": 480, "train_speed(iter/s)": 0.414002 }, { "acc": 0.68442454, "epoch": 0.011432071042155762, "grad_norm": 29.875, "learning_rate": 1.9051321928460342e-06, "loss": 1.21695004, "memory(GiB)": 109.77, "step": 490, "train_speed(iter/s)": 0.414733 }, { "acc": 0.69048681, "epoch": 0.011665378614444655, "grad_norm": 39.5, "learning_rate": 1.9440124416796267e-06, "loss": 1.17031765, "memory(GiB)": 109.77, "step": 500, "train_speed(iter/s)": 0.414989 }, { "epoch": 0.011665378614444655, "eval_acc": 0.6771181596993516, "eval_loss": 1.170393943786621, "eval_runtime": 1270.1553, "eval_samples_per_second": 28.336, "eval_steps_per_second": 14.168, "step": 500 }, { "acc": 0.68483567, "epoch": 0.011898686186733547, "grad_norm": 24.375, "learning_rate": 1.9828926905132194e-06, "loss": 1.21425552, "memory(GiB)": 112.37, "step": 510, "train_speed(iter/s)": 0.20271 }, { "acc": 0.71478772, "epoch": 0.012131993759022441, "grad_norm": 40.75, "learning_rate": 2.0217729393468118e-06, "loss": 1.08218079, "memory(GiB)": 112.37, "step": 520, "train_speed(iter/s)": 0.204723 }, { "acc": 0.69291697, "epoch": 0.012365301331311335, "grad_norm": 13.3125, "learning_rate": 2.0606531881804045e-06, "loss": 1.18659, "memory(GiB)": 112.37, "step": 530, "train_speed(iter/s)": 0.206783 }, { "acc": 0.70176544, "epoch": 0.012598608903600227, "grad_norm": 10.8125, "learning_rate": 2.0995334370139973e-06, "loss": 1.15439968, "memory(GiB)": 112.37, "step": 540, "train_speed(iter/s)": 0.208567 }, { "acc": 0.70656576, "epoch": 0.01283191647588912, "grad_norm": 24.75, "learning_rate": 2.1384136858475896e-06, "loss": 1.13584023, "memory(GiB)": 112.37, "step": 550, "train_speed(iter/s)": 0.210478 }, { "acc": 0.69827976, "epoch": 0.013065224048178014, "grad_norm": 18.375, "learning_rate": 2.1772939346811823e-06, "loss": 1.16824875, "memory(GiB)": 112.37, "step": 560, "train_speed(iter/s)": 0.212251 }, { "acc": 0.69722204, "epoch": 0.013298531620466908, "grad_norm": 20.625, "learning_rate": 2.2161741835147746e-06, "loss": 1.17354527, "memory(GiB)": 112.37, "step": 570, "train_speed(iter/s)": 0.214144 }, { "acc": 0.69391127, "epoch": 0.0135318391927558, "grad_norm": 12.25, "learning_rate": 2.2550544323483674e-06, "loss": 1.18383684, "memory(GiB)": 112.37, "step": 580, "train_speed(iter/s)": 0.215836 }, { "acc": 0.7008687, "epoch": 0.013765146765044693, "grad_norm": 21.125, "learning_rate": 2.2939346811819597e-06, "loss": 1.13180761, "memory(GiB)": 112.37, "step": 590, "train_speed(iter/s)": 0.217548 }, { "acc": 0.70401969, "epoch": 0.013998454337333587, "grad_norm": 14.1875, "learning_rate": 2.3328149300155525e-06, "loss": 1.14089346, "memory(GiB)": 112.37, "step": 600, "train_speed(iter/s)": 0.219209 }, { "acc": 0.67588034, "epoch": 0.014231761909622479, "grad_norm": 152.0, "learning_rate": 2.3716951788491448e-06, "loss": 1.282197, "memory(GiB)": 112.37, "step": 610, "train_speed(iter/s)": 0.220998 }, { "acc": 0.71631866, "epoch": 0.014465069481911372, "grad_norm": 10.0, "learning_rate": 2.410575427682737e-06, "loss": 1.09193068, "memory(GiB)": 112.37, "step": 620, "train_speed(iter/s)": 0.22271 }, { "acc": 0.70892181, "epoch": 0.014698377054200266, "grad_norm": 21.625, "learning_rate": 2.44945567651633e-06, "loss": 1.10817814, "memory(GiB)": 112.37, "step": 630, "train_speed(iter/s)": 0.224373 }, { "acc": 0.71971693, "epoch": 0.014931684626489158, "grad_norm": 22.75, "learning_rate": 2.4883359253499226e-06, "loss": 1.06725435, "memory(GiB)": 112.37, "step": 640, "train_speed(iter/s)": 0.226002 }, { "acc": 0.6958858, "epoch": 0.015164992198778052, "grad_norm": 13.5, "learning_rate": 2.527216174183515e-06, "loss": 1.14539852, "memory(GiB)": 112.37, "step": 650, "train_speed(iter/s)": 0.227501 }, { "acc": 0.7064106, "epoch": 0.015398299771066945, "grad_norm": 14.875, "learning_rate": 2.5660964230171077e-06, "loss": 1.09452753, "memory(GiB)": 112.37, "step": 660, "train_speed(iter/s)": 0.229076 }, { "acc": 0.68724928, "epoch": 0.01563160734335584, "grad_norm": 10.125, "learning_rate": 2.6049766718507004e-06, "loss": 1.17633553, "memory(GiB)": 112.37, "step": 670, "train_speed(iter/s)": 0.230642 }, { "acc": 0.71898904, "epoch": 0.01586491491564473, "grad_norm": 20.25, "learning_rate": 2.6438569206842923e-06, "loss": 1.08943672, "memory(GiB)": 112.37, "step": 680, "train_speed(iter/s)": 0.232146 }, { "acc": 0.70866594, "epoch": 0.016098222487933623, "grad_norm": 41.25, "learning_rate": 2.682737169517885e-06, "loss": 1.1266736, "memory(GiB)": 112.37, "step": 690, "train_speed(iter/s)": 0.233723 }, { "acc": 0.69511156, "epoch": 0.01633153006022252, "grad_norm": 16.75, "learning_rate": 2.721617418351478e-06, "loss": 1.15227604, "memory(GiB)": 112.37, "step": 700, "train_speed(iter/s)": 0.235191 }, { "acc": 0.69382067, "epoch": 0.01656483763251141, "grad_norm": 13.875, "learning_rate": 2.76049766718507e-06, "loss": 1.16943235, "memory(GiB)": 112.37, "step": 710, "train_speed(iter/s)": 0.236655 }, { "acc": 0.70306535, "epoch": 0.016798145204800302, "grad_norm": 55.5, "learning_rate": 2.799377916018663e-06, "loss": 1.11830854, "memory(GiB)": 112.5, "step": 720, "train_speed(iter/s)": 0.238078 }, { "acc": 0.7088171, "epoch": 0.017031452777089198, "grad_norm": 13.875, "learning_rate": 2.838258164852255e-06, "loss": 1.11293411, "memory(GiB)": 112.5, "step": 730, "train_speed(iter/s)": 0.239489 }, { "acc": 0.73532681, "epoch": 0.01726476034937809, "grad_norm": 15.375, "learning_rate": 2.8771384136858475e-06, "loss": 1.01635103, "memory(GiB)": 112.5, "step": 740, "train_speed(iter/s)": 0.240802 }, { "acc": 0.73056784, "epoch": 0.01749806792166698, "grad_norm": 8.6875, "learning_rate": 2.9160186625194403e-06, "loss": 1.03644848, "memory(GiB)": 112.5, "step": 750, "train_speed(iter/s)": 0.24216 }, { "acc": 0.71434994, "epoch": 0.017731375493955877, "grad_norm": 14.0625, "learning_rate": 2.954898911353033e-06, "loss": 1.09710608, "memory(GiB)": 112.5, "step": 760, "train_speed(iter/s)": 0.24334 }, { "acc": 0.71459484, "epoch": 0.01796468306624477, "grad_norm": 11.125, "learning_rate": 2.9937791601866257e-06, "loss": 1.10940857, "memory(GiB)": 112.5, "step": 770, "train_speed(iter/s)": 0.24472 }, { "acc": 0.72359238, "epoch": 0.01819799063853366, "grad_norm": 10.6875, "learning_rate": 3.0326594090202176e-06, "loss": 1.06084309, "memory(GiB)": 112.5, "step": 780, "train_speed(iter/s)": 0.246022 }, { "acc": 0.71572514, "epoch": 0.018431298210822556, "grad_norm": 11.8125, "learning_rate": 3.0715396578538104e-06, "loss": 1.07304935, "memory(GiB)": 112.5, "step": 790, "train_speed(iter/s)": 0.247402 }, { "acc": 0.73548241, "epoch": 0.018664605783111448, "grad_norm": 9.625, "learning_rate": 3.110419906687403e-06, "loss": 1.0010951, "memory(GiB)": 112.5, "step": 800, "train_speed(iter/s)": 0.248693 }, { "acc": 0.74713941, "epoch": 0.01889791335540034, "grad_norm": 58.0, "learning_rate": 3.1493001555209955e-06, "loss": 0.96250553, "memory(GiB)": 112.5, "step": 810, "train_speed(iter/s)": 0.249902 }, { "acc": 0.72171469, "epoch": 0.019131220927689235, "grad_norm": 25.0, "learning_rate": 3.188180404354588e-06, "loss": 1.06542034, "memory(GiB)": 112.5, "step": 820, "train_speed(iter/s)": 0.251022 }, { "acc": 0.72154565, "epoch": 0.019364528499978127, "grad_norm": 10.375, "learning_rate": 3.2270606531881805e-06, "loss": 1.06000462, "memory(GiB)": 112.5, "step": 830, "train_speed(iter/s)": 0.252274 }, { "acc": 0.72352486, "epoch": 0.01959783607226702, "grad_norm": 15.6875, "learning_rate": 3.265940902021773e-06, "loss": 1.07526264, "memory(GiB)": 112.5, "step": 840, "train_speed(iter/s)": 0.253342 }, { "acc": 0.70478067, "epoch": 0.019831143644555915, "grad_norm": 30.75, "learning_rate": 3.3048211508553656e-06, "loss": 1.0926899, "memory(GiB)": 112.5, "step": 850, "train_speed(iter/s)": 0.254406 }, { "acc": 0.7308609, "epoch": 0.020064451216844807, "grad_norm": 9.375, "learning_rate": 3.3437013996889583e-06, "loss": 1.01046867, "memory(GiB)": 112.5, "step": 860, "train_speed(iter/s)": 0.255641 }, { "acc": 0.69665556, "epoch": 0.0202977587891337, "grad_norm": 29.125, "learning_rate": 3.382581648522551e-06, "loss": 1.16045341, "memory(GiB)": 112.5, "step": 870, "train_speed(iter/s)": 0.256795 }, { "acc": 0.72442212, "epoch": 0.020531066361422594, "grad_norm": 19.875, "learning_rate": 3.421461897356143e-06, "loss": 1.05415115, "memory(GiB)": 112.5, "step": 880, "train_speed(iter/s)": 0.2579 }, { "acc": 0.71560249, "epoch": 0.020764373933711486, "grad_norm": 17.5, "learning_rate": 3.4603421461897357e-06, "loss": 1.08642292, "memory(GiB)": 112.5, "step": 890, "train_speed(iter/s)": 0.259042 }, { "acc": 0.7090786, "epoch": 0.020997681506000378, "grad_norm": 20.5, "learning_rate": 3.4992223950233285e-06, "loss": 1.12236042, "memory(GiB)": 112.5, "step": 900, "train_speed(iter/s)": 0.260196 }, { "acc": 0.72494149, "epoch": 0.021230989078289273, "grad_norm": 14.125, "learning_rate": 3.5381026438569212e-06, "loss": 1.06427841, "memory(GiB)": 112.5, "step": 910, "train_speed(iter/s)": 0.261275 }, { "acc": 0.73181691, "epoch": 0.021464296650578165, "grad_norm": 10.3125, "learning_rate": 3.5769828926905135e-06, "loss": 1.00808945, "memory(GiB)": 112.5, "step": 920, "train_speed(iter/s)": 0.26246 }, { "acc": 0.70418386, "epoch": 0.021697604222867057, "grad_norm": 9.0625, "learning_rate": 3.615863141524106e-06, "loss": 1.1433609, "memory(GiB)": 112.5, "step": 930, "train_speed(iter/s)": 0.263556 }, { "acc": 0.72886324, "epoch": 0.021930911795155952, "grad_norm": 6.9375, "learning_rate": 3.6547433903576986e-06, "loss": 1.02939377, "memory(GiB)": 112.5, "step": 940, "train_speed(iter/s)": 0.264494 }, { "acc": 0.71419945, "epoch": 0.022164219367444844, "grad_norm": 16.625, "learning_rate": 3.693623639191291e-06, "loss": 1.06986065, "memory(GiB)": 112.5, "step": 950, "train_speed(iter/s)": 0.265441 }, { "acc": 0.73321857, "epoch": 0.022397526939733736, "grad_norm": 14.3125, "learning_rate": 3.7325038880248837e-06, "loss": 1.01707897, "memory(GiB)": 112.5, "step": 960, "train_speed(iter/s)": 0.266415 }, { "acc": 0.70803647, "epoch": 0.02263083451202263, "grad_norm": 22.5, "learning_rate": 3.7713841368584764e-06, "loss": 1.15186424, "memory(GiB)": 112.5, "step": 970, "train_speed(iter/s)": 0.267359 }, { "acc": 0.7166934, "epoch": 0.022864142084311524, "grad_norm": 11.5, "learning_rate": 3.8102643856920683e-06, "loss": 1.08353691, "memory(GiB)": 112.5, "step": 980, "train_speed(iter/s)": 0.268331 }, { "acc": 0.75482187, "epoch": 0.023097449656600415, "grad_norm": 11.125, "learning_rate": 3.849144634525661e-06, "loss": 0.92038155, "memory(GiB)": 112.5, "step": 990, "train_speed(iter/s)": 0.269254 }, { "acc": 0.72072606, "epoch": 0.02333075722888931, "grad_norm": 17.375, "learning_rate": 3.888024883359253e-06, "loss": 1.032094, "memory(GiB)": 112.5, "step": 1000, "train_speed(iter/s)": 0.270211 }, { "epoch": 0.02333075722888931, "eval_acc": 0.6944079715955378, "eval_loss": 1.03887140750885, "eval_runtime": 1269.799, "eval_samples_per_second": 28.344, "eval_steps_per_second": 14.172, "step": 1000 }, { "acc": 0.73672218, "epoch": 0.023564064801178203, "grad_norm": 11.6875, "learning_rate": 3.9269051321928466e-06, "loss": 0.97621746, "memory(GiB)": 112.5, "step": 1010, "train_speed(iter/s)": 0.201516 }, { "acc": 0.72829971, "epoch": 0.023797372373467095, "grad_norm": 12.625, "learning_rate": 3.965785381026439e-06, "loss": 1.04721756, "memory(GiB)": 112.5, "step": 1020, "train_speed(iter/s)": 0.202516 }, { "acc": 0.74315853, "epoch": 0.02403067994575599, "grad_norm": 12.0625, "learning_rate": 4.004665629860031e-06, "loss": 0.96440945, "memory(GiB)": 112.5, "step": 1030, "train_speed(iter/s)": 0.203553 }, { "acc": 0.71699724, "epoch": 0.024263987518044882, "grad_norm": 10.4375, "learning_rate": 4.0435458786936235e-06, "loss": 1.05050545, "memory(GiB)": 112.5, "step": 1040, "train_speed(iter/s)": 0.20455 }, { "acc": 0.73913536, "epoch": 0.024497295090333774, "grad_norm": 56.75, "learning_rate": 4.082426127527217e-06, "loss": 0.98256474, "memory(GiB)": 112.5, "step": 1050, "train_speed(iter/s)": 0.205477 }, { "acc": 0.72385454, "epoch": 0.02473060266262267, "grad_norm": 10.1875, "learning_rate": 4.121306376360809e-06, "loss": 1.03298187, "memory(GiB)": 112.5, "step": 1060, "train_speed(iter/s)": 0.206437 }, { "acc": 0.73873882, "epoch": 0.02496391023491156, "grad_norm": 13.625, "learning_rate": 4.160186625194401e-06, "loss": 1.0152791, "memory(GiB)": 112.5, "step": 1070, "train_speed(iter/s)": 0.207325 }, { "acc": 0.73708954, "epoch": 0.025197217807200453, "grad_norm": 12.3125, "learning_rate": 4.1990668740279945e-06, "loss": 0.98759651, "memory(GiB)": 112.5, "step": 1080, "train_speed(iter/s)": 0.208333 }, { "acc": 0.71818414, "epoch": 0.02543052537948935, "grad_norm": 65.0, "learning_rate": 4.237947122861587e-06, "loss": 1.0364994, "memory(GiB)": 112.5, "step": 1090, "train_speed(iter/s)": 0.209286 }, { "acc": 0.7241076, "epoch": 0.02566383295177824, "grad_norm": 16.5, "learning_rate": 4.276827371695179e-06, "loss": 1.03190594, "memory(GiB)": 112.5, "step": 1100, "train_speed(iter/s)": 0.210209 }, { "acc": 0.72726188, "epoch": 0.025897140524067136, "grad_norm": 18.25, "learning_rate": 4.3157076205287715e-06, "loss": 1.02791977, "memory(GiB)": 112.5, "step": 1110, "train_speed(iter/s)": 0.211143 }, { "acc": 0.73693681, "epoch": 0.026130448096356028, "grad_norm": 8.125, "learning_rate": 4.354587869362365e-06, "loss": 1.01167622, "memory(GiB)": 112.5, "step": 1120, "train_speed(iter/s)": 0.212056 }, { "acc": 0.73370867, "epoch": 0.02636375566864492, "grad_norm": 8.3125, "learning_rate": 4.393468118195957e-06, "loss": 1.00605249, "memory(GiB)": 112.5, "step": 1130, "train_speed(iter/s)": 0.212952 }, { "acc": 0.73210607, "epoch": 0.026597063240933815, "grad_norm": 15.8125, "learning_rate": 4.432348367029549e-06, "loss": 0.99587374, "memory(GiB)": 112.5, "step": 1140, "train_speed(iter/s)": 0.213882 }, { "acc": 0.73727956, "epoch": 0.026830370813222707, "grad_norm": 15.5, "learning_rate": 4.471228615863142e-06, "loss": 0.9840374, "memory(GiB)": 112.5, "step": 1150, "train_speed(iter/s)": 0.214805 }, { "acc": 0.73756843, "epoch": 0.0270636783855116, "grad_norm": 13.6875, "learning_rate": 4.510108864696735e-06, "loss": 1.0077363, "memory(GiB)": 112.5, "step": 1160, "train_speed(iter/s)": 0.215706 }, { "acc": 0.71364336, "epoch": 0.027296985957800494, "grad_norm": 9.375, "learning_rate": 4.548989113530327e-06, "loss": 1.07981796, "memory(GiB)": 112.5, "step": 1170, "train_speed(iter/s)": 0.216546 }, { "acc": 0.7402174, "epoch": 0.027530293530089386, "grad_norm": 6.34375, "learning_rate": 4.587869362363919e-06, "loss": 0.96098614, "memory(GiB)": 112.5, "step": 1180, "train_speed(iter/s)": 0.217392 }, { "acc": 0.72905722, "epoch": 0.02776360110237828, "grad_norm": 12.6875, "learning_rate": 4.626749611197512e-06, "loss": 1.03549309, "memory(GiB)": 112.5, "step": 1190, "train_speed(iter/s)": 0.218326 }, { "acc": 0.71565552, "epoch": 0.027996908674667174, "grad_norm": 6.21875, "learning_rate": 4.665629860031105e-06, "loss": 1.08136559, "memory(GiB)": 112.5, "step": 1200, "train_speed(iter/s)": 0.2192 }, { "acc": 0.75236349, "epoch": 0.028230216246956066, "grad_norm": 8.9375, "learning_rate": 4.704510108864697e-06, "loss": 0.95540047, "memory(GiB)": 112.5, "step": 1210, "train_speed(iter/s)": 0.220119 }, { "acc": 0.72307405, "epoch": 0.028463523819244958, "grad_norm": 10.875, "learning_rate": 4.7433903576982896e-06, "loss": 1.01828804, "memory(GiB)": 112.5, "step": 1220, "train_speed(iter/s)": 0.221032 }, { "acc": 0.71835833, "epoch": 0.028696831391533853, "grad_norm": 11.75, "learning_rate": 4.782270606531883e-06, "loss": 1.05052891, "memory(GiB)": 112.5, "step": 1230, "train_speed(iter/s)": 0.221873 }, { "acc": 0.72516937, "epoch": 0.028930138963822745, "grad_norm": 8.9375, "learning_rate": 4.821150855365474e-06, "loss": 1.02222395, "memory(GiB)": 112.5, "step": 1240, "train_speed(iter/s)": 0.222722 }, { "acc": 0.72957277, "epoch": 0.029163446536111637, "grad_norm": 10.1875, "learning_rate": 4.860031104199067e-06, "loss": 1.01317539, "memory(GiB)": 112.5, "step": 1250, "train_speed(iter/s)": 0.223525 }, { "acc": 0.74022655, "epoch": 0.029396754108400532, "grad_norm": 8.875, "learning_rate": 4.89891135303266e-06, "loss": 0.95849752, "memory(GiB)": 112.5, "step": 1260, "train_speed(iter/s)": 0.224384 }, { "acc": 0.7379168, "epoch": 0.029630061680689424, "grad_norm": 9.1875, "learning_rate": 4.937791601866253e-06, "loss": 0.96695614, "memory(GiB)": 112.5, "step": 1270, "train_speed(iter/s)": 0.225256 }, { "acc": 0.7323555, "epoch": 0.029863369252978316, "grad_norm": 8.125, "learning_rate": 4.976671850699845e-06, "loss": 1.0258522, "memory(GiB)": 112.5, "step": 1280, "train_speed(iter/s)": 0.226036 }, { "acc": 0.74023056, "epoch": 0.03009667682526721, "grad_norm": 11.1875, "learning_rate": 5.0155520995334375e-06, "loss": 0.97798119, "memory(GiB)": 112.5, "step": 1290, "train_speed(iter/s)": 0.226849 }, { "acc": 0.74629755, "epoch": 0.030329984397556103, "grad_norm": 10.0625, "learning_rate": 5.05443234836703e-06, "loss": 0.95969715, "memory(GiB)": 112.5, "step": 1300, "train_speed(iter/s)": 0.227679 }, { "acc": 0.74427795, "epoch": 0.030563291969844995, "grad_norm": 11.75, "learning_rate": 5.093312597200622e-06, "loss": 0.94915905, "memory(GiB)": 112.5, "step": 1310, "train_speed(iter/s)": 0.228474 }, { "acc": 0.73095655, "epoch": 0.03079659954213389, "grad_norm": 12.375, "learning_rate": 5.132192846034215e-06, "loss": 1.00967436, "memory(GiB)": 112.5, "step": 1320, "train_speed(iter/s)": 0.229243 }, { "acc": 0.74197416, "epoch": 0.031029907114422783, "grad_norm": 10.0625, "learning_rate": 5.171073094867808e-06, "loss": 0.95410166, "memory(GiB)": 112.5, "step": 1330, "train_speed(iter/s)": 0.230016 }, { "acc": 0.72437105, "epoch": 0.03126321468671168, "grad_norm": 9.6875, "learning_rate": 5.209953343701401e-06, "loss": 1.02081079, "memory(GiB)": 112.5, "step": 1340, "train_speed(iter/s)": 0.230814 }, { "acc": 0.71890373, "epoch": 0.03149652225900057, "grad_norm": 7.25, "learning_rate": 5.248833592534993e-06, "loss": 1.05639267, "memory(GiB)": 112.5, "step": 1350, "train_speed(iter/s)": 0.231593 }, { "acc": 0.73581686, "epoch": 0.03172982983128946, "grad_norm": 26.375, "learning_rate": 5.287713841368585e-06, "loss": 0.99747047, "memory(GiB)": 112.5, "step": 1360, "train_speed(iter/s)": 0.232336 }, { "acc": 0.73829999, "epoch": 0.031963137403578354, "grad_norm": 13.9375, "learning_rate": 5.326594090202177e-06, "loss": 0.97000275, "memory(GiB)": 112.5, "step": 1370, "train_speed(iter/s)": 0.233092 }, { "acc": 0.73752613, "epoch": 0.032196444975867246, "grad_norm": 9.25, "learning_rate": 5.36547433903577e-06, "loss": 0.97096643, "memory(GiB)": 112.5, "step": 1380, "train_speed(iter/s)": 0.23385 }, { "acc": 0.73803453, "epoch": 0.03242975254815614, "grad_norm": 8.0, "learning_rate": 5.404354587869362e-06, "loss": 0.97089777, "memory(GiB)": 112.5, "step": 1390, "train_speed(iter/s)": 0.234563 }, { "acc": 0.73658538, "epoch": 0.03266306012044504, "grad_norm": 12.5625, "learning_rate": 5.443234836702956e-06, "loss": 0.99308605, "memory(GiB)": 112.5, "step": 1400, "train_speed(iter/s)": 0.235338 }, { "acc": 0.71785216, "epoch": 0.03289636769273393, "grad_norm": 8.3125, "learning_rate": 5.482115085536548e-06, "loss": 1.07402363, "memory(GiB)": 112.5, "step": 1410, "train_speed(iter/s)": 0.236091 }, { "acc": 0.74998498, "epoch": 0.03312967526502282, "grad_norm": 8.25, "learning_rate": 5.52099533437014e-06, "loss": 0.93102665, "memory(GiB)": 112.5, "step": 1420, "train_speed(iter/s)": 0.236812 }, { "acc": 0.7163455, "epoch": 0.03336298283731171, "grad_norm": 13.5, "learning_rate": 5.559875583203733e-06, "loss": 1.08192539, "memory(GiB)": 112.5, "step": 1430, "train_speed(iter/s)": 0.237497 }, { "acc": 0.74807477, "epoch": 0.033596290409600604, "grad_norm": 17.125, "learning_rate": 5.598755832037326e-06, "loss": 0.94381409, "memory(GiB)": 112.5, "step": 1440, "train_speed(iter/s)": 0.238196 }, { "acc": 0.72739019, "epoch": 0.033829597981889496, "grad_norm": 9.0, "learning_rate": 5.637636080870919e-06, "loss": 1.02746716, "memory(GiB)": 112.5, "step": 1450, "train_speed(iter/s)": 0.238917 }, { "acc": 0.72795725, "epoch": 0.034062905554178395, "grad_norm": 9.1875, "learning_rate": 5.67651632970451e-06, "loss": 1.00883026, "memory(GiB)": 112.5, "step": 1460, "train_speed(iter/s)": 0.239627 }, { "acc": 0.74547176, "epoch": 0.03429621312646729, "grad_norm": 20.125, "learning_rate": 5.715396578538103e-06, "loss": 0.94491329, "memory(GiB)": 112.5, "step": 1470, "train_speed(iter/s)": 0.240273 }, { "acc": 0.76093078, "epoch": 0.03452952069875618, "grad_norm": 9.4375, "learning_rate": 5.754276827371695e-06, "loss": 0.89414454, "memory(GiB)": 112.5, "step": 1480, "train_speed(iter/s)": 0.240967 }, { "acc": 0.71505122, "epoch": 0.03476282827104507, "grad_norm": 11.6875, "learning_rate": 5.793157076205288e-06, "loss": 1.07852478, "memory(GiB)": 112.5, "step": 1490, "train_speed(iter/s)": 0.241618 }, { "acc": 0.73277426, "epoch": 0.03499613584333396, "grad_norm": 11.875, "learning_rate": 5.8320373250388805e-06, "loss": 0.99803505, "memory(GiB)": 112.5, "step": 1500, "train_speed(iter/s)": 0.242283 }, { "epoch": 0.03499613584333396, "eval_acc": 0.7026027121066317, "eval_loss": 0.9811294078826904, "eval_runtime": 1269.74, "eval_samples_per_second": 28.345, "eval_steps_per_second": 14.173, "step": 1500 }, { "acc": 0.73681936, "epoch": 0.03522944341562286, "grad_norm": 7.21875, "learning_rate": 5.870917573872474e-06, "loss": 1.00402145, "memory(GiB)": 112.5, "step": 1510, "train_speed(iter/s)": 0.201315 }, { "acc": 0.75371494, "epoch": 0.035462750987911754, "grad_norm": 13.0625, "learning_rate": 5.909797822706066e-06, "loss": 0.91483002, "memory(GiB)": 112.5, "step": 1520, "train_speed(iter/s)": 0.201999 }, { "acc": 0.73148508, "epoch": 0.035696058560200646, "grad_norm": 7.3125, "learning_rate": 5.948678071539658e-06, "loss": 1.00952492, "memory(GiB)": 112.5, "step": 1530, "train_speed(iter/s)": 0.202708 }, { "acc": 0.74506388, "epoch": 0.03592936613248954, "grad_norm": 10.3125, "learning_rate": 5.9875583203732515e-06, "loss": 0.94095707, "memory(GiB)": 112.5, "step": 1540, "train_speed(iter/s)": 0.20341 }, { "acc": 0.7385951, "epoch": 0.03616267370477843, "grad_norm": 9.9375, "learning_rate": 6.026438569206844e-06, "loss": 0.96724091, "memory(GiB)": 114.08, "step": 1550, "train_speed(iter/s)": 0.204074 }, { "acc": 0.73922119, "epoch": 0.03639598127706732, "grad_norm": 17.375, "learning_rate": 6.065318818040435e-06, "loss": 0.96251907, "memory(GiB)": 114.08, "step": 1560, "train_speed(iter/s)": 0.204769 }, { "acc": 0.72654696, "epoch": 0.03662928884935622, "grad_norm": 10.5625, "learning_rate": 6.1041990668740285e-06, "loss": 1.03361921, "memory(GiB)": 114.08, "step": 1570, "train_speed(iter/s)": 0.20549 }, { "acc": 0.72640285, "epoch": 0.03686259642164511, "grad_norm": 17.25, "learning_rate": 6.143079315707621e-06, "loss": 1.04433689, "memory(GiB)": 114.08, "step": 1580, "train_speed(iter/s)": 0.20617 }, { "acc": 0.75693703, "epoch": 0.037095903993934004, "grad_norm": 7.75, "learning_rate": 6.181959564541213e-06, "loss": 0.89104633, "memory(GiB)": 114.08, "step": 1590, "train_speed(iter/s)": 0.206865 }, { "acc": 0.74660778, "epoch": 0.037329211566222896, "grad_norm": 13.375, "learning_rate": 6.220839813374806e-06, "loss": 0.92481155, "memory(GiB)": 114.08, "step": 1600, "train_speed(iter/s)": 0.207546 }, { "acc": 0.72870965, "epoch": 0.03756251913851179, "grad_norm": 7.21875, "learning_rate": 6.259720062208399e-06, "loss": 1.03226433, "memory(GiB)": 114.08, "step": 1610, "train_speed(iter/s)": 0.208219 }, { "acc": 0.72618661, "epoch": 0.03779582671080068, "grad_norm": 9.0, "learning_rate": 6.298600311041991e-06, "loss": 1.02390461, "memory(GiB)": 114.08, "step": 1620, "train_speed(iter/s)": 0.208905 }, { "acc": 0.72518511, "epoch": 0.03802913428308958, "grad_norm": 11.0625, "learning_rate": 6.337480559875584e-06, "loss": 1.02645645, "memory(GiB)": 114.08, "step": 1630, "train_speed(iter/s)": 0.209512 }, { "acc": 0.74411564, "epoch": 0.03826244185537847, "grad_norm": 7.28125, "learning_rate": 6.376360808709176e-06, "loss": 0.95807343, "memory(GiB)": 114.08, "step": 1640, "train_speed(iter/s)": 0.21016 }, { "acc": 0.73625383, "epoch": 0.03849574942766736, "grad_norm": 8.625, "learning_rate": 6.4152410575427696e-06, "loss": 0.96755209, "memory(GiB)": 114.08, "step": 1650, "train_speed(iter/s)": 0.210758 }, { "acc": 0.72957053, "epoch": 0.038729056999956255, "grad_norm": 9.625, "learning_rate": 6.454121306376361e-06, "loss": 0.99150848, "memory(GiB)": 114.08, "step": 1660, "train_speed(iter/s)": 0.211372 }, { "acc": 0.74594126, "epoch": 0.038962364572245146, "grad_norm": 9.1875, "learning_rate": 6.493001555209953e-06, "loss": 0.92405319, "memory(GiB)": 114.08, "step": 1670, "train_speed(iter/s)": 0.212017 }, { "acc": 0.73622656, "epoch": 0.03919567214453404, "grad_norm": 10.3125, "learning_rate": 6.531881804043546e-06, "loss": 0.98413754, "memory(GiB)": 114.08, "step": 1680, "train_speed(iter/s)": 0.212643 }, { "acc": 0.72955575, "epoch": 0.03942897971682294, "grad_norm": 11.125, "learning_rate": 6.570762052877139e-06, "loss": 1.01020985, "memory(GiB)": 114.08, "step": 1690, "train_speed(iter/s)": 0.21329 }, { "acc": 0.74046488, "epoch": 0.03966228728911183, "grad_norm": 10.1875, "learning_rate": 6.609642301710731e-06, "loss": 0.96516933, "memory(GiB)": 114.08, "step": 1700, "train_speed(iter/s)": 0.21388 }, { "acc": 0.74028692, "epoch": 0.03989559486140072, "grad_norm": 7.15625, "learning_rate": 6.648522550544324e-06, "loss": 0.97764168, "memory(GiB)": 114.08, "step": 1710, "train_speed(iter/s)": 0.214504 }, { "acc": 0.74346085, "epoch": 0.04012890243368961, "grad_norm": 7.1875, "learning_rate": 6.687402799377917e-06, "loss": 0.95602665, "memory(GiB)": 114.08, "step": 1720, "train_speed(iter/s)": 0.215093 }, { "acc": 0.74011574, "epoch": 0.040362210005978505, "grad_norm": 18.0, "learning_rate": 6.726283048211509e-06, "loss": 0.96684675, "memory(GiB)": 114.08, "step": 1730, "train_speed(iter/s)": 0.215721 }, { "acc": 0.73707952, "epoch": 0.0405955175782674, "grad_norm": 8.0, "learning_rate": 6.765163297045102e-06, "loss": 0.97277508, "memory(GiB)": 114.08, "step": 1740, "train_speed(iter/s)": 0.216341 }, { "acc": 0.72721863, "epoch": 0.040828825150556296, "grad_norm": 7.6875, "learning_rate": 6.8040435458786945e-06, "loss": 0.99633102, "memory(GiB)": 114.08, "step": 1750, "train_speed(iter/s)": 0.216947 }, { "acc": 0.76437578, "epoch": 0.04106213272284519, "grad_norm": 6.78125, "learning_rate": 6.842923794712286e-06, "loss": 0.86832066, "memory(GiB)": 114.08, "step": 1760, "train_speed(iter/s)": 0.217536 }, { "acc": 0.73662996, "epoch": 0.04129544029513408, "grad_norm": 7.1875, "learning_rate": 6.881804043545879e-06, "loss": 1.00305567, "memory(GiB)": 114.08, "step": 1770, "train_speed(iter/s)": 0.218134 }, { "acc": 0.73254414, "epoch": 0.04152874786742297, "grad_norm": 7.71875, "learning_rate": 6.9206842923794715e-06, "loss": 1.02277117, "memory(GiB)": 114.08, "step": 1780, "train_speed(iter/s)": 0.218699 }, { "acc": 0.74180479, "epoch": 0.04176205543971186, "grad_norm": 9.5, "learning_rate": 6.959564541213064e-06, "loss": 0.97406406, "memory(GiB)": 114.08, "step": 1790, "train_speed(iter/s)": 0.219286 }, { "acc": 0.75128908, "epoch": 0.041995363012000755, "grad_norm": 7.4375, "learning_rate": 6.998444790046657e-06, "loss": 0.92827549, "memory(GiB)": 114.08, "step": 1800, "train_speed(iter/s)": 0.219875 }, { "acc": 0.75414939, "epoch": 0.042228670584289654, "grad_norm": 8.6875, "learning_rate": 7.037325038880249e-06, "loss": 0.88612537, "memory(GiB)": 114.08, "step": 1810, "train_speed(iter/s)": 0.220455 }, { "acc": 0.74286633, "epoch": 0.042461978156578546, "grad_norm": 9.4375, "learning_rate": 7.0762052877138424e-06, "loss": 0.95695734, "memory(GiB)": 114.08, "step": 1820, "train_speed(iter/s)": 0.221039 }, { "acc": 0.72784061, "epoch": 0.04269528572886744, "grad_norm": 9.6875, "learning_rate": 7.115085536547435e-06, "loss": 1.02230368, "memory(GiB)": 114.08, "step": 1830, "train_speed(iter/s)": 0.221625 }, { "acc": 0.72988882, "epoch": 0.04292859330115633, "grad_norm": 14.8125, "learning_rate": 7.153965785381027e-06, "loss": 0.99553108, "memory(GiB)": 114.08, "step": 1840, "train_speed(iter/s)": 0.222188 }, { "acc": 0.74705415, "epoch": 0.04316190087344522, "grad_norm": 7.1875, "learning_rate": 7.19284603421462e-06, "loss": 0.95047102, "memory(GiB)": 114.08, "step": 1850, "train_speed(iter/s)": 0.222809 }, { "acc": 0.71940756, "epoch": 0.043395208445734114, "grad_norm": 6.53125, "learning_rate": 7.231726283048212e-06, "loss": 1.04833097, "memory(GiB)": 114.08, "step": 1860, "train_speed(iter/s)": 0.223337 }, { "acc": 0.73792143, "epoch": 0.04362851601802301, "grad_norm": 10.6875, "learning_rate": 7.270606531881804e-06, "loss": 0.97211323, "memory(GiB)": 114.08, "step": 1870, "train_speed(iter/s)": 0.223914 }, { "acc": 0.723668, "epoch": 0.043861823590311905, "grad_norm": 5.8125, "learning_rate": 7.309486780715397e-06, "loss": 1.0364295, "memory(GiB)": 114.08, "step": 1880, "train_speed(iter/s)": 0.224504 }, { "acc": 0.72725635, "epoch": 0.0440951311626008, "grad_norm": 8.0, "learning_rate": 7.3483670295489895e-06, "loss": 1.04018497, "memory(GiB)": 114.08, "step": 1890, "train_speed(iter/s)": 0.225068 }, { "acc": 0.75151863, "epoch": 0.04432843873488969, "grad_norm": 6.3125, "learning_rate": 7.387247278382582e-06, "loss": 0.9029954, "memory(GiB)": 114.08, "step": 1900, "train_speed(iter/s)": 0.225623 }, { "acc": 0.77624922, "epoch": 0.04456174630717858, "grad_norm": 10.8125, "learning_rate": 7.426127527216175e-06, "loss": 0.82208271, "memory(GiB)": 114.08, "step": 1910, "train_speed(iter/s)": 0.226166 }, { "acc": 0.7336926, "epoch": 0.04479505387946747, "grad_norm": 7.1875, "learning_rate": 7.465007776049767e-06, "loss": 0.98131351, "memory(GiB)": 114.08, "step": 1920, "train_speed(iter/s)": 0.226693 }, { "acc": 0.73093643, "epoch": 0.04502836145175637, "grad_norm": 8.0, "learning_rate": 7.5038880248833605e-06, "loss": 0.9974226, "memory(GiB)": 114.08, "step": 1930, "train_speed(iter/s)": 0.227229 }, { "acc": 0.73827853, "epoch": 0.04526166902404526, "grad_norm": 9.0, "learning_rate": 7.542768273716953e-06, "loss": 0.95264168, "memory(GiB)": 114.08, "step": 1940, "train_speed(iter/s)": 0.227762 }, { "acc": 0.74395685, "epoch": 0.045494976596334155, "grad_norm": 11.125, "learning_rate": 7.581648522550545e-06, "loss": 0.95124683, "memory(GiB)": 114.08, "step": 1950, "train_speed(iter/s)": 0.228247 }, { "acc": 0.74083662, "epoch": 0.04572828416862305, "grad_norm": 7.59375, "learning_rate": 7.620528771384137e-06, "loss": 0.97542019, "memory(GiB)": 114.08, "step": 1960, "train_speed(iter/s)": 0.228772 }, { "acc": 0.7502737, "epoch": 0.04596159174091194, "grad_norm": 5.96875, "learning_rate": 7.659409020217729e-06, "loss": 0.92420015, "memory(GiB)": 114.08, "step": 1970, "train_speed(iter/s)": 0.229254 }, { "acc": 0.75313225, "epoch": 0.04619489931320083, "grad_norm": 20.75, "learning_rate": 7.698289269051322e-06, "loss": 0.90695429, "memory(GiB)": 114.08, "step": 1980, "train_speed(iter/s)": 0.229765 }, { "acc": 0.74232001, "epoch": 0.04642820688548973, "grad_norm": 8.375, "learning_rate": 7.737169517884915e-06, "loss": 0.97582893, "memory(GiB)": 114.08, "step": 1990, "train_speed(iter/s)": 0.230283 }, { "acc": 0.757305, "epoch": 0.04666151445777862, "grad_norm": 6.40625, "learning_rate": 7.776049766718507e-06, "loss": 0.91142721, "memory(GiB)": 114.08, "step": 2000, "train_speed(iter/s)": 0.230798 }, { "epoch": 0.04666151445777862, "eval_acc": 0.709365603036719, "eval_loss": 0.9482490420341492, "eval_runtime": 1270.2818, "eval_samples_per_second": 28.333, "eval_steps_per_second": 14.167, "step": 2000 }, { "acc": 0.73707428, "epoch": 0.046894822030067514, "grad_norm": 10.1875, "learning_rate": 7.8149300155521e-06, "loss": 0.97233877, "memory(GiB)": 114.08, "step": 2010, "train_speed(iter/s)": 0.201446 }, { "acc": 0.74674578, "epoch": 0.047128129602356406, "grad_norm": 11.125, "learning_rate": 7.853810264385693e-06, "loss": 0.93988552, "memory(GiB)": 114.08, "step": 2020, "train_speed(iter/s)": 0.201937 }, { "acc": 0.74976053, "epoch": 0.0473614371746453, "grad_norm": 8.0, "learning_rate": 7.892690513219286e-06, "loss": 0.91554279, "memory(GiB)": 114.08, "step": 2030, "train_speed(iter/s)": 0.202447 }, { "acc": 0.74252758, "epoch": 0.04759474474693419, "grad_norm": 11.625, "learning_rate": 7.931570762052878e-06, "loss": 0.93326302, "memory(GiB)": 114.08, "step": 2040, "train_speed(iter/s)": 0.202943 }, { "acc": 0.72653966, "epoch": 0.04782805231922309, "grad_norm": 10.125, "learning_rate": 7.970451010886471e-06, "loss": 1.01006451, "memory(GiB)": 114.08, "step": 2050, "train_speed(iter/s)": 0.203443 }, { "acc": 0.75554819, "epoch": 0.04806135989151198, "grad_norm": 5.875, "learning_rate": 8.009331259720062e-06, "loss": 0.90173092, "memory(GiB)": 114.08, "step": 2060, "train_speed(iter/s)": 0.203953 }, { "acc": 0.7355751, "epoch": 0.04829466746380087, "grad_norm": 6.78125, "learning_rate": 8.048211508553656e-06, "loss": 0.98420334, "memory(GiB)": 114.08, "step": 2070, "train_speed(iter/s)": 0.204457 }, { "acc": 0.74058089, "epoch": 0.048527975036089764, "grad_norm": 8.8125, "learning_rate": 8.087091757387247e-06, "loss": 0.96377907, "memory(GiB)": 114.08, "step": 2080, "train_speed(iter/s)": 0.204931 }, { "acc": 0.75542016, "epoch": 0.048761282608378656, "grad_norm": 7.125, "learning_rate": 8.12597200622084e-06, "loss": 0.90528049, "memory(GiB)": 114.08, "step": 2090, "train_speed(iter/s)": 0.205464 }, { "acc": 0.74561777, "epoch": 0.04899459018066755, "grad_norm": 9.6875, "learning_rate": 8.164852255054433e-06, "loss": 0.94761057, "memory(GiB)": 114.08, "step": 2100, "train_speed(iter/s)": 0.205929 }, { "acc": 0.75310192, "epoch": 0.04922789775295645, "grad_norm": 7.0625, "learning_rate": 8.203732503888025e-06, "loss": 0.90652037, "memory(GiB)": 114.08, "step": 2110, "train_speed(iter/s)": 0.206417 }, { "acc": 0.74553957, "epoch": 0.04946120532524534, "grad_norm": 8.8125, "learning_rate": 8.242612752721618e-06, "loss": 0.94698086, "memory(GiB)": 114.08, "step": 2120, "train_speed(iter/s)": 0.206878 }, { "acc": 0.74139624, "epoch": 0.04969451289753423, "grad_norm": 5.78125, "learning_rate": 8.281493001555211e-06, "loss": 0.9554759, "memory(GiB)": 114.08, "step": 2130, "train_speed(iter/s)": 0.207363 }, { "acc": 0.75223551, "epoch": 0.04992782046982312, "grad_norm": 7.1875, "learning_rate": 8.320373250388803e-06, "loss": 0.91746941, "memory(GiB)": 114.08, "step": 2140, "train_speed(iter/s)": 0.20786 }, { "acc": 0.75747862, "epoch": 0.050161128042112015, "grad_norm": 8.1875, "learning_rate": 8.359253499222396e-06, "loss": 0.90032187, "memory(GiB)": 114.08, "step": 2150, "train_speed(iter/s)": 0.208349 }, { "acc": 0.74738655, "epoch": 0.050394435614400906, "grad_norm": 8.0, "learning_rate": 8.398133748055989e-06, "loss": 0.95068922, "memory(GiB)": 114.08, "step": 2160, "train_speed(iter/s)": 0.208799 }, { "acc": 0.74726391, "epoch": 0.050627743186689805, "grad_norm": 10.875, "learning_rate": 8.43701399688958e-06, "loss": 0.92888641, "memory(GiB)": 114.08, "step": 2170, "train_speed(iter/s)": 0.209331 }, { "acc": 0.7601017, "epoch": 0.0508610507589787, "grad_norm": 6.59375, "learning_rate": 8.475894245723174e-06, "loss": 0.88537483, "memory(GiB)": 114.08, "step": 2180, "train_speed(iter/s)": 0.209829 }, { "acc": 0.74622045, "epoch": 0.05109435833126759, "grad_norm": 5.90625, "learning_rate": 8.514774494556765e-06, "loss": 0.9415596, "memory(GiB)": 114.08, "step": 2190, "train_speed(iter/s)": 0.210313 }, { "acc": 0.74102707, "epoch": 0.05132766590355648, "grad_norm": 13.25, "learning_rate": 8.553654743390358e-06, "loss": 0.970403, "memory(GiB)": 114.08, "step": 2200, "train_speed(iter/s)": 0.210819 }, { "acc": 0.74637518, "epoch": 0.05156097347584537, "grad_norm": 6.59375, "learning_rate": 8.592534992223951e-06, "loss": 0.93611584, "memory(GiB)": 114.08, "step": 2210, "train_speed(iter/s)": 0.211296 }, { "acc": 0.72973704, "epoch": 0.05179428104813427, "grad_norm": 13.0625, "learning_rate": 8.631415241057543e-06, "loss": 0.99803753, "memory(GiB)": 114.08, "step": 2220, "train_speed(iter/s)": 0.211789 }, { "acc": 0.70917277, "epoch": 0.052027588620423164, "grad_norm": 8.25, "learning_rate": 8.670295489891136e-06, "loss": 1.06907005, "memory(GiB)": 114.08, "step": 2230, "train_speed(iter/s)": 0.212285 }, { "acc": 0.74743881, "epoch": 0.052260896192712056, "grad_norm": 21.0, "learning_rate": 8.70917573872473e-06, "loss": 0.9445673, "memory(GiB)": 114.08, "step": 2240, "train_speed(iter/s)": 0.212758 }, { "acc": 0.74076357, "epoch": 0.05249420376500095, "grad_norm": 5.21875, "learning_rate": 8.74805598755832e-06, "loss": 0.97673054, "memory(GiB)": 114.08, "step": 2250, "train_speed(iter/s)": 0.213229 }, { "acc": 0.74062457, "epoch": 0.05272751133728984, "grad_norm": 5.96875, "learning_rate": 8.786936236391914e-06, "loss": 0.96884727, "memory(GiB)": 114.08, "step": 2260, "train_speed(iter/s)": 0.213684 }, { "acc": 0.72948203, "epoch": 0.05296081890957873, "grad_norm": 6.0625, "learning_rate": 8.825816485225505e-06, "loss": 1.01084576, "memory(GiB)": 114.08, "step": 2270, "train_speed(iter/s)": 0.214158 }, { "acc": 0.75192738, "epoch": 0.05319412648186763, "grad_norm": 6.15625, "learning_rate": 8.864696734059099e-06, "loss": 0.89396305, "memory(GiB)": 114.08, "step": 2280, "train_speed(iter/s)": 0.214642 }, { "acc": 0.76000366, "epoch": 0.05342743405415652, "grad_norm": 6.90625, "learning_rate": 8.903576982892692e-06, "loss": 0.90859699, "memory(GiB)": 114.08, "step": 2290, "train_speed(iter/s)": 0.215099 }, { "acc": 0.7458251, "epoch": 0.053660741626445414, "grad_norm": 7.3125, "learning_rate": 8.942457231726283e-06, "loss": 0.97098904, "memory(GiB)": 114.08, "step": 2300, "train_speed(iter/s)": 0.21551 }, { "acc": 0.73837004, "epoch": 0.053894049198734306, "grad_norm": 5.8125, "learning_rate": 8.981337480559876e-06, "loss": 0.95030222, "memory(GiB)": 114.08, "step": 2310, "train_speed(iter/s)": 0.215929 }, { "acc": 0.75400095, "epoch": 0.0541273567710232, "grad_norm": 8.1875, "learning_rate": 9.02021772939347e-06, "loss": 0.90010729, "memory(GiB)": 114.08, "step": 2320, "train_speed(iter/s)": 0.216371 }, { "acc": 0.75261707, "epoch": 0.05436066434331209, "grad_norm": 6.4375, "learning_rate": 9.059097978227061e-06, "loss": 0.94032478, "memory(GiB)": 114.08, "step": 2330, "train_speed(iter/s)": 0.216778 }, { "acc": 0.7515697, "epoch": 0.05459397191560099, "grad_norm": 5.34375, "learning_rate": 9.097978227060654e-06, "loss": 0.94090843, "memory(GiB)": 114.08, "step": 2340, "train_speed(iter/s)": 0.217239 }, { "acc": 0.7397305, "epoch": 0.05482727948788988, "grad_norm": 10.0625, "learning_rate": 9.136858475894247e-06, "loss": 0.97343073, "memory(GiB)": 114.08, "step": 2350, "train_speed(iter/s)": 0.217667 }, { "acc": 0.74915609, "epoch": 0.05506058706017877, "grad_norm": 6.65625, "learning_rate": 9.175738724727839e-06, "loss": 0.921453, "memory(GiB)": 114.08, "step": 2360, "train_speed(iter/s)": 0.218073 }, { "acc": 0.7468821, "epoch": 0.055293894632467665, "grad_norm": 9.75, "learning_rate": 9.21461897356143e-06, "loss": 0.95533428, "memory(GiB)": 114.08, "step": 2370, "train_speed(iter/s)": 0.218486 }, { "acc": 0.73546376, "epoch": 0.05552720220475656, "grad_norm": 6.96875, "learning_rate": 9.253499222395023e-06, "loss": 0.99596453, "memory(GiB)": 114.08, "step": 2380, "train_speed(iter/s)": 0.218898 }, { "acc": 0.73259478, "epoch": 0.05576050977704545, "grad_norm": 7.96875, "learning_rate": 9.292379471228617e-06, "loss": 0.99712763, "memory(GiB)": 114.08, "step": 2390, "train_speed(iter/s)": 0.219283 }, { "acc": 0.73952942, "epoch": 0.05599381734933435, "grad_norm": 5.71875, "learning_rate": 9.33125972006221e-06, "loss": 0.94039307, "memory(GiB)": 114.08, "step": 2400, "train_speed(iter/s)": 0.21973 }, { "acc": 0.74226012, "epoch": 0.05622712492162324, "grad_norm": 6.125, "learning_rate": 9.370139968895801e-06, "loss": 0.93415909, "memory(GiB)": 114.08, "step": 2410, "train_speed(iter/s)": 0.220139 }, { "acc": 0.75117826, "epoch": 0.05646043249391213, "grad_norm": 24.625, "learning_rate": 9.409020217729394e-06, "loss": 0.89564247, "memory(GiB)": 114.08, "step": 2420, "train_speed(iter/s)": 0.220555 }, { "acc": 0.74033728, "epoch": 0.05669374006620102, "grad_norm": 10.4375, "learning_rate": 9.447900466562988e-06, "loss": 0.96549892, "memory(GiB)": 114.08, "step": 2430, "train_speed(iter/s)": 0.220978 }, { "acc": 0.76307154, "epoch": 0.056927047638489915, "grad_norm": 17.375, "learning_rate": 9.486780715396579e-06, "loss": 0.86378517, "memory(GiB)": 114.08, "step": 2440, "train_speed(iter/s)": 0.221411 }, { "acc": 0.73024807, "epoch": 0.05716035521077881, "grad_norm": 12.6875, "learning_rate": 9.525660964230172e-06, "loss": 1.01558571, "memory(GiB)": 114.08, "step": 2450, "train_speed(iter/s)": 0.221845 }, { "acc": 0.75076675, "epoch": 0.057393662783067706, "grad_norm": 7.1875, "learning_rate": 9.564541213063765e-06, "loss": 0.91901255, "memory(GiB)": 114.08, "step": 2460, "train_speed(iter/s)": 0.222309 }, { "acc": 0.76636009, "epoch": 0.0576269703553566, "grad_norm": 4.5, "learning_rate": 9.603421461897357e-06, "loss": 0.85223923, "memory(GiB)": 114.08, "step": 2470, "train_speed(iter/s)": 0.222694 }, { "acc": 0.75716038, "epoch": 0.05786027792764549, "grad_norm": 6.5625, "learning_rate": 9.642301710730948e-06, "loss": 0.90162163, "memory(GiB)": 114.08, "step": 2480, "train_speed(iter/s)": 0.223102 }, { "acc": 0.74022632, "epoch": 0.05809358549993438, "grad_norm": 7.46875, "learning_rate": 9.681181959564542e-06, "loss": 0.96824207, "memory(GiB)": 114.08, "step": 2490, "train_speed(iter/s)": 0.223508 }, { "acc": 0.75365782, "epoch": 0.058326893072223274, "grad_norm": 6.75, "learning_rate": 9.720062208398135e-06, "loss": 0.93858395, "memory(GiB)": 114.08, "step": 2500, "train_speed(iter/s)": 0.223925 }, { "epoch": 0.058326893072223274, "eval_acc": 0.7143739537577124, "eval_loss": 0.9248969554901123, "eval_runtime": 1269.8474, "eval_samples_per_second": 28.343, "eval_steps_per_second": 14.172, "step": 2500 }, { "acc": 0.7417676, "epoch": 0.058560200644512166, "grad_norm": 6.03125, "learning_rate": 9.758942457231726e-06, "loss": 0.95499773, "memory(GiB)": 114.08, "step": 2510, "train_speed(iter/s)": 0.201204 }, { "acc": 0.74519234, "epoch": 0.058793508216801064, "grad_norm": 8.75, "learning_rate": 9.79782270606532e-06, "loss": 0.94459839, "memory(GiB)": 114.08, "step": 2520, "train_speed(iter/s)": 0.201609 }, { "acc": 0.75776386, "epoch": 0.059026815789089956, "grad_norm": 4.96875, "learning_rate": 9.836702954898913e-06, "loss": 0.88159065, "memory(GiB)": 114.08, "step": 2530, "train_speed(iter/s)": 0.201997 }, { "acc": 0.75585461, "epoch": 0.05926012336137885, "grad_norm": 8.625, "learning_rate": 9.875583203732506e-06, "loss": 0.89656048, "memory(GiB)": 114.08, "step": 2540, "train_speed(iter/s)": 0.202388 }, { "acc": 0.74211607, "epoch": 0.05949343093366774, "grad_norm": 15.3125, "learning_rate": 9.914463452566097e-06, "loss": 0.94759617, "memory(GiB)": 114.08, "step": 2550, "train_speed(iter/s)": 0.202765 }, { "acc": 0.74785175, "epoch": 0.05972673850595663, "grad_norm": 6.9375, "learning_rate": 9.95334370139969e-06, "loss": 0.92067432, "memory(GiB)": 114.08, "step": 2560, "train_speed(iter/s)": 0.203186 }, { "acc": 0.73263016, "epoch": 0.059960046078245524, "grad_norm": 7.5625, "learning_rate": 9.992223950233282e-06, "loss": 0.99488621, "memory(GiB)": 114.08, "step": 2570, "train_speed(iter/s)": 0.203615 }, { "acc": 0.74043674, "epoch": 0.06019335365053442, "grad_norm": 7.8125, "learning_rate": 9.999999771600465e-06, "loss": 0.96276875, "memory(GiB)": 114.08, "step": 2580, "train_speed(iter/s)": 0.204022 }, { "acc": 0.78092365, "epoch": 0.060426661222823315, "grad_norm": 6.6875, "learning_rate": 9.999998843727385e-06, "loss": 0.79039125, "memory(GiB)": 114.08, "step": 2590, "train_speed(iter/s)": 0.204452 }, { "acc": 0.76478825, "epoch": 0.06065996879511221, "grad_norm": 5.5, "learning_rate": 9.999997202105923e-06, "loss": 0.85824947, "memory(GiB)": 114.08, "step": 2600, "train_speed(iter/s)": 0.204846 }, { "acc": 0.77252517, "epoch": 0.0608932763674011, "grad_norm": 7.1875, "learning_rate": 9.999994846736312e-06, "loss": 0.83852539, "memory(GiB)": 114.08, "step": 2610, "train_speed(iter/s)": 0.20524 }, { "acc": 0.73587317, "epoch": 0.06112658393968999, "grad_norm": 16.5, "learning_rate": 9.99999177761889e-06, "loss": 0.97813015, "memory(GiB)": 114.08, "step": 2620, "train_speed(iter/s)": 0.205637 }, { "acc": 0.75071974, "epoch": 0.06135989151197888, "grad_norm": 4.71875, "learning_rate": 9.999987994754094e-06, "loss": 0.92447548, "memory(GiB)": 114.08, "step": 2630, "train_speed(iter/s)": 0.206024 }, { "acc": 0.74091039, "epoch": 0.06159319908426778, "grad_norm": 6.21875, "learning_rate": 9.999983498142464e-06, "loss": 0.95536308, "memory(GiB)": 114.08, "step": 2640, "train_speed(iter/s)": 0.206403 }, { "acc": 0.74781256, "epoch": 0.06182650665655667, "grad_norm": 14.75, "learning_rate": 9.999978287784642e-06, "loss": 0.9136961, "memory(GiB)": 114.08, "step": 2650, "train_speed(iter/s)": 0.206807 }, { "acc": 0.74666529, "epoch": 0.062059814228845565, "grad_norm": 5.8125, "learning_rate": 9.999972363681371e-06, "loss": 0.90506544, "memory(GiB)": 114.08, "step": 2660, "train_speed(iter/s)": 0.207192 }, { "acc": 0.76904163, "epoch": 0.06229312180113446, "grad_norm": 5.84375, "learning_rate": 9.9999657258335e-06, "loss": 0.85019245, "memory(GiB)": 114.08, "step": 2670, "train_speed(iter/s)": 0.207572 }, { "acc": 0.7517498, "epoch": 0.06252642937342336, "grad_norm": 8.125, "learning_rate": 9.999958374241974e-06, "loss": 0.91046963, "memory(GiB)": 114.08, "step": 2680, "train_speed(iter/s)": 0.207944 }, { "acc": 0.76082239, "epoch": 0.06275973694571224, "grad_norm": 6.4375, "learning_rate": 9.99995030890784e-06, "loss": 0.87537355, "memory(GiB)": 114.08, "step": 2690, "train_speed(iter/s)": 0.208348 }, { "acc": 0.75649023, "epoch": 0.06299304451800114, "grad_norm": 6.90625, "learning_rate": 9.999941529832254e-06, "loss": 0.9041748, "memory(GiB)": 114.08, "step": 2700, "train_speed(iter/s)": 0.208755 }, { "acc": 0.7486876, "epoch": 0.06322635209029003, "grad_norm": 8.4375, "learning_rate": 9.999932037016466e-06, "loss": 0.94257088, "memory(GiB)": 114.08, "step": 2710, "train_speed(iter/s)": 0.209144 }, { "acc": 0.76688719, "epoch": 0.06345965966257892, "grad_norm": 15.0, "learning_rate": 9.999921830461833e-06, "loss": 0.88587494, "memory(GiB)": 114.08, "step": 2720, "train_speed(iter/s)": 0.20952 }, { "acc": 0.76675873, "epoch": 0.06369296723486782, "grad_norm": 4.90625, "learning_rate": 9.99991091016981e-06, "loss": 0.84941301, "memory(GiB)": 114.08, "step": 2730, "train_speed(iter/s)": 0.209885 }, { "acc": 0.76298923, "epoch": 0.06392627480715671, "grad_norm": 7.125, "learning_rate": 9.99989927614196e-06, "loss": 0.87009039, "memory(GiB)": 114.08, "step": 2740, "train_speed(iter/s)": 0.210263 }, { "acc": 0.75599327, "epoch": 0.0641595823794456, "grad_norm": 6.71875, "learning_rate": 9.999886928379939e-06, "loss": 0.88294525, "memory(GiB)": 114.08, "step": 2750, "train_speed(iter/s)": 0.2106 }, { "acc": 0.76202002, "epoch": 0.06439288995173449, "grad_norm": 7.5625, "learning_rate": 9.99987386688551e-06, "loss": 0.87443285, "memory(GiB)": 114.08, "step": 2760, "train_speed(iter/s)": 0.210993 }, { "acc": 0.74971361, "epoch": 0.06462619752402339, "grad_norm": 5.375, "learning_rate": 9.99986009166054e-06, "loss": 0.90866261, "memory(GiB)": 114.08, "step": 2770, "train_speed(iter/s)": 0.211356 }, { "acc": 0.75157547, "epoch": 0.06485950509631228, "grad_norm": 6.40625, "learning_rate": 9.999845602706995e-06, "loss": 0.89404106, "memory(GiB)": 114.08, "step": 2780, "train_speed(iter/s)": 0.211714 }, { "acc": 0.7517457, "epoch": 0.06509281266860117, "grad_norm": 7.9375, "learning_rate": 9.999830400026941e-06, "loss": 0.93447304, "memory(GiB)": 114.08, "step": 2790, "train_speed(iter/s)": 0.212101 }, { "acc": 0.74939804, "epoch": 0.06532612024089007, "grad_norm": 5.375, "learning_rate": 9.999814483622552e-06, "loss": 0.90546379, "memory(GiB)": 114.08, "step": 2800, "train_speed(iter/s)": 0.212463 }, { "acc": 0.74818916, "epoch": 0.06555942781317896, "grad_norm": 8.625, "learning_rate": 9.999797853496097e-06, "loss": 0.91956005, "memory(GiB)": 114.08, "step": 2810, "train_speed(iter/s)": 0.212815 }, { "acc": 0.74374294, "epoch": 0.06579273538546786, "grad_norm": 6.40625, "learning_rate": 9.999780509649952e-06, "loss": 0.95786781, "memory(GiB)": 114.08, "step": 2820, "train_speed(iter/s)": 0.213177 }, { "acc": 0.74510412, "epoch": 0.06602604295775674, "grad_norm": 6.125, "learning_rate": 9.99976245208659e-06, "loss": 0.96016006, "memory(GiB)": 114.08, "step": 2830, "train_speed(iter/s)": 0.213555 }, { "acc": 0.75915327, "epoch": 0.06625935053004564, "grad_norm": 7.03125, "learning_rate": 9.99974368080859e-06, "loss": 0.8857523, "memory(GiB)": 114.08, "step": 2840, "train_speed(iter/s)": 0.213924 }, { "acc": 0.74505186, "epoch": 0.06649265810233454, "grad_norm": 7.125, "learning_rate": 9.999724195818634e-06, "loss": 0.949786, "memory(GiB)": 114.08, "step": 2850, "train_speed(iter/s)": 0.21428 }, { "acc": 0.76169367, "epoch": 0.06672596567462342, "grad_norm": 7.5, "learning_rate": 9.999703997119501e-06, "loss": 0.85612011, "memory(GiB)": 114.08, "step": 2860, "train_speed(iter/s)": 0.214653 }, { "acc": 0.75327644, "epoch": 0.06695927324691232, "grad_norm": 7.625, "learning_rate": 9.999683084714074e-06, "loss": 0.8901247, "memory(GiB)": 114.09, "step": 2870, "train_speed(iter/s)": 0.214995 }, { "acc": 0.73931332, "epoch": 0.06719258081920121, "grad_norm": 10.0625, "learning_rate": 9.999661458605339e-06, "loss": 0.95245857, "memory(GiB)": 114.09, "step": 2880, "train_speed(iter/s)": 0.215355 }, { "acc": 0.76272769, "epoch": 0.06742588839149011, "grad_norm": 4.28125, "learning_rate": 9.999639118796384e-06, "loss": 0.86453457, "memory(GiB)": 114.09, "step": 2890, "train_speed(iter/s)": 0.215716 }, { "acc": 0.77728963, "epoch": 0.06765919596377899, "grad_norm": 7.96875, "learning_rate": 9.999616065290396e-06, "loss": 0.8032423, "memory(GiB)": 114.09, "step": 2900, "train_speed(iter/s)": 0.216072 }, { "acc": 0.75634222, "epoch": 0.06789250353606789, "grad_norm": 6.15625, "learning_rate": 9.999592298090669e-06, "loss": 0.86408386, "memory(GiB)": 114.09, "step": 2910, "train_speed(iter/s)": 0.216406 }, { "acc": 0.77233763, "epoch": 0.06812581110835679, "grad_norm": 5.5625, "learning_rate": 9.999567817200592e-06, "loss": 0.83319054, "memory(GiB)": 114.09, "step": 2920, "train_speed(iter/s)": 0.21677 }, { "acc": 0.73433094, "epoch": 0.06835911868064568, "grad_norm": 4.84375, "learning_rate": 9.999542622623661e-06, "loss": 0.98757401, "memory(GiB)": 114.09, "step": 2930, "train_speed(iter/s)": 0.217116 }, { "acc": 0.76126733, "epoch": 0.06859242625293457, "grad_norm": 7.125, "learning_rate": 9.999516714363475e-06, "loss": 0.89113979, "memory(GiB)": 114.09, "step": 2940, "train_speed(iter/s)": 0.217466 }, { "acc": 0.74799905, "epoch": 0.06882573382522346, "grad_norm": 4.71875, "learning_rate": 9.99949009242373e-06, "loss": 0.92414608, "memory(GiB)": 114.09, "step": 2950, "train_speed(iter/s)": 0.217813 }, { "acc": 0.75590515, "epoch": 0.06905904139751236, "grad_norm": 5.625, "learning_rate": 9.999462756808227e-06, "loss": 0.88417921, "memory(GiB)": 114.09, "step": 2960, "train_speed(iter/s)": 0.218171 }, { "acc": 0.73305931, "epoch": 0.06929234896980126, "grad_norm": 10.0625, "learning_rate": 9.999434707520867e-06, "loss": 0.96791725, "memory(GiB)": 114.09, "step": 2970, "train_speed(iter/s)": 0.218519 }, { "acc": 0.74652576, "epoch": 0.06952565654209014, "grad_norm": 5.03125, "learning_rate": 9.999405944565654e-06, "loss": 0.9400857, "memory(GiB)": 114.09, "step": 2980, "train_speed(iter/s)": 0.218871 }, { "acc": 0.75939126, "epoch": 0.06975896411437904, "grad_norm": 5.46875, "learning_rate": 9.999376467946695e-06, "loss": 0.88669376, "memory(GiB)": 114.09, "step": 2990, "train_speed(iter/s)": 0.219224 }, { "acc": 0.74682531, "epoch": 0.06999227168666793, "grad_norm": 6.0625, "learning_rate": 9.999346277668198e-06, "loss": 0.89512215, "memory(GiB)": 114.09, "step": 3000, "train_speed(iter/s)": 0.21959 }, { "epoch": 0.06999227168666793, "eval_acc": 0.7189282955801313, "eval_loss": 0.9065292477607727, "eval_runtime": 1270.6255, "eval_samples_per_second": 28.325, "eval_steps_per_second": 14.163, "step": 3000 }, { "acc": 0.75453682, "epoch": 0.07022557925895682, "grad_norm": 5.0, "learning_rate": 9.999315373734472e-06, "loss": 0.87960033, "memory(GiB)": 117.28, "step": 3010, "train_speed(iter/s)": 0.200973 }, { "acc": 0.76882467, "epoch": 0.07045888683124572, "grad_norm": 10.75, "learning_rate": 9.999283756149932e-06, "loss": 0.83173504, "memory(GiB)": 117.28, "step": 3020, "train_speed(iter/s)": 0.201327 }, { "acc": 0.7758441, "epoch": 0.07069219440353461, "grad_norm": 5.21875, "learning_rate": 9.999251424919083e-06, "loss": 0.8254406, "memory(GiB)": 117.28, "step": 3030, "train_speed(iter/s)": 0.201672 }, { "acc": 0.7445199, "epoch": 0.07092550197582351, "grad_norm": 10.1875, "learning_rate": 9.999218380046548e-06, "loss": 0.93491154, "memory(GiB)": 117.28, "step": 3040, "train_speed(iter/s)": 0.202018 }, { "acc": 0.77481375, "epoch": 0.07115880954811239, "grad_norm": 6.90625, "learning_rate": 9.99918462153704e-06, "loss": 0.8122179, "memory(GiB)": 117.28, "step": 3050, "train_speed(iter/s)": 0.202354 }, { "acc": 0.7394393, "epoch": 0.07139211712040129, "grad_norm": 6.59375, "learning_rate": 9.999150149395383e-06, "loss": 0.97233334, "memory(GiB)": 117.28, "step": 3060, "train_speed(iter/s)": 0.202695 }, { "acc": 0.74153099, "epoch": 0.07162542469269018, "grad_norm": 5.53125, "learning_rate": 9.99911496362649e-06, "loss": 0.9292963, "memory(GiB)": 117.28, "step": 3070, "train_speed(iter/s)": 0.203018 }, { "acc": 0.74457722, "epoch": 0.07185873226497907, "grad_norm": 6.6875, "learning_rate": 9.99907906423539e-06, "loss": 0.95251703, "memory(GiB)": 117.28, "step": 3080, "train_speed(iter/s)": 0.203382 }, { "acc": 0.73962574, "epoch": 0.07209203983726797, "grad_norm": 6.875, "learning_rate": 9.999042451227208e-06, "loss": 0.9660202, "memory(GiB)": 117.28, "step": 3090, "train_speed(iter/s)": 0.203734 }, { "acc": 0.75118494, "epoch": 0.07232534740955686, "grad_norm": 7.65625, "learning_rate": 9.999005124607167e-06, "loss": 0.89292297, "memory(GiB)": 117.28, "step": 3100, "train_speed(iter/s)": 0.204063 }, { "acc": 0.77734885, "epoch": 0.07255865498184576, "grad_norm": 22.0, "learning_rate": 9.998967084380596e-06, "loss": 0.81176682, "memory(GiB)": 117.28, "step": 3110, "train_speed(iter/s)": 0.204418 }, { "acc": 0.75207405, "epoch": 0.07279196255413464, "grad_norm": 7.125, "learning_rate": 9.998928330552925e-06, "loss": 0.92476635, "memory(GiB)": 117.28, "step": 3120, "train_speed(iter/s)": 0.204741 }, { "acc": 0.75438986, "epoch": 0.07302527012642354, "grad_norm": 9.1875, "learning_rate": 9.998888863129688e-06, "loss": 0.9007534, "memory(GiB)": 117.28, "step": 3130, "train_speed(iter/s)": 0.205082 }, { "acc": 0.75198278, "epoch": 0.07325857769871244, "grad_norm": 9.1875, "learning_rate": 9.998848682116518e-06, "loss": 0.9158268, "memory(GiB)": 117.28, "step": 3140, "train_speed(iter/s)": 0.205427 }, { "acc": 0.75476503, "epoch": 0.07349188527100133, "grad_norm": 7.875, "learning_rate": 9.998807787519151e-06, "loss": 0.87708139, "memory(GiB)": 117.28, "step": 3150, "train_speed(iter/s)": 0.205765 }, { "acc": 0.75708885, "epoch": 0.07372519284329022, "grad_norm": 4.6875, "learning_rate": 9.998766179343425e-06, "loss": 0.87132273, "memory(GiB)": 117.28, "step": 3160, "train_speed(iter/s)": 0.206091 }, { "acc": 0.76350727, "epoch": 0.07395850041557911, "grad_norm": 6.375, "learning_rate": 9.998723857595278e-06, "loss": 0.8524066, "memory(GiB)": 117.28, "step": 3170, "train_speed(iter/s)": 0.206437 }, { "acc": 0.75130353, "epoch": 0.07419180798786801, "grad_norm": 6.46875, "learning_rate": 9.998680822280752e-06, "loss": 0.94219074, "memory(GiB)": 117.28, "step": 3180, "train_speed(iter/s)": 0.20678 }, { "acc": 0.74551325, "epoch": 0.0744251155601569, "grad_norm": 6.25, "learning_rate": 9.998637073405992e-06, "loss": 0.94350052, "memory(GiB)": 117.28, "step": 3190, "train_speed(iter/s)": 0.207101 }, { "acc": 0.74958072, "epoch": 0.07465842313244579, "grad_norm": 5.0, "learning_rate": 9.998592610977241e-06, "loss": 0.9216177, "memory(GiB)": 117.28, "step": 3200, "train_speed(iter/s)": 0.207424 }, { "acc": 0.75495872, "epoch": 0.07489173070473469, "grad_norm": 6.03125, "learning_rate": 9.998547435000847e-06, "loss": 0.89467354, "memory(GiB)": 117.28, "step": 3210, "train_speed(iter/s)": 0.207777 }, { "acc": 0.74197507, "epoch": 0.07512503827702358, "grad_norm": 6.5, "learning_rate": 9.998501545483259e-06, "loss": 0.94241686, "memory(GiB)": 117.28, "step": 3220, "train_speed(iter/s)": 0.208109 }, { "acc": 0.75202932, "epoch": 0.07535834584931247, "grad_norm": 5.375, "learning_rate": 9.998454942431029e-06, "loss": 0.91658039, "memory(GiB)": 117.28, "step": 3230, "train_speed(iter/s)": 0.208416 }, { "acc": 0.7436245, "epoch": 0.07559165342160136, "grad_norm": 9.3125, "learning_rate": 9.998407625850806e-06, "loss": 0.93169861, "memory(GiB)": 117.28, "step": 3240, "train_speed(iter/s)": 0.208753 }, { "acc": 0.76363297, "epoch": 0.07582496099389026, "grad_norm": 6.5625, "learning_rate": 9.998359595749346e-06, "loss": 0.88251057, "memory(GiB)": 117.28, "step": 3250, "train_speed(iter/s)": 0.209087 }, { "acc": 0.76651325, "epoch": 0.07605826856617916, "grad_norm": 6.53125, "learning_rate": 9.998310852133506e-06, "loss": 0.88197155, "memory(GiB)": 117.28, "step": 3260, "train_speed(iter/s)": 0.209407 }, { "acc": 0.73732243, "epoch": 0.07629157613846804, "grad_norm": 6.4375, "learning_rate": 9.998261395010246e-06, "loss": 0.97510824, "memory(GiB)": 117.28, "step": 3270, "train_speed(iter/s)": 0.209726 }, { "acc": 0.76042881, "epoch": 0.07652488371075694, "grad_norm": 5.625, "learning_rate": 9.998211224386623e-06, "loss": 0.88643074, "memory(GiB)": 117.28, "step": 3280, "train_speed(iter/s)": 0.210037 }, { "acc": 0.73288879, "epoch": 0.07675819128304583, "grad_norm": 8.1875, "learning_rate": 9.998160340269799e-06, "loss": 0.98039322, "memory(GiB)": 117.28, "step": 3290, "train_speed(iter/s)": 0.210365 }, { "acc": 0.74865913, "epoch": 0.07699149885533473, "grad_norm": 32.5, "learning_rate": 9.998108742667038e-06, "loss": 0.93164606, "memory(GiB)": 117.28, "step": 3300, "train_speed(iter/s)": 0.210674 }, { "acc": 0.73399429, "epoch": 0.07722480642762361, "grad_norm": 5.3125, "learning_rate": 9.998056431585707e-06, "loss": 0.97385902, "memory(GiB)": 117.28, "step": 3310, "train_speed(iter/s)": 0.210992 }, { "acc": 0.76379633, "epoch": 0.07745811399991251, "grad_norm": 4.96875, "learning_rate": 9.998003407033271e-06, "loss": 0.84101114, "memory(GiB)": 117.28, "step": 3320, "train_speed(iter/s)": 0.211281 }, { "acc": 0.76484241, "epoch": 0.07769142157220141, "grad_norm": 5.71875, "learning_rate": 9.997949669017302e-06, "loss": 0.8537735, "memory(GiB)": 117.28, "step": 3330, "train_speed(iter/s)": 0.211582 }, { "acc": 0.74536896, "epoch": 0.07792472914449029, "grad_norm": 9.0, "learning_rate": 9.997895217545468e-06, "loss": 0.99053888, "memory(GiB)": 117.28, "step": 3340, "train_speed(iter/s)": 0.211872 }, { "acc": 0.76393571, "epoch": 0.07815803671677919, "grad_norm": 6.25, "learning_rate": 9.997840052625546e-06, "loss": 0.8449995, "memory(GiB)": 117.28, "step": 3350, "train_speed(iter/s)": 0.212198 }, { "acc": 0.75357141, "epoch": 0.07839134428906808, "grad_norm": 7.96875, "learning_rate": 9.997784174265407e-06, "loss": 0.89924717, "memory(GiB)": 117.28, "step": 3360, "train_speed(iter/s)": 0.212515 }, { "acc": 0.74694405, "epoch": 0.07862465186135698, "grad_norm": 5.375, "learning_rate": 9.99772758247303e-06, "loss": 0.916609, "memory(GiB)": 117.28, "step": 3370, "train_speed(iter/s)": 0.212802 }, { "acc": 0.75625582, "epoch": 0.07885795943364587, "grad_norm": 9.5, "learning_rate": 9.99767027725649e-06, "loss": 0.89150448, "memory(GiB)": 117.28, "step": 3380, "train_speed(iter/s)": 0.213124 }, { "acc": 0.77579999, "epoch": 0.07909126700593476, "grad_norm": 5.0625, "learning_rate": 9.997612258623972e-06, "loss": 0.80863886, "memory(GiB)": 117.28, "step": 3390, "train_speed(iter/s)": 0.213441 }, { "acc": 0.72985349, "epoch": 0.07932457457822366, "grad_norm": 7.90625, "learning_rate": 9.997553526583755e-06, "loss": 1.00999374, "memory(GiB)": 117.28, "step": 3400, "train_speed(iter/s)": 0.21375 }, { "acc": 0.75671997, "epoch": 0.07955788215051254, "grad_norm": 13.6875, "learning_rate": 9.997494081144224e-06, "loss": 0.89063911, "memory(GiB)": 117.28, "step": 3410, "train_speed(iter/s)": 0.214062 }, { "acc": 0.78263044, "epoch": 0.07979118972280144, "grad_norm": 8.75, "learning_rate": 9.997433922313863e-06, "loss": 0.78714075, "memory(GiB)": 117.28, "step": 3420, "train_speed(iter/s)": 0.21435 }, { "acc": 0.76892452, "epoch": 0.08002449729509033, "grad_norm": 5.21875, "learning_rate": 9.997373050101265e-06, "loss": 0.84681721, "memory(GiB)": 117.28, "step": 3430, "train_speed(iter/s)": 0.21465 }, { "acc": 0.74426117, "epoch": 0.08025780486737923, "grad_norm": 7.25, "learning_rate": 9.997311464515113e-06, "loss": 0.94508867, "memory(GiB)": 117.28, "step": 3440, "train_speed(iter/s)": 0.214929 }, { "acc": 0.7614994, "epoch": 0.08049111243966812, "grad_norm": 5.625, "learning_rate": 9.997249165564203e-06, "loss": 0.83586025, "memory(GiB)": 117.28, "step": 3450, "train_speed(iter/s)": 0.215237 }, { "acc": 0.75516629, "epoch": 0.08072442001195701, "grad_norm": 5.40625, "learning_rate": 9.997186153257425e-06, "loss": 0.89334755, "memory(GiB)": 117.28, "step": 3460, "train_speed(iter/s)": 0.215538 }, { "acc": 0.7614068, "epoch": 0.08095772758424591, "grad_norm": 4.96875, "learning_rate": 9.997122427603777e-06, "loss": 0.87028713, "memory(GiB)": 117.28, "step": 3470, "train_speed(iter/s)": 0.215836 }, { "acc": 0.73593702, "epoch": 0.0811910351565348, "grad_norm": 5.96875, "learning_rate": 9.997057988612351e-06, "loss": 0.96313515, "memory(GiB)": 117.28, "step": 3480, "train_speed(iter/s)": 0.21614 }, { "acc": 0.74797134, "epoch": 0.08142434272882369, "grad_norm": 6.3125, "learning_rate": 9.996992836292352e-06, "loss": 0.92687082, "memory(GiB)": 117.28, "step": 3490, "train_speed(iter/s)": 0.216442 }, { "acc": 0.77501402, "epoch": 0.08165765030111259, "grad_norm": 15.3125, "learning_rate": 9.996926970653076e-06, "loss": 0.80666208, "memory(GiB)": 117.28, "step": 3500, "train_speed(iter/s)": 0.216742 }, { "epoch": 0.08165765030111259, "eval_acc": 0.7207238327126944, "eval_loss": 0.8971706628799438, "eval_runtime": 1269.6947, "eval_samples_per_second": 28.346, "eval_steps_per_second": 14.173, "step": 3500 }, { "acc": 0.75318351, "epoch": 0.08189095787340148, "grad_norm": 4.59375, "learning_rate": 9.996860391703925e-06, "loss": 0.9090744, "memory(GiB)": 117.28, "step": 3510, "train_speed(iter/s)": 0.201043 }, { "acc": 0.76253424, "epoch": 0.08212426544569038, "grad_norm": 6.125, "learning_rate": 9.996793099454407e-06, "loss": 0.87270107, "memory(GiB)": 117.28, "step": 3520, "train_speed(iter/s)": 0.20133 }, { "acc": 0.76890745, "epoch": 0.08235757301797926, "grad_norm": 6.375, "learning_rate": 9.996725093914125e-06, "loss": 0.846912, "memory(GiB)": 117.28, "step": 3530, "train_speed(iter/s)": 0.20162 }, { "acc": 0.75093694, "epoch": 0.08259088059026816, "grad_norm": 6.40625, "learning_rate": 9.996656375092786e-06, "loss": 0.91460123, "memory(GiB)": 117.28, "step": 3540, "train_speed(iter/s)": 0.201922 }, { "acc": 0.74694242, "epoch": 0.08282418816255704, "grad_norm": 8.25, "learning_rate": 9.996586943000203e-06, "loss": 0.94491138, "memory(GiB)": 117.28, "step": 3550, "train_speed(iter/s)": 0.202225 }, { "acc": 0.7550127, "epoch": 0.08305749573484594, "grad_norm": 6.8125, "learning_rate": 9.996516797646285e-06, "loss": 0.89832973, "memory(GiB)": 117.28, "step": 3560, "train_speed(iter/s)": 0.202507 }, { "acc": 0.76115112, "epoch": 0.08329080330713484, "grad_norm": 7.375, "learning_rate": 9.996445939041043e-06, "loss": 0.8638464, "memory(GiB)": 117.28, "step": 3570, "train_speed(iter/s)": 0.20277 }, { "acc": 0.7447835, "epoch": 0.08352411087942373, "grad_norm": 8.6875, "learning_rate": 9.996374367194599e-06, "loss": 0.94378824, "memory(GiB)": 117.28, "step": 3580, "train_speed(iter/s)": 0.203058 }, { "acc": 0.72563133, "epoch": 0.08375741845171263, "grad_norm": 38.5, "learning_rate": 9.996302082117162e-06, "loss": 1.1059824, "memory(GiB)": 117.28, "step": 3590, "train_speed(iter/s)": 0.203348 }, { "acc": 0.77572851, "epoch": 0.08399072602400151, "grad_norm": 19.75, "learning_rate": 9.996229083819055e-06, "loss": 0.79340849, "memory(GiB)": 117.28, "step": 3600, "train_speed(iter/s)": 0.203658 }, { "acc": 0.74407644, "epoch": 0.08422403359629041, "grad_norm": 4.90625, "learning_rate": 9.996155372310699e-06, "loss": 0.92524662, "memory(GiB)": 117.28, "step": 3610, "train_speed(iter/s)": 0.203955 }, { "acc": 0.74106493, "epoch": 0.08445734116857931, "grad_norm": 5.84375, "learning_rate": 9.996080947602615e-06, "loss": 0.96179218, "memory(GiB)": 117.28, "step": 3620, "train_speed(iter/s)": 0.204248 }, { "acc": 0.73815827, "epoch": 0.0846906487408682, "grad_norm": 6.125, "learning_rate": 9.996005809705428e-06, "loss": 0.95186195, "memory(GiB)": 117.28, "step": 3630, "train_speed(iter/s)": 0.204526 }, { "acc": 0.75756698, "epoch": 0.08492395631315709, "grad_norm": 8.125, "learning_rate": 9.99592995862986e-06, "loss": 0.8868536, "memory(GiB)": 117.28, "step": 3640, "train_speed(iter/s)": 0.204816 }, { "acc": 0.7773509, "epoch": 0.08515726388544598, "grad_norm": 5.6875, "learning_rate": 9.995853394386743e-06, "loss": 0.80111885, "memory(GiB)": 117.28, "step": 3650, "train_speed(iter/s)": 0.205093 }, { "acc": 0.75153332, "epoch": 0.08539057145773488, "grad_norm": 7.28125, "learning_rate": 9.995776116987006e-06, "loss": 0.92716045, "memory(GiB)": 117.28, "step": 3660, "train_speed(iter/s)": 0.205357 }, { "acc": 0.76465082, "epoch": 0.08562387903002378, "grad_norm": 5.53125, "learning_rate": 9.995698126441678e-06, "loss": 0.84658279, "memory(GiB)": 117.28, "step": 3670, "train_speed(iter/s)": 0.205626 }, { "acc": 0.75522108, "epoch": 0.08585718660231266, "grad_norm": 7.6875, "learning_rate": 9.995619422761896e-06, "loss": 0.89340916, "memory(GiB)": 117.28, "step": 3680, "train_speed(iter/s)": 0.205898 }, { "acc": 0.76193209, "epoch": 0.08609049417460156, "grad_norm": 5.34375, "learning_rate": 9.995540005958891e-06, "loss": 0.85231638, "memory(GiB)": 117.28, "step": 3690, "train_speed(iter/s)": 0.206188 }, { "acc": 0.74403105, "epoch": 0.08632380174689044, "grad_norm": 5.1875, "learning_rate": 9.995459876044e-06, "loss": 0.96531124, "memory(GiB)": 117.28, "step": 3700, "train_speed(iter/s)": 0.206473 }, { "acc": 0.76611438, "epoch": 0.08655710931917934, "grad_norm": 4.9375, "learning_rate": 9.995379033028666e-06, "loss": 0.85632906, "memory(GiB)": 117.28, "step": 3710, "train_speed(iter/s)": 0.206772 }, { "acc": 0.74768972, "epoch": 0.08679041689146823, "grad_norm": 4.46875, "learning_rate": 9.995297476924424e-06, "loss": 0.94205217, "memory(GiB)": 117.28, "step": 3720, "train_speed(iter/s)": 0.207058 }, { "acc": 0.75818162, "epoch": 0.08702372446375713, "grad_norm": 4.78125, "learning_rate": 9.99521520774292e-06, "loss": 0.88370638, "memory(GiB)": 117.28, "step": 3730, "train_speed(iter/s)": 0.207322 }, { "acc": 0.76598167, "epoch": 0.08725703203604603, "grad_norm": 4.625, "learning_rate": 9.995132225495896e-06, "loss": 0.8380312, "memory(GiB)": 117.28, "step": 3740, "train_speed(iter/s)": 0.207593 }, { "acc": 0.74331932, "epoch": 0.08749033960833491, "grad_norm": 4.71875, "learning_rate": 9.995048530195198e-06, "loss": 0.94072809, "memory(GiB)": 117.28, "step": 3750, "train_speed(iter/s)": 0.207867 }, { "acc": 0.74893398, "epoch": 0.08772364718062381, "grad_norm": 8.9375, "learning_rate": 9.99496412185277e-06, "loss": 0.93704576, "memory(GiB)": 117.28, "step": 3760, "train_speed(iter/s)": 0.208126 }, { "acc": 0.74125042, "epoch": 0.0879569547529127, "grad_norm": 8.25, "learning_rate": 9.994879000480668e-06, "loss": 0.96825314, "memory(GiB)": 117.28, "step": 3770, "train_speed(iter/s)": 0.208406 }, { "acc": 0.77773728, "epoch": 0.0881902623252016, "grad_norm": 6.5625, "learning_rate": 9.994793166091039e-06, "loss": 0.81420488, "memory(GiB)": 117.28, "step": 3780, "train_speed(iter/s)": 0.208696 }, { "acc": 0.75972219, "epoch": 0.08842356989749049, "grad_norm": 6.59375, "learning_rate": 9.994706618696137e-06, "loss": 0.89639206, "memory(GiB)": 117.28, "step": 3790, "train_speed(iter/s)": 0.208963 }, { "acc": 0.76652493, "epoch": 0.08865687746977938, "grad_norm": 7.15625, "learning_rate": 9.994619358308316e-06, "loss": 0.83598127, "memory(GiB)": 117.28, "step": 3800, "train_speed(iter/s)": 0.209241 }, { "acc": 0.76296563, "epoch": 0.08889018504206828, "grad_norm": 10.4375, "learning_rate": 9.994531384940032e-06, "loss": 0.85312195, "memory(GiB)": 117.28, "step": 3810, "train_speed(iter/s)": 0.209509 }, { "acc": 0.76371479, "epoch": 0.08912349261435716, "grad_norm": 7.96875, "learning_rate": 9.994442698603844e-06, "loss": 0.8648243, "memory(GiB)": 117.28, "step": 3820, "train_speed(iter/s)": 0.209773 }, { "acc": 0.76453638, "epoch": 0.08935680018664606, "grad_norm": 7.59375, "learning_rate": 9.99435329931241e-06, "loss": 0.85945387, "memory(GiB)": 117.28, "step": 3830, "train_speed(iter/s)": 0.210032 }, { "acc": 0.76211448, "epoch": 0.08959010775893494, "grad_norm": 7.1875, "learning_rate": 9.994263187078496e-06, "loss": 0.86692085, "memory(GiB)": 117.28, "step": 3840, "train_speed(iter/s)": 0.210304 }, { "acc": 0.76937609, "epoch": 0.08982341533122384, "grad_norm": 4.90625, "learning_rate": 9.994172361914962e-06, "loss": 0.84302549, "memory(GiB)": 117.28, "step": 3850, "train_speed(iter/s)": 0.210561 }, { "acc": 0.76893077, "epoch": 0.09005672290351274, "grad_norm": 5.21875, "learning_rate": 9.994080823834775e-06, "loss": 0.84133387, "memory(GiB)": 117.54, "step": 3860, "train_speed(iter/s)": 0.21084 }, { "acc": 0.75491571, "epoch": 0.09029003047580163, "grad_norm": 5.53125, "learning_rate": 9.993988572851e-06, "loss": 0.90440521, "memory(GiB)": 117.54, "step": 3870, "train_speed(iter/s)": 0.2111 }, { "acc": 0.74396949, "epoch": 0.09052333804809053, "grad_norm": 8.0625, "learning_rate": 9.993895608976806e-06, "loss": 0.95679359, "memory(GiB)": 117.54, "step": 3880, "train_speed(iter/s)": 0.211355 }, { "acc": 0.75566082, "epoch": 0.09075664562037941, "grad_norm": 8.9375, "learning_rate": 9.993801932225466e-06, "loss": 0.89417982, "memory(GiB)": 117.54, "step": 3890, "train_speed(iter/s)": 0.211633 }, { "acc": 0.76257367, "epoch": 0.09098995319266831, "grad_norm": 5.625, "learning_rate": 9.993707542610351e-06, "loss": 0.87649937, "memory(GiB)": 117.54, "step": 3900, "train_speed(iter/s)": 0.211899 }, { "acc": 0.73606167, "epoch": 0.09122326076495721, "grad_norm": 7.34375, "learning_rate": 9.993612440144935e-06, "loss": 0.9821991, "memory(GiB)": 117.54, "step": 3910, "train_speed(iter/s)": 0.212158 }, { "acc": 0.75946436, "epoch": 0.0914565683372461, "grad_norm": 6.03125, "learning_rate": 9.993516624842792e-06, "loss": 0.8709816, "memory(GiB)": 117.54, "step": 3920, "train_speed(iter/s)": 0.212416 }, { "acc": 0.74540901, "epoch": 0.09168987590953499, "grad_norm": 8.5, "learning_rate": 9.993420096717603e-06, "loss": 0.94000492, "memory(GiB)": 117.54, "step": 3930, "train_speed(iter/s)": 0.212684 }, { "acc": 0.77204304, "epoch": 0.09192318348182388, "grad_norm": 8.0, "learning_rate": 9.993322855783146e-06, "loss": 0.81134806, "memory(GiB)": 117.54, "step": 3940, "train_speed(iter/s)": 0.212916 }, { "acc": 0.75332932, "epoch": 0.09215649105411278, "grad_norm": 7.5, "learning_rate": 9.993224902053302e-06, "loss": 0.88411512, "memory(GiB)": 117.54, "step": 3950, "train_speed(iter/s)": 0.213189 }, { "acc": 0.73475533, "epoch": 0.09238979862640166, "grad_norm": 4.6875, "learning_rate": 9.993126235542053e-06, "loss": 0.98314781, "memory(GiB)": 117.54, "step": 3960, "train_speed(iter/s)": 0.213459 }, { "acc": 0.76145267, "epoch": 0.09262310619869056, "grad_norm": 8.375, "learning_rate": 9.993026856263486e-06, "loss": 0.86928034, "memory(GiB)": 117.54, "step": 3970, "train_speed(iter/s)": 0.213725 }, { "acc": 0.76495628, "epoch": 0.09285641377097946, "grad_norm": 5.5625, "learning_rate": 9.992926764231784e-06, "loss": 0.83155947, "memory(GiB)": 117.54, "step": 3980, "train_speed(iter/s)": 0.213988 }, { "acc": 0.75666361, "epoch": 0.09308972134326834, "grad_norm": 6.84375, "learning_rate": 9.992825959461237e-06, "loss": 0.88326521, "memory(GiB)": 117.54, "step": 3990, "train_speed(iter/s)": 0.214237 }, { "acc": 0.774965, "epoch": 0.09332302891555724, "grad_norm": 5.125, "learning_rate": 9.992724441966234e-06, "loss": 0.8087532, "memory(GiB)": 117.54, "step": 4000, "train_speed(iter/s)": 0.214481 }, { "epoch": 0.09332302891555724, "eval_acc": 0.7228697654255328, "eval_loss": 0.8877829313278198, "eval_runtime": 1271.4977, "eval_samples_per_second": 28.306, "eval_steps_per_second": 14.153, "step": 4000 }, { "acc": 0.76409369, "epoch": 0.09355633648784613, "grad_norm": 4.625, "learning_rate": 9.99262221176127e-06, "loss": 0.85309639, "memory(GiB)": 117.54, "step": 4010, "train_speed(iter/s)": 0.200878 }, { "acc": 0.76466002, "epoch": 0.09378964406013503, "grad_norm": 5.84375, "learning_rate": 9.992519268860934e-06, "loss": 0.87510233, "memory(GiB)": 117.54, "step": 4020, "train_speed(iter/s)": 0.201146 }, { "acc": 0.74888363, "epoch": 0.09402295163242393, "grad_norm": 4.75, "learning_rate": 9.992415613279922e-06, "loss": 0.91498604, "memory(GiB)": 117.54, "step": 4030, "train_speed(iter/s)": 0.201399 }, { "acc": 0.7692903, "epoch": 0.09425625920471281, "grad_norm": 4.34375, "learning_rate": 9.992311245033033e-06, "loss": 0.83095226, "memory(GiB)": 117.54, "step": 4040, "train_speed(iter/s)": 0.201636 }, { "acc": 0.7509232, "epoch": 0.09448956677700171, "grad_norm": 5.0, "learning_rate": 9.992206164135163e-06, "loss": 0.92139778, "memory(GiB)": 117.54, "step": 4050, "train_speed(iter/s)": 0.201873 }, { "acc": 0.77602358, "epoch": 0.0947228743492906, "grad_norm": 8.125, "learning_rate": 9.992100370601313e-06, "loss": 0.79812717, "memory(GiB)": 117.54, "step": 4060, "train_speed(iter/s)": 0.202133 }, { "acc": 0.7557579, "epoch": 0.0949561819215795, "grad_norm": 6.875, "learning_rate": 9.991993864446585e-06, "loss": 0.89294071, "memory(GiB)": 117.54, "step": 4070, "train_speed(iter/s)": 0.202411 }, { "acc": 0.73773766, "epoch": 0.09518948949386838, "grad_norm": 5.46875, "learning_rate": 9.991886645686184e-06, "loss": 0.95771408, "memory(GiB)": 117.54, "step": 4080, "train_speed(iter/s)": 0.20265 }, { "acc": 0.75475817, "epoch": 0.09542279706615728, "grad_norm": 8.3125, "learning_rate": 9.991778714335415e-06, "loss": 0.90779686, "memory(GiB)": 117.54, "step": 4090, "train_speed(iter/s)": 0.202915 }, { "acc": 0.73981633, "epoch": 0.09565610463844618, "grad_norm": 6.90625, "learning_rate": 9.991670070409684e-06, "loss": 0.96604061, "memory(GiB)": 117.54, "step": 4100, "train_speed(iter/s)": 0.203177 }, { "acc": 0.76607499, "epoch": 0.09588941221073506, "grad_norm": 5.78125, "learning_rate": 9.991560713924501e-06, "loss": 0.85471153, "memory(GiB)": 117.54, "step": 4110, "train_speed(iter/s)": 0.20342 }, { "acc": 0.77161045, "epoch": 0.09612271978302396, "grad_norm": 8.125, "learning_rate": 9.991450644895476e-06, "loss": 0.84620152, "memory(GiB)": 117.54, "step": 4120, "train_speed(iter/s)": 0.203677 }, { "acc": 0.77144375, "epoch": 0.09635602735531285, "grad_norm": 4.625, "learning_rate": 9.99133986333832e-06, "loss": 0.83797445, "memory(GiB)": 117.54, "step": 4130, "train_speed(iter/s)": 0.20393 }, { "acc": 0.73537869, "epoch": 0.09658933492760174, "grad_norm": 6.3125, "learning_rate": 9.99122836926885e-06, "loss": 0.96916094, "memory(GiB)": 117.54, "step": 4140, "train_speed(iter/s)": 0.204181 }, { "acc": 0.75800476, "epoch": 0.09682264249989064, "grad_norm": 5.625, "learning_rate": 9.991116162702981e-06, "loss": 0.86221294, "memory(GiB)": 117.54, "step": 4150, "train_speed(iter/s)": 0.204423 }, { "acc": 0.75571017, "epoch": 0.09705595007217953, "grad_norm": 5.5, "learning_rate": 9.991003243656728e-06, "loss": 0.88189182, "memory(GiB)": 117.54, "step": 4160, "train_speed(iter/s)": 0.204672 }, { "acc": 0.75988812, "epoch": 0.09728925764446843, "grad_norm": 8.5, "learning_rate": 9.990889612146213e-06, "loss": 0.88356085, "memory(GiB)": 117.54, "step": 4170, "train_speed(iter/s)": 0.204882 }, { "acc": 0.74842806, "epoch": 0.09752256521675731, "grad_norm": 4.96875, "learning_rate": 9.990775268187654e-06, "loss": 0.93167324, "memory(GiB)": 117.54, "step": 4180, "train_speed(iter/s)": 0.20513 }, { "acc": 0.76515474, "epoch": 0.09775587278904621, "grad_norm": 35.75, "learning_rate": 9.990660211797378e-06, "loss": 0.92051849, "memory(GiB)": 117.54, "step": 4190, "train_speed(iter/s)": 0.205376 }, { "acc": 0.7553896, "epoch": 0.0979891803613351, "grad_norm": 5.375, "learning_rate": 9.990544442991805e-06, "loss": 0.89758406, "memory(GiB)": 117.54, "step": 4200, "train_speed(iter/s)": 0.20564 }, { "acc": 0.75030479, "epoch": 0.098222487933624, "grad_norm": 5.3125, "learning_rate": 9.99042796178746e-06, "loss": 0.91539536, "memory(GiB)": 117.54, "step": 4210, "train_speed(iter/s)": 0.205895 }, { "acc": 0.74988375, "epoch": 0.0984557955059129, "grad_norm": 4.25, "learning_rate": 9.990310768200977e-06, "loss": 0.9145546, "memory(GiB)": 117.54, "step": 4220, "train_speed(iter/s)": 0.206142 }, { "acc": 0.73188667, "epoch": 0.09868910307820178, "grad_norm": 5.65625, "learning_rate": 9.99019286224908e-06, "loss": 0.98185387, "memory(GiB)": 117.54, "step": 4230, "train_speed(iter/s)": 0.20638 }, { "acc": 0.75859137, "epoch": 0.09892241065049068, "grad_norm": 8.125, "learning_rate": 9.990074243948602e-06, "loss": 0.90473709, "memory(GiB)": 117.54, "step": 4240, "train_speed(iter/s)": 0.206628 }, { "acc": 0.76608834, "epoch": 0.09915571822277956, "grad_norm": 6.34375, "learning_rate": 9.989954913316476e-06, "loss": 0.85702991, "memory(GiB)": 117.54, "step": 4250, "train_speed(iter/s)": 0.20688 }, { "acc": 0.76376286, "epoch": 0.09938902579506846, "grad_norm": 9.625, "learning_rate": 9.989834870369735e-06, "loss": 0.84923592, "memory(GiB)": 117.54, "step": 4260, "train_speed(iter/s)": 0.207135 }, { "acc": 0.77528548, "epoch": 0.09962233336735736, "grad_norm": 5.1875, "learning_rate": 9.989714115125515e-06, "loss": 0.79958353, "memory(GiB)": 117.54, "step": 4270, "train_speed(iter/s)": 0.207364 }, { "acc": 0.75993328, "epoch": 0.09985564093964625, "grad_norm": 10.625, "learning_rate": 9.989592647601056e-06, "loss": 0.86611481, "memory(GiB)": 117.54, "step": 4280, "train_speed(iter/s)": 0.207609 }, { "acc": 0.75393734, "epoch": 0.10008894851193514, "grad_norm": 6.5625, "learning_rate": 9.989470467813696e-06, "loss": 0.91193056, "memory(GiB)": 117.54, "step": 4290, "train_speed(iter/s)": 0.207846 }, { "acc": 0.75926132, "epoch": 0.10032225608422403, "grad_norm": 8.0, "learning_rate": 9.989347575780874e-06, "loss": 0.88821259, "memory(GiB)": 117.54, "step": 4300, "train_speed(iter/s)": 0.208096 }, { "acc": 0.75386171, "epoch": 0.10055556365651293, "grad_norm": 7.65625, "learning_rate": 9.989223971520136e-06, "loss": 0.89129429, "memory(GiB)": 117.54, "step": 4310, "train_speed(iter/s)": 0.208334 }, { "acc": 0.75681725, "epoch": 0.10078887122880181, "grad_norm": 6.65625, "learning_rate": 9.989099655049128e-06, "loss": 0.87771931, "memory(GiB)": 117.54, "step": 4320, "train_speed(iter/s)": 0.20856 }, { "acc": 0.7391355, "epoch": 0.10102217880109071, "grad_norm": 10.0, "learning_rate": 9.98897462638559e-06, "loss": 0.96863251, "memory(GiB)": 117.54, "step": 4330, "train_speed(iter/s)": 0.20879 }, { "acc": 0.76924124, "epoch": 0.10125548637337961, "grad_norm": 5.78125, "learning_rate": 9.988848885547376e-06, "loss": 0.8352129, "memory(GiB)": 117.54, "step": 4340, "train_speed(iter/s)": 0.209021 }, { "acc": 0.77654471, "epoch": 0.1014887939456685, "grad_norm": 7.375, "learning_rate": 9.988722432552431e-06, "loss": 0.79848795, "memory(GiB)": 117.54, "step": 4350, "train_speed(iter/s)": 0.209261 }, { "acc": 0.74507241, "epoch": 0.1017221015179574, "grad_norm": 5.625, "learning_rate": 9.988595267418809e-06, "loss": 0.92341309, "memory(GiB)": 117.54, "step": 4360, "train_speed(iter/s)": 0.209503 }, { "acc": 0.76773772, "epoch": 0.10195540909024628, "grad_norm": 5.4375, "learning_rate": 9.988467390164662e-06, "loss": 0.84407911, "memory(GiB)": 117.54, "step": 4370, "train_speed(iter/s)": 0.209715 }, { "acc": 0.74648399, "epoch": 0.10218871666253518, "grad_norm": 5.0625, "learning_rate": 9.988338800808245e-06, "loss": 0.93966646, "memory(GiB)": 117.54, "step": 4380, "train_speed(iter/s)": 0.209944 }, { "acc": 0.76602821, "epoch": 0.10242202423482408, "grad_norm": 6.6875, "learning_rate": 9.988209499367911e-06, "loss": 0.8338089, "memory(GiB)": 117.54, "step": 4390, "train_speed(iter/s)": 0.210194 }, { "acc": 0.76203928, "epoch": 0.10265533180711296, "grad_norm": 4.34375, "learning_rate": 9.988079485862121e-06, "loss": 0.87073536, "memory(GiB)": 117.54, "step": 4400, "train_speed(iter/s)": 0.210432 }, { "acc": 0.74238539, "epoch": 0.10288863937940186, "grad_norm": 12.375, "learning_rate": 9.987948760309434e-06, "loss": 0.95618248, "memory(GiB)": 117.54, "step": 4410, "train_speed(iter/s)": 0.210669 }, { "acc": 0.7657311, "epoch": 0.10312194695169075, "grad_norm": 5.4375, "learning_rate": 9.987817322728509e-06, "loss": 0.85881863, "memory(GiB)": 117.54, "step": 4420, "train_speed(iter/s)": 0.210901 }, { "acc": 0.76233692, "epoch": 0.10335525452397964, "grad_norm": 4.4375, "learning_rate": 9.98768517313811e-06, "loss": 0.8707386, "memory(GiB)": 117.54, "step": 4430, "train_speed(iter/s)": 0.211135 }, { "acc": 0.75146971, "epoch": 0.10358856209626854, "grad_norm": 6.9375, "learning_rate": 9.987552311557103e-06, "loss": 0.90720158, "memory(GiB)": 117.54, "step": 4440, "train_speed(iter/s)": 0.211368 }, { "acc": 0.77758303, "epoch": 0.10382186966855743, "grad_norm": 18.25, "learning_rate": 9.987418738004453e-06, "loss": 0.81924438, "memory(GiB)": 117.54, "step": 4450, "train_speed(iter/s)": 0.211579 }, { "acc": 0.73662348, "epoch": 0.10405517724084633, "grad_norm": 5.59375, "learning_rate": 9.987284452499227e-06, "loss": 0.9785923, "memory(GiB)": 117.54, "step": 4460, "train_speed(iter/s)": 0.211822 }, { "acc": 0.74656315, "epoch": 0.10428848481313521, "grad_norm": 17.875, "learning_rate": 9.987149455060592e-06, "loss": 0.9363677, "memory(GiB)": 117.54, "step": 4470, "train_speed(iter/s)": 0.212055 }, { "acc": 0.75212345, "epoch": 0.10452179238542411, "grad_norm": 4.71875, "learning_rate": 9.987013745707824e-06, "loss": 0.93190804, "memory(GiB)": 117.54, "step": 4480, "train_speed(iter/s)": 0.212285 }, { "acc": 0.7772028, "epoch": 0.104755099957713, "grad_norm": 4.46875, "learning_rate": 9.986877324460288e-06, "loss": 0.79044576, "memory(GiB)": 117.54, "step": 4490, "train_speed(iter/s)": 0.212502 }, { "acc": 0.76467466, "epoch": 0.1049884075300019, "grad_norm": 4.65625, "learning_rate": 9.986740191337467e-06, "loss": 0.86383896, "memory(GiB)": 117.54, "step": 4500, "train_speed(iter/s)": 0.212726 }, { "epoch": 0.1049884075300019, "eval_acc": 0.7243065794777408, "eval_loss": 0.8816430568695068, "eval_runtime": 1268.0964, "eval_samples_per_second": 28.382, "eval_steps_per_second": 14.191, "step": 4500 }, { "acc": 0.74907675, "epoch": 0.1052217151022908, "grad_norm": 4.90625, "learning_rate": 9.986602346358932e-06, "loss": 0.92465839, "memory(GiB)": 117.54, "step": 4510, "train_speed(iter/s)": 0.200795 }, { "acc": 0.77655468, "epoch": 0.10545502267457968, "grad_norm": 5.6875, "learning_rate": 9.986463789544359e-06, "loss": 0.79614592, "memory(GiB)": 117.54, "step": 4520, "train_speed(iter/s)": 0.20104 }, { "acc": 0.74242306, "epoch": 0.10568833024686858, "grad_norm": 5.625, "learning_rate": 9.986324520913528e-06, "loss": 0.9382905, "memory(GiB)": 117.54, "step": 4530, "train_speed(iter/s)": 0.201278 }, { "acc": 0.7789465, "epoch": 0.10592163781915746, "grad_norm": 5.1875, "learning_rate": 9.986184540486322e-06, "loss": 0.80827017, "memory(GiB)": 117.54, "step": 4540, "train_speed(iter/s)": 0.201497 }, { "acc": 0.78276606, "epoch": 0.10615494539144636, "grad_norm": 5.9375, "learning_rate": 9.98604384828272e-06, "loss": 0.78076072, "memory(GiB)": 117.54, "step": 4550, "train_speed(iter/s)": 0.201723 }, { "acc": 0.75945024, "epoch": 0.10638825296373526, "grad_norm": 5.375, "learning_rate": 9.985902444322809e-06, "loss": 0.86157227, "memory(GiB)": 117.54, "step": 4560, "train_speed(iter/s)": 0.201954 }, { "acc": 0.75581503, "epoch": 0.10662156053602415, "grad_norm": 5.90625, "learning_rate": 9.98576032862677e-06, "loss": 0.89451599, "memory(GiB)": 117.54, "step": 4570, "train_speed(iter/s)": 0.202173 }, { "acc": 0.7623549, "epoch": 0.10685486810831304, "grad_norm": 5.3125, "learning_rate": 9.985617501214895e-06, "loss": 0.86804514, "memory(GiB)": 117.54, "step": 4580, "train_speed(iter/s)": 0.202413 }, { "acc": 0.75258703, "epoch": 0.10708817568060193, "grad_norm": 4.78125, "learning_rate": 9.985473962107568e-06, "loss": 0.916399, "memory(GiB)": 117.54, "step": 4590, "train_speed(iter/s)": 0.202637 }, { "acc": 0.75255208, "epoch": 0.10732148325289083, "grad_norm": 6.71875, "learning_rate": 9.985329711325282e-06, "loss": 0.90113773, "memory(GiB)": 117.54, "step": 4600, "train_speed(iter/s)": 0.20288 }, { "acc": 0.75763159, "epoch": 0.10755479082517971, "grad_norm": 6.875, "learning_rate": 9.985184748888627e-06, "loss": 0.91873503, "memory(GiB)": 117.54, "step": 4610, "train_speed(iter/s)": 0.203115 }, { "acc": 0.77346997, "epoch": 0.10778809839746861, "grad_norm": 5.4375, "learning_rate": 9.985039074818298e-06, "loss": 0.8059083, "memory(GiB)": 117.54, "step": 4620, "train_speed(iter/s)": 0.203336 }, { "acc": 0.75904865, "epoch": 0.10802140596975751, "grad_norm": 5.59375, "learning_rate": 9.98489268913509e-06, "loss": 0.87097445, "memory(GiB)": 117.54, "step": 4630, "train_speed(iter/s)": 0.203564 }, { "acc": 0.75039043, "epoch": 0.1082547135420464, "grad_norm": 6.65625, "learning_rate": 9.984745591859899e-06, "loss": 0.90553379, "memory(GiB)": 117.54, "step": 4640, "train_speed(iter/s)": 0.203807 }, { "acc": 0.76141634, "epoch": 0.1084880211143353, "grad_norm": 4.21875, "learning_rate": 9.98459778301372e-06, "loss": 0.8886508, "memory(GiB)": 117.54, "step": 4650, "train_speed(iter/s)": 0.20402 }, { "acc": 0.76583786, "epoch": 0.10872132868662418, "grad_norm": 9.625, "learning_rate": 9.984449262617659e-06, "loss": 0.86086102, "memory(GiB)": 117.54, "step": 4660, "train_speed(iter/s)": 0.20427 }, { "acc": 0.76089182, "epoch": 0.10895463625891308, "grad_norm": 6.09375, "learning_rate": 9.984300030692913e-06, "loss": 0.87257662, "memory(GiB)": 117.54, "step": 4670, "train_speed(iter/s)": 0.204504 }, { "acc": 0.75297747, "epoch": 0.10918794383120198, "grad_norm": 9.5, "learning_rate": 9.984150087260784e-06, "loss": 0.90016375, "memory(GiB)": 117.54, "step": 4680, "train_speed(iter/s)": 0.204719 }, { "acc": 0.74139366, "epoch": 0.10942125140349086, "grad_norm": 6.46875, "learning_rate": 9.983999432342679e-06, "loss": 0.9342598, "memory(GiB)": 117.54, "step": 4690, "train_speed(iter/s)": 0.204946 }, { "acc": 0.75930262, "epoch": 0.10965455897577976, "grad_norm": 5.75, "learning_rate": 9.983848065960103e-06, "loss": 0.8737071, "memory(GiB)": 117.54, "step": 4700, "train_speed(iter/s)": 0.205175 }, { "acc": 0.77567587, "epoch": 0.10988786654806865, "grad_norm": 8.125, "learning_rate": 9.983695988134662e-06, "loss": 0.79876671, "memory(GiB)": 117.54, "step": 4710, "train_speed(iter/s)": 0.205389 }, { "acc": 0.75503325, "epoch": 0.11012117412035755, "grad_norm": 6.03125, "learning_rate": 9.983543198888069e-06, "loss": 0.90450735, "memory(GiB)": 117.54, "step": 4720, "train_speed(iter/s)": 0.205611 }, { "acc": 0.75266876, "epoch": 0.11035448169264643, "grad_norm": 5.0625, "learning_rate": 9.98338969824213e-06, "loss": 0.9201149, "memory(GiB)": 117.54, "step": 4730, "train_speed(iter/s)": 0.205829 }, { "acc": 0.76564531, "epoch": 0.11058778926493533, "grad_norm": 6.625, "learning_rate": 9.98323548621876e-06, "loss": 0.85361395, "memory(GiB)": 117.54, "step": 4740, "train_speed(iter/s)": 0.206042 }, { "acc": 0.74129553, "epoch": 0.11082109683722423, "grad_norm": 9.4375, "learning_rate": 9.983080562839971e-06, "loss": 0.96250782, "memory(GiB)": 117.54, "step": 4750, "train_speed(iter/s)": 0.206248 }, { "acc": 0.76101308, "epoch": 0.11105440440951311, "grad_norm": 5.71875, "learning_rate": 9.982924928127881e-06, "loss": 0.85979137, "memory(GiB)": 117.54, "step": 4760, "train_speed(iter/s)": 0.206465 }, { "acc": 0.78050613, "epoch": 0.11128771198180201, "grad_norm": 5.28125, "learning_rate": 9.982768582104705e-06, "loss": 0.80721207, "memory(GiB)": 117.54, "step": 4770, "train_speed(iter/s)": 0.206675 }, { "acc": 0.74485474, "epoch": 0.1115210195540909, "grad_norm": 4.4375, "learning_rate": 9.98261152479276e-06, "loss": 0.95470104, "memory(GiB)": 117.54, "step": 4780, "train_speed(iter/s)": 0.20689 }, { "acc": 0.76254826, "epoch": 0.1117543271263798, "grad_norm": 5.9375, "learning_rate": 9.982453756214467e-06, "loss": 0.84586992, "memory(GiB)": 117.54, "step": 4790, "train_speed(iter/s)": 0.207109 }, { "acc": 0.76125956, "epoch": 0.1119876346986687, "grad_norm": 7.71875, "learning_rate": 9.982295276392349e-06, "loss": 0.88607883, "memory(GiB)": 117.54, "step": 4800, "train_speed(iter/s)": 0.207333 }, { "acc": 0.73547726, "epoch": 0.11222094227095758, "grad_norm": 5.8125, "learning_rate": 9.982136085349028e-06, "loss": 0.96996174, "memory(GiB)": 117.54, "step": 4810, "train_speed(iter/s)": 0.207542 }, { "acc": 0.77212734, "epoch": 0.11245424984324648, "grad_norm": 7.40625, "learning_rate": 9.981976183107227e-06, "loss": 0.83480778, "memory(GiB)": 117.54, "step": 4820, "train_speed(iter/s)": 0.20776 }, { "acc": 0.76515455, "epoch": 0.11268755741553536, "grad_norm": 7.1875, "learning_rate": 9.981815569689774e-06, "loss": 0.88458767, "memory(GiB)": 117.54, "step": 4830, "train_speed(iter/s)": 0.207986 }, { "acc": 0.73565607, "epoch": 0.11292086498782426, "grad_norm": 6.09375, "learning_rate": 9.981654245119594e-06, "loss": 0.98005428, "memory(GiB)": 117.54, "step": 4840, "train_speed(iter/s)": 0.208198 }, { "acc": 0.74796729, "epoch": 0.11315417256011315, "grad_norm": 7.5625, "learning_rate": 9.98149220941972e-06, "loss": 0.90888004, "memory(GiB)": 117.54, "step": 4850, "train_speed(iter/s)": 0.208403 }, { "acc": 0.76575413, "epoch": 0.11338748013240205, "grad_norm": 6.4375, "learning_rate": 9.981329462613278e-06, "loss": 0.8409193, "memory(GiB)": 117.54, "step": 4860, "train_speed(iter/s)": 0.208616 }, { "acc": 0.76135168, "epoch": 0.11362078770469095, "grad_norm": 4.96875, "learning_rate": 9.981166004723504e-06, "loss": 0.8730361, "memory(GiB)": 117.54, "step": 4870, "train_speed(iter/s)": 0.208832 }, { "acc": 0.75576491, "epoch": 0.11385409527697983, "grad_norm": 5.96875, "learning_rate": 9.981001835773729e-06, "loss": 0.86638641, "memory(GiB)": 117.54, "step": 4880, "train_speed(iter/s)": 0.20906 }, { "acc": 0.78793383, "epoch": 0.11408740284926873, "grad_norm": 6.34375, "learning_rate": 9.98083695578739e-06, "loss": 0.78035274, "memory(GiB)": 117.54, "step": 4890, "train_speed(iter/s)": 0.209275 }, { "acc": 0.76127825, "epoch": 0.11432071042155761, "grad_norm": 4.59375, "learning_rate": 9.980671364788022e-06, "loss": 0.91963701, "memory(GiB)": 117.54, "step": 4900, "train_speed(iter/s)": 0.20949 }, { "acc": 0.73467045, "epoch": 0.11455401799384651, "grad_norm": 5.53125, "learning_rate": 9.980505062799262e-06, "loss": 0.9670414, "memory(GiB)": 117.54, "step": 4910, "train_speed(iter/s)": 0.209715 }, { "acc": 0.78355579, "epoch": 0.11478732556613541, "grad_norm": 6.0, "learning_rate": 9.980338049844854e-06, "loss": 0.78485413, "memory(GiB)": 117.54, "step": 4920, "train_speed(iter/s)": 0.209923 }, { "acc": 0.74745569, "epoch": 0.1150206331384243, "grad_norm": 5.40625, "learning_rate": 9.980170325948633e-06, "loss": 0.92230692, "memory(GiB)": 117.54, "step": 4930, "train_speed(iter/s)": 0.210131 }, { "acc": 0.75931578, "epoch": 0.1152539407107132, "grad_norm": 6.53125, "learning_rate": 9.980001891134548e-06, "loss": 0.87786045, "memory(GiB)": 117.54, "step": 4940, "train_speed(iter/s)": 0.210361 }, { "acc": 0.7422245, "epoch": 0.11548724828300208, "grad_norm": 4.875, "learning_rate": 9.979832745426637e-06, "loss": 0.94051199, "memory(GiB)": 117.54, "step": 4950, "train_speed(iter/s)": 0.21057 }, { "acc": 0.75386519, "epoch": 0.11572055585529098, "grad_norm": 8.1875, "learning_rate": 9.97966288884905e-06, "loss": 0.89630547, "memory(GiB)": 117.54, "step": 4960, "train_speed(iter/s)": 0.210783 }, { "acc": 0.74389353, "epoch": 0.11595386342757986, "grad_norm": 6.375, "learning_rate": 9.979492321426032e-06, "loss": 0.93810577, "memory(GiB)": 117.54, "step": 4970, "train_speed(iter/s)": 0.210987 }, { "acc": 0.76075001, "epoch": 0.11618717099986876, "grad_norm": 5.4375, "learning_rate": 9.97932104318193e-06, "loss": 0.84013653, "memory(GiB)": 117.54, "step": 4980, "train_speed(iter/s)": 0.211194 }, { "acc": 0.77435451, "epoch": 0.11642047857215766, "grad_norm": 4.84375, "learning_rate": 9.979149054141197e-06, "loss": 0.80839243, "memory(GiB)": 117.54, "step": 4990, "train_speed(iter/s)": 0.211393 }, { "acc": 0.7572638, "epoch": 0.11665378614444655, "grad_norm": 7.3125, "learning_rate": 9.978976354328383e-06, "loss": 0.94224863, "memory(GiB)": 117.54, "step": 5000, "train_speed(iter/s)": 0.21161 }, { "epoch": 0.11665378614444655, "eval_acc": 0.7257033574718734, "eval_loss": 0.876083493232727, "eval_runtime": 1268.8673, "eval_samples_per_second": 28.365, "eval_steps_per_second": 14.183, "step": 5000 }, { "acc": 0.74820719, "epoch": 0.11688709371673545, "grad_norm": 4.8125, "learning_rate": 9.97880294376814e-06, "loss": 0.91807137, "memory(GiB)": 117.54, "step": 5010, "train_speed(iter/s)": 0.200898 }, { "acc": 0.75378785, "epoch": 0.11712040128902433, "grad_norm": 5.59375, "learning_rate": 9.978628822485224e-06, "loss": 0.92413845, "memory(GiB)": 117.54, "step": 5020, "train_speed(iter/s)": 0.201111 }, { "acc": 0.73901176, "epoch": 0.11735370886131323, "grad_norm": 7.34375, "learning_rate": 9.978453990504488e-06, "loss": 0.95172367, "memory(GiB)": 117.54, "step": 5030, "train_speed(iter/s)": 0.201322 }, { "acc": 0.75668182, "epoch": 0.11758701643360213, "grad_norm": 5.25, "learning_rate": 9.978278447850894e-06, "loss": 0.89718676, "memory(GiB)": 117.54, "step": 5040, "train_speed(iter/s)": 0.201526 }, { "acc": 0.75949087, "epoch": 0.11782032400589101, "grad_norm": 5.59375, "learning_rate": 9.978102194549498e-06, "loss": 0.89434929, "memory(GiB)": 117.54, "step": 5050, "train_speed(iter/s)": 0.201738 }, { "acc": 0.75701513, "epoch": 0.11805363157817991, "grad_norm": 6.4375, "learning_rate": 9.977925230625455e-06, "loss": 0.86358986, "memory(GiB)": 117.54, "step": 5060, "train_speed(iter/s)": 0.201951 }, { "acc": 0.76386681, "epoch": 0.1182869391504688, "grad_norm": 7.3125, "learning_rate": 9.977747556104036e-06, "loss": 0.87934084, "memory(GiB)": 117.54, "step": 5070, "train_speed(iter/s)": 0.202151 }, { "acc": 0.75649219, "epoch": 0.1185202467227577, "grad_norm": 5.15625, "learning_rate": 9.9775691710106e-06, "loss": 0.88812695, "memory(GiB)": 117.54, "step": 5080, "train_speed(iter/s)": 0.202352 }, { "acc": 0.75279508, "epoch": 0.11875355429504658, "grad_norm": 4.875, "learning_rate": 9.977390075370607e-06, "loss": 0.90212612, "memory(GiB)": 117.54, "step": 5090, "train_speed(iter/s)": 0.202571 }, { "acc": 0.75354643, "epoch": 0.11898686186733548, "grad_norm": 6.875, "learning_rate": 9.97721026920963e-06, "loss": 0.89249134, "memory(GiB)": 117.54, "step": 5100, "train_speed(iter/s)": 0.202776 }, { "acc": 0.74565663, "epoch": 0.11922016943962438, "grad_norm": 5.53125, "learning_rate": 9.977029752553331e-06, "loss": 0.93843594, "memory(GiB)": 117.54, "step": 5110, "train_speed(iter/s)": 0.202986 }, { "acc": 0.75341768, "epoch": 0.11945347701191326, "grad_norm": 4.5, "learning_rate": 9.97684852542748e-06, "loss": 0.88506985, "memory(GiB)": 117.54, "step": 5120, "train_speed(iter/s)": 0.203191 }, { "acc": 0.74062805, "epoch": 0.11968678458420216, "grad_norm": 5.65625, "learning_rate": 9.976666587857951e-06, "loss": 0.94102879, "memory(GiB)": 117.54, "step": 5130, "train_speed(iter/s)": 0.203403 }, { "acc": 0.76677217, "epoch": 0.11992009215649105, "grad_norm": 4.96875, "learning_rate": 9.97648393987071e-06, "loss": 0.85841618, "memory(GiB)": 117.54, "step": 5140, "train_speed(iter/s)": 0.203589 }, { "acc": 0.7633976, "epoch": 0.12015339972877995, "grad_norm": 8.0625, "learning_rate": 9.976300581491833e-06, "loss": 0.83148098, "memory(GiB)": 117.54, "step": 5150, "train_speed(iter/s)": 0.203784 }, { "acc": 0.7448679, "epoch": 0.12038670730106885, "grad_norm": 5.65625, "learning_rate": 9.976116512747493e-06, "loss": 0.94005127, "memory(GiB)": 117.54, "step": 5160, "train_speed(iter/s)": 0.203989 }, { "acc": 0.76915073, "epoch": 0.12062001487335773, "grad_norm": 6.8125, "learning_rate": 9.975931733663966e-06, "loss": 0.83365669, "memory(GiB)": 117.54, "step": 5170, "train_speed(iter/s)": 0.204202 }, { "acc": 0.7760251, "epoch": 0.12085332244564663, "grad_norm": 4.40625, "learning_rate": 9.97574624426763e-06, "loss": 0.810886, "memory(GiB)": 117.54, "step": 5180, "train_speed(iter/s)": 0.204402 }, { "acc": 0.74410958, "epoch": 0.12108663001793551, "grad_norm": 5.9375, "learning_rate": 9.975560044584964e-06, "loss": 0.92947273, "memory(GiB)": 117.54, "step": 5190, "train_speed(iter/s)": 0.204599 }, { "acc": 0.74937978, "epoch": 0.12131993759022441, "grad_norm": 8.0, "learning_rate": 9.975373134642545e-06, "loss": 0.92720146, "memory(GiB)": 117.54, "step": 5200, "train_speed(iter/s)": 0.2048 }, { "acc": 0.77070284, "epoch": 0.12155324516251331, "grad_norm": 5.90625, "learning_rate": 9.975185514467058e-06, "loss": 0.82222948, "memory(GiB)": 117.54, "step": 5210, "train_speed(iter/s)": 0.204998 }, { "acc": 0.7634182, "epoch": 0.1217865527348022, "grad_norm": 6.3125, "learning_rate": 9.974997184085285e-06, "loss": 0.86025238, "memory(GiB)": 117.54, "step": 5220, "train_speed(iter/s)": 0.205205 }, { "acc": 0.75734425, "epoch": 0.1220198603070911, "grad_norm": 5.59375, "learning_rate": 9.974808143524107e-06, "loss": 0.87401457, "memory(GiB)": 117.54, "step": 5230, "train_speed(iter/s)": 0.20541 }, { "acc": 0.75092173, "epoch": 0.12225316787937998, "grad_norm": 4.1875, "learning_rate": 9.974618392810513e-06, "loss": 0.93851089, "memory(GiB)": 117.54, "step": 5240, "train_speed(iter/s)": 0.205619 }, { "acc": 0.74711943, "epoch": 0.12248647545166888, "grad_norm": 6.625, "learning_rate": 9.974427931971588e-06, "loss": 0.96647253, "memory(GiB)": 117.54, "step": 5250, "train_speed(iter/s)": 0.205826 }, { "acc": 0.74908724, "epoch": 0.12271978302395777, "grad_norm": 6.34375, "learning_rate": 9.97423676103452e-06, "loss": 0.93568954, "memory(GiB)": 117.54, "step": 5260, "train_speed(iter/s)": 0.206042 }, { "acc": 0.74937153, "epoch": 0.12295309059624666, "grad_norm": 8.125, "learning_rate": 9.974044880026602e-06, "loss": 0.94437447, "memory(GiB)": 117.54, "step": 5270, "train_speed(iter/s)": 0.206236 }, { "acc": 0.76040206, "epoch": 0.12318639816853556, "grad_norm": 7.59375, "learning_rate": 9.97385228897522e-06, "loss": 0.88464508, "memory(GiB)": 117.54, "step": 5280, "train_speed(iter/s)": 0.206441 }, { "acc": 0.77492819, "epoch": 0.12341970574082445, "grad_norm": 12.25, "learning_rate": 9.97365898790787e-06, "loss": 0.80230036, "memory(GiB)": 117.54, "step": 5290, "train_speed(iter/s)": 0.206638 }, { "acc": 0.75563636, "epoch": 0.12365301331311335, "grad_norm": 4.5625, "learning_rate": 9.973464976852144e-06, "loss": 0.8981823, "memory(GiB)": 117.54, "step": 5300, "train_speed(iter/s)": 0.206834 }, { "acc": 0.75627747, "epoch": 0.12388632088540223, "grad_norm": 6.90625, "learning_rate": 9.973270255835737e-06, "loss": 0.8913538, "memory(GiB)": 117.54, "step": 5310, "train_speed(iter/s)": 0.207026 }, { "acc": 0.75131683, "epoch": 0.12411962845769113, "grad_norm": 5.96875, "learning_rate": 9.973074824886446e-06, "loss": 0.93807335, "memory(GiB)": 117.54, "step": 5320, "train_speed(iter/s)": 0.207209 }, { "acc": 0.74059019, "epoch": 0.12435293602998003, "grad_norm": 5.53125, "learning_rate": 9.972878684032169e-06, "loss": 0.96218138, "memory(GiB)": 117.54, "step": 5330, "train_speed(iter/s)": 0.207402 }, { "acc": 0.76596565, "epoch": 0.12458624360226891, "grad_norm": 6.875, "learning_rate": 9.972681833300903e-06, "loss": 0.86264296, "memory(GiB)": 117.54, "step": 5340, "train_speed(iter/s)": 0.207611 }, { "acc": 0.74638081, "epoch": 0.12481955117455781, "grad_norm": 4.78125, "learning_rate": 9.972484272720751e-06, "loss": 0.91274433, "memory(GiB)": 117.54, "step": 5350, "train_speed(iter/s)": 0.207813 }, { "acc": 0.75317116, "epoch": 0.1250528587468467, "grad_norm": 4.78125, "learning_rate": 9.972286002319913e-06, "loss": 0.91048222, "memory(GiB)": 117.54, "step": 5360, "train_speed(iter/s)": 0.208016 }, { "acc": 0.77595243, "epoch": 0.12528616631913558, "grad_norm": 6.90625, "learning_rate": 9.972087022126693e-06, "loss": 0.81915474, "memory(GiB)": 117.54, "step": 5370, "train_speed(iter/s)": 0.208199 }, { "acc": 0.76468878, "epoch": 0.12551947389142448, "grad_norm": 6.75, "learning_rate": 9.971887332169494e-06, "loss": 0.84793015, "memory(GiB)": 117.54, "step": 5380, "train_speed(iter/s)": 0.208388 }, { "acc": 0.75001726, "epoch": 0.12575278146371338, "grad_norm": 7.375, "learning_rate": 9.971686932476825e-06, "loss": 0.90510178, "memory(GiB)": 117.54, "step": 5390, "train_speed(iter/s)": 0.208579 }, { "acc": 0.7452601, "epoch": 0.12598608903600228, "grad_norm": 4.71875, "learning_rate": 9.971485823077288e-06, "loss": 0.92312012, "memory(GiB)": 117.54, "step": 5400, "train_speed(iter/s)": 0.20877 }, { "acc": 0.75537529, "epoch": 0.12621939660829118, "grad_norm": 5.46875, "learning_rate": 9.971284003999595e-06, "loss": 0.89949427, "memory(GiB)": 117.54, "step": 5410, "train_speed(iter/s)": 0.208957 }, { "acc": 0.74212222, "epoch": 0.12645270418058005, "grad_norm": 6.15625, "learning_rate": 9.971081475272555e-06, "loss": 0.93930664, "memory(GiB)": 117.54, "step": 5420, "train_speed(iter/s)": 0.209154 }, { "acc": 0.76207142, "epoch": 0.12668601175286895, "grad_norm": 5.4375, "learning_rate": 9.97087823692508e-06, "loss": 0.86257935, "memory(GiB)": 117.54, "step": 5430, "train_speed(iter/s)": 0.20934 }, { "acc": 0.76142607, "epoch": 0.12691931932515785, "grad_norm": 5.78125, "learning_rate": 9.970674288986178e-06, "loss": 0.86087112, "memory(GiB)": 117.54, "step": 5440, "train_speed(iter/s)": 0.209527 }, { "acc": 0.77505727, "epoch": 0.12715262689744675, "grad_norm": 7.0625, "learning_rate": 9.970469631484967e-06, "loss": 0.81593513, "memory(GiB)": 117.54, "step": 5450, "train_speed(iter/s)": 0.209699 }, { "acc": 0.75893412, "epoch": 0.12738593446973565, "grad_norm": 6.03125, "learning_rate": 9.970264264450659e-06, "loss": 0.85409393, "memory(GiB)": 117.54, "step": 5460, "train_speed(iter/s)": 0.209887 }, { "acc": 0.76101146, "epoch": 0.12761924204202452, "grad_norm": 4.5625, "learning_rate": 9.970058187912572e-06, "loss": 0.89274902, "memory(GiB)": 117.54, "step": 5470, "train_speed(iter/s)": 0.210074 }, { "acc": 0.76868429, "epoch": 0.12785254961431342, "grad_norm": 5.6875, "learning_rate": 9.969851401900122e-06, "loss": 0.83084993, "memory(GiB)": 117.54, "step": 5480, "train_speed(iter/s)": 0.210247 }, { "acc": 0.75309086, "epoch": 0.12808585718660231, "grad_norm": 5.4375, "learning_rate": 9.969643906442828e-06, "loss": 0.90782976, "memory(GiB)": 117.54, "step": 5490, "train_speed(iter/s)": 0.210435 }, { "acc": 0.76415911, "epoch": 0.1283191647588912, "grad_norm": 5.46875, "learning_rate": 9.96943570157031e-06, "loss": 0.85922518, "memory(GiB)": 117.54, "step": 5500, "train_speed(iter/s)": 0.210616 }, { "epoch": 0.1283191647588912, "eval_acc": 0.7270218249364108, "eval_loss": 0.8716417551040649, "eval_runtime": 1270.1331, "eval_samples_per_second": 28.336, "eval_steps_per_second": 14.169, "step": 5500 }, { "acc": 0.73600473, "epoch": 0.1285524723311801, "grad_norm": 7.03125, "learning_rate": 9.969226787312288e-06, "loss": 0.96929722, "memory(GiB)": 117.54, "step": 5510, "train_speed(iter/s)": 0.200915 }, { "acc": 0.76587524, "epoch": 0.12878577990346898, "grad_norm": 8.375, "learning_rate": 9.969017163698587e-06, "loss": 0.848559, "memory(GiB)": 117.54, "step": 5520, "train_speed(iter/s)": 0.201104 }, { "acc": 0.75857878, "epoch": 0.12901908747575788, "grad_norm": 5.0625, "learning_rate": 9.96880683075913e-06, "loss": 0.88542099, "memory(GiB)": 117.54, "step": 5530, "train_speed(iter/s)": 0.201282 }, { "acc": 0.77082253, "epoch": 0.12925239504804678, "grad_norm": 6.09375, "learning_rate": 9.96859578852394e-06, "loss": 0.81673002, "memory(GiB)": 117.54, "step": 5540, "train_speed(iter/s)": 0.201462 }, { "acc": 0.76157207, "epoch": 0.12948570262033568, "grad_norm": 5.625, "learning_rate": 9.968384037023147e-06, "loss": 0.87119312, "memory(GiB)": 117.54, "step": 5550, "train_speed(iter/s)": 0.201654 }, { "acc": 0.75955338, "epoch": 0.12971901019262455, "grad_norm": 5.21875, "learning_rate": 9.968171576286973e-06, "loss": 0.86215534, "memory(GiB)": 117.54, "step": 5560, "train_speed(iter/s)": 0.20183 }, { "acc": 0.77258162, "epoch": 0.12995231776491345, "grad_norm": 4.5, "learning_rate": 9.96795840634575e-06, "loss": 0.81898003, "memory(GiB)": 117.54, "step": 5570, "train_speed(iter/s)": 0.202015 }, { "acc": 0.75478101, "epoch": 0.13018562533720235, "grad_norm": 4.40625, "learning_rate": 9.96774452722991e-06, "loss": 0.91053543, "memory(GiB)": 117.54, "step": 5580, "train_speed(iter/s)": 0.202209 }, { "acc": 0.77054396, "epoch": 0.13041893290949125, "grad_norm": 6.78125, "learning_rate": 9.967529938969981e-06, "loss": 0.81340065, "memory(GiB)": 117.54, "step": 5590, "train_speed(iter/s)": 0.202403 }, { "acc": 0.75519128, "epoch": 0.13065224048178015, "grad_norm": 4.28125, "learning_rate": 9.967314641596595e-06, "loss": 0.87434855, "memory(GiB)": 117.54, "step": 5600, "train_speed(iter/s)": 0.202598 }, { "acc": 0.74656882, "epoch": 0.13088554805406902, "grad_norm": 8.875, "learning_rate": 9.967098635140489e-06, "loss": 0.92529068, "memory(GiB)": 117.54, "step": 5610, "train_speed(iter/s)": 0.202788 }, { "acc": 0.76542044, "epoch": 0.13111885562635792, "grad_norm": 7.9375, "learning_rate": 9.966881919632494e-06, "loss": 0.86160889, "memory(GiB)": 117.54, "step": 5620, "train_speed(iter/s)": 0.202986 }, { "acc": 0.76484203, "epoch": 0.13135216319864682, "grad_norm": 5.0625, "learning_rate": 9.966664495103548e-06, "loss": 0.85322952, "memory(GiB)": 117.54, "step": 5630, "train_speed(iter/s)": 0.203181 }, { "acc": 0.75730438, "epoch": 0.13158547077093571, "grad_norm": 5.25, "learning_rate": 9.96644636158469e-06, "loss": 0.85542212, "memory(GiB)": 117.54, "step": 5640, "train_speed(iter/s)": 0.203364 }, { "acc": 0.75253458, "epoch": 0.1318187783432246, "grad_norm": 19.875, "learning_rate": 9.966227519107054e-06, "loss": 0.96850071, "memory(GiB)": 117.54, "step": 5650, "train_speed(iter/s)": 0.203551 }, { "acc": 0.76095395, "epoch": 0.13205208591551348, "grad_norm": 4.25, "learning_rate": 9.966007967701884e-06, "loss": 0.88187704, "memory(GiB)": 117.54, "step": 5660, "train_speed(iter/s)": 0.203728 }, { "acc": 0.76344061, "epoch": 0.13228539348780238, "grad_norm": 7.3125, "learning_rate": 9.965787707400521e-06, "loss": 0.87797689, "memory(GiB)": 117.54, "step": 5670, "train_speed(iter/s)": 0.203918 }, { "acc": 0.7655921, "epoch": 0.13251870106009128, "grad_norm": 4.15625, "learning_rate": 9.965566738234403e-06, "loss": 0.83435678, "memory(GiB)": 117.54, "step": 5680, "train_speed(iter/s)": 0.204106 }, { "acc": 0.76264372, "epoch": 0.13275200863238018, "grad_norm": 7.1875, "learning_rate": 9.965345060235075e-06, "loss": 0.86193419, "memory(GiB)": 117.54, "step": 5690, "train_speed(iter/s)": 0.204292 }, { "acc": 0.75327988, "epoch": 0.13298531620466908, "grad_norm": 6.875, "learning_rate": 9.965122673434182e-06, "loss": 0.91502628, "memory(GiB)": 117.54, "step": 5700, "train_speed(iter/s)": 0.204459 }, { "acc": 0.75798521, "epoch": 0.13321862377695795, "grad_norm": 7.15625, "learning_rate": 9.964899577863472e-06, "loss": 0.86956244, "memory(GiB)": 117.54, "step": 5710, "train_speed(iter/s)": 0.204649 }, { "acc": 0.75251207, "epoch": 0.13345193134924685, "grad_norm": 6.03125, "learning_rate": 9.964675773554789e-06, "loss": 0.907057, "memory(GiB)": 117.54, "step": 5720, "train_speed(iter/s)": 0.204833 }, { "acc": 0.75791512, "epoch": 0.13368523892153575, "grad_norm": 6.03125, "learning_rate": 9.96445126054008e-06, "loss": 0.89783096, "memory(GiB)": 117.54, "step": 5730, "train_speed(iter/s)": 0.205017 }, { "acc": 0.76094255, "epoch": 0.13391854649382465, "grad_norm": 6.4375, "learning_rate": 9.964226038851397e-06, "loss": 0.88064919, "memory(GiB)": 117.54, "step": 5740, "train_speed(iter/s)": 0.205195 }, { "acc": 0.76472521, "epoch": 0.13415185406611355, "grad_norm": 5.375, "learning_rate": 9.964000108520889e-06, "loss": 0.84428978, "memory(GiB)": 117.54, "step": 5750, "train_speed(iter/s)": 0.205379 }, { "acc": 0.76074214, "epoch": 0.13438516163840242, "grad_norm": 5.40625, "learning_rate": 9.963773469580806e-06, "loss": 0.85086288, "memory(GiB)": 117.54, "step": 5760, "train_speed(iter/s)": 0.205562 }, { "acc": 0.76351795, "epoch": 0.13461846921069132, "grad_norm": 4.40625, "learning_rate": 9.963546122063504e-06, "loss": 0.86668959, "memory(GiB)": 117.54, "step": 5770, "train_speed(iter/s)": 0.205749 }, { "acc": 0.76371803, "epoch": 0.13485177678298021, "grad_norm": 8.5625, "learning_rate": 9.963318066001433e-06, "loss": 0.86080093, "memory(GiB)": 117.54, "step": 5780, "train_speed(iter/s)": 0.205929 }, { "acc": 0.7618516, "epoch": 0.1350850843552691, "grad_norm": 7.1875, "learning_rate": 9.963089301427152e-06, "loss": 0.84876232, "memory(GiB)": 117.54, "step": 5790, "train_speed(iter/s)": 0.206111 }, { "acc": 0.77087803, "epoch": 0.13531839192755798, "grad_norm": 5.0, "learning_rate": 9.962859828373315e-06, "loss": 0.83783426, "memory(GiB)": 117.54, "step": 5800, "train_speed(iter/s)": 0.206293 }, { "acc": 0.77675738, "epoch": 0.13555169949984688, "grad_norm": 7.90625, "learning_rate": 9.96262964687268e-06, "loss": 0.83521681, "memory(GiB)": 117.54, "step": 5810, "train_speed(iter/s)": 0.206466 }, { "acc": 0.76050811, "epoch": 0.13578500707213578, "grad_norm": 6.1875, "learning_rate": 9.9623987569581e-06, "loss": 0.88420362, "memory(GiB)": 117.54, "step": 5820, "train_speed(iter/s)": 0.206646 }, { "acc": 0.73890877, "epoch": 0.13601831464442468, "grad_norm": 5.90625, "learning_rate": 9.962167158662543e-06, "loss": 0.95491695, "memory(GiB)": 117.54, "step": 5830, "train_speed(iter/s)": 0.206826 }, { "acc": 0.74483318, "epoch": 0.13625162221671358, "grad_norm": 4.15625, "learning_rate": 9.961934852019066e-06, "loss": 0.92905416, "memory(GiB)": 117.54, "step": 5840, "train_speed(iter/s)": 0.207007 }, { "acc": 0.74667826, "epoch": 0.13648492978900245, "grad_norm": 4.6875, "learning_rate": 9.96170183706083e-06, "loss": 0.92713289, "memory(GiB)": 117.54, "step": 5850, "train_speed(iter/s)": 0.207186 }, { "acc": 0.74078379, "epoch": 0.13671823736129135, "grad_norm": 7.71875, "learning_rate": 9.961468113821096e-06, "loss": 0.94992075, "memory(GiB)": 117.54, "step": 5860, "train_speed(iter/s)": 0.207359 }, { "acc": 0.77699747, "epoch": 0.13695154493358025, "grad_norm": 5.84375, "learning_rate": 9.96123368233323e-06, "loss": 0.78689365, "memory(GiB)": 117.54, "step": 5870, "train_speed(iter/s)": 0.207542 }, { "acc": 0.74916096, "epoch": 0.13718485250586915, "grad_norm": 7.625, "learning_rate": 9.9609985426307e-06, "loss": 0.93192005, "memory(GiB)": 117.54, "step": 5880, "train_speed(iter/s)": 0.207728 }, { "acc": 0.7669405, "epoch": 0.13741816007815805, "grad_norm": 6.25, "learning_rate": 9.960762694747068e-06, "loss": 0.87499352, "memory(GiB)": 117.54, "step": 5890, "train_speed(iter/s)": 0.207914 }, { "acc": 0.77017179, "epoch": 0.13765146765044692, "grad_norm": 6.4375, "learning_rate": 9.960526138716e-06, "loss": 0.83586044, "memory(GiB)": 117.54, "step": 5900, "train_speed(iter/s)": 0.208098 }, { "acc": 0.75412836, "epoch": 0.13788477522273582, "grad_norm": 7.78125, "learning_rate": 9.960288874571271e-06, "loss": 0.90887356, "memory(GiB)": 117.54, "step": 5910, "train_speed(iter/s)": 0.208286 }, { "acc": 0.76210413, "epoch": 0.13811808279502472, "grad_norm": 6.0, "learning_rate": 9.960050902346743e-06, "loss": 0.85120649, "memory(GiB)": 117.54, "step": 5920, "train_speed(iter/s)": 0.208456 }, { "acc": 0.75921898, "epoch": 0.13835139036731361, "grad_norm": 8.875, "learning_rate": 9.959812222076391e-06, "loss": 0.87873182, "memory(GiB)": 117.54, "step": 5930, "train_speed(iter/s)": 0.208634 }, { "acc": 0.75706267, "epoch": 0.1385846979396025, "grad_norm": 7.96875, "learning_rate": 9.959572833794283e-06, "loss": 0.86111183, "memory(GiB)": 117.54, "step": 5940, "train_speed(iter/s)": 0.208814 }, { "acc": 0.74689651, "epoch": 0.13881800551189138, "grad_norm": 6.34375, "learning_rate": 9.959332737534597e-06, "loss": 0.92088099, "memory(GiB)": 117.54, "step": 5950, "train_speed(iter/s)": 0.208985 }, { "acc": 0.7608963, "epoch": 0.13905131308418028, "grad_norm": 17.625, "learning_rate": 9.959091933331601e-06, "loss": 0.85624313, "memory(GiB)": 117.54, "step": 5960, "train_speed(iter/s)": 0.209159 }, { "acc": 0.77537665, "epoch": 0.13928462065646918, "grad_norm": 4.96875, "learning_rate": 9.958850421219675e-06, "loss": 0.79838071, "memory(GiB)": 117.54, "step": 5970, "train_speed(iter/s)": 0.209339 }, { "acc": 0.75666475, "epoch": 0.13951792822875808, "grad_norm": 7.78125, "learning_rate": 9.958608201233288e-06, "loss": 0.89333801, "memory(GiB)": 117.54, "step": 5980, "train_speed(iter/s)": 0.209526 }, { "acc": 0.75140362, "epoch": 0.13975123580104698, "grad_norm": 4.75, "learning_rate": 9.958365273407023e-06, "loss": 0.9052248, "memory(GiB)": 117.54, "step": 5990, "train_speed(iter/s)": 0.209691 }, { "acc": 0.73782969, "epoch": 0.13998454337333585, "grad_norm": 13.3125, "learning_rate": 9.958121637775554e-06, "loss": 0.96962986, "memory(GiB)": 117.54, "step": 6000, "train_speed(iter/s)": 0.209875 }, { "epoch": 0.13998454337333585, "eval_acc": 0.727715409606508, "eval_loss": 0.8690560460090637, "eval_runtime": 1270.3769, "eval_samples_per_second": 28.331, "eval_steps_per_second": 14.166, "step": 6000 }, { "acc": 0.75419903, "epoch": 0.14021785094562475, "grad_norm": 6.4375, "learning_rate": 9.957877294373665e-06, "loss": 0.91555004, "memory(GiB)": 117.54, "step": 6010, "train_speed(iter/s)": 0.200992 }, { "acc": 0.74702139, "epoch": 0.14045115851791365, "grad_norm": 7.0, "learning_rate": 9.957632243236231e-06, "loss": 0.92673454, "memory(GiB)": 117.54, "step": 6020, "train_speed(iter/s)": 0.201172 }, { "acc": 0.75800505, "epoch": 0.14068446609020255, "grad_norm": 6.53125, "learning_rate": 9.957386484398233e-06, "loss": 0.87921581, "memory(GiB)": 117.54, "step": 6030, "train_speed(iter/s)": 0.201343 }, { "acc": 0.74582348, "epoch": 0.14091777366249145, "grad_norm": 5.21875, "learning_rate": 9.957140017894754e-06, "loss": 0.91472492, "memory(GiB)": 117.54, "step": 6040, "train_speed(iter/s)": 0.201522 }, { "acc": 0.78675618, "epoch": 0.14115108123478032, "grad_norm": 6.03125, "learning_rate": 9.956892843760979e-06, "loss": 0.75914721, "memory(GiB)": 117.54, "step": 6050, "train_speed(iter/s)": 0.201696 }, { "acc": 0.74411964, "epoch": 0.14138438880706922, "grad_norm": 12.0625, "learning_rate": 9.956644962032192e-06, "loss": 0.93103161, "memory(GiB)": 117.54, "step": 6060, "train_speed(iter/s)": 0.201878 }, { "acc": 0.7726965, "epoch": 0.14161769637935812, "grad_norm": 7.375, "learning_rate": 9.956396372743775e-06, "loss": 0.82945976, "memory(GiB)": 117.54, "step": 6070, "train_speed(iter/s)": 0.202042 }, { "acc": 0.78586645, "epoch": 0.14185100395164701, "grad_norm": 6.875, "learning_rate": 9.956147075931215e-06, "loss": 0.76539755, "memory(GiB)": 117.54, "step": 6080, "train_speed(iter/s)": 0.202205 }, { "acc": 0.75904989, "epoch": 0.14208431152393589, "grad_norm": 8.75, "learning_rate": 9.955897071630101e-06, "loss": 0.87475338, "memory(GiB)": 117.54, "step": 6090, "train_speed(iter/s)": 0.202374 }, { "acc": 0.76456523, "epoch": 0.14231761909622478, "grad_norm": 6.03125, "learning_rate": 9.955646359876118e-06, "loss": 0.86053829, "memory(GiB)": 117.54, "step": 6100, "train_speed(iter/s)": 0.202548 }, { "acc": 0.76305366, "epoch": 0.14255092666851368, "grad_norm": 6.84375, "learning_rate": 9.955394940705057e-06, "loss": 0.85630341, "memory(GiB)": 117.54, "step": 6110, "train_speed(iter/s)": 0.202712 }, { "acc": 0.7719203, "epoch": 0.14278423424080258, "grad_norm": 8.1875, "learning_rate": 9.95514281415281e-06, "loss": 0.82893257, "memory(GiB)": 117.54, "step": 6120, "train_speed(iter/s)": 0.20288 }, { "acc": 0.76380386, "epoch": 0.14301754181309148, "grad_norm": 7.1875, "learning_rate": 9.954889980255363e-06, "loss": 0.84828529, "memory(GiB)": 117.54, "step": 6130, "train_speed(iter/s)": 0.203048 }, { "acc": 0.77324157, "epoch": 0.14325084938538035, "grad_norm": 4.34375, "learning_rate": 9.954636439048813e-06, "loss": 0.82084036, "memory(GiB)": 117.54, "step": 6140, "train_speed(iter/s)": 0.203199 }, { "acc": 0.76487913, "epoch": 0.14348415695766925, "grad_norm": 6.03125, "learning_rate": 9.95438219056935e-06, "loss": 0.85029163, "memory(GiB)": 117.54, "step": 6150, "train_speed(iter/s)": 0.203348 }, { "acc": 0.75904469, "epoch": 0.14371746452995815, "grad_norm": 7.875, "learning_rate": 9.954127234853267e-06, "loss": 0.89040432, "memory(GiB)": 117.54, "step": 6160, "train_speed(iter/s)": 0.203524 }, { "acc": 0.76309099, "epoch": 0.14395077210224705, "grad_norm": 4.75, "learning_rate": 9.953871571936962e-06, "loss": 0.85744724, "memory(GiB)": 117.54, "step": 6170, "train_speed(iter/s)": 0.203691 }, { "acc": 0.77011318, "epoch": 0.14418407967453595, "grad_norm": 6.625, "learning_rate": 9.953615201856928e-06, "loss": 0.8331212, "memory(GiB)": 117.54, "step": 6180, "train_speed(iter/s)": 0.203861 }, { "acc": 0.77067161, "epoch": 0.14441738724682482, "grad_norm": 5.53125, "learning_rate": 9.953358124649764e-06, "loss": 0.82904701, "memory(GiB)": 117.54, "step": 6190, "train_speed(iter/s)": 0.20402 }, { "acc": 0.75703244, "epoch": 0.14465069481911372, "grad_norm": 6.59375, "learning_rate": 9.953100340352166e-06, "loss": 0.87819614, "memory(GiB)": 117.54, "step": 6200, "train_speed(iter/s)": 0.20419 }, { "acc": 0.76388779, "epoch": 0.14488400239140262, "grad_norm": 6.78125, "learning_rate": 9.952841849000935e-06, "loss": 0.84278336, "memory(GiB)": 117.54, "step": 6210, "train_speed(iter/s)": 0.204357 }, { "acc": 0.78060641, "epoch": 0.14511730996369152, "grad_norm": 6.90625, "learning_rate": 9.952582650632967e-06, "loss": 0.80852413, "memory(GiB)": 117.54, "step": 6220, "train_speed(iter/s)": 0.204527 }, { "acc": 0.76564684, "epoch": 0.14535061753598041, "grad_norm": 5.875, "learning_rate": 9.952322745285266e-06, "loss": 0.83691425, "memory(GiB)": 117.54, "step": 6230, "train_speed(iter/s)": 0.20469 }, { "acc": 0.77099514, "epoch": 0.14558392510826929, "grad_norm": 9.75, "learning_rate": 9.95206213299493e-06, "loss": 0.83298168, "memory(GiB)": 117.54, "step": 6240, "train_speed(iter/s)": 0.204862 }, { "acc": 0.76582918, "epoch": 0.14581723268055818, "grad_norm": 4.75, "learning_rate": 9.951800813799164e-06, "loss": 0.85061855, "memory(GiB)": 117.54, "step": 6250, "train_speed(iter/s)": 0.205033 }, { "acc": 0.77523489, "epoch": 0.14605054025284708, "grad_norm": 5.90625, "learning_rate": 9.95153878773527e-06, "loss": 0.8064352, "memory(GiB)": 117.54, "step": 6260, "train_speed(iter/s)": 0.205197 }, { "acc": 0.77384329, "epoch": 0.14628384782513598, "grad_norm": 7.375, "learning_rate": 9.951276054840654e-06, "loss": 0.80671873, "memory(GiB)": 117.54, "step": 6270, "train_speed(iter/s)": 0.205362 }, { "acc": 0.75420942, "epoch": 0.14651715539742488, "grad_norm": 6.5625, "learning_rate": 9.951012615152816e-06, "loss": 0.92444849, "memory(GiB)": 117.54, "step": 6280, "train_speed(iter/s)": 0.205538 }, { "acc": 0.78133068, "epoch": 0.14675046296971375, "grad_norm": 9.5, "learning_rate": 9.950748468709368e-06, "loss": 0.78833847, "memory(GiB)": 117.54, "step": 6290, "train_speed(iter/s)": 0.205701 }, { "acc": 0.74810176, "epoch": 0.14698377054200265, "grad_norm": 5.78125, "learning_rate": 9.950483615548014e-06, "loss": 0.92472363, "memory(GiB)": 117.54, "step": 6300, "train_speed(iter/s)": 0.205871 }, { "acc": 0.7479497, "epoch": 0.14721707811429155, "grad_norm": 5.9375, "learning_rate": 9.950218055706563e-06, "loss": 0.92439194, "memory(GiB)": 117.54, "step": 6310, "train_speed(iter/s)": 0.206038 }, { "acc": 0.77187309, "epoch": 0.14745038568658045, "grad_norm": 5.40625, "learning_rate": 9.94995178922292e-06, "loss": 0.83006401, "memory(GiB)": 117.54, "step": 6320, "train_speed(iter/s)": 0.206198 }, { "acc": 0.75937672, "epoch": 0.14768369325886932, "grad_norm": 7.03125, "learning_rate": 9.949684816135098e-06, "loss": 0.88418226, "memory(GiB)": 117.54, "step": 6330, "train_speed(iter/s)": 0.206369 }, { "acc": 0.7517971, "epoch": 0.14791700083115822, "grad_norm": 4.96875, "learning_rate": 9.949417136481207e-06, "loss": 0.92034903, "memory(GiB)": 117.54, "step": 6340, "train_speed(iter/s)": 0.206546 }, { "acc": 0.75734596, "epoch": 0.14815030840344712, "grad_norm": 5.1875, "learning_rate": 9.94914875029946e-06, "loss": 0.90017557, "memory(GiB)": 117.54, "step": 6350, "train_speed(iter/s)": 0.206707 }, { "acc": 0.75256109, "epoch": 0.14838361597573602, "grad_norm": 7.0625, "learning_rate": 9.948879657628164e-06, "loss": 0.9080761, "memory(GiB)": 117.54, "step": 6360, "train_speed(iter/s)": 0.206879 }, { "acc": 0.76637239, "epoch": 0.14861692354802492, "grad_norm": 13.8125, "learning_rate": 9.948609858505734e-06, "loss": 0.84904175, "memory(GiB)": 117.54, "step": 6370, "train_speed(iter/s)": 0.207045 }, { "acc": 0.73998928, "epoch": 0.1488502311203138, "grad_norm": 5.25, "learning_rate": 9.948339352970683e-06, "loss": 0.97184429, "memory(GiB)": 117.54, "step": 6380, "train_speed(iter/s)": 0.207209 }, { "acc": 0.75804882, "epoch": 0.14908353869260269, "grad_norm": 8.0, "learning_rate": 9.948068141061631e-06, "loss": 0.85682335, "memory(GiB)": 117.54, "step": 6390, "train_speed(iter/s)": 0.207362 }, { "acc": 0.72696562, "epoch": 0.14931684626489158, "grad_norm": 6.6875, "learning_rate": 9.947796222817286e-06, "loss": 1.01356945, "memory(GiB)": 117.54, "step": 6400, "train_speed(iter/s)": 0.207524 }, { "acc": 0.76806068, "epoch": 0.14955015383718048, "grad_norm": 5.125, "learning_rate": 9.94752359827647e-06, "loss": 0.85701227, "memory(GiB)": 117.54, "step": 6410, "train_speed(iter/s)": 0.207681 }, { "acc": 0.76032314, "epoch": 0.14978346140946938, "grad_norm": 7.25, "learning_rate": 9.947250267478094e-06, "loss": 0.87021475, "memory(GiB)": 117.54, "step": 6420, "train_speed(iter/s)": 0.207838 }, { "acc": 0.75938177, "epoch": 0.15001676898175825, "grad_norm": 5.03125, "learning_rate": 9.946976230461183e-06, "loss": 0.88507252, "memory(GiB)": 117.54, "step": 6430, "train_speed(iter/s)": 0.208015 }, { "acc": 0.74555626, "epoch": 0.15025007655404715, "grad_norm": 5.40625, "learning_rate": 9.946701487264851e-06, "loss": 0.9236105, "memory(GiB)": 117.54, "step": 6440, "train_speed(iter/s)": 0.208181 }, { "acc": 0.761584, "epoch": 0.15048338412633605, "grad_norm": 4.71875, "learning_rate": 9.946426037928319e-06, "loss": 0.86102915, "memory(GiB)": 117.54, "step": 6450, "train_speed(iter/s)": 0.208345 }, { "acc": 0.7360589, "epoch": 0.15071669169862495, "grad_norm": 8.5, "learning_rate": 9.946149882490907e-06, "loss": 0.94918003, "memory(GiB)": 117.54, "step": 6460, "train_speed(iter/s)": 0.208502 }, { "acc": 0.76649723, "epoch": 0.15094999927091385, "grad_norm": 4.78125, "learning_rate": 9.945873020992036e-06, "loss": 0.86450176, "memory(GiB)": 117.54, "step": 6470, "train_speed(iter/s)": 0.208672 }, { "acc": 0.77010989, "epoch": 0.15118330684320272, "grad_norm": 5.53125, "learning_rate": 9.945595453471228e-06, "loss": 0.82765493, "memory(GiB)": 117.54, "step": 6480, "train_speed(iter/s)": 0.20882 }, { "acc": 0.76764402, "epoch": 0.15141661441549162, "grad_norm": 13.0625, "learning_rate": 9.945317179968105e-06, "loss": 0.84424448, "memory(GiB)": 117.54, "step": 6490, "train_speed(iter/s)": 0.208984 }, { "acc": 0.75681572, "epoch": 0.15164992198778052, "grad_norm": 4.4375, "learning_rate": 9.945038200522392e-06, "loss": 0.9044385, "memory(GiB)": 117.54, "step": 6500, "train_speed(iter/s)": 0.209132 }, { "epoch": 0.15164992198778052, "eval_acc": 0.7281223361007858, "eval_loss": 0.8657384514808655, "eval_runtime": 1269.6135, "eval_samples_per_second": 28.348, "eval_steps_per_second": 14.174, "step": 6500 }, { "acc": 0.77224231, "epoch": 0.15188322956006942, "grad_norm": 6.875, "learning_rate": 9.944758515173912e-06, "loss": 0.82228127, "memory(GiB)": 117.54, "step": 6510, "train_speed(iter/s)": 0.200985 }, { "acc": 0.76387882, "epoch": 0.15211653713235831, "grad_norm": 5.21875, "learning_rate": 9.944478123962592e-06, "loss": 0.85496302, "memory(GiB)": 117.54, "step": 6520, "train_speed(iter/s)": 0.20115 }, { "acc": 0.75840979, "epoch": 0.15234984470464719, "grad_norm": 4.8125, "learning_rate": 9.944197026928454e-06, "loss": 0.88294678, "memory(GiB)": 117.54, "step": 6530, "train_speed(iter/s)": 0.201303 }, { "acc": 0.76922741, "epoch": 0.15258315227693608, "grad_norm": 4.84375, "learning_rate": 9.943915224111627e-06, "loss": 0.87327852, "memory(GiB)": 117.54, "step": 6540, "train_speed(iter/s)": 0.201457 }, { "acc": 0.75248985, "epoch": 0.15281645984922498, "grad_norm": 8.4375, "learning_rate": 9.943632715552338e-06, "loss": 0.88687239, "memory(GiB)": 117.54, "step": 6550, "train_speed(iter/s)": 0.201614 }, { "acc": 0.75970192, "epoch": 0.15304976742151388, "grad_norm": 14.4375, "learning_rate": 9.943349501290916e-06, "loss": 0.85225811, "memory(GiB)": 117.54, "step": 6560, "train_speed(iter/s)": 0.201782 }, { "acc": 0.77734346, "epoch": 0.15328307499380278, "grad_norm": 5.3125, "learning_rate": 9.943065581367788e-06, "loss": 0.81777477, "memory(GiB)": 117.54, "step": 6570, "train_speed(iter/s)": 0.201947 }, { "acc": 0.75825195, "epoch": 0.15351638256609165, "grad_norm": 7.625, "learning_rate": 9.942780955823485e-06, "loss": 0.86885967, "memory(GiB)": 117.54, "step": 6580, "train_speed(iter/s)": 0.202112 }, { "acc": 0.76747169, "epoch": 0.15374969013838055, "grad_norm": 4.4375, "learning_rate": 9.942495624698636e-06, "loss": 0.84972601, "memory(GiB)": 117.54, "step": 6590, "train_speed(iter/s)": 0.202268 }, { "acc": 0.75881453, "epoch": 0.15398299771066945, "grad_norm": 5.46875, "learning_rate": 9.942209588033973e-06, "loss": 0.87035789, "memory(GiB)": 117.54, "step": 6600, "train_speed(iter/s)": 0.202422 }, { "acc": 0.74656525, "epoch": 0.15421630528295835, "grad_norm": 5.03125, "learning_rate": 9.941922845870326e-06, "loss": 0.92789898, "memory(GiB)": 117.54, "step": 6610, "train_speed(iter/s)": 0.202575 }, { "acc": 0.7642231, "epoch": 0.15444961285524722, "grad_norm": 5.4375, "learning_rate": 9.941635398248628e-06, "loss": 0.83677711, "memory(GiB)": 117.54, "step": 6620, "train_speed(iter/s)": 0.202738 }, { "acc": 0.77601938, "epoch": 0.15468292042753612, "grad_norm": 6.65625, "learning_rate": 9.941347245209914e-06, "loss": 0.78568192, "memory(GiB)": 117.54, "step": 6630, "train_speed(iter/s)": 0.202885 }, { "acc": 0.75074358, "epoch": 0.15491622799982502, "grad_norm": 6.375, "learning_rate": 9.941058386795314e-06, "loss": 0.90226822, "memory(GiB)": 117.54, "step": 6640, "train_speed(iter/s)": 0.203031 }, { "acc": 0.77472057, "epoch": 0.15514953557211392, "grad_norm": 4.53125, "learning_rate": 9.940768823046067e-06, "loss": 0.83820248, "memory(GiB)": 117.54, "step": 6650, "train_speed(iter/s)": 0.203189 }, { "acc": 0.77502432, "epoch": 0.15538284314440282, "grad_norm": 5.78125, "learning_rate": 9.940478554003506e-06, "loss": 0.79055486, "memory(GiB)": 117.54, "step": 6660, "train_speed(iter/s)": 0.203346 }, { "acc": 0.76303005, "epoch": 0.1556161507166917, "grad_norm": 6.53125, "learning_rate": 9.940187579709064e-06, "loss": 0.86618824, "memory(GiB)": 117.54, "step": 6670, "train_speed(iter/s)": 0.203512 }, { "acc": 0.74345989, "epoch": 0.15584945828898059, "grad_norm": 4.96875, "learning_rate": 9.939895900204281e-06, "loss": 0.91676807, "memory(GiB)": 117.54, "step": 6680, "train_speed(iter/s)": 0.203669 }, { "acc": 0.76821184, "epoch": 0.15608276586126948, "grad_norm": 5.46875, "learning_rate": 9.939603515530796e-06, "loss": 0.8443615, "memory(GiB)": 117.54, "step": 6690, "train_speed(iter/s)": 0.203826 }, { "acc": 0.75775089, "epoch": 0.15631607343355838, "grad_norm": 6.21875, "learning_rate": 9.939310425730342e-06, "loss": 0.83715219, "memory(GiB)": 117.54, "step": 6700, "train_speed(iter/s)": 0.203982 }, { "acc": 0.75873032, "epoch": 0.15654938100584728, "grad_norm": 4.875, "learning_rate": 9.939016630844758e-06, "loss": 0.87146358, "memory(GiB)": 117.54, "step": 6710, "train_speed(iter/s)": 0.204137 }, { "acc": 0.76880903, "epoch": 0.15678268857813615, "grad_norm": 5.53125, "learning_rate": 9.938722130915988e-06, "loss": 0.81287689, "memory(GiB)": 117.54, "step": 6720, "train_speed(iter/s)": 0.20429 }, { "acc": 0.76790185, "epoch": 0.15701599615042505, "grad_norm": 4.84375, "learning_rate": 9.938426925986066e-06, "loss": 0.85982666, "memory(GiB)": 117.54, "step": 6730, "train_speed(iter/s)": 0.204444 }, { "acc": 0.75649314, "epoch": 0.15724930372271395, "grad_norm": 5.15625, "learning_rate": 9.938131016097137e-06, "loss": 0.89570599, "memory(GiB)": 117.54, "step": 6740, "train_speed(iter/s)": 0.204591 }, { "acc": 0.75742426, "epoch": 0.15748261129500285, "grad_norm": 6.375, "learning_rate": 9.937834401291437e-06, "loss": 0.88885574, "memory(GiB)": 117.54, "step": 6750, "train_speed(iter/s)": 0.204744 }, { "acc": 0.75880747, "epoch": 0.15771591886729175, "grad_norm": 6.5, "learning_rate": 9.937537081611313e-06, "loss": 0.89389744, "memory(GiB)": 117.54, "step": 6760, "train_speed(iter/s)": 0.2049 }, { "acc": 0.76473279, "epoch": 0.15794922643958062, "grad_norm": 5.03125, "learning_rate": 9.937239057099205e-06, "loss": 0.84799232, "memory(GiB)": 117.54, "step": 6770, "train_speed(iter/s)": 0.205043 }, { "acc": 0.76936255, "epoch": 0.15818253401186952, "grad_norm": 6.625, "learning_rate": 9.936940327797655e-06, "loss": 0.8422473, "memory(GiB)": 117.54, "step": 6780, "train_speed(iter/s)": 0.205188 }, { "acc": 0.75960865, "epoch": 0.15841584158415842, "grad_norm": 6.15625, "learning_rate": 9.936640893749308e-06, "loss": 0.86526461, "memory(GiB)": 117.54, "step": 6790, "train_speed(iter/s)": 0.205344 }, { "acc": 0.75233178, "epoch": 0.15864914915644732, "grad_norm": 5.59375, "learning_rate": 9.936340754996906e-06, "loss": 0.89683399, "memory(GiB)": 117.54, "step": 6800, "train_speed(iter/s)": 0.205496 }, { "acc": 0.75799913, "epoch": 0.15888245672873622, "grad_norm": 6.40625, "learning_rate": 9.936039911583298e-06, "loss": 0.88384476, "memory(GiB)": 117.54, "step": 6810, "train_speed(iter/s)": 0.205656 }, { "acc": 0.76789608, "epoch": 0.1591157643010251, "grad_norm": 6.0, "learning_rate": 9.935738363551424e-06, "loss": 0.84808712, "memory(GiB)": 117.54, "step": 6820, "train_speed(iter/s)": 0.205793 }, { "acc": 0.73517952, "epoch": 0.15934907187331399, "grad_norm": 5.53125, "learning_rate": 9.935436110944335e-06, "loss": 0.95522289, "memory(GiB)": 117.54, "step": 6830, "train_speed(iter/s)": 0.205947 }, { "acc": 0.75344524, "epoch": 0.15958237944560288, "grad_norm": 6.4375, "learning_rate": 9.935133153805172e-06, "loss": 0.91215868, "memory(GiB)": 117.54, "step": 6840, "train_speed(iter/s)": 0.206099 }, { "acc": 0.73823147, "epoch": 0.15981568701789178, "grad_norm": 6.46875, "learning_rate": 9.934829492177187e-06, "loss": 0.9458313, "memory(GiB)": 117.54, "step": 6850, "train_speed(iter/s)": 0.206255 }, { "acc": 0.74221478, "epoch": 0.16004899459018065, "grad_norm": 5.40625, "learning_rate": 9.934525126103725e-06, "loss": 0.94343185, "memory(GiB)": 117.54, "step": 6860, "train_speed(iter/s)": 0.206398 }, { "acc": 0.76774483, "epoch": 0.16028230216246955, "grad_norm": 7.4375, "learning_rate": 9.934220055628233e-06, "loss": 0.85056896, "memory(GiB)": 117.54, "step": 6870, "train_speed(iter/s)": 0.206556 }, { "acc": 0.75645094, "epoch": 0.16051560973475845, "grad_norm": 5.375, "learning_rate": 9.933914280794266e-06, "loss": 0.87650585, "memory(GiB)": 117.54, "step": 6880, "train_speed(iter/s)": 0.206713 }, { "acc": 0.76912999, "epoch": 0.16074891730704735, "grad_norm": 5.75, "learning_rate": 9.933607801645464e-06, "loss": 0.85444221, "memory(GiB)": 117.54, "step": 6890, "train_speed(iter/s)": 0.206863 }, { "acc": 0.76916509, "epoch": 0.16098222487933625, "grad_norm": 5.84375, "learning_rate": 9.933300618225584e-06, "loss": 0.83676643, "memory(GiB)": 117.54, "step": 6900, "train_speed(iter/s)": 0.207006 }, { "acc": 0.7689889, "epoch": 0.16121553245162512, "grad_norm": 12.5625, "learning_rate": 9.932992730578473e-06, "loss": 0.85195637, "memory(GiB)": 117.54, "step": 6910, "train_speed(iter/s)": 0.207151 }, { "acc": 0.7639226, "epoch": 0.16144884002391402, "grad_norm": 4.8125, "learning_rate": 9.932684138748083e-06, "loss": 0.8682415, "memory(GiB)": 117.54, "step": 6920, "train_speed(iter/s)": 0.207303 }, { "acc": 0.74718456, "epoch": 0.16168214759620292, "grad_norm": 5.28125, "learning_rate": 9.932374842778466e-06, "loss": 0.95030527, "memory(GiB)": 117.54, "step": 6930, "train_speed(iter/s)": 0.207455 }, { "acc": 0.76108999, "epoch": 0.16191545516849182, "grad_norm": 8.9375, "learning_rate": 9.932064842713773e-06, "loss": 0.88821373, "memory(GiB)": 117.54, "step": 6940, "train_speed(iter/s)": 0.207608 }, { "acc": 0.76204557, "epoch": 0.16214876274078072, "grad_norm": 5.3125, "learning_rate": 9.931754138598256e-06, "loss": 0.83484173, "memory(GiB)": 117.54, "step": 6950, "train_speed(iter/s)": 0.207758 }, { "acc": 0.77546968, "epoch": 0.1623820703130696, "grad_norm": 6.3125, "learning_rate": 9.931442730476266e-06, "loss": 0.8154768, "memory(GiB)": 117.54, "step": 6960, "train_speed(iter/s)": 0.207903 }, { "acc": 0.74842148, "epoch": 0.1626153778853585, "grad_norm": 7.5625, "learning_rate": 9.931130618392262e-06, "loss": 0.91668129, "memory(GiB)": 117.54, "step": 6970, "train_speed(iter/s)": 0.208058 }, { "acc": 0.75227356, "epoch": 0.16284868545764739, "grad_norm": 5.59375, "learning_rate": 9.930817802390794e-06, "loss": 0.90838165, "memory(GiB)": 117.54, "step": 6980, "train_speed(iter/s)": 0.208207 }, { "acc": 0.77258139, "epoch": 0.16308199302993628, "grad_norm": 5.5625, "learning_rate": 9.930504282516517e-06, "loss": 0.81022301, "memory(GiB)": 117.54, "step": 6990, "train_speed(iter/s)": 0.208358 }, { "acc": 0.76104183, "epoch": 0.16331530060222518, "grad_norm": 4.125, "learning_rate": 9.930190058814185e-06, "loss": 0.87542782, "memory(GiB)": 117.54, "step": 7000, "train_speed(iter/s)": 0.208517 }, { "epoch": 0.16331530060222518, "eval_acc": 0.7292125980342936, "eval_loss": 0.8633277416229248, "eval_runtime": 1270.1232, "eval_samples_per_second": 28.337, "eval_steps_per_second": 14.169, "step": 7000 }, { "acc": 0.76679716, "epoch": 0.16354860817451405, "grad_norm": 4.59375, "learning_rate": 9.929875131328655e-06, "loss": 0.84782066, "memory(GiB)": 117.54, "step": 7010, "train_speed(iter/s)": 0.200947 }, { "acc": 0.74878941, "epoch": 0.16378191574680295, "grad_norm": 4.15625, "learning_rate": 9.929559500104883e-06, "loss": 0.91403189, "memory(GiB)": 117.54, "step": 7020, "train_speed(iter/s)": 0.201094 }, { "acc": 0.76715469, "epoch": 0.16401522331909185, "grad_norm": 6.4375, "learning_rate": 9.929243165187922e-06, "loss": 0.84933138, "memory(GiB)": 117.54, "step": 7030, "train_speed(iter/s)": 0.201244 }, { "acc": 0.76669478, "epoch": 0.16424853089138075, "grad_norm": 3.796875, "learning_rate": 9.928926126622933e-06, "loss": 0.84990997, "memory(GiB)": 117.54, "step": 7040, "train_speed(iter/s)": 0.201391 }, { "acc": 0.76388216, "epoch": 0.16448183846366965, "grad_norm": 6.25, "learning_rate": 9.928608384455172e-06, "loss": 0.87723694, "memory(GiB)": 117.54, "step": 7050, "train_speed(iter/s)": 0.201534 }, { "acc": 0.74291534, "epoch": 0.16471514603595852, "grad_norm": 32.5, "learning_rate": 9.928289938729996e-06, "loss": 0.95859528, "memory(GiB)": 117.54, "step": 7060, "train_speed(iter/s)": 0.201681 }, { "acc": 0.75394406, "epoch": 0.16494845360824742, "grad_norm": 5.34375, "learning_rate": 9.92797078949286e-06, "loss": 0.90898361, "memory(GiB)": 117.54, "step": 7070, "train_speed(iter/s)": 0.201821 }, { "acc": 0.74871893, "epoch": 0.16518176118053632, "grad_norm": 5.125, "learning_rate": 9.927650936789329e-06, "loss": 0.90822897, "memory(GiB)": 117.54, "step": 7080, "train_speed(iter/s)": 0.201961 }, { "acc": 0.77017407, "epoch": 0.16541506875282522, "grad_norm": 7.90625, "learning_rate": 9.927330380665056e-06, "loss": 0.84466286, "memory(GiB)": 117.54, "step": 7090, "train_speed(iter/s)": 0.202117 }, { "acc": 0.75458536, "epoch": 0.1656483763251141, "grad_norm": 4.9375, "learning_rate": 9.927009121165803e-06, "loss": 0.90666466, "memory(GiB)": 117.54, "step": 7100, "train_speed(iter/s)": 0.202266 }, { "acc": 0.75543022, "epoch": 0.165881683897403, "grad_norm": 5.46875, "learning_rate": 9.92668715833743e-06, "loss": 0.8722743, "memory(GiB)": 117.54, "step": 7110, "train_speed(iter/s)": 0.202404 }, { "acc": 0.77939939, "epoch": 0.16611499146969189, "grad_norm": 10.75, "learning_rate": 9.926364492225894e-06, "loss": 0.78749428, "memory(GiB)": 117.54, "step": 7120, "train_speed(iter/s)": 0.202551 }, { "acc": 0.79106226, "epoch": 0.16634829904198078, "grad_norm": 7.3125, "learning_rate": 9.92604112287726e-06, "loss": 0.74738359, "memory(GiB)": 117.54, "step": 7130, "train_speed(iter/s)": 0.202705 }, { "acc": 0.76939073, "epoch": 0.16658160661426968, "grad_norm": 4.8125, "learning_rate": 9.925717050337686e-06, "loss": 0.8409626, "memory(GiB)": 117.54, "step": 7140, "train_speed(iter/s)": 0.202843 }, { "acc": 0.76826558, "epoch": 0.16681491418655855, "grad_norm": 5.65625, "learning_rate": 9.925392274653435e-06, "loss": 0.82716637, "memory(GiB)": 117.54, "step": 7150, "train_speed(iter/s)": 0.202993 }, { "acc": 0.76554303, "epoch": 0.16704822175884745, "grad_norm": 4.21875, "learning_rate": 9.925066795870868e-06, "loss": 0.849158, "memory(GiB)": 117.54, "step": 7160, "train_speed(iter/s)": 0.203138 }, { "acc": 0.78131075, "epoch": 0.16728152933113635, "grad_norm": 5.65625, "learning_rate": 9.924740614036445e-06, "loss": 0.77836246, "memory(GiB)": 117.54, "step": 7170, "train_speed(iter/s)": 0.203275 }, { "acc": 0.77169647, "epoch": 0.16751483690342525, "grad_norm": 6.46875, "learning_rate": 9.92441372919673e-06, "loss": 0.83209658, "memory(GiB)": 117.54, "step": 7180, "train_speed(iter/s)": 0.203419 }, { "acc": 0.75183039, "epoch": 0.16774814447571415, "grad_norm": 4.875, "learning_rate": 9.924086141398385e-06, "loss": 0.92607002, "memory(GiB)": 117.54, "step": 7190, "train_speed(iter/s)": 0.203569 }, { "acc": 0.77899566, "epoch": 0.16798145204800302, "grad_norm": 9.9375, "learning_rate": 9.923757850688176e-06, "loss": 0.78890448, "memory(GiB)": 117.54, "step": 7200, "train_speed(iter/s)": 0.203724 }, { "acc": 0.7672864, "epoch": 0.16821475962029192, "grad_norm": 5.375, "learning_rate": 9.923428857112963e-06, "loss": 0.82765884, "memory(GiB)": 117.54, "step": 7210, "train_speed(iter/s)": 0.203869 }, { "acc": 0.76710939, "epoch": 0.16844806719258082, "grad_norm": 7.78125, "learning_rate": 9.923099160719711e-06, "loss": 0.83284779, "memory(GiB)": 117.54, "step": 7220, "train_speed(iter/s)": 0.204016 }, { "acc": 0.76684504, "epoch": 0.16868137476486972, "grad_norm": 7.5, "learning_rate": 9.922768761555485e-06, "loss": 0.84021397, "memory(GiB)": 117.54, "step": 7230, "train_speed(iter/s)": 0.20416 }, { "acc": 0.75771241, "epoch": 0.16891468233715862, "grad_norm": 6.625, "learning_rate": 9.922437659667448e-06, "loss": 0.8986228, "memory(GiB)": 117.54, "step": 7240, "train_speed(iter/s)": 0.204305 }, { "acc": 0.77050877, "epoch": 0.1691479899094475, "grad_norm": 5.0, "learning_rate": 9.922105855102864e-06, "loss": 0.83454065, "memory(GiB)": 117.54, "step": 7250, "train_speed(iter/s)": 0.204448 }, { "acc": 0.77722654, "epoch": 0.1693812974817364, "grad_norm": 6.21875, "learning_rate": 9.921773347909098e-06, "loss": 0.79222279, "memory(GiB)": 117.54, "step": 7260, "train_speed(iter/s)": 0.204598 }, { "acc": 0.76436782, "epoch": 0.16961460505402529, "grad_norm": 6.75, "learning_rate": 9.921440138133619e-06, "loss": 0.843188, "memory(GiB)": 117.54, "step": 7270, "train_speed(iter/s)": 0.20475 }, { "acc": 0.75543094, "epoch": 0.16984791262631418, "grad_norm": 6.15625, "learning_rate": 9.921106225823988e-06, "loss": 0.91073751, "memory(GiB)": 117.54, "step": 7280, "train_speed(iter/s)": 0.204899 }, { "acc": 0.76456766, "epoch": 0.17008122019860308, "grad_norm": 5.625, "learning_rate": 9.920771611027875e-06, "loss": 0.83511429, "memory(GiB)": 117.54, "step": 7290, "train_speed(iter/s)": 0.205052 }, { "acc": 0.73825827, "epoch": 0.17031452777089195, "grad_norm": 5.3125, "learning_rate": 9.920436293793043e-06, "loss": 0.97240715, "memory(GiB)": 117.54, "step": 7300, "train_speed(iter/s)": 0.205195 }, { "acc": 0.74200888, "epoch": 0.17054783534318085, "grad_norm": 6.59375, "learning_rate": 9.920100274167359e-06, "loss": 0.9331625, "memory(GiB)": 117.54, "step": 7310, "train_speed(iter/s)": 0.205347 }, { "acc": 0.76395473, "epoch": 0.17078114291546975, "grad_norm": 7.46875, "learning_rate": 9.91976355219879e-06, "loss": 0.85346889, "memory(GiB)": 117.54, "step": 7320, "train_speed(iter/s)": 0.205459 }, { "acc": 0.75636325, "epoch": 0.17101445048775865, "grad_norm": 5.78125, "learning_rate": 9.919426127935404e-06, "loss": 0.89509583, "memory(GiB)": 117.54, "step": 7330, "train_speed(iter/s)": 0.205592 }, { "acc": 0.76570711, "epoch": 0.17124775806004755, "grad_norm": 5.53125, "learning_rate": 9.919088001425367e-06, "loss": 0.85378265, "memory(GiB)": 117.54, "step": 7340, "train_speed(iter/s)": 0.205738 }, { "acc": 0.75373197, "epoch": 0.17148106563233642, "grad_norm": 8.8125, "learning_rate": 9.918749172716946e-06, "loss": 0.88216324, "memory(GiB)": 117.54, "step": 7350, "train_speed(iter/s)": 0.205869 }, { "acc": 0.75796227, "epoch": 0.17171437320462532, "grad_norm": 7.0, "learning_rate": 9.91840964185851e-06, "loss": 0.8882925, "memory(GiB)": 117.54, "step": 7360, "train_speed(iter/s)": 0.206007 }, { "acc": 0.76364655, "epoch": 0.17194768077691422, "grad_norm": 5.53125, "learning_rate": 9.918069408898527e-06, "loss": 0.85295391, "memory(GiB)": 117.54, "step": 7370, "train_speed(iter/s)": 0.206146 }, { "acc": 0.77610121, "epoch": 0.17218098834920312, "grad_norm": 6.40625, "learning_rate": 9.917728473885564e-06, "loss": 0.78088756, "memory(GiB)": 117.54, "step": 7380, "train_speed(iter/s)": 0.206278 }, { "acc": 0.7751833, "epoch": 0.172414295921492, "grad_norm": 8.0, "learning_rate": 9.91738683686829e-06, "loss": 0.82343903, "memory(GiB)": 117.54, "step": 7390, "train_speed(iter/s)": 0.206425 }, { "acc": 0.75816336, "epoch": 0.1726476034937809, "grad_norm": 6.40625, "learning_rate": 9.917044497895474e-06, "loss": 0.86748638, "memory(GiB)": 117.54, "step": 7400, "train_speed(iter/s)": 0.206522 }, { "acc": 0.76400776, "epoch": 0.1728809110660698, "grad_norm": 5.25, "learning_rate": 9.916701457015983e-06, "loss": 0.86720943, "memory(GiB)": 117.54, "step": 7410, "train_speed(iter/s)": 0.206658 }, { "acc": 0.75771341, "epoch": 0.17311421863835869, "grad_norm": 5.15625, "learning_rate": 9.91635771427879e-06, "loss": 0.88320751, "memory(GiB)": 117.54, "step": 7420, "train_speed(iter/s)": 0.206797 }, { "acc": 0.7662806, "epoch": 0.17334752621064758, "grad_norm": 6.0625, "learning_rate": 9.91601326973296e-06, "loss": 0.835116, "memory(GiB)": 117.54, "step": 7430, "train_speed(iter/s)": 0.206941 }, { "acc": 0.77026176, "epoch": 0.17358083378293646, "grad_norm": 7.625, "learning_rate": 9.915668123427662e-06, "loss": 0.84814157, "memory(GiB)": 117.54, "step": 7440, "train_speed(iter/s)": 0.207086 }, { "acc": 0.76674738, "epoch": 0.17381414135522535, "grad_norm": 4.6875, "learning_rate": 9.91532227541217e-06, "loss": 0.85983877, "memory(GiB)": 117.54, "step": 7450, "train_speed(iter/s)": 0.207226 }, { "acc": 0.75598326, "epoch": 0.17404744892751425, "grad_norm": 6.03125, "learning_rate": 9.91497572573585e-06, "loss": 0.87792206, "memory(GiB)": 117.54, "step": 7460, "train_speed(iter/s)": 0.207368 }, { "acc": 0.77461433, "epoch": 0.17428075649980315, "grad_norm": 4.8125, "learning_rate": 9.914628474448173e-06, "loss": 0.81929722, "memory(GiB)": 117.54, "step": 7470, "train_speed(iter/s)": 0.207504 }, { "acc": 0.75475292, "epoch": 0.17451406407209205, "grad_norm": 5.53125, "learning_rate": 9.91428052159871e-06, "loss": 0.88305016, "memory(GiB)": 117.54, "step": 7480, "train_speed(iter/s)": 0.207644 }, { "acc": 0.77698159, "epoch": 0.17474737164438092, "grad_norm": 5.40625, "learning_rate": 9.913931867237129e-06, "loss": 0.81536446, "memory(GiB)": 117.54, "step": 7490, "train_speed(iter/s)": 0.207778 }, { "acc": 0.75837955, "epoch": 0.17498067921666982, "grad_norm": 4.71875, "learning_rate": 9.913582511413201e-06, "loss": 0.87997665, "memory(GiB)": 117.54, "step": 7500, "train_speed(iter/s)": 0.207923 }, { "epoch": 0.17498067921666982, "eval_acc": 0.7296547562596777, "eval_loss": 0.8604273200035095, "eval_runtime": 1269.7388, "eval_samples_per_second": 28.345, "eval_steps_per_second": 14.173, "step": 7500 }, { "acc": 0.75066805, "epoch": 0.17521398678895872, "grad_norm": 5.96875, "learning_rate": 9.913232454176797e-06, "loss": 0.88283882, "memory(GiB)": 117.54, "step": 7510, "train_speed(iter/s)": 0.2009 }, { "acc": 0.73445344, "epoch": 0.17544729436124762, "grad_norm": 6.25, "learning_rate": 9.912881695577889e-06, "loss": 0.97776871, "memory(GiB)": 117.54, "step": 7520, "train_speed(iter/s)": 0.201035 }, { "acc": 0.77131672, "epoch": 0.17568060193353652, "grad_norm": 6.21875, "learning_rate": 9.912530235666546e-06, "loss": 0.82929153, "memory(GiB)": 117.54, "step": 7530, "train_speed(iter/s)": 0.201167 }, { "acc": 0.77159753, "epoch": 0.1759139095058254, "grad_norm": 8.5625, "learning_rate": 9.912178074492937e-06, "loss": 0.81753139, "memory(GiB)": 117.54, "step": 7540, "train_speed(iter/s)": 0.201308 }, { "acc": 0.75888309, "epoch": 0.1761472170781143, "grad_norm": 5.9375, "learning_rate": 9.911825212107337e-06, "loss": 0.85590801, "memory(GiB)": 117.54, "step": 7550, "train_speed(iter/s)": 0.20145 }, { "acc": 0.77193937, "epoch": 0.1763805246504032, "grad_norm": 6.21875, "learning_rate": 9.911471648560114e-06, "loss": 0.81675425, "memory(GiB)": 117.54, "step": 7560, "train_speed(iter/s)": 0.201582 }, { "acc": 0.75802794, "epoch": 0.17661383222269209, "grad_norm": 5.625, "learning_rate": 9.91111738390174e-06, "loss": 0.91160278, "memory(GiB)": 117.54, "step": 7570, "train_speed(iter/s)": 0.201724 }, { "acc": 0.77081351, "epoch": 0.17684713979498098, "grad_norm": 4.84375, "learning_rate": 9.910762418182786e-06, "loss": 0.81983862, "memory(GiB)": 117.54, "step": 7580, "train_speed(iter/s)": 0.201858 }, { "acc": 0.76340055, "epoch": 0.17708044736726986, "grad_norm": 14.375, "learning_rate": 9.910406751453923e-06, "loss": 0.85830593, "memory(GiB)": 117.54, "step": 7590, "train_speed(iter/s)": 0.202005 }, { "acc": 0.75996342, "epoch": 0.17731375493955875, "grad_norm": 5.0625, "learning_rate": 9.910050383765924e-06, "loss": 0.86299267, "memory(GiB)": 117.54, "step": 7600, "train_speed(iter/s)": 0.202147 }, { "acc": 0.73427224, "epoch": 0.17754706251184765, "grad_norm": 4.875, "learning_rate": 9.909693315169657e-06, "loss": 0.96497478, "memory(GiB)": 117.54, "step": 7610, "train_speed(iter/s)": 0.202278 }, { "acc": 0.76901188, "epoch": 0.17778037008413655, "grad_norm": 5.9375, "learning_rate": 9.909335545716097e-06, "loss": 0.82169056, "memory(GiB)": 117.54, "step": 7620, "train_speed(iter/s)": 0.202416 }, { "acc": 0.75650215, "epoch": 0.17801367765642542, "grad_norm": 6.53125, "learning_rate": 9.908977075456314e-06, "loss": 0.8937892, "memory(GiB)": 117.54, "step": 7630, "train_speed(iter/s)": 0.202547 }, { "acc": 0.78488522, "epoch": 0.17824698522871432, "grad_norm": 5.53125, "learning_rate": 9.90861790444148e-06, "loss": 0.76624699, "memory(GiB)": 117.54, "step": 7640, "train_speed(iter/s)": 0.202691 }, { "acc": 0.77700996, "epoch": 0.17848029280100322, "grad_norm": 5.75, "learning_rate": 9.908258032722865e-06, "loss": 0.79376793, "memory(GiB)": 117.54, "step": 7650, "train_speed(iter/s)": 0.202822 }, { "acc": 0.78278093, "epoch": 0.17871360037329212, "grad_norm": 6.09375, "learning_rate": 9.907897460351842e-06, "loss": 0.80827179, "memory(GiB)": 117.54, "step": 7660, "train_speed(iter/s)": 0.202955 }, { "acc": 0.7604249, "epoch": 0.17894690794558102, "grad_norm": 7.09375, "learning_rate": 9.907536187379883e-06, "loss": 0.88734035, "memory(GiB)": 117.54, "step": 7670, "train_speed(iter/s)": 0.203091 }, { "acc": 0.76182446, "epoch": 0.1791802155178699, "grad_norm": 5.90625, "learning_rate": 9.907174213858556e-06, "loss": 0.88050871, "memory(GiB)": 117.54, "step": 7680, "train_speed(iter/s)": 0.203227 }, { "acc": 0.76138544, "epoch": 0.1794135230901588, "grad_norm": 12.4375, "learning_rate": 9.906811539839539e-06, "loss": 0.86582584, "memory(GiB)": 117.54, "step": 7690, "train_speed(iter/s)": 0.203366 }, { "acc": 0.77104373, "epoch": 0.1796468306624477, "grad_norm": 5.1875, "learning_rate": 9.9064481653746e-06, "loss": 0.83353386, "memory(GiB)": 117.54, "step": 7700, "train_speed(iter/s)": 0.203486 }, { "acc": 0.77400684, "epoch": 0.1798801382347366, "grad_norm": 4.46875, "learning_rate": 9.906084090515609e-06, "loss": 0.83886662, "memory(GiB)": 117.54, "step": 7710, "train_speed(iter/s)": 0.203625 }, { "acc": 0.75984573, "epoch": 0.18011344580702549, "grad_norm": 5.96875, "learning_rate": 9.90571931531454e-06, "loss": 0.88316107, "memory(GiB)": 117.54, "step": 7720, "train_speed(iter/s)": 0.203764 }, { "acc": 0.76342249, "epoch": 0.18034675337931436, "grad_norm": 4.46875, "learning_rate": 9.905353839823463e-06, "loss": 0.85615129, "memory(GiB)": 117.54, "step": 7730, "train_speed(iter/s)": 0.203908 }, { "acc": 0.78326778, "epoch": 0.18058006095160326, "grad_norm": 6.09375, "learning_rate": 9.904987664094553e-06, "loss": 0.76730537, "memory(GiB)": 117.54, "step": 7740, "train_speed(iter/s)": 0.204033 }, { "acc": 0.75868654, "epoch": 0.18081336852389215, "grad_norm": 6.40625, "learning_rate": 9.904620788180076e-06, "loss": 0.89008236, "memory(GiB)": 117.54, "step": 7750, "train_speed(iter/s)": 0.204174 }, { "acc": 0.76263795, "epoch": 0.18104667609618105, "grad_norm": 5.5, "learning_rate": 9.904253212132406e-06, "loss": 0.87411928, "memory(GiB)": 117.54, "step": 7760, "train_speed(iter/s)": 0.204312 }, { "acc": 0.76147628, "epoch": 0.18127998366846995, "grad_norm": 6.0, "learning_rate": 9.903884936004017e-06, "loss": 0.86704102, "memory(GiB)": 117.54, "step": 7770, "train_speed(iter/s)": 0.204445 }, { "acc": 0.76194277, "epoch": 0.18151329124075882, "grad_norm": 7.5, "learning_rate": 9.903515959847477e-06, "loss": 0.83300629, "memory(GiB)": 117.54, "step": 7780, "train_speed(iter/s)": 0.204576 }, { "acc": 0.7630332, "epoch": 0.18174659881304772, "grad_norm": 8.5, "learning_rate": 9.903146283715459e-06, "loss": 0.85758762, "memory(GiB)": 117.54, "step": 7790, "train_speed(iter/s)": 0.204716 }, { "acc": 0.74647932, "epoch": 0.18197990638533662, "grad_norm": 5.5625, "learning_rate": 9.902775907660733e-06, "loss": 0.91409321, "memory(GiB)": 117.54, "step": 7800, "train_speed(iter/s)": 0.204846 }, { "acc": 0.74644547, "epoch": 0.18221321395762552, "grad_norm": 5.1875, "learning_rate": 9.90240483173617e-06, "loss": 0.95661182, "memory(GiB)": 117.54, "step": 7810, "train_speed(iter/s)": 0.204982 }, { "acc": 0.75153122, "epoch": 0.18244652152991442, "grad_norm": 6.4375, "learning_rate": 9.902033055994739e-06, "loss": 0.90269318, "memory(GiB)": 117.54, "step": 7820, "train_speed(iter/s)": 0.205106 }, { "acc": 0.75813732, "epoch": 0.1826798291022033, "grad_norm": 83.5, "learning_rate": 9.901660580489517e-06, "loss": 0.88405313, "memory(GiB)": 117.54, "step": 7830, "train_speed(iter/s)": 0.20524 }, { "acc": 0.78136988, "epoch": 0.1829131366744922, "grad_norm": 9.25, "learning_rate": 9.90128740527367e-06, "loss": 0.79004688, "memory(GiB)": 117.54, "step": 7840, "train_speed(iter/s)": 0.205378 }, { "acc": 0.77203064, "epoch": 0.1831464442467811, "grad_norm": 4.6875, "learning_rate": 9.900913530400469e-06, "loss": 0.80475359, "memory(GiB)": 117.54, "step": 7850, "train_speed(iter/s)": 0.205502 }, { "acc": 0.76842432, "epoch": 0.18337975181906999, "grad_norm": 5.03125, "learning_rate": 9.900538955923287e-06, "loss": 0.84965286, "memory(GiB)": 117.54, "step": 7860, "train_speed(iter/s)": 0.205631 }, { "acc": 0.7858345, "epoch": 0.18361305939135886, "grad_norm": 4.625, "learning_rate": 9.900163681895591e-06, "loss": 0.7589952, "memory(GiB)": 117.54, "step": 7870, "train_speed(iter/s)": 0.205758 }, { "acc": 0.75342913, "epoch": 0.18384636696364776, "grad_norm": 6.03125, "learning_rate": 9.899787708370954e-06, "loss": 0.89493637, "memory(GiB)": 117.54, "step": 7880, "train_speed(iter/s)": 0.20589 }, { "acc": 0.7876399, "epoch": 0.18407967453593665, "grad_norm": 7.21875, "learning_rate": 9.899411035403044e-06, "loss": 0.74116306, "memory(GiB)": 117.54, "step": 7890, "train_speed(iter/s)": 0.206028 }, { "acc": 0.78092766, "epoch": 0.18431298210822555, "grad_norm": 4.71875, "learning_rate": 9.899033663045632e-06, "loss": 0.79192057, "memory(GiB)": 117.54, "step": 7900, "train_speed(iter/s)": 0.206168 }, { "acc": 0.75420218, "epoch": 0.18454628968051445, "grad_norm": 9.8125, "learning_rate": 9.898655591352589e-06, "loss": 0.88830757, "memory(GiB)": 117.54, "step": 7910, "train_speed(iter/s)": 0.206303 }, { "acc": 0.75896831, "epoch": 0.18477959725280332, "grad_norm": 6.8125, "learning_rate": 9.898276820377882e-06, "loss": 0.8895916, "memory(GiB)": 117.54, "step": 7920, "train_speed(iter/s)": 0.206432 }, { "acc": 0.75719609, "epoch": 0.18501290482509222, "grad_norm": 6.875, "learning_rate": 9.897897350175583e-06, "loss": 0.89884977, "memory(GiB)": 117.54, "step": 7930, "train_speed(iter/s)": 0.206564 }, { "acc": 0.75096016, "epoch": 0.18524621239738112, "grad_norm": 6.875, "learning_rate": 9.897517180799858e-06, "loss": 0.90399666, "memory(GiB)": 117.54, "step": 7940, "train_speed(iter/s)": 0.206695 }, { "acc": 0.76634254, "epoch": 0.18547951996967002, "grad_norm": 5.5, "learning_rate": 9.89713631230498e-06, "loss": 0.84754448, "memory(GiB)": 117.54, "step": 7950, "train_speed(iter/s)": 0.206826 }, { "acc": 0.76263399, "epoch": 0.18571282754195892, "grad_norm": 5.1875, "learning_rate": 9.896754744745315e-06, "loss": 0.87414799, "memory(GiB)": 117.54, "step": 7960, "train_speed(iter/s)": 0.206959 }, { "acc": 0.76916466, "epoch": 0.1859461351142478, "grad_norm": 6.25, "learning_rate": 9.896372478175336e-06, "loss": 0.85287285, "memory(GiB)": 117.54, "step": 7970, "train_speed(iter/s)": 0.207091 }, { "acc": 0.76025381, "epoch": 0.1861794426865367, "grad_norm": 23.5, "learning_rate": 9.895989512649605e-06, "loss": 0.85706806, "memory(GiB)": 117.54, "step": 7980, "train_speed(iter/s)": 0.207216 }, { "acc": 0.78468599, "epoch": 0.1864127502588256, "grad_norm": 8.4375, "learning_rate": 9.895605848222794e-06, "loss": 0.78214846, "memory(GiB)": 117.54, "step": 7990, "train_speed(iter/s)": 0.207344 }, { "acc": 0.76179285, "epoch": 0.1866460578311145, "grad_norm": 5.4375, "learning_rate": 9.89522148494967e-06, "loss": 0.87004957, "memory(GiB)": 117.54, "step": 8000, "train_speed(iter/s)": 0.20748 }, { "epoch": 0.1866460578311145, "eval_acc": 0.7304815809310498, "eval_loss": 0.8580639958381653, "eval_runtime": 1269.3016, "eval_samples_per_second": 28.355, "eval_steps_per_second": 14.178, "step": 8000 }, { "acc": 0.75695009, "epoch": 0.18687936540340339, "grad_norm": 8.125, "learning_rate": 9.894836422885101e-06, "loss": 0.87596626, "memory(GiB)": 117.54, "step": 8010, "train_speed(iter/s)": 0.20091 }, { "acc": 0.76842041, "epoch": 0.18711267297569226, "grad_norm": 5.1875, "learning_rate": 9.894450662084055e-06, "loss": 0.84125156, "memory(GiB)": 117.54, "step": 8020, "train_speed(iter/s)": 0.20104 }, { "acc": 0.75734787, "epoch": 0.18734598054798116, "grad_norm": 5.4375, "learning_rate": 9.8940642026016e-06, "loss": 0.88770895, "memory(GiB)": 117.54, "step": 8030, "train_speed(iter/s)": 0.201174 }, { "acc": 0.74493351, "epoch": 0.18757928812027005, "grad_norm": 17.875, "learning_rate": 9.8936770444929e-06, "loss": 0.93823605, "memory(GiB)": 117.54, "step": 8040, "train_speed(iter/s)": 0.201304 }, { "acc": 0.77957907, "epoch": 0.18781259569255895, "grad_norm": 4.5625, "learning_rate": 9.893289187813224e-06, "loss": 0.79470148, "memory(GiB)": 117.54, "step": 8050, "train_speed(iter/s)": 0.201441 }, { "acc": 0.76955447, "epoch": 0.18804590326484785, "grad_norm": 6.0, "learning_rate": 9.892900632617939e-06, "loss": 0.82833557, "memory(GiB)": 117.54, "step": 8060, "train_speed(iter/s)": 0.201567 }, { "acc": 0.77944613, "epoch": 0.18827921083713672, "grad_norm": 5.3125, "learning_rate": 9.892511378962509e-06, "loss": 0.78586206, "memory(GiB)": 117.54, "step": 8070, "train_speed(iter/s)": 0.201694 }, { "acc": 0.78304958, "epoch": 0.18851251840942562, "grad_norm": 6.1875, "learning_rate": 9.892121426902502e-06, "loss": 0.7807333, "memory(GiB)": 117.54, "step": 8080, "train_speed(iter/s)": 0.201821 }, { "acc": 0.77091231, "epoch": 0.18874582598171452, "grad_norm": 7.75, "learning_rate": 9.891730776493579e-06, "loss": 0.82303715, "memory(GiB)": 117.54, "step": 8090, "train_speed(iter/s)": 0.201941 }, { "acc": 0.76864967, "epoch": 0.18897913355400342, "grad_norm": 6.03125, "learning_rate": 9.891339427791513e-06, "loss": 0.82791605, "memory(GiB)": 117.54, "step": 8100, "train_speed(iter/s)": 0.202068 }, { "acc": 0.78619051, "epoch": 0.18921244112629232, "grad_norm": 6.125, "learning_rate": 9.890947380852163e-06, "loss": 0.78398085, "memory(GiB)": 117.54, "step": 8110, "train_speed(iter/s)": 0.202193 }, { "acc": 0.763907, "epoch": 0.1894457486985812, "grad_norm": 5.3125, "learning_rate": 9.890554635731496e-06, "loss": 0.86447706, "memory(GiB)": 117.54, "step": 8120, "train_speed(iter/s)": 0.202319 }, { "acc": 0.75075836, "epoch": 0.1896790562708701, "grad_norm": 4.6875, "learning_rate": 9.890161192485573e-06, "loss": 0.91182461, "memory(GiB)": 117.54, "step": 8130, "train_speed(iter/s)": 0.202452 }, { "acc": 0.78794985, "epoch": 0.189912363843159, "grad_norm": 5.28125, "learning_rate": 9.889767051170563e-06, "loss": 0.75603533, "memory(GiB)": 117.54, "step": 8140, "train_speed(iter/s)": 0.202579 }, { "acc": 0.77740474, "epoch": 0.1901456714154479, "grad_norm": 6.6875, "learning_rate": 9.889372211842726e-06, "loss": 0.80326691, "memory(GiB)": 117.54, "step": 8150, "train_speed(iter/s)": 0.202706 }, { "acc": 0.77399597, "epoch": 0.19037897898773676, "grad_norm": 5.03125, "learning_rate": 9.888976674558426e-06, "loss": 0.8158947, "memory(GiB)": 117.54, "step": 8160, "train_speed(iter/s)": 0.202837 }, { "acc": 0.74653988, "epoch": 0.19061228656002566, "grad_norm": 6.375, "learning_rate": 9.888580439374126e-06, "loss": 0.91792946, "memory(GiB)": 117.54, "step": 8170, "train_speed(iter/s)": 0.20296 }, { "acc": 0.74516058, "epoch": 0.19084559413231456, "grad_norm": 5.34375, "learning_rate": 9.888183506346389e-06, "loss": 0.91901636, "memory(GiB)": 117.54, "step": 8180, "train_speed(iter/s)": 0.203084 }, { "acc": 0.76203327, "epoch": 0.19107890170460345, "grad_norm": 6.4375, "learning_rate": 9.887785875531875e-06, "loss": 0.87705736, "memory(GiB)": 117.54, "step": 8190, "train_speed(iter/s)": 0.203213 }, { "acc": 0.75964112, "epoch": 0.19131220927689235, "grad_norm": 4.25, "learning_rate": 9.887387546987349e-06, "loss": 0.87349434, "memory(GiB)": 117.54, "step": 8200, "train_speed(iter/s)": 0.203336 }, { "acc": 0.77561655, "epoch": 0.19154551684918122, "grad_norm": 4.21875, "learning_rate": 9.886988520769669e-06, "loss": 0.80881062, "memory(GiB)": 117.54, "step": 8210, "train_speed(iter/s)": 0.203451 }, { "acc": 0.76098909, "epoch": 0.19177882442147012, "grad_norm": 6.96875, "learning_rate": 9.886588796935797e-06, "loss": 0.87041025, "memory(GiB)": 117.54, "step": 8220, "train_speed(iter/s)": 0.203578 }, { "acc": 0.7447237, "epoch": 0.19201213199375902, "grad_norm": 5.25, "learning_rate": 9.886188375542795e-06, "loss": 0.93633842, "memory(GiB)": 117.54, "step": 8230, "train_speed(iter/s)": 0.20371 }, { "acc": 0.76472883, "epoch": 0.19224543956604792, "grad_norm": 5.15625, "learning_rate": 9.885787256647822e-06, "loss": 0.85684052, "memory(GiB)": 117.54, "step": 8240, "train_speed(iter/s)": 0.203835 }, { "acc": 0.78805923, "epoch": 0.19247874713833682, "grad_norm": 5.84375, "learning_rate": 9.885385440308137e-06, "loss": 0.73936048, "memory(GiB)": 117.54, "step": 8250, "train_speed(iter/s)": 0.203964 }, { "acc": 0.75716724, "epoch": 0.1927120547106257, "grad_norm": 7.59375, "learning_rate": 9.8849829265811e-06, "loss": 0.89653664, "memory(GiB)": 117.54, "step": 8260, "train_speed(iter/s)": 0.204088 }, { "acc": 0.78018217, "epoch": 0.1929453622829146, "grad_norm": 5.75, "learning_rate": 9.884579715524168e-06, "loss": 0.77996178, "memory(GiB)": 117.54, "step": 8270, "train_speed(iter/s)": 0.204218 }, { "acc": 0.74880614, "epoch": 0.1931786698552035, "grad_norm": 6.28125, "learning_rate": 9.884175807194902e-06, "loss": 0.90730553, "memory(GiB)": 117.54, "step": 8280, "train_speed(iter/s)": 0.204343 }, { "acc": 0.77563052, "epoch": 0.1934119774274924, "grad_norm": 6.25, "learning_rate": 9.883771201650958e-06, "loss": 0.80887585, "memory(GiB)": 117.54, "step": 8290, "train_speed(iter/s)": 0.204473 }, { "acc": 0.76742563, "epoch": 0.1936452849997813, "grad_norm": 5.71875, "learning_rate": 9.883365898950094e-06, "loss": 0.83605604, "memory(GiB)": 117.54, "step": 8300, "train_speed(iter/s)": 0.204603 }, { "acc": 0.77535963, "epoch": 0.19387859257207016, "grad_norm": 5.65625, "learning_rate": 9.882959899150166e-06, "loss": 0.79957132, "memory(GiB)": 117.54, "step": 8310, "train_speed(iter/s)": 0.204735 }, { "acc": 0.76650438, "epoch": 0.19411190014435906, "grad_norm": 8.25, "learning_rate": 9.882553202309131e-06, "loss": 0.83275509, "memory(GiB)": 117.54, "step": 8320, "train_speed(iter/s)": 0.204866 }, { "acc": 0.76723166, "epoch": 0.19434520771664796, "grad_norm": 6.84375, "learning_rate": 9.882145808485045e-06, "loss": 0.83784151, "memory(GiB)": 117.54, "step": 8330, "train_speed(iter/s)": 0.204991 }, { "acc": 0.76202507, "epoch": 0.19457851528893685, "grad_norm": 6.6875, "learning_rate": 9.881737717736063e-06, "loss": 0.8698616, "memory(GiB)": 117.54, "step": 8340, "train_speed(iter/s)": 0.205119 }, { "acc": 0.76839499, "epoch": 0.19481182286122575, "grad_norm": 6.71875, "learning_rate": 9.88132893012044e-06, "loss": 0.85822887, "memory(GiB)": 117.54, "step": 8350, "train_speed(iter/s)": 0.205249 }, { "acc": 0.75704322, "epoch": 0.19504513043351462, "grad_norm": 4.8125, "learning_rate": 9.88091944569653e-06, "loss": 0.88926487, "memory(GiB)": 117.54, "step": 8360, "train_speed(iter/s)": 0.20537 }, { "acc": 0.7531908, "epoch": 0.19527843800580352, "grad_norm": 6.21875, "learning_rate": 9.880509264522788e-06, "loss": 0.89586658, "memory(GiB)": 117.54, "step": 8370, "train_speed(iter/s)": 0.205492 }, { "acc": 0.76802006, "epoch": 0.19551174557809242, "grad_norm": 4.59375, "learning_rate": 9.880098386657765e-06, "loss": 0.83442373, "memory(GiB)": 117.54, "step": 8380, "train_speed(iter/s)": 0.205617 }, { "acc": 0.75828171, "epoch": 0.19574505315038132, "grad_norm": 12.1875, "learning_rate": 9.879686812160116e-06, "loss": 0.86747961, "memory(GiB)": 117.54, "step": 8390, "train_speed(iter/s)": 0.205742 }, { "acc": 0.77685285, "epoch": 0.1959783607226702, "grad_norm": 13.125, "learning_rate": 9.87927454108859e-06, "loss": 0.81312294, "memory(GiB)": 117.54, "step": 8400, "train_speed(iter/s)": 0.205862 }, { "acc": 0.75890455, "epoch": 0.1962116682949591, "grad_norm": 4.625, "learning_rate": 9.878861573502044e-06, "loss": 0.90135555, "memory(GiB)": 117.54, "step": 8410, "train_speed(iter/s)": 0.205988 }, { "acc": 0.77306185, "epoch": 0.196444975867248, "grad_norm": 5.4375, "learning_rate": 9.878447909459423e-06, "loss": 0.84094868, "memory(GiB)": 117.54, "step": 8420, "train_speed(iter/s)": 0.206109 }, { "acc": 0.77186542, "epoch": 0.1966782834395369, "grad_norm": 4.53125, "learning_rate": 9.878033549019781e-06, "loss": 0.81778107, "memory(GiB)": 117.54, "step": 8430, "train_speed(iter/s)": 0.206226 }, { "acc": 0.7592206, "epoch": 0.1969115910118258, "grad_norm": 5.75, "learning_rate": 9.877618492242267e-06, "loss": 0.90185413, "memory(GiB)": 117.54, "step": 8440, "train_speed(iter/s)": 0.206345 }, { "acc": 0.77778916, "epoch": 0.19714489858411466, "grad_norm": 5.28125, "learning_rate": 9.877202739186132e-06, "loss": 0.79591818, "memory(GiB)": 117.54, "step": 8450, "train_speed(iter/s)": 0.20647 }, { "acc": 0.77902613, "epoch": 0.19737820615640356, "grad_norm": 9.125, "learning_rate": 9.876786289910721e-06, "loss": 0.79726973, "memory(GiB)": 117.54, "step": 8460, "train_speed(iter/s)": 0.206596 }, { "acc": 0.75946159, "epoch": 0.19761151372869246, "grad_norm": 8.6875, "learning_rate": 9.876369144475484e-06, "loss": 0.8990036, "memory(GiB)": 117.54, "step": 8470, "train_speed(iter/s)": 0.206715 }, { "acc": 0.74261923, "epoch": 0.19784482130098135, "grad_norm": 6.125, "learning_rate": 9.875951302939967e-06, "loss": 0.91385193, "memory(GiB)": 117.54, "step": 8480, "train_speed(iter/s)": 0.206839 }, { "acc": 0.78411551, "epoch": 0.19807812887327025, "grad_norm": 4.53125, "learning_rate": 9.87553276536382e-06, "loss": 0.79219027, "memory(GiB)": 117.54, "step": 8490, "train_speed(iter/s)": 0.206957 }, { "acc": 0.7668088, "epoch": 0.19831143644555912, "grad_norm": 9.125, "learning_rate": 9.875113531806785e-06, "loss": 0.83160515, "memory(GiB)": 117.54, "step": 8500, "train_speed(iter/s)": 0.207077 }, { "epoch": 0.19831143644555912, "eval_acc": 0.731228333486271, "eval_loss": 0.8559815883636475, "eval_runtime": 1270.6586, "eval_samples_per_second": 28.325, "eval_steps_per_second": 14.163, "step": 8500 }, { "acc": 0.75493984, "epoch": 0.19854474401784802, "grad_norm": 6.3125, "learning_rate": 9.874693602328711e-06, "loss": 0.896556, "memory(GiB)": 117.54, "step": 8510, "train_speed(iter/s)": 0.200895 }, { "acc": 0.78136353, "epoch": 0.19877805159013692, "grad_norm": 5.03125, "learning_rate": 9.874272976989541e-06, "loss": 0.78143473, "memory(GiB)": 117.54, "step": 8520, "train_speed(iter/s)": 0.201016 }, { "acc": 0.76888914, "epoch": 0.19901135916242582, "grad_norm": 6.0625, "learning_rate": 9.87385165584932e-06, "loss": 0.82431383, "memory(GiB)": 117.54, "step": 8530, "train_speed(iter/s)": 0.201146 }, { "acc": 0.75388536, "epoch": 0.19924466673471472, "grad_norm": 5.84375, "learning_rate": 9.873429638968191e-06, "loss": 0.89886112, "memory(GiB)": 117.54, "step": 8540, "train_speed(iter/s)": 0.20127 }, { "acc": 0.74465961, "epoch": 0.1994779743070036, "grad_norm": 5.5, "learning_rate": 9.873006926406397e-06, "loss": 0.93650312, "memory(GiB)": 117.54, "step": 8550, "train_speed(iter/s)": 0.201395 }, { "acc": 0.76810255, "epoch": 0.1997112818792925, "grad_norm": 11.0625, "learning_rate": 9.872583518224279e-06, "loss": 0.83152609, "memory(GiB)": 117.54, "step": 8560, "train_speed(iter/s)": 0.201522 }, { "acc": 0.75034895, "epoch": 0.1999445894515814, "grad_norm": 4.375, "learning_rate": 9.872159414482279e-06, "loss": 0.90507641, "memory(GiB)": 117.54, "step": 8570, "train_speed(iter/s)": 0.201644 }, { "acc": 0.76051202, "epoch": 0.2001778970238703, "grad_norm": 5.40625, "learning_rate": 9.871734615240938e-06, "loss": 0.88187637, "memory(GiB)": 117.54, "step": 8580, "train_speed(iter/s)": 0.201772 }, { "acc": 0.75299907, "epoch": 0.2004112045961592, "grad_norm": 6.875, "learning_rate": 9.871309120560897e-06, "loss": 0.88183479, "memory(GiB)": 117.54, "step": 8590, "train_speed(iter/s)": 0.201899 }, { "acc": 0.77017865, "epoch": 0.20064451216844806, "grad_norm": 7.375, "learning_rate": 9.870882930502894e-06, "loss": 0.84091225, "memory(GiB)": 117.54, "step": 8600, "train_speed(iter/s)": 0.202013 }, { "acc": 0.76628962, "epoch": 0.20087781974073696, "grad_norm": 6.84375, "learning_rate": 9.870456045127767e-06, "loss": 0.87175293, "memory(GiB)": 117.54, "step": 8610, "train_speed(iter/s)": 0.202136 }, { "acc": 0.77316351, "epoch": 0.20111112731302586, "grad_norm": 5.59375, "learning_rate": 9.870028464496455e-06, "loss": 0.83083458, "memory(GiB)": 117.54, "step": 8620, "train_speed(iter/s)": 0.202255 }, { "acc": 0.74215651, "epoch": 0.20134443488531475, "grad_norm": 7.46875, "learning_rate": 9.869600188669995e-06, "loss": 0.96017342, "memory(GiB)": 117.54, "step": 8630, "train_speed(iter/s)": 0.20237 }, { "acc": 0.76919737, "epoch": 0.20157774245760363, "grad_norm": 4.625, "learning_rate": 9.869171217709522e-06, "loss": 0.82330322, "memory(GiB)": 117.54, "step": 8640, "train_speed(iter/s)": 0.202492 }, { "acc": 0.76708584, "epoch": 0.20181105002989252, "grad_norm": 9.375, "learning_rate": 9.86874155167627e-06, "loss": 0.82590866, "memory(GiB)": 117.54, "step": 8650, "train_speed(iter/s)": 0.202616 }, { "acc": 0.7566463, "epoch": 0.20204435760218142, "grad_norm": 5.96875, "learning_rate": 9.868311190631578e-06, "loss": 0.8966464, "memory(GiB)": 117.54, "step": 8660, "train_speed(iter/s)": 0.202744 }, { "acc": 0.75115132, "epoch": 0.20227766517447032, "grad_norm": 4.4375, "learning_rate": 9.867880134636877e-06, "loss": 0.91379795, "memory(GiB)": 117.54, "step": 8670, "train_speed(iter/s)": 0.202873 }, { "acc": 0.78020339, "epoch": 0.20251097274675922, "grad_norm": 5.78125, "learning_rate": 9.867448383753702e-06, "loss": 0.78962541, "memory(GiB)": 117.54, "step": 8680, "train_speed(iter/s)": 0.202989 }, { "acc": 0.77505159, "epoch": 0.2027442803190481, "grad_norm": 10.125, "learning_rate": 9.867015938043685e-06, "loss": 0.82020912, "memory(GiB)": 117.54, "step": 8690, "train_speed(iter/s)": 0.203111 }, { "acc": 0.79113216, "epoch": 0.202977587891337, "grad_norm": 6.03125, "learning_rate": 9.866582797568556e-06, "loss": 0.7530036, "memory(GiB)": 117.54, "step": 8700, "train_speed(iter/s)": 0.203235 }, { "acc": 0.75312705, "epoch": 0.2032108954636259, "grad_norm": 7.125, "learning_rate": 9.866148962390146e-06, "loss": 0.90598717, "memory(GiB)": 117.54, "step": 8710, "train_speed(iter/s)": 0.203359 }, { "acc": 0.765062, "epoch": 0.2034442030359148, "grad_norm": 4.28125, "learning_rate": 9.865714432570384e-06, "loss": 0.86992626, "memory(GiB)": 117.54, "step": 8720, "train_speed(iter/s)": 0.203478 }, { "acc": 0.76311955, "epoch": 0.2036775106082037, "grad_norm": 6.25, "learning_rate": 9.8652792081713e-06, "loss": 0.85822725, "memory(GiB)": 117.54, "step": 8730, "train_speed(iter/s)": 0.203588 }, { "acc": 0.74687796, "epoch": 0.20391081818049256, "grad_norm": 6.71875, "learning_rate": 9.864843289255026e-06, "loss": 0.91143522, "memory(GiB)": 117.54, "step": 8740, "train_speed(iter/s)": 0.203712 }, { "acc": 0.75389738, "epoch": 0.20414412575278146, "grad_norm": 6.9375, "learning_rate": 9.864406675883784e-06, "loss": 0.90044785, "memory(GiB)": 117.54, "step": 8750, "train_speed(iter/s)": 0.203828 }, { "acc": 0.75677834, "epoch": 0.20437743332507036, "grad_norm": 7.6875, "learning_rate": 9.863969368119902e-06, "loss": 0.88022346, "memory(GiB)": 117.54, "step": 8760, "train_speed(iter/s)": 0.203942 }, { "acc": 0.76615176, "epoch": 0.20461074089735926, "grad_norm": 8.4375, "learning_rate": 9.863531366025804e-06, "loss": 0.8678463, "memory(GiB)": 117.54, "step": 8770, "train_speed(iter/s)": 0.204065 }, { "acc": 0.76859779, "epoch": 0.20484404846964815, "grad_norm": 6.6875, "learning_rate": 9.863092669664018e-06, "loss": 0.8375844, "memory(GiB)": 117.54, "step": 8780, "train_speed(iter/s)": 0.204184 }, { "acc": 0.76002398, "epoch": 0.20507735604193703, "grad_norm": 6.4375, "learning_rate": 9.862653279097166e-06, "loss": 0.86004581, "memory(GiB)": 117.54, "step": 8790, "train_speed(iter/s)": 0.204307 }, { "acc": 0.75623741, "epoch": 0.20531066361422592, "grad_norm": 10.0625, "learning_rate": 9.86221319438797e-06, "loss": 0.87964134, "memory(GiB)": 117.54, "step": 8800, "train_speed(iter/s)": 0.204424 }, { "acc": 0.74737983, "epoch": 0.20554397118651482, "grad_norm": 5.0625, "learning_rate": 9.861772415599256e-06, "loss": 0.93885403, "memory(GiB)": 117.54, "step": 8810, "train_speed(iter/s)": 0.204549 }, { "acc": 0.74907146, "epoch": 0.20577727875880372, "grad_norm": 6.125, "learning_rate": 9.861330942793939e-06, "loss": 0.90780516, "memory(GiB)": 117.54, "step": 8820, "train_speed(iter/s)": 0.204666 }, { "acc": 0.7763402, "epoch": 0.20601058633109262, "grad_norm": 4.96875, "learning_rate": 9.860888776035043e-06, "loss": 0.80836182, "memory(GiB)": 117.54, "step": 8830, "train_speed(iter/s)": 0.204762 }, { "acc": 0.74543486, "epoch": 0.2062438939033815, "grad_norm": 5.40625, "learning_rate": 9.860445915385687e-06, "loss": 0.91186695, "memory(GiB)": 117.54, "step": 8840, "train_speed(iter/s)": 0.204882 }, { "acc": 0.76045542, "epoch": 0.2064772014756704, "grad_norm": 5.3125, "learning_rate": 9.860002360909086e-06, "loss": 0.84907341, "memory(GiB)": 117.54, "step": 8850, "train_speed(iter/s)": 0.20499 }, { "acc": 0.74592304, "epoch": 0.2067105090479593, "grad_norm": 4.03125, "learning_rate": 9.859558112668563e-06, "loss": 0.89834366, "memory(GiB)": 117.54, "step": 8860, "train_speed(iter/s)": 0.20511 }, { "acc": 0.78678131, "epoch": 0.2069438166202482, "grad_norm": 6.28125, "learning_rate": 9.85911317072753e-06, "loss": 0.76607027, "memory(GiB)": 117.54, "step": 8870, "train_speed(iter/s)": 0.205225 }, { "acc": 0.76762028, "epoch": 0.2071771241925371, "grad_norm": 8.375, "learning_rate": 9.858667535149503e-06, "loss": 0.83826656, "memory(GiB)": 117.54, "step": 8880, "train_speed(iter/s)": 0.205346 }, { "acc": 0.78018045, "epoch": 0.20741043176482596, "grad_norm": 8.5625, "learning_rate": 9.858221205998097e-06, "loss": 0.79698944, "memory(GiB)": 117.54, "step": 8890, "train_speed(iter/s)": 0.205468 }, { "acc": 0.76134815, "epoch": 0.20764373933711486, "grad_norm": 7.9375, "learning_rate": 9.857774183337025e-06, "loss": 0.86685352, "memory(GiB)": 117.54, "step": 8900, "train_speed(iter/s)": 0.205593 }, { "acc": 0.77512984, "epoch": 0.20787704690940376, "grad_norm": 6.84375, "learning_rate": 9.8573264672301e-06, "loss": 0.8138916, "memory(GiB)": 117.54, "step": 8910, "train_speed(iter/s)": 0.205715 }, { "acc": 0.74891701, "epoch": 0.20811035448169266, "grad_norm": 5.4375, "learning_rate": 9.856878057741233e-06, "loss": 0.89883404, "memory(GiB)": 117.54, "step": 8920, "train_speed(iter/s)": 0.20584 }, { "acc": 0.76096349, "epoch": 0.20834366205398153, "grad_norm": 6.0, "learning_rate": 9.856428954934434e-06, "loss": 0.85659542, "memory(GiB)": 117.54, "step": 8930, "train_speed(iter/s)": 0.205957 }, { "acc": 0.76727762, "epoch": 0.20857696962627043, "grad_norm": 6.125, "learning_rate": 9.855979158873812e-06, "loss": 0.82931509, "memory(GiB)": 117.54, "step": 8940, "train_speed(iter/s)": 0.206077 }, { "acc": 0.76898785, "epoch": 0.20881027719855932, "grad_norm": 7.09375, "learning_rate": 9.855528669623576e-06, "loss": 0.81118364, "memory(GiB)": 117.54, "step": 8950, "train_speed(iter/s)": 0.206187 }, { "acc": 0.744419, "epoch": 0.20904358477084822, "grad_norm": 7.5, "learning_rate": 9.855077487248034e-06, "loss": 0.93352222, "memory(GiB)": 117.54, "step": 8960, "train_speed(iter/s)": 0.206303 }, { "acc": 0.7688417, "epoch": 0.20927689234313712, "grad_norm": 4.75, "learning_rate": 9.85462561181159e-06, "loss": 0.8121109, "memory(GiB)": 117.54, "step": 8970, "train_speed(iter/s)": 0.206412 }, { "acc": 0.7531055, "epoch": 0.209510199915426, "grad_norm": 5.96875, "learning_rate": 9.85417304337875e-06, "loss": 0.89911957, "memory(GiB)": 117.54, "step": 8980, "train_speed(iter/s)": 0.206523 }, { "acc": 0.74656286, "epoch": 0.2097435074877149, "grad_norm": 6.625, "learning_rate": 9.85371978201412e-06, "loss": 0.93082085, "memory(GiB)": 117.54, "step": 8990, "train_speed(iter/s)": 0.206644 }, { "acc": 0.7624054, "epoch": 0.2099768150600038, "grad_norm": 5.90625, "learning_rate": 9.8532658277824e-06, "loss": 0.86307392, "memory(GiB)": 117.54, "step": 9000, "train_speed(iter/s)": 0.206763 }, { "epoch": 0.2099768150600038, "eval_acc": 0.7314818418060042, "eval_loss": 0.8535876274108887, "eval_runtime": 1269.9932, "eval_samples_per_second": 28.34, "eval_steps_per_second": 14.17, "step": 9000 }, { "acc": 0.76916304, "epoch": 0.2102101226322927, "grad_norm": 5.125, "learning_rate": 9.852811180748391e-06, "loss": 0.83703365, "memory(GiB)": 117.54, "step": 9010, "train_speed(iter/s)": 0.200937 }, { "acc": 0.77324028, "epoch": 0.2104434302045816, "grad_norm": 8.75, "learning_rate": 9.852355840976996e-06, "loss": 0.8149888, "memory(GiB)": 117.54, "step": 9020, "train_speed(iter/s)": 0.201051 }, { "acc": 0.76231546, "epoch": 0.21067673777687046, "grad_norm": 7.28125, "learning_rate": 9.851899808533218e-06, "loss": 0.85619822, "memory(GiB)": 117.54, "step": 9030, "train_speed(iter/s)": 0.201166 }, { "acc": 0.76985378, "epoch": 0.21091004534915936, "grad_norm": 7.0, "learning_rate": 9.851443083482149e-06, "loss": 0.83200512, "memory(GiB)": 117.54, "step": 9040, "train_speed(iter/s)": 0.20128 }, { "acc": 0.7554831, "epoch": 0.21114335292144826, "grad_norm": 7.3125, "learning_rate": 9.850985665888988e-06, "loss": 0.89478712, "memory(GiB)": 117.54, "step": 9050, "train_speed(iter/s)": 0.201393 }, { "acc": 0.76672964, "epoch": 0.21137666049373716, "grad_norm": 5.65625, "learning_rate": 9.850527555819036e-06, "loss": 0.85093822, "memory(GiB)": 117.54, "step": 9060, "train_speed(iter/s)": 0.201505 }, { "acc": 0.76372004, "epoch": 0.21160996806602606, "grad_norm": 5.0625, "learning_rate": 9.850068753337683e-06, "loss": 0.86411572, "memory(GiB)": 117.54, "step": 9070, "train_speed(iter/s)": 0.201607 }, { "acc": 0.76573353, "epoch": 0.21184327563831493, "grad_norm": 5.25, "learning_rate": 9.849609258510423e-06, "loss": 0.83601494, "memory(GiB)": 117.54, "step": 9080, "train_speed(iter/s)": 0.201721 }, { "acc": 0.74441233, "epoch": 0.21207658321060383, "grad_norm": 5.4375, "learning_rate": 9.84914907140285e-06, "loss": 0.92128773, "memory(GiB)": 117.54, "step": 9090, "train_speed(iter/s)": 0.201835 }, { "acc": 0.76748171, "epoch": 0.21230989078289272, "grad_norm": 8.125, "learning_rate": 9.848688192080657e-06, "loss": 0.83393631, "memory(GiB)": 117.54, "step": 9100, "train_speed(iter/s)": 0.201944 }, { "acc": 0.7691443, "epoch": 0.21254319835518162, "grad_norm": 5.625, "learning_rate": 9.848226620609634e-06, "loss": 0.85348082, "memory(GiB)": 117.54, "step": 9110, "train_speed(iter/s)": 0.202057 }, { "acc": 0.75306101, "epoch": 0.21277650592747052, "grad_norm": 4.78125, "learning_rate": 9.847764357055669e-06, "loss": 0.90074921, "memory(GiB)": 117.54, "step": 9120, "train_speed(iter/s)": 0.202178 }, { "acc": 0.76210451, "epoch": 0.2130098134997594, "grad_norm": 5.71875, "learning_rate": 9.84730140148475e-06, "loss": 0.870508, "memory(GiB)": 117.54, "step": 9130, "train_speed(iter/s)": 0.202294 }, { "acc": 0.77442284, "epoch": 0.2132431210720483, "grad_norm": 6.0625, "learning_rate": 9.846837753962964e-06, "loss": 0.81348133, "memory(GiB)": 117.54, "step": 9140, "train_speed(iter/s)": 0.202414 }, { "acc": 0.75884104, "epoch": 0.2134764286443372, "grad_norm": 5.1875, "learning_rate": 9.846373414556495e-06, "loss": 0.85764046, "memory(GiB)": 117.54, "step": 9150, "train_speed(iter/s)": 0.202533 }, { "acc": 0.76104126, "epoch": 0.2137097362166261, "grad_norm": 20.75, "learning_rate": 9.84590838333163e-06, "loss": 0.86669235, "memory(GiB)": 117.54, "step": 9160, "train_speed(iter/s)": 0.202649 }, { "acc": 0.78756847, "epoch": 0.21394304378891496, "grad_norm": 4.8125, "learning_rate": 9.845442660354752e-06, "loss": 0.76561289, "memory(GiB)": 117.54, "step": 9170, "train_speed(iter/s)": 0.202771 }, { "acc": 0.76651459, "epoch": 0.21417635136120386, "grad_norm": 4.375, "learning_rate": 9.844976245692341e-06, "loss": 0.84633484, "memory(GiB)": 117.54, "step": 9180, "train_speed(iter/s)": 0.202893 }, { "acc": 0.75503793, "epoch": 0.21440965893349276, "grad_norm": 7.46875, "learning_rate": 9.84450913941098e-06, "loss": 0.88526068, "memory(GiB)": 117.54, "step": 9190, "train_speed(iter/s)": 0.202997 }, { "acc": 0.76714697, "epoch": 0.21464296650578166, "grad_norm": 6.0625, "learning_rate": 9.844041341577344e-06, "loss": 0.82248907, "memory(GiB)": 117.54, "step": 9200, "train_speed(iter/s)": 0.203109 }, { "acc": 0.74727135, "epoch": 0.21487627407807056, "grad_norm": 4.125, "learning_rate": 9.843572852258216e-06, "loss": 0.91963558, "memory(GiB)": 117.54, "step": 9210, "train_speed(iter/s)": 0.203216 }, { "acc": 0.7496871, "epoch": 0.21510958165035943, "grad_norm": 4.78125, "learning_rate": 9.843103671520469e-06, "loss": 0.92044659, "memory(GiB)": 117.54, "step": 9220, "train_speed(iter/s)": 0.203325 }, { "acc": 0.76813116, "epoch": 0.21534288922264833, "grad_norm": 7.75, "learning_rate": 9.842633799431081e-06, "loss": 0.83491936, "memory(GiB)": 117.54, "step": 9230, "train_speed(iter/s)": 0.203434 }, { "acc": 0.7559175, "epoch": 0.21557619679493722, "grad_norm": 7.75, "learning_rate": 9.842163236057123e-06, "loss": 0.86974545, "memory(GiB)": 117.54, "step": 9240, "train_speed(iter/s)": 0.203544 }, { "acc": 0.75992718, "epoch": 0.21580950436722612, "grad_norm": 4.46875, "learning_rate": 9.841691981465771e-06, "loss": 0.87016802, "memory(GiB)": 117.54, "step": 9250, "train_speed(iter/s)": 0.203653 }, { "acc": 0.74953775, "epoch": 0.21604281193951502, "grad_norm": 7.09375, "learning_rate": 9.841220035724295e-06, "loss": 0.92368717, "memory(GiB)": 117.54, "step": 9260, "train_speed(iter/s)": 0.203766 }, { "acc": 0.77228193, "epoch": 0.2162761195118039, "grad_norm": 4.53125, "learning_rate": 9.840747398900066e-06, "loss": 0.80582151, "memory(GiB)": 117.54, "step": 9270, "train_speed(iter/s)": 0.203884 }, { "acc": 0.74684772, "epoch": 0.2165094270840928, "grad_norm": 4.84375, "learning_rate": 9.840274071060552e-06, "loss": 0.92780819, "memory(GiB)": 117.54, "step": 9280, "train_speed(iter/s)": 0.203989 }, { "acc": 0.77230215, "epoch": 0.2167427346563817, "grad_norm": 7.28125, "learning_rate": 9.839800052273319e-06, "loss": 0.83060665, "memory(GiB)": 117.54, "step": 9290, "train_speed(iter/s)": 0.204102 }, { "acc": 0.7695189, "epoch": 0.2169760422286706, "grad_norm": 5.6875, "learning_rate": 9.839325342606034e-06, "loss": 0.83614769, "memory(GiB)": 117.54, "step": 9300, "train_speed(iter/s)": 0.204205 }, { "acc": 0.75516381, "epoch": 0.2172093498009595, "grad_norm": 4.90625, "learning_rate": 9.838849942126465e-06, "loss": 0.88942032, "memory(GiB)": 117.54, "step": 9310, "train_speed(iter/s)": 0.204315 }, { "acc": 0.75599709, "epoch": 0.21744265737324836, "grad_norm": 6.5625, "learning_rate": 9.83837385090247e-06, "loss": 0.90213356, "memory(GiB)": 117.54, "step": 9320, "train_speed(iter/s)": 0.204429 }, { "acc": 0.7904994, "epoch": 0.21767596494553726, "grad_norm": 5.40625, "learning_rate": 9.837897069002014e-06, "loss": 0.7411685, "memory(GiB)": 117.54, "step": 9330, "train_speed(iter/s)": 0.204543 }, { "acc": 0.76367722, "epoch": 0.21790927251782616, "grad_norm": 5.78125, "learning_rate": 9.837419596493158e-06, "loss": 0.86719389, "memory(GiB)": 117.54, "step": 9340, "train_speed(iter/s)": 0.204654 }, { "acc": 0.77815828, "epoch": 0.21814258009011506, "grad_norm": 4.59375, "learning_rate": 9.836941433444058e-06, "loss": 0.78728352, "memory(GiB)": 117.54, "step": 9350, "train_speed(iter/s)": 0.204767 }, { "acc": 0.75084362, "epoch": 0.21837588766240396, "grad_norm": 5.125, "learning_rate": 9.836462579922977e-06, "loss": 0.90360699, "memory(GiB)": 117.54, "step": 9360, "train_speed(iter/s)": 0.204881 }, { "acc": 0.76905317, "epoch": 0.21860919523469283, "grad_norm": 6.0, "learning_rate": 9.835983035998264e-06, "loss": 0.85155258, "memory(GiB)": 117.54, "step": 9370, "train_speed(iter/s)": 0.204998 }, { "acc": 0.79401808, "epoch": 0.21884250280698173, "grad_norm": 4.15625, "learning_rate": 9.835502801738379e-06, "loss": 0.7368948, "memory(GiB)": 117.54, "step": 9380, "train_speed(iter/s)": 0.205109 }, { "acc": 0.76240873, "epoch": 0.21907581037927062, "grad_norm": 12.5625, "learning_rate": 9.835021877211873e-06, "loss": 0.86140232, "memory(GiB)": 117.54, "step": 9390, "train_speed(iter/s)": 0.205222 }, { "acc": 0.77225218, "epoch": 0.21930911795155952, "grad_norm": 7.59375, "learning_rate": 9.834540262487399e-06, "loss": 0.79418178, "memory(GiB)": 117.54, "step": 9400, "train_speed(iter/s)": 0.205338 }, { "acc": 0.76001101, "epoch": 0.2195424255238484, "grad_norm": 5.53125, "learning_rate": 9.834057957633707e-06, "loss": 0.85294189, "memory(GiB)": 117.54, "step": 9410, "train_speed(iter/s)": 0.205444 }, { "acc": 0.75794129, "epoch": 0.2197757330961373, "grad_norm": 4.625, "learning_rate": 9.833574962719646e-06, "loss": 0.8874979, "memory(GiB)": 117.54, "step": 9420, "train_speed(iter/s)": 0.205545 }, { "acc": 0.75950623, "epoch": 0.2200090406684262, "grad_norm": 5.4375, "learning_rate": 9.833091277814163e-06, "loss": 0.86289253, "memory(GiB)": 117.54, "step": 9430, "train_speed(iter/s)": 0.205662 }, { "acc": 0.75737329, "epoch": 0.2202423482407151, "grad_norm": 5.4375, "learning_rate": 9.832606902986305e-06, "loss": 0.94861965, "memory(GiB)": 117.54, "step": 9440, "train_speed(iter/s)": 0.20577 }, { "acc": 0.78068657, "epoch": 0.220475655813004, "grad_norm": 8.25, "learning_rate": 9.832121838305214e-06, "loss": 0.7791955, "memory(GiB)": 117.54, "step": 9450, "train_speed(iter/s)": 0.205886 }, { "acc": 0.76481562, "epoch": 0.22070896338529286, "grad_norm": 5.5, "learning_rate": 9.831636083840135e-06, "loss": 0.84020872, "memory(GiB)": 117.54, "step": 9460, "train_speed(iter/s)": 0.205992 }, { "acc": 0.78759332, "epoch": 0.22094227095758176, "grad_norm": 5.8125, "learning_rate": 9.831149639660409e-06, "loss": 0.78061409, "memory(GiB)": 117.54, "step": 9470, "train_speed(iter/s)": 0.206099 }, { "acc": 0.76587334, "epoch": 0.22117557852987066, "grad_norm": 10.6875, "learning_rate": 9.830662505835476e-06, "loss": 0.84685173, "memory(GiB)": 117.54, "step": 9480, "train_speed(iter/s)": 0.206213 }, { "acc": 0.75680537, "epoch": 0.22140888610215956, "grad_norm": 6.6875, "learning_rate": 9.830174682434872e-06, "loss": 0.88343801, "memory(GiB)": 117.54, "step": 9490, "train_speed(iter/s)": 0.206329 }, { "acc": 0.76228046, "epoch": 0.22164219367444846, "grad_norm": 3.953125, "learning_rate": 9.829686169528237e-06, "loss": 0.87286196, "memory(GiB)": 117.54, "step": 9500, "train_speed(iter/s)": 0.206441 }, { "epoch": 0.22164219367444846, "eval_acc": 0.7321477215239133, "eval_loss": 0.8513048887252808, "eval_runtime": 1271.0526, "eval_samples_per_second": 28.316, "eval_steps_per_second": 14.158, "step": 9500 }, { "acc": 0.7736063, "epoch": 0.22187550124673733, "grad_norm": 4.78125, "learning_rate": 9.829196967185302e-06, "loss": 0.82306576, "memory(GiB)": 117.54, "step": 9510, "train_speed(iter/s)": 0.20093 }, { "acc": 0.77956977, "epoch": 0.22210880881902623, "grad_norm": 4.65625, "learning_rate": 9.828707075475905e-06, "loss": 0.78804388, "memory(GiB)": 117.54, "step": 9520, "train_speed(iter/s)": 0.201042 }, { "acc": 0.76058235, "epoch": 0.22234211639131513, "grad_norm": 8.6875, "learning_rate": 9.828216494469975e-06, "loss": 0.87693319, "memory(GiB)": 117.54, "step": 9530, "train_speed(iter/s)": 0.201149 }, { "acc": 0.77363548, "epoch": 0.22257542396360402, "grad_norm": 4.53125, "learning_rate": 9.827725224237542e-06, "loss": 0.83595715, "memory(GiB)": 117.54, "step": 9540, "train_speed(iter/s)": 0.201247 }, { "acc": 0.7730834, "epoch": 0.22280873153589292, "grad_norm": 8.8125, "learning_rate": 9.827233264848737e-06, "loss": 0.81986485, "memory(GiB)": 117.54, "step": 9550, "train_speed(iter/s)": 0.201349 }, { "acc": 0.74745684, "epoch": 0.2230420391081818, "grad_norm": 5.5625, "learning_rate": 9.826740616373785e-06, "loss": 0.89649019, "memory(GiB)": 117.54, "step": 9560, "train_speed(iter/s)": 0.201461 }, { "acc": 0.74992027, "epoch": 0.2232753466804707, "grad_norm": 8.9375, "learning_rate": 9.826247278883012e-06, "loss": 0.92695465, "memory(GiB)": 117.54, "step": 9570, "train_speed(iter/s)": 0.201565 }, { "acc": 0.78267574, "epoch": 0.2235086542527596, "grad_norm": 5.875, "learning_rate": 9.825753252446843e-06, "loss": 0.79742799, "memory(GiB)": 117.54, "step": 9580, "train_speed(iter/s)": 0.201669 }, { "acc": 0.78160505, "epoch": 0.2237419618250485, "grad_norm": 6.4375, "learning_rate": 9.825258537135798e-06, "loss": 0.8101265, "memory(GiB)": 117.54, "step": 9590, "train_speed(iter/s)": 0.201776 }, { "acc": 0.77349329, "epoch": 0.2239752693973374, "grad_norm": 4.71875, "learning_rate": 9.8247631330205e-06, "loss": 0.83796654, "memory(GiB)": 117.54, "step": 9600, "train_speed(iter/s)": 0.201886 }, { "acc": 0.78152819, "epoch": 0.22420857696962626, "grad_norm": 6.71875, "learning_rate": 9.824267040171666e-06, "loss": 0.80050898, "memory(GiB)": 117.54, "step": 9610, "train_speed(iter/s)": 0.201988 }, { "acc": 0.75929379, "epoch": 0.22444188454191516, "grad_norm": 4.90625, "learning_rate": 9.823770258660113e-06, "loss": 0.87012081, "memory(GiB)": 117.54, "step": 9620, "train_speed(iter/s)": 0.202098 }, { "acc": 0.78390207, "epoch": 0.22467519211420406, "grad_norm": 4.65625, "learning_rate": 9.823272788556757e-06, "loss": 0.76243534, "memory(GiB)": 117.54, "step": 9630, "train_speed(iter/s)": 0.202202 }, { "acc": 0.7557579, "epoch": 0.22490849968649296, "grad_norm": 7.53125, "learning_rate": 9.822774629932612e-06, "loss": 0.90164337, "memory(GiB)": 117.54, "step": 9640, "train_speed(iter/s)": 0.202301 }, { "acc": 0.76308393, "epoch": 0.22514180725878186, "grad_norm": 5.75, "learning_rate": 9.822275782858788e-06, "loss": 0.85689535, "memory(GiB)": 117.54, "step": 9650, "train_speed(iter/s)": 0.202409 }, { "acc": 0.7550004, "epoch": 0.22537511483107073, "grad_norm": 11.5625, "learning_rate": 9.821776247406498e-06, "loss": 0.8669714, "memory(GiB)": 117.54, "step": 9660, "train_speed(iter/s)": 0.202514 }, { "acc": 0.75697646, "epoch": 0.22560842240335963, "grad_norm": 5.03125, "learning_rate": 9.821276023647049e-06, "loss": 0.89136448, "memory(GiB)": 117.54, "step": 9670, "train_speed(iter/s)": 0.202619 }, { "acc": 0.77591777, "epoch": 0.22584172997564853, "grad_norm": 5.03125, "learning_rate": 9.820775111651849e-06, "loss": 0.80667782, "memory(GiB)": 117.54, "step": 9680, "train_speed(iter/s)": 0.202725 }, { "acc": 0.76816502, "epoch": 0.22607503754793742, "grad_norm": 5.96875, "learning_rate": 9.820273511492401e-06, "loss": 0.84152288, "memory(GiB)": 117.54, "step": 9690, "train_speed(iter/s)": 0.202834 }, { "acc": 0.76578364, "epoch": 0.2263083451202263, "grad_norm": 6.25, "learning_rate": 9.819771223240312e-06, "loss": 0.84982014, "memory(GiB)": 117.54, "step": 9700, "train_speed(iter/s)": 0.202942 }, { "acc": 0.76617751, "epoch": 0.2265416526925152, "grad_norm": 5.8125, "learning_rate": 9.819268246967279e-06, "loss": 0.82987022, "memory(GiB)": 117.54, "step": 9710, "train_speed(iter/s)": 0.203048 }, { "acc": 0.7701932, "epoch": 0.2267749602648041, "grad_norm": 7.21875, "learning_rate": 9.818764582745103e-06, "loss": 0.84979038, "memory(GiB)": 117.54, "step": 9720, "train_speed(iter/s)": 0.203157 }, { "acc": 0.77504597, "epoch": 0.227008267837093, "grad_norm": 5.25, "learning_rate": 9.818260230645684e-06, "loss": 0.81210556, "memory(GiB)": 117.54, "step": 9730, "train_speed(iter/s)": 0.203266 }, { "acc": 0.77041559, "epoch": 0.2272415754093819, "grad_norm": 5.28125, "learning_rate": 9.817755190741018e-06, "loss": 0.81857834, "memory(GiB)": 117.54, "step": 9740, "train_speed(iter/s)": 0.203372 }, { "acc": 0.76968975, "epoch": 0.22747488298167076, "grad_norm": 4.375, "learning_rate": 9.817249463103196e-06, "loss": 0.82577667, "memory(GiB)": 117.54, "step": 9750, "train_speed(iter/s)": 0.203475 }, { "acc": 0.79651542, "epoch": 0.22770819055395966, "grad_norm": 7.4375, "learning_rate": 9.816743047804413e-06, "loss": 0.73690844, "memory(GiB)": 117.54, "step": 9760, "train_speed(iter/s)": 0.203583 }, { "acc": 0.76188879, "epoch": 0.22794149812624856, "grad_norm": 4.5625, "learning_rate": 9.816235944916959e-06, "loss": 0.87093134, "memory(GiB)": 117.54, "step": 9770, "train_speed(iter/s)": 0.203682 }, { "acc": 0.76635122, "epoch": 0.22817480569853746, "grad_norm": 6.65625, "learning_rate": 9.815728154513224e-06, "loss": 0.84573898, "memory(GiB)": 117.54, "step": 9780, "train_speed(iter/s)": 0.203777 }, { "acc": 0.7656188, "epoch": 0.22840811327082636, "grad_norm": 8.5625, "learning_rate": 9.815219676665694e-06, "loss": 0.82872429, "memory(GiB)": 117.54, "step": 9790, "train_speed(iter/s)": 0.203881 }, { "acc": 0.75859795, "epoch": 0.22864142084311523, "grad_norm": 7.03125, "learning_rate": 9.814710511446954e-06, "loss": 0.88318329, "memory(GiB)": 117.54, "step": 9800, "train_speed(iter/s)": 0.203992 }, { "acc": 0.78200207, "epoch": 0.22887472841540413, "grad_norm": 5.375, "learning_rate": 9.814200658929686e-06, "loss": 0.78139582, "memory(GiB)": 117.54, "step": 9810, "train_speed(iter/s)": 0.204096 }, { "acc": 0.77523031, "epoch": 0.22910803598769303, "grad_norm": 6.53125, "learning_rate": 9.813690119186673e-06, "loss": 0.83075628, "memory(GiB)": 117.54, "step": 9820, "train_speed(iter/s)": 0.204201 }, { "acc": 0.76592264, "epoch": 0.22934134355998193, "grad_norm": 5.625, "learning_rate": 9.813178892290793e-06, "loss": 0.84281292, "memory(GiB)": 117.54, "step": 9830, "train_speed(iter/s)": 0.204306 }, { "acc": 0.75805759, "epoch": 0.22957465113227082, "grad_norm": 5.1875, "learning_rate": 9.812666978315026e-06, "loss": 0.88515167, "memory(GiB)": 117.54, "step": 9840, "train_speed(iter/s)": 0.204417 }, { "acc": 0.75032368, "epoch": 0.2298079587045597, "grad_norm": 5.03125, "learning_rate": 9.812154377332446e-06, "loss": 0.91492596, "memory(GiB)": 117.54, "step": 9850, "train_speed(iter/s)": 0.204526 }, { "acc": 0.77257566, "epoch": 0.2300412662768486, "grad_norm": 8.6875, "learning_rate": 9.811641089416225e-06, "loss": 0.81846771, "memory(GiB)": 117.54, "step": 9860, "train_speed(iter/s)": 0.20463 }, { "acc": 0.74543791, "epoch": 0.2302745738491375, "grad_norm": 7.96875, "learning_rate": 9.811127114639637e-06, "loss": 0.92871838, "memory(GiB)": 117.54, "step": 9870, "train_speed(iter/s)": 0.204741 }, { "acc": 0.79538689, "epoch": 0.2305078814214264, "grad_norm": 5.0625, "learning_rate": 9.810612453076052e-06, "loss": 0.72743883, "memory(GiB)": 117.54, "step": 9880, "train_speed(iter/s)": 0.204846 }, { "acc": 0.76809082, "epoch": 0.2307411889937153, "grad_norm": 5.75, "learning_rate": 9.810097104798934e-06, "loss": 0.84220142, "memory(GiB)": 117.54, "step": 9890, "train_speed(iter/s)": 0.204953 }, { "acc": 0.77071457, "epoch": 0.23097449656600416, "grad_norm": 7.28125, "learning_rate": 9.809581069881854e-06, "loss": 0.81765766, "memory(GiB)": 117.54, "step": 9900, "train_speed(iter/s)": 0.205051 }, { "acc": 0.76900225, "epoch": 0.23120780413829306, "grad_norm": 7.96875, "learning_rate": 9.809064348398474e-06, "loss": 0.84059877, "memory(GiB)": 117.54, "step": 9910, "train_speed(iter/s)": 0.205156 }, { "acc": 0.7770957, "epoch": 0.23144111171058196, "grad_norm": 7.5625, "learning_rate": 9.808546940422555e-06, "loss": 0.79744329, "memory(GiB)": 117.54, "step": 9920, "train_speed(iter/s)": 0.205261 }, { "acc": 0.78089809, "epoch": 0.23167441928287086, "grad_norm": 7.28125, "learning_rate": 9.808028846027954e-06, "loss": 0.79451566, "memory(GiB)": 117.54, "step": 9930, "train_speed(iter/s)": 0.205354 }, { "acc": 0.78215675, "epoch": 0.23190772685515973, "grad_norm": 5.09375, "learning_rate": 9.807510065288635e-06, "loss": 0.7874649, "memory(GiB)": 117.54, "step": 9940, "train_speed(iter/s)": 0.205452 }, { "acc": 0.7620882, "epoch": 0.23214103442744863, "grad_norm": 10.6875, "learning_rate": 9.806990598278651e-06, "loss": 0.8667222, "memory(GiB)": 117.54, "step": 9950, "train_speed(iter/s)": 0.205557 }, { "acc": 0.76581755, "epoch": 0.23237434199973753, "grad_norm": 6.125, "learning_rate": 9.806470445072156e-06, "loss": 0.83361053, "memory(GiB)": 123.09, "step": 9960, "train_speed(iter/s)": 0.205654 }, { "acc": 0.76487746, "epoch": 0.23260764957202643, "grad_norm": 5.1875, "learning_rate": 9.8059496057434e-06, "loss": 0.82635136, "memory(GiB)": 123.09, "step": 9970, "train_speed(iter/s)": 0.205754 }, { "acc": 0.75906668, "epoch": 0.23284095714431532, "grad_norm": 5.5, "learning_rate": 9.805428080366733e-06, "loss": 0.8668766, "memory(GiB)": 123.09, "step": 9980, "train_speed(iter/s)": 0.205859 }, { "acc": 0.75760293, "epoch": 0.2330742647166042, "grad_norm": 7.0, "learning_rate": 9.804905869016603e-06, "loss": 0.8750042, "memory(GiB)": 123.09, "step": 9990, "train_speed(iter/s)": 0.205958 }, { "acc": 0.75372882, "epoch": 0.2333075722888931, "grad_norm": 6.5625, "learning_rate": 9.804382971767559e-06, "loss": 0.92433701, "memory(GiB)": 123.09, "step": 10000, "train_speed(iter/s)": 0.20606 }, { "epoch": 0.2333075722888931, "eval_acc": 0.7324511948441245, "eval_loss": 0.8497293591499329, "eval_runtime": 1272.2316, "eval_samples_per_second": 28.29, "eval_steps_per_second": 14.145, "step": 10000 }, { "acc": 0.77208815, "epoch": 0.233540879861182, "grad_norm": 5.46875, "learning_rate": 9.803859388694238e-06, "loss": 0.82612524, "memory(GiB)": 123.09, "step": 10010, "train_speed(iter/s)": 0.200834 }, { "acc": 0.76006479, "epoch": 0.2337741874334709, "grad_norm": 7.21875, "learning_rate": 9.803335119871388e-06, "loss": 0.87721148, "memory(GiB)": 123.09, "step": 10020, "train_speed(iter/s)": 0.200935 }, { "acc": 0.74797354, "epoch": 0.2340074950057598, "grad_norm": 5.125, "learning_rate": 9.802810165373845e-06, "loss": 0.93788738, "memory(GiB)": 123.09, "step": 10030, "train_speed(iter/s)": 0.201046 }, { "acc": 0.75885124, "epoch": 0.23424080257804866, "grad_norm": 9.375, "learning_rate": 9.802284525276544e-06, "loss": 0.86786547, "memory(GiB)": 123.09, "step": 10040, "train_speed(iter/s)": 0.201152 }, { "acc": 0.77307854, "epoch": 0.23447411015033756, "grad_norm": 4.6875, "learning_rate": 9.801758199654522e-06, "loss": 0.81534319, "memory(GiB)": 123.09, "step": 10050, "train_speed(iter/s)": 0.201251 }, { "acc": 0.75475931, "epoch": 0.23470741772262646, "grad_norm": 5.71875, "learning_rate": 9.801231188582914e-06, "loss": 0.88743048, "memory(GiB)": 123.09, "step": 10060, "train_speed(iter/s)": 0.201359 }, { "acc": 0.7665451, "epoch": 0.23494072529491536, "grad_norm": 6.53125, "learning_rate": 9.800703492136948e-06, "loss": 0.84892178, "memory(GiB)": 123.09, "step": 10070, "train_speed(iter/s)": 0.201466 }, { "acc": 0.77399416, "epoch": 0.23517403286720426, "grad_norm": 4.53125, "learning_rate": 9.800175110391952e-06, "loss": 0.82023621, "memory(GiB)": 123.09, "step": 10080, "train_speed(iter/s)": 0.20157 }, { "acc": 0.75192294, "epoch": 0.23540734043949313, "grad_norm": 4.875, "learning_rate": 9.799646043423353e-06, "loss": 0.91079044, "memory(GiB)": 123.09, "step": 10090, "train_speed(iter/s)": 0.201671 }, { "acc": 0.76243572, "epoch": 0.23564064801178203, "grad_norm": 5.71875, "learning_rate": 9.799116291306677e-06, "loss": 0.85686026, "memory(GiB)": 123.09, "step": 10100, "train_speed(iter/s)": 0.201774 }, { "acc": 0.76248827, "epoch": 0.23587395558407093, "grad_norm": 5.3125, "learning_rate": 9.798585854117543e-06, "loss": 0.856462, "memory(GiB)": 123.09, "step": 10110, "train_speed(iter/s)": 0.201874 }, { "acc": 0.77080183, "epoch": 0.23610726315635983, "grad_norm": 4.4375, "learning_rate": 9.798054731931674e-06, "loss": 0.8349369, "memory(GiB)": 123.09, "step": 10120, "train_speed(iter/s)": 0.201974 }, { "acc": 0.76452069, "epoch": 0.23634057072864872, "grad_norm": 7.125, "learning_rate": 9.797522924824886e-06, "loss": 0.85023165, "memory(GiB)": 123.09, "step": 10130, "train_speed(iter/s)": 0.202084 }, { "acc": 0.77273831, "epoch": 0.2365738783009376, "grad_norm": 4.84375, "learning_rate": 9.796990432873093e-06, "loss": 0.8241353, "memory(GiB)": 123.09, "step": 10140, "train_speed(iter/s)": 0.202192 }, { "acc": 0.77139888, "epoch": 0.2368071858732265, "grad_norm": 7.34375, "learning_rate": 9.79645725615231e-06, "loss": 0.85567112, "memory(GiB)": 123.09, "step": 10150, "train_speed(iter/s)": 0.202304 }, { "acc": 0.76190987, "epoch": 0.2370404934455154, "grad_norm": 4.875, "learning_rate": 9.795923394738646e-06, "loss": 0.8605545, "memory(GiB)": 126.99, "step": 10160, "train_speed(iter/s)": 0.202404 }, { "acc": 0.76465502, "epoch": 0.2372738010178043, "grad_norm": 9.1875, "learning_rate": 9.795388848708312e-06, "loss": 0.86345005, "memory(GiB)": 126.99, "step": 10170, "train_speed(iter/s)": 0.202507 }, { "acc": 0.76446505, "epoch": 0.23750710859009316, "grad_norm": 5.5, "learning_rate": 9.794853618137612e-06, "loss": 0.86249762, "memory(GiB)": 126.99, "step": 10180, "train_speed(iter/s)": 0.202612 }, { "acc": 0.76790714, "epoch": 0.23774041616238206, "grad_norm": 7.15625, "learning_rate": 9.794317703102951e-06, "loss": 0.82229271, "memory(GiB)": 126.99, "step": 10190, "train_speed(iter/s)": 0.202713 }, { "acc": 0.76957502, "epoch": 0.23797372373467096, "grad_norm": 11.125, "learning_rate": 9.793781103680833e-06, "loss": 0.86161594, "memory(GiB)": 126.99, "step": 10200, "train_speed(iter/s)": 0.202818 }, { "acc": 0.7653698, "epoch": 0.23820703130695986, "grad_norm": 4.40625, "learning_rate": 9.793243819947851e-06, "loss": 0.84829473, "memory(GiB)": 126.99, "step": 10210, "train_speed(iter/s)": 0.202925 }, { "acc": 0.7630024, "epoch": 0.23844033887924876, "grad_norm": 6.875, "learning_rate": 9.79270585198071e-06, "loss": 0.84405813, "memory(GiB)": 126.99, "step": 10220, "train_speed(iter/s)": 0.203028 }, { "acc": 0.76894646, "epoch": 0.23867364645153763, "grad_norm": 6.6875, "learning_rate": 9.792167199856198e-06, "loss": 0.83470745, "memory(GiB)": 126.99, "step": 10230, "train_speed(iter/s)": 0.203128 }, { "acc": 0.77872562, "epoch": 0.23890695402382653, "grad_norm": 8.5625, "learning_rate": 9.791627863651212e-06, "loss": 0.78085318, "memory(GiB)": 126.99, "step": 10240, "train_speed(iter/s)": 0.203233 }, { "acc": 0.76503386, "epoch": 0.23914026159611543, "grad_norm": 7.28125, "learning_rate": 9.791087843442738e-06, "loss": 0.87600975, "memory(GiB)": 126.99, "step": 10250, "train_speed(iter/s)": 0.203335 }, { "acc": 0.7559289, "epoch": 0.23937356916840433, "grad_norm": 8.8125, "learning_rate": 9.790547139307869e-06, "loss": 0.90565586, "memory(GiB)": 126.99, "step": 10260, "train_speed(iter/s)": 0.203439 }, { "acc": 0.77096276, "epoch": 0.23960687674069323, "grad_norm": 6.53125, "learning_rate": 9.790005751323787e-06, "loss": 0.83454618, "memory(GiB)": 126.99, "step": 10270, "train_speed(iter/s)": 0.203535 }, { "acc": 0.77344503, "epoch": 0.2398401843129821, "grad_norm": 5.0625, "learning_rate": 9.789463679567775e-06, "loss": 0.82620983, "memory(GiB)": 126.99, "step": 10280, "train_speed(iter/s)": 0.203636 }, { "acc": 0.75782394, "epoch": 0.240073491885271, "grad_norm": 5.625, "learning_rate": 9.788920924117213e-06, "loss": 0.89050446, "memory(GiB)": 126.99, "step": 10290, "train_speed(iter/s)": 0.203738 }, { "acc": 0.76385994, "epoch": 0.2403067994575599, "grad_norm": 5.15625, "learning_rate": 9.788377485049583e-06, "loss": 0.85634632, "memory(GiB)": 126.99, "step": 10300, "train_speed(iter/s)": 0.203836 }, { "acc": 0.7727993, "epoch": 0.2405401070298488, "grad_norm": 4.875, "learning_rate": 9.787833362442456e-06, "loss": 0.81806173, "memory(GiB)": 126.99, "step": 10310, "train_speed(iter/s)": 0.203937 }, { "acc": 0.75147309, "epoch": 0.2407734146021377, "grad_norm": 6.15625, "learning_rate": 9.78728855637351e-06, "loss": 0.8997839, "memory(GiB)": 126.99, "step": 10320, "train_speed(iter/s)": 0.204036 }, { "acc": 0.77584658, "epoch": 0.24100672217442656, "grad_norm": 7.5, "learning_rate": 9.786743066920509e-06, "loss": 0.79813576, "memory(GiB)": 126.99, "step": 10330, "train_speed(iter/s)": 0.204138 }, { "acc": 0.7439743, "epoch": 0.24124002974671546, "grad_norm": 4.90625, "learning_rate": 9.786196894161329e-06, "loss": 0.9293148, "memory(GiB)": 126.99, "step": 10340, "train_speed(iter/s)": 0.20424 }, { "acc": 0.77829103, "epoch": 0.24147333731900436, "grad_norm": 6.625, "learning_rate": 9.78565003817393e-06, "loss": 0.78798056, "memory(GiB)": 126.99, "step": 10350, "train_speed(iter/s)": 0.204339 }, { "acc": 0.75890827, "epoch": 0.24170664489129326, "grad_norm": 5.5625, "learning_rate": 9.78510249903638e-06, "loss": 0.87458324, "memory(GiB)": 126.99, "step": 10360, "train_speed(iter/s)": 0.204444 }, { "acc": 0.76287951, "epoch": 0.24193995246358216, "grad_norm": 4.84375, "learning_rate": 9.784554276826839e-06, "loss": 0.84869385, "memory(GiB)": 126.99, "step": 10370, "train_speed(iter/s)": 0.204546 }, { "acc": 0.74085793, "epoch": 0.24217326003587103, "grad_norm": 6.625, "learning_rate": 9.784005371623564e-06, "loss": 0.93879375, "memory(GiB)": 126.99, "step": 10380, "train_speed(iter/s)": 0.204649 }, { "acc": 0.77869744, "epoch": 0.24240656760815993, "grad_norm": 5.15625, "learning_rate": 9.783455783504911e-06, "loss": 0.78200302, "memory(GiB)": 126.99, "step": 10390, "train_speed(iter/s)": 0.20475 }, { "acc": 0.7931366, "epoch": 0.24263987518044883, "grad_norm": 6.40625, "learning_rate": 9.782905512549336e-06, "loss": 0.73280854, "memory(GiB)": 126.99, "step": 10400, "train_speed(iter/s)": 0.204851 }, { "acc": 0.75236487, "epoch": 0.24287318275273773, "grad_norm": 4.375, "learning_rate": 9.78235455883539e-06, "loss": 0.90658245, "memory(GiB)": 126.99, "step": 10410, "train_speed(iter/s)": 0.204943 }, { "acc": 0.7735096, "epoch": 0.24310649032502663, "grad_norm": 8.75, "learning_rate": 9.781802922441716e-06, "loss": 0.81277504, "memory(GiB)": 126.99, "step": 10420, "train_speed(iter/s)": 0.205038 }, { "acc": 0.76845932, "epoch": 0.2433397978973155, "grad_norm": 5.84375, "learning_rate": 9.781250603447069e-06, "loss": 0.85649538, "memory(GiB)": 126.99, "step": 10430, "train_speed(iter/s)": 0.205138 }, { "acc": 0.76778679, "epoch": 0.2435731054696044, "grad_norm": 7.40625, "learning_rate": 9.780697601930282e-06, "loss": 0.84032097, "memory(GiB)": 126.99, "step": 10440, "train_speed(iter/s)": 0.205233 }, { "acc": 0.77375274, "epoch": 0.2438064130418933, "grad_norm": 4.96875, "learning_rate": 9.780143917970304e-06, "loss": 0.81406355, "memory(GiB)": 126.99, "step": 10450, "train_speed(iter/s)": 0.205333 }, { "acc": 0.76430569, "epoch": 0.2440397206141822, "grad_norm": 6.09375, "learning_rate": 9.77958955164617e-06, "loss": 0.84869604, "memory(GiB)": 126.99, "step": 10460, "train_speed(iter/s)": 0.205429 }, { "acc": 0.75566778, "epoch": 0.24427302818647106, "grad_norm": 6.5, "learning_rate": 9.779034503037016e-06, "loss": 0.88865891, "memory(GiB)": 126.99, "step": 10470, "train_speed(iter/s)": 0.205519 }, { "acc": 0.77346563, "epoch": 0.24450633575875996, "grad_norm": 6.09375, "learning_rate": 9.778478772222075e-06, "loss": 0.82391491, "memory(GiB)": 126.99, "step": 10480, "train_speed(iter/s)": 0.20562 }, { "acc": 0.7746438, "epoch": 0.24473964333104886, "grad_norm": 7.84375, "learning_rate": 9.777922359280677e-06, "loss": 0.80834398, "memory(GiB)": 126.99, "step": 10490, "train_speed(iter/s)": 0.205719 }, { "acc": 0.75339031, "epoch": 0.24497295090333776, "grad_norm": 6.875, "learning_rate": 9.777365264292252e-06, "loss": 0.88665466, "memory(GiB)": 126.99, "step": 10500, "train_speed(iter/s)": 0.205821 }, { "epoch": 0.24497295090333776, "eval_acc": 0.7332850658617177, "eval_loss": 0.8476359844207764, "eval_runtime": 1270.6663, "eval_samples_per_second": 28.325, "eval_steps_per_second": 14.163, "step": 10500 }, { "acc": 0.77165108, "epoch": 0.24520625847562666, "grad_norm": 5.03125, "learning_rate": 9.77680748733632e-06, "loss": 0.81288023, "memory(GiB)": 126.99, "step": 10510, "train_speed(iter/s)": 0.200861 }, { "acc": 0.75663023, "epoch": 0.24543956604791553, "grad_norm": 4.46875, "learning_rate": 9.77624902849251e-06, "loss": 0.89590054, "memory(GiB)": 126.99, "step": 10520, "train_speed(iter/s)": 0.200959 }, { "acc": 0.76394377, "epoch": 0.24567287362020443, "grad_norm": 5.53125, "learning_rate": 9.775689887840537e-06, "loss": 0.86777287, "memory(GiB)": 126.99, "step": 10530, "train_speed(iter/s)": 0.201059 }, { "acc": 0.76071887, "epoch": 0.24590618119249333, "grad_norm": 26.5, "learning_rate": 9.775130065460222e-06, "loss": 0.88518524, "memory(GiB)": 126.99, "step": 10540, "train_speed(iter/s)": 0.201157 }, { "acc": 0.77137909, "epoch": 0.24613948876478223, "grad_norm": 7.40625, "learning_rate": 9.774569561431474e-06, "loss": 0.83860283, "memory(GiB)": 126.99, "step": 10550, "train_speed(iter/s)": 0.201256 }, { "acc": 0.7849823, "epoch": 0.24637279633707113, "grad_norm": 5.53125, "learning_rate": 9.77400837583431e-06, "loss": 0.7651825, "memory(GiB)": 126.99, "step": 10560, "train_speed(iter/s)": 0.201362 }, { "acc": 0.76274223, "epoch": 0.24660610390936, "grad_norm": 7.5, "learning_rate": 9.773446508748836e-06, "loss": 0.86177063, "memory(GiB)": 126.99, "step": 10570, "train_speed(iter/s)": 0.201458 }, { "acc": 0.75613794, "epoch": 0.2468394114816489, "grad_norm": 5.96875, "learning_rate": 9.772883960255261e-06, "loss": 0.89295292, "memory(GiB)": 126.99, "step": 10580, "train_speed(iter/s)": 0.201555 }, { "acc": 0.76609902, "epoch": 0.2470727190539378, "grad_norm": 6.1875, "learning_rate": 9.772320730433886e-06, "loss": 0.84143562, "memory(GiB)": 126.99, "step": 10590, "train_speed(iter/s)": 0.201655 }, { "acc": 0.75564928, "epoch": 0.2473060266262267, "grad_norm": 6.3125, "learning_rate": 9.771756819365114e-06, "loss": 0.88248844, "memory(GiB)": 126.99, "step": 10600, "train_speed(iter/s)": 0.201757 }, { "acc": 0.7873702, "epoch": 0.2475393341985156, "grad_norm": 7.78125, "learning_rate": 9.771192227129442e-06, "loss": 0.78887529, "memory(GiB)": 126.99, "step": 10610, "train_speed(iter/s)": 0.201848 }, { "acc": 0.78069005, "epoch": 0.24777264177080446, "grad_norm": 4.65625, "learning_rate": 9.770626953807468e-06, "loss": 0.78519044, "memory(GiB)": 126.99, "step": 10620, "train_speed(iter/s)": 0.201946 }, { "acc": 0.76918497, "epoch": 0.24800594934309336, "grad_norm": 6.0, "learning_rate": 9.770060999479878e-06, "loss": 0.83556156, "memory(GiB)": 126.99, "step": 10630, "train_speed(iter/s)": 0.202046 }, { "acc": 0.76681919, "epoch": 0.24823925691538226, "grad_norm": 4.875, "learning_rate": 9.769494364227468e-06, "loss": 0.83850193, "memory(GiB)": 126.99, "step": 10640, "train_speed(iter/s)": 0.202148 }, { "acc": 0.78110733, "epoch": 0.24847256448767116, "grad_norm": 4.84375, "learning_rate": 9.768927048131122e-06, "loss": 0.77184925, "memory(GiB)": 126.99, "step": 10650, "train_speed(iter/s)": 0.202252 }, { "acc": 0.76969419, "epoch": 0.24870587205996006, "grad_norm": 5.875, "learning_rate": 9.768359051271827e-06, "loss": 0.85380192, "memory(GiB)": 126.99, "step": 10660, "train_speed(iter/s)": 0.202351 }, { "acc": 0.7685781, "epoch": 0.24893917963224893, "grad_norm": 11.375, "learning_rate": 9.767790373730663e-06, "loss": 0.84533386, "memory(GiB)": 126.99, "step": 10670, "train_speed(iter/s)": 0.202452 }, { "acc": 0.77039485, "epoch": 0.24917248720453783, "grad_norm": 5.6875, "learning_rate": 9.767221015588807e-06, "loss": 0.84062462, "memory(GiB)": 126.99, "step": 10680, "train_speed(iter/s)": 0.202549 }, { "acc": 0.77639532, "epoch": 0.24940579477682673, "grad_norm": 5.84375, "learning_rate": 9.766650976927536e-06, "loss": 0.81324234, "memory(GiB)": 126.99, "step": 10690, "train_speed(iter/s)": 0.202647 }, { "acc": 0.77875633, "epoch": 0.24963910234911563, "grad_norm": 7.96875, "learning_rate": 9.766080257828223e-06, "loss": 0.80693998, "memory(GiB)": 126.99, "step": 10700, "train_speed(iter/s)": 0.202741 }, { "acc": 0.7646112, "epoch": 0.2498724099214045, "grad_norm": 6.28125, "learning_rate": 9.765508858372337e-06, "loss": 0.85661631, "memory(GiB)": 126.99, "step": 10710, "train_speed(iter/s)": 0.202837 }, { "acc": 0.76114054, "epoch": 0.2501057174936934, "grad_norm": 6.65625, "learning_rate": 9.764936778641448e-06, "loss": 0.88623295, "memory(GiB)": 126.99, "step": 10720, "train_speed(iter/s)": 0.202941 }, { "acc": 0.77698221, "epoch": 0.2503390250659823, "grad_norm": 7.75, "learning_rate": 9.764364018717215e-06, "loss": 0.79923716, "memory(GiB)": 126.99, "step": 10730, "train_speed(iter/s)": 0.203033 }, { "acc": 0.74807673, "epoch": 0.25057233263827117, "grad_norm": 6.4375, "learning_rate": 9.763790578681404e-06, "loss": 0.92849827, "memory(GiB)": 126.99, "step": 10740, "train_speed(iter/s)": 0.20313 }, { "acc": 0.76602993, "epoch": 0.25080564021056007, "grad_norm": 5.9375, "learning_rate": 9.763216458615871e-06, "loss": 0.8343996, "memory(GiB)": 126.99, "step": 10750, "train_speed(iter/s)": 0.203226 }, { "acc": 0.77881093, "epoch": 0.25103894778284896, "grad_norm": 5.25, "learning_rate": 9.762641658602575e-06, "loss": 0.7768116, "memory(GiB)": 126.99, "step": 10760, "train_speed(iter/s)": 0.203323 }, { "acc": 0.74730372, "epoch": 0.25127225535513786, "grad_norm": 5.84375, "learning_rate": 9.762066178723562e-06, "loss": 0.90061388, "memory(GiB)": 126.99, "step": 10770, "train_speed(iter/s)": 0.203419 }, { "acc": 0.77457342, "epoch": 0.25150556292742676, "grad_norm": 5.34375, "learning_rate": 9.761490019060988e-06, "loss": 0.80694494, "memory(GiB)": 126.99, "step": 10780, "train_speed(iter/s)": 0.203511 }, { "acc": 0.76837416, "epoch": 0.25173887049971566, "grad_norm": 5.34375, "learning_rate": 9.760913179697095e-06, "loss": 0.82134075, "memory(GiB)": 126.99, "step": 10790, "train_speed(iter/s)": 0.203603 }, { "acc": 0.75787249, "epoch": 0.25197217807200456, "grad_norm": 6.125, "learning_rate": 9.76033566071423e-06, "loss": 0.88264313, "memory(GiB)": 126.99, "step": 10800, "train_speed(iter/s)": 0.203694 }, { "acc": 0.75982161, "epoch": 0.25220548564429346, "grad_norm": 4.65625, "learning_rate": 9.759757462194832e-06, "loss": 0.8837141, "memory(GiB)": 126.99, "step": 10810, "train_speed(iter/s)": 0.203783 }, { "acc": 0.78241043, "epoch": 0.25243879321658236, "grad_norm": 6.09375, "learning_rate": 9.759178584221439e-06, "loss": 0.79735069, "memory(GiB)": 126.99, "step": 10820, "train_speed(iter/s)": 0.203875 }, { "acc": 0.76798344, "epoch": 0.2526721007888712, "grad_norm": 7.125, "learning_rate": 9.758599026876685e-06, "loss": 0.83385935, "memory(GiB)": 126.99, "step": 10830, "train_speed(iter/s)": 0.203966 }, { "acc": 0.77368031, "epoch": 0.2529054083611601, "grad_norm": 7.65625, "learning_rate": 9.758018790243304e-06, "loss": 0.8143096, "memory(GiB)": 126.99, "step": 10840, "train_speed(iter/s)": 0.204058 }, { "acc": 0.75938997, "epoch": 0.253138715933449, "grad_norm": 5.59375, "learning_rate": 9.757437874404121e-06, "loss": 0.85309505, "memory(GiB)": 126.99, "step": 10850, "train_speed(iter/s)": 0.204149 }, { "acc": 0.78056459, "epoch": 0.2533720235057379, "grad_norm": 5.625, "learning_rate": 9.756856279442064e-06, "loss": 0.77668962, "memory(GiB)": 126.99, "step": 10860, "train_speed(iter/s)": 0.204244 }, { "acc": 0.74807405, "epoch": 0.2536053310780268, "grad_norm": 6.0625, "learning_rate": 9.756274005440156e-06, "loss": 0.92297764, "memory(GiB)": 126.99, "step": 10870, "train_speed(iter/s)": 0.204338 }, { "acc": 0.76769743, "epoch": 0.2538386386503157, "grad_norm": 4.625, "learning_rate": 9.755691052481515e-06, "loss": 0.81955452, "memory(GiB)": 126.99, "step": 10880, "train_speed(iter/s)": 0.204435 }, { "acc": 0.7821074, "epoch": 0.2540719462226046, "grad_norm": 5.78125, "learning_rate": 9.755107420649357e-06, "loss": 0.77177048, "memory(GiB)": 126.99, "step": 10890, "train_speed(iter/s)": 0.204532 }, { "acc": 0.76513309, "epoch": 0.2543052537948935, "grad_norm": 5.1875, "learning_rate": 9.754523110026997e-06, "loss": 0.84079628, "memory(GiB)": 126.99, "step": 10900, "train_speed(iter/s)": 0.204625 }, { "acc": 0.77776675, "epoch": 0.2545385613671824, "grad_norm": 7.125, "learning_rate": 9.753938120697843e-06, "loss": 0.79896116, "memory(GiB)": 126.99, "step": 10910, "train_speed(iter/s)": 0.204718 }, { "acc": 0.77102752, "epoch": 0.2547718689394713, "grad_norm": 5.34375, "learning_rate": 9.753352452745406e-06, "loss": 0.8230175, "memory(GiB)": 126.99, "step": 10920, "train_speed(iter/s)": 0.204808 }, { "acc": 0.7826086, "epoch": 0.25500517651176013, "grad_norm": 8.3125, "learning_rate": 9.752766106253285e-06, "loss": 0.77752132, "memory(GiB)": 126.99, "step": 10930, "train_speed(iter/s)": 0.204896 }, { "acc": 0.76253233, "epoch": 0.25523848408404903, "grad_norm": 4.75, "learning_rate": 9.752179081305184e-06, "loss": 0.85609999, "memory(GiB)": 126.99, "step": 10940, "train_speed(iter/s)": 0.204985 }, { "acc": 0.78686914, "epoch": 0.25547179165633793, "grad_norm": 5.65625, "learning_rate": 9.751591377984899e-06, "loss": 0.78415165, "memory(GiB)": 126.99, "step": 10950, "train_speed(iter/s)": 0.205081 }, { "acc": 0.76985493, "epoch": 0.25570509922862683, "grad_norm": 13.25, "learning_rate": 9.751002996376324e-06, "loss": 0.87296238, "memory(GiB)": 126.99, "step": 10960, "train_speed(iter/s)": 0.205177 }, { "acc": 0.77404528, "epoch": 0.25593840680091573, "grad_norm": 4.375, "learning_rate": 9.750413936563454e-06, "loss": 0.80378752, "memory(GiB)": 126.99, "step": 10970, "train_speed(iter/s)": 0.20527 }, { "acc": 0.76339779, "epoch": 0.25617171437320463, "grad_norm": 6.125, "learning_rate": 9.749824198630371e-06, "loss": 0.87236614, "memory(GiB)": 126.99, "step": 10980, "train_speed(iter/s)": 0.205367 }, { "acc": 0.76326017, "epoch": 0.2564050219454935, "grad_norm": 4.78125, "learning_rate": 9.749233782661267e-06, "loss": 0.84402294, "memory(GiB)": 126.99, "step": 10990, "train_speed(iter/s)": 0.20546 }, { "acc": 0.78211966, "epoch": 0.2566383295177824, "grad_norm": 4.4375, "learning_rate": 9.74864268874042e-06, "loss": 0.79133849, "memory(GiB)": 126.99, "step": 11000, "train_speed(iter/s)": 0.20555 }, { "epoch": 0.2566383295177824, "eval_acc": 0.73350094028686, "eval_loss": 0.8474711775779724, "eval_runtime": 1269.9892, "eval_samples_per_second": 28.34, "eval_steps_per_second": 14.17, "step": 11000 }, { "acc": 0.76290751, "epoch": 0.2568716370900713, "grad_norm": 4.28125, "learning_rate": 9.748050916952206e-06, "loss": 0.85341339, "memory(GiB)": 126.99, "step": 11010, "train_speed(iter/s)": 0.200816 }, { "acc": 0.76456928, "epoch": 0.2571049446623602, "grad_norm": 4.84375, "learning_rate": 9.747458467381104e-06, "loss": 0.83613319, "memory(GiB)": 126.99, "step": 11020, "train_speed(iter/s)": 0.200915 }, { "acc": 0.77785635, "epoch": 0.25733825223464907, "grad_norm": 6.875, "learning_rate": 9.746865340111686e-06, "loss": 0.78845282, "memory(GiB)": 126.99, "step": 11030, "train_speed(iter/s)": 0.201015 }, { "acc": 0.77325573, "epoch": 0.25757155980693797, "grad_norm": 4.6875, "learning_rate": 9.74627153522862e-06, "loss": 0.82584515, "memory(GiB)": 126.99, "step": 11040, "train_speed(iter/s)": 0.201115 }, { "acc": 0.75915756, "epoch": 0.25780486737922687, "grad_norm": 7.75, "learning_rate": 9.74567705281667e-06, "loss": 0.86543064, "memory(GiB)": 126.99, "step": 11050, "train_speed(iter/s)": 0.201208 }, { "acc": 0.75994439, "epoch": 0.25803817495151576, "grad_norm": 5.46875, "learning_rate": 9.745081892960699e-06, "loss": 0.87644157, "memory(GiB)": 126.99, "step": 11060, "train_speed(iter/s)": 0.201305 }, { "acc": 0.75638137, "epoch": 0.25827148252380466, "grad_norm": 6.21875, "learning_rate": 9.744486055745667e-06, "loss": 0.90656376, "memory(GiB)": 126.99, "step": 11070, "train_speed(iter/s)": 0.201399 }, { "acc": 0.76791043, "epoch": 0.25850479009609356, "grad_norm": 4.8125, "learning_rate": 9.743889541256628e-06, "loss": 0.85354557, "memory(GiB)": 126.99, "step": 11080, "train_speed(iter/s)": 0.201493 }, { "acc": 0.78517094, "epoch": 0.25873809766838246, "grad_norm": 10.5, "learning_rate": 9.743292349578737e-06, "loss": 0.79755116, "memory(GiB)": 126.99, "step": 11090, "train_speed(iter/s)": 0.201588 }, { "acc": 0.77367783, "epoch": 0.25897140524067136, "grad_norm": 6.375, "learning_rate": 9.742694480797239e-06, "loss": 0.8688302, "memory(GiB)": 126.99, "step": 11100, "train_speed(iter/s)": 0.201679 }, { "acc": 0.76761403, "epoch": 0.25920471281296026, "grad_norm": 5.71875, "learning_rate": 9.742095934997482e-06, "loss": 0.84129333, "memory(GiB)": 126.99, "step": 11110, "train_speed(iter/s)": 0.201772 }, { "acc": 0.7736413, "epoch": 0.2594380203852491, "grad_norm": 5.28125, "learning_rate": 9.741496712264908e-06, "loss": 0.82993221, "memory(GiB)": 126.99, "step": 11120, "train_speed(iter/s)": 0.20186 }, { "acc": 0.80244122, "epoch": 0.259671327957538, "grad_norm": 9.8125, "learning_rate": 9.740896812685057e-06, "loss": 0.70631914, "memory(GiB)": 126.99, "step": 11130, "train_speed(iter/s)": 0.201957 }, { "acc": 0.78163109, "epoch": 0.2599046355298269, "grad_norm": 5.21875, "learning_rate": 9.740296236343561e-06, "loss": 0.7652422, "memory(GiB)": 126.99, "step": 11140, "train_speed(iter/s)": 0.202052 }, { "acc": 0.79088631, "epoch": 0.2601379431021158, "grad_norm": 6.8125, "learning_rate": 9.739694983326155e-06, "loss": 0.74674163, "memory(GiB)": 126.99, "step": 11150, "train_speed(iter/s)": 0.20215 }, { "acc": 0.75785141, "epoch": 0.2603712506744047, "grad_norm": 10.1875, "learning_rate": 9.739093053718669e-06, "loss": 0.86176929, "memory(GiB)": 126.99, "step": 11160, "train_speed(iter/s)": 0.20224 }, { "acc": 0.77965183, "epoch": 0.2606045582466936, "grad_norm": 5.71875, "learning_rate": 9.738490447607025e-06, "loss": 0.78320961, "memory(GiB)": 126.99, "step": 11170, "train_speed(iter/s)": 0.202336 }, { "acc": 0.77159529, "epoch": 0.2608378658189825, "grad_norm": 10.3125, "learning_rate": 9.737887165077246e-06, "loss": 0.82931585, "memory(GiB)": 126.99, "step": 11180, "train_speed(iter/s)": 0.202431 }, { "acc": 0.76718621, "epoch": 0.2610711733912714, "grad_norm": 5.09375, "learning_rate": 9.73728320621545e-06, "loss": 0.8547121, "memory(GiB)": 126.99, "step": 11190, "train_speed(iter/s)": 0.202523 }, { "acc": 0.76389389, "epoch": 0.2613044809635603, "grad_norm": 5.09375, "learning_rate": 9.736678571107854e-06, "loss": 0.86680279, "memory(GiB)": 126.99, "step": 11200, "train_speed(iter/s)": 0.202609 }, { "acc": 0.77421455, "epoch": 0.2615377885358492, "grad_norm": 5.28125, "learning_rate": 9.736073259840766e-06, "loss": 0.80106392, "memory(GiB)": 126.99, "step": 11210, "train_speed(iter/s)": 0.202707 }, { "acc": 0.74991431, "epoch": 0.26177109610813803, "grad_norm": 6.28125, "learning_rate": 9.735467272500597e-06, "loss": 0.92689228, "memory(GiB)": 126.99, "step": 11220, "train_speed(iter/s)": 0.202804 }, { "acc": 0.78649111, "epoch": 0.26200440368042693, "grad_norm": 10.5, "learning_rate": 9.73486060917385e-06, "loss": 0.78013344, "memory(GiB)": 126.99, "step": 11230, "train_speed(iter/s)": 0.202899 }, { "acc": 0.76939349, "epoch": 0.26223771125271583, "grad_norm": 9.375, "learning_rate": 9.734253269947128e-06, "loss": 0.84294491, "memory(GiB)": 126.99, "step": 11240, "train_speed(iter/s)": 0.20299 }, { "acc": 0.74981251, "epoch": 0.26247101882500473, "grad_norm": 4.0625, "learning_rate": 9.733645254907126e-06, "loss": 0.91040516, "memory(GiB)": 126.99, "step": 11250, "train_speed(iter/s)": 0.203082 }, { "acc": 0.77205968, "epoch": 0.26270432639729363, "grad_norm": 5.96875, "learning_rate": 9.73303656414064e-06, "loss": 0.80362463, "memory(GiB)": 126.99, "step": 11260, "train_speed(iter/s)": 0.203173 }, { "acc": 0.78065724, "epoch": 0.26293763396958253, "grad_norm": 10.25, "learning_rate": 9.732427197734557e-06, "loss": 0.77598629, "memory(GiB)": 126.99, "step": 11270, "train_speed(iter/s)": 0.203266 }, { "acc": 0.77076845, "epoch": 0.26317094154187143, "grad_norm": 5.84375, "learning_rate": 9.73181715577587e-06, "loss": 0.82357779, "memory(GiB)": 126.99, "step": 11280, "train_speed(iter/s)": 0.203349 }, { "acc": 0.78687391, "epoch": 0.2634042491141603, "grad_norm": 5.0625, "learning_rate": 9.731206438351655e-06, "loss": 0.77638292, "memory(GiB)": 126.99, "step": 11290, "train_speed(iter/s)": 0.203439 }, { "acc": 0.77242026, "epoch": 0.2636375566864492, "grad_norm": 6.4375, "learning_rate": 9.730595045549096e-06, "loss": 0.80217266, "memory(GiB)": 126.99, "step": 11300, "train_speed(iter/s)": 0.203536 }, { "acc": 0.76687241, "epoch": 0.2638708642587381, "grad_norm": 4.375, "learning_rate": 9.72998297745547e-06, "loss": 0.84325809, "memory(GiB)": 126.99, "step": 11310, "train_speed(iter/s)": 0.203631 }, { "acc": 0.74472752, "epoch": 0.26410417183102697, "grad_norm": 5.0625, "learning_rate": 9.729370234158147e-06, "loss": 0.92086382, "memory(GiB)": 126.99, "step": 11320, "train_speed(iter/s)": 0.203723 }, { "acc": 0.7770112, "epoch": 0.26433747940331587, "grad_norm": 6.25, "learning_rate": 9.728756815744598e-06, "loss": 0.86181021, "memory(GiB)": 126.99, "step": 11330, "train_speed(iter/s)": 0.203818 }, { "acc": 0.75057278, "epoch": 0.26457078697560477, "grad_norm": 6.1875, "learning_rate": 9.728142722302385e-06, "loss": 0.91030331, "memory(GiB)": 126.99, "step": 11340, "train_speed(iter/s)": 0.20391 }, { "acc": 0.76302509, "epoch": 0.26480409454789366, "grad_norm": 11.0625, "learning_rate": 9.727527953919174e-06, "loss": 0.87596025, "memory(GiB)": 126.99, "step": 11350, "train_speed(iter/s)": 0.203997 }, { "acc": 0.75909495, "epoch": 0.26503740212018256, "grad_norm": 5.53125, "learning_rate": 9.72691251068272e-06, "loss": 0.86929102, "memory(GiB)": 126.99, "step": 11360, "train_speed(iter/s)": 0.204091 }, { "acc": 0.76749086, "epoch": 0.26527070969247146, "grad_norm": 5.53125, "learning_rate": 9.726296392680879e-06, "loss": 0.83195705, "memory(GiB)": 126.99, "step": 11370, "train_speed(iter/s)": 0.204179 }, { "acc": 0.76265335, "epoch": 0.26550401726476036, "grad_norm": 20.5, "learning_rate": 9.7256796000016e-06, "loss": 0.84113808, "memory(GiB)": 126.99, "step": 11380, "train_speed(iter/s)": 0.204266 }, { "acc": 0.7723104, "epoch": 0.26573732483704926, "grad_norm": 6.53125, "learning_rate": 9.725062132732931e-06, "loss": 0.82607574, "memory(GiB)": 126.99, "step": 11390, "train_speed(iter/s)": 0.204356 }, { "acc": 0.76926665, "epoch": 0.26597063240933816, "grad_norm": 9.25, "learning_rate": 9.724443990963017e-06, "loss": 0.84509716, "memory(GiB)": 126.99, "step": 11400, "train_speed(iter/s)": 0.204446 }, { "acc": 0.77471218, "epoch": 0.266203939981627, "grad_norm": 5.96875, "learning_rate": 9.723825174780095e-06, "loss": 0.79775171, "memory(GiB)": 126.99, "step": 11410, "train_speed(iter/s)": 0.204537 }, { "acc": 0.76716242, "epoch": 0.2664372475539159, "grad_norm": 6.125, "learning_rate": 9.723205684272501e-06, "loss": 0.82582216, "memory(GiB)": 126.99, "step": 11420, "train_speed(iter/s)": 0.20463 }, { "acc": 0.77979913, "epoch": 0.2666705551262048, "grad_norm": 4.59375, "learning_rate": 9.722585519528666e-06, "loss": 0.79641523, "memory(GiB)": 126.99, "step": 11430, "train_speed(iter/s)": 0.204722 }, { "acc": 0.77594395, "epoch": 0.2669038626984937, "grad_norm": 7.5625, "learning_rate": 9.721964680637124e-06, "loss": 0.81631298, "memory(GiB)": 126.99, "step": 11440, "train_speed(iter/s)": 0.204809 }, { "acc": 0.77567191, "epoch": 0.2671371702707826, "grad_norm": 6.46875, "learning_rate": 9.721343167686491e-06, "loss": 0.80270195, "memory(GiB)": 126.99, "step": 11450, "train_speed(iter/s)": 0.2049 }, { "acc": 0.76392403, "epoch": 0.2673704778430715, "grad_norm": 9.3125, "learning_rate": 9.720720980765495e-06, "loss": 0.85869751, "memory(GiB)": 126.99, "step": 11460, "train_speed(iter/s)": 0.204987 }, { "acc": 0.76475925, "epoch": 0.2676037854153604, "grad_norm": 4.75, "learning_rate": 9.72009811996295e-06, "loss": 0.85545101, "memory(GiB)": 126.99, "step": 11470, "train_speed(iter/s)": 0.205075 }, { "acc": 0.77414856, "epoch": 0.2678370929876493, "grad_norm": 6.125, "learning_rate": 9.719474585367771e-06, "loss": 0.80691338, "memory(GiB)": 126.99, "step": 11480, "train_speed(iter/s)": 0.205167 }, { "acc": 0.7491497, "epoch": 0.2680704005599382, "grad_norm": 8.875, "learning_rate": 9.718850377068964e-06, "loss": 0.95009527, "memory(GiB)": 126.99, "step": 11490, "train_speed(iter/s)": 0.205256 }, { "acc": 0.77273641, "epoch": 0.2683037081322271, "grad_norm": 6.34375, "learning_rate": 9.718225495155638e-06, "loss": 0.82936974, "memory(GiB)": 126.99, "step": 11500, "train_speed(iter/s)": 0.205341 }, { "epoch": 0.2683037081322271, "eval_acc": 0.7336386643266392, "eval_loss": 0.8452193737030029, "eval_runtime": 1270.1304, "eval_samples_per_second": 28.336, "eval_steps_per_second": 14.169, "step": 11500 }, { "acc": 0.7597836, "epoch": 0.26853701570451594, "grad_norm": 6.1875, "learning_rate": 9.717599939716992e-06, "loss": 0.84666138, "memory(GiB)": 126.99, "step": 11510, "train_speed(iter/s)": 0.200821 }, { "acc": 0.75912447, "epoch": 0.26877032327680483, "grad_norm": 5.90625, "learning_rate": 9.716973710842326e-06, "loss": 0.86441269, "memory(GiB)": 126.99, "step": 11520, "train_speed(iter/s)": 0.20091 }, { "acc": 0.77214475, "epoch": 0.26900363084909373, "grad_norm": 6.0, "learning_rate": 9.716346808621031e-06, "loss": 0.83422747, "memory(GiB)": 126.99, "step": 11530, "train_speed(iter/s)": 0.201 }, { "acc": 0.78334231, "epoch": 0.26923693842138263, "grad_norm": 5.6875, "learning_rate": 9.715719233142601e-06, "loss": 0.77969465, "memory(GiB)": 126.99, "step": 11540, "train_speed(iter/s)": 0.201089 }, { "acc": 0.77558217, "epoch": 0.26947024599367153, "grad_norm": 5.46875, "learning_rate": 9.71509098449662e-06, "loss": 0.82058563, "memory(GiB)": 126.99, "step": 11550, "train_speed(iter/s)": 0.201179 }, { "acc": 0.75732489, "epoch": 0.26970355356596043, "grad_norm": 4.5625, "learning_rate": 9.71446206277277e-06, "loss": 0.88411083, "memory(GiB)": 126.99, "step": 11560, "train_speed(iter/s)": 0.201266 }, { "acc": 0.75806885, "epoch": 0.26993686113824933, "grad_norm": 5.09375, "learning_rate": 9.713832468060831e-06, "loss": 0.86597509, "memory(GiB)": 126.99, "step": 11570, "train_speed(iter/s)": 0.20135 }, { "acc": 0.74850397, "epoch": 0.2701701687105382, "grad_norm": 5.5, "learning_rate": 9.713202200450678e-06, "loss": 0.9307641, "memory(GiB)": 126.99, "step": 11580, "train_speed(iter/s)": 0.201441 }, { "acc": 0.76493893, "epoch": 0.2704034762828271, "grad_norm": 5.75, "learning_rate": 9.712571260032277e-06, "loss": 0.8510767, "memory(GiB)": 126.99, "step": 11590, "train_speed(iter/s)": 0.201524 }, { "acc": 0.76068068, "epoch": 0.27063678385511597, "grad_norm": 5.9375, "learning_rate": 9.7119396468957e-06, "loss": 0.87124729, "memory(GiB)": 126.99, "step": 11600, "train_speed(iter/s)": 0.201618 }, { "acc": 0.76412511, "epoch": 0.27087009142740487, "grad_norm": 4.59375, "learning_rate": 9.711307361131107e-06, "loss": 0.84404926, "memory(GiB)": 126.99, "step": 11610, "train_speed(iter/s)": 0.2017 }, { "acc": 0.76919479, "epoch": 0.27110339899969377, "grad_norm": 5.375, "learning_rate": 9.710674402828755e-06, "loss": 0.83093939, "memory(GiB)": 126.99, "step": 11620, "train_speed(iter/s)": 0.201785 }, { "acc": 0.7725348, "epoch": 0.27133670657198267, "grad_norm": 4.28125, "learning_rate": 9.710040772079001e-06, "loss": 0.81993046, "memory(GiB)": 126.99, "step": 11630, "train_speed(iter/s)": 0.201875 }, { "acc": 0.75861168, "epoch": 0.27157001414427157, "grad_norm": 6.375, "learning_rate": 9.709406468972295e-06, "loss": 0.87555952, "memory(GiB)": 126.99, "step": 11640, "train_speed(iter/s)": 0.201964 }, { "acc": 0.76188645, "epoch": 0.27180332171656046, "grad_norm": 6.53125, "learning_rate": 9.708771493599185e-06, "loss": 0.83453884, "memory(GiB)": 126.99, "step": 11650, "train_speed(iter/s)": 0.202059 }, { "acc": 0.76258478, "epoch": 0.27203662928884936, "grad_norm": 4.78125, "learning_rate": 9.708135846050313e-06, "loss": 0.85640106, "memory(GiB)": 126.99, "step": 11660, "train_speed(iter/s)": 0.20215 }, { "acc": 0.78319798, "epoch": 0.27226993686113826, "grad_norm": 4.40625, "learning_rate": 9.707499526416415e-06, "loss": 0.78641605, "memory(GiB)": 126.99, "step": 11670, "train_speed(iter/s)": 0.202232 }, { "acc": 0.75823736, "epoch": 0.27250324443342716, "grad_norm": 5.9375, "learning_rate": 9.706862534788327e-06, "loss": 0.86921701, "memory(GiB)": 126.99, "step": 11680, "train_speed(iter/s)": 0.202322 }, { "acc": 0.77783613, "epoch": 0.27273655200571606, "grad_norm": 8.8125, "learning_rate": 9.70622487125698e-06, "loss": 0.78761058, "memory(GiB)": 126.99, "step": 11690, "train_speed(iter/s)": 0.202412 }, { "acc": 0.77462397, "epoch": 0.2729698595780049, "grad_norm": 6.625, "learning_rate": 9.7055865359134e-06, "loss": 0.81614437, "memory(GiB)": 126.99, "step": 11700, "train_speed(iter/s)": 0.202495 }, { "acc": 0.78386021, "epoch": 0.2732031671502938, "grad_norm": 7.15625, "learning_rate": 9.704947528848706e-06, "loss": 0.7758873, "memory(GiB)": 126.99, "step": 11710, "train_speed(iter/s)": 0.202588 }, { "acc": 0.77271953, "epoch": 0.2734364747225827, "grad_norm": 4.46875, "learning_rate": 9.704307850154125e-06, "loss": 0.80577879, "memory(GiB)": 126.99, "step": 11720, "train_speed(iter/s)": 0.202677 }, { "acc": 0.76300001, "epoch": 0.2736697822948716, "grad_norm": 5.0, "learning_rate": 9.70366749992096e-06, "loss": 0.85106087, "memory(GiB)": 126.99, "step": 11730, "train_speed(iter/s)": 0.202765 }, { "acc": 0.76563978, "epoch": 0.2739030898671605, "grad_norm": 4.0625, "learning_rate": 9.703026478240627e-06, "loss": 0.84070683, "memory(GiB)": 126.99, "step": 11740, "train_speed(iter/s)": 0.202854 }, { "acc": 0.79070711, "epoch": 0.2741363974394494, "grad_norm": 5.09375, "learning_rate": 9.702384785204631e-06, "loss": 0.75290918, "memory(GiB)": 126.99, "step": 11750, "train_speed(iter/s)": 0.202945 }, { "acc": 0.75680985, "epoch": 0.2743697050117383, "grad_norm": 5.125, "learning_rate": 9.701742420904574e-06, "loss": 0.89517021, "memory(GiB)": 126.99, "step": 11760, "train_speed(iter/s)": 0.203037 }, { "acc": 0.74839983, "epoch": 0.2746030125840272, "grad_norm": 6.59375, "learning_rate": 9.701099385432151e-06, "loss": 0.92097282, "memory(GiB)": 126.99, "step": 11770, "train_speed(iter/s)": 0.203121 }, { "acc": 0.75796165, "epoch": 0.2748363201563161, "grad_norm": 4.0, "learning_rate": 9.700455678879157e-06, "loss": 0.8720623, "memory(GiB)": 126.99, "step": 11780, "train_speed(iter/s)": 0.203206 }, { "acc": 0.76428533, "epoch": 0.275069627728605, "grad_norm": 11.4375, "learning_rate": 9.69981130133748e-06, "loss": 0.8485014, "memory(GiB)": 126.99, "step": 11790, "train_speed(iter/s)": 0.203289 }, { "acc": 0.75732021, "epoch": 0.27530293530089384, "grad_norm": 4.84375, "learning_rate": 9.699166252899104e-06, "loss": 0.88501377, "memory(GiB)": 126.99, "step": 11800, "train_speed(iter/s)": 0.203367 }, { "acc": 0.781742, "epoch": 0.27553624287318274, "grad_norm": 4.34375, "learning_rate": 9.698520533656112e-06, "loss": 0.78006496, "memory(GiB)": 126.99, "step": 11810, "train_speed(iter/s)": 0.203457 }, { "acc": 0.76830721, "epoch": 0.27576955044547163, "grad_norm": 7.125, "learning_rate": 9.697874143700679e-06, "loss": 0.81617126, "memory(GiB)": 126.99, "step": 11820, "train_speed(iter/s)": 0.203552 }, { "acc": 0.77733955, "epoch": 0.27600285801776053, "grad_norm": 7.65625, "learning_rate": 9.697227083125076e-06, "loss": 0.80572834, "memory(GiB)": 126.99, "step": 11830, "train_speed(iter/s)": 0.203642 }, { "acc": 0.77268896, "epoch": 0.27623616559004943, "grad_norm": 7.03125, "learning_rate": 9.69657935202167e-06, "loss": 0.80644588, "memory(GiB)": 126.99, "step": 11840, "train_speed(iter/s)": 0.203734 }, { "acc": 0.7876543, "epoch": 0.27646947316233833, "grad_norm": 6.1875, "learning_rate": 9.695930950482928e-06, "loss": 0.747785, "memory(GiB)": 126.99, "step": 11850, "train_speed(iter/s)": 0.203818 }, { "acc": 0.74815092, "epoch": 0.27670278073462723, "grad_norm": 5.53125, "learning_rate": 9.695281878601406e-06, "loss": 0.90503931, "memory(GiB)": 126.99, "step": 11860, "train_speed(iter/s)": 0.203908 }, { "acc": 0.7729579, "epoch": 0.27693608830691613, "grad_norm": 5.0625, "learning_rate": 9.69463213646976e-06, "loss": 0.82826509, "memory(GiB)": 126.99, "step": 11870, "train_speed(iter/s)": 0.203998 }, { "acc": 0.75472612, "epoch": 0.277169395879205, "grad_norm": 5.5, "learning_rate": 9.69398172418074e-06, "loss": 0.87955236, "memory(GiB)": 126.99, "step": 11880, "train_speed(iter/s)": 0.204085 }, { "acc": 0.78086786, "epoch": 0.27740270345149387, "grad_norm": 3.859375, "learning_rate": 9.693330641827194e-06, "loss": 0.75951362, "memory(GiB)": 126.99, "step": 11890, "train_speed(iter/s)": 0.204175 }, { "acc": 0.78073997, "epoch": 0.27763601102378277, "grad_norm": 4.46875, "learning_rate": 9.69267888950206e-06, "loss": 0.78068328, "memory(GiB)": 126.99, "step": 11900, "train_speed(iter/s)": 0.204262 }, { "acc": 0.77878237, "epoch": 0.27786931859607167, "grad_norm": 4.375, "learning_rate": 9.69202646729838e-06, "loss": 0.79412766, "memory(GiB)": 126.99, "step": 11910, "train_speed(iter/s)": 0.204346 }, { "acc": 0.78074179, "epoch": 0.27810262616836057, "grad_norm": 5.75, "learning_rate": 9.691373375309284e-06, "loss": 0.84565754, "memory(GiB)": 126.99, "step": 11920, "train_speed(iter/s)": 0.20443 }, { "acc": 0.75893049, "epoch": 0.27833593374064947, "grad_norm": 7.375, "learning_rate": 9.690719613628001e-06, "loss": 0.88701725, "memory(GiB)": 126.99, "step": 11930, "train_speed(iter/s)": 0.204519 }, { "acc": 0.77881913, "epoch": 0.27856924131293836, "grad_norm": 5.46875, "learning_rate": 9.690065182347857e-06, "loss": 0.8020647, "memory(GiB)": 126.99, "step": 11940, "train_speed(iter/s)": 0.204602 }, { "acc": 0.75828795, "epoch": 0.27880254888522726, "grad_norm": 6.03125, "learning_rate": 9.68941008156227e-06, "loss": 0.88880653, "memory(GiB)": 126.99, "step": 11950, "train_speed(iter/s)": 0.204691 }, { "acc": 0.7708621, "epoch": 0.27903585645751616, "grad_norm": 6.0, "learning_rate": 9.688754311364755e-06, "loss": 0.81502056, "memory(GiB)": 126.99, "step": 11960, "train_speed(iter/s)": 0.204778 }, { "acc": 0.74046493, "epoch": 0.27926916402980506, "grad_norm": 6.28125, "learning_rate": 9.688097871848925e-06, "loss": 0.93770523, "memory(GiB)": 126.99, "step": 11970, "train_speed(iter/s)": 0.204869 }, { "acc": 0.78146906, "epoch": 0.27950247160209396, "grad_norm": 6.375, "learning_rate": 9.687440763108487e-06, "loss": 0.79011135, "memory(GiB)": 126.99, "step": 11980, "train_speed(iter/s)": 0.204953 }, { "acc": 0.76589327, "epoch": 0.2797357791743828, "grad_norm": 5.875, "learning_rate": 9.68678298523724e-06, "loss": 0.86380882, "memory(GiB)": 126.99, "step": 11990, "train_speed(iter/s)": 0.205044 }, { "acc": 0.7602313, "epoch": 0.2799690867466717, "grad_norm": 4.84375, "learning_rate": 9.686124538329083e-06, "loss": 0.86302662, "memory(GiB)": 126.99, "step": 12000, "train_speed(iter/s)": 0.20513 }, { "epoch": 0.2799690867466717, "eval_acc": 0.7339767288010274, "eval_loss": 0.8442361354827881, "eval_runtime": 1270.1423, "eval_samples_per_second": 28.336, "eval_steps_per_second": 14.168, "step": 12000 }, { "acc": 0.7855062, "epoch": 0.2802023943189606, "grad_norm": 4.5625, "learning_rate": 9.685465422478011e-06, "loss": 0.78390985, "memory(GiB)": 126.99, "step": 12010, "train_speed(iter/s)": 0.200803 }, { "acc": 0.77192798, "epoch": 0.2804357018912495, "grad_norm": 4.28125, "learning_rate": 9.684805637778109e-06, "loss": 0.83765612, "memory(GiB)": 126.99, "step": 12020, "train_speed(iter/s)": 0.200888 }, { "acc": 0.77343655, "epoch": 0.2806690094635384, "grad_norm": 6.65625, "learning_rate": 9.684145184323565e-06, "loss": 0.81042671, "memory(GiB)": 126.99, "step": 12030, "train_speed(iter/s)": 0.200973 }, { "acc": 0.76841726, "epoch": 0.2809023170358273, "grad_norm": 4.3125, "learning_rate": 9.683484062208657e-06, "loss": 0.84212866, "memory(GiB)": 126.99, "step": 12040, "train_speed(iter/s)": 0.201061 }, { "acc": 0.76003461, "epoch": 0.2811356246081162, "grad_norm": 7.59375, "learning_rate": 9.682822271527758e-06, "loss": 0.86936035, "memory(GiB)": 126.99, "step": 12050, "train_speed(iter/s)": 0.20115 }, { "acc": 0.76045341, "epoch": 0.2813689321804051, "grad_norm": 7.15625, "learning_rate": 9.682159812375342e-06, "loss": 0.87109947, "memory(GiB)": 126.99, "step": 12060, "train_speed(iter/s)": 0.201238 }, { "acc": 0.78477945, "epoch": 0.281602239752694, "grad_norm": 5.25, "learning_rate": 9.681496684845973e-06, "loss": 0.75963497, "memory(GiB)": 126.99, "step": 12070, "train_speed(iter/s)": 0.201321 }, { "acc": 0.76544571, "epoch": 0.2818355473249829, "grad_norm": 5.09375, "learning_rate": 9.68083288903431e-06, "loss": 0.84756174, "memory(GiB)": 126.99, "step": 12080, "train_speed(iter/s)": 0.201406 }, { "acc": 0.77797155, "epoch": 0.28206885489727174, "grad_norm": 6.65625, "learning_rate": 9.680168425035114e-06, "loss": 0.8038868, "memory(GiB)": 126.99, "step": 12090, "train_speed(iter/s)": 0.201491 }, { "acc": 0.75691757, "epoch": 0.28230216246956064, "grad_norm": 5.75, "learning_rate": 9.679503292943234e-06, "loss": 0.87715702, "memory(GiB)": 126.99, "step": 12100, "train_speed(iter/s)": 0.201577 }, { "acc": 0.76970291, "epoch": 0.28253547004184953, "grad_norm": 5.03125, "learning_rate": 9.678837492853619e-06, "loss": 0.83921423, "memory(GiB)": 126.99, "step": 12110, "train_speed(iter/s)": 0.201649 }, { "acc": 0.77343578, "epoch": 0.28276877761413843, "grad_norm": 6.71875, "learning_rate": 9.67817102486131e-06, "loss": 0.79686985, "memory(GiB)": 126.99, "step": 12120, "train_speed(iter/s)": 0.201735 }, { "acc": 0.76627212, "epoch": 0.28300208518642733, "grad_norm": 5.96875, "learning_rate": 9.677503889061446e-06, "loss": 0.8474947, "memory(GiB)": 126.99, "step": 12130, "train_speed(iter/s)": 0.201818 }, { "acc": 0.76688299, "epoch": 0.28323539275871623, "grad_norm": 5.875, "learning_rate": 9.676836085549263e-06, "loss": 0.8329731, "memory(GiB)": 126.99, "step": 12140, "train_speed(iter/s)": 0.201905 }, { "acc": 0.74896669, "epoch": 0.28346870033100513, "grad_norm": 5.25, "learning_rate": 9.676167614420085e-06, "loss": 0.91642704, "memory(GiB)": 126.99, "step": 12150, "train_speed(iter/s)": 0.201985 }, { "acc": 0.76068959, "epoch": 0.28370200790329403, "grad_norm": 7.15625, "learning_rate": 9.67549847576934e-06, "loss": 0.8687542, "memory(GiB)": 126.99, "step": 12160, "train_speed(iter/s)": 0.202069 }, { "acc": 0.76523075, "epoch": 0.28393531547558293, "grad_norm": 20.0, "learning_rate": 9.674828669692545e-06, "loss": 0.8203887, "memory(GiB)": 126.99, "step": 12170, "train_speed(iter/s)": 0.202155 }, { "acc": 0.77215323, "epoch": 0.28416862304787177, "grad_norm": 6.5625, "learning_rate": 9.674158196285316e-06, "loss": 0.80515928, "memory(GiB)": 126.99, "step": 12180, "train_speed(iter/s)": 0.202235 }, { "acc": 0.77063136, "epoch": 0.28440193062016067, "grad_norm": 7.75, "learning_rate": 9.673487055643362e-06, "loss": 0.82243662, "memory(GiB)": 126.99, "step": 12190, "train_speed(iter/s)": 0.202321 }, { "acc": 0.76656513, "epoch": 0.28463523819244957, "grad_norm": 5.25, "learning_rate": 9.672815247862489e-06, "loss": 0.82771702, "memory(GiB)": 126.99, "step": 12200, "train_speed(iter/s)": 0.202403 }, { "acc": 0.78110971, "epoch": 0.28486854576473847, "grad_norm": 6.15625, "learning_rate": 9.672142773038595e-06, "loss": 0.80094032, "memory(GiB)": 126.99, "step": 12210, "train_speed(iter/s)": 0.20249 }, { "acc": 0.75440207, "epoch": 0.28510185333702737, "grad_norm": 5.0, "learning_rate": 9.671469631267678e-06, "loss": 0.89850616, "memory(GiB)": 126.99, "step": 12220, "train_speed(iter/s)": 0.20258 }, { "acc": 0.753054, "epoch": 0.28533516090931627, "grad_norm": 7.15625, "learning_rate": 9.67079582264583e-06, "loss": 0.90991497, "memory(GiB)": 126.99, "step": 12230, "train_speed(iter/s)": 0.202672 }, { "acc": 0.77435451, "epoch": 0.28556846848160516, "grad_norm": 5.90625, "learning_rate": 9.670121347269234e-06, "loss": 0.82249737, "memory(GiB)": 126.99, "step": 12240, "train_speed(iter/s)": 0.202753 }, { "acc": 0.75970974, "epoch": 0.28580177605389406, "grad_norm": 5.40625, "learning_rate": 9.669446205234172e-06, "loss": 0.85512848, "memory(GiB)": 126.99, "step": 12250, "train_speed(iter/s)": 0.202839 }, { "acc": 0.78008022, "epoch": 0.28603508362618296, "grad_norm": 5.15625, "learning_rate": 9.668770396637022e-06, "loss": 0.79443493, "memory(GiB)": 126.99, "step": 12260, "train_speed(iter/s)": 0.20292 }, { "acc": 0.76683731, "epoch": 0.28626839119847186, "grad_norm": 5.15625, "learning_rate": 9.668093921574253e-06, "loss": 0.85410089, "memory(GiB)": 126.99, "step": 12270, "train_speed(iter/s)": 0.203005 }, { "acc": 0.75589981, "epoch": 0.2865016987707607, "grad_norm": 6.71875, "learning_rate": 9.667416780142434e-06, "loss": 0.88183594, "memory(GiB)": 126.99, "step": 12280, "train_speed(iter/s)": 0.20309 }, { "acc": 0.7808938, "epoch": 0.2867350063430496, "grad_norm": 5.34375, "learning_rate": 9.666738972438224e-06, "loss": 0.78594069, "memory(GiB)": 126.99, "step": 12290, "train_speed(iter/s)": 0.203169 }, { "acc": 0.77634277, "epoch": 0.2869683139153385, "grad_norm": 5.15625, "learning_rate": 9.666060498558381e-06, "loss": 0.80375671, "memory(GiB)": 126.99, "step": 12300, "train_speed(iter/s)": 0.203253 }, { "acc": 0.76860728, "epoch": 0.2872016214876274, "grad_norm": 4.9375, "learning_rate": 9.665381358599759e-06, "loss": 0.86330442, "memory(GiB)": 126.99, "step": 12310, "train_speed(iter/s)": 0.203335 }, { "acc": 0.7679863, "epoch": 0.2874349290599163, "grad_norm": 5.375, "learning_rate": 9.664701552659303e-06, "loss": 0.838801, "memory(GiB)": 126.99, "step": 12320, "train_speed(iter/s)": 0.20342 }, { "acc": 0.74797144, "epoch": 0.2876682366322052, "grad_norm": 8.5, "learning_rate": 9.664021080834053e-06, "loss": 0.93438997, "memory(GiB)": 126.99, "step": 12330, "train_speed(iter/s)": 0.203504 }, { "acc": 0.75219126, "epoch": 0.2879015442044941, "grad_norm": 5.59375, "learning_rate": 9.663339943221153e-06, "loss": 0.906147, "memory(GiB)": 126.99, "step": 12340, "train_speed(iter/s)": 0.203584 }, { "acc": 0.76432757, "epoch": 0.288134851776783, "grad_norm": 5.28125, "learning_rate": 9.662658139917827e-06, "loss": 0.84146051, "memory(GiB)": 126.99, "step": 12350, "train_speed(iter/s)": 0.203666 }, { "acc": 0.76514382, "epoch": 0.2883681593490719, "grad_norm": 5.0625, "learning_rate": 9.661975671021408e-06, "loss": 0.85332146, "memory(GiB)": 126.99, "step": 12360, "train_speed(iter/s)": 0.203749 }, { "acc": 0.75728321, "epoch": 0.2886014669213608, "grad_norm": 4.84375, "learning_rate": 9.661292536629316e-06, "loss": 0.85679588, "memory(GiB)": 126.99, "step": 12370, "train_speed(iter/s)": 0.203833 }, { "acc": 0.77875571, "epoch": 0.28883477449364964, "grad_norm": 7.0, "learning_rate": 9.660608736839067e-06, "loss": 0.80268106, "memory(GiB)": 126.99, "step": 12380, "train_speed(iter/s)": 0.203916 }, { "acc": 0.75960855, "epoch": 0.28906808206593854, "grad_norm": 5.125, "learning_rate": 9.659924271748277e-06, "loss": 0.86685429, "memory(GiB)": 126.99, "step": 12390, "train_speed(iter/s)": 0.204004 }, { "acc": 0.76572495, "epoch": 0.28930138963822744, "grad_norm": 4.09375, "learning_rate": 9.65923914145465e-06, "loss": 0.8074934, "memory(GiB)": 126.99, "step": 12400, "train_speed(iter/s)": 0.204085 }, { "acc": 0.76597309, "epoch": 0.28953469721051633, "grad_norm": 5.59375, "learning_rate": 9.65855334605599e-06, "loss": 0.82842016, "memory(GiB)": 126.99, "step": 12410, "train_speed(iter/s)": 0.204169 }, { "acc": 0.75547256, "epoch": 0.28976800478280523, "grad_norm": 6.5, "learning_rate": 9.65786688565019e-06, "loss": 0.88945179, "memory(GiB)": 126.99, "step": 12420, "train_speed(iter/s)": 0.204248 }, { "acc": 0.77330575, "epoch": 0.29000131235509413, "grad_norm": 5.34375, "learning_rate": 9.65717976033525e-06, "loss": 0.80908098, "memory(GiB)": 126.99, "step": 12430, "train_speed(iter/s)": 0.204329 }, { "acc": 0.76689053, "epoch": 0.29023461992738303, "grad_norm": 4.6875, "learning_rate": 9.656491970209248e-06, "loss": 0.84457645, "memory(GiB)": 126.99, "step": 12440, "train_speed(iter/s)": 0.204414 }, { "acc": 0.76981139, "epoch": 0.29046792749967193, "grad_norm": 6.71875, "learning_rate": 9.655803515370373e-06, "loss": 0.8389142, "memory(GiB)": 126.99, "step": 12450, "train_speed(iter/s)": 0.204496 }, { "acc": 0.77279081, "epoch": 0.29070123507196083, "grad_norm": 4.46875, "learning_rate": 9.655114395916896e-06, "loss": 0.8170186, "memory(GiB)": 126.99, "step": 12460, "train_speed(iter/s)": 0.20458 }, { "acc": 0.7852149, "epoch": 0.29093454264424967, "grad_norm": 6.90625, "learning_rate": 9.654424611947194e-06, "loss": 0.80533628, "memory(GiB)": 126.99, "step": 12470, "train_speed(iter/s)": 0.204664 }, { "acc": 0.76608725, "epoch": 0.29116785021653857, "grad_norm": 5.75, "learning_rate": 9.65373416355973e-06, "loss": 0.84452038, "memory(GiB)": 126.99, "step": 12480, "train_speed(iter/s)": 0.204746 }, { "acc": 0.76543536, "epoch": 0.29140115778882747, "grad_norm": 5.59375, "learning_rate": 9.653043050853065e-06, "loss": 0.85324554, "memory(GiB)": 126.99, "step": 12490, "train_speed(iter/s)": 0.204826 }, { "acc": 0.76770477, "epoch": 0.29163446536111637, "grad_norm": 6.5625, "learning_rate": 9.652351273925854e-06, "loss": 0.84301796, "memory(GiB)": 126.99, "step": 12500, "train_speed(iter/s)": 0.204912 }, { "epoch": 0.29163446536111637, "eval_acc": 0.7345405966429606, "eval_loss": 0.8422214984893799, "eval_runtime": 1269.9542, "eval_samples_per_second": 28.34, "eval_steps_per_second": 14.171, "step": 12500 }, { "acc": 0.7895587, "epoch": 0.29186777293340527, "grad_norm": 5.8125, "learning_rate": 9.651658832876853e-06, "loss": 0.75596642, "memory(GiB)": 126.99, "step": 12510, "train_speed(iter/s)": 0.200765 }, { "acc": 0.75599642, "epoch": 0.29210108050569417, "grad_norm": 5.40625, "learning_rate": 9.650965727804907e-06, "loss": 0.88882275, "memory(GiB)": 126.99, "step": 12520, "train_speed(iter/s)": 0.200848 }, { "acc": 0.77908826, "epoch": 0.29233438807798307, "grad_norm": 6.25, "learning_rate": 9.65027195880895e-06, "loss": 0.79518585, "memory(GiB)": 126.99, "step": 12530, "train_speed(iter/s)": 0.200931 }, { "acc": 0.7476614, "epoch": 0.29256769565027196, "grad_norm": 10.25, "learning_rate": 9.649577525988025e-06, "loss": 0.93213406, "memory(GiB)": 126.99, "step": 12540, "train_speed(iter/s)": 0.201017 }, { "acc": 0.7507143, "epoch": 0.29280100322256086, "grad_norm": 5.78125, "learning_rate": 9.648882429441258e-06, "loss": 0.92582474, "memory(GiB)": 126.99, "step": 12550, "train_speed(iter/s)": 0.201102 }, { "acc": 0.76029882, "epoch": 0.29303431079484976, "grad_norm": 10.0625, "learning_rate": 9.648186669267874e-06, "loss": 0.86718149, "memory(GiB)": 126.99, "step": 12560, "train_speed(iter/s)": 0.201185 }, { "acc": 0.75470057, "epoch": 0.2932676183671386, "grad_norm": 6.6875, "learning_rate": 9.647490245567194e-06, "loss": 0.90233717, "memory(GiB)": 126.99, "step": 12570, "train_speed(iter/s)": 0.201266 }, { "acc": 0.7683279, "epoch": 0.2935009259394275, "grad_norm": 7.0625, "learning_rate": 9.646793158438632e-06, "loss": 0.82610855, "memory(GiB)": 126.99, "step": 12580, "train_speed(iter/s)": 0.201344 }, { "acc": 0.77332907, "epoch": 0.2937342335117164, "grad_norm": 9.125, "learning_rate": 9.646095407981695e-06, "loss": 0.80713387, "memory(GiB)": 126.99, "step": 12590, "train_speed(iter/s)": 0.201425 }, { "acc": 0.74880519, "epoch": 0.2939675410840053, "grad_norm": 8.75, "learning_rate": 9.64539699429599e-06, "loss": 0.91812592, "memory(GiB)": 126.99, "step": 12600, "train_speed(iter/s)": 0.201507 }, { "acc": 0.76272821, "epoch": 0.2942008486562942, "grad_norm": 7.25, "learning_rate": 9.644697917481212e-06, "loss": 0.84828262, "memory(GiB)": 126.99, "step": 12610, "train_speed(iter/s)": 0.201593 }, { "acc": 0.76495266, "epoch": 0.2944341562285831, "grad_norm": 5.46875, "learning_rate": 9.643998177637157e-06, "loss": 0.8520998, "memory(GiB)": 126.99, "step": 12620, "train_speed(iter/s)": 0.201679 }, { "acc": 0.75189352, "epoch": 0.294667463800872, "grad_norm": 5.40625, "learning_rate": 9.643297774863709e-06, "loss": 0.88819065, "memory(GiB)": 126.99, "step": 12630, "train_speed(iter/s)": 0.201765 }, { "acc": 0.74685659, "epoch": 0.2949007713731609, "grad_norm": 6.0625, "learning_rate": 9.642596709260854e-06, "loss": 0.93766823, "memory(GiB)": 126.99, "step": 12640, "train_speed(iter/s)": 0.201846 }, { "acc": 0.77418585, "epoch": 0.2951340789454498, "grad_norm": 7.71875, "learning_rate": 9.641894980928668e-06, "loss": 0.82185469, "memory(GiB)": 126.99, "step": 12650, "train_speed(iter/s)": 0.201928 }, { "acc": 0.77108278, "epoch": 0.29536738651773864, "grad_norm": 7.53125, "learning_rate": 9.641192589967321e-06, "loss": 0.84046984, "memory(GiB)": 126.99, "step": 12660, "train_speed(iter/s)": 0.202013 }, { "acc": 0.7660718, "epoch": 0.29560069409002754, "grad_norm": 6.28125, "learning_rate": 9.64048953647708e-06, "loss": 0.83025084, "memory(GiB)": 126.99, "step": 12670, "train_speed(iter/s)": 0.202095 }, { "acc": 0.76199236, "epoch": 0.29583400166231644, "grad_norm": 3.8125, "learning_rate": 9.639785820558307e-06, "loss": 0.88814316, "memory(GiB)": 126.99, "step": 12680, "train_speed(iter/s)": 0.202176 }, { "acc": 0.75356421, "epoch": 0.29606730923460534, "grad_norm": 4.96875, "learning_rate": 9.639081442311456e-06, "loss": 0.88174667, "memory(GiB)": 126.99, "step": 12690, "train_speed(iter/s)": 0.202261 }, { "acc": 0.76016741, "epoch": 0.29630061680689423, "grad_norm": 6.59375, "learning_rate": 9.638376401837075e-06, "loss": 0.87111378, "memory(GiB)": 126.99, "step": 12700, "train_speed(iter/s)": 0.202342 }, { "acc": 0.78399467, "epoch": 0.29653392437918313, "grad_norm": 4.625, "learning_rate": 9.63767069923581e-06, "loss": 0.77465773, "memory(GiB)": 126.99, "step": 12710, "train_speed(iter/s)": 0.202423 }, { "acc": 0.76579094, "epoch": 0.29676723195147203, "grad_norm": 5.28125, "learning_rate": 9.636964334608402e-06, "loss": 0.85976906, "memory(GiB)": 126.99, "step": 12720, "train_speed(iter/s)": 0.2025 }, { "acc": 0.76346893, "epoch": 0.29700053952376093, "grad_norm": 5.5625, "learning_rate": 9.636257308055682e-06, "loss": 0.85128269, "memory(GiB)": 126.99, "step": 12730, "train_speed(iter/s)": 0.202583 }, { "acc": 0.76815395, "epoch": 0.29723384709604983, "grad_norm": 4.65625, "learning_rate": 9.635549619678578e-06, "loss": 0.84149694, "memory(GiB)": 126.99, "step": 12740, "train_speed(iter/s)": 0.202667 }, { "acc": 0.76429968, "epoch": 0.29746715466833873, "grad_norm": 5.15625, "learning_rate": 9.63484126957811e-06, "loss": 0.8531889, "memory(GiB)": 126.99, "step": 12750, "train_speed(iter/s)": 0.20275 }, { "acc": 0.75266466, "epoch": 0.2977004622406276, "grad_norm": 7.625, "learning_rate": 9.6341322578554e-06, "loss": 0.87924614, "memory(GiB)": 126.99, "step": 12760, "train_speed(iter/s)": 0.202829 }, { "acc": 0.77012124, "epoch": 0.29793376981291647, "grad_norm": 4.4375, "learning_rate": 9.633422584611654e-06, "loss": 0.81980047, "memory(GiB)": 126.99, "step": 12770, "train_speed(iter/s)": 0.202914 }, { "acc": 0.78082376, "epoch": 0.29816707738520537, "grad_norm": 4.90625, "learning_rate": 9.632712249948182e-06, "loss": 0.80720272, "memory(GiB)": 126.99, "step": 12780, "train_speed(iter/s)": 0.202998 }, { "acc": 0.7774765, "epoch": 0.29840038495749427, "grad_norm": 4.71875, "learning_rate": 9.632001253966381e-06, "loss": 0.79807181, "memory(GiB)": 126.99, "step": 12790, "train_speed(iter/s)": 0.203084 }, { "acc": 0.77757535, "epoch": 0.29863369252978317, "grad_norm": 5.625, "learning_rate": 9.631289596767748e-06, "loss": 0.78618836, "memory(GiB)": 126.99, "step": 12800, "train_speed(iter/s)": 0.203164 }, { "acc": 0.77521505, "epoch": 0.29886700010207207, "grad_norm": 8.1875, "learning_rate": 9.63057727845387e-06, "loss": 0.80945616, "memory(GiB)": 126.99, "step": 12810, "train_speed(iter/s)": 0.203252 }, { "acc": 0.77795644, "epoch": 0.29910030767436097, "grad_norm": 8.875, "learning_rate": 9.62986429912643e-06, "loss": 0.78125734, "memory(GiB)": 126.99, "step": 12820, "train_speed(iter/s)": 0.203339 }, { "acc": 0.77475395, "epoch": 0.29933361524664986, "grad_norm": 5.8125, "learning_rate": 9.629150658887206e-06, "loss": 0.81328382, "memory(GiB)": 126.99, "step": 12830, "train_speed(iter/s)": 0.203414 }, { "acc": 0.77334385, "epoch": 0.29956692281893876, "grad_norm": 5.1875, "learning_rate": 9.628436357838072e-06, "loss": 0.81213036, "memory(GiB)": 126.99, "step": 12840, "train_speed(iter/s)": 0.203488 }, { "acc": 0.79729042, "epoch": 0.29980023039122766, "grad_norm": 4.84375, "learning_rate": 9.627721396080992e-06, "loss": 0.73419204, "memory(GiB)": 126.99, "step": 12850, "train_speed(iter/s)": 0.203572 }, { "acc": 0.76485343, "epoch": 0.3000335379635165, "grad_norm": 6.78125, "learning_rate": 9.627005773718026e-06, "loss": 0.85712852, "memory(GiB)": 126.99, "step": 12860, "train_speed(iter/s)": 0.203649 }, { "acc": 0.77371435, "epoch": 0.3002668455358054, "grad_norm": 6.25, "learning_rate": 9.626289490851329e-06, "loss": 0.81916208, "memory(GiB)": 126.99, "step": 12870, "train_speed(iter/s)": 0.203723 }, { "acc": 0.78421712, "epoch": 0.3005001531080943, "grad_norm": 3.6875, "learning_rate": 9.625572547583153e-06, "loss": 0.77294006, "memory(GiB)": 126.99, "step": 12880, "train_speed(iter/s)": 0.203807 }, { "acc": 0.77695723, "epoch": 0.3007334606803832, "grad_norm": 5.75, "learning_rate": 9.624854944015839e-06, "loss": 0.80077038, "memory(GiB)": 126.99, "step": 12890, "train_speed(iter/s)": 0.203892 }, { "acc": 0.77849216, "epoch": 0.3009667682526721, "grad_norm": 4.53125, "learning_rate": 9.624136680251826e-06, "loss": 0.78988638, "memory(GiB)": 126.99, "step": 12900, "train_speed(iter/s)": 0.203969 }, { "acc": 0.78356485, "epoch": 0.301200075824961, "grad_norm": 6.28125, "learning_rate": 9.623417756393644e-06, "loss": 0.78568068, "memory(GiB)": 126.99, "step": 12910, "train_speed(iter/s)": 0.204048 }, { "acc": 0.75422754, "epoch": 0.3014333833972499, "grad_norm": 43.25, "learning_rate": 9.622698172543921e-06, "loss": 0.88511696, "memory(GiB)": 126.99, "step": 12920, "train_speed(iter/s)": 0.20413 }, { "acc": 0.76167445, "epoch": 0.3016666909695388, "grad_norm": 5.375, "learning_rate": 9.621977928805377e-06, "loss": 0.85034475, "memory(GiB)": 126.99, "step": 12930, "train_speed(iter/s)": 0.204214 }, { "acc": 0.77598619, "epoch": 0.3018999985418277, "grad_norm": 6.9375, "learning_rate": 9.621257025280826e-06, "loss": 0.79932165, "memory(GiB)": 126.99, "step": 12940, "train_speed(iter/s)": 0.204302 }, { "acc": 0.76910601, "epoch": 0.30213330611411654, "grad_norm": 15.6875, "learning_rate": 9.620535462073177e-06, "loss": 0.85517778, "memory(GiB)": 126.99, "step": 12950, "train_speed(iter/s)": 0.204382 }, { "acc": 0.74408941, "epoch": 0.30236661368640544, "grad_norm": 6.09375, "learning_rate": 9.619813239285433e-06, "loss": 0.95150757, "memory(GiB)": 126.99, "step": 12960, "train_speed(iter/s)": 0.204462 }, { "acc": 0.77192564, "epoch": 0.30259992125869434, "grad_norm": 5.65625, "learning_rate": 9.619090357020691e-06, "loss": 0.81647863, "memory(GiB)": 126.99, "step": 12970, "train_speed(iter/s)": 0.204548 }, { "acc": 0.78476696, "epoch": 0.30283322883098324, "grad_norm": 6.21875, "learning_rate": 9.618366815382143e-06, "loss": 0.78855925, "memory(GiB)": 126.99, "step": 12980, "train_speed(iter/s)": 0.204626 }, { "acc": 0.77091103, "epoch": 0.30306653640327214, "grad_norm": 5.6875, "learning_rate": 9.617642614473073e-06, "loss": 0.88907585, "memory(GiB)": 126.99, "step": 12990, "train_speed(iter/s)": 0.204708 }, { "acc": 0.767976, "epoch": 0.30329984397556103, "grad_norm": 9.375, "learning_rate": 9.616917754396861e-06, "loss": 0.82190113, "memory(GiB)": 126.99, "step": 13000, "train_speed(iter/s)": 0.204791 }, { "epoch": 0.30329984397556103, "eval_acc": 0.7348525576074836, "eval_loss": 0.8418717384338379, "eval_runtime": 1269.0169, "eval_samples_per_second": 28.361, "eval_steps_per_second": 14.181, "step": 13000 }, { "acc": 0.77310801, "epoch": 0.30353315154784993, "grad_norm": 5.375, "learning_rate": 9.616192235256983e-06, "loss": 0.80410042, "memory(GiB)": 126.99, "step": 13010, "train_speed(iter/s)": 0.200802 }, { "acc": 0.75549717, "epoch": 0.30376645912013883, "grad_norm": 6.28125, "learning_rate": 9.615466057157002e-06, "loss": 0.87396736, "memory(GiB)": 126.99, "step": 13020, "train_speed(iter/s)": 0.200883 }, { "acc": 0.78147354, "epoch": 0.30399976669242773, "grad_norm": 6.96875, "learning_rate": 9.614739220200583e-06, "loss": 0.79309797, "memory(GiB)": 126.99, "step": 13030, "train_speed(iter/s)": 0.20096 }, { "acc": 0.76732526, "epoch": 0.30423307426471663, "grad_norm": 6.28125, "learning_rate": 9.614011724491481e-06, "loss": 0.8398242, "memory(GiB)": 126.99, "step": 13040, "train_speed(iter/s)": 0.201041 }, { "acc": 0.75758252, "epoch": 0.3044663818370055, "grad_norm": 8.5, "learning_rate": 9.613283570133547e-06, "loss": 0.87248812, "memory(GiB)": 126.99, "step": 13050, "train_speed(iter/s)": 0.201125 }, { "acc": 0.76003232, "epoch": 0.30469968940929437, "grad_norm": 6.84375, "learning_rate": 9.612554757230722e-06, "loss": 0.87683392, "memory(GiB)": 126.99, "step": 13060, "train_speed(iter/s)": 0.201212 }, { "acc": 0.77400045, "epoch": 0.30493299698158327, "grad_norm": 5.78125, "learning_rate": 9.611825285887045e-06, "loss": 0.81213093, "memory(GiB)": 126.99, "step": 13070, "train_speed(iter/s)": 0.201287 }, { "acc": 0.76753764, "epoch": 0.30516630455387217, "grad_norm": 6.21875, "learning_rate": 9.61109515620665e-06, "loss": 0.84945755, "memory(GiB)": 126.99, "step": 13080, "train_speed(iter/s)": 0.201367 }, { "acc": 0.79222326, "epoch": 0.30539961212616107, "grad_norm": 5.5, "learning_rate": 9.61036436829376e-06, "loss": 0.73132648, "memory(GiB)": 126.99, "step": 13090, "train_speed(iter/s)": 0.201446 }, { "acc": 0.75894957, "epoch": 0.30563291969844997, "grad_norm": 5.5625, "learning_rate": 9.609632922252695e-06, "loss": 0.89140854, "memory(GiB)": 126.99, "step": 13100, "train_speed(iter/s)": 0.201525 }, { "acc": 0.77855654, "epoch": 0.30586622727073887, "grad_norm": 5.0625, "learning_rate": 9.60890081818787e-06, "loss": 0.81173906, "memory(GiB)": 126.99, "step": 13110, "train_speed(iter/s)": 0.201604 }, { "acc": 0.77230225, "epoch": 0.30609953484302777, "grad_norm": 5.40625, "learning_rate": 9.608168056203792e-06, "loss": 0.80290689, "memory(GiB)": 126.99, "step": 13120, "train_speed(iter/s)": 0.201684 }, { "acc": 0.76812458, "epoch": 0.30633284241531666, "grad_norm": 7.71875, "learning_rate": 9.607434636405063e-06, "loss": 0.86059694, "memory(GiB)": 126.99, "step": 13130, "train_speed(iter/s)": 0.201761 }, { "acc": 0.76915483, "epoch": 0.30656614998760556, "grad_norm": 4.90625, "learning_rate": 9.606700558896376e-06, "loss": 0.8283617, "memory(GiB)": 126.99, "step": 13140, "train_speed(iter/s)": 0.201843 }, { "acc": 0.75852375, "epoch": 0.3067994575598944, "grad_norm": 5.59375, "learning_rate": 9.605965823782525e-06, "loss": 0.88993883, "memory(GiB)": 126.99, "step": 13150, "train_speed(iter/s)": 0.201916 }, { "acc": 0.77059002, "epoch": 0.3070327651321833, "grad_norm": 6.875, "learning_rate": 9.605230431168391e-06, "loss": 0.83691521, "memory(GiB)": 126.99, "step": 13160, "train_speed(iter/s)": 0.201993 }, { "acc": 0.77577543, "epoch": 0.3072660727044722, "grad_norm": 4.3125, "learning_rate": 9.604494381158949e-06, "loss": 0.81176348, "memory(GiB)": 126.99, "step": 13170, "train_speed(iter/s)": 0.202066 }, { "acc": 0.78056211, "epoch": 0.3074993802767611, "grad_norm": 5.53125, "learning_rate": 9.603757673859274e-06, "loss": 0.80487776, "memory(GiB)": 126.99, "step": 13180, "train_speed(iter/s)": 0.202144 }, { "acc": 0.76723247, "epoch": 0.30773268784905, "grad_norm": 11.375, "learning_rate": 9.603020309374526e-06, "loss": 0.87342901, "memory(GiB)": 126.99, "step": 13190, "train_speed(iter/s)": 0.202221 }, { "acc": 0.77724471, "epoch": 0.3079659954213389, "grad_norm": 8.75, "learning_rate": 9.602282287809966e-06, "loss": 0.8030695, "memory(GiB)": 126.99, "step": 13200, "train_speed(iter/s)": 0.202298 }, { "acc": 0.77045097, "epoch": 0.3081993029936278, "grad_norm": 4.125, "learning_rate": 9.601543609270947e-06, "loss": 0.83347521, "memory(GiB)": 126.99, "step": 13210, "train_speed(iter/s)": 0.202369 }, { "acc": 0.76114216, "epoch": 0.3084326105659167, "grad_norm": 7.5625, "learning_rate": 9.600804273862917e-06, "loss": 0.86944656, "memory(GiB)": 126.99, "step": 13220, "train_speed(iter/s)": 0.202449 }, { "acc": 0.76104956, "epoch": 0.3086659181382056, "grad_norm": 10.1875, "learning_rate": 9.60006428169141e-06, "loss": 0.87441788, "memory(GiB)": 126.99, "step": 13230, "train_speed(iter/s)": 0.202524 }, { "acc": 0.78825359, "epoch": 0.30889922571049444, "grad_norm": 4.5625, "learning_rate": 9.599323632862063e-06, "loss": 0.7530653, "memory(GiB)": 126.99, "step": 13240, "train_speed(iter/s)": 0.202602 }, { "acc": 0.77793827, "epoch": 0.30913253328278334, "grad_norm": 8.1875, "learning_rate": 9.598582327480605e-06, "loss": 0.78715887, "memory(GiB)": 126.99, "step": 13250, "train_speed(iter/s)": 0.202675 }, { "acc": 0.76915102, "epoch": 0.30936584085507224, "grad_norm": 5.6875, "learning_rate": 9.597840365652857e-06, "loss": 0.82546225, "memory(GiB)": 126.99, "step": 13260, "train_speed(iter/s)": 0.202753 }, { "acc": 0.7758049, "epoch": 0.30959914842736114, "grad_norm": 5.5625, "learning_rate": 9.597097747484731e-06, "loss": 0.8197854, "memory(GiB)": 126.99, "step": 13270, "train_speed(iter/s)": 0.202827 }, { "acc": 0.77688622, "epoch": 0.30983245599965004, "grad_norm": 5.0625, "learning_rate": 9.596354473082237e-06, "loss": 0.80246162, "memory(GiB)": 126.99, "step": 13280, "train_speed(iter/s)": 0.202904 }, { "acc": 0.77225947, "epoch": 0.31006576357193893, "grad_norm": 6.46875, "learning_rate": 9.595610542551476e-06, "loss": 0.82849407, "memory(GiB)": 126.99, "step": 13290, "train_speed(iter/s)": 0.202984 }, { "acc": 0.76478229, "epoch": 0.31029907114422783, "grad_norm": 6.09375, "learning_rate": 9.594865955998648e-06, "loss": 0.85376844, "memory(GiB)": 126.99, "step": 13300, "train_speed(iter/s)": 0.203061 }, { "acc": 0.77614241, "epoch": 0.31053237871651673, "grad_norm": 4.1875, "learning_rate": 9.594120713530038e-06, "loss": 0.82821312, "memory(GiB)": 126.99, "step": 13310, "train_speed(iter/s)": 0.203142 }, { "acc": 0.76965942, "epoch": 0.31076568628880563, "grad_norm": 6.125, "learning_rate": 9.59337481525203e-06, "loss": 0.83385906, "memory(GiB)": 126.99, "step": 13320, "train_speed(iter/s)": 0.20322 }, { "acc": 0.7630805, "epoch": 0.31099899386109453, "grad_norm": 6.96875, "learning_rate": 9.592628261271102e-06, "loss": 0.8761137, "memory(GiB)": 126.99, "step": 13330, "train_speed(iter/s)": 0.203301 }, { "acc": 0.76680398, "epoch": 0.3112323014333834, "grad_norm": 5.28125, "learning_rate": 9.591881051693826e-06, "loss": 0.85415878, "memory(GiB)": 126.99, "step": 13340, "train_speed(iter/s)": 0.20338 }, { "acc": 0.77896576, "epoch": 0.3114656090056723, "grad_norm": 6.53125, "learning_rate": 9.591133186626861e-06, "loss": 0.80970163, "memory(GiB)": 126.99, "step": 13350, "train_speed(iter/s)": 0.203457 }, { "acc": 0.75927386, "epoch": 0.31169891657796117, "grad_norm": 4.84375, "learning_rate": 9.590384666176968e-06, "loss": 0.86237316, "memory(GiB)": 126.99, "step": 13360, "train_speed(iter/s)": 0.203535 }, { "acc": 0.77169557, "epoch": 0.31193222415025007, "grad_norm": 4.15625, "learning_rate": 9.589635490450999e-06, "loss": 0.80186424, "memory(GiB)": 126.99, "step": 13370, "train_speed(iter/s)": 0.203611 }, { "acc": 0.78818069, "epoch": 0.31216553172253897, "grad_norm": 5.0, "learning_rate": 9.588885659555895e-06, "loss": 0.76719275, "memory(GiB)": 126.99, "step": 13380, "train_speed(iter/s)": 0.20369 }, { "acc": 0.74860768, "epoch": 0.31239883929482787, "grad_norm": 7.09375, "learning_rate": 9.588135173598696e-06, "loss": 0.91373577, "memory(GiB)": 126.99, "step": 13390, "train_speed(iter/s)": 0.203769 }, { "acc": 0.77999992, "epoch": 0.31263214686711677, "grad_norm": 9.8125, "learning_rate": 9.587384032686536e-06, "loss": 0.78641224, "memory(GiB)": 126.99, "step": 13400, "train_speed(iter/s)": 0.203846 }, { "acc": 0.78604879, "epoch": 0.31286545443940567, "grad_norm": 5.6875, "learning_rate": 9.586632236926637e-06, "loss": 0.76967993, "memory(GiB)": 126.99, "step": 13410, "train_speed(iter/s)": 0.203919 }, { "acc": 0.76556487, "epoch": 0.31309876201169456, "grad_norm": 5.1875, "learning_rate": 9.585879786426317e-06, "loss": 0.83489494, "memory(GiB)": 126.99, "step": 13420, "train_speed(iter/s)": 0.203997 }, { "acc": 0.77812591, "epoch": 0.3133320695839834, "grad_norm": 4.3125, "learning_rate": 9.585126681292991e-06, "loss": 0.7988245, "memory(GiB)": 126.99, "step": 13430, "train_speed(iter/s)": 0.204078 }, { "acc": 0.77678261, "epoch": 0.3135653771562723, "grad_norm": 6.71875, "learning_rate": 9.584372921634164e-06, "loss": 0.82126627, "memory(GiB)": 126.99, "step": 13440, "train_speed(iter/s)": 0.204156 }, { "acc": 0.77149763, "epoch": 0.3137986847285612, "grad_norm": 6.34375, "learning_rate": 9.583618507557433e-06, "loss": 0.82525063, "memory(GiB)": 126.99, "step": 13450, "train_speed(iter/s)": 0.204235 }, { "acc": 0.76586752, "epoch": 0.3140319923008501, "grad_norm": 4.6875, "learning_rate": 9.582863439170493e-06, "loss": 0.83788195, "memory(GiB)": 126.99, "step": 13460, "train_speed(iter/s)": 0.204314 }, { "acc": 0.77512035, "epoch": 0.314265299873139, "grad_norm": 4.8125, "learning_rate": 9.582107716581125e-06, "loss": 0.82719707, "memory(GiB)": 126.99, "step": 13470, "train_speed(iter/s)": 0.204394 }, { "acc": 0.79411778, "epoch": 0.3144986074454279, "grad_norm": 12.1875, "learning_rate": 9.581351339897215e-06, "loss": 0.73287711, "memory(GiB)": 126.99, "step": 13480, "train_speed(iter/s)": 0.204473 }, { "acc": 0.78208966, "epoch": 0.3147319150177168, "grad_norm": 5.03125, "learning_rate": 9.580594309226731e-06, "loss": 0.75941992, "memory(GiB)": 126.99, "step": 13490, "train_speed(iter/s)": 0.204553 }, { "acc": 0.75446653, "epoch": 0.3149652225900057, "grad_norm": 6.65625, "learning_rate": 9.579836624677742e-06, "loss": 0.89500237, "memory(GiB)": 126.99, "step": 13500, "train_speed(iter/s)": 0.204631 }, { "epoch": 0.3149652225900057, "eval_acc": 0.7354596643921382, "eval_loss": 0.8395183086395264, "eval_runtime": 1270.619, "eval_samples_per_second": 28.326, "eval_steps_per_second": 14.163, "step": 13500 }, { "acc": 0.7628726, "epoch": 0.3151985301622946, "grad_norm": 4.3125, "learning_rate": 9.579078286358403e-06, "loss": 0.86440277, "memory(GiB)": 126.99, "step": 13510, "train_speed(iter/s)": 0.200788 }, { "acc": 0.76125512, "epoch": 0.3154318377345835, "grad_norm": 5.4375, "learning_rate": 9.578319294376968e-06, "loss": 0.86319227, "memory(GiB)": 126.99, "step": 13520, "train_speed(iter/s)": 0.200865 }, { "acc": 0.76013141, "epoch": 0.31566514530687234, "grad_norm": 5.59375, "learning_rate": 9.577559648841785e-06, "loss": 0.85533056, "memory(GiB)": 126.99, "step": 13530, "train_speed(iter/s)": 0.200944 }, { "acc": 0.76009645, "epoch": 0.31589845287916124, "grad_norm": 6.59375, "learning_rate": 9.576799349861292e-06, "loss": 0.86631908, "memory(GiB)": 126.99, "step": 13540, "train_speed(iter/s)": 0.201021 }, { "acc": 0.76818829, "epoch": 0.31613176045145014, "grad_norm": 5.6875, "learning_rate": 9.576038397544021e-06, "loss": 0.83353624, "memory(GiB)": 126.99, "step": 13550, "train_speed(iter/s)": 0.201096 }, { "acc": 0.76592422, "epoch": 0.31636506802373904, "grad_norm": 4.8125, "learning_rate": 9.5752767919986e-06, "loss": 0.8563282, "memory(GiB)": 126.99, "step": 13560, "train_speed(iter/s)": 0.201172 }, { "acc": 0.76527843, "epoch": 0.31659837559602794, "grad_norm": 8.125, "learning_rate": 9.574514533333744e-06, "loss": 0.86033154, "memory(GiB)": 126.99, "step": 13570, "train_speed(iter/s)": 0.201252 }, { "acc": 0.77543097, "epoch": 0.31683168316831684, "grad_norm": 17.25, "learning_rate": 9.573751621658267e-06, "loss": 0.79135704, "memory(GiB)": 126.99, "step": 13580, "train_speed(iter/s)": 0.201323 }, { "acc": 0.76395407, "epoch": 0.31706499074060573, "grad_norm": 6.03125, "learning_rate": 9.572988057081076e-06, "loss": 0.85698547, "memory(GiB)": 126.99, "step": 13590, "train_speed(iter/s)": 0.2014 }, { "acc": 0.74935603, "epoch": 0.31729829831289463, "grad_norm": 5.8125, "learning_rate": 9.572223839711168e-06, "loss": 0.89535446, "memory(GiB)": 126.99, "step": 13600, "train_speed(iter/s)": 0.20148 }, { "acc": 0.76039143, "epoch": 0.31753160588518353, "grad_norm": 4.4375, "learning_rate": 9.571458969657634e-06, "loss": 0.85859356, "memory(GiB)": 126.99, "step": 13610, "train_speed(iter/s)": 0.201561 }, { "acc": 0.79370561, "epoch": 0.31776491345747243, "grad_norm": 6.375, "learning_rate": 9.570693447029662e-06, "loss": 0.73161249, "memory(GiB)": 126.99, "step": 13620, "train_speed(iter/s)": 0.20163 }, { "acc": 0.77307386, "epoch": 0.3179982210297613, "grad_norm": 4.8125, "learning_rate": 9.569927271936528e-06, "loss": 0.81770248, "memory(GiB)": 126.99, "step": 13630, "train_speed(iter/s)": 0.201711 }, { "acc": 0.77128267, "epoch": 0.3182315286020502, "grad_norm": 6.75, "learning_rate": 9.569160444487602e-06, "loss": 0.82179031, "memory(GiB)": 126.99, "step": 13640, "train_speed(iter/s)": 0.20179 }, { "acc": 0.78777056, "epoch": 0.31846483617433907, "grad_norm": 6.375, "learning_rate": 9.56839296479235e-06, "loss": 0.74274769, "memory(GiB)": 126.99, "step": 13650, "train_speed(iter/s)": 0.201867 }, { "acc": 0.79498763, "epoch": 0.31869814374662797, "grad_norm": 6.5, "learning_rate": 9.56762483296033e-06, "loss": 0.72544613, "memory(GiB)": 126.99, "step": 13660, "train_speed(iter/s)": 0.201949 }, { "acc": 0.76066217, "epoch": 0.31893145131891687, "grad_norm": 5.625, "learning_rate": 9.566856049101192e-06, "loss": 0.86741734, "memory(GiB)": 126.99, "step": 13670, "train_speed(iter/s)": 0.20202 }, { "acc": 0.77316117, "epoch": 0.31916475889120577, "grad_norm": 5.09375, "learning_rate": 9.56608661332468e-06, "loss": 0.80373688, "memory(GiB)": 126.99, "step": 13680, "train_speed(iter/s)": 0.202091 }, { "acc": 0.76230974, "epoch": 0.31939806646349467, "grad_norm": 3.96875, "learning_rate": 9.56531652574063e-06, "loss": 0.8677124, "memory(GiB)": 126.99, "step": 13690, "train_speed(iter/s)": 0.20217 }, { "acc": 0.77418733, "epoch": 0.31963137403578357, "grad_norm": 10.375, "learning_rate": 9.564545786458971e-06, "loss": 0.81346025, "memory(GiB)": 126.99, "step": 13700, "train_speed(iter/s)": 0.202244 }, { "acc": 0.77347922, "epoch": 0.31986468160807247, "grad_norm": 3.984375, "learning_rate": 9.563774395589728e-06, "loss": 0.79495201, "memory(GiB)": 126.99, "step": 13710, "train_speed(iter/s)": 0.202319 }, { "acc": 0.76973238, "epoch": 0.3200979891803613, "grad_norm": 5.28125, "learning_rate": 9.563002353243019e-06, "loss": 0.84068336, "memory(GiB)": 126.99, "step": 13720, "train_speed(iter/s)": 0.202395 }, { "acc": 0.74852266, "epoch": 0.3203312967526502, "grad_norm": 9.8125, "learning_rate": 9.562229659529046e-06, "loss": 0.91825085, "memory(GiB)": 126.99, "step": 13730, "train_speed(iter/s)": 0.202472 }, { "acc": 0.77733259, "epoch": 0.3205646043249391, "grad_norm": 4.71875, "learning_rate": 9.561456314558116e-06, "loss": 0.79782739, "memory(GiB)": 126.99, "step": 13740, "train_speed(iter/s)": 0.202545 }, { "acc": 0.75323248, "epoch": 0.320797911897228, "grad_norm": 8.3125, "learning_rate": 9.560682318440619e-06, "loss": 0.88182631, "memory(GiB)": 126.99, "step": 13750, "train_speed(iter/s)": 0.202621 }, { "acc": 0.76970348, "epoch": 0.3210312194695169, "grad_norm": 6.5625, "learning_rate": 9.55990767128705e-06, "loss": 0.84244089, "memory(GiB)": 126.99, "step": 13760, "train_speed(iter/s)": 0.202694 }, { "acc": 0.7580492, "epoch": 0.3212645270418058, "grad_norm": 5.46875, "learning_rate": 9.559132373207984e-06, "loss": 0.87328644, "memory(GiB)": 126.99, "step": 13770, "train_speed(iter/s)": 0.202774 }, { "acc": 0.74232302, "epoch": 0.3214978346140947, "grad_norm": 5.09375, "learning_rate": 9.558356424314095e-06, "loss": 0.94130011, "memory(GiB)": 126.99, "step": 13780, "train_speed(iter/s)": 0.202853 }, { "acc": 0.77524061, "epoch": 0.3217311421863836, "grad_norm": 5.375, "learning_rate": 9.557579824716152e-06, "loss": 0.81230278, "memory(GiB)": 126.99, "step": 13790, "train_speed(iter/s)": 0.20293 }, { "acc": 0.76253085, "epoch": 0.3219644497586725, "grad_norm": 6.25, "learning_rate": 9.556802574525013e-06, "loss": 0.86086216, "memory(GiB)": 126.99, "step": 13800, "train_speed(iter/s)": 0.203005 }, { "acc": 0.7655097, "epoch": 0.3221977573309614, "grad_norm": 5.125, "learning_rate": 9.556024673851629e-06, "loss": 0.86178303, "memory(GiB)": 126.99, "step": 13810, "train_speed(iter/s)": 0.203083 }, { "acc": 0.74736996, "epoch": 0.32243106490325024, "grad_norm": 6.4375, "learning_rate": 9.555246122807047e-06, "loss": 0.92181644, "memory(GiB)": 126.99, "step": 13820, "train_speed(iter/s)": 0.20316 }, { "acc": 0.77784414, "epoch": 0.32266437247553914, "grad_norm": 3.796875, "learning_rate": 9.554466921502405e-06, "loss": 0.80609112, "memory(GiB)": 126.99, "step": 13830, "train_speed(iter/s)": 0.203234 }, { "acc": 0.76699972, "epoch": 0.32289768004782804, "grad_norm": 6.21875, "learning_rate": 9.553687070048934e-06, "loss": 0.8321928, "memory(GiB)": 126.99, "step": 13840, "train_speed(iter/s)": 0.203305 }, { "acc": 0.76653528, "epoch": 0.32313098762011694, "grad_norm": 9.6875, "learning_rate": 9.552906568557953e-06, "loss": 0.85399723, "memory(GiB)": 126.99, "step": 13850, "train_speed(iter/s)": 0.203377 }, { "acc": 0.78444996, "epoch": 0.32336429519240584, "grad_norm": 4.78125, "learning_rate": 9.552125417140885e-06, "loss": 0.75022516, "memory(GiB)": 126.99, "step": 13860, "train_speed(iter/s)": 0.203452 }, { "acc": 0.75712585, "epoch": 0.32359760276469474, "grad_norm": 5.1875, "learning_rate": 9.551343615909236e-06, "loss": 0.87373152, "memory(GiB)": 126.99, "step": 13870, "train_speed(iter/s)": 0.203521 }, { "acc": 0.77796164, "epoch": 0.32383091033698364, "grad_norm": 5.125, "learning_rate": 9.550561164974606e-06, "loss": 0.79385543, "memory(GiB)": 126.99, "step": 13880, "train_speed(iter/s)": 0.203594 }, { "acc": 0.77821608, "epoch": 0.32406421790927253, "grad_norm": 7.8125, "learning_rate": 9.549778064448693e-06, "loss": 0.79453735, "memory(GiB)": 126.99, "step": 13890, "train_speed(iter/s)": 0.203671 }, { "acc": 0.77085085, "epoch": 0.32429752548156143, "grad_norm": 6.875, "learning_rate": 9.548994314443284e-06, "loss": 0.83549728, "memory(GiB)": 126.99, "step": 13900, "train_speed(iter/s)": 0.203749 }, { "acc": 0.7559999, "epoch": 0.32453083305385033, "grad_norm": 5.65625, "learning_rate": 9.548209915070256e-06, "loss": 0.88760204, "memory(GiB)": 126.99, "step": 13910, "train_speed(iter/s)": 0.203819 }, { "acc": 0.7560101, "epoch": 0.3247641406261392, "grad_norm": 5.65625, "learning_rate": 9.547424866441586e-06, "loss": 0.8699544, "memory(GiB)": 126.99, "step": 13920, "train_speed(iter/s)": 0.203893 }, { "acc": 0.77840662, "epoch": 0.3249974481984281, "grad_norm": 6.53125, "learning_rate": 9.546639168669336e-06, "loss": 0.80326424, "memory(GiB)": 126.99, "step": 13930, "train_speed(iter/s)": 0.203969 }, { "acc": 0.77628956, "epoch": 0.325230755770717, "grad_norm": 5.75, "learning_rate": 9.545852821865667e-06, "loss": 0.81750832, "memory(GiB)": 126.99, "step": 13940, "train_speed(iter/s)": 0.204043 }, { "acc": 0.76031036, "epoch": 0.32546406334300587, "grad_norm": 8.125, "learning_rate": 9.545065826142825e-06, "loss": 0.87485161, "memory(GiB)": 126.99, "step": 13950, "train_speed(iter/s)": 0.204122 }, { "acc": 0.76914015, "epoch": 0.32569737091529477, "grad_norm": 4.15625, "learning_rate": 9.544278181613158e-06, "loss": 0.82654829, "memory(GiB)": 126.99, "step": 13960, "train_speed(iter/s)": 0.204195 }, { "acc": 0.77492094, "epoch": 0.32593067848758367, "grad_norm": 5.1875, "learning_rate": 9.543489888389103e-06, "loss": 0.83645306, "memory(GiB)": 126.99, "step": 13970, "train_speed(iter/s)": 0.20427 }, { "acc": 0.77900114, "epoch": 0.32616398605987257, "grad_norm": 5.28125, "learning_rate": 9.542700946583184e-06, "loss": 0.79197259, "memory(GiB)": 126.99, "step": 13980, "train_speed(iter/s)": 0.204347 }, { "acc": 0.77285557, "epoch": 0.32639729363216147, "grad_norm": 5.5, "learning_rate": 9.541911356308025e-06, "loss": 0.83121672, "memory(GiB)": 126.99, "step": 13990, "train_speed(iter/s)": 0.204424 }, { "acc": 0.76080709, "epoch": 0.32663060120445037, "grad_norm": 4.71875, "learning_rate": 9.541121117676339e-06, "loss": 0.86064758, "memory(GiB)": 126.99, "step": 14000, "train_speed(iter/s)": 0.204499 }, { "epoch": 0.32663060120445037, "eval_acc": 0.7354247529494964, "eval_loss": 0.8388969898223877, "eval_runtime": 1270.5843, "eval_samples_per_second": 28.326, "eval_steps_per_second": 14.164, "step": 14000 }, { "acc": 0.7774581, "epoch": 0.3268639087767392, "grad_norm": 4.84375, "learning_rate": 9.540330230800935e-06, "loss": 0.79988861, "memory(GiB)": 126.99, "step": 14010, "train_speed(iter/s)": 0.200798 }, { "acc": 0.78027916, "epoch": 0.3270972163490281, "grad_norm": 5.875, "learning_rate": 9.539538695794708e-06, "loss": 0.76487494, "memory(GiB)": 126.99, "step": 14020, "train_speed(iter/s)": 0.200878 }, { "acc": 0.77316322, "epoch": 0.327330523921317, "grad_norm": 5.46875, "learning_rate": 9.53874651277065e-06, "loss": 0.80076408, "memory(GiB)": 126.99, "step": 14030, "train_speed(iter/s)": 0.200953 }, { "acc": 0.79859195, "epoch": 0.3275638314936059, "grad_norm": 4.53125, "learning_rate": 9.537953681841847e-06, "loss": 0.72243781, "memory(GiB)": 126.99, "step": 14040, "train_speed(iter/s)": 0.201027 }, { "acc": 0.77445354, "epoch": 0.3277971390658948, "grad_norm": 5.5, "learning_rate": 9.537160203121474e-06, "loss": 0.80300465, "memory(GiB)": 126.99, "step": 14050, "train_speed(iter/s)": 0.201102 }, { "acc": 0.78483415, "epoch": 0.3280304466381837, "grad_norm": 4.71875, "learning_rate": 9.536366076722799e-06, "loss": 0.77799454, "memory(GiB)": 126.99, "step": 14060, "train_speed(iter/s)": 0.201175 }, { "acc": 0.76576405, "epoch": 0.3282637542104726, "grad_norm": 6.125, "learning_rate": 9.535571302759184e-06, "loss": 0.84655552, "memory(GiB)": 126.99, "step": 14070, "train_speed(iter/s)": 0.201249 }, { "acc": 0.76197848, "epoch": 0.3284970617827615, "grad_norm": 8.1875, "learning_rate": 9.534775881344086e-06, "loss": 0.8686224, "memory(GiB)": 126.99, "step": 14080, "train_speed(iter/s)": 0.201318 }, { "acc": 0.7604651, "epoch": 0.3287303693550504, "grad_norm": 6.40625, "learning_rate": 9.533979812591046e-06, "loss": 0.83883457, "memory(GiB)": 126.99, "step": 14090, "train_speed(iter/s)": 0.201389 }, { "acc": 0.77590809, "epoch": 0.3289636769273393, "grad_norm": 4.96875, "learning_rate": 9.533183096613705e-06, "loss": 0.80737171, "memory(GiB)": 126.99, "step": 14100, "train_speed(iter/s)": 0.201463 }, { "acc": 0.7858882, "epoch": 0.32919698449962814, "grad_norm": 5.0, "learning_rate": 9.532385733525793e-06, "loss": 0.7538044, "memory(GiB)": 126.99, "step": 14110, "train_speed(iter/s)": 0.201532 }, { "acc": 0.78518639, "epoch": 0.32943029207191704, "grad_norm": 5.84375, "learning_rate": 9.531587723441136e-06, "loss": 0.77603545, "memory(GiB)": 126.99, "step": 14120, "train_speed(iter/s)": 0.201604 }, { "acc": 0.7789793, "epoch": 0.32966359964420594, "grad_norm": 4.5625, "learning_rate": 9.530789066473648e-06, "loss": 0.78307705, "memory(GiB)": 126.99, "step": 14130, "train_speed(iter/s)": 0.201676 }, { "acc": 0.75844407, "epoch": 0.32989690721649484, "grad_norm": 5.75, "learning_rate": 9.529989762737336e-06, "loss": 0.85525703, "memory(GiB)": 126.99, "step": 14140, "train_speed(iter/s)": 0.201751 }, { "acc": 0.76479483, "epoch": 0.33013021478878374, "grad_norm": 6.8125, "learning_rate": 9.529189812346303e-06, "loss": 0.85921402, "memory(GiB)": 126.99, "step": 14150, "train_speed(iter/s)": 0.201826 }, { "acc": 0.77646847, "epoch": 0.33036352236107264, "grad_norm": 6.0, "learning_rate": 9.528389215414737e-06, "loss": 0.81476002, "memory(GiB)": 126.99, "step": 14160, "train_speed(iter/s)": 0.201901 }, { "acc": 0.77587495, "epoch": 0.33059682993336154, "grad_norm": 6.09375, "learning_rate": 9.527587972056929e-06, "loss": 0.79444175, "memory(GiB)": 126.99, "step": 14170, "train_speed(iter/s)": 0.201978 }, { "acc": 0.787714, "epoch": 0.33083013750565043, "grad_norm": 5.28125, "learning_rate": 9.526786082387251e-06, "loss": 0.73652754, "memory(GiB)": 126.99, "step": 14180, "train_speed(iter/s)": 0.20205 }, { "acc": 0.77357411, "epoch": 0.33106344507793933, "grad_norm": 4.4375, "learning_rate": 9.525983546520176e-06, "loss": 0.83334599, "memory(GiB)": 126.99, "step": 14190, "train_speed(iter/s)": 0.202123 }, { "acc": 0.78118382, "epoch": 0.3312967526502282, "grad_norm": 7.0, "learning_rate": 9.525180364570265e-06, "loss": 0.80493126, "memory(GiB)": 126.99, "step": 14200, "train_speed(iter/s)": 0.202197 }, { "acc": 0.75183415, "epoch": 0.3315300602225171, "grad_norm": 4.5625, "learning_rate": 9.52437653665217e-06, "loss": 0.91054926, "memory(GiB)": 126.99, "step": 14210, "train_speed(iter/s)": 0.202276 }, { "acc": 0.78139944, "epoch": 0.331763367794806, "grad_norm": 9.0, "learning_rate": 9.52357206288064e-06, "loss": 0.79247675, "memory(GiB)": 126.99, "step": 14220, "train_speed(iter/s)": 0.20235 }, { "acc": 0.75987701, "epoch": 0.3319966753670949, "grad_norm": 6.0625, "learning_rate": 9.522766943370512e-06, "loss": 0.87182941, "memory(GiB)": 126.99, "step": 14230, "train_speed(iter/s)": 0.202422 }, { "acc": 0.77394571, "epoch": 0.33222998293938377, "grad_norm": 7.53125, "learning_rate": 9.521961178236716e-06, "loss": 0.81149826, "memory(GiB)": 126.99, "step": 14240, "train_speed(iter/s)": 0.202493 }, { "acc": 0.77338943, "epoch": 0.33246329051167267, "grad_norm": 4.3125, "learning_rate": 9.521154767594276e-06, "loss": 0.81733627, "memory(GiB)": 126.99, "step": 14250, "train_speed(iter/s)": 0.202568 }, { "acc": 0.75781255, "epoch": 0.33269659808396157, "grad_norm": 6.9375, "learning_rate": 9.520347711558306e-06, "loss": 0.88546219, "memory(GiB)": 126.99, "step": 14260, "train_speed(iter/s)": 0.202638 }, { "acc": 0.76396008, "epoch": 0.33292990565625047, "grad_norm": 5.53125, "learning_rate": 9.519540010244013e-06, "loss": 0.86021147, "memory(GiB)": 126.99, "step": 14270, "train_speed(iter/s)": 0.202717 }, { "acc": 0.76865139, "epoch": 0.33316321322853937, "grad_norm": 4.78125, "learning_rate": 9.518731663766697e-06, "loss": 0.8223979, "memory(GiB)": 126.99, "step": 14280, "train_speed(iter/s)": 0.202789 }, { "acc": 0.78064613, "epoch": 0.33339652080082827, "grad_norm": 5.71875, "learning_rate": 9.517922672241748e-06, "loss": 0.78749752, "memory(GiB)": 126.99, "step": 14290, "train_speed(iter/s)": 0.202864 }, { "acc": 0.79933729, "epoch": 0.3336298283731171, "grad_norm": 7.34375, "learning_rate": 9.517113035784651e-06, "loss": 0.7244029, "memory(GiB)": 126.99, "step": 14300, "train_speed(iter/s)": 0.20294 }, { "acc": 0.78056884, "epoch": 0.333863135945406, "grad_norm": 7.25, "learning_rate": 9.51630275451098e-06, "loss": 0.79498949, "memory(GiB)": 126.99, "step": 14310, "train_speed(iter/s)": 0.203011 }, { "acc": 0.7816524, "epoch": 0.3340964435176949, "grad_norm": 6.0625, "learning_rate": 9.515491828536403e-06, "loss": 0.77081575, "memory(GiB)": 126.99, "step": 14320, "train_speed(iter/s)": 0.203084 }, { "acc": 0.76975269, "epoch": 0.3343297510899838, "grad_norm": 5.0625, "learning_rate": 9.51468025797668e-06, "loss": 0.8435153, "memory(GiB)": 126.99, "step": 14330, "train_speed(iter/s)": 0.203157 }, { "acc": 0.79847164, "epoch": 0.3345630586622727, "grad_norm": 6.90625, "learning_rate": 9.51386804294766e-06, "loss": 0.7151453, "memory(GiB)": 126.99, "step": 14340, "train_speed(iter/s)": 0.20323 }, { "acc": 0.78779755, "epoch": 0.3347963662345616, "grad_norm": 5.21875, "learning_rate": 9.51305518356529e-06, "loss": 0.75147953, "memory(GiB)": 126.99, "step": 14350, "train_speed(iter/s)": 0.2033 }, { "acc": 0.7548315, "epoch": 0.3350296738068505, "grad_norm": 7.09375, "learning_rate": 9.512241679945602e-06, "loss": 0.89756851, "memory(GiB)": 126.99, "step": 14360, "train_speed(iter/s)": 0.203372 }, { "acc": 0.7766715, "epoch": 0.3352629813791394, "grad_norm": 6.3125, "learning_rate": 9.511427532204725e-06, "loss": 0.79562111, "memory(GiB)": 126.99, "step": 14370, "train_speed(iter/s)": 0.20344 }, { "acc": 0.77851067, "epoch": 0.3354962889514283, "grad_norm": 6.46875, "learning_rate": 9.51061274045888e-06, "loss": 0.7912365, "memory(GiB)": 126.99, "step": 14380, "train_speed(iter/s)": 0.203516 }, { "acc": 0.76406937, "epoch": 0.3357295965237172, "grad_norm": 5.46875, "learning_rate": 9.509797304824376e-06, "loss": 0.87257404, "memory(GiB)": 126.99, "step": 14390, "train_speed(iter/s)": 0.203587 }, { "acc": 0.77162414, "epoch": 0.33596290409600604, "grad_norm": 6.21875, "learning_rate": 9.508981225417615e-06, "loss": 0.81675949, "memory(GiB)": 126.99, "step": 14400, "train_speed(iter/s)": 0.203654 }, { "acc": 0.76914759, "epoch": 0.33619621166829494, "grad_norm": 5.53125, "learning_rate": 9.508164502355095e-06, "loss": 0.83779526, "memory(GiB)": 126.99, "step": 14410, "train_speed(iter/s)": 0.203724 }, { "acc": 0.77464485, "epoch": 0.33642951924058384, "grad_norm": 4.53125, "learning_rate": 9.507347135753403e-06, "loss": 0.79855938, "memory(GiB)": 126.99, "step": 14420, "train_speed(iter/s)": 0.203795 }, { "acc": 0.77766848, "epoch": 0.33666282681287274, "grad_norm": 7.875, "learning_rate": 9.506529125729216e-06, "loss": 0.80243073, "memory(GiB)": 126.99, "step": 14430, "train_speed(iter/s)": 0.203867 }, { "acc": 0.77303267, "epoch": 0.33689613438516164, "grad_norm": 5.90625, "learning_rate": 9.505710472399306e-06, "loss": 0.79739962, "memory(GiB)": 126.99, "step": 14440, "train_speed(iter/s)": 0.203937 }, { "acc": 0.76884651, "epoch": 0.33712944195745054, "grad_norm": 4.78125, "learning_rate": 9.504891175880533e-06, "loss": 0.82474422, "memory(GiB)": 126.99, "step": 14450, "train_speed(iter/s)": 0.204006 }, { "acc": 0.76067557, "epoch": 0.33736274952973944, "grad_norm": 6.1875, "learning_rate": 9.504071236289856e-06, "loss": 0.87364044, "memory(GiB)": 126.99, "step": 14460, "train_speed(iter/s)": 0.204078 }, { "acc": 0.77003412, "epoch": 0.33759605710202834, "grad_norm": 5.3125, "learning_rate": 9.503250653744316e-06, "loss": 0.81374006, "memory(GiB)": 126.99, "step": 14470, "train_speed(iter/s)": 0.204147 }, { "acc": 0.77818384, "epoch": 0.33782936467431723, "grad_norm": 5.4375, "learning_rate": 9.502429428361055e-06, "loss": 0.81029873, "memory(GiB)": 126.99, "step": 14480, "train_speed(iter/s)": 0.204217 }, { "acc": 0.74643278, "epoch": 0.3380626722466061, "grad_norm": 5.4375, "learning_rate": 9.5016075602573e-06, "loss": 0.92115335, "memory(GiB)": 126.99, "step": 14490, "train_speed(iter/s)": 0.20429 }, { "acc": 0.76585412, "epoch": 0.338295979818895, "grad_norm": 9.25, "learning_rate": 9.500785049550373e-06, "loss": 0.83918991, "memory(GiB)": 126.99, "step": 14500, "train_speed(iter/s)": 0.204369 }, { "epoch": 0.338295979818895, "eval_acc": 0.7356140434320767, "eval_loss": 0.8385112881660461, "eval_runtime": 1269.8549, "eval_samples_per_second": 28.343, "eval_steps_per_second": 14.172, "step": 14500 }, { "acc": 0.77105675, "epoch": 0.3385292873911839, "grad_norm": 4.25, "learning_rate": 9.49996189635769e-06, "loss": 0.81464977, "memory(GiB)": 126.99, "step": 14510, "train_speed(iter/s)": 0.200798 }, { "acc": 0.76173067, "epoch": 0.3387625949634728, "grad_norm": 4.75, "learning_rate": 9.499138100796752e-06, "loss": 0.85862141, "memory(GiB)": 126.99, "step": 14520, "train_speed(iter/s)": 0.200874 }, { "acc": 0.76991301, "epoch": 0.3389959025357617, "grad_norm": 4.5, "learning_rate": 9.498313662985159e-06, "loss": 0.83255396, "memory(GiB)": 126.99, "step": 14530, "train_speed(iter/s)": 0.200942 }, { "acc": 0.77943153, "epoch": 0.33922921010805057, "grad_norm": 5.21875, "learning_rate": 9.497488583040595e-06, "loss": 0.79724607, "memory(GiB)": 126.99, "step": 14540, "train_speed(iter/s)": 0.201014 }, { "acc": 0.76977105, "epoch": 0.33946251768033947, "grad_norm": 6.40625, "learning_rate": 9.496662861080842e-06, "loss": 0.83135042, "memory(GiB)": 126.99, "step": 14550, "train_speed(iter/s)": 0.201086 }, { "acc": 0.78565388, "epoch": 0.33969582525262837, "grad_norm": 7.28125, "learning_rate": 9.495836497223775e-06, "loss": 0.74578938, "memory(GiB)": 126.99, "step": 14560, "train_speed(iter/s)": 0.201157 }, { "acc": 0.75958099, "epoch": 0.33992913282491727, "grad_norm": 5.78125, "learning_rate": 9.49500949158735e-06, "loss": 0.88202744, "memory(GiB)": 126.99, "step": 14570, "train_speed(iter/s)": 0.201229 }, { "acc": 0.78799295, "epoch": 0.34016244039720617, "grad_norm": 5.90625, "learning_rate": 9.494181844289629e-06, "loss": 0.75782499, "memory(GiB)": 126.99, "step": 14580, "train_speed(iter/s)": 0.201295 }, { "acc": 0.78360558, "epoch": 0.340395747969495, "grad_norm": 4.78125, "learning_rate": 9.493353555448754e-06, "loss": 0.75555897, "memory(GiB)": 135.77, "step": 14590, "train_speed(iter/s)": 0.201362 }, { "acc": 0.78057513, "epoch": 0.3406290555417839, "grad_norm": 6.8125, "learning_rate": 9.492524625182965e-06, "loss": 0.76722956, "memory(GiB)": 135.77, "step": 14600, "train_speed(iter/s)": 0.201432 }, { "acc": 0.77534466, "epoch": 0.3408623631140728, "grad_norm": 4.8125, "learning_rate": 9.49169505361059e-06, "loss": 0.8156085, "memory(GiB)": 135.77, "step": 14610, "train_speed(iter/s)": 0.201504 }, { "acc": 0.7402894, "epoch": 0.3410956706863617, "grad_norm": 10.5, "learning_rate": 9.490864840850051e-06, "loss": 0.96359291, "memory(GiB)": 135.77, "step": 14620, "train_speed(iter/s)": 0.201581 }, { "acc": 0.7706079, "epoch": 0.3413289782586506, "grad_norm": 7.40625, "learning_rate": 9.490033987019862e-06, "loss": 0.83333149, "memory(GiB)": 135.77, "step": 14630, "train_speed(iter/s)": 0.201651 }, { "acc": 0.76909857, "epoch": 0.3415622858309395, "grad_norm": 4.6875, "learning_rate": 9.489202492238624e-06, "loss": 0.82511196, "memory(GiB)": 135.77, "step": 14640, "train_speed(iter/s)": 0.201726 }, { "acc": 0.75274253, "epoch": 0.3417955934032284, "grad_norm": 5.3125, "learning_rate": 9.488370356625035e-06, "loss": 0.8952776, "memory(GiB)": 135.77, "step": 14650, "train_speed(iter/s)": 0.2018 }, { "acc": 0.78073397, "epoch": 0.3420289009755173, "grad_norm": 12.375, "learning_rate": 9.487537580297881e-06, "loss": 0.7810813, "memory(GiB)": 135.77, "step": 14660, "train_speed(iter/s)": 0.201871 }, { "acc": 0.75856628, "epoch": 0.3422622085478062, "grad_norm": 4.5625, "learning_rate": 9.486704163376041e-06, "loss": 0.89207325, "memory(GiB)": 135.77, "step": 14670, "train_speed(iter/s)": 0.201942 }, { "acc": 0.77476802, "epoch": 0.3424955161200951, "grad_norm": 6.28125, "learning_rate": 9.485870105978487e-06, "loss": 0.78234882, "memory(GiB)": 135.77, "step": 14680, "train_speed(iter/s)": 0.202016 }, { "acc": 0.76214299, "epoch": 0.34272882369238394, "grad_norm": 4.78125, "learning_rate": 9.485035408224277e-06, "loss": 0.84753475, "memory(GiB)": 135.77, "step": 14690, "train_speed(iter/s)": 0.202093 }, { "acc": 0.77669897, "epoch": 0.34296213126467284, "grad_norm": 7.3125, "learning_rate": 9.484200070232565e-06, "loss": 0.82079849, "memory(GiB)": 135.77, "step": 14700, "train_speed(iter/s)": 0.202169 }, { "acc": 0.76222053, "epoch": 0.34319543883696174, "grad_norm": 5.125, "learning_rate": 9.483364092122595e-06, "loss": 0.86403046, "memory(GiB)": 135.77, "step": 14710, "train_speed(iter/s)": 0.20224 }, { "acc": 0.77041807, "epoch": 0.34342874640925064, "grad_norm": 6.03125, "learning_rate": 9.482527474013705e-06, "loss": 0.84138517, "memory(GiB)": 135.77, "step": 14720, "train_speed(iter/s)": 0.202309 }, { "acc": 0.769876, "epoch": 0.34366205398153954, "grad_norm": 8.125, "learning_rate": 9.481690216025321e-06, "loss": 0.83939104, "memory(GiB)": 135.77, "step": 14730, "train_speed(iter/s)": 0.202379 }, { "acc": 0.76854458, "epoch": 0.34389536155382844, "grad_norm": 7.75, "learning_rate": 9.480852318276958e-06, "loss": 0.8580862, "memory(GiB)": 135.77, "step": 14740, "train_speed(iter/s)": 0.202448 }, { "acc": 0.79126053, "epoch": 0.34412866912611734, "grad_norm": 7.03125, "learning_rate": 9.48001378088823e-06, "loss": 0.74607038, "memory(GiB)": 135.77, "step": 14750, "train_speed(iter/s)": 0.202516 }, { "acc": 0.77793612, "epoch": 0.34436197669840624, "grad_norm": 5.53125, "learning_rate": 9.479174603978836e-06, "loss": 0.7889802, "memory(GiB)": 135.77, "step": 14760, "train_speed(iter/s)": 0.202585 }, { "acc": 0.78102379, "epoch": 0.34459528427069513, "grad_norm": 5.34375, "learning_rate": 9.478334787668569e-06, "loss": 0.79738607, "memory(GiB)": 135.77, "step": 14770, "train_speed(iter/s)": 0.202653 }, { "acc": 0.77394485, "epoch": 0.344828591842984, "grad_norm": 8.1875, "learning_rate": 9.477494332077311e-06, "loss": 0.81590576, "memory(GiB)": 135.77, "step": 14780, "train_speed(iter/s)": 0.202721 }, { "acc": 0.76899199, "epoch": 0.3450618994152729, "grad_norm": 4.78125, "learning_rate": 9.476653237325037e-06, "loss": 0.84678192, "memory(GiB)": 135.77, "step": 14790, "train_speed(iter/s)": 0.202792 }, { "acc": 0.77013807, "epoch": 0.3452952069875618, "grad_norm": 5.21875, "learning_rate": 9.475811503531815e-06, "loss": 0.82642403, "memory(GiB)": 135.77, "step": 14800, "train_speed(iter/s)": 0.202859 }, { "acc": 0.7706193, "epoch": 0.3455285145598507, "grad_norm": 6.15625, "learning_rate": 9.474969130817801e-06, "loss": 0.82289953, "memory(GiB)": 135.77, "step": 14810, "train_speed(iter/s)": 0.20293 }, { "acc": 0.76709123, "epoch": 0.3457618221321396, "grad_norm": 5.34375, "learning_rate": 9.474126119303245e-06, "loss": 0.85336533, "memory(GiB)": 135.77, "step": 14820, "train_speed(iter/s)": 0.202989 }, { "acc": 0.77197552, "epoch": 0.3459951297044285, "grad_norm": 7.625, "learning_rate": 9.473282469108483e-06, "loss": 0.81618519, "memory(GiB)": 135.77, "step": 14830, "train_speed(iter/s)": 0.203057 }, { "acc": 0.76123796, "epoch": 0.34622843727671737, "grad_norm": 4.625, "learning_rate": 9.472438180353948e-06, "loss": 0.87118511, "memory(GiB)": 135.77, "step": 14840, "train_speed(iter/s)": 0.203124 }, { "acc": 0.79290915, "epoch": 0.34646174484900627, "grad_norm": 5.125, "learning_rate": 9.471593253160162e-06, "loss": 0.73074284, "memory(GiB)": 135.77, "step": 14850, "train_speed(iter/s)": 0.203196 }, { "acc": 0.76475649, "epoch": 0.34669505242129517, "grad_norm": 5.15625, "learning_rate": 9.470747687647741e-06, "loss": 0.85227308, "memory(GiB)": 135.77, "step": 14860, "train_speed(iter/s)": 0.203268 }, { "acc": 0.7734736, "epoch": 0.34692835999358407, "grad_norm": 7.03125, "learning_rate": 9.469901483937384e-06, "loss": 0.79763713, "memory(GiB)": 135.77, "step": 14870, "train_speed(iter/s)": 0.203343 }, { "acc": 0.79150887, "epoch": 0.3471616675658729, "grad_norm": 4.8125, "learning_rate": 9.469054642149889e-06, "loss": 0.73478098, "memory(GiB)": 135.77, "step": 14880, "train_speed(iter/s)": 0.203411 }, { "acc": 0.77741909, "epoch": 0.3473949751381618, "grad_norm": 6.25, "learning_rate": 9.468207162406143e-06, "loss": 0.81160421, "memory(GiB)": 135.77, "step": 14890, "train_speed(iter/s)": 0.203483 }, { "acc": 0.76978407, "epoch": 0.3476282827104507, "grad_norm": 5.0, "learning_rate": 9.46735904482712e-06, "loss": 0.81876183, "memory(GiB)": 135.77, "step": 14900, "train_speed(iter/s)": 0.203552 }, { "acc": 0.77738295, "epoch": 0.3478615902827396, "grad_norm": 6.3125, "learning_rate": 9.466510289533894e-06, "loss": 0.79115467, "memory(GiB)": 135.77, "step": 14910, "train_speed(iter/s)": 0.203622 }, { "acc": 0.78486333, "epoch": 0.3480948978550285, "grad_norm": 5.96875, "learning_rate": 9.46566089664762e-06, "loss": 0.74208999, "memory(GiB)": 135.77, "step": 14920, "train_speed(iter/s)": 0.203692 }, { "acc": 0.78259773, "epoch": 0.3483282054273174, "grad_norm": 8.75, "learning_rate": 9.46481086628955e-06, "loss": 0.79701777, "memory(GiB)": 135.77, "step": 14930, "train_speed(iter/s)": 0.20376 }, { "acc": 0.77444029, "epoch": 0.3485615129996063, "grad_norm": 5.53125, "learning_rate": 9.463960198581028e-06, "loss": 0.82572346, "memory(GiB)": 135.77, "step": 14940, "train_speed(iter/s)": 0.203828 }, { "acc": 0.76832108, "epoch": 0.3487948205718952, "grad_norm": 7.4375, "learning_rate": 9.463108893643483e-06, "loss": 0.81358547, "memory(GiB)": 135.77, "step": 14950, "train_speed(iter/s)": 0.203898 }, { "acc": 0.76551371, "epoch": 0.3490281281441841, "grad_norm": 6.84375, "learning_rate": 9.46225695159844e-06, "loss": 0.86378431, "memory(GiB)": 135.77, "step": 14960, "train_speed(iter/s)": 0.203957 }, { "acc": 0.79054885, "epoch": 0.34926143571647295, "grad_norm": 4.71875, "learning_rate": 9.461404372567513e-06, "loss": 0.75698428, "memory(GiB)": 135.77, "step": 14970, "train_speed(iter/s)": 0.204027 }, { "acc": 0.76183825, "epoch": 0.34949474328876184, "grad_norm": 4.84375, "learning_rate": 9.460551156672408e-06, "loss": 0.87834749, "memory(GiB)": 135.77, "step": 14980, "train_speed(iter/s)": 0.204097 }, { "acc": 0.75772481, "epoch": 0.34972805086105074, "grad_norm": 5.0, "learning_rate": 9.459697304034923e-06, "loss": 0.87366276, "memory(GiB)": 135.77, "step": 14990, "train_speed(iter/s)": 0.204166 }, { "acc": 0.74876041, "epoch": 0.34996135843333964, "grad_norm": 5.53125, "learning_rate": 9.458842814776941e-06, "loss": 0.91464996, "memory(GiB)": 135.77, "step": 15000, "train_speed(iter/s)": 0.204229 }, { "epoch": 0.34996135843333964, "eval_acc": 0.7360832660327197, "eval_loss": 0.8364920616149902, "eval_runtime": 1270.2256, "eval_samples_per_second": 28.334, "eval_steps_per_second": 14.168, "step": 15000 }, { "acc": 0.78980742, "epoch": 0.35019466600562854, "grad_norm": 5.09375, "learning_rate": 9.457987689020444e-06, "loss": 0.74719791, "memory(GiB)": 135.77, "step": 15010, "train_speed(iter/s)": 0.200781 }, { "acc": 0.76756434, "epoch": 0.35042797357791744, "grad_norm": 6.4375, "learning_rate": 9.457131926887498e-06, "loss": 0.84798584, "memory(GiB)": 135.77, "step": 15020, "train_speed(iter/s)": 0.200849 }, { "acc": 0.7941637, "epoch": 0.35066128115020634, "grad_norm": 6.28125, "learning_rate": 9.456275528500264e-06, "loss": 0.75351415, "memory(GiB)": 135.77, "step": 15030, "train_speed(iter/s)": 0.200916 }, { "acc": 0.76890516, "epoch": 0.35089458872249524, "grad_norm": 5.53125, "learning_rate": 9.455418493980996e-06, "loss": 0.84603033, "memory(GiB)": 135.77, "step": 15040, "train_speed(iter/s)": 0.200988 }, { "acc": 0.76987128, "epoch": 0.35112789629478414, "grad_norm": 8.5625, "learning_rate": 9.454560823452031e-06, "loss": 0.82527428, "memory(GiB)": 135.77, "step": 15050, "train_speed(iter/s)": 0.201059 }, { "acc": 0.76635885, "epoch": 0.35136120386707304, "grad_norm": 6.21875, "learning_rate": 9.4537025170358e-06, "loss": 0.84537125, "memory(GiB)": 135.77, "step": 15060, "train_speed(iter/s)": 0.201127 }, { "acc": 0.78066635, "epoch": 0.3515945114393619, "grad_norm": 4.71875, "learning_rate": 9.45284357485483e-06, "loss": 0.79466629, "memory(GiB)": 135.77, "step": 15070, "train_speed(iter/s)": 0.201199 }, { "acc": 0.77449002, "epoch": 0.3518278190116508, "grad_norm": 4.65625, "learning_rate": 9.451983997031736e-06, "loss": 0.80630569, "memory(GiB)": 135.77, "step": 15080, "train_speed(iter/s)": 0.201266 }, { "acc": 0.77382059, "epoch": 0.3520611265839397, "grad_norm": 5.1875, "learning_rate": 9.451123783689216e-06, "loss": 0.81937408, "memory(GiB)": 135.77, "step": 15090, "train_speed(iter/s)": 0.201336 }, { "acc": 0.75354958, "epoch": 0.3522944341562286, "grad_norm": 6.28125, "learning_rate": 9.450262934950069e-06, "loss": 0.88799534, "memory(GiB)": 135.77, "step": 15100, "train_speed(iter/s)": 0.201403 }, { "acc": 0.76095815, "epoch": 0.3525277417285175, "grad_norm": 6.25, "learning_rate": 9.449401450937184e-06, "loss": 0.88658123, "memory(GiB)": 135.77, "step": 15110, "train_speed(iter/s)": 0.201471 }, { "acc": 0.77167931, "epoch": 0.3527610493008064, "grad_norm": 5.53125, "learning_rate": 9.448539331773532e-06, "loss": 0.83307276, "memory(GiB)": 135.77, "step": 15120, "train_speed(iter/s)": 0.201538 }, { "acc": 0.77130241, "epoch": 0.35299435687309527, "grad_norm": 6.75, "learning_rate": 9.447676577582184e-06, "loss": 0.8277441, "memory(GiB)": 135.77, "step": 15130, "train_speed(iter/s)": 0.201605 }, { "acc": 0.76398301, "epoch": 0.35322766444538417, "grad_norm": 5.5, "learning_rate": 9.446813188486294e-06, "loss": 0.85096407, "memory(GiB)": 135.77, "step": 15140, "train_speed(iter/s)": 0.201673 }, { "acc": 0.76413918, "epoch": 0.35346097201767307, "grad_norm": 7.375, "learning_rate": 9.445949164609116e-06, "loss": 0.87346344, "memory(GiB)": 135.77, "step": 15150, "train_speed(iter/s)": 0.201742 }, { "acc": 0.78450708, "epoch": 0.35369427958996197, "grad_norm": 5.125, "learning_rate": 9.445084506073985e-06, "loss": 0.77033629, "memory(GiB)": 135.77, "step": 15160, "train_speed(iter/s)": 0.201812 }, { "acc": 0.76229258, "epoch": 0.3539275871622508, "grad_norm": 7.3125, "learning_rate": 9.444219213004333e-06, "loss": 0.86626339, "memory(GiB)": 135.77, "step": 15170, "train_speed(iter/s)": 0.201878 }, { "acc": 0.76085272, "epoch": 0.3541608947345397, "grad_norm": 4.71875, "learning_rate": 9.443353285523678e-06, "loss": 0.86795692, "memory(GiB)": 135.77, "step": 15180, "train_speed(iter/s)": 0.20195 }, { "acc": 0.76711388, "epoch": 0.3543942023068286, "grad_norm": 5.5, "learning_rate": 9.442486723755633e-06, "loss": 0.82695503, "memory(GiB)": 135.77, "step": 15190, "train_speed(iter/s)": 0.202021 }, { "acc": 0.76617556, "epoch": 0.3546275098791175, "grad_norm": 7.375, "learning_rate": 9.4416195278239e-06, "loss": 0.82697554, "memory(GiB)": 135.77, "step": 15200, "train_speed(iter/s)": 0.202089 }, { "acc": 0.76866422, "epoch": 0.3548608174514064, "grad_norm": 4.28125, "learning_rate": 9.440751697852268e-06, "loss": 0.84350128, "memory(GiB)": 135.77, "step": 15210, "train_speed(iter/s)": 0.202153 }, { "acc": 0.78142757, "epoch": 0.3550941250236953, "grad_norm": 9.0625, "learning_rate": 9.439883233964621e-06, "loss": 0.79565501, "memory(GiB)": 135.77, "step": 15220, "train_speed(iter/s)": 0.202225 }, { "acc": 0.77310419, "epoch": 0.3553274325959842, "grad_norm": 4.875, "learning_rate": 9.439014136284934e-06, "loss": 0.83620567, "memory(GiB)": 135.77, "step": 15230, "train_speed(iter/s)": 0.202294 }, { "acc": 0.78746805, "epoch": 0.3555607401682731, "grad_norm": 4.0625, "learning_rate": 9.438144404937266e-06, "loss": 0.75883884, "memory(GiB)": 135.77, "step": 15240, "train_speed(iter/s)": 0.202366 }, { "acc": 0.77289262, "epoch": 0.355794047740562, "grad_norm": 4.53125, "learning_rate": 9.437274040045775e-06, "loss": 0.85325108, "memory(GiB)": 135.77, "step": 15250, "train_speed(iter/s)": 0.202433 }, { "acc": 0.79338937, "epoch": 0.35602735531285085, "grad_norm": 5.53125, "learning_rate": 9.436403041734704e-06, "loss": 0.74687109, "memory(GiB)": 135.77, "step": 15260, "train_speed(iter/s)": 0.202505 }, { "acc": 0.77933197, "epoch": 0.35626066288513975, "grad_norm": 4.5, "learning_rate": 9.435531410128387e-06, "loss": 0.78443747, "memory(GiB)": 135.77, "step": 15270, "train_speed(iter/s)": 0.202572 }, { "acc": 0.76276693, "epoch": 0.35649397045742864, "grad_norm": 4.125, "learning_rate": 9.434659145351251e-06, "loss": 0.85257626, "memory(GiB)": 135.77, "step": 15280, "train_speed(iter/s)": 0.202643 }, { "acc": 0.77083325, "epoch": 0.35672727802971754, "grad_norm": 5.4375, "learning_rate": 9.433786247527809e-06, "loss": 0.81012554, "memory(GiB)": 135.77, "step": 15290, "train_speed(iter/s)": 0.202711 }, { "acc": 0.76903772, "epoch": 0.35696058560200644, "grad_norm": 4.625, "learning_rate": 9.432912716782667e-06, "loss": 0.82582378, "memory(GiB)": 135.77, "step": 15300, "train_speed(iter/s)": 0.202778 }, { "acc": 0.77473183, "epoch": 0.35719389317429534, "grad_norm": 5.28125, "learning_rate": 9.432038553240526e-06, "loss": 0.82623148, "memory(GiB)": 135.77, "step": 15310, "train_speed(iter/s)": 0.202843 }, { "acc": 0.77468882, "epoch": 0.35742720074658424, "grad_norm": 5.625, "learning_rate": 9.431163757026167e-06, "loss": 0.79663391, "memory(GiB)": 135.77, "step": 15320, "train_speed(iter/s)": 0.20291 }, { "acc": 0.7720438, "epoch": 0.35766050831887314, "grad_norm": 5.9375, "learning_rate": 9.430288328264467e-06, "loss": 0.8156147, "memory(GiB)": 135.77, "step": 15330, "train_speed(iter/s)": 0.202979 }, { "acc": 0.77816076, "epoch": 0.35789381589116204, "grad_norm": 5.59375, "learning_rate": 9.429412267080397e-06, "loss": 0.78154631, "memory(GiB)": 135.77, "step": 15340, "train_speed(iter/s)": 0.203047 }, { "acc": 0.76980267, "epoch": 0.35812712346345094, "grad_norm": 5.8125, "learning_rate": 9.428535573599013e-06, "loss": 0.83480797, "memory(GiB)": 135.77, "step": 15350, "train_speed(iter/s)": 0.203121 }, { "acc": 0.76492939, "epoch": 0.3583604310357398, "grad_norm": 4.65625, "learning_rate": 9.427658247945463e-06, "loss": 0.8619091, "memory(GiB)": 135.77, "step": 15360, "train_speed(iter/s)": 0.203181 }, { "acc": 0.75917945, "epoch": 0.3585937386080287, "grad_norm": 9.375, "learning_rate": 9.426780290244983e-06, "loss": 0.89382544, "memory(GiB)": 135.77, "step": 15370, "train_speed(iter/s)": 0.203252 }, { "acc": 0.76189117, "epoch": 0.3588270461803176, "grad_norm": 8.9375, "learning_rate": 9.425901700622904e-06, "loss": 0.84197168, "memory(GiB)": 135.77, "step": 15380, "train_speed(iter/s)": 0.20332 }, { "acc": 0.77091656, "epoch": 0.3590603537526065, "grad_norm": 7.6875, "learning_rate": 9.42502247920464e-06, "loss": 0.8417284, "memory(GiB)": 135.77, "step": 15390, "train_speed(iter/s)": 0.20339 }, { "acc": 0.77439508, "epoch": 0.3592936613248954, "grad_norm": 4.78125, "learning_rate": 9.424142626115706e-06, "loss": 0.81533623, "memory(GiB)": 135.77, "step": 15400, "train_speed(iter/s)": 0.203459 }, { "acc": 0.75681648, "epoch": 0.3595269688971843, "grad_norm": 6.8125, "learning_rate": 9.423262141481695e-06, "loss": 0.87958059, "memory(GiB)": 135.77, "step": 15410, "train_speed(iter/s)": 0.203521 }, { "acc": 0.77736855, "epoch": 0.3597602764694732, "grad_norm": 6.5, "learning_rate": 9.4223810254283e-06, "loss": 0.80077677, "memory(GiB)": 135.77, "step": 15420, "train_speed(iter/s)": 0.203593 }, { "acc": 0.7821209, "epoch": 0.35999358404176207, "grad_norm": 6.6875, "learning_rate": 9.421499278081296e-06, "loss": 0.80219059, "memory(GiB)": 135.77, "step": 15430, "train_speed(iter/s)": 0.20366 }, { "acc": 0.77446098, "epoch": 0.36022689161405097, "grad_norm": 6.5625, "learning_rate": 9.420616899566557e-06, "loss": 0.79374628, "memory(GiB)": 135.77, "step": 15440, "train_speed(iter/s)": 0.203725 }, { "acc": 0.76317291, "epoch": 0.36046019918633987, "grad_norm": 4.84375, "learning_rate": 9.41973389001004e-06, "loss": 0.87783279, "memory(GiB)": 135.77, "step": 15450, "train_speed(iter/s)": 0.203795 }, { "acc": 0.77032871, "epoch": 0.3606935067586287, "grad_norm": 5.0, "learning_rate": 9.418850249537792e-06, "loss": 0.83090649, "memory(GiB)": 135.77, "step": 15460, "train_speed(iter/s)": 0.203862 }, { "acc": 0.75985117, "epoch": 0.3609268143309176, "grad_norm": 7.0625, "learning_rate": 9.417965978275955e-06, "loss": 0.87291212, "memory(GiB)": 135.77, "step": 15470, "train_speed(iter/s)": 0.203928 }, { "acc": 0.78009644, "epoch": 0.3611601219032065, "grad_norm": 5.90625, "learning_rate": 9.417081076350758e-06, "loss": 0.79100814, "memory(GiB)": 135.77, "step": 15480, "train_speed(iter/s)": 0.203997 }, { "acc": 0.78492813, "epoch": 0.3613934294754954, "grad_norm": 5.75, "learning_rate": 9.416195543888522e-06, "loss": 0.78736544, "memory(GiB)": 135.77, "step": 15490, "train_speed(iter/s)": 0.204062 }, { "acc": 0.75766697, "epoch": 0.3616267370477843, "grad_norm": 6.53125, "learning_rate": 9.415309381015654e-06, "loss": 0.86035919, "memory(GiB)": 135.77, "step": 15500, "train_speed(iter/s)": 0.204131 }, { "epoch": 0.3616267370477843, "eval_acc": 0.7363535894968445, "eval_loss": 0.8363694548606873, "eval_runtime": 1270.7457, "eval_samples_per_second": 28.323, "eval_steps_per_second": 14.162, "step": 15500 }, { "acc": 0.76968145, "epoch": 0.3618600446200732, "grad_norm": 7.1875, "learning_rate": 9.414422587858654e-06, "loss": 0.82733736, "memory(GiB)": 135.77, "step": 15510, "train_speed(iter/s)": 0.200797 }, { "acc": 0.78013411, "epoch": 0.3620933521923621, "grad_norm": 4.21875, "learning_rate": 9.413535164544112e-06, "loss": 0.77737093, "memory(GiB)": 135.77, "step": 15520, "train_speed(iter/s)": 0.200865 }, { "acc": 0.76417913, "epoch": 0.362326659764651, "grad_norm": 5.8125, "learning_rate": 9.412647111198708e-06, "loss": 0.86939831, "memory(GiB)": 135.77, "step": 15530, "train_speed(iter/s)": 0.200933 }, { "acc": 0.76209974, "epoch": 0.3625599673369399, "grad_norm": 3.8125, "learning_rate": 9.411758427949211e-06, "loss": 0.85637188, "memory(GiB)": 135.77, "step": 15540, "train_speed(iter/s)": 0.201 }, { "acc": 0.79845304, "epoch": 0.36279327490922875, "grad_norm": 4.3125, "learning_rate": 9.410869114922478e-06, "loss": 0.72790012, "memory(GiB)": 135.77, "step": 15550, "train_speed(iter/s)": 0.201068 }, { "acc": 0.77766886, "epoch": 0.36302658248151765, "grad_norm": 6.5, "learning_rate": 9.409979172245463e-06, "loss": 0.79989691, "memory(GiB)": 135.77, "step": 15560, "train_speed(iter/s)": 0.201133 }, { "acc": 0.77453566, "epoch": 0.36325989005380654, "grad_norm": 5.625, "learning_rate": 9.409088600045202e-06, "loss": 0.82451859, "memory(GiB)": 135.77, "step": 15570, "train_speed(iter/s)": 0.201202 }, { "acc": 0.77265034, "epoch": 0.36349319762609544, "grad_norm": 5.78125, "learning_rate": 9.408197398448822e-06, "loss": 0.79861336, "memory(GiB)": 135.77, "step": 15580, "train_speed(iter/s)": 0.201269 }, { "acc": 0.77113476, "epoch": 0.36372650519838434, "grad_norm": 4.9375, "learning_rate": 9.407305567583547e-06, "loss": 0.8350358, "memory(GiB)": 135.77, "step": 15590, "train_speed(iter/s)": 0.201333 }, { "acc": 0.76244006, "epoch": 0.36395981277067324, "grad_norm": 5.1875, "learning_rate": 9.40641310757668e-06, "loss": 0.86877785, "memory(GiB)": 135.77, "step": 15600, "train_speed(iter/s)": 0.2014 }, { "acc": 0.7791647, "epoch": 0.36419312034296214, "grad_norm": 5.75, "learning_rate": 9.405520018555624e-06, "loss": 0.79430704, "memory(GiB)": 135.77, "step": 15610, "train_speed(iter/s)": 0.201469 }, { "acc": 0.77130833, "epoch": 0.36442642791525104, "grad_norm": 5.46875, "learning_rate": 9.404626300647864e-06, "loss": 0.8362133, "memory(GiB)": 135.77, "step": 15620, "train_speed(iter/s)": 0.201536 }, { "acc": 0.78146958, "epoch": 0.36465973548753994, "grad_norm": 5.53125, "learning_rate": 9.403731953980978e-06, "loss": 0.7925806, "memory(GiB)": 135.77, "step": 15630, "train_speed(iter/s)": 0.201599 }, { "acc": 0.77909827, "epoch": 0.36489304305982884, "grad_norm": 4.25, "learning_rate": 9.402836978682636e-06, "loss": 0.79750004, "memory(GiB)": 135.77, "step": 15640, "train_speed(iter/s)": 0.201662 }, { "acc": 0.77254696, "epoch": 0.3651263506321177, "grad_norm": 7.96875, "learning_rate": 9.401941374880595e-06, "loss": 0.80021648, "memory(GiB)": 135.77, "step": 15650, "train_speed(iter/s)": 0.201723 }, { "acc": 0.77365751, "epoch": 0.3653596582044066, "grad_norm": 5.09375, "learning_rate": 9.4010451427027e-06, "loss": 0.81189899, "memory(GiB)": 135.77, "step": 15660, "train_speed(iter/s)": 0.201787 }, { "acc": 0.77464514, "epoch": 0.3655929657766955, "grad_norm": 4.34375, "learning_rate": 9.40014828227689e-06, "loss": 0.81965914, "memory(GiB)": 135.77, "step": 15670, "train_speed(iter/s)": 0.201854 }, { "acc": 0.76693192, "epoch": 0.3658262733489844, "grad_norm": 12.625, "learning_rate": 9.399250793731192e-06, "loss": 0.85733891, "memory(GiB)": 135.77, "step": 15680, "train_speed(iter/s)": 0.20192 }, { "acc": 0.76616225, "epoch": 0.3660595809212733, "grad_norm": 4.5, "learning_rate": 9.398352677193719e-06, "loss": 0.85392532, "memory(GiB)": 135.77, "step": 15690, "train_speed(iter/s)": 0.201988 }, { "acc": 0.77419605, "epoch": 0.3662928884935622, "grad_norm": 5.375, "learning_rate": 9.397453932792681e-06, "loss": 0.80535965, "memory(GiB)": 135.77, "step": 15700, "train_speed(iter/s)": 0.202053 }, { "acc": 0.76591721, "epoch": 0.3665261960658511, "grad_norm": 32.5, "learning_rate": 9.396554560656371e-06, "loss": 0.90535355, "memory(GiB)": 135.77, "step": 15710, "train_speed(iter/s)": 0.202119 }, { "acc": 0.75157509, "epoch": 0.36675950363813997, "grad_norm": 5.125, "learning_rate": 9.395654560913174e-06, "loss": 0.90469847, "memory(GiB)": 135.77, "step": 15720, "train_speed(iter/s)": 0.202184 }, { "acc": 0.7669796, "epoch": 0.36699281121042887, "grad_norm": 4.5625, "learning_rate": 9.394753933691567e-06, "loss": 0.86323175, "memory(GiB)": 135.77, "step": 15730, "train_speed(iter/s)": 0.202243 }, { "acc": 0.76320677, "epoch": 0.3672261187827177, "grad_norm": 4.65625, "learning_rate": 9.393852679120113e-06, "loss": 0.85583038, "memory(GiB)": 135.77, "step": 15740, "train_speed(iter/s)": 0.202312 }, { "acc": 0.78019586, "epoch": 0.3674594263550066, "grad_norm": 4.875, "learning_rate": 9.392950797327463e-06, "loss": 0.77571545, "memory(GiB)": 135.77, "step": 15750, "train_speed(iter/s)": 0.202374 }, { "acc": 0.76291428, "epoch": 0.3676927339272955, "grad_norm": 7.25, "learning_rate": 9.392048288442363e-06, "loss": 0.8645957, "memory(GiB)": 135.77, "step": 15760, "train_speed(iter/s)": 0.202441 }, { "acc": 0.75715952, "epoch": 0.3679260414995844, "grad_norm": 5.65625, "learning_rate": 9.391145152593646e-06, "loss": 0.89210663, "memory(GiB)": 135.77, "step": 15770, "train_speed(iter/s)": 0.202507 }, { "acc": 0.77362776, "epoch": 0.3681593490718733, "grad_norm": 8.9375, "learning_rate": 9.390241389910236e-06, "loss": 0.84507351, "memory(GiB)": 135.77, "step": 15780, "train_speed(iter/s)": 0.202567 }, { "acc": 0.77469263, "epoch": 0.3683926566441622, "grad_norm": 5.65625, "learning_rate": 9.389337000521142e-06, "loss": 0.82550621, "memory(GiB)": 135.77, "step": 15790, "train_speed(iter/s)": 0.202635 }, { "acc": 0.76861792, "epoch": 0.3686259642164511, "grad_norm": 5.125, "learning_rate": 9.388431984555466e-06, "loss": 0.84576836, "memory(GiB)": 135.77, "step": 15800, "train_speed(iter/s)": 0.202702 }, { "acc": 0.79161878, "epoch": 0.36885927178874, "grad_norm": 4.8125, "learning_rate": 9.387526342142398e-06, "loss": 0.75349326, "memory(GiB)": 135.77, "step": 15810, "train_speed(iter/s)": 0.202771 }, { "acc": 0.7592205, "epoch": 0.3690925793610289, "grad_norm": 6.21875, "learning_rate": 9.386620073411221e-06, "loss": 0.8692564, "memory(GiB)": 135.77, "step": 15820, "train_speed(iter/s)": 0.202835 }, { "acc": 0.77718821, "epoch": 0.3693258869333178, "grad_norm": 7.3125, "learning_rate": 9.385713178491302e-06, "loss": 0.81367092, "memory(GiB)": 135.77, "step": 15830, "train_speed(iter/s)": 0.202905 }, { "acc": 0.78268156, "epoch": 0.36955919450560665, "grad_norm": 7.625, "learning_rate": 9.384805657512101e-06, "loss": 0.79621563, "memory(GiB)": 135.77, "step": 15840, "train_speed(iter/s)": 0.202971 }, { "acc": 0.76887393, "epoch": 0.36979250207789555, "grad_norm": 6.75, "learning_rate": 9.383897510603167e-06, "loss": 0.84250908, "memory(GiB)": 135.77, "step": 15850, "train_speed(iter/s)": 0.203038 }, { "acc": 0.78669109, "epoch": 0.37002580965018445, "grad_norm": 5.90625, "learning_rate": 9.382988737894136e-06, "loss": 0.7452014, "memory(GiB)": 135.77, "step": 15860, "train_speed(iter/s)": 0.203102 }, { "acc": 0.76802521, "epoch": 0.37025911722247334, "grad_norm": 7.375, "learning_rate": 9.382079339514736e-06, "loss": 0.85868645, "memory(GiB)": 135.77, "step": 15870, "train_speed(iter/s)": 0.203168 }, { "acc": 0.76643047, "epoch": 0.37049242479476224, "grad_norm": 5.0, "learning_rate": 9.381169315594782e-06, "loss": 0.83867474, "memory(GiB)": 135.77, "step": 15880, "train_speed(iter/s)": 0.20323 }, { "acc": 0.77568998, "epoch": 0.37072573236705114, "grad_norm": 4.90625, "learning_rate": 9.380258666264184e-06, "loss": 0.83556032, "memory(GiB)": 135.77, "step": 15890, "train_speed(iter/s)": 0.203294 }, { "acc": 0.77228613, "epoch": 0.37095903993934004, "grad_norm": 9.0625, "learning_rate": 9.379347391652931e-06, "loss": 0.81034203, "memory(GiB)": 135.77, "step": 15900, "train_speed(iter/s)": 0.20336 }, { "acc": 0.76030293, "epoch": 0.37119234751162894, "grad_norm": 7.4375, "learning_rate": 9.378435491891112e-06, "loss": 0.88398857, "memory(GiB)": 135.77, "step": 15910, "train_speed(iter/s)": 0.203423 }, { "acc": 0.7455883, "epoch": 0.37142565508391784, "grad_norm": 6.21875, "learning_rate": 9.377522967108897e-06, "loss": 0.94365292, "memory(GiB)": 135.77, "step": 15920, "train_speed(iter/s)": 0.203489 }, { "acc": 0.76555414, "epoch": 0.37165896265620674, "grad_norm": 6.5625, "learning_rate": 9.376609817436551e-06, "loss": 0.86510744, "memory(GiB)": 135.77, "step": 15930, "train_speed(iter/s)": 0.203548 }, { "acc": 0.75843372, "epoch": 0.3718922702284956, "grad_norm": 9.6875, "learning_rate": 9.375696043004425e-06, "loss": 0.90398607, "memory(GiB)": 135.77, "step": 15940, "train_speed(iter/s)": 0.203613 }, { "acc": 0.75562124, "epoch": 0.3721255778007845, "grad_norm": 4.71875, "learning_rate": 9.374781643942961e-06, "loss": 0.88584518, "memory(GiB)": 135.77, "step": 15950, "train_speed(iter/s)": 0.20368 }, { "acc": 0.76519918, "epoch": 0.3723588853730734, "grad_norm": 7.25, "learning_rate": 9.373866620382686e-06, "loss": 0.8289814, "memory(GiB)": 135.77, "step": 15960, "train_speed(iter/s)": 0.203744 }, { "acc": 0.78620424, "epoch": 0.3725921929453623, "grad_norm": 13.9375, "learning_rate": 9.372950972454222e-06, "loss": 0.74287148, "memory(GiB)": 135.77, "step": 15970, "train_speed(iter/s)": 0.203812 }, { "acc": 0.77351294, "epoch": 0.3728255005176512, "grad_norm": 4.84375, "learning_rate": 9.372034700288278e-06, "loss": 0.79458714, "memory(GiB)": 135.77, "step": 15980, "train_speed(iter/s)": 0.20388 }, { "acc": 0.74867344, "epoch": 0.3730588080899401, "grad_norm": 6.75, "learning_rate": 9.37111780401565e-06, "loss": 0.92256651, "memory(GiB)": 135.77, "step": 15990, "train_speed(iter/s)": 0.203948 }, { "acc": 0.74160004, "epoch": 0.373292115662229, "grad_norm": 5.8125, "learning_rate": 9.370200283767225e-06, "loss": 0.94474297, "memory(GiB)": 135.77, "step": 16000, "train_speed(iter/s)": 0.204015 }, { "epoch": 0.373292115662229, "eval_acc": 0.7363508670448953, "eval_loss": 0.8348709940910339, "eval_runtime": 1270.1462, "eval_samples_per_second": 28.336, "eval_steps_per_second": 14.168, "step": 16000 }, { "acc": 0.78337979, "epoch": 0.3735254232345179, "grad_norm": 5.5, "learning_rate": 9.369282139673979e-06, "loss": 0.79470425, "memory(GiB)": 135.77, "step": 16010, "train_speed(iter/s)": 0.200783 }, { "acc": 0.76890593, "epoch": 0.37375873080680677, "grad_norm": 4.6875, "learning_rate": 9.368363371866978e-06, "loss": 0.82831898, "memory(GiB)": 135.77, "step": 16020, "train_speed(iter/s)": 0.200845 }, { "acc": 0.75959797, "epoch": 0.3739920383790956, "grad_norm": 3.90625, "learning_rate": 9.367443980477374e-06, "loss": 0.86754074, "memory(GiB)": 135.77, "step": 16030, "train_speed(iter/s)": 0.200911 }, { "acc": 0.77851305, "epoch": 0.3742253459513845, "grad_norm": 5.6875, "learning_rate": 9.366523965636412e-06, "loss": 0.77482352, "memory(GiB)": 135.77, "step": 16040, "train_speed(iter/s)": 0.200977 }, { "acc": 0.75851483, "epoch": 0.3744586535236734, "grad_norm": 6.1875, "learning_rate": 9.36560332747542e-06, "loss": 0.86383018, "memory(GiB)": 135.77, "step": 16050, "train_speed(iter/s)": 0.201043 }, { "acc": 0.77050686, "epoch": 0.3746919610959623, "grad_norm": 7.25, "learning_rate": 9.364682066125822e-06, "loss": 0.84514685, "memory(GiB)": 135.77, "step": 16060, "train_speed(iter/s)": 0.201107 }, { "acc": 0.7671258, "epoch": 0.3749252686682512, "grad_norm": 7.90625, "learning_rate": 9.363760181719127e-06, "loss": 0.86259499, "memory(GiB)": 135.77, "step": 16070, "train_speed(iter/s)": 0.201169 }, { "acc": 0.76694169, "epoch": 0.3751585762405401, "grad_norm": 10.0, "learning_rate": 9.362837674386934e-06, "loss": 0.85537701, "memory(GiB)": 135.77, "step": 16080, "train_speed(iter/s)": 0.201237 }, { "acc": 0.79346738, "epoch": 0.375391883812829, "grad_norm": 6.4375, "learning_rate": 9.36191454426093e-06, "loss": 0.74549022, "memory(GiB)": 135.77, "step": 16090, "train_speed(iter/s)": 0.2013 }, { "acc": 0.76404152, "epoch": 0.3756251913851179, "grad_norm": 5.21875, "learning_rate": 9.360990791472893e-06, "loss": 0.85988121, "memory(GiB)": 135.77, "step": 16100, "train_speed(iter/s)": 0.20136 }, { "acc": 0.77790794, "epoch": 0.3758584989574068, "grad_norm": 7.1875, "learning_rate": 9.360066416154687e-06, "loss": 0.82874393, "memory(GiB)": 135.77, "step": 16110, "train_speed(iter/s)": 0.201424 }, { "acc": 0.76713996, "epoch": 0.3760918065296957, "grad_norm": 6.34375, "learning_rate": 9.359141418438266e-06, "loss": 0.82495213, "memory(GiB)": 135.77, "step": 16120, "train_speed(iter/s)": 0.201487 }, { "acc": 0.76586213, "epoch": 0.37632511410198455, "grad_norm": 7.125, "learning_rate": 9.358215798455674e-06, "loss": 0.83144331, "memory(GiB)": 135.77, "step": 16130, "train_speed(iter/s)": 0.201555 }, { "acc": 0.7565609, "epoch": 0.37655842167427345, "grad_norm": 6.0, "learning_rate": 9.357289556339044e-06, "loss": 0.8857419, "memory(GiB)": 135.77, "step": 16140, "train_speed(iter/s)": 0.201622 }, { "acc": 0.76512766, "epoch": 0.37679172924656235, "grad_norm": 10.5, "learning_rate": 9.356362692220593e-06, "loss": 0.88596659, "memory(GiB)": 135.77, "step": 16150, "train_speed(iter/s)": 0.201686 }, { "acc": 0.77160578, "epoch": 0.37702503681885124, "grad_norm": 7.6875, "learning_rate": 9.355435206232635e-06, "loss": 0.81468468, "memory(GiB)": 135.77, "step": 16160, "train_speed(iter/s)": 0.201747 }, { "acc": 0.76007071, "epoch": 0.37725834439114014, "grad_norm": 5.1875, "learning_rate": 9.354507098507568e-06, "loss": 0.86362362, "memory(GiB)": 135.77, "step": 16170, "train_speed(iter/s)": 0.201815 }, { "acc": 0.75717325, "epoch": 0.37749165196342904, "grad_norm": 5.78125, "learning_rate": 9.353578369177876e-06, "loss": 0.88568306, "memory(GiB)": 135.77, "step": 16180, "train_speed(iter/s)": 0.201881 }, { "acc": 0.77196169, "epoch": 0.37772495953571794, "grad_norm": 5.78125, "learning_rate": 9.352649018376136e-06, "loss": 0.82890453, "memory(GiB)": 135.77, "step": 16190, "train_speed(iter/s)": 0.201948 }, { "acc": 0.75310216, "epoch": 0.37795826710800684, "grad_norm": 5.8125, "learning_rate": 9.351719046235013e-06, "loss": 0.88745995, "memory(GiB)": 135.77, "step": 16200, "train_speed(iter/s)": 0.202008 }, { "acc": 0.78186665, "epoch": 0.37819157468029574, "grad_norm": 4.03125, "learning_rate": 9.350788452887262e-06, "loss": 0.75842514, "memory(GiB)": 135.77, "step": 16210, "train_speed(iter/s)": 0.202072 }, { "acc": 0.7654644, "epoch": 0.37842488225258464, "grad_norm": 7.9375, "learning_rate": 9.349857238465723e-06, "loss": 0.8322834, "memory(GiB)": 135.77, "step": 16220, "train_speed(iter/s)": 0.202134 }, { "acc": 0.77073183, "epoch": 0.3786581898248735, "grad_norm": 5.125, "learning_rate": 9.348925403103326e-06, "loss": 0.84538736, "memory(GiB)": 135.77, "step": 16230, "train_speed(iter/s)": 0.202201 }, { "acc": 0.76439257, "epoch": 0.3788914973971624, "grad_norm": 6.25, "learning_rate": 9.347992946933091e-06, "loss": 0.86807938, "memory(GiB)": 135.77, "step": 16240, "train_speed(iter/s)": 0.202266 }, { "acc": 0.77966743, "epoch": 0.3791248049694513, "grad_norm": 5.0625, "learning_rate": 9.347059870088127e-06, "loss": 0.75784845, "memory(GiB)": 135.77, "step": 16250, "train_speed(iter/s)": 0.202327 }, { "acc": 0.76927428, "epoch": 0.3793581125417402, "grad_norm": 5.96875, "learning_rate": 9.346126172701629e-06, "loss": 0.83154087, "memory(GiB)": 135.77, "step": 16260, "train_speed(iter/s)": 0.202393 }, { "acc": 0.77350688, "epoch": 0.3795914201140291, "grad_norm": 3.765625, "learning_rate": 9.345191854906881e-06, "loss": 0.80910931, "memory(GiB)": 135.77, "step": 16270, "train_speed(iter/s)": 0.202457 }, { "acc": 0.7722403, "epoch": 0.379824727686318, "grad_norm": 4.125, "learning_rate": 9.344256916837259e-06, "loss": 0.83394699, "memory(GiB)": 135.77, "step": 16280, "train_speed(iter/s)": 0.202518 }, { "acc": 0.77347498, "epoch": 0.3800580352586069, "grad_norm": 8.625, "learning_rate": 9.343321358626225e-06, "loss": 0.78846674, "memory(GiB)": 135.77, "step": 16290, "train_speed(iter/s)": 0.202587 }, { "acc": 0.76977482, "epoch": 0.3802913428308958, "grad_norm": 6.1875, "learning_rate": 9.342385180407328e-06, "loss": 0.83136139, "memory(GiB)": 135.77, "step": 16300, "train_speed(iter/s)": 0.202652 }, { "acc": 0.77511702, "epoch": 0.38052465040318467, "grad_norm": 4.34375, "learning_rate": 9.341448382314207e-06, "loss": 0.80090103, "memory(GiB)": 135.77, "step": 16310, "train_speed(iter/s)": 0.202714 }, { "acc": 0.7784832, "epoch": 0.3807579579754735, "grad_norm": 4.28125, "learning_rate": 9.340510964480591e-06, "loss": 0.77812891, "memory(GiB)": 135.77, "step": 16320, "train_speed(iter/s)": 0.202777 }, { "acc": 0.78746824, "epoch": 0.3809912655477624, "grad_norm": 5.40625, "learning_rate": 9.339572927040298e-06, "loss": 0.76021338, "memory(GiB)": 135.77, "step": 16330, "train_speed(iter/s)": 0.202838 }, { "acc": 0.77678819, "epoch": 0.3812245731200513, "grad_norm": 5.46875, "learning_rate": 9.338634270127227e-06, "loss": 0.81745186, "memory(GiB)": 135.77, "step": 16340, "train_speed(iter/s)": 0.202905 }, { "acc": 0.75448432, "epoch": 0.3814578806923402, "grad_norm": 4.875, "learning_rate": 9.337694993875376e-06, "loss": 0.89182949, "memory(GiB)": 135.77, "step": 16350, "train_speed(iter/s)": 0.202971 }, { "acc": 0.7632596, "epoch": 0.3816911882646291, "grad_norm": 7.53125, "learning_rate": 9.336755098418824e-06, "loss": 0.86298809, "memory(GiB)": 135.77, "step": 16360, "train_speed(iter/s)": 0.203036 }, { "acc": 0.756043, "epoch": 0.381924495836918, "grad_norm": 5.5, "learning_rate": 9.335814583891743e-06, "loss": 0.89383011, "memory(GiB)": 135.77, "step": 16370, "train_speed(iter/s)": 0.203095 }, { "acc": 0.76111898, "epoch": 0.3821578034092069, "grad_norm": 4.65625, "learning_rate": 9.33487345042839e-06, "loss": 0.89134369, "memory(GiB)": 135.77, "step": 16380, "train_speed(iter/s)": 0.203158 }, { "acc": 0.75423412, "epoch": 0.3823911109814958, "grad_norm": 7.34375, "learning_rate": 9.333931698163107e-06, "loss": 0.89891577, "memory(GiB)": 135.77, "step": 16390, "train_speed(iter/s)": 0.203226 }, { "acc": 0.77302418, "epoch": 0.3826244185537847, "grad_norm": 5.6875, "learning_rate": 9.332989327230337e-06, "loss": 0.80847969, "memory(GiB)": 135.77, "step": 16400, "train_speed(iter/s)": 0.203289 }, { "acc": 0.75313158, "epoch": 0.3828577261260736, "grad_norm": 6.71875, "learning_rate": 9.3320463377646e-06, "loss": 0.87891369, "memory(GiB)": 135.77, "step": 16410, "train_speed(iter/s)": 0.20336 }, { "acc": 0.76052837, "epoch": 0.38309103369836245, "grad_norm": 5.0625, "learning_rate": 9.331102729900505e-06, "loss": 0.87168789, "memory(GiB)": 135.77, "step": 16420, "train_speed(iter/s)": 0.203426 }, { "acc": 0.75323372, "epoch": 0.38332434127065135, "grad_norm": 4.8125, "learning_rate": 9.330158503772753e-06, "loss": 0.9150569, "memory(GiB)": 135.77, "step": 16430, "train_speed(iter/s)": 0.203491 }, { "acc": 0.7864645, "epoch": 0.38355764884294025, "grad_norm": 5.90625, "learning_rate": 9.329213659516134e-06, "loss": 0.79203343, "memory(GiB)": 135.77, "step": 16440, "train_speed(iter/s)": 0.203553 }, { "acc": 0.77691231, "epoch": 0.38379095641522915, "grad_norm": 4.40625, "learning_rate": 9.328268197265523e-06, "loss": 0.79185615, "memory(GiB)": 135.77, "step": 16450, "train_speed(iter/s)": 0.203619 }, { "acc": 0.77853565, "epoch": 0.38402426398751804, "grad_norm": 6.59375, "learning_rate": 9.327322117155881e-06, "loss": 0.78379364, "memory(GiB)": 135.77, "step": 16460, "train_speed(iter/s)": 0.20368 }, { "acc": 0.78826647, "epoch": 0.38425757155980694, "grad_norm": 4.21875, "learning_rate": 9.326375419322267e-06, "loss": 0.81094532, "memory(GiB)": 135.77, "step": 16470, "train_speed(iter/s)": 0.203742 }, { "acc": 0.76101255, "epoch": 0.38449087913209584, "grad_norm": 4.59375, "learning_rate": 9.325428103899818e-06, "loss": 0.88430595, "memory(GiB)": 135.77, "step": 16480, "train_speed(iter/s)": 0.203798 }, { "acc": 0.774227, "epoch": 0.38472418670438474, "grad_norm": 5.75, "learning_rate": 9.324480171023764e-06, "loss": 0.81697226, "memory(GiB)": 135.77, "step": 16490, "train_speed(iter/s)": 0.203856 }, { "acc": 0.76122689, "epoch": 0.38495749427667364, "grad_norm": 4.5, "learning_rate": 9.32353162082942e-06, "loss": 0.85380974, "memory(GiB)": 135.77, "step": 16500, "train_speed(iter/s)": 0.203918 }, { "epoch": 0.38495749427667364, "eval_acc": 0.736815605607034, "eval_loss": 0.8338123559951782, "eval_runtime": 1271.5777, "eval_samples_per_second": 28.304, "eval_steps_per_second": 14.152, "step": 16500 }, { "acc": 0.76048388, "epoch": 0.3851908018489625, "grad_norm": 7.8125, "learning_rate": 9.322582453452195e-06, "loss": 0.85223875, "memory(GiB)": 135.77, "step": 16510, "train_speed(iter/s)": 0.200786 }, { "acc": 0.78726511, "epoch": 0.3854241094212514, "grad_norm": 5.5625, "learning_rate": 9.32163266902758e-06, "loss": 0.76919699, "memory(GiB)": 135.77, "step": 16520, "train_speed(iter/s)": 0.200847 }, { "acc": 0.76031055, "epoch": 0.3856574169935403, "grad_norm": 6.0625, "learning_rate": 9.320682267691157e-06, "loss": 0.87715168, "memory(GiB)": 135.77, "step": 16530, "train_speed(iter/s)": 0.200912 }, { "acc": 0.76955204, "epoch": 0.3858907245658292, "grad_norm": 8.6875, "learning_rate": 9.319731249578595e-06, "loss": 0.84494944, "memory(GiB)": 135.77, "step": 16540, "train_speed(iter/s)": 0.200977 }, { "acc": 0.77983017, "epoch": 0.3861240321381181, "grad_norm": 5.4375, "learning_rate": 9.318779614825653e-06, "loss": 0.78632112, "memory(GiB)": 135.77, "step": 16550, "train_speed(iter/s)": 0.201038 }, { "acc": 0.77373309, "epoch": 0.386357339710407, "grad_norm": 5.15625, "learning_rate": 9.317827363568176e-06, "loss": 0.8040472, "memory(GiB)": 135.77, "step": 16560, "train_speed(iter/s)": 0.2011 }, { "acc": 0.77826786, "epoch": 0.3865906472826959, "grad_norm": 6.34375, "learning_rate": 9.316874495942095e-06, "loss": 0.7839695, "memory(GiB)": 135.77, "step": 16570, "train_speed(iter/s)": 0.20116 }, { "acc": 0.77872405, "epoch": 0.3868239548549848, "grad_norm": 6.0, "learning_rate": 9.315921012083436e-06, "loss": 0.77586727, "memory(GiB)": 135.77, "step": 16580, "train_speed(iter/s)": 0.201222 }, { "acc": 0.7748662, "epoch": 0.3870572624272737, "grad_norm": 11.875, "learning_rate": 9.314966912128305e-06, "loss": 0.81466694, "memory(GiB)": 135.77, "step": 16590, "train_speed(iter/s)": 0.201285 }, { "acc": 0.77469873, "epoch": 0.3872905699995626, "grad_norm": 6.21875, "learning_rate": 9.3140121962129e-06, "loss": 0.81014376, "memory(GiB)": 135.77, "step": 16600, "train_speed(iter/s)": 0.201349 }, { "acc": 0.78045406, "epoch": 0.3875238775718514, "grad_norm": 6.5625, "learning_rate": 9.313056864473508e-06, "loss": 0.77907581, "memory(GiB)": 135.77, "step": 16610, "train_speed(iter/s)": 0.20141 }, { "acc": 0.76823573, "epoch": 0.3877571851441403, "grad_norm": 6.46875, "learning_rate": 9.312100917046502e-06, "loss": 0.8444459, "memory(GiB)": 135.77, "step": 16620, "train_speed(iter/s)": 0.201474 }, { "acc": 0.76610289, "epoch": 0.3879904927164292, "grad_norm": 6.03125, "learning_rate": 9.311144354068342e-06, "loss": 0.84226179, "memory(GiB)": 135.77, "step": 16630, "train_speed(iter/s)": 0.20154 }, { "acc": 0.78322606, "epoch": 0.3882238002887181, "grad_norm": 5.84375, "learning_rate": 9.310187175675579e-06, "loss": 0.78812022, "memory(GiB)": 135.77, "step": 16640, "train_speed(iter/s)": 0.201607 }, { "acc": 0.75634503, "epoch": 0.388457107861007, "grad_norm": 6.78125, "learning_rate": 9.309229382004847e-06, "loss": 0.90151043, "memory(GiB)": 135.77, "step": 16650, "train_speed(iter/s)": 0.20167 }, { "acc": 0.75544233, "epoch": 0.3886904154332959, "grad_norm": 5.0625, "learning_rate": 9.308270973192875e-06, "loss": 0.89699974, "memory(GiB)": 135.77, "step": 16660, "train_speed(iter/s)": 0.201732 }, { "acc": 0.74633512, "epoch": 0.3889237230055848, "grad_norm": 5.96875, "learning_rate": 9.307311949376472e-06, "loss": 0.92126493, "memory(GiB)": 135.77, "step": 16670, "train_speed(iter/s)": 0.201796 }, { "acc": 0.77705793, "epoch": 0.3891570305778737, "grad_norm": 6.125, "learning_rate": 9.306352310692539e-06, "loss": 0.80012226, "memory(GiB)": 135.77, "step": 16680, "train_speed(iter/s)": 0.201858 }, { "acc": 0.77201366, "epoch": 0.3893903381501626, "grad_norm": 4.84375, "learning_rate": 9.305392057278066e-06, "loss": 0.8111311, "memory(GiB)": 135.77, "step": 16690, "train_speed(iter/s)": 0.201919 }, { "acc": 0.76858664, "epoch": 0.3896236457224515, "grad_norm": 5.28125, "learning_rate": 9.304431189270127e-06, "loss": 0.8267766, "memory(GiB)": 135.77, "step": 16700, "train_speed(iter/s)": 0.201983 }, { "acc": 0.75889578, "epoch": 0.38985695329474035, "grad_norm": 4.59375, "learning_rate": 9.303469706805886e-06, "loss": 0.84821739, "memory(GiB)": 135.77, "step": 16710, "train_speed(iter/s)": 0.202043 }, { "acc": 0.76242075, "epoch": 0.39009026086702925, "grad_norm": 6.125, "learning_rate": 9.302507610022593e-06, "loss": 0.87555027, "memory(GiB)": 135.77, "step": 16720, "train_speed(iter/s)": 0.202104 }, { "acc": 0.79397368, "epoch": 0.39032356843931815, "grad_norm": 8.0625, "learning_rate": 9.30154489905759e-06, "loss": 0.72974205, "memory(GiB)": 135.77, "step": 16730, "train_speed(iter/s)": 0.202163 }, { "acc": 0.74320917, "epoch": 0.39055687601160705, "grad_norm": 5.5, "learning_rate": 9.300581574048303e-06, "loss": 0.93350315, "memory(GiB)": 135.77, "step": 16740, "train_speed(iter/s)": 0.202225 }, { "acc": 0.77537889, "epoch": 0.39079018358389594, "grad_norm": 5.59375, "learning_rate": 9.299617635132243e-06, "loss": 0.78615303, "memory(GiB)": 135.77, "step": 16750, "train_speed(iter/s)": 0.202286 }, { "acc": 0.77060065, "epoch": 0.39102349115618484, "grad_norm": 5.6875, "learning_rate": 9.298653082447019e-06, "loss": 0.84748516, "memory(GiB)": 135.77, "step": 16760, "train_speed(iter/s)": 0.202351 }, { "acc": 0.78617654, "epoch": 0.39125679872847374, "grad_norm": 5.40625, "learning_rate": 9.29768791613031e-06, "loss": 0.76330194, "memory(GiB)": 135.77, "step": 16770, "train_speed(iter/s)": 0.202413 }, { "acc": 0.7793808, "epoch": 0.39149010630076264, "grad_norm": 4.96875, "learning_rate": 9.296722136319904e-06, "loss": 0.79507647, "memory(GiB)": 135.77, "step": 16780, "train_speed(iter/s)": 0.202476 }, { "acc": 0.7710515, "epoch": 0.39172341387305154, "grad_norm": 6.25, "learning_rate": 9.29575574315366e-06, "loss": 0.8227356, "memory(GiB)": 135.77, "step": 16790, "train_speed(iter/s)": 0.202539 }, { "acc": 0.77599983, "epoch": 0.3919567214453404, "grad_norm": 5.53125, "learning_rate": 9.294788736769534e-06, "loss": 0.80106039, "memory(GiB)": 135.77, "step": 16800, "train_speed(iter/s)": 0.202601 }, { "acc": 0.77693524, "epoch": 0.3921900290176293, "grad_norm": 7.5625, "learning_rate": 9.293821117305562e-06, "loss": 0.79049306, "memory(GiB)": 135.77, "step": 16810, "train_speed(iter/s)": 0.202663 }, { "acc": 0.7763052, "epoch": 0.3924233365899182, "grad_norm": 6.28125, "learning_rate": 9.29285288489987e-06, "loss": 0.81076241, "memory(GiB)": 135.77, "step": 16820, "train_speed(iter/s)": 0.202723 }, { "acc": 0.77143803, "epoch": 0.3926566441622071, "grad_norm": 9.8125, "learning_rate": 9.29188403969068e-06, "loss": 0.81417742, "memory(GiB)": 135.77, "step": 16830, "train_speed(iter/s)": 0.202787 }, { "acc": 0.79286661, "epoch": 0.392889951734496, "grad_norm": 8.375, "learning_rate": 9.290914581816287e-06, "loss": 0.72028284, "memory(GiB)": 135.77, "step": 16840, "train_speed(iter/s)": 0.20285 }, { "acc": 0.77796297, "epoch": 0.3931232593067849, "grad_norm": 6.1875, "learning_rate": 9.289944511415086e-06, "loss": 0.80314026, "memory(GiB)": 135.77, "step": 16850, "train_speed(iter/s)": 0.202906 }, { "acc": 0.75296588, "epoch": 0.3933565668790738, "grad_norm": 4.28125, "learning_rate": 9.28897382862555e-06, "loss": 0.88018427, "memory(GiB)": 135.77, "step": 16860, "train_speed(iter/s)": 0.202965 }, { "acc": 0.74555116, "epoch": 0.3935898744513627, "grad_norm": 5.53125, "learning_rate": 9.288002533586247e-06, "loss": 0.92258759, "memory(GiB)": 135.77, "step": 16870, "train_speed(iter/s)": 0.203027 }, { "acc": 0.77875471, "epoch": 0.3938231820236516, "grad_norm": 5.125, "learning_rate": 9.287030626435828e-06, "loss": 0.81162071, "memory(GiB)": 135.77, "step": 16880, "train_speed(iter/s)": 0.20309 }, { "acc": 0.77456217, "epoch": 0.3940564895959405, "grad_norm": 5.1875, "learning_rate": 9.286058107313034e-06, "loss": 0.79409857, "memory(GiB)": 135.77, "step": 16890, "train_speed(iter/s)": 0.203154 }, { "acc": 0.76377172, "epoch": 0.3942897971682293, "grad_norm": 5.8125, "learning_rate": 9.285084976356689e-06, "loss": 0.84241772, "memory(GiB)": 135.77, "step": 16900, "train_speed(iter/s)": 0.203216 }, { "acc": 0.76552515, "epoch": 0.3945231047405182, "grad_norm": 4.9375, "learning_rate": 9.284111233705709e-06, "loss": 0.86844196, "memory(GiB)": 135.77, "step": 16910, "train_speed(iter/s)": 0.20328 }, { "acc": 0.7652967, "epoch": 0.3947564123128071, "grad_norm": 4.28125, "learning_rate": 9.283136879499094e-06, "loss": 0.85671463, "memory(GiB)": 135.77, "step": 16920, "train_speed(iter/s)": 0.203337 }, { "acc": 0.77319775, "epoch": 0.394989719885096, "grad_norm": 4.84375, "learning_rate": 9.282161913875933e-06, "loss": 0.83186893, "memory(GiB)": 135.77, "step": 16930, "train_speed(iter/s)": 0.203398 }, { "acc": 0.76866665, "epoch": 0.3952230274573849, "grad_norm": 4.71875, "learning_rate": 9.281186336975406e-06, "loss": 0.83866825, "memory(GiB)": 135.77, "step": 16940, "train_speed(iter/s)": 0.203462 }, { "acc": 0.76994901, "epoch": 0.3954563350296738, "grad_norm": 5.78125, "learning_rate": 9.28021014893677e-06, "loss": 0.83855286, "memory(GiB)": 135.77, "step": 16950, "train_speed(iter/s)": 0.203521 }, { "acc": 0.77082605, "epoch": 0.3956896426019627, "grad_norm": 6.09375, "learning_rate": 9.27923334989938e-06, "loss": 0.83198881, "memory(GiB)": 135.77, "step": 16960, "train_speed(iter/s)": 0.203576 }, { "acc": 0.75358386, "epoch": 0.3959229501742516, "grad_norm": 5.28125, "learning_rate": 9.278255940002671e-06, "loss": 0.89232082, "memory(GiB)": 135.77, "step": 16970, "train_speed(iter/s)": 0.20364 }, { "acc": 0.76478891, "epoch": 0.3961562577465405, "grad_norm": 9.0625, "learning_rate": 9.27727791938617e-06, "loss": 0.84635849, "memory(GiB)": 135.77, "step": 16980, "train_speed(iter/s)": 0.203705 }, { "acc": 0.7693079, "epoch": 0.3963895653188294, "grad_norm": 7.15625, "learning_rate": 9.27629928818949e-06, "loss": 0.83523731, "memory(GiB)": 135.77, "step": 16990, "train_speed(iter/s)": 0.203767 }, { "acc": 0.76514502, "epoch": 0.39662287289111825, "grad_norm": 6.40625, "learning_rate": 9.275320046552328e-06, "loss": 0.85309467, "memory(GiB)": 135.77, "step": 17000, "train_speed(iter/s)": 0.203827 }, { "epoch": 0.39662287289111825, "eval_acc": 0.7371956278702851, "eval_loss": 0.8326179385185242, "eval_runtime": 1270.543, "eval_samples_per_second": 28.327, "eval_steps_per_second": 14.164, "step": 17000 }, { "acc": 0.76229954, "epoch": 0.39685618046340715, "grad_norm": 6.625, "learning_rate": 9.274340194614471e-06, "loss": 0.83733425, "memory(GiB)": 135.77, "step": 17010, "train_speed(iter/s)": 0.200789 }, { "acc": 0.77126536, "epoch": 0.39708948803569605, "grad_norm": 8.75, "learning_rate": 9.273359732515793e-06, "loss": 0.83545189, "memory(GiB)": 135.77, "step": 17020, "train_speed(iter/s)": 0.200853 }, { "acc": 0.76169815, "epoch": 0.39732279560798495, "grad_norm": 5.84375, "learning_rate": 9.272378660396255e-06, "loss": 0.87710171, "memory(GiB)": 135.77, "step": 17030, "train_speed(iter/s)": 0.200915 }, { "acc": 0.77218542, "epoch": 0.39755610318027385, "grad_norm": 5.0, "learning_rate": 9.271396978395904e-06, "loss": 0.83802376, "memory(GiB)": 135.77, "step": 17040, "train_speed(iter/s)": 0.200972 }, { "acc": 0.76550169, "epoch": 0.39778941075256274, "grad_norm": 5.25, "learning_rate": 9.270414686654875e-06, "loss": 0.85740089, "memory(GiB)": 135.77, "step": 17050, "train_speed(iter/s)": 0.201033 }, { "acc": 0.7832613, "epoch": 0.39802271832485164, "grad_norm": 7.125, "learning_rate": 9.269431785313391e-06, "loss": 0.76584721, "memory(GiB)": 135.77, "step": 17060, "train_speed(iter/s)": 0.201094 }, { "acc": 0.77678032, "epoch": 0.39825602589714054, "grad_norm": 7.3125, "learning_rate": 9.268448274511759e-06, "loss": 0.81068668, "memory(GiB)": 135.77, "step": 17070, "train_speed(iter/s)": 0.201157 }, { "acc": 0.77908564, "epoch": 0.39848933346942944, "grad_norm": 5.96875, "learning_rate": 9.267464154390375e-06, "loss": 0.79487085, "memory(GiB)": 135.77, "step": 17080, "train_speed(iter/s)": 0.201218 }, { "acc": 0.80031281, "epoch": 0.3987226410417183, "grad_norm": 4.46875, "learning_rate": 9.266479425089725e-06, "loss": 0.71730728, "memory(GiB)": 135.77, "step": 17090, "train_speed(iter/s)": 0.20128 }, { "acc": 0.77682505, "epoch": 0.3989559486140072, "grad_norm": 10.0625, "learning_rate": 9.265494086750375e-06, "loss": 0.80053043, "memory(GiB)": 135.77, "step": 17100, "train_speed(iter/s)": 0.201344 }, { "acc": 0.77470179, "epoch": 0.3991892561862961, "grad_norm": 5.125, "learning_rate": 9.264508139512985e-06, "loss": 0.81498718, "memory(GiB)": 135.77, "step": 17110, "train_speed(iter/s)": 0.201402 }, { "acc": 0.77046309, "epoch": 0.399422563758585, "grad_norm": 6.25, "learning_rate": 9.263521583518293e-06, "loss": 0.82258339, "memory(GiB)": 135.77, "step": 17120, "train_speed(iter/s)": 0.20146 }, { "acc": 0.79407024, "epoch": 0.3996558713308739, "grad_norm": 8.25, "learning_rate": 9.262534418907137e-06, "loss": 0.72604542, "memory(GiB)": 135.77, "step": 17130, "train_speed(iter/s)": 0.20152 }, { "acc": 0.7635807, "epoch": 0.3998891789031628, "grad_norm": 5.6875, "learning_rate": 9.26154664582043e-06, "loss": 0.8567337, "memory(GiB)": 135.77, "step": 17140, "train_speed(iter/s)": 0.201581 }, { "acc": 0.78804669, "epoch": 0.4001224864754517, "grad_norm": 6.65625, "learning_rate": 9.260558264399177e-06, "loss": 0.73743448, "memory(GiB)": 135.77, "step": 17150, "train_speed(iter/s)": 0.201638 }, { "acc": 0.76328459, "epoch": 0.4003557940477406, "grad_norm": 5.5, "learning_rate": 9.25956927478447e-06, "loss": 0.85945473, "memory(GiB)": 135.77, "step": 17160, "train_speed(iter/s)": 0.2017 }, { "acc": 0.77089682, "epoch": 0.4005891016200295, "grad_norm": 5.4375, "learning_rate": 9.258579677117486e-06, "loss": 0.81945114, "memory(GiB)": 135.77, "step": 17170, "train_speed(iter/s)": 0.201758 }, { "acc": 0.7839241, "epoch": 0.4008224091923184, "grad_norm": 5.125, "learning_rate": 9.25758947153949e-06, "loss": 0.77010636, "memory(GiB)": 135.77, "step": 17180, "train_speed(iter/s)": 0.201814 }, { "acc": 0.76271248, "epoch": 0.4010557167646072, "grad_norm": 3.828125, "learning_rate": 9.256598658191834e-06, "loss": 0.85306063, "memory(GiB)": 135.77, "step": 17190, "train_speed(iter/s)": 0.201875 }, { "acc": 0.77881155, "epoch": 0.4012890243368961, "grad_norm": 6.875, "learning_rate": 9.255607237215957e-06, "loss": 0.7829484, "memory(GiB)": 135.77, "step": 17200, "train_speed(iter/s)": 0.201937 }, { "acc": 0.75986853, "epoch": 0.401522331909185, "grad_norm": 5.96875, "learning_rate": 9.254615208753381e-06, "loss": 0.8725729, "memory(GiB)": 135.77, "step": 17210, "train_speed(iter/s)": 0.202 }, { "acc": 0.77503152, "epoch": 0.4017556394814739, "grad_norm": 4.0, "learning_rate": 9.253622572945722e-06, "loss": 0.82128544, "memory(GiB)": 135.77, "step": 17220, "train_speed(iter/s)": 0.202063 }, { "acc": 0.75122519, "epoch": 0.4019889470537628, "grad_norm": 5.53125, "learning_rate": 9.252629329934676e-06, "loss": 0.91885424, "memory(GiB)": 135.77, "step": 17230, "train_speed(iter/s)": 0.202123 }, { "acc": 0.78238068, "epoch": 0.4022222546260517, "grad_norm": 4.25, "learning_rate": 9.251635479862029e-06, "loss": 0.7822403, "memory(GiB)": 135.77, "step": 17240, "train_speed(iter/s)": 0.202187 }, { "acc": 0.7734271, "epoch": 0.4024555621983406, "grad_norm": 5.9375, "learning_rate": 9.25064102286965e-06, "loss": 0.83377533, "memory(GiB)": 135.77, "step": 17250, "train_speed(iter/s)": 0.202249 }, { "acc": 0.78481631, "epoch": 0.4026888697706295, "grad_norm": 8.6875, "learning_rate": 9.249645959099503e-06, "loss": 0.76742496, "memory(GiB)": 135.77, "step": 17260, "train_speed(iter/s)": 0.202311 }, { "acc": 0.78067999, "epoch": 0.4029221773429184, "grad_norm": 6.4375, "learning_rate": 9.248650288693628e-06, "loss": 0.78222075, "memory(GiB)": 135.77, "step": 17270, "train_speed(iter/s)": 0.202371 }, { "acc": 0.78589168, "epoch": 0.40315548491520725, "grad_norm": 4.6875, "learning_rate": 9.247654011794158e-06, "loss": 0.7706872, "memory(GiB)": 135.77, "step": 17280, "train_speed(iter/s)": 0.20243 }, { "acc": 0.76477909, "epoch": 0.40338879248749615, "grad_norm": 9.0, "learning_rate": 9.246657128543313e-06, "loss": 0.85425339, "memory(GiB)": 135.77, "step": 17290, "train_speed(iter/s)": 0.202491 }, { "acc": 0.77842503, "epoch": 0.40362210005978505, "grad_norm": 5.4375, "learning_rate": 9.245659639083396e-06, "loss": 0.79319229, "memory(GiB)": 135.77, "step": 17300, "train_speed(iter/s)": 0.202549 }, { "acc": 0.78424797, "epoch": 0.40385540763207395, "grad_norm": 4.375, "learning_rate": 9.244661543556799e-06, "loss": 0.7634676, "memory(GiB)": 135.77, "step": 17310, "train_speed(iter/s)": 0.202609 }, { "acc": 0.77571602, "epoch": 0.40408871520436285, "grad_norm": 8.375, "learning_rate": 9.243662842106e-06, "loss": 0.80258932, "memory(GiB)": 135.77, "step": 17320, "train_speed(iter/s)": 0.202667 }, { "acc": 0.76631498, "epoch": 0.40432202277665175, "grad_norm": 4.25, "learning_rate": 9.242663534873562e-06, "loss": 0.85109797, "memory(GiB)": 135.77, "step": 17330, "train_speed(iter/s)": 0.202731 }, { "acc": 0.76990891, "epoch": 0.40455533034894064, "grad_norm": 5.40625, "learning_rate": 9.241663622002137e-06, "loss": 0.84066992, "memory(GiB)": 135.77, "step": 17340, "train_speed(iter/s)": 0.202789 }, { "acc": 0.76096568, "epoch": 0.40478863792122954, "grad_norm": 5.59375, "learning_rate": 9.240663103634464e-06, "loss": 0.86097231, "memory(GiB)": 135.77, "step": 17350, "train_speed(iter/s)": 0.202851 }, { "acc": 0.78643513, "epoch": 0.40502194549351844, "grad_norm": 7.78125, "learning_rate": 9.239661979913364e-06, "loss": 0.7529027, "memory(GiB)": 135.77, "step": 17360, "train_speed(iter/s)": 0.20291 }, { "acc": 0.78234057, "epoch": 0.40525525306580734, "grad_norm": 5.71875, "learning_rate": 9.238660250981748e-06, "loss": 0.76194448, "memory(GiB)": 135.77, "step": 17370, "train_speed(iter/s)": 0.202971 }, { "acc": 0.7768539, "epoch": 0.4054885606380962, "grad_norm": 5.0, "learning_rate": 9.237657916982612e-06, "loss": 0.79744596, "memory(GiB)": 135.77, "step": 17380, "train_speed(iter/s)": 0.203032 }, { "acc": 0.76710577, "epoch": 0.4057218682103851, "grad_norm": 13.4375, "learning_rate": 9.236654978059039e-06, "loss": 0.82225361, "memory(GiB)": 135.77, "step": 17390, "train_speed(iter/s)": 0.203092 }, { "acc": 0.78320732, "epoch": 0.405955175782674, "grad_norm": 4.53125, "learning_rate": 9.2356514343542e-06, "loss": 0.79602137, "memory(GiB)": 135.77, "step": 17400, "train_speed(iter/s)": 0.203151 }, { "acc": 0.7971508, "epoch": 0.4061884833549629, "grad_norm": 3.9375, "learning_rate": 9.234647286011347e-06, "loss": 0.73624454, "memory(GiB)": 135.77, "step": 17410, "train_speed(iter/s)": 0.203208 }, { "acc": 0.79400434, "epoch": 0.4064217909272518, "grad_norm": 5.75, "learning_rate": 9.233642533173827e-06, "loss": 0.73229074, "memory(GiB)": 135.77, "step": 17420, "train_speed(iter/s)": 0.203266 }, { "acc": 0.78245573, "epoch": 0.4066550984995407, "grad_norm": 7.125, "learning_rate": 9.232637175985064e-06, "loss": 0.79270024, "memory(GiB)": 135.77, "step": 17430, "train_speed(iter/s)": 0.203326 }, { "acc": 0.7708353, "epoch": 0.4068884060718296, "grad_norm": 5.40625, "learning_rate": 9.231631214588572e-06, "loss": 0.80469551, "memory(GiB)": 135.77, "step": 17440, "train_speed(iter/s)": 0.203388 }, { "acc": 0.77979951, "epoch": 0.4071217136441185, "grad_norm": 5.3125, "learning_rate": 9.230624649127956e-06, "loss": 0.7915205, "memory(GiB)": 135.77, "step": 17450, "train_speed(iter/s)": 0.203451 }, { "acc": 0.78297606, "epoch": 0.4073550212164074, "grad_norm": 5.21875, "learning_rate": 9.2296174797469e-06, "loss": 0.78460536, "memory(GiB)": 135.77, "step": 17460, "train_speed(iter/s)": 0.20351 }, { "acc": 0.78100643, "epoch": 0.4075883287886963, "grad_norm": 4.1875, "learning_rate": 9.228609706589175e-06, "loss": 0.80171537, "memory(GiB)": 135.77, "step": 17470, "train_speed(iter/s)": 0.203567 }, { "acc": 0.76642833, "epoch": 0.4078216363609851, "grad_norm": 5.1875, "learning_rate": 9.227601329798645e-06, "loss": 0.86399326, "memory(GiB)": 135.77, "step": 17480, "train_speed(iter/s)": 0.203629 }, { "acc": 0.76468382, "epoch": 0.408054943933274, "grad_norm": 5.15625, "learning_rate": 9.226592349519254e-06, "loss": 0.84064198, "memory(GiB)": 135.77, "step": 17490, "train_speed(iter/s)": 0.203689 }, { "acc": 0.757623, "epoch": 0.4082882515055629, "grad_norm": 7.5, "learning_rate": 9.225582765895032e-06, "loss": 0.88375015, "memory(GiB)": 135.77, "step": 17500, "train_speed(iter/s)": 0.203747 }, { "epoch": 0.4082882515055629, "eval_acc": 0.7373759502758565, "eval_loss": 0.8317380547523499, "eval_runtime": 1270.3611, "eval_samples_per_second": 28.331, "eval_steps_per_second": 14.166, "step": 17500 }, { "acc": 0.76951265, "epoch": 0.4085215590778518, "grad_norm": 6.9375, "learning_rate": 9.224572579070097e-06, "loss": 0.83398447, "memory(GiB)": 135.77, "step": 17510, "train_speed(iter/s)": 0.200799 }, { "acc": 0.77614574, "epoch": 0.4087548666501407, "grad_norm": 5.53125, "learning_rate": 9.223561789188655e-06, "loss": 0.80068007, "memory(GiB)": 135.77, "step": 17520, "train_speed(iter/s)": 0.200853 }, { "acc": 0.76483154, "epoch": 0.4089881742224296, "grad_norm": 6.59375, "learning_rate": 9.222550396394994e-06, "loss": 0.82644291, "memory(GiB)": 135.77, "step": 17530, "train_speed(iter/s)": 0.200914 }, { "acc": 0.76731434, "epoch": 0.4092214817947185, "grad_norm": 11.25, "learning_rate": 9.221538400833489e-06, "loss": 0.84105368, "memory(GiB)": 135.77, "step": 17540, "train_speed(iter/s)": 0.200968 }, { "acc": 0.79679441, "epoch": 0.4094547893670074, "grad_norm": 5.625, "learning_rate": 9.220525802648605e-06, "loss": 0.72740088, "memory(GiB)": 135.77, "step": 17550, "train_speed(iter/s)": 0.201025 }, { "acc": 0.77622986, "epoch": 0.4096880969392963, "grad_norm": 5.5625, "learning_rate": 9.219512601984889e-06, "loss": 0.81656227, "memory(GiB)": 135.77, "step": 17560, "train_speed(iter/s)": 0.20108 }, { "acc": 0.78093081, "epoch": 0.40992140451158515, "grad_norm": 5.8125, "learning_rate": 9.218498798986975e-06, "loss": 0.75747585, "memory(GiB)": 135.77, "step": 17570, "train_speed(iter/s)": 0.201138 }, { "acc": 0.75355787, "epoch": 0.41015471208387405, "grad_norm": 4.84375, "learning_rate": 9.217484393799582e-06, "loss": 0.91436329, "memory(GiB)": 135.77, "step": 17580, "train_speed(iter/s)": 0.201199 }, { "acc": 0.76489058, "epoch": 0.41038801965616295, "grad_norm": 5.125, "learning_rate": 9.216469386567517e-06, "loss": 0.86888409, "memory(GiB)": 135.77, "step": 17590, "train_speed(iter/s)": 0.201259 }, { "acc": 0.79336662, "epoch": 0.41062132722845185, "grad_norm": 4.875, "learning_rate": 9.215453777435672e-06, "loss": 0.71763039, "memory(GiB)": 135.77, "step": 17600, "train_speed(iter/s)": 0.201315 }, { "acc": 0.80592194, "epoch": 0.41085463480074075, "grad_norm": 7.0, "learning_rate": 9.214437566549026e-06, "loss": 0.7039835, "memory(GiB)": 135.77, "step": 17610, "train_speed(iter/s)": 0.201373 }, { "acc": 0.75960789, "epoch": 0.41108794237302965, "grad_norm": 5.3125, "learning_rate": 9.21342075405264e-06, "loss": 0.88718882, "memory(GiB)": 135.77, "step": 17620, "train_speed(iter/s)": 0.201435 }, { "acc": 0.75959663, "epoch": 0.41132124994531855, "grad_norm": 5.09375, "learning_rate": 9.212403340091667e-06, "loss": 0.88778267, "memory(GiB)": 135.77, "step": 17630, "train_speed(iter/s)": 0.201493 }, { "acc": 0.76623917, "epoch": 0.41155455751760744, "grad_norm": 5.65625, "learning_rate": 9.21138532481134e-06, "loss": 0.84202518, "memory(GiB)": 135.77, "step": 17640, "train_speed(iter/s)": 0.201554 }, { "acc": 0.76951289, "epoch": 0.41178786508989634, "grad_norm": 5.375, "learning_rate": 9.210366708356982e-06, "loss": 0.81451988, "memory(GiB)": 135.77, "step": 17650, "train_speed(iter/s)": 0.201614 }, { "acc": 0.76205425, "epoch": 0.41202117266218524, "grad_norm": 5.0625, "learning_rate": 9.209347490874e-06, "loss": 0.86257401, "memory(GiB)": 135.77, "step": 17660, "train_speed(iter/s)": 0.201674 }, { "acc": 0.76792526, "epoch": 0.4122544802344741, "grad_norm": 4.46875, "learning_rate": 9.208327672507883e-06, "loss": 0.82535515, "memory(GiB)": 135.77, "step": 17670, "train_speed(iter/s)": 0.201736 }, { "acc": 0.77031803, "epoch": 0.412487787806763, "grad_norm": 6.40625, "learning_rate": 9.207307253404216e-06, "loss": 0.83331718, "memory(GiB)": 135.77, "step": 17680, "train_speed(iter/s)": 0.201795 }, { "acc": 0.78183222, "epoch": 0.4127210953790519, "grad_norm": 5.59375, "learning_rate": 9.20628623370866e-06, "loss": 0.7861207, "memory(GiB)": 135.77, "step": 17690, "train_speed(iter/s)": 0.201854 }, { "acc": 0.78258052, "epoch": 0.4129544029513408, "grad_norm": 4.71875, "learning_rate": 9.205264613566968e-06, "loss": 0.78774872, "memory(GiB)": 135.77, "step": 17700, "train_speed(iter/s)": 0.201912 }, { "acc": 0.76604571, "epoch": 0.4131877105236297, "grad_norm": 5.78125, "learning_rate": 9.204242393124973e-06, "loss": 0.85110035, "memory(GiB)": 135.77, "step": 17710, "train_speed(iter/s)": 0.201968 }, { "acc": 0.77456961, "epoch": 0.4134210180959186, "grad_norm": 4.25, "learning_rate": 9.203219572528597e-06, "loss": 0.7974988, "memory(GiB)": 135.77, "step": 17720, "train_speed(iter/s)": 0.202026 }, { "acc": 0.78755922, "epoch": 0.4136543256682075, "grad_norm": 6.59375, "learning_rate": 9.202196151923849e-06, "loss": 0.75697212, "memory(GiB)": 135.77, "step": 17730, "train_speed(iter/s)": 0.202085 }, { "acc": 0.75414586, "epoch": 0.4138876332404964, "grad_norm": 6.90625, "learning_rate": 9.201172131456821e-06, "loss": 0.8852272, "memory(GiB)": 135.77, "step": 17740, "train_speed(iter/s)": 0.202146 }, { "acc": 0.75268192, "epoch": 0.4141209408127853, "grad_norm": 6.1875, "learning_rate": 9.20014751127369e-06, "loss": 0.89101114, "memory(GiB)": 135.77, "step": 17750, "train_speed(iter/s)": 0.202204 }, { "acc": 0.76879907, "epoch": 0.4143542483850742, "grad_norm": 4.65625, "learning_rate": 9.199122291520724e-06, "loss": 0.82659073, "memory(GiB)": 135.77, "step": 17760, "train_speed(iter/s)": 0.202266 }, { "acc": 0.78848228, "epoch": 0.414587555957363, "grad_norm": 4.625, "learning_rate": 9.198096472344269e-06, "loss": 0.76872091, "memory(GiB)": 135.77, "step": 17770, "train_speed(iter/s)": 0.202325 }, { "acc": 0.76771512, "epoch": 0.4148208635296519, "grad_norm": 4.34375, "learning_rate": 9.197070053890764e-06, "loss": 0.82841167, "memory(GiB)": 135.77, "step": 17780, "train_speed(iter/s)": 0.202385 }, { "acc": 0.78823538, "epoch": 0.4150541711019408, "grad_norm": 3.953125, "learning_rate": 9.196043036306726e-06, "loss": 0.76270905, "memory(GiB)": 135.77, "step": 17790, "train_speed(iter/s)": 0.202446 }, { "acc": 0.78680201, "epoch": 0.4152874786742297, "grad_norm": 4.9375, "learning_rate": 9.195015419738765e-06, "loss": 0.77416258, "memory(GiB)": 135.77, "step": 17800, "train_speed(iter/s)": 0.202509 }, { "acc": 0.77593956, "epoch": 0.4155207862465186, "grad_norm": 6.6875, "learning_rate": 9.193987204333573e-06, "loss": 0.79424219, "memory(GiB)": 135.77, "step": 17810, "train_speed(iter/s)": 0.202572 }, { "acc": 0.76831007, "epoch": 0.4157540938188075, "grad_norm": 7.0, "learning_rate": 9.192958390237923e-06, "loss": 0.84353199, "memory(GiB)": 135.77, "step": 17820, "train_speed(iter/s)": 0.202631 }, { "acc": 0.7490344, "epoch": 0.4159874013910964, "grad_norm": 4.9375, "learning_rate": 9.19192897759868e-06, "loss": 0.92404823, "memory(GiB)": 135.77, "step": 17830, "train_speed(iter/s)": 0.202692 }, { "acc": 0.76985192, "epoch": 0.4162207089633853, "grad_norm": 9.25, "learning_rate": 9.190898966562796e-06, "loss": 0.83063097, "memory(GiB)": 135.77, "step": 17840, "train_speed(iter/s)": 0.202753 }, { "acc": 0.76316862, "epoch": 0.4164540165356742, "grad_norm": 5.8125, "learning_rate": 9.1898683572773e-06, "loss": 0.8611824, "memory(GiB)": 135.77, "step": 17850, "train_speed(iter/s)": 0.202812 }, { "acc": 0.75131006, "epoch": 0.41668732410796305, "grad_norm": 7.21875, "learning_rate": 9.188837149889316e-06, "loss": 0.92140636, "memory(GiB)": 135.77, "step": 17860, "train_speed(iter/s)": 0.202873 }, { "acc": 0.77328281, "epoch": 0.41692063168025195, "grad_norm": 5.0625, "learning_rate": 9.187805344546044e-06, "loss": 0.81961975, "memory(GiB)": 135.77, "step": 17870, "train_speed(iter/s)": 0.202932 }, { "acc": 0.78284941, "epoch": 0.41715393925254085, "grad_norm": 5.375, "learning_rate": 9.186772941394776e-06, "loss": 0.78479848, "memory(GiB)": 135.77, "step": 17880, "train_speed(iter/s)": 0.202994 }, { "acc": 0.77911134, "epoch": 0.41738724682482975, "grad_norm": 6.78125, "learning_rate": 9.185739940582885e-06, "loss": 0.82526703, "memory(GiB)": 135.77, "step": 17890, "train_speed(iter/s)": 0.203057 }, { "acc": 0.77189975, "epoch": 0.41762055439711865, "grad_norm": 7.34375, "learning_rate": 9.184706342257835e-06, "loss": 0.80744982, "memory(GiB)": 135.77, "step": 17900, "train_speed(iter/s)": 0.20312 }, { "acc": 0.77462916, "epoch": 0.41785386196940755, "grad_norm": 6.46875, "learning_rate": 9.183672146567171e-06, "loss": 0.81883631, "memory(GiB)": 135.77, "step": 17910, "train_speed(iter/s)": 0.203176 }, { "acc": 0.75889959, "epoch": 0.41808716954169645, "grad_norm": 4.78125, "learning_rate": 9.182637353658523e-06, "loss": 0.85708733, "memory(GiB)": 135.77, "step": 17920, "train_speed(iter/s)": 0.203236 }, { "acc": 0.77901268, "epoch": 0.41832047711398535, "grad_norm": 11.125, "learning_rate": 9.181601963679607e-06, "loss": 0.80118008, "memory(GiB)": 135.77, "step": 17930, "train_speed(iter/s)": 0.203298 }, { "acc": 0.76691198, "epoch": 0.41855378468627424, "grad_norm": 6.09375, "learning_rate": 9.180565976778226e-06, "loss": 0.82437811, "memory(GiB)": 135.77, "step": 17940, "train_speed(iter/s)": 0.203358 }, { "acc": 0.79058146, "epoch": 0.41878709225856314, "grad_norm": 5.59375, "learning_rate": 9.179529393102265e-06, "loss": 0.75918694, "memory(GiB)": 135.77, "step": 17950, "train_speed(iter/s)": 0.203415 }, { "acc": 0.78140459, "epoch": 0.419020399830852, "grad_norm": 4.46875, "learning_rate": 9.1784922127997e-06, "loss": 0.77219858, "memory(GiB)": 135.77, "step": 17960, "train_speed(iter/s)": 0.203476 }, { "acc": 0.76335707, "epoch": 0.4192537074031409, "grad_norm": 4.625, "learning_rate": 9.177454436018584e-06, "loss": 0.84384384, "memory(GiB)": 135.77, "step": 17970, "train_speed(iter/s)": 0.203534 }, { "acc": 0.77697759, "epoch": 0.4194870149754298, "grad_norm": 6.5625, "learning_rate": 9.17641606290706e-06, "loss": 0.85184278, "memory(GiB)": 135.77, "step": 17980, "train_speed(iter/s)": 0.203595 }, { "acc": 0.75867949, "epoch": 0.4197203225477187, "grad_norm": 7.40625, "learning_rate": 9.175377093613359e-06, "loss": 0.84790859, "memory(GiB)": 135.77, "step": 17990, "train_speed(iter/s)": 0.203654 }, { "acc": 0.77024422, "epoch": 0.4199536301200076, "grad_norm": 6.71875, "learning_rate": 9.174337528285787e-06, "loss": 0.83740625, "memory(GiB)": 135.77, "step": 18000, "train_speed(iter/s)": 0.203712 }, { "epoch": 0.4199536301200076, "eval_acc": 0.7375597958545385, "eval_loss": 0.8316662907600403, "eval_runtime": 1270.1578, "eval_samples_per_second": 28.336, "eval_steps_per_second": 14.168, "step": 18000 }, { "acc": 0.75532227, "epoch": 0.4201869376922965, "grad_norm": 4.65625, "learning_rate": 9.173297367072748e-06, "loss": 0.87728586, "memory(GiB)": 135.77, "step": 18010, "train_speed(iter/s)": 0.200842 }, { "acc": 0.78155527, "epoch": 0.4204202452645854, "grad_norm": 6.9375, "learning_rate": 9.172256610122721e-06, "loss": 0.79191427, "memory(GiB)": 135.77, "step": 18020, "train_speed(iter/s)": 0.200902 }, { "acc": 0.7721571, "epoch": 0.4206535528368743, "grad_norm": 7.78125, "learning_rate": 9.171215257584277e-06, "loss": 0.82012405, "memory(GiB)": 135.77, "step": 18030, "train_speed(iter/s)": 0.200964 }, { "acc": 0.74781427, "epoch": 0.4208868604091632, "grad_norm": 5.1875, "learning_rate": 9.170173309606063e-06, "loss": 0.92548695, "memory(GiB)": 135.77, "step": 18040, "train_speed(iter/s)": 0.201023 }, { "acc": 0.77438364, "epoch": 0.421120167981452, "grad_norm": 5.3125, "learning_rate": 9.169130766336824e-06, "loss": 0.8166708, "memory(GiB)": 135.77, "step": 18050, "train_speed(iter/s)": 0.201079 }, { "acc": 0.76485076, "epoch": 0.4213534755537409, "grad_norm": 4.90625, "learning_rate": 9.168087627925377e-06, "loss": 0.85983076, "memory(GiB)": 135.77, "step": 18060, "train_speed(iter/s)": 0.201133 }, { "acc": 0.77190371, "epoch": 0.4215867831260298, "grad_norm": 6.40625, "learning_rate": 9.167043894520633e-06, "loss": 0.81317501, "memory(GiB)": 135.77, "step": 18070, "train_speed(iter/s)": 0.201191 }, { "acc": 0.77723899, "epoch": 0.4218200906983187, "grad_norm": 4.6875, "learning_rate": 9.165999566271584e-06, "loss": 0.81175861, "memory(GiB)": 135.77, "step": 18080, "train_speed(iter/s)": 0.201248 }, { "acc": 0.77668056, "epoch": 0.4220533982706076, "grad_norm": 5.8125, "learning_rate": 9.164954643327306e-06, "loss": 0.79176364, "memory(GiB)": 135.77, "step": 18090, "train_speed(iter/s)": 0.201302 }, { "acc": 0.78481464, "epoch": 0.4222867058428965, "grad_norm": 4.5625, "learning_rate": 9.163909125836965e-06, "loss": 0.7773984, "memory(GiB)": 135.77, "step": 18100, "train_speed(iter/s)": 0.201359 }, { "acc": 0.77743702, "epoch": 0.4225200134151854, "grad_norm": 6.1875, "learning_rate": 9.162863013949803e-06, "loss": 0.7793623, "memory(GiB)": 135.77, "step": 18110, "train_speed(iter/s)": 0.201417 }, { "acc": 0.75463772, "epoch": 0.4227533209874743, "grad_norm": 5.96875, "learning_rate": 9.161816307815157e-06, "loss": 0.88482685, "memory(GiB)": 135.77, "step": 18120, "train_speed(iter/s)": 0.201472 }, { "acc": 0.78212109, "epoch": 0.4229866285597632, "grad_norm": 5.53125, "learning_rate": 9.160769007582441e-06, "loss": 0.78088417, "memory(GiB)": 135.77, "step": 18130, "train_speed(iter/s)": 0.201529 }, { "acc": 0.77992072, "epoch": 0.4232199361320521, "grad_norm": 5.21875, "learning_rate": 9.15972111340116e-06, "loss": 0.80578728, "memory(GiB)": 135.77, "step": 18140, "train_speed(iter/s)": 0.201586 }, { "acc": 0.75823526, "epoch": 0.42345324370434095, "grad_norm": 7.90625, "learning_rate": 9.158672625420894e-06, "loss": 0.88358002, "memory(GiB)": 135.77, "step": 18150, "train_speed(iter/s)": 0.201641 }, { "acc": 0.78155651, "epoch": 0.42368655127662985, "grad_norm": 6.40625, "learning_rate": 9.157623543791323e-06, "loss": 0.7862711, "memory(GiB)": 135.77, "step": 18160, "train_speed(iter/s)": 0.201697 }, { "acc": 0.77112303, "epoch": 0.42391985884891875, "grad_norm": 6.96875, "learning_rate": 9.156573868662197e-06, "loss": 0.83190708, "memory(GiB)": 135.77, "step": 18170, "train_speed(iter/s)": 0.201751 }, { "acc": 0.78510017, "epoch": 0.42415316642120765, "grad_norm": 5.3125, "learning_rate": 9.155523600183359e-06, "loss": 0.75841799, "memory(GiB)": 135.77, "step": 18180, "train_speed(iter/s)": 0.201805 }, { "acc": 0.76126904, "epoch": 0.42438647399349655, "grad_norm": 6.5625, "learning_rate": 9.154472738504735e-06, "loss": 0.84513931, "memory(GiB)": 135.77, "step": 18190, "train_speed(iter/s)": 0.201858 }, { "acc": 0.77650328, "epoch": 0.42461978156578545, "grad_norm": 5.09375, "learning_rate": 9.153421283776334e-06, "loss": 0.80113697, "memory(GiB)": 135.77, "step": 18200, "train_speed(iter/s)": 0.201916 }, { "acc": 0.78944378, "epoch": 0.42485308913807435, "grad_norm": 4.0625, "learning_rate": 9.152369236148252e-06, "loss": 0.76571293, "memory(GiB)": 135.77, "step": 18210, "train_speed(iter/s)": 0.201971 }, { "acc": 0.77950716, "epoch": 0.42508639671036325, "grad_norm": 5.1875, "learning_rate": 9.151316595770665e-06, "loss": 0.78431196, "memory(GiB)": 135.77, "step": 18220, "train_speed(iter/s)": 0.202028 }, { "acc": 0.75776768, "epoch": 0.42531970428265214, "grad_norm": 6.96875, "learning_rate": 9.150263362793844e-06, "loss": 0.88814754, "memory(GiB)": 135.77, "step": 18230, "train_speed(iter/s)": 0.20209 }, { "acc": 0.7760685, "epoch": 0.42555301185494104, "grad_norm": 4.65625, "learning_rate": 9.14920953736813e-06, "loss": 0.81063766, "memory(GiB)": 135.77, "step": 18240, "train_speed(iter/s)": 0.202147 }, { "acc": 0.77113094, "epoch": 0.4257863194272299, "grad_norm": 5.03125, "learning_rate": 9.148155119643963e-06, "loss": 0.82983465, "memory(GiB)": 135.77, "step": 18250, "train_speed(iter/s)": 0.202206 }, { "acc": 0.76979413, "epoch": 0.4260196269995188, "grad_norm": 4.9375, "learning_rate": 9.147100109771856e-06, "loss": 0.83212032, "memory(GiB)": 135.77, "step": 18260, "train_speed(iter/s)": 0.202259 }, { "acc": 0.7747787, "epoch": 0.4262529345718077, "grad_norm": 4.78125, "learning_rate": 9.146044507902411e-06, "loss": 0.80783243, "memory(GiB)": 135.77, "step": 18270, "train_speed(iter/s)": 0.202315 }, { "acc": 0.78634143, "epoch": 0.4264862421440966, "grad_norm": 7.71875, "learning_rate": 9.144988314186321e-06, "loss": 0.76813116, "memory(GiB)": 135.77, "step": 18280, "train_speed(iter/s)": 0.202366 }, { "acc": 0.76954098, "epoch": 0.4267195497163855, "grad_norm": 5.75, "learning_rate": 9.143931528774351e-06, "loss": 0.82664013, "memory(GiB)": 135.77, "step": 18290, "train_speed(iter/s)": 0.202423 }, { "acc": 0.74711275, "epoch": 0.4269528572886744, "grad_norm": 5.65625, "learning_rate": 9.14287415181736e-06, "loss": 0.90971508, "memory(GiB)": 135.77, "step": 18300, "train_speed(iter/s)": 0.202481 }, { "acc": 0.75965652, "epoch": 0.4271861648609633, "grad_norm": 5.6875, "learning_rate": 9.141816183466286e-06, "loss": 0.85946274, "memory(GiB)": 135.77, "step": 18310, "train_speed(iter/s)": 0.202539 }, { "acc": 0.76663742, "epoch": 0.4274194724332522, "grad_norm": 5.5625, "learning_rate": 9.140757623872156e-06, "loss": 0.86080341, "memory(GiB)": 135.77, "step": 18320, "train_speed(iter/s)": 0.202596 }, { "acc": 0.76160979, "epoch": 0.4276527800055411, "grad_norm": 4.34375, "learning_rate": 9.139698473186079e-06, "loss": 0.85901699, "memory(GiB)": 135.77, "step": 18330, "train_speed(iter/s)": 0.202652 }, { "acc": 0.77596149, "epoch": 0.4278860875778299, "grad_norm": 5.03125, "learning_rate": 9.138638731559246e-06, "loss": 0.80930233, "memory(GiB)": 135.77, "step": 18340, "train_speed(iter/s)": 0.20271 }, { "acc": 0.78887205, "epoch": 0.4281193951501188, "grad_norm": 5.0, "learning_rate": 9.137578399142936e-06, "loss": 0.73909388, "memory(GiB)": 135.77, "step": 18350, "train_speed(iter/s)": 0.202766 }, { "acc": 0.77916021, "epoch": 0.4283527027224077, "grad_norm": 5.0, "learning_rate": 9.136517476088513e-06, "loss": 0.80833597, "memory(GiB)": 135.77, "step": 18360, "train_speed(iter/s)": 0.202821 }, { "acc": 0.77610526, "epoch": 0.4285860102946966, "grad_norm": 6.5625, "learning_rate": 9.135455962547422e-06, "loss": 0.80960493, "memory(GiB)": 135.77, "step": 18370, "train_speed(iter/s)": 0.202876 }, { "acc": 0.75241766, "epoch": 0.4288193178669855, "grad_norm": 6.25, "learning_rate": 9.134393858671193e-06, "loss": 0.87778807, "memory(GiB)": 135.77, "step": 18380, "train_speed(iter/s)": 0.202933 }, { "acc": 0.7702713, "epoch": 0.4290526254392744, "grad_norm": 7.0, "learning_rate": 9.13333116461144e-06, "loss": 0.82612343, "memory(GiB)": 135.77, "step": 18390, "train_speed(iter/s)": 0.20299 }, { "acc": 0.78864408, "epoch": 0.4292859330115633, "grad_norm": 5.15625, "learning_rate": 9.132267880519867e-06, "loss": 0.75824766, "memory(GiB)": 135.77, "step": 18400, "train_speed(iter/s)": 0.203043 }, { "acc": 0.76866798, "epoch": 0.4295192405838522, "grad_norm": 6.1875, "learning_rate": 9.131204006548253e-06, "loss": 0.83031731, "memory(GiB)": 135.77, "step": 18410, "train_speed(iter/s)": 0.203101 }, { "acc": 0.7711411, "epoch": 0.4297525481561411, "grad_norm": 6.90625, "learning_rate": 9.130139542848468e-06, "loss": 0.83066397, "memory(GiB)": 135.77, "step": 18420, "train_speed(iter/s)": 0.203161 }, { "acc": 0.76493034, "epoch": 0.42998585572843, "grad_norm": 5.75, "learning_rate": 9.129074489572463e-06, "loss": 0.85996132, "memory(GiB)": 135.77, "step": 18430, "train_speed(iter/s)": 0.203214 }, { "acc": 0.76129556, "epoch": 0.43021916330071885, "grad_norm": 4.15625, "learning_rate": 9.128008846872273e-06, "loss": 0.87012196, "memory(GiB)": 135.77, "step": 18440, "train_speed(iter/s)": 0.203266 }, { "acc": 0.77166157, "epoch": 0.43045247087300775, "grad_norm": 9.5625, "learning_rate": 9.126942614900021e-06, "loss": 0.80364647, "memory(GiB)": 135.77, "step": 18450, "train_speed(iter/s)": 0.203324 }, { "acc": 0.76030464, "epoch": 0.43068577844529665, "grad_norm": 4.1875, "learning_rate": 9.125875793807908e-06, "loss": 0.88488598, "memory(GiB)": 135.77, "step": 18460, "train_speed(iter/s)": 0.203381 }, { "acc": 0.77922316, "epoch": 0.43091908601758555, "grad_norm": 9.4375, "learning_rate": 9.124808383748226e-06, "loss": 0.79430113, "memory(GiB)": 135.77, "step": 18470, "train_speed(iter/s)": 0.203433 }, { "acc": 0.76354847, "epoch": 0.43115239358987445, "grad_norm": 7.46875, "learning_rate": 9.123740384873343e-06, "loss": 0.87230701, "memory(GiB)": 135.77, "step": 18480, "train_speed(iter/s)": 0.203489 }, { "acc": 0.76531634, "epoch": 0.43138570116216335, "grad_norm": 6.3125, "learning_rate": 9.122671797335719e-06, "loss": 0.86186409, "memory(GiB)": 135.77, "step": 18490, "train_speed(iter/s)": 0.203546 }, { "acc": 0.76710591, "epoch": 0.43161900873445225, "grad_norm": 5.96875, "learning_rate": 9.121602621287892e-06, "loss": 0.85632124, "memory(GiB)": 135.77, "step": 18500, "train_speed(iter/s)": 0.203601 }, { "epoch": 0.43161900873445225, "eval_acc": 0.737777431866236, "eval_loss": 0.8302583694458008, "eval_runtime": 1269.9297, "eval_samples_per_second": 28.341, "eval_steps_per_second": 14.171, "step": 18500 }, { "acc": 0.79308367, "epoch": 0.43185231630674115, "grad_norm": 5.53125, "learning_rate": 9.120532856882491e-06, "loss": 0.74874711, "memory(GiB)": 135.77, "step": 18510, "train_speed(iter/s)": 0.200815 }, { "acc": 0.77777281, "epoch": 0.43208562387903005, "grad_norm": 5.375, "learning_rate": 9.119462504272221e-06, "loss": 0.8051053, "memory(GiB)": 135.77, "step": 18520, "train_speed(iter/s)": 0.200868 }, { "acc": 0.74930582, "epoch": 0.43231893145131894, "grad_norm": 5.9375, "learning_rate": 9.118391563609875e-06, "loss": 0.8819828, "memory(GiB)": 135.77, "step": 18530, "train_speed(iter/s)": 0.200927 }, { "acc": 0.77309723, "epoch": 0.4325522390236078, "grad_norm": 6.34375, "learning_rate": 9.117320035048329e-06, "loss": 0.82186155, "memory(GiB)": 135.77, "step": 18540, "train_speed(iter/s)": 0.200984 }, { "acc": 0.77171998, "epoch": 0.4327855465958967, "grad_norm": 6.65625, "learning_rate": 9.116247918740544e-06, "loss": 0.80551882, "memory(GiB)": 135.77, "step": 18550, "train_speed(iter/s)": 0.201042 }, { "acc": 0.76656771, "epoch": 0.4330188541681856, "grad_norm": 4.59375, "learning_rate": 9.115175214839565e-06, "loss": 0.87863846, "memory(GiB)": 135.77, "step": 18560, "train_speed(iter/s)": 0.201095 }, { "acc": 0.78267555, "epoch": 0.4332521617404745, "grad_norm": 5.46875, "learning_rate": 9.114101923498519e-06, "loss": 0.77156582, "memory(GiB)": 135.77, "step": 18570, "train_speed(iter/s)": 0.201152 }, { "acc": 0.76703129, "epoch": 0.4334854693127634, "grad_norm": 8.625, "learning_rate": 9.113028044870619e-06, "loss": 0.84351616, "memory(GiB)": 135.77, "step": 18580, "train_speed(iter/s)": 0.201206 }, { "acc": 0.76306887, "epoch": 0.4337187768850523, "grad_norm": 5.625, "learning_rate": 9.11195357910916e-06, "loss": 0.86529598, "memory(GiB)": 135.77, "step": 18590, "train_speed(iter/s)": 0.201258 }, { "acc": 0.77179461, "epoch": 0.4339520844573412, "grad_norm": 5.03125, "learning_rate": 9.110878526367523e-06, "loss": 0.8225605, "memory(GiB)": 135.77, "step": 18600, "train_speed(iter/s)": 0.201317 }, { "acc": 0.77156925, "epoch": 0.4341853920296301, "grad_norm": 6.03125, "learning_rate": 9.10980288679917e-06, "loss": 0.81905994, "memory(GiB)": 135.77, "step": 18610, "train_speed(iter/s)": 0.201374 }, { "acc": 0.76532478, "epoch": 0.434418699601919, "grad_norm": 8.8125, "learning_rate": 9.10872666055765e-06, "loss": 0.85075331, "memory(GiB)": 135.77, "step": 18620, "train_speed(iter/s)": 0.201432 }, { "acc": 0.76773767, "epoch": 0.4346520071742078, "grad_norm": 6.34375, "learning_rate": 9.107649847796591e-06, "loss": 0.8390007, "memory(GiB)": 135.77, "step": 18630, "train_speed(iter/s)": 0.201484 }, { "acc": 0.76637259, "epoch": 0.4348853147464967, "grad_norm": 7.53125, "learning_rate": 9.10657244866971e-06, "loss": 0.83106155, "memory(GiB)": 135.77, "step": 18640, "train_speed(iter/s)": 0.201536 }, { "acc": 0.76556454, "epoch": 0.4351186223187856, "grad_norm": 6.5625, "learning_rate": 9.105494463330805e-06, "loss": 0.82924232, "memory(GiB)": 135.77, "step": 18650, "train_speed(iter/s)": 0.201594 }, { "acc": 0.76742105, "epoch": 0.4353519298910745, "grad_norm": 4.0, "learning_rate": 9.104415891933757e-06, "loss": 0.83789377, "memory(GiB)": 135.77, "step": 18660, "train_speed(iter/s)": 0.20165 }, { "acc": 0.77874622, "epoch": 0.4355852374633634, "grad_norm": 5.1875, "learning_rate": 9.103336734632536e-06, "loss": 0.78959827, "memory(GiB)": 135.77, "step": 18670, "train_speed(iter/s)": 0.2017 }, { "acc": 0.7700861, "epoch": 0.4358185450356523, "grad_norm": 4.5, "learning_rate": 9.102256991581185e-06, "loss": 0.82032681, "memory(GiB)": 135.77, "step": 18680, "train_speed(iter/s)": 0.201755 }, { "acc": 0.76898155, "epoch": 0.4360518526079412, "grad_norm": 5.65625, "learning_rate": 9.101176662933842e-06, "loss": 0.85672741, "memory(GiB)": 135.77, "step": 18690, "train_speed(iter/s)": 0.201813 }, { "acc": 0.76275616, "epoch": 0.4362851601802301, "grad_norm": 5.09375, "learning_rate": 9.10009574884472e-06, "loss": 0.86024952, "memory(GiB)": 135.77, "step": 18700, "train_speed(iter/s)": 0.201867 }, { "acc": 0.76601181, "epoch": 0.436518467752519, "grad_norm": 6.375, "learning_rate": 9.099014249468124e-06, "loss": 0.86094589, "memory(GiB)": 135.77, "step": 18710, "train_speed(iter/s)": 0.201923 }, { "acc": 0.77703285, "epoch": 0.4367517753248079, "grad_norm": 7.375, "learning_rate": 9.097932164958432e-06, "loss": 0.79216099, "memory(GiB)": 135.77, "step": 18720, "train_speed(iter/s)": 0.201982 }, { "acc": 0.76565948, "epoch": 0.43698508289709675, "grad_norm": 5.28125, "learning_rate": 9.096849495470113e-06, "loss": 0.86634254, "memory(GiB)": 135.77, "step": 18730, "train_speed(iter/s)": 0.202039 }, { "acc": 0.77021027, "epoch": 0.43721839046938565, "grad_norm": 5.5, "learning_rate": 9.095766241157721e-06, "loss": 0.81364603, "memory(GiB)": 135.77, "step": 18740, "train_speed(iter/s)": 0.202094 }, { "acc": 0.78136158, "epoch": 0.43745169804167455, "grad_norm": 4.8125, "learning_rate": 9.094682402175887e-06, "loss": 0.79765949, "memory(GiB)": 135.77, "step": 18750, "train_speed(iter/s)": 0.202146 }, { "acc": 0.76939726, "epoch": 0.43768500561396345, "grad_norm": 9.6875, "learning_rate": 9.093597978679329e-06, "loss": 0.84548798, "memory(GiB)": 135.77, "step": 18760, "train_speed(iter/s)": 0.202202 }, { "acc": 0.77349901, "epoch": 0.43791831318625235, "grad_norm": 7.65625, "learning_rate": 9.09251297082285e-06, "loss": 0.81046124, "memory(GiB)": 135.77, "step": 18770, "train_speed(iter/s)": 0.202258 }, { "acc": 0.76422777, "epoch": 0.43815162075854125, "grad_norm": 4.5, "learning_rate": 9.091427378761333e-06, "loss": 0.86868763, "memory(GiB)": 135.77, "step": 18780, "train_speed(iter/s)": 0.202317 }, { "acc": 0.77027721, "epoch": 0.43838492833083015, "grad_norm": 5.15625, "learning_rate": 9.090341202649746e-06, "loss": 0.84670544, "memory(GiB)": 135.77, "step": 18790, "train_speed(iter/s)": 0.202371 }, { "acc": 0.76263161, "epoch": 0.43861823590311905, "grad_norm": 5.9375, "learning_rate": 9.08925444264314e-06, "loss": 0.86874142, "memory(GiB)": 135.77, "step": 18800, "train_speed(iter/s)": 0.202424 }, { "acc": 0.75490875, "epoch": 0.43885154347540795, "grad_norm": 6.78125, "learning_rate": 9.088167098896652e-06, "loss": 0.89412689, "memory(GiB)": 135.77, "step": 18810, "train_speed(iter/s)": 0.202481 }, { "acc": 0.77074194, "epoch": 0.4390848510476968, "grad_norm": 6.15625, "learning_rate": 9.087079171565496e-06, "loss": 0.84905615, "memory(GiB)": 135.77, "step": 18820, "train_speed(iter/s)": 0.202537 }, { "acc": 0.76013765, "epoch": 0.4393181586199857, "grad_norm": 4.59375, "learning_rate": 9.085990660804976e-06, "loss": 0.84425535, "memory(GiB)": 135.77, "step": 18830, "train_speed(iter/s)": 0.202595 }, { "acc": 0.78072777, "epoch": 0.4395514661922746, "grad_norm": 5.96875, "learning_rate": 9.084901566770476e-06, "loss": 0.78052444, "memory(GiB)": 135.77, "step": 18840, "train_speed(iter/s)": 0.20265 }, { "acc": 0.78506041, "epoch": 0.4397847737645635, "grad_norm": 7.21875, "learning_rate": 9.083811889617467e-06, "loss": 0.76543379, "memory(GiB)": 135.77, "step": 18850, "train_speed(iter/s)": 0.202704 }, { "acc": 0.7516593, "epoch": 0.4400180813368524, "grad_norm": 5.46875, "learning_rate": 9.082721629501494e-06, "loss": 0.88883514, "memory(GiB)": 135.77, "step": 18860, "train_speed(iter/s)": 0.202756 }, { "acc": 0.77115822, "epoch": 0.4402513889091413, "grad_norm": 8.1875, "learning_rate": 9.081630786578195e-06, "loss": 0.8167778, "memory(GiB)": 135.77, "step": 18870, "train_speed(iter/s)": 0.202811 }, { "acc": 0.76743846, "epoch": 0.4404846964814302, "grad_norm": 6.125, "learning_rate": 9.080539361003288e-06, "loss": 0.85453176, "memory(GiB)": 135.77, "step": 18880, "train_speed(iter/s)": 0.202866 }, { "acc": 0.75334711, "epoch": 0.4407180040537191, "grad_norm": 4.375, "learning_rate": 9.079447352932571e-06, "loss": 0.89333754, "memory(GiB)": 135.77, "step": 18890, "train_speed(iter/s)": 0.202919 }, { "acc": 0.78106527, "epoch": 0.440951311626008, "grad_norm": 6.21875, "learning_rate": 9.078354762521931e-06, "loss": 0.80243101, "memory(GiB)": 135.77, "step": 18900, "train_speed(iter/s)": 0.202975 }, { "acc": 0.76703472, "epoch": 0.4411846191982969, "grad_norm": 5.8125, "learning_rate": 9.077261589927333e-06, "loss": 0.84479599, "memory(GiB)": 135.77, "step": 18910, "train_speed(iter/s)": 0.203031 }, { "acc": 0.77735329, "epoch": 0.4414179267705857, "grad_norm": 11.625, "learning_rate": 9.076167835304828e-06, "loss": 0.80349712, "memory(GiB)": 135.77, "step": 18920, "train_speed(iter/s)": 0.203086 }, { "acc": 0.79136057, "epoch": 0.4416512343428746, "grad_norm": 5.25, "learning_rate": 9.075073498810547e-06, "loss": 0.76573734, "memory(GiB)": 135.77, "step": 18930, "train_speed(iter/s)": 0.203143 }, { "acc": 0.78350835, "epoch": 0.4418845419151635, "grad_norm": 5.09375, "learning_rate": 9.073978580600709e-06, "loss": 0.78551674, "memory(GiB)": 135.77, "step": 18940, "train_speed(iter/s)": 0.203194 }, { "acc": 0.76640816, "epoch": 0.4421178494874524, "grad_norm": 5.5625, "learning_rate": 9.072883080831611e-06, "loss": 0.8355978, "memory(GiB)": 135.77, "step": 18950, "train_speed(iter/s)": 0.203252 }, { "acc": 0.75378032, "epoch": 0.4423511570597413, "grad_norm": 6.25, "learning_rate": 9.071786999659638e-06, "loss": 0.88689022, "memory(GiB)": 135.77, "step": 18960, "train_speed(iter/s)": 0.203313 }, { "acc": 0.74607992, "epoch": 0.4425844646320302, "grad_norm": 31.375, "learning_rate": 9.070690337241252e-06, "loss": 0.98085604, "memory(GiB)": 135.77, "step": 18970, "train_speed(iter/s)": 0.203368 }, { "acc": 0.78964634, "epoch": 0.4428177722043191, "grad_norm": 5.71875, "learning_rate": 9.069593093733004e-06, "loss": 0.76366062, "memory(GiB)": 135.77, "step": 18980, "train_speed(iter/s)": 0.203426 }, { "acc": 0.77851391, "epoch": 0.443051079776608, "grad_norm": 7.03125, "learning_rate": 9.068495269291524e-06, "loss": 0.81780148, "memory(GiB)": 135.77, "step": 18990, "train_speed(iter/s)": 0.203481 }, { "acc": 0.75752182, "epoch": 0.4432843873488969, "grad_norm": 6.84375, "learning_rate": 9.067396864073527e-06, "loss": 0.87670937, "memory(GiB)": 135.77, "step": 19000, "train_speed(iter/s)": 0.203539 }, { "epoch": 0.4432843873488969, "eval_acc": 0.7378800843191412, "eval_loss": 0.8296611905097961, "eval_runtime": 1269.212, "eval_samples_per_second": 28.357, "eval_steps_per_second": 14.179, "step": 19000 }, { "acc": 0.77129135, "epoch": 0.4435176949211858, "grad_norm": 4.09375, "learning_rate": 9.066297878235808e-06, "loss": 0.82974644, "memory(GiB)": 135.77, "step": 19010, "train_speed(iter/s)": 0.200829 }, { "acc": 0.77290907, "epoch": 0.44375100249347466, "grad_norm": 3.796875, "learning_rate": 9.065198311935248e-06, "loss": 0.82878008, "memory(GiB)": 135.77, "step": 19020, "train_speed(iter/s)": 0.200885 }, { "acc": 0.77206917, "epoch": 0.44398431006576355, "grad_norm": 7.4375, "learning_rate": 9.06409816532881e-06, "loss": 0.84106894, "memory(GiB)": 135.77, "step": 19030, "train_speed(iter/s)": 0.20094 }, { "acc": 0.75648594, "epoch": 0.44421761763805245, "grad_norm": 4.9375, "learning_rate": 9.06299743857354e-06, "loss": 0.90824041, "memory(GiB)": 135.77, "step": 19040, "train_speed(iter/s)": 0.200992 }, { "acc": 0.76354513, "epoch": 0.44445092521034135, "grad_norm": 4.8125, "learning_rate": 9.061896131826566e-06, "loss": 0.86085148, "memory(GiB)": 135.77, "step": 19050, "train_speed(iter/s)": 0.201042 }, { "acc": 0.76192708, "epoch": 0.44468423278263025, "grad_norm": 6.8125, "learning_rate": 9.0607942452451e-06, "loss": 0.8651083, "memory(GiB)": 135.77, "step": 19060, "train_speed(iter/s)": 0.201098 }, { "acc": 0.76140442, "epoch": 0.44491754035491915, "grad_norm": 6.28125, "learning_rate": 9.059691778986433e-06, "loss": 0.85103846, "memory(GiB)": 135.77, "step": 19070, "train_speed(iter/s)": 0.201155 }, { "acc": 0.76223292, "epoch": 0.44515084792720805, "grad_norm": 8.5, "learning_rate": 9.058588733207945e-06, "loss": 0.83704605, "memory(GiB)": 135.77, "step": 19080, "train_speed(iter/s)": 0.201211 }, { "acc": 0.76429772, "epoch": 0.44538415549949695, "grad_norm": 6.09375, "learning_rate": 9.057485108067094e-06, "loss": 0.86491184, "memory(GiB)": 135.77, "step": 19090, "train_speed(iter/s)": 0.201266 }, { "acc": 0.77062263, "epoch": 0.44561746307178585, "grad_norm": 4.875, "learning_rate": 9.056380903721424e-06, "loss": 0.81978521, "memory(GiB)": 135.77, "step": 19100, "train_speed(iter/s)": 0.201321 }, { "acc": 0.77301741, "epoch": 0.4458507706440747, "grad_norm": 5.78125, "learning_rate": 9.055276120328557e-06, "loss": 0.81939163, "memory(GiB)": 135.77, "step": 19110, "train_speed(iter/s)": 0.201375 }, { "acc": 0.77150536, "epoch": 0.4460840782163636, "grad_norm": 4.4375, "learning_rate": 9.054170758046204e-06, "loss": 0.81701164, "memory(GiB)": 135.77, "step": 19120, "train_speed(iter/s)": 0.201429 }, { "acc": 0.77243052, "epoch": 0.4463173857886525, "grad_norm": 5.375, "learning_rate": 9.05306481703215e-06, "loss": 0.83213739, "memory(GiB)": 135.77, "step": 19130, "train_speed(iter/s)": 0.201484 }, { "acc": 0.76942472, "epoch": 0.4465506933609414, "grad_norm": 6.0625, "learning_rate": 9.051958297444272e-06, "loss": 0.8267148, "memory(GiB)": 135.77, "step": 19140, "train_speed(iter/s)": 0.201538 }, { "acc": 0.76962104, "epoch": 0.4467840009332303, "grad_norm": 5.28125, "learning_rate": 9.050851199440524e-06, "loss": 0.81351061, "memory(GiB)": 135.77, "step": 19150, "train_speed(iter/s)": 0.201591 }, { "acc": 0.7752852, "epoch": 0.4470173085055192, "grad_norm": 5.25, "learning_rate": 9.049743523178945e-06, "loss": 0.81729002, "memory(GiB)": 135.77, "step": 19160, "train_speed(iter/s)": 0.201644 }, { "acc": 0.76699119, "epoch": 0.4472506160778081, "grad_norm": 4.6875, "learning_rate": 9.048635268817653e-06, "loss": 0.85345955, "memory(GiB)": 135.77, "step": 19170, "train_speed(iter/s)": 0.201701 }, { "acc": 0.76262298, "epoch": 0.447483923650097, "grad_norm": 8.9375, "learning_rate": 9.047526436514854e-06, "loss": 0.85194921, "memory(GiB)": 135.77, "step": 19180, "train_speed(iter/s)": 0.201754 }, { "acc": 0.76009088, "epoch": 0.4477172312223859, "grad_norm": 4.90625, "learning_rate": 9.04641702642883e-06, "loss": 0.86009045, "memory(GiB)": 135.77, "step": 19190, "train_speed(iter/s)": 0.201813 }, { "acc": 0.77364101, "epoch": 0.4479505387946748, "grad_norm": 4.90625, "learning_rate": 9.045307038717954e-06, "loss": 0.81192675, "memory(GiB)": 135.77, "step": 19200, "train_speed(iter/s)": 0.201865 }, { "acc": 0.77397594, "epoch": 0.4481838463669636, "grad_norm": 6.625, "learning_rate": 9.044196473540672e-06, "loss": 0.8263483, "memory(GiB)": 135.77, "step": 19210, "train_speed(iter/s)": 0.201919 }, { "acc": 0.753967, "epoch": 0.4484171539392525, "grad_norm": 5.0625, "learning_rate": 9.043085331055516e-06, "loss": 0.89315586, "memory(GiB)": 135.77, "step": 19220, "train_speed(iter/s)": 0.201975 }, { "acc": 0.78117127, "epoch": 0.4486504615115414, "grad_norm": 3.390625, "learning_rate": 9.041973611421106e-06, "loss": 0.77627773, "memory(GiB)": 135.77, "step": 19230, "train_speed(iter/s)": 0.20203 }, { "acc": 0.76547561, "epoch": 0.4488837690838303, "grad_norm": 5.5625, "learning_rate": 9.040861314796137e-06, "loss": 0.84824905, "memory(GiB)": 135.77, "step": 19240, "train_speed(iter/s)": 0.20209 }, { "acc": 0.79242058, "epoch": 0.4491170766561192, "grad_norm": 4.40625, "learning_rate": 9.039748441339389e-06, "loss": 0.73243713, "memory(GiB)": 135.77, "step": 19250, "train_speed(iter/s)": 0.202146 }, { "acc": 0.74393578, "epoch": 0.4493503842284081, "grad_norm": 7.0625, "learning_rate": 9.038634991209725e-06, "loss": 0.93040276, "memory(GiB)": 135.77, "step": 19260, "train_speed(iter/s)": 0.202197 }, { "acc": 0.7861661, "epoch": 0.449583691800697, "grad_norm": 7.78125, "learning_rate": 9.03752096456609e-06, "loss": 0.77835784, "memory(GiB)": 135.77, "step": 19270, "train_speed(iter/s)": 0.20225 }, { "acc": 0.77567, "epoch": 0.4498169993729859, "grad_norm": 5.0, "learning_rate": 9.036406361567506e-06, "loss": 0.7994051, "memory(GiB)": 135.77, "step": 19280, "train_speed(iter/s)": 0.202304 }, { "acc": 0.76911907, "epoch": 0.4500503069452748, "grad_norm": 5.5, "learning_rate": 9.035291182373092e-06, "loss": 0.82964163, "memory(GiB)": 135.77, "step": 19290, "train_speed(iter/s)": 0.202358 }, { "acc": 0.76818333, "epoch": 0.4502836145175637, "grad_norm": 5.65625, "learning_rate": 9.03417542714203e-06, "loss": 0.8365284, "memory(GiB)": 135.77, "step": 19300, "train_speed(iter/s)": 0.202412 }, { "acc": 0.77055397, "epoch": 0.45051692208985256, "grad_norm": 5.53125, "learning_rate": 9.033059096033598e-06, "loss": 0.82133121, "memory(GiB)": 135.77, "step": 19310, "train_speed(iter/s)": 0.202465 }, { "acc": 0.769734, "epoch": 0.45075022966214146, "grad_norm": 4.65625, "learning_rate": 9.031942189207154e-06, "loss": 0.81667995, "memory(GiB)": 135.77, "step": 19320, "train_speed(iter/s)": 0.202515 }, { "acc": 0.7695981, "epoch": 0.45098353723443035, "grad_norm": 5.25, "learning_rate": 9.030824706822132e-06, "loss": 0.83406506, "memory(GiB)": 135.77, "step": 19330, "train_speed(iter/s)": 0.202565 }, { "acc": 0.77217684, "epoch": 0.45121684480671925, "grad_norm": 5.0625, "learning_rate": 9.029706649038055e-06, "loss": 0.80703802, "memory(GiB)": 135.77, "step": 19340, "train_speed(iter/s)": 0.202615 }, { "acc": 0.76901155, "epoch": 0.45145015237900815, "grad_norm": 6.9375, "learning_rate": 9.028588016014524e-06, "loss": 0.8353384, "memory(GiB)": 135.77, "step": 19350, "train_speed(iter/s)": 0.202672 }, { "acc": 0.78265429, "epoch": 0.45168345995129705, "grad_norm": 5.53125, "learning_rate": 9.027468807911223e-06, "loss": 0.78769207, "memory(GiB)": 135.77, "step": 19360, "train_speed(iter/s)": 0.202725 }, { "acc": 0.79214249, "epoch": 0.45191676752358595, "grad_norm": 11.1875, "learning_rate": 9.026349024887921e-06, "loss": 0.72728148, "memory(GiB)": 135.77, "step": 19370, "train_speed(iter/s)": 0.202779 }, { "acc": 0.76945453, "epoch": 0.45215007509587485, "grad_norm": 5.875, "learning_rate": 9.025228667104465e-06, "loss": 0.83437386, "memory(GiB)": 135.77, "step": 19380, "train_speed(iter/s)": 0.202835 }, { "acc": 0.76589389, "epoch": 0.45238338266816375, "grad_norm": 5.5, "learning_rate": 9.024107734720786e-06, "loss": 0.85786552, "memory(GiB)": 135.77, "step": 19390, "train_speed(iter/s)": 0.20289 }, { "acc": 0.76294546, "epoch": 0.4526166902404526, "grad_norm": 6.5, "learning_rate": 9.022986227896898e-06, "loss": 0.8357048, "memory(GiB)": 135.77, "step": 19400, "train_speed(iter/s)": 0.202945 }, { "acc": 0.78395748, "epoch": 0.4528499978127415, "grad_norm": 7.28125, "learning_rate": 9.021864146792894e-06, "loss": 0.74856625, "memory(GiB)": 135.77, "step": 19410, "train_speed(iter/s)": 0.203001 }, { "acc": 0.78489389, "epoch": 0.4530833053850304, "grad_norm": 15.25, "learning_rate": 9.02074149156895e-06, "loss": 0.76024318, "memory(GiB)": 135.77, "step": 19420, "train_speed(iter/s)": 0.203057 }, { "acc": 0.75939164, "epoch": 0.4533166129573193, "grad_norm": 7.75, "learning_rate": 9.019618262385328e-06, "loss": 0.85180931, "memory(GiB)": 135.77, "step": 19430, "train_speed(iter/s)": 0.203111 }, { "acc": 0.78605642, "epoch": 0.4535499205296082, "grad_norm": 5.4375, "learning_rate": 9.018494459402365e-06, "loss": 0.76549592, "memory(GiB)": 135.77, "step": 19440, "train_speed(iter/s)": 0.203163 }, { "acc": 0.76347561, "epoch": 0.4537832281018971, "grad_norm": 7.0625, "learning_rate": 9.017370082780485e-06, "loss": 0.85260391, "memory(GiB)": 135.77, "step": 19450, "train_speed(iter/s)": 0.203218 }, { "acc": 0.78153772, "epoch": 0.454016535674186, "grad_norm": 4.8125, "learning_rate": 9.016245132680195e-06, "loss": 0.77672606, "memory(GiB)": 135.77, "step": 19460, "train_speed(iter/s)": 0.203268 }, { "acc": 0.78809853, "epoch": 0.4542498432464749, "grad_norm": 5.4375, "learning_rate": 9.015119609262078e-06, "loss": 0.74799476, "memory(GiB)": 135.77, "step": 19470, "train_speed(iter/s)": 0.203321 }, { "acc": 0.78086472, "epoch": 0.4544831508187638, "grad_norm": 6.03125, "learning_rate": 9.013993512686803e-06, "loss": 0.7939446, "memory(GiB)": 135.77, "step": 19480, "train_speed(iter/s)": 0.203372 }, { "acc": 0.77702465, "epoch": 0.4547164583910527, "grad_norm": 5.78125, "learning_rate": 9.01286684311512e-06, "loss": 0.81716118, "memory(GiB)": 135.77, "step": 19490, "train_speed(iter/s)": 0.203422 }, { "acc": 0.75863905, "epoch": 0.4549497659633415, "grad_norm": 6.78125, "learning_rate": 9.011739600707862e-06, "loss": 0.89850245, "memory(GiB)": 135.77, "step": 19500, "train_speed(iter/s)": 0.203476 }, { "epoch": 0.4549497659633415, "eval_acc": 0.7380889123980622, "eval_loss": 0.829414427280426, "eval_runtime": 1270.491, "eval_samples_per_second": 28.328, "eval_steps_per_second": 14.165, "step": 19500 }, { "acc": 0.7731576, "epoch": 0.4551830735356304, "grad_norm": 5.09375, "learning_rate": 9.01061178562594e-06, "loss": 0.81027994, "memory(GiB)": 135.77, "step": 19510, "train_speed(iter/s)": 0.200831 }, { "acc": 0.76880264, "epoch": 0.4554163811079193, "grad_norm": 5.0625, "learning_rate": 9.009483398030353e-06, "loss": 0.81138697, "memory(GiB)": 135.77, "step": 19520, "train_speed(iter/s)": 0.200881 }, { "acc": 0.78158731, "epoch": 0.4556496886802082, "grad_norm": 5.8125, "learning_rate": 9.008354438082173e-06, "loss": 0.79674578, "memory(GiB)": 135.77, "step": 19530, "train_speed(iter/s)": 0.200933 }, { "acc": 0.75377884, "epoch": 0.4558829962524971, "grad_norm": 7.09375, "learning_rate": 9.007224905942562e-06, "loss": 0.89388762, "memory(GiB)": 135.77, "step": 19540, "train_speed(iter/s)": 0.200988 }, { "acc": 0.77003942, "epoch": 0.456116303824786, "grad_norm": 6.125, "learning_rate": 9.00609480177276e-06, "loss": 0.83401814, "memory(GiB)": 135.77, "step": 19550, "train_speed(iter/s)": 0.201041 }, { "acc": 0.76566763, "epoch": 0.4563496113970749, "grad_norm": 4.6875, "learning_rate": 9.00496412573409e-06, "loss": 0.86685047, "memory(GiB)": 135.77, "step": 19560, "train_speed(iter/s)": 0.201093 }, { "acc": 0.79012947, "epoch": 0.4565829189693638, "grad_norm": 4.8125, "learning_rate": 9.003832877987952e-06, "loss": 0.74963512, "memory(GiB)": 135.77, "step": 19570, "train_speed(iter/s)": 0.201142 }, { "acc": 0.76897526, "epoch": 0.4568162265416527, "grad_norm": 3.921875, "learning_rate": 9.002701058695836e-06, "loss": 0.82385035, "memory(GiB)": 135.77, "step": 19580, "train_speed(iter/s)": 0.201196 }, { "acc": 0.76111898, "epoch": 0.45704953411394156, "grad_norm": 5.375, "learning_rate": 9.001568668019306e-06, "loss": 0.85867405, "memory(GiB)": 135.77, "step": 19590, "train_speed(iter/s)": 0.201252 }, { "acc": 0.78389206, "epoch": 0.45728284168623046, "grad_norm": 6.125, "learning_rate": 9.000435706120011e-06, "loss": 0.78628263, "memory(GiB)": 135.77, "step": 19600, "train_speed(iter/s)": 0.201305 }, { "acc": 0.77073708, "epoch": 0.45751614925851936, "grad_norm": 6.28125, "learning_rate": 8.999302173159681e-06, "loss": 0.83480806, "memory(GiB)": 135.77, "step": 19610, "train_speed(iter/s)": 0.201358 }, { "acc": 0.76435275, "epoch": 0.45774945683080825, "grad_norm": 5.15625, "learning_rate": 8.998168069300128e-06, "loss": 0.86908226, "memory(GiB)": 135.77, "step": 19620, "train_speed(iter/s)": 0.20141 }, { "acc": 0.77386503, "epoch": 0.45798276440309715, "grad_norm": 4.78125, "learning_rate": 8.997033394703246e-06, "loss": 0.82412939, "memory(GiB)": 135.77, "step": 19630, "train_speed(iter/s)": 0.201461 }, { "acc": 0.75808735, "epoch": 0.45821607197538605, "grad_norm": 7.875, "learning_rate": 8.995898149531005e-06, "loss": 0.87254372, "memory(GiB)": 135.77, "step": 19640, "train_speed(iter/s)": 0.201514 }, { "acc": 0.77585478, "epoch": 0.45844937954767495, "grad_norm": 6.125, "learning_rate": 8.994762333945465e-06, "loss": 0.8103075, "memory(GiB)": 135.77, "step": 19650, "train_speed(iter/s)": 0.20157 }, { "acc": 0.76372824, "epoch": 0.45868268711996385, "grad_norm": 5.21875, "learning_rate": 8.993625948108764e-06, "loss": 0.86949158, "memory(GiB)": 135.77, "step": 19660, "train_speed(iter/s)": 0.201625 }, { "acc": 0.76708436, "epoch": 0.45891599469225275, "grad_norm": 5.65625, "learning_rate": 8.992488992183116e-06, "loss": 0.84779406, "memory(GiB)": 135.77, "step": 19670, "train_speed(iter/s)": 0.201678 }, { "acc": 0.7899591, "epoch": 0.45914930226454165, "grad_norm": 5.03125, "learning_rate": 8.991351466330827e-06, "loss": 0.7441288, "memory(GiB)": 135.77, "step": 19680, "train_speed(iter/s)": 0.201731 }, { "acc": 0.78236446, "epoch": 0.4593826098368305, "grad_norm": 4.96875, "learning_rate": 8.990213370714274e-06, "loss": 0.79019766, "memory(GiB)": 135.77, "step": 19690, "train_speed(iter/s)": 0.201785 }, { "acc": 0.7680593, "epoch": 0.4596159174091194, "grad_norm": 7.71875, "learning_rate": 8.989074705495921e-06, "loss": 0.8316782, "memory(GiB)": 135.77, "step": 19700, "train_speed(iter/s)": 0.201834 }, { "acc": 0.77131128, "epoch": 0.4598492249814083, "grad_norm": 8.875, "learning_rate": 8.987935470838315e-06, "loss": 0.82049313, "memory(GiB)": 135.77, "step": 19710, "train_speed(iter/s)": 0.201885 }, { "acc": 0.78124671, "epoch": 0.4600825325536972, "grad_norm": 5.71875, "learning_rate": 8.986795666904077e-06, "loss": 0.80133085, "memory(GiB)": 135.77, "step": 19720, "train_speed(iter/s)": 0.201938 }, { "acc": 0.75701828, "epoch": 0.4603158401259861, "grad_norm": 9.125, "learning_rate": 8.985655293855917e-06, "loss": 0.89116259, "memory(GiB)": 135.77, "step": 19730, "train_speed(iter/s)": 0.201995 }, { "acc": 0.76774426, "epoch": 0.460549147698275, "grad_norm": 5.34375, "learning_rate": 8.98451435185662e-06, "loss": 0.87877426, "memory(GiB)": 135.77, "step": 19740, "train_speed(iter/s)": 0.202049 }, { "acc": 0.77983117, "epoch": 0.4607824552705639, "grad_norm": 5.34375, "learning_rate": 8.983372841069059e-06, "loss": 0.79148512, "memory(GiB)": 135.77, "step": 19750, "train_speed(iter/s)": 0.202102 }, { "acc": 0.74972019, "epoch": 0.4610157628428528, "grad_norm": 5.84375, "learning_rate": 8.98223076165618e-06, "loss": 0.89190731, "memory(GiB)": 135.77, "step": 19760, "train_speed(iter/s)": 0.202153 }, { "acc": 0.78662281, "epoch": 0.4612490704151417, "grad_norm": 5.71875, "learning_rate": 8.981088113781018e-06, "loss": 0.75874639, "memory(GiB)": 135.77, "step": 19770, "train_speed(iter/s)": 0.202207 }, { "acc": 0.76131806, "epoch": 0.4614823779874306, "grad_norm": 7.21875, "learning_rate": 8.979944897606685e-06, "loss": 0.87373924, "memory(GiB)": 135.77, "step": 19780, "train_speed(iter/s)": 0.20226 }, { "acc": 0.76455545, "epoch": 0.4617156855597194, "grad_norm": 6.15625, "learning_rate": 8.978801113296371e-06, "loss": 0.87849846, "memory(GiB)": 135.77, "step": 19790, "train_speed(iter/s)": 0.20231 }, { "acc": 0.76590109, "epoch": 0.4619489931320083, "grad_norm": 5.15625, "learning_rate": 8.977656761013357e-06, "loss": 0.83625021, "memory(GiB)": 135.77, "step": 19800, "train_speed(iter/s)": 0.202362 }, { "acc": 0.75346403, "epoch": 0.4621823007042972, "grad_norm": 15.125, "learning_rate": 8.976511840920994e-06, "loss": 0.90793972, "memory(GiB)": 135.77, "step": 19810, "train_speed(iter/s)": 0.202415 }, { "acc": 0.77614479, "epoch": 0.4624156082765861, "grad_norm": 5.9375, "learning_rate": 8.975366353182721e-06, "loss": 0.81695967, "memory(GiB)": 135.77, "step": 19820, "train_speed(iter/s)": 0.202469 }, { "acc": 0.77849016, "epoch": 0.462648915848875, "grad_norm": 5.46875, "learning_rate": 8.974220297962058e-06, "loss": 0.79997635, "memory(GiB)": 135.77, "step": 19830, "train_speed(iter/s)": 0.202525 }, { "acc": 0.77346191, "epoch": 0.4628822234211639, "grad_norm": 6.0625, "learning_rate": 8.973073675422602e-06, "loss": 0.80886145, "memory(GiB)": 135.77, "step": 19840, "train_speed(iter/s)": 0.202581 }, { "acc": 0.81118984, "epoch": 0.4631155309934528, "grad_norm": 5.6875, "learning_rate": 8.97192648572803e-06, "loss": 0.68877487, "memory(GiB)": 135.77, "step": 19850, "train_speed(iter/s)": 0.202632 }, { "acc": 0.75902133, "epoch": 0.4633488385657417, "grad_norm": 4.53125, "learning_rate": 8.970778729042109e-06, "loss": 0.86830082, "memory(GiB)": 135.77, "step": 19860, "train_speed(iter/s)": 0.202682 }, { "acc": 0.75023003, "epoch": 0.4635821461380306, "grad_norm": 6.3125, "learning_rate": 8.969630405528675e-06, "loss": 0.90686493, "memory(GiB)": 135.77, "step": 19870, "train_speed(iter/s)": 0.202737 }, { "acc": 0.78217545, "epoch": 0.46381545371031946, "grad_norm": 5.875, "learning_rate": 8.968481515351656e-06, "loss": 0.77547541, "memory(GiB)": 135.77, "step": 19880, "train_speed(iter/s)": 0.202788 }, { "acc": 0.77224998, "epoch": 0.46404876128260836, "grad_norm": 5.4375, "learning_rate": 8.967332058675054e-06, "loss": 0.81753349, "memory(GiB)": 135.77, "step": 19890, "train_speed(iter/s)": 0.20284 }, { "acc": 0.76978784, "epoch": 0.46428206885489726, "grad_norm": 5.40625, "learning_rate": 8.96618203566295e-06, "loss": 0.83251038, "memory(GiB)": 135.77, "step": 19900, "train_speed(iter/s)": 0.202894 }, { "acc": 0.78358245, "epoch": 0.46451537642718616, "grad_norm": 5.125, "learning_rate": 8.965031446479516e-06, "loss": 0.77494287, "memory(GiB)": 135.77, "step": 19910, "train_speed(iter/s)": 0.202944 }, { "acc": 0.75092211, "epoch": 0.46474868399947505, "grad_norm": 6.6875, "learning_rate": 8.963880291288992e-06, "loss": 0.90174236, "memory(GiB)": 135.77, "step": 19920, "train_speed(iter/s)": 0.202996 }, { "acc": 0.76736884, "epoch": 0.46498199157176395, "grad_norm": 5.8125, "learning_rate": 8.96272857025571e-06, "loss": 0.833918, "memory(GiB)": 135.77, "step": 19930, "train_speed(iter/s)": 0.203051 }, { "acc": 0.75787697, "epoch": 0.46521529914405285, "grad_norm": 5.375, "learning_rate": 8.961576283544076e-06, "loss": 0.89234486, "memory(GiB)": 135.77, "step": 19940, "train_speed(iter/s)": 0.203107 }, { "acc": 0.77328529, "epoch": 0.46544860671634175, "grad_norm": 6.15625, "learning_rate": 8.960423431318576e-06, "loss": 0.8312933, "memory(GiB)": 135.77, "step": 19950, "train_speed(iter/s)": 0.20316 }, { "acc": 0.77509122, "epoch": 0.46568191428863065, "grad_norm": 6.59375, "learning_rate": 8.959270013743784e-06, "loss": 0.81753225, "memory(GiB)": 135.77, "step": 19960, "train_speed(iter/s)": 0.203211 }, { "acc": 0.76919918, "epoch": 0.46591522186091955, "grad_norm": 5.25, "learning_rate": 8.958116030984347e-06, "loss": 0.84067249, "memory(GiB)": 135.77, "step": 19970, "train_speed(iter/s)": 0.203265 }, { "acc": 0.77842779, "epoch": 0.4661485294332084, "grad_norm": 4.125, "learning_rate": 8.956961483204996e-06, "loss": 0.78261929, "memory(GiB)": 135.77, "step": 19980, "train_speed(iter/s)": 0.203316 }, { "acc": 0.75851555, "epoch": 0.4663818370054973, "grad_norm": 6.78125, "learning_rate": 8.955806370570543e-06, "loss": 0.87037487, "memory(GiB)": 135.77, "step": 19990, "train_speed(iter/s)": 0.203366 }, { "acc": 0.77085943, "epoch": 0.4666151445777862, "grad_norm": 4.84375, "learning_rate": 8.954650693245882e-06, "loss": 0.81232281, "memory(GiB)": 135.77, "step": 20000, "train_speed(iter/s)": 0.203415 }, { "epoch": 0.4666151445777862, "eval_acc": 0.7384622086035567, "eval_loss": 0.8278706669807434, "eval_runtime": 1271.2712, "eval_samples_per_second": 28.311, "eval_steps_per_second": 14.156, "step": 20000 }, { "acc": 0.77572637, "epoch": 0.4668484521500751, "grad_norm": 5.65625, "learning_rate": 8.953494451395979e-06, "loss": 0.8144228, "memory(GiB)": 135.77, "step": 20010, "train_speed(iter/s)": 0.200838 }, { "acc": 0.79318457, "epoch": 0.467081759722364, "grad_norm": 9.25, "learning_rate": 8.952337645185894e-06, "loss": 0.74365864, "memory(GiB)": 135.77, "step": 20020, "train_speed(iter/s)": 0.20089 }, { "acc": 0.7678813, "epoch": 0.4673150672946529, "grad_norm": 4.6875, "learning_rate": 8.951180274780758e-06, "loss": 0.8354866, "memory(GiB)": 135.77, "step": 20030, "train_speed(iter/s)": 0.20094 }, { "acc": 0.76604643, "epoch": 0.4675483748669418, "grad_norm": 9.0, "learning_rate": 8.950022340345786e-06, "loss": 0.85739231, "memory(GiB)": 135.77, "step": 20040, "train_speed(iter/s)": 0.200993 }, { "acc": 0.76447287, "epoch": 0.4677816824392307, "grad_norm": 5.75, "learning_rate": 8.948863842046272e-06, "loss": 0.84905338, "memory(GiB)": 135.77, "step": 20050, "train_speed(iter/s)": 0.201048 }, { "acc": 0.77429447, "epoch": 0.4680149900115196, "grad_norm": 3.671875, "learning_rate": 8.947704780047593e-06, "loss": 0.81440744, "memory(GiB)": 135.77, "step": 20060, "train_speed(iter/s)": 0.201097 }, { "acc": 0.78105116, "epoch": 0.4682482975838085, "grad_norm": 5.40625, "learning_rate": 8.946545154515201e-06, "loss": 0.78914485, "memory(GiB)": 135.77, "step": 20070, "train_speed(iter/s)": 0.201152 }, { "acc": 0.7645546, "epoch": 0.4684816051560973, "grad_norm": 7.3125, "learning_rate": 8.945384965614636e-06, "loss": 0.87174253, "memory(GiB)": 135.77, "step": 20080, "train_speed(iter/s)": 0.201205 }, { "acc": 0.79997892, "epoch": 0.4687149127283862, "grad_norm": 5.6875, "learning_rate": 8.944224213511514e-06, "loss": 0.71954641, "memory(GiB)": 135.77, "step": 20090, "train_speed(iter/s)": 0.20126 }, { "acc": 0.762815, "epoch": 0.4689482203006751, "grad_norm": 8.1875, "learning_rate": 8.943062898371531e-06, "loss": 0.85633907, "memory(GiB)": 135.77, "step": 20100, "train_speed(iter/s)": 0.201312 }, { "acc": 0.76686854, "epoch": 0.469181527872964, "grad_norm": 13.0, "learning_rate": 8.941901020360464e-06, "loss": 0.82543678, "memory(GiB)": 135.77, "step": 20110, "train_speed(iter/s)": 0.201361 }, { "acc": 0.77254, "epoch": 0.4694148354452529, "grad_norm": 5.09375, "learning_rate": 8.940738579644171e-06, "loss": 0.80971737, "memory(GiB)": 135.77, "step": 20120, "train_speed(iter/s)": 0.201415 }, { "acc": 0.78281898, "epoch": 0.4696481430175418, "grad_norm": 6.0, "learning_rate": 8.939575576388592e-06, "loss": 0.78121605, "memory(GiB)": 135.77, "step": 20130, "train_speed(iter/s)": 0.201468 }, { "acc": 0.77683525, "epoch": 0.4698814505898307, "grad_norm": 5.0625, "learning_rate": 8.938412010759743e-06, "loss": 0.82987223, "memory(GiB)": 135.77, "step": 20140, "train_speed(iter/s)": 0.20152 }, { "acc": 0.78844047, "epoch": 0.4701147581621196, "grad_norm": 8.6875, "learning_rate": 8.937247882923724e-06, "loss": 0.76155491, "memory(GiB)": 135.77, "step": 20150, "train_speed(iter/s)": 0.201573 }, { "acc": 0.78030567, "epoch": 0.4703480657344085, "grad_norm": 5.9375, "learning_rate": 8.936083193046712e-06, "loss": 0.78899822, "memory(GiB)": 135.77, "step": 20160, "train_speed(iter/s)": 0.201621 }, { "acc": 0.76187115, "epoch": 0.47058137330669736, "grad_norm": 7.34375, "learning_rate": 8.93491794129497e-06, "loss": 0.83778381, "memory(GiB)": 135.77, "step": 20170, "train_speed(iter/s)": 0.201673 }, { "acc": 0.76804895, "epoch": 0.47081468087898626, "grad_norm": 4.1875, "learning_rate": 8.933752127834834e-06, "loss": 0.85201845, "memory(GiB)": 135.77, "step": 20180, "train_speed(iter/s)": 0.201725 }, { "acc": 0.764326, "epoch": 0.47104798845127516, "grad_norm": 6.3125, "learning_rate": 8.932585752832725e-06, "loss": 0.83452625, "memory(GiB)": 135.77, "step": 20190, "train_speed(iter/s)": 0.201778 }, { "acc": 0.76837258, "epoch": 0.47128129602356406, "grad_norm": 5.0, "learning_rate": 8.931418816455142e-06, "loss": 0.83890285, "memory(GiB)": 135.77, "step": 20200, "train_speed(iter/s)": 0.201827 }, { "acc": 0.76820593, "epoch": 0.47151460359585295, "grad_norm": 5.03125, "learning_rate": 8.930251318868664e-06, "loss": 0.82331924, "memory(GiB)": 135.77, "step": 20210, "train_speed(iter/s)": 0.201878 }, { "acc": 0.78207231, "epoch": 0.47174791116814185, "grad_norm": 4.96875, "learning_rate": 8.929083260239952e-06, "loss": 0.78689303, "memory(GiB)": 135.77, "step": 20220, "train_speed(iter/s)": 0.201928 }, { "acc": 0.7715971, "epoch": 0.47198121874043075, "grad_norm": 5.625, "learning_rate": 8.927914640735748e-06, "loss": 0.82081528, "memory(GiB)": 135.77, "step": 20230, "train_speed(iter/s)": 0.201983 }, { "acc": 0.76612329, "epoch": 0.47221452631271965, "grad_norm": 5.90625, "learning_rate": 8.926745460522867e-06, "loss": 0.85213652, "memory(GiB)": 135.77, "step": 20240, "train_speed(iter/s)": 0.202032 }, { "acc": 0.77809095, "epoch": 0.47244783388500855, "grad_norm": 6.9375, "learning_rate": 8.925575719768215e-06, "loss": 0.80060663, "memory(GiB)": 135.77, "step": 20250, "train_speed(iter/s)": 0.202087 }, { "acc": 0.77375336, "epoch": 0.47268114145729745, "grad_norm": 5.4375, "learning_rate": 8.92440541863877e-06, "loss": 0.81226845, "memory(GiB)": 135.77, "step": 20260, "train_speed(iter/s)": 0.202137 }, { "acc": 0.80193977, "epoch": 0.4729144490295863, "grad_norm": 6.34375, "learning_rate": 8.923234557301588e-06, "loss": 0.72059889, "memory(GiB)": 135.77, "step": 20270, "train_speed(iter/s)": 0.202186 }, { "acc": 0.79480009, "epoch": 0.4731477566018752, "grad_norm": 5.21875, "learning_rate": 8.922063135923815e-06, "loss": 0.72272043, "memory(GiB)": 135.77, "step": 20280, "train_speed(iter/s)": 0.202237 }, { "acc": 0.79387074, "epoch": 0.4733810641741641, "grad_norm": 4.9375, "learning_rate": 8.920891154672668e-06, "loss": 0.75309858, "memory(GiB)": 135.77, "step": 20290, "train_speed(iter/s)": 0.202289 }, { "acc": 0.7726409, "epoch": 0.473614371746453, "grad_norm": 8.4375, "learning_rate": 8.91971861371545e-06, "loss": 0.83473721, "memory(GiB)": 135.77, "step": 20300, "train_speed(iter/s)": 0.202341 }, { "acc": 0.77393756, "epoch": 0.4738476793187419, "grad_norm": 3.65625, "learning_rate": 8.918545513219535e-06, "loss": 0.79616671, "memory(GiB)": 135.77, "step": 20310, "train_speed(iter/s)": 0.202391 }, { "acc": 0.76672659, "epoch": 0.4740809868910308, "grad_norm": 5.90625, "learning_rate": 8.917371853352388e-06, "loss": 0.85512495, "memory(GiB)": 135.77, "step": 20320, "train_speed(iter/s)": 0.202442 }, { "acc": 0.765168, "epoch": 0.4743142944633197, "grad_norm": 8.625, "learning_rate": 8.916197634281547e-06, "loss": 0.82549314, "memory(GiB)": 135.77, "step": 20330, "train_speed(iter/s)": 0.202492 }, { "acc": 0.77795401, "epoch": 0.4745476020356086, "grad_norm": 5.28125, "learning_rate": 8.91502285617463e-06, "loss": 0.79356041, "memory(GiB)": 135.77, "step": 20340, "train_speed(iter/s)": 0.202544 }, { "acc": 0.75690231, "epoch": 0.4747809096078975, "grad_norm": 5.875, "learning_rate": 8.913847519199341e-06, "loss": 0.89064808, "memory(GiB)": 135.77, "step": 20350, "train_speed(iter/s)": 0.202595 }, { "acc": 0.76705751, "epoch": 0.4750142171801863, "grad_norm": 6.65625, "learning_rate": 8.912671623523452e-06, "loss": 0.84812698, "memory(GiB)": 135.77, "step": 20360, "train_speed(iter/s)": 0.202641 }, { "acc": 0.78243647, "epoch": 0.4752475247524752, "grad_norm": 6.40625, "learning_rate": 8.911495169314828e-06, "loss": 0.77283859, "memory(GiB)": 135.77, "step": 20370, "train_speed(iter/s)": 0.20269 }, { "acc": 0.78674965, "epoch": 0.4754808323247641, "grad_norm": 5.59375, "learning_rate": 8.910318156741401e-06, "loss": 0.76450891, "memory(GiB)": 135.77, "step": 20380, "train_speed(iter/s)": 0.20274 }, { "acc": 0.7955699, "epoch": 0.475714139897053, "grad_norm": 4.5, "learning_rate": 8.909140585971198e-06, "loss": 0.73192253, "memory(GiB)": 135.77, "step": 20390, "train_speed(iter/s)": 0.202788 }, { "acc": 0.78440504, "epoch": 0.4759474474693419, "grad_norm": 5.28125, "learning_rate": 8.90796245717231e-06, "loss": 0.77477818, "memory(GiB)": 135.77, "step": 20400, "train_speed(iter/s)": 0.202837 }, { "acc": 0.76490378, "epoch": 0.4761807550416308, "grad_norm": 8.9375, "learning_rate": 8.906783770512915e-06, "loss": 0.86861458, "memory(GiB)": 135.77, "step": 20410, "train_speed(iter/s)": 0.202886 }, { "acc": 0.76525488, "epoch": 0.4764140626139197, "grad_norm": 5.4375, "learning_rate": 8.905604526161274e-06, "loss": 0.82954245, "memory(GiB)": 135.77, "step": 20420, "train_speed(iter/s)": 0.20294 }, { "acc": 0.7579958, "epoch": 0.4766473701862086, "grad_norm": 6.25, "learning_rate": 8.904424724285721e-06, "loss": 0.87198391, "memory(GiB)": 135.77, "step": 20430, "train_speed(iter/s)": 0.202991 }, { "acc": 0.76998606, "epoch": 0.4768806777584975, "grad_norm": 6.78125, "learning_rate": 8.903244365054671e-06, "loss": 0.81940556, "memory(GiB)": 135.77, "step": 20440, "train_speed(iter/s)": 0.20304 }, { "acc": 0.76503363, "epoch": 0.4771139853307864, "grad_norm": 5.8125, "learning_rate": 8.902063448636624e-06, "loss": 0.86698151, "memory(GiB)": 135.77, "step": 20450, "train_speed(iter/s)": 0.203093 }, { "acc": 0.77026491, "epoch": 0.47734729290307526, "grad_norm": 5.71875, "learning_rate": 8.900881975200151e-06, "loss": 0.82462835, "memory(GiB)": 135.77, "step": 20460, "train_speed(iter/s)": 0.203142 }, { "acc": 0.76255193, "epoch": 0.47758060047536416, "grad_norm": 4.3125, "learning_rate": 8.89969994491391e-06, "loss": 0.84601803, "memory(GiB)": 135.77, "step": 20470, "train_speed(iter/s)": 0.203189 }, { "acc": 0.75619602, "epoch": 0.47781390804765306, "grad_norm": 6.28125, "learning_rate": 8.898517357946636e-06, "loss": 0.88812332, "memory(GiB)": 135.77, "step": 20480, "train_speed(iter/s)": 0.203241 }, { "acc": 0.77179279, "epoch": 0.47804721561994196, "grad_norm": 6.3125, "learning_rate": 8.897334214467141e-06, "loss": 0.82135019, "memory(GiB)": 135.77, "step": 20490, "train_speed(iter/s)": 0.203288 }, { "acc": 0.77499766, "epoch": 0.47828052319223086, "grad_norm": 5.53125, "learning_rate": 8.89615051464432e-06, "loss": 0.79335108, "memory(GiB)": 135.77, "step": 20500, "train_speed(iter/s)": 0.203339 }, { "epoch": 0.47828052319223086, "eval_acc": 0.7387652014910709, "eval_loss": 0.827155590057373, "eval_runtime": 1271.1109, "eval_samples_per_second": 28.315, "eval_steps_per_second": 14.158, "step": 20500 }, { "acc": 0.76934633, "epoch": 0.47851383076451975, "grad_norm": 5.34375, "learning_rate": 8.894966258647144e-06, "loss": 0.82157841, "memory(GiB)": 135.77, "step": 20510, "train_speed(iter/s)": 0.200826 }, { "acc": 0.76049023, "epoch": 0.47874713833680865, "grad_norm": 6.0625, "learning_rate": 8.893781446644667e-06, "loss": 0.87835636, "memory(GiB)": 135.77, "step": 20520, "train_speed(iter/s)": 0.200874 }, { "acc": 0.77881689, "epoch": 0.47898044590909755, "grad_norm": 4.375, "learning_rate": 8.892596078806017e-06, "loss": 0.81451454, "memory(GiB)": 135.77, "step": 20530, "train_speed(iter/s)": 0.200925 }, { "acc": 0.77435408, "epoch": 0.47921375348138645, "grad_norm": 4.8125, "learning_rate": 8.89141015530041e-06, "loss": 0.8198391, "memory(GiB)": 135.77, "step": 20540, "train_speed(iter/s)": 0.200977 }, { "acc": 0.77629027, "epoch": 0.47944706105367535, "grad_norm": 4.34375, "learning_rate": 8.890223676297132e-06, "loss": 0.80017614, "memory(GiB)": 135.77, "step": 20550, "train_speed(iter/s)": 0.201028 }, { "acc": 0.78644643, "epoch": 0.4796803686259642, "grad_norm": 6.3125, "learning_rate": 8.889036641965557e-06, "loss": 0.77309122, "memory(GiB)": 135.77, "step": 20560, "train_speed(iter/s)": 0.201078 }, { "acc": 0.77234049, "epoch": 0.4799136761982531, "grad_norm": 10.125, "learning_rate": 8.887849052475128e-06, "loss": 0.82501678, "memory(GiB)": 135.77, "step": 20570, "train_speed(iter/s)": 0.201129 }, { "acc": 0.78163843, "epoch": 0.480146983770542, "grad_norm": 4.6875, "learning_rate": 8.886660907995379e-06, "loss": 0.80244398, "memory(GiB)": 135.77, "step": 20580, "train_speed(iter/s)": 0.20118 }, { "acc": 0.77460861, "epoch": 0.4803802913428309, "grad_norm": 6.0, "learning_rate": 8.885472208695911e-06, "loss": 0.82104893, "memory(GiB)": 135.77, "step": 20590, "train_speed(iter/s)": 0.201231 }, { "acc": 0.7811389, "epoch": 0.4806135989151198, "grad_norm": 6.78125, "learning_rate": 8.884282954746417e-06, "loss": 0.76911659, "memory(GiB)": 135.77, "step": 20600, "train_speed(iter/s)": 0.201284 }, { "acc": 0.77763205, "epoch": 0.4808469064874087, "grad_norm": 5.1875, "learning_rate": 8.88309314631666e-06, "loss": 0.78176918, "memory(GiB)": 135.77, "step": 20610, "train_speed(iter/s)": 0.201334 }, { "acc": 0.76166954, "epoch": 0.4810802140596976, "grad_norm": 3.90625, "learning_rate": 8.881902783576482e-06, "loss": 0.86578398, "memory(GiB)": 135.77, "step": 20620, "train_speed(iter/s)": 0.201383 }, { "acc": 0.7865201, "epoch": 0.4813135216319865, "grad_norm": 4.96875, "learning_rate": 8.88071186669581e-06, "loss": 0.77439818, "memory(GiB)": 135.77, "step": 20630, "train_speed(iter/s)": 0.201435 }, { "acc": 0.76671553, "epoch": 0.4815468292042754, "grad_norm": 5.03125, "learning_rate": 8.879520395844648e-06, "loss": 0.84458523, "memory(GiB)": 135.77, "step": 20640, "train_speed(iter/s)": 0.201488 }, { "acc": 0.75628233, "epoch": 0.4817801367765642, "grad_norm": 5.25, "learning_rate": 8.878328371193074e-06, "loss": 0.86866341, "memory(GiB)": 135.77, "step": 20650, "train_speed(iter/s)": 0.201541 }, { "acc": 0.76081595, "epoch": 0.4820134443488531, "grad_norm": 5.09375, "learning_rate": 8.877135792911253e-06, "loss": 0.86255798, "memory(GiB)": 135.77, "step": 20660, "train_speed(iter/s)": 0.201591 }, { "acc": 0.75712156, "epoch": 0.482246751921142, "grad_norm": 6.03125, "learning_rate": 8.875942661169423e-06, "loss": 0.89423008, "memory(GiB)": 135.77, "step": 20670, "train_speed(iter/s)": 0.201641 }, { "acc": 0.77293806, "epoch": 0.4824800594934309, "grad_norm": 5.5, "learning_rate": 8.874748976137905e-06, "loss": 0.79875817, "memory(GiB)": 135.77, "step": 20680, "train_speed(iter/s)": 0.201693 }, { "acc": 0.74118772, "epoch": 0.4827133670657198, "grad_norm": 5.5625, "learning_rate": 8.873554737987098e-06, "loss": 0.91008282, "memory(GiB)": 135.77, "step": 20690, "train_speed(iter/s)": 0.201745 }, { "acc": 0.7834497, "epoch": 0.4829466746380087, "grad_norm": 7.0625, "learning_rate": 8.872359946887474e-06, "loss": 0.76405153, "memory(GiB)": 135.77, "step": 20700, "train_speed(iter/s)": 0.201796 }, { "acc": 0.73779564, "epoch": 0.4831799822102976, "grad_norm": 5.78125, "learning_rate": 8.871164603009595e-06, "loss": 0.97299786, "memory(GiB)": 135.77, "step": 20710, "train_speed(iter/s)": 0.201848 }, { "acc": 0.76881113, "epoch": 0.4834132897825865, "grad_norm": 5.6875, "learning_rate": 8.869968706524092e-06, "loss": 0.92142582, "memory(GiB)": 135.77, "step": 20720, "train_speed(iter/s)": 0.201901 }, { "acc": 0.78435059, "epoch": 0.4836465973548754, "grad_norm": 4.375, "learning_rate": 8.868772257601682e-06, "loss": 0.79667149, "memory(GiB)": 135.77, "step": 20730, "train_speed(iter/s)": 0.201952 }, { "acc": 0.75337543, "epoch": 0.4838799049271643, "grad_norm": 7.15625, "learning_rate": 8.867575256413154e-06, "loss": 0.91704159, "memory(GiB)": 135.77, "step": 20740, "train_speed(iter/s)": 0.202003 }, { "acc": 0.76679363, "epoch": 0.48411321249945316, "grad_norm": 6.15625, "learning_rate": 8.866377703129382e-06, "loss": 0.82663794, "memory(GiB)": 135.77, "step": 20750, "train_speed(iter/s)": 0.202056 }, { "acc": 0.77863007, "epoch": 0.48434652007174206, "grad_norm": 4.8125, "learning_rate": 8.865179597921318e-06, "loss": 0.79373541, "memory(GiB)": 135.77, "step": 20760, "train_speed(iter/s)": 0.202104 }, { "acc": 0.75620103, "epoch": 0.48457982764403096, "grad_norm": 6.0625, "learning_rate": 8.863980940959989e-06, "loss": 0.92305183, "memory(GiB)": 135.77, "step": 20770, "train_speed(iter/s)": 0.20215 }, { "acc": 0.7790472, "epoch": 0.48481313521631986, "grad_norm": 4.78125, "learning_rate": 8.862781732416502e-06, "loss": 0.77640362, "memory(GiB)": 135.77, "step": 20780, "train_speed(iter/s)": 0.2022 }, { "acc": 0.77119627, "epoch": 0.48504644278860876, "grad_norm": 6.0625, "learning_rate": 8.861581972462045e-06, "loss": 0.82055054, "memory(GiB)": 135.77, "step": 20790, "train_speed(iter/s)": 0.202249 }, { "acc": 0.76551666, "epoch": 0.48527975036089765, "grad_norm": 5.5, "learning_rate": 8.860381661267882e-06, "loss": 0.82655621, "memory(GiB)": 135.77, "step": 20800, "train_speed(iter/s)": 0.202298 }, { "acc": 0.78621545, "epoch": 0.48551305793318655, "grad_norm": 8.9375, "learning_rate": 8.859180799005361e-06, "loss": 0.76219521, "memory(GiB)": 135.77, "step": 20810, "train_speed(iter/s)": 0.202346 }, { "acc": 0.77965999, "epoch": 0.48574636550547545, "grad_norm": 5.34375, "learning_rate": 8.857979385845901e-06, "loss": 0.781145, "memory(GiB)": 135.77, "step": 20820, "train_speed(iter/s)": 0.202396 }, { "acc": 0.76239419, "epoch": 0.48597967307776435, "grad_norm": 4.75, "learning_rate": 8.856777421961004e-06, "loss": 0.87815866, "memory(GiB)": 135.77, "step": 20830, "train_speed(iter/s)": 0.202448 }, { "acc": 0.77660542, "epoch": 0.48621298065005325, "grad_norm": 5.1875, "learning_rate": 8.855574907522251e-06, "loss": 0.79042578, "memory(GiB)": 135.77, "step": 20840, "train_speed(iter/s)": 0.202497 }, { "acc": 0.7720644, "epoch": 0.4864462882223421, "grad_norm": 6.375, "learning_rate": 8.854371842701299e-06, "loss": 0.82205925, "memory(GiB)": 135.77, "step": 20850, "train_speed(iter/s)": 0.202548 }, { "acc": 0.77594128, "epoch": 0.486679595794631, "grad_norm": 5.625, "learning_rate": 8.853168227669886e-06, "loss": 0.81534863, "memory(GiB)": 135.77, "step": 20860, "train_speed(iter/s)": 0.202599 }, { "acc": 0.77423177, "epoch": 0.4869129033669199, "grad_norm": 5.0, "learning_rate": 8.851964062599828e-06, "loss": 0.79051189, "memory(GiB)": 135.77, "step": 20870, "train_speed(iter/s)": 0.202649 }, { "acc": 0.77537422, "epoch": 0.4871462109392088, "grad_norm": 15.1875, "learning_rate": 8.850759347663021e-06, "loss": 0.80190439, "memory(GiB)": 135.77, "step": 20880, "train_speed(iter/s)": 0.202703 }, { "acc": 0.76119142, "epoch": 0.4873795185114977, "grad_norm": 6.40625, "learning_rate": 8.849554083031435e-06, "loss": 0.86152458, "memory(GiB)": 135.77, "step": 20890, "train_speed(iter/s)": 0.202756 }, { "acc": 0.75670996, "epoch": 0.4876128260837866, "grad_norm": 5.1875, "learning_rate": 8.84834826887712e-06, "loss": 0.87847652, "memory(GiB)": 135.77, "step": 20900, "train_speed(iter/s)": 0.20281 }, { "acc": 0.76710215, "epoch": 0.4878461336560755, "grad_norm": 6.84375, "learning_rate": 8.84714190537221e-06, "loss": 0.84475441, "memory(GiB)": 135.77, "step": 20910, "train_speed(iter/s)": 0.20286 }, { "acc": 0.77066274, "epoch": 0.4880794412283644, "grad_norm": 5.6875, "learning_rate": 8.84593499268891e-06, "loss": 0.82889204, "memory(GiB)": 135.77, "step": 20920, "train_speed(iter/s)": 0.202908 }, { "acc": 0.76561365, "epoch": 0.4883127488006533, "grad_norm": 4.75, "learning_rate": 8.844727530999506e-06, "loss": 0.8475029, "memory(GiB)": 135.77, "step": 20930, "train_speed(iter/s)": 0.202954 }, { "acc": 0.77417688, "epoch": 0.48854605637294213, "grad_norm": 4.59375, "learning_rate": 8.843519520476365e-06, "loss": 0.82181873, "memory(GiB)": 135.77, "step": 20940, "train_speed(iter/s)": 0.203003 }, { "acc": 0.78241935, "epoch": 0.488779363945231, "grad_norm": 5.1875, "learning_rate": 8.842310961291926e-06, "loss": 0.80962143, "memory(GiB)": 135.77, "step": 20950, "train_speed(iter/s)": 0.203051 }, { "acc": 0.79182229, "epoch": 0.4890126715175199, "grad_norm": 7.59375, "learning_rate": 8.841101853618717e-06, "loss": 0.74582729, "memory(GiB)": 135.77, "step": 20960, "train_speed(iter/s)": 0.203103 }, { "acc": 0.78107166, "epoch": 0.4892459790898088, "grad_norm": 4.21875, "learning_rate": 8.839892197629334e-06, "loss": 0.78661366, "memory(GiB)": 135.77, "step": 20970, "train_speed(iter/s)": 0.203151 }, { "acc": 0.77803307, "epoch": 0.4894792866620977, "grad_norm": 6.875, "learning_rate": 8.838681993496454e-06, "loss": 0.79262047, "memory(GiB)": 135.77, "step": 20980, "train_speed(iter/s)": 0.203205 }, { "acc": 0.74206686, "epoch": 0.4897125942343866, "grad_norm": 7.4375, "learning_rate": 8.837471241392835e-06, "loss": 0.94257679, "memory(GiB)": 135.77, "step": 20990, "train_speed(iter/s)": 0.203256 }, { "acc": 0.76296959, "epoch": 0.4899459018066755, "grad_norm": 6.03125, "learning_rate": 8.83625994149131e-06, "loss": 0.87966471, "memory(GiB)": 135.77, "step": 21000, "train_speed(iter/s)": 0.203307 }, { "epoch": 0.4899459018066755, "eval_acc": 0.7386233137012519, "eval_loss": 0.8264919519424438, "eval_runtime": 1269.5678, "eval_samples_per_second": 28.349, "eval_steps_per_second": 14.175, "step": 21000 }, { "acc": 0.76228848, "epoch": 0.4901792093789644, "grad_norm": 5.1875, "learning_rate": 8.835048093964796e-06, "loss": 0.86440468, "memory(GiB)": 135.77, "step": 21010, "train_speed(iter/s)": 0.200857 }, { "acc": 0.7821867, "epoch": 0.4904125169512533, "grad_norm": 4.21875, "learning_rate": 8.833835698986276e-06, "loss": 0.78023453, "memory(GiB)": 135.77, "step": 21020, "train_speed(iter/s)": 0.200905 }, { "acc": 0.77446461, "epoch": 0.4906458245235422, "grad_norm": 5.3125, "learning_rate": 8.832622756728828e-06, "loss": 0.81071396, "memory(GiB)": 135.77, "step": 21030, "train_speed(iter/s)": 0.200954 }, { "acc": 0.75736957, "epoch": 0.49087913209583106, "grad_norm": 5.9375, "learning_rate": 8.831409267365594e-06, "loss": 0.87871695, "memory(GiB)": 135.77, "step": 21040, "train_speed(iter/s)": 0.201004 }, { "acc": 0.7824193, "epoch": 0.49111243966811996, "grad_norm": 5.21875, "learning_rate": 8.830195231069799e-06, "loss": 0.77914681, "memory(GiB)": 135.77, "step": 21050, "train_speed(iter/s)": 0.201054 }, { "acc": 0.77138987, "epoch": 0.49134574724040886, "grad_norm": 7.03125, "learning_rate": 8.828980648014747e-06, "loss": 0.82984867, "memory(GiB)": 135.77, "step": 21060, "train_speed(iter/s)": 0.201107 }, { "acc": 0.76764355, "epoch": 0.49157905481269776, "grad_norm": 6.15625, "learning_rate": 8.82776551837382e-06, "loss": 0.83743019, "memory(GiB)": 135.77, "step": 21070, "train_speed(iter/s)": 0.201157 }, { "acc": 0.7618494, "epoch": 0.49181236238498666, "grad_norm": 5.40625, "learning_rate": 8.826549842320478e-06, "loss": 0.87587261, "memory(GiB)": 135.77, "step": 21080, "train_speed(iter/s)": 0.201209 }, { "acc": 0.77934647, "epoch": 0.49204566995727556, "grad_norm": 4.4375, "learning_rate": 8.825333620028257e-06, "loss": 0.80535469, "memory(GiB)": 135.77, "step": 21090, "train_speed(iter/s)": 0.201258 }, { "acc": 0.76986637, "epoch": 0.49227897752956445, "grad_norm": 4.375, "learning_rate": 8.824116851670772e-06, "loss": 0.83490791, "memory(GiB)": 135.77, "step": 21100, "train_speed(iter/s)": 0.201309 }, { "acc": 0.7731987, "epoch": 0.49251228510185335, "grad_norm": 6.5625, "learning_rate": 8.822899537421721e-06, "loss": 0.83892517, "memory(GiB)": 135.77, "step": 21110, "train_speed(iter/s)": 0.201358 }, { "acc": 0.76397038, "epoch": 0.49274559267414225, "grad_norm": 6.46875, "learning_rate": 8.821681677454868e-06, "loss": 0.84116879, "memory(GiB)": 135.77, "step": 21120, "train_speed(iter/s)": 0.201407 }, { "acc": 0.76126223, "epoch": 0.49297890024643115, "grad_norm": 4.8125, "learning_rate": 8.820463271944066e-06, "loss": 0.85854349, "memory(GiB)": 135.77, "step": 21130, "train_speed(iter/s)": 0.201458 }, { "acc": 0.77129207, "epoch": 0.49321220781872, "grad_norm": 8.4375, "learning_rate": 8.819244321063243e-06, "loss": 0.82659998, "memory(GiB)": 135.77, "step": 21140, "train_speed(iter/s)": 0.201505 }, { "acc": 0.76816425, "epoch": 0.4934455153910089, "grad_norm": 7.4375, "learning_rate": 8.818024824986404e-06, "loss": 0.84741154, "memory(GiB)": 135.77, "step": 21150, "train_speed(iter/s)": 0.201552 }, { "acc": 0.76245174, "epoch": 0.4936788229632978, "grad_norm": 8.6875, "learning_rate": 8.816804783887628e-06, "loss": 0.86243572, "memory(GiB)": 135.77, "step": 21160, "train_speed(iter/s)": 0.2016 }, { "acc": 0.78625603, "epoch": 0.4939121305355867, "grad_norm": 6.09375, "learning_rate": 8.815584197941078e-06, "loss": 0.76783695, "memory(GiB)": 135.77, "step": 21170, "train_speed(iter/s)": 0.201648 }, { "acc": 0.79220161, "epoch": 0.4941454381078756, "grad_norm": 6.96875, "learning_rate": 8.814363067320995e-06, "loss": 0.75065899, "memory(GiB)": 135.77, "step": 21180, "train_speed(iter/s)": 0.201696 }, { "acc": 0.78739977, "epoch": 0.4943787456801645, "grad_norm": 5.5, "learning_rate": 8.81314139220169e-06, "loss": 0.77436671, "memory(GiB)": 135.77, "step": 21190, "train_speed(iter/s)": 0.201746 }, { "acc": 0.75706043, "epoch": 0.4946120532524534, "grad_norm": 4.375, "learning_rate": 8.811919172757558e-06, "loss": 0.86744957, "memory(GiB)": 135.77, "step": 21200, "train_speed(iter/s)": 0.201796 }, { "acc": 0.77850995, "epoch": 0.4948453608247423, "grad_norm": 5.03125, "learning_rate": 8.810696409163073e-06, "loss": 0.81761723, "memory(GiB)": 135.77, "step": 21210, "train_speed(iter/s)": 0.201845 }, { "acc": 0.79036131, "epoch": 0.4950786683970312, "grad_norm": 4.65625, "learning_rate": 8.809473101592783e-06, "loss": 0.77943802, "memory(GiB)": 135.77, "step": 21220, "train_speed(iter/s)": 0.201896 }, { "acc": 0.77873907, "epoch": 0.49531197596932003, "grad_norm": 5.75, "learning_rate": 8.808249250221312e-06, "loss": 0.79133549, "memory(GiB)": 135.77, "step": 21230, "train_speed(iter/s)": 0.201945 }, { "acc": 0.77417302, "epoch": 0.4955452835416089, "grad_norm": 5.84375, "learning_rate": 8.807024855223369e-06, "loss": 0.81216822, "memory(GiB)": 135.77, "step": 21240, "train_speed(iter/s)": 0.201996 }, { "acc": 0.76840887, "epoch": 0.4957785911138978, "grad_norm": 7.0625, "learning_rate": 8.805799916773734e-06, "loss": 0.82953939, "memory(GiB)": 135.77, "step": 21250, "train_speed(iter/s)": 0.202046 }, { "acc": 0.74933271, "epoch": 0.4960118986861867, "grad_norm": 6.4375, "learning_rate": 8.804574435047265e-06, "loss": 0.90532026, "memory(GiB)": 135.77, "step": 21260, "train_speed(iter/s)": 0.202093 }, { "acc": 0.77972379, "epoch": 0.4962452062584756, "grad_norm": 6.4375, "learning_rate": 8.803348410218902e-06, "loss": 0.80596933, "memory(GiB)": 135.77, "step": 21270, "train_speed(iter/s)": 0.202143 }, { "acc": 0.77269158, "epoch": 0.4964785138307645, "grad_norm": 4.53125, "learning_rate": 8.802121842463658e-06, "loss": 0.81263514, "memory(GiB)": 135.77, "step": 21280, "train_speed(iter/s)": 0.202195 }, { "acc": 0.77480335, "epoch": 0.4967118214030534, "grad_norm": 6.21875, "learning_rate": 8.800894731956624e-06, "loss": 0.81107597, "memory(GiB)": 135.77, "step": 21290, "train_speed(iter/s)": 0.202243 }, { "acc": 0.75720205, "epoch": 0.4969451289753423, "grad_norm": 8.5625, "learning_rate": 8.799667078872973e-06, "loss": 0.88299694, "memory(GiB)": 135.77, "step": 21300, "train_speed(iter/s)": 0.202292 }, { "acc": 0.75752621, "epoch": 0.4971784365476312, "grad_norm": 6.65625, "learning_rate": 8.79843888338795e-06, "loss": 0.90188789, "memory(GiB)": 135.77, "step": 21310, "train_speed(iter/s)": 0.202342 }, { "acc": 0.77910862, "epoch": 0.4974117441199201, "grad_norm": 4.84375, "learning_rate": 8.797210145676879e-06, "loss": 0.78877602, "memory(GiB)": 135.77, "step": 21320, "train_speed(iter/s)": 0.202388 }, { "acc": 0.78843384, "epoch": 0.49764505169220896, "grad_norm": 5.15625, "learning_rate": 8.795980865915164e-06, "loss": 0.75855894, "memory(GiB)": 135.77, "step": 21330, "train_speed(iter/s)": 0.202436 }, { "acc": 0.78700695, "epoch": 0.49787835926449786, "grad_norm": 6.125, "learning_rate": 8.794751044278282e-06, "loss": 0.76552219, "memory(GiB)": 135.77, "step": 21340, "train_speed(iter/s)": 0.202487 }, { "acc": 0.77964926, "epoch": 0.49811166683678676, "grad_norm": 5.53125, "learning_rate": 8.793520680941792e-06, "loss": 0.78910999, "memory(GiB)": 135.77, "step": 21350, "train_speed(iter/s)": 0.202533 }, { "acc": 0.76680984, "epoch": 0.49834497440907566, "grad_norm": 6.96875, "learning_rate": 8.792289776081326e-06, "loss": 0.84402256, "memory(GiB)": 135.77, "step": 21360, "train_speed(iter/s)": 0.20258 }, { "acc": 0.7667099, "epoch": 0.49857828198136456, "grad_norm": 4.25, "learning_rate": 8.791058329872595e-06, "loss": 0.85665445, "memory(GiB)": 135.77, "step": 21370, "train_speed(iter/s)": 0.20263 }, { "acc": 0.77208252, "epoch": 0.49881158955365346, "grad_norm": 4.71875, "learning_rate": 8.78982634249139e-06, "loss": 0.82627926, "memory(GiB)": 135.77, "step": 21380, "train_speed(iter/s)": 0.202672 }, { "acc": 0.76971836, "epoch": 0.49904489712594235, "grad_norm": 5.03125, "learning_rate": 8.788593814113576e-06, "loss": 0.82828751, "memory(GiB)": 135.77, "step": 21390, "train_speed(iter/s)": 0.20272 }, { "acc": 0.77777624, "epoch": 0.49927820469823125, "grad_norm": 7.25, "learning_rate": 8.787360744915096e-06, "loss": 0.78845906, "memory(GiB)": 135.77, "step": 21400, "train_speed(iter/s)": 0.202769 }, { "acc": 0.76814055, "epoch": 0.49951151227052015, "grad_norm": 4.9375, "learning_rate": 8.786127135071968e-06, "loss": 0.82526455, "memory(GiB)": 135.77, "step": 21410, "train_speed(iter/s)": 0.202816 }, { "acc": 0.76935434, "epoch": 0.499744819842809, "grad_norm": 4.84375, "learning_rate": 8.784892984760292e-06, "loss": 0.82196589, "memory(GiB)": 135.77, "step": 21420, "train_speed(iter/s)": 0.20286 }, { "acc": 0.76840973, "epoch": 0.4999781274150979, "grad_norm": 4.65625, "learning_rate": 8.783658294156244e-06, "loss": 0.84509468, "memory(GiB)": 135.77, "step": 21430, "train_speed(iter/s)": 0.202908 }, { "acc": 0.78127012, "epoch": 0.5002114349873868, "grad_norm": 5.0, "learning_rate": 8.782423063436072e-06, "loss": 0.79705486, "memory(GiB)": 135.77, "step": 21440, "train_speed(iter/s)": 0.202956 }, { "acc": 0.79307652, "epoch": 0.5004447425596757, "grad_norm": 6.28125, "learning_rate": 8.781187292776106e-06, "loss": 0.75553842, "memory(GiB)": 135.77, "step": 21450, "train_speed(iter/s)": 0.203003 }, { "acc": 0.77026339, "epoch": 0.5006780501319646, "grad_norm": 5.28125, "learning_rate": 8.779950982352751e-06, "loss": 0.82962399, "memory(GiB)": 135.77, "step": 21460, "train_speed(iter/s)": 0.203051 }, { "acc": 0.78340578, "epoch": 0.5009113577042534, "grad_norm": 4.40625, "learning_rate": 8.778714132342494e-06, "loss": 0.76500807, "memory(GiB)": 135.77, "step": 21470, "train_speed(iter/s)": 0.203099 }, { "acc": 0.76358261, "epoch": 0.5011446652765423, "grad_norm": 5.3125, "learning_rate": 8.777476742921893e-06, "loss": 0.86745148, "memory(GiB)": 135.77, "step": 21480, "train_speed(iter/s)": 0.203144 }, { "acc": 0.76443934, "epoch": 0.5013779728488312, "grad_norm": 6.3125, "learning_rate": 8.776238814267581e-06, "loss": 0.85055971, "memory(GiB)": 135.77, "step": 21490, "train_speed(iter/s)": 0.203195 }, { "acc": 0.7856565, "epoch": 0.5016112804211201, "grad_norm": 4.84375, "learning_rate": 8.775000346556278e-06, "loss": 0.76976533, "memory(GiB)": 135.77, "step": 21500, "train_speed(iter/s)": 0.203245 }, { "epoch": 0.5016112804211201, "eval_acc": 0.7388905944249629, "eval_loss": 0.8257662653923035, "eval_runtime": 1270.2669, "eval_samples_per_second": 28.333, "eval_steps_per_second": 14.167, "step": 21500 }, { "acc": 0.77753158, "epoch": 0.501844587993409, "grad_norm": 5.96875, "learning_rate": 8.773761339964773e-06, "loss": 0.79236035, "memory(GiB)": 135.77, "step": 21510, "train_speed(iter/s)": 0.20085 }, { "acc": 0.77459497, "epoch": 0.5020778955656979, "grad_norm": 6.34375, "learning_rate": 8.77252179466993e-06, "loss": 0.80510149, "memory(GiB)": 135.77, "step": 21520, "train_speed(iter/s)": 0.200901 }, { "acc": 0.79065723, "epoch": 0.5023112031379868, "grad_norm": 5.875, "learning_rate": 8.771281710848697e-06, "loss": 0.75441141, "memory(GiB)": 135.77, "step": 21530, "train_speed(iter/s)": 0.200951 }, { "acc": 0.78152294, "epoch": 0.5025445107102757, "grad_norm": 5.6875, "learning_rate": 8.770041088678098e-06, "loss": 0.78967323, "memory(GiB)": 135.77, "step": 21540, "train_speed(iter/s)": 0.200999 }, { "acc": 0.76808047, "epoch": 0.5027778182825646, "grad_norm": 4.875, "learning_rate": 8.768799928335227e-06, "loss": 0.83531723, "memory(GiB)": 135.77, "step": 21550, "train_speed(iter/s)": 0.201047 }, { "acc": 0.78148546, "epoch": 0.5030111258548535, "grad_norm": 6.8125, "learning_rate": 8.76755822999726e-06, "loss": 0.77231493, "memory(GiB)": 135.77, "step": 21560, "train_speed(iter/s)": 0.201097 }, { "acc": 0.78652892, "epoch": 0.5032444334271424, "grad_norm": 6.5, "learning_rate": 8.766315993841452e-06, "loss": 0.76357327, "memory(GiB)": 135.77, "step": 21570, "train_speed(iter/s)": 0.201146 }, { "acc": 0.74624329, "epoch": 0.5034777409994313, "grad_norm": 4.90625, "learning_rate": 8.76507322004513e-06, "loss": 0.91759214, "memory(GiB)": 135.77, "step": 21580, "train_speed(iter/s)": 0.201192 }, { "acc": 0.77871332, "epoch": 0.5037110485717202, "grad_norm": 5.46875, "learning_rate": 8.7638299087857e-06, "loss": 0.78241935, "memory(GiB)": 135.77, "step": 21590, "train_speed(iter/s)": 0.201238 }, { "acc": 0.7683857, "epoch": 0.5039443561440091, "grad_norm": 6.59375, "learning_rate": 8.762586060240642e-06, "loss": 0.83219185, "memory(GiB)": 135.77, "step": 21600, "train_speed(iter/s)": 0.201284 }, { "acc": 0.78917575, "epoch": 0.504177663716298, "grad_norm": 10.875, "learning_rate": 8.761341674587518e-06, "loss": 0.76239634, "memory(GiB)": 135.77, "step": 21610, "train_speed(iter/s)": 0.201331 }, { "acc": 0.78974929, "epoch": 0.5044109712885869, "grad_norm": 4.96875, "learning_rate": 8.760096752003962e-06, "loss": 0.77535329, "memory(GiB)": 135.77, "step": 21620, "train_speed(iter/s)": 0.201378 }, { "acc": 0.77866402, "epoch": 0.5046442788608758, "grad_norm": 4.9375, "learning_rate": 8.758851292667687e-06, "loss": 0.78650284, "memory(GiB)": 135.77, "step": 21630, "train_speed(iter/s)": 0.201426 }, { "acc": 0.77446556, "epoch": 0.5048775864331647, "grad_norm": 6.6875, "learning_rate": 8.757605296756483e-06, "loss": 0.80399456, "memory(GiB)": 135.77, "step": 21640, "train_speed(iter/s)": 0.201474 }, { "acc": 0.76340103, "epoch": 0.5051108940054536, "grad_norm": 5.0, "learning_rate": 8.756358764448214e-06, "loss": 0.85637417, "memory(GiB)": 135.77, "step": 21650, "train_speed(iter/s)": 0.201523 }, { "acc": 0.78347025, "epoch": 0.5053442015777424, "grad_norm": 4.875, "learning_rate": 8.755111695920823e-06, "loss": 0.79445, "memory(GiB)": 135.77, "step": 21660, "train_speed(iter/s)": 0.201571 }, { "acc": 0.77764778, "epoch": 0.5055775091500313, "grad_norm": 4.125, "learning_rate": 8.753864091352326e-06, "loss": 0.80275316, "memory(GiB)": 135.77, "step": 21670, "train_speed(iter/s)": 0.201619 }, { "acc": 0.78915906, "epoch": 0.5058108167223202, "grad_norm": 3.84375, "learning_rate": 8.752615950920824e-06, "loss": 0.75332527, "memory(GiB)": 135.77, "step": 21680, "train_speed(iter/s)": 0.201664 }, { "acc": 0.77775726, "epoch": 0.5060441242946091, "grad_norm": 10.625, "learning_rate": 8.751367274804483e-06, "loss": 0.79482059, "memory(GiB)": 135.77, "step": 21690, "train_speed(iter/s)": 0.201709 }, { "acc": 0.77287965, "epoch": 0.506277431866898, "grad_norm": 4.53125, "learning_rate": 8.750118063181553e-06, "loss": 0.8031868, "memory(GiB)": 135.77, "step": 21700, "train_speed(iter/s)": 0.201762 }, { "acc": 0.77395134, "epoch": 0.5065107394391869, "grad_norm": 4.8125, "learning_rate": 8.74886831623036e-06, "loss": 0.79606371, "memory(GiB)": 135.77, "step": 21710, "train_speed(iter/s)": 0.20181 }, { "acc": 0.75434523, "epoch": 0.5067440470114758, "grad_norm": 5.375, "learning_rate": 8.747618034129304e-06, "loss": 0.90953197, "memory(GiB)": 135.77, "step": 21720, "train_speed(iter/s)": 0.20186 }, { "acc": 0.78446827, "epoch": 0.5069773545837647, "grad_norm": 6.5625, "learning_rate": 8.746367217056861e-06, "loss": 0.79035029, "memory(GiB)": 135.77, "step": 21730, "train_speed(iter/s)": 0.201906 }, { "acc": 0.76340156, "epoch": 0.5072106621560536, "grad_norm": 6.40625, "learning_rate": 8.745115865191587e-06, "loss": 0.8527956, "memory(GiB)": 135.77, "step": 21740, "train_speed(iter/s)": 0.201955 }, { "acc": 0.77388973, "epoch": 0.5074439697283425, "grad_norm": 5.46875, "learning_rate": 8.743863978712111e-06, "loss": 0.81416397, "memory(GiB)": 135.77, "step": 21750, "train_speed(iter/s)": 0.202002 }, { "acc": 0.77853451, "epoch": 0.5076772773006314, "grad_norm": 5.0, "learning_rate": 8.74261155779714e-06, "loss": 0.78336382, "memory(GiB)": 135.77, "step": 21760, "train_speed(iter/s)": 0.202047 }, { "acc": 0.75856724, "epoch": 0.5079105848729203, "grad_norm": 7.21875, "learning_rate": 8.741358602625455e-06, "loss": 0.87376547, "memory(GiB)": 135.77, "step": 21770, "train_speed(iter/s)": 0.202089 }, { "acc": 0.78273201, "epoch": 0.5081438924452092, "grad_norm": 5.65625, "learning_rate": 8.740105113375919e-06, "loss": 0.76739621, "memory(GiB)": 135.77, "step": 21780, "train_speed(iter/s)": 0.202136 }, { "acc": 0.77240195, "epoch": 0.5083772000174981, "grad_norm": 4.21875, "learning_rate": 8.738851090227462e-06, "loss": 0.80845757, "memory(GiB)": 135.77, "step": 21790, "train_speed(iter/s)": 0.202185 }, { "acc": 0.78104792, "epoch": 0.508610507589787, "grad_norm": 5.625, "learning_rate": 8.737596533359101e-06, "loss": 0.78938627, "memory(GiB)": 135.77, "step": 21800, "train_speed(iter/s)": 0.202235 }, { "acc": 0.76249051, "epoch": 0.5088438151620759, "grad_norm": 5.96875, "learning_rate": 8.736341442949919e-06, "loss": 0.8694706, "memory(GiB)": 135.77, "step": 21810, "train_speed(iter/s)": 0.202282 }, { "acc": 0.77197785, "epoch": 0.5090771227343648, "grad_norm": 6.6875, "learning_rate": 8.73508581917908e-06, "loss": 0.80897655, "memory(GiB)": 135.77, "step": 21820, "train_speed(iter/s)": 0.202328 }, { "acc": 0.75979204, "epoch": 0.5093104303066537, "grad_norm": 7.5625, "learning_rate": 8.733829662225825e-06, "loss": 0.86723404, "memory(GiB)": 135.77, "step": 21830, "train_speed(iter/s)": 0.202377 }, { "acc": 0.77098808, "epoch": 0.5095437378789426, "grad_norm": 7.1875, "learning_rate": 8.732572972269472e-06, "loss": 0.84290762, "memory(GiB)": 135.77, "step": 21840, "train_speed(iter/s)": 0.202422 }, { "acc": 0.78516512, "epoch": 0.5097770454512315, "grad_norm": 5.5, "learning_rate": 8.731315749489412e-06, "loss": 0.77529469, "memory(GiB)": 135.77, "step": 21850, "train_speed(iter/s)": 0.202471 }, { "acc": 0.77012315, "epoch": 0.5100103530235203, "grad_norm": 5.625, "learning_rate": 8.730057994065113e-06, "loss": 0.81857462, "memory(GiB)": 135.77, "step": 21860, "train_speed(iter/s)": 0.202521 }, { "acc": 0.76024265, "epoch": 0.5102436605958092, "grad_norm": 7.84375, "learning_rate": 8.728799706176117e-06, "loss": 0.89396629, "memory(GiB)": 135.77, "step": 21870, "train_speed(iter/s)": 0.20257 }, { "acc": 0.79580889, "epoch": 0.5104769681680981, "grad_norm": 4.96875, "learning_rate": 8.727540886002048e-06, "loss": 0.71652207, "memory(GiB)": 135.77, "step": 21880, "train_speed(iter/s)": 0.20262 }, { "acc": 0.7760314, "epoch": 0.510710275740387, "grad_norm": 4.96875, "learning_rate": 8.7262815337226e-06, "loss": 0.81335468, "memory(GiB)": 135.77, "step": 21890, "train_speed(iter/s)": 0.202666 }, { "acc": 0.7670917, "epoch": 0.5109435833126759, "grad_norm": 5.21875, "learning_rate": 8.725021649517545e-06, "loss": 0.82741623, "memory(GiB)": 135.77, "step": 21900, "train_speed(iter/s)": 0.202713 }, { "acc": 0.76540327, "epoch": 0.5111768908849648, "grad_norm": 7.03125, "learning_rate": 8.723761233566732e-06, "loss": 0.85327883, "memory(GiB)": 135.77, "step": 21910, "train_speed(iter/s)": 0.202761 }, { "acc": 0.76294174, "epoch": 0.5114101984572537, "grad_norm": 7.59375, "learning_rate": 8.722500286050084e-06, "loss": 0.8585103, "memory(GiB)": 135.77, "step": 21920, "train_speed(iter/s)": 0.202807 }, { "acc": 0.77434635, "epoch": 0.5116435060295426, "grad_norm": 6.40625, "learning_rate": 8.721238807147602e-06, "loss": 0.83060961, "memory(GiB)": 135.77, "step": 21930, "train_speed(iter/s)": 0.202851 }, { "acc": 0.76418648, "epoch": 0.5118768136018315, "grad_norm": 10.4375, "learning_rate": 8.71997679703936e-06, "loss": 0.84364891, "memory(GiB)": 135.77, "step": 21940, "train_speed(iter/s)": 0.202899 }, { "acc": 0.77589755, "epoch": 0.5121101211741204, "grad_norm": 6.53125, "learning_rate": 8.718714255905514e-06, "loss": 0.80891504, "memory(GiB)": 135.77, "step": 21950, "train_speed(iter/s)": 0.202942 }, { "acc": 0.78276296, "epoch": 0.5123434287464093, "grad_norm": 4.96875, "learning_rate": 8.717451183926286e-06, "loss": 0.78558817, "memory(GiB)": 135.77, "step": 21960, "train_speed(iter/s)": 0.20299 }, { "acc": 0.76813164, "epoch": 0.5125767363186982, "grad_norm": 5.0625, "learning_rate": 8.716187581281982e-06, "loss": 0.82764816, "memory(GiB)": 135.77, "step": 21970, "train_speed(iter/s)": 0.203035 }, { "acc": 0.76752729, "epoch": 0.512810043890987, "grad_norm": 10.5625, "learning_rate": 8.71492344815298e-06, "loss": 0.84671097, "memory(GiB)": 135.77, "step": 21980, "train_speed(iter/s)": 0.203084 }, { "acc": 0.7599195, "epoch": 0.513043351463276, "grad_norm": 5.5625, "learning_rate": 8.713658784719735e-06, "loss": 0.85972528, "memory(GiB)": 135.77, "step": 21990, "train_speed(iter/s)": 0.203127 }, { "acc": 0.77313404, "epoch": 0.5132766590355649, "grad_norm": 5.90625, "learning_rate": 8.712393591162779e-06, "loss": 0.82830524, "memory(GiB)": 135.77, "step": 22000, "train_speed(iter/s)": 0.203176 }, { "epoch": 0.5132766590355649, "eval_acc": 0.739051699522658, "eval_loss": 0.8252963423728943, "eval_runtime": 1269.798, "eval_samples_per_second": 28.344, "eval_steps_per_second": 14.172, "step": 22000 }, { "acc": 0.76468811, "epoch": 0.5135099666078538, "grad_norm": 6.0, "learning_rate": 8.711127867662715e-06, "loss": 0.85957546, "memory(GiB)": 135.77, "step": 22010, "train_speed(iter/s)": 0.200838 }, { "acc": 0.77129335, "epoch": 0.5137432741801427, "grad_norm": 4.25, "learning_rate": 8.709861614400223e-06, "loss": 0.83853226, "memory(GiB)": 135.77, "step": 22020, "train_speed(iter/s)": 0.200887 }, { "acc": 0.78872757, "epoch": 0.5139765817524315, "grad_norm": 4.9375, "learning_rate": 8.708594831556068e-06, "loss": 0.76557589, "memory(GiB)": 135.77, "step": 22030, "train_speed(iter/s)": 0.200934 }, { "acc": 0.76171837, "epoch": 0.5142098893247204, "grad_norm": 5.75, "learning_rate": 8.707327519311075e-06, "loss": 0.8551321, "memory(GiB)": 135.77, "step": 22040, "train_speed(iter/s)": 0.200982 }, { "acc": 0.76743393, "epoch": 0.5144431968970092, "grad_norm": 8.3125, "learning_rate": 8.706059677846157e-06, "loss": 0.85651779, "memory(GiB)": 135.77, "step": 22050, "train_speed(iter/s)": 0.201031 }, { "acc": 0.77318702, "epoch": 0.5146765044692981, "grad_norm": 4.8125, "learning_rate": 8.704791307342297e-06, "loss": 0.79379597, "memory(GiB)": 135.77, "step": 22060, "train_speed(iter/s)": 0.201077 }, { "acc": 0.7719686, "epoch": 0.514909812041587, "grad_norm": 5.8125, "learning_rate": 8.703522407980554e-06, "loss": 0.81712446, "memory(GiB)": 135.77, "step": 22070, "train_speed(iter/s)": 0.201124 }, { "acc": 0.77638397, "epoch": 0.5151431196138759, "grad_norm": 10.875, "learning_rate": 8.702252979942063e-06, "loss": 0.80130062, "memory(GiB)": 135.77, "step": 22080, "train_speed(iter/s)": 0.201174 }, { "acc": 0.77209868, "epoch": 0.5153764271861648, "grad_norm": 5.21875, "learning_rate": 8.700983023408034e-06, "loss": 0.8194252, "memory(GiB)": 135.77, "step": 22090, "train_speed(iter/s)": 0.201224 }, { "acc": 0.76339512, "epoch": 0.5156097347584537, "grad_norm": 6.71875, "learning_rate": 8.699712538559752e-06, "loss": 0.85726337, "memory(GiB)": 135.77, "step": 22100, "train_speed(iter/s)": 0.201271 }, { "acc": 0.7645709, "epoch": 0.5158430423307426, "grad_norm": 4.9375, "learning_rate": 8.698441525578582e-06, "loss": 0.85991373, "memory(GiB)": 135.77, "step": 22110, "train_speed(iter/s)": 0.201319 }, { "acc": 0.79457126, "epoch": 0.5160763499030315, "grad_norm": 5.125, "learning_rate": 8.697169984645959e-06, "loss": 0.72133851, "memory(GiB)": 135.77, "step": 22120, "train_speed(iter/s)": 0.201368 }, { "acc": 0.75647445, "epoch": 0.5163096574753204, "grad_norm": 4.0625, "learning_rate": 8.695897915943395e-06, "loss": 0.86364326, "memory(GiB)": 135.77, "step": 22130, "train_speed(iter/s)": 0.201417 }, { "acc": 0.76443992, "epoch": 0.5165429650476093, "grad_norm": 5.15625, "learning_rate": 8.694625319652477e-06, "loss": 0.85923405, "memory(GiB)": 135.77, "step": 22140, "train_speed(iter/s)": 0.201461 }, { "acc": 0.79320621, "epoch": 0.5167762726198982, "grad_norm": 5.46875, "learning_rate": 8.693352195954866e-06, "loss": 0.74455805, "memory(GiB)": 135.77, "step": 22150, "train_speed(iter/s)": 0.201504 }, { "acc": 0.75500603, "epoch": 0.5170095801921871, "grad_norm": 4.90625, "learning_rate": 8.692078545032304e-06, "loss": 0.89729595, "memory(GiB)": 135.77, "step": 22160, "train_speed(iter/s)": 0.201548 }, { "acc": 0.78628874, "epoch": 0.517242887764476, "grad_norm": 6.65625, "learning_rate": 8.6908043670666e-06, "loss": 0.77559862, "memory(GiB)": 135.77, "step": 22170, "train_speed(iter/s)": 0.201596 }, { "acc": 0.7568099, "epoch": 0.5174761953367649, "grad_norm": 6.21875, "learning_rate": 8.689529662239647e-06, "loss": 0.91406498, "memory(GiB)": 135.77, "step": 22180, "train_speed(iter/s)": 0.20164 }, { "acc": 0.76510105, "epoch": 0.5177095029090538, "grad_norm": 4.40625, "learning_rate": 8.688254430733405e-06, "loss": 0.85134945, "memory(GiB)": 135.77, "step": 22190, "train_speed(iter/s)": 0.201688 }, { "acc": 0.77507906, "epoch": 0.5179428104813427, "grad_norm": 5.5, "learning_rate": 8.686978672729916e-06, "loss": 0.8101469, "memory(GiB)": 135.77, "step": 22200, "train_speed(iter/s)": 0.201735 }, { "acc": 0.75599632, "epoch": 0.5181761180536316, "grad_norm": 4.78125, "learning_rate": 8.68570238841129e-06, "loss": 0.86438799, "memory(GiB)": 135.77, "step": 22210, "train_speed(iter/s)": 0.201783 }, { "acc": 0.76995659, "epoch": 0.5184094256259205, "grad_norm": 6.125, "learning_rate": 8.684425577959722e-06, "loss": 0.82093, "memory(GiB)": 135.77, "step": 22220, "train_speed(iter/s)": 0.201832 }, { "acc": 0.77510152, "epoch": 0.5186427331982094, "grad_norm": 4.3125, "learning_rate": 8.683148241557472e-06, "loss": 0.80338688, "memory(GiB)": 135.77, "step": 22230, "train_speed(iter/s)": 0.201878 }, { "acc": 0.76440411, "epoch": 0.5188760407704982, "grad_norm": 6.125, "learning_rate": 8.681870379386879e-06, "loss": 0.83043871, "memory(GiB)": 135.77, "step": 22240, "train_speed(iter/s)": 0.201929 }, { "acc": 0.76180897, "epoch": 0.5191093483427871, "grad_norm": 5.78125, "learning_rate": 8.68059199163036e-06, "loss": 0.86176853, "memory(GiB)": 135.77, "step": 22250, "train_speed(iter/s)": 0.201964 }, { "acc": 0.77140579, "epoch": 0.519342655915076, "grad_norm": 6.4375, "learning_rate": 8.679313078470403e-06, "loss": 0.80798788, "memory(GiB)": 135.77, "step": 22260, "train_speed(iter/s)": 0.202008 }, { "acc": 0.76855969, "epoch": 0.5195759634873649, "grad_norm": 4.875, "learning_rate": 8.678033640089574e-06, "loss": 0.83115768, "memory(GiB)": 135.77, "step": 22270, "train_speed(iter/s)": 0.202055 }, { "acc": 0.78249931, "epoch": 0.5198092710596538, "grad_norm": 4.65625, "learning_rate": 8.676753676670511e-06, "loss": 0.78415298, "memory(GiB)": 135.77, "step": 22280, "train_speed(iter/s)": 0.2021 }, { "acc": 0.78204756, "epoch": 0.5200425786319427, "grad_norm": 4.15625, "learning_rate": 8.67547318839593e-06, "loss": 0.78528633, "memory(GiB)": 135.77, "step": 22290, "train_speed(iter/s)": 0.202146 }, { "acc": 0.76204596, "epoch": 0.5202758862042316, "grad_norm": 6.75, "learning_rate": 8.674192175448617e-06, "loss": 0.8319747, "memory(GiB)": 135.77, "step": 22300, "train_speed(iter/s)": 0.202193 }, { "acc": 0.76219587, "epoch": 0.5205091937765205, "grad_norm": 6.53125, "learning_rate": 8.672910638011439e-06, "loss": 0.84944124, "memory(GiB)": 135.77, "step": 22310, "train_speed(iter/s)": 0.202241 }, { "acc": 0.76580586, "epoch": 0.5207425013488094, "grad_norm": 5.25, "learning_rate": 8.671628576267333e-06, "loss": 0.83932171, "memory(GiB)": 135.77, "step": 22320, "train_speed(iter/s)": 0.20229 }, { "acc": 0.76935239, "epoch": 0.5209758089210983, "grad_norm": 5.34375, "learning_rate": 8.670345990399317e-06, "loss": 0.82982864, "memory(GiB)": 135.77, "step": 22330, "train_speed(iter/s)": 0.202333 }, { "acc": 0.79416113, "epoch": 0.5212091164933872, "grad_norm": 7.59375, "learning_rate": 8.669062880590474e-06, "loss": 0.73947349, "memory(GiB)": 135.77, "step": 22340, "train_speed(iter/s)": 0.202381 }, { "acc": 0.77279081, "epoch": 0.5214424240656761, "grad_norm": 4.71875, "learning_rate": 8.667779247023974e-06, "loss": 0.81007175, "memory(GiB)": 135.77, "step": 22350, "train_speed(iter/s)": 0.202429 }, { "acc": 0.7789588, "epoch": 0.521675731637965, "grad_norm": 5.03125, "learning_rate": 8.666495089883049e-06, "loss": 0.76399951, "memory(GiB)": 135.77, "step": 22360, "train_speed(iter/s)": 0.202475 }, { "acc": 0.78549161, "epoch": 0.5219090392102539, "grad_norm": 4.65625, "learning_rate": 8.665210409351015e-06, "loss": 0.77918396, "memory(GiB)": 135.77, "step": 22370, "train_speed(iter/s)": 0.202519 }, { "acc": 0.76314993, "epoch": 0.5221423467825428, "grad_norm": 6.125, "learning_rate": 8.663925205611261e-06, "loss": 0.87036915, "memory(GiB)": 135.77, "step": 22380, "train_speed(iter/s)": 0.202566 }, { "acc": 0.79042592, "epoch": 0.5223756543548317, "grad_norm": 4.875, "learning_rate": 8.66263947884725e-06, "loss": 0.77454262, "memory(GiB)": 135.77, "step": 22390, "train_speed(iter/s)": 0.202605 }, { "acc": 0.76517191, "epoch": 0.5226089619271206, "grad_norm": 3.859375, "learning_rate": 8.661353229242514e-06, "loss": 0.84707232, "memory(GiB)": 135.77, "step": 22400, "train_speed(iter/s)": 0.202646 }, { "acc": 0.77440958, "epoch": 0.5228422694994095, "grad_norm": 5.46875, "learning_rate": 8.66006645698067e-06, "loss": 0.80511379, "memory(GiB)": 135.77, "step": 22410, "train_speed(iter/s)": 0.202689 }, { "acc": 0.77160006, "epoch": 0.5230755770716984, "grad_norm": 6.71875, "learning_rate": 8.658779162245404e-06, "loss": 0.82624683, "memory(GiB)": 135.77, "step": 22420, "train_speed(iter/s)": 0.202737 }, { "acc": 0.79068604, "epoch": 0.5233088846439872, "grad_norm": 6.1875, "learning_rate": 8.657491345220475e-06, "loss": 0.74397564, "memory(GiB)": 135.77, "step": 22430, "train_speed(iter/s)": 0.202783 }, { "acc": 0.79531755, "epoch": 0.5235421922162761, "grad_norm": 4.34375, "learning_rate": 8.656203006089716e-06, "loss": 0.72965755, "memory(GiB)": 135.77, "step": 22440, "train_speed(iter/s)": 0.202829 }, { "acc": 0.78984456, "epoch": 0.523775499788565, "grad_norm": 6.46875, "learning_rate": 8.654914145037044e-06, "loss": 0.74049387, "memory(GiB)": 135.77, "step": 22450, "train_speed(iter/s)": 0.202874 }, { "acc": 0.76976719, "epoch": 0.5240088073608539, "grad_norm": 4.875, "learning_rate": 8.653624762246437e-06, "loss": 0.82113333, "memory(GiB)": 135.77, "step": 22460, "train_speed(iter/s)": 0.202921 }, { "acc": 0.76820931, "epoch": 0.5242421149331428, "grad_norm": 4.90625, "learning_rate": 8.652334857901957e-06, "loss": 0.8498045, "memory(GiB)": 135.77, "step": 22470, "train_speed(iter/s)": 0.202959 }, { "acc": 0.75914822, "epoch": 0.5244754225054317, "grad_norm": 6.53125, "learning_rate": 8.651044432187736e-06, "loss": 0.895998, "memory(GiB)": 135.77, "step": 22480, "train_speed(iter/s)": 0.203006 }, { "acc": 0.77738414, "epoch": 0.5247087300777206, "grad_norm": 4.75, "learning_rate": 8.649753485287986e-06, "loss": 0.79222326, "memory(GiB)": 135.77, "step": 22490, "train_speed(iter/s)": 0.203052 }, { "acc": 0.78186607, "epoch": 0.5249420376500095, "grad_norm": 7.0625, "learning_rate": 8.648462017386982e-06, "loss": 0.80105238, "memory(GiB)": 135.77, "step": 22500, "train_speed(iter/s)": 0.203094 }, { "epoch": 0.5249420376500095, "eval_acc": 0.7395784139026973, "eval_loss": 0.8238633871078491, "eval_runtime": 1268.9883, "eval_samples_per_second": 28.362, "eval_steps_per_second": 14.181, "step": 22500 }, { "acc": 0.78160195, "epoch": 0.5251753452222984, "grad_norm": 4.875, "learning_rate": 8.64717002866909e-06, "loss": 0.77151155, "memory(GiB)": 135.77, "step": 22510, "train_speed(iter/s)": 0.200805 }, { "acc": 0.78638101, "epoch": 0.5254086527945873, "grad_norm": 4.71875, "learning_rate": 8.64587751931873e-06, "loss": 0.77090092, "memory(GiB)": 135.77, "step": 22520, "train_speed(iter/s)": 0.200853 }, { "acc": 0.78011241, "epoch": 0.5256419603668762, "grad_norm": 5.03125, "learning_rate": 8.644584489520418e-06, "loss": 0.78862772, "memory(GiB)": 135.77, "step": 22530, "train_speed(iter/s)": 0.200897 }, { "acc": 0.77731066, "epoch": 0.5258752679391651, "grad_norm": 4.8125, "learning_rate": 8.643290939458728e-06, "loss": 0.79162712, "memory(GiB)": 135.77, "step": 22540, "train_speed(iter/s)": 0.200945 }, { "acc": 0.77135563, "epoch": 0.526108575511454, "grad_norm": 4.84375, "learning_rate": 8.641996869318313e-06, "loss": 0.83071518, "memory(GiB)": 135.77, "step": 22550, "train_speed(iter/s)": 0.200987 }, { "acc": 0.78698025, "epoch": 0.5263418830837429, "grad_norm": 8.125, "learning_rate": 8.640702279283904e-06, "loss": 0.76378479, "memory(GiB)": 135.77, "step": 22560, "train_speed(iter/s)": 0.201035 }, { "acc": 0.75245285, "epoch": 0.5265751906560318, "grad_norm": 6.875, "learning_rate": 8.639407169540302e-06, "loss": 0.90592232, "memory(GiB)": 135.77, "step": 22570, "train_speed(iter/s)": 0.201079 }, { "acc": 0.76512051, "epoch": 0.5268084982283207, "grad_norm": 6.40625, "learning_rate": 8.638111540272384e-06, "loss": 0.84115753, "memory(GiB)": 135.77, "step": 22580, "train_speed(iter/s)": 0.201125 }, { "acc": 0.76233463, "epoch": 0.5270418058006096, "grad_norm": 5.09375, "learning_rate": 8.636815391665102e-06, "loss": 0.87490129, "memory(GiB)": 135.77, "step": 22590, "train_speed(iter/s)": 0.201173 }, { "acc": 0.77505226, "epoch": 0.5272751133728985, "grad_norm": 5.5625, "learning_rate": 8.635518723903478e-06, "loss": 0.82525368, "memory(GiB)": 135.77, "step": 22600, "train_speed(iter/s)": 0.201217 }, { "acc": 0.77524152, "epoch": 0.5275084209451874, "grad_norm": 5.375, "learning_rate": 8.634221537172612e-06, "loss": 0.81201534, "memory(GiB)": 135.77, "step": 22610, "train_speed(iter/s)": 0.201261 }, { "acc": 0.76617479, "epoch": 0.5277417285174762, "grad_norm": 6.625, "learning_rate": 8.632923831657678e-06, "loss": 0.85680828, "memory(GiB)": 135.77, "step": 22620, "train_speed(iter/s)": 0.201309 }, { "acc": 0.77629232, "epoch": 0.527975036089765, "grad_norm": 5.4375, "learning_rate": 8.631625607543921e-06, "loss": 0.7853322, "memory(GiB)": 135.77, "step": 22630, "train_speed(iter/s)": 0.201353 }, { "acc": 0.78849845, "epoch": 0.5282083436620539, "grad_norm": 4.1875, "learning_rate": 8.630326865016663e-06, "loss": 0.74947386, "memory(GiB)": 135.77, "step": 22640, "train_speed(iter/s)": 0.201397 }, { "acc": 0.76544561, "epoch": 0.5284416512343428, "grad_norm": 5.0625, "learning_rate": 8.629027604261303e-06, "loss": 0.84232635, "memory(GiB)": 135.77, "step": 22650, "train_speed(iter/s)": 0.201444 }, { "acc": 0.77632599, "epoch": 0.5286749588066317, "grad_norm": 6.21875, "learning_rate": 8.627727825463303e-06, "loss": 0.79433575, "memory(GiB)": 135.77, "step": 22660, "train_speed(iter/s)": 0.201489 }, { "acc": 0.78881607, "epoch": 0.5289082663789206, "grad_norm": 6.40625, "learning_rate": 8.626427528808212e-06, "loss": 0.75886679, "memory(GiB)": 135.77, "step": 22670, "train_speed(iter/s)": 0.201535 }, { "acc": 0.78701801, "epoch": 0.5291415739512095, "grad_norm": 14.4375, "learning_rate": 8.625126714481645e-06, "loss": 0.79751778, "memory(GiB)": 135.77, "step": 22680, "train_speed(iter/s)": 0.201581 }, { "acc": 0.76863942, "epoch": 0.5293748815234984, "grad_norm": 5.28125, "learning_rate": 8.623825382669291e-06, "loss": 0.82617855, "memory(GiB)": 135.77, "step": 22690, "train_speed(iter/s)": 0.201626 }, { "acc": 0.77472267, "epoch": 0.5296081890957873, "grad_norm": 6.40625, "learning_rate": 8.622523533556916e-06, "loss": 0.84323826, "memory(GiB)": 135.77, "step": 22700, "train_speed(iter/s)": 0.201672 }, { "acc": 0.76703949, "epoch": 0.5298414966680762, "grad_norm": 6.875, "learning_rate": 8.621221167330363e-06, "loss": 0.85303726, "memory(GiB)": 135.77, "step": 22710, "train_speed(iter/s)": 0.201718 }, { "acc": 0.76639805, "epoch": 0.5300748042403651, "grad_norm": 4.15625, "learning_rate": 8.619918284175537e-06, "loss": 0.82314072, "memory(GiB)": 135.77, "step": 22720, "train_speed(iter/s)": 0.201763 }, { "acc": 0.79025326, "epoch": 0.530308111812654, "grad_norm": 3.8125, "learning_rate": 8.618614884278427e-06, "loss": 0.73413811, "memory(GiB)": 135.77, "step": 22730, "train_speed(iter/s)": 0.201808 }, { "acc": 0.76510487, "epoch": 0.5305414193849429, "grad_norm": 4.625, "learning_rate": 8.617310967825094e-06, "loss": 0.82598991, "memory(GiB)": 135.77, "step": 22740, "train_speed(iter/s)": 0.201853 }, { "acc": 0.76874247, "epoch": 0.5307747269572318, "grad_norm": 4.9375, "learning_rate": 8.616006535001673e-06, "loss": 0.83377934, "memory(GiB)": 135.77, "step": 22750, "train_speed(iter/s)": 0.201897 }, { "acc": 0.76021204, "epoch": 0.5310080345295207, "grad_norm": 5.15625, "learning_rate": 8.614701585994368e-06, "loss": 0.87588558, "memory(GiB)": 135.77, "step": 22760, "train_speed(iter/s)": 0.201943 }, { "acc": 0.77022514, "epoch": 0.5312413421018096, "grad_norm": 5.1875, "learning_rate": 8.613396120989463e-06, "loss": 0.82545309, "memory(GiB)": 135.77, "step": 22770, "train_speed(iter/s)": 0.201989 }, { "acc": 0.7779541, "epoch": 0.5314746496740985, "grad_norm": 4.75, "learning_rate": 8.61209014017331e-06, "loss": 0.7887279, "memory(GiB)": 135.77, "step": 22780, "train_speed(iter/s)": 0.202035 }, { "acc": 0.77761946, "epoch": 0.5317079572463874, "grad_norm": 4.40625, "learning_rate": 8.610783643732339e-06, "loss": 0.78312531, "memory(GiB)": 135.77, "step": 22790, "train_speed(iter/s)": 0.202078 }, { "acc": 0.76519241, "epoch": 0.5319412648186763, "grad_norm": 9.9375, "learning_rate": 8.60947663185305e-06, "loss": 0.85003548, "memory(GiB)": 135.77, "step": 22800, "train_speed(iter/s)": 0.202127 }, { "acc": 0.770683, "epoch": 0.5321745723909652, "grad_norm": 4.90625, "learning_rate": 8.608169104722024e-06, "loss": 0.83203659, "memory(GiB)": 135.77, "step": 22810, "train_speed(iter/s)": 0.202169 }, { "acc": 0.75626473, "epoch": 0.532407879963254, "grad_norm": 6.0, "learning_rate": 8.606861062525904e-06, "loss": 0.88525591, "memory(GiB)": 135.77, "step": 22820, "train_speed(iter/s)": 0.20221 }, { "acc": 0.7737916, "epoch": 0.5326411875355429, "grad_norm": 4.59375, "learning_rate": 8.605552505451417e-06, "loss": 0.81642294, "memory(GiB)": 135.77, "step": 22830, "train_speed(iter/s)": 0.202255 }, { "acc": 0.77903752, "epoch": 0.5328744951078318, "grad_norm": 5.9375, "learning_rate": 8.604243433685356e-06, "loss": 0.8116827, "memory(GiB)": 135.77, "step": 22840, "train_speed(iter/s)": 0.202302 }, { "acc": 0.79416924, "epoch": 0.5331078026801207, "grad_norm": 6.96875, "learning_rate": 8.602933847414592e-06, "loss": 0.74621596, "memory(GiB)": 135.77, "step": 22850, "train_speed(iter/s)": 0.202346 }, { "acc": 0.77615061, "epoch": 0.5333411102524096, "grad_norm": 6.375, "learning_rate": 8.601623746826068e-06, "loss": 0.80405464, "memory(GiB)": 135.77, "step": 22860, "train_speed(iter/s)": 0.20239 }, { "acc": 0.75500526, "epoch": 0.5335744178246985, "grad_norm": 5.4375, "learning_rate": 8.600313132106801e-06, "loss": 0.92301064, "memory(GiB)": 135.77, "step": 22870, "train_speed(iter/s)": 0.202435 }, { "acc": 0.74417582, "epoch": 0.5338077253969874, "grad_norm": 14.125, "learning_rate": 8.599002003443879e-06, "loss": 0.93030882, "memory(GiB)": 135.77, "step": 22880, "train_speed(iter/s)": 0.202484 }, { "acc": 0.76454968, "epoch": 0.5340410329692763, "grad_norm": 5.21875, "learning_rate": 8.597690361024468e-06, "loss": 0.85199795, "memory(GiB)": 135.77, "step": 22890, "train_speed(iter/s)": 0.202531 }, { "acc": 0.77811437, "epoch": 0.5342743405415652, "grad_norm": 7.15625, "learning_rate": 8.596378205035803e-06, "loss": 0.80663748, "memory(GiB)": 135.77, "step": 22900, "train_speed(iter/s)": 0.202578 }, { "acc": 0.77453089, "epoch": 0.5345076481138541, "grad_norm": 5.21875, "learning_rate": 8.595065535665192e-06, "loss": 0.81512251, "memory(GiB)": 135.77, "step": 22910, "train_speed(iter/s)": 0.202625 }, { "acc": 0.77443113, "epoch": 0.534740955686143, "grad_norm": 5.53125, "learning_rate": 8.593752353100022e-06, "loss": 0.82128239, "memory(GiB)": 135.77, "step": 22920, "train_speed(iter/s)": 0.202669 }, { "acc": 0.78365164, "epoch": 0.5349742632584319, "grad_norm": 5.5, "learning_rate": 8.592438657527746e-06, "loss": 0.77799377, "memory(GiB)": 135.77, "step": 22930, "train_speed(iter/s)": 0.202714 }, { "acc": 0.76975231, "epoch": 0.5352075708307208, "grad_norm": 6.40625, "learning_rate": 8.591124449135897e-06, "loss": 0.81138725, "memory(GiB)": 135.77, "step": 22940, "train_speed(iter/s)": 0.202762 }, { "acc": 0.79545565, "epoch": 0.5354408784030097, "grad_norm": 13.875, "learning_rate": 8.589809728112076e-06, "loss": 0.72523451, "memory(GiB)": 135.77, "step": 22950, "train_speed(iter/s)": 0.202809 }, { "acc": 0.76595306, "epoch": 0.5356741859752986, "grad_norm": 11.1875, "learning_rate": 8.588494494643959e-06, "loss": 0.84729843, "memory(GiB)": 135.77, "step": 22960, "train_speed(iter/s)": 0.202854 }, { "acc": 0.77811699, "epoch": 0.5359074935475875, "grad_norm": 5.125, "learning_rate": 8.587178748919294e-06, "loss": 0.8056942, "memory(GiB)": 135.77, "step": 22970, "train_speed(iter/s)": 0.202899 }, { "acc": 0.77778416, "epoch": 0.5361408011198764, "grad_norm": 4.875, "learning_rate": 8.585862491125906e-06, "loss": 0.79754281, "memory(GiB)": 135.77, "step": 22980, "train_speed(iter/s)": 0.202945 }, { "acc": 0.77270217, "epoch": 0.5363741086921653, "grad_norm": 4.84375, "learning_rate": 8.584545721451689e-06, "loss": 0.81794977, "memory(GiB)": 135.77, "step": 22990, "train_speed(iter/s)": 0.202991 }, { "acc": 0.75620012, "epoch": 0.5366074162644542, "grad_norm": 5.03125, "learning_rate": 8.583228440084612e-06, "loss": 0.89408569, "memory(GiB)": 135.77, "step": 23000, "train_speed(iter/s)": 0.203038 }, { "epoch": 0.5366074162644542, "eval_acc": 0.7395305307772392, "eval_loss": 0.8233683705329895, "eval_runtime": 1269.3275, "eval_samples_per_second": 28.354, "eval_steps_per_second": 14.178, "step": 23000 }, { "acc": 0.76377153, "epoch": 0.536840723836743, "grad_norm": 22.625, "learning_rate": 8.581910647212714e-06, "loss": 0.84084702, "memory(GiB)": 135.77, "step": 23010, "train_speed(iter/s)": 0.200803 }, { "acc": 0.77578378, "epoch": 0.5370740314090319, "grad_norm": 6.3125, "learning_rate": 8.580592343024114e-06, "loss": 0.81817398, "memory(GiB)": 135.77, "step": 23020, "train_speed(iter/s)": 0.200847 }, { "acc": 0.78007755, "epoch": 0.5373073389813208, "grad_norm": 4.375, "learning_rate": 8.579273527706997e-06, "loss": 0.79391298, "memory(GiB)": 135.77, "step": 23030, "train_speed(iter/s)": 0.200887 }, { "acc": 0.7673666, "epoch": 0.5375406465536097, "grad_norm": 5.0625, "learning_rate": 8.577954201449621e-06, "loss": 0.8341753, "memory(GiB)": 135.77, "step": 23040, "train_speed(iter/s)": 0.200931 }, { "acc": 0.75479908, "epoch": 0.5377739541258986, "grad_norm": 6.9375, "learning_rate": 8.576634364440327e-06, "loss": 0.87442303, "memory(GiB)": 135.77, "step": 23050, "train_speed(iter/s)": 0.20097 }, { "acc": 0.77252378, "epoch": 0.5380072616981875, "grad_norm": 6.65625, "learning_rate": 8.575314016867512e-06, "loss": 0.84094563, "memory(GiB)": 135.77, "step": 23060, "train_speed(iter/s)": 0.201015 }, { "acc": 0.76748762, "epoch": 0.5382405692704764, "grad_norm": 7.25, "learning_rate": 8.573993158919661e-06, "loss": 0.82697706, "memory(GiB)": 135.77, "step": 23070, "train_speed(iter/s)": 0.201062 }, { "acc": 0.77766724, "epoch": 0.5384738768427653, "grad_norm": 3.890625, "learning_rate": 8.572671790785325e-06, "loss": 0.78724775, "memory(GiB)": 135.77, "step": 23080, "train_speed(iter/s)": 0.201105 }, { "acc": 0.76558957, "epoch": 0.5387071844150542, "grad_norm": 4.8125, "learning_rate": 8.57134991265313e-06, "loss": 0.844098, "memory(GiB)": 135.77, "step": 23090, "train_speed(iter/s)": 0.201148 }, { "acc": 0.78299785, "epoch": 0.5389404919873431, "grad_norm": 5.90625, "learning_rate": 8.57002752471177e-06, "loss": 0.77799244, "memory(GiB)": 135.77, "step": 23100, "train_speed(iter/s)": 0.201192 }, { "acc": 0.78844042, "epoch": 0.539173799559632, "grad_norm": 4.78125, "learning_rate": 8.56870462715002e-06, "loss": 0.73825145, "memory(GiB)": 135.77, "step": 23110, "train_speed(iter/s)": 0.201237 }, { "acc": 0.78577042, "epoch": 0.5394071071319209, "grad_norm": 8.75, "learning_rate": 8.567381220156721e-06, "loss": 0.76423135, "memory(GiB)": 135.77, "step": 23120, "train_speed(iter/s)": 0.201279 }, { "acc": 0.76853571, "epoch": 0.5396404147042098, "grad_norm": 5.59375, "learning_rate": 8.566057303920788e-06, "loss": 0.82382536, "memory(GiB)": 135.77, "step": 23130, "train_speed(iter/s)": 0.201326 }, { "acc": 0.79181309, "epoch": 0.5398737222764987, "grad_norm": 3.703125, "learning_rate": 8.564732878631212e-06, "loss": 0.76008949, "memory(GiB)": 135.77, "step": 23140, "train_speed(iter/s)": 0.20137 }, { "acc": 0.77274303, "epoch": 0.5401070298487876, "grad_norm": 5.6875, "learning_rate": 8.563407944477052e-06, "loss": 0.82584839, "memory(GiB)": 135.77, "step": 23150, "train_speed(iter/s)": 0.201413 }, { "acc": 0.76535187, "epoch": 0.5403403374210765, "grad_norm": 5.8125, "learning_rate": 8.562082501647445e-06, "loss": 0.82021761, "memory(GiB)": 135.77, "step": 23160, "train_speed(iter/s)": 0.201455 }, { "acc": 0.76622362, "epoch": 0.5405736449933654, "grad_norm": 6.09375, "learning_rate": 8.560756550331594e-06, "loss": 0.82352085, "memory(GiB)": 135.77, "step": 23170, "train_speed(iter/s)": 0.201499 }, { "acc": 0.78566408, "epoch": 0.5408069525656543, "grad_norm": 5.34375, "learning_rate": 8.55943009071878e-06, "loss": 0.78319578, "memory(GiB)": 135.77, "step": 23180, "train_speed(iter/s)": 0.201539 }, { "acc": 0.79545965, "epoch": 0.5410402601379432, "grad_norm": 5.90625, "learning_rate": 8.558103122998354e-06, "loss": 0.74384508, "memory(GiB)": 135.77, "step": 23190, "train_speed(iter/s)": 0.201582 }, { "acc": 0.76623678, "epoch": 0.5412735677102319, "grad_norm": 6.3125, "learning_rate": 8.556775647359744e-06, "loss": 0.8515913, "memory(GiB)": 135.77, "step": 23200, "train_speed(iter/s)": 0.201629 }, { "acc": 0.76652737, "epoch": 0.5415068752825208, "grad_norm": 18.25, "learning_rate": 8.55544766399244e-06, "loss": 0.84666195, "memory(GiB)": 135.77, "step": 23210, "train_speed(iter/s)": 0.201675 }, { "acc": 0.77143021, "epoch": 0.5417401828548097, "grad_norm": 4.84375, "learning_rate": 8.554119173086014e-06, "loss": 0.82694244, "memory(GiB)": 135.77, "step": 23220, "train_speed(iter/s)": 0.201722 }, { "acc": 0.78573523, "epoch": 0.5419734904270986, "grad_norm": 4.4375, "learning_rate": 8.552790174830112e-06, "loss": 0.76147246, "memory(GiB)": 135.77, "step": 23230, "train_speed(iter/s)": 0.201769 }, { "acc": 0.77535591, "epoch": 0.5422067979993875, "grad_norm": 5.53125, "learning_rate": 8.551460669414444e-06, "loss": 0.80129585, "memory(GiB)": 135.77, "step": 23240, "train_speed(iter/s)": 0.201817 }, { "acc": 0.77153683, "epoch": 0.5424401055716764, "grad_norm": 5.84375, "learning_rate": 8.550130657028797e-06, "loss": 0.80181551, "memory(GiB)": 135.77, "step": 23250, "train_speed(iter/s)": 0.201864 }, { "acc": 0.78025579, "epoch": 0.5426734131439653, "grad_norm": 4.53125, "learning_rate": 8.548800137863028e-06, "loss": 0.78459616, "memory(GiB)": 135.77, "step": 23260, "train_speed(iter/s)": 0.20191 }, { "acc": 0.7832684, "epoch": 0.5429067207162542, "grad_norm": 5.25, "learning_rate": 8.547469112107071e-06, "loss": 0.79565368, "memory(GiB)": 135.77, "step": 23270, "train_speed(iter/s)": 0.201959 }, { "acc": 0.76917963, "epoch": 0.5431400282885431, "grad_norm": 5.96875, "learning_rate": 8.54613757995093e-06, "loss": 0.83835936, "memory(GiB)": 135.77, "step": 23280, "train_speed(iter/s)": 0.202004 }, { "acc": 0.76447086, "epoch": 0.543373335860832, "grad_norm": 7.375, "learning_rate": 8.54480554158468e-06, "loss": 0.83568411, "memory(GiB)": 135.77, "step": 23290, "train_speed(iter/s)": 0.202048 }, { "acc": 0.757164, "epoch": 0.5436066434331209, "grad_norm": 5.34375, "learning_rate": 8.543472997198467e-06, "loss": 0.87914925, "memory(GiB)": 135.77, "step": 23300, "train_speed(iter/s)": 0.202092 }, { "acc": 0.76810994, "epoch": 0.5438399510054098, "grad_norm": 5.96875, "learning_rate": 8.542139946982516e-06, "loss": 0.81503553, "memory(GiB)": 135.77, "step": 23310, "train_speed(iter/s)": 0.202138 }, { "acc": 0.7738739, "epoch": 0.5440732585776987, "grad_norm": 7.0625, "learning_rate": 8.540806391127112e-06, "loss": 0.82891731, "memory(GiB)": 135.77, "step": 23320, "train_speed(iter/s)": 0.202183 }, { "acc": 0.76221933, "epoch": 0.5443065661499876, "grad_norm": 6.1875, "learning_rate": 8.539472329822627e-06, "loss": 0.86402712, "memory(GiB)": 135.77, "step": 23330, "train_speed(iter/s)": 0.20223 }, { "acc": 0.79112253, "epoch": 0.5445398737222765, "grad_norm": 6.09375, "learning_rate": 8.538137763259495e-06, "loss": 0.75483222, "memory(GiB)": 135.77, "step": 23340, "train_speed(iter/s)": 0.202277 }, { "acc": 0.78928189, "epoch": 0.5447731812945654, "grad_norm": 5.4375, "learning_rate": 8.536802691628226e-06, "loss": 0.77416606, "memory(GiB)": 135.77, "step": 23350, "train_speed(iter/s)": 0.202318 }, { "acc": 0.77431383, "epoch": 0.5450064888668543, "grad_norm": 7.90625, "learning_rate": 8.535467115119399e-06, "loss": 0.79211545, "memory(GiB)": 135.77, "step": 23360, "train_speed(iter/s)": 0.202362 }, { "acc": 0.78563662, "epoch": 0.5452397964391432, "grad_norm": 4.40625, "learning_rate": 8.534131033923668e-06, "loss": 0.76018214, "memory(GiB)": 135.77, "step": 23370, "train_speed(iter/s)": 0.202406 }, { "acc": 0.7482007, "epoch": 0.5454731040114321, "grad_norm": 5.15625, "learning_rate": 8.53279444823176e-06, "loss": 0.90958967, "memory(GiB)": 135.77, "step": 23380, "train_speed(iter/s)": 0.202452 }, { "acc": 0.77748098, "epoch": 0.545706411583721, "grad_norm": 4.75, "learning_rate": 8.531457358234469e-06, "loss": 0.81523991, "memory(GiB)": 135.77, "step": 23390, "train_speed(iter/s)": 0.202498 }, { "acc": 0.78067293, "epoch": 0.5459397191560098, "grad_norm": 4.65625, "learning_rate": 8.530119764122666e-06, "loss": 0.80296059, "memory(GiB)": 135.77, "step": 23400, "train_speed(iter/s)": 0.202546 }, { "acc": 0.77213707, "epoch": 0.5461730267282987, "grad_norm": 6.3125, "learning_rate": 8.528781666087294e-06, "loss": 0.809834, "memory(GiB)": 135.77, "step": 23410, "train_speed(iter/s)": 0.202589 }, { "acc": 0.76574154, "epoch": 0.5464063343005876, "grad_norm": 5.75, "learning_rate": 8.527443064319362e-06, "loss": 0.82819872, "memory(GiB)": 135.77, "step": 23420, "train_speed(iter/s)": 0.202637 }, { "acc": 0.78312383, "epoch": 0.5466396418728765, "grad_norm": 5.4375, "learning_rate": 8.526103959009959e-06, "loss": 0.75177727, "memory(GiB)": 135.77, "step": 23430, "train_speed(iter/s)": 0.202685 }, { "acc": 0.77955656, "epoch": 0.5468729494451654, "grad_norm": 4.34375, "learning_rate": 8.52476435035024e-06, "loss": 0.78664408, "memory(GiB)": 135.77, "step": 23440, "train_speed(iter/s)": 0.202731 }, { "acc": 0.76553507, "epoch": 0.5471062570174543, "grad_norm": 5.8125, "learning_rate": 8.523424238531435e-06, "loss": 0.84986553, "memory(GiB)": 135.77, "step": 23450, "train_speed(iter/s)": 0.202773 }, { "acc": 0.77595615, "epoch": 0.5473395645897432, "grad_norm": 5.1875, "learning_rate": 8.522083623744841e-06, "loss": 0.80186443, "memory(GiB)": 135.77, "step": 23460, "train_speed(iter/s)": 0.202818 }, { "acc": 0.76666412, "epoch": 0.5475728721620321, "grad_norm": 7.65625, "learning_rate": 8.520742506181834e-06, "loss": 0.85667477, "memory(GiB)": 135.77, "step": 23470, "train_speed(iter/s)": 0.202861 }, { "acc": 0.78935471, "epoch": 0.547806179734321, "grad_norm": 5.625, "learning_rate": 8.519400886033858e-06, "loss": 0.77769156, "memory(GiB)": 135.77, "step": 23480, "train_speed(iter/s)": 0.202904 }, { "acc": 0.78004494, "epoch": 0.5480394873066099, "grad_norm": 5.46875, "learning_rate": 8.518058763492428e-06, "loss": 0.78160458, "memory(GiB)": 135.77, "step": 23490, "train_speed(iter/s)": 0.202951 }, { "acc": 0.7742156, "epoch": 0.5482727948788988, "grad_norm": 5.71875, "learning_rate": 8.516716138749131e-06, "loss": 0.82459059, "memory(GiB)": 135.77, "step": 23500, "train_speed(iter/s)": 0.202994 }, { "epoch": 0.5482727948788988, "eval_acc": 0.7398569047226694, "eval_loss": 0.8228998184204102, "eval_runtime": 1269.4084, "eval_samples_per_second": 28.353, "eval_steps_per_second": 14.177, "step": 23500 }, { "acc": 0.76532078, "epoch": 0.5485061024511877, "grad_norm": 6.0, "learning_rate": 8.515373011995624e-06, "loss": 0.84984226, "memory(GiB)": 135.77, "step": 23510, "train_speed(iter/s)": 0.200806 }, { "acc": 0.78477125, "epoch": 0.5487394100234766, "grad_norm": 6.78125, "learning_rate": 8.514029383423644e-06, "loss": 0.75276413, "memory(GiB)": 135.77, "step": 23520, "train_speed(iter/s)": 0.200846 }, { "acc": 0.79451346, "epoch": 0.5489727175957655, "grad_norm": 6.03125, "learning_rate": 8.51268525322499e-06, "loss": 0.74686522, "memory(GiB)": 135.77, "step": 23530, "train_speed(iter/s)": 0.200888 }, { "acc": 0.76671562, "epoch": 0.5492060251680544, "grad_norm": 6.0625, "learning_rate": 8.511340621591536e-06, "loss": 0.84974279, "memory(GiB)": 135.77, "step": 23540, "train_speed(iter/s)": 0.200932 }, { "acc": 0.77346573, "epoch": 0.5494393327403433, "grad_norm": 5.78125, "learning_rate": 8.509995488715228e-06, "loss": 0.83321466, "memory(GiB)": 135.77, "step": 23550, "train_speed(iter/s)": 0.200978 }, { "acc": 0.7776125, "epoch": 0.5496726403126322, "grad_norm": 3.34375, "learning_rate": 8.508649854788085e-06, "loss": 0.80355291, "memory(GiB)": 135.77, "step": 23560, "train_speed(iter/s)": 0.201019 }, { "acc": 0.76173635, "epoch": 0.5499059478849211, "grad_norm": 7.90625, "learning_rate": 8.507303720002194e-06, "loss": 0.87492666, "memory(GiB)": 135.77, "step": 23570, "train_speed(iter/s)": 0.201065 }, { "acc": 0.77865009, "epoch": 0.55013925545721, "grad_norm": 5.0, "learning_rate": 8.505957084549714e-06, "loss": 0.82421484, "memory(GiB)": 135.77, "step": 23580, "train_speed(iter/s)": 0.201109 }, { "acc": 0.77421579, "epoch": 0.5503725630294988, "grad_norm": 5.1875, "learning_rate": 8.50460994862288e-06, "loss": 0.80633135, "memory(GiB)": 135.77, "step": 23590, "train_speed(iter/s)": 0.201151 }, { "acc": 0.77418938, "epoch": 0.5506058706017877, "grad_norm": 4.65625, "learning_rate": 8.503262312413994e-06, "loss": 0.82369556, "memory(GiB)": 135.77, "step": 23600, "train_speed(iter/s)": 0.201192 }, { "acc": 0.775385, "epoch": 0.5508391781740766, "grad_norm": 4.96875, "learning_rate": 8.501914176115432e-06, "loss": 0.81491432, "memory(GiB)": 135.77, "step": 23610, "train_speed(iter/s)": 0.201234 }, { "acc": 0.76755171, "epoch": 0.5510724857463655, "grad_norm": 7.46875, "learning_rate": 8.500565539919636e-06, "loss": 0.84395332, "memory(GiB)": 135.77, "step": 23620, "train_speed(iter/s)": 0.201278 }, { "acc": 0.77598143, "epoch": 0.5513057933186544, "grad_norm": 6.75, "learning_rate": 8.499216404019129e-06, "loss": 0.80111885, "memory(GiB)": 135.77, "step": 23630, "train_speed(iter/s)": 0.201323 }, { "acc": 0.7583106, "epoch": 0.5515391008909433, "grad_norm": 7.28125, "learning_rate": 8.497866768606493e-06, "loss": 0.86915989, "memory(GiB)": 135.77, "step": 23640, "train_speed(iter/s)": 0.201369 }, { "acc": 0.77197981, "epoch": 0.5517724084632322, "grad_norm": 5.4375, "learning_rate": 8.496516633874395e-06, "loss": 0.81613064, "memory(GiB)": 135.77, "step": 23650, "train_speed(iter/s)": 0.201414 }, { "acc": 0.7657783, "epoch": 0.5520057160355211, "grad_norm": 5.625, "learning_rate": 8.495166000015562e-06, "loss": 0.84468031, "memory(GiB)": 135.77, "step": 23660, "train_speed(iter/s)": 0.201457 }, { "acc": 0.77991076, "epoch": 0.55223902360781, "grad_norm": 6.15625, "learning_rate": 8.493814867222799e-06, "loss": 0.80958157, "memory(GiB)": 135.77, "step": 23670, "train_speed(iter/s)": 0.201502 }, { "acc": 0.77264977, "epoch": 0.5524723311800989, "grad_norm": 5.5625, "learning_rate": 8.492463235688977e-06, "loss": 0.79891062, "memory(GiB)": 135.77, "step": 23680, "train_speed(iter/s)": 0.201547 }, { "acc": 0.7838623, "epoch": 0.5527056387523878, "grad_norm": 5.65625, "learning_rate": 8.491111105607044e-06, "loss": 0.77505665, "memory(GiB)": 135.77, "step": 23690, "train_speed(iter/s)": 0.20159 }, { "acc": 0.77503614, "epoch": 0.5529389463246767, "grad_norm": 4.59375, "learning_rate": 8.489758477170015e-06, "loss": 0.80072899, "memory(GiB)": 135.77, "step": 23700, "train_speed(iter/s)": 0.201631 }, { "acc": 0.76735268, "epoch": 0.5531722538969656, "grad_norm": 6.40625, "learning_rate": 8.488405350570976e-06, "loss": 0.84666109, "memory(GiB)": 135.77, "step": 23710, "train_speed(iter/s)": 0.201675 }, { "acc": 0.76926999, "epoch": 0.5534055614692545, "grad_norm": 7.34375, "learning_rate": 8.487051726003087e-06, "loss": 0.81005325, "memory(GiB)": 135.77, "step": 23720, "train_speed(iter/s)": 0.201719 }, { "acc": 0.7819787, "epoch": 0.5536388690415434, "grad_norm": 6.34375, "learning_rate": 8.485697603659578e-06, "loss": 0.77697315, "memory(GiB)": 135.77, "step": 23730, "train_speed(iter/s)": 0.201765 }, { "acc": 0.76455793, "epoch": 0.5538721766138323, "grad_norm": 6.1875, "learning_rate": 8.484342983733747e-06, "loss": 0.8478878, "memory(GiB)": 135.77, "step": 23740, "train_speed(iter/s)": 0.201811 }, { "acc": 0.77278852, "epoch": 0.5541054841861212, "grad_norm": 5.375, "learning_rate": 8.482987866418968e-06, "loss": 0.82200966, "memory(GiB)": 135.77, "step": 23750, "train_speed(iter/s)": 0.201851 }, { "acc": 0.79056635, "epoch": 0.55433879175841, "grad_norm": 7.78125, "learning_rate": 8.481632251908684e-06, "loss": 0.75850916, "memory(GiB)": 135.77, "step": 23760, "train_speed(iter/s)": 0.201893 }, { "acc": 0.78154702, "epoch": 0.554572099330699, "grad_norm": 7.5, "learning_rate": 8.480276140396406e-06, "loss": 0.78442221, "memory(GiB)": 135.77, "step": 23770, "train_speed(iter/s)": 0.201934 }, { "acc": 0.77974281, "epoch": 0.5548054069029877, "grad_norm": 6.03125, "learning_rate": 8.478919532075723e-06, "loss": 0.80161047, "memory(GiB)": 135.77, "step": 23780, "train_speed(iter/s)": 0.201977 }, { "acc": 0.77860785, "epoch": 0.5550387144752766, "grad_norm": 5.53125, "learning_rate": 8.477562427140283e-06, "loss": 0.80894966, "memory(GiB)": 135.77, "step": 23790, "train_speed(iter/s)": 0.20202 }, { "acc": 0.77603893, "epoch": 0.5552720220475655, "grad_norm": 4.3125, "learning_rate": 8.47620482578382e-06, "loss": 0.81759472, "memory(GiB)": 135.77, "step": 23800, "train_speed(iter/s)": 0.202062 }, { "acc": 0.77046776, "epoch": 0.5555053296198544, "grad_norm": 4.90625, "learning_rate": 8.474846728200125e-06, "loss": 0.82772932, "memory(GiB)": 135.77, "step": 23810, "train_speed(iter/s)": 0.202107 }, { "acc": 0.77512856, "epoch": 0.5557386371921433, "grad_norm": 3.6875, "learning_rate": 8.473488134583071e-06, "loss": 0.80098133, "memory(GiB)": 135.77, "step": 23820, "train_speed(iter/s)": 0.20215 }, { "acc": 0.7914567, "epoch": 0.5559719447644322, "grad_norm": 7.59375, "learning_rate": 8.472129045126596e-06, "loss": 0.74370127, "memory(GiB)": 135.77, "step": 23830, "train_speed(iter/s)": 0.202193 }, { "acc": 0.78856716, "epoch": 0.5562052523367211, "grad_norm": 6.0, "learning_rate": 8.470769460024705e-06, "loss": 0.75198135, "memory(GiB)": 135.77, "step": 23840, "train_speed(iter/s)": 0.202238 }, { "acc": 0.76632452, "epoch": 0.55643855990901, "grad_norm": 5.9375, "learning_rate": 8.469409379471486e-06, "loss": 0.86098537, "memory(GiB)": 135.77, "step": 23850, "train_speed(iter/s)": 0.202281 }, { "acc": 0.77517405, "epoch": 0.5566718674812989, "grad_norm": 4.625, "learning_rate": 8.468048803661083e-06, "loss": 0.79920835, "memory(GiB)": 135.77, "step": 23860, "train_speed(iter/s)": 0.202325 }, { "acc": 0.78587723, "epoch": 0.5569051750535878, "grad_norm": 7.875, "learning_rate": 8.466687732787721e-06, "loss": 0.77098179, "memory(GiB)": 135.77, "step": 23870, "train_speed(iter/s)": 0.202368 }, { "acc": 0.76620317, "epoch": 0.5571384826258767, "grad_norm": 10.5625, "learning_rate": 8.465326167045693e-06, "loss": 0.86184235, "memory(GiB)": 135.77, "step": 23880, "train_speed(iter/s)": 0.202409 }, { "acc": 0.77027483, "epoch": 0.5573717901981656, "grad_norm": 5.875, "learning_rate": 8.463964106629361e-06, "loss": 0.81875496, "memory(GiB)": 135.77, "step": 23890, "train_speed(iter/s)": 0.20245 }, { "acc": 0.78449135, "epoch": 0.5576050977704545, "grad_norm": 4.625, "learning_rate": 8.46260155173316e-06, "loss": 0.77044706, "memory(GiB)": 135.77, "step": 23900, "train_speed(iter/s)": 0.202491 }, { "acc": 0.77500076, "epoch": 0.5578384053427434, "grad_norm": 4.90625, "learning_rate": 8.461238502551592e-06, "loss": 0.81618023, "memory(GiB)": 135.77, "step": 23910, "train_speed(iter/s)": 0.202537 }, { "acc": 0.7681963, "epoch": 0.5580717129150323, "grad_norm": 6.3125, "learning_rate": 8.459874959279235e-06, "loss": 0.85666189, "memory(GiB)": 135.77, "step": 23920, "train_speed(iter/s)": 0.202581 }, { "acc": 0.75990124, "epoch": 0.5583050204873212, "grad_norm": 4.40625, "learning_rate": 8.45851092211073e-06, "loss": 0.86498947, "memory(GiB)": 135.77, "step": 23930, "train_speed(iter/s)": 0.202626 }, { "acc": 0.77422037, "epoch": 0.5585383280596101, "grad_norm": 4.59375, "learning_rate": 8.457146391240798e-06, "loss": 0.81769314, "memory(GiB)": 135.77, "step": 23940, "train_speed(iter/s)": 0.202668 }, { "acc": 0.78469434, "epoch": 0.558771635631899, "grad_norm": 5.125, "learning_rate": 8.455781366864223e-06, "loss": 0.77262511, "memory(GiB)": 135.77, "step": 23950, "train_speed(iter/s)": 0.202713 }, { "acc": 0.77914438, "epoch": 0.5590049432041879, "grad_norm": 6.5, "learning_rate": 8.45441584917586e-06, "loss": 0.7662066, "memory(GiB)": 135.77, "step": 23960, "train_speed(iter/s)": 0.202752 }, { "acc": 0.78139782, "epoch": 0.5592382507764767, "grad_norm": 4.90625, "learning_rate": 8.453049838370639e-06, "loss": 0.78332119, "memory(GiB)": 135.77, "step": 23970, "train_speed(iter/s)": 0.202796 }, { "acc": 0.75784979, "epoch": 0.5594715583487656, "grad_norm": 7.40625, "learning_rate": 8.451683334643557e-06, "loss": 0.88706684, "memory(GiB)": 135.77, "step": 23980, "train_speed(iter/s)": 0.20284 }, { "acc": 0.79134464, "epoch": 0.5597048659210545, "grad_norm": 8.8125, "learning_rate": 8.45031633818968e-06, "loss": 0.73851681, "memory(GiB)": 135.77, "step": 23990, "train_speed(iter/s)": 0.202884 }, { "acc": 0.76034174, "epoch": 0.5599381734933434, "grad_norm": 5.25, "learning_rate": 8.44894884920415e-06, "loss": 0.87289333, "memory(GiB)": 135.77, "step": 24000, "train_speed(iter/s)": 0.20293 }, { "epoch": 0.5599381734933434, "eval_acc": 0.7398836488094638, "eval_loss": 0.822571873664856, "eval_runtime": 1269.3247, "eval_samples_per_second": 28.354, "eval_steps_per_second": 14.178, "step": 24000 }, { "acc": 0.76281738, "epoch": 0.5601714810656323, "grad_norm": 19.375, "learning_rate": 8.447580867882172e-06, "loss": 0.85142126, "memory(GiB)": 135.77, "step": 24010, "train_speed(iter/s)": 0.200792 }, { "acc": 0.79376831, "epoch": 0.5604047886379212, "grad_norm": 4.65625, "learning_rate": 8.446212394419028e-06, "loss": 0.74882054, "memory(GiB)": 135.77, "step": 24020, "train_speed(iter/s)": 0.200833 }, { "acc": 0.78502007, "epoch": 0.5606380962102101, "grad_norm": 4.5625, "learning_rate": 8.444843429010065e-06, "loss": 0.7658041, "memory(GiB)": 135.77, "step": 24030, "train_speed(iter/s)": 0.200876 }, { "acc": 0.79705825, "epoch": 0.560871403782499, "grad_norm": 4.75, "learning_rate": 8.443473971850703e-06, "loss": 0.72210217, "memory(GiB)": 135.77, "step": 24040, "train_speed(iter/s)": 0.200917 }, { "acc": 0.76908588, "epoch": 0.5611047113547879, "grad_norm": 7.78125, "learning_rate": 8.442104023136435e-06, "loss": 0.83583965, "memory(GiB)": 135.77, "step": 24050, "train_speed(iter/s)": 0.20096 }, { "acc": 0.78580241, "epoch": 0.5613380189270768, "grad_norm": 6.5625, "learning_rate": 8.440733583062814e-06, "loss": 0.76494713, "memory(GiB)": 135.77, "step": 24060, "train_speed(iter/s)": 0.201002 }, { "acc": 0.75843534, "epoch": 0.5615713264993657, "grad_norm": 6.8125, "learning_rate": 8.439362651825475e-06, "loss": 0.8755847, "memory(GiB)": 135.77, "step": 24070, "train_speed(iter/s)": 0.201046 }, { "acc": 0.77627125, "epoch": 0.5618046340716546, "grad_norm": 9.9375, "learning_rate": 8.437991229620117e-06, "loss": 0.80955582, "memory(GiB)": 135.77, "step": 24080, "train_speed(iter/s)": 0.201086 }, { "acc": 0.77059631, "epoch": 0.5620379416439435, "grad_norm": 4.46875, "learning_rate": 8.436619316642508e-06, "loss": 0.82761564, "memory(GiB)": 135.77, "step": 24090, "train_speed(iter/s)": 0.20113 }, { "acc": 0.77916279, "epoch": 0.5622712492162324, "grad_norm": 4.625, "learning_rate": 8.435246913088492e-06, "loss": 0.80605087, "memory(GiB)": 135.77, "step": 24100, "train_speed(iter/s)": 0.201173 }, { "acc": 0.78393345, "epoch": 0.5625045567885213, "grad_norm": 5.9375, "learning_rate": 8.433874019153976e-06, "loss": 0.78385267, "memory(GiB)": 135.77, "step": 24110, "train_speed(iter/s)": 0.201217 }, { "acc": 0.78179646, "epoch": 0.5627378643608102, "grad_norm": 5.28125, "learning_rate": 8.432500635034942e-06, "loss": 0.77540627, "memory(GiB)": 135.77, "step": 24120, "train_speed(iter/s)": 0.201259 }, { "acc": 0.76452451, "epoch": 0.5629711719330991, "grad_norm": 6.65625, "learning_rate": 8.43112676092744e-06, "loss": 0.88243942, "memory(GiB)": 135.77, "step": 24130, "train_speed(iter/s)": 0.201301 }, { "acc": 0.77689223, "epoch": 0.563204479505388, "grad_norm": 5.375, "learning_rate": 8.429752397027585e-06, "loss": 0.79288564, "memory(GiB)": 135.77, "step": 24140, "train_speed(iter/s)": 0.201342 }, { "acc": 0.77824707, "epoch": 0.5634377870776769, "grad_norm": 7.28125, "learning_rate": 8.428377543531577e-06, "loss": 0.79980693, "memory(GiB)": 135.77, "step": 24150, "train_speed(iter/s)": 0.201385 }, { "acc": 0.77147899, "epoch": 0.5636710946499658, "grad_norm": 9.625, "learning_rate": 8.427002200635669e-06, "loss": 0.82787066, "memory(GiB)": 135.77, "step": 24160, "train_speed(iter/s)": 0.201428 }, { "acc": 0.75589886, "epoch": 0.5639044022222546, "grad_norm": 5.8125, "learning_rate": 8.425626368536192e-06, "loss": 0.88443241, "memory(GiB)": 135.77, "step": 24170, "train_speed(iter/s)": 0.201471 }, { "acc": 0.7736052, "epoch": 0.5641377097945435, "grad_norm": 6.0625, "learning_rate": 8.424250047429547e-06, "loss": 0.82636681, "memory(GiB)": 135.77, "step": 24180, "train_speed(iter/s)": 0.201514 }, { "acc": 0.76566911, "epoch": 0.5643710173668324, "grad_norm": 5.8125, "learning_rate": 8.4228732375122e-06, "loss": 0.85641251, "memory(GiB)": 135.77, "step": 24190, "train_speed(iter/s)": 0.201557 }, { "acc": 0.77742729, "epoch": 0.5646043249391213, "grad_norm": 5.375, "learning_rate": 8.421495938980695e-06, "loss": 0.78332605, "memory(GiB)": 135.77, "step": 24200, "train_speed(iter/s)": 0.201598 }, { "acc": 0.77955341, "epoch": 0.5648376325114102, "grad_norm": 4.40625, "learning_rate": 8.420118152031638e-06, "loss": 0.79840794, "memory(GiB)": 135.77, "step": 24210, "train_speed(iter/s)": 0.201641 }, { "acc": 0.79004874, "epoch": 0.5650709400836991, "grad_norm": 6.71875, "learning_rate": 8.418739876861708e-06, "loss": 0.75119066, "memory(GiB)": 135.77, "step": 24220, "train_speed(iter/s)": 0.201681 }, { "acc": 0.77495551, "epoch": 0.565304247655988, "grad_norm": 5.875, "learning_rate": 8.417361113667654e-06, "loss": 0.80294151, "memory(GiB)": 135.77, "step": 24230, "train_speed(iter/s)": 0.201723 }, { "acc": 0.77338634, "epoch": 0.5655375552282769, "grad_norm": 6.34375, "learning_rate": 8.415981862646295e-06, "loss": 0.80396366, "memory(GiB)": 135.77, "step": 24240, "train_speed(iter/s)": 0.201765 }, { "acc": 0.76142254, "epoch": 0.5657708628005658, "grad_norm": 5.53125, "learning_rate": 8.414602123994517e-06, "loss": 0.8642437, "memory(GiB)": 135.77, "step": 24250, "train_speed(iter/s)": 0.201808 }, { "acc": 0.77578001, "epoch": 0.5660041703728547, "grad_norm": 7.84375, "learning_rate": 8.413221897909277e-06, "loss": 0.80832329, "memory(GiB)": 135.77, "step": 24260, "train_speed(iter/s)": 0.201849 }, { "acc": 0.75002069, "epoch": 0.5662374779451436, "grad_norm": 27.875, "learning_rate": 8.411841184587602e-06, "loss": 0.94861403, "memory(GiB)": 135.77, "step": 24270, "train_speed(iter/s)": 0.201892 }, { "acc": 0.79898376, "epoch": 0.5664707855174325, "grad_norm": 6.53125, "learning_rate": 8.41045998422659e-06, "loss": 0.73172636, "memory(GiB)": 135.77, "step": 24280, "train_speed(iter/s)": 0.201931 }, { "acc": 0.79310303, "epoch": 0.5667040930897214, "grad_norm": 6.03125, "learning_rate": 8.409078297023406e-06, "loss": 0.71440797, "memory(GiB)": 135.77, "step": 24290, "train_speed(iter/s)": 0.201972 }, { "acc": 0.77959661, "epoch": 0.5669374006620103, "grad_norm": 4.5625, "learning_rate": 8.407696123175285e-06, "loss": 0.78759675, "memory(GiB)": 135.77, "step": 24300, "train_speed(iter/s)": 0.202013 }, { "acc": 0.79495506, "epoch": 0.5671707082342992, "grad_norm": 4.53125, "learning_rate": 8.406313462879533e-06, "loss": 0.73519535, "memory(GiB)": 135.77, "step": 24310, "train_speed(iter/s)": 0.202053 }, { "acc": 0.77956157, "epoch": 0.5674040158065881, "grad_norm": 4.875, "learning_rate": 8.404930316333524e-06, "loss": 0.80541306, "memory(GiB)": 135.77, "step": 24320, "train_speed(iter/s)": 0.202096 }, { "acc": 0.76412115, "epoch": 0.567637323378877, "grad_norm": 5.8125, "learning_rate": 8.4035466837347e-06, "loss": 0.86554079, "memory(GiB)": 135.77, "step": 24330, "train_speed(iter/s)": 0.202139 }, { "acc": 0.76175427, "epoch": 0.5678706309511659, "grad_norm": 4.46875, "learning_rate": 8.402162565280577e-06, "loss": 0.92497549, "memory(GiB)": 135.77, "step": 24340, "train_speed(iter/s)": 0.202181 }, { "acc": 0.77174816, "epoch": 0.5681039385234548, "grad_norm": 5.34375, "learning_rate": 8.400777961168736e-06, "loss": 0.83751183, "memory(GiB)": 135.77, "step": 24350, "train_speed(iter/s)": 0.202224 }, { "acc": 0.76675367, "epoch": 0.5683372460957435, "grad_norm": 4.5, "learning_rate": 8.399392871596828e-06, "loss": 0.82344303, "memory(GiB)": 135.77, "step": 24360, "train_speed(iter/s)": 0.20227 }, { "acc": 0.79096088, "epoch": 0.5685705536680324, "grad_norm": 6.0625, "learning_rate": 8.398007296762576e-06, "loss": 0.74808559, "memory(GiB)": 135.77, "step": 24370, "train_speed(iter/s)": 0.202311 }, { "acc": 0.78298712, "epoch": 0.5688038612403213, "grad_norm": 5.96875, "learning_rate": 8.39662123686377e-06, "loss": 0.7767374, "memory(GiB)": 135.77, "step": 24380, "train_speed(iter/s)": 0.202355 }, { "acc": 0.77403417, "epoch": 0.5690371688126102, "grad_norm": 5.65625, "learning_rate": 8.395234692098267e-06, "loss": 0.80425043, "memory(GiB)": 135.77, "step": 24390, "train_speed(iter/s)": 0.202399 }, { "acc": 0.76454625, "epoch": 0.5692704763848991, "grad_norm": 4.96875, "learning_rate": 8.393847662663998e-06, "loss": 0.83601761, "memory(GiB)": 135.77, "step": 24400, "train_speed(iter/s)": 0.20244 }, { "acc": 0.78817453, "epoch": 0.569503783957188, "grad_norm": 5.625, "learning_rate": 8.392460148758962e-06, "loss": 0.75539885, "memory(GiB)": 135.77, "step": 24410, "train_speed(iter/s)": 0.202484 }, { "acc": 0.77908316, "epoch": 0.5697370915294769, "grad_norm": 5.25, "learning_rate": 8.391072150581228e-06, "loss": 0.78923807, "memory(GiB)": 135.77, "step": 24420, "train_speed(iter/s)": 0.202523 }, { "acc": 0.77952948, "epoch": 0.5699703991017658, "grad_norm": 7.96875, "learning_rate": 8.389683668328927e-06, "loss": 0.79888625, "memory(GiB)": 135.77, "step": 24430, "train_speed(iter/s)": 0.202564 }, { "acc": 0.7668036, "epoch": 0.5702037066740547, "grad_norm": 5.21875, "learning_rate": 8.388294702200267e-06, "loss": 0.83973408, "memory(GiB)": 135.77, "step": 24440, "train_speed(iter/s)": 0.202607 }, { "acc": 0.78203883, "epoch": 0.5704370142463436, "grad_norm": 6.28125, "learning_rate": 8.386905252393522e-06, "loss": 0.7930686, "memory(GiB)": 135.77, "step": 24450, "train_speed(iter/s)": 0.20265 }, { "acc": 0.78412566, "epoch": 0.5706703218186325, "grad_norm": 10.0, "learning_rate": 8.385515319107038e-06, "loss": 0.77225199, "memory(GiB)": 135.77, "step": 24460, "train_speed(iter/s)": 0.202691 }, { "acc": 0.76364594, "epoch": 0.5709036293909214, "grad_norm": 6.0, "learning_rate": 8.384124902539225e-06, "loss": 0.83564758, "memory(GiB)": 135.77, "step": 24470, "train_speed(iter/s)": 0.202734 }, { "acc": 0.76822014, "epoch": 0.5711369369632103, "grad_norm": 4.6875, "learning_rate": 8.382734002888565e-06, "loss": 0.81515923, "memory(GiB)": 135.77, "step": 24480, "train_speed(iter/s)": 0.202778 }, { "acc": 0.78435955, "epoch": 0.5713702445354992, "grad_norm": 6.0625, "learning_rate": 8.381342620353609e-06, "loss": 0.7858181, "memory(GiB)": 135.77, "step": 24490, "train_speed(iter/s)": 0.202821 }, { "acc": 0.7721489, "epoch": 0.5716035521077881, "grad_norm": 6.1875, "learning_rate": 8.379950755132975e-06, "loss": 0.81357956, "memory(GiB)": 135.77, "step": 24500, "train_speed(iter/s)": 0.202865 }, { "epoch": 0.5716035521077881, "eval_acc": 0.739892296598008, "eval_loss": 0.8211784958839417, "eval_runtime": 1271.0226, "eval_samples_per_second": 28.317, "eval_steps_per_second": 14.159, "step": 24500 }, { "acc": 0.79300923, "epoch": 0.571836859680077, "grad_norm": 6.03125, "learning_rate": 8.378558407425355e-06, "loss": 0.75731974, "memory(GiB)": 135.77, "step": 24510, "train_speed(iter/s)": 0.200768 }, { "acc": 0.7801857, "epoch": 0.5720701672523659, "grad_norm": 5.375, "learning_rate": 8.377165577429502e-06, "loss": 0.79804745, "memory(GiB)": 135.77, "step": 24520, "train_speed(iter/s)": 0.20081 }, { "acc": 0.78895264, "epoch": 0.5723034748246548, "grad_norm": 6.4375, "learning_rate": 8.375772265344244e-06, "loss": 0.74042835, "memory(GiB)": 135.77, "step": 24530, "train_speed(iter/s)": 0.200848 }, { "acc": 0.78116617, "epoch": 0.5725367823969437, "grad_norm": 6.0625, "learning_rate": 8.374378471368476e-06, "loss": 0.78446503, "memory(GiB)": 135.77, "step": 24540, "train_speed(iter/s)": 0.200891 }, { "acc": 0.78846378, "epoch": 0.5727700899692325, "grad_norm": 7.96875, "learning_rate": 8.37298419570116e-06, "loss": 0.76507263, "memory(GiB)": 135.77, "step": 24550, "train_speed(iter/s)": 0.200932 }, { "acc": 0.75134902, "epoch": 0.5730033975415214, "grad_norm": 8.1875, "learning_rate": 8.371589438541333e-06, "loss": 0.91664543, "memory(GiB)": 135.77, "step": 24560, "train_speed(iter/s)": 0.200973 }, { "acc": 0.77958789, "epoch": 0.5732367051138103, "grad_norm": 5.6875, "learning_rate": 8.370194200088091e-06, "loss": 0.78155866, "memory(GiB)": 135.77, "step": 24570, "train_speed(iter/s)": 0.201014 }, { "acc": 0.78482103, "epoch": 0.5734700126860992, "grad_norm": 5.375, "learning_rate": 8.368798480540607e-06, "loss": 0.76826344, "memory(GiB)": 135.77, "step": 24580, "train_speed(iter/s)": 0.201055 }, { "acc": 0.7931962, "epoch": 0.5737033202583881, "grad_norm": 5.125, "learning_rate": 8.367402280098118e-06, "loss": 0.71536484, "memory(GiB)": 135.77, "step": 24590, "train_speed(iter/s)": 0.201093 }, { "acc": 0.77020044, "epoch": 0.573936627830677, "grad_norm": 6.3125, "learning_rate": 8.366005598959932e-06, "loss": 0.83158503, "memory(GiB)": 135.77, "step": 24600, "train_speed(iter/s)": 0.201134 }, { "acc": 0.79495125, "epoch": 0.5741699354029659, "grad_norm": 5.09375, "learning_rate": 8.364608437325426e-06, "loss": 0.74491563, "memory(GiB)": 135.77, "step": 24610, "train_speed(iter/s)": 0.201177 }, { "acc": 0.75548391, "epoch": 0.5744032429752548, "grad_norm": 5.09375, "learning_rate": 8.363210795394042e-06, "loss": 0.88298302, "memory(GiB)": 135.77, "step": 24620, "train_speed(iter/s)": 0.201221 }, { "acc": 0.7842412, "epoch": 0.5746365505475437, "grad_norm": 6.53125, "learning_rate": 8.361812673365292e-06, "loss": 0.76854534, "memory(GiB)": 135.77, "step": 24630, "train_speed(iter/s)": 0.201263 }, { "acc": 0.7659914, "epoch": 0.5748698581198326, "grad_norm": 5.125, "learning_rate": 8.360414071438761e-06, "loss": 0.84088774, "memory(GiB)": 135.77, "step": 24640, "train_speed(iter/s)": 0.201306 }, { "acc": 0.78436427, "epoch": 0.5751031656921215, "grad_norm": 5.0625, "learning_rate": 8.359014989814099e-06, "loss": 0.79782057, "memory(GiB)": 135.77, "step": 24650, "train_speed(iter/s)": 0.201352 }, { "acc": 0.76118832, "epoch": 0.5753364732644104, "grad_norm": 8.0625, "learning_rate": 8.35761542869102e-06, "loss": 0.86918697, "memory(GiB)": 135.77, "step": 24660, "train_speed(iter/s)": 0.201396 }, { "acc": 0.76398478, "epoch": 0.5755697808366993, "grad_norm": 6.6875, "learning_rate": 8.356215388269316e-06, "loss": 0.84599447, "memory(GiB)": 135.77, "step": 24670, "train_speed(iter/s)": 0.201435 }, { "acc": 0.78095145, "epoch": 0.5758030884089882, "grad_norm": 5.3125, "learning_rate": 8.354814868748839e-06, "loss": 0.77294083, "memory(GiB)": 135.77, "step": 24680, "train_speed(iter/s)": 0.201476 }, { "acc": 0.76520529, "epoch": 0.5760363959812771, "grad_norm": 7.0625, "learning_rate": 8.353413870329514e-06, "loss": 0.86367378, "memory(GiB)": 135.77, "step": 24690, "train_speed(iter/s)": 0.201518 }, { "acc": 0.77567444, "epoch": 0.576269703553566, "grad_norm": 4.25, "learning_rate": 8.352012393211336e-06, "loss": 0.79126658, "memory(GiB)": 135.77, "step": 24700, "train_speed(iter/s)": 0.20156 }, { "acc": 0.7710907, "epoch": 0.5765030111258549, "grad_norm": 6.28125, "learning_rate": 8.35061043759436e-06, "loss": 0.83803148, "memory(GiB)": 135.77, "step": 24710, "train_speed(iter/s)": 0.201599 }, { "acc": 0.76869202, "epoch": 0.5767363186981438, "grad_norm": 5.21875, "learning_rate": 8.349208003678716e-06, "loss": 0.83444061, "memory(GiB)": 135.77, "step": 24720, "train_speed(iter/s)": 0.201641 }, { "acc": 0.77576461, "epoch": 0.5769696262704327, "grad_norm": 5.375, "learning_rate": 8.347805091664606e-06, "loss": 0.82614613, "memory(GiB)": 135.77, "step": 24730, "train_speed(iter/s)": 0.201681 }, { "acc": 0.77615957, "epoch": 0.5772029338427216, "grad_norm": 5.46875, "learning_rate": 8.34640170175229e-06, "loss": 0.79450846, "memory(GiB)": 135.77, "step": 24740, "train_speed(iter/s)": 0.201723 }, { "acc": 0.77073669, "epoch": 0.5774362414150104, "grad_norm": 7.9375, "learning_rate": 8.344997834142103e-06, "loss": 0.82609434, "memory(GiB)": 135.77, "step": 24750, "train_speed(iter/s)": 0.201762 }, { "acc": 0.79335299, "epoch": 0.5776695489872993, "grad_norm": 5.59375, "learning_rate": 8.343593489034447e-06, "loss": 0.74660645, "memory(GiB)": 135.77, "step": 24760, "train_speed(iter/s)": 0.201803 }, { "acc": 0.77887115, "epoch": 0.5779028565595882, "grad_norm": 6.34375, "learning_rate": 8.342188666629793e-06, "loss": 0.82571802, "memory(GiB)": 135.77, "step": 24770, "train_speed(iter/s)": 0.201846 }, { "acc": 0.78749199, "epoch": 0.5781361641318771, "grad_norm": 5.53125, "learning_rate": 8.340783367128677e-06, "loss": 0.7465394, "memory(GiB)": 135.77, "step": 24780, "train_speed(iter/s)": 0.20189 }, { "acc": 0.7764246, "epoch": 0.578369471704166, "grad_norm": 3.9375, "learning_rate": 8.339377590731705e-06, "loss": 0.78514347, "memory(GiB)": 135.77, "step": 24790, "train_speed(iter/s)": 0.201931 }, { "acc": 0.76679988, "epoch": 0.5786027792764549, "grad_norm": 6.8125, "learning_rate": 8.337971337639552e-06, "loss": 0.84274111, "memory(GiB)": 135.77, "step": 24800, "train_speed(iter/s)": 0.201972 }, { "acc": 0.77209625, "epoch": 0.5788360868487438, "grad_norm": 5.65625, "learning_rate": 8.336564608052961e-06, "loss": 0.80813961, "memory(GiB)": 135.77, "step": 24810, "train_speed(iter/s)": 0.202017 }, { "acc": 0.77081928, "epoch": 0.5790693944210327, "grad_norm": 5.0625, "learning_rate": 8.335157402172743e-06, "loss": 0.83020134, "memory(GiB)": 135.77, "step": 24820, "train_speed(iter/s)": 0.20206 }, { "acc": 0.78011541, "epoch": 0.5793027019933216, "grad_norm": 5.46875, "learning_rate": 8.333749720199772e-06, "loss": 0.80131474, "memory(GiB)": 135.77, "step": 24830, "train_speed(iter/s)": 0.202103 }, { "acc": 0.78027191, "epoch": 0.5795360095656105, "grad_norm": 7.375, "learning_rate": 8.332341562334998e-06, "loss": 0.78537641, "memory(GiB)": 135.77, "step": 24840, "train_speed(iter/s)": 0.202141 }, { "acc": 0.78687563, "epoch": 0.5797693171378994, "grad_norm": 4.90625, "learning_rate": 8.330932928779434e-06, "loss": 0.75612831, "memory(GiB)": 135.77, "step": 24850, "train_speed(iter/s)": 0.202183 }, { "acc": 0.78615999, "epoch": 0.5800026247101883, "grad_norm": 5.5625, "learning_rate": 8.329523819734161e-06, "loss": 0.74080048, "memory(GiB)": 135.77, "step": 24860, "train_speed(iter/s)": 0.202226 }, { "acc": 0.77790847, "epoch": 0.5802359322824772, "grad_norm": 7.1875, "learning_rate": 8.328114235400331e-06, "loss": 0.79633036, "memory(GiB)": 135.77, "step": 24870, "train_speed(iter/s)": 0.202268 }, { "acc": 0.76988811, "epoch": 0.5804692398547661, "grad_norm": 5.625, "learning_rate": 8.326704175979162e-06, "loss": 0.8448287, "memory(GiB)": 135.77, "step": 24880, "train_speed(iter/s)": 0.20231 }, { "acc": 0.78928442, "epoch": 0.580702547427055, "grad_norm": 4.65625, "learning_rate": 8.325293641671936e-06, "loss": 0.76444321, "memory(GiB)": 135.77, "step": 24890, "train_speed(iter/s)": 0.202353 }, { "acc": 0.77171555, "epoch": 0.5809358549993439, "grad_norm": 4.65625, "learning_rate": 8.32388263268001e-06, "loss": 0.83708153, "memory(GiB)": 135.77, "step": 24900, "train_speed(iter/s)": 0.202393 }, { "acc": 0.76470823, "epoch": 0.5811691625716328, "grad_norm": 6.5625, "learning_rate": 8.322471149204804e-06, "loss": 0.86168737, "memory(GiB)": 135.77, "step": 24910, "train_speed(iter/s)": 0.202435 }, { "acc": 0.76229396, "epoch": 0.5814024701439217, "grad_norm": 6.3125, "learning_rate": 8.321059191447807e-06, "loss": 0.8397192, "memory(GiB)": 135.77, "step": 24920, "train_speed(iter/s)": 0.202476 }, { "acc": 0.76328716, "epoch": 0.5816357777162106, "grad_norm": 4.65625, "learning_rate": 8.319646759610573e-06, "loss": 0.85694113, "memory(GiB)": 135.77, "step": 24930, "train_speed(iter/s)": 0.202516 }, { "acc": 0.76525183, "epoch": 0.5818690852884993, "grad_norm": 18.875, "learning_rate": 8.31823385389473e-06, "loss": 0.8547142, "memory(GiB)": 135.77, "step": 24940, "train_speed(iter/s)": 0.202558 }, { "acc": 0.79167852, "epoch": 0.5821023928607882, "grad_norm": 4.8125, "learning_rate": 8.316820474501968e-06, "loss": 0.74602165, "memory(GiB)": 135.77, "step": 24950, "train_speed(iter/s)": 0.202601 }, { "acc": 0.79028225, "epoch": 0.5823357004330771, "grad_norm": 14.0, "learning_rate": 8.315406621634048e-06, "loss": 0.76107903, "memory(GiB)": 135.77, "step": 24960, "train_speed(iter/s)": 0.202643 }, { "acc": 0.76247883, "epoch": 0.582569008005366, "grad_norm": 4.78125, "learning_rate": 8.313992295492794e-06, "loss": 0.87208462, "memory(GiB)": 135.77, "step": 24970, "train_speed(iter/s)": 0.202687 }, { "acc": 0.77712317, "epoch": 0.5828023155776549, "grad_norm": 4.59375, "learning_rate": 8.312577496280103e-06, "loss": 0.79026947, "memory(GiB)": 135.77, "step": 24980, "train_speed(iter/s)": 0.202728 }, { "acc": 0.76572886, "epoch": 0.5830356231499438, "grad_norm": 5.15625, "learning_rate": 8.311162224197938e-06, "loss": 0.84107924, "memory(GiB)": 135.77, "step": 24990, "train_speed(iter/s)": 0.202769 }, { "acc": 0.77434468, "epoch": 0.5832689307222327, "grad_norm": 5.53125, "learning_rate": 8.309746479448324e-06, "loss": 0.8128541, "memory(GiB)": 135.77, "step": 25000, "train_speed(iter/s)": 0.202812 }, { "epoch": 0.5832689307222327, "eval_acc": 0.7402269980435179, "eval_loss": 0.8210588693618774, "eval_runtime": 1270.1769, "eval_samples_per_second": 28.335, "eval_steps_per_second": 14.168, "step": 25000 }, { "acc": 0.78035855, "epoch": 0.5835022382945216, "grad_norm": 4.78125, "learning_rate": 8.308330262233366e-06, "loss": 0.78389635, "memory(GiB)": 135.77, "step": 25010, "train_speed(iter/s)": 0.200755 }, { "acc": 0.77793059, "epoch": 0.5837355458668105, "grad_norm": 4.03125, "learning_rate": 8.306913572755221e-06, "loss": 0.77305503, "memory(GiB)": 135.77, "step": 25020, "train_speed(iter/s)": 0.200798 }, { "acc": 0.77444048, "epoch": 0.5839688534390994, "grad_norm": 12.3125, "learning_rate": 8.305496411216125e-06, "loss": 0.82593975, "memory(GiB)": 135.77, "step": 25030, "train_speed(iter/s)": 0.200837 }, { "acc": 0.79068418, "epoch": 0.5842021610113883, "grad_norm": 4.96875, "learning_rate": 8.304078777818377e-06, "loss": 0.76902728, "memory(GiB)": 135.77, "step": 25040, "train_speed(iter/s)": 0.20088 }, { "acc": 0.78544216, "epoch": 0.5844354685836772, "grad_norm": 6.4375, "learning_rate": 8.302660672764343e-06, "loss": 0.77539878, "memory(GiB)": 135.77, "step": 25050, "train_speed(iter/s)": 0.200921 }, { "acc": 0.7708941, "epoch": 0.5846687761559661, "grad_norm": 6.03125, "learning_rate": 8.301242096256457e-06, "loss": 0.82189264, "memory(GiB)": 135.77, "step": 25060, "train_speed(iter/s)": 0.20096 }, { "acc": 0.76837645, "epoch": 0.584902083728255, "grad_norm": 5.75, "learning_rate": 8.299823048497221e-06, "loss": 0.8290554, "memory(GiB)": 135.77, "step": 25070, "train_speed(iter/s)": 0.201004 }, { "acc": 0.77567482, "epoch": 0.5851353913005439, "grad_norm": 5.28125, "learning_rate": 8.298403529689204e-06, "loss": 0.81522255, "memory(GiB)": 135.77, "step": 25080, "train_speed(iter/s)": 0.201047 }, { "acc": 0.78811507, "epoch": 0.5853686988728328, "grad_norm": 5.15625, "learning_rate": 8.296983540035041e-06, "loss": 0.73775659, "memory(GiB)": 135.77, "step": 25090, "train_speed(iter/s)": 0.201087 }, { "acc": 0.74818373, "epoch": 0.5856020064451217, "grad_norm": 4.75, "learning_rate": 8.295563079737436e-06, "loss": 0.91113939, "memory(GiB)": 135.77, "step": 25100, "train_speed(iter/s)": 0.201129 }, { "acc": 0.7771193, "epoch": 0.5858353140174106, "grad_norm": 5.46875, "learning_rate": 8.294142148999157e-06, "loss": 0.78849659, "memory(GiB)": 135.77, "step": 25110, "train_speed(iter/s)": 0.201173 }, { "acc": 0.77626028, "epoch": 0.5860686215896995, "grad_norm": 4.9375, "learning_rate": 8.292720748023045e-06, "loss": 0.78736038, "memory(GiB)": 135.77, "step": 25120, "train_speed(iter/s)": 0.201212 }, { "acc": 0.78426456, "epoch": 0.5863019291619883, "grad_norm": 5.71875, "learning_rate": 8.291298877012002e-06, "loss": 0.78405647, "memory(GiB)": 135.77, "step": 25130, "train_speed(iter/s)": 0.201251 }, { "acc": 0.77909927, "epoch": 0.5865352367342772, "grad_norm": 5.9375, "learning_rate": 8.289876536169002e-06, "loss": 0.80689306, "memory(GiB)": 135.77, "step": 25140, "train_speed(iter/s)": 0.201293 }, { "acc": 0.77817984, "epoch": 0.5867685443065661, "grad_norm": 5.46875, "learning_rate": 8.28845372569708e-06, "loss": 0.80842056, "memory(GiB)": 135.77, "step": 25150, "train_speed(iter/s)": 0.201335 }, { "acc": 0.76585131, "epoch": 0.587001851878855, "grad_norm": 5.3125, "learning_rate": 8.287030445799345e-06, "loss": 0.84683504, "memory(GiB)": 135.77, "step": 25160, "train_speed(iter/s)": 0.201377 }, { "acc": 0.76378751, "epoch": 0.5872351594511439, "grad_norm": 5.9375, "learning_rate": 8.285606696678969e-06, "loss": 0.85097218, "memory(GiB)": 135.77, "step": 25170, "train_speed(iter/s)": 0.201419 }, { "acc": 0.76524734, "epoch": 0.5874684670234328, "grad_norm": 6.75, "learning_rate": 8.28418247853919e-06, "loss": 0.82923918, "memory(GiB)": 135.77, "step": 25180, "train_speed(iter/s)": 0.201457 }, { "acc": 0.7815011, "epoch": 0.5877017745957217, "grad_norm": 5.5, "learning_rate": 8.282757791583316e-06, "loss": 0.78978734, "memory(GiB)": 135.77, "step": 25190, "train_speed(iter/s)": 0.201499 }, { "acc": 0.75179167, "epoch": 0.5879350821680106, "grad_norm": 5.09375, "learning_rate": 8.281332636014723e-06, "loss": 0.89616728, "memory(GiB)": 135.77, "step": 25200, "train_speed(iter/s)": 0.201541 }, { "acc": 0.77656918, "epoch": 0.5881683897402995, "grad_norm": 5.09375, "learning_rate": 8.279907012036849e-06, "loss": 0.80306749, "memory(GiB)": 135.77, "step": 25210, "train_speed(iter/s)": 0.201579 }, { "acc": 0.77237129, "epoch": 0.5884016973125884, "grad_norm": 5.5625, "learning_rate": 8.2784809198532e-06, "loss": 0.8121336, "memory(GiB)": 135.77, "step": 25220, "train_speed(iter/s)": 0.20162 }, { "acc": 0.7861938, "epoch": 0.5886350048848773, "grad_norm": 5.125, "learning_rate": 8.277054359667355e-06, "loss": 0.76683536, "memory(GiB)": 135.77, "step": 25230, "train_speed(iter/s)": 0.201662 }, { "acc": 0.75934463, "epoch": 0.5888683124571662, "grad_norm": 5.8125, "learning_rate": 8.27562733168295e-06, "loss": 0.87756844, "memory(GiB)": 135.77, "step": 25240, "train_speed(iter/s)": 0.201704 }, { "acc": 0.76303396, "epoch": 0.5891016200294551, "grad_norm": 5.34375, "learning_rate": 8.274199836103696e-06, "loss": 0.84503393, "memory(GiB)": 135.77, "step": 25250, "train_speed(iter/s)": 0.201746 }, { "acc": 0.77054796, "epoch": 0.589334927601744, "grad_norm": 11.0, "learning_rate": 8.272771873133365e-06, "loss": 0.83606682, "memory(GiB)": 135.77, "step": 25260, "train_speed(iter/s)": 0.201789 }, { "acc": 0.78724136, "epoch": 0.5895682351740329, "grad_norm": 5.46875, "learning_rate": 8.271343442975803e-06, "loss": 0.76417646, "memory(GiB)": 135.77, "step": 25270, "train_speed(iter/s)": 0.201829 }, { "acc": 0.75213413, "epoch": 0.5898015427463218, "grad_norm": 5.375, "learning_rate": 8.269914545834911e-06, "loss": 0.90432196, "memory(GiB)": 135.77, "step": 25280, "train_speed(iter/s)": 0.201869 }, { "acc": 0.76026382, "epoch": 0.5900348503186107, "grad_norm": 5.34375, "learning_rate": 8.26848518191467e-06, "loss": 0.88067017, "memory(GiB)": 135.77, "step": 25290, "train_speed(iter/s)": 0.201909 }, { "acc": 0.77908268, "epoch": 0.5902681578908996, "grad_norm": 3.953125, "learning_rate": 8.267055351419117e-06, "loss": 0.78281631, "memory(GiB)": 135.77, "step": 25300, "train_speed(iter/s)": 0.201951 }, { "acc": 0.78081703, "epoch": 0.5905014654631885, "grad_norm": 5.6875, "learning_rate": 8.265625054552363e-06, "loss": 0.7916995, "memory(GiB)": 135.77, "step": 25310, "train_speed(iter/s)": 0.201991 }, { "acc": 0.78177204, "epoch": 0.5907347730354773, "grad_norm": 8.9375, "learning_rate": 8.264194291518583e-06, "loss": 0.79910288, "memory(GiB)": 135.77, "step": 25320, "train_speed(iter/s)": 0.202033 }, { "acc": 0.75857491, "epoch": 0.5909680806077662, "grad_norm": 7.46875, "learning_rate": 8.262763062522013e-06, "loss": 0.87688332, "memory(GiB)": 135.77, "step": 25330, "train_speed(iter/s)": 0.202076 }, { "acc": 0.76642208, "epoch": 0.5912013881800551, "grad_norm": 6.375, "learning_rate": 8.261331367766965e-06, "loss": 0.83190136, "memory(GiB)": 135.77, "step": 25340, "train_speed(iter/s)": 0.202117 }, { "acc": 0.77461624, "epoch": 0.591434695752344, "grad_norm": 5.3125, "learning_rate": 8.25989920745781e-06, "loss": 0.80886202, "memory(GiB)": 135.77, "step": 25350, "train_speed(iter/s)": 0.202155 }, { "acc": 0.75091953, "epoch": 0.5916680033246329, "grad_norm": 5.25, "learning_rate": 8.258466581798992e-06, "loss": 0.90273914, "memory(GiB)": 135.77, "step": 25360, "train_speed(iter/s)": 0.202196 }, { "acc": 0.77260246, "epoch": 0.5919013108969218, "grad_norm": 5.8125, "learning_rate": 8.257033490995017e-06, "loss": 0.81039619, "memory(GiB)": 135.77, "step": 25370, "train_speed(iter/s)": 0.202235 }, { "acc": 0.745508, "epoch": 0.5921346184692107, "grad_norm": 5.40625, "learning_rate": 8.255599935250456e-06, "loss": 0.90657196, "memory(GiB)": 135.77, "step": 25380, "train_speed(iter/s)": 0.202278 }, { "acc": 0.76827936, "epoch": 0.5923679260414996, "grad_norm": 6.0625, "learning_rate": 8.254165914769949e-06, "loss": 0.85670547, "memory(GiB)": 135.77, "step": 25390, "train_speed(iter/s)": 0.202321 }, { "acc": 0.76772423, "epoch": 0.5926012336137885, "grad_norm": 4.84375, "learning_rate": 8.252731429758205e-06, "loss": 0.83695488, "memory(GiB)": 135.77, "step": 25400, "train_speed(iter/s)": 0.202362 }, { "acc": 0.78266163, "epoch": 0.5928345411860774, "grad_norm": 5.65625, "learning_rate": 8.251296480419992e-06, "loss": 0.7788908, "memory(GiB)": 135.77, "step": 25410, "train_speed(iter/s)": 0.202402 }, { "acc": 0.77551551, "epoch": 0.5930678487583663, "grad_norm": 4.5625, "learning_rate": 8.249861066960154e-06, "loss": 0.80347424, "memory(GiB)": 135.77, "step": 25420, "train_speed(iter/s)": 0.202442 }, { "acc": 0.75031691, "epoch": 0.5933011563306552, "grad_norm": 5.25, "learning_rate": 8.248425189583589e-06, "loss": 0.90826054, "memory(GiB)": 135.77, "step": 25430, "train_speed(iter/s)": 0.202484 }, { "acc": 0.78029752, "epoch": 0.5935344639029441, "grad_norm": 6.40625, "learning_rate": 8.246988848495275e-06, "loss": 0.79855204, "memory(GiB)": 135.77, "step": 25440, "train_speed(iter/s)": 0.202525 }, { "acc": 0.77923479, "epoch": 0.593767771475233, "grad_norm": 4.5625, "learning_rate": 8.245552043900245e-06, "loss": 0.80320797, "memory(GiB)": 135.77, "step": 25450, "train_speed(iter/s)": 0.202567 }, { "acc": 0.76215806, "epoch": 0.5940010790475219, "grad_norm": 4.53125, "learning_rate": 8.244114776003605e-06, "loss": 0.88092327, "memory(GiB)": 135.77, "step": 25460, "train_speed(iter/s)": 0.202603 }, { "acc": 0.7797924, "epoch": 0.5942343866198108, "grad_norm": 5.6875, "learning_rate": 8.24267704501052e-06, "loss": 0.78126807, "memory(GiB)": 135.77, "step": 25470, "train_speed(iter/s)": 0.202644 }, { "acc": 0.78742337, "epoch": 0.5944676941920997, "grad_norm": 4.84375, "learning_rate": 8.241238851126231e-06, "loss": 0.78035583, "memory(GiB)": 135.77, "step": 25480, "train_speed(iter/s)": 0.202687 }, { "acc": 0.77751136, "epoch": 0.5947010017643886, "grad_norm": 7.8125, "learning_rate": 8.239800194556036e-06, "loss": 0.80985718, "memory(GiB)": 135.77, "step": 25490, "train_speed(iter/s)": 0.202728 }, { "acc": 0.76632509, "epoch": 0.5949343093366775, "grad_norm": 4.8125, "learning_rate": 8.238361075505307e-06, "loss": 0.8863204, "memory(GiB)": 135.77, "step": 25500, "train_speed(iter/s)": 0.202766 }, { "epoch": 0.5949343093366775, "eval_acc": 0.7401671041006372, "eval_loss": 0.8206676840782166, "eval_runtime": 1268.9058, "eval_samples_per_second": 28.364, "eval_steps_per_second": 14.182, "step": 25500 }, { "acc": 0.77610083, "epoch": 0.5951676169089664, "grad_norm": 6.28125, "learning_rate": 8.236921494179474e-06, "loss": 0.79520016, "memory(GiB)": 135.77, "step": 25510, "train_speed(iter/s)": 0.200756 }, { "acc": 0.76967053, "epoch": 0.5954009244812551, "grad_norm": 4.5625, "learning_rate": 8.235481450784037e-06, "loss": 0.82552261, "memory(GiB)": 135.77, "step": 25520, "train_speed(iter/s)": 0.200796 }, { "acc": 0.78743782, "epoch": 0.595634232053544, "grad_norm": 5.78125, "learning_rate": 8.234040945524563e-06, "loss": 0.78872375, "memory(GiB)": 135.77, "step": 25530, "train_speed(iter/s)": 0.200836 }, { "acc": 0.77408667, "epoch": 0.5958675396258329, "grad_norm": 5.8125, "learning_rate": 8.232599978606683e-06, "loss": 0.82129602, "memory(GiB)": 135.77, "step": 25540, "train_speed(iter/s)": 0.200877 }, { "acc": 0.76378164, "epoch": 0.5961008471981218, "grad_norm": 7.125, "learning_rate": 8.231158550236098e-06, "loss": 0.86470642, "memory(GiB)": 135.77, "step": 25550, "train_speed(iter/s)": 0.200919 }, { "acc": 0.77116547, "epoch": 0.5963341547704107, "grad_norm": 4.8125, "learning_rate": 8.229716660618567e-06, "loss": 0.80563984, "memory(GiB)": 135.77, "step": 25560, "train_speed(iter/s)": 0.200961 }, { "acc": 0.78916759, "epoch": 0.5965674623426996, "grad_norm": 5.0, "learning_rate": 8.22827430995992e-06, "loss": 0.74210978, "memory(GiB)": 135.77, "step": 25570, "train_speed(iter/s)": 0.201001 }, { "acc": 0.76487999, "epoch": 0.5968007699149885, "grad_norm": 5.75, "learning_rate": 8.226831498466054e-06, "loss": 0.86971359, "memory(GiB)": 135.77, "step": 25580, "train_speed(iter/s)": 0.201044 }, { "acc": 0.77532277, "epoch": 0.5970340774872774, "grad_norm": 5.53125, "learning_rate": 8.22538822634293e-06, "loss": 0.79406919, "memory(GiB)": 135.77, "step": 25590, "train_speed(iter/s)": 0.201083 }, { "acc": 0.78031688, "epoch": 0.5972673850595663, "grad_norm": 4.90625, "learning_rate": 8.223944493796572e-06, "loss": 0.77325659, "memory(GiB)": 135.77, "step": 25600, "train_speed(iter/s)": 0.201122 }, { "acc": 0.78207927, "epoch": 0.5975006926318552, "grad_norm": 12.0625, "learning_rate": 8.222500301033075e-06, "loss": 0.78344164, "memory(GiB)": 135.77, "step": 25610, "train_speed(iter/s)": 0.20116 }, { "acc": 0.76561384, "epoch": 0.5977340002041441, "grad_norm": 5.1875, "learning_rate": 8.221055648258596e-06, "loss": 0.85672531, "memory(GiB)": 135.77, "step": 25620, "train_speed(iter/s)": 0.201199 }, { "acc": 0.76766024, "epoch": 0.597967307776433, "grad_norm": 5.0, "learning_rate": 8.21961053567936e-06, "loss": 0.82406807, "memory(GiB)": 135.77, "step": 25630, "train_speed(iter/s)": 0.201242 }, { "acc": 0.79372106, "epoch": 0.5982006153487219, "grad_norm": 6.25, "learning_rate": 8.218164963501651e-06, "loss": 0.71891031, "memory(GiB)": 135.77, "step": 25640, "train_speed(iter/s)": 0.201286 }, { "acc": 0.76460505, "epoch": 0.5984339229210108, "grad_norm": 5.375, "learning_rate": 8.216718931931832e-06, "loss": 0.83040361, "memory(GiB)": 135.77, "step": 25650, "train_speed(iter/s)": 0.201327 }, { "acc": 0.77256279, "epoch": 0.5986672304932997, "grad_norm": 10.75, "learning_rate": 8.21527244117632e-06, "loss": 0.82293129, "memory(GiB)": 135.77, "step": 25660, "train_speed(iter/s)": 0.201372 }, { "acc": 0.79064188, "epoch": 0.5989005380655886, "grad_norm": 5.90625, "learning_rate": 8.2138254914416e-06, "loss": 0.72057476, "memory(GiB)": 135.77, "step": 25670, "train_speed(iter/s)": 0.201413 }, { "acc": 0.77393141, "epoch": 0.5991338456378775, "grad_norm": 5.8125, "learning_rate": 8.212378082934225e-06, "loss": 0.84098873, "memory(GiB)": 135.77, "step": 25680, "train_speed(iter/s)": 0.201451 }, { "acc": 0.78354292, "epoch": 0.5993671532101664, "grad_norm": 4.9375, "learning_rate": 8.210930215860812e-06, "loss": 0.77515049, "memory(GiB)": 135.77, "step": 25690, "train_speed(iter/s)": 0.201493 }, { "acc": 0.76722078, "epoch": 0.5996004607824553, "grad_norm": 5.03125, "learning_rate": 8.209481890428044e-06, "loss": 0.85564814, "memory(GiB)": 135.77, "step": 25700, "train_speed(iter/s)": 0.201533 }, { "acc": 0.78583412, "epoch": 0.5998337683547441, "grad_norm": 4.25, "learning_rate": 8.208033106842668e-06, "loss": 0.79263468, "memory(GiB)": 135.77, "step": 25710, "train_speed(iter/s)": 0.201569 }, { "acc": 0.77676077, "epoch": 0.600067075927033, "grad_norm": 7.96875, "learning_rate": 8.206583865311497e-06, "loss": 0.79163513, "memory(GiB)": 135.77, "step": 25720, "train_speed(iter/s)": 0.201609 }, { "acc": 0.78243885, "epoch": 0.6003003834993219, "grad_norm": 4.96875, "learning_rate": 8.205134166041412e-06, "loss": 0.79570303, "memory(GiB)": 135.77, "step": 25730, "train_speed(iter/s)": 0.201648 }, { "acc": 0.7864254, "epoch": 0.6005336910716108, "grad_norm": 5.4375, "learning_rate": 8.203684009239356e-06, "loss": 0.76417632, "memory(GiB)": 135.77, "step": 25740, "train_speed(iter/s)": 0.201687 }, { "acc": 0.76960988, "epoch": 0.6007669986438997, "grad_norm": 5.75, "learning_rate": 8.202233395112338e-06, "loss": 0.82158051, "memory(GiB)": 135.77, "step": 25750, "train_speed(iter/s)": 0.201728 }, { "acc": 0.77994032, "epoch": 0.6010003062161886, "grad_norm": 6.03125, "learning_rate": 8.200782323867432e-06, "loss": 0.82229548, "memory(GiB)": 135.77, "step": 25760, "train_speed(iter/s)": 0.201767 }, { "acc": 0.77866874, "epoch": 0.6012336137884775, "grad_norm": 5.96875, "learning_rate": 8.19933079571178e-06, "loss": 0.78451872, "memory(GiB)": 135.77, "step": 25770, "train_speed(iter/s)": 0.201805 }, { "acc": 0.78407946, "epoch": 0.6014669213607664, "grad_norm": 4.84375, "learning_rate": 8.197878810852587e-06, "loss": 0.76278973, "memory(GiB)": 135.77, "step": 25780, "train_speed(iter/s)": 0.201846 }, { "acc": 0.78200378, "epoch": 0.6017002289330553, "grad_norm": 7.875, "learning_rate": 8.196426369497121e-06, "loss": 0.78061686, "memory(GiB)": 135.77, "step": 25790, "train_speed(iter/s)": 0.201885 }, { "acc": 0.76984148, "epoch": 0.6019335365053442, "grad_norm": 7.0625, "learning_rate": 8.19497347185272e-06, "loss": 0.82394571, "memory(GiB)": 135.77, "step": 25800, "train_speed(iter/s)": 0.201928 }, { "acc": 0.74774513, "epoch": 0.6021668440776331, "grad_norm": 4.90625, "learning_rate": 8.193520118126785e-06, "loss": 0.89157257, "memory(GiB)": 135.77, "step": 25810, "train_speed(iter/s)": 0.201969 }, { "acc": 0.77559814, "epoch": 0.602400151649922, "grad_norm": 6.15625, "learning_rate": 8.19206630852678e-06, "loss": 0.78932376, "memory(GiB)": 135.77, "step": 25820, "train_speed(iter/s)": 0.202011 }, { "acc": 0.76863947, "epoch": 0.6026334592222109, "grad_norm": 5.9375, "learning_rate": 8.190612043260238e-06, "loss": 0.83424625, "memory(GiB)": 135.77, "step": 25830, "train_speed(iter/s)": 0.202052 }, { "acc": 0.73870344, "epoch": 0.6028667667944998, "grad_norm": 5.15625, "learning_rate": 8.189157322534753e-06, "loss": 0.97275181, "memory(GiB)": 135.77, "step": 25840, "train_speed(iter/s)": 0.202089 }, { "acc": 0.77539172, "epoch": 0.6031000743667887, "grad_norm": 6.1875, "learning_rate": 8.187702146557986e-06, "loss": 0.78618755, "memory(GiB)": 135.77, "step": 25850, "train_speed(iter/s)": 0.20213 }, { "acc": 0.76942921, "epoch": 0.6033333819390776, "grad_norm": 4.21875, "learning_rate": 8.186246515537664e-06, "loss": 0.83434811, "memory(GiB)": 135.77, "step": 25860, "train_speed(iter/s)": 0.202171 }, { "acc": 0.78348265, "epoch": 0.6035666895113665, "grad_norm": 6.8125, "learning_rate": 8.184790429681577e-06, "loss": 0.77311735, "memory(GiB)": 135.77, "step": 25870, "train_speed(iter/s)": 0.202209 }, { "acc": 0.76982718, "epoch": 0.6037999970836554, "grad_norm": 5.125, "learning_rate": 8.183333889197582e-06, "loss": 0.84020672, "memory(GiB)": 135.77, "step": 25880, "train_speed(iter/s)": 0.202249 }, { "acc": 0.77111564, "epoch": 0.6040333046559443, "grad_norm": 5.75, "learning_rate": 8.181876894293601e-06, "loss": 0.80620203, "memory(GiB)": 135.77, "step": 25890, "train_speed(iter/s)": 0.202291 }, { "acc": 0.77545085, "epoch": 0.6042666122282331, "grad_norm": 5.8125, "learning_rate": 8.180419445177614e-06, "loss": 0.81389084, "memory(GiB)": 135.77, "step": 25900, "train_speed(iter/s)": 0.202331 }, { "acc": 0.77913561, "epoch": 0.604499919800522, "grad_norm": 5.875, "learning_rate": 8.178961542057677e-06, "loss": 0.79586811, "memory(GiB)": 135.77, "step": 25910, "train_speed(iter/s)": 0.20237 }, { "acc": 0.78647404, "epoch": 0.6047332273728109, "grad_norm": 6.15625, "learning_rate": 8.177503185141904e-06, "loss": 0.77673407, "memory(GiB)": 135.77, "step": 25920, "train_speed(iter/s)": 0.202411 }, { "acc": 0.78315163, "epoch": 0.6049665349450998, "grad_norm": 11.3125, "learning_rate": 8.176044374638473e-06, "loss": 0.767239, "memory(GiB)": 135.77, "step": 25930, "train_speed(iter/s)": 0.202451 }, { "acc": 0.76807127, "epoch": 0.6051998425173887, "grad_norm": 5.6875, "learning_rate": 8.174585110755631e-06, "loss": 0.86293449, "memory(GiB)": 135.77, "step": 25940, "train_speed(iter/s)": 0.202491 }, { "acc": 0.78088841, "epoch": 0.6054331500896776, "grad_norm": 4.5, "learning_rate": 8.173125393701686e-06, "loss": 0.7886466, "memory(GiB)": 135.77, "step": 25950, "train_speed(iter/s)": 0.202533 }, { "acc": 0.77306123, "epoch": 0.6056664576619665, "grad_norm": 7.28125, "learning_rate": 8.171665223685014e-06, "loss": 0.81952877, "memory(GiB)": 135.77, "step": 25960, "train_speed(iter/s)": 0.202574 }, { "acc": 0.78507872, "epoch": 0.6058997652342554, "grad_norm": 4.78125, "learning_rate": 8.170204600914051e-06, "loss": 0.76790247, "memory(GiB)": 135.77, "step": 25970, "train_speed(iter/s)": 0.202612 }, { "acc": 0.76558475, "epoch": 0.6061330728065443, "grad_norm": 6.09375, "learning_rate": 8.168743525597304e-06, "loss": 0.86151485, "memory(GiB)": 135.77, "step": 25980, "train_speed(iter/s)": 0.20265 }, { "acc": 0.75887089, "epoch": 0.6063663803788332, "grad_norm": 6.46875, "learning_rate": 8.167281997943338e-06, "loss": 0.87729855, "memory(GiB)": 135.77, "step": 25990, "train_speed(iter/s)": 0.202691 }, { "acc": 0.7482584, "epoch": 0.6065996879511221, "grad_norm": 7.3125, "learning_rate": 8.165820018160787e-06, "loss": 0.91938248, "memory(GiB)": 135.77, "step": 26000, "train_speed(iter/s)": 0.202728 }, { "epoch": 0.6065996879511221, "eval_acc": 0.7404893142960276, "eval_loss": 0.8198909163475037, "eval_runtime": 1269.2964, "eval_samples_per_second": 28.355, "eval_steps_per_second": 14.178, "step": 26000 }, { "acc": 0.78726702, "epoch": 0.606832995523411, "grad_norm": 6.21875, "learning_rate": 8.164357586458348e-06, "loss": 0.75815754, "memory(GiB)": 135.77, "step": 26010, "train_speed(iter/s)": 0.200755 }, { "acc": 0.7841042, "epoch": 0.6070663030956999, "grad_norm": 20.0, "learning_rate": 8.162894703044783e-06, "loss": 0.7632184, "memory(GiB)": 135.77, "step": 26020, "train_speed(iter/s)": 0.200792 }, { "acc": 0.75316358, "epoch": 0.6072996106679888, "grad_norm": 4.625, "learning_rate": 8.161431368128919e-06, "loss": 0.89499702, "memory(GiB)": 135.77, "step": 26030, "train_speed(iter/s)": 0.200832 }, { "acc": 0.78723059, "epoch": 0.6075329182402777, "grad_norm": 6.0625, "learning_rate": 8.159967581919644e-06, "loss": 0.74924011, "memory(GiB)": 135.77, "step": 26040, "train_speed(iter/s)": 0.200871 }, { "acc": 0.79428697, "epoch": 0.6077662258125666, "grad_norm": 5.90625, "learning_rate": 8.158503344625915e-06, "loss": 0.73409123, "memory(GiB)": 135.77, "step": 26050, "train_speed(iter/s)": 0.200911 }, { "acc": 0.76840878, "epoch": 0.6079995333848555, "grad_norm": 5.53125, "learning_rate": 8.157038656456752e-06, "loss": 0.83939495, "memory(GiB)": 135.77, "step": 26060, "train_speed(iter/s)": 0.200951 }, { "acc": 0.77537632, "epoch": 0.6082328409571444, "grad_norm": 4.09375, "learning_rate": 8.155573517621238e-06, "loss": 0.81753826, "memory(GiB)": 135.77, "step": 26070, "train_speed(iter/s)": 0.20099 }, { "acc": 0.7842936, "epoch": 0.6084661485294333, "grad_norm": 6.59375, "learning_rate": 8.154107928328521e-06, "loss": 0.79942732, "memory(GiB)": 135.77, "step": 26080, "train_speed(iter/s)": 0.201029 }, { "acc": 0.77272925, "epoch": 0.608699456101722, "grad_norm": 6.71875, "learning_rate": 8.152641888787812e-06, "loss": 0.80734158, "memory(GiB)": 135.77, "step": 26090, "train_speed(iter/s)": 0.201068 }, { "acc": 0.76670575, "epoch": 0.608932763674011, "grad_norm": 7.75, "learning_rate": 8.15117539920839e-06, "loss": 0.85535202, "memory(GiB)": 135.77, "step": 26100, "train_speed(iter/s)": 0.201109 }, { "acc": 0.75531483, "epoch": 0.6091660712462998, "grad_norm": 6.09375, "learning_rate": 8.149708459799595e-06, "loss": 0.8802372, "memory(GiB)": 135.77, "step": 26110, "train_speed(iter/s)": 0.20115 }, { "acc": 0.76685176, "epoch": 0.6093993788185887, "grad_norm": 6.0, "learning_rate": 8.148241070770834e-06, "loss": 0.83814058, "memory(GiB)": 135.77, "step": 26120, "train_speed(iter/s)": 0.20119 }, { "acc": 0.7886631, "epoch": 0.6096326863908776, "grad_norm": 9.1875, "learning_rate": 8.146773232331574e-06, "loss": 0.77238207, "memory(GiB)": 135.77, "step": 26130, "train_speed(iter/s)": 0.201229 }, { "acc": 0.75037575, "epoch": 0.6098659939631665, "grad_norm": 5.65625, "learning_rate": 8.145304944691347e-06, "loss": 0.90184555, "memory(GiB)": 135.77, "step": 26140, "train_speed(iter/s)": 0.201269 }, { "acc": 0.7713768, "epoch": 0.6100993015354554, "grad_norm": 7.84375, "learning_rate": 8.143836208059754e-06, "loss": 0.80705109, "memory(GiB)": 135.77, "step": 26150, "train_speed(iter/s)": 0.20131 }, { "acc": 0.77182598, "epoch": 0.6103326091077443, "grad_norm": 4.78125, "learning_rate": 8.142367022646457e-06, "loss": 0.83266945, "memory(GiB)": 135.77, "step": 26160, "train_speed(iter/s)": 0.201352 }, { "acc": 0.77788215, "epoch": 0.6105659166800332, "grad_norm": 5.8125, "learning_rate": 8.14089738866118e-06, "loss": 0.80510206, "memory(GiB)": 135.77, "step": 26170, "train_speed(iter/s)": 0.201388 }, { "acc": 0.77358456, "epoch": 0.6107992242523221, "grad_norm": 4.59375, "learning_rate": 8.139427306313713e-06, "loss": 0.81454687, "memory(GiB)": 135.77, "step": 26180, "train_speed(iter/s)": 0.201426 }, { "acc": 0.78963194, "epoch": 0.611032531824611, "grad_norm": 7.6875, "learning_rate": 8.137956775813909e-06, "loss": 0.75747194, "memory(GiB)": 135.77, "step": 26190, "train_speed(iter/s)": 0.201467 }, { "acc": 0.76336627, "epoch": 0.6112658393968999, "grad_norm": 4.6875, "learning_rate": 8.136485797371687e-06, "loss": 0.84080048, "memory(GiB)": 135.77, "step": 26200, "train_speed(iter/s)": 0.201506 }, { "acc": 0.772651, "epoch": 0.6114991469691888, "grad_norm": 4.78125, "learning_rate": 8.13501437119703e-06, "loss": 0.81818466, "memory(GiB)": 135.77, "step": 26210, "train_speed(iter/s)": 0.201547 }, { "acc": 0.75983801, "epoch": 0.6117324545414777, "grad_norm": 7.03125, "learning_rate": 8.133542497499981e-06, "loss": 0.85904999, "memory(GiB)": 135.77, "step": 26220, "train_speed(iter/s)": 0.201587 }, { "acc": 0.77595029, "epoch": 0.6119657621137666, "grad_norm": 6.8125, "learning_rate": 8.132070176490652e-06, "loss": 0.79862223, "memory(GiB)": 135.77, "step": 26230, "train_speed(iter/s)": 0.201625 }, { "acc": 0.76928434, "epoch": 0.6121990696860555, "grad_norm": 6.28125, "learning_rate": 8.130597408379214e-06, "loss": 0.80921583, "memory(GiB)": 135.77, "step": 26240, "train_speed(iter/s)": 0.201663 }, { "acc": 0.76527472, "epoch": 0.6124323772583444, "grad_norm": 4.28125, "learning_rate": 8.129124193375906e-06, "loss": 0.83155708, "memory(GiB)": 135.77, "step": 26250, "train_speed(iter/s)": 0.201702 }, { "acc": 0.79154787, "epoch": 0.6126656848306333, "grad_norm": 5.53125, "learning_rate": 8.127650531691028e-06, "loss": 0.74951324, "memory(GiB)": 135.77, "step": 26260, "train_speed(iter/s)": 0.201743 }, { "acc": 0.78243771, "epoch": 0.6128989924029222, "grad_norm": 5.8125, "learning_rate": 8.126176423534945e-06, "loss": 0.76880646, "memory(GiB)": 135.77, "step": 26270, "train_speed(iter/s)": 0.201785 }, { "acc": 0.77010064, "epoch": 0.6131322999752111, "grad_norm": 5.34375, "learning_rate": 8.124701869118086e-06, "loss": 0.84182453, "memory(GiB)": 135.77, "step": 26280, "train_speed(iter/s)": 0.201823 }, { "acc": 0.7652318, "epoch": 0.6133656075474999, "grad_norm": 4.8125, "learning_rate": 8.123226868650944e-06, "loss": 0.8603693, "memory(GiB)": 135.77, "step": 26290, "train_speed(iter/s)": 0.201861 }, { "acc": 0.77005825, "epoch": 0.6135989151197888, "grad_norm": 8.875, "learning_rate": 8.121751422344072e-06, "loss": 0.81795826, "memory(GiB)": 135.77, "step": 26300, "train_speed(iter/s)": 0.201901 }, { "acc": 0.78708076, "epoch": 0.6138322226920777, "grad_norm": 6.1875, "learning_rate": 8.120275530408092e-06, "loss": 0.74994001, "memory(GiB)": 135.77, "step": 26310, "train_speed(iter/s)": 0.201942 }, { "acc": 0.78978472, "epoch": 0.6140655302643666, "grad_norm": 6.03125, "learning_rate": 8.118799193053686e-06, "loss": 0.74331207, "memory(GiB)": 135.77, "step": 26320, "train_speed(iter/s)": 0.201979 }, { "acc": 0.75865965, "epoch": 0.6142988378366555, "grad_norm": 5.6875, "learning_rate": 8.117322410491602e-06, "loss": 0.86435623, "memory(GiB)": 135.77, "step": 26330, "train_speed(iter/s)": 0.202018 }, { "acc": 0.77246876, "epoch": 0.6145321454089444, "grad_norm": 6.625, "learning_rate": 8.11584518293265e-06, "loss": 0.81503639, "memory(GiB)": 135.77, "step": 26340, "train_speed(iter/s)": 0.202058 }, { "acc": 0.76987753, "epoch": 0.6147654529812333, "grad_norm": 5.15625, "learning_rate": 8.114367510587701e-06, "loss": 0.82482548, "memory(GiB)": 135.77, "step": 26350, "train_speed(iter/s)": 0.202097 }, { "acc": 0.76622534, "epoch": 0.6149987605535222, "grad_norm": 6.28125, "learning_rate": 8.112889393667698e-06, "loss": 0.83077908, "memory(GiB)": 135.77, "step": 26360, "train_speed(iter/s)": 0.202137 }, { "acc": 0.77782397, "epoch": 0.6152320681258111, "grad_norm": 4.25, "learning_rate": 8.111410832383635e-06, "loss": 0.77891083, "memory(GiB)": 135.77, "step": 26370, "train_speed(iter/s)": 0.202173 }, { "acc": 0.79491034, "epoch": 0.6154653756981, "grad_norm": 4.21875, "learning_rate": 8.109931826946582e-06, "loss": 0.73813534, "memory(GiB)": 135.77, "step": 26380, "train_speed(iter/s)": 0.202211 }, { "acc": 0.78529496, "epoch": 0.6156986832703889, "grad_norm": 4.6875, "learning_rate": 8.108452377567663e-06, "loss": 0.77241392, "memory(GiB)": 135.77, "step": 26390, "train_speed(iter/s)": 0.202246 }, { "acc": 0.76942668, "epoch": 0.6159319908426778, "grad_norm": 6.875, "learning_rate": 8.10697248445807e-06, "loss": 0.82725449, "memory(GiB)": 135.77, "step": 26400, "train_speed(iter/s)": 0.202286 }, { "acc": 0.77350941, "epoch": 0.6161652984149667, "grad_norm": 4.5, "learning_rate": 8.105492147829059e-06, "loss": 0.81450577, "memory(GiB)": 135.77, "step": 26410, "train_speed(iter/s)": 0.202328 }, { "acc": 0.7737443, "epoch": 0.6163986059872556, "grad_norm": 5.6875, "learning_rate": 8.104011367891944e-06, "loss": 0.81999092, "memory(GiB)": 135.77, "step": 26420, "train_speed(iter/s)": 0.202367 }, { "acc": 0.78863039, "epoch": 0.6166319135595445, "grad_norm": 4.96875, "learning_rate": 8.102530144858109e-06, "loss": 0.77086744, "memory(GiB)": 135.77, "step": 26430, "train_speed(iter/s)": 0.202406 }, { "acc": 0.77370825, "epoch": 0.6168652211318334, "grad_norm": 4.46875, "learning_rate": 8.101048478938997e-06, "loss": 0.78342042, "memory(GiB)": 135.77, "step": 26440, "train_speed(iter/s)": 0.202446 }, { "acc": 0.78318248, "epoch": 0.6170985287041223, "grad_norm": 4.65625, "learning_rate": 8.099566370346115e-06, "loss": 0.78425016, "memory(GiB)": 135.77, "step": 26450, "train_speed(iter/s)": 0.202487 }, { "acc": 0.78459816, "epoch": 0.6173318362764112, "grad_norm": 5.4375, "learning_rate": 8.098083819291034e-06, "loss": 0.78062539, "memory(GiB)": 135.77, "step": 26460, "train_speed(iter/s)": 0.202527 }, { "acc": 0.77326651, "epoch": 0.6175651438487001, "grad_norm": 4.875, "learning_rate": 8.096600825985388e-06, "loss": 0.79871411, "memory(GiB)": 135.77, "step": 26470, "train_speed(iter/s)": 0.202567 }, { "acc": 0.75785589, "epoch": 0.6177984514209889, "grad_norm": 4.96875, "learning_rate": 8.095117390640875e-06, "loss": 0.87607994, "memory(GiB)": 135.77, "step": 26480, "train_speed(iter/s)": 0.202605 }, { "acc": 0.80151081, "epoch": 0.6180317589932778, "grad_norm": 7.53125, "learning_rate": 8.093633513469252e-06, "loss": 0.70373769, "memory(GiB)": 135.77, "step": 26490, "train_speed(iter/s)": 0.202645 }, { "acc": 0.78222547, "epoch": 0.6182650665655667, "grad_norm": 6.90625, "learning_rate": 8.092149194682343e-06, "loss": 0.78799133, "memory(GiB)": 135.77, "step": 26500, "train_speed(iter/s)": 0.202684 }, { "epoch": 0.6182650665655667, "eval_acc": 0.7407008648268977, "eval_loss": 0.8189985156059265, "eval_runtime": 1271.9101, "eval_samples_per_second": 28.297, "eval_steps_per_second": 14.149, "step": 26500 }, { "acc": 0.77204461, "epoch": 0.6184983741378556, "grad_norm": 9.25, "learning_rate": 8.090664434492037e-06, "loss": 0.81106329, "memory(GiB)": 135.77, "step": 26510, "train_speed(iter/s)": 0.200744 }, { "acc": 0.79130363, "epoch": 0.6187316817101445, "grad_norm": 4.65625, "learning_rate": 8.08917923311028e-06, "loss": 0.7358408, "memory(GiB)": 135.77, "step": 26520, "train_speed(iter/s)": 0.200781 }, { "acc": 0.78020353, "epoch": 0.6189649892824334, "grad_norm": 6.3125, "learning_rate": 8.087693590749083e-06, "loss": 0.80418053, "memory(GiB)": 135.77, "step": 26530, "train_speed(iter/s)": 0.200819 }, { "acc": 0.79271488, "epoch": 0.6191982968547223, "grad_norm": 17.625, "learning_rate": 8.086207507620524e-06, "loss": 0.71984701, "memory(GiB)": 135.77, "step": 26540, "train_speed(iter/s)": 0.200856 }, { "acc": 0.7886724, "epoch": 0.6194316044270112, "grad_norm": 6.34375, "learning_rate": 8.084720983936742e-06, "loss": 0.75640163, "memory(GiB)": 135.77, "step": 26550, "train_speed(iter/s)": 0.200895 }, { "acc": 0.776264, "epoch": 0.6196649119993001, "grad_norm": 5.84375, "learning_rate": 8.083234019909933e-06, "loss": 0.79287157, "memory(GiB)": 135.77, "step": 26560, "train_speed(iter/s)": 0.200932 }, { "acc": 0.77901525, "epoch": 0.619898219571589, "grad_norm": 7.875, "learning_rate": 8.081746615752365e-06, "loss": 0.80261803, "memory(GiB)": 135.77, "step": 26570, "train_speed(iter/s)": 0.20097 }, { "acc": 0.77423306, "epoch": 0.6201315271438779, "grad_norm": 13.75, "learning_rate": 8.080258771676363e-06, "loss": 0.78158264, "memory(GiB)": 135.77, "step": 26580, "train_speed(iter/s)": 0.20101 }, { "acc": 0.79111242, "epoch": 0.6203648347161668, "grad_norm": 5.25, "learning_rate": 8.078770487894314e-06, "loss": 0.75197382, "memory(GiB)": 135.77, "step": 26590, "train_speed(iter/s)": 0.201049 }, { "acc": 0.76238708, "epoch": 0.6205981422884557, "grad_norm": 7.65625, "learning_rate": 8.077281764618674e-06, "loss": 0.85257568, "memory(GiB)": 135.77, "step": 26600, "train_speed(iter/s)": 0.201088 }, { "acc": 0.77836542, "epoch": 0.6208314498607446, "grad_norm": 4.625, "learning_rate": 8.075792602061955e-06, "loss": 0.79014702, "memory(GiB)": 135.77, "step": 26610, "train_speed(iter/s)": 0.201128 }, { "acc": 0.77676468, "epoch": 0.6210647574330335, "grad_norm": 4.90625, "learning_rate": 8.074303000436737e-06, "loss": 0.79348154, "memory(GiB)": 135.77, "step": 26620, "train_speed(iter/s)": 0.201168 }, { "acc": 0.7470377, "epoch": 0.6212980650053224, "grad_norm": 6.5625, "learning_rate": 8.072812959955657e-06, "loss": 0.92623911, "memory(GiB)": 135.77, "step": 26630, "train_speed(iter/s)": 0.201208 }, { "acc": 0.78787136, "epoch": 0.6215313725776113, "grad_norm": 5.0625, "learning_rate": 8.071322480831422e-06, "loss": 0.77648373, "memory(GiB)": 135.77, "step": 26640, "train_speed(iter/s)": 0.201247 }, { "acc": 0.7703825, "epoch": 0.6217646801499002, "grad_norm": 4.25, "learning_rate": 8.069831563276793e-06, "loss": 0.82653189, "memory(GiB)": 135.77, "step": 26650, "train_speed(iter/s)": 0.201288 }, { "acc": 0.76768293, "epoch": 0.6219979877221891, "grad_norm": 4.9375, "learning_rate": 8.068340207504601e-06, "loss": 0.84485979, "memory(GiB)": 135.77, "step": 26660, "train_speed(iter/s)": 0.201326 }, { "acc": 0.78323984, "epoch": 0.6222312952944778, "grad_norm": 5.9375, "learning_rate": 8.066848413727736e-06, "loss": 0.77385492, "memory(GiB)": 135.77, "step": 26670, "train_speed(iter/s)": 0.201365 }, { "acc": 0.77316294, "epoch": 0.6224646028667667, "grad_norm": 15.5, "learning_rate": 8.06535618215915e-06, "loss": 0.83223066, "memory(GiB)": 135.77, "step": 26680, "train_speed(iter/s)": 0.201403 }, { "acc": 0.77781687, "epoch": 0.6226979104390556, "grad_norm": 6.1875, "learning_rate": 8.06386351301186e-06, "loss": 0.80505581, "memory(GiB)": 135.77, "step": 26690, "train_speed(iter/s)": 0.201442 }, { "acc": 0.78201003, "epoch": 0.6229312180113445, "grad_norm": 4.96875, "learning_rate": 8.062370406498944e-06, "loss": 0.77910857, "memory(GiB)": 135.77, "step": 26700, "train_speed(iter/s)": 0.20148 }, { "acc": 0.75401936, "epoch": 0.6231645255836334, "grad_norm": 4.1875, "learning_rate": 8.060876862833543e-06, "loss": 0.88524904, "memory(GiB)": 135.77, "step": 26710, "train_speed(iter/s)": 0.201519 }, { "acc": 0.77211714, "epoch": 0.6233978331559223, "grad_norm": 4.5625, "learning_rate": 8.059382882228857e-06, "loss": 0.81682949, "memory(GiB)": 135.77, "step": 26720, "train_speed(iter/s)": 0.201559 }, { "acc": 0.77653065, "epoch": 0.6236311407282112, "grad_norm": 3.890625, "learning_rate": 8.057888464898153e-06, "loss": 0.81049767, "memory(GiB)": 135.77, "step": 26730, "train_speed(iter/s)": 0.201597 }, { "acc": 0.76580505, "epoch": 0.6238644483005001, "grad_norm": 6.96875, "learning_rate": 8.056393611054761e-06, "loss": 0.84048862, "memory(GiB)": 135.77, "step": 26740, "train_speed(iter/s)": 0.201636 }, { "acc": 0.77059517, "epoch": 0.624097755872789, "grad_norm": 6.0, "learning_rate": 8.054898320912069e-06, "loss": 0.84382706, "memory(GiB)": 135.77, "step": 26750, "train_speed(iter/s)": 0.201676 }, { "acc": 0.78693314, "epoch": 0.6243310634450779, "grad_norm": 7.25, "learning_rate": 8.053402594683527e-06, "loss": 0.76821909, "memory(GiB)": 135.77, "step": 26760, "train_speed(iter/s)": 0.201715 }, { "acc": 0.78143969, "epoch": 0.6245643710173668, "grad_norm": 6.21875, "learning_rate": 8.051906432582651e-06, "loss": 0.77775021, "memory(GiB)": 135.77, "step": 26770, "train_speed(iter/s)": 0.201755 }, { "acc": 0.78252888, "epoch": 0.6247976785896557, "grad_norm": 4.625, "learning_rate": 8.050409834823021e-06, "loss": 0.77935286, "memory(GiB)": 135.77, "step": 26780, "train_speed(iter/s)": 0.201791 }, { "acc": 0.78484788, "epoch": 0.6250309861619446, "grad_norm": 5.59375, "learning_rate": 8.04891280161827e-06, "loss": 0.7744689, "memory(GiB)": 135.77, "step": 26790, "train_speed(iter/s)": 0.201828 }, { "acc": 0.7690753, "epoch": 0.6252642937342335, "grad_norm": 4.84375, "learning_rate": 8.047415333182105e-06, "loss": 0.82785397, "memory(GiB)": 135.77, "step": 26800, "train_speed(iter/s)": 0.201867 }, { "acc": 0.78505082, "epoch": 0.6254976013065224, "grad_norm": 4.40625, "learning_rate": 8.045917429728286e-06, "loss": 0.79035482, "memory(GiB)": 135.77, "step": 26810, "train_speed(iter/s)": 0.201908 }, { "acc": 0.77882872, "epoch": 0.6257309088788113, "grad_norm": 5.0625, "learning_rate": 8.044419091470638e-06, "loss": 0.79798956, "memory(GiB)": 135.77, "step": 26820, "train_speed(iter/s)": 0.201949 }, { "acc": 0.80624542, "epoch": 0.6259642164511002, "grad_norm": 4.375, "learning_rate": 8.042920318623051e-06, "loss": 0.67850709, "memory(GiB)": 135.77, "step": 26830, "train_speed(iter/s)": 0.201989 }, { "acc": 0.77503967, "epoch": 0.6261975240233891, "grad_norm": 5.375, "learning_rate": 8.04142111139947e-06, "loss": 0.8061245, "memory(GiB)": 135.77, "step": 26840, "train_speed(iter/s)": 0.202027 }, { "acc": 0.76675572, "epoch": 0.626430831595678, "grad_norm": 4.75, "learning_rate": 8.039921470013912e-06, "loss": 0.85774889, "memory(GiB)": 135.77, "step": 26850, "train_speed(iter/s)": 0.202064 }, { "acc": 0.76741781, "epoch": 0.6266641391679668, "grad_norm": 9.8125, "learning_rate": 8.038421394680445e-06, "loss": 0.82042427, "memory(GiB)": 135.77, "step": 26860, "train_speed(iter/s)": 0.202104 }, { "acc": 0.78301311, "epoch": 0.6268974467402557, "grad_norm": 5.0, "learning_rate": 8.036920885613206e-06, "loss": 0.77580643, "memory(GiB)": 135.77, "step": 26870, "train_speed(iter/s)": 0.202141 }, { "acc": 0.78032942, "epoch": 0.6271307543125446, "grad_norm": 3.984375, "learning_rate": 8.035419943026395e-06, "loss": 0.7978806, "memory(GiB)": 135.77, "step": 26880, "train_speed(iter/s)": 0.202178 }, { "acc": 0.77603312, "epoch": 0.6273640618848335, "grad_norm": 4.28125, "learning_rate": 8.033918567134266e-06, "loss": 0.81758986, "memory(GiB)": 135.77, "step": 26890, "train_speed(iter/s)": 0.202213 }, { "acc": 0.76494641, "epoch": 0.6275973694571224, "grad_norm": 4.21875, "learning_rate": 8.032416758151144e-06, "loss": 0.83805542, "memory(GiB)": 135.77, "step": 26900, "train_speed(iter/s)": 0.202252 }, { "acc": 0.78844762, "epoch": 0.6278306770294113, "grad_norm": 6.625, "learning_rate": 8.030914516291413e-06, "loss": 0.73256688, "memory(GiB)": 135.77, "step": 26910, "train_speed(iter/s)": 0.202292 }, { "acc": 0.78790379, "epoch": 0.6280639846017002, "grad_norm": 4.84375, "learning_rate": 8.029411841769515e-06, "loss": 0.77325296, "memory(GiB)": 135.77, "step": 26920, "train_speed(iter/s)": 0.20233 }, { "acc": 0.77131805, "epoch": 0.6282972921739891, "grad_norm": 5.3125, "learning_rate": 8.027908734799954e-06, "loss": 0.82325258, "memory(GiB)": 135.77, "step": 26930, "train_speed(iter/s)": 0.202368 }, { "acc": 0.77684565, "epoch": 0.628530599746278, "grad_norm": 5.125, "learning_rate": 8.026405195597302e-06, "loss": 0.77707276, "memory(GiB)": 135.77, "step": 26940, "train_speed(iter/s)": 0.202407 }, { "acc": 0.78030272, "epoch": 0.6287639073185669, "grad_norm": 4.625, "learning_rate": 8.024901224376186e-06, "loss": 0.77970848, "memory(GiB)": 135.77, "step": 26950, "train_speed(iter/s)": 0.202447 }, { "acc": 0.78028889, "epoch": 0.6289972148908558, "grad_norm": 5.75, "learning_rate": 8.023396821351302e-06, "loss": 0.80391293, "memory(GiB)": 135.77, "step": 26960, "train_speed(iter/s)": 0.202486 }, { "acc": 0.76113973, "epoch": 0.6292305224631447, "grad_norm": 4.96875, "learning_rate": 8.021891986737399e-06, "loss": 0.84696064, "memory(GiB)": 135.77, "step": 26970, "train_speed(iter/s)": 0.202525 }, { "acc": 0.77851915, "epoch": 0.6294638300354336, "grad_norm": 12.1875, "learning_rate": 8.020386720749292e-06, "loss": 0.8118988, "memory(GiB)": 135.77, "step": 26980, "train_speed(iter/s)": 0.202564 }, { "acc": 0.78185272, "epoch": 0.6296971376077225, "grad_norm": 6.625, "learning_rate": 8.018881023601858e-06, "loss": 0.77297134, "memory(GiB)": 135.77, "step": 26990, "train_speed(iter/s)": 0.202603 }, { "acc": 0.77753963, "epoch": 0.6299304451800114, "grad_norm": 4.90625, "learning_rate": 8.017374895510035e-06, "loss": 0.80134878, "memory(GiB)": 135.77, "step": 27000, "train_speed(iter/s)": 0.202641 }, { "epoch": 0.6299304451800114, "eval_acc": 0.7406633910765392, "eval_loss": 0.8188360929489136, "eval_runtime": 1270.4842, "eval_samples_per_second": 28.329, "eval_steps_per_second": 14.165, "step": 27000 }, { "acc": 0.78660893, "epoch": 0.6301637527523003, "grad_norm": 4.59375, "learning_rate": 8.015868336688822e-06, "loss": 0.79268298, "memory(GiB)": 135.77, "step": 27010, "train_speed(iter/s)": 0.200742 }, { "acc": 0.78694639, "epoch": 0.6303970603245892, "grad_norm": 10.5625, "learning_rate": 8.01436134735328e-06, "loss": 0.76775427, "memory(GiB)": 135.77, "step": 27020, "train_speed(iter/s)": 0.20078 }, { "acc": 0.79044333, "epoch": 0.6306303678968781, "grad_norm": 6.0, "learning_rate": 8.012853927718532e-06, "loss": 0.76151662, "memory(GiB)": 135.77, "step": 27030, "train_speed(iter/s)": 0.200816 }, { "acc": 0.77135372, "epoch": 0.630863675469167, "grad_norm": 5.28125, "learning_rate": 8.011346077999762e-06, "loss": 0.82462111, "memory(GiB)": 135.77, "step": 27040, "train_speed(iter/s)": 0.200856 }, { "acc": 0.76923323, "epoch": 0.6310969830414559, "grad_norm": 5.6875, "learning_rate": 8.009837798412213e-06, "loss": 0.81341896, "memory(GiB)": 135.77, "step": 27050, "train_speed(iter/s)": 0.200892 }, { "acc": 0.77677755, "epoch": 0.6313302906137447, "grad_norm": 5.9375, "learning_rate": 8.008329089171192e-06, "loss": 0.80337753, "memory(GiB)": 135.77, "step": 27060, "train_speed(iter/s)": 0.20093 }, { "acc": 0.77744932, "epoch": 0.6315635981860336, "grad_norm": 5.4375, "learning_rate": 8.006819950492067e-06, "loss": 0.79293747, "memory(GiB)": 135.77, "step": 27070, "train_speed(iter/s)": 0.200969 }, { "acc": 0.76862764, "epoch": 0.6317969057583225, "grad_norm": 5.9375, "learning_rate": 8.00531038259027e-06, "loss": 0.82000647, "memory(GiB)": 135.77, "step": 27080, "train_speed(iter/s)": 0.201008 }, { "acc": 0.79620075, "epoch": 0.6320302133306114, "grad_norm": 6.09375, "learning_rate": 8.003800385681287e-06, "loss": 0.73623171, "memory(GiB)": 135.77, "step": 27090, "train_speed(iter/s)": 0.201047 }, { "acc": 0.77440538, "epoch": 0.6322635209029003, "grad_norm": 5.75, "learning_rate": 8.002289959980672e-06, "loss": 0.81353531, "memory(GiB)": 135.77, "step": 27100, "train_speed(iter/s)": 0.201087 }, { "acc": 0.77269154, "epoch": 0.6324968284751892, "grad_norm": 7.9375, "learning_rate": 8.000779105704037e-06, "loss": 0.82167168, "memory(GiB)": 135.77, "step": 27110, "train_speed(iter/s)": 0.201127 }, { "acc": 0.78157363, "epoch": 0.6327301360474781, "grad_norm": 6.1875, "learning_rate": 7.999267823067056e-06, "loss": 0.79839826, "memory(GiB)": 135.77, "step": 27120, "train_speed(iter/s)": 0.201168 }, { "acc": 0.77923985, "epoch": 0.632963443619767, "grad_norm": 7.96875, "learning_rate": 7.997756112285467e-06, "loss": 0.77553253, "memory(GiB)": 135.77, "step": 27130, "train_speed(iter/s)": 0.201207 }, { "acc": 0.77168446, "epoch": 0.6331967511920559, "grad_norm": 3.703125, "learning_rate": 7.996243973575062e-06, "loss": 0.82124062, "memory(GiB)": 135.77, "step": 27140, "train_speed(iter/s)": 0.201245 }, { "acc": 0.77455969, "epoch": 0.6334300587643448, "grad_norm": 6.1875, "learning_rate": 7.994731407151702e-06, "loss": 0.80292664, "memory(GiB)": 135.77, "step": 27150, "train_speed(iter/s)": 0.201281 }, { "acc": 0.782617, "epoch": 0.6336633663366337, "grad_norm": 5.03125, "learning_rate": 7.9932184132313e-06, "loss": 0.77485504, "memory(GiB)": 135.77, "step": 27160, "train_speed(iter/s)": 0.201316 }, { "acc": 0.75979109, "epoch": 0.6338966739089226, "grad_norm": 4.46875, "learning_rate": 7.99170499202984e-06, "loss": 0.8872653, "memory(GiB)": 135.77, "step": 27170, "train_speed(iter/s)": 0.201356 }, { "acc": 0.7633131, "epoch": 0.6341299814812115, "grad_norm": 6.46875, "learning_rate": 7.990191143763364e-06, "loss": 0.84359808, "memory(GiB)": 135.77, "step": 27180, "train_speed(iter/s)": 0.201395 }, { "acc": 0.77210131, "epoch": 0.6343632890535004, "grad_norm": 7.96875, "learning_rate": 7.988676868647969e-06, "loss": 0.83302078, "memory(GiB)": 135.77, "step": 27190, "train_speed(iter/s)": 0.201435 }, { "acc": 0.76380386, "epoch": 0.6345965966257893, "grad_norm": 6.25, "learning_rate": 7.98716216689982e-06, "loss": 0.85607281, "memory(GiB)": 135.77, "step": 27200, "train_speed(iter/s)": 0.201476 }, { "acc": 0.77592916, "epoch": 0.6348299041980782, "grad_norm": 5.0625, "learning_rate": 7.985647038735139e-06, "loss": 0.812115, "memory(GiB)": 135.77, "step": 27210, "train_speed(iter/s)": 0.201516 }, { "acc": 0.77886858, "epoch": 0.6350632117703671, "grad_norm": 4.71875, "learning_rate": 7.98413148437021e-06, "loss": 0.79281015, "memory(GiB)": 135.77, "step": 27220, "train_speed(iter/s)": 0.201551 }, { "acc": 0.78781528, "epoch": 0.635296519342656, "grad_norm": 7.28125, "learning_rate": 7.98261550402138e-06, "loss": 0.75333433, "memory(GiB)": 135.77, "step": 27230, "train_speed(iter/s)": 0.201589 }, { "acc": 0.781744, "epoch": 0.6355298269149449, "grad_norm": 7.21875, "learning_rate": 7.981099097905051e-06, "loss": 0.78279748, "memory(GiB)": 135.77, "step": 27240, "train_speed(iter/s)": 0.201626 }, { "acc": 0.76686149, "epoch": 0.6357631344872336, "grad_norm": 6.34375, "learning_rate": 7.979582266237695e-06, "loss": 0.87493477, "memory(GiB)": 135.77, "step": 27250, "train_speed(iter/s)": 0.201666 }, { "acc": 0.7830512, "epoch": 0.6359964420595225, "grad_norm": 4.78125, "learning_rate": 7.978065009235834e-06, "loss": 0.78735223, "memory(GiB)": 135.77, "step": 27260, "train_speed(iter/s)": 0.201705 }, { "acc": 0.77184772, "epoch": 0.6362297496318114, "grad_norm": 5.53125, "learning_rate": 7.976547327116058e-06, "loss": 0.80568523, "memory(GiB)": 135.77, "step": 27270, "train_speed(iter/s)": 0.201741 }, { "acc": 0.76909132, "epoch": 0.6364630572041003, "grad_norm": 8.375, "learning_rate": 7.975029220095016e-06, "loss": 0.81954041, "memory(GiB)": 135.77, "step": 27280, "train_speed(iter/s)": 0.20178 }, { "acc": 0.77826443, "epoch": 0.6366963647763892, "grad_norm": 6.03125, "learning_rate": 7.973510688389417e-06, "loss": 0.7962677, "memory(GiB)": 135.77, "step": 27290, "train_speed(iter/s)": 0.201818 }, { "acc": 0.77817726, "epoch": 0.6369296723486781, "grad_norm": 5.6875, "learning_rate": 7.971991732216032e-06, "loss": 0.80456171, "memory(GiB)": 135.77, "step": 27300, "train_speed(iter/s)": 0.201857 }, { "acc": 0.76000805, "epoch": 0.637162979920967, "grad_norm": 5.375, "learning_rate": 7.97047235179169e-06, "loss": 0.8421195, "memory(GiB)": 135.77, "step": 27310, "train_speed(iter/s)": 0.201896 }, { "acc": 0.78686552, "epoch": 0.6373962874932559, "grad_norm": 5.71875, "learning_rate": 7.968952547333281e-06, "loss": 0.75428095, "memory(GiB)": 135.77, "step": 27320, "train_speed(iter/s)": 0.201932 }, { "acc": 0.76575708, "epoch": 0.6376295950655448, "grad_norm": 6.15625, "learning_rate": 7.967432319057762e-06, "loss": 0.84427853, "memory(GiB)": 135.77, "step": 27330, "train_speed(iter/s)": 0.201969 }, { "acc": 0.76280413, "epoch": 0.6378629026378337, "grad_norm": 4.1875, "learning_rate": 7.965911667182138e-06, "loss": 0.85895653, "memory(GiB)": 135.77, "step": 27340, "train_speed(iter/s)": 0.202008 }, { "acc": 0.78277311, "epoch": 0.6380962102101226, "grad_norm": 4.34375, "learning_rate": 7.964390591923487e-06, "loss": 0.77046175, "memory(GiB)": 135.77, "step": 27350, "train_speed(iter/s)": 0.202046 }, { "acc": 0.76807733, "epoch": 0.6383295177824115, "grad_norm": 4.25, "learning_rate": 7.962869093498939e-06, "loss": 0.84009323, "memory(GiB)": 135.77, "step": 27360, "train_speed(iter/s)": 0.202083 }, { "acc": 0.78158169, "epoch": 0.6385628253547004, "grad_norm": 5.34375, "learning_rate": 7.961347172125689e-06, "loss": 0.78689456, "memory(GiB)": 135.77, "step": 27370, "train_speed(iter/s)": 0.202122 }, { "acc": 0.75279794, "epoch": 0.6387961329269893, "grad_norm": 5.875, "learning_rate": 7.959824828020991e-06, "loss": 0.91323185, "memory(GiB)": 135.77, "step": 27380, "train_speed(iter/s)": 0.202153 }, { "acc": 0.78088951, "epoch": 0.6390294404992782, "grad_norm": 6.28125, "learning_rate": 7.958302061402159e-06, "loss": 0.76333561, "memory(GiB)": 135.77, "step": 27390, "train_speed(iter/s)": 0.202192 }, { "acc": 0.76975918, "epoch": 0.6392627480715671, "grad_norm": 4.5, "learning_rate": 7.956778872486566e-06, "loss": 0.82985058, "memory(GiB)": 135.77, "step": 27400, "train_speed(iter/s)": 0.202226 }, { "acc": 0.77449579, "epoch": 0.639496055643856, "grad_norm": 6.5, "learning_rate": 7.955255261491648e-06, "loss": 0.80823078, "memory(GiB)": 135.77, "step": 27410, "train_speed(iter/s)": 0.202263 }, { "acc": 0.78136916, "epoch": 0.6397293632161449, "grad_norm": 5.5, "learning_rate": 7.9537312286349e-06, "loss": 0.78419118, "memory(GiB)": 135.77, "step": 27420, "train_speed(iter/s)": 0.202299 }, { "acc": 0.7795558, "epoch": 0.6399626707884338, "grad_norm": 5.125, "learning_rate": 7.952206774133878e-06, "loss": 0.79317288, "memory(GiB)": 135.77, "step": 27430, "train_speed(iter/s)": 0.202337 }, { "acc": 0.76971769, "epoch": 0.6401959783607226, "grad_norm": 7.875, "learning_rate": 7.950681898206197e-06, "loss": 0.81421986, "memory(GiB)": 135.77, "step": 27440, "train_speed(iter/s)": 0.202373 }, { "acc": 0.76713629, "epoch": 0.6404292859330115, "grad_norm": 7.46875, "learning_rate": 7.949156601069531e-06, "loss": 0.86134014, "memory(GiB)": 135.77, "step": 27450, "train_speed(iter/s)": 0.20241 }, { "acc": 0.78460703, "epoch": 0.6406625935053004, "grad_norm": 5.40625, "learning_rate": 7.947630882941617e-06, "loss": 0.7690053, "memory(GiB)": 135.77, "step": 27460, "train_speed(iter/s)": 0.202448 }, { "acc": 0.77735882, "epoch": 0.6408959010775893, "grad_norm": 6.96875, "learning_rate": 7.94610474404025e-06, "loss": 0.82368259, "memory(GiB)": 135.77, "step": 27470, "train_speed(iter/s)": 0.202484 }, { "acc": 0.77271509, "epoch": 0.6411292086498782, "grad_norm": 6.15625, "learning_rate": 7.944578184583289e-06, "loss": 0.82098379, "memory(GiB)": 135.77, "step": 27480, "train_speed(iter/s)": 0.202525 }, { "acc": 0.77546682, "epoch": 0.6413625162221671, "grad_norm": 4.0625, "learning_rate": 7.943051204788646e-06, "loss": 0.81826458, "memory(GiB)": 135.77, "step": 27490, "train_speed(iter/s)": 0.202563 }, { "acc": 0.79403472, "epoch": 0.641595823794456, "grad_norm": 5.1875, "learning_rate": 7.941523804874298e-06, "loss": 0.73600636, "memory(GiB)": 135.77, "step": 27500, "train_speed(iter/s)": 0.202599 }, { "epoch": 0.641595823794456, "eval_acc": 0.7411853011296093, "eval_loss": 0.818037211894989, "eval_runtime": 1271.0412, "eval_samples_per_second": 28.316, "eval_steps_per_second": 14.158, "step": 27500 }, { "acc": 0.757687, "epoch": 0.6418291313667449, "grad_norm": 6.21875, "learning_rate": 7.939995985058282e-06, "loss": 0.86413383, "memory(GiB)": 135.77, "step": 27510, "train_speed(iter/s)": 0.200732 }, { "acc": 0.7694766, "epoch": 0.6420624389390338, "grad_norm": 7.8125, "learning_rate": 7.938467745558693e-06, "loss": 0.82724009, "memory(GiB)": 135.77, "step": 27520, "train_speed(iter/s)": 0.200767 }, { "acc": 0.7876204, "epoch": 0.6422957465113227, "grad_norm": 8.8125, "learning_rate": 7.936939086593688e-06, "loss": 0.7523385, "memory(GiB)": 135.77, "step": 27530, "train_speed(iter/s)": 0.200801 }, { "acc": 0.81005259, "epoch": 0.6425290540836116, "grad_norm": 6.96875, "learning_rate": 7.935410008381482e-06, "loss": 0.67726383, "memory(GiB)": 135.77, "step": 27540, "train_speed(iter/s)": 0.200839 }, { "acc": 0.78100648, "epoch": 0.6427623616559005, "grad_norm": 5.625, "learning_rate": 7.933880511140349e-06, "loss": 0.77527313, "memory(GiB)": 135.77, "step": 27550, "train_speed(iter/s)": 0.200879 }, { "acc": 0.7760963, "epoch": 0.6429956692281894, "grad_norm": 5.5625, "learning_rate": 7.932350595088623e-06, "loss": 0.7997427, "memory(GiB)": 135.77, "step": 27560, "train_speed(iter/s)": 0.200916 }, { "acc": 0.78075814, "epoch": 0.6432289768004783, "grad_norm": 6.3125, "learning_rate": 7.930820260444705e-06, "loss": 0.78330669, "memory(GiB)": 135.77, "step": 27570, "train_speed(iter/s)": 0.200952 }, { "acc": 0.77887297, "epoch": 0.6434622843727672, "grad_norm": 11.3125, "learning_rate": 7.929289507427044e-06, "loss": 0.79019032, "memory(GiB)": 135.77, "step": 27580, "train_speed(iter/s)": 0.200991 }, { "acc": 0.78241224, "epoch": 0.6436955919450561, "grad_norm": 4.96875, "learning_rate": 7.927758336254156e-06, "loss": 0.79271832, "memory(GiB)": 135.77, "step": 27590, "train_speed(iter/s)": 0.201029 }, { "acc": 0.75557337, "epoch": 0.643928899517345, "grad_norm": 8.4375, "learning_rate": 7.926226747144618e-06, "loss": 0.85448341, "memory(GiB)": 135.77, "step": 27600, "train_speed(iter/s)": 0.201065 }, { "acc": 0.77307668, "epoch": 0.6441622070896339, "grad_norm": 5.34375, "learning_rate": 7.924694740317063e-06, "loss": 0.83668203, "memory(GiB)": 135.77, "step": 27610, "train_speed(iter/s)": 0.201102 }, { "acc": 0.76995897, "epoch": 0.6443955146619228, "grad_norm": 6.6875, "learning_rate": 7.923162315990181e-06, "loss": 0.82851915, "memory(GiB)": 135.77, "step": 27620, "train_speed(iter/s)": 0.201139 }, { "acc": 0.77117805, "epoch": 0.6446288222342116, "grad_norm": 5.65625, "learning_rate": 7.92162947438273e-06, "loss": 0.82947626, "memory(GiB)": 135.77, "step": 27630, "train_speed(iter/s)": 0.201177 }, { "acc": 0.76669693, "epoch": 0.6448621298065005, "grad_norm": 5.3125, "learning_rate": 7.920096215713518e-06, "loss": 0.83536053, "memory(GiB)": 135.77, "step": 27640, "train_speed(iter/s)": 0.201211 }, { "acc": 0.78462639, "epoch": 0.6450954373787894, "grad_norm": 6.09375, "learning_rate": 7.91856254020142e-06, "loss": 0.78443203, "memory(GiB)": 135.77, "step": 27650, "train_speed(iter/s)": 0.201247 }, { "acc": 0.78006172, "epoch": 0.6453287449510783, "grad_norm": 6.28125, "learning_rate": 7.917028448065368e-06, "loss": 0.8074728, "memory(GiB)": 135.77, "step": 27660, "train_speed(iter/s)": 0.201283 }, { "acc": 0.76845789, "epoch": 0.6455620525233672, "grad_norm": 4.1875, "learning_rate": 7.915493939524352e-06, "loss": 0.8165184, "memory(GiB)": 135.77, "step": 27670, "train_speed(iter/s)": 0.201316 }, { "acc": 0.78136225, "epoch": 0.6457953600956561, "grad_norm": 6.53125, "learning_rate": 7.913959014797424e-06, "loss": 0.77983022, "memory(GiB)": 135.77, "step": 27680, "train_speed(iter/s)": 0.201352 }, { "acc": 0.75951281, "epoch": 0.646028667667945, "grad_norm": 5.84375, "learning_rate": 7.91242367410369e-06, "loss": 0.85881014, "memory(GiB)": 135.77, "step": 27690, "train_speed(iter/s)": 0.20139 }, { "acc": 0.77766533, "epoch": 0.6462619752402339, "grad_norm": 5.28125, "learning_rate": 7.910887917662326e-06, "loss": 0.80150757, "memory(GiB)": 135.77, "step": 27700, "train_speed(iter/s)": 0.201428 }, { "acc": 0.77703729, "epoch": 0.6464952828125228, "grad_norm": 5.6875, "learning_rate": 7.909351745692557e-06, "loss": 0.83589764, "memory(GiB)": 135.77, "step": 27710, "train_speed(iter/s)": 0.201465 }, { "acc": 0.7554203, "epoch": 0.6467285903848117, "grad_norm": 4.6875, "learning_rate": 7.907815158413669e-06, "loss": 0.90056362, "memory(GiB)": 135.77, "step": 27720, "train_speed(iter/s)": 0.201502 }, { "acc": 0.77123346, "epoch": 0.6469618979571006, "grad_norm": 6.375, "learning_rate": 7.906278156045015e-06, "loss": 0.8136116, "memory(GiB)": 135.77, "step": 27730, "train_speed(iter/s)": 0.20154 }, { "acc": 0.78147049, "epoch": 0.6471952055293895, "grad_norm": 5.59375, "learning_rate": 7.904740738805996e-06, "loss": 0.79858246, "memory(GiB)": 135.77, "step": 27740, "train_speed(iter/s)": 0.201576 }, { "acc": 0.76583729, "epoch": 0.6474285131016784, "grad_norm": 19.5, "learning_rate": 7.90320290691608e-06, "loss": 0.8749052, "memory(GiB)": 135.77, "step": 27750, "train_speed(iter/s)": 0.201611 }, { "acc": 0.77532988, "epoch": 0.6476618206739673, "grad_norm": 5.3125, "learning_rate": 7.901664660594794e-06, "loss": 0.81616383, "memory(GiB)": 135.77, "step": 27760, "train_speed(iter/s)": 0.20165 }, { "acc": 0.78104067, "epoch": 0.6478951282462562, "grad_norm": 5.3125, "learning_rate": 7.90012600006172e-06, "loss": 0.76530261, "memory(GiB)": 135.77, "step": 27770, "train_speed(iter/s)": 0.201684 }, { "acc": 0.78051329, "epoch": 0.6481284358185451, "grad_norm": 4.1875, "learning_rate": 7.898586925536504e-06, "loss": 0.78549862, "memory(GiB)": 135.77, "step": 27780, "train_speed(iter/s)": 0.201719 }, { "acc": 0.77190304, "epoch": 0.648361743390834, "grad_norm": 5.1875, "learning_rate": 7.897047437238845e-06, "loss": 0.83245649, "memory(GiB)": 135.77, "step": 27790, "train_speed(iter/s)": 0.201756 }, { "acc": 0.76675649, "epoch": 0.6485950509631229, "grad_norm": 6.84375, "learning_rate": 7.895507535388506e-06, "loss": 0.85838509, "memory(GiB)": 135.77, "step": 27800, "train_speed(iter/s)": 0.201794 }, { "acc": 0.76727285, "epoch": 0.6488283585354118, "grad_norm": 4.9375, "learning_rate": 7.893967220205307e-06, "loss": 0.84388571, "memory(GiB)": 135.77, "step": 27810, "train_speed(iter/s)": 0.201829 }, { "acc": 0.77514563, "epoch": 0.6490616661077007, "grad_norm": 4.5625, "learning_rate": 7.89242649190913e-06, "loss": 0.80860777, "memory(GiB)": 135.77, "step": 27820, "train_speed(iter/s)": 0.201866 }, { "acc": 0.76715317, "epoch": 0.6492949736799895, "grad_norm": 5.53125, "learning_rate": 7.890885350719907e-06, "loss": 0.83706264, "memory(GiB)": 135.77, "step": 27830, "train_speed(iter/s)": 0.201903 }, { "acc": 0.77728567, "epoch": 0.6495282812522783, "grad_norm": 5.25, "learning_rate": 7.889343796857645e-06, "loss": 0.79823217, "memory(GiB)": 135.77, "step": 27840, "train_speed(iter/s)": 0.201939 }, { "acc": 0.78821392, "epoch": 0.6497615888245672, "grad_norm": 6.71875, "learning_rate": 7.887801830542392e-06, "loss": 0.75157042, "memory(GiB)": 135.77, "step": 27850, "train_speed(iter/s)": 0.201976 }, { "acc": 0.7779901, "epoch": 0.6499948963968561, "grad_norm": 4.84375, "learning_rate": 7.886259451994267e-06, "loss": 0.79298458, "memory(GiB)": 135.77, "step": 27860, "train_speed(iter/s)": 0.202013 }, { "acc": 0.76499109, "epoch": 0.650228203969145, "grad_norm": 5.25, "learning_rate": 7.884716661433444e-06, "loss": 0.87161884, "memory(GiB)": 135.77, "step": 27870, "train_speed(iter/s)": 0.202049 }, { "acc": 0.7676559, "epoch": 0.650461511541434, "grad_norm": 6.40625, "learning_rate": 7.883173459080159e-06, "loss": 0.83052654, "memory(GiB)": 135.77, "step": 27880, "train_speed(iter/s)": 0.202088 }, { "acc": 0.78522291, "epoch": 0.6506948191137228, "grad_norm": 5.46875, "learning_rate": 7.881629845154696e-06, "loss": 0.77097383, "memory(GiB)": 135.77, "step": 27890, "train_speed(iter/s)": 0.202123 }, { "acc": 0.76635356, "epoch": 0.6509281266860117, "grad_norm": 4.8125, "learning_rate": 7.880085819877411e-06, "loss": 0.84650917, "memory(GiB)": 135.77, "step": 27900, "train_speed(iter/s)": 0.202162 }, { "acc": 0.76490736, "epoch": 0.6511614342583006, "grad_norm": 4.71875, "learning_rate": 7.878541383468712e-06, "loss": 0.8467804, "memory(GiB)": 135.77, "step": 27910, "train_speed(iter/s)": 0.202199 }, { "acc": 0.76074095, "epoch": 0.6513947418305895, "grad_norm": 5.03125, "learning_rate": 7.876996536149067e-06, "loss": 0.85577497, "memory(GiB)": 135.77, "step": 27920, "train_speed(iter/s)": 0.202233 }, { "acc": 0.79905081, "epoch": 0.6516280494028784, "grad_norm": 5.9375, "learning_rate": 7.875451278139001e-06, "loss": 0.71797628, "memory(GiB)": 135.77, "step": 27930, "train_speed(iter/s)": 0.202268 }, { "acc": 0.77437353, "epoch": 0.6518613569751673, "grad_norm": 6.6875, "learning_rate": 7.873905609659102e-06, "loss": 0.80220547, "memory(GiB)": 135.77, "step": 27940, "train_speed(iter/s)": 0.202306 }, { "acc": 0.78056316, "epoch": 0.6520946645474562, "grad_norm": 5.25, "learning_rate": 7.872359530930011e-06, "loss": 0.76737919, "memory(GiB)": 135.77, "step": 27950, "train_speed(iter/s)": 0.202343 }, { "acc": 0.77328043, "epoch": 0.6523279721197451, "grad_norm": 6.5, "learning_rate": 7.870813042172432e-06, "loss": 0.8179678, "memory(GiB)": 135.77, "step": 27960, "train_speed(iter/s)": 0.202381 }, { "acc": 0.784305, "epoch": 0.652561279692034, "grad_norm": 4.0, "learning_rate": 7.869266143607124e-06, "loss": 0.76675787, "memory(GiB)": 135.77, "step": 27970, "train_speed(iter/s)": 0.202421 }, { "acc": 0.77683382, "epoch": 0.6527945872643229, "grad_norm": 4.6875, "learning_rate": 7.86771883545491e-06, "loss": 0.80360317, "memory(GiB)": 135.77, "step": 27980, "train_speed(iter/s)": 0.202459 }, { "acc": 0.76954408, "epoch": 0.6530278948366118, "grad_norm": 6.28125, "learning_rate": 7.866171117936663e-06, "loss": 0.83105335, "memory(GiB)": 135.77, "step": 27990, "train_speed(iter/s)": 0.202497 }, { "acc": 0.76160703, "epoch": 0.6532612024089007, "grad_norm": 6.3125, "learning_rate": 7.864622991273322e-06, "loss": 0.8450861, "memory(GiB)": 135.77, "step": 28000, "train_speed(iter/s)": 0.202537 }, { "epoch": 0.6532612024089007, "eval_acc": 0.7410069004548256, "eval_loss": 0.8178855180740356, "eval_runtime": 1271.7089, "eval_samples_per_second": 28.301, "eval_steps_per_second": 14.151, "step": 28000 }, { "acc": 0.77182312, "epoch": 0.6534945099811896, "grad_norm": 8.375, "learning_rate": 7.863074455685882e-06, "loss": 0.83168869, "memory(GiB)": 135.77, "step": 28010, "train_speed(iter/s)": 0.200706 }, { "acc": 0.78294449, "epoch": 0.6537278175534784, "grad_norm": 4.375, "learning_rate": 7.861525511395394e-06, "loss": 0.76020308, "memory(GiB)": 135.77, "step": 28020, "train_speed(iter/s)": 0.200743 }, { "acc": 0.76998968, "epoch": 0.6539611251257673, "grad_norm": 4.71875, "learning_rate": 7.859976158622971e-06, "loss": 0.82431984, "memory(GiB)": 135.77, "step": 28030, "train_speed(iter/s)": 0.200778 }, { "acc": 0.75824432, "epoch": 0.6541944326980562, "grad_norm": 6.0625, "learning_rate": 7.858426397589783e-06, "loss": 0.87359171, "memory(GiB)": 135.77, "step": 28040, "train_speed(iter/s)": 0.200818 }, { "acc": 0.76430311, "epoch": 0.6544277402703451, "grad_norm": 10.0, "learning_rate": 7.856876228517057e-06, "loss": 0.84922371, "memory(GiB)": 135.77, "step": 28050, "train_speed(iter/s)": 0.200859 }, { "acc": 0.7763073, "epoch": 0.654661047842634, "grad_norm": 5.21875, "learning_rate": 7.85532565162608e-06, "loss": 0.81707954, "memory(GiB)": 135.77, "step": 28060, "train_speed(iter/s)": 0.200896 }, { "acc": 0.78287649, "epoch": 0.6548943554149229, "grad_norm": 3.984375, "learning_rate": 7.853774667138192e-06, "loss": 0.76915636, "memory(GiB)": 135.77, "step": 28070, "train_speed(iter/s)": 0.200937 }, { "acc": 0.78447509, "epoch": 0.6551276629872118, "grad_norm": 4.34375, "learning_rate": 7.852223275274804e-06, "loss": 0.77548103, "memory(GiB)": 135.77, "step": 28080, "train_speed(iter/s)": 0.200974 }, { "acc": 0.77808089, "epoch": 0.6553609705595007, "grad_norm": 4.625, "learning_rate": 7.85067147625737e-06, "loss": 0.79100828, "memory(GiB)": 135.77, "step": 28090, "train_speed(iter/s)": 0.20101 }, { "acc": 0.776159, "epoch": 0.6555942781317896, "grad_norm": 8.3125, "learning_rate": 7.84911927030741e-06, "loss": 0.79818797, "memory(GiB)": 135.77, "step": 28100, "train_speed(iter/s)": 0.201047 }, { "acc": 0.76561847, "epoch": 0.6558275857040785, "grad_norm": 3.890625, "learning_rate": 7.847566657646502e-06, "loss": 0.84216652, "memory(GiB)": 135.77, "step": 28110, "train_speed(iter/s)": 0.201085 }, { "acc": 0.77506824, "epoch": 0.6560608932763674, "grad_norm": 5.34375, "learning_rate": 7.846013638496281e-06, "loss": 0.82243309, "memory(GiB)": 135.77, "step": 28120, "train_speed(iter/s)": 0.201123 }, { "acc": 0.78300853, "epoch": 0.6562942008486563, "grad_norm": 4.75, "learning_rate": 7.84446021307844e-06, "loss": 0.76296067, "memory(GiB)": 135.77, "step": 28130, "train_speed(iter/s)": 0.20116 }, { "acc": 0.75912266, "epoch": 0.6565275084209452, "grad_norm": 4.40625, "learning_rate": 7.842906381614732e-06, "loss": 0.84529533, "memory(GiB)": 135.77, "step": 28140, "train_speed(iter/s)": 0.201196 }, { "acc": 0.78884659, "epoch": 0.6567608159932341, "grad_norm": 6.75, "learning_rate": 7.841352144326962e-06, "loss": 0.76649256, "memory(GiB)": 135.77, "step": 28150, "train_speed(iter/s)": 0.201231 }, { "acc": 0.78107457, "epoch": 0.656994123565523, "grad_norm": 3.78125, "learning_rate": 7.839797501436999e-06, "loss": 0.7952775, "memory(GiB)": 135.77, "step": 28160, "train_speed(iter/s)": 0.201266 }, { "acc": 0.7618453, "epoch": 0.6572274311378119, "grad_norm": 5.28125, "learning_rate": 7.838242453166766e-06, "loss": 0.86572094, "memory(GiB)": 135.77, "step": 28170, "train_speed(iter/s)": 0.201305 }, { "acc": 0.77299466, "epoch": 0.6574607387101008, "grad_norm": 6.59375, "learning_rate": 7.83668699973825e-06, "loss": 0.83398046, "memory(GiB)": 135.77, "step": 28180, "train_speed(iter/s)": 0.201342 }, { "acc": 0.77326827, "epoch": 0.6576940462823897, "grad_norm": 5.3125, "learning_rate": 7.835131141373487e-06, "loss": 0.80730629, "memory(GiB)": 135.77, "step": 28190, "train_speed(iter/s)": 0.201379 }, { "acc": 0.78218737, "epoch": 0.6579273538546786, "grad_norm": 5.65625, "learning_rate": 7.833574878294578e-06, "loss": 0.79669929, "memory(GiB)": 135.77, "step": 28200, "train_speed(iter/s)": 0.201411 }, { "acc": 0.76740227, "epoch": 0.6581606614269674, "grad_norm": 5.6875, "learning_rate": 7.832018210723679e-06, "loss": 0.84285278, "memory(GiB)": 135.77, "step": 28210, "train_speed(iter/s)": 0.201447 }, { "acc": 0.76748447, "epoch": 0.6583939689992563, "grad_norm": 4.46875, "learning_rate": 7.830461138883e-06, "loss": 0.84718819, "memory(GiB)": 135.77, "step": 28220, "train_speed(iter/s)": 0.201482 }, { "acc": 0.77524757, "epoch": 0.6586272765715452, "grad_norm": 6.375, "learning_rate": 7.82890366299482e-06, "loss": 0.81450605, "memory(GiB)": 135.77, "step": 28230, "train_speed(iter/s)": 0.201522 }, { "acc": 0.76804585, "epoch": 0.6588605841438341, "grad_norm": 4.8125, "learning_rate": 7.827345783281462e-06, "loss": 0.82887039, "memory(GiB)": 135.77, "step": 28240, "train_speed(iter/s)": 0.201558 }, { "acc": 0.77396336, "epoch": 0.659093891716123, "grad_norm": 5.5625, "learning_rate": 7.825787499965315e-06, "loss": 0.83359642, "memory(GiB)": 135.77, "step": 28250, "train_speed(iter/s)": 0.201595 }, { "acc": 0.74629498, "epoch": 0.6593271992884119, "grad_norm": 5.96875, "learning_rate": 7.824228813268823e-06, "loss": 0.92181091, "memory(GiB)": 135.77, "step": 28260, "train_speed(iter/s)": 0.20163 }, { "acc": 0.78333774, "epoch": 0.6595605068607008, "grad_norm": 4.71875, "learning_rate": 7.822669723414488e-06, "loss": 0.78887167, "memory(GiB)": 135.77, "step": 28270, "train_speed(iter/s)": 0.201667 }, { "acc": 0.75382862, "epoch": 0.6597938144329897, "grad_norm": 3.953125, "learning_rate": 7.82111023062487e-06, "loss": 0.89491863, "memory(GiB)": 135.77, "step": 28280, "train_speed(iter/s)": 0.2017 }, { "acc": 0.76489215, "epoch": 0.6600271220052786, "grad_norm": 5.5625, "learning_rate": 7.819550335122587e-06, "loss": 0.86177435, "memory(GiB)": 135.77, "step": 28290, "train_speed(iter/s)": 0.201738 }, { "acc": 0.76324902, "epoch": 0.6602604295775675, "grad_norm": 5.25, "learning_rate": 7.817990037130312e-06, "loss": 0.85025234, "memory(GiB)": 135.77, "step": 28300, "train_speed(iter/s)": 0.201777 }, { "acc": 0.77776289, "epoch": 0.6604937371498564, "grad_norm": 6.0, "learning_rate": 7.816429336870778e-06, "loss": 0.80502405, "memory(GiB)": 135.77, "step": 28310, "train_speed(iter/s)": 0.201815 }, { "acc": 0.78841567, "epoch": 0.6607270447221453, "grad_norm": 10.3125, "learning_rate": 7.814868234566775e-06, "loss": 0.72700367, "memory(GiB)": 135.77, "step": 28320, "train_speed(iter/s)": 0.201851 }, { "acc": 0.7645155, "epoch": 0.6609603522944342, "grad_norm": 4.75, "learning_rate": 7.813306730441147e-06, "loss": 0.84438887, "memory(GiB)": 135.77, "step": 28330, "train_speed(iter/s)": 0.20189 }, { "acc": 0.77188535, "epoch": 0.6611936598667231, "grad_norm": 4.9375, "learning_rate": 7.811744824716803e-06, "loss": 0.82249565, "memory(GiB)": 135.77, "step": 28340, "train_speed(iter/s)": 0.201927 }, { "acc": 0.79082394, "epoch": 0.661426967439012, "grad_norm": 6.4375, "learning_rate": 7.810182517616702e-06, "loss": 0.73457441, "memory(GiB)": 135.77, "step": 28350, "train_speed(iter/s)": 0.201965 }, { "acc": 0.77906938, "epoch": 0.6616602750113009, "grad_norm": 5.625, "learning_rate": 7.808619809363863e-06, "loss": 0.78538847, "memory(GiB)": 135.77, "step": 28360, "train_speed(iter/s)": 0.202 }, { "acc": 0.78205838, "epoch": 0.6618935825835898, "grad_norm": 9.25, "learning_rate": 7.80705670018136e-06, "loss": 0.79499912, "memory(GiB)": 135.77, "step": 28370, "train_speed(iter/s)": 0.202036 }, { "acc": 0.78381143, "epoch": 0.6621268901558787, "grad_norm": 5.96875, "learning_rate": 7.805493190292327e-06, "loss": 0.78962469, "memory(GiB)": 135.77, "step": 28380, "train_speed(iter/s)": 0.202067 }, { "acc": 0.77339664, "epoch": 0.6623601977281676, "grad_norm": 4.15625, "learning_rate": 7.80392927991996e-06, "loss": 0.79976535, "memory(GiB)": 135.77, "step": 28390, "train_speed(iter/s)": 0.202101 }, { "acc": 0.77922812, "epoch": 0.6625935053004564, "grad_norm": 5.25, "learning_rate": 7.802364969287501e-06, "loss": 0.77621737, "memory(GiB)": 135.77, "step": 28400, "train_speed(iter/s)": 0.202139 }, { "acc": 0.7975256, "epoch": 0.6628268128727453, "grad_norm": 5.1875, "learning_rate": 7.80080025861826e-06, "loss": 0.7247571, "memory(GiB)": 135.77, "step": 28410, "train_speed(iter/s)": 0.202175 }, { "acc": 0.79595299, "epoch": 0.6630601204450342, "grad_norm": 5.4375, "learning_rate": 7.799235148135592e-06, "loss": 0.71732335, "memory(GiB)": 135.77, "step": 28420, "train_speed(iter/s)": 0.20221 }, { "acc": 0.77594852, "epoch": 0.663293428017323, "grad_norm": 5.84375, "learning_rate": 7.797669638062921e-06, "loss": 0.79501762, "memory(GiB)": 135.77, "step": 28430, "train_speed(iter/s)": 0.202245 }, { "acc": 0.77370367, "epoch": 0.663526735589612, "grad_norm": 8.25, "learning_rate": 7.796103728623723e-06, "loss": 0.82125921, "memory(GiB)": 135.77, "step": 28440, "train_speed(iter/s)": 0.202281 }, { "acc": 0.78018131, "epoch": 0.6637600431619008, "grad_norm": 5.53125, "learning_rate": 7.794537420041527e-06, "loss": 0.80163622, "memory(GiB)": 135.77, "step": 28450, "train_speed(iter/s)": 0.202317 }, { "acc": 0.77699895, "epoch": 0.6639933507341897, "grad_norm": 7.96875, "learning_rate": 7.792970712539929e-06, "loss": 0.81302996, "memory(GiB)": 135.77, "step": 28460, "train_speed(iter/s)": 0.202354 }, { "acc": 0.78811188, "epoch": 0.6642266583064786, "grad_norm": 5.0625, "learning_rate": 7.791403606342572e-06, "loss": 0.77973518, "memory(GiB)": 135.77, "step": 28470, "train_speed(iter/s)": 0.202388 }, { "acc": 0.77782664, "epoch": 0.6644599658787675, "grad_norm": 7.90625, "learning_rate": 7.78983610167316e-06, "loss": 0.79536843, "memory(GiB)": 135.77, "step": 28480, "train_speed(iter/s)": 0.202424 }, { "acc": 0.78967228, "epoch": 0.6646932734510564, "grad_norm": 7.125, "learning_rate": 7.788268198755456e-06, "loss": 0.7731019, "memory(GiB)": 135.77, "step": 28490, "train_speed(iter/s)": 0.202461 }, { "acc": 0.76639204, "epoch": 0.6649265810233453, "grad_norm": 6.96875, "learning_rate": 7.786699897813277e-06, "loss": 0.83777695, "memory(GiB)": 135.77, "step": 28500, "train_speed(iter/s)": 0.202497 }, { "epoch": 0.6649265810233453, "eval_acc": 0.7410190714164805, "eval_loss": 0.8165611028671265, "eval_runtime": 1270.5641, "eval_samples_per_second": 28.327, "eval_steps_per_second": 14.164, "step": 28500 }, { "acc": 0.75553088, "epoch": 0.6651598885956342, "grad_norm": 9.125, "learning_rate": 7.785131199070497e-06, "loss": 0.88834152, "memory(GiB)": 135.77, "step": 28510, "train_speed(iter/s)": 0.200698 }, { "acc": 0.76838694, "epoch": 0.6653931961679231, "grad_norm": 3.9375, "learning_rate": 7.783562102751048e-06, "loss": 0.82640839, "memory(GiB)": 135.77, "step": 28520, "train_speed(iter/s)": 0.200734 }, { "acc": 0.78102646, "epoch": 0.665626503740212, "grad_norm": 7.0, "learning_rate": 7.781992609078916e-06, "loss": 0.77242785, "memory(GiB)": 135.77, "step": 28530, "train_speed(iter/s)": 0.200769 }, { "acc": 0.78945589, "epoch": 0.6658598113125009, "grad_norm": 7.03125, "learning_rate": 7.780422718278148e-06, "loss": 0.77674723, "memory(GiB)": 135.77, "step": 28540, "train_speed(iter/s)": 0.200804 }, { "acc": 0.75834866, "epoch": 0.6660931188847898, "grad_norm": 5.5, "learning_rate": 7.778852430572846e-06, "loss": 0.89787149, "memory(GiB)": 135.77, "step": 28550, "train_speed(iter/s)": 0.20084 }, { "acc": 0.7631052, "epoch": 0.6663264264570787, "grad_norm": 6.75, "learning_rate": 7.777281746187163e-06, "loss": 0.86248188, "memory(GiB)": 135.77, "step": 28560, "train_speed(iter/s)": 0.200877 }, { "acc": 0.78461332, "epoch": 0.6665597340293676, "grad_norm": 6.25, "learning_rate": 7.775710665345322e-06, "loss": 0.78493948, "memory(GiB)": 135.77, "step": 28570, "train_speed(iter/s)": 0.200913 }, { "acc": 0.75799332, "epoch": 0.6667930416016565, "grad_norm": 4.71875, "learning_rate": 7.774139188271588e-06, "loss": 0.86719799, "memory(GiB)": 135.77, "step": 28580, "train_speed(iter/s)": 0.200949 }, { "acc": 0.78168793, "epoch": 0.6670263491739454, "grad_norm": 5.59375, "learning_rate": 7.772567315190291e-06, "loss": 0.79164333, "memory(GiB)": 135.77, "step": 28590, "train_speed(iter/s)": 0.200984 }, { "acc": 0.76213493, "epoch": 0.6672596567462342, "grad_norm": 4.65625, "learning_rate": 7.770995046325813e-06, "loss": 0.86090012, "memory(GiB)": 135.77, "step": 28600, "train_speed(iter/s)": 0.201022 }, { "acc": 0.76110024, "epoch": 0.6674929643185231, "grad_norm": 5.53125, "learning_rate": 7.769422381902601e-06, "loss": 0.84771814, "memory(GiB)": 135.77, "step": 28610, "train_speed(iter/s)": 0.201061 }, { "acc": 0.78071322, "epoch": 0.667726271890812, "grad_norm": 5.8125, "learning_rate": 7.767849322145144e-06, "loss": 0.79260278, "memory(GiB)": 135.77, "step": 28620, "train_speed(iter/s)": 0.201098 }, { "acc": 0.76000824, "epoch": 0.6679595794631009, "grad_norm": 5.96875, "learning_rate": 7.766275867278004e-06, "loss": 0.8921133, "memory(GiB)": 135.77, "step": 28630, "train_speed(iter/s)": 0.201133 }, { "acc": 0.77165904, "epoch": 0.6681928870353898, "grad_norm": 7.3125, "learning_rate": 7.764702017525787e-06, "loss": 0.80725384, "memory(GiB)": 135.77, "step": 28640, "train_speed(iter/s)": 0.201167 }, { "acc": 0.77876086, "epoch": 0.6684261946076787, "grad_norm": 5.15625, "learning_rate": 7.763127773113159e-06, "loss": 0.79236784, "memory(GiB)": 135.77, "step": 28650, "train_speed(iter/s)": 0.201201 }, { "acc": 0.79020371, "epoch": 0.6686595021799676, "grad_norm": 8.625, "learning_rate": 7.761553134264844e-06, "loss": 0.75915909, "memory(GiB)": 135.77, "step": 28660, "train_speed(iter/s)": 0.201238 }, { "acc": 0.76552114, "epoch": 0.6688928097522565, "grad_norm": 5.90625, "learning_rate": 7.759978101205623e-06, "loss": 0.84368448, "memory(GiB)": 135.77, "step": 28670, "train_speed(iter/s)": 0.201275 }, { "acc": 0.78740005, "epoch": 0.6691261173245454, "grad_norm": 5.71875, "learning_rate": 7.758402674160328e-06, "loss": 0.75369263, "memory(GiB)": 135.77, "step": 28680, "train_speed(iter/s)": 0.201313 }, { "acc": 0.76241765, "epoch": 0.6693594248968343, "grad_norm": 4.96875, "learning_rate": 7.756826853353854e-06, "loss": 0.85766029, "memory(GiB)": 135.77, "step": 28690, "train_speed(iter/s)": 0.201349 }, { "acc": 0.76723084, "epoch": 0.6695927324691232, "grad_norm": 5.625, "learning_rate": 7.755250639011147e-06, "loss": 0.84183302, "memory(GiB)": 135.77, "step": 28700, "train_speed(iter/s)": 0.201389 }, { "acc": 0.77271857, "epoch": 0.6698260400414121, "grad_norm": 4.65625, "learning_rate": 7.75367403135721e-06, "loss": 0.80901413, "memory(GiB)": 135.77, "step": 28710, "train_speed(iter/s)": 0.201427 }, { "acc": 0.75930796, "epoch": 0.670059347613701, "grad_norm": 6.78125, "learning_rate": 7.752097030617107e-06, "loss": 0.87077732, "memory(GiB)": 135.77, "step": 28720, "train_speed(iter/s)": 0.201465 }, { "acc": 0.77311296, "epoch": 0.6702926551859899, "grad_norm": 7.84375, "learning_rate": 7.750519637015953e-06, "loss": 0.80176973, "memory(GiB)": 135.77, "step": 28730, "train_speed(iter/s)": 0.2015 }, { "acc": 0.78030658, "epoch": 0.6705259627582788, "grad_norm": 4.78125, "learning_rate": 7.748941850778917e-06, "loss": 0.774928, "memory(GiB)": 135.77, "step": 28740, "train_speed(iter/s)": 0.201535 }, { "acc": 0.76701117, "epoch": 0.6707592703305677, "grad_norm": 5.625, "learning_rate": 7.747363672131233e-06, "loss": 0.83319283, "memory(GiB)": 135.77, "step": 28750, "train_speed(iter/s)": 0.20157 }, { "acc": 0.78679342, "epoch": 0.6709925779028566, "grad_norm": 5.75, "learning_rate": 7.745785101298182e-06, "loss": 0.75664225, "memory(GiB)": 135.77, "step": 28760, "train_speed(iter/s)": 0.201606 }, { "acc": 0.76925483, "epoch": 0.6712258854751455, "grad_norm": 7.3125, "learning_rate": 7.744206138505106e-06, "loss": 0.83768616, "memory(GiB)": 135.77, "step": 28770, "train_speed(iter/s)": 0.201642 }, { "acc": 0.80132675, "epoch": 0.6714591930474344, "grad_norm": 5.6875, "learning_rate": 7.7426267839774e-06, "loss": 0.70183382, "memory(GiB)": 135.77, "step": 28780, "train_speed(iter/s)": 0.201677 }, { "acc": 0.77842226, "epoch": 0.6716925006197232, "grad_norm": 6.09375, "learning_rate": 7.741047037940516e-06, "loss": 0.79888177, "memory(GiB)": 135.77, "step": 28790, "train_speed(iter/s)": 0.201714 }, { "acc": 0.77139082, "epoch": 0.6719258081920121, "grad_norm": 5.15625, "learning_rate": 7.739466900619966e-06, "loss": 0.81673717, "memory(GiB)": 135.77, "step": 28800, "train_speed(iter/s)": 0.201753 }, { "acc": 0.76923008, "epoch": 0.672159115764301, "grad_norm": 8.4375, "learning_rate": 7.737886372241311e-06, "loss": 0.84230728, "memory(GiB)": 135.77, "step": 28810, "train_speed(iter/s)": 0.201789 }, { "acc": 0.75902824, "epoch": 0.6723924233365899, "grad_norm": 8.0625, "learning_rate": 7.736305453030172e-06, "loss": 0.86554337, "memory(GiB)": 135.77, "step": 28820, "train_speed(iter/s)": 0.201823 }, { "acc": 0.78267469, "epoch": 0.6726257309088788, "grad_norm": 9.0625, "learning_rate": 7.734724143212224e-06, "loss": 0.79381328, "memory(GiB)": 135.77, "step": 28830, "train_speed(iter/s)": 0.201857 }, { "acc": 0.77059851, "epoch": 0.6728590384811677, "grad_norm": 5.8125, "learning_rate": 7.733142443013199e-06, "loss": 0.82365742, "memory(GiB)": 135.77, "step": 28840, "train_speed(iter/s)": 0.201894 }, { "acc": 0.7740037, "epoch": 0.6730923460534566, "grad_norm": 5.6875, "learning_rate": 7.731560352658886e-06, "loss": 0.81236382, "memory(GiB)": 135.77, "step": 28850, "train_speed(iter/s)": 0.20193 }, { "acc": 0.77031889, "epoch": 0.6733256536257455, "grad_norm": 8.0, "learning_rate": 7.729977872375125e-06, "loss": 0.83344927, "memory(GiB)": 135.77, "step": 28860, "train_speed(iter/s)": 0.201964 }, { "acc": 0.77817774, "epoch": 0.6735589611980344, "grad_norm": 5.25, "learning_rate": 7.728395002387815e-06, "loss": 0.79905758, "memory(GiB)": 135.77, "step": 28870, "train_speed(iter/s)": 0.201999 }, { "acc": 0.76579313, "epoch": 0.6737922687703233, "grad_norm": 5.4375, "learning_rate": 7.726811742922912e-06, "loss": 0.84702206, "memory(GiB)": 135.77, "step": 28880, "train_speed(iter/s)": 0.202036 }, { "acc": 0.77942319, "epoch": 0.6740255763426122, "grad_norm": 4.8125, "learning_rate": 7.725228094206423e-06, "loss": 0.78971949, "memory(GiB)": 135.77, "step": 28890, "train_speed(iter/s)": 0.202071 }, { "acc": 0.76637468, "epoch": 0.6742588839149011, "grad_norm": 4.9375, "learning_rate": 7.723644056464416e-06, "loss": 0.83664494, "memory(GiB)": 135.77, "step": 28900, "train_speed(iter/s)": 0.202105 }, { "acc": 0.77963085, "epoch": 0.67449219148719, "grad_norm": 4.46875, "learning_rate": 7.722059629923014e-06, "loss": 0.78608251, "memory(GiB)": 135.77, "step": 28910, "train_speed(iter/s)": 0.202138 }, { "acc": 0.78031402, "epoch": 0.6747254990594789, "grad_norm": 5.09375, "learning_rate": 7.720474814808387e-06, "loss": 0.77677441, "memory(GiB)": 135.77, "step": 28920, "train_speed(iter/s)": 0.202173 }, { "acc": 0.76079268, "epoch": 0.6749588066317678, "grad_norm": 5.375, "learning_rate": 7.718889611346771e-06, "loss": 0.8742177, "memory(GiB)": 135.77, "step": 28930, "train_speed(iter/s)": 0.202208 }, { "acc": 0.7644505, "epoch": 0.6751921142040567, "grad_norm": 5.9375, "learning_rate": 7.717304019764456e-06, "loss": 0.86214409, "memory(GiB)": 135.77, "step": 28940, "train_speed(iter/s)": 0.202241 }, { "acc": 0.77155561, "epoch": 0.6754254217763456, "grad_norm": 4.4375, "learning_rate": 7.71571804028778e-06, "loss": 0.81877155, "memory(GiB)": 135.77, "step": 28950, "train_speed(iter/s)": 0.202275 }, { "acc": 0.77610378, "epoch": 0.6756587293486345, "grad_norm": 5.84375, "learning_rate": 7.714131673143139e-06, "loss": 0.80574827, "memory(GiB)": 135.77, "step": 28960, "train_speed(iter/s)": 0.202311 }, { "acc": 0.77921677, "epoch": 0.6758920369209234, "grad_norm": 7.875, "learning_rate": 7.712544918556994e-06, "loss": 0.77683654, "memory(GiB)": 135.77, "step": 28970, "train_speed(iter/s)": 0.202347 }, { "acc": 0.76622157, "epoch": 0.6761253444932122, "grad_norm": 6.09375, "learning_rate": 7.71095777675585e-06, "loss": 0.87221556, "memory(GiB)": 135.77, "step": 28980, "train_speed(iter/s)": 0.202383 }, { "acc": 0.79096737, "epoch": 0.676358652065501, "grad_norm": 6.09375, "learning_rate": 7.709370247966269e-06, "loss": 0.7472538, "memory(GiB)": 135.77, "step": 28990, "train_speed(iter/s)": 0.202418 }, { "acc": 0.77579155, "epoch": 0.67659195963779, "grad_norm": 6.75, "learning_rate": 7.707782332414873e-06, "loss": 0.80246773, "memory(GiB)": 135.77, "step": 29000, "train_speed(iter/s)": 0.202455 }, { "epoch": 0.67659195963779, "eval_acc": 0.7414119052183158, "eval_loss": 0.8165162205696106, "eval_runtime": 1270.8006, "eval_samples_per_second": 28.322, "eval_steps_per_second": 14.161, "step": 29000 }, { "acc": 0.76836233, "epoch": 0.6768252672100789, "grad_norm": 6.34375, "learning_rate": 7.706194030328336e-06, "loss": 0.81320305, "memory(GiB)": 135.77, "step": 29010, "train_speed(iter/s)": 0.200687 }, { "acc": 0.77826796, "epoch": 0.6770585747823678, "grad_norm": 4.1875, "learning_rate": 7.704605341933385e-06, "loss": 0.79997253, "memory(GiB)": 135.77, "step": 29020, "train_speed(iter/s)": 0.200723 }, { "acc": 0.76980276, "epoch": 0.6772918823546566, "grad_norm": 5.34375, "learning_rate": 7.70301626745681e-06, "loss": 0.82670012, "memory(GiB)": 135.77, "step": 29030, "train_speed(iter/s)": 0.200759 }, { "acc": 0.7695508, "epoch": 0.6775251899269455, "grad_norm": 7.53125, "learning_rate": 7.701426807125447e-06, "loss": 0.83088665, "memory(GiB)": 135.77, "step": 29040, "train_speed(iter/s)": 0.200796 }, { "acc": 0.77592602, "epoch": 0.6777584974992344, "grad_norm": 5.5, "learning_rate": 7.699836961166192e-06, "loss": 0.82842121, "memory(GiB)": 135.77, "step": 29050, "train_speed(iter/s)": 0.20083 }, { "acc": 0.78777199, "epoch": 0.6779918050715233, "grad_norm": 4.9375, "learning_rate": 7.698246729805996e-06, "loss": 0.76934967, "memory(GiB)": 135.77, "step": 29060, "train_speed(iter/s)": 0.200867 }, { "acc": 0.7728559, "epoch": 0.6782251126438122, "grad_norm": 4.9375, "learning_rate": 7.696656113271863e-06, "loss": 0.80948372, "memory(GiB)": 135.77, "step": 29070, "train_speed(iter/s)": 0.200902 }, { "acc": 0.79076042, "epoch": 0.6784584202161011, "grad_norm": 6.25, "learning_rate": 7.695065111790852e-06, "loss": 0.7413765, "memory(GiB)": 135.77, "step": 29080, "train_speed(iter/s)": 0.20094 }, { "acc": 0.76918368, "epoch": 0.67869172778839, "grad_norm": 5.28125, "learning_rate": 7.693473725590079e-06, "loss": 0.82540627, "memory(GiB)": 135.77, "step": 29090, "train_speed(iter/s)": 0.200974 }, { "acc": 0.77715392, "epoch": 0.6789250353606789, "grad_norm": 5.71875, "learning_rate": 7.691881954896716e-06, "loss": 0.77763295, "memory(GiB)": 135.77, "step": 29100, "train_speed(iter/s)": 0.201008 }, { "acc": 0.78276401, "epoch": 0.6791583429329678, "grad_norm": 5.0625, "learning_rate": 7.690289799937985e-06, "loss": 0.78063684, "memory(GiB)": 135.77, "step": 29110, "train_speed(iter/s)": 0.20104 }, { "acc": 0.76057873, "epoch": 0.6793916505052567, "grad_norm": 5.75, "learning_rate": 7.688697260941164e-06, "loss": 0.87658768, "memory(GiB)": 135.77, "step": 29120, "train_speed(iter/s)": 0.201077 }, { "acc": 0.78228416, "epoch": 0.6796249580775456, "grad_norm": 4.75, "learning_rate": 7.687104338133595e-06, "loss": 0.79393601, "memory(GiB)": 135.77, "step": 29130, "train_speed(iter/s)": 0.201111 }, { "acc": 0.7692297, "epoch": 0.6798582656498345, "grad_norm": 4.8125, "learning_rate": 7.68551103174266e-06, "loss": 0.82615576, "memory(GiB)": 135.77, "step": 29140, "train_speed(iter/s)": 0.201147 }, { "acc": 0.7818614, "epoch": 0.6800915732221234, "grad_norm": 4.3125, "learning_rate": 7.683917341995806e-06, "loss": 0.76776161, "memory(GiB)": 135.77, "step": 29150, "train_speed(iter/s)": 0.201182 }, { "acc": 0.75885959, "epoch": 0.6803248807944123, "grad_norm": 8.5, "learning_rate": 7.68232326912053e-06, "loss": 0.88169804, "memory(GiB)": 135.77, "step": 29160, "train_speed(iter/s)": 0.201219 }, { "acc": 0.77344918, "epoch": 0.6805581883667011, "grad_norm": 6.78125, "learning_rate": 7.680728813344388e-06, "loss": 0.81062412, "memory(GiB)": 135.77, "step": 29170, "train_speed(iter/s)": 0.201256 }, { "acc": 0.75116019, "epoch": 0.68079149593899, "grad_norm": 10.125, "learning_rate": 7.679133974894984e-06, "loss": 0.88641338, "memory(GiB)": 135.77, "step": 29180, "train_speed(iter/s)": 0.201294 }, { "acc": 0.77474976, "epoch": 0.6810248035112789, "grad_norm": 4.6875, "learning_rate": 7.677538753999984e-06, "loss": 0.81274004, "memory(GiB)": 135.77, "step": 29190, "train_speed(iter/s)": 0.201331 }, { "acc": 0.77214217, "epoch": 0.6812581110835678, "grad_norm": 5.875, "learning_rate": 7.675943150887107e-06, "loss": 0.83875799, "memory(GiB)": 135.77, "step": 29200, "train_speed(iter/s)": 0.201368 }, { "acc": 0.77097301, "epoch": 0.6814914186558567, "grad_norm": 6.8125, "learning_rate": 7.674347165784122e-06, "loss": 0.82045498, "memory(GiB)": 135.77, "step": 29210, "train_speed(iter/s)": 0.201403 }, { "acc": 0.77530422, "epoch": 0.6817247262281456, "grad_norm": 4.71875, "learning_rate": 7.672750798918854e-06, "loss": 0.81686249, "memory(GiB)": 135.77, "step": 29220, "train_speed(iter/s)": 0.201439 }, { "acc": 0.77789793, "epoch": 0.6819580338004345, "grad_norm": 7.09375, "learning_rate": 7.671154050519187e-06, "loss": 0.80894594, "memory(GiB)": 135.77, "step": 29230, "train_speed(iter/s)": 0.201476 }, { "acc": 0.77112041, "epoch": 0.6821913413727234, "grad_norm": 4.65625, "learning_rate": 7.669556920813056e-06, "loss": 0.82041979, "memory(GiB)": 135.77, "step": 29240, "train_speed(iter/s)": 0.201512 }, { "acc": 0.79637423, "epoch": 0.6824246489450123, "grad_norm": 7.46875, "learning_rate": 7.66795941002845e-06, "loss": 0.74020443, "memory(GiB)": 135.77, "step": 29250, "train_speed(iter/s)": 0.201548 }, { "acc": 0.77368479, "epoch": 0.6826579565173012, "grad_norm": 4.65625, "learning_rate": 7.666361518393413e-06, "loss": 0.83006983, "memory(GiB)": 135.77, "step": 29260, "train_speed(iter/s)": 0.201584 }, { "acc": 0.77014456, "epoch": 0.6828912640895901, "grad_norm": 4.25, "learning_rate": 7.664763246136042e-06, "loss": 0.830159, "memory(GiB)": 135.77, "step": 29270, "train_speed(iter/s)": 0.201618 }, { "acc": 0.78274126, "epoch": 0.683124571661879, "grad_norm": 15.0, "learning_rate": 7.663164593484493e-06, "loss": 0.80830526, "memory(GiB)": 135.77, "step": 29280, "train_speed(iter/s)": 0.201656 }, { "acc": 0.7715332, "epoch": 0.6833578792341679, "grad_norm": 6.09375, "learning_rate": 7.661565560666973e-06, "loss": 0.81128178, "memory(GiB)": 135.77, "step": 29290, "train_speed(iter/s)": 0.201691 }, { "acc": 0.78186312, "epoch": 0.6835911868064568, "grad_norm": 6.5625, "learning_rate": 7.65996614791174e-06, "loss": 0.76622462, "memory(GiB)": 135.77, "step": 29300, "train_speed(iter/s)": 0.201725 }, { "acc": 0.77319794, "epoch": 0.6838244943787457, "grad_norm": 5.09375, "learning_rate": 7.658366355447115e-06, "loss": 0.85582533, "memory(GiB)": 135.77, "step": 29310, "train_speed(iter/s)": 0.201761 }, { "acc": 0.74860148, "epoch": 0.6840578019510346, "grad_norm": 5.375, "learning_rate": 7.656766183501465e-06, "loss": 0.90369864, "memory(GiB)": 135.77, "step": 29320, "train_speed(iter/s)": 0.201797 }, { "acc": 0.78511214, "epoch": 0.6842911095233235, "grad_norm": 5.21875, "learning_rate": 7.655165632303212e-06, "loss": 0.77512712, "memory(GiB)": 135.77, "step": 29330, "train_speed(iter/s)": 0.201833 }, { "acc": 0.78140364, "epoch": 0.6845244170956124, "grad_norm": 6.34375, "learning_rate": 7.653564702080837e-06, "loss": 0.78134556, "memory(GiB)": 135.77, "step": 29340, "train_speed(iter/s)": 0.201869 }, { "acc": 0.77730131, "epoch": 0.6847577246679013, "grad_norm": 4.1875, "learning_rate": 7.651963393062872e-06, "loss": 0.80187654, "memory(GiB)": 135.77, "step": 29350, "train_speed(iter/s)": 0.201905 }, { "acc": 0.78949361, "epoch": 0.6849910322401902, "grad_norm": 7.34375, "learning_rate": 7.650361705477903e-06, "loss": 0.73667974, "memory(GiB)": 135.77, "step": 29360, "train_speed(iter/s)": 0.201939 }, { "acc": 0.78830619, "epoch": 0.685224339812479, "grad_norm": 5.875, "learning_rate": 7.648759639554571e-06, "loss": 0.74675617, "memory(GiB)": 135.77, "step": 29370, "train_speed(iter/s)": 0.201976 }, { "acc": 0.77039886, "epoch": 0.6854576473847679, "grad_norm": 5.34375, "learning_rate": 7.647157195521568e-06, "loss": 0.82584143, "memory(GiB)": 135.77, "step": 29380, "train_speed(iter/s)": 0.202012 }, { "acc": 0.78677912, "epoch": 0.6856909549570568, "grad_norm": 6.34375, "learning_rate": 7.645554373607647e-06, "loss": 0.74630208, "memory(GiB)": 135.77, "step": 29390, "train_speed(iter/s)": 0.202047 }, { "acc": 0.76948566, "epoch": 0.6859242625293457, "grad_norm": 4.5, "learning_rate": 7.643951174041606e-06, "loss": 0.82137985, "memory(GiB)": 135.77, "step": 29400, "train_speed(iter/s)": 0.20208 }, { "acc": 0.78999381, "epoch": 0.6861575701016346, "grad_norm": 4.78125, "learning_rate": 7.642347597052303e-06, "loss": 0.74593878, "memory(GiB)": 135.77, "step": 29410, "train_speed(iter/s)": 0.202116 }, { "acc": 0.76763272, "epoch": 0.6863908776739235, "grad_norm": 7.21875, "learning_rate": 7.64074364286865e-06, "loss": 0.84214306, "memory(GiB)": 135.77, "step": 29420, "train_speed(iter/s)": 0.202153 }, { "acc": 0.78076696, "epoch": 0.6866241852462124, "grad_norm": 6.65625, "learning_rate": 7.639139311719605e-06, "loss": 0.7823586, "memory(GiB)": 135.77, "step": 29430, "train_speed(iter/s)": 0.202187 }, { "acc": 0.77389212, "epoch": 0.6868574928185013, "grad_norm": 4.59375, "learning_rate": 7.637534603834193e-06, "loss": 0.8016942, "memory(GiB)": 135.77, "step": 29440, "train_speed(iter/s)": 0.202222 }, { "acc": 0.77874303, "epoch": 0.6870908003907902, "grad_norm": 6.28125, "learning_rate": 7.635929519441483e-06, "loss": 0.80179472, "memory(GiB)": 135.77, "step": 29450, "train_speed(iter/s)": 0.202259 }, { "acc": 0.76265059, "epoch": 0.6873241079630791, "grad_norm": 5.78125, "learning_rate": 7.634324058770598e-06, "loss": 0.86517248, "memory(GiB)": 135.77, "step": 29460, "train_speed(iter/s)": 0.202293 }, { "acc": 0.77614689, "epoch": 0.687557415535368, "grad_norm": 4.125, "learning_rate": 7.632718222050719e-06, "loss": 0.80183201, "memory(GiB)": 135.77, "step": 29470, "train_speed(iter/s)": 0.202329 }, { "acc": 0.76075153, "epoch": 0.6877907231076569, "grad_norm": 12.5, "learning_rate": 7.63111200951108e-06, "loss": 0.85887947, "memory(GiB)": 135.77, "step": 29480, "train_speed(iter/s)": 0.202363 }, { "acc": 0.76486063, "epoch": 0.6880240306799458, "grad_norm": 6.46875, "learning_rate": 7.629505421380965e-06, "loss": 0.83933964, "memory(GiB)": 135.77, "step": 29490, "train_speed(iter/s)": 0.2024 }, { "acc": 0.7917264, "epoch": 0.6882573382522347, "grad_norm": 5.4375, "learning_rate": 7.627898457889717e-06, "loss": 0.74840088, "memory(GiB)": 135.77, "step": 29500, "train_speed(iter/s)": 0.202435 }, { "epoch": 0.6882573382522347, "eval_acc": 0.7415806972391614, "eval_loss": 0.8159891366958618, "eval_runtime": 1269.8855, "eval_samples_per_second": 28.342, "eval_steps_per_second": 14.171, "step": 29500 }, { "acc": 0.78372993, "epoch": 0.6884906458245236, "grad_norm": 4.625, "learning_rate": 7.6262911192667245e-06, "loss": 0.7768353, "memory(GiB)": 135.77, "step": 29510, "train_speed(iter/s)": 0.200698 }, { "acc": 0.78984094, "epoch": 0.6887239533968125, "grad_norm": 5.9375, "learning_rate": 7.62468340574144e-06, "loss": 0.75353394, "memory(GiB)": 135.77, "step": 29520, "train_speed(iter/s)": 0.20073 }, { "acc": 0.78566389, "epoch": 0.6889572609691014, "grad_norm": 4.46875, "learning_rate": 7.623075317543361e-06, "loss": 0.74840536, "memory(GiB)": 135.77, "step": 29530, "train_speed(iter/s)": 0.200764 }, { "acc": 0.79994617, "epoch": 0.6891905685413903, "grad_norm": 6.40625, "learning_rate": 7.62146685490204e-06, "loss": 0.70613475, "memory(GiB)": 135.77, "step": 29540, "train_speed(iter/s)": 0.200797 }, { "acc": 0.78054943, "epoch": 0.6894238761136792, "grad_norm": 5.28125, "learning_rate": 7.6198580180470904e-06, "loss": 0.80197172, "memory(GiB)": 135.77, "step": 29550, "train_speed(iter/s)": 0.200835 }, { "acc": 0.75962677, "epoch": 0.689657183685968, "grad_norm": 5.9375, "learning_rate": 7.618248807208169e-06, "loss": 0.85815258, "memory(GiB)": 135.77, "step": 29560, "train_speed(iter/s)": 0.200868 }, { "acc": 0.76958151, "epoch": 0.6898904912582569, "grad_norm": 5.65625, "learning_rate": 7.61663922261499e-06, "loss": 0.82609196, "memory(GiB)": 135.77, "step": 29570, "train_speed(iter/s)": 0.200904 }, { "acc": 0.7712954, "epoch": 0.6901237988305458, "grad_norm": 4.90625, "learning_rate": 7.615029264497322e-06, "loss": 0.82603159, "memory(GiB)": 135.77, "step": 29580, "train_speed(iter/s)": 0.200938 }, { "acc": 0.76029787, "epoch": 0.6903571064028347, "grad_norm": 5.15625, "learning_rate": 7.6134189330849885e-06, "loss": 0.86350775, "memory(GiB)": 135.77, "step": 29590, "train_speed(iter/s)": 0.200973 }, { "acc": 0.77219863, "epoch": 0.6905904139751236, "grad_norm": 5.34375, "learning_rate": 7.611808228607859e-06, "loss": 0.82623463, "memory(GiB)": 135.77, "step": 29600, "train_speed(iter/s)": 0.201006 }, { "acc": 0.78632302, "epoch": 0.6908237215474125, "grad_norm": 6.53125, "learning_rate": 7.610197151295865e-06, "loss": 0.76113815, "memory(GiB)": 135.77, "step": 29610, "train_speed(iter/s)": 0.201042 }, { "acc": 0.78305635, "epoch": 0.6910570291197013, "grad_norm": 5.375, "learning_rate": 7.608585701378985e-06, "loss": 0.76118941, "memory(GiB)": 135.77, "step": 29620, "train_speed(iter/s)": 0.201074 }, { "acc": 0.7888319, "epoch": 0.6912903366919902, "grad_norm": 4.03125, "learning_rate": 7.6069738790872545e-06, "loss": 0.7636095, "memory(GiB)": 135.77, "step": 29630, "train_speed(iter/s)": 0.201108 }, { "acc": 0.77132835, "epoch": 0.6915236442642791, "grad_norm": 4.9375, "learning_rate": 7.6053616846507606e-06, "loss": 0.79716229, "memory(GiB)": 135.77, "step": 29640, "train_speed(iter/s)": 0.201143 }, { "acc": 0.7883872, "epoch": 0.691756951836568, "grad_norm": 3.296875, "learning_rate": 7.6037491182996415e-06, "loss": 0.77416801, "memory(GiB)": 135.77, "step": 29650, "train_speed(iter/s)": 0.201178 }, { "acc": 0.78499637, "epoch": 0.691990259408857, "grad_norm": 4.3125, "learning_rate": 7.602136180264094e-06, "loss": 0.77752113, "memory(GiB)": 135.77, "step": 29660, "train_speed(iter/s)": 0.201215 }, { "acc": 0.76922569, "epoch": 0.6922235669811458, "grad_norm": 6.0, "learning_rate": 7.6005228707743606e-06, "loss": 0.83131847, "memory(GiB)": 135.77, "step": 29670, "train_speed(iter/s)": 0.201242 }, { "acc": 0.77869415, "epoch": 0.6924568745534347, "grad_norm": 6.84375, "learning_rate": 7.598909190060744e-06, "loss": 0.8007268, "memory(GiB)": 135.77, "step": 29680, "train_speed(iter/s)": 0.201276 }, { "acc": 0.78622375, "epoch": 0.6926901821257236, "grad_norm": 5.0, "learning_rate": 7.597295138353596e-06, "loss": 0.75257502, "memory(GiB)": 135.77, "step": 29690, "train_speed(iter/s)": 0.201314 }, { "acc": 0.77413979, "epoch": 0.6929234896980125, "grad_norm": 5.6875, "learning_rate": 7.595680715883321e-06, "loss": 0.80841618, "memory(GiB)": 135.77, "step": 29700, "train_speed(iter/s)": 0.201349 }, { "acc": 0.77899332, "epoch": 0.6931567972703014, "grad_norm": 5.4375, "learning_rate": 7.594065922880378e-06, "loss": 0.78428011, "memory(GiB)": 135.77, "step": 29710, "train_speed(iter/s)": 0.201382 }, { "acc": 0.78220787, "epoch": 0.6933901048425903, "grad_norm": 3.53125, "learning_rate": 7.592450759575278e-06, "loss": 0.77431474, "memory(GiB)": 135.77, "step": 29720, "train_speed(iter/s)": 0.201414 }, { "acc": 0.78128052, "epoch": 0.6936234124148792, "grad_norm": 4.3125, "learning_rate": 7.590835226198585e-06, "loss": 0.79154778, "memory(GiB)": 135.77, "step": 29730, "train_speed(iter/s)": 0.201451 }, { "acc": 0.77904191, "epoch": 0.6938567199871681, "grad_norm": 5.28125, "learning_rate": 7.589219322980916e-06, "loss": 0.79745936, "memory(GiB)": 135.77, "step": 29740, "train_speed(iter/s)": 0.201485 }, { "acc": 0.78388314, "epoch": 0.6940900275594569, "grad_norm": 5.34375, "learning_rate": 7.587603050152941e-06, "loss": 0.79330082, "memory(GiB)": 135.77, "step": 29750, "train_speed(iter/s)": 0.20152 }, { "acc": 0.77729387, "epoch": 0.6943233351317458, "grad_norm": 4.75, "learning_rate": 7.585986407945383e-06, "loss": 0.79987221, "memory(GiB)": 135.77, "step": 29760, "train_speed(iter/s)": 0.201555 }, { "acc": 0.76541524, "epoch": 0.6945566427040347, "grad_norm": 5.78125, "learning_rate": 7.584369396589015e-06, "loss": 0.8547636, "memory(GiB)": 135.77, "step": 29770, "train_speed(iter/s)": 0.201591 }, { "acc": 0.75868149, "epoch": 0.6947899502763236, "grad_norm": 5.78125, "learning_rate": 7.582752016314669e-06, "loss": 0.84939613, "memory(GiB)": 135.77, "step": 29780, "train_speed(iter/s)": 0.201626 }, { "acc": 0.80678043, "epoch": 0.6950232578486125, "grad_norm": 4.90625, "learning_rate": 7.58113426735322e-06, "loss": 0.68545928, "memory(GiB)": 135.77, "step": 29790, "train_speed(iter/s)": 0.201662 }, { "acc": 0.75615764, "epoch": 0.6952565654209014, "grad_norm": 6.9375, "learning_rate": 7.579516149935606e-06, "loss": 0.86680984, "memory(GiB)": 135.77, "step": 29800, "train_speed(iter/s)": 0.201699 }, { "acc": 0.77607203, "epoch": 0.6954898729931903, "grad_norm": 4.8125, "learning_rate": 7.577897664292811e-06, "loss": 0.80326805, "memory(GiB)": 135.77, "step": 29810, "train_speed(iter/s)": 0.201734 }, { "acc": 0.78733912, "epoch": 0.6957231805654792, "grad_norm": 5.40625, "learning_rate": 7.57627881065587e-06, "loss": 0.77941236, "memory(GiB)": 135.77, "step": 29820, "train_speed(iter/s)": 0.201768 }, { "acc": 0.75822802, "epoch": 0.6959564881377681, "grad_norm": 4.65625, "learning_rate": 7.574659589255881e-06, "loss": 0.86697035, "memory(GiB)": 135.77, "step": 29830, "train_speed(iter/s)": 0.201804 }, { "acc": 0.74630752, "epoch": 0.696189795710057, "grad_norm": 5.25, "learning_rate": 7.573040000323984e-06, "loss": 0.92794971, "memory(GiB)": 135.77, "step": 29840, "train_speed(iter/s)": 0.201839 }, { "acc": 0.79665966, "epoch": 0.6964231032823459, "grad_norm": 5.46875, "learning_rate": 7.571420044091372e-06, "loss": 0.70622387, "memory(GiB)": 135.77, "step": 29850, "train_speed(iter/s)": 0.201874 }, { "acc": 0.76552095, "epoch": 0.6966564108546348, "grad_norm": 3.84375, "learning_rate": 7.569799720789297e-06, "loss": 0.86479321, "memory(GiB)": 135.77, "step": 29860, "train_speed(iter/s)": 0.201909 }, { "acc": 0.78278189, "epoch": 0.6968897184269237, "grad_norm": 5.90625, "learning_rate": 7.568179030649057e-06, "loss": 0.76828461, "memory(GiB)": 135.77, "step": 29870, "train_speed(iter/s)": 0.201945 }, { "acc": 0.76996212, "epoch": 0.6971230259992126, "grad_norm": 4.84375, "learning_rate": 7.566557973902007e-06, "loss": 0.85869331, "memory(GiB)": 135.77, "step": 29880, "train_speed(iter/s)": 0.201981 }, { "acc": 0.79064722, "epoch": 0.6973563335715015, "grad_norm": 5.96875, "learning_rate": 7.564936550779553e-06, "loss": 0.73188696, "memory(GiB)": 135.77, "step": 29890, "train_speed(iter/s)": 0.202018 }, { "acc": 0.77210503, "epoch": 0.6975896411437904, "grad_norm": 5.375, "learning_rate": 7.563314761513151e-06, "loss": 0.81467781, "memory(GiB)": 135.77, "step": 29900, "train_speed(iter/s)": 0.202053 }, { "acc": 0.79614601, "epoch": 0.6978229487160793, "grad_norm": 4.15625, "learning_rate": 7.56169260633431e-06, "loss": 0.71139755, "memory(GiB)": 135.77, "step": 29910, "train_speed(iter/s)": 0.202089 }, { "acc": 0.80187359, "epoch": 0.6980562562883682, "grad_norm": 4.40625, "learning_rate": 7.560070085474596e-06, "loss": 0.69751916, "memory(GiB)": 135.77, "step": 29920, "train_speed(iter/s)": 0.202123 }, { "acc": 0.77271166, "epoch": 0.6982895638606571, "grad_norm": 5.125, "learning_rate": 7.55844719916562e-06, "loss": 0.82679062, "memory(GiB)": 135.77, "step": 29930, "train_speed(iter/s)": 0.202157 }, { "acc": 0.77691226, "epoch": 0.6985228714329459, "grad_norm": 4.6875, "learning_rate": 7.556823947639048e-06, "loss": 0.79194837, "memory(GiB)": 135.77, "step": 29940, "train_speed(iter/s)": 0.202193 }, { "acc": 0.77552891, "epoch": 0.6987561790052348, "grad_norm": 5.84375, "learning_rate": 7.555200331126602e-06, "loss": 0.82497501, "memory(GiB)": 135.77, "step": 29950, "train_speed(iter/s)": 0.202227 }, { "acc": 0.77520766, "epoch": 0.6989894865775237, "grad_norm": 5.6875, "learning_rate": 7.55357634986005e-06, "loss": 0.81748133, "memory(GiB)": 135.77, "step": 29960, "train_speed(iter/s)": 0.202263 }, { "acc": 0.78164778, "epoch": 0.6992227941498126, "grad_norm": 5.75, "learning_rate": 7.551952004071217e-06, "loss": 0.78227224, "memory(GiB)": 135.77, "step": 29970, "train_speed(iter/s)": 0.202296 }, { "acc": 0.77848063, "epoch": 0.6994561017221015, "grad_norm": 6.4375, "learning_rate": 7.550327293991976e-06, "loss": 0.78909693, "memory(GiB)": 135.77, "step": 29980, "train_speed(iter/s)": 0.202326 }, { "acc": 0.75997553, "epoch": 0.6996894092943904, "grad_norm": 5.65625, "learning_rate": 7.5487022198542555e-06, "loss": 0.8917922, "memory(GiB)": 135.77, "step": 29990, "train_speed(iter/s)": 0.202361 }, { "acc": 0.78532553, "epoch": 0.6999227168666793, "grad_norm": 6.84375, "learning_rate": 7.547076781890032e-06, "loss": 0.7683835, "memory(GiB)": 135.77, "step": 30000, "train_speed(iter/s)": 0.202393 }, { "epoch": 0.6999227168666793, "eval_acc": 0.7416280999319227, "eval_loss": 0.8156638741493225, "eval_runtime": 1270.1597, "eval_samples_per_second": 28.336, "eval_steps_per_second": 14.168, "step": 30000 }, { "acc": 0.7785594, "epoch": 0.7001560244389682, "grad_norm": 7.5625, "learning_rate": 7.5454509803313394e-06, "loss": 0.79886799, "memory(GiB)": 135.77, "step": 30010, "train_speed(iter/s)": 0.200687 }, { "acc": 0.76497021, "epoch": 0.7003893320112571, "grad_norm": 4.71875, "learning_rate": 7.543824815410259e-06, "loss": 0.85206814, "memory(GiB)": 135.77, "step": 30020, "train_speed(iter/s)": 0.200722 }, { "acc": 0.78620939, "epoch": 0.700622639583546, "grad_norm": 5.46875, "learning_rate": 7.542198287358924e-06, "loss": 0.78464289, "memory(GiB)": 135.77, "step": 30030, "train_speed(iter/s)": 0.200747 }, { "acc": 0.76833134, "epoch": 0.7008559471558349, "grad_norm": 4.5625, "learning_rate": 7.540571396409522e-06, "loss": 0.8300333, "memory(GiB)": 135.77, "step": 30040, "train_speed(iter/s)": 0.200782 }, { "acc": 0.78864598, "epoch": 0.7010892547281238, "grad_norm": 4.0625, "learning_rate": 7.538944142794291e-06, "loss": 0.74310522, "memory(GiB)": 135.77, "step": 30050, "train_speed(iter/s)": 0.200815 }, { "acc": 0.76193199, "epoch": 0.7013225623004127, "grad_norm": 4.78125, "learning_rate": 7.537316526745522e-06, "loss": 0.86004868, "memory(GiB)": 135.77, "step": 30060, "train_speed(iter/s)": 0.200848 }, { "acc": 0.79334602, "epoch": 0.7015558698727016, "grad_norm": 5.125, "learning_rate": 7.535688548495557e-06, "loss": 0.73706589, "memory(GiB)": 135.77, "step": 30070, "train_speed(iter/s)": 0.200881 }, { "acc": 0.76868596, "epoch": 0.7017891774449905, "grad_norm": 7.25, "learning_rate": 7.534060208276786e-06, "loss": 0.81477661, "memory(GiB)": 135.77, "step": 30080, "train_speed(iter/s)": 0.200914 }, { "acc": 0.74454851, "epoch": 0.7020224850172794, "grad_norm": 5.78125, "learning_rate": 7.532431506321657e-06, "loss": 0.95340862, "memory(GiB)": 135.77, "step": 30090, "train_speed(iter/s)": 0.200948 }, { "acc": 0.76665716, "epoch": 0.7022557925895683, "grad_norm": 6.03125, "learning_rate": 7.530802442862666e-06, "loss": 0.84706726, "memory(GiB)": 135.77, "step": 30100, "train_speed(iter/s)": 0.20098 }, { "acc": 0.78391695, "epoch": 0.7024891001618572, "grad_norm": 5.1875, "learning_rate": 7.529173018132362e-06, "loss": 0.77013426, "memory(GiB)": 135.77, "step": 30110, "train_speed(iter/s)": 0.201015 }, { "acc": 0.80045862, "epoch": 0.7027224077341461, "grad_norm": 6.0, "learning_rate": 7.5275432323633446e-06, "loss": 0.71537471, "memory(GiB)": 135.77, "step": 30120, "train_speed(iter/s)": 0.20105 }, { "acc": 0.76974192, "epoch": 0.702955715306435, "grad_norm": 4.5, "learning_rate": 7.525913085788264e-06, "loss": 0.81845627, "memory(GiB)": 135.77, "step": 30130, "train_speed(iter/s)": 0.201085 }, { "acc": 0.7624217, "epoch": 0.7031890228787238, "grad_norm": 5.625, "learning_rate": 7.524282578639825e-06, "loss": 0.85741539, "memory(GiB)": 135.77, "step": 30140, "train_speed(iter/s)": 0.201118 }, { "acc": 0.77117977, "epoch": 0.7034223304510127, "grad_norm": 5.5, "learning_rate": 7.522651711150781e-06, "loss": 0.82536449, "memory(GiB)": 135.77, "step": 30150, "train_speed(iter/s)": 0.201154 }, { "acc": 0.76665888, "epoch": 0.7036556380233016, "grad_norm": 4.8125, "learning_rate": 7.521020483553939e-06, "loss": 0.82699442, "memory(GiB)": 135.77, "step": 30160, "train_speed(iter/s)": 0.201188 }, { "acc": 0.76989846, "epoch": 0.7038889455955905, "grad_norm": 4.75, "learning_rate": 7.519388896082154e-06, "loss": 0.83109045, "memory(GiB)": 135.77, "step": 30170, "train_speed(iter/s)": 0.201223 }, { "acc": 0.77924395, "epoch": 0.7041222531678794, "grad_norm": 4.78125, "learning_rate": 7.517756948968338e-06, "loss": 0.7907198, "memory(GiB)": 135.77, "step": 30180, "train_speed(iter/s)": 0.201257 }, { "acc": 0.76878471, "epoch": 0.7043555607401683, "grad_norm": 5.25, "learning_rate": 7.516124642445447e-06, "loss": 0.84302406, "memory(GiB)": 135.77, "step": 30190, "train_speed(iter/s)": 0.201291 }, { "acc": 0.77351613, "epoch": 0.7045888683124572, "grad_norm": 5.4375, "learning_rate": 7.514491976746494e-06, "loss": 0.81995602, "memory(GiB)": 135.77, "step": 30200, "train_speed(iter/s)": 0.201326 }, { "acc": 0.76795378, "epoch": 0.704822175884746, "grad_norm": 5.9375, "learning_rate": 7.512858952104544e-06, "loss": 0.83630714, "memory(GiB)": 135.77, "step": 30210, "train_speed(iter/s)": 0.201361 }, { "acc": 0.75973902, "epoch": 0.705055483457035, "grad_norm": 6.03125, "learning_rate": 7.511225568752707e-06, "loss": 0.854778, "memory(GiB)": 135.77, "step": 30220, "train_speed(iter/s)": 0.201397 }, { "acc": 0.7747612, "epoch": 0.7052887910293238, "grad_norm": 4.21875, "learning_rate": 7.50959182692415e-06, "loss": 0.81778402, "memory(GiB)": 135.77, "step": 30230, "train_speed(iter/s)": 0.201434 }, { "acc": 0.77487125, "epoch": 0.7055220986016127, "grad_norm": 3.984375, "learning_rate": 7.507957726852087e-06, "loss": 0.78481297, "memory(GiB)": 135.77, "step": 30240, "train_speed(iter/s)": 0.201469 }, { "acc": 0.77745609, "epoch": 0.7057554061739016, "grad_norm": 6.03125, "learning_rate": 7.506323268769788e-06, "loss": 0.7958005, "memory(GiB)": 135.77, "step": 30250, "train_speed(iter/s)": 0.201506 }, { "acc": 0.77071285, "epoch": 0.7059887137461905, "grad_norm": 4.8125, "learning_rate": 7.504688452910571e-06, "loss": 0.81425562, "memory(GiB)": 135.77, "step": 30260, "train_speed(iter/s)": 0.201541 }, { "acc": 0.77105627, "epoch": 0.7062220213184794, "grad_norm": 5.78125, "learning_rate": 7.503053279507806e-06, "loss": 0.82091007, "memory(GiB)": 135.77, "step": 30270, "train_speed(iter/s)": 0.201575 }, { "acc": 0.79748535, "epoch": 0.7064553288907683, "grad_norm": 5.21875, "learning_rate": 7.501417748794911e-06, "loss": 0.73630304, "memory(GiB)": 135.77, "step": 30280, "train_speed(iter/s)": 0.20161 }, { "acc": 0.78553853, "epoch": 0.7066886364630572, "grad_norm": 6.0625, "learning_rate": 7.49978186100536e-06, "loss": 0.77834482, "memory(GiB)": 135.77, "step": 30290, "train_speed(iter/s)": 0.201643 }, { "acc": 0.77966051, "epoch": 0.7069219440353461, "grad_norm": 4.90625, "learning_rate": 7.498145616372674e-06, "loss": 0.77920589, "memory(GiB)": 135.77, "step": 30300, "train_speed(iter/s)": 0.201678 }, { "acc": 0.78804102, "epoch": 0.707155251607635, "grad_norm": 4.875, "learning_rate": 7.4965090151304265e-06, "loss": 0.75938053, "memory(GiB)": 135.77, "step": 30310, "train_speed(iter/s)": 0.201711 }, { "acc": 0.7741745, "epoch": 0.7073885591799239, "grad_norm": 4.6875, "learning_rate": 7.494872057512242e-06, "loss": 0.82201767, "memory(GiB)": 135.77, "step": 30320, "train_speed(iter/s)": 0.201744 }, { "acc": 0.78020763, "epoch": 0.7076218667522127, "grad_norm": 4.25, "learning_rate": 7.493234743751797e-06, "loss": 0.77473249, "memory(GiB)": 135.77, "step": 30330, "train_speed(iter/s)": 0.201778 }, { "acc": 0.78065357, "epoch": 0.7078551743245016, "grad_norm": 8.8125, "learning_rate": 7.491597074082817e-06, "loss": 0.79642534, "memory(GiB)": 135.77, "step": 30340, "train_speed(iter/s)": 0.201811 }, { "acc": 0.77584829, "epoch": 0.7080884818967905, "grad_norm": 7.34375, "learning_rate": 7.489959048739079e-06, "loss": 0.81745863, "memory(GiB)": 135.77, "step": 30350, "train_speed(iter/s)": 0.201842 }, { "acc": 0.77412333, "epoch": 0.7083217894690794, "grad_norm": 9.125, "learning_rate": 7.488320667954408e-06, "loss": 0.81564293, "memory(GiB)": 135.77, "step": 30360, "train_speed(iter/s)": 0.201876 }, { "acc": 0.76567802, "epoch": 0.7085550970413683, "grad_norm": 4.96875, "learning_rate": 7.486681931962686e-06, "loss": 0.84731541, "memory(GiB)": 135.77, "step": 30370, "train_speed(iter/s)": 0.20191 }, { "acc": 0.76095247, "epoch": 0.7087884046136572, "grad_norm": 5.65625, "learning_rate": 7.48504284099784e-06, "loss": 0.87232685, "memory(GiB)": 135.77, "step": 30380, "train_speed(iter/s)": 0.201943 }, { "acc": 0.77653809, "epoch": 0.7090217121859461, "grad_norm": 6.53125, "learning_rate": 7.48340339529385e-06, "loss": 0.78932204, "memory(GiB)": 135.77, "step": 30390, "train_speed(iter/s)": 0.201979 }, { "acc": 0.78220081, "epoch": 0.709255019758235, "grad_norm": 5.34375, "learning_rate": 7.481763595084747e-06, "loss": 0.77936535, "memory(GiB)": 135.77, "step": 30400, "train_speed(iter/s)": 0.202013 }, { "acc": 0.7953701, "epoch": 0.7094883273305239, "grad_norm": 3.890625, "learning_rate": 7.480123440604613e-06, "loss": 0.73328466, "memory(GiB)": 135.77, "step": 30410, "train_speed(iter/s)": 0.202048 }, { "acc": 0.77866468, "epoch": 0.7097216349028128, "grad_norm": 6.71875, "learning_rate": 7.478482932087577e-06, "loss": 0.80838766, "memory(GiB)": 135.77, "step": 30420, "train_speed(iter/s)": 0.202079 }, { "acc": 0.78703556, "epoch": 0.7099549424751017, "grad_norm": 4.125, "learning_rate": 7.476842069767824e-06, "loss": 0.77653255, "memory(GiB)": 135.77, "step": 30430, "train_speed(iter/s)": 0.202113 }, { "acc": 0.77524529, "epoch": 0.7101882500473906, "grad_norm": 7.28125, "learning_rate": 7.475200853879583e-06, "loss": 0.79806619, "memory(GiB)": 135.77, "step": 30440, "train_speed(iter/s)": 0.202148 }, { "acc": 0.7719944, "epoch": 0.7104215576196795, "grad_norm": 5.21875, "learning_rate": 7.473559284657139e-06, "loss": 0.82557144, "memory(GiB)": 135.77, "step": 30450, "train_speed(iter/s)": 0.202185 }, { "acc": 0.77690635, "epoch": 0.7106548651919684, "grad_norm": 6.40625, "learning_rate": 7.471917362334828e-06, "loss": 0.79608321, "memory(GiB)": 135.77, "step": 30460, "train_speed(iter/s)": 0.20222 }, { "acc": 0.76047997, "epoch": 0.7108881727642573, "grad_norm": 5.75, "learning_rate": 7.47027508714703e-06, "loss": 0.8347681, "memory(GiB)": 135.77, "step": 30470, "train_speed(iter/s)": 0.202255 }, { "acc": 0.78284283, "epoch": 0.7111214803365462, "grad_norm": 4.53125, "learning_rate": 7.468632459328181e-06, "loss": 0.76897001, "memory(GiB)": 135.77, "step": 30480, "train_speed(iter/s)": 0.202289 }, { "acc": 0.77444568, "epoch": 0.7113547879088351, "grad_norm": 6.71875, "learning_rate": 7.466989479112766e-06, "loss": 0.81205473, "memory(GiB)": 135.77, "step": 30490, "train_speed(iter/s)": 0.202323 }, { "acc": 0.76428804, "epoch": 0.711588095481124, "grad_norm": 5.90625, "learning_rate": 7.465346146735319e-06, "loss": 0.83355408, "memory(GiB)": 135.77, "step": 30500, "train_speed(iter/s)": 0.202361 }, { "epoch": 0.711588095481124, "eval_acc": 0.7417022467114782, "eval_loss": 0.8149046301841736, "eval_runtime": 1270.5935, "eval_samples_per_second": 28.326, "eval_steps_per_second": 14.163, "step": 30500 }, { "acc": 0.77408185, "epoch": 0.7118214030534129, "grad_norm": 7.84375, "learning_rate": 7.463702462430427e-06, "loss": 0.82522774, "memory(GiB)": 135.77, "step": 30510, "train_speed(iter/s)": 0.20068 }, { "acc": 0.78299146, "epoch": 0.7120547106257017, "grad_norm": 4.4375, "learning_rate": 7.4620584264327236e-06, "loss": 0.79395037, "memory(GiB)": 135.77, "step": 30520, "train_speed(iter/s)": 0.200715 }, { "acc": 0.7901659, "epoch": 0.7122880181979906, "grad_norm": 5.03125, "learning_rate": 7.460414038976894e-06, "loss": 0.73990803, "memory(GiB)": 135.77, "step": 30530, "train_speed(iter/s)": 0.200747 }, { "acc": 0.76877952, "epoch": 0.7125213257702795, "grad_norm": 4.9375, "learning_rate": 7.458769300297676e-06, "loss": 0.82868195, "memory(GiB)": 135.77, "step": 30540, "train_speed(iter/s)": 0.200778 }, { "acc": 0.77149434, "epoch": 0.7127546333425684, "grad_norm": 6.40625, "learning_rate": 7.457124210629853e-06, "loss": 0.80213509, "memory(GiB)": 135.77, "step": 30550, "train_speed(iter/s)": 0.200813 }, { "acc": 0.76947355, "epoch": 0.7129879409148573, "grad_norm": 8.5, "learning_rate": 7.455478770208267e-06, "loss": 0.84286346, "memory(GiB)": 135.77, "step": 30560, "train_speed(iter/s)": 0.200847 }, { "acc": 0.77109156, "epoch": 0.7132212484871462, "grad_norm": 4.3125, "learning_rate": 7.453832979267796e-06, "loss": 0.81874142, "memory(GiB)": 135.77, "step": 30570, "train_speed(iter/s)": 0.200882 }, { "acc": 0.77773933, "epoch": 0.7134545560594351, "grad_norm": 5.46875, "learning_rate": 7.452186838043381e-06, "loss": 0.79809036, "memory(GiB)": 135.77, "step": 30580, "train_speed(iter/s)": 0.200915 }, { "acc": 0.78017259, "epoch": 0.713687863631724, "grad_norm": 4.34375, "learning_rate": 7.450540346770008e-06, "loss": 0.79023871, "memory(GiB)": 135.77, "step": 30590, "train_speed(iter/s)": 0.200952 }, { "acc": 0.78224049, "epoch": 0.7139211712040129, "grad_norm": 5.5625, "learning_rate": 7.4488935056827115e-06, "loss": 0.79336486, "memory(GiB)": 135.77, "step": 30600, "train_speed(iter/s)": 0.200985 }, { "acc": 0.7667264, "epoch": 0.7141544787763018, "grad_norm": 5.53125, "learning_rate": 7.447246315016579e-06, "loss": 0.84322777, "memory(GiB)": 135.77, "step": 30610, "train_speed(iter/s)": 0.201018 }, { "acc": 0.78506231, "epoch": 0.7143877863485907, "grad_norm": 6.4375, "learning_rate": 7.445598775006745e-06, "loss": 0.77431684, "memory(GiB)": 135.77, "step": 30620, "train_speed(iter/s)": 0.201053 }, { "acc": 0.76365628, "epoch": 0.7146210939208796, "grad_norm": 6.59375, "learning_rate": 7.443950885888398e-06, "loss": 0.85304022, "memory(GiB)": 135.77, "step": 30630, "train_speed(iter/s)": 0.201088 }, { "acc": 0.7706419, "epoch": 0.7148544014931685, "grad_norm": 5.8125, "learning_rate": 7.4423026478967706e-06, "loss": 0.83546371, "memory(GiB)": 135.77, "step": 30640, "train_speed(iter/s)": 0.201119 }, { "acc": 0.76318879, "epoch": 0.7150877090654574, "grad_norm": 4.78125, "learning_rate": 7.440654061267151e-06, "loss": 0.84243946, "memory(GiB)": 135.77, "step": 30650, "train_speed(iter/s)": 0.201148 }, { "acc": 0.7682981, "epoch": 0.7153210166377463, "grad_norm": 5.5625, "learning_rate": 7.439005126234872e-06, "loss": 0.82606144, "memory(GiB)": 135.77, "step": 30660, "train_speed(iter/s)": 0.201182 }, { "acc": 0.76082344, "epoch": 0.7155543242100352, "grad_norm": 5.3125, "learning_rate": 7.43735584303532e-06, "loss": 0.86378193, "memory(GiB)": 135.77, "step": 30670, "train_speed(iter/s)": 0.201216 }, { "acc": 0.77880592, "epoch": 0.7157876317823241, "grad_norm": 5.34375, "learning_rate": 7.435706211903929e-06, "loss": 0.8006321, "memory(GiB)": 135.77, "step": 30680, "train_speed(iter/s)": 0.201249 }, { "acc": 0.78893566, "epoch": 0.716020939354613, "grad_norm": 6.03125, "learning_rate": 7.434056233076184e-06, "loss": 0.74485593, "memory(GiB)": 135.77, "step": 30690, "train_speed(iter/s)": 0.201283 }, { "acc": 0.77817774, "epoch": 0.7162542469269019, "grad_norm": 6.84375, "learning_rate": 7.43240590678762e-06, "loss": 0.76789885, "memory(GiB)": 135.77, "step": 30700, "train_speed(iter/s)": 0.201319 }, { "acc": 0.77524462, "epoch": 0.7164875544991907, "grad_norm": 5.9375, "learning_rate": 7.4307552332738184e-06, "loss": 0.79615641, "memory(GiB)": 135.77, "step": 30710, "train_speed(iter/s)": 0.201355 }, { "acc": 0.76029038, "epoch": 0.7167208620714796, "grad_norm": 5.53125, "learning_rate": 7.429104212770414e-06, "loss": 0.88227682, "memory(GiB)": 135.77, "step": 30720, "train_speed(iter/s)": 0.201386 }, { "acc": 0.7853837, "epoch": 0.7169541696437685, "grad_norm": 4.1875, "learning_rate": 7.427452845513088e-06, "loss": 0.78255091, "memory(GiB)": 135.77, "step": 30730, "train_speed(iter/s)": 0.201419 }, { "acc": 0.7675179, "epoch": 0.7171874772160574, "grad_norm": 10.1875, "learning_rate": 7.4258011317375735e-06, "loss": 0.83768845, "memory(GiB)": 135.77, "step": 30740, "train_speed(iter/s)": 0.201452 }, { "acc": 0.79261498, "epoch": 0.7174207847883463, "grad_norm": 6.875, "learning_rate": 7.424149071679654e-06, "loss": 0.74290218, "memory(GiB)": 135.77, "step": 30750, "train_speed(iter/s)": 0.201484 }, { "acc": 0.78618517, "epoch": 0.7176540923606352, "grad_norm": 4.46875, "learning_rate": 7.422496665575156e-06, "loss": 0.75908709, "memory(GiB)": 135.77, "step": 30760, "train_speed(iter/s)": 0.201519 }, { "acc": 0.79118176, "epoch": 0.717887399932924, "grad_norm": 4.71875, "learning_rate": 7.420843913659965e-06, "loss": 0.75808463, "memory(GiB)": 135.77, "step": 30770, "train_speed(iter/s)": 0.201554 }, { "acc": 0.77106438, "epoch": 0.718120707505213, "grad_norm": 5.4375, "learning_rate": 7.419190816170008e-06, "loss": 0.83572817, "memory(GiB)": 135.77, "step": 30780, "train_speed(iter/s)": 0.201588 }, { "acc": 0.76454983, "epoch": 0.7183540150775019, "grad_norm": 4.4375, "learning_rate": 7.417537373341263e-06, "loss": 0.86166372, "memory(GiB)": 135.77, "step": 30790, "train_speed(iter/s)": 0.201623 }, { "acc": 0.77741642, "epoch": 0.7185873226497907, "grad_norm": 4.59375, "learning_rate": 7.415883585409762e-06, "loss": 0.79155254, "memory(GiB)": 135.77, "step": 30800, "train_speed(iter/s)": 0.201659 }, { "acc": 0.78092527, "epoch": 0.7188206302220796, "grad_norm": 5.4375, "learning_rate": 7.414229452611582e-06, "loss": 0.78085165, "memory(GiB)": 135.77, "step": 30810, "train_speed(iter/s)": 0.201694 }, { "acc": 0.75764637, "epoch": 0.7190539377943685, "grad_norm": 4.59375, "learning_rate": 7.412574975182848e-06, "loss": 0.88091917, "memory(GiB)": 135.77, "step": 30820, "train_speed(iter/s)": 0.201727 }, { "acc": 0.77109008, "epoch": 0.7192872453666574, "grad_norm": 4.34375, "learning_rate": 7.410920153359736e-06, "loss": 0.8264492, "memory(GiB)": 135.77, "step": 30830, "train_speed(iter/s)": 0.201761 }, { "acc": 0.76509042, "epoch": 0.7195205529389463, "grad_norm": 11.5625, "learning_rate": 7.409264987378473e-06, "loss": 0.83569603, "memory(GiB)": 135.77, "step": 30840, "train_speed(iter/s)": 0.201795 }, { "acc": 0.75531335, "epoch": 0.7197538605112352, "grad_norm": 6.625, "learning_rate": 7.407609477475334e-06, "loss": 0.87194748, "memory(GiB)": 135.77, "step": 30850, "train_speed(iter/s)": 0.201829 }, { "acc": 0.78215508, "epoch": 0.7199871680835241, "grad_norm": 4.90625, "learning_rate": 7.405953623886642e-06, "loss": 0.78222771, "memory(GiB)": 135.77, "step": 30860, "train_speed(iter/s)": 0.201862 }, { "acc": 0.77151766, "epoch": 0.720220475655813, "grad_norm": 9.6875, "learning_rate": 7.404297426848768e-06, "loss": 0.82328587, "memory(GiB)": 135.77, "step": 30870, "train_speed(iter/s)": 0.201896 }, { "acc": 0.74816885, "epoch": 0.7204537832281019, "grad_norm": 7.03125, "learning_rate": 7.4026408865981335e-06, "loss": 0.90460873, "memory(GiB)": 135.77, "step": 30880, "train_speed(iter/s)": 0.201931 }, { "acc": 0.77453132, "epoch": 0.7206870908003908, "grad_norm": 6.71875, "learning_rate": 7.400984003371211e-06, "loss": 0.83123531, "memory(GiB)": 135.77, "step": 30890, "train_speed(iter/s)": 0.201965 }, { "acc": 0.78442779, "epoch": 0.7209203983726797, "grad_norm": 5.75, "learning_rate": 7.3993267774045206e-06, "loss": 0.7607954, "memory(GiB)": 135.77, "step": 30900, "train_speed(iter/s)": 0.201998 }, { "acc": 0.79025621, "epoch": 0.7211537059449685, "grad_norm": 13.8125, "learning_rate": 7.397669208934628e-06, "loss": 0.77729826, "memory(GiB)": 135.77, "step": 30910, "train_speed(iter/s)": 0.202029 }, { "acc": 0.7896234, "epoch": 0.7213870135172574, "grad_norm": 4.25, "learning_rate": 7.396011298198155e-06, "loss": 0.74766607, "memory(GiB)": 135.77, "step": 30920, "train_speed(iter/s)": 0.202061 }, { "acc": 0.76902571, "epoch": 0.7216203210895463, "grad_norm": 5.625, "learning_rate": 7.394353045431765e-06, "loss": 0.83471451, "memory(GiB)": 135.77, "step": 30930, "train_speed(iter/s)": 0.202094 }, { "acc": 0.76524734, "epoch": 0.7218536286618352, "grad_norm": 6.84375, "learning_rate": 7.392694450872171e-06, "loss": 0.84251699, "memory(GiB)": 135.77, "step": 30940, "train_speed(iter/s)": 0.202129 }, { "acc": 0.75952892, "epoch": 0.7220869362341241, "grad_norm": 10.0625, "learning_rate": 7.3910355147561394e-06, "loss": 0.86127234, "memory(GiB)": 135.77, "step": 30950, "train_speed(iter/s)": 0.202164 }, { "acc": 0.78890076, "epoch": 0.722320243806413, "grad_norm": 5.09375, "learning_rate": 7.389376237320485e-06, "loss": 0.73546257, "memory(GiB)": 135.77, "step": 30960, "train_speed(iter/s)": 0.202198 }, { "acc": 0.79085941, "epoch": 0.7225535513787019, "grad_norm": 4.46875, "learning_rate": 7.387716618802064e-06, "loss": 0.73829441, "memory(GiB)": 135.77, "step": 30970, "train_speed(iter/s)": 0.20223 }, { "acc": 0.76388206, "epoch": 0.7227868589509908, "grad_norm": 3.796875, "learning_rate": 7.386056659437792e-06, "loss": 0.84366646, "memory(GiB)": 135.77, "step": 30980, "train_speed(iter/s)": 0.202265 }, { "acc": 0.77729826, "epoch": 0.7230201665232797, "grad_norm": 5.21875, "learning_rate": 7.384396359464623e-06, "loss": 0.80204477, "memory(GiB)": 135.77, "step": 30990, "train_speed(iter/s)": 0.202296 }, { "acc": 0.77947817, "epoch": 0.7232534740955686, "grad_norm": 4.5, "learning_rate": 7.382735719119568e-06, "loss": 0.77528582, "memory(GiB)": 135.77, "step": 31000, "train_speed(iter/s)": 0.20233 }, { "epoch": 0.7232534740955686, "eval_acc": 0.7419815382526118, "eval_loss": 0.8143436312675476, "eval_runtime": 1269.6984, "eval_samples_per_second": 28.346, "eval_steps_per_second": 14.173, "step": 31000 }, { "acc": 0.77952061, "epoch": 0.7234867816678575, "grad_norm": 5.15625, "learning_rate": 7.38107473863968e-06, "loss": 0.80758448, "memory(GiB)": 135.77, "step": 31010, "train_speed(iter/s)": 0.200679 }, { "acc": 0.76929088, "epoch": 0.7237200892401464, "grad_norm": 4.75, "learning_rate": 7.3794134182620646e-06, "loss": 0.84603424, "memory(GiB)": 135.77, "step": 31020, "train_speed(iter/s)": 0.200712 }, { "acc": 0.78471966, "epoch": 0.7239533968124353, "grad_norm": 5.625, "learning_rate": 7.377751758223876e-06, "loss": 0.77363634, "memory(GiB)": 135.77, "step": 31030, "train_speed(iter/s)": 0.200746 }, { "acc": 0.75998449, "epoch": 0.7241867043847242, "grad_norm": 5.96875, "learning_rate": 7.376089758762315e-06, "loss": 0.86344814, "memory(GiB)": 135.77, "step": 31040, "train_speed(iter/s)": 0.20078 }, { "acc": 0.79045277, "epoch": 0.7244200119570131, "grad_norm": 7.46875, "learning_rate": 7.374427420114629e-06, "loss": 0.74859829, "memory(GiB)": 135.77, "step": 31050, "train_speed(iter/s)": 0.20081 }, { "acc": 0.78292723, "epoch": 0.724653319529302, "grad_norm": 4.34375, "learning_rate": 7.37276474251812e-06, "loss": 0.78144426, "memory(GiB)": 135.77, "step": 31060, "train_speed(iter/s)": 0.200841 }, { "acc": 0.77971792, "epoch": 0.7248866271015909, "grad_norm": 4.3125, "learning_rate": 7.371101726210135e-06, "loss": 0.79532051, "memory(GiB)": 135.77, "step": 31070, "train_speed(iter/s)": 0.200875 }, { "acc": 0.7901329, "epoch": 0.7251199346738798, "grad_norm": 7.375, "learning_rate": 7.369438371428065e-06, "loss": 0.75919361, "memory(GiB)": 135.77, "step": 31080, "train_speed(iter/s)": 0.200907 }, { "acc": 0.77284174, "epoch": 0.7253532422461687, "grad_norm": 7.5625, "learning_rate": 7.367774678409357e-06, "loss": 0.81404877, "memory(GiB)": 135.77, "step": 31090, "train_speed(iter/s)": 0.200941 }, { "acc": 0.78015275, "epoch": 0.7255865498184575, "grad_norm": 4.8125, "learning_rate": 7.366110647391501e-06, "loss": 0.78611984, "memory(GiB)": 135.77, "step": 31100, "train_speed(iter/s)": 0.200974 }, { "acc": 0.79383612, "epoch": 0.7258198573907464, "grad_norm": 4.65625, "learning_rate": 7.364446278612036e-06, "loss": 0.7279952, "memory(GiB)": 135.77, "step": 31110, "train_speed(iter/s)": 0.201005 }, { "acc": 0.7822289, "epoch": 0.7260531649630353, "grad_norm": 4.59375, "learning_rate": 7.3627815723085535e-06, "loss": 0.77796512, "memory(GiB)": 135.77, "step": 31120, "train_speed(iter/s)": 0.201037 }, { "acc": 0.77322626, "epoch": 0.7262864725353242, "grad_norm": 8.75, "learning_rate": 7.361116528718688e-06, "loss": 0.80861044, "memory(GiB)": 135.77, "step": 31130, "train_speed(iter/s)": 0.201071 }, { "acc": 0.77036018, "epoch": 0.7265197801076131, "grad_norm": 8.0, "learning_rate": 7.359451148080123e-06, "loss": 0.83087883, "memory(GiB)": 135.77, "step": 31140, "train_speed(iter/s)": 0.201104 }, { "acc": 0.77294779, "epoch": 0.726753087679902, "grad_norm": 5.96875, "learning_rate": 7.357785430630593e-06, "loss": 0.81432848, "memory(GiB)": 135.77, "step": 31150, "train_speed(iter/s)": 0.201138 }, { "acc": 0.78335571, "epoch": 0.7269863952521909, "grad_norm": 4.0625, "learning_rate": 7.356119376607877e-06, "loss": 0.78180637, "memory(GiB)": 135.77, "step": 31160, "train_speed(iter/s)": 0.201171 }, { "acc": 0.75630546, "epoch": 0.7272197028244798, "grad_norm": 6.03125, "learning_rate": 7.354452986249805e-06, "loss": 0.88844528, "memory(GiB)": 135.77, "step": 31170, "train_speed(iter/s)": 0.201206 }, { "acc": 0.76719637, "epoch": 0.7274530103967687, "grad_norm": 9.9375, "learning_rate": 7.352786259794252e-06, "loss": 0.84311905, "memory(GiB)": 135.77, "step": 31180, "train_speed(iter/s)": 0.201239 }, { "acc": 0.78604798, "epoch": 0.7276863179690576, "grad_norm": 6.09375, "learning_rate": 7.351119197479144e-06, "loss": 0.7596261, "memory(GiB)": 135.77, "step": 31190, "train_speed(iter/s)": 0.201273 }, { "acc": 0.78300457, "epoch": 0.7279196255413465, "grad_norm": 6.0, "learning_rate": 7.349451799542455e-06, "loss": 0.75782876, "memory(GiB)": 135.77, "step": 31200, "train_speed(iter/s)": 0.201308 }, { "acc": 0.79482107, "epoch": 0.7281529331136354, "grad_norm": 6.25, "learning_rate": 7.3477840662222045e-06, "loss": 0.73927917, "memory(GiB)": 135.77, "step": 31210, "train_speed(iter/s)": 0.201342 }, { "acc": 0.7678937, "epoch": 0.7283862406859243, "grad_norm": 4.4375, "learning_rate": 7.346115997756459e-06, "loss": 0.82241888, "memory(GiB)": 135.77, "step": 31220, "train_speed(iter/s)": 0.201377 }, { "acc": 0.78195829, "epoch": 0.7286195482582132, "grad_norm": 4.65625, "learning_rate": 7.3444475943833375e-06, "loss": 0.78644238, "memory(GiB)": 135.77, "step": 31230, "train_speed(iter/s)": 0.201413 }, { "acc": 0.78667397, "epoch": 0.7288528558305021, "grad_norm": 6.96875, "learning_rate": 7.342778856341002e-06, "loss": 0.78144002, "memory(GiB)": 135.77, "step": 31240, "train_speed(iter/s)": 0.201444 }, { "acc": 0.75145693, "epoch": 0.729086163402791, "grad_norm": 4.65625, "learning_rate": 7.3411097838676645e-06, "loss": 0.92597151, "memory(GiB)": 135.77, "step": 31250, "train_speed(iter/s)": 0.201478 }, { "acc": 0.78834095, "epoch": 0.7293194709750799, "grad_norm": 6.09375, "learning_rate": 7.339440377201588e-06, "loss": 0.76001892, "memory(GiB)": 135.77, "step": 31260, "train_speed(iter/s)": 0.201512 }, { "acc": 0.76082201, "epoch": 0.7295527785473688, "grad_norm": 7.3125, "learning_rate": 7.337770636581075e-06, "loss": 0.87978334, "memory(GiB)": 135.77, "step": 31270, "train_speed(iter/s)": 0.201545 }, { "acc": 0.78952765, "epoch": 0.7297860861196577, "grad_norm": 4.5, "learning_rate": 7.3361005622444834e-06, "loss": 0.76534405, "memory(GiB)": 135.77, "step": 31280, "train_speed(iter/s)": 0.201581 }, { "acc": 0.78288898, "epoch": 0.7300193936919465, "grad_norm": 6.5, "learning_rate": 7.334430154430217e-06, "loss": 0.79545956, "memory(GiB)": 135.77, "step": 31290, "train_speed(iter/s)": 0.201612 }, { "acc": 0.76784639, "epoch": 0.7302527012642354, "grad_norm": 8.0625, "learning_rate": 7.332759413376721e-06, "loss": 0.83361969, "memory(GiB)": 135.77, "step": 31300, "train_speed(iter/s)": 0.201644 }, { "acc": 0.78950572, "epoch": 0.7304860088365243, "grad_norm": 4.5, "learning_rate": 7.331088339322499e-06, "loss": 0.7704586, "memory(GiB)": 135.77, "step": 31310, "train_speed(iter/s)": 0.201676 }, { "acc": 0.77479763, "epoch": 0.7307193164088132, "grad_norm": 5.15625, "learning_rate": 7.3294169325060925e-06, "loss": 0.81133862, "memory(GiB)": 135.77, "step": 31320, "train_speed(iter/s)": 0.20171 }, { "acc": 0.76270666, "epoch": 0.7309526239811021, "grad_norm": 5.875, "learning_rate": 7.327745193166096e-06, "loss": 0.86329832, "memory(GiB)": 135.77, "step": 31330, "train_speed(iter/s)": 0.201744 }, { "acc": 0.77042761, "epoch": 0.731185931553391, "grad_norm": 8.1875, "learning_rate": 7.3260731215411484e-06, "loss": 0.84471989, "memory(GiB)": 135.77, "step": 31340, "train_speed(iter/s)": 0.201777 }, { "acc": 0.76804247, "epoch": 0.7314192391256799, "grad_norm": 7.75, "learning_rate": 7.32440071786994e-06, "loss": 0.84253273, "memory(GiB)": 135.77, "step": 31350, "train_speed(iter/s)": 0.20181 }, { "acc": 0.77282157, "epoch": 0.7316525466979688, "grad_norm": 6.4375, "learning_rate": 7.322727982391203e-06, "loss": 0.82727652, "memory(GiB)": 135.77, "step": 31360, "train_speed(iter/s)": 0.201842 }, { "acc": 0.7756979, "epoch": 0.7318858542702577, "grad_norm": 6.59375, "learning_rate": 7.321054915343722e-06, "loss": 0.82158718, "memory(GiB)": 135.77, "step": 31370, "train_speed(iter/s)": 0.201877 }, { "acc": 0.75797024, "epoch": 0.7321191618425466, "grad_norm": 6.5625, "learning_rate": 7.3193815169663266e-06, "loss": 0.87084713, "memory(GiB)": 135.77, "step": 31380, "train_speed(iter/s)": 0.201911 }, { "acc": 0.7460907, "epoch": 0.7323524694148354, "grad_norm": 6.90625, "learning_rate": 7.317707787497892e-06, "loss": 0.90985165, "memory(GiB)": 135.77, "step": 31390, "train_speed(iter/s)": 0.201946 }, { "acc": 0.79898553, "epoch": 0.7325857769871243, "grad_norm": 6.5, "learning_rate": 7.316033727177345e-06, "loss": 0.72486515, "memory(GiB)": 135.77, "step": 31400, "train_speed(iter/s)": 0.20198 }, { "acc": 0.78949633, "epoch": 0.7328190845594132, "grad_norm": 4.9375, "learning_rate": 7.314359336243656e-06, "loss": 0.77411165, "memory(GiB)": 135.77, "step": 31410, "train_speed(iter/s)": 0.202011 }, { "acc": 0.77078295, "epoch": 0.7330523921317021, "grad_norm": 5.53125, "learning_rate": 7.312684614935846e-06, "loss": 0.80220585, "memory(GiB)": 135.77, "step": 31420, "train_speed(iter/s)": 0.20204 }, { "acc": 0.78742967, "epoch": 0.733285699703991, "grad_norm": 6.0625, "learning_rate": 7.311009563492977e-06, "loss": 0.77101741, "memory(GiB)": 135.77, "step": 31430, "train_speed(iter/s)": 0.202071 }, { "acc": 0.78280091, "epoch": 0.7335190072762799, "grad_norm": 5.125, "learning_rate": 7.309334182154164e-06, "loss": 0.78739457, "memory(GiB)": 135.77, "step": 31440, "train_speed(iter/s)": 0.202105 }, { "acc": 0.78376379, "epoch": 0.7337523148485688, "grad_norm": 8.9375, "learning_rate": 7.307658471158567e-06, "loss": 0.78450098, "memory(GiB)": 135.77, "step": 31450, "train_speed(iter/s)": 0.202138 }, { "acc": 0.79066029, "epoch": 0.7339856224208577, "grad_norm": 4.875, "learning_rate": 7.305982430745395e-06, "loss": 0.75622902, "memory(GiB)": 135.77, "step": 31460, "train_speed(iter/s)": 0.202173 }, { "acc": 0.769873, "epoch": 0.7342189299931466, "grad_norm": 3.984375, "learning_rate": 7.3043060611538995e-06, "loss": 0.82575455, "memory(GiB)": 135.77, "step": 31470, "train_speed(iter/s)": 0.202204 }, { "acc": 0.77303362, "epoch": 0.7344522375654354, "grad_norm": 6.5, "learning_rate": 7.302629362623384e-06, "loss": 0.81925964, "memory(GiB)": 135.77, "step": 31480, "train_speed(iter/s)": 0.202236 }, { "acc": 0.76517944, "epoch": 0.7346855451377243, "grad_norm": 5.125, "learning_rate": 7.3009523353931966e-06, "loss": 0.83144541, "memory(GiB)": 135.77, "step": 31490, "train_speed(iter/s)": 0.202271 }, { "acc": 0.78074522, "epoch": 0.7349188527100132, "grad_norm": 5.03125, "learning_rate": 7.299274979702732e-06, "loss": 0.79223261, "memory(GiB)": 135.77, "step": 31500, "train_speed(iter/s)": 0.202305 }, { "epoch": 0.7349188527100132, "eval_acc": 0.741985381714187, "eval_loss": 0.8141977787017822, "eval_runtime": 1269.7118, "eval_samples_per_second": 28.346, "eval_steps_per_second": 14.173, "step": 31500 }, { "acc": 0.78632488, "epoch": 0.7351521602823021, "grad_norm": 3.75, "learning_rate": 7.29759729579143e-06, "loss": 0.78005762, "memory(GiB)": 135.77, "step": 31510, "train_speed(iter/s)": 0.200679 }, { "acc": 0.77185574, "epoch": 0.735385467854591, "grad_norm": 7.78125, "learning_rate": 7.295919283898782e-06, "loss": 0.82155762, "memory(GiB)": 135.77, "step": 31520, "train_speed(iter/s)": 0.200713 }, { "acc": 0.77478409, "epoch": 0.7356187754268799, "grad_norm": 3.828125, "learning_rate": 7.294240944264323e-06, "loss": 0.80740013, "memory(GiB)": 135.77, "step": 31530, "train_speed(iter/s)": 0.200747 }, { "acc": 0.79292769, "epoch": 0.7358520829991688, "grad_norm": 4.78125, "learning_rate": 7.292562277127637e-06, "loss": 0.7431262, "memory(GiB)": 135.77, "step": 31540, "train_speed(iter/s)": 0.200781 }, { "acc": 0.79306731, "epoch": 0.7360853905714577, "grad_norm": 4.6875, "learning_rate": 7.290883282728352e-06, "loss": 0.74995842, "memory(GiB)": 135.77, "step": 31550, "train_speed(iter/s)": 0.200813 }, { "acc": 0.77480698, "epoch": 0.7363186981437466, "grad_norm": 5.6875, "learning_rate": 7.289203961306143e-06, "loss": 0.82969999, "memory(GiB)": 135.77, "step": 31560, "train_speed(iter/s)": 0.200845 }, { "acc": 0.76988225, "epoch": 0.7365520057160355, "grad_norm": 6.90625, "learning_rate": 7.287524313100735e-06, "loss": 0.82661495, "memory(GiB)": 135.77, "step": 31570, "train_speed(iter/s)": 0.20088 }, { "acc": 0.75901461, "epoch": 0.7367853132883244, "grad_norm": 5.15625, "learning_rate": 7.285844338351894e-06, "loss": 0.87335548, "memory(GiB)": 135.77, "step": 31580, "train_speed(iter/s)": 0.200914 }, { "acc": 0.77605467, "epoch": 0.7370186208606133, "grad_norm": 7.65625, "learning_rate": 7.284164037299438e-06, "loss": 0.80913677, "memory(GiB)": 135.77, "step": 31590, "train_speed(iter/s)": 0.200947 }, { "acc": 0.79126759, "epoch": 0.7372519284329022, "grad_norm": 7.125, "learning_rate": 7.28248341018323e-06, "loss": 0.72892237, "memory(GiB)": 135.77, "step": 31600, "train_speed(iter/s)": 0.20098 }, { "acc": 0.78675766, "epoch": 0.7374852360051911, "grad_norm": 5.375, "learning_rate": 7.280802457243178e-06, "loss": 0.76203346, "memory(GiB)": 135.77, "step": 31610, "train_speed(iter/s)": 0.201012 }, { "acc": 0.77029533, "epoch": 0.73771854357748, "grad_norm": 5.84375, "learning_rate": 7.2791211787192376e-06, "loss": 0.8017519, "memory(GiB)": 135.77, "step": 31620, "train_speed(iter/s)": 0.201045 }, { "acc": 0.76799793, "epoch": 0.7379518511497689, "grad_norm": 5.09375, "learning_rate": 7.27743957485141e-06, "loss": 0.83303394, "memory(GiB)": 135.77, "step": 31630, "train_speed(iter/s)": 0.20108 }, { "acc": 0.77150707, "epoch": 0.7381851587220578, "grad_norm": 4.5, "learning_rate": 7.2757576458797465e-06, "loss": 0.83672428, "memory(GiB)": 135.77, "step": 31640, "train_speed(iter/s)": 0.201114 }, { "acc": 0.77037902, "epoch": 0.7384184662943467, "grad_norm": 6.65625, "learning_rate": 7.27407539204434e-06, "loss": 0.83148661, "memory(GiB)": 135.77, "step": 31650, "train_speed(iter/s)": 0.201149 }, { "acc": 0.76977768, "epoch": 0.7386517738666356, "grad_norm": 6.0, "learning_rate": 7.272392813585332e-06, "loss": 0.81001329, "memory(GiB)": 135.77, "step": 31660, "train_speed(iter/s)": 0.201183 }, { "acc": 0.80247803, "epoch": 0.7388850814389245, "grad_norm": 3.875, "learning_rate": 7.270709910742908e-06, "loss": 0.70846043, "memory(GiB)": 135.77, "step": 31670, "train_speed(iter/s)": 0.201216 }, { "acc": 0.78375702, "epoch": 0.7391183890112133, "grad_norm": 4.9375, "learning_rate": 7.269026683757306e-06, "loss": 0.78587623, "memory(GiB)": 135.77, "step": 31680, "train_speed(iter/s)": 0.201247 }, { "acc": 0.76821456, "epoch": 0.7393516965835022, "grad_norm": 4.84375, "learning_rate": 7.267343132868803e-06, "loss": 0.82991257, "memory(GiB)": 135.77, "step": 31690, "train_speed(iter/s)": 0.201281 }, { "acc": 0.78773546, "epoch": 0.7395850041557911, "grad_norm": 8.5, "learning_rate": 7.265659258317725e-06, "loss": 0.77818766, "memory(GiB)": 135.77, "step": 31700, "train_speed(iter/s)": 0.201314 }, { "acc": 0.77224665, "epoch": 0.73981831172808, "grad_norm": 5.375, "learning_rate": 7.263975060344449e-06, "loss": 0.81635828, "memory(GiB)": 135.77, "step": 31710, "train_speed(iter/s)": 0.201345 }, { "acc": 0.79697518, "epoch": 0.7400516193003689, "grad_norm": 5.46875, "learning_rate": 7.26229053918939e-06, "loss": 0.74089785, "memory(GiB)": 135.77, "step": 31720, "train_speed(iter/s)": 0.201376 }, { "acc": 0.79544964, "epoch": 0.7402849268726578, "grad_norm": 4.78125, "learning_rate": 7.260605695093014e-06, "loss": 0.69942369, "memory(GiB)": 135.77, "step": 31730, "train_speed(iter/s)": 0.201411 }, { "acc": 0.77880449, "epoch": 0.7405182344449467, "grad_norm": 5.40625, "learning_rate": 7.25892052829583e-06, "loss": 0.79678488, "memory(GiB)": 135.77, "step": 31740, "train_speed(iter/s)": 0.201442 }, { "acc": 0.78319736, "epoch": 0.7407515420172356, "grad_norm": 5.3125, "learning_rate": 7.257235039038397e-06, "loss": 0.76695404, "memory(GiB)": 135.77, "step": 31750, "train_speed(iter/s)": 0.201476 }, { "acc": 0.78057632, "epoch": 0.7409848495895245, "grad_norm": 6.59375, "learning_rate": 7.25554922756132e-06, "loss": 0.77434168, "memory(GiB)": 135.77, "step": 31760, "train_speed(iter/s)": 0.20151 }, { "acc": 0.78480177, "epoch": 0.7412181571618134, "grad_norm": 4.59375, "learning_rate": 7.253863094105243e-06, "loss": 0.76221313, "memory(GiB)": 135.77, "step": 31770, "train_speed(iter/s)": 0.201541 }, { "acc": 0.78093719, "epoch": 0.7414514647341023, "grad_norm": 5.46875, "learning_rate": 7.252176638910867e-06, "loss": 0.79842443, "memory(GiB)": 135.77, "step": 31780, "train_speed(iter/s)": 0.201573 }, { "acc": 0.78112631, "epoch": 0.7416847723063912, "grad_norm": 7.46875, "learning_rate": 7.25048986221893e-06, "loss": 0.79473867, "memory(GiB)": 135.77, "step": 31790, "train_speed(iter/s)": 0.201606 }, { "acc": 0.76295033, "epoch": 0.7419180798786801, "grad_norm": 5.5625, "learning_rate": 7.248802764270217e-06, "loss": 0.86861839, "memory(GiB)": 135.77, "step": 31800, "train_speed(iter/s)": 0.201638 }, { "acc": 0.7788949, "epoch": 0.742151387450969, "grad_norm": 5.34375, "learning_rate": 7.247115345305564e-06, "loss": 0.79273958, "memory(GiB)": 135.77, "step": 31810, "train_speed(iter/s)": 0.201671 }, { "acc": 0.78000889, "epoch": 0.7423846950232579, "grad_norm": 6.15625, "learning_rate": 7.245427605565847e-06, "loss": 0.77720737, "memory(GiB)": 135.77, "step": 31820, "train_speed(iter/s)": 0.201705 }, { "acc": 0.77433429, "epoch": 0.7426180025955468, "grad_norm": 5.8125, "learning_rate": 7.243739545291994e-06, "loss": 0.83012581, "memory(GiB)": 135.77, "step": 31830, "train_speed(iter/s)": 0.201738 }, { "acc": 0.77186079, "epoch": 0.7428513101678357, "grad_norm": 7.53125, "learning_rate": 7.24205116472497e-06, "loss": 0.81709404, "memory(GiB)": 135.77, "step": 31840, "train_speed(iter/s)": 0.20177 }, { "acc": 0.77884254, "epoch": 0.7430846177401246, "grad_norm": 8.1875, "learning_rate": 7.240362464105795e-06, "loss": 0.79129939, "memory(GiB)": 135.77, "step": 31850, "train_speed(iter/s)": 0.201803 }, { "acc": 0.77921705, "epoch": 0.7433179253124135, "grad_norm": 4.96875, "learning_rate": 7.238673443675529e-06, "loss": 0.78770342, "memory(GiB)": 135.77, "step": 31860, "train_speed(iter/s)": 0.201836 }, { "acc": 0.79026484, "epoch": 0.7435512328847023, "grad_norm": 5.09375, "learning_rate": 7.236984103675278e-06, "loss": 0.73446693, "memory(GiB)": 135.77, "step": 31870, "train_speed(iter/s)": 0.20187 }, { "acc": 0.76663461, "epoch": 0.7437845404569912, "grad_norm": 4.71875, "learning_rate": 7.235294444346197e-06, "loss": 0.85680876, "memory(GiB)": 135.77, "step": 31880, "train_speed(iter/s)": 0.201904 }, { "acc": 0.76660204, "epoch": 0.7440178480292801, "grad_norm": 7.84375, "learning_rate": 7.233604465929485e-06, "loss": 0.83537083, "memory(GiB)": 135.77, "step": 31890, "train_speed(iter/s)": 0.201937 }, { "acc": 0.78674612, "epoch": 0.744251155601569, "grad_norm": 4.59375, "learning_rate": 7.231914168666382e-06, "loss": 0.73860254, "memory(GiB)": 135.77, "step": 31900, "train_speed(iter/s)": 0.201973 }, { "acc": 0.76229892, "epoch": 0.7444844631738579, "grad_norm": 7.0, "learning_rate": 7.23022355279818e-06, "loss": 0.86733685, "memory(GiB)": 135.77, "step": 31910, "train_speed(iter/s)": 0.202005 }, { "acc": 0.76368475, "epoch": 0.7447177707461468, "grad_norm": 5.0625, "learning_rate": 7.228532618566214e-06, "loss": 0.8780262, "memory(GiB)": 135.77, "step": 31920, "train_speed(iter/s)": 0.202034 }, { "acc": 0.78525524, "epoch": 0.7449510783184357, "grad_norm": 6.34375, "learning_rate": 7.226841366211865e-06, "loss": 0.77032347, "memory(GiB)": 135.77, "step": 31930, "train_speed(iter/s)": 0.202065 }, { "acc": 0.77381964, "epoch": 0.7451843858907246, "grad_norm": 5.3125, "learning_rate": 7.225149795976558e-06, "loss": 0.79736581, "memory(GiB)": 135.77, "step": 31940, "train_speed(iter/s)": 0.202097 }, { "acc": 0.78271856, "epoch": 0.7454176934630135, "grad_norm": 6.1875, "learning_rate": 7.223457908101763e-06, "loss": 0.769701, "memory(GiB)": 135.77, "step": 31950, "train_speed(iter/s)": 0.202131 }, { "acc": 0.77590184, "epoch": 0.7456510010353024, "grad_norm": 7.28125, "learning_rate": 7.2217657028289974e-06, "loss": 0.81713409, "memory(GiB)": 135.77, "step": 31960, "train_speed(iter/s)": 0.202164 }, { "acc": 0.79009981, "epoch": 0.7458843086075913, "grad_norm": 5.46875, "learning_rate": 7.220073180399824e-06, "loss": 0.76196728, "memory(GiB)": 135.77, "step": 31970, "train_speed(iter/s)": 0.202199 }, { "acc": 0.79176197, "epoch": 0.7461176161798801, "grad_norm": 5.0, "learning_rate": 7.218380341055848e-06, "loss": 0.74932814, "memory(GiB)": 135.77, "step": 31980, "train_speed(iter/s)": 0.202231 }, { "acc": 0.75970087, "epoch": 0.746350923752169, "grad_norm": 4.625, "learning_rate": 7.216687185038724e-06, "loss": 0.87884674, "memory(GiB)": 135.77, "step": 31990, "train_speed(iter/s)": 0.202261 }, { "acc": 0.77163577, "epoch": 0.746584231324458, "grad_norm": 6.3125, "learning_rate": 7.214993712590148e-06, "loss": 0.85210857, "memory(GiB)": 135.77, "step": 32000, "train_speed(iter/s)": 0.202296 }, { "epoch": 0.746584231324458, "eval_acc": 0.7420050794547601, "eval_loss": 0.8138281106948853, "eval_runtime": 1270.8326, "eval_samples_per_second": 28.321, "eval_steps_per_second": 14.161, "step": 32000 }, { "acc": 0.76468716, "epoch": 0.7468175388967468, "grad_norm": 100.5, "learning_rate": 7.213299923951863e-06, "loss": 0.88969822, "memory(GiB)": 135.77, "step": 32010, "train_speed(iter/s)": 0.200694 }, { "acc": 0.77527003, "epoch": 0.7470508464690357, "grad_norm": 5.59375, "learning_rate": 7.211605819365657e-06, "loss": 0.82085714, "memory(GiB)": 135.77, "step": 32020, "train_speed(iter/s)": 0.200726 }, { "acc": 0.77617159, "epoch": 0.7472841540413246, "grad_norm": 6.375, "learning_rate": 7.209911399073361e-06, "loss": 0.83005333, "memory(GiB)": 135.77, "step": 32030, "train_speed(iter/s)": 0.200758 }, { "acc": 0.77418909, "epoch": 0.7475174616136135, "grad_norm": 6.0, "learning_rate": 7.208216663316856e-06, "loss": 0.79054193, "memory(GiB)": 135.77, "step": 32040, "train_speed(iter/s)": 0.200791 }, { "acc": 0.78965578, "epoch": 0.7477507691859024, "grad_norm": 4.4375, "learning_rate": 7.206521612338064e-06, "loss": 0.74522982, "memory(GiB)": 135.77, "step": 32050, "train_speed(iter/s)": 0.200824 }, { "acc": 0.76647468, "epoch": 0.7479840767581912, "grad_norm": 6.28125, "learning_rate": 7.204826246378953e-06, "loss": 0.81988811, "memory(GiB)": 135.77, "step": 32060, "train_speed(iter/s)": 0.200858 }, { "acc": 0.77853847, "epoch": 0.7482173843304801, "grad_norm": 4.9375, "learning_rate": 7.203130565681537e-06, "loss": 0.79634762, "memory(GiB)": 135.77, "step": 32070, "train_speed(iter/s)": 0.200889 }, { "acc": 0.7814064, "epoch": 0.748450691902769, "grad_norm": 7.0, "learning_rate": 7.201434570487871e-06, "loss": 0.78645287, "memory(GiB)": 135.77, "step": 32080, "train_speed(iter/s)": 0.200919 }, { "acc": 0.77312088, "epoch": 0.7486839994750579, "grad_norm": 4.78125, "learning_rate": 7.199738261040059e-06, "loss": 0.83004112, "memory(GiB)": 135.77, "step": 32090, "train_speed(iter/s)": 0.200951 }, { "acc": 0.79272485, "epoch": 0.7489173070473468, "grad_norm": 5.9375, "learning_rate": 7.1980416375802494e-06, "loss": 0.7410665, "memory(GiB)": 135.77, "step": 32100, "train_speed(iter/s)": 0.200982 }, { "acc": 0.76613655, "epoch": 0.7491506146196357, "grad_norm": 5.46875, "learning_rate": 7.196344700350635e-06, "loss": 0.86439323, "memory(GiB)": 135.77, "step": 32110, "train_speed(iter/s)": 0.201014 }, { "acc": 0.78310966, "epoch": 0.7493839221919246, "grad_norm": 5.375, "learning_rate": 7.1946474495934535e-06, "loss": 0.765874, "memory(GiB)": 135.77, "step": 32120, "train_speed(iter/s)": 0.201043 }, { "acc": 0.79054403, "epoch": 0.7496172297642135, "grad_norm": 4.5, "learning_rate": 7.192949885550986e-06, "loss": 0.73549843, "memory(GiB)": 135.77, "step": 32130, "train_speed(iter/s)": 0.201075 }, { "acc": 0.77819924, "epoch": 0.7498505373365024, "grad_norm": 4.625, "learning_rate": 7.1912520084655594e-06, "loss": 0.79788404, "memory(GiB)": 135.77, "step": 32140, "train_speed(iter/s)": 0.201108 }, { "acc": 0.79777637, "epoch": 0.7500838449087913, "grad_norm": 6.8125, "learning_rate": 7.189553818579545e-06, "loss": 0.70898933, "memory(GiB)": 135.77, "step": 32150, "train_speed(iter/s)": 0.201138 }, { "acc": 0.77831688, "epoch": 0.7503171524810802, "grad_norm": 4.9375, "learning_rate": 7.187855316135358e-06, "loss": 0.79211369, "memory(GiB)": 135.77, "step": 32160, "train_speed(iter/s)": 0.201171 }, { "acc": 0.77343702, "epoch": 0.7505504600533691, "grad_norm": 5.1875, "learning_rate": 7.1861565013754605e-06, "loss": 0.81101351, "memory(GiB)": 135.77, "step": 32170, "train_speed(iter/s)": 0.201199 }, { "acc": 0.77341728, "epoch": 0.750783767625658, "grad_norm": 6.625, "learning_rate": 7.18445737454236e-06, "loss": 0.81564074, "memory(GiB)": 135.77, "step": 32180, "train_speed(iter/s)": 0.201231 }, { "acc": 0.78846521, "epoch": 0.7510170751979469, "grad_norm": 4.5625, "learning_rate": 7.182757935878601e-06, "loss": 0.75658441, "memory(GiB)": 135.77, "step": 32190, "train_speed(iter/s)": 0.201264 }, { "acc": 0.77531843, "epoch": 0.7512503827702358, "grad_norm": 4.90625, "learning_rate": 7.1810581856267815e-06, "loss": 0.82540045, "memory(GiB)": 135.77, "step": 32200, "train_speed(iter/s)": 0.201295 }, { "acc": 0.78034782, "epoch": 0.7514836903425247, "grad_norm": 14.75, "learning_rate": 7.17935812402954e-06, "loss": 0.78024654, "memory(GiB)": 135.77, "step": 32210, "train_speed(iter/s)": 0.201326 }, { "acc": 0.76428576, "epoch": 0.7517169979148136, "grad_norm": 5.0, "learning_rate": 7.177657751329559e-06, "loss": 0.84508133, "memory(GiB)": 135.77, "step": 32220, "train_speed(iter/s)": 0.201356 }, { "acc": 0.77687807, "epoch": 0.7519503054871025, "grad_norm": 4.6875, "learning_rate": 7.1759570677695665e-06, "loss": 0.80794582, "memory(GiB)": 135.77, "step": 32230, "train_speed(iter/s)": 0.201387 }, { "acc": 0.75994444, "epoch": 0.7521836130593914, "grad_norm": 6.78125, "learning_rate": 7.174256073592335e-06, "loss": 0.86417484, "memory(GiB)": 135.77, "step": 32240, "train_speed(iter/s)": 0.201419 }, { "acc": 0.79446583, "epoch": 0.7524169206316802, "grad_norm": 6.5625, "learning_rate": 7.172554769040681e-06, "loss": 0.72564263, "memory(GiB)": 135.77, "step": 32250, "train_speed(iter/s)": 0.20145 }, { "acc": 0.77692814, "epoch": 0.7526502282039691, "grad_norm": 5.1875, "learning_rate": 7.1708531543574635e-06, "loss": 0.79970336, "memory(GiB)": 135.77, "step": 32260, "train_speed(iter/s)": 0.201483 }, { "acc": 0.79846258, "epoch": 0.752883535776258, "grad_norm": 4.75, "learning_rate": 7.169151229785589e-06, "loss": 0.72652612, "memory(GiB)": 135.77, "step": 32270, "train_speed(iter/s)": 0.201517 }, { "acc": 0.79553976, "epoch": 0.7531168433485469, "grad_norm": 5.90625, "learning_rate": 7.167448995568009e-06, "loss": 0.73974028, "memory(GiB)": 135.77, "step": 32280, "train_speed(iter/s)": 0.201549 }, { "acc": 0.77979794, "epoch": 0.7533501509208358, "grad_norm": 9.8125, "learning_rate": 7.165746451947713e-06, "loss": 0.79384131, "memory(GiB)": 135.77, "step": 32290, "train_speed(iter/s)": 0.201579 }, { "acc": 0.76877642, "epoch": 0.7535834584931247, "grad_norm": 5.5625, "learning_rate": 7.16404359916774e-06, "loss": 0.86544285, "memory(GiB)": 135.77, "step": 32300, "train_speed(iter/s)": 0.20161 }, { "acc": 0.76505909, "epoch": 0.7538167660654136, "grad_norm": 10.9375, "learning_rate": 7.1623404374711715e-06, "loss": 0.85449657, "memory(GiB)": 135.77, "step": 32310, "train_speed(iter/s)": 0.201643 }, { "acc": 0.78930197, "epoch": 0.7540500736377025, "grad_norm": 4.125, "learning_rate": 7.160636967101134e-06, "loss": 0.75111027, "memory(GiB)": 135.77, "step": 32320, "train_speed(iter/s)": 0.201676 }, { "acc": 0.79103222, "epoch": 0.7542833812099914, "grad_norm": 5.9375, "learning_rate": 7.1589331883007965e-06, "loss": 0.75493755, "memory(GiB)": 135.77, "step": 32330, "train_speed(iter/s)": 0.201708 }, { "acc": 0.76673937, "epoch": 0.7545166887822803, "grad_norm": 5.3125, "learning_rate": 7.1572291013133745e-06, "loss": 0.858673, "memory(GiB)": 135.77, "step": 32340, "train_speed(iter/s)": 0.201739 }, { "acc": 0.77912583, "epoch": 0.7547499963545692, "grad_norm": 5.5625, "learning_rate": 7.155524706382125e-06, "loss": 0.79643412, "memory(GiB)": 135.77, "step": 32350, "train_speed(iter/s)": 0.201771 }, { "acc": 0.77822075, "epoch": 0.7549833039268581, "grad_norm": 7.15625, "learning_rate": 7.15382000375035e-06, "loss": 0.79455347, "memory(GiB)": 135.77, "step": 32360, "train_speed(iter/s)": 0.201803 }, { "acc": 0.79522676, "epoch": 0.755216611499147, "grad_norm": 4.0625, "learning_rate": 7.152114993661394e-06, "loss": 0.71623573, "memory(GiB)": 135.77, "step": 32370, "train_speed(iter/s)": 0.201835 }, { "acc": 0.77095437, "epoch": 0.7554499190714359, "grad_norm": 4.03125, "learning_rate": 7.150409676358649e-06, "loss": 0.83390207, "memory(GiB)": 135.77, "step": 32380, "train_speed(iter/s)": 0.201866 }, { "acc": 0.76215563, "epoch": 0.7556832266437248, "grad_norm": 5.28125, "learning_rate": 7.148704052085547e-06, "loss": 0.87032471, "memory(GiB)": 135.77, "step": 32390, "train_speed(iter/s)": 0.201899 }, { "acc": 0.77448578, "epoch": 0.7559165342160137, "grad_norm": 15.5, "learning_rate": 7.146998121085566e-06, "loss": 0.80818844, "memory(GiB)": 135.77, "step": 32400, "train_speed(iter/s)": 0.20193 }, { "acc": 0.78134699, "epoch": 0.7561498417883026, "grad_norm": 4.5, "learning_rate": 7.145291883602226e-06, "loss": 0.76230822, "memory(GiB)": 135.77, "step": 32410, "train_speed(iter/s)": 0.201961 }, { "acc": 0.7758811, "epoch": 0.7563831493605915, "grad_norm": 5.65625, "learning_rate": 7.143585339879093e-06, "loss": 0.81609783, "memory(GiB)": 135.77, "step": 32420, "train_speed(iter/s)": 0.201993 }, { "acc": 0.76103873, "epoch": 0.7566164569328804, "grad_norm": 7.3125, "learning_rate": 7.141878490159777e-06, "loss": 0.88947668, "memory(GiB)": 135.77, "step": 32430, "train_speed(iter/s)": 0.202027 }, { "acc": 0.79736533, "epoch": 0.7568497645051693, "grad_norm": 6.03125, "learning_rate": 7.140171334687927e-06, "loss": 0.73429241, "memory(GiB)": 135.77, "step": 32440, "train_speed(iter/s)": 0.20206 }, { "acc": 0.76088171, "epoch": 0.7570830720774581, "grad_norm": 7.0, "learning_rate": 7.138463873707242e-06, "loss": 0.87091007, "memory(GiB)": 135.77, "step": 32450, "train_speed(iter/s)": 0.202093 }, { "acc": 0.77715158, "epoch": 0.757316379649747, "grad_norm": 4.21875, "learning_rate": 7.13675610746146e-06, "loss": 0.80676775, "memory(GiB)": 135.77, "step": 32460, "train_speed(iter/s)": 0.202125 }, { "acc": 0.77457027, "epoch": 0.7575496872220359, "grad_norm": 4.65625, "learning_rate": 7.135048036194364e-06, "loss": 0.81614847, "memory(GiB)": 135.77, "step": 32470, "train_speed(iter/s)": 0.202157 }, { "acc": 0.76460385, "epoch": 0.7577829947943248, "grad_norm": 6.46875, "learning_rate": 7.13333966014978e-06, "loss": 0.84198284, "memory(GiB)": 135.77, "step": 32480, "train_speed(iter/s)": 0.20219 }, { "acc": 0.77977839, "epoch": 0.7580163023666137, "grad_norm": 6.625, "learning_rate": 7.131630979571581e-06, "loss": 0.79072237, "memory(GiB)": 135.77, "step": 32490, "train_speed(iter/s)": 0.20222 }, { "acc": 0.77567453, "epoch": 0.7582496099389026, "grad_norm": 4.96875, "learning_rate": 7.1299219947036795e-06, "loss": 0.81002426, "memory(GiB)": 135.77, "step": 32500, "train_speed(iter/s)": 0.202251 }, { "epoch": 0.7582496099389026, "eval_acc": 0.742288214457469, "eval_loss": 0.8132081031799316, "eval_runtime": 1270.5334, "eval_samples_per_second": 28.327, "eval_steps_per_second": 14.164, "step": 32500 }, { "acc": 0.77000389, "epoch": 0.7584829175111915, "grad_norm": 5.34375, "learning_rate": 7.12821270579003e-06, "loss": 0.80007992, "memory(GiB)": 135.77, "step": 32510, "train_speed(iter/s)": 0.200677 }, { "acc": 0.78935204, "epoch": 0.7587162250834804, "grad_norm": 5.75, "learning_rate": 7.126503113074636e-06, "loss": 0.74440832, "memory(GiB)": 135.77, "step": 32520, "train_speed(iter/s)": 0.200709 }, { "acc": 0.77540474, "epoch": 0.7589495326557693, "grad_norm": 5.15625, "learning_rate": 7.1247932168015396e-06, "loss": 0.81684599, "memory(GiB)": 135.77, "step": 32530, "train_speed(iter/s)": 0.20074 }, { "acc": 0.77324686, "epoch": 0.7591828402280582, "grad_norm": 5.8125, "learning_rate": 7.123083017214829e-06, "loss": 0.79553862, "memory(GiB)": 135.77, "step": 32540, "train_speed(iter/s)": 0.200771 }, { "acc": 0.77071304, "epoch": 0.759416147800347, "grad_norm": 8.5, "learning_rate": 7.121372514558635e-06, "loss": 0.83940582, "memory(GiB)": 135.77, "step": 32550, "train_speed(iter/s)": 0.200803 }, { "acc": 0.76741009, "epoch": 0.759649455372636, "grad_norm": 5.28125, "learning_rate": 7.1196617090771305e-06, "loss": 0.83808603, "memory(GiB)": 135.77, "step": 32560, "train_speed(iter/s)": 0.200835 }, { "acc": 0.79151773, "epoch": 0.7598827629449248, "grad_norm": 4.40625, "learning_rate": 7.1179506010145335e-06, "loss": 0.75109611, "memory(GiB)": 135.77, "step": 32570, "train_speed(iter/s)": 0.200869 }, { "acc": 0.76236095, "epoch": 0.7601160705172137, "grad_norm": 3.59375, "learning_rate": 7.116239190615104e-06, "loss": 0.87826719, "memory(GiB)": 135.77, "step": 32580, "train_speed(iter/s)": 0.200901 }, { "acc": 0.76941271, "epoch": 0.7603493780895026, "grad_norm": 5.15625, "learning_rate": 7.1145274781231435e-06, "loss": 0.84898701, "memory(GiB)": 135.77, "step": 32590, "train_speed(iter/s)": 0.200932 }, { "acc": 0.7833683, "epoch": 0.7605826856617915, "grad_norm": 7.25, "learning_rate": 7.112815463782998e-06, "loss": 0.79329114, "memory(GiB)": 135.77, "step": 32600, "train_speed(iter/s)": 0.200964 }, { "acc": 0.77949257, "epoch": 0.7608159932340804, "grad_norm": 5.6875, "learning_rate": 7.111103147839062e-06, "loss": 0.78569312, "memory(GiB)": 135.77, "step": 32610, "train_speed(iter/s)": 0.200995 }, { "acc": 0.79783745, "epoch": 0.7610493008063693, "grad_norm": 5.46875, "learning_rate": 7.109390530535762e-06, "loss": 0.698351, "memory(GiB)": 135.77, "step": 32620, "train_speed(iter/s)": 0.201026 }, { "acc": 0.77755065, "epoch": 0.7612826083786582, "grad_norm": 6.71875, "learning_rate": 7.1076776121175794e-06, "loss": 0.82539043, "memory(GiB)": 135.77, "step": 32630, "train_speed(iter/s)": 0.201059 }, { "acc": 0.78371029, "epoch": 0.761515915950947, "grad_norm": 5.25, "learning_rate": 7.105964392829029e-06, "loss": 0.77058702, "memory(GiB)": 135.77, "step": 32640, "train_speed(iter/s)": 0.201091 }, { "acc": 0.76712589, "epoch": 0.7617492235232359, "grad_norm": 5.125, "learning_rate": 7.104250872914673e-06, "loss": 0.83793221, "memory(GiB)": 135.77, "step": 32650, "train_speed(iter/s)": 0.201124 }, { "acc": 0.78511324, "epoch": 0.7619825310955248, "grad_norm": 17.625, "learning_rate": 7.102537052619116e-06, "loss": 0.7639678, "memory(GiB)": 135.77, "step": 32660, "train_speed(iter/s)": 0.201156 }, { "acc": 0.78138227, "epoch": 0.7622158386678137, "grad_norm": 5.1875, "learning_rate": 7.100822932187006e-06, "loss": 0.78408322, "memory(GiB)": 135.77, "step": 32670, "train_speed(iter/s)": 0.201185 }, { "acc": 0.78767548, "epoch": 0.7624491462401026, "grad_norm": 5.5, "learning_rate": 7.099108511863032e-06, "loss": 0.75953522, "memory(GiB)": 135.77, "step": 32680, "train_speed(iter/s)": 0.201217 }, { "acc": 0.77689095, "epoch": 0.7626824538123915, "grad_norm": 4.9375, "learning_rate": 7.097393791891929e-06, "loss": 0.8115571, "memory(GiB)": 135.77, "step": 32690, "train_speed(iter/s)": 0.201248 }, { "acc": 0.79620209, "epoch": 0.7629157613846804, "grad_norm": 6.0, "learning_rate": 7.095678772518471e-06, "loss": 0.70747728, "memory(GiB)": 135.77, "step": 32700, "train_speed(iter/s)": 0.201281 }, { "acc": 0.77352734, "epoch": 0.7631490689569693, "grad_norm": 10.5625, "learning_rate": 7.093963453987476e-06, "loss": 0.81851959, "memory(GiB)": 135.77, "step": 32710, "train_speed(iter/s)": 0.201315 }, { "acc": 0.7703311, "epoch": 0.7633823765292582, "grad_norm": 6.03125, "learning_rate": 7.092247836543808e-06, "loss": 0.84133596, "memory(GiB)": 135.77, "step": 32720, "train_speed(iter/s)": 0.201343 }, { "acc": 0.79012804, "epoch": 0.7636156841015471, "grad_norm": 5.46875, "learning_rate": 7.090531920432368e-06, "loss": 0.7244441, "memory(GiB)": 135.77, "step": 32730, "train_speed(iter/s)": 0.201373 }, { "acc": 0.78044481, "epoch": 0.763848991673836, "grad_norm": 7.59375, "learning_rate": 7.088815705898103e-06, "loss": 0.80008049, "memory(GiB)": 135.77, "step": 32740, "train_speed(iter/s)": 0.201404 }, { "acc": 0.75984888, "epoch": 0.7640822992461249, "grad_norm": 4.46875, "learning_rate": 7.0870991931860044e-06, "loss": 0.85378962, "memory(GiB)": 135.77, "step": 32750, "train_speed(iter/s)": 0.201435 }, { "acc": 0.80223875, "epoch": 0.7643156068184138, "grad_norm": 5.40625, "learning_rate": 7.0853823825411005e-06, "loss": 0.69828153, "memory(GiB)": 135.77, "step": 32760, "train_speed(iter/s)": 0.201466 }, { "acc": 0.78329859, "epoch": 0.7645489143907027, "grad_norm": 8.0, "learning_rate": 7.083665274208469e-06, "loss": 0.75291567, "memory(GiB)": 135.77, "step": 32770, "train_speed(iter/s)": 0.201495 }, { "acc": 0.79095221, "epoch": 0.7647822219629916, "grad_norm": 5.1875, "learning_rate": 7.081947868433223e-06, "loss": 0.75146961, "memory(GiB)": 135.77, "step": 32780, "train_speed(iter/s)": 0.201528 }, { "acc": 0.75557308, "epoch": 0.7650155295352805, "grad_norm": 6.875, "learning_rate": 7.0802301654605255e-06, "loss": 0.88731985, "memory(GiB)": 135.77, "step": 32790, "train_speed(iter/s)": 0.201561 }, { "acc": 0.7902915, "epoch": 0.7652488371075694, "grad_norm": 5.375, "learning_rate": 7.078512165535576e-06, "loss": 0.74065533, "memory(GiB)": 135.77, "step": 32800, "train_speed(iter/s)": 0.201593 }, { "acc": 0.78259544, "epoch": 0.7654821446798583, "grad_norm": 5.75, "learning_rate": 7.076793868903617e-06, "loss": 0.77633686, "memory(GiB)": 135.77, "step": 32810, "train_speed(iter/s)": 0.201624 }, { "acc": 0.77786484, "epoch": 0.7657154522521472, "grad_norm": 3.9375, "learning_rate": 7.0750752758099384e-06, "loss": 0.81483288, "memory(GiB)": 135.77, "step": 32820, "train_speed(iter/s)": 0.201655 }, { "acc": 0.78939443, "epoch": 0.765948759824436, "grad_norm": 3.96875, "learning_rate": 7.073356386499865e-06, "loss": 0.76333141, "memory(GiB)": 135.77, "step": 32830, "train_speed(iter/s)": 0.201684 }, { "acc": 0.76923409, "epoch": 0.7661820673967249, "grad_norm": 5.21875, "learning_rate": 7.071637201218772e-06, "loss": 0.83631258, "memory(GiB)": 135.77, "step": 32840, "train_speed(iter/s)": 0.201716 }, { "acc": 0.78508043, "epoch": 0.7664153749690138, "grad_norm": 5.5, "learning_rate": 7.06991772021207e-06, "loss": 0.77355008, "memory(GiB)": 135.77, "step": 32850, "train_speed(iter/s)": 0.201748 }, { "acc": 0.78850126, "epoch": 0.7666486825413027, "grad_norm": 4.875, "learning_rate": 7.068197943725214e-06, "loss": 0.77419615, "memory(GiB)": 135.77, "step": 32860, "train_speed(iter/s)": 0.201779 }, { "acc": 0.77871504, "epoch": 0.7668819901135916, "grad_norm": 11.375, "learning_rate": 7.0664778720037034e-06, "loss": 0.78229799, "memory(GiB)": 135.77, "step": 32870, "train_speed(iter/s)": 0.201812 }, { "acc": 0.78440037, "epoch": 0.7671152976858805, "grad_norm": 7.875, "learning_rate": 7.064757505293075e-06, "loss": 0.77941685, "memory(GiB)": 135.77, "step": 32880, "train_speed(iter/s)": 0.201844 }, { "acc": 0.75921545, "epoch": 0.7673486052581694, "grad_norm": 6.0625, "learning_rate": 7.063036843838913e-06, "loss": 0.85018616, "memory(GiB)": 135.77, "step": 32890, "train_speed(iter/s)": 0.201875 }, { "acc": 0.77700405, "epoch": 0.7675819128304583, "grad_norm": 5.21875, "learning_rate": 7.061315887886841e-06, "loss": 0.80813074, "memory(GiB)": 135.77, "step": 32900, "train_speed(iter/s)": 0.201908 }, { "acc": 0.78043146, "epoch": 0.7678152204027472, "grad_norm": 6.46875, "learning_rate": 7.059594637682526e-06, "loss": 0.7803668, "memory(GiB)": 135.77, "step": 32910, "train_speed(iter/s)": 0.201936 }, { "acc": 0.76210136, "epoch": 0.7680485279750361, "grad_norm": 6.25, "learning_rate": 7.057873093471673e-06, "loss": 0.85728846, "memory(GiB)": 135.77, "step": 32920, "train_speed(iter/s)": 0.201968 }, { "acc": 0.78007121, "epoch": 0.768281835547325, "grad_norm": 4.8125, "learning_rate": 7.056151255500036e-06, "loss": 0.77832518, "memory(GiB)": 135.77, "step": 32930, "train_speed(iter/s)": 0.202 }, { "acc": 0.75704546, "epoch": 0.7685151431196139, "grad_norm": 5.59375, "learning_rate": 7.0544291240134025e-06, "loss": 0.88629284, "memory(GiB)": 135.77, "step": 32940, "train_speed(iter/s)": 0.202032 }, { "acc": 0.7763031, "epoch": 0.7687484506919028, "grad_norm": 16.625, "learning_rate": 7.052706699257609e-06, "loss": 0.79555397, "memory(GiB)": 135.77, "step": 32950, "train_speed(iter/s)": 0.202064 }, { "acc": 0.77789965, "epoch": 0.7689817582641917, "grad_norm": 7.25, "learning_rate": 7.05098398147853e-06, "loss": 0.80233002, "memory(GiB)": 135.77, "step": 32960, "train_speed(iter/s)": 0.202096 }, { "acc": 0.77640162, "epoch": 0.7692150658364806, "grad_norm": 6.3125, "learning_rate": 7.0492609709220835e-06, "loss": 0.81024342, "memory(GiB)": 135.77, "step": 32970, "train_speed(iter/s)": 0.202129 }, { "acc": 0.79174838, "epoch": 0.7694483734087695, "grad_norm": 4.5625, "learning_rate": 7.04753766783423e-06, "loss": 0.75231781, "memory(GiB)": 135.77, "step": 32980, "train_speed(iter/s)": 0.20216 }, { "acc": 0.77766156, "epoch": 0.7696816809810584, "grad_norm": 6.46875, "learning_rate": 7.045814072460968e-06, "loss": 0.78604116, "memory(GiB)": 135.77, "step": 32990, "train_speed(iter/s)": 0.202192 }, { "acc": 0.76858015, "epoch": 0.7699149885533473, "grad_norm": 4.96875, "learning_rate": 7.044090185048343e-06, "loss": 0.86624451, "memory(GiB)": 135.77, "step": 33000, "train_speed(iter/s)": 0.202225 }, { "epoch": 0.7699149885533473, "eval_acc": 0.742469978161131, "eval_loss": 0.8125158548355103, "eval_runtime": 1269.1292, "eval_samples_per_second": 28.359, "eval_steps_per_second": 14.18, "step": 33000 }, { "acc": 0.76679192, "epoch": 0.7701482961256362, "grad_norm": 4.75, "learning_rate": 7.042366005842437e-06, "loss": 0.83562012, "memory(GiB)": 135.77, "step": 33010, "train_speed(iter/s)": 0.200677 }, { "acc": 0.78599691, "epoch": 0.770381603697925, "grad_norm": 6.6875, "learning_rate": 7.040641535089377e-06, "loss": 0.82067356, "memory(GiB)": 135.77, "step": 33020, "train_speed(iter/s)": 0.200708 }, { "acc": 0.77557378, "epoch": 0.7706149112702139, "grad_norm": 6.3125, "learning_rate": 7.038916773035332e-06, "loss": 0.80831299, "memory(GiB)": 135.77, "step": 33030, "train_speed(iter/s)": 0.200739 }, { "acc": 0.78095722, "epoch": 0.7708482188425028, "grad_norm": 6.09375, "learning_rate": 7.037191719926507e-06, "loss": 0.77209196, "memory(GiB)": 135.77, "step": 33040, "train_speed(iter/s)": 0.200771 }, { "acc": 0.78821402, "epoch": 0.7710815264147917, "grad_norm": 8.0, "learning_rate": 7.035466376009157e-06, "loss": 0.75552502, "memory(GiB)": 135.77, "step": 33050, "train_speed(iter/s)": 0.200803 }, { "acc": 0.7637373, "epoch": 0.7713148339870806, "grad_norm": 4.25, "learning_rate": 7.033740741529573e-06, "loss": 0.84696598, "memory(GiB)": 135.77, "step": 33060, "train_speed(iter/s)": 0.200834 }, { "acc": 0.79854646, "epoch": 0.7715481415593695, "grad_norm": 4.65625, "learning_rate": 7.03201481673409e-06, "loss": 0.69255486, "memory(GiB)": 135.77, "step": 33070, "train_speed(iter/s)": 0.200865 }, { "acc": 0.79627194, "epoch": 0.7717814491316584, "grad_norm": 5.09375, "learning_rate": 7.030288601869082e-06, "loss": 0.74422112, "memory(GiB)": 135.77, "step": 33080, "train_speed(iter/s)": 0.200896 }, { "acc": 0.75897083, "epoch": 0.7720147567039473, "grad_norm": 5.0625, "learning_rate": 7.028562097180965e-06, "loss": 0.87837725, "memory(GiB)": 135.77, "step": 33090, "train_speed(iter/s)": 0.200927 }, { "acc": 0.7633698, "epoch": 0.7722480642762362, "grad_norm": 6.46875, "learning_rate": 7.026835302916198e-06, "loss": 0.87001724, "memory(GiB)": 135.77, "step": 33100, "train_speed(iter/s)": 0.200958 }, { "acc": 0.77343187, "epoch": 0.7724813718485251, "grad_norm": 5.8125, "learning_rate": 7.025108219321281e-06, "loss": 0.82100897, "memory(GiB)": 135.77, "step": 33110, "train_speed(iter/s)": 0.200991 }, { "acc": 0.78877459, "epoch": 0.772714679420814, "grad_norm": 4.28125, "learning_rate": 7.023380846642754e-06, "loss": 0.76161418, "memory(GiB)": 135.77, "step": 33120, "train_speed(iter/s)": 0.201023 }, { "acc": 0.77717133, "epoch": 0.7729479869931029, "grad_norm": 5.15625, "learning_rate": 7.021653185127197e-06, "loss": 0.79486446, "memory(GiB)": 135.77, "step": 33130, "train_speed(iter/s)": 0.201053 }, { "acc": 0.77787819, "epoch": 0.7731812945653918, "grad_norm": 7.53125, "learning_rate": 7.019925235021237e-06, "loss": 0.8035284, "memory(GiB)": 135.77, "step": 33140, "train_speed(iter/s)": 0.201083 }, { "acc": 0.77615976, "epoch": 0.7734146021376807, "grad_norm": 5.125, "learning_rate": 7.018196996571538e-06, "loss": 0.81237202, "memory(GiB)": 135.77, "step": 33150, "train_speed(iter/s)": 0.201114 }, { "acc": 0.78682756, "epoch": 0.7736479097099696, "grad_norm": 5.4375, "learning_rate": 7.016468470024802e-06, "loss": 0.74849882, "memory(GiB)": 135.77, "step": 33160, "train_speed(iter/s)": 0.201146 }, { "acc": 0.77469473, "epoch": 0.7738812172822584, "grad_norm": 5.28125, "learning_rate": 7.014739655627778e-06, "loss": 0.79529195, "memory(GiB)": 135.77, "step": 33170, "train_speed(iter/s)": 0.201177 }, { "acc": 0.7825881, "epoch": 0.7741145248545473, "grad_norm": 4.5625, "learning_rate": 7.013010553627253e-06, "loss": 0.76973028, "memory(GiB)": 135.77, "step": 33180, "train_speed(iter/s)": 0.201207 }, { "acc": 0.79780211, "epoch": 0.7743478324268362, "grad_norm": 5.59375, "learning_rate": 7.011281164270056e-06, "loss": 0.72503815, "memory(GiB)": 135.77, "step": 33190, "train_speed(iter/s)": 0.201239 }, { "acc": 0.78128815, "epoch": 0.7745811399991251, "grad_norm": 5.71875, "learning_rate": 7.009551487803058e-06, "loss": 0.7804842, "memory(GiB)": 135.77, "step": 33200, "train_speed(iter/s)": 0.201271 }, { "acc": 0.78946109, "epoch": 0.774814447571414, "grad_norm": 6.375, "learning_rate": 7.0078215244731685e-06, "loss": 0.74558954, "memory(GiB)": 135.77, "step": 33210, "train_speed(iter/s)": 0.201302 }, { "acc": 0.77743354, "epoch": 0.7750477551437028, "grad_norm": 5.875, "learning_rate": 7.00609127452734e-06, "loss": 0.80112734, "memory(GiB)": 135.77, "step": 33220, "train_speed(iter/s)": 0.201333 }, { "acc": 0.78358135, "epoch": 0.7752810627159917, "grad_norm": 5.625, "learning_rate": 7.0043607382125645e-06, "loss": 0.77119946, "memory(GiB)": 135.77, "step": 33230, "train_speed(iter/s)": 0.201365 }, { "acc": 0.78574867, "epoch": 0.7755143702882806, "grad_norm": 4.78125, "learning_rate": 7.002629915775876e-06, "loss": 0.77710333, "memory(GiB)": 135.77, "step": 33240, "train_speed(iter/s)": 0.201395 }, { "acc": 0.77628012, "epoch": 0.7757476778605695, "grad_norm": 6.6875, "learning_rate": 7.000898807464349e-06, "loss": 0.84035797, "memory(GiB)": 135.77, "step": 33250, "train_speed(iter/s)": 0.201424 }, { "acc": 0.78520012, "epoch": 0.7759809854328584, "grad_norm": 4.40625, "learning_rate": 6.999167413525099e-06, "loss": 0.76161404, "memory(GiB)": 135.77, "step": 33260, "train_speed(iter/s)": 0.201455 }, { "acc": 0.78030691, "epoch": 0.7762142930051473, "grad_norm": 6.15625, "learning_rate": 6.9974357342052805e-06, "loss": 0.79453855, "memory(GiB)": 135.77, "step": 33270, "train_speed(iter/s)": 0.201487 }, { "acc": 0.79090891, "epoch": 0.7764476005774362, "grad_norm": 4.65625, "learning_rate": 6.995703769752091e-06, "loss": 0.73884726, "memory(GiB)": 135.77, "step": 33280, "train_speed(iter/s)": 0.201521 }, { "acc": 0.804249, "epoch": 0.7766809081497251, "grad_norm": 4.875, "learning_rate": 6.993971520412769e-06, "loss": 0.70278206, "memory(GiB)": 135.77, "step": 33290, "train_speed(iter/s)": 0.201554 }, { "acc": 0.78132133, "epoch": 0.776914215722014, "grad_norm": 6.875, "learning_rate": 6.992238986434591e-06, "loss": 0.79448328, "memory(GiB)": 135.77, "step": 33300, "train_speed(iter/s)": 0.201587 }, { "acc": 0.77671242, "epoch": 0.7771475232943029, "grad_norm": 4.75, "learning_rate": 6.9905061680648765e-06, "loss": 0.80529804, "memory(GiB)": 135.77, "step": 33310, "train_speed(iter/s)": 0.201617 }, { "acc": 0.78557901, "epoch": 0.7773808308665918, "grad_norm": 5.03125, "learning_rate": 6.9887730655509855e-06, "loss": 0.76596785, "memory(GiB)": 135.77, "step": 33320, "train_speed(iter/s)": 0.201648 }, { "acc": 0.7802887, "epoch": 0.7776141384388807, "grad_norm": 8.375, "learning_rate": 6.987039679140316e-06, "loss": 0.79592075, "memory(GiB)": 135.77, "step": 33330, "train_speed(iter/s)": 0.20168 }, { "acc": 0.77079372, "epoch": 0.7778474460111696, "grad_norm": 6.1875, "learning_rate": 6.9853060090803105e-06, "loss": 0.8273881, "memory(GiB)": 135.77, "step": 33340, "train_speed(iter/s)": 0.201712 }, { "acc": 0.76755457, "epoch": 0.7780807535834585, "grad_norm": 6.09375, "learning_rate": 6.983572055618449e-06, "loss": 0.84588947, "memory(GiB)": 135.77, "step": 33350, "train_speed(iter/s)": 0.201741 }, { "acc": 0.7820858, "epoch": 0.7783140611557474, "grad_norm": 3.640625, "learning_rate": 6.981837819002252e-06, "loss": 0.78927259, "memory(GiB)": 135.77, "step": 33360, "train_speed(iter/s)": 0.201773 }, { "acc": 0.78572111, "epoch": 0.7785473687280363, "grad_norm": 4.625, "learning_rate": 6.980103299479281e-06, "loss": 0.76833, "memory(GiB)": 135.77, "step": 33370, "train_speed(iter/s)": 0.201804 }, { "acc": 0.78051925, "epoch": 0.7787806763003252, "grad_norm": 5.65625, "learning_rate": 6.978368497297143e-06, "loss": 0.79444427, "memory(GiB)": 135.77, "step": 33380, "train_speed(iter/s)": 0.201834 }, { "acc": 0.76521983, "epoch": 0.7790139838726141, "grad_norm": 4.84375, "learning_rate": 6.976633412703474e-06, "loss": 0.85772009, "memory(GiB)": 135.77, "step": 33390, "train_speed(iter/s)": 0.201864 }, { "acc": 0.79295754, "epoch": 0.779247291444903, "grad_norm": 5.28125, "learning_rate": 6.974898045945959e-06, "loss": 0.74175, "memory(GiB)": 135.77, "step": 33400, "train_speed(iter/s)": 0.201895 }, { "acc": 0.78126574, "epoch": 0.7794805990171918, "grad_norm": 6.03125, "learning_rate": 6.973162397272323e-06, "loss": 0.79368715, "memory(GiB)": 135.77, "step": 33410, "train_speed(iter/s)": 0.201925 }, { "acc": 0.79608712, "epoch": 0.7797139065894807, "grad_norm": 6.59375, "learning_rate": 6.971426466930327e-06, "loss": 0.74015937, "memory(GiB)": 135.77, "step": 33420, "train_speed(iter/s)": 0.201954 }, { "acc": 0.78107915, "epoch": 0.7799472141617696, "grad_norm": 4.8125, "learning_rate": 6.969690255167777e-06, "loss": 0.77917404, "memory(GiB)": 135.77, "step": 33430, "train_speed(iter/s)": 0.201986 }, { "acc": 0.79521189, "epoch": 0.7801805217340585, "grad_norm": 6.90625, "learning_rate": 6.9679537622325154e-06, "loss": 0.73068018, "memory(GiB)": 135.77, "step": 33440, "train_speed(iter/s)": 0.202018 }, { "acc": 0.80074768, "epoch": 0.7804138293063474, "grad_norm": 4.8125, "learning_rate": 6.966216988372424e-06, "loss": 0.72729893, "memory(GiB)": 135.77, "step": 33450, "train_speed(iter/s)": 0.202049 }, { "acc": 0.78108997, "epoch": 0.7806471368786363, "grad_norm": 4.21875, "learning_rate": 6.964479933835429e-06, "loss": 0.78009777, "memory(GiB)": 135.77, "step": 33460, "train_speed(iter/s)": 0.202079 }, { "acc": 0.76426306, "epoch": 0.7808804444509252, "grad_norm": 6.15625, "learning_rate": 6.962742598869495e-06, "loss": 0.87753067, "memory(GiB)": 135.77, "step": 33470, "train_speed(iter/s)": 0.202111 }, { "acc": 0.77631526, "epoch": 0.7811137520232141, "grad_norm": 5.46875, "learning_rate": 6.961004983722625e-06, "loss": 0.80698681, "memory(GiB)": 135.77, "step": 33480, "train_speed(iter/s)": 0.20214 }, { "acc": 0.75766716, "epoch": 0.781347059595503, "grad_norm": 5.5, "learning_rate": 6.959267088642864e-06, "loss": 0.89244823, "memory(GiB)": 135.77, "step": 33490, "train_speed(iter/s)": 0.202169 }, { "acc": 0.77791281, "epoch": 0.7815803671677919, "grad_norm": 6.0625, "learning_rate": 6.9575289138782944e-06, "loss": 0.78410869, "memory(GiB)": 135.77, "step": 33500, "train_speed(iter/s)": 0.202201 }, { "epoch": 0.7815803671677919, "eval_acc": 0.7424501202763256, "eval_loss": 0.8124200105667114, "eval_runtime": 1269.5877, "eval_samples_per_second": 28.349, "eval_steps_per_second": 14.175, "step": 33500 }, { "acc": 0.79002161, "epoch": 0.7818136747400808, "grad_norm": 5.15625, "learning_rate": 6.955790459677041e-06, "loss": 0.74639416, "memory(GiB)": 135.77, "step": 33510, "train_speed(iter/s)": 0.200672 }, { "acc": 0.7705555, "epoch": 0.7820469823123697, "grad_norm": 4.90625, "learning_rate": 6.9540517262872675e-06, "loss": 0.82598076, "memory(GiB)": 135.77, "step": 33520, "train_speed(iter/s)": 0.200704 }, { "acc": 0.76916981, "epoch": 0.7822802898846586, "grad_norm": 6.28125, "learning_rate": 6.952312713957179e-06, "loss": 0.82480154, "memory(GiB)": 135.77, "step": 33530, "train_speed(iter/s)": 0.200733 }, { "acc": 0.77232513, "epoch": 0.7825135974569475, "grad_norm": 4.875, "learning_rate": 6.9505734229350155e-06, "loss": 0.8362215, "memory(GiB)": 135.77, "step": 33540, "train_speed(iter/s)": 0.200762 }, { "acc": 0.78856249, "epoch": 0.7827469050292364, "grad_norm": 5.0625, "learning_rate": 6.948833853469065e-06, "loss": 0.76099873, "memory(GiB)": 135.77, "step": 33550, "train_speed(iter/s)": 0.200795 }, { "acc": 0.78895731, "epoch": 0.7829802126015253, "grad_norm": 4.25, "learning_rate": 6.947094005807646e-06, "loss": 0.76939201, "memory(GiB)": 135.77, "step": 33560, "train_speed(iter/s)": 0.200828 }, { "acc": 0.76670189, "epoch": 0.7832135201738142, "grad_norm": 4.59375, "learning_rate": 6.945353880199124e-06, "loss": 0.84993162, "memory(GiB)": 135.77, "step": 33570, "train_speed(iter/s)": 0.20086 }, { "acc": 0.76875811, "epoch": 0.7834468277461031, "grad_norm": 5.25, "learning_rate": 6.943613476891902e-06, "loss": 0.84574566, "memory(GiB)": 135.77, "step": 33580, "train_speed(iter/s)": 0.200891 }, { "acc": 0.77109642, "epoch": 0.783680135318392, "grad_norm": 4.3125, "learning_rate": 6.941872796134419e-06, "loss": 0.82063408, "memory(GiB)": 135.77, "step": 33590, "train_speed(iter/s)": 0.200925 }, { "acc": 0.76629744, "epoch": 0.7839134428906808, "grad_norm": 4.1875, "learning_rate": 6.940131838175159e-06, "loss": 0.85809727, "memory(GiB)": 135.77, "step": 33600, "train_speed(iter/s)": 0.200955 }, { "acc": 0.77465167, "epoch": 0.7841467504629697, "grad_norm": 5.3125, "learning_rate": 6.938390603262644e-06, "loss": 0.80659037, "memory(GiB)": 135.77, "step": 33610, "train_speed(iter/s)": 0.200986 }, { "acc": 0.77369032, "epoch": 0.7843800580352586, "grad_norm": 5.09375, "learning_rate": 6.936649091645431e-06, "loss": 0.82941837, "memory(GiB)": 135.77, "step": 33620, "train_speed(iter/s)": 0.201015 }, { "acc": 0.79202423, "epoch": 0.7846133656075475, "grad_norm": 4.5, "learning_rate": 6.9349073035721235e-06, "loss": 0.7462399, "memory(GiB)": 135.77, "step": 33630, "train_speed(iter/s)": 0.201046 }, { "acc": 0.770962, "epoch": 0.7848466731798364, "grad_norm": 5.6875, "learning_rate": 6.933165239291362e-06, "loss": 0.82215929, "memory(GiB)": 135.77, "step": 33640, "train_speed(iter/s)": 0.201077 }, { "acc": 0.78424397, "epoch": 0.7850799807521253, "grad_norm": 4.90625, "learning_rate": 6.931422899051823e-06, "loss": 0.76600466, "memory(GiB)": 135.77, "step": 33650, "train_speed(iter/s)": 0.201109 }, { "acc": 0.76943855, "epoch": 0.7853132883244142, "grad_norm": 5.09375, "learning_rate": 6.929680283102227e-06, "loss": 0.83709068, "memory(GiB)": 135.77, "step": 33660, "train_speed(iter/s)": 0.20114 }, { "acc": 0.78439827, "epoch": 0.7855465958967031, "grad_norm": 5.1875, "learning_rate": 6.9279373916913305e-06, "loss": 0.78921213, "memory(GiB)": 135.77, "step": 33670, "train_speed(iter/s)": 0.20117 }, { "acc": 0.78686228, "epoch": 0.785779903468992, "grad_norm": 4.53125, "learning_rate": 6.926194225067932e-06, "loss": 0.77492542, "memory(GiB)": 135.77, "step": 33680, "train_speed(iter/s)": 0.201202 }, { "acc": 0.79153872, "epoch": 0.7860132110412809, "grad_norm": 5.78125, "learning_rate": 6.924450783480866e-06, "loss": 0.74138288, "memory(GiB)": 135.77, "step": 33690, "train_speed(iter/s)": 0.201232 }, { "acc": 0.78364296, "epoch": 0.7862465186135698, "grad_norm": 4.46875, "learning_rate": 6.922707067179011e-06, "loss": 0.78220735, "memory(GiB)": 135.77, "step": 33700, "train_speed(iter/s)": 0.201263 }, { "acc": 0.76618633, "epoch": 0.7864798261858587, "grad_norm": 5.34375, "learning_rate": 6.92096307641128e-06, "loss": 0.84831276, "memory(GiB)": 135.77, "step": 33710, "train_speed(iter/s)": 0.201293 }, { "acc": 0.78253098, "epoch": 0.7867131337581476, "grad_norm": 4.84375, "learning_rate": 6.919218811426629e-06, "loss": 0.78605328, "memory(GiB)": 135.77, "step": 33720, "train_speed(iter/s)": 0.201324 }, { "acc": 0.79246273, "epoch": 0.7869464413304365, "grad_norm": 5.5625, "learning_rate": 6.91747427247405e-06, "loss": 0.72274895, "memory(GiB)": 135.77, "step": 33730, "train_speed(iter/s)": 0.201356 }, { "acc": 0.7848577, "epoch": 0.7871797489027254, "grad_norm": 11.9375, "learning_rate": 6.915729459802575e-06, "loss": 0.76760449, "memory(GiB)": 135.77, "step": 33740, "train_speed(iter/s)": 0.201388 }, { "acc": 0.80589046, "epoch": 0.7874130564750143, "grad_norm": 5.25, "learning_rate": 6.913984373661275e-06, "loss": 0.68049059, "memory(GiB)": 135.77, "step": 33750, "train_speed(iter/s)": 0.20142 }, { "acc": 0.7855567, "epoch": 0.7876463640473031, "grad_norm": 5.09375, "learning_rate": 6.9122390142992634e-06, "loss": 0.77877674, "memory(GiB)": 135.77, "step": 33760, "train_speed(iter/s)": 0.201453 }, { "acc": 0.76452026, "epoch": 0.787879671619592, "grad_norm": 6.875, "learning_rate": 6.910493381965687e-06, "loss": 0.85262928, "memory(GiB)": 135.77, "step": 33770, "train_speed(iter/s)": 0.201485 }, { "acc": 0.77853546, "epoch": 0.788112979191881, "grad_norm": 6.59375, "learning_rate": 6.9087474769097366e-06, "loss": 0.7785665, "memory(GiB)": 135.77, "step": 33780, "train_speed(iter/s)": 0.201516 }, { "acc": 0.78647032, "epoch": 0.7883462867641697, "grad_norm": 6.65625, "learning_rate": 6.907001299380639e-06, "loss": 0.77372932, "memory(GiB)": 135.77, "step": 33790, "train_speed(iter/s)": 0.201546 }, { "acc": 0.78218312, "epoch": 0.7885795943364586, "grad_norm": 5.375, "learning_rate": 6.905254849627658e-06, "loss": 0.78796425, "memory(GiB)": 135.77, "step": 33800, "train_speed(iter/s)": 0.201576 }, { "acc": 0.76979618, "epoch": 0.7888129019087475, "grad_norm": 6.5, "learning_rate": 6.9035081279001e-06, "loss": 0.82921257, "memory(GiB)": 135.77, "step": 33810, "train_speed(iter/s)": 0.201608 }, { "acc": 0.79161835, "epoch": 0.7890462094810364, "grad_norm": 4.09375, "learning_rate": 6.901761134447311e-06, "loss": 0.76767821, "memory(GiB)": 135.77, "step": 33820, "train_speed(iter/s)": 0.201636 }, { "acc": 0.79137611, "epoch": 0.7892795170533253, "grad_norm": 6.375, "learning_rate": 6.900013869518673e-06, "loss": 0.74047461, "memory(GiB)": 135.77, "step": 33830, "train_speed(iter/s)": 0.201666 }, { "acc": 0.77348399, "epoch": 0.7895128246256142, "grad_norm": 7.4375, "learning_rate": 6.898266333363607e-06, "loss": 0.81377163, "memory(GiB)": 135.77, "step": 33840, "train_speed(iter/s)": 0.201698 }, { "acc": 0.77934666, "epoch": 0.7897461321979031, "grad_norm": 7.25, "learning_rate": 6.8965185262315725e-06, "loss": 0.79142418, "memory(GiB)": 135.77, "step": 33850, "train_speed(iter/s)": 0.201728 }, { "acc": 0.78089814, "epoch": 0.789979439770192, "grad_norm": 4.875, "learning_rate": 6.89477044837207e-06, "loss": 0.77429733, "memory(GiB)": 135.77, "step": 33860, "train_speed(iter/s)": 0.201757 }, { "acc": 0.7920969, "epoch": 0.7902127473424809, "grad_norm": 4.8125, "learning_rate": 6.893022100034636e-06, "loss": 0.74223742, "memory(GiB)": 135.77, "step": 33870, "train_speed(iter/s)": 0.201785 }, { "acc": 0.78182831, "epoch": 0.7904460549147698, "grad_norm": 5.28125, "learning_rate": 6.891273481468847e-06, "loss": 0.79486198, "memory(GiB)": 135.77, "step": 33880, "train_speed(iter/s)": 0.201814 }, { "acc": 0.78645382, "epoch": 0.7906793624870587, "grad_norm": 4.40625, "learning_rate": 6.889524592924319e-06, "loss": 0.75907497, "memory(GiB)": 135.77, "step": 33890, "train_speed(iter/s)": 0.201846 }, { "acc": 0.78487177, "epoch": 0.7909126700593476, "grad_norm": 8.5, "learning_rate": 6.887775434650704e-06, "loss": 0.76534662, "memory(GiB)": 135.77, "step": 33900, "train_speed(iter/s)": 0.201877 }, { "acc": 0.81099415, "epoch": 0.7911459776316365, "grad_norm": 5.65625, "learning_rate": 6.8860260068976935e-06, "loss": 0.66781607, "memory(GiB)": 135.77, "step": 33910, "train_speed(iter/s)": 0.201908 }, { "acc": 0.7786212, "epoch": 0.7913792852039254, "grad_norm": 10.1875, "learning_rate": 6.884276309915018e-06, "loss": 0.77360516, "memory(GiB)": 135.77, "step": 33920, "train_speed(iter/s)": 0.20194 }, { "acc": 0.77263908, "epoch": 0.7916125927762143, "grad_norm": 8.3125, "learning_rate": 6.882526343952448e-06, "loss": 0.81154432, "memory(GiB)": 135.77, "step": 33930, "train_speed(iter/s)": 0.201971 }, { "acc": 0.7702785, "epoch": 0.7918459003485032, "grad_norm": 5.375, "learning_rate": 6.880776109259788e-06, "loss": 0.81274462, "memory(GiB)": 135.77, "step": 33940, "train_speed(iter/s)": 0.202002 }, { "acc": 0.77342148, "epoch": 0.7920792079207921, "grad_norm": 4.6875, "learning_rate": 6.8790256060868866e-06, "loss": 0.80101671, "memory(GiB)": 135.77, "step": 33950, "train_speed(iter/s)": 0.202032 }, { "acc": 0.78165035, "epoch": 0.792312515493081, "grad_norm": 6.03125, "learning_rate": 6.8772748346836235e-06, "loss": 0.80019588, "memory(GiB)": 135.77, "step": 33960, "train_speed(iter/s)": 0.202064 }, { "acc": 0.7831409, "epoch": 0.7925458230653699, "grad_norm": 5.75, "learning_rate": 6.875523795299925e-06, "loss": 0.77129364, "memory(GiB)": 135.77, "step": 33970, "train_speed(iter/s)": 0.202096 }, { "acc": 0.77243023, "epoch": 0.7927791306376588, "grad_norm": 5.46875, "learning_rate": 6.873772488185747e-06, "loss": 0.83966198, "memory(GiB)": 135.77, "step": 33980, "train_speed(iter/s)": 0.202126 }, { "acc": 0.79374552, "epoch": 0.7930124382099476, "grad_norm": 5.71875, "learning_rate": 6.872020913591092e-06, "loss": 0.73950348, "memory(GiB)": 135.77, "step": 33990, "train_speed(iter/s)": 0.202158 }, { "acc": 0.78279028, "epoch": 0.7932457457822365, "grad_norm": 5.78125, "learning_rate": 6.870269071765997e-06, "loss": 0.79019928, "memory(GiB)": 135.77, "step": 34000, "train_speed(iter/s)": 0.202189 }, { "epoch": 0.7932457457822365, "eval_acc": 0.7426201934510297, "eval_loss": 0.8120821714401245, "eval_runtime": 1267.8986, "eval_samples_per_second": 28.386, "eval_steps_per_second": 14.194, "step": 34000 }, { "acc": 0.78353124, "epoch": 0.7934790533545254, "grad_norm": 6.4375, "learning_rate": 6.868516962960534e-06, "loss": 0.75921488, "memory(GiB)": 135.77, "step": 34010, "train_speed(iter/s)": 0.200686 }, { "acc": 0.78991613, "epoch": 0.7937123609268143, "grad_norm": 4.875, "learning_rate": 6.866764587424818e-06, "loss": 0.75100489, "memory(GiB)": 135.77, "step": 34020, "train_speed(iter/s)": 0.200716 }, { "acc": 0.78309031, "epoch": 0.7939456684991032, "grad_norm": 5.625, "learning_rate": 6.865011945408998e-06, "loss": 0.78480835, "memory(GiB)": 135.77, "step": 34030, "train_speed(iter/s)": 0.200748 }, { "acc": 0.7720583, "epoch": 0.7941789760713921, "grad_norm": 6.78125, "learning_rate": 6.863259037163266e-06, "loss": 0.81258068, "memory(GiB)": 135.77, "step": 34040, "train_speed(iter/s)": 0.200777 }, { "acc": 0.77889328, "epoch": 0.794412283643681, "grad_norm": 5.625, "learning_rate": 6.8615058629378465e-06, "loss": 0.79917364, "memory(GiB)": 135.77, "step": 34050, "train_speed(iter/s)": 0.200807 }, { "acc": 0.79887199, "epoch": 0.7946455912159699, "grad_norm": 5.375, "learning_rate": 6.859752422983006e-06, "loss": 0.72141008, "memory(GiB)": 135.77, "step": 34060, "train_speed(iter/s)": 0.200838 }, { "acc": 0.78431706, "epoch": 0.7948788987882588, "grad_norm": 7.625, "learning_rate": 6.857998717549048e-06, "loss": 0.7462605, "memory(GiB)": 135.77, "step": 34070, "train_speed(iter/s)": 0.200869 }, { "acc": 0.78418608, "epoch": 0.7951122063605477, "grad_norm": 5.5, "learning_rate": 6.856244746886313e-06, "loss": 0.77865896, "memory(GiB)": 135.77, "step": 34080, "train_speed(iter/s)": 0.200901 }, { "acc": 0.75828471, "epoch": 0.7953455139328366, "grad_norm": 4.46875, "learning_rate": 6.85449051124518e-06, "loss": 0.85784569, "memory(GiB)": 135.77, "step": 34090, "train_speed(iter/s)": 0.200935 }, { "acc": 0.78311863, "epoch": 0.7955788215051255, "grad_norm": 7.40625, "learning_rate": 6.852736010876063e-06, "loss": 0.76896353, "memory(GiB)": 135.77, "step": 34100, "train_speed(iter/s)": 0.200966 }, { "acc": 0.78136911, "epoch": 0.7958121290774144, "grad_norm": 4.96875, "learning_rate": 6.85098124602942e-06, "loss": 0.78809237, "memory(GiB)": 135.77, "step": 34110, "train_speed(iter/s)": 0.200996 }, { "acc": 0.78460474, "epoch": 0.7960454366497033, "grad_norm": 6.125, "learning_rate": 6.8492262169557435e-06, "loss": 0.74483161, "memory(GiB)": 135.77, "step": 34120, "train_speed(iter/s)": 0.201024 }, { "acc": 0.80227928, "epoch": 0.7962787442219922, "grad_norm": 4.375, "learning_rate": 6.847470923905559e-06, "loss": 0.71104026, "memory(GiB)": 135.77, "step": 34130, "train_speed(iter/s)": 0.201054 }, { "acc": 0.78113799, "epoch": 0.7965120517942811, "grad_norm": 7.125, "learning_rate": 6.845715367129438e-06, "loss": 0.79373302, "memory(GiB)": 135.77, "step": 34140, "train_speed(iter/s)": 0.201083 }, { "acc": 0.78072958, "epoch": 0.79674535936657, "grad_norm": 5.5, "learning_rate": 6.843959546877985e-06, "loss": 0.77883039, "memory(GiB)": 135.77, "step": 34150, "train_speed(iter/s)": 0.201115 }, { "acc": 0.78991518, "epoch": 0.7969786669388589, "grad_norm": 4.78125, "learning_rate": 6.842203463401842e-06, "loss": 0.74006615, "memory(GiB)": 135.77, "step": 34160, "train_speed(iter/s)": 0.201145 }, { "acc": 0.76768169, "epoch": 0.7972119745111478, "grad_norm": 6.8125, "learning_rate": 6.84044711695169e-06, "loss": 0.8356452, "memory(GiB)": 135.77, "step": 34170, "train_speed(iter/s)": 0.201176 }, { "acc": 0.78966255, "epoch": 0.7974452820834366, "grad_norm": 5.21875, "learning_rate": 6.838690507778247e-06, "loss": 0.75411134, "memory(GiB)": 135.77, "step": 34180, "train_speed(iter/s)": 0.201206 }, { "acc": 0.78948689, "epoch": 0.7976785896557255, "grad_norm": 7.0, "learning_rate": 6.836933636132267e-06, "loss": 0.75135803, "memory(GiB)": 135.77, "step": 34190, "train_speed(iter/s)": 0.201235 }, { "acc": 0.77856131, "epoch": 0.7979118972280144, "grad_norm": 8.9375, "learning_rate": 6.835176502264544e-06, "loss": 0.81702185, "memory(GiB)": 135.77, "step": 34200, "train_speed(iter/s)": 0.201263 }, { "acc": 0.76752882, "epoch": 0.7981452048003033, "grad_norm": 7.84375, "learning_rate": 6.8334191064259095e-06, "loss": 0.85097122, "memory(GiB)": 135.77, "step": 34210, "train_speed(iter/s)": 0.201294 }, { "acc": 0.76000738, "epoch": 0.7983785123725922, "grad_norm": 4.375, "learning_rate": 6.8316614488672305e-06, "loss": 0.88621998, "memory(GiB)": 135.77, "step": 34220, "train_speed(iter/s)": 0.201324 }, { "acc": 0.77623281, "epoch": 0.7986118199448811, "grad_norm": 5.875, "learning_rate": 6.829903529839411e-06, "loss": 0.79441414, "memory(GiB)": 135.77, "step": 34230, "train_speed(iter/s)": 0.201353 }, { "acc": 0.77455211, "epoch": 0.79884512751717, "grad_norm": 6.8125, "learning_rate": 6.828145349593395e-06, "loss": 0.80323877, "memory(GiB)": 135.77, "step": 34240, "train_speed(iter/s)": 0.201383 }, { "acc": 0.77257781, "epoch": 0.7990784350894589, "grad_norm": 5.0625, "learning_rate": 6.82638690838016e-06, "loss": 0.83061676, "memory(GiB)": 135.77, "step": 34250, "train_speed(iter/s)": 0.201413 }, { "acc": 0.76844459, "epoch": 0.7993117426617478, "grad_norm": 4.65625, "learning_rate": 6.824628206450724e-06, "loss": 0.84785528, "memory(GiB)": 135.77, "step": 34260, "train_speed(iter/s)": 0.201443 }, { "acc": 0.78014994, "epoch": 0.7995450502340367, "grad_norm": 6.21875, "learning_rate": 6.822869244056143e-06, "loss": 0.79277802, "memory(GiB)": 135.77, "step": 34270, "train_speed(iter/s)": 0.201472 }, { "acc": 0.78358684, "epoch": 0.7997783578063256, "grad_norm": 5.46875, "learning_rate": 6.821110021447506e-06, "loss": 0.79087338, "memory(GiB)": 135.77, "step": 34280, "train_speed(iter/s)": 0.201502 }, { "acc": 0.76477342, "epoch": 0.8000116653786145, "grad_norm": 5.9375, "learning_rate": 6.819350538875944e-06, "loss": 0.86270304, "memory(GiB)": 135.77, "step": 34290, "train_speed(iter/s)": 0.201534 }, { "acc": 0.77901411, "epoch": 0.8002449729509034, "grad_norm": 4.78125, "learning_rate": 6.817590796592621e-06, "loss": 0.7760778, "memory(GiB)": 135.77, "step": 34300, "train_speed(iter/s)": 0.201563 }, { "acc": 0.786831, "epoch": 0.8004782805231923, "grad_norm": 5.125, "learning_rate": 6.815830794848739e-06, "loss": 0.75008812, "memory(GiB)": 135.77, "step": 34310, "train_speed(iter/s)": 0.201594 }, { "acc": 0.77648001, "epoch": 0.8007115880954812, "grad_norm": 4.46875, "learning_rate": 6.8140705338955386e-06, "loss": 0.80113869, "memory(GiB)": 135.77, "step": 34320, "train_speed(iter/s)": 0.201625 }, { "acc": 0.76347504, "epoch": 0.80094489566777, "grad_norm": 7.34375, "learning_rate": 6.812310013984296e-06, "loss": 0.85464201, "memory(GiB)": 135.77, "step": 34330, "train_speed(iter/s)": 0.201655 }, { "acc": 0.77026582, "epoch": 0.801178203240059, "grad_norm": 5.34375, "learning_rate": 6.810549235366325e-06, "loss": 0.8082571, "memory(GiB)": 135.77, "step": 34340, "train_speed(iter/s)": 0.201686 }, { "acc": 0.77789001, "epoch": 0.8014115108123478, "grad_norm": 6.1875, "learning_rate": 6.808788198292977e-06, "loss": 0.78789454, "memory(GiB)": 135.77, "step": 34350, "train_speed(iter/s)": 0.201712 }, { "acc": 0.76755538, "epoch": 0.8016448183846367, "grad_norm": 6.25, "learning_rate": 6.80702690301564e-06, "loss": 0.83963947, "memory(GiB)": 135.77, "step": 34360, "train_speed(iter/s)": 0.201742 }, { "acc": 0.78171759, "epoch": 0.8018781259569255, "grad_norm": 6.5, "learning_rate": 6.805265349785738e-06, "loss": 0.78367205, "memory(GiB)": 135.77, "step": 34370, "train_speed(iter/s)": 0.201774 }, { "acc": 0.77745686, "epoch": 0.8021114335292144, "grad_norm": 5.90625, "learning_rate": 6.80350353885473e-06, "loss": 0.79816732, "memory(GiB)": 135.77, "step": 34380, "train_speed(iter/s)": 0.201804 }, { "acc": 0.76283531, "epoch": 0.8023447411015033, "grad_norm": 6.9375, "learning_rate": 6.801741470474117e-06, "loss": 0.86973667, "memory(GiB)": 135.77, "step": 34390, "train_speed(iter/s)": 0.201837 }, { "acc": 0.7806407, "epoch": 0.8025780486737922, "grad_norm": 5.0625, "learning_rate": 6.799979144895432e-06, "loss": 0.79195518, "memory(GiB)": 135.77, "step": 34400, "train_speed(iter/s)": 0.201868 }, { "acc": 0.75355606, "epoch": 0.8028113562460811, "grad_norm": 5.40625, "learning_rate": 6.798216562370247e-06, "loss": 0.90384073, "memory(GiB)": 135.77, "step": 34410, "train_speed(iter/s)": 0.201899 }, { "acc": 0.77500319, "epoch": 0.80304466381837, "grad_norm": 7.1875, "learning_rate": 6.79645372315017e-06, "loss": 0.82036419, "memory(GiB)": 135.77, "step": 34420, "train_speed(iter/s)": 0.201929 }, { "acc": 0.7839057, "epoch": 0.8032779713906589, "grad_norm": 4.09375, "learning_rate": 6.794690627486846e-06, "loss": 0.76988935, "memory(GiB)": 135.77, "step": 34430, "train_speed(iter/s)": 0.201958 }, { "acc": 0.78427267, "epoch": 0.8035112789629478, "grad_norm": 5.625, "learning_rate": 6.792927275631957e-06, "loss": 0.77875066, "memory(GiB)": 135.77, "step": 34440, "train_speed(iter/s)": 0.201985 }, { "acc": 0.78074837, "epoch": 0.8037445865352367, "grad_norm": 5.59375, "learning_rate": 6.791163667837219e-06, "loss": 0.77121277, "memory(GiB)": 135.77, "step": 34450, "train_speed(iter/s)": 0.202014 }, { "acc": 0.77221909, "epoch": 0.8039778941075256, "grad_norm": 6.21875, "learning_rate": 6.789399804354389e-06, "loss": 0.82327061, "memory(GiB)": 135.77, "step": 34460, "train_speed(iter/s)": 0.202044 }, { "acc": 0.78617983, "epoch": 0.8042112016798145, "grad_norm": 4.90625, "learning_rate": 6.787635685435255e-06, "loss": 0.78236666, "memory(GiB)": 135.77, "step": 34470, "train_speed(iter/s)": 0.202072 }, { "acc": 0.77123075, "epoch": 0.8044445092521034, "grad_norm": 6.90625, "learning_rate": 6.785871311331648e-06, "loss": 0.83095236, "memory(GiB)": 135.77, "step": 34480, "train_speed(iter/s)": 0.202104 }, { "acc": 0.7779706, "epoch": 0.8046778168243923, "grad_norm": 4.1875, "learning_rate": 6.7841066822954284e-06, "loss": 0.80731964, "memory(GiB)": 135.77, "step": 34490, "train_speed(iter/s)": 0.202133 }, { "acc": 0.78801842, "epoch": 0.8049111243966812, "grad_norm": 7.28125, "learning_rate": 6.7823417985784986e-06, "loss": 0.74819498, "memory(GiB)": 135.77, "step": 34500, "train_speed(iter/s)": 0.202161 }, { "epoch": 0.8049111243966812, "eval_acc": 0.7425651038351181, "eval_loss": 0.8118574619293213, "eval_runtime": 1269.6887, "eval_samples_per_second": 28.346, "eval_steps_per_second": 14.174, "step": 34500 }, { "acc": 0.77680259, "epoch": 0.8051444319689701, "grad_norm": 5.75, "learning_rate": 6.780576660432797e-06, "loss": 0.81476183, "memory(GiB)": 135.77, "step": 34510, "train_speed(iter/s)": 0.20068 }, { "acc": 0.75909719, "epoch": 0.805377739541259, "grad_norm": 5.90625, "learning_rate": 6.778811268110294e-06, "loss": 0.86525078, "memory(GiB)": 135.77, "step": 34520, "train_speed(iter/s)": 0.20071 }, { "acc": 0.76516881, "epoch": 0.8056110471135479, "grad_norm": 6.4375, "learning_rate": 6.777045621862997e-06, "loss": 0.83675842, "memory(GiB)": 135.77, "step": 34530, "train_speed(iter/s)": 0.200741 }, { "acc": 0.78065295, "epoch": 0.8058443546858368, "grad_norm": 7.59375, "learning_rate": 6.775279721942954e-06, "loss": 0.77787828, "memory(GiB)": 135.77, "step": 34540, "train_speed(iter/s)": 0.200771 }, { "acc": 0.78033791, "epoch": 0.8060776622581257, "grad_norm": 4.53125, "learning_rate": 6.773513568602248e-06, "loss": 0.78399673, "memory(GiB)": 135.77, "step": 34550, "train_speed(iter/s)": 0.2008 }, { "acc": 0.78510847, "epoch": 0.8063109698304145, "grad_norm": 4.90625, "learning_rate": 6.771747162092993e-06, "loss": 0.75295162, "memory(GiB)": 135.77, "step": 34560, "train_speed(iter/s)": 0.20083 }, { "acc": 0.75793881, "epoch": 0.8065442774027034, "grad_norm": 6.5625, "learning_rate": 6.769980502667348e-06, "loss": 0.85168362, "memory(GiB)": 135.77, "step": 34570, "train_speed(iter/s)": 0.20086 }, { "acc": 0.77159257, "epoch": 0.8067775849749923, "grad_norm": 5.8125, "learning_rate": 6.7682135905775e-06, "loss": 0.7980968, "memory(GiB)": 135.77, "step": 34580, "train_speed(iter/s)": 0.200891 }, { "acc": 0.77705431, "epoch": 0.8070108925472812, "grad_norm": 5.90625, "learning_rate": 6.7664464260756745e-06, "loss": 0.81086655, "memory(GiB)": 135.77, "step": 34590, "train_speed(iter/s)": 0.200921 }, { "acc": 0.77288313, "epoch": 0.8072442001195701, "grad_norm": 5.0, "learning_rate": 6.764679009414135e-06, "loss": 0.82601871, "memory(GiB)": 135.77, "step": 34600, "train_speed(iter/s)": 0.20095 }, { "acc": 0.77113543, "epoch": 0.807477507691859, "grad_norm": 6.3125, "learning_rate": 6.76291134084518e-06, "loss": 0.81248341, "memory(GiB)": 135.77, "step": 34610, "train_speed(iter/s)": 0.200982 }, { "acc": 0.77865152, "epoch": 0.8077108152641479, "grad_norm": 5.53125, "learning_rate": 6.761143420621141e-06, "loss": 0.81596804, "memory(GiB)": 135.77, "step": 34620, "train_speed(iter/s)": 0.201012 }, { "acc": 0.75378828, "epoch": 0.8079441228364368, "grad_norm": 7.0, "learning_rate": 6.759375248994393e-06, "loss": 0.90001888, "memory(GiB)": 135.77, "step": 34630, "train_speed(iter/s)": 0.201042 }, { "acc": 0.79571762, "epoch": 0.8081774304087257, "grad_norm": 3.953125, "learning_rate": 6.757606826217339e-06, "loss": 0.72643576, "memory(GiB)": 135.77, "step": 34640, "train_speed(iter/s)": 0.201072 }, { "acc": 0.77722073, "epoch": 0.8084107379810146, "grad_norm": 6.75, "learning_rate": 6.755838152542421e-06, "loss": 0.81900959, "memory(GiB)": 135.77, "step": 34650, "train_speed(iter/s)": 0.201102 }, { "acc": 0.76127481, "epoch": 0.8086440455533035, "grad_norm": 4.53125, "learning_rate": 6.754069228222117e-06, "loss": 0.85369015, "memory(GiB)": 135.77, "step": 34660, "train_speed(iter/s)": 0.201132 }, { "acc": 0.77648983, "epoch": 0.8088773531255924, "grad_norm": 4.8125, "learning_rate": 6.752300053508939e-06, "loss": 0.80511589, "memory(GiB)": 135.77, "step": 34670, "train_speed(iter/s)": 0.201162 }, { "acc": 0.7765605, "epoch": 0.8091106606978813, "grad_norm": 5.71875, "learning_rate": 6.750530628655437e-06, "loss": 0.80522938, "memory(GiB)": 135.77, "step": 34680, "train_speed(iter/s)": 0.201192 }, { "acc": 0.77244101, "epoch": 0.8093439682701702, "grad_norm": 5.0, "learning_rate": 6.748760953914198e-06, "loss": 0.81097126, "memory(GiB)": 135.77, "step": 34690, "train_speed(iter/s)": 0.201221 }, { "acc": 0.76321468, "epoch": 0.8095772758424591, "grad_norm": 24.375, "learning_rate": 6.746991029537841e-06, "loss": 0.84920597, "memory(GiB)": 135.77, "step": 34700, "train_speed(iter/s)": 0.201251 }, { "acc": 0.76385412, "epoch": 0.809810583414748, "grad_norm": 6.0, "learning_rate": 6.74522085577902e-06, "loss": 0.84149389, "memory(GiB)": 135.77, "step": 34710, "train_speed(iter/s)": 0.201281 }, { "acc": 0.77857213, "epoch": 0.8100438909870369, "grad_norm": 7.21875, "learning_rate": 6.743450432890431e-06, "loss": 0.77943163, "memory(GiB)": 135.77, "step": 34720, "train_speed(iter/s)": 0.201312 }, { "acc": 0.76652179, "epoch": 0.8102771985593258, "grad_norm": 5.84375, "learning_rate": 6.741679761124798e-06, "loss": 0.81627655, "memory(GiB)": 135.77, "step": 34730, "train_speed(iter/s)": 0.201342 }, { "acc": 0.78065634, "epoch": 0.8105105061316147, "grad_norm": 6.375, "learning_rate": 6.739908840734885e-06, "loss": 0.7564887, "memory(GiB)": 135.77, "step": 34740, "train_speed(iter/s)": 0.201371 }, { "acc": 0.78042436, "epoch": 0.8107438137039036, "grad_norm": 5.375, "learning_rate": 6.738137671973492e-06, "loss": 0.78124051, "memory(GiB)": 135.77, "step": 34750, "train_speed(iter/s)": 0.201399 }, { "acc": 0.78339186, "epoch": 0.8109771212761924, "grad_norm": 7.21875, "learning_rate": 6.736366255093449e-06, "loss": 0.75932827, "memory(GiB)": 135.77, "step": 34760, "train_speed(iter/s)": 0.201431 }, { "acc": 0.77975254, "epoch": 0.8112104288484813, "grad_norm": 5.4375, "learning_rate": 6.73459459034763e-06, "loss": 0.78994699, "memory(GiB)": 135.77, "step": 34770, "train_speed(iter/s)": 0.20146 }, { "acc": 0.77287989, "epoch": 0.8114437364207702, "grad_norm": 6.65625, "learning_rate": 6.732822677988935e-06, "loss": 0.81505728, "memory(GiB)": 135.77, "step": 34780, "train_speed(iter/s)": 0.20149 }, { "acc": 0.77970748, "epoch": 0.8116770439930591, "grad_norm": 7.3125, "learning_rate": 6.731050518270307e-06, "loss": 0.79578314, "memory(GiB)": 135.77, "step": 34790, "train_speed(iter/s)": 0.20152 }, { "acc": 0.77548347, "epoch": 0.811910351565348, "grad_norm": 5.75, "learning_rate": 6.729278111444721e-06, "loss": 0.80311823, "memory(GiB)": 135.77, "step": 34800, "train_speed(iter/s)": 0.201549 }, { "acc": 0.77532001, "epoch": 0.8121436591376369, "grad_norm": 4.6875, "learning_rate": 6.727505457765185e-06, "loss": 0.81310711, "memory(GiB)": 135.77, "step": 34810, "train_speed(iter/s)": 0.201581 }, { "acc": 0.77574654, "epoch": 0.8123769667099258, "grad_norm": 6.09375, "learning_rate": 6.725732557484748e-06, "loss": 0.79087381, "memory(GiB)": 135.77, "step": 34820, "train_speed(iter/s)": 0.201612 }, { "acc": 0.79127479, "epoch": 0.8126102742822147, "grad_norm": 5.96875, "learning_rate": 6.723959410856489e-06, "loss": 0.74439774, "memory(GiB)": 135.77, "step": 34830, "train_speed(iter/s)": 0.201639 }, { "acc": 0.78656416, "epoch": 0.8128435818545036, "grad_norm": 6.65625, "learning_rate": 6.722186018133525e-06, "loss": 0.76410608, "memory(GiB)": 135.77, "step": 34840, "train_speed(iter/s)": 0.20167 }, { "acc": 0.7781281, "epoch": 0.8130768894267925, "grad_norm": 7.25, "learning_rate": 6.720412379569008e-06, "loss": 0.79291306, "memory(GiB)": 135.77, "step": 34850, "train_speed(iter/s)": 0.201701 }, { "acc": 0.80160904, "epoch": 0.8133101969990814, "grad_norm": 4.8125, "learning_rate": 6.718638495416124e-06, "loss": 0.7177484, "memory(GiB)": 135.77, "step": 34860, "train_speed(iter/s)": 0.20173 }, { "acc": 0.77137651, "epoch": 0.8135435045713703, "grad_norm": 6.21875, "learning_rate": 6.716864365928094e-06, "loss": 0.82935047, "memory(GiB)": 135.77, "step": 34870, "train_speed(iter/s)": 0.201759 }, { "acc": 0.76962833, "epoch": 0.8137768121436592, "grad_norm": 5.34375, "learning_rate": 6.715089991358174e-06, "loss": 0.83387051, "memory(GiB)": 135.77, "step": 34880, "train_speed(iter/s)": 0.201789 }, { "acc": 0.74992967, "epoch": 0.814010119715948, "grad_norm": 5.625, "learning_rate": 6.713315371959656e-06, "loss": 0.89630518, "memory(GiB)": 135.77, "step": 34890, "train_speed(iter/s)": 0.201819 }, { "acc": 0.78066454, "epoch": 0.814243427288237, "grad_norm": 6.53125, "learning_rate": 6.7115405079858656e-06, "loss": 0.77816544, "memory(GiB)": 135.77, "step": 34900, "train_speed(iter/s)": 0.201848 }, { "acc": 0.77989941, "epoch": 0.8144767348605259, "grad_norm": 5.40625, "learning_rate": 6.709765399690164e-06, "loss": 0.77061434, "memory(GiB)": 135.77, "step": 34910, "train_speed(iter/s)": 0.20188 }, { "acc": 0.77812452, "epoch": 0.8147100424328148, "grad_norm": 4.375, "learning_rate": 6.707990047325952e-06, "loss": 0.79744072, "memory(GiB)": 135.77, "step": 34920, "train_speed(iter/s)": 0.201909 }, { "acc": 0.7687603, "epoch": 0.8149433500051037, "grad_norm": 5.875, "learning_rate": 6.706214451146654e-06, "loss": 0.82320156, "memory(GiB)": 135.77, "step": 34930, "train_speed(iter/s)": 0.201939 }, { "acc": 0.7765502, "epoch": 0.8151766575773925, "grad_norm": 4.96875, "learning_rate": 6.70443861140574e-06, "loss": 0.78917093, "memory(GiB)": 135.77, "step": 34940, "train_speed(iter/s)": 0.201968 }, { "acc": 0.7818922, "epoch": 0.8154099651496813, "grad_norm": 5.875, "learning_rate": 6.702662528356709e-06, "loss": 0.77447877, "memory(GiB)": 135.77, "step": 34950, "train_speed(iter/s)": 0.201998 }, { "acc": 0.77006559, "epoch": 0.8156432727219702, "grad_norm": 5.53125, "learning_rate": 6.700886202253096e-06, "loss": 0.83356829, "memory(GiB)": 135.77, "step": 34960, "train_speed(iter/s)": 0.202028 }, { "acc": 0.76896172, "epoch": 0.8158765802942591, "grad_norm": 6.0625, "learning_rate": 6.699109633348473e-06, "loss": 0.853722, "memory(GiB)": 135.77, "step": 34970, "train_speed(iter/s)": 0.202058 }, { "acc": 0.78813248, "epoch": 0.816109887866548, "grad_norm": 8.75, "learning_rate": 6.697332821896443e-06, "loss": 0.78391209, "memory(GiB)": 135.77, "step": 34980, "train_speed(iter/s)": 0.202086 }, { "acc": 0.77606354, "epoch": 0.8163431954388369, "grad_norm": 4.4375, "learning_rate": 6.695555768150644e-06, "loss": 0.78519859, "memory(GiB)": 135.77, "step": 34990, "train_speed(iter/s)": 0.202116 }, { "acc": 0.75625358, "epoch": 0.8165765030111258, "grad_norm": 5.71875, "learning_rate": 6.693778472364754e-06, "loss": 0.89699631, "memory(GiB)": 135.77, "step": 35000, "train_speed(iter/s)": 0.202146 }, { "epoch": 0.8165765030111258, "eval_acc": 0.7427814586929572, "eval_loss": 0.8112027049064636, "eval_runtime": 1268.9891, "eval_samples_per_second": 28.362, "eval_steps_per_second": 14.181, "step": 35000 }, { "acc": 0.76805382, "epoch": 0.8168098105834147, "grad_norm": 5.4375, "learning_rate": 6.692000934792479e-06, "loss": 0.81355896, "memory(GiB)": 135.77, "step": 35010, "train_speed(iter/s)": 0.200683 }, { "acc": 0.78000903, "epoch": 0.8170431181557036, "grad_norm": 6.96875, "learning_rate": 6.6902231556875605e-06, "loss": 0.79382019, "memory(GiB)": 135.77, "step": 35020, "train_speed(iter/s)": 0.200712 }, { "acc": 0.77730827, "epoch": 0.8172764257279925, "grad_norm": 4.59375, "learning_rate": 6.688445135303779e-06, "loss": 0.78697395, "memory(GiB)": 135.77, "step": 35030, "train_speed(iter/s)": 0.200742 }, { "acc": 0.80807705, "epoch": 0.8175097333002814, "grad_norm": 5.84375, "learning_rate": 6.686666873894945e-06, "loss": 0.6698247, "memory(GiB)": 135.77, "step": 35040, "train_speed(iter/s)": 0.200771 }, { "acc": 0.78960042, "epoch": 0.8177430408725703, "grad_norm": 5.3125, "learning_rate": 6.684888371714903e-06, "loss": 0.74742336, "memory(GiB)": 135.77, "step": 35050, "train_speed(iter/s)": 0.200798 }, { "acc": 0.77717094, "epoch": 0.8179763484448592, "grad_norm": 5.0625, "learning_rate": 6.683109629017536e-06, "loss": 0.79351301, "memory(GiB)": 135.77, "step": 35060, "train_speed(iter/s)": 0.200825 }, { "acc": 0.77711973, "epoch": 0.8182096560171481, "grad_norm": 10.8125, "learning_rate": 6.681330646056758e-06, "loss": 0.80530987, "memory(GiB)": 135.77, "step": 35070, "train_speed(iter/s)": 0.200854 }, { "acc": 0.78023176, "epoch": 0.818442963589437, "grad_norm": 5.53125, "learning_rate": 6.679551423086521e-06, "loss": 0.80717087, "memory(GiB)": 135.77, "step": 35080, "train_speed(iter/s)": 0.200883 }, { "acc": 0.76191292, "epoch": 0.8186762711617259, "grad_norm": 4.9375, "learning_rate": 6.677771960360806e-06, "loss": 0.8590991, "memory(GiB)": 135.77, "step": 35090, "train_speed(iter/s)": 0.200912 }, { "acc": 0.76519403, "epoch": 0.8189095787340148, "grad_norm": 4.875, "learning_rate": 6.6759922581336285e-06, "loss": 0.83596706, "memory(GiB)": 135.77, "step": 35100, "train_speed(iter/s)": 0.200943 }, { "acc": 0.77768407, "epoch": 0.8191428863063037, "grad_norm": 7.1875, "learning_rate": 6.674212316659045e-06, "loss": 0.79807773, "memory(GiB)": 135.77, "step": 35110, "train_speed(iter/s)": 0.200972 }, { "acc": 0.76411262, "epoch": 0.8193761938785926, "grad_norm": 5.09375, "learning_rate": 6.6724321361911384e-06, "loss": 0.84451618, "memory(GiB)": 135.77, "step": 35120, "train_speed(iter/s)": 0.201002 }, { "acc": 0.76351461, "epoch": 0.8196095014508815, "grad_norm": 4.875, "learning_rate": 6.6706517169840305e-06, "loss": 0.85793591, "memory(GiB)": 135.77, "step": 35130, "train_speed(iter/s)": 0.201032 }, { "acc": 0.77693996, "epoch": 0.8198428090231703, "grad_norm": 8.75, "learning_rate": 6.668871059291875e-06, "loss": 0.78128543, "memory(GiB)": 135.77, "step": 35140, "train_speed(iter/s)": 0.201062 }, { "acc": 0.78849602, "epoch": 0.8200761165954592, "grad_norm": 4.5, "learning_rate": 6.667090163368863e-06, "loss": 0.7485117, "memory(GiB)": 135.77, "step": 35150, "train_speed(iter/s)": 0.201092 }, { "acc": 0.77250385, "epoch": 0.8203094241677481, "grad_norm": 4.5625, "learning_rate": 6.665309029469214e-06, "loss": 0.81979828, "memory(GiB)": 135.77, "step": 35160, "train_speed(iter/s)": 0.201123 }, { "acc": 0.77600441, "epoch": 0.820542731740037, "grad_norm": 4.71875, "learning_rate": 6.663527657847182e-06, "loss": 0.81544447, "memory(GiB)": 135.77, "step": 35170, "train_speed(iter/s)": 0.201153 }, { "acc": 0.789469, "epoch": 0.8207760393123259, "grad_norm": 4.40625, "learning_rate": 6.661746048757061e-06, "loss": 0.75934238, "memory(GiB)": 135.77, "step": 35180, "train_speed(iter/s)": 0.201184 }, { "acc": 0.75972357, "epoch": 0.8210093468846148, "grad_norm": 4.21875, "learning_rate": 6.6599642024531755e-06, "loss": 0.87705612, "memory(GiB)": 135.77, "step": 35190, "train_speed(iter/s)": 0.201214 }, { "acc": 0.80127926, "epoch": 0.8212426544569037, "grad_norm": 5.125, "learning_rate": 6.658182119189882e-06, "loss": 0.71490326, "memory(GiB)": 135.77, "step": 35200, "train_speed(iter/s)": 0.201242 }, { "acc": 0.77492847, "epoch": 0.8214759620291926, "grad_norm": 5.125, "learning_rate": 6.656399799221572e-06, "loss": 0.79777231, "memory(GiB)": 135.77, "step": 35210, "train_speed(iter/s)": 0.201272 }, { "acc": 0.78476725, "epoch": 0.8217092696014815, "grad_norm": 6.0625, "learning_rate": 6.654617242802672e-06, "loss": 0.78095455, "memory(GiB)": 135.77, "step": 35220, "train_speed(iter/s)": 0.201303 }, { "acc": 0.78755674, "epoch": 0.8219425771737704, "grad_norm": 5.5625, "learning_rate": 6.652834450187643e-06, "loss": 0.77176509, "memory(GiB)": 135.77, "step": 35230, "train_speed(iter/s)": 0.201333 }, { "acc": 0.78302078, "epoch": 0.8221758847460593, "grad_norm": 4.84375, "learning_rate": 6.651051421630974e-06, "loss": 0.77143784, "memory(GiB)": 135.77, "step": 35240, "train_speed(iter/s)": 0.201363 }, { "acc": 0.80360794, "epoch": 0.8224091923183482, "grad_norm": 5.5, "learning_rate": 6.649268157387195e-06, "loss": 0.69977317, "memory(GiB)": 135.77, "step": 35250, "train_speed(iter/s)": 0.201395 }, { "acc": 0.7762537, "epoch": 0.8226424998906371, "grad_norm": 5.65625, "learning_rate": 6.647484657710867e-06, "loss": 0.80590124, "memory(GiB)": 135.77, "step": 35260, "train_speed(iter/s)": 0.201424 }, { "acc": 0.79981756, "epoch": 0.822875807462926, "grad_norm": 4.875, "learning_rate": 6.645700922856582e-06, "loss": 0.71729341, "memory(GiB)": 135.77, "step": 35270, "train_speed(iter/s)": 0.201452 }, { "acc": 0.77900333, "epoch": 0.8231091150352149, "grad_norm": 4.25, "learning_rate": 6.643916953078966e-06, "loss": 0.79960418, "memory(GiB)": 135.77, "step": 35280, "train_speed(iter/s)": 0.201482 }, { "acc": 0.783249, "epoch": 0.8233424226075038, "grad_norm": 4.875, "learning_rate": 6.642132748632685e-06, "loss": 0.79931679, "memory(GiB)": 135.77, "step": 35290, "train_speed(iter/s)": 0.201512 }, { "acc": 0.77131052, "epoch": 0.8235757301797927, "grad_norm": 7.09375, "learning_rate": 6.640348309772431e-06, "loss": 0.82600031, "memory(GiB)": 135.77, "step": 35300, "train_speed(iter/s)": 0.201543 }, { "acc": 0.77425413, "epoch": 0.8238090377520816, "grad_norm": 5.8125, "learning_rate": 6.638563636752932e-06, "loss": 0.81250858, "memory(GiB)": 135.77, "step": 35310, "train_speed(iter/s)": 0.201574 }, { "acc": 0.7789402, "epoch": 0.8240423453243705, "grad_norm": 6.59375, "learning_rate": 6.63677872982895e-06, "loss": 0.77923145, "memory(GiB)": 135.77, "step": 35320, "train_speed(iter/s)": 0.201602 }, { "acc": 0.79925971, "epoch": 0.8242756528966593, "grad_norm": 5.125, "learning_rate": 6.634993589255278e-06, "loss": 0.71169586, "memory(GiB)": 135.77, "step": 35330, "train_speed(iter/s)": 0.201631 }, { "acc": 0.79988279, "epoch": 0.8245089604689482, "grad_norm": 6.03125, "learning_rate": 6.633208215286748e-06, "loss": 0.70508504, "memory(GiB)": 135.77, "step": 35340, "train_speed(iter/s)": 0.201658 }, { "acc": 0.77253027, "epoch": 0.8247422680412371, "grad_norm": 5.15625, "learning_rate": 6.6314226081782195e-06, "loss": 0.80938549, "memory(GiB)": 135.77, "step": 35350, "train_speed(iter/s)": 0.201685 }, { "acc": 0.78187051, "epoch": 0.824975575613526, "grad_norm": 5.3125, "learning_rate": 6.6296367681845875e-06, "loss": 0.76466885, "memory(GiB)": 135.77, "step": 35360, "train_speed(iter/s)": 0.201716 }, { "acc": 0.76568704, "epoch": 0.8252088831858149, "grad_norm": 5.5, "learning_rate": 6.62785069556078e-06, "loss": 0.86667337, "memory(GiB)": 135.77, "step": 35370, "train_speed(iter/s)": 0.201747 }, { "acc": 0.78236561, "epoch": 0.8254421907581038, "grad_norm": 5.21875, "learning_rate": 6.6260643905617605e-06, "loss": 0.76878605, "memory(GiB)": 135.77, "step": 35380, "train_speed(iter/s)": 0.201776 }, { "acc": 0.80533562, "epoch": 0.8256754983303927, "grad_norm": 7.34375, "learning_rate": 6.624277853442519e-06, "loss": 0.69753013, "memory(GiB)": 135.77, "step": 35390, "train_speed(iter/s)": 0.201806 }, { "acc": 0.78438272, "epoch": 0.8259088059026816, "grad_norm": 7.625, "learning_rate": 6.622491084458087e-06, "loss": 0.75821857, "memory(GiB)": 135.77, "step": 35400, "train_speed(iter/s)": 0.201837 }, { "acc": 0.77371559, "epoch": 0.8261421134749705, "grad_norm": 5.3125, "learning_rate": 6.620704083863523e-06, "loss": 0.82229176, "memory(GiB)": 135.77, "step": 35410, "train_speed(iter/s)": 0.201866 }, { "acc": 0.77689691, "epoch": 0.8263754210472594, "grad_norm": 5.15625, "learning_rate": 6.618916851913923e-06, "loss": 0.79822955, "memory(GiB)": 135.77, "step": 35420, "train_speed(iter/s)": 0.201896 }, { "acc": 0.76417141, "epoch": 0.8266087286195483, "grad_norm": 10.0, "learning_rate": 6.617129388864412e-06, "loss": 0.86350403, "memory(GiB)": 135.77, "step": 35430, "train_speed(iter/s)": 0.201926 }, { "acc": 0.7825284, "epoch": 0.8268420361918372, "grad_norm": 11.75, "learning_rate": 6.615341694970151e-06, "loss": 0.76954165, "memory(GiB)": 135.77, "step": 35440, "train_speed(iter/s)": 0.201953 }, { "acc": 0.78282237, "epoch": 0.8270753437641261, "grad_norm": 7.4375, "learning_rate": 6.613553770486333e-06, "loss": 0.77155132, "memory(GiB)": 135.77, "step": 35450, "train_speed(iter/s)": 0.201981 }, { "acc": 0.76642599, "epoch": 0.827308651336415, "grad_norm": 4.28125, "learning_rate": 6.611765615668182e-06, "loss": 0.8688242, "memory(GiB)": 135.77, "step": 35460, "train_speed(iter/s)": 0.202011 }, { "acc": 0.79817481, "epoch": 0.8275419589087039, "grad_norm": 4.9375, "learning_rate": 6.609977230770957e-06, "loss": 0.71652417, "memory(GiB)": 135.77, "step": 35470, "train_speed(iter/s)": 0.20204 }, { "acc": 0.76829128, "epoch": 0.8277752664809928, "grad_norm": 5.5625, "learning_rate": 6.608188616049951e-06, "loss": 0.83743439, "memory(GiB)": 135.77, "step": 35480, "train_speed(iter/s)": 0.20207 }, { "acc": 0.78781743, "epoch": 0.8280085740532817, "grad_norm": 7.53125, "learning_rate": 6.606399771760487e-06, "loss": 0.76809006, "memory(GiB)": 135.77, "step": 35490, "train_speed(iter/s)": 0.2021 }, { "acc": 0.78514261, "epoch": 0.8282418816255706, "grad_norm": 6.03125, "learning_rate": 6.6046106981579216e-06, "loss": 0.76895576, "memory(GiB)": 135.77, "step": 35500, "train_speed(iter/s)": 0.202131 }, { "epoch": 0.8282418816255706, "eval_acc": 0.7427417429233465, "eval_loss": 0.8110933899879456, "eval_runtime": 1270.2164, "eval_samples_per_second": 28.335, "eval_steps_per_second": 14.168, "step": 35500 }, { "acc": 0.7827517, "epoch": 0.8284751891978595, "grad_norm": 5.8125, "learning_rate": 6.6028213954976474e-06, "loss": 0.79220624, "memory(GiB)": 135.77, "step": 35510, "train_speed(iter/s)": 0.200689 }, { "acc": 0.77409925, "epoch": 0.8287084967701484, "grad_norm": 6.15625, "learning_rate": 6.601031864035082e-06, "loss": 0.80091858, "memory(GiB)": 135.77, "step": 35520, "train_speed(iter/s)": 0.200717 }, { "acc": 0.78355837, "epoch": 0.8289418043424371, "grad_norm": 5.25, "learning_rate": 6.5992421040256834e-06, "loss": 0.75921507, "memory(GiB)": 135.77, "step": 35530, "train_speed(iter/s)": 0.200747 }, { "acc": 0.76949062, "epoch": 0.829175111914726, "grad_norm": 5.375, "learning_rate": 6.597452115724939e-06, "loss": 0.81914463, "memory(GiB)": 135.77, "step": 35540, "train_speed(iter/s)": 0.200775 }, { "acc": 0.77408943, "epoch": 0.8294084194870149, "grad_norm": 5.59375, "learning_rate": 6.5956618993883716e-06, "loss": 0.80963411, "memory(GiB)": 135.77, "step": 35550, "train_speed(iter/s)": 0.200806 }, { "acc": 0.80136003, "epoch": 0.8296417270593038, "grad_norm": 4.53125, "learning_rate": 6.59387145527153e-06, "loss": 0.71679068, "memory(GiB)": 135.77, "step": 35560, "train_speed(iter/s)": 0.200835 }, { "acc": 0.76815443, "epoch": 0.8298750346315927, "grad_norm": 5.4375, "learning_rate": 6.59208078363e-06, "loss": 0.8367054, "memory(GiB)": 135.77, "step": 35570, "train_speed(iter/s)": 0.200866 }, { "acc": 0.7694231, "epoch": 0.8301083422038816, "grad_norm": 5.34375, "learning_rate": 6.590289884719403e-06, "loss": 0.85313902, "memory(GiB)": 135.77, "step": 35580, "train_speed(iter/s)": 0.200898 }, { "acc": 0.79432936, "epoch": 0.8303416497761705, "grad_norm": 5.09375, "learning_rate": 6.588498758795386e-06, "loss": 0.75772634, "memory(GiB)": 135.77, "step": 35590, "train_speed(iter/s)": 0.200927 }, { "acc": 0.77298503, "epoch": 0.8305749573484594, "grad_norm": 5.625, "learning_rate": 6.586707406113632e-06, "loss": 0.81344709, "memory(GiB)": 135.77, "step": 35600, "train_speed(iter/s)": 0.200956 }, { "acc": 0.78691454, "epoch": 0.8308082649207483, "grad_norm": 5.5, "learning_rate": 6.5849158269298565e-06, "loss": 0.76828012, "memory(GiB)": 135.77, "step": 35610, "train_speed(iter/s)": 0.200984 }, { "acc": 0.76740513, "epoch": 0.8310415724930372, "grad_norm": 6.34375, "learning_rate": 6.583124021499807e-06, "loss": 0.83374586, "memory(GiB)": 135.77, "step": 35620, "train_speed(iter/s)": 0.201013 }, { "acc": 0.76556411, "epoch": 0.8312748800653261, "grad_norm": 6.0625, "learning_rate": 6.581331990079264e-06, "loss": 0.84218559, "memory(GiB)": 135.77, "step": 35630, "train_speed(iter/s)": 0.201041 }, { "acc": 0.77049723, "epoch": 0.831508187637615, "grad_norm": 5.75, "learning_rate": 6.579539732924038e-06, "loss": 0.83664169, "memory(GiB)": 135.77, "step": 35640, "train_speed(iter/s)": 0.201069 }, { "acc": 0.76034393, "epoch": 0.8317414952099039, "grad_norm": 7.5625, "learning_rate": 6.5777472502899765e-06, "loss": 0.87293215, "memory(GiB)": 135.77, "step": 35650, "train_speed(iter/s)": 0.201097 }, { "acc": 0.78872604, "epoch": 0.8319748027821928, "grad_norm": 5.03125, "learning_rate": 6.5759545424329514e-06, "loss": 0.7763586, "memory(GiB)": 135.77, "step": 35660, "train_speed(iter/s)": 0.201127 }, { "acc": 0.76089096, "epoch": 0.8322081103544817, "grad_norm": 5.90625, "learning_rate": 6.574161609608873e-06, "loss": 0.84671097, "memory(GiB)": 135.77, "step": 35670, "train_speed(iter/s)": 0.201157 }, { "acc": 0.78287115, "epoch": 0.8324414179267706, "grad_norm": 7.78125, "learning_rate": 6.572368452073683e-06, "loss": 0.79170933, "memory(GiB)": 135.77, "step": 35680, "train_speed(iter/s)": 0.201186 }, { "acc": 0.79201784, "epoch": 0.8326747254990595, "grad_norm": 5.46875, "learning_rate": 6.570575070083351e-06, "loss": 0.72850819, "memory(GiB)": 135.77, "step": 35690, "train_speed(iter/s)": 0.201217 }, { "acc": 0.76989765, "epoch": 0.8329080330713484, "grad_norm": 4.3125, "learning_rate": 6.5687814638938865e-06, "loss": 0.80628548, "memory(GiB)": 135.77, "step": 35700, "train_speed(iter/s)": 0.201247 }, { "acc": 0.77745337, "epoch": 0.8331413406436373, "grad_norm": 4.78125, "learning_rate": 6.566987633761323e-06, "loss": 0.80288849, "memory(GiB)": 135.77, "step": 35710, "train_speed(iter/s)": 0.201276 }, { "acc": 0.77031932, "epoch": 0.8333746482159261, "grad_norm": 5.25, "learning_rate": 6.5651935799417295e-06, "loss": 0.84479132, "memory(GiB)": 135.77, "step": 35720, "train_speed(iter/s)": 0.201306 }, { "acc": 0.7754509, "epoch": 0.833607955788215, "grad_norm": 5.53125, "learning_rate": 6.563399302691209e-06, "loss": 0.78503389, "memory(GiB)": 135.77, "step": 35730, "train_speed(iter/s)": 0.201336 }, { "acc": 0.77183795, "epoch": 0.8338412633605039, "grad_norm": 5.65625, "learning_rate": 6.561604802265891e-06, "loss": 0.82641964, "memory(GiB)": 135.77, "step": 35740, "train_speed(iter/s)": 0.201364 }, { "acc": 0.78574123, "epoch": 0.8340745709327928, "grad_norm": 4.21875, "learning_rate": 6.55981007892194e-06, "loss": 0.76580262, "memory(GiB)": 135.77, "step": 35750, "train_speed(iter/s)": 0.201393 }, { "acc": 0.79073353, "epoch": 0.8343078785050817, "grad_norm": 7.90625, "learning_rate": 6.558015132915554e-06, "loss": 0.7411293, "memory(GiB)": 135.77, "step": 35760, "train_speed(iter/s)": 0.201422 }, { "acc": 0.77999744, "epoch": 0.8345411860773706, "grad_norm": 5.5, "learning_rate": 6.556219964502961e-06, "loss": 0.77981787, "memory(GiB)": 135.77, "step": 35770, "train_speed(iter/s)": 0.201452 }, { "acc": 0.77890453, "epoch": 0.8347744936496595, "grad_norm": 6.5, "learning_rate": 6.5544245739404196e-06, "loss": 0.76951075, "memory(GiB)": 135.77, "step": 35780, "train_speed(iter/s)": 0.201478 }, { "acc": 0.78939867, "epoch": 0.8350078012219484, "grad_norm": 5.4375, "learning_rate": 6.552628961484222e-06, "loss": 0.76422501, "memory(GiB)": 135.77, "step": 35790, "train_speed(iter/s)": 0.201507 }, { "acc": 0.78203793, "epoch": 0.8352411087942373, "grad_norm": 8.1875, "learning_rate": 6.550833127390692e-06, "loss": 0.7660037, "memory(GiB)": 135.77, "step": 35800, "train_speed(iter/s)": 0.201535 }, { "acc": 0.7650146, "epoch": 0.8354744163665262, "grad_norm": 4.96875, "learning_rate": 6.549037071916184e-06, "loss": 0.87431145, "memory(GiB)": 135.77, "step": 35810, "train_speed(iter/s)": 0.201565 }, { "acc": 0.75421095, "epoch": 0.8357077239388151, "grad_norm": 4.21875, "learning_rate": 6.547240795317081e-06, "loss": 0.86133585, "memory(GiB)": 135.77, "step": 35820, "train_speed(iter/s)": 0.201593 }, { "acc": 0.76499453, "epoch": 0.835941031511104, "grad_norm": 8.5625, "learning_rate": 6.545444297849808e-06, "loss": 0.85870771, "memory(GiB)": 135.77, "step": 35830, "train_speed(iter/s)": 0.201622 }, { "acc": 0.77958255, "epoch": 0.8361743390833929, "grad_norm": 6.0625, "learning_rate": 6.543647579770806e-06, "loss": 0.78675489, "memory(GiB)": 135.77, "step": 35840, "train_speed(iter/s)": 0.20165 }, { "acc": 0.77734413, "epoch": 0.8364076466556818, "grad_norm": 5.65625, "learning_rate": 6.5418506413365634e-06, "loss": 0.78364563, "memory(GiB)": 135.77, "step": 35850, "train_speed(iter/s)": 0.201679 }, { "acc": 0.78868957, "epoch": 0.8366409542279707, "grad_norm": 5.59375, "learning_rate": 6.5400534828035885e-06, "loss": 0.74761543, "memory(GiB)": 135.77, "step": 35860, "train_speed(iter/s)": 0.201708 }, { "acc": 0.76377254, "epoch": 0.8368742618002596, "grad_norm": 5.90625, "learning_rate": 6.538256104428427e-06, "loss": 0.85419159, "memory(GiB)": 135.77, "step": 35870, "train_speed(iter/s)": 0.201738 }, { "acc": 0.77922173, "epoch": 0.8371075693725485, "grad_norm": 6.5, "learning_rate": 6.536458506467654e-06, "loss": 0.78688426, "memory(GiB)": 135.77, "step": 35880, "train_speed(iter/s)": 0.201765 }, { "acc": 0.76489186, "epoch": 0.8373408769448374, "grad_norm": 4.875, "learning_rate": 6.5346606891778755e-06, "loss": 0.84113293, "memory(GiB)": 135.77, "step": 35890, "train_speed(iter/s)": 0.201795 }, { "acc": 0.77273259, "epoch": 0.8375741845171263, "grad_norm": 7.375, "learning_rate": 6.532862652815728e-06, "loss": 0.8084506, "memory(GiB)": 135.77, "step": 35900, "train_speed(iter/s)": 0.201825 }, { "acc": 0.74848862, "epoch": 0.8378074920894151, "grad_norm": 7.0625, "learning_rate": 6.531064397637883e-06, "loss": 0.90964317, "memory(GiB)": 135.77, "step": 35910, "train_speed(iter/s)": 0.201853 }, { "acc": 0.78161101, "epoch": 0.838040799661704, "grad_norm": 5.8125, "learning_rate": 6.529265923901039e-06, "loss": 0.78454123, "memory(GiB)": 135.77, "step": 35920, "train_speed(iter/s)": 0.201882 }, { "acc": 0.78447218, "epoch": 0.8382741072339929, "grad_norm": 5.25, "learning_rate": 6.527467231861929e-06, "loss": 0.76308413, "memory(GiB)": 135.77, "step": 35930, "train_speed(iter/s)": 0.201912 }, { "acc": 0.78148451, "epoch": 0.8385074148062818, "grad_norm": 97.5, "learning_rate": 6.525668321777317e-06, "loss": 0.78811293, "memory(GiB)": 135.77, "step": 35940, "train_speed(iter/s)": 0.201942 }, { "acc": 0.76809978, "epoch": 0.8387407223785707, "grad_norm": 5.09375, "learning_rate": 6.523869193903994e-06, "loss": 0.84421577, "memory(GiB)": 135.77, "step": 35950, "train_speed(iter/s)": 0.201973 }, { "acc": 0.76690893, "epoch": 0.8389740299508596, "grad_norm": 4.28125, "learning_rate": 6.522069848498787e-06, "loss": 0.8514679, "memory(GiB)": 135.77, "step": 35960, "train_speed(iter/s)": 0.202001 }, { "acc": 0.78587494, "epoch": 0.8392073375231485, "grad_norm": 7.875, "learning_rate": 6.5202702858185495e-06, "loss": 0.75838385, "memory(GiB)": 135.77, "step": 35970, "train_speed(iter/s)": 0.20203 }, { "acc": 0.77815952, "epoch": 0.8394406450954374, "grad_norm": 5.15625, "learning_rate": 6.518470506120171e-06, "loss": 0.7992382, "memory(GiB)": 135.77, "step": 35980, "train_speed(iter/s)": 0.202058 }, { "acc": 0.77503004, "epoch": 0.8396739526677263, "grad_norm": 6.75, "learning_rate": 6.51667050966057e-06, "loss": 0.79922199, "memory(GiB)": 135.77, "step": 35990, "train_speed(iter/s)": 0.202088 }, { "acc": 0.76703901, "epoch": 0.8399072602400152, "grad_norm": 5.875, "learning_rate": 6.514870296696694e-06, "loss": 0.84329872, "memory(GiB)": 135.77, "step": 36000, "train_speed(iter/s)": 0.202117 }, { "epoch": 0.8399072602400152, "eval_acc": 0.7429556956177011, "eval_loss": 0.8103926777839661, "eval_runtime": 1268.5973, "eval_samples_per_second": 28.371, "eval_steps_per_second": 14.186, "step": 36000 }, { "acc": 0.76743212, "epoch": 0.8401405678123041, "grad_norm": 5.5, "learning_rate": 6.513069867485523e-06, "loss": 0.8609807, "memory(GiB)": 135.77, "step": 36010, "train_speed(iter/s)": 0.2007 }, { "acc": 0.75457373, "epoch": 0.840373875384593, "grad_norm": 6.21875, "learning_rate": 6.511269222284069e-06, "loss": 0.86449337, "memory(GiB)": 135.77, "step": 36020, "train_speed(iter/s)": 0.200728 }, { "acc": 0.76079054, "epoch": 0.8406071829568819, "grad_norm": 4.71875, "learning_rate": 6.509468361349371e-06, "loss": 0.87414017, "memory(GiB)": 135.77, "step": 36030, "train_speed(iter/s)": 0.200758 }, { "acc": 0.79017248, "epoch": 0.8408404905291708, "grad_norm": 4.625, "learning_rate": 6.507667284938502e-06, "loss": 0.75717969, "memory(GiB)": 135.77, "step": 36040, "train_speed(iter/s)": 0.200785 }, { "acc": 0.77153301, "epoch": 0.8410737981014597, "grad_norm": 5.5, "learning_rate": 6.505865993308568e-06, "loss": 0.84561863, "memory(GiB)": 135.77, "step": 36050, "train_speed(iter/s)": 0.200815 }, { "acc": 0.7702539, "epoch": 0.8413071056737486, "grad_norm": 10.75, "learning_rate": 6.5040644867167e-06, "loss": 0.83923016, "memory(GiB)": 135.77, "step": 36060, "train_speed(iter/s)": 0.200843 }, { "acc": 0.78410187, "epoch": 0.8415404132460375, "grad_norm": 7.4375, "learning_rate": 6.502262765420064e-06, "loss": 0.78843861, "memory(GiB)": 135.77, "step": 36070, "train_speed(iter/s)": 0.200871 }, { "acc": 0.76989207, "epoch": 0.8417737208183264, "grad_norm": 5.03125, "learning_rate": 6.500460829675854e-06, "loss": 0.81716394, "memory(GiB)": 135.77, "step": 36080, "train_speed(iter/s)": 0.2009 }, { "acc": 0.75923405, "epoch": 0.8420070283906153, "grad_norm": 6.4375, "learning_rate": 6.498658679741298e-06, "loss": 0.86520176, "memory(GiB)": 135.77, "step": 36090, "train_speed(iter/s)": 0.200925 }, { "acc": 0.79458346, "epoch": 0.842240335962904, "grad_norm": 6.75, "learning_rate": 6.49685631587365e-06, "loss": 0.74665785, "memory(GiB)": 135.77, "step": 36100, "train_speed(iter/s)": 0.200953 }, { "acc": 0.7725565, "epoch": 0.8424736435351929, "grad_norm": 6.46875, "learning_rate": 6.495053738330196e-06, "loss": 0.80447502, "memory(GiB)": 135.77, "step": 36110, "train_speed(iter/s)": 0.200982 }, { "acc": 0.78278484, "epoch": 0.8427069511074818, "grad_norm": 5.5625, "learning_rate": 6.493250947368257e-06, "loss": 0.77641745, "memory(GiB)": 135.77, "step": 36120, "train_speed(iter/s)": 0.20101 }, { "acc": 0.79837875, "epoch": 0.8429402586797707, "grad_norm": 5.0625, "learning_rate": 6.491447943245179e-06, "loss": 0.71321259, "memory(GiB)": 135.77, "step": 36130, "train_speed(iter/s)": 0.20104 }, { "acc": 0.79435396, "epoch": 0.8431735662520596, "grad_norm": 5.125, "learning_rate": 6.489644726218339e-06, "loss": 0.7343874, "memory(GiB)": 135.77, "step": 36140, "train_speed(iter/s)": 0.201069 }, { "acc": 0.78385534, "epoch": 0.8434068738243485, "grad_norm": 6.59375, "learning_rate": 6.4878412965451485e-06, "loss": 0.79530344, "memory(GiB)": 135.77, "step": 36150, "train_speed(iter/s)": 0.201099 }, { "acc": 0.78743668, "epoch": 0.8436401813966374, "grad_norm": 7.125, "learning_rate": 6.486037654483046e-06, "loss": 0.74157157, "memory(GiB)": 135.77, "step": 36160, "train_speed(iter/s)": 0.201127 }, { "acc": 0.80141068, "epoch": 0.8438734889689263, "grad_norm": 4.8125, "learning_rate": 6.484233800289499e-06, "loss": 0.70053635, "memory(GiB)": 135.77, "step": 36170, "train_speed(iter/s)": 0.201156 }, { "acc": 0.76834717, "epoch": 0.8441067965412152, "grad_norm": 6.09375, "learning_rate": 6.482429734222008e-06, "loss": 0.85586014, "memory(GiB)": 135.77, "step": 36180, "train_speed(iter/s)": 0.201183 }, { "acc": 0.77482719, "epoch": 0.8443401041135041, "grad_norm": 6.0625, "learning_rate": 6.4806254565381025e-06, "loss": 0.81930189, "memory(GiB)": 135.77, "step": 36190, "train_speed(iter/s)": 0.20121 }, { "acc": 0.81716328, "epoch": 0.844573411685793, "grad_norm": 5.375, "learning_rate": 6.478820967495343e-06, "loss": 0.6350563, "memory(GiB)": 135.77, "step": 36200, "train_speed(iter/s)": 0.201239 }, { "acc": 0.77658648, "epoch": 0.8448067192580819, "grad_norm": 4.34375, "learning_rate": 6.47701626735132e-06, "loss": 0.78943858, "memory(GiB)": 135.77, "step": 36210, "train_speed(iter/s)": 0.201265 }, { "acc": 0.78699703, "epoch": 0.8450400268303708, "grad_norm": 5.96875, "learning_rate": 6.475211356363655e-06, "loss": 0.75699453, "memory(GiB)": 135.77, "step": 36220, "train_speed(iter/s)": 0.201292 }, { "acc": 0.80862513, "epoch": 0.8452733344026597, "grad_norm": 4.625, "learning_rate": 6.473406234789998e-06, "loss": 0.67446146, "memory(GiB)": 135.77, "step": 36230, "train_speed(iter/s)": 0.20132 }, { "acc": 0.77075014, "epoch": 0.8455066419749486, "grad_norm": 6.25, "learning_rate": 6.471600902888029e-06, "loss": 0.81247129, "memory(GiB)": 135.77, "step": 36240, "train_speed(iter/s)": 0.20135 }, { "acc": 0.77357235, "epoch": 0.8457399495472375, "grad_norm": 6.21875, "learning_rate": 6.4697953609154575e-06, "loss": 0.81807146, "memory(GiB)": 135.77, "step": 36250, "train_speed(iter/s)": 0.201378 }, { "acc": 0.78533292, "epoch": 0.8459732571195264, "grad_norm": 5.34375, "learning_rate": 6.467989609130024e-06, "loss": 0.78466349, "memory(GiB)": 135.77, "step": 36260, "train_speed(iter/s)": 0.201406 }, { "acc": 0.77039223, "epoch": 0.8462065646918153, "grad_norm": 5.34375, "learning_rate": 6.466183647789502e-06, "loss": 0.83654995, "memory(GiB)": 135.77, "step": 36270, "train_speed(iter/s)": 0.201434 }, { "acc": 0.7773541, "epoch": 0.8464398722641042, "grad_norm": 4.40625, "learning_rate": 6.46437747715169e-06, "loss": 0.79755011, "memory(GiB)": 135.77, "step": 36280, "train_speed(iter/s)": 0.201462 }, { "acc": 0.77812119, "epoch": 0.8466731798363931, "grad_norm": 5.46875, "learning_rate": 6.462571097474419e-06, "loss": 0.78683367, "memory(GiB)": 135.77, "step": 36290, "train_speed(iter/s)": 0.201491 }, { "acc": 0.78762054, "epoch": 0.8469064874086819, "grad_norm": 4.28125, "learning_rate": 6.460764509015547e-06, "loss": 0.76013975, "memory(GiB)": 135.77, "step": 36300, "train_speed(iter/s)": 0.201516 }, { "acc": 0.78863735, "epoch": 0.8471397949809708, "grad_norm": 6.125, "learning_rate": 6.4589577120329685e-06, "loss": 0.7554966, "memory(GiB)": 135.77, "step": 36310, "train_speed(iter/s)": 0.201544 }, { "acc": 0.78832941, "epoch": 0.8473731025532597, "grad_norm": 6.375, "learning_rate": 6.4571507067845985e-06, "loss": 0.78529215, "memory(GiB)": 135.77, "step": 36320, "train_speed(iter/s)": 0.201569 }, { "acc": 0.78547802, "epoch": 0.8476064101255486, "grad_norm": 4.8125, "learning_rate": 6.455343493528388e-06, "loss": 0.77054935, "memory(GiB)": 135.77, "step": 36330, "train_speed(iter/s)": 0.201598 }, { "acc": 0.7642107, "epoch": 0.8478397176978375, "grad_norm": 5.96875, "learning_rate": 6.4535360725223175e-06, "loss": 0.85638161, "memory(GiB)": 135.77, "step": 36340, "train_speed(iter/s)": 0.201628 }, { "acc": 0.77951527, "epoch": 0.8480730252701264, "grad_norm": 4.15625, "learning_rate": 6.451728444024394e-06, "loss": 0.77187543, "memory(GiB)": 135.77, "step": 36350, "train_speed(iter/s)": 0.201656 }, { "acc": 0.78816013, "epoch": 0.8483063328424153, "grad_norm": 4.84375, "learning_rate": 6.449920608292658e-06, "loss": 0.75378218, "memory(GiB)": 135.77, "step": 36360, "train_speed(iter/s)": 0.201683 }, { "acc": 0.7518692, "epoch": 0.8485396404147042, "grad_norm": 5.65625, "learning_rate": 6.448112565585176e-06, "loss": 0.89817324, "memory(GiB)": 135.77, "step": 36370, "train_speed(iter/s)": 0.201712 }, { "acc": 0.77250676, "epoch": 0.8487729479869931, "grad_norm": 4.34375, "learning_rate": 6.446304316160046e-06, "loss": 0.80824423, "memory(GiB)": 135.77, "step": 36380, "train_speed(iter/s)": 0.20174 }, { "acc": 0.78365912, "epoch": 0.849006255559282, "grad_norm": 4.40625, "learning_rate": 6.444495860275395e-06, "loss": 0.76890135, "memory(GiB)": 135.77, "step": 36390, "train_speed(iter/s)": 0.201768 }, { "acc": 0.75655127, "epoch": 0.8492395631315709, "grad_norm": 5.625, "learning_rate": 6.442687198189379e-06, "loss": 0.86654816, "memory(GiB)": 135.77, "step": 36400, "train_speed(iter/s)": 0.201797 }, { "acc": 0.77825761, "epoch": 0.8494728707038598, "grad_norm": 5.3125, "learning_rate": 6.440878330160185e-06, "loss": 0.79696665, "memory(GiB)": 135.77, "step": 36410, "train_speed(iter/s)": 0.201829 }, { "acc": 0.78085461, "epoch": 0.8497061782761487, "grad_norm": 4.46875, "learning_rate": 6.439069256446027e-06, "loss": 0.78490944, "memory(GiB)": 135.77, "step": 36420, "train_speed(iter/s)": 0.201857 }, { "acc": 0.78769636, "epoch": 0.8499394858484376, "grad_norm": 7.34375, "learning_rate": 6.437259977305152e-06, "loss": 0.77664795, "memory(GiB)": 135.77, "step": 36430, "train_speed(iter/s)": 0.201887 }, { "acc": 0.77704792, "epoch": 0.8501727934207265, "grad_norm": 4.875, "learning_rate": 6.435450492995833e-06, "loss": 0.80361004, "memory(GiB)": 135.77, "step": 36440, "train_speed(iter/s)": 0.201916 }, { "acc": 0.79341078, "epoch": 0.8504061009930154, "grad_norm": 5.21875, "learning_rate": 6.433640803776372e-06, "loss": 0.7458343, "memory(GiB)": 135.77, "step": 36450, "train_speed(iter/s)": 0.201944 }, { "acc": 0.78444099, "epoch": 0.8506394085653043, "grad_norm": 6.5, "learning_rate": 6.431830909905105e-06, "loss": 0.77440691, "memory(GiB)": 135.77, "step": 36460, "train_speed(iter/s)": 0.20197 }, { "acc": 0.78535118, "epoch": 0.8508727161375932, "grad_norm": 4.28125, "learning_rate": 6.43002081164039e-06, "loss": 0.76964531, "memory(GiB)": 135.77, "step": 36470, "train_speed(iter/s)": 0.201997 }, { "acc": 0.7824791, "epoch": 0.8511060237098821, "grad_norm": 6.15625, "learning_rate": 6.428210509240618e-06, "loss": 0.80835476, "memory(GiB)": 135.77, "step": 36480, "train_speed(iter/s)": 0.202024 }, { "acc": 0.79336419, "epoch": 0.8513393312821709, "grad_norm": 4.90625, "learning_rate": 6.426400002964211e-06, "loss": 0.73920593, "memory(GiB)": 135.77, "step": 36490, "train_speed(iter/s)": 0.202054 }, { "acc": 0.77750196, "epoch": 0.8515726388544598, "grad_norm": 4.84375, "learning_rate": 6.42458929306962e-06, "loss": 0.81381168, "memory(GiB)": 135.77, "step": 36500, "train_speed(iter/s)": 0.202082 }, { "epoch": 0.8515726388544598, "eval_acc": 0.7429995751373517, "eval_loss": 0.8101937174797058, "eval_runtime": 1268.6867, "eval_samples_per_second": 28.369, "eval_steps_per_second": 14.185, "step": 36500 }, { "acc": 0.78311806, "epoch": 0.8518059464267487, "grad_norm": 5.34375, "learning_rate": 6.42277837981532e-06, "loss": 0.75560861, "memory(GiB)": 135.77, "step": 36510, "train_speed(iter/s)": 0.200682 }, { "acc": 0.76936331, "epoch": 0.8520392539990376, "grad_norm": 7.1875, "learning_rate": 6.420967263459821e-06, "loss": 0.82802315, "memory(GiB)": 135.77, "step": 36520, "train_speed(iter/s)": 0.200712 }, { "acc": 0.76965437, "epoch": 0.8522725615713265, "grad_norm": 5.3125, "learning_rate": 6.419155944261657e-06, "loss": 0.82444363, "memory(GiB)": 135.77, "step": 36530, "train_speed(iter/s)": 0.200741 }, { "acc": 0.77062273, "epoch": 0.8525058691436154, "grad_norm": 13.5, "learning_rate": 6.4173444224793935e-06, "loss": 0.82105751, "memory(GiB)": 135.77, "step": 36540, "train_speed(iter/s)": 0.200772 }, { "acc": 0.77684073, "epoch": 0.8527391767159043, "grad_norm": 5.96875, "learning_rate": 6.415532698371625e-06, "loss": 0.80583134, "memory(GiB)": 135.77, "step": 36550, "train_speed(iter/s)": 0.200801 }, { "acc": 0.78810062, "epoch": 0.8529724842881932, "grad_norm": 4.25, "learning_rate": 6.413720772196976e-06, "loss": 0.76445894, "memory(GiB)": 135.77, "step": 36560, "train_speed(iter/s)": 0.200832 }, { "acc": 0.7537466, "epoch": 0.8532057918604821, "grad_norm": 6.34375, "learning_rate": 6.411908644214098e-06, "loss": 0.89372339, "memory(GiB)": 135.77, "step": 36570, "train_speed(iter/s)": 0.200861 }, { "acc": 0.77111721, "epoch": 0.853439099432771, "grad_norm": 5.53125, "learning_rate": 6.410096314681671e-06, "loss": 0.82660351, "memory(GiB)": 135.77, "step": 36580, "train_speed(iter/s)": 0.200892 }, { "acc": 0.76293507, "epoch": 0.8536724070050599, "grad_norm": 5.34375, "learning_rate": 6.408283783858405e-06, "loss": 0.87582846, "memory(GiB)": 135.77, "step": 36590, "train_speed(iter/s)": 0.20092 }, { "acc": 0.77199388, "epoch": 0.8539057145773488, "grad_norm": 5.84375, "learning_rate": 6.406471052003036e-06, "loss": 0.82917595, "memory(GiB)": 135.77, "step": 36600, "train_speed(iter/s)": 0.200949 }, { "acc": 0.78203735, "epoch": 0.8541390221496377, "grad_norm": 5.28125, "learning_rate": 6.4046581193743344e-06, "loss": 0.79196444, "memory(GiB)": 135.77, "step": 36610, "train_speed(iter/s)": 0.200978 }, { "acc": 0.7757906, "epoch": 0.8543723297219266, "grad_norm": 5.6875, "learning_rate": 6.402844986231094e-06, "loss": 0.81577177, "memory(GiB)": 135.77, "step": 36620, "train_speed(iter/s)": 0.201007 }, { "acc": 0.77216077, "epoch": 0.8546056372942155, "grad_norm": 5.1875, "learning_rate": 6.401031652832141e-06, "loss": 0.79967194, "memory(GiB)": 135.77, "step": 36630, "train_speed(iter/s)": 0.201038 }, { "acc": 0.78772125, "epoch": 0.8548389448665044, "grad_norm": 4.875, "learning_rate": 6.3992181194363234e-06, "loss": 0.75887017, "memory(GiB)": 135.77, "step": 36640, "train_speed(iter/s)": 0.201067 }, { "acc": 0.76235476, "epoch": 0.8550722524387933, "grad_norm": 6.28125, "learning_rate": 6.397404386302528e-06, "loss": 0.84511347, "memory(GiB)": 135.77, "step": 36650, "train_speed(iter/s)": 0.201097 }, { "acc": 0.7826313, "epoch": 0.8553055600110822, "grad_norm": 4.21875, "learning_rate": 6.395590453689662e-06, "loss": 0.8007947, "memory(GiB)": 135.77, "step": 36660, "train_speed(iter/s)": 0.201124 }, { "acc": 0.77450428, "epoch": 0.855538867583371, "grad_norm": 6.53125, "learning_rate": 6.393776321856664e-06, "loss": 0.80132704, "memory(GiB)": 135.77, "step": 36670, "train_speed(iter/s)": 0.201152 }, { "acc": 0.75675406, "epoch": 0.8557721751556598, "grad_norm": 6.75, "learning_rate": 6.391961991062501e-06, "loss": 0.86429386, "memory(GiB)": 135.77, "step": 36680, "train_speed(iter/s)": 0.201179 }, { "acc": 0.75716515, "epoch": 0.8560054827279487, "grad_norm": 11.5, "learning_rate": 6.390147461566167e-06, "loss": 0.86847095, "memory(GiB)": 135.77, "step": 36690, "train_speed(iter/s)": 0.201205 }, { "acc": 0.78775959, "epoch": 0.8562387903002376, "grad_norm": 4.875, "learning_rate": 6.388332733626689e-06, "loss": 0.75113401, "memory(GiB)": 147.13, "step": 36700, "train_speed(iter/s)": 0.201232 }, { "acc": 0.76984215, "epoch": 0.8564720978725265, "grad_norm": 6.0625, "learning_rate": 6.386517807503114e-06, "loss": 0.83430986, "memory(GiB)": 147.13, "step": 36710, "train_speed(iter/s)": 0.201262 }, { "acc": 0.77020178, "epoch": 0.8567054054448154, "grad_norm": 6.28125, "learning_rate": 6.384702683454527e-06, "loss": 0.82754822, "memory(GiB)": 147.13, "step": 36720, "train_speed(iter/s)": 0.20129 }, { "acc": 0.7816741, "epoch": 0.8569387130171043, "grad_norm": 6.125, "learning_rate": 6.382887361740033e-06, "loss": 0.78474622, "memory(GiB)": 147.13, "step": 36730, "train_speed(iter/s)": 0.201319 }, { "acc": 0.79666619, "epoch": 0.8571720205893932, "grad_norm": 5.0, "learning_rate": 6.38107184261877e-06, "loss": 0.72450123, "memory(GiB)": 147.13, "step": 36740, "train_speed(iter/s)": 0.201344 }, { "acc": 0.78614969, "epoch": 0.8574053281616821, "grad_norm": 9.125, "learning_rate": 6.379256126349903e-06, "loss": 0.76010709, "memory(GiB)": 147.13, "step": 36750, "train_speed(iter/s)": 0.201372 }, { "acc": 0.77783051, "epoch": 0.857638635733971, "grad_norm": 5.53125, "learning_rate": 6.377440213192625e-06, "loss": 0.80419178, "memory(GiB)": 147.13, "step": 36760, "train_speed(iter/s)": 0.201399 }, { "acc": 0.78635902, "epoch": 0.8578719433062599, "grad_norm": 5.03125, "learning_rate": 6.375624103406155e-06, "loss": 0.77373714, "memory(GiB)": 147.13, "step": 36770, "train_speed(iter/s)": 0.201429 }, { "acc": 0.79163675, "epoch": 0.8581052508785488, "grad_norm": 5.71875, "learning_rate": 6.373807797249744e-06, "loss": 0.74694929, "memory(GiB)": 147.13, "step": 36780, "train_speed(iter/s)": 0.201456 }, { "acc": 0.77907381, "epoch": 0.8583385584508377, "grad_norm": 5.875, "learning_rate": 6.371991294982671e-06, "loss": 0.79713793, "memory(GiB)": 147.13, "step": 36790, "train_speed(iter/s)": 0.201484 }, { "acc": 0.75770316, "epoch": 0.8585718660231266, "grad_norm": 6.75, "learning_rate": 6.370174596864238e-06, "loss": 0.88794937, "memory(GiB)": 147.13, "step": 36800, "train_speed(iter/s)": 0.201513 }, { "acc": 0.77909842, "epoch": 0.8588051735954155, "grad_norm": 4.46875, "learning_rate": 6.368357703153782e-06, "loss": 0.80933161, "memory(GiB)": 147.13, "step": 36810, "train_speed(iter/s)": 0.201542 }, { "acc": 0.78378386, "epoch": 0.8590384811677044, "grad_norm": 4.3125, "learning_rate": 6.366540614110658e-06, "loss": 0.78677444, "memory(GiB)": 147.13, "step": 36820, "train_speed(iter/s)": 0.20157 }, { "acc": 0.77209501, "epoch": 0.8592717887399933, "grad_norm": 5.9375, "learning_rate": 6.364723329994259e-06, "loss": 0.81071463, "memory(GiB)": 147.13, "step": 36830, "train_speed(iter/s)": 0.201599 }, { "acc": 0.77325363, "epoch": 0.8595050963122822, "grad_norm": 6.4375, "learning_rate": 6.362905851064001e-06, "loss": 0.81591139, "memory(GiB)": 147.13, "step": 36840, "train_speed(iter/s)": 0.201627 }, { "acc": 0.76463056, "epoch": 0.8597384038845711, "grad_norm": 4.6875, "learning_rate": 6.361088177579329e-06, "loss": 0.83884716, "memory(GiB)": 147.13, "step": 36850, "train_speed(iter/s)": 0.201656 }, { "acc": 0.77708492, "epoch": 0.85997171145686, "grad_norm": 6.1875, "learning_rate": 6.359270309799715e-06, "loss": 0.80096121, "memory(GiB)": 147.13, "step": 36860, "train_speed(iter/s)": 0.201683 }, { "acc": 0.78316288, "epoch": 0.8602050190291488, "grad_norm": 3.859375, "learning_rate": 6.357452247984659e-06, "loss": 0.79706192, "memory(GiB)": 147.13, "step": 36870, "train_speed(iter/s)": 0.201711 }, { "acc": 0.77961206, "epoch": 0.8604383266014377, "grad_norm": 5.40625, "learning_rate": 6.35563399239369e-06, "loss": 0.80441303, "memory(GiB)": 147.13, "step": 36880, "train_speed(iter/s)": 0.201737 }, { "acc": 0.78116093, "epoch": 0.8606716341737266, "grad_norm": 4.59375, "learning_rate": 6.353815543286361e-06, "loss": 0.78395686, "memory(GiB)": 147.13, "step": 36890, "train_speed(iter/s)": 0.201765 }, { "acc": 0.78719234, "epoch": 0.8609049417460155, "grad_norm": 4.6875, "learning_rate": 6.351996900922257e-06, "loss": 0.75968537, "memory(GiB)": 147.13, "step": 36900, "train_speed(iter/s)": 0.201793 }, { "acc": 0.78203468, "epoch": 0.8611382493183044, "grad_norm": 5.1875, "learning_rate": 6.3501780655609875e-06, "loss": 0.78212891, "memory(GiB)": 147.13, "step": 36910, "train_speed(iter/s)": 0.201819 }, { "acc": 0.77617979, "epoch": 0.8613715568905933, "grad_norm": 5.75, "learning_rate": 6.348359037462194e-06, "loss": 0.79318113, "memory(GiB)": 147.13, "step": 36920, "train_speed(iter/s)": 0.201847 }, { "acc": 0.79143863, "epoch": 0.8616048644628822, "grad_norm": 5.5, "learning_rate": 6.346539816885537e-06, "loss": 0.73529739, "memory(GiB)": 147.13, "step": 36930, "train_speed(iter/s)": 0.201875 }, { "acc": 0.76003532, "epoch": 0.8618381720351711, "grad_norm": 5.1875, "learning_rate": 6.3447204040907125e-06, "loss": 0.87924767, "memory(GiB)": 147.13, "step": 36940, "train_speed(iter/s)": 0.201904 }, { "acc": 0.78279285, "epoch": 0.86207147960746, "grad_norm": 4.75, "learning_rate": 6.342900799337443e-06, "loss": 0.77494459, "memory(GiB)": 147.13, "step": 36950, "train_speed(iter/s)": 0.201933 }, { "acc": 0.78726845, "epoch": 0.8623047871797489, "grad_norm": 6.28125, "learning_rate": 6.341081002885472e-06, "loss": 0.7636848, "memory(GiB)": 147.13, "step": 36960, "train_speed(iter/s)": 0.201962 }, { "acc": 0.78955112, "epoch": 0.8625380947520378, "grad_norm": 22.25, "learning_rate": 6.33926101499458e-06, "loss": 0.75000238, "memory(GiB)": 147.13, "step": 36970, "train_speed(iter/s)": 0.201987 }, { "acc": 0.7933239, "epoch": 0.8627714023243267, "grad_norm": 5.8125, "learning_rate": 6.337440835924564e-06, "loss": 0.75319366, "memory(GiB)": 147.13, "step": 36980, "train_speed(iter/s)": 0.202016 }, { "acc": 0.7852212, "epoch": 0.8630047098966156, "grad_norm": 5.53125, "learning_rate": 6.335620465935259e-06, "loss": 0.76989651, "memory(GiB)": 147.13, "step": 36990, "train_speed(iter/s)": 0.202045 }, { "acc": 0.78815894, "epoch": 0.8632380174689045, "grad_norm": 5.4375, "learning_rate": 6.333799905286519e-06, "loss": 0.76841455, "memory(GiB)": 147.13, "step": 37000, "train_speed(iter/s)": 0.202073 }, { "epoch": 0.8632380174689045, "eval_acc": 0.7431547548984517, "eval_loss": 0.8100025653839111, "eval_runtime": 1269.5429, "eval_samples_per_second": 28.35, "eval_steps_per_second": 14.175, "step": 37000 }, { "acc": 0.76950207, "epoch": 0.8634713250411934, "grad_norm": 5.25, "learning_rate": 6.331979154238232e-06, "loss": 0.82681618, "memory(GiB)": 147.13, "step": 37010, "train_speed(iter/s)": 0.200693 }, { "acc": 0.77774849, "epoch": 0.8637046326134823, "grad_norm": 4.71875, "learning_rate": 6.330158213050308e-06, "loss": 0.81203289, "memory(GiB)": 147.13, "step": 37020, "train_speed(iter/s)": 0.20072 }, { "acc": 0.77672329, "epoch": 0.8639379401857712, "grad_norm": 9.375, "learning_rate": 6.328337081982685e-06, "loss": 0.78580074, "memory(GiB)": 147.13, "step": 37030, "train_speed(iter/s)": 0.200751 }, { "acc": 0.79651041, "epoch": 0.8641712477580601, "grad_norm": 5.125, "learning_rate": 6.326515761295328e-06, "loss": 0.72057328, "memory(GiB)": 147.13, "step": 37040, "train_speed(iter/s)": 0.200778 }, { "acc": 0.767904, "epoch": 0.864404555330349, "grad_norm": 6.1875, "learning_rate": 6.3246942512482325e-06, "loss": 0.84308643, "memory(GiB)": 147.13, "step": 37050, "train_speed(iter/s)": 0.200806 }, { "acc": 0.77300673, "epoch": 0.8646378629026379, "grad_norm": 4.9375, "learning_rate": 6.3228725521014165e-06, "loss": 0.8220705, "memory(GiB)": 147.13, "step": 37060, "train_speed(iter/s)": 0.200834 }, { "acc": 0.81248608, "epoch": 0.8648711704749267, "grad_norm": 8.0625, "learning_rate": 6.32105066411493e-06, "loss": 0.68309054, "memory(GiB)": 147.13, "step": 37070, "train_speed(iter/s)": 0.200863 }, { "acc": 0.76430616, "epoch": 0.8651044780472156, "grad_norm": 5.09375, "learning_rate": 6.319228587548843e-06, "loss": 0.85580292, "memory(GiB)": 147.13, "step": 37080, "train_speed(iter/s)": 0.200888 }, { "acc": 0.76082668, "epoch": 0.8653377856195045, "grad_norm": 5.0, "learning_rate": 6.317406322663259e-06, "loss": 0.86531963, "memory(GiB)": 147.13, "step": 37090, "train_speed(iter/s)": 0.200915 }, { "acc": 0.77870302, "epoch": 0.8655710931917934, "grad_norm": 4.40625, "learning_rate": 6.315583869718306e-06, "loss": 0.79484749, "memory(GiB)": 147.13, "step": 37100, "train_speed(iter/s)": 0.20094 }, { "acc": 0.7822587, "epoch": 0.8658044007640823, "grad_norm": 4.71875, "learning_rate": 6.313761228974137e-06, "loss": 0.80919828, "memory(GiB)": 147.13, "step": 37110, "train_speed(iter/s)": 0.200966 }, { "acc": 0.76600981, "epoch": 0.8660377083363712, "grad_norm": 6.15625, "learning_rate": 6.311938400690933e-06, "loss": 0.83051043, "memory(GiB)": 147.13, "step": 37120, "train_speed(iter/s)": 0.200994 }, { "acc": 0.78756142, "epoch": 0.8662710159086601, "grad_norm": 4.21875, "learning_rate": 6.310115385128905e-06, "loss": 0.73773327, "memory(GiB)": 147.13, "step": 37130, "train_speed(iter/s)": 0.201021 }, { "acc": 0.77740078, "epoch": 0.866504323480949, "grad_norm": 5.6875, "learning_rate": 6.308292182548287e-06, "loss": 0.79663916, "memory(GiB)": 147.13, "step": 37140, "train_speed(iter/s)": 0.20105 }, { "acc": 0.7775095, "epoch": 0.8667376310532379, "grad_norm": 4.3125, "learning_rate": 6.3064687932093386e-06, "loss": 0.79716334, "memory(GiB)": 147.13, "step": 37150, "train_speed(iter/s)": 0.201077 }, { "acc": 0.78595037, "epoch": 0.8669709386255268, "grad_norm": 4.0, "learning_rate": 6.3046452173723495e-06, "loss": 0.76871729, "memory(GiB)": 147.13, "step": 37160, "train_speed(iter/s)": 0.201103 }, { "acc": 0.76833076, "epoch": 0.8672042461978157, "grad_norm": 4.90625, "learning_rate": 6.302821455297635e-06, "loss": 0.83416252, "memory(GiB)": 147.13, "step": 37170, "train_speed(iter/s)": 0.201131 }, { "acc": 0.77176652, "epoch": 0.8674375537701046, "grad_norm": 4.84375, "learning_rate": 6.300997507245537e-06, "loss": 0.82635098, "memory(GiB)": 147.13, "step": 37180, "train_speed(iter/s)": 0.201159 }, { "acc": 0.78389416, "epoch": 0.8676708613423935, "grad_norm": 5.4375, "learning_rate": 6.299173373476422e-06, "loss": 0.77835617, "memory(GiB)": 147.13, "step": 37190, "train_speed(iter/s)": 0.201185 }, { "acc": 0.7786643, "epoch": 0.8679041689146824, "grad_norm": 4.65625, "learning_rate": 6.2973490542506854e-06, "loss": 0.81813984, "memory(GiB)": 147.13, "step": 37200, "train_speed(iter/s)": 0.201214 }, { "acc": 0.78079252, "epoch": 0.8681374764869713, "grad_norm": 5.625, "learning_rate": 6.295524549828747e-06, "loss": 0.76534152, "memory(GiB)": 147.13, "step": 37210, "train_speed(iter/s)": 0.201242 }, { "acc": 0.77490101, "epoch": 0.8683707840592602, "grad_norm": 5.5, "learning_rate": 6.293699860471057e-06, "loss": 0.77603149, "memory(GiB)": 147.13, "step": 37220, "train_speed(iter/s)": 0.201269 }, { "acc": 0.75890179, "epoch": 0.8686040916315491, "grad_norm": 6.375, "learning_rate": 6.2918749864380875e-06, "loss": 0.8679534, "memory(GiB)": 147.13, "step": 37230, "train_speed(iter/s)": 0.201298 }, { "acc": 0.7783083, "epoch": 0.868837399203838, "grad_norm": 4.3125, "learning_rate": 6.290049927990339e-06, "loss": 0.79737039, "memory(GiB)": 147.13, "step": 37240, "train_speed(iter/s)": 0.201325 }, { "acc": 0.77292824, "epoch": 0.8690707067761269, "grad_norm": 5.25, "learning_rate": 6.288224685388337e-06, "loss": 0.82085094, "memory(GiB)": 147.13, "step": 37250, "train_speed(iter/s)": 0.201351 }, { "acc": 0.78002787, "epoch": 0.8693040143484156, "grad_norm": 4.5625, "learning_rate": 6.286399258892638e-06, "loss": 0.80144997, "memory(GiB)": 147.13, "step": 37260, "train_speed(iter/s)": 0.201378 }, { "acc": 0.78442354, "epoch": 0.8695373219207045, "grad_norm": 3.90625, "learning_rate": 6.284573648763816e-06, "loss": 0.77326307, "memory(GiB)": 147.13, "step": 37270, "train_speed(iter/s)": 0.201406 }, { "acc": 0.78011799, "epoch": 0.8697706294929934, "grad_norm": 4.96875, "learning_rate": 6.28274785526248e-06, "loss": 0.82026291, "memory(GiB)": 147.13, "step": 37280, "train_speed(iter/s)": 0.201434 }, { "acc": 0.76693525, "epoch": 0.8700039370652823, "grad_norm": 12.875, "learning_rate": 6.2809218786492595e-06, "loss": 0.84187355, "memory(GiB)": 147.13, "step": 37290, "train_speed(iter/s)": 0.201462 }, { "acc": 0.79030437, "epoch": 0.8702372446375712, "grad_norm": 6.53125, "learning_rate": 6.279095719184813e-06, "loss": 0.7504972, "memory(GiB)": 147.13, "step": 37300, "train_speed(iter/s)": 0.201489 }, { "acc": 0.77714148, "epoch": 0.8704705522098601, "grad_norm": 4.71875, "learning_rate": 6.277269377129826e-06, "loss": 0.79407024, "memory(GiB)": 147.13, "step": 37310, "train_speed(iter/s)": 0.201517 }, { "acc": 0.77218485, "epoch": 0.870703859782149, "grad_norm": 6.09375, "learning_rate": 6.275442852745005e-06, "loss": 0.82651482, "memory(GiB)": 147.13, "step": 37320, "train_speed(iter/s)": 0.201546 }, { "acc": 0.77186918, "epoch": 0.8709371673544379, "grad_norm": 9.3125, "learning_rate": 6.273616146291086e-06, "loss": 0.81594334, "memory(GiB)": 147.13, "step": 37330, "train_speed(iter/s)": 0.201575 }, { "acc": 0.7657239, "epoch": 0.8711704749267268, "grad_norm": 5.59375, "learning_rate": 6.2717892580288335e-06, "loss": 0.83614559, "memory(GiB)": 147.13, "step": 37340, "train_speed(iter/s)": 0.201603 }, { "acc": 0.77476711, "epoch": 0.8714037824990157, "grad_norm": 4.75, "learning_rate": 6.269962188219034e-06, "loss": 0.80064745, "memory(GiB)": 147.13, "step": 37350, "train_speed(iter/s)": 0.20163 }, { "acc": 0.78318167, "epoch": 0.8716370900713046, "grad_norm": 5.9375, "learning_rate": 6.2681349371225e-06, "loss": 0.78883448, "memory(GiB)": 147.13, "step": 37360, "train_speed(iter/s)": 0.201654 }, { "acc": 0.79107747, "epoch": 0.8718703976435935, "grad_norm": 3.90625, "learning_rate": 6.266307505000073e-06, "loss": 0.72986178, "memory(GiB)": 147.13, "step": 37370, "train_speed(iter/s)": 0.201681 }, { "acc": 0.77610064, "epoch": 0.8721037052158824, "grad_norm": 6.8125, "learning_rate": 6.264479892112619e-06, "loss": 0.79103355, "memory(GiB)": 147.13, "step": 37380, "train_speed(iter/s)": 0.201711 }, { "acc": 0.77150335, "epoch": 0.8723370127881713, "grad_norm": 6.0625, "learning_rate": 6.262652098721026e-06, "loss": 0.82169552, "memory(GiB)": 147.13, "step": 37390, "train_speed(iter/s)": 0.201738 }, { "acc": 0.77216101, "epoch": 0.8725703203604602, "grad_norm": 5.65625, "learning_rate": 6.260824125086212e-06, "loss": 0.8190197, "memory(GiB)": 147.13, "step": 37400, "train_speed(iter/s)": 0.201767 }, { "acc": 0.77426405, "epoch": 0.8728036279327491, "grad_norm": 5.375, "learning_rate": 6.258995971469122e-06, "loss": 0.83238316, "memory(GiB)": 147.13, "step": 37410, "train_speed(iter/s)": 0.201795 }, { "acc": 0.78717422, "epoch": 0.873036935505038, "grad_norm": 6.0625, "learning_rate": 6.2571676381307215e-06, "loss": 0.79043889, "memory(GiB)": 147.13, "step": 37420, "train_speed(iter/s)": 0.201823 }, { "acc": 0.78104959, "epoch": 0.8732702430773269, "grad_norm": 5.0625, "learning_rate": 6.255339125332007e-06, "loss": 0.79519758, "memory(GiB)": 147.13, "step": 37430, "train_speed(iter/s)": 0.201849 }, { "acc": 0.7618619, "epoch": 0.8735035506496158, "grad_norm": 4.90625, "learning_rate": 6.253510433333996e-06, "loss": 0.86247349, "memory(GiB)": 147.13, "step": 37440, "train_speed(iter/s)": 0.201877 }, { "acc": 0.77435389, "epoch": 0.8737368582219046, "grad_norm": 5.84375, "learning_rate": 6.251681562397736e-06, "loss": 0.79898653, "memory(GiB)": 147.13, "step": 37450, "train_speed(iter/s)": 0.201905 }, { "acc": 0.78359423, "epoch": 0.8739701657941935, "grad_norm": 5.4375, "learning_rate": 6.2498525127842955e-06, "loss": 0.76345568, "memory(GiB)": 147.13, "step": 37460, "train_speed(iter/s)": 0.201931 }, { "acc": 0.7648387, "epoch": 0.8742034733664824, "grad_norm": 5.25, "learning_rate": 6.248023284754772e-06, "loss": 0.83345156, "memory(GiB)": 147.13, "step": 37470, "train_speed(iter/s)": 0.20196 }, { "acc": 0.78857408, "epoch": 0.8744367809387713, "grad_norm": 5.0625, "learning_rate": 6.2461938785702866e-06, "loss": 0.75806623, "memory(GiB)": 147.13, "step": 37480, "train_speed(iter/s)": 0.201987 }, { "acc": 0.76943989, "epoch": 0.8746700885110602, "grad_norm": 3.984375, "learning_rate": 6.244364294491989e-06, "loss": 0.82730265, "memory(GiB)": 147.13, "step": 37490, "train_speed(iter/s)": 0.202015 }, { "acc": 0.77271547, "epoch": 0.8749033960833491, "grad_norm": 7.25, "learning_rate": 6.2425345327810485e-06, "loss": 0.81668425, "memory(GiB)": 147.13, "step": 37500, "train_speed(iter/s)": 0.202043 }, { "epoch": 0.8749033960833491, "eval_acc": 0.7432971231209676, "eval_loss": 0.8092904686927795, "eval_runtime": 1270.0898, "eval_samples_per_second": 28.337, "eval_steps_per_second": 14.169, "step": 37500 }, { "acc": 0.77455845, "epoch": 0.875136703655638, "grad_norm": 5.8125, "learning_rate": 6.240704593698664e-06, "loss": 0.79787617, "memory(GiB)": 147.13, "step": 37510, "train_speed(iter/s)": 0.20068 }, { "acc": 0.77178693, "epoch": 0.8753700112279269, "grad_norm": 6.84375, "learning_rate": 6.238874477506061e-06, "loss": 0.81941929, "memory(GiB)": 147.13, "step": 37520, "train_speed(iter/s)": 0.200701 }, { "acc": 0.75730181, "epoch": 0.8756033188002158, "grad_norm": 6.46875, "learning_rate": 6.237044184464485e-06, "loss": 0.89033489, "memory(GiB)": 147.13, "step": 37530, "train_speed(iter/s)": 0.20073 }, { "acc": 0.77331123, "epoch": 0.8758366263725047, "grad_norm": 4.3125, "learning_rate": 6.235213714835211e-06, "loss": 0.82823706, "memory(GiB)": 147.13, "step": 37540, "train_speed(iter/s)": 0.200755 }, { "acc": 0.78762941, "epoch": 0.8760699339447936, "grad_norm": 4.125, "learning_rate": 6.233383068879538e-06, "loss": 0.75171795, "memory(GiB)": 147.13, "step": 37550, "train_speed(iter/s)": 0.200782 }, { "acc": 0.7689115, "epoch": 0.8763032415170825, "grad_norm": 5.28125, "learning_rate": 6.231552246858791e-06, "loss": 0.83698616, "memory(GiB)": 147.13, "step": 37560, "train_speed(iter/s)": 0.20081 }, { "acc": 0.78390293, "epoch": 0.8765365490893714, "grad_norm": 5.78125, "learning_rate": 6.229721249034318e-06, "loss": 0.79848022, "memory(GiB)": 147.13, "step": 37570, "train_speed(iter/s)": 0.200839 }, { "acc": 0.78724899, "epoch": 0.8767698566616603, "grad_norm": 5.59375, "learning_rate": 6.227890075667492e-06, "loss": 0.77459116, "memory(GiB)": 147.13, "step": 37580, "train_speed(iter/s)": 0.200868 }, { "acc": 0.79053335, "epoch": 0.8770031642339492, "grad_norm": 5.65625, "learning_rate": 6.226058727019717e-06, "loss": 0.75305357, "memory(GiB)": 147.13, "step": 37590, "train_speed(iter/s)": 0.200893 }, { "acc": 0.75518756, "epoch": 0.8772364718062381, "grad_norm": 5.8125, "learning_rate": 6.224227203352415e-06, "loss": 0.87923584, "memory(GiB)": 147.13, "step": 37600, "train_speed(iter/s)": 0.20092 }, { "acc": 0.78576441, "epoch": 0.877469779378527, "grad_norm": 4.875, "learning_rate": 6.222395504927035e-06, "loss": 0.77100844, "memory(GiB)": 147.13, "step": 37610, "train_speed(iter/s)": 0.200949 }, { "acc": 0.7848784, "epoch": 0.8777030869508159, "grad_norm": 6.3125, "learning_rate": 6.22056363200505e-06, "loss": 0.7881711, "memory(GiB)": 147.13, "step": 37620, "train_speed(iter/s)": 0.200976 }, { "acc": 0.77304902, "epoch": 0.8779363945231048, "grad_norm": 10.875, "learning_rate": 6.218731584847963e-06, "loss": 0.81832428, "memory(GiB)": 147.13, "step": 37630, "train_speed(iter/s)": 0.201004 }, { "acc": 0.78921366, "epoch": 0.8781697020953936, "grad_norm": 6.65625, "learning_rate": 6.216899363717295e-06, "loss": 0.76330223, "memory(GiB)": 147.13, "step": 37640, "train_speed(iter/s)": 0.201032 }, { "acc": 0.77204285, "epoch": 0.8784030096676825, "grad_norm": 7.15625, "learning_rate": 6.215066968874596e-06, "loss": 0.83313923, "memory(GiB)": 147.13, "step": 37650, "train_speed(iter/s)": 0.201059 }, { "acc": 0.78158464, "epoch": 0.8786363172399714, "grad_norm": 5.75, "learning_rate": 6.213234400581442e-06, "loss": 0.7798768, "memory(GiB)": 147.13, "step": 37660, "train_speed(iter/s)": 0.201087 }, { "acc": 0.77431865, "epoch": 0.8788696248122603, "grad_norm": 13.3125, "learning_rate": 6.2114016590994295e-06, "loss": 0.82120934, "memory(GiB)": 147.13, "step": 37670, "train_speed(iter/s)": 0.201113 }, { "acc": 0.76597242, "epoch": 0.8791029323845492, "grad_norm": 6.1875, "learning_rate": 6.209568744690181e-06, "loss": 0.82842293, "memory(GiB)": 147.13, "step": 37680, "train_speed(iter/s)": 0.20114 }, { "acc": 0.7780242, "epoch": 0.8793362399568381, "grad_norm": 7.125, "learning_rate": 6.207735657615346e-06, "loss": 0.7849369, "memory(GiB)": 147.13, "step": 37690, "train_speed(iter/s)": 0.201167 }, { "acc": 0.79243302, "epoch": 0.879569547529127, "grad_norm": 4.15625, "learning_rate": 6.2059023981365965e-06, "loss": 0.75370507, "memory(GiB)": 147.13, "step": 37700, "train_speed(iter/s)": 0.201196 }, { "acc": 0.78596573, "epoch": 0.8798028551014159, "grad_norm": 5.46875, "learning_rate": 6.20406896651563e-06, "loss": 0.76070099, "memory(GiB)": 147.13, "step": 37710, "train_speed(iter/s)": 0.201224 }, { "acc": 0.77707858, "epoch": 0.8800361626737048, "grad_norm": 6.59375, "learning_rate": 6.202235363014169e-06, "loss": 0.82894115, "memory(GiB)": 147.13, "step": 37720, "train_speed(iter/s)": 0.201248 }, { "acc": 0.77520704, "epoch": 0.8802694702459937, "grad_norm": 6.8125, "learning_rate": 6.2004015878939585e-06, "loss": 0.81481781, "memory(GiB)": 147.13, "step": 37730, "train_speed(iter/s)": 0.201275 }, { "acc": 0.78262343, "epoch": 0.8805027778182826, "grad_norm": 6.28125, "learning_rate": 6.198567641416772e-06, "loss": 0.78242087, "memory(GiB)": 147.13, "step": 37740, "train_speed(iter/s)": 0.201304 }, { "acc": 0.77590933, "epoch": 0.8807360853905715, "grad_norm": 5.09375, "learning_rate": 6.1967335238444004e-06, "loss": 0.81266613, "memory(GiB)": 147.13, "step": 37750, "train_speed(iter/s)": 0.201331 }, { "acc": 0.7834343, "epoch": 0.8809693929628604, "grad_norm": 6.9375, "learning_rate": 6.194899235438666e-06, "loss": 0.75194969, "memory(GiB)": 147.13, "step": 37760, "train_speed(iter/s)": 0.201358 }, { "acc": 0.75540552, "epoch": 0.8812027005351493, "grad_norm": 4.34375, "learning_rate": 6.193064776461415e-06, "loss": 0.88943443, "memory(GiB)": 147.13, "step": 37770, "train_speed(iter/s)": 0.201387 }, { "acc": 0.78063102, "epoch": 0.8814360081074382, "grad_norm": 4.25, "learning_rate": 6.191230147174512e-06, "loss": 0.80147295, "memory(GiB)": 147.13, "step": 37780, "train_speed(iter/s)": 0.201411 }, { "acc": 0.77797747, "epoch": 0.8816693156797271, "grad_norm": 5.5625, "learning_rate": 6.1893953478398515e-06, "loss": 0.81042271, "memory(GiB)": 147.13, "step": 37790, "train_speed(iter/s)": 0.201438 }, { "acc": 0.78152409, "epoch": 0.881902623252016, "grad_norm": 5.96875, "learning_rate": 6.18756037871935e-06, "loss": 0.77627401, "memory(GiB)": 147.13, "step": 37800, "train_speed(iter/s)": 0.201464 }, { "acc": 0.76364655, "epoch": 0.8821359308243049, "grad_norm": 4.125, "learning_rate": 6.185725240074951e-06, "loss": 0.85897408, "memory(GiB)": 147.13, "step": 37810, "train_speed(iter/s)": 0.201494 }, { "acc": 0.78259721, "epoch": 0.8823692383965938, "grad_norm": 5.28125, "learning_rate": 6.1838899321686185e-06, "loss": 0.78828173, "memory(GiB)": 147.13, "step": 37820, "train_speed(iter/s)": 0.201523 }, { "acc": 0.77779245, "epoch": 0.8826025459688827, "grad_norm": 4.625, "learning_rate": 6.1820544552623415e-06, "loss": 0.80315275, "memory(GiB)": 147.13, "step": 37830, "train_speed(iter/s)": 0.201551 }, { "acc": 0.77334061, "epoch": 0.8828358535411714, "grad_norm": 5.78125, "learning_rate": 6.180218809618135e-06, "loss": 0.81666965, "memory(GiB)": 147.13, "step": 37840, "train_speed(iter/s)": 0.201578 }, { "acc": 0.76984267, "epoch": 0.8830691611134603, "grad_norm": 6.96875, "learning_rate": 6.1783829954980345e-06, "loss": 0.83126059, "memory(GiB)": 147.13, "step": 37850, "train_speed(iter/s)": 0.201607 }, { "acc": 0.77715025, "epoch": 0.8833024686857492, "grad_norm": 6.84375, "learning_rate": 6.176547013164104e-06, "loss": 0.79118605, "memory(GiB)": 147.13, "step": 37860, "train_speed(iter/s)": 0.201634 }, { "acc": 0.76471095, "epoch": 0.8835357762580381, "grad_norm": 5.34375, "learning_rate": 6.17471086287843e-06, "loss": 0.83969116, "memory(GiB)": 147.13, "step": 37870, "train_speed(iter/s)": 0.201663 }, { "acc": 0.78056355, "epoch": 0.883769083830327, "grad_norm": 7.3125, "learning_rate": 6.172874544903122e-06, "loss": 0.76982565, "memory(GiB)": 147.13, "step": 37880, "train_speed(iter/s)": 0.201689 }, { "acc": 0.78541079, "epoch": 0.8840023914026159, "grad_norm": 6.625, "learning_rate": 6.171038059500315e-06, "loss": 0.77315903, "memory(GiB)": 147.13, "step": 37890, "train_speed(iter/s)": 0.201715 }, { "acc": 0.78337684, "epoch": 0.8842356989749048, "grad_norm": 4.15625, "learning_rate": 6.169201406932163e-06, "loss": 0.78238239, "memory(GiB)": 147.13, "step": 37900, "train_speed(iter/s)": 0.201744 }, { "acc": 0.7873363, "epoch": 0.8844690065471937, "grad_norm": 6.03125, "learning_rate": 6.167364587460849e-06, "loss": 0.75477743, "memory(GiB)": 147.13, "step": 37910, "train_speed(iter/s)": 0.201772 }, { "acc": 0.7772418, "epoch": 0.8847023141194826, "grad_norm": 5.71875, "learning_rate": 6.16552760134858e-06, "loss": 0.84283714, "memory(GiB)": 147.13, "step": 37920, "train_speed(iter/s)": 0.201801 }, { "acc": 0.78096313, "epoch": 0.8849356216917715, "grad_norm": 5.34375, "learning_rate": 6.1636904488575845e-06, "loss": 0.79128194, "memory(GiB)": 147.13, "step": 37930, "train_speed(iter/s)": 0.201826 }, { "acc": 0.79532461, "epoch": 0.8851689292640604, "grad_norm": 5.5625, "learning_rate": 6.161853130250117e-06, "loss": 0.74347682, "memory(GiB)": 147.13, "step": 37940, "train_speed(iter/s)": 0.201852 }, { "acc": 0.77315941, "epoch": 0.8854022368363493, "grad_norm": 4.75, "learning_rate": 6.160015645788451e-06, "loss": 0.8172802, "memory(GiB)": 147.13, "step": 37950, "train_speed(iter/s)": 0.201879 }, { "acc": 0.77103496, "epoch": 0.8856355444086382, "grad_norm": 5.1875, "learning_rate": 6.15817799573489e-06, "loss": 0.83767815, "memory(GiB)": 147.13, "step": 37960, "train_speed(iter/s)": 0.201905 }, { "acc": 0.80051785, "epoch": 0.8858688519809271, "grad_norm": 5.53125, "learning_rate": 6.1563401803517545e-06, "loss": 0.70509105, "memory(GiB)": 147.13, "step": 37970, "train_speed(iter/s)": 0.201931 }, { "acc": 0.77665696, "epoch": 0.886102159553216, "grad_norm": 5.375, "learning_rate": 6.154502199901396e-06, "loss": 0.81156635, "memory(GiB)": 147.13, "step": 37980, "train_speed(iter/s)": 0.201956 }, { "acc": 0.76781158, "epoch": 0.8863354671255049, "grad_norm": 4.46875, "learning_rate": 6.152664054646183e-06, "loss": 0.83875208, "memory(GiB)": 147.13, "step": 37990, "train_speed(iter/s)": 0.201984 }, { "acc": 0.75077744, "epoch": 0.8865687746977938, "grad_norm": 5.875, "learning_rate": 6.150825744848511e-06, "loss": 0.90200367, "memory(GiB)": 147.13, "step": 38000, "train_speed(iter/s)": 0.202011 }, { "epoch": 0.8865687746977938, "eval_acc": 0.74343228485303, "eval_loss": 0.8092561960220337, "eval_runtime": 1268.5776, "eval_samples_per_second": 28.371, "eval_steps_per_second": 14.186, "step": 38000 }, { "acc": 0.7838891, "epoch": 0.8868020822700827, "grad_norm": 5.5625, "learning_rate": 6.148987270770798e-06, "loss": 0.78417902, "memory(GiB)": 147.13, "step": 38010, "train_speed(iter/s)": 0.200667 }, { "acc": 0.79363413, "epoch": 0.8870353898423716, "grad_norm": 4.78125, "learning_rate": 6.147148632675486e-06, "loss": 0.73398776, "memory(GiB)": 147.13, "step": 38020, "train_speed(iter/s)": 0.200693 }, { "acc": 0.77010832, "epoch": 0.8872686974146604, "grad_norm": 12.5, "learning_rate": 6.145309830825041e-06, "loss": 0.87349873, "memory(GiB)": 147.13, "step": 38030, "train_speed(iter/s)": 0.200721 }, { "acc": 0.76912684, "epoch": 0.8875020049869493, "grad_norm": 5.15625, "learning_rate": 6.143470865481948e-06, "loss": 0.81946507, "memory(GiB)": 147.13, "step": 38040, "train_speed(iter/s)": 0.200748 }, { "acc": 0.76783762, "epoch": 0.8877353125592382, "grad_norm": 5.125, "learning_rate": 6.141631736908723e-06, "loss": 0.82789316, "memory(GiB)": 147.13, "step": 38050, "train_speed(iter/s)": 0.200774 }, { "acc": 0.78408384, "epoch": 0.8879686201315271, "grad_norm": 7.0625, "learning_rate": 6.1397924453679e-06, "loss": 0.7791399, "memory(GiB)": 147.13, "step": 38060, "train_speed(iter/s)": 0.2008 }, { "acc": 0.77008028, "epoch": 0.888201927703816, "grad_norm": 7.625, "learning_rate": 6.137952991122035e-06, "loss": 0.83867502, "memory(GiB)": 147.13, "step": 38070, "train_speed(iter/s)": 0.200828 }, { "acc": 0.79044018, "epoch": 0.8884352352761049, "grad_norm": 3.90625, "learning_rate": 6.136113374433712e-06, "loss": 0.74994631, "memory(GiB)": 147.13, "step": 38080, "train_speed(iter/s)": 0.200854 }, { "acc": 0.76272373, "epoch": 0.8886685428483938, "grad_norm": 4.84375, "learning_rate": 6.134273595565534e-06, "loss": 0.84841614, "memory(GiB)": 147.13, "step": 38090, "train_speed(iter/s)": 0.200882 }, { "acc": 0.76892252, "epoch": 0.8889018504206827, "grad_norm": 6.8125, "learning_rate": 6.13243365478013e-06, "loss": 0.81232853, "memory(GiB)": 147.13, "step": 38100, "train_speed(iter/s)": 0.20091 }, { "acc": 0.78824024, "epoch": 0.8891351579929716, "grad_norm": 5.59375, "learning_rate": 6.13059355234015e-06, "loss": 0.75986581, "memory(GiB)": 147.13, "step": 38110, "train_speed(iter/s)": 0.200939 }, { "acc": 0.77105479, "epoch": 0.8893684655652605, "grad_norm": 6.46875, "learning_rate": 6.128753288508271e-06, "loss": 0.83737135, "memory(GiB)": 147.13, "step": 38120, "train_speed(iter/s)": 0.200967 }, { "acc": 0.79165573, "epoch": 0.8896017731375494, "grad_norm": 7.875, "learning_rate": 6.126912863547186e-06, "loss": 0.74300852, "memory(GiB)": 147.13, "step": 38130, "train_speed(iter/s)": 0.200993 }, { "acc": 0.79510765, "epoch": 0.8898350807098383, "grad_norm": 5.0, "learning_rate": 6.125072277719618e-06, "loss": 0.7389545, "memory(GiB)": 147.13, "step": 38140, "train_speed(iter/s)": 0.201021 }, { "acc": 0.79052238, "epoch": 0.8900683882821272, "grad_norm": 6.25, "learning_rate": 6.123231531288308e-06, "loss": 0.75054169, "memory(GiB)": 147.13, "step": 38150, "train_speed(iter/s)": 0.201048 }, { "acc": 0.76992264, "epoch": 0.8903016958544161, "grad_norm": 7.71875, "learning_rate": 6.121390624516026e-06, "loss": 0.83893986, "memory(GiB)": 147.13, "step": 38160, "train_speed(iter/s)": 0.201076 }, { "acc": 0.78948836, "epoch": 0.890535003426705, "grad_norm": 8.0625, "learning_rate": 6.119549557665556e-06, "loss": 0.74797955, "memory(GiB)": 147.13, "step": 38170, "train_speed(iter/s)": 0.201103 }, { "acc": 0.77837534, "epoch": 0.8907683109989939, "grad_norm": 6.4375, "learning_rate": 6.117708330999712e-06, "loss": 0.78834462, "memory(GiB)": 147.13, "step": 38180, "train_speed(iter/s)": 0.201129 }, { "acc": 0.75297174, "epoch": 0.8910016185712828, "grad_norm": 4.59375, "learning_rate": 6.115866944781329e-06, "loss": 0.86872931, "memory(GiB)": 147.13, "step": 38190, "train_speed(iter/s)": 0.201158 }, { "acc": 0.79166384, "epoch": 0.8912349261435717, "grad_norm": 6.21875, "learning_rate": 6.114025399273264e-06, "loss": 0.72422724, "memory(GiB)": 147.13, "step": 38200, "train_speed(iter/s)": 0.201184 }, { "acc": 0.788766, "epoch": 0.8914682337158606, "grad_norm": 5.21875, "learning_rate": 6.112183694738395e-06, "loss": 0.74276934, "memory(GiB)": 147.13, "step": 38210, "train_speed(iter/s)": 0.20121 }, { "acc": 0.78757305, "epoch": 0.8917015412881494, "grad_norm": 5.75, "learning_rate": 6.110341831439628e-06, "loss": 0.76513844, "memory(GiB)": 147.13, "step": 38220, "train_speed(iter/s)": 0.201236 }, { "acc": 0.77797546, "epoch": 0.8919348488604383, "grad_norm": 5.1875, "learning_rate": 6.108499809639887e-06, "loss": 0.80226593, "memory(GiB)": 147.13, "step": 38230, "train_speed(iter/s)": 0.201263 }, { "acc": 0.78466263, "epoch": 0.8921681564327272, "grad_norm": 6.09375, "learning_rate": 6.106657629602122e-06, "loss": 0.76339788, "memory(GiB)": 147.13, "step": 38240, "train_speed(iter/s)": 0.201292 }, { "acc": 0.78802223, "epoch": 0.8924014640050161, "grad_norm": 4.5, "learning_rate": 6.104815291589299e-06, "loss": 0.75786328, "memory(GiB)": 147.13, "step": 38250, "train_speed(iter/s)": 0.20132 }, { "acc": 0.77927275, "epoch": 0.892634771577305, "grad_norm": 7.09375, "learning_rate": 6.1029727958644144e-06, "loss": 0.80576792, "memory(GiB)": 147.13, "step": 38260, "train_speed(iter/s)": 0.201349 }, { "acc": 0.78089199, "epoch": 0.8928680791495939, "grad_norm": 5.84375, "learning_rate": 6.1011301426904845e-06, "loss": 0.80531197, "memory(GiB)": 147.13, "step": 38270, "train_speed(iter/s)": 0.201377 }, { "acc": 0.79331675, "epoch": 0.8931013867218828, "grad_norm": 5.59375, "learning_rate": 6.0992873323305465e-06, "loss": 0.74135227, "memory(GiB)": 147.13, "step": 38280, "train_speed(iter/s)": 0.201404 }, { "acc": 0.78814082, "epoch": 0.8933346942941717, "grad_norm": 4.9375, "learning_rate": 6.097444365047662e-06, "loss": 0.76247959, "memory(GiB)": 147.13, "step": 38290, "train_speed(iter/s)": 0.201432 }, { "acc": 0.76565762, "epoch": 0.8935680018664606, "grad_norm": 6.375, "learning_rate": 6.095601241104911e-06, "loss": 0.85332813, "memory(GiB)": 147.13, "step": 38300, "train_speed(iter/s)": 0.201459 }, { "acc": 0.79775143, "epoch": 0.8938013094387495, "grad_norm": 4.78125, "learning_rate": 6.093757960765404e-06, "loss": 0.72894554, "memory(GiB)": 147.13, "step": 38310, "train_speed(iter/s)": 0.201486 }, { "acc": 0.76751165, "epoch": 0.8940346170110384, "grad_norm": 4.90625, "learning_rate": 6.091914524292264e-06, "loss": 0.86291084, "memory(GiB)": 147.13, "step": 38320, "train_speed(iter/s)": 0.201515 }, { "acc": 0.76315336, "epoch": 0.8942679245833273, "grad_norm": 6.5, "learning_rate": 6.090070931948643e-06, "loss": 0.85644455, "memory(GiB)": 147.13, "step": 38330, "train_speed(iter/s)": 0.201542 }, { "acc": 0.79633956, "epoch": 0.8945012321556162, "grad_norm": 6.65625, "learning_rate": 6.088227183997715e-06, "loss": 0.72798471, "memory(GiB)": 147.13, "step": 38340, "train_speed(iter/s)": 0.20157 }, { "acc": 0.78646612, "epoch": 0.8947345397279051, "grad_norm": 5.65625, "learning_rate": 6.08638328070267e-06, "loss": 0.75003166, "memory(GiB)": 147.13, "step": 38350, "train_speed(iter/s)": 0.201596 }, { "acc": 0.77957439, "epoch": 0.894967847300194, "grad_norm": 5.875, "learning_rate": 6.084539222326728e-06, "loss": 0.78398819, "memory(GiB)": 147.13, "step": 38360, "train_speed(iter/s)": 0.201624 }, { "acc": 0.78460279, "epoch": 0.8952011548724829, "grad_norm": 6.96875, "learning_rate": 6.082695009133126e-06, "loss": 0.77256527, "memory(GiB)": 147.13, "step": 38370, "train_speed(iter/s)": 0.201652 }, { "acc": 0.78010826, "epoch": 0.8954344624447718, "grad_norm": 6.8125, "learning_rate": 6.080850641385129e-06, "loss": 0.79071956, "memory(GiB)": 147.13, "step": 38380, "train_speed(iter/s)": 0.201679 }, { "acc": 0.76988091, "epoch": 0.8956677700170607, "grad_norm": 4.6875, "learning_rate": 6.079006119346015e-06, "loss": 0.83236942, "memory(GiB)": 147.13, "step": 38390, "train_speed(iter/s)": 0.201707 }, { "acc": 0.78275881, "epoch": 0.8959010775893496, "grad_norm": 4.90625, "learning_rate": 6.0771614432790915e-06, "loss": 0.76272345, "memory(GiB)": 147.13, "step": 38400, "train_speed(iter/s)": 0.201736 }, { "acc": 0.79821758, "epoch": 0.8961343851616383, "grad_norm": 3.765625, "learning_rate": 6.075316613447684e-06, "loss": 0.7420167, "memory(GiB)": 147.13, "step": 38410, "train_speed(iter/s)": 0.20176 }, { "acc": 0.78013487, "epoch": 0.8963676927339272, "grad_norm": 5.03125, "learning_rate": 6.073471630115142e-06, "loss": 0.77953615, "memory(GiB)": 147.13, "step": 38420, "train_speed(iter/s)": 0.201788 }, { "acc": 0.76554446, "epoch": 0.8966010003062161, "grad_norm": 5.4375, "learning_rate": 6.071626493544838e-06, "loss": 0.85112772, "memory(GiB)": 147.13, "step": 38430, "train_speed(iter/s)": 0.201815 }, { "acc": 0.78542604, "epoch": 0.896834307878505, "grad_norm": 4.34375, "learning_rate": 6.0697812040001625e-06, "loss": 0.7776813, "memory(GiB)": 147.13, "step": 38440, "train_speed(iter/s)": 0.201842 }, { "acc": 0.79344821, "epoch": 0.8970676154507939, "grad_norm": 8.5, "learning_rate": 6.067935761744531e-06, "loss": 0.74864655, "memory(GiB)": 147.13, "step": 38450, "train_speed(iter/s)": 0.201867 }, { "acc": 0.77485456, "epoch": 0.8973009230230828, "grad_norm": 5.4375, "learning_rate": 6.066090167041381e-06, "loss": 0.78871832, "memory(GiB)": 147.13, "step": 38460, "train_speed(iter/s)": 0.201894 }, { "acc": 0.7692565, "epoch": 0.8975342305953717, "grad_norm": 6.0625, "learning_rate": 6.0642444201541686e-06, "loss": 0.83406401, "memory(GiB)": 147.13, "step": 38470, "train_speed(iter/s)": 0.201921 }, { "acc": 0.78231134, "epoch": 0.8977675381676606, "grad_norm": 4.28125, "learning_rate": 6.062398521346374e-06, "loss": 0.77111473, "memory(GiB)": 147.13, "step": 38480, "train_speed(iter/s)": 0.201948 }, { "acc": 0.78063517, "epoch": 0.8980008457399495, "grad_norm": 5.96875, "learning_rate": 6.060552470881498e-06, "loss": 0.77957792, "memory(GiB)": 147.13, "step": 38490, "train_speed(iter/s)": 0.201974 }, { "acc": 0.77691154, "epoch": 0.8982341533122384, "grad_norm": 4.65625, "learning_rate": 6.0587062690230654e-06, "loss": 0.79885607, "memory(GiB)": 147.13, "step": 38500, "train_speed(iter/s)": 0.202001 }, { "epoch": 0.8982341533122384, "eval_acc": 0.7434691180264593, "eval_loss": 0.8091056942939758, "eval_runtime": 1269.2357, "eval_samples_per_second": 28.356, "eval_steps_per_second": 14.179, "step": 38500 }, { "acc": 0.76533337, "epoch": 0.8984674608845273, "grad_norm": 5.875, "learning_rate": 6.056859916034621e-06, "loss": 0.85079603, "memory(GiB)": 147.13, "step": 38510, "train_speed(iter/s)": 0.200674 }, { "acc": 0.80131989, "epoch": 0.8987007684568162, "grad_norm": 5.15625, "learning_rate": 6.055013412179732e-06, "loss": 0.73174257, "memory(GiB)": 147.13, "step": 38520, "train_speed(iter/s)": 0.200701 }, { "acc": 0.77752538, "epoch": 0.8989340760291051, "grad_norm": 5.59375, "learning_rate": 6.053166757721984e-06, "loss": 0.79698305, "memory(GiB)": 147.13, "step": 38530, "train_speed(iter/s)": 0.200726 }, { "acc": 0.78054886, "epoch": 0.899167383601394, "grad_norm": 5.03125, "learning_rate": 6.051319952924987e-06, "loss": 0.77370358, "memory(GiB)": 147.13, "step": 38540, "train_speed(iter/s)": 0.200752 }, { "acc": 0.79228392, "epoch": 0.8994006911736829, "grad_norm": 6.125, "learning_rate": 6.049472998052371e-06, "loss": 0.73808813, "memory(GiB)": 147.13, "step": 38550, "train_speed(iter/s)": 0.200779 }, { "acc": 0.78893967, "epoch": 0.8996339987459718, "grad_norm": 11.9375, "learning_rate": 6.047625893367791e-06, "loss": 0.74676943, "memory(GiB)": 147.13, "step": 38560, "train_speed(iter/s)": 0.200807 }, { "acc": 0.77075801, "epoch": 0.8998673063182607, "grad_norm": 6.25, "learning_rate": 6.0457786391349195e-06, "loss": 0.83967133, "memory(GiB)": 147.13, "step": 38570, "train_speed(iter/s)": 0.200834 }, { "acc": 0.77375088, "epoch": 0.9001006138905496, "grad_norm": 35.5, "learning_rate": 6.0439312356174495e-06, "loss": 0.83246117, "memory(GiB)": 147.13, "step": 38580, "train_speed(iter/s)": 0.200859 }, { "acc": 0.77322402, "epoch": 0.9003339214628385, "grad_norm": 4.71875, "learning_rate": 6.042083683079099e-06, "loss": 0.80312271, "memory(GiB)": 147.13, "step": 38590, "train_speed(iter/s)": 0.200886 }, { "acc": 0.78943686, "epoch": 0.9005672290351274, "grad_norm": 7.0, "learning_rate": 6.0402359817836065e-06, "loss": 0.76622558, "memory(GiB)": 147.13, "step": 38600, "train_speed(iter/s)": 0.200912 }, { "acc": 0.78096523, "epoch": 0.9008005366074162, "grad_norm": 6.9375, "learning_rate": 6.038388131994729e-06, "loss": 0.78436923, "memory(GiB)": 147.13, "step": 38610, "train_speed(iter/s)": 0.200938 }, { "acc": 0.7831727, "epoch": 0.9010338441797051, "grad_norm": 6.21875, "learning_rate": 6.036540133976247e-06, "loss": 0.77339869, "memory(GiB)": 147.13, "step": 38620, "train_speed(iter/s)": 0.200965 }, { "acc": 0.77260571, "epoch": 0.901267151751994, "grad_norm": 5.3125, "learning_rate": 6.034691987991963e-06, "loss": 0.81804924, "memory(GiB)": 147.13, "step": 38630, "train_speed(iter/s)": 0.200992 }, { "acc": 0.7917419, "epoch": 0.9015004593242829, "grad_norm": 3.796875, "learning_rate": 6.032843694305698e-06, "loss": 0.74714317, "memory(GiB)": 147.13, "step": 38640, "train_speed(iter/s)": 0.201018 }, { "acc": 0.78762455, "epoch": 0.9017337668965718, "grad_norm": 7.6875, "learning_rate": 6.0309952531812955e-06, "loss": 0.74814758, "memory(GiB)": 147.13, "step": 38650, "train_speed(iter/s)": 0.201046 }, { "acc": 0.78616996, "epoch": 0.9019670744688607, "grad_norm": 6.15625, "learning_rate": 6.029146664882619e-06, "loss": 0.77387581, "memory(GiB)": 147.13, "step": 38660, "train_speed(iter/s)": 0.201075 }, { "acc": 0.7877409, "epoch": 0.9022003820411496, "grad_norm": 4.65625, "learning_rate": 6.027297929673557e-06, "loss": 0.75647035, "memory(GiB)": 147.13, "step": 38670, "train_speed(iter/s)": 0.201104 }, { "acc": 0.77807121, "epoch": 0.9024336896134385, "grad_norm": 5.65625, "learning_rate": 6.025449047818012e-06, "loss": 0.80177183, "memory(GiB)": 147.13, "step": 38680, "train_speed(iter/s)": 0.201132 }, { "acc": 0.79884205, "epoch": 0.9026669971857274, "grad_norm": 3.53125, "learning_rate": 6.0236000195799164e-06, "loss": 0.73010259, "memory(GiB)": 147.13, "step": 38690, "train_speed(iter/s)": 0.201158 }, { "acc": 0.78179312, "epoch": 0.9029003047580163, "grad_norm": 6.6875, "learning_rate": 6.0217508452232135e-06, "loss": 0.76933064, "memory(GiB)": 147.13, "step": 38700, "train_speed(iter/s)": 0.201183 }, { "acc": 0.77845345, "epoch": 0.9031336123303052, "grad_norm": 8.9375, "learning_rate": 6.019901525011873e-06, "loss": 0.78353348, "memory(GiB)": 147.13, "step": 38710, "train_speed(iter/s)": 0.201211 }, { "acc": 0.78779402, "epoch": 0.9033669199025941, "grad_norm": 3.765625, "learning_rate": 6.018052059209887e-06, "loss": 0.76179886, "memory(GiB)": 147.13, "step": 38720, "train_speed(iter/s)": 0.201239 }, { "acc": 0.75972013, "epoch": 0.903600227474883, "grad_norm": 5.5625, "learning_rate": 6.016202448081266e-06, "loss": 0.8852212, "memory(GiB)": 147.13, "step": 38730, "train_speed(iter/s)": 0.201265 }, { "acc": 0.77685909, "epoch": 0.9038335350471719, "grad_norm": 7.03125, "learning_rate": 6.014352691890041e-06, "loss": 0.79779468, "memory(GiB)": 147.13, "step": 38740, "train_speed(iter/s)": 0.201293 }, { "acc": 0.79123516, "epoch": 0.9040668426194608, "grad_norm": 4.5625, "learning_rate": 6.012502790900263e-06, "loss": 0.76187844, "memory(GiB)": 147.13, "step": 38750, "train_speed(iter/s)": 0.201321 }, { "acc": 0.78490562, "epoch": 0.9043001501917497, "grad_norm": 5.65625, "learning_rate": 6.010652745376006e-06, "loss": 0.77625852, "memory(GiB)": 147.13, "step": 38760, "train_speed(iter/s)": 0.201347 }, { "acc": 0.77033277, "epoch": 0.9045334577640386, "grad_norm": 5.15625, "learning_rate": 6.008802555581364e-06, "loss": 0.81603584, "memory(GiB)": 147.13, "step": 38770, "train_speed(iter/s)": 0.201374 }, { "acc": 0.76615601, "epoch": 0.9047667653363275, "grad_norm": 6.5, "learning_rate": 6.006952221780447e-06, "loss": 0.86489754, "memory(GiB)": 147.13, "step": 38780, "train_speed(iter/s)": 0.201402 }, { "acc": 0.77902718, "epoch": 0.9050000729086164, "grad_norm": 6.21875, "learning_rate": 6.005101744237396e-06, "loss": 0.79199033, "memory(GiB)": 147.13, "step": 38790, "train_speed(iter/s)": 0.20143 }, { "acc": 0.78239946, "epoch": 0.9052333804809052, "grad_norm": 6.84375, "learning_rate": 6.003251123216362e-06, "loss": 0.76015949, "memory(GiB)": 147.13, "step": 38800, "train_speed(iter/s)": 0.201457 }, { "acc": 0.79029064, "epoch": 0.9054666880531941, "grad_norm": 5.625, "learning_rate": 6.001400358981522e-06, "loss": 0.73135719, "memory(GiB)": 147.13, "step": 38810, "train_speed(iter/s)": 0.201484 }, { "acc": 0.75774117, "epoch": 0.905699995625483, "grad_norm": 8.25, "learning_rate": 5.999549451797073e-06, "loss": 0.88968811, "memory(GiB)": 147.13, "step": 38820, "train_speed(iter/s)": 0.201513 }, { "acc": 0.78999863, "epoch": 0.9059333031977719, "grad_norm": 5.9375, "learning_rate": 5.997698401927228e-06, "loss": 0.77355022, "memory(GiB)": 147.13, "step": 38830, "train_speed(iter/s)": 0.201538 }, { "acc": 0.76618147, "epoch": 0.9061666107700608, "grad_norm": 5.78125, "learning_rate": 5.995847209636227e-06, "loss": 0.85498562, "memory(GiB)": 147.13, "step": 38840, "train_speed(iter/s)": 0.201565 }, { "acc": 0.7809248, "epoch": 0.9063999183423497, "grad_norm": 6.0625, "learning_rate": 5.993995875188324e-06, "loss": 0.76933775, "memory(GiB)": 147.13, "step": 38850, "train_speed(iter/s)": 0.201591 }, { "acc": 0.77357092, "epoch": 0.9066332259146386, "grad_norm": 4.6875, "learning_rate": 5.992144398847801e-06, "loss": 0.81348381, "memory(GiB)": 147.13, "step": 38860, "train_speed(iter/s)": 0.201618 }, { "acc": 0.77968731, "epoch": 0.9068665334869275, "grad_norm": 5.40625, "learning_rate": 5.990292780878952e-06, "loss": 0.79122777, "memory(GiB)": 147.13, "step": 38870, "train_speed(iter/s)": 0.201646 }, { "acc": 0.77165852, "epoch": 0.9070998410592164, "grad_norm": 7.46875, "learning_rate": 5.988441021546097e-06, "loss": 0.82253704, "memory(GiB)": 147.13, "step": 38880, "train_speed(iter/s)": 0.201674 }, { "acc": 0.798492, "epoch": 0.9073331486315053, "grad_norm": 5.15625, "learning_rate": 5.986589121113574e-06, "loss": 0.71198359, "memory(GiB)": 147.13, "step": 38890, "train_speed(iter/s)": 0.2017 }, { "acc": 0.78536911, "epoch": 0.9075664562037942, "grad_norm": 4.78125, "learning_rate": 5.9847370798457395e-06, "loss": 0.77795515, "memory(GiB)": 147.13, "step": 38900, "train_speed(iter/s)": 0.201727 }, { "acc": 0.80710964, "epoch": 0.9077997637760831, "grad_norm": 5.375, "learning_rate": 5.982884898006973e-06, "loss": 0.67784872, "memory(GiB)": 147.13, "step": 38910, "train_speed(iter/s)": 0.201755 }, { "acc": 0.76790614, "epoch": 0.908033071348372, "grad_norm": 4.5, "learning_rate": 5.981032575861674e-06, "loss": 0.83228683, "memory(GiB)": 147.13, "step": 38920, "train_speed(iter/s)": 0.201782 }, { "acc": 0.78043723, "epoch": 0.9082663789206609, "grad_norm": 8.8125, "learning_rate": 5.979180113674258e-06, "loss": 0.79833183, "memory(GiB)": 147.13, "step": 38930, "train_speed(iter/s)": 0.201808 }, { "acc": 0.78106966, "epoch": 0.9084996864929498, "grad_norm": 4.65625, "learning_rate": 5.9773275117091655e-06, "loss": 0.76774416, "memory(GiB)": 147.13, "step": 38940, "train_speed(iter/s)": 0.201834 }, { "acc": 0.77814817, "epoch": 0.9087329940652387, "grad_norm": 6.0, "learning_rate": 5.975474770230856e-06, "loss": 0.80169296, "memory(GiB)": 147.13, "step": 38950, "train_speed(iter/s)": 0.201858 }, { "acc": 0.79056687, "epoch": 0.9089663016375276, "grad_norm": 4.59375, "learning_rate": 5.973621889503804e-06, "loss": 0.74093547, "memory(GiB)": 147.13, "step": 38960, "train_speed(iter/s)": 0.201886 }, { "acc": 0.75867233, "epoch": 0.9091996092098165, "grad_norm": 10.8125, "learning_rate": 5.9717688697925134e-06, "loss": 0.87084265, "memory(GiB)": 147.13, "step": 38970, "train_speed(iter/s)": 0.201913 }, { "acc": 0.77255449, "epoch": 0.9094329167821054, "grad_norm": 4.78125, "learning_rate": 5.969915711361497e-06, "loss": 0.82182369, "memory(GiB)": 147.13, "step": 38980, "train_speed(iter/s)": 0.201939 }, { "acc": 0.77857056, "epoch": 0.9096662243543941, "grad_norm": 5.15625, "learning_rate": 5.968062414475294e-06, "loss": 0.80886345, "memory(GiB)": 147.13, "step": 38990, "train_speed(iter/s)": 0.201965 }, { "acc": 0.77122827, "epoch": 0.909899531926683, "grad_norm": 8.625, "learning_rate": 5.966208979398462e-06, "loss": 0.82270679, "memory(GiB)": 147.13, "step": 39000, "train_speed(iter/s)": 0.201991 }, { "epoch": 0.909899531926683, "eval_acc": 0.7434351674492115, "eval_loss": 0.8086429834365845, "eval_runtime": 1270.6235, "eval_samples_per_second": 28.325, "eval_steps_per_second": 14.163, "step": 39000 }, { "acc": 0.78865824, "epoch": 0.910132839498972, "grad_norm": 5.09375, "learning_rate": 5.964355406395581e-06, "loss": 0.73996811, "memory(GiB)": 147.13, "step": 39010, "train_speed(iter/s)": 0.200679 }, { "acc": 0.77401848, "epoch": 0.9103661470712608, "grad_norm": 7.21875, "learning_rate": 5.962501695731245e-06, "loss": 0.81361609, "memory(GiB)": 147.13, "step": 39020, "train_speed(iter/s)": 0.200707 }, { "acc": 0.76471219, "epoch": 0.9105994546435497, "grad_norm": 6.40625, "learning_rate": 5.9606478476700714e-06, "loss": 0.85037766, "memory(GiB)": 147.13, "step": 39030, "train_speed(iter/s)": 0.200733 }, { "acc": 0.79004974, "epoch": 0.9108327622158386, "grad_norm": 7.375, "learning_rate": 5.958793862476699e-06, "loss": 0.75956354, "memory(GiB)": 147.13, "step": 39040, "train_speed(iter/s)": 0.20076 }, { "acc": 0.78604002, "epoch": 0.9110660697881275, "grad_norm": 11.625, "learning_rate": 5.956939740415778e-06, "loss": 0.77915764, "memory(GiB)": 147.13, "step": 39050, "train_speed(iter/s)": 0.200785 }, { "acc": 0.78488879, "epoch": 0.9112993773604164, "grad_norm": 4.65625, "learning_rate": 5.9550854817519875e-06, "loss": 0.75412149, "memory(GiB)": 147.13, "step": 39060, "train_speed(iter/s)": 0.200811 }, { "acc": 0.79100518, "epoch": 0.9115326849327053, "grad_norm": 3.953125, "learning_rate": 5.953231086750022e-06, "loss": 0.7392344, "memory(GiB)": 147.13, "step": 39070, "train_speed(iter/s)": 0.200837 }, { "acc": 0.76816645, "epoch": 0.9117659925049942, "grad_norm": 7.78125, "learning_rate": 5.951376555674596e-06, "loss": 0.84252634, "memory(GiB)": 147.13, "step": 39080, "train_speed(iter/s)": 0.200863 }, { "acc": 0.7637064, "epoch": 0.9119993000772831, "grad_norm": 6.3125, "learning_rate": 5.949521888790444e-06, "loss": 0.85107002, "memory(GiB)": 147.13, "step": 39090, "train_speed(iter/s)": 0.200889 }, { "acc": 0.77783241, "epoch": 0.912232607649572, "grad_norm": 5.8125, "learning_rate": 5.947667086362318e-06, "loss": 0.78836126, "memory(GiB)": 147.13, "step": 39100, "train_speed(iter/s)": 0.200914 }, { "acc": 0.79197435, "epoch": 0.9124659152218609, "grad_norm": 5.75, "learning_rate": 5.945812148654991e-06, "loss": 0.74622936, "memory(GiB)": 147.13, "step": 39110, "train_speed(iter/s)": 0.200941 }, { "acc": 0.78090453, "epoch": 0.9126992227941498, "grad_norm": 6.84375, "learning_rate": 5.943957075933253e-06, "loss": 0.77442908, "memory(GiB)": 147.13, "step": 39120, "train_speed(iter/s)": 0.200968 }, { "acc": 0.80023155, "epoch": 0.9129325303664387, "grad_norm": 4.4375, "learning_rate": 5.9421018684619165e-06, "loss": 0.71343641, "memory(GiB)": 147.13, "step": 39130, "train_speed(iter/s)": 0.200995 }, { "acc": 0.7824955, "epoch": 0.9131658379387276, "grad_norm": 4.6875, "learning_rate": 5.940246526505814e-06, "loss": 0.78219862, "memory(GiB)": 147.13, "step": 39140, "train_speed(iter/s)": 0.201021 }, { "acc": 0.78558354, "epoch": 0.9133991455110165, "grad_norm": 7.34375, "learning_rate": 5.9383910503297915e-06, "loss": 0.77994785, "memory(GiB)": 147.13, "step": 39150, "train_speed(iter/s)": 0.201046 }, { "acc": 0.77539968, "epoch": 0.9136324530833054, "grad_norm": 5.75, "learning_rate": 5.9365354401987195e-06, "loss": 0.80972853, "memory(GiB)": 147.13, "step": 39160, "train_speed(iter/s)": 0.201072 }, { "acc": 0.78717179, "epoch": 0.9138657606555943, "grad_norm": 5.78125, "learning_rate": 5.934679696377486e-06, "loss": 0.77642155, "memory(GiB)": 147.13, "step": 39170, "train_speed(iter/s)": 0.2011 }, { "acc": 0.76665878, "epoch": 0.9140990682278831, "grad_norm": 4.21875, "learning_rate": 5.932823819130997e-06, "loss": 0.84620333, "memory(GiB)": 147.13, "step": 39180, "train_speed(iter/s)": 0.201124 }, { "acc": 0.77948418, "epoch": 0.914332375800172, "grad_norm": 4.78125, "learning_rate": 5.930967808724178e-06, "loss": 0.79968572, "memory(GiB)": 147.13, "step": 39190, "train_speed(iter/s)": 0.201152 }, { "acc": 0.79536867, "epoch": 0.9145656833724609, "grad_norm": 4.875, "learning_rate": 5.929111665421976e-06, "loss": 0.7362793, "memory(GiB)": 147.13, "step": 39200, "train_speed(iter/s)": 0.201179 }, { "acc": 0.76817694, "epoch": 0.9147989909447498, "grad_norm": 5.4375, "learning_rate": 5.927255389489354e-06, "loss": 0.81463871, "memory(GiB)": 147.13, "step": 39210, "train_speed(iter/s)": 0.201206 }, { "acc": 0.78327427, "epoch": 0.9150322985170387, "grad_norm": 7.375, "learning_rate": 5.925398981191293e-06, "loss": 0.75333605, "memory(GiB)": 147.13, "step": 39220, "train_speed(iter/s)": 0.201233 }, { "acc": 0.78863673, "epoch": 0.9152656060893276, "grad_norm": 4.53125, "learning_rate": 5.9235424407927965e-06, "loss": 0.749193, "memory(GiB)": 147.13, "step": 39230, "train_speed(iter/s)": 0.201259 }, { "acc": 0.76760859, "epoch": 0.9154989136616165, "grad_norm": 6.9375, "learning_rate": 5.9216857685588855e-06, "loss": 0.82770119, "memory(GiB)": 147.13, "step": 39240, "train_speed(iter/s)": 0.201284 }, { "acc": 0.78653831, "epoch": 0.9157322212339054, "grad_norm": 5.09375, "learning_rate": 5.919828964754599e-06, "loss": 0.76294737, "memory(GiB)": 147.13, "step": 39250, "train_speed(iter/s)": 0.201312 }, { "acc": 0.76094551, "epoch": 0.9159655288061943, "grad_norm": 6.21875, "learning_rate": 5.917972029644995e-06, "loss": 0.88056946, "memory(GiB)": 147.13, "step": 39260, "train_speed(iter/s)": 0.201338 }, { "acc": 0.7901597, "epoch": 0.9161988363784832, "grad_norm": 6.28125, "learning_rate": 5.91611496349515e-06, "loss": 0.76451311, "memory(GiB)": 147.13, "step": 39270, "train_speed(iter/s)": 0.201364 }, { "acc": 0.78220377, "epoch": 0.9164321439507721, "grad_norm": 4.46875, "learning_rate": 5.91425776657016e-06, "loss": 0.78790097, "memory(GiB)": 147.13, "step": 39280, "train_speed(iter/s)": 0.201389 }, { "acc": 0.76608167, "epoch": 0.916665451523061, "grad_norm": 5.03125, "learning_rate": 5.912400439135139e-06, "loss": 0.8713728, "memory(GiB)": 147.13, "step": 39290, "train_speed(iter/s)": 0.201416 }, { "acc": 0.77654085, "epoch": 0.9168987590953499, "grad_norm": 5.5625, "learning_rate": 5.9105429814552204e-06, "loss": 0.81738625, "memory(GiB)": 147.13, "step": 39300, "train_speed(iter/s)": 0.201442 }, { "acc": 0.79101648, "epoch": 0.9171320666676388, "grad_norm": 7.65625, "learning_rate": 5.908685393795557e-06, "loss": 0.74483733, "memory(GiB)": 147.13, "step": 39310, "train_speed(iter/s)": 0.201467 }, { "acc": 0.79285164, "epoch": 0.9173653742399277, "grad_norm": 11.4375, "learning_rate": 5.9068276764213175e-06, "loss": 0.73478308, "memory(GiB)": 147.13, "step": 39320, "train_speed(iter/s)": 0.201495 }, { "acc": 0.78043051, "epoch": 0.9175986818122166, "grad_norm": 6.78125, "learning_rate": 5.90496982959769e-06, "loss": 0.80263834, "memory(GiB)": 147.13, "step": 39330, "train_speed(iter/s)": 0.201522 }, { "acc": 0.77075157, "epoch": 0.9178319893845055, "grad_norm": 6.25, "learning_rate": 5.903111853589881e-06, "loss": 0.81982746, "memory(GiB)": 147.13, "step": 39340, "train_speed(iter/s)": 0.201548 }, { "acc": 0.78020802, "epoch": 0.9180652969567944, "grad_norm": 4.84375, "learning_rate": 5.9012537486631185e-06, "loss": 0.78267994, "memory(GiB)": 147.13, "step": 39350, "train_speed(iter/s)": 0.201574 }, { "acc": 0.77198472, "epoch": 0.9182986045290833, "grad_norm": 5.0625, "learning_rate": 5.899395515082644e-06, "loss": 0.82205286, "memory(GiB)": 147.13, "step": 39360, "train_speed(iter/s)": 0.2016 }, { "acc": 0.77900686, "epoch": 0.9185319121013722, "grad_norm": 4.40625, "learning_rate": 5.897537153113724e-06, "loss": 0.77138987, "memory(GiB)": 147.13, "step": 39370, "train_speed(iter/s)": 0.201626 }, { "acc": 0.7678503, "epoch": 0.918765219673661, "grad_norm": 5.96875, "learning_rate": 5.895678663021634e-06, "loss": 0.83069754, "memory(GiB)": 147.13, "step": 39380, "train_speed(iter/s)": 0.201654 }, { "acc": 0.775527, "epoch": 0.9189985272459499, "grad_norm": 5.5, "learning_rate": 5.893820045071675e-06, "loss": 0.80461073, "memory(GiB)": 147.13, "step": 39390, "train_speed(iter/s)": 0.201678 }, { "acc": 0.77663894, "epoch": 0.9192318348182388, "grad_norm": 6.15625, "learning_rate": 5.891961299529165e-06, "loss": 0.79502859, "memory(GiB)": 147.13, "step": 39400, "train_speed(iter/s)": 0.201704 }, { "acc": 0.76121426, "epoch": 0.9194651423905277, "grad_norm": 5.75, "learning_rate": 5.890102426659438e-06, "loss": 0.8578661, "memory(GiB)": 147.13, "step": 39410, "train_speed(iter/s)": 0.20173 }, { "acc": 0.78363619, "epoch": 0.9196984499628166, "grad_norm": 9.0625, "learning_rate": 5.888243426727847e-06, "loss": 0.78227863, "memory(GiB)": 147.13, "step": 39420, "train_speed(iter/s)": 0.201755 }, { "acc": 0.7768178, "epoch": 0.9199317575351055, "grad_norm": 6.5625, "learning_rate": 5.886384299999767e-06, "loss": 0.81008139, "memory(GiB)": 147.13, "step": 39430, "train_speed(iter/s)": 0.20178 }, { "acc": 0.77163954, "epoch": 0.9201650651073944, "grad_norm": 6.0, "learning_rate": 5.884525046740586e-06, "loss": 0.83757038, "memory(GiB)": 147.13, "step": 39440, "train_speed(iter/s)": 0.201805 }, { "acc": 0.76802855, "epoch": 0.9203983726796833, "grad_norm": 4.625, "learning_rate": 5.882665667215709e-06, "loss": 0.84138384, "memory(GiB)": 147.13, "step": 39450, "train_speed(iter/s)": 0.201832 }, { "acc": 0.77422495, "epoch": 0.9206316802519722, "grad_norm": 6.0, "learning_rate": 5.880806161690567e-06, "loss": 0.82321262, "memory(GiB)": 147.13, "step": 39460, "train_speed(iter/s)": 0.201859 }, { "acc": 0.7739378, "epoch": 0.9208649878242611, "grad_norm": 6.25, "learning_rate": 5.878946530430599e-06, "loss": 0.81013727, "memory(GiB)": 147.13, "step": 39470, "train_speed(iter/s)": 0.201885 }, { "acc": 0.77839308, "epoch": 0.92109829539655, "grad_norm": 10.25, "learning_rate": 5.877086773701271e-06, "loss": 0.80809059, "memory(GiB)": 147.13, "step": 39480, "train_speed(iter/s)": 0.201909 }, { "acc": 0.78637233, "epoch": 0.9213316029688389, "grad_norm": 8.4375, "learning_rate": 5.87522689176806e-06, "loss": 0.78325071, "memory(GiB)": 147.13, "step": 39490, "train_speed(iter/s)": 0.201935 }, { "acc": 0.78502064, "epoch": 0.9215649105411278, "grad_norm": 19.625, "learning_rate": 5.873366884896464e-06, "loss": 0.78469729, "memory(GiB)": 147.13, "step": 39500, "train_speed(iter/s)": 0.20196 }, { "epoch": 0.9215649105411278, "eval_acc": 0.7435581582196189, "eval_loss": 0.8083301782608032, "eval_runtime": 1269.8332, "eval_samples_per_second": 28.343, "eval_steps_per_second": 14.172, "step": 39500 }, { "acc": 0.758465, "epoch": 0.9217982181134167, "grad_norm": 4.4375, "learning_rate": 5.871506753352e-06, "loss": 0.88601971, "memory(GiB)": 147.13, "step": 39510, "train_speed(iter/s)": 0.200665 }, { "acc": 0.77248645, "epoch": 0.9220315256857056, "grad_norm": 5.03125, "learning_rate": 5.869646497400199e-06, "loss": 0.81327343, "memory(GiB)": 147.13, "step": 39520, "train_speed(iter/s)": 0.200692 }, { "acc": 0.78898239, "epoch": 0.9222648332579945, "grad_norm": 4.875, "learning_rate": 5.867786117306614e-06, "loss": 0.73193016, "memory(GiB)": 147.13, "step": 39530, "train_speed(iter/s)": 0.200719 }, { "acc": 0.77655249, "epoch": 0.9224981408302834, "grad_norm": 8.25, "learning_rate": 5.865925613336814e-06, "loss": 0.80876665, "memory(GiB)": 147.13, "step": 39540, "train_speed(iter/s)": 0.200747 }, { "acc": 0.79803247, "epoch": 0.9227314484025723, "grad_norm": 4.625, "learning_rate": 5.864064985756382e-06, "loss": 0.73142552, "memory(GiB)": 147.13, "step": 39550, "train_speed(iter/s)": 0.200773 }, { "acc": 0.78445044, "epoch": 0.9229647559748612, "grad_norm": 5.0, "learning_rate": 5.862204234830925e-06, "loss": 0.78575296, "memory(GiB)": 147.13, "step": 39560, "train_speed(iter/s)": 0.200799 }, { "acc": 0.77423639, "epoch": 0.92319806354715, "grad_norm": 7.25, "learning_rate": 5.860343360826063e-06, "loss": 0.79959049, "memory(GiB)": 147.13, "step": 39570, "train_speed(iter/s)": 0.200826 }, { "acc": 0.75270905, "epoch": 0.9234313711194388, "grad_norm": 5.40625, "learning_rate": 5.858482364007438e-06, "loss": 0.88813477, "memory(GiB)": 147.13, "step": 39580, "train_speed(iter/s)": 0.200853 }, { "acc": 0.77825441, "epoch": 0.9236646786917277, "grad_norm": 11.4375, "learning_rate": 5.856621244640704e-06, "loss": 0.79800529, "memory(GiB)": 147.13, "step": 39590, "train_speed(iter/s)": 0.200877 }, { "acc": 0.78405609, "epoch": 0.9238979862640166, "grad_norm": 5.34375, "learning_rate": 5.8547600029915366e-06, "loss": 0.77924662, "memory(GiB)": 147.13, "step": 39600, "train_speed(iter/s)": 0.200903 }, { "acc": 0.76475625, "epoch": 0.9241312938363055, "grad_norm": 5.25, "learning_rate": 5.852898639325627e-06, "loss": 0.84154186, "memory(GiB)": 147.13, "step": 39610, "train_speed(iter/s)": 0.20093 }, { "acc": 0.79988174, "epoch": 0.9243646014085944, "grad_norm": 5.5625, "learning_rate": 5.851037153908684e-06, "loss": 0.71289425, "memory(GiB)": 147.13, "step": 39620, "train_speed(iter/s)": 0.200956 }, { "acc": 0.77636728, "epoch": 0.9245979089808833, "grad_norm": 5.5625, "learning_rate": 5.849175547006433e-06, "loss": 0.8403513, "memory(GiB)": 147.13, "step": 39630, "train_speed(iter/s)": 0.200981 }, { "acc": 0.76683617, "epoch": 0.9248312165531722, "grad_norm": 5.75, "learning_rate": 5.8473138188846216e-06, "loss": 0.8414938, "memory(GiB)": 147.13, "step": 39640, "train_speed(iter/s)": 0.201007 }, { "acc": 0.78084307, "epoch": 0.9250645241254611, "grad_norm": 4.65625, "learning_rate": 5.845451969809009e-06, "loss": 0.79839873, "memory(GiB)": 147.13, "step": 39650, "train_speed(iter/s)": 0.201035 }, { "acc": 0.79516888, "epoch": 0.92529783169775, "grad_norm": 3.96875, "learning_rate": 5.843590000045372e-06, "loss": 0.71090298, "memory(GiB)": 147.13, "step": 39660, "train_speed(iter/s)": 0.201063 }, { "acc": 0.7792943, "epoch": 0.9255311392700389, "grad_norm": 5.03125, "learning_rate": 5.841727909859508e-06, "loss": 0.78876328, "memory(GiB)": 147.13, "step": 39670, "train_speed(iter/s)": 0.201089 }, { "acc": 0.76696959, "epoch": 0.9257644468423278, "grad_norm": 8.625, "learning_rate": 5.83986569951723e-06, "loss": 0.85549431, "memory(GiB)": 147.13, "step": 39680, "train_speed(iter/s)": 0.201115 }, { "acc": 0.78547182, "epoch": 0.9259977544146167, "grad_norm": 4.59375, "learning_rate": 5.838003369284366e-06, "loss": 0.7880579, "memory(GiB)": 147.13, "step": 39690, "train_speed(iter/s)": 0.20114 }, { "acc": 0.7796061, "epoch": 0.9262310619869056, "grad_norm": 4.0625, "learning_rate": 5.836140919426765e-06, "loss": 0.78260422, "memory(GiB)": 147.13, "step": 39700, "train_speed(iter/s)": 0.201165 }, { "acc": 0.77718563, "epoch": 0.9264643695591945, "grad_norm": 4.15625, "learning_rate": 5.834278350210292e-06, "loss": 0.81985626, "memory(GiB)": 147.13, "step": 39710, "train_speed(iter/s)": 0.201192 }, { "acc": 0.78238964, "epoch": 0.9266976771314834, "grad_norm": 6.65625, "learning_rate": 5.832415661900826e-06, "loss": 0.7647913, "memory(GiB)": 147.13, "step": 39720, "train_speed(iter/s)": 0.201217 }, { "acc": 0.78833084, "epoch": 0.9269309847037723, "grad_norm": 4.90625, "learning_rate": 5.830552854764265e-06, "loss": 0.73814125, "memory(GiB)": 147.13, "step": 39730, "train_speed(iter/s)": 0.201244 }, { "acc": 0.77150917, "epoch": 0.9271642922760612, "grad_norm": 5.3125, "learning_rate": 5.828689929066526e-06, "loss": 0.81474161, "memory(GiB)": 147.13, "step": 39740, "train_speed(iter/s)": 0.20127 }, { "acc": 0.78371716, "epoch": 0.9273975998483501, "grad_norm": 5.34375, "learning_rate": 5.826826885073541e-06, "loss": 0.77574019, "memory(GiB)": 147.13, "step": 39750, "train_speed(iter/s)": 0.201296 }, { "acc": 0.79163008, "epoch": 0.9276309074206389, "grad_norm": 6.09375, "learning_rate": 5.824963723051258e-06, "loss": 0.75398407, "memory(GiB)": 147.13, "step": 39760, "train_speed(iter/s)": 0.201322 }, { "acc": 0.77016096, "epoch": 0.9278642149929278, "grad_norm": 5.71875, "learning_rate": 5.823100443265643e-06, "loss": 0.84416866, "memory(GiB)": 147.13, "step": 39770, "train_speed(iter/s)": 0.201349 }, { "acc": 0.7718441, "epoch": 0.9280975225652167, "grad_norm": 6.71875, "learning_rate": 5.821237045982679e-06, "loss": 0.82021675, "memory(GiB)": 147.13, "step": 39780, "train_speed(iter/s)": 0.201376 }, { "acc": 0.76817198, "epoch": 0.9283308301375056, "grad_norm": 7.84375, "learning_rate": 5.819373531468364e-06, "loss": 0.82225189, "memory(GiB)": 147.13, "step": 39790, "train_speed(iter/s)": 0.201401 }, { "acc": 0.77029276, "epoch": 0.9285641377097945, "grad_norm": 10.125, "learning_rate": 5.817509899988717e-06, "loss": 0.81980934, "memory(GiB)": 147.13, "step": 39800, "train_speed(iter/s)": 0.201428 }, { "acc": 0.77408419, "epoch": 0.9287974452820834, "grad_norm": 5.0625, "learning_rate": 5.8156461518097695e-06, "loss": 0.81796417, "memory(GiB)": 147.13, "step": 39810, "train_speed(iter/s)": 0.201454 }, { "acc": 0.78498955, "epoch": 0.9290307528543723, "grad_norm": 20.0, "learning_rate": 5.813782287197569e-06, "loss": 0.78012381, "memory(GiB)": 147.13, "step": 39820, "train_speed(iter/s)": 0.201481 }, { "acc": 0.76299672, "epoch": 0.9292640604266612, "grad_norm": 8.3125, "learning_rate": 5.8119183064181864e-06, "loss": 0.84655218, "memory(GiB)": 147.13, "step": 39830, "train_speed(iter/s)": 0.201508 }, { "acc": 0.77199259, "epoch": 0.9294973679989501, "grad_norm": 8.125, "learning_rate": 5.810054209737699e-06, "loss": 0.807763, "memory(GiB)": 147.13, "step": 39840, "train_speed(iter/s)": 0.201533 }, { "acc": 0.77625284, "epoch": 0.929730675571239, "grad_norm": 4.28125, "learning_rate": 5.8081899974222076e-06, "loss": 0.8012434, "memory(GiB)": 147.13, "step": 39850, "train_speed(iter/s)": 0.20156 }, { "acc": 0.79133472, "epoch": 0.9299639831435279, "grad_norm": 4.84375, "learning_rate": 5.80632566973783e-06, "loss": 0.75545387, "memory(GiB)": 147.13, "step": 39860, "train_speed(iter/s)": 0.201585 }, { "acc": 0.78476238, "epoch": 0.9301972907158168, "grad_norm": 6.0, "learning_rate": 5.804461226950697e-06, "loss": 0.75581503, "memory(GiB)": 147.13, "step": 39870, "train_speed(iter/s)": 0.201611 }, { "acc": 0.75795188, "epoch": 0.9304305982881057, "grad_norm": 6.6875, "learning_rate": 5.80259666932696e-06, "loss": 0.87952824, "memory(GiB)": 147.13, "step": 39880, "train_speed(iter/s)": 0.201638 }, { "acc": 0.75466013, "epoch": 0.9306639058603946, "grad_norm": 5.78125, "learning_rate": 5.800731997132779e-06, "loss": 0.8734972, "memory(GiB)": 147.13, "step": 39890, "train_speed(iter/s)": 0.201665 }, { "acc": 0.78315744, "epoch": 0.9308972134326835, "grad_norm": 6.375, "learning_rate": 5.7988672106343395e-06, "loss": 0.8057559, "memory(GiB)": 147.13, "step": 39900, "train_speed(iter/s)": 0.20169 }, { "acc": 0.76869955, "epoch": 0.9311305210049724, "grad_norm": 6.21875, "learning_rate": 5.797002310097836e-06, "loss": 0.82583561, "memory(GiB)": 147.13, "step": 39910, "train_speed(iter/s)": 0.201718 }, { "acc": 0.78581076, "epoch": 0.9313638285772613, "grad_norm": 5.0625, "learning_rate": 5.795137295789486e-06, "loss": 0.78243923, "memory(GiB)": 147.13, "step": 39920, "train_speed(iter/s)": 0.201744 }, { "acc": 0.77420025, "epoch": 0.9315971361495502, "grad_norm": 5.4375, "learning_rate": 5.7932721679755164e-06, "loss": 0.81065769, "memory(GiB)": 147.13, "step": 39930, "train_speed(iter/s)": 0.20177 }, { "acc": 0.76268339, "epoch": 0.9318304437218391, "grad_norm": 4.09375, "learning_rate": 5.791406926922176e-06, "loss": 0.88640518, "memory(GiB)": 147.13, "step": 39940, "train_speed(iter/s)": 0.201797 }, { "acc": 0.78374519, "epoch": 0.9320637512941279, "grad_norm": 5.59375, "learning_rate": 5.789541572895727e-06, "loss": 0.78053789, "memory(GiB)": 147.13, "step": 39950, "train_speed(iter/s)": 0.201824 }, { "acc": 0.79211001, "epoch": 0.9322970588664168, "grad_norm": 4.96875, "learning_rate": 5.787676106162449e-06, "loss": 0.74461236, "memory(GiB)": 147.13, "step": 39960, "train_speed(iter/s)": 0.20185 }, { "acc": 0.80399532, "epoch": 0.9325303664387057, "grad_norm": 5.0625, "learning_rate": 5.785810526988633e-06, "loss": 0.70544991, "memory(GiB)": 147.13, "step": 39970, "train_speed(iter/s)": 0.201875 }, { "acc": 0.76267509, "epoch": 0.9327636740109946, "grad_norm": 4.125, "learning_rate": 5.783944835640594e-06, "loss": 0.82315941, "memory(GiB)": 147.13, "step": 39980, "train_speed(iter/s)": 0.201902 }, { "acc": 0.76198874, "epoch": 0.9329969815832835, "grad_norm": 4.4375, "learning_rate": 5.7820790323846566e-06, "loss": 0.88518867, "memory(GiB)": 147.13, "step": 39990, "train_speed(iter/s)": 0.201927 }, { "acc": 0.76849947, "epoch": 0.9332302891555724, "grad_norm": 7.75, "learning_rate": 5.780213117487167e-06, "loss": 0.81474934, "memory(GiB)": 147.13, "step": 40000, "train_speed(iter/s)": 0.201955 }, { "epoch": 0.9332302891555724, "eval_acc": 0.7435186025942405, "eval_loss": 0.8081688284873962, "eval_runtime": 1269.5667, "eval_samples_per_second": 28.349, "eval_steps_per_second": 14.175, "step": 40000 }, { "acc": 0.77274613, "epoch": 0.9334635967278613, "grad_norm": 4.8125, "learning_rate": 5.778347091214479e-06, "loss": 0.81792908, "memory(GiB)": 147.13, "step": 40010, "train_speed(iter/s)": 0.200676 }, { "acc": 0.77955446, "epoch": 0.9336969043001502, "grad_norm": 4.9375, "learning_rate": 5.77648095383297e-06, "loss": 0.7724124, "memory(GiB)": 147.13, "step": 40020, "train_speed(iter/s)": 0.200702 }, { "acc": 0.76370978, "epoch": 0.9339302118724391, "grad_norm": 11.625, "learning_rate": 5.774614705609032e-06, "loss": 0.85831079, "memory(GiB)": 147.13, "step": 40030, "train_speed(iter/s)": 0.200726 }, { "acc": 0.77389021, "epoch": 0.934163519444728, "grad_norm": 4.5, "learning_rate": 5.7727483468090686e-06, "loss": 0.80834856, "memory(GiB)": 147.13, "step": 40040, "train_speed(iter/s)": 0.200752 }, { "acc": 0.77940512, "epoch": 0.9343968270170169, "grad_norm": 4.375, "learning_rate": 5.770881877699502e-06, "loss": 0.78713818, "memory(GiB)": 147.13, "step": 40050, "train_speed(iter/s)": 0.200777 }, { "acc": 0.77909374, "epoch": 0.9346301345893058, "grad_norm": 7.21875, "learning_rate": 5.769015298546774e-06, "loss": 0.79142208, "memory(GiB)": 147.13, "step": 40060, "train_speed(iter/s)": 0.200801 }, { "acc": 0.77855005, "epoch": 0.9348634421615947, "grad_norm": 4.84375, "learning_rate": 5.7671486096173336e-06, "loss": 0.78886166, "memory(GiB)": 147.13, "step": 40070, "train_speed(iter/s)": 0.200827 }, { "acc": 0.77905307, "epoch": 0.9350967497338836, "grad_norm": 5.90625, "learning_rate": 5.765281811177652e-06, "loss": 0.81017637, "memory(GiB)": 147.13, "step": 40080, "train_speed(iter/s)": 0.200853 }, { "acc": 0.79088497, "epoch": 0.9353300573061725, "grad_norm": 5.5625, "learning_rate": 5.763414903494216e-06, "loss": 0.75551167, "memory(GiB)": 147.13, "step": 40090, "train_speed(iter/s)": 0.200879 }, { "acc": 0.75535116, "epoch": 0.9355633648784614, "grad_norm": 5.75, "learning_rate": 5.761547886833523e-06, "loss": 0.89576149, "memory(GiB)": 147.13, "step": 40100, "train_speed(iter/s)": 0.200905 }, { "acc": 0.80530338, "epoch": 0.9357966724507503, "grad_norm": 6.34375, "learning_rate": 5.759680761462091e-06, "loss": 0.69478006, "memory(GiB)": 147.13, "step": 40110, "train_speed(iter/s)": 0.20093 }, { "acc": 0.77864857, "epoch": 0.9360299800230392, "grad_norm": 4.875, "learning_rate": 5.757813527646449e-06, "loss": 0.8089098, "memory(GiB)": 147.13, "step": 40120, "train_speed(iter/s)": 0.200956 }, { "acc": 0.79146218, "epoch": 0.9362632875953281, "grad_norm": 4.59375, "learning_rate": 5.755946185653148e-06, "loss": 0.74217372, "memory(GiB)": 147.13, "step": 40130, "train_speed(iter/s)": 0.200982 }, { "acc": 0.78336248, "epoch": 0.936496595167617, "grad_norm": 15.5625, "learning_rate": 5.7540787357487485e-06, "loss": 0.78665681, "memory(GiB)": 147.13, "step": 40140, "train_speed(iter/s)": 0.201007 }, { "acc": 0.78126898, "epoch": 0.9367299027399058, "grad_norm": 5.59375, "learning_rate": 5.752211178199828e-06, "loss": 0.77836123, "memory(GiB)": 147.13, "step": 40150, "train_speed(iter/s)": 0.201033 }, { "acc": 0.78209891, "epoch": 0.9369632103121946, "grad_norm": 4.75, "learning_rate": 5.7503435132729805e-06, "loss": 0.76724977, "memory(GiB)": 147.13, "step": 40160, "train_speed(iter/s)": 0.201059 }, { "acc": 0.77962379, "epoch": 0.9371965178844835, "grad_norm": 6.0625, "learning_rate": 5.7484757412348146e-06, "loss": 0.78311558, "memory(GiB)": 147.13, "step": 40170, "train_speed(iter/s)": 0.201084 }, { "acc": 0.78048811, "epoch": 0.9374298254567724, "grad_norm": 7.125, "learning_rate": 5.746607862351955e-06, "loss": 0.78884706, "memory(GiB)": 147.13, "step": 40180, "train_speed(iter/s)": 0.201109 }, { "acc": 0.76181178, "epoch": 0.9376631330290613, "grad_norm": 5.09375, "learning_rate": 5.744739876891038e-06, "loss": 0.85814571, "memory(GiB)": 147.13, "step": 40190, "train_speed(iter/s)": 0.201136 }, { "acc": 0.77918482, "epoch": 0.9378964406013502, "grad_norm": 4.6875, "learning_rate": 5.742871785118721e-06, "loss": 0.76165657, "memory(GiB)": 147.13, "step": 40200, "train_speed(iter/s)": 0.201162 }, { "acc": 0.8000536, "epoch": 0.9381297481736391, "grad_norm": 13.25, "learning_rate": 5.741003587301673e-06, "loss": 0.71440544, "memory(GiB)": 147.13, "step": 40210, "train_speed(iter/s)": 0.201187 }, { "acc": 0.77322521, "epoch": 0.938363055745928, "grad_norm": 4.90625, "learning_rate": 5.739135283706576e-06, "loss": 0.82055111, "memory(GiB)": 147.13, "step": 40220, "train_speed(iter/s)": 0.201214 }, { "acc": 0.77559209, "epoch": 0.9385963633182169, "grad_norm": 5.875, "learning_rate": 5.737266874600134e-06, "loss": 0.79832792, "memory(GiB)": 147.13, "step": 40230, "train_speed(iter/s)": 0.20124 }, { "acc": 0.76249466, "epoch": 0.9388296708905058, "grad_norm": 4.59375, "learning_rate": 5.735398360249059e-06, "loss": 0.84215164, "memory(GiB)": 147.13, "step": 40240, "train_speed(iter/s)": 0.201266 }, { "acc": 0.79150157, "epoch": 0.9390629784627947, "grad_norm": 6.25, "learning_rate": 5.733529740920083e-06, "loss": 0.73628078, "memory(GiB)": 147.13, "step": 40250, "train_speed(iter/s)": 0.201291 }, { "acc": 0.77681289, "epoch": 0.9392962860350836, "grad_norm": 6.0625, "learning_rate": 5.731661016879948e-06, "loss": 0.79612532, "memory(GiB)": 147.13, "step": 40260, "train_speed(iter/s)": 0.201318 }, { "acc": 0.7825942, "epoch": 0.9395295936073725, "grad_norm": 4.9375, "learning_rate": 5.729792188395415e-06, "loss": 0.7699985, "memory(GiB)": 147.13, "step": 40270, "train_speed(iter/s)": 0.201342 }, { "acc": 0.76840849, "epoch": 0.9397629011796614, "grad_norm": 5.34375, "learning_rate": 5.7279232557332595e-06, "loss": 0.82999878, "memory(GiB)": 147.13, "step": 40280, "train_speed(iter/s)": 0.201367 }, { "acc": 0.77449684, "epoch": 0.9399962087519503, "grad_norm": 5.34375, "learning_rate": 5.726054219160273e-06, "loss": 0.81352596, "memory(GiB)": 147.13, "step": 40290, "train_speed(iter/s)": 0.201392 }, { "acc": 0.77998219, "epoch": 0.9402295163242392, "grad_norm": 4.65625, "learning_rate": 5.7241850789432555e-06, "loss": 0.79409661, "memory(GiB)": 147.13, "step": 40300, "train_speed(iter/s)": 0.201417 }, { "acc": 0.78303413, "epoch": 0.9404628238965281, "grad_norm": 5.8125, "learning_rate": 5.722315835349029e-06, "loss": 0.76099334, "memory(GiB)": 147.13, "step": 40310, "train_speed(iter/s)": 0.201444 }, { "acc": 0.78695984, "epoch": 0.940696131468817, "grad_norm": 5.59375, "learning_rate": 5.7204464886444265e-06, "loss": 0.76036892, "memory(GiB)": 147.13, "step": 40320, "train_speed(iter/s)": 0.201469 }, { "acc": 0.78353043, "epoch": 0.9409294390411059, "grad_norm": 5.25, "learning_rate": 5.718577039096297e-06, "loss": 0.78870249, "memory(GiB)": 147.13, "step": 40330, "train_speed(iter/s)": 0.201495 }, { "acc": 0.78347793, "epoch": 0.9411627466133947, "grad_norm": 5.25, "learning_rate": 5.7167074869715045e-06, "loss": 0.76787477, "memory(GiB)": 147.13, "step": 40340, "train_speed(iter/s)": 0.20152 }, { "acc": 0.79285727, "epoch": 0.9413960541856836, "grad_norm": 7.1875, "learning_rate": 5.714837832536926e-06, "loss": 0.73996181, "memory(GiB)": 147.13, "step": 40350, "train_speed(iter/s)": 0.201547 }, { "acc": 0.77512045, "epoch": 0.9416293617579725, "grad_norm": 5.78125, "learning_rate": 5.712968076059454e-06, "loss": 0.81619711, "memory(GiB)": 147.13, "step": 40360, "train_speed(iter/s)": 0.201572 }, { "acc": 0.78800678, "epoch": 0.9418626693302614, "grad_norm": 5.375, "learning_rate": 5.711098217805997e-06, "loss": 0.74943008, "memory(GiB)": 147.13, "step": 40370, "train_speed(iter/s)": 0.201597 }, { "acc": 0.77038002, "epoch": 0.9420959769025503, "grad_norm": 5.125, "learning_rate": 5.709228258043476e-06, "loss": 0.82832499, "memory(GiB)": 147.13, "step": 40380, "train_speed(iter/s)": 0.201622 }, { "acc": 0.7822566, "epoch": 0.9423292844748392, "grad_norm": 4.625, "learning_rate": 5.707358197038827e-06, "loss": 0.78541126, "memory(GiB)": 147.13, "step": 40390, "train_speed(iter/s)": 0.201646 }, { "acc": 0.77454891, "epoch": 0.9425625920471281, "grad_norm": 5.71875, "learning_rate": 5.7054880350590015e-06, "loss": 0.83365231, "memory(GiB)": 147.13, "step": 40400, "train_speed(iter/s)": 0.201673 }, { "acc": 0.77726865, "epoch": 0.942795899619417, "grad_norm": 4.96875, "learning_rate": 5.703617772370963e-06, "loss": 0.79805484, "memory(GiB)": 147.13, "step": 40410, "train_speed(iter/s)": 0.201698 }, { "acc": 0.7605464, "epoch": 0.9430292071917059, "grad_norm": 5.3125, "learning_rate": 5.701747409241691e-06, "loss": 0.85971928, "memory(GiB)": 147.13, "step": 40420, "train_speed(iter/s)": 0.201723 }, { "acc": 0.79386868, "epoch": 0.9432625147639948, "grad_norm": 6.03125, "learning_rate": 5.699876945938182e-06, "loss": 0.74754467, "memory(GiB)": 147.13, "step": 40430, "train_speed(iter/s)": 0.201748 }, { "acc": 0.7801425, "epoch": 0.9434958223362837, "grad_norm": 4.90625, "learning_rate": 5.698006382727441e-06, "loss": 0.7978281, "memory(GiB)": 147.13, "step": 40440, "train_speed(iter/s)": 0.201775 }, { "acc": 0.74835367, "epoch": 0.9437291299085726, "grad_norm": 8.5, "learning_rate": 5.696135719876492e-06, "loss": 0.92766171, "memory(GiB)": 147.13, "step": 40450, "train_speed(iter/s)": 0.201803 }, { "acc": 0.78347101, "epoch": 0.9439624374808615, "grad_norm": 5.0, "learning_rate": 5.694264957652373e-06, "loss": 0.75946207, "memory(GiB)": 147.13, "step": 40460, "train_speed(iter/s)": 0.201827 }, { "acc": 0.78046608, "epoch": 0.9441957450531504, "grad_norm": 5.59375, "learning_rate": 5.692394096322131e-06, "loss": 0.81162415, "memory(GiB)": 147.13, "step": 40470, "train_speed(iter/s)": 0.201854 }, { "acc": 0.77506027, "epoch": 0.9444290526254393, "grad_norm": 5.09375, "learning_rate": 5.690523136152834e-06, "loss": 0.83319483, "memory(GiB)": 147.13, "step": 40480, "train_speed(iter/s)": 0.201879 }, { "acc": 0.77044592, "epoch": 0.9446623601977282, "grad_norm": 5.75, "learning_rate": 5.688652077411558e-06, "loss": 0.82162943, "memory(GiB)": 147.13, "step": 40490, "train_speed(iter/s)": 0.201904 }, { "acc": 0.78238449, "epoch": 0.9448956677700171, "grad_norm": 6.28125, "learning_rate": 5.6867809203654004e-06, "loss": 0.78513522, "memory(GiB)": 147.13, "step": 40500, "train_speed(iter/s)": 0.201929 }, { "epoch": 0.9448956677700171, "eval_acc": 0.7436441556723647, "eval_loss": 0.8078497052192688, "eval_runtime": 1270.4605, "eval_samples_per_second": 28.329, "eval_steps_per_second": 14.165, "step": 40500 }, { "acc": 0.77993817, "epoch": 0.945128975342306, "grad_norm": 6.125, "learning_rate": 5.684909665281465e-06, "loss": 0.7786458, "memory(GiB)": 147.13, "step": 40510, "train_speed(iter/s)": 0.200667 }, { "acc": 0.7921401, "epoch": 0.9453622829145949, "grad_norm": 5.0, "learning_rate": 5.683038312426873e-06, "loss": 0.73767538, "memory(GiB)": 147.13, "step": 40520, "train_speed(iter/s)": 0.200693 }, { "acc": 0.77997236, "epoch": 0.9455955904868837, "grad_norm": 4.9375, "learning_rate": 5.681166862068761e-06, "loss": 0.80369387, "memory(GiB)": 147.13, "step": 40530, "train_speed(iter/s)": 0.200719 }, { "acc": 0.75641575, "epoch": 0.9458288980591726, "grad_norm": 4.34375, "learning_rate": 5.679295314474278e-06, "loss": 0.89748631, "memory(GiB)": 147.13, "step": 40540, "train_speed(iter/s)": 0.200743 }, { "acc": 0.77325706, "epoch": 0.9460622056314615, "grad_norm": 4.125, "learning_rate": 5.677423669910584e-06, "loss": 0.82863417, "memory(GiB)": 147.13, "step": 40550, "train_speed(iter/s)": 0.20077 }, { "acc": 0.7895503, "epoch": 0.9462955132037504, "grad_norm": 5.53125, "learning_rate": 5.67555192864486e-06, "loss": 0.74407339, "memory(GiB)": 147.13, "step": 40560, "train_speed(iter/s)": 0.200795 }, { "acc": 0.77496853, "epoch": 0.9465288207760393, "grad_norm": 11.0, "learning_rate": 5.673680090944294e-06, "loss": 0.81249332, "memory(GiB)": 147.13, "step": 40570, "train_speed(iter/s)": 0.200821 }, { "acc": 0.79530725, "epoch": 0.9467621283483282, "grad_norm": 9.1875, "learning_rate": 5.671808157076091e-06, "loss": 0.72718515, "memory(GiB)": 147.13, "step": 40580, "train_speed(iter/s)": 0.200843 }, { "acc": 0.76706448, "epoch": 0.9469954359206171, "grad_norm": 4.0, "learning_rate": 5.669936127307468e-06, "loss": 0.85577278, "memory(GiB)": 147.13, "step": 40590, "train_speed(iter/s)": 0.200868 }, { "acc": 0.77201781, "epoch": 0.947228743492906, "grad_norm": 6.84375, "learning_rate": 5.668064001905658e-06, "loss": 0.82298203, "memory(GiB)": 147.13, "step": 40600, "train_speed(iter/s)": 0.200893 }, { "acc": 0.76372414, "epoch": 0.9474620510651949, "grad_norm": 5.40625, "learning_rate": 5.666191781137905e-06, "loss": 0.85342846, "memory(GiB)": 147.13, "step": 40610, "train_speed(iter/s)": 0.200919 }, { "acc": 0.77155447, "epoch": 0.9476953586374838, "grad_norm": 6.09375, "learning_rate": 5.66431946527147e-06, "loss": 0.80941982, "memory(GiB)": 147.13, "step": 40620, "train_speed(iter/s)": 0.200945 }, { "acc": 0.78885984, "epoch": 0.9479286662097727, "grad_norm": 4.8125, "learning_rate": 5.662447054573624e-06, "loss": 0.76518712, "memory(GiB)": 147.13, "step": 40630, "train_speed(iter/s)": 0.20097 }, { "acc": 0.7786191, "epoch": 0.9481619737820616, "grad_norm": 4.3125, "learning_rate": 5.660574549311653e-06, "loss": 0.80177717, "memory(GiB)": 147.13, "step": 40640, "train_speed(iter/s)": 0.200994 }, { "acc": 0.77824173, "epoch": 0.9483952813543505, "grad_norm": 5.71875, "learning_rate": 5.658701949752856e-06, "loss": 0.79966569, "memory(GiB)": 147.13, "step": 40650, "train_speed(iter/s)": 0.201019 }, { "acc": 0.77022877, "epoch": 0.9486285889266394, "grad_norm": 5.09375, "learning_rate": 5.656829256164549e-06, "loss": 0.85357208, "memory(GiB)": 147.13, "step": 40660, "train_speed(iter/s)": 0.201045 }, { "acc": 0.78125162, "epoch": 0.9488618964989283, "grad_norm": 5.28125, "learning_rate": 5.6549564688140555e-06, "loss": 0.76331964, "memory(GiB)": 147.13, "step": 40670, "train_speed(iter/s)": 0.201072 }, { "acc": 0.79808364, "epoch": 0.9490952040712172, "grad_norm": 4.8125, "learning_rate": 5.653083587968716e-06, "loss": 0.71971464, "memory(GiB)": 147.13, "step": 40680, "train_speed(iter/s)": 0.201099 }, { "acc": 0.77893338, "epoch": 0.9493285116435061, "grad_norm": 6.21875, "learning_rate": 5.651210613895885e-06, "loss": 0.78710918, "memory(GiB)": 147.13, "step": 40690, "train_speed(iter/s)": 0.201125 }, { "acc": 0.77154942, "epoch": 0.949561819215795, "grad_norm": 7.03125, "learning_rate": 5.649337546862927e-06, "loss": 0.81971178, "memory(GiB)": 147.13, "step": 40700, "train_speed(iter/s)": 0.201151 }, { "acc": 0.7884603, "epoch": 0.9497951267880839, "grad_norm": 4.5, "learning_rate": 5.647464387137224e-06, "loss": 0.75167475, "memory(GiB)": 147.13, "step": 40710, "train_speed(iter/s)": 0.201176 }, { "acc": 0.77500906, "epoch": 0.9500284343603727, "grad_norm": 6.5, "learning_rate": 5.645591134986166e-06, "loss": 0.81845722, "memory(GiB)": 147.13, "step": 40720, "train_speed(iter/s)": 0.201201 }, { "acc": 0.78605471, "epoch": 0.9502617419326616, "grad_norm": 5.09375, "learning_rate": 5.643717790677162e-06, "loss": 0.7811121, "memory(GiB)": 147.13, "step": 40730, "train_speed(iter/s)": 0.201226 }, { "acc": 0.77256351, "epoch": 0.9504950495049505, "grad_norm": 5.46875, "learning_rate": 5.641844354477631e-06, "loss": 0.84232235, "memory(GiB)": 147.13, "step": 40740, "train_speed(iter/s)": 0.201251 }, { "acc": 0.76221151, "epoch": 0.9507283570772393, "grad_norm": 6.34375, "learning_rate": 5.639970826655005e-06, "loss": 0.85408916, "memory(GiB)": 147.13, "step": 40750, "train_speed(iter/s)": 0.201277 }, { "acc": 0.78343468, "epoch": 0.9509616646495282, "grad_norm": 6.96875, "learning_rate": 5.63809720747673e-06, "loss": 0.76852407, "memory(GiB)": 147.13, "step": 40760, "train_speed(iter/s)": 0.201302 }, { "acc": 0.78977461, "epoch": 0.9511949722218171, "grad_norm": 7.5, "learning_rate": 5.636223497210261e-06, "loss": 0.74346333, "memory(GiB)": 147.13, "step": 40770, "train_speed(iter/s)": 0.201328 }, { "acc": 0.7746685, "epoch": 0.951428279794106, "grad_norm": 7.0625, "learning_rate": 5.634349696123075e-06, "loss": 0.81890717, "memory(GiB)": 147.13, "step": 40780, "train_speed(iter/s)": 0.201354 }, { "acc": 0.77995763, "epoch": 0.951661587366395, "grad_norm": 15.1875, "learning_rate": 5.6324758044826535e-06, "loss": 0.79575968, "memory(GiB)": 147.13, "step": 40790, "train_speed(iter/s)": 0.201378 }, { "acc": 0.78342981, "epoch": 0.9518948949386838, "grad_norm": 4.0, "learning_rate": 5.6306018225564955e-06, "loss": 0.79688606, "memory(GiB)": 147.13, "step": 40800, "train_speed(iter/s)": 0.201404 }, { "acc": 0.77644749, "epoch": 0.9521282025109727, "grad_norm": 7.59375, "learning_rate": 5.6287277506121084e-06, "loss": 0.80725603, "memory(GiB)": 147.13, "step": 40810, "train_speed(iter/s)": 0.20143 }, { "acc": 0.78804812, "epoch": 0.9523615100832616, "grad_norm": 5.0, "learning_rate": 5.626853588917021e-06, "loss": 0.7547946, "memory(GiB)": 147.13, "step": 40820, "train_speed(iter/s)": 0.201454 }, { "acc": 0.76683331, "epoch": 0.9525948176555505, "grad_norm": 4.78125, "learning_rate": 5.624979337738763e-06, "loss": 0.8316308, "memory(GiB)": 147.13, "step": 40830, "train_speed(iter/s)": 0.20148 }, { "acc": 0.77537379, "epoch": 0.9528281252278394, "grad_norm": 4.9375, "learning_rate": 5.623104997344886e-06, "loss": 0.80050964, "memory(GiB)": 147.13, "step": 40840, "train_speed(iter/s)": 0.201507 }, { "acc": 0.77048178, "epoch": 0.9530614328001283, "grad_norm": 7.84375, "learning_rate": 5.621230568002952e-06, "loss": 0.84840851, "memory(GiB)": 147.13, "step": 40850, "train_speed(iter/s)": 0.201531 }, { "acc": 0.77276115, "epoch": 0.9532947403724172, "grad_norm": 6.125, "learning_rate": 5.619356049980536e-06, "loss": 0.84306707, "memory(GiB)": 147.13, "step": 40860, "train_speed(iter/s)": 0.201556 }, { "acc": 0.75870538, "epoch": 0.9535280479447061, "grad_norm": 5.21875, "learning_rate": 5.617481443545223e-06, "loss": 0.88285389, "memory(GiB)": 147.13, "step": 40870, "train_speed(iter/s)": 0.201582 }, { "acc": 0.78115153, "epoch": 0.953761355516995, "grad_norm": 5.3125, "learning_rate": 5.615606748964613e-06, "loss": 0.77917652, "memory(GiB)": 147.13, "step": 40880, "train_speed(iter/s)": 0.201608 }, { "acc": 0.77769594, "epoch": 0.9539946630892839, "grad_norm": 7.40625, "learning_rate": 5.613731966506321e-06, "loss": 0.78834753, "memory(GiB)": 147.13, "step": 40890, "train_speed(iter/s)": 0.201632 }, { "acc": 0.78029628, "epoch": 0.9542279706615728, "grad_norm": 7.28125, "learning_rate": 5.611857096437966e-06, "loss": 0.80404196, "memory(GiB)": 147.13, "step": 40900, "train_speed(iter/s)": 0.201658 }, { "acc": 0.78504705, "epoch": 0.9544612782338617, "grad_norm": 5.34375, "learning_rate": 5.60998213902719e-06, "loss": 0.78944445, "memory(GiB)": 147.13, "step": 40910, "train_speed(iter/s)": 0.201684 }, { "acc": 0.78178701, "epoch": 0.9546945858061505, "grad_norm": 6.03125, "learning_rate": 5.60810709454164e-06, "loss": 0.791785, "memory(GiB)": 147.13, "step": 40920, "train_speed(iter/s)": 0.20171 }, { "acc": 0.77981234, "epoch": 0.9549278933784394, "grad_norm": 6.3125, "learning_rate": 5.606231963248978e-06, "loss": 0.80088806, "memory(GiB)": 147.13, "step": 40930, "train_speed(iter/s)": 0.201736 }, { "acc": 0.79295478, "epoch": 0.9551612009507283, "grad_norm": 3.71875, "learning_rate": 5.60435674541688e-06, "loss": 0.7574069, "memory(GiB)": 147.13, "step": 40940, "train_speed(iter/s)": 0.20176 }, { "acc": 0.77834711, "epoch": 0.9553945085230172, "grad_norm": 6.34375, "learning_rate": 5.602481441313032e-06, "loss": 0.78354673, "memory(GiB)": 147.13, "step": 40950, "train_speed(iter/s)": 0.201786 }, { "acc": 0.79002304, "epoch": 0.9556278160953061, "grad_norm": 4.09375, "learning_rate": 5.6006060512051355e-06, "loss": 0.75721602, "memory(GiB)": 147.13, "step": 40960, "train_speed(iter/s)": 0.20181 }, { "acc": 0.78958197, "epoch": 0.955861123667595, "grad_norm": 6.34375, "learning_rate": 5.598730575360898e-06, "loss": 0.77207351, "memory(GiB)": 147.13, "step": 40970, "train_speed(iter/s)": 0.201836 }, { "acc": 0.78223519, "epoch": 0.9560944312398839, "grad_norm": 8.1875, "learning_rate": 5.596855014048045e-06, "loss": 0.77432485, "memory(GiB)": 147.13, "step": 40980, "train_speed(iter/s)": 0.201861 }, { "acc": 0.76258755, "epoch": 0.9563277388121728, "grad_norm": 5.375, "learning_rate": 5.594979367534311e-06, "loss": 0.84754496, "memory(GiB)": 147.13, "step": 40990, "train_speed(iter/s)": 0.201888 }, { "acc": 0.79521761, "epoch": 0.9565610463844617, "grad_norm": 4.3125, "learning_rate": 5.593103636087446e-06, "loss": 0.74911346, "memory(GiB)": 147.13, "step": 41000, "train_speed(iter/s)": 0.201913 }, { "epoch": 0.9565610463844617, "eval_acc": 0.7435477488445194, "eval_loss": 0.8075899481773376, "eval_runtime": 1270.0956, "eval_samples_per_second": 28.337, "eval_steps_per_second": 14.169, "step": 41000 }, { "acc": 0.78175507, "epoch": 0.9567943539567506, "grad_norm": 5.21875, "learning_rate": 5.591227819975209e-06, "loss": 0.77779932, "memory(GiB)": 147.13, "step": 41010, "train_speed(iter/s)": 0.200667 }, { "acc": 0.77938576, "epoch": 0.9570276615290395, "grad_norm": 5.59375, "learning_rate": 5.589351919465373e-06, "loss": 0.80347643, "memory(GiB)": 147.13, "step": 41020, "train_speed(iter/s)": 0.200691 }, { "acc": 0.77089553, "epoch": 0.9572609691013284, "grad_norm": 5.03125, "learning_rate": 5.587475934825721e-06, "loss": 0.84662561, "memory(GiB)": 147.13, "step": 41030, "train_speed(iter/s)": 0.200716 }, { "acc": 0.77880993, "epoch": 0.9574942766736173, "grad_norm": 5.3125, "learning_rate": 5.585599866324052e-06, "loss": 0.77378798, "memory(GiB)": 147.13, "step": 41040, "train_speed(iter/s)": 0.200739 }, { "acc": 0.77963066, "epoch": 0.9577275842459062, "grad_norm": 5.46875, "learning_rate": 5.583723714228169e-06, "loss": 0.78790627, "memory(GiB)": 147.13, "step": 41050, "train_speed(iter/s)": 0.200765 }, { "acc": 0.78368587, "epoch": 0.9579608918181951, "grad_norm": 5.71875, "learning_rate": 5.581847478805898e-06, "loss": 0.78440642, "memory(GiB)": 147.13, "step": 41060, "train_speed(iter/s)": 0.20079 }, { "acc": 0.79512386, "epoch": 0.958194199390484, "grad_norm": 5.15625, "learning_rate": 5.579971160325066e-06, "loss": 0.73103809, "memory(GiB)": 147.13, "step": 41070, "train_speed(iter/s)": 0.200817 }, { "acc": 0.79529629, "epoch": 0.9584275069627729, "grad_norm": 5.90625, "learning_rate": 5.578094759053521e-06, "loss": 0.73167953, "memory(GiB)": 147.13, "step": 41080, "train_speed(iter/s)": 0.200842 }, { "acc": 0.77233238, "epoch": 0.9586608145350618, "grad_norm": 5.65625, "learning_rate": 5.576218275259116e-06, "loss": 0.83091927, "memory(GiB)": 147.13, "step": 41090, "train_speed(iter/s)": 0.200868 }, { "acc": 0.76945944, "epoch": 0.9588941221073507, "grad_norm": 7.71875, "learning_rate": 5.574341709209721e-06, "loss": 0.83968544, "memory(GiB)": 147.13, "step": 41100, "train_speed(iter/s)": 0.200893 }, { "acc": 0.78422494, "epoch": 0.9591274296796395, "grad_norm": 5.71875, "learning_rate": 5.572465061173215e-06, "loss": 0.76671977, "memory(GiB)": 147.13, "step": 41110, "train_speed(iter/s)": 0.200919 }, { "acc": 0.76734838, "epoch": 0.9593607372519284, "grad_norm": 4.75, "learning_rate": 5.5705883314174845e-06, "loss": 0.83322086, "memory(GiB)": 147.13, "step": 41120, "train_speed(iter/s)": 0.200946 }, { "acc": 0.77143869, "epoch": 0.9595940448242173, "grad_norm": 4.59375, "learning_rate": 5.568711520210437e-06, "loss": 0.83908682, "memory(GiB)": 147.13, "step": 41130, "train_speed(iter/s)": 0.200972 }, { "acc": 0.77202053, "epoch": 0.9598273523965062, "grad_norm": 5.34375, "learning_rate": 5.566834627819986e-06, "loss": 0.8184268, "memory(GiB)": 147.13, "step": 41140, "train_speed(iter/s)": 0.200998 }, { "acc": 0.77577133, "epoch": 0.9600606599687951, "grad_norm": 4.875, "learning_rate": 5.564957654514055e-06, "loss": 0.80979881, "memory(GiB)": 147.13, "step": 41150, "train_speed(iter/s)": 0.201022 }, { "acc": 0.77774215, "epoch": 0.960293967541084, "grad_norm": 3.46875, "learning_rate": 5.563080600560584e-06, "loss": 0.80543184, "memory(GiB)": 147.13, "step": 41160, "train_speed(iter/s)": 0.201048 }, { "acc": 0.77905116, "epoch": 0.9605272751133729, "grad_norm": 5.46875, "learning_rate": 5.5612034662275205e-06, "loss": 0.78646588, "memory(GiB)": 147.13, "step": 41170, "train_speed(iter/s)": 0.201072 }, { "acc": 0.77790079, "epoch": 0.9607605826856618, "grad_norm": 4.375, "learning_rate": 5.559326251782825e-06, "loss": 0.78109341, "memory(GiB)": 147.13, "step": 41180, "train_speed(iter/s)": 0.201096 }, { "acc": 0.7724268, "epoch": 0.9609938902579507, "grad_norm": 5.03125, "learning_rate": 5.55744895749447e-06, "loss": 0.82191772, "memory(GiB)": 147.13, "step": 41190, "train_speed(iter/s)": 0.201123 }, { "acc": 0.78639202, "epoch": 0.9612271978302396, "grad_norm": 4.53125, "learning_rate": 5.555571583630439e-06, "loss": 0.75079656, "memory(GiB)": 147.13, "step": 41200, "train_speed(iter/s)": 0.201149 }, { "acc": 0.78967714, "epoch": 0.9614605054025285, "grad_norm": 4.78125, "learning_rate": 5.553694130458725e-06, "loss": 0.77342958, "memory(GiB)": 147.13, "step": 41210, "train_speed(iter/s)": 0.201173 }, { "acc": 0.77171679, "epoch": 0.9616938129748174, "grad_norm": 6.59375, "learning_rate": 5.551816598247334e-06, "loss": 0.81496878, "memory(GiB)": 147.13, "step": 41220, "train_speed(iter/s)": 0.201198 }, { "acc": 0.79381685, "epoch": 0.9619271205471063, "grad_norm": 4.84375, "learning_rate": 5.549938987264284e-06, "loss": 0.73036442, "memory(GiB)": 147.13, "step": 41230, "train_speed(iter/s)": 0.201223 }, { "acc": 0.79161181, "epoch": 0.9621604281193952, "grad_norm": 4.71875, "learning_rate": 5.548061297777604e-06, "loss": 0.73119068, "memory(GiB)": 147.13, "step": 41240, "train_speed(iter/s)": 0.201249 }, { "acc": 0.8105217, "epoch": 0.9623937356916841, "grad_norm": 5.0, "learning_rate": 5.546183530055334e-06, "loss": 0.68218474, "memory(GiB)": 147.13, "step": 41250, "train_speed(iter/s)": 0.201274 }, { "acc": 0.75500202, "epoch": 0.962627043263973, "grad_norm": 7.15625, "learning_rate": 5.544305684365522e-06, "loss": 0.89543095, "memory(GiB)": 147.13, "step": 41260, "train_speed(iter/s)": 0.2013 }, { "acc": 0.77084942, "epoch": 0.9628603508362619, "grad_norm": 6.6875, "learning_rate": 5.542427760976232e-06, "loss": 0.82826271, "memory(GiB)": 147.13, "step": 41270, "train_speed(iter/s)": 0.201325 }, { "acc": 0.76677494, "epoch": 0.9630936584085508, "grad_norm": 8.4375, "learning_rate": 5.540549760155537e-06, "loss": 0.86246529, "memory(GiB)": 147.13, "step": 41280, "train_speed(iter/s)": 0.201348 }, { "acc": 0.79330645, "epoch": 0.9633269659808397, "grad_norm": 4.40625, "learning_rate": 5.53867168217152e-06, "loss": 0.74018192, "memory(GiB)": 147.13, "step": 41290, "train_speed(iter/s)": 0.201371 }, { "acc": 0.77103343, "epoch": 0.9635602735531285, "grad_norm": 5.84375, "learning_rate": 5.536793527292278e-06, "loss": 0.83305159, "memory(GiB)": 147.13, "step": 41300, "train_speed(iter/s)": 0.201395 }, { "acc": 0.76550531, "epoch": 0.9637935811254174, "grad_norm": 5.90625, "learning_rate": 5.5349152957859155e-06, "loss": 0.83599548, "memory(GiB)": 147.13, "step": 41310, "train_speed(iter/s)": 0.201419 }, { "acc": 0.77552824, "epoch": 0.9640268886977063, "grad_norm": 4.03125, "learning_rate": 5.53303698792055e-06, "loss": 0.82170572, "memory(GiB)": 147.13, "step": 41320, "train_speed(iter/s)": 0.201445 }, { "acc": 0.7870533, "epoch": 0.9642601962699952, "grad_norm": 8.375, "learning_rate": 5.531158603964309e-06, "loss": 0.75256863, "memory(GiB)": 147.13, "step": 41330, "train_speed(iter/s)": 0.20147 }, { "acc": 0.77868299, "epoch": 0.964493503842284, "grad_norm": 5.25, "learning_rate": 5.529280144185331e-06, "loss": 0.79275427, "memory(GiB)": 147.13, "step": 41340, "train_speed(iter/s)": 0.201495 }, { "acc": 0.77070007, "epoch": 0.964726811414573, "grad_norm": 4.53125, "learning_rate": 5.5274016088517676e-06, "loss": 0.82578239, "memory(GiB)": 147.13, "step": 41350, "train_speed(iter/s)": 0.201519 }, { "acc": 0.78871584, "epoch": 0.9649601189868618, "grad_norm": 5.125, "learning_rate": 5.525522998231777e-06, "loss": 0.76853447, "memory(GiB)": 147.13, "step": 41360, "train_speed(iter/s)": 0.201544 }, { "acc": 0.78402672, "epoch": 0.9651934265591507, "grad_norm": 4.65625, "learning_rate": 5.523644312593533e-06, "loss": 0.76367111, "memory(GiB)": 147.13, "step": 41370, "train_speed(iter/s)": 0.20157 }, { "acc": 0.79334326, "epoch": 0.9654267341314396, "grad_norm": 5.09375, "learning_rate": 5.521765552205213e-06, "loss": 0.734587, "memory(GiB)": 147.13, "step": 41380, "train_speed(iter/s)": 0.201596 }, { "acc": 0.7742136, "epoch": 0.9656600417037285, "grad_norm": 4.78125, "learning_rate": 5.519886717335012e-06, "loss": 0.82144842, "memory(GiB)": 147.13, "step": 41390, "train_speed(iter/s)": 0.201618 }, { "acc": 0.75570812, "epoch": 0.9658933492760174, "grad_norm": 5.75, "learning_rate": 5.518007808251135e-06, "loss": 0.89175406, "memory(GiB)": 147.13, "step": 41400, "train_speed(iter/s)": 0.201643 }, { "acc": 0.7830318, "epoch": 0.9661266568483063, "grad_norm": 5.34375, "learning_rate": 5.516128825221792e-06, "loss": 0.77546549, "memory(GiB)": 147.13, "step": 41410, "train_speed(iter/s)": 0.201668 }, { "acc": 0.77321138, "epoch": 0.9663599644205952, "grad_norm": 4.625, "learning_rate": 5.514249768515209e-06, "loss": 0.80514297, "memory(GiB)": 147.13, "step": 41420, "train_speed(iter/s)": 0.201691 }, { "acc": 0.78639116, "epoch": 0.9665932719928841, "grad_norm": 6.0625, "learning_rate": 5.512370638399622e-06, "loss": 0.77311621, "memory(GiB)": 147.13, "step": 41430, "train_speed(iter/s)": 0.201717 }, { "acc": 0.78260765, "epoch": 0.966826579565173, "grad_norm": 5.40625, "learning_rate": 5.510491435143275e-06, "loss": 0.76550779, "memory(GiB)": 147.13, "step": 41440, "train_speed(iter/s)": 0.201741 }, { "acc": 0.79104238, "epoch": 0.9670598871374619, "grad_norm": 7.21875, "learning_rate": 5.508612159014424e-06, "loss": 0.7574954, "memory(GiB)": 147.13, "step": 41450, "train_speed(iter/s)": 0.201766 }, { "acc": 0.77882171, "epoch": 0.9672931947097508, "grad_norm": 5.34375, "learning_rate": 5.506732810281335e-06, "loss": 0.80786209, "memory(GiB)": 147.13, "step": 41460, "train_speed(iter/s)": 0.201792 }, { "acc": 0.80258312, "epoch": 0.9675265022820397, "grad_norm": 3.875, "learning_rate": 5.504853389212285e-06, "loss": 0.72713561, "memory(GiB)": 147.13, "step": 41470, "train_speed(iter/s)": 0.201817 }, { "acc": 0.79807262, "epoch": 0.9677598098543286, "grad_norm": 5.125, "learning_rate": 5.502973896075559e-06, "loss": 0.72810946, "memory(GiB)": 147.13, "step": 41480, "train_speed(iter/s)": 0.201842 }, { "acc": 0.77658319, "epoch": 0.9679931174266175, "grad_norm": 6.90625, "learning_rate": 5.501094331139457e-06, "loss": 0.79982195, "memory(GiB)": 147.13, "step": 41490, "train_speed(iter/s)": 0.201866 }, { "acc": 0.75606503, "epoch": 0.9682264249989063, "grad_norm": 5.1875, "learning_rate": 5.499214694672283e-06, "loss": 0.88801994, "memory(GiB)": 147.13, "step": 41500, "train_speed(iter/s)": 0.201891 }, { "epoch": 0.9682264249989063, "eval_acc": 0.7436978039901857, "eval_loss": 0.8076462745666504, "eval_runtime": 1271.1744, "eval_samples_per_second": 28.313, "eval_steps_per_second": 14.157, "step": 41500 }, { "acc": 0.78003526, "epoch": 0.9684597325711952, "grad_norm": 5.21875, "learning_rate": 5.497334986942358e-06, "loss": 0.77953706, "memory(GiB)": 147.13, "step": 41510, "train_speed(iter/s)": 0.20066 }, { "acc": 0.78291988, "epoch": 0.9686930401434841, "grad_norm": 5.1875, "learning_rate": 5.495455208218008e-06, "loss": 0.7734293, "memory(GiB)": 147.13, "step": 41520, "train_speed(iter/s)": 0.200685 }, { "acc": 0.76620455, "epoch": 0.968926347715773, "grad_norm": 5.96875, "learning_rate": 5.493575358767571e-06, "loss": 0.82494164, "memory(GiB)": 147.13, "step": 41530, "train_speed(iter/s)": 0.200708 }, { "acc": 0.77570481, "epoch": 0.9691596552880619, "grad_norm": 6.125, "learning_rate": 5.491695438859394e-06, "loss": 0.79977193, "memory(GiB)": 147.13, "step": 41540, "train_speed(iter/s)": 0.200734 }, { "acc": 0.79774528, "epoch": 0.9693929628603508, "grad_norm": 4.875, "learning_rate": 5.489815448761837e-06, "loss": 0.72081814, "memory(GiB)": 147.13, "step": 41550, "train_speed(iter/s)": 0.200759 }, { "acc": 0.78871336, "epoch": 0.9696262704326397, "grad_norm": 6.8125, "learning_rate": 5.487935388743266e-06, "loss": 0.74919105, "memory(GiB)": 147.13, "step": 41560, "train_speed(iter/s)": 0.200783 }, { "acc": 0.76116972, "epoch": 0.9698595780049286, "grad_norm": 7.3125, "learning_rate": 5.486055259072059e-06, "loss": 0.86874237, "memory(GiB)": 147.13, "step": 41570, "train_speed(iter/s)": 0.200808 }, { "acc": 0.77257109, "epoch": 0.9700928855772175, "grad_norm": 5.21875, "learning_rate": 5.484175060016607e-06, "loss": 0.820648, "memory(GiB)": 147.13, "step": 41580, "train_speed(iter/s)": 0.200834 }, { "acc": 0.78412466, "epoch": 0.9703261931495064, "grad_norm": 5.78125, "learning_rate": 5.482294791845305e-06, "loss": 0.78075528, "memory(GiB)": 147.13, "step": 41590, "train_speed(iter/s)": 0.20086 }, { "acc": 0.79700346, "epoch": 0.9705595007217953, "grad_norm": 5.15625, "learning_rate": 5.480414454826563e-06, "loss": 0.7259798, "memory(GiB)": 147.13, "step": 41600, "train_speed(iter/s)": 0.200882 }, { "acc": 0.7838769, "epoch": 0.9707928082940842, "grad_norm": 4.65625, "learning_rate": 5.478534049228794e-06, "loss": 0.78670282, "memory(GiB)": 147.13, "step": 41610, "train_speed(iter/s)": 0.200909 }, { "acc": 0.77976408, "epoch": 0.9710261158663731, "grad_norm": 6.71875, "learning_rate": 5.476653575320432e-06, "loss": 0.79632206, "memory(GiB)": 147.13, "step": 41620, "train_speed(iter/s)": 0.200935 }, { "acc": 0.78164692, "epoch": 0.971259423438662, "grad_norm": 6.96875, "learning_rate": 5.474773033369908e-06, "loss": 0.77794366, "memory(GiB)": 147.13, "step": 41630, "train_speed(iter/s)": 0.200959 }, { "acc": 0.81677799, "epoch": 0.9714927310109509, "grad_norm": 5.84375, "learning_rate": 5.472892423645673e-06, "loss": 0.63997331, "memory(GiB)": 147.13, "step": 41640, "train_speed(iter/s)": 0.200985 }, { "acc": 0.77521987, "epoch": 0.9717260385832398, "grad_norm": 8.3125, "learning_rate": 5.47101174641618e-06, "loss": 0.81140451, "memory(GiB)": 147.13, "step": 41650, "train_speed(iter/s)": 0.20101 }, { "acc": 0.77916822, "epoch": 0.9719593461555287, "grad_norm": 7.21875, "learning_rate": 5.469131001949899e-06, "loss": 0.80082006, "memory(GiB)": 147.13, "step": 41660, "train_speed(iter/s)": 0.201035 }, { "acc": 0.76695161, "epoch": 0.9721926537278176, "grad_norm": 6.375, "learning_rate": 5.467250190515303e-06, "loss": 0.8524477, "memory(GiB)": 147.13, "step": 41670, "train_speed(iter/s)": 0.201061 }, { "acc": 0.79857035, "epoch": 0.9724259613001065, "grad_norm": 7.375, "learning_rate": 5.465369312380879e-06, "loss": 0.72482586, "memory(GiB)": 147.13, "step": 41680, "train_speed(iter/s)": 0.201086 }, { "acc": 0.77130175, "epoch": 0.9726592688723953, "grad_norm": 8.1875, "learning_rate": 5.463488367815119e-06, "loss": 0.8281208, "memory(GiB)": 147.13, "step": 41690, "train_speed(iter/s)": 0.201113 }, { "acc": 0.78064547, "epoch": 0.9728925764446842, "grad_norm": 5.65625, "learning_rate": 5.46160735708653e-06, "loss": 0.78882265, "memory(GiB)": 147.13, "step": 41700, "train_speed(iter/s)": 0.201139 }, { "acc": 0.79099216, "epoch": 0.9731258840169731, "grad_norm": 5.8125, "learning_rate": 5.459726280463625e-06, "loss": 0.75952692, "memory(GiB)": 147.13, "step": 41710, "train_speed(iter/s)": 0.201163 }, { "acc": 0.77816892, "epoch": 0.973359191589262, "grad_norm": 3.90625, "learning_rate": 5.4578451382149275e-06, "loss": 0.7955409, "memory(GiB)": 147.13, "step": 41720, "train_speed(iter/s)": 0.201188 }, { "acc": 0.79420767, "epoch": 0.9735924991615509, "grad_norm": 8.5625, "learning_rate": 5.455963930608969e-06, "loss": 0.75089536, "memory(GiB)": 147.13, "step": 41730, "train_speed(iter/s)": 0.201211 }, { "acc": 0.7890696, "epoch": 0.9738258067338398, "grad_norm": 7.65625, "learning_rate": 5.454082657914292e-06, "loss": 0.75170484, "memory(GiB)": 147.13, "step": 41740, "train_speed(iter/s)": 0.201236 }, { "acc": 0.79078021, "epoch": 0.9740591143061287, "grad_norm": 16.625, "learning_rate": 5.452201320399447e-06, "loss": 0.7524622, "memory(GiB)": 147.13, "step": 41750, "train_speed(iter/s)": 0.201261 }, { "acc": 0.80642824, "epoch": 0.9742924218784176, "grad_norm": 6.5, "learning_rate": 5.450319918332995e-06, "loss": 0.69903097, "memory(GiB)": 147.13, "step": 41760, "train_speed(iter/s)": 0.201285 }, { "acc": 0.79178162, "epoch": 0.9745257294507065, "grad_norm": 4.34375, "learning_rate": 5.448438451983507e-06, "loss": 0.74413743, "memory(GiB)": 147.13, "step": 41770, "train_speed(iter/s)": 0.201311 }, { "acc": 0.77486391, "epoch": 0.9747590370229954, "grad_norm": 4.0625, "learning_rate": 5.4465569216195576e-06, "loss": 0.82327709, "memory(GiB)": 147.13, "step": 41780, "train_speed(iter/s)": 0.201336 }, { "acc": 0.78499551, "epoch": 0.9749923445952843, "grad_norm": 6.21875, "learning_rate": 5.444675327509738e-06, "loss": 0.7698328, "memory(GiB)": 147.13, "step": 41790, "train_speed(iter/s)": 0.201362 }, { "acc": 0.78951769, "epoch": 0.9752256521675732, "grad_norm": 6.375, "learning_rate": 5.4427936699226455e-06, "loss": 0.75837417, "memory(GiB)": 147.13, "step": 41800, "train_speed(iter/s)": 0.201387 }, { "acc": 0.7888494, "epoch": 0.9754589597398621, "grad_norm": 9.125, "learning_rate": 5.440911949126885e-06, "loss": 0.75651331, "memory(GiB)": 147.13, "step": 41810, "train_speed(iter/s)": 0.201411 }, { "acc": 0.79928493, "epoch": 0.975692267312151, "grad_norm": 7.96875, "learning_rate": 5.4390301653910726e-06, "loss": 0.71884995, "memory(GiB)": 147.13, "step": 41820, "train_speed(iter/s)": 0.201435 }, { "acc": 0.76790247, "epoch": 0.9759255748844399, "grad_norm": 5.84375, "learning_rate": 5.4371483189838315e-06, "loss": 0.83689575, "memory(GiB)": 147.13, "step": 41830, "train_speed(iter/s)": 0.201459 }, { "acc": 0.78529429, "epoch": 0.9761588824567288, "grad_norm": 10.25, "learning_rate": 5.435266410173794e-06, "loss": 0.77676764, "memory(GiB)": 147.13, "step": 41840, "train_speed(iter/s)": 0.201483 }, { "acc": 0.75995874, "epoch": 0.9763921900290177, "grad_norm": 4.875, "learning_rate": 5.433384439229603e-06, "loss": 0.85082378, "memory(GiB)": 147.13, "step": 41850, "train_speed(iter/s)": 0.201509 }, { "acc": 0.7614418, "epoch": 0.9766254976013066, "grad_norm": 5.625, "learning_rate": 5.431502406419908e-06, "loss": 0.87694912, "memory(GiB)": 147.13, "step": 41860, "train_speed(iter/s)": 0.201535 }, { "acc": 0.80430679, "epoch": 0.9768588051735955, "grad_norm": 9.625, "learning_rate": 5.429620312013372e-06, "loss": 0.68969584, "memory(GiB)": 147.13, "step": 41870, "train_speed(iter/s)": 0.201561 }, { "acc": 0.76424179, "epoch": 0.9770921127458843, "grad_norm": 6.75, "learning_rate": 5.427738156278662e-06, "loss": 0.86263018, "memory(GiB)": 147.13, "step": 41880, "train_speed(iter/s)": 0.201587 }, { "acc": 0.77523551, "epoch": 0.9773254203181732, "grad_norm": 4.25, "learning_rate": 5.4258559394844515e-06, "loss": 0.80997353, "memory(GiB)": 147.13, "step": 41890, "train_speed(iter/s)": 0.201613 }, { "acc": 0.76072025, "epoch": 0.977558727890462, "grad_norm": 4.78125, "learning_rate": 5.423973661899431e-06, "loss": 0.87792006, "memory(GiB)": 147.13, "step": 41900, "train_speed(iter/s)": 0.201639 }, { "acc": 0.78689508, "epoch": 0.977792035462751, "grad_norm": 6.0, "learning_rate": 5.4220913237922936e-06, "loss": 0.76750278, "memory(GiB)": 147.13, "step": 41910, "train_speed(iter/s)": 0.201663 }, { "acc": 0.77254424, "epoch": 0.9780253430350399, "grad_norm": 11.1875, "learning_rate": 5.4202089254317415e-06, "loss": 0.83702393, "memory(GiB)": 147.13, "step": 41920, "train_speed(iter/s)": 0.20169 }, { "acc": 0.77660599, "epoch": 0.9782586506073288, "grad_norm": 5.21875, "learning_rate": 5.418326467086488e-06, "loss": 0.79623809, "memory(GiB)": 147.13, "step": 41930, "train_speed(iter/s)": 0.201717 }, { "acc": 0.76378241, "epoch": 0.9784919581796176, "grad_norm": 5.40625, "learning_rate": 5.416443949025253e-06, "loss": 0.84370394, "memory(GiB)": 147.13, "step": 41940, "train_speed(iter/s)": 0.201744 }, { "acc": 0.76264544, "epoch": 0.9787252657519065, "grad_norm": 4.8125, "learning_rate": 5.414561371516764e-06, "loss": 0.86721058, "memory(GiB)": 147.13, "step": 41950, "train_speed(iter/s)": 0.20177 }, { "acc": 0.81001492, "epoch": 0.9789585733241954, "grad_norm": 6.375, "learning_rate": 5.41267873482976e-06, "loss": 0.70287361, "memory(GiB)": 147.13, "step": 41960, "train_speed(iter/s)": 0.201794 }, { "acc": 0.78907146, "epoch": 0.9791918808964843, "grad_norm": 7.46875, "learning_rate": 5.410796039232989e-06, "loss": 0.73023872, "memory(GiB)": 147.13, "step": 41970, "train_speed(iter/s)": 0.201819 }, { "acc": 0.77236247, "epoch": 0.9794251884687732, "grad_norm": 6.0, "learning_rate": 5.4089132849952e-06, "loss": 0.83747387, "memory(GiB)": 147.13, "step": 41980, "train_speed(iter/s)": 0.201845 }, { "acc": 0.78121223, "epoch": 0.9796584960410621, "grad_norm": 4.875, "learning_rate": 5.407030472385158e-06, "loss": 0.78266506, "memory(GiB)": 147.13, "step": 41990, "train_speed(iter/s)": 0.201871 }, { "acc": 0.76154213, "epoch": 0.979891803613351, "grad_norm": 4.875, "learning_rate": 5.4051476016716365e-06, "loss": 0.85525732, "memory(GiB)": 147.13, "step": 42000, "train_speed(iter/s)": 0.201896 }, { "epoch": 0.979891803613351, "eval_acc": 0.743733035721292, "eval_loss": 0.8074895739555359, "eval_runtime": 1270.6849, "eval_samples_per_second": 28.324, "eval_steps_per_second": 14.162, "step": 42000 }, { "acc": 0.79927797, "epoch": 0.9801251111856399, "grad_norm": 9.125, "learning_rate": 5.4032646731234115e-06, "loss": 0.72784662, "memory(GiB)": 147.13, "step": 42010, "train_speed(iter/s)": 0.200679 }, { "acc": 0.78354425, "epoch": 0.9803584187579288, "grad_norm": 6.53125, "learning_rate": 5.401381687009271e-06, "loss": 0.78162694, "memory(GiB)": 147.13, "step": 42020, "train_speed(iter/s)": 0.200704 }, { "acc": 0.77189026, "epoch": 0.9805917263302177, "grad_norm": 5.625, "learning_rate": 5.399498643598011e-06, "loss": 0.81935215, "memory(GiB)": 147.13, "step": 42030, "train_speed(iter/s)": 0.200729 }, { "acc": 0.77528439, "epoch": 0.9808250339025066, "grad_norm": 6.21875, "learning_rate": 5.3976155431584375e-06, "loss": 0.82163353, "memory(GiB)": 147.13, "step": 42040, "train_speed(iter/s)": 0.200752 }, { "acc": 0.78729029, "epoch": 0.9810583414747955, "grad_norm": 4.9375, "learning_rate": 5.3957323859593604e-06, "loss": 0.77468286, "memory(GiB)": 147.13, "step": 42050, "train_speed(iter/s)": 0.200777 }, { "acc": 0.77076778, "epoch": 0.9812916490470844, "grad_norm": 4.4375, "learning_rate": 5.3938491722695996e-06, "loss": 0.82037773, "memory(GiB)": 147.13, "step": 42060, "train_speed(iter/s)": 0.2008 }, { "acc": 0.75968246, "epoch": 0.9815249566193732, "grad_norm": 6.15625, "learning_rate": 5.391965902357983e-06, "loss": 0.88906879, "memory(GiB)": 147.13, "step": 42070, "train_speed(iter/s)": 0.200825 }, { "acc": 0.78322248, "epoch": 0.9817582641916621, "grad_norm": 5.40625, "learning_rate": 5.390082576493348e-06, "loss": 0.76822419, "memory(GiB)": 147.13, "step": 42080, "train_speed(iter/s)": 0.20085 }, { "acc": 0.77662072, "epoch": 0.981991571763951, "grad_norm": 6.71875, "learning_rate": 5.388199194944539e-06, "loss": 0.80375099, "memory(GiB)": 147.13, "step": 42090, "train_speed(iter/s)": 0.200875 }, { "acc": 0.79028854, "epoch": 0.9822248793362399, "grad_norm": 4.5625, "learning_rate": 5.3863157579804075e-06, "loss": 0.72856355, "memory(GiB)": 147.13, "step": 42100, "train_speed(iter/s)": 0.200899 }, { "acc": 0.76783481, "epoch": 0.9824581869085288, "grad_norm": 5.09375, "learning_rate": 5.384432265869815e-06, "loss": 0.84242039, "memory(GiB)": 147.13, "step": 42110, "train_speed(iter/s)": 0.200924 }, { "acc": 0.7857872, "epoch": 0.9826914944808177, "grad_norm": 6.96875, "learning_rate": 5.382548718881627e-06, "loss": 0.77328167, "memory(GiB)": 147.13, "step": 42120, "train_speed(iter/s)": 0.20095 }, { "acc": 0.78016534, "epoch": 0.9829248020531066, "grad_norm": 4.59375, "learning_rate": 5.380665117284721e-06, "loss": 0.78974218, "memory(GiB)": 147.13, "step": 42130, "train_speed(iter/s)": 0.200974 }, { "acc": 0.78708715, "epoch": 0.9831581096253955, "grad_norm": 5.0625, "learning_rate": 5.378781461347979e-06, "loss": 0.76465178, "memory(GiB)": 147.13, "step": 42140, "train_speed(iter/s)": 0.200997 }, { "acc": 0.77445755, "epoch": 0.9833914171976844, "grad_norm": 4.6875, "learning_rate": 5.376897751340294e-06, "loss": 0.82658815, "memory(GiB)": 147.13, "step": 42150, "train_speed(iter/s)": 0.201023 }, { "acc": 0.78046808, "epoch": 0.9836247247699733, "grad_norm": 8.625, "learning_rate": 5.375013987530565e-06, "loss": 0.78468456, "memory(GiB)": 147.13, "step": 42160, "train_speed(iter/s)": 0.201047 }, { "acc": 0.77549314, "epoch": 0.9838580323422622, "grad_norm": 7.78125, "learning_rate": 5.3731301701876985e-06, "loss": 0.81372795, "memory(GiB)": 147.13, "step": 42170, "train_speed(iter/s)": 0.201073 }, { "acc": 0.77141991, "epoch": 0.9840913399145511, "grad_norm": 4.96875, "learning_rate": 5.371246299580608e-06, "loss": 0.82050037, "memory(GiB)": 147.13, "step": 42180, "train_speed(iter/s)": 0.201097 }, { "acc": 0.79150095, "epoch": 0.98432464748684, "grad_norm": 6.59375, "learning_rate": 5.3693623759782165e-06, "loss": 0.73675623, "memory(GiB)": 147.13, "step": 42190, "train_speed(iter/s)": 0.201123 }, { "acc": 0.75877752, "epoch": 0.9845579550591289, "grad_norm": 4.71875, "learning_rate": 5.367478399649453e-06, "loss": 0.85596247, "memory(GiB)": 147.13, "step": 42200, "train_speed(iter/s)": 0.201147 }, { "acc": 0.77118468, "epoch": 0.9847912626314178, "grad_norm": 6.4375, "learning_rate": 5.365594370863254e-06, "loss": 0.81483583, "memory(GiB)": 147.13, "step": 42210, "train_speed(iter/s)": 0.201172 }, { "acc": 0.77900705, "epoch": 0.9850245702037067, "grad_norm": 6.0, "learning_rate": 5.363710289888564e-06, "loss": 0.79868593, "memory(GiB)": 147.13, "step": 42220, "train_speed(iter/s)": 0.201196 }, { "acc": 0.77433815, "epoch": 0.9852578777759956, "grad_norm": 5.5625, "learning_rate": 5.361826156994338e-06, "loss": 0.81312532, "memory(GiB)": 147.13, "step": 42230, "train_speed(iter/s)": 0.201221 }, { "acc": 0.79729567, "epoch": 0.9854911853482845, "grad_norm": 5.875, "learning_rate": 5.359941972449532e-06, "loss": 0.73013325, "memory(GiB)": 147.13, "step": 42240, "train_speed(iter/s)": 0.201245 }, { "acc": 0.78885775, "epoch": 0.9857244929205734, "grad_norm": 15.5625, "learning_rate": 5.358057736523114e-06, "loss": 0.75900469, "memory(GiB)": 147.13, "step": 42250, "train_speed(iter/s)": 0.20127 }, { "acc": 0.76385336, "epoch": 0.9859578004928623, "grad_norm": 6.15625, "learning_rate": 5.356173449484059e-06, "loss": 0.87608242, "memory(GiB)": 147.13, "step": 42260, "train_speed(iter/s)": 0.201292 }, { "acc": 0.78156281, "epoch": 0.9861911080651511, "grad_norm": 6.4375, "learning_rate": 5.3542891116013465e-06, "loss": 0.75918655, "memory(GiB)": 147.13, "step": 42270, "train_speed(iter/s)": 0.201318 }, { "acc": 0.78791151, "epoch": 0.98642441563744, "grad_norm": 5.1875, "learning_rate": 5.352404723143968e-06, "loss": 0.74543934, "memory(GiB)": 147.13, "step": 42280, "train_speed(iter/s)": 0.201342 }, { "acc": 0.7900342, "epoch": 0.9866577232097289, "grad_norm": 4.71875, "learning_rate": 5.350520284380916e-06, "loss": 0.74275174, "memory(GiB)": 147.13, "step": 42290, "train_speed(iter/s)": 0.201366 }, { "acc": 0.78988333, "epoch": 0.9868910307820178, "grad_norm": 5.375, "learning_rate": 5.3486357955811945e-06, "loss": 0.75508475, "memory(GiB)": 147.13, "step": 42300, "train_speed(iter/s)": 0.20139 }, { "acc": 0.76127229, "epoch": 0.9871243383543067, "grad_norm": 10.625, "learning_rate": 5.346751257013815e-06, "loss": 0.85649261, "memory(GiB)": 147.13, "step": 42310, "train_speed(iter/s)": 0.201414 }, { "acc": 0.76956244, "epoch": 0.9873576459265956, "grad_norm": 7.53125, "learning_rate": 5.344866668947794e-06, "loss": 0.8190443, "memory(GiB)": 147.13, "step": 42320, "train_speed(iter/s)": 0.201439 }, { "acc": 0.76760712, "epoch": 0.9875909534988845, "grad_norm": 5.03125, "learning_rate": 5.342982031652159e-06, "loss": 0.84836092, "memory(GiB)": 147.13, "step": 42330, "train_speed(iter/s)": 0.201462 }, { "acc": 0.77389174, "epoch": 0.9878242610711734, "grad_norm": 8.375, "learning_rate": 5.341097345395937e-06, "loss": 0.7971365, "memory(GiB)": 147.13, "step": 42340, "train_speed(iter/s)": 0.201487 }, { "acc": 0.78406467, "epoch": 0.9880575686434623, "grad_norm": 7.625, "learning_rate": 5.339212610448167e-06, "loss": 0.76403122, "memory(GiB)": 147.13, "step": 42350, "train_speed(iter/s)": 0.201509 }, { "acc": 0.7485312, "epoch": 0.9882908762157512, "grad_norm": 4.40625, "learning_rate": 5.3373278270778965e-06, "loss": 0.91936789, "memory(GiB)": 147.13, "step": 42360, "train_speed(iter/s)": 0.201534 }, { "acc": 0.78323016, "epoch": 0.9885241837880401, "grad_norm": 4.46875, "learning_rate": 5.3354429955541755e-06, "loss": 0.76532645, "memory(GiB)": 147.13, "step": 42370, "train_speed(iter/s)": 0.201559 }, { "acc": 0.774506, "epoch": 0.988757491360329, "grad_norm": 6.3125, "learning_rate": 5.333558116146063e-06, "loss": 0.78331833, "memory(GiB)": 147.13, "step": 42380, "train_speed(iter/s)": 0.201585 }, { "acc": 0.77661877, "epoch": 0.9889907989326179, "grad_norm": 4.90625, "learning_rate": 5.33167318912263e-06, "loss": 0.79144692, "memory(GiB)": 147.13, "step": 42390, "train_speed(iter/s)": 0.201608 }, { "acc": 0.79478455, "epoch": 0.9892241065049068, "grad_norm": 5.21875, "learning_rate": 5.329788214752944e-06, "loss": 0.72164011, "memory(GiB)": 147.13, "step": 42400, "train_speed(iter/s)": 0.201632 }, { "acc": 0.77908754, "epoch": 0.9894574140771957, "grad_norm": 10.3125, "learning_rate": 5.327903193306087e-06, "loss": 0.77795801, "memory(GiB)": 147.13, "step": 42410, "train_speed(iter/s)": 0.201655 }, { "acc": 0.76483593, "epoch": 0.9896907216494846, "grad_norm": 8.0, "learning_rate": 5.326018125051142e-06, "loss": 0.84128876, "memory(GiB)": 147.13, "step": 42420, "train_speed(iter/s)": 0.20168 }, { "acc": 0.79741845, "epoch": 0.9899240292217735, "grad_norm": 4.0625, "learning_rate": 5.324133010257206e-06, "loss": 0.73309522, "memory(GiB)": 147.13, "step": 42430, "train_speed(iter/s)": 0.201704 }, { "acc": 0.78365011, "epoch": 0.9901573367940624, "grad_norm": 5.78125, "learning_rate": 5.3222478491933775e-06, "loss": 0.78707895, "memory(GiB)": 147.13, "step": 42440, "train_speed(iter/s)": 0.201729 }, { "acc": 0.77475691, "epoch": 0.9903906443663513, "grad_norm": 8.5, "learning_rate": 5.320362642128761e-06, "loss": 0.82604122, "memory(GiB)": 147.13, "step": 42450, "train_speed(iter/s)": 0.201754 }, { "acc": 0.80049133, "epoch": 0.9906239519386401, "grad_norm": 5.875, "learning_rate": 5.318477389332471e-06, "loss": 0.72088904, "memory(GiB)": 147.13, "step": 42460, "train_speed(iter/s)": 0.201781 }, { "acc": 0.75600042, "epoch": 0.990857259510929, "grad_norm": 5.875, "learning_rate": 5.316592091073626e-06, "loss": 0.88159542, "memory(GiB)": 147.13, "step": 42470, "train_speed(iter/s)": 0.201805 }, { "acc": 0.7931004, "epoch": 0.9910905670832179, "grad_norm": 4.0625, "learning_rate": 5.314706747621352e-06, "loss": 0.73830881, "memory(GiB)": 147.13, "step": 42480, "train_speed(iter/s)": 0.201829 }, { "acc": 0.77918282, "epoch": 0.9913238746555068, "grad_norm": 4.65625, "learning_rate": 5.312821359244781e-06, "loss": 0.81584482, "memory(GiB)": 147.13, "step": 42490, "train_speed(iter/s)": 0.201854 }, { "acc": 0.76876907, "epoch": 0.9915571822277957, "grad_norm": 16.625, "learning_rate": 5.310935926213052e-06, "loss": 0.82897224, "memory(GiB)": 147.13, "step": 42500, "train_speed(iter/s)": 0.201878 }, { "epoch": 0.9915571822277957, "eval_acc": 0.7438225563471484, "eval_loss": 0.8072462677955627, "eval_runtime": 1270.4976, "eval_samples_per_second": 28.328, "eval_steps_per_second": 14.165, "step": 42500 }, { "acc": 0.76333456, "epoch": 0.9917904898000846, "grad_norm": 4.8125, "learning_rate": 5.309050448795311e-06, "loss": 0.84974442, "memory(GiB)": 147.13, "step": 42510, "train_speed(iter/s)": 0.200677 }, { "acc": 0.77313266, "epoch": 0.9920237973723735, "grad_norm": 5.6875, "learning_rate": 5.307164927260706e-06, "loss": 0.82176523, "memory(GiB)": 147.13, "step": 42520, "train_speed(iter/s)": 0.200701 }, { "acc": 0.78784313, "epoch": 0.9922571049446623, "grad_norm": 4.4375, "learning_rate": 5.305279361878398e-06, "loss": 0.75282192, "memory(GiB)": 147.13, "step": 42530, "train_speed(iter/s)": 0.200728 }, { "acc": 0.79874549, "epoch": 0.9924904125169512, "grad_norm": 4.5, "learning_rate": 5.30339375291755e-06, "loss": 0.71279907, "memory(GiB)": 147.13, "step": 42540, "train_speed(iter/s)": 0.200753 }, { "acc": 0.77315574, "epoch": 0.9927237200892401, "grad_norm": 5.25, "learning_rate": 5.3015081006473315e-06, "loss": 0.81193829, "memory(GiB)": 147.13, "step": 42550, "train_speed(iter/s)": 0.200777 }, { "acc": 0.77819738, "epoch": 0.992957027661529, "grad_norm": 5.375, "learning_rate": 5.299622405336919e-06, "loss": 0.78049955, "memory(GiB)": 147.13, "step": 42560, "train_speed(iter/s)": 0.200802 }, { "acc": 0.78241773, "epoch": 0.993190335233818, "grad_norm": 6.71875, "learning_rate": 5.297736667255497e-06, "loss": 0.79108753, "memory(GiB)": 147.13, "step": 42570, "train_speed(iter/s)": 0.200825 }, { "acc": 0.78949213, "epoch": 0.9934236428061068, "grad_norm": 5.5625, "learning_rate": 5.2958508866722506e-06, "loss": 0.75022783, "memory(GiB)": 147.13, "step": 42580, "train_speed(iter/s)": 0.200849 }, { "acc": 0.75920568, "epoch": 0.9936569503783957, "grad_norm": 6.25, "learning_rate": 5.293965063856375e-06, "loss": 0.8708231, "memory(GiB)": 147.13, "step": 42590, "train_speed(iter/s)": 0.200872 }, { "acc": 0.76359487, "epoch": 0.9938902579506846, "grad_norm": 7.6875, "learning_rate": 5.292079199077073e-06, "loss": 0.85245781, "memory(GiB)": 147.13, "step": 42600, "train_speed(iter/s)": 0.200898 }, { "acc": 0.77966127, "epoch": 0.9941235655229735, "grad_norm": 5.9375, "learning_rate": 5.290193292603551e-06, "loss": 0.7903245, "memory(GiB)": 147.13, "step": 42610, "train_speed(iter/s)": 0.200923 }, { "acc": 0.78285179, "epoch": 0.9943568730952624, "grad_norm": 7.59375, "learning_rate": 5.2883073447050205e-06, "loss": 0.78474703, "memory(GiB)": 147.13, "step": 42620, "train_speed(iter/s)": 0.200946 }, { "acc": 0.78321028, "epoch": 0.9945901806675513, "grad_norm": 5.09375, "learning_rate": 5.2864213556507e-06, "loss": 0.79646807, "memory(GiB)": 147.13, "step": 42630, "train_speed(iter/s)": 0.200971 }, { "acc": 0.76653428, "epoch": 0.9948234882398402, "grad_norm": 7.8125, "learning_rate": 5.2845353257098146e-06, "loss": 0.84334564, "memory(GiB)": 147.13, "step": 42640, "train_speed(iter/s)": 0.200996 }, { "acc": 0.77440205, "epoch": 0.995056795812129, "grad_norm": 5.8125, "learning_rate": 5.282649255151593e-06, "loss": 0.80458755, "memory(GiB)": 147.13, "step": 42650, "train_speed(iter/s)": 0.201021 }, { "acc": 0.77449627, "epoch": 0.9952901033844179, "grad_norm": 4.875, "learning_rate": 5.280763144245272e-06, "loss": 0.81251926, "memory(GiB)": 147.13, "step": 42660, "train_speed(iter/s)": 0.201043 }, { "acc": 0.77925887, "epoch": 0.9955234109567068, "grad_norm": 6.125, "learning_rate": 5.2788769932600944e-06, "loss": 0.79605002, "memory(GiB)": 147.13, "step": 42670, "train_speed(iter/s)": 0.201068 }, { "acc": 0.77389841, "epoch": 0.9957567185289957, "grad_norm": 6.71875, "learning_rate": 5.276990802465309e-06, "loss": 0.80099325, "memory(GiB)": 147.13, "step": 42680, "train_speed(iter/s)": 0.201092 }, { "acc": 0.77797318, "epoch": 0.9959900261012846, "grad_norm": 7.3125, "learning_rate": 5.275104572130167e-06, "loss": 0.78075852, "memory(GiB)": 147.13, "step": 42690, "train_speed(iter/s)": 0.201116 }, { "acc": 0.78155584, "epoch": 0.9962233336735735, "grad_norm": 5.125, "learning_rate": 5.273218302523925e-06, "loss": 0.78047514, "memory(GiB)": 147.13, "step": 42700, "train_speed(iter/s)": 0.201141 }, { "acc": 0.78246479, "epoch": 0.9964566412458624, "grad_norm": 5.4375, "learning_rate": 5.2713319939158494e-06, "loss": 0.79133334, "memory(GiB)": 147.13, "step": 42710, "train_speed(iter/s)": 0.201163 }, { "acc": 0.78450847, "epoch": 0.9966899488181513, "grad_norm": 4.625, "learning_rate": 5.2694456465752104e-06, "loss": 0.76527557, "memory(GiB)": 147.13, "step": 42720, "train_speed(iter/s)": 0.201188 }, { "acc": 0.77672338, "epoch": 0.9969232563904402, "grad_norm": 5.40625, "learning_rate": 5.267559260771285e-06, "loss": 0.78869748, "memory(GiB)": 147.13, "step": 42730, "train_speed(iter/s)": 0.201211 }, { "acc": 0.79831858, "epoch": 0.9971565639627291, "grad_norm": 5.4375, "learning_rate": 5.265672836773353e-06, "loss": 0.73061762, "memory(GiB)": 147.13, "step": 42740, "train_speed(iter/s)": 0.201234 }, { "acc": 0.76231604, "epoch": 0.997389871535018, "grad_norm": 6.9375, "learning_rate": 5.2637863748507e-06, "loss": 0.87828217, "memory(GiB)": 147.13, "step": 42750, "train_speed(iter/s)": 0.201257 }, { "acc": 0.77685285, "epoch": 0.9976231791073069, "grad_norm": 4.75, "learning_rate": 5.261899875272619e-06, "loss": 0.79581809, "memory(GiB)": 147.13, "step": 42760, "train_speed(iter/s)": 0.201281 }, { "acc": 0.77012229, "epoch": 0.9978564866795958, "grad_norm": 5.59375, "learning_rate": 5.260013338308408e-06, "loss": 0.84313641, "memory(GiB)": 147.13, "step": 42770, "train_speed(iter/s)": 0.201305 }, { "acc": 0.7849412, "epoch": 0.9980897942518847, "grad_norm": 3.953125, "learning_rate": 5.258126764227366e-06, "loss": 0.76588078, "memory(GiB)": 147.13, "step": 42780, "train_speed(iter/s)": 0.201328 }, { "acc": 0.77974372, "epoch": 0.9983231018241736, "grad_norm": 6.03125, "learning_rate": 5.256240153298804e-06, "loss": 0.78483295, "memory(GiB)": 147.13, "step": 42790, "train_speed(iter/s)": 0.201355 }, { "acc": 0.78103266, "epoch": 0.9985564093964625, "grad_norm": 5.21875, "learning_rate": 5.254353505792036e-06, "loss": 0.78471365, "memory(GiB)": 147.13, "step": 42800, "train_speed(iter/s)": 0.201379 }, { "acc": 0.76989121, "epoch": 0.9987897169687514, "grad_norm": 6.0625, "learning_rate": 5.252466821976377e-06, "loss": 0.82569427, "memory(GiB)": 147.13, "step": 42810, "train_speed(iter/s)": 0.201404 }, { "acc": 0.77489061, "epoch": 0.9990230245410403, "grad_norm": 4.59375, "learning_rate": 5.250580102121153e-06, "loss": 0.81538868, "memory(GiB)": 147.13, "step": 42820, "train_speed(iter/s)": 0.201429 }, { "acc": 0.78204842, "epoch": 0.9992563321133292, "grad_norm": 4.625, "learning_rate": 5.248693346495694e-06, "loss": 0.78283563, "memory(GiB)": 147.13, "step": 42830, "train_speed(iter/s)": 0.201452 }, { "acc": 0.79220905, "epoch": 0.999489639685618, "grad_norm": 4.78125, "learning_rate": 5.2468065553693306e-06, "loss": 0.73160219, "memory(GiB)": 147.13, "step": 42840, "train_speed(iter/s)": 0.201476 }, { "acc": 0.77306366, "epoch": 0.9997229472579069, "grad_norm": 5.90625, "learning_rate": 5.244919729011403e-06, "loss": 0.8217906, "memory(GiB)": 147.13, "step": 42850, "train_speed(iter/s)": 0.201501 }, { "acc": 0.78262877, "epoch": 0.9999562548301958, "grad_norm": 6.875, "learning_rate": 5.243032867691257e-06, "loss": 0.78975267, "memory(GiB)": 147.13, "step": 42860, "train_speed(iter/s)": 0.201522 }, { "acc": 0.78576965, "epoch": 1.0001895624024848, "grad_norm": 3.8125, "learning_rate": 5.241145971678238e-06, "loss": 0.76854234, "memory(GiB)": 147.13, "step": 42870, "train_speed(iter/s)": 0.201543 }, { "acc": 0.7733151, "epoch": 1.0004228699747737, "grad_norm": 4.6875, "learning_rate": 5.239259041241701e-06, "loss": 0.82483797, "memory(GiB)": 147.13, "step": 42880, "train_speed(iter/s)": 0.201568 }, { "acc": 0.80650272, "epoch": 1.0006561775470626, "grad_norm": 4.28125, "learning_rate": 5.237372076651006e-06, "loss": 0.67205696, "memory(GiB)": 147.13, "step": 42890, "train_speed(iter/s)": 0.20159 }, { "acc": 0.78537331, "epoch": 1.0008894851193515, "grad_norm": 6.03125, "learning_rate": 5.2354850781755175e-06, "loss": 0.76955976, "memory(GiB)": 147.13, "step": 42900, "train_speed(iter/s)": 0.201612 }, { "acc": 0.76812172, "epoch": 1.0011227926916404, "grad_norm": 3.875, "learning_rate": 5.233598046084602e-06, "loss": 0.85648232, "memory(GiB)": 147.13, "step": 42910, "train_speed(iter/s)": 0.201637 }, { "acc": 0.77023239, "epoch": 1.0013561002639293, "grad_norm": 5.125, "learning_rate": 5.231710980647632e-06, "loss": 0.8251276, "memory(GiB)": 147.13, "step": 42920, "train_speed(iter/s)": 0.201662 }, { "acc": 0.78217449, "epoch": 1.0015894078362182, "grad_norm": 5.625, "learning_rate": 5.229823882133987e-06, "loss": 0.78995705, "memory(GiB)": 147.13, "step": 42930, "train_speed(iter/s)": 0.201687 }, { "acc": 0.77538042, "epoch": 1.0018227154085069, "grad_norm": 6.96875, "learning_rate": 5.22793675081305e-06, "loss": 0.78225121, "memory(GiB)": 147.13, "step": 42940, "train_speed(iter/s)": 0.201711 }, { "acc": 0.77648048, "epoch": 1.0020560229807958, "grad_norm": 6.875, "learning_rate": 5.226049586954207e-06, "loss": 0.81052818, "memory(GiB)": 147.13, "step": 42950, "train_speed(iter/s)": 0.201735 }, { "acc": 0.78938169, "epoch": 1.0022893305530847, "grad_norm": 4.78125, "learning_rate": 5.2241623908268524e-06, "loss": 0.77443514, "memory(GiB)": 147.13, "step": 42960, "train_speed(iter/s)": 0.201759 }, { "acc": 0.76893907, "epoch": 1.0025226381253736, "grad_norm": 4.75, "learning_rate": 5.222275162700382e-06, "loss": 0.84441872, "memory(GiB)": 147.13, "step": 42970, "train_speed(iter/s)": 0.201781 }, { "acc": 0.77256842, "epoch": 1.0027559456976625, "grad_norm": 8.75, "learning_rate": 5.2203879028441975e-06, "loss": 0.83643208, "memory(GiB)": 147.13, "step": 42980, "train_speed(iter/s)": 0.201807 }, { "acc": 0.76644096, "epoch": 1.0029892532699514, "grad_norm": 5.375, "learning_rate": 5.218500611527701e-06, "loss": 0.82999525, "memory(GiB)": 147.13, "step": 42990, "train_speed(iter/s)": 0.201831 }, { "acc": 0.77953863, "epoch": 1.0032225608422403, "grad_norm": 5.1875, "learning_rate": 5.216613289020307e-06, "loss": 0.77615538, "memory(GiB)": 147.13, "step": 43000, "train_speed(iter/s)": 0.201856 }, { "epoch": 1.0032225608422403, "eval_acc": 0.7438829307227262, "eval_loss": 0.8073763847351074, "eval_runtime": 1269.4805, "eval_samples_per_second": 28.351, "eval_steps_per_second": 14.176, "step": 43000 }, { "acc": 0.77727814, "epoch": 1.0034558684145292, "grad_norm": 4.84375, "learning_rate": 5.214725935591429e-06, "loss": 0.80646038, "memory(GiB)": 147.13, "step": 43010, "train_speed(iter/s)": 0.200669 }, { "acc": 0.7740561, "epoch": 1.003689175986818, "grad_norm": 6.75, "learning_rate": 5.2128385515104865e-06, "loss": 0.81611605, "memory(GiB)": 147.13, "step": 43020, "train_speed(iter/s)": 0.200692 }, { "acc": 0.77674236, "epoch": 1.003922483559107, "grad_norm": 6.1875, "learning_rate": 5.210951137046903e-06, "loss": 0.80263958, "memory(GiB)": 147.13, "step": 43030, "train_speed(iter/s)": 0.200718 }, { "acc": 0.77732944, "epoch": 1.0041557911313959, "grad_norm": 6.78125, "learning_rate": 5.209063692470104e-06, "loss": 0.80889053, "memory(GiB)": 147.13, "step": 43040, "train_speed(iter/s)": 0.200743 }, { "acc": 0.75513368, "epoch": 1.0043890987036848, "grad_norm": 5.40625, "learning_rate": 5.207176218049526e-06, "loss": 0.88391256, "memory(GiB)": 147.13, "step": 43050, "train_speed(iter/s)": 0.200767 }, { "acc": 0.77179451, "epoch": 1.0046224062759737, "grad_norm": 4.71875, "learning_rate": 5.205288714054602e-06, "loss": 0.80851946, "memory(GiB)": 147.13, "step": 43060, "train_speed(iter/s)": 0.200791 }, { "acc": 0.77560835, "epoch": 1.0048557138482626, "grad_norm": 5.1875, "learning_rate": 5.203401180754772e-06, "loss": 0.79950652, "memory(GiB)": 147.13, "step": 43070, "train_speed(iter/s)": 0.200815 }, { "acc": 0.76174564, "epoch": 1.0050890214205515, "grad_norm": 4.53125, "learning_rate": 5.201513618419486e-06, "loss": 0.84228649, "memory(GiB)": 147.13, "step": 43080, "train_speed(iter/s)": 0.200839 }, { "acc": 0.78287921, "epoch": 1.0053223289928404, "grad_norm": 5.9375, "learning_rate": 5.199626027318188e-06, "loss": 0.78160782, "memory(GiB)": 147.13, "step": 43090, "train_speed(iter/s)": 0.200863 }, { "acc": 0.78439932, "epoch": 1.0055556365651293, "grad_norm": 5.09375, "learning_rate": 5.197738407720331e-06, "loss": 0.76160207, "memory(GiB)": 147.13, "step": 43100, "train_speed(iter/s)": 0.200886 }, { "acc": 0.77529421, "epoch": 1.0057889441374182, "grad_norm": 5.96875, "learning_rate": 5.195850759895374e-06, "loss": 0.8274806, "memory(GiB)": 147.13, "step": 43110, "train_speed(iter/s)": 0.200911 }, { "acc": 0.79326754, "epoch": 1.006022251709707, "grad_norm": 5.96875, "learning_rate": 5.193963084112781e-06, "loss": 0.73483434, "memory(GiB)": 147.13, "step": 43120, "train_speed(iter/s)": 0.200935 }, { "acc": 0.78251967, "epoch": 1.006255559281996, "grad_norm": 4.96875, "learning_rate": 5.192075380642011e-06, "loss": 0.76476879, "memory(GiB)": 147.13, "step": 43130, "train_speed(iter/s)": 0.200958 }, { "acc": 0.78560572, "epoch": 1.0064888668542848, "grad_norm": 6.3125, "learning_rate": 5.190187649752538e-06, "loss": 0.76422467, "memory(GiB)": 147.13, "step": 43140, "train_speed(iter/s)": 0.200984 }, { "acc": 0.77720866, "epoch": 1.0067221744265737, "grad_norm": 6.53125, "learning_rate": 5.1882998917138324e-06, "loss": 0.79875774, "memory(GiB)": 147.13, "step": 43150, "train_speed(iter/s)": 0.201008 }, { "acc": 0.78121185, "epoch": 1.0069554819988626, "grad_norm": 5.28125, "learning_rate": 5.186412106795371e-06, "loss": 0.78524699, "memory(GiB)": 147.13, "step": 43160, "train_speed(iter/s)": 0.201033 }, { "acc": 0.7672533, "epoch": 1.0071887895711515, "grad_norm": 5.0, "learning_rate": 5.1845242952666365e-06, "loss": 0.84109287, "memory(GiB)": 147.13, "step": 43170, "train_speed(iter/s)": 0.201057 }, { "acc": 0.79040389, "epoch": 1.0074220971434404, "grad_norm": 4.96875, "learning_rate": 5.1826364573971125e-06, "loss": 0.75233536, "memory(GiB)": 147.13, "step": 43180, "train_speed(iter/s)": 0.201079 }, { "acc": 0.77585616, "epoch": 1.0076554047157293, "grad_norm": 4.53125, "learning_rate": 5.180748593456289e-06, "loss": 0.8176096, "memory(GiB)": 147.13, "step": 43190, "train_speed(iter/s)": 0.201103 }, { "acc": 0.76432118, "epoch": 1.0078887122880182, "grad_norm": 4.84375, "learning_rate": 5.178860703713654e-06, "loss": 0.84310722, "memory(GiB)": 147.13, "step": 43200, "train_speed(iter/s)": 0.201128 }, { "acc": 0.78325276, "epoch": 1.0081220198603071, "grad_norm": 5.34375, "learning_rate": 5.176972788438705e-06, "loss": 0.7665885, "memory(GiB)": 147.13, "step": 43210, "train_speed(iter/s)": 0.201154 }, { "acc": 0.77819371, "epoch": 1.008355327432596, "grad_norm": 6.0, "learning_rate": 5.175084847900943e-06, "loss": 0.79968734, "memory(GiB)": 147.13, "step": 43220, "train_speed(iter/s)": 0.201178 }, { "acc": 0.78666801, "epoch": 1.008588635004885, "grad_norm": 4.5625, "learning_rate": 5.17319688236987e-06, "loss": 0.77615905, "memory(GiB)": 147.13, "step": 43230, "train_speed(iter/s)": 0.201203 }, { "acc": 0.78133011, "epoch": 1.0088219425771738, "grad_norm": 5.28125, "learning_rate": 5.171308892114991e-06, "loss": 0.80085545, "memory(GiB)": 147.13, "step": 43240, "train_speed(iter/s)": 0.201226 }, { "acc": 0.79328427, "epoch": 1.0090552501494627, "grad_norm": 4.25, "learning_rate": 5.16942087740582e-06, "loss": 0.73711023, "memory(GiB)": 147.13, "step": 43250, "train_speed(iter/s)": 0.201249 }, { "acc": 0.78356128, "epoch": 1.0092885577217516, "grad_norm": 4.71875, "learning_rate": 5.167532838511866e-06, "loss": 0.76406059, "memory(GiB)": 147.13, "step": 43260, "train_speed(iter/s)": 0.201274 }, { "acc": 0.76661291, "epoch": 1.0095218652940405, "grad_norm": 4.9375, "learning_rate": 5.16564477570265e-06, "loss": 0.81206608, "memory(GiB)": 147.13, "step": 43270, "train_speed(iter/s)": 0.201297 }, { "acc": 0.78339691, "epoch": 1.0097551728663294, "grad_norm": 5.65625, "learning_rate": 5.163756689247687e-06, "loss": 0.76879001, "memory(GiB)": 147.13, "step": 43280, "train_speed(iter/s)": 0.201321 }, { "acc": 0.78117151, "epoch": 1.0099884804386183, "grad_norm": 4.625, "learning_rate": 5.1618685794165066e-06, "loss": 0.78589344, "memory(GiB)": 147.13, "step": 43290, "train_speed(iter/s)": 0.201347 }, { "acc": 0.75611968, "epoch": 1.0102217880109072, "grad_norm": 5.75, "learning_rate": 5.159980446478633e-06, "loss": 0.88358593, "memory(GiB)": 147.13, "step": 43300, "train_speed(iter/s)": 0.201372 }, { "acc": 0.77154584, "epoch": 1.0104550955831961, "grad_norm": 5.375, "learning_rate": 5.158092290703597e-06, "loss": 0.81572247, "memory(GiB)": 147.13, "step": 43310, "train_speed(iter/s)": 0.201395 }, { "acc": 0.76835303, "epoch": 1.010688403155485, "grad_norm": 5.9375, "learning_rate": 5.156204112360933e-06, "loss": 0.80885944, "memory(GiB)": 147.13, "step": 43320, "train_speed(iter/s)": 0.201419 }, { "acc": 0.77416339, "epoch": 1.0109217107277737, "grad_norm": 5.03125, "learning_rate": 5.154315911720176e-06, "loss": 0.81178932, "memory(GiB)": 147.13, "step": 43330, "train_speed(iter/s)": 0.201444 }, { "acc": 0.78630424, "epoch": 1.0111550183000626, "grad_norm": 6.625, "learning_rate": 5.152427689050869e-06, "loss": 0.77561946, "memory(GiB)": 147.13, "step": 43340, "train_speed(iter/s)": 0.201467 }, { "acc": 0.7755363, "epoch": 1.0113883258723515, "grad_norm": 3.859375, "learning_rate": 5.150539444622552e-06, "loss": 0.80117035, "memory(GiB)": 147.13, "step": 43350, "train_speed(iter/s)": 0.201491 }, { "acc": 0.78446379, "epoch": 1.0116216334446404, "grad_norm": 8.1875, "learning_rate": 5.148651178704775e-06, "loss": 0.82833748, "memory(GiB)": 147.13, "step": 43360, "train_speed(iter/s)": 0.201516 }, { "acc": 0.76374173, "epoch": 1.0118549410169293, "grad_norm": 7.96875, "learning_rate": 5.146762891567084e-06, "loss": 0.88345985, "memory(GiB)": 147.13, "step": 43370, "train_speed(iter/s)": 0.20154 }, { "acc": 0.79155569, "epoch": 1.0120882485892182, "grad_norm": 5.8125, "learning_rate": 5.144874583479034e-06, "loss": 0.74518204, "memory(GiB)": 147.13, "step": 43380, "train_speed(iter/s)": 0.201564 }, { "acc": 0.77843351, "epoch": 1.012321556161507, "grad_norm": 5.59375, "learning_rate": 5.142986254710177e-06, "loss": 0.78075495, "memory(GiB)": 147.13, "step": 43390, "train_speed(iter/s)": 0.201587 }, { "acc": 0.77403011, "epoch": 1.012554863733796, "grad_norm": 5.9375, "learning_rate": 5.141097905530077e-06, "loss": 0.80180321, "memory(GiB)": 147.13, "step": 43400, "train_speed(iter/s)": 0.201608 }, { "acc": 0.79151011, "epoch": 1.012788171306085, "grad_norm": 5.53125, "learning_rate": 5.139209536208289e-06, "loss": 0.74089231, "memory(GiB)": 147.13, "step": 43410, "train_speed(iter/s)": 0.20163 }, { "acc": 0.76592178, "epoch": 1.0130214788783738, "grad_norm": 8.125, "learning_rate": 5.1373211470143814e-06, "loss": 0.83816414, "memory(GiB)": 147.13, "step": 43420, "train_speed(iter/s)": 0.201653 }, { "acc": 0.76648312, "epoch": 1.0132547864506627, "grad_norm": 5.625, "learning_rate": 5.13543273821792e-06, "loss": 0.85734253, "memory(GiB)": 147.13, "step": 43430, "train_speed(iter/s)": 0.201676 }, { "acc": 0.76693745, "epoch": 1.0134880940229516, "grad_norm": 4.875, "learning_rate": 5.133544310088474e-06, "loss": 0.84319305, "memory(GiB)": 147.13, "step": 43440, "train_speed(iter/s)": 0.2017 }, { "acc": 0.78325624, "epoch": 1.0137214015952405, "grad_norm": 5.4375, "learning_rate": 5.131655862895617e-06, "loss": 0.77048082, "memory(GiB)": 147.13, "step": 43450, "train_speed(iter/s)": 0.201723 }, { "acc": 0.79426832, "epoch": 1.0139547091675294, "grad_norm": 4.09375, "learning_rate": 5.129767396908923e-06, "loss": 0.73330755, "memory(GiB)": 147.13, "step": 43460, "train_speed(iter/s)": 0.201747 }, { "acc": 0.78500352, "epoch": 1.0141880167398183, "grad_norm": 4.5625, "learning_rate": 5.1278789123979736e-06, "loss": 0.76361828, "memory(GiB)": 147.13, "step": 43470, "train_speed(iter/s)": 0.201772 }, { "acc": 0.77640676, "epoch": 1.0144213243121072, "grad_norm": 5.6875, "learning_rate": 5.125990409632344e-06, "loss": 0.82779522, "memory(GiB)": 147.13, "step": 43480, "train_speed(iter/s)": 0.201796 }, { "acc": 0.76425214, "epoch": 1.014654631884396, "grad_norm": 4.3125, "learning_rate": 5.1241018888816205e-06, "loss": 0.862677, "memory(GiB)": 147.13, "step": 43490, "train_speed(iter/s)": 0.20182 }, { "acc": 0.78395214, "epoch": 1.014887939456685, "grad_norm": 6.28125, "learning_rate": 5.122213350415389e-06, "loss": 0.76862893, "memory(GiB)": 147.13, "step": 43500, "train_speed(iter/s)": 0.201844 }, { "epoch": 1.014887939456685, "eval_acc": 0.743926970386609, "eval_loss": 0.8070799708366394, "eval_runtime": 1270.2738, "eval_samples_per_second": 28.333, "eval_steps_per_second": 14.167, "step": 43500 }, { "acc": 0.77859073, "epoch": 1.0151212470289739, "grad_norm": 5.625, "learning_rate": 5.1203247945032365e-06, "loss": 0.78678293, "memory(GiB)": 147.13, "step": 43510, "train_speed(iter/s)": 0.200669 }, { "acc": 0.77705793, "epoch": 1.0153545546012628, "grad_norm": 6.6875, "learning_rate": 5.118436221414754e-06, "loss": 0.80260468, "memory(GiB)": 147.13, "step": 43520, "train_speed(iter/s)": 0.200692 }, { "acc": 0.76431408, "epoch": 1.0155878621735517, "grad_norm": 7.75, "learning_rate": 5.116547631419536e-06, "loss": 0.85512829, "memory(GiB)": 147.13, "step": 43530, "train_speed(iter/s)": 0.200717 }, { "acc": 0.7825469, "epoch": 1.0158211697458406, "grad_norm": 5.0625, "learning_rate": 5.114659024787179e-06, "loss": 0.76410046, "memory(GiB)": 147.13, "step": 43540, "train_speed(iter/s)": 0.200742 }, { "acc": 0.76927857, "epoch": 1.0160544773181295, "grad_norm": 5.875, "learning_rate": 5.112770401787278e-06, "loss": 0.81999454, "memory(GiB)": 147.13, "step": 43550, "train_speed(iter/s)": 0.200765 }, { "acc": 0.76841311, "epoch": 1.0162877848904184, "grad_norm": 5.0625, "learning_rate": 5.110881762689435e-06, "loss": 0.84414501, "memory(GiB)": 147.13, "step": 43560, "train_speed(iter/s)": 0.20079 }, { "acc": 0.77150702, "epoch": 1.0165210924627073, "grad_norm": 4.28125, "learning_rate": 5.1089931077632514e-06, "loss": 0.81745987, "memory(GiB)": 147.13, "step": 43570, "train_speed(iter/s)": 0.200814 }, { "acc": 0.7604578, "epoch": 1.0167544000349962, "grad_norm": 7.5, "learning_rate": 5.1071044372783355e-06, "loss": 0.86848364, "memory(GiB)": 147.13, "step": 43580, "train_speed(iter/s)": 0.200837 }, { "acc": 0.77862034, "epoch": 1.016987707607285, "grad_norm": 6.6875, "learning_rate": 5.10521575150429e-06, "loss": 0.80064793, "memory(GiB)": 147.13, "step": 43590, "train_speed(iter/s)": 0.20086 }, { "acc": 0.77443581, "epoch": 1.017221015179574, "grad_norm": 5.75, "learning_rate": 5.103327050710729e-06, "loss": 0.83215542, "memory(GiB)": 147.13, "step": 43600, "train_speed(iter/s)": 0.200884 }, { "acc": 0.7929235, "epoch": 1.0174543227518629, "grad_norm": 6.40625, "learning_rate": 5.10143833516726e-06, "loss": 0.74127417, "memory(GiB)": 147.13, "step": 43610, "train_speed(iter/s)": 0.200907 }, { "acc": 0.79245968, "epoch": 1.0176876303241518, "grad_norm": 6.25, "learning_rate": 5.099549605143499e-06, "loss": 0.75981627, "memory(GiB)": 147.13, "step": 43620, "train_speed(iter/s)": 0.200931 }, { "acc": 0.79105101, "epoch": 1.0179209378964407, "grad_norm": 5.4375, "learning_rate": 5.0976608609090606e-06, "loss": 0.74999475, "memory(GiB)": 147.13, "step": 43630, "train_speed(iter/s)": 0.200955 }, { "acc": 0.77970495, "epoch": 1.0181542454687296, "grad_norm": 6.1875, "learning_rate": 5.095772102733561e-06, "loss": 0.77877584, "memory(GiB)": 147.13, "step": 43640, "train_speed(iter/s)": 0.200979 }, { "acc": 0.78451672, "epoch": 1.0183875530410185, "grad_norm": 5.4375, "learning_rate": 5.093883330886623e-06, "loss": 0.75853043, "memory(GiB)": 147.13, "step": 43650, "train_speed(iter/s)": 0.201002 }, { "acc": 0.79117913, "epoch": 1.0186208606133074, "grad_norm": 7.15625, "learning_rate": 5.091994545637867e-06, "loss": 0.74103169, "memory(GiB)": 147.13, "step": 43660, "train_speed(iter/s)": 0.201026 }, { "acc": 0.79023709, "epoch": 1.0188541681855963, "grad_norm": 5.75, "learning_rate": 5.090105747256916e-06, "loss": 0.7264926, "memory(GiB)": 147.13, "step": 43670, "train_speed(iter/s)": 0.201049 }, { "acc": 0.78450155, "epoch": 1.0190874757578852, "grad_norm": 4.6875, "learning_rate": 5.088216936013396e-06, "loss": 0.77232718, "memory(GiB)": 147.13, "step": 43680, "train_speed(iter/s)": 0.201071 }, { "acc": 0.77247133, "epoch": 1.019320783330174, "grad_norm": 5.84375, "learning_rate": 5.086328112176934e-06, "loss": 0.82542839, "memory(GiB)": 147.13, "step": 43690, "train_speed(iter/s)": 0.201094 }, { "acc": 0.77573242, "epoch": 1.019554090902463, "grad_norm": 5.3125, "learning_rate": 5.084439276017159e-06, "loss": 0.81169853, "memory(GiB)": 147.13, "step": 43700, "train_speed(iter/s)": 0.201118 }, { "acc": 0.79919705, "epoch": 1.0197873984747516, "grad_norm": 4.34375, "learning_rate": 5.082550427803702e-06, "loss": 0.72000685, "memory(GiB)": 147.13, "step": 43710, "train_speed(iter/s)": 0.201141 }, { "acc": 0.79065461, "epoch": 1.0200207060470405, "grad_norm": 10.6875, "learning_rate": 5.080661567806195e-06, "loss": 0.746523, "memory(GiB)": 147.13, "step": 43720, "train_speed(iter/s)": 0.201163 }, { "acc": 0.77126436, "epoch": 1.0202540136193294, "grad_norm": 5.0625, "learning_rate": 5.078772696294273e-06, "loss": 0.81962471, "memory(GiB)": 147.13, "step": 43730, "train_speed(iter/s)": 0.201186 }, { "acc": 0.78477058, "epoch": 1.0204873211916183, "grad_norm": 6.4375, "learning_rate": 5.076883813537571e-06, "loss": 0.77737856, "memory(GiB)": 147.13, "step": 43740, "train_speed(iter/s)": 0.201211 }, { "acc": 0.78322659, "epoch": 1.0207206287639072, "grad_norm": 6.875, "learning_rate": 5.074994919805727e-06, "loss": 0.75895319, "memory(GiB)": 147.13, "step": 43750, "train_speed(iter/s)": 0.201235 }, { "acc": 0.77741756, "epoch": 1.0209539363361961, "grad_norm": 5.28125, "learning_rate": 5.073106015368381e-06, "loss": 0.81184292, "memory(GiB)": 147.13, "step": 43760, "train_speed(iter/s)": 0.201259 }, { "acc": 0.78123074, "epoch": 1.021187243908485, "grad_norm": 5.65625, "learning_rate": 5.071217100495172e-06, "loss": 0.78832426, "memory(GiB)": 147.13, "step": 43770, "train_speed(iter/s)": 0.201283 }, { "acc": 0.76711731, "epoch": 1.021420551480774, "grad_norm": 7.5, "learning_rate": 5.069328175455742e-06, "loss": 0.82933445, "memory(GiB)": 147.13, "step": 43780, "train_speed(iter/s)": 0.201307 }, { "acc": 0.7838244, "epoch": 1.0216538590530628, "grad_norm": 5.3125, "learning_rate": 5.067439240519735e-06, "loss": 0.78824844, "memory(GiB)": 147.13, "step": 43790, "train_speed(iter/s)": 0.201331 }, { "acc": 0.77966681, "epoch": 1.0218871666253517, "grad_norm": 4.8125, "learning_rate": 5.065550295956796e-06, "loss": 0.78713975, "memory(GiB)": 147.13, "step": 43800, "train_speed(iter/s)": 0.201355 }, { "acc": 0.78061857, "epoch": 1.0221204741976406, "grad_norm": 5.25, "learning_rate": 5.063661342036571e-06, "loss": 0.76254168, "memory(GiB)": 147.13, "step": 43810, "train_speed(iter/s)": 0.201379 }, { "acc": 0.76380448, "epoch": 1.0223537817699295, "grad_norm": 5.84375, "learning_rate": 5.061772379028709e-06, "loss": 0.87199421, "memory(GiB)": 147.13, "step": 43820, "train_speed(iter/s)": 0.201402 }, { "acc": 0.79298944, "epoch": 1.0225870893422184, "grad_norm": 8.4375, "learning_rate": 5.059883407202858e-06, "loss": 0.7629261, "memory(GiB)": 147.13, "step": 43830, "train_speed(iter/s)": 0.201426 }, { "acc": 0.78520899, "epoch": 1.0228203969145073, "grad_norm": 7.0625, "learning_rate": 5.057994426828669e-06, "loss": 0.76913924, "memory(GiB)": 147.13, "step": 43840, "train_speed(iter/s)": 0.20145 }, { "acc": 0.76806917, "epoch": 1.0230537044867962, "grad_norm": 6.25, "learning_rate": 5.05610543817579e-06, "loss": 0.8473484, "memory(GiB)": 147.13, "step": 43850, "train_speed(iter/s)": 0.201474 }, { "acc": 0.79321308, "epoch": 1.0232870120590851, "grad_norm": 3.625, "learning_rate": 5.054216441513876e-06, "loss": 0.7350997, "memory(GiB)": 147.13, "step": 43860, "train_speed(iter/s)": 0.201496 }, { "acc": 0.77993116, "epoch": 1.023520319631374, "grad_norm": 4.59375, "learning_rate": 5.052327437112582e-06, "loss": 0.77887244, "memory(GiB)": 147.13, "step": 43870, "train_speed(iter/s)": 0.20152 }, { "acc": 0.7738163, "epoch": 1.023753627203663, "grad_norm": 6.71875, "learning_rate": 5.050438425241562e-06, "loss": 0.82020864, "memory(GiB)": 147.13, "step": 43880, "train_speed(iter/s)": 0.201543 }, { "acc": 0.78634443, "epoch": 1.0239869347759518, "grad_norm": 5.8125, "learning_rate": 5.0485494061704695e-06, "loss": 0.76208029, "memory(GiB)": 147.13, "step": 43890, "train_speed(iter/s)": 0.201566 }, { "acc": 0.76844702, "epoch": 1.0242202423482407, "grad_norm": 6.96875, "learning_rate": 5.0466603801689655e-06, "loss": 0.82169828, "memory(GiB)": 147.13, "step": 43900, "train_speed(iter/s)": 0.201589 }, { "acc": 0.78849974, "epoch": 1.0244535499205296, "grad_norm": 4.34375, "learning_rate": 5.044771347506705e-06, "loss": 0.78166656, "memory(GiB)": 147.13, "step": 43910, "train_speed(iter/s)": 0.201611 }, { "acc": 0.79453068, "epoch": 1.0246868574928185, "grad_norm": 4.71875, "learning_rate": 5.0428823084533475e-06, "loss": 0.72507105, "memory(GiB)": 147.13, "step": 43920, "train_speed(iter/s)": 0.201635 }, { "acc": 0.77169046, "epoch": 1.0249201650651074, "grad_norm": 5.9375, "learning_rate": 5.040993263278552e-06, "loss": 0.83655701, "memory(GiB)": 147.13, "step": 43930, "train_speed(iter/s)": 0.201657 }, { "acc": 0.77949696, "epoch": 1.0251534726373963, "grad_norm": 6.3125, "learning_rate": 5.0391042122519815e-06, "loss": 0.81380482, "memory(GiB)": 147.13, "step": 43940, "train_speed(iter/s)": 0.201681 }, { "acc": 0.79480991, "epoch": 1.0253867802096852, "grad_norm": 5.375, "learning_rate": 5.037215155643296e-06, "loss": 0.74450397, "memory(GiB)": 147.13, "step": 43950, "train_speed(iter/s)": 0.201704 }, { "acc": 0.79364281, "epoch": 1.025620087781974, "grad_norm": 5.71875, "learning_rate": 5.035326093722157e-06, "loss": 0.73111277, "memory(GiB)": 147.13, "step": 43960, "train_speed(iter/s)": 0.201728 }, { "acc": 0.77582326, "epoch": 1.025853395354263, "grad_norm": 6.65625, "learning_rate": 5.033437026758228e-06, "loss": 0.83863983, "memory(GiB)": 147.13, "step": 43970, "train_speed(iter/s)": 0.201752 }, { "acc": 0.78823829, "epoch": 1.026086702926552, "grad_norm": 6.75, "learning_rate": 5.0315479550211746e-06, "loss": 0.75079851, "memory(GiB)": 147.13, "step": 43980, "train_speed(iter/s)": 0.201776 }, { "acc": 0.7844943, "epoch": 1.0263200104988408, "grad_norm": 6.3125, "learning_rate": 5.029658878780659e-06, "loss": 0.77733545, "memory(GiB)": 147.13, "step": 43990, "train_speed(iter/s)": 0.201798 }, { "acc": 0.78340597, "epoch": 1.0265533180711297, "grad_norm": 4.6875, "learning_rate": 5.0277697983063476e-06, "loss": 0.76800084, "memory(GiB)": 147.13, "step": 44000, "train_speed(iter/s)": 0.201823 }, { "epoch": 1.0265533180711297, "eval_acc": 0.7438331258664804, "eval_loss": 0.8070977330207825, "eval_runtime": 1271.2669, "eval_samples_per_second": 28.311, "eval_steps_per_second": 14.156, "step": 44000 }, { "acc": 0.77107697, "epoch": 1.0267866256434186, "grad_norm": 7.34375, "learning_rate": 5.025880713867904e-06, "loss": 0.85529652, "memory(GiB)": 147.13, "step": 44010, "train_speed(iter/s)": 0.20066 }, { "acc": 0.78703861, "epoch": 1.0270199332157075, "grad_norm": 5.375, "learning_rate": 5.023991625734998e-06, "loss": 0.76970444, "memory(GiB)": 147.13, "step": 44020, "train_speed(iter/s)": 0.200683 }, { "acc": 0.76558523, "epoch": 1.0272532407879964, "grad_norm": 4.15625, "learning_rate": 5.022102534177293e-06, "loss": 0.83571815, "memory(GiB)": 147.13, "step": 44030, "train_speed(iter/s)": 0.200707 }, { "acc": 0.77779908, "epoch": 1.0274865483602853, "grad_norm": 5.03125, "learning_rate": 5.020213439464458e-06, "loss": 0.78558512, "memory(GiB)": 147.13, "step": 44040, "train_speed(iter/s)": 0.200728 }, { "acc": 0.79692812, "epoch": 1.0277198559325742, "grad_norm": 6.3125, "learning_rate": 5.018324341866161e-06, "loss": 0.71697149, "memory(GiB)": 147.13, "step": 44050, "train_speed(iter/s)": 0.200751 }, { "acc": 0.80139132, "epoch": 1.027953163504863, "grad_norm": 5.9375, "learning_rate": 5.01643524165207e-06, "loss": 0.68105035, "memory(GiB)": 147.13, "step": 44060, "train_speed(iter/s)": 0.200773 }, { "acc": 0.77024827, "epoch": 1.028186471077152, "grad_norm": 6.34375, "learning_rate": 5.014546139091851e-06, "loss": 0.83891077, "memory(GiB)": 147.13, "step": 44070, "train_speed(iter/s)": 0.200796 }, { "acc": 0.78477793, "epoch": 1.028419778649441, "grad_norm": 4.0, "learning_rate": 5.012657034455176e-06, "loss": 0.80206022, "memory(GiB)": 147.13, "step": 44080, "train_speed(iter/s)": 0.200818 }, { "acc": 0.79112525, "epoch": 1.0286530862217296, "grad_norm": 6.1875, "learning_rate": 5.010767928011713e-06, "loss": 0.7436903, "memory(GiB)": 147.13, "step": 44090, "train_speed(iter/s)": 0.200842 }, { "acc": 0.78853774, "epoch": 1.0288863937940185, "grad_norm": 5.28125, "learning_rate": 5.008878820031131e-06, "loss": 0.76734447, "memory(GiB)": 147.13, "step": 44100, "train_speed(iter/s)": 0.200865 }, { "acc": 0.78617477, "epoch": 1.0291197013663074, "grad_norm": 6.09375, "learning_rate": 5.006989710783101e-06, "loss": 0.75627079, "memory(GiB)": 147.13, "step": 44110, "train_speed(iter/s)": 0.20089 }, { "acc": 0.78956699, "epoch": 1.0293530089385963, "grad_norm": 6.1875, "learning_rate": 5.005100600537292e-06, "loss": 0.74087925, "memory(GiB)": 147.13, "step": 44120, "train_speed(iter/s)": 0.200913 }, { "acc": 0.77857628, "epoch": 1.0295863165108852, "grad_norm": 5.25, "learning_rate": 5.003211489563373e-06, "loss": 0.80192738, "memory(GiB)": 147.13, "step": 44130, "train_speed(iter/s)": 0.200937 }, { "acc": 0.79740629, "epoch": 1.029819624083174, "grad_norm": 7.25, "learning_rate": 5.001322378131015e-06, "loss": 0.72210054, "memory(GiB)": 147.13, "step": 44140, "train_speed(iter/s)": 0.200958 }, { "acc": 0.79078064, "epoch": 1.030052931655463, "grad_norm": 5.0625, "learning_rate": 4.9994332665098885e-06, "loss": 0.74621177, "memory(GiB)": 147.13, "step": 44150, "train_speed(iter/s)": 0.200981 }, { "acc": 0.7691617, "epoch": 1.0302862392277519, "grad_norm": 4.96875, "learning_rate": 4.997544154969661e-06, "loss": 0.84756422, "memory(GiB)": 147.13, "step": 44160, "train_speed(iter/s)": 0.201006 }, { "acc": 0.78140469, "epoch": 1.0305195468000408, "grad_norm": 4.9375, "learning_rate": 4.995655043780006e-06, "loss": 0.80409288, "memory(GiB)": 147.13, "step": 44170, "train_speed(iter/s)": 0.201029 }, { "acc": 0.78131447, "epoch": 1.0307528543723297, "grad_norm": 7.96875, "learning_rate": 4.993765933210592e-06, "loss": 0.79957089, "memory(GiB)": 147.13, "step": 44180, "train_speed(iter/s)": 0.201052 }, { "acc": 0.78027859, "epoch": 1.0309861619446186, "grad_norm": 11.5625, "learning_rate": 4.991876823531089e-06, "loss": 0.79265738, "memory(GiB)": 147.13, "step": 44190, "train_speed(iter/s)": 0.201075 }, { "acc": 0.77979045, "epoch": 1.0312194695169075, "grad_norm": 10.0625, "learning_rate": 4.989987715011168e-06, "loss": 0.80041943, "memory(GiB)": 147.13, "step": 44200, "train_speed(iter/s)": 0.201097 }, { "acc": 0.77964592, "epoch": 1.0314527770891964, "grad_norm": 4.09375, "learning_rate": 4.988098607920497e-06, "loss": 0.79669185, "memory(GiB)": 147.13, "step": 44210, "train_speed(iter/s)": 0.201122 }, { "acc": 0.76723943, "epoch": 1.0316860846614853, "grad_norm": 6.15625, "learning_rate": 4.986209502528746e-06, "loss": 0.83204918, "memory(GiB)": 147.13, "step": 44220, "train_speed(iter/s)": 0.201146 }, { "acc": 0.782441, "epoch": 1.0319193922337742, "grad_norm": 4.09375, "learning_rate": 4.984320399105585e-06, "loss": 0.77679338, "memory(GiB)": 147.13, "step": 44230, "train_speed(iter/s)": 0.201168 }, { "acc": 0.78049326, "epoch": 1.032152699806063, "grad_norm": 3.6875, "learning_rate": 4.982431297920682e-06, "loss": 0.77765503, "memory(GiB)": 147.13, "step": 44240, "train_speed(iter/s)": 0.201191 }, { "acc": 0.79742098, "epoch": 1.032386007378352, "grad_norm": 16.75, "learning_rate": 4.980542199243709e-06, "loss": 0.72890673, "memory(GiB)": 147.13, "step": 44250, "train_speed(iter/s)": 0.201214 }, { "acc": 0.7838129, "epoch": 1.0326193149506409, "grad_norm": 5.90625, "learning_rate": 4.978653103344328e-06, "loss": 0.79265904, "memory(GiB)": 147.13, "step": 44260, "train_speed(iter/s)": 0.201238 }, { "acc": 0.78945446, "epoch": 1.0328526225229298, "grad_norm": 6.09375, "learning_rate": 4.976764010492211e-06, "loss": 0.7581913, "memory(GiB)": 147.13, "step": 44270, "train_speed(iter/s)": 0.201261 }, { "acc": 0.77482014, "epoch": 1.0330859300952187, "grad_norm": 7.21875, "learning_rate": 4.974874920957025e-06, "loss": 0.81373167, "memory(GiB)": 147.13, "step": 44280, "train_speed(iter/s)": 0.201284 }, { "acc": 0.79341946, "epoch": 1.0333192376675076, "grad_norm": 7.125, "learning_rate": 4.972985835008437e-06, "loss": 0.75202971, "memory(GiB)": 147.13, "step": 44290, "train_speed(iter/s)": 0.201306 }, { "acc": 0.79722977, "epoch": 1.0335525452397964, "grad_norm": 5.0625, "learning_rate": 4.971096752916113e-06, "loss": 0.73516812, "memory(GiB)": 147.13, "step": 44300, "train_speed(iter/s)": 0.201329 }, { "acc": 0.79803047, "epoch": 1.0337858528120853, "grad_norm": 8.5, "learning_rate": 4.969207674949719e-06, "loss": 0.71913786, "memory(GiB)": 147.13, "step": 44310, "train_speed(iter/s)": 0.201351 }, { "acc": 0.77934179, "epoch": 1.0340191603843742, "grad_norm": 6.28125, "learning_rate": 4.96731860137892e-06, "loss": 0.8005085, "memory(GiB)": 147.13, "step": 44320, "train_speed(iter/s)": 0.201374 }, { "acc": 0.76904945, "epoch": 1.0342524679566631, "grad_norm": 5.71875, "learning_rate": 4.965429532473383e-06, "loss": 0.84740162, "memory(GiB)": 147.13, "step": 44330, "train_speed(iter/s)": 0.201398 }, { "acc": 0.78627586, "epoch": 1.034485775528952, "grad_norm": 7.21875, "learning_rate": 4.963540468502768e-06, "loss": 0.75449095, "memory(GiB)": 147.13, "step": 44340, "train_speed(iter/s)": 0.201421 }, { "acc": 0.7668519, "epoch": 1.034719083101241, "grad_norm": 6.375, "learning_rate": 4.961651409736741e-06, "loss": 0.82782583, "memory(GiB)": 147.13, "step": 44350, "train_speed(iter/s)": 0.201445 }, { "acc": 0.77557964, "epoch": 1.0349523906735298, "grad_norm": 6.5, "learning_rate": 4.959762356444964e-06, "loss": 0.81699305, "memory(GiB)": 147.13, "step": 44360, "train_speed(iter/s)": 0.201469 }, { "acc": 0.77146072, "epoch": 1.0351856982458187, "grad_norm": 6.84375, "learning_rate": 4.957873308897102e-06, "loss": 0.83753395, "memory(GiB)": 147.13, "step": 44370, "train_speed(iter/s)": 0.201493 }, { "acc": 0.77347927, "epoch": 1.0354190058181076, "grad_norm": 4.1875, "learning_rate": 4.95598426736281e-06, "loss": 0.82432508, "memory(GiB)": 147.13, "step": 44380, "train_speed(iter/s)": 0.201518 }, { "acc": 0.78271379, "epoch": 1.0356523133903965, "grad_norm": 4.5, "learning_rate": 4.954095232111751e-06, "loss": 0.80724306, "memory(GiB)": 147.13, "step": 44390, "train_speed(iter/s)": 0.201542 }, { "acc": 0.78393545, "epoch": 1.0358856209626854, "grad_norm": 7.78125, "learning_rate": 4.9522062034135845e-06, "loss": 0.76437645, "memory(GiB)": 147.13, "step": 44400, "train_speed(iter/s)": 0.201561 }, { "acc": 0.77311645, "epoch": 1.0361189285349743, "grad_norm": 5.6875, "learning_rate": 4.9503171815379695e-06, "loss": 0.82633915, "memory(GiB)": 147.13, "step": 44410, "train_speed(iter/s)": 0.201584 }, { "acc": 0.78500485, "epoch": 1.0363522361072632, "grad_norm": 5.65625, "learning_rate": 4.948428166754561e-06, "loss": 0.76627178, "memory(GiB)": 147.13, "step": 44420, "train_speed(iter/s)": 0.201605 }, { "acc": 0.77328243, "epoch": 1.0365855436795521, "grad_norm": 4.15625, "learning_rate": 4.946539159333017e-06, "loss": 0.81605453, "memory(GiB)": 147.13, "step": 44430, "train_speed(iter/s)": 0.201628 }, { "acc": 0.78427544, "epoch": 1.036818851251841, "grad_norm": 4.25, "learning_rate": 4.944650159542993e-06, "loss": 0.76791339, "memory(GiB)": 147.13, "step": 44440, "train_speed(iter/s)": 0.201651 }, { "acc": 0.77760768, "epoch": 1.03705215882413, "grad_norm": 5.4375, "learning_rate": 4.942761167654142e-06, "loss": 0.7886517, "memory(GiB)": 147.13, "step": 44450, "train_speed(iter/s)": 0.201675 }, { "acc": 0.7762989, "epoch": 1.0372854663964188, "grad_norm": 6.46875, "learning_rate": 4.940872183936118e-06, "loss": 0.82030087, "memory(GiB)": 147.13, "step": 44460, "train_speed(iter/s)": 0.201699 }, { "acc": 0.77942324, "epoch": 1.0375187739687077, "grad_norm": 5.625, "learning_rate": 4.938983208658574e-06, "loss": 0.7806747, "memory(GiB)": 147.13, "step": 44470, "train_speed(iter/s)": 0.201721 }, { "acc": 0.76620932, "epoch": 1.0377520815409964, "grad_norm": 4.5, "learning_rate": 4.937094242091158e-06, "loss": 0.83421717, "memory(GiB)": 147.13, "step": 44480, "train_speed(iter/s)": 0.201745 }, { "acc": 0.79280834, "epoch": 1.0379853891132853, "grad_norm": 5.53125, "learning_rate": 4.935205284503522e-06, "loss": 0.74198709, "memory(GiB)": 147.13, "step": 44490, "train_speed(iter/s)": 0.201768 }, { "acc": 0.79166679, "epoch": 1.0382186966855742, "grad_norm": 7.03125, "learning_rate": 4.933316336165311e-06, "loss": 0.7469903, "memory(GiB)": 147.13, "step": 44500, "train_speed(iter/s)": 0.201791 }, { "epoch": 1.0382186966855742, "eval_acc": 0.743963643415806, "eval_loss": 0.8068258166313171, "eval_runtime": 1270.5076, "eval_samples_per_second": 28.328, "eval_steps_per_second": 14.164, "step": 44500 }, { "acc": 0.78040266, "epoch": 1.038452004257863, "grad_norm": 4.4375, "learning_rate": 4.931427397346174e-06, "loss": 0.77377152, "memory(GiB)": 147.13, "step": 44510, "train_speed(iter/s)": 0.200643 }, { "acc": 0.76233931, "epoch": 1.038685311830152, "grad_norm": 5.5625, "learning_rate": 4.929538468315756e-06, "loss": 0.8748394, "memory(GiB)": 147.13, "step": 44520, "train_speed(iter/s)": 0.200668 }, { "acc": 0.77354889, "epoch": 1.038918619402441, "grad_norm": 5.25, "learning_rate": 4.927649549343701e-06, "loss": 0.81373053, "memory(GiB)": 147.13, "step": 44530, "train_speed(iter/s)": 0.20069 }, { "acc": 0.77879763, "epoch": 1.0391519269747298, "grad_norm": 4.9375, "learning_rate": 4.9257606406996525e-06, "loss": 0.78499241, "memory(GiB)": 147.13, "step": 44540, "train_speed(iter/s)": 0.200714 }, { "acc": 0.77786951, "epoch": 1.0393852345470187, "grad_norm": 4.625, "learning_rate": 4.923871742653251e-06, "loss": 0.80249662, "memory(GiB)": 147.13, "step": 44550, "train_speed(iter/s)": 0.200736 }, { "acc": 0.76491838, "epoch": 1.0396185421193076, "grad_norm": 5.5625, "learning_rate": 4.921982855474136e-06, "loss": 0.84366484, "memory(GiB)": 147.13, "step": 44560, "train_speed(iter/s)": 0.20076 }, { "acc": 0.80446472, "epoch": 1.0398518496915965, "grad_norm": 5.78125, "learning_rate": 4.9200939794319444e-06, "loss": 0.70642214, "memory(GiB)": 147.13, "step": 44570, "train_speed(iter/s)": 0.200782 }, { "acc": 0.78261833, "epoch": 1.0400851572638854, "grad_norm": 6.09375, "learning_rate": 4.918205114796315e-06, "loss": 0.7593545, "memory(GiB)": 147.13, "step": 44580, "train_speed(iter/s)": 0.200805 }, { "acc": 0.77353225, "epoch": 1.0403184648361743, "grad_norm": 6.625, "learning_rate": 4.916316261836882e-06, "loss": 0.84205828, "memory(GiB)": 147.13, "step": 44590, "train_speed(iter/s)": 0.200829 }, { "acc": 0.77843504, "epoch": 1.0405517724084632, "grad_norm": 8.75, "learning_rate": 4.91442742082328e-06, "loss": 0.79012375, "memory(GiB)": 147.13, "step": 44600, "train_speed(iter/s)": 0.200852 }, { "acc": 0.79202127, "epoch": 1.040785079980752, "grad_norm": 5.5, "learning_rate": 4.912538592025137e-06, "loss": 0.75029149, "memory(GiB)": 147.13, "step": 44610, "train_speed(iter/s)": 0.200874 }, { "acc": 0.79142704, "epoch": 1.041018387553041, "grad_norm": 4.90625, "learning_rate": 4.910649775712084e-06, "loss": 0.77472105, "memory(GiB)": 147.13, "step": 44620, "train_speed(iter/s)": 0.200896 }, { "acc": 0.79433427, "epoch": 1.04125169512533, "grad_norm": 5.4375, "learning_rate": 4.908760972153751e-06, "loss": 0.7298358, "memory(GiB)": 147.13, "step": 44630, "train_speed(iter/s)": 0.200919 }, { "acc": 0.77725754, "epoch": 1.0414850026976188, "grad_norm": 6.46875, "learning_rate": 4.9068721816197615e-06, "loss": 0.83918152, "memory(GiB)": 147.13, "step": 44640, "train_speed(iter/s)": 0.200941 }, { "acc": 0.79220066, "epoch": 1.0417183102699077, "grad_norm": 5.34375, "learning_rate": 4.904983404379741e-06, "loss": 0.7297925, "memory(GiB)": 147.13, "step": 44650, "train_speed(iter/s)": 0.200964 }, { "acc": 0.77652454, "epoch": 1.0419516178421966, "grad_norm": 4.75, "learning_rate": 4.903094640703312e-06, "loss": 0.78495092, "memory(GiB)": 147.13, "step": 44660, "train_speed(iter/s)": 0.200988 }, { "acc": 0.79377813, "epoch": 1.0421849254144855, "grad_norm": 4.9375, "learning_rate": 4.901205890860095e-06, "loss": 0.73209939, "memory(GiB)": 147.13, "step": 44670, "train_speed(iter/s)": 0.20101 }, { "acc": 0.79054551, "epoch": 1.0424182329867744, "grad_norm": 5.34375, "learning_rate": 4.899317155119708e-06, "loss": 0.7544507, "memory(GiB)": 147.13, "step": 44680, "train_speed(iter/s)": 0.201033 }, { "acc": 0.79657516, "epoch": 1.0426515405590633, "grad_norm": 4.5625, "learning_rate": 4.89742843375177e-06, "loss": 0.75656972, "memory(GiB)": 147.13, "step": 44690, "train_speed(iter/s)": 0.201056 }, { "acc": 0.78189001, "epoch": 1.0428848481313522, "grad_norm": 6.46875, "learning_rate": 4.895539727025891e-06, "loss": 0.77084103, "memory(GiB)": 147.13, "step": 44700, "train_speed(iter/s)": 0.201079 }, { "acc": 0.78460836, "epoch": 1.043118155703641, "grad_norm": 4.6875, "learning_rate": 4.8936510352116886e-06, "loss": 0.77186456, "memory(GiB)": 147.13, "step": 44710, "train_speed(iter/s)": 0.201102 }, { "acc": 0.75842123, "epoch": 1.04335146327593, "grad_norm": 7.78125, "learning_rate": 4.891762358578767e-06, "loss": 0.86604462, "memory(GiB)": 147.13, "step": 44720, "train_speed(iter/s)": 0.201126 }, { "acc": 0.78901138, "epoch": 1.0435847708482189, "grad_norm": 4.5, "learning_rate": 4.889873697396738e-06, "loss": 0.76355696, "memory(GiB)": 147.13, "step": 44730, "train_speed(iter/s)": 0.201147 }, { "acc": 0.78258648, "epoch": 1.0438180784205078, "grad_norm": 5.46875, "learning_rate": 4.887985051935206e-06, "loss": 0.77364783, "memory(GiB)": 147.13, "step": 44740, "train_speed(iter/s)": 0.201171 }, { "acc": 0.7676549, "epoch": 1.0440513859927967, "grad_norm": 5.21875, "learning_rate": 4.8860964224637756e-06, "loss": 0.83399296, "memory(GiB)": 147.13, "step": 44750, "train_speed(iter/s)": 0.201196 }, { "acc": 0.76923347, "epoch": 1.0442846935650856, "grad_norm": 3.890625, "learning_rate": 4.884207809252049e-06, "loss": 0.81878109, "memory(GiB)": 147.13, "step": 44760, "train_speed(iter/s)": 0.201219 }, { "acc": 0.76931305, "epoch": 1.0445180011373745, "grad_norm": 3.890625, "learning_rate": 4.882319212569623e-06, "loss": 0.84117594, "memory(GiB)": 147.13, "step": 44770, "train_speed(iter/s)": 0.201242 }, { "acc": 0.78014016, "epoch": 1.0447513087096634, "grad_norm": 5.15625, "learning_rate": 4.880430632686096e-06, "loss": 0.78205185, "memory(GiB)": 147.13, "step": 44780, "train_speed(iter/s)": 0.201265 }, { "acc": 0.7773912, "epoch": 1.0449846162819523, "grad_norm": 4.84375, "learning_rate": 4.87854206987106e-06, "loss": 0.80879612, "memory(GiB)": 147.13, "step": 44790, "train_speed(iter/s)": 0.201287 }, { "acc": 0.78483715, "epoch": 1.0452179238542412, "grad_norm": 4.5625, "learning_rate": 4.876653524394109e-06, "loss": 0.77698545, "memory(GiB)": 147.13, "step": 44800, "train_speed(iter/s)": 0.20131 }, { "acc": 0.76030092, "epoch": 1.04545123142653, "grad_norm": 6.0625, "learning_rate": 4.874764996524831e-06, "loss": 0.86837893, "memory(GiB)": 147.13, "step": 44810, "train_speed(iter/s)": 0.201335 }, { "acc": 0.78192863, "epoch": 1.045684538998819, "grad_norm": 9.875, "learning_rate": 4.872876486532814e-06, "loss": 0.77877698, "memory(GiB)": 147.13, "step": 44820, "train_speed(iter/s)": 0.201359 }, { "acc": 0.77943573, "epoch": 1.0459178465711079, "grad_norm": 5.1875, "learning_rate": 4.870987994687644e-06, "loss": 0.78068628, "memory(GiB)": 147.13, "step": 44830, "train_speed(iter/s)": 0.201381 }, { "acc": 0.76940956, "epoch": 1.0461511541433968, "grad_norm": 4.8125, "learning_rate": 4.869099521258897e-06, "loss": 0.83160362, "memory(GiB)": 147.13, "step": 44840, "train_speed(iter/s)": 0.201403 }, { "acc": 0.77414241, "epoch": 1.0463844617156857, "grad_norm": 6.4375, "learning_rate": 4.867211066516157e-06, "loss": 0.82083912, "memory(GiB)": 147.13, "step": 44850, "train_speed(iter/s)": 0.201428 }, { "acc": 0.80341206, "epoch": 1.0466177692879746, "grad_norm": 5.40625, "learning_rate": 4.865322630728998e-06, "loss": 0.70446091, "memory(GiB)": 147.13, "step": 44860, "train_speed(iter/s)": 0.201452 }, { "acc": 0.79915457, "epoch": 1.0468510768602632, "grad_norm": 5.59375, "learning_rate": 4.863434214166994e-06, "loss": 0.69466219, "memory(GiB)": 147.13, "step": 44870, "train_speed(iter/s)": 0.201475 }, { "acc": 0.77168198, "epoch": 1.0470843844325521, "grad_norm": 5.75, "learning_rate": 4.8615458170997166e-06, "loss": 0.82629623, "memory(GiB)": 147.13, "step": 44880, "train_speed(iter/s)": 0.201499 }, { "acc": 0.78714318, "epoch": 1.047317692004841, "grad_norm": 6.3125, "learning_rate": 4.8596574397967324e-06, "loss": 0.74908113, "memory(GiB)": 147.13, "step": 44890, "train_speed(iter/s)": 0.201522 }, { "acc": 0.79111013, "epoch": 1.04755099957713, "grad_norm": 6.3125, "learning_rate": 4.857769082527609e-06, "loss": 0.77098122, "memory(GiB)": 147.13, "step": 44900, "train_speed(iter/s)": 0.201545 }, { "acc": 0.75890102, "epoch": 1.0477843071494188, "grad_norm": 5.875, "learning_rate": 4.855880745561909e-06, "loss": 0.86920433, "memory(GiB)": 147.13, "step": 44910, "train_speed(iter/s)": 0.201569 }, { "acc": 0.78598218, "epoch": 1.0480176147217077, "grad_norm": 6.53125, "learning_rate": 4.853992429169189e-06, "loss": 0.76193895, "memory(GiB)": 147.13, "step": 44920, "train_speed(iter/s)": 0.201591 }, { "acc": 0.75896392, "epoch": 1.0482509222939966, "grad_norm": 6.5, "learning_rate": 4.852104133619008e-06, "loss": 0.88328629, "memory(GiB)": 147.13, "step": 44930, "train_speed(iter/s)": 0.201613 }, { "acc": 0.78158102, "epoch": 1.0484842298662855, "grad_norm": 8.125, "learning_rate": 4.85021585918092e-06, "loss": 0.77389903, "memory(GiB)": 147.13, "step": 44940, "train_speed(iter/s)": 0.201638 }, { "acc": 0.78255506, "epoch": 1.0487175374385744, "grad_norm": 6.21875, "learning_rate": 4.848327606124473e-06, "loss": 0.75741982, "memory(GiB)": 147.13, "step": 44950, "train_speed(iter/s)": 0.20166 }, { "acc": 0.78304787, "epoch": 1.0489508450108633, "grad_norm": 5.03125, "learning_rate": 4.846439374719217e-06, "loss": 0.79422493, "memory(GiB)": 147.13, "step": 44960, "train_speed(iter/s)": 0.201682 }, { "acc": 0.77571564, "epoch": 1.0491841525831522, "grad_norm": 4.875, "learning_rate": 4.844551165234694e-06, "loss": 0.80409365, "memory(GiB)": 147.13, "step": 44970, "train_speed(iter/s)": 0.201706 }, { "acc": 0.78164635, "epoch": 1.0494174601554411, "grad_norm": 4.65625, "learning_rate": 4.842662977940448e-06, "loss": 0.80020761, "memory(GiB)": 147.13, "step": 44980, "train_speed(iter/s)": 0.201728 }, { "acc": 0.78135223, "epoch": 1.04965076772773, "grad_norm": 5.46875, "learning_rate": 4.8407748131060175e-06, "loss": 0.76936512, "memory(GiB)": 147.13, "step": 44990, "train_speed(iter/s)": 0.201751 }, { "acc": 0.7863555, "epoch": 1.049884075300019, "grad_norm": 6.15625, "learning_rate": 4.838886671000934e-06, "loss": 0.7614399, "memory(GiB)": 147.13, "step": 45000, "train_speed(iter/s)": 0.201774 }, { "epoch": 1.049884075300019, "eval_acc": 0.7440030388969522, "eval_loss": 0.8069285750389099, "eval_runtime": 1270.5322, "eval_samples_per_second": 28.327, "eval_steps_per_second": 14.164, "step": 45000 }, { "acc": 0.77712164, "epoch": 1.0501173828723078, "grad_norm": 6.6875, "learning_rate": 4.8369985518947336e-06, "loss": 0.79156408, "memory(GiB)": 147.13, "step": 45010, "train_speed(iter/s)": 0.200639 }, { "acc": 0.78002462, "epoch": 1.0503506904445967, "grad_norm": 6.0, "learning_rate": 4.83511045605694e-06, "loss": 0.79216776, "memory(GiB)": 147.13, "step": 45020, "train_speed(iter/s)": 0.200662 }, { "acc": 0.78218923, "epoch": 1.0505839980168856, "grad_norm": 6.40625, "learning_rate": 4.8332223837570824e-06, "loss": 0.7720716, "memory(GiB)": 147.13, "step": 45030, "train_speed(iter/s)": 0.200681 }, { "acc": 0.78232145, "epoch": 1.0508173055891745, "grad_norm": 6.15625, "learning_rate": 4.831334335264682e-06, "loss": 0.77959723, "memory(GiB)": 147.13, "step": 45040, "train_speed(iter/s)": 0.200704 }, { "acc": 0.78184223, "epoch": 1.0510506131614634, "grad_norm": 5.75, "learning_rate": 4.829446310849256e-06, "loss": 0.76550894, "memory(GiB)": 147.13, "step": 45050, "train_speed(iter/s)": 0.200728 }, { "acc": 0.78313303, "epoch": 1.0512839207337523, "grad_norm": 5.9375, "learning_rate": 4.827558310780319e-06, "loss": 0.77661057, "memory(GiB)": 147.13, "step": 45060, "train_speed(iter/s)": 0.20075 }, { "acc": 0.79087114, "epoch": 1.0515172283060412, "grad_norm": 4.40625, "learning_rate": 4.825670335327383e-06, "loss": 0.74372959, "memory(GiB)": 147.13, "step": 45070, "train_speed(iter/s)": 0.200773 }, { "acc": 0.80854607, "epoch": 1.0517505358783301, "grad_norm": 4.5625, "learning_rate": 4.823782384759955e-06, "loss": 0.68139353, "memory(GiB)": 147.13, "step": 45080, "train_speed(iter/s)": 0.200796 }, { "acc": 0.78581572, "epoch": 1.051983843450619, "grad_norm": 4.9375, "learning_rate": 4.821894459347542e-06, "loss": 0.7871974, "memory(GiB)": 147.13, "step": 45090, "train_speed(iter/s)": 0.200818 }, { "acc": 0.78187127, "epoch": 1.052217151022908, "grad_norm": 9.375, "learning_rate": 4.820006559359642e-06, "loss": 0.78979106, "memory(GiB)": 147.13, "step": 45100, "train_speed(iter/s)": 0.20084 }, { "acc": 0.76723819, "epoch": 1.0524504585951968, "grad_norm": 6.875, "learning_rate": 4.818118685065754e-06, "loss": 0.84743042, "memory(GiB)": 147.13, "step": 45110, "train_speed(iter/s)": 0.200863 }, { "acc": 0.77373695, "epoch": 1.0526837661674857, "grad_norm": 9.5, "learning_rate": 4.8162308367353705e-06, "loss": 0.81677189, "memory(GiB)": 147.13, "step": 45120, "train_speed(iter/s)": 0.200887 }, { "acc": 0.78724155, "epoch": 1.0529170737397746, "grad_norm": 5.21875, "learning_rate": 4.814343014637982e-06, "loss": 0.75338268, "memory(GiB)": 147.13, "step": 45130, "train_speed(iter/s)": 0.200909 }, { "acc": 0.77668324, "epoch": 1.0531503813120635, "grad_norm": 5.625, "learning_rate": 4.812455219043074e-06, "loss": 0.80819511, "memory(GiB)": 147.13, "step": 45140, "train_speed(iter/s)": 0.200933 }, { "acc": 0.77539773, "epoch": 1.0533836888843524, "grad_norm": 4.875, "learning_rate": 4.810567450220128e-06, "loss": 0.78674135, "memory(GiB)": 147.13, "step": 45150, "train_speed(iter/s)": 0.200956 }, { "acc": 0.7868578, "epoch": 1.0536169964566413, "grad_norm": 6.34375, "learning_rate": 4.808679708438624e-06, "loss": 0.76853762, "memory(GiB)": 147.13, "step": 45160, "train_speed(iter/s)": 0.200979 }, { "acc": 0.76111031, "epoch": 1.0538503040289302, "grad_norm": 5.375, "learning_rate": 4.806791993968039e-06, "loss": 0.86612825, "memory(GiB)": 147.13, "step": 45170, "train_speed(iter/s)": 0.201001 }, { "acc": 0.78183184, "epoch": 1.054083611601219, "grad_norm": 6.9375, "learning_rate": 4.804904307077838e-06, "loss": 0.76009259, "memory(GiB)": 147.13, "step": 45180, "train_speed(iter/s)": 0.201022 }, { "acc": 0.78163671, "epoch": 1.054316919173508, "grad_norm": 7.125, "learning_rate": 4.80301664803749e-06, "loss": 0.78068633, "memory(GiB)": 147.13, "step": 45190, "train_speed(iter/s)": 0.201045 }, { "acc": 0.78807421, "epoch": 1.054550226745797, "grad_norm": 5.625, "learning_rate": 4.80112901711646e-06, "loss": 0.75582342, "memory(GiB)": 147.13, "step": 45200, "train_speed(iter/s)": 0.201068 }, { "acc": 0.77289944, "epoch": 1.0547835343180858, "grad_norm": 6.28125, "learning_rate": 4.799241414584204e-06, "loss": 0.82699385, "memory(GiB)": 147.13, "step": 45210, "train_speed(iter/s)": 0.201091 }, { "acc": 0.78645868, "epoch": 1.0550168418903747, "grad_norm": 3.78125, "learning_rate": 4.797353840710178e-06, "loss": 0.7695097, "memory(GiB)": 147.13, "step": 45220, "train_speed(iter/s)": 0.201114 }, { "acc": 0.77393785, "epoch": 1.0552501494626636, "grad_norm": 6.125, "learning_rate": 4.795466295763832e-06, "loss": 0.81475391, "memory(GiB)": 147.13, "step": 45230, "train_speed(iter/s)": 0.201135 }, { "acc": 0.78816233, "epoch": 1.0554834570349523, "grad_norm": 6.1875, "learning_rate": 4.793578780014612e-06, "loss": 0.76425867, "memory(GiB)": 147.13, "step": 45240, "train_speed(iter/s)": 0.201158 }, { "acc": 0.76952739, "epoch": 1.0557167646072414, "grad_norm": 7.21875, "learning_rate": 4.791691293731962e-06, "loss": 0.81708698, "memory(GiB)": 147.13, "step": 45250, "train_speed(iter/s)": 0.201181 }, { "acc": 0.80263548, "epoch": 1.05595007217953, "grad_norm": 4.1875, "learning_rate": 4.78980383718532e-06, "loss": 0.68140831, "memory(GiB)": 147.13, "step": 45260, "train_speed(iter/s)": 0.201206 }, { "acc": 0.78784084, "epoch": 1.056183379751819, "grad_norm": 4.78125, "learning_rate": 4.787916410644119e-06, "loss": 0.75726004, "memory(GiB)": 147.13, "step": 45270, "train_speed(iter/s)": 0.201229 }, { "acc": 0.76332178, "epoch": 1.0564166873241079, "grad_norm": 5.5625, "learning_rate": 4.786029014377789e-06, "loss": 0.82791462, "memory(GiB)": 147.13, "step": 45280, "train_speed(iter/s)": 0.201253 }, { "acc": 0.77394991, "epoch": 1.0566499948963968, "grad_norm": 7.125, "learning_rate": 4.784141648655756e-06, "loss": 0.81197491, "memory(GiB)": 147.13, "step": 45290, "train_speed(iter/s)": 0.201276 }, { "acc": 0.76897669, "epoch": 1.0568833024686857, "grad_norm": 6.375, "learning_rate": 4.782254313747438e-06, "loss": 0.82489567, "memory(GiB)": 147.13, "step": 45300, "train_speed(iter/s)": 0.201299 }, { "acc": 0.78103123, "epoch": 1.0571166100409746, "grad_norm": 4.9375, "learning_rate": 4.780367009922253e-06, "loss": 0.785322, "memory(GiB)": 147.13, "step": 45310, "train_speed(iter/s)": 0.201322 }, { "acc": 0.76720481, "epoch": 1.0573499176132635, "grad_norm": 5.4375, "learning_rate": 4.778479737449614e-06, "loss": 0.85537357, "memory(GiB)": 147.13, "step": 45320, "train_speed(iter/s)": 0.201346 }, { "acc": 0.79495482, "epoch": 1.0575832251855524, "grad_norm": 6.28125, "learning_rate": 4.7765924965989286e-06, "loss": 0.72112303, "memory(GiB)": 147.13, "step": 45330, "train_speed(iter/s)": 0.201369 }, { "acc": 0.7869606, "epoch": 1.0578165327578413, "grad_norm": 6.15625, "learning_rate": 4.7747052876396e-06, "loss": 0.75025473, "memory(GiB)": 147.13, "step": 45340, "train_speed(iter/s)": 0.201393 }, { "acc": 0.77279506, "epoch": 1.0580498403301302, "grad_norm": 4.59375, "learning_rate": 4.772818110841025e-06, "loss": 0.82170506, "memory(GiB)": 147.13, "step": 45350, "train_speed(iter/s)": 0.201418 }, { "acc": 0.7949152, "epoch": 1.058283147902419, "grad_norm": 5.375, "learning_rate": 4.7709309664726e-06, "loss": 0.73024497, "memory(GiB)": 147.13, "step": 45360, "train_speed(iter/s)": 0.201443 }, { "acc": 0.79297123, "epoch": 1.058516455474708, "grad_norm": 6.84375, "learning_rate": 4.769043854803712e-06, "loss": 0.72323437, "memory(GiB)": 147.13, "step": 45370, "train_speed(iter/s)": 0.201465 }, { "acc": 0.77630758, "epoch": 1.0587497630469969, "grad_norm": 6.03125, "learning_rate": 4.767156776103746e-06, "loss": 0.80375509, "memory(GiB)": 147.13, "step": 45380, "train_speed(iter/s)": 0.201489 }, { "acc": 0.79003696, "epoch": 1.0589830706192858, "grad_norm": 5.09375, "learning_rate": 4.765269730642083e-06, "loss": 0.73524923, "memory(GiB)": 147.13, "step": 45390, "train_speed(iter/s)": 0.201511 }, { "acc": 0.79320269, "epoch": 1.0592163781915747, "grad_norm": 4.5625, "learning_rate": 4.7633827186881e-06, "loss": 0.7441308, "memory(GiB)": 147.13, "step": 45400, "train_speed(iter/s)": 0.201535 }, { "acc": 0.77364416, "epoch": 1.0594496857638636, "grad_norm": 5.75, "learning_rate": 4.7614957405111635e-06, "loss": 0.80192499, "memory(GiB)": 147.13, "step": 45410, "train_speed(iter/s)": 0.201559 }, { "acc": 0.78601189, "epoch": 1.0596829933361525, "grad_norm": 6.78125, "learning_rate": 4.759608796380642e-06, "loss": 0.77241449, "memory(GiB)": 147.13, "step": 45420, "train_speed(iter/s)": 0.20158 }, { "acc": 0.75279102, "epoch": 1.0599163009084414, "grad_norm": 5.5, "learning_rate": 4.757721886565893e-06, "loss": 0.89964409, "memory(GiB)": 147.13, "step": 45430, "train_speed(iter/s)": 0.201603 }, { "acc": 0.76471562, "epoch": 1.0601496084807303, "grad_norm": 9.5625, "learning_rate": 4.755835011336274e-06, "loss": 0.85386238, "memory(GiB)": 147.13, "step": 45440, "train_speed(iter/s)": 0.201626 }, { "acc": 0.78351641, "epoch": 1.0603829160530192, "grad_norm": 6.0625, "learning_rate": 4.753948170961137e-06, "loss": 0.76716695, "memory(GiB)": 147.13, "step": 45450, "train_speed(iter/s)": 0.201649 }, { "acc": 0.78080029, "epoch": 1.060616223625308, "grad_norm": 4.96875, "learning_rate": 4.752061365709827e-06, "loss": 0.76200881, "memory(GiB)": 147.13, "step": 45460, "train_speed(iter/s)": 0.201672 }, { "acc": 0.7751379, "epoch": 1.060849531197597, "grad_norm": 5.84375, "learning_rate": 4.750174595851685e-06, "loss": 0.79694057, "memory(GiB)": 147.13, "step": 45470, "train_speed(iter/s)": 0.201697 }, { "acc": 0.7780129, "epoch": 1.0610828387698858, "grad_norm": 4.71875, "learning_rate": 4.748287861656047e-06, "loss": 0.80889072, "memory(GiB)": 147.13, "step": 45480, "train_speed(iter/s)": 0.20172 }, { "acc": 0.78614645, "epoch": 1.0613161463421747, "grad_norm": 5.4375, "learning_rate": 4.746401163392244e-06, "loss": 0.79196577, "memory(GiB)": 147.13, "step": 45490, "train_speed(iter/s)": 0.201743 }, { "acc": 0.80344467, "epoch": 1.0615494539144636, "grad_norm": 4.78125, "learning_rate": 4.744514501329601e-06, "loss": 0.71334877, "memory(GiB)": 147.13, "step": 45500, "train_speed(iter/s)": 0.201765 }, { "epoch": 1.0615494539144636, "eval_acc": 0.7440579683686315, "eval_loss": 0.8067649006843567, "eval_runtime": 1270.3852, "eval_samples_per_second": 28.331, "eval_steps_per_second": 14.166, "step": 45500 }, { "acc": 0.78397212, "epoch": 1.0617827614867525, "grad_norm": 5.28125, "learning_rate": 4.74262787573744e-06, "loss": 0.78342257, "memory(GiB)": 147.13, "step": 45510, "train_speed(iter/s)": 0.200643 }, { "acc": 0.78444176, "epoch": 1.0620160690590414, "grad_norm": 11.125, "learning_rate": 4.7407412868850734e-06, "loss": 0.77105117, "memory(GiB)": 147.13, "step": 45520, "train_speed(iter/s)": 0.200666 }, { "acc": 0.79020176, "epoch": 1.0622493766313303, "grad_norm": 5.9375, "learning_rate": 4.738854735041813e-06, "loss": 0.75314264, "memory(GiB)": 147.13, "step": 45530, "train_speed(iter/s)": 0.200687 }, { "acc": 0.7691103, "epoch": 1.0624826842036192, "grad_norm": 6.375, "learning_rate": 4.736968220476963e-06, "loss": 0.8225666, "memory(GiB)": 147.13, "step": 45540, "train_speed(iter/s)": 0.200709 }, { "acc": 0.77286139, "epoch": 1.0627159917759081, "grad_norm": 5.78125, "learning_rate": 4.735081743459823e-06, "loss": 0.82172585, "memory(GiB)": 147.13, "step": 45550, "train_speed(iter/s)": 0.200732 }, { "acc": 0.78380318, "epoch": 1.062949299348197, "grad_norm": 5.03125, "learning_rate": 4.733195304259689e-06, "loss": 0.75959673, "memory(GiB)": 147.13, "step": 45560, "train_speed(iter/s)": 0.200755 }, { "acc": 0.78299665, "epoch": 1.063182606920486, "grad_norm": 6.28125, "learning_rate": 4.731308903145846e-06, "loss": 0.77152033, "memory(GiB)": 147.13, "step": 45570, "train_speed(iter/s)": 0.200776 }, { "acc": 0.78575029, "epoch": 1.0634159144927748, "grad_norm": 6.75, "learning_rate": 4.729422540387579e-06, "loss": 0.75135193, "memory(GiB)": 147.13, "step": 45580, "train_speed(iter/s)": 0.200799 }, { "acc": 0.77882252, "epoch": 1.0636492220650637, "grad_norm": 4.59375, "learning_rate": 4.727536216254166e-06, "loss": 0.77502632, "memory(GiB)": 147.13, "step": 45590, "train_speed(iter/s)": 0.200821 }, { "acc": 0.78998165, "epoch": 1.0638825296373526, "grad_norm": 5.15625, "learning_rate": 4.725649931014879e-06, "loss": 0.75522022, "memory(GiB)": 147.13, "step": 45600, "train_speed(iter/s)": 0.200844 }, { "acc": 0.78265238, "epoch": 1.0641158372096415, "grad_norm": 6.28125, "learning_rate": 4.723763684938985e-06, "loss": 0.78737378, "memory(GiB)": 147.13, "step": 45610, "train_speed(iter/s)": 0.200868 }, { "acc": 0.78826542, "epoch": 1.0643491447819304, "grad_norm": 4.53125, "learning_rate": 4.721877478295745e-06, "loss": 0.77142153, "memory(GiB)": 147.13, "step": 45620, "train_speed(iter/s)": 0.200891 }, { "acc": 0.76623106, "epoch": 1.064582452354219, "grad_norm": 7.78125, "learning_rate": 4.719991311354415e-06, "loss": 0.84487782, "memory(GiB)": 147.13, "step": 45630, "train_speed(iter/s)": 0.200914 }, { "acc": 0.78780022, "epoch": 1.0648157599265082, "grad_norm": 5.65625, "learning_rate": 4.718105184384243e-06, "loss": 0.75018334, "memory(GiB)": 147.13, "step": 45640, "train_speed(iter/s)": 0.200937 }, { "acc": 0.77883091, "epoch": 1.065049067498797, "grad_norm": 6.3125, "learning_rate": 4.7162190976544735e-06, "loss": 0.80597057, "memory(GiB)": 147.13, "step": 45650, "train_speed(iter/s)": 0.20096 }, { "acc": 0.78427258, "epoch": 1.0652823750710858, "grad_norm": 7.40625, "learning_rate": 4.7143330514343446e-06, "loss": 0.78061914, "memory(GiB)": 147.13, "step": 45660, "train_speed(iter/s)": 0.200983 }, { "acc": 0.78091021, "epoch": 1.0655156826433747, "grad_norm": 5.53125, "learning_rate": 4.712447045993091e-06, "loss": 0.78055716, "memory(GiB)": 147.13, "step": 45670, "train_speed(iter/s)": 0.201006 }, { "acc": 0.75933809, "epoch": 1.0657489902156636, "grad_norm": 7.15625, "learning_rate": 4.710561081599937e-06, "loss": 0.8501317, "memory(GiB)": 147.13, "step": 45680, "train_speed(iter/s)": 0.201028 }, { "acc": 0.77189841, "epoch": 1.0659822977879525, "grad_norm": 4.21875, "learning_rate": 4.708675158524105e-06, "loss": 0.81637125, "memory(GiB)": 147.13, "step": 45690, "train_speed(iter/s)": 0.201051 }, { "acc": 0.78322716, "epoch": 1.0662156053602414, "grad_norm": 4.46875, "learning_rate": 4.706789277034811e-06, "loss": 0.78145242, "memory(GiB)": 147.13, "step": 45700, "train_speed(iter/s)": 0.201074 }, { "acc": 0.7969058, "epoch": 1.0664489129325303, "grad_norm": 4.96875, "learning_rate": 4.704903437401261e-06, "loss": 0.70635157, "memory(GiB)": 147.13, "step": 45710, "train_speed(iter/s)": 0.201097 }, { "acc": 0.77726212, "epoch": 1.0666822205048192, "grad_norm": 6.0625, "learning_rate": 4.703017639892659e-06, "loss": 0.77836952, "memory(GiB)": 147.13, "step": 45720, "train_speed(iter/s)": 0.20112 }, { "acc": 0.77621179, "epoch": 1.066915528077108, "grad_norm": 6.3125, "learning_rate": 4.701131884778204e-06, "loss": 0.80525446, "memory(GiB)": 147.13, "step": 45730, "train_speed(iter/s)": 0.201144 }, { "acc": 0.77437353, "epoch": 1.067148835649397, "grad_norm": 9.125, "learning_rate": 4.699246172327087e-06, "loss": 0.79244003, "memory(GiB)": 147.13, "step": 45740, "train_speed(iter/s)": 0.201167 }, { "acc": 0.77285023, "epoch": 1.067382143221686, "grad_norm": 19.625, "learning_rate": 4.697360502808488e-06, "loss": 0.83040085, "memory(GiB)": 147.13, "step": 45750, "train_speed(iter/s)": 0.201191 }, { "acc": 0.7729352, "epoch": 1.0676154507939748, "grad_norm": 6.96875, "learning_rate": 4.695474876491592e-06, "loss": 0.84784021, "memory(GiB)": 147.13, "step": 45760, "train_speed(iter/s)": 0.201213 }, { "acc": 0.77671528, "epoch": 1.0678487583662637, "grad_norm": 6.15625, "learning_rate": 4.6935892936455664e-06, "loss": 0.78750763, "memory(GiB)": 147.13, "step": 45770, "train_speed(iter/s)": 0.201238 }, { "acc": 0.77845559, "epoch": 1.0680820659385526, "grad_norm": 5.90625, "learning_rate": 4.691703754539583e-06, "loss": 0.81025581, "memory(GiB)": 147.13, "step": 45780, "train_speed(iter/s)": 0.20126 }, { "acc": 0.79248986, "epoch": 1.0683153735108415, "grad_norm": 6.1875, "learning_rate": 4.689818259442797e-06, "loss": 0.75349188, "memory(GiB)": 147.13, "step": 45790, "train_speed(iter/s)": 0.201284 }, { "acc": 0.77852826, "epoch": 1.0685486810831304, "grad_norm": 9.5625, "learning_rate": 4.687932808624365e-06, "loss": 0.79906149, "memory(GiB)": 147.13, "step": 45800, "train_speed(iter/s)": 0.201307 }, { "acc": 0.78552589, "epoch": 1.0687819886554193, "grad_norm": 4.46875, "learning_rate": 4.686047402353433e-06, "loss": 0.74826708, "memory(GiB)": 147.13, "step": 45810, "train_speed(iter/s)": 0.201331 }, { "acc": 0.7994451, "epoch": 1.0690152962277082, "grad_norm": 3.953125, "learning_rate": 4.684162040899144e-06, "loss": 0.70688953, "memory(GiB)": 147.13, "step": 45820, "train_speed(iter/s)": 0.201354 }, { "acc": 0.78053541, "epoch": 1.069248603799997, "grad_norm": 5.09375, "learning_rate": 4.682276724530633e-06, "loss": 0.79235182, "memory(GiB)": 147.13, "step": 45830, "train_speed(iter/s)": 0.201377 }, { "acc": 0.77587891, "epoch": 1.069481911372286, "grad_norm": 4.6875, "learning_rate": 4.680391453517026e-06, "loss": 0.81508999, "memory(GiB)": 147.13, "step": 45840, "train_speed(iter/s)": 0.201399 }, { "acc": 0.79230328, "epoch": 1.0697152189445749, "grad_norm": 7.84375, "learning_rate": 4.678506228127447e-06, "loss": 0.7369689, "memory(GiB)": 147.13, "step": 45850, "train_speed(iter/s)": 0.201421 }, { "acc": 0.78080015, "epoch": 1.0699485265168638, "grad_norm": 5.6875, "learning_rate": 4.67662104863101e-06, "loss": 0.80230227, "memory(GiB)": 147.13, "step": 45860, "train_speed(iter/s)": 0.201444 }, { "acc": 0.79010229, "epoch": 1.0701818340891527, "grad_norm": 5.09375, "learning_rate": 4.674735915296824e-06, "loss": 0.75055971, "memory(GiB)": 147.13, "step": 45870, "train_speed(iter/s)": 0.201467 }, { "acc": 0.75482688, "epoch": 1.0704151416614416, "grad_norm": 6.71875, "learning_rate": 4.672850828393992e-06, "loss": 0.89350119, "memory(GiB)": 147.13, "step": 45880, "train_speed(iter/s)": 0.20149 }, { "acc": 0.76217985, "epoch": 1.0706484492337305, "grad_norm": 5.5625, "learning_rate": 4.670965788191609e-06, "loss": 0.86493149, "memory(GiB)": 147.13, "step": 45890, "train_speed(iter/s)": 0.201512 }, { "acc": 0.7592453, "epoch": 1.0708817568060194, "grad_norm": 5.46875, "learning_rate": 4.669080794958764e-06, "loss": 0.85552521, "memory(GiB)": 147.13, "step": 45900, "train_speed(iter/s)": 0.201532 }, { "acc": 0.76834116, "epoch": 1.0711150643783083, "grad_norm": 4.40625, "learning_rate": 4.6671958489645394e-06, "loss": 0.85116701, "memory(GiB)": 147.13, "step": 45910, "train_speed(iter/s)": 0.201556 }, { "acc": 0.79082565, "epoch": 1.0713483719505972, "grad_norm": 6.0625, "learning_rate": 4.665310950478011e-06, "loss": 0.74479675, "memory(GiB)": 147.13, "step": 45920, "train_speed(iter/s)": 0.201579 }, { "acc": 0.7921772, "epoch": 1.071581679522886, "grad_norm": 5.5625, "learning_rate": 4.663426099768247e-06, "loss": 0.73003244, "memory(GiB)": 147.13, "step": 45930, "train_speed(iter/s)": 0.201601 }, { "acc": 0.76949821, "epoch": 1.071814987095175, "grad_norm": 5.375, "learning_rate": 4.661541297104309e-06, "loss": 0.83000374, "memory(GiB)": 147.13, "step": 45940, "train_speed(iter/s)": 0.201623 }, { "acc": 0.77961545, "epoch": 1.0720482946674639, "grad_norm": 6.34375, "learning_rate": 4.659656542755253e-06, "loss": 0.76818419, "memory(GiB)": 147.13, "step": 45950, "train_speed(iter/s)": 0.201647 }, { "acc": 0.78886552, "epoch": 1.0722816022397528, "grad_norm": 7.03125, "learning_rate": 4.657771836990127e-06, "loss": 0.73285408, "memory(GiB)": 147.13, "step": 45960, "train_speed(iter/s)": 0.20167 }, { "acc": 0.77267704, "epoch": 1.0725149098120417, "grad_norm": 5.125, "learning_rate": 4.655887180077973e-06, "loss": 0.80965977, "memory(GiB)": 147.13, "step": 45970, "train_speed(iter/s)": 0.201693 }, { "acc": 0.7777483, "epoch": 1.0727482173843306, "grad_norm": 5.625, "learning_rate": 4.654002572287822e-06, "loss": 0.79634504, "memory(GiB)": 147.13, "step": 45980, "train_speed(iter/s)": 0.201714 }, { "acc": 0.7671217, "epoch": 1.0729815249566195, "grad_norm": 5.125, "learning_rate": 4.652118013888704e-06, "loss": 0.84396877, "memory(GiB)": 147.13, "step": 45990, "train_speed(iter/s)": 0.201736 }, { "acc": 0.7591877, "epoch": 1.0732148325289084, "grad_norm": 6.5625, "learning_rate": 4.650233505149639e-06, "loss": 0.86070147, "memory(GiB)": 147.13, "step": 46000, "train_speed(iter/s)": 0.201761 }, { "epoch": 1.0732148325289084, "eval_acc": 0.7440973638497776, "eval_loss": 0.8066761493682861, "eval_runtime": 1270.8927, "eval_samples_per_second": 28.319, "eval_steps_per_second": 14.16, "step": 46000 }, { "acc": 0.77020998, "epoch": 1.0734481401011973, "grad_norm": 5.5, "learning_rate": 4.648349046339639e-06, "loss": 0.83288393, "memory(GiB)": 147.13, "step": 46010, "train_speed(iter/s)": 0.20065 }, { "acc": 0.78606977, "epoch": 1.073681447673486, "grad_norm": 5.6875, "learning_rate": 4.64646463772771e-06, "loss": 0.76476479, "memory(GiB)": 147.13, "step": 46020, "train_speed(iter/s)": 0.200672 }, { "acc": 0.79527607, "epoch": 1.0739147552457748, "grad_norm": 9.0625, "learning_rate": 4.6445802795828515e-06, "loss": 0.72505322, "memory(GiB)": 147.13, "step": 46030, "train_speed(iter/s)": 0.200694 }, { "acc": 0.77043562, "epoch": 1.0741480628180637, "grad_norm": 5.59375, "learning_rate": 4.642695972174055e-06, "loss": 0.81447153, "memory(GiB)": 147.13, "step": 46040, "train_speed(iter/s)": 0.200717 }, { "acc": 0.78194885, "epoch": 1.0743813703903526, "grad_norm": 4.84375, "learning_rate": 4.640811715770305e-06, "loss": 0.78343391, "memory(GiB)": 147.13, "step": 46050, "train_speed(iter/s)": 0.200739 }, { "acc": 0.78372726, "epoch": 1.0746146779626415, "grad_norm": 4.78125, "learning_rate": 4.638927510640578e-06, "loss": 0.76459522, "memory(GiB)": 147.13, "step": 46060, "train_speed(iter/s)": 0.200761 }, { "acc": 0.79502363, "epoch": 1.0748479855349304, "grad_norm": 5.28125, "learning_rate": 4.637043357053844e-06, "loss": 0.72123518, "memory(GiB)": 147.13, "step": 46070, "train_speed(iter/s)": 0.200781 }, { "acc": 0.78148775, "epoch": 1.0750812931072193, "grad_norm": 6.90625, "learning_rate": 4.635159255279066e-06, "loss": 0.76372013, "memory(GiB)": 147.13, "step": 46080, "train_speed(iter/s)": 0.200803 }, { "acc": 0.7984148, "epoch": 1.0753146006795082, "grad_norm": 4.1875, "learning_rate": 4.633275205585198e-06, "loss": 0.69609632, "memory(GiB)": 147.13, "step": 46090, "train_speed(iter/s)": 0.200823 }, { "acc": 0.77495604, "epoch": 1.0755479082517971, "grad_norm": 6.375, "learning_rate": 4.631391208241187e-06, "loss": 0.80969296, "memory(GiB)": 147.13, "step": 46100, "train_speed(iter/s)": 0.200846 }, { "acc": 0.7904788, "epoch": 1.075781215824086, "grad_norm": 4.96875, "learning_rate": 4.6295072635159744e-06, "loss": 0.74659705, "memory(GiB)": 147.13, "step": 46110, "train_speed(iter/s)": 0.200868 }, { "acc": 0.77688103, "epoch": 1.076014523396375, "grad_norm": 4.75, "learning_rate": 4.627623371678492e-06, "loss": 0.8099329, "memory(GiB)": 147.13, "step": 46120, "train_speed(iter/s)": 0.200891 }, { "acc": 0.78068047, "epoch": 1.0762478309686638, "grad_norm": 4.375, "learning_rate": 4.625739532997665e-06, "loss": 0.7743185, "memory(GiB)": 147.13, "step": 46130, "train_speed(iter/s)": 0.200914 }, { "acc": 0.79984784, "epoch": 1.0764811385409527, "grad_norm": 5.28125, "learning_rate": 4.623855747742412e-06, "loss": 0.69627056, "memory(GiB)": 147.13, "step": 46140, "train_speed(iter/s)": 0.200936 }, { "acc": 0.77657719, "epoch": 1.0767144461132416, "grad_norm": 5.84375, "learning_rate": 4.62197201618164e-06, "loss": 0.78484745, "memory(GiB)": 147.13, "step": 46150, "train_speed(iter/s)": 0.200958 }, { "acc": 0.78479581, "epoch": 1.0769477536855305, "grad_norm": 4.75, "learning_rate": 4.620088338584254e-06, "loss": 0.76777401, "memory(GiB)": 147.13, "step": 46160, "train_speed(iter/s)": 0.200981 }, { "acc": 0.75793247, "epoch": 1.0771810612578194, "grad_norm": 5.03125, "learning_rate": 4.618204715219147e-06, "loss": 0.87096195, "memory(GiB)": 147.13, "step": 46170, "train_speed(iter/s)": 0.201004 }, { "acc": 0.78456974, "epoch": 1.0774143688301083, "grad_norm": 5.5, "learning_rate": 4.616321146355206e-06, "loss": 0.77649322, "memory(GiB)": 147.13, "step": 46180, "train_speed(iter/s)": 0.201026 }, { "acc": 0.76846437, "epoch": 1.0776476764023972, "grad_norm": 5.78125, "learning_rate": 4.614437632261311e-06, "loss": 0.82811508, "memory(GiB)": 147.13, "step": 46190, "train_speed(iter/s)": 0.201049 }, { "acc": 0.77970672, "epoch": 1.0778809839746861, "grad_norm": 10.0625, "learning_rate": 4.6125541732063315e-06, "loss": 0.80484219, "memory(GiB)": 147.13, "step": 46200, "train_speed(iter/s)": 0.20107 }, { "acc": 0.78650169, "epoch": 1.078114291546975, "grad_norm": 4.6875, "learning_rate": 4.6106707694591324e-06, "loss": 0.78840094, "memory(GiB)": 147.13, "step": 46210, "train_speed(iter/s)": 0.201093 }, { "acc": 0.77818255, "epoch": 1.078347599119264, "grad_norm": 5.28125, "learning_rate": 4.608787421288566e-06, "loss": 0.79254904, "memory(GiB)": 147.13, "step": 46220, "train_speed(iter/s)": 0.201118 }, { "acc": 0.77555122, "epoch": 1.0785809066915528, "grad_norm": 8.5, "learning_rate": 4.606904128963482e-06, "loss": 0.80696802, "memory(GiB)": 147.13, "step": 46230, "train_speed(iter/s)": 0.201142 }, { "acc": 0.77328091, "epoch": 1.0788142142638417, "grad_norm": 5.75, "learning_rate": 4.605020892752718e-06, "loss": 0.81441078, "memory(GiB)": 147.13, "step": 46240, "train_speed(iter/s)": 0.201163 }, { "acc": 0.7695261, "epoch": 1.0790475218361306, "grad_norm": 4.625, "learning_rate": 4.603137712925108e-06, "loss": 0.82396679, "memory(GiB)": 147.13, "step": 46250, "train_speed(iter/s)": 0.201185 }, { "acc": 0.77598724, "epoch": 1.0792808294084195, "grad_norm": 5.59375, "learning_rate": 4.601254589749474e-06, "loss": 0.8179862, "memory(GiB)": 147.13, "step": 46260, "train_speed(iter/s)": 0.201206 }, { "acc": 0.78526249, "epoch": 1.0795141369807084, "grad_norm": 5.4375, "learning_rate": 4.599371523494632e-06, "loss": 0.77546606, "memory(GiB)": 147.13, "step": 46270, "train_speed(iter/s)": 0.201228 }, { "acc": 0.78726249, "epoch": 1.0797474445529973, "grad_norm": 4.15625, "learning_rate": 4.597488514429388e-06, "loss": 0.75475173, "memory(GiB)": 147.13, "step": 46280, "train_speed(iter/s)": 0.20125 }, { "acc": 0.78735552, "epoch": 1.0799807521252862, "grad_norm": 5.21875, "learning_rate": 4.595605562822542e-06, "loss": 0.75245075, "memory(GiB)": 147.13, "step": 46290, "train_speed(iter/s)": 0.201272 }, { "acc": 0.7679944, "epoch": 1.0802140596975751, "grad_norm": 6.5, "learning_rate": 4.593722668942884e-06, "loss": 0.83475809, "memory(GiB)": 147.13, "step": 46300, "train_speed(iter/s)": 0.201295 }, { "acc": 0.78632035, "epoch": 1.080447367269864, "grad_norm": 5.8125, "learning_rate": 4.5918398330592e-06, "loss": 0.75767183, "memory(GiB)": 147.13, "step": 46310, "train_speed(iter/s)": 0.201317 }, { "acc": 0.7837204, "epoch": 1.080680674842153, "grad_norm": 8.5625, "learning_rate": 4.589957055440259e-06, "loss": 0.77630086, "memory(GiB)": 147.13, "step": 46320, "train_speed(iter/s)": 0.20134 }, { "acc": 0.77561617, "epoch": 1.0809139824144418, "grad_norm": 6.125, "learning_rate": 4.588074336354828e-06, "loss": 0.80536242, "memory(GiB)": 147.13, "step": 46330, "train_speed(iter/s)": 0.201363 }, { "acc": 0.77097578, "epoch": 1.0811472899867307, "grad_norm": 4.96875, "learning_rate": 4.586191676071666e-06, "loss": 0.83300571, "memory(GiB)": 147.13, "step": 46340, "train_speed(iter/s)": 0.201385 }, { "acc": 0.78117247, "epoch": 1.0813805975590196, "grad_norm": 5.21875, "learning_rate": 4.584309074859524e-06, "loss": 0.80158958, "memory(GiB)": 147.13, "step": 46350, "train_speed(iter/s)": 0.201407 }, { "acc": 0.78816423, "epoch": 1.0816139051313085, "grad_norm": 4.625, "learning_rate": 4.5824265329871395e-06, "loss": 0.7538969, "memory(GiB)": 147.13, "step": 46360, "train_speed(iter/s)": 0.201429 }, { "acc": 0.77398424, "epoch": 1.0818472127035974, "grad_norm": 5.90625, "learning_rate": 4.580544050723246e-06, "loss": 0.8361598, "memory(GiB)": 147.13, "step": 46370, "train_speed(iter/s)": 0.201451 }, { "acc": 0.79671211, "epoch": 1.0820805202758863, "grad_norm": 5.40625, "learning_rate": 4.578661628336567e-06, "loss": 0.73220673, "memory(GiB)": 147.13, "step": 46380, "train_speed(iter/s)": 0.201472 }, { "acc": 0.7743166, "epoch": 1.082313827848175, "grad_norm": 4.96875, "learning_rate": 4.576779266095818e-06, "loss": 0.8441576, "memory(GiB)": 147.13, "step": 46390, "train_speed(iter/s)": 0.201494 }, { "acc": 0.76517391, "epoch": 1.082547135420464, "grad_norm": 6.0, "learning_rate": 4.574896964269707e-06, "loss": 0.84825592, "memory(GiB)": 147.13, "step": 46400, "train_speed(iter/s)": 0.201518 }, { "acc": 0.80378437, "epoch": 1.0827804429927528, "grad_norm": 4.28125, "learning_rate": 4.573014723126931e-06, "loss": 0.70220609, "memory(GiB)": 147.13, "step": 46410, "train_speed(iter/s)": 0.201541 }, { "acc": 0.78200688, "epoch": 1.0830137505650417, "grad_norm": 3.625, "learning_rate": 4.571132542936179e-06, "loss": 0.79165678, "memory(GiB)": 147.13, "step": 46420, "train_speed(iter/s)": 0.201563 }, { "acc": 0.79855556, "epoch": 1.0832470581373306, "grad_norm": 4.78125, "learning_rate": 4.569250423966132e-06, "loss": 0.72647519, "memory(GiB)": 147.13, "step": 46430, "train_speed(iter/s)": 0.201586 }, { "acc": 0.77577281, "epoch": 1.0834803657096195, "grad_norm": 4.875, "learning_rate": 4.567368366485462e-06, "loss": 0.7856792, "memory(GiB)": 147.13, "step": 46440, "train_speed(iter/s)": 0.201609 }, { "acc": 0.77464085, "epoch": 1.0837136732819084, "grad_norm": 5.59375, "learning_rate": 4.56548637076283e-06, "loss": 0.80171261, "memory(GiB)": 147.13, "step": 46450, "train_speed(iter/s)": 0.201632 }, { "acc": 0.79902134, "epoch": 1.0839469808541973, "grad_norm": 5.875, "learning_rate": 4.563604437066894e-06, "loss": 0.714886, "memory(GiB)": 147.13, "step": 46460, "train_speed(iter/s)": 0.201654 }, { "acc": 0.78830729, "epoch": 1.0841802884264862, "grad_norm": 4.71875, "learning_rate": 4.561722565666298e-06, "loss": 0.74691525, "memory(GiB)": 147.13, "step": 46470, "train_speed(iter/s)": 0.201676 }, { "acc": 0.77470942, "epoch": 1.084413595998775, "grad_norm": 5.34375, "learning_rate": 4.559840756829677e-06, "loss": 0.80980682, "memory(GiB)": 147.13, "step": 46480, "train_speed(iter/s)": 0.201698 }, { "acc": 0.79907804, "epoch": 1.084646903571064, "grad_norm": 4.375, "learning_rate": 4.557959010825662e-06, "loss": 0.70692854, "memory(GiB)": 147.13, "step": 46490, "train_speed(iter/s)": 0.201721 }, { "acc": 0.76710625, "epoch": 1.0848802111433529, "grad_norm": 5.40625, "learning_rate": 4.5560773279228686e-06, "loss": 0.82628555, "memory(GiB)": 147.13, "step": 46500, "train_speed(iter/s)": 0.201745 }, { "epoch": 1.0848802111433529, "eval_acc": 0.7440590893782576, "eval_loss": 0.8066306710243225, "eval_runtime": 1271.0341, "eval_samples_per_second": 28.316, "eval_steps_per_second": 14.159, "step": 46500 }, { "acc": 0.76087437, "epoch": 1.0851135187156418, "grad_norm": 7.875, "learning_rate": 4.5541957083899075e-06, "loss": 0.90284786, "memory(GiB)": 147.13, "step": 46510, "train_speed(iter/s)": 0.200645 }, { "acc": 0.78702526, "epoch": 1.0853468262879307, "grad_norm": 7.34375, "learning_rate": 4.55231415249538e-06, "loss": 0.76634626, "memory(GiB)": 147.13, "step": 46520, "train_speed(iter/s)": 0.200665 }, { "acc": 0.75945778, "epoch": 1.0855801338602196, "grad_norm": 5.5, "learning_rate": 4.550432660507877e-06, "loss": 0.8995718, "memory(GiB)": 147.13, "step": 46530, "train_speed(iter/s)": 0.200686 }, { "acc": 0.78146367, "epoch": 1.0858134414325085, "grad_norm": 5.5625, "learning_rate": 4.548551232695983e-06, "loss": 0.78113079, "memory(GiB)": 147.13, "step": 46540, "train_speed(iter/s)": 0.20071 }, { "acc": 0.77994041, "epoch": 1.0860467490047974, "grad_norm": 5.125, "learning_rate": 4.5466698693282675e-06, "loss": 0.79309092, "memory(GiB)": 147.13, "step": 46550, "train_speed(iter/s)": 0.200731 }, { "acc": 0.79029708, "epoch": 1.0862800565770863, "grad_norm": 6.3125, "learning_rate": 4.544788570673296e-06, "loss": 0.76324611, "memory(GiB)": 147.13, "step": 46560, "train_speed(iter/s)": 0.200754 }, { "acc": 0.77777467, "epoch": 1.0865133641493752, "grad_norm": 14.5625, "learning_rate": 4.542907336999625e-06, "loss": 0.81118917, "memory(GiB)": 147.13, "step": 46570, "train_speed(iter/s)": 0.200776 }, { "acc": 0.78099852, "epoch": 1.086746671721664, "grad_norm": 5.34375, "learning_rate": 4.541026168575798e-06, "loss": 0.76369138, "memory(GiB)": 147.13, "step": 46580, "train_speed(iter/s)": 0.200799 }, { "acc": 0.79265385, "epoch": 1.086979979293953, "grad_norm": 4.53125, "learning_rate": 4.539145065670353e-06, "loss": 0.74946361, "memory(GiB)": 147.13, "step": 46590, "train_speed(iter/s)": 0.200822 }, { "acc": 0.7763402, "epoch": 1.0872132868662419, "grad_norm": 7.15625, "learning_rate": 4.537264028551814e-06, "loss": 0.81608562, "memory(GiB)": 147.13, "step": 46600, "train_speed(iter/s)": 0.200843 }, { "acc": 0.79085789, "epoch": 1.0874465944385308, "grad_norm": 6.6875, "learning_rate": 4.535383057488702e-06, "loss": 0.75591068, "memory(GiB)": 147.13, "step": 46610, "train_speed(iter/s)": 0.200865 }, { "acc": 0.78365879, "epoch": 1.0876799020108197, "grad_norm": 5.75, "learning_rate": 4.533502152749523e-06, "loss": 0.78133993, "memory(GiB)": 147.13, "step": 46620, "train_speed(iter/s)": 0.200887 }, { "acc": 0.81641464, "epoch": 1.0879132095831086, "grad_norm": 4.9375, "learning_rate": 4.531621314602777e-06, "loss": 0.6454319, "memory(GiB)": 147.13, "step": 46630, "train_speed(iter/s)": 0.20091 }, { "acc": 0.7887289, "epoch": 1.0881465171553975, "grad_norm": 6.1875, "learning_rate": 4.529740543316952e-06, "loss": 0.75883055, "memory(GiB)": 147.13, "step": 46640, "train_speed(iter/s)": 0.200931 }, { "acc": 0.76232872, "epoch": 1.0883798247276864, "grad_norm": 4.0, "learning_rate": 4.52785983916053e-06, "loss": 0.8628355, "memory(GiB)": 147.13, "step": 46650, "train_speed(iter/s)": 0.200952 }, { "acc": 0.76253319, "epoch": 1.0886131322999753, "grad_norm": 6.25, "learning_rate": 4.525979202401976e-06, "loss": 0.85835104, "memory(GiB)": 147.13, "step": 46660, "train_speed(iter/s)": 0.200974 }, { "acc": 0.78178864, "epoch": 1.0888464398722641, "grad_norm": 5.8125, "learning_rate": 4.524098633309753e-06, "loss": 0.79391246, "memory(GiB)": 147.13, "step": 46670, "train_speed(iter/s)": 0.200996 }, { "acc": 0.7927947, "epoch": 1.089079747444553, "grad_norm": 8.4375, "learning_rate": 4.522218132152313e-06, "loss": 0.73775425, "memory(GiB)": 147.13, "step": 46680, "train_speed(iter/s)": 0.201018 }, { "acc": 0.77629023, "epoch": 1.089313055016842, "grad_norm": 5.25, "learning_rate": 4.520337699198095e-06, "loss": 0.80242271, "memory(GiB)": 147.13, "step": 46690, "train_speed(iter/s)": 0.201041 }, { "acc": 0.79058509, "epoch": 1.0895463625891308, "grad_norm": 4.4375, "learning_rate": 4.5184573347155316e-06, "loss": 0.73654985, "memory(GiB)": 147.13, "step": 46700, "train_speed(iter/s)": 0.201065 }, { "acc": 0.77961817, "epoch": 1.0897796701614197, "grad_norm": 6.5, "learning_rate": 4.516577038973044e-06, "loss": 0.7927855, "memory(GiB)": 147.13, "step": 46710, "train_speed(iter/s)": 0.201088 }, { "acc": 0.78442645, "epoch": 1.0900129777337086, "grad_norm": 5.34375, "learning_rate": 4.514696812239043e-06, "loss": 0.78609529, "memory(GiB)": 147.13, "step": 46720, "train_speed(iter/s)": 0.20111 }, { "acc": 0.79100533, "epoch": 1.0902462853059975, "grad_norm": 8.9375, "learning_rate": 4.512816654781931e-06, "loss": 0.74049482, "memory(GiB)": 147.13, "step": 46730, "train_speed(iter/s)": 0.201132 }, { "acc": 0.78928852, "epoch": 1.0904795928782864, "grad_norm": 8.875, "learning_rate": 4.5109365668701e-06, "loss": 0.73953748, "memory(GiB)": 147.13, "step": 46740, "train_speed(iter/s)": 0.201155 }, { "acc": 0.78580799, "epoch": 1.0907129004505753, "grad_norm": 6.53125, "learning_rate": 4.5090565487719326e-06, "loss": 0.75008278, "memory(GiB)": 147.13, "step": 46750, "train_speed(iter/s)": 0.201177 }, { "acc": 0.77508364, "epoch": 1.0909462080228642, "grad_norm": 5.125, "learning_rate": 4.5071766007558e-06, "loss": 0.81171427, "memory(GiB)": 147.13, "step": 46760, "train_speed(iter/s)": 0.2012 }, { "acc": 0.78406925, "epoch": 1.0911795155951531, "grad_norm": 5.96875, "learning_rate": 4.505296723090066e-06, "loss": 0.7646771, "memory(GiB)": 147.13, "step": 46770, "train_speed(iter/s)": 0.201222 }, { "acc": 0.78110132, "epoch": 1.0914128231674418, "grad_norm": 5.6875, "learning_rate": 4.503416916043079e-06, "loss": 0.76935401, "memory(GiB)": 147.13, "step": 46780, "train_speed(iter/s)": 0.201242 }, { "acc": 0.79187145, "epoch": 1.091646130739731, "grad_norm": 4.8125, "learning_rate": 4.501537179883184e-06, "loss": 0.74730501, "memory(GiB)": 147.13, "step": 46790, "train_speed(iter/s)": 0.201264 }, { "acc": 0.78256168, "epoch": 1.0918794383120196, "grad_norm": 4.96875, "learning_rate": 4.499657514878711e-06, "loss": 0.77862916, "memory(GiB)": 147.13, "step": 46800, "train_speed(iter/s)": 0.201287 }, { "acc": 0.77598977, "epoch": 1.0921127458843085, "grad_norm": 6.75, "learning_rate": 4.497777921297983e-06, "loss": 0.8359479, "memory(GiB)": 147.13, "step": 46810, "train_speed(iter/s)": 0.201309 }, { "acc": 0.77790146, "epoch": 1.0923460534565974, "grad_norm": 5.96875, "learning_rate": 4.49589839940931e-06, "loss": 0.81699162, "memory(GiB)": 147.13, "step": 46820, "train_speed(iter/s)": 0.201332 }, { "acc": 0.78884344, "epoch": 1.0925793610288863, "grad_norm": 4.9375, "learning_rate": 4.494018949480994e-06, "loss": 0.74211559, "memory(GiB)": 147.13, "step": 46830, "train_speed(iter/s)": 0.201354 }, { "acc": 0.77702522, "epoch": 1.0928126686011752, "grad_norm": 6.0625, "learning_rate": 4.492139571781328e-06, "loss": 0.79819546, "memory(GiB)": 147.13, "step": 46840, "train_speed(iter/s)": 0.201376 }, { "acc": 0.77434607, "epoch": 1.093045976173464, "grad_norm": 5.3125, "learning_rate": 4.490260266578589e-06, "loss": 0.81396513, "memory(GiB)": 147.13, "step": 46850, "train_speed(iter/s)": 0.201398 }, { "acc": 0.79454117, "epoch": 1.093279283745753, "grad_norm": 6.28125, "learning_rate": 4.4883810341410485e-06, "loss": 0.74300327, "memory(GiB)": 147.13, "step": 46860, "train_speed(iter/s)": 0.20142 }, { "acc": 0.77728949, "epoch": 1.093512591318042, "grad_norm": 4.96875, "learning_rate": 4.486501874736967e-06, "loss": 0.80336361, "memory(GiB)": 147.13, "step": 46870, "train_speed(iter/s)": 0.201442 }, { "acc": 0.79518509, "epoch": 1.0937458988903308, "grad_norm": 5.3125, "learning_rate": 4.484622788634596e-06, "loss": 0.7204237, "memory(GiB)": 147.13, "step": 46880, "train_speed(iter/s)": 0.201465 }, { "acc": 0.78115616, "epoch": 1.0939792064626197, "grad_norm": 5.8125, "learning_rate": 4.48274377610217e-06, "loss": 0.78942556, "memory(GiB)": 147.13, "step": 46890, "train_speed(iter/s)": 0.201487 }, { "acc": 0.78453555, "epoch": 1.0942125140349086, "grad_norm": 4.6875, "learning_rate": 4.480864837407919e-06, "loss": 0.76847844, "memory(GiB)": 147.13, "step": 46900, "train_speed(iter/s)": 0.201509 }, { "acc": 0.80208855, "epoch": 1.0944458216071975, "grad_norm": 5.125, "learning_rate": 4.478985972820063e-06, "loss": 0.69647503, "memory(GiB)": 147.13, "step": 46910, "train_speed(iter/s)": 0.20153 }, { "acc": 0.77438059, "epoch": 1.0946791291794864, "grad_norm": 5.25, "learning_rate": 4.477107182606807e-06, "loss": 0.82461748, "memory(GiB)": 147.13, "step": 46920, "train_speed(iter/s)": 0.201553 }, { "acc": 0.7742578, "epoch": 1.0949124367517753, "grad_norm": 6.0625, "learning_rate": 4.4752284670363495e-06, "loss": 0.80661144, "memory(GiB)": 147.13, "step": 46930, "train_speed(iter/s)": 0.201575 }, { "acc": 0.78175735, "epoch": 1.0951457443240642, "grad_norm": 5.75, "learning_rate": 4.473349826376876e-06, "loss": 0.80005064, "memory(GiB)": 147.13, "step": 46940, "train_speed(iter/s)": 0.201597 }, { "acc": 0.78072362, "epoch": 1.095379051896353, "grad_norm": 6.78125, "learning_rate": 4.471471260896561e-06, "loss": 0.80625315, "memory(GiB)": 147.13, "step": 46950, "train_speed(iter/s)": 0.20162 }, { "acc": 0.78413506, "epoch": 1.095612359468642, "grad_norm": 4.4375, "learning_rate": 4.46959277086357e-06, "loss": 0.76228724, "memory(GiB)": 147.13, "step": 46960, "train_speed(iter/s)": 0.201642 }, { "acc": 0.7963975, "epoch": 1.095845667040931, "grad_norm": 5.03125, "learning_rate": 4.467714356546057e-06, "loss": 0.71428413, "memory(GiB)": 147.13, "step": 46970, "train_speed(iter/s)": 0.201664 }, { "acc": 0.79295254, "epoch": 1.0960789746132198, "grad_norm": 5.09375, "learning_rate": 4.465836018212166e-06, "loss": 0.72961626, "memory(GiB)": 147.13, "step": 46980, "train_speed(iter/s)": 0.201686 }, { "acc": 0.76461864, "epoch": 1.0963122821855087, "grad_norm": 5.84375, "learning_rate": 4.463957756130028e-06, "loss": 0.82763643, "memory(GiB)": 147.13, "step": 46990, "train_speed(iter/s)": 0.201709 }, { "acc": 0.77132835, "epoch": 1.0965455897577976, "grad_norm": 6.9375, "learning_rate": 4.462079570567765e-06, "loss": 0.81077747, "memory(GiB)": 147.13, "step": 47000, "train_speed(iter/s)": 0.20173 }, { "epoch": 1.0965455897577976, "eval_acc": 0.7441737526485854, "eval_loss": 0.8062352538108826, "eval_runtime": 1270.2715, "eval_samples_per_second": 28.333, "eval_steps_per_second": 14.167, "step": 47000 }, { "acc": 0.77072382, "epoch": 1.0967788973300865, "grad_norm": 4.21875, "learning_rate": 4.460201461793486e-06, "loss": 0.83631325, "memory(GiB)": 147.13, "step": 47010, "train_speed(iter/s)": 0.200644 }, { "acc": 0.79384909, "epoch": 1.0970122049023754, "grad_norm": 6.21875, "learning_rate": 4.458323430075292e-06, "loss": 0.74116993, "memory(GiB)": 147.13, "step": 47020, "train_speed(iter/s)": 0.200664 }, { "acc": 0.7897253, "epoch": 1.0972455124746643, "grad_norm": 4.375, "learning_rate": 4.45644547568127e-06, "loss": 0.75881343, "memory(GiB)": 147.13, "step": 47030, "train_speed(iter/s)": 0.200686 }, { "acc": 0.78165913, "epoch": 1.0974788200469532, "grad_norm": 4.9375, "learning_rate": 4.4545675988795e-06, "loss": 0.79373956, "memory(GiB)": 147.13, "step": 47040, "train_speed(iter/s)": 0.200708 }, { "acc": 0.78019323, "epoch": 1.097712127619242, "grad_norm": 5.5625, "learning_rate": 4.452689799938045e-06, "loss": 0.79432139, "memory(GiB)": 147.13, "step": 47050, "train_speed(iter/s)": 0.200729 }, { "acc": 0.76499996, "epoch": 1.097945435191531, "grad_norm": 6.0625, "learning_rate": 4.450812079124964e-06, "loss": 0.85962629, "memory(GiB)": 147.13, "step": 47060, "train_speed(iter/s)": 0.200751 }, { "acc": 0.78296137, "epoch": 1.0981787427638199, "grad_norm": 4.84375, "learning_rate": 4.448934436708297e-06, "loss": 0.78475742, "memory(GiB)": 147.13, "step": 47070, "train_speed(iter/s)": 0.200773 }, { "acc": 0.78817444, "epoch": 1.0984120503361088, "grad_norm": 6.28125, "learning_rate": 4.44705687295608e-06, "loss": 0.77784424, "memory(GiB)": 147.13, "step": 47080, "train_speed(iter/s)": 0.200795 }, { "acc": 0.78499746, "epoch": 1.0986453579083977, "grad_norm": 7.09375, "learning_rate": 4.445179388136335e-06, "loss": 0.760812, "memory(GiB)": 147.13, "step": 47090, "train_speed(iter/s)": 0.200816 }, { "acc": 0.78877263, "epoch": 1.0988786654806866, "grad_norm": 6.59375, "learning_rate": 4.44330198251707e-06, "loss": 0.73901072, "memory(GiB)": 147.13, "step": 47100, "train_speed(iter/s)": 0.200837 }, { "acc": 0.75595312, "epoch": 1.0991119730529755, "grad_norm": 6.3125, "learning_rate": 4.441424656366287e-06, "loss": 0.85828323, "memory(GiB)": 147.13, "step": 47110, "train_speed(iter/s)": 0.20086 }, { "acc": 0.78338614, "epoch": 1.0993452806252644, "grad_norm": 6.90625, "learning_rate": 4.43954740995197e-06, "loss": 0.77686892, "memory(GiB)": 147.13, "step": 47120, "train_speed(iter/s)": 0.200882 }, { "acc": 0.79634991, "epoch": 1.0995785881975533, "grad_norm": 7.125, "learning_rate": 4.437670243542097e-06, "loss": 0.72251873, "memory(GiB)": 147.13, "step": 47130, "train_speed(iter/s)": 0.200903 }, { "acc": 0.7735898, "epoch": 1.0998118957698422, "grad_norm": 5.3125, "learning_rate": 4.435793157404636e-06, "loss": 0.81141148, "memory(GiB)": 147.13, "step": 47140, "train_speed(iter/s)": 0.200925 }, { "acc": 0.76752191, "epoch": 1.100045203342131, "grad_norm": 5.78125, "learning_rate": 4.433916151807535e-06, "loss": 0.83682642, "memory(GiB)": 147.13, "step": 47150, "train_speed(iter/s)": 0.200948 }, { "acc": 0.78204641, "epoch": 1.10027851091442, "grad_norm": 5.5625, "learning_rate": 4.43203922701874e-06, "loss": 0.77274141, "memory(GiB)": 147.13, "step": 47160, "train_speed(iter/s)": 0.20097 }, { "acc": 0.7881916, "epoch": 1.1005118184867086, "grad_norm": 4.9375, "learning_rate": 4.43016238330618e-06, "loss": 0.74384875, "memory(GiB)": 147.13, "step": 47170, "train_speed(iter/s)": 0.200992 }, { "acc": 0.78752022, "epoch": 1.1007451260589978, "grad_norm": 3.984375, "learning_rate": 4.428285620937774e-06, "loss": 0.76339617, "memory(GiB)": 147.13, "step": 47180, "train_speed(iter/s)": 0.201013 }, { "acc": 0.76733403, "epoch": 1.1009784336312864, "grad_norm": 5.78125, "learning_rate": 4.4264089401814306e-06, "loss": 0.84128456, "memory(GiB)": 147.13, "step": 47190, "train_speed(iter/s)": 0.201035 }, { "acc": 0.78480959, "epoch": 1.1012117412035753, "grad_norm": 4.625, "learning_rate": 4.4245323413050446e-06, "loss": 0.76962199, "memory(GiB)": 147.13, "step": 47200, "train_speed(iter/s)": 0.201057 }, { "acc": 0.7589119, "epoch": 1.1014450487758642, "grad_norm": 4.6875, "learning_rate": 4.422655824576499e-06, "loss": 0.88679562, "memory(GiB)": 147.13, "step": 47210, "train_speed(iter/s)": 0.201079 }, { "acc": 0.80034866, "epoch": 1.1016783563481531, "grad_norm": 4.90625, "learning_rate": 4.420779390263669e-06, "loss": 0.70025153, "memory(GiB)": 147.13, "step": 47220, "train_speed(iter/s)": 0.201102 }, { "acc": 0.78663492, "epoch": 1.101911663920442, "grad_norm": 4.90625, "learning_rate": 4.4189030386344094e-06, "loss": 0.76303225, "memory(GiB)": 147.13, "step": 47230, "train_speed(iter/s)": 0.201123 }, { "acc": 0.77694702, "epoch": 1.102144971492731, "grad_norm": 6.90625, "learning_rate": 4.417026769956573e-06, "loss": 0.79356718, "memory(GiB)": 147.13, "step": 47240, "train_speed(iter/s)": 0.201143 }, { "acc": 0.77413359, "epoch": 1.1023782790650198, "grad_norm": 4.9375, "learning_rate": 4.415150584497996e-06, "loss": 0.81121407, "memory(GiB)": 147.13, "step": 47250, "train_speed(iter/s)": 0.201165 }, { "acc": 0.74965243, "epoch": 1.1026115866373087, "grad_norm": 6.5, "learning_rate": 4.413274482526503e-06, "loss": 0.91974735, "memory(GiB)": 147.13, "step": 47260, "train_speed(iter/s)": 0.201187 }, { "acc": 0.78864617, "epoch": 1.1028448942095976, "grad_norm": 5.28125, "learning_rate": 4.4113984643099075e-06, "loss": 0.75029535, "memory(GiB)": 147.13, "step": 47270, "train_speed(iter/s)": 0.201207 }, { "acc": 0.79483843, "epoch": 1.1030782017818865, "grad_norm": 5.21875, "learning_rate": 4.409522530116011e-06, "loss": 0.73387432, "memory(GiB)": 147.13, "step": 47280, "train_speed(iter/s)": 0.20123 }, { "acc": 0.7674356, "epoch": 1.1033115093541754, "grad_norm": 4.6875, "learning_rate": 4.407646680212601e-06, "loss": 0.85647984, "memory(GiB)": 147.13, "step": 47290, "train_speed(iter/s)": 0.201252 }, { "acc": 0.78836451, "epoch": 1.1035448169264643, "grad_norm": 5.90625, "learning_rate": 4.405770914867455e-06, "loss": 0.74849033, "memory(GiB)": 147.13, "step": 47300, "train_speed(iter/s)": 0.201274 }, { "acc": 0.78828807, "epoch": 1.1037781244987532, "grad_norm": 7.03125, "learning_rate": 4.403895234348338e-06, "loss": 0.74482512, "memory(GiB)": 147.13, "step": 47310, "train_speed(iter/s)": 0.201297 }, { "acc": 0.77510147, "epoch": 1.1040114320710421, "grad_norm": 5.4375, "learning_rate": 4.402019638923003e-06, "loss": 0.82446518, "memory(GiB)": 147.13, "step": 47320, "train_speed(iter/s)": 0.201319 }, { "acc": 0.78557339, "epoch": 1.104244739643331, "grad_norm": 5.3125, "learning_rate": 4.400144128859192e-06, "loss": 0.76738482, "memory(GiB)": 147.13, "step": 47330, "train_speed(iter/s)": 0.201342 }, { "acc": 0.79229355, "epoch": 1.10447804721562, "grad_norm": 4.28125, "learning_rate": 4.3982687044246336e-06, "loss": 0.76299677, "memory(GiB)": 147.13, "step": 47340, "train_speed(iter/s)": 0.201363 }, { "acc": 0.79036603, "epoch": 1.1047113547879088, "grad_norm": 6.1875, "learning_rate": 4.396393365887041e-06, "loss": 0.74940066, "memory(GiB)": 147.13, "step": 47350, "train_speed(iter/s)": 0.201384 }, { "acc": 0.78618107, "epoch": 1.1049446623601977, "grad_norm": 6.125, "learning_rate": 4.394518113514121e-06, "loss": 0.74857254, "memory(GiB)": 147.13, "step": 47360, "train_speed(iter/s)": 0.201407 }, { "acc": 0.78618221, "epoch": 1.1051779699324866, "grad_norm": 5.65625, "learning_rate": 4.392642947573563e-06, "loss": 0.75713739, "memory(GiB)": 147.13, "step": 47370, "train_speed(iter/s)": 0.20143 }, { "acc": 0.77709045, "epoch": 1.1054112775047755, "grad_norm": 4.9375, "learning_rate": 4.3907678683330486e-06, "loss": 0.7963932, "memory(GiB)": 147.13, "step": 47380, "train_speed(iter/s)": 0.201452 }, { "acc": 0.78100262, "epoch": 1.1056445850770644, "grad_norm": 5.375, "learning_rate": 4.388892876060243e-06, "loss": 0.79459877, "memory(GiB)": 147.13, "step": 47390, "train_speed(iter/s)": 0.201473 }, { "acc": 0.78773174, "epoch": 1.1058778926493533, "grad_norm": 8.875, "learning_rate": 4.387017971022803e-06, "loss": 0.74201722, "memory(GiB)": 147.13, "step": 47400, "train_speed(iter/s)": 0.201496 }, { "acc": 0.7911149, "epoch": 1.1061112002216422, "grad_norm": 47.25, "learning_rate": 4.385143153488369e-06, "loss": 0.74900484, "memory(GiB)": 147.13, "step": 47410, "train_speed(iter/s)": 0.201518 }, { "acc": 0.77765427, "epoch": 1.1063445077939311, "grad_norm": 12.8125, "learning_rate": 4.383268423724572e-06, "loss": 0.78825626, "memory(GiB)": 147.13, "step": 47420, "train_speed(iter/s)": 0.201539 }, { "acc": 0.76809454, "epoch": 1.10657781536622, "grad_norm": 5.375, "learning_rate": 4.381393781999027e-06, "loss": 0.84127693, "memory(GiB)": 147.13, "step": 47430, "train_speed(iter/s)": 0.201561 }, { "acc": 0.80026913, "epoch": 1.106811122938509, "grad_norm": 4.75, "learning_rate": 4.379519228579342e-06, "loss": 0.69325418, "memory(GiB)": 147.13, "step": 47440, "train_speed(iter/s)": 0.201583 }, { "acc": 0.78321247, "epoch": 1.1070444305107978, "grad_norm": 7.125, "learning_rate": 4.377644763733106e-06, "loss": 0.79312048, "memory(GiB)": 147.13, "step": 47450, "train_speed(iter/s)": 0.201604 }, { "acc": 0.7822238, "epoch": 1.1072777380830867, "grad_norm": 4.625, "learning_rate": 4.375770387727899e-06, "loss": 0.79141903, "memory(GiB)": 147.13, "step": 47460, "train_speed(iter/s)": 0.201627 }, { "acc": 0.7735548, "epoch": 1.1075110456553756, "grad_norm": 5.5625, "learning_rate": 4.373896100831288e-06, "loss": 0.81294842, "memory(GiB)": 147.13, "step": 47470, "train_speed(iter/s)": 0.20165 }, { "acc": 0.77273912, "epoch": 1.1077443532276645, "grad_norm": 5.96875, "learning_rate": 4.372021903310826e-06, "loss": 0.81545506, "memory(GiB)": 147.13, "step": 47480, "train_speed(iter/s)": 0.201672 }, { "acc": 0.77078466, "epoch": 1.1079776607999534, "grad_norm": 6.375, "learning_rate": 4.370147795434054e-06, "loss": 0.84389324, "memory(GiB)": 147.13, "step": 47490, "train_speed(iter/s)": 0.201694 }, { "acc": 0.81073818, "epoch": 1.1082109683722423, "grad_norm": 6.0625, "learning_rate": 4.3682737774685035e-06, "loss": 0.68347592, "memory(GiB)": 147.13, "step": 47500, "train_speed(iter/s)": 0.201717 }, { "epoch": 1.1082109683722423, "eval_acc": 0.7442459776973533, "eval_loss": 0.8062799572944641, "eval_runtime": 1271.1815, "eval_samples_per_second": 28.313, "eval_steps_per_second": 14.157, "step": 47500 }, { "acc": 0.77623062, "epoch": 1.1084442759445312, "grad_norm": 5.125, "learning_rate": 4.366399849681686e-06, "loss": 0.78513613, "memory(GiB)": 147.13, "step": 47510, "train_speed(iter/s)": 0.200641 }, { "acc": 0.78764448, "epoch": 1.10867758351682, "grad_norm": 7.3125, "learning_rate": 4.364526012341107e-06, "loss": 0.74058805, "memory(GiB)": 147.13, "step": 47520, "train_speed(iter/s)": 0.200661 }, { "acc": 0.77476292, "epoch": 1.108910891089109, "grad_norm": 4.53125, "learning_rate": 4.362652265714254e-06, "loss": 0.79294233, "memory(GiB)": 147.13, "step": 47530, "train_speed(iter/s)": 0.200683 }, { "acc": 0.75863619, "epoch": 1.109144198661398, "grad_norm": 6.375, "learning_rate": 4.360778610068605e-06, "loss": 0.88454666, "memory(GiB)": 147.13, "step": 47540, "train_speed(iter/s)": 0.200705 }, { "acc": 0.78705368, "epoch": 1.1093775062336868, "grad_norm": 6.1875, "learning_rate": 4.3589050456716254e-06, "loss": 0.74554396, "memory(GiB)": 147.13, "step": 47550, "train_speed(iter/s)": 0.200727 }, { "acc": 0.79435959, "epoch": 1.1096108138059755, "grad_norm": 4.65625, "learning_rate": 4.357031572790763e-06, "loss": 0.74123554, "memory(GiB)": 147.13, "step": 47560, "train_speed(iter/s)": 0.200747 }, { "acc": 0.76363263, "epoch": 1.1098441213782644, "grad_norm": 5.9375, "learning_rate": 4.355158191693458e-06, "loss": 0.87095814, "memory(GiB)": 147.13, "step": 47570, "train_speed(iter/s)": 0.200768 }, { "acc": 0.77314081, "epoch": 1.1100774289505533, "grad_norm": 6.1875, "learning_rate": 4.353284902647133e-06, "loss": 0.82467804, "memory(GiB)": 147.13, "step": 47580, "train_speed(iter/s)": 0.200788 }, { "acc": 0.7841754, "epoch": 1.1103107365228422, "grad_norm": 4.6875, "learning_rate": 4.351411705919201e-06, "loss": 0.78773432, "memory(GiB)": 147.13, "step": 47590, "train_speed(iter/s)": 0.200809 }, { "acc": 0.80135212, "epoch": 1.110544044095131, "grad_norm": 5.75, "learning_rate": 4.349538601777058e-06, "loss": 0.74643879, "memory(GiB)": 147.13, "step": 47600, "train_speed(iter/s)": 0.20083 }, { "acc": 0.77409286, "epoch": 1.11077735166742, "grad_norm": 4.28125, "learning_rate": 4.347665590488091e-06, "loss": 0.81729527, "memory(GiB)": 147.13, "step": 47610, "train_speed(iter/s)": 0.200851 }, { "acc": 0.80320225, "epoch": 1.1110106592397089, "grad_norm": 4.34375, "learning_rate": 4.3457926723196716e-06, "loss": 0.69598637, "memory(GiB)": 147.13, "step": 47620, "train_speed(iter/s)": 0.200873 }, { "acc": 0.79413824, "epoch": 1.1112439668119978, "grad_norm": 5.5, "learning_rate": 4.343919847539157e-06, "loss": 0.7392138, "memory(GiB)": 147.13, "step": 47630, "train_speed(iter/s)": 0.200895 }, { "acc": 0.76051364, "epoch": 1.1114772743842867, "grad_norm": 6.71875, "learning_rate": 4.342047116413897e-06, "loss": 0.88393078, "memory(GiB)": 147.13, "step": 47640, "train_speed(iter/s)": 0.200917 }, { "acc": 0.77696719, "epoch": 1.1117105819565756, "grad_norm": 4.84375, "learning_rate": 4.340174479211217e-06, "loss": 0.80170889, "memory(GiB)": 147.13, "step": 47650, "train_speed(iter/s)": 0.200938 }, { "acc": 0.77932134, "epoch": 1.1119438895288645, "grad_norm": 5.65625, "learning_rate": 4.338301936198439e-06, "loss": 0.79549494, "memory(GiB)": 147.13, "step": 47660, "train_speed(iter/s)": 0.200958 }, { "acc": 0.77386909, "epoch": 1.1121771971011534, "grad_norm": 5.4375, "learning_rate": 4.336429487642867e-06, "loss": 0.82580423, "memory(GiB)": 147.13, "step": 47670, "train_speed(iter/s)": 0.20098 }, { "acc": 0.79550128, "epoch": 1.1124105046734423, "grad_norm": 5.28125, "learning_rate": 4.334557133811796e-06, "loss": 0.71878395, "memory(GiB)": 147.13, "step": 47680, "train_speed(iter/s)": 0.201003 }, { "acc": 0.77494755, "epoch": 1.1126438122457312, "grad_norm": 5.9375, "learning_rate": 4.332684874972498e-06, "loss": 0.80830698, "memory(GiB)": 147.13, "step": 47690, "train_speed(iter/s)": 0.201024 }, { "acc": 0.80317068, "epoch": 1.11287711981802, "grad_norm": 5.78125, "learning_rate": 4.330812711392241e-06, "loss": 0.7105298, "memory(GiB)": 147.13, "step": 47700, "train_speed(iter/s)": 0.201045 }, { "acc": 0.78965735, "epoch": 1.113110427390309, "grad_norm": 3.765625, "learning_rate": 4.328940643338274e-06, "loss": 0.742593, "memory(GiB)": 147.13, "step": 47710, "train_speed(iter/s)": 0.201068 }, { "acc": 0.78511152, "epoch": 1.1133437349625979, "grad_norm": 5.40625, "learning_rate": 4.327068671077836e-06, "loss": 0.78913937, "memory(GiB)": 147.13, "step": 47720, "train_speed(iter/s)": 0.201088 }, { "acc": 0.78472223, "epoch": 1.1135770425348868, "grad_norm": 5.71875, "learning_rate": 4.32519679487815e-06, "loss": 0.77086167, "memory(GiB)": 147.13, "step": 47730, "train_speed(iter/s)": 0.20111 }, { "acc": 0.77630825, "epoch": 1.1138103501071757, "grad_norm": 6.6875, "learning_rate": 4.323325015006425e-06, "loss": 0.82653818, "memory(GiB)": 147.13, "step": 47740, "train_speed(iter/s)": 0.201132 }, { "acc": 0.76846933, "epoch": 1.1140436576794646, "grad_norm": 5.65625, "learning_rate": 4.321453331729857e-06, "loss": 0.82080832, "memory(GiB)": 147.13, "step": 47750, "train_speed(iter/s)": 0.201153 }, { "acc": 0.78095446, "epoch": 1.1142769652517535, "grad_norm": 3.890625, "learning_rate": 4.319581745315629e-06, "loss": 0.78204536, "memory(GiB)": 147.13, "step": 47760, "train_speed(iter/s)": 0.201175 }, { "acc": 0.77750607, "epoch": 1.1145102728240424, "grad_norm": 5.4375, "learning_rate": 4.317710256030911e-06, "loss": 0.82766047, "memory(GiB)": 147.13, "step": 47770, "train_speed(iter/s)": 0.201196 }, { "acc": 0.77918196, "epoch": 1.1147435803963313, "grad_norm": 7.03125, "learning_rate": 4.3158388641428536e-06, "loss": 0.7857625, "memory(GiB)": 147.13, "step": 47780, "train_speed(iter/s)": 0.201219 }, { "acc": 0.77556639, "epoch": 1.1149768879686202, "grad_norm": 5.21875, "learning_rate": 4.3139675699186e-06, "loss": 0.81662083, "memory(GiB)": 147.13, "step": 47790, "train_speed(iter/s)": 0.20124 }, { "acc": 0.79196119, "epoch": 1.115210195540909, "grad_norm": 6.40625, "learning_rate": 4.312096373625279e-06, "loss": 0.75588217, "memory(GiB)": 147.13, "step": 47800, "train_speed(iter/s)": 0.201261 }, { "acc": 0.77584, "epoch": 1.115443503113198, "grad_norm": 6.6875, "learning_rate": 4.310225275529998e-06, "loss": 0.79082928, "memory(GiB)": 147.13, "step": 47810, "train_speed(iter/s)": 0.201282 }, { "acc": 0.76462359, "epoch": 1.1156768106854869, "grad_norm": 4.78125, "learning_rate": 4.308354275899859e-06, "loss": 0.8640892, "memory(GiB)": 147.13, "step": 47820, "train_speed(iter/s)": 0.201304 }, { "acc": 0.8109581, "epoch": 1.1159101182577758, "grad_norm": 7.6875, "learning_rate": 4.306483375001946e-06, "loss": 0.65745254, "memory(GiB)": 147.13, "step": 47830, "train_speed(iter/s)": 0.201326 }, { "acc": 0.770755, "epoch": 1.1161434258300647, "grad_norm": 4.6875, "learning_rate": 4.30461257310333e-06, "loss": 0.84625854, "memory(GiB)": 147.13, "step": 47840, "train_speed(iter/s)": 0.201348 }, { "acc": 0.77912922, "epoch": 1.1163767334023535, "grad_norm": 4.78125, "learning_rate": 4.302741870471069e-06, "loss": 0.76887741, "memory(GiB)": 147.13, "step": 47850, "train_speed(iter/s)": 0.201369 }, { "acc": 0.78028278, "epoch": 1.1166100409746424, "grad_norm": 8.1875, "learning_rate": 4.3008712673722005e-06, "loss": 0.80194454, "memory(GiB)": 147.13, "step": 47860, "train_speed(iter/s)": 0.201392 }, { "acc": 0.79243369, "epoch": 1.1168433485469313, "grad_norm": 5.6875, "learning_rate": 4.299000764073757e-06, "loss": 0.733636, "memory(GiB)": 147.13, "step": 47870, "train_speed(iter/s)": 0.201412 }, { "acc": 0.78628654, "epoch": 1.1170766561192202, "grad_norm": 5.1875, "learning_rate": 4.29713036084275e-06, "loss": 0.76849709, "memory(GiB)": 147.13, "step": 47880, "train_speed(iter/s)": 0.201434 }, { "acc": 0.78145351, "epoch": 1.1173099636915091, "grad_norm": 5.53125, "learning_rate": 4.29526005794618e-06, "loss": 0.79676495, "memory(GiB)": 147.13, "step": 47890, "train_speed(iter/s)": 0.201454 }, { "acc": 0.7888092, "epoch": 1.117543271263798, "grad_norm": 6.4375, "learning_rate": 4.2933898556510325e-06, "loss": 0.7636147, "memory(GiB)": 147.13, "step": 47900, "train_speed(iter/s)": 0.201475 }, { "acc": 0.77102165, "epoch": 1.117776578836087, "grad_norm": 10.4375, "learning_rate": 4.29151975422428e-06, "loss": 0.8149704, "memory(GiB)": 147.13, "step": 47910, "train_speed(iter/s)": 0.201496 }, { "acc": 0.7934557, "epoch": 1.1180098864083758, "grad_norm": 5.0, "learning_rate": 4.289649753932874e-06, "loss": 0.72969637, "memory(GiB)": 147.13, "step": 47920, "train_speed(iter/s)": 0.201517 }, { "acc": 0.79851007, "epoch": 1.1182431939806645, "grad_norm": 5.03125, "learning_rate": 4.28777985504376e-06, "loss": 0.72625418, "memory(GiB)": 147.13, "step": 47930, "train_speed(iter/s)": 0.201538 }, { "acc": 0.77621775, "epoch": 1.1184765015529536, "grad_norm": 6.03125, "learning_rate": 4.285910057823864e-06, "loss": 0.83182402, "memory(GiB)": 147.13, "step": 47940, "train_speed(iter/s)": 0.201559 }, { "acc": 0.79367595, "epoch": 1.1187098091252423, "grad_norm": 5.25, "learning_rate": 4.284040362540101e-06, "loss": 0.73981581, "memory(GiB)": 147.13, "step": 47950, "train_speed(iter/s)": 0.201581 }, { "acc": 0.78099489, "epoch": 1.1189431166975312, "grad_norm": 4.21875, "learning_rate": 4.282170769459367e-06, "loss": 0.79089422, "memory(GiB)": 147.13, "step": 47960, "train_speed(iter/s)": 0.201603 }, { "acc": 0.79144993, "epoch": 1.1191764242698201, "grad_norm": 4.5625, "learning_rate": 4.2803012788485475e-06, "loss": 0.74069662, "memory(GiB)": 147.13, "step": 47970, "train_speed(iter/s)": 0.201622 }, { "acc": 0.77184448, "epoch": 1.119409731842109, "grad_norm": 5.5625, "learning_rate": 4.278431890974511e-06, "loss": 0.82890167, "memory(GiB)": 147.13, "step": 47980, "train_speed(iter/s)": 0.201646 }, { "acc": 0.77109141, "epoch": 1.119643039414398, "grad_norm": 8.125, "learning_rate": 4.276562606104114e-06, "loss": 0.80681925, "memory(GiB)": 147.13, "step": 47990, "train_speed(iter/s)": 0.201667 }, { "acc": 0.78441682, "epoch": 1.1198763469866868, "grad_norm": 4.875, "learning_rate": 4.274693424504194e-06, "loss": 0.78442831, "memory(GiB)": 147.13, "step": 48000, "train_speed(iter/s)": 0.201688 }, { "epoch": 1.1198763469866868, "eval_acc": 0.744195852552643, "eval_loss": 0.806307315826416, "eval_runtime": 1268.7398, "eval_samples_per_second": 28.368, "eval_steps_per_second": 14.184, "step": 48000 }, { "acc": 0.78372445, "epoch": 1.1201096545589757, "grad_norm": 11.0, "learning_rate": 4.272824346441576e-06, "loss": 0.77319117, "memory(GiB)": 147.13, "step": 48010, "train_speed(iter/s)": 0.200624 }, { "acc": 0.77591352, "epoch": 1.1203429621312646, "grad_norm": 7.625, "learning_rate": 4.270955372183074e-06, "loss": 0.80051012, "memory(GiB)": 147.13, "step": 48020, "train_speed(iter/s)": 0.200643 }, { "acc": 0.77649298, "epoch": 1.1205762697035535, "grad_norm": 4.59375, "learning_rate": 4.269086501995478e-06, "loss": 0.81262932, "memory(GiB)": 147.13, "step": 48030, "train_speed(iter/s)": 0.200664 }, { "acc": 0.79521112, "epoch": 1.1208095772758424, "grad_norm": 5.34375, "learning_rate": 4.267217736145573e-06, "loss": 0.7146986, "memory(GiB)": 147.13, "step": 48040, "train_speed(iter/s)": 0.200684 }, { "acc": 0.76602602, "epoch": 1.1210428848481313, "grad_norm": 5.75, "learning_rate": 4.265349074900123e-06, "loss": 0.83298635, "memory(GiB)": 147.13, "step": 48050, "train_speed(iter/s)": 0.200707 }, { "acc": 0.78246584, "epoch": 1.1212761924204202, "grad_norm": 5.21875, "learning_rate": 4.263480518525878e-06, "loss": 0.78094912, "memory(GiB)": 147.13, "step": 48060, "train_speed(iter/s)": 0.200728 }, { "acc": 0.77939157, "epoch": 1.121509499992709, "grad_norm": 4.78125, "learning_rate": 4.261612067289577e-06, "loss": 0.78741126, "memory(GiB)": 147.13, "step": 48070, "train_speed(iter/s)": 0.200751 }, { "acc": 0.78157773, "epoch": 1.121742807564998, "grad_norm": 5.25, "learning_rate": 4.259743721457937e-06, "loss": 0.76782036, "memory(GiB)": 147.13, "step": 48080, "train_speed(iter/s)": 0.200772 }, { "acc": 0.76486025, "epoch": 1.121976115137287, "grad_norm": 6.53125, "learning_rate": 4.257875481297667e-06, "loss": 0.86555462, "memory(GiB)": 147.13, "step": 48090, "train_speed(iter/s)": 0.200794 }, { "acc": 0.79323206, "epoch": 1.1222094227095758, "grad_norm": 5.40625, "learning_rate": 4.256007347075455e-06, "loss": 0.73063769, "memory(GiB)": 147.13, "step": 48100, "train_speed(iter/s)": 0.200816 }, { "acc": 0.76760402, "epoch": 1.1224427302818647, "grad_norm": 5.46875, "learning_rate": 4.254139319057979e-06, "loss": 0.84659023, "memory(GiB)": 147.13, "step": 48110, "train_speed(iter/s)": 0.200837 }, { "acc": 0.78787589, "epoch": 1.1226760378541536, "grad_norm": 8.625, "learning_rate": 4.252271397511898e-06, "loss": 0.76824799, "memory(GiB)": 147.13, "step": 48120, "train_speed(iter/s)": 0.200856 }, { "acc": 0.78889484, "epoch": 1.1229093454264425, "grad_norm": 5.4375, "learning_rate": 4.2504035827038595e-06, "loss": 0.75601206, "memory(GiB)": 147.13, "step": 48130, "train_speed(iter/s)": 0.200878 }, { "acc": 0.78154402, "epoch": 1.1231426529987314, "grad_norm": 6.90625, "learning_rate": 4.248535874900491e-06, "loss": 0.79978552, "memory(GiB)": 147.13, "step": 48140, "train_speed(iter/s)": 0.200899 }, { "acc": 0.79487672, "epoch": 1.1233759605710203, "grad_norm": 4.25, "learning_rate": 4.246668274368409e-06, "loss": 0.71404648, "memory(GiB)": 147.13, "step": 48150, "train_speed(iter/s)": 0.20092 }, { "acc": 0.7581543, "epoch": 1.1236092681433092, "grad_norm": 4.8125, "learning_rate": 4.24480078137421e-06, "loss": 0.88291979, "memory(GiB)": 147.13, "step": 48160, "train_speed(iter/s)": 0.200941 }, { "acc": 0.75657516, "epoch": 1.123842575715598, "grad_norm": 4.46875, "learning_rate": 4.2429333961844805e-06, "loss": 0.87030544, "memory(GiB)": 147.13, "step": 48170, "train_speed(iter/s)": 0.200963 }, { "acc": 0.79136295, "epoch": 1.124075883287887, "grad_norm": 3.890625, "learning_rate": 4.241066119065789e-06, "loss": 0.74944844, "memory(GiB)": 147.13, "step": 48180, "train_speed(iter/s)": 0.200985 }, { "acc": 0.77575951, "epoch": 1.1243091908601759, "grad_norm": 4.90625, "learning_rate": 4.239198950284688e-06, "loss": 0.80282288, "memory(GiB)": 147.13, "step": 48190, "train_speed(iter/s)": 0.201006 }, { "acc": 0.78090439, "epoch": 1.1245424984324648, "grad_norm": 5.28125, "learning_rate": 4.237331890107717e-06, "loss": 0.79569597, "memory(GiB)": 147.13, "step": 48200, "train_speed(iter/s)": 0.201024 }, { "acc": 0.78601589, "epoch": 1.1247758060047537, "grad_norm": 4.09375, "learning_rate": 4.2354649388013965e-06, "loss": 0.77665925, "memory(GiB)": 147.13, "step": 48210, "train_speed(iter/s)": 0.201045 }, { "acc": 0.77956362, "epoch": 1.1250091135770426, "grad_norm": 4.9375, "learning_rate": 4.233598096632234e-06, "loss": 0.77131734, "memory(GiB)": 147.13, "step": 48220, "train_speed(iter/s)": 0.201068 }, { "acc": 0.79384608, "epoch": 1.1252424211493315, "grad_norm": 6.09375, "learning_rate": 4.23173136386672e-06, "loss": 0.73389316, "memory(GiB)": 147.13, "step": 48230, "train_speed(iter/s)": 0.201089 }, { "acc": 0.78729687, "epoch": 1.1254757287216204, "grad_norm": 4.53125, "learning_rate": 4.2298647407713314e-06, "loss": 0.7819458, "memory(GiB)": 147.13, "step": 48240, "train_speed(iter/s)": 0.201111 }, { "acc": 0.78039856, "epoch": 1.1257090362939093, "grad_norm": 5.1875, "learning_rate": 4.227998227612529e-06, "loss": 0.78439426, "memory(GiB)": 147.13, "step": 48250, "train_speed(iter/s)": 0.201133 }, { "acc": 0.78524809, "epoch": 1.1259423438661982, "grad_norm": 4.25, "learning_rate": 4.226131824656752e-06, "loss": 0.7713275, "memory(GiB)": 147.13, "step": 48260, "train_speed(iter/s)": 0.201155 }, { "acc": 0.77486734, "epoch": 1.126175651438487, "grad_norm": 4.125, "learning_rate": 4.224265532170434e-06, "loss": 0.79942617, "memory(GiB)": 147.13, "step": 48270, "train_speed(iter/s)": 0.201175 }, { "acc": 0.78316307, "epoch": 1.126408959010776, "grad_norm": 6.03125, "learning_rate": 4.222399350419985e-06, "loss": 0.76972885, "memory(GiB)": 147.13, "step": 48280, "train_speed(iter/s)": 0.201196 }, { "acc": 0.7986887, "epoch": 1.1266422665830649, "grad_norm": 6.625, "learning_rate": 4.220533279671804e-06, "loss": 0.71824002, "memory(GiB)": 147.13, "step": 48290, "train_speed(iter/s)": 0.201218 }, { "acc": 0.79125404, "epoch": 1.1268755741553538, "grad_norm": 4.4375, "learning_rate": 4.21866732019227e-06, "loss": 0.74609642, "memory(GiB)": 147.13, "step": 48300, "train_speed(iter/s)": 0.201238 }, { "acc": 0.78295269, "epoch": 1.1271088817276427, "grad_norm": 6.53125, "learning_rate": 4.216801472247749e-06, "loss": 0.77721262, "memory(GiB)": 147.13, "step": 48310, "train_speed(iter/s)": 0.20126 }, { "acc": 0.76647134, "epoch": 1.1273421892999314, "grad_norm": 5.59375, "learning_rate": 4.214935736104591e-06, "loss": 0.85473747, "memory(GiB)": 147.13, "step": 48320, "train_speed(iter/s)": 0.201283 }, { "acc": 0.7766263, "epoch": 1.1275754968722205, "grad_norm": 6.625, "learning_rate": 4.213070112029127e-06, "loss": 0.80300884, "memory(GiB)": 147.13, "step": 48330, "train_speed(iter/s)": 0.201304 }, { "acc": 0.78820081, "epoch": 1.1278088044445091, "grad_norm": 5.8125, "learning_rate": 4.211204600287677e-06, "loss": 0.7805532, "memory(GiB)": 147.13, "step": 48340, "train_speed(iter/s)": 0.201325 }, { "acc": 0.78450546, "epoch": 1.128042112016798, "grad_norm": 7.84375, "learning_rate": 4.2093392011465425e-06, "loss": 0.77261581, "memory(GiB)": 147.13, "step": 48350, "train_speed(iter/s)": 0.201347 }, { "acc": 0.75979958, "epoch": 1.128275419589087, "grad_norm": 5.46875, "learning_rate": 4.207473914872006e-06, "loss": 0.86760702, "memory(GiB)": 147.13, "step": 48360, "train_speed(iter/s)": 0.201368 }, { "acc": 0.78485813, "epoch": 1.1285087271613758, "grad_norm": 6.1875, "learning_rate": 4.20560874173034e-06, "loss": 0.7671032, "memory(GiB)": 147.13, "step": 48370, "train_speed(iter/s)": 0.201391 }, { "acc": 0.77036834, "epoch": 1.1287420347336647, "grad_norm": 5.6875, "learning_rate": 4.203743681987793e-06, "loss": 0.8098217, "memory(GiB)": 147.13, "step": 48380, "train_speed(iter/s)": 0.201413 }, { "acc": 0.77560768, "epoch": 1.1289753423059536, "grad_norm": 4.3125, "learning_rate": 4.2018787359106045e-06, "loss": 0.81092434, "memory(GiB)": 147.13, "step": 48390, "train_speed(iter/s)": 0.201435 }, { "acc": 0.78146276, "epoch": 1.1292086498782425, "grad_norm": 5.15625, "learning_rate": 4.200013903764994e-06, "loss": 0.77416515, "memory(GiB)": 147.13, "step": 48400, "train_speed(iter/s)": 0.201457 }, { "acc": 0.77433596, "epoch": 1.1294419574505314, "grad_norm": 6.34375, "learning_rate": 4.198149185817167e-06, "loss": 0.80879793, "memory(GiB)": 147.13, "step": 48410, "train_speed(iter/s)": 0.201478 }, { "acc": 0.77343392, "epoch": 1.1296752650228203, "grad_norm": 6.15625, "learning_rate": 4.19628458233331e-06, "loss": 0.81420822, "memory(GiB)": 147.13, "step": 48420, "train_speed(iter/s)": 0.201501 }, { "acc": 0.78997188, "epoch": 1.1299085725951092, "grad_norm": 5.625, "learning_rate": 4.194420093579597e-06, "loss": 0.74577589, "memory(GiB)": 147.13, "step": 48430, "train_speed(iter/s)": 0.201523 }, { "acc": 0.79533091, "epoch": 1.1301418801673981, "grad_norm": 6.25, "learning_rate": 4.1925557198221805e-06, "loss": 0.72079945, "memory(GiB)": 147.13, "step": 48440, "train_speed(iter/s)": 0.201542 }, { "acc": 0.76862497, "epoch": 1.130375187739687, "grad_norm": 4.5625, "learning_rate": 4.1906914613272e-06, "loss": 0.86250811, "memory(GiB)": 147.13, "step": 48450, "train_speed(iter/s)": 0.201564 }, { "acc": 0.78376331, "epoch": 1.130608495311976, "grad_norm": 4.53125, "learning_rate": 4.188827318360779e-06, "loss": 0.76748829, "memory(GiB)": 147.13, "step": 48460, "train_speed(iter/s)": 0.201585 }, { "acc": 0.77294278, "epoch": 1.1308418028842648, "grad_norm": 6.09375, "learning_rate": 4.186963291189022e-06, "loss": 0.83114796, "memory(GiB)": 147.13, "step": 48470, "train_speed(iter/s)": 0.201606 }, { "acc": 0.78433104, "epoch": 1.1310751104565537, "grad_norm": 4.625, "learning_rate": 4.185099380078022e-06, "loss": 0.76472044, "memory(GiB)": 147.13, "step": 48480, "train_speed(iter/s)": 0.201626 }, { "acc": 0.78431997, "epoch": 1.1313084180288426, "grad_norm": 5.84375, "learning_rate": 4.183235585293846e-06, "loss": 0.79146395, "memory(GiB)": 147.13, "step": 48490, "train_speed(iter/s)": 0.201648 }, { "acc": 0.78068762, "epoch": 1.1315417256011315, "grad_norm": 5.96875, "learning_rate": 4.181371907102553e-06, "loss": 0.79927993, "memory(GiB)": 147.13, "step": 48500, "train_speed(iter/s)": 0.201669 }, { "epoch": 1.1315417256011315, "eval_acc": 0.7441143391384016, "eval_loss": 0.8060808777809143, "eval_runtime": 1270.8615, "eval_samples_per_second": 28.32, "eval_steps_per_second": 14.16, "step": 48500 }, { "acc": 0.7775506, "epoch": 1.1317750331734204, "grad_norm": 5.4375, "learning_rate": 4.179508345770184e-06, "loss": 0.77313948, "memory(GiB)": 147.13, "step": 48510, "train_speed(iter/s)": 0.200616 }, { "acc": 0.78219233, "epoch": 1.1320083407457093, "grad_norm": 5.0625, "learning_rate": 4.177644901562758e-06, "loss": 0.79809017, "memory(GiB)": 147.13, "step": 48520, "train_speed(iter/s)": 0.200637 }, { "acc": 0.7918745, "epoch": 1.1322416483179982, "grad_norm": 4.59375, "learning_rate": 4.1757815747462845e-06, "loss": 0.740306, "memory(GiB)": 147.13, "step": 48530, "train_speed(iter/s)": 0.200658 }, { "acc": 0.7779398, "epoch": 1.1324749558902871, "grad_norm": 7.3125, "learning_rate": 4.173918365586751e-06, "loss": 0.81543484, "memory(GiB)": 147.13, "step": 48540, "train_speed(iter/s)": 0.20068 }, { "acc": 0.7860157, "epoch": 1.132708263462576, "grad_norm": 4.875, "learning_rate": 4.172055274350132e-06, "loss": 0.75977716, "memory(GiB)": 147.13, "step": 48550, "train_speed(iter/s)": 0.2007 }, { "acc": 0.775599, "epoch": 1.132941571034865, "grad_norm": 5.4375, "learning_rate": 4.170192301302382e-06, "loss": 0.80197821, "memory(GiB)": 147.13, "step": 48560, "train_speed(iter/s)": 0.200722 }, { "acc": 0.79071817, "epoch": 1.1331748786071538, "grad_norm": 5.21875, "learning_rate": 4.168329446709439e-06, "loss": 0.73534966, "memory(GiB)": 147.13, "step": 48570, "train_speed(iter/s)": 0.200744 }, { "acc": 0.79963331, "epoch": 1.1334081861794427, "grad_norm": 4.5, "learning_rate": 4.166466710837226e-06, "loss": 0.71867814, "memory(GiB)": 147.13, "step": 48580, "train_speed(iter/s)": 0.200764 }, { "acc": 0.76818967, "epoch": 1.1336414937517316, "grad_norm": 4.40625, "learning_rate": 4.1646040939516485e-06, "loss": 0.83885851, "memory(GiB)": 147.13, "step": 48590, "train_speed(iter/s)": 0.200784 }, { "acc": 0.8015955, "epoch": 1.1338748013240205, "grad_norm": 4.53125, "learning_rate": 4.162741596318596e-06, "loss": 0.70797453, "memory(GiB)": 147.13, "step": 48600, "train_speed(iter/s)": 0.200804 }, { "acc": 0.78102293, "epoch": 1.1341081088963094, "grad_norm": 5.59375, "learning_rate": 4.160879218203935e-06, "loss": 0.79669533, "memory(GiB)": 147.13, "step": 48610, "train_speed(iter/s)": 0.200826 }, { "acc": 0.77543592, "epoch": 1.1343414164685983, "grad_norm": 3.890625, "learning_rate": 4.159016959873521e-06, "loss": 0.79444084, "memory(GiB)": 147.13, "step": 48620, "train_speed(iter/s)": 0.200847 }, { "acc": 0.76163797, "epoch": 1.1345747240408872, "grad_norm": 5.65625, "learning_rate": 4.1571548215931925e-06, "loss": 0.86727257, "memory(GiB)": 147.13, "step": 48630, "train_speed(iter/s)": 0.200869 }, { "acc": 0.78029289, "epoch": 1.1348080316131761, "grad_norm": 4.46875, "learning_rate": 4.155292803628768e-06, "loss": 0.79705906, "memory(GiB)": 147.13, "step": 48640, "train_speed(iter/s)": 0.200891 }, { "acc": 0.77660732, "epoch": 1.135041339185465, "grad_norm": 5.21875, "learning_rate": 4.153430906246052e-06, "loss": 0.80238552, "memory(GiB)": 147.13, "step": 48650, "train_speed(iter/s)": 0.200913 }, { "acc": 0.78615427, "epoch": 1.135274646757754, "grad_norm": 4.875, "learning_rate": 4.151569129710827e-06, "loss": 0.78471584, "memory(GiB)": 147.13, "step": 48660, "train_speed(iter/s)": 0.200933 }, { "acc": 0.77250824, "epoch": 1.1355079543300428, "grad_norm": 4.8125, "learning_rate": 4.149707474288862e-06, "loss": 0.81316957, "memory(GiB)": 147.13, "step": 48670, "train_speed(iter/s)": 0.200956 }, { "acc": 0.79985304, "epoch": 1.1357412619023317, "grad_norm": 4.53125, "learning_rate": 4.147845940245908e-06, "loss": 0.70010648, "memory(GiB)": 147.13, "step": 48680, "train_speed(iter/s)": 0.200977 }, { "acc": 0.77105551, "epoch": 1.1359745694746204, "grad_norm": 5.28125, "learning_rate": 4.145984527847699e-06, "loss": 0.81544399, "memory(GiB)": 147.13, "step": 48690, "train_speed(iter/s)": 0.200998 }, { "acc": 0.77478409, "epoch": 1.1362078770469095, "grad_norm": 6.03125, "learning_rate": 4.14412323735995e-06, "loss": 0.81853638, "memory(GiB)": 147.13, "step": 48700, "train_speed(iter/s)": 0.20102 }, { "acc": 0.78074069, "epoch": 1.1364411846191982, "grad_norm": 6.78125, "learning_rate": 4.142262069048362e-06, "loss": 0.78377013, "memory(GiB)": 147.13, "step": 48710, "train_speed(iter/s)": 0.201041 }, { "acc": 0.78800902, "epoch": 1.1366744921914873, "grad_norm": 5.03125, "learning_rate": 4.140401023178613e-06, "loss": 0.76176023, "memory(GiB)": 147.13, "step": 48720, "train_speed(iter/s)": 0.201062 }, { "acc": 0.78853445, "epoch": 1.136907799763776, "grad_norm": 4.4375, "learning_rate": 4.138540100016369e-06, "loss": 0.7591939, "memory(GiB)": 147.13, "step": 48730, "train_speed(iter/s)": 0.201084 }, { "acc": 0.78152742, "epoch": 1.1371411073360649, "grad_norm": 7.03125, "learning_rate": 4.136679299827275e-06, "loss": 0.77542849, "memory(GiB)": 147.13, "step": 48740, "train_speed(iter/s)": 0.201105 }, { "acc": 0.80343904, "epoch": 1.1373744149083538, "grad_norm": 5.21875, "learning_rate": 4.134818622876959e-06, "loss": 0.68836613, "memory(GiB)": 147.13, "step": 48750, "train_speed(iter/s)": 0.201128 }, { "acc": 0.77372913, "epoch": 1.1376077224806427, "grad_norm": 6.5625, "learning_rate": 4.132958069431034e-06, "loss": 0.78857126, "memory(GiB)": 147.13, "step": 48760, "train_speed(iter/s)": 0.20115 }, { "acc": 0.78540783, "epoch": 1.1378410300529316, "grad_norm": 4.6875, "learning_rate": 4.131097639755093e-06, "loss": 0.76112299, "memory(GiB)": 147.13, "step": 48770, "train_speed(iter/s)": 0.201169 }, { "acc": 0.77772455, "epoch": 1.1380743376252205, "grad_norm": 6.21875, "learning_rate": 4.129237334114712e-06, "loss": 0.81547289, "memory(GiB)": 147.13, "step": 48780, "train_speed(iter/s)": 0.20119 }, { "acc": 0.78677707, "epoch": 1.1383076451975094, "grad_norm": 4.40625, "learning_rate": 4.127377152775448e-06, "loss": 0.77611208, "memory(GiB)": 147.13, "step": 48790, "train_speed(iter/s)": 0.201211 }, { "acc": 0.79749675, "epoch": 1.1385409527697983, "grad_norm": 4.125, "learning_rate": 4.125517096002842e-06, "loss": 0.71030278, "memory(GiB)": 147.13, "step": 48800, "train_speed(iter/s)": 0.201231 }, { "acc": 0.79024119, "epoch": 1.1387742603420872, "grad_norm": 4.46875, "learning_rate": 4.123657164062415e-06, "loss": 0.74269834, "memory(GiB)": 147.13, "step": 48810, "train_speed(iter/s)": 0.201252 }, { "acc": 0.76520209, "epoch": 1.139007567914376, "grad_norm": 4.84375, "learning_rate": 4.121797357219678e-06, "loss": 0.84504023, "memory(GiB)": 147.13, "step": 48820, "train_speed(iter/s)": 0.201274 }, { "acc": 0.7725544, "epoch": 1.139240875486665, "grad_norm": 5.09375, "learning_rate": 4.119937675740109e-06, "loss": 0.81992121, "memory(GiB)": 147.13, "step": 48830, "train_speed(iter/s)": 0.201296 }, { "acc": 0.78341656, "epoch": 1.1394741830589539, "grad_norm": 5.65625, "learning_rate": 4.118078119889182e-06, "loss": 0.76839838, "memory(GiB)": 147.13, "step": 48840, "train_speed(iter/s)": 0.201317 }, { "acc": 0.80117064, "epoch": 1.1397074906312428, "grad_norm": 5.21875, "learning_rate": 4.116218689932346e-06, "loss": 0.71321936, "memory(GiB)": 147.13, "step": 48850, "train_speed(iter/s)": 0.201338 }, { "acc": 0.79148402, "epoch": 1.1399407982035317, "grad_norm": 5.53125, "learning_rate": 4.114359386135038e-06, "loss": 0.74038553, "memory(GiB)": 147.13, "step": 48860, "train_speed(iter/s)": 0.20136 }, { "acc": 0.77297692, "epoch": 1.1401741057758206, "grad_norm": 5.4375, "learning_rate": 4.112500208762668e-06, "loss": 0.80347652, "memory(GiB)": 147.13, "step": 48870, "train_speed(iter/s)": 0.201381 }, { "acc": 0.78603468, "epoch": 1.1404074133481095, "grad_norm": 4.40625, "learning_rate": 4.110641158080636e-06, "loss": 0.78780212, "memory(GiB)": 147.13, "step": 48880, "train_speed(iter/s)": 0.201402 }, { "acc": 0.80655193, "epoch": 1.1406407209203984, "grad_norm": 4.78125, "learning_rate": 4.108782234354321e-06, "loss": 0.7006731, "memory(GiB)": 147.13, "step": 48890, "train_speed(iter/s)": 0.201423 }, { "acc": 0.79013271, "epoch": 1.1408740284926873, "grad_norm": 5.5, "learning_rate": 4.106923437849082e-06, "loss": 0.75852995, "memory(GiB)": 147.13, "step": 48900, "train_speed(iter/s)": 0.201443 }, { "acc": 0.7818615, "epoch": 1.1411073360649762, "grad_norm": 4.40625, "learning_rate": 4.105064768830263e-06, "loss": 0.77103171, "memory(GiB)": 147.13, "step": 48910, "train_speed(iter/s)": 0.201463 }, { "acc": 0.76176796, "epoch": 1.141340643637265, "grad_norm": 4.90625, "learning_rate": 4.1032062275631894e-06, "loss": 0.87524958, "memory(GiB)": 147.13, "step": 48920, "train_speed(iter/s)": 0.201485 }, { "acc": 0.76909437, "epoch": 1.141573951209554, "grad_norm": 4.625, "learning_rate": 4.101347814313166e-06, "loss": 0.81877556, "memory(GiB)": 147.13, "step": 48930, "train_speed(iter/s)": 0.201506 }, { "acc": 0.80096893, "epoch": 1.1418072587818429, "grad_norm": 4.25, "learning_rate": 4.099489529345483e-06, "loss": 0.71625419, "memory(GiB)": 147.13, "step": 48940, "train_speed(iter/s)": 0.201527 }, { "acc": 0.77617383, "epoch": 1.1420405663541318, "grad_norm": 5.75, "learning_rate": 4.097631372925405e-06, "loss": 0.79977846, "memory(GiB)": 147.13, "step": 48950, "train_speed(iter/s)": 0.201549 }, { "acc": 0.78573585, "epoch": 1.1422738739264207, "grad_norm": 6.28125, "learning_rate": 4.095773345318186e-06, "loss": 0.77477055, "memory(GiB)": 147.13, "step": 48960, "train_speed(iter/s)": 0.201571 }, { "acc": 0.78423834, "epoch": 1.1425071814987096, "grad_norm": 15.9375, "learning_rate": 4.0939154467890605e-06, "loss": 0.76231022, "memory(GiB)": 147.13, "step": 48970, "train_speed(iter/s)": 0.201591 }, { "acc": 0.79738851, "epoch": 1.1427404890709985, "grad_norm": 8.875, "learning_rate": 4.0920576776032415e-06, "loss": 0.71518207, "memory(GiB)": 147.13, "step": 48980, "train_speed(iter/s)": 0.201613 }, { "acc": 0.77292356, "epoch": 1.1429737966432874, "grad_norm": 4.375, "learning_rate": 4.090200038025926e-06, "loss": 0.79759941, "memory(GiB)": 147.13, "step": 48990, "train_speed(iter/s)": 0.201634 }, { "acc": 0.8075819, "epoch": 1.1432071042155763, "grad_norm": 7.0625, "learning_rate": 4.08834252832229e-06, "loss": 0.67757139, "memory(GiB)": 147.13, "step": 49000, "train_speed(iter/s)": 0.201656 }, { "epoch": 1.1432071042155763, "eval_acc": 0.744222436495205, "eval_loss": 0.8061447739601135, "eval_runtime": 1270.2536, "eval_samples_per_second": 28.334, "eval_steps_per_second": 14.167, "step": 49000 }, { "acc": 0.78951435, "epoch": 1.1434404117878652, "grad_norm": 6.3125, "learning_rate": 4.086485148757493e-06, "loss": 0.7389822, "memory(GiB)": 147.13, "step": 49010, "train_speed(iter/s)": 0.200613 }, { "acc": 0.78884678, "epoch": 1.143673719360154, "grad_norm": 5.875, "learning_rate": 4.084627899596676e-06, "loss": 0.75943174, "memory(GiB)": 147.13, "step": 49020, "train_speed(iter/s)": 0.200633 }, { "acc": 0.76504393, "epoch": 1.143907026932443, "grad_norm": 4.6875, "learning_rate": 4.082770781104961e-06, "loss": 0.84475698, "memory(GiB)": 147.13, "step": 49030, "train_speed(iter/s)": 0.200653 }, { "acc": 0.79473829, "epoch": 1.1441403345047318, "grad_norm": 6.03125, "learning_rate": 4.080913793547449e-06, "loss": 0.74036322, "memory(GiB)": 147.13, "step": 49040, "train_speed(iter/s)": 0.200673 }, { "acc": 0.77508821, "epoch": 1.1443736420770207, "grad_norm": 5.4375, "learning_rate": 4.079056937189229e-06, "loss": 0.80481701, "memory(GiB)": 147.13, "step": 49050, "train_speed(iter/s)": 0.200695 }, { "acc": 0.7855969, "epoch": 1.1446069496493096, "grad_norm": 5.5625, "learning_rate": 4.077200212295361e-06, "loss": 0.76578369, "memory(GiB)": 147.13, "step": 49060, "train_speed(iter/s)": 0.200716 }, { "acc": 0.7720355, "epoch": 1.1448402572215985, "grad_norm": 4.875, "learning_rate": 4.075343619130895e-06, "loss": 0.8170085, "memory(GiB)": 147.13, "step": 49070, "train_speed(iter/s)": 0.200737 }, { "acc": 0.7803793, "epoch": 1.1450735647938872, "grad_norm": 4.71875, "learning_rate": 4.0734871579608606e-06, "loss": 0.79507275, "memory(GiB)": 147.13, "step": 49080, "train_speed(iter/s)": 0.200756 }, { "acc": 0.78040457, "epoch": 1.1453068723661763, "grad_norm": 5.28125, "learning_rate": 4.071630829050263e-06, "loss": 0.79167814, "memory(GiB)": 147.13, "step": 49090, "train_speed(iter/s)": 0.200777 }, { "acc": 0.80139008, "epoch": 1.145540179938465, "grad_norm": 4.28125, "learning_rate": 4.069774632664095e-06, "loss": 0.69903708, "memory(GiB)": 147.13, "step": 49100, "train_speed(iter/s)": 0.200795 }, { "acc": 0.7990375, "epoch": 1.1457734875107541, "grad_norm": 6.46875, "learning_rate": 4.0679185690673285e-06, "loss": 0.707726, "memory(GiB)": 147.13, "step": 49110, "train_speed(iter/s)": 0.200815 }, { "acc": 0.78790112, "epoch": 1.1460067950830428, "grad_norm": 5.625, "learning_rate": 4.066062638524915e-06, "loss": 0.76991944, "memory(GiB)": 147.13, "step": 49120, "train_speed(iter/s)": 0.200838 }, { "acc": 0.77768164, "epoch": 1.1462401026553317, "grad_norm": 7.125, "learning_rate": 4.064206841301789e-06, "loss": 0.78967957, "memory(GiB)": 147.13, "step": 49130, "train_speed(iter/s)": 0.20086 }, { "acc": 0.78068905, "epoch": 1.1464734102276206, "grad_norm": 5.53125, "learning_rate": 4.062351177662866e-06, "loss": 0.77087574, "memory(GiB)": 147.13, "step": 49140, "train_speed(iter/s)": 0.200883 }, { "acc": 0.78477345, "epoch": 1.1467067177999095, "grad_norm": 4.6875, "learning_rate": 4.060495647873038e-06, "loss": 0.77597842, "memory(GiB)": 147.13, "step": 49150, "train_speed(iter/s)": 0.200903 }, { "acc": 0.77061286, "epoch": 1.1469400253721984, "grad_norm": 6.0625, "learning_rate": 4.058640252197184e-06, "loss": 0.81207161, "memory(GiB)": 147.13, "step": 49160, "train_speed(iter/s)": 0.200923 }, { "acc": 0.79196901, "epoch": 1.1471733329444873, "grad_norm": 6.90625, "learning_rate": 4.056784990900162e-06, "loss": 0.73144436, "memory(GiB)": 147.13, "step": 49170, "train_speed(iter/s)": 0.200944 }, { "acc": 0.76946406, "epoch": 1.1474066405167762, "grad_norm": 4.75, "learning_rate": 4.054929864246807e-06, "loss": 0.8269702, "memory(GiB)": 147.13, "step": 49180, "train_speed(iter/s)": 0.200966 }, { "acc": 0.79781704, "epoch": 1.147639948089065, "grad_norm": 5.03125, "learning_rate": 4.053074872501939e-06, "loss": 0.72426014, "memory(GiB)": 147.13, "step": 49190, "train_speed(iter/s)": 0.200989 }, { "acc": 0.79303007, "epoch": 1.147873255661354, "grad_norm": 3.4375, "learning_rate": 4.051220015930358e-06, "loss": 0.74754534, "memory(GiB)": 147.13, "step": 49200, "train_speed(iter/s)": 0.20101 }, { "acc": 0.78933134, "epoch": 1.148106563233643, "grad_norm": 7.0, "learning_rate": 4.049365294796844e-06, "loss": 0.74844933, "memory(GiB)": 147.13, "step": 49210, "train_speed(iter/s)": 0.20103 }, { "acc": 0.76711445, "epoch": 1.1483398708059318, "grad_norm": 3.390625, "learning_rate": 4.047510709366159e-06, "loss": 0.85391102, "memory(GiB)": 147.13, "step": 49220, "train_speed(iter/s)": 0.201052 }, { "acc": 0.77335253, "epoch": 1.1485731783782207, "grad_norm": 6.59375, "learning_rate": 4.045656259903042e-06, "loss": 0.81023026, "memory(GiB)": 147.13, "step": 49230, "train_speed(iter/s)": 0.201073 }, { "acc": 0.79616175, "epoch": 1.1488064859505096, "grad_norm": 5.15625, "learning_rate": 4.043801946672217e-06, "loss": 0.72714806, "memory(GiB)": 147.13, "step": 49240, "train_speed(iter/s)": 0.201095 }, { "acc": 0.77853937, "epoch": 1.1490397935227985, "grad_norm": 6.4375, "learning_rate": 4.041947769938387e-06, "loss": 0.81164007, "memory(GiB)": 147.13, "step": 49250, "train_speed(iter/s)": 0.201117 }, { "acc": 0.78679638, "epoch": 1.1492731010950874, "grad_norm": 4.6875, "learning_rate": 4.040093729966234e-06, "loss": 0.77023234, "memory(GiB)": 147.13, "step": 49260, "train_speed(iter/s)": 0.201139 }, { "acc": 0.79337749, "epoch": 1.1495064086673763, "grad_norm": 5.0625, "learning_rate": 4.038239827020424e-06, "loss": 0.72135153, "memory(GiB)": 147.13, "step": 49270, "train_speed(iter/s)": 0.20116 }, { "acc": 0.79230156, "epoch": 1.1497397162396652, "grad_norm": 6.0, "learning_rate": 4.036386061365598e-06, "loss": 0.74501095, "memory(GiB)": 147.13, "step": 49280, "train_speed(iter/s)": 0.201181 }, { "acc": 0.78310337, "epoch": 1.149973023811954, "grad_norm": 5.21875, "learning_rate": 4.034532433266382e-06, "loss": 0.78334503, "memory(GiB)": 147.13, "step": 49290, "train_speed(iter/s)": 0.201201 }, { "acc": 0.78788719, "epoch": 1.150206331384243, "grad_norm": 12.9375, "learning_rate": 4.032678942987382e-06, "loss": 0.74433508, "memory(GiB)": 147.13, "step": 49300, "train_speed(iter/s)": 0.201221 }, { "acc": 0.77241621, "epoch": 1.150439638956532, "grad_norm": 5.0625, "learning_rate": 4.030825590793179e-06, "loss": 0.81397333, "memory(GiB)": 147.13, "step": 49310, "train_speed(iter/s)": 0.201242 }, { "acc": 0.78942137, "epoch": 1.1506729465288208, "grad_norm": 4.75, "learning_rate": 4.028972376948343e-06, "loss": 0.75723009, "memory(GiB)": 147.13, "step": 49320, "train_speed(iter/s)": 0.201261 }, { "acc": 0.78589869, "epoch": 1.1509062541011097, "grad_norm": 6.0625, "learning_rate": 4.027119301717417e-06, "loss": 0.74469566, "memory(GiB)": 147.13, "step": 49330, "train_speed(iter/s)": 0.201283 }, { "acc": 0.79240808, "epoch": 1.1511395616733986, "grad_norm": 4.71875, "learning_rate": 4.025266365364928e-06, "loss": 0.72842922, "memory(GiB)": 147.13, "step": 49340, "train_speed(iter/s)": 0.201302 }, { "acc": 0.79350748, "epoch": 1.1513728692456875, "grad_norm": 8.3125, "learning_rate": 4.0234135681553835e-06, "loss": 0.74578514, "memory(GiB)": 147.13, "step": 49350, "train_speed(iter/s)": 0.201324 }, { "acc": 0.76579685, "epoch": 1.1516061768179764, "grad_norm": 5.5625, "learning_rate": 4.021560910353268e-06, "loss": 0.84622517, "memory(GiB)": 147.13, "step": 49360, "train_speed(iter/s)": 0.201344 }, { "acc": 0.77959232, "epoch": 1.1518394843902653, "grad_norm": 5.03125, "learning_rate": 4.019708392223048e-06, "loss": 0.79495625, "memory(GiB)": 147.13, "step": 49370, "train_speed(iter/s)": 0.201366 }, { "acc": 0.79989409, "epoch": 1.1520727919625542, "grad_norm": 4.875, "learning_rate": 4.017856014029171e-06, "loss": 0.71503725, "memory(GiB)": 147.13, "step": 49380, "train_speed(iter/s)": 0.201386 }, { "acc": 0.77668247, "epoch": 1.152306099534843, "grad_norm": 5.0, "learning_rate": 4.016003776036064e-06, "loss": 0.78333073, "memory(GiB)": 147.13, "step": 49390, "train_speed(iter/s)": 0.201405 }, { "acc": 0.77576346, "epoch": 1.152539407107132, "grad_norm": 6.34375, "learning_rate": 4.01415167850813e-06, "loss": 0.80841331, "memory(GiB)": 147.13, "step": 49400, "train_speed(iter/s)": 0.201426 }, { "acc": 0.77319818, "epoch": 1.1527727146794209, "grad_norm": 5.6875, "learning_rate": 4.012299721709757e-06, "loss": 0.81761799, "memory(GiB)": 147.13, "step": 49410, "train_speed(iter/s)": 0.201447 }, { "acc": 0.76485691, "epoch": 1.1530060222517098, "grad_norm": 8.375, "learning_rate": 4.010447905905312e-06, "loss": 0.85202141, "memory(GiB)": 147.13, "step": 49420, "train_speed(iter/s)": 0.201468 }, { "acc": 0.77609119, "epoch": 1.1532393298239987, "grad_norm": 6.0, "learning_rate": 4.0085962313591416e-06, "loss": 0.82331104, "memory(GiB)": 147.13, "step": 49430, "train_speed(iter/s)": 0.201488 }, { "acc": 0.76981869, "epoch": 1.1534726373962876, "grad_norm": 4.125, "learning_rate": 4.006744698335572e-06, "loss": 0.83070965, "memory(GiB)": 147.13, "step": 49440, "train_speed(iter/s)": 0.201508 }, { "acc": 0.77602968, "epoch": 1.1537059449685765, "grad_norm": 5.5, "learning_rate": 4.004893307098907e-06, "loss": 0.80735464, "memory(GiB)": 147.13, "step": 49450, "train_speed(iter/s)": 0.20153 }, { "acc": 0.79087119, "epoch": 1.1539392525408654, "grad_norm": 4.46875, "learning_rate": 4.003042057913434e-06, "loss": 0.74953089, "memory(GiB)": 147.13, "step": 49460, "train_speed(iter/s)": 0.201551 }, { "acc": 0.79258838, "epoch": 1.154172560113154, "grad_norm": 4.65625, "learning_rate": 4.001190951043416e-06, "loss": 0.73958902, "memory(GiB)": 147.13, "step": 49470, "train_speed(iter/s)": 0.201572 }, { "acc": 0.78888607, "epoch": 1.1544058676854432, "grad_norm": 5.125, "learning_rate": 3.9993399867531e-06, "loss": 0.74525504, "memory(GiB)": 147.13, "step": 49480, "train_speed(iter/s)": 0.201592 }, { "acc": 0.78004522, "epoch": 1.1546391752577319, "grad_norm": 7.15625, "learning_rate": 3.997489165306713e-06, "loss": 0.77018299, "memory(GiB)": 147.13, "step": 49490, "train_speed(iter/s)": 0.201612 }, { "acc": 0.78893394, "epoch": 1.154872482830021, "grad_norm": 7.375, "learning_rate": 3.995638486968453e-06, "loss": 0.74949064, "memory(GiB)": 147.13, "step": 49500, "train_speed(iter/s)": 0.201633 }, { "epoch": 1.154872482830021, "eval_acc": 0.7442927398131853, "eval_loss": 0.8059793710708618, "eval_runtime": 1270.1643, "eval_samples_per_second": 28.336, "eval_steps_per_second": 14.168, "step": 49500 }, { "acc": 0.77485609, "epoch": 1.1551057904023097, "grad_norm": 4.84375, "learning_rate": 3.99378795200251e-06, "loss": 0.8245182, "memory(GiB)": 147.13, "step": 49510, "train_speed(iter/s)": 0.200599 }, { "acc": 0.77423291, "epoch": 1.1553390979745986, "grad_norm": 6.34375, "learning_rate": 3.991937560673044e-06, "loss": 0.79736261, "memory(GiB)": 147.13, "step": 49520, "train_speed(iter/s)": 0.20062 }, { "acc": 0.77984371, "epoch": 1.1555724055468874, "grad_norm": 5.15625, "learning_rate": 3.990087313244197e-06, "loss": 0.77974262, "memory(GiB)": 147.13, "step": 49530, "train_speed(iter/s)": 0.200641 }, { "acc": 0.77480555, "epoch": 1.1558057131191763, "grad_norm": 6.34375, "learning_rate": 3.988237209980093e-06, "loss": 0.79339681, "memory(GiB)": 147.13, "step": 49540, "train_speed(iter/s)": 0.200664 }, { "acc": 0.79203959, "epoch": 1.1560390206914652, "grad_norm": 5.0, "learning_rate": 3.986387251144833e-06, "loss": 0.74149151, "memory(GiB)": 147.13, "step": 49550, "train_speed(iter/s)": 0.200683 }, { "acc": 0.78118038, "epoch": 1.1562723282637541, "grad_norm": 6.0, "learning_rate": 3.9845374370024995e-06, "loss": 0.78897324, "memory(GiB)": 147.13, "step": 49560, "train_speed(iter/s)": 0.200703 }, { "acc": 0.78257017, "epoch": 1.156505635836043, "grad_norm": 5.03125, "learning_rate": 3.9826877678171515e-06, "loss": 0.78667088, "memory(GiB)": 147.13, "step": 49570, "train_speed(iter/s)": 0.200723 }, { "acc": 0.77982354, "epoch": 1.156738943408332, "grad_norm": 5.1875, "learning_rate": 3.980838243852829e-06, "loss": 0.77447562, "memory(GiB)": 147.13, "step": 49580, "train_speed(iter/s)": 0.200745 }, { "acc": 0.7748292, "epoch": 1.1569722509806208, "grad_norm": 4.5, "learning_rate": 3.978988865373551e-06, "loss": 0.80467196, "memory(GiB)": 147.13, "step": 49590, "train_speed(iter/s)": 0.200766 }, { "acc": 0.78131981, "epoch": 1.1572055585529097, "grad_norm": 5.0625, "learning_rate": 3.977139632643316e-06, "loss": 0.7902432, "memory(GiB)": 147.13, "step": 49600, "train_speed(iter/s)": 0.200786 }, { "acc": 0.79091144, "epoch": 1.1574388661251986, "grad_norm": 5.3125, "learning_rate": 3.975290545926101e-06, "loss": 0.77418213, "memory(GiB)": 147.13, "step": 49610, "train_speed(iter/s)": 0.200807 }, { "acc": 0.77418609, "epoch": 1.1576721736974875, "grad_norm": 6.0625, "learning_rate": 3.973441605485864e-06, "loss": 0.81481934, "memory(GiB)": 147.13, "step": 49620, "train_speed(iter/s)": 0.200826 }, { "acc": 0.78058105, "epoch": 1.1579054812697764, "grad_norm": 8.375, "learning_rate": 3.971592811586539e-06, "loss": 0.78378577, "memory(GiB)": 147.13, "step": 49630, "train_speed(iter/s)": 0.200846 }, { "acc": 0.76909003, "epoch": 1.1581387888420653, "grad_norm": 6.09375, "learning_rate": 3.969744164492041e-06, "loss": 0.83589916, "memory(GiB)": 147.13, "step": 49640, "train_speed(iter/s)": 0.200868 }, { "acc": 0.77044625, "epoch": 1.1583720964143542, "grad_norm": 5.15625, "learning_rate": 3.967895664466265e-06, "loss": 0.81802549, "memory(GiB)": 147.13, "step": 49650, "train_speed(iter/s)": 0.200888 }, { "acc": 0.77337999, "epoch": 1.1586054039866431, "grad_norm": 5.375, "learning_rate": 3.966047311773083e-06, "loss": 0.82878065, "memory(GiB)": 147.13, "step": 49660, "train_speed(iter/s)": 0.200909 }, { "acc": 0.76443853, "epoch": 1.158838711558932, "grad_norm": 5.96875, "learning_rate": 3.964199106676345e-06, "loss": 0.85824375, "memory(GiB)": 147.13, "step": 49670, "train_speed(iter/s)": 0.20093 }, { "acc": 0.7868217, "epoch": 1.159072019131221, "grad_norm": 13.0625, "learning_rate": 3.962351049439885e-06, "loss": 0.77052197, "memory(GiB)": 147.13, "step": 49680, "train_speed(iter/s)": 0.200953 }, { "acc": 0.78962936, "epoch": 1.1593053267035098, "grad_norm": 5.96875, "learning_rate": 3.960503140327511e-06, "loss": 0.75969067, "memory(GiB)": 147.13, "step": 49690, "train_speed(iter/s)": 0.200974 }, { "acc": 0.76684074, "epoch": 1.1595386342757987, "grad_norm": 7.0625, "learning_rate": 3.958655379603011e-06, "loss": 0.85526161, "memory(GiB)": 147.13, "step": 49700, "train_speed(iter/s)": 0.200996 }, { "acc": 0.78498111, "epoch": 1.1597719418480876, "grad_norm": 5.0625, "learning_rate": 3.956807767530155e-06, "loss": 0.78701396, "memory(GiB)": 147.13, "step": 49710, "train_speed(iter/s)": 0.201017 }, { "acc": 0.7782752, "epoch": 1.1600052494203765, "grad_norm": 4.8125, "learning_rate": 3.954960304372686e-06, "loss": 0.82441578, "memory(GiB)": 147.13, "step": 49720, "train_speed(iter/s)": 0.201039 }, { "acc": 0.76695485, "epoch": 1.1602385569926654, "grad_norm": 4.90625, "learning_rate": 3.95311299039433e-06, "loss": 0.86624813, "memory(GiB)": 147.13, "step": 49730, "train_speed(iter/s)": 0.20106 }, { "acc": 0.77228241, "epoch": 1.1604718645649543, "grad_norm": 3.875, "learning_rate": 3.951265825858792e-06, "loss": 0.8170001, "memory(GiB)": 147.13, "step": 49740, "train_speed(iter/s)": 0.201083 }, { "acc": 0.77791176, "epoch": 1.1607051721372432, "grad_norm": 5.75, "learning_rate": 3.949418811029752e-06, "loss": 0.8135541, "memory(GiB)": 147.13, "step": 49750, "train_speed(iter/s)": 0.201103 }, { "acc": 0.7993576, "epoch": 1.1609384797095321, "grad_norm": 6.09375, "learning_rate": 3.94757194617087e-06, "loss": 0.70589614, "memory(GiB)": 147.13, "step": 49760, "train_speed(iter/s)": 0.201124 }, { "acc": 0.7687006, "epoch": 1.161171787281821, "grad_norm": 6.90625, "learning_rate": 3.945725231545787e-06, "loss": 0.8219142, "memory(GiB)": 147.13, "step": 49770, "train_speed(iter/s)": 0.201145 }, { "acc": 0.78447962, "epoch": 1.16140509485411, "grad_norm": 5.375, "learning_rate": 3.943878667418122e-06, "loss": 0.76722612, "memory(GiB)": 147.13, "step": 49780, "train_speed(iter/s)": 0.201165 }, { "acc": 0.78034124, "epoch": 1.1616384024263988, "grad_norm": 4.46875, "learning_rate": 3.942032254051471e-06, "loss": 0.78942509, "memory(GiB)": 147.13, "step": 49790, "train_speed(iter/s)": 0.201185 }, { "acc": 0.79355407, "epoch": 1.1618717099986877, "grad_norm": 5.78125, "learning_rate": 3.940185991709407e-06, "loss": 0.72849441, "memory(GiB)": 147.13, "step": 49800, "train_speed(iter/s)": 0.201206 }, { "acc": 0.7907186, "epoch": 1.1621050175709766, "grad_norm": 4.125, "learning_rate": 3.938339880655485e-06, "loss": 0.76254416, "memory(GiB)": 147.13, "step": 49810, "train_speed(iter/s)": 0.201226 }, { "acc": 0.78690691, "epoch": 1.1623383251432655, "grad_norm": 5.03125, "learning_rate": 3.9364939211532365e-06, "loss": 0.76221375, "memory(GiB)": 147.13, "step": 49820, "train_speed(iter/s)": 0.201247 }, { "acc": 0.77374029, "epoch": 1.1625716327155544, "grad_norm": 4.96875, "learning_rate": 3.934648113466172e-06, "loss": 0.80819073, "memory(GiB)": 147.13, "step": 49830, "train_speed(iter/s)": 0.201268 }, { "acc": 0.77530112, "epoch": 1.1628049402878433, "grad_norm": 5.0, "learning_rate": 3.93280245785778e-06, "loss": 0.78218956, "memory(GiB)": 147.13, "step": 49840, "train_speed(iter/s)": 0.201288 }, { "acc": 0.78257365, "epoch": 1.1630382478601322, "grad_norm": 5.3125, "learning_rate": 3.9309569545915285e-06, "loss": 0.77504187, "memory(GiB)": 147.13, "step": 49850, "train_speed(iter/s)": 0.201309 }, { "acc": 0.77935219, "epoch": 1.163271555432421, "grad_norm": 7.0625, "learning_rate": 3.9291116039308605e-06, "loss": 0.79420118, "memory(GiB)": 147.13, "step": 49860, "train_speed(iter/s)": 0.20133 }, { "acc": 0.78821473, "epoch": 1.16350486300471, "grad_norm": 5.625, "learning_rate": 3.9272664061392e-06, "loss": 0.74523487, "memory(GiB)": 147.13, "step": 49870, "train_speed(iter/s)": 0.201351 }, { "acc": 0.75341196, "epoch": 1.1637381705769987, "grad_norm": 6.125, "learning_rate": 3.925421361479947e-06, "loss": 0.88330593, "memory(GiB)": 147.13, "step": 49880, "train_speed(iter/s)": 0.20137 }, { "acc": 0.77374578, "epoch": 1.1639714781492876, "grad_norm": 5.25, "learning_rate": 3.923576470216483e-06, "loss": 0.83307543, "memory(GiB)": 147.13, "step": 49890, "train_speed(iter/s)": 0.201391 }, { "acc": 0.7787755, "epoch": 1.1642047857215765, "grad_norm": 8.25, "learning_rate": 3.9217317326121655e-06, "loss": 0.78595629, "memory(GiB)": 147.13, "step": 49900, "train_speed(iter/s)": 0.201411 }, { "acc": 0.79464808, "epoch": 1.1644380932938654, "grad_norm": 5.5625, "learning_rate": 3.919887148930329e-06, "loss": 0.72763758, "memory(GiB)": 147.13, "step": 49910, "train_speed(iter/s)": 0.201431 }, { "acc": 0.78488331, "epoch": 1.1646714008661543, "grad_norm": 5.21875, "learning_rate": 3.918042719434288e-06, "loss": 0.77773986, "memory(GiB)": 147.13, "step": 49920, "train_speed(iter/s)": 0.201453 }, { "acc": 0.79530802, "epoch": 1.1649047084384432, "grad_norm": 4.46875, "learning_rate": 3.916198444387337e-06, "loss": 0.71487732, "memory(GiB)": 147.13, "step": 49930, "train_speed(iter/s)": 0.201473 }, { "acc": 0.78052902, "epoch": 1.165138016010732, "grad_norm": 6.03125, "learning_rate": 3.914354324052741e-06, "loss": 0.77790031, "memory(GiB)": 147.13, "step": 49940, "train_speed(iter/s)": 0.201494 }, { "acc": 0.7807313, "epoch": 1.165371323583021, "grad_norm": 4.46875, "learning_rate": 3.91251035869375e-06, "loss": 0.77833533, "memory(GiB)": 147.13, "step": 49950, "train_speed(iter/s)": 0.201514 }, { "acc": 0.78631954, "epoch": 1.1656046311553099, "grad_norm": 5.5625, "learning_rate": 3.91066654857359e-06, "loss": 0.7649682, "memory(GiB)": 147.13, "step": 49960, "train_speed(iter/s)": 0.201533 }, { "acc": 0.78310833, "epoch": 1.1658379387275988, "grad_norm": 5.28125, "learning_rate": 3.908822893955466e-06, "loss": 0.78763723, "memory(GiB)": 147.13, "step": 49970, "train_speed(iter/s)": 0.201554 }, { "acc": 0.7878459, "epoch": 1.1660712462998877, "grad_norm": 7.6875, "learning_rate": 3.9069793951025544e-06, "loss": 0.74921327, "memory(GiB)": 147.13, "step": 49980, "train_speed(iter/s)": 0.201575 }, { "acc": 0.76945477, "epoch": 1.1663045538721766, "grad_norm": 4.125, "learning_rate": 3.9051360522780166e-06, "loss": 0.81270313, "memory(GiB)": 147.13, "step": 49990, "train_speed(iter/s)": 0.201596 }, { "acc": 0.76828485, "epoch": 1.1665378614444655, "grad_norm": 6.5625, "learning_rate": 3.903292865744989e-06, "loss": 0.81358232, "memory(GiB)": 147.13, "step": 50000, "train_speed(iter/s)": 0.201615 }, { "epoch": 1.1665378614444655, "eval_acc": 0.7442993057267097, "eval_loss": 0.8060175180435181, "eval_runtime": 1270.1101, "eval_samples_per_second": 28.337, "eval_steps_per_second": 14.169, "step": 50000 }, { "acc": 0.78241692, "epoch": 1.1667711690167544, "grad_norm": 5.375, "learning_rate": 3.901449835766588e-06, "loss": 0.79187217, "memory(GiB)": 147.13, "step": 50010, "train_speed(iter/s)": 0.200593 }, { "acc": 0.7830759, "epoch": 1.1670044765890433, "grad_norm": 6.4375, "learning_rate": 3.899606962605902e-06, "loss": 0.76939902, "memory(GiB)": 147.13, "step": 50020, "train_speed(iter/s)": 0.200614 }, { "acc": 0.77431717, "epoch": 1.1672377841613322, "grad_norm": 5.84375, "learning_rate": 3.897764246526003e-06, "loss": 0.80634575, "memory(GiB)": 147.13, "step": 50030, "train_speed(iter/s)": 0.200635 }, { "acc": 0.77010593, "epoch": 1.167471091733621, "grad_norm": 6.03125, "learning_rate": 3.895921687789936e-06, "loss": 0.83955479, "memory(GiB)": 147.13, "step": 50040, "train_speed(iter/s)": 0.200657 }, { "acc": 0.77356577, "epoch": 1.16770439930591, "grad_norm": 5.0625, "learning_rate": 3.894079286660729e-06, "loss": 0.80566463, "memory(GiB)": 147.13, "step": 50050, "train_speed(iter/s)": 0.200677 }, { "acc": 0.75591946, "epoch": 1.1679377068781989, "grad_norm": 25.0, "learning_rate": 3.892237043401382e-06, "loss": 0.87540159, "memory(GiB)": 147.13, "step": 50060, "train_speed(iter/s)": 0.200697 }, { "acc": 0.79346662, "epoch": 1.1681710144504878, "grad_norm": 9.4375, "learning_rate": 3.890394958274877e-06, "loss": 0.74789476, "memory(GiB)": 147.13, "step": 50070, "train_speed(iter/s)": 0.200719 }, { "acc": 0.78836994, "epoch": 1.1684043220227767, "grad_norm": 5.625, "learning_rate": 3.888553031544169e-06, "loss": 0.75674076, "memory(GiB)": 147.13, "step": 50080, "train_speed(iter/s)": 0.20074 }, { "acc": 0.78470092, "epoch": 1.1686376295950656, "grad_norm": 4.6875, "learning_rate": 3.886711263472192e-06, "loss": 0.78518977, "memory(GiB)": 147.13, "step": 50090, "train_speed(iter/s)": 0.200762 }, { "acc": 0.79191232, "epoch": 1.1688709371673545, "grad_norm": 3.90625, "learning_rate": 3.884869654321859e-06, "loss": 0.73618574, "memory(GiB)": 147.13, "step": 50100, "train_speed(iter/s)": 0.200782 }, { "acc": 0.78092413, "epoch": 1.1691042447396434, "grad_norm": 10.1875, "learning_rate": 3.883028204356058e-06, "loss": 0.77842164, "memory(GiB)": 147.13, "step": 50110, "train_speed(iter/s)": 0.200803 }, { "acc": 0.7757453, "epoch": 1.1693375523119323, "grad_norm": 26.125, "learning_rate": 3.881186913837657e-06, "loss": 0.79169908, "memory(GiB)": 147.13, "step": 50120, "train_speed(iter/s)": 0.200824 }, { "acc": 0.80329933, "epoch": 1.1695708598842212, "grad_norm": 6.96875, "learning_rate": 3.879345783029498e-06, "loss": 0.70160761, "memory(GiB)": 147.13, "step": 50130, "train_speed(iter/s)": 0.200845 }, { "acc": 0.77083302, "epoch": 1.16980416745651, "grad_norm": 6.34375, "learning_rate": 3.877504812194404e-06, "loss": 0.82752018, "memory(GiB)": 147.13, "step": 50140, "train_speed(iter/s)": 0.200867 }, { "acc": 0.79996233, "epoch": 1.170037475028799, "grad_norm": 4.78125, "learning_rate": 3.875664001595172e-06, "loss": 0.68956242, "memory(GiB)": 147.13, "step": 50150, "train_speed(iter/s)": 0.200887 }, { "acc": 0.78814058, "epoch": 1.1702707826010879, "grad_norm": 5.25, "learning_rate": 3.873823351494576e-06, "loss": 0.74643149, "memory(GiB)": 147.13, "step": 50160, "train_speed(iter/s)": 0.200908 }, { "acc": 0.7647471, "epoch": 1.1705040901733768, "grad_norm": 5.71875, "learning_rate": 3.8719828621553715e-06, "loss": 0.85008469, "memory(GiB)": 147.13, "step": 50170, "train_speed(iter/s)": 0.20093 }, { "acc": 0.75985403, "epoch": 1.1707373977456657, "grad_norm": 4.875, "learning_rate": 3.870142533840283e-06, "loss": 0.88021393, "memory(GiB)": 147.13, "step": 50180, "train_speed(iter/s)": 0.200952 }, { "acc": 0.77487907, "epoch": 1.1709707053179546, "grad_norm": 5.34375, "learning_rate": 3.868302366812024e-06, "loss": 0.81592302, "memory(GiB)": 147.13, "step": 50190, "train_speed(iter/s)": 0.200972 }, { "acc": 0.77659655, "epoch": 1.1712040128902435, "grad_norm": 4.375, "learning_rate": 3.8664623613332705e-06, "loss": 0.81098127, "memory(GiB)": 147.13, "step": 50200, "train_speed(iter/s)": 0.200992 }, { "acc": 0.79070673, "epoch": 1.1714373204625323, "grad_norm": 4.65625, "learning_rate": 3.864622517666685e-06, "loss": 0.75920906, "memory(GiB)": 147.13, "step": 50210, "train_speed(iter/s)": 0.201013 }, { "acc": 0.79322906, "epoch": 1.1716706280348212, "grad_norm": 5.15625, "learning_rate": 3.862782836074906e-06, "loss": 0.72068224, "memory(GiB)": 147.13, "step": 50220, "train_speed(iter/s)": 0.201032 }, { "acc": 0.76726875, "epoch": 1.17190393560711, "grad_norm": 6.5, "learning_rate": 3.860943316820548e-06, "loss": 0.84628353, "memory(GiB)": 147.13, "step": 50230, "train_speed(iter/s)": 0.201054 }, { "acc": 0.79586458, "epoch": 1.172137243179399, "grad_norm": 7.03125, "learning_rate": 3.859103960166198e-06, "loss": 0.72394934, "memory(GiB)": 147.13, "step": 50240, "train_speed(iter/s)": 0.201074 }, { "acc": 0.77846246, "epoch": 1.1723705507516877, "grad_norm": 4.65625, "learning_rate": 3.857264766374428e-06, "loss": 0.79643097, "memory(GiB)": 147.13, "step": 50250, "train_speed(iter/s)": 0.201096 }, { "acc": 0.77642593, "epoch": 1.1726038583239768, "grad_norm": 4.5625, "learning_rate": 3.855425735707779e-06, "loss": 0.82001638, "memory(GiB)": 147.13, "step": 50260, "train_speed(iter/s)": 0.201116 }, { "acc": 0.76520214, "epoch": 1.1728371658962655, "grad_norm": 5.28125, "learning_rate": 3.853586868428775e-06, "loss": 0.84052525, "memory(GiB)": 147.13, "step": 50270, "train_speed(iter/s)": 0.201137 }, { "acc": 0.7804378, "epoch": 1.1730704734685544, "grad_norm": 5.375, "learning_rate": 3.851748164799914e-06, "loss": 0.77831354, "memory(GiB)": 147.13, "step": 50280, "train_speed(iter/s)": 0.201157 }, { "acc": 0.79932928, "epoch": 1.1733037810408433, "grad_norm": 13.5, "learning_rate": 3.849909625083666e-06, "loss": 0.70987091, "memory(GiB)": 147.13, "step": 50290, "train_speed(iter/s)": 0.201178 }, { "acc": 0.7968874, "epoch": 1.1735370886131322, "grad_norm": 4.96875, "learning_rate": 3.848071249542486e-06, "loss": 0.72468081, "memory(GiB)": 147.13, "step": 50300, "train_speed(iter/s)": 0.2012 }, { "acc": 0.76522064, "epoch": 1.1737703961854211, "grad_norm": 4.5, "learning_rate": 3.846233038438803e-06, "loss": 0.83323078, "memory(GiB)": 147.13, "step": 50310, "train_speed(iter/s)": 0.201221 }, { "acc": 0.76262903, "epoch": 1.17400370375771, "grad_norm": 5.59375, "learning_rate": 3.844394992035017e-06, "loss": 0.83975763, "memory(GiB)": 147.13, "step": 50320, "train_speed(iter/s)": 0.201241 }, { "acc": 0.79148641, "epoch": 1.174237011329999, "grad_norm": 4.34375, "learning_rate": 3.842557110593509e-06, "loss": 0.74630098, "memory(GiB)": 147.13, "step": 50330, "train_speed(iter/s)": 0.201261 }, { "acc": 0.78939476, "epoch": 1.1744703189022878, "grad_norm": 4.90625, "learning_rate": 3.840719394376638e-06, "loss": 0.74165897, "memory(GiB)": 147.13, "step": 50340, "train_speed(iter/s)": 0.20128 }, { "acc": 0.78680506, "epoch": 1.1747036264745767, "grad_norm": 5.5, "learning_rate": 3.838881843646736e-06, "loss": 0.77483697, "memory(GiB)": 147.13, "step": 50350, "train_speed(iter/s)": 0.2013 }, { "acc": 0.77357845, "epoch": 1.1749369340468656, "grad_norm": 5.125, "learning_rate": 3.8370444586661135e-06, "loss": 0.82649841, "memory(GiB)": 147.13, "step": 50360, "train_speed(iter/s)": 0.20132 }, { "acc": 0.77592096, "epoch": 1.1751702416191545, "grad_norm": 37.0, "learning_rate": 3.835207239697057e-06, "loss": 0.78373413, "memory(GiB)": 147.13, "step": 50370, "train_speed(iter/s)": 0.20134 }, { "acc": 0.80010624, "epoch": 1.1754035491914434, "grad_norm": 7.3125, "learning_rate": 3.8333701870018296e-06, "loss": 0.71716213, "memory(GiB)": 147.13, "step": 50380, "train_speed(iter/s)": 0.20136 }, { "acc": 0.78422251, "epoch": 1.1756368567637323, "grad_norm": 5.25, "learning_rate": 3.831533300842667e-06, "loss": 0.77883387, "memory(GiB)": 147.13, "step": 50390, "train_speed(iter/s)": 0.201381 }, { "acc": 0.77899208, "epoch": 1.1758701643360212, "grad_norm": 5.5, "learning_rate": 3.829696581481787e-06, "loss": 0.78697596, "memory(GiB)": 147.13, "step": 50400, "train_speed(iter/s)": 0.201399 }, { "acc": 0.77367115, "epoch": 1.17610347190831, "grad_norm": 4.1875, "learning_rate": 3.827860029181382e-06, "loss": 0.82581224, "memory(GiB)": 147.13, "step": 50410, "train_speed(iter/s)": 0.20142 }, { "acc": 0.76857705, "epoch": 1.176336779480599, "grad_norm": 6.0625, "learning_rate": 3.826023644203617e-06, "loss": 0.85204086, "memory(GiB)": 147.13, "step": 50420, "train_speed(iter/s)": 0.20144 }, { "acc": 0.79034348, "epoch": 1.176570087052888, "grad_norm": 6.96875, "learning_rate": 3.824187426810635e-06, "loss": 0.73557873, "memory(GiB)": 147.13, "step": 50430, "train_speed(iter/s)": 0.201461 }, { "acc": 0.7905654, "epoch": 1.1768033946251768, "grad_norm": 4.96875, "learning_rate": 3.822351377264555e-06, "loss": 0.75825205, "memory(GiB)": 147.13, "step": 50440, "train_speed(iter/s)": 0.201482 }, { "acc": 0.78993406, "epoch": 1.1770367021974657, "grad_norm": 4.5, "learning_rate": 3.820515495827476e-06, "loss": 0.77078609, "memory(GiB)": 147.13, "step": 50450, "train_speed(iter/s)": 0.201502 }, { "acc": 0.76711464, "epoch": 1.1772700097697546, "grad_norm": 5.4375, "learning_rate": 3.818679782761465e-06, "loss": 0.83489895, "memory(GiB)": 147.13, "step": 50460, "train_speed(iter/s)": 0.201523 }, { "acc": 0.76933699, "epoch": 1.1775033173420435, "grad_norm": 8.25, "learning_rate": 3.816844238328573e-06, "loss": 0.83145552, "memory(GiB)": 147.13, "step": 50470, "train_speed(iter/s)": 0.201543 }, { "acc": 0.77108583, "epoch": 1.1777366249143324, "grad_norm": 6.75, "learning_rate": 3.815008862790822e-06, "loss": 0.82043381, "memory(GiB)": 147.13, "step": 50480, "train_speed(iter/s)": 0.201563 }, { "acc": 0.7991888, "epoch": 1.1779699324866213, "grad_norm": 5.3125, "learning_rate": 3.813173656410211e-06, "loss": 0.74574351, "memory(GiB)": 147.13, "step": 50490, "train_speed(iter/s)": 0.201583 }, { "acc": 0.7735095, "epoch": 1.1782032400589102, "grad_norm": 6.46875, "learning_rate": 3.8113386194487177e-06, "loss": 0.8170289, "memory(GiB)": 147.13, "step": 50500, "train_speed(iter/s)": 0.201604 }, { "epoch": 1.1782032400589102, "eval_acc": 0.7443518330349046, "eval_loss": 0.8057728409767151, "eval_runtime": 1269.6788, "eval_samples_per_second": 28.347, "eval_steps_per_second": 14.174, "step": 50500 }, { "acc": 0.77446365, "epoch": 1.178436547631199, "grad_norm": 6.3125, "learning_rate": 3.80950375216829e-06, "loss": 0.80997276, "memory(GiB)": 147.13, "step": 50510, "train_speed(iter/s)": 0.200592 }, { "acc": 0.78115602, "epoch": 1.178669855203488, "grad_norm": 5.5625, "learning_rate": 3.807669054830855e-06, "loss": 0.77538776, "memory(GiB)": 147.13, "step": 50520, "train_speed(iter/s)": 0.200613 }, { "acc": 0.7817029, "epoch": 1.178903162775777, "grad_norm": 4.875, "learning_rate": 3.8058345276983165e-06, "loss": 0.78529196, "memory(GiB)": 147.13, "step": 50530, "train_speed(iter/s)": 0.200633 }, { "acc": 0.76126156, "epoch": 1.1791364703480658, "grad_norm": 5.46875, "learning_rate": 3.8040001710325547e-06, "loss": 0.87188816, "memory(GiB)": 147.13, "step": 50540, "train_speed(iter/s)": 0.200652 }, { "acc": 0.77631016, "epoch": 1.1793697779203547, "grad_norm": 6.09375, "learning_rate": 3.8021659850954186e-06, "loss": 0.79485397, "memory(GiB)": 147.13, "step": 50550, "train_speed(iter/s)": 0.200673 }, { "acc": 0.78766565, "epoch": 1.1796030854926436, "grad_norm": 4.78125, "learning_rate": 3.8003319701487407e-06, "loss": 0.77381859, "memory(GiB)": 147.13, "step": 50560, "train_speed(iter/s)": 0.200696 }, { "acc": 0.76473374, "epoch": 1.1798363930649325, "grad_norm": 4.59375, "learning_rate": 3.7984981264543247e-06, "loss": 0.8554389, "memory(GiB)": 147.13, "step": 50570, "train_speed(iter/s)": 0.200716 }, { "acc": 0.78639488, "epoch": 1.1800697006372214, "grad_norm": 6.28125, "learning_rate": 3.7966644542739538e-06, "loss": 0.76430273, "memory(GiB)": 147.13, "step": 50580, "train_speed(iter/s)": 0.200738 }, { "acc": 0.77539215, "epoch": 1.1803030082095103, "grad_norm": 4.5, "learning_rate": 3.794830953869381e-06, "loss": 0.81135311, "memory(GiB)": 147.13, "step": 50590, "train_speed(iter/s)": 0.200759 }, { "acc": 0.77136354, "epoch": 1.1805363157817992, "grad_norm": 4.875, "learning_rate": 3.7929976255023398e-06, "loss": 0.81377373, "memory(GiB)": 147.13, "step": 50600, "train_speed(iter/s)": 0.200779 }, { "acc": 0.77100811, "epoch": 1.180769623354088, "grad_norm": 7.5625, "learning_rate": 3.7911644694345368e-06, "loss": 0.80235748, "memory(GiB)": 147.13, "step": 50610, "train_speed(iter/s)": 0.2008 }, { "acc": 0.77771163, "epoch": 1.1810029309263768, "grad_norm": 4.5, "learning_rate": 3.789331485927654e-06, "loss": 0.78241501, "memory(GiB)": 147.13, "step": 50620, "train_speed(iter/s)": 0.200821 }, { "acc": 0.76570158, "epoch": 1.1812362384986659, "grad_norm": 4.71875, "learning_rate": 3.7874986752433506e-06, "loss": 0.8620245, "memory(GiB)": 147.13, "step": 50630, "train_speed(iter/s)": 0.200841 }, { "acc": 0.79454651, "epoch": 1.1814695460709546, "grad_norm": 6.4375, "learning_rate": 3.78566603764326e-06, "loss": 0.74558945, "memory(GiB)": 147.13, "step": 50640, "train_speed(iter/s)": 0.200863 }, { "acc": 0.78433299, "epoch": 1.1817028536432437, "grad_norm": 5.46875, "learning_rate": 3.7838335733889895e-06, "loss": 0.78314257, "memory(GiB)": 147.13, "step": 50650, "train_speed(iter/s)": 0.200883 }, { "acc": 0.77772627, "epoch": 1.1819361612155324, "grad_norm": 5.59375, "learning_rate": 3.782001282742124e-06, "loss": 0.80842457, "memory(GiB)": 147.13, "step": 50660, "train_speed(iter/s)": 0.200904 }, { "acc": 0.77893171, "epoch": 1.1821694687878213, "grad_norm": 6.9375, "learning_rate": 3.7801691659642196e-06, "loss": 0.79274435, "memory(GiB)": 147.13, "step": 50670, "train_speed(iter/s)": 0.200926 }, { "acc": 0.77441139, "epoch": 1.1824027763601102, "grad_norm": 7.625, "learning_rate": 3.7783372233168127e-06, "loss": 0.81310921, "memory(GiB)": 147.13, "step": 50680, "train_speed(iter/s)": 0.200945 }, { "acc": 0.78166981, "epoch": 1.182636083932399, "grad_norm": 5.875, "learning_rate": 3.776505455061412e-06, "loss": 0.78240037, "memory(GiB)": 147.13, "step": 50690, "train_speed(iter/s)": 0.200966 }, { "acc": 0.78959174, "epoch": 1.182869391504688, "grad_norm": 4.6875, "learning_rate": 3.7746738614595022e-06, "loss": 0.75032959, "memory(GiB)": 147.13, "step": 50700, "train_speed(iter/s)": 0.200984 }, { "acc": 0.78085089, "epoch": 1.1831026990769768, "grad_norm": 5.21875, "learning_rate": 3.772842442772543e-06, "loss": 0.78894525, "memory(GiB)": 147.13, "step": 50710, "train_speed(iter/s)": 0.201005 }, { "acc": 0.79085646, "epoch": 1.1833360066492657, "grad_norm": 6.75, "learning_rate": 3.7710111992619696e-06, "loss": 0.7664422, "memory(GiB)": 147.13, "step": 50720, "train_speed(iter/s)": 0.201026 }, { "acc": 0.77849669, "epoch": 1.1835693142215546, "grad_norm": 5.21875, "learning_rate": 3.7691801311891898e-06, "loss": 0.79077878, "memory(GiB)": 147.13, "step": 50730, "train_speed(iter/s)": 0.201043 }, { "acc": 0.77560053, "epoch": 1.1838026217938435, "grad_norm": 6.28125, "learning_rate": 3.767349238815588e-06, "loss": 0.8155261, "memory(GiB)": 147.13, "step": 50740, "train_speed(iter/s)": 0.201063 }, { "acc": 0.76759486, "epoch": 1.1840359293661324, "grad_norm": 5.09375, "learning_rate": 3.7655185224025247e-06, "loss": 0.84323349, "memory(GiB)": 147.13, "step": 50750, "train_speed(iter/s)": 0.201083 }, { "acc": 0.77860336, "epoch": 1.1842692369384213, "grad_norm": 5.9375, "learning_rate": 3.7636879822113338e-06, "loss": 0.78937483, "memory(GiB)": 147.13, "step": 50760, "train_speed(iter/s)": 0.201104 }, { "acc": 0.77916279, "epoch": 1.1845025445107102, "grad_norm": 8.8125, "learning_rate": 3.761857618503326e-06, "loss": 0.80653362, "memory(GiB)": 147.13, "step": 50770, "train_speed(iter/s)": 0.201123 }, { "acc": 0.75581121, "epoch": 1.1847358520829991, "grad_norm": 6.75, "learning_rate": 3.7600274315397816e-06, "loss": 0.90901031, "memory(GiB)": 147.13, "step": 50780, "train_speed(iter/s)": 0.201144 }, { "acc": 0.75556707, "epoch": 1.184969159655288, "grad_norm": 4.53125, "learning_rate": 3.758197421581961e-06, "loss": 0.90556011, "memory(GiB)": 147.13, "step": 50790, "train_speed(iter/s)": 0.201164 }, { "acc": 0.77783809, "epoch": 1.185202467227577, "grad_norm": 4.46875, "learning_rate": 3.756367588891099e-06, "loss": 0.79138689, "memory(GiB)": 147.13, "step": 50800, "train_speed(iter/s)": 0.201183 }, { "acc": 0.77093124, "epoch": 1.1854357747998658, "grad_norm": 6.28125, "learning_rate": 3.754537933728401e-06, "loss": 0.82287769, "memory(GiB)": 147.13, "step": 50810, "train_speed(iter/s)": 0.201204 }, { "acc": 0.78769693, "epoch": 1.1856690823721547, "grad_norm": 15.625, "learning_rate": 3.7527084563550515e-06, "loss": 0.75005512, "memory(GiB)": 147.13, "step": 50820, "train_speed(iter/s)": 0.201226 }, { "acc": 0.78668466, "epoch": 1.1859023899444436, "grad_norm": 5.9375, "learning_rate": 3.750879157032207e-06, "loss": 0.75284238, "memory(GiB)": 147.13, "step": 50830, "train_speed(iter/s)": 0.201247 }, { "acc": 0.78141642, "epoch": 1.1861356975167325, "grad_norm": 7.15625, "learning_rate": 3.7490500360210003e-06, "loss": 0.79019589, "memory(GiB)": 147.13, "step": 50840, "train_speed(iter/s)": 0.201266 }, { "acc": 0.76448064, "epoch": 1.1863690050890214, "grad_norm": 6.15625, "learning_rate": 3.747221093582538e-06, "loss": 0.86280327, "memory(GiB)": 147.13, "step": 50850, "train_speed(iter/s)": 0.201286 }, { "acc": 0.76448793, "epoch": 1.1866023126613103, "grad_norm": 5.28125, "learning_rate": 3.7453923299779014e-06, "loss": 0.86757164, "memory(GiB)": 147.13, "step": 50860, "train_speed(iter/s)": 0.201307 }, { "acc": 0.78806419, "epoch": 1.1868356202335992, "grad_norm": 4.3125, "learning_rate": 3.743563745468144e-06, "loss": 0.75735083, "memory(GiB)": 147.13, "step": 50870, "train_speed(iter/s)": 0.201327 }, { "acc": 0.78509789, "epoch": 1.1870689278058881, "grad_norm": 6.125, "learning_rate": 3.7417353403142988e-06, "loss": 0.77933402, "memory(GiB)": 147.13, "step": 50880, "train_speed(iter/s)": 0.201348 }, { "acc": 0.76924162, "epoch": 1.187302235378177, "grad_norm": 4.84375, "learning_rate": 3.7399071147773668e-06, "loss": 0.83347654, "memory(GiB)": 147.13, "step": 50890, "train_speed(iter/s)": 0.201369 }, { "acc": 0.76855621, "epoch": 1.187535542950466, "grad_norm": 4.1875, "learning_rate": 3.7380790691183276e-06, "loss": 0.82479677, "memory(GiB)": 147.13, "step": 50900, "train_speed(iter/s)": 0.201388 }, { "acc": 0.77282152, "epoch": 1.1877688505227548, "grad_norm": 5.0625, "learning_rate": 3.7362512035981347e-06, "loss": 0.81683245, "memory(GiB)": 147.13, "step": 50910, "train_speed(iter/s)": 0.201409 }, { "acc": 0.77848644, "epoch": 1.1880021580950437, "grad_norm": 5.59375, "learning_rate": 3.7344235184777157e-06, "loss": 0.78320317, "memory(GiB)": 147.13, "step": 50920, "train_speed(iter/s)": 0.20143 }, { "acc": 0.77635636, "epoch": 1.1882354656673326, "grad_norm": 4.5625, "learning_rate": 3.7325960140179717e-06, "loss": 0.82402401, "memory(GiB)": 147.13, "step": 50930, "train_speed(iter/s)": 0.201451 }, { "acc": 0.80113115, "epoch": 1.1884687732396215, "grad_norm": 3.828125, "learning_rate": 3.730768690479779e-06, "loss": 0.70224762, "memory(GiB)": 147.13, "step": 50940, "train_speed(iter/s)": 0.20147 }, { "acc": 0.78452883, "epoch": 1.1887020808119104, "grad_norm": 6.09375, "learning_rate": 3.7289415481239865e-06, "loss": 0.78695316, "memory(GiB)": 147.13, "step": 50950, "train_speed(iter/s)": 0.20149 }, { "acc": 0.79433985, "epoch": 1.1889353883841993, "grad_norm": 5.53125, "learning_rate": 3.727114587211419e-06, "loss": 0.7226675, "memory(GiB)": 147.13, "step": 50960, "train_speed(iter/s)": 0.201511 }, { "acc": 0.78255625, "epoch": 1.1891686959564882, "grad_norm": 4.46875, "learning_rate": 3.7252878080028744e-06, "loss": 0.77606573, "memory(GiB)": 147.13, "step": 50970, "train_speed(iter/s)": 0.201531 }, { "acc": 0.77274323, "epoch": 1.1894020035287771, "grad_norm": 5.75, "learning_rate": 3.7234612107591246e-06, "loss": 0.83783932, "memory(GiB)": 147.13, "step": 50980, "train_speed(iter/s)": 0.201551 }, { "acc": 0.79396834, "epoch": 1.189635311101066, "grad_norm": 3.640625, "learning_rate": 3.721634795740918e-06, "loss": 0.73608699, "memory(GiB)": 147.13, "step": 50990, "train_speed(iter/s)": 0.201572 }, { "acc": 0.78861499, "epoch": 1.189868618673355, "grad_norm": 4.125, "learning_rate": 3.719808563208971e-06, "loss": 0.75779295, "memory(GiB)": 147.13, "step": 51000, "train_speed(iter/s)": 0.201593 }, { "epoch": 1.189868618673355, "eval_acc": 0.7443236476500195, "eval_loss": 0.8055968880653381, "eval_runtime": 1269.0015, "eval_samples_per_second": 28.362, "eval_steps_per_second": 14.181, "step": 51000 }, { "acc": 0.79447026, "epoch": 1.1901019262456436, "grad_norm": 5.28125, "learning_rate": 3.71798251342398e-06, "loss": 0.73775778, "memory(GiB)": 147.13, "step": 51010, "train_speed(iter/s)": 0.20059 }, { "acc": 0.78940754, "epoch": 1.1903352338179327, "grad_norm": 6.1875, "learning_rate": 3.7161566466466137e-06, "loss": 0.74937639, "memory(GiB)": 147.13, "step": 51020, "train_speed(iter/s)": 0.200611 }, { "acc": 0.78550367, "epoch": 1.1905685413902214, "grad_norm": 5.09375, "learning_rate": 3.714330963137512e-06, "loss": 0.78713713, "memory(GiB)": 147.13, "step": 51030, "train_speed(iter/s)": 0.200632 }, { "acc": 0.77606764, "epoch": 1.1908018489625105, "grad_norm": 7.4375, "learning_rate": 3.7125054631572915e-06, "loss": 0.8188735, "memory(GiB)": 147.13, "step": 51040, "train_speed(iter/s)": 0.200652 }, { "acc": 0.79022417, "epoch": 1.1910351565347992, "grad_norm": 5.5, "learning_rate": 3.710680146966542e-06, "loss": 0.75739288, "memory(GiB)": 147.13, "step": 51050, "train_speed(iter/s)": 0.200672 }, { "acc": 0.75823822, "epoch": 1.191268464107088, "grad_norm": 9.25, "learning_rate": 3.7088550148258277e-06, "loss": 0.87464418, "memory(GiB)": 147.13, "step": 51060, "train_speed(iter/s)": 0.20069 }, { "acc": 0.7662158, "epoch": 1.191501771679377, "grad_norm": 6.46875, "learning_rate": 3.707030066995685e-06, "loss": 0.84084873, "memory(GiB)": 147.13, "step": 51070, "train_speed(iter/s)": 0.20071 }, { "acc": 0.77479043, "epoch": 1.1917350792516659, "grad_norm": 5.4375, "learning_rate": 3.705205303736625e-06, "loss": 0.8108655, "memory(GiB)": 147.13, "step": 51080, "train_speed(iter/s)": 0.200729 }, { "acc": 0.78514309, "epoch": 1.1919683868239548, "grad_norm": 5.40625, "learning_rate": 3.7033807253091313e-06, "loss": 0.77751369, "memory(GiB)": 147.13, "step": 51090, "train_speed(iter/s)": 0.200749 }, { "acc": 0.76177063, "epoch": 1.1922016943962437, "grad_norm": 5.46875, "learning_rate": 3.7015563319736618e-06, "loss": 0.8812314, "memory(GiB)": 147.13, "step": 51100, "train_speed(iter/s)": 0.20077 }, { "acc": 0.78570685, "epoch": 1.1924350019685326, "grad_norm": 4.375, "learning_rate": 3.6997321239906513e-06, "loss": 0.76931667, "memory(GiB)": 147.13, "step": 51110, "train_speed(iter/s)": 0.20079 }, { "acc": 0.78043647, "epoch": 1.1926683095408215, "grad_norm": 5.15625, "learning_rate": 3.6979081016204998e-06, "loss": 0.79293995, "memory(GiB)": 147.13, "step": 51120, "train_speed(iter/s)": 0.200811 }, { "acc": 0.7917635, "epoch": 1.1929016171131104, "grad_norm": 5.59375, "learning_rate": 3.6960842651235894e-06, "loss": 0.73350315, "memory(GiB)": 147.13, "step": 51130, "train_speed(iter/s)": 0.20083 }, { "acc": 0.78342714, "epoch": 1.1931349246853993, "grad_norm": 4.6875, "learning_rate": 3.6942606147602705e-06, "loss": 0.76641498, "memory(GiB)": 147.13, "step": 51140, "train_speed(iter/s)": 0.200851 }, { "acc": 0.77356782, "epoch": 1.1933682322576882, "grad_norm": 7.90625, "learning_rate": 3.6924371507908695e-06, "loss": 0.81099625, "memory(GiB)": 147.13, "step": 51150, "train_speed(iter/s)": 0.200871 }, { "acc": 0.8011385, "epoch": 1.193601539829977, "grad_norm": 5.09375, "learning_rate": 3.690613873475687e-06, "loss": 0.71837916, "memory(GiB)": 147.13, "step": 51160, "train_speed(iter/s)": 0.200891 }, { "acc": 0.78795547, "epoch": 1.193834847402266, "grad_norm": 5.6875, "learning_rate": 3.6887907830749923e-06, "loss": 0.75448265, "memory(GiB)": 147.13, "step": 51170, "train_speed(iter/s)": 0.200911 }, { "acc": 0.78816214, "epoch": 1.1940681549745549, "grad_norm": 4.46875, "learning_rate": 3.686967879849033e-06, "loss": 0.7446311, "memory(GiB)": 147.13, "step": 51180, "train_speed(iter/s)": 0.200931 }, { "acc": 0.78221531, "epoch": 1.1943014625468438, "grad_norm": 5.71875, "learning_rate": 3.6851451640580264e-06, "loss": 0.7772069, "memory(GiB)": 147.13, "step": 51190, "train_speed(iter/s)": 0.20095 }, { "acc": 0.76984034, "epoch": 1.1945347701191327, "grad_norm": 6.5625, "learning_rate": 3.6833226359621668e-06, "loss": 0.83698673, "memory(GiB)": 147.13, "step": 51200, "train_speed(iter/s)": 0.200971 }, { "acc": 0.77070379, "epoch": 1.1947680776914216, "grad_norm": 5.59375, "learning_rate": 3.6815002958216183e-06, "loss": 0.81270151, "memory(GiB)": 147.13, "step": 51210, "train_speed(iter/s)": 0.200991 }, { "acc": 0.79680748, "epoch": 1.1950013852637105, "grad_norm": 5.78125, "learning_rate": 3.67967814389652e-06, "loss": 0.71274958, "memory(GiB)": 147.13, "step": 51220, "train_speed(iter/s)": 0.201011 }, { "acc": 0.78539243, "epoch": 1.1952346928359994, "grad_norm": 4.84375, "learning_rate": 3.6778561804469825e-06, "loss": 0.78309517, "memory(GiB)": 147.13, "step": 51230, "train_speed(iter/s)": 0.201031 }, { "acc": 0.78247108, "epoch": 1.1954680004082883, "grad_norm": 5.625, "learning_rate": 3.676034405733092e-06, "loss": 0.7762372, "memory(GiB)": 147.13, "step": 51240, "train_speed(iter/s)": 0.201051 }, { "acc": 0.77460847, "epoch": 1.1957013079805772, "grad_norm": 5.46875, "learning_rate": 3.6742128200149042e-06, "loss": 0.82830582, "memory(GiB)": 147.13, "step": 51250, "train_speed(iter/s)": 0.201072 }, { "acc": 0.77715383, "epoch": 1.195934615552866, "grad_norm": 5.6875, "learning_rate": 3.672391423552451e-06, "loss": 0.81013908, "memory(GiB)": 147.13, "step": 51260, "train_speed(iter/s)": 0.201091 }, { "acc": 0.76946192, "epoch": 1.196167923125155, "grad_norm": 5.75, "learning_rate": 3.6705702166057366e-06, "loss": 0.83462391, "memory(GiB)": 147.13, "step": 51270, "train_speed(iter/s)": 0.201111 }, { "acc": 0.77984385, "epoch": 1.1964012306974439, "grad_norm": 5.75, "learning_rate": 3.668749199434738e-06, "loss": 0.80135126, "memory(GiB)": 147.13, "step": 51280, "train_speed(iter/s)": 0.20113 }, { "acc": 0.77329564, "epoch": 1.1966345382697328, "grad_norm": 5.625, "learning_rate": 3.6669283722994054e-06, "loss": 0.81273098, "memory(GiB)": 147.13, "step": 51290, "train_speed(iter/s)": 0.20115 }, { "acc": 0.78445587, "epoch": 1.1968678458420217, "grad_norm": 5.78125, "learning_rate": 3.6651077354596586e-06, "loss": 0.76524854, "memory(GiB)": 147.13, "step": 51300, "train_speed(iter/s)": 0.201171 }, { "acc": 0.79493475, "epoch": 1.1971011534143106, "grad_norm": 7.125, "learning_rate": 3.6632872891753956e-06, "loss": 0.73145099, "memory(GiB)": 147.13, "step": 51310, "train_speed(iter/s)": 0.201192 }, { "acc": 0.77402053, "epoch": 1.1973344609865995, "grad_norm": 3.96875, "learning_rate": 3.661467033706483e-06, "loss": 0.79744701, "memory(GiB)": 147.13, "step": 51320, "train_speed(iter/s)": 0.201213 }, { "acc": 0.78484211, "epoch": 1.1975677685588884, "grad_norm": 6.1875, "learning_rate": 3.6596469693127636e-06, "loss": 0.77361097, "memory(GiB)": 147.13, "step": 51330, "train_speed(iter/s)": 0.201234 }, { "acc": 0.80426512, "epoch": 1.1978010761311773, "grad_norm": 7.53125, "learning_rate": 3.6578270962540506e-06, "loss": 0.69536757, "memory(GiB)": 147.13, "step": 51340, "train_speed(iter/s)": 0.201254 }, { "acc": 0.75910811, "epoch": 1.1980343837034662, "grad_norm": 13.5, "learning_rate": 3.6560074147901287e-06, "loss": 0.85062847, "memory(GiB)": 147.13, "step": 51350, "train_speed(iter/s)": 0.201275 }, { "acc": 0.78548346, "epoch": 1.198267691275755, "grad_norm": 8.625, "learning_rate": 3.654187925180758e-06, "loss": 0.76579847, "memory(GiB)": 147.13, "step": 51360, "train_speed(iter/s)": 0.201297 }, { "acc": 0.78351984, "epoch": 1.198500998848044, "grad_norm": 5.25, "learning_rate": 3.65236862768567e-06, "loss": 0.77279978, "memory(GiB)": 147.13, "step": 51370, "train_speed(iter/s)": 0.201317 }, { "acc": 0.79646559, "epoch": 1.1987343064203329, "grad_norm": 5.90625, "learning_rate": 3.650549522564569e-06, "loss": 0.73050995, "memory(GiB)": 147.13, "step": 51380, "train_speed(iter/s)": 0.201338 }, { "acc": 0.77936029, "epoch": 1.1989676139926218, "grad_norm": 8.0625, "learning_rate": 3.648730610077131e-06, "loss": 0.79830408, "memory(GiB)": 147.13, "step": 51390, "train_speed(iter/s)": 0.201358 }, { "acc": 0.78159542, "epoch": 1.1992009215649104, "grad_norm": 4.71875, "learning_rate": 3.646911890483006e-06, "loss": 0.80945339, "memory(GiB)": 147.13, "step": 51400, "train_speed(iter/s)": 0.201379 }, { "acc": 0.77410226, "epoch": 1.1994342291371995, "grad_norm": 5.34375, "learning_rate": 3.645093364041815e-06, "loss": 0.81879921, "memory(GiB)": 147.13, "step": 51410, "train_speed(iter/s)": 0.201398 }, { "acc": 0.78253832, "epoch": 1.1996675367094882, "grad_norm": 4.40625, "learning_rate": 3.6432750310131537e-06, "loss": 0.78207197, "memory(GiB)": 147.13, "step": 51420, "train_speed(iter/s)": 0.201417 }, { "acc": 0.77488294, "epoch": 1.1999008442817771, "grad_norm": 6.0625, "learning_rate": 3.6414568916565884e-06, "loss": 0.791008, "memory(GiB)": 147.13, "step": 51430, "train_speed(iter/s)": 0.201438 }, { "acc": 0.78896904, "epoch": 1.200134151854066, "grad_norm": 6.5, "learning_rate": 3.6396389462316558e-06, "loss": 0.76538782, "memory(GiB)": 147.13, "step": 51440, "train_speed(iter/s)": 0.201457 }, { "acc": 0.77477431, "epoch": 1.200367459426355, "grad_norm": 5.03125, "learning_rate": 3.6378211949978693e-06, "loss": 0.8104063, "memory(GiB)": 147.13, "step": 51450, "train_speed(iter/s)": 0.201477 }, { "acc": 0.77967916, "epoch": 1.2006007669986438, "grad_norm": 5.40625, "learning_rate": 3.6360036382147117e-06, "loss": 0.7820168, "memory(GiB)": 147.13, "step": 51460, "train_speed(iter/s)": 0.201498 }, { "acc": 0.79494371, "epoch": 1.2008340745709327, "grad_norm": 6.40625, "learning_rate": 3.634186276141638e-06, "loss": 0.73448949, "memory(GiB)": 147.13, "step": 51470, "train_speed(iter/s)": 0.201519 }, { "acc": 0.77666898, "epoch": 1.2010673821432216, "grad_norm": 5.84375, "learning_rate": 3.6323691090380756e-06, "loss": 0.79062681, "memory(GiB)": 147.13, "step": 51480, "train_speed(iter/s)": 0.20154 }, { "acc": 0.78186355, "epoch": 1.2013006897155105, "grad_norm": 4.625, "learning_rate": 3.630552137163427e-06, "loss": 0.7758111, "memory(GiB)": 147.13, "step": 51490, "train_speed(iter/s)": 0.20156 }, { "acc": 0.78854303, "epoch": 1.2015339972877994, "grad_norm": 4.375, "learning_rate": 3.6287353607770613e-06, "loss": 0.75728836, "memory(GiB)": 147.13, "step": 51500, "train_speed(iter/s)": 0.20158 }, { "epoch": 1.2015339972877994, "eval_acc": 0.7444104458239269, "eval_loss": 0.8055254817008972, "eval_runtime": 1269.0052, "eval_samples_per_second": 28.362, "eval_steps_per_second": 14.181, "step": 51500 }, { "acc": 0.79272242, "epoch": 1.2017673048600883, "grad_norm": 4.34375, "learning_rate": 3.6269187801383267e-06, "loss": 0.74171906, "memory(GiB)": 147.13, "step": 51510, "train_speed(iter/s)": 0.20059 }, { "acc": 0.77594776, "epoch": 1.2020006124323772, "grad_norm": 6.90625, "learning_rate": 3.6251023955065356e-06, "loss": 0.79424891, "memory(GiB)": 147.13, "step": 51520, "train_speed(iter/s)": 0.200612 }, { "acc": 0.78154349, "epoch": 1.2022339200046661, "grad_norm": 4.5, "learning_rate": 3.623286207140979e-06, "loss": 0.7680542, "memory(GiB)": 147.13, "step": 51530, "train_speed(iter/s)": 0.200631 }, { "acc": 0.7791707, "epoch": 1.202467227576955, "grad_norm": 5.25, "learning_rate": 3.6214702153009157e-06, "loss": 0.79780264, "memory(GiB)": 147.13, "step": 51540, "train_speed(iter/s)": 0.200652 }, { "acc": 0.7682992, "epoch": 1.202700535149244, "grad_norm": 8.75, "learning_rate": 3.6196544202455787e-06, "loss": 0.84575796, "memory(GiB)": 147.13, "step": 51550, "train_speed(iter/s)": 0.200671 }, { "acc": 0.77210584, "epoch": 1.2029338427215328, "grad_norm": 4.5, "learning_rate": 3.617838822234175e-06, "loss": 0.82075291, "memory(GiB)": 147.13, "step": 51560, "train_speed(iter/s)": 0.200692 }, { "acc": 0.77745943, "epoch": 1.2031671502938217, "grad_norm": 6.71875, "learning_rate": 3.616023421525875e-06, "loss": 0.81119947, "memory(GiB)": 147.13, "step": 51570, "train_speed(iter/s)": 0.200713 }, { "acc": 0.78472643, "epoch": 1.2034004578661106, "grad_norm": 5.53125, "learning_rate": 3.61420821837983e-06, "loss": 0.76066618, "memory(GiB)": 147.13, "step": 51580, "train_speed(iter/s)": 0.200733 }, { "acc": 0.79031987, "epoch": 1.2036337654383995, "grad_norm": 4.8125, "learning_rate": 3.61239321305516e-06, "loss": 0.73573508, "memory(GiB)": 147.13, "step": 51590, "train_speed(iter/s)": 0.200754 }, { "acc": 0.78004036, "epoch": 1.2038670730106884, "grad_norm": 6.9375, "learning_rate": 3.610578405810955e-06, "loss": 0.80121174, "memory(GiB)": 147.13, "step": 51600, "train_speed(iter/s)": 0.200773 }, { "acc": 0.78116407, "epoch": 1.2041003805829773, "grad_norm": 5.15625, "learning_rate": 3.6087637969062783e-06, "loss": 0.77282748, "memory(GiB)": 147.13, "step": 51610, "train_speed(iter/s)": 0.200793 }, { "acc": 0.79813347, "epoch": 1.2043336881552662, "grad_norm": 7.09375, "learning_rate": 3.606949386600166e-06, "loss": 0.71265526, "memory(GiB)": 147.13, "step": 51620, "train_speed(iter/s)": 0.200813 }, { "acc": 0.76442337, "epoch": 1.204566995727555, "grad_norm": 7.375, "learning_rate": 3.605135175151624e-06, "loss": 0.8571002, "memory(GiB)": 147.13, "step": 51630, "train_speed(iter/s)": 0.200833 }, { "acc": 0.78852344, "epoch": 1.204800303299844, "grad_norm": 4.78125, "learning_rate": 3.6033211628196308e-06, "loss": 0.74523458, "memory(GiB)": 147.13, "step": 51640, "train_speed(iter/s)": 0.200854 }, { "acc": 0.77451267, "epoch": 1.205033610872133, "grad_norm": 6.15625, "learning_rate": 3.601507349863137e-06, "loss": 0.79993911, "memory(GiB)": 147.13, "step": 51650, "train_speed(iter/s)": 0.200874 }, { "acc": 0.77563667, "epoch": 1.2052669184444218, "grad_norm": 6.34375, "learning_rate": 3.599693736541061e-06, "loss": 0.81237659, "memory(GiB)": 147.13, "step": 51660, "train_speed(iter/s)": 0.200894 }, { "acc": 0.76454887, "epoch": 1.2055002260167107, "grad_norm": 5.71875, "learning_rate": 3.5978803231122977e-06, "loss": 0.86580725, "memory(GiB)": 147.13, "step": 51670, "train_speed(iter/s)": 0.200914 }, { "acc": 0.77970967, "epoch": 1.2057335335889996, "grad_norm": 5.125, "learning_rate": 3.596067109835713e-06, "loss": 0.78640342, "memory(GiB)": 147.13, "step": 51680, "train_speed(iter/s)": 0.200932 }, { "acc": 0.77567797, "epoch": 1.2059668411612885, "grad_norm": 3.65625, "learning_rate": 3.5942540969701386e-06, "loss": 0.79667077, "memory(GiB)": 147.13, "step": 51690, "train_speed(iter/s)": 0.200952 }, { "acc": 0.77737932, "epoch": 1.2062001487335774, "grad_norm": 4.84375, "learning_rate": 3.592441284774383e-06, "loss": 0.79587197, "memory(GiB)": 147.13, "step": 51700, "train_speed(iter/s)": 0.200972 }, { "acc": 0.770786, "epoch": 1.2064334563058663, "grad_norm": 6.46875, "learning_rate": 3.5906286735072255e-06, "loss": 0.82086658, "memory(GiB)": 147.13, "step": 51710, "train_speed(iter/s)": 0.200992 }, { "acc": 0.78262281, "epoch": 1.2066667638781552, "grad_norm": 4.4375, "learning_rate": 3.5888162634274154e-06, "loss": 0.77802362, "memory(GiB)": 147.13, "step": 51720, "train_speed(iter/s)": 0.201013 }, { "acc": 0.7677485, "epoch": 1.206900071450444, "grad_norm": 4.5625, "learning_rate": 3.5870040547936748e-06, "loss": 0.83916025, "memory(GiB)": 147.13, "step": 51730, "train_speed(iter/s)": 0.201034 }, { "acc": 0.77800913, "epoch": 1.207133379022733, "grad_norm": 6.09375, "learning_rate": 3.585192047864694e-06, "loss": 0.80633202, "memory(GiB)": 147.13, "step": 51740, "train_speed(iter/s)": 0.201054 }, { "acc": 0.75977621, "epoch": 1.2073666865950219, "grad_norm": 6.5, "learning_rate": 3.5833802428991373e-06, "loss": 0.86888771, "memory(GiB)": 147.13, "step": 51750, "train_speed(iter/s)": 0.201074 }, { "acc": 0.76785235, "epoch": 1.2075999941673108, "grad_norm": 5.3125, "learning_rate": 3.581568640155639e-06, "loss": 0.82541256, "memory(GiB)": 147.13, "step": 51760, "train_speed(iter/s)": 0.201094 }, { "acc": 0.78889942, "epoch": 1.2078333017395995, "grad_norm": 5.21875, "learning_rate": 3.5797572398928053e-06, "loss": 0.73905168, "memory(GiB)": 147.13, "step": 51770, "train_speed(iter/s)": 0.201111 }, { "acc": 0.78549714, "epoch": 1.2080666093118886, "grad_norm": 5.1875, "learning_rate": 3.5779460423692136e-06, "loss": 0.77860193, "memory(GiB)": 147.13, "step": 51780, "train_speed(iter/s)": 0.201131 }, { "acc": 0.77393236, "epoch": 1.2082999168841773, "grad_norm": 4.46875, "learning_rate": 3.5761350478434133e-06, "loss": 0.82066584, "memory(GiB)": 147.13, "step": 51790, "train_speed(iter/s)": 0.20115 }, { "acc": 0.76424036, "epoch": 1.2085332244564664, "grad_norm": 5.03125, "learning_rate": 3.5743242565739183e-06, "loss": 0.86626759, "memory(GiB)": 147.13, "step": 51800, "train_speed(iter/s)": 0.20117 }, { "acc": 0.77947626, "epoch": 1.208766532028755, "grad_norm": 5.71875, "learning_rate": 3.572513668819223e-06, "loss": 0.79311905, "memory(GiB)": 147.13, "step": 51810, "train_speed(iter/s)": 0.20119 }, { "acc": 0.77670922, "epoch": 1.208999839601044, "grad_norm": 6.03125, "learning_rate": 3.570703284837786e-06, "loss": 0.81235285, "memory(GiB)": 147.13, "step": 51820, "train_speed(iter/s)": 0.201211 }, { "acc": 0.79082127, "epoch": 1.2092331471733329, "grad_norm": 5.375, "learning_rate": 3.5688931048880397e-06, "loss": 0.75774207, "memory(GiB)": 147.13, "step": 51830, "train_speed(iter/s)": 0.20123 }, { "acc": 0.78081455, "epoch": 1.2094664547456218, "grad_norm": 5.375, "learning_rate": 3.567083129228387e-06, "loss": 0.76074839, "memory(GiB)": 147.13, "step": 51840, "train_speed(iter/s)": 0.201251 }, { "acc": 0.77981596, "epoch": 1.2096997623179107, "grad_norm": 6.0, "learning_rate": 3.5652733581172015e-06, "loss": 0.7848402, "memory(GiB)": 147.13, "step": 51850, "train_speed(iter/s)": 0.201272 }, { "acc": 0.78686342, "epoch": 1.2099330698901996, "grad_norm": 6.28125, "learning_rate": 3.5634637918128267e-06, "loss": 0.77134657, "memory(GiB)": 147.13, "step": 51860, "train_speed(iter/s)": 0.201291 }, { "acc": 0.7602006, "epoch": 1.2101663774624885, "grad_norm": 6.46875, "learning_rate": 3.56165443057358e-06, "loss": 0.87410669, "memory(GiB)": 147.13, "step": 51870, "train_speed(iter/s)": 0.20131 }, { "acc": 0.78266001, "epoch": 1.2103996850347774, "grad_norm": 3.96875, "learning_rate": 3.5598452746577443e-06, "loss": 0.76107483, "memory(GiB)": 147.13, "step": 51880, "train_speed(iter/s)": 0.20133 }, { "acc": 0.7808578, "epoch": 1.2106329926070662, "grad_norm": 5.40625, "learning_rate": 3.5580363243235773e-06, "loss": 0.76968455, "memory(GiB)": 147.13, "step": 51890, "train_speed(iter/s)": 0.20135 }, { "acc": 0.78987422, "epoch": 1.2108663001793551, "grad_norm": 5.3125, "learning_rate": 3.556227579829306e-06, "loss": 0.75738258, "memory(GiB)": 147.13, "step": 51900, "train_speed(iter/s)": 0.201369 }, { "acc": 0.78409944, "epoch": 1.211099607751644, "grad_norm": 7.03125, "learning_rate": 3.5544190414331305e-06, "loss": 0.77000666, "memory(GiB)": 147.13, "step": 51910, "train_speed(iter/s)": 0.201389 }, { "acc": 0.76755562, "epoch": 1.211332915323933, "grad_norm": 6.5625, "learning_rate": 3.552610709393215e-06, "loss": 0.83009071, "memory(GiB)": 147.13, "step": 51920, "train_speed(iter/s)": 0.201409 }, { "acc": 0.77854753, "epoch": 1.2115662228962218, "grad_norm": 4.71875, "learning_rate": 3.5508025839676997e-06, "loss": 0.79306674, "memory(GiB)": 147.13, "step": 51930, "train_speed(iter/s)": 0.201429 }, { "acc": 0.78213997, "epoch": 1.2117995304685107, "grad_norm": 7.90625, "learning_rate": 3.5489946654146945e-06, "loss": 0.77478514, "memory(GiB)": 147.13, "step": 51940, "train_speed(iter/s)": 0.20145 }, { "acc": 0.79072771, "epoch": 1.2120328380407996, "grad_norm": 6.75, "learning_rate": 3.547186953992281e-06, "loss": 0.74444256, "memory(GiB)": 147.13, "step": 51950, "train_speed(iter/s)": 0.20147 }, { "acc": 0.78262076, "epoch": 1.2122661456130885, "grad_norm": 9.25, "learning_rate": 3.5453794499585057e-06, "loss": 0.77354512, "memory(GiB)": 147.13, "step": 51960, "train_speed(iter/s)": 0.20149 }, { "acc": 0.79070244, "epoch": 1.2124994531853774, "grad_norm": 4.3125, "learning_rate": 3.543572153571393e-06, "loss": 0.73883228, "memory(GiB)": 147.13, "step": 51970, "train_speed(iter/s)": 0.20151 }, { "acc": 0.78741131, "epoch": 1.2127327607576663, "grad_norm": 5.40625, "learning_rate": 3.541765065088931e-06, "loss": 0.76005692, "memory(GiB)": 147.13, "step": 51980, "train_speed(iter/s)": 0.20153 }, { "acc": 0.77303057, "epoch": 1.2129660683299552, "grad_norm": 6.53125, "learning_rate": 3.539958184769082e-06, "loss": 0.80659695, "memory(GiB)": 147.13, "step": 51990, "train_speed(iter/s)": 0.20155 }, { "acc": 0.78834496, "epoch": 1.2131993759022441, "grad_norm": 5.28125, "learning_rate": 3.53815151286978e-06, "loss": 0.74369602, "memory(GiB)": 147.13, "step": 52000, "train_speed(iter/s)": 0.20157 }, { "epoch": 1.2131993759022441, "eval_acc": 0.7444085240931393, "eval_loss": 0.8053334355354309, "eval_runtime": 1270.0, "eval_samples_per_second": 28.339, "eval_steps_per_second": 14.17, "step": 52000 }, { "acc": 0.77881718, "epoch": 1.213432683474533, "grad_norm": 5.96875, "learning_rate": 3.536345049648924e-06, "loss": 0.78626618, "memory(GiB)": 147.13, "step": 52010, "train_speed(iter/s)": 0.200588 }, { "acc": 0.77282524, "epoch": 1.213665991046822, "grad_norm": 6.09375, "learning_rate": 3.5345387953643872e-06, "loss": 0.82502003, "memory(GiB)": 147.13, "step": 52020, "train_speed(iter/s)": 0.200607 }, { "acc": 0.79441557, "epoch": 1.2138992986191108, "grad_norm": 4.40625, "learning_rate": 3.5327327502740114e-06, "loss": 0.73947439, "memory(GiB)": 147.13, "step": 52030, "train_speed(iter/s)": 0.200626 }, { "acc": 0.7964479, "epoch": 1.2141326061913997, "grad_norm": 5.375, "learning_rate": 3.5309269146356097e-06, "loss": 0.72861004, "memory(GiB)": 147.13, "step": 52040, "train_speed(iter/s)": 0.200645 }, { "acc": 0.78275375, "epoch": 1.2143659137636886, "grad_norm": 4.9375, "learning_rate": 3.5291212887069624e-06, "loss": 0.77264972, "memory(GiB)": 147.13, "step": 52050, "train_speed(iter/s)": 0.200664 }, { "acc": 0.77427526, "epoch": 1.2145992213359775, "grad_norm": 8.25, "learning_rate": 3.5273158727458253e-06, "loss": 0.82076292, "memory(GiB)": 147.13, "step": 52060, "train_speed(iter/s)": 0.200684 }, { "acc": 0.78050809, "epoch": 1.2148325289082664, "grad_norm": 5.0625, "learning_rate": 3.5255106670099186e-06, "loss": 0.79774323, "memory(GiB)": 147.13, "step": 52070, "train_speed(iter/s)": 0.200704 }, { "acc": 0.77670012, "epoch": 1.2150658364805553, "grad_norm": 5.375, "learning_rate": 3.5237056717569363e-06, "loss": 0.80653028, "memory(GiB)": 147.13, "step": 52080, "train_speed(iter/s)": 0.200723 }, { "acc": 0.77818155, "epoch": 1.2152991440528442, "grad_norm": 5.09375, "learning_rate": 3.5219008872445414e-06, "loss": 0.80688896, "memory(GiB)": 147.13, "step": 52090, "train_speed(iter/s)": 0.200743 }, { "acc": 0.79195747, "epoch": 1.2155324516251331, "grad_norm": 4.40625, "learning_rate": 3.5200963137303644e-06, "loss": 0.73866539, "memory(GiB)": 147.13, "step": 52100, "train_speed(iter/s)": 0.200764 }, { "acc": 0.77116313, "epoch": 1.215765759197422, "grad_norm": 4.375, "learning_rate": 3.5182919514720087e-06, "loss": 0.81336851, "memory(GiB)": 147.13, "step": 52110, "train_speed(iter/s)": 0.200784 }, { "acc": 0.79589829, "epoch": 1.215999066769711, "grad_norm": 4.9375, "learning_rate": 3.5164878007270464e-06, "loss": 0.72459707, "memory(GiB)": 147.13, "step": 52120, "train_speed(iter/s)": 0.200803 }, { "acc": 0.7925993, "epoch": 1.2162323743419998, "grad_norm": 8.5625, "learning_rate": 3.5146838617530197e-06, "loss": 0.7503953, "memory(GiB)": 147.13, "step": 52130, "train_speed(iter/s)": 0.200822 }, { "acc": 0.77429123, "epoch": 1.2164656819142887, "grad_norm": 5.34375, "learning_rate": 3.5128801348074426e-06, "loss": 0.80790281, "memory(GiB)": 147.13, "step": 52140, "train_speed(iter/s)": 0.200843 }, { "acc": 0.79118776, "epoch": 1.2166989894865776, "grad_norm": 7.59375, "learning_rate": 3.511076620147792e-06, "loss": 0.74122844, "memory(GiB)": 147.13, "step": 52150, "train_speed(iter/s)": 0.200863 }, { "acc": 0.79350772, "epoch": 1.2169322970588663, "grad_norm": 4.96875, "learning_rate": 3.5092733180315206e-06, "loss": 0.75740881, "memory(GiB)": 147.13, "step": 52160, "train_speed(iter/s)": 0.200883 }, { "acc": 0.80063057, "epoch": 1.2171656046311554, "grad_norm": 6.40625, "learning_rate": 3.5074702287160523e-06, "loss": 0.70421405, "memory(GiB)": 147.13, "step": 52170, "train_speed(iter/s)": 0.200903 }, { "acc": 0.78242369, "epoch": 1.217398912203444, "grad_norm": 5.9375, "learning_rate": 3.5056673524587733e-06, "loss": 0.79225969, "memory(GiB)": 147.13, "step": 52180, "train_speed(iter/s)": 0.200923 }, { "acc": 0.77852802, "epoch": 1.2176322197757332, "grad_norm": 4.40625, "learning_rate": 3.503864689517046e-06, "loss": 0.80126581, "memory(GiB)": 147.13, "step": 52190, "train_speed(iter/s)": 0.200944 }, { "acc": 0.78113604, "epoch": 1.217865527348022, "grad_norm": 4.71875, "learning_rate": 3.5020622401481996e-06, "loss": 0.8105751, "memory(GiB)": 147.13, "step": 52200, "train_speed(iter/s)": 0.200964 }, { "acc": 0.76554623, "epoch": 1.2180988349203108, "grad_norm": 5.71875, "learning_rate": 3.500260004609533e-06, "loss": 0.83915272, "memory(GiB)": 147.13, "step": 52210, "train_speed(iter/s)": 0.200984 }, { "acc": 0.76399746, "epoch": 1.2183321424925997, "grad_norm": 5.4375, "learning_rate": 3.4984579831583166e-06, "loss": 0.82813606, "memory(GiB)": 147.13, "step": 52220, "train_speed(iter/s)": 0.201005 }, { "acc": 0.79804144, "epoch": 1.2185654500648886, "grad_norm": 14.8125, "learning_rate": 3.4966561760517852e-06, "loss": 0.71784048, "memory(GiB)": 147.13, "step": 52230, "train_speed(iter/s)": 0.201026 }, { "acc": 0.77001858, "epoch": 1.2187987576371775, "grad_norm": 4.96875, "learning_rate": 3.494854583547148e-06, "loss": 0.8265419, "memory(GiB)": 147.13, "step": 52240, "train_speed(iter/s)": 0.201046 }, { "acc": 0.80135403, "epoch": 1.2190320652094664, "grad_norm": 5.8125, "learning_rate": 3.4930532059015845e-06, "loss": 0.70065279, "memory(GiB)": 147.13, "step": 52250, "train_speed(iter/s)": 0.201066 }, { "acc": 0.77459936, "epoch": 1.2192653727817553, "grad_norm": 7.46875, "learning_rate": 3.491252043372236e-06, "loss": 0.81747303, "memory(GiB)": 147.13, "step": 52260, "train_speed(iter/s)": 0.201087 }, { "acc": 0.79471178, "epoch": 1.2194986803540442, "grad_norm": 7.625, "learning_rate": 3.4894510962162194e-06, "loss": 0.72956462, "memory(GiB)": 147.13, "step": 52270, "train_speed(iter/s)": 0.201106 }, { "acc": 0.76919899, "epoch": 1.219731987926333, "grad_norm": 6.25, "learning_rate": 3.4876503646906203e-06, "loss": 0.82097034, "memory(GiB)": 147.13, "step": 52280, "train_speed(iter/s)": 0.201125 }, { "acc": 0.78185291, "epoch": 1.219965295498622, "grad_norm": 4.71875, "learning_rate": 3.4858498490524924e-06, "loss": 0.80037918, "memory(GiB)": 147.13, "step": 52290, "train_speed(iter/s)": 0.201146 }, { "acc": 0.7733079, "epoch": 1.2201986030709109, "grad_norm": 4.65625, "learning_rate": 3.4840495495588593e-06, "loss": 0.78827391, "memory(GiB)": 147.13, "step": 52300, "train_speed(iter/s)": 0.201166 }, { "acc": 0.78070698, "epoch": 1.2204319106431998, "grad_norm": 4.59375, "learning_rate": 3.4822494664667117e-06, "loss": 0.80994854, "memory(GiB)": 147.13, "step": 52310, "train_speed(iter/s)": 0.201186 }, { "acc": 0.78385916, "epoch": 1.2206652182154887, "grad_norm": 5.4375, "learning_rate": 3.4804496000330124e-06, "loss": 0.75973425, "memory(GiB)": 147.13, "step": 52320, "train_speed(iter/s)": 0.201206 }, { "acc": 0.78199615, "epoch": 1.2208985257877776, "grad_norm": 6.09375, "learning_rate": 3.478649950514691e-06, "loss": 0.76241202, "memory(GiB)": 147.13, "step": 52330, "train_speed(iter/s)": 0.201227 }, { "acc": 0.79231329, "epoch": 1.2211318333600665, "grad_norm": 8.8125, "learning_rate": 3.4768505181686468e-06, "loss": 0.71977391, "memory(GiB)": 147.13, "step": 52340, "train_speed(iter/s)": 0.201246 }, { "acc": 0.77791686, "epoch": 1.2213651409323554, "grad_norm": 6.875, "learning_rate": 3.4750513032517493e-06, "loss": 0.79545641, "memory(GiB)": 147.13, "step": 52350, "train_speed(iter/s)": 0.201267 }, { "acc": 0.78212147, "epoch": 1.2215984485046443, "grad_norm": 4.8125, "learning_rate": 3.473252306020837e-06, "loss": 0.7947474, "memory(GiB)": 147.13, "step": 52360, "train_speed(iter/s)": 0.201287 }, { "acc": 0.77962971, "epoch": 1.2218317560769332, "grad_norm": 6.71875, "learning_rate": 3.471453526732712e-06, "loss": 0.79630547, "memory(GiB)": 147.13, "step": 52370, "train_speed(iter/s)": 0.201308 }, { "acc": 0.78212519, "epoch": 1.222065063649222, "grad_norm": 5.90625, "learning_rate": 3.4696549656441537e-06, "loss": 0.77139959, "memory(GiB)": 147.13, "step": 52380, "train_speed(iter/s)": 0.201328 }, { "acc": 0.76968145, "epoch": 1.222298371221511, "grad_norm": 6.375, "learning_rate": 3.467856623011903e-06, "loss": 0.83397465, "memory(GiB)": 147.13, "step": 52390, "train_speed(iter/s)": 0.201348 }, { "acc": 0.79080944, "epoch": 1.2225316787937999, "grad_norm": 4.09375, "learning_rate": 3.4660584990926748e-06, "loss": 0.73990412, "memory(GiB)": 147.13, "step": 52400, "train_speed(iter/s)": 0.201367 }, { "acc": 0.77898731, "epoch": 1.2227649863660888, "grad_norm": 6.5, "learning_rate": 3.4642605941431494e-06, "loss": 0.78911119, "memory(GiB)": 147.13, "step": 52410, "train_speed(iter/s)": 0.201387 }, { "acc": 0.76685104, "epoch": 1.2229982939383777, "grad_norm": 4.1875, "learning_rate": 3.462462908419979e-06, "loss": 0.82818241, "memory(GiB)": 147.13, "step": 52420, "train_speed(iter/s)": 0.201405 }, { "acc": 0.78912749, "epoch": 1.2232316015106666, "grad_norm": 4.4375, "learning_rate": 3.4606654421797814e-06, "loss": 0.75287247, "memory(GiB)": 147.13, "step": 52430, "train_speed(iter/s)": 0.201424 }, { "acc": 0.77771249, "epoch": 1.2234649090829555, "grad_norm": 5.40625, "learning_rate": 3.458868195679146e-06, "loss": 0.81555805, "memory(GiB)": 147.13, "step": 52440, "train_speed(iter/s)": 0.201444 }, { "acc": 0.78952436, "epoch": 1.2236982166552444, "grad_norm": 4.28125, "learning_rate": 3.4570711691746262e-06, "loss": 0.772684, "memory(GiB)": 147.13, "step": 52450, "train_speed(iter/s)": 0.201465 }, { "acc": 0.77552791, "epoch": 1.2239315242275333, "grad_norm": 4.65625, "learning_rate": 3.4552743629227494e-06, "loss": 0.80711937, "memory(GiB)": 147.13, "step": 52460, "train_speed(iter/s)": 0.201485 }, { "acc": 0.7582983, "epoch": 1.2241648317998222, "grad_norm": 7.4375, "learning_rate": 3.4534777771800083e-06, "loss": 0.85153837, "memory(GiB)": 147.13, "step": 52470, "train_speed(iter/s)": 0.201505 }, { "acc": 0.77164292, "epoch": 1.224398139372111, "grad_norm": 5.0625, "learning_rate": 3.4516814122028676e-06, "loss": 0.84448309, "memory(GiB)": 147.13, "step": 52480, "train_speed(iter/s)": 0.201523 }, { "acc": 0.76962109, "epoch": 1.2246314469444, "grad_norm": 6.625, "learning_rate": 3.449885268247753e-06, "loss": 0.81848526, "memory(GiB)": 147.13, "step": 52490, "train_speed(iter/s)": 0.201542 }, { "acc": 0.78164902, "epoch": 1.2248647545166889, "grad_norm": 5.28125, "learning_rate": 3.448089345571066e-06, "loss": 0.78736157, "memory(GiB)": 147.13, "step": 52500, "train_speed(iter/s)": 0.201562 }, { "epoch": 1.2248647545166889, "eval_acc": 0.7444283819779446, "eval_loss": 0.8053562045097351, "eval_runtime": 1270.1507, "eval_samples_per_second": 28.336, "eval_steps_per_second": 14.168, "step": 52500 }, { "acc": 0.77160788, "epoch": 1.2250980620889778, "grad_norm": 4.78125, "learning_rate": 3.4462936444291744e-06, "loss": 0.80924244, "memory(GiB)": 147.13, "step": 52510, "train_speed(iter/s)": 0.200588 }, { "acc": 0.78200493, "epoch": 1.2253313696612667, "grad_norm": 4.40625, "learning_rate": 3.4444981650784147e-06, "loss": 0.78650246, "memory(GiB)": 147.13, "step": 52520, "train_speed(iter/s)": 0.200607 }, { "acc": 0.77312975, "epoch": 1.2255646772335556, "grad_norm": 5.5, "learning_rate": 3.4427029077750895e-06, "loss": 0.81262064, "memory(GiB)": 147.13, "step": 52530, "train_speed(iter/s)": 0.200627 }, { "acc": 0.78849087, "epoch": 1.2257979848058445, "grad_norm": 9.75, "learning_rate": 3.4409078727754707e-06, "loss": 0.75306044, "memory(GiB)": 147.13, "step": 52540, "train_speed(iter/s)": 0.200647 }, { "acc": 0.78517804, "epoch": 1.2260312923781331, "grad_norm": 4.9375, "learning_rate": 3.4391130603358013e-06, "loss": 0.7650878, "memory(GiB)": 147.13, "step": 52550, "train_speed(iter/s)": 0.200666 }, { "acc": 0.81452112, "epoch": 1.2262645999504223, "grad_norm": 4.4375, "learning_rate": 3.4373184707122886e-06, "loss": 0.67917662, "memory(GiB)": 147.13, "step": 52560, "train_speed(iter/s)": 0.200686 }, { "acc": 0.80121727, "epoch": 1.226497907522711, "grad_norm": 4.375, "learning_rate": 3.4355241041611096e-06, "loss": 0.69461622, "memory(GiB)": 147.13, "step": 52570, "train_speed(iter/s)": 0.200706 }, { "acc": 0.78457727, "epoch": 1.226731215095, "grad_norm": 5.125, "learning_rate": 3.4337299609384122e-06, "loss": 0.76386065, "memory(GiB)": 147.13, "step": 52580, "train_speed(iter/s)": 0.200725 }, { "acc": 0.7923645, "epoch": 1.2269645226672887, "grad_norm": 6.375, "learning_rate": 3.431936041300308e-06, "loss": 0.75402551, "memory(GiB)": 147.13, "step": 52590, "train_speed(iter/s)": 0.200744 }, { "acc": 0.78272791, "epoch": 1.2271978302395776, "grad_norm": 4.96875, "learning_rate": 3.4301423455028777e-06, "loss": 0.77303209, "memory(GiB)": 147.13, "step": 52600, "train_speed(iter/s)": 0.200763 }, { "acc": 0.77151518, "epoch": 1.2274311378118665, "grad_norm": 4.4375, "learning_rate": 3.4283488738021707e-06, "loss": 0.83244991, "memory(GiB)": 147.13, "step": 52610, "train_speed(iter/s)": 0.200781 }, { "acc": 0.79330654, "epoch": 1.2276644453841554, "grad_norm": 5.3125, "learning_rate": 3.4265556264542054e-06, "loss": 0.72327337, "memory(GiB)": 147.13, "step": 52620, "train_speed(iter/s)": 0.200798 }, { "acc": 0.7816957, "epoch": 1.2278977529564443, "grad_norm": 6.5, "learning_rate": 3.424762603714967e-06, "loss": 0.794205, "memory(GiB)": 147.13, "step": 52630, "train_speed(iter/s)": 0.200818 }, { "acc": 0.7910058, "epoch": 1.2281310605287332, "grad_norm": 4.5625, "learning_rate": 3.4229698058404106e-06, "loss": 0.7762475, "memory(GiB)": 147.13, "step": 52640, "train_speed(iter/s)": 0.200836 }, { "acc": 0.7753264, "epoch": 1.2283643681010221, "grad_norm": 8.4375, "learning_rate": 3.4211772330864552e-06, "loss": 0.81560392, "memory(GiB)": 147.13, "step": 52650, "train_speed(iter/s)": 0.200856 }, { "acc": 0.76706119, "epoch": 1.228597675673311, "grad_norm": 7.78125, "learning_rate": 3.4193848857089924e-06, "loss": 0.85505123, "memory(GiB)": 147.13, "step": 52660, "train_speed(iter/s)": 0.200878 }, { "acc": 0.78135409, "epoch": 1.2288309832456, "grad_norm": 5.03125, "learning_rate": 3.4175927639638767e-06, "loss": 0.76477842, "memory(GiB)": 147.13, "step": 52670, "train_speed(iter/s)": 0.200897 }, { "acc": 0.79110146, "epoch": 1.2290642908178888, "grad_norm": 5.3125, "learning_rate": 3.4158008681069343e-06, "loss": 0.76938314, "memory(GiB)": 147.13, "step": 52680, "train_speed(iter/s)": 0.200918 }, { "acc": 0.7797955, "epoch": 1.2292975983901777, "grad_norm": 6.0, "learning_rate": 3.4140091983939584e-06, "loss": 0.78467655, "memory(GiB)": 147.13, "step": 52690, "train_speed(iter/s)": 0.200938 }, { "acc": 0.78217092, "epoch": 1.2295309059624666, "grad_norm": 5.25, "learning_rate": 3.4122177550807077e-06, "loss": 0.79160671, "memory(GiB)": 147.13, "step": 52700, "train_speed(iter/s)": 0.200959 }, { "acc": 0.79885192, "epoch": 1.2297642135347555, "grad_norm": 5.25, "learning_rate": 3.410426538422914e-06, "loss": 0.71652822, "memory(GiB)": 147.13, "step": 52710, "train_speed(iter/s)": 0.20098 }, { "acc": 0.78403473, "epoch": 1.2299975211070444, "grad_norm": 6.59375, "learning_rate": 3.4086355486762678e-06, "loss": 0.78251448, "memory(GiB)": 147.13, "step": 52720, "train_speed(iter/s)": 0.201 }, { "acc": 0.76983075, "epoch": 1.2302308286793333, "grad_norm": 7.3125, "learning_rate": 3.406844786096435e-06, "loss": 0.8267252, "memory(GiB)": 147.13, "step": 52730, "train_speed(iter/s)": 0.201019 }, { "acc": 0.77623463, "epoch": 1.2304641362516222, "grad_norm": 5.375, "learning_rate": 3.405054250939047e-06, "loss": 0.81395969, "memory(GiB)": 147.13, "step": 52740, "train_speed(iter/s)": 0.201039 }, { "acc": 0.76780348, "epoch": 1.230697443823911, "grad_norm": 7.53125, "learning_rate": 3.4032639434597003e-06, "loss": 0.83625088, "memory(GiB)": 147.13, "step": 52750, "train_speed(iter/s)": 0.201059 }, { "acc": 0.79240489, "epoch": 1.2309307513962, "grad_norm": 5.5, "learning_rate": 3.4014738639139622e-06, "loss": 0.75073404, "memory(GiB)": 147.13, "step": 52760, "train_speed(iter/s)": 0.201079 }, { "acc": 0.79068785, "epoch": 1.231164058968489, "grad_norm": 5.4375, "learning_rate": 3.399684012557365e-06, "loss": 0.75344276, "memory(GiB)": 147.13, "step": 52770, "train_speed(iter/s)": 0.201097 }, { "acc": 0.81060543, "epoch": 1.2313973665407778, "grad_norm": 4.625, "learning_rate": 3.3978943896454107e-06, "loss": 0.66089754, "memory(GiB)": 147.13, "step": 52780, "train_speed(iter/s)": 0.201117 }, { "acc": 0.79075351, "epoch": 1.2316306741130667, "grad_norm": 6.84375, "learning_rate": 3.396104995433567e-06, "loss": 0.73719893, "memory(GiB)": 147.13, "step": 52790, "train_speed(iter/s)": 0.201137 }, { "acc": 0.78999639, "epoch": 1.2318639816853556, "grad_norm": 6.25, "learning_rate": 3.3943158301772695e-06, "loss": 0.74546919, "memory(GiB)": 147.13, "step": 52800, "train_speed(iter/s)": 0.201157 }, { "acc": 0.79035387, "epoch": 1.2320972892576445, "grad_norm": 3.6875, "learning_rate": 3.39252689413192e-06, "loss": 0.75522537, "memory(GiB)": 147.13, "step": 52810, "train_speed(iter/s)": 0.201176 }, { "acc": 0.77618947, "epoch": 1.2323305968299334, "grad_norm": 4.9375, "learning_rate": 3.3907381875528916e-06, "loss": 0.81478825, "memory(GiB)": 147.13, "step": 52820, "train_speed(iter/s)": 0.201194 }, { "acc": 0.75931473, "epoch": 1.2325639044022223, "grad_norm": 5.125, "learning_rate": 3.388949710695517e-06, "loss": 0.86494942, "memory(GiB)": 147.13, "step": 52830, "train_speed(iter/s)": 0.201214 }, { "acc": 0.7774549, "epoch": 1.2327972119745112, "grad_norm": 8.5625, "learning_rate": 3.387161463815104e-06, "loss": 0.80290031, "memory(GiB)": 147.13, "step": 52840, "train_speed(iter/s)": 0.201234 }, { "acc": 0.7622488, "epoch": 1.2330305195468, "grad_norm": 5.0, "learning_rate": 3.3853734471669232e-06, "loss": 0.86460876, "memory(GiB)": 147.13, "step": 52850, "train_speed(iter/s)": 0.201254 }, { "acc": 0.78175044, "epoch": 1.233263827119089, "grad_norm": 7.09375, "learning_rate": 3.3835856610062135e-06, "loss": 0.76835246, "memory(GiB)": 147.13, "step": 52860, "train_speed(iter/s)": 0.201275 }, { "acc": 0.77455969, "epoch": 1.233497134691378, "grad_norm": 6.3125, "learning_rate": 3.381798105588181e-06, "loss": 0.79920435, "memory(GiB)": 147.13, "step": 52870, "train_speed(iter/s)": 0.201295 }, { "acc": 0.7796454, "epoch": 1.2337304422636668, "grad_norm": 6.21875, "learning_rate": 3.3800107811680004e-06, "loss": 0.78245277, "memory(GiB)": 147.13, "step": 52880, "train_speed(iter/s)": 0.201315 }, { "acc": 0.77907124, "epoch": 1.2339637498359557, "grad_norm": 5.46875, "learning_rate": 3.378223688000809e-06, "loss": 0.78705425, "memory(GiB)": 147.13, "step": 52890, "train_speed(iter/s)": 0.201334 }, { "acc": 0.7818182, "epoch": 1.2341970574082446, "grad_norm": 4.96875, "learning_rate": 3.3764368263417146e-06, "loss": 0.76016235, "memory(GiB)": 147.13, "step": 52900, "train_speed(iter/s)": 0.201355 }, { "acc": 0.78806572, "epoch": 1.2344303649805335, "grad_norm": 8.0, "learning_rate": 3.3746501964457916e-06, "loss": 0.75710969, "memory(GiB)": 147.13, "step": 52910, "train_speed(iter/s)": 0.201375 }, { "acc": 0.78649473, "epoch": 1.2346636725528224, "grad_norm": 5.28125, "learning_rate": 3.3728637985680814e-06, "loss": 0.7589705, "memory(GiB)": 147.13, "step": 52920, "train_speed(iter/s)": 0.201395 }, { "acc": 0.79017825, "epoch": 1.2348969801251113, "grad_norm": 6.28125, "learning_rate": 3.371077632963592e-06, "loss": 0.74173431, "memory(GiB)": 147.13, "step": 52930, "train_speed(iter/s)": 0.201413 }, { "acc": 0.77001181, "epoch": 1.2351302876974, "grad_norm": 9.25, "learning_rate": 3.3692916998872972e-06, "loss": 0.82475138, "memory(GiB)": 147.13, "step": 52940, "train_speed(iter/s)": 0.201433 }, { "acc": 0.78582697, "epoch": 1.235363595269689, "grad_norm": 5.40625, "learning_rate": 3.367505999594138e-06, "loss": 0.76669807, "memory(GiB)": 147.13, "step": 52950, "train_speed(iter/s)": 0.201453 }, { "acc": 0.7884789, "epoch": 1.2355969028419778, "grad_norm": 7.0, "learning_rate": 3.3657205323390234e-06, "loss": 0.76776428, "memory(GiB)": 147.13, "step": 52960, "train_speed(iter/s)": 0.201474 }, { "acc": 0.77210593, "epoch": 1.2358302104142667, "grad_norm": 4.9375, "learning_rate": 3.3639352983768276e-06, "loss": 0.8105154, "memory(GiB)": 147.13, "step": 52970, "train_speed(iter/s)": 0.201495 }, { "acc": 0.80149565, "epoch": 1.2360635179865556, "grad_norm": 9.625, "learning_rate": 3.3621502979623923e-06, "loss": 0.69630108, "memory(GiB)": 147.13, "step": 52980, "train_speed(iter/s)": 0.201515 }, { "acc": 0.77142386, "epoch": 1.2362968255588445, "grad_norm": 10.125, "learning_rate": 3.360365531350527e-06, "loss": 0.830303, "memory(GiB)": 147.13, "step": 52990, "train_speed(iter/s)": 0.201536 }, { "acc": 0.78944068, "epoch": 1.2365301331311334, "grad_norm": 5.59375, "learning_rate": 3.358580998796005e-06, "loss": 0.75569763, "memory(GiB)": 147.13, "step": 53000, "train_speed(iter/s)": 0.201555 }, { "epoch": 1.2365301331311334, "eval_acc": 0.7444263001029247, "eval_loss": 0.8053351044654846, "eval_runtime": 1270.5258, "eval_samples_per_second": 28.328, "eval_steps_per_second": 14.164, "step": 53000 }, { "acc": 0.7961484, "epoch": 1.2367634407034223, "grad_norm": 5.375, "learning_rate": 3.3567967005535696e-06, "loss": 0.73044991, "memory(GiB)": 147.13, "step": 53010, "train_speed(iter/s)": 0.200588 }, { "acc": 0.77993622, "epoch": 1.2369967482757112, "grad_norm": 4.75, "learning_rate": 3.355012636877927e-06, "loss": 0.77450337, "memory(GiB)": 147.13, "step": 53020, "train_speed(iter/s)": 0.200607 }, { "acc": 0.78480597, "epoch": 1.237230055848, "grad_norm": 5.0, "learning_rate": 3.353228808023752e-06, "loss": 0.77071924, "memory(GiB)": 147.13, "step": 53030, "train_speed(iter/s)": 0.200626 }, { "acc": 0.8039917, "epoch": 1.237463363420289, "grad_norm": 4.46875, "learning_rate": 3.351445214245687e-06, "loss": 0.68680391, "memory(GiB)": 147.13, "step": 53040, "train_speed(iter/s)": 0.200645 }, { "acc": 0.78955479, "epoch": 1.2376966709925779, "grad_norm": 5.40625, "learning_rate": 3.3496618557983405e-06, "loss": 0.73595805, "memory(GiB)": 147.13, "step": 53050, "train_speed(iter/s)": 0.200665 }, { "acc": 0.79907274, "epoch": 1.2379299785648668, "grad_norm": 4.46875, "learning_rate": 3.347878732936283e-06, "loss": 0.73365273, "memory(GiB)": 147.13, "step": 53060, "train_speed(iter/s)": 0.200684 }, { "acc": 0.77731886, "epoch": 1.2381632861371556, "grad_norm": 4.65625, "learning_rate": 3.346095845914056e-06, "loss": 0.81012163, "memory(GiB)": 147.13, "step": 53070, "train_speed(iter/s)": 0.200704 }, { "acc": 0.76856656, "epoch": 1.2383965937094445, "grad_norm": 5.40625, "learning_rate": 3.3443131949861667e-06, "loss": 0.84078465, "memory(GiB)": 147.13, "step": 53080, "train_speed(iter/s)": 0.200722 }, { "acc": 0.78171597, "epoch": 1.2386299012817334, "grad_norm": 12.3125, "learning_rate": 3.3425307804070896e-06, "loss": 0.79482059, "memory(GiB)": 147.13, "step": 53090, "train_speed(iter/s)": 0.200742 }, { "acc": 0.78333645, "epoch": 1.2388632088540223, "grad_norm": 6.78125, "learning_rate": 3.3407486024312596e-06, "loss": 0.78734274, "memory(GiB)": 147.13, "step": 53100, "train_speed(iter/s)": 0.200761 }, { "acc": 0.7784162, "epoch": 1.2390965164263112, "grad_norm": 6.375, "learning_rate": 3.3389666613130856e-06, "loss": 0.79651308, "memory(GiB)": 147.13, "step": 53110, "train_speed(iter/s)": 0.200781 }, { "acc": 0.81201239, "epoch": 1.2393298239986001, "grad_norm": 3.671875, "learning_rate": 3.337184957306938e-06, "loss": 0.6528636, "memory(GiB)": 147.13, "step": 53120, "train_speed(iter/s)": 0.200801 }, { "acc": 0.79003553, "epoch": 1.239563131570889, "grad_norm": 7.8125, "learning_rate": 3.3354034906671545e-06, "loss": 0.77086411, "memory(GiB)": 147.13, "step": 53130, "train_speed(iter/s)": 0.200821 }, { "acc": 0.78376465, "epoch": 1.239796439143178, "grad_norm": 7.25, "learning_rate": 3.333622261648039e-06, "loss": 0.76185646, "memory(GiB)": 147.13, "step": 53140, "train_speed(iter/s)": 0.20084 }, { "acc": 0.7993577, "epoch": 1.2400297467154668, "grad_norm": 5.9375, "learning_rate": 3.3318412705038626e-06, "loss": 0.70314708, "memory(GiB)": 147.13, "step": 53150, "train_speed(iter/s)": 0.200859 }, { "acc": 0.77541742, "epoch": 1.2402630542877557, "grad_norm": 6.5625, "learning_rate": 3.330060517488861e-06, "loss": 0.81968384, "memory(GiB)": 147.13, "step": 53160, "train_speed(iter/s)": 0.20088 }, { "acc": 0.78192739, "epoch": 1.2404963618600446, "grad_norm": 6.625, "learning_rate": 3.328280002857234e-06, "loss": 0.77632504, "memory(GiB)": 147.13, "step": 53170, "train_speed(iter/s)": 0.2009 }, { "acc": 0.79778252, "epoch": 1.2407296694323335, "grad_norm": 6.5625, "learning_rate": 3.3264997268631515e-06, "loss": 0.75389705, "memory(GiB)": 147.13, "step": 53180, "train_speed(iter/s)": 0.200921 }, { "acc": 0.77507801, "epoch": 1.2409629770046224, "grad_norm": 5.0, "learning_rate": 3.324719689760746e-06, "loss": 0.80741787, "memory(GiB)": 147.13, "step": 53190, "train_speed(iter/s)": 0.200941 }, { "acc": 0.78459139, "epoch": 1.2411962845769113, "grad_norm": 5.78125, "learning_rate": 3.3229398918041184e-06, "loss": 0.76652284, "memory(GiB)": 147.13, "step": 53200, "train_speed(iter/s)": 0.200958 }, { "acc": 0.7865819, "epoch": 1.2414295921492002, "grad_norm": 4.125, "learning_rate": 3.321160333247334e-06, "loss": 0.76798935, "memory(GiB)": 147.13, "step": 53210, "train_speed(iter/s)": 0.200978 }, { "acc": 0.77011223, "epoch": 1.2416628997214891, "grad_norm": 8.125, "learning_rate": 3.319381014344424e-06, "loss": 0.81456013, "memory(GiB)": 147.13, "step": 53220, "train_speed(iter/s)": 0.200995 }, { "acc": 0.78842793, "epoch": 1.241896207293778, "grad_norm": 5.65625, "learning_rate": 3.3176019353493873e-06, "loss": 0.74759521, "memory(GiB)": 147.13, "step": 53230, "train_speed(iter/s)": 0.201014 }, { "acc": 0.75929575, "epoch": 1.242129514866067, "grad_norm": 5.09375, "learning_rate": 3.315823096516184e-06, "loss": 0.8730217, "memory(GiB)": 147.13, "step": 53240, "train_speed(iter/s)": 0.201033 }, { "acc": 0.78181305, "epoch": 1.2423628224383558, "grad_norm": 7.375, "learning_rate": 3.314044498098745e-06, "loss": 0.77940979, "memory(GiB)": 147.13, "step": 53250, "train_speed(iter/s)": 0.201051 }, { "acc": 0.79055223, "epoch": 1.2425961300106447, "grad_norm": 5.09375, "learning_rate": 3.3122661403509643e-06, "loss": 0.73277559, "memory(GiB)": 147.13, "step": 53260, "train_speed(iter/s)": 0.20107 }, { "acc": 0.78648615, "epoch": 1.2428294375829336, "grad_norm": 7.40625, "learning_rate": 3.3104880235267014e-06, "loss": 0.7687428, "memory(GiB)": 147.13, "step": 53270, "train_speed(iter/s)": 0.201089 }, { "acc": 0.77741222, "epoch": 1.2430627451552225, "grad_norm": 7.5625, "learning_rate": 3.3087101478797846e-06, "loss": 0.80210009, "memory(GiB)": 147.13, "step": 53280, "train_speed(iter/s)": 0.201108 }, { "acc": 0.77117157, "epoch": 1.2432960527275114, "grad_norm": 4.46875, "learning_rate": 3.3069325136640007e-06, "loss": 0.81199608, "memory(GiB)": 147.13, "step": 53290, "train_speed(iter/s)": 0.201126 }, { "acc": 0.77286162, "epoch": 1.2435293602998003, "grad_norm": 6.21875, "learning_rate": 3.305155121133109e-06, "loss": 0.80843086, "memory(GiB)": 147.13, "step": 53300, "train_speed(iter/s)": 0.201146 }, { "acc": 0.78854752, "epoch": 1.243762667872089, "grad_norm": 5.5625, "learning_rate": 3.303377970540832e-06, "loss": 0.76747618, "memory(GiB)": 147.13, "step": 53310, "train_speed(iter/s)": 0.201164 }, { "acc": 0.77600183, "epoch": 1.2439959754443781, "grad_norm": 5.75, "learning_rate": 3.3016010621408558e-06, "loss": 0.7977457, "memory(GiB)": 147.13, "step": 53320, "train_speed(iter/s)": 0.201183 }, { "acc": 0.79537611, "epoch": 1.2442292830166668, "grad_norm": 9.9375, "learning_rate": 3.299824396186835e-06, "loss": 0.73316031, "memory(GiB)": 147.13, "step": 53330, "train_speed(iter/s)": 0.201202 }, { "acc": 0.78379049, "epoch": 1.244462590588956, "grad_norm": 4.90625, "learning_rate": 3.2980479729323867e-06, "loss": 0.77379413, "memory(GiB)": 147.13, "step": 53340, "train_speed(iter/s)": 0.201222 }, { "acc": 0.75966711, "epoch": 1.2446958981612446, "grad_norm": 4.625, "learning_rate": 3.2962717926310966e-06, "loss": 0.87534847, "memory(GiB)": 147.13, "step": 53350, "train_speed(iter/s)": 0.20124 }, { "acc": 0.79759836, "epoch": 1.2449292057335335, "grad_norm": 3.828125, "learning_rate": 3.2944958555365135e-06, "loss": 0.71614137, "memory(GiB)": 147.13, "step": 53360, "train_speed(iter/s)": 0.201259 }, { "acc": 0.80230551, "epoch": 1.2451625133058224, "grad_norm": 5.625, "learning_rate": 3.292720161902152e-06, "loss": 0.69462061, "memory(GiB)": 147.13, "step": 53370, "train_speed(iter/s)": 0.201278 }, { "acc": 0.79200816, "epoch": 1.2453958208781113, "grad_norm": 6.5625, "learning_rate": 3.2909447119814907e-06, "loss": 0.74009314, "memory(GiB)": 147.13, "step": 53380, "train_speed(iter/s)": 0.201299 }, { "acc": 0.78173537, "epoch": 1.2456291284504002, "grad_norm": 4.8125, "learning_rate": 3.289169506027977e-06, "loss": 0.769735, "memory(GiB)": 147.13, "step": 53390, "train_speed(iter/s)": 0.201318 }, { "acc": 0.78138733, "epoch": 1.245862436022689, "grad_norm": 4.84375, "learning_rate": 3.287394544295018e-06, "loss": 0.79002943, "memory(GiB)": 147.13, "step": 53400, "train_speed(iter/s)": 0.201338 }, { "acc": 0.77280626, "epoch": 1.246095743594978, "grad_norm": 6.21875, "learning_rate": 3.2856198270359895e-06, "loss": 0.82154875, "memory(GiB)": 147.13, "step": 53410, "train_speed(iter/s)": 0.201357 }, { "acc": 0.79443498, "epoch": 1.2463290511672669, "grad_norm": 4.90625, "learning_rate": 3.2838453545042326e-06, "loss": 0.72624025, "memory(GiB)": 147.13, "step": 53420, "train_speed(iter/s)": 0.201376 }, { "acc": 0.78037777, "epoch": 1.2465623587395558, "grad_norm": 4.71875, "learning_rate": 3.2820711269530535e-06, "loss": 0.7707921, "memory(GiB)": 147.13, "step": 53430, "train_speed(iter/s)": 0.201395 }, { "acc": 0.78268867, "epoch": 1.2467956663118447, "grad_norm": 5.90625, "learning_rate": 3.280297144635721e-06, "loss": 0.76652794, "memory(GiB)": 147.13, "step": 53440, "train_speed(iter/s)": 0.201415 }, { "acc": 0.76735125, "epoch": 1.2470289738841336, "grad_norm": 4.125, "learning_rate": 3.278523407805474e-06, "loss": 0.83181791, "memory(GiB)": 147.13, "step": 53450, "train_speed(iter/s)": 0.201435 }, { "acc": 0.77343864, "epoch": 1.2472622814564225, "grad_norm": 6.125, "learning_rate": 3.276749916715508e-06, "loss": 0.81363611, "memory(GiB)": 147.13, "step": 53460, "train_speed(iter/s)": 0.201455 }, { "acc": 0.7868227, "epoch": 1.2474955890287114, "grad_norm": 5.28125, "learning_rate": 3.274976671618992e-06, "loss": 0.75913687, "memory(GiB)": 147.13, "step": 53470, "train_speed(iter/s)": 0.201475 }, { "acc": 0.7895021, "epoch": 1.2477288966010003, "grad_norm": 4.40625, "learning_rate": 3.2732036727690543e-06, "loss": 0.75617399, "memory(GiB)": 147.13, "step": 53480, "train_speed(iter/s)": 0.201495 }, { "acc": 0.78445072, "epoch": 1.2479622041732892, "grad_norm": 5.25, "learning_rate": 3.2714309204187905e-06, "loss": 0.77543073, "memory(GiB)": 147.13, "step": 53490, "train_speed(iter/s)": 0.201513 }, { "acc": 0.76975474, "epoch": 1.248195511745578, "grad_norm": 5.25, "learning_rate": 3.2696584148212606e-06, "loss": 0.8373764, "memory(GiB)": 147.13, "step": 53500, "train_speed(iter/s)": 0.201533 }, { "epoch": 1.248195511745578, "eval_acc": 0.744429663131803, "eval_loss": 0.8053871393203735, "eval_runtime": 1269.4157, "eval_samples_per_second": 28.352, "eval_steps_per_second": 14.177, "step": 53500 }, { "acc": 0.7716815, "epoch": 1.248428819317867, "grad_norm": 4.34375, "learning_rate": 3.2678861562294916e-06, "loss": 0.81136675, "memory(GiB)": 147.13, "step": 53510, "train_speed(iter/s)": 0.20058 }, { "acc": 0.77584715, "epoch": 1.2486621268901559, "grad_norm": 5.5, "learning_rate": 3.2661141448964688e-06, "loss": 0.78764925, "memory(GiB)": 147.13, "step": 53520, "train_speed(iter/s)": 0.200599 }, { "acc": 0.78310165, "epoch": 1.2488954344624448, "grad_norm": 4.25, "learning_rate": 3.2643423810751497e-06, "loss": 0.76774969, "memory(GiB)": 147.13, "step": 53530, "train_speed(iter/s)": 0.200618 }, { "acc": 0.76813574, "epoch": 1.2491287420347337, "grad_norm": 5.78125, "learning_rate": 3.2625708650184496e-06, "loss": 0.83579845, "memory(GiB)": 147.13, "step": 53540, "train_speed(iter/s)": 0.200637 }, { "acc": 0.77997684, "epoch": 1.2493620496070226, "grad_norm": 6.34375, "learning_rate": 3.260799596979254e-06, "loss": 0.79803414, "memory(GiB)": 147.13, "step": 53550, "train_speed(iter/s)": 0.200657 }, { "acc": 0.78893514, "epoch": 1.2495953571793115, "grad_norm": 4.4375, "learning_rate": 3.25902857721041e-06, "loss": 0.76923342, "memory(GiB)": 147.13, "step": 53560, "train_speed(iter/s)": 0.200676 }, { "acc": 0.76379528, "epoch": 1.2498286647516004, "grad_norm": 4.34375, "learning_rate": 3.257257805964732e-06, "loss": 0.85577602, "memory(GiB)": 147.13, "step": 53570, "train_speed(iter/s)": 0.200696 }, { "acc": 0.7713376, "epoch": 1.2500619723238893, "grad_norm": 5.34375, "learning_rate": 3.255487283494995e-06, "loss": 0.83352032, "memory(GiB)": 147.13, "step": 53580, "train_speed(iter/s)": 0.200715 }, { "acc": 0.78212161, "epoch": 1.2502952798961782, "grad_norm": 6.6875, "learning_rate": 3.253717010053943e-06, "loss": 0.7555759, "memory(GiB)": 147.13, "step": 53590, "train_speed(iter/s)": 0.200734 }, { "acc": 0.7807415, "epoch": 1.250528587468467, "grad_norm": 7.53125, "learning_rate": 3.25194698589428e-06, "loss": 0.7950223, "memory(GiB)": 147.13, "step": 53600, "train_speed(iter/s)": 0.200754 }, { "acc": 0.76310434, "epoch": 1.250761895040756, "grad_norm": 4.8125, "learning_rate": 3.2501772112686757e-06, "loss": 0.87415972, "memory(GiB)": 147.13, "step": 53610, "train_speed(iter/s)": 0.200773 }, { "acc": 0.7808527, "epoch": 1.2509952026130449, "grad_norm": 5.59375, "learning_rate": 3.2484076864297687e-06, "loss": 0.78480973, "memory(GiB)": 147.13, "step": 53620, "train_speed(iter/s)": 0.200793 }, { "acc": 0.77754536, "epoch": 1.2512285101853338, "grad_norm": 5.0, "learning_rate": 3.246638411630154e-06, "loss": 0.80107412, "memory(GiB)": 147.13, "step": 53630, "train_speed(iter/s)": 0.200812 }, { "acc": 0.76862297, "epoch": 1.2514618177576227, "grad_norm": 4.3125, "learning_rate": 3.2448693871223968e-06, "loss": 0.85014858, "memory(GiB)": 147.13, "step": 53640, "train_speed(iter/s)": 0.200832 }, { "acc": 0.78168688, "epoch": 1.2516951253299116, "grad_norm": 7.71875, "learning_rate": 3.2431006131590244e-06, "loss": 0.77133665, "memory(GiB)": 147.13, "step": 53650, "train_speed(iter/s)": 0.200852 }, { "acc": 0.77619853, "epoch": 1.2519284329022005, "grad_norm": 4.78125, "learning_rate": 3.2413320899925287e-06, "loss": 0.782967, "memory(GiB)": 147.13, "step": 53660, "train_speed(iter/s)": 0.200871 }, { "acc": 0.78052235, "epoch": 1.2521617404744894, "grad_norm": 6.5, "learning_rate": 3.2395638178753673e-06, "loss": 0.77000246, "memory(GiB)": 147.13, "step": 53670, "train_speed(iter/s)": 0.200891 }, { "acc": 0.76536398, "epoch": 1.252395048046778, "grad_norm": 5.96875, "learning_rate": 3.2377957970599594e-06, "loss": 0.83446522, "memory(GiB)": 147.13, "step": 53680, "train_speed(iter/s)": 0.20091 }, { "acc": 0.79172969, "epoch": 1.2526283556190672, "grad_norm": 6.09375, "learning_rate": 3.2360280277986887e-06, "loss": 0.73861761, "memory(GiB)": 147.13, "step": 53690, "train_speed(iter/s)": 0.200929 }, { "acc": 0.76554937, "epoch": 1.2528616631913558, "grad_norm": 6.1875, "learning_rate": 3.234260510343905e-06, "loss": 0.84061546, "memory(GiB)": 147.13, "step": 53700, "train_speed(iter/s)": 0.200949 }, { "acc": 0.77454815, "epoch": 1.253094970763645, "grad_norm": 4.78125, "learning_rate": 3.23249324494792e-06, "loss": 0.82709484, "memory(GiB)": 147.13, "step": 53710, "train_speed(iter/s)": 0.200968 }, { "acc": 0.7816751, "epoch": 1.2533282783359336, "grad_norm": 5.3125, "learning_rate": 3.230726231863013e-06, "loss": 0.78138809, "memory(GiB)": 147.13, "step": 53720, "train_speed(iter/s)": 0.200986 }, { "acc": 0.78555498, "epoch": 1.2535615859082228, "grad_norm": 6.78125, "learning_rate": 3.2289594713414207e-06, "loss": 0.78290424, "memory(GiB)": 147.13, "step": 53730, "train_speed(iter/s)": 0.201005 }, { "acc": 0.79429498, "epoch": 1.2537948934805114, "grad_norm": 5.59375, "learning_rate": 3.2271929636353494e-06, "loss": 0.72328415, "memory(GiB)": 147.13, "step": 53740, "train_speed(iter/s)": 0.201025 }, { "acc": 0.78629532, "epoch": 1.2540282010528006, "grad_norm": 4.15625, "learning_rate": 3.2254267089969688e-06, "loss": 0.76473923, "memory(GiB)": 147.13, "step": 53750, "train_speed(iter/s)": 0.201043 }, { "acc": 0.78774004, "epoch": 1.2542615086250892, "grad_norm": 7.0, "learning_rate": 3.2236607076784086e-06, "loss": 0.76563559, "memory(GiB)": 147.13, "step": 53760, "train_speed(iter/s)": 0.201062 }, { "acc": 0.78109026, "epoch": 1.2544948161973781, "grad_norm": 6.8125, "learning_rate": 3.2218949599317664e-06, "loss": 0.79619112, "memory(GiB)": 147.13, "step": 53770, "train_speed(iter/s)": 0.201082 }, { "acc": 0.77077236, "epoch": 1.254728123769667, "grad_norm": 5.03125, "learning_rate": 3.220129466009102e-06, "loss": 0.83952332, "memory(GiB)": 147.13, "step": 53780, "train_speed(iter/s)": 0.201101 }, { "acc": 0.7884572, "epoch": 1.254961431341956, "grad_norm": 5.625, "learning_rate": 3.2183642261624393e-06, "loss": 0.75626945, "memory(GiB)": 147.13, "step": 53790, "train_speed(iter/s)": 0.20112 }, { "acc": 0.75669336, "epoch": 1.2551947389142448, "grad_norm": 5.0, "learning_rate": 3.216599240643765e-06, "loss": 0.88252993, "memory(GiB)": 147.13, "step": 53800, "train_speed(iter/s)": 0.20114 }, { "acc": 0.77265863, "epoch": 1.2554280464865337, "grad_norm": 9.5, "learning_rate": 3.2148345097050332e-06, "loss": 0.83709316, "memory(GiB)": 147.13, "step": 53810, "train_speed(iter/s)": 0.201159 }, { "acc": 0.76388264, "epoch": 1.2556613540588226, "grad_norm": 15.5625, "learning_rate": 3.213070033598155e-06, "loss": 0.85438976, "memory(GiB)": 147.13, "step": 53820, "train_speed(iter/s)": 0.20118 }, { "acc": 0.80244446, "epoch": 1.2558946616311115, "grad_norm": 5.0625, "learning_rate": 3.211305812575011e-06, "loss": 0.70825286, "memory(GiB)": 147.13, "step": 53830, "train_speed(iter/s)": 0.201199 }, { "acc": 0.79701781, "epoch": 1.2561279692034004, "grad_norm": 5.4375, "learning_rate": 3.209541846887442e-06, "loss": 0.73176861, "memory(GiB)": 147.13, "step": 53840, "train_speed(iter/s)": 0.201219 }, { "acc": 0.79489713, "epoch": 1.2563612767756893, "grad_norm": 4.0, "learning_rate": 3.207778136787256e-06, "loss": 0.70806804, "memory(GiB)": 147.13, "step": 53850, "train_speed(iter/s)": 0.201238 }, { "acc": 0.78195076, "epoch": 1.2565945843479782, "grad_norm": 4.5625, "learning_rate": 3.2060146825262196e-06, "loss": 0.79273672, "memory(GiB)": 147.13, "step": 53860, "train_speed(iter/s)": 0.201258 }, { "acc": 0.79850488, "epoch": 1.2568278919202671, "grad_norm": 4.46875, "learning_rate": 3.2042514843560644e-06, "loss": 0.71020679, "memory(GiB)": 147.13, "step": 53870, "train_speed(iter/s)": 0.201279 }, { "acc": 0.77073278, "epoch": 1.257061199492556, "grad_norm": 5.84375, "learning_rate": 3.2024885425284893e-06, "loss": 0.81626215, "memory(GiB)": 147.13, "step": 53880, "train_speed(iter/s)": 0.201299 }, { "acc": 0.7916851, "epoch": 1.257294507064845, "grad_norm": 4.28125, "learning_rate": 3.200725857295153e-06, "loss": 0.73321352, "memory(GiB)": 147.13, "step": 53890, "train_speed(iter/s)": 0.201319 }, { "acc": 0.77081394, "epoch": 1.2575278146371338, "grad_norm": 4.78125, "learning_rate": 3.1989634289076776e-06, "loss": 0.81450729, "memory(GiB)": 147.13, "step": 53900, "train_speed(iter/s)": 0.201336 }, { "acc": 0.77993765, "epoch": 1.2577611222094227, "grad_norm": 4.625, "learning_rate": 3.197201257617649e-06, "loss": 0.78307796, "memory(GiB)": 147.13, "step": 53910, "train_speed(iter/s)": 0.201356 }, { "acc": 0.78636661, "epoch": 1.2579944297817116, "grad_norm": 5.4375, "learning_rate": 3.195439343676617e-06, "loss": 0.77123709, "memory(GiB)": 147.13, "step": 53920, "train_speed(iter/s)": 0.201376 }, { "acc": 0.78531208, "epoch": 1.2582277373540005, "grad_norm": 4.5, "learning_rate": 3.1936776873360947e-06, "loss": 0.78743391, "memory(GiB)": 147.13, "step": 53930, "train_speed(iter/s)": 0.201395 }, { "acc": 0.78387508, "epoch": 1.2584610449262894, "grad_norm": 9.875, "learning_rate": 3.1919162888475586e-06, "loss": 0.77252707, "memory(GiB)": 147.13, "step": 53940, "train_speed(iter/s)": 0.201416 }, { "acc": 0.79411707, "epoch": 1.2586943524985783, "grad_norm": 5.5, "learning_rate": 3.190155148462446e-06, "loss": 0.72856312, "memory(GiB)": 147.13, "step": 53950, "train_speed(iter/s)": 0.201436 }, { "acc": 0.76296749, "epoch": 1.2589276600708672, "grad_norm": 4.96875, "learning_rate": 3.188394266432162e-06, "loss": 0.8607048, "memory(GiB)": 147.13, "step": 53960, "train_speed(iter/s)": 0.201455 }, { "acc": 0.77531281, "epoch": 1.259160967643156, "grad_norm": 5.625, "learning_rate": 3.186633643008069e-06, "loss": 0.80578537, "memory(GiB)": 147.13, "step": 53970, "train_speed(iter/s)": 0.201475 }, { "acc": 0.78390398, "epoch": 1.259394275215445, "grad_norm": 4.46875, "learning_rate": 3.1848732784414965e-06, "loss": 0.75349741, "memory(GiB)": 147.13, "step": 53980, "train_speed(iter/s)": 0.201495 }, { "acc": 0.7925087, "epoch": 1.259627582787734, "grad_norm": 5.15625, "learning_rate": 3.183113172983736e-06, "loss": 0.75053062, "memory(GiB)": 147.13, "step": 53990, "train_speed(iter/s)": 0.201513 }, { "acc": 0.79274836, "epoch": 1.2598608903600228, "grad_norm": 6.0625, "learning_rate": 3.181353326886042e-06, "loss": 0.74251266, "memory(GiB)": 147.13, "step": 54000, "train_speed(iter/s)": 0.201533 }, { "epoch": 1.2598608903600228, "eval_acc": 0.7445457677002215, "eval_loss": 0.8053033947944641, "eval_runtime": 1272.3358, "eval_samples_per_second": 28.287, "eval_steps_per_second": 14.144, "step": 54000 }, { "acc": 0.78218327, "epoch": 1.2600941979323117, "grad_norm": 6.46875, "learning_rate": 3.1795937403996324e-06, "loss": 0.76867433, "memory(GiB)": 147.13, "step": 54010, "train_speed(iter/s)": 0.200585 }, { "acc": 0.78693171, "epoch": 1.2603275055046006, "grad_norm": 5.40625, "learning_rate": 3.1778344137756887e-06, "loss": 0.75604353, "memory(GiB)": 147.13, "step": 54020, "train_speed(iter/s)": 0.200605 }, { "acc": 0.79023509, "epoch": 1.2605608130768895, "grad_norm": 4.84375, "learning_rate": 3.176075347265352e-06, "loss": 0.7607203, "memory(GiB)": 147.13, "step": 54030, "train_speed(iter/s)": 0.200623 }, { "acc": 0.77073202, "epoch": 1.2607941206491784, "grad_norm": 4.75, "learning_rate": 3.17431654111973e-06, "loss": 0.82227421, "memory(GiB)": 147.13, "step": 54040, "train_speed(iter/s)": 0.200642 }, { "acc": 0.78074164, "epoch": 1.2610274282214673, "grad_norm": 6.25, "learning_rate": 3.1725579955898904e-06, "loss": 0.76902056, "memory(GiB)": 147.13, "step": 54050, "train_speed(iter/s)": 0.200661 }, { "acc": 0.77640028, "epoch": 1.2612607357937562, "grad_norm": 6.5625, "learning_rate": 3.170799710926867e-06, "loss": 0.80473099, "memory(GiB)": 147.13, "step": 54060, "train_speed(iter/s)": 0.200681 }, { "acc": 0.76053233, "epoch": 1.2614940433660449, "grad_norm": 5.53125, "learning_rate": 3.1690416873816533e-06, "loss": 0.87321119, "memory(GiB)": 147.13, "step": 54070, "train_speed(iter/s)": 0.200699 }, { "acc": 0.78000383, "epoch": 1.261727350938334, "grad_norm": 4.875, "learning_rate": 3.1672839252052083e-06, "loss": 0.77255969, "memory(GiB)": 147.13, "step": 54080, "train_speed(iter/s)": 0.200719 }, { "acc": 0.79675488, "epoch": 1.2619606585106227, "grad_norm": 5.125, "learning_rate": 3.165526424648449e-06, "loss": 0.71965032, "memory(GiB)": 147.13, "step": 54090, "train_speed(iter/s)": 0.200738 }, { "acc": 0.7872025, "epoch": 1.2621939660829118, "grad_norm": 5.4375, "learning_rate": 3.1637691859622612e-06, "loss": 0.78567123, "memory(GiB)": 147.13, "step": 54100, "train_speed(iter/s)": 0.200757 }, { "acc": 0.7730463, "epoch": 1.2624272736552005, "grad_norm": 5.46875, "learning_rate": 3.1620122093974864e-06, "loss": 0.83145771, "memory(GiB)": 147.13, "step": 54110, "train_speed(iter/s)": 0.200776 }, { "acc": 0.78709173, "epoch": 1.2626605812274896, "grad_norm": 4.8125, "learning_rate": 3.160255495204936e-06, "loss": 0.76123199, "memory(GiB)": 147.13, "step": 54120, "train_speed(iter/s)": 0.200795 }, { "acc": 0.77258062, "epoch": 1.2628938887997783, "grad_norm": 7.15625, "learning_rate": 3.158499043635378e-06, "loss": 0.81969166, "memory(GiB)": 147.13, "step": 54130, "train_speed(iter/s)": 0.200816 }, { "acc": 0.78251991, "epoch": 1.2631271963720674, "grad_norm": 7.46875, "learning_rate": 3.156742854939547e-06, "loss": 0.79453034, "memory(GiB)": 147.13, "step": 54140, "train_speed(iter/s)": 0.200835 }, { "acc": 0.77685394, "epoch": 1.263360503944356, "grad_norm": 7.5625, "learning_rate": 3.1549869293681385e-06, "loss": 0.81942472, "memory(GiB)": 147.13, "step": 54150, "train_speed(iter/s)": 0.200855 }, { "acc": 0.77604828, "epoch": 1.263593811516645, "grad_norm": 5.0, "learning_rate": 3.1532312671718102e-06, "loss": 0.79661341, "memory(GiB)": 147.13, "step": 54160, "train_speed(iter/s)": 0.200875 }, { "acc": 0.77102032, "epoch": 1.2638271190889339, "grad_norm": 4.90625, "learning_rate": 3.1514758686011816e-06, "loss": 0.84676743, "memory(GiB)": 147.13, "step": 54170, "train_speed(iter/s)": 0.200894 }, { "acc": 0.77398539, "epoch": 1.2640604266612228, "grad_norm": 6.25, "learning_rate": 3.149720733906836e-06, "loss": 0.82969208, "memory(GiB)": 147.13, "step": 54180, "train_speed(iter/s)": 0.200912 }, { "acc": 0.77549362, "epoch": 1.2642937342335117, "grad_norm": 5.96875, "learning_rate": 3.1479658633393194e-06, "loss": 0.82915554, "memory(GiB)": 147.13, "step": 54190, "train_speed(iter/s)": 0.200931 }, { "acc": 0.78671007, "epoch": 1.2645270418058006, "grad_norm": 5.71875, "learning_rate": 3.146211257149136e-06, "loss": 0.77449756, "memory(GiB)": 147.13, "step": 54200, "train_speed(iter/s)": 0.200951 }, { "acc": 0.79366622, "epoch": 1.2647603493780895, "grad_norm": 7.75, "learning_rate": 3.1444569155867573e-06, "loss": 0.7421545, "memory(GiB)": 147.13, "step": 54210, "train_speed(iter/s)": 0.200969 }, { "acc": 0.77127199, "epoch": 1.2649936569503784, "grad_norm": 5.28125, "learning_rate": 3.1427028389026147e-06, "loss": 0.83733711, "memory(GiB)": 147.13, "step": 54220, "train_speed(iter/s)": 0.200989 }, { "acc": 0.77401648, "epoch": 1.2652269645226673, "grad_norm": 5.03125, "learning_rate": 3.140949027347102e-06, "loss": 0.82600069, "memory(GiB)": 147.13, "step": 54230, "train_speed(iter/s)": 0.201007 }, { "acc": 0.78014426, "epoch": 1.2654602720949562, "grad_norm": 7.84375, "learning_rate": 3.139195481170577e-06, "loss": 0.78453398, "memory(GiB)": 147.13, "step": 54240, "train_speed(iter/s)": 0.201025 }, { "acc": 0.79164677, "epoch": 1.265693579667245, "grad_norm": 5.6875, "learning_rate": 3.1374422006233553e-06, "loss": 0.72470675, "memory(GiB)": 147.13, "step": 54250, "train_speed(iter/s)": 0.201044 }, { "acc": 0.78896961, "epoch": 1.265926887239534, "grad_norm": 5.625, "learning_rate": 3.1356891859557187e-06, "loss": 0.76291442, "memory(GiB)": 147.13, "step": 54260, "train_speed(iter/s)": 0.201063 }, { "acc": 0.78453579, "epoch": 1.2661601948118228, "grad_norm": 5.46875, "learning_rate": 3.1339364374179092e-06, "loss": 0.76488409, "memory(GiB)": 147.13, "step": 54270, "train_speed(iter/s)": 0.201082 }, { "acc": 0.76442003, "epoch": 1.2663935023841117, "grad_norm": 7.03125, "learning_rate": 3.1321839552601308e-06, "loss": 0.83735924, "memory(GiB)": 147.13, "step": 54280, "train_speed(iter/s)": 0.201102 }, { "acc": 0.7594471, "epoch": 1.2666268099564006, "grad_norm": 5.5, "learning_rate": 3.1304317397325503e-06, "loss": 0.87646704, "memory(GiB)": 147.13, "step": 54290, "train_speed(iter/s)": 0.201121 }, { "acc": 0.78775072, "epoch": 1.2668601175286895, "grad_norm": 6.09375, "learning_rate": 3.128679791085297e-06, "loss": 0.74031334, "memory(GiB)": 147.13, "step": 54300, "train_speed(iter/s)": 0.20114 }, { "acc": 0.78536415, "epoch": 1.2670934251009784, "grad_norm": 4.5625, "learning_rate": 3.1269281095684594e-06, "loss": 0.78184915, "memory(GiB)": 147.13, "step": 54310, "train_speed(iter/s)": 0.20116 }, { "acc": 0.76834879, "epoch": 1.2673267326732673, "grad_norm": 5.09375, "learning_rate": 3.1251766954320906e-06, "loss": 0.83152952, "memory(GiB)": 147.13, "step": 54320, "train_speed(iter/s)": 0.201178 }, { "acc": 0.78719902, "epoch": 1.2675600402455562, "grad_norm": 8.5625, "learning_rate": 3.123425548926203e-06, "loss": 0.75794401, "memory(GiB)": 147.13, "step": 54330, "train_speed(iter/s)": 0.201198 }, { "acc": 0.77632055, "epoch": 1.2677933478178451, "grad_norm": 7.1875, "learning_rate": 3.121674670300773e-06, "loss": 0.81995049, "memory(GiB)": 147.13, "step": 54340, "train_speed(iter/s)": 0.201219 }, { "acc": 0.79015565, "epoch": 1.268026655390134, "grad_norm": 3.25, "learning_rate": 3.1199240598057377e-06, "loss": 0.7804615, "memory(GiB)": 147.13, "step": 54350, "train_speed(iter/s)": 0.201239 }, { "acc": 0.76458197, "epoch": 1.268259962962423, "grad_norm": 5.46875, "learning_rate": 3.1181737176909967e-06, "loss": 0.85183592, "memory(GiB)": 147.13, "step": 54360, "train_speed(iter/s)": 0.201259 }, { "acc": 0.76942658, "epoch": 1.2684932705347118, "grad_norm": 4.96875, "learning_rate": 3.116423644206411e-06, "loss": 0.83050613, "memory(GiB)": 147.13, "step": 54370, "train_speed(iter/s)": 0.201278 }, { "acc": 0.76355877, "epoch": 1.2687265781070007, "grad_norm": 7.28125, "learning_rate": 3.1146738396018043e-06, "loss": 0.86715822, "memory(GiB)": 147.13, "step": 54380, "train_speed(iter/s)": 0.201298 }, { "acc": 0.77561574, "epoch": 1.2689598856792896, "grad_norm": 5.5, "learning_rate": 3.112924304126958e-06, "loss": 0.81895523, "memory(GiB)": 147.13, "step": 54390, "train_speed(iter/s)": 0.201318 }, { "acc": 0.79888744, "epoch": 1.2691931932515785, "grad_norm": 4.125, "learning_rate": 3.111175038031619e-06, "loss": 0.70149899, "memory(GiB)": 147.13, "step": 54400, "train_speed(iter/s)": 0.201337 }, { "acc": 0.78107786, "epoch": 1.2694265008238674, "grad_norm": 4.96875, "learning_rate": 3.1094260415654955e-06, "loss": 0.79072466, "memory(GiB)": 147.13, "step": 54410, "train_speed(iter/s)": 0.201357 }, { "acc": 0.77758923, "epoch": 1.2696598083961563, "grad_norm": 5.75, "learning_rate": 3.1076773149782557e-06, "loss": 0.80385323, "memory(GiB)": 147.13, "step": 54420, "train_speed(iter/s)": 0.201377 }, { "acc": 0.79401674, "epoch": 1.2698931159684452, "grad_norm": 6.0, "learning_rate": 3.105928858519529e-06, "loss": 0.72925024, "memory(GiB)": 147.13, "step": 54430, "train_speed(iter/s)": 0.201397 }, { "acc": 0.79929819, "epoch": 1.270126423540734, "grad_norm": 4.5, "learning_rate": 3.1041806724389067e-06, "loss": 0.70583544, "memory(GiB)": 147.13, "step": 54440, "train_speed(iter/s)": 0.201414 }, { "acc": 0.78400726, "epoch": 1.270359731113023, "grad_norm": 6.46875, "learning_rate": 3.1024327569859425e-06, "loss": 0.79743586, "memory(GiB)": 147.13, "step": 54450, "train_speed(iter/s)": 0.201434 }, { "acc": 0.75207291, "epoch": 1.2705930386853117, "grad_norm": 5.96875, "learning_rate": 3.1006851124101524e-06, "loss": 0.90693626, "memory(GiB)": 147.13, "step": 54460, "train_speed(iter/s)": 0.201453 }, { "acc": 0.77595568, "epoch": 1.2708263462576008, "grad_norm": 18.375, "learning_rate": 3.0989377389610097e-06, "loss": 0.82628527, "memory(GiB)": 147.13, "step": 54470, "train_speed(iter/s)": 0.201473 }, { "acc": 0.77218075, "epoch": 1.2710596538298895, "grad_norm": 4.9375, "learning_rate": 3.0971906368879524e-06, "loss": 0.8140975, "memory(GiB)": 147.13, "step": 54480, "train_speed(iter/s)": 0.201491 }, { "acc": 0.80995159, "epoch": 1.2712929614021786, "grad_norm": 4.6875, "learning_rate": 3.095443806440379e-06, "loss": 0.68186216, "memory(GiB)": 147.13, "step": 54490, "train_speed(iter/s)": 0.20151 }, { "acc": 0.78228736, "epoch": 1.2715262689744673, "grad_norm": 5.65625, "learning_rate": 3.0936972478676493e-06, "loss": 0.77988653, "memory(GiB)": 147.13, "step": 54500, "train_speed(iter/s)": 0.201529 }, { "epoch": 1.2715262689744673, "eval_acc": 0.7445822805851863, "eval_loss": 0.80512934923172, "eval_runtime": 1270.2737, "eval_samples_per_second": 28.333, "eval_steps_per_second": 14.167, "step": 54500 }, { "acc": 0.78444986, "epoch": 1.2717595765467564, "grad_norm": 4.90625, "learning_rate": 3.0919509614190836e-06, "loss": 0.77315531, "memory(GiB)": 147.13, "step": 54510, "train_speed(iter/s)": 0.200592 }, { "acc": 0.77153635, "epoch": 1.271992884119045, "grad_norm": 4.625, "learning_rate": 3.0902049473439643e-06, "loss": 0.80885544, "memory(GiB)": 147.13, "step": 54520, "train_speed(iter/s)": 0.200612 }, { "acc": 0.7847683, "epoch": 1.2722261916913342, "grad_norm": 9.9375, "learning_rate": 3.0884592058915342e-06, "loss": 0.79745474, "memory(GiB)": 147.13, "step": 54530, "train_speed(iter/s)": 0.200631 }, { "acc": 0.76501255, "epoch": 1.272459499263623, "grad_norm": 5.28125, "learning_rate": 3.0867137373109972e-06, "loss": 0.86392994, "memory(GiB)": 147.13, "step": 54540, "train_speed(iter/s)": 0.20065 }, { "acc": 0.78538918, "epoch": 1.2726928068359118, "grad_norm": 9.4375, "learning_rate": 3.0849685418515174e-06, "loss": 0.77651844, "memory(GiB)": 147.13, "step": 54550, "train_speed(iter/s)": 0.20067 }, { "acc": 0.76775522, "epoch": 1.2729261144082007, "grad_norm": 5.53125, "learning_rate": 3.0832236197622223e-06, "loss": 0.8431736, "memory(GiB)": 147.13, "step": 54560, "train_speed(iter/s)": 0.20069 }, { "acc": 0.79953661, "epoch": 1.2731594219804896, "grad_norm": 6.0625, "learning_rate": 3.0814789712921977e-06, "loss": 0.70614519, "memory(GiB)": 147.13, "step": 54570, "train_speed(iter/s)": 0.20071 }, { "acc": 0.78566427, "epoch": 1.2733927295527785, "grad_norm": 4.34375, "learning_rate": 3.0797345966904933e-06, "loss": 0.74611931, "memory(GiB)": 147.13, "step": 54580, "train_speed(iter/s)": 0.200729 }, { "acc": 0.78306842, "epoch": 1.2736260371250674, "grad_norm": 5.40625, "learning_rate": 3.0779904962061173e-06, "loss": 0.75449877, "memory(GiB)": 147.13, "step": 54590, "train_speed(iter/s)": 0.200746 }, { "acc": 0.78306794, "epoch": 1.2738593446973563, "grad_norm": 5.625, "learning_rate": 3.076246670088041e-06, "loss": 0.77245879, "memory(GiB)": 147.13, "step": 54600, "train_speed(iter/s)": 0.200765 }, { "acc": 0.77822618, "epoch": 1.2740926522696452, "grad_norm": 6.59375, "learning_rate": 3.074503118585192e-06, "loss": 0.78750162, "memory(GiB)": 147.13, "step": 54610, "train_speed(iter/s)": 0.200784 }, { "acc": 0.76630201, "epoch": 1.274325959841934, "grad_norm": 5.90625, "learning_rate": 3.072759841946464e-06, "loss": 0.84698448, "memory(GiB)": 147.13, "step": 54620, "train_speed(iter/s)": 0.200802 }, { "acc": 0.757019, "epoch": 1.274559267414223, "grad_norm": 6.5, "learning_rate": 3.0710168404207086e-06, "loss": 0.88410797, "memory(GiB)": 147.13, "step": 54630, "train_speed(iter/s)": 0.200821 }, { "acc": 0.7815589, "epoch": 1.2747925749865119, "grad_norm": 3.984375, "learning_rate": 3.0692741142567385e-06, "loss": 0.78019571, "memory(GiB)": 147.13, "step": 54640, "train_speed(iter/s)": 0.200841 }, { "acc": 0.78222027, "epoch": 1.2750258825588008, "grad_norm": 6.3125, "learning_rate": 3.0675316637033296e-06, "loss": 0.80412626, "memory(GiB)": 147.13, "step": 54650, "train_speed(iter/s)": 0.20086 }, { "acc": 0.77893991, "epoch": 1.2752591901310897, "grad_norm": 4.125, "learning_rate": 3.0657894890092134e-06, "loss": 0.79963918, "memory(GiB)": 147.13, "step": 54660, "train_speed(iter/s)": 0.200879 }, { "acc": 0.79304705, "epoch": 1.2754924977033786, "grad_norm": 7.5, "learning_rate": 3.0640475904230848e-06, "loss": 0.75699606, "memory(GiB)": 147.13, "step": 54670, "train_speed(iter/s)": 0.200896 }, { "acc": 0.78414087, "epoch": 1.2757258052756675, "grad_norm": 4.375, "learning_rate": 3.062305968193601e-06, "loss": 0.77377367, "memory(GiB)": 147.13, "step": 54680, "train_speed(iter/s)": 0.200915 }, { "acc": 0.80083361, "epoch": 1.2759591128479564, "grad_norm": 4.09375, "learning_rate": 3.060564622569377e-06, "loss": 0.69888325, "memory(GiB)": 147.13, "step": 54690, "train_speed(iter/s)": 0.200935 }, { "acc": 0.77480173, "epoch": 1.2761924204202453, "grad_norm": 6.34375, "learning_rate": 3.0588235537989897e-06, "loss": 0.79366236, "memory(GiB)": 147.13, "step": 54700, "train_speed(iter/s)": 0.200954 }, { "acc": 0.75772133, "epoch": 1.2764257279925342, "grad_norm": 5.0, "learning_rate": 3.057082762130976e-06, "loss": 0.87921181, "memory(GiB)": 147.13, "step": 54710, "train_speed(iter/s)": 0.200974 }, { "acc": 0.80236292, "epoch": 1.276659035564823, "grad_norm": 5.875, "learning_rate": 3.0553422478138333e-06, "loss": 0.69562078, "memory(GiB)": 147.13, "step": 54720, "train_speed(iter/s)": 0.200992 }, { "acc": 0.76373925, "epoch": 1.276892343137112, "grad_norm": 6.59375, "learning_rate": 3.0536020110960214e-06, "loss": 0.86020823, "memory(GiB)": 147.13, "step": 54730, "train_speed(iter/s)": 0.201011 }, { "acc": 0.76700792, "epoch": 1.2771256507094009, "grad_norm": 4.4375, "learning_rate": 3.0518620522259557e-06, "loss": 0.83871212, "memory(GiB)": 147.13, "step": 54740, "train_speed(iter/s)": 0.201031 }, { "acc": 0.78003569, "epoch": 1.2773589582816898, "grad_norm": 7.4375, "learning_rate": 3.0501223714520155e-06, "loss": 0.78800702, "memory(GiB)": 147.13, "step": 54750, "train_speed(iter/s)": 0.201049 }, { "acc": 0.7942296, "epoch": 1.2775922658539787, "grad_norm": 6.28125, "learning_rate": 3.048382969022543e-06, "loss": 0.75022469, "memory(GiB)": 147.13, "step": 54760, "train_speed(iter/s)": 0.201069 }, { "acc": 0.77333212, "epoch": 1.2778255734262676, "grad_norm": 8.4375, "learning_rate": 3.0466438451858326e-06, "loss": 0.81570168, "memory(GiB)": 147.13, "step": 54770, "train_speed(iter/s)": 0.201088 }, { "acc": 0.7831358, "epoch": 1.2780588809985565, "grad_norm": 6.625, "learning_rate": 3.044905000190146e-06, "loss": 0.77250023, "memory(GiB)": 147.13, "step": 54780, "train_speed(iter/s)": 0.201107 }, { "acc": 0.78039427, "epoch": 1.2782921885708454, "grad_norm": 6.53125, "learning_rate": 3.043166434283703e-06, "loss": 0.78244643, "memory(GiB)": 147.13, "step": 54790, "train_speed(iter/s)": 0.201127 }, { "acc": 0.7836205, "epoch": 1.2785254961431343, "grad_norm": 5.96875, "learning_rate": 3.0414281477146823e-06, "loss": 0.79010315, "memory(GiB)": 147.13, "step": 54800, "train_speed(iter/s)": 0.201146 }, { "acc": 0.77984056, "epoch": 1.2787588037154232, "grad_norm": 5.625, "learning_rate": 3.0396901407312263e-06, "loss": 0.80035505, "memory(GiB)": 147.13, "step": 54810, "train_speed(iter/s)": 0.201166 }, { "acc": 0.78728938, "epoch": 1.278992111287712, "grad_norm": 5.15625, "learning_rate": 3.037952413581431e-06, "loss": 0.74682913, "memory(GiB)": 147.13, "step": 54820, "train_speed(iter/s)": 0.201186 }, { "acc": 0.7650311, "epoch": 1.2792254188600007, "grad_norm": 7.59375, "learning_rate": 3.03621496651336e-06, "loss": 0.84308777, "memory(GiB)": 147.13, "step": 54830, "train_speed(iter/s)": 0.201204 }, { "acc": 0.75912447, "epoch": 1.2794587264322899, "grad_norm": 6.75, "learning_rate": 3.0344777997750313e-06, "loss": 0.8658433, "memory(GiB)": 147.13, "step": 54840, "train_speed(iter/s)": 0.201224 }, { "acc": 0.7930964, "epoch": 1.2796920340045785, "grad_norm": 4.125, "learning_rate": 3.0327409136144257e-06, "loss": 0.73651738, "memory(GiB)": 147.13, "step": 54850, "train_speed(iter/s)": 0.201243 }, { "acc": 0.7795516, "epoch": 1.2799253415768677, "grad_norm": 5.09375, "learning_rate": 3.031004308279484e-06, "loss": 0.77885942, "memory(GiB)": 147.13, "step": 54860, "train_speed(iter/s)": 0.201262 }, { "acc": 0.7870728, "epoch": 1.2801586491491563, "grad_norm": 4.1875, "learning_rate": 3.0292679840181048e-06, "loss": 0.77090311, "memory(GiB)": 147.13, "step": 54870, "train_speed(iter/s)": 0.20128 }, { "acc": 0.79632759, "epoch": 1.2803919567214455, "grad_norm": 5.53125, "learning_rate": 3.02753194107815e-06, "loss": 0.7337678, "memory(GiB)": 147.13, "step": 54880, "train_speed(iter/s)": 0.201298 }, { "acc": 0.78915548, "epoch": 1.2806252642937341, "grad_norm": 4.65625, "learning_rate": 3.0257961797074353e-06, "loss": 0.75042048, "memory(GiB)": 147.13, "step": 54890, "train_speed(iter/s)": 0.201317 }, { "acc": 0.78317914, "epoch": 1.2808585718660233, "grad_norm": 6.96875, "learning_rate": 3.0240607001537442e-06, "loss": 0.78657742, "memory(GiB)": 147.13, "step": 54900, "train_speed(iter/s)": 0.201335 }, { "acc": 0.7978034, "epoch": 1.281091879438312, "grad_norm": 6.4375, "learning_rate": 3.022325502664813e-06, "loss": 0.71349802, "memory(GiB)": 147.13, "step": 54910, "train_speed(iter/s)": 0.201354 }, { "acc": 0.78730545, "epoch": 1.2813251870106008, "grad_norm": 8.125, "learning_rate": 3.020590587488342e-06, "loss": 0.7752285, "memory(GiB)": 147.13, "step": 54920, "train_speed(iter/s)": 0.201372 }, { "acc": 0.76834173, "epoch": 1.2815584945828897, "grad_norm": 5.875, "learning_rate": 3.0188559548719888e-06, "loss": 0.83232594, "memory(GiB)": 147.13, "step": 54930, "train_speed(iter/s)": 0.201391 }, { "acc": 0.78525715, "epoch": 1.2817918021551786, "grad_norm": 7.96875, "learning_rate": 3.0171216050633735e-06, "loss": 0.7697998, "memory(GiB)": 147.13, "step": 54940, "train_speed(iter/s)": 0.20141 }, { "acc": 0.7783596, "epoch": 1.2820251097274675, "grad_norm": 8.375, "learning_rate": 3.0153875383100732e-06, "loss": 0.78331108, "memory(GiB)": 147.13, "step": 54950, "train_speed(iter/s)": 0.201429 }, { "acc": 0.77114954, "epoch": 1.2822584172997564, "grad_norm": 6.8125, "learning_rate": 3.0136537548596247e-06, "loss": 0.83518686, "memory(GiB)": 147.13, "step": 54960, "train_speed(iter/s)": 0.201448 }, { "acc": 0.79357519, "epoch": 1.2824917248720453, "grad_norm": 3.984375, "learning_rate": 3.011920254959526e-06, "loss": 0.72738924, "memory(GiB)": 147.13, "step": 54970, "train_speed(iter/s)": 0.201468 }, { "acc": 0.7828373, "epoch": 1.2827250324443342, "grad_norm": 4.21875, "learning_rate": 3.010187038857233e-06, "loss": 0.78417239, "memory(GiB)": 147.13, "step": 54980, "train_speed(iter/s)": 0.201488 }, { "acc": 0.78416605, "epoch": 1.2829583400166231, "grad_norm": 5.59375, "learning_rate": 3.008454106800164e-06, "loss": 0.77895746, "memory(GiB)": 147.13, "step": 54990, "train_speed(iter/s)": 0.201507 }, { "acc": 0.77086582, "epoch": 1.283191647588912, "grad_norm": 5.3125, "learning_rate": 3.006721459035691e-06, "loss": 0.81353455, "memory(GiB)": 147.13, "step": 55000, "train_speed(iter/s)": 0.201526 }, { "epoch": 1.283191647588912, "eval_acc": 0.7444704999110399, "eval_loss": 0.8050407767295837, "eval_runtime": 1270.065, "eval_samples_per_second": 28.338, "eval_steps_per_second": 14.169, "step": 55000 }, { "acc": 0.76830349, "epoch": 1.283424955161201, "grad_norm": 4.90625, "learning_rate": 3.0049890958111505e-06, "loss": 0.82744799, "memory(GiB)": 147.13, "step": 55010, "train_speed(iter/s)": 0.200597 }, { "acc": 0.78124275, "epoch": 1.2836582627334898, "grad_norm": 5.25, "learning_rate": 3.0032570173738367e-06, "loss": 0.77643175, "memory(GiB)": 147.13, "step": 55020, "train_speed(iter/s)": 0.200615 }, { "acc": 0.77881341, "epoch": 1.2838915703057787, "grad_norm": 5.5, "learning_rate": 3.0015252239710052e-06, "loss": 0.8051384, "memory(GiB)": 147.13, "step": 55030, "train_speed(iter/s)": 0.200634 }, { "acc": 0.78495493, "epoch": 1.2841248778780676, "grad_norm": 6.09375, "learning_rate": 2.9997937158498657e-06, "loss": 0.77997541, "memory(GiB)": 147.13, "step": 55040, "train_speed(iter/s)": 0.200653 }, { "acc": 0.76635275, "epoch": 1.2843581854503565, "grad_norm": 5.15625, "learning_rate": 2.998062493257593e-06, "loss": 0.85216427, "memory(GiB)": 147.13, "step": 55050, "train_speed(iter/s)": 0.200671 }, { "acc": 0.77827249, "epoch": 1.2845914930226454, "grad_norm": 4.84375, "learning_rate": 2.9963315564413174e-06, "loss": 0.78949223, "memory(GiB)": 147.13, "step": 55060, "train_speed(iter/s)": 0.200689 }, { "acc": 0.7691534, "epoch": 1.2848248005949343, "grad_norm": 4.59375, "learning_rate": 2.994600905648131e-06, "loss": 0.83677711, "memory(GiB)": 147.13, "step": 55070, "train_speed(iter/s)": 0.200708 }, { "acc": 0.7861763, "epoch": 1.2850581081672232, "grad_norm": 6.5625, "learning_rate": 2.9928705411250813e-06, "loss": 0.77848387, "memory(GiB)": 147.13, "step": 55080, "train_speed(iter/s)": 0.200726 }, { "acc": 0.766677, "epoch": 1.285291415739512, "grad_norm": 4.625, "learning_rate": 2.9911404631191796e-06, "loss": 0.84276142, "memory(GiB)": 147.13, "step": 55090, "train_speed(iter/s)": 0.200745 }, { "acc": 0.77530727, "epoch": 1.285524723311801, "grad_norm": 5.21875, "learning_rate": 2.9894106718773936e-06, "loss": 0.82425938, "memory(GiB)": 147.13, "step": 55100, "train_speed(iter/s)": 0.200763 }, { "acc": 0.78108807, "epoch": 1.28575803088409, "grad_norm": 6.34375, "learning_rate": 2.987681167646652e-06, "loss": 0.78073077, "memory(GiB)": 147.13, "step": 55110, "train_speed(iter/s)": 0.200782 }, { "acc": 0.76942391, "epoch": 1.2859913384563788, "grad_norm": 6.5625, "learning_rate": 2.985951950673836e-06, "loss": 0.8330018, "memory(GiB)": 147.13, "step": 55120, "train_speed(iter/s)": 0.200802 }, { "acc": 0.75507021, "epoch": 1.2862246460286677, "grad_norm": 6.03125, "learning_rate": 2.984223021205795e-06, "loss": 0.89126616, "memory(GiB)": 147.13, "step": 55130, "train_speed(iter/s)": 0.200822 }, { "acc": 0.78265476, "epoch": 1.2864579536009566, "grad_norm": 4.75, "learning_rate": 2.9824943794893312e-06, "loss": 0.78735905, "memory(GiB)": 147.13, "step": 55140, "train_speed(iter/s)": 0.200841 }, { "acc": 0.78866234, "epoch": 1.2866912611732455, "grad_norm": 6.34375, "learning_rate": 2.9807660257712097e-06, "loss": 0.75422587, "memory(GiB)": 147.13, "step": 55150, "train_speed(iter/s)": 0.200861 }, { "acc": 0.78413725, "epoch": 1.2869245687455344, "grad_norm": 4.40625, "learning_rate": 2.9790379602981508e-06, "loss": 0.78166876, "memory(GiB)": 147.13, "step": 55160, "train_speed(iter/s)": 0.200878 }, { "acc": 0.77496414, "epoch": 1.2871578763178233, "grad_norm": 4.59375, "learning_rate": 2.9773101833168374e-06, "loss": 0.81109505, "memory(GiB)": 147.13, "step": 55170, "train_speed(iter/s)": 0.200897 }, { "acc": 0.77975092, "epoch": 1.2873911838901122, "grad_norm": 4.375, "learning_rate": 2.9755826950739057e-06, "loss": 0.79754815, "memory(GiB)": 147.13, "step": 55180, "train_speed(iter/s)": 0.200915 }, { "acc": 0.77278199, "epoch": 1.287624491462401, "grad_norm": 5.125, "learning_rate": 2.973855495815957e-06, "loss": 0.80997, "memory(GiB)": 147.13, "step": 55190, "train_speed(iter/s)": 0.200934 }, { "acc": 0.77235184, "epoch": 1.28785779903469, "grad_norm": 6.46875, "learning_rate": 2.9721285857895475e-06, "loss": 0.82727203, "memory(GiB)": 147.13, "step": 55200, "train_speed(iter/s)": 0.200953 }, { "acc": 0.78080778, "epoch": 1.288091106606979, "grad_norm": 5.6875, "learning_rate": 2.9704019652411933e-06, "loss": 0.80346479, "memory(GiB)": 147.13, "step": 55210, "train_speed(iter/s)": 0.200972 }, { "acc": 0.78474374, "epoch": 1.2883244141792676, "grad_norm": 5.96875, "learning_rate": 2.9686756344173712e-06, "loss": 0.77900343, "memory(GiB)": 147.13, "step": 55220, "train_speed(iter/s)": 0.20099 }, { "acc": 0.77824039, "epoch": 1.2885577217515567, "grad_norm": 25.75, "learning_rate": 2.96694959356451e-06, "loss": 0.81776686, "memory(GiB)": 147.13, "step": 55230, "train_speed(iter/s)": 0.20101 }, { "acc": 0.78159637, "epoch": 1.2887910293238454, "grad_norm": 6.09375, "learning_rate": 2.9652238429290036e-06, "loss": 0.79265308, "memory(GiB)": 147.13, "step": 55240, "train_speed(iter/s)": 0.20103 }, { "acc": 0.79291801, "epoch": 1.2890243368961345, "grad_norm": 6.40625, "learning_rate": 2.9634983827572038e-06, "loss": 0.73101878, "memory(GiB)": 147.13, "step": 55250, "train_speed(iter/s)": 0.20105 }, { "acc": 0.7721282, "epoch": 1.2892576444684232, "grad_norm": 4.53125, "learning_rate": 2.961773213295417e-06, "loss": 0.82999897, "memory(GiB)": 147.13, "step": 55260, "train_speed(iter/s)": 0.20107 }, { "acc": 0.79432802, "epoch": 1.2894909520407123, "grad_norm": 4.78125, "learning_rate": 2.960048334789912e-06, "loss": 0.73425961, "memory(GiB)": 147.13, "step": 55270, "train_speed(iter/s)": 0.20109 }, { "acc": 0.78572264, "epoch": 1.289724259613001, "grad_norm": 5.625, "learning_rate": 2.9583237474869143e-06, "loss": 0.76063013, "memory(GiB)": 147.13, "step": 55280, "train_speed(iter/s)": 0.201108 }, { "acc": 0.77698021, "epoch": 1.28995756718529, "grad_norm": 5.40625, "learning_rate": 2.956599451632609e-06, "loss": 0.79158425, "memory(GiB)": 147.13, "step": 55290, "train_speed(iter/s)": 0.201127 }, { "acc": 0.76085668, "epoch": 1.2901908747575788, "grad_norm": 5.375, "learning_rate": 2.9548754474731376e-06, "loss": 0.85578661, "memory(GiB)": 147.13, "step": 55300, "train_speed(iter/s)": 0.201145 }, { "acc": 0.78259788, "epoch": 1.2904241823298677, "grad_norm": 5.75, "learning_rate": 2.953151735254604e-06, "loss": 0.77954855, "memory(GiB)": 147.13, "step": 55310, "train_speed(iter/s)": 0.201164 }, { "acc": 0.78067522, "epoch": 1.2906574899021566, "grad_norm": 5.375, "learning_rate": 2.9514283152230637e-06, "loss": 0.78616638, "memory(GiB)": 147.13, "step": 55320, "train_speed(iter/s)": 0.201181 }, { "acc": 0.77021255, "epoch": 1.2908907974744455, "grad_norm": 8.5625, "learning_rate": 2.949705187624539e-06, "loss": 0.80933561, "memory(GiB)": 147.13, "step": 55330, "train_speed(iter/s)": 0.2012 }, { "acc": 0.777321, "epoch": 1.2911241050467344, "grad_norm": 5.53125, "learning_rate": 2.947982352705001e-06, "loss": 0.78500934, "memory(GiB)": 147.13, "step": 55340, "train_speed(iter/s)": 0.201219 }, { "acc": 0.78850975, "epoch": 1.2913574126190233, "grad_norm": 5.96875, "learning_rate": 2.9462598107103855e-06, "loss": 0.7612771, "memory(GiB)": 147.13, "step": 55350, "train_speed(iter/s)": 0.201238 }, { "acc": 0.77932158, "epoch": 1.2915907201913122, "grad_norm": 6.78125, "learning_rate": 2.9445375618865857e-06, "loss": 0.79323964, "memory(GiB)": 147.13, "step": 55360, "train_speed(iter/s)": 0.201256 }, { "acc": 0.75293274, "epoch": 1.291824027763601, "grad_norm": 6.15625, "learning_rate": 2.942815606479452e-06, "loss": 0.89559402, "memory(GiB)": 147.13, "step": 55370, "train_speed(iter/s)": 0.201275 }, { "acc": 0.79058542, "epoch": 1.29205733533589, "grad_norm": 4.8125, "learning_rate": 2.941093944734793e-06, "loss": 0.76939631, "memory(GiB)": 147.13, "step": 55380, "train_speed(iter/s)": 0.201294 }, { "acc": 0.78246756, "epoch": 1.2922906429081789, "grad_norm": 5.4375, "learning_rate": 2.939372576898376e-06, "loss": 0.78994184, "memory(GiB)": 147.13, "step": 55390, "train_speed(iter/s)": 0.201313 }, { "acc": 0.7745213, "epoch": 1.2925239504804678, "grad_norm": 5.84375, "learning_rate": 2.937651503215924e-06, "loss": 0.80446148, "memory(GiB)": 147.13, "step": 55400, "train_speed(iter/s)": 0.20133 }, { "acc": 0.77480874, "epoch": 1.2927572580527567, "grad_norm": 5.625, "learning_rate": 2.9359307239331214e-06, "loss": 0.8080328, "memory(GiB)": 147.13, "step": 55410, "train_speed(iter/s)": 0.20135 }, { "acc": 0.78159122, "epoch": 1.2929905656250456, "grad_norm": 5.1875, "learning_rate": 2.9342102392956075e-06, "loss": 0.77628255, "memory(GiB)": 147.13, "step": 55420, "train_speed(iter/s)": 0.201368 }, { "acc": 0.77849216, "epoch": 1.2932238731973345, "grad_norm": 5.59375, "learning_rate": 2.932490049548982e-06, "loss": 0.79256525, "memory(GiB)": 147.13, "step": 55430, "train_speed(iter/s)": 0.201385 }, { "acc": 0.80485229, "epoch": 1.2934571807696233, "grad_norm": 3.953125, "learning_rate": 2.9307701549388025e-06, "loss": 0.68343935, "memory(GiB)": 147.13, "step": 55440, "train_speed(iter/s)": 0.201402 }, { "acc": 0.77289133, "epoch": 1.2936904883419122, "grad_norm": 4.75, "learning_rate": 2.929050555710582e-06, "loss": 0.8112318, "memory(GiB)": 147.13, "step": 55450, "train_speed(iter/s)": 0.201421 }, { "acc": 0.7689023, "epoch": 1.2939237959142011, "grad_norm": 8.125, "learning_rate": 2.9273312521097926e-06, "loss": 0.82355614, "memory(GiB)": 147.13, "step": 55460, "train_speed(iter/s)": 0.201441 }, { "acc": 0.78700824, "epoch": 1.29415710348649, "grad_norm": 6.34375, "learning_rate": 2.9256122443818657e-06, "loss": 0.75792155, "memory(GiB)": 147.13, "step": 55470, "train_speed(iter/s)": 0.201459 }, { "acc": 0.78975463, "epoch": 1.294390411058779, "grad_norm": 5.90625, "learning_rate": 2.923893532772187e-06, "loss": 0.77099237, "memory(GiB)": 147.13, "step": 55480, "train_speed(iter/s)": 0.201477 }, { "acc": 0.77440877, "epoch": 1.2946237186310678, "grad_norm": 5.6875, "learning_rate": 2.9221751175261036e-06, "loss": 0.81504545, "memory(GiB)": 147.13, "step": 55490, "train_speed(iter/s)": 0.201496 }, { "acc": 0.78191586, "epoch": 1.2948570262033567, "grad_norm": 7.34375, "learning_rate": 2.9204569988889186e-06, "loss": 0.78614321, "memory(GiB)": 147.13, "step": 55500, "train_speed(iter/s)": 0.201513 }, { "epoch": 1.2948570262033567, "eval_acc": 0.7445686683254407, "eval_loss": 0.8049291968345642, "eval_runtime": 1270.1062, "eval_samples_per_second": 28.337, "eval_steps_per_second": 14.169, "step": 55500 }, { "acc": 0.79387369, "epoch": 1.2950903337756456, "grad_norm": 6.3125, "learning_rate": 2.9187391771058938e-06, "loss": 0.7472805, "memory(GiB)": 147.13, "step": 55510, "train_speed(iter/s)": 0.200593 }, { "acc": 0.77921963, "epoch": 1.2953236413479345, "grad_norm": 4.96875, "learning_rate": 2.9170216524222446e-06, "loss": 0.78324275, "memory(GiB)": 147.13, "step": 55520, "train_speed(iter/s)": 0.200611 }, { "acc": 0.76559649, "epoch": 1.2955569489202234, "grad_norm": 7.96875, "learning_rate": 2.9153044250831512e-06, "loss": 0.84542322, "memory(GiB)": 147.13, "step": 55530, "train_speed(iter/s)": 0.20063 }, { "acc": 0.76427712, "epoch": 1.2957902564925123, "grad_norm": 6.21875, "learning_rate": 2.913587495333744e-06, "loss": 0.83981133, "memory(GiB)": 147.13, "step": 55540, "train_speed(iter/s)": 0.200649 }, { "acc": 0.77819748, "epoch": 1.2960235640648012, "grad_norm": 6.46875, "learning_rate": 2.9118708634191177e-06, "loss": 0.78465223, "memory(GiB)": 147.13, "step": 55550, "train_speed(iter/s)": 0.200668 }, { "acc": 0.78325009, "epoch": 1.2962568716370901, "grad_norm": 5.03125, "learning_rate": 2.910154529584319e-06, "loss": 0.78194418, "memory(GiB)": 147.13, "step": 55560, "train_speed(iter/s)": 0.200687 }, { "acc": 0.79050236, "epoch": 1.296490179209379, "grad_norm": 4.09375, "learning_rate": 2.9084384940743543e-06, "loss": 0.76087713, "memory(GiB)": 147.13, "step": 55570, "train_speed(iter/s)": 0.200705 }, { "acc": 0.78712444, "epoch": 1.296723486781668, "grad_norm": 5.21875, "learning_rate": 2.9067227571341873e-06, "loss": 0.76762724, "memory(GiB)": 147.13, "step": 55580, "train_speed(iter/s)": 0.200723 }, { "acc": 0.776087, "epoch": 1.2969567943539568, "grad_norm": 4.9375, "learning_rate": 2.905007319008736e-06, "loss": 0.81573505, "memory(GiB)": 147.13, "step": 55590, "train_speed(iter/s)": 0.200741 }, { "acc": 0.77835026, "epoch": 1.2971901019262457, "grad_norm": 4.8125, "learning_rate": 2.903292179942883e-06, "loss": 0.79308224, "memory(GiB)": 147.13, "step": 55600, "train_speed(iter/s)": 0.200758 }, { "acc": 0.78184395, "epoch": 1.2974234094985344, "grad_norm": 5.0, "learning_rate": 2.9015773401814606e-06, "loss": 0.79529061, "memory(GiB)": 147.13, "step": 55610, "train_speed(iter/s)": 0.200777 }, { "acc": 0.78858995, "epoch": 1.2976567170708235, "grad_norm": 5.65625, "learning_rate": 2.899862799969265e-06, "loss": 0.77276478, "memory(GiB)": 147.13, "step": 55620, "train_speed(iter/s)": 0.200795 }, { "acc": 0.77310009, "epoch": 1.2978900246431122, "grad_norm": 5.15625, "learning_rate": 2.898148559551045e-06, "loss": 0.81262197, "memory(GiB)": 147.13, "step": 55630, "train_speed(iter/s)": 0.200814 }, { "acc": 0.79125342, "epoch": 1.2981233322154013, "grad_norm": 5.15625, "learning_rate": 2.8964346191715058e-06, "loss": 0.74343634, "memory(GiB)": 147.13, "step": 55640, "train_speed(iter/s)": 0.200831 }, { "acc": 0.78252659, "epoch": 1.29835663978769, "grad_norm": 4.5625, "learning_rate": 2.894720979075315e-06, "loss": 0.76678948, "memory(GiB)": 147.13, "step": 55650, "train_speed(iter/s)": 0.20085 }, { "acc": 0.77767839, "epoch": 1.2985899473599791, "grad_norm": 4.84375, "learning_rate": 2.8930076395070915e-06, "loss": 0.76314602, "memory(GiB)": 147.13, "step": 55660, "train_speed(iter/s)": 0.200869 }, { "acc": 0.76303263, "epoch": 1.2988232549322678, "grad_norm": 4.71875, "learning_rate": 2.8912946007114175e-06, "loss": 0.85604687, "memory(GiB)": 147.13, "step": 55670, "train_speed(iter/s)": 0.200887 }, { "acc": 0.79654713, "epoch": 1.299056562504557, "grad_norm": 4.65625, "learning_rate": 2.8895818629328254e-06, "loss": 0.71905622, "memory(GiB)": 147.13, "step": 55680, "train_speed(iter/s)": 0.200905 }, { "acc": 0.76574378, "epoch": 1.2992898700768456, "grad_norm": 11.75, "learning_rate": 2.8878694264158103e-06, "loss": 0.84654961, "memory(GiB)": 147.13, "step": 55690, "train_speed(iter/s)": 0.200924 }, { "acc": 0.78273115, "epoch": 1.2995231776491345, "grad_norm": 5.71875, "learning_rate": 2.8861572914048184e-06, "loss": 0.76711287, "memory(GiB)": 147.13, "step": 55700, "train_speed(iter/s)": 0.200943 }, { "acc": 0.77995882, "epoch": 1.2997564852214234, "grad_norm": 5.84375, "learning_rate": 2.8844454581442614e-06, "loss": 0.77336879, "memory(GiB)": 147.13, "step": 55710, "train_speed(iter/s)": 0.200962 }, { "acc": 0.80577364, "epoch": 1.2999897927937123, "grad_norm": 4.90625, "learning_rate": 2.8827339268785015e-06, "loss": 0.70034103, "memory(GiB)": 147.13, "step": 55720, "train_speed(iter/s)": 0.20098 }, { "acc": 0.76289902, "epoch": 1.3002231003660012, "grad_norm": 6.09375, "learning_rate": 2.881022697851855e-06, "loss": 0.84976912, "memory(GiB)": 147.13, "step": 55730, "train_speed(iter/s)": 0.201 }, { "acc": 0.76065493, "epoch": 1.30045640793829, "grad_norm": 7.09375, "learning_rate": 2.879311771308606e-06, "loss": 0.87251511, "memory(GiB)": 147.13, "step": 55740, "train_speed(iter/s)": 0.201019 }, { "acc": 0.7798687, "epoch": 1.300689715510579, "grad_norm": 9.0, "learning_rate": 2.877601147492983e-06, "loss": 0.78704138, "memory(GiB)": 147.13, "step": 55750, "train_speed(iter/s)": 0.201038 }, { "acc": 0.76645966, "epoch": 1.300923023082868, "grad_norm": 4.5, "learning_rate": 2.8758908266491815e-06, "loss": 0.84115009, "memory(GiB)": 147.13, "step": 55760, "train_speed(iter/s)": 0.201054 }, { "acc": 0.77044306, "epoch": 1.3011563306551568, "grad_norm": 7.0, "learning_rate": 2.874180809021348e-06, "loss": 0.82898483, "memory(GiB)": 147.13, "step": 55770, "train_speed(iter/s)": 0.201074 }, { "acc": 0.78014908, "epoch": 1.3013896382274457, "grad_norm": 5.65625, "learning_rate": 2.872471094853584e-06, "loss": 0.78329229, "memory(GiB)": 147.13, "step": 55780, "train_speed(iter/s)": 0.201093 }, { "acc": 0.78297763, "epoch": 1.3016229457997346, "grad_norm": 6.15625, "learning_rate": 2.8707616843899554e-06, "loss": 0.76991262, "memory(GiB)": 147.13, "step": 55790, "train_speed(iter/s)": 0.201112 }, { "acc": 0.78463106, "epoch": 1.3018562533720235, "grad_norm": 7.28125, "learning_rate": 2.8690525778744777e-06, "loss": 0.81536741, "memory(GiB)": 147.13, "step": 55800, "train_speed(iter/s)": 0.201131 }, { "acc": 0.78778915, "epoch": 1.3020895609443124, "grad_norm": 5.1875, "learning_rate": 2.867343775551126e-06, "loss": 0.76017818, "memory(GiB)": 147.13, "step": 55810, "train_speed(iter/s)": 0.20115 }, { "acc": 0.76485157, "epoch": 1.3023228685166013, "grad_norm": 7.0625, "learning_rate": 2.8656352776638274e-06, "loss": 0.84671221, "memory(GiB)": 147.13, "step": 55820, "train_speed(iter/s)": 0.201168 }, { "acc": 0.78802285, "epoch": 1.3025561760888902, "grad_norm": 5.03125, "learning_rate": 2.863927084456476e-06, "loss": 0.74512553, "memory(GiB)": 147.13, "step": 55830, "train_speed(iter/s)": 0.201188 }, { "acc": 0.78229799, "epoch": 1.302789483661179, "grad_norm": 6.0, "learning_rate": 2.862219196172911e-06, "loss": 0.78259425, "memory(GiB)": 147.13, "step": 55840, "train_speed(iter/s)": 0.201207 }, { "acc": 0.78135662, "epoch": 1.303022791233468, "grad_norm": 5.1875, "learning_rate": 2.8605116130569355e-06, "loss": 0.78383131, "memory(GiB)": 147.13, "step": 55850, "train_speed(iter/s)": 0.201226 }, { "acc": 0.77223597, "epoch": 1.3032560988057569, "grad_norm": 7.78125, "learning_rate": 2.8588043353523066e-06, "loss": 0.83154545, "memory(GiB)": 147.13, "step": 55860, "train_speed(iter/s)": 0.201246 }, { "acc": 0.76503639, "epoch": 1.3034894063780458, "grad_norm": 6.0, "learning_rate": 2.8570973633027342e-06, "loss": 0.856847, "memory(GiB)": 147.13, "step": 55870, "train_speed(iter/s)": 0.201264 }, { "acc": 0.78612194, "epoch": 1.3037227139503347, "grad_norm": 4.59375, "learning_rate": 2.8553906971518936e-06, "loss": 0.76604691, "memory(GiB)": 147.13, "step": 55880, "train_speed(iter/s)": 0.201282 }, { "acc": 0.7799561, "epoch": 1.3039560215226236, "grad_norm": 8.3125, "learning_rate": 2.8536843371434054e-06, "loss": 0.78653412, "memory(GiB)": 147.13, "step": 55890, "train_speed(iter/s)": 0.201301 }, { "acc": 0.77653108, "epoch": 1.3041893290949125, "grad_norm": 6.0, "learning_rate": 2.851978283520859e-06, "loss": 0.79027066, "memory(GiB)": 147.13, "step": 55900, "train_speed(iter/s)": 0.20132 }, { "acc": 0.78584161, "epoch": 1.3044226366672014, "grad_norm": 6.0, "learning_rate": 2.850272536527784e-06, "loss": 0.78225594, "memory(GiB)": 147.13, "step": 55910, "train_speed(iter/s)": 0.20134 }, { "acc": 0.77251587, "epoch": 1.3046559442394903, "grad_norm": 8.75, "learning_rate": 2.848567096407682e-06, "loss": 0.83527536, "memory(GiB)": 147.13, "step": 55920, "train_speed(iter/s)": 0.201358 }, { "acc": 0.77410121, "epoch": 1.3048892518117792, "grad_norm": 6.3125, "learning_rate": 2.8468619634040017e-06, "loss": 0.820961, "memory(GiB)": 147.13, "step": 55930, "train_speed(iter/s)": 0.201376 }, { "acc": 0.76235828, "epoch": 1.305122559384068, "grad_norm": 4.78125, "learning_rate": 2.8451571377601495e-06, "loss": 0.85515633, "memory(GiB)": 147.13, "step": 55940, "train_speed(iter/s)": 0.201395 }, { "acc": 0.79035673, "epoch": 1.305355866956357, "grad_norm": 5.125, "learning_rate": 2.8434526197194915e-06, "loss": 0.73724895, "memory(GiB)": 147.13, "step": 55950, "train_speed(iter/s)": 0.201414 }, { "acc": 0.78144627, "epoch": 1.3055891745286459, "grad_norm": 5.4375, "learning_rate": 2.8417484095253434e-06, "loss": 0.78020916, "memory(GiB)": 147.13, "step": 55960, "train_speed(iter/s)": 0.201432 }, { "acc": 0.77665806, "epoch": 1.3058224821009348, "grad_norm": 5.4375, "learning_rate": 2.8400445074209852e-06, "loss": 0.78938322, "memory(GiB)": 147.13, "step": 55970, "train_speed(iter/s)": 0.201451 }, { "acc": 0.76400418, "epoch": 1.3060557896732234, "grad_norm": 6.1875, "learning_rate": 2.8383409136496443e-06, "loss": 0.86329784, "memory(GiB)": 147.13, "step": 55980, "train_speed(iter/s)": 0.20147 }, { "acc": 0.78050137, "epoch": 1.3062890972455126, "grad_norm": 4.59375, "learning_rate": 2.8366376284545117e-06, "loss": 0.77180653, "memory(GiB)": 147.13, "step": 55990, "train_speed(iter/s)": 0.201489 }, { "acc": 0.79701538, "epoch": 1.3065224048178012, "grad_norm": 9.75, "learning_rate": 2.8349346520787284e-06, "loss": 0.72172403, "memory(GiB)": 147.13, "step": 56000, "train_speed(iter/s)": 0.201506 }, { "epoch": 1.3065224048178012, "eval_acc": 0.7445838820275092, "eval_loss": 0.8049860000610352, "eval_runtime": 1271.539, "eval_samples_per_second": 28.305, "eval_steps_per_second": 14.153, "step": 56000 }, { "acc": 0.7685607, "epoch": 1.3067557123900904, "grad_norm": 4.6875, "learning_rate": 2.833231984765393e-06, "loss": 0.81922808, "memory(GiB)": 147.13, "step": 56010, "train_speed(iter/s)": 0.200593 }, { "acc": 0.79894352, "epoch": 1.306989019962379, "grad_norm": 5.125, "learning_rate": 2.8315296267575672e-06, "loss": 0.68040066, "memory(GiB)": 147.13, "step": 56020, "train_speed(iter/s)": 0.200612 }, { "acc": 0.77997193, "epoch": 1.3072223275346682, "grad_norm": 4.6875, "learning_rate": 2.8298275782982525e-06, "loss": 0.81127281, "memory(GiB)": 147.13, "step": 56030, "train_speed(iter/s)": 0.200631 }, { "acc": 0.77440147, "epoch": 1.3074556351069568, "grad_norm": 4.625, "learning_rate": 2.8281258396304224e-06, "loss": 0.8060461, "memory(GiB)": 147.13, "step": 56040, "train_speed(iter/s)": 0.20065 }, { "acc": 0.7795207, "epoch": 1.307688942679246, "grad_norm": 5.03125, "learning_rate": 2.8264244109969963e-06, "loss": 0.82625637, "memory(GiB)": 147.13, "step": 56050, "train_speed(iter/s)": 0.200669 }, { "acc": 0.77957468, "epoch": 1.3079222502515346, "grad_norm": 6.59375, "learning_rate": 2.824723292640856e-06, "loss": 0.77296095, "memory(GiB)": 147.13, "step": 56060, "train_speed(iter/s)": 0.200685 }, { "acc": 0.76619778, "epoch": 1.3081555578238238, "grad_norm": 5.8125, "learning_rate": 2.823022484804834e-06, "loss": 0.82944441, "memory(GiB)": 147.13, "step": 56070, "train_speed(iter/s)": 0.200702 }, { "acc": 0.77313166, "epoch": 1.3083888653961124, "grad_norm": 7.875, "learning_rate": 2.8213219877317164e-06, "loss": 0.82456112, "memory(GiB)": 147.13, "step": 56080, "train_speed(iter/s)": 0.200721 }, { "acc": 0.76039181, "epoch": 1.3086221729684013, "grad_norm": 5.71875, "learning_rate": 2.819621801664256e-06, "loss": 0.85574379, "memory(GiB)": 147.13, "step": 56090, "train_speed(iter/s)": 0.200741 }, { "acc": 0.78070183, "epoch": 1.3088554805406902, "grad_norm": 4.78125, "learning_rate": 2.817921926845147e-06, "loss": 0.81434116, "memory(GiB)": 147.13, "step": 56100, "train_speed(iter/s)": 0.20076 }, { "acc": 0.78396792, "epoch": 1.3090887881129791, "grad_norm": 5.1875, "learning_rate": 2.8162223635170515e-06, "loss": 0.76618247, "memory(GiB)": 147.13, "step": 56110, "train_speed(iter/s)": 0.200777 }, { "acc": 0.79208441, "epoch": 1.309322095685268, "grad_norm": 6.53125, "learning_rate": 2.814523111922577e-06, "loss": 0.74408183, "memory(GiB)": 147.13, "step": 56120, "train_speed(iter/s)": 0.200796 }, { "acc": 0.79604063, "epoch": 1.309555403257557, "grad_norm": 4.53125, "learning_rate": 2.812824172304297e-06, "loss": 0.75110483, "memory(GiB)": 147.13, "step": 56130, "train_speed(iter/s)": 0.200814 }, { "acc": 0.77951555, "epoch": 1.3097887108298458, "grad_norm": 5.75, "learning_rate": 2.8111255449047277e-06, "loss": 0.76333828, "memory(GiB)": 147.13, "step": 56140, "train_speed(iter/s)": 0.200833 }, { "acc": 0.79114752, "epoch": 1.3100220184021347, "grad_norm": 5.21875, "learning_rate": 2.809427229966353e-06, "loss": 0.73384008, "memory(GiB)": 147.13, "step": 56150, "train_speed(iter/s)": 0.200851 }, { "acc": 0.77196131, "epoch": 1.3102553259744236, "grad_norm": 8.3125, "learning_rate": 2.8077292277316036e-06, "loss": 0.80360489, "memory(GiB)": 147.13, "step": 56160, "train_speed(iter/s)": 0.200869 }, { "acc": 0.78124895, "epoch": 1.3104886335467125, "grad_norm": 5.375, "learning_rate": 2.8060315384428692e-06, "loss": 0.79064064, "memory(GiB)": 147.13, "step": 56170, "train_speed(iter/s)": 0.200889 }, { "acc": 0.77550502, "epoch": 1.3107219411190014, "grad_norm": 8.8125, "learning_rate": 2.8043341623424974e-06, "loss": 0.82532978, "memory(GiB)": 147.13, "step": 56180, "train_speed(iter/s)": 0.200907 }, { "acc": 0.80522442, "epoch": 1.3109552486912903, "grad_norm": 5.15625, "learning_rate": 2.8026370996727835e-06, "loss": 0.68403687, "memory(GiB)": 147.13, "step": 56190, "train_speed(iter/s)": 0.200925 }, { "acc": 0.79306288, "epoch": 1.3111885562635792, "grad_norm": 7.21875, "learning_rate": 2.800940350675988e-06, "loss": 0.7425447, "memory(GiB)": 147.13, "step": 56200, "train_speed(iter/s)": 0.200944 }, { "acc": 0.76478882, "epoch": 1.3114218638358681, "grad_norm": 5.40625, "learning_rate": 2.7992439155943185e-06, "loss": 0.87208843, "memory(GiB)": 147.13, "step": 56210, "train_speed(iter/s)": 0.200963 }, { "acc": 0.7814106, "epoch": 1.311655171408157, "grad_norm": 4.34375, "learning_rate": 2.797547794669938e-06, "loss": 0.7701395, "memory(GiB)": 147.13, "step": 56220, "train_speed(iter/s)": 0.200982 }, { "acc": 0.76465588, "epoch": 1.311888478980446, "grad_norm": 6.15625, "learning_rate": 2.7958519881449723e-06, "loss": 0.860077, "memory(GiB)": 147.13, "step": 56230, "train_speed(iter/s)": 0.201001 }, { "acc": 0.78399577, "epoch": 1.3121217865527348, "grad_norm": 4.40625, "learning_rate": 2.794156496261493e-06, "loss": 0.75324211, "memory(GiB)": 147.13, "step": 56240, "train_speed(iter/s)": 0.201021 }, { "acc": 0.77755909, "epoch": 1.3123550941250237, "grad_norm": 5.78125, "learning_rate": 2.792461319261538e-06, "loss": 0.79980249, "memory(GiB)": 147.13, "step": 56250, "train_speed(iter/s)": 0.201039 }, { "acc": 0.77361746, "epoch": 1.3125884016973126, "grad_norm": 4.96875, "learning_rate": 2.790766457387083e-06, "loss": 0.81588135, "memory(GiB)": 147.13, "step": 56260, "train_speed(iter/s)": 0.201056 }, { "acc": 0.78619328, "epoch": 1.3128217092696015, "grad_norm": 4.25, "learning_rate": 2.7890719108800766e-06, "loss": 0.75692482, "memory(GiB)": 147.13, "step": 56270, "train_speed(iter/s)": 0.201074 }, { "acc": 0.77736025, "epoch": 1.3130550168418904, "grad_norm": 6.0, "learning_rate": 2.7873776799824115e-06, "loss": 0.79954443, "memory(GiB)": 147.13, "step": 56280, "train_speed(iter/s)": 0.201092 }, { "acc": 0.8001195, "epoch": 1.3132883244141793, "grad_norm": 5.0, "learning_rate": 2.7856837649359416e-06, "loss": 0.7024128, "memory(GiB)": 147.13, "step": 56290, "train_speed(iter/s)": 0.201111 }, { "acc": 0.78101416, "epoch": 1.3135216319864682, "grad_norm": 6.75, "learning_rate": 2.7839901659824707e-06, "loss": 0.80548544, "memory(GiB)": 147.13, "step": 56300, "train_speed(iter/s)": 0.201128 }, { "acc": 0.7754415, "epoch": 1.313754939558757, "grad_norm": 5.25, "learning_rate": 2.7822968833637577e-06, "loss": 0.79988079, "memory(GiB)": 147.13, "step": 56310, "train_speed(iter/s)": 0.201147 }, { "acc": 0.7806459, "epoch": 1.313988247131046, "grad_norm": 4.78125, "learning_rate": 2.7806039173215225e-06, "loss": 0.78693638, "memory(GiB)": 147.13, "step": 56320, "train_speed(iter/s)": 0.201165 }, { "acc": 0.78040733, "epoch": 1.314221554703335, "grad_norm": 6.5625, "learning_rate": 2.7789112680974316e-06, "loss": 0.78004808, "memory(GiB)": 147.13, "step": 56330, "train_speed(iter/s)": 0.201183 }, { "acc": 0.80179815, "epoch": 1.3144548622756238, "grad_norm": 3.546875, "learning_rate": 2.7772189359331136e-06, "loss": 0.68882008, "memory(GiB)": 147.13, "step": 56340, "train_speed(iter/s)": 0.2012 }, { "acc": 0.7934073, "epoch": 1.3146881698479127, "grad_norm": 6.09375, "learning_rate": 2.7755269210701475e-06, "loss": 0.76322498, "memory(GiB)": 147.13, "step": 56350, "train_speed(iter/s)": 0.201217 }, { "acc": 0.78926401, "epoch": 1.3149214774202016, "grad_norm": 6.03125, "learning_rate": 2.7738352237500667e-06, "loss": 0.74684887, "memory(GiB)": 147.13, "step": 56360, "train_speed(iter/s)": 0.201235 }, { "acc": 0.76026993, "epoch": 1.3151547849924903, "grad_norm": 4.59375, "learning_rate": 2.7721438442143607e-06, "loss": 0.85534048, "memory(GiB)": 147.13, "step": 56370, "train_speed(iter/s)": 0.201253 }, { "acc": 0.77141409, "epoch": 1.3153880925647794, "grad_norm": 5.125, "learning_rate": 2.7704527827044714e-06, "loss": 0.81075706, "memory(GiB)": 147.13, "step": 56380, "train_speed(iter/s)": 0.201271 }, { "acc": 0.79955773, "epoch": 1.315621400137068, "grad_norm": 3.953125, "learning_rate": 2.7687620394618025e-06, "loss": 0.71365733, "memory(GiB)": 147.13, "step": 56390, "train_speed(iter/s)": 0.20129 }, { "acc": 0.78430367, "epoch": 1.3158547077093572, "grad_norm": 6.34375, "learning_rate": 2.767071614727702e-06, "loss": 0.75429764, "memory(GiB)": 147.13, "step": 56400, "train_speed(iter/s)": 0.201309 }, { "acc": 0.76308584, "epoch": 1.3160880152816459, "grad_norm": 6.65625, "learning_rate": 2.765381508743482e-06, "loss": 0.86523905, "memory(GiB)": 147.13, "step": 56410, "train_speed(iter/s)": 0.201326 }, { "acc": 0.78555737, "epoch": 1.316321322853935, "grad_norm": 5.65625, "learning_rate": 2.7636917217504007e-06, "loss": 0.77236929, "memory(GiB)": 147.13, "step": 56420, "train_speed(iter/s)": 0.201345 }, { "acc": 0.78901734, "epoch": 1.3165546304262237, "grad_norm": 4.21875, "learning_rate": 2.762002253989678e-06, "loss": 0.75951052, "memory(GiB)": 147.13, "step": 56430, "train_speed(iter/s)": 0.201364 }, { "acc": 0.76562176, "epoch": 1.3167879379985128, "grad_norm": 5.375, "learning_rate": 2.7603131057024835e-06, "loss": 0.84810133, "memory(GiB)": 147.13, "step": 56440, "train_speed(iter/s)": 0.201383 }, { "acc": 0.78627138, "epoch": 1.3170212455708015, "grad_norm": 5.125, "learning_rate": 2.7586242771299404e-06, "loss": 0.76413832, "memory(GiB)": 147.13, "step": 56450, "train_speed(iter/s)": 0.201402 }, { "acc": 0.78318043, "epoch": 1.3172545531430904, "grad_norm": 4.15625, "learning_rate": 2.7569357685131325e-06, "loss": 0.80174465, "memory(GiB)": 147.13, "step": 56460, "train_speed(iter/s)": 0.201419 }, { "acc": 0.79765291, "epoch": 1.3174878607153793, "grad_norm": 6.84375, "learning_rate": 2.7552475800930907e-06, "loss": 0.72690916, "memory(GiB)": 147.13, "step": 56470, "train_speed(iter/s)": 0.201438 }, { "acc": 0.79450531, "epoch": 1.3177211682876682, "grad_norm": 4.3125, "learning_rate": 2.753559712110808e-06, "loss": 0.73781071, "memory(GiB)": 147.13, "step": 56480, "train_speed(iter/s)": 0.201455 }, { "acc": 0.78455567, "epoch": 1.317954475859957, "grad_norm": 4.84375, "learning_rate": 2.75187216480722e-06, "loss": 0.7775054, "memory(GiB)": 147.13, "step": 56490, "train_speed(iter/s)": 0.201473 }, { "acc": 0.79019089, "epoch": 1.318187783432246, "grad_norm": 4.90625, "learning_rate": 2.75018493842323e-06, "loss": 0.75038633, "memory(GiB)": 147.13, "step": 56500, "train_speed(iter/s)": 0.201491 }, { "epoch": 1.318187783432246, "eval_acc": 0.7445172620268719, "eval_loss": 0.8049691915512085, "eval_runtime": 1270.3048, "eval_samples_per_second": 28.333, "eval_steps_per_second": 14.167, "step": 56500 }, { "acc": 0.77661104, "epoch": 1.3184210910045349, "grad_norm": 4.21875, "learning_rate": 2.748498033199686e-06, "loss": 0.79541645, "memory(GiB)": 147.13, "step": 56510, "train_speed(iter/s)": 0.200588 }, { "acc": 0.78313751, "epoch": 1.3186543985768238, "grad_norm": 4.90625, "learning_rate": 2.7468114493773913e-06, "loss": 0.80430899, "memory(GiB)": 147.13, "step": 56520, "train_speed(iter/s)": 0.200606 }, { "acc": 0.76499681, "epoch": 1.3188877061491127, "grad_norm": 5.71875, "learning_rate": 2.7451251871971103e-06, "loss": 0.85248184, "memory(GiB)": 147.13, "step": 56530, "train_speed(iter/s)": 0.200625 }, { "acc": 0.78148131, "epoch": 1.3191210137214016, "grad_norm": 5.09375, "learning_rate": 2.743439246899552e-06, "loss": 0.80053949, "memory(GiB)": 147.13, "step": 56540, "train_speed(iter/s)": 0.200644 }, { "acc": 0.78812108, "epoch": 1.3193543212936905, "grad_norm": 4.28125, "learning_rate": 2.7417536287253864e-06, "loss": 0.77857547, "memory(GiB)": 147.13, "step": 56550, "train_speed(iter/s)": 0.200661 }, { "acc": 0.78670983, "epoch": 1.3195876288659794, "grad_norm": 4.4375, "learning_rate": 2.7400683329152358e-06, "loss": 0.77823715, "memory(GiB)": 147.13, "step": 56560, "train_speed(iter/s)": 0.200679 }, { "acc": 0.78537912, "epoch": 1.3198209364382683, "grad_norm": 4.15625, "learning_rate": 2.738383359709671e-06, "loss": 0.76351776, "memory(GiB)": 147.13, "step": 56570, "train_speed(iter/s)": 0.200698 }, { "acc": 0.77593765, "epoch": 1.3200542440105572, "grad_norm": 9.25, "learning_rate": 2.736698709349227e-06, "loss": 0.80890398, "memory(GiB)": 147.13, "step": 56580, "train_speed(iter/s)": 0.200715 }, { "acc": 0.7956008, "epoch": 1.320287551582846, "grad_norm": 5.5, "learning_rate": 2.7350143820743847e-06, "loss": 0.7315671, "memory(GiB)": 147.13, "step": 56590, "train_speed(iter/s)": 0.200732 }, { "acc": 0.77545271, "epoch": 1.320520859155135, "grad_norm": 24.25, "learning_rate": 2.7333303781255816e-06, "loss": 0.78528709, "memory(GiB)": 147.13, "step": 56600, "train_speed(iter/s)": 0.20075 }, { "acc": 0.76972427, "epoch": 1.3207541667274239, "grad_norm": 9.625, "learning_rate": 2.7316466977432067e-06, "loss": 0.80668039, "memory(GiB)": 147.13, "step": 56610, "train_speed(iter/s)": 0.200768 }, { "acc": 0.77437797, "epoch": 1.3209874742997127, "grad_norm": 5.65625, "learning_rate": 2.729963341167608e-06, "loss": 0.82345848, "memory(GiB)": 147.13, "step": 56620, "train_speed(iter/s)": 0.200787 }, { "acc": 0.79564867, "epoch": 1.3212207818720016, "grad_norm": 4.9375, "learning_rate": 2.728280308639081e-06, "loss": 0.72023449, "memory(GiB)": 147.13, "step": 56630, "train_speed(iter/s)": 0.200806 }, { "acc": 0.77554989, "epoch": 1.3214540894442905, "grad_norm": 5.40625, "learning_rate": 2.7265976003978828e-06, "loss": 0.79516258, "memory(GiB)": 147.13, "step": 56640, "train_speed(iter/s)": 0.200824 }, { "acc": 0.76518278, "epoch": 1.3216873970165794, "grad_norm": 6.0625, "learning_rate": 2.7249152166842164e-06, "loss": 0.8493721, "memory(GiB)": 147.13, "step": 56650, "train_speed(iter/s)": 0.200843 }, { "acc": 0.78103828, "epoch": 1.3219207045888683, "grad_norm": 5.9375, "learning_rate": 2.72323315773824e-06, "loss": 0.78014269, "memory(GiB)": 147.13, "step": 56660, "train_speed(iter/s)": 0.200862 }, { "acc": 0.77679877, "epoch": 1.3221540121611572, "grad_norm": 4.03125, "learning_rate": 2.72155142380007e-06, "loss": 0.80789299, "memory(GiB)": 147.13, "step": 56670, "train_speed(iter/s)": 0.200881 }, { "acc": 0.79273858, "epoch": 1.3223873197334461, "grad_norm": 6.65625, "learning_rate": 2.7198700151097714e-06, "loss": 0.74300385, "memory(GiB)": 147.13, "step": 56680, "train_speed(iter/s)": 0.2009 }, { "acc": 0.76670465, "epoch": 1.322620627305735, "grad_norm": 4.90625, "learning_rate": 2.7181889319073674e-06, "loss": 0.83968229, "memory(GiB)": 147.13, "step": 56690, "train_speed(iter/s)": 0.200917 }, { "acc": 0.80488224, "epoch": 1.322853934878024, "grad_norm": 6.09375, "learning_rate": 2.7165081744328304e-06, "loss": 0.69187484, "memory(GiB)": 147.13, "step": 56700, "train_speed(iter/s)": 0.200936 }, { "acc": 0.78700209, "epoch": 1.3230872424503128, "grad_norm": 5.125, "learning_rate": 2.714827742926088e-06, "loss": 0.75780425, "memory(GiB)": 147.13, "step": 56710, "train_speed(iter/s)": 0.200954 }, { "acc": 0.80247622, "epoch": 1.3233205500226017, "grad_norm": 5.03125, "learning_rate": 2.7131476376270215e-06, "loss": 0.71583662, "memory(GiB)": 147.13, "step": 56720, "train_speed(iter/s)": 0.200971 }, { "acc": 0.78960152, "epoch": 1.3235538575948906, "grad_norm": 4.75, "learning_rate": 2.711467858775464e-06, "loss": 0.75800562, "memory(GiB)": 147.13, "step": 56730, "train_speed(iter/s)": 0.20099 }, { "acc": 0.77212892, "epoch": 1.3237871651671795, "grad_norm": 4.625, "learning_rate": 2.7097884066112062e-06, "loss": 0.81948633, "memory(GiB)": 147.13, "step": 56740, "train_speed(iter/s)": 0.201009 }, { "acc": 0.77226896, "epoch": 1.3240204727394684, "grad_norm": 4.96875, "learning_rate": 2.7081092813739863e-06, "loss": 0.81515799, "memory(GiB)": 147.13, "step": 56750, "train_speed(iter/s)": 0.201028 }, { "acc": 0.7703146, "epoch": 1.3242537803117571, "grad_norm": 9.4375, "learning_rate": 2.7064304833035027e-06, "loss": 0.82806835, "memory(GiB)": 147.13, "step": 56760, "train_speed(iter/s)": 0.201046 }, { "acc": 0.75851698, "epoch": 1.3244870878840462, "grad_norm": 9.625, "learning_rate": 2.704752012639399e-06, "loss": 0.87062492, "memory(GiB)": 147.13, "step": 56770, "train_speed(iter/s)": 0.201063 }, { "acc": 0.78109846, "epoch": 1.324720395456335, "grad_norm": 7.53125, "learning_rate": 2.703073869621281e-06, "loss": 0.75848188, "memory(GiB)": 147.13, "step": 56780, "train_speed(iter/s)": 0.201083 }, { "acc": 0.78177109, "epoch": 1.324953703028624, "grad_norm": 5.78125, "learning_rate": 2.7013960544887007e-06, "loss": 0.77262831, "memory(GiB)": 147.13, "step": 56790, "train_speed(iter/s)": 0.201101 }, { "acc": 0.79652271, "epoch": 1.3251870106009127, "grad_norm": 3.84375, "learning_rate": 2.699718567481164e-06, "loss": 0.72971768, "memory(GiB)": 147.13, "step": 56800, "train_speed(iter/s)": 0.20112 }, { "acc": 0.76938486, "epoch": 1.3254203181732018, "grad_norm": 5.4375, "learning_rate": 2.698041408838136e-06, "loss": 0.81897049, "memory(GiB)": 147.13, "step": 56810, "train_speed(iter/s)": 0.201139 }, { "acc": 0.796984, "epoch": 1.3256536257454905, "grad_norm": 4.21875, "learning_rate": 2.696364578799028e-06, "loss": 0.69904175, "memory(GiB)": 147.13, "step": 56820, "train_speed(iter/s)": 0.201158 }, { "acc": 0.79061108, "epoch": 1.3258869333177796, "grad_norm": 5.03125, "learning_rate": 2.694688077603207e-06, "loss": 0.75381279, "memory(GiB)": 147.13, "step": 56830, "train_speed(iter/s)": 0.201177 }, { "acc": 0.77822804, "epoch": 1.3261202408900683, "grad_norm": 4.75, "learning_rate": 2.6930119054899905e-06, "loss": 0.78712869, "memory(GiB)": 147.13, "step": 56840, "train_speed(iter/s)": 0.201195 }, { "acc": 0.79944668, "epoch": 1.3263535484623572, "grad_norm": 5.59375, "learning_rate": 2.6913360626986575e-06, "loss": 0.72601347, "memory(GiB)": 147.13, "step": 56850, "train_speed(iter/s)": 0.201214 }, { "acc": 0.78989477, "epoch": 1.326586856034646, "grad_norm": 5.34375, "learning_rate": 2.68966054946843e-06, "loss": 0.72735806, "memory(GiB)": 147.13, "step": 56860, "train_speed(iter/s)": 0.201232 }, { "acc": 0.78181467, "epoch": 1.326820163606935, "grad_norm": 4.96875, "learning_rate": 2.687985366038486e-06, "loss": 0.79019494, "memory(GiB)": 147.13, "step": 56870, "train_speed(iter/s)": 0.201251 }, { "acc": 0.76266632, "epoch": 1.327053471179224, "grad_norm": 9.0, "learning_rate": 2.6863105126479616e-06, "loss": 0.85104504, "memory(GiB)": 147.13, "step": 56880, "train_speed(iter/s)": 0.20127 }, { "acc": 0.77862854, "epoch": 1.3272867787515128, "grad_norm": 4.03125, "learning_rate": 2.6846359895359373e-06, "loss": 0.79643354, "memory(GiB)": 147.13, "step": 56890, "train_speed(iter/s)": 0.201288 }, { "acc": 0.77985401, "epoch": 1.3275200863238017, "grad_norm": 4.9375, "learning_rate": 2.682961796941456e-06, "loss": 0.78767076, "memory(GiB)": 147.13, "step": 56900, "train_speed(iter/s)": 0.201306 }, { "acc": 0.7925395, "epoch": 1.3277533938960906, "grad_norm": 5.0, "learning_rate": 2.6812879351035015e-06, "loss": 0.71750937, "memory(GiB)": 147.13, "step": 56910, "train_speed(iter/s)": 0.201325 }, { "acc": 0.80067816, "epoch": 1.3279867014683795, "grad_norm": 4.53125, "learning_rate": 2.679614404261023e-06, "loss": 0.70950232, "memory(GiB)": 147.13, "step": 56920, "train_speed(iter/s)": 0.201343 }, { "acc": 0.80294123, "epoch": 1.3282200090406684, "grad_norm": 9.4375, "learning_rate": 2.677941204652914e-06, "loss": 0.70604153, "memory(GiB)": 147.13, "step": 56930, "train_speed(iter/s)": 0.201362 }, { "acc": 0.78235593, "epoch": 1.3284533166129573, "grad_norm": 5.65625, "learning_rate": 2.676268336518024e-06, "loss": 0.77650847, "memory(GiB)": 147.13, "step": 56940, "train_speed(iter/s)": 0.201381 }, { "acc": 0.78522987, "epoch": 1.3286866241852462, "grad_norm": 4.34375, "learning_rate": 2.6745958000951546e-06, "loss": 0.76834965, "memory(GiB)": 147.13, "step": 56950, "train_speed(iter/s)": 0.2014 }, { "acc": 0.77212758, "epoch": 1.328919931757535, "grad_norm": 5.8125, "learning_rate": 2.672923595623056e-06, "loss": 0.78489156, "memory(GiB)": 147.13, "step": 56960, "train_speed(iter/s)": 0.201417 }, { "acc": 0.78244338, "epoch": 1.329153239329824, "grad_norm": 4.46875, "learning_rate": 2.67125172334044e-06, "loss": 0.77883863, "memory(GiB)": 147.13, "step": 56970, "train_speed(iter/s)": 0.201435 }, { "acc": 0.76625214, "epoch": 1.3293865469021129, "grad_norm": 5.3125, "learning_rate": 2.669580183485963e-06, "loss": 0.8450387, "memory(GiB)": 147.13, "step": 56980, "train_speed(iter/s)": 0.201453 }, { "acc": 0.76922102, "epoch": 1.3296198544744018, "grad_norm": 5.09375, "learning_rate": 2.667908976298239e-06, "loss": 0.81567163, "memory(GiB)": 147.13, "step": 56990, "train_speed(iter/s)": 0.201472 }, { "acc": 0.76809058, "epoch": 1.3298531620466907, "grad_norm": 9.375, "learning_rate": 2.666238102015832e-06, "loss": 0.83717842, "memory(GiB)": 147.13, "step": 57000, "train_speed(iter/s)": 0.201488 }, { "epoch": 1.3298531620466907, "eval_acc": 0.7444829911611593, "eval_loss": 0.8050174117088318, "eval_runtime": 1269.7274, "eval_samples_per_second": 28.345, "eval_steps_per_second": 14.173, "step": 57000 }, { "acc": 0.79596758, "epoch": 1.3300864696189796, "grad_norm": 6.75, "learning_rate": 2.6645675608772554e-06, "loss": 0.73099294, "memory(GiB)": 147.13, "step": 57010, "train_speed(iter/s)": 0.200594 }, { "acc": 0.76844263, "epoch": 1.3303197771912685, "grad_norm": 6.03125, "learning_rate": 2.662897353120983e-06, "loss": 0.82072315, "memory(GiB)": 147.13, "step": 57020, "train_speed(iter/s)": 0.200613 }, { "acc": 0.77861929, "epoch": 1.3305530847635574, "grad_norm": 8.375, "learning_rate": 2.6612274789854326e-06, "loss": 0.75851078, "memory(GiB)": 147.13, "step": 57030, "train_speed(iter/s)": 0.200631 }, { "acc": 0.77308245, "epoch": 1.3307863923358463, "grad_norm": 6.5, "learning_rate": 2.659557938708982e-06, "loss": 0.81746635, "memory(GiB)": 147.13, "step": 57040, "train_speed(iter/s)": 0.20065 }, { "acc": 0.79103942, "epoch": 1.3310196999081352, "grad_norm": 6.78125, "learning_rate": 2.657888732529956e-06, "loss": 0.75392189, "memory(GiB)": 147.13, "step": 57050, "train_speed(iter/s)": 0.200667 }, { "acc": 0.76411114, "epoch": 1.331253007480424, "grad_norm": 5.1875, "learning_rate": 2.656219860686633e-06, "loss": 0.86522112, "memory(GiB)": 147.13, "step": 57060, "train_speed(iter/s)": 0.200687 }, { "acc": 0.77968445, "epoch": 1.331486315052713, "grad_norm": 5.9375, "learning_rate": 2.6545513234172413e-06, "loss": 0.79840469, "memory(GiB)": 147.13, "step": 57070, "train_speed(iter/s)": 0.200705 }, { "acc": 0.79253349, "epoch": 1.3317196226250019, "grad_norm": 4.875, "learning_rate": 2.65288312095997e-06, "loss": 0.7571218, "memory(GiB)": 147.13, "step": 57080, "train_speed(iter/s)": 0.200724 }, { "acc": 0.78285084, "epoch": 1.3319529301972908, "grad_norm": 3.703125, "learning_rate": 2.651215253552951e-06, "loss": 0.77423506, "memory(GiB)": 147.13, "step": 57090, "train_speed(iter/s)": 0.200742 }, { "acc": 0.78827653, "epoch": 1.3321862377695797, "grad_norm": 5.8125, "learning_rate": 2.6495477214342704e-06, "loss": 0.74848604, "memory(GiB)": 147.13, "step": 57100, "train_speed(iter/s)": 0.20076 }, { "acc": 0.77849426, "epoch": 1.3324195453418686, "grad_norm": 5.21875, "learning_rate": 2.647880524841971e-06, "loss": 0.79774141, "memory(GiB)": 147.13, "step": 57110, "train_speed(iter/s)": 0.200778 }, { "acc": 0.78932242, "epoch": 1.3326528529141575, "grad_norm": 6.125, "learning_rate": 2.646213664014042e-06, "loss": 0.74397902, "memory(GiB)": 147.13, "step": 57120, "train_speed(iter/s)": 0.200797 }, { "acc": 0.78925648, "epoch": 1.3328861604864464, "grad_norm": 4.65625, "learning_rate": 2.6445471391884304e-06, "loss": 0.76678162, "memory(GiB)": 147.13, "step": 57130, "train_speed(iter/s)": 0.200815 }, { "acc": 0.78613939, "epoch": 1.3331194680587353, "grad_norm": 5.0625, "learning_rate": 2.6428809506030306e-06, "loss": 0.78517361, "memory(GiB)": 147.13, "step": 57140, "train_speed(iter/s)": 0.200833 }, { "acc": 0.77853289, "epoch": 1.333352775631024, "grad_norm": 5.75, "learning_rate": 2.641215098495688e-06, "loss": 0.80418797, "memory(GiB)": 147.13, "step": 57150, "train_speed(iter/s)": 0.20085 }, { "acc": 0.7710371, "epoch": 1.333586083203313, "grad_norm": 7.1875, "learning_rate": 2.639549583104209e-06, "loss": 0.82036028, "memory(GiB)": 147.13, "step": 57160, "train_speed(iter/s)": 0.200869 }, { "acc": 0.7829278, "epoch": 1.3338193907756017, "grad_norm": 6.40625, "learning_rate": 2.6378844046663375e-06, "loss": 0.78318968, "memory(GiB)": 147.13, "step": 57170, "train_speed(iter/s)": 0.200887 }, { "acc": 0.77414856, "epoch": 1.3340526983478909, "grad_norm": 4.59375, "learning_rate": 2.636219563419783e-06, "loss": 0.79386172, "memory(GiB)": 147.13, "step": 57180, "train_speed(iter/s)": 0.200905 }, { "acc": 0.78854723, "epoch": 1.3342860059201795, "grad_norm": 6.0625, "learning_rate": 2.6345550596021967e-06, "loss": 0.76046, "memory(GiB)": 147.13, "step": 57190, "train_speed(iter/s)": 0.200924 }, { "acc": 0.77748866, "epoch": 1.3345193134924687, "grad_norm": 5.9375, "learning_rate": 2.632890893451191e-06, "loss": 0.80022764, "memory(GiB)": 147.13, "step": 57200, "train_speed(iter/s)": 0.200942 }, { "acc": 0.78416834, "epoch": 1.3347526210647573, "grad_norm": 4.5, "learning_rate": 2.63122706520432e-06, "loss": 0.79278574, "memory(GiB)": 147.13, "step": 57210, "train_speed(iter/s)": 0.200961 }, { "acc": 0.7817687, "epoch": 1.3349859286370465, "grad_norm": 5.59375, "learning_rate": 2.6295635750990998e-06, "loss": 0.80557747, "memory(GiB)": 147.13, "step": 57220, "train_speed(iter/s)": 0.200979 }, { "acc": 0.7696763, "epoch": 1.3352192362093351, "grad_norm": 5.90625, "learning_rate": 2.627900423372991e-06, "loss": 0.81974831, "memory(GiB)": 147.13, "step": 57230, "train_speed(iter/s)": 0.200998 }, { "acc": 0.79114909, "epoch": 1.335452543781624, "grad_norm": 4.28125, "learning_rate": 2.626237610263406e-06, "loss": 0.74017973, "memory(GiB)": 147.13, "step": 57240, "train_speed(iter/s)": 0.201017 }, { "acc": 0.7918015, "epoch": 1.335685851353913, "grad_norm": 7.0625, "learning_rate": 2.6245751360077133e-06, "loss": 0.72015886, "memory(GiB)": 147.13, "step": 57250, "train_speed(iter/s)": 0.201035 }, { "acc": 0.775524, "epoch": 1.3359191589262018, "grad_norm": 7.34375, "learning_rate": 2.622913000843228e-06, "loss": 0.79100475, "memory(GiB)": 147.13, "step": 57260, "train_speed(iter/s)": 0.201053 }, { "acc": 0.7828311, "epoch": 1.3361524664984907, "grad_norm": 5.15625, "learning_rate": 2.6212512050072236e-06, "loss": 0.77825603, "memory(GiB)": 147.13, "step": 57270, "train_speed(iter/s)": 0.201071 }, { "acc": 0.79701376, "epoch": 1.3363857740707796, "grad_norm": 5.15625, "learning_rate": 2.6195897487369195e-06, "loss": 0.71078596, "memory(GiB)": 147.13, "step": 57280, "train_speed(iter/s)": 0.201089 }, { "acc": 0.77222824, "epoch": 1.3366190816430685, "grad_norm": 6.03125, "learning_rate": 2.6179286322694866e-06, "loss": 0.80330944, "memory(GiB)": 147.13, "step": 57290, "train_speed(iter/s)": 0.201107 }, { "acc": 0.7781415, "epoch": 1.3368523892153574, "grad_norm": 6.4375, "learning_rate": 2.6162678558420484e-06, "loss": 0.79362383, "memory(GiB)": 147.13, "step": 57300, "train_speed(iter/s)": 0.201126 }, { "acc": 0.76344051, "epoch": 1.3370856967876463, "grad_norm": 6.40625, "learning_rate": 2.6146074196916806e-06, "loss": 0.84555454, "memory(GiB)": 147.13, "step": 57310, "train_speed(iter/s)": 0.201146 }, { "acc": 0.79169607, "epoch": 1.3373190043599352, "grad_norm": 5.03125, "learning_rate": 2.6129473240554126e-06, "loss": 0.7247673, "memory(GiB)": 147.13, "step": 57320, "train_speed(iter/s)": 0.201165 }, { "acc": 0.77501116, "epoch": 1.3375523119322241, "grad_norm": 5.1875, "learning_rate": 2.6112875691702176e-06, "loss": 0.79619384, "memory(GiB)": 147.13, "step": 57330, "train_speed(iter/s)": 0.201184 }, { "acc": 0.79097958, "epoch": 1.337785619504513, "grad_norm": 10.5, "learning_rate": 2.609628155273032e-06, "loss": 0.75640821, "memory(GiB)": 147.13, "step": 57340, "train_speed(iter/s)": 0.201202 }, { "acc": 0.77276196, "epoch": 1.338018927076802, "grad_norm": 4.96875, "learning_rate": 2.6079690826007307e-06, "loss": 0.83116341, "memory(GiB)": 147.13, "step": 57350, "train_speed(iter/s)": 0.20122 }, { "acc": 0.77244496, "epoch": 1.3382522346490908, "grad_norm": 5.125, "learning_rate": 2.606310351390148e-06, "loss": 0.79508491, "memory(GiB)": 147.13, "step": 57360, "train_speed(iter/s)": 0.201239 }, { "acc": 0.78902702, "epoch": 1.3384855422213797, "grad_norm": 5.125, "learning_rate": 2.6046519618780673e-06, "loss": 0.73824391, "memory(GiB)": 147.13, "step": 57370, "train_speed(iter/s)": 0.201256 }, { "acc": 0.78787184, "epoch": 1.3387188497936686, "grad_norm": 6.4375, "learning_rate": 2.6029939143012228e-06, "loss": 0.75440736, "memory(GiB)": 147.13, "step": 57380, "train_speed(iter/s)": 0.201273 }, { "acc": 0.80366573, "epoch": 1.3389521573659575, "grad_norm": 8.125, "learning_rate": 2.601336208896304e-06, "loss": 0.69952521, "memory(GiB)": 147.13, "step": 57390, "train_speed(iter/s)": 0.201292 }, { "acc": 0.77663946, "epoch": 1.3391854649382464, "grad_norm": 5.5625, "learning_rate": 2.5996788458999404e-06, "loss": 0.80009689, "memory(GiB)": 147.13, "step": 57400, "train_speed(iter/s)": 0.20131 }, { "acc": 0.77313061, "epoch": 1.3394187725105353, "grad_norm": 5.3125, "learning_rate": 2.598021825548727e-06, "loss": 0.81944647, "memory(GiB)": 147.13, "step": 57410, "train_speed(iter/s)": 0.201328 }, { "acc": 0.78171992, "epoch": 1.3396520800828242, "grad_norm": 5.46875, "learning_rate": 2.596365148079197e-06, "loss": 0.77206979, "memory(GiB)": 147.13, "step": 57420, "train_speed(iter/s)": 0.201347 }, { "acc": 0.78008041, "epoch": 1.3398853876551131, "grad_norm": 4.46875, "learning_rate": 2.594708813727847e-06, "loss": 0.79095898, "memory(GiB)": 147.13, "step": 57430, "train_speed(iter/s)": 0.201365 }, { "acc": 0.79991326, "epoch": 1.340118695227402, "grad_norm": 5.78125, "learning_rate": 2.5930528227311148e-06, "loss": 0.69914985, "memory(GiB)": 147.13, "step": 57440, "train_speed(iter/s)": 0.201382 }, { "acc": 0.79570603, "epoch": 1.340352002799691, "grad_norm": 5.1875, "learning_rate": 2.591397175325391e-06, "loss": 0.71980028, "memory(GiB)": 147.13, "step": 57450, "train_speed(iter/s)": 0.201399 }, { "acc": 0.78314619, "epoch": 1.3405853103719798, "grad_norm": 5.53125, "learning_rate": 2.5897418717470224e-06, "loss": 0.77125511, "memory(GiB)": 147.13, "step": 57460, "train_speed(iter/s)": 0.201417 }, { "acc": 0.77202096, "epoch": 1.3408186179442687, "grad_norm": 6.9375, "learning_rate": 2.5880869122322994e-06, "loss": 0.82745008, "memory(GiB)": 147.13, "step": 57470, "train_speed(iter/s)": 0.201435 }, { "acc": 0.77319994, "epoch": 1.3410519255165576, "grad_norm": 5.0, "learning_rate": 2.5864322970174714e-06, "loss": 0.79846964, "memory(GiB)": 147.13, "step": 57480, "train_speed(iter/s)": 0.201454 }, { "acc": 0.79231257, "epoch": 1.3412852330888465, "grad_norm": 5.125, "learning_rate": 2.5847780263387314e-06, "loss": 0.73147917, "memory(GiB)": 147.13, "step": 57490, "train_speed(iter/s)": 0.201472 }, { "acc": 0.77735405, "epoch": 1.3415185406611354, "grad_norm": 6.53125, "learning_rate": 2.583124100432227e-06, "loss": 0.78960352, "memory(GiB)": 147.13, "step": 57500, "train_speed(iter/s)": 0.201491 }, { "epoch": 1.3415185406611354, "eval_acc": 0.7445643444311685, "eval_loss": 0.8049200177192688, "eval_runtime": 1269.9367, "eval_samples_per_second": 28.341, "eval_steps_per_second": 14.171, "step": 57500 }, { "acc": 0.77931428, "epoch": 1.3417518482334243, "grad_norm": 5.0625, "learning_rate": 2.5814705195340527e-06, "loss": 0.79522343, "memory(GiB)": 147.13, "step": 57510, "train_speed(iter/s)": 0.200603 }, { "acc": 0.77775626, "epoch": 1.341985155805713, "grad_norm": 5.3125, "learning_rate": 2.5798172838802616e-06, "loss": 0.79539862, "memory(GiB)": 147.13, "step": 57520, "train_speed(iter/s)": 0.20062 }, { "acc": 0.7882154, "epoch": 1.342218463378002, "grad_norm": 5.75, "learning_rate": 2.5781643937068495e-06, "loss": 0.75900679, "memory(GiB)": 147.13, "step": 57530, "train_speed(iter/s)": 0.200638 }, { "acc": 0.78802242, "epoch": 1.3424517709502908, "grad_norm": 4.3125, "learning_rate": 2.5765118492497654e-06, "loss": 0.75461683, "memory(GiB)": 147.13, "step": 57540, "train_speed(iter/s)": 0.200656 }, { "acc": 0.79044104, "epoch": 1.34268507852258, "grad_norm": 5.40625, "learning_rate": 2.5748596507449118e-06, "loss": 0.75172663, "memory(GiB)": 147.13, "step": 57550, "train_speed(iter/s)": 0.200674 }, { "acc": 0.78111725, "epoch": 1.3429183860948686, "grad_norm": 6.03125, "learning_rate": 2.5732077984281378e-06, "loss": 0.76782961, "memory(GiB)": 147.13, "step": 57560, "train_speed(iter/s)": 0.200691 }, { "acc": 0.78649011, "epoch": 1.3431516936671577, "grad_norm": 6.0, "learning_rate": 2.571556292535247e-06, "loss": 0.77026129, "memory(GiB)": 147.13, "step": 57570, "train_speed(iter/s)": 0.20071 }, { "acc": 0.79622579, "epoch": 1.3433850012394464, "grad_norm": 4.0625, "learning_rate": 2.5699051333019897e-06, "loss": 0.74895239, "memory(GiB)": 147.13, "step": 57580, "train_speed(iter/s)": 0.200728 }, { "acc": 0.77864823, "epoch": 1.3436183088117355, "grad_norm": 5.0, "learning_rate": 2.568254320964067e-06, "loss": 0.78803215, "memory(GiB)": 147.13, "step": 57590, "train_speed(iter/s)": 0.200746 }, { "acc": 0.776053, "epoch": 1.3438516163840242, "grad_norm": 6.46875, "learning_rate": 2.5666038557571355e-06, "loss": 0.81020985, "memory(GiB)": 147.13, "step": 57600, "train_speed(iter/s)": 0.200764 }, { "acc": 0.78326931, "epoch": 1.3440849239563133, "grad_norm": 3.53125, "learning_rate": 2.5649537379167944e-06, "loss": 0.78707523, "memory(GiB)": 147.13, "step": 57610, "train_speed(iter/s)": 0.200781 }, { "acc": 0.76431599, "epoch": 1.344318231528602, "grad_norm": 5.5, "learning_rate": 2.5633039676786044e-06, "loss": 0.8433094, "memory(GiB)": 147.13, "step": 57620, "train_speed(iter/s)": 0.2008 }, { "acc": 0.78314161, "epoch": 1.3445515391008909, "grad_norm": 4.71875, "learning_rate": 2.5616545452780607e-06, "loss": 0.76809273, "memory(GiB)": 147.13, "step": 57630, "train_speed(iter/s)": 0.200819 }, { "acc": 0.78058529, "epoch": 1.3447848466731798, "grad_norm": 5.28125, "learning_rate": 2.5600054709506244e-06, "loss": 0.78540258, "memory(GiB)": 147.13, "step": 57640, "train_speed(iter/s)": 0.200837 }, { "acc": 0.77517385, "epoch": 1.3450181542454687, "grad_norm": 6.75, "learning_rate": 2.5583567449316983e-06, "loss": 0.7901629, "memory(GiB)": 147.13, "step": 57650, "train_speed(iter/s)": 0.200854 }, { "acc": 0.75654125, "epoch": 1.3452514618177576, "grad_norm": 5.28125, "learning_rate": 2.5567083674566363e-06, "loss": 0.87824078, "memory(GiB)": 147.13, "step": 57660, "train_speed(iter/s)": 0.200873 }, { "acc": 0.79099903, "epoch": 1.3454847693900465, "grad_norm": 4.71875, "learning_rate": 2.555060338760746e-06, "loss": 0.74126825, "memory(GiB)": 147.13, "step": 57670, "train_speed(iter/s)": 0.200891 }, { "acc": 0.77133732, "epoch": 1.3457180769623354, "grad_norm": 5.5, "learning_rate": 2.553412659079281e-06, "loss": 0.83065491, "memory(GiB)": 147.13, "step": 57680, "train_speed(iter/s)": 0.200907 }, { "acc": 0.77394104, "epoch": 1.3459513845346243, "grad_norm": 4.5625, "learning_rate": 2.5517653286474486e-06, "loss": 0.7871973, "memory(GiB)": 147.13, "step": 57690, "train_speed(iter/s)": 0.200926 }, { "acc": 0.77455368, "epoch": 1.3461846921069132, "grad_norm": 7.375, "learning_rate": 2.5501183477004036e-06, "loss": 0.81267767, "memory(GiB)": 147.13, "step": 57700, "train_speed(iter/s)": 0.200944 }, { "acc": 0.78425789, "epoch": 1.346417999679202, "grad_norm": 6.65625, "learning_rate": 2.548471716473255e-06, "loss": 0.75778885, "memory(GiB)": 147.13, "step": 57710, "train_speed(iter/s)": 0.200963 }, { "acc": 0.77946496, "epoch": 1.346651307251491, "grad_norm": 5.5625, "learning_rate": 2.546825435201056e-06, "loss": 0.79724245, "memory(GiB)": 147.13, "step": 57720, "train_speed(iter/s)": 0.200981 }, { "acc": 0.76264691, "epoch": 1.3468846148237799, "grad_norm": 6.46875, "learning_rate": 2.5451795041188137e-06, "loss": 0.84872093, "memory(GiB)": 147.13, "step": 57730, "train_speed(iter/s)": 0.200998 }, { "acc": 0.78879805, "epoch": 1.3471179223960688, "grad_norm": 6.75, "learning_rate": 2.543533923461484e-06, "loss": 0.76141896, "memory(GiB)": 147.13, "step": 57740, "train_speed(iter/s)": 0.201016 }, { "acc": 0.78948154, "epoch": 1.3473512299683577, "grad_norm": 4.09375, "learning_rate": 2.541888693463971e-06, "loss": 0.73227172, "memory(GiB)": 147.13, "step": 57750, "train_speed(iter/s)": 0.201033 }, { "acc": 0.77823572, "epoch": 1.3475845375406466, "grad_norm": 4.9375, "learning_rate": 2.540243814361135e-06, "loss": 0.80870161, "memory(GiB)": 147.13, "step": 57760, "train_speed(iter/s)": 0.201051 }, { "acc": 0.77902465, "epoch": 1.3478178451129355, "grad_norm": 7.4375, "learning_rate": 2.5385992863877783e-06, "loss": 0.80094547, "memory(GiB)": 147.13, "step": 57770, "train_speed(iter/s)": 0.201069 }, { "acc": 0.77547317, "epoch": 1.3480511526852244, "grad_norm": 7.125, "learning_rate": 2.5369551097786606e-06, "loss": 0.79117107, "memory(GiB)": 147.13, "step": 57780, "train_speed(iter/s)": 0.201086 }, { "acc": 0.78636332, "epoch": 1.3482844602575133, "grad_norm": 4.75, "learning_rate": 2.5353112847684846e-06, "loss": 0.77924938, "memory(GiB)": 147.13, "step": 57790, "train_speed(iter/s)": 0.201103 }, { "acc": 0.79167924, "epoch": 1.3485177678298021, "grad_norm": 6.71875, "learning_rate": 2.5336678115919056e-06, "loss": 0.7614068, "memory(GiB)": 147.13, "step": 57800, "train_speed(iter/s)": 0.201122 }, { "acc": 0.773736, "epoch": 1.348751075402091, "grad_norm": 5.40625, "learning_rate": 2.532024690483531e-06, "loss": 0.81560125, "memory(GiB)": 147.13, "step": 57810, "train_speed(iter/s)": 0.20114 }, { "acc": 0.78321466, "epoch": 1.34898438297438, "grad_norm": 6.5, "learning_rate": 2.5303819216779134e-06, "loss": 0.76348276, "memory(GiB)": 147.13, "step": 57820, "train_speed(iter/s)": 0.201158 }, { "acc": 0.76863384, "epoch": 1.3492176905466688, "grad_norm": 5.15625, "learning_rate": 2.528739505409561e-06, "loss": 0.84750042, "memory(GiB)": 147.13, "step": 57830, "train_speed(iter/s)": 0.201177 }, { "acc": 0.79967451, "epoch": 1.3494509981189577, "grad_norm": 4.59375, "learning_rate": 2.5270974419129248e-06, "loss": 0.71103725, "memory(GiB)": 147.13, "step": 57840, "train_speed(iter/s)": 0.201194 }, { "acc": 0.77464585, "epoch": 1.3496843056912466, "grad_norm": 9.125, "learning_rate": 2.525455731422414e-06, "loss": 0.8018692, "memory(GiB)": 147.13, "step": 57850, "train_speed(iter/s)": 0.201213 }, { "acc": 0.79444895, "epoch": 1.3499176132635355, "grad_norm": 5.0, "learning_rate": 2.5238143741723743e-06, "loss": 0.72763662, "memory(GiB)": 147.13, "step": 57860, "train_speed(iter/s)": 0.201231 }, { "acc": 0.78565588, "epoch": 1.3501509208358244, "grad_norm": 6.0, "learning_rate": 2.5221733703971165e-06, "loss": 0.76235085, "memory(GiB)": 147.13, "step": 57870, "train_speed(iter/s)": 0.201249 }, { "acc": 0.78204556, "epoch": 1.3503842284081133, "grad_norm": 7.15625, "learning_rate": 2.5205327203308887e-06, "loss": 0.76153746, "memory(GiB)": 147.13, "step": 57880, "train_speed(iter/s)": 0.201268 }, { "acc": 0.78484373, "epoch": 1.3506175359804022, "grad_norm": 5.15625, "learning_rate": 2.518892424207894e-06, "loss": 0.77373028, "memory(GiB)": 147.13, "step": 57890, "train_speed(iter/s)": 0.201285 }, { "acc": 0.7609642, "epoch": 1.3508508435526911, "grad_norm": 6.65625, "learning_rate": 2.517252482262286e-06, "loss": 0.85898237, "memory(GiB)": 147.13, "step": 57900, "train_speed(iter/s)": 0.201304 }, { "acc": 0.78211231, "epoch": 1.3510841511249798, "grad_norm": 4.5625, "learning_rate": 2.515612894728164e-06, "loss": 0.78129263, "memory(GiB)": 147.13, "step": 57910, "train_speed(iter/s)": 0.201321 }, { "acc": 0.77921953, "epoch": 1.351317458697269, "grad_norm": 6.5, "learning_rate": 2.5139736618395804e-06, "loss": 0.7952693, "memory(GiB)": 147.13, "step": 57920, "train_speed(iter/s)": 0.20134 }, { "acc": 0.78653154, "epoch": 1.3515507662695576, "grad_norm": 6.09375, "learning_rate": 2.5123347838305354e-06, "loss": 0.75747843, "memory(GiB)": 147.13, "step": 57930, "train_speed(iter/s)": 0.201359 }, { "acc": 0.77528124, "epoch": 1.3517840738418467, "grad_norm": 6.96875, "learning_rate": 2.510696260934975e-06, "loss": 0.80986633, "memory(GiB)": 147.13, "step": 57940, "train_speed(iter/s)": 0.201375 }, { "acc": 0.77857499, "epoch": 1.3520173814141354, "grad_norm": 6.21875, "learning_rate": 2.509058093386802e-06, "loss": 0.79531021, "memory(GiB)": 147.13, "step": 57950, "train_speed(iter/s)": 0.201393 }, { "acc": 0.75173092, "epoch": 1.3522506889864245, "grad_norm": 5.0625, "learning_rate": 2.507420281419862e-06, "loss": 0.89647017, "memory(GiB)": 147.13, "step": 57960, "train_speed(iter/s)": 0.201411 }, { "acc": 0.77753296, "epoch": 1.3524839965587132, "grad_norm": 4.90625, "learning_rate": 2.505782825267954e-06, "loss": 0.79882941, "memory(GiB)": 147.13, "step": 57970, "train_speed(iter/s)": 0.201428 }, { "acc": 0.77016959, "epoch": 1.3527173041310023, "grad_norm": 4.5625, "learning_rate": 2.5041457251648204e-06, "loss": 0.84093132, "memory(GiB)": 147.13, "step": 57980, "train_speed(iter/s)": 0.201445 }, { "acc": 0.78002801, "epoch": 1.352950611703291, "grad_norm": 5.0, "learning_rate": 2.502508981344162e-06, "loss": 0.78813906, "memory(GiB)": 147.13, "step": 57990, "train_speed(iter/s)": 0.201463 }, { "acc": 0.78480358, "epoch": 1.35318391927558, "grad_norm": 5.65625, "learning_rate": 2.5008725940396182e-06, "loss": 0.76581554, "memory(GiB)": 147.13, "step": 58000, "train_speed(iter/s)": 0.20148 }, { "epoch": 1.35318391927558, "eval_acc": 0.7446276014029275, "eval_loss": 0.8048146963119507, "eval_runtime": 1270.2313, "eval_samples_per_second": 28.334, "eval_steps_per_second": 14.167, "step": 58000 }, { "acc": 0.7872107, "epoch": 1.3534172268478688, "grad_norm": 6.875, "learning_rate": 2.499236563484788e-06, "loss": 0.75764198, "memory(GiB)": 147.13, "step": 58010, "train_speed(iter/s)": 0.2006 }, { "acc": 0.79398293, "epoch": 1.3536505344201577, "grad_norm": 4.78125, "learning_rate": 2.4976008899132122e-06, "loss": 0.73490696, "memory(GiB)": 147.13, "step": 58020, "train_speed(iter/s)": 0.200618 }, { "acc": 0.78950262, "epoch": 1.3538838419924466, "grad_norm": 4.96875, "learning_rate": 2.49596557355838e-06, "loss": 0.73174076, "memory(GiB)": 147.13, "step": 58030, "train_speed(iter/s)": 0.200636 }, { "acc": 0.78074455, "epoch": 1.3541171495647355, "grad_norm": 6.6875, "learning_rate": 2.4943306146537365e-06, "loss": 0.79538136, "memory(GiB)": 147.13, "step": 58040, "train_speed(iter/s)": 0.200655 }, { "acc": 0.77637463, "epoch": 1.3543504571370244, "grad_norm": 8.75, "learning_rate": 2.4926960134326684e-06, "loss": 0.79754944, "memory(GiB)": 147.13, "step": 58050, "train_speed(iter/s)": 0.200673 }, { "acc": 0.78122358, "epoch": 1.3545837647093133, "grad_norm": 6.0625, "learning_rate": 2.491061770128518e-06, "loss": 0.77927294, "memory(GiB)": 147.13, "step": 58060, "train_speed(iter/s)": 0.20069 }, { "acc": 0.77235141, "epoch": 1.3548170722816022, "grad_norm": 5.09375, "learning_rate": 2.4894278849745705e-06, "loss": 0.81898794, "memory(GiB)": 147.13, "step": 58070, "train_speed(iter/s)": 0.200707 }, { "acc": 0.77147989, "epoch": 1.355050379853891, "grad_norm": 7.71875, "learning_rate": 2.4877943582040636e-06, "loss": 0.81207123, "memory(GiB)": 147.13, "step": 58080, "train_speed(iter/s)": 0.200726 }, { "acc": 0.79436178, "epoch": 1.35528368742618, "grad_norm": 4.4375, "learning_rate": 2.486161190050182e-06, "loss": 0.73931255, "memory(GiB)": 147.13, "step": 58090, "train_speed(iter/s)": 0.200743 }, { "acc": 0.78587637, "epoch": 1.355516994998469, "grad_norm": 5.8125, "learning_rate": 2.4845283807460587e-06, "loss": 0.76380043, "memory(GiB)": 147.13, "step": 58100, "train_speed(iter/s)": 0.200762 }, { "acc": 0.79440575, "epoch": 1.3557503025707578, "grad_norm": 5.21875, "learning_rate": 2.4828959305247795e-06, "loss": 0.74594207, "memory(GiB)": 147.13, "step": 58110, "train_speed(iter/s)": 0.20078 }, { "acc": 0.79074926, "epoch": 1.3559836101430467, "grad_norm": 5.0625, "learning_rate": 2.4812638396193734e-06, "loss": 0.76517062, "memory(GiB)": 147.13, "step": 58120, "train_speed(iter/s)": 0.200798 }, { "acc": 0.7802947, "epoch": 1.3562169177153356, "grad_norm": 4.6875, "learning_rate": 2.479632108262825e-06, "loss": 0.77691612, "memory(GiB)": 147.13, "step": 58130, "train_speed(iter/s)": 0.200816 }, { "acc": 0.78983259, "epoch": 1.3564502252876245, "grad_norm": 4.90625, "learning_rate": 2.4780007366880584e-06, "loss": 0.75796638, "memory(GiB)": 147.13, "step": 58140, "train_speed(iter/s)": 0.200832 }, { "acc": 0.77917747, "epoch": 1.3566835328599134, "grad_norm": 3.953125, "learning_rate": 2.476369725127956e-06, "loss": 0.79284611, "memory(GiB)": 147.13, "step": 58150, "train_speed(iter/s)": 0.20085 }, { "acc": 0.76637774, "epoch": 1.3569168404322023, "grad_norm": 5.34375, "learning_rate": 2.474739073815342e-06, "loss": 0.84874744, "memory(GiB)": 147.13, "step": 58160, "train_speed(iter/s)": 0.200868 }, { "acc": 0.78275561, "epoch": 1.3571501480044912, "grad_norm": 4.59375, "learning_rate": 2.47310878298299e-06, "loss": 0.79091311, "memory(GiB)": 147.13, "step": 58170, "train_speed(iter/s)": 0.200886 }, { "acc": 0.78089781, "epoch": 1.35738345557678, "grad_norm": 5.5, "learning_rate": 2.4714788528636275e-06, "loss": 0.77343168, "memory(GiB)": 147.13, "step": 58180, "train_speed(iter/s)": 0.200903 }, { "acc": 0.77175679, "epoch": 1.357616763149069, "grad_norm": 4.625, "learning_rate": 2.4698492836899234e-06, "loss": 0.80777512, "memory(GiB)": 147.13, "step": 58190, "train_speed(iter/s)": 0.20092 }, { "acc": 0.78164663, "epoch": 1.3578500707213579, "grad_norm": 4.8125, "learning_rate": 2.4682200756944997e-06, "loss": 0.7826436, "memory(GiB)": 147.13, "step": 58200, "train_speed(iter/s)": 0.200938 }, { "acc": 0.79077911, "epoch": 1.3580833782936468, "grad_norm": 6.03125, "learning_rate": 2.4665912291099225e-06, "loss": 0.74198914, "memory(GiB)": 147.13, "step": 58210, "train_speed(iter/s)": 0.200956 }, { "acc": 0.78291769, "epoch": 1.3583166858659357, "grad_norm": 5.59375, "learning_rate": 2.4649627441687134e-06, "loss": 0.79139719, "memory(GiB)": 147.13, "step": 58220, "train_speed(iter/s)": 0.200973 }, { "acc": 0.77830348, "epoch": 1.3585499934382246, "grad_norm": 5.40625, "learning_rate": 2.463334621103336e-06, "loss": 0.7960639, "memory(GiB)": 147.13, "step": 58230, "train_speed(iter/s)": 0.200991 }, { "acc": 0.77357435, "epoch": 1.3587833010105135, "grad_norm": 6.53125, "learning_rate": 2.461706860146203e-06, "loss": 0.81305132, "memory(GiB)": 147.13, "step": 58240, "train_speed(iter/s)": 0.201008 }, { "acc": 0.76329145, "epoch": 1.3590166085828024, "grad_norm": 6.3125, "learning_rate": 2.4600794615296797e-06, "loss": 0.85375662, "memory(GiB)": 147.13, "step": 58250, "train_speed(iter/s)": 0.201026 }, { "acc": 0.78082247, "epoch": 1.3592499161550913, "grad_norm": 17.125, "learning_rate": 2.4584524254860736e-06, "loss": 0.78417535, "memory(GiB)": 147.13, "step": 58260, "train_speed(iter/s)": 0.201043 }, { "acc": 0.77918062, "epoch": 1.3594832237273802, "grad_norm": 5.0, "learning_rate": 2.4568257522476476e-06, "loss": 0.80585155, "memory(GiB)": 147.13, "step": 58270, "train_speed(iter/s)": 0.201061 }, { "acc": 0.76542234, "epoch": 1.359716531299669, "grad_norm": 4.34375, "learning_rate": 2.455199442046607e-06, "loss": 0.83563366, "memory(GiB)": 147.13, "step": 58280, "train_speed(iter/s)": 0.20108 }, { "acc": 0.80455227, "epoch": 1.359949838871958, "grad_norm": 5.25, "learning_rate": 2.453573495115104e-06, "loss": 0.68476171, "memory(GiB)": 147.13, "step": 58290, "train_speed(iter/s)": 0.201099 }, { "acc": 0.80062113, "epoch": 1.3601831464442466, "grad_norm": 4.21875, "learning_rate": 2.4519479116852476e-06, "loss": 0.7182889, "memory(GiB)": 147.13, "step": 58300, "train_speed(iter/s)": 0.201116 }, { "acc": 0.78591204, "epoch": 1.3604164540165358, "grad_norm": 5.71875, "learning_rate": 2.450322691989086e-06, "loss": 0.7478971, "memory(GiB)": 147.13, "step": 58310, "train_speed(iter/s)": 0.201134 }, { "acc": 0.78090954, "epoch": 1.3606497615888244, "grad_norm": 6.5625, "learning_rate": 2.4486978362586196e-06, "loss": 0.79978738, "memory(GiB)": 147.13, "step": 58320, "train_speed(iter/s)": 0.201151 }, { "acc": 0.77255669, "epoch": 1.3608830691611136, "grad_norm": 4.96875, "learning_rate": 2.447073344725794e-06, "loss": 0.80740366, "memory(GiB)": 147.13, "step": 58330, "train_speed(iter/s)": 0.201168 }, { "acc": 0.77630148, "epoch": 1.3611163767334022, "grad_norm": 6.34375, "learning_rate": 2.4454492176225087e-06, "loss": 0.79302568, "memory(GiB)": 147.13, "step": 58340, "train_speed(iter/s)": 0.201186 }, { "acc": 0.78025756, "epoch": 1.3613496843056914, "grad_norm": 5.34375, "learning_rate": 2.4438254551806034e-06, "loss": 0.77262883, "memory(GiB)": 147.13, "step": 58350, "train_speed(iter/s)": 0.201204 }, { "acc": 0.78731136, "epoch": 1.36158299187798, "grad_norm": 4.71875, "learning_rate": 2.4422020576318737e-06, "loss": 0.75996304, "memory(GiB)": 147.13, "step": 58360, "train_speed(iter/s)": 0.201221 }, { "acc": 0.75691671, "epoch": 1.3618162994502692, "grad_norm": 6.125, "learning_rate": 2.4405790252080576e-06, "loss": 0.90783367, "memory(GiB)": 147.13, "step": 58370, "train_speed(iter/s)": 0.201239 }, { "acc": 0.78348799, "epoch": 1.3620496070225578, "grad_norm": 6.125, "learning_rate": 2.4389563581408397e-06, "loss": 0.79257278, "memory(GiB)": 147.13, "step": 58380, "train_speed(iter/s)": 0.201255 }, { "acc": 0.76404047, "epoch": 1.3622829145948467, "grad_norm": 7.3125, "learning_rate": 2.4373340566618603e-06, "loss": 0.85965233, "memory(GiB)": 147.13, "step": 58390, "train_speed(iter/s)": 0.201273 }, { "acc": 0.77306118, "epoch": 1.3625162221671356, "grad_norm": 5.21875, "learning_rate": 2.435712121002698e-06, "loss": 0.82709122, "memory(GiB)": 147.13, "step": 58400, "train_speed(iter/s)": 0.20129 }, { "acc": 0.78636055, "epoch": 1.3627495297394245, "grad_norm": 4.59375, "learning_rate": 2.4340905513948866e-06, "loss": 0.76273184, "memory(GiB)": 147.13, "step": 58410, "train_speed(iter/s)": 0.201308 }, { "acc": 0.79748254, "epoch": 1.3629828373117134, "grad_norm": 4.34375, "learning_rate": 2.432469348069904e-06, "loss": 0.73208895, "memory(GiB)": 147.13, "step": 58420, "train_speed(iter/s)": 0.201323 }, { "acc": 0.79371405, "epoch": 1.3632161448840023, "grad_norm": 5.9375, "learning_rate": 2.4308485112591764e-06, "loss": 0.73580298, "memory(GiB)": 147.13, "step": 58430, "train_speed(iter/s)": 0.20134 }, { "acc": 0.79239931, "epoch": 1.3634494524562912, "grad_norm": 6.84375, "learning_rate": 2.429228041194077e-06, "loss": 0.74616742, "memory(GiB)": 147.13, "step": 58440, "train_speed(iter/s)": 0.201358 }, { "acc": 0.75693569, "epoch": 1.3636827600285801, "grad_norm": 4.5, "learning_rate": 2.4276079381059258e-06, "loss": 0.89039459, "memory(GiB)": 147.13, "step": 58450, "train_speed(iter/s)": 0.201376 }, { "acc": 0.76860232, "epoch": 1.363916067600869, "grad_norm": 7.15625, "learning_rate": 2.4259882022259968e-06, "loss": 0.82961311, "memory(GiB)": 147.13, "step": 58460, "train_speed(iter/s)": 0.201394 }, { "acc": 0.79684687, "epoch": 1.364149375173158, "grad_norm": 5.25, "learning_rate": 2.424368833785502e-06, "loss": 0.72636261, "memory(GiB)": 147.13, "step": 58470, "train_speed(iter/s)": 0.201411 }, { "acc": 0.77321219, "epoch": 1.3643826827454468, "grad_norm": 5.125, "learning_rate": 2.4227498330156095e-06, "loss": 0.82248058, "memory(GiB)": 147.13, "step": 58480, "train_speed(iter/s)": 0.20143 }, { "acc": 0.77851667, "epoch": 1.3646159903177357, "grad_norm": 5.4375, "learning_rate": 2.421131200147428e-06, "loss": 0.81334505, "memory(GiB)": 147.13, "step": 58490, "train_speed(iter/s)": 0.201447 }, { "acc": 0.78208332, "epoch": 1.3648492978900246, "grad_norm": 5.15625, "learning_rate": 2.4195129354120204e-06, "loss": 0.78902369, "memory(GiB)": 147.13, "step": 58500, "train_speed(iter/s)": 0.201465 }, { "epoch": 1.3648492978900246, "eval_acc": 0.7446186333259187, "eval_loss": 0.8047336339950562, "eval_runtime": 1268.6994, "eval_samples_per_second": 28.368, "eval_steps_per_second": 14.185, "step": 58500 }, { "acc": 0.78909907, "epoch": 1.3650826054623135, "grad_norm": 4.84375, "learning_rate": 2.4178950390403917e-06, "loss": 0.74475222, "memory(GiB)": 147.13, "step": 58510, "train_speed(iter/s)": 0.200594 }, { "acc": 0.75878892, "epoch": 1.3653159130346024, "grad_norm": 7.4375, "learning_rate": 2.416277511263494e-06, "loss": 0.86468029, "memory(GiB)": 147.13, "step": 58520, "train_speed(iter/s)": 0.200611 }, { "acc": 0.77081804, "epoch": 1.3655492206068913, "grad_norm": 4.6875, "learning_rate": 2.4146603523122347e-06, "loss": 0.83188496, "memory(GiB)": 147.13, "step": 58530, "train_speed(iter/s)": 0.200629 }, { "acc": 0.78759122, "epoch": 1.3657825281791802, "grad_norm": 6.4375, "learning_rate": 2.413043562417456e-06, "loss": 0.7472733, "memory(GiB)": 147.13, "step": 58540, "train_speed(iter/s)": 0.200647 }, { "acc": 0.79054499, "epoch": 1.3660158357514691, "grad_norm": 6.25, "learning_rate": 2.4114271418099583e-06, "loss": 0.77312899, "memory(GiB)": 147.13, "step": 58550, "train_speed(iter/s)": 0.200666 }, { "acc": 0.76001344, "epoch": 1.366249143323758, "grad_norm": 5.6875, "learning_rate": 2.4098110907204824e-06, "loss": 0.86084118, "memory(GiB)": 147.13, "step": 58560, "train_speed(iter/s)": 0.200683 }, { "acc": 0.7808567, "epoch": 1.366482450896047, "grad_norm": 5.0, "learning_rate": 2.4081954093797234e-06, "loss": 0.81140366, "memory(GiB)": 147.13, "step": 58570, "train_speed(iter/s)": 0.200701 }, { "acc": 0.78830757, "epoch": 1.3667157584683358, "grad_norm": 5.375, "learning_rate": 2.406580098018316e-06, "loss": 0.74808269, "memory(GiB)": 147.13, "step": 58580, "train_speed(iter/s)": 0.200719 }, { "acc": 0.78679028, "epoch": 1.3669490660406247, "grad_norm": 5.84375, "learning_rate": 2.4049651568668447e-06, "loss": 0.76290827, "memory(GiB)": 147.13, "step": 58590, "train_speed(iter/s)": 0.200738 }, { "acc": 0.77562418, "epoch": 1.3671823736129136, "grad_norm": 5.9375, "learning_rate": 2.403350586155845e-06, "loss": 0.82148495, "memory(GiB)": 147.13, "step": 58600, "train_speed(iter/s)": 0.200756 }, { "acc": 0.76451578, "epoch": 1.3674156811852025, "grad_norm": 5.46875, "learning_rate": 2.4017363861157927e-06, "loss": 0.85645256, "memory(GiB)": 147.13, "step": 58610, "train_speed(iter/s)": 0.200775 }, { "acc": 0.77976284, "epoch": 1.3676489887574914, "grad_norm": 5.96875, "learning_rate": 2.400122556977119e-06, "loss": 0.79148855, "memory(GiB)": 147.13, "step": 58620, "train_speed(iter/s)": 0.200793 }, { "acc": 0.77969866, "epoch": 1.3678822963297803, "grad_norm": 3.96875, "learning_rate": 2.398509098970193e-06, "loss": 0.77951374, "memory(GiB)": 147.13, "step": 58630, "train_speed(iter/s)": 0.200811 }, { "acc": 0.78442101, "epoch": 1.3681156039020692, "grad_norm": 4.0625, "learning_rate": 2.3968960123253392e-06, "loss": 0.76123886, "memory(GiB)": 147.13, "step": 58640, "train_speed(iter/s)": 0.20083 }, { "acc": 0.7682898, "epoch": 1.368348911474358, "grad_norm": 7.375, "learning_rate": 2.3952832972728234e-06, "loss": 0.83347187, "memory(GiB)": 147.13, "step": 58650, "train_speed(iter/s)": 0.200847 }, { "acc": 0.78368082, "epoch": 1.368582219046647, "grad_norm": 5.5625, "learning_rate": 2.39367095404286e-06, "loss": 0.78032255, "memory(GiB)": 147.13, "step": 58660, "train_speed(iter/s)": 0.200866 }, { "acc": 0.7843792, "epoch": 1.368815526618936, "grad_norm": 4.53125, "learning_rate": 2.392058982865611e-06, "loss": 0.76798601, "memory(GiB)": 147.13, "step": 58670, "train_speed(iter/s)": 0.200883 }, { "acc": 0.77215185, "epoch": 1.3690488341912248, "grad_norm": 7.34375, "learning_rate": 2.3904473839711826e-06, "loss": 0.81567993, "memory(GiB)": 147.13, "step": 58680, "train_speed(iter/s)": 0.200902 }, { "acc": 0.79290199, "epoch": 1.3692821417635135, "grad_norm": 5.96875, "learning_rate": 2.388836157589634e-06, "loss": 0.75185156, "memory(GiB)": 147.13, "step": 58690, "train_speed(iter/s)": 0.20092 }, { "acc": 0.78097115, "epoch": 1.3695154493358026, "grad_norm": 5.5, "learning_rate": 2.3872253039509637e-06, "loss": 0.78168497, "memory(GiB)": 147.13, "step": 58700, "train_speed(iter/s)": 0.200937 }, { "acc": 0.76524758, "epoch": 1.3697487569080913, "grad_norm": 6.3125, "learning_rate": 2.3856148232851237e-06, "loss": 0.90382605, "memory(GiB)": 147.13, "step": 58710, "train_speed(iter/s)": 0.200955 }, { "acc": 0.78163342, "epoch": 1.3699820644803804, "grad_norm": 5.53125, "learning_rate": 2.384004715822009e-06, "loss": 0.78530493, "memory(GiB)": 147.13, "step": 58720, "train_speed(iter/s)": 0.200972 }, { "acc": 0.78916397, "epoch": 1.370215372052669, "grad_norm": 9.8125, "learning_rate": 2.3823949817914584e-06, "loss": 0.75205698, "memory(GiB)": 147.13, "step": 58730, "train_speed(iter/s)": 0.20099 }, { "acc": 0.79455929, "epoch": 1.3704486796249582, "grad_norm": 6.84375, "learning_rate": 2.380785621423266e-06, "loss": 0.72274475, "memory(GiB)": 147.13, "step": 58740, "train_speed(iter/s)": 0.201007 }, { "acc": 0.78989534, "epoch": 1.3706819871972469, "grad_norm": 6.0625, "learning_rate": 2.379176634947163e-06, "loss": 0.7328239, "memory(GiB)": 147.13, "step": 58750, "train_speed(iter/s)": 0.201025 }, { "acc": 0.77720261, "epoch": 1.370915294769536, "grad_norm": 5.75, "learning_rate": 2.377568022592838e-06, "loss": 0.79559016, "memory(GiB)": 147.13, "step": 58760, "train_speed(iter/s)": 0.201042 }, { "acc": 0.77461166, "epoch": 1.3711486023418247, "grad_norm": 4.15625, "learning_rate": 2.3759597845899123e-06, "loss": 0.79362354, "memory(GiB)": 147.13, "step": 58770, "train_speed(iter/s)": 0.201059 }, { "acc": 0.78339438, "epoch": 1.3713819099141136, "grad_norm": 9.4375, "learning_rate": 2.374351921167967e-06, "loss": 0.74716663, "memory(GiB)": 147.13, "step": 58780, "train_speed(iter/s)": 0.201077 }, { "acc": 0.78279791, "epoch": 1.3716152174864025, "grad_norm": 5.34375, "learning_rate": 2.37274443255652e-06, "loss": 0.80144253, "memory(GiB)": 147.13, "step": 58790, "train_speed(iter/s)": 0.201094 }, { "acc": 0.78304019, "epoch": 1.3718485250586914, "grad_norm": 4.53125, "learning_rate": 2.3711373189850444e-06, "loss": 0.76427855, "memory(GiB)": 147.13, "step": 58800, "train_speed(iter/s)": 0.201111 }, { "acc": 0.76471519, "epoch": 1.3720818326309803, "grad_norm": 5.25, "learning_rate": 2.369530580682953e-06, "loss": 0.83371811, "memory(GiB)": 147.13, "step": 58810, "train_speed(iter/s)": 0.201129 }, { "acc": 0.7774107, "epoch": 1.3723151402032692, "grad_norm": 5.28125, "learning_rate": 2.367924217879604e-06, "loss": 0.80156479, "memory(GiB)": 147.13, "step": 58820, "train_speed(iter/s)": 0.201146 }, { "acc": 0.78052721, "epoch": 1.372548447775558, "grad_norm": 5.0625, "learning_rate": 2.3663182308043115e-06, "loss": 0.79353514, "memory(GiB)": 147.13, "step": 58830, "train_speed(iter/s)": 0.201163 }, { "acc": 0.77628222, "epoch": 1.372781755347847, "grad_norm": 6.34375, "learning_rate": 2.3647126196863234e-06, "loss": 0.78026848, "memory(GiB)": 147.13, "step": 58840, "train_speed(iter/s)": 0.20118 }, { "acc": 0.7791482, "epoch": 1.3730150629201359, "grad_norm": 6.0625, "learning_rate": 2.3631073847548457e-06, "loss": 0.80607452, "memory(GiB)": 147.13, "step": 58850, "train_speed(iter/s)": 0.201197 }, { "acc": 0.76585202, "epoch": 1.3732483704924248, "grad_norm": 4.46875, "learning_rate": 2.3615025262390228e-06, "loss": 0.8779665, "memory(GiB)": 147.13, "step": 58860, "train_speed(iter/s)": 0.201214 }, { "acc": 0.77455015, "epoch": 1.3734816780647137, "grad_norm": 5.59375, "learning_rate": 2.3598980443679483e-06, "loss": 0.80353298, "memory(GiB)": 147.13, "step": 58870, "train_speed(iter/s)": 0.201232 }, { "acc": 0.77604198, "epoch": 1.3737149856370026, "grad_norm": 5.625, "learning_rate": 2.3582939393706604e-06, "loss": 0.79859333, "memory(GiB)": 147.13, "step": 58880, "train_speed(iter/s)": 0.201249 }, { "acc": 0.79776068, "epoch": 1.3739482932092915, "grad_norm": 8.8125, "learning_rate": 2.3566902114761435e-06, "loss": 0.71753788, "memory(GiB)": 147.13, "step": 58890, "train_speed(iter/s)": 0.201267 }, { "acc": 0.78484306, "epoch": 1.3741816007815804, "grad_norm": 4.40625, "learning_rate": 2.3550868609133326e-06, "loss": 0.76917105, "memory(GiB)": 147.13, "step": 58900, "train_speed(iter/s)": 0.201285 }, { "acc": 0.78789654, "epoch": 1.3744149083538693, "grad_norm": 5.65625, "learning_rate": 2.3534838879111026e-06, "loss": 0.74949794, "memory(GiB)": 147.13, "step": 58910, "train_speed(iter/s)": 0.201303 }, { "acc": 0.7702579, "epoch": 1.3746482159261582, "grad_norm": 4.1875, "learning_rate": 2.35188129269828e-06, "loss": 0.82006226, "memory(GiB)": 147.13, "step": 58920, "train_speed(iter/s)": 0.201321 }, { "acc": 0.77914472, "epoch": 1.374881523498447, "grad_norm": 6.875, "learning_rate": 2.3502790755036324e-06, "loss": 0.80082273, "memory(GiB)": 147.13, "step": 58930, "train_speed(iter/s)": 0.201338 }, { "acc": 0.78248625, "epoch": 1.375114831070736, "grad_norm": 6.71875, "learning_rate": 2.3486772365558786e-06, "loss": 0.80548611, "memory(GiB)": 147.13, "step": 58940, "train_speed(iter/s)": 0.201356 }, { "acc": 0.7839232, "epoch": 1.3753481386430249, "grad_norm": 6.125, "learning_rate": 2.3470757760836794e-06, "loss": 0.75650473, "memory(GiB)": 147.13, "step": 58950, "train_speed(iter/s)": 0.201373 }, { "acc": 0.77035446, "epoch": 1.3755814462153138, "grad_norm": 4.40625, "learning_rate": 2.34547469431564e-06, "loss": 0.84554367, "memory(GiB)": 147.13, "step": 58960, "train_speed(iter/s)": 0.201391 }, { "acc": 0.77048821, "epoch": 1.3758147537876027, "grad_norm": 5.0, "learning_rate": 2.3438739914803193e-06, "loss": 0.82956161, "memory(GiB)": 147.13, "step": 58970, "train_speed(iter/s)": 0.201409 }, { "acc": 0.76621504, "epoch": 1.3760480613598916, "grad_norm": 5.0, "learning_rate": 2.3422736678062126e-06, "loss": 0.85359306, "memory(GiB)": 147.13, "step": 58980, "train_speed(iter/s)": 0.201427 }, { "acc": 0.8010005, "epoch": 1.3762813689321804, "grad_norm": 4.375, "learning_rate": 2.3406737235217714e-06, "loss": 0.69008727, "memory(GiB)": 147.13, "step": 58990, "train_speed(iter/s)": 0.201445 }, { "acc": 0.78054056, "epoch": 1.3765146765044693, "grad_norm": 5.65625, "learning_rate": 2.33907415885538e-06, "loss": 0.75995293, "memory(GiB)": 147.13, "step": 59000, "train_speed(iter/s)": 0.201462 }, { "epoch": 1.3765146765044693, "eval_acc": 0.7445920493833567, "eval_loss": 0.8047448992729187, "eval_runtime": 1269.8518, "eval_samples_per_second": 28.343, "eval_steps_per_second": 14.172, "step": 59000 }, { "acc": 0.78174009, "epoch": 1.3767479840767582, "grad_norm": 4.40625, "learning_rate": 2.3374749740353815e-06, "loss": 0.77599964, "memory(GiB)": 147.13, "step": 59010, "train_speed(iter/s)": 0.200599 }, { "acc": 0.78817472, "epoch": 1.3769812916490471, "grad_norm": 4.34375, "learning_rate": 2.335876169290056e-06, "loss": 0.76314001, "memory(GiB)": 147.13, "step": 59020, "train_speed(iter/s)": 0.200615 }, { "acc": 0.78163285, "epoch": 1.377214599221336, "grad_norm": 8.5625, "learning_rate": 2.3342777448476326e-06, "loss": 0.76157379, "memory(GiB)": 147.13, "step": 59030, "train_speed(iter/s)": 0.200632 }, { "acc": 0.7947587, "epoch": 1.377447906793625, "grad_norm": 5.71875, "learning_rate": 2.3326797009362884e-06, "loss": 0.72174067, "memory(GiB)": 147.13, "step": 59040, "train_speed(iter/s)": 0.20065 }, { "acc": 0.78288536, "epoch": 1.3776812143659138, "grad_norm": 5.0625, "learning_rate": 2.33108203778414e-06, "loss": 0.78767552, "memory(GiB)": 147.13, "step": 59050, "train_speed(iter/s)": 0.200667 }, { "acc": 0.76853189, "epoch": 1.3779145219382025, "grad_norm": 4.5625, "learning_rate": 2.3294847556192575e-06, "loss": 0.83004322, "memory(GiB)": 147.13, "step": 59060, "train_speed(iter/s)": 0.200685 }, { "acc": 0.76078501, "epoch": 1.3781478295104916, "grad_norm": 6.40625, "learning_rate": 2.32788785466965e-06, "loss": 0.86728783, "memory(GiB)": 147.13, "step": 59070, "train_speed(iter/s)": 0.200703 }, { "acc": 0.77539091, "epoch": 1.3783811370827803, "grad_norm": 5.625, "learning_rate": 2.3262913351632725e-06, "loss": 0.79250236, "memory(GiB)": 147.13, "step": 59080, "train_speed(iter/s)": 0.200721 }, { "acc": 0.77549553, "epoch": 1.3786144446550694, "grad_norm": 4.65625, "learning_rate": 2.3246951973280328e-06, "loss": 0.80941544, "memory(GiB)": 147.13, "step": 59090, "train_speed(iter/s)": 0.200739 }, { "acc": 0.78177032, "epoch": 1.3788477522273581, "grad_norm": 6.3125, "learning_rate": 2.3230994413917767e-06, "loss": 0.77314987, "memory(GiB)": 147.13, "step": 59100, "train_speed(iter/s)": 0.200757 }, { "acc": 0.77774806, "epoch": 1.3790810597996472, "grad_norm": 4.28125, "learning_rate": 2.3215040675822976e-06, "loss": 0.79984112, "memory(GiB)": 147.13, "step": 59110, "train_speed(iter/s)": 0.200775 }, { "acc": 0.78535266, "epoch": 1.379314367371936, "grad_norm": 5.0, "learning_rate": 2.319909076127333e-06, "loss": 0.76313772, "memory(GiB)": 147.13, "step": 59120, "train_speed(iter/s)": 0.200793 }, { "acc": 0.78793802, "epoch": 1.379547674944225, "grad_norm": 4.53125, "learning_rate": 2.3183144672545706e-06, "loss": 0.77101927, "memory(GiB)": 147.13, "step": 59130, "train_speed(iter/s)": 0.200811 }, { "acc": 0.79543657, "epoch": 1.3797809825165137, "grad_norm": 6.125, "learning_rate": 2.3167202411916372e-06, "loss": 0.71729631, "memory(GiB)": 147.13, "step": 59140, "train_speed(iter/s)": 0.200829 }, { "acc": 0.79009199, "epoch": 1.3800142900888028, "grad_norm": 7.875, "learning_rate": 2.315126398166112e-06, "loss": 0.75708284, "memory(GiB)": 147.13, "step": 59150, "train_speed(iter/s)": 0.200845 }, { "acc": 0.78260612, "epoch": 1.3802475976610915, "grad_norm": 6.625, "learning_rate": 2.3135329384055134e-06, "loss": 0.78136015, "memory(GiB)": 147.13, "step": 59160, "train_speed(iter/s)": 0.200861 }, { "acc": 0.75776453, "epoch": 1.3804809052333804, "grad_norm": 4.375, "learning_rate": 2.3119398621373055e-06, "loss": 0.87936306, "memory(GiB)": 147.13, "step": 59170, "train_speed(iter/s)": 0.200879 }, { "acc": 0.78566179, "epoch": 1.3807142128056693, "grad_norm": 6.46875, "learning_rate": 2.3103471695889035e-06, "loss": 0.75471678, "memory(GiB)": 147.13, "step": 59180, "train_speed(iter/s)": 0.200896 }, { "acc": 0.77722082, "epoch": 1.3809475203779582, "grad_norm": 3.84375, "learning_rate": 2.308754860987659e-06, "loss": 0.78902168, "memory(GiB)": 147.13, "step": 59190, "train_speed(iter/s)": 0.20091 }, { "acc": 0.78038301, "epoch": 1.381180827950247, "grad_norm": 5.34375, "learning_rate": 2.3071629365608793e-06, "loss": 0.7852129, "memory(GiB)": 147.13, "step": 59200, "train_speed(iter/s)": 0.200929 }, { "acc": 0.76663785, "epoch": 1.381414135522536, "grad_norm": 6.8125, "learning_rate": 2.305571396535807e-06, "loss": 0.84909973, "memory(GiB)": 147.13, "step": 59210, "train_speed(iter/s)": 0.200946 }, { "acc": 0.76717291, "epoch": 1.381647443094825, "grad_norm": 5.84375, "learning_rate": 2.303980241139636e-06, "loss": 0.84376945, "memory(GiB)": 147.13, "step": 59220, "train_speed(iter/s)": 0.200964 }, { "acc": 0.75790958, "epoch": 1.3818807506671138, "grad_norm": 9.0, "learning_rate": 2.3023894705995e-06, "loss": 0.85827541, "memory(GiB)": 147.13, "step": 59230, "train_speed(iter/s)": 0.200981 }, { "acc": 0.7841033, "epoch": 1.3821140582394027, "grad_norm": 10.6875, "learning_rate": 2.3007990851424862e-06, "loss": 0.79800854, "memory(GiB)": 147.13, "step": 59240, "train_speed(iter/s)": 0.200999 }, { "acc": 0.78057899, "epoch": 1.3823473658116916, "grad_norm": 4.15625, "learning_rate": 2.2992090849956176e-06, "loss": 0.8046237, "memory(GiB)": 147.13, "step": 59250, "train_speed(iter/s)": 0.201017 }, { "acc": 0.77489777, "epoch": 1.3825806733839805, "grad_norm": 6.46875, "learning_rate": 2.2976194703858666e-06, "loss": 0.80679083, "memory(GiB)": 147.13, "step": 59260, "train_speed(iter/s)": 0.201034 }, { "acc": 0.77184649, "epoch": 1.3828139809562694, "grad_norm": 6.0625, "learning_rate": 2.2960302415401525e-06, "loss": 0.84999313, "memory(GiB)": 147.13, "step": 59270, "train_speed(iter/s)": 0.201052 }, { "acc": 0.79643536, "epoch": 1.3830472885285583, "grad_norm": 6.625, "learning_rate": 2.2944413986853344e-06, "loss": 0.75381327, "memory(GiB)": 147.13, "step": 59280, "train_speed(iter/s)": 0.201069 }, { "acc": 0.77682009, "epoch": 1.3832805961008472, "grad_norm": 4.0, "learning_rate": 2.292852942048222e-06, "loss": 0.78831792, "memory(GiB)": 147.13, "step": 59290, "train_speed(iter/s)": 0.201088 }, { "acc": 0.78009706, "epoch": 1.383513903673136, "grad_norm": 5.375, "learning_rate": 2.2912648718555665e-06, "loss": 0.79728909, "memory(GiB)": 147.13, "step": 59300, "train_speed(iter/s)": 0.201106 }, { "acc": 0.77483168, "epoch": 1.383747211245425, "grad_norm": 5.40625, "learning_rate": 2.2896771883340614e-06, "loss": 0.81179886, "memory(GiB)": 147.13, "step": 59310, "train_speed(iter/s)": 0.201124 }, { "acc": 0.77034178, "epoch": 1.383980518817714, "grad_norm": 4.8125, "learning_rate": 2.2880898917103515e-06, "loss": 0.84435978, "memory(GiB)": 147.13, "step": 59320, "train_speed(iter/s)": 0.201142 }, { "acc": 0.78930626, "epoch": 1.3842138263900028, "grad_norm": 8.9375, "learning_rate": 2.2865029822110222e-06, "loss": 0.73878508, "memory(GiB)": 147.13, "step": 59330, "train_speed(iter/s)": 0.201159 }, { "acc": 0.78097978, "epoch": 1.3844471339622917, "grad_norm": 4.59375, "learning_rate": 2.2849164600626045e-06, "loss": 0.77706594, "memory(GiB)": 147.13, "step": 59340, "train_speed(iter/s)": 0.201178 }, { "acc": 0.77245169, "epoch": 1.3846804415345806, "grad_norm": 5.25, "learning_rate": 2.2833303254915713e-06, "loss": 0.80866556, "memory(GiB)": 147.13, "step": 59350, "train_speed(iter/s)": 0.201195 }, { "acc": 0.80767746, "epoch": 1.3849137491068695, "grad_norm": 5.71875, "learning_rate": 2.2817445787243464e-06, "loss": 0.6845746, "memory(GiB)": 147.13, "step": 59360, "train_speed(iter/s)": 0.201214 }, { "acc": 0.78989286, "epoch": 1.3851470566791584, "grad_norm": 7.21875, "learning_rate": 2.280159219987293e-06, "loss": 0.73846169, "memory(GiB)": 147.13, "step": 59370, "train_speed(iter/s)": 0.201231 }, { "acc": 0.76815262, "epoch": 1.3853803642514473, "grad_norm": 8.6875, "learning_rate": 2.27857424950672e-06, "loss": 0.82266207, "memory(GiB)": 147.13, "step": 59380, "train_speed(iter/s)": 0.201249 }, { "acc": 0.76027946, "epoch": 1.3856136718237362, "grad_norm": 6.03125, "learning_rate": 2.2769896675088833e-06, "loss": 0.87801542, "memory(GiB)": 147.13, "step": 59390, "train_speed(iter/s)": 0.201267 }, { "acc": 0.78674126, "epoch": 1.385846979396025, "grad_norm": 4.96875, "learning_rate": 2.2754054742199787e-06, "loss": 0.78030553, "memory(GiB)": 147.13, "step": 59400, "train_speed(iter/s)": 0.201285 }, { "acc": 0.78899231, "epoch": 1.386080286968314, "grad_norm": 4.625, "learning_rate": 2.273821669866153e-06, "loss": 0.73768158, "memory(GiB)": 147.13, "step": 59410, "train_speed(iter/s)": 0.201301 }, { "acc": 0.77118654, "epoch": 1.3863135945406029, "grad_norm": 6.15625, "learning_rate": 2.2722382546734904e-06, "loss": 0.83121977, "memory(GiB)": 147.13, "step": 59420, "train_speed(iter/s)": 0.201319 }, { "acc": 0.76945696, "epoch": 1.3865469021128918, "grad_norm": 5.78125, "learning_rate": 2.270655228868026e-06, "loss": 0.84560337, "memory(GiB)": 147.13, "step": 59430, "train_speed(iter/s)": 0.201336 }, { "acc": 0.79610276, "epoch": 1.3867802096851807, "grad_norm": 5.25, "learning_rate": 2.2690725926757355e-06, "loss": 0.73223934, "memory(GiB)": 147.13, "step": 59440, "train_speed(iter/s)": 0.201354 }, { "acc": 0.76559458, "epoch": 1.3870135172574694, "grad_norm": 5.875, "learning_rate": 2.267490346322539e-06, "loss": 0.86651058, "memory(GiB)": 147.13, "step": 59450, "train_speed(iter/s)": 0.20137 }, { "acc": 0.80258989, "epoch": 1.3872468248297585, "grad_norm": 10.0, "learning_rate": 2.265908490034301e-06, "loss": 0.71374941, "memory(GiB)": 147.13, "step": 59460, "train_speed(iter/s)": 0.201388 }, { "acc": 0.77058544, "epoch": 1.3874801324020472, "grad_norm": 5.21875, "learning_rate": 2.2643270240368305e-06, "loss": 0.83985233, "memory(GiB)": 147.13, "step": 59470, "train_speed(iter/s)": 0.201406 }, { "acc": 0.77997141, "epoch": 1.3877134399743363, "grad_norm": 5.53125, "learning_rate": 2.2627459485558846e-06, "loss": 0.77843761, "memory(GiB)": 147.13, "step": 59480, "train_speed(iter/s)": 0.201423 }, { "acc": 0.76730309, "epoch": 1.387946747546625, "grad_norm": 7.03125, "learning_rate": 2.2611652638171568e-06, "loss": 0.82932968, "memory(GiB)": 147.13, "step": 59490, "train_speed(iter/s)": 0.20144 }, { "acc": 0.78150969, "epoch": 1.388180055118914, "grad_norm": 5.5, "learning_rate": 2.259584970046294e-06, "loss": 0.7819231, "memory(GiB)": 147.13, "step": 59500, "train_speed(iter/s)": 0.201457 }, { "epoch": 1.388180055118914, "eval_acc": 0.7445906080852659, "eval_loss": 0.8047671914100647, "eval_runtime": 1269.924, "eval_samples_per_second": 28.341, "eval_steps_per_second": 14.171, "step": 59500 }, { "acc": 0.74836121, "epoch": 1.3884133626912027, "grad_norm": 6.53125, "learning_rate": 2.2580050674688815e-06, "loss": 0.9253665, "memory(GiB)": 147.13, "step": 59510, "train_speed(iter/s)": 0.200601 }, { "acc": 0.77755685, "epoch": 1.3886466702634919, "grad_norm": 5.25, "learning_rate": 2.2564255563104465e-06, "loss": 0.79394531, "memory(GiB)": 147.13, "step": 59520, "train_speed(iter/s)": 0.200617 }, { "acc": 0.77132187, "epoch": 1.3888799778357805, "grad_norm": 5.90625, "learning_rate": 2.254846436796468e-06, "loss": 0.82167749, "memory(GiB)": 147.13, "step": 59530, "train_speed(iter/s)": 0.200635 }, { "acc": 0.76772861, "epoch": 1.3891132854080694, "grad_norm": 5.75, "learning_rate": 2.2532677091523615e-06, "loss": 0.84193869, "memory(GiB)": 147.13, "step": 59540, "train_speed(iter/s)": 0.200654 }, { "acc": 0.79113283, "epoch": 1.3893465929803583, "grad_norm": 6.0, "learning_rate": 2.2516893736034935e-06, "loss": 0.74261956, "memory(GiB)": 147.13, "step": 59550, "train_speed(iter/s)": 0.20067 }, { "acc": 0.76318316, "epoch": 1.3895799005526472, "grad_norm": 5.46875, "learning_rate": 2.250111430375169e-06, "loss": 0.8669055, "memory(GiB)": 147.13, "step": 59560, "train_speed(iter/s)": 0.200688 }, { "acc": 0.780229, "epoch": 1.3898132081249361, "grad_norm": 6.5, "learning_rate": 2.248533879692639e-06, "loss": 0.77526116, "memory(GiB)": 147.13, "step": 59570, "train_speed(iter/s)": 0.200705 }, { "acc": 0.7953208, "epoch": 1.390046515697225, "grad_norm": 4.5625, "learning_rate": 2.246956721781097e-06, "loss": 0.74956465, "memory(GiB)": 147.13, "step": 59580, "train_speed(iter/s)": 0.200722 }, { "acc": 0.79553127, "epoch": 1.390279823269514, "grad_norm": 4.78125, "learning_rate": 2.245379956865684e-06, "loss": 0.74106674, "memory(GiB)": 147.13, "step": 59590, "train_speed(iter/s)": 0.200741 }, { "acc": 0.74994259, "epoch": 1.3905131308418028, "grad_norm": 5.28125, "learning_rate": 2.243803585171483e-06, "loss": 0.89359455, "memory(GiB)": 147.13, "step": 59600, "train_speed(iter/s)": 0.200758 }, { "acc": 0.76904812, "epoch": 1.3907464384140917, "grad_norm": 5.53125, "learning_rate": 2.2422276069235174e-06, "loss": 0.82504959, "memory(GiB)": 147.13, "step": 59610, "train_speed(iter/s)": 0.200776 }, { "acc": 0.77541676, "epoch": 1.3909797459863806, "grad_norm": 7.625, "learning_rate": 2.240652022346761e-06, "loss": 0.80968227, "memory(GiB)": 147.13, "step": 59620, "train_speed(iter/s)": 0.200793 }, { "acc": 0.7667109, "epoch": 1.3912130535586695, "grad_norm": 5.34375, "learning_rate": 2.2390768316661256e-06, "loss": 0.82808847, "memory(GiB)": 147.13, "step": 59630, "train_speed(iter/s)": 0.20081 }, { "acc": 0.79110146, "epoch": 1.3914463611309584, "grad_norm": 4.71875, "learning_rate": 2.237502035106472e-06, "loss": 0.75989141, "memory(GiB)": 147.13, "step": 59640, "train_speed(iter/s)": 0.200828 }, { "acc": 0.76692033, "epoch": 1.3916796687032473, "grad_norm": 16.375, "learning_rate": 2.2359276328926007e-06, "loss": 0.83675632, "memory(GiB)": 147.13, "step": 59650, "train_speed(iter/s)": 0.200846 }, { "acc": 0.77610579, "epoch": 1.3919129762755362, "grad_norm": 5.25, "learning_rate": 2.2343536252492542e-06, "loss": 0.80217104, "memory(GiB)": 147.13, "step": 59660, "train_speed(iter/s)": 0.200863 }, { "acc": 0.77709975, "epoch": 1.3921462838478251, "grad_norm": 5.96875, "learning_rate": 2.2327800124011285e-06, "loss": 0.81001654, "memory(GiB)": 147.13, "step": 59670, "train_speed(iter/s)": 0.200881 }, { "acc": 0.77643595, "epoch": 1.392379591420114, "grad_norm": 5.4375, "learning_rate": 2.231206794572848e-06, "loss": 0.80721416, "memory(GiB)": 147.13, "step": 59680, "train_speed(iter/s)": 0.200898 }, { "acc": 0.77637901, "epoch": 1.392612898992403, "grad_norm": 5.34375, "learning_rate": 2.229633971988996e-06, "loss": 0.80078964, "memory(GiB)": 147.13, "step": 59690, "train_speed(iter/s)": 0.200915 }, { "acc": 0.78074894, "epoch": 1.3928462065646918, "grad_norm": 8.0625, "learning_rate": 2.2280615448740873e-06, "loss": 0.79391489, "memory(GiB)": 147.13, "step": 59700, "train_speed(iter/s)": 0.200933 }, { "acc": 0.78368201, "epoch": 1.3930795141369807, "grad_norm": 9.0625, "learning_rate": 2.2264895134525898e-06, "loss": 0.77110548, "memory(GiB)": 147.13, "step": 59710, "train_speed(iter/s)": 0.200951 }, { "acc": 0.79938531, "epoch": 1.3933128217092696, "grad_norm": 4.40625, "learning_rate": 2.2249178779489065e-06, "loss": 0.71430826, "memory(GiB)": 147.13, "step": 59720, "train_speed(iter/s)": 0.200968 }, { "acc": 0.78940001, "epoch": 1.3935461292815585, "grad_norm": 4.625, "learning_rate": 2.223346638587392e-06, "loss": 0.76045151, "memory(GiB)": 147.13, "step": 59730, "train_speed(iter/s)": 0.200984 }, { "acc": 0.79823899, "epoch": 1.3937794368538474, "grad_norm": 6.46875, "learning_rate": 2.2217757955923386e-06, "loss": 0.72001886, "memory(GiB)": 147.13, "step": 59740, "train_speed(iter/s)": 0.201001 }, { "acc": 0.79044533, "epoch": 1.3940127444261363, "grad_norm": 4.5, "learning_rate": 2.220205349187981e-06, "loss": 0.74306707, "memory(GiB)": 147.13, "step": 59750, "train_speed(iter/s)": 0.201018 }, { "acc": 0.78812528, "epoch": 1.3942460519984252, "grad_norm": 8.375, "learning_rate": 2.218635299598504e-06, "loss": 0.76315918, "memory(GiB)": 147.13, "step": 59760, "train_speed(iter/s)": 0.201036 }, { "acc": 0.78087091, "epoch": 1.3944793595707141, "grad_norm": 5.9375, "learning_rate": 2.2170656470480284e-06, "loss": 0.79584026, "memory(GiB)": 147.13, "step": 59770, "train_speed(iter/s)": 0.201053 }, { "acc": 0.77213335, "epoch": 1.394712667143003, "grad_norm": 5.96875, "learning_rate": 2.215496391760625e-06, "loss": 0.83666573, "memory(GiB)": 147.13, "step": 59780, "train_speed(iter/s)": 0.201071 }, { "acc": 0.79270315, "epoch": 1.394945974715292, "grad_norm": 5.65625, "learning_rate": 2.2139275339603023e-06, "loss": 0.7484087, "memory(GiB)": 147.13, "step": 59790, "train_speed(iter/s)": 0.201088 }, { "acc": 0.80184422, "epoch": 1.3951792822875808, "grad_norm": 4.8125, "learning_rate": 2.2123590738710153e-06, "loss": 0.71304121, "memory(GiB)": 147.13, "step": 59800, "train_speed(iter/s)": 0.201105 }, { "acc": 0.793783, "epoch": 1.3954125898598697, "grad_norm": 5.03125, "learning_rate": 2.2107910117166608e-06, "loss": 0.73716478, "memory(GiB)": 147.13, "step": 59810, "train_speed(iter/s)": 0.201123 }, { "acc": 0.77772017, "epoch": 1.3956458974321586, "grad_norm": 4.75, "learning_rate": 2.2092233477210767e-06, "loss": 0.78334846, "memory(GiB)": 147.13, "step": 59820, "train_speed(iter/s)": 0.201141 }, { "acc": 0.78544092, "epoch": 1.3958792050044475, "grad_norm": 5.9375, "learning_rate": 2.2076560821080515e-06, "loss": 0.77470407, "memory(GiB)": 147.13, "step": 59830, "train_speed(iter/s)": 0.201158 }, { "acc": 0.77739944, "epoch": 1.3961125125767362, "grad_norm": 4.875, "learning_rate": 2.2060892151013067e-06, "loss": 0.807547, "memory(GiB)": 147.13, "step": 59840, "train_speed(iter/s)": 0.201175 }, { "acc": 0.7652626, "epoch": 1.3963458201490253, "grad_norm": 4.75, "learning_rate": 2.2045227469245178e-06, "loss": 0.83140392, "memory(GiB)": 147.13, "step": 59850, "train_speed(iter/s)": 0.201194 }, { "acc": 0.78730431, "epoch": 1.396579127721314, "grad_norm": 4.9375, "learning_rate": 2.202956677801292e-06, "loss": 0.74223843, "memory(GiB)": 147.13, "step": 59860, "train_speed(iter/s)": 0.201211 }, { "acc": 0.77190733, "epoch": 1.396812435293603, "grad_norm": 5.8125, "learning_rate": 2.2013910079551905e-06, "loss": 0.82516975, "memory(GiB)": 147.13, "step": 59870, "train_speed(iter/s)": 0.201227 }, { "acc": 0.77773724, "epoch": 1.3970457428658918, "grad_norm": 6.1875, "learning_rate": 2.199825737609709e-06, "loss": 0.78944416, "memory(GiB)": 147.13, "step": 59880, "train_speed(iter/s)": 0.201245 }, { "acc": 0.77316818, "epoch": 1.397279050438181, "grad_norm": 6.125, "learning_rate": 2.198260866988288e-06, "loss": 0.81869354, "memory(GiB)": 147.13, "step": 59890, "train_speed(iter/s)": 0.201262 }, { "acc": 0.79717088, "epoch": 1.3975123580104696, "grad_norm": 4.75, "learning_rate": 2.1966963963143184e-06, "loss": 0.72420158, "memory(GiB)": 147.13, "step": 59900, "train_speed(iter/s)": 0.201279 }, { "acc": 0.7967824, "epoch": 1.3977456655827587, "grad_norm": 4.21875, "learning_rate": 2.1951323258111194e-06, "loss": 0.71405821, "memory(GiB)": 147.13, "step": 59910, "train_speed(iter/s)": 0.201295 }, { "acc": 0.77692866, "epoch": 1.3979789731550474, "grad_norm": 5.125, "learning_rate": 2.193568655701969e-06, "loss": 0.80145493, "memory(GiB)": 147.13, "step": 59920, "train_speed(iter/s)": 0.201313 }, { "acc": 0.77152061, "epoch": 1.3982122807273363, "grad_norm": 4.125, "learning_rate": 2.1920053862100754e-06, "loss": 0.82826338, "memory(GiB)": 147.13, "step": 59930, "train_speed(iter/s)": 0.201331 }, { "acc": 0.78626838, "epoch": 1.3984455882996252, "grad_norm": 22.0, "learning_rate": 2.190442517558599e-06, "loss": 0.76881523, "memory(GiB)": 147.13, "step": 59940, "train_speed(iter/s)": 0.201348 }, { "acc": 0.77308369, "epoch": 1.398678895871914, "grad_norm": 5.4375, "learning_rate": 2.188880049970637e-06, "loss": 0.82151947, "memory(GiB)": 147.13, "step": 59950, "train_speed(iter/s)": 0.201365 }, { "acc": 0.76596837, "epoch": 1.398912203444203, "grad_norm": 5.5625, "learning_rate": 2.18731798366923e-06, "loss": 0.8408473, "memory(GiB)": 147.13, "step": 59960, "train_speed(iter/s)": 0.201384 }, { "acc": 0.7691186, "epoch": 1.3991455110164919, "grad_norm": 6.09375, "learning_rate": 2.1857563188773644e-06, "loss": 0.84743719, "memory(GiB)": 147.13, "step": 59970, "train_speed(iter/s)": 0.201402 }, { "acc": 0.77092209, "epoch": 1.3993788185887808, "grad_norm": 8.4375, "learning_rate": 2.184195055817966e-06, "loss": 0.84977627, "memory(GiB)": 147.13, "step": 59980, "train_speed(iter/s)": 0.201418 }, { "acc": 0.77478681, "epoch": 1.3996121261610697, "grad_norm": 6.90625, "learning_rate": 2.1826341947139067e-06, "loss": 0.80044651, "memory(GiB)": 147.13, "step": 59990, "train_speed(iter/s)": 0.201435 }, { "acc": 0.7726613, "epoch": 1.3998454337333586, "grad_norm": 3.703125, "learning_rate": 2.181073735787998e-06, "loss": 0.83067513, "memory(GiB)": 147.13, "step": 60000, "train_speed(iter/s)": 0.201452 }, { "epoch": 1.3998454337333586, "eval_acc": 0.7445992558738102, "eval_loss": 0.8046594858169556, "eval_runtime": 1271.0218, "eval_samples_per_second": 28.317, "eval_steps_per_second": 14.159, "step": 60000 }, { "acc": 0.78201704, "epoch": 1.4000787413056475, "grad_norm": 4.84375, "learning_rate": 2.179513679262992e-06, "loss": 0.78135967, "memory(GiB)": 147.13, "step": 60010, "train_speed(iter/s)": 0.200603 }, { "acc": 0.7830049, "epoch": 1.4003120488779364, "grad_norm": 5.03125, "learning_rate": 2.1779540253615917e-06, "loss": 0.78097987, "memory(GiB)": 147.13, "step": 60020, "train_speed(iter/s)": 0.200621 }, { "acc": 0.80212574, "epoch": 1.4005453564502253, "grad_norm": 5.46875, "learning_rate": 2.176394774306434e-06, "loss": 0.71014204, "memory(GiB)": 147.13, "step": 60030, "train_speed(iter/s)": 0.200636 }, { "acc": 0.79081697, "epoch": 1.4007786640225142, "grad_norm": 6.875, "learning_rate": 2.174835926320102e-06, "loss": 0.75322866, "memory(GiB)": 147.13, "step": 60040, "train_speed(iter/s)": 0.200655 }, { "acc": 0.77095056, "epoch": 1.401011971594803, "grad_norm": 5.375, "learning_rate": 2.173277481625119e-06, "loss": 0.83664217, "memory(GiB)": 147.13, "step": 60050, "train_speed(iter/s)": 0.200671 }, { "acc": 0.76814718, "epoch": 1.401245279167092, "grad_norm": 5.375, "learning_rate": 2.1717194404439563e-06, "loss": 0.84824924, "memory(GiB)": 147.13, "step": 60060, "train_speed(iter/s)": 0.200687 }, { "acc": 0.77537007, "epoch": 1.4014785867393809, "grad_norm": 4.8125, "learning_rate": 2.17016180299902e-06, "loss": 0.79466648, "memory(GiB)": 147.13, "step": 60070, "train_speed(iter/s)": 0.200704 }, { "acc": 0.78756018, "epoch": 1.4017118943116698, "grad_norm": 4.5, "learning_rate": 2.168604569512666e-06, "loss": 0.75703373, "memory(GiB)": 147.13, "step": 60080, "train_speed(iter/s)": 0.200721 }, { "acc": 0.76781497, "epoch": 1.4019452018839587, "grad_norm": 7.03125, "learning_rate": 2.167047740207187e-06, "loss": 0.84280319, "memory(GiB)": 147.13, "step": 60090, "train_speed(iter/s)": 0.200739 }, { "acc": 0.76985674, "epoch": 1.4021785094562476, "grad_norm": 5.3125, "learning_rate": 2.1654913153048186e-06, "loss": 0.8241436, "memory(GiB)": 147.13, "step": 60100, "train_speed(iter/s)": 0.200756 }, { "acc": 0.79031215, "epoch": 1.4024118170285365, "grad_norm": 3.828125, "learning_rate": 2.1639352950277433e-06, "loss": 0.74541531, "memory(GiB)": 147.13, "step": 60110, "train_speed(iter/s)": 0.200775 }, { "acc": 0.7930346, "epoch": 1.4026451246008254, "grad_norm": 7.75, "learning_rate": 2.162379679598079e-06, "loss": 0.73493719, "memory(GiB)": 147.13, "step": 60120, "train_speed(iter/s)": 0.20079 }, { "acc": 0.77408686, "epoch": 1.4028784321731143, "grad_norm": 4.75, "learning_rate": 2.1608244692378946e-06, "loss": 0.83344536, "memory(GiB)": 147.13, "step": 60130, "train_speed(iter/s)": 0.200806 }, { "acc": 0.78932433, "epoch": 1.4031117397454032, "grad_norm": 5.84375, "learning_rate": 2.1592696641691884e-06, "loss": 0.744315, "memory(GiB)": 147.13, "step": 60140, "train_speed(iter/s)": 0.200824 }, { "acc": 0.78552046, "epoch": 1.403345047317692, "grad_norm": 5.40625, "learning_rate": 2.157715264613915e-06, "loss": 0.78410473, "memory(GiB)": 147.13, "step": 60150, "train_speed(iter/s)": 0.200841 }, { "acc": 0.76167541, "epoch": 1.403578354889981, "grad_norm": 6.46875, "learning_rate": 2.156161270793961e-06, "loss": 0.8703126, "memory(GiB)": 147.13, "step": 60160, "train_speed(iter/s)": 0.200858 }, { "acc": 0.78156233, "epoch": 1.4038116624622698, "grad_norm": 4.875, "learning_rate": 2.1546076829311584e-06, "loss": 0.7798255, "memory(GiB)": 147.13, "step": 60170, "train_speed(iter/s)": 0.200876 }, { "acc": 0.79834514, "epoch": 1.4040449700345587, "grad_norm": 5.5625, "learning_rate": 2.153054501247284e-06, "loss": 0.73483548, "memory(GiB)": 147.13, "step": 60180, "train_speed(iter/s)": 0.200893 }, { "acc": 0.77490888, "epoch": 1.4042782776068476, "grad_norm": 5.4375, "learning_rate": 2.151501725964051e-06, "loss": 0.79429326, "memory(GiB)": 147.13, "step": 60190, "train_speed(iter/s)": 0.200911 }, { "acc": 0.79103308, "epoch": 1.4045115851791365, "grad_norm": 3.796875, "learning_rate": 2.14994935730312e-06, "loss": 0.74287324, "memory(GiB)": 147.13, "step": 60200, "train_speed(iter/s)": 0.200927 }, { "acc": 0.76605501, "epoch": 1.4047448927514254, "grad_norm": 8.1875, "learning_rate": 2.1483973954860894e-06, "loss": 0.86529417, "memory(GiB)": 147.13, "step": 60210, "train_speed(iter/s)": 0.200943 }, { "acc": 0.7755722, "epoch": 1.4049782003237143, "grad_norm": 4.9375, "learning_rate": 2.146845840734504e-06, "loss": 0.79309669, "memory(GiB)": 147.13, "step": 60220, "train_speed(iter/s)": 0.200961 }, { "acc": 0.7734189, "epoch": 1.405211507896003, "grad_norm": 8.75, "learning_rate": 2.1452946932698454e-06, "loss": 0.84152288, "memory(GiB)": 147.13, "step": 60230, "train_speed(iter/s)": 0.200978 }, { "acc": 0.78702106, "epoch": 1.4054448154682921, "grad_norm": 4.5625, "learning_rate": 2.1437439533135386e-06, "loss": 0.77411451, "memory(GiB)": 147.13, "step": 60240, "train_speed(iter/s)": 0.200993 }, { "acc": 0.78498821, "epoch": 1.4056781230405808, "grad_norm": 4.96875, "learning_rate": 2.142193621086956e-06, "loss": 0.75319977, "memory(GiB)": 147.13, "step": 60250, "train_speed(iter/s)": 0.201011 }, { "acc": 0.7821517, "epoch": 1.40591143061287, "grad_norm": 4.46875, "learning_rate": 2.140643696811401e-06, "loss": 0.80938702, "memory(GiB)": 147.13, "step": 60260, "train_speed(iter/s)": 0.201027 }, { "acc": 0.77948647, "epoch": 1.4061447381851586, "grad_norm": 7.25, "learning_rate": 2.1390941807081285e-06, "loss": 0.79588933, "memory(GiB)": 147.13, "step": 60270, "train_speed(iter/s)": 0.201044 }, { "acc": 0.79571724, "epoch": 1.4063780457574477, "grad_norm": 7.15625, "learning_rate": 2.1375450729983294e-06, "loss": 0.72755728, "memory(GiB)": 147.13, "step": 60280, "train_speed(iter/s)": 0.201061 }, { "acc": 0.79610963, "epoch": 1.4066113533297364, "grad_norm": 4.90625, "learning_rate": 2.1359963739031407e-06, "loss": 0.728936, "memory(GiB)": 147.13, "step": 60290, "train_speed(iter/s)": 0.201079 }, { "acc": 0.7886919, "epoch": 1.4068446609020255, "grad_norm": 5.21875, "learning_rate": 2.134448083643638e-06, "loss": 0.76339221, "memory(GiB)": 147.13, "step": 60300, "train_speed(iter/s)": 0.201095 }, { "acc": 0.76663504, "epoch": 1.4070779684743142, "grad_norm": 5.59375, "learning_rate": 2.1329002024408375e-06, "loss": 0.84437771, "memory(GiB)": 147.13, "step": 60310, "train_speed(iter/s)": 0.201113 }, { "acc": 0.77732272, "epoch": 1.407311276046603, "grad_norm": 5.125, "learning_rate": 2.1313527305157015e-06, "loss": 0.78522806, "memory(GiB)": 147.13, "step": 60320, "train_speed(iter/s)": 0.201128 }, { "acc": 0.78382611, "epoch": 1.407544583618892, "grad_norm": 4.0625, "learning_rate": 2.1298056680891288e-06, "loss": 0.78302422, "memory(GiB)": 147.13, "step": 60330, "train_speed(iter/s)": 0.201145 }, { "acc": 0.77463741, "epoch": 1.407777891191181, "grad_norm": 6.0, "learning_rate": 2.1282590153819645e-06, "loss": 0.80812149, "memory(GiB)": 147.13, "step": 60340, "train_speed(iter/s)": 0.201161 }, { "acc": 0.79031725, "epoch": 1.4080111987634698, "grad_norm": 4.03125, "learning_rate": 2.1267127726149896e-06, "loss": 0.76469936, "memory(GiB)": 147.13, "step": 60350, "train_speed(iter/s)": 0.201179 }, { "acc": 0.76453934, "epoch": 1.4082445063357587, "grad_norm": 5.78125, "learning_rate": 2.1251669400089353e-06, "loss": 0.84970903, "memory(GiB)": 147.13, "step": 60360, "train_speed(iter/s)": 0.201195 }, { "acc": 0.79993305, "epoch": 1.4084778139080476, "grad_norm": 4.84375, "learning_rate": 2.1236215177844617e-06, "loss": 0.70226107, "memory(GiB)": 147.13, "step": 60370, "train_speed(iter/s)": 0.201212 }, { "acc": 0.80778503, "epoch": 1.4087111214803365, "grad_norm": 7.0625, "learning_rate": 2.1220765061621828e-06, "loss": 0.67411156, "memory(GiB)": 147.13, "step": 60380, "train_speed(iter/s)": 0.201229 }, { "acc": 0.77227788, "epoch": 1.4089444290526254, "grad_norm": 4.15625, "learning_rate": 2.120531905362646e-06, "loss": 0.81891384, "memory(GiB)": 147.13, "step": 60390, "train_speed(iter/s)": 0.201246 }, { "acc": 0.79898548, "epoch": 1.4091777366249143, "grad_norm": 4.8125, "learning_rate": 2.118987715606342e-06, "loss": 0.72313986, "memory(GiB)": 147.13, "step": 60400, "train_speed(iter/s)": 0.201261 }, { "acc": 0.78145227, "epoch": 1.4094110441972032, "grad_norm": 5.71875, "learning_rate": 2.1174439371137064e-06, "loss": 0.78863721, "memory(GiB)": 147.13, "step": 60410, "train_speed(iter/s)": 0.201278 }, { "acc": 0.78333859, "epoch": 1.409644351769492, "grad_norm": 5.875, "learning_rate": 2.1159005701051093e-06, "loss": 0.77123041, "memory(GiB)": 147.13, "step": 60420, "train_speed(iter/s)": 0.201296 }, { "acc": 0.79827003, "epoch": 1.409877659341781, "grad_norm": 5.375, "learning_rate": 2.11435761480087e-06, "loss": 0.71006422, "memory(GiB)": 147.13, "step": 60430, "train_speed(iter/s)": 0.201312 }, { "acc": 0.78789725, "epoch": 1.41011096691407, "grad_norm": 4.40625, "learning_rate": 2.112815071421243e-06, "loss": 0.7574213, "memory(GiB)": 147.13, "step": 60440, "train_speed(iter/s)": 0.201329 }, { "acc": 0.77921619, "epoch": 1.4103442744863588, "grad_norm": 4.3125, "learning_rate": 2.111272940186424e-06, "loss": 0.78693404, "memory(GiB)": 147.13, "step": 60450, "train_speed(iter/s)": 0.201347 }, { "acc": 0.76266708, "epoch": 1.4105775820586477, "grad_norm": 5.84375, "learning_rate": 2.109731221316555e-06, "loss": 0.87478523, "memory(GiB)": 147.13, "step": 60460, "train_speed(iter/s)": 0.201363 }, { "acc": 0.76366029, "epoch": 1.4108108896309366, "grad_norm": 7.03125, "learning_rate": 2.108189915031715e-06, "loss": 0.84197912, "memory(GiB)": 147.13, "step": 60470, "train_speed(iter/s)": 0.201381 }, { "acc": 0.78174658, "epoch": 1.4110441972032255, "grad_norm": 5.65625, "learning_rate": 2.1066490215519243e-06, "loss": 0.78470864, "memory(GiB)": 147.13, "step": 60480, "train_speed(iter/s)": 0.201398 }, { "acc": 0.80206575, "epoch": 1.4112775047755144, "grad_norm": 6.53125, "learning_rate": 2.105108541097143e-06, "loss": 0.71396837, "memory(GiB)": 147.13, "step": 60490, "train_speed(iter/s)": 0.201416 }, { "acc": 0.77739525, "epoch": 1.4115108123478033, "grad_norm": 6.40625, "learning_rate": 2.1035684738872792e-06, "loss": 0.78567958, "memory(GiB)": 147.13, "step": 60500, "train_speed(iter/s)": 0.201433 }, { "epoch": 1.4115108123478033, "eval_acc": 0.7446263202490692, "eval_loss": 0.8046724200248718, "eval_runtime": 1269.8878, "eval_samples_per_second": 28.342, "eval_steps_per_second": 14.171, "step": 60500 }, { "acc": 0.78144083, "epoch": 1.4117441199200922, "grad_norm": 4.96875, "learning_rate": 2.1020288201421722e-06, "loss": 0.7848321, "memory(GiB)": 147.13, "step": 60510, "train_speed(iter/s)": 0.200589 }, { "acc": 0.78034506, "epoch": 1.411977427492381, "grad_norm": 4.96875, "learning_rate": 2.100489580081611e-06, "loss": 0.79630418, "memory(GiB)": 147.13, "step": 60520, "train_speed(iter/s)": 0.200608 }, { "acc": 0.76285105, "epoch": 1.41221073506467, "grad_norm": 6.125, "learning_rate": 2.09895075392532e-06, "loss": 0.84258804, "memory(GiB)": 147.13, "step": 60530, "train_speed(iter/s)": 0.200625 }, { "acc": 0.7814455, "epoch": 1.4124440426369589, "grad_norm": 5.15625, "learning_rate": 2.0974123418929644e-06, "loss": 0.78600588, "memory(GiB)": 147.13, "step": 60540, "train_speed(iter/s)": 0.200644 }, { "acc": 0.79303546, "epoch": 1.4126773502092478, "grad_norm": 5.15625, "learning_rate": 2.095874344204155e-06, "loss": 0.73297596, "memory(GiB)": 147.13, "step": 60550, "train_speed(iter/s)": 0.200662 }, { "acc": 0.79392943, "epoch": 1.4129106577815367, "grad_norm": 6.0, "learning_rate": 2.094336761078438e-06, "loss": 0.73787174, "memory(GiB)": 147.13, "step": 60560, "train_speed(iter/s)": 0.200679 }, { "acc": 0.78089685, "epoch": 1.4131439653538256, "grad_norm": 5.9375, "learning_rate": 2.0927995927353062e-06, "loss": 0.79184189, "memory(GiB)": 147.13, "step": 60570, "train_speed(iter/s)": 0.200696 }, { "acc": 0.78633385, "epoch": 1.4133772729261145, "grad_norm": 5.0, "learning_rate": 2.091262839394188e-06, "loss": 0.76935081, "memory(GiB)": 147.13, "step": 60580, "train_speed(iter/s)": 0.200712 }, { "acc": 0.7927001, "epoch": 1.4136105804984034, "grad_norm": 6.125, "learning_rate": 2.0897265012744543e-06, "loss": 0.75596743, "memory(GiB)": 147.13, "step": 60590, "train_speed(iter/s)": 0.200729 }, { "acc": 0.77544589, "epoch": 1.413843888070692, "grad_norm": 5.15625, "learning_rate": 2.0881905785954172e-06, "loss": 0.79783726, "memory(GiB)": 147.13, "step": 60600, "train_speed(iter/s)": 0.200747 }, { "acc": 0.76404762, "epoch": 1.4140771956429812, "grad_norm": 6.25, "learning_rate": 2.086655071576327e-06, "loss": 0.83482218, "memory(GiB)": 147.13, "step": 60610, "train_speed(iter/s)": 0.200763 }, { "acc": 0.77434025, "epoch": 1.4143105032152699, "grad_norm": 5.15625, "learning_rate": 2.085119980436381e-06, "loss": 0.79602251, "memory(GiB)": 147.13, "step": 60620, "train_speed(iter/s)": 0.200781 }, { "acc": 0.78777227, "epoch": 1.414543810787559, "grad_norm": 4.03125, "learning_rate": 2.083585305394709e-06, "loss": 0.75320654, "memory(GiB)": 147.13, "step": 60630, "train_speed(iter/s)": 0.200798 }, { "acc": 0.77788224, "epoch": 1.4147771183598477, "grad_norm": 6.0, "learning_rate": 2.0820510466703898e-06, "loss": 0.77969913, "memory(GiB)": 147.13, "step": 60640, "train_speed(iter/s)": 0.200815 }, { "acc": 0.79586306, "epoch": 1.4150104259321368, "grad_norm": 5.21875, "learning_rate": 2.080517204482434e-06, "loss": 0.73031406, "memory(GiB)": 147.13, "step": 60650, "train_speed(iter/s)": 0.200832 }, { "acc": 0.79818316, "epoch": 1.4152437335044254, "grad_norm": 4.09375, "learning_rate": 2.078983779049801e-06, "loss": 0.73085876, "memory(GiB)": 147.13, "step": 60660, "train_speed(iter/s)": 0.200849 }, { "acc": 0.78288202, "epoch": 1.4154770410767146, "grad_norm": 5.40625, "learning_rate": 2.0774507705913844e-06, "loss": 0.77484465, "memory(GiB)": 147.13, "step": 60670, "train_speed(iter/s)": 0.200865 }, { "acc": 0.77833548, "epoch": 1.4157103486490032, "grad_norm": 5.40625, "learning_rate": 2.07591817932602e-06, "loss": 0.78327904, "memory(GiB)": 147.13, "step": 60680, "train_speed(iter/s)": 0.200881 }, { "acc": 0.78331342, "epoch": 1.4159436562212924, "grad_norm": 10.5625, "learning_rate": 2.074386005472488e-06, "loss": 0.77935891, "memory(GiB)": 147.13, "step": 60690, "train_speed(iter/s)": 0.200898 }, { "acc": 0.77536631, "epoch": 1.416176963793581, "grad_norm": 4.96875, "learning_rate": 2.072854249249503e-06, "loss": 0.79027624, "memory(GiB)": 147.13, "step": 60700, "train_speed(iter/s)": 0.200916 }, { "acc": 0.77584419, "epoch": 1.41641027136587, "grad_norm": 4.34375, "learning_rate": 2.0713229108757244e-06, "loss": 0.80542765, "memory(GiB)": 147.13, "step": 60710, "train_speed(iter/s)": 0.200932 }, { "acc": 0.78596239, "epoch": 1.4166435789381588, "grad_norm": 4.8125, "learning_rate": 2.0697919905697474e-06, "loss": 0.77891278, "memory(GiB)": 147.13, "step": 60720, "train_speed(iter/s)": 0.200949 }, { "acc": 0.78150387, "epoch": 1.4168768865104477, "grad_norm": 3.5, "learning_rate": 2.0682614885501147e-06, "loss": 0.77592754, "memory(GiB)": 147.13, "step": 60730, "train_speed(iter/s)": 0.200964 }, { "acc": 0.7803638, "epoch": 1.4171101940827366, "grad_norm": 6.21875, "learning_rate": 2.066731405035302e-06, "loss": 0.78199749, "memory(GiB)": 147.13, "step": 60740, "train_speed(iter/s)": 0.20098 }, { "acc": 0.79469795, "epoch": 1.4173435016550255, "grad_norm": 5.28125, "learning_rate": 2.065201740243728e-06, "loss": 0.72167587, "memory(GiB)": 147.13, "step": 60750, "train_speed(iter/s)": 0.200997 }, { "acc": 0.76823006, "epoch": 1.4175768092273144, "grad_norm": 5.53125, "learning_rate": 2.063672494393755e-06, "loss": 0.84083357, "memory(GiB)": 147.13, "step": 60760, "train_speed(iter/s)": 0.201015 }, { "acc": 0.79289522, "epoch": 1.4178101167996033, "grad_norm": 6.125, "learning_rate": 2.0621436677036775e-06, "loss": 0.74563189, "memory(GiB)": 147.13, "step": 60770, "train_speed(iter/s)": 0.201033 }, { "acc": 0.79262247, "epoch": 1.4180434243718922, "grad_norm": 5.0625, "learning_rate": 2.0606152603917406e-06, "loss": 0.74184866, "memory(GiB)": 147.13, "step": 60780, "train_speed(iter/s)": 0.20105 }, { "acc": 0.78077393, "epoch": 1.4182767319441811, "grad_norm": 7.8125, "learning_rate": 2.0590872726761215e-06, "loss": 0.77461243, "memory(GiB)": 147.13, "step": 60790, "train_speed(iter/s)": 0.201068 }, { "acc": 0.78589611, "epoch": 1.41851003951647, "grad_norm": 5.40625, "learning_rate": 2.057559704774938e-06, "loss": 0.78779116, "memory(GiB)": 147.13, "step": 60800, "train_speed(iter/s)": 0.201085 }, { "acc": 0.77445693, "epoch": 1.418743347088759, "grad_norm": 6.875, "learning_rate": 2.0560325569062535e-06, "loss": 0.80334835, "memory(GiB)": 147.13, "step": 60810, "train_speed(iter/s)": 0.201103 }, { "acc": 0.77701745, "epoch": 1.4189766546610478, "grad_norm": 5.21875, "learning_rate": 2.054505829288066e-06, "loss": 0.81892672, "memory(GiB)": 147.13, "step": 60820, "train_speed(iter/s)": 0.201121 }, { "acc": 0.79528999, "epoch": 1.4192099622333367, "grad_norm": 5.1875, "learning_rate": 2.0529795221383164e-06, "loss": 0.72682176, "memory(GiB)": 147.13, "step": 60830, "train_speed(iter/s)": 0.201136 }, { "acc": 0.75750227, "epoch": 1.4194432698056256, "grad_norm": 5.96875, "learning_rate": 2.0514536356748814e-06, "loss": 0.86790705, "memory(GiB)": 147.13, "step": 60840, "train_speed(iter/s)": 0.201154 }, { "acc": 0.76532421, "epoch": 1.4196765773779145, "grad_norm": 4.40625, "learning_rate": 2.0499281701155852e-06, "loss": 0.84386234, "memory(GiB)": 147.13, "step": 60850, "train_speed(iter/s)": 0.201171 }, { "acc": 0.7653913, "epoch": 1.4199098849502034, "grad_norm": 9.375, "learning_rate": 2.0484031256781845e-06, "loss": 0.84208984, "memory(GiB)": 147.13, "step": 60860, "train_speed(iter/s)": 0.201188 }, { "acc": 0.7715621, "epoch": 1.4201431925224923, "grad_norm": 5.65625, "learning_rate": 2.046878502580382e-06, "loss": 0.8072238, "memory(GiB)": 147.13, "step": 60870, "train_speed(iter/s)": 0.201206 }, { "acc": 0.77248592, "epoch": 1.4203765000947812, "grad_norm": 4.625, "learning_rate": 2.045354301039815e-06, "loss": 0.81728535, "memory(GiB)": 147.13, "step": 60880, "train_speed(iter/s)": 0.201223 }, { "acc": 0.79007959, "epoch": 1.4206098076670701, "grad_norm": 7.90625, "learning_rate": 2.043830521274061e-06, "loss": 0.74873753, "memory(GiB)": 147.13, "step": 60890, "train_speed(iter/s)": 0.20124 }, { "acc": 0.7762063, "epoch": 1.420843115239359, "grad_norm": 5.0625, "learning_rate": 2.0423071635006436e-06, "loss": 0.80805073, "memory(GiB)": 147.13, "step": 60900, "train_speed(iter/s)": 0.201257 }, { "acc": 0.77343888, "epoch": 1.421076422811648, "grad_norm": 5.5, "learning_rate": 2.0407842279370176e-06, "loss": 0.82168198, "memory(GiB)": 147.13, "step": 60910, "train_speed(iter/s)": 0.201274 }, { "acc": 0.7828186, "epoch": 1.4213097303839368, "grad_norm": 4.5, "learning_rate": 2.039261714800585e-06, "loss": 0.7554111, "memory(GiB)": 147.13, "step": 60920, "train_speed(iter/s)": 0.201291 }, { "acc": 0.78235793, "epoch": 1.4215430379562257, "grad_norm": 5.90625, "learning_rate": 2.0377396243086827e-06, "loss": 0.7957407, "memory(GiB)": 147.13, "step": 60930, "train_speed(iter/s)": 0.201309 }, { "acc": 0.77107277, "epoch": 1.4217763455285146, "grad_norm": 27.75, "learning_rate": 2.036217956678588e-06, "loss": 0.83353214, "memory(GiB)": 147.13, "step": 60940, "train_speed(iter/s)": 0.201325 }, { "acc": 0.78861694, "epoch": 1.4220096531008035, "grad_norm": 4.34375, "learning_rate": 2.034696712127518e-06, "loss": 0.75524387, "memory(GiB)": 147.13, "step": 60950, "train_speed(iter/s)": 0.201343 }, { "acc": 0.76062737, "epoch": 1.4222429606730924, "grad_norm": 6.53125, "learning_rate": 2.0331758908726323e-06, "loss": 0.87450056, "memory(GiB)": 147.13, "step": 60960, "train_speed(iter/s)": 0.201359 }, { "acc": 0.77146816, "epoch": 1.4224762682453813, "grad_norm": 5.25, "learning_rate": 2.031655493131026e-06, "loss": 0.80269995, "memory(GiB)": 147.13, "step": 60970, "train_speed(iter/s)": 0.201377 }, { "acc": 0.76841688, "epoch": 1.4227095758176702, "grad_norm": 5.5625, "learning_rate": 2.030135519119735e-06, "loss": 0.83026876, "memory(GiB)": 147.13, "step": 60980, "train_speed(iter/s)": 0.201393 }, { "acc": 0.78153791, "epoch": 1.422942883389959, "grad_norm": 8.375, "learning_rate": 2.0286159690557366e-06, "loss": 0.8033679, "memory(GiB)": 147.13, "step": 60990, "train_speed(iter/s)": 0.20141 }, { "acc": 0.79216089, "epoch": 1.423176190962248, "grad_norm": 8.0, "learning_rate": 2.027096843155944e-06, "loss": 0.76074972, "memory(GiB)": 147.13, "step": 61000, "train_speed(iter/s)": 0.201427 }, { "epoch": 1.423176190962248, "eval_acc": 0.744670359912952, "eval_loss": 0.8046051859855652, "eval_runtime": 1270.2783, "eval_samples_per_second": 28.333, "eval_steps_per_second": 14.167, "step": 61000 }, { "acc": 0.79125977, "epoch": 1.4234094985345367, "grad_norm": 10.5625, "learning_rate": 2.025578141637215e-06, "loss": 0.74310904, "memory(GiB)": 147.13, "step": 61010, "train_speed(iter/s)": 0.20059 }, { "acc": 0.76769581, "epoch": 1.4236428061068258, "grad_norm": 4.875, "learning_rate": 2.024059864716343e-06, "loss": 0.83147717, "memory(GiB)": 147.13, "step": 61020, "train_speed(iter/s)": 0.200608 }, { "acc": 0.77622566, "epoch": 1.4238761136791145, "grad_norm": 7.03125, "learning_rate": 2.022542012610058e-06, "loss": 0.80299397, "memory(GiB)": 147.13, "step": 61030, "train_speed(iter/s)": 0.200626 }, { "acc": 0.78763971, "epoch": 1.4241094212514036, "grad_norm": 4.53125, "learning_rate": 2.0210245855350397e-06, "loss": 0.7463274, "memory(GiB)": 147.13, "step": 61040, "train_speed(iter/s)": 0.200643 }, { "acc": 0.78232255, "epoch": 1.4243427288236923, "grad_norm": 5.8125, "learning_rate": 2.019507583707893e-06, "loss": 0.80083017, "memory(GiB)": 147.13, "step": 61050, "train_speed(iter/s)": 0.200661 }, { "acc": 0.79297686, "epoch": 1.4245760363959814, "grad_norm": 4.71875, "learning_rate": 2.017991007345175e-06, "loss": 0.73181958, "memory(GiB)": 147.13, "step": 61060, "train_speed(iter/s)": 0.200677 }, { "acc": 0.78155422, "epoch": 1.42480934396827, "grad_norm": 5.375, "learning_rate": 2.016474856663372e-06, "loss": 0.79105949, "memory(GiB)": 147.13, "step": 61070, "train_speed(iter/s)": 0.200693 }, { "acc": 0.77549515, "epoch": 1.425042651540559, "grad_norm": 6.0, "learning_rate": 2.014959131878918e-06, "loss": 0.81070833, "memory(GiB)": 147.13, "step": 61080, "train_speed(iter/s)": 0.20071 }, { "acc": 0.78981018, "epoch": 1.4252759591128479, "grad_norm": 4.03125, "learning_rate": 2.0134438332081814e-06, "loss": 0.76355982, "memory(GiB)": 147.13, "step": 61090, "train_speed(iter/s)": 0.200727 }, { "acc": 0.78231525, "epoch": 1.4255092666851368, "grad_norm": 7.0625, "learning_rate": 2.0119289608674682e-06, "loss": 0.77640409, "memory(GiB)": 147.13, "step": 61100, "train_speed(iter/s)": 0.200745 }, { "acc": 0.77027392, "epoch": 1.4257425742574257, "grad_norm": 6.28125, "learning_rate": 2.010414515073029e-06, "loss": 0.8242136, "memory(GiB)": 147.13, "step": 61110, "train_speed(iter/s)": 0.200761 }, { "acc": 0.78219414, "epoch": 1.4259758818297146, "grad_norm": 5.03125, "learning_rate": 2.0089004960410485e-06, "loss": 0.78226018, "memory(GiB)": 147.13, "step": 61120, "train_speed(iter/s)": 0.200778 }, { "acc": 0.79695072, "epoch": 1.4262091894020035, "grad_norm": 4.46875, "learning_rate": 2.007386903987654e-06, "loss": 0.72442718, "memory(GiB)": 147.13, "step": 61130, "train_speed(iter/s)": 0.200796 }, { "acc": 0.78832622, "epoch": 1.4264424969742924, "grad_norm": 5.15625, "learning_rate": 2.0058737391289085e-06, "loss": 0.7365797, "memory(GiB)": 147.13, "step": 61140, "train_speed(iter/s)": 0.200813 }, { "acc": 0.79924631, "epoch": 1.4266758045465813, "grad_norm": 10.625, "learning_rate": 2.0043610016808185e-06, "loss": 0.71290426, "memory(GiB)": 147.13, "step": 61150, "train_speed(iter/s)": 0.200831 }, { "acc": 0.793398, "epoch": 1.4269091121188702, "grad_norm": 5.25, "learning_rate": 2.0028486918593253e-06, "loss": 0.74249353, "memory(GiB)": 147.13, "step": 61160, "train_speed(iter/s)": 0.200847 }, { "acc": 0.79817839, "epoch": 1.427142419691159, "grad_norm": 7.1875, "learning_rate": 2.001336809880311e-06, "loss": 0.71638279, "memory(GiB)": 147.13, "step": 61170, "train_speed(iter/s)": 0.200864 }, { "acc": 0.7944067, "epoch": 1.427375727263448, "grad_norm": 7.3125, "learning_rate": 1.9998253559595952e-06, "loss": 0.72443352, "memory(GiB)": 147.13, "step": 61180, "train_speed(iter/s)": 0.200882 }, { "acc": 0.77642808, "epoch": 1.4276090348357369, "grad_norm": 5.40625, "learning_rate": 1.9983143303129373e-06, "loss": 0.81988869, "memory(GiB)": 147.13, "step": 61190, "train_speed(iter/s)": 0.200899 }, { "acc": 0.79813347, "epoch": 1.4278423424080258, "grad_norm": 4.875, "learning_rate": 1.996803733156038e-06, "loss": 0.72240438, "memory(GiB)": 147.13, "step": 61200, "train_speed(iter/s)": 0.200916 }, { "acc": 0.76945052, "epoch": 1.4280756499803147, "grad_norm": 5.96875, "learning_rate": 1.9952935647045317e-06, "loss": 0.83810911, "memory(GiB)": 147.13, "step": 61210, "train_speed(iter/s)": 0.200933 }, { "acc": 0.79682202, "epoch": 1.4283089575526036, "grad_norm": 7.59375, "learning_rate": 1.9937838251739983e-06, "loss": 0.72564445, "memory(GiB)": 147.13, "step": 61220, "train_speed(iter/s)": 0.200949 }, { "acc": 0.79631333, "epoch": 1.4285422651248925, "grad_norm": 6.0, "learning_rate": 1.9922745147799505e-06, "loss": 0.74118524, "memory(GiB)": 147.13, "step": 61230, "train_speed(iter/s)": 0.200967 }, { "acc": 0.80053787, "epoch": 1.4287755726971814, "grad_norm": 3.890625, "learning_rate": 1.9907656337378396e-06, "loss": 0.69944177, "memory(GiB)": 147.13, "step": 61240, "train_speed(iter/s)": 0.200983 }, { "acc": 0.76880045, "epoch": 1.4290088802694703, "grad_norm": 5.3125, "learning_rate": 1.9892571822630622e-06, "loss": 0.8351284, "memory(GiB)": 147.13, "step": 61250, "train_speed(iter/s)": 0.201 }, { "acc": 0.76350574, "epoch": 1.4292421878417592, "grad_norm": 4.46875, "learning_rate": 1.987749160570946e-06, "loss": 0.85821857, "memory(GiB)": 147.13, "step": 61260, "train_speed(iter/s)": 0.201018 }, { "acc": 0.7792346, "epoch": 1.429475495414048, "grad_norm": 5.5, "learning_rate": 1.9862415688767657e-06, "loss": 0.78018122, "memory(GiB)": 147.13, "step": 61270, "train_speed(iter/s)": 0.201036 }, { "acc": 0.7726048, "epoch": 1.429708802986337, "grad_norm": 5.375, "learning_rate": 1.984734407395722e-06, "loss": 0.82185307, "memory(GiB)": 147.13, "step": 61280, "train_speed(iter/s)": 0.201053 }, { "acc": 0.78452177, "epoch": 1.4299421105586259, "grad_norm": 4.0625, "learning_rate": 1.9832276763429674e-06, "loss": 0.75924959, "memory(GiB)": 147.13, "step": 61290, "train_speed(iter/s)": 0.201071 }, { "acc": 0.77438474, "epoch": 1.4301754181309148, "grad_norm": 4.65625, "learning_rate": 1.9817213759335846e-06, "loss": 0.84851189, "memory(GiB)": 147.13, "step": 61300, "train_speed(iter/s)": 0.201089 }, { "acc": 0.77766914, "epoch": 1.4304087257032037, "grad_norm": 5.15625, "learning_rate": 1.9802155063825995e-06, "loss": 0.7944005, "memory(GiB)": 147.13, "step": 61310, "train_speed(iter/s)": 0.201107 }, { "acc": 0.74958205, "epoch": 1.4306420332754926, "grad_norm": 5.84375, "learning_rate": 1.9787100679049742e-06, "loss": 0.91778793, "memory(GiB)": 147.13, "step": 61320, "train_speed(iter/s)": 0.201124 }, { "acc": 0.79310417, "epoch": 1.4308753408477815, "grad_norm": 9.375, "learning_rate": 1.977205060715607e-06, "loss": 0.74190741, "memory(GiB)": 147.13, "step": 61330, "train_speed(iter/s)": 0.201141 }, { "acc": 0.77827301, "epoch": 1.4311086484200704, "grad_norm": 5.5, "learning_rate": 1.975700485029341e-06, "loss": 0.80124111, "memory(GiB)": 147.13, "step": 61340, "train_speed(iter/s)": 0.201157 }, { "acc": 0.78497138, "epoch": 1.4313419559923592, "grad_norm": 6.21875, "learning_rate": 1.9741963410609506e-06, "loss": 0.75833988, "memory(GiB)": 147.13, "step": 61350, "train_speed(iter/s)": 0.201175 }, { "acc": 0.77764912, "epoch": 1.4315752635646481, "grad_norm": 3.65625, "learning_rate": 1.9726926290251548e-06, "loss": 0.7967144, "memory(GiB)": 147.13, "step": 61360, "train_speed(iter/s)": 0.201192 }, { "acc": 0.76858315, "epoch": 1.431808571136937, "grad_norm": 6.1875, "learning_rate": 1.971189349136607e-06, "loss": 0.82962227, "memory(GiB)": 147.13, "step": 61370, "train_speed(iter/s)": 0.201209 }, { "acc": 0.7532167, "epoch": 1.4320418787092257, "grad_norm": 4.6875, "learning_rate": 1.969686501609898e-06, "loss": 0.89655132, "memory(GiB)": 147.13, "step": 61380, "train_speed(iter/s)": 0.201227 }, { "acc": 0.79799242, "epoch": 1.4322751862815148, "grad_norm": 3.984375, "learning_rate": 1.9681840866595644e-06, "loss": 0.73923264, "memory(GiB)": 147.13, "step": 61390, "train_speed(iter/s)": 0.201245 }, { "acc": 0.77811699, "epoch": 1.4325084938538035, "grad_norm": 6.15625, "learning_rate": 1.966682104500068e-06, "loss": 0.79530067, "memory(GiB)": 147.13, "step": 61400, "train_speed(iter/s)": 0.201263 }, { "acc": 0.76311293, "epoch": 1.4327418014260926, "grad_norm": 4.8125, "learning_rate": 1.9651805553458212e-06, "loss": 0.83997898, "memory(GiB)": 147.13, "step": 61410, "train_speed(iter/s)": 0.201281 }, { "acc": 0.77644825, "epoch": 1.4329751089983813, "grad_norm": 5.65625, "learning_rate": 1.9636794394111676e-06, "loss": 0.81493549, "memory(GiB)": 147.13, "step": 61420, "train_speed(iter/s)": 0.201298 }, { "acc": 0.7841002, "epoch": 1.4332084165706704, "grad_norm": 5.625, "learning_rate": 1.962178756910393e-06, "loss": 0.77840614, "memory(GiB)": 147.13, "step": 61430, "train_speed(iter/s)": 0.201315 }, { "acc": 0.77601957, "epoch": 1.4334417241429591, "grad_norm": 6.15625, "learning_rate": 1.9606785080577173e-06, "loss": 0.80642538, "memory(GiB)": 147.13, "step": 61440, "train_speed(iter/s)": 0.201332 }, { "acc": 0.78321104, "epoch": 1.4336750317152482, "grad_norm": 4.4375, "learning_rate": 1.959178693067303e-06, "loss": 0.77348614, "memory(GiB)": 147.13, "step": 61450, "train_speed(iter/s)": 0.201349 }, { "acc": 0.77611961, "epoch": 1.433908339287537, "grad_norm": 5.53125, "learning_rate": 1.9576793121532467e-06, "loss": 0.81834526, "memory(GiB)": 147.13, "step": 61460, "train_speed(iter/s)": 0.201365 }, { "acc": 0.76458263, "epoch": 1.4341416468598258, "grad_norm": 5.40625, "learning_rate": 1.9561803655295835e-06, "loss": 0.86758423, "memory(GiB)": 147.13, "step": 61470, "train_speed(iter/s)": 0.201381 }, { "acc": 0.78354859, "epoch": 1.4343749544321147, "grad_norm": 5.28125, "learning_rate": 1.9546818534102903e-06, "loss": 0.77626381, "memory(GiB)": 147.13, "step": 61480, "train_speed(iter/s)": 0.201397 }, { "acc": 0.79747005, "epoch": 1.4346082620044036, "grad_norm": 4.6875, "learning_rate": 1.9531837760092765e-06, "loss": 0.72363586, "memory(GiB)": 147.13, "step": 61490, "train_speed(iter/s)": 0.201414 }, { "acc": 0.77367868, "epoch": 1.4348415695766925, "grad_norm": 4.65625, "learning_rate": 1.9516861335403963e-06, "loss": 0.80418158, "memory(GiB)": 147.13, "step": 61500, "train_speed(iter/s)": 0.201431 }, { "epoch": 1.4348415695766925, "eval_acc": 0.7446573882301356, "eval_loss": 0.8045687675476074, "eval_runtime": 1271.1131, "eval_samples_per_second": 28.315, "eval_steps_per_second": 14.158, "step": 61500 }, { "acc": 0.77232461, "epoch": 1.4350748771489814, "grad_norm": 5.09375, "learning_rate": 1.9501889262174323e-06, "loss": 0.82037907, "memory(GiB)": 147.13, "step": 61510, "train_speed(iter/s)": 0.2006 }, { "acc": 0.77945395, "epoch": 1.4353081847212703, "grad_norm": 5.4375, "learning_rate": 1.9486921542541147e-06, "loss": 0.79027767, "memory(GiB)": 147.13, "step": 61520, "train_speed(iter/s)": 0.200618 }, { "acc": 0.79523678, "epoch": 1.4355414922935592, "grad_norm": 4.375, "learning_rate": 1.9471958178641055e-06, "loss": 0.7177619, "memory(GiB)": 147.13, "step": 61530, "train_speed(iter/s)": 0.200635 }, { "acc": 0.79970589, "epoch": 1.435774799865848, "grad_norm": 4.25, "learning_rate": 1.9456999172610046e-06, "loss": 0.71671066, "memory(GiB)": 147.13, "step": 61540, "train_speed(iter/s)": 0.200652 }, { "acc": 0.76421814, "epoch": 1.436008107438137, "grad_norm": 5.625, "learning_rate": 1.9442044526583555e-06, "loss": 0.86059494, "memory(GiB)": 147.13, "step": 61550, "train_speed(iter/s)": 0.200669 }, { "acc": 0.77591553, "epoch": 1.436241415010426, "grad_norm": 7.0, "learning_rate": 1.9427094242696304e-06, "loss": 0.81485806, "memory(GiB)": 147.13, "step": 61560, "train_speed(iter/s)": 0.200686 }, { "acc": 0.78154783, "epoch": 1.4364747225827148, "grad_norm": 6.46875, "learning_rate": 1.941214832308249e-06, "loss": 0.78967419, "memory(GiB)": 147.13, "step": 61570, "train_speed(iter/s)": 0.200703 }, { "acc": 0.77090359, "epoch": 1.4367080301550037, "grad_norm": 5.5, "learning_rate": 1.9397206769875602e-06, "loss": 0.83600082, "memory(GiB)": 147.13, "step": 61580, "train_speed(iter/s)": 0.20072 }, { "acc": 0.77267971, "epoch": 1.4369413377272926, "grad_norm": 5.46875, "learning_rate": 1.9382269585208576e-06, "loss": 0.84045687, "memory(GiB)": 147.13, "step": 61590, "train_speed(iter/s)": 0.200737 }, { "acc": 0.75382733, "epoch": 1.4371746452995815, "grad_norm": 4.5625, "learning_rate": 1.936733677121367e-06, "loss": 0.87408772, "memory(GiB)": 147.13, "step": 61600, "train_speed(iter/s)": 0.200754 }, { "acc": 0.78201303, "epoch": 1.4374079528718704, "grad_norm": 6.21875, "learning_rate": 1.935240833002252e-06, "loss": 0.78332329, "memory(GiB)": 147.13, "step": 61610, "train_speed(iter/s)": 0.200771 }, { "acc": 0.8092041, "epoch": 1.4376412604441593, "grad_norm": 4.9375, "learning_rate": 1.933748426376622e-06, "loss": 0.68223801, "memory(GiB)": 147.13, "step": 61620, "train_speed(iter/s)": 0.200788 }, { "acc": 0.79494247, "epoch": 1.4378745680164482, "grad_norm": 5.40625, "learning_rate": 1.932256457457509e-06, "loss": 0.72881784, "memory(GiB)": 147.13, "step": 61630, "train_speed(iter/s)": 0.200805 }, { "acc": 0.78494654, "epoch": 1.438107875588737, "grad_norm": 4.78125, "learning_rate": 1.9307649264578982e-06, "loss": 0.75482392, "memory(GiB)": 147.13, "step": 61640, "train_speed(iter/s)": 0.200822 }, { "acc": 0.7854641, "epoch": 1.438341183161026, "grad_norm": 5.9375, "learning_rate": 1.9292738335907e-06, "loss": 0.77157822, "memory(GiB)": 147.13, "step": 61650, "train_speed(iter/s)": 0.200839 }, { "acc": 0.7829257, "epoch": 1.438574490733315, "grad_norm": 6.53125, "learning_rate": 1.9277831790687724e-06, "loss": 0.77883649, "memory(GiB)": 147.13, "step": 61660, "train_speed(iter/s)": 0.200854 }, { "acc": 0.78623981, "epoch": 1.4388077983056038, "grad_norm": 5.0625, "learning_rate": 1.9262929631049034e-06, "loss": 0.76112614, "memory(GiB)": 147.13, "step": 61670, "train_speed(iter/s)": 0.20087 }, { "acc": 0.7791275, "epoch": 1.4390411058778927, "grad_norm": 4.5, "learning_rate": 1.924803185911819e-06, "loss": 0.79928355, "memory(GiB)": 147.13, "step": 61680, "train_speed(iter/s)": 0.200888 }, { "acc": 0.79443464, "epoch": 1.4392744134501816, "grad_norm": 5.125, "learning_rate": 1.923313847702188e-06, "loss": 0.72661982, "memory(GiB)": 147.13, "step": 61690, "train_speed(iter/s)": 0.200903 }, { "acc": 0.78530979, "epoch": 1.4395077210224705, "grad_norm": 5.1875, "learning_rate": 1.9218249486886097e-06, "loss": 0.76591215, "memory(GiB)": 147.13, "step": 61700, "train_speed(iter/s)": 0.20092 }, { "acc": 0.78451734, "epoch": 1.4397410285947594, "grad_norm": 6.0, "learning_rate": 1.9203364890836277e-06, "loss": 0.78146687, "memory(GiB)": 147.13, "step": 61710, "train_speed(iter/s)": 0.200936 }, { "acc": 0.77538376, "epoch": 1.4399743361670483, "grad_norm": 5.78125, "learning_rate": 1.918848469099718e-06, "loss": 0.79503469, "memory(GiB)": 147.13, "step": 61720, "train_speed(iter/s)": 0.200953 }, { "acc": 0.79319973, "epoch": 1.4402076437393372, "grad_norm": 4.5, "learning_rate": 1.9173608889492936e-06, "loss": 0.7500802, "memory(GiB)": 147.13, "step": 61730, "train_speed(iter/s)": 0.20097 }, { "acc": 0.77077246, "epoch": 1.440440951311626, "grad_norm": 5.75, "learning_rate": 1.915873748844705e-06, "loss": 0.82878227, "memory(GiB)": 147.13, "step": 61740, "train_speed(iter/s)": 0.200986 }, { "acc": 0.78738832, "epoch": 1.440674258883915, "grad_norm": 6.25, "learning_rate": 1.9143870489982443e-06, "loss": 0.73900366, "memory(GiB)": 147.13, "step": 61750, "train_speed(iter/s)": 0.201002 }, { "acc": 0.76408119, "epoch": 1.4409075664562039, "grad_norm": 5.625, "learning_rate": 1.9129007896221365e-06, "loss": 0.88208513, "memory(GiB)": 147.13, "step": 61760, "train_speed(iter/s)": 0.201018 }, { "acc": 0.77571197, "epoch": 1.4411408740284926, "grad_norm": 5.4375, "learning_rate": 1.9114149709285416e-06, "loss": 0.8385725, "memory(GiB)": 147.13, "step": 61770, "train_speed(iter/s)": 0.201035 }, { "acc": 0.78287396, "epoch": 1.4413741816007817, "grad_norm": 5.1875, "learning_rate": 1.909929593129565e-06, "loss": 0.76827679, "memory(GiB)": 147.13, "step": 61780, "train_speed(iter/s)": 0.201051 }, { "acc": 0.76898689, "epoch": 1.4416074891730704, "grad_norm": 8.125, "learning_rate": 1.9084446564372393e-06, "loss": 0.83712349, "memory(GiB)": 147.13, "step": 61790, "train_speed(iter/s)": 0.201068 }, { "acc": 0.79256206, "epoch": 1.4418407967453595, "grad_norm": 6.46875, "learning_rate": 1.9069601610635424e-06, "loss": 0.74981198, "memory(GiB)": 147.13, "step": 61800, "train_speed(iter/s)": 0.201086 }, { "acc": 0.77438807, "epoch": 1.4420741043176482, "grad_norm": 5.6875, "learning_rate": 1.9054761072203843e-06, "loss": 0.80913887, "memory(GiB)": 147.13, "step": 61810, "train_speed(iter/s)": 0.201102 }, { "acc": 0.78683462, "epoch": 1.4423074118899373, "grad_norm": 4.125, "learning_rate": 1.9039924951196109e-06, "loss": 0.74516659, "memory(GiB)": 147.13, "step": 61820, "train_speed(iter/s)": 0.201119 }, { "acc": 0.76727543, "epoch": 1.442540719462226, "grad_norm": 4.9375, "learning_rate": 1.9025093249730108e-06, "loss": 0.85720892, "memory(GiB)": 147.13, "step": 61830, "train_speed(iter/s)": 0.201137 }, { "acc": 0.78560305, "epoch": 1.442774027034515, "grad_norm": 4.03125, "learning_rate": 1.9010265969923052e-06, "loss": 0.7749969, "memory(GiB)": 147.13, "step": 61840, "train_speed(iter/s)": 0.201152 }, { "acc": 0.77236147, "epoch": 1.4430073346068037, "grad_norm": 7.28125, "learning_rate": 1.8995443113891527e-06, "loss": 0.83206615, "memory(GiB)": 147.13, "step": 61850, "train_speed(iter/s)": 0.20117 }, { "acc": 0.77366638, "epoch": 1.4432406421790926, "grad_norm": 5.03125, "learning_rate": 1.898062468375147e-06, "loss": 0.80375004, "memory(GiB)": 147.13, "step": 61860, "train_speed(iter/s)": 0.201187 }, { "acc": 0.7846086, "epoch": 1.4434739497513815, "grad_norm": 6.0625, "learning_rate": 1.8965810681618251e-06, "loss": 0.78282852, "memory(GiB)": 147.13, "step": 61870, "train_speed(iter/s)": 0.201204 }, { "acc": 0.76645479, "epoch": 1.4437072573236704, "grad_norm": 9.0625, "learning_rate": 1.8951001109606538e-06, "loss": 0.84201212, "memory(GiB)": 147.13, "step": 61880, "train_speed(iter/s)": 0.201221 }, { "acc": 0.77387953, "epoch": 1.4439405648959593, "grad_norm": 4.15625, "learning_rate": 1.893619596983038e-06, "loss": 0.82105818, "memory(GiB)": 147.13, "step": 61890, "train_speed(iter/s)": 0.201238 }, { "acc": 0.7874526, "epoch": 1.4441738724682482, "grad_norm": 3.734375, "learning_rate": 1.8921395264403236e-06, "loss": 0.76474705, "memory(GiB)": 147.13, "step": 61900, "train_speed(iter/s)": 0.201255 }, { "acc": 0.78465261, "epoch": 1.4444071800405371, "grad_norm": 4.3125, "learning_rate": 1.890659899543788e-06, "loss": 0.77885513, "memory(GiB)": 147.13, "step": 61910, "train_speed(iter/s)": 0.201272 }, { "acc": 0.77660131, "epoch": 1.444640487612826, "grad_norm": 5.78125, "learning_rate": 1.88918071650465e-06, "loss": 0.8049757, "memory(GiB)": 147.13, "step": 61920, "train_speed(iter/s)": 0.201289 }, { "acc": 0.77938328, "epoch": 1.444873795185115, "grad_norm": 5.125, "learning_rate": 1.8877019775340587e-06, "loss": 0.81291084, "memory(GiB)": 147.13, "step": 61930, "train_speed(iter/s)": 0.201307 }, { "acc": 0.78789864, "epoch": 1.4451071027574038, "grad_norm": 6.28125, "learning_rate": 1.8862236828431086e-06, "loss": 0.76462269, "memory(GiB)": 147.13, "step": 61940, "train_speed(iter/s)": 0.201324 }, { "acc": 0.77977004, "epoch": 1.4453404103296927, "grad_norm": 6.0, "learning_rate": 1.8847458326428226e-06, "loss": 0.79729452, "memory(GiB)": 147.13, "step": 61950, "train_speed(iter/s)": 0.20134 }, { "acc": 0.78940058, "epoch": 1.4455737179019816, "grad_norm": 6.34375, "learning_rate": 1.8832684271441643e-06, "loss": 0.74841819, "memory(GiB)": 147.13, "step": 61960, "train_speed(iter/s)": 0.201357 }, { "acc": 0.78960233, "epoch": 1.4458070254742705, "grad_norm": 3.703125, "learning_rate": 1.8817914665580322e-06, "loss": 0.7424305, "memory(GiB)": 147.13, "step": 61970, "train_speed(iter/s)": 0.201373 }, { "acc": 0.80876637, "epoch": 1.4460403330465594, "grad_norm": 7.53125, "learning_rate": 1.8803149510952613e-06, "loss": 0.71806173, "memory(GiB)": 147.13, "step": 61980, "train_speed(iter/s)": 0.20139 }, { "acc": 0.77671194, "epoch": 1.4462736406188483, "grad_norm": 4.875, "learning_rate": 1.8788388809666259e-06, "loss": 0.79790506, "memory(GiB)": 147.13, "step": 61990, "train_speed(iter/s)": 0.201407 }, { "acc": 0.78528218, "epoch": 1.4465069481911372, "grad_norm": 5.21875, "learning_rate": 1.877363256382832e-06, "loss": 0.7618494, "memory(GiB)": 147.13, "step": 62000, "train_speed(iter/s)": 0.201423 }, { "epoch": 1.4465069481911372, "eval_acc": 0.7446714809225781, "eval_loss": 0.8045513033866882, "eval_runtime": 1270.3592, "eval_samples_per_second": 28.331, "eval_steps_per_second": 14.166, "step": 62000 }, { "acc": 0.78110247, "epoch": 1.4467402557634261, "grad_norm": 7.09375, "learning_rate": 1.8758880775545279e-06, "loss": 0.77745328, "memory(GiB)": 147.13, "step": 62010, "train_speed(iter/s)": 0.200597 }, { "acc": 0.78310804, "epoch": 1.446973563335715, "grad_norm": 4.15625, "learning_rate": 1.8744133446922935e-06, "loss": 0.78575201, "memory(GiB)": 147.13, "step": 62020, "train_speed(iter/s)": 0.200613 }, { "acc": 0.77370396, "epoch": 1.447206870908004, "grad_norm": 4.6875, "learning_rate": 1.8729390580066442e-06, "loss": 0.82233162, "memory(GiB)": 147.13, "step": 62030, "train_speed(iter/s)": 0.200629 }, { "acc": 0.77510133, "epoch": 1.4474401784802928, "grad_norm": 6.09375, "learning_rate": 1.8714652177080377e-06, "loss": 0.80517254, "memory(GiB)": 147.13, "step": 62040, "train_speed(iter/s)": 0.200645 }, { "acc": 0.78512564, "epoch": 1.4476734860525817, "grad_norm": 5.9375, "learning_rate": 1.869991824006861e-06, "loss": 0.78300362, "memory(GiB)": 147.13, "step": 62050, "train_speed(iter/s)": 0.200662 }, { "acc": 0.78685694, "epoch": 1.4479067936248706, "grad_norm": 20.25, "learning_rate": 1.8685188771134433e-06, "loss": 0.77022228, "memory(GiB)": 147.13, "step": 62060, "train_speed(iter/s)": 0.200678 }, { "acc": 0.76881313, "epoch": 1.4481401011971595, "grad_norm": 5.09375, "learning_rate": 1.8670463772380464e-06, "loss": 0.82338123, "memory(GiB)": 147.13, "step": 62070, "train_speed(iter/s)": 0.200695 }, { "acc": 0.79330425, "epoch": 1.4483734087694484, "grad_norm": 6.59375, "learning_rate": 1.8655743245908692e-06, "loss": 0.73466892, "memory(GiB)": 147.13, "step": 62080, "train_speed(iter/s)": 0.200711 }, { "acc": 0.79285131, "epoch": 1.4486067163417373, "grad_norm": 6.3125, "learning_rate": 1.864102719382045e-06, "loss": 0.73613644, "memory(GiB)": 147.13, "step": 62090, "train_speed(iter/s)": 0.200727 }, { "acc": 0.79956055, "epoch": 1.4488400239140262, "grad_norm": 4.125, "learning_rate": 1.8626315618216484e-06, "loss": 0.72277913, "memory(GiB)": 147.13, "step": 62100, "train_speed(iter/s)": 0.200744 }, { "acc": 0.77584467, "epoch": 1.4490733314863151, "grad_norm": 4.75, "learning_rate": 1.8611608521196844e-06, "loss": 0.80201588, "memory(GiB)": 147.13, "step": 62110, "train_speed(iter/s)": 0.200761 }, { "acc": 0.78103304, "epoch": 1.449306639058604, "grad_norm": 8.125, "learning_rate": 1.8596905904860956e-06, "loss": 0.78463149, "memory(GiB)": 147.13, "step": 62120, "train_speed(iter/s)": 0.200778 }, { "acc": 0.78027878, "epoch": 1.449539946630893, "grad_norm": 6.65625, "learning_rate": 1.8582207771307647e-06, "loss": 0.78982868, "memory(GiB)": 147.13, "step": 62130, "train_speed(iter/s)": 0.200794 }, { "acc": 0.80753584, "epoch": 1.4497732542031816, "grad_norm": 6.125, "learning_rate": 1.8567514122635027e-06, "loss": 0.68397255, "memory(GiB)": 147.13, "step": 62140, "train_speed(iter/s)": 0.200811 }, { "acc": 0.80782089, "epoch": 1.4500065617754707, "grad_norm": 7.21875, "learning_rate": 1.8552824960940658e-06, "loss": 0.67650642, "memory(GiB)": 147.13, "step": 62150, "train_speed(iter/s)": 0.200828 }, { "acc": 0.77864218, "epoch": 1.4502398693477594, "grad_norm": 7.0, "learning_rate": 1.8538140288321387e-06, "loss": 0.80679083, "memory(GiB)": 147.13, "step": 62160, "train_speed(iter/s)": 0.200844 }, { "acc": 0.79958267, "epoch": 1.4504731769200485, "grad_norm": 6.15625, "learning_rate": 1.8523460106873436e-06, "loss": 0.7120677, "memory(GiB)": 147.13, "step": 62170, "train_speed(iter/s)": 0.20086 }, { "acc": 0.77506552, "epoch": 1.4507064844923372, "grad_norm": 6.71875, "learning_rate": 1.8508784418692428e-06, "loss": 0.83289452, "memory(GiB)": 147.13, "step": 62180, "train_speed(iter/s)": 0.200876 }, { "acc": 0.80013189, "epoch": 1.4509397920646263, "grad_norm": 5.0625, "learning_rate": 1.8494113225873295e-06, "loss": 0.70295601, "memory(GiB)": 147.13, "step": 62190, "train_speed(iter/s)": 0.200893 }, { "acc": 0.79650593, "epoch": 1.451173099636915, "grad_norm": 8.4375, "learning_rate": 1.8479446530510348e-06, "loss": 0.72814693, "memory(GiB)": 147.13, "step": 62200, "train_speed(iter/s)": 0.20091 }, { "acc": 0.77118607, "epoch": 1.451406407209204, "grad_norm": 4.09375, "learning_rate": 1.8464784334697234e-06, "loss": 0.84197454, "memory(GiB)": 147.13, "step": 62210, "train_speed(iter/s)": 0.200925 }, { "acc": 0.79284525, "epoch": 1.4516397147814928, "grad_norm": 4.78125, "learning_rate": 1.845012664052701e-06, "loss": 0.74894667, "memory(GiB)": 147.13, "step": 62220, "train_speed(iter/s)": 0.200941 }, { "acc": 0.77012434, "epoch": 1.451873022353782, "grad_norm": 5.1875, "learning_rate": 1.843547345009203e-06, "loss": 0.81883812, "memory(GiB)": 147.13, "step": 62230, "train_speed(iter/s)": 0.200958 }, { "acc": 0.77381506, "epoch": 1.4521063299260706, "grad_norm": 6.71875, "learning_rate": 1.8420824765484058e-06, "loss": 0.79767299, "memory(GiB)": 147.13, "step": 62240, "train_speed(iter/s)": 0.200974 }, { "acc": 0.77434549, "epoch": 1.4523396374983595, "grad_norm": 4.9375, "learning_rate": 1.8406180588794176e-06, "loss": 0.81015167, "memory(GiB)": 147.13, "step": 62250, "train_speed(iter/s)": 0.20099 }, { "acc": 0.79392915, "epoch": 1.4525729450706484, "grad_norm": 6.53125, "learning_rate": 1.8391540922112822e-06, "loss": 0.72809219, "memory(GiB)": 147.13, "step": 62260, "train_speed(iter/s)": 0.201007 }, { "acc": 0.77831402, "epoch": 1.4528062526429373, "grad_norm": 5.59375, "learning_rate": 1.8376905767529834e-06, "loss": 0.78499942, "memory(GiB)": 147.13, "step": 62270, "train_speed(iter/s)": 0.201022 }, { "acc": 0.79829054, "epoch": 1.4530395602152262, "grad_norm": 6.21875, "learning_rate": 1.8362275127134348e-06, "loss": 0.71653814, "memory(GiB)": 147.13, "step": 62280, "train_speed(iter/s)": 0.201037 }, { "acc": 0.75779715, "epoch": 1.453272867787515, "grad_norm": 5.25, "learning_rate": 1.8347649003014911e-06, "loss": 0.86373796, "memory(GiB)": 147.13, "step": 62290, "train_speed(iter/s)": 0.201054 }, { "acc": 0.76059661, "epoch": 1.453506175359804, "grad_norm": 8.1875, "learning_rate": 1.833302739725939e-06, "loss": 0.87852764, "memory(GiB)": 147.13, "step": 62300, "train_speed(iter/s)": 0.201069 }, { "acc": 0.77951355, "epoch": 1.4537394829320929, "grad_norm": 11.6875, "learning_rate": 1.8318410311955003e-06, "loss": 0.7952219, "memory(GiB)": 147.13, "step": 62310, "train_speed(iter/s)": 0.201087 }, { "acc": 0.78401728, "epoch": 1.4539727905043818, "grad_norm": 4.6875, "learning_rate": 1.830379774918834e-06, "loss": 0.77956657, "memory(GiB)": 147.13, "step": 62320, "train_speed(iter/s)": 0.201102 }, { "acc": 0.78924007, "epoch": 1.4542060980766707, "grad_norm": 5.59375, "learning_rate": 1.8289189711045324e-06, "loss": 0.75395088, "memory(GiB)": 147.13, "step": 62330, "train_speed(iter/s)": 0.201119 }, { "acc": 0.77698288, "epoch": 1.4544394056489596, "grad_norm": 4.71875, "learning_rate": 1.8274586199611283e-06, "loss": 0.78425674, "memory(GiB)": 147.13, "step": 62340, "train_speed(iter/s)": 0.201135 }, { "acc": 0.78032331, "epoch": 1.4546727132212485, "grad_norm": 6.59375, "learning_rate": 1.8259987216970826e-06, "loss": 0.79403481, "memory(GiB)": 147.13, "step": 62350, "train_speed(iter/s)": 0.201152 }, { "acc": 0.7868978, "epoch": 1.4549060207935374, "grad_norm": 4.9375, "learning_rate": 1.8245392765207993e-06, "loss": 0.75227661, "memory(GiB)": 147.13, "step": 62360, "train_speed(iter/s)": 0.201168 }, { "acc": 0.78523664, "epoch": 1.4551393283658263, "grad_norm": 5.40625, "learning_rate": 1.8230802846406104e-06, "loss": 0.76713495, "memory(GiB)": 147.13, "step": 62370, "train_speed(iter/s)": 0.201183 }, { "acc": 0.77572908, "epoch": 1.4553726359381152, "grad_norm": 5.84375, "learning_rate": 1.821621746264789e-06, "loss": 0.80009136, "memory(GiB)": 147.13, "step": 62380, "train_speed(iter/s)": 0.2012 }, { "acc": 0.78314152, "epoch": 1.455605943510404, "grad_norm": 7.15625, "learning_rate": 1.8201636616015405e-06, "loss": 0.77063742, "memory(GiB)": 147.13, "step": 62390, "train_speed(iter/s)": 0.201216 }, { "acc": 0.77914481, "epoch": 1.455839251082693, "grad_norm": 4.53125, "learning_rate": 1.8187060308590038e-06, "loss": 0.77920589, "memory(GiB)": 147.13, "step": 62400, "train_speed(iter/s)": 0.201232 }, { "acc": 0.79587164, "epoch": 1.4560725586549819, "grad_norm": 5.65625, "learning_rate": 1.8172488542452583e-06, "loss": 0.73282957, "memory(GiB)": 147.13, "step": 62410, "train_speed(iter/s)": 0.201249 }, { "acc": 0.79226098, "epoch": 1.4563058662272708, "grad_norm": 5.65625, "learning_rate": 1.8157921319683147e-06, "loss": 0.73642225, "memory(GiB)": 147.13, "step": 62420, "train_speed(iter/s)": 0.201265 }, { "acc": 0.76984286, "epoch": 1.4565391737995597, "grad_norm": 5.0625, "learning_rate": 1.8143358642361191e-06, "loss": 0.83457851, "memory(GiB)": 147.13, "step": 62430, "train_speed(iter/s)": 0.201282 }, { "acc": 0.78718529, "epoch": 1.4567724813718486, "grad_norm": 5.3125, "learning_rate": 1.8128800512565514e-06, "loss": 0.75155096, "memory(GiB)": 147.13, "step": 62440, "train_speed(iter/s)": 0.201299 }, { "acc": 0.78033142, "epoch": 1.4570057889441375, "grad_norm": 4.03125, "learning_rate": 1.811424693237433e-06, "loss": 0.77643776, "memory(GiB)": 147.13, "step": 62450, "train_speed(iter/s)": 0.201317 }, { "acc": 0.80350819, "epoch": 1.4572390965164264, "grad_norm": 4.40625, "learning_rate": 1.8099697903865127e-06, "loss": 0.69405346, "memory(GiB)": 147.13, "step": 62460, "train_speed(iter/s)": 0.201333 }, { "acc": 0.7767561, "epoch": 1.4574724040887153, "grad_norm": 4.09375, "learning_rate": 1.8085153429114766e-06, "loss": 0.80634718, "memory(GiB)": 147.13, "step": 62470, "train_speed(iter/s)": 0.201351 }, { "acc": 0.79806414, "epoch": 1.4577057116610042, "grad_norm": 4.71875, "learning_rate": 1.8070613510199497e-06, "loss": 0.72472191, "memory(GiB)": 147.13, "step": 62480, "train_speed(iter/s)": 0.201368 }, { "acc": 0.7791873, "epoch": 1.457939019233293, "grad_norm": 4.75, "learning_rate": 1.8056078149194861e-06, "loss": 0.78082762, "memory(GiB)": 147.13, "step": 62490, "train_speed(iter/s)": 0.201385 }, { "acc": 0.78747339, "epoch": 1.458172326805582, "grad_norm": 7.03125, "learning_rate": 1.8041547348175803e-06, "loss": 0.76401696, "memory(GiB)": 147.13, "step": 62500, "train_speed(iter/s)": 0.201402 }, { "epoch": 1.458172326805582, "eval_acc": 0.7446378506337948, "eval_loss": 0.8045361042022705, "eval_runtime": 1270.7937, "eval_samples_per_second": 28.322, "eval_steps_per_second": 14.161, "step": 62500 }, { "acc": 0.76177197, "epoch": 1.4584056343778709, "grad_norm": 4.9375, "learning_rate": 1.802702110921658e-06, "loss": 0.85588417, "memory(GiB)": 147.13, "step": 62510, "train_speed(iter/s)": 0.200583 }, { "acc": 0.7588037, "epoch": 1.4586389419501598, "grad_norm": 6.59375, "learning_rate": 1.8012499434390784e-06, "loss": 0.8837925, "memory(GiB)": 147.13, "step": 62520, "train_speed(iter/s)": 0.200598 }, { "acc": 0.76614499, "epoch": 1.4588722495224484, "grad_norm": 5.90625, "learning_rate": 1.7997982325771425e-06, "loss": 0.83949127, "memory(GiB)": 147.13, "step": 62530, "train_speed(iter/s)": 0.200613 }, { "acc": 0.78617134, "epoch": 1.4591055570947375, "grad_norm": 6.21875, "learning_rate": 1.7983469785430785e-06, "loss": 0.79134507, "memory(GiB)": 147.13, "step": 62540, "train_speed(iter/s)": 0.200629 }, { "acc": 0.79341078, "epoch": 1.4593388646670262, "grad_norm": 4.3125, "learning_rate": 1.7968961815440534e-06, "loss": 0.76289606, "memory(GiB)": 147.13, "step": 62550, "train_speed(iter/s)": 0.200646 }, { "acc": 0.77591524, "epoch": 1.4595721722393153, "grad_norm": 7.96875, "learning_rate": 1.7954458417871667e-06, "loss": 0.80238419, "memory(GiB)": 147.13, "step": 62560, "train_speed(iter/s)": 0.200663 }, { "acc": 0.79817991, "epoch": 1.459805479811604, "grad_norm": 5.1875, "learning_rate": 1.7939959594794564e-06, "loss": 0.71526041, "memory(GiB)": 147.13, "step": 62570, "train_speed(iter/s)": 0.200681 }, { "acc": 0.77496409, "epoch": 1.4600387873838931, "grad_norm": 7.46875, "learning_rate": 1.7925465348278898e-06, "loss": 0.7962131, "memory(GiB)": 147.13, "step": 62580, "train_speed(iter/s)": 0.200697 }, { "acc": 0.78668814, "epoch": 1.4602720949561818, "grad_norm": 6.4375, "learning_rate": 1.7910975680393756e-06, "loss": 0.76099, "memory(GiB)": 147.13, "step": 62590, "train_speed(iter/s)": 0.200714 }, { "acc": 0.77080307, "epoch": 1.460505402528471, "grad_norm": 5.15625, "learning_rate": 1.789649059320751e-06, "loss": 0.82710533, "memory(GiB)": 147.13, "step": 62600, "train_speed(iter/s)": 0.20073 }, { "acc": 0.77813931, "epoch": 1.4607387101007596, "grad_norm": 5.8125, "learning_rate": 1.7882010088787888e-06, "loss": 0.801579, "memory(GiB)": 147.13, "step": 62610, "train_speed(iter/s)": 0.200749 }, { "acc": 0.77716465, "epoch": 1.4609720176730487, "grad_norm": 5.9375, "learning_rate": 1.7867534169202018e-06, "loss": 0.80519314, "memory(GiB)": 147.13, "step": 62620, "train_speed(iter/s)": 0.200766 }, { "acc": 0.75499105, "epoch": 1.4612053252453374, "grad_norm": 4.65625, "learning_rate": 1.785306283651629e-06, "loss": 0.88495131, "memory(GiB)": 147.13, "step": 62630, "train_speed(iter/s)": 0.200784 }, { "acc": 0.77682557, "epoch": 1.4614386328176263, "grad_norm": 6.34375, "learning_rate": 1.783859609279654e-06, "loss": 0.80335407, "memory(GiB)": 147.13, "step": 62640, "train_speed(iter/s)": 0.200802 }, { "acc": 0.78338518, "epoch": 1.4616719403899152, "grad_norm": 5.4375, "learning_rate": 1.7824133940107818e-06, "loss": 0.77647243, "memory(GiB)": 147.13, "step": 62650, "train_speed(iter/s)": 0.200818 }, { "acc": 0.77094107, "epoch": 1.4619052479622041, "grad_norm": 4.375, "learning_rate": 1.7809676380514646e-06, "loss": 0.84018555, "memory(GiB)": 147.13, "step": 62660, "train_speed(iter/s)": 0.200835 }, { "acc": 0.78348899, "epoch": 1.462138555534493, "grad_norm": 5.78125, "learning_rate": 1.7795223416080804e-06, "loss": 0.7956295, "memory(GiB)": 147.13, "step": 62670, "train_speed(iter/s)": 0.200851 }, { "acc": 0.77315598, "epoch": 1.462371863106782, "grad_norm": 5.59375, "learning_rate": 1.778077504886948e-06, "loss": 0.81796093, "memory(GiB)": 147.13, "step": 62680, "train_speed(iter/s)": 0.200868 }, { "acc": 0.7750021, "epoch": 1.4626051706790708, "grad_norm": 7.0625, "learning_rate": 1.7766331280943156e-06, "loss": 0.80663986, "memory(GiB)": 147.13, "step": 62690, "train_speed(iter/s)": 0.200886 }, { "acc": 0.78340206, "epoch": 1.4628384782513597, "grad_norm": 5.28125, "learning_rate": 1.775189211436366e-06, "loss": 0.78596182, "memory(GiB)": 147.13, "step": 62700, "train_speed(iter/s)": 0.200902 }, { "acc": 0.77824869, "epoch": 1.4630717858236486, "grad_norm": 5.25, "learning_rate": 1.7737457551192221e-06, "loss": 0.80983734, "memory(GiB)": 147.13, "step": 62710, "train_speed(iter/s)": 0.200918 }, { "acc": 0.77051554, "epoch": 1.4633050933959375, "grad_norm": 6.8125, "learning_rate": 1.7723027593489322e-06, "loss": 0.82581806, "memory(GiB)": 147.13, "step": 62720, "train_speed(iter/s)": 0.200936 }, { "acc": 0.76511116, "epoch": 1.4635384009682264, "grad_norm": 7.53125, "learning_rate": 1.7708602243314876e-06, "loss": 0.84343872, "memory(GiB)": 147.13, "step": 62730, "train_speed(iter/s)": 0.200953 }, { "acc": 0.78928823, "epoch": 1.4637717085405153, "grad_norm": 4.3125, "learning_rate": 1.7694181502728074e-06, "loss": 0.74881258, "memory(GiB)": 147.13, "step": 62740, "train_speed(iter/s)": 0.20097 }, { "acc": 0.77590837, "epoch": 1.4640050161128042, "grad_norm": 6.125, "learning_rate": 1.7679765373787467e-06, "loss": 0.78589211, "memory(GiB)": 147.13, "step": 62750, "train_speed(iter/s)": 0.200987 }, { "acc": 0.78272462, "epoch": 1.464238323685093, "grad_norm": 6.59375, "learning_rate": 1.7665353858550993e-06, "loss": 0.75262532, "memory(GiB)": 147.13, "step": 62760, "train_speed(iter/s)": 0.201003 }, { "acc": 0.77953529, "epoch": 1.464471631257382, "grad_norm": 7.4375, "learning_rate": 1.7650946959075833e-06, "loss": 0.81623087, "memory(GiB)": 147.13, "step": 62770, "train_speed(iter/s)": 0.20102 }, { "acc": 0.77969527, "epoch": 1.464704938829671, "grad_norm": 5.53125, "learning_rate": 1.763654467741861e-06, "loss": 0.78417301, "memory(GiB)": 147.13, "step": 62780, "train_speed(iter/s)": 0.201037 }, { "acc": 0.79427376, "epoch": 1.4649382464019598, "grad_norm": 6.53125, "learning_rate": 1.7622147015635222e-06, "loss": 0.75227299, "memory(GiB)": 147.13, "step": 62790, "train_speed(iter/s)": 0.201054 }, { "acc": 0.77105227, "epoch": 1.4651715539742487, "grad_norm": 4.75, "learning_rate": 1.760775397578095e-06, "loss": 0.83062248, "memory(GiB)": 147.13, "step": 62800, "train_speed(iter/s)": 0.20107 }, { "acc": 0.7859479, "epoch": 1.4654048615465376, "grad_norm": 5.375, "learning_rate": 1.7593365559910397e-06, "loss": 0.76885614, "memory(GiB)": 147.13, "step": 62810, "train_speed(iter/s)": 0.201086 }, { "acc": 0.76209989, "epoch": 1.4656381691188265, "grad_norm": 6.4375, "learning_rate": 1.7578981770077474e-06, "loss": 0.84865894, "memory(GiB)": 147.13, "step": 62820, "train_speed(iter/s)": 0.201103 }, { "acc": 0.78125958, "epoch": 1.4658714766911154, "grad_norm": 6.0625, "learning_rate": 1.7564602608335502e-06, "loss": 0.77670856, "memory(GiB)": 147.13, "step": 62830, "train_speed(iter/s)": 0.20112 }, { "acc": 0.79948587, "epoch": 1.4661047842634043, "grad_norm": 7.4375, "learning_rate": 1.7550228076737069e-06, "loss": 0.72959423, "memory(GiB)": 147.13, "step": 62840, "train_speed(iter/s)": 0.201137 }, { "acc": 0.75941429, "epoch": 1.4663380918356932, "grad_norm": 5.75, "learning_rate": 1.7535858177334163e-06, "loss": 0.86305752, "memory(GiB)": 147.13, "step": 62850, "train_speed(iter/s)": 0.201152 }, { "acc": 0.78896446, "epoch": 1.466571399407982, "grad_norm": 4.28125, "learning_rate": 1.7521492912178062e-06, "loss": 0.75683656, "memory(GiB)": 147.13, "step": 62860, "train_speed(iter/s)": 0.20117 }, { "acc": 0.79165177, "epoch": 1.466804706980271, "grad_norm": 5.40625, "learning_rate": 1.7507132283319445e-06, "loss": 0.73763084, "memory(GiB)": 147.13, "step": 62870, "train_speed(iter/s)": 0.201185 }, { "acc": 0.76576929, "epoch": 1.4670380145525599, "grad_norm": 4.1875, "learning_rate": 1.7492776292808217e-06, "loss": 0.84900112, "memory(GiB)": 147.13, "step": 62880, "train_speed(iter/s)": 0.201201 }, { "acc": 0.77859445, "epoch": 1.4672713221248488, "grad_norm": 4.78125, "learning_rate": 1.7478424942693751e-06, "loss": 0.79272537, "memory(GiB)": 147.13, "step": 62890, "train_speed(iter/s)": 0.201217 }, { "acc": 0.7813118, "epoch": 1.4675046296971377, "grad_norm": 6.9375, "learning_rate": 1.7464078235024678e-06, "loss": 0.78525038, "memory(GiB)": 147.13, "step": 62900, "train_speed(iter/s)": 0.201233 }, { "acc": 0.78674507, "epoch": 1.4677379372694266, "grad_norm": 7.4375, "learning_rate": 1.7449736171848964e-06, "loss": 0.76398363, "memory(GiB)": 147.13, "step": 62910, "train_speed(iter/s)": 0.201248 }, { "acc": 0.77175941, "epoch": 1.4679712448417153, "grad_norm": 9.0625, "learning_rate": 1.7435398755213977e-06, "loss": 0.8071166, "memory(GiB)": 147.13, "step": 62920, "train_speed(iter/s)": 0.201265 }, { "acc": 0.75375805, "epoch": 1.4682045524140044, "grad_norm": 6.59375, "learning_rate": 1.7421065987166335e-06, "loss": 0.87107162, "memory(GiB)": 147.13, "step": 62930, "train_speed(iter/s)": 0.201281 }, { "acc": 0.80961304, "epoch": 1.468437859986293, "grad_norm": 5.34375, "learning_rate": 1.7406737869752082e-06, "loss": 0.67591333, "memory(GiB)": 147.13, "step": 62940, "train_speed(iter/s)": 0.201296 }, { "acc": 0.77118597, "epoch": 1.4686711675585822, "grad_norm": 5.28125, "learning_rate": 1.7392414405016527e-06, "loss": 0.83532896, "memory(GiB)": 147.13, "step": 62950, "train_speed(iter/s)": 0.201312 }, { "acc": 0.76841412, "epoch": 1.4689044751308709, "grad_norm": 6.3125, "learning_rate": 1.7378095595004323e-06, "loss": 0.84016171, "memory(GiB)": 147.13, "step": 62960, "train_speed(iter/s)": 0.201329 }, { "acc": 0.77326422, "epoch": 1.46913778270316, "grad_norm": 6.40625, "learning_rate": 1.736378144175952e-06, "loss": 0.83091669, "memory(GiB)": 147.13, "step": 62970, "train_speed(iter/s)": 0.201346 }, { "acc": 0.7910758, "epoch": 1.4693710902754487, "grad_norm": 6.0625, "learning_rate": 1.7349471947325414e-06, "loss": 0.72511082, "memory(GiB)": 147.13, "step": 62980, "train_speed(iter/s)": 0.201363 }, { "acc": 0.78480339, "epoch": 1.4696043978477378, "grad_norm": 4.28125, "learning_rate": 1.7335167113744732e-06, "loss": 0.76047554, "memory(GiB)": 147.13, "step": 62990, "train_speed(iter/s)": 0.20138 }, { "acc": 0.78779182, "epoch": 1.4698377054200265, "grad_norm": 4.3125, "learning_rate": 1.7320866943059427e-06, "loss": 0.75346622, "memory(GiB)": 147.13, "step": 63000, "train_speed(iter/s)": 0.201397 }, { "epoch": 1.4698377054200265, "eval_acc": 0.7446705200571843, "eval_loss": 0.8045252561569214, "eval_runtime": 1270.0191, "eval_samples_per_second": 28.339, "eval_steps_per_second": 14.17, "step": 63000 }, { "acc": 0.79027567, "epoch": 1.4700710129923154, "grad_norm": 5.84375, "learning_rate": 1.7306571437310893e-06, "loss": 0.75496302, "memory(GiB)": 147.13, "step": 63010, "train_speed(iter/s)": 0.200586 }, { "acc": 0.77779145, "epoch": 1.4703043205646043, "grad_norm": 5.15625, "learning_rate": 1.7292280598539769e-06, "loss": 0.80010624, "memory(GiB)": 147.13, "step": 63020, "train_speed(iter/s)": 0.200603 }, { "acc": 0.78809614, "epoch": 1.4705376281368931, "grad_norm": 5.375, "learning_rate": 1.72779944287861e-06, "loss": 0.7661644, "memory(GiB)": 147.13, "step": 63030, "train_speed(iter/s)": 0.20062 }, { "acc": 0.78059311, "epoch": 1.470770935709182, "grad_norm": 4.875, "learning_rate": 1.7263712930089227e-06, "loss": 0.78118305, "memory(GiB)": 147.13, "step": 63040, "train_speed(iter/s)": 0.200635 }, { "acc": 0.78940554, "epoch": 1.471004243281471, "grad_norm": 9.75, "learning_rate": 1.7249436104487805e-06, "loss": 0.74943719, "memory(GiB)": 147.13, "step": 63050, "train_speed(iter/s)": 0.200651 }, { "acc": 0.77817397, "epoch": 1.4712375508537598, "grad_norm": 3.9375, "learning_rate": 1.7235163954019878e-06, "loss": 0.83013039, "memory(GiB)": 147.13, "step": 63060, "train_speed(iter/s)": 0.200667 }, { "acc": 0.7942832, "epoch": 1.4714708584260487, "grad_norm": 4.71875, "learning_rate": 1.7220896480722766e-06, "loss": 0.71083398, "memory(GiB)": 147.13, "step": 63070, "train_speed(iter/s)": 0.200683 }, { "acc": 0.77893252, "epoch": 1.4717041659983376, "grad_norm": 5.28125, "learning_rate": 1.7206633686633172e-06, "loss": 0.7933465, "memory(GiB)": 147.13, "step": 63080, "train_speed(iter/s)": 0.200699 }, { "acc": 0.77962651, "epoch": 1.4719374735706265, "grad_norm": 3.625, "learning_rate": 1.719237557378709e-06, "loss": 0.79189463, "memory(GiB)": 147.13, "step": 63090, "train_speed(iter/s)": 0.200715 }, { "acc": 0.7757308, "epoch": 1.4721707811429154, "grad_norm": 4.59375, "learning_rate": 1.7178122144219873e-06, "loss": 0.79125972, "memory(GiB)": 147.13, "step": 63100, "train_speed(iter/s)": 0.200732 }, { "acc": 0.78209968, "epoch": 1.4724040887152043, "grad_norm": 4.25, "learning_rate": 1.716387339996618e-06, "loss": 0.78565693, "memory(GiB)": 147.13, "step": 63110, "train_speed(iter/s)": 0.200747 }, { "acc": 0.76956949, "epoch": 1.4726373962874932, "grad_norm": 7.1875, "learning_rate": 1.7149629343060003e-06, "loss": 0.82966146, "memory(GiB)": 147.13, "step": 63120, "train_speed(iter/s)": 0.200764 }, { "acc": 0.78773918, "epoch": 1.4728707038597821, "grad_norm": 4.8125, "learning_rate": 1.7135389975534711e-06, "loss": 0.76110039, "memory(GiB)": 147.13, "step": 63130, "train_speed(iter/s)": 0.200779 }, { "acc": 0.7758707, "epoch": 1.473104011432071, "grad_norm": 4.84375, "learning_rate": 1.7121155299422936e-06, "loss": 0.81253977, "memory(GiB)": 147.13, "step": 63140, "train_speed(iter/s)": 0.200795 }, { "acc": 0.75602198, "epoch": 1.47333731900436, "grad_norm": 5.1875, "learning_rate": 1.710692531675671e-06, "loss": 0.90366888, "memory(GiB)": 147.13, "step": 63150, "train_speed(iter/s)": 0.200813 }, { "acc": 0.77753906, "epoch": 1.4735706265766488, "grad_norm": 5.40625, "learning_rate": 1.709270002956732e-06, "loss": 0.77708693, "memory(GiB)": 147.13, "step": 63160, "train_speed(iter/s)": 0.20083 }, { "acc": 0.77087703, "epoch": 1.4738039341489377, "grad_norm": 5.6875, "learning_rate": 1.7078479439885458e-06, "loss": 0.82326393, "memory(GiB)": 147.13, "step": 63170, "train_speed(iter/s)": 0.200845 }, { "acc": 0.78759995, "epoch": 1.4740372417212266, "grad_norm": 7.46875, "learning_rate": 1.7064263549741095e-06, "loss": 0.76839981, "memory(GiB)": 147.13, "step": 63180, "train_speed(iter/s)": 0.200862 }, { "acc": 0.79642954, "epoch": 1.4742705492935155, "grad_norm": 5.34375, "learning_rate": 1.7050052361163522e-06, "loss": 0.7275032, "memory(GiB)": 147.13, "step": 63190, "train_speed(iter/s)": 0.20088 }, { "acc": 0.79400454, "epoch": 1.4745038568658044, "grad_norm": 5.375, "learning_rate": 1.7035845876181422e-06, "loss": 0.74391875, "memory(GiB)": 147.13, "step": 63200, "train_speed(iter/s)": 0.200895 }, { "acc": 0.7892807, "epoch": 1.4747371644380933, "grad_norm": 12.75, "learning_rate": 1.7021644096822748e-06, "loss": 0.76604948, "memory(GiB)": 147.13, "step": 63210, "train_speed(iter/s)": 0.200911 }, { "acc": 0.76153727, "epoch": 1.4749704720103822, "grad_norm": 4.125, "learning_rate": 1.7007447025114798e-06, "loss": 0.86720562, "memory(GiB)": 147.13, "step": 63220, "train_speed(iter/s)": 0.200928 }, { "acc": 0.77364707, "epoch": 1.4752037795826711, "grad_norm": 5.8125, "learning_rate": 1.699325466308418e-06, "loss": 0.81828022, "memory(GiB)": 147.13, "step": 63230, "train_speed(iter/s)": 0.200946 }, { "acc": 0.77053213, "epoch": 1.47543708715496, "grad_norm": 4.21875, "learning_rate": 1.6979067012756888e-06, "loss": 0.83672915, "memory(GiB)": 147.13, "step": 63240, "train_speed(iter/s)": 0.200962 }, { "acc": 0.78983154, "epoch": 1.475670394727249, "grad_norm": 5.1875, "learning_rate": 1.6964884076158194e-06, "loss": 0.76191545, "memory(GiB)": 147.13, "step": 63250, "train_speed(iter/s)": 0.200978 }, { "acc": 0.75502505, "epoch": 1.4759037022995378, "grad_norm": 5.3125, "learning_rate": 1.6950705855312677e-06, "loss": 0.89188004, "memory(GiB)": 147.13, "step": 63260, "train_speed(iter/s)": 0.200994 }, { "acc": 0.7857728, "epoch": 1.4761370098718267, "grad_norm": 5.21875, "learning_rate": 1.6936532352244316e-06, "loss": 0.75499687, "memory(GiB)": 147.13, "step": 63270, "train_speed(iter/s)": 0.201011 }, { "acc": 0.78429489, "epoch": 1.4763703174441156, "grad_norm": 5.625, "learning_rate": 1.6922363568976347e-06, "loss": 0.78085718, "memory(GiB)": 147.13, "step": 63280, "train_speed(iter/s)": 0.201027 }, { "acc": 0.78143988, "epoch": 1.4766036250164045, "grad_norm": 5.75, "learning_rate": 1.690819950753138e-06, "loss": 0.77364426, "memory(GiB)": 147.13, "step": 63290, "train_speed(iter/s)": 0.201044 }, { "acc": 0.7872983, "epoch": 1.4768369325886934, "grad_norm": 4.6875, "learning_rate": 1.6894040169931303e-06, "loss": 0.75831628, "memory(GiB)": 147.13, "step": 63300, "train_speed(iter/s)": 0.201059 }, { "acc": 0.7711751, "epoch": 1.477070240160982, "grad_norm": 4.875, "learning_rate": 1.6879885558197395e-06, "loss": 0.80848656, "memory(GiB)": 147.13, "step": 63310, "train_speed(iter/s)": 0.201076 }, { "acc": 0.78801699, "epoch": 1.4773035477332712, "grad_norm": 4.125, "learning_rate": 1.6865735674350198e-06, "loss": 0.74979692, "memory(GiB)": 147.13, "step": 63320, "train_speed(iter/s)": 0.201093 }, { "acc": 0.8035552, "epoch": 1.47753685530556, "grad_norm": 4.4375, "learning_rate": 1.6851590520409611e-06, "loss": 0.69476547, "memory(GiB)": 147.13, "step": 63330, "train_speed(iter/s)": 0.201108 }, { "acc": 0.78817482, "epoch": 1.477770162877849, "grad_norm": 6.46875, "learning_rate": 1.6837450098394848e-06, "loss": 0.76696267, "memory(GiB)": 147.13, "step": 63340, "train_speed(iter/s)": 0.201124 }, { "acc": 0.79101725, "epoch": 1.4780034704501377, "grad_norm": 5.46875, "learning_rate": 1.6823314410324426e-06, "loss": 0.74547434, "memory(GiB)": 147.13, "step": 63350, "train_speed(iter/s)": 0.201141 }, { "acc": 0.78399458, "epoch": 1.4782367780224268, "grad_norm": 9.375, "learning_rate": 1.680918345821626e-06, "loss": 0.78179579, "memory(GiB)": 147.13, "step": 63360, "train_speed(iter/s)": 0.201158 }, { "acc": 0.80156345, "epoch": 1.4784700855947155, "grad_norm": 5.4375, "learning_rate": 1.6795057244087493e-06, "loss": 0.70179181, "memory(GiB)": 147.13, "step": 63370, "train_speed(iter/s)": 0.201173 }, { "acc": 0.79969625, "epoch": 1.4787033931670046, "grad_norm": 7.09375, "learning_rate": 1.678093576995467e-06, "loss": 0.71199322, "memory(GiB)": 147.13, "step": 63380, "train_speed(iter/s)": 0.201189 }, { "acc": 0.77803707, "epoch": 1.4789367007392933, "grad_norm": 5.8125, "learning_rate": 1.676681903783362e-06, "loss": 0.77450371, "memory(GiB)": 147.13, "step": 63390, "train_speed(iter/s)": 0.201204 }, { "acc": 0.79645801, "epoch": 1.4791700083115822, "grad_norm": 5.0, "learning_rate": 1.6752707049739487e-06, "loss": 0.72130475, "memory(GiB)": 147.13, "step": 63400, "train_speed(iter/s)": 0.201221 }, { "acc": 0.78377075, "epoch": 1.479403315883871, "grad_norm": 5.9375, "learning_rate": 1.6738599807686774e-06, "loss": 0.78707418, "memory(GiB)": 147.13, "step": 63410, "train_speed(iter/s)": 0.201237 }, { "acc": 0.7851202, "epoch": 1.47963662345616, "grad_norm": 5.53125, "learning_rate": 1.6724497313689258e-06, "loss": 0.76640143, "memory(GiB)": 147.13, "step": 63420, "train_speed(iter/s)": 0.201253 }, { "acc": 0.77378364, "epoch": 1.4798699310284489, "grad_norm": 4.5, "learning_rate": 1.6710399569760105e-06, "loss": 0.81367893, "memory(GiB)": 147.13, "step": 63430, "train_speed(iter/s)": 0.201268 }, { "acc": 0.77930741, "epoch": 1.4801032386007378, "grad_norm": 5.65625, "learning_rate": 1.669630657791174e-06, "loss": 0.77642231, "memory(GiB)": 147.13, "step": 63440, "train_speed(iter/s)": 0.201284 }, { "acc": 0.773598, "epoch": 1.4803365461730267, "grad_norm": 5.5, "learning_rate": 1.6682218340155936e-06, "loss": 0.80465155, "memory(GiB)": 147.13, "step": 63450, "train_speed(iter/s)": 0.201302 }, { "acc": 0.76938334, "epoch": 1.4805698537453156, "grad_norm": 6.1875, "learning_rate": 1.666813485850377e-06, "loss": 0.82184029, "memory(GiB)": 147.13, "step": 63460, "train_speed(iter/s)": 0.201318 }, { "acc": 0.77726431, "epoch": 1.4808031613176045, "grad_norm": 6.25, "learning_rate": 1.665405613496569e-06, "loss": 0.81066351, "memory(GiB)": 147.13, "step": 63470, "train_speed(iter/s)": 0.201334 }, { "acc": 0.75288639, "epoch": 1.4810364688898934, "grad_norm": 6.03125, "learning_rate": 1.6639982171551405e-06, "loss": 0.89472895, "memory(GiB)": 147.13, "step": 63480, "train_speed(iter/s)": 0.20135 }, { "acc": 0.77361879, "epoch": 1.4812697764621823, "grad_norm": 7.03125, "learning_rate": 1.6625912970269958e-06, "loss": 0.80337019, "memory(GiB)": 147.13, "step": 63490, "train_speed(iter/s)": 0.201367 }, { "acc": 0.77668934, "epoch": 1.4815030840344712, "grad_norm": 4.65625, "learning_rate": 1.6611848533129754e-06, "loss": 0.79210997, "memory(GiB)": 147.13, "step": 63500, "train_speed(iter/s)": 0.201385 }, { "epoch": 1.4815030840344712, "eval_acc": 0.7447049510671291, "eval_loss": 0.8045029044151306, "eval_runtime": 1268.8732, "eval_samples_per_second": 28.365, "eval_steps_per_second": 14.183, "step": 63500 }, { "acc": 0.77970753, "epoch": 1.48173639160676, "grad_norm": 3.75, "learning_rate": 1.6597788862138458e-06, "loss": 0.78724799, "memory(GiB)": 147.13, "step": 63510, "train_speed(iter/s)": 0.200581 }, { "acc": 0.78490286, "epoch": 1.481969699179049, "grad_norm": 4.5625, "learning_rate": 1.6583733959303116e-06, "loss": 0.7604619, "memory(GiB)": 147.13, "step": 63520, "train_speed(iter/s)": 0.200596 }, { "acc": 0.77267179, "epoch": 1.4822030067513379, "grad_norm": 4.78125, "learning_rate": 1.6569683826630045e-06, "loss": 0.80196037, "memory(GiB)": 147.13, "step": 63530, "train_speed(iter/s)": 0.200613 }, { "acc": 0.78124847, "epoch": 1.4824363143236268, "grad_norm": 4.84375, "learning_rate": 1.6555638466124878e-06, "loss": 0.80149517, "memory(GiB)": 147.13, "step": 63540, "train_speed(iter/s)": 0.200628 }, { "acc": 0.77615232, "epoch": 1.4826696218959157, "grad_norm": 5.9375, "learning_rate": 1.654159787979262e-06, "loss": 0.81325369, "memory(GiB)": 147.13, "step": 63550, "train_speed(iter/s)": 0.200645 }, { "acc": 0.78834553, "epoch": 1.4829029294682046, "grad_norm": 6.03125, "learning_rate": 1.6527562069637543e-06, "loss": 0.77705812, "memory(GiB)": 147.13, "step": 63560, "train_speed(iter/s)": 0.200662 }, { "acc": 0.76742916, "epoch": 1.4831362370404935, "grad_norm": 7.125, "learning_rate": 1.6513531037663262e-06, "loss": 0.83876429, "memory(GiB)": 147.13, "step": 63570, "train_speed(iter/s)": 0.200678 }, { "acc": 0.79001141, "epoch": 1.4833695446127824, "grad_norm": 5.1875, "learning_rate": 1.6499504785872679e-06, "loss": 0.74909034, "memory(GiB)": 147.13, "step": 63580, "train_speed(iter/s)": 0.200693 }, { "acc": 0.79263821, "epoch": 1.4836028521850713, "grad_norm": 6.125, "learning_rate": 1.648548331626807e-06, "loss": 0.76054592, "memory(GiB)": 147.13, "step": 63590, "train_speed(iter/s)": 0.200708 }, { "acc": 0.78314829, "epoch": 1.4838361597573602, "grad_norm": 6.28125, "learning_rate": 1.6471466630850985e-06, "loss": 0.78806133, "memory(GiB)": 147.13, "step": 63600, "train_speed(iter/s)": 0.200724 }, { "acc": 0.7892365, "epoch": 1.484069467329649, "grad_norm": 4.6875, "learning_rate": 1.645745473162228e-06, "loss": 0.74208422, "memory(GiB)": 147.13, "step": 63610, "train_speed(iter/s)": 0.200741 }, { "acc": 0.80071468, "epoch": 1.484302774901938, "grad_norm": 4.84375, "learning_rate": 1.644344762058218e-06, "loss": 0.71267734, "memory(GiB)": 147.13, "step": 63620, "train_speed(iter/s)": 0.200757 }, { "acc": 0.77488637, "epoch": 1.4845360824742269, "grad_norm": 10.1875, "learning_rate": 1.6429445299730173e-06, "loss": 0.82684231, "memory(GiB)": 147.13, "step": 63630, "train_speed(iter/s)": 0.200773 }, { "acc": 0.75984926, "epoch": 1.4847693900465158, "grad_norm": 5.3125, "learning_rate": 1.6415447771065112e-06, "loss": 0.88017397, "memory(GiB)": 147.13, "step": 63640, "train_speed(iter/s)": 0.200789 }, { "acc": 0.76972971, "epoch": 1.4850026976188047, "grad_norm": 5.75, "learning_rate": 1.6401455036585111e-06, "loss": 0.82038412, "memory(GiB)": 147.13, "step": 63650, "train_speed(iter/s)": 0.200804 }, { "acc": 0.77063775, "epoch": 1.4852360051910936, "grad_norm": 6.125, "learning_rate": 1.6387467098287656e-06, "loss": 0.82443304, "memory(GiB)": 147.13, "step": 63660, "train_speed(iter/s)": 0.200821 }, { "acc": 0.79164023, "epoch": 1.4854693127633825, "grad_norm": 4.4375, "learning_rate": 1.637348395816951e-06, "loss": 0.76277428, "memory(GiB)": 147.13, "step": 63670, "train_speed(iter/s)": 0.200838 }, { "acc": 0.80538321, "epoch": 1.4857026203356711, "grad_norm": 4.9375, "learning_rate": 1.635950561822676e-06, "loss": 0.68404865, "memory(GiB)": 147.13, "step": 63680, "train_speed(iter/s)": 0.200853 }, { "acc": 0.77760887, "epoch": 1.4859359279079603, "grad_norm": 6.625, "learning_rate": 1.6345532080454813e-06, "loss": 0.80281401, "memory(GiB)": 147.13, "step": 63690, "train_speed(iter/s)": 0.200871 }, { "acc": 0.7884223, "epoch": 1.486169235480249, "grad_norm": 7.5625, "learning_rate": 1.6331563346848366e-06, "loss": 0.77055836, "memory(GiB)": 147.13, "step": 63700, "train_speed(iter/s)": 0.200887 }, { "acc": 0.78238873, "epoch": 1.486402543052538, "grad_norm": 4.75, "learning_rate": 1.6317599419401486e-06, "loss": 0.78170815, "memory(GiB)": 147.13, "step": 63710, "train_speed(iter/s)": 0.200904 }, { "acc": 0.77347994, "epoch": 1.4866358506248267, "grad_norm": 8.0, "learning_rate": 1.6303640300107493e-06, "loss": 0.79208412, "memory(GiB)": 147.13, "step": 63720, "train_speed(iter/s)": 0.200919 }, { "acc": 0.7714962, "epoch": 1.4868691581971158, "grad_norm": 5.5, "learning_rate": 1.628968599095907e-06, "loss": 0.81774483, "memory(GiB)": 147.13, "step": 63730, "train_speed(iter/s)": 0.200936 }, { "acc": 0.78385878, "epoch": 1.4871024657694045, "grad_norm": 5.875, "learning_rate": 1.6275736493948174e-06, "loss": 0.77382607, "memory(GiB)": 147.13, "step": 63740, "train_speed(iter/s)": 0.200951 }, { "acc": 0.77496548, "epoch": 1.4873357733416936, "grad_norm": 4.40625, "learning_rate": 1.626179181106609e-06, "loss": 0.82866879, "memory(GiB)": 147.13, "step": 63750, "train_speed(iter/s)": 0.200966 }, { "acc": 0.78694777, "epoch": 1.4875690809139823, "grad_norm": 7.8125, "learning_rate": 1.6247851944303433e-06, "loss": 0.7435267, "memory(GiB)": 147.13, "step": 63760, "train_speed(iter/s)": 0.200983 }, { "acc": 0.7818573, "epoch": 1.4878023884862714, "grad_norm": 4.875, "learning_rate": 1.6233916895650093e-06, "loss": 0.77421417, "memory(GiB)": 147.13, "step": 63770, "train_speed(iter/s)": 0.200999 }, { "acc": 0.77949824, "epoch": 1.4880356960585601, "grad_norm": 5.84375, "learning_rate": 1.6219986667095323e-06, "loss": 0.81707954, "memory(GiB)": 147.13, "step": 63780, "train_speed(iter/s)": 0.201017 }, { "acc": 0.77718635, "epoch": 1.488269003630849, "grad_norm": 5.8125, "learning_rate": 1.6206061260627643e-06, "loss": 0.80593338, "memory(GiB)": 147.13, "step": 63790, "train_speed(iter/s)": 0.201034 }, { "acc": 0.79007106, "epoch": 1.488502311203138, "grad_norm": 6.40625, "learning_rate": 1.6192140678234903e-06, "loss": 0.75823188, "memory(GiB)": 147.13, "step": 63800, "train_speed(iter/s)": 0.201051 }, { "acc": 0.76932173, "epoch": 1.4887356187754268, "grad_norm": 6.09375, "learning_rate": 1.617822492190424e-06, "loss": 0.82226, "memory(GiB)": 147.13, "step": 63810, "train_speed(iter/s)": 0.201068 }, { "acc": 0.77304816, "epoch": 1.4889689263477157, "grad_norm": 4.78125, "learning_rate": 1.616431399362216e-06, "loss": 0.80309229, "memory(GiB)": 147.13, "step": 63820, "train_speed(iter/s)": 0.201084 }, { "acc": 0.79105062, "epoch": 1.4892022339200046, "grad_norm": 4.8125, "learning_rate": 1.615040789537443e-06, "loss": 0.73891435, "memory(GiB)": 147.13, "step": 63830, "train_speed(iter/s)": 0.201101 }, { "acc": 0.78510084, "epoch": 1.4894355414922935, "grad_norm": 6.0625, "learning_rate": 1.6136506629146125e-06, "loss": 0.76382818, "memory(GiB)": 147.13, "step": 63840, "train_speed(iter/s)": 0.201118 }, { "acc": 0.78627157, "epoch": 1.4896688490645824, "grad_norm": 4.84375, "learning_rate": 1.6122610196921673e-06, "loss": 0.75550165, "memory(GiB)": 147.13, "step": 63850, "train_speed(iter/s)": 0.201135 }, { "acc": 0.78514376, "epoch": 1.4899021566368713, "grad_norm": 6.1875, "learning_rate": 1.6108718600684764e-06, "loss": 0.78006501, "memory(GiB)": 147.13, "step": 63860, "train_speed(iter/s)": 0.201151 }, { "acc": 0.76863089, "epoch": 1.4901354642091602, "grad_norm": 4.71875, "learning_rate": 1.609483184241844e-06, "loss": 0.81679535, "memory(GiB)": 147.13, "step": 63870, "train_speed(iter/s)": 0.201168 }, { "acc": 0.79790592, "epoch": 1.490368771781449, "grad_norm": 5.5, "learning_rate": 1.6080949924105022e-06, "loss": 0.71203775, "memory(GiB)": 147.13, "step": 63880, "train_speed(iter/s)": 0.201184 }, { "acc": 0.77048893, "epoch": 1.490602079353738, "grad_norm": 6.25, "learning_rate": 1.6067072847726134e-06, "loss": 0.81707821, "memory(GiB)": 147.13, "step": 63890, "train_speed(iter/s)": 0.2012 }, { "acc": 0.75077691, "epoch": 1.490835386926027, "grad_norm": 4.84375, "learning_rate": 1.605320061526277e-06, "loss": 0.89881153, "memory(GiB)": 147.13, "step": 63900, "train_speed(iter/s)": 0.201217 }, { "acc": 0.81723709, "epoch": 1.4910686944983158, "grad_norm": 4.75, "learning_rate": 1.6039333228695132e-06, "loss": 0.65049648, "memory(GiB)": 147.13, "step": 63910, "train_speed(iter/s)": 0.201232 }, { "acc": 0.78162031, "epoch": 1.4913020020706047, "grad_norm": 6.34375, "learning_rate": 1.6025470690002815e-06, "loss": 0.77352018, "memory(GiB)": 147.13, "step": 63920, "train_speed(iter/s)": 0.201249 }, { "acc": 0.76750431, "epoch": 1.4915353096428936, "grad_norm": 4.40625, "learning_rate": 1.6011613001164677e-06, "loss": 0.84007797, "memory(GiB)": 147.13, "step": 63930, "train_speed(iter/s)": 0.201265 }, { "acc": 0.78385334, "epoch": 1.4917686172151825, "grad_norm": 6.71875, "learning_rate": 1.5997760164158927e-06, "loss": 0.77498088, "memory(GiB)": 147.13, "step": 63940, "train_speed(iter/s)": 0.20128 }, { "acc": 0.78162289, "epoch": 1.4920019247874714, "grad_norm": 4.8125, "learning_rate": 1.5983912180963012e-06, "loss": 0.76992779, "memory(GiB)": 147.13, "step": 63950, "train_speed(iter/s)": 0.201296 }, { "acc": 0.78982821, "epoch": 1.4922352323597603, "grad_norm": 5.03125, "learning_rate": 1.5970069053553776e-06, "loss": 0.74374228, "memory(GiB)": 147.13, "step": 63960, "train_speed(iter/s)": 0.201313 }, { "acc": 0.79276485, "epoch": 1.4924685399320492, "grad_norm": 5.46875, "learning_rate": 1.5956230783907294e-06, "loss": 0.72949905, "memory(GiB)": 147.13, "step": 63970, "train_speed(iter/s)": 0.20133 }, { "acc": 0.77740602, "epoch": 1.492701847504338, "grad_norm": 5.75, "learning_rate": 1.5942397373998959e-06, "loss": 0.79231834, "memory(GiB)": 147.13, "step": 63980, "train_speed(iter/s)": 0.201346 }, { "acc": 0.78017025, "epoch": 1.492935155076627, "grad_norm": 5.4375, "learning_rate": 1.5928568825803526e-06, "loss": 0.78005562, "memory(GiB)": 147.13, "step": 63990, "train_speed(iter/s)": 0.201364 }, { "acc": 0.78059478, "epoch": 1.493168462648916, "grad_norm": 7.40625, "learning_rate": 1.5914745141294974e-06, "loss": 0.76288352, "memory(GiB)": 147.13, "step": 64000, "train_speed(iter/s)": 0.20138 }, { "epoch": 1.493168462648916, "eval_acc": 0.7446312847202705, "eval_loss": 0.8045158982276917, "eval_runtime": 1269.109, "eval_samples_per_second": 28.359, "eval_steps_per_second": 14.18, "step": 64000 }, { "acc": 0.80024071, "epoch": 1.4934017702212048, "grad_norm": 6.28125, "learning_rate": 1.5900926322446686e-06, "loss": 0.74026098, "memory(GiB)": 147.13, "step": 64010, "train_speed(iter/s)": 0.200581 }, { "acc": 0.78730555, "epoch": 1.4936350777934937, "grad_norm": 5.5, "learning_rate": 1.5887112371231227e-06, "loss": 0.75264711, "memory(GiB)": 147.13, "step": 64020, "train_speed(iter/s)": 0.200598 }, { "acc": 0.76471243, "epoch": 1.4938683853657826, "grad_norm": 5.875, "learning_rate": 1.5873303289620585e-06, "loss": 0.85071449, "memory(GiB)": 147.13, "step": 64030, "train_speed(iter/s)": 0.200613 }, { "acc": 0.77108183, "epoch": 1.4941016929380715, "grad_norm": 8.1875, "learning_rate": 1.5859499079585982e-06, "loss": 0.81153488, "memory(GiB)": 147.13, "step": 64040, "train_speed(iter/s)": 0.20063 }, { "acc": 0.78847384, "epoch": 1.4943350005103604, "grad_norm": 5.84375, "learning_rate": 1.5845699743097953e-06, "loss": 0.75993419, "memory(GiB)": 147.13, "step": 64050, "train_speed(iter/s)": 0.200644 }, { "acc": 0.78438349, "epoch": 1.4945683080826493, "grad_norm": 5.28125, "learning_rate": 1.583190528212638e-06, "loss": 0.78886342, "memory(GiB)": 147.13, "step": 64060, "train_speed(iter/s)": 0.200661 }, { "acc": 0.78701124, "epoch": 1.494801615654938, "grad_norm": 5.8125, "learning_rate": 1.5818115698640386e-06, "loss": 0.77774706, "memory(GiB)": 147.13, "step": 64070, "train_speed(iter/s)": 0.200679 }, { "acc": 0.76535654, "epoch": 1.495034923227227, "grad_norm": 6.75, "learning_rate": 1.5804330994608463e-06, "loss": 0.84333496, "memory(GiB)": 147.13, "step": 64080, "train_speed(iter/s)": 0.200696 }, { "acc": 0.79128575, "epoch": 1.4952682307995158, "grad_norm": 5.46875, "learning_rate": 1.5790551171998337e-06, "loss": 0.75358858, "memory(GiB)": 147.13, "step": 64090, "train_speed(iter/s)": 0.200712 }, { "acc": 0.77005005, "epoch": 1.4955015383718049, "grad_norm": 7.6875, "learning_rate": 1.5776776232777114e-06, "loss": 0.84556427, "memory(GiB)": 147.13, "step": 64100, "train_speed(iter/s)": 0.200728 }, { "acc": 0.77952299, "epoch": 1.4957348459440936, "grad_norm": 9.6875, "learning_rate": 1.5763006178911139e-06, "loss": 0.80671349, "memory(GiB)": 147.13, "step": 64110, "train_speed(iter/s)": 0.200744 }, { "acc": 0.77896752, "epoch": 1.4959681535163827, "grad_norm": 5.4375, "learning_rate": 1.5749241012366068e-06, "loss": 0.78669481, "memory(GiB)": 147.13, "step": 64120, "train_speed(iter/s)": 0.20076 }, { "acc": 0.7698513, "epoch": 1.4962014610886714, "grad_norm": 5.8125, "learning_rate": 1.5735480735106927e-06, "loss": 0.82209358, "memory(GiB)": 147.13, "step": 64130, "train_speed(iter/s)": 0.200776 }, { "acc": 0.78666344, "epoch": 1.4964347686609605, "grad_norm": 5.03125, "learning_rate": 1.5721725349097926e-06, "loss": 0.74558525, "memory(GiB)": 147.13, "step": 64140, "train_speed(iter/s)": 0.200792 }, { "acc": 0.79377465, "epoch": 1.4966680762332492, "grad_norm": 4.40625, "learning_rate": 1.570797485630269e-06, "loss": 0.75561595, "memory(GiB)": 147.13, "step": 64150, "train_speed(iter/s)": 0.20081 }, { "acc": 0.77909813, "epoch": 1.4969013838055383, "grad_norm": 6.9375, "learning_rate": 1.5694229258684063e-06, "loss": 0.77388506, "memory(GiB)": 147.13, "step": 64160, "train_speed(iter/s)": 0.200827 }, { "acc": 0.77309704, "epoch": 1.497134691377827, "grad_norm": 4.5625, "learning_rate": 1.5680488558204259e-06, "loss": 0.82009621, "memory(GiB)": 147.13, "step": 64170, "train_speed(iter/s)": 0.200841 }, { "acc": 0.77360954, "epoch": 1.4973679989501159, "grad_norm": 5.03125, "learning_rate": 1.566675275682475e-06, "loss": 0.80817747, "memory(GiB)": 147.13, "step": 64180, "train_speed(iter/s)": 0.200858 }, { "acc": 0.78091269, "epoch": 1.4976013065224048, "grad_norm": 4.25, "learning_rate": 1.565302185650629e-06, "loss": 0.78140159, "memory(GiB)": 147.13, "step": 64190, "train_speed(iter/s)": 0.200874 }, { "acc": 0.79408712, "epoch": 1.4978346140946937, "grad_norm": 5.4375, "learning_rate": 1.5639295859208998e-06, "loss": 0.74546127, "memory(GiB)": 147.13, "step": 64200, "train_speed(iter/s)": 0.20089 }, { "acc": 0.79185085, "epoch": 1.4980679216669825, "grad_norm": 6.96875, "learning_rate": 1.562557476689222e-06, "loss": 0.7497364, "memory(GiB)": 147.13, "step": 64210, "train_speed(iter/s)": 0.200906 }, { "acc": 0.79052382, "epoch": 1.4983012292392714, "grad_norm": 4.75, "learning_rate": 1.5611858581514683e-06, "loss": 0.75888486, "memory(GiB)": 147.13, "step": 64220, "train_speed(iter/s)": 0.200923 }, { "acc": 0.78845158, "epoch": 1.4985345368115603, "grad_norm": 4.125, "learning_rate": 1.559814730503434e-06, "loss": 0.7904933, "memory(GiB)": 147.13, "step": 64230, "train_speed(iter/s)": 0.200939 }, { "acc": 0.77261376, "epoch": 1.4987678443838492, "grad_norm": 5.4375, "learning_rate": 1.5584440939408473e-06, "loss": 0.82468948, "memory(GiB)": 147.13, "step": 64240, "train_speed(iter/s)": 0.200956 }, { "acc": 0.76977367, "epoch": 1.4990011519561381, "grad_norm": 5.84375, "learning_rate": 1.557073948659365e-06, "loss": 0.83275337, "memory(GiB)": 147.13, "step": 64250, "train_speed(iter/s)": 0.200972 }, { "acc": 0.79217887, "epoch": 1.499234459528427, "grad_norm": 5.9375, "learning_rate": 1.555704294854578e-06, "loss": 0.74504166, "memory(GiB)": 147.13, "step": 64260, "train_speed(iter/s)": 0.200988 }, { "acc": 0.7652029, "epoch": 1.499467767100716, "grad_norm": 5.9375, "learning_rate": 1.5543351327220025e-06, "loss": 0.84907303, "memory(GiB)": 147.13, "step": 64270, "train_speed(iter/s)": 0.201003 }, { "acc": 0.79375896, "epoch": 1.4997010746730048, "grad_norm": 5.5, "learning_rate": 1.5529664624570839e-06, "loss": 0.72161312, "memory(GiB)": 147.13, "step": 64280, "train_speed(iter/s)": 0.20102 }, { "acc": 0.78259649, "epoch": 1.4999343822452937, "grad_norm": 6.53125, "learning_rate": 1.551598284255203e-06, "loss": 0.79077225, "memory(GiB)": 147.13, "step": 64290, "train_speed(iter/s)": 0.201036 }, { "acc": 0.78028545, "epoch": 1.5001676898175826, "grad_norm": 5.375, "learning_rate": 1.550230598311664e-06, "loss": 0.81074438, "memory(GiB)": 147.13, "step": 64300, "train_speed(iter/s)": 0.201053 }, { "acc": 0.76743765, "epoch": 1.5004009973898715, "grad_norm": 4.21875, "learning_rate": 1.548863404821706e-06, "loss": 0.83184929, "memory(GiB)": 147.13, "step": 64310, "train_speed(iter/s)": 0.201069 }, { "acc": 0.80110559, "epoch": 1.5006343049621604, "grad_norm": 11.0, "learning_rate": 1.547496703980495e-06, "loss": 0.71932917, "memory(GiB)": 147.13, "step": 64320, "train_speed(iter/s)": 0.201085 }, { "acc": 0.77408819, "epoch": 1.5008676125344493, "grad_norm": 5.0, "learning_rate": 1.5461304959831248e-06, "loss": 0.81594849, "memory(GiB)": 147.13, "step": 64330, "train_speed(iter/s)": 0.201101 }, { "acc": 0.80996132, "epoch": 1.5011009201067382, "grad_norm": 5.53125, "learning_rate": 1.5447647810246241e-06, "loss": 0.68073368, "memory(GiB)": 147.13, "step": 64340, "train_speed(iter/s)": 0.201116 }, { "acc": 0.77236247, "epoch": 1.5013342276790271, "grad_norm": 7.9375, "learning_rate": 1.5433995592999457e-06, "loss": 0.79192848, "memory(GiB)": 147.13, "step": 64350, "train_speed(iter/s)": 0.201132 }, { "acc": 0.78151517, "epoch": 1.501567535251316, "grad_norm": 4.4375, "learning_rate": 1.5420348310039796e-06, "loss": 0.79086199, "memory(GiB)": 147.13, "step": 64360, "train_speed(iter/s)": 0.201149 }, { "acc": 0.77429748, "epoch": 1.501800842823605, "grad_norm": 5.78125, "learning_rate": 1.5406705963315333e-06, "loss": 0.80469446, "memory(GiB)": 147.13, "step": 64370, "train_speed(iter/s)": 0.201165 }, { "acc": 0.78121309, "epoch": 1.5020341503958938, "grad_norm": 5.125, "learning_rate": 1.539306855477356e-06, "loss": 0.7872479, "memory(GiB)": 147.13, "step": 64380, "train_speed(iter/s)": 0.201181 }, { "acc": 0.78482914, "epoch": 1.5022674579681827, "grad_norm": 4.875, "learning_rate": 1.5379436086361187e-06, "loss": 0.77998705, "memory(GiB)": 147.13, "step": 64390, "train_speed(iter/s)": 0.201197 }, { "acc": 0.76260514, "epoch": 1.5025007655404716, "grad_norm": 4.03125, "learning_rate": 1.5365808560024264e-06, "loss": 0.84974766, "memory(GiB)": 147.13, "step": 64400, "train_speed(iter/s)": 0.201214 }, { "acc": 0.79643979, "epoch": 1.5027340731127605, "grad_norm": 8.0625, "learning_rate": 1.5352185977708112e-06, "loss": 0.72695284, "memory(GiB)": 147.13, "step": 64410, "train_speed(iter/s)": 0.201229 }, { "acc": 0.80471954, "epoch": 1.5029673806850492, "grad_norm": 4.28125, "learning_rate": 1.533856834135733e-06, "loss": 0.71714687, "memory(GiB)": 147.13, "step": 64420, "train_speed(iter/s)": 0.201244 }, { "acc": 0.77157187, "epoch": 1.5032006882573383, "grad_norm": 6.53125, "learning_rate": 1.532495565291587e-06, "loss": 0.81608639, "memory(GiB)": 147.13, "step": 64430, "train_speed(iter/s)": 0.20126 }, { "acc": 0.79674616, "epoch": 1.503433995829627, "grad_norm": 6.09375, "learning_rate": 1.5311347914326891e-06, "loss": 0.72117443, "memory(GiB)": 147.13, "step": 64440, "train_speed(iter/s)": 0.201276 }, { "acc": 0.77385588, "epoch": 1.5036673034019161, "grad_norm": 3.9375, "learning_rate": 1.5297745127532942e-06, "loss": 0.83259687, "memory(GiB)": 147.13, "step": 64450, "train_speed(iter/s)": 0.201292 }, { "acc": 0.76449928, "epoch": 1.5039006109742048, "grad_norm": 4.71875, "learning_rate": 1.5284147294475792e-06, "loss": 0.84181862, "memory(GiB)": 147.13, "step": 64460, "train_speed(iter/s)": 0.201307 }, { "acc": 0.79227133, "epoch": 1.504133918546494, "grad_norm": 5.03125, "learning_rate": 1.5270554417096533e-06, "loss": 0.75533457, "memory(GiB)": 147.13, "step": 64470, "train_speed(iter/s)": 0.201322 }, { "acc": 0.77802868, "epoch": 1.5043672261187826, "grad_norm": 5.6875, "learning_rate": 1.5256966497335541e-06, "loss": 0.78939829, "memory(GiB)": 147.13, "step": 64480, "train_speed(iter/s)": 0.201339 }, { "acc": 0.77074051, "epoch": 1.5046005336910717, "grad_norm": 5.875, "learning_rate": 1.5243383537132473e-06, "loss": 0.83445225, "memory(GiB)": 147.13, "step": 64490, "train_speed(iter/s)": 0.201355 }, { "acc": 0.78632889, "epoch": 1.5048338412633604, "grad_norm": 4.84375, "learning_rate": 1.5229805538426323e-06, "loss": 0.76857004, "memory(GiB)": 147.13, "step": 64500, "train_speed(iter/s)": 0.201371 }, { "epoch": 1.5048338412633604, "eval_acc": 0.7446565875089741, "eval_loss": 0.8044771552085876, "eval_runtime": 1269.4786, "eval_samples_per_second": 28.351, "eval_steps_per_second": 14.176, "step": 64500 }, { "acc": 0.79818277, "epoch": 1.5050671488356495, "grad_norm": 6.90625, "learning_rate": 1.5216232503155314e-06, "loss": 0.71413069, "memory(GiB)": 147.13, "step": 64510, "train_speed(iter/s)": 0.200578 }, { "acc": 0.79765759, "epoch": 1.5053004564079382, "grad_norm": 4.875, "learning_rate": 1.520266443325703e-06, "loss": 0.70702105, "memory(GiB)": 147.13, "step": 64520, "train_speed(iter/s)": 0.200594 }, { "acc": 0.77393379, "epoch": 1.5055337639802273, "grad_norm": 6.03125, "learning_rate": 1.5189101330668288e-06, "loss": 0.82264185, "memory(GiB)": 147.13, "step": 64530, "train_speed(iter/s)": 0.200609 }, { "acc": 0.78522334, "epoch": 1.505767071552516, "grad_norm": 5.59375, "learning_rate": 1.5175543197325205e-06, "loss": 0.76188078, "memory(GiB)": 147.13, "step": 64540, "train_speed(iter/s)": 0.200625 }, { "acc": 0.78678837, "epoch": 1.506000379124805, "grad_norm": 6.25, "learning_rate": 1.5161990035163226e-06, "loss": 0.77836008, "memory(GiB)": 147.13, "step": 64550, "train_speed(iter/s)": 0.20064 }, { "acc": 0.76740236, "epoch": 1.5062336866970938, "grad_norm": 5.71875, "learning_rate": 1.5148441846117035e-06, "loss": 0.8500349, "memory(GiB)": 147.13, "step": 64560, "train_speed(iter/s)": 0.200657 }, { "acc": 0.77085843, "epoch": 1.506466994269383, "grad_norm": 6.15625, "learning_rate": 1.5134898632120659e-06, "loss": 0.80457802, "memory(GiB)": 147.13, "step": 64570, "train_speed(iter/s)": 0.200672 }, { "acc": 0.78143206, "epoch": 1.5067003018416716, "grad_norm": 6.34375, "learning_rate": 1.5121360395107366e-06, "loss": 0.80713491, "memory(GiB)": 147.13, "step": 64580, "train_speed(iter/s)": 0.200689 }, { "acc": 0.7703723, "epoch": 1.5069336094139607, "grad_norm": 4.34375, "learning_rate": 1.5107827137009772e-06, "loss": 0.83621893, "memory(GiB)": 147.13, "step": 64590, "train_speed(iter/s)": 0.200705 }, { "acc": 0.78048077, "epoch": 1.5071669169862494, "grad_norm": 5.03125, "learning_rate": 1.509429885975968e-06, "loss": 0.77270036, "memory(GiB)": 147.13, "step": 64600, "train_speed(iter/s)": 0.200722 }, { "acc": 0.78544827, "epoch": 1.5074002245585383, "grad_norm": 5.78125, "learning_rate": 1.5080775565288314e-06, "loss": 0.76025243, "memory(GiB)": 147.13, "step": 64610, "train_speed(iter/s)": 0.200738 }, { "acc": 0.77681675, "epoch": 1.5076335321308272, "grad_norm": 5.21875, "learning_rate": 1.5067257255526085e-06, "loss": 0.79332089, "memory(GiB)": 147.13, "step": 64620, "train_speed(iter/s)": 0.200753 }, { "acc": 0.79631433, "epoch": 1.507866839703116, "grad_norm": 7.15625, "learning_rate": 1.505374393240272e-06, "loss": 0.71731257, "memory(GiB)": 147.13, "step": 64630, "train_speed(iter/s)": 0.200769 }, { "acc": 0.7795043, "epoch": 1.508100147275405, "grad_norm": 6.6875, "learning_rate": 1.5040235597847268e-06, "loss": 0.79805608, "memory(GiB)": 147.13, "step": 64640, "train_speed(iter/s)": 0.200785 }, { "acc": 0.77450428, "epoch": 1.5083334548476939, "grad_norm": 6.15625, "learning_rate": 1.5026732253788018e-06, "loss": 0.82841396, "memory(GiB)": 147.13, "step": 64650, "train_speed(iter/s)": 0.2008 }, { "acc": 0.78522072, "epoch": 1.5085667624199828, "grad_norm": 5.3125, "learning_rate": 1.501323390215259e-06, "loss": 0.77809391, "memory(GiB)": 147.13, "step": 64660, "train_speed(iter/s)": 0.200816 }, { "acc": 0.77315788, "epoch": 1.5088000699922717, "grad_norm": 6.84375, "learning_rate": 1.4999740544867864e-06, "loss": 0.82334805, "memory(GiB)": 147.13, "step": 64670, "train_speed(iter/s)": 0.200833 }, { "acc": 0.77878227, "epoch": 1.5090333775645606, "grad_norm": 5.03125, "learning_rate": 1.498625218385999e-06, "loss": 0.80296698, "memory(GiB)": 147.13, "step": 64680, "train_speed(iter/s)": 0.200848 }, { "acc": 0.76942329, "epoch": 1.5092666851368495, "grad_norm": 6.125, "learning_rate": 1.4972768821054461e-06, "loss": 0.82304773, "memory(GiB)": 147.13, "step": 64690, "train_speed(iter/s)": 0.200864 }, { "acc": 0.78032999, "epoch": 1.5094999927091384, "grad_norm": 4.96875, "learning_rate": 1.4959290458376008e-06, "loss": 0.78106308, "memory(GiB)": 147.13, "step": 64700, "train_speed(iter/s)": 0.20088 }, { "acc": 0.77347794, "epoch": 1.5097333002814273, "grad_norm": 6.5, "learning_rate": 1.494581709774866e-06, "loss": 0.82325249, "memory(GiB)": 147.13, "step": 64710, "train_speed(iter/s)": 0.200895 }, { "acc": 0.78451128, "epoch": 1.5099666078537162, "grad_norm": 5.65625, "learning_rate": 1.4932348741095726e-06, "loss": 0.76474838, "memory(GiB)": 147.13, "step": 64720, "train_speed(iter/s)": 0.200911 }, { "acc": 0.78300848, "epoch": 1.510199915426005, "grad_norm": 6.125, "learning_rate": 1.4918885390339837e-06, "loss": 0.77643466, "memory(GiB)": 147.13, "step": 64730, "train_speed(iter/s)": 0.200926 }, { "acc": 0.79654002, "epoch": 1.510433222998294, "grad_norm": 5.3125, "learning_rate": 1.4905427047402848e-06, "loss": 0.72235746, "memory(GiB)": 147.13, "step": 64740, "train_speed(iter/s)": 0.200942 }, { "acc": 0.77989998, "epoch": 1.5106665305705829, "grad_norm": 5.75, "learning_rate": 1.4891973714205971e-06, "loss": 0.78111591, "memory(GiB)": 147.13, "step": 64750, "train_speed(iter/s)": 0.200959 }, { "acc": 0.75031438, "epoch": 1.5108998381428718, "grad_norm": 6.78125, "learning_rate": 1.4878525392669652e-06, "loss": 0.91278858, "memory(GiB)": 147.13, "step": 64760, "train_speed(iter/s)": 0.200976 }, { "acc": 0.79479699, "epoch": 1.5111331457151607, "grad_norm": 4.46875, "learning_rate": 1.4865082084713605e-06, "loss": 0.73966794, "memory(GiB)": 147.13, "step": 64770, "train_speed(iter/s)": 0.200993 }, { "acc": 0.79615707, "epoch": 1.5113664532874496, "grad_norm": 5.34375, "learning_rate": 1.485164379225691e-06, "loss": 0.72713776, "memory(GiB)": 147.13, "step": 64780, "train_speed(iter/s)": 0.201008 }, { "acc": 0.79181237, "epoch": 1.5115997608597382, "grad_norm": 6.25, "learning_rate": 1.4838210517217827e-06, "loss": 0.75146961, "memory(GiB)": 147.13, "step": 64790, "train_speed(iter/s)": 0.201024 }, { "acc": 0.7745163, "epoch": 1.5118330684320274, "grad_norm": 5.5, "learning_rate": 1.4824782261513997e-06, "loss": 0.81358576, "memory(GiB)": 147.13, "step": 64800, "train_speed(iter/s)": 0.201038 }, { "acc": 0.77751856, "epoch": 1.512066376004316, "grad_norm": 4.71875, "learning_rate": 1.4811359027062282e-06, "loss": 0.79387336, "memory(GiB)": 147.13, "step": 64810, "train_speed(iter/s)": 0.201054 }, { "acc": 0.78064179, "epoch": 1.5122996835766052, "grad_norm": 7.125, "learning_rate": 1.4797940815778849e-06, "loss": 0.78270473, "memory(GiB)": 147.13, "step": 64820, "train_speed(iter/s)": 0.201069 }, { "acc": 0.77359581, "epoch": 1.5125329911488938, "grad_norm": 6.25, "learning_rate": 1.4784527629579132e-06, "loss": 0.8067338, "memory(GiB)": 147.13, "step": 64830, "train_speed(iter/s)": 0.201085 }, { "acc": 0.77926979, "epoch": 1.512766298721183, "grad_norm": 6.59375, "learning_rate": 1.4771119470377853e-06, "loss": 0.78685246, "memory(GiB)": 147.13, "step": 64840, "train_speed(iter/s)": 0.201101 }, { "acc": 0.77433491, "epoch": 1.5129996062934716, "grad_norm": 6.75, "learning_rate": 1.4757716340089046e-06, "loss": 0.81569548, "memory(GiB)": 147.13, "step": 64850, "train_speed(iter/s)": 0.201116 }, { "acc": 0.77438469, "epoch": 1.5132329138657608, "grad_norm": 7.1875, "learning_rate": 1.4744318240625981e-06, "loss": 0.79644213, "memory(GiB)": 147.13, "step": 64860, "train_speed(iter/s)": 0.201133 }, { "acc": 0.78981161, "epoch": 1.5134662214380494, "grad_norm": 4.375, "learning_rate": 1.4730925173901262e-06, "loss": 0.74776039, "memory(GiB)": 147.13, "step": 64870, "train_speed(iter/s)": 0.20115 }, { "acc": 0.76372499, "epoch": 1.5136995290103386, "grad_norm": 4.75, "learning_rate": 1.4717537141826716e-06, "loss": 0.84560843, "memory(GiB)": 147.13, "step": 64880, "train_speed(iter/s)": 0.201164 }, { "acc": 0.77633152, "epoch": 1.5139328365826272, "grad_norm": 4.125, "learning_rate": 1.4704154146313503e-06, "loss": 0.80207596, "memory(GiB)": 147.13, "step": 64890, "train_speed(iter/s)": 0.20118 }, { "acc": 0.78325543, "epoch": 1.5141661441549163, "grad_norm": 7.15625, "learning_rate": 1.4690776189272033e-06, "loss": 0.78658934, "memory(GiB)": 147.13, "step": 64900, "train_speed(iter/s)": 0.201197 }, { "acc": 0.78068142, "epoch": 1.514399451727205, "grad_norm": 5.0625, "learning_rate": 1.467740327261199e-06, "loss": 0.79103022, "memory(GiB)": 147.13, "step": 64910, "train_speed(iter/s)": 0.201213 }, { "acc": 0.79627781, "epoch": 1.5146327592994941, "grad_norm": 5.6875, "learning_rate": 1.4664035398242387e-06, "loss": 0.72616043, "memory(GiB)": 147.13, "step": 64920, "train_speed(iter/s)": 0.201228 }, { "acc": 0.77021713, "epoch": 1.5148660668717828, "grad_norm": 4.875, "learning_rate": 1.4650672568071461e-06, "loss": 0.81304054, "memory(GiB)": 147.13, "step": 64930, "train_speed(iter/s)": 0.201244 }, { "acc": 0.78109665, "epoch": 1.515099374444072, "grad_norm": 5.40625, "learning_rate": 1.4637314784006761e-06, "loss": 0.77189884, "memory(GiB)": 147.13, "step": 64940, "train_speed(iter/s)": 0.201261 }, { "acc": 0.76999512, "epoch": 1.5153326820163606, "grad_norm": 6.84375, "learning_rate": 1.4623962047955087e-06, "loss": 0.82561445, "memory(GiB)": 147.13, "step": 64950, "train_speed(iter/s)": 0.201276 }, { "acc": 0.77311716, "epoch": 1.5155659895886497, "grad_norm": 5.34375, "learning_rate": 1.4610614361822567e-06, "loss": 0.817449, "memory(GiB)": 147.13, "step": 64960, "train_speed(iter/s)": 0.201293 }, { "acc": 0.79791188, "epoch": 1.5157992971609384, "grad_norm": 5.0625, "learning_rate": 1.4597271727514568e-06, "loss": 0.7242517, "memory(GiB)": 147.13, "step": 64970, "train_speed(iter/s)": 0.201307 }, { "acc": 0.78501654, "epoch": 1.5160326047332273, "grad_norm": 5.53125, "learning_rate": 1.4583934146935725e-06, "loss": 0.75701132, "memory(GiB)": 147.13, "step": 64980, "train_speed(iter/s)": 0.201323 }, { "acc": 0.76452885, "epoch": 1.5162659123055162, "grad_norm": 5.84375, "learning_rate": 1.4570601621990016e-06, "loss": 0.86551952, "memory(GiB)": 147.13, "step": 64990, "train_speed(iter/s)": 0.201339 }, { "acc": 0.7722949, "epoch": 1.5164992198778051, "grad_norm": 5.625, "learning_rate": 1.4557274154580614e-06, "loss": 0.81062641, "memory(GiB)": 147.13, "step": 65000, "train_speed(iter/s)": 0.201353 }, { "epoch": 1.5164992198778051, "eval_acc": 0.7446660360186799, "eval_loss": 0.8044557571411133, "eval_runtime": 1269.9569, "eval_samples_per_second": 28.34, "eval_steps_per_second": 14.171, "step": 65000 }, { "acc": 0.79000292, "epoch": 1.516732527450094, "grad_norm": 5.25, "learning_rate": 1.4543951746610047e-06, "loss": 0.73190441, "memory(GiB)": 147.13, "step": 65010, "train_speed(iter/s)": 0.200566 }, { "acc": 0.7913847, "epoch": 1.516965835022383, "grad_norm": 4.34375, "learning_rate": 1.4530634399980049e-06, "loss": 0.75735092, "memory(GiB)": 147.13, "step": 65020, "train_speed(iter/s)": 0.200581 }, { "acc": 0.78307304, "epoch": 1.5171991425946718, "grad_norm": 4.25, "learning_rate": 1.4517322116591709e-06, "loss": 0.77868781, "memory(GiB)": 147.13, "step": 65030, "train_speed(iter/s)": 0.200596 }, { "acc": 0.79364333, "epoch": 1.5174324501669607, "grad_norm": 6.65625, "learning_rate": 1.4504014898345326e-06, "loss": 0.73498554, "memory(GiB)": 147.13, "step": 65040, "train_speed(iter/s)": 0.200611 }, { "acc": 0.78174887, "epoch": 1.5176657577392496, "grad_norm": 6.53125, "learning_rate": 1.44907127471405e-06, "loss": 0.78112001, "memory(GiB)": 147.13, "step": 65050, "train_speed(iter/s)": 0.200625 }, { "acc": 0.79200649, "epoch": 1.5178990653115385, "grad_norm": 6.53125, "learning_rate": 1.447741566487612e-06, "loss": 0.73226194, "memory(GiB)": 147.13, "step": 65060, "train_speed(iter/s)": 0.200642 }, { "acc": 0.77733145, "epoch": 1.5181323728838274, "grad_norm": 5.5625, "learning_rate": 1.4464123653450318e-06, "loss": 0.7879797, "memory(GiB)": 147.13, "step": 65070, "train_speed(iter/s)": 0.200658 }, { "acc": 0.76886935, "epoch": 1.5183656804561163, "grad_norm": 5.0, "learning_rate": 1.4450836714760553e-06, "loss": 0.81756496, "memory(GiB)": 147.13, "step": 65080, "train_speed(iter/s)": 0.200674 }, { "acc": 0.79252605, "epoch": 1.5185989880284052, "grad_norm": 7.21875, "learning_rate": 1.4437554850703506e-06, "loss": 0.73690176, "memory(GiB)": 147.13, "step": 65090, "train_speed(iter/s)": 0.20069 }, { "acc": 0.79318552, "epoch": 1.518832295600694, "grad_norm": 4.625, "learning_rate": 1.442427806317519e-06, "loss": 0.73230796, "memory(GiB)": 147.13, "step": 65100, "train_speed(iter/s)": 0.200707 }, { "acc": 0.78320851, "epoch": 1.519065603172983, "grad_norm": 6.375, "learning_rate": 1.4411006354070844e-06, "loss": 0.76603732, "memory(GiB)": 147.13, "step": 65110, "train_speed(iter/s)": 0.200722 }, { "acc": 0.77989206, "epoch": 1.519298910745272, "grad_norm": 4.90625, "learning_rate": 1.4397739725284988e-06, "loss": 0.77163534, "memory(GiB)": 147.13, "step": 65120, "train_speed(iter/s)": 0.200738 }, { "acc": 0.77356272, "epoch": 1.5195322183175608, "grad_norm": 5.40625, "learning_rate": 1.4384478178711458e-06, "loss": 0.7995605, "memory(GiB)": 147.13, "step": 65130, "train_speed(iter/s)": 0.200754 }, { "acc": 0.79077578, "epoch": 1.5197655258898497, "grad_norm": 5.46875, "learning_rate": 1.437122171624331e-06, "loss": 0.73075666, "memory(GiB)": 147.13, "step": 65140, "train_speed(iter/s)": 0.200771 }, { "acc": 0.77648382, "epoch": 1.5199988334621386, "grad_norm": 4.9375, "learning_rate": 1.4357970339772924e-06, "loss": 0.79962687, "memory(GiB)": 147.13, "step": 65150, "train_speed(iter/s)": 0.200787 }, { "acc": 0.78098822, "epoch": 1.5202321410344275, "grad_norm": 6.6875, "learning_rate": 1.4344724051191917e-06, "loss": 0.80490036, "memory(GiB)": 147.13, "step": 65160, "train_speed(iter/s)": 0.200802 }, { "acc": 0.78283033, "epoch": 1.5204654486067164, "grad_norm": 6.1875, "learning_rate": 1.43314828523912e-06, "loss": 0.77967253, "memory(GiB)": 147.13, "step": 65170, "train_speed(iter/s)": 0.200819 }, { "acc": 0.78426676, "epoch": 1.520698756179005, "grad_norm": 10.1875, "learning_rate": 1.431824674526092e-06, "loss": 0.7804903, "memory(GiB)": 147.13, "step": 65180, "train_speed(iter/s)": 0.200834 }, { "acc": 0.77625027, "epoch": 1.5209320637512942, "grad_norm": 5.59375, "learning_rate": 1.430501573169057e-06, "loss": 0.79841747, "memory(GiB)": 147.13, "step": 65190, "train_speed(iter/s)": 0.200848 }, { "acc": 0.79016428, "epoch": 1.5211653713235829, "grad_norm": 4.75, "learning_rate": 1.4291789813568858e-06, "loss": 0.75621462, "memory(GiB)": 147.13, "step": 65200, "train_speed(iter/s)": 0.200864 }, { "acc": 0.78785486, "epoch": 1.521398678895872, "grad_norm": 6.53125, "learning_rate": 1.4278568992783758e-06, "loss": 0.76169577, "memory(GiB)": 147.13, "step": 65210, "train_speed(iter/s)": 0.20088 }, { "acc": 0.78738542, "epoch": 1.5216319864681607, "grad_norm": 4.625, "learning_rate": 1.4265353271222577e-06, "loss": 0.75058613, "memory(GiB)": 147.13, "step": 65220, "train_speed(iter/s)": 0.200898 }, { "acc": 0.78331351, "epoch": 1.5218652940404498, "grad_norm": 5.8125, "learning_rate": 1.4252142650771811e-06, "loss": 0.78213634, "memory(GiB)": 147.13, "step": 65230, "train_speed(iter/s)": 0.200913 }, { "acc": 0.768677, "epoch": 1.5220986016127385, "grad_norm": 4.375, "learning_rate": 1.4238937133317322e-06, "loss": 0.81841431, "memory(GiB)": 147.13, "step": 65240, "train_speed(iter/s)": 0.20093 }, { "acc": 0.78544569, "epoch": 1.5223319091850276, "grad_norm": 5.4375, "learning_rate": 1.4225736720744164e-06, "loss": 0.77090549, "memory(GiB)": 147.13, "step": 65250, "train_speed(iter/s)": 0.200946 }, { "acc": 0.77120905, "epoch": 1.5225652167573163, "grad_norm": 5.53125, "learning_rate": 1.4212541414936682e-06, "loss": 0.81031246, "memory(GiB)": 147.13, "step": 65260, "train_speed(iter/s)": 0.200962 }, { "acc": 0.77259464, "epoch": 1.5227985243296054, "grad_norm": 5.0625, "learning_rate": 1.4199351217778545e-06, "loss": 0.80552759, "memory(GiB)": 147.13, "step": 65270, "train_speed(iter/s)": 0.200978 }, { "acc": 0.78281403, "epoch": 1.523031831901894, "grad_norm": 6.375, "learning_rate": 1.4186166131152595e-06, "loss": 0.76773696, "memory(GiB)": 147.13, "step": 65280, "train_speed(iter/s)": 0.200994 }, { "acc": 0.75774269, "epoch": 1.5232651394741832, "grad_norm": 5.59375, "learning_rate": 1.4172986156941038e-06, "loss": 0.87673063, "memory(GiB)": 147.13, "step": 65290, "train_speed(iter/s)": 0.20101 }, { "acc": 0.76485977, "epoch": 1.5234984470464719, "grad_norm": 4.53125, "learning_rate": 1.4159811297025284e-06, "loss": 0.86806583, "memory(GiB)": 147.13, "step": 65300, "train_speed(iter/s)": 0.201025 }, { "acc": 0.80621672, "epoch": 1.523731754618761, "grad_norm": 6.125, "learning_rate": 1.4146641553286071e-06, "loss": 0.68341751, "memory(GiB)": 147.13, "step": 65310, "train_speed(iter/s)": 0.201041 }, { "acc": 0.78828001, "epoch": 1.5239650621910497, "grad_norm": 6.4375, "learning_rate": 1.4133476927603362e-06, "loss": 0.74977875, "memory(GiB)": 147.13, "step": 65320, "train_speed(iter/s)": 0.201058 }, { "acc": 0.76758041, "epoch": 1.5241983697633388, "grad_norm": 6.1875, "learning_rate": 1.4120317421856389e-06, "loss": 0.81543188, "memory(GiB)": 147.13, "step": 65330, "train_speed(iter/s)": 0.201074 }, { "acc": 0.76192322, "epoch": 1.5244316773356275, "grad_norm": 5.4375, "learning_rate": 1.4107163037923693e-06, "loss": 0.84419556, "memory(GiB)": 147.13, "step": 65340, "train_speed(iter/s)": 0.201091 }, { "acc": 0.77184172, "epoch": 1.5246649849079166, "grad_norm": 5.6875, "learning_rate": 1.409401377768303e-06, "loss": 0.82547474, "memory(GiB)": 147.13, "step": 65350, "train_speed(iter/s)": 0.201106 }, { "acc": 0.77231297, "epoch": 1.5248982924802053, "grad_norm": 4.5625, "learning_rate": 1.408086964301149e-06, "loss": 0.83197184, "memory(GiB)": 147.13, "step": 65360, "train_speed(iter/s)": 0.201122 }, { "acc": 0.78483472, "epoch": 1.5251316000524942, "grad_norm": 6.21875, "learning_rate": 1.4067730635785354e-06, "loss": 0.7758481, "memory(GiB)": 147.13, "step": 65370, "train_speed(iter/s)": 0.201137 }, { "acc": 0.77703638, "epoch": 1.525364907624783, "grad_norm": 9.125, "learning_rate": 1.4054596757880262e-06, "loss": 0.81458502, "memory(GiB)": 147.13, "step": 65380, "train_speed(iter/s)": 0.201152 }, { "acc": 0.77877493, "epoch": 1.525598215197072, "grad_norm": 6.21875, "learning_rate": 1.4041468011171017e-06, "loss": 0.81068916, "memory(GiB)": 147.13, "step": 65390, "train_speed(iter/s)": 0.201168 }, { "acc": 0.77528429, "epoch": 1.5258315227693608, "grad_norm": 4.8125, "learning_rate": 1.4028344397531773e-06, "loss": 0.80191021, "memory(GiB)": 147.13, "step": 65400, "train_speed(iter/s)": 0.201184 }, { "acc": 0.7893209, "epoch": 1.5260648303416497, "grad_norm": 9.5625, "learning_rate": 1.4015225918835923e-06, "loss": 0.7487329, "memory(GiB)": 147.13, "step": 65410, "train_speed(iter/s)": 0.201199 }, { "acc": 0.78093977, "epoch": 1.5262981379139386, "grad_norm": 5.96875, "learning_rate": 1.4002112576956102e-06, "loss": 0.78119683, "memory(GiB)": 147.13, "step": 65420, "train_speed(iter/s)": 0.201215 }, { "acc": 0.7891861, "epoch": 1.5265314454862275, "grad_norm": 8.75, "learning_rate": 1.3989004373764264e-06, "loss": 0.73629584, "memory(GiB)": 147.13, "step": 65430, "train_speed(iter/s)": 0.20123 }, { "acc": 0.76216254, "epoch": 1.5267647530585164, "grad_norm": 5.5625, "learning_rate": 1.3975901311131584e-06, "loss": 0.85819883, "memory(GiB)": 147.13, "step": 65440, "train_speed(iter/s)": 0.201246 }, { "acc": 0.77317934, "epoch": 1.5269980606308053, "grad_norm": 5.75, "learning_rate": 1.3962803390928537e-06, "loss": 0.80435085, "memory(GiB)": 147.13, "step": 65450, "train_speed(iter/s)": 0.201262 }, { "acc": 0.78658466, "epoch": 1.5272313682030942, "grad_norm": 6.8125, "learning_rate": 1.3949710615024836e-06, "loss": 0.76053772, "memory(GiB)": 147.13, "step": 65460, "train_speed(iter/s)": 0.201277 }, { "acc": 0.79117346, "epoch": 1.5274646757753831, "grad_norm": 5.125, "learning_rate": 1.3936622985289462e-06, "loss": 0.75075483, "memory(GiB)": 147.13, "step": 65470, "train_speed(iter/s)": 0.201292 }, { "acc": 0.77793059, "epoch": 1.527697983347672, "grad_norm": 5.34375, "learning_rate": 1.3923540503590689e-06, "loss": 0.79492455, "memory(GiB)": 147.13, "step": 65480, "train_speed(iter/s)": 0.201309 }, { "acc": 0.79856424, "epoch": 1.527931290919961, "grad_norm": 4.25, "learning_rate": 1.3910463171796018e-06, "loss": 0.71239781, "memory(GiB)": 147.13, "step": 65490, "train_speed(iter/s)": 0.201325 }, { "acc": 0.77887478, "epoch": 1.5281645984922498, "grad_norm": 8.3125, "learning_rate": 1.3897390991772269e-06, "loss": 0.78372169, "memory(GiB)": 147.13, "step": 65500, "train_speed(iter/s)": 0.201341 }, { "epoch": 1.5281645984922498, "eval_acc": 0.7447174423172486, "eval_loss": 0.8044983148574829, "eval_runtime": 1270.2082, "eval_samples_per_second": 28.335, "eval_steps_per_second": 14.168, "step": 65500 }, { "acc": 0.77596331, "epoch": 1.5283979060645387, "grad_norm": 5.75, "learning_rate": 1.3884323965385443e-06, "loss": 0.80863295, "memory(GiB)": 147.13, "step": 65510, "train_speed(iter/s)": 0.20056 }, { "acc": 0.79642587, "epoch": 1.5286312136368276, "grad_norm": 4.53125, "learning_rate": 1.3871262094500897e-06, "loss": 0.72295847, "memory(GiB)": 147.13, "step": 65520, "train_speed(iter/s)": 0.200576 }, { "acc": 0.78992147, "epoch": 1.5288645212091165, "grad_norm": 4.25, "learning_rate": 1.3858205380983175e-06, "loss": 0.7313539, "memory(GiB)": 147.13, "step": 65530, "train_speed(iter/s)": 0.200591 }, { "acc": 0.78537593, "epoch": 1.5290978287814054, "grad_norm": 7.5, "learning_rate": 1.3845153826696144e-06, "loss": 0.76099243, "memory(GiB)": 147.13, "step": 65540, "train_speed(iter/s)": 0.200607 }, { "acc": 0.76554108, "epoch": 1.5293311363536943, "grad_norm": 5.6875, "learning_rate": 1.3832107433502912e-06, "loss": 0.85681934, "memory(GiB)": 147.13, "step": 65550, "train_speed(iter/s)": 0.200623 }, { "acc": 0.77308493, "epoch": 1.5295644439259832, "grad_norm": 14.5, "learning_rate": 1.3819066203265813e-06, "loss": 0.82602463, "memory(GiB)": 147.13, "step": 65560, "train_speed(iter/s)": 0.200639 }, { "acc": 0.76433573, "epoch": 1.529797751498272, "grad_norm": 4.375, "learning_rate": 1.3806030137846521e-06, "loss": 0.84086971, "memory(GiB)": 147.13, "step": 65570, "train_speed(iter/s)": 0.200654 }, { "acc": 0.79202843, "epoch": 1.530031059070561, "grad_norm": 5.09375, "learning_rate": 1.3792999239105898e-06, "loss": 0.74395723, "memory(GiB)": 147.13, "step": 65580, "train_speed(iter/s)": 0.20067 }, { "acc": 0.79133291, "epoch": 1.5302643666428497, "grad_norm": 6.25, "learning_rate": 1.3779973508904132e-06, "loss": 0.74699202, "memory(GiB)": 147.13, "step": 65590, "train_speed(iter/s)": 0.200684 }, { "acc": 0.77008934, "epoch": 1.5304976742151388, "grad_norm": 5.5, "learning_rate": 1.3766952949100625e-06, "loss": 0.83408842, "memory(GiB)": 147.13, "step": 65600, "train_speed(iter/s)": 0.2007 }, { "acc": 0.77300768, "epoch": 1.5307309817874275, "grad_norm": 6.65625, "learning_rate": 1.3753937561554053e-06, "loss": 0.81059189, "memory(GiB)": 147.13, "step": 65610, "train_speed(iter/s)": 0.200715 }, { "acc": 0.76012034, "epoch": 1.5309642893597166, "grad_norm": 6.8125, "learning_rate": 1.3740927348122373e-06, "loss": 0.87639198, "memory(GiB)": 147.13, "step": 65620, "train_speed(iter/s)": 0.200731 }, { "acc": 0.77505989, "epoch": 1.5311975969320053, "grad_norm": 6.65625, "learning_rate": 1.3727922310662762e-06, "loss": 0.79479666, "memory(GiB)": 147.13, "step": 65630, "train_speed(iter/s)": 0.200746 }, { "acc": 0.78588977, "epoch": 1.5314309045042944, "grad_norm": 4.75, "learning_rate": 1.371492245103172e-06, "loss": 0.79581938, "memory(GiB)": 147.13, "step": 65640, "train_speed(iter/s)": 0.200761 }, { "acc": 0.78697853, "epoch": 1.531664212076583, "grad_norm": 7.75, "learning_rate": 1.3701927771084939e-06, "loss": 0.77796259, "memory(GiB)": 147.13, "step": 65650, "train_speed(iter/s)": 0.200777 }, { "acc": 0.7804883, "epoch": 1.5318975196488722, "grad_norm": 4.9375, "learning_rate": 1.3688938272677442e-06, "loss": 0.78984337, "memory(GiB)": 147.13, "step": 65660, "train_speed(iter/s)": 0.200793 }, { "acc": 0.7731575, "epoch": 1.532130827221161, "grad_norm": 7.46875, "learning_rate": 1.3675953957663441e-06, "loss": 0.8263773, "memory(GiB)": 147.13, "step": 65670, "train_speed(iter/s)": 0.200808 }, { "acc": 0.81379738, "epoch": 1.53236413479345, "grad_norm": 5.09375, "learning_rate": 1.3662974827896474e-06, "loss": 0.65503149, "memory(GiB)": 147.13, "step": 65680, "train_speed(iter/s)": 0.200825 }, { "acc": 0.77931385, "epoch": 1.5325974423657387, "grad_norm": 5.125, "learning_rate": 1.3650000885229297e-06, "loss": 0.80769215, "memory(GiB)": 147.13, "step": 65690, "train_speed(iter/s)": 0.200841 }, { "acc": 0.77855949, "epoch": 1.5328307499380278, "grad_norm": 5.6875, "learning_rate": 1.3637032131513922e-06, "loss": 0.78796816, "memory(GiB)": 147.13, "step": 65700, "train_speed(iter/s)": 0.200856 }, { "acc": 0.77442112, "epoch": 1.5330640575103165, "grad_norm": 6.3125, "learning_rate": 1.3624068568601655e-06, "loss": 0.79497447, "memory(GiB)": 147.13, "step": 65710, "train_speed(iter/s)": 0.200872 }, { "acc": 0.77679834, "epoch": 1.5332973650826056, "grad_norm": 8.5625, "learning_rate": 1.3611110198343025e-06, "loss": 0.81760941, "memory(GiB)": 147.13, "step": 65720, "train_speed(iter/s)": 0.200887 }, { "acc": 0.78379855, "epoch": 1.5335306726548943, "grad_norm": 5.09375, "learning_rate": 1.359815702258787e-06, "loss": 0.77201395, "memory(GiB)": 147.13, "step": 65730, "train_speed(iter/s)": 0.200902 }, { "acc": 0.78287749, "epoch": 1.5337639802271834, "grad_norm": 6.8125, "learning_rate": 1.3585209043185193e-06, "loss": 0.76966333, "memory(GiB)": 147.13, "step": 65740, "train_speed(iter/s)": 0.200917 }, { "acc": 0.78400383, "epoch": 1.533997287799472, "grad_norm": 6.375, "learning_rate": 1.3572266261983363e-06, "loss": 0.77230244, "memory(GiB)": 147.13, "step": 65750, "train_speed(iter/s)": 0.200934 }, { "acc": 0.78760571, "epoch": 1.534230595371761, "grad_norm": 5.59375, "learning_rate": 1.3559328680829942e-06, "loss": 0.76585751, "memory(GiB)": 147.13, "step": 65760, "train_speed(iter/s)": 0.200949 }, { "acc": 0.79107695, "epoch": 1.5344639029440499, "grad_norm": 4.75, "learning_rate": 1.3546396301571751e-06, "loss": 0.74846153, "memory(GiB)": 147.13, "step": 65770, "train_speed(iter/s)": 0.200965 }, { "acc": 0.78852139, "epoch": 1.5346972105163388, "grad_norm": 5.53125, "learning_rate": 1.353346912605491e-06, "loss": 0.77424564, "memory(GiB)": 147.13, "step": 65780, "train_speed(iter/s)": 0.200981 }, { "acc": 0.79274092, "epoch": 1.5349305180886277, "grad_norm": 7.1875, "learning_rate": 1.3520547156124748e-06, "loss": 0.74635301, "memory(GiB)": 147.13, "step": 65790, "train_speed(iter/s)": 0.200998 }, { "acc": 0.77839308, "epoch": 1.5351638256609166, "grad_norm": 4.875, "learning_rate": 1.3507630393625893e-06, "loss": 0.80371504, "memory(GiB)": 147.13, "step": 65800, "train_speed(iter/s)": 0.201015 }, { "acc": 0.77611036, "epoch": 1.5353971332332055, "grad_norm": 5.25, "learning_rate": 1.3494718840402188e-06, "loss": 0.78693495, "memory(GiB)": 147.13, "step": 65810, "train_speed(iter/s)": 0.201031 }, { "acc": 0.77409744, "epoch": 1.5356304408054944, "grad_norm": 6.59375, "learning_rate": 1.348181249829677e-06, "loss": 0.82686939, "memory(GiB)": 147.13, "step": 65820, "train_speed(iter/s)": 0.201047 }, { "acc": 0.76908383, "epoch": 1.5358637483777833, "grad_norm": 4.625, "learning_rate": 1.3468911369152015e-06, "loss": 0.83825998, "memory(GiB)": 147.13, "step": 65830, "train_speed(iter/s)": 0.201063 }, { "acc": 0.77742887, "epoch": 1.5360970559500722, "grad_norm": 4.6875, "learning_rate": 1.3456015454809551e-06, "loss": 0.79348636, "memory(GiB)": 147.13, "step": 65840, "train_speed(iter/s)": 0.20108 }, { "acc": 0.77535996, "epoch": 1.536330363522361, "grad_norm": 7.0, "learning_rate": 1.3443124757110266e-06, "loss": 0.81135139, "memory(GiB)": 147.13, "step": 65850, "train_speed(iter/s)": 0.201095 }, { "acc": 0.77499065, "epoch": 1.53656367109465, "grad_norm": 6.96875, "learning_rate": 1.3430239277894281e-06, "loss": 0.82661295, "memory(GiB)": 147.13, "step": 65860, "train_speed(iter/s)": 0.201109 }, { "acc": 0.78962278, "epoch": 1.5367969786669389, "grad_norm": 5.09375, "learning_rate": 1.3417359019001037e-06, "loss": 0.74881072, "memory(GiB)": 147.13, "step": 65870, "train_speed(iter/s)": 0.201125 }, { "acc": 0.79519157, "epoch": 1.5370302862392278, "grad_norm": 9.3125, "learning_rate": 1.3404483982269145e-06, "loss": 0.74976549, "memory(GiB)": 147.13, "step": 65880, "train_speed(iter/s)": 0.201141 }, { "acc": 0.7780304, "epoch": 1.5372635938115167, "grad_norm": 6.15625, "learning_rate": 1.339161416953655e-06, "loss": 0.78960228, "memory(GiB)": 147.13, "step": 65890, "train_speed(iter/s)": 0.201157 }, { "acc": 0.74467463, "epoch": 1.5374969013838056, "grad_norm": 6.0, "learning_rate": 1.3378749582640398e-06, "loss": 0.95666943, "memory(GiB)": 147.13, "step": 65900, "train_speed(iter/s)": 0.201174 }, { "acc": 0.78630295, "epoch": 1.5377302089560945, "grad_norm": 4.125, "learning_rate": 1.3365890223417088e-06, "loss": 0.74238262, "memory(GiB)": 147.13, "step": 65910, "train_speed(iter/s)": 0.20119 }, { "acc": 0.7730278, "epoch": 1.5379635165283834, "grad_norm": 5.15625, "learning_rate": 1.3353036093702326e-06, "loss": 0.7966548, "memory(GiB)": 147.13, "step": 65920, "train_speed(iter/s)": 0.201206 }, { "acc": 0.78292246, "epoch": 1.5381968241006723, "grad_norm": 4.625, "learning_rate": 1.3340187195331e-06, "loss": 0.7709209, "memory(GiB)": 147.13, "step": 65930, "train_speed(iter/s)": 0.201223 }, { "acc": 0.78873224, "epoch": 1.5384301316729612, "grad_norm": 5.96875, "learning_rate": 1.3327343530137316e-06, "loss": 0.76129274, "memory(GiB)": 147.13, "step": 65940, "train_speed(iter/s)": 0.201238 }, { "acc": 0.77018852, "epoch": 1.53866343924525, "grad_norm": 3.5, "learning_rate": 1.3314505099954683e-06, "loss": 0.82236538, "memory(GiB)": 147.13, "step": 65950, "train_speed(iter/s)": 0.201254 }, { "acc": 0.77579556, "epoch": 1.5388967468175387, "grad_norm": 6.65625, "learning_rate": 1.33016719066158e-06, "loss": 0.7895669, "memory(GiB)": 147.13, "step": 65960, "train_speed(iter/s)": 0.201269 }, { "acc": 0.78516469, "epoch": 1.5391300543898279, "grad_norm": 4.78125, "learning_rate": 1.328884395195257e-06, "loss": 0.77448907, "memory(GiB)": 147.13, "step": 65970, "train_speed(iter/s)": 0.201284 }, { "acc": 0.77121625, "epoch": 1.5393633619621165, "grad_norm": 6.0, "learning_rate": 1.3276021237796216e-06, "loss": 0.83493958, "memory(GiB)": 147.13, "step": 65980, "train_speed(iter/s)": 0.201298 }, { "acc": 0.79430099, "epoch": 1.5395966695344057, "grad_norm": 4.28125, "learning_rate": 1.3263203765977168e-06, "loss": 0.73301487, "memory(GiB)": 147.13, "step": 65990, "train_speed(iter/s)": 0.201314 }, { "acc": 0.78175182, "epoch": 1.5398299771066943, "grad_norm": 5.03125, "learning_rate": 1.3250391538325085e-06, "loss": 0.75961099, "memory(GiB)": 147.13, "step": 66000, "train_speed(iter/s)": 0.20133 }, { "epoch": 1.5398299771066943, "eval_acc": 0.7446849330380915, "eval_loss": 0.8045194149017334, "eval_runtime": 1270.9323, "eval_samples_per_second": 28.319, "eval_steps_per_second": 14.16, "step": 66000 }, { "acc": 0.79298277, "epoch": 1.5400632846789835, "grad_norm": 5.65625, "learning_rate": 1.3237584556668958e-06, "loss": 0.74688911, "memory(GiB)": 147.13, "step": 66010, "train_speed(iter/s)": 0.200553 }, { "acc": 0.78268442, "epoch": 1.5402965922512721, "grad_norm": 4.0, "learning_rate": 1.322478282283694e-06, "loss": 0.77756462, "memory(GiB)": 147.13, "step": 66020, "train_speed(iter/s)": 0.200569 }, { "acc": 0.77548428, "epoch": 1.5405298998235613, "grad_norm": 6.875, "learning_rate": 1.3211986338656503e-06, "loss": 0.80724964, "memory(GiB)": 147.13, "step": 66030, "train_speed(iter/s)": 0.200586 }, { "acc": 0.77102299, "epoch": 1.54076320739585, "grad_norm": 5.03125, "learning_rate": 1.3199195105954331e-06, "loss": 0.82279663, "memory(GiB)": 147.13, "step": 66040, "train_speed(iter/s)": 0.200601 }, { "acc": 0.77918787, "epoch": 1.540996514968139, "grad_norm": 5.46875, "learning_rate": 1.318640912655635e-06, "loss": 0.79191918, "memory(GiB)": 147.13, "step": 66050, "train_speed(iter/s)": 0.200616 }, { "acc": 0.80649509, "epoch": 1.5412298225404277, "grad_norm": 4.40625, "learning_rate": 1.3173628402287785e-06, "loss": 0.68745565, "memory(GiB)": 147.13, "step": 66060, "train_speed(iter/s)": 0.200632 }, { "acc": 0.77775507, "epoch": 1.5414631301127169, "grad_norm": 7.46875, "learning_rate": 1.3160852934973073e-06, "loss": 0.79977551, "memory(GiB)": 147.13, "step": 66070, "train_speed(iter/s)": 0.200647 }, { "acc": 0.76136436, "epoch": 1.5416964376850055, "grad_norm": 11.75, "learning_rate": 1.31480827264359e-06, "loss": 0.86434231, "memory(GiB)": 147.13, "step": 66080, "train_speed(iter/s)": 0.200664 }, { "acc": 0.77416496, "epoch": 1.5419297452572946, "grad_norm": 7.0, "learning_rate": 1.3135317778499196e-06, "loss": 0.78873339, "memory(GiB)": 147.13, "step": 66090, "train_speed(iter/s)": 0.200681 }, { "acc": 0.76674585, "epoch": 1.5421630528295833, "grad_norm": 5.8125, "learning_rate": 1.3122558092985188e-06, "loss": 0.83854027, "memory(GiB)": 147.13, "step": 66100, "train_speed(iter/s)": 0.200697 }, { "acc": 0.77215676, "epoch": 1.5423963604018724, "grad_norm": 4.375, "learning_rate": 1.3109803671715283e-06, "loss": 0.81484003, "memory(GiB)": 147.13, "step": 66110, "train_speed(iter/s)": 0.200713 }, { "acc": 0.79621944, "epoch": 1.5426296679741611, "grad_norm": 5.78125, "learning_rate": 1.30970545165102e-06, "loss": 0.72149839, "memory(GiB)": 147.13, "step": 66120, "train_speed(iter/s)": 0.200729 }, { "acc": 0.79580064, "epoch": 1.5428629755464502, "grad_norm": 12.9375, "learning_rate": 1.3084310629189868e-06, "loss": 0.73481159, "memory(GiB)": 147.13, "step": 66130, "train_speed(iter/s)": 0.200743 }, { "acc": 0.77218895, "epoch": 1.543096283118739, "grad_norm": 5.0625, "learning_rate": 1.3071572011573453e-06, "loss": 0.83505878, "memory(GiB)": 147.13, "step": 66140, "train_speed(iter/s)": 0.200758 }, { "acc": 0.78478584, "epoch": 1.5433295906910278, "grad_norm": 3.75, "learning_rate": 1.3058838665479418e-06, "loss": 0.79148293, "memory(GiB)": 147.13, "step": 66150, "train_speed(iter/s)": 0.200774 }, { "acc": 0.7856843, "epoch": 1.5435628982633167, "grad_norm": 4.21875, "learning_rate": 1.304611059272542e-06, "loss": 0.77415447, "memory(GiB)": 147.13, "step": 66160, "train_speed(iter/s)": 0.200788 }, { "acc": 0.77835255, "epoch": 1.5437962058356056, "grad_norm": 6.25, "learning_rate": 1.3033387795128416e-06, "loss": 0.79931955, "memory(GiB)": 147.13, "step": 66170, "train_speed(iter/s)": 0.200804 }, { "acc": 0.76517811, "epoch": 1.5440295134078945, "grad_norm": 5.3125, "learning_rate": 1.302067027450456e-06, "loss": 0.84638748, "memory(GiB)": 147.13, "step": 66180, "train_speed(iter/s)": 0.200821 }, { "acc": 0.78675337, "epoch": 1.5442628209801834, "grad_norm": 8.125, "learning_rate": 1.3007958032669283e-06, "loss": 0.76565952, "memory(GiB)": 147.13, "step": 66190, "train_speed(iter/s)": 0.200836 }, { "acc": 0.78021317, "epoch": 1.5444961285524723, "grad_norm": 5.0625, "learning_rate": 1.2995251071437253e-06, "loss": 0.78182669, "memory(GiB)": 147.13, "step": 66200, "train_speed(iter/s)": 0.200851 }, { "acc": 0.78736315, "epoch": 1.5447294361247612, "grad_norm": 5.75, "learning_rate": 1.2982549392622362e-06, "loss": 0.75720816, "memory(GiB)": 147.13, "step": 66210, "train_speed(iter/s)": 0.200866 }, { "acc": 0.76521492, "epoch": 1.5449627436970501, "grad_norm": 5.3125, "learning_rate": 1.2969852998037813e-06, "loss": 0.8522131, "memory(GiB)": 147.13, "step": 66220, "train_speed(iter/s)": 0.200882 }, { "acc": 0.78474207, "epoch": 1.545196051269339, "grad_norm": 4.21875, "learning_rate": 1.2957161889495972e-06, "loss": 0.77432103, "memory(GiB)": 147.13, "step": 66230, "train_speed(iter/s)": 0.200897 }, { "acc": 0.77806253, "epoch": 1.545429358841628, "grad_norm": 4.59375, "learning_rate": 1.2944476068808526e-06, "loss": 0.80501852, "memory(GiB)": 147.13, "step": 66240, "train_speed(iter/s)": 0.200913 }, { "acc": 0.7818716, "epoch": 1.5456626664139168, "grad_norm": 4.6875, "learning_rate": 1.2931795537786357e-06, "loss": 0.77636614, "memory(GiB)": 147.13, "step": 66250, "train_speed(iter/s)": 0.200929 }, { "acc": 0.77749758, "epoch": 1.5458959739862057, "grad_norm": 5.40625, "learning_rate": 1.2919120298239591e-06, "loss": 0.80647802, "memory(GiB)": 147.13, "step": 66260, "train_speed(iter/s)": 0.200944 }, { "acc": 0.77590075, "epoch": 1.5461292815584946, "grad_norm": 5.75, "learning_rate": 1.2906450351977646e-06, "loss": 0.81553898, "memory(GiB)": 147.13, "step": 66270, "train_speed(iter/s)": 0.200959 }, { "acc": 0.78080235, "epoch": 1.5463625891307835, "grad_norm": 6.0, "learning_rate": 1.2893785700809118e-06, "loss": 0.79015241, "memory(GiB)": 147.13, "step": 66280, "train_speed(iter/s)": 0.200975 }, { "acc": 0.78328552, "epoch": 1.5465958967030724, "grad_norm": 4.8125, "learning_rate": 1.2881126346541922e-06, "loss": 0.76925888, "memory(GiB)": 147.13, "step": 66290, "train_speed(iter/s)": 0.20099 }, { "acc": 0.77661171, "epoch": 1.5468292042753613, "grad_norm": 4.46875, "learning_rate": 1.2868472290983158e-06, "loss": 0.80561304, "memory(GiB)": 147.13, "step": 66300, "train_speed(iter/s)": 0.201006 }, { "acc": 0.79191103, "epoch": 1.5470625118476502, "grad_norm": 3.734375, "learning_rate": 1.2855823535939188e-06, "loss": 0.74329491, "memory(GiB)": 147.13, "step": 66310, "train_speed(iter/s)": 0.20102 }, { "acc": 0.77668848, "epoch": 1.547295819419939, "grad_norm": 4.9375, "learning_rate": 1.28431800832156e-06, "loss": 0.79781227, "memory(GiB)": 147.13, "step": 66320, "train_speed(iter/s)": 0.201036 }, { "acc": 0.77691069, "epoch": 1.5475291269922278, "grad_norm": 5.03125, "learning_rate": 1.2830541934617274e-06, "loss": 0.79807272, "memory(GiB)": 147.13, "step": 66330, "train_speed(iter/s)": 0.201051 }, { "acc": 0.78547478, "epoch": 1.547762434564517, "grad_norm": 5.1875, "learning_rate": 1.2817909091948293e-06, "loss": 0.76715317, "memory(GiB)": 147.13, "step": 66340, "train_speed(iter/s)": 0.201066 }, { "acc": 0.78639536, "epoch": 1.5479957421368056, "grad_norm": 5.0625, "learning_rate": 1.2805281557011972e-06, "loss": 0.75163832, "memory(GiB)": 147.13, "step": 66350, "train_speed(iter/s)": 0.201082 }, { "acc": 0.79573755, "epoch": 1.5482290497090947, "grad_norm": 7.09375, "learning_rate": 1.2792659331610919e-06, "loss": 0.72670212, "memory(GiB)": 147.13, "step": 66360, "train_speed(iter/s)": 0.201097 }, { "acc": 0.77826233, "epoch": 1.5484623572813834, "grad_norm": 6.21875, "learning_rate": 1.2780042417546917e-06, "loss": 0.80692101, "memory(GiB)": 147.13, "step": 66370, "train_speed(iter/s)": 0.201111 }, { "acc": 0.76512957, "epoch": 1.5486956648536725, "grad_norm": 5.71875, "learning_rate": 1.276743081662107e-06, "loss": 0.85662422, "memory(GiB)": 147.13, "step": 66380, "train_speed(iter/s)": 0.201126 }, { "acc": 0.78918648, "epoch": 1.5489289724259612, "grad_norm": 4.78125, "learning_rate": 1.2754824530633654e-06, "loss": 0.77457085, "memory(GiB)": 147.13, "step": 66390, "train_speed(iter/s)": 0.201141 }, { "acc": 0.76402283, "epoch": 1.5491622799982503, "grad_norm": 4.71875, "learning_rate": 1.2742223561384204e-06, "loss": 0.86086817, "memory(GiB)": 147.13, "step": 66400, "train_speed(iter/s)": 0.201157 }, { "acc": 0.77913628, "epoch": 1.549395587570539, "grad_norm": 4.9375, "learning_rate": 1.2729627910671544e-06, "loss": 0.79207058, "memory(GiB)": 147.13, "step": 66410, "train_speed(iter/s)": 0.201172 }, { "acc": 0.80253363, "epoch": 1.549628895142828, "grad_norm": 5.9375, "learning_rate": 1.271703758029364e-06, "loss": 0.71166973, "memory(GiB)": 147.13, "step": 66420, "train_speed(iter/s)": 0.201188 }, { "acc": 0.77526503, "epoch": 1.5498622027151168, "grad_norm": 5.53125, "learning_rate": 1.270445257204781e-06, "loss": 0.78889947, "memory(GiB)": 147.13, "step": 66430, "train_speed(iter/s)": 0.201205 }, { "acc": 0.78784337, "epoch": 1.5500955102874059, "grad_norm": 6.625, "learning_rate": 1.2691872887730528e-06, "loss": 0.74168453, "memory(GiB)": 147.13, "step": 66440, "train_speed(iter/s)": 0.20122 }, { "acc": 0.79704084, "epoch": 1.5503288178596946, "grad_norm": 7.65625, "learning_rate": 1.2679298529137563e-06, "loss": 0.75291963, "memory(GiB)": 147.13, "step": 66450, "train_speed(iter/s)": 0.201236 }, { "acc": 0.78185368, "epoch": 1.5505621254319837, "grad_norm": 4.8125, "learning_rate": 1.266672949806388e-06, "loss": 0.78402071, "memory(GiB)": 147.13, "step": 66460, "train_speed(iter/s)": 0.201252 }, { "acc": 0.78436918, "epoch": 1.5507954330042724, "grad_norm": 5.875, "learning_rate": 1.265416579630373e-06, "loss": 0.76729245, "memory(GiB)": 147.13, "step": 66470, "train_speed(iter/s)": 0.201267 }, { "acc": 0.77176619, "epoch": 1.5510287405765615, "grad_norm": 5.6875, "learning_rate": 1.2641607425650565e-06, "loss": 0.8262764, "memory(GiB)": 147.13, "step": 66480, "train_speed(iter/s)": 0.201283 }, { "acc": 0.79059591, "epoch": 1.5512620481488502, "grad_norm": 4.125, "learning_rate": 1.2629054387897066e-06, "loss": 0.75890532, "memory(GiB)": 147.13, "step": 66490, "train_speed(iter/s)": 0.201299 }, { "acc": 0.78669443, "epoch": 1.5514953557211393, "grad_norm": 6.75, "learning_rate": 1.2616506684835217e-06, "loss": 0.77961187, "memory(GiB)": 147.13, "step": 66500, "train_speed(iter/s)": 0.201314 }, { "epoch": 1.5514953557211393, "eval_acc": 0.7446834917400007, "eval_loss": 0.8045360445976257, "eval_runtime": 1270.6085, "eval_samples_per_second": 28.326, "eval_steps_per_second": 14.163, "step": 66500 }, { "acc": 0.7779171, "epoch": 1.551728663293428, "grad_norm": 3.71875, "learning_rate": 1.2603964318256167e-06, "loss": 0.82731237, "memory(GiB)": 147.13, "step": 66510, "train_speed(iter/s)": 0.200544 }, { "acc": 0.76602592, "epoch": 1.5519619708657169, "grad_norm": 5.5625, "learning_rate": 1.2591427289950358e-06, "loss": 0.85228939, "memory(GiB)": 147.13, "step": 66520, "train_speed(iter/s)": 0.20056 }, { "acc": 0.79903426, "epoch": 1.5521952784380058, "grad_norm": 4.40625, "learning_rate": 1.2578895601707435e-06, "loss": 0.74350095, "memory(GiB)": 147.13, "step": 66530, "train_speed(iter/s)": 0.200576 }, { "acc": 0.77694921, "epoch": 1.5524285860102947, "grad_norm": 5.6875, "learning_rate": 1.2566369255316296e-06, "loss": 0.81319304, "memory(GiB)": 147.13, "step": 66540, "train_speed(iter/s)": 0.200592 }, { "acc": 0.77841153, "epoch": 1.5526618935825836, "grad_norm": 5.46875, "learning_rate": 1.255384825256507e-06, "loss": 0.79920754, "memory(GiB)": 147.13, "step": 66550, "train_speed(iter/s)": 0.200607 }, { "acc": 0.77843218, "epoch": 1.5528952011548725, "grad_norm": 4.9375, "learning_rate": 1.2541332595241112e-06, "loss": 0.8021946, "memory(GiB)": 147.13, "step": 66560, "train_speed(iter/s)": 0.200622 }, { "acc": 0.77249784, "epoch": 1.5531285087271613, "grad_norm": 7.15625, "learning_rate": 1.2528822285131059e-06, "loss": 0.8305788, "memory(GiB)": 147.13, "step": 66570, "train_speed(iter/s)": 0.200636 }, { "acc": 0.77560177, "epoch": 1.5533618162994502, "grad_norm": 5.5625, "learning_rate": 1.2516317324020727e-06, "loss": 0.79391565, "memory(GiB)": 147.13, "step": 66580, "train_speed(iter/s)": 0.200652 }, { "acc": 0.78499255, "epoch": 1.5535951238717391, "grad_norm": 4.21875, "learning_rate": 1.2503817713695221e-06, "loss": 0.76304283, "memory(GiB)": 147.13, "step": 66590, "train_speed(iter/s)": 0.200668 }, { "acc": 0.79315147, "epoch": 1.553828431444028, "grad_norm": 5.0625, "learning_rate": 1.2491323455938831e-06, "loss": 0.75981688, "memory(GiB)": 147.13, "step": 66600, "train_speed(iter/s)": 0.200683 }, { "acc": 0.79385643, "epoch": 1.554061739016317, "grad_norm": 3.984375, "learning_rate": 1.2478834552535135e-06, "loss": 0.72674108, "memory(GiB)": 147.13, "step": 66610, "train_speed(iter/s)": 0.200699 }, { "acc": 0.77791653, "epoch": 1.5542950465886058, "grad_norm": 5.71875, "learning_rate": 1.2466351005266903e-06, "loss": 0.78302855, "memory(GiB)": 147.13, "step": 66620, "train_speed(iter/s)": 0.200715 }, { "acc": 0.80931139, "epoch": 1.5545283541608947, "grad_norm": 3.828125, "learning_rate": 1.2453872815916151e-06, "loss": 0.67272167, "memory(GiB)": 147.13, "step": 66630, "train_speed(iter/s)": 0.200732 }, { "acc": 0.76477842, "epoch": 1.5547616617331836, "grad_norm": 5.25, "learning_rate": 1.2441399986264174e-06, "loss": 0.8524189, "memory(GiB)": 147.13, "step": 66640, "train_speed(iter/s)": 0.200745 }, { "acc": 0.78072309, "epoch": 1.5549949693054725, "grad_norm": 6.5, "learning_rate": 1.2428932518091413e-06, "loss": 0.79402523, "memory(GiB)": 147.13, "step": 66650, "train_speed(iter/s)": 0.200761 }, { "acc": 0.77371817, "epoch": 1.5552282768777614, "grad_norm": 30.25, "learning_rate": 1.2416470413177633e-06, "loss": 0.82794876, "memory(GiB)": 147.13, "step": 66660, "train_speed(iter/s)": 0.200777 }, { "acc": 0.7927494, "epoch": 1.5554615844500503, "grad_norm": 5.90625, "learning_rate": 1.2404013673301768e-06, "loss": 0.73954892, "memory(GiB)": 147.13, "step": 66670, "train_speed(iter/s)": 0.200794 }, { "acc": 0.78622789, "epoch": 1.5556948920223392, "grad_norm": 7.71875, "learning_rate": 1.2391562300242044e-06, "loss": 0.79023104, "memory(GiB)": 147.13, "step": 66680, "train_speed(iter/s)": 0.200809 }, { "acc": 0.77736959, "epoch": 1.5559281995946281, "grad_norm": 4.21875, "learning_rate": 1.2379116295775877e-06, "loss": 0.79935203, "memory(GiB)": 147.13, "step": 66690, "train_speed(iter/s)": 0.200825 }, { "acc": 0.77949438, "epoch": 1.556161507166917, "grad_norm": 6.78125, "learning_rate": 1.236667566167991e-06, "loss": 0.791997, "memory(GiB)": 147.13, "step": 66700, "train_speed(iter/s)": 0.200841 }, { "acc": 0.78041005, "epoch": 1.556394814739206, "grad_norm": 4.53125, "learning_rate": 1.2354240399730083e-06, "loss": 0.78324399, "memory(GiB)": 147.13, "step": 66710, "train_speed(iter/s)": 0.200856 }, { "acc": 0.77618508, "epoch": 1.5566281223114946, "grad_norm": 6.875, "learning_rate": 1.2341810511701485e-06, "loss": 0.80764313, "memory(GiB)": 147.13, "step": 66720, "train_speed(iter/s)": 0.20087 }, { "acc": 0.78065491, "epoch": 1.5568614298837837, "grad_norm": 5.125, "learning_rate": 1.2329385999368509e-06, "loss": 0.80852127, "memory(GiB)": 147.13, "step": 66730, "train_speed(iter/s)": 0.200886 }, { "acc": 0.78683767, "epoch": 1.5570947374560724, "grad_norm": 4.65625, "learning_rate": 1.2316966864504725e-06, "loss": 0.76271248, "memory(GiB)": 147.13, "step": 66740, "train_speed(iter/s)": 0.200901 }, { "acc": 0.76667438, "epoch": 1.5573280450283615, "grad_norm": 6.0625, "learning_rate": 1.230455310888299e-06, "loss": 0.85402393, "memory(GiB)": 147.13, "step": 66750, "train_speed(iter/s)": 0.200918 }, { "acc": 0.80291309, "epoch": 1.5575613526006502, "grad_norm": 5.21875, "learning_rate": 1.2292144734275347e-06, "loss": 0.70809097, "memory(GiB)": 147.13, "step": 66760, "train_speed(iter/s)": 0.200934 }, { "acc": 0.79057379, "epoch": 1.5577946601729393, "grad_norm": 4.46875, "learning_rate": 1.2279741742453088e-06, "loss": 0.75554724, "memory(GiB)": 147.13, "step": 66770, "train_speed(iter/s)": 0.200949 }, { "acc": 0.79762211, "epoch": 1.558027967745228, "grad_norm": 4.1875, "learning_rate": 1.2267344135186743e-06, "loss": 0.72075491, "memory(GiB)": 147.13, "step": 66780, "train_speed(iter/s)": 0.200965 }, { "acc": 0.78778682, "epoch": 1.5582612753175171, "grad_norm": 6.0, "learning_rate": 1.2254951914246038e-06, "loss": 0.75706754, "memory(GiB)": 147.13, "step": 66790, "train_speed(iter/s)": 0.200981 }, { "acc": 0.77799454, "epoch": 1.5584945828898058, "grad_norm": 4.5, "learning_rate": 1.2242565081400004e-06, "loss": 0.80059433, "memory(GiB)": 147.13, "step": 66800, "train_speed(iter/s)": 0.200996 }, { "acc": 0.78043733, "epoch": 1.558727890462095, "grad_norm": 7.78125, "learning_rate": 1.223018363841682e-06, "loss": 0.78616896, "memory(GiB)": 147.13, "step": 66810, "train_speed(iter/s)": 0.201012 }, { "acc": 0.78293238, "epoch": 1.5589611980343836, "grad_norm": 6.84375, "learning_rate": 1.2217807587063962e-06, "loss": 0.77822418, "memory(GiB)": 147.13, "step": 66820, "train_speed(iter/s)": 0.201029 }, { "acc": 0.75709181, "epoch": 1.5591945056066727, "grad_norm": 5.3125, "learning_rate": 1.2205436929108093e-06, "loss": 0.87782154, "memory(GiB)": 147.13, "step": 66830, "train_speed(iter/s)": 0.201044 }, { "acc": 0.78140383, "epoch": 1.5594278131789614, "grad_norm": 5.4375, "learning_rate": 1.2193071666315114e-06, "loss": 0.77247772, "memory(GiB)": 147.13, "step": 66840, "train_speed(iter/s)": 0.201061 }, { "acc": 0.77636423, "epoch": 1.5596611207512505, "grad_norm": 4.6875, "learning_rate": 1.2180711800450184e-06, "loss": 0.79241691, "memory(GiB)": 147.13, "step": 66850, "train_speed(iter/s)": 0.201076 }, { "acc": 0.76072507, "epoch": 1.5598944283235392, "grad_norm": 6.34375, "learning_rate": 1.2168357333277641e-06, "loss": 0.85695286, "memory(GiB)": 147.13, "step": 66860, "train_speed(iter/s)": 0.201091 }, { "acc": 0.80281935, "epoch": 1.5601277358958283, "grad_norm": 4.09375, "learning_rate": 1.215600826656113e-06, "loss": 0.72951822, "memory(GiB)": 147.13, "step": 66870, "train_speed(iter/s)": 0.201107 }, { "acc": 0.77838449, "epoch": 1.560361043468117, "grad_norm": 4.65625, "learning_rate": 1.2143664602063415e-06, "loss": 0.79508176, "memory(GiB)": 147.13, "step": 66880, "train_speed(iter/s)": 0.201123 }, { "acc": 0.78288064, "epoch": 1.5605943510404061, "grad_norm": 6.15625, "learning_rate": 1.2131326341546596e-06, "loss": 0.78689842, "memory(GiB)": 147.13, "step": 66890, "train_speed(iter/s)": 0.201138 }, { "acc": 0.7747613, "epoch": 1.5608276586126948, "grad_norm": 5.5, "learning_rate": 1.2118993486771924e-06, "loss": 0.80352983, "memory(GiB)": 147.13, "step": 66900, "train_speed(iter/s)": 0.201153 }, { "acc": 0.78817844, "epoch": 1.5610609661849837, "grad_norm": 6.28125, "learning_rate": 1.2106666039499942e-06, "loss": 0.77384362, "memory(GiB)": 147.13, "step": 66910, "train_speed(iter/s)": 0.201168 }, { "acc": 0.75901203, "epoch": 1.5612942737572726, "grad_norm": 5.40625, "learning_rate": 1.2094344001490383e-06, "loss": 0.88302488, "memory(GiB)": 147.13, "step": 66920, "train_speed(iter/s)": 0.201185 }, { "acc": 0.78414526, "epoch": 1.5615275813295615, "grad_norm": 6.0625, "learning_rate": 1.2082027374502181e-06, "loss": 0.79911737, "memory(GiB)": 147.13, "step": 66930, "train_speed(iter/s)": 0.201201 }, { "acc": 0.77624192, "epoch": 1.5617608889018504, "grad_norm": 5.9375, "learning_rate": 1.2069716160293577e-06, "loss": 0.80140715, "memory(GiB)": 147.13, "step": 66940, "train_speed(iter/s)": 0.201216 }, { "acc": 0.7758502, "epoch": 1.5619941964741393, "grad_norm": 5.0625, "learning_rate": 1.2057410360621952e-06, "loss": 0.80642414, "memory(GiB)": 147.13, "step": 66950, "train_speed(iter/s)": 0.201233 }, { "acc": 0.76256318, "epoch": 1.5622275040464282, "grad_norm": 10.5625, "learning_rate": 1.2045109977243996e-06, "loss": 0.84569578, "memory(GiB)": 147.13, "step": 66960, "train_speed(iter/s)": 0.201249 }, { "acc": 0.79698315, "epoch": 1.562460811618717, "grad_norm": 5.59375, "learning_rate": 1.203281501191556e-06, "loss": 0.72654285, "memory(GiB)": 147.13, "step": 66970, "train_speed(iter/s)": 0.201265 }, { "acc": 0.78373365, "epoch": 1.562694119191006, "grad_norm": 7.03125, "learning_rate": 1.2020525466391758e-06, "loss": 0.76392503, "memory(GiB)": 147.13, "step": 66980, "train_speed(iter/s)": 0.20128 }, { "acc": 0.79392319, "epoch": 1.5629274267632949, "grad_norm": 7.875, "learning_rate": 1.2008241342426907e-06, "loss": 0.71769762, "memory(GiB)": 147.13, "step": 66990, "train_speed(iter/s)": 0.201296 }, { "acc": 0.79626703, "epoch": 1.5631607343355838, "grad_norm": 6.46875, "learning_rate": 1.1995962641774556e-06, "loss": 0.71886544, "memory(GiB)": 147.13, "step": 67000, "train_speed(iter/s)": 0.201311 }, { "epoch": 1.5631607343355838, "eval_acc": 0.7446565875089741, "eval_loss": 0.80445796251297, "eval_runtime": 1269.6632, "eval_samples_per_second": 28.347, "eval_steps_per_second": 14.174, "step": 67000 }, { "acc": 0.7860218, "epoch": 1.5633940419078727, "grad_norm": 5.0, "learning_rate": 1.1983689366187512e-06, "loss": 0.77764053, "memory(GiB)": 147.13, "step": 67010, "train_speed(iter/s)": 0.200547 }, { "acc": 0.77029128, "epoch": 1.5636273494801616, "grad_norm": 5.8125, "learning_rate": 1.1971421517417748e-06, "loss": 0.85604916, "memory(GiB)": 147.13, "step": 67020, "train_speed(iter/s)": 0.200564 }, { "acc": 0.78226252, "epoch": 1.5638606570524505, "grad_norm": 5.09375, "learning_rate": 1.1959159097216533e-06, "loss": 0.78034925, "memory(GiB)": 147.13, "step": 67030, "train_speed(iter/s)": 0.200579 }, { "acc": 0.78731737, "epoch": 1.5640939646247394, "grad_norm": 6.03125, "learning_rate": 1.1946902107334308e-06, "loss": 0.7751092, "memory(GiB)": 147.13, "step": 67040, "train_speed(iter/s)": 0.200595 }, { "acc": 0.77418489, "epoch": 1.5643272721970283, "grad_norm": 6.65625, "learning_rate": 1.1934650549520737e-06, "loss": 0.81517506, "memory(GiB)": 147.13, "step": 67050, "train_speed(iter/s)": 0.200611 }, { "acc": 0.7982223, "epoch": 1.5645605797693172, "grad_norm": 5.6875, "learning_rate": 1.1922404425524753e-06, "loss": 0.72336531, "memory(GiB)": 147.13, "step": 67060, "train_speed(iter/s)": 0.200626 }, { "acc": 0.78747759, "epoch": 1.564793887341606, "grad_norm": 5.03125, "learning_rate": 1.1910163737094465e-06, "loss": 0.75430288, "memory(GiB)": 147.13, "step": 67070, "train_speed(iter/s)": 0.200641 }, { "acc": 0.78929777, "epoch": 1.565027194913895, "grad_norm": 5.21875, "learning_rate": 1.189792848597725e-06, "loss": 0.77023878, "memory(GiB)": 147.13, "step": 67080, "train_speed(iter/s)": 0.200657 }, { "acc": 0.75839481, "epoch": 1.5652605024861839, "grad_norm": 6.5, "learning_rate": 1.1885698673919666e-06, "loss": 0.86226034, "memory(GiB)": 147.13, "step": 67090, "train_speed(iter/s)": 0.200672 }, { "acc": 0.77864447, "epoch": 1.5654938100584728, "grad_norm": 5.59375, "learning_rate": 1.1873474302667548e-06, "loss": 0.81831455, "memory(GiB)": 147.13, "step": 67100, "train_speed(iter/s)": 0.200688 }, { "acc": 0.77476091, "epoch": 1.5657271176307614, "grad_norm": 5.625, "learning_rate": 1.186125537396587e-06, "loss": 0.80706577, "memory(GiB)": 147.13, "step": 67110, "train_speed(iter/s)": 0.200704 }, { "acc": 0.76268578, "epoch": 1.5659604252030506, "grad_norm": 4.84375, "learning_rate": 1.1849041889558922e-06, "loss": 0.85323725, "memory(GiB)": 147.13, "step": 67120, "train_speed(iter/s)": 0.200718 }, { "acc": 0.78476624, "epoch": 1.5661937327753392, "grad_norm": 4.375, "learning_rate": 1.1836833851190161e-06, "loss": 0.78540063, "memory(GiB)": 147.13, "step": 67130, "train_speed(iter/s)": 0.200733 }, { "acc": 0.79616303, "epoch": 1.5664270403476284, "grad_norm": 6.25, "learning_rate": 1.1824631260602266e-06, "loss": 0.71438951, "memory(GiB)": 147.13, "step": 67140, "train_speed(iter/s)": 0.200749 }, { "acc": 0.79416599, "epoch": 1.566660347919917, "grad_norm": 4.59375, "learning_rate": 1.1812434119537187e-06, "loss": 0.74181514, "memory(GiB)": 147.13, "step": 67150, "train_speed(iter/s)": 0.200765 }, { "acc": 0.78638306, "epoch": 1.5668936554922062, "grad_norm": 8.25, "learning_rate": 1.1800242429736025e-06, "loss": 0.75746689, "memory(GiB)": 147.13, "step": 67160, "train_speed(iter/s)": 0.200781 }, { "acc": 0.76691604, "epoch": 1.5671269630644948, "grad_norm": 4.84375, "learning_rate": 1.1788056192939173e-06, "loss": 0.85675087, "memory(GiB)": 147.13, "step": 67170, "train_speed(iter/s)": 0.200796 }, { "acc": 0.78023224, "epoch": 1.567360270636784, "grad_norm": 5.75, "learning_rate": 1.1775875410886206e-06, "loss": 0.80645647, "memory(GiB)": 147.13, "step": 67180, "train_speed(iter/s)": 0.200811 }, { "acc": 0.80352058, "epoch": 1.5675935782090726, "grad_norm": 3.640625, "learning_rate": 1.17637000853159e-06, "loss": 0.68619394, "memory(GiB)": 147.13, "step": 67190, "train_speed(iter/s)": 0.200826 }, { "acc": 0.77900271, "epoch": 1.5678268857813618, "grad_norm": 4.90625, "learning_rate": 1.1751530217966312e-06, "loss": 0.79527159, "memory(GiB)": 147.13, "step": 67200, "train_speed(iter/s)": 0.200842 }, { "acc": 0.79193454, "epoch": 1.5680601933536504, "grad_norm": 9.1875, "learning_rate": 1.1739365810574677e-06, "loss": 0.74616985, "memory(GiB)": 147.13, "step": 67210, "train_speed(iter/s)": 0.200857 }, { "acc": 0.79721117, "epoch": 1.5682935009259396, "grad_norm": 9.0, "learning_rate": 1.1727206864877456e-06, "loss": 0.72338243, "memory(GiB)": 147.13, "step": 67220, "train_speed(iter/s)": 0.200872 }, { "acc": 0.77702656, "epoch": 1.5685268084982282, "grad_norm": 4.8125, "learning_rate": 1.1715053382610325e-06, "loss": 0.79841452, "memory(GiB)": 147.13, "step": 67230, "train_speed(iter/s)": 0.200888 }, { "acc": 0.74713612, "epoch": 1.5687601160705174, "grad_norm": 4.5625, "learning_rate": 1.170290536550821e-06, "loss": 0.89974098, "memory(GiB)": 147.13, "step": 67240, "train_speed(iter/s)": 0.200905 }, { "acc": 0.77886939, "epoch": 1.568993423642806, "grad_norm": 3.78125, "learning_rate": 1.1690762815305224e-06, "loss": 0.80204468, "memory(GiB)": 147.13, "step": 67250, "train_speed(iter/s)": 0.20092 }, { "acc": 0.77801399, "epoch": 1.5692267312150951, "grad_norm": 5.78125, "learning_rate": 1.1678625733734722e-06, "loss": 0.79122267, "memory(GiB)": 147.13, "step": 67260, "train_speed(iter/s)": 0.200936 }, { "acc": 0.77463903, "epoch": 1.5694600387873838, "grad_norm": 5.875, "learning_rate": 1.166649412252927e-06, "loss": 0.80886192, "memory(GiB)": 147.13, "step": 67270, "train_speed(iter/s)": 0.200952 }, { "acc": 0.81241455, "epoch": 1.569693346359673, "grad_norm": 4.8125, "learning_rate": 1.1654367983420628e-06, "loss": 0.65604534, "memory(GiB)": 147.13, "step": 67280, "train_speed(iter/s)": 0.200968 }, { "acc": 0.78449936, "epoch": 1.5699266539319616, "grad_norm": 5.0, "learning_rate": 1.1642247318139837e-06, "loss": 0.77122612, "memory(GiB)": 147.13, "step": 67290, "train_speed(iter/s)": 0.200984 }, { "acc": 0.77493362, "epoch": 1.5701599615042505, "grad_norm": 5.46875, "learning_rate": 1.1630132128417083e-06, "loss": 0.80600281, "memory(GiB)": 147.13, "step": 67300, "train_speed(iter/s)": 0.200999 }, { "acc": 0.7923779, "epoch": 1.5703932690765394, "grad_norm": 6.875, "learning_rate": 1.1618022415981827e-06, "loss": 0.7523242, "memory(GiB)": 147.13, "step": 67310, "train_speed(iter/s)": 0.201015 }, { "acc": 0.7808217, "epoch": 1.5706265766488283, "grad_norm": 5.9375, "learning_rate": 1.1605918182562731e-06, "loss": 0.784408, "memory(GiB)": 147.13, "step": 67320, "train_speed(iter/s)": 0.20103 }, { "acc": 0.77961884, "epoch": 1.5708598842211172, "grad_norm": 5.96875, "learning_rate": 1.1593819429887655e-06, "loss": 0.78548775, "memory(GiB)": 147.13, "step": 67330, "train_speed(iter/s)": 0.201046 }, { "acc": 0.79387894, "epoch": 1.5710931917934061, "grad_norm": 5.25, "learning_rate": 1.1581726159683698e-06, "loss": 0.73550348, "memory(GiB)": 147.13, "step": 67340, "train_speed(iter/s)": 0.201061 }, { "acc": 0.77411294, "epoch": 1.571326499365695, "grad_norm": 17.125, "learning_rate": 1.1569638373677162e-06, "loss": 0.80191069, "memory(GiB)": 147.13, "step": 67350, "train_speed(iter/s)": 0.201076 }, { "acc": 0.77730083, "epoch": 1.571559806937984, "grad_norm": 5.03125, "learning_rate": 1.1557556073593595e-06, "loss": 0.79704728, "memory(GiB)": 147.13, "step": 67360, "train_speed(iter/s)": 0.201092 }, { "acc": 0.77600565, "epoch": 1.5717931145102728, "grad_norm": 5.875, "learning_rate": 1.1545479261157715e-06, "loss": 0.80014782, "memory(GiB)": 147.13, "step": 67370, "train_speed(iter/s)": 0.201107 }, { "acc": 0.76335773, "epoch": 1.5720264220825617, "grad_norm": 5.9375, "learning_rate": 1.1533407938093515e-06, "loss": 0.8461832, "memory(GiB)": 147.13, "step": 67380, "train_speed(iter/s)": 0.201123 }, { "acc": 0.7748148, "epoch": 1.5722597296548506, "grad_norm": 5.34375, "learning_rate": 1.1521342106124145e-06, "loss": 0.80657759, "memory(GiB)": 147.13, "step": 67390, "train_speed(iter/s)": 0.201138 }, { "acc": 0.80296421, "epoch": 1.5724930372271395, "grad_norm": 4.21875, "learning_rate": 1.1509281766972026e-06, "loss": 0.6991787, "memory(GiB)": 147.13, "step": 67400, "train_speed(iter/s)": 0.201153 }, { "acc": 0.78165808, "epoch": 1.5727263447994284, "grad_norm": 6.0, "learning_rate": 1.149722692235875e-06, "loss": 0.79229851, "memory(GiB)": 147.13, "step": 67410, "train_speed(iter/s)": 0.201168 }, { "acc": 0.77584095, "epoch": 1.5729596523717173, "grad_norm": 5.46875, "learning_rate": 1.1485177574005134e-06, "loss": 0.79919806, "memory(GiB)": 147.13, "step": 67420, "train_speed(iter/s)": 0.201184 }, { "acc": 0.77663813, "epoch": 1.5731929599440062, "grad_norm": 6.40625, "learning_rate": 1.1473133723631241e-06, "loss": 0.83092327, "memory(GiB)": 147.13, "step": 67430, "train_speed(iter/s)": 0.2012 }, { "acc": 0.761483, "epoch": 1.573426267516295, "grad_norm": 4.53125, "learning_rate": 1.1461095372956322e-06, "loss": 0.86643829, "memory(GiB)": 147.13, "step": 67440, "train_speed(iter/s)": 0.201216 }, { "acc": 0.76875048, "epoch": 1.573659575088584, "grad_norm": 3.765625, "learning_rate": 1.1449062523698839e-06, "loss": 0.82976036, "memory(GiB)": 147.13, "step": 67450, "train_speed(iter/s)": 0.201231 }, { "acc": 0.77960482, "epoch": 1.573892882660873, "grad_norm": 5.375, "learning_rate": 1.1437035177576467e-06, "loss": 0.79715147, "memory(GiB)": 147.13, "step": 67460, "train_speed(iter/s)": 0.201245 }, { "acc": 0.77590551, "epoch": 1.5741261902331618, "grad_norm": 4.9375, "learning_rate": 1.142501333630614e-06, "loss": 0.79287214, "memory(GiB)": 147.13, "step": 67470, "train_speed(iter/s)": 0.201262 }, { "acc": 0.7959466, "epoch": 1.5743594978054507, "grad_norm": 3.984375, "learning_rate": 1.1412997001603947e-06, "loss": 0.74863319, "memory(GiB)": 147.13, "step": 67480, "train_speed(iter/s)": 0.201277 }, { "acc": 0.80535631, "epoch": 1.5745928053777396, "grad_norm": 35.5, "learning_rate": 1.1400986175185214e-06, "loss": 0.70440435, "memory(GiB)": 147.13, "step": 67490, "train_speed(iter/s)": 0.201291 }, { "acc": 0.78224182, "epoch": 1.5748261129500283, "grad_norm": 5.3125, "learning_rate": 1.1388980858764504e-06, "loss": 0.76127481, "memory(GiB)": 147.13, "step": 67500, "train_speed(iter/s)": 0.201306 }, { "epoch": 1.5748261129500283, "eval_acc": 0.74470014674016, "eval_loss": 0.804486095905304, "eval_runtime": 1270.1666, "eval_samples_per_second": 28.336, "eval_steps_per_second": 14.168, "step": 67500 }, { "acc": 0.78972077, "epoch": 1.5750594205223174, "grad_norm": 5.1875, "learning_rate": 1.1376981054055542e-06, "loss": 0.75714273, "memory(GiB)": 147.13, "step": 67510, "train_speed(iter/s)": 0.200548 }, { "acc": 0.75635891, "epoch": 1.575292728094606, "grad_norm": 5.65625, "learning_rate": 1.136498676277133e-06, "loss": 0.8773056, "memory(GiB)": 147.13, "step": 67520, "train_speed(iter/s)": 0.200564 }, { "acc": 0.78398147, "epoch": 1.5755260356668952, "grad_norm": 4.59375, "learning_rate": 1.135299798662402e-06, "loss": 0.76907773, "memory(GiB)": 147.13, "step": 67530, "train_speed(iter/s)": 0.200579 }, { "acc": 0.77541685, "epoch": 1.5757593432391839, "grad_norm": 5.28125, "learning_rate": 1.1341014727325038e-06, "loss": 0.83951645, "memory(GiB)": 147.13, "step": 67540, "train_speed(iter/s)": 0.200594 }, { "acc": 0.77365732, "epoch": 1.575992650811473, "grad_norm": 4.9375, "learning_rate": 1.1329036986584968e-06, "loss": 0.80199337, "memory(GiB)": 147.13, "step": 67550, "train_speed(iter/s)": 0.20061 }, { "acc": 0.77182055, "epoch": 1.5762259583837617, "grad_norm": 10.0, "learning_rate": 1.131706476611364e-06, "loss": 0.84103651, "memory(GiB)": 147.13, "step": 67560, "train_speed(iter/s)": 0.200624 }, { "acc": 0.78937941, "epoch": 1.5764592659560508, "grad_norm": 5.375, "learning_rate": 1.1305098067620074e-06, "loss": 0.74375906, "memory(GiB)": 147.13, "step": 67570, "train_speed(iter/s)": 0.200638 }, { "acc": 0.78117437, "epoch": 1.5766925735283395, "grad_norm": 5.3125, "learning_rate": 1.1293136892812507e-06, "loss": 0.79897671, "memory(GiB)": 147.13, "step": 67580, "train_speed(iter/s)": 0.200653 }, { "acc": 0.78500509, "epoch": 1.5769258811006286, "grad_norm": 4.625, "learning_rate": 1.1281181243398414e-06, "loss": 0.76446457, "memory(GiB)": 147.13, "step": 67590, "train_speed(iter/s)": 0.200668 }, { "acc": 0.76707001, "epoch": 1.5771591886729173, "grad_norm": 4.9375, "learning_rate": 1.1269231121084439e-06, "loss": 0.83135767, "memory(GiB)": 147.13, "step": 67600, "train_speed(iter/s)": 0.200683 }, { "acc": 0.78357925, "epoch": 1.5773924962452064, "grad_norm": 5.6875, "learning_rate": 1.1257286527576488e-06, "loss": 0.77646589, "memory(GiB)": 147.13, "step": 67610, "train_speed(iter/s)": 0.200698 }, { "acc": 0.76686993, "epoch": 1.577625803817495, "grad_norm": 6.46875, "learning_rate": 1.1245347464579626e-06, "loss": 0.83330984, "memory(GiB)": 147.13, "step": 67620, "train_speed(iter/s)": 0.200714 }, { "acc": 0.77705078, "epoch": 1.5778591113897842, "grad_norm": 5.1875, "learning_rate": 1.1233413933798143e-06, "loss": 0.80012417, "memory(GiB)": 147.13, "step": 67630, "train_speed(iter/s)": 0.200729 }, { "acc": 0.78694525, "epoch": 1.5780924189620729, "grad_norm": 6.53125, "learning_rate": 1.1221485936935571e-06, "loss": 0.75556831, "memory(GiB)": 147.13, "step": 67640, "train_speed(iter/s)": 0.200742 }, { "acc": 0.7862113, "epoch": 1.578325726534362, "grad_norm": 4.59375, "learning_rate": 1.1209563475694608e-06, "loss": 0.77506971, "memory(GiB)": 147.13, "step": 67650, "train_speed(iter/s)": 0.200755 }, { "acc": 0.76903219, "epoch": 1.5785590341066507, "grad_norm": 6.15625, "learning_rate": 1.1197646551777196e-06, "loss": 0.83937769, "memory(GiB)": 147.13, "step": 67660, "train_speed(iter/s)": 0.20077 }, { "acc": 0.78912687, "epoch": 1.5787923416789398, "grad_norm": 4.53125, "learning_rate": 1.118573516688447e-06, "loss": 0.75033379, "memory(GiB)": 147.13, "step": 67670, "train_speed(iter/s)": 0.200786 }, { "acc": 0.7752059, "epoch": 1.5790256492512285, "grad_norm": 4.625, "learning_rate": 1.1173829322716774e-06, "loss": 0.80030231, "memory(GiB)": 147.13, "step": 67680, "train_speed(iter/s)": 0.200801 }, { "acc": 0.77555189, "epoch": 1.5792589568235174, "grad_norm": 4.8125, "learning_rate": 1.116192902097365e-06, "loss": 0.79320483, "memory(GiB)": 147.13, "step": 67690, "train_speed(iter/s)": 0.200816 }, { "acc": 0.77827835, "epoch": 1.5794922643958063, "grad_norm": 8.25, "learning_rate": 1.1150034263353887e-06, "loss": 0.78702092, "memory(GiB)": 147.13, "step": 67700, "train_speed(iter/s)": 0.200832 }, { "acc": 0.78876238, "epoch": 1.5797255719680952, "grad_norm": 4.40625, "learning_rate": 1.113814505155545e-06, "loss": 0.75948009, "memory(GiB)": 147.13, "step": 67710, "train_speed(iter/s)": 0.200848 }, { "acc": 0.78561726, "epoch": 1.579958879540384, "grad_norm": 4.3125, "learning_rate": 1.11262613872755e-06, "loss": 0.78061628, "memory(GiB)": 147.13, "step": 67720, "train_speed(iter/s)": 0.200863 }, { "acc": 0.76869831, "epoch": 1.580192187112673, "grad_norm": 4.6875, "learning_rate": 1.111438327221046e-06, "loss": 0.84357386, "memory(GiB)": 147.13, "step": 67730, "train_speed(iter/s)": 0.200878 }, { "acc": 0.79987097, "epoch": 1.5804254946849619, "grad_norm": 4.78125, "learning_rate": 1.1102510708055897e-06, "loss": 0.71598945, "memory(GiB)": 147.13, "step": 67740, "train_speed(iter/s)": 0.200893 }, { "acc": 0.76315088, "epoch": 1.5806588022572508, "grad_norm": 6.78125, "learning_rate": 1.1090643696506648e-06, "loss": 0.87976942, "memory(GiB)": 147.13, "step": 67750, "train_speed(iter/s)": 0.200909 }, { "acc": 0.79254284, "epoch": 1.5808921098295396, "grad_norm": 4.34375, "learning_rate": 1.1078782239256707e-06, "loss": 0.71899223, "memory(GiB)": 147.13, "step": 67760, "train_speed(iter/s)": 0.200924 }, { "acc": 0.78601794, "epoch": 1.5811254174018285, "grad_norm": 4.875, "learning_rate": 1.106692633799928e-06, "loss": 0.75329552, "memory(GiB)": 147.13, "step": 67770, "train_speed(iter/s)": 0.20094 }, { "acc": 0.77427597, "epoch": 1.5813587249741174, "grad_norm": 6.0625, "learning_rate": 1.1055075994426833e-06, "loss": 0.818151, "memory(GiB)": 147.13, "step": 67780, "train_speed(iter/s)": 0.200955 }, { "acc": 0.79199877, "epoch": 1.5815920325464063, "grad_norm": 6.875, "learning_rate": 1.1043231210230949e-06, "loss": 0.73157892, "memory(GiB)": 147.13, "step": 67790, "train_speed(iter/s)": 0.20097 }, { "acc": 0.77825437, "epoch": 1.5818253401186952, "grad_norm": 6.09375, "learning_rate": 1.1031391987102502e-06, "loss": 0.78899622, "memory(GiB)": 147.13, "step": 67800, "train_speed(iter/s)": 0.200985 }, { "acc": 0.7796669, "epoch": 1.5820586476909841, "grad_norm": 7.0625, "learning_rate": 1.1019558326731522e-06, "loss": 0.78667364, "memory(GiB)": 147.13, "step": 67810, "train_speed(iter/s)": 0.201 }, { "acc": 0.76001234, "epoch": 1.582291955263273, "grad_norm": 6.125, "learning_rate": 1.100773023080728e-06, "loss": 0.8563343, "memory(GiB)": 147.13, "step": 67820, "train_speed(iter/s)": 0.201016 }, { "acc": 0.76715779, "epoch": 1.582525262835562, "grad_norm": 6.3125, "learning_rate": 1.09959077010182e-06, "loss": 0.83880901, "memory(GiB)": 147.13, "step": 67830, "train_speed(iter/s)": 0.201032 }, { "acc": 0.77683773, "epoch": 1.5827585704078508, "grad_norm": 4.09375, "learning_rate": 1.0984090739051984e-06, "loss": 0.79647036, "memory(GiB)": 147.13, "step": 67840, "train_speed(iter/s)": 0.201048 }, { "acc": 0.77974472, "epoch": 1.5829918779801397, "grad_norm": 4.6875, "learning_rate": 1.0972279346595477e-06, "loss": 0.78166647, "memory(GiB)": 147.13, "step": 67850, "train_speed(iter/s)": 0.201063 }, { "acc": 0.78405037, "epoch": 1.5832251855524286, "grad_norm": 5.03125, "learning_rate": 1.0960473525334747e-06, "loss": 0.76982818, "memory(GiB)": 147.13, "step": 67860, "train_speed(iter/s)": 0.201077 }, { "acc": 0.76590328, "epoch": 1.5834584931247173, "grad_norm": 4.3125, "learning_rate": 1.094867327695509e-06, "loss": 0.85223007, "memory(GiB)": 147.13, "step": 67870, "train_speed(iter/s)": 0.201092 }, { "acc": 0.79253726, "epoch": 1.5836918006970064, "grad_norm": 4.71875, "learning_rate": 1.0936878603140966e-06, "loss": 0.72532487, "memory(GiB)": 147.13, "step": 67880, "train_speed(iter/s)": 0.201108 }, { "acc": 0.79152641, "epoch": 1.5839251082692951, "grad_norm": 5.0625, "learning_rate": 1.0925089505576085e-06, "loss": 0.75220585, "memory(GiB)": 147.13, "step": 67890, "train_speed(iter/s)": 0.201123 }, { "acc": 0.77057228, "epoch": 1.5841584158415842, "grad_norm": 5.9375, "learning_rate": 1.0913305985943328e-06, "loss": 0.83439999, "memory(GiB)": 147.13, "step": 67900, "train_speed(iter/s)": 0.201138 }, { "acc": 0.78468051, "epoch": 1.584391723413873, "grad_norm": 7.0, "learning_rate": 1.0901528045924786e-06, "loss": 0.75254793, "memory(GiB)": 147.13, "step": 67910, "train_speed(iter/s)": 0.201152 }, { "acc": 0.77283545, "epoch": 1.584625030986162, "grad_norm": 4.5, "learning_rate": 1.0889755687201758e-06, "loss": 0.82278538, "memory(GiB)": 147.13, "step": 67920, "train_speed(iter/s)": 0.201168 }, { "acc": 0.78898163, "epoch": 1.5848583385584507, "grad_norm": 5.46875, "learning_rate": 1.087798891145473e-06, "loss": 0.75745392, "memory(GiB)": 147.13, "step": 67930, "train_speed(iter/s)": 0.201184 }, { "acc": 0.75619049, "epoch": 1.5850916461307398, "grad_norm": 6.5, "learning_rate": 1.0866227720363431e-06, "loss": 0.86945744, "memory(GiB)": 147.13, "step": 67940, "train_speed(iter/s)": 0.2012 }, { "acc": 0.78170376, "epoch": 1.5853249537030285, "grad_norm": 5.8125, "learning_rate": 1.0854472115606745e-06, "loss": 0.79862347, "memory(GiB)": 147.13, "step": 67950, "train_speed(iter/s)": 0.201216 }, { "acc": 0.80022554, "epoch": 1.5855582612753176, "grad_norm": 5.4375, "learning_rate": 1.0842722098862813e-06, "loss": 0.70132508, "memory(GiB)": 147.13, "step": 67960, "train_speed(iter/s)": 0.201231 }, { "acc": 0.79483395, "epoch": 1.5857915688476063, "grad_norm": 4.8125, "learning_rate": 1.0830977671808918e-06, "loss": 0.74464898, "memory(GiB)": 147.13, "step": 67970, "train_speed(iter/s)": 0.201246 }, { "acc": 0.7618938, "epoch": 1.5860248764198954, "grad_norm": 5.9375, "learning_rate": 1.081923883612157e-06, "loss": 0.85374565, "memory(GiB)": 147.13, "step": 67980, "train_speed(iter/s)": 0.201261 }, { "acc": 0.77546587, "epoch": 1.586258183992184, "grad_norm": 5.1875, "learning_rate": 1.080750559347651e-06, "loss": 0.79738269, "memory(GiB)": 147.13, "step": 67990, "train_speed(iter/s)": 0.201277 }, { "acc": 0.78763037, "epoch": 1.5864914915644732, "grad_norm": 4.5625, "learning_rate": 1.0795777945548624e-06, "loss": 0.78351851, "memory(GiB)": 147.13, "step": 68000, "train_speed(iter/s)": 0.201292 }, { "epoch": 1.5864914915644732, "eval_acc": 0.7446918192400804, "eval_loss": 0.8044730424880981, "eval_runtime": 1269.9877, "eval_samples_per_second": 28.34, "eval_steps_per_second": 14.17, "step": 68000 }, { "acc": 0.78458452, "epoch": 1.586724799136762, "grad_norm": 5.96875, "learning_rate": 1.078405589401208e-06, "loss": 0.77951698, "memory(GiB)": 147.13, "step": 68010, "train_speed(iter/s)": 0.200538 }, { "acc": 0.77816787, "epoch": 1.586958106709051, "grad_norm": 7.375, "learning_rate": 1.0772339440540135e-06, "loss": 0.77889547, "memory(GiB)": 147.13, "step": 68020, "train_speed(iter/s)": 0.200553 }, { "acc": 0.77970071, "epoch": 1.5871914142813397, "grad_norm": 5.6875, "learning_rate": 1.076062858680535e-06, "loss": 0.78750424, "memory(GiB)": 147.13, "step": 68030, "train_speed(iter/s)": 0.200567 }, { "acc": 0.79411736, "epoch": 1.5874247218536288, "grad_norm": 3.90625, "learning_rate": 1.0748923334479427e-06, "loss": 0.73421063, "memory(GiB)": 147.13, "step": 68040, "train_speed(iter/s)": 0.200582 }, { "acc": 0.78528967, "epoch": 1.5876580294259175, "grad_norm": 5.4375, "learning_rate": 1.0737223685233306e-06, "loss": 0.76053562, "memory(GiB)": 147.13, "step": 68050, "train_speed(iter/s)": 0.200597 }, { "acc": 0.77646742, "epoch": 1.5878913369982064, "grad_norm": 4.75, "learning_rate": 1.0725529640737098e-06, "loss": 0.80411501, "memory(GiB)": 147.13, "step": 68060, "train_speed(iter/s)": 0.200612 }, { "acc": 0.76218376, "epoch": 1.5881246445704953, "grad_norm": 5.15625, "learning_rate": 1.0713841202660114e-06, "loss": 0.86731377, "memory(GiB)": 147.13, "step": 68070, "train_speed(iter/s)": 0.200625 }, { "acc": 0.77366896, "epoch": 1.5883579521427842, "grad_norm": 5.0, "learning_rate": 1.0702158372670895e-06, "loss": 0.80369129, "memory(GiB)": 147.13, "step": 68080, "train_speed(iter/s)": 0.200641 }, { "acc": 0.77165804, "epoch": 1.588591259715073, "grad_norm": 5.0, "learning_rate": 1.0690481152437138e-06, "loss": 0.80997429, "memory(GiB)": 147.13, "step": 68090, "train_speed(iter/s)": 0.200655 }, { "acc": 0.7878973, "epoch": 1.588824567287362, "grad_norm": 6.15625, "learning_rate": 1.0678809543625796e-06, "loss": 0.76152925, "memory(GiB)": 147.13, "step": 68100, "train_speed(iter/s)": 0.200671 }, { "acc": 0.77761049, "epoch": 1.5890578748596509, "grad_norm": 6.40625, "learning_rate": 1.0667143547902964e-06, "loss": 0.80845604, "memory(GiB)": 147.13, "step": 68110, "train_speed(iter/s)": 0.200686 }, { "acc": 0.78582354, "epoch": 1.5892911824319398, "grad_norm": 4.0, "learning_rate": 1.065548316693395e-06, "loss": 0.75975571, "memory(GiB)": 147.13, "step": 68120, "train_speed(iter/s)": 0.200702 }, { "acc": 0.78570542, "epoch": 1.5895244900042287, "grad_norm": 4.6875, "learning_rate": 1.0643828402383317e-06, "loss": 0.7738452, "memory(GiB)": 147.13, "step": 68130, "train_speed(iter/s)": 0.200716 }, { "acc": 0.78605022, "epoch": 1.5897577975765176, "grad_norm": 5.3125, "learning_rate": 1.0632179255914716e-06, "loss": 0.76639547, "memory(GiB)": 147.13, "step": 68140, "train_speed(iter/s)": 0.200731 }, { "acc": 0.77231016, "epoch": 1.5899911051488065, "grad_norm": 5.5625, "learning_rate": 1.06205357291911e-06, "loss": 0.79767914, "memory(GiB)": 147.13, "step": 68150, "train_speed(iter/s)": 0.200746 }, { "acc": 0.78238726, "epoch": 1.5902244127210954, "grad_norm": 8.875, "learning_rate": 1.0608897823874565e-06, "loss": 0.76666489, "memory(GiB)": 147.13, "step": 68160, "train_speed(iter/s)": 0.200759 }, { "acc": 0.78485689, "epoch": 1.5904577202933843, "grad_norm": 5.875, "learning_rate": 1.0597265541626428e-06, "loss": 0.77945495, "memory(GiB)": 147.13, "step": 68170, "train_speed(iter/s)": 0.200774 }, { "acc": 0.789116, "epoch": 1.5906910278656732, "grad_norm": 4.625, "learning_rate": 1.0585638884107174e-06, "loss": 0.76332469, "memory(GiB)": 147.13, "step": 68180, "train_speed(iter/s)": 0.200789 }, { "acc": 0.77295685, "epoch": 1.590924335437962, "grad_norm": 5.96875, "learning_rate": 1.0574017852976538e-06, "loss": 0.80912657, "memory(GiB)": 147.13, "step": 68190, "train_speed(iter/s)": 0.200803 }, { "acc": 0.78879251, "epoch": 1.591157643010251, "grad_norm": 6.28125, "learning_rate": 1.0562402449893394e-06, "loss": 0.75802283, "memory(GiB)": 147.13, "step": 68200, "train_speed(iter/s)": 0.20082 }, { "acc": 0.77317877, "epoch": 1.5913909505825399, "grad_norm": 6.1875, "learning_rate": 1.0550792676515836e-06, "loss": 0.81081858, "memory(GiB)": 147.13, "step": 68210, "train_speed(iter/s)": 0.200836 }, { "acc": 0.79010024, "epoch": 1.5916242581548288, "grad_norm": 7.875, "learning_rate": 1.0539188534501176e-06, "loss": 0.75586977, "memory(GiB)": 147.13, "step": 68220, "train_speed(iter/s)": 0.200851 }, { "acc": 0.80472164, "epoch": 1.5918575657271177, "grad_norm": 7.15625, "learning_rate": 1.0527590025505873e-06, "loss": 0.71593409, "memory(GiB)": 147.13, "step": 68230, "train_speed(iter/s)": 0.200866 }, { "acc": 0.78382373, "epoch": 1.5920908732994066, "grad_norm": 5.28125, "learning_rate": 1.051599715118566e-06, "loss": 0.78045077, "memory(GiB)": 147.13, "step": 68240, "train_speed(iter/s)": 0.200881 }, { "acc": 0.79803543, "epoch": 1.5923241808716955, "grad_norm": 6.09375, "learning_rate": 1.0504409913195346e-06, "loss": 0.71405649, "memory(GiB)": 147.13, "step": 68250, "train_speed(iter/s)": 0.200896 }, { "acc": 0.79116182, "epoch": 1.5925574884439841, "grad_norm": 5.09375, "learning_rate": 1.0492828313189064e-06, "loss": 0.73879242, "memory(GiB)": 147.13, "step": 68260, "train_speed(iter/s)": 0.200911 }, { "acc": 0.76173649, "epoch": 1.5927907960162733, "grad_norm": 7.75, "learning_rate": 1.0481252352820064e-06, "loss": 0.85937166, "memory(GiB)": 147.13, "step": 68270, "train_speed(iter/s)": 0.200926 }, { "acc": 0.78311167, "epoch": 1.593024103588562, "grad_norm": 4.4375, "learning_rate": 1.046968203374079e-06, "loss": 0.78482008, "memory(GiB)": 147.13, "step": 68280, "train_speed(iter/s)": 0.20094 }, { "acc": 0.75329967, "epoch": 1.593257411160851, "grad_norm": 7.40625, "learning_rate": 1.0458117357602944e-06, "loss": 0.89326878, "memory(GiB)": 147.13, "step": 68290, "train_speed(iter/s)": 0.200956 }, { "acc": 0.77799082, "epoch": 1.5934907187331397, "grad_norm": 4.40625, "learning_rate": 1.0446558326057342e-06, "loss": 0.79334536, "memory(GiB)": 147.13, "step": 68300, "train_speed(iter/s)": 0.200972 }, { "acc": 0.75523701, "epoch": 1.5937240263054289, "grad_norm": 6.28125, "learning_rate": 1.0435004940754062e-06, "loss": 0.90748978, "memory(GiB)": 147.13, "step": 68310, "train_speed(iter/s)": 0.200987 }, { "acc": 0.80369549, "epoch": 1.5939573338777175, "grad_norm": 6.0, "learning_rate": 1.0423457203342318e-06, "loss": 0.68501949, "memory(GiB)": 147.13, "step": 68320, "train_speed(iter/s)": 0.201001 }, { "acc": 0.78252659, "epoch": 1.5941906414500067, "grad_norm": 5.8125, "learning_rate": 1.0411915115470578e-06, "loss": 0.772118, "memory(GiB)": 147.13, "step": 68330, "train_speed(iter/s)": 0.201017 }, { "acc": 0.78786469, "epoch": 1.5944239490222953, "grad_norm": 6.15625, "learning_rate": 1.0400378678786449e-06, "loss": 0.75693922, "memory(GiB)": 147.13, "step": 68340, "train_speed(iter/s)": 0.201033 }, { "acc": 0.77552619, "epoch": 1.5946572565945845, "grad_norm": 4.3125, "learning_rate": 1.0388847894936765e-06, "loss": 0.80412102, "memory(GiB)": 147.13, "step": 68350, "train_speed(iter/s)": 0.201048 }, { "acc": 0.7692029, "epoch": 1.5948905641668731, "grad_norm": 7.34375, "learning_rate": 1.0377322765567533e-06, "loss": 0.85158634, "memory(GiB)": 147.13, "step": 68360, "train_speed(iter/s)": 0.201064 }, { "acc": 0.79861155, "epoch": 1.5951238717391623, "grad_norm": 5.21875, "learning_rate": 1.0365803292323956e-06, "loss": 0.68969359, "memory(GiB)": 147.13, "step": 68370, "train_speed(iter/s)": 0.201079 }, { "acc": 0.80761919, "epoch": 1.595357179311451, "grad_norm": 4.5, "learning_rate": 1.0354289476850459e-06, "loss": 0.67954183, "memory(GiB)": 147.13, "step": 68380, "train_speed(iter/s)": 0.201094 }, { "acc": 0.79081321, "epoch": 1.59559048688374, "grad_norm": 9.625, "learning_rate": 1.0342781320790606e-06, "loss": 0.74034171, "memory(GiB)": 147.13, "step": 68390, "train_speed(iter/s)": 0.201108 }, { "acc": 0.79297962, "epoch": 1.5958237944560287, "grad_norm": 4.03125, "learning_rate": 1.0331278825787211e-06, "loss": 0.75901136, "memory(GiB)": 147.13, "step": 68400, "train_speed(iter/s)": 0.201124 }, { "acc": 0.78131299, "epoch": 1.5960571020283179, "grad_norm": 6.625, "learning_rate": 1.0319781993482242e-06, "loss": 0.77496767, "memory(GiB)": 147.13, "step": 68410, "train_speed(iter/s)": 0.20114 }, { "acc": 0.77413187, "epoch": 1.5962904096006065, "grad_norm": 7.40625, "learning_rate": 1.0308290825516852e-06, "loss": 0.80257893, "memory(GiB)": 147.13, "step": 68420, "train_speed(iter/s)": 0.201154 }, { "acc": 0.78656511, "epoch": 1.5965237171728957, "grad_norm": 5.96875, "learning_rate": 1.0296805323531435e-06, "loss": 0.79238644, "memory(GiB)": 147.13, "step": 68430, "train_speed(iter/s)": 0.20117 }, { "acc": 0.7843667, "epoch": 1.5967570247451843, "grad_norm": 5.9375, "learning_rate": 1.0285325489165503e-06, "loss": 0.78407121, "memory(GiB)": 147.13, "step": 68440, "train_speed(iter/s)": 0.201187 }, { "acc": 0.81157207, "epoch": 1.5969903323174732, "grad_norm": 5.90625, "learning_rate": 1.0273851324057838e-06, "loss": 0.67407074, "memory(GiB)": 147.13, "step": 68450, "train_speed(iter/s)": 0.201202 }, { "acc": 0.79256735, "epoch": 1.5972236398897621, "grad_norm": 6.03125, "learning_rate": 1.026238282984634e-06, "loss": 0.74679794, "memory(GiB)": 147.13, "step": 68460, "train_speed(iter/s)": 0.201217 }, { "acc": 0.79176388, "epoch": 1.597456947462051, "grad_norm": 9.5625, "learning_rate": 1.025092000816818e-06, "loss": 0.74075713, "memory(GiB)": 147.13, "step": 68470, "train_speed(iter/s)": 0.201231 }, { "acc": 0.78975224, "epoch": 1.59769025503434, "grad_norm": 5.5625, "learning_rate": 1.023946286065961e-06, "loss": 0.74442635, "memory(GiB)": 147.13, "step": 68480, "train_speed(iter/s)": 0.201247 }, { "acc": 0.77964411, "epoch": 1.5979235626066288, "grad_norm": 4.78125, "learning_rate": 1.0228011388956182e-06, "loss": 0.7978488, "memory(GiB)": 147.13, "step": 68490, "train_speed(iter/s)": 0.201262 }, { "acc": 0.78166089, "epoch": 1.5981568701789177, "grad_norm": 5.09375, "learning_rate": 1.0216565594692573e-06, "loss": 0.78926048, "memory(GiB)": 147.13, "step": 68500, "train_speed(iter/s)": 0.201277 }, { "epoch": 1.5981568701789177, "eval_acc": 0.7447232075096114, "eval_loss": 0.8043951988220215, "eval_runtime": 1270.2348, "eval_samples_per_second": 28.334, "eval_steps_per_second": 14.167, "step": 68500 }, { "acc": 0.79227681, "epoch": 1.5983901777512066, "grad_norm": 6.90625, "learning_rate": 1.0205125479502658e-06, "loss": 0.74688244, "memory(GiB)": 147.13, "step": 68510, "train_speed(iter/s)": 0.200531 }, { "acc": 0.78717947, "epoch": 1.5986234853234955, "grad_norm": 5.5625, "learning_rate": 1.0193691045019533e-06, "loss": 0.7674963, "memory(GiB)": 147.13, "step": 68520, "train_speed(iter/s)": 0.200547 }, { "acc": 0.77596292, "epoch": 1.5988567928957844, "grad_norm": 4.5, "learning_rate": 1.0182262292875427e-06, "loss": 0.80983753, "memory(GiB)": 147.13, "step": 68530, "train_speed(iter/s)": 0.200563 }, { "acc": 0.77494078, "epoch": 1.5990901004680733, "grad_norm": 5.6875, "learning_rate": 1.0170839224701834e-06, "loss": 0.81508121, "memory(GiB)": 147.13, "step": 68540, "train_speed(iter/s)": 0.200578 }, { "acc": 0.78155966, "epoch": 1.5993234080403622, "grad_norm": 7.4375, "learning_rate": 1.015942184212937e-06, "loss": 0.81253033, "memory(GiB)": 147.13, "step": 68550, "train_speed(iter/s)": 0.200593 }, { "acc": 0.79896936, "epoch": 1.5995567156126511, "grad_norm": 7.65625, "learning_rate": 1.0148010146787845e-06, "loss": 0.70341702, "memory(GiB)": 148.85, "step": 68560, "train_speed(iter/s)": 0.200606 }, { "acc": 0.77949715, "epoch": 1.59979002318494, "grad_norm": 3.375, "learning_rate": 1.0136604140306312e-06, "loss": 0.7729764, "memory(GiB)": 138.1, "step": 68570, "train_speed(iter/s)": 0.200621 }, { "acc": 0.7776124, "epoch": 1.600023330757229, "grad_norm": 5.875, "learning_rate": 1.0125203824312957e-06, "loss": 0.78736258, "memory(GiB)": 138.1, "step": 68580, "train_speed(iter/s)": 0.200636 }, { "acc": 0.77759681, "epoch": 1.6002566383295178, "grad_norm": 5.875, "learning_rate": 1.0113809200435176e-06, "loss": 0.81777782, "memory(GiB)": 138.1, "step": 68590, "train_speed(iter/s)": 0.200651 }, { "acc": 0.78296614, "epoch": 1.6004899459018067, "grad_norm": 5.5, "learning_rate": 1.010242027029953e-06, "loss": 0.77656908, "memory(GiB)": 138.1, "step": 68600, "train_speed(iter/s)": 0.200667 }, { "acc": 0.77765617, "epoch": 1.6007232534740956, "grad_norm": 4.5, "learning_rate": 1.009103703553181e-06, "loss": 0.81307631, "memory(GiB)": 138.1, "step": 68610, "train_speed(iter/s)": 0.200682 }, { "acc": 0.78029776, "epoch": 1.6009565610463845, "grad_norm": 5.15625, "learning_rate": 1.0079659497756943e-06, "loss": 0.77876968, "memory(GiB)": 138.1, "step": 68620, "train_speed(iter/s)": 0.200697 }, { "acc": 0.77350893, "epoch": 1.6011898686186734, "grad_norm": 4.28125, "learning_rate": 1.0068287658599107e-06, "loss": 0.81584587, "memory(GiB)": 138.1, "step": 68630, "train_speed(iter/s)": 0.200711 }, { "acc": 0.77707477, "epoch": 1.6014231761909623, "grad_norm": 7.65625, "learning_rate": 1.0056921519681605e-06, "loss": 0.79458685, "memory(GiB)": 138.1, "step": 68640, "train_speed(iter/s)": 0.200726 }, { "acc": 0.80541954, "epoch": 1.601656483763251, "grad_norm": 6.09375, "learning_rate": 1.0045561082626936e-06, "loss": 0.68734684, "memory(GiB)": 138.1, "step": 68650, "train_speed(iter/s)": 0.200741 }, { "acc": 0.7977459, "epoch": 1.60188979133554, "grad_norm": 4.1875, "learning_rate": 1.0034206349056829e-06, "loss": 0.723417, "memory(GiB)": 138.1, "step": 68660, "train_speed(iter/s)": 0.200757 }, { "acc": 0.78180084, "epoch": 1.6021230989078288, "grad_norm": 6.5, "learning_rate": 1.002285732059215e-06, "loss": 0.79331684, "memory(GiB)": 138.1, "step": 68670, "train_speed(iter/s)": 0.200771 }, { "acc": 0.79122467, "epoch": 1.602356406480118, "grad_norm": 5.84375, "learning_rate": 1.001151399885298e-06, "loss": 0.74690819, "memory(GiB)": 138.1, "step": 68680, "train_speed(iter/s)": 0.200785 }, { "acc": 0.78143425, "epoch": 1.6025897140524066, "grad_norm": 6.125, "learning_rate": 1.000017638545857e-06, "loss": 0.77264366, "memory(GiB)": 138.1, "step": 68690, "train_speed(iter/s)": 0.200801 }, { "acc": 0.79719687, "epoch": 1.6028230216246957, "grad_norm": 5.90625, "learning_rate": 9.988844482027365e-07, "loss": 0.71532197, "memory(GiB)": 138.1, "step": 68700, "train_speed(iter/s)": 0.200816 }, { "acc": 0.77920446, "epoch": 1.6030563291969844, "grad_norm": 5.6875, "learning_rate": 9.97751829017699e-07, "loss": 0.77345262, "memory(GiB)": 138.1, "step": 68710, "train_speed(iter/s)": 0.200832 }, { "acc": 0.76342015, "epoch": 1.6032896367692735, "grad_norm": 6.75, "learning_rate": 9.966197811524231e-07, "loss": 0.85860195, "memory(GiB)": 138.1, "step": 68720, "train_speed(iter/s)": 0.200846 }, { "acc": 0.77934628, "epoch": 1.6035229443415622, "grad_norm": 6.21875, "learning_rate": 9.954883047685121e-07, "loss": 0.79521642, "memory(GiB)": 138.1, "step": 68730, "train_speed(iter/s)": 0.200862 }, { "acc": 0.781073, "epoch": 1.6037562519138513, "grad_norm": 5.1875, "learning_rate": 9.943574000274814e-07, "loss": 0.78763037, "memory(GiB)": 138.1, "step": 68740, "train_speed(iter/s)": 0.200878 }, { "acc": 0.77881775, "epoch": 1.60398955948614, "grad_norm": 5.59375, "learning_rate": 9.93227067090769e-07, "loss": 0.81242599, "memory(GiB)": 138.1, "step": 68750, "train_speed(iter/s)": 0.200894 }, { "acc": 0.77240114, "epoch": 1.604222867058429, "grad_norm": 5.75, "learning_rate": 9.920973061197291e-07, "loss": 0.80986681, "memory(GiB)": 138.1, "step": 68760, "train_speed(iter/s)": 0.200908 }, { "acc": 0.77040758, "epoch": 1.6044561746307178, "grad_norm": 5.53125, "learning_rate": 9.90968117275633e-07, "loss": 0.82993565, "memory(GiB)": 138.1, "step": 68770, "train_speed(iter/s)": 0.200923 }, { "acc": 0.76394424, "epoch": 1.604689482203007, "grad_norm": 5.625, "learning_rate": 9.898395007196747e-07, "loss": 0.84758568, "memory(GiB)": 138.1, "step": 68780, "train_speed(iter/s)": 0.200937 }, { "acc": 0.78739591, "epoch": 1.6049227897752956, "grad_norm": 6.0, "learning_rate": 9.887114566129613e-07, "loss": 0.75285759, "memory(GiB)": 138.1, "step": 68790, "train_speed(iter/s)": 0.200951 }, { "acc": 0.7844842, "epoch": 1.6051560973475847, "grad_norm": 8.5625, "learning_rate": 9.875839851165237e-07, "loss": 0.77806282, "memory(GiB)": 138.1, "step": 68800, "train_speed(iter/s)": 0.200966 }, { "acc": 0.77970557, "epoch": 1.6053894049198734, "grad_norm": 7.4375, "learning_rate": 9.86457086391307e-07, "loss": 0.8352437, "memory(GiB)": 138.1, "step": 68810, "train_speed(iter/s)": 0.20098 }, { "acc": 0.77098522, "epoch": 1.6056227124921625, "grad_norm": 7.4375, "learning_rate": 9.85330760598175e-07, "loss": 0.82055664, "memory(GiB)": 138.1, "step": 68820, "train_speed(iter/s)": 0.200995 }, { "acc": 0.77886381, "epoch": 1.6058560200644512, "grad_norm": 6.5, "learning_rate": 9.842050078979088e-07, "loss": 0.78484612, "memory(GiB)": 138.1, "step": 68830, "train_speed(iter/s)": 0.20101 }, { "acc": 0.79053154, "epoch": 1.60608932763674, "grad_norm": 3.765625, "learning_rate": 9.830798284512132e-07, "loss": 0.74266052, "memory(GiB)": 138.1, "step": 68840, "train_speed(iter/s)": 0.201025 }, { "acc": 0.79033446, "epoch": 1.606322635209029, "grad_norm": 5.15625, "learning_rate": 9.819552224187046e-07, "loss": 0.7502409, "memory(GiB)": 138.1, "step": 68850, "train_speed(iter/s)": 0.20104 }, { "acc": 0.78153863, "epoch": 1.6065559427813179, "grad_norm": 5.4375, "learning_rate": 9.808311899609197e-07, "loss": 0.76911235, "memory(GiB)": 138.1, "step": 68860, "train_speed(iter/s)": 0.201055 }, { "acc": 0.79210978, "epoch": 1.6067892503536068, "grad_norm": 5.03125, "learning_rate": 9.797077312383162e-07, "loss": 0.74023113, "memory(GiB)": 138.1, "step": 68870, "train_speed(iter/s)": 0.20107 }, { "acc": 0.79266319, "epoch": 1.6070225579258957, "grad_norm": 6.46875, "learning_rate": 9.785848464112647e-07, "loss": 0.76112881, "memory(GiB)": 138.1, "step": 68880, "train_speed(iter/s)": 0.201084 }, { "acc": 0.79358058, "epoch": 1.6072558654981846, "grad_norm": 3.65625, "learning_rate": 9.774625356400597e-07, "loss": 0.7407238, "memory(GiB)": 138.1, "step": 68890, "train_speed(iter/s)": 0.2011 }, { "acc": 0.78830051, "epoch": 1.6074891730704735, "grad_norm": 4.5625, "learning_rate": 9.763407990849089e-07, "loss": 0.75743866, "memory(GiB)": 138.1, "step": 68900, "train_speed(iter/s)": 0.201115 }, { "acc": 0.76708155, "epoch": 1.6077224806427624, "grad_norm": 5.34375, "learning_rate": 9.75219636905939e-07, "loss": 0.81055851, "memory(GiB)": 138.1, "step": 68910, "train_speed(iter/s)": 0.20113 }, { "acc": 0.78732371, "epoch": 1.6079557882150513, "grad_norm": 7.53125, "learning_rate": 9.74099049263198e-07, "loss": 0.76581669, "memory(GiB)": 138.1, "step": 68920, "train_speed(iter/s)": 0.201144 }, { "acc": 0.77004409, "epoch": 1.6081890957873402, "grad_norm": 4.6875, "learning_rate": 9.729790363166487e-07, "loss": 0.82656031, "memory(GiB)": 138.1, "step": 68930, "train_speed(iter/s)": 0.201159 }, { "acc": 0.79495058, "epoch": 1.608422403359629, "grad_norm": 4.1875, "learning_rate": 9.718595982261713e-07, "loss": 0.74895706, "memory(GiB)": 138.1, "step": 68940, "train_speed(iter/s)": 0.201172 }, { "acc": 0.78100176, "epoch": 1.608655710931918, "grad_norm": 6.09375, "learning_rate": 9.707407351515653e-07, "loss": 0.78046789, "memory(GiB)": 138.1, "step": 68950, "train_speed(iter/s)": 0.201188 }, { "acc": 0.78246336, "epoch": 1.6088890185042068, "grad_norm": 5.28125, "learning_rate": 9.696224472525494e-07, "loss": 0.78533907, "memory(GiB)": 138.1, "step": 68960, "train_speed(iter/s)": 0.201204 }, { "acc": 0.78976364, "epoch": 1.6091223260764957, "grad_norm": 7.0625, "learning_rate": 9.685047346887578e-07, "loss": 0.75304751, "memory(GiB)": 138.1, "step": 68970, "train_speed(iter/s)": 0.201219 }, { "acc": 0.78010139, "epoch": 1.6093556336487846, "grad_norm": 5.3125, "learning_rate": 9.673875976197455e-07, "loss": 0.79901323, "memory(GiB)": 138.1, "step": 68980, "train_speed(iter/s)": 0.201234 }, { "acc": 0.7747858, "epoch": 1.6095889412210735, "grad_norm": 5.03125, "learning_rate": 9.662710362049815e-07, "loss": 0.82582674, "memory(GiB)": 138.1, "step": 68990, "train_speed(iter/s)": 0.201248 }, { "acc": 0.78229184, "epoch": 1.6098222487933624, "grad_norm": 6.09375, "learning_rate": 9.651550506038543e-07, "loss": 0.78168783, "memory(GiB)": 138.1, "step": 69000, "train_speed(iter/s)": 0.201264 }, { "epoch": 1.6098222487933624, "eval_acc": 0.7446647548648214, "eval_loss": 0.8044345378875732, "eval_runtime": 1269.3193, "eval_samples_per_second": 28.355, "eval_steps_per_second": 14.178, "step": 69000 }, { "acc": 0.77631769, "epoch": 1.6100555563656513, "grad_norm": 4.40625, "learning_rate": 9.640396409756731e-07, "loss": 0.83473434, "memory(GiB)": 138.1, "step": 69010, "train_speed(iter/s)": 0.200524 }, { "acc": 0.79392805, "epoch": 1.6102888639379402, "grad_norm": 5.75, "learning_rate": 9.629248074796593e-07, "loss": 0.7308218, "memory(GiB)": 138.1, "step": 69020, "train_speed(iter/s)": 0.20054 }, { "acc": 0.78255825, "epoch": 1.6105221715102291, "grad_norm": 7.625, "learning_rate": 9.618105502749575e-07, "loss": 0.78719893, "memory(GiB)": 138.1, "step": 69030, "train_speed(iter/s)": 0.200555 }, { "acc": 0.80137396, "epoch": 1.6107554790825178, "grad_norm": 10.5625, "learning_rate": 9.606968695206264e-07, "loss": 0.69361744, "memory(GiB)": 138.1, "step": 69040, "train_speed(iter/s)": 0.200571 }, { "acc": 0.7979476, "epoch": 1.610988786654807, "grad_norm": 6.5625, "learning_rate": 9.59583765375644e-07, "loss": 0.72113686, "memory(GiB)": 138.1, "step": 69050, "train_speed(iter/s)": 0.200587 }, { "acc": 0.7878149, "epoch": 1.6112220942270956, "grad_norm": 8.5625, "learning_rate": 9.58471237998906e-07, "loss": 0.77528825, "memory(GiB)": 138.1, "step": 69060, "train_speed(iter/s)": 0.200602 }, { "acc": 0.77063704, "epoch": 1.6114554017993847, "grad_norm": 5.71875, "learning_rate": 9.57359287549222e-07, "loss": 0.84115477, "memory(GiB)": 138.1, "step": 69070, "train_speed(iter/s)": 0.200617 }, { "acc": 0.77148275, "epoch": 1.6116887093716734, "grad_norm": 5.3125, "learning_rate": 9.562479141853276e-07, "loss": 0.80791435, "memory(GiB)": 138.1, "step": 69080, "train_speed(iter/s)": 0.200631 }, { "acc": 0.77265015, "epoch": 1.6119220169439625, "grad_norm": 5.3125, "learning_rate": 9.551371180658675e-07, "loss": 0.79873414, "memory(GiB)": 138.1, "step": 69090, "train_speed(iter/s)": 0.200646 }, { "acc": 0.79981709, "epoch": 1.6121553245162512, "grad_norm": 4.8125, "learning_rate": 9.540268993494095e-07, "loss": 0.70313296, "memory(GiB)": 138.1, "step": 69100, "train_speed(iter/s)": 0.200661 }, { "acc": 0.77894669, "epoch": 1.6123886320885403, "grad_norm": 6.6875, "learning_rate": 9.529172581944352e-07, "loss": 0.81366968, "memory(GiB)": 138.1, "step": 69110, "train_speed(iter/s)": 0.200676 }, { "acc": 0.77654457, "epoch": 1.612621939660829, "grad_norm": 7.15625, "learning_rate": 9.518081947593477e-07, "loss": 0.81639051, "memory(GiB)": 138.1, "step": 69120, "train_speed(iter/s)": 0.200692 }, { "acc": 0.76891427, "epoch": 1.6128552472331181, "grad_norm": 4.8125, "learning_rate": 9.50699709202465e-07, "loss": 0.85439548, "memory(GiB)": 138.1, "step": 69130, "train_speed(iter/s)": 0.200706 }, { "acc": 0.78814936, "epoch": 1.6130885548054068, "grad_norm": 5.71875, "learning_rate": 9.495918016820204e-07, "loss": 0.7638484, "memory(GiB)": 138.1, "step": 69140, "train_speed(iter/s)": 0.200722 }, { "acc": 0.79379234, "epoch": 1.613321862377696, "grad_norm": 5.1875, "learning_rate": 9.484844723561726e-07, "loss": 0.7099072, "memory(GiB)": 138.1, "step": 69150, "train_speed(iter/s)": 0.200737 }, { "acc": 0.77234783, "epoch": 1.6135551699499846, "grad_norm": 6.90625, "learning_rate": 9.473777213829866e-07, "loss": 0.81089096, "memory(GiB)": 138.1, "step": 69160, "train_speed(iter/s)": 0.200752 }, { "acc": 0.77259874, "epoch": 1.6137884775222737, "grad_norm": 4.90625, "learning_rate": 9.462715489204549e-07, "loss": 0.81169949, "memory(GiB)": 138.1, "step": 69170, "train_speed(iter/s)": 0.200768 }, { "acc": 0.78828964, "epoch": 1.6140217850945624, "grad_norm": 7.78125, "learning_rate": 9.451659551264808e-07, "loss": 0.75277605, "memory(GiB)": 138.1, "step": 69180, "train_speed(iter/s)": 0.200783 }, { "acc": 0.79725246, "epoch": 1.6142550926668515, "grad_norm": 6.59375, "learning_rate": 9.440609401588901e-07, "loss": 0.72743855, "memory(GiB)": 138.1, "step": 69190, "train_speed(iter/s)": 0.200799 }, { "acc": 0.79967861, "epoch": 1.6144884002391402, "grad_norm": 4.21875, "learning_rate": 9.429565041754218e-07, "loss": 0.70557318, "memory(GiB)": 138.1, "step": 69200, "train_speed(iter/s)": 0.200813 }, { "acc": 0.77489271, "epoch": 1.6147217078114293, "grad_norm": 3.90625, "learning_rate": 9.418526473337325e-07, "loss": 0.81767092, "memory(GiB)": 138.1, "step": 69210, "train_speed(iter/s)": 0.200828 }, { "acc": 0.77260404, "epoch": 1.614955015383718, "grad_norm": 4.9375, "learning_rate": 9.407493697913999e-07, "loss": 0.82012691, "memory(GiB)": 138.1, "step": 69220, "train_speed(iter/s)": 0.200844 }, { "acc": 0.79442325, "epoch": 1.615188322956007, "grad_norm": 8.5, "learning_rate": 9.396466717059149e-07, "loss": 0.72898545, "memory(GiB)": 138.1, "step": 69230, "train_speed(iter/s)": 0.200859 }, { "acc": 0.78504844, "epoch": 1.6154216305282958, "grad_norm": 5.4375, "learning_rate": 9.385445532346887e-07, "loss": 0.76235442, "memory(GiB)": 138.1, "step": 69240, "train_speed(iter/s)": 0.200874 }, { "acc": 0.79455013, "epoch": 1.6156549381005847, "grad_norm": 4.40625, "learning_rate": 9.374430145350466e-07, "loss": 0.74329395, "memory(GiB)": 138.1, "step": 69250, "train_speed(iter/s)": 0.200888 }, { "acc": 0.7682219, "epoch": 1.6158882456728736, "grad_norm": 5.46875, "learning_rate": 9.363420557642355e-07, "loss": 0.84918575, "memory(GiB)": 138.1, "step": 69260, "train_speed(iter/s)": 0.200903 }, { "acc": 0.78665934, "epoch": 1.6161215532451625, "grad_norm": 5.25, "learning_rate": 9.352416770794154e-07, "loss": 0.76795254, "memory(GiB)": 138.1, "step": 69270, "train_speed(iter/s)": 0.200917 }, { "acc": 0.7946744, "epoch": 1.6163548608174514, "grad_norm": 4.71875, "learning_rate": 9.341418786376649e-07, "loss": 0.71400213, "memory(GiB)": 138.1, "step": 69280, "train_speed(iter/s)": 0.200932 }, { "acc": 0.77530904, "epoch": 1.6165881683897403, "grad_norm": 4.625, "learning_rate": 9.330426605959803e-07, "loss": 0.82718925, "memory(GiB)": 138.1, "step": 69290, "train_speed(iter/s)": 0.200947 }, { "acc": 0.7725615, "epoch": 1.6168214759620292, "grad_norm": 5.78125, "learning_rate": 9.319440231112725e-07, "loss": 0.81718159, "memory(GiB)": 138.1, "step": 69300, "train_speed(iter/s)": 0.200963 }, { "acc": 0.79436731, "epoch": 1.617054783534318, "grad_norm": 5.15625, "learning_rate": 9.308459663403757e-07, "loss": 0.75845556, "memory(GiB)": 138.1, "step": 69310, "train_speed(iter/s)": 0.200977 }, { "acc": 0.78687463, "epoch": 1.617288091106607, "grad_norm": 4.71875, "learning_rate": 9.297484904400333e-07, "loss": 0.7621346, "memory(GiB)": 138.1, "step": 69320, "train_speed(iter/s)": 0.200992 }, { "acc": 0.78693705, "epoch": 1.6175213986788959, "grad_norm": 4.90625, "learning_rate": 9.286515955669134e-07, "loss": 0.74903417, "memory(GiB)": 138.1, "step": 69330, "train_speed(iter/s)": 0.201007 }, { "acc": 0.78143864, "epoch": 1.6177547062511848, "grad_norm": 5.53125, "learning_rate": 9.275552818775945e-07, "loss": 0.79808311, "memory(GiB)": 138.1, "step": 69340, "train_speed(iter/s)": 0.201023 }, { "acc": 0.7710865, "epoch": 1.6179880138234737, "grad_norm": 6.0625, "learning_rate": 9.264595495285755e-07, "loss": 0.83482437, "memory(GiB)": 138.1, "step": 69350, "train_speed(iter/s)": 0.201037 }, { "acc": 0.76026926, "epoch": 1.6182213213957626, "grad_norm": 8.1875, "learning_rate": 9.25364398676274e-07, "loss": 0.85732269, "memory(GiB)": 138.1, "step": 69360, "train_speed(iter/s)": 0.201052 }, { "acc": 0.78260975, "epoch": 1.6184546289680515, "grad_norm": 7.125, "learning_rate": 9.242698294770191e-07, "loss": 0.77640896, "memory(GiB)": 138.1, "step": 69370, "train_speed(iter/s)": 0.201067 }, { "acc": 0.79006805, "epoch": 1.6186879365403404, "grad_norm": 3.921875, "learning_rate": 9.231758420870645e-07, "loss": 0.75500803, "memory(GiB)": 138.1, "step": 69380, "train_speed(iter/s)": 0.201082 }, { "acc": 0.7738595, "epoch": 1.6189212441126293, "grad_norm": 4.4375, "learning_rate": 9.220824366625719e-07, "loss": 0.8050436, "memory(GiB)": 138.1, "step": 69390, "train_speed(iter/s)": 0.201097 }, { "acc": 0.77326818, "epoch": 1.6191545516849182, "grad_norm": 4.84375, "learning_rate": 9.20989613359628e-07, "loss": 0.83086958, "memory(GiB)": 138.1, "step": 69400, "train_speed(iter/s)": 0.201111 }, { "acc": 0.77323809, "epoch": 1.6193878592572069, "grad_norm": 5.90625, "learning_rate": 9.198973723342303e-07, "loss": 0.81775436, "memory(GiB)": 138.1, "step": 69410, "train_speed(iter/s)": 0.201125 }, { "acc": 0.76896219, "epoch": 1.619621166829496, "grad_norm": 4.84375, "learning_rate": 9.18805713742299e-07, "loss": 0.82727232, "memory(GiB)": 138.1, "step": 69420, "train_speed(iter/s)": 0.20114 }, { "acc": 0.77858291, "epoch": 1.6198544744017846, "grad_norm": 5.5, "learning_rate": 9.177146377396662e-07, "loss": 0.80608435, "memory(GiB)": 138.1, "step": 69430, "train_speed(iter/s)": 0.201156 }, { "acc": 0.78011804, "epoch": 1.6200877819740738, "grad_norm": 6.125, "learning_rate": 9.166241444820817e-07, "loss": 0.76758566, "memory(GiB)": 138.1, "step": 69440, "train_speed(iter/s)": 0.201171 }, { "acc": 0.78035154, "epoch": 1.6203210895463624, "grad_norm": 10.625, "learning_rate": 9.15534234125216e-07, "loss": 0.78598452, "memory(GiB)": 138.1, "step": 69450, "train_speed(iter/s)": 0.201186 }, { "acc": 0.80147018, "epoch": 1.6205543971186516, "grad_norm": 5.25, "learning_rate": 9.144449068246502e-07, "loss": 0.70722756, "memory(GiB)": 138.1, "step": 69460, "train_speed(iter/s)": 0.201202 }, { "acc": 0.75699992, "epoch": 1.6207877046909402, "grad_norm": 4.59375, "learning_rate": 9.133561627358884e-07, "loss": 0.86237135, "memory(GiB)": 138.1, "step": 69470, "train_speed(iter/s)": 0.201218 }, { "acc": 0.79911742, "epoch": 1.6210210122632294, "grad_norm": 8.875, "learning_rate": 9.122680020143476e-07, "loss": 0.70699673, "memory(GiB)": 138.1, "step": 69480, "train_speed(iter/s)": 0.201231 }, { "acc": 0.80146561, "epoch": 1.621254319835518, "grad_norm": 4.90625, "learning_rate": 9.111804248153605e-07, "loss": 0.70860691, "memory(GiB)": 138.1, "step": 69490, "train_speed(iter/s)": 0.201246 }, { "acc": 0.78607807, "epoch": 1.6214876274078072, "grad_norm": 5.40625, "learning_rate": 9.100934312941822e-07, "loss": 0.76731787, "memory(GiB)": 138.1, "step": 69500, "train_speed(iter/s)": 0.20126 }, { "epoch": 1.6214876274078072, "eval_acc": 0.744685733759253, "eval_loss": 0.8044191002845764, "eval_runtime": 1269.1743, "eval_samples_per_second": 28.358, "eval_steps_per_second": 14.179, "step": 69500 }, { "acc": 0.77421312, "epoch": 1.6217209349800958, "grad_norm": 6.46875, "learning_rate": 9.09007021605976e-07, "loss": 0.79150124, "memory(GiB)": 138.1, "step": 69510, "train_speed(iter/s)": 0.200524 }, { "acc": 0.76822271, "epoch": 1.621954242552385, "grad_norm": 10.875, "learning_rate": 9.079211959058304e-07, "loss": 0.8320261, "memory(GiB)": 138.1, "step": 69520, "train_speed(iter/s)": 0.200539 }, { "acc": 0.79691601, "epoch": 1.6221875501246736, "grad_norm": 8.0, "learning_rate": 9.068359543487442e-07, "loss": 0.73356771, "memory(GiB)": 138.1, "step": 69530, "train_speed(iter/s)": 0.200553 }, { "acc": 0.76938868, "epoch": 1.6224208576969628, "grad_norm": 4.40625, "learning_rate": 9.057512970896376e-07, "loss": 0.81400595, "memory(GiB)": 138.1, "step": 69540, "train_speed(iter/s)": 0.200568 }, { "acc": 0.77159843, "epoch": 1.6226541652692514, "grad_norm": 6.1875, "learning_rate": 9.046672242833427e-07, "loss": 0.82162743, "memory(GiB)": 138.1, "step": 69550, "train_speed(iter/s)": 0.200583 }, { "acc": 0.79073334, "epoch": 1.6228874728415406, "grad_norm": 5.71875, "learning_rate": 9.035837360846134e-07, "loss": 0.75625534, "memory(GiB)": 138.1, "step": 69560, "train_speed(iter/s)": 0.200599 }, { "acc": 0.77172422, "epoch": 1.6231207804138292, "grad_norm": 6.71875, "learning_rate": 9.02500832648115e-07, "loss": 0.82089863, "memory(GiB)": 138.1, "step": 69570, "train_speed(iter/s)": 0.200614 }, { "acc": 0.75787716, "epoch": 1.6233540879861184, "grad_norm": 5.8125, "learning_rate": 9.014185141284315e-07, "loss": 0.87999249, "memory(GiB)": 138.1, "step": 69580, "train_speed(iter/s)": 0.200629 }, { "acc": 0.78990684, "epoch": 1.623587395558407, "grad_norm": 3.5, "learning_rate": 9.003367806800661e-07, "loss": 0.76870031, "memory(GiB)": 138.1, "step": 69590, "train_speed(iter/s)": 0.200644 }, { "acc": 0.77759619, "epoch": 1.623820703130696, "grad_norm": 5.0, "learning_rate": 8.992556324574325e-07, "loss": 0.77327042, "memory(GiB)": 138.1, "step": 69600, "train_speed(iter/s)": 0.200658 }, { "acc": 0.77967892, "epoch": 1.6240540107029848, "grad_norm": 5.28125, "learning_rate": 8.981750696148689e-07, "loss": 0.78824787, "memory(GiB)": 138.1, "step": 69610, "train_speed(iter/s)": 0.200674 }, { "acc": 0.77583332, "epoch": 1.6242873182752737, "grad_norm": 5.21875, "learning_rate": 8.970950923066201e-07, "loss": 0.81284409, "memory(GiB)": 138.1, "step": 69620, "train_speed(iter/s)": 0.200689 }, { "acc": 0.79084225, "epoch": 1.6245206258475626, "grad_norm": 6.9375, "learning_rate": 8.960157006868564e-07, "loss": 0.76385317, "memory(GiB)": 138.1, "step": 69630, "train_speed(iter/s)": 0.200703 }, { "acc": 0.77975607, "epoch": 1.6247539334198515, "grad_norm": 5.0, "learning_rate": 8.949368949096588e-07, "loss": 0.79590893, "memory(GiB)": 138.1, "step": 69640, "train_speed(iter/s)": 0.200719 }, { "acc": 0.76001158, "epoch": 1.6249872409921404, "grad_norm": 6.125, "learning_rate": 8.938586751290257e-07, "loss": 0.85949221, "memory(GiB)": 138.1, "step": 69650, "train_speed(iter/s)": 0.200734 }, { "acc": 0.78397055, "epoch": 1.6252205485644293, "grad_norm": 8.75, "learning_rate": 8.927810414988752e-07, "loss": 0.78344545, "memory(GiB)": 138.1, "step": 69660, "train_speed(iter/s)": 0.200749 }, { "acc": 0.7823205, "epoch": 1.6254538561367182, "grad_norm": 5.8125, "learning_rate": 8.917039941730365e-07, "loss": 0.77132821, "memory(GiB)": 138.1, "step": 69670, "train_speed(iter/s)": 0.200763 }, { "acc": 0.77989521, "epoch": 1.6256871637090071, "grad_norm": 5.21875, "learning_rate": 8.906275333052605e-07, "loss": 0.76878123, "memory(GiB)": 138.1, "step": 69680, "train_speed(iter/s)": 0.200778 }, { "acc": 0.78908534, "epoch": 1.625920471281296, "grad_norm": 4.59375, "learning_rate": 8.895516590492104e-07, "loss": 0.74415193, "memory(GiB)": 138.1, "step": 69690, "train_speed(iter/s)": 0.200793 }, { "acc": 0.80243616, "epoch": 1.626153778853585, "grad_norm": 4.84375, "learning_rate": 8.88476371558466e-07, "loss": 0.70059543, "memory(GiB)": 138.1, "step": 69700, "train_speed(iter/s)": 0.200808 }, { "acc": 0.76366444, "epoch": 1.6263870864258738, "grad_norm": 4.34375, "learning_rate": 8.874016709865257e-07, "loss": 0.83497429, "memory(GiB)": 138.1, "step": 69710, "train_speed(iter/s)": 0.200822 }, { "acc": 0.79546161, "epoch": 1.6266203939981627, "grad_norm": 5.59375, "learning_rate": 8.863275574868014e-07, "loss": 0.73149471, "memory(GiB)": 138.1, "step": 69720, "train_speed(iter/s)": 0.200836 }, { "acc": 0.77597876, "epoch": 1.6268537015704516, "grad_norm": 3.890625, "learning_rate": 8.852540312126256e-07, "loss": 0.79382687, "memory(GiB)": 138.1, "step": 69730, "train_speed(iter/s)": 0.200852 }, { "acc": 0.77241516, "epoch": 1.6270870091427405, "grad_norm": 6.09375, "learning_rate": 8.841810923172389e-07, "loss": 0.82993584, "memory(GiB)": 138.1, "step": 69740, "train_speed(iter/s)": 0.200867 }, { "acc": 0.77992134, "epoch": 1.6273203167150294, "grad_norm": 5.125, "learning_rate": 8.83108740953807e-07, "loss": 0.77837553, "memory(GiB)": 138.1, "step": 69750, "train_speed(iter/s)": 0.200882 }, { "acc": 0.79285231, "epoch": 1.6275536242873183, "grad_norm": 4.8125, "learning_rate": 8.82036977275405e-07, "loss": 0.74223328, "memory(GiB)": 138.1, "step": 69760, "train_speed(iter/s)": 0.200897 }, { "acc": 0.76947985, "epoch": 1.6277869318596072, "grad_norm": 4.75, "learning_rate": 8.809658014350297e-07, "loss": 0.82958355, "memory(GiB)": 138.1, "step": 69770, "train_speed(iter/s)": 0.200913 }, { "acc": 0.78920531, "epoch": 1.628020239431896, "grad_norm": 4.15625, "learning_rate": 8.798952135855893e-07, "loss": 0.76306615, "memory(GiB)": 138.1, "step": 69780, "train_speed(iter/s)": 0.200928 }, { "acc": 0.77562838, "epoch": 1.628253547004185, "grad_norm": 8.25, "learning_rate": 8.788252138799092e-07, "loss": 0.80673237, "memory(GiB)": 138.1, "step": 69790, "train_speed(iter/s)": 0.200942 }, { "acc": 0.77968626, "epoch": 1.6284868545764737, "grad_norm": 5.1875, "learning_rate": 8.777558024707339e-07, "loss": 0.78458438, "memory(GiB)": 138.1, "step": 69800, "train_speed(iter/s)": 0.200957 }, { "acc": 0.78913994, "epoch": 1.6287201621487628, "grad_norm": 4.375, "learning_rate": 8.766869795107191e-07, "loss": 0.75235834, "memory(GiB)": 138.1, "step": 69810, "train_speed(iter/s)": 0.200971 }, { "acc": 0.78043246, "epoch": 1.6289534697210515, "grad_norm": 6.1875, "learning_rate": 8.756187451524412e-07, "loss": 0.77849183, "memory(GiB)": 138.1, "step": 69820, "train_speed(iter/s)": 0.200986 }, { "acc": 0.77848892, "epoch": 1.6291867772933406, "grad_norm": 4.34375, "learning_rate": 8.745510995483892e-07, "loss": 0.80239801, "memory(GiB)": 138.1, "step": 69830, "train_speed(iter/s)": 0.201001 }, { "acc": 0.79218383, "epoch": 1.6294200848656293, "grad_norm": 5.28125, "learning_rate": 8.734840428509694e-07, "loss": 0.74597349, "memory(GiB)": 138.1, "step": 69840, "train_speed(iter/s)": 0.201017 }, { "acc": 0.77905416, "epoch": 1.6296533924379184, "grad_norm": 5.5, "learning_rate": 8.724175752125042e-07, "loss": 0.78995914, "memory(GiB)": 138.1, "step": 69850, "train_speed(iter/s)": 0.201032 }, { "acc": 0.78956504, "epoch": 1.629886700010207, "grad_norm": 5.84375, "learning_rate": 8.713516967852292e-07, "loss": 0.76276588, "memory(GiB)": 138.1, "step": 69860, "train_speed(iter/s)": 0.201048 }, { "acc": 0.77725039, "epoch": 1.6301200075824962, "grad_norm": 5.53125, "learning_rate": 8.702864077213014e-07, "loss": 0.80270042, "memory(GiB)": 138.1, "step": 69870, "train_speed(iter/s)": 0.201063 }, { "acc": 0.77587008, "epoch": 1.6303533151547849, "grad_norm": 4.65625, "learning_rate": 8.692217081727883e-07, "loss": 0.81218023, "memory(GiB)": 138.1, "step": 69880, "train_speed(iter/s)": 0.201077 }, { "acc": 0.76779122, "epoch": 1.630586622727074, "grad_norm": 5.625, "learning_rate": 8.681575982916773e-07, "loss": 0.85469055, "memory(GiB)": 138.1, "step": 69890, "train_speed(iter/s)": 0.201091 }, { "acc": 0.78068151, "epoch": 1.6308199302993627, "grad_norm": 5.65625, "learning_rate": 8.670940782298675e-07, "loss": 0.79066305, "memory(GiB)": 138.1, "step": 69900, "train_speed(iter/s)": 0.201106 }, { "acc": 0.76384864, "epoch": 1.6310532378716518, "grad_norm": 4.25, "learning_rate": 8.660311481391792e-07, "loss": 0.85888195, "memory(GiB)": 138.1, "step": 69910, "train_speed(iter/s)": 0.20112 }, { "acc": 0.79069233, "epoch": 1.6312865454439405, "grad_norm": 5.34375, "learning_rate": 8.649688081713431e-07, "loss": 0.74954491, "memory(GiB)": 138.1, "step": 69920, "train_speed(iter/s)": 0.201135 }, { "acc": 0.78507986, "epoch": 1.6315198530162296, "grad_norm": 5.0625, "learning_rate": 8.639070584780074e-07, "loss": 0.79639606, "memory(GiB)": 138.1, "step": 69930, "train_speed(iter/s)": 0.20115 }, { "acc": 0.79327669, "epoch": 1.6317531605885183, "grad_norm": 5.53125, "learning_rate": 8.628458992107386e-07, "loss": 0.74162989, "memory(GiB)": 138.1, "step": 69940, "train_speed(iter/s)": 0.201165 }, { "acc": 0.7880043, "epoch": 1.6319864681608074, "grad_norm": 8.3125, "learning_rate": 8.617853305210161e-07, "loss": 0.75817766, "memory(GiB)": 138.1, "step": 69950, "train_speed(iter/s)": 0.20118 }, { "acc": 0.77378907, "epoch": 1.632219775733096, "grad_norm": 7.6875, "learning_rate": 8.607253525602355e-07, "loss": 0.80225611, "memory(GiB)": 138.1, "step": 69960, "train_speed(iter/s)": 0.201195 }, { "acc": 0.77905831, "epoch": 1.6324530833053852, "grad_norm": 5.1875, "learning_rate": 8.596659654797068e-07, "loss": 0.79952292, "memory(GiB)": 138.1, "step": 69970, "train_speed(iter/s)": 0.201209 }, { "acc": 0.77380133, "epoch": 1.6326863908776739, "grad_norm": 4.75, "learning_rate": 8.586071694306602e-07, "loss": 0.80798283, "memory(GiB)": 138.1, "step": 69980, "train_speed(iter/s)": 0.201225 }, { "acc": 0.79780202, "epoch": 1.6329196984499628, "grad_norm": 4.625, "learning_rate": 8.575489645642371e-07, "loss": 0.71656194, "memory(GiB)": 138.1, "step": 69990, "train_speed(iter/s)": 0.201239 }, { "acc": 0.77291632, "epoch": 1.6331530060222517, "grad_norm": 11.8125, "learning_rate": 8.564913510314943e-07, "loss": 0.80805435, "memory(GiB)": 138.1, "step": 70000, "train_speed(iter/s)": 0.201254 }, { "epoch": 1.6331530060222517, "eval_acc": 0.744727211115419, "eval_loss": 0.8044537901878357, "eval_runtime": 1270.0228, "eval_samples_per_second": 28.339, "eval_steps_per_second": 14.17, "step": 70000 }, { "acc": 0.78201637, "epoch": 1.6333863135945406, "grad_norm": 5.75, "learning_rate": 8.554343289834094e-07, "loss": 0.80393801, "memory(GiB)": 138.1, "step": 70010, "train_speed(iter/s)": 0.200523 }, { "acc": 0.79293175, "epoch": 1.6336196211668295, "grad_norm": 5.0625, "learning_rate": 8.543778985708683e-07, "loss": 0.74324322, "memory(GiB)": 138.1, "step": 70020, "train_speed(iter/s)": 0.200537 }, { "acc": 0.78512459, "epoch": 1.6338529287391184, "grad_norm": 5.65625, "learning_rate": 8.533220599446789e-07, "loss": 0.78935528, "memory(GiB)": 138.1, "step": 70030, "train_speed(iter/s)": 0.200552 }, { "acc": 0.77615108, "epoch": 1.6340862363114073, "grad_norm": 6.84375, "learning_rate": 8.522668132555601e-07, "loss": 0.82629166, "memory(GiB)": 138.1, "step": 70040, "train_speed(iter/s)": 0.200566 }, { "acc": 0.77868662, "epoch": 1.6343195438836962, "grad_norm": 8.9375, "learning_rate": 8.512121586541499e-07, "loss": 0.79097457, "memory(GiB)": 138.1, "step": 70050, "train_speed(iter/s)": 0.20058 }, { "acc": 0.79038181, "epoch": 1.634552851455985, "grad_norm": 5.4375, "learning_rate": 8.501580962909989e-07, "loss": 0.73661346, "memory(GiB)": 138.1, "step": 70060, "train_speed(iter/s)": 0.200594 }, { "acc": 0.7649703, "epoch": 1.634786159028274, "grad_norm": 6.75, "learning_rate": 8.491046263165737e-07, "loss": 0.85525341, "memory(GiB)": 138.1, "step": 70070, "train_speed(iter/s)": 0.20061 }, { "acc": 0.7755065, "epoch": 1.6350194666005629, "grad_norm": 4.71875, "learning_rate": 8.480517488812578e-07, "loss": 0.79242172, "memory(GiB)": 138.1, "step": 70080, "train_speed(iter/s)": 0.200625 }, { "acc": 0.77949743, "epoch": 1.6352527741728518, "grad_norm": 5.40625, "learning_rate": 8.469994641353468e-07, "loss": 0.77577677, "memory(GiB)": 138.1, "step": 70090, "train_speed(iter/s)": 0.200639 }, { "acc": 0.77214479, "epoch": 1.6354860817451407, "grad_norm": 4.96875, "learning_rate": 8.459477722290577e-07, "loss": 0.82063198, "memory(GiB)": 138.1, "step": 70100, "train_speed(iter/s)": 0.200653 }, { "acc": 0.77816544, "epoch": 1.6357193893174296, "grad_norm": 14.4375, "learning_rate": 8.448966733125152e-07, "loss": 0.81221743, "memory(GiB)": 138.1, "step": 70110, "train_speed(iter/s)": 0.200668 }, { "acc": 0.7740953, "epoch": 1.6359526968897184, "grad_norm": 7.09375, "learning_rate": 8.438461675357679e-07, "loss": 0.81897221, "memory(GiB)": 138.1, "step": 70120, "train_speed(iter/s)": 0.200681 }, { "acc": 0.78639469, "epoch": 1.6361860044620073, "grad_norm": 5.15625, "learning_rate": 8.427962550487717e-07, "loss": 0.7703733, "memory(GiB)": 138.1, "step": 70130, "train_speed(iter/s)": 0.200696 }, { "acc": 0.77399602, "epoch": 1.6364193120342962, "grad_norm": 6.71875, "learning_rate": 8.417469360014019e-07, "loss": 0.81631184, "memory(GiB)": 138.1, "step": 70140, "train_speed(iter/s)": 0.20071 }, { "acc": 0.78081784, "epoch": 1.6366526196065851, "grad_norm": 5.8125, "learning_rate": 8.406982105434502e-07, "loss": 0.79938803, "memory(GiB)": 138.1, "step": 70150, "train_speed(iter/s)": 0.200725 }, { "acc": 0.78800888, "epoch": 1.636885927178874, "grad_norm": 5.5, "learning_rate": 8.396500788246192e-07, "loss": 0.75535965, "memory(GiB)": 138.1, "step": 70160, "train_speed(iter/s)": 0.200739 }, { "acc": 0.80056391, "epoch": 1.637119234751163, "grad_norm": 6.46875, "learning_rate": 8.38602540994532e-07, "loss": 0.69680567, "memory(GiB)": 138.1, "step": 70170, "train_speed(iter/s)": 0.200755 }, { "acc": 0.7816555, "epoch": 1.6373525423234518, "grad_norm": 4.625, "learning_rate": 8.375555972027233e-07, "loss": 0.76161251, "memory(GiB)": 138.1, "step": 70180, "train_speed(iter/s)": 0.20077 }, { "acc": 0.78333101, "epoch": 1.6375858498957405, "grad_norm": 4.40625, "learning_rate": 8.365092475986442e-07, "loss": 0.77230682, "memory(GiB)": 138.1, "step": 70190, "train_speed(iter/s)": 0.200786 }, { "acc": 0.75599384, "epoch": 1.6378191574680296, "grad_norm": 5.15625, "learning_rate": 8.35463492331659e-07, "loss": 0.91006165, "memory(GiB)": 138.1, "step": 70200, "train_speed(iter/s)": 0.200801 }, { "acc": 0.7760139, "epoch": 1.6380524650403183, "grad_norm": 5.5, "learning_rate": 8.344183315510518e-07, "loss": 0.807833, "memory(GiB)": 138.1, "step": 70210, "train_speed(iter/s)": 0.200816 }, { "acc": 0.77669868, "epoch": 1.6382857726126074, "grad_norm": 4.75, "learning_rate": 8.333737654060176e-07, "loss": 0.78656607, "memory(GiB)": 138.1, "step": 70220, "train_speed(iter/s)": 0.200831 }, { "acc": 0.77367878, "epoch": 1.6385190801848961, "grad_norm": 5.03125, "learning_rate": 8.323297940456665e-07, "loss": 0.82135525, "memory(GiB)": 138.1, "step": 70230, "train_speed(iter/s)": 0.200846 }, { "acc": 0.77797899, "epoch": 1.6387523877571852, "grad_norm": 4.78125, "learning_rate": 8.312864176190282e-07, "loss": 0.80228634, "memory(GiB)": 138.1, "step": 70240, "train_speed(iter/s)": 0.200861 }, { "acc": 0.7765986, "epoch": 1.638985695329474, "grad_norm": 6.8125, "learning_rate": 8.302436362750416e-07, "loss": 0.80377598, "memory(GiB)": 138.1, "step": 70250, "train_speed(iter/s)": 0.200877 }, { "acc": 0.79614129, "epoch": 1.639219002901763, "grad_norm": 4.375, "learning_rate": 8.292014501625656e-07, "loss": 0.70680809, "memory(GiB)": 138.1, "step": 70260, "train_speed(iter/s)": 0.200891 }, { "acc": 0.79076648, "epoch": 1.6394523104740517, "grad_norm": 4.375, "learning_rate": 8.281598594303708e-07, "loss": 0.74855566, "memory(GiB)": 138.1, "step": 70270, "train_speed(iter/s)": 0.200905 }, { "acc": 0.7758749, "epoch": 1.6396856180463408, "grad_norm": 3.984375, "learning_rate": 8.271188642271432e-07, "loss": 0.81647902, "memory(GiB)": 138.1, "step": 70280, "train_speed(iter/s)": 0.200919 }, { "acc": 0.79907188, "epoch": 1.6399189256186295, "grad_norm": 5.03125, "learning_rate": 8.260784647014864e-07, "loss": 0.7401226, "memory(GiB)": 138.1, "step": 70290, "train_speed(iter/s)": 0.200934 }, { "acc": 0.77903986, "epoch": 1.6401522331909186, "grad_norm": 5.40625, "learning_rate": 8.250386610019167e-07, "loss": 0.80523014, "memory(GiB)": 138.1, "step": 70300, "train_speed(iter/s)": 0.200949 }, { "acc": 0.78396397, "epoch": 1.6403855407632073, "grad_norm": 5.46875, "learning_rate": 8.239994532768647e-07, "loss": 0.78131781, "memory(GiB)": 138.1, "step": 70310, "train_speed(iter/s)": 0.200964 }, { "acc": 0.78332367, "epoch": 1.6406188483354964, "grad_norm": 5.0, "learning_rate": 8.22960841674677e-07, "loss": 0.77818413, "memory(GiB)": 138.1, "step": 70320, "train_speed(iter/s)": 0.200979 }, { "acc": 0.77349386, "epoch": 1.640852155907785, "grad_norm": 4.96875, "learning_rate": 8.219228263436168e-07, "loss": 0.82582359, "memory(GiB)": 138.1, "step": 70330, "train_speed(iter/s)": 0.200993 }, { "acc": 0.78453584, "epoch": 1.6410854634800742, "grad_norm": 4.875, "learning_rate": 8.208854074318579e-07, "loss": 0.76499305, "memory(GiB)": 138.1, "step": 70340, "train_speed(iter/s)": 0.201008 }, { "acc": 0.78075652, "epoch": 1.641318771052363, "grad_norm": 9.5, "learning_rate": 8.198485850874943e-07, "loss": 0.78641806, "memory(GiB)": 138.1, "step": 70350, "train_speed(iter/s)": 0.201022 }, { "acc": 0.77025485, "epoch": 1.641552078624652, "grad_norm": 5.6875, "learning_rate": 8.188123594585312e-07, "loss": 0.81662521, "memory(GiB)": 138.1, "step": 70360, "train_speed(iter/s)": 0.201037 }, { "acc": 0.78271184, "epoch": 1.6417853861969407, "grad_norm": 4.0, "learning_rate": 8.177767306928875e-07, "loss": 0.76894827, "memory(GiB)": 138.1, "step": 70370, "train_speed(iter/s)": 0.201051 }, { "acc": 0.77495737, "epoch": 1.6420186937692296, "grad_norm": 6.25, "learning_rate": 8.16741698938402e-07, "loss": 0.80135536, "memory(GiB)": 138.1, "step": 70380, "train_speed(iter/s)": 0.201067 }, { "acc": 0.77556715, "epoch": 1.6422520013415185, "grad_norm": 3.484375, "learning_rate": 8.157072643428227e-07, "loss": 0.80013609, "memory(GiB)": 138.1, "step": 70390, "train_speed(iter/s)": 0.201081 }, { "acc": 0.78865757, "epoch": 1.6424853089138074, "grad_norm": 6.46875, "learning_rate": 8.14673427053817e-07, "loss": 0.7638164, "memory(GiB)": 138.1, "step": 70400, "train_speed(iter/s)": 0.201096 }, { "acc": 0.78071489, "epoch": 1.6427186164860963, "grad_norm": 4.875, "learning_rate": 8.136401872189631e-07, "loss": 0.7829968, "memory(GiB)": 138.1, "step": 70410, "train_speed(iter/s)": 0.201111 }, { "acc": 0.79857535, "epoch": 1.6429519240583852, "grad_norm": 5.96875, "learning_rate": 8.126075449857574e-07, "loss": 0.71098719, "memory(GiB)": 138.1, "step": 70420, "train_speed(iter/s)": 0.201126 }, { "acc": 0.76003633, "epoch": 1.643185231630674, "grad_norm": 5.09375, "learning_rate": 8.115755005016074e-07, "loss": 0.87234459, "memory(GiB)": 138.1, "step": 70430, "train_speed(iter/s)": 0.201141 }, { "acc": 0.76972141, "epoch": 1.643418539202963, "grad_norm": 10.25, "learning_rate": 8.105440539138371e-07, "loss": 0.82762814, "memory(GiB)": 138.1, "step": 70440, "train_speed(iter/s)": 0.201155 }, { "acc": 0.78295097, "epoch": 1.643651846775252, "grad_norm": 4.65625, "learning_rate": 8.095132053696869e-07, "loss": 0.79033747, "memory(GiB)": 138.1, "step": 70450, "train_speed(iter/s)": 0.201169 }, { "acc": 0.77744741, "epoch": 1.6438851543475408, "grad_norm": 4.90625, "learning_rate": 8.084829550163087e-07, "loss": 0.8104229, "memory(GiB)": 138.1, "step": 70460, "train_speed(iter/s)": 0.201184 }, { "acc": 0.79442644, "epoch": 1.6441184619198297, "grad_norm": 5.09375, "learning_rate": 8.074533030007714e-07, "loss": 0.7583622, "memory(GiB)": 138.1, "step": 70470, "train_speed(iter/s)": 0.2012 }, { "acc": 0.77418184, "epoch": 1.6443517694921186, "grad_norm": 4.25, "learning_rate": 8.064242494700581e-07, "loss": 0.79210043, "memory(GiB)": 138.1, "step": 70480, "train_speed(iter/s)": 0.201215 }, { "acc": 0.7854991, "epoch": 1.6445850770644075, "grad_norm": 9.3125, "learning_rate": 8.053957945710633e-07, "loss": 0.7795774, "memory(GiB)": 138.1, "step": 70490, "train_speed(iter/s)": 0.201229 }, { "acc": 0.76263819, "epoch": 1.6448183846366964, "grad_norm": 5.75, "learning_rate": 8.043679384506014e-07, "loss": 0.83604841, "memory(GiB)": 138.1, "step": 70500, "train_speed(iter/s)": 0.201243 }, { "epoch": 1.6448183846366964, "eval_acc": 0.7446782069803348, "eval_loss": 0.8044106364250183, "eval_runtime": 1269.9954, "eval_samples_per_second": 28.339, "eval_steps_per_second": 14.17, "step": 70500 }, { "acc": 0.77483778, "epoch": 1.6450516922089853, "grad_norm": 5.84375, "learning_rate": 8.033406812553962e-07, "loss": 0.81247177, "memory(GiB)": 138.1, "step": 70510, "train_speed(iter/s)": 0.200516 }, { "acc": 0.78622189, "epoch": 1.6452849997812742, "grad_norm": 7.625, "learning_rate": 8.023140231320919e-07, "loss": 0.77647934, "memory(GiB)": 138.1, "step": 70520, "train_speed(iter/s)": 0.200532 }, { "acc": 0.76669769, "epoch": 1.645518307353563, "grad_norm": 4.9375, "learning_rate": 8.012879642272392e-07, "loss": 0.84020367, "memory(GiB)": 138.1, "step": 70530, "train_speed(iter/s)": 0.200548 }, { "acc": 0.77495613, "epoch": 1.645751614925852, "grad_norm": 4.15625, "learning_rate": 8.002625046873114e-07, "loss": 0.81265011, "memory(GiB)": 138.1, "step": 70540, "train_speed(iter/s)": 0.200562 }, { "acc": 0.77314444, "epoch": 1.6459849224981409, "grad_norm": 4.90625, "learning_rate": 7.992376446586891e-07, "loss": 0.81458588, "memory(GiB)": 138.1, "step": 70550, "train_speed(iter/s)": 0.200577 }, { "acc": 0.78395252, "epoch": 1.6462182300704298, "grad_norm": 6.5625, "learning_rate": 7.982133842876744e-07, "loss": 0.7673286, "memory(GiB)": 138.1, "step": 70560, "train_speed(iter/s)": 0.200592 }, { "acc": 0.79952912, "epoch": 1.6464515376427187, "grad_norm": 6.5625, "learning_rate": 7.971897237204785e-07, "loss": 0.71654468, "memory(GiB)": 138.1, "step": 70570, "train_speed(iter/s)": 0.200608 }, { "acc": 0.77154684, "epoch": 1.6466848452150074, "grad_norm": 5.4375, "learning_rate": 7.961666631032273e-07, "loss": 0.8394556, "memory(GiB)": 138.1, "step": 70580, "train_speed(iter/s)": 0.200622 }, { "acc": 0.78006334, "epoch": 1.6469181527872965, "grad_norm": 5.8125, "learning_rate": 7.951442025819651e-07, "loss": 0.79687848, "memory(GiB)": 138.1, "step": 70590, "train_speed(iter/s)": 0.200636 }, { "acc": 0.77312818, "epoch": 1.6471514603595852, "grad_norm": 12.5625, "learning_rate": 7.941223423026445e-07, "loss": 0.82311382, "memory(GiB)": 138.1, "step": 70600, "train_speed(iter/s)": 0.20065 }, { "acc": 0.78867445, "epoch": 1.6473847679318743, "grad_norm": 5.8125, "learning_rate": 7.931010824111396e-07, "loss": 0.76376724, "memory(GiB)": 138.1, "step": 70610, "train_speed(iter/s)": 0.200664 }, { "acc": 0.78184714, "epoch": 1.647618075504163, "grad_norm": 5.84375, "learning_rate": 7.920804230532331e-07, "loss": 0.77635255, "memory(GiB)": 138.1, "step": 70620, "train_speed(iter/s)": 0.200679 }, { "acc": 0.79582644, "epoch": 1.647851383076452, "grad_norm": 6.21875, "learning_rate": 7.910603643746223e-07, "loss": 0.72300158, "memory(GiB)": 138.1, "step": 70630, "train_speed(iter/s)": 0.200693 }, { "acc": 0.76628838, "epoch": 1.6480846906487407, "grad_norm": 6.71875, "learning_rate": 7.90040906520923e-07, "loss": 0.85222702, "memory(GiB)": 138.1, "step": 70640, "train_speed(iter/s)": 0.200709 }, { "acc": 0.7875648, "epoch": 1.6483179982210299, "grad_norm": 6.40625, "learning_rate": 7.890220496376616e-07, "loss": 0.76165543, "memory(GiB)": 138.1, "step": 70650, "train_speed(iter/s)": 0.200724 }, { "acc": 0.78504138, "epoch": 1.6485513057933185, "grad_norm": 4.65625, "learning_rate": 7.880037938702789e-07, "loss": 0.74155846, "memory(GiB)": 138.1, "step": 70660, "train_speed(iter/s)": 0.200738 }, { "acc": 0.78913331, "epoch": 1.6487846133656077, "grad_norm": 5.4375, "learning_rate": 7.869861393641304e-07, "loss": 0.75573359, "memory(GiB)": 138.1, "step": 70670, "train_speed(iter/s)": 0.200752 }, { "acc": 0.77992077, "epoch": 1.6490179209378963, "grad_norm": 6.5, "learning_rate": 7.859690862644876e-07, "loss": 0.78720312, "memory(GiB)": 138.1, "step": 70680, "train_speed(iter/s)": 0.200767 }, { "acc": 0.77604442, "epoch": 1.6492512285101855, "grad_norm": 6.6875, "learning_rate": 7.849526347165321e-07, "loss": 0.83598385, "memory(GiB)": 138.1, "step": 70690, "train_speed(iter/s)": 0.200782 }, { "acc": 0.78691902, "epoch": 1.6494845360824741, "grad_norm": 8.875, "learning_rate": 7.83936784865365e-07, "loss": 0.77053566, "memory(GiB)": 138.1, "step": 70700, "train_speed(iter/s)": 0.200796 }, { "acc": 0.79002995, "epoch": 1.6497178436547633, "grad_norm": 3.6875, "learning_rate": 7.829215368559967e-07, "loss": 0.74581184, "memory(GiB)": 138.1, "step": 70710, "train_speed(iter/s)": 0.200811 }, { "acc": 0.78356628, "epoch": 1.649951151227052, "grad_norm": 5.28125, "learning_rate": 7.819068908333532e-07, "loss": 0.76630058, "memory(GiB)": 138.1, "step": 70720, "train_speed(iter/s)": 0.200827 }, { "acc": 0.79238472, "epoch": 1.650184458799341, "grad_norm": 5.21875, "learning_rate": 7.808928469422766e-07, "loss": 0.7440876, "memory(GiB)": 138.1, "step": 70730, "train_speed(iter/s)": 0.200841 }, { "acc": 0.7771996, "epoch": 1.6504177663716297, "grad_norm": 5.875, "learning_rate": 7.798794053275193e-07, "loss": 0.80373068, "memory(GiB)": 138.1, "step": 70740, "train_speed(iter/s)": 0.200856 }, { "acc": 0.75577822, "epoch": 1.6506510739439189, "grad_norm": 5.5625, "learning_rate": 7.78866566133753e-07, "loss": 0.90629873, "memory(GiB)": 138.1, "step": 70750, "train_speed(iter/s)": 0.200869 }, { "acc": 0.79528246, "epoch": 1.6508843815162075, "grad_norm": 5.84375, "learning_rate": 7.778543295055563e-07, "loss": 0.73617687, "memory(GiB)": 138.1, "step": 70760, "train_speed(iter/s)": 0.200884 }, { "acc": 0.77932301, "epoch": 1.6511176890884964, "grad_norm": 5.53125, "learning_rate": 7.768426955874287e-07, "loss": 0.78966246, "memory(GiB)": 138.1, "step": 70770, "train_speed(iter/s)": 0.2009 }, { "acc": 0.78333249, "epoch": 1.6513509966607853, "grad_norm": 5.46875, "learning_rate": 7.758316645237791e-07, "loss": 0.76522617, "memory(GiB)": 138.1, "step": 70780, "train_speed(iter/s)": 0.200915 }, { "acc": 0.78264174, "epoch": 1.6515843042330742, "grad_norm": 4.59375, "learning_rate": 7.748212364589314e-07, "loss": 0.77815781, "memory(GiB)": 138.1, "step": 70790, "train_speed(iter/s)": 0.20093 }, { "acc": 0.7721468, "epoch": 1.6518176118053631, "grad_norm": 6.09375, "learning_rate": 7.738114115371254e-07, "loss": 0.82284946, "memory(GiB)": 138.1, "step": 70800, "train_speed(iter/s)": 0.200945 }, { "acc": 0.77733574, "epoch": 1.652050919377652, "grad_norm": 5.71875, "learning_rate": 7.728021899025124e-07, "loss": 0.80925665, "memory(GiB)": 138.1, "step": 70810, "train_speed(iter/s)": 0.20096 }, { "acc": 0.79648423, "epoch": 1.652284226949941, "grad_norm": 4.53125, "learning_rate": 7.717935716991592e-07, "loss": 0.72788992, "memory(GiB)": 138.1, "step": 70820, "train_speed(iter/s)": 0.200975 }, { "acc": 0.7860816, "epoch": 1.6525175345222298, "grad_norm": 7.125, "learning_rate": 7.707855570710443e-07, "loss": 0.75475664, "memory(GiB)": 138.1, "step": 70830, "train_speed(iter/s)": 0.20099 }, { "acc": 0.77979727, "epoch": 1.6527508420945187, "grad_norm": 5.03125, "learning_rate": 7.69778146162064e-07, "loss": 0.80329094, "memory(GiB)": 138.1, "step": 70840, "train_speed(iter/s)": 0.201005 }, { "acc": 0.77364244, "epoch": 1.6529841496668076, "grad_norm": 5.875, "learning_rate": 7.68771339116024e-07, "loss": 0.82023067, "memory(GiB)": 138.1, "step": 70850, "train_speed(iter/s)": 0.201019 }, { "acc": 0.77010069, "epoch": 1.6532174572390965, "grad_norm": 5.65625, "learning_rate": 7.677651360766453e-07, "loss": 0.85668049, "memory(GiB)": 138.1, "step": 70860, "train_speed(iter/s)": 0.201034 }, { "acc": 0.7844202, "epoch": 1.6534507648113854, "grad_norm": 7.375, "learning_rate": 7.667595371875663e-07, "loss": 0.7779212, "memory(GiB)": 138.1, "step": 70870, "train_speed(iter/s)": 0.201048 }, { "acc": 0.77224879, "epoch": 1.6536840723836743, "grad_norm": 5.03125, "learning_rate": 7.657545425923313e-07, "loss": 0.82133636, "memory(GiB)": 138.1, "step": 70880, "train_speed(iter/s)": 0.201062 }, { "acc": 0.76610975, "epoch": 1.6539173799559632, "grad_norm": 5.46875, "learning_rate": 7.647501524344064e-07, "loss": 0.8340517, "memory(GiB)": 138.1, "step": 70890, "train_speed(iter/s)": 0.201077 }, { "acc": 0.78516321, "epoch": 1.6541506875282521, "grad_norm": 4.75, "learning_rate": 7.637463668571659e-07, "loss": 0.76711388, "memory(GiB)": 138.1, "step": 70900, "train_speed(iter/s)": 0.201091 }, { "acc": 0.77286749, "epoch": 1.654383995100541, "grad_norm": 5.375, "learning_rate": 7.627431860039019e-07, "loss": 0.81947851, "memory(GiB)": 138.1, "step": 70910, "train_speed(iter/s)": 0.201105 }, { "acc": 0.76813011, "epoch": 1.65461730267283, "grad_norm": 5.71875, "learning_rate": 7.617406100178171e-07, "loss": 0.83200006, "memory(GiB)": 138.1, "step": 70920, "train_speed(iter/s)": 0.201119 }, { "acc": 0.78832273, "epoch": 1.6548506102451188, "grad_norm": 5.03125, "learning_rate": 7.607386390420279e-07, "loss": 0.74536948, "memory(GiB)": 138.1, "step": 70930, "train_speed(iter/s)": 0.201134 }, { "acc": 0.7811326, "epoch": 1.6550839178174077, "grad_norm": 5.4375, "learning_rate": 7.597372732195674e-07, "loss": 0.78975849, "memory(GiB)": 138.1, "step": 70940, "train_speed(iter/s)": 0.201149 }, { "acc": 0.77318063, "epoch": 1.6553172253896964, "grad_norm": 6.75, "learning_rate": 7.587365126933782e-07, "loss": 0.8096487, "memory(GiB)": 138.1, "step": 70950, "train_speed(iter/s)": 0.201163 }, { "acc": 0.78359327, "epoch": 1.6555505329619855, "grad_norm": 5.28125, "learning_rate": 7.577363576063212e-07, "loss": 0.77436295, "memory(GiB)": 138.1, "step": 70960, "train_speed(iter/s)": 0.201178 }, { "acc": 0.78843203, "epoch": 1.6557838405342742, "grad_norm": 5.46875, "learning_rate": 7.567368081011656e-07, "loss": 0.74306149, "memory(GiB)": 138.1, "step": 70970, "train_speed(iter/s)": 0.201191 }, { "acc": 0.78639054, "epoch": 1.6560171481065633, "grad_norm": 7.375, "learning_rate": 7.557378643206003e-07, "loss": 0.78001995, "memory(GiB)": 138.1, "step": 70980, "train_speed(iter/s)": 0.201206 }, { "acc": 0.79448624, "epoch": 1.656250455678852, "grad_norm": 7.6875, "learning_rate": 7.547395264072193e-07, "loss": 0.74755363, "memory(GiB)": 138.1, "step": 70990, "train_speed(iter/s)": 0.20122 }, { "acc": 0.78536434, "epoch": 1.656483763251141, "grad_norm": 6.78125, "learning_rate": 7.537417945035391e-07, "loss": 0.76806111, "memory(GiB)": 138.1, "step": 71000, "train_speed(iter/s)": 0.201235 }, { "epoch": 1.656483763251141, "eval_acc": 0.7447166415960871, "eval_loss": 0.8044018745422363, "eval_runtime": 1270.2511, "eval_samples_per_second": 28.334, "eval_steps_per_second": 14.167, "step": 71000 }, { "acc": 0.78267994, "epoch": 1.6567170708234298, "grad_norm": 5.5625, "learning_rate": 7.52744668751984e-07, "loss": 0.78940582, "memory(GiB)": 138.1, "step": 71010, "train_speed(iter/s)": 0.200514 }, { "acc": 0.78548698, "epoch": 1.656950378395719, "grad_norm": 9.375, "learning_rate": 7.517481492948925e-07, "loss": 0.77665896, "memory(GiB)": 138.1, "step": 71020, "train_speed(iter/s)": 0.200527 }, { "acc": 0.76159725, "epoch": 1.6571836859680076, "grad_norm": 5.84375, "learning_rate": 7.507522362745195e-07, "loss": 0.8417695, "memory(GiB)": 138.1, "step": 71030, "train_speed(iter/s)": 0.200541 }, { "acc": 0.78062215, "epoch": 1.6574169935402967, "grad_norm": 5.59375, "learning_rate": 7.497569298330293e-07, "loss": 0.7981185, "memory(GiB)": 138.1, "step": 71040, "train_speed(iter/s)": 0.200557 }, { "acc": 0.76416349, "epoch": 1.6576503011125854, "grad_norm": 6.3125, "learning_rate": 7.487622301125041e-07, "loss": 0.87657032, "memory(GiB)": 138.1, "step": 71050, "train_speed(iter/s)": 0.200572 }, { "acc": 0.78121519, "epoch": 1.6578836086848745, "grad_norm": 4.65625, "learning_rate": 7.477681372549355e-07, "loss": 0.77680473, "memory(GiB)": 138.1, "step": 71060, "train_speed(iter/s)": 0.200585 }, { "acc": 0.77829385, "epoch": 1.6581169162571632, "grad_norm": 5.75, "learning_rate": 7.467746514022284e-07, "loss": 0.80236874, "memory(GiB)": 138.1, "step": 71070, "train_speed(iter/s)": 0.200599 }, { "acc": 0.76091347, "epoch": 1.6583502238294523, "grad_norm": 5.5625, "learning_rate": 7.457817726962058e-07, "loss": 0.86437483, "memory(GiB)": 138.1, "step": 71080, "train_speed(iter/s)": 0.200613 }, { "acc": 0.79476452, "epoch": 1.658583531401741, "grad_norm": 5.78125, "learning_rate": 7.447895012785983e-07, "loss": 0.73813944, "memory(GiB)": 138.1, "step": 71090, "train_speed(iter/s)": 0.200627 }, { "acc": 0.77920585, "epoch": 1.65881683897403, "grad_norm": 5.09375, "learning_rate": 7.437978372910554e-07, "loss": 0.80791998, "memory(GiB)": 138.1, "step": 71100, "train_speed(iter/s)": 0.200641 }, { "acc": 0.7747447, "epoch": 1.6590501465463188, "grad_norm": 4.9375, "learning_rate": 7.428067808751327e-07, "loss": 0.80314159, "memory(GiB)": 138.1, "step": 71110, "train_speed(iter/s)": 0.200656 }, { "acc": 0.78580241, "epoch": 1.659283454118608, "grad_norm": 5.71875, "learning_rate": 7.41816332172306e-07, "loss": 0.77066259, "memory(GiB)": 138.1, "step": 71120, "train_speed(iter/s)": 0.200671 }, { "acc": 0.78698373, "epoch": 1.6595167616908966, "grad_norm": 6.28125, "learning_rate": 7.408264913239598e-07, "loss": 0.74832349, "memory(GiB)": 138.1, "step": 71130, "train_speed(iter/s)": 0.200685 }, { "acc": 0.77769785, "epoch": 1.6597500692631855, "grad_norm": 5.96875, "learning_rate": 7.398372584713964e-07, "loss": 0.78736668, "memory(GiB)": 138.1, "step": 71140, "train_speed(iter/s)": 0.2007 }, { "acc": 0.78976698, "epoch": 1.6599833768354744, "grad_norm": 6.0625, "learning_rate": 7.388486337558265e-07, "loss": 0.75616255, "memory(GiB)": 138.1, "step": 71150, "train_speed(iter/s)": 0.200714 }, { "acc": 0.80499172, "epoch": 1.6602166844077633, "grad_norm": 5.5, "learning_rate": 7.378606173183749e-07, "loss": 0.70488954, "memory(GiB)": 138.1, "step": 71160, "train_speed(iter/s)": 0.200729 }, { "acc": 0.79924831, "epoch": 1.6604499919800522, "grad_norm": 5.5625, "learning_rate": 7.36873209300083e-07, "loss": 0.71615515, "memory(GiB)": 138.1, "step": 71170, "train_speed(iter/s)": 0.200743 }, { "acc": 0.77024212, "epoch": 1.660683299552341, "grad_norm": 5.1875, "learning_rate": 7.35886409841901e-07, "loss": 0.82428303, "memory(GiB)": 138.1, "step": 71180, "train_speed(iter/s)": 0.200758 }, { "acc": 0.78113737, "epoch": 1.66091660712463, "grad_norm": 4.3125, "learning_rate": 7.349002190846965e-07, "loss": 0.77647839, "memory(GiB)": 138.1, "step": 71190, "train_speed(iter/s)": 0.200773 }, { "acc": 0.78592319, "epoch": 1.6611499146969189, "grad_norm": 4.5625, "learning_rate": 7.339146371692468e-07, "loss": 0.77881441, "memory(GiB)": 138.1, "step": 71200, "train_speed(iter/s)": 0.200787 }, { "acc": 0.78388247, "epoch": 1.6613832222692078, "grad_norm": 7.34375, "learning_rate": 7.329296642362438e-07, "loss": 0.77189293, "memory(GiB)": 138.1, "step": 71210, "train_speed(iter/s)": 0.200803 }, { "acc": 0.77709417, "epoch": 1.6616165298414967, "grad_norm": 12.875, "learning_rate": 7.319453004262911e-07, "loss": 0.80911427, "memory(GiB)": 138.1, "step": 71220, "train_speed(iter/s)": 0.200818 }, { "acc": 0.78052092, "epoch": 1.6618498374137856, "grad_norm": 6.15625, "learning_rate": 7.309615458799058e-07, "loss": 0.80394659, "memory(GiB)": 138.1, "step": 71230, "train_speed(iter/s)": 0.200832 }, { "acc": 0.78070478, "epoch": 1.6620831449860745, "grad_norm": 4.84375, "learning_rate": 7.299784007375205e-07, "loss": 0.78910527, "memory(GiB)": 138.1, "step": 71240, "train_speed(iter/s)": 0.200846 }, { "acc": 0.79349833, "epoch": 1.6623164525583634, "grad_norm": 5.375, "learning_rate": 7.289958651394774e-07, "loss": 0.7258852, "memory(GiB)": 138.1, "step": 71250, "train_speed(iter/s)": 0.20086 }, { "acc": 0.76768055, "epoch": 1.6625497601306523, "grad_norm": 5.28125, "learning_rate": 7.280139392260344e-07, "loss": 0.83310871, "memory(GiB)": 138.1, "step": 71260, "train_speed(iter/s)": 0.200874 }, { "acc": 0.76861515, "epoch": 1.6627830677029412, "grad_norm": 7.6875, "learning_rate": 7.270326231373598e-07, "loss": 0.81961651, "memory(GiB)": 138.1, "step": 71270, "train_speed(iter/s)": 0.20089 }, { "acc": 0.81069078, "epoch": 1.66301637527523, "grad_norm": 6.3125, "learning_rate": 7.260519170135383e-07, "loss": 0.6584691, "memory(GiB)": 138.1, "step": 71280, "train_speed(iter/s)": 0.200904 }, { "acc": 0.7901927, "epoch": 1.663249682847519, "grad_norm": 6.90625, "learning_rate": 7.25071820994564e-07, "loss": 0.76735644, "memory(GiB)": 138.1, "step": 71290, "train_speed(iter/s)": 0.200919 }, { "acc": 0.77350497, "epoch": 1.6634829904198078, "grad_norm": 6.15625, "learning_rate": 7.240923352203438e-07, "loss": 0.81890402, "memory(GiB)": 138.1, "step": 71300, "train_speed(iter/s)": 0.200934 }, { "acc": 0.77600427, "epoch": 1.6637162979920967, "grad_norm": 4.46875, "learning_rate": 7.231134598307022e-07, "loss": 0.80230637, "memory(GiB)": 138.1, "step": 71310, "train_speed(iter/s)": 0.200949 }, { "acc": 0.78995695, "epoch": 1.6639496055643856, "grad_norm": 4.40625, "learning_rate": 7.221351949653715e-07, "loss": 0.73928757, "memory(GiB)": 138.1, "step": 71320, "train_speed(iter/s)": 0.200965 }, { "acc": 0.78438892, "epoch": 1.6641829131366745, "grad_norm": 4.8125, "learning_rate": 7.211575407639987e-07, "loss": 0.77428908, "memory(GiB)": 138.1, "step": 71330, "train_speed(iter/s)": 0.200979 }, { "acc": 0.77166224, "epoch": 1.6644162207089632, "grad_norm": 7.84375, "learning_rate": 7.20180497366143e-07, "loss": 0.83759232, "memory(GiB)": 138.1, "step": 71340, "train_speed(iter/s)": 0.200994 }, { "acc": 0.79361496, "epoch": 1.6646495282812523, "grad_norm": 4.28125, "learning_rate": 7.192040649112797e-07, "loss": 0.74620438, "memory(GiB)": 138.1, "step": 71350, "train_speed(iter/s)": 0.201009 }, { "acc": 0.77841673, "epoch": 1.664882835853541, "grad_norm": 5.0625, "learning_rate": 7.182282435387922e-07, "loss": 0.77743587, "memory(GiB)": 138.1, "step": 71360, "train_speed(iter/s)": 0.201024 }, { "acc": 0.77280235, "epoch": 1.6651161434258301, "grad_norm": 5.4375, "learning_rate": 7.172530333879774e-07, "loss": 0.83453283, "memory(GiB)": 138.1, "step": 71370, "train_speed(iter/s)": 0.201039 }, { "acc": 0.78664598, "epoch": 1.6653494509981188, "grad_norm": 6.0, "learning_rate": 7.1627843459805e-07, "loss": 0.78319445, "memory(GiB)": 138.1, "step": 71380, "train_speed(iter/s)": 0.201053 }, { "acc": 0.77737265, "epoch": 1.665582758570408, "grad_norm": 5.65625, "learning_rate": 7.153044473081299e-07, "loss": 0.79047518, "memory(GiB)": 138.1, "step": 71390, "train_speed(iter/s)": 0.201068 }, { "acc": 0.7710959, "epoch": 1.6658160661426966, "grad_norm": 6.0625, "learning_rate": 7.143310716572565e-07, "loss": 0.83085079, "memory(GiB)": 138.1, "step": 71400, "train_speed(iter/s)": 0.201083 }, { "acc": 0.797966, "epoch": 1.6660493737149857, "grad_norm": 5.75, "learning_rate": 7.133583077843776e-07, "loss": 0.70618973, "memory(GiB)": 138.1, "step": 71410, "train_speed(iter/s)": 0.201099 }, { "acc": 0.77575045, "epoch": 1.6662826812872744, "grad_norm": 5.34375, "learning_rate": 7.12386155828354e-07, "loss": 0.82906694, "memory(GiB)": 138.1, "step": 71420, "train_speed(iter/s)": 0.201114 }, { "acc": 0.75881324, "epoch": 1.6665159888595635, "grad_norm": 5.65625, "learning_rate": 7.114146159279622e-07, "loss": 0.8684248, "memory(GiB)": 138.1, "step": 71430, "train_speed(iter/s)": 0.201129 }, { "acc": 0.77309985, "epoch": 1.6667492964318522, "grad_norm": 6.28125, "learning_rate": 7.104436882218879e-07, "loss": 0.81412992, "memory(GiB)": 138.1, "step": 71440, "train_speed(iter/s)": 0.201143 }, { "acc": 0.78402615, "epoch": 1.6669826040041413, "grad_norm": 3.8125, "learning_rate": 7.094733728487313e-07, "loss": 0.78737326, "memory(GiB)": 138.1, "step": 71450, "train_speed(iter/s)": 0.201157 }, { "acc": 0.79735641, "epoch": 1.66721591157643, "grad_norm": 6.6875, "learning_rate": 7.085036699470027e-07, "loss": 0.71820598, "memory(GiB)": 138.1, "step": 71460, "train_speed(iter/s)": 0.201172 }, { "acc": 0.77270122, "epoch": 1.6674492191487191, "grad_norm": 5.03125, "learning_rate": 7.075345796551303e-07, "loss": 0.82472687, "memory(GiB)": 138.1, "step": 71470, "train_speed(iter/s)": 0.201185 }, { "acc": 0.79649363, "epoch": 1.6676825267210078, "grad_norm": 6.3125, "learning_rate": 7.065661021114478e-07, "loss": 0.72360072, "memory(GiB)": 138.1, "step": 71480, "train_speed(iter/s)": 0.201199 }, { "acc": 0.78411646, "epoch": 1.667915834293297, "grad_norm": 6.09375, "learning_rate": 7.055982374542086e-07, "loss": 0.75441995, "memory(GiB)": 138.1, "step": 71490, "train_speed(iter/s)": 0.201214 }, { "acc": 0.76906228, "epoch": 1.6681491418655856, "grad_norm": 5.0, "learning_rate": 7.046309858215733e-07, "loss": 0.84269314, "memory(GiB)": 138.1, "step": 71500, "train_speed(iter/s)": 0.201228 }, { "epoch": 1.6681491418655856, "eval_acc": 0.7446756446726179, "eval_loss": 0.8043976426124573, "eval_runtime": 1270.6288, "eval_samples_per_second": 28.325, "eval_steps_per_second": 14.163, "step": 71500 }, { "acc": 0.77692218, "epoch": 1.6683824494378747, "grad_norm": 5.90625, "learning_rate": 7.036643473516164e-07, "loss": 0.81663551, "memory(GiB)": 138.1, "step": 71510, "train_speed(iter/s)": 0.200512 }, { "acc": 0.77694969, "epoch": 1.6686157570101634, "grad_norm": 5.625, "learning_rate": 7.026983221823264e-07, "loss": 0.81028423, "memory(GiB)": 138.1, "step": 71520, "train_speed(iter/s)": 0.200526 }, { "acc": 0.77378316, "epoch": 1.6688490645824523, "grad_norm": 6.46875, "learning_rate": 7.017329104516013e-07, "loss": 0.7947732, "memory(GiB)": 138.1, "step": 71530, "train_speed(iter/s)": 0.200541 }, { "acc": 0.77548847, "epoch": 1.6690823721547412, "grad_norm": 4.65625, "learning_rate": 7.007681122972559e-07, "loss": 0.80890713, "memory(GiB)": 138.1, "step": 71540, "train_speed(iter/s)": 0.200555 }, { "acc": 0.78633842, "epoch": 1.66931567972703, "grad_norm": 5.4375, "learning_rate": 6.998039278570134e-07, "loss": 0.74881501, "memory(GiB)": 138.1, "step": 71550, "train_speed(iter/s)": 0.20057 }, { "acc": 0.76336951, "epoch": 1.669548987299319, "grad_norm": 5.78125, "learning_rate": 6.988403572685115e-07, "loss": 0.82800121, "memory(GiB)": 138.1, "step": 71560, "train_speed(iter/s)": 0.200585 }, { "acc": 0.78569932, "epoch": 1.669782294871608, "grad_norm": 4.25, "learning_rate": 6.978774006692984e-07, "loss": 0.76846313, "memory(GiB)": 138.1, "step": 71570, "train_speed(iter/s)": 0.200599 }, { "acc": 0.78505049, "epoch": 1.6700156024438968, "grad_norm": 4.0625, "learning_rate": 6.969150581968359e-07, "loss": 0.76446638, "memory(GiB)": 138.1, "step": 71580, "train_speed(iter/s)": 0.200613 }, { "acc": 0.79139848, "epoch": 1.6702489100161857, "grad_norm": 3.65625, "learning_rate": 6.959533299885001e-07, "loss": 0.75788527, "memory(GiB)": 138.1, "step": 71590, "train_speed(iter/s)": 0.200628 }, { "acc": 0.78600264, "epoch": 1.6704822175884746, "grad_norm": 5.28125, "learning_rate": 6.949922161815748e-07, "loss": 0.76887283, "memory(GiB)": 138.1, "step": 71600, "train_speed(iter/s)": 0.200643 }, { "acc": 0.77672091, "epoch": 1.6707155251607635, "grad_norm": 5.59375, "learning_rate": 6.94031716913261e-07, "loss": 0.80708447, "memory(GiB)": 138.1, "step": 71610, "train_speed(iter/s)": 0.200657 }, { "acc": 0.78550982, "epoch": 1.6709488327330524, "grad_norm": 6.84375, "learning_rate": 6.930718323206676e-07, "loss": 0.78040328, "memory(GiB)": 138.1, "step": 71620, "train_speed(iter/s)": 0.200672 }, { "acc": 0.78563089, "epoch": 1.6711821403053413, "grad_norm": 7.84375, "learning_rate": 6.921125625408198e-07, "loss": 0.75789003, "memory(GiB)": 138.1, "step": 71630, "train_speed(iter/s)": 0.200686 }, { "acc": 0.77634792, "epoch": 1.6714154478776302, "grad_norm": 7.25, "learning_rate": 6.911539077106527e-07, "loss": 0.81327028, "memory(GiB)": 138.1, "step": 71640, "train_speed(iter/s)": 0.200701 }, { "acc": 0.75646667, "epoch": 1.671648755449919, "grad_norm": 5.09375, "learning_rate": 6.901958679670123e-07, "loss": 0.90101776, "memory(GiB)": 138.1, "step": 71650, "train_speed(iter/s)": 0.200715 }, { "acc": 0.80443058, "epoch": 1.671882063022208, "grad_norm": 5.625, "learning_rate": 6.892384434466609e-07, "loss": 0.70652561, "memory(GiB)": 138.1, "step": 71660, "train_speed(iter/s)": 0.200729 }, { "acc": 0.77762895, "epoch": 1.6721153705944969, "grad_norm": 6.875, "learning_rate": 6.882816342862692e-07, "loss": 0.80890656, "memory(GiB)": 138.1, "step": 71670, "train_speed(iter/s)": 0.200743 }, { "acc": 0.78075476, "epoch": 1.6723486781667858, "grad_norm": 4.5, "learning_rate": 6.873254406224223e-07, "loss": 0.79064302, "memory(GiB)": 138.1, "step": 71680, "train_speed(iter/s)": 0.200758 }, { "acc": 0.78527708, "epoch": 1.6725819857390747, "grad_norm": 6.09375, "learning_rate": 6.863698625916137e-07, "loss": 0.74829164, "memory(GiB)": 138.1, "step": 71690, "train_speed(iter/s)": 0.200773 }, { "acc": 0.77983356, "epoch": 1.6728152933113636, "grad_norm": 4.25, "learning_rate": 6.854149003302562e-07, "loss": 0.77540479, "memory(GiB)": 138.1, "step": 71700, "train_speed(iter/s)": 0.200787 }, { "acc": 0.79154415, "epoch": 1.6730486008836525, "grad_norm": 4.90625, "learning_rate": 6.844605539746679e-07, "loss": 0.7527297, "memory(GiB)": 138.1, "step": 71710, "train_speed(iter/s)": 0.200801 }, { "acc": 0.81196423, "epoch": 1.6732819084559414, "grad_norm": 3.984375, "learning_rate": 6.835068236610809e-07, "loss": 0.66903067, "memory(GiB)": 138.1, "step": 71720, "train_speed(iter/s)": 0.200815 }, { "acc": 0.80410776, "epoch": 1.67351521602823, "grad_norm": 5.28125, "learning_rate": 6.825537095256418e-07, "loss": 0.68753047, "memory(GiB)": 138.1, "step": 71730, "train_speed(iter/s)": 0.200829 }, { "acc": 0.76747966, "epoch": 1.6737485236005192, "grad_norm": 4.625, "learning_rate": 6.816012117044052e-07, "loss": 0.83944092, "memory(GiB)": 138.1, "step": 71740, "train_speed(iter/s)": 0.200843 }, { "acc": 0.7719739, "epoch": 1.6739818311728079, "grad_norm": 4.8125, "learning_rate": 6.806493303333422e-07, "loss": 0.82619953, "memory(GiB)": 138.1, "step": 71750, "train_speed(iter/s)": 0.200859 }, { "acc": 0.77261276, "epoch": 1.674215138745097, "grad_norm": 8.25, "learning_rate": 6.796980655483315e-07, "loss": 0.80542164, "memory(GiB)": 138.1, "step": 71760, "train_speed(iter/s)": 0.200874 }, { "acc": 0.77031894, "epoch": 1.6744484463173857, "grad_norm": 4.46875, "learning_rate": 6.787474174851683e-07, "loss": 0.84091234, "memory(GiB)": 138.1, "step": 71770, "train_speed(iter/s)": 0.200889 }, { "acc": 0.78931608, "epoch": 1.6746817538896748, "grad_norm": 4.71875, "learning_rate": 6.777973862795556e-07, "loss": 0.76971698, "memory(GiB)": 138.1, "step": 71780, "train_speed(iter/s)": 0.200904 }, { "acc": 0.80558147, "epoch": 1.6749150614619635, "grad_norm": 5.0, "learning_rate": 6.768479720671106e-07, "loss": 0.69032288, "memory(GiB)": 138.1, "step": 71790, "train_speed(iter/s)": 0.200918 }, { "acc": 0.79586306, "epoch": 1.6751483690342526, "grad_norm": 3.1875, "learning_rate": 6.758991749833616e-07, "loss": 0.72956157, "memory(GiB)": 138.1, "step": 71800, "train_speed(iter/s)": 0.200932 }, { "acc": 0.77934403, "epoch": 1.6753816766065412, "grad_norm": 4.65625, "learning_rate": 6.749509951637484e-07, "loss": 0.78227091, "memory(GiB)": 138.1, "step": 71810, "train_speed(iter/s)": 0.200947 }, { "acc": 0.7744832, "epoch": 1.6756149841788304, "grad_norm": 7.0625, "learning_rate": 6.740034327436251e-07, "loss": 0.81382933, "memory(GiB)": 138.1, "step": 71820, "train_speed(iter/s)": 0.200961 }, { "acc": 0.77508135, "epoch": 1.675848291751119, "grad_norm": 5.84375, "learning_rate": 6.730564878582535e-07, "loss": 0.81892805, "memory(GiB)": 138.1, "step": 71830, "train_speed(iter/s)": 0.200975 }, { "acc": 0.80630512, "epoch": 1.6760815993234082, "grad_norm": 6.15625, "learning_rate": 6.721101606428132e-07, "loss": 0.70892029, "memory(GiB)": 138.1, "step": 71840, "train_speed(iter/s)": 0.200989 }, { "acc": 0.7737956, "epoch": 1.6763149068956968, "grad_norm": 5.8125, "learning_rate": 6.711644512323895e-07, "loss": 0.83846264, "memory(GiB)": 138.1, "step": 71850, "train_speed(iter/s)": 0.201004 }, { "acc": 0.76679764, "epoch": 1.676548214467986, "grad_norm": 4.3125, "learning_rate": 6.702193597619821e-07, "loss": 0.84609222, "memory(GiB)": 138.1, "step": 71860, "train_speed(iter/s)": 0.201018 }, { "acc": 0.75910959, "epoch": 1.6767815220402746, "grad_norm": 5.40625, "learning_rate": 6.692748863665044e-07, "loss": 0.85825319, "memory(GiB)": 138.1, "step": 71870, "train_speed(iter/s)": 0.201033 }, { "acc": 0.78103266, "epoch": 1.6770148296125638, "grad_norm": 5.34375, "learning_rate": 6.683310311807772e-07, "loss": 0.78897181, "memory(GiB)": 138.1, "step": 71880, "train_speed(iter/s)": 0.201048 }, { "acc": 0.78252711, "epoch": 1.6772481371848524, "grad_norm": 5.71875, "learning_rate": 6.673877943395385e-07, "loss": 0.7663425, "memory(GiB)": 138.1, "step": 71890, "train_speed(iter/s)": 0.201063 }, { "acc": 0.79238405, "epoch": 1.6774814447571416, "grad_norm": 6.40625, "learning_rate": 6.664451759774332e-07, "loss": 0.7801724, "memory(GiB)": 138.1, "step": 71900, "train_speed(iter/s)": 0.201077 }, { "acc": 0.78016853, "epoch": 1.6777147523294302, "grad_norm": 5.625, "learning_rate": 6.655031762290203e-07, "loss": 0.77946939, "memory(GiB)": 138.1, "step": 71910, "train_speed(iter/s)": 0.201091 }, { "acc": 0.75844526, "epoch": 1.6779480599017191, "grad_norm": 4.71875, "learning_rate": 6.645617952287686e-07, "loss": 0.85980473, "memory(GiB)": 138.1, "step": 71920, "train_speed(iter/s)": 0.201105 }, { "acc": 0.77880216, "epoch": 1.678181367474008, "grad_norm": 6.15625, "learning_rate": 6.636210331110621e-07, "loss": 0.78121481, "memory(GiB)": 138.1, "step": 71930, "train_speed(iter/s)": 0.201119 }, { "acc": 0.76943226, "epoch": 1.678414675046297, "grad_norm": 4.6875, "learning_rate": 6.626808900101939e-07, "loss": 0.82415466, "memory(GiB)": 138.1, "step": 71940, "train_speed(iter/s)": 0.201134 }, { "acc": 0.76650505, "epoch": 1.6786479826185858, "grad_norm": 5.59375, "learning_rate": 6.617413660603672e-07, "loss": 0.86484928, "memory(GiB)": 138.1, "step": 71950, "train_speed(iter/s)": 0.201149 }, { "acc": 0.78949447, "epoch": 1.6788812901908747, "grad_norm": 5.0, "learning_rate": 6.608024613957015e-07, "loss": 0.77047791, "memory(GiB)": 138.1, "step": 71960, "train_speed(iter/s)": 0.201164 }, { "acc": 0.77228599, "epoch": 1.6791145977631636, "grad_norm": 6.4375, "learning_rate": 6.598641761502222e-07, "loss": 0.82509651, "memory(GiB)": 138.1, "step": 71970, "train_speed(iter/s)": 0.201178 }, { "acc": 0.75860147, "epoch": 1.6793479053354525, "grad_norm": 5.4375, "learning_rate": 6.58926510457873e-07, "loss": 0.88990307, "memory(GiB)": 138.1, "step": 71980, "train_speed(iter/s)": 0.201192 }, { "acc": 0.77799234, "epoch": 1.6795812129077414, "grad_norm": 4.5625, "learning_rate": 6.579894644525026e-07, "loss": 0.83457985, "memory(GiB)": 138.1, "step": 71990, "train_speed(iter/s)": 0.201207 }, { "acc": 0.80525131, "epoch": 1.6798145204800303, "grad_norm": 3.59375, "learning_rate": 6.570530382678741e-07, "loss": 0.69075212, "memory(GiB)": 138.1, "step": 72000, "train_speed(iter/s)": 0.201221 }, { "epoch": 1.6798145204800303, "eval_acc": 0.7446879757785052, "eval_loss": 0.8043897747993469, "eval_runtime": 1271.86, "eval_samples_per_second": 28.298, "eval_steps_per_second": 14.149, "step": 72000 }, { "acc": 0.77684412, "epoch": 1.6800478280523192, "grad_norm": 7.34375, "learning_rate": 6.561172320376647e-07, "loss": 0.79601202, "memory(GiB)": 138.1, "step": 72010, "train_speed(iter/s)": 0.200509 }, { "acc": 0.76988397, "epoch": 1.6802811356246081, "grad_norm": 5.65625, "learning_rate": 6.551820458954561e-07, "loss": 0.80573511, "memory(GiB)": 138.1, "step": 72020, "train_speed(iter/s)": 0.200524 }, { "acc": 0.77471781, "epoch": 1.680514443196897, "grad_norm": 4.53125, "learning_rate": 6.5424747997475e-07, "loss": 0.79388332, "memory(GiB)": 138.1, "step": 72030, "train_speed(iter/s)": 0.200539 }, { "acc": 0.76678681, "epoch": 1.680747750769186, "grad_norm": 4.5, "learning_rate": 6.53313534408952e-07, "loss": 0.85442133, "memory(GiB)": 138.1, "step": 72040, "train_speed(iter/s)": 0.200554 }, { "acc": 0.78160801, "epoch": 1.6809810583414748, "grad_norm": 7.375, "learning_rate": 6.523802093313857e-07, "loss": 0.75970039, "memory(GiB)": 138.1, "step": 72050, "train_speed(iter/s)": 0.20057 }, { "acc": 0.75993977, "epoch": 1.6812143659137637, "grad_norm": 6.5625, "learning_rate": 6.514475048752805e-07, "loss": 0.88980999, "memory(GiB)": 138.1, "step": 72060, "train_speed(iter/s)": 0.200585 }, { "acc": 0.78098273, "epoch": 1.6814476734860526, "grad_norm": 4.4375, "learning_rate": 6.505154211737813e-07, "loss": 0.7791935, "memory(GiB)": 138.1, "step": 72070, "train_speed(iter/s)": 0.200599 }, { "acc": 0.78494759, "epoch": 1.6816809810583415, "grad_norm": 6.375, "learning_rate": 6.495839583599428e-07, "loss": 0.765938, "memory(GiB)": 138.1, "step": 72080, "train_speed(iter/s)": 0.200614 }, { "acc": 0.78756342, "epoch": 1.6819142886306304, "grad_norm": 4.90625, "learning_rate": 6.486531165667292e-07, "loss": 0.74883633, "memory(GiB)": 138.1, "step": 72090, "train_speed(iter/s)": 0.200629 }, { "acc": 0.78392973, "epoch": 1.6821475962029193, "grad_norm": 5.03125, "learning_rate": 6.477228959270199e-07, "loss": 0.78099174, "memory(GiB)": 138.1, "step": 72100, "train_speed(iter/s)": 0.200643 }, { "acc": 0.78292713, "epoch": 1.6823809037752082, "grad_norm": 5.46875, "learning_rate": 6.467932965736024e-07, "loss": 0.75630102, "memory(GiB)": 138.1, "step": 72110, "train_speed(iter/s)": 0.200657 }, { "acc": 0.78047161, "epoch": 1.682614211347497, "grad_norm": 5.125, "learning_rate": 6.458643186391789e-07, "loss": 0.76697688, "memory(GiB)": 138.1, "step": 72120, "train_speed(iter/s)": 0.200672 }, { "acc": 0.78350782, "epoch": 1.682847518919786, "grad_norm": 7.0, "learning_rate": 6.449359622563567e-07, "loss": 0.76841931, "memory(GiB)": 138.1, "step": 72130, "train_speed(iter/s)": 0.200686 }, { "acc": 0.79610729, "epoch": 1.6830808264920747, "grad_norm": 4.375, "learning_rate": 6.44008227557662e-07, "loss": 0.74552526, "memory(GiB)": 138.1, "step": 72140, "train_speed(iter/s)": 0.200701 }, { "acc": 0.75990648, "epoch": 1.6833141340643638, "grad_norm": 5.34375, "learning_rate": 6.430811146755272e-07, "loss": 0.85680218, "memory(GiB)": 138.1, "step": 72150, "train_speed(iter/s)": 0.200715 }, { "acc": 0.78126965, "epoch": 1.6835474416366525, "grad_norm": 6.0625, "learning_rate": 6.421546237422971e-07, "loss": 0.78917265, "memory(GiB)": 138.1, "step": 72160, "train_speed(iter/s)": 0.200729 }, { "acc": 0.77440681, "epoch": 1.6837807492089416, "grad_norm": 4.53125, "learning_rate": 6.412287548902291e-07, "loss": 0.82259197, "memory(GiB)": 138.1, "step": 72170, "train_speed(iter/s)": 0.200744 }, { "acc": 0.79978375, "epoch": 1.6840140567812303, "grad_norm": 9.0625, "learning_rate": 6.403035082514891e-07, "loss": 0.71059504, "memory(GiB)": 138.1, "step": 72180, "train_speed(iter/s)": 0.200758 }, { "acc": 0.78088136, "epoch": 1.6842473643535194, "grad_norm": 6.4375, "learning_rate": 6.393788839581578e-07, "loss": 0.78694282, "memory(GiB)": 138.1, "step": 72190, "train_speed(iter/s)": 0.200771 }, { "acc": 0.76632071, "epoch": 1.684480671925808, "grad_norm": 7.75, "learning_rate": 6.384548821422243e-07, "loss": 0.8362587, "memory(GiB)": 138.1, "step": 72200, "train_speed(iter/s)": 0.200785 }, { "acc": 0.75542583, "epoch": 1.6847139794980972, "grad_norm": 5.25, "learning_rate": 6.375315029355883e-07, "loss": 0.88243284, "memory(GiB)": 138.1, "step": 72210, "train_speed(iter/s)": 0.2008 }, { "acc": 0.79456859, "epoch": 1.6849472870703859, "grad_norm": 6.1875, "learning_rate": 6.366087464700637e-07, "loss": 0.7337564, "memory(GiB)": 138.1, "step": 72220, "train_speed(iter/s)": 0.200814 }, { "acc": 0.79592638, "epoch": 1.685180594642675, "grad_norm": 5.0, "learning_rate": 6.35686612877372e-07, "loss": 0.7540689, "memory(GiB)": 138.1, "step": 72230, "train_speed(iter/s)": 0.200828 }, { "acc": 0.76712937, "epoch": 1.6854139022149637, "grad_norm": 4.84375, "learning_rate": 6.34765102289151e-07, "loss": 0.85317764, "memory(GiB)": 138.1, "step": 72240, "train_speed(iter/s)": 0.200843 }, { "acc": 0.77703066, "epoch": 1.6856472097872528, "grad_norm": 4.875, "learning_rate": 6.338442148369406e-07, "loss": 0.77372303, "memory(GiB)": 138.1, "step": 72250, "train_speed(iter/s)": 0.200857 }, { "acc": 0.78281322, "epoch": 1.6858805173595415, "grad_norm": 4.5, "learning_rate": 6.329239506522017e-07, "loss": 0.77638769, "memory(GiB)": 138.1, "step": 72260, "train_speed(iter/s)": 0.200872 }, { "acc": 0.78430648, "epoch": 1.6861138249318306, "grad_norm": 7.125, "learning_rate": 6.320043098662992e-07, "loss": 0.7674891, "memory(GiB)": 138.1, "step": 72270, "train_speed(iter/s)": 0.200887 }, { "acc": 0.77010136, "epoch": 1.6863471325041193, "grad_norm": 4.6875, "learning_rate": 6.310852926105138e-07, "loss": 0.82719336, "memory(GiB)": 138.1, "step": 72280, "train_speed(iter/s)": 0.2009 }, { "acc": 0.78210573, "epoch": 1.6865804400764084, "grad_norm": 5.4375, "learning_rate": 6.301668990160331e-07, "loss": 0.79944468, "memory(GiB)": 138.1, "step": 72290, "train_speed(iter/s)": 0.200914 }, { "acc": 0.80032415, "epoch": 1.686813747648697, "grad_norm": 4.9375, "learning_rate": 6.292491292139574e-07, "loss": 0.72096748, "memory(GiB)": 138.1, "step": 72300, "train_speed(iter/s)": 0.200929 }, { "acc": 0.7697011, "epoch": 1.687047055220986, "grad_norm": 7.03125, "learning_rate": 6.283319833353002e-07, "loss": 0.82724371, "memory(GiB)": 138.1, "step": 72310, "train_speed(iter/s)": 0.200943 }, { "acc": 0.79757509, "epoch": 1.6872803627932749, "grad_norm": 6.6875, "learning_rate": 6.274154615109812e-07, "loss": 0.70658522, "memory(GiB)": 138.1, "step": 72320, "train_speed(iter/s)": 0.200957 }, { "acc": 0.79727383, "epoch": 1.6875136703655638, "grad_norm": 6.875, "learning_rate": 6.264995638718352e-07, "loss": 0.7160409, "memory(GiB)": 138.1, "step": 72330, "train_speed(iter/s)": 0.200971 }, { "acc": 0.76207228, "epoch": 1.6877469779378527, "grad_norm": 9.125, "learning_rate": 6.255842905486065e-07, "loss": 0.85418339, "memory(GiB)": 138.1, "step": 72340, "train_speed(iter/s)": 0.200986 }, { "acc": 0.77744265, "epoch": 1.6879802855101416, "grad_norm": 5.125, "learning_rate": 6.246696416719495e-07, "loss": 0.80833817, "memory(GiB)": 138.1, "step": 72350, "train_speed(iter/s)": 0.201 }, { "acc": 0.80837536, "epoch": 1.6882135930824305, "grad_norm": 5.4375, "learning_rate": 6.237556173724291e-07, "loss": 0.70104604, "memory(GiB)": 138.1, "step": 72360, "train_speed(iter/s)": 0.201015 }, { "acc": 0.79856534, "epoch": 1.6884469006547194, "grad_norm": 4.8125, "learning_rate": 6.228422177805244e-07, "loss": 0.70726795, "memory(GiB)": 138.1, "step": 72370, "train_speed(iter/s)": 0.20103 }, { "acc": 0.76452069, "epoch": 1.6886802082270083, "grad_norm": 8.4375, "learning_rate": 6.21929443026621e-07, "loss": 0.84759407, "memory(GiB)": 138.1, "step": 72380, "train_speed(iter/s)": 0.201045 }, { "acc": 0.78385258, "epoch": 1.6889135157992972, "grad_norm": 4.90625, "learning_rate": 6.210172932410169e-07, "loss": 0.76335421, "memory(GiB)": 138.1, "step": 72390, "train_speed(iter/s)": 0.201059 }, { "acc": 0.77586222, "epoch": 1.689146823371586, "grad_norm": 5.9375, "learning_rate": 6.20105768553923e-07, "loss": 0.79805136, "memory(GiB)": 138.1, "step": 72400, "train_speed(iter/s)": 0.201075 }, { "acc": 0.78417902, "epoch": 1.689380130943875, "grad_norm": 5.375, "learning_rate": 6.191948690954575e-07, "loss": 0.75983038, "memory(GiB)": 138.1, "step": 72410, "train_speed(iter/s)": 0.201089 }, { "acc": 0.78113298, "epoch": 1.6896134385161639, "grad_norm": 4.6875, "learning_rate": 6.182845949956523e-07, "loss": 0.80795841, "memory(GiB)": 138.1, "step": 72420, "train_speed(iter/s)": 0.201104 }, { "acc": 0.78343716, "epoch": 1.6898467460884528, "grad_norm": 5.1875, "learning_rate": 6.173749463844486e-07, "loss": 0.76608415, "memory(GiB)": 138.1, "step": 72430, "train_speed(iter/s)": 0.201118 }, { "acc": 0.79046288, "epoch": 1.6900800536607417, "grad_norm": 5.1875, "learning_rate": 6.164659233916976e-07, "loss": 0.75967202, "memory(GiB)": 138.1, "step": 72440, "train_speed(iter/s)": 0.201131 }, { "acc": 0.76052351, "epoch": 1.6903133612330306, "grad_norm": 6.0625, "learning_rate": 6.15557526147163e-07, "loss": 0.86396046, "memory(GiB)": 138.1, "step": 72450, "train_speed(iter/s)": 0.201145 }, { "acc": 0.7680316, "epoch": 1.6905466688053195, "grad_norm": 4.875, "learning_rate": 6.146497547805169e-07, "loss": 0.83324471, "memory(GiB)": 138.1, "step": 72460, "train_speed(iter/s)": 0.201159 }, { "acc": 0.78809872, "epoch": 1.6907799763776084, "grad_norm": 3.703125, "learning_rate": 6.137426094213466e-07, "loss": 0.74023752, "memory(GiB)": 138.1, "step": 72470, "train_speed(iter/s)": 0.201174 }, { "acc": 0.77969913, "epoch": 1.6910132839498973, "grad_norm": 4.71875, "learning_rate": 6.128360901991426e-07, "loss": 0.79022303, "memory(GiB)": 138.1, "step": 72480, "train_speed(iter/s)": 0.201189 }, { "acc": 0.80677795, "epoch": 1.691246591522186, "grad_norm": 10.1875, "learning_rate": 6.119301972433128e-07, "loss": 0.69705276, "memory(GiB)": 138.1, "step": 72490, "train_speed(iter/s)": 0.201203 }, { "acc": 0.76214151, "epoch": 1.691479899094475, "grad_norm": 7.3125, "learning_rate": 6.110249306831733e-07, "loss": 0.86750393, "memory(GiB)": 138.1, "step": 72500, "train_speed(iter/s)": 0.201218 }, { "epoch": 1.691479899094475, "eval_acc": 0.7446806091438193, "eval_loss": 0.8044036626815796, "eval_runtime": 1270.6076, "eval_samples_per_second": 28.326, "eval_steps_per_second": 14.163, "step": 72500 }, { "acc": 0.80648994, "epoch": 1.6917132066667637, "grad_norm": 6.1875, "learning_rate": 6.101202906479487e-07, "loss": 0.67632608, "memory(GiB)": 138.1, "step": 72510, "train_speed(iter/s)": 0.200512 }, { "acc": 0.77256756, "epoch": 1.6919465142390528, "grad_norm": 3.859375, "learning_rate": 6.092162772667781e-07, "loss": 0.80494852, "memory(GiB)": 138.1, "step": 72520, "train_speed(iter/s)": 0.200527 }, { "acc": 0.77806387, "epoch": 1.6921798218113415, "grad_norm": 6.625, "learning_rate": 6.083128906687074e-07, "loss": 0.78679771, "memory(GiB)": 138.1, "step": 72530, "train_speed(iter/s)": 0.200542 }, { "acc": 0.77233944, "epoch": 1.6924131293836306, "grad_norm": 5.09375, "learning_rate": 6.074101309826968e-07, "loss": 0.80660133, "memory(GiB)": 138.1, "step": 72540, "train_speed(iter/s)": 0.200557 }, { "acc": 0.79014969, "epoch": 1.6926464369559193, "grad_norm": 6.875, "learning_rate": 6.065079983376132e-07, "loss": 0.76289606, "memory(GiB)": 138.1, "step": 72550, "train_speed(iter/s)": 0.200571 }, { "acc": 0.78846631, "epoch": 1.6928797445282084, "grad_norm": 5.75, "learning_rate": 6.056064928622374e-07, "loss": 0.77690454, "memory(GiB)": 138.1, "step": 72560, "train_speed(iter/s)": 0.200586 }, { "acc": 0.79145346, "epoch": 1.6931130521004971, "grad_norm": 6.15625, "learning_rate": 6.047056146852575e-07, "loss": 0.74644814, "memory(GiB)": 138.1, "step": 72570, "train_speed(iter/s)": 0.2006 }, { "acc": 0.78505545, "epoch": 1.6933463596727862, "grad_norm": 4.125, "learning_rate": 6.038053639352754e-07, "loss": 0.76110735, "memory(GiB)": 138.1, "step": 72580, "train_speed(iter/s)": 0.200613 }, { "acc": 0.78503914, "epoch": 1.693579667245075, "grad_norm": 4.21875, "learning_rate": 6.029057407407995e-07, "loss": 0.7781271, "memory(GiB)": 138.1, "step": 72590, "train_speed(iter/s)": 0.200628 }, { "acc": 0.82557945, "epoch": 1.693812974817364, "grad_norm": 4.21875, "learning_rate": 6.020067452302514e-07, "loss": 0.6177803, "memory(GiB)": 138.1, "step": 72600, "train_speed(iter/s)": 0.200642 }, { "acc": 0.79121065, "epoch": 1.6940462823896527, "grad_norm": 5.78125, "learning_rate": 6.011083775319637e-07, "loss": 0.73865213, "memory(GiB)": 138.1, "step": 72610, "train_speed(iter/s)": 0.200657 }, { "acc": 0.77857733, "epoch": 1.6942795899619418, "grad_norm": 4.15625, "learning_rate": 6.002106377741762e-07, "loss": 0.78328385, "memory(GiB)": 138.1, "step": 72620, "train_speed(iter/s)": 0.200671 }, { "acc": 0.7810358, "epoch": 1.6945128975342305, "grad_norm": 6.53125, "learning_rate": 5.99313526085043e-07, "loss": 0.7782196, "memory(GiB)": 138.1, "step": 72630, "train_speed(iter/s)": 0.200686 }, { "acc": 0.79591274, "epoch": 1.6947462051065196, "grad_norm": 16.125, "learning_rate": 5.984170425926256e-07, "loss": 0.72331409, "memory(GiB)": 138.1, "step": 72640, "train_speed(iter/s)": 0.2007 }, { "acc": 0.77169046, "epoch": 1.6949795126788083, "grad_norm": 5.09375, "learning_rate": 5.975211874248954e-07, "loss": 0.83417196, "memory(GiB)": 138.1, "step": 72650, "train_speed(iter/s)": 0.200715 }, { "acc": 0.80202818, "epoch": 1.6952128202510974, "grad_norm": 6.21875, "learning_rate": 5.96625960709738e-07, "loss": 0.72186661, "memory(GiB)": 138.1, "step": 72660, "train_speed(iter/s)": 0.200729 }, { "acc": 0.78016157, "epoch": 1.695446127823386, "grad_norm": 5.53125, "learning_rate": 5.957313625749445e-07, "loss": 0.80601139, "memory(GiB)": 138.1, "step": 72670, "train_speed(iter/s)": 0.200743 }, { "acc": 0.78957376, "epoch": 1.695679435395675, "grad_norm": 4.75, "learning_rate": 5.948373931482204e-07, "loss": 0.75643826, "memory(GiB)": 138.1, "step": 72680, "train_speed(iter/s)": 0.200758 }, { "acc": 0.75675497, "epoch": 1.695912742967964, "grad_norm": 6.875, "learning_rate": 5.939440525571788e-07, "loss": 0.88091316, "memory(GiB)": 138.1, "step": 72690, "train_speed(iter/s)": 0.200771 }, { "acc": 0.77509961, "epoch": 1.6961460505402528, "grad_norm": 6.21875, "learning_rate": 5.930513409293437e-07, "loss": 0.79928541, "memory(GiB)": 138.1, "step": 72700, "train_speed(iter/s)": 0.200786 }, { "acc": 0.77624722, "epoch": 1.6963793581125417, "grad_norm": 6.1875, "learning_rate": 5.921592583921488e-07, "loss": 0.7726511, "memory(GiB)": 138.1, "step": 72710, "train_speed(iter/s)": 0.2008 }, { "acc": 0.77374887, "epoch": 1.6966126656848306, "grad_norm": 5.9375, "learning_rate": 5.912678050729398e-07, "loss": 0.7831955, "memory(GiB)": 138.1, "step": 72720, "train_speed(iter/s)": 0.200814 }, { "acc": 0.7693531, "epoch": 1.6968459732571195, "grad_norm": 4.34375, "learning_rate": 5.903769810989713e-07, "loss": 0.82154694, "memory(GiB)": 138.1, "step": 72730, "train_speed(iter/s)": 0.200828 }, { "acc": 0.77684445, "epoch": 1.6970792808294084, "grad_norm": 4.4375, "learning_rate": 5.894867865974064e-07, "loss": 0.80565586, "memory(GiB)": 138.1, "step": 72740, "train_speed(iter/s)": 0.200843 }, { "acc": 0.78173313, "epoch": 1.6973125884016973, "grad_norm": 4.25, "learning_rate": 5.885972216953223e-07, "loss": 0.78659649, "memory(GiB)": 138.1, "step": 72750, "train_speed(iter/s)": 0.200857 }, { "acc": 0.78354235, "epoch": 1.6975458959739862, "grad_norm": 5.09375, "learning_rate": 5.877082865197026e-07, "loss": 0.80655441, "memory(GiB)": 138.1, "step": 72760, "train_speed(iter/s)": 0.200872 }, { "acc": 0.79456587, "epoch": 1.697779203546275, "grad_norm": 6.65625, "learning_rate": 5.86819981197444e-07, "loss": 0.73504858, "memory(GiB)": 138.1, "step": 72770, "train_speed(iter/s)": 0.200886 }, { "acc": 0.79108696, "epoch": 1.698012511118564, "grad_norm": 8.5, "learning_rate": 5.859323058553512e-07, "loss": 0.75745306, "memory(GiB)": 138.1, "step": 72780, "train_speed(iter/s)": 0.200901 }, { "acc": 0.7710947, "epoch": 1.698245818690853, "grad_norm": 4.25, "learning_rate": 5.850452606201384e-07, "loss": 0.81637497, "memory(GiB)": 138.1, "step": 72790, "train_speed(iter/s)": 0.200915 }, { "acc": 0.8023735, "epoch": 1.6984791262631418, "grad_norm": 5.125, "learning_rate": 5.841588456184333e-07, "loss": 0.69645176, "memory(GiB)": 138.1, "step": 72800, "train_speed(iter/s)": 0.20093 }, { "acc": 0.78629084, "epoch": 1.6987124338354307, "grad_norm": 7.1875, "learning_rate": 5.8327306097677e-07, "loss": 0.75006866, "memory(GiB)": 138.1, "step": 72810, "train_speed(iter/s)": 0.200943 }, { "acc": 0.77999191, "epoch": 1.6989457414077196, "grad_norm": 10.5625, "learning_rate": 5.823879068215943e-07, "loss": 0.80349598, "memory(GiB)": 138.1, "step": 72820, "train_speed(iter/s)": 0.200958 }, { "acc": 0.77250009, "epoch": 1.6991790489800085, "grad_norm": 6.09375, "learning_rate": 5.815033832792605e-07, "loss": 0.79956465, "memory(GiB)": 138.1, "step": 72830, "train_speed(iter/s)": 0.200972 }, { "acc": 0.78692751, "epoch": 1.6994123565522974, "grad_norm": 4.90625, "learning_rate": 5.806194904760365e-07, "loss": 0.75956717, "memory(GiB)": 138.1, "step": 72840, "train_speed(iter/s)": 0.200985 }, { "acc": 0.7845644, "epoch": 1.6996456641245863, "grad_norm": 5.75, "learning_rate": 5.797362285380948e-07, "loss": 0.77232275, "memory(GiB)": 138.1, "step": 72850, "train_speed(iter/s)": 0.200999 }, { "acc": 0.78012753, "epoch": 1.6998789716968752, "grad_norm": 6.8125, "learning_rate": 5.788535975915239e-07, "loss": 0.77585268, "memory(GiB)": 138.1, "step": 72860, "train_speed(iter/s)": 0.201013 }, { "acc": 0.77515068, "epoch": 1.700112279269164, "grad_norm": 5.34375, "learning_rate": 5.779715977623168e-07, "loss": 0.81616535, "memory(GiB)": 138.1, "step": 72870, "train_speed(iter/s)": 0.201026 }, { "acc": 0.79815941, "epoch": 1.7003455868414528, "grad_norm": 5.625, "learning_rate": 5.770902291763791e-07, "loss": 0.71203232, "memory(GiB)": 138.1, "step": 72880, "train_speed(iter/s)": 0.20104 }, { "acc": 0.80398407, "epoch": 1.7005788944137419, "grad_norm": 5.8125, "learning_rate": 5.762094919595274e-07, "loss": 0.71322527, "memory(GiB)": 138.1, "step": 72890, "train_speed(iter/s)": 0.201052 }, { "acc": 0.79169493, "epoch": 1.7008122019860306, "grad_norm": 5.125, "learning_rate": 5.753293862374842e-07, "loss": 0.73782015, "memory(GiB)": 138.1, "step": 72900, "train_speed(iter/s)": 0.201067 }, { "acc": 0.78701067, "epoch": 1.7010455095583197, "grad_norm": 7.125, "learning_rate": 5.744499121358871e-07, "loss": 0.77875328, "memory(GiB)": 138.1, "step": 72910, "train_speed(iter/s)": 0.201081 }, { "acc": 0.79096022, "epoch": 1.7012788171306084, "grad_norm": 4.4375, "learning_rate": 5.735710697802793e-07, "loss": 0.73712015, "memory(GiB)": 138.1, "step": 72920, "train_speed(iter/s)": 0.201095 }, { "acc": 0.77540636, "epoch": 1.7015121247028975, "grad_norm": 4.84375, "learning_rate": 5.726928592961156e-07, "loss": 0.80175991, "memory(GiB)": 138.1, "step": 72930, "train_speed(iter/s)": 0.20111 }, { "acc": 0.77421494, "epoch": 1.7017454322751862, "grad_norm": 7.53125, "learning_rate": 5.718152808087601e-07, "loss": 0.81656933, "memory(GiB)": 138.1, "step": 72940, "train_speed(iter/s)": 0.201125 }, { "acc": 0.76136036, "epoch": 1.7019787398474753, "grad_norm": 4.5, "learning_rate": 5.709383344434854e-07, "loss": 0.8624939, "memory(GiB)": 138.1, "step": 72950, "train_speed(iter/s)": 0.201139 }, { "acc": 0.790942, "epoch": 1.702212047419764, "grad_norm": 7.21875, "learning_rate": 5.700620203254781e-07, "loss": 0.74097085, "memory(GiB)": 138.1, "step": 72960, "train_speed(iter/s)": 0.201154 }, { "acc": 0.77590914, "epoch": 1.702445354992053, "grad_norm": 5.875, "learning_rate": 5.691863385798296e-07, "loss": 0.81017885, "memory(GiB)": 138.1, "step": 72970, "train_speed(iter/s)": 0.201168 }, { "acc": 0.7668539, "epoch": 1.7026786625643417, "grad_norm": 5.21875, "learning_rate": 5.683112893315451e-07, "loss": 0.86975269, "memory(GiB)": 138.1, "step": 72980, "train_speed(iter/s)": 0.201183 }, { "acc": 0.78860574, "epoch": 1.7029119701366309, "grad_norm": 5.53125, "learning_rate": 5.674368727055351e-07, "loss": 0.74698706, "memory(GiB)": 138.1, "step": 72990, "train_speed(iter/s)": 0.201197 }, { "acc": 0.77102966, "epoch": 1.7031452777089195, "grad_norm": 5.3125, "learning_rate": 5.665630888266254e-07, "loss": 0.82879314, "memory(GiB)": 138.1, "step": 73000, "train_speed(iter/s)": 0.201211 }, { "epoch": 1.7031452777089195, "eval_acc": 0.7447366596251248, "eval_loss": 0.8043876886367798, "eval_runtime": 1271.875, "eval_samples_per_second": 28.298, "eval_steps_per_second": 14.149, "step": 73000 }, { "acc": 0.78933325, "epoch": 1.7033785852812087, "grad_norm": 5.0625, "learning_rate": 5.656899378195468e-07, "loss": 0.75085564, "memory(GiB)": 138.1, "step": 73010, "train_speed(iter/s)": 0.200509 }, { "acc": 0.79252825, "epoch": 1.7036118928534973, "grad_norm": 4.34375, "learning_rate": 5.648174198089407e-07, "loss": 0.74576683, "memory(GiB)": 138.1, "step": 73020, "train_speed(iter/s)": 0.200523 }, { "acc": 0.79472671, "epoch": 1.7038452004257865, "grad_norm": 5.8125, "learning_rate": 5.639455349193602e-07, "loss": 0.7537653, "memory(GiB)": 138.1, "step": 73030, "train_speed(iter/s)": 0.200537 }, { "acc": 0.78937383, "epoch": 1.7040785079980751, "grad_norm": 5.28125, "learning_rate": 5.630742832752655e-07, "loss": 0.74292388, "memory(GiB)": 138.1, "step": 73040, "train_speed(iter/s)": 0.200551 }, { "acc": 0.78601589, "epoch": 1.7043118155703643, "grad_norm": 3.828125, "learning_rate": 5.622036650010281e-07, "loss": 0.77539072, "memory(GiB)": 138.1, "step": 73050, "train_speed(iter/s)": 0.200565 }, { "acc": 0.80177393, "epoch": 1.704545123142653, "grad_norm": 4.65625, "learning_rate": 5.613336802209274e-07, "loss": 0.7048872, "memory(GiB)": 138.1, "step": 73060, "train_speed(iter/s)": 0.200579 }, { "acc": 0.76926432, "epoch": 1.7047784307149418, "grad_norm": 5.78125, "learning_rate": 5.604643290591555e-07, "loss": 0.82517433, "memory(GiB)": 138.1, "step": 73070, "train_speed(iter/s)": 0.200594 }, { "acc": 0.78017607, "epoch": 1.7050117382872307, "grad_norm": 5.5625, "learning_rate": 5.595956116398111e-07, "loss": 0.75298491, "memory(GiB)": 138.1, "step": 73080, "train_speed(iter/s)": 0.200608 }, { "acc": 0.77815399, "epoch": 1.7052450458595196, "grad_norm": 5.1875, "learning_rate": 5.58727528086902e-07, "loss": 0.78736992, "memory(GiB)": 138.1, "step": 73090, "train_speed(iter/s)": 0.200623 }, { "acc": 0.77575245, "epoch": 1.7054783534318085, "grad_norm": 5.375, "learning_rate": 5.578600785243493e-07, "loss": 0.81773357, "memory(GiB)": 138.1, "step": 73100, "train_speed(iter/s)": 0.200638 }, { "acc": 0.79463716, "epoch": 1.7057116610040974, "grad_norm": 4.3125, "learning_rate": 5.569932630759789e-07, "loss": 0.74758506, "memory(GiB)": 138.1, "step": 73110, "train_speed(iter/s)": 0.200652 }, { "acc": 0.78639936, "epoch": 1.7059449685763863, "grad_norm": 4.90625, "learning_rate": 5.561270818655301e-07, "loss": 0.76089525, "memory(GiB)": 138.1, "step": 73120, "train_speed(iter/s)": 0.200666 }, { "acc": 0.7882925, "epoch": 1.7061782761486752, "grad_norm": 10.375, "learning_rate": 5.552615350166496e-07, "loss": 0.76413431, "memory(GiB)": 138.1, "step": 73130, "train_speed(iter/s)": 0.200681 }, { "acc": 0.78352389, "epoch": 1.7064115837209641, "grad_norm": 5.375, "learning_rate": 5.54396622652893e-07, "loss": 0.77671185, "memory(GiB)": 138.1, "step": 73140, "train_speed(iter/s)": 0.200694 }, { "acc": 0.78264346, "epoch": 1.706644891293253, "grad_norm": 8.0625, "learning_rate": 5.535323448977275e-07, "loss": 0.76960349, "memory(GiB)": 138.1, "step": 73150, "train_speed(iter/s)": 0.200709 }, { "acc": 0.8065589, "epoch": 1.706878198865542, "grad_norm": 6.84375, "learning_rate": 5.526687018745286e-07, "loss": 0.69353619, "memory(GiB)": 138.1, "step": 73160, "train_speed(iter/s)": 0.200723 }, { "acc": 0.78679342, "epoch": 1.7071115064378308, "grad_norm": 5.0625, "learning_rate": 5.518056937065802e-07, "loss": 0.77192287, "memory(GiB)": 138.1, "step": 73170, "train_speed(iter/s)": 0.200737 }, { "acc": 0.7717082, "epoch": 1.7073448140101197, "grad_norm": 4.15625, "learning_rate": 5.509433205170761e-07, "loss": 0.82198477, "memory(GiB)": 138.1, "step": 73180, "train_speed(iter/s)": 0.200751 }, { "acc": 0.78816519, "epoch": 1.7075781215824086, "grad_norm": 6.03125, "learning_rate": 5.500815824291216e-07, "loss": 0.78270626, "memory(GiB)": 138.1, "step": 73190, "train_speed(iter/s)": 0.200764 }, { "acc": 0.76701832, "epoch": 1.7078114291546975, "grad_norm": 6.4375, "learning_rate": 5.492204795657274e-07, "loss": 0.82847338, "memory(GiB)": 138.1, "step": 73200, "train_speed(iter/s)": 0.200779 }, { "acc": 0.7891695, "epoch": 1.7080447367269864, "grad_norm": 4.59375, "learning_rate": 5.483600120498178e-07, "loss": 0.74464188, "memory(GiB)": 138.1, "step": 73210, "train_speed(iter/s)": 0.200792 }, { "acc": 0.76037254, "epoch": 1.7082780442992753, "grad_norm": 5.0, "learning_rate": 5.475001800042228e-07, "loss": 0.87401991, "memory(GiB)": 138.1, "step": 73220, "train_speed(iter/s)": 0.200806 }, { "acc": 0.78426857, "epoch": 1.7085113518715642, "grad_norm": 5.15625, "learning_rate": 5.466409835516834e-07, "loss": 0.7634037, "memory(GiB)": 138.1, "step": 73230, "train_speed(iter/s)": 0.20082 }, { "acc": 0.80567493, "epoch": 1.7087446594438531, "grad_norm": 4.78125, "learning_rate": 5.457824228148506e-07, "loss": 0.68272343, "memory(GiB)": 138.1, "step": 73240, "train_speed(iter/s)": 0.200834 }, { "acc": 0.77797771, "epoch": 1.708977967016142, "grad_norm": 4.78125, "learning_rate": 5.449244979162816e-07, "loss": 0.80399952, "memory(GiB)": 138.1, "step": 73250, "train_speed(iter/s)": 0.200848 }, { "acc": 0.78294101, "epoch": 1.709211274588431, "grad_norm": 6.6875, "learning_rate": 5.440672089784476e-07, "loss": 0.78328276, "memory(GiB)": 138.1, "step": 73260, "train_speed(iter/s)": 0.20086 }, { "acc": 0.79575491, "epoch": 1.7094445821607196, "grad_norm": 5.5625, "learning_rate": 5.43210556123725e-07, "loss": 0.73883858, "memory(GiB)": 138.1, "step": 73270, "train_speed(iter/s)": 0.200876 }, { "acc": 0.78026762, "epoch": 1.7096778897330087, "grad_norm": 5.625, "learning_rate": 5.423545394744012e-07, "loss": 0.79718757, "memory(GiB)": 138.1, "step": 73280, "train_speed(iter/s)": 0.20089 }, { "acc": 0.79239721, "epoch": 1.7099111973052974, "grad_norm": 5.5, "learning_rate": 5.414991591526714e-07, "loss": 0.74911137, "memory(GiB)": 138.1, "step": 73290, "train_speed(iter/s)": 0.200904 }, { "acc": 0.78526192, "epoch": 1.7101445048775865, "grad_norm": 9.625, "learning_rate": 5.406444152806406e-07, "loss": 0.75527325, "memory(GiB)": 138.1, "step": 73300, "train_speed(iter/s)": 0.200919 }, { "acc": 0.77781644, "epoch": 1.7103778124498752, "grad_norm": 5.28125, "learning_rate": 5.397903079803251e-07, "loss": 0.81301641, "memory(GiB)": 138.1, "step": 73310, "train_speed(iter/s)": 0.200933 }, { "acc": 0.78796282, "epoch": 1.7106111200221643, "grad_norm": 6.09375, "learning_rate": 5.389368373736464e-07, "loss": 0.75768142, "memory(GiB)": 138.1, "step": 73320, "train_speed(iter/s)": 0.200947 }, { "acc": 0.80338306, "epoch": 1.710844427594453, "grad_norm": 7.4375, "learning_rate": 5.380840035824397e-07, "loss": 0.6969276, "memory(GiB)": 138.1, "step": 73330, "train_speed(iter/s)": 0.200961 }, { "acc": 0.78724689, "epoch": 1.711077735166742, "grad_norm": 5.84375, "learning_rate": 5.372318067284438e-07, "loss": 0.76626501, "memory(GiB)": 138.1, "step": 73340, "train_speed(iter/s)": 0.200976 }, { "acc": 0.78899498, "epoch": 1.7113110427390308, "grad_norm": 5.71875, "learning_rate": 5.363802469333118e-07, "loss": 0.75074024, "memory(GiB)": 138.1, "step": 73350, "train_speed(iter/s)": 0.200989 }, { "acc": 0.77315474, "epoch": 1.71154435031132, "grad_norm": 6.8125, "learning_rate": 5.355293243186033e-07, "loss": 0.82559099, "memory(GiB)": 138.1, "step": 73360, "train_speed(iter/s)": 0.201005 }, { "acc": 0.77842388, "epoch": 1.7117776578836086, "grad_norm": 6.96875, "learning_rate": 5.34679039005786e-07, "loss": 0.8188942, "memory(GiB)": 138.1, "step": 73370, "train_speed(iter/s)": 0.201019 }, { "acc": 0.76217799, "epoch": 1.7120109654558977, "grad_norm": 5.8125, "learning_rate": 5.338293911162401e-07, "loss": 0.85048714, "memory(GiB)": 138.1, "step": 73380, "train_speed(iter/s)": 0.201033 }, { "acc": 0.76450157, "epoch": 1.7122442730281864, "grad_norm": 7.03125, "learning_rate": 5.329803807712497e-07, "loss": 0.85439548, "memory(GiB)": 138.1, "step": 73390, "train_speed(iter/s)": 0.201048 }, { "acc": 0.78162451, "epoch": 1.7124775806004755, "grad_norm": 7.53125, "learning_rate": 5.321320080920128e-07, "loss": 0.78223362, "memory(GiB)": 138.1, "step": 73400, "train_speed(iter/s)": 0.201062 }, { "acc": 0.77304645, "epoch": 1.7127108881727642, "grad_norm": 5.15625, "learning_rate": 5.312842731996332e-07, "loss": 0.82387562, "memory(GiB)": 138.1, "step": 73410, "train_speed(iter/s)": 0.201076 }, { "acc": 0.77667975, "epoch": 1.7129441957450533, "grad_norm": 4.03125, "learning_rate": 5.304371762151261e-07, "loss": 0.80046349, "memory(GiB)": 138.1, "step": 73420, "train_speed(iter/s)": 0.20109 }, { "acc": 0.79003797, "epoch": 1.713177503317342, "grad_norm": 10.625, "learning_rate": 5.295907172594139e-07, "loss": 0.75672364, "memory(GiB)": 138.1, "step": 73430, "train_speed(iter/s)": 0.201105 }, { "acc": 0.79200077, "epoch": 1.713410810889631, "grad_norm": 5.71875, "learning_rate": 5.287448964533276e-07, "loss": 0.7386096, "memory(GiB)": 138.1, "step": 73440, "train_speed(iter/s)": 0.201119 }, { "acc": 0.77990046, "epoch": 1.7136441184619198, "grad_norm": 6.75, "learning_rate": 5.278997139176084e-07, "loss": 0.79957833, "memory(GiB)": 138.1, "step": 73450, "train_speed(iter/s)": 0.201132 }, { "acc": 0.74390507, "epoch": 1.7138774260342087, "grad_norm": 4.1875, "learning_rate": 5.270551697729059e-07, "loss": 0.94712648, "memory(GiB)": 138.1, "step": 73460, "train_speed(iter/s)": 0.201147 }, { "acc": 0.77517686, "epoch": 1.7141107336064976, "grad_norm": 4.875, "learning_rate": 5.262112641397788e-07, "loss": 0.80620651, "memory(GiB)": 138.1, "step": 73470, "train_speed(iter/s)": 0.201161 }, { "acc": 0.76355195, "epoch": 1.7143440411787865, "grad_norm": 7.25, "learning_rate": 5.253679971386932e-07, "loss": 0.84303713, "memory(GiB)": 138.1, "step": 73480, "train_speed(iter/s)": 0.201173 }, { "acc": 0.78939095, "epoch": 1.7145773487510754, "grad_norm": 5.09375, "learning_rate": 5.245253688900287e-07, "loss": 0.75973749, "memory(GiB)": 138.1, "step": 73490, "train_speed(iter/s)": 0.201187 }, { "acc": 0.78993163, "epoch": 1.7148106563233643, "grad_norm": 5.375, "learning_rate": 5.236833795140651e-07, "loss": 0.73632402, "memory(GiB)": 138.1, "step": 73500, "train_speed(iter/s)": 0.201202 }, { "epoch": 1.7148106563233643, "eval_acc": 0.7447105561152596, "eval_loss": 0.8043909072875977, "eval_runtime": 1271.1227, "eval_samples_per_second": 28.314, "eval_steps_per_second": 14.158, "step": 73500 }, { "acc": 0.79015398, "epoch": 1.7150439638956532, "grad_norm": 5.1875, "learning_rate": 5.228420291309999e-07, "loss": 0.74385405, "memory(GiB)": 138.1, "step": 73510, "train_speed(iter/s)": 0.200504 }, { "acc": 0.7631135, "epoch": 1.715277271467942, "grad_norm": 5.5625, "learning_rate": 5.22001317860934e-07, "loss": 0.84219952, "memory(GiB)": 138.1, "step": 73520, "train_speed(iter/s)": 0.200519 }, { "acc": 0.76704111, "epoch": 1.715510579040231, "grad_norm": 3.984375, "learning_rate": 5.211612458238785e-07, "loss": 0.87439299, "memory(GiB)": 138.1, "step": 73530, "train_speed(iter/s)": 0.200533 }, { "acc": 0.79748411, "epoch": 1.7157438866125199, "grad_norm": 6.0625, "learning_rate": 5.203218131397553e-07, "loss": 0.73548927, "memory(GiB)": 138.1, "step": 73540, "train_speed(iter/s)": 0.200547 }, { "acc": 0.77375269, "epoch": 1.7159771941848088, "grad_norm": 4.875, "learning_rate": 5.194830199283907e-07, "loss": 0.80748701, "memory(GiB)": 138.1, "step": 73550, "train_speed(iter/s)": 0.200561 }, { "acc": 0.77218609, "epoch": 1.7162105017570977, "grad_norm": 6.5, "learning_rate": 5.18644866309524e-07, "loss": 0.81236382, "memory(GiB)": 138.1, "step": 73560, "train_speed(iter/s)": 0.200575 }, { "acc": 0.79794626, "epoch": 1.7164438093293866, "grad_norm": 3.90625, "learning_rate": 5.178073524028016e-07, "loss": 0.71432037, "memory(GiB)": 138.1, "step": 73570, "train_speed(iter/s)": 0.200589 }, { "acc": 0.75880527, "epoch": 1.7166771169016755, "grad_norm": 6.28125, "learning_rate": 5.169704783277756e-07, "loss": 0.84431038, "memory(GiB)": 138.1, "step": 73580, "train_speed(iter/s)": 0.200603 }, { "acc": 0.78814268, "epoch": 1.7169104244739644, "grad_norm": 5.375, "learning_rate": 5.16134244203913e-07, "loss": 0.75521536, "memory(GiB)": 138.1, "step": 73590, "train_speed(iter/s)": 0.200618 }, { "acc": 0.78718247, "epoch": 1.7171437320462533, "grad_norm": 6.625, "learning_rate": 5.152986501505835e-07, "loss": 0.76596746, "memory(GiB)": 138.1, "step": 73600, "train_speed(iter/s)": 0.200633 }, { "acc": 0.77716827, "epoch": 1.7173770396185422, "grad_norm": 3.65625, "learning_rate": 5.144636962870708e-07, "loss": 0.80403767, "memory(GiB)": 138.1, "step": 73610, "train_speed(iter/s)": 0.200646 }, { "acc": 0.75140824, "epoch": 1.717610347190831, "grad_norm": 5.5625, "learning_rate": 5.136293827325606e-07, "loss": 0.89990616, "memory(GiB)": 138.1, "step": 73620, "train_speed(iter/s)": 0.20066 }, { "acc": 0.77322063, "epoch": 1.71784365476312, "grad_norm": 6.125, "learning_rate": 5.127957096061537e-07, "loss": 0.83046427, "memory(GiB)": 138.1, "step": 73630, "train_speed(iter/s)": 0.200674 }, { "acc": 0.76129708, "epoch": 1.7180769623354089, "grad_norm": 5.90625, "learning_rate": 5.119626770268543e-07, "loss": 0.86177006, "memory(GiB)": 138.1, "step": 73640, "train_speed(iter/s)": 0.200688 }, { "acc": 0.78243065, "epoch": 1.7183102699076978, "grad_norm": 3.5625, "learning_rate": 5.111302851135802e-07, "loss": 0.79324946, "memory(GiB)": 138.1, "step": 73650, "train_speed(iter/s)": 0.200702 }, { "acc": 0.79360733, "epoch": 1.7185435774799864, "grad_norm": 4.84375, "learning_rate": 5.10298533985154e-07, "loss": 0.74446688, "memory(GiB)": 138.1, "step": 73660, "train_speed(iter/s)": 0.200716 }, { "acc": 0.78374872, "epoch": 1.7187768850522755, "grad_norm": 5.25, "learning_rate": 5.09467423760307e-07, "loss": 0.74643164, "memory(GiB)": 138.1, "step": 73670, "train_speed(iter/s)": 0.200731 }, { "acc": 0.78896942, "epoch": 1.7190101926245642, "grad_norm": 4.84375, "learning_rate": 5.086369545576814e-07, "loss": 0.75824614, "memory(GiB)": 138.1, "step": 73680, "train_speed(iter/s)": 0.200746 }, { "acc": 0.78774047, "epoch": 1.7192435001968533, "grad_norm": 5.375, "learning_rate": 5.078071264958245e-07, "loss": 0.74300013, "memory(GiB)": 138.1, "step": 73690, "train_speed(iter/s)": 0.20076 }, { "acc": 0.79266844, "epoch": 1.719476807769142, "grad_norm": 4.125, "learning_rate": 5.069779396931967e-07, "loss": 0.74105115, "memory(GiB)": 138.1, "step": 73700, "train_speed(iter/s)": 0.200774 }, { "acc": 0.76223621, "epoch": 1.7197101153414311, "grad_norm": 13.625, "learning_rate": 5.061493942681628e-07, "loss": 0.88398075, "memory(GiB)": 138.1, "step": 73710, "train_speed(iter/s)": 0.200788 }, { "acc": 0.77189965, "epoch": 1.7199434229137198, "grad_norm": 5.71875, "learning_rate": 5.053214903389975e-07, "loss": 0.81558266, "memory(GiB)": 138.1, "step": 73720, "train_speed(iter/s)": 0.200802 }, { "acc": 0.78657942, "epoch": 1.720176730486009, "grad_norm": 5.71875, "learning_rate": 5.044942280238835e-07, "loss": 0.7773448, "memory(GiB)": 138.1, "step": 73730, "train_speed(iter/s)": 0.200815 }, { "acc": 0.76862726, "epoch": 1.7204100380582976, "grad_norm": 4.40625, "learning_rate": 5.036676074409114e-07, "loss": 0.85371304, "memory(GiB)": 138.1, "step": 73740, "train_speed(iter/s)": 0.200829 }, { "acc": 0.77896681, "epoch": 1.7206433456305867, "grad_norm": 7.53125, "learning_rate": 5.028416287080834e-07, "loss": 0.7777874, "memory(GiB)": 138.1, "step": 73750, "train_speed(iter/s)": 0.200844 }, { "acc": 0.77873478, "epoch": 1.7208766532028754, "grad_norm": 5.28125, "learning_rate": 5.020162919433053e-07, "loss": 0.79268093, "memory(GiB)": 138.1, "step": 73760, "train_speed(iter/s)": 0.200858 }, { "acc": 0.78746619, "epoch": 1.7211099607751645, "grad_norm": 7.21875, "learning_rate": 5.01191597264396e-07, "loss": 0.75564404, "memory(GiB)": 138.1, "step": 73770, "train_speed(iter/s)": 0.200871 }, { "acc": 0.78620825, "epoch": 1.7213432683474532, "grad_norm": 6.28125, "learning_rate": 5.003675447890782e-07, "loss": 0.75644531, "memory(GiB)": 138.1, "step": 73780, "train_speed(iter/s)": 0.200885 }, { "acc": 0.76999698, "epoch": 1.7215765759197423, "grad_norm": 4.8125, "learning_rate": 4.995441346349872e-07, "loss": 0.83167076, "memory(GiB)": 138.1, "step": 73790, "train_speed(iter/s)": 0.200899 }, { "acc": 0.80347652, "epoch": 1.721809883492031, "grad_norm": 4.28125, "learning_rate": 4.987213669196639e-07, "loss": 0.68127127, "memory(GiB)": 138.1, "step": 73800, "train_speed(iter/s)": 0.200913 }, { "acc": 0.79354105, "epoch": 1.7220431910643201, "grad_norm": 4.5625, "learning_rate": 4.978992417605566e-07, "loss": 0.73909149, "memory(GiB)": 138.1, "step": 73810, "train_speed(iter/s)": 0.200927 }, { "acc": 0.8010478, "epoch": 1.7222764986366088, "grad_norm": 5.46875, "learning_rate": 4.970777592750253e-07, "loss": 0.69659333, "memory(GiB)": 138.1, "step": 73820, "train_speed(iter/s)": 0.20094 }, { "acc": 0.76971216, "epoch": 1.722509806208898, "grad_norm": 5.09375, "learning_rate": 4.962569195803352e-07, "loss": 0.8232933, "memory(GiB)": 138.1, "step": 73830, "train_speed(iter/s)": 0.200953 }, { "acc": 0.77865553, "epoch": 1.7227431137811866, "grad_norm": 4.15625, "learning_rate": 4.954367227936635e-07, "loss": 0.7898417, "memory(GiB)": 138.1, "step": 73840, "train_speed(iter/s)": 0.200967 }, { "acc": 0.77204628, "epoch": 1.7229764213534755, "grad_norm": 6.71875, "learning_rate": 4.946171690320889e-07, "loss": 0.82885933, "memory(GiB)": 138.1, "step": 73850, "train_speed(iter/s)": 0.200981 }, { "acc": 0.79157839, "epoch": 1.7232097289257644, "grad_norm": 5.5, "learning_rate": 4.937982584126055e-07, "loss": 0.73304939, "memory(GiB)": 138.1, "step": 73860, "train_speed(iter/s)": 0.200995 }, { "acc": 0.7956171, "epoch": 1.7234430364980533, "grad_norm": 5.5625, "learning_rate": 4.929799910521116e-07, "loss": 0.73145752, "memory(GiB)": 138.1, "step": 73870, "train_speed(iter/s)": 0.201009 }, { "acc": 0.76525025, "epoch": 1.7236763440703422, "grad_norm": 4.96875, "learning_rate": 4.921623670674142e-07, "loss": 0.84171267, "memory(GiB)": 138.1, "step": 73880, "train_speed(iter/s)": 0.201024 }, { "acc": 0.80335445, "epoch": 1.723909651642631, "grad_norm": 4.9375, "learning_rate": 4.913453865752299e-07, "loss": 0.69952669, "memory(GiB)": 138.1, "step": 73890, "train_speed(iter/s)": 0.201038 }, { "acc": 0.76818728, "epoch": 1.72414295921492, "grad_norm": 11.875, "learning_rate": 4.905290496921811e-07, "loss": 0.84597569, "memory(GiB)": 138.1, "step": 73900, "train_speed(iter/s)": 0.201053 }, { "acc": 0.79364624, "epoch": 1.724376266787209, "grad_norm": 5.875, "learning_rate": 4.897133565348012e-07, "loss": 0.73016977, "memory(GiB)": 138.1, "step": 73910, "train_speed(iter/s)": 0.201067 }, { "acc": 0.79641161, "epoch": 1.7246095743594978, "grad_norm": 3.640625, "learning_rate": 4.888983072195296e-07, "loss": 0.72878189, "memory(GiB)": 138.1, "step": 73920, "train_speed(iter/s)": 0.201082 }, { "acc": 0.7892786, "epoch": 1.7248428819317867, "grad_norm": 5.34375, "learning_rate": 4.880839018627132e-07, "loss": 0.74224253, "memory(GiB)": 138.1, "step": 73930, "train_speed(iter/s)": 0.201096 }, { "acc": 0.78093824, "epoch": 1.7250761895040756, "grad_norm": 5.65625, "learning_rate": 4.872701405806101e-07, "loss": 0.77963114, "memory(GiB)": 138.1, "step": 73940, "train_speed(iter/s)": 0.20111 }, { "acc": 0.77672095, "epoch": 1.7253094970763645, "grad_norm": 7.0625, "learning_rate": 4.864570234893834e-07, "loss": 0.78376875, "memory(GiB)": 138.1, "step": 73950, "train_speed(iter/s)": 0.201123 }, { "acc": 0.78407612, "epoch": 1.7255428046486534, "grad_norm": 6.125, "learning_rate": 4.856445507051049e-07, "loss": 0.77355375, "memory(GiB)": 138.1, "step": 73960, "train_speed(iter/s)": 0.201137 }, { "acc": 0.7757288, "epoch": 1.7257761122209423, "grad_norm": 4.5625, "learning_rate": 4.848327223437554e-07, "loss": 0.80582609, "memory(GiB)": 138.1, "step": 73970, "train_speed(iter/s)": 0.201151 }, { "acc": 0.76491833, "epoch": 1.7260094197932312, "grad_norm": 6.34375, "learning_rate": 4.840215385212232e-07, "loss": 0.86212664, "memory(GiB)": 138.1, "step": 73980, "train_speed(iter/s)": 0.201165 }, { "acc": 0.77588701, "epoch": 1.72624272736552, "grad_norm": 7.25, "learning_rate": 4.832109993533041e-07, "loss": 0.80320539, "memory(GiB)": 138.1, "step": 73990, "train_speed(iter/s)": 0.201178 }, { "acc": 0.7706954, "epoch": 1.726476034937809, "grad_norm": 5.3125, "learning_rate": 4.824011049557037e-07, "loss": 0.83672714, "memory(GiB)": 138.1, "step": 74000, "train_speed(iter/s)": 0.201192 }, { "epoch": 1.726476034937809, "eval_acc": 0.7447166415960871, "eval_loss": 0.8044154047966003, "eval_runtime": 1272.3059, "eval_samples_per_second": 28.288, "eval_steps_per_second": 14.144, "step": 74000 }, { "acc": 0.77697601, "epoch": 1.7267093425100979, "grad_norm": 7.1875, "learning_rate": 4.815918554440324e-07, "loss": 0.802526, "memory(GiB)": 138.1, "step": 74010, "train_speed(iter/s)": 0.200499 }, { "acc": 0.78431935, "epoch": 1.7269426500823868, "grad_norm": 4.71875, "learning_rate": 4.807832509338112e-07, "loss": 0.771733, "memory(GiB)": 138.1, "step": 74020, "train_speed(iter/s)": 0.200512 }, { "acc": 0.78355474, "epoch": 1.7271759576546755, "grad_norm": 4.40625, "learning_rate": 4.799752915404682e-07, "loss": 0.77948675, "memory(GiB)": 138.1, "step": 74030, "train_speed(iter/s)": 0.200525 }, { "acc": 0.77410603, "epoch": 1.7274092652269646, "grad_norm": 4.9375, "learning_rate": 4.791679773793389e-07, "loss": 0.82328606, "memory(GiB)": 138.1, "step": 74040, "train_speed(iter/s)": 0.200539 }, { "acc": 0.77805958, "epoch": 1.7276425727992533, "grad_norm": 5.96875, "learning_rate": 4.78361308565668e-07, "loss": 0.80983696, "memory(GiB)": 138.1, "step": 74050, "train_speed(iter/s)": 0.200553 }, { "acc": 0.76219211, "epoch": 1.7278758803715424, "grad_norm": 8.875, "learning_rate": 4.77555285214607e-07, "loss": 0.85069485, "memory(GiB)": 138.1, "step": 74060, "train_speed(iter/s)": 0.200567 }, { "acc": 0.77103195, "epoch": 1.728109187943831, "grad_norm": 3.90625, "learning_rate": 4.767499074412152e-07, "loss": 0.82191906, "memory(GiB)": 138.1, "step": 74070, "train_speed(iter/s)": 0.200579 }, { "acc": 0.80303154, "epoch": 1.7283424955161202, "grad_norm": 7.125, "learning_rate": 4.7594517536045936e-07, "loss": 0.72213902, "memory(GiB)": 138.1, "step": 74080, "train_speed(iter/s)": 0.200593 }, { "acc": 0.76572065, "epoch": 1.7285758030884089, "grad_norm": 6.1875, "learning_rate": 4.751410890872166e-07, "loss": 0.85232534, "memory(GiB)": 138.1, "step": 74090, "train_speed(iter/s)": 0.200607 }, { "acc": 0.77745919, "epoch": 1.728809110660698, "grad_norm": 5.53125, "learning_rate": 4.743376487362683e-07, "loss": 0.81853027, "memory(GiB)": 138.1, "step": 74100, "train_speed(iter/s)": 0.200621 }, { "acc": 0.79782553, "epoch": 1.7290424182329867, "grad_norm": 5.03125, "learning_rate": 4.7353485442230507e-07, "loss": 0.70489302, "memory(GiB)": 138.1, "step": 74110, "train_speed(iter/s)": 0.200635 }, { "acc": 0.77856512, "epoch": 1.7292757258052758, "grad_norm": 11.5, "learning_rate": 4.7273270625992794e-07, "loss": 0.80788822, "memory(GiB)": 138.1, "step": 74120, "train_speed(iter/s)": 0.200649 }, { "acc": 0.76892738, "epoch": 1.7295090333775645, "grad_norm": 6.59375, "learning_rate": 4.719312043636404e-07, "loss": 0.83906326, "memory(GiB)": 138.1, "step": 74130, "train_speed(iter/s)": 0.200663 }, { "acc": 0.77776084, "epoch": 1.7297423409498536, "grad_norm": 6.0, "learning_rate": 4.711303488478591e-07, "loss": 0.78263907, "memory(GiB)": 138.1, "step": 74140, "train_speed(iter/s)": 0.200676 }, { "acc": 0.7878387, "epoch": 1.7299756485221423, "grad_norm": 5.28125, "learning_rate": 4.7033013982690446e-07, "loss": 0.76044273, "memory(GiB)": 138.1, "step": 74150, "train_speed(iter/s)": 0.20069 }, { "acc": 0.77629614, "epoch": 1.7302089560944314, "grad_norm": 7.53125, "learning_rate": 4.695305774150061e-07, "loss": 0.79856586, "memory(GiB)": 138.1, "step": 74160, "train_speed(iter/s)": 0.200705 }, { "acc": 0.7986949, "epoch": 1.73044226366672, "grad_norm": 6.40625, "learning_rate": 4.687316617263027e-07, "loss": 0.73556232, "memory(GiB)": 138.1, "step": 74170, "train_speed(iter/s)": 0.20072 }, { "acc": 0.79174337, "epoch": 1.7306755712390092, "grad_norm": 6.375, "learning_rate": 4.679333928748375e-07, "loss": 0.73341341, "memory(GiB)": 138.1, "step": 74180, "train_speed(iter/s)": 0.200733 }, { "acc": 0.77004156, "epoch": 1.7309088788112978, "grad_norm": 7.09375, "learning_rate": 4.671357709745644e-07, "loss": 0.83492775, "memory(GiB)": 138.1, "step": 74190, "train_speed(iter/s)": 0.200748 }, { "acc": 0.79907241, "epoch": 1.731142186383587, "grad_norm": 5.375, "learning_rate": 4.6633879613934227e-07, "loss": 0.6985404, "memory(GiB)": 138.1, "step": 74200, "train_speed(iter/s)": 0.200762 }, { "acc": 0.75945568, "epoch": 1.7313754939558756, "grad_norm": 4.78125, "learning_rate": 4.6554246848294127e-07, "loss": 0.86438541, "memory(GiB)": 138.1, "step": 74210, "train_speed(iter/s)": 0.200777 }, { "acc": 0.78431191, "epoch": 1.7316088015281648, "grad_norm": 5.1875, "learning_rate": 4.6474678811903483e-07, "loss": 0.77434368, "memory(GiB)": 138.1, "step": 74220, "train_speed(iter/s)": 0.200791 }, { "acc": 0.7858922, "epoch": 1.7318421091004534, "grad_norm": 6.09375, "learning_rate": 4.639517551612066e-07, "loss": 0.74613075, "memory(GiB)": 138.1, "step": 74230, "train_speed(iter/s)": 0.200803 }, { "acc": 0.78975163, "epoch": 1.7320754166727423, "grad_norm": 5.65625, "learning_rate": 4.63157369722948e-07, "loss": 0.74640374, "memory(GiB)": 138.1, "step": 74240, "train_speed(iter/s)": 0.200817 }, { "acc": 0.78697009, "epoch": 1.7323087242450312, "grad_norm": 4.96875, "learning_rate": 4.6236363191765666e-07, "loss": 0.76076813, "memory(GiB)": 138.1, "step": 74250, "train_speed(iter/s)": 0.200831 }, { "acc": 0.774368, "epoch": 1.7325420318173201, "grad_norm": 6.96875, "learning_rate": 4.615705418586391e-07, "loss": 0.84603786, "memory(GiB)": 138.1, "step": 74260, "train_speed(iter/s)": 0.200845 }, { "acc": 0.78506193, "epoch": 1.732775339389609, "grad_norm": 6.28125, "learning_rate": 4.6077809965910716e-07, "loss": 0.76752243, "memory(GiB)": 138.1, "step": 74270, "train_speed(iter/s)": 0.200859 }, { "acc": 0.77089567, "epoch": 1.733008646961898, "grad_norm": 4.875, "learning_rate": 4.59986305432184e-07, "loss": 0.81474075, "memory(GiB)": 138.1, "step": 74280, "train_speed(iter/s)": 0.200873 }, { "acc": 0.7874424, "epoch": 1.7332419545341868, "grad_norm": 6.3125, "learning_rate": 4.591951592908972e-07, "loss": 0.76511889, "memory(GiB)": 138.1, "step": 74290, "train_speed(iter/s)": 0.200887 }, { "acc": 0.77571363, "epoch": 1.7334752621064757, "grad_norm": 4.1875, "learning_rate": 4.5840466134818184e-07, "loss": 0.8134244, "memory(GiB)": 138.1, "step": 74300, "train_speed(iter/s)": 0.200901 }, { "acc": 0.77631946, "epoch": 1.7337085696787646, "grad_norm": 5.90625, "learning_rate": 4.576148117168816e-07, "loss": 0.78171606, "memory(GiB)": 138.1, "step": 74310, "train_speed(iter/s)": 0.200915 }, { "acc": 0.79241109, "epoch": 1.7339418772510535, "grad_norm": 8.0, "learning_rate": 4.568256105097468e-07, "loss": 0.72888813, "memory(GiB)": 138.1, "step": 74320, "train_speed(iter/s)": 0.200929 }, { "acc": 0.78169894, "epoch": 1.7341751848233424, "grad_norm": 4.9375, "learning_rate": 4.560370578394374e-07, "loss": 0.76154728, "memory(GiB)": 138.1, "step": 74330, "train_speed(iter/s)": 0.200943 }, { "acc": 0.79366865, "epoch": 1.7344084923956313, "grad_norm": 5.875, "learning_rate": 4.5524915381851663e-07, "loss": 0.74610434, "memory(GiB)": 138.1, "step": 74340, "train_speed(iter/s)": 0.200956 }, { "acc": 0.78711934, "epoch": 1.7346417999679202, "grad_norm": 7.71875, "learning_rate": 4.544618985594601e-07, "loss": 0.75808601, "memory(GiB)": 138.1, "step": 74350, "train_speed(iter/s)": 0.200971 }, { "acc": 0.7786828, "epoch": 1.7348751075402091, "grad_norm": 5.40625, "learning_rate": 4.5367529217464733e-07, "loss": 0.78535357, "memory(GiB)": 138.1, "step": 74360, "train_speed(iter/s)": 0.200986 }, { "acc": 0.78281074, "epoch": 1.735108415112498, "grad_norm": 5.78125, "learning_rate": 4.5288933477636466e-07, "loss": 0.77032428, "memory(GiB)": 138.1, "step": 74370, "train_speed(iter/s)": 0.200999 }, { "acc": 0.77319117, "epoch": 1.735341722684787, "grad_norm": 5.96875, "learning_rate": 4.5210402647680895e-07, "loss": 0.81330729, "memory(GiB)": 138.1, "step": 74380, "train_speed(iter/s)": 0.201013 }, { "acc": 0.78810053, "epoch": 1.7355750302570758, "grad_norm": 4.8125, "learning_rate": 4.5131936738808225e-07, "loss": 0.76743989, "memory(GiB)": 138.1, "step": 74390, "train_speed(iter/s)": 0.201028 }, { "acc": 0.79243145, "epoch": 1.7358083378293647, "grad_norm": 6.375, "learning_rate": 4.5053535762219494e-07, "loss": 0.74746494, "memory(GiB)": 138.1, "step": 74400, "train_speed(iter/s)": 0.201042 }, { "acc": 0.7838623, "epoch": 1.7360416454016536, "grad_norm": 5.4375, "learning_rate": 4.4975199729106355e-07, "loss": 0.78794641, "memory(GiB)": 138.1, "step": 74410, "train_speed(iter/s)": 0.201056 }, { "acc": 0.8007185, "epoch": 1.7362749529739423, "grad_norm": 6.4375, "learning_rate": 4.489692865065126e-07, "loss": 0.72447138, "memory(GiB)": 138.1, "step": 74420, "train_speed(iter/s)": 0.201071 }, { "acc": 0.78932304, "epoch": 1.7365082605462314, "grad_norm": 4.125, "learning_rate": 4.4818722538027326e-07, "loss": 0.75672827, "memory(GiB)": 138.1, "step": 74430, "train_speed(iter/s)": 0.201086 }, { "acc": 0.78215332, "epoch": 1.73674156811852, "grad_norm": 4.75, "learning_rate": 4.474058140239862e-07, "loss": 0.77884374, "memory(GiB)": 138.1, "step": 74440, "train_speed(iter/s)": 0.2011 }, { "acc": 0.78150392, "epoch": 1.7369748756908092, "grad_norm": 6.09375, "learning_rate": 4.4662505254919665e-07, "loss": 0.75879669, "memory(GiB)": 138.1, "step": 74450, "train_speed(iter/s)": 0.201114 }, { "acc": 0.79613762, "epoch": 1.737208183263098, "grad_norm": 5.8125, "learning_rate": 4.4584494106735707e-07, "loss": 0.73280921, "memory(GiB)": 138.1, "step": 74460, "train_speed(iter/s)": 0.201128 }, { "acc": 0.78308897, "epoch": 1.737441490835387, "grad_norm": 5.84375, "learning_rate": 4.4506547968983016e-07, "loss": 0.78369331, "memory(GiB)": 138.1, "step": 74470, "train_speed(iter/s)": 0.201143 }, { "acc": 0.77581263, "epoch": 1.7376747984076757, "grad_norm": 8.125, "learning_rate": 4.4428666852788126e-07, "loss": 0.80782318, "memory(GiB)": 138.1, "step": 74480, "train_speed(iter/s)": 0.201157 }, { "acc": 0.76641207, "epoch": 1.7379081059799648, "grad_norm": 5.375, "learning_rate": 4.4350850769268874e-07, "loss": 0.84937601, "memory(GiB)": 138.1, "step": 74490, "train_speed(iter/s)": 0.20117 }, { "acc": 0.78670392, "epoch": 1.7381414135522535, "grad_norm": 4.65625, "learning_rate": 4.4273099729533255e-07, "loss": 0.76552391, "memory(GiB)": 138.1, "step": 74500, "train_speed(iter/s)": 0.201183 }, { "epoch": 1.7381414135522535, "eval_acc": 0.7446762852495471, "eval_loss": 0.804404079914093, "eval_runtime": 1271.2914, "eval_samples_per_second": 28.311, "eval_steps_per_second": 14.156, "step": 74500 }, { "acc": 0.7867198, "epoch": 1.7383747211245426, "grad_norm": 4.5, "learning_rate": 4.419541374468023e-07, "loss": 0.76824493, "memory(GiB)": 138.1, "step": 74510, "train_speed(iter/s)": 0.200496 }, { "acc": 0.77137146, "epoch": 1.7386080286968313, "grad_norm": 5.75, "learning_rate": 4.411779282579959e-07, "loss": 0.82607307, "memory(GiB)": 138.1, "step": 74520, "train_speed(iter/s)": 0.200509 }, { "acc": 0.78419704, "epoch": 1.7388413362691204, "grad_norm": 4.25, "learning_rate": 4.4040236983971476e-07, "loss": 0.76362429, "memory(GiB)": 138.1, "step": 74530, "train_speed(iter/s)": 0.200523 }, { "acc": 0.78158565, "epoch": 1.739074643841409, "grad_norm": 4.8125, "learning_rate": 4.3962746230267084e-07, "loss": 0.77455659, "memory(GiB)": 138.1, "step": 74540, "train_speed(iter/s)": 0.200537 }, { "acc": 0.7929049, "epoch": 1.7393079514136982, "grad_norm": 6.9375, "learning_rate": 4.388532057574818e-07, "loss": 0.73814707, "memory(GiB)": 138.1, "step": 74550, "train_speed(iter/s)": 0.200551 }, { "acc": 0.79585075, "epoch": 1.7395412589859869, "grad_norm": 4.21875, "learning_rate": 4.380796003146731e-07, "loss": 0.71769018, "memory(GiB)": 138.1, "step": 74560, "train_speed(iter/s)": 0.200565 }, { "acc": 0.76547136, "epoch": 1.739774566558276, "grad_norm": 4.71875, "learning_rate": 4.3730664608467534e-07, "loss": 0.81900177, "memory(GiB)": 138.1, "step": 74570, "train_speed(iter/s)": 0.200579 }, { "acc": 0.77250733, "epoch": 1.7400078741305647, "grad_norm": 10.375, "learning_rate": 4.3653434317782905e-07, "loss": 0.79657941, "memory(GiB)": 138.1, "step": 74580, "train_speed(iter/s)": 0.200593 }, { "acc": 0.77615132, "epoch": 1.7402411817028538, "grad_norm": 5.3125, "learning_rate": 4.3576269170438e-07, "loss": 0.81013956, "memory(GiB)": 138.1, "step": 74590, "train_speed(iter/s)": 0.200607 }, { "acc": 0.80237694, "epoch": 1.7404744892751425, "grad_norm": 5.53125, "learning_rate": 4.349916917744801e-07, "loss": 0.69212132, "memory(GiB)": 138.1, "step": 74600, "train_speed(iter/s)": 0.200621 }, { "acc": 0.77684078, "epoch": 1.7407077968474314, "grad_norm": 5.1875, "learning_rate": 4.3422134349819014e-07, "loss": 0.82745504, "memory(GiB)": 138.1, "step": 74610, "train_speed(iter/s)": 0.200635 }, { "acc": 0.7814332, "epoch": 1.7409411044197203, "grad_norm": 6.59375, "learning_rate": 4.334516469854766e-07, "loss": 0.75724936, "memory(GiB)": 138.1, "step": 74620, "train_speed(iter/s)": 0.200649 }, { "acc": 0.76991339, "epoch": 1.7411744119920092, "grad_norm": 7.21875, "learning_rate": 4.3268260234621497e-07, "loss": 0.82924404, "memory(GiB)": 138.1, "step": 74630, "train_speed(iter/s)": 0.200663 }, { "acc": 0.78513436, "epoch": 1.741407719564298, "grad_norm": 6.34375, "learning_rate": 4.319142096901846e-07, "loss": 0.76789331, "memory(GiB)": 138.1, "step": 74640, "train_speed(iter/s)": 0.200677 }, { "acc": 0.77668796, "epoch": 1.741641027136587, "grad_norm": 5.15625, "learning_rate": 4.3114646912707394e-07, "loss": 0.8108284, "memory(GiB)": 138.1, "step": 74650, "train_speed(iter/s)": 0.200691 }, { "acc": 0.7892621, "epoch": 1.7418743347088759, "grad_norm": 5.0625, "learning_rate": 4.30379380766478e-07, "loss": 0.76283331, "memory(GiB)": 138.1, "step": 74660, "train_speed(iter/s)": 0.200705 }, { "acc": 0.80962782, "epoch": 1.7421076422811648, "grad_norm": 4.96875, "learning_rate": 4.296129447178965e-07, "loss": 0.68484173, "memory(GiB)": 138.1, "step": 74670, "train_speed(iter/s)": 0.200718 }, { "acc": 0.78058996, "epoch": 1.7423409498534537, "grad_norm": 5.53125, "learning_rate": 4.288471610907402e-07, "loss": 0.80650549, "memory(GiB)": 138.1, "step": 74680, "train_speed(iter/s)": 0.200732 }, { "acc": 0.77258387, "epoch": 1.7425742574257426, "grad_norm": 7.625, "learning_rate": 4.2808202999432335e-07, "loss": 0.80087252, "memory(GiB)": 138.1, "step": 74690, "train_speed(iter/s)": 0.200746 }, { "acc": 0.77971539, "epoch": 1.7428075649980315, "grad_norm": 4.90625, "learning_rate": 4.2731755153786915e-07, "loss": 0.79446883, "memory(GiB)": 138.1, "step": 74700, "train_speed(iter/s)": 0.200761 }, { "acc": 0.78136635, "epoch": 1.7430408725703204, "grad_norm": 4.4375, "learning_rate": 4.2655372583050525e-07, "loss": 0.78863945, "memory(GiB)": 138.1, "step": 74710, "train_speed(iter/s)": 0.200775 }, { "acc": 0.78437352, "epoch": 1.7432741801426093, "grad_norm": 4.15625, "learning_rate": 4.2579055298126945e-07, "loss": 0.7832253, "memory(GiB)": 138.1, "step": 74720, "train_speed(iter/s)": 0.20079 }, { "acc": 0.77852464, "epoch": 1.7435074877148982, "grad_norm": 5.1875, "learning_rate": 4.2502803309910354e-07, "loss": 0.79541359, "memory(GiB)": 138.1, "step": 74730, "train_speed(iter/s)": 0.200804 }, { "acc": 0.77891903, "epoch": 1.743740795287187, "grad_norm": 5.28125, "learning_rate": 4.2426616629285544e-07, "loss": 0.79765043, "memory(GiB)": 138.1, "step": 74740, "train_speed(iter/s)": 0.200818 }, { "acc": 0.76886244, "epoch": 1.743974102859476, "grad_norm": 29.25, "learning_rate": 4.235049526712848e-07, "loss": 0.8290019, "memory(GiB)": 138.1, "step": 74750, "train_speed(iter/s)": 0.200833 }, { "acc": 0.78655448, "epoch": 1.7442074104317649, "grad_norm": 14.1875, "learning_rate": 4.227443923430513e-07, "loss": 0.75835438, "memory(GiB)": 138.1, "step": 74760, "train_speed(iter/s)": 0.200847 }, { "acc": 0.78014526, "epoch": 1.7444407180040538, "grad_norm": 6.5, "learning_rate": 4.2198448541672654e-07, "loss": 0.7994524, "memory(GiB)": 138.1, "step": 74770, "train_speed(iter/s)": 0.200861 }, { "acc": 0.77601213, "epoch": 1.7446740255763427, "grad_norm": 5.1875, "learning_rate": 4.212252320007859e-07, "loss": 0.83280792, "memory(GiB)": 138.1, "step": 74780, "train_speed(iter/s)": 0.200875 }, { "acc": 0.79946394, "epoch": 1.7449073331486316, "grad_norm": 4.96875, "learning_rate": 4.204666322036138e-07, "loss": 0.70037394, "memory(GiB)": 138.1, "step": 74790, "train_speed(iter/s)": 0.200889 }, { "acc": 0.78642559, "epoch": 1.7451406407209205, "grad_norm": 6.40625, "learning_rate": 4.197086861334998e-07, "loss": 0.78597708, "memory(GiB)": 138.1, "step": 74800, "train_speed(iter/s)": 0.200903 }, { "acc": 0.76866412, "epoch": 1.7453739482932091, "grad_norm": 5.5625, "learning_rate": 4.189513938986395e-07, "loss": 0.82813854, "memory(GiB)": 138.1, "step": 74810, "train_speed(iter/s)": 0.200917 }, { "acc": 0.77864561, "epoch": 1.7456072558654983, "grad_norm": 7.96875, "learning_rate": 4.181947556071381e-07, "loss": 0.79182148, "memory(GiB)": 138.1, "step": 74820, "train_speed(iter/s)": 0.200931 }, { "acc": 0.77730217, "epoch": 1.745840563437787, "grad_norm": 6.6875, "learning_rate": 4.1743877136700307e-07, "loss": 0.80669651, "memory(GiB)": 138.1, "step": 74830, "train_speed(iter/s)": 0.200944 }, { "acc": 0.78707137, "epoch": 1.746073871010076, "grad_norm": 5.8125, "learning_rate": 4.1668344128615254e-07, "loss": 0.77385092, "memory(GiB)": 138.1, "step": 74840, "train_speed(iter/s)": 0.200957 }, { "acc": 0.79038467, "epoch": 1.7463071785823647, "grad_norm": 6.09375, "learning_rate": 4.1592876547241035e-07, "loss": 0.74476175, "memory(GiB)": 138.1, "step": 74850, "train_speed(iter/s)": 0.200971 }, { "acc": 0.78900266, "epoch": 1.7465404861546538, "grad_norm": 5.34375, "learning_rate": 4.151747440335047e-07, "loss": 0.74805627, "memory(GiB)": 138.1, "step": 74860, "train_speed(iter/s)": 0.200984 }, { "acc": 0.77410288, "epoch": 1.7467737937269425, "grad_norm": 4.40625, "learning_rate": 4.144213770770711e-07, "loss": 0.82338734, "memory(GiB)": 138.1, "step": 74870, "train_speed(iter/s)": 0.200996 }, { "acc": 0.7824645, "epoch": 1.7470071012992316, "grad_norm": 4.5, "learning_rate": 4.1366866471065524e-07, "loss": 0.77526579, "memory(GiB)": 138.1, "step": 74880, "train_speed(iter/s)": 0.20101 }, { "acc": 0.76652699, "epoch": 1.7472404088715203, "grad_norm": 4.90625, "learning_rate": 4.129166070417051e-07, "loss": 0.85960999, "memory(GiB)": 138.1, "step": 74890, "train_speed(iter/s)": 0.201024 }, { "acc": 0.78324442, "epoch": 1.7474737164438094, "grad_norm": 6.5625, "learning_rate": 4.121652041775759e-07, "loss": 0.75736356, "memory(GiB)": 138.1, "step": 74900, "train_speed(iter/s)": 0.201038 }, { "acc": 0.78140106, "epoch": 1.7477070240160981, "grad_norm": 5.78125, "learning_rate": 4.1141445622553175e-07, "loss": 0.80148916, "memory(GiB)": 138.1, "step": 74910, "train_speed(iter/s)": 0.201053 }, { "acc": 0.79169626, "epoch": 1.7479403315883872, "grad_norm": 5.59375, "learning_rate": 4.106643632927404e-07, "loss": 0.7412683, "memory(GiB)": 138.1, "step": 74920, "train_speed(iter/s)": 0.201066 }, { "acc": 0.77393618, "epoch": 1.748173639160676, "grad_norm": 6.71875, "learning_rate": 4.099149254862783e-07, "loss": 0.78559313, "memory(GiB)": 138.1, "step": 74930, "train_speed(iter/s)": 0.20108 }, { "acc": 0.78671503, "epoch": 1.748406946732965, "grad_norm": 7.40625, "learning_rate": 4.091661429131277e-07, "loss": 0.77732067, "memory(GiB)": 138.1, "step": 74940, "train_speed(iter/s)": 0.201095 }, { "acc": 0.76118011, "epoch": 1.7486402543052537, "grad_norm": 5.65625, "learning_rate": 4.0841801568017534e-07, "loss": 0.86246977, "memory(GiB)": 138.1, "step": 74950, "train_speed(iter/s)": 0.201108 }, { "acc": 0.76429033, "epoch": 1.7488735618775428, "grad_norm": 5.375, "learning_rate": 4.0767054389421857e-07, "loss": 0.85855999, "memory(GiB)": 138.1, "step": 74960, "train_speed(iter/s)": 0.201121 }, { "acc": 0.78033648, "epoch": 1.7491068694498315, "grad_norm": 4.15625, "learning_rate": 4.069237276619564e-07, "loss": 0.76560135, "memory(GiB)": 138.1, "step": 74970, "train_speed(iter/s)": 0.201134 }, { "acc": 0.76945028, "epoch": 1.7493401770221206, "grad_norm": 4.71875, "learning_rate": 4.0617756708999976e-07, "loss": 0.82791023, "memory(GiB)": 138.1, "step": 74980, "train_speed(iter/s)": 0.201148 }, { "acc": 0.76738582, "epoch": 1.7495734845944093, "grad_norm": 5.46875, "learning_rate": 4.05432062284859e-07, "loss": 0.834793, "memory(GiB)": 138.1, "step": 74990, "train_speed(iter/s)": 0.201161 }, { "acc": 0.78293915, "epoch": 1.7498067921666982, "grad_norm": 6.5, "learning_rate": 4.0468721335295726e-07, "loss": 0.78276381, "memory(GiB)": 138.1, "step": 75000, "train_speed(iter/s)": 0.201175 }, { "epoch": 1.7498067921666982, "eval_acc": 0.7446785272687994, "eval_loss": 0.8044347763061523, "eval_runtime": 1272.0014, "eval_samples_per_second": 28.295, "eval_steps_per_second": 14.148, "step": 75000 }, { "acc": 0.77675529, "epoch": 1.750040099738987, "grad_norm": 5.9375, "learning_rate": 4.0394302040062117e-07, "loss": 0.78634806, "memory(GiB)": 138.1, "step": 75010, "train_speed(iter/s)": 0.200492 }, { "acc": 0.79533587, "epoch": 1.750273407311276, "grad_norm": 4.46875, "learning_rate": 4.03199483534083e-07, "loss": 0.73570852, "memory(GiB)": 138.1, "step": 75020, "train_speed(iter/s)": 0.200505 }, { "acc": 0.78597369, "epoch": 1.750506714883565, "grad_norm": 6.53125, "learning_rate": 4.0245660285948394e-07, "loss": 0.75488353, "memory(GiB)": 138.1, "step": 75030, "train_speed(iter/s)": 0.200519 }, { "acc": 0.8074131, "epoch": 1.7507400224558538, "grad_norm": 11.6875, "learning_rate": 4.0171437848286867e-07, "loss": 0.698316, "memory(GiB)": 138.1, "step": 75040, "train_speed(iter/s)": 0.200532 }, { "acc": 0.77187395, "epoch": 1.7509733300281427, "grad_norm": 6.53125, "learning_rate": 4.009728105101901e-07, "loss": 0.81652546, "memory(GiB)": 138.1, "step": 75050, "train_speed(iter/s)": 0.200547 }, { "acc": 0.77719297, "epoch": 1.7512066376004316, "grad_norm": 6.0, "learning_rate": 4.00231899047307e-07, "loss": 0.8079525, "memory(GiB)": 138.1, "step": 75060, "train_speed(iter/s)": 0.20056 }, { "acc": 0.77472291, "epoch": 1.7514399451727205, "grad_norm": 6.15625, "learning_rate": 3.994916441999841e-07, "loss": 0.80752106, "memory(GiB)": 138.1, "step": 75070, "train_speed(iter/s)": 0.200573 }, { "acc": 0.79288235, "epoch": 1.7516732527450094, "grad_norm": 6.0625, "learning_rate": 3.9875204607389304e-07, "loss": 0.76388402, "memory(GiB)": 138.1, "step": 75080, "train_speed(iter/s)": 0.200587 }, { "acc": 0.77766056, "epoch": 1.7519065603172983, "grad_norm": 4.25, "learning_rate": 3.980131047746105e-07, "loss": 0.80273018, "memory(GiB)": 138.1, "step": 75090, "train_speed(iter/s)": 0.200601 }, { "acc": 0.78243771, "epoch": 1.7521398678895872, "grad_norm": 4.875, "learning_rate": 3.9727482040762044e-07, "loss": 0.77541404, "memory(GiB)": 138.1, "step": 75100, "train_speed(iter/s)": 0.200615 }, { "acc": 0.78391609, "epoch": 1.752373175461876, "grad_norm": 5.59375, "learning_rate": 3.965371930783113e-07, "loss": 0.78062081, "memory(GiB)": 138.1, "step": 75110, "train_speed(iter/s)": 0.200628 }, { "acc": 0.79337425, "epoch": 1.752606483034165, "grad_norm": 5.0, "learning_rate": 3.958002228919822e-07, "loss": 0.73935184, "memory(GiB)": 138.1, "step": 75120, "train_speed(iter/s)": 0.200642 }, { "acc": 0.80228825, "epoch": 1.752839790606454, "grad_norm": 3.96875, "learning_rate": 3.9506390995383225e-07, "loss": 0.70532112, "memory(GiB)": 138.1, "step": 75130, "train_speed(iter/s)": 0.200656 }, { "acc": 0.79691057, "epoch": 1.7530730981787428, "grad_norm": 5.5, "learning_rate": 3.943282543689725e-07, "loss": 0.73715506, "memory(GiB)": 138.1, "step": 75140, "train_speed(iter/s)": 0.20067 }, { "acc": 0.78184276, "epoch": 1.7533064057510317, "grad_norm": 5.0625, "learning_rate": 3.935932562424166e-07, "loss": 0.78103657, "memory(GiB)": 138.1, "step": 75150, "train_speed(iter/s)": 0.200681 }, { "acc": 0.78166604, "epoch": 1.7535397133233206, "grad_norm": 4.6875, "learning_rate": 3.9285891567908465e-07, "loss": 0.78287544, "memory(GiB)": 138.1, "step": 75160, "train_speed(iter/s)": 0.200695 }, { "acc": 0.78169422, "epoch": 1.7537730208956095, "grad_norm": 6.28125, "learning_rate": 3.9212523278380434e-07, "loss": 0.75878248, "memory(GiB)": 138.1, "step": 75170, "train_speed(iter/s)": 0.200707 }, { "acc": 0.78662972, "epoch": 1.7540063284678984, "grad_norm": 7.4375, "learning_rate": 3.9139220766130803e-07, "loss": 0.75259981, "memory(GiB)": 138.1, "step": 75180, "train_speed(iter/s)": 0.20072 }, { "acc": 0.77813616, "epoch": 1.7542396360401873, "grad_norm": 9.8125, "learning_rate": 3.9065984041623594e-07, "loss": 0.7942369, "memory(GiB)": 138.1, "step": 75190, "train_speed(iter/s)": 0.200733 }, { "acc": 0.77014294, "epoch": 1.754472943612476, "grad_norm": 4.09375, "learning_rate": 3.8992813115313164e-07, "loss": 0.83160582, "memory(GiB)": 138.1, "step": 75200, "train_speed(iter/s)": 0.200747 }, { "acc": 0.77763653, "epoch": 1.754706251184765, "grad_norm": 5.28125, "learning_rate": 3.8919707997644884e-07, "loss": 0.8163887, "memory(GiB)": 138.1, "step": 75210, "train_speed(iter/s)": 0.200761 }, { "acc": 0.77771187, "epoch": 1.7549395587570538, "grad_norm": 4.6875, "learning_rate": 3.8846668699054233e-07, "loss": 0.80659122, "memory(GiB)": 138.1, "step": 75220, "train_speed(iter/s)": 0.200774 }, { "acc": 0.78318548, "epoch": 1.7551728663293429, "grad_norm": 5.90625, "learning_rate": 3.877369522996771e-07, "loss": 0.80368767, "memory(GiB)": 138.1, "step": 75230, "train_speed(iter/s)": 0.200787 }, { "acc": 0.7785296, "epoch": 1.7554061739016316, "grad_norm": 4.625, "learning_rate": 3.8700787600802203e-07, "loss": 0.78689766, "memory(GiB)": 138.1, "step": 75240, "train_speed(iter/s)": 0.200801 }, { "acc": 0.77669411, "epoch": 1.7556394814739207, "grad_norm": 6.90625, "learning_rate": 3.862794582196522e-07, "loss": 0.8024128, "memory(GiB)": 138.1, "step": 75250, "train_speed(iter/s)": 0.200815 }, { "acc": 0.77892799, "epoch": 1.7558727890462094, "grad_norm": 5.96875, "learning_rate": 3.8555169903854993e-07, "loss": 0.7945653, "memory(GiB)": 138.1, "step": 75260, "train_speed(iter/s)": 0.200829 }, { "acc": 0.79608464, "epoch": 1.7561060966184985, "grad_norm": 6.03125, "learning_rate": 3.848245985686011e-07, "loss": 0.72459307, "memory(GiB)": 138.1, "step": 75270, "train_speed(iter/s)": 0.200843 }, { "acc": 0.77168674, "epoch": 1.7563394041907872, "grad_norm": 5.96875, "learning_rate": 3.84098156913601e-07, "loss": 0.82376051, "memory(GiB)": 138.1, "step": 75280, "train_speed(iter/s)": 0.200857 }, { "acc": 0.79666181, "epoch": 1.7565727117630763, "grad_norm": 6.6875, "learning_rate": 3.8337237417724827e-07, "loss": 0.72730808, "memory(GiB)": 138.1, "step": 75290, "train_speed(iter/s)": 0.200871 }, { "acc": 0.8078371, "epoch": 1.756806019335365, "grad_norm": 5.8125, "learning_rate": 3.826472504631473e-07, "loss": 0.68620291, "memory(GiB)": 138.1, "step": 75300, "train_speed(iter/s)": 0.200884 }, { "acc": 0.78211536, "epoch": 1.757039326907654, "grad_norm": 6.28125, "learning_rate": 3.8192278587481035e-07, "loss": 0.77496066, "memory(GiB)": 138.1, "step": 75310, "train_speed(iter/s)": 0.200898 }, { "acc": 0.77927465, "epoch": 1.7572726344799428, "grad_norm": 5.21875, "learning_rate": 3.811989805156546e-07, "loss": 0.80956287, "memory(GiB)": 138.1, "step": 75320, "train_speed(iter/s)": 0.200912 }, { "acc": 0.79260778, "epoch": 1.7575059420522319, "grad_norm": 7.46875, "learning_rate": 3.804758344890025e-07, "loss": 0.74879017, "memory(GiB)": 138.1, "step": 75330, "train_speed(iter/s)": 0.200926 }, { "acc": 0.78099527, "epoch": 1.7577392496245205, "grad_norm": 5.75, "learning_rate": 3.7975334789808194e-07, "loss": 0.78341789, "memory(GiB)": 138.1, "step": 75340, "train_speed(iter/s)": 0.20094 }, { "acc": 0.77331486, "epoch": 1.7579725571968097, "grad_norm": 4.84375, "learning_rate": 3.790315208460299e-07, "loss": 0.82067747, "memory(GiB)": 138.1, "step": 75350, "train_speed(iter/s)": 0.200953 }, { "acc": 0.77612276, "epoch": 1.7582058647690983, "grad_norm": 5.25, "learning_rate": 3.783103534358845e-07, "loss": 0.79735045, "memory(GiB)": 138.1, "step": 75360, "train_speed(iter/s)": 0.200967 }, { "acc": 0.77384906, "epoch": 1.7584391723413875, "grad_norm": 6.6875, "learning_rate": 3.775898457705951e-07, "loss": 0.81172314, "memory(GiB)": 138.1, "step": 75370, "train_speed(iter/s)": 0.200981 }, { "acc": 0.77728834, "epoch": 1.7586724799136761, "grad_norm": 5.8125, "learning_rate": 3.768699979530122e-07, "loss": 0.77499533, "memory(GiB)": 138.1, "step": 75380, "train_speed(iter/s)": 0.200994 }, { "acc": 0.78484507, "epoch": 1.758905787485965, "grad_norm": 4.65625, "learning_rate": 3.761508100858929e-07, "loss": 0.7881773, "memory(GiB)": 138.1, "step": 75390, "train_speed(iter/s)": 0.201007 }, { "acc": 0.76613727, "epoch": 1.759139095058254, "grad_norm": 7.375, "learning_rate": 3.7543228227190307e-07, "loss": 0.84683084, "memory(GiB)": 138.1, "step": 75400, "train_speed(iter/s)": 0.201021 }, { "acc": 0.80847015, "epoch": 1.7593724026305428, "grad_norm": 4.125, "learning_rate": 3.747144146136111e-07, "loss": 0.67406015, "memory(GiB)": 138.1, "step": 75410, "train_speed(iter/s)": 0.201035 }, { "acc": 0.79223614, "epoch": 1.7596057102028317, "grad_norm": 3.734375, "learning_rate": 3.739972072134934e-07, "loss": 0.75303974, "memory(GiB)": 138.1, "step": 75420, "train_speed(iter/s)": 0.201048 }, { "acc": 0.7955615, "epoch": 1.7598390177751206, "grad_norm": 5.40625, "learning_rate": 3.7328066017393025e-07, "loss": 0.72555103, "memory(GiB)": 138.1, "step": 75430, "train_speed(iter/s)": 0.201061 }, { "acc": 0.7834197, "epoch": 1.7600723253474095, "grad_norm": 6.5625, "learning_rate": 3.725647735972093e-07, "loss": 0.77190681, "memory(GiB)": 138.1, "step": 75440, "train_speed(iter/s)": 0.201075 }, { "acc": 0.78522491, "epoch": 1.7603056329196984, "grad_norm": 3.6875, "learning_rate": 3.7184954758552215e-07, "loss": 0.75268602, "memory(GiB)": 138.1, "step": 75450, "train_speed(iter/s)": 0.201089 }, { "acc": 0.77963495, "epoch": 1.7605389404919873, "grad_norm": 5.65625, "learning_rate": 3.711349822409671e-07, "loss": 0.78821554, "memory(GiB)": 138.1, "step": 75460, "train_speed(iter/s)": 0.201103 }, { "acc": 0.77956514, "epoch": 1.7607722480642762, "grad_norm": 5.875, "learning_rate": 3.7042107766554925e-07, "loss": 0.78396311, "memory(GiB)": 138.1, "step": 75470, "train_speed(iter/s)": 0.201115 }, { "acc": 0.79036522, "epoch": 1.7610055556365651, "grad_norm": 5.40625, "learning_rate": 3.6970783396117706e-07, "loss": 0.77081327, "memory(GiB)": 138.1, "step": 75480, "train_speed(iter/s)": 0.201128 }, { "acc": 0.77387805, "epoch": 1.761238863208854, "grad_norm": 6.625, "learning_rate": 3.689952512296674e-07, "loss": 0.83468933, "memory(GiB)": 138.1, "step": 75490, "train_speed(iter/s)": 0.201142 }, { "acc": 0.78259726, "epoch": 1.761472170781143, "grad_norm": 5.53125, "learning_rate": 3.682833295727389e-07, "loss": 0.76884212, "memory(GiB)": 138.1, "step": 75500, "train_speed(iter/s)": 0.201156 }, { "epoch": 1.761472170781143, "eval_acc": 0.7446898975092928, "eval_loss": 0.8043956756591797, "eval_runtime": 1270.8807, "eval_samples_per_second": 28.32, "eval_steps_per_second": 14.16, "step": 75500 }, { "acc": 0.775494, "epoch": 1.7617054783534318, "grad_norm": 3.890625, "learning_rate": 3.675720690920209e-07, "loss": 0.7966898, "memory(GiB)": 138.1, "step": 75510, "train_speed(iter/s)": 0.200477 }, { "acc": 0.77051344, "epoch": 1.7619387859257207, "grad_norm": 4.96875, "learning_rate": 3.668614698890444e-07, "loss": 0.8196578, "memory(GiB)": 138.1, "step": 75520, "train_speed(iter/s)": 0.200491 }, { "acc": 0.77491951, "epoch": 1.7621720934980096, "grad_norm": 4.5, "learning_rate": 3.661515320652459e-07, "loss": 0.82127848, "memory(GiB)": 138.1, "step": 75530, "train_speed(iter/s)": 0.200505 }, { "acc": 0.80416489, "epoch": 1.7624054010702985, "grad_norm": 5.28125, "learning_rate": 3.654422557219711e-07, "loss": 0.6906394, "memory(GiB)": 138.1, "step": 75540, "train_speed(iter/s)": 0.200519 }, { "acc": 0.76411433, "epoch": 1.7626387086425874, "grad_norm": 5.46875, "learning_rate": 3.6473364096046795e-07, "loss": 0.85491076, "memory(GiB)": 138.1, "step": 75550, "train_speed(iter/s)": 0.200534 }, { "acc": 0.7577879, "epoch": 1.7628720162148763, "grad_norm": 5.0625, "learning_rate": 3.64025687881891e-07, "loss": 0.87597389, "memory(GiB)": 138.1, "step": 75560, "train_speed(iter/s)": 0.200548 }, { "acc": 0.78338909, "epoch": 1.763105323787165, "grad_norm": 4.5, "learning_rate": 3.633183965872994e-07, "loss": 0.78178053, "memory(GiB)": 138.1, "step": 75570, "train_speed(iter/s)": 0.200561 }, { "acc": 0.80338173, "epoch": 1.7633386313594541, "grad_norm": 5.21875, "learning_rate": 3.6261176717766076e-07, "loss": 0.71166687, "memory(GiB)": 138.1, "step": 75580, "train_speed(iter/s)": 0.200575 }, { "acc": 0.77883034, "epoch": 1.7635719389317428, "grad_norm": 6.46875, "learning_rate": 3.6190579975384497e-07, "loss": 0.81209888, "memory(GiB)": 138.1, "step": 75590, "train_speed(iter/s)": 0.200588 }, { "acc": 0.78879728, "epoch": 1.763805246504032, "grad_norm": 4.4375, "learning_rate": 3.6120049441662805e-07, "loss": 0.76921787, "memory(GiB)": 138.1, "step": 75600, "train_speed(iter/s)": 0.200602 }, { "acc": 0.78142576, "epoch": 1.7640385540763206, "grad_norm": 5.46875, "learning_rate": 3.6049585126669395e-07, "loss": 0.79211183, "memory(GiB)": 138.1, "step": 75610, "train_speed(iter/s)": 0.200615 }, { "acc": 0.78412828, "epoch": 1.7642718616486097, "grad_norm": 6.3125, "learning_rate": 3.5979187040462883e-07, "loss": 0.77902346, "memory(GiB)": 138.1, "step": 75620, "train_speed(iter/s)": 0.200629 }, { "acc": 0.75922632, "epoch": 1.7645051692208984, "grad_norm": 6.3125, "learning_rate": 3.5908855193092617e-07, "loss": 0.87735043, "memory(GiB)": 138.1, "step": 75630, "train_speed(iter/s)": 0.200644 }, { "acc": 0.79961157, "epoch": 1.7647384767931875, "grad_norm": 5.46875, "learning_rate": 3.5838589594598514e-07, "loss": 0.7121191, "memory(GiB)": 138.1, "step": 75640, "train_speed(iter/s)": 0.200658 }, { "acc": 0.7941186, "epoch": 1.7649717843654762, "grad_norm": 5.125, "learning_rate": 3.5768390255010833e-07, "loss": 0.71949234, "memory(GiB)": 138.1, "step": 75650, "train_speed(iter/s)": 0.200672 }, { "acc": 0.77251234, "epoch": 1.7652050919377653, "grad_norm": 5.5625, "learning_rate": 3.569825718435066e-07, "loss": 0.82141552, "memory(GiB)": 138.1, "step": 75660, "train_speed(iter/s)": 0.200686 }, { "acc": 0.7619668, "epoch": 1.765438399510054, "grad_norm": 4.65625, "learning_rate": 3.562819039262938e-07, "loss": 0.87379723, "memory(GiB)": 138.1, "step": 75670, "train_speed(iter/s)": 0.200701 }, { "acc": 0.80607891, "epoch": 1.7656717070823431, "grad_norm": 7.5, "learning_rate": 3.5558189889848995e-07, "loss": 0.67555485, "memory(GiB)": 138.1, "step": 75680, "train_speed(iter/s)": 0.200715 }, { "acc": 0.78483949, "epoch": 1.7659050146546318, "grad_norm": 4.0625, "learning_rate": 3.5488255686002005e-07, "loss": 0.80067978, "memory(GiB)": 138.1, "step": 75690, "train_speed(iter/s)": 0.200728 }, { "acc": 0.77885909, "epoch": 1.766138322226921, "grad_norm": 5.28125, "learning_rate": 3.5418387791071706e-07, "loss": 0.81857595, "memory(GiB)": 138.1, "step": 75700, "train_speed(iter/s)": 0.200743 }, { "acc": 0.77617831, "epoch": 1.7663716297992096, "grad_norm": 5.125, "learning_rate": 3.5348586215031457e-07, "loss": 0.79372025, "memory(GiB)": 138.1, "step": 75710, "train_speed(iter/s)": 0.200756 }, { "acc": 0.78535709, "epoch": 1.7666049373714987, "grad_norm": 6.03125, "learning_rate": 3.527885096784567e-07, "loss": 0.76806793, "memory(GiB)": 138.1, "step": 75720, "train_speed(iter/s)": 0.200769 }, { "acc": 0.76860857, "epoch": 1.7668382449437874, "grad_norm": 5.59375, "learning_rate": 3.520918205946883e-07, "loss": 0.81936932, "memory(GiB)": 138.1, "step": 75730, "train_speed(iter/s)": 0.200784 }, { "acc": 0.77166557, "epoch": 1.7670715525160765, "grad_norm": 5.375, "learning_rate": 3.51395794998462e-07, "loss": 0.82154722, "memory(GiB)": 138.1, "step": 75740, "train_speed(iter/s)": 0.200798 }, { "acc": 0.78490963, "epoch": 1.7673048600883652, "grad_norm": 6.53125, "learning_rate": 3.507004329891367e-07, "loss": 0.76154852, "memory(GiB)": 138.1, "step": 75750, "train_speed(iter/s)": 0.200812 }, { "acc": 0.78819742, "epoch": 1.7675381676606543, "grad_norm": 5.96875, "learning_rate": 3.5000573466597243e-07, "loss": 0.74255095, "memory(GiB)": 138.1, "step": 75760, "train_speed(iter/s)": 0.200825 }, { "acc": 0.79479284, "epoch": 1.767771475232943, "grad_norm": 6.65625, "learning_rate": 3.4931170012813985e-07, "loss": 0.73127079, "memory(GiB)": 138.1, "step": 75770, "train_speed(iter/s)": 0.200838 }, { "acc": 0.79238229, "epoch": 1.7680047828052319, "grad_norm": 6.1875, "learning_rate": 3.4861832947471086e-07, "loss": 0.73390646, "memory(GiB)": 138.1, "step": 75780, "train_speed(iter/s)": 0.200852 }, { "acc": 0.78550787, "epoch": 1.7682380903775208, "grad_norm": 5.0625, "learning_rate": 3.479256228046646e-07, "loss": 0.75722055, "memory(GiB)": 138.1, "step": 75790, "train_speed(iter/s)": 0.200866 }, { "acc": 0.777526, "epoch": 1.7684713979498097, "grad_norm": 9.1875, "learning_rate": 3.4723358021688303e-07, "loss": 0.78414106, "memory(GiB)": 138.1, "step": 75800, "train_speed(iter/s)": 0.20088 }, { "acc": 0.77671871, "epoch": 1.7687047055220986, "grad_norm": 4.90625, "learning_rate": 3.465422018101572e-07, "loss": 0.80290337, "memory(GiB)": 138.1, "step": 75810, "train_speed(iter/s)": 0.200894 }, { "acc": 0.79054794, "epoch": 1.7689380130943875, "grad_norm": 5.28125, "learning_rate": 3.4585148768317975e-07, "loss": 0.74591765, "memory(GiB)": 138.1, "step": 75820, "train_speed(iter/s)": 0.200909 }, { "acc": 0.77464447, "epoch": 1.7691713206666764, "grad_norm": 6.28125, "learning_rate": 3.4516143793455027e-07, "loss": 0.83038616, "memory(GiB)": 138.1, "step": 75830, "train_speed(iter/s)": 0.200923 }, { "acc": 0.78378277, "epoch": 1.7694046282389653, "grad_norm": 4.34375, "learning_rate": 3.4447205266277373e-07, "loss": 0.75231061, "memory(GiB)": 138.1, "step": 75840, "train_speed(iter/s)": 0.200936 }, { "acc": 0.78303156, "epoch": 1.7696379358112542, "grad_norm": 4.53125, "learning_rate": 3.437833319662587e-07, "loss": 0.78000083, "memory(GiB)": 138.1, "step": 75850, "train_speed(iter/s)": 0.20095 }, { "acc": 0.79331565, "epoch": 1.769871243383543, "grad_norm": 5.65625, "learning_rate": 3.430952759433209e-07, "loss": 0.74238071, "memory(GiB)": 138.1, "step": 75860, "train_speed(iter/s)": 0.200964 }, { "acc": 0.78225794, "epoch": 1.770104550955832, "grad_norm": 5.5625, "learning_rate": 3.4240788469217966e-07, "loss": 0.78348684, "memory(GiB)": 138.1, "step": 75870, "train_speed(iter/s)": 0.200977 }, { "acc": 0.79212933, "epoch": 1.7703378585281209, "grad_norm": 6.96875, "learning_rate": 3.417211583109592e-07, "loss": 0.73944921, "memory(GiB)": 138.1, "step": 75880, "train_speed(iter/s)": 0.200991 }, { "acc": 0.77557068, "epoch": 1.7705711661004098, "grad_norm": 5.03125, "learning_rate": 3.4103509689769165e-07, "loss": 0.78513107, "memory(GiB)": 138.1, "step": 75890, "train_speed(iter/s)": 0.201004 }, { "acc": 0.77322273, "epoch": 1.7708044736726987, "grad_norm": 6.78125, "learning_rate": 3.4034970055030923e-07, "loss": 0.81071148, "memory(GiB)": 138.1, "step": 75900, "train_speed(iter/s)": 0.201018 }, { "acc": 0.79248972, "epoch": 1.7710377812449876, "grad_norm": 4.28125, "learning_rate": 3.396649693666537e-07, "loss": 0.74013586, "memory(GiB)": 138.1, "step": 75910, "train_speed(iter/s)": 0.201031 }, { "acc": 0.79005871, "epoch": 1.7712710888172765, "grad_norm": 6.03125, "learning_rate": 3.3898090344446966e-07, "loss": 0.73053083, "memory(GiB)": 138.1, "step": 75920, "train_speed(iter/s)": 0.201044 }, { "acc": 0.76683903, "epoch": 1.7715043963895654, "grad_norm": 7.09375, "learning_rate": 3.382975028814078e-07, "loss": 0.8392231, "memory(GiB)": 138.1, "step": 75930, "train_speed(iter/s)": 0.201059 }, { "acc": 0.78439379, "epoch": 1.7717377039618543, "grad_norm": 6.59375, "learning_rate": 3.3761476777502355e-07, "loss": 0.77608676, "memory(GiB)": 138.1, "step": 75940, "train_speed(iter/s)": 0.201072 }, { "acc": 0.78523664, "epoch": 1.7719710115341432, "grad_norm": 4.9375, "learning_rate": 3.369326982227761e-07, "loss": 0.76700554, "memory(GiB)": 138.1, "step": 75950, "train_speed(iter/s)": 0.201086 }, { "acc": 0.78907003, "epoch": 1.7722043191064318, "grad_norm": 4.4375, "learning_rate": 3.3625129432203197e-07, "loss": 0.75762873, "memory(GiB)": 138.1, "step": 75960, "train_speed(iter/s)": 0.201099 }, { "acc": 0.77812548, "epoch": 1.772437626678721, "grad_norm": 4.28125, "learning_rate": 3.3557055617006006e-07, "loss": 0.76968746, "memory(GiB)": 138.1, "step": 75970, "train_speed(iter/s)": 0.201113 }, { "acc": 0.77837658, "epoch": 1.7726709342510096, "grad_norm": 4.34375, "learning_rate": 3.348904838640371e-07, "loss": 0.77840204, "memory(GiB)": 138.1, "step": 75980, "train_speed(iter/s)": 0.201127 }, { "acc": 0.77046366, "epoch": 1.7729042418232988, "grad_norm": 5.40625, "learning_rate": 3.3421107750104155e-07, "loss": 0.83179445, "memory(GiB)": 138.1, "step": 75990, "train_speed(iter/s)": 0.201141 }, { "acc": 0.77653723, "epoch": 1.7731375493955874, "grad_norm": 5.96875, "learning_rate": 3.3353233717805967e-07, "loss": 0.80116844, "memory(GiB)": 138.1, "step": 76000, "train_speed(iter/s)": 0.201154 }, { "epoch": 1.7731375493955874, "eval_acc": 0.744712317701815, "eval_loss": 0.804376482963562, "eval_runtime": 1270.1032, "eval_samples_per_second": 28.337, "eval_steps_per_second": 14.169, "step": 76000 }, { "acc": 0.77820454, "epoch": 1.7733708569678766, "grad_norm": 4.59375, "learning_rate": 3.3285426299198175e-07, "loss": 0.81404266, "memory(GiB)": 138.1, "step": 76010, "train_speed(iter/s)": 0.200481 }, { "acc": 0.77452345, "epoch": 1.7736041645401652, "grad_norm": 4.4375, "learning_rate": 3.321768550396015e-07, "loss": 0.81841717, "memory(GiB)": 138.1, "step": 76020, "train_speed(iter/s)": 0.200495 }, { "acc": 0.78945813, "epoch": 1.7738374721124543, "grad_norm": 4.96875, "learning_rate": 3.3150011341761933e-07, "loss": 0.75137343, "memory(GiB)": 138.1, "step": 76030, "train_speed(iter/s)": 0.200509 }, { "acc": 0.7791081, "epoch": 1.774070779684743, "grad_norm": 5.375, "learning_rate": 3.308240382226385e-07, "loss": 0.7906178, "memory(GiB)": 138.1, "step": 76040, "train_speed(iter/s)": 0.200523 }, { "acc": 0.78823118, "epoch": 1.7743040872570321, "grad_norm": 4.59375, "learning_rate": 3.301486295511713e-07, "loss": 0.77563086, "memory(GiB)": 138.1, "step": 76050, "train_speed(iter/s)": 0.200537 }, { "acc": 0.78934536, "epoch": 1.7745373948293208, "grad_norm": 7.5, "learning_rate": 3.294738874996295e-07, "loss": 0.7632966, "memory(GiB)": 138.1, "step": 76060, "train_speed(iter/s)": 0.200552 }, { "acc": 0.77486506, "epoch": 1.77477070240161, "grad_norm": 7.0, "learning_rate": 3.2879981216433433e-07, "loss": 0.85285072, "memory(GiB)": 138.1, "step": 76070, "train_speed(iter/s)": 0.200566 }, { "acc": 0.7874649, "epoch": 1.7750040099738986, "grad_norm": 4.25, "learning_rate": 3.281264036415088e-07, "loss": 0.75169353, "memory(GiB)": 138.1, "step": 76080, "train_speed(iter/s)": 0.200579 }, { "acc": 0.77508116, "epoch": 1.7752373175461877, "grad_norm": 6.125, "learning_rate": 3.274536620272811e-07, "loss": 0.80775452, "memory(GiB)": 138.1, "step": 76090, "train_speed(iter/s)": 0.200593 }, { "acc": 0.77886529, "epoch": 1.7754706251184764, "grad_norm": 4.8125, "learning_rate": 3.267815874176866e-07, "loss": 0.80793839, "memory(GiB)": 138.1, "step": 76100, "train_speed(iter/s)": 0.200607 }, { "acc": 0.77209039, "epoch": 1.7757039326907655, "grad_norm": 7.125, "learning_rate": 3.2611017990866244e-07, "loss": 0.82538052, "memory(GiB)": 138.1, "step": 76110, "train_speed(iter/s)": 0.200621 }, { "acc": 0.77398095, "epoch": 1.7759372402630542, "grad_norm": 4.96875, "learning_rate": 3.254394395960536e-07, "loss": 0.81047382, "memory(GiB)": 138.1, "step": 76120, "train_speed(iter/s)": 0.200635 }, { "acc": 0.76875162, "epoch": 1.7761705478353433, "grad_norm": 5.0, "learning_rate": 3.247693665756052e-07, "loss": 0.84657841, "memory(GiB)": 138.1, "step": 76130, "train_speed(iter/s)": 0.200648 }, { "acc": 0.7959794, "epoch": 1.776403855407632, "grad_norm": 4.8125, "learning_rate": 3.2409996094297294e-07, "loss": 0.74632068, "memory(GiB)": 138.1, "step": 76140, "train_speed(iter/s)": 0.200661 }, { "acc": 0.77729673, "epoch": 1.776637162979921, "grad_norm": 6.25, "learning_rate": 3.234312227937114e-07, "loss": 0.82577114, "memory(GiB)": 138.1, "step": 76150, "train_speed(iter/s)": 0.200674 }, { "acc": 0.80631809, "epoch": 1.7768704705522098, "grad_norm": 6.125, "learning_rate": 3.2276315222328547e-07, "loss": 0.71360807, "memory(GiB)": 138.1, "step": 76160, "train_speed(iter/s)": 0.200689 }, { "acc": 0.80361118, "epoch": 1.7771037781244987, "grad_norm": 4.1875, "learning_rate": 3.2209574932706043e-07, "loss": 0.69395614, "memory(GiB)": 138.1, "step": 76170, "train_speed(iter/s)": 0.200703 }, { "acc": 0.77744942, "epoch": 1.7773370856967876, "grad_norm": 5.0625, "learning_rate": 3.214290142003079e-07, "loss": 0.80556965, "memory(GiB)": 138.1, "step": 76180, "train_speed(iter/s)": 0.200716 }, { "acc": 0.78394184, "epoch": 1.7775703932690765, "grad_norm": 5.875, "learning_rate": 3.207629469382051e-07, "loss": 0.78173552, "memory(GiB)": 138.1, "step": 76190, "train_speed(iter/s)": 0.20073 }, { "acc": 0.7806911, "epoch": 1.7778037008413654, "grad_norm": 4.875, "learning_rate": 3.2009754763583143e-07, "loss": 0.79640622, "memory(GiB)": 138.1, "step": 76200, "train_speed(iter/s)": 0.200743 }, { "acc": 0.78159871, "epoch": 1.7780370084136543, "grad_norm": 5.34375, "learning_rate": 3.194328163881738e-07, "loss": 0.77030783, "memory(GiB)": 138.1, "step": 76210, "train_speed(iter/s)": 0.200756 }, { "acc": 0.78937597, "epoch": 1.7782703159859432, "grad_norm": 4.5, "learning_rate": 3.1876875329012235e-07, "loss": 0.74239864, "memory(GiB)": 138.1, "step": 76220, "train_speed(iter/s)": 0.200768 }, { "acc": 0.76783009, "epoch": 1.778503623558232, "grad_norm": 5.03125, "learning_rate": 3.181053584364707e-07, "loss": 0.84220142, "memory(GiB)": 138.1, "step": 76230, "train_speed(iter/s)": 0.200783 }, { "acc": 0.77021403, "epoch": 1.778736931130521, "grad_norm": 5.5, "learning_rate": 3.174426319219204e-07, "loss": 0.83477764, "memory(GiB)": 138.1, "step": 76240, "train_speed(iter/s)": 0.200798 }, { "acc": 0.78976417, "epoch": 1.77897023870281, "grad_norm": 5.03125, "learning_rate": 3.167805738410723e-07, "loss": 0.75705347, "memory(GiB)": 138.1, "step": 76250, "train_speed(iter/s)": 0.200812 }, { "acc": 0.78457289, "epoch": 1.7792035462750988, "grad_norm": 6.6875, "learning_rate": 3.161191842884381e-07, "loss": 0.76162119, "memory(GiB)": 138.1, "step": 76260, "train_speed(iter/s)": 0.200827 }, { "acc": 0.78860617, "epoch": 1.7794368538473877, "grad_norm": 4.5625, "learning_rate": 3.1545846335842843e-07, "loss": 0.76644807, "memory(GiB)": 138.1, "step": 76270, "train_speed(iter/s)": 0.20084 }, { "acc": 0.78241735, "epoch": 1.7796701614196766, "grad_norm": 5.59375, "learning_rate": 3.1479841114536334e-07, "loss": 0.7895534, "memory(GiB)": 138.1, "step": 76280, "train_speed(iter/s)": 0.200854 }, { "acc": 0.76458092, "epoch": 1.7799034689919655, "grad_norm": 5.59375, "learning_rate": 3.1413902774346305e-07, "loss": 0.85796795, "memory(GiB)": 138.1, "step": 76290, "train_speed(iter/s)": 0.200868 }, { "acc": 0.7956975, "epoch": 1.7801367765642544, "grad_norm": 6.59375, "learning_rate": 3.134803132468561e-07, "loss": 0.73320594, "memory(GiB)": 138.1, "step": 76300, "train_speed(iter/s)": 0.200882 }, { "acc": 0.80051413, "epoch": 1.7803700841365433, "grad_norm": 5.4375, "learning_rate": 3.128222677495729e-07, "loss": 0.71121464, "memory(GiB)": 138.1, "step": 76310, "train_speed(iter/s)": 0.200896 }, { "acc": 0.78982344, "epoch": 1.7806033917088322, "grad_norm": 5.9375, "learning_rate": 3.1216489134554886e-07, "loss": 0.76041822, "memory(GiB)": 138.1, "step": 76320, "train_speed(iter/s)": 0.200909 }, { "acc": 0.78644514, "epoch": 1.780836699281121, "grad_norm": 5.5625, "learning_rate": 3.115081841286255e-07, "loss": 0.76248388, "memory(GiB)": 138.1, "step": 76330, "train_speed(iter/s)": 0.200923 }, { "acc": 0.78216338, "epoch": 1.78107000685341, "grad_norm": 5.21875, "learning_rate": 3.108521461925457e-07, "loss": 0.80187321, "memory(GiB)": 138.1, "step": 76340, "train_speed(iter/s)": 0.200937 }, { "acc": 0.75998535, "epoch": 1.7813033144256987, "grad_norm": 5.125, "learning_rate": 3.101967776309617e-07, "loss": 0.8771349, "memory(GiB)": 138.1, "step": 76350, "train_speed(iter/s)": 0.200951 }, { "acc": 0.80899925, "epoch": 1.7815366219979878, "grad_norm": 5.0625, "learning_rate": 3.095420785374237e-07, "loss": 0.66126566, "memory(GiB)": 138.1, "step": 76360, "train_speed(iter/s)": 0.200965 }, { "acc": 0.76196551, "epoch": 1.7817699295702765, "grad_norm": 6.78125, "learning_rate": 3.088880490053925e-07, "loss": 0.87400131, "memory(GiB)": 138.1, "step": 76370, "train_speed(iter/s)": 0.200978 }, { "acc": 0.76612358, "epoch": 1.7820032371425656, "grad_norm": 6.21875, "learning_rate": 3.0823468912822895e-07, "loss": 0.84581919, "memory(GiB)": 138.1, "step": 76380, "train_speed(iter/s)": 0.200992 }, { "acc": 0.78834362, "epoch": 1.7822365447148543, "grad_norm": 11.0, "learning_rate": 3.0758199899920014e-07, "loss": 0.74992781, "memory(GiB)": 138.1, "step": 76390, "train_speed(iter/s)": 0.201005 }, { "acc": 0.77249632, "epoch": 1.7824698522871434, "grad_norm": 4.71875, "learning_rate": 3.069299787114782e-07, "loss": 0.83343058, "memory(GiB)": 138.1, "step": 76400, "train_speed(iter/s)": 0.201019 }, { "acc": 0.78593674, "epoch": 1.782703159859432, "grad_norm": 4.46875, "learning_rate": 3.0627862835813814e-07, "loss": 0.75706806, "memory(GiB)": 138.1, "step": 76410, "train_speed(iter/s)": 0.201033 }, { "acc": 0.77988739, "epoch": 1.7829364674317212, "grad_norm": 4.8125, "learning_rate": 3.0562794803216114e-07, "loss": 0.77060046, "memory(GiB)": 138.1, "step": 76420, "train_speed(iter/s)": 0.201045 }, { "acc": 0.80068836, "epoch": 1.7831697750040099, "grad_norm": 6.96875, "learning_rate": 3.0497793782642946e-07, "loss": 0.69606977, "memory(GiB)": 138.1, "step": 76430, "train_speed(iter/s)": 0.201059 }, { "acc": 0.80669575, "epoch": 1.783403082576299, "grad_norm": 4.6875, "learning_rate": 3.043285978337346e-07, "loss": 0.69010477, "memory(GiB)": 138.1, "step": 76440, "train_speed(iter/s)": 0.201073 }, { "acc": 0.78510504, "epoch": 1.7836363901485877, "grad_norm": 6.0, "learning_rate": 3.036799281467678e-07, "loss": 0.75290413, "memory(GiB)": 138.1, "step": 76450, "train_speed(iter/s)": 0.201086 }, { "acc": 0.78098297, "epoch": 1.7838696977208768, "grad_norm": 4.96875, "learning_rate": 3.0303192885812737e-07, "loss": 0.77756319, "memory(GiB)": 138.1, "step": 76460, "train_speed(iter/s)": 0.2011 }, { "acc": 0.79312925, "epoch": 1.7841030052931655, "grad_norm": 4.8125, "learning_rate": 3.023846000603148e-07, "loss": 0.733179, "memory(GiB)": 138.1, "step": 76470, "train_speed(iter/s)": 0.201115 }, { "acc": 0.79176178, "epoch": 1.7843363128654546, "grad_norm": 5.375, "learning_rate": 3.0173794184573444e-07, "loss": 0.75639806, "memory(GiB)": 138.1, "step": 76480, "train_speed(iter/s)": 0.201129 }, { "acc": 0.78836441, "epoch": 1.7845696204377433, "grad_norm": 5.28125, "learning_rate": 3.0109195430669925e-07, "loss": 0.76128173, "memory(GiB)": 138.1, "step": 76490, "train_speed(iter/s)": 0.201142 }, { "acc": 0.78055067, "epoch": 1.7848029280100324, "grad_norm": 5.78125, "learning_rate": 3.00446637535422e-07, "loss": 0.79550552, "memory(GiB)": 138.1, "step": 76500, "train_speed(iter/s)": 0.201156 }, { "epoch": 1.7848029280100324, "eval_acc": 0.7446987054420694, "eval_loss": 0.8044025897979736, "eval_runtime": 1269.883, "eval_samples_per_second": 28.342, "eval_steps_per_second": 14.171, "step": 76500 }, { "acc": 0.78949766, "epoch": 1.785036235582321, "grad_norm": 4.96875, "learning_rate": 2.9980199162402245e-07, "loss": 0.77280111, "memory(GiB)": 138.1, "step": 76510, "train_speed(iter/s)": 0.200486 }, { "acc": 0.7705359, "epoch": 1.7852695431546102, "grad_norm": 5.9375, "learning_rate": 2.9915801666452307e-07, "loss": 0.82575054, "memory(GiB)": 138.1, "step": 76520, "train_speed(iter/s)": 0.200499 }, { "acc": 0.79377584, "epoch": 1.7855028507268988, "grad_norm": 5.96875, "learning_rate": 2.985147127488508e-07, "loss": 0.73605633, "memory(GiB)": 138.1, "step": 76530, "train_speed(iter/s)": 0.200512 }, { "acc": 0.77772698, "epoch": 1.7857361582991877, "grad_norm": 6.75, "learning_rate": 2.978720799688378e-07, "loss": 0.77994833, "memory(GiB)": 138.1, "step": 76540, "train_speed(iter/s)": 0.200526 }, { "acc": 0.78651562, "epoch": 1.7859694658714766, "grad_norm": 7.78125, "learning_rate": 2.9723011841621905e-07, "loss": 0.74728775, "memory(GiB)": 138.1, "step": 76550, "train_speed(iter/s)": 0.200539 }, { "acc": 0.76202259, "epoch": 1.7862027734437655, "grad_norm": 8.3125, "learning_rate": 2.965888281826357e-07, "loss": 0.84415607, "memory(GiB)": 138.1, "step": 76560, "train_speed(iter/s)": 0.200552 }, { "acc": 0.77949867, "epoch": 1.7864360810160544, "grad_norm": 4.3125, "learning_rate": 2.9594820935963e-07, "loss": 0.76835766, "memory(GiB)": 138.1, "step": 76570, "train_speed(iter/s)": 0.200565 }, { "acc": 0.78318701, "epoch": 1.7866693885883433, "grad_norm": 4.03125, "learning_rate": 2.953082620386516e-07, "loss": 0.77278767, "memory(GiB)": 138.1, "step": 76580, "train_speed(iter/s)": 0.200579 }, { "acc": 0.78765154, "epoch": 1.7869026961606322, "grad_norm": 7.375, "learning_rate": 2.946689863110508e-07, "loss": 0.74817243, "memory(GiB)": 138.1, "step": 76590, "train_speed(iter/s)": 0.200593 }, { "acc": 0.79166756, "epoch": 1.7871360037329211, "grad_norm": 4.875, "learning_rate": 2.9403038226808625e-07, "loss": 0.74761534, "memory(GiB)": 138.1, "step": 76600, "train_speed(iter/s)": 0.200606 }, { "acc": 0.77928109, "epoch": 1.78736931130521, "grad_norm": 5.9375, "learning_rate": 2.9339245000091776e-07, "loss": 0.80569696, "memory(GiB)": 138.1, "step": 76610, "train_speed(iter/s)": 0.20062 }, { "acc": 0.78152971, "epoch": 1.787602618877499, "grad_norm": 5.90625, "learning_rate": 2.9275518960060867e-07, "loss": 0.78764749, "memory(GiB)": 138.1, "step": 76620, "train_speed(iter/s)": 0.200633 }, { "acc": 0.78837299, "epoch": 1.7878359264497878, "grad_norm": 6.40625, "learning_rate": 2.9211860115813005e-07, "loss": 0.7382781, "memory(GiB)": 138.1, "step": 76630, "train_speed(iter/s)": 0.200647 }, { "acc": 0.77095346, "epoch": 1.7880692340220767, "grad_norm": 5.0, "learning_rate": 2.9148268476435206e-07, "loss": 0.82100334, "memory(GiB)": 138.1, "step": 76640, "train_speed(iter/s)": 0.200661 }, { "acc": 0.77619829, "epoch": 1.7883025415943656, "grad_norm": 7.4375, "learning_rate": 2.908474405100542e-07, "loss": 0.82527828, "memory(GiB)": 138.1, "step": 76650, "train_speed(iter/s)": 0.200675 }, { "acc": 0.77772455, "epoch": 1.7885358491666545, "grad_norm": 3.921875, "learning_rate": 2.9021286848591626e-07, "loss": 0.79985743, "memory(GiB)": 138.1, "step": 76660, "train_speed(iter/s)": 0.200688 }, { "acc": 0.78497052, "epoch": 1.7887691567389434, "grad_norm": 6.3125, "learning_rate": 2.895789687825218e-07, "loss": 0.77265472, "memory(GiB)": 138.1, "step": 76670, "train_speed(iter/s)": 0.200702 }, { "acc": 0.78477559, "epoch": 1.7890024643112323, "grad_norm": 5.6875, "learning_rate": 2.8894574149036237e-07, "loss": 0.77249074, "memory(GiB)": 138.1, "step": 76680, "train_speed(iter/s)": 0.200715 }, { "acc": 0.78217597, "epoch": 1.7892357718835212, "grad_norm": 5.3125, "learning_rate": 2.8831318669982956e-07, "loss": 0.78129129, "memory(GiB)": 138.1, "step": 76690, "train_speed(iter/s)": 0.200728 }, { "acc": 0.78776398, "epoch": 1.7894690794558101, "grad_norm": 4.5, "learning_rate": 2.876813045012211e-07, "loss": 0.76364908, "memory(GiB)": 138.1, "step": 76700, "train_speed(iter/s)": 0.200742 }, { "acc": 0.78187065, "epoch": 1.789702387028099, "grad_norm": 6.8125, "learning_rate": 2.8705009498473604e-07, "loss": 0.77821112, "memory(GiB)": 138.1, "step": 76710, "train_speed(iter/s)": 0.200756 }, { "acc": 0.78065195, "epoch": 1.789935694600388, "grad_norm": 12.5625, "learning_rate": 2.8641955824048216e-07, "loss": 0.78781352, "memory(GiB)": 138.1, "step": 76720, "train_speed(iter/s)": 0.20077 }, { "acc": 0.75964537, "epoch": 1.7901690021726768, "grad_norm": 5.03125, "learning_rate": 2.857896943584665e-07, "loss": 0.86047745, "memory(GiB)": 138.1, "step": 76730, "train_speed(iter/s)": 0.200784 }, { "acc": 0.76801662, "epoch": 1.7904023097449655, "grad_norm": 7.28125, "learning_rate": 2.85160503428602e-07, "loss": 0.84608536, "memory(GiB)": 138.1, "step": 76740, "train_speed(iter/s)": 0.200797 }, { "acc": 0.78553681, "epoch": 1.7906356173172546, "grad_norm": 5.1875, "learning_rate": 2.8453198554070694e-07, "loss": 0.78472672, "memory(GiB)": 138.1, "step": 76750, "train_speed(iter/s)": 0.20081 }, { "acc": 0.7960628, "epoch": 1.7908689248895433, "grad_norm": 5.34375, "learning_rate": 2.8390414078450003e-07, "loss": 0.71955853, "memory(GiB)": 138.1, "step": 76760, "train_speed(iter/s)": 0.200824 }, { "acc": 0.78666868, "epoch": 1.7911022324618324, "grad_norm": 13.3125, "learning_rate": 2.8327696924960737e-07, "loss": 0.76741972, "memory(GiB)": 138.1, "step": 76770, "train_speed(iter/s)": 0.200837 }, { "acc": 0.7942173, "epoch": 1.791335540034121, "grad_norm": 5.59375, "learning_rate": 2.8265047102555733e-07, "loss": 0.75203381, "memory(GiB)": 138.1, "step": 76780, "train_speed(iter/s)": 0.20085 }, { "acc": 0.79992652, "epoch": 1.7915688476064102, "grad_norm": 6.03125, "learning_rate": 2.8202464620178225e-07, "loss": 0.70470304, "memory(GiB)": 138.1, "step": 76790, "train_speed(iter/s)": 0.200864 }, { "acc": 0.80589857, "epoch": 1.791802155178699, "grad_norm": 4.15625, "learning_rate": 2.8139949486761853e-07, "loss": 0.67734222, "memory(GiB)": 138.1, "step": 76800, "train_speed(iter/s)": 0.200877 }, { "acc": 0.78081474, "epoch": 1.792035462750988, "grad_norm": 5.9375, "learning_rate": 2.807750171123058e-07, "loss": 0.78283153, "memory(GiB)": 138.1, "step": 76810, "train_speed(iter/s)": 0.200891 }, { "acc": 0.78747606, "epoch": 1.7922687703232767, "grad_norm": 6.5625, "learning_rate": 2.8015121302498894e-07, "loss": 0.75321217, "memory(GiB)": 138.1, "step": 76820, "train_speed(iter/s)": 0.200905 }, { "acc": 0.78687334, "epoch": 1.7925020778955658, "grad_norm": 3.875, "learning_rate": 2.7952808269471445e-07, "loss": 0.75753322, "memory(GiB)": 138.1, "step": 76830, "train_speed(iter/s)": 0.200918 }, { "acc": 0.77859888, "epoch": 1.7927353854678545, "grad_norm": 6.28125, "learning_rate": 2.7890562621043503e-07, "loss": 0.7866539, "memory(GiB)": 138.1, "step": 76840, "train_speed(iter/s)": 0.200932 }, { "acc": 0.78868618, "epoch": 1.7929686930401436, "grad_norm": 4.1875, "learning_rate": 2.782838436610058e-07, "loss": 0.75083804, "memory(GiB)": 138.1, "step": 76850, "train_speed(iter/s)": 0.200945 }, { "acc": 0.76023216, "epoch": 1.7932020006124323, "grad_norm": 6.53125, "learning_rate": 2.776627351351868e-07, "loss": 0.87442131, "memory(GiB)": 138.1, "step": 76860, "train_speed(iter/s)": 0.200959 }, { "acc": 0.7890811, "epoch": 1.7934353081847214, "grad_norm": 4.96875, "learning_rate": 2.770423007216411e-07, "loss": 0.75472221, "memory(GiB)": 138.1, "step": 76870, "train_speed(iter/s)": 0.200973 }, { "acc": 0.77964153, "epoch": 1.79366861575701, "grad_norm": 5.875, "learning_rate": 2.764225405089332e-07, "loss": 0.80402193, "memory(GiB)": 138.1, "step": 76880, "train_speed(iter/s)": 0.200986 }, { "acc": 0.76858959, "epoch": 1.7939019233292992, "grad_norm": 5.3125, "learning_rate": 2.7580345458553705e-07, "loss": 0.84677277, "memory(GiB)": 138.1, "step": 76890, "train_speed(iter/s)": 0.201 }, { "acc": 0.78171978, "epoch": 1.7941352309015879, "grad_norm": 6.0625, "learning_rate": 2.75185043039824e-07, "loss": 0.78139849, "memory(GiB)": 138.1, "step": 76900, "train_speed(iter/s)": 0.201014 }, { "acc": 0.76607041, "epoch": 1.794368538473877, "grad_norm": 4.9375, "learning_rate": 2.7456730596007454e-07, "loss": 0.83766766, "memory(GiB)": 138.1, "step": 76910, "train_speed(iter/s)": 0.201027 }, { "acc": 0.79166379, "epoch": 1.7946018460461657, "grad_norm": 4.75, "learning_rate": 2.739502434344693e-07, "loss": 0.73072734, "memory(GiB)": 138.1, "step": 76920, "train_speed(iter/s)": 0.201041 }, { "acc": 0.7861433, "epoch": 1.7948351536184546, "grad_norm": 5.5, "learning_rate": 2.733338555510939e-07, "loss": 0.77780027, "memory(GiB)": 138.1, "step": 76930, "train_speed(iter/s)": 0.201054 }, { "acc": 0.78073063, "epoch": 1.7950684611907435, "grad_norm": 4.625, "learning_rate": 2.7271814239793693e-07, "loss": 0.77235231, "memory(GiB)": 138.1, "step": 76940, "train_speed(iter/s)": 0.201067 }, { "acc": 0.79449263, "epoch": 1.7953017687630324, "grad_norm": 6.34375, "learning_rate": 2.721031040628924e-07, "loss": 0.74123163, "memory(GiB)": 138.1, "step": 76950, "train_speed(iter/s)": 0.20108 }, { "acc": 0.77358427, "epoch": 1.7955350763353213, "grad_norm": 6.84375, "learning_rate": 2.714887406337563e-07, "loss": 0.8185647, "memory(GiB)": 138.1, "step": 76960, "train_speed(iter/s)": 0.201093 }, { "acc": 0.77752705, "epoch": 1.7957683839076102, "grad_norm": 5.125, "learning_rate": 2.708750521982284e-07, "loss": 0.80426521, "memory(GiB)": 138.1, "step": 76970, "train_speed(iter/s)": 0.201107 }, { "acc": 0.7755044, "epoch": 1.796001691479899, "grad_norm": 4.78125, "learning_rate": 2.7026203884391313e-07, "loss": 0.81011715, "memory(GiB)": 138.1, "step": 76980, "train_speed(iter/s)": 0.201121 }, { "acc": 0.78045125, "epoch": 1.796234999052188, "grad_norm": 5.21875, "learning_rate": 2.696497006583176e-07, "loss": 0.79744205, "memory(GiB)": 138.1, "step": 76990, "train_speed(iter/s)": 0.201135 }, { "acc": 0.77946234, "epoch": 1.7964683066244769, "grad_norm": 5.09375, "learning_rate": 2.6903803772885375e-07, "loss": 0.79059172, "memory(GiB)": 138.1, "step": 77000, "train_speed(iter/s)": 0.201149 }, { "epoch": 1.7964683066244769, "eval_acc": 0.7447200046249655, "eval_loss": 0.8044254779815674, "eval_runtime": 1271.9514, "eval_samples_per_second": 28.296, "eval_steps_per_second": 14.148, "step": 77000 }, { "acc": 0.78546667, "epoch": 1.7967016141967658, "grad_norm": 6.96875, "learning_rate": 2.6842705014283545e-07, "loss": 0.76823864, "memory(GiB)": 138.1, "step": 77010, "train_speed(iter/s)": 0.200482 }, { "acc": 0.77572174, "epoch": 1.7969349217690547, "grad_norm": 4.21875, "learning_rate": 2.6781673798748074e-07, "loss": 0.8007781, "memory(GiB)": 138.1, "step": 77020, "train_speed(iter/s)": 0.200497 }, { "acc": 0.77363772, "epoch": 1.7971682293413436, "grad_norm": 6.09375, "learning_rate": 2.672071013499122e-07, "loss": 0.82122746, "memory(GiB)": 138.1, "step": 77030, "train_speed(iter/s)": 0.200509 }, { "acc": 0.76963663, "epoch": 1.7974015369136325, "grad_norm": 5.40625, "learning_rate": 2.665981403171558e-07, "loss": 0.84492531, "memory(GiB)": 138.1, "step": 77040, "train_speed(iter/s)": 0.200522 }, { "acc": 0.78084383, "epoch": 1.7976348444859214, "grad_norm": 23.875, "learning_rate": 2.6598985497613915e-07, "loss": 0.7569222, "memory(GiB)": 138.1, "step": 77050, "train_speed(iter/s)": 0.200536 }, { "acc": 0.77769785, "epoch": 1.7978681520582103, "grad_norm": 5.46875, "learning_rate": 2.653822454136951e-07, "loss": 0.78896704, "memory(GiB)": 138.1, "step": 77060, "train_speed(iter/s)": 0.20055 }, { "acc": 0.80396709, "epoch": 1.7981014596304992, "grad_norm": 4.40625, "learning_rate": 2.647753117165608e-07, "loss": 0.6958436, "memory(GiB)": 138.1, "step": 77070, "train_speed(iter/s)": 0.200564 }, { "acc": 0.79132442, "epoch": 1.798334767202788, "grad_norm": 3.875, "learning_rate": 2.641690539713743e-07, "loss": 0.75024261, "memory(GiB)": 138.1, "step": 77080, "train_speed(iter/s)": 0.200577 }, { "acc": 0.78543453, "epoch": 1.798568074775077, "grad_norm": 5.8125, "learning_rate": 2.635634722646807e-07, "loss": 0.77864475, "memory(GiB)": 138.1, "step": 77090, "train_speed(iter/s)": 0.200591 }, { "acc": 0.77233229, "epoch": 1.7988013823473659, "grad_norm": 4.65625, "learning_rate": 2.6295856668292487e-07, "loss": 0.81739025, "memory(GiB)": 138.1, "step": 77100, "train_speed(iter/s)": 0.200605 }, { "acc": 0.76904373, "epoch": 1.7990346899196545, "grad_norm": 5.65625, "learning_rate": 2.623543373124571e-07, "loss": 0.81224604, "memory(GiB)": 138.1, "step": 77110, "train_speed(iter/s)": 0.200618 }, { "acc": 0.78083372, "epoch": 1.7992679974919437, "grad_norm": 5.90625, "learning_rate": 2.6175078423953225e-07, "loss": 0.79091768, "memory(GiB)": 138.1, "step": 77120, "train_speed(iter/s)": 0.200631 }, { "acc": 0.80353069, "epoch": 1.7995013050642323, "grad_norm": 5.625, "learning_rate": 2.6114790755030593e-07, "loss": 0.72558646, "memory(GiB)": 138.1, "step": 77130, "train_speed(iter/s)": 0.200644 }, { "acc": 0.77595344, "epoch": 1.7997346126365215, "grad_norm": 6.28125, "learning_rate": 2.605457073308393e-07, "loss": 0.79515977, "memory(GiB)": 138.1, "step": 77140, "train_speed(iter/s)": 0.200658 }, { "acc": 0.7906909, "epoch": 1.7999679202088101, "grad_norm": 6.53125, "learning_rate": 2.5994418366709686e-07, "loss": 0.75885439, "memory(GiB)": 138.1, "step": 77150, "train_speed(iter/s)": 0.200672 }, { "acc": 0.76935129, "epoch": 1.8002012277810993, "grad_norm": 6.25, "learning_rate": 2.5934333664494436e-07, "loss": 0.84422007, "memory(GiB)": 138.1, "step": 77160, "train_speed(iter/s)": 0.200685 }, { "acc": 0.77682958, "epoch": 1.800434535353388, "grad_norm": 5.28125, "learning_rate": 2.5874316635015383e-07, "loss": 0.82697449, "memory(GiB)": 138.1, "step": 77170, "train_speed(iter/s)": 0.200699 }, { "acc": 0.78309669, "epoch": 1.800667842925677, "grad_norm": 6.25, "learning_rate": 2.581436728683984e-07, "loss": 0.76987929, "memory(GiB)": 138.1, "step": 77180, "train_speed(iter/s)": 0.200712 }, { "acc": 0.79188871, "epoch": 1.8009011504979657, "grad_norm": 6.625, "learning_rate": 2.575448562852567e-07, "loss": 0.76328211, "memory(GiB)": 138.1, "step": 77190, "train_speed(iter/s)": 0.200726 }, { "acc": 0.7809392, "epoch": 1.8011344580702549, "grad_norm": 6.65625, "learning_rate": 2.569467166862083e-07, "loss": 0.79415689, "memory(GiB)": 138.1, "step": 77200, "train_speed(iter/s)": 0.200738 }, { "acc": 0.77853994, "epoch": 1.8013677656425435, "grad_norm": 4.875, "learning_rate": 2.563492541566387e-07, "loss": 0.80777569, "memory(GiB)": 138.1, "step": 77210, "train_speed(iter/s)": 0.200752 }, { "acc": 0.78512239, "epoch": 1.8016010732148326, "grad_norm": 5.09375, "learning_rate": 2.557524687818347e-07, "loss": 0.76266022, "memory(GiB)": 138.1, "step": 77220, "train_speed(iter/s)": 0.200764 }, { "acc": 0.78263688, "epoch": 1.8018343807871213, "grad_norm": 6.0625, "learning_rate": 2.551563606469881e-07, "loss": 0.7762639, "memory(GiB)": 138.1, "step": 77230, "train_speed(iter/s)": 0.200777 }, { "acc": 0.79786873, "epoch": 1.8020676883594104, "grad_norm": 5.09375, "learning_rate": 2.545609298371926e-07, "loss": 0.72444544, "memory(GiB)": 138.1, "step": 77240, "train_speed(iter/s)": 0.20079 }, { "acc": 0.78850036, "epoch": 1.8023009959316991, "grad_norm": 4.3125, "learning_rate": 2.539661764374457e-07, "loss": 0.75326676, "memory(GiB)": 138.1, "step": 77250, "train_speed(iter/s)": 0.200803 }, { "acc": 0.77271185, "epoch": 1.8025343035039882, "grad_norm": 4.65625, "learning_rate": 2.5337210053264893e-07, "loss": 0.80799408, "memory(GiB)": 138.1, "step": 77260, "train_speed(iter/s)": 0.200816 }, { "acc": 0.76806946, "epoch": 1.802767611076277, "grad_norm": 5.59375, "learning_rate": 2.5277870220760504e-07, "loss": 0.85381813, "memory(GiB)": 138.1, "step": 77270, "train_speed(iter/s)": 0.20083 }, { "acc": 0.77597227, "epoch": 1.803000918648566, "grad_norm": 7.71875, "learning_rate": 2.521859815470229e-07, "loss": 0.81069126, "memory(GiB)": 138.1, "step": 77280, "train_speed(iter/s)": 0.200844 }, { "acc": 0.785149, "epoch": 1.8032342262208547, "grad_norm": 7.34375, "learning_rate": 2.515939386355121e-07, "loss": 0.76063423, "memory(GiB)": 138.1, "step": 77290, "train_speed(iter/s)": 0.200857 }, { "acc": 0.77492638, "epoch": 1.8034675337931438, "grad_norm": 4.59375, "learning_rate": 2.5100257355758715e-07, "loss": 0.78454142, "memory(GiB)": 138.1, "step": 77300, "train_speed(iter/s)": 0.200871 }, { "acc": 0.77450662, "epoch": 1.8037008413654325, "grad_norm": 4.96875, "learning_rate": 2.5041188639766624e-07, "loss": 0.78545609, "memory(GiB)": 138.1, "step": 77310, "train_speed(iter/s)": 0.200885 }, { "acc": 0.77065268, "epoch": 1.8039341489377214, "grad_norm": 4.5625, "learning_rate": 2.498218772400673e-07, "loss": 0.83770046, "memory(GiB)": 138.1, "step": 77320, "train_speed(iter/s)": 0.200898 }, { "acc": 0.77967792, "epoch": 1.8041674565100103, "grad_norm": 10.25, "learning_rate": 2.4923254616901646e-07, "loss": 0.80178738, "memory(GiB)": 138.1, "step": 77330, "train_speed(iter/s)": 0.200912 }, { "acc": 0.74775844, "epoch": 1.8044007640822992, "grad_norm": 5.40625, "learning_rate": 2.486438932686386e-07, "loss": 0.90301533, "memory(GiB)": 138.1, "step": 77340, "train_speed(iter/s)": 0.200926 }, { "acc": 0.7848135, "epoch": 1.8046340716545881, "grad_norm": 10.9375, "learning_rate": 2.4805591862296587e-07, "loss": 0.75431414, "memory(GiB)": 138.1, "step": 77350, "train_speed(iter/s)": 0.20094 }, { "acc": 0.78480129, "epoch": 1.804867379226877, "grad_norm": 5.21875, "learning_rate": 2.4746862231593006e-07, "loss": 0.76673164, "memory(GiB)": 138.1, "step": 77360, "train_speed(iter/s)": 0.200953 }, { "acc": 0.77168055, "epoch": 1.805100686799166, "grad_norm": 6.15625, "learning_rate": 2.468820044313669e-07, "loss": 0.83345318, "memory(GiB)": 138.1, "step": 77370, "train_speed(iter/s)": 0.200967 }, { "acc": 0.77629547, "epoch": 1.8053339943714548, "grad_norm": 4.65625, "learning_rate": 2.4629606505301775e-07, "loss": 0.81766853, "memory(GiB)": 138.1, "step": 77380, "train_speed(iter/s)": 0.20098 }, { "acc": 0.80229378, "epoch": 1.8055673019437437, "grad_norm": 4.25, "learning_rate": 2.457108042645245e-07, "loss": 0.71003008, "memory(GiB)": 138.1, "step": 77390, "train_speed(iter/s)": 0.200993 }, { "acc": 0.76214561, "epoch": 1.8058006095160326, "grad_norm": 5.4375, "learning_rate": 2.4512622214943274e-07, "loss": 0.87743969, "memory(GiB)": 138.1, "step": 77400, "train_speed(iter/s)": 0.201007 }, { "acc": 0.78265681, "epoch": 1.8060339170883215, "grad_norm": 4.21875, "learning_rate": 2.445423187911905e-07, "loss": 0.78319178, "memory(GiB)": 138.1, "step": 77410, "train_speed(iter/s)": 0.201019 }, { "acc": 0.77975445, "epoch": 1.8062672246606104, "grad_norm": 6.09375, "learning_rate": 2.439590942731518e-07, "loss": 0.79706392, "memory(GiB)": 138.1, "step": 77420, "train_speed(iter/s)": 0.201033 }, { "acc": 0.80120268, "epoch": 1.8065005322328993, "grad_norm": 5.5, "learning_rate": 2.433765486785694e-07, "loss": 0.70945711, "memory(GiB)": 138.1, "step": 77430, "train_speed(iter/s)": 0.201046 }, { "acc": 0.76658869, "epoch": 1.8067338398051882, "grad_norm": 4.1875, "learning_rate": 2.427946820906041e-07, "loss": 0.84224901, "memory(GiB)": 138.1, "step": 77440, "train_speed(iter/s)": 0.201059 }, { "acc": 0.78487082, "epoch": 1.806967147377477, "grad_norm": 7.28125, "learning_rate": 2.42213494592316e-07, "loss": 0.76289835, "memory(GiB)": 138.1, "step": 77450, "train_speed(iter/s)": 0.201072 }, { "acc": 0.78675003, "epoch": 1.807200454949766, "grad_norm": 5.84375, "learning_rate": 2.4163298626666885e-07, "loss": 0.76830788, "memory(GiB)": 138.1, "step": 77460, "train_speed(iter/s)": 0.201085 }, { "acc": 0.79696965, "epoch": 1.807433762522055, "grad_norm": 5.125, "learning_rate": 2.410531571965308e-07, "loss": 0.73548222, "memory(GiB)": 138.1, "step": 77470, "train_speed(iter/s)": 0.201099 }, { "acc": 0.76947789, "epoch": 1.8076670700943438, "grad_norm": 5.34375, "learning_rate": 2.4047400746467165e-07, "loss": 0.84600887, "memory(GiB)": 138.1, "step": 77480, "train_speed(iter/s)": 0.201112 }, { "acc": 0.78702273, "epoch": 1.8079003776666327, "grad_norm": 4.0625, "learning_rate": 2.398955371537665e-07, "loss": 0.76369286, "memory(GiB)": 138.1, "step": 77490, "train_speed(iter/s)": 0.201126 }, { "acc": 0.7907228, "epoch": 1.8081336852389214, "grad_norm": 7.03125, "learning_rate": 2.393177463463897e-07, "loss": 0.76782665, "memory(GiB)": 138.1, "step": 77500, "train_speed(iter/s)": 0.201137 }, { "epoch": 1.8081336852389214, "eval_acc": 0.7447478697213858, "eval_loss": 0.8044267296791077, "eval_runtime": 1270.6205, "eval_samples_per_second": 28.326, "eval_steps_per_second": 14.163, "step": 77500 }, { "acc": 0.79640326, "epoch": 1.8083669928112105, "grad_norm": 12.0625, "learning_rate": 2.387406351250221e-07, "loss": 0.71954288, "memory(GiB)": 138.1, "step": 77510, "train_speed(iter/s)": 0.200476 }, { "acc": 0.78035679, "epoch": 1.8086003003834992, "grad_norm": 9.3125, "learning_rate": 2.3816420357204495e-07, "loss": 0.79402161, "memory(GiB)": 138.1, "step": 77520, "train_speed(iter/s)": 0.200489 }, { "acc": 0.80683222, "epoch": 1.8088336079557883, "grad_norm": 4.125, "learning_rate": 2.3758845176974465e-07, "loss": 0.70322952, "memory(GiB)": 138.1, "step": 77530, "train_speed(iter/s)": 0.200502 }, { "acc": 0.78853745, "epoch": 1.809066915528077, "grad_norm": 6.34375, "learning_rate": 2.3701337980030993e-07, "loss": 0.76086707, "memory(GiB)": 138.1, "step": 77540, "train_speed(iter/s)": 0.200515 }, { "acc": 0.77532892, "epoch": 1.809300223100366, "grad_norm": 5.6875, "learning_rate": 2.3643898774583064e-07, "loss": 0.8250165, "memory(GiB)": 138.1, "step": 77550, "train_speed(iter/s)": 0.200528 }, { "acc": 0.79044323, "epoch": 1.8095335306726548, "grad_norm": 4.625, "learning_rate": 2.3586527568830286e-07, "loss": 0.76854286, "memory(GiB)": 138.1, "step": 77560, "train_speed(iter/s)": 0.200543 }, { "acc": 0.76759367, "epoch": 1.8097668382449439, "grad_norm": 6.0625, "learning_rate": 2.3529224370962223e-07, "loss": 0.82133694, "memory(GiB)": 138.1, "step": 77570, "train_speed(iter/s)": 0.200557 }, { "acc": 0.79010715, "epoch": 1.8100001458172326, "grad_norm": 5.75, "learning_rate": 2.347198918915905e-07, "loss": 0.74337749, "memory(GiB)": 138.1, "step": 77580, "train_speed(iter/s)": 0.200571 }, { "acc": 0.78925729, "epoch": 1.8102334533895217, "grad_norm": 4.21875, "learning_rate": 2.3414822031590956e-07, "loss": 0.76391687, "memory(GiB)": 138.1, "step": 77590, "train_speed(iter/s)": 0.200584 }, { "acc": 0.78881116, "epoch": 1.8104667609618104, "grad_norm": 5.3125, "learning_rate": 2.3357722906418523e-07, "loss": 0.74917927, "memory(GiB)": 138.1, "step": 77600, "train_speed(iter/s)": 0.200597 }, { "acc": 0.77421007, "epoch": 1.8107000685340995, "grad_norm": 8.5, "learning_rate": 2.3300691821792788e-07, "loss": 0.81043053, "memory(GiB)": 138.1, "step": 77610, "train_speed(iter/s)": 0.20061 }, { "acc": 0.79148536, "epoch": 1.8109333761063882, "grad_norm": 6.21875, "learning_rate": 2.3243728785854737e-07, "loss": 0.7413094, "memory(GiB)": 138.1, "step": 77620, "train_speed(iter/s)": 0.200622 }, { "acc": 0.75671234, "epoch": 1.8111666836786773, "grad_norm": 5.15625, "learning_rate": 2.318683380673592e-07, "loss": 0.88378201, "memory(GiB)": 138.1, "step": 77630, "train_speed(iter/s)": 0.200636 }, { "acc": 0.77831712, "epoch": 1.811399991250966, "grad_norm": 6.625, "learning_rate": 2.313000689255801e-07, "loss": 0.779283, "memory(GiB)": 138.1, "step": 77640, "train_speed(iter/s)": 0.20065 }, { "acc": 0.77758551, "epoch": 1.811633298823255, "grad_norm": 5.40625, "learning_rate": 2.3073248051433127e-07, "loss": 0.79231982, "memory(GiB)": 138.1, "step": 77650, "train_speed(iter/s)": 0.200664 }, { "acc": 0.79954634, "epoch": 1.8118666063955438, "grad_norm": 8.3125, "learning_rate": 2.301655729146357e-07, "loss": 0.70971565, "memory(GiB)": 138.1, "step": 77660, "train_speed(iter/s)": 0.200677 }, { "acc": 0.79472017, "epoch": 1.8120999139678329, "grad_norm": 3.671875, "learning_rate": 2.295993462074181e-07, "loss": 0.72714438, "memory(GiB)": 138.1, "step": 77670, "train_speed(iter/s)": 0.200691 }, { "acc": 0.78361807, "epoch": 1.8123332215401216, "grad_norm": 9.3125, "learning_rate": 2.2903380047350876e-07, "loss": 0.76192293, "memory(GiB)": 138.1, "step": 77680, "train_speed(iter/s)": 0.200704 }, { "acc": 0.75392261, "epoch": 1.8125665291124105, "grad_norm": 5.75, "learning_rate": 2.284689357936376e-07, "loss": 0.89919205, "memory(GiB)": 138.1, "step": 77690, "train_speed(iter/s)": 0.200718 }, { "acc": 0.77323561, "epoch": 1.8127998366846994, "grad_norm": 7.1875, "learning_rate": 2.2790475224844067e-07, "loss": 0.82149096, "memory(GiB)": 138.1, "step": 77700, "train_speed(iter/s)": 0.200732 }, { "acc": 0.7900878, "epoch": 1.8130331442569882, "grad_norm": 5.84375, "learning_rate": 2.2734124991845352e-07, "loss": 0.74978971, "memory(GiB)": 138.1, "step": 77710, "train_speed(iter/s)": 0.200745 }, { "acc": 0.77167468, "epoch": 1.8132664518292771, "grad_norm": 6.09375, "learning_rate": 2.2677842888411738e-07, "loss": 0.83236217, "memory(GiB)": 138.1, "step": 77720, "train_speed(iter/s)": 0.200759 }, { "acc": 0.75530715, "epoch": 1.813499759401566, "grad_norm": 4.59375, "learning_rate": 2.26216289225773e-07, "loss": 0.8862648, "memory(GiB)": 138.1, "step": 77730, "train_speed(iter/s)": 0.200772 }, { "acc": 0.79051895, "epoch": 1.813733066973855, "grad_norm": 5.15625, "learning_rate": 2.256548310236667e-07, "loss": 0.74030609, "memory(GiB)": 138.1, "step": 77740, "train_speed(iter/s)": 0.200784 }, { "acc": 0.79147606, "epoch": 1.8139663745461438, "grad_norm": 4.6875, "learning_rate": 2.2509405435794662e-07, "loss": 0.72963748, "memory(GiB)": 138.1, "step": 77750, "train_speed(iter/s)": 0.200797 }, { "acc": 0.77560072, "epoch": 1.8141996821184327, "grad_norm": 6.15625, "learning_rate": 2.2453395930866262e-07, "loss": 0.8010334, "memory(GiB)": 138.1, "step": 77760, "train_speed(iter/s)": 0.20081 }, { "acc": 0.7697216, "epoch": 1.8144329896907216, "grad_norm": 5.25, "learning_rate": 2.2397454595576906e-07, "loss": 0.82412586, "memory(GiB)": 138.1, "step": 77770, "train_speed(iter/s)": 0.200822 }, { "acc": 0.79263906, "epoch": 1.8146662972630105, "grad_norm": 4.71875, "learning_rate": 2.2341581437912097e-07, "loss": 0.73385839, "memory(GiB)": 138.1, "step": 77780, "train_speed(iter/s)": 0.200837 }, { "acc": 0.77369003, "epoch": 1.8148996048352994, "grad_norm": 9.0625, "learning_rate": 2.2285776465847842e-07, "loss": 0.78555937, "memory(GiB)": 138.1, "step": 77790, "train_speed(iter/s)": 0.20085 }, { "acc": 0.81455336, "epoch": 1.8151329124075883, "grad_norm": 5.625, "learning_rate": 2.2230039687350212e-07, "loss": 0.65586133, "memory(GiB)": 138.1, "step": 77800, "train_speed(iter/s)": 0.200865 }, { "acc": 0.79386964, "epoch": 1.8153662199798772, "grad_norm": 5.46875, "learning_rate": 2.217437111037557e-07, "loss": 0.74161119, "memory(GiB)": 138.1, "step": 77810, "train_speed(iter/s)": 0.200878 }, { "acc": 0.784272, "epoch": 1.8155995275521661, "grad_norm": 5.90625, "learning_rate": 2.211877074287072e-07, "loss": 0.76136417, "memory(GiB)": 138.1, "step": 77820, "train_speed(iter/s)": 0.200892 }, { "acc": 0.78744574, "epoch": 1.815832835124455, "grad_norm": 7.90625, "learning_rate": 2.206323859277254e-07, "loss": 0.77874732, "memory(GiB)": 138.1, "step": 77830, "train_speed(iter/s)": 0.200905 }, { "acc": 0.78655195, "epoch": 1.816066142696744, "grad_norm": 4.03125, "learning_rate": 2.200777466800813e-07, "loss": 0.76195555, "memory(GiB)": 138.1, "step": 77840, "train_speed(iter/s)": 0.200918 }, { "acc": 0.77641582, "epoch": 1.8162994502690328, "grad_norm": 7.09375, "learning_rate": 2.1952378976495048e-07, "loss": 0.82479191, "memory(GiB)": 138.1, "step": 77850, "train_speed(iter/s)": 0.200932 }, { "acc": 0.78586106, "epoch": 1.8165327578413217, "grad_norm": 5.34375, "learning_rate": 2.1897051526141022e-07, "loss": 0.76681871, "memory(GiB)": 138.1, "step": 77860, "train_speed(iter/s)": 0.200945 }, { "acc": 0.78540492, "epoch": 1.8167660654136106, "grad_norm": 7.5625, "learning_rate": 2.1841792324843958e-07, "loss": 0.78315992, "memory(GiB)": 138.1, "step": 77870, "train_speed(iter/s)": 0.200959 }, { "acc": 0.78267908, "epoch": 1.8169993729858995, "grad_norm": 6.71875, "learning_rate": 2.1786601380492156e-07, "loss": 0.79527521, "memory(GiB)": 138.1, "step": 77880, "train_speed(iter/s)": 0.200972 }, { "acc": 0.78243675, "epoch": 1.8172326805581882, "grad_norm": 6.21875, "learning_rate": 2.1731478700964093e-07, "loss": 0.78953333, "memory(GiB)": 138.1, "step": 77890, "train_speed(iter/s)": 0.200986 }, { "acc": 0.77621989, "epoch": 1.8174659881304773, "grad_norm": 7.5625, "learning_rate": 2.1676424294128474e-07, "loss": 0.78211274, "memory(GiB)": 138.1, "step": 77900, "train_speed(iter/s)": 0.200999 }, { "acc": 0.79703307, "epoch": 1.817699295702766, "grad_norm": 6.5, "learning_rate": 2.16214381678444e-07, "loss": 0.70052767, "memory(GiB)": 138.1, "step": 77910, "train_speed(iter/s)": 0.201012 }, { "acc": 0.78100514, "epoch": 1.8179326032750551, "grad_norm": 6.1875, "learning_rate": 2.1566520329960928e-07, "loss": 0.76739492, "memory(GiB)": 138.1, "step": 77920, "train_speed(iter/s)": 0.201026 }, { "acc": 0.78988457, "epoch": 1.8181659108473438, "grad_norm": 6.75, "learning_rate": 2.1511670788317839e-07, "loss": 0.75334415, "memory(GiB)": 138.1, "step": 77930, "train_speed(iter/s)": 0.201039 }, { "acc": 0.79587126, "epoch": 1.818399218419633, "grad_norm": 6.25, "learning_rate": 2.1456889550744707e-07, "loss": 0.7218152, "memory(GiB)": 138.1, "step": 77940, "train_speed(iter/s)": 0.201052 }, { "acc": 0.76245584, "epoch": 1.8186325259919216, "grad_norm": 6.53125, "learning_rate": 2.1402176625061554e-07, "loss": 0.88262806, "memory(GiB)": 138.1, "step": 77950, "train_speed(iter/s)": 0.201066 }, { "acc": 0.77558794, "epoch": 1.8188658335642107, "grad_norm": 6.09375, "learning_rate": 2.1347532019078686e-07, "loss": 0.80598965, "memory(GiB)": 138.1, "step": 77960, "train_speed(iter/s)": 0.201079 }, { "acc": 0.77870779, "epoch": 1.8190991411364994, "grad_norm": 6.96875, "learning_rate": 2.1292955740596478e-07, "loss": 0.79289889, "memory(GiB)": 138.1, "step": 77970, "train_speed(iter/s)": 0.201093 }, { "acc": 0.77847414, "epoch": 1.8193324487087885, "grad_norm": 5.71875, "learning_rate": 2.123844779740586e-07, "loss": 0.80017395, "memory(GiB)": 138.1, "step": 77980, "train_speed(iter/s)": 0.201106 }, { "acc": 0.77644243, "epoch": 1.8195657562810772, "grad_norm": 8.125, "learning_rate": 2.118400819728772e-07, "loss": 0.80827723, "memory(GiB)": 138.1, "step": 77990, "train_speed(iter/s)": 0.20112 }, { "acc": 0.79161921, "epoch": 1.8197990638533663, "grad_norm": 8.25, "learning_rate": 2.1129636948013287e-07, "loss": 0.75301976, "memory(GiB)": 138.1, "step": 78000, "train_speed(iter/s)": 0.201134 }, { "epoch": 1.8197990638533663, "eval_acc": 0.7446887764996667, "eval_loss": 0.8044082522392273, "eval_runtime": 1271.6729, "eval_samples_per_second": 28.302, "eval_steps_per_second": 14.151, "step": 78000 }, { "acc": 0.78124561, "epoch": 1.820032371425655, "grad_norm": 5.125, "learning_rate": 2.1075334057344077e-07, "loss": 0.79524374, "memory(GiB)": 138.1, "step": 78010, "train_speed(iter/s)": 0.200477 }, { "acc": 0.80066538, "epoch": 1.8202656789979441, "grad_norm": 4.03125, "learning_rate": 2.102109953303183e-07, "loss": 0.70290136, "memory(GiB)": 138.1, "step": 78020, "train_speed(iter/s)": 0.20049 }, { "acc": 0.76704898, "epoch": 1.8204989865702328, "grad_norm": 7.71875, "learning_rate": 2.0966933382818465e-07, "loss": 0.86864662, "memory(GiB)": 138.1, "step": 78030, "train_speed(iter/s)": 0.200504 }, { "acc": 0.78443837, "epoch": 1.820732294142522, "grad_norm": 6.0, "learning_rate": 2.0912835614436132e-07, "loss": 0.78189793, "memory(GiB)": 138.1, "step": 78040, "train_speed(iter/s)": 0.200517 }, { "acc": 0.76247149, "epoch": 1.8209656017148106, "grad_norm": 5.9375, "learning_rate": 2.085880623560743e-07, "loss": 0.85706472, "memory(GiB)": 138.1, "step": 78050, "train_speed(iter/s)": 0.200531 }, { "acc": 0.79499025, "epoch": 1.8211989092870997, "grad_norm": 5.46875, "learning_rate": 2.080484525404497e-07, "loss": 0.73801851, "memory(GiB)": 138.1, "step": 78060, "train_speed(iter/s)": 0.200544 }, { "acc": 0.77147074, "epoch": 1.8214322168593884, "grad_norm": 4.8125, "learning_rate": 2.0750952677451585e-07, "loss": 0.82764015, "memory(GiB)": 138.1, "step": 78070, "train_speed(iter/s)": 0.200558 }, { "acc": 0.78612347, "epoch": 1.8216655244316773, "grad_norm": 8.4375, "learning_rate": 2.069712851352046e-07, "loss": 0.74784603, "memory(GiB)": 138.1, "step": 78080, "train_speed(iter/s)": 0.200572 }, { "acc": 0.79965754, "epoch": 1.8218988320039662, "grad_norm": 7.21875, "learning_rate": 2.0643372769935055e-07, "loss": 0.71098585, "memory(GiB)": 138.1, "step": 78090, "train_speed(iter/s)": 0.200585 }, { "acc": 0.79127426, "epoch": 1.822132139576255, "grad_norm": 8.9375, "learning_rate": 2.0589685454368957e-07, "loss": 0.75095186, "memory(GiB)": 138.1, "step": 78100, "train_speed(iter/s)": 0.200599 }, { "acc": 0.79673557, "epoch": 1.822365447148544, "grad_norm": 6.96875, "learning_rate": 2.0536066574485868e-07, "loss": 0.72113428, "memory(GiB)": 138.1, "step": 78110, "train_speed(iter/s)": 0.200612 }, { "acc": 0.79205379, "epoch": 1.8225987547208329, "grad_norm": 4.1875, "learning_rate": 2.0482516137940113e-07, "loss": 0.73350883, "memory(GiB)": 138.1, "step": 78120, "train_speed(iter/s)": 0.200625 }, { "acc": 0.76067524, "epoch": 1.8228320622931218, "grad_norm": 5.125, "learning_rate": 2.0429034152375793e-07, "loss": 0.87135782, "memory(GiB)": 138.1, "step": 78130, "train_speed(iter/s)": 0.200639 }, { "acc": 0.77361522, "epoch": 1.8230653698654107, "grad_norm": 4.78125, "learning_rate": 2.0375620625427584e-07, "loss": 0.79716196, "memory(GiB)": 138.1, "step": 78140, "train_speed(iter/s)": 0.200652 }, { "acc": 0.76916037, "epoch": 1.8232986774376996, "grad_norm": 4.46875, "learning_rate": 2.0322275564720163e-07, "loss": 0.81197529, "memory(GiB)": 138.1, "step": 78150, "train_speed(iter/s)": 0.200665 }, { "acc": 0.78346782, "epoch": 1.8235319850099885, "grad_norm": 6.6875, "learning_rate": 2.0268998977868603e-07, "loss": 0.79479795, "memory(GiB)": 138.1, "step": 78160, "train_speed(iter/s)": 0.200677 }, { "acc": 0.78680763, "epoch": 1.8237652925822774, "grad_norm": 6.25, "learning_rate": 2.0215790872478048e-07, "loss": 0.75535355, "memory(GiB)": 138.1, "step": 78170, "train_speed(iter/s)": 0.200691 }, { "acc": 0.78493023, "epoch": 1.8239986001545663, "grad_norm": 7.21875, "learning_rate": 2.016265125614403e-07, "loss": 0.79210768, "memory(GiB)": 138.1, "step": 78180, "train_speed(iter/s)": 0.200703 }, { "acc": 0.77242117, "epoch": 1.8242319077268552, "grad_norm": 5.4375, "learning_rate": 2.010958013645209e-07, "loss": 0.8096137, "memory(GiB)": 138.1, "step": 78190, "train_speed(iter/s)": 0.200716 }, { "acc": 0.79208431, "epoch": 1.824465215299144, "grad_norm": 4.40625, "learning_rate": 2.0056577520978171e-07, "loss": 0.7458868, "memory(GiB)": 138.1, "step": 78200, "train_speed(iter/s)": 0.200729 }, { "acc": 0.76901507, "epoch": 1.824698522871433, "grad_norm": 4.90625, "learning_rate": 2.0003643417288386e-07, "loss": 0.82966356, "memory(GiB)": 138.1, "step": 78210, "train_speed(iter/s)": 0.200743 }, { "acc": 0.7897049, "epoch": 1.8249318304437219, "grad_norm": 7.875, "learning_rate": 1.9950777832939029e-07, "loss": 0.74858961, "memory(GiB)": 138.1, "step": 78220, "train_speed(iter/s)": 0.200756 }, { "acc": 0.76055508, "epoch": 1.8251651380160108, "grad_norm": 5.96875, "learning_rate": 1.9897980775476778e-07, "loss": 0.87515984, "memory(GiB)": 138.1, "step": 78230, "train_speed(iter/s)": 0.20077 }, { "acc": 0.79980593, "epoch": 1.8253984455882997, "grad_norm": 6.59375, "learning_rate": 1.9845252252438274e-07, "loss": 0.70951848, "memory(GiB)": 138.1, "step": 78240, "train_speed(iter/s)": 0.200783 }, { "acc": 0.79019027, "epoch": 1.8256317531605886, "grad_norm": 5.4375, "learning_rate": 1.9792592271350443e-07, "loss": 0.73835411, "memory(GiB)": 138.1, "step": 78250, "train_speed(iter/s)": 0.200796 }, { "acc": 0.76858778, "epoch": 1.8258650607328775, "grad_norm": 5.375, "learning_rate": 1.9740000839730656e-07, "loss": 0.82827644, "memory(GiB)": 138.1, "step": 78260, "train_speed(iter/s)": 0.20081 }, { "acc": 0.77280111, "epoch": 1.8260983683051664, "grad_norm": 5.46875, "learning_rate": 1.9687477965086132e-07, "loss": 0.81801071, "memory(GiB)": 138.1, "step": 78270, "train_speed(iter/s)": 0.200824 }, { "acc": 0.7763041, "epoch": 1.826331675877455, "grad_norm": 6.71875, "learning_rate": 1.963502365491471e-07, "loss": 0.82029505, "memory(GiB)": 138.1, "step": 78280, "train_speed(iter/s)": 0.200838 }, { "acc": 0.77627983, "epoch": 1.8265649834497442, "grad_norm": 4.65625, "learning_rate": 1.958263791670406e-07, "loss": 0.80829754, "memory(GiB)": 138.1, "step": 78290, "train_speed(iter/s)": 0.200851 }, { "acc": 0.80785828, "epoch": 1.8267982910220328, "grad_norm": 5.59375, "learning_rate": 1.953032075793232e-07, "loss": 0.68023977, "memory(GiB)": 138.1, "step": 78300, "train_speed(iter/s)": 0.200865 }, { "acc": 0.75986423, "epoch": 1.827031598594322, "grad_norm": 10.4375, "learning_rate": 1.9478072186067676e-07, "loss": 0.88966808, "memory(GiB)": 138.1, "step": 78310, "train_speed(iter/s)": 0.200879 }, { "acc": 0.77503624, "epoch": 1.8272649061666106, "grad_norm": 4.15625, "learning_rate": 1.9425892208568665e-07, "loss": 0.80476112, "memory(GiB)": 138.1, "step": 78320, "train_speed(iter/s)": 0.200892 }, { "acc": 0.77604232, "epoch": 1.8274982137388998, "grad_norm": 6.875, "learning_rate": 1.9373780832883937e-07, "loss": 0.82170696, "memory(GiB)": 138.1, "step": 78330, "train_speed(iter/s)": 0.200905 }, { "acc": 0.77696238, "epoch": 1.8277315213111884, "grad_norm": 4.34375, "learning_rate": 1.9321738066452266e-07, "loss": 0.80580139, "memory(GiB)": 138.1, "step": 78340, "train_speed(iter/s)": 0.200918 }, { "acc": 0.77858658, "epoch": 1.8279648288834776, "grad_norm": 9.375, "learning_rate": 1.9269763916702988e-07, "loss": 0.80092487, "memory(GiB)": 138.1, "step": 78350, "train_speed(iter/s)": 0.200931 }, { "acc": 0.78533554, "epoch": 1.8281981364557662, "grad_norm": 7.9375, "learning_rate": 1.9217858391055167e-07, "loss": 0.75122461, "memory(GiB)": 138.1, "step": 78360, "train_speed(iter/s)": 0.200945 }, { "acc": 0.7888196, "epoch": 1.8284314440280554, "grad_norm": 5.0625, "learning_rate": 1.916602149691843e-07, "loss": 0.74484324, "memory(GiB)": 138.1, "step": 78370, "train_speed(iter/s)": 0.200958 }, { "acc": 0.76858897, "epoch": 1.828664751600344, "grad_norm": 4.90625, "learning_rate": 1.911425324169247e-07, "loss": 0.82479372, "memory(GiB)": 138.1, "step": 78380, "train_speed(iter/s)": 0.200972 }, { "acc": 0.77135515, "epoch": 1.8288980591726332, "grad_norm": 7.0, "learning_rate": 1.90625536327671e-07, "loss": 0.83313236, "memory(GiB)": 138.1, "step": 78390, "train_speed(iter/s)": 0.200985 }, { "acc": 0.79445343, "epoch": 1.8291313667449218, "grad_norm": 5.5625, "learning_rate": 1.9010922677522525e-07, "loss": 0.74220562, "memory(GiB)": 138.1, "step": 78400, "train_speed(iter/s)": 0.200998 }, { "acc": 0.78015499, "epoch": 1.829364674317211, "grad_norm": 5.59375, "learning_rate": 1.8959360383329017e-07, "loss": 0.79001665, "memory(GiB)": 138.1, "step": 78410, "train_speed(iter/s)": 0.201012 }, { "acc": 0.76368427, "epoch": 1.8295979818894996, "grad_norm": 7.40625, "learning_rate": 1.8907866757547077e-07, "loss": 0.86557426, "memory(GiB)": 138.1, "step": 78420, "train_speed(iter/s)": 0.201026 }, { "acc": 0.79951019, "epoch": 1.8298312894617887, "grad_norm": 6.25, "learning_rate": 1.8856441807527325e-07, "loss": 0.71126404, "memory(GiB)": 138.1, "step": 78430, "train_speed(iter/s)": 0.201039 }, { "acc": 0.77727337, "epoch": 1.8300645970340774, "grad_norm": 5.78125, "learning_rate": 1.8805085540610836e-07, "loss": 0.78865066, "memory(GiB)": 138.1, "step": 78440, "train_speed(iter/s)": 0.201053 }, { "acc": 0.8113802, "epoch": 1.8302979046063665, "grad_norm": 14.6875, "learning_rate": 1.8753797964128573e-07, "loss": 0.68480577, "memory(GiB)": 138.1, "step": 78450, "train_speed(iter/s)": 0.201065 }, { "acc": 0.78258839, "epoch": 1.8305312121786552, "grad_norm": 5.0625, "learning_rate": 1.870257908540174e-07, "loss": 0.79557185, "memory(GiB)": 138.1, "step": 78460, "train_speed(iter/s)": 0.201078 }, { "acc": 0.78330555, "epoch": 1.8307645197509441, "grad_norm": 7.375, "learning_rate": 1.8651428911742043e-07, "loss": 0.77810469, "memory(GiB)": 138.1, "step": 78470, "train_speed(iter/s)": 0.201092 }, { "acc": 0.78139677, "epoch": 1.830997827323233, "grad_norm": 4.90625, "learning_rate": 1.8600347450450974e-07, "loss": 0.77297707, "memory(GiB)": 138.1, "step": 78480, "train_speed(iter/s)": 0.201106 }, { "acc": 0.79237366, "epoch": 1.831231134895522, "grad_norm": 7.4375, "learning_rate": 1.8549334708820476e-07, "loss": 0.73578663, "memory(GiB)": 138.1, "step": 78490, "train_speed(iter/s)": 0.201118 }, { "acc": 0.78118787, "epoch": 1.8314644424678108, "grad_norm": 3.578125, "learning_rate": 1.8498390694132562e-07, "loss": 0.77322454, "memory(GiB)": 138.1, "step": 78500, "train_speed(iter/s)": 0.201132 }, { "epoch": 1.8314644424678108, "eval_acc": 0.7446793279899608, "eval_loss": 0.8043855428695679, "eval_runtime": 1271.7476, "eval_samples_per_second": 28.3, "eval_steps_per_second": 14.151, "step": 78500 }, { "acc": 0.79847651, "epoch": 1.8316977500400997, "grad_norm": 6.1875, "learning_rate": 1.8447515413659578e-07, "loss": 0.71289806, "memory(GiB)": 138.1, "step": 78510, "train_speed(iter/s)": 0.200478 }, { "acc": 0.76541691, "epoch": 1.8319310576123886, "grad_norm": 5.40625, "learning_rate": 1.8396708874663826e-07, "loss": 0.84451513, "memory(GiB)": 138.1, "step": 78520, "train_speed(iter/s)": 0.200492 }, { "acc": 0.75975571, "epoch": 1.8321643651846775, "grad_norm": 5.875, "learning_rate": 1.8345971084398007e-07, "loss": 0.87615738, "memory(GiB)": 138.1, "step": 78530, "train_speed(iter/s)": 0.200504 }, { "acc": 0.76963463, "epoch": 1.8323976727569664, "grad_norm": 6.0625, "learning_rate": 1.829530205010488e-07, "loss": 0.83870296, "memory(GiB)": 138.1, "step": 78540, "train_speed(iter/s)": 0.200518 }, { "acc": 0.7904026, "epoch": 1.8326309803292553, "grad_norm": 5.1875, "learning_rate": 1.8244701779017438e-07, "loss": 0.74433708, "memory(GiB)": 138.1, "step": 78550, "train_speed(iter/s)": 0.200531 }, { "acc": 0.78499613, "epoch": 1.8328642879015442, "grad_norm": 6.28125, "learning_rate": 1.8194170278358847e-07, "loss": 0.77916923, "memory(GiB)": 138.1, "step": 78560, "train_speed(iter/s)": 0.200544 }, { "acc": 0.76727276, "epoch": 1.833097595473833, "grad_norm": 4.5, "learning_rate": 1.8143707555342504e-07, "loss": 0.84287758, "memory(GiB)": 138.1, "step": 78570, "train_speed(iter/s)": 0.200557 }, { "acc": 0.78795671, "epoch": 1.833330903046122, "grad_norm": 3.9375, "learning_rate": 1.8093313617171927e-07, "loss": 0.77323265, "memory(GiB)": 138.1, "step": 78580, "train_speed(iter/s)": 0.20057 }, { "acc": 0.76881981, "epoch": 1.833564210618411, "grad_norm": 6.40625, "learning_rate": 1.8042988471040856e-07, "loss": 0.84690332, "memory(GiB)": 138.1, "step": 78590, "train_speed(iter/s)": 0.200583 }, { "acc": 0.78443508, "epoch": 1.8337975181906998, "grad_norm": 4.84375, "learning_rate": 1.7992732124133106e-07, "loss": 0.78077679, "memory(GiB)": 138.1, "step": 78600, "train_speed(iter/s)": 0.200596 }, { "acc": 0.79543123, "epoch": 1.8340308257629887, "grad_norm": 6.59375, "learning_rate": 1.7942544583622878e-07, "loss": 0.73268194, "memory(GiB)": 138.1, "step": 78610, "train_speed(iter/s)": 0.200609 }, { "acc": 0.77256894, "epoch": 1.8342641333352776, "grad_norm": 4.75, "learning_rate": 1.7892425856674334e-07, "loss": 0.83725958, "memory(GiB)": 138.1, "step": 78620, "train_speed(iter/s)": 0.200622 }, { "acc": 0.78704338, "epoch": 1.8344974409075665, "grad_norm": 6.09375, "learning_rate": 1.7842375950442025e-07, "loss": 0.75788522, "memory(GiB)": 138.1, "step": 78630, "train_speed(iter/s)": 0.200636 }, { "acc": 0.76816063, "epoch": 1.8347307484798554, "grad_norm": 5.96875, "learning_rate": 1.7792394872070407e-07, "loss": 0.82397146, "memory(GiB)": 138.1, "step": 78640, "train_speed(iter/s)": 0.200649 }, { "acc": 0.79935145, "epoch": 1.834964056052144, "grad_norm": 6.21875, "learning_rate": 1.7742482628694379e-07, "loss": 0.72347355, "memory(GiB)": 138.1, "step": 78650, "train_speed(iter/s)": 0.200663 }, { "acc": 0.78173332, "epoch": 1.8351973636244332, "grad_norm": 6.75, "learning_rate": 1.76926392274388e-07, "loss": 0.78438444, "memory(GiB)": 138.1, "step": 78660, "train_speed(iter/s)": 0.200676 }, { "acc": 0.79420872, "epoch": 1.8354306711967219, "grad_norm": 5.6875, "learning_rate": 1.7642864675418925e-07, "loss": 0.74262061, "memory(GiB)": 138.1, "step": 78670, "train_speed(iter/s)": 0.200688 }, { "acc": 0.78799567, "epoch": 1.835663978769011, "grad_norm": 4.625, "learning_rate": 1.7593158979739955e-07, "loss": 0.75903831, "memory(GiB)": 138.1, "step": 78680, "train_speed(iter/s)": 0.200701 }, { "acc": 0.77767072, "epoch": 1.8358972863412997, "grad_norm": 5.71875, "learning_rate": 1.7543522147497382e-07, "loss": 0.79137626, "memory(GiB)": 138.1, "step": 78690, "train_speed(iter/s)": 0.200715 }, { "acc": 0.79292173, "epoch": 1.8361305939135888, "grad_norm": 4.84375, "learning_rate": 1.7493954185776928e-07, "loss": 0.74533262, "memory(GiB)": 138.1, "step": 78700, "train_speed(iter/s)": 0.200728 }, { "acc": 0.79466348, "epoch": 1.8363639014858775, "grad_norm": 4.1875, "learning_rate": 1.7444455101654267e-07, "loss": 0.73135486, "memory(GiB)": 138.1, "step": 78710, "train_speed(iter/s)": 0.200741 }, { "acc": 0.78591375, "epoch": 1.8365972090581666, "grad_norm": 3.9375, "learning_rate": 1.7395024902195522e-07, "loss": 0.77585912, "memory(GiB)": 138.1, "step": 78720, "train_speed(iter/s)": 0.200755 }, { "acc": 0.77571945, "epoch": 1.8368305166304553, "grad_norm": 5.375, "learning_rate": 1.7345663594456775e-07, "loss": 0.79873123, "memory(GiB)": 138.1, "step": 78730, "train_speed(iter/s)": 0.200768 }, { "acc": 0.80318031, "epoch": 1.8370638242027444, "grad_norm": 6.53125, "learning_rate": 1.7296371185484328e-07, "loss": 0.6877492, "memory(GiB)": 138.1, "step": 78740, "train_speed(iter/s)": 0.20078 }, { "acc": 0.77101789, "epoch": 1.837297131775033, "grad_norm": 9.375, "learning_rate": 1.7247147682314724e-07, "loss": 0.8241971, "memory(GiB)": 138.1, "step": 78750, "train_speed(iter/s)": 0.200793 }, { "acc": 0.78852234, "epoch": 1.8375304393473222, "grad_norm": 6.8125, "learning_rate": 1.7197993091974452e-07, "loss": 0.75911512, "memory(GiB)": 138.1, "step": 78760, "train_speed(iter/s)": 0.200806 }, { "acc": 0.77903395, "epoch": 1.8377637469196109, "grad_norm": 5.3125, "learning_rate": 1.7148907421480455e-07, "loss": 0.7937254, "memory(GiB)": 138.1, "step": 78770, "train_speed(iter/s)": 0.20082 }, { "acc": 0.78879142, "epoch": 1.8379970544919, "grad_norm": 4.03125, "learning_rate": 1.7099890677839626e-07, "loss": 0.77068634, "memory(GiB)": 138.1, "step": 78780, "train_speed(iter/s)": 0.200833 }, { "acc": 0.78774767, "epoch": 1.8382303620641887, "grad_norm": 7.6875, "learning_rate": 1.7050942868049147e-07, "loss": 0.75463233, "memory(GiB)": 138.1, "step": 78790, "train_speed(iter/s)": 0.200847 }, { "acc": 0.78863096, "epoch": 1.8384636696364778, "grad_norm": 4.09375, "learning_rate": 1.7002063999096208e-07, "loss": 0.74385252, "memory(GiB)": 138.1, "step": 78800, "train_speed(iter/s)": 0.20086 }, { "acc": 0.77340384, "epoch": 1.8386969772087665, "grad_norm": 4.53125, "learning_rate": 1.695325407795839e-07, "loss": 0.82360716, "memory(GiB)": 138.1, "step": 78810, "train_speed(iter/s)": 0.200874 }, { "acc": 0.78802562, "epoch": 1.8389302847810556, "grad_norm": 5.9375, "learning_rate": 1.6904513111603238e-07, "loss": 0.76730204, "memory(GiB)": 138.1, "step": 78820, "train_speed(iter/s)": 0.200886 }, { "acc": 0.78368859, "epoch": 1.8391635923533443, "grad_norm": 4.46875, "learning_rate": 1.685584110698846e-07, "loss": 0.77572279, "memory(GiB)": 138.1, "step": 78830, "train_speed(iter/s)": 0.2009 }, { "acc": 0.79617405, "epoch": 1.8393968999256334, "grad_norm": 5.34375, "learning_rate": 1.680723807106205e-07, "loss": 0.70224414, "memory(GiB)": 138.1, "step": 78840, "train_speed(iter/s)": 0.200912 }, { "acc": 0.77466588, "epoch": 1.839630207497922, "grad_norm": 5.46875, "learning_rate": 1.675870401076196e-07, "loss": 0.81798153, "memory(GiB)": 138.1, "step": 78850, "train_speed(iter/s)": 0.200925 }, { "acc": 0.78734894, "epoch": 1.839863515070211, "grad_norm": 4.5625, "learning_rate": 1.6710238933016597e-07, "loss": 0.7503355, "memory(GiB)": 138.1, "step": 78860, "train_speed(iter/s)": 0.200938 }, { "acc": 0.79105768, "epoch": 1.8400968226424999, "grad_norm": 5.1875, "learning_rate": 1.6661842844744148e-07, "loss": 0.74174986, "memory(GiB)": 138.1, "step": 78870, "train_speed(iter/s)": 0.200951 }, { "acc": 0.76048555, "epoch": 1.8403301302147888, "grad_norm": 5.09375, "learning_rate": 1.6613515752853303e-07, "loss": 0.8763361, "memory(GiB)": 138.1, "step": 78880, "train_speed(iter/s)": 0.200964 }, { "acc": 0.77112312, "epoch": 1.8405634377870776, "grad_norm": 6.9375, "learning_rate": 1.6565257664242606e-07, "loss": 0.79881568, "memory(GiB)": 138.1, "step": 78890, "train_speed(iter/s)": 0.200977 }, { "acc": 0.76842785, "epoch": 1.8407967453593665, "grad_norm": 5.03125, "learning_rate": 1.6517068585800932e-07, "loss": 0.82623138, "memory(GiB)": 138.1, "step": 78900, "train_speed(iter/s)": 0.200991 }, { "acc": 0.78424797, "epoch": 1.8410300529316554, "grad_norm": 5.28125, "learning_rate": 1.646894852440728e-07, "loss": 0.76516953, "memory(GiB)": 138.1, "step": 78910, "train_speed(iter/s)": 0.201004 }, { "acc": 0.79357691, "epoch": 1.8412633605039443, "grad_norm": 5.59375, "learning_rate": 1.642089748693071e-07, "loss": 0.73923221, "memory(GiB)": 138.1, "step": 78920, "train_speed(iter/s)": 0.201018 }, { "acc": 0.77890062, "epoch": 1.8414966680762332, "grad_norm": 4.4375, "learning_rate": 1.6372915480230622e-07, "loss": 0.81259813, "memory(GiB)": 138.1, "step": 78930, "train_speed(iter/s)": 0.201031 }, { "acc": 0.80381775, "epoch": 1.8417299756485221, "grad_norm": 6.09375, "learning_rate": 1.6325002511156262e-07, "loss": 0.7070673, "memory(GiB)": 138.1, "step": 78940, "train_speed(iter/s)": 0.201044 }, { "acc": 0.77546864, "epoch": 1.841963283220811, "grad_norm": 4.71875, "learning_rate": 1.6277158586547325e-07, "loss": 0.81202116, "memory(GiB)": 138.1, "step": 78950, "train_speed(iter/s)": 0.201056 }, { "acc": 0.78609886, "epoch": 1.8421965907931, "grad_norm": 5.53125, "learning_rate": 1.6229383713233516e-07, "loss": 0.76256151, "memory(GiB)": 138.1, "step": 78960, "train_speed(iter/s)": 0.20107 }, { "acc": 0.77121072, "epoch": 1.8424298983653888, "grad_norm": 5.15625, "learning_rate": 1.6181677898034597e-07, "loss": 0.82044077, "memory(GiB)": 138.1, "step": 78970, "train_speed(iter/s)": 0.201083 }, { "acc": 0.7971004, "epoch": 1.8426632059376777, "grad_norm": 7.03125, "learning_rate": 1.6134041147760738e-07, "loss": 0.70855894, "memory(GiB)": 138.1, "step": 78980, "train_speed(iter/s)": 0.201096 }, { "acc": 0.76304283, "epoch": 1.8428965135099666, "grad_norm": 6.03125, "learning_rate": 1.608647346921177e-07, "loss": 0.84548931, "memory(GiB)": 138.1, "step": 78990, "train_speed(iter/s)": 0.201108 }, { "acc": 0.76878996, "epoch": 1.8431298210822555, "grad_norm": 6.9375, "learning_rate": 1.6038974869178214e-07, "loss": 0.8415431, "memory(GiB)": 138.1, "step": 79000, "train_speed(iter/s)": 0.201122 }, { "epoch": 1.8431298210822555, "eval_acc": 0.744701748182483, "eval_loss": 0.8044163584709167, "eval_runtime": 1270.82, "eval_samples_per_second": 28.321, "eval_steps_per_second": 14.161, "step": 79000 }, { "acc": 0.7843946, "epoch": 1.8433631286545444, "grad_norm": 4.59375, "learning_rate": 1.5991545354440363e-07, "loss": 0.77073593, "memory(GiB)": 138.1, "step": 79010, "train_speed(iter/s)": 0.200473 }, { "acc": 0.78733177, "epoch": 1.8435964362268333, "grad_norm": 5.3125, "learning_rate": 1.594418493176886e-07, "loss": 0.76721573, "memory(GiB)": 138.1, "step": 79020, "train_speed(iter/s)": 0.200486 }, { "acc": 0.772614, "epoch": 1.8438297437991222, "grad_norm": 4.53125, "learning_rate": 1.5896893607924346e-07, "loss": 0.82392635, "memory(GiB)": 138.1, "step": 79030, "train_speed(iter/s)": 0.200499 }, { "acc": 0.78404236, "epoch": 1.844063051371411, "grad_norm": 5.25, "learning_rate": 1.5849671389657594e-07, "loss": 0.7704998, "memory(GiB)": 138.1, "step": 79040, "train_speed(iter/s)": 0.200512 }, { "acc": 0.77602654, "epoch": 1.8442963589437, "grad_norm": 6.9375, "learning_rate": 1.58025182837096e-07, "loss": 0.80820026, "memory(GiB)": 138.1, "step": 79050, "train_speed(iter/s)": 0.200526 }, { "acc": 0.79040246, "epoch": 1.8445296665159887, "grad_norm": 4.125, "learning_rate": 1.5755434296811478e-07, "loss": 0.75322657, "memory(GiB)": 138.1, "step": 79060, "train_speed(iter/s)": 0.200539 }, { "acc": 0.76732149, "epoch": 1.8447629740882778, "grad_norm": 4.59375, "learning_rate": 1.5708419435684463e-07, "loss": 0.85027685, "memory(GiB)": 138.1, "step": 79070, "train_speed(iter/s)": 0.200552 }, { "acc": 0.77900348, "epoch": 1.8449962816605665, "grad_norm": 7.15625, "learning_rate": 1.5661473707039852e-07, "loss": 0.7863019, "memory(GiB)": 138.1, "step": 79080, "train_speed(iter/s)": 0.200564 }, { "acc": 0.79656229, "epoch": 1.8452295892328556, "grad_norm": 4.15625, "learning_rate": 1.561459711757918e-07, "loss": 0.73082271, "memory(GiB)": 138.1, "step": 79090, "train_speed(iter/s)": 0.200578 }, { "acc": 0.78649101, "epoch": 1.8454628968051443, "grad_norm": 5.34375, "learning_rate": 1.5567789673994026e-07, "loss": 0.76644926, "memory(GiB)": 138.1, "step": 79100, "train_speed(iter/s)": 0.20059 }, { "acc": 0.76249952, "epoch": 1.8456962043774334, "grad_norm": 4.8125, "learning_rate": 1.5521051382966224e-07, "loss": 0.85472183, "memory(GiB)": 138.1, "step": 79110, "train_speed(iter/s)": 0.200604 }, { "acc": 0.79876027, "epoch": 1.845929511949722, "grad_norm": 5.15625, "learning_rate": 1.5474382251167597e-07, "loss": 0.70353813, "memory(GiB)": 138.1, "step": 79120, "train_speed(iter/s)": 0.200616 }, { "acc": 0.77245951, "epoch": 1.8461628195220112, "grad_norm": 5.0, "learning_rate": 1.542778228526004e-07, "loss": 0.79053502, "memory(GiB)": 138.1, "step": 79130, "train_speed(iter/s)": 0.20063 }, { "acc": 0.76596804, "epoch": 1.8463961270943, "grad_norm": 5.21875, "learning_rate": 1.53812514918959e-07, "loss": 0.83255501, "memory(GiB)": 138.1, "step": 79140, "train_speed(iter/s)": 0.200644 }, { "acc": 0.76959438, "epoch": 1.846629434666589, "grad_norm": 4.5625, "learning_rate": 1.5334789877717248e-07, "loss": 0.82380829, "memory(GiB)": 138.1, "step": 79150, "train_speed(iter/s)": 0.200657 }, { "acc": 0.76817465, "epoch": 1.8468627422388777, "grad_norm": 5.03125, "learning_rate": 1.5288397449356617e-07, "loss": 0.84938889, "memory(GiB)": 138.1, "step": 79160, "train_speed(iter/s)": 0.20067 }, { "acc": 0.77281103, "epoch": 1.8470960498111668, "grad_norm": 4.96875, "learning_rate": 1.524207421343643e-07, "loss": 0.81317339, "memory(GiB)": 138.1, "step": 79170, "train_speed(iter/s)": 0.200683 }, { "acc": 0.78833442, "epoch": 1.8473293573834555, "grad_norm": 4.78125, "learning_rate": 1.5195820176569288e-07, "loss": 0.73116922, "memory(GiB)": 138.1, "step": 79180, "train_speed(iter/s)": 0.200696 }, { "acc": 0.77847328, "epoch": 1.8475626649557446, "grad_norm": 5.8125, "learning_rate": 1.5149635345358017e-07, "loss": 0.80921946, "memory(GiB)": 138.1, "step": 79190, "train_speed(iter/s)": 0.200708 }, { "acc": 0.77257528, "epoch": 1.8477959725280333, "grad_norm": 5.40625, "learning_rate": 1.51035197263954e-07, "loss": 0.83006363, "memory(GiB)": 138.1, "step": 79200, "train_speed(iter/s)": 0.200722 }, { "acc": 0.78382149, "epoch": 1.8480292801003224, "grad_norm": 4.9375, "learning_rate": 1.5057473326264614e-07, "loss": 0.76725683, "memory(GiB)": 138.1, "step": 79210, "train_speed(iter/s)": 0.200735 }, { "acc": 0.79515305, "epoch": 1.848262587672611, "grad_norm": 4.6875, "learning_rate": 1.5011496151538462e-07, "loss": 0.73181725, "memory(GiB)": 138.1, "step": 79220, "train_speed(iter/s)": 0.200748 }, { "acc": 0.77747626, "epoch": 1.8484958952449, "grad_norm": 5.5, "learning_rate": 1.4965588208780468e-07, "loss": 0.77987266, "memory(GiB)": 138.1, "step": 79230, "train_speed(iter/s)": 0.200761 }, { "acc": 0.77652779, "epoch": 1.8487292028171889, "grad_norm": 5.75, "learning_rate": 1.491974950454378e-07, "loss": 0.80380249, "memory(GiB)": 138.1, "step": 79240, "train_speed(iter/s)": 0.200775 }, { "acc": 0.78755636, "epoch": 1.8489625103894778, "grad_norm": 4.4375, "learning_rate": 1.4873980045371938e-07, "loss": 0.75492868, "memory(GiB)": 138.1, "step": 79250, "train_speed(iter/s)": 0.200787 }, { "acc": 0.80663643, "epoch": 1.8491958179617667, "grad_norm": 4.90625, "learning_rate": 1.4828279837798553e-07, "loss": 0.68906574, "memory(GiB)": 138.1, "step": 79260, "train_speed(iter/s)": 0.200801 }, { "acc": 0.78660917, "epoch": 1.8494291255340556, "grad_norm": 4.96875, "learning_rate": 1.478264888834724e-07, "loss": 0.76002073, "memory(GiB)": 138.1, "step": 79270, "train_speed(iter/s)": 0.200814 }, { "acc": 0.77109876, "epoch": 1.8496624331063445, "grad_norm": 6.40625, "learning_rate": 1.4737087203531896e-07, "loss": 0.81629725, "memory(GiB)": 138.1, "step": 79280, "train_speed(iter/s)": 0.200827 }, { "acc": 0.78523669, "epoch": 1.8498957406786334, "grad_norm": 7.78125, "learning_rate": 1.4691594789856268e-07, "loss": 0.76274586, "memory(GiB)": 138.1, "step": 79290, "train_speed(iter/s)": 0.20084 }, { "acc": 0.78136597, "epoch": 1.8501290482509223, "grad_norm": 5.65625, "learning_rate": 1.46461716538146e-07, "loss": 0.79236145, "memory(GiB)": 138.1, "step": 79300, "train_speed(iter/s)": 0.200852 }, { "acc": 0.77388811, "epoch": 1.8503623558232112, "grad_norm": 6.90625, "learning_rate": 1.4600817801890933e-07, "loss": 0.82036371, "memory(GiB)": 138.1, "step": 79310, "train_speed(iter/s)": 0.200865 }, { "acc": 0.74084215, "epoch": 1.8505956633955, "grad_norm": 5.65625, "learning_rate": 1.4555533240559526e-07, "loss": 0.93160048, "memory(GiB)": 138.1, "step": 79320, "train_speed(iter/s)": 0.200878 }, { "acc": 0.77508183, "epoch": 1.850828970967789, "grad_norm": 6.1875, "learning_rate": 1.4510317976284715e-07, "loss": 0.81556673, "memory(GiB)": 138.1, "step": 79330, "train_speed(iter/s)": 0.200891 }, { "acc": 0.7719595, "epoch": 1.8510622785400779, "grad_norm": 9.0, "learning_rate": 1.4465172015520945e-07, "loss": 0.82613068, "memory(GiB)": 138.1, "step": 79340, "train_speed(iter/s)": 0.200903 }, { "acc": 0.79376431, "epoch": 1.8512955861123668, "grad_norm": 4.75, "learning_rate": 1.4420095364712838e-07, "loss": 0.72482729, "memory(GiB)": 138.1, "step": 79350, "train_speed(iter/s)": 0.200915 }, { "acc": 0.76974692, "epoch": 1.8515288936846557, "grad_norm": 6.25, "learning_rate": 1.4375088030295027e-07, "loss": 0.83236866, "memory(GiB)": 138.1, "step": 79360, "train_speed(iter/s)": 0.200928 }, { "acc": 0.79396596, "epoch": 1.8517622012569446, "grad_norm": 4.0, "learning_rate": 1.433015001869237e-07, "loss": 0.74162588, "memory(GiB)": 138.1, "step": 79370, "train_speed(iter/s)": 0.20094 }, { "acc": 0.76310349, "epoch": 1.8519955088292335, "grad_norm": 5.6875, "learning_rate": 1.428528133631968e-07, "loss": 0.85866594, "memory(GiB)": 138.1, "step": 79380, "train_speed(iter/s)": 0.200953 }, { "acc": 0.75772862, "epoch": 1.8522288164015224, "grad_norm": 6.75, "learning_rate": 1.4240481989581944e-07, "loss": 0.87741089, "memory(GiB)": 138.1, "step": 79390, "train_speed(iter/s)": 0.200967 }, { "acc": 0.77879143, "epoch": 1.8524621239738113, "grad_norm": 5.125, "learning_rate": 1.4195751984874383e-07, "loss": 0.79758248, "memory(GiB)": 138.1, "step": 79400, "train_speed(iter/s)": 0.20098 }, { "acc": 0.78456173, "epoch": 1.8526954315461002, "grad_norm": 5.5, "learning_rate": 1.4151091328582e-07, "loss": 0.75210719, "memory(GiB)": 138.1, "step": 79410, "train_speed(iter/s)": 0.200993 }, { "acc": 0.77924018, "epoch": 1.852928739118389, "grad_norm": 4.03125, "learning_rate": 1.410650002708025e-07, "loss": 0.80060263, "memory(GiB)": 138.1, "step": 79420, "train_speed(iter/s)": 0.201006 }, { "acc": 0.78018885, "epoch": 1.8531620466906777, "grad_norm": 5.03125, "learning_rate": 1.4061978086734484e-07, "loss": 0.77695847, "memory(GiB)": 138.1, "step": 79430, "train_speed(iter/s)": 0.20102 }, { "acc": 0.76331925, "epoch": 1.8533953542629669, "grad_norm": 6.84375, "learning_rate": 1.4017525513900175e-07, "loss": 0.85669298, "memory(GiB)": 138.1, "step": 79440, "train_speed(iter/s)": 0.201033 }, { "acc": 0.79373131, "epoch": 1.8536286618352555, "grad_norm": 10.5, "learning_rate": 1.3973142314922862e-07, "loss": 0.73819771, "memory(GiB)": 138.1, "step": 79450, "train_speed(iter/s)": 0.201046 }, { "acc": 0.77359915, "epoch": 1.8538619694075447, "grad_norm": 6.625, "learning_rate": 1.3928828496138358e-07, "loss": 0.83192682, "memory(GiB)": 138.1, "step": 79460, "train_speed(iter/s)": 0.201058 }, { "acc": 0.79424295, "epoch": 1.8540952769798333, "grad_norm": 7.75, "learning_rate": 1.3884584063872386e-07, "loss": 0.72979498, "memory(GiB)": 138.1, "step": 79470, "train_speed(iter/s)": 0.201071 }, { "acc": 0.79742351, "epoch": 1.8543285845521225, "grad_norm": 4.625, "learning_rate": 1.3840409024440726e-07, "loss": 0.72036037, "memory(GiB)": 138.1, "step": 79480, "train_speed(iter/s)": 0.201084 }, { "acc": 0.76095319, "epoch": 1.8545618921244111, "grad_norm": 6.125, "learning_rate": 1.3796303384149557e-07, "loss": 0.855832, "memory(GiB)": 138.1, "step": 79490, "train_speed(iter/s)": 0.201098 }, { "acc": 0.768857, "epoch": 1.8547951996967003, "grad_norm": 4.84375, "learning_rate": 1.375226714929473e-07, "loss": 0.83169584, "memory(GiB)": 138.1, "step": 79500, "train_speed(iter/s)": 0.201111 }, { "epoch": 1.8547951996967003, "eval_acc": 0.744718723471107, "eval_loss": 0.8043954968452454, "eval_runtime": 1271.2392, "eval_samples_per_second": 28.312, "eval_steps_per_second": 14.156, "step": 79500 }, { "acc": 0.80413227, "epoch": 1.855028507268989, "grad_norm": 4.5625, "learning_rate": 1.3708300326162605e-07, "loss": 0.69383755, "memory(GiB)": 138.1, "step": 79510, "train_speed(iter/s)": 0.200467 }, { "acc": 0.77757564, "epoch": 1.855261814841278, "grad_norm": 5.1875, "learning_rate": 1.3664402921029328e-07, "loss": 0.78825073, "memory(GiB)": 138.1, "step": 79520, "train_speed(iter/s)": 0.20048 }, { "acc": 0.75611916, "epoch": 1.8554951224135667, "grad_norm": 4.21875, "learning_rate": 1.3620574940161168e-07, "loss": 0.88193607, "memory(GiB)": 138.1, "step": 79530, "train_speed(iter/s)": 0.200492 }, { "acc": 0.78447027, "epoch": 1.8557284299858559, "grad_norm": 5.59375, "learning_rate": 1.357681638981473e-07, "loss": 0.78287034, "memory(GiB)": 138.1, "step": 79540, "train_speed(iter/s)": 0.200504 }, { "acc": 0.78698115, "epoch": 1.8559617375581445, "grad_norm": 6.875, "learning_rate": 1.3533127276236458e-07, "loss": 0.78185997, "memory(GiB)": 138.1, "step": 79550, "train_speed(iter/s)": 0.200518 }, { "acc": 0.77826419, "epoch": 1.8561950451304337, "grad_norm": 4.4375, "learning_rate": 1.348950760566292e-07, "loss": 0.79370699, "memory(GiB)": 138.1, "step": 79560, "train_speed(iter/s)": 0.200531 }, { "acc": 0.78184977, "epoch": 1.8564283527027223, "grad_norm": 5.78125, "learning_rate": 1.3445957384320808e-07, "loss": 0.77437334, "memory(GiB)": 138.1, "step": 79570, "train_speed(iter/s)": 0.200545 }, { "acc": 0.76415167, "epoch": 1.8566616602750114, "grad_norm": 6.125, "learning_rate": 1.340247661842692e-07, "loss": 0.85826664, "memory(GiB)": 138.1, "step": 79580, "train_speed(iter/s)": 0.200557 }, { "acc": 0.76513901, "epoch": 1.8568949678473001, "grad_norm": 5.625, "learning_rate": 1.335906531418818e-07, "loss": 0.84353552, "memory(GiB)": 138.1, "step": 79590, "train_speed(iter/s)": 0.20057 }, { "acc": 0.79837427, "epoch": 1.8571282754195892, "grad_norm": 4.8125, "learning_rate": 1.3315723477801467e-07, "loss": 0.71099424, "memory(GiB)": 138.1, "step": 79600, "train_speed(iter/s)": 0.200583 }, { "acc": 0.77831469, "epoch": 1.857361582991878, "grad_norm": 4.90625, "learning_rate": 1.3272451115453888e-07, "loss": 0.80567703, "memory(GiB)": 138.1, "step": 79610, "train_speed(iter/s)": 0.200597 }, { "acc": 0.77662849, "epoch": 1.8575948905641668, "grad_norm": 5.875, "learning_rate": 1.32292482333225e-07, "loss": 0.80015821, "memory(GiB)": 138.1, "step": 79620, "train_speed(iter/s)": 0.20061 }, { "acc": 0.78590217, "epoch": 1.8578281981364557, "grad_norm": 5.0625, "learning_rate": 1.3186114837574538e-07, "loss": 0.75417018, "memory(GiB)": 138.1, "step": 79630, "train_speed(iter/s)": 0.200624 }, { "acc": 0.77556219, "epoch": 1.8580615057087446, "grad_norm": 4.78125, "learning_rate": 1.3143050934367187e-07, "loss": 0.81039896, "memory(GiB)": 138.1, "step": 79640, "train_speed(iter/s)": 0.200637 }, { "acc": 0.78844309, "epoch": 1.8582948132810335, "grad_norm": 6.4375, "learning_rate": 1.310005652984797e-07, "loss": 0.75361748, "memory(GiB)": 138.1, "step": 79650, "train_speed(iter/s)": 0.20065 }, { "acc": 0.78295665, "epoch": 1.8585281208533224, "grad_norm": 11.5, "learning_rate": 1.3057131630154208e-07, "loss": 0.7822053, "memory(GiB)": 138.1, "step": 79660, "train_speed(iter/s)": 0.200663 }, { "acc": 0.76080747, "epoch": 1.8587614284256113, "grad_norm": 5.53125, "learning_rate": 1.3014276241413438e-07, "loss": 0.84956684, "memory(GiB)": 138.1, "step": 79670, "train_speed(iter/s)": 0.200676 }, { "acc": 0.78363895, "epoch": 1.8589947359979002, "grad_norm": 4.53125, "learning_rate": 1.2971490369743323e-07, "loss": 0.79574165, "memory(GiB)": 138.1, "step": 79680, "train_speed(iter/s)": 0.200689 }, { "acc": 0.77177901, "epoch": 1.8592280435701891, "grad_norm": 6.0, "learning_rate": 1.2928774021251368e-07, "loss": 0.8176857, "memory(GiB)": 138.1, "step": 79690, "train_speed(iter/s)": 0.200702 }, { "acc": 0.75965395, "epoch": 1.859461351142478, "grad_norm": 5.53125, "learning_rate": 1.288612720203547e-07, "loss": 0.86375732, "memory(GiB)": 138.1, "step": 79700, "train_speed(iter/s)": 0.200715 }, { "acc": 0.76926231, "epoch": 1.859694658714767, "grad_norm": 7.09375, "learning_rate": 1.284354991818343e-07, "loss": 0.83586464, "memory(GiB)": 138.1, "step": 79710, "train_speed(iter/s)": 0.200729 }, { "acc": 0.78583107, "epoch": 1.8599279662870558, "grad_norm": 5.125, "learning_rate": 1.2801042175773104e-07, "loss": 0.76661096, "memory(GiB)": 138.1, "step": 79720, "train_speed(iter/s)": 0.200742 }, { "acc": 0.7856154, "epoch": 1.8601612738593447, "grad_norm": 4.96875, "learning_rate": 1.2758603980872419e-07, "loss": 0.76946712, "memory(GiB)": 138.1, "step": 79730, "train_speed(iter/s)": 0.200755 }, { "acc": 0.78945646, "epoch": 1.8603945814316336, "grad_norm": 5.3125, "learning_rate": 1.2716235339539585e-07, "loss": 0.76702728, "memory(GiB)": 138.1, "step": 79740, "train_speed(iter/s)": 0.200768 }, { "acc": 0.78451262, "epoch": 1.8606278890039225, "grad_norm": 5.5625, "learning_rate": 1.267393625782254e-07, "loss": 0.75838819, "memory(GiB)": 138.1, "step": 79750, "train_speed(iter/s)": 0.20078 }, { "acc": 0.77074156, "epoch": 1.8608611965762114, "grad_norm": 4.71875, "learning_rate": 1.263170674175951e-07, "loss": 0.83515434, "memory(GiB)": 138.1, "step": 79760, "train_speed(iter/s)": 0.200793 }, { "acc": 0.77783523, "epoch": 1.8610945041485003, "grad_norm": 5.5625, "learning_rate": 1.2589546797378783e-07, "loss": 0.7946856, "memory(GiB)": 138.1, "step": 79770, "train_speed(iter/s)": 0.200807 }, { "acc": 0.76884985, "epoch": 1.8613278117207892, "grad_norm": 6.03125, "learning_rate": 1.2547456430698656e-07, "loss": 0.82223921, "memory(GiB)": 138.1, "step": 79780, "train_speed(iter/s)": 0.20082 }, { "acc": 0.7526144, "epoch": 1.861561119293078, "grad_norm": 5.15625, "learning_rate": 1.2505435647727548e-07, "loss": 0.89265423, "memory(GiB)": 138.1, "step": 79790, "train_speed(iter/s)": 0.200833 }, { "acc": 0.77072687, "epoch": 1.861794426865367, "grad_norm": 5.75, "learning_rate": 1.2463484454463826e-07, "loss": 0.82332354, "memory(GiB)": 138.1, "step": 79800, "train_speed(iter/s)": 0.200845 }, { "acc": 0.7775485, "epoch": 1.862027734437656, "grad_norm": 5.34375, "learning_rate": 1.2421602856896087e-07, "loss": 0.7913826, "memory(GiB)": 138.1, "step": 79810, "train_speed(iter/s)": 0.200858 }, { "acc": 0.78050547, "epoch": 1.8622610420099446, "grad_norm": 4.65625, "learning_rate": 1.237979086100294e-07, "loss": 0.7908287, "memory(GiB)": 138.1, "step": 79820, "train_speed(iter/s)": 0.200871 }, { "acc": 0.78652105, "epoch": 1.8624943495822337, "grad_norm": 4.96875, "learning_rate": 1.233804847275294e-07, "loss": 0.76471519, "memory(GiB)": 138.1, "step": 79830, "train_speed(iter/s)": 0.200884 }, { "acc": 0.78220301, "epoch": 1.8627276571545224, "grad_norm": 5.34375, "learning_rate": 1.2296375698104878e-07, "loss": 0.77280107, "memory(GiB)": 138.1, "step": 79840, "train_speed(iter/s)": 0.200897 }, { "acc": 0.78212929, "epoch": 1.8629609647268115, "grad_norm": 6.0625, "learning_rate": 1.2254772543007442e-07, "loss": 0.80081415, "memory(GiB)": 138.1, "step": 79850, "train_speed(iter/s)": 0.20091 }, { "acc": 0.78351145, "epoch": 1.8631942722991002, "grad_norm": 6.125, "learning_rate": 1.2213239013399602e-07, "loss": 0.77549877, "memory(GiB)": 138.1, "step": 79860, "train_speed(iter/s)": 0.200923 }, { "acc": 0.77944012, "epoch": 1.8634275798713893, "grad_norm": 5.3125, "learning_rate": 1.217177511521017e-07, "loss": 0.78213992, "memory(GiB)": 138.1, "step": 79870, "train_speed(iter/s)": 0.200936 }, { "acc": 0.78833656, "epoch": 1.863660887443678, "grad_norm": 4.78125, "learning_rate": 1.2130380854358136e-07, "loss": 0.74980211, "memory(GiB)": 138.1, "step": 79880, "train_speed(iter/s)": 0.200949 }, { "acc": 0.78198175, "epoch": 1.863894195015967, "grad_norm": 4.9375, "learning_rate": 1.208905623675255e-07, "loss": 0.80464964, "memory(GiB)": 138.1, "step": 79890, "train_speed(iter/s)": 0.200961 }, { "acc": 0.77077436, "epoch": 1.8641275025882558, "grad_norm": 6.78125, "learning_rate": 1.2047801268292414e-07, "loss": 0.82589788, "memory(GiB)": 138.1, "step": 79900, "train_speed(iter/s)": 0.200974 }, { "acc": 0.77802629, "epoch": 1.864360810160545, "grad_norm": 5.9375, "learning_rate": 1.2006615954866906e-07, "loss": 0.77997022, "memory(GiB)": 138.1, "step": 79910, "train_speed(iter/s)": 0.200987 }, { "acc": 0.77522287, "epoch": 1.8645941177328336, "grad_norm": 5.71875, "learning_rate": 1.196550030235516e-07, "loss": 0.80558329, "memory(GiB)": 138.1, "step": 79920, "train_speed(iter/s)": 0.201 }, { "acc": 0.76346202, "epoch": 1.8648274253051227, "grad_norm": 6.40625, "learning_rate": 1.1924454316626478e-07, "loss": 0.8442524, "memory(GiB)": 138.1, "step": 79930, "train_speed(iter/s)": 0.201012 }, { "acc": 0.77351999, "epoch": 1.8650607328774114, "grad_norm": 8.8125, "learning_rate": 1.1883478003540172e-07, "loss": 0.82075424, "memory(GiB)": 138.1, "step": 79940, "train_speed(iter/s)": 0.201026 }, { "acc": 0.7649683, "epoch": 1.8652940404497005, "grad_norm": 3.84375, "learning_rate": 1.1842571368945566e-07, "loss": 0.85019636, "memory(GiB)": 138.1, "step": 79950, "train_speed(iter/s)": 0.201038 }, { "acc": 0.78048048, "epoch": 1.8655273480219892, "grad_norm": 5.09375, "learning_rate": 1.1801734418682154e-07, "loss": 0.79374285, "memory(GiB)": 138.1, "step": 79960, "train_speed(iter/s)": 0.201052 }, { "acc": 0.78092804, "epoch": 1.8657606555942783, "grad_norm": 6.125, "learning_rate": 1.1760967158579217e-07, "loss": 0.80186243, "memory(GiB)": 138.1, "step": 79970, "train_speed(iter/s)": 0.201065 }, { "acc": 0.79199791, "epoch": 1.865993963166567, "grad_norm": 5.71875, "learning_rate": 1.172026959445649e-07, "loss": 0.74338923, "memory(GiB)": 138.1, "step": 79980, "train_speed(iter/s)": 0.201077 }, { "acc": 0.79580288, "epoch": 1.866227270738856, "grad_norm": 5.21875, "learning_rate": 1.1679641732123382e-07, "loss": 0.71093869, "memory(GiB)": 138.1, "step": 79990, "train_speed(iter/s)": 0.201091 }, { "acc": 0.77390499, "epoch": 1.8664605783111448, "grad_norm": 6.25, "learning_rate": 1.163908357737964e-07, "loss": 0.81473513, "memory(GiB)": 138.1, "step": 80000, "train_speed(iter/s)": 0.201105 }, { "epoch": 1.8664605783111448, "eval_acc": 0.7447302538558327, "eval_loss": 0.8044291138648987, "eval_runtime": 1271.074, "eval_samples_per_second": 28.315, "eval_steps_per_second": 14.158, "step": 80000 }, { "acc": 0.79017296, "epoch": 1.8666938858834337, "grad_norm": 5.59375, "learning_rate": 1.1598595136014745e-07, "loss": 0.75285716, "memory(GiB)": 138.1, "step": 80010, "train_speed(iter/s)": 0.200465 }, { "acc": 0.76734552, "epoch": 1.8669271934557226, "grad_norm": 5.46875, "learning_rate": 1.1558176413808519e-07, "loss": 0.864007, "memory(GiB)": 138.1, "step": 80020, "train_speed(iter/s)": 0.200477 }, { "acc": 0.76343198, "epoch": 1.8671605010280115, "grad_norm": 5.34375, "learning_rate": 1.1517827416530736e-07, "loss": 0.8714529, "memory(GiB)": 138.1, "step": 80030, "train_speed(iter/s)": 0.200489 }, { "acc": 0.7741498, "epoch": 1.8673938086003004, "grad_norm": 4.0625, "learning_rate": 1.1477548149941176e-07, "loss": 0.79983683, "memory(GiB)": 138.1, "step": 80040, "train_speed(iter/s)": 0.200503 }, { "acc": 0.79464846, "epoch": 1.8676271161725893, "grad_norm": 5.71875, "learning_rate": 1.143733861978974e-07, "loss": 0.72727637, "memory(GiB)": 138.1, "step": 80050, "train_speed(iter/s)": 0.200516 }, { "acc": 0.7844614, "epoch": 1.8678604237448782, "grad_norm": 4.5, "learning_rate": 1.1397198831816226e-07, "loss": 0.73518605, "memory(GiB)": 138.1, "step": 80060, "train_speed(iter/s)": 0.200529 }, { "acc": 0.78245525, "epoch": 1.868093731317167, "grad_norm": 5.3125, "learning_rate": 1.1357128791750716e-07, "loss": 0.78568835, "memory(GiB)": 138.1, "step": 80070, "train_speed(iter/s)": 0.200543 }, { "acc": 0.79324665, "epoch": 1.868327038889456, "grad_norm": 5.28125, "learning_rate": 1.1317128505313024e-07, "loss": 0.75692196, "memory(GiB)": 138.1, "step": 80080, "train_speed(iter/s)": 0.200556 }, { "acc": 0.79333744, "epoch": 1.8685603464617448, "grad_norm": 5.6875, "learning_rate": 1.1277197978213362e-07, "loss": 0.71830125, "memory(GiB)": 138.1, "step": 80090, "train_speed(iter/s)": 0.200569 }, { "acc": 0.75221357, "epoch": 1.8687936540340337, "grad_norm": 5.5625, "learning_rate": 1.1237337216151723e-07, "loss": 0.89948711, "memory(GiB)": 138.1, "step": 80100, "train_speed(iter/s)": 0.200582 }, { "acc": 0.77064877, "epoch": 1.8690269616063226, "grad_norm": 6.15625, "learning_rate": 1.1197546224818112e-07, "loss": 0.82701855, "memory(GiB)": 138.1, "step": 80110, "train_speed(iter/s)": 0.200595 }, { "acc": 0.79913635, "epoch": 1.8692602691786115, "grad_norm": 5.125, "learning_rate": 1.1157825009892931e-07, "loss": 0.71697931, "memory(GiB)": 138.1, "step": 80120, "train_speed(iter/s)": 0.200608 }, { "acc": 0.76889296, "epoch": 1.8694935767509004, "grad_norm": 6.0, "learning_rate": 1.1118173577046088e-07, "loss": 0.82190304, "memory(GiB)": 138.1, "step": 80130, "train_speed(iter/s)": 0.200621 }, { "acc": 0.78341179, "epoch": 1.8697268843231893, "grad_norm": 5.15625, "learning_rate": 1.1078591931937999e-07, "loss": 0.77279377, "memory(GiB)": 138.1, "step": 80140, "train_speed(iter/s)": 0.200633 }, { "acc": 0.78050628, "epoch": 1.8699601918954782, "grad_norm": 5.25, "learning_rate": 1.1039080080218811e-07, "loss": 0.80319948, "memory(GiB)": 138.1, "step": 80150, "train_speed(iter/s)": 0.200646 }, { "acc": 0.78934498, "epoch": 1.8701934994677671, "grad_norm": 7.625, "learning_rate": 1.0999638027528959e-07, "loss": 0.74584684, "memory(GiB)": 138.1, "step": 80160, "train_speed(iter/s)": 0.200659 }, { "acc": 0.78847446, "epoch": 1.870426807040056, "grad_norm": 4.8125, "learning_rate": 1.0960265779498769e-07, "loss": 0.73895617, "memory(GiB)": 138.1, "step": 80170, "train_speed(iter/s)": 0.200671 }, { "acc": 0.77396088, "epoch": 1.870660114612345, "grad_norm": 5.4375, "learning_rate": 1.092096334174847e-07, "loss": 0.81397905, "memory(GiB)": 138.1, "step": 80180, "train_speed(iter/s)": 0.200684 }, { "acc": 0.78955622, "epoch": 1.8708934221846336, "grad_norm": 5.21875, "learning_rate": 1.0881730719888628e-07, "loss": 0.76100597, "memory(GiB)": 138.1, "step": 80190, "train_speed(iter/s)": 0.200696 }, { "acc": 0.80156097, "epoch": 1.8711267297569227, "grad_norm": 3.828125, "learning_rate": 1.0842567919519597e-07, "loss": 0.69538174, "memory(GiB)": 138.1, "step": 80200, "train_speed(iter/s)": 0.200709 }, { "acc": 0.78851342, "epoch": 1.8713600373292114, "grad_norm": 6.34375, "learning_rate": 1.0803474946231963e-07, "loss": 0.76109829, "memory(GiB)": 138.1, "step": 80210, "train_speed(iter/s)": 0.200721 }, { "acc": 0.78874664, "epoch": 1.8715933449015005, "grad_norm": 5.71875, "learning_rate": 1.0764451805606091e-07, "loss": 0.73757567, "memory(GiB)": 138.1, "step": 80220, "train_speed(iter/s)": 0.200735 }, { "acc": 0.77023153, "epoch": 1.8718266524737892, "grad_norm": 3.984375, "learning_rate": 1.0725498503212694e-07, "loss": 0.81957588, "memory(GiB)": 138.1, "step": 80230, "train_speed(iter/s)": 0.200748 }, { "acc": 0.77765932, "epoch": 1.8720599600460783, "grad_norm": 6.6875, "learning_rate": 1.0686615044612159e-07, "loss": 0.77763338, "memory(GiB)": 138.1, "step": 80240, "train_speed(iter/s)": 0.20076 }, { "acc": 0.7891181, "epoch": 1.872293267618367, "grad_norm": 4.84375, "learning_rate": 1.0647801435355264e-07, "loss": 0.75162325, "memory(GiB)": 138.1, "step": 80250, "train_speed(iter/s)": 0.200774 }, { "acc": 0.76905499, "epoch": 1.8725265751906561, "grad_norm": 4.5625, "learning_rate": 1.0609057680982527e-07, "loss": 0.82668858, "memory(GiB)": 138.1, "step": 80260, "train_speed(iter/s)": 0.200787 }, { "acc": 0.78598719, "epoch": 1.8727598827629448, "grad_norm": 4.9375, "learning_rate": 1.0570383787024574e-07, "loss": 0.76735053, "memory(GiB)": 138.1, "step": 80270, "train_speed(iter/s)": 0.200798 }, { "acc": 0.78083096, "epoch": 1.872993190335234, "grad_norm": 5.46875, "learning_rate": 1.0531779759002214e-07, "loss": 0.79012175, "memory(GiB)": 138.1, "step": 80280, "train_speed(iter/s)": 0.200811 }, { "acc": 0.7970469, "epoch": 1.8732264979075226, "grad_norm": 6.25, "learning_rate": 1.0493245602426095e-07, "loss": 0.71222925, "memory(GiB)": 138.1, "step": 80290, "train_speed(iter/s)": 0.200823 }, { "acc": 0.78838396, "epoch": 1.8734598054798117, "grad_norm": 6.0625, "learning_rate": 1.0454781322796981e-07, "loss": 0.74144182, "memory(GiB)": 138.1, "step": 80300, "train_speed(iter/s)": 0.200836 }, { "acc": 0.80160275, "epoch": 1.8736931130521004, "grad_norm": 4.78125, "learning_rate": 1.0416386925605592e-07, "loss": 0.71015835, "memory(GiB)": 138.1, "step": 80310, "train_speed(iter/s)": 0.200849 }, { "acc": 0.80436602, "epoch": 1.8739264206243895, "grad_norm": 3.828125, "learning_rate": 1.0378062416332712e-07, "loss": 0.70317974, "memory(GiB)": 138.1, "step": 80320, "train_speed(iter/s)": 0.200862 }, { "acc": 0.7857511, "epoch": 1.8741597281966782, "grad_norm": 4.625, "learning_rate": 1.0339807800449241e-07, "loss": 0.76099644, "memory(GiB)": 138.1, "step": 80330, "train_speed(iter/s)": 0.200875 }, { "acc": 0.77934189, "epoch": 1.8743930357689673, "grad_norm": 5.90625, "learning_rate": 1.0301623083415924e-07, "loss": 0.79090543, "memory(GiB)": 138.1, "step": 80340, "train_speed(iter/s)": 0.200887 }, { "acc": 0.79418821, "epoch": 1.874626343341256, "grad_norm": 5.65625, "learning_rate": 1.0263508270683731e-07, "loss": 0.73172894, "memory(GiB)": 138.1, "step": 80350, "train_speed(iter/s)": 0.200899 }, { "acc": 0.76580453, "epoch": 1.8748596509135451, "grad_norm": 5.71875, "learning_rate": 1.0225463367693367e-07, "loss": 0.84161091, "memory(GiB)": 138.1, "step": 80360, "train_speed(iter/s)": 0.200912 }, { "acc": 0.80100813, "epoch": 1.8750929584858338, "grad_norm": 5.4375, "learning_rate": 1.0187488379875876e-07, "loss": 0.71097794, "memory(GiB)": 138.1, "step": 80370, "train_speed(iter/s)": 0.200925 }, { "acc": 0.79370942, "epoch": 1.875326266058123, "grad_norm": 6.65625, "learning_rate": 1.0149583312652089e-07, "loss": 0.73821492, "memory(GiB)": 138.1, "step": 80380, "train_speed(iter/s)": 0.200938 }, { "acc": 0.79008064, "epoch": 1.8755595736304116, "grad_norm": 6.1875, "learning_rate": 1.0111748171433067e-07, "loss": 0.76331358, "memory(GiB)": 138.1, "step": 80390, "train_speed(iter/s)": 0.20095 }, { "acc": 0.76334047, "epoch": 1.8757928812027005, "grad_norm": 6.1875, "learning_rate": 1.00739829616196e-07, "loss": 0.85111656, "memory(GiB)": 138.1, "step": 80400, "train_speed(iter/s)": 0.200962 }, { "acc": 0.78283181, "epoch": 1.8760261887749894, "grad_norm": 4.78125, "learning_rate": 1.0036287688602764e-07, "loss": 0.76812816, "memory(GiB)": 138.1, "step": 80410, "train_speed(iter/s)": 0.200976 }, { "acc": 0.76459651, "epoch": 1.8762594963472783, "grad_norm": 5.71875, "learning_rate": 9.998662357763534e-08, "loss": 0.85813875, "memory(GiB)": 138.1, "step": 80420, "train_speed(iter/s)": 0.200988 }, { "acc": 0.77645798, "epoch": 1.8764928039195672, "grad_norm": 4.84375, "learning_rate": 9.961106974472834e-08, "loss": 0.80161037, "memory(GiB)": 138.1, "step": 80430, "train_speed(iter/s)": 0.201 }, { "acc": 0.77092047, "epoch": 1.876726111491856, "grad_norm": 5.625, "learning_rate": 9.923621544091877e-08, "loss": 0.8250824, "memory(GiB)": 138.1, "step": 80440, "train_speed(iter/s)": 0.201013 }, { "acc": 0.79298916, "epoch": 1.876959419064145, "grad_norm": 6.1875, "learning_rate": 9.886206071971493e-08, "loss": 0.75579348, "memory(GiB)": 138.1, "step": 80450, "train_speed(iter/s)": 0.201025 }, { "acc": 0.77953978, "epoch": 1.8771927266364339, "grad_norm": 7.0625, "learning_rate": 9.848860563452855e-08, "loss": 0.78975258, "memory(GiB)": 138.1, "step": 80460, "train_speed(iter/s)": 0.201037 }, { "acc": 0.78567953, "epoch": 1.8774260342087228, "grad_norm": 3.96875, "learning_rate": 9.811585023866976e-08, "loss": 0.76402936, "memory(GiB)": 138.1, "step": 80470, "train_speed(iter/s)": 0.20105 }, { "acc": 0.78225698, "epoch": 1.8776593417810117, "grad_norm": 4.875, "learning_rate": 9.774379458534933e-08, "loss": 0.78416901, "memory(GiB)": 138.1, "step": 80480, "train_speed(iter/s)": 0.201063 }, { "acc": 0.77153597, "epoch": 1.8778926493533006, "grad_norm": 5.28125, "learning_rate": 9.73724387276781e-08, "loss": 0.84029179, "memory(GiB)": 138.1, "step": 80490, "train_speed(iter/s)": 0.201075 }, { "acc": 0.77242804, "epoch": 1.8781259569255895, "grad_norm": 5.1875, "learning_rate": 9.700178271866645e-08, "loss": 0.81851034, "memory(GiB)": 138.1, "step": 80500, "train_speed(iter/s)": 0.201087 }, { "epoch": 1.8781259569255895, "eval_acc": 0.7446892569323635, "eval_loss": 0.804384708404541, "eval_runtime": 1272.9616, "eval_samples_per_second": 28.273, "eval_steps_per_second": 14.137, "step": 80500 }, { "acc": 0.78566909, "epoch": 1.8783592644978784, "grad_norm": 5.0625, "learning_rate": 9.66318266112265e-08, "loss": 0.78130226, "memory(GiB)": 138.1, "step": 80510, "train_speed(iter/s)": 0.20045 }, { "acc": 0.78306484, "epoch": 1.8785925720701673, "grad_norm": 7.53125, "learning_rate": 9.626257045816879e-08, "loss": 0.77127929, "memory(GiB)": 138.1, "step": 80520, "train_speed(iter/s)": 0.200462 }, { "acc": 0.80443401, "epoch": 1.8788258796424562, "grad_norm": 4.59375, "learning_rate": 9.589401431220502e-08, "loss": 0.70631161, "memory(GiB)": 138.1, "step": 80530, "train_speed(iter/s)": 0.200474 }, { "acc": 0.78915844, "epoch": 1.879059187214745, "grad_norm": 5.4375, "learning_rate": 9.552615822594536e-08, "loss": 0.75501652, "memory(GiB)": 138.1, "step": 80540, "train_speed(iter/s)": 0.200486 }, { "acc": 0.77199645, "epoch": 1.879292494787034, "grad_norm": 7.125, "learning_rate": 9.515900225190222e-08, "loss": 0.8397831, "memory(GiB)": 138.1, "step": 80550, "train_speed(iter/s)": 0.200499 }, { "acc": 0.77649212, "epoch": 1.8795258023593229, "grad_norm": 8.0, "learning_rate": 9.479254644248648e-08, "loss": 0.80428057, "memory(GiB)": 138.1, "step": 80560, "train_speed(iter/s)": 0.200511 }, { "acc": 0.79153538, "epoch": 1.8797591099316118, "grad_norm": 5.5625, "learning_rate": 9.442679085000961e-08, "loss": 0.74753222, "memory(GiB)": 138.1, "step": 80570, "train_speed(iter/s)": 0.200524 }, { "acc": 0.77919335, "epoch": 1.8799924175039004, "grad_norm": 4.9375, "learning_rate": 9.40617355266843e-08, "loss": 0.80626259, "memory(GiB)": 138.1, "step": 80580, "train_speed(iter/s)": 0.200538 }, { "acc": 0.76336107, "epoch": 1.8802257250761896, "grad_norm": 5.1875, "learning_rate": 9.369738052461996e-08, "loss": 0.86293964, "memory(GiB)": 138.1, "step": 80590, "train_speed(iter/s)": 0.20055 }, { "acc": 0.79171991, "epoch": 1.8804590326484782, "grad_norm": 23.5, "learning_rate": 9.333372589583e-08, "loss": 0.75359583, "memory(GiB)": 138.1, "step": 80600, "train_speed(iter/s)": 0.200563 }, { "acc": 0.79271598, "epoch": 1.8806923402207674, "grad_norm": 8.5, "learning_rate": 9.297077169222513e-08, "loss": 0.74482245, "memory(GiB)": 138.1, "step": 80610, "train_speed(iter/s)": 0.200576 }, { "acc": 0.77063508, "epoch": 1.880925647793056, "grad_norm": 4.40625, "learning_rate": 9.260851796561609e-08, "loss": 0.84915276, "memory(GiB)": 138.1, "step": 80620, "train_speed(iter/s)": 0.20059 }, { "acc": 0.79158163, "epoch": 1.8811589553653452, "grad_norm": 3.96875, "learning_rate": 9.224696476771655e-08, "loss": 0.78680096, "memory(GiB)": 138.1, "step": 80630, "train_speed(iter/s)": 0.200602 }, { "acc": 0.77896152, "epoch": 1.8813922629376338, "grad_norm": 5.6875, "learning_rate": 9.188611215013631e-08, "loss": 0.79821224, "memory(GiB)": 138.1, "step": 80640, "train_speed(iter/s)": 0.200614 }, { "acc": 0.77518044, "epoch": 1.881625570509923, "grad_norm": 6.5, "learning_rate": 9.152596016438864e-08, "loss": 0.80904598, "memory(GiB)": 138.1, "step": 80650, "train_speed(iter/s)": 0.200627 }, { "acc": 0.7727581, "epoch": 1.8818588780822116, "grad_norm": 6.8125, "learning_rate": 9.11665088618835e-08, "loss": 0.83627472, "memory(GiB)": 138.1, "step": 80660, "train_speed(iter/s)": 0.20064 }, { "acc": 0.79154997, "epoch": 1.8820921856545008, "grad_norm": 4.25, "learning_rate": 9.080775829393373e-08, "loss": 0.75480595, "memory(GiB)": 138.1, "step": 80670, "train_speed(iter/s)": 0.200653 }, { "acc": 0.78853526, "epoch": 1.8823254932267894, "grad_norm": 6.28125, "learning_rate": 9.044970851175006e-08, "loss": 0.75794249, "memory(GiB)": 138.1, "step": 80680, "train_speed(iter/s)": 0.200665 }, { "acc": 0.77822475, "epoch": 1.8825588007990786, "grad_norm": 4.6875, "learning_rate": 9.009235956644491e-08, "loss": 0.79094992, "memory(GiB)": 138.1, "step": 80690, "train_speed(iter/s)": 0.200677 }, { "acc": 0.7686842, "epoch": 1.8827921083713672, "grad_norm": 4.75, "learning_rate": 8.97357115090286e-08, "loss": 0.84376907, "memory(GiB)": 138.1, "step": 80700, "train_speed(iter/s)": 0.20069 }, { "acc": 0.7691422, "epoch": 1.8830254159436564, "grad_norm": 5.625, "learning_rate": 8.937976439041263e-08, "loss": 0.83207283, "memory(GiB)": 138.1, "step": 80710, "train_speed(iter/s)": 0.200702 }, { "acc": 0.78920188, "epoch": 1.883258723515945, "grad_norm": 4.90625, "learning_rate": 8.902451826140911e-08, "loss": 0.74556618, "memory(GiB)": 138.1, "step": 80720, "train_speed(iter/s)": 0.200715 }, { "acc": 0.78106089, "epoch": 1.8834920310882342, "grad_norm": 6.59375, "learning_rate": 8.866997317272863e-08, "loss": 0.78032007, "memory(GiB)": 138.1, "step": 80730, "train_speed(iter/s)": 0.200726 }, { "acc": 0.80264349, "epoch": 1.8837253386605228, "grad_norm": 7.53125, "learning_rate": 8.831612917498288e-08, "loss": 0.69003105, "memory(GiB)": 138.1, "step": 80740, "train_speed(iter/s)": 0.200739 }, { "acc": 0.78006659, "epoch": 1.883958646232812, "grad_norm": 4.78125, "learning_rate": 8.796298631868317e-08, "loss": 0.79409275, "memory(GiB)": 138.1, "step": 80750, "train_speed(iter/s)": 0.200752 }, { "acc": 0.77235012, "epoch": 1.8841919538051006, "grad_norm": 6.09375, "learning_rate": 8.761054465423969e-08, "loss": 0.83254261, "memory(GiB)": 138.1, "step": 80760, "train_speed(iter/s)": 0.200764 }, { "acc": 0.79439788, "epoch": 1.8844252613773895, "grad_norm": 4.875, "learning_rate": 8.725880423196442e-08, "loss": 0.75280304, "memory(GiB)": 138.1, "step": 80770, "train_speed(iter/s)": 0.200777 }, { "acc": 0.7835988, "epoch": 1.8846585689496784, "grad_norm": 4.375, "learning_rate": 8.690776510206723e-08, "loss": 0.77280979, "memory(GiB)": 138.1, "step": 80780, "train_speed(iter/s)": 0.20079 }, { "acc": 0.79621954, "epoch": 1.8848918765219673, "grad_norm": 4.9375, "learning_rate": 8.655742731465966e-08, "loss": 0.74539442, "memory(GiB)": 138.1, "step": 80790, "train_speed(iter/s)": 0.200803 }, { "acc": 0.77132044, "epoch": 1.8851251840942562, "grad_norm": 6.1875, "learning_rate": 8.62077909197523e-08, "loss": 0.80378342, "memory(GiB)": 138.1, "step": 80800, "train_speed(iter/s)": 0.200816 }, { "acc": 0.79714594, "epoch": 1.8853584916665451, "grad_norm": 5.78125, "learning_rate": 8.585885596725518e-08, "loss": 0.71226463, "memory(GiB)": 138.1, "step": 80810, "train_speed(iter/s)": 0.200829 }, { "acc": 0.76155009, "epoch": 1.885591799238834, "grad_norm": 9.1875, "learning_rate": 8.551062250697795e-08, "loss": 0.89277039, "memory(GiB)": 138.1, "step": 80820, "train_speed(iter/s)": 0.200842 }, { "acc": 0.79220524, "epoch": 1.885825106811123, "grad_norm": 3.609375, "learning_rate": 8.516309058863249e-08, "loss": 0.72857275, "memory(GiB)": 138.1, "step": 80830, "train_speed(iter/s)": 0.200855 }, { "acc": 0.77188721, "epoch": 1.8860584143834118, "grad_norm": 4.71875, "learning_rate": 8.481626026182798e-08, "loss": 0.84043217, "memory(GiB)": 138.1, "step": 80840, "train_speed(iter/s)": 0.200869 }, { "acc": 0.79511795, "epoch": 1.8862917219557007, "grad_norm": 5.8125, "learning_rate": 8.447013157607431e-08, "loss": 0.7310348, "memory(GiB)": 138.1, "step": 80850, "train_speed(iter/s)": 0.200881 }, { "acc": 0.79568253, "epoch": 1.8865250295279896, "grad_norm": 6.03125, "learning_rate": 8.412470458078137e-08, "loss": 0.72156382, "memory(GiB)": 138.1, "step": 80860, "train_speed(iter/s)": 0.200894 }, { "acc": 0.75479288, "epoch": 1.8867583371002785, "grad_norm": 8.25, "learning_rate": 8.37799793252586e-08, "loss": 0.89089012, "memory(GiB)": 138.1, "step": 80870, "train_speed(iter/s)": 0.200906 }, { "acc": 0.79052868, "epoch": 1.8869916446725674, "grad_norm": 5.53125, "learning_rate": 8.343595585871611e-08, "loss": 0.74636126, "memory(GiB)": 138.1, "step": 80880, "train_speed(iter/s)": 0.200919 }, { "acc": 0.77063046, "epoch": 1.8872249522448563, "grad_norm": 4.78125, "learning_rate": 8.309263423026237e-08, "loss": 0.81843338, "memory(GiB)": 138.1, "step": 80890, "train_speed(iter/s)": 0.200933 }, { "acc": 0.77630882, "epoch": 1.8874582598171452, "grad_norm": 5.8125, "learning_rate": 8.275001448890651e-08, "loss": 0.81249247, "memory(GiB)": 138.1, "step": 80900, "train_speed(iter/s)": 0.200945 }, { "acc": 0.75715265, "epoch": 1.887691567389434, "grad_norm": 5.65625, "learning_rate": 8.240809668355776e-08, "loss": 0.88787174, "memory(GiB)": 138.1, "step": 80910, "train_speed(iter/s)": 0.200959 }, { "acc": 0.7806591, "epoch": 1.887924874961723, "grad_norm": 6.15625, "learning_rate": 8.206688086302483e-08, "loss": 0.80691376, "memory(GiB)": 138.1, "step": 80920, "train_speed(iter/s)": 0.200972 }, { "acc": 0.78252301, "epoch": 1.888158182534012, "grad_norm": 5.25, "learning_rate": 8.1726367076016e-08, "loss": 0.78188124, "memory(GiB)": 138.1, "step": 80930, "train_speed(iter/s)": 0.200985 }, { "acc": 0.77645779, "epoch": 1.8883914901063008, "grad_norm": 5.90625, "learning_rate": 8.138655537113904e-08, "loss": 0.82004871, "memory(GiB)": 138.1, "step": 80940, "train_speed(iter/s)": 0.200998 }, { "acc": 0.79138808, "epoch": 1.8886247976785897, "grad_norm": 4.375, "learning_rate": 8.104744579690294e-08, "loss": 0.75213003, "memory(GiB)": 138.1, "step": 80950, "train_speed(iter/s)": 0.20101 }, { "acc": 0.77159157, "epoch": 1.8888581052508786, "grad_norm": 5.15625, "learning_rate": 8.070903840171451e-08, "loss": 0.84294567, "memory(GiB)": 138.1, "step": 80960, "train_speed(iter/s)": 0.201022 }, { "acc": 0.77069464, "epoch": 1.8890914128231673, "grad_norm": 7.9375, "learning_rate": 8.037133323388236e-08, "loss": 0.82081795, "memory(GiB)": 138.1, "step": 80970, "train_speed(iter/s)": 0.201035 }, { "acc": 0.79293261, "epoch": 1.8893247203954564, "grad_norm": 5.75, "learning_rate": 8.00343303416129e-08, "loss": 0.74969621, "memory(GiB)": 138.1, "step": 80980, "train_speed(iter/s)": 0.201049 }, { "acc": 0.78427496, "epoch": 1.889558027967745, "grad_norm": 6.34375, "learning_rate": 7.969802977301322e-08, "loss": 0.77531166, "memory(GiB)": 138.1, "step": 80990, "train_speed(iter/s)": 0.201062 }, { "acc": 0.77599058, "epoch": 1.8897913355400342, "grad_norm": 5.4375, "learning_rate": 7.936243157609103e-08, "loss": 0.79406672, "memory(GiB)": 138.1, "step": 81000, "train_speed(iter/s)": 0.201076 }, { "epoch": 1.8897913355400342, "eval_acc": 0.7446983851536048, "eval_loss": 0.804445207118988, "eval_runtime": 1271.5518, "eval_samples_per_second": 28.305, "eval_steps_per_second": 14.153, "step": 81000 }, { "acc": 0.7871388, "epoch": 1.8900246431123229, "grad_norm": 5.40625, "learning_rate": 7.902753579875189e-08, "loss": 0.78215551, "memory(GiB)": 138.1, "step": 81010, "train_speed(iter/s)": 0.200443 }, { "acc": 0.76717043, "epoch": 1.890257950684612, "grad_norm": 11.125, "learning_rate": 7.869334248880256e-08, "loss": 0.82725344, "memory(GiB)": 138.1, "step": 81020, "train_speed(iter/s)": 0.200456 }, { "acc": 0.77815256, "epoch": 1.8904912582569007, "grad_norm": 5.96875, "learning_rate": 7.835985169394933e-08, "loss": 0.7883893, "memory(GiB)": 138.1, "step": 81030, "train_speed(iter/s)": 0.200469 }, { "acc": 0.76834126, "epoch": 1.8907245658291898, "grad_norm": 4.8125, "learning_rate": 7.802706346179744e-08, "loss": 0.84197254, "memory(GiB)": 138.1, "step": 81040, "train_speed(iter/s)": 0.200482 }, { "acc": 0.7763195, "epoch": 1.8909578734014785, "grad_norm": 5.53125, "learning_rate": 7.769497783985224e-08, "loss": 0.81596165, "memory(GiB)": 138.1, "step": 81050, "train_speed(iter/s)": 0.200494 }, { "acc": 0.77682486, "epoch": 1.8911911809737676, "grad_norm": 4.71875, "learning_rate": 7.736359487551859e-08, "loss": 0.80225954, "memory(GiB)": 138.1, "step": 81060, "train_speed(iter/s)": 0.200507 }, { "acc": 0.77334614, "epoch": 1.8914244885460563, "grad_norm": 5.71875, "learning_rate": 7.703291461610252e-08, "loss": 0.81839981, "memory(GiB)": 138.1, "step": 81070, "train_speed(iter/s)": 0.20052 }, { "acc": 0.76000309, "epoch": 1.8916577961183454, "grad_norm": 7.65625, "learning_rate": 7.670293710880683e-08, "loss": 0.86824694, "memory(GiB)": 138.1, "step": 81080, "train_speed(iter/s)": 0.200533 }, { "acc": 0.75734501, "epoch": 1.891891103690634, "grad_norm": 4.4375, "learning_rate": 7.637366240073717e-08, "loss": 0.89891548, "memory(GiB)": 138.1, "step": 81090, "train_speed(iter/s)": 0.200546 }, { "acc": 0.78540058, "epoch": 1.8921244112629232, "grad_norm": 4.4375, "learning_rate": 7.604509053889708e-08, "loss": 0.7534976, "memory(GiB)": 138.1, "step": 81100, "train_speed(iter/s)": 0.200559 }, { "acc": 0.76539426, "epoch": 1.8923577188352119, "grad_norm": 6.15625, "learning_rate": 7.571722157018957e-08, "loss": 0.84462833, "memory(GiB)": 138.1, "step": 81110, "train_speed(iter/s)": 0.200571 }, { "acc": 0.79330382, "epoch": 1.892591026407501, "grad_norm": 4.5625, "learning_rate": 7.539005554141831e-08, "loss": 0.72500844, "memory(GiB)": 138.1, "step": 81120, "train_speed(iter/s)": 0.200584 }, { "acc": 0.80007896, "epoch": 1.8928243339797897, "grad_norm": 3.875, "learning_rate": 7.506359249928542e-08, "loss": 0.71526747, "memory(GiB)": 138.1, "step": 81130, "train_speed(iter/s)": 0.200596 }, { "acc": 0.77556849, "epoch": 1.8930576415520788, "grad_norm": 5.8125, "learning_rate": 7.473783249039468e-08, "loss": 0.80747452, "memory(GiB)": 138.1, "step": 81140, "train_speed(iter/s)": 0.200609 }, { "acc": 0.78302164, "epoch": 1.8932909491243675, "grad_norm": 5.8125, "learning_rate": 7.441277556124781e-08, "loss": 0.77237792, "memory(GiB)": 138.1, "step": 81150, "train_speed(iter/s)": 0.200622 }, { "acc": 0.78645997, "epoch": 1.8935242566966564, "grad_norm": 5.84375, "learning_rate": 7.408842175824604e-08, "loss": 0.76683159, "memory(GiB)": 138.1, "step": 81160, "train_speed(iter/s)": 0.200635 }, { "acc": 0.78576627, "epoch": 1.8937575642689453, "grad_norm": 5.28125, "learning_rate": 7.376477112769064e-08, "loss": 0.77226114, "memory(GiB)": 138.1, "step": 81170, "train_speed(iter/s)": 0.200648 }, { "acc": 0.77925673, "epoch": 1.8939908718412342, "grad_norm": 5.0, "learning_rate": 7.344182371578356e-08, "loss": 0.79336381, "memory(GiB)": 138.1, "step": 81180, "train_speed(iter/s)": 0.200661 }, { "acc": 0.79144859, "epoch": 1.894224179413523, "grad_norm": 4.46875, "learning_rate": 7.311957956862459e-08, "loss": 0.73583159, "memory(GiB)": 138.1, "step": 81190, "train_speed(iter/s)": 0.200674 }, { "acc": 0.7666995, "epoch": 1.894457486985812, "grad_norm": 5.5625, "learning_rate": 7.279803873221469e-08, "loss": 0.84476967, "memory(GiB)": 138.1, "step": 81200, "train_speed(iter/s)": 0.200687 }, { "acc": 0.80386267, "epoch": 1.8946907945581009, "grad_norm": 5.375, "learning_rate": 7.247720125245328e-08, "loss": 0.68902006, "memory(GiB)": 138.1, "step": 81210, "train_speed(iter/s)": 0.2007 }, { "acc": 0.79350948, "epoch": 1.8949241021303898, "grad_norm": 5.8125, "learning_rate": 7.215706717513982e-08, "loss": 0.72404013, "memory(GiB)": 138.1, "step": 81220, "train_speed(iter/s)": 0.200712 }, { "acc": 0.76372366, "epoch": 1.8951574097026787, "grad_norm": 5.375, "learning_rate": 7.183763654597387e-08, "loss": 0.8393321, "memory(GiB)": 138.1, "step": 81230, "train_speed(iter/s)": 0.200725 }, { "acc": 0.79516706, "epoch": 1.8953907172749676, "grad_norm": 6.53125, "learning_rate": 7.15189094105534e-08, "loss": 0.72492518, "memory(GiB)": 138.1, "step": 81240, "train_speed(iter/s)": 0.200739 }, { "acc": 0.76616926, "epoch": 1.8956240248472565, "grad_norm": 4.40625, "learning_rate": 7.120088581437645e-08, "loss": 0.85155525, "memory(GiB)": 138.1, "step": 81250, "train_speed(iter/s)": 0.200751 }, { "acc": 0.79208398, "epoch": 1.8958573324195453, "grad_norm": 4.1875, "learning_rate": 7.088356580284228e-08, "loss": 0.73564825, "memory(GiB)": 138.1, "step": 81260, "train_speed(iter/s)": 0.200763 }, { "acc": 0.77262831, "epoch": 1.8960906399918342, "grad_norm": 5.5625, "learning_rate": 7.05669494212463e-08, "loss": 0.818857, "memory(GiB)": 138.1, "step": 81270, "train_speed(iter/s)": 0.200776 }, { "acc": 0.77157354, "epoch": 1.8963239475641231, "grad_norm": 6.28125, "learning_rate": 7.025103671478684e-08, "loss": 0.81951456, "memory(GiB)": 138.1, "step": 81280, "train_speed(iter/s)": 0.200789 }, { "acc": 0.78546553, "epoch": 1.896557255136412, "grad_norm": 4.53125, "learning_rate": 6.993582772855889e-08, "loss": 0.77391062, "memory(GiB)": 138.1, "step": 81290, "train_speed(iter/s)": 0.200801 }, { "acc": 0.77268543, "epoch": 1.896790562708701, "grad_norm": 4.59375, "learning_rate": 6.962132250756037e-08, "loss": 0.82842484, "memory(GiB)": 138.1, "step": 81300, "train_speed(iter/s)": 0.200815 }, { "acc": 0.7767509, "epoch": 1.8970238702809898, "grad_norm": 5.53125, "learning_rate": 6.930752109668481e-08, "loss": 0.7966526, "memory(GiB)": 138.1, "step": 81310, "train_speed(iter/s)": 0.200828 }, { "acc": 0.7757802, "epoch": 1.8972571778532787, "grad_norm": 4.3125, "learning_rate": 6.899442354072916e-08, "loss": 0.79777894, "memory(GiB)": 138.1, "step": 81320, "train_speed(iter/s)": 0.20084 }, { "acc": 0.77078934, "epoch": 1.8974904854255676, "grad_norm": 5.84375, "learning_rate": 6.868202988438655e-08, "loss": 0.82341976, "memory(GiB)": 138.1, "step": 81330, "train_speed(iter/s)": 0.200852 }, { "acc": 0.78582072, "epoch": 1.8977237929978565, "grad_norm": 5.65625, "learning_rate": 6.837034017225186e-08, "loss": 0.78013291, "memory(GiB)": 138.1, "step": 81340, "train_speed(iter/s)": 0.200864 }, { "acc": 0.77980404, "epoch": 1.8979571005701454, "grad_norm": 4.96875, "learning_rate": 6.80593544488184e-08, "loss": 0.78744049, "memory(GiB)": 138.1, "step": 81350, "train_speed(iter/s)": 0.200877 }, { "acc": 0.77276716, "epoch": 1.8981904081424341, "grad_norm": 5.625, "learning_rate": 6.774907275847898e-08, "loss": 0.81919241, "memory(GiB)": 138.1, "step": 81360, "train_speed(iter/s)": 0.200889 }, { "acc": 0.76729136, "epoch": 1.8984237157147232, "grad_norm": 5.21875, "learning_rate": 6.743949514552706e-08, "loss": 0.83895359, "memory(GiB)": 138.1, "step": 81370, "train_speed(iter/s)": 0.200903 }, { "acc": 0.78235312, "epoch": 1.898657023287012, "grad_norm": 4.375, "learning_rate": 6.713062165415451e-08, "loss": 0.77152939, "memory(GiB)": 138.1, "step": 81380, "train_speed(iter/s)": 0.200916 }, { "acc": 0.79524784, "epoch": 1.898890330859301, "grad_norm": 7.625, "learning_rate": 6.682245232845219e-08, "loss": 0.74803662, "memory(GiB)": 138.1, "step": 81390, "train_speed(iter/s)": 0.200929 }, { "acc": 0.78636365, "epoch": 1.8991236384315897, "grad_norm": 6.09375, "learning_rate": 6.651498721241212e-08, "loss": 0.78164091, "memory(GiB)": 138.1, "step": 81400, "train_speed(iter/s)": 0.200941 }, { "acc": 0.7818541, "epoch": 1.8993569460038788, "grad_norm": 4.1875, "learning_rate": 6.620822634992419e-08, "loss": 0.79073544, "memory(GiB)": 138.1, "step": 81410, "train_speed(iter/s)": 0.200954 }, { "acc": 0.7828022, "epoch": 1.8995902535761675, "grad_norm": 5.71875, "learning_rate": 6.590216978477836e-08, "loss": 0.77449131, "memory(GiB)": 138.1, "step": 81420, "train_speed(iter/s)": 0.200964 }, { "acc": 0.78085957, "epoch": 1.8998235611484566, "grad_norm": 6.0625, "learning_rate": 6.559681756066471e-08, "loss": 0.7848876, "memory(GiB)": 138.1, "step": 81430, "train_speed(iter/s)": 0.200978 }, { "acc": 0.78292851, "epoch": 1.9000568687207453, "grad_norm": 5.15625, "learning_rate": 6.529216972117225e-08, "loss": 0.77457685, "memory(GiB)": 138.1, "step": 81440, "train_speed(iter/s)": 0.20099 }, { "acc": 0.76646495, "epoch": 1.9002901762930344, "grad_norm": 5.03125, "learning_rate": 6.49882263097884e-08, "loss": 0.8381773, "memory(GiB)": 138.1, "step": 81450, "train_speed(iter/s)": 0.201003 }, { "acc": 0.78961086, "epoch": 1.900523483865323, "grad_norm": 4.78125, "learning_rate": 6.468498736990181e-08, "loss": 0.73249388, "memory(GiB)": 138.1, "step": 81460, "train_speed(iter/s)": 0.201015 }, { "acc": 0.77733517, "epoch": 1.9007567914376122, "grad_norm": 5.0, "learning_rate": 6.438245294480006e-08, "loss": 0.78677988, "memory(GiB)": 138.1, "step": 81470, "train_speed(iter/s)": 0.201028 }, { "acc": 0.79435854, "epoch": 1.900990099009901, "grad_norm": 4.96875, "learning_rate": 6.40806230776686e-08, "loss": 0.71958585, "memory(GiB)": 138.1, "step": 81480, "train_speed(iter/s)": 0.20104 }, { "acc": 0.7977644, "epoch": 1.90122340658219, "grad_norm": 6.25, "learning_rate": 6.37794978115952e-08, "loss": 0.73467293, "memory(GiB)": 138.1, "step": 81490, "train_speed(iter/s)": 0.201052 }, { "acc": 0.78441677, "epoch": 1.9014567141544787, "grad_norm": 7.0625, "learning_rate": 6.347907718956381e-08, "loss": 0.77021914, "memory(GiB)": 138.1, "step": 81500, "train_speed(iter/s)": 0.201064 }, { "epoch": 1.9014567141544787, "eval_acc": 0.7447350581828017, "eval_loss": 0.8044015765190125, "eval_runtime": 1270.7097, "eval_samples_per_second": 28.324, "eval_steps_per_second": 14.162, "step": 81500 }, { "acc": 0.78291373, "epoch": 1.9016900217267678, "grad_norm": 5.34375, "learning_rate": 6.317936125446012e-08, "loss": 0.78251152, "memory(GiB)": 138.1, "step": 81510, "train_speed(iter/s)": 0.200436 }, { "acc": 0.76142359, "epoch": 1.9019233292990565, "grad_norm": 6.21875, "learning_rate": 6.288035004906878e-08, "loss": 0.87129774, "memory(GiB)": 138.1, "step": 81520, "train_speed(iter/s)": 0.200449 }, { "acc": 0.80396862, "epoch": 1.9021566368713456, "grad_norm": 6.1875, "learning_rate": 6.258204361607289e-08, "loss": 0.69574623, "memory(GiB)": 138.1, "step": 81530, "train_speed(iter/s)": 0.200461 }, { "acc": 0.78162894, "epoch": 1.9023899444436343, "grad_norm": 4.46875, "learning_rate": 6.228444199805617e-08, "loss": 0.77852755, "memory(GiB)": 138.1, "step": 81540, "train_speed(iter/s)": 0.200474 }, { "acc": 0.77957335, "epoch": 1.9026232520159232, "grad_norm": 4.875, "learning_rate": 6.198754523750072e-08, "loss": 0.80971165, "memory(GiB)": 138.1, "step": 81550, "train_speed(iter/s)": 0.200487 }, { "acc": 0.7833252, "epoch": 1.902856559588212, "grad_norm": 5.84375, "learning_rate": 6.169135337678878e-08, "loss": 0.77970705, "memory(GiB)": 138.1, "step": 81560, "train_speed(iter/s)": 0.2005 }, { "acc": 0.77797418, "epoch": 1.903089867160501, "grad_norm": 7.125, "learning_rate": 6.13958664582015e-08, "loss": 0.79811144, "memory(GiB)": 138.1, "step": 81570, "train_speed(iter/s)": 0.200513 }, { "acc": 0.77659426, "epoch": 1.90332317473279, "grad_norm": 5.9375, "learning_rate": 6.11010845239196e-08, "loss": 0.80996256, "memory(GiB)": 138.1, "step": 81580, "train_speed(iter/s)": 0.200525 }, { "acc": 0.79570951, "epoch": 1.9035564823050788, "grad_norm": 4.21875, "learning_rate": 6.080700761602331e-08, "loss": 0.72712016, "memory(GiB)": 138.1, "step": 81590, "train_speed(iter/s)": 0.200538 }, { "acc": 0.77094784, "epoch": 1.9037897898773677, "grad_norm": 5.1875, "learning_rate": 6.051363577649238e-08, "loss": 0.83303509, "memory(GiB)": 138.1, "step": 81600, "train_speed(iter/s)": 0.200551 }, { "acc": 0.77590933, "epoch": 1.9040230974496566, "grad_norm": 5.25, "learning_rate": 6.022096904720388e-08, "loss": 0.79160347, "memory(GiB)": 138.1, "step": 81610, "train_speed(iter/s)": 0.200562 }, { "acc": 0.77262087, "epoch": 1.9042564050219455, "grad_norm": 4.75, "learning_rate": 5.992900746993768e-08, "loss": 0.82788048, "memory(GiB)": 138.1, "step": 81620, "train_speed(iter/s)": 0.200575 }, { "acc": 0.77261982, "epoch": 1.9044897125942344, "grad_norm": 8.3125, "learning_rate": 5.963775108637048e-08, "loss": 0.83426991, "memory(GiB)": 138.1, "step": 81630, "train_speed(iter/s)": 0.200588 }, { "acc": 0.76702747, "epoch": 1.9047230201665233, "grad_norm": 5.75, "learning_rate": 5.9347199938079026e-08, "loss": 0.8470952, "memory(GiB)": 138.1, "step": 81640, "train_speed(iter/s)": 0.200601 }, { "acc": 0.77795296, "epoch": 1.9049563277388122, "grad_norm": 4.84375, "learning_rate": 5.9057354066539564e-08, "loss": 0.79301691, "memory(GiB)": 138.1, "step": 81650, "train_speed(iter/s)": 0.200613 }, { "acc": 0.77648511, "epoch": 1.905189635311101, "grad_norm": 5.40625, "learning_rate": 5.876821351312734e-08, "loss": 0.81472111, "memory(GiB)": 138.1, "step": 81660, "train_speed(iter/s)": 0.200626 }, { "acc": 0.78697462, "epoch": 1.90542294288339, "grad_norm": 5.625, "learning_rate": 5.8479778319117665e-08, "loss": 0.76727009, "memory(GiB)": 138.1, "step": 81670, "train_speed(iter/s)": 0.200639 }, { "acc": 0.76950393, "epoch": 1.9056562504556789, "grad_norm": 4.59375, "learning_rate": 5.819204852568372e-08, "loss": 0.82860451, "memory(GiB)": 138.1, "step": 81680, "train_speed(iter/s)": 0.200652 }, { "acc": 0.76736298, "epoch": 1.9058895580279678, "grad_norm": 6.875, "learning_rate": 5.790502417389876e-08, "loss": 0.82692204, "memory(GiB)": 138.1, "step": 81690, "train_speed(iter/s)": 0.200665 }, { "acc": 0.77684221, "epoch": 1.9061228656002567, "grad_norm": 5.15625, "learning_rate": 5.7618705304736676e-08, "loss": 0.79659696, "memory(GiB)": 138.1, "step": 81700, "train_speed(iter/s)": 0.200677 }, { "acc": 0.76629496, "epoch": 1.9063561731725456, "grad_norm": 4.25, "learning_rate": 5.733309195906811e-08, "loss": 0.8493578, "memory(GiB)": 138.1, "step": 81710, "train_speed(iter/s)": 0.20069 }, { "acc": 0.76272459, "epoch": 1.9065894807448345, "grad_norm": 5.40625, "learning_rate": 5.7048184177666e-08, "loss": 0.84202557, "memory(GiB)": 138.1, "step": 81720, "train_speed(iter/s)": 0.200703 }, { "acc": 0.77872486, "epoch": 1.9068227883171232, "grad_norm": 7.9375, "learning_rate": 5.676398200119837e-08, "loss": 0.78867388, "memory(GiB)": 138.1, "step": 81730, "train_speed(iter/s)": 0.200716 }, { "acc": 0.79240322, "epoch": 1.9070560958894123, "grad_norm": 4.65625, "learning_rate": 5.648048547023666e-08, "loss": 0.71631851, "memory(GiB)": 138.1, "step": 81740, "train_speed(iter/s)": 0.200729 }, { "acc": 0.77871118, "epoch": 1.907289403461701, "grad_norm": 5.1875, "learning_rate": 5.61976946252496e-08, "loss": 0.79847703, "memory(GiB)": 138.1, "step": 81750, "train_speed(iter/s)": 0.200742 }, { "acc": 0.77508669, "epoch": 1.90752271103399, "grad_norm": 4.71875, "learning_rate": 5.591560950660546e-08, "loss": 0.8124464, "memory(GiB)": 138.1, "step": 81760, "train_speed(iter/s)": 0.200755 }, { "acc": 0.79896374, "epoch": 1.9077560186062787, "grad_norm": 5.15625, "learning_rate": 5.563423015457203e-08, "loss": 0.72898488, "memory(GiB)": 138.1, "step": 81770, "train_speed(iter/s)": 0.200767 }, { "acc": 0.80807238, "epoch": 1.9079893261785679, "grad_norm": 4.5, "learning_rate": 5.535355660931552e-08, "loss": 0.68237019, "memory(GiB)": 138.1, "step": 81780, "train_speed(iter/s)": 0.200779 }, { "acc": 0.77174668, "epoch": 1.9082226337508565, "grad_norm": 4.6875, "learning_rate": 5.50735889109022e-08, "loss": 0.82769318, "memory(GiB)": 138.1, "step": 81790, "train_speed(iter/s)": 0.200792 }, { "acc": 0.75991964, "epoch": 1.9084559413231457, "grad_norm": 4.3125, "learning_rate": 5.4794327099297887e-08, "loss": 0.86439896, "memory(GiB)": 138.1, "step": 81800, "train_speed(iter/s)": 0.200805 }, { "acc": 0.76893587, "epoch": 1.9086892488954343, "grad_norm": 5.15625, "learning_rate": 5.45157712143668e-08, "loss": 0.82247448, "memory(GiB)": 138.1, "step": 81810, "train_speed(iter/s)": 0.200817 }, { "acc": 0.79243269, "epoch": 1.9089225564677235, "grad_norm": 5.03125, "learning_rate": 5.423792129587269e-08, "loss": 0.71618028, "memory(GiB)": 138.1, "step": 81820, "train_speed(iter/s)": 0.20083 }, { "acc": 0.79535136, "epoch": 1.9091558640400121, "grad_norm": 4.21875, "learning_rate": 5.396077738347882e-08, "loss": 0.74078579, "memory(GiB)": 138.1, "step": 81830, "train_speed(iter/s)": 0.200842 }, { "acc": 0.79126263, "epoch": 1.9093891716123013, "grad_norm": 5.90625, "learning_rate": 5.36843395167469e-08, "loss": 0.76003933, "memory(GiB)": 138.1, "step": 81840, "train_speed(iter/s)": 0.200855 }, { "acc": 0.78736057, "epoch": 1.90962247918459, "grad_norm": 5.1875, "learning_rate": 5.340860773513812e-08, "loss": 0.76994362, "memory(GiB)": 138.1, "step": 81850, "train_speed(iter/s)": 0.200867 }, { "acc": 0.77512932, "epoch": 1.909855786756879, "grad_norm": 6.625, "learning_rate": 5.313358207801433e-08, "loss": 0.81734104, "memory(GiB)": 138.1, "step": 81860, "train_speed(iter/s)": 0.20088 }, { "acc": 0.77062392, "epoch": 1.9100890943291677, "grad_norm": 5.1875, "learning_rate": 5.285926258463414e-08, "loss": 0.80548515, "memory(GiB)": 138.1, "step": 81870, "train_speed(iter/s)": 0.200893 }, { "acc": 0.78182507, "epoch": 1.9103224019014569, "grad_norm": 5.4375, "learning_rate": 5.2585649294157326e-08, "loss": 0.79319897, "memory(GiB)": 138.1, "step": 81880, "train_speed(iter/s)": 0.200904 }, { "acc": 0.78957319, "epoch": 1.9105557094737455, "grad_norm": 7.28125, "learning_rate": 5.231274224564154e-08, "loss": 0.7621007, "memory(GiB)": 138.1, "step": 81890, "train_speed(iter/s)": 0.200916 }, { "acc": 0.77407713, "epoch": 1.9107890170460347, "grad_norm": 5.0625, "learning_rate": 5.2040541478044496e-08, "loss": 0.82251482, "memory(GiB)": 138.1, "step": 81900, "train_speed(iter/s)": 0.200929 }, { "acc": 0.75633006, "epoch": 1.9110223246183233, "grad_norm": 6.5, "learning_rate": 5.176904703022345e-08, "loss": 0.87226925, "memory(GiB)": 138.1, "step": 81910, "train_speed(iter/s)": 0.200942 }, { "acc": 0.77395215, "epoch": 1.9112556321906125, "grad_norm": 6.0625, "learning_rate": 5.149825894093241e-08, "loss": 0.80833683, "memory(GiB)": 138.1, "step": 81920, "train_speed(iter/s)": 0.200955 }, { "acc": 0.76668892, "epoch": 1.9114889397629011, "grad_norm": 7.75, "learning_rate": 5.1228177248828224e-08, "loss": 0.82612123, "memory(GiB)": 138.1, "step": 81930, "train_speed(iter/s)": 0.200968 }, { "acc": 0.7853972, "epoch": 1.91172224733519, "grad_norm": 6.59375, "learning_rate": 5.0958801992463944e-08, "loss": 0.76109056, "memory(GiB)": 138.1, "step": 81940, "train_speed(iter/s)": 0.20098 }, { "acc": 0.78190913, "epoch": 1.911955554907479, "grad_norm": 8.0, "learning_rate": 5.069013321029326e-08, "loss": 0.7810389, "memory(GiB)": 138.1, "step": 81950, "train_speed(iter/s)": 0.200993 }, { "acc": 0.76182289, "epoch": 1.9121888624797678, "grad_norm": 5.125, "learning_rate": 5.0422170940667147e-08, "loss": 0.85948324, "memory(GiB)": 138.1, "step": 81960, "train_speed(iter/s)": 0.201006 }, { "acc": 0.78014874, "epoch": 1.9124221700520567, "grad_norm": 7.4375, "learning_rate": 5.015491522183946e-08, "loss": 0.78254461, "memory(GiB)": 138.1, "step": 81970, "train_speed(iter/s)": 0.201019 }, { "acc": 0.77584715, "epoch": 1.9126554776243456, "grad_norm": 6.25, "learning_rate": 4.9888366091959125e-08, "loss": 0.8201292, "memory(GiB)": 138.1, "step": 81980, "train_speed(iter/s)": 0.201031 }, { "acc": 0.76424618, "epoch": 1.9128887851966345, "grad_norm": 5.1875, "learning_rate": 4.962252358907627e-08, "loss": 0.85319223, "memory(GiB)": 138.1, "step": 81990, "train_speed(iter/s)": 0.201044 }, { "acc": 0.76576996, "epoch": 1.9131220927689234, "grad_norm": 5.125, "learning_rate": 4.935738775114052e-08, "loss": 0.84802933, "memory(GiB)": 138.1, "step": 82000, "train_speed(iter/s)": 0.201056 }, { "epoch": 1.9131220927689234, "eval_acc": 0.7447390617886093, "eval_loss": 0.8044153451919556, "eval_runtime": 1273.2096, "eval_samples_per_second": 28.268, "eval_steps_per_second": 14.134, "step": 82000 }, { "acc": 0.78968439, "epoch": 1.9133554003412123, "grad_norm": 5.59375, "learning_rate": 4.9092958615999385e-08, "loss": 0.77019253, "memory(GiB)": 138.1, "step": 82010, "train_speed(iter/s)": 0.20043 }, { "acc": 0.75527754, "epoch": 1.9135887079135012, "grad_norm": 14.875, "learning_rate": 4.882923622140046e-08, "loss": 0.87390118, "memory(GiB)": 138.1, "step": 82020, "train_speed(iter/s)": 0.200442 }, { "acc": 0.77405939, "epoch": 1.9138220154857901, "grad_norm": 5.5625, "learning_rate": 4.856622060498972e-08, "loss": 0.80371208, "memory(GiB)": 138.1, "step": 82030, "train_speed(iter/s)": 0.200455 }, { "acc": 0.78950992, "epoch": 1.914055323058079, "grad_norm": 3.859375, "learning_rate": 4.83039118043116e-08, "loss": 0.744206, "memory(GiB)": 138.1, "step": 82040, "train_speed(iter/s)": 0.200467 }, { "acc": 0.78136091, "epoch": 1.914288630630368, "grad_norm": 9.9375, "learning_rate": 4.80423098568128e-08, "loss": 0.781563, "memory(GiB)": 138.1, "step": 82050, "train_speed(iter/s)": 0.20048 }, { "acc": 0.76963954, "epoch": 1.9145219382026568, "grad_norm": 4.9375, "learning_rate": 4.7781414799835116e-08, "loss": 0.82037382, "memory(GiB)": 138.1, "step": 82060, "train_speed(iter/s)": 0.200492 }, { "acc": 0.80206232, "epoch": 1.9147552457749457, "grad_norm": 5.6875, "learning_rate": 4.752122667062209e-08, "loss": 0.70895433, "memory(GiB)": 138.1, "step": 82070, "train_speed(iter/s)": 0.200505 }, { "acc": 0.79812136, "epoch": 1.9149885533472346, "grad_norm": 4.40625, "learning_rate": 4.726174550631513e-08, "loss": 0.68837681, "memory(GiB)": 138.1, "step": 82080, "train_speed(iter/s)": 0.200516 }, { "acc": 0.78133001, "epoch": 1.9152218609195235, "grad_norm": 4.71875, "learning_rate": 4.7002971343955153e-08, "loss": 0.74567728, "memory(GiB)": 138.1, "step": 82090, "train_speed(iter/s)": 0.200529 }, { "acc": 0.80090256, "epoch": 1.9154551684918124, "grad_norm": 4.21875, "learning_rate": 4.6744904220482056e-08, "loss": 0.71008487, "memory(GiB)": 138.1, "step": 82100, "train_speed(iter/s)": 0.200542 }, { "acc": 0.7789093, "epoch": 1.9156884760641013, "grad_norm": 4.59375, "learning_rate": 4.648754417273526e-08, "loss": 0.78899159, "memory(GiB)": 138.1, "step": 82110, "train_speed(iter/s)": 0.200554 }, { "acc": 0.77255368, "epoch": 1.91592178363639, "grad_norm": 7.25, "learning_rate": 4.623089123745261e-08, "loss": 0.82642889, "memory(GiB)": 138.1, "step": 82120, "train_speed(iter/s)": 0.200566 }, { "acc": 0.79890847, "epoch": 1.916155091208679, "grad_norm": 5.15625, "learning_rate": 4.597494545127035e-08, "loss": 0.71255264, "memory(GiB)": 138.1, "step": 82130, "train_speed(iter/s)": 0.200579 }, { "acc": 0.77682538, "epoch": 1.9163883987809678, "grad_norm": 8.0625, "learning_rate": 4.571970685072646e-08, "loss": 0.7757277, "memory(GiB)": 138.1, "step": 82140, "train_speed(iter/s)": 0.200591 }, { "acc": 0.78598347, "epoch": 1.916621706353257, "grad_norm": 3.734375, "learning_rate": 4.5465175472254594e-08, "loss": 0.75754504, "memory(GiB)": 138.1, "step": 82150, "train_speed(iter/s)": 0.200604 }, { "acc": 0.78972554, "epoch": 1.9168550139255456, "grad_norm": 8.1875, "learning_rate": 4.521135135218957e-08, "loss": 0.7571701, "memory(GiB)": 138.1, "step": 82160, "train_speed(iter/s)": 0.200615 }, { "acc": 0.77707663, "epoch": 1.9170883214978347, "grad_norm": 4.84375, "learning_rate": 4.495823452676518e-08, "loss": 0.82133408, "memory(GiB)": 138.1, "step": 82170, "train_speed(iter/s)": 0.200628 }, { "acc": 0.76835198, "epoch": 1.9173216290701234, "grad_norm": 5.5, "learning_rate": 4.4705825032113624e-08, "loss": 0.83846817, "memory(GiB)": 138.1, "step": 82180, "train_speed(iter/s)": 0.20064 }, { "acc": 0.77485347, "epoch": 1.9175549366424125, "grad_norm": 6.78125, "learning_rate": 4.445412290426554e-08, "loss": 0.81650391, "memory(GiB)": 138.1, "step": 82190, "train_speed(iter/s)": 0.200653 }, { "acc": 0.77240715, "epoch": 1.9177882442147012, "grad_norm": 7.0625, "learning_rate": 4.420312817915162e-08, "loss": 0.81943703, "memory(GiB)": 138.1, "step": 82200, "train_speed(iter/s)": 0.200666 }, { "acc": 0.7817626, "epoch": 1.9180215517869903, "grad_norm": 6.25, "learning_rate": 4.39528408926021e-08, "loss": 0.78492823, "memory(GiB)": 138.1, "step": 82210, "train_speed(iter/s)": 0.200678 }, { "acc": 0.77328238, "epoch": 1.918254859359279, "grad_norm": 6.84375, "learning_rate": 4.370326108034395e-08, "loss": 0.81196861, "memory(GiB)": 138.1, "step": 82220, "train_speed(iter/s)": 0.200691 }, { "acc": 0.76239452, "epoch": 1.918488166931568, "grad_norm": 7.53125, "learning_rate": 4.3454388778005894e-08, "loss": 0.88454895, "memory(GiB)": 138.1, "step": 82230, "train_speed(iter/s)": 0.200703 }, { "acc": 0.76898079, "epoch": 1.9187214745038568, "grad_norm": 5.25, "learning_rate": 4.3206224021113966e-08, "loss": 0.84615307, "memory(GiB)": 138.1, "step": 82240, "train_speed(iter/s)": 0.200715 }, { "acc": 0.765411, "epoch": 1.918954782076146, "grad_norm": 6.9375, "learning_rate": 4.2958766845093704e-08, "loss": 0.85209064, "memory(GiB)": 138.1, "step": 82250, "train_speed(iter/s)": 0.200728 }, { "acc": 0.77735281, "epoch": 1.9191880896484346, "grad_norm": 6.15625, "learning_rate": 4.271201728526963e-08, "loss": 0.79499168, "memory(GiB)": 138.1, "step": 82260, "train_speed(iter/s)": 0.200741 }, { "acc": 0.78337259, "epoch": 1.9194213972207237, "grad_norm": 4.71875, "learning_rate": 4.24659753768647e-08, "loss": 0.80247488, "memory(GiB)": 138.1, "step": 82270, "train_speed(iter/s)": 0.200755 }, { "acc": 0.78709459, "epoch": 1.9196547047930124, "grad_norm": 5.21875, "learning_rate": 4.222064115500191e-08, "loss": 0.76674123, "memory(GiB)": 138.1, "step": 82280, "train_speed(iter/s)": 0.200767 }, { "acc": 0.77503223, "epoch": 1.9198880123653015, "grad_norm": 11.125, "learning_rate": 4.1976014654702135e-08, "loss": 0.8402914, "memory(GiB)": 138.1, "step": 82290, "train_speed(iter/s)": 0.200779 }, { "acc": 0.77781281, "epoch": 1.9201213199375902, "grad_norm": 7.5, "learning_rate": 4.1732095910885785e-08, "loss": 0.78994112, "memory(GiB)": 138.1, "step": 82300, "train_speed(iter/s)": 0.200792 }, { "acc": 0.79021931, "epoch": 1.920354627509879, "grad_norm": 4.34375, "learning_rate": 4.148888495837222e-08, "loss": 0.7459506, "memory(GiB)": 138.1, "step": 82310, "train_speed(iter/s)": 0.200805 }, { "acc": 0.79193068, "epoch": 1.920587935082168, "grad_norm": 4.40625, "learning_rate": 4.1246381831880345e-08, "loss": 0.73066068, "memory(GiB)": 138.1, "step": 82320, "train_speed(iter/s)": 0.200818 }, { "acc": 0.7667829, "epoch": 1.9208212426544569, "grad_norm": 4.65625, "learning_rate": 4.1004586566026904e-08, "loss": 0.80857086, "memory(GiB)": 138.1, "step": 82330, "train_speed(iter/s)": 0.200831 }, { "acc": 0.78284898, "epoch": 1.9210545502267458, "grad_norm": 6.53125, "learning_rate": 4.076349919532763e-08, "loss": 0.79354701, "memory(GiB)": 138.1, "step": 82340, "train_speed(iter/s)": 0.200844 }, { "acc": 0.79704752, "epoch": 1.9212878577990347, "grad_norm": 6.46875, "learning_rate": 4.052311975419887e-08, "loss": 0.75222983, "memory(GiB)": 138.1, "step": 82350, "train_speed(iter/s)": 0.200856 }, { "acc": 0.78382096, "epoch": 1.9215211653713236, "grad_norm": 4.25, "learning_rate": 4.0283448276953186e-08, "loss": 0.79394064, "memory(GiB)": 138.1, "step": 82360, "train_speed(iter/s)": 0.200869 }, { "acc": 0.78321552, "epoch": 1.9217544729436125, "grad_norm": 3.859375, "learning_rate": 4.004448479780543e-08, "loss": 0.77877493, "memory(GiB)": 138.1, "step": 82370, "train_speed(iter/s)": 0.200881 }, { "acc": 0.78489828, "epoch": 1.9219877805159014, "grad_norm": 3.953125, "learning_rate": 3.9806229350865534e-08, "loss": 0.76953378, "memory(GiB)": 138.1, "step": 82380, "train_speed(iter/s)": 0.200893 }, { "acc": 0.77053556, "epoch": 1.9222210880881903, "grad_norm": 4.78125, "learning_rate": 3.95686819701463e-08, "loss": 0.85348272, "memory(GiB)": 138.1, "step": 82390, "train_speed(iter/s)": 0.200906 }, { "acc": 0.77399349, "epoch": 1.9224543956604792, "grad_norm": 7.875, "learning_rate": 3.93318426895567e-08, "loss": 0.81090708, "memory(GiB)": 138.1, "step": 82400, "train_speed(iter/s)": 0.200918 }, { "acc": 0.79830041, "epoch": 1.922687703232768, "grad_norm": 5.28125, "learning_rate": 3.9095711542905257e-08, "loss": 0.71653395, "memory(GiB)": 138.1, "step": 82410, "train_speed(iter/s)": 0.200931 }, { "acc": 0.81262779, "epoch": 1.922921010805057, "grad_norm": 5.4375, "learning_rate": 3.8860288563900006e-08, "loss": 0.65959501, "memory(GiB)": 138.1, "step": 82420, "train_speed(iter/s)": 0.200944 }, { "acc": 0.77822075, "epoch": 1.9231543183773459, "grad_norm": 5.53125, "learning_rate": 3.862557378614684e-08, "loss": 0.79374828, "memory(GiB)": 138.1, "step": 82430, "train_speed(iter/s)": 0.200957 }, { "acc": 0.7975666, "epoch": 1.9233876259496347, "grad_norm": 7.4375, "learning_rate": 3.839156724315174e-08, "loss": 0.73447914, "memory(GiB)": 138.1, "step": 82440, "train_speed(iter/s)": 0.200969 }, { "acc": 0.78388729, "epoch": 1.9236209335219236, "grad_norm": 3.90625, "learning_rate": 3.815826896831909e-08, "loss": 0.79074955, "memory(GiB)": 138.1, "step": 82450, "train_speed(iter/s)": 0.200981 }, { "acc": 0.78147302, "epoch": 1.9238542410942125, "grad_norm": 5.875, "learning_rate": 3.792567899495281e-08, "loss": 0.78904285, "memory(GiB)": 138.1, "step": 82460, "train_speed(iter/s)": 0.200994 }, { "acc": 0.79638376, "epoch": 1.9240875486665014, "grad_norm": 6.28125, "learning_rate": 3.7693797356254115e-08, "loss": 0.72602043, "memory(GiB)": 138.1, "step": 82470, "train_speed(iter/s)": 0.201007 }, { "acc": 0.79340577, "epoch": 1.9243208562387903, "grad_norm": 5.25, "learning_rate": 3.746262408532375e-08, "loss": 0.72227745, "memory(GiB)": 138.1, "step": 82480, "train_speed(iter/s)": 0.201019 }, { "acc": 0.76795883, "epoch": 1.9245541638110792, "grad_norm": 6.40625, "learning_rate": 3.723215921516254e-08, "loss": 0.83940735, "memory(GiB)": 138.1, "step": 82490, "train_speed(iter/s)": 0.201032 }, { "acc": 0.78825803, "epoch": 1.9247874713833681, "grad_norm": 6.46875, "learning_rate": 3.7002402778668625e-08, "loss": 0.78783998, "memory(GiB)": 138.1, "step": 82500, "train_speed(iter/s)": 0.201046 }, { "epoch": 1.9247874713833681, "eval_acc": 0.7447004670286247, "eval_loss": 0.8044062852859497, "eval_runtime": 1271.4855, "eval_samples_per_second": 28.306, "eval_steps_per_second": 14.154, "step": 82500 }, { "acc": 0.79530158, "epoch": 1.9250207789556568, "grad_norm": 4.71875, "learning_rate": 3.6773354808640194e-08, "loss": 0.71304455, "memory(GiB)": 138.1, "step": 82510, "train_speed(iter/s)": 0.200425 }, { "acc": 0.79802475, "epoch": 1.925254086527946, "grad_norm": 3.96875, "learning_rate": 3.654501533777388e-08, "loss": 0.71960392, "memory(GiB)": 138.1, "step": 82520, "train_speed(iter/s)": 0.200437 }, { "acc": 0.77804232, "epoch": 1.9254873941002346, "grad_norm": 4.375, "learning_rate": 3.6317384398664167e-08, "loss": 0.79983625, "memory(GiB)": 138.1, "step": 82530, "train_speed(iter/s)": 0.20045 }, { "acc": 0.78172102, "epoch": 1.9257207016725237, "grad_norm": 5.0625, "learning_rate": 3.6090462023806175e-08, "loss": 0.79019833, "memory(GiB)": 138.1, "step": 82540, "train_speed(iter/s)": 0.200463 }, { "acc": 0.77815161, "epoch": 1.9259540092448124, "grad_norm": 3.71875, "learning_rate": 3.586424824559287e-08, "loss": 0.80372667, "memory(GiB)": 138.1, "step": 82550, "train_speed(iter/s)": 0.200475 }, { "acc": 0.77129049, "epoch": 1.9261873168171015, "grad_norm": 5.28125, "learning_rate": 3.563874309631565e-08, "loss": 0.84257956, "memory(GiB)": 138.1, "step": 82560, "train_speed(iter/s)": 0.200487 }, { "acc": 0.77810478, "epoch": 1.9264206243893902, "grad_norm": 5.03125, "learning_rate": 3.5413946608165995e-08, "loss": 0.79976082, "memory(GiB)": 138.1, "step": 82570, "train_speed(iter/s)": 0.200499 }, { "acc": 0.77861633, "epoch": 1.9266539319616793, "grad_norm": 5.5625, "learning_rate": 3.518985881323322e-08, "loss": 0.78327084, "memory(GiB)": 138.1, "step": 82580, "train_speed(iter/s)": 0.200511 }, { "acc": 0.80329628, "epoch": 1.926887239533968, "grad_norm": 6.125, "learning_rate": 3.496647974350509e-08, "loss": 0.69266319, "memory(GiB)": 138.1, "step": 82590, "train_speed(iter/s)": 0.200524 }, { "acc": 0.77527142, "epoch": 1.9271205471062571, "grad_norm": 4.8125, "learning_rate": 3.4743809430870525e-08, "loss": 0.80601139, "memory(GiB)": 138.1, "step": 82600, "train_speed(iter/s)": 0.200538 }, { "acc": 0.7910471, "epoch": 1.9273538546785458, "grad_norm": 5.1875, "learning_rate": 3.452184790711466e-08, "loss": 0.7612885, "memory(GiB)": 138.1, "step": 82610, "train_speed(iter/s)": 0.200551 }, { "acc": 0.77904315, "epoch": 1.927587162250835, "grad_norm": 5.53125, "learning_rate": 3.430059520392215e-08, "loss": 0.79040279, "memory(GiB)": 138.1, "step": 82620, "train_speed(iter/s)": 0.200564 }, { "acc": 0.80225992, "epoch": 1.9278204698231236, "grad_norm": 4.65625, "learning_rate": 3.408005135287773e-08, "loss": 0.70399323, "memory(GiB)": 138.1, "step": 82630, "train_speed(iter/s)": 0.200576 }, { "acc": 0.75945187, "epoch": 1.9280537773954127, "grad_norm": 5.90625, "learning_rate": 3.386021638546233e-08, "loss": 0.87596512, "memory(GiB)": 138.1, "step": 82640, "train_speed(iter/s)": 0.200586 }, { "acc": 0.7902338, "epoch": 1.9282870849677014, "grad_norm": 5.5, "learning_rate": 3.364109033305918e-08, "loss": 0.74522243, "memory(GiB)": 138.1, "step": 82650, "train_speed(iter/s)": 0.200599 }, { "acc": 0.78515444, "epoch": 1.9285203925399905, "grad_norm": 4.71875, "learning_rate": 3.3422673226947145e-08, "loss": 0.76768475, "memory(GiB)": 138.1, "step": 82660, "train_speed(iter/s)": 0.200611 }, { "acc": 0.77809896, "epoch": 1.9287537001122792, "grad_norm": 6.21875, "learning_rate": 3.3204965098306284e-08, "loss": 0.79778728, "memory(GiB)": 138.1, "step": 82670, "train_speed(iter/s)": 0.200624 }, { "acc": 0.77451992, "epoch": 1.9289870076845683, "grad_norm": 7.25, "learning_rate": 3.298796597821341e-08, "loss": 0.80516586, "memory(GiB)": 138.1, "step": 82680, "train_speed(iter/s)": 0.200636 }, { "acc": 0.77414317, "epoch": 1.929220315256857, "grad_norm": 6.0, "learning_rate": 3.2771675897645405e-08, "loss": 0.80401745, "memory(GiB)": 138.1, "step": 82690, "train_speed(iter/s)": 0.200649 }, { "acc": 0.78727055, "epoch": 1.929453622829146, "grad_norm": 5.78125, "learning_rate": 3.255609488747813e-08, "loss": 0.74130497, "memory(GiB)": 138.1, "step": 82700, "train_speed(iter/s)": 0.200661 }, { "acc": 0.78334398, "epoch": 1.9296869304014348, "grad_norm": 4.40625, "learning_rate": 3.234122297848474e-08, "loss": 0.75857935, "memory(GiB)": 138.1, "step": 82710, "train_speed(iter/s)": 0.200673 }, { "acc": 0.78041697, "epoch": 1.9299202379737237, "grad_norm": 5.4375, "learning_rate": 3.212706020133904e-08, "loss": 0.80112228, "memory(GiB)": 138.1, "step": 82720, "train_speed(iter/s)": 0.200685 }, { "acc": 0.78621788, "epoch": 1.9301535455460126, "grad_norm": 5.875, "learning_rate": 3.1913606586612135e-08, "loss": 0.74973865, "memory(GiB)": 138.1, "step": 82730, "train_speed(iter/s)": 0.200697 }, { "acc": 0.76397328, "epoch": 1.9303868531183015, "grad_norm": 6.0, "learning_rate": 3.170086216477464e-08, "loss": 0.86834965, "memory(GiB)": 138.1, "step": 82740, "train_speed(iter/s)": 0.20071 }, { "acc": 0.76520152, "epoch": 1.9306201606905904, "grad_norm": 4.59375, "learning_rate": 3.148882696619615e-08, "loss": 0.86656876, "memory(GiB)": 138.1, "step": 82750, "train_speed(iter/s)": 0.200721 }, { "acc": 0.78473167, "epoch": 1.9308534682628793, "grad_norm": 5.59375, "learning_rate": 3.127750102114358e-08, "loss": 0.77946978, "memory(GiB)": 138.1, "step": 82760, "train_speed(iter/s)": 0.200733 }, { "acc": 0.78480234, "epoch": 1.9310867758351682, "grad_norm": 8.0, "learning_rate": 3.1066884359785e-08, "loss": 0.77041464, "memory(GiB)": 138.1, "step": 82770, "train_speed(iter/s)": 0.200747 }, { "acc": 0.81039772, "epoch": 1.931320083407457, "grad_norm": 5.5, "learning_rate": 3.085697701218471e-08, "loss": 0.65555139, "memory(GiB)": 138.1, "step": 82780, "train_speed(iter/s)": 0.20076 }, { "acc": 0.77078209, "epoch": 1.931553390979746, "grad_norm": 4.8125, "learning_rate": 3.064777900830762e-08, "loss": 0.82027807, "memory(GiB)": 138.1, "step": 82790, "train_speed(iter/s)": 0.200772 }, { "acc": 0.77345366, "epoch": 1.9317866985520349, "grad_norm": 5.875, "learning_rate": 3.043929037801596e-08, "loss": 0.79783955, "memory(GiB)": 138.1, "step": 82800, "train_speed(iter/s)": 0.200786 }, { "acc": 0.78663902, "epoch": 1.9320200061243238, "grad_norm": 8.375, "learning_rate": 3.023151115107259e-08, "loss": 0.77612534, "memory(GiB)": 138.1, "step": 82810, "train_speed(iter/s)": 0.200799 }, { "acc": 0.78559351, "epoch": 1.9322533136966127, "grad_norm": 7.21875, "learning_rate": 3.002444135713711e-08, "loss": 0.76495595, "memory(GiB)": 138.1, "step": 82820, "train_speed(iter/s)": 0.200811 }, { "acc": 0.79293275, "epoch": 1.9324866212689016, "grad_norm": 5.03125, "learning_rate": 2.9818081025768667e-08, "loss": 0.75699339, "memory(GiB)": 138.1, "step": 82830, "train_speed(iter/s)": 0.200823 }, { "acc": 0.77729588, "epoch": 1.9327199288411905, "grad_norm": 7.78125, "learning_rate": 2.9612430186425346e-08, "loss": 0.81681576, "memory(GiB)": 138.1, "step": 82840, "train_speed(iter/s)": 0.200836 }, { "acc": 0.78396635, "epoch": 1.9329532364134794, "grad_norm": 12.375, "learning_rate": 2.9407488868463675e-08, "loss": 0.77789707, "memory(GiB)": 138.1, "step": 82850, "train_speed(iter/s)": 0.200849 }, { "acc": 0.77509494, "epoch": 1.9331865439857683, "grad_norm": 6.46875, "learning_rate": 2.9203257101139694e-08, "loss": 0.80295143, "memory(GiB)": 138.1, "step": 82860, "train_speed(iter/s)": 0.200862 }, { "acc": 0.78139973, "epoch": 1.9334198515580572, "grad_norm": 5.03125, "learning_rate": 2.8999734913606193e-08, "loss": 0.7943119, "memory(GiB)": 138.1, "step": 82870, "train_speed(iter/s)": 0.200875 }, { "acc": 0.77936926, "epoch": 1.933653159130346, "grad_norm": 8.5, "learning_rate": 2.8796922334916044e-08, "loss": 0.78870902, "memory(GiB)": 138.1, "step": 82880, "train_speed(iter/s)": 0.200887 }, { "acc": 0.78523445, "epoch": 1.933886466702635, "grad_norm": 4.71875, "learning_rate": 2.8594819394021646e-08, "loss": 0.76724324, "memory(GiB)": 138.1, "step": 82890, "train_speed(iter/s)": 0.2009 }, { "acc": 0.78431993, "epoch": 1.9341197742749237, "grad_norm": 4.65625, "learning_rate": 2.8393426119772138e-08, "loss": 0.77278762, "memory(GiB)": 138.1, "step": 82900, "train_speed(iter/s)": 0.200912 }, { "acc": 0.77910967, "epoch": 1.9343530818472128, "grad_norm": 14.5625, "learning_rate": 2.8192742540917305e-08, "loss": 0.80649271, "memory(GiB)": 138.1, "step": 82910, "train_speed(iter/s)": 0.200925 }, { "acc": 0.78101177, "epoch": 1.9345863894195015, "grad_norm": 5.8125, "learning_rate": 2.799276868610368e-08, "loss": 0.7846736, "memory(GiB)": 138.1, "step": 82920, "train_speed(iter/s)": 0.200938 }, { "acc": 0.7781353, "epoch": 1.9348196969917906, "grad_norm": 4.75, "learning_rate": 2.7793504583878417e-08, "loss": 0.79075127, "memory(GiB)": 138.1, "step": 82930, "train_speed(iter/s)": 0.20095 }, { "acc": 0.77793598, "epoch": 1.9350530045640792, "grad_norm": 5.53125, "learning_rate": 2.7594950262685438e-08, "loss": 0.78132844, "memory(GiB)": 138.1, "step": 82940, "train_speed(iter/s)": 0.200962 }, { "acc": 0.76155319, "epoch": 1.9352863121363684, "grad_norm": 8.5625, "learning_rate": 2.739710575086929e-08, "loss": 0.8619318, "memory(GiB)": 138.1, "step": 82950, "train_speed(iter/s)": 0.200975 }, { "acc": 0.80455065, "epoch": 1.935519619708657, "grad_norm": 5.09375, "learning_rate": 2.719997107667127e-08, "loss": 0.71163216, "memory(GiB)": 138.1, "step": 82960, "train_speed(iter/s)": 0.200987 }, { "acc": 0.78168869, "epoch": 1.9357529272809462, "grad_norm": 8.0, "learning_rate": 2.7003546268233317e-08, "loss": 0.76260276, "memory(GiB)": 138.1, "step": 82970, "train_speed(iter/s)": 0.200999 }, { "acc": 0.75511971, "epoch": 1.9359862348532348, "grad_norm": 4.59375, "learning_rate": 2.6807831353594106e-08, "loss": 0.87710342, "memory(GiB)": 138.1, "step": 82980, "train_speed(iter/s)": 0.20101 }, { "acc": 0.80496244, "epoch": 1.936219542425524, "grad_norm": 5.28125, "learning_rate": 2.66128263606924e-08, "loss": 0.68820372, "memory(GiB)": 138.1, "step": 82990, "train_speed(iter/s)": 0.201023 }, { "acc": 0.78904171, "epoch": 1.9364528499978126, "grad_norm": 5.65625, "learning_rate": 2.6418531317364825e-08, "loss": 0.74845371, "memory(GiB)": 138.1, "step": 83000, "train_speed(iter/s)": 0.201035 }, { "epoch": 1.9364528499978126, "eval_acc": 0.7447185633268747, "eval_loss": 0.8043991327285767, "eval_runtime": 1272.2124, "eval_samples_per_second": 28.29, "eval_steps_per_second": 14.145, "step": 83000 }, { "acc": 0.76983991, "epoch": 1.9366861575701018, "grad_norm": 5.0625, "learning_rate": 2.6224946251346973e-08, "loss": 0.84707108, "memory(GiB)": 138.1, "step": 83010, "train_speed(iter/s)": 0.200418 }, { "acc": 0.76536455, "epoch": 1.9369194651423904, "grad_norm": 4.5625, "learning_rate": 2.60320711902734e-08, "loss": 0.84149084, "memory(GiB)": 138.1, "step": 83020, "train_speed(iter/s)": 0.200431 }, { "acc": 0.78333254, "epoch": 1.9371527727146796, "grad_norm": 5.375, "learning_rate": 2.5839906161676532e-08, "loss": 0.77945223, "memory(GiB)": 138.1, "step": 83030, "train_speed(iter/s)": 0.200443 }, { "acc": 0.79502096, "epoch": 1.9373860802869682, "grad_norm": 5.1875, "learning_rate": 2.564845119298831e-08, "loss": 0.73158178, "memory(GiB)": 138.1, "step": 83040, "train_speed(iter/s)": 0.200455 }, { "acc": 0.77735152, "epoch": 1.9376193878592574, "grad_norm": 5.25, "learning_rate": 2.5457706311538544e-08, "loss": 0.81248274, "memory(GiB)": 138.1, "step": 83050, "train_speed(iter/s)": 0.200467 }, { "acc": 0.78180962, "epoch": 1.937852695431546, "grad_norm": 4.5625, "learning_rate": 2.526767154455545e-08, "loss": 0.7838274, "memory(GiB)": 138.1, "step": 83060, "train_speed(iter/s)": 0.200479 }, { "acc": 0.80066261, "epoch": 1.9380860030038352, "grad_norm": 5.5, "learning_rate": 2.5078346919167883e-08, "loss": 0.71001968, "memory(GiB)": 138.1, "step": 83070, "train_speed(iter/s)": 0.200491 }, { "acc": 0.76299033, "epoch": 1.9383193105761238, "grad_norm": 6.84375, "learning_rate": 2.4889732462400895e-08, "loss": 0.84023495, "memory(GiB)": 138.1, "step": 83080, "train_speed(iter/s)": 0.200504 }, { "acc": 0.79042563, "epoch": 1.9385526181484127, "grad_norm": 5.71875, "learning_rate": 2.4701828201179057e-08, "loss": 0.74814739, "memory(GiB)": 138.1, "step": 83090, "train_speed(iter/s)": 0.200516 }, { "acc": 0.78727136, "epoch": 1.9387859257207016, "grad_norm": 5.53125, "learning_rate": 2.451463416232591e-08, "loss": 0.76182685, "memory(GiB)": 138.1, "step": 83100, "train_speed(iter/s)": 0.200529 }, { "acc": 0.78427858, "epoch": 1.9390192332929905, "grad_norm": 6.0625, "learning_rate": 2.4328150372563974e-08, "loss": 0.77642002, "memory(GiB)": 138.1, "step": 83110, "train_speed(iter/s)": 0.200541 }, { "acc": 0.75748053, "epoch": 1.9392525408652794, "grad_norm": 5.46875, "learning_rate": 2.4142376858512506e-08, "loss": 0.87451792, "memory(GiB)": 138.1, "step": 83120, "train_speed(iter/s)": 0.200554 }, { "acc": 0.79059162, "epoch": 1.9394858484375683, "grad_norm": 5.4375, "learning_rate": 2.3957313646691406e-08, "loss": 0.74902925, "memory(GiB)": 138.1, "step": 83130, "train_speed(iter/s)": 0.200567 }, { "acc": 0.78442965, "epoch": 1.9397191560098572, "grad_norm": 4.5625, "learning_rate": 2.3772960763518428e-08, "loss": 0.77176137, "memory(GiB)": 138.1, "step": 83140, "train_speed(iter/s)": 0.20058 }, { "acc": 0.78968287, "epoch": 1.9399524635821461, "grad_norm": 4.34375, "learning_rate": 2.358931823530919e-08, "loss": 0.73917861, "memory(GiB)": 138.1, "step": 83150, "train_speed(iter/s)": 0.200592 }, { "acc": 0.79481835, "epoch": 1.940185771154435, "grad_norm": 4.625, "learning_rate": 2.3406386088279387e-08, "loss": 0.73219633, "memory(GiB)": 138.1, "step": 83160, "train_speed(iter/s)": 0.200605 }, { "acc": 0.78026934, "epoch": 1.940419078726724, "grad_norm": 9.0, "learning_rate": 2.3224164348542576e-08, "loss": 0.78946395, "memory(GiB)": 138.1, "step": 83170, "train_speed(iter/s)": 0.200617 }, { "acc": 0.77109671, "epoch": 1.9406523862990128, "grad_norm": 10.5625, "learning_rate": 2.3042653042110175e-08, "loss": 0.8287899, "memory(GiB)": 138.1, "step": 83180, "train_speed(iter/s)": 0.20063 }, { "acc": 0.77708349, "epoch": 1.9408856938713017, "grad_norm": 4.84375, "learning_rate": 2.2861852194893118e-08, "loss": 0.80719547, "memory(GiB)": 138.1, "step": 83190, "train_speed(iter/s)": 0.200642 }, { "acc": 0.78134727, "epoch": 1.9411190014435906, "grad_norm": 6.28125, "learning_rate": 2.2681761832701323e-08, "loss": 0.78660898, "memory(GiB)": 138.1, "step": 83200, "train_speed(iter/s)": 0.200654 }, { "acc": 0.76660013, "epoch": 1.9413523090158795, "grad_norm": 7.09375, "learning_rate": 2.2502381981241993e-08, "loss": 0.84043627, "memory(GiB)": 138.1, "step": 83210, "train_speed(iter/s)": 0.200666 }, { "acc": 0.78337593, "epoch": 1.9415856165881684, "grad_norm": 4.5, "learning_rate": 2.2323712666121324e-08, "loss": 0.77146144, "memory(GiB)": 138.1, "step": 83220, "train_speed(iter/s)": 0.200679 }, { "acc": 0.78208766, "epoch": 1.9418189241604573, "grad_norm": 4.625, "learning_rate": 2.2145753912845014e-08, "loss": 0.78833818, "memory(GiB)": 138.1, "step": 83230, "train_speed(iter/s)": 0.200692 }, { "acc": 0.77869391, "epoch": 1.9420522317327462, "grad_norm": 7.71875, "learning_rate": 2.1968505746815527e-08, "loss": 0.79904451, "memory(GiB)": 138.1, "step": 83240, "train_speed(iter/s)": 0.200703 }, { "acc": 0.78262606, "epoch": 1.9422855393050351, "grad_norm": 5.78125, "learning_rate": 2.179196819333651e-08, "loss": 0.77370358, "memory(GiB)": 138.1, "step": 83250, "train_speed(iter/s)": 0.200716 }, { "acc": 0.77597246, "epoch": 1.942518846877324, "grad_norm": 7.6875, "learning_rate": 2.1616141277607804e-08, "loss": 0.80707121, "memory(GiB)": 138.1, "step": 83260, "train_speed(iter/s)": 0.200729 }, { "acc": 0.79199247, "epoch": 1.9427521544496127, "grad_norm": 5.6875, "learning_rate": 2.1441025024728225e-08, "loss": 0.74981995, "memory(GiB)": 138.1, "step": 83270, "train_speed(iter/s)": 0.200741 }, { "acc": 0.77484131, "epoch": 1.9429854620219018, "grad_norm": 5.78125, "learning_rate": 2.1266619459696102e-08, "loss": 0.81854687, "memory(GiB)": 138.1, "step": 83280, "train_speed(iter/s)": 0.200753 }, { "acc": 0.78900671, "epoch": 1.9432187695941905, "grad_norm": 5.96875, "learning_rate": 2.1092924607408195e-08, "loss": 0.76061816, "memory(GiB)": 138.1, "step": 83290, "train_speed(iter/s)": 0.200766 }, { "acc": 0.7663866, "epoch": 1.9434520771664796, "grad_norm": 5.0, "learning_rate": 2.091994049265855e-08, "loss": 0.85271311, "memory(GiB)": 138.1, "step": 83300, "train_speed(iter/s)": 0.200778 }, { "acc": 0.79749975, "epoch": 1.9436853847387683, "grad_norm": 6.40625, "learning_rate": 2.074766714014076e-08, "loss": 0.72964478, "memory(GiB)": 138.1, "step": 83310, "train_speed(iter/s)": 0.200791 }, { "acc": 0.77364941, "epoch": 1.9439186923110574, "grad_norm": 5.90625, "learning_rate": 2.057610457444792e-08, "loss": 0.82341738, "memory(GiB)": 138.1, "step": 83320, "train_speed(iter/s)": 0.200804 }, { "acc": 0.78957624, "epoch": 1.944151999883346, "grad_norm": 5.875, "learning_rate": 2.0405252820068776e-08, "loss": 0.76410146, "memory(GiB)": 138.1, "step": 83330, "train_speed(iter/s)": 0.200817 }, { "acc": 0.77884073, "epoch": 1.9443853074556352, "grad_norm": 3.71875, "learning_rate": 2.0235111901393266e-08, "loss": 0.77160797, "memory(GiB)": 138.1, "step": 83340, "train_speed(iter/s)": 0.200828 }, { "acc": 0.76247535, "epoch": 1.9446186150279239, "grad_norm": 5.84375, "learning_rate": 2.0065681842709185e-08, "loss": 0.86031971, "memory(GiB)": 138.1, "step": 83350, "train_speed(iter/s)": 0.20084 }, { "acc": 0.77901812, "epoch": 1.944851922600213, "grad_norm": 4.34375, "learning_rate": 1.989696266820218e-08, "loss": 0.79860239, "memory(GiB)": 138.1, "step": 83360, "train_speed(iter/s)": 0.200853 }, { "acc": 0.77776785, "epoch": 1.9450852301725017, "grad_norm": 5.84375, "learning_rate": 1.972895440195688e-08, "loss": 0.7847723, "memory(GiB)": 138.1, "step": 83370, "train_speed(iter/s)": 0.200865 }, { "acc": 0.79139214, "epoch": 1.9453185377447908, "grad_norm": 4.375, "learning_rate": 1.9561657067956874e-08, "loss": 0.75544043, "memory(GiB)": 138.1, "step": 83380, "train_speed(iter/s)": 0.200877 }, { "acc": 0.76193299, "epoch": 1.9455518453170795, "grad_norm": 5.375, "learning_rate": 1.9395070690083617e-08, "loss": 0.84404163, "memory(GiB)": 138.1, "step": 83390, "train_speed(iter/s)": 0.200889 }, { "acc": 0.78581266, "epoch": 1.9457851528893686, "grad_norm": 5.4375, "learning_rate": 1.9229195292116976e-08, "loss": 0.76360941, "memory(GiB)": 138.1, "step": 83400, "train_speed(iter/s)": 0.200901 }, { "acc": 0.79550982, "epoch": 1.9460184604616573, "grad_norm": 6.375, "learning_rate": 1.9064030897735232e-08, "loss": 0.73615227, "memory(GiB)": 138.1, "step": 83410, "train_speed(iter/s)": 0.200913 }, { "acc": 0.78520379, "epoch": 1.9462517680339464, "grad_norm": 4.21875, "learning_rate": 1.8899577530516744e-08, "loss": 0.77724113, "memory(GiB)": 138.1, "step": 83420, "train_speed(iter/s)": 0.200926 }, { "acc": 0.78606596, "epoch": 1.946485075606235, "grad_norm": 5.90625, "learning_rate": 1.8735835213936627e-08, "loss": 0.76647534, "memory(GiB)": 138.1, "step": 83430, "train_speed(iter/s)": 0.200938 }, { "acc": 0.78347244, "epoch": 1.9467183831785242, "grad_norm": 5.59375, "learning_rate": 1.8572803971368404e-08, "loss": 0.80083179, "memory(GiB)": 138.1, "step": 83440, "train_speed(iter/s)": 0.200951 }, { "acc": 0.77931328, "epoch": 1.9469516907508129, "grad_norm": 8.6875, "learning_rate": 1.841048382608568e-08, "loss": 0.79017577, "memory(GiB)": 138.1, "step": 83450, "train_speed(iter/s)": 0.200963 }, { "acc": 0.77811432, "epoch": 1.947184998323102, "grad_norm": 5.8125, "learning_rate": 1.8248874801259363e-08, "loss": 0.79184465, "memory(GiB)": 138.1, "step": 83460, "train_speed(iter/s)": 0.200976 }, { "acc": 0.79206524, "epoch": 1.9474183058953907, "grad_norm": 5.625, "learning_rate": 1.8087976919958783e-08, "loss": 0.73837852, "memory(GiB)": 138.1, "step": 83470, "train_speed(iter/s)": 0.200989 }, { "acc": 0.78631954, "epoch": 1.9476516134676796, "grad_norm": 7.09375, "learning_rate": 1.792779020515223e-08, "loss": 0.76524954, "memory(GiB)": 138.1, "step": 83480, "train_speed(iter/s)": 0.201002 }, { "acc": 0.78290014, "epoch": 1.9478849210399685, "grad_norm": 6.65625, "learning_rate": 1.776831467970641e-08, "loss": 0.77942362, "memory(GiB)": 138.1, "step": 83490, "train_speed(iter/s)": 0.201014 }, { "acc": 0.79167795, "epoch": 1.9481182286122574, "grad_norm": 5.4375, "learning_rate": 1.7609550366385897e-08, "loss": 0.74010186, "memory(GiB)": 138.1, "step": 83500, "train_speed(iter/s)": 0.201027 }, { "epoch": 1.9481182286122574, "eval_acc": 0.7446927801054742, "eval_loss": 0.8044261932373047, "eval_runtime": 1271.6845, "eval_samples_per_second": 28.302, "eval_steps_per_second": 14.151, "step": 83500 }, { "acc": 0.77565384, "epoch": 1.9483515361845463, "grad_norm": 5.25, "learning_rate": 1.7451497287855334e-08, "loss": 0.77604108, "memory(GiB)": 138.1, "step": 83510, "train_speed(iter/s)": 0.200414 }, { "acc": 0.7824501, "epoch": 1.9485848437568352, "grad_norm": 3.953125, "learning_rate": 1.7294155466675567e-08, "loss": 0.80296421, "memory(GiB)": 138.1, "step": 83520, "train_speed(iter/s)": 0.200427 }, { "acc": 0.7742733, "epoch": 1.948818151329124, "grad_norm": 4.5625, "learning_rate": 1.7137524925307515e-08, "loss": 0.83972244, "memory(GiB)": 138.1, "step": 83530, "train_speed(iter/s)": 0.20044 }, { "acc": 0.76697764, "epoch": 1.949051458901413, "grad_norm": 6.34375, "learning_rate": 1.698160568611107e-08, "loss": 0.82968264, "memory(GiB)": 138.1, "step": 83540, "train_speed(iter/s)": 0.200453 }, { "acc": 0.77781119, "epoch": 1.9492847664737019, "grad_norm": 5.5625, "learning_rate": 1.6826397771342317e-08, "loss": 0.80852528, "memory(GiB)": 138.1, "step": 83550, "train_speed(iter/s)": 0.200466 }, { "acc": 0.77753439, "epoch": 1.9495180740459908, "grad_norm": 7.4375, "learning_rate": 1.6671901203157425e-08, "loss": 0.83437614, "memory(GiB)": 138.1, "step": 83560, "train_speed(iter/s)": 0.200479 }, { "acc": 0.79008942, "epoch": 1.9497513816182797, "grad_norm": 5.875, "learning_rate": 1.6518116003611527e-08, "loss": 0.74177456, "memory(GiB)": 138.1, "step": 83570, "train_speed(iter/s)": 0.200492 }, { "acc": 0.80940037, "epoch": 1.9499846891905686, "grad_norm": 10.3125, "learning_rate": 1.636504219465651e-08, "loss": 0.66792626, "memory(GiB)": 138.1, "step": 83580, "train_speed(iter/s)": 0.200505 }, { "acc": 0.76242118, "epoch": 1.9502179967628575, "grad_norm": 5.65625, "learning_rate": 1.6212679798143782e-08, "loss": 0.85964575, "memory(GiB)": 138.1, "step": 83590, "train_speed(iter/s)": 0.200517 }, { "acc": 0.7804348, "epoch": 1.9504513043351464, "grad_norm": 5.96875, "learning_rate": 1.6061028835823723e-08, "loss": 0.78724127, "memory(GiB)": 138.1, "step": 83600, "train_speed(iter/s)": 0.20053 }, { "acc": 0.76076965, "epoch": 1.9506846119074353, "grad_norm": 6.0625, "learning_rate": 1.591008932934346e-08, "loss": 0.86802006, "memory(GiB)": 138.1, "step": 83610, "train_speed(iter/s)": 0.200542 }, { "acc": 0.77423449, "epoch": 1.9509179194797241, "grad_norm": 5.6875, "learning_rate": 1.57598613002502e-08, "loss": 0.81039219, "memory(GiB)": 138.1, "step": 83620, "train_speed(iter/s)": 0.200555 }, { "acc": 0.76987734, "epoch": 1.951151227052013, "grad_norm": 6.78125, "learning_rate": 1.5610344769989017e-08, "loss": 0.83137999, "memory(GiB)": 138.1, "step": 83630, "train_speed(iter/s)": 0.200567 }, { "acc": 0.76419888, "epoch": 1.951384534624302, "grad_norm": 5.84375, "learning_rate": 1.5461539759902832e-08, "loss": 0.84437933, "memory(GiB)": 138.1, "step": 83640, "train_speed(iter/s)": 0.200579 }, { "acc": 0.79972954, "epoch": 1.9516178421965908, "grad_norm": 4.40625, "learning_rate": 1.5313446291234104e-08, "loss": 0.706955, "memory(GiB)": 138.1, "step": 83650, "train_speed(iter/s)": 0.200591 }, { "acc": 0.79801159, "epoch": 1.9518511497688795, "grad_norm": 4.5, "learning_rate": 1.516606438512258e-08, "loss": 0.73285971, "memory(GiB)": 138.1, "step": 83660, "train_speed(iter/s)": 0.200604 }, { "acc": 0.77286844, "epoch": 1.9520844573411686, "grad_norm": 6.5, "learning_rate": 1.501939406260755e-08, "loss": 0.82769775, "memory(GiB)": 138.1, "step": 83670, "train_speed(iter/s)": 0.200617 }, { "acc": 0.78321648, "epoch": 1.9523177649134573, "grad_norm": 5.28125, "learning_rate": 1.4873435344625597e-08, "loss": 0.78056812, "memory(GiB)": 138.1, "step": 83680, "train_speed(iter/s)": 0.200629 }, { "acc": 0.77889185, "epoch": 1.9525510724857464, "grad_norm": 4.09375, "learning_rate": 1.4728188252012832e-08, "loss": 0.79473181, "memory(GiB)": 138.1, "step": 83690, "train_speed(iter/s)": 0.200642 }, { "acc": 0.78766642, "epoch": 1.9527843800580351, "grad_norm": 4.9375, "learning_rate": 1.4583652805503223e-08, "loss": 0.76746063, "memory(GiB)": 138.1, "step": 83700, "train_speed(iter/s)": 0.200654 }, { "acc": 0.79879084, "epoch": 1.9530176876303242, "grad_norm": 7.625, "learning_rate": 1.4439829025728047e-08, "loss": 0.72144575, "memory(GiB)": 138.1, "step": 83710, "train_speed(iter/s)": 0.200667 }, { "acc": 0.78694019, "epoch": 1.953250995202613, "grad_norm": 8.3125, "learning_rate": 1.4296716933219768e-08, "loss": 0.7669951, "memory(GiB)": 138.1, "step": 83720, "train_speed(iter/s)": 0.200679 }, { "acc": 0.77646313, "epoch": 1.953484302774902, "grad_norm": 5.09375, "learning_rate": 1.4154316548406488e-08, "loss": 0.8079896, "memory(GiB)": 138.1, "step": 83730, "train_speed(iter/s)": 0.200692 }, { "acc": 0.77782555, "epoch": 1.9537176103471907, "grad_norm": 5.625, "learning_rate": 1.4012627891615838e-08, "loss": 0.79529219, "memory(GiB)": 138.1, "step": 83740, "train_speed(iter/s)": 0.200704 }, { "acc": 0.77345643, "epoch": 1.9539509179194798, "grad_norm": 7.25, "learning_rate": 1.3871650983074414e-08, "loss": 0.81681423, "memory(GiB)": 138.1, "step": 83750, "train_speed(iter/s)": 0.200717 }, { "acc": 0.77701006, "epoch": 1.9541842254917685, "grad_norm": 5.625, "learning_rate": 1.3731385842906675e-08, "loss": 0.79945378, "memory(GiB)": 138.1, "step": 83760, "train_speed(iter/s)": 0.200729 }, { "acc": 0.78949852, "epoch": 1.9544175330640576, "grad_norm": 6.1875, "learning_rate": 1.3591832491134382e-08, "loss": 0.75286026, "memory(GiB)": 138.1, "step": 83770, "train_speed(iter/s)": 0.200742 }, { "acc": 0.77981601, "epoch": 1.9546508406363463, "grad_norm": 4.1875, "learning_rate": 1.3452990947679933e-08, "loss": 0.78779631, "memory(GiB)": 138.1, "step": 83780, "train_speed(iter/s)": 0.200753 }, { "acc": 0.77697887, "epoch": 1.9548841482086354, "grad_norm": 6.28125, "learning_rate": 1.3314861232362475e-08, "loss": 0.8031744, "memory(GiB)": 138.1, "step": 83790, "train_speed(iter/s)": 0.200766 }, { "acc": 0.7862566, "epoch": 1.955117455780924, "grad_norm": 7.625, "learning_rate": 1.3177443364899567e-08, "loss": 0.76605148, "memory(GiB)": 138.1, "step": 83800, "train_speed(iter/s)": 0.200779 }, { "acc": 0.78782101, "epoch": 1.9553507633532132, "grad_norm": 5.6875, "learning_rate": 1.3040737364908295e-08, "loss": 0.75679483, "memory(GiB)": 138.1, "step": 83810, "train_speed(iter/s)": 0.200791 }, { "acc": 0.77958326, "epoch": 1.955584070925502, "grad_norm": 5.5, "learning_rate": 1.2904743251902496e-08, "loss": 0.79298873, "memory(GiB)": 138.1, "step": 83820, "train_speed(iter/s)": 0.200804 }, { "acc": 0.77609186, "epoch": 1.955817378497791, "grad_norm": 6.21875, "learning_rate": 1.2769461045296083e-08, "loss": 0.81230192, "memory(GiB)": 138.1, "step": 83830, "train_speed(iter/s)": 0.200816 }, { "acc": 0.78773966, "epoch": 1.9560506860700797, "grad_norm": 5.75, "learning_rate": 1.2634890764400832e-08, "loss": 0.78044167, "memory(GiB)": 138.1, "step": 83840, "train_speed(iter/s)": 0.200828 }, { "acc": 0.77997303, "epoch": 1.9562839936423686, "grad_norm": 6.375, "learning_rate": 1.2501032428425264e-08, "loss": 0.80062265, "memory(GiB)": 138.1, "step": 83850, "train_speed(iter/s)": 0.200841 }, { "acc": 0.77704172, "epoch": 1.9565173012146575, "grad_norm": 6.25, "learning_rate": 1.2367886056479095e-08, "loss": 0.79110088, "memory(GiB)": 138.1, "step": 83860, "train_speed(iter/s)": 0.200853 }, { "acc": 0.77524996, "epoch": 1.9567506087869464, "grad_norm": 4.84375, "learning_rate": 1.2235451667567678e-08, "loss": 0.80665646, "memory(GiB)": 138.1, "step": 83870, "train_speed(iter/s)": 0.200865 }, { "acc": 0.80717812, "epoch": 1.9569839163592353, "grad_norm": 4.59375, "learning_rate": 1.2103729280596998e-08, "loss": 0.69145899, "memory(GiB)": 138.1, "step": 83880, "train_speed(iter/s)": 0.200877 }, { "acc": 0.7651278, "epoch": 1.9572172239315242, "grad_norm": 5.46875, "learning_rate": 1.1972718914370351e-08, "loss": 0.84759274, "memory(GiB)": 138.1, "step": 83890, "train_speed(iter/s)": 0.20089 }, { "acc": 0.78047352, "epoch": 1.957450531503813, "grad_norm": 4.625, "learning_rate": 1.1842420587588333e-08, "loss": 0.80353117, "memory(GiB)": 138.1, "step": 83900, "train_speed(iter/s)": 0.200902 }, { "acc": 0.77811413, "epoch": 1.957683839076102, "grad_norm": 5.375, "learning_rate": 1.1712834318852173e-08, "loss": 0.79200544, "memory(GiB)": 138.1, "step": 83910, "train_speed(iter/s)": 0.200914 }, { "acc": 0.79376502, "epoch": 1.957917146648391, "grad_norm": 6.0, "learning_rate": 1.15839601266593e-08, "loss": 0.74129534, "memory(GiB)": 138.1, "step": 83920, "train_speed(iter/s)": 0.200927 }, { "acc": 0.78470793, "epoch": 1.9581504542206798, "grad_norm": 5.5, "learning_rate": 1.1455798029407772e-08, "loss": 0.76864552, "memory(GiB)": 138.1, "step": 83930, "train_speed(iter/s)": 0.200939 }, { "acc": 0.78365679, "epoch": 1.9583837617929687, "grad_norm": 3.96875, "learning_rate": 1.132834804539129e-08, "loss": 0.75253015, "memory(GiB)": 138.1, "step": 83940, "train_speed(iter/s)": 0.200952 }, { "acc": 0.78898249, "epoch": 1.9586170693652576, "grad_norm": 4.96875, "learning_rate": 1.120161019280419e-08, "loss": 0.74453573, "memory(GiB)": 138.1, "step": 83950, "train_speed(iter/s)": 0.200964 }, { "acc": 0.77433839, "epoch": 1.9588503769375465, "grad_norm": 6.5, "learning_rate": 1.1075584489737557e-08, "loss": 0.82852116, "memory(GiB)": 138.1, "step": 83960, "train_speed(iter/s)": 0.200977 }, { "acc": 0.77746682, "epoch": 1.9590836845098354, "grad_norm": 4.78125, "learning_rate": 1.0950270954181997e-08, "loss": 0.79795814, "memory(GiB)": 138.1, "step": 83970, "train_speed(iter/s)": 0.200989 }, { "acc": 0.77838507, "epoch": 1.9593169920821243, "grad_norm": 6.46875, "learning_rate": 1.0825669604026534e-08, "loss": 0.79831772, "memory(GiB)": 138.1, "step": 83980, "train_speed(iter/s)": 0.201001 }, { "acc": 0.78435926, "epoch": 1.9595502996544132, "grad_norm": 6.28125, "learning_rate": 1.0701780457056943e-08, "loss": 0.79196739, "memory(GiB)": 138.1, "step": 83990, "train_speed(iter/s)": 0.201014 }, { "acc": 0.78845625, "epoch": 1.959783607226702, "grad_norm": 4.9375, "learning_rate": 1.0578603530958519e-08, "loss": 0.76660895, "memory(GiB)": 138.1, "step": 84000, "train_speed(iter/s)": 0.201026 }, { "epoch": 1.959783607226702, "eval_acc": 0.7447569979426271, "eval_loss": 0.8044305443763733, "eval_runtime": 1271.2957, "eval_samples_per_second": 28.31, "eval_steps_per_second": 14.156, "step": 84000 }, { "acc": 0.77818713, "epoch": 1.960016914798991, "grad_norm": 6.59375, "learning_rate": 1.0456138843315534e-08, "loss": 0.80791035, "memory(GiB)": 138.1, "step": 84010, "train_speed(iter/s)": 0.200416 }, { "acc": 0.7748589, "epoch": 1.9602502223712799, "grad_norm": 6.375, "learning_rate": 1.0334386411609e-08, "loss": 0.7919076, "memory(GiB)": 138.1, "step": 84020, "train_speed(iter/s)": 0.200429 }, { "acc": 0.77099795, "epoch": 1.9604835299435688, "grad_norm": 5.40625, "learning_rate": 1.0213346253219459e-08, "loss": 0.7921875, "memory(GiB)": 138.1, "step": 84030, "train_speed(iter/s)": 0.200441 }, { "acc": 0.79554253, "epoch": 1.9607168375158577, "grad_norm": 5.4375, "learning_rate": 1.0093018385424757e-08, "loss": 0.73114367, "memory(GiB)": 138.1, "step": 84040, "train_speed(iter/s)": 0.200454 }, { "acc": 0.80636587, "epoch": 1.9609501450881464, "grad_norm": 6.21875, "learning_rate": 9.973402825402823e-09, "loss": 0.70025764, "memory(GiB)": 138.1, "step": 84050, "train_speed(iter/s)": 0.200466 }, { "acc": 0.79211783, "epoch": 1.9611834526604355, "grad_norm": 7.6875, "learning_rate": 9.854499590227217e-09, "loss": 0.72471952, "memory(GiB)": 138.1, "step": 84060, "train_speed(iter/s)": 0.200478 }, { "acc": 0.77074847, "epoch": 1.9614167602327242, "grad_norm": 4.5625, "learning_rate": 9.736308696872698e-09, "loss": 0.8087081, "memory(GiB)": 138.1, "step": 84070, "train_speed(iter/s)": 0.200491 }, { "acc": 0.77772079, "epoch": 1.9616500678050133, "grad_norm": 6.1875, "learning_rate": 9.618830162210213e-09, "loss": 0.80092773, "memory(GiB)": 138.1, "step": 84080, "train_speed(iter/s)": 0.200502 }, { "acc": 0.78463898, "epoch": 1.961883375377302, "grad_norm": 4.46875, "learning_rate": 9.502064003010236e-09, "loss": 0.78809996, "memory(GiB)": 138.1, "step": 84090, "train_speed(iter/s)": 0.200515 }, { "acc": 0.7911252, "epoch": 1.962116682949591, "grad_norm": 5.3125, "learning_rate": 9.386010235940546e-09, "loss": 0.76758842, "memory(GiB)": 138.1, "step": 84100, "train_speed(iter/s)": 0.200527 }, { "acc": 0.76237383, "epoch": 1.9623499905218798, "grad_norm": 7.28125, "learning_rate": 9.270668877568444e-09, "loss": 0.84567051, "memory(GiB)": 138.1, "step": 84110, "train_speed(iter/s)": 0.200539 }, { "acc": 0.77878971, "epoch": 1.9625832980941689, "grad_norm": 5.25, "learning_rate": 9.156039944358542e-09, "loss": 0.79879036, "memory(GiB)": 138.1, "step": 84120, "train_speed(iter/s)": 0.200552 }, { "acc": 0.76202507, "epoch": 1.9628166056664575, "grad_norm": 4.875, "learning_rate": 9.042123452673856e-09, "loss": 0.8723732, "memory(GiB)": 138.1, "step": 84130, "train_speed(iter/s)": 0.200564 }, { "acc": 0.77010603, "epoch": 1.9630499132387467, "grad_norm": 4.96875, "learning_rate": 8.928919418776382e-09, "loss": 0.83259068, "memory(GiB)": 138.1, "step": 84140, "train_speed(iter/s)": 0.200577 }, { "acc": 0.77653313, "epoch": 1.9632832208110353, "grad_norm": 5.65625, "learning_rate": 8.816427858825416e-09, "loss": 0.79427109, "memory(GiB)": 138.1, "step": 84150, "train_speed(iter/s)": 0.200589 }, { "acc": 0.79803519, "epoch": 1.9635165283833245, "grad_norm": 5.78125, "learning_rate": 8.704648788879777e-09, "loss": 0.73506384, "memory(GiB)": 138.1, "step": 84160, "train_speed(iter/s)": 0.200601 }, { "acc": 0.76183743, "epoch": 1.9637498359556131, "grad_norm": 4.375, "learning_rate": 8.593582224895036e-09, "loss": 0.85956383, "memory(GiB)": 138.1, "step": 84170, "train_speed(iter/s)": 0.200613 }, { "acc": 0.76150913, "epoch": 1.9639831435279023, "grad_norm": 8.1875, "learning_rate": 8.483228182726843e-09, "loss": 0.85972614, "memory(GiB)": 138.1, "step": 84180, "train_speed(iter/s)": 0.200625 }, { "acc": 0.79041133, "epoch": 1.964216451100191, "grad_norm": 4.71875, "learning_rate": 8.3735866781276e-09, "loss": 0.75700512, "memory(GiB)": 138.1, "step": 84190, "train_speed(iter/s)": 0.200637 }, { "acc": 0.76137877, "epoch": 1.96444975867248, "grad_norm": 5.78125, "learning_rate": 8.264657726749226e-09, "loss": 0.84933109, "memory(GiB)": 138.1, "step": 84200, "train_speed(iter/s)": 0.200649 }, { "acc": 0.782689, "epoch": 1.9646830662447687, "grad_norm": 4.1875, "learning_rate": 8.156441344140398e-09, "loss": 0.7834971, "memory(GiB)": 138.1, "step": 84210, "train_speed(iter/s)": 0.200661 }, { "acc": 0.78736172, "epoch": 1.9649163738170579, "grad_norm": 7.0625, "learning_rate": 8.048937545749313e-09, "loss": 0.74641953, "memory(GiB)": 138.1, "step": 84220, "train_speed(iter/s)": 0.200674 }, { "acc": 0.77960968, "epoch": 1.9651496813893465, "grad_norm": 6.65625, "learning_rate": 7.942146346922586e-09, "loss": 0.81022434, "memory(GiB)": 138.1, "step": 84230, "train_speed(iter/s)": 0.200687 }, { "acc": 0.75209432, "epoch": 1.9653829889616354, "grad_norm": 4.125, "learning_rate": 7.83606776290413e-09, "loss": 0.89543324, "memory(GiB)": 138.1, "step": 84240, "train_speed(iter/s)": 0.200699 }, { "acc": 0.79684596, "epoch": 1.9656162965339243, "grad_norm": 4.875, "learning_rate": 7.730701808836837e-09, "loss": 0.72995892, "memory(GiB)": 138.1, "step": 84250, "train_speed(iter/s)": 0.200712 }, { "acc": 0.78651543, "epoch": 1.9658496041062132, "grad_norm": 5.9375, "learning_rate": 7.626048499761452e-09, "loss": 0.74779282, "memory(GiB)": 138.1, "step": 84260, "train_speed(iter/s)": 0.200723 }, { "acc": 0.7833818, "epoch": 1.9660829116785021, "grad_norm": 6.28125, "learning_rate": 7.522107850617689e-09, "loss": 0.78160305, "memory(GiB)": 138.1, "step": 84270, "train_speed(iter/s)": 0.200736 }, { "acc": 0.80099115, "epoch": 1.966316219250791, "grad_norm": 3.59375, "learning_rate": 7.418879876242014e-09, "loss": 0.70863304, "memory(GiB)": 138.1, "step": 84280, "train_speed(iter/s)": 0.200749 }, { "acc": 0.76215916, "epoch": 1.96654952682308, "grad_norm": 6.40625, "learning_rate": 7.316364591371527e-09, "loss": 0.85770187, "memory(GiB)": 138.1, "step": 84290, "train_speed(iter/s)": 0.20076 }, { "acc": 0.79879122, "epoch": 1.9667828343953688, "grad_norm": 4.9375, "learning_rate": 7.214562010639525e-09, "loss": 0.73097854, "memory(GiB)": 138.1, "step": 84300, "train_speed(iter/s)": 0.200772 }, { "acc": 0.77161856, "epoch": 1.9670161419676577, "grad_norm": 6.625, "learning_rate": 7.113472148578271e-09, "loss": 0.85240879, "memory(GiB)": 138.1, "step": 84310, "train_speed(iter/s)": 0.200784 }, { "acc": 0.78391027, "epoch": 1.9672494495399466, "grad_norm": 4.625, "learning_rate": 7.013095019618443e-09, "loss": 0.76578526, "memory(GiB)": 138.1, "step": 84320, "train_speed(iter/s)": 0.200796 }, { "acc": 0.76423545, "epoch": 1.9674827571122355, "grad_norm": 6.28125, "learning_rate": 6.9134306380885805e-09, "loss": 0.87942724, "memory(GiB)": 138.1, "step": 84330, "train_speed(iter/s)": 0.200808 }, { "acc": 0.76918364, "epoch": 1.9677160646845244, "grad_norm": 8.1875, "learning_rate": 6.814479018216192e-09, "loss": 0.83055038, "memory(GiB)": 138.1, "step": 84340, "train_speed(iter/s)": 0.200821 }, { "acc": 0.76374469, "epoch": 1.9679493722568133, "grad_norm": 4.65625, "learning_rate": 6.7162401741266425e-09, "loss": 0.86087818, "memory(GiB)": 138.1, "step": 84350, "train_speed(iter/s)": 0.200833 }, { "acc": 0.76832781, "epoch": 1.9681826798291022, "grad_norm": 6.25, "learning_rate": 6.6187141198431615e-09, "loss": 0.83810291, "memory(GiB)": 138.1, "step": 84360, "train_speed(iter/s)": 0.200845 }, { "acc": 0.76127386, "epoch": 1.9684159874013911, "grad_norm": 7.15625, "learning_rate": 6.52190086928739e-09, "loss": 0.86279221, "memory(GiB)": 138.1, "step": 84370, "train_speed(iter/s)": 0.200857 }, { "acc": 0.78920145, "epoch": 1.96864929497368, "grad_norm": 5.59375, "learning_rate": 6.425800436279383e-09, "loss": 0.74230909, "memory(GiB)": 138.1, "step": 84380, "train_speed(iter/s)": 0.20087 }, { "acc": 0.77091441, "epoch": 1.968882602545969, "grad_norm": 5.125, "learning_rate": 6.330412834538169e-09, "loss": 0.83819075, "memory(GiB)": 138.1, "step": 84390, "train_speed(iter/s)": 0.200882 }, { "acc": 0.77305069, "epoch": 1.9691159101182578, "grad_norm": 5.1875, "learning_rate": 6.235738077680076e-09, "loss": 0.84126186, "memory(GiB)": 138.1, "step": 84400, "train_speed(iter/s)": 0.200894 }, { "acc": 0.7800487, "epoch": 1.9693492176905467, "grad_norm": 4.78125, "learning_rate": 6.141776179219294e-09, "loss": 0.7732388, "memory(GiB)": 138.1, "step": 84410, "train_speed(iter/s)": 0.200906 }, { "acc": 0.79444857, "epoch": 1.9695825252628356, "grad_norm": 10.0625, "learning_rate": 6.048527152569539e-09, "loss": 0.73840342, "memory(GiB)": 138.1, "step": 84420, "train_speed(iter/s)": 0.200918 }, { "acc": 0.78184214, "epoch": 1.9698158328351245, "grad_norm": 5.1875, "learning_rate": 5.955991011041273e-09, "loss": 0.80885363, "memory(GiB)": 138.1, "step": 84430, "train_speed(iter/s)": 0.200931 }, { "acc": 0.76946917, "epoch": 1.9700491404074132, "grad_norm": 11.5, "learning_rate": 5.864167767845041e-09, "loss": 0.82882366, "memory(GiB)": 138.1, "step": 84440, "train_speed(iter/s)": 0.200943 }, { "acc": 0.79174566, "epoch": 1.9702824479797023, "grad_norm": 8.375, "learning_rate": 5.773057436087581e-09, "loss": 0.72687259, "memory(GiB)": 138.1, "step": 84450, "train_speed(iter/s)": 0.200956 }, { "acc": 0.77029505, "epoch": 1.970515755551991, "grad_norm": 5.375, "learning_rate": 5.6826600287757105e-09, "loss": 0.83553743, "memory(GiB)": 138.1, "step": 84460, "train_speed(iter/s)": 0.200968 }, { "acc": 0.81181126, "epoch": 1.97074906312428, "grad_norm": 6.03125, "learning_rate": 5.592975558813551e-09, "loss": 0.68757429, "memory(GiB)": 138.1, "step": 84470, "train_speed(iter/s)": 0.20098 }, { "acc": 0.7760201, "epoch": 1.9709823706965688, "grad_norm": 4.9375, "learning_rate": 5.504004039002531e-09, "loss": 0.78468761, "memory(GiB)": 138.1, "step": 84480, "train_speed(iter/s)": 0.200993 }, { "acc": 0.78242598, "epoch": 1.971215678268858, "grad_norm": 6.6875, "learning_rate": 5.41574548204471e-09, "loss": 0.77616792, "memory(GiB)": 138.1, "step": 84490, "train_speed(iter/s)": 0.201005 }, { "acc": 0.77654591, "epoch": 1.9714489858411466, "grad_norm": 8.1875, "learning_rate": 5.32819990053779e-09, "loss": 0.82183952, "memory(GiB)": 138.1, "step": 84500, "train_speed(iter/s)": 0.201017 }, { "epoch": 1.9714489858411466, "eval_acc": 0.7447488305867797, "eval_loss": 0.8044179677963257, "eval_runtime": 1271.4413, "eval_samples_per_second": 28.307, "eval_steps_per_second": 14.154, "step": 84500 }, { "acc": 0.77603683, "epoch": 1.9716822934134357, "grad_norm": 4.75, "learning_rate": 5.24136730697955e-09, "loss": 0.79193287, "memory(GiB)": 138.1, "step": 84510, "train_speed(iter/s)": 0.200412 }, { "acc": 0.7718574, "epoch": 1.9719156009857244, "grad_norm": 5.78125, "learning_rate": 5.155247713765077e-09, "loss": 0.82882061, "memory(GiB)": 138.1, "step": 84520, "train_speed(iter/s)": 0.200425 }, { "acc": 0.76282473, "epoch": 1.9721489085580135, "grad_norm": 4.0625, "learning_rate": 5.069841133187869e-09, "loss": 0.83978472, "memory(GiB)": 138.1, "step": 84530, "train_speed(iter/s)": 0.200437 }, { "acc": 0.79579945, "epoch": 1.9723822161303022, "grad_norm": 4.125, "learning_rate": 4.985147577439842e-09, "loss": 0.73812609, "memory(GiB)": 138.1, "step": 84540, "train_speed(iter/s)": 0.200449 }, { "acc": 0.7669486, "epoch": 1.9726155237025913, "grad_norm": 6.875, "learning_rate": 4.901167058610767e-09, "loss": 0.84001894, "memory(GiB)": 138.1, "step": 84550, "train_speed(iter/s)": 0.200461 }, { "acc": 0.78317232, "epoch": 1.97284883127488, "grad_norm": 11.375, "learning_rate": 4.8178995886893895e-09, "loss": 0.76386995, "memory(GiB)": 138.1, "step": 84560, "train_speed(iter/s)": 0.200474 }, { "acc": 0.78392029, "epoch": 1.973082138847169, "grad_norm": 4.90625, "learning_rate": 4.735345179561757e-09, "loss": 0.77154541, "memory(GiB)": 138.1, "step": 84570, "train_speed(iter/s)": 0.200485 }, { "acc": 0.77048054, "epoch": 1.9733154464194578, "grad_norm": 5.96875, "learning_rate": 4.65350384301233e-09, "loss": 0.82347736, "memory(GiB)": 138.1, "step": 84580, "train_speed(iter/s)": 0.200498 }, { "acc": 0.7574182, "epoch": 1.973548753991747, "grad_norm": 4.5625, "learning_rate": 4.572375590723988e-09, "loss": 0.88755779, "memory(GiB)": 138.1, "step": 84590, "train_speed(iter/s)": 0.20051 }, { "acc": 0.786063, "epoch": 1.9737820615640356, "grad_norm": 5.375, "learning_rate": 4.49196043427802e-09, "loss": 0.76739092, "memory(GiB)": 138.1, "step": 84600, "train_speed(iter/s)": 0.200524 }, { "acc": 0.75360541, "epoch": 1.9740153691363247, "grad_norm": 6.4375, "learning_rate": 4.4122583851535785e-09, "loss": 0.87514687, "memory(GiB)": 138.1, "step": 84610, "train_speed(iter/s)": 0.200535 }, { "acc": 0.78751106, "epoch": 1.9742486767086134, "grad_norm": 5.6875, "learning_rate": 4.3332694547276736e-09, "loss": 0.75879245, "memory(GiB)": 138.1, "step": 84620, "train_speed(iter/s)": 0.200548 }, { "acc": 0.77941394, "epoch": 1.9744819842809023, "grad_norm": 5.25, "learning_rate": 4.254993654276285e-09, "loss": 0.7969101, "memory(GiB)": 138.1, "step": 84630, "train_speed(iter/s)": 0.20056 }, { "acc": 0.78160868, "epoch": 1.9747152918531912, "grad_norm": 5.25, "learning_rate": 4.177430994973808e-09, "loss": 0.76313763, "memory(GiB)": 138.1, "step": 84640, "train_speed(iter/s)": 0.200572 }, { "acc": 0.78432083, "epoch": 1.97494859942548, "grad_norm": 7.34375, "learning_rate": 4.1005814878913865e-09, "loss": 0.78618231, "memory(GiB)": 138.1, "step": 84650, "train_speed(iter/s)": 0.200584 }, { "acc": 0.77199898, "epoch": 1.975181906997769, "grad_norm": 4.75, "learning_rate": 4.024445143999689e-09, "loss": 0.81763344, "memory(GiB)": 138.1, "step": 84660, "train_speed(iter/s)": 0.200596 }, { "acc": 0.77098942, "epoch": 1.9754152145700579, "grad_norm": 4.75, "learning_rate": 3.9490219741672445e-09, "loss": 0.82059212, "memory(GiB)": 138.1, "step": 84670, "train_speed(iter/s)": 0.200608 }, { "acc": 0.78812075, "epoch": 1.9756485221423468, "grad_norm": 6.84375, "learning_rate": 3.87431198916044e-09, "loss": 0.75257215, "memory(GiB)": 138.1, "step": 84680, "train_speed(iter/s)": 0.20062 }, { "acc": 0.78079576, "epoch": 1.9758818297146357, "grad_norm": 4.8125, "learning_rate": 3.800315199644078e-09, "loss": 0.78688898, "memory(GiB)": 138.1, "step": 84690, "train_speed(iter/s)": 0.200632 }, { "acc": 0.78553157, "epoch": 1.9761151372869246, "grad_norm": 4.34375, "learning_rate": 3.727031616181376e-09, "loss": 0.76703348, "memory(GiB)": 138.1, "step": 84700, "train_speed(iter/s)": 0.200644 }, { "acc": 0.78337941, "epoch": 1.9763484448592135, "grad_norm": 6.03125, "learning_rate": 3.6544612492334097e-09, "loss": 0.76780548, "memory(GiB)": 138.1, "step": 84710, "train_speed(iter/s)": 0.200655 }, { "acc": 0.77466536, "epoch": 1.9765817524315024, "grad_norm": 4.71875, "learning_rate": 3.582604109159671e-09, "loss": 0.80691957, "memory(GiB)": 138.1, "step": 84720, "train_speed(iter/s)": 0.200668 }, { "acc": 0.77310543, "epoch": 1.9768150600037913, "grad_norm": 4.375, "learning_rate": 3.5114602062180646e-09, "loss": 0.8129674, "memory(GiB)": 138.1, "step": 84730, "train_speed(iter/s)": 0.200679 }, { "acc": 0.77795973, "epoch": 1.9770483675760802, "grad_norm": 5.40625, "learning_rate": 3.4410295505638013e-09, "loss": 0.82256088, "memory(GiB)": 138.1, "step": 84740, "train_speed(iter/s)": 0.200691 }, { "acc": 0.78295822, "epoch": 1.977281675148369, "grad_norm": 5.90625, "learning_rate": 3.3713121522510607e-09, "loss": 0.79164433, "memory(GiB)": 138.1, "step": 84750, "train_speed(iter/s)": 0.200703 }, { "acc": 0.76906929, "epoch": 1.977514982720658, "grad_norm": 5.28125, "learning_rate": 3.3023080212318814e-09, "loss": 0.8484437, "memory(GiB)": 138.1, "step": 84760, "train_speed(iter/s)": 0.200715 }, { "acc": 0.80698633, "epoch": 1.9777482902929469, "grad_norm": 4.59375, "learning_rate": 3.234017167356718e-09, "loss": 0.68034239, "memory(GiB)": 138.1, "step": 84770, "train_speed(iter/s)": 0.200727 }, { "acc": 0.77349758, "epoch": 1.9779815978652358, "grad_norm": 7.46875, "learning_rate": 3.1664396003738827e-09, "loss": 0.82069979, "memory(GiB)": 138.1, "step": 84780, "train_speed(iter/s)": 0.200739 }, { "acc": 0.77547059, "epoch": 1.9782149054375247, "grad_norm": 5.28125, "learning_rate": 3.0995753299306598e-09, "loss": 0.80736742, "memory(GiB)": 138.1, "step": 84790, "train_speed(iter/s)": 0.20075 }, { "acc": 0.7942091, "epoch": 1.9784482130098135, "grad_norm": 5.15625, "learning_rate": 3.0334243655710805e-09, "loss": 0.7481492, "memory(GiB)": 138.1, "step": 84800, "train_speed(iter/s)": 0.200762 }, { "acc": 0.78032541, "epoch": 1.9786815205821022, "grad_norm": 5.5625, "learning_rate": 2.9679867167387024e-09, "loss": 0.79240317, "memory(GiB)": 138.1, "step": 84810, "train_speed(iter/s)": 0.200775 }, { "acc": 0.78130064, "epoch": 1.9789148281543913, "grad_norm": 5.125, "learning_rate": 2.9032623927743864e-09, "loss": 0.79106226, "memory(GiB)": 138.1, "step": 84820, "train_speed(iter/s)": 0.200787 }, { "acc": 0.77278428, "epoch": 1.97914813572668, "grad_norm": 5.53125, "learning_rate": 2.839251402917964e-09, "loss": 0.81999369, "memory(GiB)": 138.1, "step": 84830, "train_speed(iter/s)": 0.200799 }, { "acc": 0.78221178, "epoch": 1.9793814432989691, "grad_norm": 4.21875, "learning_rate": 2.7759537563065706e-09, "loss": 0.76479435, "memory(GiB)": 138.1, "step": 84840, "train_speed(iter/s)": 0.200812 }, { "acc": 0.7911087, "epoch": 1.9796147508712578, "grad_norm": 6.25, "learning_rate": 2.7133694619763117e-09, "loss": 0.75844345, "memory(GiB)": 138.1, "step": 84850, "train_speed(iter/s)": 0.200824 }, { "acc": 0.77396116, "epoch": 1.979848058443547, "grad_norm": 4.25, "learning_rate": 2.6514985288605964e-09, "loss": 0.83643398, "memory(GiB)": 138.1, "step": 84860, "train_speed(iter/s)": 0.200836 }, { "acc": 0.79459782, "epoch": 1.9800813660158356, "grad_norm": 10.875, "learning_rate": 2.590340965791804e-09, "loss": 0.74517059, "memory(GiB)": 138.1, "step": 84870, "train_speed(iter/s)": 0.200848 }, { "acc": 0.77023711, "epoch": 1.9803146735881247, "grad_norm": 5.40625, "learning_rate": 2.529896781500174e-09, "loss": 0.81559296, "memory(GiB)": 138.1, "step": 84880, "train_speed(iter/s)": 0.200861 }, { "acc": 0.75082717, "epoch": 1.9805479811604134, "grad_norm": 4.90625, "learning_rate": 2.4701659846138036e-09, "loss": 0.89876308, "memory(GiB)": 138.1, "step": 84890, "train_speed(iter/s)": 0.200873 }, { "acc": 0.7946104, "epoch": 1.9807812887327025, "grad_norm": 4.3125, "learning_rate": 2.4111485836592065e-09, "loss": 0.73043966, "memory(GiB)": 138.1, "step": 84900, "train_speed(iter/s)": 0.200884 }, { "acc": 0.76876898, "epoch": 1.9810145963049912, "grad_norm": 6.75, "learning_rate": 2.3528445870618643e-09, "loss": 0.83043594, "memory(GiB)": 138.1, "step": 84910, "train_speed(iter/s)": 0.200896 }, { "acc": 0.7821094, "epoch": 1.9812479038772803, "grad_norm": 5.71875, "learning_rate": 2.2952540031440096e-09, "loss": 0.79190273, "memory(GiB)": 138.1, "step": 84920, "train_speed(iter/s)": 0.200909 }, { "acc": 0.78448, "epoch": 1.981481211449569, "grad_norm": 5.21875, "learning_rate": 2.2383768401268435e-09, "loss": 0.78527408, "memory(GiB)": 138.1, "step": 84930, "train_speed(iter/s)": 0.20092 }, { "acc": 0.779772, "epoch": 1.9817145190218581, "grad_norm": 4.9375, "learning_rate": 2.182213106129427e-09, "loss": 0.78437147, "memory(GiB)": 138.1, "step": 84940, "train_speed(iter/s)": 0.200933 }, { "acc": 0.76009808, "epoch": 1.9819478265941468, "grad_norm": 4.625, "learning_rate": 2.126762809169236e-09, "loss": 0.84811478, "memory(GiB)": 138.1, "step": 84950, "train_speed(iter/s)": 0.200945 }, { "acc": 0.7567162, "epoch": 1.982181134166436, "grad_norm": 5.875, "learning_rate": 2.072025957161605e-09, "loss": 0.86951723, "memory(GiB)": 138.1, "step": 84960, "train_speed(iter/s)": 0.200957 }, { "acc": 0.78688498, "epoch": 1.9824144417387246, "grad_norm": 5.9375, "learning_rate": 2.0180025579202844e-09, "loss": 0.77867031, "memory(GiB)": 138.1, "step": 84970, "train_speed(iter/s)": 0.200969 }, { "acc": 0.79099741, "epoch": 1.9826477493110137, "grad_norm": 3.921875, "learning_rate": 1.964692619157438e-09, "loss": 0.73870716, "memory(GiB)": 138.1, "step": 84980, "train_speed(iter/s)": 0.200981 }, { "acc": 0.77988491, "epoch": 1.9828810568833024, "grad_norm": 3.890625, "learning_rate": 1.912096148482534e-09, "loss": 0.80207052, "memory(GiB)": 138.1, "step": 84990, "train_speed(iter/s)": 0.200994 }, { "acc": 0.78414559, "epoch": 1.9831143644555915, "grad_norm": 5.3125, "learning_rate": 1.8602131534045665e-09, "loss": 0.79055219, "memory(GiB)": 138.1, "step": 85000, "train_speed(iter/s)": 0.201006 }, { "epoch": 1.9831143644555915, "eval_acc": 0.7447177626057132, "eval_loss": 0.8044196963310242, "eval_runtime": 1271.3001, "eval_samples_per_second": 28.31, "eval_steps_per_second": 14.156, "step": 85000 }, { "acc": 0.78565392, "epoch": 1.9833476720278802, "grad_norm": 6.03125, "learning_rate": 1.8090436413287226e-09, "loss": 0.76658969, "memory(GiB)": 138.1, "step": 85010, "train_speed(iter/s)": 0.200404 }, { "acc": 0.79666553, "epoch": 1.983580979600169, "grad_norm": 5.34375, "learning_rate": 1.758587619559715e-09, "loss": 0.73290777, "memory(GiB)": 138.1, "step": 85020, "train_speed(iter/s)": 0.200416 }, { "acc": 0.78800335, "epoch": 1.983814287172458, "grad_norm": 5.46875, "learning_rate": 1.7088450953006708e-09, "loss": 0.73815637, "memory(GiB)": 138.1, "step": 85030, "train_speed(iter/s)": 0.200428 }, { "acc": 0.78558655, "epoch": 1.984047594744747, "grad_norm": 5.09375, "learning_rate": 1.659816075652021e-09, "loss": 0.76726046, "memory(GiB)": 138.1, "step": 85040, "train_speed(iter/s)": 0.20044 }, { "acc": 0.79275599, "epoch": 1.9842809023170358, "grad_norm": 5.59375, "learning_rate": 1.6115005676120565e-09, "loss": 0.72810202, "memory(GiB)": 138.1, "step": 85050, "train_speed(iter/s)": 0.200452 }, { "acc": 0.78331842, "epoch": 1.9845142098893247, "grad_norm": 5.09375, "learning_rate": 1.5638985780791483e-09, "loss": 0.78223214, "memory(GiB)": 138.1, "step": 85060, "train_speed(iter/s)": 0.200463 }, { "acc": 0.77881184, "epoch": 1.9847475174616136, "grad_norm": 5.625, "learning_rate": 1.517010113847306e-09, "loss": 0.79921713, "memory(GiB)": 138.1, "step": 85070, "train_speed(iter/s)": 0.200476 }, { "acc": 0.77289662, "epoch": 1.9849808250339025, "grad_norm": 5.25, "learning_rate": 1.4708351816100641e-09, "loss": 0.80485191, "memory(GiB)": 138.1, "step": 85080, "train_speed(iter/s)": 0.200488 }, { "acc": 0.77916284, "epoch": 1.9852141326061914, "grad_norm": 4.5625, "learning_rate": 1.425373787958817e-09, "loss": 0.77773075, "memory(GiB)": 138.1, "step": 85090, "train_speed(iter/s)": 0.200501 }, { "acc": 0.79094009, "epoch": 1.9854474401784803, "grad_norm": 4.6875, "learning_rate": 1.3806259393839282e-09, "loss": 0.7560596, "memory(GiB)": 138.1, "step": 85100, "train_speed(iter/s)": 0.200513 }, { "acc": 0.77891779, "epoch": 1.9856807477507692, "grad_norm": 4.53125, "learning_rate": 1.336591642271956e-09, "loss": 0.79700637, "memory(GiB)": 138.1, "step": 85110, "train_speed(iter/s)": 0.200526 }, { "acc": 0.76863756, "epoch": 1.985914055323058, "grad_norm": 6.71875, "learning_rate": 1.2932709029100933e-09, "loss": 0.8220705, "memory(GiB)": 138.1, "step": 85120, "train_speed(iter/s)": 0.200539 }, { "acc": 0.77171407, "epoch": 1.986147362895347, "grad_norm": 5.625, "learning_rate": 1.2506637274811717e-09, "loss": 0.81749229, "memory(GiB)": 138.1, "step": 85130, "train_speed(iter/s)": 0.200551 }, { "acc": 0.78697386, "epoch": 1.986380670467636, "grad_norm": 7.96875, "learning_rate": 1.2087701220681036e-09, "loss": 0.76459389, "memory(GiB)": 138.1, "step": 85140, "train_speed(iter/s)": 0.200565 }, { "acc": 0.78453169, "epoch": 1.9866139780399248, "grad_norm": 7.8125, "learning_rate": 1.1675900926511053e-09, "loss": 0.77584524, "memory(GiB)": 138.1, "step": 85150, "train_speed(iter/s)": 0.200577 }, { "acc": 0.79571877, "epoch": 1.9868472856122137, "grad_norm": 4.5625, "learning_rate": 1.1271236451082524e-09, "loss": 0.72640424, "memory(GiB)": 138.1, "step": 85160, "train_speed(iter/s)": 0.200589 }, { "acc": 0.77864351, "epoch": 1.9870805931845026, "grad_norm": 7.5, "learning_rate": 1.0873707852160354e-09, "loss": 0.80453529, "memory(GiB)": 138.1, "step": 85170, "train_speed(iter/s)": 0.200601 }, { "acc": 0.76175656, "epoch": 1.9873139007567915, "grad_norm": 5.25, "learning_rate": 1.0483315186499143e-09, "loss": 0.84362011, "memory(GiB)": 138.1, "step": 85180, "train_speed(iter/s)": 0.200613 }, { "acc": 0.77368731, "epoch": 1.9875472083290804, "grad_norm": 3.71875, "learning_rate": 1.0100058509815435e-09, "loss": 0.80538511, "memory(GiB)": 138.1, "step": 85190, "train_speed(iter/s)": 0.200625 }, { "acc": 0.76618557, "epoch": 1.987780515901369, "grad_norm": 7.21875, "learning_rate": 9.723937876832124e-10, "loss": 0.82723331, "memory(GiB)": 138.1, "step": 85200, "train_speed(iter/s)": 0.200638 }, { "acc": 0.7858901, "epoch": 1.9880138234736582, "grad_norm": 5.0, "learning_rate": 9.354953341234042e-10, "loss": 0.75257692, "memory(GiB)": 138.1, "step": 85210, "train_speed(iter/s)": 0.20065 }, { "acc": 0.78722382, "epoch": 1.9882471310459469, "grad_norm": 5.625, "learning_rate": 8.99310495569572e-10, "loss": 0.74956484, "memory(GiB)": 138.1, "step": 85220, "train_speed(iter/s)": 0.200662 }, { "acc": 0.80738592, "epoch": 1.988480438618236, "grad_norm": 4.125, "learning_rate": 8.638392771864734e-10, "loss": 0.67543035, "memory(GiB)": 138.1, "step": 85230, "train_speed(iter/s)": 0.200675 }, { "acc": 0.79583402, "epoch": 1.9887137461905247, "grad_norm": 5.4375, "learning_rate": 8.290816840383908e-10, "loss": 0.73210516, "memory(GiB)": 138.1, "step": 85240, "train_speed(iter/s)": 0.200687 }, { "acc": 0.76078939, "epoch": 1.9889470537628138, "grad_norm": 6.4375, "learning_rate": 7.950377210863558e-10, "loss": 0.84969578, "memory(GiB)": 138.1, "step": 85250, "train_speed(iter/s)": 0.2007 }, { "acc": 0.77894077, "epoch": 1.9891803613351025, "grad_norm": 7.0625, "learning_rate": 7.617073931909247e-10, "loss": 0.79533377, "memory(GiB)": 138.1, "step": 85260, "train_speed(iter/s)": 0.200713 }, { "acc": 0.78382912, "epoch": 1.9894136689073916, "grad_norm": 5.21875, "learning_rate": 7.29090705108848e-10, "loss": 0.77288427, "memory(GiB)": 138.1, "step": 85270, "train_speed(iter/s)": 0.200725 }, { "acc": 0.75326347, "epoch": 1.9896469764796803, "grad_norm": 4.8125, "learning_rate": 6.971876614969564e-10, "loss": 0.89215813, "memory(GiB)": 138.1, "step": 85280, "train_speed(iter/s)": 0.200737 }, { "acc": 0.79840131, "epoch": 1.9898802840519694, "grad_norm": 6.84375, "learning_rate": 6.659982669093845e-10, "loss": 0.70803699, "memory(GiB)": 138.1, "step": 85290, "train_speed(iter/s)": 0.20075 }, { "acc": 0.78066187, "epoch": 1.990113591624258, "grad_norm": 4.90625, "learning_rate": 6.355225257981268e-10, "loss": 0.79505005, "memory(GiB)": 138.1, "step": 85300, "train_speed(iter/s)": 0.200762 }, { "acc": 0.78745785, "epoch": 1.9903468991965472, "grad_norm": 4.65625, "learning_rate": 6.057604425135921e-10, "loss": 0.74916191, "memory(GiB)": 138.1, "step": 85310, "train_speed(iter/s)": 0.200774 }, { "acc": 0.75778542, "epoch": 1.9905802067688358, "grad_norm": 5.96875, "learning_rate": 5.76712021304604e-10, "loss": 0.86030426, "memory(GiB)": 138.1, "step": 85320, "train_speed(iter/s)": 0.200785 }, { "acc": 0.76393342, "epoch": 1.990813514341125, "grad_norm": 6.6875, "learning_rate": 5.483772663178455e-10, "loss": 0.85037289, "memory(GiB)": 138.1, "step": 85330, "train_speed(iter/s)": 0.200798 }, { "acc": 0.77822199, "epoch": 1.9910468219134136, "grad_norm": 6.40625, "learning_rate": 5.20756181597859e-10, "loss": 0.80846586, "memory(GiB)": 138.1, "step": 85340, "train_speed(iter/s)": 0.20081 }, { "acc": 0.78142519, "epoch": 1.9912801294857028, "grad_norm": 6.0625, "learning_rate": 4.938487710870465e-10, "loss": 0.78094144, "memory(GiB)": 138.1, "step": 85350, "train_speed(iter/s)": 0.200822 }, { "acc": 0.77061892, "epoch": 1.9915134370579914, "grad_norm": 4.9375, "learning_rate": 4.676550386273349e-10, "loss": 0.82950497, "memory(GiB)": 138.1, "step": 85360, "train_speed(iter/s)": 0.200834 }, { "acc": 0.78219571, "epoch": 1.9917467446302806, "grad_norm": 6.28125, "learning_rate": 4.421749879574e-10, "loss": 0.76107216, "memory(GiB)": 138.1, "step": 85370, "train_speed(iter/s)": 0.200847 }, { "acc": 0.76850634, "epoch": 1.9919800522025692, "grad_norm": 5.09375, "learning_rate": 4.174086227148877e-10, "loss": 0.84992552, "memory(GiB)": 138.1, "step": 85380, "train_speed(iter/s)": 0.200859 }, { "acc": 0.76265931, "epoch": 1.9922133597748581, "grad_norm": 6.25, "learning_rate": 3.9335594643419294e-10, "loss": 0.84543304, "memory(GiB)": 138.1, "step": 85390, "train_speed(iter/s)": 0.200872 }, { "acc": 0.78111825, "epoch": 1.992446667347147, "grad_norm": 5.15625, "learning_rate": 3.700169625503458e-10, "loss": 0.79352822, "memory(GiB)": 138.1, "step": 85400, "train_speed(iter/s)": 0.200884 }, { "acc": 0.78446589, "epoch": 1.992679974919436, "grad_norm": 5.15625, "learning_rate": 3.4739167439346024e-10, "loss": 0.77862563, "memory(GiB)": 138.1, "step": 85410, "train_speed(iter/s)": 0.200896 }, { "acc": 0.78342505, "epoch": 1.9929132824917248, "grad_norm": 4.03125, "learning_rate": 3.254800851948403e-10, "loss": 0.78520279, "memory(GiB)": 138.1, "step": 85420, "train_speed(iter/s)": 0.200908 }, { "acc": 0.77086182, "epoch": 1.9931465900640137, "grad_norm": 5.59375, "learning_rate": 3.042821980808741e-10, "loss": 0.82681541, "memory(GiB)": 138.1, "step": 85430, "train_speed(iter/s)": 0.20092 }, { "acc": 0.76659908, "epoch": 1.9933798976363026, "grad_norm": 6.09375, "learning_rate": 2.8379801607858473e-10, "loss": 0.83367481, "memory(GiB)": 138.1, "step": 85440, "train_speed(iter/s)": 0.200933 }, { "acc": 0.77113953, "epoch": 1.9936132052085915, "grad_norm": 6.15625, "learning_rate": 2.640275421111893e-10, "loss": 0.82532921, "memory(GiB)": 138.1, "step": 85450, "train_speed(iter/s)": 0.200945 }, { "acc": 0.80559187, "epoch": 1.9938465127808804, "grad_norm": 4.71875, "learning_rate": 2.449707790019851e-10, "loss": 0.6829802, "memory(GiB)": 138.1, "step": 85460, "train_speed(iter/s)": 0.200957 }, { "acc": 0.77535009, "epoch": 1.9940798203531693, "grad_norm": 5.1875, "learning_rate": 2.266277294704633e-10, "loss": 0.80879288, "memory(GiB)": 138.1, "step": 85470, "train_speed(iter/s)": 0.20097 }, { "acc": 0.78569078, "epoch": 1.9943131279254582, "grad_norm": 7.46875, "learning_rate": 2.0899839613508499e-10, "loss": 0.76950636, "memory(GiB)": 138.1, "step": 85480, "train_speed(iter/s)": 0.200981 }, { "acc": 0.78273611, "epoch": 1.9945464354977471, "grad_norm": 7.84375, "learning_rate": 1.9208278151328087e-10, "loss": 0.78465424, "memory(GiB)": 138.1, "step": 85490, "train_speed(iter/s)": 0.200993 }, { "acc": 0.79737329, "epoch": 1.994779743070036, "grad_norm": 5.84375, "learning_rate": 1.7588088801923088e-10, "loss": 0.73678904, "memory(GiB)": 138.1, "step": 85500, "train_speed(iter/s)": 0.201006 }, { "epoch": 1.994779743070036, "eval_acc": 0.744678847557264, "eval_loss": 0.8044396638870239, "eval_runtime": 1270.037, "eval_samples_per_second": 28.339, "eval_steps_per_second": 14.17, "step": 85500 } ], "logging_steps": 10, "max_steps": 85722, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 2.0183064448975503e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }