{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "cqUj97NiBhho",
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "outputId": "58399c19-7d5d-4f3c-a123-06d31af77441"
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Mounted at /content/drive\n"
     ]
    }
   ],
   "source": [
    "# Mount Google Drive at /content/drive so the project files stored there are reachable.\n",
    "from google.colab import drive\n",
    "\n",
    "drive.mount('/content/drive')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "mudHEhjxBhh6"
   },
   "outputs": [],
   "source": [
    "# Use the %pip magic (not !pip) so the packages are installed into the\n",
    "# environment of the running kernel.\n",
    "%pip install -q tiktoken\n",
    "%pip install -q gradio"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import os\n",
    "\n",
    "# Switch the working directory to the Drive folder that holds training.ipynb;\n",
    "# presumably this is what lets the later `import model` find model.py -- TODO confirm.\n",
    "script_dir = os.path.dirname(\"/content/drive/MyDrive/S21/training.ipynb\")\n",
    "os.chdir(script_dir)"
   ],
   "metadata": {
    "id": "EsZacm8wIB_z"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# NOTE: running these imports prints device / dataset information (see this cell's\n",
    "# recorded output), presumably because `model` executes code at import time.\n",
    "from transformers import GPT2LMHeadModel\n",
    "import tiktoken\n",
    "import torch\n",
    "import gradio as gr\n",
    "import model\n",
    "from model import run_train, gen_text\n"
   ],
   "metadata": {
    "id": "4vxHQrvhB6Qe",
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "outputId": "4983528b-15f9-4c4b-e680-0b767becf221"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "using device: cuda\n",
      "loaded 338025 tokens\n",
      "1 epoch = 82 batches\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# Presumably a model checkpoint file on Drive. NOTE(review): the training cell below\n",
    "# passes a different (6thJuly2024) path instead of this constant.\n",
    "PATH = \"/content/drive/MyDrive/S21/gpt_124M_30thJune2024.pth\"\n",
    "\n",
    "\n",
    "class GPTConfig:\n",
    "    \"\"\"GPT hyper-parameters, kept as class-level defaults.\"\"\"\n",
    "\n",
    "    # NOTE: this is a plain class (no @dataclass), so the annotated names below are\n",
    "    # class attributes and keyword overrides such as GPTConfig(n_layer=6) are not supported.\n",
    "    block_size: int = 1024  # max sequence length\n",
    "    # GPT-2's tokenizer has 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|>\n",
    "    # token = 50,257 entries; 50304 is that count rounded up to a multiple of 128\n",
    "    # (presumably padding for efficiency -- TODO confirm against model.py).\n",
    "    vocab_size: int = 50304\n",
    "    n_layer: int = 12  # number of layers\n",
    "    n_head: int = 12  # number of heads\n",
    "    n_embd: int = 768  # embedding dimension"
   ],
   "metadata": {
    "id": "eYcIy_DxZE4b"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Run training for 5000 steps with 100 warm-up steps.\n",
    "# NOTE(review): this path (6thJuly2024) is not the PATH constant defined in the\n",
    "# previous cell (30thJune2024) -- confirm which file run_train is meant to use.\n",
    "model.run_train(\n",
    "    max_steps=5000,\n",
    "    warmup_steps=100,\n",
    "    PATH=\"/content/drive/MyDrive/S21/gpt_124M_6thJuly2024.pth\",\n",
    ")"
   ],
"metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tDFaZbMbJy62", "outputId": "4dbf0849-8bca-4877-a32e-c82ab2b7661d" }, "execution_count": 10, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[1;30;43mStreaming output truncated to the last 5000 lines.\u001b[0m\n", "step1 | loss: 8.945737838745117 | dt: 1390.27ms | tok/sec: 2946.19 | norm: 2.94\n", "step2 | loss: 8.977760314941406 | dt: 1410.55ms | tok/sec: 2903.84 | norm: 2.57\n", "step3 | loss: 8.760540008544922 | dt: 1415.07ms | tok/sec: 2894.56 | norm: 2.80\n", "step4 | loss: 8.788466453552246 | dt: 1417.25ms | tok/sec: 2890.11 | norm: 2.53\n", "step5 | loss: 8.553628921508789 | dt: 1414.54ms | tok/sec: 2895.64 | norm: 3.64\n", "step6 | loss: 8.549941062927246 | dt: 1412.69ms | tok/sec: 2899.43 | norm: 3.06\n", "step7 | loss: 8.620550155639648 | dt: 1425.69ms | tok/sec: 2873.00 | norm: 2.76\n", "step8 | loss: 8.458500862121582 | dt: 1422.62ms | tok/sec: 2879.20 | norm: 2.58\n", "step9 | loss: 8.644089698791504 | dt: 1425.43ms | tok/sec: 2873.52 | norm: 2.81\n", "step10 | loss: 8.387128829956055 | dt: 1434.13ms | tok/sec: 2856.10 | norm: 3.28\n", "step11 | loss: 8.398907661437988 | dt: 1440.71ms | tok/sec: 2843.03 | norm: 3.10\n", "step12 | loss: 8.482580184936523 | dt: 1438.34ms | tok/sec: 2847.73 | norm: 2.40\n", "step13 | loss: 8.50646686553955 | dt: 1444.66ms | tok/sec: 2835.28 | norm: 2.80\n", "step14 | loss: 8.394956588745117 | dt: 1456.22ms | tok/sec: 2812.76 | norm: 2.20\n", "step15 | loss: 8.208503723144531 | dt: 1453.23ms | tok/sec: 2818.55 | norm: 2.49\n", "step16 | loss: 8.26568603515625 | dt: 1455.34ms | tok/sec: 2814.46 | norm: 1.73\n", "step17 | loss: 8.168992042541504 | dt: 1473.45ms | tok/sec: 2779.87 | norm: 1.80\n", "step18 | loss: 7.985135078430176 | dt: 1476.84ms | tok/sec: 2773.48 | norm: 1.95\n", "step19 | loss: 7.830777168273926 | dt: 1467.51ms | tok/sec: 2791.12 | norm: 1.95\n", "step20 | loss: 7.762506484985352 | dt: 1483.47ms | tok/sec: 
2761.09 | norm: 1.78\n", "step21 | loss: 7.561286926269531 | dt: 1490.95ms | tok/sec: 2747.25 | norm: 1.71\n", "step22 | loss: 7.636416435241699 | dt: 1497.22ms | tok/sec: 2735.74 | norm: 1.64\n", "step23 | loss: 7.54252815246582 | dt: 1485.05ms | tok/sec: 2758.15 | norm: 1.61\n", "step24 | loss: 7.456019878387451 | dt: 1489.18ms | tok/sec: 2750.51 | norm: 1.59\n", "step25 | loss: 7.219597816467285 | dt: 1499.84ms | tok/sec: 2730.96 | norm: 2.09\n", "step26 | loss: 7.265284061431885 | dt: 1496.94ms | tok/sec: 2736.26 | norm: 1.56\n", "step27 | loss: 7.096428871154785 | dt: 1491.73ms | tok/sec: 2745.80 | norm: 1.86\n", "step28 | loss: 6.852946758270264 | dt: 1498.17ms | tok/sec: 2733.99 | norm: 1.43\n", "step29 | loss: 6.851794242858887 | dt: 1487.07ms | tok/sec: 2754.42 | norm: 2.58\n", "step30 | loss: 6.868936538696289 | dt: 1491.01ms | tok/sec: 2747.14 | norm: 1.24\n", "step31 | loss: 6.6105780601501465 | dt: 1477.43ms | tok/sec: 2772.38 | norm: 1.66\n", "step32 | loss: 6.565546989440918 | dt: 1465.75ms | tok/sec: 2794.47 | norm: 1.45\n", "step33 | loss: 6.659306526184082 | dt: 1476.80ms | tok/sec: 2773.56 | norm: 1.40\n", "step34 | loss: 6.500646591186523 | dt: 1467.93ms | tok/sec: 2790.33 | norm: 1.21\n", "step35 | loss: 6.318203449249268 | dt: 1469.53ms | tok/sec: 2787.29 | norm: 1.17\n", "step36 | loss: 6.303432941436768 | dt: 1461.20ms | tok/sec: 2803.17 | norm: 1.80\n", "step37 | loss: 6.216024875640869 | dt: 1464.40ms | tok/sec: 2797.06 | norm: 1.17\n", "step38 | loss: 6.080145835876465 | dt: 1446.36ms | tok/sec: 2831.94 | norm: 1.26\n", "step39 | loss: 6.164088249206543 | dt: 1450.68ms | tok/sec: 2823.51 | norm: 1.32\n", "step40 | loss: 6.073040008544922 | dt: 1450.30ms | tok/sec: 2824.25 | norm: 1.42\n", "step41 | loss: 6.536880970001221 | dt: 1446.06ms | tok/sec: 2832.52 | norm: 2.25\n", "step42 | loss: 6.4170122146606445 | dt: 1438.22ms | tok/sec: 2847.96 | norm: 1.28\n", "step43 | loss: 6.219386100769043 | dt: 1441.54ms | tok/sec: 2841.41 | norm: 
1.30\n", "step44 | loss: 6.305191516876221 | dt: 1441.30ms | tok/sec: 2841.88 | norm: 1.48\n", "step45 | loss: 6.515499114990234 | dt: 1435.57ms | tok/sec: 2853.22 | norm: 1.04\n", "step46 | loss: 6.462535858154297 | dt: 1441.71ms | tok/sec: 2841.06 | norm: 1.79\n", "step47 | loss: 6.162384033203125 | dt: 1428.64ms | tok/sec: 2867.06 | norm: 1.94\n", "step48 | loss: 6.246455192565918 | dt: 1433.18ms | tok/sec: 2857.98 | norm: 1.40\n", "step49 | loss: 6.158731460571289 | dt: 1432.28ms | tok/sec: 2859.78 | norm: 1.22\n", "step50 | loss: 6.084151744842529 | dt: 1429.72ms | tok/sec: 2864.91 | norm: 1.17\n", "step51 | loss: 6.224058151245117 | dt: 1430.90ms | tok/sec: 2862.54 | norm: 1.30\n", "step52 | loss: 6.187849521636963 | dt: 1427.47ms | tok/sec: 2869.42 | norm: 6.30\n", "step53 | loss: 5.909010887145996 | dt: 1426.86ms | tok/sec: 2870.65 | norm: 1.89\n", "step54 | loss: 6.0662455558776855 | dt: 1427.74ms | tok/sec: 2868.88 | norm: 1.61\n", "step55 | loss: 6.295223236083984 | dt: 1425.00ms | tok/sec: 2874.38 | norm: 2.16\n", "step56 | loss: 6.134381294250488 | dt: 1418.66ms | tok/sec: 2887.23 | norm: 1.47\n", "step57 | loss: 6.069981575012207 | dt: 1430.84ms | tok/sec: 2862.66 | norm: 1.46\n", "step58 | loss: 5.78688383102417 | dt: 1435.64ms | tok/sec: 2853.09 | norm: 1.61\n", "step59 | loss: 5.608602523803711 | dt: 1416.77ms | tok/sec: 2891.09 | norm: 1.21\n", "step60 | loss: 6.1430535316467285 | dt: 1423.67ms | tok/sec: 2877.07 | norm: 1.49\n", "step61 | loss: 6.061432838439941 | dt: 1425.89ms | tok/sec: 2872.59 | norm: 1.58\n", "step62 | loss: 5.85019588470459 | dt: 1431.69ms | tok/sec: 2860.96 | norm: 1.64\n", "step63 | loss: 5.876503944396973 | dt: 1433.67ms | tok/sec: 2857.00 | norm: 2.30\n", "step64 | loss: 6.037599563598633 | dt: 1431.35ms | tok/sec: 2861.63 | norm: 1.48\n", "step65 | loss: 5.862331390380859 | dt: 1437.71ms | tok/sec: 2848.97 | norm: 4.00\n", "step66 | loss: 5.517021179199219 | dt: 1436.26ms | tok/sec: 2851.85 | norm: 1.94\n", "step67 | 
loss: 5.525525093078613 | dt: 1431.22ms | tok/sec: 2861.90 | norm: 1.64\n", "step68 | loss: 5.451373100280762 | dt: 1438.44ms | tok/sec: 2847.53 | norm: 1.29\n", "step69 | loss: 6.241055965423584 | dt: 1446.71ms | tok/sec: 2831.26 | norm: 2.36\n", "step70 | loss: 6.067675590515137 | dt: 1434.42ms | tok/sec: 2855.51 | norm: 1.96\n", "step71 | loss: 6.129344940185547 | dt: 1441.95ms | tok/sec: 2840.60 | norm: 1.34\n", "step72 | loss: 6.067255973815918 | dt: 1444.00ms | tok/sec: 2836.56 | norm: 1.58\n", "step73 | loss: 6.068198204040527 | dt: 1451.99ms | tok/sec: 2820.96 | norm: 1.41\n", "step74 | loss: 6.027049541473389 | dt: 1453.50ms | tok/sec: 2818.03 | norm: 1.53\n", "step75 | loss: 5.701395511627197 | dt: 1438.34ms | tok/sec: 2847.72 | norm: 1.56\n", "step76 | loss: 5.476472854614258 | dt: 1450.10ms | tok/sec: 2824.64 | norm: 1.45\n", "step77 | loss: 5.424685955047607 | dt: 1445.20ms | tok/sec: 2834.20 | norm: 1.29\n", "step78 | loss: 5.435624599456787 | dt: 1451.05ms | tok/sec: 2822.78 | norm: 1.37\n", "step79 | loss: 5.522193908691406 | dt: 1463.53ms | tok/sec: 2798.72 | norm: 1.32\n", "step80 | loss: 5.379876613616943 | dt: 1445.86ms | tok/sec: 2832.92 | norm: 1.76\n", "step81 | loss: 5.594132423400879 | dt: 1454.80ms | tok/sec: 2815.51 | norm: 1.21\n", "step82 | loss: 5.6916656494140625 | dt: 1457.04ms | tok/sec: 2811.17 | norm: 1.64\n", "step83 | loss: 5.813674449920654 | dt: 1461.42ms | tok/sec: 2802.76 | norm: 2.01\n", "step84 | loss: 5.7820658683776855 | dt: 1457.80ms | tok/sec: 2809.72 | norm: 2.34\n", "step85 | loss: 5.621419906616211 | dt: 1468.19ms | tok/sec: 2789.82 | norm: 1.56\n", "step86 | loss: 5.5934247970581055 | dt: 1465.05ms | tok/sec: 2795.81 | norm: 1.61\n", "step87 | loss: 5.331068515777588 | dt: 1454.28ms | tok/sec: 2816.51 | norm: 1.37\n", "step88 | loss: 5.331151008605957 | dt: 1454.47ms | tok/sec: 2816.15 | norm: 2.18\n", "step89 | loss: 5.572970867156982 | dt: 1454.95ms | tok/sec: 2815.22 | norm: 1.60\n", "step90 | loss: 
5.447515487670898 | dt: 1466.25ms | tok/sec: 2793.53 | norm: 1.87\n", "step91 | loss: 5.635876178741455 | dt: 1466.74ms | tok/sec: 2792.58 | norm: 1.83\n", "step92 | loss: 5.310215473175049 | dt: 1447.74ms | tok/sec: 2829.24 | norm: 3.47\n", "step93 | loss: 5.679214954376221 | dt: 1457.87ms | tok/sec: 2809.58 | norm: 1.68\n", "step94 | loss: 5.732431411743164 | dt: 1459.20ms | tok/sec: 2807.01 | norm: 1.31\n", "step95 | loss: 5.836301803588867 | dt: 1455.36ms | tok/sec: 2814.42 | norm: 1.23\n", "step96 | loss: 5.715570449829102 | dt: 1452.90ms | tok/sec: 2819.19 | norm: 1.24\n", "step97 | loss: 5.64725399017334 | dt: 1452.58ms | tok/sec: 2819.82 | norm: 2.47\n", "step98 | loss: 5.652454376220703 | dt: 1455.55ms | tok/sec: 2814.06 | norm: 2.43\n", "step99 | loss: 5.556742191314697 | dt: 1453.73ms | tok/sec: 2817.59 | norm: 1.21\n", "step100 | loss: 5.479020118713379 | dt: 1453.52ms | tok/sec: 2817.99 | norm: 1.47\n", "step101 | loss: 5.273493766784668 | dt: 1449.33ms | tok/sec: 2826.13 | norm: 1.39\n", "step102 | loss: 5.139571189880371 | dt: 1455.69ms | tok/sec: 2813.78 | norm: 1.44\n", "step103 | loss: 5.500404357910156 | dt: 1441.16ms | tok/sec: 2842.16 | norm: 1.47\n", "step104 | loss: 5.660423278808594 | dt: 1444.52ms | tok/sec: 2835.55 | norm: 1.27\n", "step105 | loss: 5.7075347900390625 | dt: 1449.79ms | tok/sec: 2825.24 | norm: 1.13\n", "step106 | loss: 5.6637773513793945 | dt: 1443.62ms | tok/sec: 2837.31 | norm: 1.41\n", "step107 | loss: 5.531773567199707 | dt: 1442.64ms | tok/sec: 2839.24 | norm: 1.47\n", "step108 | loss: 5.708502769470215 | dt: 1449.30ms | tok/sec: 2826.19 | norm: 1.24\n", "step109 | loss: 5.505993843078613 | dt: 1453.30ms | tok/sec: 2818.42 | norm: 1.21\n", "step110 | loss: 5.276731014251709 | dt: 1446.37ms | tok/sec: 2831.91 | norm: 1.43\n", "step111 | loss: 5.469809532165527 | dt: 1448.57ms | tok/sec: 2827.61 | norm: 1.26\n", "step112 | loss: 5.597436428070068 | dt: 1437.37ms | tok/sec: 2849.64 | norm: 1.16\n", "step113 | loss: 
5.286303520202637 | dt: 1443.77ms | tok/sec: 2837.01 | norm: 1.27\n", "step114 | loss: 5.252491474151611 | dt: 1450.50ms | tok/sec: 2823.86 | norm: 1.10\n", "step115 | loss: 5.537644386291504 | dt: 1444.89ms | tok/sec: 2834.81 | norm: 1.00\n", "step116 | loss: 5.348674774169922 | dt: 1449.27ms | tok/sec: 2826.26 | norm: 0.98\n", "step117 | loss: 5.232497215270996 | dt: 1444.64ms | tok/sec: 2835.30 | norm: 1.16\n", "step118 | loss: 5.144531726837158 | dt: 1450.97ms | tok/sec: 2822.95 | norm: 1.24\n", "step119 | loss: 5.234447956085205 | dt: 1437.53ms | tok/sec: 2849.33 | norm: 0.99\n", "step120 | loss: 5.129011154174805 | dt: 1437.60ms | tok/sec: 2849.18 | norm: 1.00\n", "step121 | loss: 5.345563888549805 | dt: 1446.43ms | tok/sec: 2831.80 | norm: 1.09\n", "step122 | loss: 5.112115859985352 | dt: 1440.04ms | tok/sec: 2844.36 | norm: 1.03\n", "step123 | loss: 5.626099109649658 | dt: 1439.68ms | tok/sec: 2845.08 | norm: 1.52\n", "step124 | loss: 5.553160667419434 | dt: 1446.95ms | tok/sec: 2830.78 | norm: 1.50\n", "step125 | loss: 5.379629611968994 | dt: 1443.85ms | tok/sec: 2836.87 | norm: 1.41\n", "step126 | loss: 5.528325080871582 | dt: 1437.40ms | tok/sec: 2849.59 | norm: 1.31\n", "step127 | loss: 5.860466003417969 | dt: 1449.36ms | tok/sec: 2826.08 | norm: 1.26\n", "step128 | loss: 5.7655720710754395 | dt: 1439.63ms | tok/sec: 2845.17 | norm: 1.23\n", "step129 | loss: 5.402417182922363 | dt: 1447.43ms | tok/sec: 2829.85 | norm: 1.23\n", "step130 | loss: 5.556426048278809 | dt: 1444.88ms | tok/sec: 2834.84 | norm: 1.32\n", "step131 | loss: 5.52518367767334 | dt: 1442.11ms | tok/sec: 2840.29 | norm: 1.28\n", "step132 | loss: 5.429701805114746 | dt: 1441.55ms | tok/sec: 2841.38 | norm: 1.17\n", "step133 | loss: 5.367312431335449 | dt: 1441.42ms | tok/sec: 2841.65 | norm: 1.27\n", "step134 | loss: 5.338177680969238 | dt: 1441.62ms | tok/sec: 2841.24 | norm: 1.43\n", "step135 | loss: 5.108290195465088 | dt: 1450.46ms | tok/sec: 2823.93 | norm: 1.34\n", "step136 | 
loss: 5.316558837890625 | dt: 1452.87ms | tok/sec: 2819.25 | norm: 1.43\n", "step137 | loss: 5.428184986114502 | dt: 1452.26ms | tok/sec: 2820.43 | norm: 1.45\n", "step138 | loss: 5.291104316711426 | dt: 1439.93ms | tok/sec: 2844.59 | norm: 1.37\n", "step139 | loss: 5.25390625 | dt: 1447.56ms | tok/sec: 2829.58 | norm: 1.22\n", "step140 | loss: 5.011916160583496 | dt: 1449.17ms | tok/sec: 2826.45 | norm: 1.26\n", "step141 | loss: 4.93460750579834 | dt: 1440.92ms | tok/sec: 2842.63 | norm: 1.17\n", "step142 | loss: 5.469920635223389 | dt: 1451.76ms | tok/sec: 2821.40 | norm: 1.26\n", "step143 | loss: 5.324513912200928 | dt: 1448.64ms | tok/sec: 2827.47 | norm: 1.26\n", "step144 | loss: 5.080223083496094 | dt: 1442.03ms | tok/sec: 2840.45 | norm: 1.35\n", "step145 | loss: 5.062565803527832 | dt: 1441.52ms | tok/sec: 2841.45 | norm: 1.58\n", "step146 | loss: 5.29072380065918 | dt: 1448.49ms | tok/sec: 2827.78 | norm: 1.18\n", "step147 | loss: 5.143205642700195 | dt: 1439.06ms | tok/sec: 2846.30 | norm: 1.21\n", "step148 | loss: 4.825336456298828 | dt: 1450.91ms | tok/sec: 2823.05 | norm: 1.08\n", "step149 | loss: 4.872326374053955 | dt: 1453.17ms | tok/sec: 2818.67 | norm: 1.21\n", "step150 | loss: 4.8494062423706055 | dt: 1440.46ms | tok/sec: 2843.54 | norm: 1.15\n", "step151 | loss: 5.503891944885254 | dt: 1443.64ms | tok/sec: 2837.26 | norm: 1.67\n", "step152 | loss: 5.354808807373047 | dt: 1451.60ms | tok/sec: 2821.72 | norm: 1.67\n", "step153 | loss: 5.273574352264404 | dt: 1445.21ms | tok/sec: 2834.18 | norm: 1.46\n", "step154 | loss: 5.258111476898193 | dt: 1446.64ms | tok/sec: 2831.39 | norm: 1.43\n", "step155 | loss: 5.2853803634643555 | dt: 1442.69ms | tok/sec: 2839.14 | norm: 1.33\n", "step156 | loss: 5.245497226715088 | dt: 1450.17ms | tok/sec: 2824.50 | norm: 1.30\n", "step157 | loss: 4.9587788581848145 | dt: 1451.49ms | tok/sec: 2821.93 | norm: 1.21\n", "step158 | loss: 4.743156909942627 | dt: 1439.29ms | tok/sec: 2845.85 | norm: 1.50\n", "step159 | 
loss: 4.782686233520508 | dt: 1452.13ms | tok/sec: 2820.68 | norm: 1.06\n", "step160 | loss: 4.800837516784668 | dt: 1453.40ms | tok/sec: 2818.21 | norm: 1.09\n", "step161 | loss: 4.930896282196045 | dt: 1449.95ms | tok/sec: 2824.92 | norm: 1.10\n", "step162 | loss: 4.887170791625977 | dt: 1439.17ms | tok/sec: 2846.09 | norm: 0.92\n", "step163 | loss: 5.087141990661621 | dt: 1450.76ms | tok/sec: 2823.34 | norm: 1.07\n", "step164 | loss: 5.041522979736328 | dt: 1453.57ms | tok/sec: 2817.90 | norm: 1.15\n", "step165 | loss: 4.950073719024658 | dt: 1449.70ms | tok/sec: 2825.42 | norm: 1.86\n", "step166 | loss: 4.985386371612549 | dt: 1441.39ms | tok/sec: 2841.71 | norm: 1.46\n", "step167 | loss: 4.986093521118164 | dt: 1455.95ms | tok/sec: 2813.29 | norm: 1.59\n", "step168 | loss: 4.9492411613464355 | dt: 1455.57ms | tok/sec: 2814.02 | norm: 1.10\n", "step169 | loss: 4.657823085784912 | dt: 1455.70ms | tok/sec: 2813.77 | norm: 1.25\n", "step170 | loss: 4.7482805252075195 | dt: 1449.59ms | tok/sec: 2825.62 | norm: 1.09\n", "step171 | loss: 5.054223537445068 | dt: 1453.51ms | tok/sec: 2818.00 | norm: 1.15\n", "step172 | loss: 4.766578674316406 | dt: 1454.58ms | tok/sec: 2815.93 | norm: 1.32\n", "step173 | loss: 5.013449668884277 | dt: 1447.29ms | tok/sec: 2830.11 | norm: 1.19\n", "step174 | loss: 4.688421249389648 | dt: 1452.83ms | tok/sec: 2819.32 | norm: 1.59\n", "step175 | loss: 5.17453145980835 | dt: 1449.82ms | tok/sec: 2825.18 | norm: 0.97\n", "step176 | loss: 5.233617782592773 | dt: 1454.26ms | tok/sec: 2816.55 | norm: 1.01\n", "step177 | loss: 5.361075401306152 | dt: 1448.91ms | tok/sec: 2826.95 | norm: 1.14\n", "step178 | loss: 5.304369926452637 | dt: 1447.93ms | tok/sec: 2828.86 | norm: 1.17\n", "step179 | loss: 5.300342082977295 | dt: 1456.97ms | tok/sec: 2811.31 | norm: 1.12\n", "step180 | loss: 5.314785957336426 | dt: 1450.71ms | tok/sec: 2823.44 | norm: 0.97\n", "step181 | loss: 5.293108940124512 | dt: 1445.02ms | tok/sec: 2834.56 | norm: 0.98\n", "step182 
| loss: 5.268526554107666 | dt: 1453.93ms | tok/sec: 2817.20 | norm: 1.01\n", "step183 | loss: 5.044040679931641 | dt: 1450.07ms | tok/sec: 2824.70 | norm: 0.95\n", "step184 | loss: 4.872367858886719 | dt: 1440.45ms | tok/sec: 2843.55 | norm: 1.20\n", "step185 | loss: 5.184866428375244 | dt: 1448.00ms | tok/sec: 2828.74 | norm: 1.37\n", "step186 | loss: 5.295458793640137 | dt: 1454.94ms | tok/sec: 2815.23 | norm: 1.16\n", "step187 | loss: 5.375043869018555 | dt: 1455.22ms | tok/sec: 2814.69 | norm: 1.01\n", "step188 | loss: 5.317579746246338 | dt: 1454.48ms | tok/sec: 2816.12 | norm: 1.13\n", "step189 | loss: 5.168277740478516 | dt: 1453.35ms | tok/sec: 2818.32 | norm: 1.38\n", "step190 | loss: 5.402000904083252 | dt: 1452.41ms | tok/sec: 2820.14 | norm: 1.17\n", "step191 | loss: 5.204967021942139 | dt: 1439.37ms | tok/sec: 2845.69 | norm: 1.19\n", "step192 | loss: 4.981260299682617 | dt: 1450.10ms | tok/sec: 2824.62 | norm: 1.22\n", "step193 | loss: 5.175750732421875 | dt: 1450.48ms | tok/sec: 2823.90 | norm: 1.11\n", "step194 | loss: 5.324488639831543 | dt: 1449.33ms | tok/sec: 2826.14 | norm: 1.06\n", "step195 | loss: 5.019986629486084 | dt: 1447.24ms | tok/sec: 2830.22 | norm: 1.12\n", "step196 | loss: 5.015951156616211 | dt: 1451.31ms | tok/sec: 2822.29 | norm: 1.02\n", "step197 | loss: 5.309602737426758 | dt: 1452.08ms | tok/sec: 2820.79 | norm: 1.00\n", "step198 | loss: 5.12251091003418 | dt: 1447.72ms | tok/sec: 2829.27 | norm: 1.00\n", "step199 | loss: 5.0004377365112305 | dt: 1443.59ms | tok/sec: 2837.37 | norm: 1.13\n", "step200 | loss: 4.8574419021606445 | dt: 1455.42ms | tok/sec: 2814.31 | norm: 1.30\n", "step201 | loss: 4.9786152839660645 | dt: 1449.47ms | tok/sec: 2825.87 | norm: 1.02\n", "step202 | loss: 4.86368989944458 | dt: 1454.35ms | tok/sec: 2816.39 | norm: 1.01\n", "step203 | loss: 5.067568302154541 | dt: 1436.38ms | tok/sec: 2851.62 | norm: 1.00\n", "step204 | loss: 4.844322204589844 | dt: 1451.32ms | tok/sec: 2822.26 | norm: 1.00\n", 
"step205 | loss: 5.368171691894531 | dt: 1440.22ms | tok/sec: 2844.01 | norm: 1.48\n", "step206 | loss: 5.302802562713623 | dt: 1453.90ms | tok/sec: 2817.25 | norm: 1.42\n", "step207 | loss: 5.1223978996276855 | dt: 1455.85ms | tok/sec: 2813.47 | norm: 1.41\n", "step208 | loss: 5.313565731048584 | dt: 1447.71ms | tok/sec: 2829.29 | norm: 1.29\n", "step209 | loss: 5.629851341247559 | dt: 1454.28ms | tok/sec: 2816.50 | norm: 1.24\n", "step210 | loss: 5.508889198303223 | dt: 1450.40ms | tok/sec: 2824.05 | norm: 1.20\n", "step211 | loss: 5.158907413482666 | dt: 1451.83ms | tok/sec: 2821.27 | norm: 1.13\n", "step212 | loss: 5.268360614776611 | dt: 1450.26ms | tok/sec: 2824.32 | norm: 1.19\n", "step213 | loss: 5.27314567565918 | dt: 1443.69ms | tok/sec: 2837.18 | norm: 1.09\n", "step214 | loss: 5.151209354400635 | dt: 1453.24ms | tok/sec: 2818.53 | norm: 0.96\n", "step215 | loss: 5.079079627990723 | dt: 1447.94ms | tok/sec: 2828.85 | norm: 1.51\n", "step216 | loss: 5.029804706573486 | dt: 1450.61ms | tok/sec: 2823.63 | norm: 1.51\n", "step217 | loss: 4.786205768585205 | dt: 1455.49ms | tok/sec: 2814.17 | norm: 1.37\n", "step218 | loss: 5.012978553771973 | dt: 1443.10ms | tok/sec: 2838.33 | norm: 1.51\n", "step219 | loss: 5.053293228149414 | dt: 1454.77ms | tok/sec: 2815.56 | norm: 1.55\n", "step220 | loss: 4.939119815826416 | dt: 1445.77ms | tok/sec: 2833.09 | norm: 1.50\n", "step221 | loss: 4.9059295654296875 | dt: 1459.22ms | tok/sec: 2806.99 | norm: 1.25\n", "step222 | loss: 4.677852153778076 | dt: 1454.14ms | tok/sec: 2816.79 | norm: 1.43\n", "step223 | loss: 4.62007999420166 | dt: 1447.32ms | tok/sec: 2830.06 | norm: 1.20\n", "step224 | loss: 5.282585144042969 | dt: 1451.35ms | tok/sec: 2822.21 | norm: 1.38\n", "step225 | loss: 5.0619988441467285 | dt: 1451.42ms | tok/sec: 2822.07 | norm: 1.33\n", "step226 | loss: 4.8299431800842285 | dt: 1452.71ms | tok/sec: 2819.55 | norm: 1.51\n", "step227 | loss: 4.821722984313965 | dt: 1452.27ms | tok/sec: 2820.41 | norm: 
1.63\n", "step228 | loss: 5.030673027038574 | dt: 1453.36ms | tok/sec: 2818.30 | norm: 1.18\n", "step229 | loss: 4.916889667510986 | dt: 1455.18ms | tok/sec: 2814.78 | norm: 1.22\n", "step230 | loss: 4.607572078704834 | dt: 1452.80ms | tok/sec: 2819.39 | norm: 1.43\n", "step231 | loss: 4.6107683181762695 | dt: 1448.57ms | tok/sec: 2827.61 | norm: 1.23\n", "step232 | loss: 4.617313861846924 | dt: 1448.98ms | tok/sec: 2826.82 | norm: 1.18\n", "step233 | loss: 5.343664169311523 | dt: 1443.95ms | tok/sec: 2836.66 | norm: 1.49\n", "step234 | loss: 5.193075656890869 | dt: 1453.67ms | tok/sec: 2817.71 | norm: 1.74\n", "step235 | loss: 5.103072643280029 | dt: 1453.98ms | tok/sec: 2817.09 | norm: 1.44\n", "step236 | loss: 5.075605392456055 | dt: 1445.70ms | tok/sec: 2833.22 | norm: 1.32\n", "step237 | loss: 5.115041255950928 | dt: 1454.77ms | tok/sec: 2815.56 | norm: 1.25\n", "step238 | loss: 5.075817584991455 | dt: 1455.57ms | tok/sec: 2814.01 | norm: 1.32\n", "step239 | loss: 4.808104515075684 | dt: 1451.63ms | tok/sec: 2821.65 | norm: 1.18\n", "step240 | loss: 4.576166152954102 | dt: 1456.38ms | tok/sec: 2812.46 | norm: 1.51\n", "step241 | loss: 4.620969295501709 | dt: 1445.46ms | tok/sec: 2833.71 | norm: 1.04\n", "step242 | loss: 4.625289440155029 | dt: 1441.21ms | tok/sec: 2842.06 | norm: 1.13\n", "step243 | loss: 4.7604660987854 | dt: 1451.89ms | tok/sec: 2821.16 | norm: 1.18\n", "step244 | loss: 4.732614517211914 | dt: 1454.62ms | tok/sec: 2815.85 | norm: 0.96\n", "step245 | loss: 4.946151256561279 | dt: 1451.72ms | tok/sec: 2821.47 | norm: 1.10\n", "step246 | loss: 4.890689373016357 | dt: 1450.21ms | tok/sec: 2824.41 | norm: 1.03\n", "step247 | loss: 4.791741847991943 | dt: 1460.90ms | tok/sec: 2803.75 | norm: 1.68\n", "step248 | loss: 4.801576614379883 | dt: 1455.20ms | tok/sec: 2814.74 | norm: 1.31\n", "step249 | loss: 4.8141021728515625 | dt: 1451.77ms | tok/sec: 2821.39 | norm: 1.29\n", "step250 | loss: 4.7890305519104 | dt: 1452.17ms | tok/sec: 2820.60 | norm: 
1.10\n", "step251 | loss: 4.46976900100708 | dt: 1458.42ms | tok/sec: 2808.52 | norm: 1.30\n", "step252 | loss: 4.600205898284912 | dt: 1454.67ms | tok/sec: 2815.76 | norm: 1.19\n", "step253 | loss: 4.912721633911133 | dt: 1452.54ms | tok/sec: 2819.89 | norm: 1.18\n", "step254 | loss: 4.567456245422363 | dt: 1455.87ms | tok/sec: 2813.44 | norm: 1.38\n", "step255 | loss: 4.816335678100586 | dt: 1450.28ms | tok/sec: 2824.29 | norm: 1.18\n", "step256 | loss: 4.471263408660889 | dt: 1447.25ms | tok/sec: 2830.19 | norm: 1.37\n", "step257 | loss: 5.022769927978516 | dt: 1453.00ms | tok/sec: 2818.99 | norm: 1.11\n", "step258 | loss: 5.070857048034668 | dt: 1451.40ms | tok/sec: 2822.11 | norm: 1.15\n", "step259 | loss: 5.1955156326293945 | dt: 1457.56ms | tok/sec: 2810.18 | norm: 1.12\n", "step260 | loss: 5.142659664154053 | dt: 1455.14ms | tok/sec: 2814.85 | norm: 1.20\n", "step261 | loss: 5.122988700866699 | dt: 1446.02ms | tok/sec: 2832.60 | norm: 1.24\n", "step262 | loss: 5.126574993133545 | dt: 1460.57ms | tok/sec: 2804.39 | norm: 1.15\n", "step263 | loss: 5.112588882446289 | dt: 1461.60ms | tok/sec: 2802.41 | norm: 1.08\n", "step264 | loss: 5.089587688446045 | dt: 1458.64ms | tok/sec: 2808.09 | norm: 1.28\n", "step265 | loss: 4.853396415710449 | dt: 1454.54ms | tok/sec: 2816.02 | norm: 1.18\n", "step266 | loss: 4.658661365509033 | dt: 1457.19ms | tok/sec: 2810.90 | norm: 1.24\n", "step267 | loss: 4.975258827209473 | dt: 1455.40ms | tok/sec: 2814.35 | norm: 1.23\n", "step268 | loss: 5.076679706573486 | dt: 1453.75ms | tok/sec: 2817.54 | norm: 1.09\n", "step269 | loss: 5.163875102996826 | dt: 1449.30ms | tok/sec: 2826.19 | norm: 1.24\n", "step270 | loss: 5.069171905517578 | dt: 1449.16ms | tok/sec: 2826.46 | norm: 1.15\n", "step271 | loss: 4.9300336837768555 | dt: 1456.52ms | tok/sec: 2812.18 | norm: 1.21\n", "step272 | loss: 5.16623592376709 | dt: 1450.00ms | tok/sec: 2824.84 | norm: 1.19\n", "step273 | loss: 4.969892978668213 | dt: 1456.92ms | tok/sec: 2811.41 | 
norm: 1.16\n", "step274 | loss: 4.743776321411133 | dt: 1453.87ms | tok/sec: 2817.30 | norm: 1.24\n", "step275 | loss: 4.958178520202637 | dt: 1449.32ms | tok/sec: 2826.16 | norm: 1.13\n", "step276 | loss: 5.122068405151367 | dt: 1457.80ms | tok/sec: 2809.72 | norm: 1.13\n", "step277 | loss: 4.8242411613464355 | dt: 1455.41ms | tok/sec: 2814.32 | norm: 1.48\n", "step278 | loss: 4.82649564743042 | dt: 1451.96ms | tok/sec: 2821.02 | norm: 1.15\n", "step279 | loss: 5.146365642547607 | dt: 1446.34ms | tok/sec: 2831.97 | norm: 1.06\n", "step280 | loss: 4.9463090896606445 | dt: 1459.19ms | tok/sec: 2807.04 | norm: 1.07\n", "step281 | loss: 4.831483364105225 | dt: 1443.52ms | tok/sec: 2837.51 | norm: 1.09\n", "step282 | loss: 4.648527145385742 | dt: 1450.11ms | tok/sec: 2824.60 | norm: 1.49\n", "step283 | loss: 4.785009384155273 | dt: 1452.55ms | tok/sec: 2819.87 | norm: 1.10\n", "step284 | loss: 4.676198959350586 | dt: 1454.99ms | tok/sec: 2815.13 | norm: 0.98\n", "step285 | loss: 4.867419719696045 | dt: 1453.87ms | tok/sec: 2817.30 | norm: 1.03\n", "step286 | loss: 4.652552127838135 | dt: 1453.09ms | tok/sec: 2818.82 | norm: 1.09\n", "step287 | loss: 5.209376811981201 | dt: 1453.76ms | tok/sec: 2817.51 | norm: 1.52\n", "step288 | loss: 5.13571834564209 | dt: 1461.55ms | tok/sec: 2802.50 | norm: 1.43\n", "step289 | loss: 4.954739570617676 | dt: 1454.87ms | tok/sec: 2815.37 | norm: 1.36\n", "step290 | loss: 5.1656880378723145 | dt: 1450.71ms | tok/sec: 2823.45 | norm: 1.26\n", "step291 | loss: 5.484488010406494 | dt: 1457.86ms | tok/sec: 2809.60 | norm: 1.35\n", "step292 | loss: 5.352728366851807 | dt: 1444.02ms | tok/sec: 2836.53 | norm: 1.26\n", "step293 | loss: 4.992597579956055 | dt: 1449.76ms | tok/sec: 2825.29 | norm: 1.16\n", "step294 | loss: 5.093385696411133 | dt: 1445.27ms | tok/sec: 2834.07 | norm: 1.27\n", "step295 | loss: 5.115419387817383 | dt: 1453.18ms | tok/sec: 2818.64 | norm: 1.15\n", "step296 | loss: 4.9814534187316895 | dt: 1446.77ms | tok/sec: 
2831.13 | norm: 1.05\n", "step297 | loss: 4.942039489746094 | dt: 1449.03ms | tok/sec: 2826.73 | norm: 1.64\n", "step298 | loss: 4.838765621185303 | dt: 1447.54ms | tok/sec: 2829.62 | norm: 1.62\n", "step299 | loss: 4.594645977020264 | dt: 1445.33ms | tok/sec: 2833.96 | norm: 1.40\n", "step300 | loss: 4.830693244934082 | dt: 1459.43ms | tok/sec: 2806.58 | norm: 1.47\n", "step301 | loss: 4.883988380432129 | dt: 1445.85ms | tok/sec: 2832.94 | norm: 1.64\n", "step302 | loss: 4.770143985748291 | dt: 1451.72ms | tok/sec: 2821.48 | norm: 1.53\n", "step303 | loss: 4.735522270202637 | dt: 1446.31ms | tok/sec: 2832.04 | norm: 1.14\n", "step304 | loss: 4.515258312225342 | dt: 1444.85ms | tok/sec: 2834.91 | norm: 1.25\n", "step305 | loss: 4.461855411529541 | dt: 1441.48ms | tok/sec: 2841.51 | norm: 1.19\n", "step306 | loss: 5.13619327545166 | dt: 1447.54ms | tok/sec: 2829.64 | norm: 1.53\n", "step307 | loss: 4.8986382484436035 | dt: 1453.52ms | tok/sec: 2817.99 | norm: 1.55\n", "step308 | loss: 4.653582572937012 | dt: 1451.52ms | tok/sec: 2821.87 | norm: 1.47\n", "step309 | loss: 4.645441055297852 | dt: 1457.01ms | tok/sec: 2811.24 | norm: 1.80\n", "step310 | loss: 4.848757266998291 | dt: 1452.31ms | tok/sec: 2820.33 | norm: 1.36\n", "step311 | loss: 4.750953197479248 | dt: 1454.08ms | tok/sec: 2816.90 | norm: 1.28\n", "step312 | loss: 4.4546918869018555 | dt: 1453.30ms | tok/sec: 2818.41 | norm: 1.63\n", "step313 | loss: 4.435152530670166 | dt: 1450.45ms | tok/sec: 2823.94 | norm: 1.65\n", "step314 | loss: 4.442281246185303 | dt: 1455.91ms | tok/sec: 2813.35 | norm: 1.31\n", "step315 | loss: 5.203556060791016 | dt: 1449.47ms | tok/sec: 2825.85 | norm: 1.53\n", "step316 | loss: 5.053839206695557 | dt: 1448.57ms | tok/sec: 2827.62 | norm: 1.47\n", "step317 | loss: 4.96637487411499 | dt: 1452.77ms | tok/sec: 2819.44 | norm: 1.47\n", "step318 | loss: 4.922221660614014 | dt: 1444.26ms | tok/sec: 2836.05 | norm: 1.43\n", "step319 | loss: 4.982655048370361 | dt: 1447.82ms | 
tok/sec: 2829.09 | norm: 1.38\n", "step320 | loss: 4.942850589752197 | dt: 1452.36ms | tok/sec: 2820.23 | norm: 1.36\n", "step321 | loss: 4.683999061584473 | dt: 1451.72ms | tok/sec: 2821.47 | norm: 1.23\n", "step322 | loss: 4.434199810028076 | dt: 1450.33ms | tok/sec: 2824.19 | norm: 1.39\n", "step323 | loss: 4.4951171875 | dt: 1453.44ms | tok/sec: 2818.13 | norm: 1.10\n", "step324 | loss: 4.4982686042785645 | dt: 1451.56ms | tok/sec: 2821.80 | norm: 1.22\n", "step325 | loss: 4.627958297729492 | dt: 1449.95ms | tok/sec: 2824.93 | norm: 1.30\n", "step326 | loss: 4.6142144203186035 | dt: 1448.66ms | tok/sec: 2827.44 | norm: 1.07\n", "step327 | loss: 4.826625347137451 | dt: 1453.19ms | tok/sec: 2818.63 | norm: 1.07\n", "step328 | loss: 4.757080078125 | dt: 1448.25ms | tok/sec: 2828.24 | norm: 1.03\n", "step329 | loss: 4.645442008972168 | dt: 1452.59ms | tok/sec: 2819.79 | norm: 1.64\n", "step330 | loss: 4.655596733093262 | dt: 1453.34ms | tok/sec: 2818.33 | norm: 1.37\n", "step331 | loss: 4.664466381072998 | dt: 1439.49ms | tok/sec: 2845.45 | norm: 1.36\n", "step332 | loss: 4.658212661743164 | dt: 1453.39ms | tok/sec: 2818.24 | norm: 1.18\n", "step333 | loss: 4.316652774810791 | dt: 1455.65ms | tok/sec: 2813.86 | norm: 1.32\n", "step334 | loss: 4.455777645111084 | dt: 1446.72ms | tok/sec: 2831.24 | norm: 1.15\n", "step335 | loss: 4.791997909545898 | dt: 1448.09ms | tok/sec: 2828.56 | norm: 1.25\n", "step336 | loss: 4.41414737701416 | dt: 1454.33ms | tok/sec: 2816.41 | norm: 1.41\n", "step337 | loss: 4.675680160522461 | dt: 1447.09ms | tok/sec: 2830.52 | norm: 1.34\n", "step338 | loss: 4.315677165985107 | dt: 1453.25ms | tok/sec: 2818.52 | norm: 1.30\n", "step339 | loss: 4.887519836425781 | dt: 1442.89ms | tok/sec: 2838.75 | norm: 1.12\n", "step340 | loss: 4.927259922027588 | dt: 1451.58ms | tok/sec: 2821.75 | norm: 1.15\n", "step341 | loss: 5.048630714416504 | dt: 1447.25ms | tok/sec: 2830.19 | norm: 1.14\n", "step342 | loss: 4.99721097946167 | dt: 1450.63ms | 
tok/sec: 2823.60 | norm: 1.19\n", "step343 | loss: 4.965035438537598 | dt: 1436.86ms | tok/sec: 2850.66 | norm: 1.35\n", "step344 | loss: 4.966833114624023 | dt: 1441.93ms | tok/sec: 2840.64 | norm: 1.17\n", "step345 | loss: 4.960330486297607 | dt: 1449.52ms | tok/sec: 2825.76 | norm: 1.08\n", "step346 | loss: 4.9501633644104 | dt: 1443.39ms | tok/sec: 2837.77 | norm: 1.18\n", "step347 | loss: 4.710467338562012 | dt: 1449.92ms | tok/sec: 2824.98 | norm: 1.24\n", "step348 | loss: 4.5105156898498535 | dt: 1443.24ms | tok/sec: 2838.06 | norm: 1.31\n", "step349 | loss: 4.8239874839782715 | dt: 1450.78ms | tok/sec: 2823.31 | norm: 1.33\n", "step350 | loss: 4.926147937774658 | dt: 1451.12ms | tok/sec: 2822.64 | norm: 1.20\n", "step351 | loss: 5.010854721069336 | dt: 1451.17ms | tok/sec: 2822.55 | norm: 1.21\n", "step352 | loss: 4.900291919708252 | dt: 1453.92ms | tok/sec: 2817.21 | norm: 1.20\n", "step353 | loss: 4.772107124328613 | dt: 1455.10ms | tok/sec: 2814.92 | norm: 1.26\n", "step354 | loss: 4.9986653327941895 | dt: 1450.87ms | tok/sec: 2823.14 | norm: 1.30\n", "step355 | loss: 4.7951555252075195 | dt: 1449.75ms | tok/sec: 2825.32 | norm: 1.18\n", "step356 | loss: 4.574727535247803 | dt: 1443.23ms | tok/sec: 2838.09 | norm: 1.16\n", "step357 | loss: 4.783575057983398 | dt: 1444.40ms | tok/sec: 2835.79 | norm: 1.12\n", "step358 | loss: 4.9649529457092285 | dt: 1434.73ms | tok/sec: 2854.89 | norm: 1.14\n", "step359 | loss: 4.663640022277832 | dt: 1445.35ms | tok/sec: 2833.93 | norm: 1.39\n", "step360 | loss: 4.679715633392334 | dt: 1448.28ms | tok/sec: 2828.18 | norm: 1.24\n", "step361 | loss: 5.000891208648682 | dt: 1458.14ms | tok/sec: 2809.05 | norm: 1.08\n", "step362 | loss: 4.796950340270996 | dt: 1446.80ms | tok/sec: 2831.08 | norm: 1.11\n", "step363 | loss: 4.688414573669434 | dt: 1450.67ms | tok/sec: 2823.52 | norm: 1.09\n", "step364 | loss: 4.494034290313721 | dt: 1452.00ms | tok/sec: 2820.93 | norm: 1.49\n", "step365 | loss: 4.641475677490234 | dt: 
1436.94ms | tok/sec: 2850.49 | norm: 1.16\n", "step366 | loss: 4.525160312652588 | dt: 1450.83ms | tok/sec: 2823.22 | norm: 1.05\n", "step367 | loss: 4.705638885498047 | dt: 1437.01ms | tok/sec: 2850.37 | norm: 1.02\n", "step368 | loss: 4.511965274810791 | dt: 1442.61ms | tok/sec: 2839.30 | norm: 1.04\n", "step369 | loss: 5.050025463104248 | dt: 1441.67ms | tok/sec: 2841.14 | norm: 1.48\n", "step370 | loss: 4.978274822235107 | dt: 1449.89ms | tok/sec: 2825.04 | norm: 1.52\n", "step371 | loss: 4.809103965759277 | dt: 1435.89ms | tok/sec: 2852.59 | norm: 1.42\n", "step372 | loss: 5.027266502380371 | dt: 1452.24ms | tok/sec: 2820.46 | norm: 1.30\n", "step373 | loss: 5.351414203643799 | dt: 1436.40ms | tok/sec: 2851.56 | norm: 1.38\n", "step374 | loss: 5.197083950042725 | dt: 1452.50ms | tok/sec: 2819.96 | norm: 1.27\n", "step375 | loss: 4.841187477111816 | dt: 1443.40ms | tok/sec: 2837.75 | norm: 1.21\n", "step376 | loss: 4.934873104095459 | dt: 1447.62ms | tok/sec: 2829.46 | norm: 1.28\n", "step377 | loss: 4.975739479064941 | dt: 1448.76ms | tok/sec: 2827.25 | norm: 1.19\n", "step378 | loss: 4.826531887054443 | dt: 1438.89ms | tok/sec: 2846.64 | norm: 1.07\n", "step379 | loss: 4.798772811889648 | dt: 1446.32ms | tok/sec: 2832.01 | norm: 1.47\n", "step380 | loss: 4.693748474121094 | dt: 1445.84ms | tok/sec: 2832.95 | norm: 1.69\n", "step381 | loss: 4.457385540008545 | dt: 1449.23ms | tok/sec: 2826.33 | norm: 1.52\n", "step382 | loss: 4.683049201965332 | dt: 1442.43ms | tok/sec: 2839.66 | norm: 1.54\n", "step383 | loss: 4.728100776672363 | dt: 1449.53ms | tok/sec: 2825.74 | norm: 1.42\n", "step384 | loss: 4.626945972442627 | dt: 1452.55ms | tok/sec: 2819.87 | norm: 1.39\n", "step385 | loss: 4.59235143661499 | dt: 1439.25ms | tok/sec: 2845.92 | norm: 1.11\n", "step386 | loss: 4.373235702514648 | dt: 1445.39ms | tok/sec: 2833.84 | norm: 1.19\n", "step387 | loss: 4.322368144989014 | dt: 1439.71ms | tok/sec: 2845.01 | norm: 1.07\n", "step388 | loss: 5.002132415771484 | dt: 
1445.68ms | tok/sec: 2833.26 | norm: 1.34\n", "step389 | loss: 4.750082015991211 | dt: 1443.32ms | tok/sec: 2837.91 | norm: 1.53\n", "step390 | loss: 4.486581325531006 | dt: 1445.87ms | tok/sec: 2832.89 | norm: 1.28\n", "step391 | loss: 4.4999847412109375 | dt: 1451.04ms | tok/sec: 2822.79 | norm: 1.55\n", "step392 | loss: 4.689620018005371 | dt: 1443.04ms | tok/sec: 2838.46 | norm: 1.24\n", "step393 | loss: 4.613969802856445 | dt: 1446.95ms | tok/sec: 2830.79 | norm: 1.31\n", "step394 | loss: 4.320478439331055 | dt: 1446.69ms | tok/sec: 2831.29 | norm: 1.54\n", "step395 | loss: 4.28681755065918 | dt: 1450.69ms | tok/sec: 2823.47 | norm: 1.45\n", "step396 | loss: 4.299644947052002 | dt: 1445.80ms | tok/sec: 2833.03 | norm: 1.20\n", "step397 | loss: 5.068978309631348 | dt: 1445.36ms | tok/sec: 2833.89 | norm: 1.51\n", "step398 | loss: 4.920777320861816 | dt: 1453.79ms | tok/sec: 2817.46 | norm: 1.46\n", "step399 | loss: 4.864882946014404 | dt: 1439.15ms | tok/sec: 2846.12 | norm: 1.42\n", "step400 | loss: 4.7843732833862305 | dt: 1450.79ms | tok/sec: 2823.29 | norm: 1.40\n", "step401 | loss: 4.871971130371094 | dt: 1454.47ms | tok/sec: 2816.15 | norm: 1.52\n", "step402 | loss: 4.829896450042725 | dt: 1443.89ms | tok/sec: 2836.78 | norm: 1.46\n", "step403 | loss: 4.587617874145508 | dt: 1447.38ms | tok/sec: 2829.93 | norm: 1.38\n", "step404 | loss: 4.332151889801025 | dt: 1439.30ms | tok/sec: 2845.83 | norm: 1.36\n", "step405 | loss: 4.394386291503906 | dt: 1441.28ms | tok/sec: 2841.91 | norm: 1.19\n", "step406 | loss: 4.394397735595703 | dt: 1451.10ms | tok/sec: 2822.69 | norm: 1.42\n", "step407 | loss: 4.508070945739746 | dt: 1438.15ms | tok/sec: 2848.10 | norm: 1.44\n", "step408 | loss: 4.51992130279541 | dt: 1451.08ms | tok/sec: 2822.73 | norm: 1.17\n", "step409 | loss: 4.72074031829834 | dt: 1441.40ms | tok/sec: 2841.67 | norm: 1.27\n", "step410 | loss: 4.647989749908447 | dt: 1443.83ms | tok/sec: 2836.90 | norm: 1.26\n", "step411 | loss: 4.542581081390381 | dt: 
1445.79ms | tok/sec: 2833.05 | norm: 1.79\n", "step412 | loss: 4.529564380645752 | dt: 1438.86ms | tok/sec: 2846.70 | norm: 1.39\n", "step413 | loss: 4.523290157318115 | dt: 1451.50ms | tok/sec: 2821.91 | norm: 1.43\n", "step414 | loss: 4.5399956703186035 | dt: 1448.25ms | tok/sec: 2828.24 | norm: 1.25\n", "step415 | loss: 4.203490257263184 | dt: 1445.49ms | tok/sec: 2833.65 | norm: 1.43\n", "step416 | loss: 4.331180572509766 | dt: 1446.90ms | tok/sec: 2830.88 | norm: 1.31\n", "step417 | loss: 4.681077003479004 | dt: 1452.34ms | tok/sec: 2820.27 | norm: 1.19\n", "step418 | loss: 4.286984443664551 | dt: 1453.28ms | tok/sec: 2818.45 | norm: 1.19\n", "step419 | loss: 4.540879726409912 | dt: 1451.91ms | tok/sec: 2821.11 | norm: 1.30\n", "step420 | loss: 4.1917805671691895 | dt: 1453.74ms | tok/sec: 2817.57 | norm: 1.41\n", "step421 | loss: 4.751793384552002 | dt: 1442.31ms | tok/sec: 2839.90 | norm: 1.22\n", "step422 | loss: 4.786155700683594 | dt: 1451.64ms | tok/sec: 2821.64 | norm: 1.26\n", "step423 | loss: 4.893470764160156 | dt: 1453.95ms | tok/sec: 2817.16 | norm: 1.22\n", "step424 | loss: 4.8548054695129395 | dt: 1438.92ms | tok/sec: 2846.58 | norm: 1.19\n", "step425 | loss: 4.793612957000732 | dt: 1451.87ms | tok/sec: 2821.20 | norm: 1.26\n", "step426 | loss: 4.805150985717773 | dt: 1454.04ms | tok/sec: 2816.97 | norm: 1.14\n", "step427 | loss: 4.793338298797607 | dt: 1437.47ms | tok/sec: 2849.46 | norm: 1.15\n", "step428 | loss: 4.795429229736328 | dt: 1453.82ms | tok/sec: 2817.40 | norm: 1.28\n", "step429 | loss: 4.556128025054932 | dt: 1452.57ms | tok/sec: 2819.84 | norm: 1.12\n", "step430 | loss: 4.369061470031738 | dt: 1451.48ms | tok/sec: 2821.96 | norm: 1.22\n", "step431 | loss: 4.676914215087891 | dt: 1448.64ms | tok/sec: 2827.48 | norm: 1.34\n", "step432 | loss: 4.7812957763671875 | dt: 1444.93ms | tok/sec: 2834.74 | norm: 1.26\n", "step433 | loss: 4.867162704467773 | dt: 1451.55ms | tok/sec: 2821.82 | norm: 1.23\n", "step434 | loss: 4.751791000366211 
| dt: 1445.97ms | tok/sec: 2832.71 | norm: 1.29\n", "step435 | loss: 4.629043102264404 | dt: 1449.91ms | tok/sec: 2825.01 | norm: 1.38\n", "step436 | loss: 4.835895538330078 | dt: 1450.61ms | tok/sec: 2823.65 | norm: 1.46\n", "step437 | loss: 4.642320156097412 | dt: 1446.47ms | tok/sec: 2831.73 | norm: 1.25\n", "step438 | loss: 4.425959587097168 | dt: 1455.64ms | tok/sec: 2813.89 | norm: 1.24\n", "step439 | loss: 4.621522903442383 | dt: 1440.01ms | tok/sec: 2844.42 | norm: 1.21\n", "step440 | loss: 4.823385715484619 | dt: 1449.45ms | tok/sec: 2825.91 | norm: 1.24\n", "step441 | loss: 4.533592700958252 | dt: 1449.18ms | tok/sec: 2826.43 | norm: 1.46\n", "step442 | loss: 4.549861907958984 | dt: 1454.01ms | tok/sec: 2817.04 | norm: 1.30\n", "step443 | loss: 4.874139785766602 | dt: 1451.38ms | tok/sec: 2822.15 | norm: 1.28\n", "step444 | loss: 4.665793418884277 | dt: 1452.72ms | tok/sec: 2819.54 | norm: 1.16\n", "step445 | loss: 4.570728302001953 | dt: 1452.34ms | tok/sec: 2820.27 | norm: 1.13\n", "step446 | loss: 4.380552768707275 | dt: 1454.33ms | tok/sec: 2816.42 | norm: 1.50\n", "step447 | loss: 4.543609142303467 | dt: 1456.20ms | tok/sec: 2812.81 | norm: 1.37\n", "step448 | loss: 4.409053325653076 | dt: 1448.80ms | tok/sec: 2827.17 | norm: 1.15\n", "step449 | loss: 4.578281879425049 | dt: 1450.86ms | tok/sec: 2823.15 | norm: 1.14\n", "step450 | loss: 4.389840602874756 | dt: 1440.24ms | tok/sec: 2843.97 | norm: 1.11\n", "step451 | loss: 4.927459239959717 | dt: 1445.45ms | tok/sec: 2833.73 | norm: 1.62\n", "step452 | loss: 4.845308303833008 | dt: 1444.30ms | tok/sec: 2835.98 | norm: 1.63\n", "step453 | loss: 4.698278427124023 | dt: 1446.61ms | tok/sec: 2831.45 | norm: 1.52\n", "step454 | loss: 4.902656078338623 | dt: 1451.60ms | tok/sec: 2821.71 | norm: 1.31\n", "step455 | loss: 5.218589782714844 | dt: 1453.68ms | tok/sec: 2817.68 | norm: 1.45\n", "step456 | loss: 5.053982734680176 | dt: 1453.23ms | tok/sec: 2818.54 | norm: 1.37\n", "step457 | loss: 
4.702969074249268 | dt: 1455.13ms | tok/sec: 2814.87 | norm: 1.37\n", "step458 | loss: 4.781749725341797 | dt: 1453.36ms | tok/sec: 2818.30 | norm: 1.35\n", "step459 | loss: 4.856635570526123 | dt: 1452.44ms | tok/sec: 2820.09 | norm: 1.29\n", "step460 | loss: 4.693620681762695 | dt: 1440.33ms | tok/sec: 2843.80 | norm: 1.20\n", "step461 | loss: 4.673206329345703 | dt: 1447.35ms | tok/sec: 2829.99 | norm: 1.46\n", "step462 | loss: 4.562049865722656 | dt: 1453.17ms | tok/sec: 2818.66 | norm: 1.61\n", "step463 | loss: 4.352673530578613 | dt: 1451.24ms | tok/sec: 2822.42 | norm: 1.45\n", "step464 | loss: 4.572988033294678 | dt: 1453.40ms | tok/sec: 2818.21 | norm: 1.61\n", "step465 | loss: 4.599318504333496 | dt: 1442.37ms | tok/sec: 2839.77 | norm: 1.51\n", "step466 | loss: 4.5016655921936035 | dt: 1451.08ms | tok/sec: 2822.73 | norm: 1.45\n", "step467 | loss: 4.470655918121338 | dt: 1452.90ms | tok/sec: 2819.19 | norm: 1.27\n", "step468 | loss: 4.257596969604492 | dt: 1454.16ms | tok/sec: 2816.74 | norm: 1.38\n", "step469 | loss: 4.206538200378418 | dt: 1452.66ms | tok/sec: 2819.66 | norm: 1.14\n", "step470 | loss: 4.87727165222168 | dt: 1451.71ms | tok/sec: 2821.49 | norm: 1.35\n", "step471 | loss: 4.609493732452393 | dt: 1440.40ms | tok/sec: 2843.66 | norm: 1.48\n", "step472 | loss: 4.342877388000488 | dt: 1449.93ms | tok/sec: 2824.97 | norm: 1.33\n", "step473 | loss: 4.372161865234375 | dt: 1448.36ms | tok/sec: 2828.02 | norm: 1.55\n", "step474 | loss: 4.547359943389893 | dt: 1452.96ms | tok/sec: 2819.07 | norm: 1.30\n", "step475 | loss: 4.4888105392456055 | dt: 1455.08ms | tok/sec: 2814.97 | norm: 1.33\n", "step476 | loss: 4.191911220550537 | dt: 1439.16ms | tok/sec: 2846.10 | norm: 1.37\n", "step477 | loss: 4.135360240936279 | dt: 1450.41ms | tok/sec: 2824.02 | norm: 1.44\n", "step478 | loss: 4.159686088562012 | dt: 1454.62ms | tok/sec: 2815.86 | norm: 1.32\n", "step479 | loss: 4.935885906219482 | dt: 1444.65ms | tok/sec: 2835.29 | norm: 1.61\n", "step480 | 
loss: 4.787212371826172 | dt: 1451.97ms | tok/sec: 2821.00 | norm: 1.52\n", "step481 | loss: 4.743310928344727 | dt: 1447.35ms | tok/sec: 2830.01 | norm: 1.57\n", "step482 | loss: 4.640646457672119 | dt: 1449.90ms | tok/sec: 2825.01 | norm: 1.42\n", "step483 | loss: 4.753310203552246 | dt: 1452.05ms | tok/sec: 2820.84 | norm: 1.45\n", "step484 | loss: 4.701311111450195 | dt: 1449.50ms | tok/sec: 2825.81 | norm: 1.45\n", "step485 | loss: 4.486164569854736 | dt: 1450.04ms | tok/sec: 2824.75 | norm: 1.52\n", "step486 | loss: 4.216708660125732 | dt: 1447.07ms | tok/sec: 2830.55 | norm: 1.43\n", "step487 | loss: 4.282656192779541 | dt: 1450.95ms | tok/sec: 2822.97 | norm: 1.27\n", "step488 | loss: 4.278609275817871 | dt: 1450.13ms | tok/sec: 2824.57 | norm: 1.52\n", "step489 | loss: 4.390397548675537 | dt: 1455.74ms | tok/sec: 2813.68 | norm: 1.77\n", "step490 | loss: 4.417691230773926 | dt: 1452.59ms | tok/sec: 2819.79 | norm: 1.36\n", "step491 | loss: 4.6110920906066895 | dt: 1442.40ms | tok/sec: 2839.71 | norm: 1.17\n", "step492 | loss: 4.539865016937256 | dt: 1448.35ms | tok/sec: 2828.04 | norm: 1.38\n", "step493 | loss: 4.444937705993652 | dt: 1439.67ms | tok/sec: 2845.09 | norm: 2.33\n", "step494 | loss: 4.423857688903809 | dt: 1451.60ms | tok/sec: 2821.71 | norm: 1.77\n", "step495 | loss: 4.37763786315918 | dt: 1452.34ms | tok/sec: 2820.28 | norm: 1.43\n", "step496 | loss: 4.431158065795898 | dt: 1455.56ms | tok/sec: 2814.03 | norm: 1.43\n", "step497 | loss: 4.100925922393799 | dt: 1442.14ms | tok/sec: 2840.22 | norm: 1.44\n", "step498 | loss: 4.224133014678955 | dt: 1454.56ms | tok/sec: 2815.96 | norm: 1.39\n", "step499 | loss: 4.573838233947754 | dt: 1450.01ms | tok/sec: 2824.81 | norm: 1.30\n", "step500 | loss: 4.168519973754883 | dt: 1450.98ms | tok/sec: 2822.92 | norm: 1.26\n", "step501 | loss: 4.41989803314209 | dt: 1452.75ms | tok/sec: 2819.49 | norm: 1.25\n", "step502 | loss: 4.093413352966309 | dt: 1460.57ms | tok/sec: 2804.38 | norm: 1.65\n", "step503 | 
loss: 4.632689476013184 | dt: 1444.68ms | tok/sec: 2835.23 | norm: 1.34\n", "step504 | loss: 4.649772644042969 | dt: 1452.59ms | tok/sec: 2819.80 | norm: 1.31\n", "step505 | loss: 4.742977619171143 | dt: 1454.22ms | tok/sec: 2816.64 | norm: 1.36\n", "step506 | loss: 4.713996410369873 | dt: 1447.83ms | tok/sec: 2829.05 | norm: 1.28\n", "step507 | loss: 4.635598659515381 | dt: 1441.37ms | tok/sec: 2841.74 | norm: 1.32\n", "step508 | loss: 4.650856971740723 | dt: 1455.49ms | tok/sec: 2814.17 | norm: 1.18\n", "step509 | loss: 4.647605895996094 | dt: 1448.26ms | tok/sec: 2828.22 | norm: 1.28\n", "step510 | loss: 4.670739650726318 | dt: 1451.27ms | tok/sec: 2822.35 | norm: 1.50\n", "step511 | loss: 4.420587539672852 | dt: 1439.25ms | tok/sec: 2845.92 | norm: 1.16\n", "step512 | loss: 4.251978397369385 | dt: 1449.18ms | tok/sec: 2826.43 | norm: 1.28\n", "step513 | loss: 4.54186487197876 | dt: 1446.12ms | tok/sec: 2832.41 | norm: 1.36\n", "step514 | loss: 4.6590189933776855 | dt: 1447.60ms | tok/sec: 2829.51 | norm: 1.44\n", "step515 | loss: 4.7345404624938965 | dt: 1459.32ms | tok/sec: 2806.79 | norm: 1.30\n", "step516 | loss: 4.627356052398682 | dt: 1445.45ms | tok/sec: 2833.72 | norm: 1.37\n", "step517 | loss: 4.510025501251221 | dt: 1441.48ms | tok/sec: 2841.53 | norm: 1.50\n", "step518 | loss: 4.69949197769165 | dt: 1453.63ms | tok/sec: 2817.77 | norm: 1.58\n", "step519 | loss: 4.504380226135254 | dt: 1449.71ms | tok/sec: 2825.40 | norm: 1.39\n", "step520 | loss: 4.304786682128906 | dt: 1439.17ms | tok/sec: 2846.08 | norm: 1.31\n", "step521 | loss: 4.485190391540527 | dt: 1451.19ms | tok/sec: 2822.51 | norm: 1.36\n", "step522 | loss: 4.696103572845459 | dt: 1454.63ms | tok/sec: 2815.83 | norm: 1.53\n", "step523 | loss: 4.416008949279785 | dt: 1450.18ms | tok/sec: 2824.48 | norm: 1.61\n", "step524 | loss: 4.434887409210205 | dt: 1443.88ms | tok/sec: 2836.81 | norm: 1.30\n", "step525 | loss: 4.759880542755127 | dt: 1450.76ms | tok/sec: 2823.36 | norm: 1.37\n", "step526 
| loss: 4.553787708282471 | dt: 1455.31ms | tok/sec: 2814.51 | norm: 1.34\n", "step527 | loss: 4.460385799407959 | dt: 1453.52ms | tok/sec: 2818.00 | norm: 1.34\n", "step528 | loss: 4.283224105834961 | dt: 1451.19ms | tok/sec: 2822.51 | norm: 1.40\n", "step529 | loss: 4.4528374671936035 | dt: 1448.26ms | tok/sec: 2828.22 | norm: 1.35\n", "step530 | loss: 4.310113430023193 | dt: 1442.28ms | tok/sec: 2839.95 | norm: 1.32\n", "step531 | loss: 4.472282409667969 | dt: 1447.92ms | tok/sec: 2828.90 | norm: 1.48\n", "step532 | loss: 4.285065650939941 | dt: 1451.43ms | tok/sec: 2822.04 | norm: 1.33\n", "step533 | loss: 4.820312976837158 | dt: 1451.55ms | tok/sec: 2821.81 | norm: 1.68\n", "step534 | loss: 4.74605655670166 | dt: 1452.85ms | tok/sec: 2819.28 | norm: 1.83\n", "step535 | loss: 4.599341869354248 | dt: 1435.40ms | tok/sec: 2853.56 | norm: 1.64\n", "step536 | loss: 4.797690391540527 | dt: 1445.39ms | tok/sec: 2833.84 | norm: 1.44\n", "step537 | loss: 5.105985641479492 | dt: 1441.36ms | tok/sec: 2841.76 | norm: 1.53\n", "step538 | loss: 4.92313289642334 | dt: 1452.65ms | tok/sec: 2819.67 | norm: 1.52\n", "step539 | loss: 4.579255104064941 | dt: 1442.07ms | tok/sec: 2840.37 | norm: 1.49\n", "step540 | loss: 4.649352550506592 | dt: 1445.42ms | tok/sec: 2833.77 | norm: 1.36\n", "step541 | loss: 4.753263473510742 | dt: 1449.34ms | tok/sec: 2826.12 | norm: 1.39\n", "step542 | loss: 4.583715915679932 | dt: 1452.20ms | tok/sec: 2820.54 | norm: 1.33\n", "step543 | loss: 4.548273086547852 | dt: 1452.62ms | tok/sec: 2819.74 | norm: 1.61\n", "step544 | loss: 4.426210403442383 | dt: 1452.93ms | tok/sec: 2819.13 | norm: 1.71\n", "step545 | loss: 4.250688552856445 | dt: 1448.99ms | tok/sec: 2826.79 | norm: 1.45\n", "step546 | loss: 4.463655471801758 | dt: 1450.40ms | tok/sec: 2824.06 | norm: 1.53\n", "step547 | loss: 4.482334136962891 | dt: 1446.59ms | tok/sec: 2831.49 | norm: 1.60\n", "step548 | loss: 4.385271072387695 | dt: 1449.10ms | tok/sec: 2826.58 | norm: 1.42\n", "step549 
| loss: 4.351579666137695 | dt: 1451.75ms | tok/sec: 2821.41 | norm: 1.41\n", "step550 | loss: 4.159631252288818 | dt: 1450.96ms | tok/sec: 2822.96 | norm: 1.45\n", "step551 | loss: 4.09958028793335 | dt: 1452.88ms | tok/sec: 2819.23 | norm: 1.22\n", "step552 | loss: 4.759954929351807 | dt: 1449.35ms | tok/sec: 2826.10 | norm: 1.36\n", "step553 | loss: 4.46757173538208 | dt: 1443.68ms | tok/sec: 2837.20 | norm: 1.51\n", "step554 | loss: 4.212086200714111 | dt: 1442.01ms | tok/sec: 2840.47 | norm: 1.33\n", "step555 | loss: 4.264333724975586 | dt: 1449.83ms | tok/sec: 2825.16 | norm: 1.53\n", "step556 | loss: 4.414207458496094 | dt: 1448.33ms | tok/sec: 2828.08 | norm: 1.39\n", "step557 | loss: 4.379427433013916 | dt: 1439.54ms | tok/sec: 2845.34 | norm: 1.40\n", "step558 | loss: 4.071807384490967 | dt: 1456.89ms | tok/sec: 2811.46 | norm: 1.38\n", "step559 | loss: 4.0112624168396 | dt: 1448.67ms | tok/sec: 2827.42 | norm: 1.32\n", "step560 | loss: 4.038169860839844 | dt: 1454.48ms | tok/sec: 2816.12 | norm: 1.26\n", "step561 | loss: 4.821777820587158 | dt: 1438.45ms | tok/sec: 2847.51 | norm: 1.77\n", "step562 | loss: 4.663741588592529 | dt: 1450.68ms | tok/sec: 2823.50 | norm: 1.60\n", "step563 | loss: 4.641353607177734 | dt: 1451.02ms | tok/sec: 2822.85 | norm: 1.65\n", "step564 | loss: 4.5034074783325195 | dt: 1454.12ms | tok/sec: 2816.82 | norm: 1.45\n", "step565 | loss: 4.6293439865112305 | dt: 1440.18ms | tok/sec: 2844.08 | norm: 1.44\n", "step566 | loss: 4.582202911376953 | dt: 1448.64ms | tok/sec: 2827.49 | norm: 1.51\n", "step567 | loss: 4.395103931427002 | dt: 1453.54ms | tok/sec: 2817.94 | norm: 1.42\n", "step568 | loss: 4.114687442779541 | dt: 1449.28ms | tok/sec: 2826.23 | norm: 1.45\n", "step569 | loss: 4.177568435668945 | dt: 1456.09ms | tok/sec: 2813.02 | norm: 1.36\n", "step570 | loss: 4.177389621734619 | dt: 1436.85ms | tok/sec: 2850.68 | norm: 1.54\n", "step571 | loss: 4.275235176086426 | dt: 1446.52ms | tok/sec: 2831.62 | norm: 1.74\n", "step572 
| loss: 4.317491054534912 | dt: 1440.01ms | tok/sec: 2844.42 | norm: 1.32\n", "step573 | loss: 4.4907941818237305 | dt: 1452.75ms | tok/sec: 2819.48 | norm: 1.21\n", "step574 | loss: 4.429429531097412 | dt: 1450.44ms | tok/sec: 2823.96 | norm: 1.43\n", "step575 | loss: 4.332371234893799 | dt: 1454.90ms | tok/sec: 2815.32 | norm: 2.22\n", "step576 | loss: 4.339951515197754 | dt: 1454.97ms | tok/sec: 2815.18 | norm: 1.86\n", "step577 | loss: 4.2598772048950195 | dt: 1452.09ms | tok/sec: 2820.77 | norm: 1.45\n", "step578 | loss: 4.3180928230285645 | dt: 1452.01ms | tok/sec: 2820.92 | norm: 1.46\n", "step579 | loss: 3.992223024368286 | dt: 1438.16ms | tok/sec: 2848.09 | norm: 1.49\n", "step580 | loss: 4.106361389160156 | dt: 1446.76ms | tok/sec: 2831.16 | norm: 1.39\n", "step581 | loss: 4.469982147216797 | dt: 1444.96ms | tok/sec: 2834.68 | norm: 1.49\n", "step582 | loss: 4.058409214019775 | dt: 1445.80ms | tok/sec: 2833.03 | norm: 1.50\n", "step583 | loss: 4.307351112365723 | dt: 1442.27ms | tok/sec: 2839.97 | norm: 1.47\n", "step584 | loss: 4.001046180725098 | dt: 1445.30ms | tok/sec: 2834.02 | norm: 1.87\n", "step585 | loss: 4.520284175872803 | dt: 1451.33ms | tok/sec: 2822.23 | norm: 1.56\n", "step586 | loss: 4.529872894287109 | dt: 1453.64ms | tok/sec: 2817.76 | norm: 1.45\n", "step587 | loss: 4.610291481018066 | dt: 1453.22ms | tok/sec: 2818.58 | norm: 1.44\n", "step588 | loss: 4.587438583374023 | dt: 1452.72ms | tok/sec: 2819.53 | norm: 1.41\n", "step589 | loss: 4.497069835662842 | dt: 1449.34ms | tok/sec: 2826.11 | norm: 1.40\n", "step590 | loss: 4.52139139175415 | dt: 1438.83ms | tok/sec: 2846.76 | norm: 1.44\n", "step591 | loss: 4.51242208480835 | dt: 1448.53ms | tok/sec: 2827.68 | norm: 1.33\n", "step592 | loss: 4.540665626525879 | dt: 1448.52ms | tok/sec: 2827.71 | norm: 1.45\n", "step593 | loss: 4.296619415283203 | dt: 1440.55ms | tok/sec: 2843.35 | norm: 1.29\n", "step594 | loss: 4.141761779785156 | dt: 1453.56ms | tok/sec: 2817.91 | norm: 1.34\n", 
"step595 | loss: 4.4242658615112305 | dt: 1441.87ms | tok/sec: 2840.76 | norm: 1.53\n", "step596 | loss: 4.560310363769531 | dt: 1446.65ms | tok/sec: 2831.37 | norm: 1.50\n", "step597 | loss: 4.615647315979004 | dt: 1449.43ms | tok/sec: 2825.95 | norm: 1.45\n", "step598 | loss: 4.508000373840332 | dt: 1452.38ms | tok/sec: 2820.21 | norm: 1.42\n", "step599 | loss: 4.400507926940918 | dt: 1450.84ms | tok/sec: 2823.19 | norm: 1.55\n", "step600 | loss: 4.568173408508301 | dt: 1444.45ms | tok/sec: 2835.68 | norm: 1.59\n", "step601 | loss: 4.388156890869141 | dt: 1444.75ms | tok/sec: 2835.09 | norm: 1.52\n", "step602 | loss: 4.1949849128723145 | dt: 1450.55ms | tok/sec: 2823.75 | norm: 1.46\n", "step603 | loss: 4.357595920562744 | dt: 1452.56ms | tok/sec: 2819.85 | norm: 1.33\n", "step604 | loss: 4.587517738342285 | dt: 1436.84ms | tok/sec: 2850.71 | norm: 1.44\n", "step605 | loss: 4.310882568359375 | dt: 1450.29ms | tok/sec: 2824.27 | norm: 1.77\n", "step606 | loss: 4.327176094055176 | dt: 1453.06ms | tok/sec: 2818.89 | norm: 1.48\n", "step607 | loss: 4.652812480926514 | dt: 1443.43ms | tok/sec: 2837.69 | norm: 1.26\n", "step608 | loss: 4.448120594024658 | dt: 1449.43ms | tok/sec: 2825.93 | norm: 1.30\n", "step609 | loss: 4.3639750480651855 | dt: 1442.85ms | tok/sec: 2838.84 | norm: 1.49\n", "step610 | loss: 4.178696632385254 | dt: 1447.34ms | tok/sec: 2830.03 | norm: 1.47\n", "step611 | loss: 4.341161727905273 | dt: 1441.50ms | tok/sec: 2841.49 | norm: 1.32\n", "step612 | loss: 4.207866668701172 | dt: 1439.36ms | tok/sec: 2845.72 | norm: 1.35\n", "step613 | loss: 4.37126350402832 | dt: 1446.84ms | tok/sec: 2831.00 | norm: 1.51\n", "step614 | loss: 4.190659046173096 | dt: 1452.14ms | tok/sec: 2820.66 | norm: 1.74\n", "step615 | loss: 4.714026927947998 | dt: 1438.93ms | tok/sec: 2846.56 | norm: 2.03\n", "step616 | loss: 4.643448829650879 | dt: 1442.54ms | tok/sec: 2839.43 | norm: 1.80\n", "step617 | loss: 4.509130001068115 | dt: 1446.68ms | tok/sec: 2831.31 | norm: 
1.75\n", "step618 | loss: 4.706866264343262 | dt: 1448.11ms | tok/sec: 2828.51 | norm: 2.01\n", "step619 | loss: 5.019446849822998 | dt: 1453.35ms | tok/sec: 2818.31 | norm: 2.09\n", "step620 | loss: 4.8245978355407715 | dt: 1441.35ms | tok/sec: 2841.78 | norm: 1.79\n", "step621 | loss: 4.480430603027344 | dt: 1447.33ms | tok/sec: 2830.03 | norm: 1.59\n", "step622 | loss: 4.56837272644043 | dt: 1449.01ms | tok/sec: 2826.76 | norm: 2.02\n", "step623 | loss: 4.6714324951171875 | dt: 1443.59ms | tok/sec: 2837.38 | norm: 2.47\n", "step624 | loss: 4.486955642700195 | dt: 1452.71ms | tok/sec: 2819.55 | norm: 1.93\n", "step625 | loss: 4.448971748352051 | dt: 1449.47ms | tok/sec: 2825.85 | norm: 1.80\n", "step626 | loss: 4.319253444671631 | dt: 1448.05ms | tok/sec: 2828.63 | norm: 2.21\n", "step627 | loss: 4.177393913269043 | dt: 1443.56ms | tok/sec: 2837.42 | norm: 2.11\n", "step628 | loss: 4.372792720794678 | dt: 1451.12ms | tok/sec: 2822.65 | norm: 1.88\n", "step629 | loss: 4.396945476531982 | dt: 1445.57ms | tok/sec: 2833.49 | norm: 1.84\n", "step630 | loss: 4.287465572357178 | dt: 1442.82ms | tok/sec: 2838.88 | norm: 1.49\n", "step631 | loss: 4.256464958190918 | dt: 1444.38ms | tok/sec: 2835.81 | norm: 1.66\n", "step632 | loss: 4.078336238861084 | dt: 1451.66ms | tok/sec: 2821.60 | norm: 1.63\n", "step633 | loss: 4.0024733543396 | dt: 1448.44ms | tok/sec: 2827.87 | norm: 1.51\n", "step634 | loss: 4.667900085449219 | dt: 1443.95ms | tok/sec: 2836.66 | norm: 1.76\n", "step635 | loss: 4.360756874084473 | dt: 1449.04ms | tok/sec: 2826.70 | norm: 2.12\n", "step636 | loss: 4.111001968383789 | dt: 1440.42ms | tok/sec: 2843.62 | norm: 1.88\n", "step637 | loss: 4.1568193435668945 | dt: 1443.45ms | tok/sec: 2837.66 | norm: 1.73\n", "step638 | loss: 4.2971906661987305 | dt: 1439.54ms | tok/sec: 2845.36 | norm: 1.77\n", "step639 | loss: 4.278807163238525 | dt: 1448.16ms | tok/sec: 2828.43 | norm: 1.82\n", "step640 | loss: 3.985626697540283 | dt: 1451.71ms | tok/sec: 2821.49 | 
norm: 1.84\n", "step641 | loss: 3.901503562927246 | dt: 1441.45ms | tok/sec: 2841.59 | norm: 1.38\n", "step642 | loss: 3.9274044036865234 | dt: 1445.49ms | tok/sec: 2833.65 | norm: 1.39\n", "step643 | loss: 4.703834056854248 | dt: 1442.07ms | tok/sec: 2840.37 | norm: 1.68\n", "step644 | loss: 4.522756576538086 | dt: 1444.94ms | tok/sec: 2834.72 | norm: 1.63\n", "step645 | loss: 4.53239631652832 | dt: 1450.62ms | tok/sec: 2823.61 | norm: 1.77\n", "step646 | loss: 4.370109558105469 | dt: 1439.39ms | tok/sec: 2845.64 | norm: 1.64\n", "step647 | loss: 4.509148120880127 | dt: 1451.15ms | tok/sec: 2822.59 | norm: 1.52\n", "step648 | loss: 4.451492786407471 | dt: 1447.29ms | tok/sec: 2830.13 | norm: 1.63\n", "step649 | loss: 4.27906608581543 | dt: 1444.21ms | tok/sec: 2836.15 | norm: 1.65\n", "step650 | loss: 3.9996249675750732 | dt: 1448.62ms | tok/sec: 2827.51 | norm: 1.70\n", "step651 | loss: 4.071080684661865 | dt: 1451.85ms | tok/sec: 2821.24 | norm: 1.50\n", "step652 | loss: 4.0697102546691895 | dt: 1447.68ms | tok/sec: 2829.36 | norm: 1.35\n", "step653 | loss: 4.162938117980957 | dt: 1446.88ms | tok/sec: 2830.91 | norm: 1.57\n", "step654 | loss: 4.213058948516846 | dt: 1448.18ms | tok/sec: 2828.38 | norm: 1.56\n", "step655 | loss: 4.380051612854004 | dt: 1447.20ms | tok/sec: 2830.29 | norm: 1.49\n", "step656 | loss: 4.32816743850708 | dt: 1442.30ms | tok/sec: 2839.91 | norm: 1.53\n", "step657 | loss: 4.226820945739746 | dt: 1453.85ms | tok/sec: 2817.35 | norm: 2.15\n", "step658 | loss: 4.259798526763916 | dt: 1450.46ms | tok/sec: 2823.93 | norm: 2.01\n", "step659 | loss: 4.167628288269043 | dt: 1447.60ms | tok/sec: 2829.51 | norm: 1.90\n", "step660 | loss: 4.228102684020996 | dt: 1444.59ms | tok/sec: 2835.40 | norm: 1.66\n", "step661 | loss: 3.9072837829589844 | dt: 1435.94ms | tok/sec: 2852.49 | norm: 1.67\n", "step662 | loss: 4.003950595855713 | dt: 1435.81ms | tok/sec: 2852.74 | norm: 1.59\n", "step663 | loss: 4.382406711578369 | dt: 1446.26ms | tok/sec: 2832.13 
| norm: 1.88\n", "step664 | loss: 3.969059705734253 | dt: 1448.46ms | tok/sec: 2827.83 | norm: 2.02\n", "step665 | loss: 4.211912155151367 | dt: 1443.07ms | tok/sec: 2838.38 | norm: 1.85\n", "step666 | loss: 3.9029488563537598 | dt: 1443.84ms | tok/sec: 2836.88 | norm: 1.75\n", "step667 | loss: 4.402716636657715 | dt: 1444.75ms | tok/sec: 2835.08 | norm: 1.76\n", "step668 | loss: 4.419709205627441 | dt: 1449.34ms | tok/sec: 2826.11 | norm: 1.99\n", "step669 | loss: 4.495043754577637 | dt: 1444.33ms | tok/sec: 2835.91 | norm: 2.14\n", "step670 | loss: 4.467289447784424 | dt: 1444.77ms | tok/sec: 2835.06 | norm: 1.78\n", "step671 | loss: 4.366879940032959 | dt: 1448.68ms | tok/sec: 2827.39 | norm: 1.51\n", "step672 | loss: 4.4076828956604 | dt: 1444.65ms | tok/sec: 2835.29 | norm: 1.64\n", "step673 | loss: 4.390005111694336 | dt: 1449.36ms | tok/sec: 2826.07 | norm: 1.56\n", "step674 | loss: 4.421118259429932 | dt: 1450.77ms | tok/sec: 2823.33 | norm: 1.79\n", "step675 | loss: 4.184621810913086 | dt: 1437.81ms | tok/sec: 2848.78 | norm: 1.49\n", "step676 | loss: 4.036233425140381 | dt: 1437.97ms | tok/sec: 2848.45 | norm: 1.47\n", "step677 | loss: 4.332939624786377 | dt: 1440.80ms | tok/sec: 2842.87 | norm: 1.81\n", "step678 | loss: 4.48549747467041 | dt: 1444.79ms | tok/sec: 2835.01 | norm: 1.95\n", "step679 | loss: 4.52016019821167 | dt: 1444.73ms | tok/sec: 2835.13 | norm: 1.82\n", "step680 | loss: 4.411609172821045 | dt: 1450.25ms | tok/sec: 2824.33 | norm: 1.77\n", "step681 | loss: 4.309284687042236 | dt: 1434.30ms | tok/sec: 2855.75 | norm: 1.78\n", "step682 | loss: 4.455377578735352 | dt: 1451.82ms | tok/sec: 2821.29 | norm: 1.83\n", "step683 | loss: 4.277469158172607 | dt: 1447.12ms | tok/sec: 2830.44 | norm: 1.69\n", "step684 | loss: 4.09932804107666 | dt: 1440.64ms | tok/sec: 2843.18 | norm: 1.62\n", "step685 | loss: 4.24647331237793 | dt: 1444.59ms | tok/sec: 2835.41 | norm: 1.80\n", "step686 | loss: 4.470576763153076 | dt: 1442.16ms | tok/sec: 2840.18 | 
norm: 1.66\n", "step687 | loss: 4.203147888183594 | dt: 1452.33ms | tok/sec: 2820.29 | norm: 1.94\n", "step688 | loss: 4.216793060302734 | dt: 1451.19ms | tok/sec: 2822.51 | norm: 1.70\n", "step689 | loss: 4.525407314300537 | dt: 1439.80ms | tok/sec: 2844.85 | norm: 1.61\n", "step690 | loss: 4.330930233001709 | dt: 1437.26ms | tok/sec: 2849.87 | norm: 1.40\n", "step691 | loss: 4.263252258300781 | dt: 1447.72ms | tok/sec: 2829.28 | norm: 1.56\n", "step692 | loss: 4.069068431854248 | dt: 1448.90ms | tok/sec: 2826.97 | norm: 1.66\n", "step693 | loss: 4.231716632843018 | dt: 1440.23ms | tok/sec: 2844.00 | norm: 1.61\n", "step694 | loss: 4.093886375427246 | dt: 1444.39ms | tok/sec: 2835.79 | norm: 1.61\n", "step695 | loss: 4.259829998016357 | dt: 1442.35ms | tok/sec: 2839.81 | norm: 1.48\n", "step696 | loss: 4.106372356414795 | dt: 1447.63ms | tok/sec: 2829.46 | norm: 1.87\n", "step697 | loss: 4.627236366271973 | dt: 1444.53ms | tok/sec: 2835.52 | norm: 2.41\n", "step698 | loss: 4.541844844818115 | dt: 1443.90ms | tok/sec: 2836.76 | norm: 2.33\n", "step699 | loss: 4.421751022338867 | dt: 1437.11ms | tok/sec: 2850.16 | norm: 2.07\n", "step700 | loss: 4.618506908416748 | dt: 1447.99ms | tok/sec: 2828.75 | norm: 1.79\n", "step701 | loss: 4.929530143737793 | dt: 1453.75ms | tok/sec: 2817.53 | norm: 2.10\n", "step702 | loss: 4.726560115814209 | dt: 1445.00ms | tok/sec: 2834.60 | norm: 2.15\n", "step703 | loss: 4.3945207595825195 | dt: 1449.29ms | tok/sec: 2826.21 | norm: 1.99\n", "step704 | loss: 4.484568119049072 | dt: 1451.61ms | tok/sec: 2821.69 | norm: 1.92\n", "step705 | loss: 4.596220970153809 | dt: 1437.20ms | tok/sec: 2849.98 | norm: 1.88\n", "step706 | loss: 4.413845062255859 | dt: 1444.42ms | tok/sec: 2835.74 | norm: 2.17\n", "step707 | loss: 4.355100631713867 | dt: 1445.47ms | tok/sec: 2833.69 | norm: 2.47\n", "step708 | loss: 4.245798587799072 | dt: 1442.42ms | tok/sec: 2839.68 | norm: 2.65\n", "step709 | loss: 4.106559753417969 | dt: 1450.04ms | tok/sec: 2824.75 
| norm: 1.91\n", "step710 | loss: 4.277285099029541 | dt: 1444.23ms | tok/sec: 2836.11 | norm: 2.03\n", "step711 | loss: 4.291108131408691 | dt: 1448.05ms | tok/sec: 2828.63 | norm: 2.01\n", "step712 | loss: 4.181967258453369 | dt: 1447.72ms | tok/sec: 2829.28 | norm: 2.13\n", "step713 | loss: 4.151841640472412 | dt: 1450.75ms | tok/sec: 2823.38 | norm: 1.77\n", "step714 | loss: 3.9794230461120605 | dt: 1443.85ms | tok/sec: 2836.86 | norm: 1.58\n", "step715 | loss: 3.9105606079101562 | dt: 1447.78ms | tok/sec: 2829.15 | norm: 1.73\n", "step716 | loss: 4.561700820922852 | dt: 1450.09ms | tok/sec: 2824.66 | norm: 1.96\n", "step717 | loss: 4.250373840332031 | dt: 1441.64ms | tok/sec: 2841.22 | norm: 2.05\n", "step718 | loss: 3.990934371948242 | dt: 1449.63ms | tok/sec: 2825.55 | norm: 1.58\n", "step719 | loss: 4.038787364959717 | dt: 1448.76ms | tok/sec: 2827.25 | norm: 1.75\n", "step720 | loss: 4.171940326690674 | dt: 1451.43ms | tok/sec: 2822.05 | norm: 1.72\n", "step721 | loss: 4.169094085693359 | dt: 1438.64ms | tok/sec: 2847.13 | norm: 1.85\n", "step722 | loss: 3.8707120418548584 | dt: 1445.81ms | tok/sec: 2833.01 | norm: 1.75\n", "step723 | loss: 3.764331817626953 | dt: 1436.41ms | tok/sec: 2851.55 | norm: 1.49\n", "step724 | loss: 3.7942886352539062 | dt: 1451.06ms | tok/sec: 2822.76 | norm: 1.62\n", "step725 | loss: 4.559459686279297 | dt: 1443.74ms | tok/sec: 2837.07 | norm: 1.67\n", "step726 | loss: 4.3765106201171875 | dt: 1441.74ms | tok/sec: 2841.02 | norm: 1.62\n", "step727 | loss: 4.4147257804870605 | dt: 1436.43ms | tok/sec: 2851.51 | norm: 1.81\n", "step728 | loss: 4.240688323974609 | dt: 1450.38ms | tok/sec: 2824.09 | norm: 1.66\n", "step729 | loss: 4.3763041496276855 | dt: 1443.30ms | tok/sec: 2837.95 | norm: 1.65\n", "step730 | loss: 4.32746696472168 | dt: 1445.34ms | tok/sec: 2833.93 | norm: 1.72\n", "step731 | loss: 4.171619415283203 | dt: 1452.66ms | tok/sec: 2819.66 | norm: 1.67\n", "step732 | loss: 3.8883275985717773 | dt: 1436.03ms | tok/sec: 
2852.30 | norm: 1.72\n", "step733 | loss: 3.9588444232940674 | dt: 1452.33ms | tok/sec: 2820.31 | norm: 1.72\n", "step734 | loss: 3.949384927749634 | dt: 1439.55ms | tok/sec: 2845.34 | norm: 1.60\n", "step735 | loss: 4.038397312164307 | dt: 1449.16ms | tok/sec: 2826.46 | norm: 1.55\n", "step736 | loss: 4.103679656982422 | dt: 1452.41ms | tok/sec: 2820.14 | norm: 1.52\n", "step737 | loss: 4.264881134033203 | dt: 1435.86ms | tok/sec: 2852.65 | norm: 1.57\n", "step738 | loss: 4.216683864593506 | dt: 1452.35ms | tok/sec: 2820.25 | norm: 1.72\n", "step739 | loss: 4.129377365112305 | dt: 1444.48ms | tok/sec: 2835.62 | norm: 2.18\n", "step740 | loss: 4.163913726806641 | dt: 1448.48ms | tok/sec: 2827.80 | norm: 1.93\n", "step741 | loss: 4.0770063400268555 | dt: 1446.28ms | tok/sec: 2832.10 | norm: 1.80\n", "step742 | loss: 4.151022434234619 | dt: 1445.62ms | tok/sec: 2833.38 | norm: 2.13\n", "step743 | loss: 3.825644016265869 | dt: 1446.41ms | tok/sec: 2831.84 | norm: 2.11\n", "step744 | loss: 3.903989315032959 | dt: 1448.56ms | tok/sec: 2827.64 | norm: 1.71\n", "step745 | loss: 4.286641597747803 | dt: 1436.76ms | tok/sec: 2850.85 | norm: 1.66\n", "step746 | loss: 3.890695571899414 | dt: 1440.69ms | tok/sec: 2843.08 | norm: 2.00\n", "step747 | loss: 4.125534534454346 | dt: 1434.24ms | tok/sec: 2855.86 | norm: 2.12\n", "step748 | loss: 3.8121793270111084 | dt: 1449.94ms | tok/sec: 2824.94 | norm: 2.00\n", "step749 | loss: 4.306314468383789 | dt: 1439.93ms | tok/sec: 2844.58 | norm: 1.89\n", "step750 | loss: 4.314875602722168 | dt: 1448.44ms | tok/sec: 2827.88 | norm: 1.73\n", "step751 | loss: 4.3931169509887695 | dt: 1452.51ms | tok/sec: 2819.94 | norm: 1.89\n", "step752 | loss: 4.366686820983887 | dt: 1437.00ms | tok/sec: 2850.38 | norm: 2.05\n", "step753 | loss: 4.2622456550598145 | dt: 1447.99ms | tok/sec: 2828.75 | norm: 2.01\n", "step754 | loss: 4.3070387840271 | dt: 1447.48ms | tok/sec: 2829.75 | norm: 1.95\n", "step755 | loss: 4.286289215087891 | dt: 1439.52ms | 
tok/sec: 2845.39 | norm: 1.84\n", "step756 | loss: 4.305007457733154 | dt: 1443.81ms | tok/sec: 2836.95 | norm: 1.59\n", "step757 | loss: 4.07480525970459 | dt: 1433.28ms | tok/sec: 2857.79 | norm: 1.63\n", "step758 | loss: 3.937227725982666 | dt: 1446.14ms | tok/sec: 2832.37 | norm: 1.81\n", "step759 | loss: 4.2274274826049805 | dt: 1439.40ms | tok/sec: 2845.62 | norm: 2.05\n", "step760 | loss: 4.401251792907715 | dt: 1451.15ms | tok/sec: 2822.59 | norm: 1.98\n", "step761 | loss: 4.427430629730225 | dt: 1452.29ms | tok/sec: 2820.38 | norm: 1.81\n", "step762 | loss: 4.312440872192383 | dt: 1448.47ms | tok/sec: 2827.82 | norm: 1.80\n", "step763 | loss: 4.22205924987793 | dt: 1445.08ms | tok/sec: 2834.44 | norm: 1.93\n", "step764 | loss: 4.350377559661865 | dt: 1444.65ms | tok/sec: 2835.30 | norm: 1.84\n", "step765 | loss: 4.1757330894470215 | dt: 1446.51ms | tok/sec: 2831.65 | norm: 1.70\n", "step766 | loss: 4.003306865692139 | dt: 1440.44ms | tok/sec: 2843.58 | norm: 1.68\n", "step767 | loss: 4.158267498016357 | dt: 1435.26ms | tok/sec: 2853.85 | norm: 2.18\n", "step768 | loss: 4.363535404205322 | dt: 1445.79ms | tok/sec: 2833.06 | norm: 2.06\n", "step769 | loss: 4.107522487640381 | dt: 1441.01ms | tok/sec: 2842.45 | norm: 2.20\n", "step770 | loss: 4.117395877838135 | dt: 1442.92ms | tok/sec: 2838.68 | norm: 2.09\n", "step771 | loss: 4.419598579406738 | dt: 1438.16ms | tok/sec: 2848.09 | norm: 2.15\n", "step772 | loss: 4.2174482345581055 | dt: 1447.70ms | tok/sec: 2829.31 | norm: 1.91\n", "step773 | loss: 4.153998851776123 | dt: 1433.27ms | tok/sec: 2857.79 | norm: 1.69\n", "step774 | loss: 3.9513251781463623 | dt: 1443.72ms | tok/sec: 2837.12 | norm: 1.65\n", "step775 | loss: 4.128494739532471 | dt: 1441.05ms | tok/sec: 2842.38 | norm: 1.74\n", "step776 | loss: 4.009593963623047 | dt: 1446.40ms | tok/sec: 2831.87 | norm: 1.87\n", "step777 | loss: 4.147928714752197 | dt: 1438.74ms | tok/sec: 2846.94 | norm: 1.73\n", "step778 | loss: 4.007620811462402 | dt: 
1449.14ms | tok/sec: 2826.51 | norm: 1.79\n", "step779 | loss: 4.52501106262207 | dt: 1449.94ms | tok/sec: 2824.95 | norm: 2.15\n", "step780 | loss: 4.438234329223633 | dt: 1452.81ms | tok/sec: 2819.36 | norm: 2.24\n", "step781 | loss: 4.323859214782715 | dt: 1440.14ms | tok/sec: 2844.18 | norm: 2.37\n", "step782 | loss: 4.496395111083984 | dt: 1449.78ms | tok/sec: 2825.26 | norm: 2.03\n", "step783 | loss: 4.793951034545898 | dt: 1450.85ms | tok/sec: 2823.16 | norm: 1.84\n", "step784 | loss: 4.595280170440674 | dt: 1448.00ms | tok/sec: 2828.74 | norm: 1.84\n", "step785 | loss: 4.273977756500244 | dt: 1454.18ms | tok/sec: 2816.71 | norm: 1.85\n", "step786 | loss: 4.352395057678223 | dt: 1443.22ms | tok/sec: 2838.09 | norm: 1.87\n", "step787 | loss: 4.470378875732422 | dt: 1439.30ms | tok/sec: 2845.82 | norm: 2.08\n", "step788 | loss: 4.297005653381348 | dt: 1450.74ms | tok/sec: 2823.38 | norm: 1.85\n", "step789 | loss: 4.253270149230957 | dt: 1444.64ms | tok/sec: 2835.30 | norm: 2.29\n", "step790 | loss: 4.147818088531494 | dt: 1449.37ms | tok/sec: 2826.06 | norm: 2.89\n", "step791 | loss: 4.025310039520264 | dt: 1436.30ms | tok/sec: 2851.78 | norm: 2.50\n", "step792 | loss: 4.1812944412231445 | dt: 1446.65ms | tok/sec: 2831.37 | norm: 2.37\n", "step793 | loss: 4.198884010314941 | dt: 1444.87ms | tok/sec: 2834.85 | norm: 2.09\n", "step794 | loss: 4.092230796813965 | dt: 1447.44ms | tok/sec: 2829.83 | norm: 2.14\n", "step795 | loss: 4.034039497375488 | dt: 1437.09ms | tok/sec: 2850.20 | norm: 1.57\n", "step796 | loss: 3.8649797439575195 | dt: 1440.95ms | tok/sec: 2842.58 | norm: 1.79\n", "step797 | loss: 3.799236297607422 | dt: 1447.63ms | tok/sec: 2829.45 | norm: 1.80\n", "step798 | loss: 4.449859142303467 | dt: 1439.35ms | tok/sec: 2845.73 | norm: 1.88\n", "step799 | loss: 4.126354217529297 | dt: 1449.06ms | tok/sec: 2826.67 | norm: 2.03\n", "step800 | loss: 3.861661195755005 | dt: 1446.69ms | tok/sec: 2831.30 | norm: 1.88\n", "step801 | loss: 3.9292171001434326 | 
dt: 1444.46ms | tok/sec: 2835.67 | norm: 1.94\n", "step802 | loss: 4.054919719696045 | dt: 1440.35ms | tok/sec: 2843.76 | norm: 2.07\n", "step803 | loss: 4.066946029663086 | dt: 1451.73ms | tok/sec: 2821.46 | norm: 1.91\n", "step804 | loss: 3.7603859901428223 | dt: 1450.80ms | tok/sec: 2823.28 | norm: 2.11\n", "step805 | loss: 3.6604199409484863 | dt: 1436.84ms | tok/sec: 2850.70 | norm: 1.93\n", "step806 | loss: 3.6722922325134277 | dt: 1449.26ms | tok/sec: 2826.27 | norm: 1.77\n", "step807 | loss: 4.426248550415039 | dt: 1441.45ms | tok/sec: 2841.58 | norm: 1.97\n", "step808 | loss: 4.2388834953308105 | dt: 1445.85ms | tok/sec: 2832.93 | norm: 1.94\n", "step809 | loss: 4.301135063171387 | dt: 1442.09ms | tok/sec: 2840.32 | norm: 2.29\n", "step810 | loss: 4.112524032592773 | dt: 1443.72ms | tok/sec: 2837.11 | norm: 1.80\n", "step811 | loss: 4.2348713874816895 | dt: 1438.72ms | tok/sec: 2846.98 | norm: 1.72\n", "step812 | loss: 4.191003322601318 | dt: 1455.77ms | tok/sec: 2813.62 | norm: 1.86\n", "step813 | loss: 4.062061309814453 | dt: 1447.28ms | tok/sec: 2830.13 | norm: 2.05\n", "step814 | loss: 3.7635507583618164 | dt: 1453.64ms | tok/sec: 2817.75 | norm: 1.70\n", "step815 | loss: 3.8384194374084473 | dt: 1443.36ms | tok/sec: 2837.83 | norm: 1.46\n", "step816 | loss: 3.8339955806732178 | dt: 1445.17ms | tok/sec: 2834.26 | norm: 1.63\n", "step817 | loss: 3.9167733192443848 | dt: 1444.52ms | tok/sec: 2835.55 | norm: 1.81\n", "step818 | loss: 3.9884965419769287 | dt: 1451.37ms | tok/sec: 2822.16 | norm: 1.64\n", "step819 | loss: 4.145778179168701 | dt: 1444.27ms | tok/sec: 2836.03 | norm: 1.56\n", "step820 | loss: 4.108086109161377 | dt: 1443.57ms | tok/sec: 2837.40 | norm: 1.80\n", "step821 | loss: 4.03638219833374 | dt: 1440.31ms | tok/sec: 2843.84 | norm: 2.36\n", "step822 | loss: 4.0773725509643555 | dt: 1454.11ms | tok/sec: 2816.84 | norm: 2.26\n", "step823 | loss: 3.988246440887451 | dt: 1452.24ms | tok/sec: 2820.46 | norm: 2.10\n", "step824 | loss: 
4.053038597106934 | dt: 1452.28ms | tok/sec: 2820.39 | norm: 1.93\n", "step825 | loss: 3.7173829078674316 | dt: 1445.75ms | tok/sec: 2833.14 | norm: 2.17\n", "step826 | loss: 3.7867560386657715 | dt: 1453.10ms | tok/sec: 2818.81 | norm: 2.14\n", "step827 | loss: 4.180225849151611 | dt: 1448.05ms | tok/sec: 2828.64 | norm: 2.19\n", "step828 | loss: 3.8150837421417236 | dt: 1451.18ms | tok/sec: 2822.53 | norm: 1.84\n", "step829 | loss: 4.038074970245361 | dt: 1450.92ms | tok/sec: 2823.04 | norm: 1.94\n", "step830 | loss: 3.733642101287842 | dt: 1452.51ms | tok/sec: 2819.94 | norm: 2.35\n", "step831 | loss: 4.198508262634277 | dt: 1435.64ms | tok/sec: 2853.07 | norm: 2.44\n", "step832 | loss: 4.195823669433594 | dt: 1454.19ms | tok/sec: 2816.69 | norm: 1.96\n", "step833 | loss: 4.266617298126221 | dt: 1455.00ms | tok/sec: 2815.12 | norm: 1.93\n", "step834 | loss: 4.254683494567871 | dt: 1453.02ms | tok/sec: 2818.96 | norm: 1.85\n", "step835 | loss: 4.149558067321777 | dt: 1455.63ms | tok/sec: 2813.91 | norm: 1.93\n", "step836 | loss: 4.192340850830078 | dt: 1448.43ms | tok/sec: 2827.88 | norm: 2.00\n", "step837 | loss: 4.164726257324219 | dt: 1455.37ms | tok/sec: 2814.41 | norm: 2.04\n", "step838 | loss: 4.164725303649902 | dt: 1454.97ms | tok/sec: 2815.18 | norm: 1.76\n", "step839 | loss: 3.9666073322296143 | dt: 1451.89ms | tok/sec: 2821.16 | norm: 1.68\n", "step840 | loss: 3.8285014629364014 | dt: 1452.80ms | tok/sec: 2819.39 | norm: 1.72\n", "step841 | loss: 4.114020824432373 | dt: 1453.73ms | tok/sec: 2817.57 | norm: 2.04\n", "step842 | loss: 4.281392574310303 | dt: 1447.34ms | tok/sec: 2830.02 | norm: 2.08\n", "step843 | loss: 4.290920257568359 | dt: 1445.63ms | tok/sec: 2833.37 | norm: 1.93\n", "step844 | loss: 4.179692268371582 | dt: 1453.65ms | tok/sec: 2817.74 | norm: 1.85\n", "step845 | loss: 4.107074737548828 | dt: 1448.98ms | tok/sec: 2826.83 | norm: 1.94\n", "step846 | loss: 4.225712299346924 | dt: 1449.29ms | tok/sec: 2826.22 | norm: 1.82\n", "step847 | 
loss: 4.060737133026123 | dt: 1448.72ms | tok/sec: 2827.32 | norm: 1.83\n", "step848 | loss: 3.8919577598571777 | dt: 1454.04ms | tok/sec: 2816.98 | norm: 1.69\n", "step849 | loss: 4.041390895843506 | dt: 1453.18ms | tok/sec: 2818.65 | norm: 1.82\n", "step850 | loss: 4.249907970428467 | dt: 1453.40ms | tok/sec: 2818.22 | norm: 1.95\n", "step851 | loss: 4.004054069519043 | dt: 1456.12ms | tok/sec: 2812.96 | norm: 2.01\n", "step852 | loss: 4.008541584014893 | dt: 1452.22ms | tok/sec: 2820.50 | norm: 2.07\n", "step853 | loss: 4.306646347045898 | dt: 1452.75ms | tok/sec: 2819.49 | norm: 1.88\n", "step854 | loss: 4.1091437339782715 | dt: 1457.36ms | tok/sec: 2810.55 | norm: 1.88\n", "step855 | loss: 4.028663158416748 | dt: 1456.08ms | tok/sec: 2813.03 | norm: 1.78\n", "step856 | loss: 3.8197386264801025 | dt: 1448.12ms | tok/sec: 2828.49 | norm: 1.84\n", "step857 | loss: 4.004386901855469 | dt: 1451.90ms | tok/sec: 2821.12 | norm: 1.82\n", "step858 | loss: 3.886216402053833 | dt: 1453.92ms | tok/sec: 2817.21 | norm: 1.85\n", "step859 | loss: 4.026863098144531 | dt: 1451.17ms | tok/sec: 2822.56 | norm: 1.89\n", "step860 | loss: 3.884765863418579 | dt: 1455.09ms | tok/sec: 2814.94 | norm: 1.92\n", "step861 | loss: 4.393168926239014 | dt: 1452.43ms | tok/sec: 2820.09 | norm: 2.19\n", "step862 | loss: 4.309086322784424 | dt: 1458.56ms | tok/sec: 2808.26 | norm: 2.14\n", "step863 | loss: 4.208496570587158 | dt: 1461.91ms | tok/sec: 2801.82 | norm: 2.23\n", "step864 | loss: 4.374499320983887 | dt: 1443.53ms | tok/sec: 2837.49 | norm: 2.06\n", "step865 | loss: 4.647050380706787 | dt: 1444.45ms | tok/sec: 2835.67 | norm: 2.05\n", "step866 | loss: 4.463078498840332 | dt: 1445.44ms | tok/sec: 2833.75 | norm: 2.02\n", "step867 | loss: 4.153178691864014 | dt: 1447.41ms | tok/sec: 2829.88 | norm: 1.86\n", "step868 | loss: 4.224534511566162 | dt: 1456.66ms | tok/sec: 2811.92 | norm: 1.95\n", "step869 | loss: 4.354718208312988 | dt: 1455.89ms | tok/sec: 2813.39 | norm: 2.14\n", 
"step870 | loss: 4.1663103103637695 | dt: 1446.27ms | tok/sec: 2832.12 | norm: 2.03\n", "step871 | loss: 4.124032974243164 | dt: 1449.46ms | tok/sec: 2825.89 | norm: 2.35\n", "step872 | loss: 4.02586030960083 | dt: 1444.68ms | tok/sec: 2835.23 | norm: 2.88\n", "step873 | loss: 3.918419599533081 | dt: 1444.15ms | tok/sec: 2836.27 | norm: 2.60\n", "step874 | loss: 4.052966117858887 | dt: 1450.72ms | tok/sec: 2823.42 | norm: 2.11\n", "step875 | loss: 4.089559555053711 | dt: 1451.43ms | tok/sec: 2822.04 | norm: 2.71\n", "step876 | loss: 3.9874267578125 | dt: 1448.89ms | tok/sec: 2826.99 | norm: 2.66\n", "step877 | loss: 3.9173460006713867 | dt: 1447.16ms | tok/sec: 2830.36 | norm: 1.96\n", "step878 | loss: 3.7693729400634766 | dt: 1449.36ms | tok/sec: 2826.07 | norm: 1.83\n", "step879 | loss: 3.6977286338806152 | dt: 1456.51ms | tok/sec: 2812.21 | norm: 1.88\n", "step880 | loss: 4.33180046081543 | dt: 1453.37ms | tok/sec: 2818.27 | norm: 2.12\n", "step881 | loss: 4.021844863891602 | dt: 1448.47ms | tok/sec: 2827.81 | norm: 2.44\n", "step882 | loss: 3.754702091217041 | dt: 1457.08ms | tok/sec: 2811.09 | norm: 2.07\n", "step883 | loss: 3.8125784397125244 | dt: 1448.46ms | tok/sec: 2827.83 | norm: 1.81\n", "step884 | loss: 3.957828998565674 | dt: 1456.53ms | tok/sec: 2812.15 | norm: 2.46\n", "step885 | loss: 3.954402208328247 | dt: 1453.82ms | tok/sec: 2817.40 | norm: 2.37\n", "step886 | loss: 3.6742310523986816 | dt: 1455.58ms | tok/sec: 2814.00 | norm: 2.43\n", "step887 | loss: 3.5594143867492676 | dt: 1454.20ms | tok/sec: 2816.67 | norm: 2.34\n", "step888 | loss: 3.5537590980529785 | dt: 1452.12ms | tok/sec: 2820.70 | norm: 1.96\n", "step889 | loss: 4.2895636558532715 | dt: 1445.64ms | tok/sec: 2833.35 | norm: 2.00\n", "step890 | loss: 4.103391647338867 | dt: 1449.52ms | tok/sec: 2825.77 | norm: 1.94\n", "step891 | loss: 4.20980167388916 | dt: 1453.01ms | tok/sec: 2818.97 | norm: 2.50\n", "step892 | loss: 4.022819995880127 | dt: 1453.01ms | tok/sec: 2818.98 | norm: 
2.42\n", "step893 | loss: 4.12714958190918 | dt: 1452.29ms | tok/sec: 2820.37 | norm: 2.23\n", "step894 | loss: 4.092396259307861 | dt: 1457.84ms | tok/sec: 2809.64 | norm: 2.35\n", "step895 | loss: 3.956422805786133 | dt: 1445.60ms | tok/sec: 2833.43 | norm: 2.05\n", "step896 | loss: 3.657424211502075 | dt: 1445.96ms | tok/sec: 2832.72 | norm: 2.17\n", "step897 | loss: 3.7303693294525146 | dt: 1451.00ms | tok/sec: 2822.89 | norm: 2.09\n", "step898 | loss: 3.7194840908050537 | dt: 1453.57ms | tok/sec: 2817.90 | norm: 1.96\n", "step899 | loss: 3.811445951461792 | dt: 1452.47ms | tok/sec: 2820.03 | norm: 2.20\n", "step900 | loss: 3.8963258266448975 | dt: 1455.48ms | tok/sec: 2814.19 | norm: 2.27\n", "step901 | loss: 4.0348219871521 | dt: 1452.65ms | tok/sec: 2819.68 | norm: 1.96\n", "step902 | loss: 3.9689667224884033 | dt: 1450.70ms | tok/sec: 2823.47 | norm: 1.83\n", "step903 | loss: 3.889946460723877 | dt: 1442.68ms | tok/sec: 2839.16 | norm: 2.31\n", "step904 | loss: 3.953756809234619 | dt: 1448.44ms | tok/sec: 2827.86 | norm: 2.35\n", "step905 | loss: 3.8573157787323 | dt: 1458.35ms | tok/sec: 2808.65 | norm: 2.04\n", "step906 | loss: 3.921621322631836 | dt: 1447.06ms | tok/sec: 2830.57 | norm: 1.89\n", "step907 | loss: 3.6058952808380127 | dt: 1454.98ms | tok/sec: 2815.16 | norm: 1.89\n", "step908 | loss: 3.6616218090057373 | dt: 1448.80ms | tok/sec: 2827.17 | norm: 1.88\n", "step909 | loss: 4.052510738372803 | dt: 1444.94ms | tok/sec: 2834.71 | norm: 2.15\n", "step910 | loss: 3.6741394996643066 | dt: 1447.23ms | tok/sec: 2830.23 | norm: 1.90\n", "step911 | loss: 3.8903019428253174 | dt: 1449.73ms | tok/sec: 2825.36 | norm: 1.89\n", "step912 | loss: 3.6199560165405273 | dt: 1454.62ms | tok/sec: 2815.85 | norm: 1.99\n", "step913 | loss: 4.089437961578369 | dt: 1445.49ms | tok/sec: 2833.65 | norm: 2.31\n", "step914 | loss: 4.060504913330078 | dt: 1444.89ms | tok/sec: 2834.81 | norm: 2.22\n", "step915 | loss: 4.116152763366699 | dt: 1447.96ms | tok/sec: 2828.80 | 
norm: 1.83\n", "step916 | loss: 4.1054534912109375 | dt: 1444.85ms | tok/sec: 2834.89 | norm: 1.77\n", "step917 | loss: 4.006865978240967 | dt: 1451.80ms | tok/sec: 2821.32 | norm: 2.07\n", "step918 | loss: 4.06325626373291 | dt: 1448.91ms | tok/sec: 2826.95 | norm: 2.03\n", "step919 | loss: 4.034075736999512 | dt: 1453.91ms | tok/sec: 2817.24 | norm: 2.01\n", "step920 | loss: 4.026345252990723 | dt: 1449.80ms | tok/sec: 2825.21 | norm: 1.79\n", "step921 | loss: 3.836486339569092 | dt: 1439.10ms | tok/sec: 2846.23 | norm: 1.82\n", "step922 | loss: 3.701857805252075 | dt: 1445.01ms | tok/sec: 2834.58 | norm: 1.80\n", "step923 | loss: 3.984572410583496 | dt: 1443.26ms | tok/sec: 2838.02 | norm: 2.00\n", "step924 | loss: 4.141959190368652 | dt: 1450.95ms | tok/sec: 2822.98 | norm: 1.96\n", "step925 | loss: 4.151538372039795 | dt: 1452.62ms | tok/sec: 2819.74 | norm: 1.98\n", "step926 | loss: 4.03513240814209 | dt: 1441.51ms | tok/sec: 2841.47 | norm: 1.95\n", "step927 | loss: 3.970369577407837 | dt: 1452.08ms | tok/sec: 2820.79 | norm: 1.99\n", "step928 | loss: 4.08612060546875 | dt: 1446.03ms | tok/sec: 2832.58 | norm: 1.95\n", "step929 | loss: 3.9287109375 | dt: 1439.77ms | tok/sec: 2844.90 | norm: 1.89\n", "step930 | loss: 3.766981840133667 | dt: 1449.87ms | tok/sec: 2825.08 | norm: 1.87\n", "step931 | loss: 3.901465654373169 | dt: 1452.92ms | tok/sec: 2819.14 | norm: 1.84\n", "step932 | loss: 4.106594085693359 | dt: 1439.81ms | tok/sec: 2844.82 | norm: 1.83\n", "step933 | loss: 3.8713278770446777 | dt: 1448.32ms | tok/sec: 2828.10 | norm: 2.11\n", "step934 | loss: 3.8879847526550293 | dt: 1439.81ms | tok/sec: 2844.82 | norm: 2.28\n", "step935 | loss: 4.165831565856934 | dt: 1450.35ms | tok/sec: 2824.15 | norm: 2.04\n", "step936 | loss: 3.9788477420806885 | dt: 1455.82ms | tok/sec: 2813.54 | norm: 2.04\n", "step937 | loss: 3.895249128341675 | dt: 1445.46ms | tok/sec: 2833.70 | norm: 1.96\n", "step938 | loss: 3.6977550983428955 | dt: 1456.04ms | tok/sec: 2813.11 | 
norm: 2.09\n", "step939 | loss: 3.87697434425354 | dt: 1452.89ms | tok/sec: 2819.20 | norm: 2.00\n", "step940 | loss: 3.762925148010254 | dt: 1455.17ms | tok/sec: 2814.79 | norm: 1.98\n", "step941 | loss: 3.897991418838501 | dt: 1452.27ms | tok/sec: 2820.41 | norm: 2.03\n", "step942 | loss: 3.763456106185913 | dt: 1453.58ms | tok/sec: 2817.87 | norm: 2.01\n", "step943 | loss: 4.266374588012695 | dt: 1451.55ms | tok/sec: 2821.81 | norm: 2.36\n", "step944 | loss: 4.187796592712402 | dt: 1448.74ms | tok/sec: 2827.28 | norm: 2.26\n", "step945 | loss: 4.085707187652588 | dt: 1442.42ms | tok/sec: 2839.67 | norm: 2.39\n", "step946 | loss: 4.242721080780029 | dt: 1453.23ms | tok/sec: 2818.55 | norm: 2.31\n", "step947 | loss: 4.496296405792236 | dt: 1455.42ms | tok/sec: 2814.31 | norm: 2.17\n", "step948 | loss: 4.309307098388672 | dt: 1451.81ms | tok/sec: 2821.30 | norm: 2.14\n", "step949 | loss: 4.019320011138916 | dt: 1447.80ms | tok/sec: 2829.11 | norm: 2.02\n", "step950 | loss: 4.103781223297119 | dt: 1445.05ms | tok/sec: 2834.51 | norm: 2.04\n", "step951 | loss: 4.239468574523926 | dt: 1450.54ms | tok/sec: 2823.77 | norm: 2.15\n", "step952 | loss: 4.050425052642822 | dt: 1454.39ms | tok/sec: 2816.30 | norm: 2.23\n", "step953 | loss: 4.00532865524292 | dt: 1447.82ms | tok/sec: 2829.08 | norm: 2.53\n", "step954 | loss: 3.9224982261657715 | dt: 1450.35ms | tok/sec: 2824.14 | norm: 2.61\n", "step955 | loss: 3.814807891845703 | dt: 1450.98ms | tok/sec: 2822.91 | norm: 2.65\n", "step956 | loss: 3.9162991046905518 | dt: 1452.42ms | tok/sec: 2820.12 | norm: 2.53\n", "step957 | loss: 4.001554012298584 | dt: 1456.18ms | tok/sec: 2812.85 | norm: 2.79\n", "step958 | loss: 3.8844738006591797 | dt: 1441.27ms | tok/sec: 2841.94 | norm: 2.82\n", "step959 | loss: 3.801079273223877 | dt: 1447.25ms | tok/sec: 2830.20 | norm: 2.73\n", "step960 | loss: 3.6556143760681152 | dt: 1452.95ms | tok/sec: 2819.10 | norm: 2.65\n", "step961 | loss: 3.594616413116455 | dt: 1452.64ms | tok/sec: 
2819.68 | norm: 2.12\n", "step962 | loss: 4.229109764099121 | dt: 1455.13ms | tok/sec: 2814.87 | norm: 2.29\n", "step963 | loss: 3.95074725151062 | dt: 1450.82ms | tok/sec: 2823.23 | norm: 3.14\n", "step964 | loss: 3.698045492172241 | dt: 1453.40ms | tok/sec: 2818.22 | norm: 3.17\n", "step965 | loss: 3.71537446975708 | dt: 1456.78ms | tok/sec: 2811.68 | norm: 2.96\n", "step966 | loss: 3.8513636589050293 | dt: 1456.44ms | tok/sec: 2812.33 | norm: 2.57\n", "step967 | loss: 3.8530642986297607 | dt: 1446.89ms | tok/sec: 2830.89 | norm: 2.52\n", "step968 | loss: 3.58026385307312 | dt: 1449.18ms | tok/sec: 2826.43 | norm: 2.38\n", "step969 | loss: 3.4856839179992676 | dt: 1443.29ms | tok/sec: 2837.96 | norm: 2.56\n", "step970 | loss: 3.459122657775879 | dt: 1447.29ms | tok/sec: 2830.12 | norm: 2.51\n", "step971 | loss: 4.1576828956604 | dt: 1452.30ms | tok/sec: 2820.35 | norm: 2.64\n", "step972 | loss: 3.9591445922851562 | dt: 1449.45ms | tok/sec: 2825.90 | norm: 2.38\n", "step973 | loss: 4.074720859527588 | dt: 1448.24ms | tok/sec: 2828.26 | norm: 2.74\n", "step974 | loss: 3.8947513103485107 | dt: 1445.67ms | tok/sec: 2833.28 | norm: 2.58\n", "step975 | loss: 4.004524230957031 | dt: 1457.67ms | tok/sec: 2809.97 | norm: 2.60\n", "step976 | loss: 3.9829349517822266 | dt: 1453.11ms | tok/sec: 2818.77 | norm: 2.59\n", "step977 | loss: 3.84340238571167 | dt: 1460.02ms | tok/sec: 2805.44 | norm: 2.48\n", "step978 | loss: 3.5513906478881836 | dt: 1447.05ms | tok/sec: 2830.59 | norm: 2.27\n", "step979 | loss: 3.629338264465332 | dt: 1449.13ms | tok/sec: 2826.53 | norm: 2.21\n", "step980 | loss: 3.609307050704956 | dt: 1450.39ms | tok/sec: 2824.07 | norm: 2.10\n", "step981 | loss: 3.693915367126465 | dt: 1446.87ms | tok/sec: 2830.94 | norm: 2.07\n", "step982 | loss: 3.7823379039764404 | dt: 1447.23ms | tok/sec: 2830.24 | norm: 2.34\n", "step983 | loss: 3.9031856060028076 | dt: 1443.11ms | tok/sec: 2838.31 | norm: 2.04\n", "step984 | loss: 3.8226964473724365 | dt: 1458.29ms | 
tok/sec: 2808.76 | norm: 2.02\n", "step985 | loss: 3.7648799419403076 | dt: 1462.47ms | tok/sec: 2800.73 | norm: 2.46\n", "step986 | loss: 3.8101301193237305 | dt: 1454.83ms | tok/sec: 2815.45 | norm: 2.26\n", "step987 | loss: 3.7138547897338867 | dt: 1456.29ms | tok/sec: 2812.63 | norm: 2.20\n", "step988 | loss: 3.7933599948883057 | dt: 1444.91ms | tok/sec: 2834.78 | norm: 2.26\n", "step989 | loss: 3.474100112915039 | dt: 1453.71ms | tok/sec: 2817.62 | norm: 2.25\n", "step990 | loss: 3.529863119125366 | dt: 1454.35ms | tok/sec: 2816.37 | norm: 2.04\n", "step991 | loss: 3.9187049865722656 | dt: 1447.06ms | tok/sec: 2830.58 | norm: 2.20\n", "step992 | loss: 3.542435884475708 | dt: 1454.96ms | tok/sec: 2815.20 | norm: 2.16\n", "step993 | loss: 3.734328031539917 | dt: 1448.74ms | tok/sec: 2827.28 | norm: 2.03\n", "step994 | loss: 3.4839494228363037 | dt: 1449.07ms | tok/sec: 2826.63 | norm: 2.21\n", "step995 | loss: 3.974329710006714 | dt: 1447.21ms | tok/sec: 2830.28 | norm: 2.67\n", "step996 | loss: 3.9434456825256348 | dt: 1454.01ms | tok/sec: 2817.04 | norm: 2.32\n", "step997 | loss: 3.978290557861328 | dt: 1451.10ms | tok/sec: 2822.69 | norm: 2.18\n", "step998 | loss: 3.964400053024292 | dt: 1452.97ms | tok/sec: 2819.06 | norm: 2.04\n", "step999 | loss: 3.8679349422454834 | dt: 1452.98ms | tok/sec: 2819.03 | norm: 2.02\n", "step1000 | loss: 3.9290919303894043 | dt: 1447.66ms | tok/sec: 2829.39 | norm: 2.06\n", "step1001 | loss: 3.9054243564605713 | dt: 1450.73ms | tok/sec: 2823.41 | norm: 2.27\n", "step1002 | loss: 3.881596565246582 | dt: 1455.84ms | tok/sec: 2813.50 | norm: 2.08\n", "step1003 | loss: 3.6960411071777344 | dt: 1449.70ms | tok/sec: 2825.41 | norm: 1.84\n", "step1004 | loss: 3.581388473510742 | dt: 1453.97ms | tok/sec: 2817.11 | norm: 1.99\n", "step1005 | loss: 3.8511903285980225 | dt: 1448.37ms | tok/sec: 2828.00 | norm: 2.40\n", "step1006 | loss: 3.9997658729553223 | dt: 1443.87ms | tok/sec: 2836.82 | norm: 2.18\n", "step1007 | loss: 
4.002960681915283 | dt: 1450.58ms | tok/sec: 2823.69 | norm: 2.19\n", "step1008 | loss: 3.8834447860717773 | dt: 1445.74ms | tok/sec: 2833.15 | norm: 2.14\n", "step1009 | loss: 3.8350882530212402 | dt: 1453.33ms | tok/sec: 2818.36 | norm: 2.52\n", "step1010 | loss: 3.939549446105957 | dt: 1455.83ms | tok/sec: 2813.52 | norm: 2.23\n", "step1011 | loss: 3.7939612865448 | dt: 1453.03ms | tok/sec: 2818.93 | norm: 2.13\n", "step1012 | loss: 3.6575911045074463 | dt: 1451.30ms | tok/sec: 2822.30 | norm: 2.19\n", "step1013 | loss: 3.7802810668945312 | dt: 1446.91ms | tok/sec: 2830.85 | norm: 2.35\n", "step1014 | loss: 3.9724035263061523 | dt: 1449.95ms | tok/sec: 2824.92 | norm: 2.18\n", "step1015 | loss: 3.745605945587158 | dt: 1447.81ms | tok/sec: 2829.10 | norm: 2.06\n", "step1016 | loss: 3.7726006507873535 | dt: 1442.55ms | tok/sec: 2839.42 | norm: 2.19\n", "step1017 | loss: 4.030423164367676 | dt: 1444.60ms | tok/sec: 2835.39 | norm: 2.20\n", "step1018 | loss: 3.8654797077178955 | dt: 1453.93ms | tok/sec: 2817.18 | norm: 2.40\n", "step1019 | loss: 3.7772905826568604 | dt: 1454.98ms | tok/sec: 2815.17 | norm: 2.27\n", "step1020 | loss: 3.5736286640167236 | dt: 1456.93ms | tok/sec: 2811.39 | norm: 2.17\n", "step1021 | loss: 3.7574563026428223 | dt: 1447.83ms | tok/sec: 2829.07 | norm: 2.34\n", "step1022 | loss: 3.6457247734069824 | dt: 1453.58ms | tok/sec: 2817.87 | norm: 2.27\n", "step1023 | loss: 3.780345916748047 | dt: 1453.73ms | tok/sec: 2817.58 | norm: 2.28\n", "step1024 | loss: 3.644838571548462 | dt: 1454.57ms | tok/sec: 2815.95 | norm: 2.31\n", "step1025 | loss: 4.14162540435791 | dt: 1451.19ms | tok/sec: 2822.51 | norm: 2.64\n", "step1026 | loss: 4.062448501586914 | dt: 1440.88ms | tok/sec: 2842.71 | norm: 2.60\n", "step1027 | loss: 3.968428611755371 | dt: 1450.86ms | tok/sec: 2823.15 | norm: 2.82\n", "step1028 | loss: 4.141637325286865 | dt: 1451.10ms | tok/sec: 2822.68 | norm: 2.77\n", "step1029 | loss: 4.384054183959961 | dt: 1455.32ms | tok/sec: 2814.50 | 
norm: 2.98\n", "step1030 | loss: 4.173174858093262 | dt: 1437.88ms | tok/sec: 2848.64 | norm: 2.70\n", "step1031 | loss: 3.8796582221984863 | dt: 1441.45ms | tok/sec: 2841.59 | norm: 2.34\n", "step1032 | loss: 3.965097188949585 | dt: 1447.40ms | tok/sec: 2829.90 | norm: 2.63\n", "step1033 | loss: 4.115586757659912 | dt: 1449.51ms | tok/sec: 2825.79 | norm: 3.14\n", "step1034 | loss: 3.937936305999756 | dt: 1456.67ms | tok/sec: 2811.90 | norm: 2.64\n", "step1035 | loss: 3.8865416049957275 | dt: 1440.09ms | tok/sec: 2844.27 | norm: 2.82\n", "step1036 | loss: 3.808229446411133 | dt: 1450.99ms | tok/sec: 2822.91 | norm: 3.09\n", "step1037 | loss: 3.7422142028808594 | dt: 1461.09ms | tok/sec: 2803.39 | norm: 3.20\n", "step1038 | loss: 3.8195977210998535 | dt: 1447.50ms | tok/sec: 2829.70 | norm: 3.02\n", "step1039 | loss: 3.9190256595611572 | dt: 1441.51ms | tok/sec: 2841.46 | norm: 3.58\n", "step1040 | loss: 3.7843515872955322 | dt: 1443.89ms | tok/sec: 2836.78 | norm: 3.09\n", "step1041 | loss: 3.7068700790405273 | dt: 1456.94ms | tok/sec: 2811.37 | norm: 2.73\n", "step1042 | loss: 3.5859076976776123 | dt: 1459.12ms | tok/sec: 2807.18 | norm: 3.12\n", "step1043 | loss: 3.5370357036590576 | dt: 1452.46ms | tok/sec: 2820.04 | norm: 3.85\n", "step1044 | loss: 4.17103385925293 | dt: 1450.95ms | tok/sec: 2822.99 | norm: 3.92\n", "step1045 | loss: 3.8975369930267334 | dt: 1450.87ms | tok/sec: 2823.13 | norm: 4.12\n", "step1046 | loss: 3.6287333965301514 | dt: 1455.64ms | tok/sec: 2813.89 | norm: 3.06\n", "step1047 | loss: 3.6454806327819824 | dt: 1453.03ms | tok/sec: 2818.93 | norm: 2.82\n", "step1048 | loss: 3.7819876670837402 | dt: 1454.57ms | tok/sec: 2815.96 | norm: 3.31\n", "step1049 | loss: 3.8184618949890137 | dt: 1451.08ms | tok/sec: 2822.72 | norm: 4.12\n", "step1050 | loss: 3.531522035598755 | dt: 1440.36ms | tok/sec: 2843.73 | norm: 3.35\n", "step1051 | loss: 3.42948842048645 | dt: 1451.83ms | tok/sec: 2821.26 | norm: 2.98\n", "step1052 | loss: 3.3747715950012207 
| dt: 1451.75ms | tok/sec: 2821.42 | norm: 2.82\n", "step1053 | loss: 4.0699381828308105 | dt: 1444.99ms | tok/sec: 2834.63 | norm: 2.94\n", "step1054 | loss: 3.871002435684204 | dt: 1440.97ms | tok/sec: 2842.52 | norm: 3.09\n", "step1055 | loss: 3.998018741607666 | dt: 1451.46ms | tok/sec: 2821.99 | norm: 3.29\n", "step1056 | loss: 3.805924415588379 | dt: 1449.35ms | tok/sec: 2826.10 | norm: 2.95\n", "step1057 | loss: 3.900602340698242 | dt: 1439.96ms | tok/sec: 2844.53 | norm: 2.78\n", "step1058 | loss: 3.8619205951690674 | dt: 1449.87ms | tok/sec: 2825.09 | norm: 2.83\n", "step1059 | loss: 3.7361931800842285 | dt: 1452.34ms | tok/sec: 2820.28 | norm: 2.77\n", "step1060 | loss: 3.4786832332611084 | dt: 1453.70ms | tok/sec: 2817.63 | norm: 3.38\n", "step1061 | loss: 3.5457746982574463 | dt: 1452.82ms | tok/sec: 2819.35 | norm: 3.08\n", "step1062 | loss: 3.513343334197998 | dt: 1447.38ms | tok/sec: 2829.95 | norm: 2.88\n", "step1063 | loss: 3.5813043117523193 | dt: 1439.34ms | tok/sec: 2845.76 | norm: 2.69\n", "step1064 | loss: 3.694998264312744 | dt: 1447.93ms | tok/sec: 2828.86 | norm: 2.86\n", "step1065 | loss: 3.813612937927246 | dt: 1445.27ms | tok/sec: 2834.08 | norm: 2.70\n", "step1066 | loss: 3.7148914337158203 | dt: 1452.33ms | tok/sec: 2820.30 | norm: 2.75\n", "step1067 | loss: 3.6307716369628906 | dt: 1445.87ms | tok/sec: 2832.89 | norm: 2.69\n", "step1068 | loss: 3.6818838119506836 | dt: 1453.88ms | tok/sec: 2817.29 | norm: 2.39\n", "step1069 | loss: 3.6134281158447266 | dt: 1450.63ms | tok/sec: 2823.59 | norm: 2.54\n", "step1070 | loss: 3.6793084144592285 | dt: 1449.82ms | tok/sec: 2825.17 | norm: 2.46\n", "step1071 | loss: 3.3669240474700928 | dt: 1442.79ms | tok/sec: 2838.94 | norm: 2.48\n", "step1072 | loss: 3.4144866466522217 | dt: 1447.16ms | tok/sec: 2830.37 | norm: 2.26\n", "step1073 | loss: 3.797008752822876 | dt: 1445.91ms | tok/sec: 2832.81 | norm: 2.53\n", "step1074 | loss: 3.4166667461395264 | dt: 1445.22ms | tok/sec: 2834.18 | norm: 
2.28\n", "step1075 | loss: 3.5895750522613525 | dt: 1449.42ms | tok/sec: 2825.96 | norm: 2.24\n", "step1076 | loss: 3.34812068939209 | dt: 1455.80ms | tok/sec: 2813.57 | norm: 2.41\n", "step1077 | loss: 3.849395275115967 | dt: 1453.15ms | tok/sec: 2818.71 | norm: 2.56\n", "step1078 | loss: 3.8070120811462402 | dt: 1441.55ms | tok/sec: 2841.39 | norm: 2.46\n", "step1079 | loss: 3.842967987060547 | dt: 1439.74ms | tok/sec: 2844.97 | norm: 2.59\n", "step1080 | loss: 3.8298096656799316 | dt: 1454.36ms | tok/sec: 2816.35 | norm: 2.51\n", "step1081 | loss: 3.7355897426605225 | dt: 1453.28ms | tok/sec: 2818.45 | norm: 2.82\n", "step1082 | loss: 3.8010787963867188 | dt: 1451.72ms | tok/sec: 2821.49 | norm: 2.77\n", "step1083 | loss: 3.7788615226745605 | dt: 1447.37ms | tok/sec: 2829.95 | norm: 2.65\n", "step1084 | loss: 3.7409348487854004 | dt: 1443.31ms | tok/sec: 2837.92 | norm: 2.44\n", "step1085 | loss: 3.5669100284576416 | dt: 1452.25ms | tok/sec: 2820.45 | norm: 2.33\n", "step1086 | loss: 3.4612467288970947 | dt: 1460.29ms | tok/sec: 2804.92 | norm: 2.29\n", "step1087 | loss: 3.7566914558410645 | dt: 1445.67ms | tok/sec: 2833.28 | norm: 2.70\n", "step1088 | loss: 3.8916685581207275 | dt: 1446.50ms | tok/sec: 2831.66 | norm: 2.63\n", "step1089 | loss: 3.9108874797821045 | dt: 1440.99ms | tok/sec: 2842.49 | norm: 3.09\n", "step1090 | loss: 3.7758889198303223 | dt: 1459.55ms | tok/sec: 2806.34 | norm: 2.75\n", "step1091 | loss: 3.7205348014831543 | dt: 1450.95ms | tok/sec: 2822.98 | norm: 2.40\n", "step1092 | loss: 3.811109781265259 | dt: 1446.48ms | tok/sec: 2831.69 | norm: 2.28\n", "step1093 | loss: 3.689265727996826 | dt: 1448.93ms | tok/sec: 2826.91 | norm: 3.16\n", "step1094 | loss: 3.560065269470215 | dt: 1439.87ms | tok/sec: 2844.69 | norm: 2.99\n", "step1095 | loss: 3.673739433288574 | dt: 1448.95ms | tok/sec: 2826.87 | norm: 2.85\n", "step1096 | loss: 3.8604001998901367 | dt: 1438.25ms | tok/sec: 2847.91 | norm: 2.58\n", "step1097 | loss: 3.652625560760498 | 
dt: 1448.56ms | tok/sec: 2827.64 | norm: 2.96\n", "step1098 | loss: 3.6655447483062744 | dt: 1450.26ms | tok/sec: 2824.33 | norm: 2.75\n", "step1099 | loss: 3.9144983291625977 | dt: 1441.35ms | tok/sec: 2841.77 | norm: 2.66\n", "step1100 | loss: 3.7431416511535645 | dt: 1455.84ms | tok/sec: 2813.50 | norm: 2.47\n", "step1101 | loss: 3.6807806491851807 | dt: 1448.10ms | tok/sec: 2828.54 | norm: 2.81\n", "step1102 | loss: 3.4724605083465576 | dt: 1451.94ms | tok/sec: 2821.05 | norm: 3.06\n", "step1103 | loss: 3.678325653076172 | dt: 1448.78ms | tok/sec: 2827.21 | norm: 3.17\n", "step1104 | loss: 3.5670275688171387 | dt: 1448.10ms | tok/sec: 2828.54 | norm: 3.09\n", "step1105 | loss: 3.6832594871520996 | dt: 1453.27ms | tok/sec: 2818.47 | norm: 3.02\n", "step1106 | loss: 3.54788875579834 | dt: 1446.78ms | tok/sec: 2831.12 | norm: 2.83\n", "step1107 | loss: 4.027676105499268 | dt: 1447.24ms | tok/sec: 2830.22 | norm: 3.29\n", "step1108 | loss: 3.9173266887664795 | dt: 1444.13ms | tok/sec: 2836.31 | norm: 2.94\n", "step1109 | loss: 3.830869674682617 | dt: 1445.85ms | tok/sec: 2832.93 | norm: 2.88\n", "step1110 | loss: 4.024904251098633 | dt: 1440.84ms | tok/sec: 2842.79 | norm: 3.22\n", "step1111 | loss: 4.302937984466553 | dt: 1444.80ms | tok/sec: 2835.00 | norm: 3.77\n", "step1112 | loss: 4.098597526550293 | dt: 1446.18ms | tok/sec: 2832.30 | norm: 3.28\n", "step1113 | loss: 3.785801410675049 | dt: 1444.16ms | tok/sec: 2836.25 | norm: 2.96\n", "step1114 | loss: 3.8641672134399414 | dt: 1450.82ms | tok/sec: 2823.23 | norm: 2.90\n", "step1115 | loss: 4.02388334274292 | dt: 1437.73ms | tok/sec: 2848.93 | norm: 3.18\n", "step1116 | loss: 3.8470492362976074 | dt: 1441.21ms | tok/sec: 2842.05 | norm: 3.59\n", "step1117 | loss: 3.813481330871582 | dt: 1445.31ms | tok/sec: 2834.00 | norm: 3.52\n", "step1118 | loss: 3.736497163772583 | dt: 1450.06ms | tok/sec: 2824.70 | norm: 3.41\n", "step1119 | loss: 3.6744301319122314 | dt: 1448.34ms | tok/sec: 2828.06 | norm: 3.26\n", 
"step1120 | loss: 3.745173454284668 | dt: 1454.32ms | tok/sec: 2816.43 | norm: 3.31\n", "step1121 | loss: 3.836928367614746 | dt: 1456.36ms | tok/sec: 2812.48 | norm: 3.32\n", "step1122 | loss: 3.6994166374206543 | dt: 1447.36ms | tok/sec: 2829.98 | norm: 3.34\n", "step1123 | loss: 3.6253817081451416 | dt: 1453.85ms | tok/sec: 2817.34 | norm: 3.48\n", "step1124 | loss: 3.5147132873535156 | dt: 1450.76ms | tok/sec: 2823.34 | norm: 3.34\n", "step1125 | loss: 3.4603686332702637 | dt: 1442.71ms | tok/sec: 2839.10 | norm: 3.21\n", "step1126 | loss: 4.069830894470215 | dt: 1449.00ms | tok/sec: 2826.78 | norm: 3.08\n", "step1127 | loss: 3.8146517276763916 | dt: 1453.30ms | tok/sec: 2818.42 | norm: 3.57\n", "step1128 | loss: 3.5965287685394287 | dt: 1455.07ms | tok/sec: 2814.99 | norm: 3.52\n", "step1129 | loss: 3.6042838096618652 | dt: 1453.76ms | tok/sec: 2817.52 | norm: 3.45\n", "step1130 | loss: 3.6889331340789795 | dt: 1454.63ms | tok/sec: 2815.84 | norm: 3.11\n", "step1131 | loss: 3.7037932872772217 | dt: 1451.30ms | tok/sec: 2822.30 | norm: 3.10\n", "step1132 | loss: 3.4268434047698975 | dt: 1454.06ms | tok/sec: 2816.94 | norm: 3.00\n", "step1133 | loss: 3.3260791301727295 | dt: 1452.59ms | tok/sec: 2819.78 | norm: 2.92\n", "step1134 | loss: 3.276679515838623 | dt: 1437.93ms | tok/sec: 2848.53 | norm: 2.95\n", "step1135 | loss: 3.981884002685547 | dt: 1440.23ms | tok/sec: 2844.00 | norm: 3.29\n", "step1136 | loss: 3.7680983543395996 | dt: 1449.06ms | tok/sec: 2826.66 | norm: 3.05\n", "step1137 | loss: 3.891313076019287 | dt: 1446.08ms | tok/sec: 2832.48 | norm: 3.57\n", "step1138 | loss: 3.719045877456665 | dt: 1439.83ms | tok/sec: 2844.79 | norm: 3.33\n", "step1139 | loss: 3.7994422912597656 | dt: 1451.23ms | tok/sec: 2822.42 | norm: 3.04\n", "step1140 | loss: 3.7644219398498535 | dt: 1452.51ms | tok/sec: 2819.94 | norm: 2.88\n", "step1141 | loss: 3.6401724815368652 | dt: 1451.82ms | tok/sec: 2821.29 | norm: 2.92\n", "step1142 | loss: 3.392120599746704 | dt: 
1452.68ms | tok/sec: 2819.62 | norm: 2.93\n", "step1143 | loss: 3.4464550018310547 | dt: 1455.57ms | tok/sec: 2814.01 | norm: 2.67\n", "step1144 | loss: 3.4204370975494385 | dt: 1451.65ms | tok/sec: 2821.62 | norm: 2.77\n", "step1145 | loss: 3.4848926067352295 | dt: 1453.19ms | tok/sec: 2818.63 | norm: 2.69\n", "step1146 | loss: 3.6145689487457275 | dt: 1452.90ms | tok/sec: 2819.20 | norm: 3.07\n", "step1147 | loss: 3.712216854095459 | dt: 1454.40ms | tok/sec: 2816.28 | norm: 2.83\n", "step1148 | loss: 3.630281686782837 | dt: 1435.92ms | tok/sec: 2852.52 | norm: 3.01\n", "step1149 | loss: 3.5462913513183594 | dt: 1447.95ms | tok/sec: 2828.83 | norm: 3.47\n", "step1150 | loss: 3.6125376224517822 | dt: 1455.72ms | tok/sec: 2813.72 | norm: 3.35\n", "step1151 | loss: 3.5169105529785156 | dt: 1451.79ms | tok/sec: 2821.35 | norm: 3.04\n", "step1152 | loss: 3.582515239715576 | dt: 1446.32ms | tok/sec: 2832.01 | norm: 2.87\n", "step1153 | loss: 3.263972759246826 | dt: 1452.89ms | tok/sec: 2819.21 | norm: 2.73\n", "step1154 | loss: 3.3093903064727783 | dt: 1444.90ms | tok/sec: 2834.79 | norm: 2.92\n", "step1155 | loss: 3.720813751220703 | dt: 1452.12ms | tok/sec: 2820.70 | norm: 3.38\n", "step1156 | loss: 3.3435728549957275 | dt: 1451.04ms | tok/sec: 2822.81 | norm: 3.12\n", "step1157 | loss: 3.5012810230255127 | dt: 1451.80ms | tok/sec: 2821.32 | norm: 2.77\n", "step1158 | loss: 3.2518694400787354 | dt: 1442.84ms | tok/sec: 2838.84 | norm: 2.55\n", "step1159 | loss: 3.7243154048919678 | dt: 1445.46ms | tok/sec: 2833.69 | norm: 2.72\n", "step1160 | loss: 3.6937761306762695 | dt: 1450.04ms | tok/sec: 2824.74 | norm: 3.03\n", "step1161 | loss: 3.726954460144043 | dt: 1435.20ms | tok/sec: 2853.97 | norm: 2.91\n", "step1162 | loss: 3.726532459259033 | dt: 1455.39ms | tok/sec: 2814.36 | norm: 2.65\n", "step1163 | loss: 3.6371309757232666 | dt: 1447.56ms | tok/sec: 2829.58 | norm: 2.73\n", "step1164 | loss: 3.708040714263916 | dt: 1445.19ms | tok/sec: 2834.23 | norm: 2.70\n", 
"step1165 | loss: 3.68489146232605 | dt: 1454.09ms | tok/sec: 2816.89 | norm: 2.99\n", "step1166 | loss: 3.6521384716033936 | dt: 1448.23ms | tok/sec: 2828.29 | norm: 3.05\n", "step1167 | loss: 3.47226619720459 | dt: 1443.42ms | tok/sec: 2837.71 | norm: 2.97\n", "step1168 | loss: 3.326901435852051 | dt: 1447.43ms | tok/sec: 2829.85 | norm: 2.44\n", "step1169 | loss: 3.6261444091796875 | dt: 1445.30ms | tok/sec: 2834.02 | norm: 2.72\n", "step1170 | loss: 3.748194456100464 | dt: 1447.36ms | tok/sec: 2829.98 | norm: 2.43\n", "step1171 | loss: 3.7967071533203125 | dt: 1455.57ms | tok/sec: 2814.02 | norm: 2.75\n", "step1172 | loss: 3.6691348552703857 | dt: 1453.72ms | tok/sec: 2817.60 | norm: 3.17\n", "step1173 | loss: 3.6015498638153076 | dt: 1451.28ms | tok/sec: 2822.34 | norm: 3.25\n", "step1174 | loss: 3.6879725456237793 | dt: 1441.59ms | tok/sec: 2841.32 | norm: 3.06\n", "step1175 | loss: 3.5857670307159424 | dt: 1448.27ms | tok/sec: 2828.21 | norm: 2.80\n", "step1176 | loss: 3.43173885345459 | dt: 1446.85ms | tok/sec: 2830.97 | norm: 2.54\n", "step1177 | loss: 3.5402538776397705 | dt: 1447.38ms | tok/sec: 2829.95 | norm: 2.59\n", "step1178 | loss: 3.7251148223876953 | dt: 1447.40ms | tok/sec: 2829.91 | norm: 2.72\n", "step1179 | loss: 3.550017833709717 | dt: 1440.88ms | tok/sec: 2842.70 | norm: 3.04\n", "step1180 | loss: 3.549325704574585 | dt: 1446.41ms | tok/sec: 2831.84 | norm: 2.72\n", "step1181 | loss: 3.78363299369812 | dt: 1453.30ms | tok/sec: 2818.42 | norm: 2.63\n", "step1182 | loss: 3.599607467651367 | dt: 1437.07ms | tok/sec: 2850.24 | norm: 2.47\n", "step1183 | loss: 3.5564656257629395 | dt: 1450.79ms | tok/sec: 2823.29 | norm: 2.72\n", "step1184 | loss: 3.355560541152954 | dt: 1451.61ms | tok/sec: 2821.70 | norm: 2.78\n", "step1185 | loss: 3.550461769104004 | dt: 1451.68ms | tok/sec: 2821.56 | norm: 2.81\n", "step1186 | loss: 3.438836097717285 | dt: 1437.30ms | tok/sec: 2849.79 | norm: 2.73\n", "step1187 | loss: 3.5622718334198 | dt: 1435.77ms | 
tok/sec: 2852.83 | norm: 2.81\n", "step1188 | loss: 3.422891855239868 | dt: 1449.17ms | tok/sec: 2826.44 | norm: 2.85\n", "step1189 | loss: 3.9167420864105225 | dt: 1450.48ms | tok/sec: 2823.88 | norm: 3.30\n", "step1190 | loss: 3.7889435291290283 | dt: 1446.21ms | tok/sec: 2832.23 | norm: 2.91\n", "step1191 | loss: 3.7226758003234863 | dt: 1437.59ms | tok/sec: 2849.22 | norm: 3.11\n", "step1192 | loss: 3.90228271484375 | dt: 1443.08ms | tok/sec: 2838.37 | norm: 3.11\n", "step1193 | loss: 4.174879550933838 | dt: 1443.61ms | tok/sec: 2837.32 | norm: 3.35\n", "step1194 | loss: 3.963388442993164 | dt: 1455.60ms | tok/sec: 2813.96 | norm: 3.45\n", "step1195 | loss: 3.675394296646118 | dt: 1451.64ms | tok/sec: 2821.64 | norm: 3.23\n", "step1196 | loss: 3.7476165294647217 | dt: 1439.49ms | tok/sec: 2845.45 | norm: 3.12\n", "step1197 | loss: 3.8983421325683594 | dt: 1449.12ms | tok/sec: 2826.55 | norm: 3.09\n", "step1198 | loss: 3.7180912494659424 | dt: 1442.70ms | tok/sec: 2839.12 | norm: 2.82\n", "step1199 | loss: 3.6869235038757324 | dt: 1443.43ms | tok/sec: 2837.69 | norm: 3.26\n", "step1200 | loss: 3.6274988651275635 | dt: 1435.82ms | tok/sec: 2852.73 | norm: 3.88\n", "step1201 | loss: 3.5438666343688965 | dt: 1447.20ms | tok/sec: 2830.29 | norm: 3.08\n", "step1202 | loss: 3.599811315536499 | dt: 1447.99ms | tok/sec: 2828.75 | norm: 2.79\n", "step1203 | loss: 3.703568458557129 | dt: 1444.00ms | tok/sec: 2836.56 | norm: 3.13\n", "step1204 | loss: 3.5629703998565674 | dt: 1451.20ms | tok/sec: 2822.49 | norm: 3.06\n", "step1205 | loss: 3.5022850036621094 | dt: 1451.75ms | tok/sec: 2821.43 | norm: 2.92\n", "step1206 | loss: 3.411656141281128 | dt: 1452.00ms | tok/sec: 2820.93 | norm: 3.20\n", "step1207 | loss: 3.3215551376342773 | dt: 1452.04ms | tok/sec: 2820.86 | norm: 2.96\n", "step1208 | loss: 3.9232025146484375 | dt: 1441.14ms | tok/sec: 2842.20 | norm: 3.44\n", "step1209 | loss: 3.6676058769226074 | dt: 1445.93ms | tok/sec: 2832.77 | norm: 3.17\n", "step1210 | 
loss: 3.421034097671509 | dt: 1439.37ms | tok/sec: 2845.69 | norm: 3.11\n", "step1211 | loss: 3.4329984188079834 | dt: 1450.78ms | tok/sec: 2823.31 | norm: 2.86\n", "step1212 | loss: 3.544773578643799 | dt: 1442.84ms | tok/sec: 2838.85 | norm: 3.02\n", "step1213 | loss: 3.559906005859375 | dt: 1448.96ms | tok/sec: 2826.85 | norm: 2.87\n", "step1214 | loss: 3.2789907455444336 | dt: 1444.18ms | tok/sec: 2836.21 | norm: 2.68\n", "step1215 | loss: 3.1904547214508057 | dt: 1444.08ms | tok/sec: 2836.42 | norm: 2.83\n", "step1216 | loss: 3.138681173324585 | dt: 1450.86ms | tok/sec: 2823.15 | norm: 2.57\n", "step1217 | loss: 3.82690167427063 | dt: 1439.45ms | tok/sec: 2845.53 | norm: 2.95\n", "step1218 | loss: 3.619699716567993 | dt: 1437.09ms | tok/sec: 2850.21 | norm: 2.86\n", "step1219 | loss: 3.7614200115203857 | dt: 1449.10ms | tok/sec: 2826.58 | norm: 3.19\n", "step1220 | loss: 3.5678341388702393 | dt: 1441.23ms | tok/sec: 2842.02 | norm: 2.82\n", "step1221 | loss: 3.648859977722168 | dt: 1447.06ms | tok/sec: 2830.58 | norm: 3.05\n", "step1222 | loss: 3.6302952766418457 | dt: 1433.22ms | tok/sec: 2857.90 | norm: 3.26\n", "step1223 | loss: 3.5092484951019287 | dt: 1439.61ms | tok/sec: 2845.22 | norm: 3.17\n", "step1224 | loss: 3.2794370651245117 | dt: 1447.73ms | tok/sec: 2829.26 | norm: 3.19\n", "step1225 | loss: 3.3237061500549316 | dt: 1443.60ms | tok/sec: 2837.34 | norm: 3.21\n", "step1226 | loss: 3.299518346786499 | dt: 1440.74ms | tok/sec: 2842.99 | norm: 3.27\n", "step1227 | loss: 3.3417255878448486 | dt: 1437.84ms | tok/sec: 2848.71 | norm: 3.14\n", "step1228 | loss: 3.4892406463623047 | dt: 1438.60ms | tok/sec: 2847.21 | norm: 3.16\n", "step1229 | loss: 3.5792860984802246 | dt: 1445.84ms | tok/sec: 2832.96 | norm: 2.76\n", "step1230 | loss: 3.484585762023926 | dt: 1442.85ms | tok/sec: 2838.82 | norm: 2.84\n", "step1231 | loss: 3.4150984287261963 | dt: 1449.44ms | tok/sec: 2825.91 | norm: 3.13\n", "step1232 | loss: 3.4725570678710938 | dt: 1445.43ms | tok/sec: 
2833.75 | norm: 3.17\n", "step1233 | loss: 3.384406566619873 | dt: 1444.49ms | tok/sec: 2835.59 | norm: 3.17\n", "step1234 | loss: 3.448181629180908 | dt: 1446.65ms | tok/sec: 2831.38 | norm: 2.97\n", "step1235 | loss: 3.1297860145568848 | dt: 1449.71ms | tok/sec: 2825.39 | norm: 2.84\n", "step1236 | loss: 3.1800787448883057 | dt: 1436.70ms | tok/sec: 2850.98 | norm: 2.55\n", "step1237 | loss: 3.5909459590911865 | dt: 1450.38ms | tok/sec: 2824.08 | norm: 2.89\n", "step1238 | loss: 3.2118031978607178 | dt: 1450.02ms | tok/sec: 2824.78 | norm: 3.08\n", "step1239 | loss: 3.37359619140625 | dt: 1439.45ms | tok/sec: 2845.53 | norm: 3.02\n", "step1240 | loss: 3.1281731128692627 | dt: 1451.25ms | tok/sec: 2822.39 | norm: 3.28\n", "step1241 | loss: 3.578911781311035 | dt: 1438.06ms | tok/sec: 2848.27 | norm: 2.85\n", "step1242 | loss: 3.5539467334747314 | dt: 1449.28ms | tok/sec: 2826.22 | norm: 2.62\n", "step1243 | loss: 3.5824904441833496 | dt: 1449.08ms | tok/sec: 2826.63 | norm: 2.59\n", "step1244 | loss: 3.559507131576538 | dt: 1452.20ms | tok/sec: 2820.54 | norm: 2.45\n", "step1245 | loss: 3.468959331512451 | dt: 1437.67ms | tok/sec: 2849.06 | norm: 2.71\n", "step1246 | loss: 3.571873426437378 | dt: 1439.18ms | tok/sec: 2846.07 | norm: 2.87\n", "step1247 | loss: 3.541560173034668 | dt: 1442.37ms | tok/sec: 2839.76 | norm: 2.90\n", "step1248 | loss: 3.529926061630249 | dt: 1451.39ms | tok/sec: 2822.12 | norm: 3.14\n", "step1249 | loss: 3.3476743698120117 | dt: 1438.37ms | tok/sec: 2847.67 | norm: 2.95\n", "step1250 | loss: 3.1931252479553223 | dt: 1456.23ms | tok/sec: 2812.75 | norm: 2.71\n", "step1251 | loss: 3.498459577560425 | dt: 1457.26ms | tok/sec: 2810.75 | norm: 3.22\n", "step1252 | loss: 3.6062867641448975 | dt: 1442.97ms | tok/sec: 2838.58 | norm: 3.00\n", "step1253 | loss: 3.656637191772461 | dt: 1447.45ms | tok/sec: 2829.80 | norm: 2.89\n", "step1254 | loss: 3.541102409362793 | dt: 1451.57ms | tok/sec: 2821.76 | norm: 2.93\n", "step1255 | loss: 
3.479316234588623 | dt: 1437.91ms | tok/sec: 2848.57 | norm: 2.86\n", "step1256 | loss: 3.5836617946624756 | dt: 1451.55ms | tok/sec: 2821.81 | norm: 3.18\n", "step1257 | loss: 3.467555046081543 | dt: 1453.53ms | tok/sec: 2817.96 | norm: 3.28\n", "step1258 | loss: 3.321120262145996 | dt: 1450.41ms | tok/sec: 2824.02 | norm: 3.19\n", "step1259 | loss: 3.404651165008545 | dt: 1449.59ms | tok/sec: 2825.63 | norm: 2.97\n", "step1260 | loss: 3.5964293479919434 | dt: 1448.16ms | tok/sec: 2828.41 | norm: 3.01\n", "step1261 | loss: 3.4281086921691895 | dt: 1435.07ms | tok/sec: 2854.22 | norm: 3.12\n", "step1262 | loss: 3.42305326461792 | dt: 1447.96ms | tok/sec: 2828.81 | norm: 2.80\n", "step1263 | loss: 3.6419081687927246 | dt: 1444.98ms | tok/sec: 2834.65 | norm: 2.83\n", "step1264 | loss: 3.462308645248413 | dt: 1453.14ms | tok/sec: 2818.73 | norm: 2.92\n", "step1265 | loss: 3.423816442489624 | dt: 1445.06ms | tok/sec: 2834.49 | norm: 2.86\n", "step1266 | loss: 3.1964569091796875 | dt: 1445.17ms | tok/sec: 2834.27 | norm: 2.64\n", "step1267 | loss: 3.3975772857666016 | dt: 1443.75ms | tok/sec: 2837.05 | norm: 2.76\n", "step1268 | loss: 3.299860715866089 | dt: 1447.44ms | tok/sec: 2829.83 | norm: 2.86\n", "step1269 | loss: 3.422297477722168 | dt: 1448.16ms | tok/sec: 2828.42 | norm: 2.90\n", "step1270 | loss: 3.3125736713409424 | dt: 1440.75ms | tok/sec: 2842.97 | norm: 3.14\n", "step1271 | loss: 3.8013803958892822 | dt: 1452.96ms | tok/sec: 2819.07 | norm: 3.58\n", "step1272 | loss: 3.654487133026123 | dt: 1449.17ms | tok/sec: 2826.45 | norm: 3.27\n", "step1273 | loss: 3.5802206993103027 | dt: 1436.33ms | tok/sec: 2851.71 | norm: 3.18\n", "step1274 | loss: 3.7515676021575928 | dt: 1454.36ms | tok/sec: 2816.37 | norm: 3.16\n", "step1275 | loss: 3.994290351867676 | dt: 1445.40ms | tok/sec: 2833.82 | norm: 3.16\n", "step1276 | loss: 3.802091360092163 | dt: 1441.33ms | tok/sec: 2841.81 | norm: 3.13\n", "step1277 | loss: 3.5415289402008057 | dt: 1444.12ms | tok/sec: 2836.32 
| norm: 3.21\n", "step1278 | loss: 3.6211154460906982 | dt: 1435.72ms | tok/sec: 2852.93 | norm: 3.32\n", "step1279 | loss: 3.798858642578125 | dt: 1437.19ms | tok/sec: 2850.00 | norm: 3.81\n", "step1280 | loss: 3.617563486099243 | dt: 1440.99ms | tok/sec: 2842.50 | norm: 3.58\n", "step1281 | loss: 3.598262071609497 | dt: 1447.86ms | tok/sec: 2829.00 | norm: 3.71\n", "step1282 | loss: 3.5322322845458984 | dt: 1445.99ms | tok/sec: 2832.66 | norm: 3.88\n", "step1283 | loss: 3.3965814113616943 | dt: 1438.70ms | tok/sec: 2847.01 | norm: 2.96\n", "step1284 | loss: 3.4383127689361572 | dt: 1448.00ms | tok/sec: 2828.73 | norm: 2.74\n", "step1285 | loss: 3.5866708755493164 | dt: 1450.53ms | tok/sec: 2823.80 | norm: 4.12\n", "step1286 | loss: 3.4291417598724365 | dt: 1453.83ms | tok/sec: 2817.39 | norm: 3.03\n", "step1287 | loss: 3.380134105682373 | dt: 1436.31ms | tok/sec: 2851.76 | norm: 3.24\n", "step1288 | loss: 3.2765262126922607 | dt: 1440.30ms | tok/sec: 2843.85 | norm: 3.06\n", "step1289 | loss: 3.1911802291870117 | dt: 1442.77ms | tok/sec: 2838.98 | norm: 2.83\n", "step1290 | loss: 3.7959988117218018 | dt: 1446.54ms | tok/sec: 2831.59 | norm: 2.98\n", "step1291 | loss: 3.5639660358428955 | dt: 1443.77ms | tok/sec: 2837.02 | norm: 3.40\n", "step1292 | loss: 3.3040528297424316 | dt: 1449.39ms | tok/sec: 2826.01 | norm: 3.31\n", "step1293 | loss: 3.2981178760528564 | dt: 1436.14ms | tok/sec: 2852.08 | norm: 2.98\n", "step1294 | loss: 3.3885269165039062 | dt: 1453.31ms | tok/sec: 2818.39 | norm: 2.74\n", "step1295 | loss: 3.407174825668335 | dt: 1449.76ms | tok/sec: 2825.29 | norm: 2.81\n", "step1296 | loss: 3.1425108909606934 | dt: 1450.71ms | tok/sec: 2823.45 | norm: 2.83\n", "step1297 | loss: 3.0590810775756836 | dt: 1450.91ms | tok/sec: 2823.06 | norm: 2.87\n", "step1298 | loss: 2.994497299194336 | dt: 1445.06ms | tok/sec: 2834.48 | norm: 2.51\n", "step1299 | loss: 3.7013208866119385 | dt: 1448.00ms | tok/sec: 2828.73 | norm: 3.42\n", "step1300 | loss: 
3.4792070388793945 | dt: 1448.75ms | tok/sec: 2827.26 | norm: 3.10\n", "step1301 | loss: 3.6675264835357666 | dt: 1438.16ms | tok/sec: 2848.08 | norm: 4.05\n", "step1302 | loss: 3.4592175483703613 | dt: 1444.21ms | tok/sec: 2836.16 | norm: 3.38\n", "step1303 | loss: 3.5519728660583496 | dt: 1446.05ms | tok/sec: 2832.55 | norm: 3.53\n", "step1304 | loss: 3.5501880645751953 | dt: 1454.71ms | tok/sec: 2815.68 | norm: 3.77\n", "step1305 | loss: 3.4159467220306396 | dt: 1448.24ms | tok/sec: 2828.26 | norm: 3.75\n", "step1306 | loss: 3.1894516944885254 | dt: 1438.73ms | tok/sec: 2846.95 | norm: 4.17\n", "step1307 | loss: 3.239626169204712 | dt: 1440.16ms | tok/sec: 2844.13 | norm: 3.58\n", "step1308 | loss: 3.233346700668335 | dt: 1449.94ms | tok/sec: 2824.95 | norm: 3.72\n", "step1309 | loss: 3.2640538215637207 | dt: 1451.07ms | tok/sec: 2822.74 | norm: 3.73\n", "step1310 | loss: 3.394880771636963 | dt: 1438.70ms | tok/sec: 2847.01 | norm: 4.22\n", "step1311 | loss: 3.476031541824341 | dt: 1450.52ms | tok/sec: 2823.82 | norm: 3.79\n", "step1312 | loss: 3.3754801750183105 | dt: 1448.78ms | tok/sec: 2827.21 | norm: 3.35\n", "step1313 | loss: 3.2928664684295654 | dt: 1447.28ms | tok/sec: 2830.13 | norm: 3.22\n", "step1314 | loss: 3.353868007659912 | dt: 1452.44ms | tok/sec: 2820.07 | norm: 3.17\n", "step1315 | loss: 3.289269208908081 | dt: 1443.16ms | tok/sec: 2838.21 | norm: 3.38\n", "step1316 | loss: 3.3579154014587402 | dt: 1452.46ms | tok/sec: 2820.05 | norm: 3.39\n", "step1317 | loss: 3.033219814300537 | dt: 1448.79ms | tok/sec: 2827.19 | norm: 3.06\n", "step1318 | loss: 3.051227569580078 | dt: 1451.46ms | tok/sec: 2821.99 | norm: 2.73\n", "step1319 | loss: 3.4487059116363525 | dt: 1451.00ms | tok/sec: 2822.88 | norm: 2.93\n", "step1320 | loss: 3.0967791080474854 | dt: 1442.21ms | tok/sec: 2840.08 | norm: 2.98\n", "step1321 | loss: 3.236849546432495 | dt: 1448.12ms | tok/sec: 2828.50 | norm: 3.03\n", "step1322 | loss: 3.0006775856018066 | dt: 1450.34ms | tok/sec: 
2824.17 | norm: 3.12\n", "step1323 | loss: 3.4390904903411865 | dt: 1456.11ms | tok/sec: 2812.97 | norm: 3.46\n", "step1324 | loss: 3.420511245727539 | dt: 1453.29ms | tok/sec: 2818.43 | norm: 3.33\n", "step1325 | loss: 3.4720711708068848 | dt: 1455.36ms | tok/sec: 2814.41 | norm: 3.17\n", "step1326 | loss: 3.441551923751831 | dt: 1448.38ms | tok/sec: 2827.98 | norm: 3.00\n", "step1327 | loss: 3.3398776054382324 | dt: 1454.49ms | tok/sec: 2816.10 | norm: 3.05\n", "step1328 | loss: 3.4163215160369873 | dt: 1444.94ms | tok/sec: 2834.71 | norm: 2.82\n", "step1329 | loss: 3.3892018795013428 | dt: 1440.81ms | tok/sec: 2842.85 | norm: 2.90\n", "step1330 | loss: 3.3777027130126953 | dt: 1436.01ms | tok/sec: 2852.34 | norm: 2.91\n", "step1331 | loss: 3.216796398162842 | dt: 1447.54ms | tok/sec: 2829.63 | norm: 2.88\n", "step1332 | loss: 3.0700907707214355 | dt: 1438.23ms | tok/sec: 2847.94 | norm: 2.93\n", "step1333 | loss: 3.3890280723571777 | dt: 1445.14ms | tok/sec: 2834.33 | norm: 3.39\n", "step1334 | loss: 3.489885091781616 | dt: 1446.56ms | tok/sec: 2831.54 | norm: 3.37\n", "step1335 | loss: 3.5483577251434326 | dt: 1447.44ms | tok/sec: 2829.83 | norm: 3.33\n", "step1336 | loss: 3.4016003608703613 | dt: 1437.72ms | tok/sec: 2848.96 | norm: 3.20\n", "step1337 | loss: 3.3353428840637207 | dt: 1440.37ms | tok/sec: 2843.72 | norm: 3.05\n", "step1338 | loss: 3.4540834426879883 | dt: 1449.16ms | tok/sec: 2826.47 | norm: 3.07\n", "step1339 | loss: 3.3364741802215576 | dt: 1443.76ms | tok/sec: 2837.03 | norm: 3.15\n", "step1340 | loss: 3.2113101482391357 | dt: 1441.45ms | tok/sec: 2841.58 | norm: 3.26\n", "step1341 | loss: 3.3027961254119873 | dt: 1453.09ms | tok/sec: 2818.82 | norm: 3.32\n", "step1342 | loss: 3.48158860206604 | dt: 1445.23ms | tok/sec: 2834.15 | norm: 3.30\n", "step1343 | loss: 3.345754623413086 | dt: 1450.94ms | tok/sec: 2823.00 | norm: 4.43\n", "step1344 | loss: 3.2979655265808105 | dt: 1454.63ms | tok/sec: 2815.83 | norm: 3.41\n", "step1345 | loss: 
3.4858744144439697 | dt: 1441.98ms | tok/sec: 2840.53 | norm: 2.95\n", "step1346 | loss: 3.3239309787750244 | dt: 1457.02ms | tok/sec: 2811.22 | norm: 2.69\n", "step1347 | loss: 3.3004684448242188 | dt: 1452.16ms | tok/sec: 2820.63 | norm: 3.08\n", "step1348 | loss: 3.0811617374420166 | dt: 1441.97ms | tok/sec: 2840.56 | norm: 2.99\n", "step1349 | loss: 3.2861530780792236 | dt: 1449.95ms | tok/sec: 2824.93 | norm: 3.14\n", "step1350 | loss: 3.1828765869140625 | dt: 1459.26ms | tok/sec: 2806.90 | norm: 3.05\n", "step1351 | loss: 3.2925360202789307 | dt: 1435.12ms | tok/sec: 2854.12 | norm: 3.01\n", "step1352 | loss: 3.1886651515960693 | dt: 1455.71ms | tok/sec: 2813.75 | norm: 3.11\n", "step1353 | loss: 3.705148696899414 | dt: 1440.57ms | tok/sec: 2843.32 | norm: 3.82\n", "step1354 | loss: 3.5693233013153076 | dt: 1449.57ms | tok/sec: 2825.67 | norm: 3.68\n", "step1355 | loss: 3.48260498046875 | dt: 1445.00ms | tok/sec: 2834.60 | norm: 3.49\n", "step1356 | loss: 3.6395423412323 | dt: 1447.77ms | tok/sec: 2829.18 | norm: 3.55\n", "step1357 | loss: 3.8765597343444824 | dt: 1446.35ms | tok/sec: 2831.96 | norm: 3.67\n", "step1358 | loss: 3.67264723777771 | dt: 1448.44ms | tok/sec: 2827.87 | norm: 3.85\n", "step1359 | loss: 3.396235942840576 | dt: 1455.28ms | tok/sec: 2814.57 | norm: 3.57\n", "step1360 | loss: 3.474064826965332 | dt: 1450.81ms | tok/sec: 2823.25 | norm: 3.36\n", "step1361 | loss: 3.631812572479248 | dt: 1450.38ms | tok/sec: 2824.09 | norm: 3.25\n", "step1362 | loss: 3.4646904468536377 | dt: 1455.17ms | tok/sec: 2814.80 | norm: 3.11\n", "step1363 | loss: 3.462815046310425 | dt: 1444.62ms | tok/sec: 2835.35 | norm: 3.53\n", "step1364 | loss: 3.3977603912353516 | dt: 1450.03ms | tok/sec: 2824.77 | norm: 3.87\n", "step1365 | loss: 3.2642016410827637 | dt: 1449.23ms | tok/sec: 2826.34 | norm: 3.52\n", "step1366 | loss: 3.301161289215088 | dt: 1457.59ms | tok/sec: 2810.13 | norm: 3.40\n", "step1367 | loss: 3.446805238723755 | dt: 1443.51ms | tok/sec: 2837.54 | 
norm: 3.64\n", "step1368 | loss: 3.272695541381836 | dt: 1444.08ms | tok/sec: 2836.41 | norm: 3.26\n", "step1369 | loss: 3.212900161743164 | dt: 1451.65ms | tok/sec: 2821.61 | norm: 3.05\n", "step1370 | loss: 3.1375224590301514 | dt: 1453.27ms | tok/sec: 2818.47 | norm: 3.06\n", "step1371 | loss: 3.055809497833252 | dt: 1447.52ms | tok/sec: 2829.67 | norm: 2.99\n", "step1372 | loss: 3.6437227725982666 | dt: 1453.68ms | tok/sec: 2817.67 | norm: 3.05\n", "step1373 | loss: 3.4312071800231934 | dt: 1448.57ms | tok/sec: 2827.61 | norm: 3.55\n", "step1374 | loss: 3.1661314964294434 | dt: 1448.00ms | tok/sec: 2828.72 | norm: 3.27\n", "step1375 | loss: 3.1297521591186523 | dt: 1441.71ms | tok/sec: 2841.06 | norm: 2.85\n", "step1376 | loss: 3.21325945854187 | dt: 1451.62ms | tok/sec: 2821.67 | norm: 2.69\n", "step1377 | loss: 3.2490458488464355 | dt: 1450.34ms | tok/sec: 2824.17 | norm: 2.88\n", "step1378 | loss: 3.0084710121154785 | dt: 1442.90ms | tok/sec: 2838.73 | norm: 3.04\n", "step1379 | loss: 2.9603359699249268 | dt: 1444.48ms | tok/sec: 2835.62 | norm: 3.37\n", "step1380 | loss: 2.881194829940796 | dt: 1445.97ms | tok/sec: 2832.69 | norm: 3.03\n", "step1381 | loss: 3.6012179851531982 | dt: 1447.06ms | tok/sec: 2830.57 | norm: 3.62\n", "step1382 | loss: 3.338846445083618 | dt: 1453.20ms | tok/sec: 2818.60 | norm: 3.03\n", "step1383 | loss: 3.567185401916504 | dt: 1440.61ms | tok/sec: 2843.25 | norm: 3.77\n", "step1384 | loss: 3.3325324058532715 | dt: 1446.27ms | tok/sec: 2832.12 | norm: 3.39\n", "step1385 | loss: 3.4233670234680176 | dt: 1459.25ms | tok/sec: 2806.93 | norm: 3.42\n", "step1386 | loss: 3.4188687801361084 | dt: 1454.94ms | tok/sec: 2815.24 | norm: 3.41\n", "step1387 | loss: 3.302102565765381 | dt: 1452.13ms | tok/sec: 2820.68 | norm: 3.44\n", "step1388 | loss: 3.089635133743286 | dt: 1451.16ms | tok/sec: 2822.57 | norm: 3.79\n", "step1389 | loss: 3.1269545555114746 | dt: 1452.99ms | tok/sec: 2819.02 | norm: 3.48\n", "step1390 | loss: 3.1151418685913086 
| dt: 1441.86ms | tok/sec: 2840.78 | norm: 3.43\n", "step1391 | loss: 3.1459782123565674 | dt: 1451.04ms | tok/sec: 2822.80 | norm: 3.40\n", "step1392 | loss: 3.307852268218994 | dt: 1448.19ms | tok/sec: 2828.37 | norm: 3.72\n", "step1393 | loss: 3.3728368282318115 | dt: 1453.32ms | tok/sec: 2818.38 | norm: 3.56\n", "step1394 | loss: 3.2901511192321777 | dt: 1457.14ms | tok/sec: 2810.99 | norm: 3.86\n", "step1395 | loss: 3.1881179809570312 | dt: 1451.84ms | tok/sec: 2821.25 | norm: 4.14\n", "step1396 | loss: 3.221130609512329 | dt: 1454.57ms | tok/sec: 2815.96 | norm: 3.54\n", "step1397 | loss: 3.15496563911438 | dt: 1453.86ms | tok/sec: 2817.32 | norm: 3.09\n", "step1398 | loss: 3.238677978515625 | dt: 1452.30ms | tok/sec: 2820.36 | norm: 3.33\n", "step1399 | loss: 2.9444212913513184 | dt: 1444.84ms | tok/sec: 2834.91 | norm: 3.68\n", "step1400 | loss: 2.944122076034546 | dt: 1466.48ms | tok/sec: 2793.09 | norm: 3.34\n", "step1401 | loss: 3.332388401031494 | dt: 1443.78ms | tok/sec: 2836.99 | norm: 3.24\n", "step1402 | loss: 2.969442129135132 | dt: 1450.46ms | tok/sec: 2823.93 | norm: 2.83\n", "step1403 | loss: 3.1019766330718994 | dt: 1443.09ms | tok/sec: 2838.35 | norm: 2.97\n", "step1404 | loss: 2.8956048488616943 | dt: 1455.35ms | tok/sec: 2814.44 | norm: 3.22\n", "step1405 | loss: 3.346465587615967 | dt: 1461.35ms | tok/sec: 2802.90 | norm: 3.57\n", "step1406 | loss: 3.334993839263916 | dt: 1439.88ms | tok/sec: 2844.68 | norm: 3.52\n", "step1407 | loss: 3.3523645401000977 | dt: 1457.03ms | tok/sec: 2811.20 | norm: 3.37\n", "step1408 | loss: 3.295518398284912 | dt: 1450.42ms | tok/sec: 2824.00 | norm: 3.13\n", "step1409 | loss: 3.2072503566741943 | dt: 1454.11ms | tok/sec: 2816.84 | norm: 3.55\n", "step1410 | loss: 3.293586254119873 | dt: 1454.45ms | tok/sec: 2816.19 | norm: 3.47\n", "step1411 | loss: 3.297262191772461 | dt: 1459.12ms | tok/sec: 2807.18 | norm: 3.76\n", "step1412 | loss: 3.2634878158569336 | dt: 1454.49ms | tok/sec: 2816.11 | norm: 3.42\n", 
"step1413 | loss: 3.0945589542388916 | dt: 1446.78ms | tok/sec: 2831.11 | norm: 3.19\n", "step1414 | loss: 2.925870418548584 | dt: 1458.75ms | tok/sec: 2807.88 | norm: 2.83\n", "step1415 | loss: 3.236896276473999 | dt: 1450.81ms | tok/sec: 2823.26 | norm: 3.22\n", "step1416 | loss: 3.3607006072998047 | dt: 1457.96ms | tok/sec: 2809.40 | norm: 3.36\n", "step1417 | loss: 3.4243221282958984 | dt: 1449.39ms | tok/sec: 2826.02 | norm: 3.57\n", "step1418 | loss: 3.2660019397735596 | dt: 1450.30ms | tok/sec: 2824.25 | norm: 3.59\n", "step1419 | loss: 3.2247416973114014 | dt: 1448.64ms | tok/sec: 2827.49 | norm: 3.64\n", "step1420 | loss: 3.32700252532959 | dt: 1442.46ms | tok/sec: 2839.60 | norm: 3.33\n", "step1421 | loss: 3.1858928203582764 | dt: 1450.99ms | tok/sec: 2822.90 | norm: 3.15\n", "step1422 | loss: 3.0677270889282227 | dt: 1453.96ms | tok/sec: 2817.13 | norm: 2.98\n", "step1423 | loss: 3.153961420059204 | dt: 1454.66ms | tok/sec: 2815.77 | norm: 3.09\n", "step1424 | loss: 3.332632541656494 | dt: 1449.66ms | tok/sec: 2825.49 | norm: 3.13\n", "step1425 | loss: 3.2608938217163086 | dt: 1454.32ms | tok/sec: 2816.43 | norm: 4.01\n", "step1426 | loss: 3.1836349964141846 | dt: 1455.00ms | tok/sec: 2815.12 | norm: 3.36\n", "step1427 | loss: 3.3460073471069336 | dt: 1457.40ms | tok/sec: 2810.49 | norm: 3.38\n", "step1428 | loss: 3.1738874912261963 | dt: 1456.17ms | tok/sec: 2812.85 | norm: 3.08\n", "step1429 | loss: 3.1605587005615234 | dt: 1449.39ms | tok/sec: 2826.01 | norm: 3.31\n", "step1430 | loss: 2.9421420097351074 | dt: 1458.42ms | tok/sec: 2808.51 | norm: 3.19\n", "step1431 | loss: 3.1371757984161377 | dt: 1452.66ms | tok/sec: 2819.66 | norm: 3.08\n", "step1432 | loss: 3.055518627166748 | dt: 1442.77ms | tok/sec: 2838.98 | norm: 3.16\n", "step1433 | loss: 3.179914712905884 | dt: 1457.98ms | tok/sec: 2809.37 | norm: 3.27\n", "step1434 | loss: 3.0481557846069336 | dt: 1447.63ms | tok/sec: 2829.45 | norm: 3.04\n", "step1435 | loss: 3.574449062347412 | dt: 
1453.74ms | tok/sec: 2817.56 | norm: 3.97\n", "step1436 | loss: 3.4466538429260254 | dt: 1459.08ms | tok/sec: 2807.24 | norm: 4.13\n", "step1437 | loss: 3.3615972995758057 | dt: 1446.69ms | tok/sec: 2831.30 | norm: 3.87\n", "step1438 | loss: 3.5132155418395996 | dt: 1457.24ms | tok/sec: 2810.79 | norm: 3.72\n", "step1439 | loss: 3.698291063308716 | dt: 1449.97ms | tok/sec: 2824.89 | norm: 3.46\n", "step1440 | loss: 3.5475003719329834 | dt: 1450.87ms | tok/sec: 2823.14 | norm: 3.70\n", "step1441 | loss: 3.265167236328125 | dt: 1462.39ms | tok/sec: 2800.90 | norm: 3.43\n", "step1442 | loss: 3.33648943901062 | dt: 1460.95ms | tok/sec: 2803.66 | norm: 3.71\n", "step1443 | loss: 3.477627754211426 | dt: 1448.58ms | tok/sec: 2827.60 | norm: 3.60\n", "step1444 | loss: 3.3208649158477783 | dt: 1446.50ms | tok/sec: 2831.67 | norm: 3.56\n", "step1445 | loss: 3.3156216144561768 | dt: 1455.02ms | tok/sec: 2815.08 | norm: 3.62\n", "step1446 | loss: 3.2452492713928223 | dt: 1452.04ms | tok/sec: 2820.86 | norm: 3.55\n", "step1447 | loss: 3.1207752227783203 | dt: 1446.25ms | tok/sec: 2832.15 | norm: 3.35\n", "step1448 | loss: 3.1542162895202637 | dt: 1444.96ms | tok/sec: 2834.67 | norm: 3.38\n", "step1449 | loss: 3.29718017578125 | dt: 1453.63ms | tok/sec: 2817.77 | norm: 3.75\n", "step1450 | loss: 3.137746810913086 | dt: 1452.97ms | tok/sec: 2819.06 | norm: 3.56\n", "step1451 | loss: 3.080477476119995 | dt: 1451.43ms | tok/sec: 2822.04 | norm: 3.44\n", "step1452 | loss: 3.007735252380371 | dt: 1449.09ms | tok/sec: 2826.60 | norm: 3.26\n", "step1453 | loss: 2.928799629211426 | dt: 1445.97ms | tok/sec: 2832.69 | norm: 3.22\n", "step1454 | loss: 3.475447177886963 | dt: 1458.22ms | tok/sec: 2808.91 | norm: 3.34\n", "step1455 | loss: 3.291193962097168 | dt: 1445.07ms | tok/sec: 2834.46 | norm: 3.94\n", "step1456 | loss: 3.04699969291687 | dt: 1445.78ms | tok/sec: 2833.06 | norm: 3.62\n", "step1457 | loss: 2.9993107318878174 | dt: 1445.85ms | tok/sec: 2832.94 | norm: 3.44\n", "step1458 
| loss: 3.0775961875915527 | dt: 1452.14ms | tok/sec: 2820.67 | norm: 3.21\n", "step1459 | loss: 3.0934581756591797 | dt: 1442.60ms | tok/sec: 2839.32 | norm: 3.14\n", "step1460 | loss: 2.8637452125549316 | dt: 1456.17ms | tok/sec: 2812.87 | norm: 3.23\n", "step1461 | loss: 2.838033676147461 | dt: 1458.63ms | tok/sec: 2808.11 | norm: 3.49\n", "step1462 | loss: 2.7602906227111816 | dt: 1456.85ms | tok/sec: 2811.54 | norm: 3.24\n", "step1463 | loss: 3.464437484741211 | dt: 1460.05ms | tok/sec: 2805.37 | norm: 3.66\n", "step1464 | loss: 3.1855738162994385 | dt: 1459.98ms | tok/sec: 2805.52 | norm: 3.22\n", "step1465 | loss: 3.4611711502075195 | dt: 1455.38ms | tok/sec: 2814.39 | norm: 3.94\n", "step1466 | loss: 3.226252317428589 | dt: 1450.00ms | tok/sec: 2824.84 | norm: 3.51\n", "step1467 | loss: 3.2982301712036133 | dt: 1454.91ms | tok/sec: 2815.29 | norm: 3.55\n", "step1468 | loss: 3.267383098602295 | dt: 1447.16ms | tok/sec: 2830.38 | norm: 3.33\n", "step1469 | loss: 3.1521108150482178 | dt: 1454.15ms | tok/sec: 2816.76 | norm: 3.33\n", "step1470 | loss: 2.9370810985565186 | dt: 1443.64ms | tok/sec: 2837.26 | norm: 3.51\n", "step1471 | loss: 2.977977991104126 | dt: 1447.50ms | tok/sec: 2829.70 | norm: 3.30\n", "step1472 | loss: 2.9567031860351562 | dt: 1459.91ms | tok/sec: 2805.65 | norm: 3.34\n", "step1473 | loss: 2.990262985229492 | dt: 1441.72ms | tok/sec: 2841.05 | norm: 3.19\n", "step1474 | loss: 3.1567790508270264 | dt: 1455.59ms | tok/sec: 2813.97 | norm: 3.15\n", "step1475 | loss: 3.2294111251831055 | dt: 1449.32ms | tok/sec: 2826.16 | norm: 3.25\n", "step1476 | loss: 3.1858596801757812 | dt: 1443.70ms | tok/sec: 2837.15 | norm: 3.87\n", "step1477 | loss: 3.0967955589294434 | dt: 1447.95ms | tok/sec: 2828.83 | norm: 4.07\n", "step1478 | loss: 3.1207308769226074 | dt: 1451.78ms | tok/sec: 2821.37 | norm: 3.80\n", "step1479 | loss: 3.0321826934814453 | dt: 1453.69ms | tok/sec: 2817.65 | norm: 3.70\n", "step1480 | loss: 3.124257802963257 | dt: 1450.03ms | 
tok/sec: 2824.77 | norm: 3.83\n", "step1481 | loss: 2.8461759090423584 | dt: 1455.46ms | tok/sec: 2814.22 | norm: 3.71\n", "step1482 | loss: 2.8524301052093506 | dt: 1448.27ms | tok/sec: 2828.20 | norm: 3.59\n", "step1483 | loss: 3.197767496109009 | dt: 1443.52ms | tok/sec: 2837.51 | norm: 3.45\n", "step1484 | loss: 2.841329336166382 | dt: 1457.02ms | tok/sec: 2811.22 | norm: 3.17\n", "step1485 | loss: 2.9548590183258057 | dt: 1447.69ms | tok/sec: 2829.34 | norm: 2.91\n", "step1486 | loss: 2.748911142349243 | dt: 1447.43ms | tok/sec: 2829.84 | norm: 3.00\n", "step1487 | loss: 3.2189574241638184 | dt: 1442.12ms | tok/sec: 2840.26 | norm: 3.75\n", "step1488 | loss: 3.216310977935791 | dt: 1455.14ms | tok/sec: 2814.86 | norm: 3.66\n", "step1489 | loss: 3.2251689434051514 | dt: 1451.91ms | tok/sec: 2821.11 | norm: 3.43\n", "step1490 | loss: 3.162231922149658 | dt: 1455.51ms | tok/sec: 2814.13 | norm: 3.09\n", "step1491 | loss: 3.0833537578582764 | dt: 1452.65ms | tok/sec: 2819.68 | norm: 3.28\n", "step1492 | loss: 3.1717917919158936 | dt: 1454.07ms | tok/sec: 2816.92 | norm: 3.60\n", "step1493 | loss: 3.1721549034118652 | dt: 1455.04ms | tok/sec: 2815.05 | norm: 3.97\n", "step1494 | loss: 3.150857925415039 | dt: 1447.06ms | tok/sec: 2830.57 | norm: 3.94\n", "step1495 | loss: 2.9829087257385254 | dt: 1448.93ms | tok/sec: 2826.91 | norm: 3.63\n", "step1496 | loss: 2.799520969390869 | dt: 1454.89ms | tok/sec: 2815.33 | norm: 3.17\n", "step1497 | loss: 3.1091325283050537 | dt: 1451.58ms | tok/sec: 2821.75 | norm: 3.40\n", "step1498 | loss: 3.207249879837036 | dt: 1453.63ms | tok/sec: 2817.77 | norm: 3.33\n", "step1499 | loss: 3.2424845695495605 | dt: 1455.57ms | tok/sec: 2814.02 | norm: 3.29\n", "step1500 | loss: 3.090125322341919 | dt: 1457.11ms | tok/sec: 2811.04 | norm: 3.18\n", "step1501 | loss: 3.0872702598571777 | dt: 1450.76ms | tok/sec: 2823.36 | norm: 3.45\n", "step1502 | loss: 3.1675877571105957 | dt: 1450.75ms | tok/sec: 2823.36 | norm: 3.37\n", "step1503 | 
loss: 3.037053346633911 | dt: 1437.65ms | tok/sec: 2849.09 | norm: 3.24\n", "step1504 | loss: 2.93229079246521 | dt: 1449.86ms | tok/sec: 2825.10 | norm: 3.38\n", "step1505 | loss: 2.9954583644866943 | dt: 1443.92ms | tok/sec: 2836.71 | norm: 3.29\n", "step1506 | loss: 3.1767003536224365 | dt: 1459.05ms | tok/sec: 2807.30 | norm: 3.33\n", "step1507 | loss: 3.0940604209899902 | dt: 1453.59ms | tok/sec: 2817.86 | norm: 3.24\n", "step1508 | loss: 3.0097720623016357 | dt: 1449.90ms | tok/sec: 2825.01 | norm: 3.06\n", "step1509 | loss: 3.1799163818359375 | dt: 1445.51ms | tok/sec: 2833.61 | norm: 3.14\n", "step1510 | loss: 3.0166330337524414 | dt: 1458.95ms | tok/sec: 2807.49 | norm: 2.96\n", "step1511 | loss: 3.003944158554077 | dt: 1448.80ms | tok/sec: 2827.17 | norm: 3.47\n", "step1512 | loss: 2.8003697395324707 | dt: 1452.37ms | tok/sec: 2820.21 | norm: 3.44\n", "step1513 | loss: 2.9985344409942627 | dt: 1452.96ms | tok/sec: 2819.07 | norm: 3.45\n", "step1514 | loss: 2.918199300765991 | dt: 1454.27ms | tok/sec: 2816.53 | norm: 3.10\n", "step1515 | loss: 3.037616014480591 | dt: 1453.78ms | tok/sec: 2817.49 | norm: 3.43\n", "step1516 | loss: 2.923269510269165 | dt: 1451.12ms | tok/sec: 2822.66 | norm: 3.65\n", "step1517 | loss: 3.4353129863739014 | dt: 1452.58ms | tok/sec: 2819.81 | norm: 4.08\n", "step1518 | loss: 3.318049192428589 | dt: 1454.49ms | tok/sec: 2816.11 | norm: 3.86\n", "step1519 | loss: 3.228544235229492 | dt: 1451.40ms | tok/sec: 2822.11 | norm: 3.66\n", "step1520 | loss: 3.371650457382202 | dt: 1448.90ms | tok/sec: 2826.98 | norm: 3.54\n", "step1521 | loss: 3.556941509246826 | dt: 1445.52ms | tok/sec: 2833.58 | norm: 4.19\n", "step1522 | loss: 3.4302926063537598 | dt: 1446.75ms | tok/sec: 2831.17 | norm: 4.36\n", "step1523 | loss: 3.1506259441375732 | dt: 1450.00ms | tok/sec: 2824.84 | norm: 4.13\n", "step1524 | loss: 3.2056918144226074 | dt: 1449.66ms | tok/sec: 2825.49 | norm: 3.86\n", "step1525 | loss: 3.340135097503662 | dt: 1445.52ms | tok/sec: 
2833.57 | norm: 3.86\n", "step1526 | loss: 3.181262254714966 | dt: 1444.25ms | tok/sec: 2836.08 | norm: 3.67\n", "step1527 | loss: 3.1662495136260986 | dt: 1448.93ms | tok/sec: 2826.91 | norm: 3.64\n", "step1528 | loss: 3.0878212451934814 | dt: 1451.52ms | tok/sec: 2821.87 | norm: 3.81\n", "step1529 | loss: 3.0023090839385986 | dt: 1448.96ms | tok/sec: 2826.86 | norm: 3.75\n", "step1530 | loss: 3.013842821121216 | dt: 1445.92ms | tok/sec: 2832.80 | norm: 3.62\n", "step1531 | loss: 3.1738483905792236 | dt: 1450.28ms | tok/sec: 2824.28 | norm: 3.81\n", "step1532 | loss: 3.008180856704712 | dt: 1452.26ms | tok/sec: 2820.44 | norm: 3.74\n", "step1533 | loss: 2.9571690559387207 | dt: 1444.52ms | tok/sec: 2835.55 | norm: 3.62\n", "step1534 | loss: 2.8522863388061523 | dt: 1443.14ms | tok/sec: 2838.26 | norm: 3.26\n", "step1535 | loss: 2.781322717666626 | dt: 1450.03ms | tok/sec: 2824.76 | norm: 3.21\n", "step1536 | loss: 3.3353729248046875 | dt: 1448.62ms | tok/sec: 2827.53 | norm: 3.50\n", "step1537 | loss: 3.1631832122802734 | dt: 1454.54ms | tok/sec: 2816.02 | norm: 3.91\n", "step1538 | loss: 2.9222095012664795 | dt: 1449.01ms | tok/sec: 2826.76 | norm: 3.69\n", "step1539 | loss: 2.867367744445801 | dt: 1461.61ms | tok/sec: 2802.39 | norm: 3.25\n", "step1540 | loss: 2.9368770122528076 | dt: 1445.99ms | tok/sec: 2832.65 | norm: 3.30\n", "step1541 | loss: 2.9675679206848145 | dt: 1457.39ms | tok/sec: 2810.51 | norm: 3.41\n", "step1542 | loss: 2.753309726715088 | dt: 1458.76ms | tok/sec: 2807.87 | norm: 3.72\n", "step1543 | loss: 2.726902723312378 | dt: 1449.95ms | tok/sec: 2824.92 | norm: 3.93\n", "step1544 | loss: 2.6464505195617676 | dt: 1448.47ms | tok/sec: 2827.81 | norm: 3.41\n", "step1545 | loss: 3.3207595348358154 | dt: 1447.05ms | tok/sec: 2830.58 | norm: 3.59\n", "step1546 | loss: 3.0410866737365723 | dt: 1450.71ms | tok/sec: 2823.44 | norm: 3.49\n", "step1547 | loss: 3.342519760131836 | dt: 1449.23ms | tok/sec: 2826.32 | norm: 4.58\n", "step1548 | loss: 
3.095961570739746 | dt: 1446.80ms | tok/sec: 2831.08 | norm: 3.79\n", "step1549 | loss: 3.1613688468933105 | dt: 1449.48ms | tok/sec: 2825.83 | norm: 3.64\n", "step1550 | loss: 3.139395236968994 | dt: 1454.98ms | tok/sec: 2815.15 | norm: 3.63\n", "step1551 | loss: 3.021144151687622 | dt: 1445.87ms | tok/sec: 2832.90 | norm: 3.68\n", "step1552 | loss: 2.8180606365203857 | dt: 1443.65ms | tok/sec: 2837.25 | norm: 3.56\n", "step1553 | loss: 2.8550448417663574 | dt: 1453.28ms | tok/sec: 2818.46 | norm: 3.46\n", "step1554 | loss: 2.826472759246826 | dt: 1448.05ms | tok/sec: 2828.62 | norm: 3.45\n", "step1555 | loss: 2.8449249267578125 | dt: 1451.40ms | tok/sec: 2822.10 | norm: 3.48\n", "step1556 | loss: 3.0156421661376953 | dt: 1455.28ms | tok/sec: 2814.58 | norm: 3.52\n", "step1557 | loss: 3.0806057453155518 | dt: 1448.92ms | tok/sec: 2826.93 | norm: 3.25\n", "step1558 | loss: 3.0359954833984375 | dt: 1454.69ms | tok/sec: 2815.72 | norm: 3.62\n", "step1559 | loss: 2.9591920375823975 | dt: 1450.65ms | tok/sec: 2823.56 | norm: 3.82\n", "step1560 | loss: 3.0095229148864746 | dt: 1448.19ms | tok/sec: 2828.37 | norm: 4.17\n", "step1561 | loss: 2.914074420928955 | dt: 1444.88ms | tok/sec: 2834.84 | norm: 3.71\n", "step1562 | loss: 3.017073154449463 | dt: 1460.10ms | tok/sec: 2805.28 | norm: 3.81\n", "step1563 | loss: 2.7269556522369385 | dt: 1452.70ms | tok/sec: 2819.58 | norm: 3.77\n", "step1564 | loss: 2.750120162963867 | dt: 1452.97ms | tok/sec: 2819.05 | norm: 3.98\n", "step1565 | loss: 3.083667278289795 | dt: 1453.97ms | tok/sec: 2817.12 | norm: 4.01\n", "step1566 | loss: 2.762744665145874 | dt: 1445.77ms | tok/sec: 2833.09 | norm: 4.02\n", "step1567 | loss: 2.8330187797546387 | dt: 1443.09ms | tok/sec: 2838.36 | norm: 3.49\n", "step1568 | loss: 2.628380298614502 | dt: 1457.30ms | tok/sec: 2810.67 | norm: 3.56\n", "step1569 | loss: 3.086890697479248 | dt: 1446.49ms | tok/sec: 2831.68 | norm: 3.58\n", "step1570 | loss: 3.069105625152588 | dt: 1455.46ms | tok/sec: 2814.22 
| norm: 3.47\n", "step1571 | loss: 3.1065304279327393 | dt: 1455.97ms | tok/sec: 2813.24 | norm: 3.83\n", "step1572 | loss: 3.055159568786621 | dt: 1449.54ms | tok/sec: 2825.72 | norm: 3.91\n", "step1573 | loss: 2.965069055557251 | dt: 1449.52ms | tok/sec: 2825.77 | norm: 4.03\n", "step1574 | loss: 3.064474582672119 | dt: 1448.62ms | tok/sec: 2827.52 | norm: 3.75\n", "step1575 | loss: 3.038600444793701 | dt: 1452.42ms | tok/sec: 2820.13 | norm: 3.32\n", "step1576 | loss: 3.016038417816162 | dt: 1444.25ms | tok/sec: 2836.06 | norm: 3.55\n", "step1577 | loss: 2.861804723739624 | dt: 1451.05ms | tok/sec: 2822.78 | norm: 3.74\n", "step1578 | loss: 2.6975724697113037 | dt: 1455.02ms | tok/sec: 2815.09 | norm: 4.10\n", "step1579 | loss: 3.0389902591705322 | dt: 1448.72ms | tok/sec: 2827.32 | norm: 4.35\n", "step1580 | loss: 3.1457366943359375 | dt: 1447.13ms | tok/sec: 2830.44 | norm: 4.23\n", "step1581 | loss: 3.1445093154907227 | dt: 1449.90ms | tok/sec: 2825.01 | norm: 3.71\n", "step1582 | loss: 2.988229274749756 | dt: 1453.29ms | tok/sec: 2818.43 | norm: 3.47\n", "step1583 | loss: 2.9454996585845947 | dt: 1451.81ms | tok/sec: 2821.31 | norm: 3.48\n", "step1584 | loss: 3.0321390628814697 | dt: 1457.67ms | tok/sec: 2809.96 | norm: 3.80\n", "step1585 | loss: 2.8888843059539795 | dt: 1444.36ms | tok/sec: 2835.86 | norm: 3.62\n", "step1586 | loss: 2.831448793411255 | dt: 1450.76ms | tok/sec: 2823.36 | norm: 3.81\n", "step1587 | loss: 2.890998601913452 | dt: 1450.67ms | tok/sec: 2823.53 | norm: 3.81\n", "step1588 | loss: 3.049755334854126 | dt: 1451.40ms | tok/sec: 2822.09 | norm: 3.79\n", "step1589 | loss: 2.9841339588165283 | dt: 1441.36ms | tok/sec: 2841.75 | norm: 3.81\n", "step1590 | loss: 2.889411687850952 | dt: 1463.52ms | tok/sec: 2798.73 | norm: 3.58\n", "step1591 | loss: 3.0723209381103516 | dt: 1446.71ms | tok/sec: 2831.26 | norm: 3.73\n", "step1592 | loss: 2.8840034008026123 | dt: 1449.57ms | tok/sec: 2825.67 | norm: 3.26\n", "step1593 | loss: 
2.8941144943237305 | dt: 1459.80ms | tok/sec: 2805.86 | norm: 3.92\n", "step1594 | loss: 2.6907150745391846 | dt: 1451.89ms | tok/sec: 2821.15 | norm: 3.70\n", "step1595 | loss: 2.8500664234161377 | dt: 1445.72ms | tok/sec: 2833.19 | norm: 3.36\n", "step1596 | loss: 2.7798409461975098 | dt: 1460.42ms | tok/sec: 2804.68 | norm: 3.43\n", "step1597 | loss: 2.9056713581085205 | dt: 1448.44ms | tok/sec: 2827.86 | norm: 3.39\n", "step1598 | loss: 2.7989885807037354 | dt: 1454.13ms | tok/sec: 2816.80 | norm: 3.30\n", "step1599 | loss: 3.3047683238983154 | dt: 1452.03ms | tok/sec: 2820.87 | norm: 3.91\n", "step1600 | loss: 3.178370952606201 | dt: 1451.86ms | tok/sec: 2821.21 | norm: 3.91\n", "step1601 | loss: 3.0861799716949463 | dt: 1450.70ms | tok/sec: 2823.47 | norm: 3.87\n", "step1602 | loss: 3.224592685699463 | dt: 1439.89ms | tok/sec: 2844.65 | norm: 3.65\n", "step1603 | loss: 3.41261887550354 | dt: 1466.50ms | tok/sec: 2793.04 | norm: 3.85\n", "step1604 | loss: 3.2635498046875 | dt: 1447.74ms | tok/sec: 2829.24 | norm: 3.51\n", "step1605 | loss: 3.002974033355713 | dt: 1443.38ms | tok/sec: 2837.78 | norm: 3.66\n", "step1606 | loss: 3.081861972808838 | dt: 1443.75ms | tok/sec: 2837.05 | norm: 3.80\n", "step1607 | loss: 3.224790334701538 | dt: 1443.55ms | tok/sec: 2837.44 | norm: 4.04\n", "step1608 | loss: 3.071070671081543 | dt: 1451.70ms | tok/sec: 2821.52 | norm: 4.29\n", "step1609 | loss: 3.067883014678955 | dt: 1439.21ms | tok/sec: 2846.01 | norm: 4.61\n", "step1610 | loss: 2.989292621612549 | dt: 1458.71ms | tok/sec: 2807.96 | norm: 4.31\n", "step1611 | loss: 2.908651351928711 | dt: 1448.04ms | tok/sec: 2828.65 | norm: 3.82\n", "step1612 | loss: 2.918238878250122 | dt: 1456.00ms | tok/sec: 2813.18 | norm: 3.93\n", "step1613 | loss: 3.073016881942749 | dt: 1451.21ms | tok/sec: 2822.47 | norm: 4.64\n", "step1614 | loss: 2.9037606716156006 | dt: 1451.69ms | tok/sec: 2821.54 | norm: 4.25\n", "step1615 | loss: 2.8491508960723877 | dt: 1441.97ms | tok/sec: 2840.55 | 
norm: 3.88\n", "step1616 | loss: 2.7400224208831787 | dt: 1444.49ms | tok/sec: 2835.61 | norm: 3.86\n", "step1617 | loss: 2.6795568466186523 | dt: 1450.82ms | tok/sec: 2823.22 | norm: 3.98\n", "step1618 | loss: 3.231274366378784 | dt: 1443.86ms | tok/sec: 2836.85 | norm: 4.12\n", "step1619 | loss: 3.0812087059020996 | dt: 1451.23ms | tok/sec: 2822.43 | norm: 4.80\n", "step1620 | loss: 2.8466358184814453 | dt: 1456.10ms | tok/sec: 2813.00 | norm: 4.31\n", "step1621 | loss: 2.755948066711426 | dt: 1442.74ms | tok/sec: 2839.04 | norm: 3.61\n", "step1622 | loss: 2.8082189559936523 | dt: 1442.52ms | tok/sec: 2839.48 | norm: 3.30\n", "step1623 | loss: 2.844548225402832 | dt: 1445.89ms | tok/sec: 2832.86 | norm: 3.58\n", "step1624 | loss: 2.6513171195983887 | dt: 1445.35ms | tok/sec: 2833.92 | norm: 3.80\n", "step1625 | loss: 2.604067325592041 | dt: 1447.59ms | tok/sec: 2829.54 | norm: 3.77\n", "step1626 | loss: 2.533107042312622 | dt: 1455.96ms | tok/sec: 2813.26 | norm: 3.38\n", "step1627 | loss: 3.1819093227386475 | dt: 1443.99ms | tok/sec: 2836.59 | norm: 4.07\n", "step1628 | loss: 2.8926427364349365 | dt: 1446.81ms | tok/sec: 2831.06 | norm: 3.51\n", "step1629 | loss: 3.250544786453247 | dt: 1451.01ms | tok/sec: 2822.85 | norm: 4.96\n", "step1630 | loss: 3.002277135848999 | dt: 1447.06ms | tok/sec: 2830.57 | norm: 4.35\n", "step1631 | loss: 3.104188919067383 | dt: 1443.83ms | tok/sec: 2836.90 | norm: 4.33\n", "step1632 | loss: 3.067514419555664 | dt: 1443.22ms | tok/sec: 2838.10 | norm: 4.50\n", "step1633 | loss: 2.914889097213745 | dt: 1442.01ms | tok/sec: 2840.48 | norm: 3.70\n", "step1634 | loss: 2.7098746299743652 | dt: 1449.18ms | tok/sec: 2826.42 | norm: 3.88\n", "step1635 | loss: 2.771829128265381 | dt: 1455.37ms | tok/sec: 2814.40 | norm: 4.24\n", "step1636 | loss: 2.755082130432129 | dt: 1440.15ms | tok/sec: 2844.14 | norm: 4.56\n", "step1637 | loss: 2.7878520488739014 | dt: 1445.33ms | tok/sec: 2833.95 | norm: 4.54\n", "step1638 | loss: 2.924039602279663 | 
dt: 1447.76ms | tok/sec: 2829.20 | norm: 4.16\n", "step1639 | loss: 2.97713565826416 | dt: 1453.04ms | tok/sec: 2818.92 | norm: 3.73\n", "step1640 | loss: 2.933854579925537 | dt: 1441.08ms | tok/sec: 2842.31 | norm: 3.81\n", "step1641 | loss: 2.853708505630493 | dt: 1446.31ms | tok/sec: 2832.04 | norm: 3.80\n", "step1642 | loss: 2.9205524921417236 | dt: 1452.79ms | tok/sec: 2819.41 | norm: 4.03\n", "step1643 | loss: 2.8249268531799316 | dt: 1444.26ms | tok/sec: 2836.05 | norm: 4.12\n", "step1644 | loss: 2.9176788330078125 | dt: 1446.96ms | tok/sec: 2830.77 | norm: 4.36\n", "step1645 | loss: 2.640511989593506 | dt: 1449.95ms | tok/sec: 2824.92 | norm: 4.41\n", "step1646 | loss: 2.667797565460205 | dt: 1450.18ms | tok/sec: 2824.48 | norm: 4.16\n", "step1647 | loss: 2.982063055038452 | dt: 1440.50ms | tok/sec: 2843.45 | norm: 3.67\n", "step1648 | loss: 2.6907074451446533 | dt: 1444.12ms | tok/sec: 2836.34 | norm: 4.14\n", "step1649 | loss: 2.7746541500091553 | dt: 1451.88ms | tok/sec: 2821.18 | norm: 4.02\n", "step1650 | loss: 2.550480842590332 | dt: 1449.98ms | tok/sec: 2824.87 | norm: 4.07\n", "step1651 | loss: 3.0002689361572266 | dt: 1443.05ms | tok/sec: 2838.43 | norm: 4.55\n", "step1652 | loss: 2.971526861190796 | dt: 1452.86ms | tok/sec: 2819.27 | norm: 4.04\n", "step1653 | loss: 3.010608434677124 | dt: 1445.27ms | tok/sec: 2834.07 | norm: 3.78\n", "step1654 | loss: 2.9479622840881348 | dt: 1452.34ms | tok/sec: 2820.28 | norm: 3.48\n", "step1655 | loss: 2.86098575592041 | dt: 1443.02ms | tok/sec: 2838.49 | norm: 3.87\n", "step1656 | loss: 2.957564115524292 | dt: 1450.18ms | tok/sec: 2824.48 | norm: 3.94\n", "step1657 | loss: 2.939814329147339 | dt: 1451.88ms | tok/sec: 2821.17 | norm: 3.99\n", "step1658 | loss: 2.9059064388275146 | dt: 1439.04ms | tok/sec: 2846.35 | norm: 3.55\n", "step1659 | loss: 2.7521321773529053 | dt: 1443.67ms | tok/sec: 2837.22 | norm: 3.39\n", "step1660 | loss: 2.5851330757141113 | dt: 1449.09ms | tok/sec: 2826.60 | norm: 3.36\n", 
"step1661 | loss: 2.9195480346679688 | dt: 1441.61ms | tok/sec: 2841.26 | norm: 4.05\n", "step1662 | loss: 3.0494887828826904 | dt: 1452.79ms | tok/sec: 2819.40 | norm: 4.42\n", "step1663 | loss: 3.065645456314087 | dt: 1441.41ms | tok/sec: 2841.66 | norm: 4.38\n", "step1664 | loss: 2.9163975715637207 | dt: 1438.08ms | tok/sec: 2848.25 | norm: 4.40\n", "step1665 | loss: 2.851146936416626 | dt: 1435.92ms | tok/sec: 2852.52 | norm: 3.88\n", "step1666 | loss: 2.9367480278015137 | dt: 1444.19ms | tok/sec: 2836.18 | norm: 3.80\n", "step1667 | loss: 2.788362979888916 | dt: 1450.92ms | tok/sec: 2823.03 | norm: 3.95\n", "step1668 | loss: 2.7132885456085205 | dt: 1436.71ms | tok/sec: 2850.96 | norm: 3.92\n", "step1669 | loss: 2.7995059490203857 | dt: 1439.48ms | tok/sec: 2845.46 | norm: 4.08\n", "step1670 | loss: 2.9720067977905273 | dt: 1449.40ms | tok/sec: 2826.00 | norm: 4.06\n", "step1671 | loss: 2.9405033588409424 | dt: 1438.98ms | tok/sec: 2846.47 | norm: 4.61\n", "step1672 | loss: 2.8381075859069824 | dt: 1448.94ms | tok/sec: 2826.90 | norm: 4.24\n", "step1673 | loss: 3.0064291954040527 | dt: 1445.80ms | tok/sec: 2833.03 | norm: 4.06\n", "step1674 | loss: 2.786062479019165 | dt: 1453.47ms | tok/sec: 2818.09 | norm: 3.55\n", "step1675 | loss: 2.7887563705444336 | dt: 1455.08ms | tok/sec: 2814.97 | norm: 3.70\n", "step1676 | loss: 2.6024742126464844 | dt: 1438.51ms | tok/sec: 2847.38 | norm: 3.92\n", "step1677 | loss: 2.7612204551696777 | dt: 1444.74ms | tok/sec: 2835.12 | norm: 3.81\n", "step1678 | loss: 2.704562187194824 | dt: 1452.61ms | tok/sec: 2819.75 | norm: 3.93\n", "step1679 | loss: 2.8121650218963623 | dt: 1444.33ms | tok/sec: 2835.92 | norm: 3.88\n", "step1680 | loss: 2.7182538509368896 | dt: 1457.50ms | tok/sec: 2810.29 | norm: 3.79\n", "step1681 | loss: 3.1985185146331787 | dt: 1445.67ms | tok/sec: 2833.28 | norm: 4.14\n", "step1682 | loss: 3.0693271160125732 | dt: 1442.06ms | tok/sec: 2840.39 | norm: 4.00\n", "step1683 | loss: 2.9848690032958984 | dt: 
1435.44ms | tok/sec: 2853.47 | norm: 3.83\n", "step1684 | loss: 3.091822385787964 | dt: 1437.62ms | tok/sec: 2849.15 | norm: 3.76\n", "step1685 | loss: 3.2846012115478516 | dt: 1450.94ms | tok/sec: 2823.00 | norm: 4.22\n", "step1686 | loss: 3.130993127822876 | dt: 1442.00ms | tok/sec: 2840.50 | norm: 4.10\n", "step1687 | loss: 2.872735023498535 | dt: 1443.54ms | tok/sec: 2837.47 | norm: 3.88\n", "step1688 | loss: 2.935316324234009 | dt: 1445.12ms | tok/sec: 2834.36 | norm: 3.98\n", "step1689 | loss: 3.0873284339904785 | dt: 1436.11ms | tok/sec: 2852.15 | norm: 4.02\n", "step1690 | loss: 2.955260753631592 | dt: 1443.00ms | tok/sec: 2838.54 | norm: 4.11\n", "step1691 | loss: 2.9798943996429443 | dt: 1440.21ms | tok/sec: 2844.03 | norm: 4.31\n", "step1692 | loss: 2.885573148727417 | dt: 1447.69ms | tok/sec: 2829.34 | norm: 4.41\n", "step1693 | loss: 2.8024070262908936 | dt: 1446.17ms | tok/sec: 2832.31 | norm: 3.93\n", "step1694 | loss: 2.7968761920928955 | dt: 1449.03ms | tok/sec: 2826.72 | norm: 3.87\n", "step1695 | loss: 2.9627037048339844 | dt: 1441.64ms | tok/sec: 2841.22 | norm: 4.31\n", "step1696 | loss: 2.8171701431274414 | dt: 1445.18ms | tok/sec: 2834.24 | norm: 4.23\n", "step1697 | loss: 2.7703022956848145 | dt: 1448.39ms | tok/sec: 2827.96 | norm: 4.32\n", "step1698 | loss: 2.6640126705169678 | dt: 1449.95ms | tok/sec: 2824.93 | norm: 3.96\n", "step1699 | loss: 2.6036624908447266 | dt: 1449.80ms | tok/sec: 2825.21 | norm: 4.01\n", "step1700 | loss: 3.133716583251953 | dt: 1437.53ms | tok/sec: 2849.34 | norm: 4.21\n", "step1701 | loss: 2.992976188659668 | dt: 1453.00ms | tok/sec: 2818.99 | norm: 5.03\n", "step1702 | loss: 2.753469944000244 | dt: 1440.88ms | tok/sec: 2842.71 | norm: 4.60\n", "step1703 | loss: 2.6550943851470947 | dt: 1438.41ms | tok/sec: 2847.59 | norm: 4.28\n", "step1704 | loss: 2.6888883113861084 | dt: 1450.90ms | tok/sec: 2823.07 | norm: 3.78\n", "step1705 | loss: 2.723747491836548 | dt: 1444.38ms | tok/sec: 2835.82 | norm: 3.57\n", 
"step1706 | loss: 2.5343480110168457 | dt: 1438.03ms | tok/sec: 2848.34 | norm: 3.63\n", "step1707 | loss: 2.4947123527526855 | dt: 1447.72ms | tok/sec: 2829.27 | norm: 3.62\n", "step1708 | loss: 2.435509204864502 | dt: 1440.26ms | tok/sec: 2843.93 | norm: 3.35\n", "step1709 | loss: 3.075312852859497 | dt: 1449.58ms | tok/sec: 2825.64 | norm: 3.98\n", "step1710 | loss: 2.7632365226745605 | dt: 1450.61ms | tok/sec: 2823.64 | norm: 3.58\n", "step1711 | loss: 3.112643241882324 | dt: 1436.44ms | tok/sec: 2851.49 | norm: 4.55\n", "step1712 | loss: 2.874877452850342 | dt: 1456.23ms | tok/sec: 2812.75 | norm: 3.98\n", "step1713 | loss: 2.9697656631469727 | dt: 1448.32ms | tok/sec: 2828.11 | norm: 4.30\n", "step1714 | loss: 2.9720447063446045 | dt: 1451.67ms | tok/sec: 2821.58 | norm: 4.88\n", "step1715 | loss: 2.778057336807251 | dt: 1450.20ms | tok/sec: 2824.44 | norm: 3.94\n", "step1716 | loss: 2.551442861557007 | dt: 1442.42ms | tok/sec: 2839.68 | norm: 3.49\n", "step1717 | loss: 2.6440088748931885 | dt: 1452.09ms | tok/sec: 2820.77 | norm: 3.57\n", "step1718 | loss: 2.641187906265259 | dt: 1447.03ms | tok/sec: 2830.63 | norm: 3.68\n", "step1719 | loss: 2.648200273513794 | dt: 1450.50ms | tok/sec: 2823.85 | norm: 3.79\n", "step1720 | loss: 2.806253433227539 | dt: 1445.55ms | tok/sec: 2833.51 | norm: 4.19\n", "step1721 | loss: 2.8534178733825684 | dt: 1442.96ms | tok/sec: 2838.61 | norm: 3.94\n", "step1722 | loss: 2.808581590652466 | dt: 1444.57ms | tok/sec: 2835.44 | norm: 4.06\n", "step1723 | loss: 2.710932493209839 | dt: 1445.40ms | tok/sec: 2833.82 | norm: 4.09\n", "step1724 | loss: 2.771305561065674 | dt: 1457.86ms | tok/sec: 2809.59 | norm: 3.96\n", "step1725 | loss: 2.6944916248321533 | dt: 1456.04ms | tok/sec: 2813.12 | norm: 3.88\n", "step1726 | loss: 2.7687501907348633 | dt: 1446.01ms | tok/sec: 2832.63 | norm: 3.67\n", "step1727 | loss: 2.506814479827881 | dt: 1449.21ms | tok/sec: 2826.38 | norm: 3.84\n", "step1728 | loss: 2.5218594074249268 | dt: 1451.37ms | 
tok/sec: 2822.16 | norm: 3.82\n", "step1729 | loss: 2.842710494995117 | dt: 1439.04ms | tok/sec: 2846.34 | norm: 3.94\n", "step1730 | loss: 2.5718202590942383 | dt: 1443.75ms | tok/sec: 2837.06 | norm: 4.13\n", "step1731 | loss: 2.6405019760131836 | dt: 1445.00ms | tok/sec: 2834.60 | norm: 3.67\n", "step1732 | loss: 2.4272618293762207 | dt: 1452.61ms | tok/sec: 2819.75 | norm: 3.51\n", "step1733 | loss: 2.863238573074341 | dt: 1434.68ms | tok/sec: 2855.00 | norm: 3.74\n", "step1734 | loss: 2.8413100242614746 | dt: 1448.25ms | tok/sec: 2828.24 | norm: 3.97\n", "step1735 | loss: 2.8633430004119873 | dt: 1454.61ms | tok/sec: 2815.88 | norm: 3.95\n", "step1736 | loss: 2.8005919456481934 | dt: 1445.79ms | tok/sec: 2833.06 | norm: 3.64\n", "step1737 | loss: 2.7357778549194336 | dt: 1444.24ms | tok/sec: 2836.09 | norm: 3.99\n", "step1738 | loss: 2.820648193359375 | dt: 1452.14ms | tok/sec: 2820.66 | norm: 3.81\n", "step1739 | loss: 2.811903715133667 | dt: 1437.41ms | tok/sec: 2849.57 | norm: 4.13\n", "step1740 | loss: 2.7692582607269287 | dt: 1448.93ms | tok/sec: 2826.91 | norm: 3.85\n", "step1741 | loss: 2.6028029918670654 | dt: 1440.89ms | tok/sec: 2842.70 | norm: 3.50\n", "step1742 | loss: 2.4220762252807617 | dt: 1446.24ms | tok/sec: 2832.17 | norm: 3.24\n", "step1743 | loss: 2.7637054920196533 | dt: 1439.58ms | tok/sec: 2845.27 | norm: 3.79\n", "step1744 | loss: 2.8866477012634277 | dt: 1447.62ms | tok/sec: 2829.47 | norm: 3.85\n", "step1745 | loss: 2.9184775352478027 | dt: 1437.47ms | tok/sec: 2849.44 | norm: 4.13\n", "step1746 | loss: 2.7779550552368164 | dt: 1448.21ms | tok/sec: 2828.32 | norm: 4.33\n", "step1747 | loss: 2.7152504920959473 | dt: 1442.45ms | tok/sec: 2839.61 | norm: 4.07\n", "step1748 | loss: 2.7959752082824707 | dt: 1446.70ms | tok/sec: 2831.27 | norm: 3.95\n", "step1749 | loss: 2.6694371700286865 | dt: 1448.42ms | tok/sec: 2827.90 | norm: 3.83\n", "step1750 | loss: 2.5802600383758545 | dt: 1438.72ms | tok/sec: 2846.98 | norm: 4.01\n", "step1751 | 
loss: 2.6518845558166504 | dt: 1452.19ms | tok/sec: 2820.57 | norm: 4.12\n", "step1752 | loss: 2.79783296585083 | dt: 1456.81ms | tok/sec: 2811.63 | norm: 3.87\n", "step1753 | loss: 2.8316092491149902 | dt: 1452.26ms | tok/sec: 2820.42 | norm: 4.17\n", "step1754 | loss: 2.7282941341400146 | dt: 1440.89ms | tok/sec: 2842.68 | norm: 4.20\n", "step1755 | loss: 2.900294065475464 | dt: 1447.17ms | tok/sec: 2830.34 | norm: 4.35\n", "step1756 | loss: 2.6810078620910645 | dt: 1440.69ms | tok/sec: 2843.07 | norm: 3.69\n", "step1757 | loss: 2.645833730697632 | dt: 1442.62ms | tok/sec: 2839.28 | norm: 3.59\n", "step1758 | loss: 2.474733591079712 | dt: 1448.38ms | tok/sec: 2827.99 | norm: 3.51\n", "step1759 | loss: 2.627412796020508 | dt: 1444.39ms | tok/sec: 2835.79 | norm: 3.53\n", "step1760 | loss: 2.568208694458008 | dt: 1440.96ms | tok/sec: 2842.55 | norm: 3.57\n", "step1761 | loss: 2.676464080810547 | dt: 1448.93ms | tok/sec: 2826.91 | norm: 3.79\n", "step1762 | loss: 2.5773465633392334 | dt: 1442.00ms | tok/sec: 2840.49 | norm: 4.03\n", "step1763 | loss: 3.0411734580993652 | dt: 1442.26ms | tok/sec: 2840.00 | norm: 4.12\n", "step1764 | loss: 2.9132683277130127 | dt: 1437.54ms | tok/sec: 2849.30 | norm: 4.10\n", "step1765 | loss: 2.8311927318573 | dt: 1450.64ms | tok/sec: 2823.59 | norm: 3.62\n", "step1766 | loss: 2.962029218673706 | dt: 1443.99ms | tok/sec: 2836.58 | norm: 3.73\n", "step1767 | loss: 3.1429920196533203 | dt: 1448.16ms | tok/sec: 2828.41 | norm: 4.20\n", "step1768 | loss: 2.9818577766418457 | dt: 1458.03ms | tok/sec: 2809.26 | norm: 3.88\n", "step1769 | loss: 2.720933437347412 | dt: 1446.05ms | tok/sec: 2832.55 | norm: 3.86\n", "step1770 | loss: 2.784911632537842 | dt: 1442.56ms | tok/sec: 2839.39 | norm: 3.83\n", "step1771 | loss: 2.9357833862304688 | dt: 1445.01ms | tok/sec: 2834.57 | norm: 3.94\n", "step1772 | loss: 2.799447536468506 | dt: 1442.35ms | tok/sec: 2839.82 | norm: 4.11\n", "step1773 | loss: 2.834157705307007 | dt: 1454.41ms | tok/sec: 
2816.25 | norm: 4.34\n", "step1774 | loss: 2.755706787109375 | dt: 1442.11ms | tok/sec: 2840.29 | norm: 4.99\n", "step1775 | loss: 2.6792802810668945 | dt: 1446.75ms | tok/sec: 2831.17 | norm: 4.32\n", "step1776 | loss: 2.675802707672119 | dt: 1450.26ms | tok/sec: 2824.32 | norm: 4.02\n", "step1777 | loss: 2.8362441062927246 | dt: 1441.45ms | tok/sec: 2841.58 | norm: 4.39\n", "step1778 | loss: 2.668484926223755 | dt: 1447.54ms | tok/sec: 2829.63 | norm: 3.92\n", "step1779 | loss: 2.643240451812744 | dt: 1453.91ms | tok/sec: 2817.23 | norm: 4.01\n", "step1780 | loss: 2.548037528991699 | dt: 1438.39ms | tok/sec: 2847.63 | norm: 4.47\n", "step1781 | loss: 2.508511543273926 | dt: 1448.33ms | tok/sec: 2828.09 | norm: 4.57\n", "step1782 | loss: 3.061093330383301 | dt: 1450.29ms | tok/sec: 2824.27 | norm: 5.09\n", "step1783 | loss: 2.960718870162964 | dt: 1440.08ms | tok/sec: 2844.29 | norm: 5.35\n", "step1784 | loss: 2.6384239196777344 | dt: 1452.28ms | tok/sec: 2820.39 | norm: 4.31\n", "step1785 | loss: 2.528904914855957 | dt: 1445.86ms | tok/sec: 2832.91 | norm: 3.73\n", "step1786 | loss: 2.5772252082824707 | dt: 1443.70ms | tok/sec: 2837.16 | norm: 3.85\n", "step1787 | loss: 2.65212082862854 | dt: 1450.86ms | tok/sec: 2823.15 | norm: 4.82\n", "step1788 | loss: 2.483833074569702 | dt: 1444.96ms | tok/sec: 2834.68 | norm: 4.78\n", "step1789 | loss: 2.4367830753326416 | dt: 1448.55ms | tok/sec: 2827.65 | norm: 4.70\n", "step1790 | loss: 2.328629970550537 | dt: 1447.24ms | tok/sec: 2830.21 | norm: 3.97\n", "step1791 | loss: 2.9378833770751953 | dt: 1440.26ms | tok/sec: 2843.93 | norm: 3.86\n", "step1792 | loss: 2.6480836868286133 | dt: 1454.11ms | tok/sec: 2816.85 | norm: 3.64\n", "step1793 | loss: 3.022667169570923 | dt: 1452.92ms | tok/sec: 2819.15 | norm: 4.47\n", "step1794 | loss: 2.7742996215820312 | dt: 1443.40ms | tok/sec: 2837.74 | norm: 4.15\n", "step1795 | loss: 2.849818468093872 | dt: 1445.58ms | tok/sec: 2833.47 | norm: 4.20\n", "step1796 | loss: 
2.8502566814422607 | dt: 1446.10ms | tok/sec: 2832.44 | norm: 4.24\n", "step1797 | loss: 2.6623120307922363 | dt: 1449.51ms | tok/sec: 2825.79 | norm: 3.87\n", "step1798 | loss: 2.4557888507843018 | dt: 1451.54ms | tok/sec: 2821.82 | norm: 4.01\n", "step1799 | loss: 2.513927698135376 | dt: 1446.08ms | tok/sec: 2832.48 | norm: 3.73\n", "step1800 | loss: 2.4956963062286377 | dt: 1453.06ms | tok/sec: 2818.88 | norm: 3.58\n", "step1801 | loss: 2.496955156326294 | dt: 1450.93ms | tok/sec: 2823.02 | norm: 3.55\n", "step1802 | loss: 2.666533946990967 | dt: 1440.89ms | tok/sec: 2842.69 | norm: 3.80\n", "step1803 | loss: 2.7271666526794434 | dt: 1441.41ms | tok/sec: 2841.65 | norm: 3.69\n", "step1804 | loss: 2.665534496307373 | dt: 1450.42ms | tok/sec: 2824.00 | norm: 3.94\n", "step1805 | loss: 2.579195022583008 | dt: 1454.32ms | tok/sec: 2816.43 | norm: 3.97\n", "step1806 | loss: 2.6253697872161865 | dt: 1455.48ms | tok/sec: 2814.19 | norm: 3.81\n", "step1807 | loss: 2.549828052520752 | dt: 1451.43ms | tok/sec: 2822.05 | norm: 4.00\n", "step1808 | loss: 2.6257574558258057 | dt: 1449.00ms | tok/sec: 2826.78 | norm: 3.74\n", "step1809 | loss: 2.382718801498413 | dt: 1441.23ms | tok/sec: 2842.02 | norm: 3.75\n", "step1810 | loss: 2.3899688720703125 | dt: 1447.42ms | tok/sec: 2829.87 | norm: 3.58\n", "step1811 | loss: 2.682582378387451 | dt: 1453.40ms | tok/sec: 2818.21 | norm: 3.36\n", "step1812 | loss: 2.4244396686553955 | dt: 1448.33ms | tok/sec: 2828.09 | norm: 3.47\n", "step1813 | loss: 2.5013747215270996 | dt: 1449.48ms | tok/sec: 2825.85 | norm: 3.56\n", "step1814 | loss: 2.285341501235962 | dt: 1443.08ms | tok/sec: 2838.38 | norm: 3.51\n", "step1815 | loss: 2.740583658218384 | dt: 1446.96ms | tok/sec: 2830.77 | norm: 4.60\n", "step1816 | loss: 2.732238292694092 | dt: 1438.53ms | tok/sec: 2847.35 | norm: 4.03\n", "step1817 | loss: 2.7465660572052 | dt: 1444.03ms | tok/sec: 2836.50 | norm: 3.83\n", "step1818 | loss: 2.670788049697876 | dt: 1454.27ms | tok/sec: 2816.53 | 
norm: 3.69\n", "step1819 | loss: 2.6185476779937744 | dt: 1446.54ms | tok/sec: 2831.59 | norm: 3.97\n", "step1820 | loss: 2.7035505771636963 | dt: 1442.88ms | tok/sec: 2838.77 | norm: 3.96\n", "step1821 | loss: 2.6767807006835938 | dt: 1439.88ms | tok/sec: 2844.69 | norm: 3.72\n", "step1822 | loss: 2.6248507499694824 | dt: 1447.26ms | tok/sec: 2830.17 | norm: 3.79\n", "step1823 | loss: 2.475203037261963 | dt: 1449.77ms | tok/sec: 2825.27 | norm: 3.61\n", "step1824 | loss: 2.298332929611206 | dt: 1445.94ms | tok/sec: 2832.77 | norm: 3.60\n", "step1825 | loss: 2.6518962383270264 | dt: 1445.88ms | tok/sec: 2832.87 | norm: 4.03\n", "step1826 | loss: 2.757434368133545 | dt: 1447.45ms | tok/sec: 2829.81 | norm: 4.00\n", "step1827 | loss: 2.789584159851074 | dt: 1448.11ms | tok/sec: 2828.51 | norm: 4.22\n", "step1828 | loss: 2.6575896739959717 | dt: 1451.82ms | tok/sec: 2821.29 | norm: 4.36\n", "step1829 | loss: 2.5551061630249023 | dt: 1448.19ms | tok/sec: 2828.37 | norm: 4.04\n", "step1830 | loss: 2.6273066997528076 | dt: 1455.85ms | tok/sec: 2813.47 | norm: 3.48\n", "step1831 | loss: 2.5264859199523926 | dt: 1446.21ms | tok/sec: 2832.24 | norm: 3.90\n", "step1832 | loss: 2.4506707191467285 | dt: 1443.69ms | tok/sec: 2837.18 | norm: 3.92\n", "step1833 | loss: 2.531629800796509 | dt: 1436.52ms | tok/sec: 2851.34 | norm: 4.03\n", "step1834 | loss: 2.673370122909546 | dt: 1441.14ms | tok/sec: 2842.19 | norm: 3.96\n", "step1835 | loss: 2.659074306488037 | dt: 1440.20ms | tok/sec: 2844.06 | norm: 4.27\n", "step1836 | loss: 2.570361375808716 | dt: 1445.09ms | tok/sec: 2834.42 | norm: 4.26\n", "step1837 | loss: 2.740022659301758 | dt: 1449.76ms | tok/sec: 2825.30 | norm: 4.47\n", "step1838 | loss: 2.521965265274048 | dt: 1435.95ms | tok/sec: 2852.46 | norm: 4.14\n", "step1839 | loss: 2.5051028728485107 | dt: 1441.30ms | tok/sec: 2841.88 | norm: 4.17\n", "step1840 | loss: 2.324838638305664 | dt: 1442.90ms | tok/sec: 2838.73 | norm: 3.37\n", "step1841 | loss: 2.4474453926086426 
| dt: 1441.51ms | tok/sec: 2841.47 | norm: 3.27\n", "step1842 | loss: 2.3872015476226807 | dt: 1445.89ms | tok/sec: 2832.85 | norm: 3.17\n", "step1843 | loss: 2.490732431411743 | dt: 1440.22ms | tok/sec: 2844.00 | norm: 3.13\n", "step1844 | loss: 2.415226936340332 | dt: 1444.21ms | tok/sec: 2836.14 | norm: 3.48\n", "step1845 | loss: 2.8670129776000977 | dt: 1438.17ms | tok/sec: 2848.06 | norm: 3.85\n", "step1846 | loss: 2.733567953109741 | dt: 1448.82ms | tok/sec: 2827.12 | norm: 3.83\n", "step1847 | loss: 2.6581642627716064 | dt: 1441.26ms | tok/sec: 2841.96 | norm: 4.01\n", "step1848 | loss: 2.7880654335021973 | dt: 1448.49ms | tok/sec: 2827.77 | norm: 3.80\n", "step1849 | loss: 2.951856851577759 | dt: 1443.11ms | tok/sec: 2838.31 | norm: 3.96\n", "step1850 | loss: 2.819154739379883 | dt: 1448.53ms | tok/sec: 2827.70 | norm: 3.79\n", "step1851 | loss: 2.5758676528930664 | dt: 1441.46ms | tok/sec: 2841.56 | norm: 3.86\n", "step1852 | loss: 2.6267104148864746 | dt: 1457.72ms | tok/sec: 2809.88 | norm: 3.90\n", "step1853 | loss: 2.766209602355957 | dt: 1438.98ms | tok/sec: 2846.46 | norm: 4.03\n", "step1854 | loss: 2.6334102153778076 | dt: 1450.98ms | tok/sec: 2822.93 | norm: 3.90\n", "step1855 | loss: 2.6251587867736816 | dt: 1447.33ms | tok/sec: 2830.04 | norm: 3.93\n", "step1856 | loss: 2.5403730869293213 | dt: 1447.91ms | tok/sec: 2828.91 | norm: 3.97\n", "step1857 | loss: 2.473175525665283 | dt: 1448.03ms | tok/sec: 2828.67 | norm: 3.93\n", "step1858 | loss: 2.500190258026123 | dt: 1446.23ms | tok/sec: 2832.20 | norm: 3.99\n", "step1859 | loss: 2.6845695972442627 | dt: 1450.95ms | tok/sec: 2822.99 | norm: 4.78\n", "step1860 | loss: 2.5242514610290527 | dt: 1436.85ms | tok/sec: 2850.68 | norm: 4.15\n", "step1861 | loss: 2.4860799312591553 | dt: 1446.28ms | tok/sec: 2832.10 | norm: 3.74\n", "step1862 | loss: 2.3813602924346924 | dt: 1441.80ms | tok/sec: 2840.89 | norm: 3.76\n", "step1863 | loss: 2.34983229637146 | dt: 1450.50ms | tok/sec: 2823.86 | norm: 3.92\n", 
"step1864 | loss: 2.9155783653259277 | dt: 1436.78ms | tok/sec: 2850.82 | norm: 4.88\n", "step1865 | loss: 2.847504138946533 | dt: 1437.02ms | tok/sec: 2850.35 | norm: 5.73\n", "step1866 | loss: 2.533596992492676 | dt: 1439.70ms | tok/sec: 2845.03 | norm: 4.94\n", "step1867 | loss: 2.403698444366455 | dt: 1449.88ms | tok/sec: 2825.06 | norm: 4.09\n", "step1868 | loss: 2.431633472442627 | dt: 1445.13ms | tok/sec: 2834.35 | norm: 3.89\n", "step1869 | loss: 2.512341260910034 | dt: 1448.48ms | tok/sec: 2827.80 | norm: 4.06\n", "step1870 | loss: 2.3501741886138916 | dt: 1453.62ms | tok/sec: 2817.80 | norm: 4.40\n", "step1871 | loss: 2.307572364807129 | dt: 1437.98ms | tok/sec: 2848.44 | norm: 4.39\n", "step1872 | loss: 2.2179195880889893 | dt: 1455.14ms | tok/sec: 2814.86 | norm: 3.83\n", "step1873 | loss: 2.787195920944214 | dt: 1443.21ms | tok/sec: 2838.11 | norm: 4.21\n", "step1874 | loss: 2.492039918899536 | dt: 1440.64ms | tok/sec: 2843.17 | norm: 3.90\n", "step1875 | loss: 2.869762659072876 | dt: 1437.65ms | tok/sec: 2849.09 | norm: 4.61\n", "step1876 | loss: 2.636798620223999 | dt: 1444.04ms | tok/sec: 2836.48 | norm: 4.10\n", "step1877 | loss: 2.7138278484344482 | dt: 1448.63ms | tok/sec: 2827.50 | norm: 3.93\n", "step1878 | loss: 2.7089803218841553 | dt: 1444.54ms | tok/sec: 2835.50 | norm: 4.17\n", "step1879 | loss: 2.546718120574951 | dt: 1449.94ms | tok/sec: 2824.94 | norm: 4.11\n", "step1880 | loss: 2.3467512130737305 | dt: 1445.39ms | tok/sec: 2833.84 | norm: 4.31\n", "step1881 | loss: 2.4023616313934326 | dt: 1442.93ms | tok/sec: 2838.67 | norm: 4.12\n", "step1882 | loss: 2.3858182430267334 | dt: 1451.19ms | tok/sec: 2822.50 | norm: 4.25\n", "step1883 | loss: 2.3798389434814453 | dt: 1452.28ms | tok/sec: 2820.39 | norm: 4.15\n", "step1884 | loss: 2.5298876762390137 | dt: 1444.75ms | tok/sec: 2835.09 | norm: 4.08\n", "step1885 | loss: 2.5887792110443115 | dt: 1442.07ms | tok/sec: 2840.37 | norm: 3.64\n", "step1886 | loss: 2.530117988586426 | dt: 1448.77ms 
| tok/sec: 2827.22 | norm: 3.81\n", "step1887 | loss: 2.4239566326141357 | dt: 1448.60ms | tok/sec: 2827.55 | norm: 3.94\n", "step1888 | loss: 2.4850118160247803 | dt: 1442.75ms | tok/sec: 2839.02 | norm: 3.81\n", "step1889 | loss: 2.4255340099334717 | dt: 1449.29ms | tok/sec: 2826.21 | norm: 4.38\n", "step1890 | loss: 2.504201650619507 | dt: 1443.27ms | tok/sec: 2838.00 | norm: 4.19\n", "step1891 | loss: 2.2623279094696045 | dt: 1449.27ms | tok/sec: 2826.25 | norm: 4.08\n", "step1892 | loss: 2.2511281967163086 | dt: 1449.78ms | tok/sec: 2825.26 | norm: 3.88\n", "step1893 | loss: 2.555668830871582 | dt: 1449.69ms | tok/sec: 2825.43 | norm: 4.06\n", "step1894 | loss: 2.3115875720977783 | dt: 1451.68ms | tok/sec: 2821.56 | norm: 3.95\n", "step1895 | loss: 2.3614401817321777 | dt: 1451.74ms | tok/sec: 2821.43 | norm: 3.86\n", "step1896 | loss: 2.17106556892395 | dt: 1442.66ms | tok/sec: 2839.19 | norm: 3.86\n", "step1897 | loss: 2.6330068111419678 | dt: 1449.07ms | tok/sec: 2826.65 | norm: 4.42\n", "step1898 | loss: 2.5959553718566895 | dt: 1447.36ms | tok/sec: 2829.97 | norm: 4.09\n", "step1899 | loss: 2.619565963745117 | dt: 1442.77ms | tok/sec: 2838.98 | norm: 4.44\n", "step1900 | loss: 2.5291686058044434 | dt: 1445.42ms | tok/sec: 2833.77 | norm: 4.24\n", "step1901 | loss: 2.465360641479492 | dt: 1435.38ms | tok/sec: 2853.60 | norm: 4.09\n", "step1902 | loss: 2.564682722091675 | dt: 1450.48ms | tok/sec: 2823.88 | norm: 4.11\n", "step1903 | loss: 2.5597424507141113 | dt: 1449.79ms | tok/sec: 2825.23 | norm: 4.51\n", "step1904 | loss: 2.5169034004211426 | dt: 1446.02ms | tok/sec: 2832.60 | norm: 4.39\n", "step1905 | loss: 2.3490023612976074 | dt: 1453.67ms | tok/sec: 2817.69 | norm: 3.76\n", "step1906 | loss: 2.1611719131469727 | dt: 1446.74ms | tok/sec: 2831.19 | norm: 3.26\n", "step1907 | loss: 2.500699996948242 | dt: 1446.22ms | tok/sec: 2832.20 | norm: 3.82\n", "step1908 | loss: 2.618497848510742 | dt: 1444.77ms | tok/sec: 2835.06 | norm: 4.05\n", "step1909 | 
loss: 2.6621692180633545 | dt: 1454.33ms | tok/sec: 2816.43 | norm: 4.01\n", "step1910 | loss: 2.5213887691497803 | dt: 1448.55ms | tok/sec: 2827.65 | norm: 4.07\n", "step1911 | loss: 2.413848876953125 | dt: 1440.56ms | tok/sec: 2843.35 | norm: 4.07\n", "step1912 | loss: 2.487623691558838 | dt: 1454.11ms | tok/sec: 2816.84 | norm: 4.00\n", "step1913 | loss: 2.4024059772491455 | dt: 1437.73ms | tok/sec: 2848.94 | norm: 3.99\n", "step1914 | loss: 2.324373483657837 | dt: 1455.81ms | tok/sec: 2813.55 | norm: 3.90\n", "step1915 | loss: 2.3934314250946045 | dt: 1441.74ms | tok/sec: 2841.01 | norm: 3.86\n", "step1916 | loss: 2.579657793045044 | dt: 1437.16ms | tok/sec: 2850.06 | norm: 4.47\n", "step1917 | loss: 2.572438955307007 | dt: 1447.11ms | tok/sec: 2830.46 | norm: 4.72\n", "step1918 | loss: 2.4745032787323 | dt: 1448.48ms | tok/sec: 2827.79 | norm: 4.46\n", "step1919 | loss: 2.6287648677825928 | dt: 1449.07ms | tok/sec: 2826.64 | norm: 4.31\n", "step1920 | loss: 2.39579701423645 | dt: 1449.94ms | tok/sec: 2824.95 | norm: 3.71\n", "step1921 | loss: 2.376199245452881 | dt: 1445.93ms | tok/sec: 2832.77 | norm: 3.89\n", "step1922 | loss: 2.2030134201049805 | dt: 1440.49ms | tok/sec: 2843.48 | norm: 3.70\n", "step1923 | loss: 2.3218398094177246 | dt: 1449.40ms | tok/sec: 2825.99 | norm: 4.11\n", "step1924 | loss: 2.260796308517456 | dt: 1449.10ms | tok/sec: 2826.57 | norm: 3.80\n", "step1925 | loss: 2.3625969886779785 | dt: 1439.80ms | tok/sec: 2844.84 | norm: 3.92\n", "step1926 | loss: 2.2768750190734863 | dt: 1452.32ms | tok/sec: 2820.31 | norm: 3.74\n", "step1927 | loss: 2.7286784648895264 | dt: 1441.09ms | tok/sec: 2842.29 | norm: 4.53\n", "step1928 | loss: 2.6181671619415283 | dt: 1445.95ms | tok/sec: 2832.73 | norm: 4.29\n", "step1929 | loss: 2.5470759868621826 | dt: 1439.63ms | tok/sec: 2845.18 | norm: 4.24\n", "step1930 | loss: 2.630458354949951 | dt: 1448.48ms | tok/sec: 2827.80 | norm: 3.70\n", "step1931 | loss: 2.8321237564086914 | dt: 1447.22ms | tok/sec: 
2830.26 | norm: 4.90\n", "step1932 | loss: 2.675244092941284 | dt: 1456.36ms | tok/sec: 2812.48 | norm: 4.47\n", "step1933 | loss: 2.4447453022003174 | dt: 1451.87ms | tok/sec: 2821.18 | norm: 4.06\n", "step1934 | loss: 2.497340679168701 | dt: 1452.93ms | tok/sec: 2819.13 | norm: 4.12\n", "step1935 | loss: 2.637887954711914 | dt: 1452.80ms | tok/sec: 2819.38 | norm: 4.52\n", "step1936 | loss: 2.51589035987854 | dt: 1439.53ms | tok/sec: 2845.37 | norm: 4.32\n", "step1937 | loss: 2.5204834938049316 | dt: 1450.60ms | tok/sec: 2823.66 | norm: 4.26\n", "step1938 | loss: 2.4222474098205566 | dt: 1451.60ms | tok/sec: 2821.72 | norm: 4.28\n", "step1939 | loss: 2.3624110221862793 | dt: 1450.86ms | tok/sec: 2823.15 | norm: 4.20\n", "step1940 | loss: 2.3660995960235596 | dt: 1453.31ms | tok/sec: 2818.39 | norm: 3.51\n", "step1941 | loss: 2.5144238471984863 | dt: 1452.54ms | tok/sec: 2819.88 | norm: 3.82\n", "step1942 | loss: 2.374788522720337 | dt: 1452.25ms | tok/sec: 2820.46 | norm: 3.90\n", "step1943 | loss: 2.3296666145324707 | dt: 1453.54ms | tok/sec: 2817.96 | norm: 3.89\n", "step1944 | loss: 2.244328737258911 | dt: 1437.85ms | tok/sec: 2848.70 | norm: 3.94\n", "step1945 | loss: 2.2166178226470947 | dt: 1451.25ms | tok/sec: 2822.40 | norm: 4.02\n", "step1946 | loss: 2.7552127838134766 | dt: 1450.48ms | tok/sec: 2823.89 | norm: 4.24\n", "step1947 | loss: 2.699389934539795 | dt: 1437.13ms | tok/sec: 2850.13 | norm: 4.77\n", "step1948 | loss: 2.4220850467681885 | dt: 1449.71ms | tok/sec: 2825.40 | norm: 4.45\n", "step1949 | loss: 2.3148443698883057 | dt: 1454.50ms | tok/sec: 2816.08 | norm: 4.25\n", "step1950 | loss: 2.3129472732543945 | dt: 1451.30ms | tok/sec: 2822.29 | norm: 3.92\n", "step1951 | loss: 2.359598159790039 | dt: 1447.12ms | tok/sec: 2830.45 | norm: 4.03\n", "step1952 | loss: 2.197690725326538 | dt: 1446.49ms | tok/sec: 2831.68 | norm: 4.06\n", "step1953 | loss: 2.143035888671875 | dt: 1449.97ms | tok/sec: 2824.88 | norm: 3.80\n", "step1954 | loss: 
2.080129384994507 | dt: 1454.79ms | tok/sec: 2815.52 | norm: 3.39\n", "step1955 | loss: 2.666476249694824 | dt: 1454.12ms | tok/sec: 2816.83 | norm: 4.17\n", "step1956 | loss: 2.369105339050293 | dt: 1450.28ms | tok/sec: 2824.29 | norm: 3.81\n", "step1957 | loss: 2.7395076751708984 | dt: 1455.64ms | tok/sec: 2813.88 | norm: 4.59\n", "step1958 | loss: 2.490774154663086 | dt: 1441.85ms | tok/sec: 2840.80 | norm: 3.85\n", "step1959 | loss: 2.559292793273926 | dt: 1446.12ms | tok/sec: 2832.40 | norm: 3.85\n", "step1960 | loss: 2.5603060722351074 | dt: 1453.33ms | tok/sec: 2818.36 | norm: 4.04\n", "step1961 | loss: 2.422595739364624 | dt: 1439.81ms | tok/sec: 2844.83 | norm: 4.14\n", "step1962 | loss: 2.2102808952331543 | dt: 1446.12ms | tok/sec: 2832.40 | norm: 3.89\n", "step1963 | loss: 2.2629780769348145 | dt: 1446.58ms | tok/sec: 2831.50 | norm: 3.85\n", "step1964 | loss: 2.260129690170288 | dt: 1447.08ms | tok/sec: 2830.53 | norm: 3.85\n", "step1965 | loss: 2.224254608154297 | dt: 1455.74ms | tok/sec: 2813.69 | norm: 3.70\n", "step1966 | loss: 2.3830809593200684 | dt: 1456.45ms | tok/sec: 2812.31 | norm: 4.12\n", "step1967 | loss: 2.4501709938049316 | dt: 1448.60ms | tok/sec: 2827.56 | norm: 4.00\n", "step1968 | loss: 2.4060399532318115 | dt: 1451.86ms | tok/sec: 2821.22 | norm: 3.86\n", "step1969 | loss: 2.3126938343048096 | dt: 1443.38ms | tok/sec: 2837.77 | norm: 3.83\n", "step1970 | loss: 2.3703463077545166 | dt: 1454.94ms | tok/sec: 2815.24 | norm: 3.93\n", "step1971 | loss: 2.303051471710205 | dt: 1441.58ms | tok/sec: 2841.34 | norm: 3.99\n", "step1972 | loss: 2.3543334007263184 | dt: 1454.22ms | tok/sec: 2816.63 | norm: 3.82\n", "step1973 | loss: 2.110092878341675 | dt: 1438.70ms | tok/sec: 2847.00 | norm: 3.72\n", "step1974 | loss: 2.1054861545562744 | dt: 1449.06ms | tok/sec: 2826.66 | norm: 3.84\n", "step1975 | loss: 2.425658702850342 | dt: 1444.73ms | tok/sec: 2835.14 | norm: 4.28\n", "step1976 | loss: 2.192049741744995 | dt: 1453.02ms | tok/sec: 2818.96 
| norm: 4.41\n", "step1977 | loss: 2.241565465927124 | dt: 1445.58ms | tok/sec: 2833.46 | norm: 4.04\n", "step1978 | loss: 2.047963857650757 | dt: 1455.98ms | tok/sec: 2813.23 | norm: 3.93\n", "step1979 | loss: 2.4914069175720215 | dt: 1453.21ms | tok/sec: 2818.59 | norm: 4.09\n", "step1980 | loss: 2.4441919326782227 | dt: 1439.87ms | tok/sec: 2844.71 | norm: 4.04\n", "step1981 | loss: 2.466071367263794 | dt: 1448.43ms | tok/sec: 2827.89 | norm: 3.92\n", "step1982 | loss: 2.397076368331909 | dt: 1453.44ms | tok/sec: 2818.14 | norm: 3.83\n", "step1983 | loss: 2.347001791000366 | dt: 1458.03ms | tok/sec: 2809.27 | norm: 4.49\n", "step1984 | loss: 2.4261558055877686 | dt: 1450.31ms | tok/sec: 2824.22 | norm: 4.01\n", "step1985 | loss: 2.402303457260132 | dt: 1452.10ms | tok/sec: 2820.74 | norm: 3.86\n", "step1986 | loss: 2.359104871749878 | dt: 1450.53ms | tok/sec: 2823.80 | norm: 3.92\n", "step1987 | loss: 2.1960904598236084 | dt: 1443.14ms | tok/sec: 2838.25 | norm: 3.87\n", "step1988 | loss: 2.014460802078247 | dt: 1451.22ms | tok/sec: 2822.46 | norm: 3.66\n", "step1989 | loss: 2.363163948059082 | dt: 1449.12ms | tok/sec: 2826.53 | norm: 4.37\n", "step1990 | loss: 2.476691484451294 | dt: 1448.42ms | tok/sec: 2827.91 | norm: 4.50\n", "step1991 | loss: 2.4999442100524902 | dt: 1438.30ms | tok/sec: 2847.81 | norm: 4.02\n", "step1992 | loss: 2.3577163219451904 | dt: 1446.63ms | tok/sec: 2831.41 | norm: 3.88\n", "step1993 | loss: 2.2733631134033203 | dt: 1443.53ms | tok/sec: 2837.49 | norm: 3.80\n", "step1994 | loss: 2.3457236289978027 | dt: 1454.69ms | tok/sec: 2815.73 | norm: 3.69\n", "step1995 | loss: 2.2675247192382812 | dt: 1447.96ms | tok/sec: 2828.81 | norm: 3.98\n", "step1996 | loss: 2.2021427154541016 | dt: 1453.70ms | tok/sec: 2817.64 | norm: 4.23\n", "step1997 | loss: 2.2661712169647217 | dt: 1454.77ms | tok/sec: 2815.57 | norm: 4.02\n", "step1998 | loss: 2.437007188796997 | dt: 1450.70ms | tok/sec: 2823.46 | norm: 3.95\n", "step1999 | loss: 2.428576707839966 
| dt: 1452.59ms | tok/sec: 2819.80 | norm: 4.09\n", "step2000 | loss: 2.3320577144622803 | dt: 1447.05ms | tok/sec: 2830.58 | norm: 4.20\n", "step2001 | loss: 2.4825966358184814 | dt: 1450.67ms | tok/sec: 2823.51 | norm: 4.39\n", "step2002 | loss: 2.268983840942383 | dt: 1450.55ms | tok/sec: 2823.75 | norm: 4.20\n", "step2003 | loss: 2.2470626831054688 | dt: 1442.23ms | tok/sec: 2840.05 | norm: 4.44\n", "step2004 | loss: 2.083927869796753 | dt: 1446.01ms | tok/sec: 2832.62 | norm: 4.08\n", "step2005 | loss: 2.224797248840332 | dt: 1450.23ms | tok/sec: 2824.37 | norm: 4.30\n", "step2006 | loss: 2.153144359588623 | dt: 1444.53ms | tok/sec: 2835.53 | norm: 4.13\n", "step2007 | loss: 2.255645751953125 | dt: 1450.45ms | tok/sec: 2823.94 | norm: 4.47\n", "step2008 | loss: 2.167393922805786 | dt: 1457.01ms | tok/sec: 2811.24 | norm: 3.99\n", "step2009 | loss: 2.62178635597229 | dt: 1450.99ms | tok/sec: 2822.91 | norm: 4.43\n", "step2010 | loss: 2.5038516521453857 | dt: 1452.74ms | tok/sec: 2819.49 | norm: 4.19\n", "step2011 | loss: 2.437343120574951 | dt: 1452.26ms | tok/sec: 2820.43 | norm: 4.19\n", "step2012 | loss: 2.4776434898376465 | dt: 1452.42ms | tok/sec: 2820.13 | norm: 3.83\n", "step2013 | loss: 2.7048721313476562 | dt: 1439.26ms | tok/sec: 2845.90 | norm: 4.59\n", "step2014 | loss: 2.5494225025177 | dt: 1451.22ms | tok/sec: 2822.45 | norm: 4.51\n", "step2015 | loss: 2.328117609024048 | dt: 1451.57ms | tok/sec: 2821.76 | norm: 4.43\n", "step2016 | loss: 2.3484647274017334 | dt: 1442.27ms | tok/sec: 2839.97 | norm: 4.12\n", "step2017 | loss: 2.4869191646575928 | dt: 1449.54ms | tok/sec: 2825.72 | norm: 3.94\n", "step2018 | loss: 2.3638741970062256 | dt: 1454.17ms | tok/sec: 2816.72 | norm: 3.98\n", "step2019 | loss: 2.381220579147339 | dt: 1449.39ms | tok/sec: 2826.02 | norm: 4.26\n", "step2020 | loss: 2.293431282043457 | dt: 1441.20ms | tok/sec: 2842.08 | norm: 4.52\n", "step2021 | loss: 2.243175983428955 | dt: 1452.52ms | tok/sec: 2819.92 | norm: 4.34\n", 
"step2022 | loss: 2.2258872985839844 | dt: 1456.69ms | tok/sec: 2811.86 | norm: 4.12\n", "step2023 | loss: 2.3699700832366943 | dt: 1453.83ms | tok/sec: 2817.39 | norm: 4.33\n", "step2024 | loss: 2.239957571029663 | dt: 1444.84ms | tok/sec: 2834.91 | norm: 3.88\n", "step2025 | loss: 2.1796436309814453 | dt: 1442.27ms | tok/sec: 2839.97 | norm: 3.80\n", "step2026 | loss: 2.084143877029419 | dt: 1448.53ms | tok/sec: 2827.69 | norm: 3.54\n", "step2027 | loss: 2.0603621006011963 | dt: 1449.89ms | tok/sec: 2825.04 | norm: 3.62\n", "step2028 | loss: 2.5577781200408936 | dt: 1449.71ms | tok/sec: 2825.40 | norm: 4.02\n", "step2029 | loss: 2.5635836124420166 | dt: 1454.56ms | tok/sec: 2815.97 | norm: 5.00\n", "step2030 | loss: 2.2846572399139404 | dt: 1455.22ms | tok/sec: 2814.69 | norm: 4.60\n", "step2031 | loss: 2.1857805252075195 | dt: 1451.96ms | tok/sec: 2821.01 | norm: 4.26\n", "step2032 | loss: 2.170330286026001 | dt: 1449.63ms | tok/sec: 2825.54 | norm: 3.74\n", "step2033 | loss: 2.244948387145996 | dt: 1452.48ms | tok/sec: 2820.01 | norm: 4.04\n", "step2034 | loss: 2.0854122638702393 | dt: 1457.91ms | tok/sec: 2809.49 | norm: 4.22\n", "step2035 | loss: 2.0023295879364014 | dt: 1441.80ms | tok/sec: 2840.90 | norm: 3.87\n", "step2036 | loss: 1.963401436805725 | dt: 1445.71ms | tok/sec: 2833.20 | norm: 3.81\n", "step2037 | loss: 2.567160129547119 | dt: 1452.48ms | tok/sec: 2820.00 | norm: 4.72\n", "step2038 | loss: 2.243534803390503 | dt: 1453.81ms | tok/sec: 2817.43 | norm: 4.13\n", "step2039 | loss: 2.6622443199157715 | dt: 1453.22ms | tok/sec: 2818.56 | norm: 5.45\n", "step2040 | loss: 2.409956455230713 | dt: 1449.02ms | tok/sec: 2826.74 | norm: 4.66\n", "step2041 | loss: 2.450800895690918 | dt: 1453.26ms | tok/sec: 2818.50 | norm: 4.25\n", "step2042 | loss: 2.470137119293213 | dt: 1452.67ms | tok/sec: 2819.64 | norm: 4.31\n", "step2043 | loss: 2.285386800765991 | dt: 1441.20ms | tok/sec: 2842.07 | norm: 3.97\n", "step2044 | loss: 2.096414089202881 | dt: 1454.17ms 
| tok/sec: 2816.73 | norm: 3.75\n", "step2045 | loss: 2.158629894256592 | dt: 1451.96ms | tok/sec: 2821.01 | norm: 4.23\n", "step2046 | loss: 2.1775970458984375 | dt: 1452.01ms | tok/sec: 2820.93 | norm: 4.45\n", "step2047 | loss: 2.1532251834869385 | dt: 1452.11ms | tok/sec: 2820.72 | norm: 4.60\n", "step2048 | loss: 2.336885929107666 | dt: 1439.03ms | tok/sec: 2846.37 | norm: 4.84\n", "step2049 | loss: 2.3740177154541016 | dt: 1458.37ms | tok/sec: 2808.62 | norm: 4.35\n", "step2050 | loss: 2.300715684890747 | dt: 1451.18ms | tok/sec: 2822.53 | norm: 4.01\n", "step2051 | loss: 2.170337200164795 | dt: 1452.74ms | tok/sec: 2819.50 | norm: 3.68\n", "step2052 | loss: 2.250549554824829 | dt: 1452.00ms | tok/sec: 2820.93 | norm: 4.07\n", "step2053 | loss: 2.184805393218994 | dt: 1455.46ms | tok/sec: 2814.23 | norm: 4.14\n", "step2054 | loss: 2.245744228363037 | dt: 1456.18ms | tok/sec: 2812.83 | norm: 3.90\n", "step2055 | loss: 2.012921094894409 | dt: 1457.67ms | tok/sec: 2809.97 | norm: 3.84\n", "step2056 | loss: 2.0122013092041016 | dt: 1449.19ms | tok/sec: 2826.40 | norm: 3.84\n", "step2057 | loss: 2.3244876861572266 | dt: 1444.37ms | tok/sec: 2835.83 | norm: 4.11\n", "step2058 | loss: 2.0585832595825195 | dt: 1452.40ms | tok/sec: 2820.15 | norm: 3.68\n", "step2059 | loss: 2.118842840194702 | dt: 1459.49ms | tok/sec: 2806.46 | norm: 3.76\n", "step2060 | loss: 1.9168795347213745 | dt: 1443.46ms | tok/sec: 2837.63 | norm: 3.72\n", "step2061 | loss: 2.388321876525879 | dt: 1457.72ms | tok/sec: 2809.86 | norm: 4.46\n", "step2062 | loss: 2.340012788772583 | dt: 1447.86ms | tok/sec: 2829.00 | norm: 4.34\n", "step2063 | loss: 2.336336612701416 | dt: 1456.96ms | tok/sec: 2811.33 | norm: 3.89\n", "step2064 | loss: 2.27061128616333 | dt: 1453.82ms | tok/sec: 2817.41 | norm: 3.89\n", "step2065 | loss: 2.245072841644287 | dt: 1450.08ms | tok/sec: 2824.68 | norm: 4.84\n", "step2066 | loss: 2.3327271938323975 | dt: 1449.91ms | tok/sec: 2825.01 | norm: 5.17\n", "step2067 | loss: 
2.2963385581970215 | dt: 1446.72ms | tok/sec: 2831.23 | norm: 4.62\n", "step2068 | loss: 2.2346506118774414 | dt: 1459.07ms | tok/sec: 2807.27 | norm: 4.32\n", "step2069 | loss: 2.072579860687256 | dt: 1446.32ms | tok/sec: 2832.02 | norm: 3.73\n", "step2070 | loss: 1.8891364336013794 | dt: 1450.60ms | tok/sec: 2823.67 | norm: 3.50\n", "step2071 | loss: 2.226001262664795 | dt: 1443.35ms | tok/sec: 2837.83 | norm: 3.86\n", "step2072 | loss: 2.330904006958008 | dt: 1451.83ms | tok/sec: 2821.27 | norm: 4.06\n", "step2073 | loss: 2.3470449447631836 | dt: 1452.16ms | tok/sec: 2820.63 | norm: 3.89\n", "step2074 | loss: 2.1985321044921875 | dt: 1448.89ms | tok/sec: 2826.98 | norm: 3.95\n", "step2075 | loss: 2.12199068069458 | dt: 1468.19ms | tok/sec: 2789.84 | norm: 4.12\n", "step2076 | loss: 2.2142512798309326 | dt: 1454.39ms | tok/sec: 2816.30 | norm: 3.90\n", "step2077 | loss: 2.118840456008911 | dt: 1453.32ms | tok/sec: 2818.38 | norm: 3.97\n", "step2078 | loss: 2.0736119747161865 | dt: 1445.70ms | tok/sec: 2833.23 | norm: 4.46\n", "step2079 | loss: 2.114339828491211 | dt: 1448.62ms | tok/sec: 2827.52 | norm: 4.10\n", "step2080 | loss: 2.2787513732910156 | dt: 1447.42ms | tok/sec: 2829.86 | norm: 4.22\n", "step2081 | loss: 2.3081398010253906 | dt: 1454.87ms | tok/sec: 2815.36 | norm: 4.54\n", "step2082 | loss: 2.2087502479553223 | dt: 1456.47ms | tok/sec: 2812.27 | norm: 4.10\n", "step2083 | loss: 2.3724253177642822 | dt: 1461.80ms | tok/sec: 2802.02 | norm: 4.04\n", "step2084 | loss: 2.1598596572875977 | dt: 1445.74ms | tok/sec: 2833.16 | norm: 3.90\n", "step2085 | loss: 2.177501678466797 | dt: 1444.25ms | tok/sec: 2836.08 | norm: 4.97\n", "step2086 | loss: 2.0235981941223145 | dt: 1456.91ms | tok/sec: 2811.43 | norm: 4.69\n", "step2087 | loss: 2.1737213134765625 | dt: 1460.80ms | tok/sec: 2803.95 | norm: 5.35\n", "step2088 | loss: 2.0593056678771973 | dt: 1446.18ms | tok/sec: 2832.29 | norm: 4.61\n", "step2089 | loss: 2.1496288776397705 | dt: 1463.22ms | tok/sec: 
2799.30 | norm: 4.15\n", "step2090 | loss: 2.0316646099090576 | dt: 1458.73ms | tok/sec: 2807.91 | norm: 3.93\n", "step2091 | loss: 2.5207433700561523 | dt: 1453.47ms | tok/sec: 2818.08 | norm: 4.80\n", "step2092 | loss: 2.3770687580108643 | dt: 1457.68ms | tok/sec: 2809.95 | norm: 4.63\n", "step2093 | loss: 2.334106683731079 | dt: 1446.45ms | tok/sec: 2831.75 | norm: 4.54\n", "step2094 | loss: 2.3608667850494385 | dt: 1452.70ms | tok/sec: 2819.57 | norm: 4.03\n", "step2095 | loss: 2.5417187213897705 | dt: 1449.92ms | tok/sec: 2824.98 | norm: 4.10\n", "step2096 | loss: 2.4083304405212402 | dt: 1455.97ms | tok/sec: 2813.25 | norm: 3.86\n", "step2097 | loss: 2.193145513534546 | dt: 1452.72ms | tok/sec: 2819.55 | norm: 4.03\n", "step2098 | loss: 2.199258327484131 | dt: 1452.29ms | tok/sec: 2820.38 | norm: 3.82\n", "step2099 | loss: 2.355243444442749 | dt: 1454.96ms | tok/sec: 2815.19 | norm: 5.00\n", "step2100 | loss: 2.2538559436798096 | dt: 1457.37ms | tok/sec: 2810.53 | norm: 4.76\n", "step2101 | loss: 2.2867066860198975 | dt: 1455.61ms | tok/sec: 2813.94 | norm: 4.81\n", "step2102 | loss: 2.171279191970825 | dt: 1454.24ms | tok/sec: 2816.60 | norm: 4.37\n", "step2103 | loss: 2.1323046684265137 | dt: 1448.74ms | tok/sec: 2827.28 | norm: 4.11\n", "step2104 | loss: 2.110490560531616 | dt: 1445.06ms | tok/sec: 2834.48 | norm: 3.76\n", "step2105 | loss: 2.2568047046661377 | dt: 1456.50ms | tok/sec: 2812.22 | norm: 4.59\n", "step2106 | loss: 2.1213772296905518 | dt: 1452.88ms | tok/sec: 2819.22 | norm: 4.30\n", "step2107 | loss: 2.0682191848754883 | dt: 1451.84ms | tok/sec: 2821.24 | norm: 3.99\n", "step2108 | loss: 1.9901846647262573 | dt: 1450.64ms | tok/sec: 2823.59 | norm: 4.30\n", "step2109 | loss: 1.9630556106567383 | dt: 1449.82ms | tok/sec: 2825.18 | norm: 4.21\n", "step2110 | loss: 2.437925338745117 | dt: 1452.90ms | tok/sec: 2819.18 | norm: 4.21\n", "step2111 | loss: 2.4171457290649414 | dt: 1451.89ms | tok/sec: 2821.15 | norm: 4.60\n", "step2112 | loss: 
2.1637701988220215 | dt: 1446.77ms | tok/sec: 2831.14 | norm: 4.52\n", "step2113 | loss: 2.024907112121582 | dt: 1447.44ms | tok/sec: 2829.83 | norm: 3.77\n", "step2114 | loss: 2.018451452255249 | dt: 1448.43ms | tok/sec: 2827.89 | norm: 3.48\n", "step2115 | loss: 2.1202104091644287 | dt: 1452.85ms | tok/sec: 2819.29 | norm: 4.11\n", "step2116 | loss: 1.9745147228240967 | dt: 1459.33ms | tok/sec: 2806.76 | norm: 4.34\n", "step2117 | loss: 1.894737720489502 | dt: 1446.27ms | tok/sec: 2832.11 | norm: 4.08\n", "step2118 | loss: 1.8460711240768433 | dt: 1449.81ms | tok/sec: 2825.19 | norm: 3.95\n", "step2119 | loss: 2.423086643218994 | dt: 1459.86ms | tok/sec: 2805.76 | norm: 4.44\n", "step2120 | loss: 2.1065807342529297 | dt: 1451.98ms | tok/sec: 2820.98 | norm: 4.13\n", "step2121 | loss: 2.551300048828125 | dt: 1457.89ms | tok/sec: 2809.55 | norm: 5.20\n", "step2122 | loss: 2.304661273956299 | dt: 1449.71ms | tok/sec: 2825.39 | norm: 4.88\n", "step2123 | loss: 2.360830545425415 | dt: 1456.03ms | tok/sec: 2813.13 | norm: 4.91\n", "step2124 | loss: 2.3501431941986084 | dt: 1449.26ms | tok/sec: 2826.27 | norm: 4.54\n", "step2125 | loss: 2.1645326614379883 | dt: 1447.74ms | tok/sec: 2829.24 | norm: 4.07\n", "step2126 | loss: 1.9735702276229858 | dt: 1455.61ms | tok/sec: 2813.94 | norm: 3.86\n", "step2127 | loss: 2.01603627204895 | dt: 1451.49ms | tok/sec: 2821.93 | norm: 3.69\n", "step2128 | loss: 2.027304172515869 | dt: 1459.39ms | tok/sec: 2806.64 | norm: 3.84\n", "step2129 | loss: 2.0059165954589844 | dt: 1449.31ms | tok/sec: 2826.17 | norm: 3.88\n", "step2130 | loss: 2.1835341453552246 | dt: 1451.59ms | tok/sec: 2821.74 | norm: 4.28\n", "step2131 | loss: 2.229395866394043 | dt: 1439.06ms | tok/sec: 2846.31 | norm: 4.03\n", "step2132 | loss: 2.1627471446990967 | dt: 1451.94ms | tok/sec: 2821.05 | norm: 4.17\n", "step2133 | loss: 2.02250337600708 | dt: 1452.41ms | tok/sec: 2820.13 | norm: 3.92\n", "step2134 | loss: 2.093256711959839 | dt: 1452.30ms | tok/sec: 2820.35 | 
norm: 3.82\n", "step2135 | loss: 2.053407907485962 | dt: 1452.41ms | tok/sec: 2820.13 | norm: 3.95\n", "step2136 | loss: 2.1250851154327393 | dt: 1458.80ms | tok/sec: 2807.79 | norm: 4.21\n", "step2137 | loss: 1.8818204402923584 | dt: 1455.48ms | tok/sec: 2814.19 | norm: 3.97\n", "step2138 | loss: 1.8845746517181396 | dt: 1449.12ms | tok/sec: 2826.54 | norm: 4.00\n", "step2139 | loss: 2.1744470596313477 | dt: 1460.09ms | tok/sec: 2805.31 | norm: 4.17\n", "step2140 | loss: 1.9194265604019165 | dt: 1449.15ms | tok/sec: 2826.48 | norm: 3.78\n", "step2141 | loss: 1.992175817489624 | dt: 1455.68ms | tok/sec: 2813.81 | norm: 3.73\n", "step2142 | loss: 1.7860281467437744 | dt: 1453.00ms | tok/sec: 2818.99 | norm: 3.63\n", "step2143 | loss: 2.227482795715332 | dt: 1455.51ms | tok/sec: 2814.13 | norm: 4.07\n", "step2144 | loss: 2.211858034133911 | dt: 1453.05ms | tok/sec: 2818.89 | norm: 4.28\n", "step2145 | loss: 2.2205450534820557 | dt: 1450.84ms | tok/sec: 2823.19 | norm: 4.39\n", "step2146 | loss: 2.1573619842529297 | dt: 1456.58ms | tok/sec: 2812.07 | norm: 4.18\n", "step2147 | loss: 2.1311755180358887 | dt: 1453.72ms | tok/sec: 2817.59 | norm: 4.45\n", "step2148 | loss: 2.1829471588134766 | dt: 1453.66ms | tok/sec: 2817.71 | norm: 3.94\n", "step2149 | loss: 2.142322301864624 | dt: 1449.33ms | tok/sec: 2826.13 | norm: 4.14\n", "step2150 | loss: 2.113098621368408 | dt: 1440.51ms | tok/sec: 2843.44 | norm: 4.75\n", "step2151 | loss: 1.9465851783752441 | dt: 1449.80ms | tok/sec: 2825.22 | norm: 4.55\n", "step2152 | loss: 1.771390438079834 | dt: 1461.68ms | tok/sec: 2802.25 | norm: 4.19\n", "step2153 | loss: 2.107590675354004 | dt: 1453.33ms | tok/sec: 2818.36 | norm: 4.37\n", "step2154 | loss: 2.2116177082061768 | dt: 1458.36ms | tok/sec: 2808.63 | norm: 4.28\n", "step2155 | loss: 2.235349655151367 | dt: 1450.35ms | tok/sec: 2824.14 | norm: 4.33\n", "step2156 | loss: 2.102273941040039 | dt: 1447.45ms | tok/sec: 2829.80 | norm: 4.42\n", "step2157 | loss: 1.9906933307647705 
| dt: 1450.80ms | tok/sec: 2823.26 | norm: 4.23\n", "step2158 | loss: 2.0721404552459717 | dt: 1453.97ms | tok/sec: 2817.12 | norm: 3.96\n", "step2159 | loss: 1.9858897924423218 | dt: 1452.26ms | tok/sec: 2820.43 | norm: 3.84\n", "step2160 | loss: 1.9587162733078003 | dt: 1449.35ms | tok/sec: 2826.09 | norm: 4.15\n", "step2161 | loss: 2.0003814697265625 | dt: 1441.15ms | tok/sec: 2842.18 | norm: 4.03\n", "step2162 | loss: 2.171656608581543 | dt: 1454.15ms | tok/sec: 2816.77 | norm: 4.28\n", "step2163 | loss: 2.2344627380371094 | dt: 1453.91ms | tok/sec: 2817.23 | norm: 5.15\n", "step2164 | loss: 2.1196775436401367 | dt: 1449.00ms | tok/sec: 2826.77 | norm: 4.90\n", "step2165 | loss: 2.2577247619628906 | dt: 1455.96ms | tok/sec: 2813.26 | norm: 4.75\n", "step2166 | loss: 2.0435914993286133 | dt: 1458.91ms | tok/sec: 2807.57 | norm: 3.90\n", "step2167 | loss: 2.055530548095703 | dt: 1453.49ms | tok/sec: 2818.04 | norm: 4.12\n", "step2168 | loss: 1.8999923467636108 | dt: 1451.40ms | tok/sec: 2822.11 | norm: 3.74\n", "step2169 | loss: 2.063194990158081 | dt: 1449.01ms | tok/sec: 2826.76 | norm: 4.31\n", "step2170 | loss: 1.9581328630447388 | dt: 1449.19ms | tok/sec: 2826.40 | norm: 4.10\n", "step2171 | loss: 2.0341241359710693 | dt: 1450.44ms | tok/sec: 2823.97 | norm: 4.48\n", "step2172 | loss: 1.9250370264053345 | dt: 1454.24ms | tok/sec: 2816.59 | norm: 4.45\n", "step2173 | loss: 2.4264330863952637 | dt: 1437.58ms | tok/sec: 2849.23 | norm: 5.27\n", "step2174 | loss: 2.257539749145508 | dt: 1459.12ms | tok/sec: 2807.18 | norm: 4.55\n", "step2175 | loss: 2.2179064750671387 | dt: 1453.43ms | tok/sec: 2818.17 | norm: 4.53\n", "step2176 | loss: 2.2830755710601807 | dt: 1451.49ms | tok/sec: 2821.93 | norm: 4.65\n", "step2177 | loss: 2.4276905059814453 | dt: 1455.16ms | tok/sec: 2814.80 | norm: 5.21\n", "step2178 | loss: 2.286127805709839 | dt: 1454.04ms | tok/sec: 2816.98 | norm: 4.50\n", "step2179 | loss: 2.06480073928833 | dt: 1453.91ms | tok/sec: 2817.23 | norm: 
4.15\n", "step2180 | loss: 2.0873122215270996 | dt: 1453.46ms | tok/sec: 2818.09 | norm: 4.15\n", "step2181 | loss: 2.2598631381988525 | dt: 1447.16ms | tok/sec: 2830.36 | norm: 4.43\n", "step2182 | loss: 2.1300594806671143 | dt: 1454.03ms | tok/sec: 2816.99 | norm: 4.46\n", "step2183 | loss: 2.164055109024048 | dt: 1455.32ms | tok/sec: 2814.50 | norm: 4.76\n", "step2184 | loss: 2.053389549255371 | dt: 1448.64ms | tok/sec: 2827.47 | norm: 4.39\n", "step2185 | loss: 1.9886503219604492 | dt: 1451.40ms | tok/sec: 2822.10 | norm: 4.18\n", "step2186 | loss: 1.9587160348892212 | dt: 1440.59ms | tok/sec: 2843.27 | norm: 3.95\n", "step2187 | loss: 2.180896043777466 | dt: 1458.92ms | tok/sec: 2807.56 | norm: 5.56\n", "step2188 | loss: 2.044384479522705 | dt: 1447.47ms | tok/sec: 2829.76 | norm: 4.91\n", "step2189 | loss: 1.9549038410186768 | dt: 1454.11ms | tok/sec: 2816.85 | norm: 4.25\n", "step2190 | loss: 1.8726319074630737 | dt: 1453.16ms | tok/sec: 2818.68 | norm: 3.96\n", "step2191 | loss: 1.8398516178131104 | dt: 1451.94ms | tok/sec: 2821.06 | norm: 3.90\n", "step2192 | loss: 2.2788190841674805 | dt: 1440.44ms | tok/sec: 2843.57 | norm: 4.30\n", "step2193 | loss: 2.278754711151123 | dt: 1452.13ms | tok/sec: 2820.69 | norm: 5.01\n", "step2194 | loss: 2.0618808269500732 | dt: 1445.34ms | tok/sec: 2833.93 | norm: 4.82\n", "step2195 | loss: 1.89663565158844 | dt: 1445.65ms | tok/sec: 2833.32 | norm: 3.98\n", "step2196 | loss: 1.887113094329834 | dt: 1456.17ms | tok/sec: 2812.86 | norm: 3.77\n", "step2197 | loss: 1.9749794006347656 | dt: 1451.44ms | tok/sec: 2822.03 | norm: 3.96\n", "step2198 | loss: 1.8208422660827637 | dt: 1459.69ms | tok/sec: 2806.08 | norm: 3.73\n", "step2199 | loss: 1.7556222677230835 | dt: 1454.50ms | tok/sec: 2816.09 | norm: 3.73\n", "step2200 | loss: 1.7011007070541382 | dt: 1452.75ms | tok/sec: 2819.49 | norm: 3.36\n", "step2201 | loss: 2.2781293392181396 | dt: 1449.38ms | tok/sec: 2826.04 | norm: 4.49\n", "step2202 | loss: 1.9702926874160767 | 
dt: 1449.52ms | tok/sec: 2825.76 | norm: 3.89\n", "step2203 | loss: 2.4193198680877686 | dt: 1459.20ms | tok/sec: 2807.02 | norm: 4.82\n", "step2204 | loss: 2.154370069503784 | dt: 1447.86ms | tok/sec: 2829.01 | norm: 4.26\n", "step2205 | loss: 2.219050168991089 | dt: 1450.59ms | tok/sec: 2823.68 | norm: 4.36\n", "step2206 | loss: 2.1971330642700195 | dt: 1449.69ms | tok/sec: 2825.42 | norm: 4.31\n", "step2207 | loss: 2.0521042346954346 | dt: 1441.98ms | tok/sec: 2840.54 | norm: 4.69\n", "step2208 | loss: 1.8591368198394775 | dt: 1457.61ms | tok/sec: 2810.08 | norm: 4.25\n", "step2209 | loss: 1.877807378768921 | dt: 1446.23ms | tok/sec: 2832.20 | norm: 3.97\n", "step2210 | loss: 1.8589882850646973 | dt: 1449.22ms | tok/sec: 2826.34 | norm: 3.78\n", "step2211 | loss: 1.8532943725585938 | dt: 1450.92ms | tok/sec: 2823.04 | norm: 3.86\n", "step2212 | loss: 2.040586471557617 | dt: 1450.72ms | tok/sec: 2823.42 | norm: 4.12\n", "step2213 | loss: 2.094449996948242 | dt: 1447.04ms | tok/sec: 2830.62 | norm: 4.01\n", "step2214 | loss: 2.044459581375122 | dt: 1439.84ms | tok/sec: 2844.76 | norm: 4.26\n", "step2215 | loss: 1.8886593580245972 | dt: 1450.58ms | tok/sec: 2823.70 | norm: 4.03\n", "step2216 | loss: 1.9568378925323486 | dt: 1457.23ms | tok/sec: 2810.81 | norm: 4.19\n", "step2217 | loss: 1.8959386348724365 | dt: 1449.09ms | tok/sec: 2826.59 | norm: 4.04\n", "step2218 | loss: 1.9524290561676025 | dt: 1443.32ms | tok/sec: 2837.91 | norm: 3.82\n", "step2219 | loss: 1.7172602415084839 | dt: 1457.98ms | tok/sec: 2809.36 | norm: 3.72\n", "step2220 | loss: 1.728007197380066 | dt: 1453.87ms | tok/sec: 2817.31 | norm: 3.87\n", "step2221 | loss: 2.028167247772217 | dt: 1451.18ms | tok/sec: 2822.52 | norm: 4.03\n", "step2222 | loss: 1.8184196949005127 | dt: 1453.20ms | tok/sec: 2818.61 | norm: 4.23\n", "step2223 | loss: 1.8785803318023682 | dt: 1453.57ms | tok/sec: 2817.88 | norm: 4.19\n", "step2224 | loss: 1.6559967994689941 | dt: 1452.20ms | tok/sec: 2820.55 | norm: 3.95\n", 
"step2225 | loss: 2.10683536529541 | dt: 1452.02ms | tok/sec: 2820.89 | norm: 4.18\n", "step2226 | loss: 2.068601131439209 | dt: 1455.88ms | tok/sec: 2813.42 | norm: 4.13\n", "step2227 | loss: 2.092600107192993 | dt: 1448.93ms | tok/sec: 2826.92 | norm: 4.30\n", "step2228 | loss: 2.01369571685791 | dt: 1453.11ms | tok/sec: 2818.77 | norm: 4.06\n", "step2229 | loss: 1.9944844245910645 | dt: 1450.27ms | tok/sec: 2824.30 | norm: 4.51\n", "step2230 | loss: 2.0639119148254395 | dt: 1450.07ms | tok/sec: 2824.68 | norm: 4.50\n", "step2231 | loss: 2.0333826541900635 | dt: 1440.50ms | tok/sec: 2843.47 | norm: 4.51\n", "step2232 | loss: 2.0020744800567627 | dt: 1452.44ms | tok/sec: 2820.08 | norm: 4.11\n", "step2233 | loss: 1.8082317113876343 | dt: 1450.22ms | tok/sec: 2824.40 | norm: 3.81\n", "step2234 | loss: 1.646148920059204 | dt: 1449.91ms | tok/sec: 2825.01 | norm: 3.83\n", "step2235 | loss: 1.9969369173049927 | dt: 1445.45ms | tok/sec: 2833.72 | norm: 5.12\n", "step2236 | loss: 2.1153340339660645 | dt: 1442.93ms | tok/sec: 2838.66 | norm: 5.51\n", "step2237 | loss: 2.124811887741089 | dt: 1445.21ms | tok/sec: 2834.18 | norm: 4.98\n", "step2238 | loss: 1.9772003889083862 | dt: 1453.59ms | tok/sec: 2817.86 | norm: 4.59\n", "step2239 | loss: 1.8696794509887695 | dt: 1452.48ms | tok/sec: 2820.00 | norm: 4.30\n", "step2240 | loss: 1.9394477605819702 | dt: 1443.31ms | tok/sec: 2837.91 | norm: 4.04\n", "step2241 | loss: 1.8494600057601929 | dt: 1454.72ms | tok/sec: 2815.66 | norm: 4.13\n", "step2242 | loss: 1.799214482307434 | dt: 1452.65ms | tok/sec: 2819.68 | norm: 3.99\n", "step2243 | loss: 1.842801570892334 | dt: 1446.33ms | tok/sec: 2831.99 | norm: 3.87\n", "step2244 | loss: 2.024428367614746 | dt: 1441.84ms | tok/sec: 2840.82 | norm: 4.13\n", "step2245 | loss: 2.102118492126465 | dt: 1447.20ms | tok/sec: 2830.30 | norm: 4.46\n", "step2246 | loss: 1.9947172403335571 | dt: 1452.83ms | tok/sec: 2819.33 | norm: 4.24\n", "step2247 | loss: 2.115513324737549 | dt: 1444.99ms | 
tok/sec: 2834.63 | norm: 4.15\n", "step2248 | loss: 1.9002997875213623 | dt: 1446.03ms | tok/sec: 2832.59 | norm: 4.00\n", "step2249 | loss: 1.9134525060653687 | dt: 1446.65ms | tok/sec: 2831.38 | norm: 4.26\n", "step2250 | loss: 1.760016679763794 | dt: 1452.51ms | tok/sec: 2819.96 | norm: 3.81\n", "step2251 | loss: 1.9139543771743774 | dt: 1441.96ms | tok/sec: 2840.58 | norm: 4.16\n", "step2252 | loss: 1.8283716440200806 | dt: 1455.22ms | tok/sec: 2814.70 | norm: 4.09\n", "step2253 | loss: 1.9040021896362305 | dt: 1452.10ms | tok/sec: 2820.74 | norm: 4.08\n", "step2254 | loss: 1.8117672204971313 | dt: 1453.24ms | tok/sec: 2818.53 | norm: 3.86\n", "step2255 | loss: 2.295099973678589 | dt: 1442.16ms | tok/sec: 2840.18 | norm: 4.77\n", "step2256 | loss: 2.1188127994537354 | dt: 1447.11ms | tok/sec: 2830.47 | norm: 4.56\n", "step2257 | loss: 2.0625948905944824 | dt: 1450.69ms | tok/sec: 2823.49 | norm: 4.41\n", "step2258 | loss: 2.1256470680236816 | dt: 1448.12ms | tok/sec: 2828.49 | norm: 4.15\n", "step2259 | loss: 2.290790319442749 | dt: 1457.29ms | tok/sec: 2810.70 | norm: 4.88\n", "step2260 | loss: 2.134263277053833 | dt: 1453.49ms | tok/sec: 2818.04 | norm: 4.43\n", "step2261 | loss: 1.917689323425293 | dt: 1451.89ms | tok/sec: 2821.15 | norm: 4.20\n", "step2262 | loss: 1.948123574256897 | dt: 1452.51ms | tok/sec: 2819.95 | norm: 4.45\n", "step2263 | loss: 2.108781099319458 | dt: 1452.70ms | tok/sec: 2819.58 | norm: 4.30\n", "step2264 | loss: 1.975691556930542 | dt: 1452.54ms | tok/sec: 2819.89 | norm: 4.13\n", "step2265 | loss: 2.0247905254364014 | dt: 1441.43ms | tok/sec: 2841.62 | norm: 4.37\n", "step2266 | loss: 1.948670506477356 | dt: 1447.69ms | tok/sec: 2829.33 | norm: 5.39\n", "step2267 | loss: 1.8666530847549438 | dt: 1438.83ms | tok/sec: 2846.76 | norm: 4.40\n", "step2268 | loss: 1.8328913450241089 | dt: 1448.50ms | tok/sec: 2827.75 | norm: 4.16\n", "step2269 | loss: 2.058659553527832 | dt: 1450.73ms | tok/sec: 2823.40 | norm: 5.28\n", "step2270 | loss: 
1.9164977073669434 | dt: 1453.62ms | tok/sec: 2817.79 | norm: 4.53\n", "step2271 | loss: 1.8324296474456787 | dt: 1452.46ms | tok/sec: 2820.04 | norm: 4.25\n", "step2272 | loss: 1.7526136636734009 | dt: 1444.26ms | tok/sec: 2836.05 | norm: 4.31\n", "step2273 | loss: 1.739931583404541 | dt: 1440.53ms | tok/sec: 2843.40 | norm: 4.47\n", "step2274 | loss: 2.174868106842041 | dt: 1452.95ms | tok/sec: 2819.10 | norm: 4.76\n", "step2275 | loss: 2.1657516956329346 | dt: 1454.04ms | tok/sec: 2816.98 | norm: 4.74\n", "step2276 | loss: 1.9379369020462036 | dt: 1451.79ms | tok/sec: 2821.34 | norm: 4.44\n", "step2277 | loss: 1.7662729024887085 | dt: 1442.52ms | tok/sec: 2839.48 | norm: 3.92\n", "step2278 | loss: 1.7540959119796753 | dt: 1452.94ms | tok/sec: 2819.11 | norm: 3.88\n", "step2279 | loss: 1.8639588356018066 | dt: 1450.25ms | tok/sec: 2824.33 | norm: 4.28\n", "step2280 | loss: 1.6937897205352783 | dt: 1442.82ms | tok/sec: 2838.89 | norm: 4.13\n", "step2281 | loss: 1.6349053382873535 | dt: 1448.16ms | tok/sec: 2828.42 | norm: 3.90\n", "step2282 | loss: 1.5732630491256714 | dt: 1454.02ms | tok/sec: 2817.02 | norm: 3.23\n", "step2283 | loss: 2.116758108139038 | dt: 1452.49ms | tok/sec: 2819.99 | norm: 4.07\n", "step2284 | loss: 1.8124632835388184 | dt: 1452.36ms | tok/sec: 2820.23 | norm: 3.66\n", "step2285 | loss: 2.260835886001587 | dt: 1440.46ms | tok/sec: 2843.54 | norm: 4.99\n", "step2286 | loss: 2.0247857570648193 | dt: 1442.85ms | tok/sec: 2838.83 | norm: 4.50\n", "step2287 | loss: 2.0830297470092773 | dt: 1451.30ms | tok/sec: 2822.30 | norm: 4.61\n", "step2288 | loss: 2.067338466644287 | dt: 1452.81ms | tok/sec: 2819.37 | norm: 4.37\n", "step2289 | loss: 1.9186631441116333 | dt: 1440.73ms | tok/sec: 2843.01 | norm: 4.42\n", "step2290 | loss: 1.7429226636886597 | dt: 1450.89ms | tok/sec: 2823.10 | norm: 4.12\n", "step2291 | loss: 1.7814242839813232 | dt: 1447.66ms | tok/sec: 2829.39 | norm: 4.30\n", "step2292 | loss: 1.7524467706680298 | dt: 1448.68ms | tok/sec: 
2827.40 | norm: 4.20\n", "step2293 | loss: 1.7286794185638428 | dt: 1454.81ms | tok/sec: 2815.48 | norm: 4.00\n", "step2294 | loss: 1.914747953414917 | dt: 1451.84ms | tok/sec: 2821.25 | norm: 4.29\n", "step2295 | loss: 1.93681800365448 | dt: 1451.82ms | tok/sec: 2821.30 | norm: 3.73\n", "step2296 | loss: 1.8842568397521973 | dt: 1441.00ms | tok/sec: 2842.47 | norm: 3.84\n", "step2297 | loss: 1.758996605873108 | dt: 1445.45ms | tok/sec: 2833.72 | norm: 4.06\n", "step2298 | loss: 1.8544689416885376 | dt: 1452.00ms | tok/sec: 2820.93 | norm: 4.48\n", "step2299 | loss: 1.7800524234771729 | dt: 1452.56ms | tok/sec: 2819.85 | norm: 4.37\n", "step2300 | loss: 1.8033356666564941 | dt: 1453.93ms | tok/sec: 2817.18 | norm: 4.13\n", "step2301 | loss: 1.6028285026550293 | dt: 1443.76ms | tok/sec: 2837.04 | norm: 4.07\n", "step2302 | loss: 1.5983110666275024 | dt: 1446.65ms | tok/sec: 2831.36 | norm: 3.93\n", "step2303 | loss: 1.8910988569259644 | dt: 1446.75ms | tok/sec: 2831.17 | norm: 4.40\n", "step2304 | loss: 1.6930530071258545 | dt: 1452.48ms | tok/sec: 2820.01 | norm: 3.90\n", "step2305 | loss: 1.7411566972732544 | dt: 1451.81ms | tok/sec: 2821.31 | norm: 3.58\n", "step2306 | loss: 1.545021891593933 | dt: 1453.46ms | tok/sec: 2818.11 | norm: 3.80\n", "step2307 | loss: 1.98597252368927 | dt: 1443.19ms | tok/sec: 2838.16 | norm: 4.45\n", "step2308 | loss: 1.9359910488128662 | dt: 1445.95ms | tok/sec: 2832.74 | norm: 4.34\n", "step2309 | loss: 1.952690601348877 | dt: 1450.43ms | tok/sec: 2823.99 | norm: 4.38\n", "step2310 | loss: 1.9109926223754883 | dt: 1447.12ms | tok/sec: 2830.44 | norm: 4.44\n", "step2311 | loss: 1.8947163820266724 | dt: 1450.20ms | tok/sec: 2824.44 | norm: 4.85\n", "step2312 | loss: 1.9214545488357544 | dt: 1448.83ms | tok/sec: 2827.10 | norm: 4.17\n", "step2313 | loss: 1.8827316761016846 | dt: 1453.18ms | tok/sec: 2818.64 | norm: 3.95\n", "step2314 | loss: 1.8459807634353638 | dt: 1451.99ms | tok/sec: 2820.95 | norm: 4.06\n", "step2315 | loss: 
1.6790515184402466 | dt: 1449.91ms | tok/sec: 2825.00 | norm: 3.86\n", "step2316 | loss: 1.5293729305267334 | dt: 1443.11ms | tok/sec: 2838.32 | norm: 3.94\n", "step2317 | loss: 1.8729000091552734 | dt: 1444.78ms | tok/sec: 2835.03 | norm: 4.57\n", "step2318 | loss: 1.9820619821548462 | dt: 1452.13ms | tok/sec: 2820.68 | norm: 4.49\n", "step2319 | loss: 1.9906723499298096 | dt: 1445.72ms | tok/sec: 2833.19 | norm: 4.46\n", "step2320 | loss: 1.8500267267227173 | dt: 1444.67ms | tok/sec: 2835.26 | norm: 4.52\n", "step2321 | loss: 1.7525075674057007 | dt: 1452.25ms | tok/sec: 2820.44 | norm: 4.95\n", "step2322 | loss: 1.7963370084762573 | dt: 1452.30ms | tok/sec: 2820.35 | norm: 4.27\n", "step2323 | loss: 1.7035599946975708 | dt: 1436.30ms | tok/sec: 2851.78 | norm: 4.20\n", "step2324 | loss: 1.6603925228118896 | dt: 1454.24ms | tok/sec: 2816.59 | norm: 4.38\n", "step2325 | loss: 1.7057156562805176 | dt: 1452.79ms | tok/sec: 2819.40 | norm: 4.25\n", "step2326 | loss: 1.8737126588821411 | dt: 1445.98ms | tok/sec: 2832.68 | norm: 4.02\n", "step2327 | loss: 1.9317286014556885 | dt: 1446.51ms | tok/sec: 2831.64 | norm: 4.17\n", "step2328 | loss: 1.835050106048584 | dt: 1445.33ms | tok/sec: 2833.95 | norm: 3.98\n", "step2329 | loss: 1.9497987031936646 | dt: 1442.17ms | tok/sec: 2840.16 | norm: 4.16\n", "step2330 | loss: 1.7692300081253052 | dt: 1450.30ms | tok/sec: 2824.24 | norm: 3.96\n", "step2331 | loss: 1.7517300844192505 | dt: 1452.54ms | tok/sec: 2819.88 | norm: 4.36\n", "step2332 | loss: 1.614229679107666 | dt: 1441.48ms | tok/sec: 2841.53 | norm: 4.01\n", "step2333 | loss: 1.7606457471847534 | dt: 1451.67ms | tok/sec: 2821.57 | norm: 4.05\n", "step2334 | loss: 1.6857949495315552 | dt: 1453.43ms | tok/sec: 2818.16 | norm: 3.92\n", "step2335 | loss: 1.7625232934951782 | dt: 1437.04ms | tok/sec: 2850.30 | norm: 3.85\n", "step2336 | loss: 1.6594853401184082 | dt: 1449.07ms | tok/sec: 2826.64 | norm: 3.66\n", "step2337 | loss: 2.135178804397583 | dt: 1438.97ms | 
tok/sec: 2846.48 | norm: 4.74\n", "step2338 | loss: 1.9647594690322876 | dt: 1450.19ms | tok/sec: 2824.45 | norm: 4.44\n", "step2339 | loss: 1.9198414087295532 | dt: 1453.25ms | tok/sec: 2818.51 | norm: 4.28\n", "step2340 | loss: 1.9577149152755737 | dt: 1448.38ms | tok/sec: 2827.98 | norm: 3.90\n", "step2341 | loss: 2.162044048309326 | dt: 1445.32ms | tok/sec: 2833.98 | norm: 4.70\n", "step2342 | loss: 1.9876433610916138 | dt: 1446.36ms | tok/sec: 2831.93 | norm: 4.34\n", "step2343 | loss: 1.7631951570510864 | dt: 1449.68ms | tok/sec: 2825.46 | norm: 3.98\n", "step2344 | loss: 1.8127027750015259 | dt: 1442.03ms | tok/sec: 2840.43 | norm: 4.48\n", "step2345 | loss: 1.9503133296966553 | dt: 1447.52ms | tok/sec: 2829.68 | norm: 4.53\n", "step2346 | loss: 1.8294658660888672 | dt: 1455.91ms | tok/sec: 2813.36 | norm: 4.45\n", "step2347 | loss: 1.9280619621276855 | dt: 1450.20ms | tok/sec: 2824.43 | norm: 4.79\n", "step2348 | loss: 1.8189836740493774 | dt: 1440.79ms | tok/sec: 2842.88 | norm: 4.39\n", "step2349 | loss: 1.7269326448440552 | dt: 1450.22ms | tok/sec: 2824.41 | norm: 3.94\n", "step2350 | loss: 1.6945915222167969 | dt: 1448.09ms | tok/sec: 2828.55 | norm: 3.66\n", "step2351 | loss: 1.9598686695098877 | dt: 1448.80ms | tok/sec: 2827.18 | norm: 6.32\n", "step2352 | loss: 1.8148558139801025 | dt: 1452.42ms | tok/sec: 2820.11 | norm: 5.12\n", "step2353 | loss: 1.7121814489364624 | dt: 1454.12ms | tok/sec: 2816.83 | norm: 4.84\n", "step2354 | loss: 1.6040068864822388 | dt: 1441.85ms | tok/sec: 2840.80 | norm: 4.13\n", "step2355 | loss: 1.6129326820373535 | dt: 1445.94ms | tok/sec: 2832.75 | norm: 4.38\n", "step2356 | loss: 2.0300440788269043 | dt: 1451.49ms | tok/sec: 2821.92 | norm: 4.46\n", "step2357 | loss: 2.0211732387542725 | dt: 1451.16ms | tok/sec: 2822.58 | norm: 5.01\n", "step2358 | loss: 1.806550145149231 | dt: 1448.71ms | tok/sec: 2827.34 | norm: 4.87\n", "step2359 | loss: 1.6411967277526855 | dt: 1453.49ms | tok/sec: 2818.04 | norm: 4.32\n", "step2360 
| loss: 1.6382660865783691 | dt: 1437.80ms | tok/sec: 2848.80 | norm: 3.87\n", "step2361 | loss: 1.7406048774719238 | dt: 1447.65ms | tok/sec: 2829.40 | norm: 4.09\n", "step2362 | loss: 1.5596435070037842 | dt: 1440.14ms | tok/sec: 2844.18 | norm: 3.98\n", "step2363 | loss: 1.5083364248275757 | dt: 1449.69ms | tok/sec: 2825.43 | norm: 4.07\n", "step2364 | loss: 1.4384719133377075 | dt: 1454.35ms | tok/sec: 2816.38 | norm: 3.76\n", "step2365 | loss: 2.0151851177215576 | dt: 1452.04ms | tok/sec: 2820.86 | norm: 4.73\n", "step2366 | loss: 1.686078667640686 | dt: 1453.85ms | tok/sec: 2817.35 | norm: 3.90\n", "step2367 | loss: 2.165253162384033 | dt: 1449.46ms | tok/sec: 2825.88 | norm: 5.05\n", "step2368 | loss: 1.89961838722229 | dt: 1447.48ms | tok/sec: 2829.74 | norm: 4.39\n", "step2369 | loss: 1.9692294597625732 | dt: 1440.60ms | tok/sec: 2843.26 | norm: 4.64\n", "step2370 | loss: 1.946481466293335 | dt: 1451.47ms | tok/sec: 2821.98 | norm: 4.49\n", "step2371 | loss: 1.8158645629882812 | dt: 1450.87ms | tok/sec: 2823.14 | norm: 4.66\n", "step2372 | loss: 1.6149976253509521 | dt: 1445.94ms | tok/sec: 2832.76 | norm: 3.81\n", "step2373 | loss: 1.6460492610931396 | dt: 1444.51ms | tok/sec: 2835.56 | norm: 4.02\n", "step2374 | loss: 1.638087272644043 | dt: 1437.28ms | tok/sec: 2849.82 | norm: 4.31\n", "step2375 | loss: 1.6166325807571411 | dt: 1443.47ms | tok/sec: 2837.61 | norm: 4.04\n", "step2376 | loss: 1.8071461915969849 | dt: 1445.02ms | tok/sec: 2834.57 | norm: 4.65\n", "step2377 | loss: 1.819286584854126 | dt: 1455.11ms | tok/sec: 2814.91 | norm: 4.18\n", "step2378 | loss: 1.7536975145339966 | dt: 1443.07ms | tok/sec: 2838.39 | norm: 4.15\n", "step2379 | loss: 1.666953682899475 | dt: 1451.25ms | tok/sec: 2822.39 | norm: 4.54\n", "step2380 | loss: 1.7191451787948608 | dt: 1457.26ms | tok/sec: 2810.75 | norm: 4.07\n", "step2381 | loss: 1.6530646085739136 | dt: 1447.18ms | tok/sec: 2830.33 | norm: 4.03\n", "step2382 | loss: 1.7062090635299683 | dt: 1449.22ms | 
tok/sec: 2826.36 | norm: 4.11\n", "step2383 | loss: 1.4791666269302368 | dt: 1448.47ms | tok/sec: 2827.81 | norm: 4.02\n", "step2384 | loss: 1.4742286205291748 | dt: 1450.19ms | tok/sec: 2824.45 | norm: 4.17\n", "step2385 | loss: 1.7629331350326538 | dt: 1453.88ms | tok/sec: 2817.29 | norm: 4.14\n", "step2386 | loss: 1.5657777786254883 | dt: 1453.34ms | tok/sec: 2818.34 | norm: 4.18\n", "step2387 | loss: 1.5956157445907593 | dt: 1444.38ms | tok/sec: 2835.82 | norm: 3.76\n", "step2388 | loss: 1.4075416326522827 | dt: 1443.27ms | tok/sec: 2838.00 | norm: 3.87\n", "step2389 | loss: 1.8464046716690063 | dt: 1450.63ms | tok/sec: 2823.59 | norm: 4.20\n", "step2390 | loss: 1.815690517425537 | dt: 1455.48ms | tok/sec: 2814.20 | norm: 4.35\n", "step2391 | loss: 1.8040225505828857 | dt: 1453.06ms | tok/sec: 2818.88 | norm: 3.92\n", "step2392 | loss: 1.766324520111084 | dt: 1448.91ms | tok/sec: 2826.96 | norm: 4.10\n", "step2393 | loss: 1.7536314725875854 | dt: 1453.71ms | tok/sec: 2817.61 | norm: 4.78\n", "step2394 | loss: 1.7721710205078125 | dt: 1453.77ms | tok/sec: 2817.49 | norm: 4.15\n", "step2395 | loss: 1.7547202110290527 | dt: 1449.05ms | tok/sec: 2826.68 | norm: 4.28\n", "step2396 | loss: 1.6987789869308472 | dt: 1437.85ms | tok/sec: 2848.69 | norm: 4.06\n", "step2397 | loss: 1.5395909547805786 | dt: 1442.78ms | tok/sec: 2838.97 | norm: 3.72\n", "step2398 | loss: 1.384803056716919 | dt: 1447.85ms | tok/sec: 2829.01 | norm: 3.60\n", "step2399 | loss: 1.7431296110153198 | dt: 1448.34ms | tok/sec: 2828.06 | norm: 4.27\n", "step2400 | loss: 1.8382668495178223 | dt: 1458.10ms | tok/sec: 2809.13 | norm: 4.26\n", "step2401 | loss: 1.862168312072754 | dt: 1449.64ms | tok/sec: 2825.54 | norm: 4.32\n", "step2402 | loss: 1.7118881940841675 | dt: 1456.52ms | tok/sec: 2812.19 | norm: 4.66\n", "step2403 | loss: 1.600861668586731 | dt: 1454.12ms | tok/sec: 2816.82 | norm: 4.20\n", "step2404 | loss: 1.6487312316894531 | dt: 1455.07ms | tok/sec: 2814.98 | norm: 4.00\n", "step2405 | 
loss: 1.5369305610656738 | dt: 1451.51ms | tok/sec: 2821.88 | norm: 3.75\n", "step2406 | loss: 1.5168532133102417 | dt: 1447.82ms | tok/sec: 2829.08 | norm: 4.31\n", "step2407 | loss: 1.5760107040405273 | dt: 1453.65ms | tok/sec: 2817.74 | norm: 4.34\n", "step2408 | loss: 1.7569619417190552 | dt: 1453.60ms | tok/sec: 2817.84 | norm: 4.87\n", "step2409 | loss: 1.8207486867904663 | dt: 1454.32ms | tok/sec: 2816.44 | norm: 4.78\n", "step2410 | loss: 1.7348504066467285 | dt: 1453.04ms | tok/sec: 2818.91 | norm: 4.85\n", "step2411 | loss: 1.8024821281433105 | dt: 1453.40ms | tok/sec: 2818.22 | norm: 4.14\n", "step2412 | loss: 1.6044517755508423 | dt: 1453.56ms | tok/sec: 2817.90 | norm: 3.43\n", "step2413 | loss: 1.5902587175369263 | dt: 1453.34ms | tok/sec: 2818.34 | norm: 3.73\n", "step2414 | loss: 1.4845523834228516 | dt: 1442.96ms | tok/sec: 2838.62 | norm: 4.12\n", "step2415 | loss: 1.5978364944458008 | dt: 1455.77ms | tok/sec: 2813.63 | norm: 3.81\n", "step2416 | loss: 1.5304898023605347 | dt: 1447.25ms | tok/sec: 2830.20 | norm: 3.86\n", "step2417 | loss: 1.6221897602081299 | dt: 1452.49ms | tok/sec: 2819.98 | norm: 4.14\n", "step2418 | loss: 1.5544590950012207 | dt: 1449.45ms | tok/sec: 2825.91 | norm: 4.34\n", "step2419 | loss: 2.018544912338257 | dt: 1452.65ms | tok/sec: 2819.67 | norm: 5.11\n", "step2420 | loss: 1.8798530101776123 | dt: 1458.30ms | tok/sec: 2808.76 | norm: 4.98\n", "step2421 | loss: 1.8019599914550781 | dt: 1451.10ms | tok/sec: 2822.68 | norm: 4.29\n", "step2422 | loss: 1.8130394220352173 | dt: 1458.91ms | tok/sec: 2807.57 | norm: 3.87\n", "step2423 | loss: 1.9969813823699951 | dt: 1446.57ms | tok/sec: 2831.52 | norm: 4.58\n", "step2424 | loss: 1.85893714427948 | dt: 1452.29ms | tok/sec: 2820.38 | norm: 4.75\n", "step2425 | loss: 1.6183245182037354 | dt: 1448.88ms | tok/sec: 2827.02 | norm: 3.91\n", "step2426 | loss: 1.6773314476013184 | dt: 1453.99ms | tok/sec: 2817.08 | norm: 4.06\n", "step2427 | loss: 1.7939426898956299 | dt: 1447.87ms | 
tok/sec: 2828.98 | norm: 3.90\n", "step2428 | loss: 1.6828089952468872 | dt: 1449.37ms | tok/sec: 2826.06 | norm: 4.17\n", "step2429 | loss: 1.7937896251678467 | dt: 1446.00ms | tok/sec: 2832.64 | norm: 4.60\n", "step2430 | loss: 1.6912707090377808 | dt: 1458.21ms | tok/sec: 2808.92 | norm: 4.95\n", "step2431 | loss: 1.5939662456512451 | dt: 1448.97ms | tok/sec: 2826.84 | norm: 4.24\n", "step2432 | loss: 1.5466444492340088 | dt: 1456.11ms | tok/sec: 2812.97 | norm: 3.82\n", "step2433 | loss: 1.849073052406311 | dt: 1448.97ms | tok/sec: 2826.83 | norm: 4.60\n", "step2434 | loss: 1.6783514022827148 | dt: 1450.64ms | tok/sec: 2823.59 | norm: 4.21\n", "step2435 | loss: 1.5905433893203735 | dt: 1457.97ms | tok/sec: 2809.38 | norm: 4.15\n", "step2436 | loss: 1.4688082933425903 | dt: 1449.56ms | tok/sec: 2825.68 | norm: 4.00\n", "step2437 | loss: 1.4590847492218018 | dt: 1454.79ms | tok/sec: 2815.53 | norm: 4.00\n", "step2438 | loss: 1.8602937459945679 | dt: 1458.61ms | tok/sec: 2808.15 | norm: 4.52\n", "step2439 | loss: 1.901569128036499 | dt: 1450.48ms | tok/sec: 2823.90 | norm: 5.35\n", "step2440 | loss: 1.6850544214248657 | dt: 1451.98ms | tok/sec: 2820.97 | norm: 4.81\n", "step2441 | loss: 1.5059958696365356 | dt: 1451.83ms | tok/sec: 2821.26 | norm: 3.97\n", "step2442 | loss: 1.4991182088851929 | dt: 1452.63ms | tok/sec: 2819.72 | norm: 3.75\n", "step2443 | loss: 1.587546467781067 | dt: 1454.22ms | tok/sec: 2816.64 | norm: 3.93\n", "step2444 | loss: 1.4056382179260254 | dt: 1453.30ms | tok/sec: 2818.41 | norm: 3.84\n", "step2445 | loss: 1.3553385734558105 | dt: 1453.21ms | tok/sec: 2818.58 | norm: 3.50\n", "step2446 | loss: 1.2979568243026733 | dt: 1453.65ms | tok/sec: 2817.73 | norm: 3.26\n", "step2447 | loss: 1.8686509132385254 | dt: 1450.02ms | tok/sec: 2824.79 | norm: 4.69\n", "step2448 | loss: 1.5453035831451416 | dt: 1453.77ms | tok/sec: 2817.51 | norm: 4.07\n", "step2449 | loss: 2.027157783508301 | dt: 1458.70ms | tok/sec: 2807.98 | norm: 5.16\n", "step2450 | 
loss: 1.8055319786071777 | dt: 1448.16ms | tok/sec: 2828.43 | norm: 4.72\n", "step2451 | loss: 1.8693602085113525 | dt: 1452.49ms | tok/sec: 2819.99 | norm: 4.72\n", "step2452 | loss: 1.8313157558441162 | dt: 1453.13ms | tok/sec: 2818.75 | norm: 4.58\n", "step2453 | loss: 1.7217528820037842 | dt: 1448.80ms | tok/sec: 2827.18 | norm: 5.20\n", "step2454 | loss: 1.5006097555160522 | dt: 1454.59ms | tok/sec: 2815.91 | norm: 4.10\n", "step2455 | loss: 1.5106284618377686 | dt: 1456.03ms | tok/sec: 2813.14 | norm: 3.87\n", "step2456 | loss: 1.4655017852783203 | dt: 1453.58ms | tok/sec: 2817.88 | norm: 3.54\n", "step2457 | loss: 1.4517147541046143 | dt: 1449.94ms | tok/sec: 2824.95 | norm: 3.74\n", "step2458 | loss: 1.6348000764846802 | dt: 1455.08ms | tok/sec: 2814.96 | norm: 4.18\n", "step2459 | loss: 1.6784216165542603 | dt: 1453.21ms | tok/sec: 2818.58 | norm: 4.26\n", "step2460 | loss: 1.609527349472046 | dt: 1457.60ms | tok/sec: 2810.10 | norm: 4.03\n", "step2461 | loss: 1.527980923652649 | dt: 1456.38ms | tok/sec: 2812.45 | norm: 4.01\n", "step2462 | loss: 1.5818957090377808 | dt: 1449.72ms | tok/sec: 2825.37 | norm: 4.34\n", "step2463 | loss: 1.5428587198257446 | dt: 1463.40ms | tok/sec: 2798.96 | norm: 4.93\n", "step2464 | loss: 1.5661265850067139 | dt: 1444.36ms | tok/sec: 2835.86 | norm: 4.37\n", "step2465 | loss: 1.3600953817367554 | dt: 1458.00ms | tok/sec: 2809.32 | norm: 4.07\n", "step2466 | loss: 1.3574161529541016 | dt: 1454.47ms | tok/sec: 2816.14 | norm: 4.10\n", "step2467 | loss: 1.635671854019165 | dt: 1455.26ms | tok/sec: 2814.61 | norm: 4.10\n", "step2468 | loss: 1.4410537481307983 | dt: 1458.81ms | tok/sec: 2807.77 | norm: 4.01\n", "step2469 | loss: 1.4562835693359375 | dt: 1449.36ms | tok/sec: 2826.08 | norm: 3.47\n", "step2470 | loss: 1.273439645767212 | dt: 1462.60ms | tok/sec: 2800.50 | norm: 3.62\n", "step2471 | loss: 1.7068240642547607 | dt: 1453.48ms | tok/sec: 2818.07 | norm: 4.12\n", "step2472 | loss: 1.647735357284546 | dt: 1449.59ms | 
tok/sec: 2825.63 | norm: 3.83\n", "step2473 | loss: 1.6287541389465332 | dt: 1446.65ms | tok/sec: 2831.37 | norm: 3.79\n", "step2474 | loss: 1.5944780111312866 | dt: 1449.44ms | tok/sec: 2825.92 | norm: 3.68\n", "step2475 | loss: 1.607010006904602 | dt: 1461.62ms | tok/sec: 2802.38 | norm: 4.47\n", "step2476 | loss: 1.6249808073043823 | dt: 1444.82ms | tok/sec: 2834.95 | norm: 4.04\n", "step2477 | loss: 1.6023235321044922 | dt: 1449.34ms | tok/sec: 2826.11 | norm: 4.11\n", "step2478 | loss: 1.536015510559082 | dt: 1450.80ms | tok/sec: 2823.27 | norm: 4.07\n", "step2479 | loss: 1.3775184154510498 | dt: 1460.64ms | tok/sec: 2804.25 | norm: 3.51\n", "step2480 | loss: 1.2656913995742798 | dt: 1460.97ms | tok/sec: 2803.63 | norm: 3.80\n", "step2481 | loss: 1.6034272909164429 | dt: 1449.15ms | tok/sec: 2826.48 | norm: 4.28\n", "step2482 | loss: 1.6805529594421387 | dt: 1444.83ms | tok/sec: 2834.94 | norm: 4.49\n", "step2483 | loss: 1.7212923765182495 | dt: 1452.56ms | tok/sec: 2819.85 | norm: 4.62\n", "step2484 | loss: 1.559260368347168 | dt: 1454.62ms | tok/sec: 2815.86 | norm: 4.29\n", "step2485 | loss: 1.449627161026001 | dt: 1460.73ms | tok/sec: 2804.09 | norm: 4.16\n", "step2486 | loss: 1.513278841972351 | dt: 1449.28ms | tok/sec: 2826.23 | norm: 4.28\n", "step2487 | loss: 1.4080488681793213 | dt: 1449.66ms | tok/sec: 2825.50 | norm: 4.25\n", "step2488 | loss: 1.3824905157089233 | dt: 1459.08ms | tok/sec: 2807.25 | norm: 4.38\n", "step2489 | loss: 1.4418511390686035 | dt: 1460.81ms | tok/sec: 2803.92 | norm: 4.15\n", "step2490 | loss: 1.590196132659912 | dt: 1459.92ms | tok/sec: 2805.62 | norm: 4.05\n", "step2491 | loss: 1.6575039625167847 | dt: 1455.57ms | tok/sec: 2814.02 | norm: 4.20\n", "step2492 | loss: 1.5695641040802002 | dt: 1461.26ms | tok/sec: 2803.06 | norm: 4.03\n", "step2493 | loss: 1.6648973226547241 | dt: 1445.70ms | tok/sec: 2833.22 | norm: 4.41\n", "step2494 | loss: 1.437210202217102 | dt: 1460.68ms | tok/sec: 2804.17 | norm: 3.62\n", "step2495 | 
loss: 1.4442253112792969 | dt: 1446.81ms | tok/sec: 2831.05 | norm: 3.85\n", "step2496 | loss: 1.3814345598220825 | dt: 1450.72ms | tok/sec: 2823.42 | norm: 3.62\n", "step2497 | loss: 1.4242706298828125 | dt: 1455.25ms | tok/sec: 2814.63 | norm: 3.46\n", "step2498 | loss: 1.3620213270187378 | dt: 1452.06ms | tok/sec: 2820.82 | norm: 3.39\n", "step2499 | loss: 1.442610502243042 | dt: 1455.89ms | tok/sec: 2813.40 | norm: 3.57\n", "step2500 | loss: 1.403993010520935 | dt: 1458.04ms | tok/sec: 2809.25 | norm: 4.16\n", "step2501 | loss: 1.8568072319030762 | dt: 1446.69ms | tok/sec: 2831.30 | norm: 4.51\n", "step2502 | loss: 1.7104779481887817 | dt: 1461.33ms | tok/sec: 2802.93 | norm: 4.51\n", "step2503 | loss: 1.6365740299224854 | dt: 1461.93ms | tok/sec: 2801.78 | norm: 4.28\n", "step2504 | loss: 1.6349953413009644 | dt: 1441.66ms | tok/sec: 2841.17 | norm: 3.96\n", "step2505 | loss: 1.8190425634384155 | dt: 1455.70ms | tok/sec: 2813.77 | norm: 4.39\n", "step2506 | loss: 1.7008137702941895 | dt: 1451.79ms | tok/sec: 2821.34 | norm: 3.94\n", "step2507 | loss: 1.4550315141677856 | dt: 1453.49ms | tok/sec: 2818.05 | norm: 3.71\n", "step2508 | loss: 1.5107600688934326 | dt: 1447.69ms | tok/sec: 2829.34 | norm: 4.26\n", "step2509 | loss: 1.6585828065872192 | dt: 1441.66ms | tok/sec: 2841.17 | norm: 4.67\n", "step2510 | loss: 1.5505459308624268 | dt: 1450.02ms | tok/sec: 2824.79 | norm: 4.41\n", "step2511 | loss: 1.6651084423065186 | dt: 1446.01ms | tok/sec: 2832.62 | norm: 4.55\n", "step2512 | loss: 1.5476833581924438 | dt: 1460.86ms | tok/sec: 2803.82 | norm: 4.43\n", "step2513 | loss: 1.460029125213623 | dt: 1446.81ms | tok/sec: 2831.06 | norm: 3.99\n", "step2514 | loss: 1.4174748659133911 | dt: 1458.34ms | tok/sec: 2808.67 | norm: 3.97\n", "step2515 | loss: 1.7375333309173584 | dt: 1459.73ms | tok/sec: 2806.00 | norm: 5.50\n", "step2516 | loss: 1.5778083801269531 | dt: 1440.86ms | tok/sec: 2842.75 | norm: 4.72\n", "step2517 | loss: 1.4812049865722656 | dt: 1459.59ms | 
tok/sec: 2806.27 | norm: 4.14\n", "step2518 | loss: 1.3571346998214722 | dt: 1450.21ms | tok/sec: 2824.43 | norm: 4.14\n", "step2519 | loss: 1.3281798362731934 | dt: 1451.91ms | tok/sec: 2821.11 | norm: 4.12\n", "step2520 | loss: 1.7240253686904907 | dt: 1454.24ms | tok/sec: 2816.59 | norm: 4.55\n", "step2521 | loss: 1.775623083114624 | dt: 1453.81ms | tok/sec: 2817.42 | norm: 5.38\n", "step2522 | loss: 1.56985342502594 | dt: 1450.98ms | tok/sec: 2822.92 | norm: 5.03\n", "step2523 | loss: 1.37110435962677 | dt: 1442.67ms | tok/sec: 2839.18 | norm: 4.38\n", "step2524 | loss: 1.387731671333313 | dt: 1445.71ms | tok/sec: 2833.21 | norm: 4.37\n", "step2525 | loss: 1.4532517194747925 | dt: 1446.90ms | tok/sec: 2830.88 | norm: 4.08\n", "step2526 | loss: 1.309374213218689 | dt: 1451.09ms | tok/sec: 2822.70 | norm: 4.58\n", "step2527 | loss: 1.2851331233978271 | dt: 1455.36ms | tok/sec: 2814.43 | norm: 4.72\n", "step2528 | loss: 1.2122862339019775 | dt: 1450.48ms | tok/sec: 2823.90 | norm: 3.99\n", "step2529 | loss: 1.7470275163650513 | dt: 1447.48ms | tok/sec: 2829.74 | norm: 4.25\n", "step2530 | loss: 1.4402034282684326 | dt: 1456.02ms | tok/sec: 2813.16 | norm: 4.23\n", "step2531 | loss: 1.8836948871612549 | dt: 1459.12ms | tok/sec: 2807.18 | norm: 4.95\n", "step2532 | loss: 1.6408312320709229 | dt: 1462.22ms | tok/sec: 2801.22 | norm: 4.24\n", "step2533 | loss: 1.7107917070388794 | dt: 1455.25ms | tok/sec: 2814.63 | norm: 4.50\n", "step2534 | loss: 1.6803665161132812 | dt: 1445.47ms | tok/sec: 2833.69 | norm: 4.31\n", "step2535 | loss: 1.5745893716812134 | dt: 1460.89ms | tok/sec: 2803.76 | norm: 4.41\n", "step2536 | loss: 1.3652729988098145 | dt: 1456.15ms | tok/sec: 2812.90 | norm: 3.93\n", "step2537 | loss: 1.3508734703063965 | dt: 1459.21ms | tok/sec: 2806.99 | norm: 3.79\n", "step2538 | loss: 1.3304169178009033 | dt: 1456.86ms | tok/sec: 2811.54 | norm: 4.09\n", "step2539 | loss: 1.3271392583847046 | dt: 1449.28ms | tok/sec: 2826.22 | norm: 3.92\n", "step2540 | 
loss: 1.5123941898345947 | dt: 1452.90ms | tok/sec: 2819.20 | norm: 4.31\n", "step2541 | loss: 1.5234240293502808 | dt: 1448.71ms | tok/sec: 2827.34 | norm: 3.87\n", "step2542 | loss: 1.4904471635818481 | dt: 1446.09ms | tok/sec: 2832.46 | norm: 4.21\n", "step2543 | loss: 1.401611089706421 | dt: 1451.85ms | tok/sec: 2821.24 | norm: 4.31\n", "step2544 | loss: 1.4457921981811523 | dt: 1454.42ms | tok/sec: 2816.24 | norm: 4.17\n", "step2545 | loss: 1.4460372924804688 | dt: 1448.75ms | tok/sec: 2827.26 | norm: 4.52\n", "step2546 | loss: 1.4401732683181763 | dt: 1447.92ms | tok/sec: 2828.89 | norm: 4.20\n", "step2547 | loss: 1.2499853372573853 | dt: 1446.45ms | tok/sec: 2831.75 | norm: 4.19\n", "step2548 | loss: 1.2404358386993408 | dt: 1447.58ms | tok/sec: 2829.55 | norm: 4.26\n", "step2549 | loss: 1.4969911575317383 | dt: 1453.10ms | tok/sec: 2818.80 | norm: 3.93\n", "step2550 | loss: 1.3135108947753906 | dt: 1450.91ms | tok/sec: 2823.06 | norm: 3.83\n", "step2551 | loss: 1.3234727382659912 | dt: 1449.68ms | tok/sec: 2825.46 | norm: 3.87\n", "step2552 | loss: 1.1522964239120483 | dt: 1455.07ms | tok/sec: 2814.99 | norm: 3.78\n", "step2553 | loss: 1.5680307149887085 | dt: 1451.17ms | tok/sec: 2822.55 | norm: 3.94\n", "step2554 | loss: 1.5052815675735474 | dt: 1452.04ms | tok/sec: 2820.85 | norm: 3.62\n", "step2555 | loss: 1.4713884592056274 | dt: 1462.50ms | tok/sec: 2800.67 | norm: 3.63\n", "step2556 | loss: 1.4491052627563477 | dt: 1454.65ms | tok/sec: 2815.80 | norm: 3.67\n", "step2557 | loss: 1.4614664316177368 | dt: 1457.39ms | tok/sec: 2810.51 | norm: 3.97\n", "step2558 | loss: 1.4440668821334839 | dt: 1448.21ms | tok/sec: 2828.33 | norm: 3.53\n", "step2559 | loss: 1.4562757015228271 | dt: 1461.09ms | tok/sec: 2803.39 | norm: 3.81\n", "step2560 | loss: 1.3941216468811035 | dt: 1462.48ms | tok/sec: 2800.71 | norm: 3.84\n", "step2561 | loss: 1.2179468870162964 | dt: 1445.65ms | tok/sec: 2833.32 | norm: 3.45\n", "step2562 | loss: 1.1067124605178833 | dt: 1450.67ms | 
tok/sec: 2823.52 | norm: 3.43\n", "step2563 | loss: 1.4315307140350342 | dt: 1453.49ms | tok/sec: 2818.05 | norm: 3.74\n", "step2564 | loss: 1.5112934112548828 | dt: 1454.56ms | tok/sec: 2815.98 | norm: 3.95\n", "step2565 | loss: 1.560497522354126 | dt: 1453.83ms | tok/sec: 2817.38 | norm: 4.26\n", "step2566 | loss: 1.416483759880066 | dt: 1458.39ms | tok/sec: 2808.57 | norm: 4.18\n", "step2567 | loss: 1.315719723701477 | dt: 1456.41ms | tok/sec: 2812.39 | norm: 4.00\n", "step2568 | loss: 1.3764325380325317 | dt: 1448.29ms | tok/sec: 2828.17 | norm: 4.16\n", "step2569 | loss: 1.2886805534362793 | dt: 1458.15ms | tok/sec: 2809.05 | norm: 4.36\n", "step2570 | loss: 1.2691296339035034 | dt: 1451.78ms | tok/sec: 2821.36 | norm: 4.26\n", "step2571 | loss: 1.2910979986190796 | dt: 1457.82ms | tok/sec: 2809.68 | norm: 3.93\n", "step2572 | loss: 1.4351181983947754 | dt: 1449.76ms | tok/sec: 2825.30 | norm: 3.87\n", "step2573 | loss: 1.5341103076934814 | dt: 1453.21ms | tok/sec: 2818.59 | norm: 4.78\n", "step2574 | loss: 1.4468215703964233 | dt: 1456.44ms | tok/sec: 2812.33 | norm: 4.55\n", "step2575 | loss: 1.5492662191390991 | dt: 1446.74ms | tok/sec: 2831.19 | norm: 4.46\n", "step2576 | loss: 1.3289554119110107 | dt: 1462.96ms | tok/sec: 2799.80 | norm: 3.83\n", "step2577 | loss: 1.3096064329147339 | dt: 1446.57ms | tok/sec: 2831.52 | norm: 3.82\n", "step2578 | loss: 1.2138944864273071 | dt: 1449.71ms | tok/sec: 2825.39 | norm: 3.44\n", "step2579 | loss: 1.2659637928009033 | dt: 1455.79ms | tok/sec: 2813.59 | norm: 3.57\n", "step2580 | loss: 1.210542917251587 | dt: 1449.30ms | tok/sec: 2826.19 | norm: 3.40\n", "step2581 | loss: 1.2918730974197388 | dt: 1453.75ms | tok/sec: 2817.54 | norm: 3.43\n", "step2582 | loss: 1.2508949041366577 | dt: 1454.95ms | tok/sec: 2815.21 | norm: 3.48\n", "step2583 | loss: 1.7143619060516357 | dt: 1449.44ms | tok/sec: 2825.91 | norm: 4.80\n", "step2584 | loss: 1.5602309703826904 | dt: 1460.57ms | tok/sec: 2804.39 | norm: 4.64\n", "step2585 | 
loss: 1.4834800958633423 | dt: 1451.83ms | tok/sec: 2821.26 | norm: 4.46\n", "step2586 | loss: 1.5120640993118286 | dt: 1455.92ms | tok/sec: 2813.33 | norm: 4.24\n", "step2587 | loss: 1.7011446952819824 | dt: 1461.93ms | tok/sec: 2801.78 | norm: 5.13\n", "step2588 | loss: 1.5688531398773193 | dt: 1445.34ms | tok/sec: 2833.94 | norm: 4.75\n", "step2589 | loss: 1.3282791376113892 | dt: 1459.45ms | tok/sec: 2806.53 | norm: 3.99\n", "step2590 | loss: 1.39814031124115 | dt: 1450.77ms | tok/sec: 2823.33 | norm: 4.06\n", "step2591 | loss: 1.519022822380066 | dt: 1461.39ms | tok/sec: 2802.80 | norm: 4.24\n", "step2592 | loss: 1.4041979312896729 | dt: 1451.74ms | tok/sec: 2821.45 | norm: 4.12\n", "step2593 | loss: 1.524431824684143 | dt: 1458.53ms | tok/sec: 2808.31 | norm: 4.67\n", "step2594 | loss: 1.4266297817230225 | dt: 1453.30ms | tok/sec: 2818.41 | norm: 4.57\n", "step2595 | loss: 1.3501254320144653 | dt: 1455.51ms | tok/sec: 2814.14 | norm: 4.48\n", "step2596 | loss: 1.321139931678772 | dt: 1468.99ms | tok/sec: 2788.32 | norm: 4.38\n", "step2597 | loss: 1.6059420108795166 | dt: 1451.44ms | tok/sec: 2822.03 | norm: 4.80\n", "step2598 | loss: 1.4211556911468506 | dt: 1449.12ms | tok/sec: 2826.53 | norm: 4.30\n", "step2599 | loss: 1.3411824703216553 | dt: 1451.97ms | tok/sec: 2820.99 | norm: 4.41\n", "step2600 | loss: 1.2415732145309448 | dt: 1451.99ms | tok/sec: 2820.95 | norm: 4.39\n", "step2601 | loss: 1.2399675846099854 | dt: 1448.02ms | tok/sec: 2828.68 | norm: 4.46\n", "step2602 | loss: 1.6164302825927734 | dt: 1453.53ms | tok/sec: 2817.96 | norm: 4.51\n", "step2603 | loss: 1.6562861204147339 | dt: 1459.60ms | tok/sec: 2806.25 | norm: 4.83\n", "step2604 | loss: 1.4575114250183105 | dt: 1450.01ms | tok/sec: 2824.81 | norm: 4.71\n", "step2605 | loss: 1.2766727209091187 | dt: 1452.58ms | tok/sec: 2819.81 | norm: 4.51\n", "step2606 | loss: 1.281235694885254 | dt: 1459.07ms | tok/sec: 2807.26 | norm: 4.22\n", "step2607 | loss: 1.3285213708877563 | dt: 1456.04ms | 
tok/sec: 2813.11 | norm: 4.35\n", "step2608 | loss: 1.1868054866790771 | dt: 1458.09ms | tok/sec: 2809.15 | norm: 4.29\n", "step2609 | loss: 1.172046422958374 | dt: 1452.91ms | tok/sec: 2819.17 | norm: 4.31\n", "step2610 | loss: 1.0978175401687622 | dt: 1450.79ms | tok/sec: 2823.28 | norm: 3.74\n", "step2611 | loss: 1.6081805229187012 | dt: 1449.11ms | tok/sec: 2826.56 | norm: 4.92\n", "step2612 | loss: 1.3154387474060059 | dt: 1445.55ms | tok/sec: 2833.52 | norm: 4.05\n", "step2613 | loss: 1.7575492858886719 | dt: 1452.50ms | tok/sec: 2819.98 | norm: 4.92\n", "step2614 | loss: 1.5206841230392456 | dt: 1454.96ms | tok/sec: 2815.21 | norm: 4.49\n", "step2615 | loss: 1.5777497291564941 | dt: 1447.07ms | tok/sec: 2830.55 | norm: 4.37\n", "step2616 | loss: 1.5546655654907227 | dt: 1459.89ms | tok/sec: 2805.69 | norm: 4.50\n", "step2617 | loss: 1.441141963005066 | dt: 1464.03ms | tok/sec: 2797.75 | norm: 4.40\n", "step2618 | loss: 1.2566111087799072 | dt: 1453.32ms | tok/sec: 2818.37 | norm: 3.99\n", "step2619 | loss: 1.2309948205947876 | dt: 1445.73ms | tok/sec: 2833.17 | norm: 3.92\n", "step2620 | loss: 1.2474448680877686 | dt: 1459.78ms | tok/sec: 2805.90 | norm: 4.13\n", "step2621 | loss: 1.2216137647628784 | dt: 1456.45ms | tok/sec: 2812.33 | norm: 3.94\n", "step2622 | loss: 1.3895950317382812 | dt: 1451.62ms | tok/sec: 2821.67 | norm: 4.29\n", "step2623 | loss: 1.3980809450149536 | dt: 1457.29ms | tok/sec: 2810.70 | norm: 4.07\n", "step2624 | loss: 1.3859364986419678 | dt: 1452.34ms | tok/sec: 2820.28 | norm: 4.35\n", "step2625 | loss: 1.2936701774597168 | dt: 1456.14ms | tok/sec: 2812.91 | norm: 4.25\n", "step2626 | loss: 1.3370206356048584 | dt: 1458.87ms | tok/sec: 2807.65 | norm: 4.41\n", "step2627 | loss: 1.3180861473083496 | dt: 1454.70ms | tok/sec: 2815.71 | norm: 4.34\n", "step2628 | loss: 1.3092365264892578 | dt: 1462.87ms | tok/sec: 2799.97 | norm: 4.03\n", "step2629 | loss: 1.13645601272583 | dt: 1446.94ms | tok/sec: 2830.81 | norm: 4.30\n", "step2630 | 
loss: 1.1197572946548462 | dt: 1451.87ms | tok/sec: 2821.19 | norm: 4.11\n", "step2631 | loss: 1.3656084537506104 | dt: 1454.59ms | tok/sec: 2815.91 | norm: 4.10\n", "step2632 | loss: 1.1970343589782715 | dt: 1456.86ms | tok/sec: 2811.53 | norm: 4.02\n", "step2633 | loss: 1.2304483652114868 | dt: 1460.35ms | tok/sec: 2804.80 | norm: 4.04\n", "step2634 | loss: 1.0757670402526855 | dt: 1455.94ms | tok/sec: 2813.30 | norm: 4.18\n", "step2635 | loss: 1.4379359483718872 | dt: 1460.07ms | tok/sec: 2805.34 | norm: 4.23\n", "step2636 | loss: 1.3928897380828857 | dt: 1460.41ms | tok/sec: 2804.70 | norm: 4.47\n", "step2637 | loss: 1.3595422506332397 | dt: 1446.01ms | tok/sec: 2832.62 | norm: 4.31\n", "step2638 | loss: 1.3167729377746582 | dt: 1451.19ms | tok/sec: 2822.51 | norm: 3.97\n", "step2639 | loss: 1.3230572938919067 | dt: 1446.78ms | tok/sec: 2831.12 | norm: 4.30\n", "step2640 | loss: 1.3100131750106812 | dt: 1462.57ms | tok/sec: 2800.55 | norm: 3.84\n", "step2641 | loss: 1.3007452487945557 | dt: 1447.47ms | tok/sec: 2829.76 | norm: 3.95\n", "step2642 | loss: 1.2369282245635986 | dt: 1451.95ms | tok/sec: 2821.04 | norm: 3.79\n", "step2643 | loss: 1.0920034646987915 | dt: 1451.77ms | tok/sec: 2821.39 | norm: 3.50\n", "step2644 | loss: 0.997141420841217 | dt: 1458.66ms | tok/sec: 2808.05 | norm: 3.39\n", "step2645 | loss: 1.3031861782073975 | dt: 1464.21ms | tok/sec: 2797.41 | norm: 4.11\n", "step2646 | loss: 1.3800870180130005 | dt: 1452.77ms | tok/sec: 2819.45 | norm: 4.14\n", "step2647 | loss: 1.4343035221099854 | dt: 1452.82ms | tok/sec: 2819.34 | norm: 4.17\n", "step2648 | loss: 1.2916576862335205 | dt: 1446.20ms | tok/sec: 2832.25 | norm: 4.12\n", "step2649 | loss: 1.201280951499939 | dt: 1456.06ms | tok/sec: 2813.07 | norm: 4.26\n", "step2650 | loss: 1.249721884727478 | dt: 1454.49ms | tok/sec: 2816.11 | norm: 4.27\n", "step2651 | loss: 1.1727052927017212 | dt: 1452.88ms | tok/sec: 2819.24 | norm: 4.11\n", "step2652 | loss: 1.1486467123031616 | dt: 1453.26ms | 
tok/sec: 2818.48 | norm: 4.29\n", "step2653 | loss: 1.1860079765319824 | dt: 1451.34ms | tok/sec: 2822.21 | norm: 4.12\n", "step2654 | loss: 1.3014674186706543 | dt: 1446.95ms | tok/sec: 2830.78 | norm: 4.24\n", "step2655 | loss: 1.3994625806808472 | dt: 1454.48ms | tok/sec: 2816.13 | norm: 4.58\n", "step2656 | loss: 1.3029663562774658 | dt: 1463.59ms | tok/sec: 2798.60 | norm: 4.29\n", "step2657 | loss: 1.410333275794983 | dt: 1443.23ms | tok/sec: 2838.08 | norm: 4.47\n", "step2658 | loss: 1.190945029258728 | dt: 1452.56ms | tok/sec: 2819.85 | norm: 3.77\n", "step2659 | loss: 1.1758654117584229 | dt: 1453.06ms | tok/sec: 2818.87 | norm: 4.11\n", "step2660 | loss: 1.1109890937805176 | dt: 1458.06ms | tok/sec: 2809.21 | norm: 4.13\n", "step2661 | loss: 1.1782500743865967 | dt: 1455.49ms | tok/sec: 2814.17 | norm: 3.98\n", "step2662 | loss: 1.1060147285461426 | dt: 1452.23ms | tok/sec: 2820.50 | norm: 3.57\n", "step2663 | loss: 1.1736117601394653 | dt: 1454.01ms | tok/sec: 2817.04 | norm: 3.78\n", "step2664 | loss: 1.1430894136428833 | dt: 1451.23ms | tok/sec: 2822.43 | norm: 3.85\n", "step2665 | loss: 1.6107773780822754 | dt: 1444.27ms | tok/sec: 2836.04 | norm: 4.82\n", "step2666 | loss: 1.461276888847351 | dt: 1456.24ms | tok/sec: 2812.73 | norm: 4.83\n", "step2667 | loss: 1.3630281686782837 | dt: 1441.54ms | tok/sec: 2841.40 | norm: 4.43\n", "step2668 | loss: 1.3918485641479492 | dt: 1459.66ms | tok/sec: 2806.13 | norm: 4.42\n", "step2669 | loss: 1.5887672901153564 | dt: 1452.67ms | tok/sec: 2819.63 | norm: 4.97\n", "step2670 | loss: 1.4425843954086304 | dt: 1447.54ms | tok/sec: 2829.62 | norm: 4.40\n", "step2671 | loss: 1.2105499505996704 | dt: 1454.70ms | tok/sec: 2815.70 | norm: 4.31\n", "step2672 | loss: 1.279050350189209 | dt: 1450.20ms | tok/sec: 2824.45 | norm: 4.42\n", "step2673 | loss: 1.4223910570144653 | dt: 1451.08ms | tok/sec: 2822.73 | norm: 4.91\n", "step2674 | loss: 1.2842636108398438 | dt: 1452.85ms | tok/sec: 2819.30 | norm: 4.40\n", "step2675 | 
loss: 1.3974722623825073 | dt: 1453.31ms | tok/sec: 2818.40 | norm: 4.33\n", "step2676 | loss: 1.311394453048706 | dt: 1452.50ms | tok/sec: 2819.97 | norm: 4.66\n", "step2677 | loss: 1.240972638130188 | dt: 1451.27ms | tok/sec: 2822.36 | norm: 4.76\n", "step2678 | loss: 1.1965456008911133 | dt: 1453.89ms | tok/sec: 2817.27 | norm: 4.26\n", "step2679 | loss: 1.4789410829544067 | dt: 1447.55ms | tok/sec: 2829.62 | norm: 5.30\n", "step2680 | loss: 1.3137474060058594 | dt: 1456.78ms | tok/sec: 2811.69 | norm: 4.61\n", "step2681 | loss: 1.2369948625564575 | dt: 1453.73ms | tok/sec: 2817.58 | norm: 4.34\n", "step2682 | loss: 1.1265227794647217 | dt: 1454.99ms | tok/sec: 2815.14 | norm: 4.18\n", "step2683 | loss: 1.1478016376495361 | dt: 1452.54ms | tok/sec: 2819.89 | norm: 4.45\n", "step2684 | loss: 1.447966456413269 | dt: 1452.79ms | tok/sec: 2819.39 | norm: 4.48\n", "step2685 | loss: 1.5600101947784424 | dt: 1454.18ms | tok/sec: 2816.71 | norm: 5.51\n", "step2686 | loss: 1.365616798400879 | dt: 1454.18ms | tok/sec: 2816.71 | norm: 4.87\n", "step2687 | loss: 1.190068244934082 | dt: 1453.26ms | tok/sec: 2818.50 | norm: 4.47\n", "step2688 | loss: 1.1687369346618652 | dt: 1451.40ms | tok/sec: 2822.11 | norm: 3.81\n", "step2689 | loss: 1.2321616411209106 | dt: 1451.67ms | tok/sec: 2821.57 | norm: 4.20\n", "step2690 | loss: 1.0711877346038818 | dt: 1454.79ms | tok/sec: 2815.53 | norm: 3.90\n", "step2691 | loss: 1.0422147512435913 | dt: 1452.15ms | tok/sec: 2820.64 | norm: 4.03\n", "step2692 | loss: 0.9923815727233887 | dt: 1452.46ms | tok/sec: 2820.05 | norm: 3.90\n", "step2693 | loss: 1.4878175258636475 | dt: 1451.99ms | tok/sec: 2820.97 | norm: 4.72\n", "step2694 | loss: 1.1944482326507568 | dt: 1457.91ms | tok/sec: 2809.50 | norm: 4.16\n", "step2695 | loss: 1.6408429145812988 | dt: 1454.07ms | tok/sec: 2816.92 | norm: 5.19\n", "step2696 | loss: 1.421919584274292 | dt: 1450.66ms | tok/sec: 2823.53 | norm: 4.80\n", "step2697 | loss: 1.4718725681304932 | dt: 1453.07ms | 
tok/sec: 2818.85 | norm: 4.97\n", "step2698 | loss: 1.4568594694137573 | dt: 1452.03ms | tok/sec: 2820.87 | norm: 4.96\n", "step2699 | loss: 1.3209294080734253 | dt: 1439.37ms | tok/sec: 2845.69 | norm: 4.51\n", "step2700 | loss: 1.1331394910812378 | dt: 1451.68ms | tok/sec: 2821.56 | norm: 3.98\n", "step2701 | loss: 1.1171159744262695 | dt: 1443.05ms | tok/sec: 2838.42 | norm: 3.88\n", "step2702 | loss: 1.1348152160644531 | dt: 1460.15ms | tok/sec: 2805.19 | norm: 4.11\n", "step2703 | loss: 1.110341191291809 | dt: 1450.84ms | tok/sec: 2823.19 | norm: 3.96\n", "step2704 | loss: 1.2935601472854614 | dt: 1446.70ms | tok/sec: 2831.26 | norm: 4.86\n", "step2705 | loss: 1.2879663705825806 | dt: 1448.46ms | tok/sec: 2827.83 | norm: 4.41\n", "step2706 | loss: 1.2773609161376953 | dt: 1453.10ms | tok/sec: 2818.80 | norm: 4.58\n", "step2707 | loss: 1.1601557731628418 | dt: 1453.82ms | tok/sec: 2817.40 | norm: 4.06\n", "step2708 | loss: 1.1950608491897583 | dt: 1451.47ms | tok/sec: 2821.97 | norm: 3.84\n", "step2709 | loss: 1.186729907989502 | dt: 1456.86ms | tok/sec: 2811.52 | norm: 4.01\n", "step2710 | loss: 1.161592960357666 | dt: 1456.69ms | tok/sec: 2811.86 | norm: 3.85\n", "step2711 | loss: 1.027384877204895 | dt: 1444.74ms | tok/sec: 2835.11 | norm: 4.34\n", "step2712 | loss: 1.017700433731079 | dt: 1448.42ms | tok/sec: 2827.91 | norm: 4.26\n", "step2713 | loss: 1.2447069883346558 | dt: 1444.97ms | tok/sec: 2834.66 | norm: 3.94\n", "step2714 | loss: 1.0683879852294922 | dt: 1456.58ms | tok/sec: 2812.07 | norm: 3.84\n", "step2715 | loss: 1.1014273166656494 | dt: 1452.48ms | tok/sec: 2820.01 | norm: 3.71\n", "step2716 | loss: 0.9582405686378479 | dt: 1459.35ms | tok/sec: 2806.72 | norm: 3.67\n", "step2717 | loss: 1.3220487833023071 | dt: 1450.59ms | tok/sec: 2823.67 | norm: 4.59\n", "step2718 | loss: 1.2548625469207764 | dt: 1451.51ms | tok/sec: 2821.88 | norm: 4.20\n", "step2719 | loss: 1.2066688537597656 | dt: 1447.81ms | tok/sec: 2829.11 | norm: 3.95\n", "step2720 | 
loss: 1.162528395652771 | dt: 1456.23ms | tok/sec: 2812.74 | norm: 3.80\n", "step2721 | loss: 1.183803677558899 | dt: 1450.15ms | tok/sec: 2824.54 | norm: 4.13\n", "step2722 | loss: 1.1756118535995483 | dt: 1456.02ms | tok/sec: 2813.15 | norm: 4.08\n", "step2723 | loss: 1.18045973777771 | dt: 1450.51ms | tok/sec: 2823.84 | norm: 4.18\n", "step2724 | loss: 1.137571096420288 | dt: 1455.50ms | tok/sec: 2814.15 | norm: 4.56\n", "step2725 | loss: 0.9793696999549866 | dt: 1454.02ms | tok/sec: 2817.02 | norm: 3.78\n", "step2726 | loss: 0.8773712515830994 | dt: 1442.27ms | tok/sec: 2839.97 | norm: 3.32\n", "step2727 | loss: 1.1592447757720947 | dt: 1449.16ms | tok/sec: 2826.47 | norm: 3.67\n", "step2728 | loss: 1.2507896423339844 | dt: 1450.47ms | tok/sec: 2823.92 | norm: 4.18\n", "step2729 | loss: 1.2907122373580933 | dt: 1453.15ms | tok/sec: 2818.70 | norm: 4.16\n", "step2730 | loss: 1.151871681213379 | dt: 1450.44ms | tok/sec: 2823.97 | norm: 4.07\n", "step2731 | loss: 1.0625852346420288 | dt: 1450.82ms | tok/sec: 2823.24 | norm: 3.67\n", "step2732 | loss: 1.150267243385315 | dt: 1441.98ms | tok/sec: 2840.54 | norm: 4.44\n", "step2733 | loss: 1.0529378652572632 | dt: 1445.34ms | tok/sec: 2833.94 | norm: 4.35\n", "step2734 | loss: 1.0391323566436768 | dt: 1453.76ms | tok/sec: 2817.53 | norm: 4.62\n", "step2735 | loss: 1.065970778465271 | dt: 1452.09ms | tok/sec: 2820.76 | norm: 4.26\n", "step2736 | loss: 1.1910773515701294 | dt: 1450.46ms | tok/sec: 2823.92 | norm: 4.47\n", "step2737 | loss: 1.2928378582000732 | dt: 1453.57ms | tok/sec: 2817.90 | norm: 4.91\n", "step2738 | loss: 1.208407998085022 | dt: 1440.29ms | tok/sec: 2843.88 | norm: 4.68\n", "step2739 | loss: 1.2810527086257935 | dt: 1448.72ms | tok/sec: 2827.33 | norm: 4.31\n", "step2740 | loss: 1.0770059823989868 | dt: 1459.63ms | tok/sec: 2806.20 | norm: 4.01\n", "step2741 | loss: 1.0674982070922852 | dt: 1448.15ms | tok/sec: 2828.43 | norm: 4.35\n", "step2742 | loss: 1.0024683475494385 | dt: 1446.09ms | 
tok/sec: 2832.47 | norm: 4.08\n", "step2743 | loss: 1.075844645500183 | dt: 1443.55ms | tok/sec: 2837.46 | norm: 4.34\n", "step2744 | loss: 1.0159655809402466 | dt: 1447.32ms | tok/sec: 2830.05 | norm: 4.38\n", "step2745 | loss: 1.0702906847000122 | dt: 1441.23ms | tok/sec: 2842.02 | norm: 4.05\n", "step2746 | loss: 1.0468961000442505 | dt: 1445.42ms | tok/sec: 2833.78 | norm: 3.70\n", "step2747 | loss: 1.4878191947937012 | dt: 1451.47ms | tok/sec: 2821.96 | norm: 4.69\n", "step2748 | loss: 1.320166826248169 | dt: 1447.40ms | tok/sec: 2829.91 | norm: 4.35\n", "step2749 | loss: 1.2659785747528076 | dt: 1448.96ms | tok/sec: 2826.86 | norm: 4.60\n", "step2750 | loss: 1.2802422046661377 | dt: 1447.73ms | tok/sec: 2829.25 | norm: 4.21\n", "step2751 | loss: 1.4864859580993652 | dt: 1448.19ms | tok/sec: 2828.36 | norm: 5.03\n", "step2752 | loss: 1.3243463039398193 | dt: 1451.46ms | tok/sec: 2821.98 | norm: 4.45\n", "step2753 | loss: 1.109192132949829 | dt: 1443.57ms | tok/sec: 2837.41 | norm: 4.25\n", "step2754 | loss: 1.173357605934143 | dt: 1447.88ms | tok/sec: 2828.97 | norm: 4.26\n", "step2755 | loss: 1.3069756031036377 | dt: 1439.85ms | tok/sec: 2844.74 | norm: 4.81\n", "step2756 | loss: 1.1710723638534546 | dt: 1449.20ms | tok/sec: 2826.39 | norm: 4.58\n", "step2757 | loss: 1.3169946670532227 | dt: 1446.91ms | tok/sec: 2830.87 | norm: 5.07\n", "step2758 | loss: 1.2158199548721313 | dt: 1455.34ms | tok/sec: 2814.46 | norm: 4.64\n", "step2759 | loss: 1.1542868614196777 | dt: 1445.67ms | tok/sec: 2833.29 | norm: 4.21\n", "step2760 | loss: 1.1055132150650024 | dt: 1453.72ms | tok/sec: 2817.61 | norm: 4.01\n", "step2761 | loss: 1.396950364112854 | dt: 1446.22ms | tok/sec: 2832.21 | norm: 5.43\n", "step2762 | loss: 1.2484452724456787 | dt: 1448.20ms | tok/sec: 2828.35 | norm: 4.86\n", "step2763 | loss: 1.1570836305618286 | dt: 1450.30ms | tok/sec: 2824.24 | norm: 4.69\n", "step2764 | loss: 1.0161573886871338 | dt: 1445.02ms | tok/sec: 2834.57 | norm: 4.24\n", "step2765 | 
loss: 1.0284157991409302 | dt: 1445.07ms | tok/sec: 2834.47 | norm: 4.16\n", "step2766 | loss: 1.341704249382019 | dt: 1452.28ms | tok/sec: 2820.39 | norm: 4.66\n", "step2767 | loss: 1.411446452140808 | dt: 1449.74ms | tok/sec: 2825.34 | norm: 5.06\n", "step2768 | loss: 1.223538875579834 | dt: 1439.18ms | tok/sec: 2846.06 | norm: 4.75\n", "step2769 | loss: 1.0488468408584595 | dt: 1454.00ms | tok/sec: 2817.05 | norm: 4.32\n", "step2770 | loss: 1.0508593320846558 | dt: 1438.78ms | tok/sec: 2846.86 | norm: 3.92\n", "step2771 | loss: 1.0991227626800537 | dt: 1444.00ms | tok/sec: 2836.57 | norm: 3.97\n", "step2772 | loss: 0.9542100429534912 | dt: 1453.00ms | tok/sec: 2818.99 | norm: 3.94\n", "step2773 | loss: 0.9295106530189514 | dt: 1437.13ms | tok/sec: 2850.13 | norm: 4.02\n", "step2774 | loss: 0.8851926326751709 | dt: 1449.79ms | tok/sec: 2825.23 | norm: 3.72\n", "step2775 | loss: 1.4087215662002563 | dt: 1439.36ms | tok/sec: 2845.71 | norm: 5.34\n", "step2776 | loss: 1.094377875328064 | dt: 1447.82ms | tok/sec: 2829.08 | norm: 4.13\n", "step2777 | loss: 1.5005468130111694 | dt: 1439.35ms | tok/sec: 2845.73 | norm: 5.16\n", "step2778 | loss: 1.2779356241226196 | dt: 1451.72ms | tok/sec: 2821.48 | norm: 4.59\n", "step2779 | loss: 1.355832815170288 | dt: 1451.22ms | tok/sec: 2822.45 | norm: 4.47\n", "step2780 | loss: 1.3222771883010864 | dt: 1442.92ms | tok/sec: 2838.68 | norm: 4.50\n", "step2781 | loss: 1.2012938261032104 | dt: 1445.85ms | tok/sec: 2832.94 | norm: 4.59\n", "step2782 | loss: 1.0578374862670898 | dt: 1454.90ms | tok/sec: 2815.32 | norm: 4.88\n", "step2783 | loss: 1.0203629732131958 | dt: 1440.20ms | tok/sec: 2844.04 | norm: 4.30\n", "step2784 | loss: 1.0217937231063843 | dt: 1453.41ms | tok/sec: 2818.20 | norm: 4.22\n", "step2785 | loss: 0.9966161847114563 | dt: 1447.87ms | tok/sec: 2828.99 | norm: 3.67\n", "step2786 | loss: 1.1545870304107666 | dt: 1456.56ms | tok/sec: 2812.11 | norm: 3.90\n", "step2787 | loss: 1.193534016609192 | dt: 1447.87ms | 
tok/sec: 2828.98 | norm: 4.33\n", "step2788 | loss: 1.1668797731399536 | dt: 1453.88ms | tok/sec: 2817.29 | norm: 4.67\n", "step2789 | loss: 1.0546451807022095 | dt: 1440.35ms | tok/sec: 2843.76 | norm: 4.52\n", "step2790 | loss: 1.0915883779525757 | dt: 1452.11ms | tok/sec: 2820.72 | norm: 4.51\n", "step2791 | loss: 1.0947062969207764 | dt: 1451.98ms | tok/sec: 2820.97 | norm: 4.26\n", "step2792 | loss: 1.0588228702545166 | dt: 1452.54ms | tok/sec: 2819.88 | norm: 3.98\n", "step2793 | loss: 0.9180967807769775 | dt: 1440.16ms | tok/sec: 2844.13 | norm: 3.81\n", "step2794 | loss: 0.9070196151733398 | dt: 1451.18ms | tok/sec: 2822.53 | norm: 3.86\n", "step2795 | loss: 1.1259636878967285 | dt: 1452.95ms | tok/sec: 2819.08 | norm: 4.14\n", "step2796 | loss: 0.970669150352478 | dt: 1454.67ms | tok/sec: 2815.75 | norm: 4.10\n", "step2797 | loss: 0.988465428352356 | dt: 1452.45ms | tok/sec: 2820.05 | norm: 3.66\n", "step2798 | loss: 0.8462691903114319 | dt: 1456.31ms | tok/sec: 2812.58 | norm: 3.40\n", "step2799 | loss: 1.2042317390441895 | dt: 1453.32ms | tok/sec: 2818.37 | norm: 4.01\n", "step2800 | loss: 1.1630206108093262 | dt: 1439.64ms | tok/sec: 2845.16 | norm: 4.02\n", "step2801 | loss: 1.1141233444213867 | dt: 1449.84ms | tok/sec: 2825.14 | norm: 4.12\n", "step2802 | loss: 1.0982486009597778 | dt: 1460.19ms | tok/sec: 2805.11 | norm: 4.35\n", "step2803 | loss: 1.103857398033142 | dt: 1452.62ms | tok/sec: 2819.74 | norm: 4.63\n", "step2804 | loss: 1.1136091947555542 | dt: 1447.07ms | tok/sec: 2830.54 | norm: 4.78\n", "step2805 | loss: 1.083019495010376 | dt: 1453.47ms | tok/sec: 2818.09 | norm: 4.16\n", "step2806 | loss: 1.0388107299804688 | dt: 1454.75ms | tok/sec: 2815.61 | norm: 4.21\n", "step2807 | loss: 0.8802213072776794 | dt: 1453.37ms | tok/sec: 2818.27 | norm: 3.81\n", "step2808 | loss: 0.7879467606544495 | dt: 1452.73ms | tok/sec: 2819.51 | norm: 3.88\n", "step2809 | loss: 1.0815728902816772 | dt: 1459.78ms | tok/sec: 2805.91 | norm: 4.84\n", "step2810 | 
loss: 1.172286868095398 | dt: 1447.53ms | tok/sec: 2829.65 | norm: 4.80\n", "step2811 | loss: 1.1818581819534302 | dt: 1451.29ms | tok/sec: 2822.31 | norm: 4.14\n", "step2812 | loss: 1.0570151805877686 | dt: 1450.75ms | tok/sec: 2823.37 | norm: 4.32\n", "step2813 | loss: 0.9591196775436401 | dt: 1445.08ms | tok/sec: 2834.45 | norm: 3.99\n", "step2814 | loss: 1.034283995628357 | dt: 1452.56ms | tok/sec: 2819.85 | norm: 3.99\n", "step2815 | loss: 0.9446919560432434 | dt: 1450.28ms | tok/sec: 2824.28 | norm: 3.88\n", "step2816 | loss: 0.9481322169303894 | dt: 1450.67ms | tok/sec: 2823.52 | norm: 4.27\n", "step2817 | loss: 0.9576920866966248 | dt: 1452.76ms | tok/sec: 2819.47 | norm: 4.25\n", "step2818 | loss: 1.102465271949768 | dt: 1443.43ms | tok/sec: 2837.69 | norm: 5.00\n", "step2819 | loss: 1.2363258600234985 | dt: 1449.28ms | tok/sec: 2826.24 | norm: 5.85\n", "step2820 | loss: 1.1314505338668823 | dt: 1451.54ms | tok/sec: 2821.82 | norm: 5.21\n", "step2821 | loss: 1.192902684211731 | dt: 1456.09ms | tok/sec: 2813.00 | norm: 5.36\n", "step2822 | loss: 0.9983437061309814 | dt: 1456.20ms | tok/sec: 2812.80 | norm: 4.30\n", "step2823 | loss: 0.9877868294715881 | dt: 1453.55ms | tok/sec: 2817.92 | norm: 4.26\n", "step2824 | loss: 0.9083542227745056 | dt: 1453.79ms | tok/sec: 2817.46 | norm: 3.85\n", "step2825 | loss: 0.9591616988182068 | dt: 1450.26ms | tok/sec: 2824.32 | norm: 3.86\n", "step2826 | loss: 0.9171082377433777 | dt: 1463.40ms | tok/sec: 2798.95 | norm: 3.94\n", "step2827 | loss: 0.9662402272224426 | dt: 1441.59ms | tok/sec: 2841.30 | norm: 4.14\n", "step2828 | loss: 0.9377729892730713 | dt: 1450.30ms | tok/sec: 2824.24 | norm: 3.95\n", "step2829 | loss: 1.3730711936950684 | dt: 1451.95ms | tok/sec: 2821.04 | norm: 4.72\n", "step2830 | loss: 1.208509087562561 | dt: 1456.61ms | tok/sec: 2812.01 | norm: 4.56\n", "step2831 | loss: 1.167611002922058 | dt: 1451.03ms | tok/sec: 2822.82 | norm: 4.35\n", "step2832 | loss: 1.1835558414459229 | dt: 1452.04ms | 
tok/sec: 2820.86 | norm: 4.13\n", "step2833 | loss: 1.378427505493164 | dt: 1453.30ms | tok/sec: 2818.42 | norm: 4.92\n", "step2834 | loss: 1.2203853130340576 | dt: 1452.99ms | tok/sec: 2819.01 | norm: 4.18\n", "step2835 | loss: 1.0176846981048584 | dt: 1456.25ms | tok/sec: 2812.70 | norm: 4.17\n", "step2836 | loss: 1.0821781158447266 | dt: 1451.82ms | tok/sec: 2821.28 | norm: 4.23\n", "step2837 | loss: 1.1759122610092163 | dt: 1452.76ms | tok/sec: 2819.46 | norm: 4.09\n", "step2838 | loss: 1.0659890174865723 | dt: 1454.09ms | tok/sec: 2816.88 | norm: 4.17\n", "step2839 | loss: 1.1854549646377563 | dt: 1453.23ms | tok/sec: 2818.56 | norm: 4.40\n", "step2840 | loss: 1.093226432800293 | dt: 1458.59ms | tok/sec: 2808.20 | norm: 4.59\n", "step2841 | loss: 1.0563255548477173 | dt: 1455.32ms | tok/sec: 2814.51 | norm: 4.66\n", "step2842 | loss: 0.983540415763855 | dt: 1464.36ms | tok/sec: 2797.12 | norm: 3.98\n", "step2843 | loss: 1.26972496509552 | dt: 1454.07ms | tok/sec: 2816.92 | norm: 4.83\n", "step2844 | loss: 1.1304515600204468 | dt: 1456.17ms | tok/sec: 2812.87 | norm: 4.38\n", "step2845 | loss: 1.034179925918579 | dt: 1454.79ms | tok/sec: 2815.53 | norm: 4.17\n", "step2846 | loss: 0.928015947341919 | dt: 1454.12ms | tok/sec: 2816.82 | norm: 4.11\n", "step2847 | loss: 0.9484207630157471 | dt: 1457.91ms | tok/sec: 2809.50 | norm: 4.55\n", "step2848 | loss: 1.247387170791626 | dt: 1462.53ms | tok/sec: 2800.63 | norm: 4.75\n", "step2849 | loss: 1.327552080154419 | dt: 1447.56ms | tok/sec: 2829.59 | norm: 5.60\n", "step2850 | loss: 1.1469168663024902 | dt: 1446.71ms | tok/sec: 2831.25 | norm: 5.32\n", "step2851 | loss: 0.9477100372314453 | dt: 1455.49ms | tok/sec: 2814.17 | norm: 4.07\n", "step2852 | loss: 0.9268544316291809 | dt: 1464.87ms | tok/sec: 2796.16 | norm: 3.66\n", "step2853 | loss: 0.9969848394393921 | dt: 1454.13ms | tok/sec: 2816.80 | norm: 4.02\n", "step2854 | loss: 0.8470988869667053 | dt: 1460.17ms | tok/sec: 2805.14 | norm: 3.93\n", "step2855 | 
loss: 0.8514853715896606 | dt: 1461.17ms | tok/sec: 2803.24 | norm: 4.24\n", "step2856 | loss: 0.7942746877670288 | dt: 1456.77ms | tok/sec: 2811.70 | norm: 3.65\n", "step2857 | loss: 1.3160382509231567 | dt: 1456.28ms | tok/sec: 2812.64 | norm: 4.81\n", "step2858 | loss: 1.0065733194351196 | dt: 1456.58ms | tok/sec: 2812.06 | norm: 4.24\n", "step2859 | loss: 1.4350730180740356 | dt: 1444.82ms | tok/sec: 2834.96 | norm: 5.32\n", "step2860 | loss: 1.1968880891799927 | dt: 1447.15ms | tok/sec: 2830.39 | norm: 4.62\n", "step2861 | loss: 1.2387750148773193 | dt: 1452.34ms | tok/sec: 2820.28 | norm: 4.46\n", "step2862 | loss: 1.2187830209732056 | dt: 1448.13ms | tok/sec: 2828.48 | norm: 4.70\n", "step2863 | loss: 1.1106582880020142 | dt: 1458.94ms | tok/sec: 2807.53 | norm: 4.43\n", "step2864 | loss: 0.9592660069465637 | dt: 1448.44ms | tok/sec: 2827.86 | norm: 3.90\n", "step2865 | loss: 0.9185810685157776 | dt: 1447.96ms | tok/sec: 2828.81 | norm: 3.86\n", "step2866 | loss: 0.917462170124054 | dt: 1443.00ms | tok/sec: 2838.52 | norm: 3.77\n", "step2867 | loss: 0.8988720178604126 | dt: 1457.35ms | tok/sec: 2810.57 | norm: 3.80\n", "step2868 | loss: 1.0359327793121338 | dt: 1443.61ms | tok/sec: 2837.34 | norm: 4.10\n", "step2869 | loss: 1.0467925071716309 | dt: 1447.24ms | tok/sec: 2830.21 | norm: 3.63\n", "step2870 | loss: 1.052548885345459 | dt: 1461.00ms | tok/sec: 2803.55 | norm: 4.12\n", "step2871 | loss: 0.9926680326461792 | dt: 1456.52ms | tok/sec: 2812.17 | norm: 4.71\n", "step2872 | loss: 1.032594919204712 | dt: 1452.06ms | tok/sec: 2820.82 | norm: 5.00\n", "step2873 | loss: 1.0079683065414429 | dt: 1454.99ms | tok/sec: 2815.14 | norm: 4.91\n", "step2874 | loss: 0.9582314491271973 | dt: 1460.53ms | tok/sec: 2804.47 | norm: 4.36\n", "step2875 | loss: 0.8277407884597778 | dt: 1459.40ms | tok/sec: 2806.63 | norm: 3.84\n", "step2876 | loss: 0.8142133355140686 | dt: 1454.16ms | tok/sec: 2816.75 | norm: 4.09\n", "step2877 | loss: 1.0374400615692139 | dt: 1458.83ms | 
tok/sec: 2807.74 | norm: 4.09\n", "step2878 | loss: 0.8688240051269531 | dt: 1464.17ms | tok/sec: 2797.49 | norm: 3.72\n", "step2879 | loss: 0.8721754550933838 | dt: 1456.13ms | tok/sec: 2812.94 | norm: 3.42\n", "step2880 | loss: 0.7469014525413513 | dt: 1449.73ms | tok/sec: 2825.35 | norm: 3.88\n", "step2881 | loss: 1.0990811586380005 | dt: 1452.89ms | tok/sec: 2819.21 | norm: 4.15\n", "step2882 | loss: 1.0270800590515137 | dt: 1462.49ms | tok/sec: 2800.71 | norm: 3.83\n", "step2883 | loss: 0.9768889546394348 | dt: 1451.22ms | tok/sec: 2822.45 | norm: 3.73\n", "step2884 | loss: 0.9632880687713623 | dt: 1458.21ms | tok/sec: 2808.92 | norm: 3.69\n", "step2885 | loss: 0.9864915609359741 | dt: 1463.24ms | tok/sec: 2799.27 | norm: 4.17\n", "step2886 | loss: 0.988692581653595 | dt: 1456.49ms | tok/sec: 2812.24 | norm: 4.00\n", "step2887 | loss: 0.9653513431549072 | dt: 1452.76ms | tok/sec: 2819.45 | norm: 4.28\n", "step2888 | loss: 0.9158448576927185 | dt: 1461.70ms | tok/sec: 2802.21 | norm: 4.06\n", "step2889 | loss: 0.7721574902534485 | dt: 1449.87ms | tok/sec: 2825.09 | norm: 3.70\n", "step2890 | loss: 0.6815444231033325 | dt: 1460.63ms | tok/sec: 2804.27 | norm: 3.21\n", "step2891 | loss: 0.9479718208312988 | dt: 1452.04ms | tok/sec: 2820.85 | norm: 3.85\n", "step2892 | loss: 1.0283340215682983 | dt: 1453.17ms | tok/sec: 2818.67 | norm: 4.07\n", "step2893 | loss: 1.0391883850097656 | dt: 1455.23ms | tok/sec: 2814.68 | norm: 4.07\n", "step2894 | loss: 0.9444149732589722 | dt: 1457.65ms | tok/sec: 2810.00 | norm: 4.04\n", "step2895 | loss: 0.8478289246559143 | dt: 1446.77ms | tok/sec: 2831.13 | norm: 3.77\n", "step2896 | loss: 0.8958778381347656 | dt: 1461.09ms | tok/sec: 2803.39 | norm: 3.61\n", "step2897 | loss: 0.815375030040741 | dt: 1454.05ms | tok/sec: 2816.95 | norm: 3.55\n", "step2898 | loss: 0.8038696646690369 | dt: 1445.02ms | tok/sec: 2834.57 | norm: 3.65\n", "step2899 | loss: 0.8316746950149536 | dt: 1454.50ms | tok/sec: 2816.09 | norm: 3.33\n", "step2900 
| loss: 0.9504290223121643 | dt: 1459.27ms | tok/sec: 2806.89 | norm: 3.51\n", "step2901 | loss: 1.0735456943511963 | dt: 1461.96ms | tok/sec: 2801.71 | norm: 4.14\n", "step2902 | loss: 0.9852175116539001 | dt: 1455.13ms | tok/sec: 2814.88 | norm: 4.32\n", "step2903 | loss: 1.0724040269851685 | dt: 1458.75ms | tok/sec: 2807.89 | norm: 4.79\n", "step2904 | loss: 0.881240963935852 | dt: 1458.28ms | tok/sec: 2808.79 | norm: 4.21\n", "step2905 | loss: 0.8616688847541809 | dt: 1463.82ms | tok/sec: 2798.16 | norm: 4.04\n", "step2906 | loss: 0.8035010695457458 | dt: 1454.66ms | tok/sec: 2815.77 | norm: 3.95\n", "step2907 | loss: 0.8362366557121277 | dt: 1454.58ms | tok/sec: 2815.94 | norm: 3.85\n", "step2908 | loss: 0.7933135628700256 | dt: 1461.04ms | tok/sec: 2803.47 | norm: 3.61\n", "step2909 | loss: 0.861945629119873 | dt: 1448.32ms | tok/sec: 2828.10 | norm: 3.82\n", "step2910 | loss: 0.8194599151611328 | dt: 1462.18ms | tok/sec: 2801.30 | norm: 3.57\n", "step2911 | loss: 1.2683213949203491 | dt: 1461.00ms | tok/sec: 2803.57 | norm: 4.93\n", "step2912 | loss: 1.085909128189087 | dt: 1452.66ms | tok/sec: 2819.65 | norm: 4.33\n", "step2913 | loss: 1.0355358123779297 | dt: 1452.22ms | tok/sec: 2820.51 | norm: 4.35\n", "step2914 | loss: 1.0340192317962646 | dt: 1442.53ms | tok/sec: 2839.45 | norm: 3.92\n", "step2915 | loss: 1.2535717487335205 | dt: 1458.12ms | tok/sec: 2809.09 | norm: 4.50\n", "step2916 | loss: 1.087054967880249 | dt: 1454.81ms | tok/sec: 2815.49 | norm: 4.06\n", "step2917 | loss: 0.8855079412460327 | dt: 1447.57ms | tok/sec: 2829.58 | norm: 3.75\n", "step2918 | loss: 0.9541178345680237 | dt: 1461.97ms | tok/sec: 2801.69 | norm: 3.72\n", "step2919 | loss: 1.0278609991073608 | dt: 1462.05ms | tok/sec: 2801.54 | norm: 3.99\n", "step2920 | loss: 0.9450352191925049 | dt: 1454.67ms | tok/sec: 2815.76 | norm: 3.92\n", "step2921 | loss: 1.0651934146881104 | dt: 1463.65ms | tok/sec: 2798.48 | norm: 4.05\n", "step2922 | loss: 0.9631255269050598 | dt: 1456.96ms | 
tok/sec: 2811.34 | norm: 3.99\n", "step2923 | loss: 0.9430087208747864 | dt: 1458.31ms | tok/sec: 2808.73 | norm: 4.06\n", "step2924 | loss: 0.8692924380302429 | dt: 1458.23ms | tok/sec: 2808.89 | norm: 3.84\n", "step2925 | loss: 1.1484390497207642 | dt: 1461.11ms | tok/sec: 2803.35 | norm: 5.39\n", "step2926 | loss: 1.0056366920471191 | dt: 1445.01ms | tok/sec: 2834.59 | norm: 4.29\n", "step2927 | loss: 0.9240713715553284 | dt: 1461.70ms | tok/sec: 2802.22 | norm: 3.96\n", "step2928 | loss: 0.8222461938858032 | dt: 1449.37ms | tok/sec: 2826.06 | norm: 3.94\n", "step2929 | loss: 0.8357100486755371 | dt: 1460.34ms | tok/sec: 2804.82 | norm: 3.83\n", "step2930 | loss: 1.0956130027770996 | dt: 1454.73ms | tok/sec: 2815.63 | norm: 4.25\n", "step2931 | loss: 1.184012532234192 | dt: 1452.30ms | tok/sec: 2820.36 | norm: 4.97\n", "step2932 | loss: 1.0328007936477661 | dt: 1448.68ms | tok/sec: 2827.41 | norm: 4.87\n", "step2933 | loss: 0.8619753122329712 | dt: 1446.90ms | tok/sec: 2830.88 | norm: 4.35\n", "step2934 | loss: 0.8351168632507324 | dt: 1458.94ms | tok/sec: 2807.51 | norm: 3.70\n", "step2935 | loss: 0.939581573009491 | dt: 1456.04ms | tok/sec: 2813.11 | norm: 4.49\n", "step2936 | loss: 0.7609719038009644 | dt: 1454.21ms | tok/sec: 2816.65 | norm: 3.71\n", "step2937 | loss: 0.7598926424980164 | dt: 1459.63ms | tok/sec: 2806.18 | norm: 3.84\n", "step2938 | loss: 0.7114464640617371 | dt: 1452.99ms | tok/sec: 2819.02 | norm: 3.38\n", "step2939 | loss: 1.1822361946105957 | dt: 1450.81ms | tok/sec: 2823.26 | norm: 4.84\n", "step2940 | loss: 0.905295729637146 | dt: 1453.99ms | tok/sec: 2817.09 | norm: 3.98\n", "step2941 | loss: 1.2749289274215698 | dt: 1458.30ms | tok/sec: 2808.75 | norm: 4.49\n", "step2942 | loss: 1.0872001647949219 | dt: 1462.07ms | tok/sec: 2801.51 | norm: 4.23\n", "step2943 | loss: 1.135585069656372 | dt: 1445.62ms | tok/sec: 2833.39 | norm: 4.48\n", "step2944 | loss: 1.1148649454116821 | dt: 1464.63ms | tok/sec: 2796.60 | norm: 4.52\n", "step2945 | 
loss: 1.0147030353546143 | dt: 1451.10ms | tok/sec: 2822.69 | norm: 4.38\n", "step2946 | loss: 0.8333762288093567 | dt: 1455.66ms | tok/sec: 2813.85 | norm: 3.80\n", "step2947 | loss: 0.8039814829826355 | dt: 1451.64ms | tok/sec: 2821.63 | norm: 3.64\n", "step2948 | loss: 0.8084970116615295 | dt: 1456.75ms | tok/sec: 2811.75 | norm: 3.55\n", "step2949 | loss: 0.801375687122345 | dt: 1448.05ms | tok/sec: 2828.63 | norm: 3.75\n", "step2950 | loss: 0.9223031997680664 | dt: 1463.03ms | tok/sec: 2799.67 | norm: 4.24\n", "step2951 | loss: 0.9356083869934082 | dt: 1456.64ms | tok/sec: 2811.96 | norm: 3.71\n", "step2952 | loss: 0.9189225435256958 | dt: 1463.20ms | tok/sec: 2799.35 | norm: 3.66\n", "step2953 | loss: 0.860262393951416 | dt: 1445.68ms | tok/sec: 2833.28 | norm: 3.72\n", "step2954 | loss: 0.9004442691802979 | dt: 1451.55ms | tok/sec: 2821.82 | norm: 3.87\n", "step2955 | loss: 0.8802412748336792 | dt: 1450.95ms | tok/sec: 2822.97 | norm: 3.81\n", "step2956 | loss: 0.8475472927093506 | dt: 1448.86ms | tok/sec: 2827.05 | norm: 3.97\n", "step2957 | loss: 0.7241849899291992 | dt: 1453.58ms | tok/sec: 2817.87 | norm: 3.98\n", "step2958 | loss: 0.7314956784248352 | dt: 1461.14ms | tok/sec: 2803.28 | norm: 4.38\n", "step2959 | loss: 0.9194335341453552 | dt: 1464.35ms | tok/sec: 2797.14 | norm: 3.98\n", "step2960 | loss: 0.7558306455612183 | dt: 1448.36ms | tok/sec: 2828.02 | norm: 3.61\n", "step2961 | loss: 0.7730344533920288 | dt: 1449.42ms | tok/sec: 2825.95 | norm: 3.54\n", "step2962 | loss: 0.6681548953056335 | dt: 1454.98ms | tok/sec: 2815.16 | norm: 3.30\n", "step2963 | loss: 0.9896486401557922 | dt: 1460.04ms | tok/sec: 2805.41 | norm: 4.23\n", "step2964 | loss: 0.9258930087089539 | dt: 1446.22ms | tok/sec: 2832.21 | norm: 3.99\n", "step2965 | loss: 0.8664608597755432 | dt: 1457.23ms | tok/sec: 2810.81 | norm: 3.92\n", "step2966 | loss: 0.8455565571784973 | dt: 1458.37ms | tok/sec: 2808.62 | norm: 3.78\n", "step2967 | loss: 0.8533308506011963 | dt: 1455.16ms | 
tok/sec: 2814.81 | norm: 3.77\n", "step2968 | loss: 0.8713729977607727 | dt: 1446.20ms | tok/sec: 2832.25 | norm: 3.82\n", "step2969 | loss: 0.8822567462921143 | dt: 1458.21ms | tok/sec: 2808.92 | norm: 3.88\n", "step2970 | loss: 0.8058870434761047 | dt: 1458.25ms | tok/sec: 2808.84 | norm: 3.48\n", "step2971 | loss: 0.6603051424026489 | dt: 1452.82ms | tok/sec: 2819.34 | norm: 2.89\n", "step2972 | loss: 0.5713136792182922 | dt: 1444.38ms | tok/sec: 2835.83 | norm: 3.10\n", "step2973 | loss: 0.8268544673919678 | dt: 1454.33ms | tok/sec: 2816.42 | norm: 3.74\n", "step2974 | loss: 0.9088233113288879 | dt: 1457.62ms | tok/sec: 2810.06 | norm: 4.00\n", "step2975 | loss: 0.9472142457962036 | dt: 1461.87ms | tok/sec: 2801.90 | norm: 4.31\n", "step2976 | loss: 0.8375464081764221 | dt: 1449.32ms | tok/sec: 2826.15 | norm: 3.67\n", "step2977 | loss: 0.7592130899429321 | dt: 1447.68ms | tok/sec: 2829.35 | norm: 3.72\n", "step2978 | loss: 0.7891196608543396 | dt: 1453.16ms | tok/sec: 2818.68 | norm: 3.63\n", "step2979 | loss: 0.7164847254753113 | dt: 1455.29ms | tok/sec: 2814.57 | norm: 3.46\n", "step2980 | loss: 0.6925807595252991 | dt: 1455.09ms | tok/sec: 2814.95 | norm: 3.47\n", "step2981 | loss: 0.7073298096656799 | dt: 1451.56ms | tok/sec: 2821.79 | norm: 3.37\n", "step2982 | loss: 0.8292332291603088 | dt: 1450.12ms | tok/sec: 2824.59 | norm: 3.64\n", "step2983 | loss: 0.9580586552619934 | dt: 1452.94ms | tok/sec: 2819.12 | norm: 4.17\n", "step2984 | loss: 0.8777264952659607 | dt: 1445.06ms | tok/sec: 2834.49 | norm: 3.87\n", "step2985 | loss: 0.9558143615722656 | dt: 1446.97ms | tok/sec: 2830.74 | norm: 3.93\n", "step2986 | loss: 0.7952451705932617 | dt: 1453.72ms | tok/sec: 2817.60 | norm: 3.70\n", "step2987 | loss: 0.7484288811683655 | dt: 1451.88ms | tok/sec: 2821.17 | norm: 3.52\n", "step2988 | loss: 0.7100903391838074 | dt: 1453.51ms | tok/sec: 2818.01 | norm: 3.72\n", "step2989 | loss: 0.7398033738136292 | dt: 1447.95ms | tok/sec: 2828.82 | norm: 3.75\n", 
"step2990 | loss: 0.6913040280342102 | dt: 1444.94ms | tok/sec: 2834.73 | norm: 3.63\n", "step2991 | loss: 0.7407629489898682 | dt: 1444.71ms | tok/sec: 2835.18 | norm: 3.55\n", "step2992 | loss: 0.7020304203033447 | dt: 1448.87ms | tok/sec: 2827.03 | norm: 3.28\n", "step2993 | loss: 1.15683913230896 | dt: 1459.10ms | tok/sec: 2807.21 | norm: 4.90\n", "step2994 | loss: 0.9806463718414307 | dt: 1447.58ms | tok/sec: 2829.56 | norm: 4.43\n", "step2995 | loss: 0.9512102603912354 | dt: 1452.02ms | tok/sec: 2820.89 | norm: 4.66\n", "step2996 | loss: 0.9380258917808533 | dt: 1454.75ms | tok/sec: 2815.61 | norm: 4.06\n", "step2997 | loss: 1.1364938020706177 | dt: 1453.61ms | tok/sec: 2817.82 | norm: 4.52\n", "step2998 | loss: 0.9692773222923279 | dt: 1455.24ms | tok/sec: 2814.66 | norm: 4.12\n", "step2999 | loss: 0.7761778831481934 | dt: 1454.29ms | tok/sec: 2816.50 | norm: 3.82\n", "step3000 | loss: 0.843026876449585 | dt: 1449.75ms | tok/sec: 2825.31 | norm: 3.66\n", "step3001 | loss: 0.9324593544006348 | dt: 1452.24ms | tok/sec: 2820.46 | norm: 3.92\n", "step3002 | loss: 0.85848468542099 | dt: 1452.52ms | tok/sec: 2819.93 | norm: 3.98\n", "step3003 | loss: 0.9653047323226929 | dt: 1454.47ms | tok/sec: 2816.15 | norm: 3.97\n", "step3004 | loss: 0.8896806836128235 | dt: 1453.05ms | tok/sec: 2818.89 | norm: 4.33\n", "step3005 | loss: 0.829444169998169 | dt: 1454.97ms | tok/sec: 2815.18 | norm: 3.84\n", "step3006 | loss: 0.7642397880554199 | dt: 1459.74ms | tok/sec: 2805.98 | norm: 3.41\n", "step3007 | loss: 1.0453873872756958 | dt: 1457.55ms | tok/sec: 2810.19 | norm: 4.01\n", "step3008 | loss: 0.8925451636314392 | dt: 1439.99ms | tok/sec: 2844.46 | norm: 3.83\n", "step3009 | loss: 0.8101173639297485 | dt: 1455.54ms | tok/sec: 2814.07 | norm: 3.84\n", "step3010 | loss: 0.7107744812965393 | dt: 1450.87ms | tok/sec: 2823.13 | norm: 3.46\n", "step3011 | loss: 0.7232898473739624 | dt: 1452.06ms | tok/sec: 2820.82 | norm: 3.50\n", "step3012 | loss: 0.9850313663482666 | dt: 
1441.04ms | tok/sec: 2842.38 | norm: 4.10\n", "step3013 | loss: 1.0740448236465454 | dt: 1451.98ms | tok/sec: 2820.97 | norm: 4.79\n", "step3014 | loss: 0.9407662153244019 | dt: 1449.65ms | tok/sec: 2825.51 | norm: 4.87\n", "step3015 | loss: 0.7768028378486633 | dt: 1451.45ms | tok/sec: 2822.00 | norm: 4.21\n", "step3016 | loss: 0.7542487382888794 | dt: 1454.21ms | tok/sec: 2816.65 | norm: 3.73\n", "step3017 | loss: 0.8043234944343567 | dt: 1458.73ms | tok/sec: 2807.93 | norm: 3.66\n", "step3018 | loss: 0.6551101207733154 | dt: 1454.13ms | tok/sec: 2816.80 | norm: 3.51\n", "step3019 | loss: 0.6591563820838928 | dt: 1456.39ms | tok/sec: 2812.43 | norm: 3.85\n", "step3020 | loss: 0.6120330691337585 | dt: 1455.00ms | tok/sec: 2815.13 | norm: 3.54\n", "step3021 | loss: 1.0452386140823364 | dt: 1444.30ms | tok/sec: 2835.98 | norm: 4.98\n", "step3022 | loss: 0.7866680026054382 | dt: 1455.33ms | tok/sec: 2814.48 | norm: 3.58\n", "step3023 | loss: 1.1774342060089111 | dt: 1459.49ms | tok/sec: 2806.46 | norm: 5.09\n", "step3024 | loss: 0.975892961025238 | dt: 1445.36ms | tok/sec: 2833.89 | norm: 4.47\n", "step3025 | loss: 1.0730711221694946 | dt: 1453.24ms | tok/sec: 2818.53 | norm: 4.94\n", "step3026 | loss: 1.0193326473236084 | dt: 1448.43ms | tok/sec: 2827.89 | norm: 4.69\n", "step3027 | loss: 0.9209094643592834 | dt: 1447.12ms | tok/sec: 2830.44 | norm: 4.56\n", "step3028 | loss: 0.7615544199943542 | dt: 1446.03ms | tok/sec: 2832.58 | norm: 3.92\n", "step3029 | loss: 0.7181044220924377 | dt: 1447.00ms | tok/sec: 2830.69 | norm: 3.72\n", "step3030 | loss: 0.7078166604042053 | dt: 1453.06ms | tok/sec: 2818.89 | norm: 3.57\n", "step3031 | loss: 0.7081094980239868 | dt: 1452.19ms | tok/sec: 2820.56 | norm: 3.55\n", "step3032 | loss: 0.8248244524002075 | dt: 1452.37ms | tok/sec: 2820.23 | norm: 3.76\n", "step3033 | loss: 0.8366696238517761 | dt: 1448.35ms | tok/sec: 2828.04 | norm: 3.59\n", "step3034 | loss: 0.8209939002990723 | dt: 1449.31ms | tok/sec: 2826.18 | norm: 
3.73\n", "step3035 | loss: 0.7671328783035278 | dt: 1454.03ms | tok/sec: 2817.00 | norm: 3.68\n", "step3036 | loss: 0.8153961896896362 | dt: 1454.80ms | tok/sec: 2815.52 | norm: 4.01\n", "step3037 | loss: 0.8026720881462097 | dt: 1451.23ms | tok/sec: 2822.44 | norm: 3.92\n", "step3038 | loss: 0.7319300770759583 | dt: 1450.52ms | tok/sec: 2823.81 | norm: 3.51\n", "step3039 | loss: 0.6078813076019287 | dt: 1452.47ms | tok/sec: 2820.03 | norm: 3.08\n", "step3040 | loss: 0.6168999075889587 | dt: 1448.70ms | tok/sec: 2827.35 | norm: 3.32\n", "step3041 | loss: 0.7816909551620483 | dt: 1438.34ms | tok/sec: 2847.73 | norm: 3.49\n", "step3042 | loss: 0.6600026488304138 | dt: 1447.49ms | tok/sec: 2829.72 | norm: 3.73\n", "step3043 | loss: 0.6863455772399902 | dt: 1436.41ms | tok/sec: 2851.56 | norm: 3.78\n", "step3044 | loss: 0.5733802914619446 | dt: 1450.79ms | tok/sec: 2823.30 | norm: 3.29\n", "step3045 | loss: 0.8659278750419617 | dt: 1453.63ms | tok/sec: 2817.77 | norm: 3.82\n", "step3046 | loss: 0.8076178431510925 | dt: 1454.45ms | tok/sec: 2816.18 | norm: 3.57\n", "step3047 | loss: 0.7642230987548828 | dt: 1452.92ms | tok/sec: 2819.15 | norm: 3.72\n", "step3048 | loss: 0.7478429079055786 | dt: 1453.33ms | tok/sec: 2818.36 | norm: 3.51\n", "step3049 | loss: 0.75041264295578 | dt: 1451.37ms | tok/sec: 2822.16 | norm: 3.81\n", "step3050 | loss: 0.769582211971283 | dt: 1451.66ms | tok/sec: 2821.60 | norm: 3.79\n", "step3051 | loss: 0.7469361424446106 | dt: 1443.14ms | tok/sec: 2838.26 | norm: 3.61\n", "step3052 | loss: 0.682750403881073 | dt: 1445.97ms | tok/sec: 2832.70 | norm: 3.37\n", "step3053 | loss: 0.5470719933509827 | dt: 1452.67ms | tok/sec: 2819.63 | norm: 2.89\n", "step3054 | loss: 0.4971471130847931 | dt: 1453.96ms | tok/sec: 2817.14 | norm: 3.02\n", "step3055 | loss: 0.7603638768196106 | dt: 1451.53ms | tok/sec: 2821.85 | norm: 3.87\n", "step3056 | loss: 0.8249877095222473 | dt: 1452.80ms | tok/sec: 2819.39 | norm: 4.05\n", "step3057 | loss: 0.8443918228149414 
| dt: 1456.56ms | tok/sec: 2812.10 | norm: 3.78\n", "step3058 | loss: 0.7251178026199341 | dt: 1451.77ms | tok/sec: 2821.39 | norm: 3.66\n", "step3059 | loss: 0.6809489727020264 | dt: 1437.07ms | tok/sec: 2850.24 | norm: 3.77\n", "step3060 | loss: 0.686642587184906 | dt: 1449.01ms | tok/sec: 2826.77 | norm: 3.53\n", "step3061 | loss: 0.6300835609436035 | dt: 1440.79ms | tok/sec: 2842.89 | norm: 3.54\n", "step3062 | loss: 0.6024361252784729 | dt: 1445.63ms | tok/sec: 2833.36 | norm: 3.56\n", "step3063 | loss: 0.6183084845542908 | dt: 1450.11ms | tok/sec: 2824.61 | norm: 3.36\n", "step3064 | loss: 0.7265853881835938 | dt: 1455.96ms | tok/sec: 2813.25 | norm: 3.67\n", "step3065 | loss: 0.8499524593353271 | dt: 1438.99ms | tok/sec: 2846.44 | norm: 4.14\n", "step3066 | loss: 0.7641186714172363 | dt: 1449.10ms | tok/sec: 2826.58 | norm: 3.75\n", "step3067 | loss: 0.8338185548782349 | dt: 1441.54ms | tok/sec: 2841.40 | norm: 3.78\n", "step3068 | loss: 0.6767992973327637 | dt: 1449.47ms | tok/sec: 2825.85 | norm: 3.20\n", "step3069 | loss: 0.6285337805747986 | dt: 1453.56ms | tok/sec: 2817.91 | norm: 3.13\n", "step3070 | loss: 0.6095362305641174 | dt: 1446.19ms | tok/sec: 2832.28 | norm: 3.22\n", "step3071 | loss: 0.6399233937263489 | dt: 1439.49ms | tok/sec: 2845.45 | norm: 3.48\n", "step3072 | loss: 0.589543879032135 | dt: 1453.64ms | tok/sec: 2817.76 | norm: 3.27\n", "step3073 | loss: 0.6211308240890503 | dt: 1442.04ms | tok/sec: 2840.42 | norm: 3.36\n", "step3074 | loss: 0.5983361005783081 | dt: 1448.07ms | tok/sec: 2828.60 | norm: 3.21\n", "step3075 | loss: 1.0449376106262207 | dt: 1454.48ms | tok/sec: 2816.13 | norm: 4.80\n", "step3076 | loss: 0.8752556443214417 | dt: 1446.87ms | tok/sec: 2830.93 | norm: 4.26\n", "step3077 | loss: 0.8245408535003662 | dt: 1447.13ms | tok/sec: 2830.44 | norm: 4.03\n", "step3078 | loss: 0.8158571124076843 | dt: 1450.93ms | tok/sec: 2823.02 | norm: 3.77\n", "step3079 | loss: 1.0051718950271606 | dt: 1454.01ms | tok/sec: 2817.04 | norm: 
4.15\n", "step3080 | loss: 0.8687157034873962 | dt: 1453.67ms | tok/sec: 2817.70 | norm: 3.87\n", "step3081 | loss: 0.6655299067497253 | dt: 1450.80ms | tok/sec: 2823.26 | norm: 3.56\n", "step3082 | loss: 0.7404137253761292 | dt: 1453.38ms | tok/sec: 2818.26 | norm: 3.67\n", "step3083 | loss: 0.807444155216217 | dt: 1455.26ms | tok/sec: 2814.62 | norm: 3.84\n", "step3084 | loss: 0.7359960079193115 | dt: 1440.70ms | tok/sec: 2843.07 | norm: 3.69\n", "step3085 | loss: 0.8331807255744934 | dt: 1445.17ms | tok/sec: 2834.27 | norm: 3.67\n", "step3086 | loss: 0.7649609446525574 | dt: 1441.24ms | tok/sec: 2842.00 | norm: 3.63\n", "step3087 | loss: 0.7192921042442322 | dt: 1448.66ms | tok/sec: 2827.44 | norm: 3.54\n", "step3088 | loss: 0.6550933718681335 | dt: 1454.05ms | tok/sec: 2816.96 | norm: 3.26\n", "step3089 | loss: 0.9361034035682678 | dt: 1448.59ms | tok/sec: 2827.57 | norm: 4.36\n", "step3090 | loss: 0.7831045389175415 | dt: 1438.62ms | tok/sec: 2847.18 | norm: 3.56\n", "step3091 | loss: 0.7098761200904846 | dt: 1453.06ms | tok/sec: 2818.87 | norm: 3.42\n", "step3092 | loss: 0.599795937538147 | dt: 1450.44ms | tok/sec: 2823.97 | norm: 3.15\n", "step3093 | loss: 0.6304019093513489 | dt: 1450.86ms | tok/sec: 2823.15 | norm: 3.37\n", "step3094 | loss: 0.8553818464279175 | dt: 1440.80ms | tok/sec: 2842.87 | norm: 3.76\n", "step3095 | loss: 0.9632614850997925 | dt: 1449.35ms | tok/sec: 2826.09 | norm: 4.42\n", "step3096 | loss: 0.8260703086853027 | dt: 1452.99ms | tok/sec: 2819.02 | norm: 4.23\n", "step3097 | loss: 0.6774129867553711 | dt: 1456.62ms | tok/sec: 2811.98 | norm: 3.73\n", "step3098 | loss: 0.6390328407287598 | dt: 1453.04ms | tok/sec: 2818.92 | norm: 3.06\n", "step3099 | loss: 0.7078114748001099 | dt: 1446.53ms | tok/sec: 2831.60 | norm: 3.78\n", "step3100 | loss: 0.5643730759620667 | dt: 1443.05ms | tok/sec: 2838.43 | norm: 3.54\n", "step3101 | loss: 0.5760180354118347 | dt: 1448.51ms | tok/sec: 2827.74 | norm: 3.69\n", "step3102 | loss: 
0.5567451119422913 | dt: 1453.23ms | tok/sec: 2818.55 | norm: 3.73\n", "step3103 | loss: 0.9633710384368896 | dt: 1454.55ms | tok/sec: 2815.99 | norm: 4.74\n", "step3104 | loss: 0.6993299722671509 | dt: 1456.40ms | tok/sec: 2812.41 | norm: 3.87\n", "step3105 | loss: 1.0422857999801636 | dt: 1452.56ms | tok/sec: 2819.84 | norm: 5.02\n", "step3106 | loss: 0.8437035083770752 | dt: 1446.82ms | tok/sec: 2831.04 | norm: 4.10\n", "step3107 | loss: 0.9595147371292114 | dt: 1458.93ms | tok/sec: 2807.54 | norm: 4.76\n", "step3108 | loss: 0.9262387156486511 | dt: 1449.60ms | tok/sec: 2825.61 | norm: 4.89\n", "step3109 | loss: 0.8326346278190613 | dt: 1451.73ms | tok/sec: 2821.45 | norm: 4.79\n", "step3110 | loss: 0.6763478517532349 | dt: 1449.91ms | tok/sec: 2825.01 | norm: 3.68\n", "step3111 | loss: 0.6270908117294312 | dt: 1455.05ms | tok/sec: 2815.02 | norm: 3.47\n", "step3112 | loss: 0.615352988243103 | dt: 1456.66ms | tok/sec: 2811.91 | norm: 3.21\n", "step3113 | loss: 0.615131676197052 | dt: 1452.25ms | tok/sec: 2820.45 | norm: 3.23\n", "step3114 | loss: 0.7359529733657837 | dt: 1455.06ms | tok/sec: 2815.01 | norm: 3.81\n", "step3115 | loss: 0.725153923034668 | dt: 1454.69ms | tok/sec: 2815.72 | norm: 3.58\n", "step3116 | loss: 0.7027708292007446 | dt: 1452.86ms | tok/sec: 2819.27 | norm: 3.59\n", "step3117 | loss: 0.6270528435707092 | dt: 1444.09ms | tok/sec: 2836.39 | norm: 3.30\n", "step3118 | loss: 0.6919668316841125 | dt: 1450.25ms | tok/sec: 2824.34 | norm: 3.35\n", "step3119 | loss: 0.6764122843742371 | dt: 1454.28ms | tok/sec: 2816.52 | norm: 3.33\n", "step3120 | loss: 0.6209636330604553 | dt: 1453.76ms | tok/sec: 2817.51 | norm: 3.19\n", "step3121 | loss: 0.5067375302314758 | dt: 1457.42ms | tok/sec: 2810.44 | norm: 3.00\n", "step3122 | loss: 0.5145391821861267 | dt: 1452.87ms | tok/sec: 2819.25 | norm: 3.14\n", "step3123 | loss: 0.6658791303634644 | dt: 1455.46ms | tok/sec: 2814.24 | norm: 3.26\n", "step3124 | loss: 0.5690696835517883 | dt: 1454.77ms | 
tok/sec: 2815.57 | norm: 3.33\n", "step3125 | loss: 0.5731169581413269 | dt: 1458.13ms | tok/sec: 2809.07 | norm: 2.70\n", "step3126 | loss: 0.4764188826084137 | dt: 1453.91ms | tok/sec: 2817.24 | norm: 2.80\n", "step3127 | loss: 0.746883749961853 | dt: 1453.13ms | tok/sec: 2818.74 | norm: 3.54\n", "step3128 | loss: 0.6960915327072144 | dt: 1455.89ms | tok/sec: 2813.40 | norm: 3.51\n", "step3129 | loss: 0.6589552164077759 | dt: 1459.24ms | tok/sec: 2806.94 | norm: 3.49\n", "step3130 | loss: 0.6422629952430725 | dt: 1438.95ms | tok/sec: 2846.51 | norm: 3.44\n", "step3131 | loss: 0.6521874666213989 | dt: 1456.82ms | tok/sec: 2811.61 | norm: 3.61\n", "step3132 | loss: 0.6606907248497009 | dt: 1461.16ms | tok/sec: 2803.26 | norm: 3.15\n", "step3133 | loss: 0.638276219367981 | dt: 1446.21ms | tok/sec: 2832.23 | norm: 3.17\n", "step3134 | loss: 0.5757160782814026 | dt: 1461.64ms | tok/sec: 2802.33 | norm: 3.04\n", "step3135 | loss: 0.45389220118522644 | dt: 1455.12ms | tok/sec: 2814.89 | norm: 2.73\n", "step3136 | loss: 0.40404221415519714 | dt: 1455.80ms | tok/sec: 2813.56 | norm: 2.40\n", "step3137 | loss: 0.6341502666473389 | dt: 1451.07ms | tok/sec: 2822.75 | norm: 3.48\n", "step3138 | loss: 0.7239356637001038 | dt: 1454.00ms | tok/sec: 2817.06 | norm: 3.99\n", "step3139 | loss: 0.7293887138366699 | dt: 1452.45ms | tok/sec: 2820.07 | norm: 3.80\n", "step3140 | loss: 0.6292388439178467 | dt: 1451.90ms | tok/sec: 2821.13 | norm: 3.36\n", "step3141 | loss: 0.5766403079032898 | dt: 1446.81ms | tok/sec: 2831.05 | norm: 3.27\n", "step3142 | loss: 0.591642439365387 | dt: 1464.12ms | tok/sec: 2797.58 | norm: 3.54\n", "step3143 | loss: 0.5237610340118408 | dt: 1447.37ms | tok/sec: 2829.95 | norm: 3.16\n", "step3144 | loss: 0.5116943717002869 | dt: 1448.15ms | tok/sec: 2828.43 | norm: 3.44\n", "step3145 | loss: 0.5513384938240051 | dt: 1459.55ms | tok/sec: 2806.35 | norm: 3.31\n", "step3146 | loss: 0.6237200498580933 | dt: 1460.88ms | tok/sec: 2803.79 | norm: 3.23\n", 
"step3147 | loss: 0.7259489893913269 | dt: 1449.69ms | tok/sec: 2825.42 | norm: 3.71\n", "step3148 | loss: 0.6543776988983154 | dt: 1458.19ms | tok/sec: 2808.97 | norm: 3.70\n", "step3149 | loss: 0.7221717238426208 | dt: 1461.39ms | tok/sec: 2802.82 | norm: 3.62\n", "step3150 | loss: 0.5612224340438843 | dt: 1442.38ms | tok/sec: 2839.74 | norm: 2.95\n", "step3151 | loss: 0.5345032811164856 | dt: 1457.57ms | tok/sec: 2810.16 | norm: 3.12\n", "step3152 | loss: 0.5209262371063232 | dt: 1454.26ms | tok/sec: 2816.55 | norm: 3.04\n", "step3153 | loss: 0.5423387885093689 | dt: 1458.09ms | tok/sec: 2809.16 | norm: 3.22\n", "step3154 | loss: 0.5044002532958984 | dt: 1461.18ms | tok/sec: 2803.21 | norm: 3.21\n", "step3155 | loss: 0.5322731137275696 | dt: 1442.52ms | tok/sec: 2839.48 | norm: 3.06\n", "step3156 | loss: 0.5086583495140076 | dt: 1451.94ms | tok/sec: 2821.06 | norm: 2.93\n", "step3157 | loss: 0.9358139038085938 | dt: 1460.36ms | tok/sec: 2804.79 | norm: 4.87\n", "step3158 | loss: 0.7702004313468933 | dt: 1456.07ms | tok/sec: 2813.06 | norm: 4.14\n", "step3159 | loss: 0.7277229428291321 | dt: 1449.69ms | tok/sec: 2825.44 | norm: 3.76\n", "step3160 | loss: 0.7169536352157593 | dt: 1460.48ms | tok/sec: 2804.55 | norm: 3.65\n", "step3161 | loss: 0.9076639413833618 | dt: 1456.50ms | tok/sec: 2812.22 | norm: 4.22\n", "step3162 | loss: 0.7619693279266357 | dt: 1459.53ms | tok/sec: 2806.38 | norm: 3.72\n", "step3163 | loss: 0.5953885912895203 | dt: 1456.45ms | tok/sec: 2812.31 | norm: 3.74\n", "step3164 | loss: 0.6626473069190979 | dt: 1452.95ms | tok/sec: 2819.09 | norm: 3.57\n", "step3165 | loss: 0.7233704924583435 | dt: 1454.19ms | tok/sec: 2816.70 | norm: 4.36\n", "step3166 | loss: 0.6367253065109253 | dt: 1455.22ms | tok/sec: 2814.70 | norm: 3.75\n", "step3167 | loss: 0.7438826560974121 | dt: 1455.86ms | tok/sec: 2813.45 | norm: 4.01\n", "step3168 | loss: 0.6658204197883606 | dt: 1457.85ms | tok/sec: 2809.62 | norm: 3.70\n", "step3169 | loss: 0.6264967322349548 | 
dt: 1456.39ms | tok/sec: 2812.42 | norm: 3.68\n", "step3170 | loss: 0.579067587852478 | dt: 1463.76ms | tok/sec: 2798.26 | norm: 3.36\n", "step3171 | loss: 0.8483079671859741 | dt: 1450.92ms | tok/sec: 2823.03 | norm: 3.89\n", "step3172 | loss: 0.6823495030403137 | dt: 1463.27ms | tok/sec: 2799.21 | norm: 3.30\n", "step3173 | loss: 0.6159766316413879 | dt: 1456.59ms | tok/sec: 2812.05 | norm: 3.20\n", "step3174 | loss: 0.5325368642807007 | dt: 1451.12ms | tok/sec: 2822.65 | norm: 3.40\n", "step3175 | loss: 0.5479498505592346 | dt: 1459.18ms | tok/sec: 2807.06 | norm: 3.21\n", "step3176 | loss: 0.7422831654548645 | dt: 1458.28ms | tok/sec: 2808.79 | norm: 3.56\n", "step3177 | loss: 0.8352875709533691 | dt: 1461.00ms | tok/sec: 2803.56 | norm: 4.29\n", "step3178 | loss: 0.7003669738769531 | dt: 1457.75ms | tok/sec: 2809.82 | norm: 3.74\n", "step3179 | loss: 0.5529468059539795 | dt: 1455.93ms | tok/sec: 2813.32 | norm: 3.06\n", "step3180 | loss: 0.530333936214447 | dt: 1458.49ms | tok/sec: 2808.38 | norm: 2.97\n", "step3181 | loss: 0.6095189452171326 | dt: 1445.24ms | tok/sec: 2834.13 | norm: 3.17\n", "step3182 | loss: 0.47059813141822815 | dt: 1455.85ms | tok/sec: 2813.48 | norm: 2.87\n", "step3183 | loss: 0.4824695587158203 | dt: 1459.92ms | tok/sec: 2805.63 | norm: 3.05\n", "step3184 | loss: 0.45050182938575745 | dt: 1462.04ms | tok/sec: 2801.56 | norm: 2.86\n", "step3185 | loss: 0.8493824005126953 | dt: 1448.32ms | tok/sec: 2828.10 | norm: 4.43\n", "step3186 | loss: 0.6241158246994019 | dt: 1455.06ms | tok/sec: 2814.99 | norm: 3.64\n", "step3187 | loss: 0.9734228253364563 | dt: 1453.23ms | tok/sec: 2818.54 | norm: 4.80\n", "step3188 | loss: 0.7888623476028442 | dt: 1455.56ms | tok/sec: 2814.05 | norm: 4.60\n", "step3189 | loss: 0.8580431938171387 | dt: 1461.89ms | tok/sec: 2801.85 | norm: 4.72\n", "step3190 | loss: 0.8123621940612793 | dt: 1454.83ms | tok/sec: 2815.45 | norm: 4.87\n", "step3191 | loss: 0.7284032702445984 | dt: 1460.35ms | tok/sec: 2804.80 | norm: 
4.21\n", "step3192 | loss: 0.5781034231185913 | dt: 1455.43ms | tok/sec: 2814.29 | norm: 3.60\n", "step3193 | loss: 0.5465691089630127 | dt: 1460.05ms | tok/sec: 2805.39 | norm: 3.64\n", "step3194 | loss: 0.5314151644706726 | dt: 1445.12ms | tok/sec: 2834.38 | norm: 3.32\n", "step3195 | loss: 0.5261644124984741 | dt: 1454.49ms | tok/sec: 2816.11 | norm: 2.94\n", "step3196 | loss: 0.64905846118927 | dt: 1455.10ms | tok/sec: 2814.93 | norm: 3.43\n", "step3197 | loss: 0.633415699005127 | dt: 1460.94ms | tok/sec: 2803.67 | norm: 3.21\n", "step3198 | loss: 0.5876832008361816 | dt: 1446.15ms | tok/sec: 2832.35 | norm: 3.04\n", "step3199 | loss: 0.5096126198768616 | dt: 1455.71ms | tok/sec: 2813.76 | norm: 3.15\n", "step3200 | loss: 0.5573536157608032 | dt: 1459.80ms | tok/sec: 2805.87 | norm: 2.99\n", "step3201 | loss: 0.545637845993042 | dt: 1454.96ms | tok/sec: 2815.20 | norm: 2.98\n", "step3202 | loss: 0.5192943811416626 | dt: 1452.65ms | tok/sec: 2819.68 | norm: 3.17\n", "step3203 | loss: 0.410454124212265 | dt: 1451.69ms | tok/sec: 2821.54 | norm: 2.58\n", "step3204 | loss: 0.4169868230819702 | dt: 1449.21ms | tok/sec: 2826.37 | norm: 2.79\n", "step3205 | loss: 0.5582166910171509 | dt: 1462.21ms | tok/sec: 2801.23 | norm: 2.92\n", "step3206 | loss: 0.4668706953525543 | dt: 1453.70ms | tok/sec: 2817.64 | norm: 2.70\n", "step3207 | loss: 0.4719736576080322 | dt: 1453.71ms | tok/sec: 2817.61 | norm: 2.66\n", "step3208 | loss: 0.3989318609237671 | dt: 1450.50ms | tok/sec: 2823.86 | norm: 2.72\n", "step3209 | loss: 0.6447394490242004 | dt: 1459.14ms | tok/sec: 2807.14 | norm: 3.33\n", "step3210 | loss: 0.5888794660568237 | dt: 1455.40ms | tok/sec: 2814.34 | norm: 3.08\n", "step3211 | loss: 0.546201765537262 | dt: 1437.56ms | tok/sec: 2849.27 | norm: 3.08\n", "step3212 | loss: 0.5415995121002197 | dt: 1452.49ms | tok/sec: 2819.99 | norm: 2.99\n", "step3213 | loss: 0.5627989768981934 | dt: 1453.60ms | tok/sec: 2817.82 | norm: 3.58\n", "step3214 | loss: 0.5641206502914429 | 
dt: 1449.34ms | tok/sec: 2826.11 | norm: 3.32\n", "step3215 | loss: 0.5376478433609009 | dt: 1451.99ms | tok/sec: 2820.97 | norm: 3.15\n", "step3216 | loss: 0.4899735152721405 | dt: 1452.99ms | tok/sec: 2819.02 | norm: 3.00\n", "step3217 | loss: 0.3753836452960968 | dt: 1452.80ms | tok/sec: 2819.38 | norm: 2.52\n", "step3218 | loss: 0.32699838280677795 | dt: 1453.68ms | tok/sec: 2817.68 | norm: 2.43\n", "step3219 | loss: 0.5495966076850891 | dt: 1452.21ms | tok/sec: 2820.52 | norm: 3.39\n", "step3220 | loss: 0.6108653545379639 | dt: 1453.42ms | tok/sec: 2818.19 | norm: 3.47\n", "step3221 | loss: 0.6203581094741821 | dt: 1452.99ms | tok/sec: 2819.01 | norm: 3.43\n", "step3222 | loss: 0.5324509143829346 | dt: 1451.60ms | tok/sec: 2821.71 | norm: 3.16\n", "step3223 | loss: 0.5005719661712646 | dt: 1449.60ms | tok/sec: 2825.60 | norm: 3.40\n", "step3224 | loss: 0.4966569244861603 | dt: 1443.34ms | tok/sec: 2837.85 | norm: 3.05\n", "step3225 | loss: 0.45657241344451904 | dt: 1449.37ms | tok/sec: 2826.06 | norm: 3.12\n", "step3226 | loss: 0.43619638681411743 | dt: 1461.08ms | tok/sec: 2803.40 | norm: 3.03\n", "step3227 | loss: 0.4643835723400116 | dt: 1457.54ms | tok/sec: 2810.21 | norm: 3.11\n", "step3228 | loss: 0.5298284888267517 | dt: 1439.42ms | tok/sec: 2845.58 | norm: 3.47\n", "step3229 | loss: 0.6559164524078369 | dt: 1448.32ms | tok/sec: 2828.11 | norm: 4.29\n", "step3230 | loss: 0.5861451625823975 | dt: 1452.39ms | tok/sec: 2820.18 | norm: 3.90\n", "step3231 | loss: 0.6440727710723877 | dt: 1449.38ms | tok/sec: 2826.04 | norm: 3.74\n", "step3232 | loss: 0.48518675565719604 | dt: 1439.12ms | tok/sec: 2846.18 | norm: 3.12\n", "step3233 | loss: 0.45376890897750854 | dt: 1452.93ms | tok/sec: 2819.12 | norm: 3.11\n", "step3234 | loss: 0.4421846866607666 | dt: 1452.50ms | tok/sec: 2819.97 | norm: 3.55\n", "step3235 | loss: 0.476824015378952 | dt: 1442.14ms | tok/sec: 2840.22 | norm: 3.46\n", "step3236 | loss: 0.4305087625980377 | dt: 1447.96ms | tok/sec: 2828.82 | 
norm: 2.88\n", "step3237 | loss: 0.46858158707618713 | dt: 1452.78ms | tok/sec: 2819.42 | norm: 3.07\n", "step3238 | loss: 0.4266170263290405 | dt: 1451.02ms | tok/sec: 2822.84 | norm: 2.79\n", "step3239 | loss: 0.8594128489494324 | dt: 1449.16ms | tok/sec: 2826.46 | norm: 4.55\n", "step3240 | loss: 0.7184247970581055 | dt: 1443.57ms | tok/sec: 2837.42 | norm: 4.48\n", "step3241 | loss: 0.6684809327125549 | dt: 1452.87ms | tok/sec: 2819.26 | norm: 4.09\n", "step3242 | loss: 0.6494631171226501 | dt: 1450.43ms | tok/sec: 2823.99 | norm: 3.72\n", "step3243 | loss: 0.8350256085395813 | dt: 1457.15ms | tok/sec: 2810.96 | norm: 4.42\n", "step3244 | loss: 0.6628912091255188 | dt: 1446.49ms | tok/sec: 2831.68 | norm: 3.65\n", "step3245 | loss: 0.5057651996612549 | dt: 1451.60ms | tok/sec: 2821.72 | norm: 3.14\n", "step3246 | loss: 0.5714990496635437 | dt: 1449.12ms | tok/sec: 2826.54 | norm: 2.95\n", "step3247 | loss: 0.6161668300628662 | dt: 1455.66ms | tok/sec: 2813.85 | norm: 3.38\n", "step3248 | loss: 0.5453567504882812 | dt: 1444.54ms | tok/sec: 2835.50 | norm: 3.12\n", "step3249 | loss: 0.6514466404914856 | dt: 1456.14ms | tok/sec: 2812.91 | norm: 3.68\n", "step3250 | loss: 0.5655698776245117 | dt: 1452.44ms | tok/sec: 2820.09 | norm: 3.37\n", "step3251 | loss: 0.5087469220161438 | dt: 1446.79ms | tok/sec: 2831.10 | norm: 3.16\n", "step3252 | loss: 0.48554444313049316 | dt: 1456.42ms | tok/sec: 2812.37 | norm: 3.33\n", "step3253 | loss: 0.7059036493301392 | dt: 1437.47ms | tok/sec: 2849.45 | norm: 3.58\n", "step3254 | loss: 0.5876257419586182 | dt: 1447.35ms | tok/sec: 2830.00 | norm: 3.57\n", "step3255 | loss: 0.5229901075363159 | dt: 1453.20ms | tok/sec: 2818.60 | norm: 3.18\n", "step3256 | loss: 0.4406493604183197 | dt: 1455.33ms | tok/sec: 2814.49 | norm: 2.90\n", "step3257 | loss: 0.4645979404449463 | dt: 1451.95ms | tok/sec: 2821.03 | norm: 3.08\n", "step3258 | loss: 0.6612206697463989 | dt: 1454.31ms | tok/sec: 2816.46 | norm: 3.81\n", "step3259 | loss: 
0.7391958236694336 | dt: 1442.64ms | tok/sec: 2839.25 | norm: 4.08\n", "step3260 | loss: 0.600841224193573 | dt: 1448.31ms | tok/sec: 2828.12 | norm: 3.56\n", "step3261 | loss: 0.4589204490184784 | dt: 1451.70ms | tok/sec: 2821.51 | norm: 2.90\n", "step3262 | loss: 0.452273428440094 | dt: 1445.39ms | tok/sec: 2833.83 | norm: 2.81\n", "step3263 | loss: 0.5191649198532104 | dt: 1453.10ms | tok/sec: 2818.79 | norm: 2.90\n", "step3264 | loss: 0.37884098291397095 | dt: 1448.05ms | tok/sec: 2828.63 | norm: 2.67\n", "step3265 | loss: 0.4020369350910187 | dt: 1447.85ms | tok/sec: 2829.02 | norm: 2.72\n", "step3266 | loss: 0.37949439883232117 | dt: 1446.70ms | tok/sec: 2831.27 | norm: 2.73\n", "step3267 | loss: 0.7595158815383911 | dt: 1445.23ms | tok/sec: 2834.16 | norm: 4.19\n", "step3268 | loss: 0.5332900881767273 | dt: 1452.46ms | tok/sec: 2820.04 | norm: 3.39\n", "step3269 | loss: 0.8507614135742188 | dt: 1453.28ms | tok/sec: 2818.45 | norm: 4.41\n", "step3270 | loss: 0.6870543360710144 | dt: 1451.57ms | tok/sec: 2821.78 | norm: 4.15\n", "step3271 | loss: 0.7830570340156555 | dt: 1450.18ms | tok/sec: 2824.47 | norm: 4.47\n", "step3272 | loss: 0.7288860082626343 | dt: 1453.47ms | tok/sec: 2818.09 | norm: 4.25\n", "step3273 | loss: 0.6457468271255493 | dt: 1438.60ms | tok/sec: 2847.22 | norm: 4.01\n", "step3274 | loss: 0.49158358573913574 | dt: 1453.57ms | tok/sec: 2817.89 | norm: 3.23\n", "step3275 | loss: 0.4674893021583557 | dt: 1449.48ms | tok/sec: 2825.84 | norm: 3.17\n", "step3276 | loss: 0.4616836607456207 | dt: 1441.05ms | tok/sec: 2842.38 | norm: 3.23\n", "step3277 | loss: 0.45026513934135437 | dt: 1445.82ms | tok/sec: 2832.99 | norm: 3.06\n", "step3278 | loss: 0.5760475993156433 | dt: 1436.47ms | tok/sec: 2851.43 | norm: 3.81\n", "step3279 | loss: 0.5850035548210144 | dt: 1443.89ms | tok/sec: 2836.77 | norm: 3.45\n", "step3280 | loss: 0.5290530920028687 | dt: 1445.14ms | tok/sec: 2834.32 | norm: 3.32\n", "step3281 | loss: 0.4439435303211212 | dt: 1450.81ms | 
tok/sec: 2823.25 | norm: 3.11\n", "step3282 | loss: 0.4739343225955963 | dt: 1452.46ms | tok/sec: 2820.05 | norm: 3.25\n", "step3283 | loss: 0.46933072805404663 | dt: 1451.85ms | tok/sec: 2821.23 | norm: 3.17\n", "step3284 | loss: 0.43187856674194336 | dt: 1454.23ms | tok/sec: 2816.62 | norm: 3.16\n", "step3285 | loss: 0.33772826194763184 | dt: 1453.22ms | tok/sec: 2818.56 | norm: 2.65\n", "step3286 | loss: 0.34587737917900085 | dt: 1450.37ms | tok/sec: 2824.10 | norm: 2.65\n", "step3287 | loss: 0.46831414103507996 | dt: 1452.43ms | tok/sec: 2820.11 | norm: 2.90\n", "step3288 | loss: 0.3761124014854431 | dt: 1454.11ms | tok/sec: 2816.84 | norm: 2.52\n", "step3289 | loss: 0.3956233263015747 | dt: 1454.21ms | tok/sec: 2816.65 | norm: 2.87\n", "step3290 | loss: 0.3289048969745636 | dt: 1449.05ms | tok/sec: 2826.69 | norm: 2.48\n", "step3291 | loss: 0.5769907832145691 | dt: 1452.24ms | tok/sec: 2820.48 | norm: 3.74\n", "step3292 | loss: 0.509503185749054 | dt: 1445.70ms | tok/sec: 2833.23 | norm: 3.14\n", "step3293 | loss: 0.4786399304866791 | dt: 1448.57ms | tok/sec: 2827.62 | norm: 3.10\n", "step3294 | loss: 0.46786314249038696 | dt: 1450.02ms | tok/sec: 2824.78 | norm: 3.14\n", "step3295 | loss: 0.48959821462631226 | dt: 1449.06ms | tok/sec: 2826.66 | norm: 3.22\n", "step3296 | loss: 0.4792274236679077 | dt: 1452.94ms | tok/sec: 2819.11 | norm: 3.08\n", "step3297 | loss: 0.45295414328575134 | dt: 1452.90ms | tok/sec: 2819.20 | norm: 2.92\n", "step3298 | loss: 0.41414692997932434 | dt: 1438.72ms | tok/sec: 2846.97 | norm: 2.88\n", "step3299 | loss: 0.3148679733276367 | dt: 1448.88ms | tok/sec: 2827.01 | norm: 2.56\n", "step3300 | loss: 0.27040529251098633 | dt: 1453.63ms | tok/sec: 2817.77 | norm: 2.36\n", "step3301 | loss: 0.4701016843318939 | dt: 1438.11ms | tok/sec: 2848.18 | norm: 3.03\n", "step3302 | loss: 0.5274250507354736 | dt: 1449.91ms | tok/sec: 2825.00 | norm: 3.32\n", "step3303 | loss: 0.5321292877197266 | dt: 1446.93ms | tok/sec: 2830.81 | norm: 
3.16\n", "step3304 | loss: 0.4585552513599396 | dt: 1444.86ms | tok/sec: 2834.87 | norm: 3.22\n", "step3305 | loss: 0.42502185702323914 | dt: 1446.54ms | tok/sec: 2831.58 | norm: 2.92\n", "step3306 | loss: 0.4015553891658783 | dt: 1452.58ms | tok/sec: 2819.80 | norm: 2.63\n", "step3307 | loss: 0.3730161786079407 | dt: 1449.12ms | tok/sec: 2826.54 | norm: 2.79\n", "step3308 | loss: 0.34707316756248474 | dt: 1449.07ms | tok/sec: 2826.63 | norm: 2.76\n", "step3309 | loss: 0.386738657951355 | dt: 1457.15ms | tok/sec: 2810.97 | norm: 2.70\n", "step3310 | loss: 0.45374351739883423 | dt: 1443.68ms | tok/sec: 2837.20 | norm: 2.96\n", "step3311 | loss: 0.5488393902778625 | dt: 1446.23ms | tok/sec: 2832.19 | norm: 3.54\n", "step3312 | loss: 0.4911603331565857 | dt: 1450.99ms | tok/sec: 2822.89 | norm: 3.50\n", "step3313 | loss: 0.5582075715065002 | dt: 1439.35ms | tok/sec: 2845.74 | norm: 4.01\n", "step3314 | loss: 0.4140268862247467 | dt: 1452.21ms | tok/sec: 2820.54 | norm: 3.05\n", "step3315 | loss: 0.3964085876941681 | dt: 1452.16ms | tok/sec: 2820.63 | norm: 3.21\n", "step3316 | loss: 0.44082653522491455 | dt: 1451.99ms | tok/sec: 2820.96 | norm: 3.56\n", "step3317 | loss: 0.41943204402923584 | dt: 1452.20ms | tok/sec: 2820.56 | norm: 3.31\n", "step3318 | loss: 0.35888785123825073 | dt: 1454.00ms | tok/sec: 2817.06 | norm: 2.90\n", "step3319 | loss: 0.3946744501590729 | dt: 1452.26ms | tok/sec: 2820.43 | norm: 3.13\n", "step3320 | loss: 0.3670957684516907 | dt: 1450.42ms | tok/sec: 2824.01 | norm: 3.07\n", "step3321 | loss: 0.7735400795936584 | dt: 1452.82ms | tok/sec: 2819.35 | norm: 4.54\n", "step3322 | loss: 0.6283707618713379 | dt: 1450.09ms | tok/sec: 2824.65 | norm: 3.91\n", "step3323 | loss: 0.5687060356140137 | dt: 1452.06ms | tok/sec: 2820.82 | norm: 3.53\n", "step3324 | loss: 0.5572530627250671 | dt: 1455.92ms | tok/sec: 2813.34 | norm: 3.36\n", "step3325 | loss: 0.7430650591850281 | dt: 1450.67ms | tok/sec: 2823.53 | norm: 4.04\n", "step3326 | loss: 
0.607353150844574 | dt: 1443.25ms | tok/sec: 2838.05 | norm: 4.15\n", "step3327 | loss: 0.4399116039276123 | dt: 1442.91ms | tok/sec: 2838.70 | norm: 3.28\n", "step3328 | loss: 0.49697136878967285 | dt: 1450.57ms | tok/sec: 2823.71 | norm: 3.31\n", "step3329 | loss: 0.5250030159950256 | dt: 1439.67ms | tok/sec: 2845.10 | norm: 2.99\n", "step3330 | loss: 0.45838984847068787 | dt: 1450.69ms | tok/sec: 2823.47 | norm: 2.87\n", "step3331 | loss: 0.5572376251220703 | dt: 1438.18ms | tok/sec: 2848.04 | norm: 3.40\n", "step3332 | loss: 0.47833511233329773 | dt: 1451.87ms | tok/sec: 2821.19 | norm: 2.91\n", "step3333 | loss: 0.43442320823669434 | dt: 1453.77ms | tok/sec: 2817.51 | norm: 3.09\n", "step3334 | loss: 0.41509929299354553 | dt: 1451.36ms | tok/sec: 2822.18 | norm: 2.82\n", "step3335 | loss: 0.6208544969558716 | dt: 1453.79ms | tok/sec: 2817.47 | norm: 3.58\n", "step3336 | loss: 0.49731674790382385 | dt: 1455.04ms | tok/sec: 2815.04 | norm: 3.21\n", "step3337 | loss: 0.43518179655075073 | dt: 1453.23ms | tok/sec: 2818.55 | norm: 2.94\n", "step3338 | loss: 0.3605155646800995 | dt: 1456.05ms | tok/sec: 2813.08 | norm: 2.67\n", "step3339 | loss: 0.3922889828681946 | dt: 1449.83ms | tok/sec: 2825.15 | norm: 2.93\n", "step3340 | loss: 0.5938664674758911 | dt: 1454.05ms | tok/sec: 2816.97 | norm: 3.79\n", "step3341 | loss: 0.6535912156105042 | dt: 1456.20ms | tok/sec: 2812.80 | norm: 3.78\n", "step3342 | loss: 0.528270423412323 | dt: 1456.87ms | tok/sec: 2811.51 | norm: 3.66\n", "step3343 | loss: 0.4015938937664032 | dt: 1448.84ms | tok/sec: 2827.08 | norm: 3.18\n", "step3344 | loss: 0.381756067276001 | dt: 1454.13ms | tok/sec: 2816.80 | norm: 2.57\n", "step3345 | loss: 0.4428926408290863 | dt: 1454.07ms | tok/sec: 2816.91 | norm: 2.80\n", "step3346 | loss: 0.3176538348197937 | dt: 1455.32ms | tok/sec: 2814.49 | norm: 2.51\n", "step3347 | loss: 0.3469287157058716 | dt: 1439.36ms | tok/sec: 2845.71 | norm: 2.73\n", "step3348 | loss: 0.3375839591026306 | dt: 1451.56ms | 
tok/sec: 2821.80 | norm: 2.87\n", "step3349 | loss: 0.6416590213775635 | dt: 1453.04ms | tok/sec: 2818.91 | norm: 3.43\n", "step3350 | loss: 0.44767358899116516 | dt: 1458.93ms | tok/sec: 2807.53 | norm: 3.10\n", "step3351 | loss: 0.7931644320487976 | dt: 1451.29ms | tok/sec: 2822.31 | norm: 4.85\n", "step3352 | loss: 0.6244803071022034 | dt: 1456.78ms | tok/sec: 2811.67 | norm: 3.97\n", "step3353 | loss: 0.6816748380661011 | dt: 1453.56ms | tok/sec: 2817.92 | norm: 4.08\n", "step3354 | loss: 0.6411715745925903 | dt: 1455.84ms | tok/sec: 2813.49 | norm: 3.85\n", "step3355 | loss: 0.5785759687423706 | dt: 1442.95ms | tok/sec: 2838.64 | norm: 3.73\n", "step3356 | loss: 0.4281432628631592 | dt: 1463.88ms | tok/sec: 2798.04 | norm: 3.20\n", "step3357 | loss: 0.4019841253757477 | dt: 1444.23ms | tok/sec: 2836.12 | norm: 3.16\n", "step3358 | loss: 0.3898693919181824 | dt: 1455.00ms | tok/sec: 2815.11 | norm: 3.14\n", "step3359 | loss: 0.37297767400741577 | dt: 1452.19ms | tok/sec: 2820.58 | norm: 2.61\n", "step3360 | loss: 0.48386260867118835 | dt: 1447.38ms | tok/sec: 2829.95 | norm: 3.29\n", "step3361 | loss: 0.4800001382827759 | dt: 1453.23ms | tok/sec: 2818.56 | norm: 2.95\n", "step3362 | loss: 0.4503895044326782 | dt: 1459.51ms | tok/sec: 2806.42 | norm: 3.31\n", "step3363 | loss: 0.3779258728027344 | dt: 1453.08ms | tok/sec: 2818.84 | norm: 3.27\n", "step3364 | loss: 0.40782076120376587 | dt: 1455.81ms | tok/sec: 2813.56 | norm: 3.02\n", "step3365 | loss: 0.4060193598270416 | dt: 1452.21ms | tok/sec: 2820.53 | norm: 3.18\n", "step3366 | loss: 0.3588924705982208 | dt: 1447.75ms | tok/sec: 2829.21 | norm: 2.81\n", "step3367 | loss: 0.28552335500717163 | dt: 1456.62ms | tok/sec: 2812.00 | norm: 2.56\n", "step3368 | loss: 0.28869712352752686 | dt: 1453.39ms | tok/sec: 2818.23 | norm: 2.60\n", "step3369 | loss: 0.4009529948234558 | dt: 1450.59ms | tok/sec: 2823.68 | norm: 3.09\n", "step3370 | loss: 0.3096064031124115 | dt: 1457.56ms | tok/sec: 2810.18 | norm: 2.56\n", 
"step3371 | loss: 0.33679279685020447 | dt: 1451.75ms | tok/sec: 2821.42 | norm: 2.53\n", "step3372 | loss: 0.27699318528175354 | dt: 1451.86ms | tok/sec: 2821.20 | norm: 2.62\n", "step3373 | loss: 0.4879913330078125 | dt: 1454.90ms | tok/sec: 2815.31 | norm: 3.07\n", "step3374 | loss: 0.43786898255348206 | dt: 1449.67ms | tok/sec: 2825.46 | norm: 3.11\n", "step3375 | loss: 0.40475553274154663 | dt: 1456.33ms | tok/sec: 2812.54 | norm: 3.22\n", "step3376 | loss: 0.40319451689720154 | dt: 1461.50ms | tok/sec: 2802.60 | norm: 3.13\n", "step3377 | loss: 0.4145471155643463 | dt: 1453.67ms | tok/sec: 2817.69 | norm: 3.14\n", "step3378 | loss: 0.3959443271160126 | dt: 1453.37ms | tok/sec: 2818.28 | norm: 2.54\n", "step3379 | loss: 0.366414874792099 | dt: 1452.63ms | tok/sec: 2819.72 | norm: 2.50\n", "step3380 | loss: 0.33734986186027527 | dt: 1447.37ms | tok/sec: 2829.95 | norm: 2.48\n", "step3381 | loss: 0.258279412984848 | dt: 1447.46ms | tok/sec: 2829.79 | norm: 2.37\n", "step3382 | loss: 0.2217150777578354 | dt: 1454.54ms | tok/sec: 2816.00 | norm: 2.14\n", "step3383 | loss: 0.3891557455062866 | dt: 1454.04ms | tok/sec: 2816.98 | norm: 2.80\n", "step3384 | loss: 0.4463169574737549 | dt: 1448.08ms | tok/sec: 2828.57 | norm: 3.21\n", "step3385 | loss: 0.45549023151397705 | dt: 1451.13ms | tok/sec: 2822.63 | norm: 3.30\n", "step3386 | loss: 0.3864609897136688 | dt: 1450.89ms | tok/sec: 2823.09 | norm: 3.19\n", "step3387 | loss: 0.35267820954322815 | dt: 1446.06ms | tok/sec: 2832.52 | norm: 2.79\n", "step3388 | loss: 0.3329412639141083 | dt: 1450.64ms | tok/sec: 2823.58 | norm: 2.61\n", "step3389 | loss: 0.3075467646121979 | dt: 1443.32ms | tok/sec: 2837.91 | norm: 2.44\n", "step3390 | loss: 0.28645214438438416 | dt: 1448.04ms | tok/sec: 2828.65 | norm: 2.55\n", "step3391 | loss: 0.3150588870048523 | dt: 1458.35ms | tok/sec: 2808.66 | norm: 2.53\n", "step3392 | loss: 0.37191829085350037 | dt: 1449.52ms | tok/sec: 2825.76 | norm: 2.81\n", "step3393 | loss: 
0.4660193622112274 | dt: 1455.32ms | tok/sec: 2814.50 | norm: 3.26\n", "step3394 | loss: 0.4170176386833191 | dt: 1455.11ms | tok/sec: 2814.90 | norm: 3.24\n", "step3395 | loss: 0.46364548802375793 | dt: 1445.85ms | tok/sec: 2832.93 | norm: 3.30\n", "step3396 | loss: 0.3301052451133728 | dt: 1451.20ms | tok/sec: 2822.48 | norm: 2.65\n", "step3397 | loss: 0.3158998191356659 | dt: 1452.96ms | tok/sec: 2819.07 | norm: 2.72\n", "step3398 | loss: 0.3201969563961029 | dt: 1453.91ms | tok/sec: 2817.24 | norm: 2.85\n", "step3399 | loss: 0.3213677704334259 | dt: 1452.59ms | tok/sec: 2819.80 | norm: 2.89\n", "step3400 | loss: 0.29005083441734314 | dt: 1458.54ms | tok/sec: 2808.28 | norm: 2.66\n", "step3401 | loss: 0.3177332282066345 | dt: 1453.01ms | tok/sec: 2818.99 | norm: 2.67\n", "step3402 | loss: 0.30440348386764526 | dt: 1455.35ms | tok/sec: 2814.44 | norm: 2.68\n", "step3403 | loss: 0.6648903489112854 | dt: 1457.38ms | tok/sec: 2810.52 | norm: 4.25\n", "step3404 | loss: 0.5099165439605713 | dt: 1455.88ms | tok/sec: 2813.43 | norm: 3.45\n", "step3405 | loss: 0.45728591084480286 | dt: 1450.01ms | tok/sec: 2824.81 | norm: 3.04\n", "step3406 | loss: 0.4702743589878082 | dt: 1456.47ms | tok/sec: 2812.27 | norm: 3.65\n", "step3407 | loss: 0.6574946045875549 | dt: 1449.28ms | tok/sec: 2826.24 | norm: 4.39\n", "step3408 | loss: 0.5222384929656982 | dt: 1455.90ms | tok/sec: 2813.38 | norm: 3.55\n", "step3409 | loss: 0.360694020986557 | dt: 1449.90ms | tok/sec: 2825.03 | norm: 2.85\n", "step3410 | loss: 0.43336355686187744 | dt: 1454.97ms | tok/sec: 2815.19 | norm: 3.23\n", "step3411 | loss: 0.4579342007637024 | dt: 1454.67ms | tok/sec: 2815.75 | norm: 3.39\n", "step3412 | loss: 0.411491334438324 | dt: 1454.99ms | tok/sec: 2815.13 | norm: 3.69\n", "step3413 | loss: 0.5020852088928223 | dt: 1451.00ms | tok/sec: 2822.88 | norm: 3.71\n", "step3414 | loss: 0.41950422525405884 | dt: 1454.11ms | tok/sec: 2816.84 | norm: 3.33\n", "step3415 | loss: 0.40070244669914246 | dt: 1450.68ms | 
tok/sec: 2823.50 | norm: 3.29\n", "step3416 | loss: 0.3628179132938385 | dt: 1454.80ms | tok/sec: 2815.51 | norm: 2.70\n", "step3417 | loss: 0.5695099830627441 | dt: 1453.09ms | tok/sec: 2818.83 | norm: 3.71\n", "step3418 | loss: 0.4522571563720703 | dt: 1450.05ms | tok/sec: 2824.74 | norm: 3.37\n", "step3419 | loss: 0.3964574933052063 | dt: 1453.90ms | tok/sec: 2817.26 | norm: 3.01\n", "step3420 | loss: 0.3224320113658905 | dt: 1450.43ms | tok/sec: 2823.99 | norm: 2.76\n", "step3421 | loss: 0.335680216550827 | dt: 1447.84ms | tok/sec: 2829.05 | norm: 2.82\n", "step3422 | loss: 0.49488967657089233 | dt: 1457.05ms | tok/sec: 2811.17 | norm: 3.54\n", "step3423 | loss: 0.542962372303009 | dt: 1453.74ms | tok/sec: 2817.57 | norm: 3.67\n", "step3424 | loss: 0.44110628962516785 | dt: 1449.35ms | tok/sec: 2826.09 | norm: 3.64\n", "step3425 | loss: 0.32825613021850586 | dt: 1448.37ms | tok/sec: 2828.01 | norm: 2.72\n", "step3426 | loss: 0.3172374665737152 | dt: 1448.20ms | tok/sec: 2828.35 | norm: 2.44\n", "step3427 | loss: 0.37644293904304504 | dt: 1450.74ms | tok/sec: 2823.38 | norm: 2.77\n", "step3428 | loss: 0.27525657415390015 | dt: 1453.53ms | tok/sec: 2817.96 | norm: 2.71\n", "step3429 | loss: 0.2973385453224182 | dt: 1452.51ms | tok/sec: 2819.94 | norm: 2.42\n", "step3430 | loss: 0.2810560166835785 | dt: 1452.52ms | tok/sec: 2819.94 | norm: 2.24\n", "step3431 | loss: 0.5541276335716248 | dt: 1451.85ms | tok/sec: 2821.22 | norm: 3.30\n", "step3432 | loss: 0.3899309039115906 | dt: 1449.59ms | tok/sec: 2825.63 | norm: 2.93\n", "step3433 | loss: 0.7133917808532715 | dt: 1444.48ms | tok/sec: 2835.63 | norm: 4.21\n", "step3434 | loss: 0.533329963684082 | dt: 1454.70ms | tok/sec: 2815.71 | norm: 3.18\n", "step3435 | loss: 0.6068437695503235 | dt: 1456.59ms | tok/sec: 2812.05 | norm: 3.68\n", "step3436 | loss: 0.5746145248413086 | dt: 1455.41ms | tok/sec: 2814.33 | norm: 4.31\n", "step3437 | loss: 0.49785423278808594 | dt: 1450.64ms | tok/sec: 2823.59 | norm: 3.63\n", 
"step3438 | loss: 0.3813440501689911 | dt: 1454.16ms | tok/sec: 2816.74 | norm: 3.30\n", "step3439 | loss: 0.347991406917572 | dt: 1447.98ms | tok/sec: 2828.77 | norm: 2.70\n", "step3440 | loss: 0.35772624611854553 | dt: 1449.53ms | tok/sec: 2825.75 | norm: 3.13\n", "step3441 | loss: 0.32541197538375854 | dt: 1449.90ms | tok/sec: 2825.03 | norm: 2.81\n", "step3442 | loss: 0.4305071532726288 | dt: 1453.08ms | tok/sec: 2818.84 | norm: 3.46\n", "step3443 | loss: 0.39614033699035645 | dt: 1452.95ms | tok/sec: 2819.09 | norm: 2.61\n", "step3444 | loss: 0.36892634630203247 | dt: 1456.63ms | tok/sec: 2811.97 | norm: 2.84\n", "step3445 | loss: 0.3030880093574524 | dt: 1447.48ms | tok/sec: 2829.74 | norm: 2.78\n", "step3446 | loss: 0.3286421597003937 | dt: 1449.73ms | tok/sec: 2825.35 | norm: 2.60\n", "step3447 | loss: 0.3442002534866333 | dt: 1450.99ms | tok/sec: 2822.89 | norm: 2.61\n", "step3448 | loss: 0.2878485321998596 | dt: 1451.54ms | tok/sec: 2821.84 | norm: 2.38\n", "step3449 | loss: 0.23279139399528503 | dt: 1452.68ms | tok/sec: 2819.62 | norm: 2.44\n", "step3450 | loss: 0.24099993705749512 | dt: 1455.12ms | tok/sec: 2814.89 | norm: 2.41\n", "step3451 | loss: 0.3317256271839142 | dt: 1452.52ms | tok/sec: 2819.92 | norm: 2.71\n", "step3452 | loss: 0.25428998470306396 | dt: 1451.83ms | tok/sec: 2821.27 | norm: 2.35\n", "step3453 | loss: 0.28094369173049927 | dt: 1451.36ms | tok/sec: 2822.18 | norm: 2.72\n", "step3454 | loss: 0.23512357473373413 | dt: 1445.33ms | tok/sec: 2833.95 | norm: 2.64\n", "step3455 | loss: 0.413889080286026 | dt: 1452.78ms | tok/sec: 2819.43 | norm: 2.90\n", "step3456 | loss: 0.357578843832016 | dt: 1449.90ms | tok/sec: 2825.02 | norm: 2.69\n", "step3457 | loss: 0.3409411907196045 | dt: 1451.50ms | tok/sec: 2821.91 | norm: 2.93\n", "step3458 | loss: 0.32840102910995483 | dt: 1453.87ms | tok/sec: 2817.30 | norm: 2.60\n", "step3459 | loss: 0.3458179533481598 | dt: 1453.98ms | tok/sec: 2817.09 | norm: 2.96\n", "step3460 | loss: 
0.32979556918144226 | dt: 1453.80ms | tok/sec: 2817.45 | norm: 2.93\n", "step3461 | loss: 0.3080250024795532 | dt: 1453.01ms | tok/sec: 2818.98 | norm: 2.83\n", "step3462 | loss: 0.2860088348388672 | dt: 1454.14ms | tok/sec: 2816.79 | norm: 2.67\n", "step3463 | loss: 0.21802206337451935 | dt: 1450.29ms | tok/sec: 2824.27 | norm: 2.37\n", "step3464 | loss: 0.18743687868118286 | dt: 1452.34ms | tok/sec: 2820.28 | norm: 2.18\n", "step3465 | loss: 0.32097482681274414 | dt: 1451.00ms | tok/sec: 2822.87 | norm: 2.62\n", "step3466 | loss: 0.37638115882873535 | dt: 1447.43ms | tok/sec: 2829.84 | norm: 2.88\n", "step3467 | loss: 0.39531031250953674 | dt: 1443.32ms | tok/sec: 2837.89 | norm: 3.00\n", "step3468 | loss: 0.3391145169734955 | dt: 1452.70ms | tok/sec: 2819.58 | norm: 2.82\n", "step3469 | loss: 0.2974793314933777 | dt: 1451.61ms | tok/sec: 2821.70 | norm: 2.69\n", "step3470 | loss: 0.28701794147491455 | dt: 1451.88ms | tok/sec: 2821.18 | norm: 2.98\n", "step3471 | loss: 0.24534478783607483 | dt: 1453.67ms | tok/sec: 2817.70 | norm: 2.21\n", "step3472 | loss: 0.23538164794445038 | dt: 1457.40ms | tok/sec: 2810.48 | norm: 2.28\n", "step3473 | loss: 0.2556591033935547 | dt: 1450.21ms | tok/sec: 2824.41 | norm: 2.25\n", "step3474 | loss: 0.30838337540626526 | dt: 1450.45ms | tok/sec: 2823.96 | norm: 2.53\n", "step3475 | loss: 0.39498114585876465 | dt: 1453.23ms | tok/sec: 2818.55 | norm: 3.32\n", "step3476 | loss: 0.351359099149704 | dt: 1453.01ms | tok/sec: 2818.97 | norm: 3.38\n", "step3477 | loss: 0.39204320311546326 | dt: 1451.19ms | tok/sec: 2822.50 | norm: 3.11\n", "step3478 | loss: 0.2752590775489807 | dt: 1452.92ms | tok/sec: 2819.15 | norm: 2.49\n", "step3479 | loss: 0.2616690397262573 | dt: 1452.36ms | tok/sec: 2820.23 | norm: 2.54\n", "step3480 | loss: 0.26961058378219604 | dt: 1453.04ms | tok/sec: 2818.92 | norm: 2.56\n", "step3481 | loss: 0.26286542415618896 | dt: 1453.25ms | tok/sec: 2818.51 | norm: 2.63\n", "step3482 | loss: 0.2345166802406311 | dt: 
1453.68ms | tok/sec: 2817.68 | norm: 2.48\n", "step3483 | loss: 0.2604359984397888 | dt: 1453.98ms | tok/sec: 2817.09 | norm: 2.48\n", "step3484 | loss: 0.2554442882537842 | dt: 1453.45ms | tok/sec: 2818.12 | norm: 2.51\n", "step3485 | loss: 0.6037097573280334 | dt: 1442.38ms | tok/sec: 2839.75 | norm: 4.33\n", "step3486 | loss: 0.4579138159751892 | dt: 1448.72ms | tok/sec: 2827.31 | norm: 3.84\n", "step3487 | loss: 0.40721166133880615 | dt: 1450.09ms | tok/sec: 2824.65 | norm: 3.49\n", "step3488 | loss: 0.41694343090057373 | dt: 1456.02ms | tok/sec: 2813.16 | norm: 3.23\n", "step3489 | loss: 0.5879555344581604 | dt: 1446.30ms | tok/sec: 2832.06 | norm: 3.56\n", "step3490 | loss: 0.4468725025653839 | dt: 1449.73ms | tok/sec: 2825.35 | norm: 3.07\n", "step3491 | loss: 0.29434406757354736 | dt: 1452.78ms | tok/sec: 2819.43 | norm: 2.66\n", "step3492 | loss: 0.37577587366104126 | dt: 1454.16ms | tok/sec: 2816.75 | norm: 3.11\n", "step3493 | loss: 0.39345377683639526 | dt: 1453.40ms | tok/sec: 2818.22 | norm: 3.26\n", "step3494 | loss: 0.3582548499107361 | dt: 1453.26ms | tok/sec: 2818.49 | norm: 3.13\n", "step3495 | loss: 0.46678492426872253 | dt: 1444.83ms | tok/sec: 2834.93 | norm: 3.92\n", "step3496 | loss: 0.39930927753448486 | dt: 1449.17ms | tok/sec: 2826.44 | norm: 3.65\n", "step3497 | loss: 0.34871241450309753 | dt: 1450.65ms | tok/sec: 2823.57 | norm: 3.12\n", "step3498 | loss: 0.291248619556427 | dt: 1442.66ms | tok/sec: 2839.19 | norm: 2.34\n", "step3499 | loss: 0.4986930787563324 | dt: 1446.23ms | tok/sec: 2832.18 | norm: 3.41\n", "step3500 | loss: 0.39983057975769043 | dt: 1449.84ms | tok/sec: 2825.13 | norm: 3.27\n", "step3501 | loss: 0.3415069282054901 | dt: 1452.11ms | tok/sec: 2820.72 | norm: 2.78\n", "step3502 | loss: 0.2787655293941498 | dt: 1452.68ms | tok/sec: 2819.61 | norm: 2.39\n", "step3503 | loss: 0.29276037216186523 | dt: 1454.35ms | tok/sec: 2816.38 | norm: 2.47\n", "step3504 | loss: 0.42478474974632263 | dt: 1452.37ms | tok/sec: 2820.21 | 
norm: 2.92\n", "step3505 | loss: 0.4864760935306549 | dt: 1458.47ms | tok/sec: 2808.42 | norm: 3.36\n", "step3506 | loss: 0.3991602957248688 | dt: 1457.27ms | tok/sec: 2810.74 | norm: 3.48\n", "step3507 | loss: 0.278678297996521 | dt: 1452.07ms | tok/sec: 2820.80 | norm: 2.66\n", "step3508 | loss: 0.2516343593597412 | dt: 1452.11ms | tok/sec: 2820.72 | norm: 2.19\n", "step3509 | loss: 0.3168447017669678 | dt: 1443.44ms | tok/sec: 2837.67 | norm: 2.47\n", "step3510 | loss: 0.2228914052248001 | dt: 1440.12ms | tok/sec: 2844.20 | norm: 2.17\n", "step3511 | loss: 0.2370024025440216 | dt: 1446.95ms | tok/sec: 2830.79 | norm: 2.30\n", "step3512 | loss: 0.23096300661563873 | dt: 1453.04ms | tok/sec: 2818.92 | norm: 2.01\n", "step3513 | loss: 0.4844990372657776 | dt: 1452.73ms | tok/sec: 2819.53 | norm: 3.33\n", "step3514 | loss: 0.3473210334777832 | dt: 1453.46ms | tok/sec: 2818.10 | norm: 2.81\n", "step3515 | loss: 0.5935741066932678 | dt: 1453.74ms | tok/sec: 2817.55 | norm: 3.47\n", "step3516 | loss: 0.4479731619358063 | dt: 1461.04ms | tok/sec: 2803.48 | norm: 2.91\n", "step3517 | loss: 0.5080766081809998 | dt: 1447.65ms | tok/sec: 2829.42 | norm: 3.22\n", "step3518 | loss: 0.47005778551101685 | dt: 1449.17ms | tok/sec: 2826.45 | norm: 3.27\n", "step3519 | loss: 0.40465834736824036 | dt: 1452.87ms | tok/sec: 2819.25 | norm: 2.85\n", "step3520 | loss: 0.31736963987350464 | dt: 1454.33ms | tok/sec: 2816.42 | norm: 2.71\n", "step3521 | loss: 0.28399673104286194 | dt: 1451.48ms | tok/sec: 2821.95 | norm: 2.56\n", "step3522 | loss: 0.30593857169151306 | dt: 1462.28ms | tok/sec: 2801.11 | norm: 2.65\n", "step3523 | loss: 0.2783176600933075 | dt: 1452.47ms | tok/sec: 2820.03 | norm: 2.56\n", "step3524 | loss: 0.39205625653266907 | dt: 1453.74ms | tok/sec: 2817.56 | norm: 3.41\n", "step3525 | loss: 0.3682010769844055 | dt: 1454.06ms | tok/sec: 2816.94 | norm: 2.83\n", "step3526 | loss: 0.3451246917247772 | dt: 1456.98ms | tok/sec: 2811.29 | norm: 2.89\n", "step3527 | loss: 
0.2732250988483429 | dt: 1453.64ms | tok/sec: 2817.75 | norm: 2.75\n", "step3528 | loss: 0.29387709498405457 | dt: 1446.66ms | tok/sec: 2831.35 | norm: 2.77\n", "step3529 | loss: 0.2946386933326721 | dt: 1453.66ms | tok/sec: 2817.71 | norm: 2.80\n", "step3530 | loss: 0.24835722148418427 | dt: 1448.48ms | tok/sec: 2827.78 | norm: 2.59\n", "step3531 | loss: 0.2066141963005066 | dt: 1449.84ms | tok/sec: 2825.14 | norm: 2.19\n", "step3532 | loss: 0.21064306795597076 | dt: 1458.17ms | tok/sec: 2809.00 | norm: 2.44\n", "step3533 | loss: 0.2864423394203186 | dt: 1453.47ms | tok/sec: 2818.09 | norm: 2.52\n", "step3534 | loss: 0.20654244720935822 | dt: 1458.15ms | tok/sec: 2809.04 | norm: 2.21\n", "step3535 | loss: 0.23441708087921143 | dt: 1456.19ms | tok/sec: 2812.83 | norm: 2.47\n", "step3536 | loss: 0.19629475474357605 | dt: 1452.44ms | tok/sec: 2820.09 | norm: 2.20\n", "step3537 | loss: 0.35401153564453125 | dt: 1449.47ms | tok/sec: 2825.86 | norm: 2.92\n", "step3538 | loss: 0.29611214995384216 | dt: 1451.60ms | tok/sec: 2821.71 | norm: 2.51\n", "step3539 | loss: 0.2760133743286133 | dt: 1454.22ms | tok/sec: 2816.62 | norm: 2.31\n", "step3540 | loss: 0.2604278326034546 | dt: 1459.89ms | tok/sec: 2805.69 | norm: 2.11\n", "step3541 | loss: 0.28166288137435913 | dt: 1450.89ms | tok/sec: 2823.09 | norm: 2.32\n", "step3542 | loss: 0.27957314252853394 | dt: 1454.77ms | tok/sec: 2815.56 | norm: 2.68\n", "step3543 | loss: 0.24911950528621674 | dt: 1453.90ms | tok/sec: 2817.25 | norm: 2.67\n", "step3544 | loss: 0.23470592498779297 | dt: 1451.73ms | tok/sec: 2821.46 | norm: 2.51\n", "step3545 | loss: 0.1771635115146637 | dt: 1453.99ms | tok/sec: 2817.07 | norm: 2.31\n", "step3546 | loss: 0.15708766877651215 | dt: 1451.58ms | tok/sec: 2821.76 | norm: 1.97\n", "step3547 | loss: 0.26544931530952454 | dt: 1455.92ms | tok/sec: 2813.35 | norm: 2.59\n", "step3548 | loss: 0.3146316707134247 | dt: 1450.48ms | tok/sec: 2823.90 | norm: 2.52\n", "step3549 | loss: 0.3203616440296173 | dt: 
1458.08ms | tok/sec: 2809.17 | norm: 2.55\n", "step3550 | loss: 0.2620166838169098 | dt: 1455.56ms | tok/sec: 2814.04 | norm: 2.10\n", "step3551 | loss: 0.24010229110717773 | dt: 1455.21ms | tok/sec: 2814.72 | norm: 2.39\n", "step3552 | loss: 0.24273279309272766 | dt: 1462.56ms | tok/sec: 2800.56 | norm: 2.53\n", "step3553 | loss: 0.20425288379192352 | dt: 1454.73ms | tok/sec: 2815.64 | norm: 2.37\n", "step3554 | loss: 0.191073477268219 | dt: 1447.37ms | tok/sec: 2829.96 | norm: 2.50\n", "step3555 | loss: 0.2185896933078766 | dt: 1454.04ms | tok/sec: 2816.97 | norm: 2.34\n", "step3556 | loss: 0.24832485616207123 | dt: 1460.71ms | tok/sec: 2804.12 | norm: 2.49\n", "step3557 | loss: 0.3334844708442688 | dt: 1456.73ms | tok/sec: 2811.77 | norm: 3.28\n", "step3558 | loss: 0.30407223105430603 | dt: 1460.44ms | tok/sec: 2804.63 | norm: 2.72\n", "step3559 | loss: 0.31554940342903137 | dt: 1457.28ms | tok/sec: 2810.72 | norm: 2.67\n", "step3560 | loss: 0.21638312935829163 | dt: 1453.06ms | tok/sec: 2818.87 | norm: 2.19\n", "step3561 | loss: 0.2190951555967331 | dt: 1452.86ms | tok/sec: 2819.26 | norm: 2.63\n", "step3562 | loss: 0.20563636720180511 | dt: 1461.03ms | tok/sec: 2803.50 | norm: 2.19\n", "step3563 | loss: 0.2241089940071106 | dt: 1457.68ms | tok/sec: 2809.95 | norm: 2.63\n", "step3564 | loss: 0.20575039088726044 | dt: 1452.76ms | tok/sec: 2819.47 | norm: 2.51\n", "step3565 | loss: 0.2090543508529663 | dt: 1462.39ms | tok/sec: 2800.90 | norm: 2.50\n", "step3566 | loss: 0.20646056532859802 | dt: 1460.44ms | tok/sec: 2804.63 | norm: 2.43\n", "step3567 | loss: 0.5069402456283569 | dt: 1452.71ms | tok/sec: 2819.55 | norm: 3.47\n", "step3568 | loss: 0.38482001423835754 | dt: 1459.33ms | tok/sec: 2806.76 | norm: 3.20\n", "step3569 | loss: 0.35433122515678406 | dt: 1455.09ms | tok/sec: 2814.94 | norm: 3.17\n", "step3570 | loss: 0.35013267397880554 | dt: 1453.19ms | tok/sec: 2818.63 | norm: 3.06\n", "step3571 | loss: 0.4991099536418915 | dt: 1454.06ms | tok/sec: 2816.95 
| norm: 3.43\n", "step3572 | loss: 0.37044885754585266 | dt: 1461.33ms | tok/sec: 2802.93 | norm: 2.85\n", "step3573 | loss: 0.24061867594718933 | dt: 1459.45ms | tok/sec: 2806.54 | norm: 2.33\n", "step3574 | loss: 0.3084110915660858 | dt: 1451.22ms | tok/sec: 2822.45 | norm: 2.53\n", "step3575 | loss: 0.3242640793323517 | dt: 1455.99ms | tok/sec: 2813.21 | norm: 2.57\n", "step3576 | loss: 0.2980690598487854 | dt: 1457.70ms | tok/sec: 2809.91 | norm: 2.72\n", "step3577 | loss: 0.38567638397216797 | dt: 1459.62ms | tok/sec: 2806.21 | norm: 3.36\n", "step3578 | loss: 0.33101874589920044 | dt: 1454.97ms | tok/sec: 2815.18 | norm: 3.14\n", "step3579 | loss: 0.2961963415145874 | dt: 1462.13ms | tok/sec: 2801.39 | norm: 3.07\n", "step3580 | loss: 0.2506217658519745 | dt: 1446.65ms | tok/sec: 2831.37 | norm: 2.70\n", "step3581 | loss: 0.4470589756965637 | dt: 1454.43ms | tok/sec: 2816.22 | norm: 3.66\n", "step3582 | loss: 0.33738642930984497 | dt: 1458.58ms | tok/sec: 2808.22 | norm: 2.98\n", "step3583 | loss: 0.2726052403450012 | dt: 1462.59ms | tok/sec: 2800.51 | norm: 2.38\n", "step3584 | loss: 0.22523720562458038 | dt: 1449.92ms | tok/sec: 2824.98 | norm: 2.34\n", "step3585 | loss: 0.24199886620044708 | dt: 1449.55ms | tok/sec: 2825.71 | norm: 2.49\n", "step3586 | loss: 0.3555944859981537 | dt: 1454.82ms | tok/sec: 2815.46 | norm: 2.90\n", "step3587 | loss: 0.41251683235168457 | dt: 1449.66ms | tok/sec: 2825.49 | norm: 3.21\n", "step3588 | loss: 0.33149972558021545 | dt: 1458.23ms | tok/sec: 2808.89 | norm: 2.86\n", "step3589 | loss: 0.2296110838651657 | dt: 1449.35ms | tok/sec: 2826.09 | norm: 2.57\n", "step3590 | loss: 0.20789982378482819 | dt: 1453.66ms | tok/sec: 2817.72 | norm: 2.22\n", "step3591 | loss: 0.26753562688827515 | dt: 1450.59ms | tok/sec: 2823.68 | norm: 2.38\n", "step3592 | loss: 0.17024047672748566 | dt: 1455.33ms | tok/sec: 2814.48 | norm: 1.80\n", "step3593 | loss: 0.20330247282981873 | dt: 1454.68ms | tok/sec: 2815.75 | norm: 2.34\n", "step3594 | 
loss: 0.18888548016548157 | dt: 1461.05ms | tok/sec: 2803.46 | norm: 1.86\n", "step3595 | loss: 0.42427152395248413 | dt: 1460.57ms | tok/sec: 2804.39 | norm: 3.12\n", "step3596 | loss: 0.2873508930206299 | dt: 1456.95ms | tok/sec: 2811.36 | norm: 2.53\n", "step3597 | loss: 0.5044463276863098 | dt: 1455.08ms | tok/sec: 2814.96 | norm: 3.26\n", "step3598 | loss: 0.3996722996234894 | dt: 1454.80ms | tok/sec: 2815.51 | norm: 3.16\n", "step3599 | loss: 0.45664647221565247 | dt: 1453.53ms | tok/sec: 2817.96 | norm: 3.33\n", "step3600 | loss: 0.41470128297805786 | dt: 1457.90ms | tok/sec: 2809.52 | norm: 3.21\n", "step3601 | loss: 0.3526966869831085 | dt: 1455.28ms | tok/sec: 2814.58 | norm: 2.87\n", "step3602 | loss: 0.26436933875083923 | dt: 1454.77ms | tok/sec: 2815.57 | norm: 2.57\n", "step3603 | loss: 0.24271847307682037 | dt: 1462.17ms | tok/sec: 2801.32 | norm: 2.69\n", "step3604 | loss: 0.2622012495994568 | dt: 1457.28ms | tok/sec: 2810.72 | norm: 2.73\n", "step3605 | loss: 0.23748454451560974 | dt: 1459.59ms | tok/sec: 2806.27 | norm: 2.28\n", "step3606 | loss: 0.3213110864162445 | dt: 1458.58ms | tok/sec: 2808.21 | norm: 2.84\n", "step3607 | loss: 0.30221375823020935 | dt: 1462.93ms | tok/sec: 2799.86 | norm: 2.70\n", "step3608 | loss: 0.2736271023750305 | dt: 1463.18ms | tok/sec: 2799.38 | norm: 2.45\n", "step3609 | loss: 0.22094647586345673 | dt: 1457.98ms | tok/sec: 2809.37 | norm: 2.70\n", "step3610 | loss: 0.2446269690990448 | dt: 1447.92ms | tok/sec: 2828.88 | norm: 2.53\n", "step3611 | loss: 0.2474951595067978 | dt: 1461.83ms | tok/sec: 2801.96 | norm: 2.69\n", "step3612 | loss: 0.21583274006843567 | dt: 1444.61ms | tok/sec: 2835.38 | norm: 2.48\n", "step3613 | loss: 0.17301809787750244 | dt: 1456.71ms | tok/sec: 2811.81 | norm: 2.50\n", "step3614 | loss: 0.17410808801651 | dt: 1454.32ms | tok/sec: 2816.43 | norm: 2.34\n", "step3615 | loss: 0.22351177036762238 | dt: 1457.77ms | tok/sec: 2809.76 | norm: 2.18\n", "step3616 | loss: 0.17724384367465973 | dt: 
1453.76ms | tok/sec: 2817.52 | norm: 2.22\n", "step3617 | loss: 0.1951999068260193 | dt: 1448.24ms | tok/sec: 2828.26 | norm: 2.04\n", "step3618 | loss: 0.15953533351421356 | dt: 1459.12ms | tok/sec: 2807.17 | norm: 2.13\n", "step3619 | loss: 0.2835105061531067 | dt: 1443.78ms | tok/sec: 2836.99 | norm: 2.41\n", "step3620 | loss: 0.23825958371162415 | dt: 1451.54ms | tok/sec: 2821.83 | norm: 2.37\n", "step3621 | loss: 0.22106824815273285 | dt: 1457.15ms | tok/sec: 2810.98 | norm: 2.47\n", "step3622 | loss: 0.21041083335876465 | dt: 1455.82ms | tok/sec: 2813.54 | norm: 2.37\n", "step3623 | loss: 0.23113977909088135 | dt: 1447.88ms | tok/sec: 2828.97 | norm: 2.60\n", "step3624 | loss: 0.2242974042892456 | dt: 1455.11ms | tok/sec: 2814.91 | norm: 2.11\n", "step3625 | loss: 0.2050250917673111 | dt: 1452.34ms | tok/sec: 2820.27 | norm: 2.03\n", "step3626 | loss: 0.1909857988357544 | dt: 1458.07ms | tok/sec: 2809.18 | norm: 2.08\n", "step3627 | loss: 0.14787712693214417 | dt: 1457.98ms | tok/sec: 2809.37 | norm: 1.94\n", "step3628 | loss: 0.14067916572093964 | dt: 1451.74ms | tok/sec: 2821.43 | norm: 2.10\n", "step3629 | loss: 0.22214242815971375 | dt: 1450.45ms | tok/sec: 2823.94 | norm: 2.70\n", "step3630 | loss: 0.25108784437179565 | dt: 1459.64ms | tok/sec: 2806.17 | norm: 2.74\n", "step3631 | loss: 0.2743658125400543 | dt: 1453.79ms | tok/sec: 2817.46 | norm: 2.59\n", "step3632 | loss: 0.21593505144119263 | dt: 1455.98ms | tok/sec: 2813.22 | norm: 2.23\n", "step3633 | loss: 0.20443245768547058 | dt: 1462.70ms | tok/sec: 2800.31 | norm: 2.40\n", "step3634 | loss: 0.19141711294651031 | dt: 1454.89ms | tok/sec: 2815.33 | norm: 2.13\n", "step3635 | loss: 0.16896268725395203 | dt: 1443.30ms | tok/sec: 2837.94 | norm: 2.31\n", "step3636 | loss: 0.16052240133285522 | dt: 1454.18ms | tok/sec: 2816.70 | norm: 2.03\n", "step3637 | loss: 0.1749259978532791 | dt: 1457.09ms | tok/sec: 2811.07 | norm: 1.93\n", "step3638 | loss: 0.21629785001277924 | dt: 1462.59ms | tok/sec: 
2800.51 | norm: 2.38\n", "step3639 | loss: 0.2892493009567261 | dt: 1456.02ms | tok/sec: 2813.15 | norm: 3.11\n", "step3640 | loss: 0.24953417479991913 | dt: 1456.24ms | tok/sec: 2812.72 | norm: 2.80\n", "step3641 | loss: 0.26569923758506775 | dt: 1467.60ms | tok/sec: 2790.95 | norm: 2.91\n", "step3642 | loss: 0.18076571822166443 | dt: 1447.98ms | tok/sec: 2828.78 | norm: 2.17\n", "step3643 | loss: 0.1825733333826065 | dt: 1457.88ms | tok/sec: 2809.57 | norm: 2.21\n", "step3644 | loss: 0.16978146135807037 | dt: 1447.77ms | tok/sec: 2829.18 | norm: 2.03\n", "step3645 | loss: 0.176087886095047 | dt: 1461.97ms | tok/sec: 2801.70 | norm: 2.27\n", "step3646 | loss: 0.15716175734996796 | dt: 1458.53ms | tok/sec: 2808.31 | norm: 1.97\n", "step3647 | loss: 0.1720770001411438 | dt: 1454.65ms | tok/sec: 2815.79 | norm: 2.02\n", "step3648 | loss: 0.16377878189086914 | dt: 1460.62ms | tok/sec: 2804.28 | norm: 1.74\n", "step3649 | loss: 0.4224478304386139 | dt: 1456.45ms | tok/sec: 2812.32 | norm: 3.11\n", "step3650 | loss: 0.3214223086833954 | dt: 1460.57ms | tok/sec: 2804.38 | norm: 2.81\n", "step3651 | loss: 0.2789984941482544 | dt: 1441.76ms | tok/sec: 2840.97 | norm: 2.92\n", "step3652 | loss: 0.27836596965789795 | dt: 1448.59ms | tok/sec: 2827.58 | norm: 2.26\n", "step3653 | loss: 0.40533971786499023 | dt: 1455.10ms | tok/sec: 2814.93 | norm: 2.77\n", "step3654 | loss: 0.2983416020870209 | dt: 1460.23ms | tok/sec: 2805.05 | norm: 2.40\n", "step3655 | loss: 0.1922546774148941 | dt: 1458.57ms | tok/sec: 2808.23 | norm: 2.24\n", "step3656 | loss: 0.2552379369735718 | dt: 1449.45ms | tok/sec: 2825.89 | norm: 2.55\n", "step3657 | loss: 0.262500524520874 | dt: 1450.88ms | tok/sec: 2823.11 | norm: 2.51\n", "step3658 | loss: 0.24448524415493011 | dt: 1445.58ms | tok/sec: 2833.46 | norm: 2.58\n", "step3659 | loss: 0.3319908678531647 | dt: 1453.29ms | tok/sec: 2818.43 | norm: 3.23\n", "step3660 | loss: 0.2700524628162384 | dt: 1454.59ms | tok/sec: 2815.91 | norm: 2.64\n", "step3661 
| loss: 0.252852201461792 | dt: 1451.54ms | tok/sec: 2821.83 | norm: 2.95\n", "step3662 | loss: 0.21949779987335205 | dt: 1458.17ms | tok/sec: 2809.00 | norm: 2.74\n", "step3663 | loss: 0.39041468501091003 | dt: 1451.66ms | tok/sec: 2821.59 | norm: 3.74\n", "step3664 | loss: 0.29451897740364075 | dt: 1447.61ms | tok/sec: 2829.49 | norm: 2.99\n", "step3665 | loss: 0.2397875338792801 | dt: 1458.52ms | tok/sec: 2808.32 | norm: 2.81\n", "step3666 | loss: 0.18679945170879364 | dt: 1450.20ms | tok/sec: 2824.44 | norm: 2.30\n", "step3667 | loss: 0.19891691207885742 | dt: 1461.64ms | tok/sec: 2802.33 | norm: 2.22\n", "step3668 | loss: 0.291250616312027 | dt: 1452.20ms | tok/sec: 2820.55 | norm: 2.50\n", "step3669 | loss: 0.34143656492233276 | dt: 1457.25ms | tok/sec: 2810.78 | norm: 3.02\n", "step3670 | loss: 0.2696934938430786 | dt: 1451.81ms | tok/sec: 2821.30 | norm: 2.81\n", "step3671 | loss: 0.1878792643547058 | dt: 1447.04ms | tok/sec: 2830.61 | norm: 2.31\n", "step3672 | loss: 0.1698518693447113 | dt: 1461.53ms | tok/sec: 2802.54 | norm: 2.11\n", "step3673 | loss: 0.2289280742406845 | dt: 1451.44ms | tok/sec: 2822.03 | norm: 2.54\n", "step3674 | loss: 0.13354063034057617 | dt: 1456.82ms | tok/sec: 2811.61 | norm: 1.86\n", "step3675 | loss: 0.1717129349708557 | dt: 1455.93ms | tok/sec: 2813.32 | norm: 2.08\n", "step3676 | loss: 0.1616203784942627 | dt: 1456.01ms | tok/sec: 2813.16 | norm: 1.97\n", "step3677 | loss: 0.35262995958328247 | dt: 1451.99ms | tok/sec: 2820.95 | norm: 2.94\n", "step3678 | loss: 0.23148973286151886 | dt: 1449.92ms | tok/sec: 2824.99 | norm: 2.21\n", "step3679 | loss: 0.43751320242881775 | dt: 1450.27ms | tok/sec: 2824.30 | norm: 3.43\n", "step3680 | loss: 0.3345540165901184 | dt: 1452.01ms | tok/sec: 2820.91 | norm: 2.94\n", "step3681 | loss: 0.38501834869384766 | dt: 1453.22ms | tok/sec: 2818.56 | norm: 3.15\n", "step3682 | loss: 0.3422798216342926 | dt: 1450.90ms | tok/sec: 2823.08 | norm: 2.86\n", "step3683 | loss: 0.28870323300361633 | 
dt: 1458.78ms | tok/sec: 2807.82 | norm: 2.65\n", "step3684 | loss: 0.23509949445724487 | dt: 1457.94ms | tok/sec: 2809.45 | norm: 2.46\n", "step3685 | loss: 0.21341277658939362 | dt: 1457.46ms | tok/sec: 2810.37 | norm: 2.18\n", "step3686 | loss: 0.2210061401128769 | dt: 1456.13ms | tok/sec: 2812.93 | norm: 2.23\n", "step3687 | loss: 0.18843123316764832 | dt: 1449.78ms | tok/sec: 2825.25 | norm: 2.13\n", "step3688 | loss: 0.26705169677734375 | dt: 1454.28ms | tok/sec: 2816.51 | norm: 2.69\n", "step3689 | loss: 0.260744571685791 | dt: 1458.03ms | tok/sec: 2809.26 | norm: 2.61\n", "step3690 | loss: 0.22803795337677002 | dt: 1450.03ms | tok/sec: 2824.76 | norm: 2.42\n", "step3691 | loss: 0.18049849569797516 | dt: 1455.70ms | tok/sec: 2813.77 | norm: 2.34\n", "step3692 | loss: 0.20154091715812683 | dt: 1460.75ms | tok/sec: 2804.05 | norm: 2.35\n", "step3693 | loss: 0.20946331322193146 | dt: 1446.79ms | tok/sec: 2831.10 | norm: 2.63\n", "step3694 | loss: 0.17802706360816956 | dt: 1456.92ms | tok/sec: 2811.42 | norm: 2.30\n", "step3695 | loss: 0.14848145842552185 | dt: 1453.55ms | tok/sec: 2817.93 | norm: 2.02\n", "step3696 | loss: 0.14765891432762146 | dt: 1457.33ms | tok/sec: 2810.62 | norm: 2.32\n", "step3697 | loss: 0.1860755831003189 | dt: 1453.71ms | tok/sec: 2817.62 | norm: 2.28\n", "step3698 | loss: 0.14943046867847443 | dt: 1454.84ms | tok/sec: 2815.42 | norm: 1.93\n", "step3699 | loss: 0.1634497195482254 | dt: 1453.17ms | tok/sec: 2818.67 | norm: 2.14\n", "step3700 | loss: 0.12893269956111908 | dt: 1462.89ms | tok/sec: 2799.93 | norm: 1.93\n", "step3701 | loss: 0.2391589879989624 | dt: 1460.20ms | tok/sec: 2805.09 | norm: 2.74\n", "step3702 | loss: 0.20279765129089355 | dt: 1455.30ms | tok/sec: 2814.54 | norm: 2.24\n", "step3703 | loss: 0.19361412525177002 | dt: 1455.97ms | tok/sec: 2813.24 | norm: 2.25\n", "step3704 | loss: 0.18300685286521912 | dt: 1451.40ms | tok/sec: 2822.10 | norm: 2.41\n", "step3705 | loss: 0.2021825760602951 | dt: 1444.48ms | tok/sec: 
2835.63 | norm: 2.19\n", "step3706 | loss: 0.18379899859428406 | dt: 1451.26ms | tok/sec: 2822.37 | norm: 1.99\n", "step3707 | loss: 0.1592279076576233 | dt: 1450.40ms | tok/sec: 2824.06 | norm: 2.02\n", "step3708 | loss: 0.14933285117149353 | dt: 1445.75ms | tok/sec: 2833.14 | norm: 1.85\n", "step3709 | loss: 0.11073385179042816 | dt: 1439.94ms | tok/sec: 2844.56 | norm: 1.58\n", "step3710 | loss: 0.11014745384454727 | dt: 1458.47ms | tok/sec: 2808.42 | norm: 2.01\n", "step3711 | loss: 0.19047242403030396 | dt: 1461.61ms | tok/sec: 2802.39 | norm: 2.28\n", "step3712 | loss: 0.21693891286849976 | dt: 1441.94ms | tok/sec: 2840.62 | norm: 2.62\n", "step3713 | loss: 0.21957390010356903 | dt: 1451.45ms | tok/sec: 2822.01 | norm: 2.42\n", "step3714 | loss: 0.18059515953063965 | dt: 1461.47ms | tok/sec: 2802.65 | norm: 2.29\n", "step3715 | loss: 0.17076241970062256 | dt: 1454.00ms | tok/sec: 2817.05 | norm: 2.38\n", "step3716 | loss: 0.14949750900268555 | dt: 1454.73ms | tok/sec: 2815.64 | norm: 1.74\n", "step3717 | loss: 0.1345319300889969 | dt: 1457.47ms | tok/sec: 2810.34 | norm: 1.76\n", "step3718 | loss: 0.1224447712302208 | dt: 1453.93ms | tok/sec: 2817.19 | norm: 1.76\n", "step3719 | loss: 0.13697034120559692 | dt: 1449.93ms | tok/sec: 2824.96 | norm: 1.90\n", "step3720 | loss: 0.16213031113147736 | dt: 1448.17ms | tok/sec: 2828.40 | norm: 1.90\n", "step3721 | loss: 0.23156775534152985 | dt: 1447.22ms | tok/sec: 2830.25 | norm: 2.78\n", "step3722 | loss: 0.20227019488811493 | dt: 1443.21ms | tok/sec: 2838.11 | norm: 2.70\n", "step3723 | loss: 0.2142288237810135 | dt: 1451.55ms | tok/sec: 2821.82 | norm: 2.36\n", "step3724 | loss: 0.15152403712272644 | dt: 1445.42ms | tok/sec: 2833.78 | norm: 2.37\n", "step3725 | loss: 0.14867271482944489 | dt: 1454.16ms | tok/sec: 2816.74 | norm: 2.25\n", "step3726 | loss: 0.13538643717765808 | dt: 1455.81ms | tok/sec: 2813.56 | norm: 1.87\n", "step3727 | loss: 0.1464807093143463 | dt: 1455.68ms | tok/sec: 2813.80 | norm: 2.03\n", 
"step3728 | loss: 0.12861226499080658 | dt: 1453.29ms | tok/sec: 2818.44 | norm: 1.97\n", "step3729 | loss: 0.14071394503116608 | dt: 1451.97ms | tok/sec: 2821.00 | norm: 2.06\n", "step3730 | loss: 0.11784759163856506 | dt: 1444.84ms | tok/sec: 2834.91 | norm: 1.59\n", "step3731 | loss: 0.37144628167152405 | dt: 1453.17ms | tok/sec: 2818.66 | norm: 3.27\n", "step3732 | loss: 0.27533626556396484 | dt: 1447.04ms | tok/sec: 2830.61 | norm: 2.81\n", "step3733 | loss: 0.23622460663318634 | dt: 1453.02ms | tok/sec: 2818.96 | norm: 2.44\n", "step3734 | loss: 0.2252383679151535 | dt: 1453.84ms | tok/sec: 2817.37 | norm: 2.36\n", "step3735 | loss: 0.3524463474750519 | dt: 1452.54ms | tok/sec: 2819.88 | norm: 3.00\n", "step3736 | loss: 0.25288569927215576 | dt: 1460.71ms | tok/sec: 2804.11 | norm: 2.38\n", "step3737 | loss: 0.1570853888988495 | dt: 1448.29ms | tok/sec: 2828.16 | norm: 1.95\n", "step3738 | loss: 0.21393054723739624 | dt: 1451.84ms | tok/sec: 2821.24 | norm: 2.28\n", "step3739 | loss: 0.21404598653316498 | dt: 1461.01ms | tok/sec: 2803.54 | norm: 2.15\n", "step3740 | loss: 0.20256444811820984 | dt: 1448.30ms | tok/sec: 2828.14 | norm: 2.20\n", "step3741 | loss: 0.27760863304138184 | dt: 1450.47ms | tok/sec: 2823.91 | norm: 2.74\n", "step3742 | loss: 0.2182004600763321 | dt: 1462.53ms | tok/sec: 2800.63 | norm: 2.45\n", "step3743 | loss: 0.2014755755662918 | dt: 1452.44ms | tok/sec: 2820.09 | norm: 2.22\n", "step3744 | loss: 0.17744605243206024 | dt: 1453.44ms | tok/sec: 2818.14 | norm: 2.08\n", "step3745 | loss: 0.3323928713798523 | dt: 1449.18ms | tok/sec: 2826.43 | norm: 3.25\n", "step3746 | loss: 0.24674423038959503 | dt: 1459.19ms | tok/sec: 2807.05 | norm: 3.13\n", "step3747 | loss: 0.2004702389240265 | dt: 1454.86ms | tok/sec: 2815.40 | norm: 2.35\n", "step3748 | loss: 0.1578441858291626 | dt: 1455.38ms | tok/sec: 2814.39 | norm: 2.07\n", "step3749 | loss: 0.1610957533121109 | dt: 1458.05ms | tok/sec: 2809.24 | norm: 2.37\n", "step3750 | loss: 
0.23223139345645905 | dt: 1455.42ms | tok/sec: 2814.31 | norm: 2.33\n", "step3751 | loss: 0.2785252332687378 | dt: 1456.10ms | tok/sec: 2813.00 | norm: 2.92\n", "step3752 | loss: 0.22654381394386292 | dt: 1459.31ms | tok/sec: 2806.81 | norm: 2.81\n", "step3753 | loss: 0.15276828408241272 | dt: 1455.76ms | tok/sec: 2813.65 | norm: 2.11\n", "step3754 | loss: 0.1446346491575241 | dt: 1454.14ms | tok/sec: 2816.78 | norm: 2.03\n", "step3755 | loss: 0.18987374007701874 | dt: 1453.67ms | tok/sec: 2817.69 | norm: 2.40\n", "step3756 | loss: 0.10964491963386536 | dt: 1452.50ms | tok/sec: 2819.96 | norm: 1.83\n", "step3757 | loss: 0.1450541913509369 | dt: 1449.62ms | tok/sec: 2825.57 | norm: 2.46\n", "step3758 | loss: 0.13143861293792725 | dt: 1455.83ms | tok/sec: 2813.52 | norm: 2.05\n", "step3759 | loss: 0.29903513193130493 | dt: 1451.45ms | tok/sec: 2822.01 | norm: 3.25\n", "step3760 | loss: 0.18250976502895355 | dt: 1452.03ms | tok/sec: 2820.87 | norm: 2.13\n", "step3761 | loss: 0.36878639459609985 | dt: 1454.96ms | tok/sec: 2815.19 | norm: 3.11\n", "step3762 | loss: 0.2704402208328247 | dt: 1453.46ms | tok/sec: 2818.10 | norm: 2.51\n", "step3763 | loss: 0.31509286165237427 | dt: 1452.32ms | tok/sec: 2820.31 | norm: 2.51\n", "step3764 | loss: 0.28656867146492004 | dt: 1455.25ms | tok/sec: 2814.64 | norm: 2.90\n", "step3765 | loss: 0.2508774697780609 | dt: 1457.60ms | tok/sec: 2810.10 | norm: 2.74\n", "step3766 | loss: 0.20124028623104095 | dt: 1455.14ms | tok/sec: 2814.84 | norm: 2.95\n", "step3767 | loss: 0.17883309721946716 | dt: 1437.82ms | tok/sec: 2848.76 | norm: 2.63\n", "step3768 | loss: 0.188374325633049 | dt: 1455.11ms | tok/sec: 2814.91 | norm: 2.31\n", "step3769 | loss: 0.17531445622444153 | dt: 1453.95ms | tok/sec: 2817.15 | norm: 2.05\n", "step3770 | loss: 0.23761926591396332 | dt: 1454.76ms | tok/sec: 2815.58 | norm: 2.49\n", "step3771 | loss: 0.22393490374088287 | dt: 1454.90ms | tok/sec: 2815.32 | norm: 2.13\n", "step3772 | loss: 0.19844704866409302 | dt: 
1453.12ms | tok/sec: 2818.77 | norm: 2.49\n", "step3773 | loss: 0.15818952023983002 | dt: 1450.77ms | tok/sec: 2823.33 | norm: 2.23\n", "step3774 | loss: 0.17690399289131165 | dt: 1450.87ms | tok/sec: 2823.13 | norm: 2.56\n", "step3775 | loss: 0.1902664750814438 | dt: 1460.50ms | tok/sec: 2804.52 | norm: 2.72\n", "step3776 | loss: 0.1565658152103424 | dt: 1454.09ms | tok/sec: 2816.88 | norm: 2.02\n", "step3777 | loss: 0.11831558495759964 | dt: 1453.41ms | tok/sec: 2818.21 | norm: 1.89\n", "step3778 | loss: 0.12777115404605865 | dt: 1438.97ms | tok/sec: 2846.49 | norm: 2.00\n", "step3779 | loss: 0.1478545367717743 | dt: 1452.94ms | tok/sec: 2819.11 | norm: 1.93\n", "step3780 | loss: 0.1265711486339569 | dt: 1458.00ms | tok/sec: 2809.33 | norm: 2.19\n", "step3781 | loss: 0.14236921072006226 | dt: 1455.55ms | tok/sec: 2814.06 | norm: 2.01\n", "step3782 | loss: 0.10669921338558197 | dt: 1452.99ms | tok/sec: 2819.02 | norm: 1.81\n", "step3783 | loss: 0.2137722223997116 | dt: 1455.07ms | tok/sec: 2814.99 | norm: 2.40\n", "step3784 | loss: 0.16282188892364502 | dt: 1454.43ms | tok/sec: 2816.23 | norm: 2.11\n", "step3785 | loss: 0.1560683250427246 | dt: 1452.09ms | tok/sec: 2820.76 | norm: 2.16\n", "step3786 | loss: 0.15465858578681946 | dt: 1449.48ms | tok/sec: 2825.84 | norm: 2.17\n", "step3787 | loss: 0.1640603095293045 | dt: 1461.55ms | tok/sec: 2802.51 | norm: 2.27\n", "step3788 | loss: 0.15530304610729218 | dt: 1442.37ms | tok/sec: 2839.78 | norm: 2.26\n", "step3789 | loss: 0.1421416699886322 | dt: 1451.51ms | tok/sec: 2821.89 | norm: 2.12\n", "step3790 | loss: 0.1265118271112442 | dt: 1449.08ms | tok/sec: 2826.63 | norm: 2.08\n", "step3791 | loss: 0.09876709431409836 | dt: 1454.83ms | tok/sec: 2815.46 | norm: 1.71\n", "step3792 | loss: 0.11095339804887772 | dt: 1452.18ms | tok/sec: 2820.59 | norm: 1.92\n", "step3793 | loss: 0.16844695806503296 | dt: 1455.10ms | tok/sec: 2814.93 | norm: 2.41\n", "step3794 | loss: 0.19459733366966248 | dt: 1451.02ms | tok/sec: 2822.83 
| norm: 2.76\n", "step3795 | loss: 0.19468052685260773 | dt: 1454.88ms | tok/sec: 2815.35 | norm: 2.79\n", "step3796 | loss: 0.15100082755088806 | dt: 1453.27ms | tok/sec: 2818.48 | norm: 2.13\n", "step3797 | loss: 0.14177460968494415 | dt: 1448.71ms | tok/sec: 2827.35 | norm: 1.90\n", "step3798 | loss: 0.12069624662399292 | dt: 1441.39ms | tok/sec: 2841.71 | norm: 1.72\n", "step3799 | loss: 0.1067969873547554 | dt: 1445.05ms | tok/sec: 2834.50 | norm: 1.87\n", "step3800 | loss: 0.10488072782754898 | dt: 1456.30ms | tok/sec: 2812.60 | norm: 1.95\n", "step3801 | loss: 0.11620569229125977 | dt: 1443.59ms | tok/sec: 2837.37 | norm: 1.78\n", "step3802 | loss: 0.13085106015205383 | dt: 1454.31ms | tok/sec: 2816.46 | norm: 1.94\n", "step3803 | loss: 0.20472268760204315 | dt: 1452.85ms | tok/sec: 2819.29 | norm: 2.93\n", "step3804 | loss: 0.16652347147464752 | dt: 1450.45ms | tok/sec: 2823.95 | norm: 2.44\n", "step3805 | loss: 0.18212097883224487 | dt: 1454.10ms | tok/sec: 2816.86 | norm: 2.38\n", "step3806 | loss: 0.1314816176891327 | dt: 1465.06ms | tok/sec: 2795.79 | norm: 1.91\n", "step3807 | loss: 0.1296778917312622 | dt: 1456.13ms | tok/sec: 2812.94 | norm: 2.19\n", "step3808 | loss: 0.1055649071931839 | dt: 1439.62ms | tok/sec: 2845.19 | norm: 1.86\n", "step3809 | loss: 0.11923961341381073 | dt: 1445.65ms | tok/sec: 2833.33 | norm: 2.04\n", "step3810 | loss: 0.11192339658737183 | dt: 1443.28ms | tok/sec: 2837.98 | norm: 2.11\n", "step3811 | loss: 0.12622056901454926 | dt: 1455.75ms | tok/sec: 2813.67 | norm: 2.12\n", "step3812 | loss: 0.10215355455875397 | dt: 1452.42ms | tok/sec: 2820.13 | norm: 1.76\n", "step3813 | loss: 0.33713507652282715 | dt: 1445.09ms | tok/sec: 2834.42 | norm: 3.61\n", "step3814 | loss: 0.24812614917755127 | dt: 1453.71ms | tok/sec: 2817.63 | norm: 3.06\n", "step3815 | loss: 0.21207374334335327 | dt: 1460.31ms | tok/sec: 2804.89 | norm: 2.64\n", "step3816 | loss: 0.2149762511253357 | dt: 1449.44ms | tok/sec: 2825.91 | norm: 3.08\n", 
"step3817 | loss: 0.31825223565101624 | dt: 1448.98ms | tok/sec: 2826.82 | norm: 3.15\n", "step3818 | loss: 0.22847051918506622 | dt: 1457.71ms | tok/sec: 2809.90 | norm: 2.64\n", "step3819 | loss: 0.14261643588542938 | dt: 1458.96ms | tok/sec: 2807.48 | norm: 2.19\n", "step3820 | loss: 0.19023749232292175 | dt: 1454.25ms | tok/sec: 2816.58 | norm: 2.39\n", "step3821 | loss: 0.1879093199968338 | dt: 1454.54ms | tok/sec: 2816.00 | norm: 2.25\n", "step3822 | loss: 0.18434308469295502 | dt: 1452.11ms | tok/sec: 2820.72 | norm: 2.22\n", "step3823 | loss: 0.2526179552078247 | dt: 1450.13ms | tok/sec: 2824.57 | norm: 2.83\n", "step3824 | loss: 0.18972152471542358 | dt: 1454.44ms | tok/sec: 2816.20 | norm: 2.28\n", "step3825 | loss: 0.1737043857574463 | dt: 1450.58ms | tok/sec: 2823.69 | norm: 2.41\n", "step3826 | loss: 0.1480439156293869 | dt: 1457.48ms | tok/sec: 2810.34 | norm: 2.04\n", "step3827 | loss: 0.2905077636241913 | dt: 1457.24ms | tok/sec: 2810.78 | norm: 2.94\n", "step3828 | loss: 0.2224763035774231 | dt: 1448.27ms | tok/sec: 2828.21 | norm: 2.78\n", "step3829 | loss: 0.17284569144248962 | dt: 1459.99ms | tok/sec: 2805.49 | norm: 2.28\n", "step3830 | loss: 0.1457977592945099 | dt: 1455.59ms | tok/sec: 2813.97 | norm: 2.23\n", "step3831 | loss: 0.1486978381872177 | dt: 1451.67ms | tok/sec: 2821.57 | norm: 2.57\n", "step3832 | loss: 0.20108862221240997 | dt: 1458.49ms | tok/sec: 2808.39 | norm: 2.78\n", "step3833 | loss: 0.26426956057548523 | dt: 1458.53ms | tok/sec: 2808.31 | norm: 3.68\n", "step3834 | loss: 0.21026335656642914 | dt: 1455.01ms | tok/sec: 2815.11 | norm: 3.10\n", "step3835 | loss: 0.128698468208313 | dt: 1441.94ms | tok/sec: 2840.62 | norm: 2.23\n", "step3836 | loss: 0.1148872897028923 | dt: 1455.44ms | tok/sec: 2814.26 | norm: 1.67\n", "step3837 | loss: 0.17050902545452118 | dt: 1447.46ms | tok/sec: 2829.79 | norm: 2.41\n", "step3838 | loss: 0.09807047247886658 | dt: 1452.67ms | tok/sec: 2819.64 | norm: 2.03\n", "step3839 | loss: 
0.11967926472425461 | dt: 1447.90ms | tok/sec: 2828.93 | norm: 2.15\n", "step3840 | loss: 0.1086922287940979 | dt: 1446.22ms | tok/sec: 2832.21 | norm: 1.73\n", "step3841 | loss: 0.2672763168811798 | dt: 1453.17ms | tok/sec: 2818.67 | norm: 2.90\n", "step3842 | loss: 0.16483834385871887 | dt: 1462.04ms | tok/sec: 2801.56 | norm: 2.60\n", "step3843 | loss: 0.3308486044406891 | dt: 1445.88ms | tok/sec: 2832.88 | norm: 3.53\n", "step3844 | loss: 0.24300359189510345 | dt: 1446.57ms | tok/sec: 2831.53 | norm: 2.68\n", "step3845 | loss: 0.28419381380081177 | dt: 1456.06ms | tok/sec: 2813.06 | norm: 3.03\n", "step3846 | loss: 0.24246719479560852 | dt: 1455.47ms | tok/sec: 2814.21 | norm: 2.73\n", "step3847 | loss: 0.2073313444852829 | dt: 1455.05ms | tok/sec: 2815.02 | norm: 2.30\n", "step3848 | loss: 0.15198901295661926 | dt: 1455.77ms | tok/sec: 2813.64 | norm: 1.99\n", "step3849 | loss: 0.15030401945114136 | dt: 1448.08ms | tok/sec: 2828.58 | norm: 2.17\n", "step3850 | loss: 0.16820497810840607 | dt: 1450.36ms | tok/sec: 2824.12 | norm: 2.46\n", "step3851 | loss: 0.1489752233028412 | dt: 1450.71ms | tok/sec: 2823.45 | norm: 2.25\n", "step3852 | loss: 0.19094745814800262 | dt: 1453.17ms | tok/sec: 2818.66 | norm: 2.41\n", "step3853 | loss: 0.19890020787715912 | dt: 1450.77ms | tok/sec: 2823.34 | norm: 2.54\n", "step3854 | loss: 0.17711153626441956 | dt: 1443.64ms | tok/sec: 2837.27 | norm: 2.32\n", "step3855 | loss: 0.12946805357933044 | dt: 1435.72ms | tok/sec: 2852.92 | norm: 1.95\n", "step3856 | loss: 0.15061983466148376 | dt: 1458.76ms | tok/sec: 2807.86 | norm: 2.31\n", "step3857 | loss: 0.15447287261486053 | dt: 1454.24ms | tok/sec: 2816.58 | norm: 2.40\n", "step3858 | loss: 0.11410341411828995 | dt: 1456.18ms | tok/sec: 2812.84 | norm: 2.03\n", "step3859 | loss: 0.09823379665613174 | dt: 1448.57ms | tok/sec: 2827.63 | norm: 1.78\n", "step3860 | loss: 0.10104015469551086 | dt: 1445.98ms | tok/sec: 2832.68 | norm: 1.81\n", "step3861 | loss: 0.1364942342042923 | dt: 
1458.01ms | tok/sec: 2809.31 | norm: 2.59\n", "step3862 | loss: 0.1116282120347023 | dt: 1450.80ms | tok/sec: 2823.27 | norm: 2.05\n", "step3863 | loss: 0.13054777681827545 | dt: 1450.98ms | tok/sec: 2822.93 | norm: 1.82\n", "step3864 | loss: 0.10260938107967377 | dt: 1460.96ms | tok/sec: 2803.64 | norm: 2.06\n", "step3865 | loss: 0.17798511683940887 | dt: 1454.16ms | tok/sec: 2816.74 | norm: 2.39\n", "step3866 | loss: 0.13608507812023163 | dt: 1448.66ms | tok/sec: 2827.44 | norm: 2.09\n", "step3867 | loss: 0.13344477117061615 | dt: 1451.89ms | tok/sec: 2821.15 | norm: 2.04\n", "step3868 | loss: 0.13085618615150452 | dt: 1456.81ms | tok/sec: 2811.63 | norm: 1.93\n", "step3869 | loss: 0.1513628214597702 | dt: 1453.59ms | tok/sec: 2817.85 | norm: 2.41\n", "step3870 | loss: 0.1459427922964096 | dt: 1452.94ms | tok/sec: 2819.10 | norm: 2.46\n", "step3871 | loss: 0.13315902650356293 | dt: 1452.20ms | tok/sec: 2820.56 | norm: 2.18\n", "step3872 | loss: 0.11275094002485275 | dt: 1453.21ms | tok/sec: 2818.58 | norm: 1.79\n", "step3873 | loss: 0.09507273882627487 | dt: 1453.32ms | tok/sec: 2818.37 | norm: 2.08\n", "step3874 | loss: 0.10454541444778442 | dt: 1460.58ms | tok/sec: 2804.37 | norm: 2.11\n", "step3875 | loss: 0.14865301549434662 | dt: 1452.67ms | tok/sec: 2819.63 | norm: 2.22\n", "step3876 | loss: 0.17493991553783417 | dt: 1454.28ms | tok/sec: 2816.51 | norm: 2.43\n", "step3877 | loss: 0.17497147619724274 | dt: 1453.11ms | tok/sec: 2818.77 | norm: 2.64\n", "step3878 | loss: 0.13006971776485443 | dt: 1452.18ms | tok/sec: 2820.59 | norm: 2.16\n", "step3879 | loss: 0.1291326880455017 | dt: 1448.30ms | tok/sec: 2828.14 | norm: 2.18\n", "step3880 | loss: 0.10309534519910812 | dt: 1458.11ms | tok/sec: 2809.12 | norm: 1.98\n", "step3881 | loss: 0.0979541763663292 | dt: 1454.66ms | tok/sec: 2815.78 | norm: 1.91\n", "step3882 | loss: 0.10215254873037338 | dt: 1460.46ms | tok/sec: 2804.60 | norm: 2.32\n", "step3883 | loss: 0.10501040518283844 | dt: 1439.47ms | tok/sec: 
2845.48 | norm: 1.73\n", "step3884 | loss: 0.11593540012836456 | dt: 1459.36ms | tok/sec: 2806.71 | norm: 1.81\n", "step3885 | loss: 0.1805296540260315 | dt: 1455.24ms | tok/sec: 2814.65 | norm: 2.77\n", "step3886 | loss: 0.14461073279380798 | dt: 1462.06ms | tok/sec: 2801.53 | norm: 2.47\n", "step3887 | loss: 0.15373595058918 | dt: 1456.74ms | tok/sec: 2811.76 | norm: 2.13\n", "step3888 | loss: 0.1076718270778656 | dt: 1456.22ms | tok/sec: 2812.76 | norm: 2.18\n", "step3889 | loss: 0.10451424866914749 | dt: 1445.13ms | tok/sec: 2834.34 | norm: 1.90\n", "step3890 | loss: 0.10069803148508072 | dt: 1453.33ms | tok/sec: 2818.36 | norm: 2.11\n", "step3891 | loss: 0.10698889940977097 | dt: 1452.89ms | tok/sec: 2819.20 | norm: 2.29\n", "step3892 | loss: 0.10019479691982269 | dt: 1459.86ms | tok/sec: 2805.76 | norm: 2.09\n", "step3893 | loss: 0.09601335972547531 | dt: 1455.34ms | tok/sec: 2814.46 | norm: 1.80\n", "step3894 | loss: 0.08856703341007233 | dt: 1458.25ms | tok/sec: 2808.85 | norm: 2.13\n", "step3895 | loss: 0.27702102065086365 | dt: 1455.05ms | tok/sec: 2815.02 | norm: 3.07\n", "step3896 | loss: 0.1999707818031311 | dt: 1455.45ms | tok/sec: 2814.26 | norm: 2.80\n", "step3897 | loss: 0.16751526296138763 | dt: 1456.48ms | tok/sec: 2812.26 | norm: 2.57\n", "step3898 | loss: 0.17090849578380585 | dt: 1448.18ms | tok/sec: 2828.37 | norm: 2.35\n", "step3899 | loss: 0.2670383155345917 | dt: 1456.73ms | tok/sec: 2811.78 | norm: 2.78\n", "step3900 | loss: 0.19353151321411133 | dt: 1452.85ms | tok/sec: 2819.29 | norm: 2.52\n", "step3901 | loss: 0.12497348338365555 | dt: 1453.70ms | tok/sec: 2817.64 | norm: 2.28\n", "step3902 | loss: 0.16432039439678192 | dt: 1456.62ms | tok/sec: 2811.99 | norm: 2.40\n", "step3903 | loss: 0.1599733829498291 | dt: 1452.97ms | tok/sec: 2819.05 | norm: 2.45\n", "step3904 | loss: 0.1528489887714386 | dt: 1453.52ms | tok/sec: 2817.99 | norm: 2.51\n", "step3905 | loss: 0.22298100590705872 | dt: 1455.78ms | tok/sec: 2813.61 | norm: 2.79\n", 
"step3906 | loss: 0.18186284601688385 | dt: 1453.77ms | tok/sec: 2817.51 | norm: 2.79\n", "step3907 | loss: 0.160768061876297 | dt: 1445.00ms | tok/sec: 2834.61 | norm: 2.11\n", "step3908 | loss: 0.13802702724933624 | dt: 1454.50ms | tok/sec: 2816.09 | norm: 2.03\n", "step3909 | loss: 0.27190911769866943 | dt: 1459.35ms | tok/sec: 2806.73 | norm: 3.06\n", "step3910 | loss: 0.2020593285560608 | dt: 1452.80ms | tok/sec: 2819.39 | norm: 2.65\n", "step3911 | loss: 0.14951349794864655 | dt: 1450.70ms | tok/sec: 2823.47 | norm: 2.11\n", "step3912 | loss: 0.12365668267011642 | dt: 1450.10ms | tok/sec: 2824.64 | norm: 2.07\n", "step3913 | loss: 0.13272874057292938 | dt: 1455.17ms | tok/sec: 2814.79 | norm: 2.10\n", "step3914 | loss: 0.1833968609571457 | dt: 1458.35ms | tok/sec: 2808.65 | norm: 2.66\n", "step3915 | loss: 0.23843789100646973 | dt: 1449.62ms | tok/sec: 2825.56 | norm: 3.12\n", "step3916 | loss: 0.17193058133125305 | dt: 1459.34ms | tok/sec: 2806.74 | norm: 2.60\n", "step3917 | loss: 0.11655856668949127 | dt: 1455.87ms | tok/sec: 2813.44 | norm: 2.19\n", "step3918 | loss: 0.09898079186677933 | dt: 1453.28ms | tok/sec: 2818.46 | norm: 1.76\n", "step3919 | loss: 0.17187006771564484 | dt: 1452.86ms | tok/sec: 2819.27 | norm: 2.87\n", "step3920 | loss: 0.0880291759967804 | dt: 1462.48ms | tok/sec: 2800.72 | norm: 1.85\n", "step3921 | loss: 0.10005863755941391 | dt: 1443.62ms | tok/sec: 2837.30 | norm: 1.77\n", "step3922 | loss: 0.0930970162153244 | dt: 1462.12ms | tok/sec: 2801.41 | norm: 1.79\n", "step3923 | loss: 0.2093571275472641 | dt: 1452.17ms | tok/sec: 2820.60 | norm: 2.44\n", "step3924 | loss: 0.12545570731163025 | dt: 1450.24ms | tok/sec: 2824.36 | norm: 1.76\n", "step3925 | loss: 0.27792736887931824 | dt: 1452.04ms | tok/sec: 2820.87 | norm: 3.01\n", "step3926 | loss: 0.18384797871112823 | dt: 1453.67ms | tok/sec: 2817.69 | norm: 2.28\n", "step3927 | loss: 0.2392643392086029 | dt: 1457.66ms | tok/sec: 2809.98 | norm: 3.02\n", "step3928 | loss: 
0.20910611748695374 | dt: 1458.37ms | tok/sec: 2808.62 | norm: 2.71\n", "step3929 | loss: 0.17594416439533234 | dt: 1456.57ms | tok/sec: 2812.08 | norm: 2.55\n", "step3930 | loss: 0.12174955010414124 | dt: 1454.28ms | tok/sec: 2816.52 | norm: 1.85\n", "step3931 | loss: 0.11724930256605148 | dt: 1454.84ms | tok/sec: 2815.44 | norm: 2.13\n", "step3932 | loss: 0.134203240275383 | dt: 1465.76ms | tok/sec: 2794.45 | norm: 2.01\n", "step3933 | loss: 0.11995621770620346 | dt: 1457.13ms | tok/sec: 2811.00 | norm: 2.14\n", "step3934 | loss: 0.1687619835138321 | dt: 1452.85ms | tok/sec: 2819.29 | norm: 2.43\n", "step3935 | loss: 0.16846685111522675 | dt: 1454.09ms | tok/sec: 2816.88 | norm: 2.33\n", "step3936 | loss: 0.14334112405776978 | dt: 1453.56ms | tok/sec: 2817.91 | norm: 2.00\n", "step3937 | loss: 0.10289536416530609 | dt: 1453.06ms | tok/sec: 2818.89 | norm: 1.97\n", "step3938 | loss: 0.1170831024646759 | dt: 1457.00ms | tok/sec: 2811.27 | norm: 1.91\n", "step3939 | loss: 0.1258583515882492 | dt: 1442.13ms | tok/sec: 2840.25 | norm: 2.09\n", "step3940 | loss: 0.10426764190196991 | dt: 1450.13ms | tok/sec: 2824.58 | norm: 1.97\n", "step3941 | loss: 0.10159585624933243 | dt: 1444.86ms | tok/sec: 2834.88 | norm: 2.33\n", "step3942 | loss: 0.09106671065092087 | dt: 1450.69ms | tok/sec: 2823.48 | norm: 1.81\n", "step3943 | loss: 0.12094495445489883 | dt: 1449.10ms | tok/sec: 2826.58 | norm: 2.08\n", "step3944 | loss: 0.09491202980279922 | dt: 1453.75ms | tok/sec: 2817.55 | norm: 1.78\n", "step3945 | loss: 0.0855354517698288 | dt: 1451.40ms | tok/sec: 2822.11 | norm: 1.50\n", "step3946 | loss: 0.07635314017534256 | dt: 1455.61ms | tok/sec: 2813.94 | norm: 1.57\n", "step3947 | loss: 0.13886284828186035 | dt: 1451.86ms | tok/sec: 2821.21 | norm: 1.93\n", "step3948 | loss: 0.11462404578924179 | dt: 1454.52ms | tok/sec: 2816.05 | norm: 2.02\n", "step3949 | loss: 0.10865321010351181 | dt: 1450.19ms | tok/sec: 2824.45 | norm: 1.89\n", "step3950 | loss: 0.10418574512004852 | dt: 
1451.58ms | tok/sec: 2821.76 | norm: 1.91\n", "step3951 | loss: 0.11562184989452362 | dt: 1441.50ms | tok/sec: 2841.48 | norm: 1.81\n", "step3952 | loss: 0.1114373654127121 | dt: 1463.14ms | tok/sec: 2799.46 | norm: 2.06\n", "step3953 | loss: 0.09679725766181946 | dt: 1453.90ms | tok/sec: 2817.25 | norm: 1.84\n", "step3954 | loss: 0.08773120492696762 | dt: 1455.06ms | tok/sec: 2815.00 | norm: 1.74\n", "step3955 | loss: 0.08425816893577576 | dt: 1448.65ms | tok/sec: 2827.46 | norm: 1.94\n", "step3956 | loss: 0.07953040301799774 | dt: 1452.08ms | tok/sec: 2820.78 | norm: 1.48\n", "step3957 | loss: 0.12483648955821991 | dt: 1454.63ms | tok/sec: 2815.83 | norm: 2.36\n", "step3958 | loss: 0.1382896453142166 | dt: 1454.87ms | tok/sec: 2815.38 | norm: 2.19\n", "step3959 | loss: 0.15042142570018768 | dt: 1453.42ms | tok/sec: 2818.18 | norm: 2.57\n", "step3960 | loss: 0.1134289801120758 | dt: 1452.67ms | tok/sec: 2819.63 | norm: 2.01\n", "step3961 | loss: 0.12405310571193695 | dt: 1452.95ms | tok/sec: 2819.09 | norm: 2.57\n", "step3962 | loss: 0.0997130498290062 | dt: 1448.36ms | tok/sec: 2828.02 | norm: 2.13\n", "step3963 | loss: 0.09345140308141708 | dt: 1450.83ms | tok/sec: 2823.22 | norm: 2.13\n", "step3964 | loss: 0.09633325785398483 | dt: 1453.19ms | tok/sec: 2818.62 | norm: 2.16\n", "step3965 | loss: 0.09257005155086517 | dt: 1447.32ms | tok/sec: 2830.05 | norm: 2.14\n", "step3966 | loss: 0.1003948450088501 | dt: 1452.25ms | tok/sec: 2820.46 | norm: 2.11\n", "step3967 | loss: 0.1492462307214737 | dt: 1450.52ms | tok/sec: 2823.82 | norm: 2.43\n", "step3968 | loss: 0.13135598599910736 | dt: 1451.95ms | tok/sec: 2821.04 | norm: 2.53\n", "step3969 | loss: 0.12919451296329498 | dt: 1453.54ms | tok/sec: 2817.94 | norm: 2.07\n", "step3970 | loss: 0.09521026909351349 | dt: 1452.19ms | tok/sec: 2820.56 | norm: 1.86\n", "step3971 | loss: 0.09200818836688995 | dt: 1453.51ms | tok/sec: 2818.00 | norm: 1.96\n", "step3972 | loss: 0.08223738521337509 | dt: 1455.06ms | tok/sec: 
2815.01 | norm: 1.69\n", "step3973 | loss: 0.08067475259304047 | dt: 1449.94ms | tok/sec: 2824.95 | norm: 1.61\n", "step3974 | loss: 0.08284617960453033 | dt: 1443.43ms | tok/sec: 2837.68 | norm: 1.95\n", "step3975 | loss: 0.08410495519638062 | dt: 1443.32ms | tok/sec: 2837.89 | norm: 1.67\n", "step3976 | loss: 0.07914762943983078 | dt: 1448.31ms | tok/sec: 2828.12 | norm: 1.93\n", "step3977 | loss: 0.22736014425754547 | dt: 1443.39ms | tok/sec: 2837.77 | norm: 2.82\n", "step3978 | loss: 0.16698531806468964 | dt: 1453.81ms | tok/sec: 2817.43 | norm: 2.65\n", "step3979 | loss: 0.14135795831680298 | dt: 1447.25ms | tok/sec: 2830.19 | norm: 2.27\n", "step3980 | loss: 0.13476571440696716 | dt: 1451.51ms | tok/sec: 2821.89 | norm: 2.11\n", "step3981 | loss: 0.2178320288658142 | dt: 1448.36ms | tok/sec: 2828.02 | norm: 2.48\n", "step3982 | loss: 0.14959068596363068 | dt: 1440.86ms | tok/sec: 2842.75 | norm: 2.09\n", "step3983 | loss: 0.10019863396883011 | dt: 1448.53ms | tok/sec: 2827.69 | norm: 2.04\n", "step3984 | loss: 0.14347043633460999 | dt: 1444.68ms | tok/sec: 2835.23 | norm: 2.54\n", "step3985 | loss: 0.1462097316980362 | dt: 1446.57ms | tok/sec: 2831.53 | norm: 2.59\n", "step3986 | loss: 0.1367923617362976 | dt: 1440.50ms | tok/sec: 2843.46 | norm: 2.26\n", "step3987 | loss: 0.18355098366737366 | dt: 1448.52ms | tok/sec: 2827.72 | norm: 2.55\n", "step3988 | loss: 0.1356094926595688 | dt: 1459.01ms | tok/sec: 2807.37 | norm: 2.17\n", "step3989 | loss: 0.12796297669410706 | dt: 1446.67ms | tok/sec: 2831.34 | norm: 2.15\n", "step3990 | loss: 0.11387480795383453 | dt: 1451.35ms | tok/sec: 2822.19 | norm: 2.02\n", "step3991 | loss: 0.2458215057849884 | dt: 1443.58ms | tok/sec: 2837.40 | norm: 3.15\n", "step3992 | loss: 0.16870799660682678 | dt: 1447.57ms | tok/sec: 2829.57 | norm: 2.69\n", "step3993 | loss: 0.13559690117835999 | dt: 1442.13ms | tok/sec: 2840.25 | norm: 2.31\n", "step3994 | loss: 0.11227740347385406 | dt: 1448.08ms | tok/sec: 2828.56 | norm: 1.97\n", 
"step3995 | loss: 0.11387322098016739 | dt: 1446.73ms | tok/sec: 2831.21 | norm: 2.21\n", "step3996 | loss: 0.15616095066070557 | dt: 1453.97ms | tok/sec: 2817.12 | norm: 2.44\n", "step3997 | loss: 0.20992165803909302 | dt: 1455.46ms | tok/sec: 2814.23 | norm: 3.11\n", "step3998 | loss: 0.1676238626241684 | dt: 1455.38ms | tok/sec: 2814.38 | norm: 2.89\n", "step3999 | loss: 0.1134057566523552 | dt: 1447.59ms | tok/sec: 2829.53 | norm: 2.48\n", "step4000 | loss: 0.0991256833076477 | dt: 1450.93ms | tok/sec: 2823.01 | norm: 2.00\n", "step4001 | loss: 0.14696656167507172 | dt: 1453.95ms | tok/sec: 2817.16 | norm: 2.65\n", "step4002 | loss: 0.08100149035453796 | dt: 1456.68ms | tok/sec: 2811.88 | norm: 1.88\n", "step4003 | loss: 0.09317673742771149 | dt: 1448.70ms | tok/sec: 2827.36 | norm: 2.10\n", "step4004 | loss: 0.0818205177783966 | dt: 1455.80ms | tok/sec: 2813.58 | norm: 1.75\n", "step4005 | loss: 0.17911164462566376 | dt: 1452.57ms | tok/sec: 2819.84 | norm: 2.42\n", "step4006 | loss: 0.11073881387710571 | dt: 1454.36ms | tok/sec: 2816.36 | norm: 1.95\n", "step4007 | loss: 0.237913116812706 | dt: 1453.53ms | tok/sec: 2817.96 | norm: 2.89\n", "step4008 | loss: 0.1591116040945053 | dt: 1454.69ms | tok/sec: 2815.72 | norm: 2.32\n", "step4009 | loss: 0.2038808912038803 | dt: 1458.96ms | tok/sec: 2807.49 | norm: 2.46\n", "step4010 | loss: 0.15910930931568146 | dt: 1451.17ms | tok/sec: 2822.56 | norm: 2.22\n", "step4011 | loss: 0.15472523868083954 | dt: 1452.78ms | tok/sec: 2819.43 | norm: 2.38\n", "step4012 | loss: 0.10145218670368195 | dt: 1452.43ms | tok/sec: 2820.10 | norm: 1.88\n", "step4013 | loss: 0.09528855234384537 | dt: 1458.04ms | tok/sec: 2809.25 | norm: 1.61\n", "step4014 | loss: 0.11459725350141525 | dt: 1450.30ms | tok/sec: 2824.24 | norm: 2.27\n", "step4015 | loss: 0.10922395437955856 | dt: 1448.57ms | tok/sec: 2827.62 | norm: 2.04\n", "step4016 | loss: 0.13533377647399902 | dt: 1456.01ms | tok/sec: 2813.17 | norm: 2.04\n", "step4017 | loss: 
0.13643909990787506 | dt: 1456.07ms | tok/sec: 2813.04 | norm: 2.10\n", "step4018 | loss: 0.11202670633792877 | dt: 1454.55ms | tok/sec: 2815.99 | norm: 1.88\n", "step4019 | loss: 0.08439189940690994 | dt: 1459.78ms | tok/sec: 2805.91 | norm: 1.91\n", "step4020 | loss: 0.09561590105295181 | dt: 1459.90ms | tok/sec: 2805.68 | norm: 2.14\n", "step4021 | loss: 0.10456808656454086 | dt: 1453.83ms | tok/sec: 2817.38 | norm: 2.20\n", "step4022 | loss: 0.08239585161209106 | dt: 1451.26ms | tok/sec: 2822.38 | norm: 1.78\n", "step4023 | loss: 0.07925659418106079 | dt: 1455.93ms | tok/sec: 2813.32 | norm: 1.79\n", "step4024 | loss: 0.07495423406362534 | dt: 1454.92ms | tok/sec: 2815.27 | norm: 1.64\n", "step4025 | loss: 0.10106673091650009 | dt: 1452.81ms | tok/sec: 2819.37 | norm: 2.22\n", "step4026 | loss: 0.07754089683294296 | dt: 1452.68ms | tok/sec: 2819.61 | norm: 1.79\n", "step4027 | loss: 0.07435891032218933 | dt: 1458.72ms | tok/sec: 2807.93 | norm: 1.74\n", "step4028 | loss: 0.06589115411043167 | dt: 1455.50ms | tok/sec: 2814.15 | norm: 1.65\n", "step4029 | loss: 0.12162720412015915 | dt: 1457.91ms | tok/sec: 2809.50 | norm: 2.28\n", "step4030 | loss: 0.09277305752038956 | dt: 1447.44ms | tok/sec: 2829.82 | norm: 1.81\n", "step4031 | loss: 0.09089701622724533 | dt: 1451.44ms | tok/sec: 2822.03 | norm: 1.87\n", "step4032 | loss: 0.08324488252401352 | dt: 1450.72ms | tok/sec: 2823.42 | norm: 1.57\n", "step4033 | loss: 0.1022883728146553 | dt: 1456.25ms | tok/sec: 2812.70 | norm: 2.17\n", "step4034 | loss: 0.09285113215446472 | dt: 1457.76ms | tok/sec: 2809.80 | norm: 1.75\n", "step4035 | loss: 0.08557666838169098 | dt: 1455.95ms | tok/sec: 2813.29 | norm: 1.83\n", "step4036 | loss: 0.07987969368696213 | dt: 1457.28ms | tok/sec: 2810.72 | norm: 1.91\n", "step4037 | loss: 0.07258019596338272 | dt: 1455.94ms | tok/sec: 2813.30 | norm: 1.70\n", "step4038 | loss: 0.07918772101402283 | dt: 1455.98ms | tok/sec: 2813.22 | norm: 1.78\n", "step4039 | loss: 0.13650770485401154 
| dt: 1453.90ms | tok/sec: 2817.25 | norm: 2.17\n", "step4040 | loss: 0.11824358999729156 | dt: 1456.88ms | tok/sec: 2811.49 | norm: 2.18\n", "step4041 | loss: 0.1339251697063446 | dt: 1440.77ms | tok/sec: 2842.93 | norm: 2.32\n", "step4042 | loss: 0.0984354242682457 | dt: 1454.74ms | tok/sec: 2815.63 | norm: 2.30\n", "step4043 | loss: 0.09653817862272263 | dt: 1452.99ms | tok/sec: 2819.02 | norm: 1.90\n", "step4044 | loss: 0.08957485109567642 | dt: 1451.79ms | tok/sec: 2821.34 | norm: 2.11\n", "step4045 | loss: 0.0814116820693016 | dt: 1442.00ms | tok/sec: 2840.51 | norm: 2.14\n", "step4046 | loss: 0.08089349418878555 | dt: 1452.78ms | tok/sec: 2819.41 | norm: 1.98\n", "step4047 | loss: 0.07897663116455078 | dt: 1455.36ms | tok/sec: 2814.43 | norm: 2.06\n", "step4048 | loss: 0.08559174835681915 | dt: 1453.29ms | tok/sec: 2818.42 | norm: 2.11\n", "step4049 | loss: 0.12685275077819824 | dt: 1456.27ms | tok/sec: 2812.67 | norm: 2.50\n", "step4050 | loss: 0.11703310161828995 | dt: 1457.41ms | tok/sec: 2810.46 | norm: 2.41\n", "step4051 | loss: 0.11148746311664581 | dt: 1462.74ms | tok/sec: 2800.22 | norm: 2.36\n", "step4052 | loss: 0.07459664344787598 | dt: 1456.55ms | tok/sec: 2812.13 | norm: 1.71\n", "step4053 | loss: 0.07508549839258194 | dt: 1456.77ms | tok/sec: 2811.70 | norm: 1.92\n", "step4054 | loss: 0.08329525589942932 | dt: 1450.48ms | tok/sec: 2823.90 | norm: 2.26\n", "step4055 | loss: 0.07463127374649048 | dt: 1459.99ms | tok/sec: 2805.50 | norm: 2.13\n", "step4056 | loss: 0.07288706302642822 | dt: 1455.87ms | tok/sec: 2813.43 | norm: 2.02\n", "step4057 | loss: 0.07821677625179291 | dt: 1458.73ms | tok/sec: 2807.92 | norm: 2.01\n", "step4058 | loss: 0.07619055360555649 | dt: 1456.28ms | tok/sec: 2812.65 | norm: 1.88\n", "step4059 | loss: 0.22477500140666962 | dt: 1442.43ms | tok/sec: 2839.65 | norm: 3.53\n", "step4060 | loss: 0.16090914607048035 | dt: 1455.16ms | tok/sec: 2814.81 | norm: 2.87\n", "step4061 | loss: 0.1290520280599594 | dt: 1459.28ms | 
tok/sec: 2806.86 | norm: 2.47\n", "step4062 | loss: 0.13151408731937408 | dt: 1445.33ms | tok/sec: 2833.96 | norm: 2.33\n", "step4063 | loss: 0.19317200779914856 | dt: 1451.18ms | tok/sec: 2822.54 | norm: 2.84\n", "step4064 | loss: 0.13885383307933807 | dt: 1456.84ms | tok/sec: 2811.56 | norm: 2.35\n", "step4065 | loss: 0.09637100994586945 | dt: 1462.30ms | tok/sec: 2801.06 | norm: 2.25\n", "step4066 | loss: 0.11958982795476913 | dt: 1444.14ms | tok/sec: 2836.29 | norm: 2.09\n", "step4067 | loss: 0.11435393989086151 | dt: 1452.39ms | tok/sec: 2820.17 | norm: 2.20\n", "step4068 | loss: 0.12556789815425873 | dt: 1453.18ms | tok/sec: 2818.64 | norm: 2.26\n", "step4069 | loss: 0.18624979257583618 | dt: 1459.41ms | tok/sec: 2806.61 | norm: 3.16\n", "step4070 | loss: 0.22874002158641815 | dt: 1457.40ms | tok/sec: 2810.49 | norm: 4.28\n", "step4071 | loss: 0.11940689384937286 | dt: 1456.04ms | tok/sec: 2813.11 | norm: 2.26\n", "step4072 | loss: 0.10018376260995865 | dt: 1458.78ms | tok/sec: 2807.82 | norm: 1.87\n", "step4073 | loss: 0.19609342515468597 | dt: 1454.20ms | tok/sec: 2816.67 | norm: 2.50\n", "step4074 | loss: 0.1387944370508194 | dt: 1454.30ms | tok/sec: 2816.47 | norm: 2.21\n", "step4075 | loss: 0.11445726454257965 | dt: 1460.13ms | tok/sec: 2805.24 | norm: 2.17\n", "step4076 | loss: 0.0941728949546814 | dt: 1458.05ms | tok/sec: 2809.22 | norm: 2.11\n", "step4077 | loss: 0.09525355696678162 | dt: 1457.39ms | tok/sec: 2810.51 | norm: 2.03\n", "step4078 | loss: 0.13870418071746826 | dt: 1460.70ms | tok/sec: 2804.13 | norm: 2.38\n", "step4079 | loss: 0.16445228457450867 | dt: 1454.38ms | tok/sec: 2816.32 | norm: 2.24\n", "step4080 | loss: 0.13063018023967743 | dt: 1460.22ms | tok/sec: 2805.06 | norm: 2.09\n", "step4081 | loss: 0.08427255600690842 | dt: 1457.54ms | tok/sec: 2810.22 | norm: 1.78\n", "step4082 | loss: 0.07892459630966187 | dt: 1448.62ms | tok/sec: 2827.52 | norm: 1.81\n", "step4083 | loss: 0.1155475601553917 | dt: 1449.57ms | tok/sec: 2825.67 | 
norm: 2.52\n", "step4084 | loss: 0.06566399335861206 | dt: 1455.17ms | tok/sec: 2814.79 | norm: 1.68\n", "step4085 | loss: 0.08926550298929214 | dt: 1455.93ms | tok/sec: 2813.33 | norm: 2.07\n", "step4086 | loss: 0.07100672274827957 | dt: 1460.02ms | tok/sec: 2805.44 | norm: 1.76\n", "step4087 | loss: 0.15587837994098663 | dt: 1449.15ms | tok/sec: 2826.48 | norm: 2.36\n", "step4088 | loss: 0.09610076993703842 | dt: 1462.07ms | tok/sec: 2801.51 | norm: 1.94\n", "step4089 | loss: 0.1926085352897644 | dt: 1456.73ms | tok/sec: 2811.79 | norm: 2.50\n", "step4090 | loss: 0.1373256891965866 | dt: 1453.90ms | tok/sec: 2817.24 | norm: 2.35\n", "step4091 | loss: 0.16994723677635193 | dt: 1439.45ms | tok/sec: 2845.53 | norm: 2.43\n", "step4092 | loss: 0.14024879038333893 | dt: 1456.62ms | tok/sec: 2811.98 | norm: 2.38\n", "step4093 | loss: 0.12997622787952423 | dt: 1457.80ms | tok/sec: 2809.71 | norm: 2.38\n", "step4094 | loss: 0.08806484937667847 | dt: 1453.56ms | tok/sec: 2817.90 | norm: 1.66\n", "step4095 | loss: 0.07590636610984802 | dt: 1453.80ms | tok/sec: 2817.45 | norm: 1.65\n", "step4096 | loss: 0.0974959135055542 | dt: 1453.49ms | tok/sec: 2818.04 | norm: 1.80\n", "step4097 | loss: 0.0921422466635704 | dt: 1454.49ms | tok/sec: 2816.11 | norm: 1.94\n", "step4098 | loss: 0.1179453581571579 | dt: 1453.93ms | tok/sec: 2817.19 | norm: 2.25\n", "step4099 | loss: 0.11832978576421738 | dt: 1456.73ms | tok/sec: 2811.78 | norm: 1.92\n", "step4100 | loss: 0.10117404907941818 | dt: 1462.62ms | tok/sec: 2800.45 | norm: 2.07\n", "step4101 | loss: 0.07317738234996796 | dt: 1445.29ms | tok/sec: 2834.04 | norm: 1.74\n", "step4102 | loss: 0.08234070986509323 | dt: 1462.71ms | tok/sec: 2800.29 | norm: 2.01\n", "step4103 | loss: 0.09987511485815048 | dt: 1458.63ms | tok/sec: 2808.11 | norm: 2.23\n", "step4104 | loss: 0.08629225939512253 | dt: 1450.38ms | tok/sec: 2824.08 | norm: 2.48\n", "step4105 | loss: 0.06844627112150192 | dt: 1456.06ms | tok/sec: 2813.06 | norm: 1.97\n", "step4106 
| loss: 0.07033796608448029 | dt: 1455.97ms | tok/sec: 2813.25 | norm: 2.08\n", "step4107 | loss: 0.09382002055644989 | dt: 1453.19ms | tok/sec: 2818.62 | norm: 1.87\n", "step4108 | loss: 0.06868667155504227 | dt: 1459.50ms | tok/sec: 2806.45 | norm: 1.81\n", "step4109 | loss: 0.0725872740149498 | dt: 1460.66ms | tok/sec: 2804.21 | norm: 1.81\n", "step4110 | loss: 0.058137066662311554 | dt: 1452.92ms | tok/sec: 2819.15 | norm: 1.94\n", "step4111 | loss: 0.10591557621955872 | dt: 1447.11ms | tok/sec: 2830.46 | norm: 1.95\n", "step4112 | loss: 0.08241774141788483 | dt: 1451.30ms | tok/sec: 2822.30 | norm: 1.91\n", "step4113 | loss: 0.07866759598255157 | dt: 1455.89ms | tok/sec: 2813.40 | norm: 1.62\n", "step4114 | loss: 0.07394403964281082 | dt: 1461.13ms | tok/sec: 2803.32 | norm: 1.59\n", "step4115 | loss: 0.08506637811660767 | dt: 1459.82ms | tok/sec: 2805.83 | norm: 1.62\n", "step4116 | loss: 0.07622003555297852 | dt: 1462.38ms | tok/sec: 2800.92 | norm: 1.68\n", "step4117 | loss: 0.06528382748365402 | dt: 1459.47ms | tok/sec: 2806.50 | norm: 1.47\n", "step4118 | loss: 0.06730940192937851 | dt: 1444.28ms | tok/sec: 2836.01 | norm: 1.65\n", "step4119 | loss: 0.056938428431749344 | dt: 1462.11ms | tok/sec: 2801.43 | norm: 1.50\n", "step4120 | loss: 0.0636432021856308 | dt: 1458.35ms | tok/sec: 2808.66 | norm: 1.56\n", "step4121 | loss: 0.09686491638422012 | dt: 1452.53ms | tok/sec: 2819.90 | norm: 1.99\n", "step4122 | loss: 0.09218008071184158 | dt: 1453.61ms | tok/sec: 2817.80 | norm: 1.89\n", "step4123 | loss: 0.0969167947769165 | dt: 1455.02ms | tok/sec: 2815.08 | norm: 1.82\n", "step4124 | loss: 0.07856255769729614 | dt: 1447.38ms | tok/sec: 2829.93 | norm: 1.77\n", "step4125 | loss: 0.07492321729660034 | dt: 1455.59ms | tok/sec: 2813.98 | norm: 1.67\n", "step4126 | loss: 0.07402510941028595 | dt: 1452.12ms | tok/sec: 2820.71 | norm: 1.93\n", "step4127 | loss: 0.06934446841478348 | dt: 1461.46ms | tok/sec: 2802.69 | norm: 1.84\n", "step4128 | loss: 
0.06845446676015854 | dt: 1464.01ms | tok/sec: 2797.79 | norm: 1.68\n", "step4129 | loss: 0.07065919786691666 | dt: 1458.40ms | tok/sec: 2808.57 | norm: 1.92\n", "step4130 | loss: 0.0755324736237526 | dt: 1454.25ms | tok/sec: 2816.57 | norm: 1.76\n", "step4131 | loss: 0.09934952855110168 | dt: 1451.91ms | tok/sec: 2821.12 | norm: 2.16\n", "step4132 | loss: 0.0882185772061348 | dt: 1462.49ms | tok/sec: 2800.71 | norm: 2.28\n", "step4133 | loss: 0.08894096314907074 | dt: 1461.55ms | tok/sec: 2802.51 | norm: 2.09\n", "step4134 | loss: 0.06204136461019516 | dt: 1458.35ms | tok/sec: 2808.65 | norm: 1.89\n", "step4135 | loss: 0.07035836577415466 | dt: 1454.90ms | tok/sec: 2815.31 | norm: 1.64\n", "step4136 | loss: 0.12453610450029373 | dt: 1460.06ms | tok/sec: 2805.37 | norm: 1.62\n", "step4137 | loss: 0.061094097793102264 | dt: 1456.30ms | tok/sec: 2812.61 | norm: 1.52\n", "step4138 | loss: 0.06161061301827431 | dt: 1462.09ms | tok/sec: 2801.47 | norm: 1.53\n", "step4139 | loss: 0.062395744025707245 | dt: 1450.72ms | tok/sec: 2823.42 | norm: 1.65\n", "step4140 | loss: 0.06420743465423584 | dt: 1456.28ms | tok/sec: 2812.64 | norm: 1.73\n", "step4141 | loss: 0.1967746615409851 | dt: 1451.43ms | tok/sec: 2822.05 | norm: 3.68\n", "step4142 | loss: 0.13379213213920593 | dt: 1457.82ms | tok/sec: 2809.68 | norm: 2.63\n", "step4143 | loss: 0.116045743227005 | dt: 1450.40ms | tok/sec: 2824.05 | norm: 2.51\n", "step4144 | loss: 0.10788049548864365 | dt: 1455.84ms | tok/sec: 2813.50 | norm: 2.26\n", "step4145 | loss: 0.169019877910614 | dt: 1461.60ms | tok/sec: 2802.40 | norm: 2.94\n", "step4146 | loss: 0.1235920712351799 | dt: 1458.12ms | tok/sec: 2809.10 | norm: 2.60\n", "step4147 | loss: 0.0853448435664177 | dt: 1459.07ms | tok/sec: 2807.26 | norm: 1.95\n", "step4148 | loss: 0.10670021921396255 | dt: 1461.96ms | tok/sec: 2801.71 | norm: 2.53\n", "step4149 | loss: 0.10393806546926498 | dt: 1454.61ms | tok/sec: 2815.87 | norm: 2.06\n", "step4150 | loss: 0.10024803131818771 | dt: 
1462.11ms | tok/sec: 2801.44 | norm: 2.06\n", "step4151 | loss: 0.1378142386674881 | dt: 1452.21ms | tok/sec: 2820.52 | norm: 2.48\n", "step4152 | loss: 0.1274375170469284 | dt: 1458.53ms | tok/sec: 2808.31 | norm: 2.28\n", "step4153 | loss: 0.09117956459522247 | dt: 1449.64ms | tok/sec: 2825.52 | norm: 1.79\n", "step4154 | loss: 0.08182138204574585 | dt: 1459.81ms | tok/sec: 2805.85 | norm: 1.99\n", "step4155 | loss: 0.15915529429912567 | dt: 1455.02ms | tok/sec: 2815.08 | norm: 2.48\n", "step4156 | loss: 0.1103009358048439 | dt: 1448.90ms | tok/sec: 2826.97 | norm: 2.22\n", "step4157 | loss: 0.09048499166965485 | dt: 1453.74ms | tok/sec: 2817.56 | norm: 1.89\n", "step4158 | loss: 0.07643184810876846 | dt: 1450.34ms | tok/sec: 2824.16 | norm: 1.71\n", "step4159 | loss: 0.07990076392889023 | dt: 1449.88ms | tok/sec: 2825.05 | norm: 1.99\n", "step4160 | loss: 0.10824739187955856 | dt: 1452.57ms | tok/sec: 2819.82 | norm: 2.04\n", "step4161 | loss: 0.14261220395565033 | dt: 1443.54ms | tok/sec: 2837.46 | norm: 2.65\n", "step4162 | loss: 0.11855980008840561 | dt: 1453.69ms | tok/sec: 2817.66 | norm: 2.38\n", "step4163 | loss: 0.07373833656311035 | dt: 1444.75ms | tok/sec: 2835.09 | norm: 1.97\n", "step4164 | loss: 0.07172231376171112 | dt: 1453.73ms | tok/sec: 2817.58 | norm: 1.85\n", "step4165 | loss: 0.11153856664896011 | dt: 1451.79ms | tok/sec: 2821.35 | norm: 2.17\n", "step4166 | loss: 0.06630782783031464 | dt: 1449.15ms | tok/sec: 2826.49 | norm: 1.84\n", "step4167 | loss: 0.07319734245538712 | dt: 1442.22ms | tok/sec: 2840.06 | norm: 1.87\n", "step4168 | loss: 0.06547323614358902 | dt: 1443.70ms | tok/sec: 2837.16 | norm: 1.71\n", "step4169 | loss: 0.14031654596328735 | dt: 1449.68ms | tok/sec: 2825.46 | norm: 2.44\n", "step4170 | loss: 0.09099137037992477 | dt: 1444.69ms | tok/sec: 2835.21 | norm: 2.36\n", "step4171 | loss: 0.16575965285301208 | dt: 1442.90ms | tok/sec: 2838.72 | norm: 2.41\n", "step4172 | loss: 0.12801536917686462 | dt: 1452.94ms | tok/sec: 
2819.10 | norm: 2.40\n", "step4173 | loss: 0.1529141366481781 | dt: 1448.44ms | tok/sec: 2827.86 | norm: 2.44\n", "step4174 | loss: 0.1328396499156952 | dt: 1451.05ms | tok/sec: 2822.79 | norm: 2.53\n", "step4175 | loss: 0.11168430000543594 | dt: 1437.13ms | tok/sec: 2850.13 | norm: 2.29\n", "step4176 | loss: 0.07495199888944626 | dt: 1454.17ms | tok/sec: 2816.74 | norm: 1.92\n", "step4177 | loss: 0.08003339171409607 | dt: 1451.83ms | tok/sec: 2821.27 | norm: 2.40\n", "step4178 | loss: 0.08949542790651321 | dt: 1453.69ms | tok/sec: 2817.66 | norm: 2.43\n", "step4179 | loss: 0.07930354028940201 | dt: 1455.89ms | tok/sec: 2813.41 | norm: 1.74\n", "step4180 | loss: 0.10922586172819138 | dt: 1451.39ms | tok/sec: 2822.13 | norm: 2.12\n", "step4181 | loss: 0.12198714911937714 | dt: 1456.03ms | tok/sec: 2813.14 | norm: 2.03\n", "step4182 | loss: 0.10065490752458572 | dt: 1453.68ms | tok/sec: 2817.67 | norm: 2.33\n", "step4183 | loss: 0.0696820542216301 | dt: 1452.81ms | tok/sec: 2819.37 | norm: 1.74\n", "step4184 | loss: 0.08317656815052032 | dt: 1460.72ms | tok/sec: 2804.09 | norm: 1.93\n", "step4185 | loss: 0.08453568816184998 | dt: 1449.33ms | tok/sec: 2826.13 | norm: 2.15\n", "step4186 | loss: 0.06828727573156357 | dt: 1460.63ms | tok/sec: 2804.27 | norm: 1.66\n", "step4187 | loss: 0.061089739203453064 | dt: 1451.41ms | tok/sec: 2822.09 | norm: 1.85\n", "step4188 | loss: 0.0665360614657402 | dt: 1453.24ms | tok/sec: 2818.53 | norm: 2.02\n", "step4189 | loss: 0.07549335062503815 | dt: 1449.04ms | tok/sec: 2826.69 | norm: 1.74\n", "step4190 | loss: 0.058799147605895996 | dt: 1451.94ms | tok/sec: 2821.05 | norm: 1.65\n", "step4191 | loss: 0.05618942901492119 | dt: 1439.11ms | tok/sec: 2846.21 | norm: 1.65\n", "step4192 | loss: 0.05646864324808121 | dt: 1457.38ms | tok/sec: 2810.52 | norm: 1.87\n", "step4193 | loss: 0.10317900776863098 | dt: 1448.66ms | tok/sec: 2827.44 | norm: 2.24\n", "step4194 | loss: 0.07506705075502396 | dt: 1457.42ms | tok/sec: 2810.44 | norm: 
1.79\n", "step4195 | loss: 0.07267577201128006 | dt: 1451.28ms | tok/sec: 2822.33 | norm: 2.11\n", "step4196 | loss: 0.06446552276611328 | dt: 1453.95ms | tok/sec: 2817.16 | norm: 1.74\n", "step4197 | loss: 0.08164183050394058 | dt: 1452.52ms | tok/sec: 2819.93 | norm: 1.91\n", "step4198 | loss: 0.07368307560682297 | dt: 1449.52ms | tok/sec: 2825.77 | norm: 1.82\n", "step4199 | loss: 0.0707310289144516 | dt: 1455.46ms | tok/sec: 2814.22 | norm: 2.10\n", "step4200 | loss: 0.06685356050729752 | dt: 1451.02ms | tok/sec: 2822.85 | norm: 1.96\n", "step4201 | loss: 0.05422907695174217 | dt: 1453.50ms | tok/sec: 2818.02 | norm: 1.62\n", "step4202 | loss: 0.06293331831693649 | dt: 1456.96ms | tok/sec: 2811.33 | norm: 2.02\n", "step4203 | loss: 0.08279475569725037 | dt: 1454.51ms | tok/sec: 2816.08 | norm: 2.05\n", "step4204 | loss: 0.08479180186986923 | dt: 1461.89ms | tok/sec: 2801.85 | norm: 2.20\n", "step4205 | loss: 0.09634831547737122 | dt: 1449.44ms | tok/sec: 2825.92 | norm: 2.00\n", "step4206 | loss: 0.0732811689376831 | dt: 1450.80ms | tok/sec: 2823.27 | norm: 2.18\n", "step4207 | loss: 0.06434593349695206 | dt: 1468.79ms | tok/sec: 2788.69 | norm: 1.71\n", "step4208 | loss: 0.06300438195466995 | dt: 1460.15ms | tok/sec: 2805.19 | norm: 1.91\n", "step4209 | loss: 0.06219456344842911 | dt: 1452.41ms | tok/sec: 2820.13 | norm: 1.81\n", "step4210 | loss: 0.06219646707177162 | dt: 1460.59ms | tok/sec: 2804.35 | norm: 1.49\n", "step4211 | loss: 0.07188727706670761 | dt: 1455.59ms | tok/sec: 2813.98 | norm: 2.02\n", "step4212 | loss: 0.05792386084794998 | dt: 1462.35ms | tok/sec: 2800.97 | norm: 1.47\n", "step4213 | loss: 0.09951961785554886 | dt: 1458.25ms | tok/sec: 2808.85 | norm: 2.19\n", "step4214 | loss: 0.08726444840431213 | dt: 1461.01ms | tok/sec: 2803.55 | norm: 2.02\n", "step4215 | loss: 0.08316897600889206 | dt: 1465.58ms | tok/sec: 2794.79 | norm: 2.03\n", "step4216 | loss: 0.06005455553531647 | dt: 1453.13ms | tok/sec: 2818.74 | norm: 1.71\n", "step4217 | 
loss: 0.05828113481402397 | dt: 1455.59ms | tok/sec: 2813.98 | norm: 1.59\n", "step4218 | loss: 0.08588185161352158 | dt: 1463.15ms | tok/sec: 2799.43 | norm: 1.72\n", "step4219 | loss: 0.05318928509950638 | dt: 1453.64ms | tok/sec: 2817.75 | norm: 1.46\n", "step4220 | loss: 0.052304599434137344 | dt: 1465.01ms | tok/sec: 2795.89 | norm: 1.53\n", "step4221 | loss: 0.05616983771324158 | dt: 1459.90ms | tok/sec: 2805.66 | norm: 1.77\n", "step4222 | loss: 0.05986002832651138 | dt: 1457.26ms | tok/sec: 2810.76 | norm: 2.05\n", "step4223 | loss: 0.18288837373256683 | dt: 1459.85ms | tok/sec: 2805.76 | norm: 3.19\n", "step4224 | loss: 0.11103866249322891 | dt: 1455.10ms | tok/sec: 2814.93 | norm: 2.40\n", "step4225 | loss: 0.10976030677556992 | dt: 1445.22ms | tok/sec: 2834.17 | norm: 2.55\n", "step4226 | loss: 0.103554867208004 | dt: 1457.23ms | tok/sec: 2810.81 | norm: 2.34\n", "step4227 | loss: 0.1493854969739914 | dt: 1454.76ms | tok/sec: 2815.58 | norm: 2.34\n", "step4228 | loss: 0.11078673601150513 | dt: 1461.82ms | tok/sec: 2801.98 | norm: 2.16\n", "step4229 | loss: 0.07140488922595978 | dt: 1447.72ms | tok/sec: 2829.27 | norm: 1.91\n", "step4230 | loss: 0.09291791915893555 | dt: 1467.46ms | tok/sec: 2791.21 | norm: 2.06\n", "step4231 | loss: 0.09330016374588013 | dt: 1456.33ms | tok/sec: 2812.54 | norm: 2.20\n", "step4232 | loss: 0.08875041455030441 | dt: 1460.99ms | tok/sec: 2803.58 | norm: 1.92\n", "step4233 | loss: 0.12977945804595947 | dt: 1449.32ms | tok/sec: 2826.15 | norm: 2.65\n", "step4234 | loss: 0.11499114334583282 | dt: 1465.44ms | tok/sec: 2795.06 | norm: 2.82\n", "step4235 | loss: 0.08079370856285095 | dt: 1460.85ms | tok/sec: 2803.85 | norm: 2.36\n", "step4236 | loss: 0.07490114867687225 | dt: 1448.04ms | tok/sec: 2828.65 | norm: 1.87\n", "step4237 | loss: 0.1526380181312561 | dt: 1462.30ms | tok/sec: 2801.07 | norm: 3.04\n", "step4238 | loss: 0.10577581822872162 | dt: 1455.55ms | tok/sec: 2814.05 | norm: 2.23\n", "step4239 | loss: 
0.08522597700357437 | dt: 1453.43ms | tok/sec: 2818.15 | norm: 1.99\n", "step4240 | loss: 0.07407417893409729 | dt: 1454.70ms | tok/sec: 2815.69 | norm: 1.84\n", "step4241 | loss: 0.07269252091646194 | dt: 1452.46ms | tok/sec: 2820.05 | norm: 1.92\n", "step4242 | loss: 0.09304594248533249 | dt: 1460.02ms | tok/sec: 2805.44 | norm: 1.83\n", "step4243 | loss: 0.1288386434316635 | dt: 1457.94ms | tok/sec: 2809.45 | norm: 2.50\n", "step4244 | loss: 0.0986502543091774 | dt: 1460.53ms | tok/sec: 2804.46 | norm: 2.17\n", "step4245 | loss: 0.0602654367685318 | dt: 1450.60ms | tok/sec: 2823.65 | norm: 1.67\n", "step4246 | loss: 0.06091173365712166 | dt: 1450.02ms | tok/sec: 2824.79 | norm: 1.73\n", "step4247 | loss: 0.08312534540891647 | dt: 1460.86ms | tok/sec: 2803.83 | norm: 1.80\n", "step4248 | loss: 0.04830111563205719 | dt: 1451.50ms | tok/sec: 2821.91 | norm: 1.29\n", "step4249 | loss: 0.06333421915769577 | dt: 1445.25ms | tok/sec: 2834.11 | norm: 1.65\n", "step4250 | loss: 0.05449371784925461 | dt: 1460.16ms | tok/sec: 2805.17 | norm: 1.59\n", "step4251 | loss: 0.12460974603891373 | dt: 1449.48ms | tok/sec: 2825.84 | norm: 2.40\n", "step4252 | loss: 0.07525846362113953 | dt: 1461.60ms | tok/sec: 2802.40 | norm: 1.76\n", "step4253 | loss: 0.15539781749248505 | dt: 1441.59ms | tok/sec: 2841.31 | norm: 2.81\n", "step4254 | loss: 0.1083129271864891 | dt: 1454.93ms | tok/sec: 2815.26 | norm: 2.37\n", "step4255 | loss: 0.135185107588768 | dt: 1457.03ms | tok/sec: 2811.19 | norm: 2.53\n", "step4256 | loss: 0.12099539488554001 | dt: 1461.08ms | tok/sec: 2803.41 | norm: 2.34\n", "step4257 | loss: 0.11566293239593506 | dt: 1453.24ms | tok/sec: 2818.53 | norm: 2.39\n", "step4258 | loss: 0.07691118866205215 | dt: 1455.61ms | tok/sec: 2813.94 | norm: 1.96\n", "step4259 | loss: 0.07235126942396164 | dt: 1452.70ms | tok/sec: 2819.57 | norm: 1.66\n", "step4260 | loss: 0.0850110724568367 | dt: 1451.86ms | tok/sec: 2821.21 | norm: 2.02\n", "step4261 | loss: 0.08192448318004608 | dt: 
1447.74ms | tok/sec: 2829.23 | norm: 2.54\n", "step4262 | loss: 0.09203775227069855 | dt: 1460.12ms | tok/sec: 2805.24 | norm: 2.05\n", "step4263 | loss: 0.09575662016868591 | dt: 1445.95ms | tok/sec: 2832.74 | norm: 1.84\n", "step4264 | loss: 0.07981763035058975 | dt: 1460.07ms | tok/sec: 2805.35 | norm: 1.79\n", "step4265 | loss: 0.09219453483819962 | dt: 1455.06ms | tok/sec: 2815.00 | norm: 2.31\n", "step4266 | loss: 0.07283502072095871 | dt: 1455.98ms | tok/sec: 2813.22 | norm: 2.09\n", "step4267 | loss: 0.09320540726184845 | dt: 1455.06ms | tok/sec: 2815.01 | norm: 2.21\n", "step4268 | loss: 0.08335421979427338 | dt: 1454.90ms | tok/sec: 2815.32 | norm: 2.07\n", "step4269 | loss: 0.08023025095462799 | dt: 1454.57ms | tok/sec: 2815.95 | norm: 2.11\n", "step4270 | loss: 0.060845889151096344 | dt: 1454.11ms | tok/sec: 2816.84 | norm: 1.74\n", "step4271 | loss: 0.060705240815877914 | dt: 1456.91ms | tok/sec: 2811.42 | norm: 1.42\n", "step4272 | loss: 0.05950722470879555 | dt: 1449.14ms | tok/sec: 2826.51 | norm: 1.78\n", "step4273 | loss: 0.051514558494091034 | dt: 1453.45ms | tok/sec: 2818.13 | norm: 1.65\n", "step4274 | loss: 0.0471523180603981 | dt: 1456.72ms | tok/sec: 2811.79 | norm: 1.43\n", "step4275 | loss: 0.0895826518535614 | dt: 1453.88ms | tok/sec: 2817.30 | norm: 2.08\n", "step4276 | loss: 0.06271647661924362 | dt: 1453.47ms | tok/sec: 2818.09 | norm: 1.61\n", "step4277 | loss: 0.06833433359861374 | dt: 1455.12ms | tok/sec: 2814.90 | norm: 1.80\n", "step4278 | loss: 0.07443875074386597 | dt: 1451.27ms | tok/sec: 2822.35 | norm: 2.15\n", "step4279 | loss: 0.0778249129652977 | dt: 1454.16ms | tok/sec: 2816.74 | norm: 1.83\n", "step4280 | loss: 0.07982942461967468 | dt: 1454.06ms | tok/sec: 2816.94 | norm: 1.93\n", "step4281 | loss: 0.0723605677485466 | dt: 1454.87ms | tok/sec: 2815.36 | norm: 1.70\n", "step4282 | loss: 0.05396634712815285 | dt: 1441.71ms | tok/sec: 2841.07 | norm: 1.57\n", "step4283 | loss: 0.04753027483820915 | dt: 1451.75ms | tok/sec: 
2821.43 | norm: 1.54\n", "step4284 | loss: 0.053524989634752274 | dt: 1464.84ms | tok/sec: 2796.21 | norm: 1.51\n", "step4285 | loss: 0.07383294403553009 | dt: 1455.34ms | tok/sec: 2814.47 | norm: 2.05\n", "step4286 | loss: 0.07364757359027863 | dt: 1456.77ms | tok/sec: 2811.71 | norm: 1.84\n", "step4287 | loss: 0.08133678883314133 | dt: 1453.13ms | tok/sec: 2818.75 | norm: 1.91\n", "step4288 | loss: 0.0638018473982811 | dt: 1454.03ms | tok/sec: 2816.99 | norm: 1.73\n", "step4289 | loss: 0.055399082601070404 | dt: 1453.20ms | tok/sec: 2818.61 | norm: 1.41\n", "step4290 | loss: 0.05470752343535423 | dt: 1454.88ms | tok/sec: 2815.34 | norm: 1.41\n", "step4291 | loss: 0.05224087834358215 | dt: 1451.00ms | tok/sec: 2822.87 | norm: 1.81\n", "step4292 | loss: 0.0513429120182991 | dt: 1453.90ms | tok/sec: 2817.26 | norm: 1.63\n", "step4293 | loss: 0.05339616909623146 | dt: 1452.98ms | tok/sec: 2819.04 | norm: 1.51\n", "step4294 | loss: 0.056238919496536255 | dt: 1456.00ms | tok/sec: 2813.18 | norm: 1.84\n", "step4295 | loss: 0.07856359332799911 | dt: 1451.69ms | tok/sec: 2821.54 | norm: 2.07\n", "step4296 | loss: 0.07566826045513153 | dt: 1451.94ms | tok/sec: 2821.06 | norm: 2.19\n", "step4297 | loss: 0.07180006802082062 | dt: 1456.32ms | tok/sec: 2812.58 | norm: 1.87\n", "step4298 | loss: 0.05091506615281105 | dt: 1454.66ms | tok/sec: 2815.78 | norm: 1.56\n", "step4299 | loss: 0.04920787364244461 | dt: 1453.65ms | tok/sec: 2817.73 | norm: 1.53\n", "step4300 | loss: 0.07125651091337204 | dt: 1457.38ms | tok/sec: 2810.52 | norm: 1.78\n", "step4301 | loss: 0.04728561267256737 | dt: 1446.23ms | tok/sec: 2832.19 | norm: 1.49\n", "step4302 | loss: 0.04580923542380333 | dt: 1453.61ms | tok/sec: 2817.81 | norm: 1.50\n", "step4303 | loss: 0.04932228475809097 | dt: 1453.05ms | tok/sec: 2818.90 | norm: 1.46\n", "step4304 | loss: 0.04971761256456375 | dt: 1452.15ms | tok/sec: 2820.65 | norm: 1.48\n", "step4305 | loss: 0.18387971818447113 | dt: 1453.25ms | tok/sec: 2818.51 | norm: 
3.07\n", "step4306 | loss: 0.10536057502031326 | dt: 1455.91ms | tok/sec: 2813.37 | norm: 2.13\n", "step4307 | loss: 0.09489303827285767 | dt: 1449.68ms | tok/sec: 2825.45 | norm: 2.20\n", "step4308 | loss: 0.09084349870681763 | dt: 1445.46ms | tok/sec: 2833.70 | norm: 2.19\n", "step4309 | loss: 0.12084602564573288 | dt: 1449.90ms | tok/sec: 2825.03 | norm: 2.38\n", "step4310 | loss: 0.09626598656177521 | dt: 1452.17ms | tok/sec: 2820.60 | norm: 2.37\n", "step4311 | loss: 0.06852269917726517 | dt: 1457.78ms | tok/sec: 2809.76 | norm: 2.16\n", "step4312 | loss: 0.07837947458028793 | dt: 1451.09ms | tok/sec: 2822.70 | norm: 1.91\n", "step4313 | loss: 0.0777779072523117 | dt: 1449.20ms | tok/sec: 2826.38 | norm: 1.85\n", "step4314 | loss: 0.07870031148195267 | dt: 1452.16ms | tok/sec: 2820.63 | norm: 2.04\n", "step4315 | loss: 0.10628011077642441 | dt: 1439.93ms | tok/sec: 2844.58 | norm: 2.13\n", "step4316 | loss: 0.08417192846536636 | dt: 1448.37ms | tok/sec: 2828.01 | norm: 1.86\n", "step4317 | loss: 0.07113468647003174 | dt: 1460.82ms | tok/sec: 2803.90 | norm: 2.06\n", "step4318 | loss: 0.0644662082195282 | dt: 1449.42ms | tok/sec: 2825.96 | norm: 1.92\n", "step4319 | loss: 0.11763934046030045 | dt: 1452.58ms | tok/sec: 2819.81 | norm: 2.37\n", "step4320 | loss: 0.0855390727519989 | dt: 1453.06ms | tok/sec: 2818.88 | norm: 2.06\n", "step4321 | loss: 0.063722163438797 | dt: 1445.56ms | tok/sec: 2833.50 | norm: 1.74\n", "step4322 | loss: 0.06513939797878265 | dt: 1451.77ms | tok/sec: 2821.39 | norm: 2.10\n", "step4323 | loss: 0.06786910444498062 | dt: 1453.21ms | tok/sec: 2818.59 | norm: 2.00\n", "step4324 | loss: 0.07609588652849197 | dt: 1451.65ms | tok/sec: 2821.61 | norm: 1.94\n", "step4325 | loss: 0.10881923139095306 | dt: 1453.31ms | tok/sec: 2818.40 | norm: 2.49\n", "step4326 | loss: 0.08775594830513 | dt: 1450.09ms | tok/sec: 2824.65 | norm: 2.19\n", "step4327 | loss: 0.056955188512802124 | dt: 1447.78ms | tok/sec: 2829.15 | norm: 1.82\n", "step4328 | loss: 
0.05572992563247681 | dt: 1446.68ms | tok/sec: 2831.32 | norm: 1.76\n", "step4329 | loss: 0.0761575996875763 | dt: 1453.34ms | tok/sec: 2818.34 | norm: 1.89\n", "step4330 | loss: 0.05055965855717659 | dt: 1448.13ms | tok/sec: 2828.47 | norm: 2.04\n", "step4331 | loss: 0.05231352150440216 | dt: 1453.15ms | tok/sec: 2818.70 | norm: 1.72\n", "step4332 | loss: 0.04908185452222824 | dt: 1441.53ms | tok/sec: 2841.43 | norm: 1.74\n", "step4333 | loss: 0.1053609699010849 | dt: 1448.87ms | tok/sec: 2827.04 | norm: 2.15\n", "step4334 | loss: 0.06370297074317932 | dt: 1449.29ms | tok/sec: 2826.22 | norm: 1.65\n", "step4335 | loss: 0.13547739386558533 | dt: 1444.48ms | tok/sec: 2835.62 | norm: 2.47\n", "step4336 | loss: 0.0892491415143013 | dt: 1454.14ms | tok/sec: 2816.78 | norm: 2.00\n", "step4337 | loss: 0.1166120171546936 | dt: 1446.07ms | tok/sec: 2832.51 | norm: 2.24\n", "step4338 | loss: 0.09145878255367279 | dt: 1453.34ms | tok/sec: 2818.34 | norm: 1.73\n", "step4339 | loss: 0.0877763107419014 | dt: 1458.06ms | tok/sec: 2809.20 | norm: 2.11\n", "step4340 | loss: 0.05618469789624214 | dt: 1454.52ms | tok/sec: 2816.05 | norm: 1.48\n", "step4341 | loss: 0.05420764908194542 | dt: 1448.26ms | tok/sec: 2828.22 | norm: 1.51\n", "step4342 | loss: 0.07589629292488098 | dt: 1450.16ms | tok/sec: 2824.51 | norm: 2.15\n", "step4343 | loss: 0.07421902567148209 | dt: 1454.43ms | tok/sec: 2816.23 | norm: 1.88\n", "step4344 | loss: 0.0779832974076271 | dt: 1454.45ms | tok/sec: 2816.18 | norm: 1.97\n", "step4345 | loss: 0.09252937883138657 | dt: 1451.02ms | tok/sec: 2822.84 | norm: 2.60\n", "step4346 | loss: 0.07094591856002808 | dt: 1451.72ms | tok/sec: 2821.47 | norm: 2.09\n", "step4347 | loss: 0.06734476238489151 | dt: 1452.88ms | tok/sec: 2819.23 | norm: 2.34\n", "step4348 | loss: 0.06026617810130119 | dt: 1444.55ms | tok/sec: 2835.48 | norm: 1.94\n", "step4349 | loss: 0.08113560080528259 | dt: 1448.36ms | tok/sec: 2828.03 | norm: 2.06\n", "step4350 | loss: 0.056307338178157806 | 
dt: 1448.31ms | tok/sec: 2828.12 | norm: 1.98\n", "step4351 | loss: 0.04572099447250366 | dt: 1454.59ms | tok/sec: 2815.91 | norm: 1.55\n", "step4352 | loss: 0.050931163132190704 | dt: 1452.40ms | tok/sec: 2820.17 | norm: 1.64\n", "step4353 | loss: 0.055849578231573105 | dt: 1452.44ms | tok/sec: 2820.08 | norm: 1.69\n", "step4354 | loss: 0.047688450664281845 | dt: 1452.32ms | tok/sec: 2820.32 | norm: 1.71\n", "step4355 | loss: 0.055038124322891235 | dt: 1451.25ms | tok/sec: 2822.40 | norm: 1.83\n", "step4356 | loss: 0.04221682250499725 | dt: 1449.32ms | tok/sec: 2826.16 | norm: 1.64\n", "step4357 | loss: 0.10249090939760208 | dt: 1451.91ms | tok/sec: 2821.11 | norm: 2.14\n", "step4358 | loss: 0.0739872083067894 | dt: 1446.00ms | tok/sec: 2832.65 | norm: 2.29\n", "step4359 | loss: 0.058194175362586975 | dt: 1456.93ms | tok/sec: 2811.40 | norm: 1.42\n", "step4360 | loss: 0.07186951488256454 | dt: 1451.74ms | tok/sec: 2821.45 | norm: 1.77\n", "step4361 | loss: 0.062143176794052124 | dt: 1449.89ms | tok/sec: 2825.05 | norm: 1.67\n", "step4362 | loss: 0.06991352140903473 | dt: 1453.12ms | tok/sec: 2818.76 | norm: 1.88\n", "step4363 | loss: 0.07027111202478409 | dt: 1441.88ms | tok/sec: 2840.74 | norm: 1.94\n", "step4364 | loss: 0.06241343542933464 | dt: 1435.87ms | tok/sec: 2852.63 | norm: 1.76\n", "step4365 | loss: 0.05292472243309021 | dt: 1451.54ms | tok/sec: 2821.83 | norm: 1.85\n", "step4366 | loss: 0.06856188178062439 | dt: 1451.96ms | tok/sec: 2821.01 | norm: 1.68\n", "step4367 | loss: 0.06461524963378906 | dt: 1443.85ms | tok/sec: 2836.87 | norm: 1.99\n", "step4368 | loss: 0.05667226389050484 | dt: 1451.97ms | tok/sec: 2820.99 | norm: 1.49\n", "step4369 | loss: 0.06549825519323349 | dt: 1448.61ms | tok/sec: 2827.53 | norm: 1.74\n", "step4370 | loss: 0.049735572189092636 | dt: 1452.47ms | tok/sec: 2820.02 | norm: 1.64\n", "step4371 | loss: 0.052681829780340195 | dt: 1454.78ms | tok/sec: 2815.54 | norm: 1.70\n", "step4372 | loss: 0.04204067587852478 | dt: 
1452.93ms | tok/sec: 2819.12 | norm: 1.35\n", "step4373 | loss: 0.053316857665777206 | dt: 1453.74ms | tok/sec: 2817.57 | norm: 1.72\n", "step4374 | loss: 0.04694031924009323 | dt: 1450.65ms | tok/sec: 2823.55 | norm: 1.64\n", "step4375 | loss: 0.04913925379514694 | dt: 1453.84ms | tok/sec: 2817.36 | norm: 1.69\n", "step4376 | loss: 0.049163468182086945 | dt: 1442.86ms | tok/sec: 2838.80 | norm: 1.63\n", "step4377 | loss: 0.07233510166406631 | dt: 1445.28ms | tok/sec: 2834.05 | norm: 1.96\n", "step4378 | loss: 0.06926824897527695 | dt: 1442.83ms | tok/sec: 2838.87 | norm: 1.95\n", "step4379 | loss: 0.0585324726998806 | dt: 1455.79ms | tok/sec: 2813.60 | norm: 1.61\n", "step4380 | loss: 0.043262921273708344 | dt: 1447.75ms | tok/sec: 2829.21 | norm: 1.37\n", "step4381 | loss: 0.047326020896434784 | dt: 1451.33ms | tok/sec: 2822.23 | norm: 1.47\n", "step4382 | loss: 0.05618015304207802 | dt: 1460.54ms | tok/sec: 2804.44 | norm: 2.19\n", "step4383 | loss: 0.04158719629049301 | dt: 1447.19ms | tok/sec: 2830.32 | norm: 1.54\n", "step4384 | loss: 0.04201153293251991 | dt: 1461.18ms | tok/sec: 2803.21 | norm: 1.41\n", "step4385 | loss: 0.04003426060080528 | dt: 1454.72ms | tok/sec: 2815.65 | norm: 1.35\n", "step4386 | loss: 0.05361297354102135 | dt: 1448.11ms | tok/sec: 2828.51 | norm: 1.80\n", "step4387 | loss: 0.1318417340517044 | dt: 1452.02ms | tok/sec: 2820.90 | norm: 2.19\n", "step4388 | loss: 0.08121446520090103 | dt: 1452.99ms | tok/sec: 2819.02 | norm: 2.01\n", "step4389 | loss: 0.08146090805530548 | dt: 1456.12ms | tok/sec: 2812.95 | norm: 1.85\n", "step4390 | loss: 0.07534079998731613 | dt: 1459.06ms | tok/sec: 2807.29 | norm: 1.88\n", "step4391 | loss: 0.10639794170856476 | dt: 1446.37ms | tok/sec: 2831.93 | norm: 2.08\n", "step4392 | loss: 0.07510692626237869 | dt: 1452.82ms | tok/sec: 2819.35 | norm: 1.40\n", "step4393 | loss: 0.05219132453203201 | dt: 1450.83ms | tok/sec: 2823.21 | norm: 1.46\n", "step4394 | loss: 0.06219401955604553 | dt: 1449.12ms | 
tok/sec: 2826.54 | norm: 1.63\n", "step4395 | loss: 0.05913301929831505 | dt: 1454.45ms | tok/sec: 2816.19 | norm: 1.70\n", "step4396 | loss: 0.06504197418689728 | dt: 1448.10ms | tok/sec: 2828.54 | norm: 2.23\n", "step4397 | loss: 0.07714810222387314 | dt: 1443.63ms | tok/sec: 2837.29 | norm: 1.81\n", "step4398 | loss: 0.06496737897396088 | dt: 1459.81ms | tok/sec: 2805.85 | norm: 1.69\n", "step4399 | loss: 0.06408850103616714 | dt: 1448.90ms | tok/sec: 2826.98 | norm: 1.67\n", "step4400 | loss: 0.064443439245224 | dt: 1448.24ms | tok/sec: 2828.27 | norm: 1.75\n", "step4401 | loss: 0.10198959708213806 | dt: 1453.24ms | tok/sec: 2818.53 | norm: 2.23\n", "step4402 | loss: 0.07500837743282318 | dt: 1456.16ms | tok/sec: 2812.87 | norm: 1.82\n", "step4403 | loss: 0.06559739261865616 | dt: 1450.98ms | tok/sec: 2822.93 | norm: 2.01\n", "step4404 | loss: 0.05723525211215019 | dt: 1454.97ms | tok/sec: 2815.18 | norm: 1.94\n", "step4405 | loss: 0.06825635582208633 | dt: 1450.67ms | tok/sec: 2823.53 | norm: 2.04\n", "step4406 | loss: 0.06765617430210114 | dt: 1459.31ms | tok/sec: 2806.81 | norm: 1.78\n", "step4407 | loss: 0.08272633701562881 | dt: 1454.27ms | tok/sec: 2816.52 | norm: 1.72\n", "step4408 | loss: 0.07262322306632996 | dt: 1446.77ms | tok/sec: 2831.13 | norm: 1.98\n", "step4409 | loss: 0.04771478474140167 | dt: 1453.16ms | tok/sec: 2818.69 | norm: 1.72\n", "step4410 | loss: 0.04967570677399635 | dt: 1462.04ms | tok/sec: 2801.57 | norm: 1.63\n", "step4411 | loss: 0.06521154195070267 | dt: 1463.70ms | tok/sec: 2798.38 | norm: 1.94\n", "step4412 | loss: 0.046218790113925934 | dt: 1457.82ms | tok/sec: 2809.68 | norm: 1.77\n", "step4413 | loss: 0.0541108101606369 | dt: 1454.60ms | tok/sec: 2815.90 | norm: 1.79\n", "step4414 | loss: 0.04758182540535927 | dt: 1456.16ms | tok/sec: 2812.87 | norm: 1.79\n", "step4415 | loss: 0.08233854919672012 | dt: 1455.11ms | tok/sec: 2814.92 | norm: 1.74\n", "step4416 | loss: 0.059295982122421265 | dt: 1464.14ms | tok/sec: 2797.55 | 
norm: 1.72\n", "step4417 | loss: 0.10004977881908417 | dt: 1463.95ms | tok/sec: 2797.91 | norm: 2.08\n", "step4418 | loss: 0.06791738420724869 | dt: 1449.70ms | tok/sec: 2825.41 | norm: 1.58\n", "step4419 | loss: 0.09363800287246704 | dt: 1454.72ms | tok/sec: 2815.66 | norm: 1.81\n", "step4420 | loss: 0.07457303255796432 | dt: 1449.98ms | tok/sec: 2824.87 | norm: 1.95\n", "step4421 | loss: 0.07394647598266602 | dt: 1449.70ms | tok/sec: 2825.41 | norm: 1.77\n", "step4422 | loss: 0.05872194096446037 | dt: 1457.13ms | tok/sec: 2811.00 | norm: 2.28\n", "step4423 | loss: 0.07053133845329285 | dt: 1459.80ms | tok/sec: 2805.87 | norm: 2.20\n", "step4424 | loss: 0.060772854834795 | dt: 1464.66ms | tok/sec: 2796.56 | norm: 1.74\n", "step4425 | loss: 0.05591066554188728 | dt: 1458.84ms | tok/sec: 2807.71 | norm: 1.48\n", "step4426 | loss: 0.06413722783327103 | dt: 1468.76ms | tok/sec: 2788.74 | norm: 1.74\n", "step4427 | loss: 0.07217150181531906 | dt: 1473.71ms | tok/sec: 2779.38 | norm: 1.82\n", "step4428 | loss: 0.0590781606733799 | dt: 1463.11ms | tok/sec: 2799.52 | norm: 1.72\n", "step4429 | loss: 0.06414702534675598 | dt: 1464.53ms | tok/sec: 2796.81 | norm: 2.11\n", "step4430 | loss: 0.06342889368534088 | dt: 1459.66ms | tok/sec: 2806.13 | norm: 2.17\n", "step4431 | loss: 0.0645698830485344 | dt: 1460.29ms | tok/sec: 2804.93 | norm: 2.29\n", "step4432 | loss: 0.04431110620498657 | dt: 1456.37ms | tok/sec: 2812.48 | norm: 1.53\n", "step4433 | loss: 0.043027088046073914 | dt: 1453.88ms | tok/sec: 2817.28 | norm: 1.72\n", "step4434 | loss: 0.044744376093149185 | dt: 1450.55ms | tok/sec: 2823.76 | norm: 1.54\n", "step4435 | loss: 0.050431761890649796 | dt: 1459.12ms | tok/sec: 2807.16 | norm: 1.48\n", "step4436 | loss: 0.04544740170240402 | dt: 1458.27ms | tok/sec: 2808.82 | norm: 1.37\n", "step4437 | loss: 0.051178522408008575 | dt: 1453.16ms | tok/sec: 2818.68 | norm: 1.68\n", "step4438 | loss: 0.045789070427417755 | dt: 1466.43ms | tok/sec: 2793.17 | norm: 1.76\n", 
"step4439 | loss: 0.08396630734205246 | dt: 1456.23ms | tok/sec: 2812.74 | norm: 1.98\n", "step4440 | loss: 0.05469736084342003 | dt: 1453.46ms | tok/sec: 2818.11 | norm: 1.69\n", "step4441 | loss: 0.04407348856329918 | dt: 1459.66ms | tok/sec: 2806.14 | norm: 1.38\n", "step4442 | loss: 0.04957149177789688 | dt: 1457.04ms | tok/sec: 2811.19 | norm: 1.41\n", "step4443 | loss: 0.05975184217095375 | dt: 1455.06ms | tok/sec: 2814.99 | norm: 1.46\n", "step4444 | loss: 0.05772475153207779 | dt: 1453.29ms | tok/sec: 2818.43 | norm: 1.53\n", "step4445 | loss: 0.047357793897390366 | dt: 1448.25ms | tok/sec: 2828.24 | norm: 1.50\n", "step4446 | loss: 0.0427536778151989 | dt: 1455.72ms | tok/sec: 2813.73 | norm: 1.25\n", "step4447 | loss: 0.047575853765010834 | dt: 1447.45ms | tok/sec: 2829.81 | norm: 1.50\n", "step4448 | loss: 0.04855827987194061 | dt: 1448.75ms | tok/sec: 2827.26 | norm: 1.78\n", "step4449 | loss: 0.05877631530165672 | dt: 1445.35ms | tok/sec: 2833.93 | norm: 1.63\n", "step4450 | loss: 0.052679166197776794 | dt: 1451.31ms | tok/sec: 2822.28 | norm: 1.66\n", "step4451 | loss: 0.058224134147167206 | dt: 1457.78ms | tok/sec: 2809.75 | norm: 1.76\n", "step4452 | loss: 0.048185739666223526 | dt: 1449.67ms | tok/sec: 2825.48 | norm: 1.60\n", "step4453 | loss: 0.048189010471105576 | dt: 1450.70ms | tok/sec: 2823.46 | norm: 1.64\n", "step4454 | loss: 0.04374758154153824 | dt: 1453.01ms | tok/sec: 2818.97 | norm: 1.55\n", "step4455 | loss: 0.05005211383104324 | dt: 1453.04ms | tok/sec: 2818.92 | norm: 1.96\n", "step4456 | loss: 0.04562044516205788 | dt: 1452.81ms | tok/sec: 2819.37 | norm: 1.58\n", "step4457 | loss: 0.044864967465400696 | dt: 1448.59ms | tok/sec: 2827.58 | norm: 1.51\n", "step4458 | loss: 0.04846872016787529 | dt: 1450.73ms | tok/sec: 2823.41 | norm: 1.59\n", "step4459 | loss: 0.05805091932415962 | dt: 1448.84ms | tok/sec: 2827.09 | norm: 1.95\n", "step4460 | loss: 0.06337270140647888 | dt: 1453.38ms | tok/sec: 2818.27 | norm: 2.19\n", "step4461 | 
loss: 0.053695693612098694 | dt: 1440.72ms | tok/sec: 2843.02 | norm: 1.68\n", "step4462 | loss: 0.04013886675238609 | dt: 1448.39ms | tok/sec: 2827.97 | norm: 1.55\n", "step4463 | loss: 0.04796314239501953 | dt: 1437.99ms | tok/sec: 2848.41 | norm: 1.77\n", "step4464 | loss: 0.051932577043771744 | dt: 1453.08ms | tok/sec: 2818.84 | norm: 1.77\n", "step4465 | loss: 0.03931073844432831 | dt: 1447.40ms | tok/sec: 2829.91 | norm: 1.31\n", "step4466 | loss: 0.04415067285299301 | dt: 1450.07ms | tok/sec: 2824.70 | norm: 1.71\n", "step4467 | loss: 0.03887380659580231 | dt: 1450.14ms | tok/sec: 2824.56 | norm: 1.36\n", "step4468 | loss: 0.03874949738383293 | dt: 1447.60ms | tok/sec: 2829.51 | norm: 1.41\n", "step4469 | loss: 0.0876961201429367 | dt: 1441.05ms | tok/sec: 2842.38 | norm: 1.88\n", "step4470 | loss: 0.06702582538127899 | dt: 1446.39ms | tok/sec: 2831.87 | norm: 1.83\n", "step4471 | loss: 0.06072167307138443 | dt: 1437.03ms | tok/sec: 2850.32 | norm: 1.72\n", "step4472 | loss: 0.06265320628881454 | dt: 1450.09ms | tok/sec: 2824.65 | norm: 1.95\n", "step4473 | loss: 0.08027173578739166 | dt: 1452.40ms | tok/sec: 2820.16 | norm: 1.81\n", "step4474 | loss: 0.05991322547197342 | dt: 1435.77ms | tok/sec: 2852.82 | norm: 1.75\n", "step4475 | loss: 0.05158567056059837 | dt: 1444.56ms | tok/sec: 2835.46 | norm: 1.78\n", "step4476 | loss: 0.05535658821463585 | dt: 1452.22ms | tok/sec: 2820.51 | norm: 1.72\n", "step4477 | loss: 0.05739472433924675 | dt: 1446.36ms | tok/sec: 2831.93 | norm: 1.93\n", "step4478 | loss: 0.05571630224585533 | dt: 1448.31ms | tok/sec: 2828.12 | norm: 1.81\n", "step4479 | loss: 0.06868338584899902 | dt: 1437.45ms | tok/sec: 2849.49 | norm: 1.92\n", "step4480 | loss: 0.0576491579413414 | dt: 1451.94ms | tok/sec: 2821.05 | norm: 1.88\n", "step4481 | loss: 0.047755494713783264 | dt: 1453.93ms | tok/sec: 2817.20 | norm: 1.67\n", "step4482 | loss: 0.05373899266123772 | dt: 1440.32ms | tok/sec: 2843.82 | norm: 1.77\n", "step4483 | loss: 
0.07418527454137802 | dt: 1445.82ms | tok/sec: 2832.99 | norm: 1.85\n", "step4484 | loss: 0.05724868178367615 | dt: 1438.09ms | tok/sec: 2848.22 | norm: 1.72\n", "step4485 | loss: 0.05341101065278053 | dt: 1448.96ms | tok/sec: 2826.85 | norm: 1.58\n", "step4486 | loss: 0.05028935894370079 | dt: 1454.02ms | tok/sec: 2817.01 | norm: 1.65\n", "step4487 | loss: 0.046654172241687775 | dt: 1453.94ms | tok/sec: 2817.18 | norm: 1.56\n", "step4488 | loss: 0.061804383993148804 | dt: 1452.03ms | tok/sec: 2820.87 | norm: 1.99\n", "step4489 | loss: 0.07200896739959717 | dt: 1451.69ms | tok/sec: 2821.55 | norm: 1.95\n", "step4490 | loss: 0.07318397611379623 | dt: 1453.29ms | tok/sec: 2818.42 | norm: 2.18\n", "step4491 | loss: 0.04575303569436073 | dt: 1439.46ms | tok/sec: 2845.51 | norm: 1.49\n", "step4492 | loss: 0.04211416840553284 | dt: 1444.64ms | tok/sec: 2835.32 | norm: 1.46\n", "step4493 | loss: 0.05778766795992851 | dt: 1456.63ms | tok/sec: 2811.96 | norm: 1.80\n", "step4494 | loss: 0.04964907467365265 | dt: 1452.81ms | tok/sec: 2819.37 | norm: 1.87\n", "step4495 | loss: 0.04488968849182129 | dt: 1440.89ms | tok/sec: 2842.68 | norm: 1.68\n", "step4496 | loss: 0.04041562229394913 | dt: 1448.04ms | tok/sec: 2828.65 | norm: 1.65\n", "step4497 | loss: 0.07113106548786163 | dt: 1454.54ms | tok/sec: 2816.00 | norm: 1.89\n", "step4498 | loss: 0.0555100254714489 | dt: 1449.30ms | tok/sec: 2826.19 | norm: 1.87\n", "step4499 | loss: 0.07956134527921677 | dt: 1456.08ms | tok/sec: 2813.03 | norm: 1.90\n", "step4500 | loss: 0.05231801047921181 | dt: 1453.71ms | tok/sec: 2817.62 | norm: 1.37\n", "step4501 | loss: 0.07498537003993988 | dt: 1440.95ms | tok/sec: 2842.57 | norm: 1.42\n", "step4502 | loss: 0.06618655472993851 | dt: 1450.41ms | tok/sec: 2824.04 | norm: 1.74\n", "step4503 | loss: 0.06061910465359688 | dt: 1449.15ms | tok/sec: 2826.48 | norm: 1.79\n", "step4504 | loss: 0.051402777433395386 | dt: 1455.95ms | tok/sec: 2813.28 | norm: 1.76\n", "step4505 | loss: 
0.058938197791576385 | dt: 1454.69ms | tok/sec: 2815.73 | norm: 1.93\n", "step4506 | loss: 0.05428919568657875 | dt: 1451.35ms | tok/sec: 2822.20 | norm: 1.80\n", "step4507 | loss: 0.04690626263618469 | dt: 1459.07ms | tok/sec: 2807.28 | norm: 1.72\n", "step4508 | loss: 0.0655059739947319 | dt: 1449.74ms | tok/sec: 2825.33 | norm: 1.87\n", "step4509 | loss: 0.05656161159276962 | dt: 1451.60ms | tok/sec: 2821.71 | norm: 1.53\n", "step4510 | loss: 0.04794362932443619 | dt: 1450.45ms | tok/sec: 2823.95 | norm: 1.38\n", "step4511 | loss: 0.04775615036487579 | dt: 1455.34ms | tok/sec: 2814.47 | norm: 1.72\n", "step4512 | loss: 0.05726109817624092 | dt: 1452.54ms | tok/sec: 2819.90 | norm: 1.66\n", "step4513 | loss: 0.04831807315349579 | dt: 1457.64ms | tok/sec: 2810.02 | norm: 1.79\n", "step4514 | loss: 0.037455178797245026 | dt: 1445.66ms | tok/sec: 2833.31 | norm: 1.55\n", "step4515 | loss: 0.04213079437613487 | dt: 1444.05ms | tok/sec: 2836.46 | norm: 1.92\n", "step4516 | loss: 0.058316804468631744 | dt: 1459.15ms | tok/sec: 2807.11 | norm: 2.25\n", "step4517 | loss: 0.05163551867008209 | dt: 1456.69ms | tok/sec: 2811.85 | norm: 1.88\n", "step4518 | loss: 0.0445721335709095 | dt: 1458.15ms | tok/sec: 2809.04 | norm: 1.87\n", "step4519 | loss: 0.03698142245411873 | dt: 1447.02ms | tok/sec: 2830.64 | norm: 1.68\n", "step4520 | loss: 0.03918280452489853 | dt: 1461.55ms | tok/sec: 2802.51 | norm: 1.52\n", "step4521 | loss: 0.07033717632293701 | dt: 1442.42ms | tok/sec: 2839.68 | norm: 1.92\n", "step4522 | loss: 0.05450150743126869 | dt: 1456.53ms | tok/sec: 2812.15 | norm: 1.67\n", "step4523 | loss: 0.06438137590885162 | dt: 1455.48ms | tok/sec: 2814.19 | norm: 1.94\n", "step4524 | loss: 0.06541825085878372 | dt: 1459.12ms | tok/sec: 2807.18 | norm: 1.65\n", "step4525 | loss: 0.06217622756958008 | dt: 1446.89ms | tok/sec: 2830.90 | norm: 2.01\n", "step4526 | loss: 0.06858789175748825 | dt: 1451.33ms | tok/sec: 2822.24 | norm: 1.51\n", "step4527 | loss: 
0.06702201068401337 | dt: 1457.79ms | tok/sec: 2809.73 | norm: 1.56\n", "step4528 | loss: 0.060455985367298126 | dt: 1456.01ms | tok/sec: 2813.16 | norm: 1.36\n", "step4529 | loss: 0.062402572482824326 | dt: 1461.31ms | tok/sec: 2802.97 | norm: 1.58\n", "step4530 | loss: 0.07422851771116257 | dt: 1455.95ms | tok/sec: 2813.28 | norm: 2.25\n", "step4531 | loss: 0.055246345698833466 | dt: 1448.77ms | tok/sec: 2827.23 | norm: 1.73\n", "step4532 | loss: 0.060677848756313324 | dt: 1452.22ms | tok/sec: 2820.50 | norm: 1.82\n", "step4533 | loss: 0.05689135566353798 | dt: 1462.85ms | tok/sec: 2800.01 | norm: 1.69\n", "step4534 | loss: 0.05230383947491646 | dt: 1459.62ms | tok/sec: 2806.21 | norm: 1.61\n", "step4535 | loss: 0.05945403501391411 | dt: 1443.76ms | tok/sec: 2837.03 | norm: 2.20\n", "step4536 | loss: 0.04432365298271179 | dt: 1453.28ms | tok/sec: 2818.44 | norm: 1.66\n", "step4537 | loss: 0.054077595472335815 | dt: 1460.43ms | tok/sec: 2804.65 | norm: 2.09\n", "step4538 | loss: 0.04318859800696373 | dt: 1449.50ms | tok/sec: 2825.81 | norm: 1.68\n", "step4539 | loss: 0.04675300046801567 | dt: 1447.62ms | tok/sec: 2829.46 | norm: 1.82\n", "step4540 | loss: 0.03825610503554344 | dt: 1452.75ms | tok/sec: 2819.49 | norm: 1.23\n", "step4541 | loss: 0.053304001688957214 | dt: 1466.88ms | tok/sec: 2792.33 | norm: 1.61\n", "step4542 | loss: 0.0501277893781662 | dt: 1443.82ms | tok/sec: 2836.91 | norm: 1.51\n", "step4543 | loss: 0.04470665007829666 | dt: 1454.70ms | tok/sec: 2815.69 | norm: 1.36\n", "step4544 | loss: 0.040343061089515686 | dt: 1454.54ms | tok/sec: 2816.00 | norm: 1.21\n", "step4545 | loss: 0.03981224074959755 | dt: 1453.21ms | tok/sec: 2818.59 | norm: 1.50\n", "step4546 | loss: 0.0436648353934288 | dt: 1460.59ms | tok/sec: 2804.35 | norm: 1.90\n", "step4547 | loss: 0.03067799285054207 | dt: 1443.90ms | tok/sec: 2836.75 | norm: 1.22\n", "step4548 | loss: 0.03476674109697342 | dt: 1449.74ms | tok/sec: 2825.34 | norm: 1.27\n", "step4549 | loss: 
0.03627289459109306 | dt: 1455.23ms | tok/sec: 2814.67 | norm: 1.50\n", "step4550 | loss: 0.029041942209005356 | dt: 1452.33ms | tok/sec: 2820.29 | norm: 1.00\n", "step4551 | loss: 0.06370633095502853 | dt: 1459.61ms | tok/sec: 2806.22 | norm: 1.50\n", "step4552 | loss: 0.0676872506737709 | dt: 1463.22ms | tok/sec: 2799.31 | norm: 1.70\n", "step4553 | loss: 0.04357348755002022 | dt: 1449.04ms | tok/sec: 2826.70 | norm: 1.65\n", "step4554 | loss: 0.04820120334625244 | dt: 1454.64ms | tok/sec: 2815.82 | norm: 1.43\n", "step4555 | loss: 0.06261280924081802 | dt: 1453.99ms | tok/sec: 2817.07 | norm: 1.77\n", "step4556 | loss: 0.04974351078271866 | dt: 1456.90ms | tok/sec: 2811.45 | norm: 1.35\n", "step4557 | loss: 0.0443485863506794 | dt: 1463.03ms | tok/sec: 2799.66 | norm: 1.56\n", "step4558 | loss: 0.04739799350500107 | dt: 1456.42ms | tok/sec: 2812.37 | norm: 1.47\n", "step4559 | loss: 0.04696950316429138 | dt: 1459.56ms | tok/sec: 2806.32 | norm: 1.61\n", "step4560 | loss: 0.05136996507644653 | dt: 1463.69ms | tok/sec: 2798.41 | norm: 1.92\n", "step4561 | loss: 0.0575612373650074 | dt: 1452.07ms | tok/sec: 2820.81 | norm: 1.51\n", "step4562 | loss: 0.04686441272497177 | dt: 1450.70ms | tok/sec: 2823.46 | norm: 1.37\n", "step4563 | loss: 0.04136396199464798 | dt: 1461.55ms | tok/sec: 2802.50 | norm: 1.66\n", "step4564 | loss: 0.047834135591983795 | dt: 1445.77ms | tok/sec: 2833.08 | norm: 1.69\n", "step4565 | loss: 0.06702059507369995 | dt: 1455.57ms | tok/sec: 2814.01 | norm: 1.75\n", "step4566 | loss: 0.05926915630698204 | dt: 1451.33ms | tok/sec: 2822.24 | norm: 1.91\n", "step4567 | loss: 0.043633755296468735 | dt: 1465.27ms | tok/sec: 2795.39 | norm: 1.35\n", "step4568 | loss: 0.04568542167544365 | dt: 1467.10ms | tok/sec: 2791.91 | norm: 1.48\n", "step4569 | loss: 0.03469910845160484 | dt: 1448.43ms | tok/sec: 2827.89 | norm: 1.32\n", "step4570 | loss: 0.05439278483390808 | dt: 1454.70ms | tok/sec: 2815.70 | norm: 1.71\n", "step4571 | loss: 
0.060366228222846985 | dt: 1459.58ms | tok/sec: 2806.29 | norm: 1.43\n", "step4572 | loss: 0.05842243507504463 | dt: 1454.21ms | tok/sec: 2816.65 | norm: 1.87\n", "step4573 | loss: 0.03221810236573219 | dt: 1445.98ms | tok/sec: 2832.69 | norm: 1.37\n", "step4574 | loss: 0.041601285338401794 | dt: 1460.31ms | tok/sec: 2804.88 | norm: 1.55\n", "step4575 | loss: 0.045365165919065475 | dt: 1457.62ms | tok/sec: 2810.07 | norm: 1.46\n", "step4576 | loss: 0.040809281170368195 | dt: 1456.11ms | tok/sec: 2812.98 | norm: 1.48\n", "step4577 | loss: 0.03871956840157509 | dt: 1462.84ms | tok/sec: 2800.03 | norm: 1.36\n", "step4578 | loss: 0.03133257105946541 | dt: 1461.43ms | tok/sec: 2802.74 | norm: 1.48\n", "step4579 | loss: 0.06713681668043137 | dt: 1456.57ms | tok/sec: 2812.08 | norm: 2.17\n", "step4580 | loss: 0.04571422562003136 | dt: 1455.32ms | tok/sec: 2814.50 | norm: 1.55\n", "step4581 | loss: 0.06796853989362717 | dt: 1459.91ms | tok/sec: 2805.65 | norm: 1.70\n", "step4582 | loss: 0.04619492217898369 | dt: 1455.91ms | tok/sec: 2813.35 | norm: 1.44\n", "step4583 | loss: 0.06019408628344536 | dt: 1460.11ms | tok/sec: 2805.27 | norm: 1.87\n", "step4584 | loss: 0.062008798122406006 | dt: 1448.18ms | tok/sec: 2828.37 | norm: 1.84\n", "step4585 | loss: 0.04855315387248993 | dt: 1460.21ms | tok/sec: 2805.07 | norm: 1.56\n", "step4586 | loss: 0.04311247169971466 | dt: 1453.89ms | tok/sec: 2817.27 | norm: 1.67\n", "step4587 | loss: 0.04646310955286026 | dt: 1448.36ms | tok/sec: 2828.02 | norm: 1.84\n", "step4588 | loss: 0.04983701929450035 | dt: 1461.48ms | tok/sec: 2802.64 | norm: 1.81\n", "step4589 | loss: 0.04658770561218262 | dt: 1461.50ms | tok/sec: 2802.61 | norm: 1.64\n", "step4590 | loss: 0.05455780401825905 | dt: 1451.38ms | tok/sec: 2822.14 | norm: 2.11\n", "step4591 | loss: 0.058963216841220856 | dt: 1459.26ms | tok/sec: 2806.89 | norm: 1.80\n", "step4592 | loss: 0.05042378231883049 | dt: 1458.87ms | tok/sec: 2807.66 | norm: 1.92\n", "step4593 | loss: 
0.049817055463790894 | dt: 1443.60ms | tok/sec: 2837.35 | norm: 1.84\n", "step4594 | loss: 0.05697072297334671 | dt: 1454.40ms | tok/sec: 2816.28 | norm: 1.90\n", "step4595 | loss: 0.04647957906126976 | dt: 1460.83ms | tok/sec: 2803.88 | norm: 1.77\n", "step4596 | loss: 0.05002404749393463 | dt: 1451.13ms | tok/sec: 2822.62 | norm: 1.76\n", "step4597 | loss: 0.0448397733271122 | dt: 1461.58ms | tok/sec: 2802.44 | norm: 1.94\n", "step4598 | loss: 0.048534005880355835 | dt: 1453.14ms | tok/sec: 2818.72 | norm: 2.09\n", "step4599 | loss: 0.06042483076453209 | dt: 1459.28ms | tok/sec: 2806.86 | norm: 2.54\n", "step4600 | loss: 0.04640922322869301 | dt: 1455.83ms | tok/sec: 2813.51 | norm: 1.94\n", "step4601 | loss: 0.04696521908044815 | dt: 1453.59ms | tok/sec: 2817.86 | norm: 2.15\n", "step4602 | loss: 0.03607497736811638 | dt: 1459.15ms | tok/sec: 2807.11 | norm: 1.70\n", "step4603 | loss: 0.08372434228658676 | dt: 1444.17ms | tok/sec: 2836.22 | norm: 2.00\n", "step4604 | loss: 0.05321718379855156 | dt: 1451.15ms | tok/sec: 2822.59 | norm: 1.77\n", "step4605 | loss: 0.04176490008831024 | dt: 1453.22ms | tok/sec: 2818.56 | norm: 1.53\n", "step4606 | loss: 0.04706805571913719 | dt: 1454.44ms | tok/sec: 2816.21 | norm: 1.52\n", "step4607 | loss: 0.0500505194067955 | dt: 1456.64ms | tok/sec: 2811.95 | norm: 1.62\n", "step4608 | loss: 0.061404936015605927 | dt: 1454.06ms | tok/sec: 2816.94 | norm: 1.93\n", "step4609 | loss: 0.058620307594537735 | dt: 1459.46ms | tok/sec: 2806.52 | norm: 1.71\n", "step4610 | loss: 0.049426160752773285 | dt: 1450.57ms | tok/sec: 2823.72 | norm: 1.69\n", "step4611 | loss: 0.04344815015792847 | dt: 1451.35ms | tok/sec: 2822.19 | norm: 1.77\n", "step4612 | loss: 0.04788760840892792 | dt: 1458.45ms | tok/sec: 2808.46 | norm: 1.71\n", "step4613 | loss: 0.04966459050774574 | dt: 1452.95ms | tok/sec: 2819.10 | norm: 1.69\n", "step4614 | loss: 0.064056895673275 | dt: 1452.09ms | tok/sec: 2820.76 | norm: 2.07\n", "step4615 | loss: 0.0536186583340168 
| dt: 1447.11ms | tok/sec: 2830.47 | norm: 1.53\n", "step4616 | loss: 0.044785209000110626 | dt: 1457.71ms | tok/sec: 2809.89 | norm: 1.62\n", "step4617 | loss: 0.05233890935778618 | dt: 1448.87ms | tok/sec: 2827.04 | norm: 1.85\n", "step4618 | loss: 0.04038984328508377 | dt: 1453.03ms | tok/sec: 2818.95 | norm: 1.69\n", "step4619 | loss: 0.046534862369298935 | dt: 1452.34ms | tok/sec: 2820.27 | norm: 1.71\n", "step4620 | loss: 0.05157177895307541 | dt: 1455.99ms | tok/sec: 2813.21 | norm: 1.75\n", "step4621 | loss: 0.04279718175530434 | dt: 1450.48ms | tok/sec: 2823.90 | norm: 1.70\n", "step4622 | loss: 0.04067882522940636 | dt: 1454.14ms | tok/sec: 2816.79 | norm: 1.59\n", "step4623 | loss: 0.047753944993019104 | dt: 1461.70ms | tok/sec: 2802.21 | norm: 1.68\n", "step4624 | loss: 0.04634122923016548 | dt: 1450.38ms | tok/sec: 2824.09 | norm: 1.75\n", "step4625 | loss: 0.05010143667459488 | dt: 1455.10ms | tok/sec: 2814.92 | norm: 1.73\n", "step4626 | loss: 0.03764595836400986 | dt: 1446.98ms | tok/sec: 2830.72 | norm: 1.73\n", "step4627 | loss: 0.05333070829510689 | dt: 1458.80ms | tok/sec: 2807.80 | norm: 2.24\n", "step4628 | loss: 0.03766608238220215 | dt: 1449.45ms | tok/sec: 2825.90 | norm: 1.49\n", "step4629 | loss: 0.031729623675346375 | dt: 1455.82ms | tok/sec: 2813.53 | norm: 1.47\n", "step4630 | loss: 0.03379189223051071 | dt: 1464.76ms | tok/sec: 2796.36 | norm: 1.34\n", "step4631 | loss: 0.03411475941538811 | dt: 1455.15ms | tok/sec: 2814.83 | norm: 1.48\n", "step4632 | loss: 0.030196402221918106 | dt: 1452.42ms | tok/sec: 2820.11 | norm: 1.24\n", "step4633 | loss: 0.06947433203458786 | dt: 1448.17ms | tok/sec: 2828.39 | norm: 2.03\n", "step4634 | loss: 0.0583372637629509 | dt: 1451.06ms | tok/sec: 2822.77 | norm: 1.77\n", "step4635 | loss: 0.05226397514343262 | dt: 1454.92ms | tok/sec: 2815.27 | norm: 2.16\n", "step4636 | loss: 0.04899110645055771 | dt: 1459.19ms | tok/sec: 2807.03 | norm: 1.74\n", "step4637 | loss: 0.06294014304876328 | dt: 1456.48ms 
| tok/sec: 2812.26 | norm: 1.78\n", "step4638 | loss: 0.04862644150853157 | dt: 1458.38ms | tok/sec: 2808.60 | norm: 1.80\n", "step4639 | loss: 0.040598779916763306 | dt: 1450.77ms | tok/sec: 2823.32 | norm: 1.77\n", "step4640 | loss: 0.0478496178984642 | dt: 1451.39ms | tok/sec: 2822.12 | norm: 1.73\n", "step4641 | loss: 0.04283597692847252 | dt: 1452.41ms | tok/sec: 2820.15 | norm: 1.66\n", "step4642 | loss: 0.05206582695245743 | dt: 1444.97ms | tok/sec: 2834.67 | norm: 1.77\n", "step4643 | loss: 0.05254025757312775 | dt: 1446.11ms | tok/sec: 2832.43 | norm: 1.71\n", "step4644 | loss: 0.05237359181046486 | dt: 1452.27ms | tok/sec: 2820.42 | norm: 2.06\n", "step4645 | loss: 0.04688126593828201 | dt: 1446.75ms | tok/sec: 2831.17 | norm: 1.89\n", "step4646 | loss: 0.04266708716750145 | dt: 1448.48ms | tok/sec: 2827.79 | norm: 1.70\n", "step4647 | loss: 0.06494889408349991 | dt: 1447.50ms | tok/sec: 2829.71 | norm: 2.07\n", "step4648 | loss: 0.05385604128241539 | dt: 1452.51ms | tok/sec: 2819.95 | norm: 1.84\n", "step4649 | loss: 0.04610571265220642 | dt: 1454.02ms | tok/sec: 2817.02 | norm: 1.73\n", "step4650 | loss: 0.034875527024269104 | dt: 1450.97ms | tok/sec: 2822.94 | norm: 1.47\n", "step4651 | loss: 0.03958795219659805 | dt: 1445.93ms | tok/sec: 2832.77 | norm: 1.45\n", "step4652 | loss: 0.050190020352602005 | dt: 1452.28ms | tok/sec: 2820.39 | norm: 1.39\n", "step4653 | loss: 0.04967230558395386 | dt: 1444.57ms | tok/sec: 2835.45 | norm: 1.52\n", "step4654 | loss: 0.05113135650753975 | dt: 1441.02ms | tok/sec: 2842.44 | norm: 1.71\n", "step4655 | loss: 0.03668176755309105 | dt: 1452.42ms | tok/sec: 2820.13 | norm: 1.59\n", "step4656 | loss: 0.03590945154428482 | dt: 1445.03ms | tok/sec: 2834.55 | norm: 1.60\n", "step4657 | loss: 0.04136568307876587 | dt: 1446.08ms | tok/sec: 2832.48 | norm: 1.59\n", "step4658 | loss: 0.04106250777840614 | dt: 1445.00ms | tok/sec: 2834.60 | norm: 1.77\n", "step4659 | loss: 0.02835841104388237 | dt: 1452.16ms | tok/sec: 
2820.63 | norm: 1.03\n", "step4660 | loss: 0.03946675732731819 | dt: 1448.93ms | tok/sec: 2826.92 | norm: 1.36\n", "step4661 | loss: 0.056710753589868546 | dt: 1445.73ms | tok/sec: 2833.18 | norm: 1.53\n", "step4662 | loss: 0.04492035508155823 | dt: 1445.99ms | tok/sec: 2832.65 | norm: 1.69\n", "step4663 | loss: 0.06028597056865692 | dt: 1450.06ms | tok/sec: 2824.72 | norm: 1.84\n", "step4664 | loss: 0.04662570357322693 | dt: 1448.91ms | tok/sec: 2826.95 | norm: 1.82\n", "step4665 | loss: 0.05825236067175865 | dt: 1451.58ms | tok/sec: 2821.76 | norm: 1.73\n", "step4666 | loss: 0.046754345297813416 | dt: 1458.76ms | tok/sec: 2807.86 | norm: 1.70\n", "step4667 | loss: 0.047392331063747406 | dt: 1451.30ms | tok/sec: 2822.29 | norm: 1.60\n", "step4668 | loss: 0.04714743420481682 | dt: 1450.27ms | tok/sec: 2824.30 | norm: 1.91\n", "step4669 | loss: 0.04291468858718872 | dt: 1452.24ms | tok/sec: 2820.48 | norm: 1.65\n", "step4670 | loss: 0.050264470279216766 | dt: 1453.72ms | tok/sec: 2817.61 | norm: 1.87\n", "step4671 | loss: 0.03577357158064842 | dt: 1446.21ms | tok/sec: 2832.22 | norm: 1.40\n", "step4672 | loss: 0.05255577713251114 | dt: 1446.89ms | tok/sec: 2830.89 | norm: 1.70\n", "step4673 | loss: 0.04770641773939133 | dt: 1447.58ms | tok/sec: 2829.54 | norm: 1.76\n", "step4674 | loss: 0.04844100400805473 | dt: 1447.08ms | tok/sec: 2830.52 | norm: 1.81\n", "step4675 | loss: 0.04096830636262894 | dt: 1444.12ms | tok/sec: 2836.33 | norm: 1.62\n", "step4676 | loss: 0.040852680802345276 | dt: 1448.47ms | tok/sec: 2827.82 | norm: 1.43\n", "step4677 | loss: 0.04352806136012077 | dt: 1454.73ms | tok/sec: 2815.64 | norm: 1.43\n", "step4678 | loss: 0.03785906732082367 | dt: 1453.04ms | tok/sec: 2818.92 | norm: 1.48\n", "step4679 | loss: 0.03459922969341278 | dt: 1454.88ms | tok/sec: 2815.36 | norm: 1.49\n", "step4680 | loss: 0.03893878683447838 | dt: 1456.72ms | tok/sec: 2811.80 | norm: 1.64\n", "step4681 | loss: 0.0459887832403183 | dt: 1447.61ms | tok/sec: 2829.50 | norm: 
1.68\n", "step4682 | loss: 0.03682509809732437 | dt: 1450.61ms | tok/sec: 2823.65 | norm: 1.47\n", "step4683 | loss: 0.035134993493556976 | dt: 1455.30ms | tok/sec: 2814.53 | norm: 1.51\n", "step4684 | loss: 0.033231623470783234 | dt: 1439.81ms | tok/sec: 2844.82 | norm: 1.59\n", "step4685 | loss: 0.05654994398355484 | dt: 1454.70ms | tok/sec: 2815.69 | norm: 1.95\n", "step4686 | loss: 0.05392872914671898 | dt: 1449.76ms | tok/sec: 2825.30 | norm: 1.58\n", "step4687 | loss: 0.056242432445287704 | dt: 1451.65ms | tok/sec: 2821.61 | norm: 1.39\n", "step4688 | loss: 0.05092443525791168 | dt: 1455.17ms | tok/sec: 2814.79 | norm: 1.49\n", "step4689 | loss: 0.046359237283468246 | dt: 1454.19ms | tok/sec: 2816.69 | norm: 1.61\n", "step4690 | loss: 0.050775039941072464 | dt: 1454.21ms | tok/sec: 2816.65 | norm: 1.50\n", "step4691 | loss: 0.03997207432985306 | dt: 1455.76ms | tok/sec: 2813.64 | norm: 1.37\n", "step4692 | loss: 0.038250427693128586 | dt: 1438.33ms | tok/sec: 2847.74 | norm: 1.36\n", "step4693 | loss: 0.04352995753288269 | dt: 1452.26ms | tok/sec: 2820.43 | norm: 1.60\n", "step4694 | loss: 0.08361122757196426 | dt: 1456.43ms | tok/sec: 2812.35 | norm: 2.24\n", "step4695 | loss: 0.04865182936191559 | dt: 1444.21ms | tok/sec: 2836.15 | norm: 1.78\n", "step4696 | loss: 0.04891979694366455 | dt: 1447.79ms | tok/sec: 2829.14 | norm: 1.80\n", "step4697 | loss: 0.04398499056696892 | dt: 1452.87ms | tok/sec: 2819.26 | norm: 1.67\n", "step4698 | loss: 0.0346272774040699 | dt: 1435.55ms | tok/sec: 2853.25 | norm: 1.43\n", "step4699 | loss: 0.0415511392056942 | dt: 1443.09ms | tok/sec: 2838.35 | norm: 1.64\n", "step4700 | loss: 0.03659370541572571 | dt: 1447.93ms | tok/sec: 2828.87 | norm: 1.75\n", "step4701 | loss: 0.05074014887213707 | dt: 1455.31ms | tok/sec: 2814.53 | norm: 2.02\n", "step4702 | loss: 0.046753380447626114 | dt: 1446.41ms | tok/sec: 2831.84 | norm: 1.99\n", "step4703 | loss: 0.05931435152888298 | dt: 1454.20ms | tok/sec: 2816.66 | norm: 2.30\n", 
"step4704 | loss: 0.039429329335689545 | dt: 1450.85ms | tok/sec: 2823.17 | norm: 1.56\n", "step4705 | loss: 0.04687445983290672 | dt: 1447.12ms | tok/sec: 2830.44 | norm: 1.92\n", "step4706 | loss: 0.045593660324811935 | dt: 1449.73ms | tok/sec: 2825.35 | norm: 1.68\n", "step4707 | loss: 0.03984486684203148 | dt: 1450.73ms | tok/sec: 2823.40 | norm: 1.42\n", "step4708 | loss: 0.03565100580453873 | dt: 1455.57ms | tok/sec: 2814.02 | norm: 1.30\n", "step4709 | loss: 0.04307575151324272 | dt: 1448.96ms | tok/sec: 2826.85 | norm: 1.79\n", "step4710 | loss: 0.0298154279589653 | dt: 1451.63ms | tok/sec: 2821.65 | norm: 1.27\n", "step4711 | loss: 0.03475476801395416 | dt: 1441.86ms | tok/sec: 2840.77 | norm: 1.40\n", "step4712 | loss: 0.030002420768141747 | dt: 1448.00ms | tok/sec: 2828.73 | norm: 1.22\n", "step4713 | loss: 0.03811913728713989 | dt: 1442.80ms | tok/sec: 2838.93 | norm: 1.85\n", "step4714 | loss: 0.0314265713095665 | dt: 1444.66ms | tok/sec: 2835.26 | norm: 1.66\n", "step4715 | loss: 0.047292403876781464 | dt: 1449.02ms | tok/sec: 2826.74 | norm: 1.37\n", "step4716 | loss: 0.05096874386072159 | dt: 1454.95ms | tok/sec: 2815.22 | norm: 1.81\n", "step4717 | loss: 0.03895490989089012 | dt: 1451.16ms | tok/sec: 2822.58 | norm: 1.55\n", "step4718 | loss: 0.048281412571668625 | dt: 1442.12ms | tok/sec: 2840.27 | norm: 1.62\n", "step4719 | loss: 0.05926721170544624 | dt: 1444.16ms | tok/sec: 2836.24 | norm: 1.93\n", "step4720 | loss: 0.053195979446172714 | dt: 1446.24ms | tok/sec: 2832.18 | norm: 2.02\n", "step4721 | loss: 0.04116184636950493 | dt: 1448.75ms | tok/sec: 2827.26 | norm: 1.89\n", "step4722 | loss: 0.048728276044130325 | dt: 1450.17ms | tok/sec: 2824.50 | norm: 1.92\n", "step4723 | loss: 0.04498489201068878 | dt: 1455.12ms | tok/sec: 2814.89 | norm: 1.48\n", "step4724 | loss: 0.04126909747719765 | dt: 1449.05ms | tok/sec: 2826.69 | norm: 1.49\n", "step4725 | loss: 0.0426253080368042 | dt: 1449.52ms | tok/sec: 2825.76 | norm: 1.36\n", "step4726 | 
loss: 0.046561017632484436 | dt: 1438.17ms | tok/sec: 2848.07 | norm: 1.98\n", "step4727 | loss: 0.040788233280181885 | dt: 1446.55ms | tok/sec: 2831.57 | norm: 1.62\n", "step4728 | loss: 0.0403873585164547 | dt: 1454.62ms | tok/sec: 2815.85 | norm: 1.77\n", "step4729 | loss: 0.05884074047207832 | dt: 1453.11ms | tok/sec: 2818.78 | norm: 1.97\n", "step4730 | loss: 0.04960465431213379 | dt: 1441.83ms | tok/sec: 2840.84 | norm: 1.50\n", "step4731 | loss: 0.03807046636939049 | dt: 1450.22ms | tok/sec: 2824.40 | norm: 1.30\n", "step4732 | loss: 0.03705766424536705 | dt: 1451.38ms | tok/sec: 2822.15 | norm: 1.39\n", "step4733 | loss: 0.0341290682554245 | dt: 1449.84ms | tok/sec: 2825.13 | norm: 1.55\n", "step4734 | loss: 0.04247153922915459 | dt: 1447.04ms | tok/sec: 2830.61 | norm: 1.63\n", "step4735 | loss: 0.05271787568926811 | dt: 1446.35ms | tok/sec: 2831.96 | norm: 1.96\n", "step4736 | loss: 0.051793742924928665 | dt: 1454.96ms | tok/sec: 2815.20 | norm: 1.69\n", "step4737 | loss: 0.04626474902033806 | dt: 1452.77ms | tok/sec: 2819.43 | norm: 1.69\n", "step4738 | loss: 0.034514088183641434 | dt: 1454.47ms | tok/sec: 2816.14 | norm: 1.39\n", "step4739 | loss: 0.04133112356066704 | dt: 1438.31ms | tok/sec: 2847.79 | norm: 1.70\n", "step4740 | loss: 0.0378197506070137 | dt: 1452.08ms | tok/sec: 2820.79 | norm: 1.89\n", "step4741 | loss: 0.03417221084237099 | dt: 1452.23ms | tok/sec: 2820.49 | norm: 1.67\n", "step4742 | loss: 0.030612844973802567 | dt: 1455.25ms | tok/sec: 2814.65 | norm: 1.39\n", "step4743 | loss: 0.055203668773174286 | dt: 1444.21ms | tok/sec: 2836.16 | norm: 1.95\n", "step4744 | loss: 0.03926445171236992 | dt: 1448.80ms | tok/sec: 2827.17 | norm: 1.64\n", "step4745 | loss: 0.06122975796461105 | dt: 1455.84ms | tok/sec: 2813.50 | norm: 2.05\n", "step4746 | loss: 0.04550766572356224 | dt: 1447.05ms | tok/sec: 2830.60 | norm: 1.58\n", "step4747 | loss: 0.054016344249248505 | dt: 1441.48ms | tok/sec: 2841.53 | norm: 1.58\n", "step4748 | loss: 
0.04222589731216431 | dt: 1448.49ms | tok/sec: 2827.77 | norm: 1.42\n", "step4749 | loss: 0.044375985860824585 | dt: 1440.01ms | tok/sec: 2844.42 | norm: 1.62\n", "step4750 | loss: 0.03387664631009102 | dt: 1453.74ms | tok/sec: 2817.55 | norm: 1.47\n", "step4751 | loss: 0.03573307394981384 | dt: 1451.44ms | tok/sec: 2822.02 | norm: 1.29\n", "step4752 | loss: 0.045649755746126175 | dt: 1446.73ms | tok/sec: 2831.21 | norm: 1.53\n", "step4753 | loss: 0.03496292233467102 | dt: 1452.14ms | tok/sec: 2820.66 | norm: 1.32\n", "step4754 | loss: 0.0465899221599102 | dt: 1450.76ms | tok/sec: 2823.35 | norm: 1.89\n", "step4755 | loss: 0.045448873192071915 | dt: 1454.06ms | tok/sec: 2816.93 | norm: 1.63\n", "step4756 | loss: 0.03999188542366028 | dt: 1451.56ms | tok/sec: 2821.80 | norm: 1.61\n", "step4757 | loss: 0.043531857430934906 | dt: 1449.21ms | tok/sec: 2826.38 | norm: 1.62\n", "step4758 | loss: 0.039026547223329544 | dt: 1444.40ms | tok/sec: 2835.77 | norm: 1.45\n", "step4759 | loss: 0.04066157713532448 | dt: 1446.86ms | tok/sec: 2830.96 | norm: 1.74\n", "step4760 | loss: 0.04105284437537193 | dt: 1453.74ms | tok/sec: 2817.56 | norm: 1.72\n", "step4761 | loss: 0.03643576055765152 | dt: 1445.40ms | tok/sec: 2833.81 | norm: 1.81\n", "step4762 | loss: 0.043516844511032104 | dt: 1446.03ms | tok/sec: 2832.59 | norm: 1.56\n", "step4763 | loss: 0.05113048478960991 | dt: 1453.07ms | tok/sec: 2818.86 | norm: 1.70\n", "step4764 | loss: 0.03391743451356888 | dt: 1447.77ms | tok/sec: 2829.18 | norm: 1.44\n", "step4765 | loss: 0.030223172158002853 | dt: 1455.24ms | tok/sec: 2814.66 | norm: 1.32\n", "step4766 | loss: 0.030918410047888756 | dt: 1450.16ms | tok/sec: 2824.52 | norm: 1.35\n", "step4767 | loss: 0.04388423264026642 | dt: 1453.64ms | tok/sec: 2817.76 | norm: 1.44\n", "step4768 | loss: 0.04179883375763893 | dt: 1437.63ms | tok/sec: 2849.14 | norm: 1.63\n", "step4769 | loss: 0.0457286536693573 | dt: 1454.36ms | tok/sec: 2816.36 | norm: 1.85\n", "step4770 | loss: 
0.038517266511917114 | dt: 1457.53ms | tok/sec: 2810.24 | norm: 1.52\n", "step4771 | loss: 0.04287008196115494 | dt: 1445.92ms | tok/sec: 2832.81 | norm: 1.78\n", "step4772 | loss: 0.038950297981500626 | dt: 1451.03ms | tok/sec: 2822.82 | norm: 1.60\n", "step4773 | loss: 0.0393817201256752 | dt: 1453.14ms | tok/sec: 2818.73 | norm: 1.56\n", "step4774 | loss: 0.04078162461519241 | dt: 1447.70ms | tok/sec: 2829.32 | norm: 1.54\n", "step4775 | loss: 0.037348322570323944 | dt: 1444.96ms | tok/sec: 2834.69 | norm: 1.39\n", "step4776 | loss: 0.046792011708021164 | dt: 1445.58ms | tok/sec: 2833.47 | norm: 1.58\n", "step4777 | loss: 0.05177118256688118 | dt: 1445.81ms | tok/sec: 2833.01 | norm: 1.88\n", "step4778 | loss: 0.04558555409312248 | dt: 1447.06ms | tok/sec: 2830.56 | norm: 1.76\n", "step4779 | loss: 0.04342782869935036 | dt: 1455.58ms | tok/sec: 2814.00 | norm: 1.38\n", "step4780 | loss: 0.0392637699842453 | dt: 1456.87ms | tok/sec: 2811.52 | norm: 1.62\n", "step4781 | loss: 0.051148977130651474 | dt: 1454.30ms | tok/sec: 2816.48 | norm: 1.77\n", "step4782 | loss: 0.036564674228429794 | dt: 1446.65ms | tok/sec: 2831.38 | norm: 1.31\n", "step4783 | loss: 0.043044425547122955 | dt: 1445.50ms | tok/sec: 2833.62 | norm: 1.57\n", "step4784 | loss: 0.03749081492424011 | dt: 1447.59ms | tok/sec: 2829.52 | norm: 1.58\n", "step4785 | loss: 0.050052981823682785 | dt: 1450.82ms | tok/sec: 2823.24 | norm: 1.85\n", "step4786 | loss: 0.06439226865768433 | dt: 1449.01ms | tok/sec: 2826.77 | norm: 1.85\n", "step4787 | loss: 0.039226237684488297 | dt: 1447.37ms | tok/sec: 2829.96 | norm: 1.47\n", "step4788 | loss: 0.03995955362915993 | dt: 1450.83ms | tok/sec: 2823.22 | norm: 1.43\n", "step4789 | loss: 0.0461450032889843 | dt: 1446.09ms | tok/sec: 2832.47 | norm: 1.33\n", "step4790 | loss: 0.02920384891331196 | dt: 1444.58ms | tok/sec: 2835.42 | norm: 0.92\n", "step4791 | loss: 0.04130403324961662 | dt: 1441.42ms | tok/sec: 2841.63 | norm: 1.48\n", "step4792 | loss: 
0.03213217854499817 | dt: 1444.72ms | tok/sec: 2835.16 | norm: 1.21\n", "step4793 | loss: 0.030189156532287598 | dt: 1441.87ms | tok/sec: 2840.75 | norm: 1.50\n", "step4794 | loss: 0.029028037562966347 | dt: 1444.74ms | tok/sec: 2835.11 | norm: 1.34\n", "step4795 | loss: 0.037942737340927124 | dt: 1453.48ms | tok/sec: 2818.06 | norm: 1.69\n", "step4796 | loss: 0.036045342683792114 | dt: 1453.07ms | tok/sec: 2818.86 | norm: 1.66\n", "step4797 | loss: 0.05572796240448952 | dt: 1441.54ms | tok/sec: 2841.41 | norm: 1.88\n", "step4798 | loss: 0.04850869998335838 | dt: 1440.36ms | tok/sec: 2843.73 | norm: 1.58\n", "step4799 | loss: 0.037843137979507446 | dt: 1449.09ms | tok/sec: 2826.61 | norm: 1.56\n", "step4800 | loss: 0.04174666479229927 | dt: 1448.01ms | tok/sec: 2828.71 | norm: 1.56\n", "step4801 | loss: 0.05346562713384628 | dt: 1450.55ms | tok/sec: 2823.75 | norm: 1.53\n", "step4802 | loss: 0.041945211589336395 | dt: 1454.30ms | tok/sec: 2816.47 | norm: 1.68\n", "step4803 | loss: 0.036261145025491714 | dt: 1447.02ms | tok/sec: 2830.64 | norm: 1.52\n", "step4804 | loss: 0.04195884242653847 | dt: 1456.05ms | tok/sec: 2813.09 | norm: 1.51\n", "step4805 | loss: 0.03594435751438141 | dt: 1450.56ms | tok/sec: 2823.75 | norm: 1.40\n", "step4806 | loss: 0.04065483435988426 | dt: 1451.42ms | tok/sec: 2822.07 | norm: 1.76\n", "step4807 | loss: 0.04285738989710808 | dt: 1453.12ms | tok/sec: 2818.76 | norm: 1.52\n", "step4808 | loss: 0.04156675934791565 | dt: 1439.33ms | tok/sec: 2845.76 | norm: 1.69\n", "step4809 | loss: 0.038792189210653305 | dt: 1451.44ms | tok/sec: 2822.03 | norm: 1.68\n", "step4810 | loss: 0.04175499454140663 | dt: 1454.49ms | tok/sec: 2816.11 | norm: 1.68\n", "step4811 | loss: 0.0524134635925293 | dt: 1453.15ms | tok/sec: 2818.69 | norm: 1.84\n", "step4812 | loss: 0.041121482849121094 | dt: 1457.98ms | tok/sec: 2809.37 | norm: 1.53\n", "step4813 | loss: 0.037667885422706604 | dt: 1447.91ms | tok/sec: 2828.90 | norm: 1.56\n", "step4814 | loss: 
0.035139329731464386 | dt: 1453.30ms | tok/sec: 2818.40 | norm: 1.70\n", "step4815 | loss: 0.036537934094667435 | dt: 1443.05ms | tok/sec: 2838.44 | norm: 1.61\n", "step4816 | loss: 0.04397480562329292 | dt: 1447.02ms | tok/sec: 2830.64 | norm: 1.27\n", "step4817 | loss: 0.049541175365448 | dt: 1448.22ms | tok/sec: 2828.30 | norm: 1.66\n", "step4818 | loss: 0.04273052141070366 | dt: 1452.90ms | tok/sec: 2819.19 | norm: 1.74\n", "step4819 | loss: 0.03357137367129326 | dt: 1442.75ms | tok/sec: 2839.02 | norm: 1.73\n", "step4820 | loss: 0.03507768735289574 | dt: 1446.54ms | tok/sec: 2831.57 | norm: 1.61\n", "step4821 | loss: 0.043864138424396515 | dt: 1447.86ms | tok/sec: 2829.01 | norm: 1.63\n", "step4822 | loss: 0.038346461951732635 | dt: 1447.08ms | tok/sec: 2830.52 | norm: 1.53\n", "step4823 | loss: 0.035006895661354065 | dt: 1450.47ms | tok/sec: 2823.92 | norm: 1.80\n", "step4824 | loss: 0.024091079831123352 | dt: 1448.95ms | tok/sec: 2826.88 | norm: 1.13\n", "step4825 | loss: 0.047423698008060455 | dt: 1454.47ms | tok/sec: 2816.14 | norm: 1.78\n", "step4826 | loss: 0.04779429733753204 | dt: 1453.25ms | tok/sec: 2818.50 | norm: 1.72\n", "step4827 | loss: 0.050841331481933594 | dt: 1454.08ms | tok/sec: 2816.90 | norm: 1.46\n", "step4828 | loss: 0.04167725145816803 | dt: 1448.31ms | tok/sec: 2828.12 | norm: 1.57\n", "step4829 | loss: 0.05240071192383766 | dt: 1445.40ms | tok/sec: 2833.83 | norm: 1.81\n", "step4830 | loss: 0.04358696937561035 | dt: 1452.71ms | tok/sec: 2819.56 | norm: 1.45\n", "step4831 | loss: 0.04850924015045166 | dt: 1437.21ms | tok/sec: 2849.97 | norm: 1.93\n", "step4832 | loss: 0.04156883433461189 | dt: 1453.93ms | tok/sec: 2817.19 | norm: 1.61\n", "step4833 | loss: 0.03850813955068588 | dt: 1451.54ms | tok/sec: 2821.83 | norm: 1.50\n", "step4834 | loss: 0.0432908795773983 | dt: 1450.74ms | tok/sec: 2823.39 | norm: 1.49\n", "step4835 | loss: 0.03183955326676369 | dt: 1441.26ms | tok/sec: 2841.96 | norm: 1.37\n", "step4836 | loss: 
0.03492322191596031 | dt: 1449.29ms | tok/sec: 2826.22 | norm: 1.38\n", "step4837 | loss: 0.038943976163864136 | dt: 1443.41ms | tok/sec: 2837.73 | norm: 1.39\n", "step4838 | loss: 0.042727991938591 | dt: 1448.40ms | tok/sec: 2827.95 | norm: 1.85\n", "step4839 | loss: 0.0362791046500206 | dt: 1438.07ms | tok/sec: 2848.25 | norm: 1.40\n", "step4840 | loss: 0.04450438171625137 | dt: 1448.76ms | tok/sec: 2827.25 | norm: 1.57\n", "step4841 | loss: 0.04054583981633186 | dt: 1454.52ms | tok/sec: 2816.05 | norm: 1.52\n", "step4842 | loss: 0.03700898215174675 | dt: 1450.13ms | tok/sec: 2824.57 | norm: 1.50\n", "step4843 | loss: 0.03361998498439789 | dt: 1455.51ms | tok/sec: 2814.13 | norm: 1.32\n", "step4844 | loss: 0.030618462711572647 | dt: 1451.85ms | tok/sec: 2821.23 | norm: 1.30\n", "step4845 | loss: 0.0468284972012043 | dt: 1454.38ms | tok/sec: 2816.33 | norm: 1.73\n", "step4846 | loss: 0.031902726739645004 | dt: 1452.98ms | tok/sec: 2819.03 | norm: 1.62\n", "step4847 | loss: 0.03404323756694794 | dt: 1454.71ms | tok/sec: 2815.68 | norm: 1.57\n", "step4848 | loss: 0.03249155730009079 | dt: 1437.86ms | tok/sec: 2848.67 | norm: 1.51\n", "step4849 | loss: 0.05153582617640495 | dt: 1437.93ms | tok/sec: 2848.55 | norm: 1.54\n", "step4850 | loss: 0.03602244332432747 | dt: 1450.19ms | tok/sec: 2824.47 | norm: 1.52\n", "step4851 | loss: 0.04158301278948784 | dt: 1449.96ms | tok/sec: 2824.90 | norm: 1.68\n", "step4852 | loss: 0.03598809987306595 | dt: 1447.32ms | tok/sec: 2830.06 | norm: 1.54\n", "step4853 | loss: 0.046797193586826324 | dt: 1456.46ms | tok/sec: 2812.31 | norm: 1.88\n", "step4854 | loss: 0.036955684423446655 | dt: 1442.84ms | tok/sec: 2838.84 | norm: 1.66\n", "step4855 | loss: 0.04216228425502777 | dt: 1445.21ms | tok/sec: 2834.20 | norm: 1.56\n", "step4856 | loss: 0.026985639706254005 | dt: 1444.64ms | tok/sec: 2835.31 | norm: 1.12\n", "step4857 | loss: 0.03668055310845375 | dt: 1444.51ms | tok/sec: 2835.57 | norm: 1.41\n", "step4858 | loss: 
0.040750883519649506 | dt: 1451.80ms | tok/sec: 2821.32 | norm: 1.35\n", "step4859 | loss: 0.0684012845158577 | dt: 1448.99ms | tok/sec: 2826.80 | norm: 1.80\n", "step4860 | loss: 0.04000108689069748 | dt: 1452.12ms | tok/sec: 2820.70 | norm: 1.50\n", "step4861 | loss: 0.03984519839286804 | dt: 1444.53ms | tok/sec: 2835.52 | norm: 1.61\n", "step4862 | loss: 0.03528119996190071 | dt: 1452.40ms | tok/sec: 2820.16 | norm: 1.55\n", "step4863 | loss: 0.04417164623737335 | dt: 1451.59ms | tok/sec: 2821.73 | norm: 1.60\n", "step4864 | loss: 0.03986363857984543 | dt: 1450.00ms | tok/sec: 2824.82 | norm: 1.56\n", "step4865 | loss: 0.037651680409908295 | dt: 1442.20ms | tok/sec: 2840.10 | norm: 1.52\n", "step4866 | loss: 0.033408019691705704 | dt: 1453.10ms | tok/sec: 2818.79 | norm: 1.42\n", "step4867 | loss: 0.039353836327791214 | dt: 1452.49ms | tok/sec: 2819.99 | norm: 1.51\n", "step4868 | loss: 0.03489718586206436 | dt: 1445.22ms | tok/sec: 2834.17 | norm: 1.53\n", "step4869 | loss: 0.04669487476348877 | dt: 1448.94ms | tok/sec: 2826.89 | norm: 1.97\n", "step4870 | loss: 0.039583902806043625 | dt: 1450.69ms | tok/sec: 2823.48 | norm: 1.33\n", "step4871 | loss: 0.04204916954040527 | dt: 1447.81ms | tok/sec: 2829.09 | norm: 1.67\n", "step4872 | loss: 0.026924964040517807 | dt: 1454.67ms | tok/sec: 2815.76 | norm: 1.15\n", "step4873 | loss: 0.03993462026119232 | dt: 1451.98ms | tok/sec: 2820.98 | norm: 1.17\n", "step4874 | loss: 0.030027559027075768 | dt: 1453.95ms | tok/sec: 2817.16 | norm: 1.21\n", "step4875 | loss: 0.03656815364956856 | dt: 1452.69ms | tok/sec: 2819.60 | norm: 1.55\n", "step4876 | loss: 0.02686552330851555 | dt: 1448.64ms | tok/sec: 2827.49 | norm: 1.10\n", "step4877 | loss: 0.03265635296702385 | dt: 1449.21ms | tok/sec: 2826.37 | norm: 1.54\n", "step4878 | loss: 0.03919566050171852 | dt: 1456.34ms | tok/sec: 2812.53 | norm: 1.35\n", "step4879 | loss: 0.05715072527527809 | dt: 1438.29ms | tok/sec: 2847.82 | norm: 1.91\n", "step4880 | loss: 
0.04590975493192673 | dt: 1447.54ms | tok/sec: 2829.62 | norm: 1.62\n", "step4881 | loss: 0.037989817559719086 | dt: 1441.15ms | tok/sec: 2842.18 | norm: 1.92\n", "step4882 | loss: 0.04141993820667267 | dt: 1448.61ms | tok/sec: 2827.54 | norm: 1.49\n", "step4883 | loss: 0.04804278910160065 | dt: 1444.54ms | tok/sec: 2835.51 | norm: 1.64\n", "step4884 | loss: 0.048559073358774185 | dt: 1450.65ms | tok/sec: 2823.56 | norm: 1.27\n", "step4885 | loss: 0.03376181051135063 | dt: 1445.33ms | tok/sec: 2833.96 | norm: 1.37\n", "step4886 | loss: 0.03340127691626549 | dt: 1450.79ms | tok/sec: 2823.28 | norm: 1.31\n", "step4887 | loss: 0.029878007248044014 | dt: 1446.68ms | tok/sec: 2831.31 | norm: 1.27\n", "step4888 | loss: 0.03501550108194351 | dt: 1454.58ms | tok/sec: 2815.92 | norm: 1.49\n", "step4889 | loss: 0.04321124777197838 | dt: 1452.42ms | tok/sec: 2820.12 | norm: 1.61\n", "step4890 | loss: 0.039458267390728 | dt: 1442.33ms | tok/sec: 2839.85 | norm: 1.74\n", "step4891 | loss: 0.03871533274650574 | dt: 1442.83ms | tok/sec: 2838.87 | norm: 1.81\n", "step4892 | loss: 0.03545514866709709 | dt: 1448.63ms | tok/sec: 2827.51 | norm: 1.50\n", "step4893 | loss: 0.04697611182928085 | dt: 1454.69ms | tok/sec: 2815.72 | norm: 1.81\n", "step4894 | loss: 0.038711488246917725 | dt: 1446.68ms | tok/sec: 2831.31 | norm: 1.17\n", "step4895 | loss: 0.03258107975125313 | dt: 1443.32ms | tok/sec: 2837.90 | norm: 1.58\n", "step4896 | loss: 0.036034855991601944 | dt: 1447.91ms | tok/sec: 2828.91 | norm: 1.37\n", "step4897 | loss: 0.03396927937865257 | dt: 1439.58ms | tok/sec: 2845.27 | norm: 1.51\n", "step4898 | loss: 0.034478504210710526 | dt: 1448.96ms | tok/sec: 2826.86 | norm: 1.38\n", "step4899 | loss: 0.03854835778474808 | dt: 1458.54ms | tok/sec: 2808.30 | norm: 1.65\n", "step4900 | loss: 0.036093514412641525 | dt: 1445.14ms | tok/sec: 2834.33 | norm: 1.34\n", "step4901 | loss: 0.036508046090602875 | dt: 1455.94ms | tok/sec: 2813.31 | norm: 1.58\n", "step4902 | loss: 
0.031139761209487915 | dt: 1451.67ms | tok/sec: 2821.57 | norm: 1.57\n", "step4903 | loss: 0.03107447177171707 | dt: 1442.39ms | tok/sec: 2839.72 | norm: 1.43\n", "step4904 | loss: 0.03162713721394539 | dt: 1447.22ms | tok/sec: 2830.25 | norm: 1.45\n", "step4905 | loss: 0.03240946680307388 | dt: 1461.69ms | tok/sec: 2802.24 | norm: 1.48\n", "step4906 | loss: 0.02672940492630005 | dt: 1449.25ms | tok/sec: 2826.29 | norm: 1.54\n", "step4907 | loss: 0.04332994297146797 | dt: 1449.52ms | tok/sec: 2825.76 | norm: 1.50\n", "step4908 | loss: 0.03362629562616348 | dt: 1451.04ms | tok/sec: 2822.80 | norm: 1.48\n", "step4909 | loss: 0.044544707983732224 | dt: 1452.64ms | tok/sec: 2819.70 | norm: 1.60\n", "step4910 | loss: 0.04711995646357536 | dt: 1454.01ms | tok/sec: 2817.05 | norm: 1.75\n", "step4911 | loss: 0.04698127508163452 | dt: 1455.76ms | tok/sec: 2813.65 | norm: 2.00\n", "step4912 | loss: 0.03648590296506882 | dt: 1450.36ms | tok/sec: 2824.12 | norm: 1.56\n", "step4913 | loss: 0.03563138097524643 | dt: 1452.04ms | tok/sec: 2820.87 | norm: 1.45\n", "step4914 | loss: 0.026474321261048317 | dt: 1454.76ms | tok/sec: 2815.58 | norm: 1.05\n", "step4915 | loss: 0.03167928382754326 | dt: 1451.77ms | tok/sec: 2821.38 | norm: 1.21\n", "step4916 | loss: 0.03643285483121872 | dt: 1442.47ms | tok/sec: 2839.57 | norm: 1.72\n", "step4917 | loss: 0.031541433185338974 | dt: 1446.45ms | tok/sec: 2831.77 | norm: 1.29\n", "step4918 | loss: 0.03616920858621597 | dt: 1450.49ms | tok/sec: 2823.88 | norm: 1.80\n", "step4919 | loss: 0.03487689793109894 | dt: 1439.31ms | tok/sec: 2845.81 | norm: 1.13\n", "step4920 | loss: 0.034137945622205734 | dt: 1450.19ms | tok/sec: 2824.46 | norm: 1.32\n", "step4921 | loss: 0.029882552102208138 | dt: 1440.33ms | tok/sec: 2843.79 | norm: 1.26\n", "step4922 | loss: 0.03583460673689842 | dt: 1444.57ms | tok/sec: 2835.44 | norm: 1.49\n", "step4923 | loss: 0.039212893694639206 | dt: 1439.64ms | tok/sec: 2845.15 | norm: 1.90\n", "step4924 | loss: 
0.03423886001110077 | dt: 1447.90ms | tok/sec: 2828.92 | norm: 1.40\n", "step4925 | loss: 0.03412328660488129 | dt: 1450.07ms | tok/sec: 2824.70 | norm: 1.71\n", "step4926 | loss: 0.03868470713496208 | dt: 1448.58ms | tok/sec: 2827.60 | norm: 1.82\n", "step4927 | loss: 0.040297843515872955 | dt: 1452.48ms | tok/sec: 2820.00 | norm: 1.58\n", "step4928 | loss: 0.03383079171180725 | dt: 1453.68ms | tok/sec: 2817.68 | norm: 1.51\n", "step4929 | loss: 0.026485690847039223 | dt: 1453.24ms | tok/sec: 2818.53 | norm: 1.19\n", "step4930 | loss: 0.029485439881682396 | dt: 1434.19ms | tok/sec: 2855.97 | norm: 1.46\n", "step4931 | loss: 0.04106828570365906 | dt: 1443.46ms | tok/sec: 2837.62 | norm: 1.70\n", "step4932 | loss: 0.030249832198023796 | dt: 1456.02ms | tok/sec: 2813.14 | norm: 1.30\n", "step4933 | loss: 0.035602688789367676 | dt: 1458.84ms | tok/sec: 2807.70 | norm: 1.35\n", "step4934 | loss: 0.036214157938957214 | dt: 1448.03ms | tok/sec: 2828.68 | norm: 1.37\n", "step4935 | loss: 0.04518646001815796 | dt: 1449.48ms | tok/sec: 2825.84 | norm: 1.63\n", "step4936 | loss: 0.039303865283727646 | dt: 1448.99ms | tok/sec: 2826.80 | norm: 1.28\n", "step4937 | loss: 0.03581172600388527 | dt: 1452.03ms | tok/sec: 2820.87 | norm: 1.51\n", "step4938 | loss: 0.0286465585231781 | dt: 1445.60ms | tok/sec: 2833.42 | norm: 1.39\n", "step4939 | loss: 0.034871987998485565 | dt: 1455.58ms | tok/sec: 2813.99 | norm: 1.53\n", "step4940 | loss: 0.03663007915019989 | dt: 1456.38ms | tok/sec: 2812.45 | norm: 1.30\n", "step4941 | loss: 0.06455272436141968 | dt: 1447.54ms | tok/sec: 2829.63 | norm: 1.86\n", "step4942 | loss: 0.039463385939598083 | dt: 1450.49ms | tok/sec: 2823.88 | norm: 1.50\n", "step4943 | loss: 0.039129119366407394 | dt: 1459.66ms | tok/sec: 2806.13 | norm: 1.76\n", "step4944 | loss: 0.03456457331776619 | dt: 1454.93ms | tok/sec: 2815.26 | norm: 1.87\n", "step4945 | loss: 0.03574267774820328 | dt: 1453.95ms | tok/sec: 2817.16 | norm: 1.56\n", "step4946 | loss: 
0.03106934390962124 | dt: 1454.23ms | tok/sec: 2816.61 | norm: 1.37\n", "step4947 | loss: 0.03541899099946022 | dt: 1453.66ms | tok/sec: 2817.72 | norm: 1.65\n", "step4948 | loss: 0.04384220018982887 | dt: 1455.56ms | tok/sec: 2814.03 | norm: 1.95\n", "step4949 | loss: 0.029937218874692917 | dt: 1439.91ms | tok/sec: 2844.62 | norm: 1.35\n", "step4950 | loss: 0.04002414643764496 | dt: 1455.23ms | tok/sec: 2814.68 | norm: 1.64\n", "step4951 | loss: 0.03731998801231384 | dt: 1447.12ms | tok/sec: 2830.46 | norm: 1.42\n", "step4952 | loss: 0.0370471365749836 | dt: 1452.85ms | tok/sec: 2819.30 | norm: 1.61\n", "step4953 | loss: 0.033864714205265045 | dt: 1450.25ms | tok/sec: 2824.35 | norm: 1.33\n", "step4954 | loss: 0.023745954036712646 | dt: 1441.50ms | tok/sec: 2841.49 | norm: 1.14\n", "step4955 | loss: 0.03224215656518936 | dt: 1450.18ms | tok/sec: 2824.48 | norm: 1.39\n", "step4956 | loss: 0.030647439882159233 | dt: 1458.83ms | tok/sec: 2807.72 | norm: 1.38\n", "step4957 | loss: 0.03154198080301285 | dt: 1454.01ms | tok/sec: 2817.04 | norm: 1.47\n", "step4958 | loss: 0.034196436405181885 | dt: 1451.64ms | tok/sec: 2821.63 | norm: 1.73\n", "step4959 | loss: 0.033527083694934845 | dt: 1454.38ms | tok/sec: 2816.33 | norm: 1.30\n", "step4960 | loss: 0.02660149708390236 | dt: 1457.72ms | tok/sec: 2809.86 | norm: 1.12\n", "step4961 | loss: 0.0454915389418602 | dt: 1454.81ms | tok/sec: 2815.48 | norm: 1.76\n", "step4962 | loss: 0.04723886027932167 | dt: 1444.37ms | tok/sec: 2835.84 | norm: 1.85\n", "step4963 | loss: 0.04280706122517586 | dt: 1450.61ms | tok/sec: 2823.64 | norm: 1.97\n", "step4964 | loss: 0.04300900176167488 | dt: 1445.53ms | tok/sec: 2833.56 | norm: 1.76\n", "step4965 | loss: 0.04513592645525932 | dt: 1448.68ms | tok/sec: 2827.41 | norm: 1.68\n", "step4966 | loss: 0.036788783967494965 | dt: 1454.66ms | tok/sec: 2815.78 | norm: 1.46\n", "step4967 | loss: 0.037452761083841324 | dt: 1450.43ms | tok/sec: 2823.98 | norm: 1.63\n", "step4968 | loss: 
0.0457962304353714 | dt: 1460.94ms | tok/sec: 2803.68 | norm: 1.62\n", "step4969 | loss: 0.02945890463888645 | dt: 1457.88ms | tok/sec: 2809.57 | norm: 1.27\n", "step4970 | loss: 0.03190688043832779 | dt: 1447.50ms | tok/sec: 2829.70 | norm: 1.24\n", "step4971 | loss: 0.0443829670548439 | dt: 1456.07ms | tok/sec: 2813.05 | norm: 1.81\n", "step4972 | loss: 0.036541521549224854 | dt: 1442.92ms | tok/sec: 2838.69 | norm: 1.84\n", "step4973 | loss: 0.03394465520977974 | dt: 1449.11ms | tok/sec: 2826.55 | norm: 1.73\n", "step4974 | loss: 0.03413078933954239 | dt: 1445.75ms | tok/sec: 2833.13 | norm: 1.40\n", "step4975 | loss: 0.051569726318120956 | dt: 1442.15ms | tok/sec: 2840.21 | norm: 1.96\n", "step4976 | loss: 0.042063742876052856 | dt: 1445.54ms | tok/sec: 2833.55 | norm: 2.00\n", "step4977 | loss: 0.03334908187389374 | dt: 1454.38ms | tok/sec: 2816.33 | norm: 1.34\n", "step4978 | loss: 0.03707233443856239 | dt: 1449.82ms | tok/sec: 2825.18 | norm: 1.54\n", "step4979 | loss: 0.03129411116242409 | dt: 1455.80ms | tok/sec: 2813.57 | norm: 1.45\n", "step4980 | loss: 0.035208459943532944 | dt: 1442.77ms | tok/sec: 2838.99 | norm: 1.44\n", "step4981 | loss: 0.03958768770098686 | dt: 1449.53ms | tok/sec: 2825.74 | norm: 1.34\n", "step4982 | loss: 0.03986775502562523 | dt: 1457.52ms | tok/sec: 2810.25 | norm: 1.34\n", "step4983 | loss: 0.0324605330824852 | dt: 1447.62ms | tok/sec: 2829.47 | norm: 1.22\n", "step4984 | loss: 0.03870119899511337 | dt: 1455.59ms | tok/sec: 2813.99 | norm: 1.82\n", "step4985 | loss: 0.03623031824827194 | dt: 1453.12ms | tok/sec: 2818.76 | norm: 1.67\n", "step4986 | loss: 0.03630652278661728 | dt: 1453.15ms | tok/sec: 2818.70 | norm: 1.63\n", "step4987 | loss: 0.033093880861997604 | dt: 1453.80ms | tok/sec: 2817.44 | norm: 1.55\n", "step4988 | loss: 0.031268928200006485 | dt: 1461.06ms | tok/sec: 2803.44 | norm: 1.48\n", "step4989 | loss: 0.039961330592632294 | dt: 1448.11ms | tok/sec: 2828.51 | norm: 1.48\n", "step4990 | loss: 
0.0543203130364418 | dt: 1450.03ms | tok/sec: 2824.77 | norm: 1.98\n", "step4991 | loss: 0.04185006394982338 | dt: 1455.18ms | tok/sec: 2814.77 | norm: 1.57\n", "step4992 | loss: 0.03607906773686409 | dt: 1454.33ms | tok/sec: 2816.42 | norm: 1.32\n", "step4993 | loss: 0.04396853595972061 | dt: 1449.67ms | tok/sec: 2825.47 | norm: 1.53\n", "step4994 | loss: 0.03798677399754524 | dt: 1453.66ms | tok/sec: 2817.71 | norm: 1.38\n", "step4995 | loss: 0.03803092986345291 | dt: 1449.31ms | tok/sec: 2826.17 | norm: 1.57\n", "step4996 | loss: 0.03892425447702408 | dt: 1448.30ms | tok/sec: 2828.15 | norm: 1.62\n", "step4997 | loss: 0.04263422265648842 | dt: 1447.34ms | tok/sec: 2830.02 | norm: 1.60\n", "step4998 | loss: 0.0410090796649456 | dt: 1447.12ms | tok/sec: 2830.45 | norm: 1.44\n", "step4999 | loss: 0.03453453257679939 | dt: 1440.25ms | tok/sec: 2843.95 | norm: 1.73\n", "tensor(0.0345, device='cuda:0', grad_fn=)\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "GPT(\n", " (transformer): ModuleDict(\n", " (wte): Embedding(50304, 768)\n", " (wpe): Embedding(1024, 768)\n", " (h): ModuleList(\n", " (0-11): 12 x Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): CausalSelfAttention(\n", " (c_attn): Linear(in_features=768, out_features=2304, bias=True)\n", " (c_proj): Linear(in_features=768, out_features=768, bias=True)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): MLP(\n", " (c_fc): Linear(in_features=768, out_features=3072, bias=True)\n", " (gelu): GELU(approximate='tanh')\n", " (c_proj): Linear(in_features=3072, out_features=768, bias=True)\n", " )\n", " )\n", " )\n", " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " )\n", " (lm_head): Linear(in_features=768, out_features=50304, bias=False)\n", ")" ] }, "metadata": {}, "execution_count": 10 } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "HI7HTSaYJom9" }, "execution_count": null, 
"outputs": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "colab": { "provenance": [], "gpuType": "T4" }, "accelerator": "GPU" }, "nbformat": 4, "nbformat_minor": 0 }