{ "best_metric": 1.5648810863494873, "best_model_checkpoint": "miner_id_24/checkpoint-2500", "epoch": 0.6214626567528576, "eval_steps": 100, "global_step": 2800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002219509488403063, "eval_loss": 4.4030327796936035, "eval_runtime": 8.3409, "eval_samples_per_second": 21.82, "eval_steps_per_second": 10.91, "step": 1 }, { "epoch": 0.002219509488403063, "grad_norm": 19.423051834106445, "learning_rate": 6.666666740784422e-05, "loss": 17.4736, "step": 10 }, { "epoch": 0.004439018976806126, "grad_norm": 23.127628326416016, "learning_rate": 0.00013333333481568843, "loss": 14.5885, "step": 20 }, { "epoch": 0.006658528465209188, "grad_norm": 26.64627456665039, "learning_rate": 0.00019999999494757503, "loss": 12.7309, "step": 30 }, { "epoch": 0.008878037953612252, "grad_norm": 28.304908752441406, "learning_rate": 0.0001999997184611857, "loss": 11.2968, "step": 40 }, { "epoch": 0.011097547442015314, "grad_norm": 34.180362701416016, "learning_rate": 0.0001999989035539329, "loss": 9.4993, "step": 50 }, { "epoch": 0.013317056930418377, "grad_norm": 16.596452713012695, "learning_rate": 0.00019999755022581667, "loss": 16.039, "step": 60 }, { "epoch": 0.015536566418821441, "grad_norm": 20.411502838134766, "learning_rate": 0.00019999565847683698, "loss": 13.3691, "step": 70 }, { "epoch": 0.017756075907224503, "grad_norm": 21.839550018310547, "learning_rate": 0.00019999321375507861, "loss": 12.2657, "step": 80 }, { "epoch": 0.019975585395627566, "grad_norm": 30.097143173217773, "learning_rate": 0.00019999021606054157, "loss": 9.0958, "step": 90 }, { "epoch": 0.02219509488403063, "grad_norm": 28.231836318969727, "learning_rate": 0.00019998670904897153, "loss": 6.5189, "step": 100 }, { "epoch": 0.02219509488403063, "eval_loss": 2.693746328353882, "eval_runtime": 8.3274, "eval_samples_per_second": 21.856, "eval_steps_per_second": 10.928, "step": 100 }, { "epoch": 0.02441460437243369, "grad_norm": 14.151446342468262, "learning_rate": 0.00019998261996079236, "loss": 13.8854, "step": 110 }, { "epoch": 0.026634113860836754, "grad_norm": 22.46508026123047, "learning_rate": 0.00019997800700366497, "loss": 12.2383, "step": 120 }, { "epoch": 0.02885362334923982, "grad_norm": 28.271400451660156, "learning_rate": 0.00019997287017758936, "loss": 9.6926, "step": 130 }, { "epoch": 0.031073132837642882, "grad_norm": 22.02579116821289, "learning_rate": 0.00019996716582681984, "loss": 11.2338, "step": 140 }, { "epoch": 0.033292642326045944, "grad_norm": 23.614730834960938, "learning_rate": 0.00019996092305518687, "loss": 6.9779, "step": 150 }, { "epoch": 0.03551215181444901, "grad_norm": 14.604187965393066, "learning_rate": 0.00019995414186269045, "loss": 12.6653, "step": 160 }, { "epoch": 0.03773166130285207, "grad_norm": 14.590351104736328, "learning_rate": 0.00019994682224933058, "loss": 11.8954, "step": 170 }, { "epoch": 0.03995117079125513, "grad_norm": 17.893314361572266, "learning_rate": 0.00019993894966319203, "loss": 9.7361, "step": 180 }, { "epoch": 0.042170680279658194, "grad_norm": 18.059738159179688, "learning_rate": 0.00019993053865619004, "loss": 9.0807, "step": 190 }, { "epoch": 0.04439018976806126, "grad_norm": 30.495603561401367, "learning_rate": 0.00019992157467640936, "loss": 7.8292, "step": 200 }, { "epoch": 0.04439018976806126, "eval_loss": 2.4313313961029053, "eval_runtime": 8.3371, "eval_samples_per_second": 21.83, "eval_steps_per_second": 10.915, "step": 200 }, { "epoch": 0.04660969925646432, "grad_norm": 13.674242973327637, "learning_rate": 0.00019991207227576524, "loss": 13.2771, "step": 210 }, { "epoch": 0.04882920874486738, "grad_norm": 19.946685791015625, "learning_rate": 0.0001999020460061729, "loss": 11.7299, "step": 220 }, { "epoch": 0.051048718233270444, "grad_norm": 15.479765892028809, "learning_rate": 0.00019989146676380187, "loss": 10.6449, "step": 230 }, { "epoch": 0.05326822772167351, "grad_norm": 18.783924102783203, "learning_rate": 0.0001998803491005674, "loss": 9.4277, "step": 240 }, { "epoch": 0.055487737210076576, "grad_norm": 24.155685424804688, "learning_rate": 0.00019986867846455425, "loss": 5.4284, "step": 250 }, { "epoch": 0.05770724669847964, "grad_norm": 12.10494327545166, "learning_rate": 0.00019985646940767765, "loss": 13.0231, "step": 260 }, { "epoch": 0.0599267561868827, "grad_norm": 15.322505950927734, "learning_rate": 0.0001998437219299376, "loss": 10.603, "step": 270 }, { "epoch": 0.062146265675285764, "grad_norm": 17.511821746826172, "learning_rate": 0.0001998304360313341, "loss": 9.9269, "step": 280 }, { "epoch": 0.06436577516368883, "grad_norm": 23.818315505981445, "learning_rate": 0.00019981659715995193, "loss": 8.9873, "step": 290 }, { "epoch": 0.06658528465209189, "grad_norm": 30.77426528930664, "learning_rate": 0.0001998022198677063, "loss": 5.3937, "step": 300 }, { "epoch": 0.06658528465209189, "eval_loss": 2.311836004257202, "eval_runtime": 8.3175, "eval_samples_per_second": 21.882, "eval_steps_per_second": 10.941, "step": 300 }, { "epoch": 0.06880479414049495, "grad_norm": 16.257707595825195, "learning_rate": 0.00019978731870651245, "loss": 12.1633, "step": 310 }, { "epoch": 0.07102430362889801, "grad_norm": 15.266409873962402, "learning_rate": 0.0001997718500206247, "loss": 10.1401, "step": 320 }, { "epoch": 0.07324381311730108, "grad_norm": 16.789775848388672, "learning_rate": 0.00019975585746578872, "loss": 9.8536, "step": 330 }, { "epoch": 0.07546332260570414, "grad_norm": 22.06035804748535, "learning_rate": 0.0001997393264900893, "loss": 8.1284, "step": 340 }, { "epoch": 0.0776828320941072, "grad_norm": 22.819156646728516, "learning_rate": 0.0001997222425416112, "loss": 5.9175, "step": 350 }, { "epoch": 0.07990234158251026, "grad_norm": 15.538501739501953, "learning_rate": 0.00019970462017226964, "loss": 11.6068, "step": 360 }, { "epoch": 0.08212185107091333, "grad_norm": 17.851747512817383, "learning_rate": 0.0001996864448301494, "loss": 11.1313, "step": 370 }, { "epoch": 0.08434136055931639, "grad_norm": 19.84052085876465, "learning_rate": 0.00019966774561908096, "loss": 9.4212, "step": 380 }, { "epoch": 0.08656087004771945, "grad_norm": 20.188945770263672, "learning_rate": 0.00019964850798714906, "loss": 8.6834, "step": 390 }, { "epoch": 0.08878037953612251, "grad_norm": 27.768688201904297, "learning_rate": 0.00019962871738243848, "loss": 7.8199, "step": 400 }, { "epoch": 0.08878037953612251, "eval_loss": 2.2124600410461426, "eval_runtime": 8.3319, "eval_samples_per_second": 21.844, "eval_steps_per_second": 10.922, "step": 400 }, { "epoch": 0.09099988902452558, "grad_norm": 17.140703201293945, "learning_rate": 0.00019960838835686445, "loss": 11.0931, "step": 410 }, { "epoch": 0.09321939851292864, "grad_norm": 16.76500701904297, "learning_rate": 0.00019958752091042697, "loss": 10.1156, "step": 420 }, { "epoch": 0.0954389080013317, "grad_norm": 17.35622215270996, "learning_rate": 0.00019956611504312605, "loss": 9.7744, "step": 430 }, { "epoch": 0.09765841748973476, "grad_norm": 21.446239471435547, "learning_rate": 0.00019954415620304644, "loss": 8.4753, "step": 440 }, { "epoch": 0.09987792697813783, "grad_norm": 33.42231750488281, "learning_rate": 0.00019952167349401861, "loss": 6.0182, "step": 450 }, { "epoch": 0.10209743646654089, "grad_norm": 14.635339736938477, "learning_rate": 0.00019949865236412734, "loss": 11.5906, "step": 460 }, { "epoch": 0.10431694595494395, "grad_norm": 18.52564239501953, "learning_rate": 0.00019947507826145738, "loss": 10.0752, "step": 470 }, { "epoch": 0.10653645544334701, "grad_norm": 21.81144142150879, "learning_rate": 0.00019945096573792398, "loss": 9.8854, "step": 480 }, { "epoch": 0.10875596493175008, "grad_norm": 17.849056243896484, "learning_rate": 0.00019942631479352713, "loss": 8.6266, "step": 490 }, { "epoch": 0.11097547442015315, "grad_norm": 20.789081573486328, "learning_rate": 0.00019940112542826682, "loss": 5.4877, "step": 500 }, { "epoch": 0.11097547442015315, "eval_loss": 2.170618772506714, "eval_runtime": 8.3182, "eval_samples_per_second": 21.88, "eval_steps_per_second": 10.94, "step": 500 }, { "epoch": 0.11319498390855622, "grad_norm": 20.524538040161133, "learning_rate": 0.00019937539764214307, "loss": 11.6729, "step": 510 }, { "epoch": 0.11541449339695928, "grad_norm": 15.974942207336426, "learning_rate": 0.00019934913143515587, "loss": 9.8887, "step": 520 }, { "epoch": 0.11763400288536234, "grad_norm": 15.53352165222168, "learning_rate": 0.00019932232680730522, "loss": 8.4546, "step": 530 }, { "epoch": 0.1198535123737654, "grad_norm": 19.57039451599121, "learning_rate": 0.00019929498375859112, "loss": 8.0938, "step": 540 }, { "epoch": 0.12207302186216847, "grad_norm": 25.024276733398438, "learning_rate": 0.00019926710228901356, "loss": 6.0121, "step": 550 }, { "epoch": 0.12429253135057153, "grad_norm": 16.290878295898438, "learning_rate": 0.00019923868239857256, "loss": 11.2073, "step": 560 }, { "epoch": 0.12651204083897458, "grad_norm": 16.25860595703125, "learning_rate": 0.00019920970953535289, "loss": 9.4554, "step": 570 }, { "epoch": 0.12873155032737765, "grad_norm": 17.02968406677246, "learning_rate": 0.00019918021280318499, "loss": 8.9029, "step": 580 }, { "epoch": 0.1309510598157807, "grad_norm": 21.1275691986084, "learning_rate": 0.00019915017765015364, "loss": 7.7254, "step": 590 }, { "epoch": 0.13317056930418378, "grad_norm": 34.23072052001953, "learning_rate": 0.00019911960407625884, "loss": 4.9892, "step": 600 }, { "epoch": 0.13317056930418378, "eval_loss": 2.0679807662963867, "eval_runtime": 8.339, "eval_samples_per_second": 21.825, "eval_steps_per_second": 10.913, "step": 600 }, { "epoch": 0.13539007879258683, "grad_norm": 16.453622817993164, "learning_rate": 0.0001990884920815006, "loss": 10.7183, "step": 610 }, { "epoch": 0.1376095882809899, "grad_norm": 15.334869384765625, "learning_rate": 0.0001990568416658789, "loss": 9.7786, "step": 620 }, { "epoch": 0.13982909776939295, "grad_norm": 17.854019165039062, "learning_rate": 0.00019902463827747852, "loss": 8.5852, "step": 630 }, { "epoch": 0.14204860725779603, "grad_norm": 21.40219497680664, "learning_rate": 0.00019899191102012992, "loss": 7.5876, "step": 640 }, { "epoch": 0.14426811674619908, "grad_norm": 21.883670806884766, "learning_rate": 0.00019895864534191787, "loss": 4.1908, "step": 650 }, { "epoch": 0.14648762623460215, "grad_norm": 14.67003059387207, "learning_rate": 0.00019892484124284238, "loss": 11.5433, "step": 660 }, { "epoch": 0.14870713572300523, "grad_norm": 14.945003509521484, "learning_rate": 0.00019889049872290343, "loss": 10.0715, "step": 670 }, { "epoch": 0.15092664521140828, "grad_norm": 20.415084838867188, "learning_rate": 0.00019885563233401626, "loss": 8.4611, "step": 680 }, { "epoch": 0.15314615469981135, "grad_norm": 20.929723739624023, "learning_rate": 0.00019882021297235042, "loss": 7.1908, "step": 690 }, { "epoch": 0.1553656641882144, "grad_norm": 24.06029510498047, "learning_rate": 0.00019878426974173635, "loss": 5.6608, "step": 700 }, { "epoch": 0.1553656641882144, "eval_loss": 2.051711082458496, "eval_runtime": 8.3234, "eval_samples_per_second": 21.866, "eval_steps_per_second": 10.933, "step": 700 }, { "epoch": 0.15758517367661748, "grad_norm": 13.85684585571289, "learning_rate": 0.00019874778809025884, "loss": 11.1821, "step": 710 }, { "epoch": 0.15980468316502053, "grad_norm": 19.026765823364258, "learning_rate": 0.00019871075346600264, "loss": 9.8956, "step": 720 }, { "epoch": 0.1620241926534236, "grad_norm": 13.71843147277832, "learning_rate": 0.00019867320952471346, "loss": 8.0162, "step": 730 }, { "epoch": 0.16424370214182665, "grad_norm": 20.6693115234375, "learning_rate": 0.00019863512716256082, "loss": 8.3939, "step": 740 }, { "epoch": 0.16646321163022973, "grad_norm": 34.8355598449707, "learning_rate": 0.00019859647727571428, "loss": 4.6651, "step": 750 }, { "epoch": 0.16868272111863278, "grad_norm": 16.13817024230957, "learning_rate": 0.00019855731807183474, "loss": 11.131, "step": 760 }, { "epoch": 0.17090223060703585, "grad_norm": 13.475647926330566, "learning_rate": 0.00019851762044709176, "loss": 10.388, "step": 770 }, { "epoch": 0.1731217400954389, "grad_norm": 17.93106460571289, "learning_rate": 0.00019847739895340055, "loss": 7.349, "step": 780 }, { "epoch": 0.17534124958384198, "grad_norm": 20.98259162902832, "learning_rate": 0.0001984366390388459, "loss": 8.0287, "step": 790 }, { "epoch": 0.17756075907224503, "grad_norm": 25.769512176513672, "learning_rate": 0.00019839532615151256, "loss": 4.671, "step": 800 }, { "epoch": 0.17756075907224503, "eval_loss": 2.024151563644409, "eval_runtime": 8.3368, "eval_samples_per_second": 21.831, "eval_steps_per_second": 10.916, "step": 800 }, { "epoch": 0.1797802685606481, "grad_norm": 15.433488845825195, "learning_rate": 0.00019835350394714624, "loss": 10.0871, "step": 810 }, { "epoch": 0.18199977804905115, "grad_norm": 15.846505165100098, "learning_rate": 0.000198311114218086, "loss": 10.2823, "step": 820 }, { "epoch": 0.18421928753745423, "grad_norm": 13.342632293701172, "learning_rate": 0.000198268229723908, "loss": 8.5304, "step": 830 }, { "epoch": 0.18643879702585728, "grad_norm": 26.68543243408203, "learning_rate": 0.00019822479225695133, "loss": 7.9283, "step": 840 }, { "epoch": 0.18865830651426035, "grad_norm": 41.6745719909668, "learning_rate": 0.00019818083092104644, "loss": 5.7695, "step": 850 }, { "epoch": 0.1908778160026634, "grad_norm": 14.910416603088379, "learning_rate": 0.0001981363311642781, "loss": 11.2911, "step": 860 }, { "epoch": 0.19309732549106648, "grad_norm": 16.549863815307617, "learning_rate": 0.0001980912929866463, "loss": 9.4357, "step": 870 }, { "epoch": 0.19531683497946953, "grad_norm": 17.58315658569336, "learning_rate": 0.00019804571638815105, "loss": 8.1612, "step": 880 }, { "epoch": 0.1975363444678726, "grad_norm": 23.82792854309082, "learning_rate": 0.00019799961592070758, "loss": 8.0985, "step": 890 }, { "epoch": 0.19975585395627565, "grad_norm": 26.863176345825195, "learning_rate": 0.0001979529915843159, "loss": 6.3505, "step": 900 }, { "epoch": 0.19975585395627565, "eval_loss": 1.8601369857788086, "eval_runtime": 8.3313, "eval_samples_per_second": 21.845, "eval_steps_per_second": 10.923, "step": 900 }, { "epoch": 0.20197536344467873, "grad_norm": 20.289323806762695, "learning_rate": 0.00019790582882706076, "loss": 10.5748, "step": 910 }, { "epoch": 0.20419487293308178, "grad_norm": 15.238377571105957, "learning_rate": 0.0001978581422008574, "loss": 8.5886, "step": 920 }, { "epoch": 0.20641438242148485, "grad_norm": 18.25593376159668, "learning_rate": 0.00019780990260187536, "loss": 8.2022, "step": 930 }, { "epoch": 0.2086338919098879, "grad_norm": 22.832481384277344, "learning_rate": 0.00019776115368586034, "loss": 6.4328, "step": 940 }, { "epoch": 0.21085340139829098, "grad_norm": 23.705724716186523, "learning_rate": 0.00019771186634898186, "loss": 4.3975, "step": 950 }, { "epoch": 0.21307291088669403, "grad_norm": 14.876107215881348, "learning_rate": 0.00019766205514315516, "loss": 10.4747, "step": 960 }, { "epoch": 0.2152924203750971, "grad_norm": 17.44968032836914, "learning_rate": 0.000197611705516465, "loss": 8.854, "step": 970 }, { "epoch": 0.21751192986350015, "grad_norm": 16.14581298828125, "learning_rate": 0.00019756083202082664, "loss": 8.405, "step": 980 }, { "epoch": 0.21973143935190323, "grad_norm": 25.348539352416992, "learning_rate": 0.00019750942010432482, "loss": 6.6679, "step": 990 }, { "epoch": 0.2219509488403063, "grad_norm": 27.95829200744629, "learning_rate": 0.00019745748431887478, "loss": 4.1893, "step": 1000 }, { "epoch": 0.2219509488403063, "eval_loss": 1.8662480115890503, "eval_runtime": 8.3397, "eval_samples_per_second": 21.823, "eval_steps_per_second": 10.912, "step": 1000 }, { "epoch": 0.22417045832870935, "grad_norm": 15.58171272277832, "learning_rate": 0.00019740502466447651, "loss": 10.3423, "step": 1010 }, { "epoch": 0.22638996781711243, "grad_norm": 17.35782814025879, "learning_rate": 0.00019735204114113003, "loss": 9.8724, "step": 1020 }, { "epoch": 0.22860947730551548, "grad_norm": 15.173916816711426, "learning_rate": 0.00019729850464500487, "loss": 7.8662, "step": 1030 }, { "epoch": 0.23082898679391856, "grad_norm": 25.888383865356445, "learning_rate": 0.00019724445883184671, "loss": 7.568, "step": 1040 }, { "epoch": 0.2330484962823216, "grad_norm": 19.61490821838379, "learning_rate": 0.00019718988914974034, "loss": 4.456, "step": 1050 }, { "epoch": 0.23526800577072468, "grad_norm": 16.94732093811035, "learning_rate": 0.0001971347810467705, "loss": 10.5227, "step": 1060 }, { "epoch": 0.23748751525912773, "grad_norm": 17.646886825561523, "learning_rate": 0.00019707914907485247, "loss": 8.809, "step": 1070 }, { "epoch": 0.2397070247475308, "grad_norm": 25.40277862548828, "learning_rate": 0.0001970229932339862, "loss": 7.8792, "step": 1080 }, { "epoch": 0.24192653423593385, "grad_norm": 20.657400131225586, "learning_rate": 0.0001969663135241717, "loss": 6.6584, "step": 1090 }, { "epoch": 0.24414604372433693, "grad_norm": 20.5444278717041, "learning_rate": 0.00019690909539349377, "loss": 4.8748, "step": 1100 }, { "epoch": 0.24414604372433693, "eval_loss": 1.8770936727523804, "eval_runtime": 8.3522, "eval_samples_per_second": 21.791, "eval_steps_per_second": 10.895, "step": 1100 }, { "epoch": 0.24636555321273998, "grad_norm": 15.536696434020996, "learning_rate": 0.0001968513533938676, "loss": 10.905, "step": 1110 }, { "epoch": 0.24858506270114306, "grad_norm": 17.137052536010742, "learning_rate": 0.00019679308752529323, "loss": 10.3092, "step": 1120 }, { "epoch": 0.2508045721895461, "grad_norm": 17.024402618408203, "learning_rate": 0.00019673429778777063, "loss": 8.0488, "step": 1130 }, { "epoch": 0.25302408167794915, "grad_norm": 21.898466110229492, "learning_rate": 0.0001966749841812998, "loss": 7.7142, "step": 1140 }, { "epoch": 0.25524359116635226, "grad_norm": 27.144317626953125, "learning_rate": 0.000196615161257796, "loss": 5.1144, "step": 1150 }, { "epoch": 0.2574631006547553, "grad_norm": 18.38833999633789, "learning_rate": 0.0001965547853615135, "loss": 9.566, "step": 1160 }, { "epoch": 0.25968261014315835, "grad_norm": 20.051612854003906, "learning_rate": 0.000196493900148198, "loss": 9.9055, "step": 1170 }, { "epoch": 0.2619021196315614, "grad_norm": 18.773082733154297, "learning_rate": 0.0001964324910659343, "loss": 9.3898, "step": 1180 }, { "epoch": 0.2641216291199645, "grad_norm": 19.012685775756836, "learning_rate": 0.00019637055811472237, "loss": 7.0832, "step": 1190 }, { "epoch": 0.26634113860836756, "grad_norm": 27.383302688598633, "learning_rate": 0.00019630810129456222, "loss": 5.0172, "step": 1200 }, { "epoch": 0.26634113860836756, "eval_loss": 1.8051999807357788, "eval_runtime": 8.3352, "eval_samples_per_second": 21.835, "eval_steps_per_second": 10.918, "step": 1200 }, { "epoch": 0.2685606480967706, "grad_norm": 14.786659240722656, "learning_rate": 0.00019624513515736908, "loss": 10.51, "step": 1210 }, { "epoch": 0.27078015758517365, "grad_norm": 17.665573120117188, "learning_rate": 0.00019618163059931248, "loss": 9.76, "step": 1220 }, { "epoch": 0.27299966707357676, "grad_norm": 19.493696212768555, "learning_rate": 0.00019611760217230767, "loss": 7.4334, "step": 1230 }, { "epoch": 0.2752191765619798, "grad_norm": 23.978652954101562, "learning_rate": 0.00019605304987635463, "loss": 7.6591, "step": 1240 }, { "epoch": 0.27743868605038285, "grad_norm": 27.781898498535156, "learning_rate": 0.0001959879882633686, "loss": 6.0583, "step": 1250 }, { "epoch": 0.2796581955387859, "grad_norm": 22.41964340209961, "learning_rate": 0.00019592238822951913, "loss": 9.4343, "step": 1260 }, { "epoch": 0.281877705027189, "grad_norm": 39.951873779296875, "learning_rate": 0.0001958562934305519, "loss": 9.1174, "step": 1270 }, { "epoch": 0.28409721451559206, "grad_norm": 19.36490249633789, "learning_rate": 0.0001957896602107212, "loss": 7.9006, "step": 1280 }, { "epoch": 0.2863167240039951, "grad_norm": 21.319568634033203, "learning_rate": 0.0001957225176738575, "loss": 7.5524, "step": 1290 }, { "epoch": 0.28853623349239815, "grad_norm": 38.544281005859375, "learning_rate": 0.00019565483671613038, "loss": 5.2344, "step": 1300 }, { "epoch": 0.28853623349239815, "eval_loss": 1.8126747608184814, "eval_runtime": 8.353, "eval_samples_per_second": 21.789, "eval_steps_per_second": 10.894, "step": 1300 }, { "epoch": 0.29075574298080126, "grad_norm": 11.605786323547363, "learning_rate": 0.00019558664644137025, "loss": 10.6607, "step": 1310 }, { "epoch": 0.2929752524692043, "grad_norm": 15.323630332946777, "learning_rate": 0.00019551794684957713, "loss": 10.1969, "step": 1320 }, { "epoch": 0.29519476195760735, "grad_norm": 22.335023880004883, "learning_rate": 0.0001954487233888358, "loss": 9.1436, "step": 1330 }, { "epoch": 0.29741427144601046, "grad_norm": 20.648021697998047, "learning_rate": 0.00019537899061106145, "loss": 6.0644, "step": 1340 }, { "epoch": 0.2996337809344135, "grad_norm": 18.144540786743164, "learning_rate": 0.0001953087339643389, "loss": 4.6119, "step": 1350 }, { "epoch": 0.30185329042281656, "grad_norm": 16.555744171142578, "learning_rate": 0.0001952379388967529, "loss": 9.6129, "step": 1360 }, { "epoch": 0.3040727999112196, "grad_norm": 16.65943145751953, "learning_rate": 0.00019516664906404912, "loss": 8.9638, "step": 1370 }, { "epoch": 0.3062923093996227, "grad_norm": 20.7447452545166, "learning_rate": 0.00019509484991431236, "loss": 7.8651, "step": 1380 }, { "epoch": 0.30851181888802576, "grad_norm": 18.503721237182617, "learning_rate": 0.00019502251234371215, "loss": 7.5487, "step": 1390 }, { "epoch": 0.3107313283764288, "grad_norm": 22.380367279052734, "learning_rate": 0.00019494966545607895, "loss": 4.4286, "step": 1400 }, { "epoch": 0.3107313283764288, "eval_loss": 1.752911925315857, "eval_runtime": 8.3542, "eval_samples_per_second": 21.785, "eval_steps_per_second": 10.893, "step": 1400 }, { "epoch": 0.31295083786483185, "grad_norm": 19.16793441772461, "learning_rate": 0.00019487632380332798, "loss": 9.9042, "step": 1410 }, { "epoch": 0.31517034735323496, "grad_norm": 15.690661430358887, "learning_rate": 0.00019480244372971356, "loss": 7.38, "step": 1420 }, { "epoch": 0.317389856841638, "grad_norm": 25.818464279174805, "learning_rate": 0.00019472805433906615, "loss": 8.3591, "step": 1430 }, { "epoch": 0.31960936633004106, "grad_norm": 23.05858039855957, "learning_rate": 0.00019465315563138574, "loss": 7.4033, "step": 1440 }, { "epoch": 0.3218288758184441, "grad_norm": 34.570255279541016, "learning_rate": 0.00019457773305475712, "loss": 4.727, "step": 1450 }, { "epoch": 0.3240483853068472, "grad_norm": 19.70728874206543, "learning_rate": 0.00019450181571301073, "loss": 9.2492, "step": 1460 }, { "epoch": 0.32626789479525026, "grad_norm": 20.703845977783203, "learning_rate": 0.0001944253599504009, "loss": 7.8776, "step": 1470 }, { "epoch": 0.3284874042836533, "grad_norm": 24.86590003967285, "learning_rate": 0.00019434840942267329, "loss": 7.9848, "step": 1480 }, { "epoch": 0.33070691377205635, "grad_norm": 13.28347396850586, "learning_rate": 0.0001942709495779127, "loss": 6.1992, "step": 1490 }, { "epoch": 0.33292642326045946, "grad_norm": 18.400659561157227, "learning_rate": 0.00019419296586420387, "loss": 4.4394, "step": 1500 }, { "epoch": 0.33292642326045946, "eval_loss": 1.773450255393982, "eval_runtime": 8.3421, "eval_samples_per_second": 21.817, "eval_steps_per_second": 10.908, "step": 1500 }, { "epoch": 0.3351459327488625, "grad_norm": 17.07659339904785, "learning_rate": 0.00019411447283346206, "loss": 9.9283, "step": 1510 }, { "epoch": 0.33736544223726556, "grad_norm": 17.004148483276367, "learning_rate": 0.00019403547048568726, "loss": 9.1765, "step": 1520 }, { "epoch": 0.3395849517256686, "grad_norm": 21.948152542114258, "learning_rate": 0.00019395595882087946, "loss": 7.7623, "step": 1530 }, { "epoch": 0.3418044612140717, "grad_norm": 34.88447189331055, "learning_rate": 0.0001938759523909539, "loss": 7.5144, "step": 1540 }, { "epoch": 0.34402397070247476, "grad_norm": 30.565942764282227, "learning_rate": 0.00019379542209208012, "loss": 4.4681, "step": 1550 }, { "epoch": 0.3462434801908778, "grad_norm": 16.915386199951172, "learning_rate": 0.00019371438247617334, "loss": 9.0171, "step": 1560 }, { "epoch": 0.34846298967928085, "grad_norm": 16.601459503173828, "learning_rate": 0.00019363283354323357, "loss": 8.7836, "step": 1570 }, { "epoch": 0.35068249916768396, "grad_norm": 22.87268829345703, "learning_rate": 0.00019355078984517604, "loss": 6.7447, "step": 1580 }, { "epoch": 0.352902008656087, "grad_norm": 22.47816276550293, "learning_rate": 0.0001934682222781703, "loss": 6.5802, "step": 1590 }, { "epoch": 0.35512151814449006, "grad_norm": 20.829444885253906, "learning_rate": 0.00019338514539413154, "loss": 4.4868, "step": 1600 }, { "epoch": 0.35512151814449006, "eval_loss": 1.6953916549682617, "eval_runtime": 8.3616, "eval_samples_per_second": 21.766, "eval_steps_per_second": 10.883, "step": 1600 }, { "epoch": 0.3573410276328931, "grad_norm": 15.970100402832031, "learning_rate": 0.0001933015591930598, "loss": 8.2415, "step": 1610 }, { "epoch": 0.3595605371212962, "grad_norm": 16.007869720458984, "learning_rate": 0.00019321749277878553, "loss": 8.9989, "step": 1620 }, { "epoch": 0.36178004660969926, "grad_norm": 18.664445877075195, "learning_rate": 0.00019313290249556303, "loss": 8.7947, "step": 1630 }, { "epoch": 0.3639995560981023, "grad_norm": 20.421119689941406, "learning_rate": 0.0001930477883433923, "loss": 7.3399, "step": 1640 }, { "epoch": 0.36621906558650535, "grad_norm": 28.10470962524414, "learning_rate": 0.00019296219397801906, "loss": 3.8775, "step": 1650 }, { "epoch": 0.36843857507490846, "grad_norm": 16.877721786499023, "learning_rate": 0.0001928760902956128, "loss": 10.0496, "step": 1660 }, { "epoch": 0.3706580845633115, "grad_norm": 16.684297561645508, "learning_rate": 0.0001927894918480888, "loss": 7.8384, "step": 1670 }, { "epoch": 0.37287759405171456, "grad_norm": 22.071022033691406, "learning_rate": 0.00019270236953161657, "loss": 7.4606, "step": 1680 }, { "epoch": 0.37509710354011766, "grad_norm": 18.03536033630371, "learning_rate": 0.00019261475245002657, "loss": 6.0387, "step": 1690 }, { "epoch": 0.3773166130285207, "grad_norm": 37.234458923339844, "learning_rate": 0.0001925266406033188, "loss": 4.6618, "step": 1700 }, { "epoch": 0.3773166130285207, "eval_loss": 1.7275357246398926, "eval_runtime": 8.3605, "eval_samples_per_second": 21.769, "eval_steps_per_second": 10.884, "step": 1700 }, { "epoch": 0.37953612251692376, "grad_norm": 19.117931365966797, "learning_rate": 0.00019243801943957806, "loss": 9.9758, "step": 1710 }, { "epoch": 0.3817556320053268, "grad_norm": 20.348711013793945, "learning_rate": 0.0001923488889588043, "loss": 9.3583, "step": 1720 }, { "epoch": 0.3839751414937299, "grad_norm": 20.244722366333008, "learning_rate": 0.00019225927826482803, "loss": 8.1657, "step": 1730 }, { "epoch": 0.38619465098213296, "grad_norm": 25.398527145385742, "learning_rate": 0.00019216914370190352, "loss": 6.6272, "step": 1740 }, { "epoch": 0.388414160470536, "grad_norm": 27.088743209838867, "learning_rate": 0.00019207852892577648, "loss": 4.2692, "step": 1750 }, { "epoch": 0.39063366995893906, "grad_norm": 17.853042602539062, "learning_rate": 0.00019198740483261645, "loss": 9.5596, "step": 1760 }, { "epoch": 0.39285317944734216, "grad_norm": 22.052635192871094, "learning_rate": 0.00019189577142242342, "loss": 8.6341, "step": 1770 }, { "epoch": 0.3950726889357452, "grad_norm": 20.33677864074707, "learning_rate": 0.00019180365779902786, "loss": 7.3823, "step": 1780 }, { "epoch": 0.39729219842414826, "grad_norm": 21.995071411132812, "learning_rate": 0.0001917110348585993, "loss": 6.927, "step": 1790 }, { "epoch": 0.3995117079125513, "grad_norm": 22.73754119873047, "learning_rate": 0.00019161791715305299, "loss": 4.6153, "step": 1800 }, { "epoch": 0.3995117079125513, "eval_loss": 1.6793222427368164, "eval_runtime": 8.3312, "eval_samples_per_second": 21.846, "eval_steps_per_second": 10.923, "step": 1800 }, { "epoch": 0.4017312174009544, "grad_norm": 19.926956176757812, "learning_rate": 0.0001915243046823889, "loss": 9.1662, "step": 1810 }, { "epoch": 0.40395072688935746, "grad_norm": 26.968862533569336, "learning_rate": 0.00019143018289469182, "loss": 8.5969, "step": 1820 }, { "epoch": 0.4061702363777605, "grad_norm": 18.89902114868164, "learning_rate": 0.0001913355808937922, "loss": 7.6986, "step": 1830 }, { "epoch": 0.40838974586616356, "grad_norm": 24.70116424560547, "learning_rate": 0.0001912404695758596, "loss": 7.5041, "step": 1840 }, { "epoch": 0.41060925535456666, "grad_norm": 27.856767654418945, "learning_rate": 0.00019114487804472446, "loss": 4.1596, "step": 1850 }, { "epoch": 0.4128287648429697, "grad_norm": 17.24934959411621, "learning_rate": 0.00019104877719655633, "loss": 9.8398, "step": 1860 }, { "epoch": 0.41504827433137276, "grad_norm": 18.58039665222168, "learning_rate": 0.00019095219613518566, "loss": 9.0097, "step": 1870 }, { "epoch": 0.4172677838197758, "grad_norm": 30.96339225769043, "learning_rate": 0.00019085512030869722, "loss": 8.1273, "step": 1880 }, { "epoch": 0.4194872933081789, "grad_norm": 18.25289535522461, "learning_rate": 0.0001907575351651758, "loss": 6.6732, "step": 1890 }, { "epoch": 0.42170680279658196, "grad_norm": 14.842179298400879, "learning_rate": 0.00019065946980845183, "loss": 5.0591, "step": 1900 }, { "epoch": 0.42170680279658196, "eval_loss": 1.7121679782867432, "eval_runtime": 8.3529, "eval_samples_per_second": 21.789, "eval_steps_per_second": 10.894, "step": 1900 }, { "epoch": 0.423926312284985, "grad_norm": 17.09821319580078, "learning_rate": 0.0001905609096866101, "loss": 9.2758, "step": 1910 }, { "epoch": 0.42614582177338806, "grad_norm": 16.43634796142578, "learning_rate": 0.0001904618547996506, "loss": 8.0329, "step": 1920 }, { "epoch": 0.42836533126179116, "grad_norm": 16.861661911010742, "learning_rate": 0.00019036231969948858, "loss": 6.8135, "step": 1930 }, { "epoch": 0.4305848407501942, "grad_norm": 27.224584579467773, "learning_rate": 0.0001902622898342088, "loss": 6.471, "step": 1940 }, { "epoch": 0.43280435023859726, "grad_norm": 26.88721466064453, "learning_rate": 0.00019016176520381123, "loss": 5.2017, "step": 1950 }, { "epoch": 0.4350238597270003, "grad_norm": 21.070384979248047, "learning_rate": 0.00019006076036021113, "loss": 9.181, "step": 1960 }, { "epoch": 0.4372433692154034, "grad_norm": 15.76734447479248, "learning_rate": 0.00018995924619957805, "loss": 6.5769, "step": 1970 }, { "epoch": 0.43946287870380646, "grad_norm": 22.486614227294922, "learning_rate": 0.00018985726637765765, "loss": 8.1066, "step": 1980 }, { "epoch": 0.4416823881922095, "grad_norm": 22.396352767944336, "learning_rate": 0.00018975477723870426, "loss": 6.9441, "step": 1990 }, { "epoch": 0.4439018976806126, "grad_norm": 23.90460205078125, "learning_rate": 0.00018965182243846357, "loss": 3.6119, "step": 2000 }, { "epoch": 0.4439018976806126, "eval_loss": 1.6412502527236938, "eval_runtime": 8.3631, "eval_samples_per_second": 21.762, "eval_steps_per_second": 10.881, "step": 2000 }, { "epoch": 0.44612140716901566, "grad_norm": 17.437789916992188, "learning_rate": 0.0001895483728731051, "loss": 9.8636, "step": 2010 }, { "epoch": 0.4483409166574187, "grad_norm": 17.548757553100586, "learning_rate": 0.00018944442854262888, "loss": 8.5816, "step": 2020 }, { "epoch": 0.45056042614582176, "grad_norm": 17.01480484008789, "learning_rate": 0.00018934001855086535, "loss": 7.5714, "step": 2030 }, { "epoch": 0.45277993563422486, "grad_norm": 35.391578674316406, "learning_rate": 0.00018923511379398406, "loss": 7.1426, "step": 2040 }, { "epoch": 0.4549994451226279, "grad_norm": 29.070091247558594, "learning_rate": 0.000189129714271985, "loss": 4.4626, "step": 2050 }, { "epoch": 0.45721895461103096, "grad_norm": 21.99411392211914, "learning_rate": 0.0001890238345367834, "loss": 9.9152, "step": 2060 }, { "epoch": 0.459438464099434, "grad_norm": 21.236066818237305, "learning_rate": 0.00018891747458837926, "loss": 7.9696, "step": 2070 }, { "epoch": 0.4616579735878371, "grad_norm": 23.502790451049805, "learning_rate": 0.00018881064897868782, "loss": 7.5659, "step": 2080 }, { "epoch": 0.46387748307624016, "grad_norm": 22.682327270507812, "learning_rate": 0.0001887033140519634, "loss": 6.6718, "step": 2090 }, { "epoch": 0.4660969925646432, "grad_norm": 20.672285079956055, "learning_rate": 0.00018859551346395165, "loss": 3.9039, "step": 2100 }, { "epoch": 0.4660969925646432, "eval_loss": 1.645032286643982, "eval_runtime": 8.3715, "eval_samples_per_second": 21.74, "eval_steps_per_second": 10.87, "step": 2100 }, { "epoch": 0.46831650205304626, "grad_norm": 17.962657928466797, "learning_rate": 0.00018848723266273737, "loss": 7.7199, "step": 2110 }, { "epoch": 0.47053601154144936, "grad_norm": 22.836917877197266, "learning_rate": 0.00018837845709640533, "loss": 7.1944, "step": 2120 }, { "epoch": 0.4727555210298524, "grad_norm": 29.15477180480957, "learning_rate": 0.00018826921586878598, "loss": 7.0804, "step": 2130 }, { "epoch": 0.47497503051825546, "grad_norm": 25.91030502319336, "learning_rate": 0.0001881594944279641, "loss": 6.5615, "step": 2140 }, { "epoch": 0.4771945400066585, "grad_norm": 33.87831115722656, "learning_rate": 0.00018804929277393967, "loss": 4.5424, "step": 2150 }, { "epoch": 0.4794140494950616, "grad_norm": 21.534061431884766, "learning_rate": 0.0001879386109067127, "loss": 8.598, "step": 2160 }, { "epoch": 0.48163355898346466, "grad_norm": 16.35502815246582, "learning_rate": 0.00018782746337819844, "loss": 8.5619, "step": 2170 }, { "epoch": 0.4838530684718677, "grad_norm": 23.98406982421875, "learning_rate": 0.00018771582108456641, "loss": 7.1568, "step": 2180 }, { "epoch": 0.48607257796027076, "grad_norm": 31.4709529876709, "learning_rate": 0.00018760371312964708, "loss": 7.9133, "step": 2190 }, { "epoch": 0.48829208744867386, "grad_norm": 35.74486541748047, "learning_rate": 0.0001874911249615252, "loss": 4.1736, "step": 2200 }, { "epoch": 0.48829208744867386, "eval_loss": 1.626839280128479, "eval_runtime": 8.3709, "eval_samples_per_second": 21.742, "eval_steps_per_second": 10.871, "step": 2200 }, { "epoch": 0.4905115969370769, "grad_norm": 16.416400909423828, "learning_rate": 0.00018737807113211602, "loss": 7.9912, "step": 2210 }, { "epoch": 0.49273110642547996, "grad_norm": 20.901979446411133, "learning_rate": 0.0001872645370895043, "loss": 7.097, "step": 2220 }, { "epoch": 0.494950615913883, "grad_norm": 23.127883911132812, "learning_rate": 0.00018715052283369005, "loss": 7.7404, "step": 2230 }, { "epoch": 0.4971701254022861, "grad_norm": 24.272933959960938, "learning_rate": 0.00018703604291658849, "loss": 6.8828, "step": 2240 }, { "epoch": 0.49938963489068916, "grad_norm": 22.559974670410156, "learning_rate": 0.00018692109733819962, "loss": 3.8557, "step": 2250 }, { "epoch": 0.5016091443790922, "grad_norm": 14.266828536987305, "learning_rate": 0.00018680565699469298, "loss": 9.202, "step": 2260 }, { "epoch": 0.5038286538674953, "grad_norm": 18.53165626525879, "learning_rate": 0.00018668976554181427, "loss": 9.2761, "step": 2270 }, { "epoch": 0.5060481633558983, "grad_norm": 19.584280014038086, "learning_rate": 0.00018657340842764825, "loss": 7.2525, "step": 2280 }, { "epoch": 0.5082676728443014, "grad_norm": 21.20615577697754, "learning_rate": 0.0001864565711002797, "loss": 6.3643, "step": 2290 }, { "epoch": 0.5104871823327045, "grad_norm": 26.360748291015625, "learning_rate": 0.0001863392535597086, "loss": 4.7935, "step": 2300 }, { "epoch": 0.5104871823327045, "eval_loss": 1.6143090724945068, "eval_runtime": 8.3711, "eval_samples_per_second": 21.741, "eval_steps_per_second": 10.871, "step": 2300 }, { "epoch": 0.5127066918211075, "grad_norm": 17.28012466430664, "learning_rate": 0.00018622148490976542, "loss": 9.06, "step": 2310 }, { "epoch": 0.5149262013095106, "grad_norm": 19.515504837036133, "learning_rate": 0.00018610325059853494, "loss": 7.6158, "step": 2320 }, { "epoch": 0.5171457107979137, "grad_norm": 30.2669677734375, "learning_rate": 0.0001859845215221867, "loss": 8.9347, "step": 2330 }, { "epoch": 0.5193652202863167, "grad_norm": 26.745439529418945, "learning_rate": 0.0001858653558883816, "loss": 6.3045, "step": 2340 }, { "epoch": 0.5215847297747198, "grad_norm": 24.73974609375, "learning_rate": 0.00018574571004137397, "loss": 3.826, "step": 2350 }, { "epoch": 0.5238042392631228, "grad_norm": 20.876365661621094, "learning_rate": 0.00018562559853307903, "loss": 8.6758, "step": 2360 }, { "epoch": 0.5260237487515259, "grad_norm": 17.077104568481445, "learning_rate": 0.00018550502136349678, "loss": 8.1656, "step": 2370 }, { "epoch": 0.528243258239929, "grad_norm": 19.01683807373047, "learning_rate": 0.00018538399308454245, "loss": 8.1819, "step": 2380 }, { "epoch": 0.530462767728332, "grad_norm": 21.348461151123047, "learning_rate": 0.0001852624845923856, "loss": 6.8625, "step": 2390 }, { "epoch": 0.5326822772167351, "grad_norm": 26.496326446533203, "learning_rate": 0.00018514052499085665, "loss": 4.0114, "step": 2400 }, { "epoch": 0.5326822772167351, "eval_loss": 1.5969077348709106, "eval_runtime": 8.3465, "eval_samples_per_second": 21.805, "eval_steps_per_second": 10.903, "step": 2400 }, { "epoch": 0.5349017867051382, "grad_norm": 17.964014053344727, "learning_rate": 0.0001850180997280404, "loss": 9.0894, "step": 2410 }, { "epoch": 0.5371212961935412, "grad_norm": 20.483144760131836, "learning_rate": 0.00018489520880393684, "loss": 7.3118, "step": 2420 }, { "epoch": 0.5393408056819443, "grad_norm": 25.237768173217773, "learning_rate": 0.0001847718667704612, "loss": 7.0074, "step": 2430 }, { "epoch": 0.5415603151703473, "grad_norm": 20.504459381103516, "learning_rate": 0.00018464805907569826, "loss": 7.0746, "step": 2440 }, { "epoch": 0.5437798246587504, "grad_norm": 25.664976119995117, "learning_rate": 0.000184523785719648, "loss": 4.6465, "step": 2450 }, { "epoch": 0.5459993341471535, "grad_norm": 14.669504165649414, "learning_rate": 0.00018439906125422567, "loss": 8.247, "step": 2460 }, { "epoch": 0.5482188436355565, "grad_norm": 23.121746063232422, "learning_rate": 0.00018427388567943126, "loss": 7.3341, "step": 2470 }, { "epoch": 0.5504383531239596, "grad_norm": 21.810850143432617, "learning_rate": 0.00018414824444334954, "loss": 7.8084, "step": 2480 }, { "epoch": 0.5526578626123627, "grad_norm": 22.606239318847656, "learning_rate": 0.00018402215209789574, "loss": 6.4487, "step": 2490 }, { "epoch": 0.5548773721007657, "grad_norm": 16.65699005126953, "learning_rate": 0.00018389559409115463, "loss": 3.9156, "step": 2500 }, { "epoch": 0.5548773721007657, "eval_loss": 1.5648810863494873, "eval_runtime": 8.3816, "eval_samples_per_second": 21.714, "eval_steps_per_second": 10.857, "step": 2500 }, { "epoch": 0.5570968815891688, "grad_norm": 17.796775817871094, "learning_rate": 0.00018376858497504145, "loss": 8.7836, "step": 2510 }, { "epoch": 0.5593163910775718, "grad_norm": 22.294870376586914, "learning_rate": 0.00018364112474955618, "loss": 7.562, "step": 2520 }, { "epoch": 0.5615359005659749, "grad_norm": 25.260696411132812, "learning_rate": 0.0001835131988627836, "loss": 6.5871, "step": 2530 }, { "epoch": 0.563755410054378, "grad_norm": 20.715383529663086, "learning_rate": 0.0001833848364185542, "loss": 5.9936, "step": 2540 }, { "epoch": 0.565974919542781, "grad_norm": 12.513394355773926, "learning_rate": 0.00018325600831303746, "loss": 4.1902, "step": 2550 }, { "epoch": 0.5681944290311841, "grad_norm": 22.637521743774414, "learning_rate": 0.00018312672909814864, "loss": 9.6095, "step": 2560 }, { "epoch": 0.5704139385195872, "grad_norm": 19.12101936340332, "learning_rate": 0.00018299699877388775, "loss": 7.7106, "step": 2570 }, { "epoch": 0.5726334480079902, "grad_norm": 20.52643394470215, "learning_rate": 0.00018286681734025478, "loss": 6.501, "step": 2580 }, { "epoch": 0.5748529574963933, "grad_norm": 21.80643081665039, "learning_rate": 0.00018273619934916496, "loss": 6.9054, "step": 2590 }, { "epoch": 0.5770724669847963, "grad_norm": 37.41332244873047, "learning_rate": 0.00018260511569678783, "loss": 4.5978, "step": 2600 }, { "epoch": 0.5770724669847963, "eval_loss": 1.571250319480896, "eval_runtime": 8.3719, "eval_samples_per_second": 21.739, "eval_steps_per_second": 10.87, "step": 2600 }, { "epoch": 0.5792919764731994, "grad_norm": 15.82495403289795, "learning_rate": 0.00018247359548695385, "loss": 9.7545, "step": 2610 }, { "epoch": 0.5815114859616025, "grad_norm": 19.710506439208984, "learning_rate": 0.0001823416241677478, "loss": 7.4027, "step": 2620 }, { "epoch": 0.5837309954500055, "grad_norm": 19.0036678314209, "learning_rate": 0.00018220920173916966, "loss": 7.0675, "step": 2630 }, { "epoch": 0.5859505049384086, "grad_norm": 31.184757232666016, "learning_rate": 0.00018207634275313467, "loss": 6.4282, "step": 2640 }, { "epoch": 0.5881700144268117, "grad_norm": 31.40509796142578, "learning_rate": 0.0001819430326577276, "loss": 4.0363, "step": 2650 }, { "epoch": 0.5903895239152147, "grad_norm": 22.907358169555664, "learning_rate": 0.00018180927145294845, "loss": 8.9021, "step": 2660 }, { "epoch": 0.5926090334036178, "grad_norm": 16.848222732543945, "learning_rate": 0.00018167507369071245, "loss": 7.4892, "step": 2670 }, { "epoch": 0.5948285428920209, "grad_norm": 17.44846534729004, "learning_rate": 0.0001815404393710196, "loss": 6.3707, "step": 2680 }, { "epoch": 0.5970480523804239, "grad_norm": 25.153488159179688, "learning_rate": 0.00018140535394195467, "loss": 7.1394, "step": 2690 }, { "epoch": 0.599267561868827, "grad_norm": 28.365243911743164, "learning_rate": 0.00018126981740351766, "loss": 3.7863, "step": 2700 }, { "epoch": 0.599267561868827, "eval_loss": 1.595349669456482, "eval_runtime": 8.4065, "eval_samples_per_second": 21.65, "eval_steps_per_second": 10.825, "step": 2700 }, { "epoch": 0.60148707135723, "grad_norm": 20.700927734375, "learning_rate": 0.0001811338443076238, "loss": 9.3398, "step": 2710 }, { "epoch": 0.6037065808456331, "grad_norm": 18.146568298339844, "learning_rate": 0.0001809974346542731, "loss": 7.9417, "step": 2720 }, { "epoch": 0.6059260903340362, "grad_norm": 18.820941925048828, "learning_rate": 0.00018086060299538076, "loss": 6.1903, "step": 2730 }, { "epoch": 0.6081455998224392, "grad_norm": 29.816184997558594, "learning_rate": 0.00018072330567520112, "loss": 6.2712, "step": 2740 }, { "epoch": 0.6103651093108423, "grad_norm": 16.67285919189453, "learning_rate": 0.00018058558634947985, "loss": 4.1456, "step": 2750 }, { "epoch": 0.6125846187992454, "grad_norm": 22.284555435180664, "learning_rate": 0.0001804474159143865, "loss": 9.3789, "step": 2760 }, { "epoch": 0.6148041282876484, "grad_norm": 21.401790618896484, "learning_rate": 0.00018030882347375154, "loss": 7.7821, "step": 2770 }, { "epoch": 0.6170236377760515, "grad_norm": 24.44200897216797, "learning_rate": 0.0001801697799237445, "loss": 7.8753, "step": 2780 }, { "epoch": 0.6192431472644545, "grad_norm": 25.069490432739258, "learning_rate": 0.00018003031436819583, "loss": 7.0617, "step": 2790 }, { "epoch": 0.6214626567528576, "grad_norm": 20.222972869873047, "learning_rate": 0.0001798904122551903, "loss": 3.469, "step": 2800 }, { "epoch": 0.6214626567528576, "eval_loss": 1.5994218587875366, "eval_runtime": 8.3996, "eval_samples_per_second": 21.668, "eval_steps_per_second": 10.834, "step": 2800 } ], "logging_steps": 10, "max_steps": 13515, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.25347672997888e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }