diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json"
--- "a/last-checkpoint/trainer_state.json"
+++ "b/last-checkpoint/trainer_state.json"
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.2502839833396441,
+  "epoch": 0.5005679666792882,
   "eval_steps": 661,
-  "global_step": 661,
+  "global_step": 1322,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -4650,6 +4650,4641 @@
       "eval_samples_per_second": 4.946,
       "eval_steps_per_second": 1.237,
       "step": 661
+    },
+    {
+      "epoch": 0.2506626277925028,
+      "grad_norm": 11.954764366149902,
+      "learning_rate": 0.0001718247236861852,
+      "loss": 4.3744,
+      "step": 662
+    },
+    {
+      "epoch": 0.2510412722453616,
+      "grad_norm": 14.922517776489258,
+      "learning_rate": 0.0001717412733168675,
+      "loss": 4.7506,
+      "step": 663
+    },
+    {
+      "epoch": 0.2514199166982204,
+      "grad_norm": 16.80381965637207,
+      "learning_rate": 0.00017165771987696698,
+      "loss": 4.9678,
+      "step": 664
+    },
+    {
+      "epoch": 0.2517985611510791,
+      "grad_norm": 13.548487663269043,
+      "learning_rate": 0.00017157406348652463,
+      "loss": 4.0003,
+      "step": 665
+    },
+    {
+      "epoch": 0.2521772056039379,
+      "grad_norm": 14.526391983032227,
+      "learning_rate": 0.00017149030426572953,
+      "loss": 4.5138,
+      "step": 666
+    },
+    {
+      "epoch": 0.25255585005679665,
+      "grad_norm": 16.523895263671875,
+      "learning_rate": 0.00017140644233491837,
+      "loss": 4.0987,
+      "step": 667
+    },
+    {
+      "epoch": 0.25293449450965544,
+      "grad_norm": 13.643799781799316,
+      "learning_rate": 0.00017132247781457557,
+      "loss": 4.3543,
+      "step": 668
+    },
+    {
+      "epoch": 0.2533131389625142,
+      "grad_norm": 13.402974128723145,
+      "learning_rate": 0.00017123841082533275,
+      "loss": 3.5844,
+      "step": 669
+    },
+    {
+      "epoch": 0.25369178341537296,
+      "grad_norm": 16.17197036743164,
+      "learning_rate": 0.00017115424148796883,
+      "loss": 4.1618,
+      "step": 670
+    },
+    {
+      "epoch": 0.25407042786823175,
+      "grad_norm": 15.300637245178223,
+      "learning_rate": 0.00017106996992340983,
+      "loss": 2.9017,
+      "step": 671
+    },
+    {
+      "epoch": 0.2544490723210905,
+      "grad_norm": 14.559680938720703,
+      "learning_rate": 0.00017098559625272852,
+      "loss": 1.9764,
+      "step": 672
+    },
+    {
+      "epoch": 0.2548277167739493,
+      "grad_norm": 19.095439910888672,
+      "learning_rate": 0.00017090112059714446,
+      "loss": 2.9313,
+      "step": 673
+    },
+    {
+      "epoch": 0.255206361226808,
+      "grad_norm": 18.04271125793457,
+      "learning_rate": 0.0001708165430780237,
+      "loss": 1.6536,
+      "step": 674
+    },
+    {
+      "epoch": 0.2555850056796668,
+      "grad_norm": 29.377933502197266,
+      "learning_rate": 0.00017073186381687868,
+      "loss": 2.5233,
+      "step": 675
+    },
+    {
+      "epoch": 0.25596365013252553,
+      "grad_norm": 13.520868301391602,
+      "learning_rate": 0.00017064708293536792,
+      "loss": 5.1626,
+      "step": 676
+    },
+    {
+      "epoch": 0.2563422945853843,
+      "grad_norm": 10.009629249572754,
+      "learning_rate": 0.00017056220055529595,
+      "loss": 5.3031,
+      "step": 677
+    },
+    {
+      "epoch": 0.2567209390382431,
+      "grad_norm": 10.07101058959961,
+      "learning_rate": 0.00017047721679861326,
+      "loss": 4.0588,
+      "step": 678
+    },
+    {
+      "epoch": 0.25709958349110185,
+      "grad_norm": 9.826030731201172,
+      "learning_rate": 0.0001703921317874158,
+      "loss": 5.5341,
+      "step": 679
+    },
+    {
+      "epoch": 0.25747822794396064,
+      "grad_norm": 8.8646821975708,
+      "learning_rate": 0.00017030694564394518,
+      "loss": 4.0068,
+      "step": 680
+    },
+    {
+      "epoch": 0.25785687239681937,
+      "grad_norm": 9.862954139709473,
+      "learning_rate": 0.00017022165849058812,
+      "loss": 4.4291,
+      "step": 681
+    },
+    {
+      "epoch": 0.25823551684967816,
+      "grad_norm": 10.417892456054688,
+      "learning_rate": 0.00017013627044987656,
+      "loss": 4.2552,
+      "step": 682
+    },
+    {
+      "epoch": 0.2586141613025369,
+      "grad_norm": 10.884054183959961,
+      "learning_rate": 0.00017005078164448746,
+      "loss": 3.9076,
+      "step": 683
+    },
+    {
+      "epoch": 0.2589928057553957,
+      "grad_norm": 13.036945343017578,
+      "learning_rate": 0.00016996519219724234,
+      "loss": 5.0801,
+      "step": 684
+    },
+    {
+      "epoch": 0.2593714502082545,
+      "grad_norm": 14.050254821777344,
+      "learning_rate": 0.00016987950223110748,
+      "loss": 4.3475,
+      "step": 685
+    },
+    {
+      "epoch": 0.2597500946611132,
+      "grad_norm": 14.108619689941406,
+      "learning_rate": 0.0001697937118691936,
+      "loss": 3.5589,
+      "step": 686
+    },
+    {
+      "epoch": 0.260128739113972,
+      "grad_norm": 15.274951934814453,
+      "learning_rate": 0.00016970782123475547,
+      "loss": 4.3379,
+      "step": 687
+    },
+    {
+      "epoch": 0.26050738356683073,
+      "grad_norm": 16.255102157592773,
+      "learning_rate": 0.00016962183045119214,
+      "loss": 4.2574,
+      "step": 688
+    },
+    {
+      "epoch": 0.2608860280196895,
+      "grad_norm": 13.676608085632324,
+      "learning_rate": 0.00016953573964204638,
+      "loss": 4.7991,
+      "step": 689
+    },
+    {
+      "epoch": 0.26126467247254825,
+      "grad_norm": 16.22185707092285,
+      "learning_rate": 0.00016944954893100475,
+      "loss": 3.9156,
+      "step": 690
+    },
+    {
+      "epoch": 0.26164331692540704,
+      "grad_norm": 11.445201873779297,
+      "learning_rate": 0.0001693632584418973,
+      "loss": 3.3147,
+      "step": 691
+    },
+    {
+      "epoch": 0.26202196137826583,
+      "grad_norm": 12.272850036621094,
+      "learning_rate": 0.0001692768682986975,
+      "loss": 3.7517,
+      "step": 692
+    },
+    {
+      "epoch": 0.26240060583112457,
+      "grad_norm": 13.47509479522705,
+      "learning_rate": 0.0001691903786255219,
+      "loss": 3.0187,
+      "step": 693
+    },
+    {
+      "epoch": 0.26277925028398336,
+      "grad_norm": 12.622843742370605,
+      "learning_rate": 0.00016910378954663013,
+      "loss": 3.404,
+      "step": 694
+    },
+    {
+      "epoch": 0.2631578947368421,
+      "grad_norm": 17.651247024536133,
+      "learning_rate": 0.00016901710118642454,
+      "loss": 3.9933,
+      "step": 695
+    },
+    {
+      "epoch": 0.2635365391897009,
+      "grad_norm": 17.895601272583008,
+      "learning_rate": 0.0001689303136694502,
+      "loss": 2.997,
+      "step": 696
+    },
+    {
+      "epoch": 0.2639151836425596,
+      "grad_norm": 17.362606048583984,
+      "learning_rate": 0.0001688434271203946,
+      "loss": 3.9413,
+      "step": 697
+    },
+    {
+      "epoch": 0.2642938280954184,
+      "grad_norm": 18.201135635375977,
+      "learning_rate": 0.00016875644166408754,
+      "loss": 3.0196,
+      "step": 698
+    },
+    {
+      "epoch": 0.2646724725482772,
+      "grad_norm": 34.7093391418457,
+      "learning_rate": 0.00016866935742550083,
+      "loss": 3.1038,
+      "step": 699
+    },
+    {
+      "epoch": 0.2650511170011359,
+      "grad_norm": 31.69573974609375,
+      "learning_rate": 0.00016858217452974837,
+      "loss": 3.6338,
+      "step": 700
+    },
+    {
+      "epoch": 0.2654297614539947,
+      "grad_norm": 8.198531150817871,
+      "learning_rate": 0.0001684948931020856,
+      "loss": 5.0433,
+      "step": 701
+    },
+    {
+      "epoch": 0.26580840590685345,
+      "grad_norm": 8.688183784484863,
+      "learning_rate": 0.0001684075132679097,
+      "loss": 5.2619,
+      "step": 702
+    },
+    {
+      "epoch": 0.26618705035971224,
+      "grad_norm": 11.000321388244629,
+      "learning_rate": 0.00016832003515275914,
+      "loss": 4.5282,
+      "step": 703
+    },
+    {
+      "epoch": 0.266565694812571,
+      "grad_norm": 10.947884559631348,
+      "learning_rate": 0.00016823245888231356,
+      "loss": 4.5732,
+      "step": 704
+    },
+    {
+      "epoch": 0.26694433926542976,
+      "grad_norm": 11.00478744506836,
+      "learning_rate": 0.0001681447845823937,
+      "loss": 4.2352,
+      "step": 705
+    },
+    {
+      "epoch": 0.26732298371828855,
+      "grad_norm": 12.548285484313965,
+      "learning_rate": 0.00016805701237896105,
+      "loss": 4.8917,
+      "step": 706
+    },
+    {
+      "epoch": 0.2677016281711473,
+      "grad_norm": 9.91434097290039,
+      "learning_rate": 0.00016796914239811786,
+      "loss": 3.5194,
+      "step": 707
+    },
+    {
+      "epoch": 0.2680802726240061,
+      "grad_norm": 10.20582389831543,
+      "learning_rate": 0.00016788117476610677,
+      "loss": 3.5162,
+      "step": 708
+    },
+    {
+      "epoch": 0.2684589170768648,
+      "grad_norm": 11.576292991638184,
+      "learning_rate": 0.00016779310960931073,
+      "loss": 4.2913,
+      "step": 709
+    },
+    {
+      "epoch": 0.2688375615297236,
+      "grad_norm": 12.707137107849121,
+      "learning_rate": 0.0001677049470542529,
+      "loss": 3.8916,
+      "step": 710
+    },
+    {
+      "epoch": 0.26921620598258234,
+      "grad_norm": 13.900711059570312,
+      "learning_rate": 0.00016761668722759622,
+      "loss": 4.5028,
+      "step": 711
+    },
+    {
+      "epoch": 0.2695948504354411,
+      "grad_norm": 15.244569778442383,
+      "learning_rate": 0.0001675283302561435,
+      "loss": 4.7703,
+      "step": 712
+    },
+    {
+      "epoch": 0.2699734948882999,
+      "grad_norm": 12.6697998046875,
+      "learning_rate": 0.00016743987626683703,
+      "loss": 3.6493,
+      "step": 713
+    },
+    {
+      "epoch": 0.27035213934115865,
+      "grad_norm": 14.035780906677246,
+      "learning_rate": 0.00016735132538675854,
+      "loss": 3.7715,
+      "step": 714
+    },
+    {
+      "epoch": 0.27073078379401744,
+      "grad_norm": 14.34146785736084,
+      "learning_rate": 0.00016726267774312898,
+      "loss": 4.4825,
+      "step": 715
+    },
+    {
+      "epoch": 0.27110942824687617,
+      "grad_norm": 14.432804107666016,
+      "learning_rate": 0.00016717393346330828,
+      "loss": 3.3871,
+      "step": 716
+    },
+    {
+      "epoch": 0.27148807269973496,
+      "grad_norm": 13.750441551208496,
+      "learning_rate": 0.0001670850926747952,
+      "loss": 3.1007,
+      "step": 717
+    },
+    {
+      "epoch": 0.2718667171525937,
+      "grad_norm": 16.201181411743164,
+      "learning_rate": 0.00016699615550522717,
+      "loss": 2.6202,
+      "step": 718
+    },
+    {
+      "epoch": 0.2722453616054525,
+      "grad_norm": 18.666667938232422,
+      "learning_rate": 0.0001669071220823801,
+      "loss": 3.7933,
+      "step": 719
+    },
+    {
+      "epoch": 0.2726240060583112,
+      "grad_norm": 17.479516983032227,
+      "learning_rate": 0.0001668179925341682,
+      "loss": 3.846,
+      "step": 720
+    },
+    {
+      "epoch": 0.27300265051117,
+      "grad_norm": 21.877872467041016,
+      "learning_rate": 0.0001667287669886437,
+      "loss": 3.2863,
+      "step": 721
+    },
+    {
+      "epoch": 0.2733812949640288,
+      "grad_norm": 18.93062973022461,
+      "learning_rate": 0.00016663944557399692,
+      "loss": 2.6771,
+      "step": 722
+    },
+    {
+      "epoch": 0.27375993941688753,
+      "grad_norm": 17.544601440429688,
+      "learning_rate": 0.00016655002841855566,
+      "loss": 2.4239,
+      "step": 723
+    },
+    {
+      "epoch": 0.2741385838697463,
+      "grad_norm": 27.646818161010742,
+      "learning_rate": 0.00016646051565078558,
+      "loss": 2.6222,
+      "step": 724
+    },
+    {
+      "epoch": 0.27451722832260506,
+      "grad_norm": 34.713478088378906,
+      "learning_rate": 0.0001663709073992894,
+      "loss": 2.8671,
+      "step": 725
+    },
+    {
+      "epoch": 0.27489587277546385,
+      "grad_norm": 8.818018913269043,
+      "learning_rate": 0.00016628120379280728,
+      "loss": 4.8852,
+      "step": 726
+    },
+    {
+      "epoch": 0.2752745172283226,
+      "grad_norm": 9.805505752563477,
+      "learning_rate": 0.00016619140496021615,
+      "loss": 4.155,
+      "step": 727
+    },
+    {
+      "epoch": 0.27565316168118137,
+      "grad_norm": 8.863174438476562,
+      "learning_rate": 0.00016610151103052995,
+      "loss": 3.8106,
+      "step": 728
+    },
+    {
+      "epoch": 0.27603180613404016,
+      "grad_norm": 9.454705238342285,
+      "learning_rate": 0.00016601152213289913,
+      "loss": 4.0096,
+      "step": 729
+    },
+    {
+      "epoch": 0.2764104505868989,
+      "grad_norm": 10.974586486816406,
+      "learning_rate": 0.00016592143839661057,
+      "loss": 4.3561,
+      "step": 730
+    },
+    {
+      "epoch": 0.2767890950397577,
+      "grad_norm": 12.553147315979004,
+      "learning_rate": 0.0001658312599510875,
+      "loss": 3.901,
+      "step": 731
+    },
+    {
+      "epoch": 0.2771677394926164,
+      "grad_norm": 11.170998573303223,
+      "learning_rate": 0.00016574098692588915,
+      "loss": 4.3408,
+      "step": 732
+    },
+    {
+      "epoch": 0.2775463839454752,
+      "grad_norm": 13.828832626342773,
+      "learning_rate": 0.0001656506194507106,
+      "loss": 4.7403,
+      "step": 733
+    },
+    {
+      "epoch": 0.27792502839833394,
+      "grad_norm": 13.855401992797852,
+      "learning_rate": 0.00016556015765538273,
+      "loss": 4.6504,
+      "step": 734
+    },
+    {
+      "epoch": 0.27830367285119273,
+      "grad_norm": 11.40543270111084,
+      "learning_rate": 0.0001654696016698718,
+      "loss": 3.4119,
+      "step": 735
+    },
+    {
+      "epoch": 0.2786823173040515,
+      "grad_norm": 12.30098819732666,
+      "learning_rate": 0.00016537895162427955,
+      "loss": 3.635,
+      "step": 736
+    },
+    {
+      "epoch": 0.27906096175691025,
+      "grad_norm": 12.096563339233398,
+      "learning_rate": 0.0001652882076488427,
+      "loss": 4.2051,
+      "step": 737
+    },
+    {
+      "epoch": 0.27943960620976904,
+      "grad_norm": 11.935840606689453,
+      "learning_rate": 0.00016519736987393303,
+      "loss": 3.8025,
+      "step": 738
+    },
+    {
+      "epoch": 0.2798182506626278,
+      "grad_norm": 13.400490760803223,
+      "learning_rate": 0.000165106438430057,
+      "loss": 4.2775,
+      "step": 739
+    },
+    {
+      "epoch": 0.28019689511548657,
+      "grad_norm": 10.693985939025879,
+      "learning_rate": 0.00016501541344785572,
+      "loss": 2.8859,
+      "step": 740
+    },
+    {
+      "epoch": 0.2805755395683453,
+      "grad_norm": 13.22080135345459,
+      "learning_rate": 0.0001649242950581046,
+      "loss": 2.4878,
+      "step": 741
+    },
+    {
+      "epoch": 0.2809541840212041,
+      "grad_norm": 14.28111743927002,
+      "learning_rate": 0.00016483308339171335,
+      "loss": 3.9025,
+      "step": 742
+    },
+    {
+      "epoch": 0.2813328284740629,
+      "grad_norm": 17.349239349365234,
+      "learning_rate": 0.0001647417785797256,
+      "loss": 3.6947,
+      "step": 743
+    },
+    {
+      "epoch": 0.2817114729269216,
+      "grad_norm": 15.542529106140137,
+      "learning_rate": 0.0001646503807533189,
+      "loss": 3.1008,
+      "step": 744
+    },
+    {
+      "epoch": 0.2820901173797804,
+      "grad_norm": 13.73243522644043,
+      "learning_rate": 0.0001645588900438043,
+      "loss": 2.3237,
+      "step": 745
+    },
+    {
+      "epoch": 0.28246876183263914,
+      "grad_norm": 18.583194732666016,
+      "learning_rate": 0.0001644673065826264,
+      "loss": 3.3123,
+      "step": 746
+    },
+    {
+      "epoch": 0.2828474062854979,
+      "grad_norm": 20.04288673400879,
+      "learning_rate": 0.00016437563050136303,
+      "loss": 2.8265,
+      "step": 747
+    },
+    {
+      "epoch": 0.28322605073835666,
+      "grad_norm": 15.773954391479492,
+      "learning_rate": 0.00016428386193172506,
+      "loss": 2.1103,
+      "step": 748
+    },
+    {
+      "epoch": 0.28360469519121545,
+      "grad_norm": 19.099443435668945,
+      "learning_rate": 0.0001641920010055563,
+      "loss": 1.9673,
+      "step": 749
+    },
+    {
+      "epoch": 0.28398333964407424,
+      "grad_norm": 40.27873611450195,
+      "learning_rate": 0.00016410004785483316,
+      "loss": 5.3713,
+      "step": 750
+    },
+    {
+      "epoch": 0.284361984096933,
+      "grad_norm": 10.12539005279541,
+      "learning_rate": 0.00016400800261166465,
+      "loss": 4.9746,
+      "step": 751
+    },
+    {
+      "epoch": 0.28474062854979176,
+      "grad_norm": 10.15647029876709,
+      "learning_rate": 0.000163915865408292,
+      "loss": 4.8416,
+      "step": 752
+    },
+    {
+      "epoch": 0.2851192730026505,
+      "grad_norm": 9.944365501403809,
+      "learning_rate": 0.00016382363637708865,
+      "loss": 4.3926,
+      "step": 753
+    },
+    {
+      "epoch": 0.2854979174555093,
+      "grad_norm": 9.265400886535645,
+      "learning_rate": 0.0001637313156505598,
+      "loss": 3.6671,
+      "step": 754
+    },
+    {
+      "epoch": 0.285876561908368,
+      "grad_norm": 10.70794677734375,
+      "learning_rate": 0.00016363890336134262,
+      "loss": 4.5764,
+      "step": 755
+    },
+    {
+      "epoch": 0.2862552063612268,
+      "grad_norm": 11.477482795715332,
+      "learning_rate": 0.00016354639964220568,
+      "loss": 4.5665,
+      "step": 756
+    },
+    {
+      "epoch": 0.2866338508140856,
+      "grad_norm": 10.951593399047852,
+      "learning_rate": 0.0001634538046260489,
+      "loss": 4.2272,
+      "step": 757
+    },
+    {
+      "epoch": 0.28701249526694433,
+      "grad_norm": 11.813931465148926,
+      "learning_rate": 0.00016336111844590345,
+      "loss": 3.8581,
+      "step": 758
+    },
+    {
+      "epoch": 0.2873911397198031,
+      "grad_norm": 12.866728782653809,
+      "learning_rate": 0.0001632683412349314,
+      "loss": 4.2478,
+      "step": 759
+    },
+    {
+      "epoch": 0.28776978417266186,
+      "grad_norm": 11.82854175567627,
+      "learning_rate": 0.00016317547312642562,
+      "loss": 4.416,
+      "step": 760
+    },
+    {
+      "epoch": 0.28814842862552065,
+      "grad_norm": 12.294820785522461,
+      "learning_rate": 0.00016308251425380962,
+      "loss": 4.3508,
+      "step": 761
+    },
+    {
+      "epoch": 0.2885270730783794,
+      "grad_norm": 11.736769676208496,
+      "learning_rate": 0.00016298946475063733,
+      "loss": 3.5181,
+      "step": 762
+    },
+    {
+      "epoch": 0.28890571753123817,
+      "grad_norm": 10.93974781036377,
+      "learning_rate": 0.0001628963247505927,
+      "loss": 2.8494,
+      "step": 763
+    },
+    {
+      "epoch": 0.28928436198409696,
+      "grad_norm": 15.365312576293945,
+      "learning_rate": 0.00016280309438748992,
+      "loss": 3.8264,
+      "step": 764
+    },
+    {
+      "epoch": 0.2896630064369557,
+      "grad_norm": 13.349133491516113,
+      "learning_rate": 0.00016270977379527292,
+      "loss": 4.0294,
+      "step": 765
+    },
+    {
+      "epoch": 0.2900416508898145,
+      "grad_norm": 13.878774642944336,
+      "learning_rate": 0.00016261636310801523,
+      "loss": 3.6898,
+      "step": 766
+    },
+    {
+      "epoch": 0.2904202953426732,
+      "grad_norm": 13.974386215209961,
+      "learning_rate": 0.00016252286245991987,
+      "loss": 3.1476,
+      "step": 767
+    },
+    {
+      "epoch": 0.290798939795532,
+      "grad_norm": 12.599011421203613,
+      "learning_rate": 0.0001624292719853191,
+      "loss": 3.7,
+      "step": 768
+    },
+    {
+      "epoch": 0.29117758424839074,
+      "grad_norm": 14.90402603149414,
+      "learning_rate": 0.00016233559181867414,
+      "loss": 2.84,
+      "step": 769
+    },
+    {
+      "epoch": 0.29155622870124953,
+      "grad_norm": 15.247842788696289,
+      "learning_rate": 0.00016224182209457523,
+      "loss": 2.9135,
+      "step": 770
+    },
+    {
+      "epoch": 0.29193487315410827,
+      "grad_norm": 15.978056907653809,
+      "learning_rate": 0.00016214796294774115,
+      "loss": 3.8344,
+      "step": 771
+    },
+    {
+      "epoch": 0.29231351760696705,
+      "grad_norm": 15.876410484313965,
+      "learning_rate": 0.00016205401451301925,
+      "loss": 2.1739,
+      "step": 772
+    },
+    {
+      "epoch": 0.29269216205982584,
+      "grad_norm": 16.77569007873535,
+      "learning_rate": 0.00016195997692538506,
+      "loss": 2.1749,
+      "step": 773
+    },
+    {
+      "epoch": 0.2930708065126846,
+      "grad_norm": 23.31680679321289,
+      "learning_rate": 0.00016186585031994225,
+      "loss": 2.8665,
+      "step": 774
+    },
+    {
+      "epoch": 0.29344945096554337,
+      "grad_norm": 10.028854370117188,
+      "learning_rate": 0.0001617716348319224,
+      "loss": 0.8782,
+      "step": 775
+    },
+    {
+      "epoch": 0.2938280954184021,
+      "grad_norm": 8.688515663146973,
+      "learning_rate": 0.00016167733059668478,
+      "loss": 3.93,
+      "step": 776
+    },
+    {
+      "epoch": 0.2942067398712609,
+      "grad_norm": 9.398271560668945,
+      "learning_rate": 0.00016158293774971608,
+      "loss": 4.4695,
+      "step": 777
+    },
+    {
+      "epoch": 0.2945853843241196,
+      "grad_norm": 10.657846450805664,
+      "learning_rate": 0.00016148845642663043,
+      "loss": 4.401,
+      "step": 778
+    },
+    {
+      "epoch": 0.2949640287769784,
+      "grad_norm": 10.177902221679688,
+      "learning_rate": 0.000161393886763169,
+      "loss": 3.8614,
+      "step": 779
+    },
+    {
+      "epoch": 0.2953426732298372,
+      "grad_norm": 10.739095687866211,
+      "learning_rate": 0.0001612992288951998,
+      "loss": 3.9037,
+      "step": 780
+    },
+    {
+      "epoch": 0.29572131768269594,
+      "grad_norm": 11.997400283813477,
+      "learning_rate": 0.00016120448295871783,
+      "loss": 3.6965,
+      "step": 781
+    },
+    {
+      "epoch": 0.29609996213555473,
+      "grad_norm": 12.047724723815918,
+      "learning_rate": 0.00016110964908984428,
+      "loss": 4.1741,
+      "step": 782
+    },
+    {
+      "epoch": 0.29647860658841346,
+      "grad_norm": 11.252506256103516,
+      "learning_rate": 0.00016101472742482685,
+      "loss": 4.2626,
+      "step": 783
+    },
+    {
+      "epoch": 0.29685725104127225,
+      "grad_norm": 10.244424819946289,
+      "learning_rate": 0.00016091971810003946,
+      "loss": 3.8371,
+      "step": 784
+    },
+    {
+      "epoch": 0.297235895494131,
+      "grad_norm": 11.887914657592773,
+      "learning_rate": 0.00016082462125198177,
+      "loss": 3.7736,
+      "step": 785
+    },
+    {
+      "epoch": 0.2976145399469898,
+      "grad_norm": 11.956177711486816,
+      "learning_rate": 0.00016072943701727932,
+      "loss": 4.0997,
+      "step": 786
+    },
+    {
+      "epoch": 0.29799318439984857,
+      "grad_norm": 11.499533653259277,
+      "learning_rate": 0.00016063416553268315,
+      "loss": 3.995,
+      "step": 787
+    },
+    {
+      "epoch": 0.2983718288527073,
+      "grad_norm": 14.390945434570312,
+      "learning_rate": 0.00016053880693506968,
+      "loss": 4.1593,
+      "step": 788
+    },
+    {
+      "epoch": 0.2987504733055661,
+      "grad_norm": 12.83646297454834,
+      "learning_rate": 0.00016044336136144044,
+      "loss": 3.2662,
+      "step": 789
+    },
+    {
+      "epoch": 0.2991291177584248,
+      "grad_norm": 12.761232376098633,
+      "learning_rate": 0.00016034782894892198,
+      "loss": 2.7353,
+      "step": 790
+    },
+    {
+      "epoch": 0.2995077622112836,
+      "grad_norm": 13.886045455932617,
+      "learning_rate": 0.00016025220983476555,
+      "loss": 3.6852,
+      "step": 791
+    },
+    {
+      "epoch": 0.29988640666414235,
+      "grad_norm": 16.431631088256836,
+      "learning_rate": 0.00016015650415634704,
+      "loss": 4.5693,
+      "step": 792
+    },
+    {
+      "epoch": 0.30026505111700114,
+      "grad_norm": 15.884831428527832,
+      "learning_rate": 0.00016006071205116657,
+      "loss": 4.0334,
+      "step": 793
+    },
+    {
+      "epoch": 0.3006436955698599,
+      "grad_norm": 16.197486877441406,
+      "learning_rate": 0.00015996483365684862,
+      "loss": 3.0299,
+      "step": 794
+    },
+    {
+      "epoch": 0.30102234002271866,
+      "grad_norm": 13.327005386352539,
+      "learning_rate": 0.00015986886911114145,
+      "loss": 2.5927,
+      "step": 795
+    },
+    {
+      "epoch": 0.30140098447557745,
+      "grad_norm": 13.829025268554688,
+      "learning_rate": 0.00015977281855191725,
+      "loss": 2.6192,
+      "step": 796
+    },
+    {
+      "epoch": 0.3017796289284362,
+      "grad_norm": 15.983011245727539,
+      "learning_rate": 0.00015967668211717167,
+      "loss": 2.3621,
+      "step": 797
+    },
+    {
+      "epoch": 0.302158273381295,
+      "grad_norm": 19.83639144897461,
+      "learning_rate": 0.00015958045994502384,
+      "loss": 2.7834,
+      "step": 798
+    },
+    {
+      "epoch": 0.3025369178341537,
+      "grad_norm": 19.925039291381836,
+      "learning_rate": 0.00015948415217371595,
+      "loss": 2.8116,
+      "step": 799
+    },
+    {
+      "epoch": 0.3029155622870125,
+      "grad_norm": 21.867938995361328,
+      "learning_rate": 0.0001593877589416133,
+      "loss": 1.7513,
+      "step": 800
+    },
+    {
+      "epoch": 0.3032942067398713,
+      "grad_norm": 8.560530662536621,
+      "learning_rate": 0.00015929128038720384,
+      "loss": 5.1137,
+      "step": 801
+    },
+    {
+      "epoch": 0.30367285119273,
+      "grad_norm": 8.668681144714355,
+      "learning_rate": 0.00015919471664909823,
+      "loss": 3.8616,
+      "step": 802
+    },
+    {
+      "epoch": 0.3040514956455888,
+      "grad_norm": 10.437018394470215,
+      "learning_rate": 0.0001590980678660294,
+      "loss": 3.7993,
+      "step": 803
+    },
+    {
+      "epoch": 0.30443014009844754,
+      "grad_norm": 10.498896598815918,
+      "learning_rate": 0.0001590013341768526,
+      "loss": 3.8712,
+      "step": 804
+    },
+    {
+      "epoch": 0.30480878455130633,
+      "grad_norm": 9.216273307800293,
+      "learning_rate": 0.00015890451572054482,
+      "loss": 4.0495,
+      "step": 805
+    },
+    {
+      "epoch": 0.30518742900416507,
+      "grad_norm": 10.508468627929688,
+      "learning_rate": 0.00015880761263620515,
+      "loss": 3.4153,
+      "step": 806
+    },
+    {
+      "epoch": 0.30556607345702386,
+      "grad_norm": 13.808286666870117,
+      "learning_rate": 0.00015871062506305408,
+      "loss": 3.4353,
+      "step": 807
+    },
+    {
+      "epoch": 0.30594471790988265,
+      "grad_norm": 12.350955963134766,
+      "learning_rate": 0.00015861355314043343,
+      "loss": 3.5035,
+      "step": 808
+    },
+    {
+      "epoch": 0.3063233623627414,
+      "grad_norm": 12.85565185546875,
+      "learning_rate": 0.00015851639700780642,
+      "loss": 3.8184,
+      "step": 809
+    },
+    {
+      "epoch": 0.30670200681560017,
+      "grad_norm": 13.963553428649902,
+      "learning_rate": 0.000158419156804757,
+      "loss": 4.7287,
+      "step": 810
+    },
+    {
+      "epoch": 0.3070806512684589,
+      "grad_norm": 15.577609062194824,
+      "learning_rate": 0.0001583218326709901,
+      "loss": 3.6594,
+      "step": 811
+    },
+    {
+      "epoch": 0.3074592957213177,
+      "grad_norm": 11.10647201538086,
+      "learning_rate": 0.00015822442474633115,
+      "loss": 2.9355,
+      "step": 812
+    },
+    {
+      "epoch": 0.3078379401741764,
+      "grad_norm": 13.10251522064209,
+      "learning_rate": 0.00015812693317072596,
+      "loss": 4.3878,
+      "step": 813
+    },
+    {
+      "epoch": 0.3082165846270352,
+      "grad_norm": 12.302017211914062,
+      "learning_rate": 0.00015802935808424055,
+      "loss": 2.902,
+      "step": 814
+    },
+    {
+      "epoch": 0.308595229079894,
+      "grad_norm": 13.663749694824219,
+      "learning_rate": 0.00015793169962706092,
+      "loss": 2.7841,
+      "step": 815
+    },
+    {
+      "epoch": 0.30897387353275274,
+      "grad_norm": 13.366521835327148,
+      "learning_rate": 0.00015783395793949278,
+      "loss": 3.4101,
+      "step": 816
+    },
+    {
+      "epoch": 0.30935251798561153,
+      "grad_norm": 16.41577911376953,
+      "learning_rate": 0.00015773613316196147,
+      "loss": 3.334,
+      "step": 817
+    },
+    {
+      "epoch": 0.30973116243847026,
+      "grad_norm": 15.605032920837402,
+      "learning_rate": 0.0001576382254350118,
+      "loss": 3.7084,
+      "step": 818
+    },
+    {
+      "epoch": 0.31010980689132905,
+      "grad_norm": 14.417840003967285,
+      "learning_rate": 0.00015754023489930754,
+      "loss": 3.0134,
+      "step": 819
+    },
+    {
+      "epoch": 0.3104884513441878,
+      "grad_norm": 17.02623176574707,
+      "learning_rate": 0.00015744216169563164,
+      "loss": 3.0973,
+      "step": 820
+    },
+    {
+      "epoch": 0.3108670957970466,
+      "grad_norm": 14.048128128051758,
+      "learning_rate": 0.00015734400596488567,
+      "loss": 2.4681,
+      "step": 821
+    },
+    {
+      "epoch": 0.3112457402499053,
+      "grad_norm": 22.928178787231445,
+      "learning_rate": 0.00015724576784808986,
+      "loss": 4.2287,
+      "step": 822
+    },
+    {
+      "epoch": 0.3116243847027641,
+      "grad_norm": 16.560827255249023,
+      "learning_rate": 0.00015714744748638278,
+      "loss": 2.381,
+      "step": 823
+    },
+    {
+      "epoch": 0.3120030291556229,
+      "grad_norm": 16.747251510620117,
+      "learning_rate": 0.0001570490450210211,
+      "loss": 1.6694,
+      "step": 824
+    },
+    {
+      "epoch": 0.3123816736084816,
+      "grad_norm": 26.673425674438477,
+      "learning_rate": 0.00015695056059337952,
+      "loss": 1.5667,
+      "step": 825
+    },
+    {
+      "epoch": 0.3127603180613404,
+      "grad_norm": 7.984857082366943,
+      "learning_rate": 0.00015685199434495051,
+      "loss": 4.4119,
+      "step": 826
+    },
+    {
+      "epoch": 0.31313896251419915,
+      "grad_norm": 9.042461395263672,
+      "learning_rate": 0.00015675334641734398,
+      "loss": 4.3624,
+      "step": 827
+    },
+    {
+      "epoch": 0.31351760696705794,
+      "grad_norm": 10.055127143859863,
+      "learning_rate": 0.00015665461695228735,
+      "loss": 4.276,
+      "step": 828
+    },
+    {
+      "epoch": 0.31389625141991667,
+      "grad_norm": 9.659919738769531,
+      "learning_rate": 0.00015655580609162504,
+      "loss": 3.5357,
+      "step": 829
+    },
+    {
+      "epoch": 0.31427489587277546,
+      "grad_norm": 10.656012535095215,
+      "learning_rate": 0.00015645691397731852,
+      "loss": 4.0171,
+      "step": 830
+    },
+    {
+      "epoch": 0.31465354032563425,
+      "grad_norm": 11.442161560058594,
+      "learning_rate": 0.00015635794075144588,
+      "loss": 3.8396,
+      "step": 831
+    },
+    {
+      "epoch": 0.315032184778493,
+      "grad_norm": 12.612800598144531,
+      "learning_rate": 0.00015625888655620187,
+      "loss": 4.2947,
+      "step": 832
+    },
+    {
+      "epoch": 0.3154108292313518,
+      "grad_norm": 12.016472816467285,
+      "learning_rate": 0.00015615975153389746,
+      "loss": 3.9577,
+      "step": 833
+    },
+    {
+      "epoch": 0.3157894736842105,
+      "grad_norm": 10.963457107543945,
+      "learning_rate": 0.00015606053582695984,
+      "loss": 4.1569,
+      "step": 834
+    },
+    {
+      "epoch": 0.3161681181370693,
+      "grad_norm": 12.133650779724121,
+      "learning_rate": 0.00015596123957793202,
+      "loss": 3.681,
+      "step": 835
+    },
+    {
+      "epoch": 0.31654676258992803,
+      "grad_norm": 12.980992317199707,
+      "learning_rate": 0.0001558618629294728,
+      "loss": 3.614,
+      "step": 836
+    },
+    {
+      "epoch": 0.3169254070427868,
+      "grad_norm": 11.19620132446289,
+      "learning_rate": 0.0001557624060243565,
+      "loss": 3.6321,
+      "step": 837
+    },
+    {
+      "epoch": 0.3173040514956456,
+      "grad_norm": 14.250601768493652,
+      "learning_rate": 0.00015566286900547266,
+      "loss": 4.1902,
+      "step": 838
+    },
+    {
+      "epoch": 0.31768269594850435,
+      "grad_norm": 12.371217727661133,
+      "learning_rate": 0.000155563252015826,
+      "loss": 2.7028,
+      "step": 839
+    },
+    {
+      "epoch": 0.31806134040136314,
+      "grad_norm": 12.687495231628418,
+      "learning_rate": 0.00015546355519853607,
+      "loss": 2.4365,
+      "step": 840
+    },
+    {
+      "epoch": 0.31843998485422187,
+      "grad_norm": 12.307214736938477,
+      "learning_rate": 0.00015536377869683718,
+      "loss": 2.7681,
+      "step": 841
+    },
+    {
+      "epoch": 0.31881862930708066,
+      "grad_norm": 15.518838882446289,
+      "learning_rate": 0.0001552639226540781,
+      "loss": 3.1019,
+      "step": 842
+    },
+    {
+      "epoch": 0.3191972737599394,
+      "grad_norm": 14.274090766906738,
+      "learning_rate": 0.00015516398721372179,
+      "loss": 2.8421,
+      "step": 843
+    },
+    {
+      "epoch": 0.3195759182127982,
+      "grad_norm": 19.139890670776367,
+      "learning_rate": 0.00015506397251934543,
+      "loss": 2.5628,
+      "step": 844
+    },
+    {
+      "epoch": 0.31995456266565697,
+      "grad_norm": 17.884008407592773,
+      "learning_rate": 0.00015496387871463988,
+      "loss": 2.3613,
+      "step": 845
+    },
+    {
+      "epoch": 0.3203332071185157,
+      "grad_norm": 16.46691131591797,
+      "learning_rate": 0.0001548637059434099,
+      "loss": 2.4046,
+      "step": 846
+    },
+    {
+      "epoch": 0.3207118515713745,
+      "grad_norm": 16.158769607543945,
+      "learning_rate": 0.00015476345434957346,
+      "loss": 2.9732,
+      "step": 847
+    },
+    {
+      "epoch": 0.32109049602423323,
+      "grad_norm": 25.788095474243164,
+      "learning_rate": 0.00015466312407716194,
+      "loss": 3.0837,
+      "step": 848
+    },
+    {
+      "epoch": 0.321469140477092,
+      "grad_norm": 27.709606170654297,
+      "learning_rate": 0.00015456271527031966,
+      "loss": 2.3595,
+      "step": 849
+    },
+    {
+      "epoch": 0.32184778492995075,
+      "grad_norm": 27.167621612548828,
+      "learning_rate": 0.00015446222807330383,
+      "loss": 2.2286,
+      "step": 850
+    },
+    {
+      "epoch": 0.32222642938280954,
+      "grad_norm": 8.955855369567871,
+      "learning_rate": 0.00015436166263048425,
+      "loss": 4.3385,
+      "step": 851
+    },
+    {
+      "epoch": 0.32260507383566833,
+      "grad_norm": 8.619714736938477,
+      "learning_rate": 0.00015426101908634312,
+      "loss": 3.7368,
+      "step": 852
+    },
+    {
+      "epoch": 0.32298371828852707,
+      "grad_norm": 9.597879409790039,
+      "learning_rate": 0.00015416029758547493,
+      "loss": 3.8133,
+      "step": 853
+    },
+    {
+      "epoch": 0.32336236274138586,
+      "grad_norm": 10.818007469177246,
+      "learning_rate": 0.00015405949827258604,
+      "loss": 4.1761,
+      "step": 854
+    },
+    {
+      "epoch": 0.3237410071942446,
+      "grad_norm": 10.386642456054688,
+      "learning_rate": 0.00015395862129249474,
+      "loss": 3.6592,
+      "step": 855
+    },
+    {
+      "epoch": 0.3241196516471034,
+      "grad_norm": 11.960341453552246,
+      "learning_rate": 0.00015385766679013081,
+      "loss": 3.6471,
+      "step": 856
+    },
+    {
+      "epoch": 0.3244982960999621,
+      "grad_norm": 13.14782428741455,
+      "learning_rate": 0.00015375663491053545,
+      "loss": 3.9707,
+      "step": 857
+    },
+    {
+      "epoch": 0.3248769405528209,
+      "grad_norm": 12.082589149475098,
+      "learning_rate": 0.000153655525798861,
+      "loss": 3.5612,
+      "step": 858
+    },
+    {
+      "epoch": 0.3252555850056797,
+      "grad_norm": 11.448456764221191,
+      "learning_rate": 0.00015355433960037077,
+      "loss": 3.737,
+      "step": 859
+    },
+    {
+      "epoch": 0.3256342294585384,
+      "grad_norm": 12.987861633300781,
+      "learning_rate": 0.0001534530764604389,
+      "loss": 3.8811,
+      "step": 860
+    },
+    {
+      "epoch": 0.3260128739113972,
+      "grad_norm": 12.712824821472168,
+      "learning_rate": 0.00015335173652454985,
+      "loss": 3.5249,
+      "step": 861
+    },
+    {
+      "epoch": 0.32639151836425595,
+      "grad_norm": 11.121883392333984,
+      "learning_rate": 0.00015325031993829868,
+      "loss": 2.6656,
+      "step": 862
+    },
+    {
+      "epoch": 0.32677016281711474,
+      "grad_norm": 14.241087913513184,
+      "learning_rate": 0.0001531488268473904,
+      "loss": 3.9731,
+      "step": 863
+    },
+    {
+      "epoch": 0.3271488072699735,
+      "grad_norm": 13.581354141235352,
+      "learning_rate": 0.00015304725739764,
+      "loss": 3.2629,
+      "step": 864
+    },
+    {
+      "epoch": 0.32752745172283226,
+      "grad_norm": 15.62415599822998,
+      "learning_rate": 0.00015294561173497215,
+      "loss": 3.9048,
+      "step": 865
+    },
+    {
+      "epoch": 0.32790609617569105,
+      "grad_norm": 12.98635196685791,
+      "learning_rate": 0.00015284389000542103,
+      "loss": 2.6195,
+      "step": 866
+    },
+    {
+      "epoch": 0.3282847406285498,
+      "grad_norm": 15.516901016235352,
+      "learning_rate": 0.00015274209235513014,
+      "loss": 3.2572,
+      "step": 867
+    },
+    {
+      "epoch": 0.3286633850814086,
+      "grad_norm": 13.609155654907227,
+      "learning_rate": 0.00015264021893035193,
+      "loss": 2.7172,
+      "step": 868
+    },
+    {
+      "epoch": 0.3290420295342673,
+      "grad_norm": 15.977977752685547,
+      "learning_rate": 0.00015253826987744789,
+      "loss": 3.2585,
+      "step": 869
+    },
+    {
+      "epoch": 0.3294206739871261,
+      "grad_norm": 14.53819751739502,
+      "learning_rate": 0.00015243624534288803,
+      "loss": 2.9884,
+      "step": 870
+    },
+    {
+      "epoch": 0.32979931843998483,
+      "grad_norm": 18.142704010009766,
+      "learning_rate": 0.00015233414547325083,
+      "loss": 3.0888,
+      "step": 871
+    },
+    {
+      "epoch": 0.3301779628928436,
+      "grad_norm": 17.9478816986084,
+      "learning_rate": 0.00015223197041522307,
+      "loss": 2.0567,
+      "step": 872
+    },
+    {
+      "epoch": 0.33055660734570236,
+      "grad_norm": 15.515186309814453,
+      "learning_rate": 0.00015212972031559946,
+      "loss": 2.056,
+      "step": 873
+    },
+    {
+      "epoch": 0.33093525179856115,
+      "grad_norm": 20.402446746826172,
+      "learning_rate": 0.00015202739532128265,
+      "loss": 1.867,
+      "step": 874
+    },
+    {
+      "epoch": 0.33131389625141994,
+      "grad_norm": 13.986373901367188,
+      "learning_rate": 0.0001519249955792827,
+      "loss": 1.5481,
+      "step": 875
+    },
+    {
+      "epoch": 0.33169254070427867,
+      "grad_norm": 9.035808563232422,
+      "learning_rate": 0.00015182252123671725,
+      "loss": 4.4831,
+      "step": 876
+    },
+    {
+      "epoch": 0.33207118515713746,
+      "grad_norm": 9.396247863769531,
+      "learning_rate": 0.000151719972440811,
+      "loss": 4.1913,
+      "step": 877
+    },
+    {
+      "epoch": 0.3324498296099962,
+      "grad_norm": 10.645395278930664,
+      "learning_rate": 0.0001516173493388957,
+      "loss": 4.71,
+      "step": 878
+    },
+    {
+      "epoch": 0.332828474062855,
+      "grad_norm": 11.150712966918945,
+      "learning_rate": 0.00015151465207840977,
+      "loss": 4.2096,
+      "step": 879
+    },
+    {
+      "epoch": 0.3332071185157137,
+      "grad_norm": 10.260977745056152,
+      "learning_rate": 0.00015141188080689826,
+      "loss": 3.1771,
+      "step": 880
+    },
+    {
+      "epoch": 0.3335857629685725,
+      "grad_norm": 10.818496704101562,
+      "learning_rate": 0.00015130903567201243,
+      "loss": 2.9112,
+      "step": 881
+    },
+    {
+      "epoch": 0.3339644074214313,
+      "grad_norm": 11.379049301147461,
+      "learning_rate": 0.0001512061168215098,
+      "loss": 3.8058,
+      "step": 882
+    },
+    {
+      "epoch": 0.33434305187429003,
+      "grad_norm": 12.107205390930176,
+      "learning_rate": 0.00015110312440325368,
+      "loss": 3.271,
+      "step": 883
+    },
+    {
+      "epoch": 0.3347216963271488,
+      "grad_norm": 12.379898071289062,
+      "learning_rate": 0.0001510000585652132,
+      "loss": 2.992,
+      "step": 884
+    },
+    {
+      "epoch": 0.33510034078000756,
+      "grad_norm": 11.953714370727539,
+      "learning_rate": 0.00015089691945546283,
+      "loss": 3.1566,
+      "step": 885
+    },
+    {
+      "epoch": 0.33547898523286634,
+      "grad_norm": 13.055462837219238,
+      "learning_rate": 0.00015079370722218243,
+      "loss": 2.5646,
+      "step": 886
+    },
+    {
+      "epoch": 0.3358576296857251,
+      "grad_norm": 12.182693481445312,
+      "learning_rate": 0.00015069042201365683,
+      "loss": 2.9366,
+      "step": 887
+    },
+    {
+      "epoch": 0.33623627413858387,
+      "grad_norm": 13.180964469909668,
+      "learning_rate": 0.00015058706397827573,
+      "loss": 4.0075,
+      "step": 888
+    },
+    {
+      "epoch": 0.33661491859144266,
+      "grad_norm": 12.289628982543945,
+      "learning_rate": 0.0001504836332645335,
+      "loss": 2.5069,
+      "step": 889
+    },
+    {
+      "epoch": 0.3369935630443014,
+      "grad_norm": 11.804617881774902,
+      "learning_rate": 0.00015038013002102892,
+      "loss": 2.0101,
+      "step": 890
+    },
+    {
+      "epoch": 0.3373722074971602,
+      "grad_norm": 14.811490058898926,
+      "learning_rate": 0.00015027655439646488,
+      "loss": 3.8222,
+      "step": 891
+    },
+    {
+      "epoch": 0.3377508519500189,
+      "grad_norm": 15.269726753234863,
+      "learning_rate": 0.00015017290653964835,
+      "loss": 2.9604,
+      "step": 892
+    },
+    {
+      "epoch": 0.3381294964028777,
+      "grad_norm": 13.442567825317383,
+      "learning_rate": 0.0001500691865994901,
+      "loss": 3.0957,
+      "step": 893
+    },
+    {
+      "epoch": 0.33850814085573644,
+      "grad_norm": 15.218294143676758,
+      "learning_rate": 0.00014996539472500437,
+      "loss": 2.7899,
+      "step": 894
+    },
+    {
+      "epoch": 0.33888678530859523,
+      "grad_norm": 13.601509094238281,
+      "learning_rate": 0.00014986153106530883,
+      "loss": 2.6892,
+      "step": 895
+    },
+    {
+      "epoch": 0.339265429761454,
+      "grad_norm": 13.653763771057129,
+      "learning_rate": 0.00014975759576962424,
+      "loss": 2.2024,
+      "step": 896
+    },
+    {
+      "epoch": 0.33964407421431275,
+      "grad_norm": 17.351696014404297,
+      "learning_rate": 0.00014965358898727423,
+      "loss": 2.777,
+      "step": 897
+    },
+    {
+      "epoch": 0.34002271866717154,
+      "grad_norm": 32.49483108520508,
+      "learning_rate": 0.00014954951086768525,
+      "loss": 2.3369,
+      "step": 898
+    },
+    {
+      "epoch": 0.3404013631200303,
+      "grad_norm": 37.68558120727539,
+      "learning_rate": 0.0001494453615603862,
+      "loss": 3.0663,
+      "step": 899
+    },
+    {
+      "epoch": 0.34078000757288907,
+      "grad_norm": 27.460304260253906,
+      "learning_rate": 0.00014934114121500818,
+      "loss": 2.0837,
+      "step": 900
+    },
+    {
+      "epoch": 0.3411586520257478,
+      "grad_norm": 19.336395263671875,
+      "learning_rate": 0.00014923684998128446,
+      "loss": 4.6271,
+      "step": 901
+    },
+    {
+      "epoch": 0.3415372964786066,
+      "grad_norm": 11.99440860748291,
+      "learning_rate": 0.00014913248800905006,
+      "loss": 4.4893,
+      "step": 902
+    },
+    {
+      "epoch": 0.3419159409314654,
+      "grad_norm": 10.598093032836914,
+      "learning_rate": 0.00014902805544824175,
+      "loss": 3.813,
+      "step": 903
+    },
+    {
+      "epoch": 0.3422945853843241,
+      "grad_norm": 10.407685279846191,
+      "learning_rate": 0.00014892355244889752,
+      "loss": 4.3924,
+      "step": 904
+    },
+    {
+      "epoch": 0.3426732298371829,
+      "grad_norm": 11.969062805175781,
+      "learning_rate": 0.0001488189791611568,
+      "loss": 3.9199,
+      "step": 905
+    },
+    {
+      "epoch": 0.34305187429004164,
+      "grad_norm": 10.909595489501953,
+      "learning_rate": 0.00014871433573525976,
+      "loss": 3.5213,
+      "step": 906
+    },
+    {
+      "epoch": 0.3434305187429004,
+      "grad_norm": 11.326231956481934,
+      "learning_rate": 0.00014860962232154755,
+      "loss": 3.2244,
+      "step": 907
+    },
+    {
+      "epoch": 0.34380916319575916,
+      "grad_norm": 12.978373527526855,
+      "learning_rate": 0.00014850483907046175,
+      "loss": 4.087,
+      "step": 908
+    },
+    {
+      "epoch": 0.34418780764861795,
+      "grad_norm": 13.51850414276123,
+      "learning_rate": 0.00014839998613254432,
+      "loss": 3.7443,
+      "step": 909
+    },
+    {
+      "epoch": 0.34456645210147674,
+      "grad_norm": 13.952939987182617,
+      "learning_rate": 0.00014829506365843725,
+      "loss": 4.2233,
+      "step": 910
+    },
+    {
+      "epoch": 0.3449450965543355,
+      "grad_norm": 14.313178062438965,
+      "learning_rate": 0.00014819007179888262,
+      "loss": 3.744,
+      "step": 911
+    },
+    {
+      "epoch": 0.34532374100719426,
+      "grad_norm": 13.837858200073242,
+      "learning_rate": 0.000148085010704722,
+      "loss": 3.4982,
+      "step": 912
+    },
+    {
+      "epoch": 0.345702385460053,
+      "grad_norm": 12.670626640319824,
+      "learning_rate": 0.0001479798805268965,
+      "loss": 2.5508,
+      "step": 913
+    },
+    {
+      "epoch": 0.3460810299129118,
+      "grad_norm": 14.74666976928711,
+      "learning_rate": 0.00014787468141644658,
+      "loss": 3.6456,
+      "step": 914
+    },
+    {
+      "epoch": 0.3464596743657705,
+      "grad_norm": 14.362848281860352,
+      "learning_rate": 0.0001477694135245116,
+      "loss": 3.3422,
+      "step": 915
+    },
+    {
+      "epoch": 0.3468383188186293,
+      "grad_norm": 12.029289245605469,
+      "learning_rate": 0.00014766407700232974,
+      "loss": 2.7627,
+      "step": 916
+    },
+    {
+      "epoch": 0.3472169632714881,
+      "grad_norm": 13.28024673461914,
+      "learning_rate": 0.00014755867200123789,
+      "loss": 2.4415,
+      "step": 917
+    },
+    {
+      "epoch": 0.34759560772434683,
+      "grad_norm": 16.25495719909668,
+      "learning_rate": 0.00014745319867267122,
+      "loss": 3.8264,
+      "step": 918
+    },
+    {
+      "epoch": 0.3479742521772056,
+      "grad_norm": 14.264103889465332,
+      "learning_rate": 0.00014734765716816316,
+      "loss": 2.3678,
+      "step": 919
+    },
+    {
+      "epoch": 0.34835289663006436,
+      "grad_norm": 16.4278507232666,
+      "learning_rate": 0.00014724204763934498,
+      "loss": 3.2339,
+      "step": 920
+    },
+    {
+      "epoch": 0.34873154108292315,
+      "grad_norm": 12.346698760986328,
+      "learning_rate": 0.0001471363702379458,
+      "loss": 2.3282,
+      "step": 921
+    },
+    {
+      "epoch": 0.3491101855357819,
+      "grad_norm": 16.423734664916992,
+      "learning_rate": 0.00014703062511579212,
+      "loss": 2.2432,
+      "step": 922
+    },
+    {
+      "epoch": 0.34948882998864067,
+      "grad_norm": 36.795833587646484,
+      "learning_rate": 0.00014692481242480784,
+      "loss": 2.8118,
+      "step": 923
+    },
+    {
+      "epoch": 0.34986747444149946,
+      "grad_norm": 22.425527572631836,
+      "learning_rate": 0.0001468189323170139,
+      "loss": 2.0988,
+      "step": 924
+    },
+    {
+      "epoch": 0.3502461188943582,
+      "grad_norm": 21.815776824951172,
+      "learning_rate": 0.00014671298494452808,
+      "loss": 2.1386,
+      "step": 925
+    },
+    {
+      "epoch": 0.350624763347217,
+      "grad_norm": 9.949638366699219,
+      "learning_rate": 0.0001466069704595648,
+      "loss": 4.477,
+      "step": 926
+    },
+    {
+      "epoch": 0.3510034078000757,
+      "grad_norm": 10.098043441772461,
+      "learning_rate": 0.000146500889014435,
+      "loss": 3.9642,
+      "step": 927
+    },
+    {
+      "epoch": 0.3513820522529345,
+      "grad_norm": 9.761126518249512,
+      "learning_rate": 0.00014639474076154566,
+      "loss": 3.7614,
+      "step": 928
+    },
+    {
+      "epoch": 0.35176069670579324,
+      "grad_norm": 11.026824951171875,
+      "learning_rate": 0.00014628852585339984,
+      "loss": 4.2254,
+      "step": 929
+    },
+    {
+      "epoch": 0.35213934115865203,
+      "grad_norm": 11.74862289428711,
+      "learning_rate": 0.00014618224444259628,
+      "loss": 3.1092,
+      "step": 930
+    },
+    {
+      "epoch": 0.35251798561151076,
+      "grad_norm": 10.165847778320312,
+      "learning_rate": 0.00014607589668182947,
+      "loss": 2.6807,
+      "step": 931
+    },
+    {
+      "epoch": 0.35289663006436955,
+      "grad_norm": 12.149169921875,
+      "learning_rate": 0.00014596948272388896,
+      "loss": 2.9791,
+      "step": 932
+    },
+    {
+      "epoch": 0.35327527451722834,
+      "grad_norm": 12.490134239196777,
+      "learning_rate": 0.0001458630027216596,
+      "loss": 3.9789,
+      "step": 933
+    },
+    {
+      "epoch": 0.3536539189700871,
+      "grad_norm": 13.850975036621094,
+      "learning_rate": 0.000145756456828121,
+      "loss": 3.4066,
+      "step": 934
+    },
+    {
+      "epoch": 0.35403256342294587,
+      "grad_norm": 15.180842399597168,
+      "learning_rate": 0.00014564984519634754,
+      "loss": 3.2428,
+      "step": 935
+    },
+    {
+      "epoch": 0.3544112078758046,
+      "grad_norm": 13.27072811126709,
+      "learning_rate": 0.00014554316797950797,
+      "loss": 2.6158,
+      "step": 936
+    },
+    {
+      "epoch": 0.3547898523286634,
+      "grad_norm": 12.887181282043457,
+      "learning_rate": 0.0001454364253308653,
+      "loss": 3.6556,
+      "step": 937
+    },
+    {
+      "epoch": 0.3551684967815221,
+      "grad_norm": 14.38553237915039,
+      "learning_rate": 0.00014532961740377652,
+      "loss": 3.6761,
+      "step": 938
+    },
+    {
+      "epoch": 0.3555471412343809,
+      "grad_norm": 13.48885726928711,
+      "learning_rate": 0.00014522274435169245,
+      "loss": 2.8547,
+      "step": 939
+    },
+    {
+      "epoch": 0.3559257856872397,
+      "grad_norm": 12.696219444274902,
+      "learning_rate": 0.00014511580632815742,
+      "loss": 2.4686,
+      "step": 940
+    },
+    {
+      "epoch": 0.35630443014009844,
+      "grad_norm": 12.52086067199707,
+      "learning_rate": 0.00014500880348680917,
+      "loss": 3.3242,
+      "step": 941
+    },
+    {
+      "epoch": 0.3566830745929572,
+      "grad_norm": 13.25282096862793,
+      "learning_rate": 0.00014490173598137845,
+      "loss": 2.3792,
+      "step": 942
+    },
+    {
+      "epoch": 0.35706171904581596,
+      "grad_norm": 12.935431480407715,
+      "learning_rate": 0.0001447946039656891,
+      "loss": 2.1999,
+      "step": 943
+    },
+    {
+      "epoch": 0.35744036349867475,
+      "grad_norm": 13.861615180969238,
+      "learning_rate": 0.00014468740759365743,
+      "loss": 2.7313,
+      "step": 944
+    },
+    {
+      "epoch": 0.3578190079515335,
+      "grad_norm": 15.322652816772461,
+      "learning_rate": 0.00014458014701929239,
+      "loss": 2.6993,
+      "step": 945
+    },
+    {
+      "epoch": 0.3581976524043923,
+      "grad_norm": 15.554706573486328,
+      "learning_rate": 0.00014447282239669502,
+      "loss": 2.1881,
+      "step": 946
+    },
+    {
+      "epoch": 0.35857629685725106,
+      "grad_norm": 15.744156837463379,
+      "learning_rate": 0.0001443654338800585,
+      "loss": 3.1557,
+      "step": 947
+    },
+    {
+      "epoch": 0.3589549413101098,
+      "grad_norm": 15.191664695739746,
+      "learning_rate": 0.00014425798162366775,
+      "loss": 2.1443,
+      "step": 948
+    },
+    {
+      "epoch": 0.3593335857629686,
+      "grad_norm": 16.317235946655273,
+      "learning_rate": 0.00014415046578189928,
+      "loss": 1.921,
+      "step": 949
+    },
+    {
+      "epoch": 0.3597122302158273,
+      "grad_norm": 35.329994201660156,
+      "learning_rate": 0.0001440428865092209,
+      "loss": 3.1096,
+      "step": 950
+    },
+    {
+      "epoch": 0.3600908746686861,
+      "grad_norm": 9.379858016967773,
+      "learning_rate": 0.0001439352439601916,
+      "loss": 4.936,
+      "step": 951
+    },
+    {
+      "epoch": 0.36046951912154485,
+      "grad_norm": 10.979476928710938,
+      "learning_rate": 0.0001438275382894613,
+      "loss": 3.8354,
+      "step": 952
+    },
+    {
+      "epoch": 0.36084816357440364,
+      "grad_norm": 10.961803436279297,
+      "learning_rate": 0.00014371976965177062,
+      "loss": 3.6228,
+      "step": 953
+    },
+    {
+      "epoch": 0.3612268080272624,
+      "grad_norm": 11.461506843566895,
+      "learning_rate": 0.00014361193820195046,
+      "loss": 4.6714,
+      "step": 954
+    },
+    {
+      "epoch": 0.36160545248012116,
+      "grad_norm": 11.015750885009766,
+      "learning_rate": 0.0001435040440949223,
+      "loss": 3.3826,
+      "step": 955
+    },
+    {
+      "epoch": 0.36198409693297995,
+      "grad_norm": 10.362982749938965,
+      "learning_rate": 0.0001433960874856973,
+      "loss": 3.1965,
+      "step": 956
+    },
+    {
+      "epoch": 0.3623627413858387,
+      "grad_norm": 11.998297691345215,
+      "learning_rate": 0.0001432880685293766,
+      "loss": 3.4358,
+      "step": 957
+    },
+    {
+      "epoch": 0.36274138583869747,
+      "grad_norm": 12.979171752929688,
+      "learning_rate": 0.00014317998738115091,
+      "loss": 2.9082,
+      "step": 958
+    },
+    {
+      "epoch": 0.3631200302915562,
+      "grad_norm": 15.333057403564453,
+      "learning_rate": 0.00014307184419630028,
+      "loss": 3.7046,
+      "step": 959
+    },
+    {
+      "epoch": 0.363498674744415,
+      "grad_norm": 17.005517959594727,
+      "learning_rate": 0.0001429636391301938,
+      "loss": 4.5541,
+      "step": 960
+    },
+    {
+      "epoch": 0.3638773191972738,
+      "grad_norm": 12.545903205871582,
+      "learning_rate": 0.00014285537233828954,
+      "loss": 3.2909,
+      "step": 961
+    },
+    {
+      "epoch": 0.3642559636501325,
+      "grad_norm": 13.042165756225586,
+      "learning_rate": 0.00014274704397613426,
+      "loss": 3.3752,
+      "step": 962
+    },
+    {
+      "epoch": 0.3646346081029913,
+      "grad_norm": 13.057799339294434,
+      "learning_rate": 0.00014263865419936316,
+      "loss": 2.7918,
+      "step": 963
+    },
+    {
+      "epoch": 0.36501325255585004,
+      "grad_norm": 13.173884391784668,
+      "learning_rate": 0.00014253020316369968,
+      "loss": 3.1801,
+      "step": 964
+    },
+    {
+      "epoch": 0.36539189700870883,
+      "grad_norm": 13.131632804870605,
+      "learning_rate": 0.00014242169102495527,
+      "loss": 3.3128,
+      "step": 965
+    },
+    {
+      "epoch": 0.36577054146156757,
+      "grad_norm": 13.377184867858887,
+      "learning_rate": 0.0001423131179390291,
+      "loss": 2.649,
+      "step": 966
+    },
+    {
+      "epoch": 0.36614918591442636,
+      "grad_norm": 12.528219223022461,
+      "learning_rate": 0.00014220448406190807,
+      "loss": 3.169,
+      "step": 967
+    },
+    {
+      "epoch": 0.36652783036728515,
+      "grad_norm": 13.746808052062988,
+      "learning_rate": 0.0001420957895496662,
+      "loss": 2.7259,
+      "step": 968
+    },
+    {
+      "epoch": 0.3669064748201439,
+      "grad_norm": 21.110719680786133,
+      "learning_rate": 0.00014198703455846484,
+      "loss": 3.5514,
+      "step": 969
+    },
+    {
+      "epoch": 0.36728511927300267,
+      "grad_norm": 12.612068176269531,
+      "learning_rate": 0.00014187821924455208,
+      "loss": 2.0534,
+      "step": 970
+    },
+    {
+      "epoch": 0.3676637637258614,
+      "grad_norm": 20.146154403686523,
+      "learning_rate": 0.0001417693437642627,
+      "loss": 2.7005,
+      "step": 971
+    },
+    {
+      "epoch": 0.3680424081787202,
+      "grad_norm": 13.088459014892578,
+      "learning_rate": 0.00014166040827401797,
+      "loss": 1.9876,
+      "step": 972
+    },
+    {
+      "epoch": 0.3684210526315789,
+      "grad_norm": 18.44115447998047,
+      "learning_rate": 0.00014155141293032536,
+      "loss": 1.6056,
+      "step": 973
+    },
+    {
+      "epoch": 0.3687996970844377,
+      "grad_norm": 17.64615249633789,
+      "learning_rate": 0.0001414423578897783,
+      "loss": 1.8792,
+      "step": 974
+    },
+    {
+      "epoch": 0.3691783415372965,
+      "grad_norm": 17.28131103515625,
+      "learning_rate": 0.00014133324330905603,
+      "loss": 1.3712,
+      "step": 975
+    },
+    {
+      "epoch": 0.36955698599015524,
+      "grad_norm": 9.265350341796875,
+      "learning_rate": 0.0001412240693449233,
+      "loss": 3.5385,
+      "step": 976
+    },
+    {
+      "epoch": 0.36993563044301403,
+      "grad_norm": 10.460281372070312,
+      "learning_rate": 0.00014111483615423018,
+      "loss": 3.5476,
+      "step": 977
+    },
+    {
+      "epoch": 0.37031427489587276,
+      "grad_norm": 12.234232902526855,
+      "learning_rate": 0.00014100554389391182,
+      "loss": 5.028,
+      "step": 978
+    },
+    {
+      "epoch": 0.37069291934873155,
+      "grad_norm": 12.799249649047852,
+      "learning_rate": 0.0001408961927209883,
+      "loss": 4.389,
+      "step": 979
+    },
+    {
+      "epoch": 0.3710715638015903,
+      "grad_norm": 10.977117538452148,
+      "learning_rate": 0.00014078678279256423,
+      "loss": 3.5701,
+      "step": 980
+    },
+    {
+      "epoch": 0.3714502082544491,
+      "grad_norm": 11.370275497436523,
+      "learning_rate": 0.00014067731426582877,
+      "loss": 3.4377,
+      "step": 981
+    },
+    {
+      "epoch": 0.3718288527073078,
+      "grad_norm": 10.520308494567871,
+      "learning_rate": 0.00014056778729805512,
+      "loss": 3.1299,
+      "step": 982
+    },
+    {
+      "epoch": 0.3722074971601666,
+      "grad_norm": 12.40962028503418,
+      "learning_rate": 0.00014045820204660055,
+      "loss": 3.2693,
+      "step": 983
+    },
+    {
+      "epoch": 0.3725861416130254,
+      "grad_norm": 11.964371681213379,
+      "learning_rate": 0.00014034855866890602,
+      "loss": 3.8952,
+      "step": 984
+    },
+    {
+      "epoch": 0.3729647860658841,
+      "grad_norm": 12.887282371520996,
+      "learning_rate": 0.000140238857322496,
+      "loss": 2.8382,
+      "step": 985
+    },
+    {
+      "epoch": 0.3733434305187429,
+      "grad_norm": 12.985127449035645,
+      "learning_rate": 0.0001401290981649783,
+      "loss": 3.4678,
+      "step": 986
+    },
+    {
+      "epoch": 0.37372207497160165,
+      "grad_norm": 14.884915351867676,
+      "learning_rate": 0.0001400192813540437,
+      "loss": 3.6069,
+      "step": 987
+    },
+    {
+      "epoch": 0.37410071942446044,
+      "grad_norm": 14.4747953414917,
+      "learning_rate": 0.00013990940704746585,
+      "loss": 2.9554,
+      "step": 988
+    },
+    {
+      "epoch": 0.37447936387731917,
+      "grad_norm": 14.479387283325195,
+      "learning_rate": 0.00013979947540310102,
+      "loss": 2.7698,
+      "step": 989
+    },
+    {
+      "epoch": 0.37485800833017796,
+      "grad_norm": 14.599347114562988,
+      "learning_rate": 0.00013968948657888788,
+      "loss": 2.87,
+      "step": 990
+    },
+    {
+      "epoch": 0.37523665278303675,
+      "grad_norm": 14.958257675170898,
+      "learning_rate": 0.00013957944073284714,
+      "loss": 2.8528,
+      "step": 991
+    },
+    {
+      "epoch": 0.3756152972358955,
+      "grad_norm": 15.495623588562012,
+      "learning_rate": 0.00013946933802308156,
+      "loss": 3.7293,
+      "step": 992
+    },
+    {
+      "epoch": 0.3759939416887543,
+      "grad_norm": 10.040778160095215,
+      "learning_rate": 0.00013935917860777555,
+      "loss": 1.5618,
+      "step": 993
+    },
+    {
+      "epoch": 0.376372586141613,
+      "grad_norm": 15.657940864562988,
+      "learning_rate": 0.00013924896264519491,
+      "loss": 2.2425,
+      "step": 994
+    },
+    {
+      "epoch": 0.3767512305944718,
+      "grad_norm": 13.899797439575195,
+      "learning_rate": 0.00013913869029368682,
+      "loss": 2.3471,
+      "step": 995
+    },
+    {
+      "epoch": 0.37712987504733053,
+      "grad_norm": 14.696097373962402,
+      "learning_rate": 0.00013902836171167938,
+      "loss": 2.637,
+      "step": 996
+    },
+    {
+      "epoch": 0.3775085195001893,
+      "grad_norm": 18.564838409423828,
+      "learning_rate": 0.00013891797705768155,
+      "loss": 1.3815,
+      "step": 997
+    },
+    {
+      "epoch": 0.3778871639530481,
+      "grad_norm": 17.13040542602539,
+      "learning_rate": 0.00013880753649028274,
+      "loss": 2.0306,
+      "step": 998
+    },
+    {
+      "epoch": 0.37826580840590684,
+      "grad_norm": 22.14106559753418,
+      "learning_rate": 0.00013869704016815276,
+      "loss": 2.567,
+      "step": 999
+    },
+    {
+      "epoch": 0.37864445285876563,
+      "grad_norm": 20.40251922607422,
+      "learning_rate": 0.00013858648825004156,
+      "loss": 2.1573,
+      "step": 1000
+    },
+    {
+      "epoch": 0.37902309731162437,
+      "grad_norm": 9.879082679748535,
+      "learning_rate": 0.00013847588089477888,
+      "loss": 5.068,
+      "step": 1001
+    },
+    {
+      "epoch": 0.37940174176448316,
+      "grad_norm": 9.758732795715332,
+      "learning_rate": 0.00013836521826127412,
+      "loss": 3.3331,
+      "step": 1002
+    },
+    {
+      "epoch": 0.3797803862173419,
+      "grad_norm": 10.761160850524902,
+      "learning_rate": 0.00013825450050851623,
+      "loss": 3.4942,
+      "step": 1003
+    },
+    {
+      "epoch": 0.3801590306702007,
+      "grad_norm": 10.23125171661377,
+      "learning_rate": 0.00013814372779557312,
+      "loss": 3.689,
+      "step": 1004
+    },
+    {
+      "epoch": 0.38053767512305947,
+      "grad_norm": 12.541388511657715,
+      "learning_rate": 0.00013803290028159185,
+      "loss": 4.2033,
+      "step": 1005
+    },
+    {
+      "epoch": 0.3809163195759182,
+      "grad_norm": 10.713353157043457,
+      "learning_rate": 0.00013792201812579816,
+      "loss": 3.4712,
+      "step": 1006
+    },
+    {
+      "epoch": 0.381294964028777,
+      "grad_norm": 11.47879695892334,
+      "learning_rate": 0.00013781108148749625,
+      "loss": 3.4701,
+      "step": 1007
+    },
+    {
+      "epoch": 0.38167360848163573,
+      "grad_norm": 10.18622875213623,
+      "learning_rate": 0.00013770009052606862,
+      "loss": 2.702,
+      "step": 1008
+    },
+    {
+      "epoch": 0.3820522529344945,
+      "grad_norm": 15.202455520629883,
+      "learning_rate": 0.00013758904540097587,
+      "loss": 2.9407,
+      "step": 1009
+    },
+    {
+      "epoch": 0.38243089738735325,
+      "grad_norm": 13.018632888793945,
+      "learning_rate": 0.00013747794627175632,
+      "loss": 3.8735,
+      "step": 1010
+    },
+    {
+      "epoch": 0.38280954184021204,
+      "grad_norm": 13.541316986083984,
+      "learning_rate": 0.00013736679329802594,
+      "loss": 2.2223,
+      "step": 1011
+    },
+    {
+      "epoch": 0.38318818629307083,
+      "grad_norm": 14.50750732421875,
+      "learning_rate": 0.00013725558663947807,
+      "loss": 3.7973,
+      "step": 1012
+    },
+    {
+      "epoch": 0.38356683074592957,
+      "grad_norm": 16.93392562866211,
+      "learning_rate": 0.00013714432645588312,
+      "loss": 4.062,
+      "step": 1013
+    },
+    {
+      "epoch": 0.38394547519878836,
+      "grad_norm": 13.330880165100098,
+      "learning_rate": 0.00013703301290708843,
+      "loss": 2.7007,
+      "step": 1014
+    },
+    {
+      "epoch": 0.3843241196516471,
+      "grad_norm": 13.55959701538086,
+      "learning_rate": 0.00013692164615301808,
+      "loss": 3.2762,
+      "step": 1015
+    },
+    {
+      "epoch": 0.3847027641045059,
+      "grad_norm": 13.232234001159668,
+      "learning_rate": 0.00013681022635367245,
+      "loss": 2.4535,
+      "step": 1016
+    },
+    {
+      "epoch": 0.3850814085573646,
+      "grad_norm": 15.532430648803711,
+      "learning_rate": 0.00013669875366912823,
+      "loss": 2.5774,
+      "step": 1017
+    },
+    {
+      "epoch": 0.3854600530102234,
+      "grad_norm": 15.254117965698242,
+      "learning_rate": 0.00013658722825953806,
+      "loss": 2.6327,
+      "step": 1018
+    },
+    {
+      "epoch": 0.3858386974630822,
+      "grad_norm": 16.092777252197266,
+      "learning_rate": 0.00013647565028513037,
+      "loss": 2.2312,
+      "step": 1019
+    },
+    {
+      "epoch": 0.3862173419159409,
+      "grad_norm": 16.16512107849121,
+      "learning_rate": 0.00013636401990620896,
+      "loss": 2.8618,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3865959863687997,
+      "grad_norm": 12.459589958190918,
+      "learning_rate": 0.00013625233728315318,
+      "loss": 2.3862,
+      "step": 1021
+    },
+    {
+      "epoch": 0.38697463082165845,
+      "grad_norm": 14.847145080566406,
+      "learning_rate": 0.0001361406025764172,
+      "loss": 1.8623,
+      "step": 1022
+    },
+    {
+      "epoch": 0.38735327527451724,
+      "grad_norm": 16.524620056152344,
+      "learning_rate": 0.00013602881594653016,
+      "loss": 1.6795,
+      "step": 1023
+    },
+    {
+      "epoch": 0.387731919727376,
+      "grad_norm": 24.25473976135254,
+      "learning_rate": 0.00013591697755409573,
+      "loss": 3.2906,
+      "step": 1024
+    },
+    {
+      "epoch": 0.38811056418023476,
+      "grad_norm": 27.711610794067383,
+      "learning_rate": 0.0001358050875597919,
+      "loss": 1.9261,
+      "step": 1025
+    },
+    {
+      "epoch": 0.38848920863309355,
+      "grad_norm": 9.252058029174805,
+      "learning_rate": 0.00013569314612437098,
+      "loss": 4.3016,
+      "step": 1026
+    },
+    {
+      "epoch": 0.3888678530859523,
+      "grad_norm": 10.211181640625,
+      "learning_rate": 0.00013558115340865897,
+      "loss": 4.699,
+      "step": 1027
+    },
+    {
+      "epoch": 0.3892464975388111,
+      "grad_norm": 9.520386695861816,
+      "learning_rate": 0.0001354691095735557,
+      "loss": 3.8331,
+      "step": 1028
+    },
+    {
+      "epoch": 0.3896251419916698,
+      "grad_norm": 11.07938289642334,
+      "learning_rate": 0.00013535701478003439,
+      "loss": 2.8687,
+      "step": 1029
+    },
+    {
+      "epoch": 0.3900037864445286,
+      "grad_norm": 11.311447143554688,
+      "learning_rate": 0.0001352448691891414,
+      "loss": 3.0744,
+      "step": 1030
+    },
+    {
+      "epoch": 0.39038243089738733,
+      "grad_norm": 11.157035827636719,
+      "learning_rate": 0.00013513267296199618,
+      "loss": 3.2869,
+      "step": 1031
+    },
+    {
+      "epoch": 0.3907610753502461,
+      "grad_norm": 10.669620513916016,
+      "learning_rate": 0.0001350204262597909,
+      "loss": 3.5071,
+      "step": 1032
+    },
+    {
+      "epoch": 0.39113971980310486,
+      "grad_norm": 11.097278594970703,
+      "learning_rate": 0.00013490812924379022,
+      "loss": 2.2786,
+      "step": 1033
+    },
+    {
+      "epoch": 0.39151836425596365,
+      "grad_norm": 12.581409454345703,
+      "learning_rate": 0.0001347957820753311,
+      "loss": 3.7062,
+      "step": 1034
+    },
+    {
+      "epoch": 0.39189700870882244,
+      "grad_norm": 10.81505298614502,
+      "learning_rate": 0.00013468338491582252,
+      "loss": 3.2618,
+      "step": 1035
+    },
+    {
+      "epoch": 0.39227565316168117,
+      "grad_norm": 12.934078216552734,
+      "learning_rate": 0.00013457093792674537,
+      "loss": 3.0491,
+      "step": 1036
+    },
+    {
+      "epoch": 0.39265429761453996,
+      "grad_norm": 12.945857048034668,
+      "learning_rate": 0.00013445844126965206,
+      "loss": 2.3032,
+      "step": 1037
+    },
+    {
+      "epoch": 0.3930329420673987,
+      "grad_norm": 13.578465461730957,
+      "learning_rate": 0.00013434589510616634,
+      "loss": 2.4166,
+      "step": 1038
+    },
+    {
+      "epoch": 0.3934115865202575,
+      "grad_norm": 15.570049285888672,
+      "learning_rate": 0.00013423329959798315,
+      "loss": 3.2948,
+      "step": 1039
+    },
+    {
+      "epoch": 0.3937902309731162,
+      "grad_norm": 15.420329093933105,
+      "learning_rate": 0.0001341206549068683,
+      "loss": 3.0431,
+      "step": 1040
+    },
+    {
+      "epoch": 0.394168875425975,
+      "grad_norm": 16.096786499023438,
+      "learning_rate": 0.00013400796119465824,
+      "loss": 2.3038,
+      "step": 1041
+    },
+    {
+      "epoch": 0.3945475198788338,
+      "grad_norm": 15.989263534545898,
+      "learning_rate": 0.00013389521862325985,
+      "loss": 3.3304,
+      "step": 1042
+    },
+    {
+      "epoch": 0.39492616433169253,
+      "grad_norm": 16.243566513061523,
+      "learning_rate": 0.00013378242735465022,
+      "loss": 4.0894,
+      "step": 1043
+    },
+    {
+      "epoch": 0.3953048087845513,
+      "grad_norm": 13.244513511657715,
+      "learning_rate": 0.00013366958755087644,
+      "loss": 2.5639,
+      "step": 1044
+    },
+    {
+      "epoch": 0.39568345323741005,
+      "grad_norm": 13.560445785522461,
+      "learning_rate": 0.00013355669937405526,
+      "loss": 2.6478,
+      "step": 1045
+    },
+    {
+      "epoch": 0.39606209769026884,
+      "grad_norm": 19.593982696533203,
+      "learning_rate": 0.00013344376298637294,
+      "loss": 2.9598,
+      "step": 1046
+    },
+    {
+      "epoch": 0.3964407421431276,
+      "grad_norm": 13.61347770690918,
+      "learning_rate": 0.00013333077855008508,
+      "loss": 2.0055,
+      "step": 1047
+    },
+    {
+      "epoch": 0.39681938659598637,
+      "grad_norm": 14.087455749511719,
+      "learning_rate": 0.00013321774622751618,
+      "loss": 2.1689,
+      "step": 1048
+    },
+    {
+      "epoch": 0.39719803104884516,
+      "grad_norm": 20.69855499267578,
+      "learning_rate": 0.0001331046661810597,
+      "loss": 1.4113,
+      "step": 1049
+    },
+    {
+      "epoch": 0.3975766755017039,
+      "grad_norm": 34.63194274902344,
+      "learning_rate": 0.00013299153857317748,
+      "loss": 2.1471,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3979553199545627,
+      "grad_norm": 8.219612121582031,
+      "learning_rate": 0.0001328783635663999,
+      "loss": 3.5702,
+      "step": 1051
+    },
+    {
+      "epoch": 0.3983339644074214,
+      "grad_norm": 9.948872566223145,
+      "learning_rate": 0.00013276514132332521,
+      "loss": 3.3578,
+      "step": 1052
+    },
+    {
+      "epoch": 0.3987126088602802,
+      "grad_norm": 11.182106018066406,
+      "learning_rate": 0.00013265187200661976,
+      "loss": 3.9353,
+      "step": 1053
+    },
+    {
+      "epoch": 0.39909125331313894,
+      "grad_norm": 12.669611930847168,
+      "learning_rate": 0.00013253855577901732,
+      "loss": 3.9309,
+      "step": 1054
+    },
+    {
+      "epoch": 0.39946989776599773,
+      "grad_norm": 12.40208625793457,
+      "learning_rate": 0.0001324251928033192,
+      "loss": 3.6691,
+      "step": 1055
+    },
+    {
+      "epoch": 0.3998485422188565,
+      "grad_norm": 10.716998100280762,
+      "learning_rate": 0.00013231178324239377,
+      "loss": 3.3575,
+      "step": 1056
+    },
+    {
+      "epoch": 0.40022718667171525,
+      "grad_norm": 12.901732444763184,
+      "learning_rate": 0.00013219832725917645,
+      "loss": 3.5777,
+      "step": 1057
+    },
+    {
+      "epoch": 0.40060583112457404,
+      "grad_norm": 11.288579940795898,
+      "learning_rate": 0.00013208482501666924,
+      "loss": 2.7736,
+      "step": 1058
+    },
+    {
+      "epoch": 0.4009844755774328,
+      "grad_norm": 11.336037635803223,
+      "learning_rate": 0.00013197127667794066,
+      "loss": 3.0309,
+      "step": 1059
+    },
+    {
+      "epoch": 0.40136312003029156,
+      "grad_norm": 12.790970802307129,
+      "learning_rate": 0.00013185768240612543,
+      "loss": 2.9778,
+      "step": 1060
+    },
+    {
+      "epoch": 0.4017417644831503,
+      "grad_norm": 10.891714096069336,
+      "learning_rate": 0.0001317440423644243,
+      "loss": 2.77,
+      "step": 1061
+    },
+    {
+      "epoch": 0.4021204089360091,
+      "grad_norm": 14.804855346679688,
+      "learning_rate": 0.00013163035671610374,
+      "loss": 2.9571,
+      "step": 1062
+    },
+    {
+      "epoch": 0.4024990533888679,
+      "grad_norm": 16.438711166381836,
+      "learning_rate": 0.00013151662562449576,
+      "loss": 3.4882,
+      "step": 1063
+    },
+    {
+      "epoch": 0.4028776978417266,
+      "grad_norm": 13.646224975585938,
+      "learning_rate": 0.00013140284925299762,
+      "loss": 3.3764,
+      "step": 1064
+    },
+    {
+      "epoch": 0.4032563422945854,
+      "grad_norm": 13.510947227478027,
+      "learning_rate": 0.00013128902776507172,
+      "loss": 2.5878,
+      "step": 1065
+    },
+    {
+      "epoch": 0.40363498674744414,
+      "grad_norm": 14.393485069274902,
+      "learning_rate": 0.00013117516132424517,
+      "loss": 3.1052,
+      "step": 1066
+    },
+    {
+      "epoch": 0.4040136312003029,
+      "grad_norm": 13.308830261230469,
+      "learning_rate": 0.00013106125009410978,
+      "loss": 2.3341,
+      "step": 1067
+    },
+    {
+      "epoch": 0.40439227565316166,
+      "grad_norm": 14.394597053527832,
+      "learning_rate": 0.0001309472942383216,
+      "loss": 2.546,
+      "step": 1068
+    },
+    {
+      "epoch": 0.40477092010602045,
+      "grad_norm": 15.71528434753418,
+      "learning_rate": 0.0001308332939206009,
+      "loss": 2.3768,
+      "step": 1069
+    },
+    {
+      "epoch": 0.40514956455887924,
+      "grad_norm": 14.074331283569336,
+      "learning_rate": 0.0001307192493047317,
+      "loss": 2.0017,
+      "step": 1070
+    },
+    {
+      "epoch": 0.405528209011738,
+      "grad_norm": 14.615304946899414,
+      "learning_rate": 0.00013060516055456175,
+      "loss": 1.9632,
+      "step": 1071
+    },
+    {
+      "epoch": 0.40590685346459676,
+      "grad_norm": 15.81937313079834,
+      "learning_rate": 0.00013049102783400221,
+      "loss": 1.5349,
+      "step": 1072
+    },
+    {
+      "epoch": 0.4062854979174555,
+      "grad_norm": 18.53114891052246,
+      "learning_rate": 0.00013037685130702742,
+      "loss": 2.08,
+      "step": 1073
+    },
+    {
+      "epoch": 0.4066641423703143,
+      "grad_norm": 18.639833450317383,
+      "learning_rate": 0.0001302626311376746,
+      "loss": 1.4834,
+      "step": 1074
+    },
+    {
+      "epoch": 0.407042786823173,
+      "grad_norm": 24.953411102294922,
+      "learning_rate": 0.00013014836749004367,
+      "loss": 1.6101,
+      "step": 1075
+    },
+    {
+      "epoch": 0.4074214312760318,
+      "grad_norm": 7.899003505706787,
+      "learning_rate": 0.00013003406052829706,
+      "loss": 3.2613,
+      "step": 1076
+    },
+    {
+      "epoch": 0.4078000757288906,
+      "grad_norm": 9.678568840026855,
+      "learning_rate": 0.0001299197104166595,
+      "loss": 3.4739,
+      "step": 1077
+    },
+    {
+      "epoch": 0.40817872018174933,
+      "grad_norm": 11.191865921020508,
+      "learning_rate": 0.0001298053173194175,
+      "loss": 3.6228,
+      "step": 1078
+    },
+    {
+      "epoch": 0.4085573646346081,
+      "grad_norm": 11.87701416015625,
+      "learning_rate": 0.00012969088140091955,
+      "loss": 3.041,
+      "step": 1079
+    },
+    {
+      "epoch": 0.40893600908746686,
+      "grad_norm": 12.016412734985352,
+      "learning_rate": 0.00012957640282557553,
+      "loss": 3.7958,
+      "step": 1080
+    },
+    {
+      "epoch": 0.40931465354032565,
+      "grad_norm": 12.6616792678833,
+      "learning_rate": 0.00012946188175785666,
+      "loss": 3.2154,
+      "step": 1081
+    },
+    {
+      "epoch": 0.4096932979931844,
+      "grad_norm": 12.319831848144531,
+      "learning_rate": 0.00012934731836229514,
+      "loss": 3.8766,
+      "step": 1082
+    },
+    {
+      "epoch": 0.41007194244604317,
+      "grad_norm": 12.028902053833008,
+      "learning_rate": 0.0001292327128034841,
+      "loss": 2.853,
+      "step": 1083
+    },
+    {
+      "epoch": 0.4104505868989019,
+      "grad_norm": 11.936314582824707,
+      "learning_rate": 0.00012911806524607713,
+      "loss": 3.7024,
+      "step": 1084
+    },
+    {
+      "epoch": 0.4108292313517607,
+      "grad_norm": 12.414243698120117,
+      "learning_rate": 0.00012900337585478825,
+      "loss": 3.3653,
+      "step": 1085
+    },
+    {
+      "epoch": 0.4112078758046195,
+      "grad_norm": 13.156413078308105,
+      "learning_rate": 0.0001288886447943915,
+      "loss": 3.0347,
+      "step": 1086
+    },
+    {
+      "epoch": 0.4115865202574782,
+      "grad_norm": 12.574990272521973,
+      "learning_rate": 0.00012877387222972087,
+      "loss": 2.6169,
+      "step": 1087
+    },
+    {
+      "epoch": 0.411965164710337,
+      "grad_norm": 17.557424545288086,
+      "learning_rate": 0.00012865905832566989,
+      "loss": 3.3377,
+      "step": 1088
+    },
+    {
+      "epoch": 0.41234380916319574,
+      "grad_norm": 12.320211410522461,
+      "learning_rate": 0.0001285442032471916,
+      "loss": 2.6103,
+      "step": 1089
+    },
+    {
+      "epoch": 0.41272245361605453,
+      "grad_norm": 13.786900520324707,
+      "learning_rate": 0.00012842930715929802,
+      "loss": 3.1307,
+      "step": 1090
+    },
+    {
+      "epoch": 0.41310109806891326,
+      "grad_norm": 16.15777587890625,
+      "learning_rate": 0.0001283143702270603,
+      "loss": 3.149,
+      "step": 1091
+    },
+    {
+      "epoch": 0.41347974252177205,
+      "grad_norm": 15.2261323928833,
+      "learning_rate": 0.00012819939261560806,
+      "loss": 1.8673,
+      "step": 1092
+    },
+    {
+      "epoch": 0.41385838697463084,
+      "grad_norm": 14.948053359985352,
+      "learning_rate": 0.00012808437449012957,
+      "loss": 2.8997,
+      "step": 1093
+    },
+    {
+      "epoch": 0.4142370314274896,
+      "grad_norm": 19.149866104125977,
+      "learning_rate": 0.00012796931601587113,
+      "loss": 2.147,
+      "step": 1094
+    },
+    {
+      "epoch": 0.41461567588034837,
+      "grad_norm": 17.016815185546875,
+      "learning_rate": 0.0001278542173581371,
+      "loss": 2.3585,
+      "step": 1095
+    },
+    {
+      "epoch": 0.4149943203332071,
+      "grad_norm": 16.220598220825195,
+      "learning_rate": 0.00012773907868228956,
+      "loss": 2.0916,
+      "step": 1096
+    },
+    {
+      "epoch": 0.4153729647860659,
+      "grad_norm": 17.185651779174805,
+      "learning_rate": 0.0001276239001537481,
+      "loss": 2.3026,
+      "step": 1097
+    },
+    {
+      "epoch": 0.4157516092389246,
+      "grad_norm": 15.259086608886719,
+      "learning_rate": 0.0001275086819379895,
+      "loss": 1.2933,
+      "step": 1098
+    },
+    {
+      "epoch": 0.4161302536917834,
+      "grad_norm": 24.932565689086914,
+      "learning_rate": 0.00012739342420054763,
+      "loss": 1.583,
+      "step": 1099
+    },
+    {
+      "epoch": 0.4165088981446422,
+      "grad_norm": 26.60433578491211,
+      "learning_rate": 0.0001272781271070131,
+      "loss": 2.5607,
+      "step": 1100
+    },
+    {
+      "epoch": 0.41688754259750094,
+      "grad_norm": 11.937226295471191,
+      "learning_rate": 0.00012716279082303312,
+      "loss": 4.2044,
+      "step": 1101
+    },
+    {
+      "epoch": 0.4172661870503597,
+      "grad_norm": 10.27784252166748,
+      "learning_rate": 0.0001270474155143111,
+      "loss": 4.1484,
+      "step": 1102
+    },
+    {
+      "epoch": 0.41764483150321846,
+      "grad_norm": 9.727765083312988,
+      "learning_rate": 0.00012693200134660662,
+      "loss": 3.0787,
+      "step": 1103
+    },
+    {
+      "epoch": 0.41802347595607725,
+      "grad_norm": 10.214356422424316,
+      "learning_rate": 0.00012681654848573502,
+      "loss": 2.7503,
+      "step": 1104
+    },
+    {
+      "epoch": 0.418402120408936,
+      "grad_norm": 10.071405410766602,
+      "learning_rate": 0.00012670105709756727,
+      "loss": 2.8888,
+      "step": 1105
+    },
+    {
+      "epoch": 0.4187807648617948,
+      "grad_norm": 13.693557739257812,
+      "learning_rate": 0.00012658552734802963,
+      "loss": 3.9183,
+      "step": 1106
+    },
+    {
+      "epoch": 0.41915940931465356,
+      "grad_norm": 10.267026901245117,
+      "learning_rate": 0.00012646995940310363,
+      "loss": 2.5214,
+      "step": 1107
+    },
+    {
+      "epoch": 0.4195380537675123,
+      "grad_norm": 12.434460639953613,
+      "learning_rate": 0.00012635435342882548,
+      "loss": 3.0185,
+      "step": 1108
+    },
+    {
+      "epoch": 0.4199166982203711,
+      "grad_norm": 11.598405838012695,
+      "learning_rate": 0.00012623870959128615,
+      "loss": 2.7773,
+      "step": 1109
+    },
+    {
+      "epoch": 0.4202953426732298,
+      "grad_norm": 14.913825035095215,
+      "learning_rate": 0.00012612302805663098,
+      "loss": 3.8533,
+      "step": 1110
+    },
+    {
+      "epoch": 0.4206739871260886,
+      "grad_norm": 11.085430145263672,
+      "learning_rate": 0.0001260073089910594,
+      "loss": 2.6134,
+      "step": 1111
+    },
+    {
+      "epoch": 0.42105263157894735,
+      "grad_norm": 12.33950138092041,
+      "learning_rate": 0.00012589155256082489,
+      "loss": 2.9382,
+      "step": 1112
+    },
+    {
+      "epoch": 0.42143127603180613,
+      "grad_norm": 13.180621147155762,
+      "learning_rate": 0.00012577575893223456,
+      "loss": 2.8428,
+      "step": 1113
+    },
+    {
+      "epoch": 0.4218099204846649,
+      "grad_norm": 15.379983901977539,
+      "learning_rate": 0.0001256599282716489,
+      "loss": 2.5916,
+      "step": 1114
+    },
+    {
+      "epoch": 0.42218856493752366,
+      "grad_norm": 14.148529052734375,
+      "learning_rate": 0.00012554406074548165,
+      "loss": 2.5504,
+      "step": 1115
+    },
+    {
+      "epoch": 0.42256720939038245,
+      "grad_norm": 15.524250030517578,
+      "learning_rate": 0.00012542815652019952,
+      "loss": 2.6872,
+      "step": 1116
+    },
+    {
+      "epoch": 0.4229458538432412,
+      "grad_norm": 13.896522521972656,
+      "learning_rate": 0.00012531221576232197,
+      "loss": 2.3257,
+      "step": 1117
+    },
+    {
+      "epoch": 0.42332449829609997,
+      "grad_norm": 13.984559059143066,
+      "learning_rate": 0.0001251962386384209,
+      "loss": 2.2887,
+      "step": 1118
+    },
+    {
+      "epoch": 0.4237031427489587,
+      "grad_norm": 14.945381164550781,
+      "learning_rate": 0.00012508022531512047,
+      "loss": 2.2639,
+      "step": 1119
+    },
+    {
+      "epoch": 0.4240817872018175,
+      "grad_norm": 14.590594291687012,
+      "learning_rate": 0.00012496417595909685,
+      "loss": 2.7817,
+      "step": 1120
+    },
+    {
+      "epoch": 0.4244604316546763,
+      "grad_norm": 22.159513473510742,
+      "learning_rate": 0.00012484809073707803,
+      "loss": 3.3067,
+      "step": 1121
+    },
+    {
+      "epoch": 0.424839076107535,
+      "grad_norm": 19.108047485351562,
+      "learning_rate": 0.00012473196981584338,
+      "loss": 2.6282,
+      "step": 1122
+    },
+    {
+      "epoch": 0.4252177205603938,
+      "grad_norm": 15.237470626831055,
+      "learning_rate": 0.00012461581336222378,
+      "loss": 1.917,
+      "step": 1123
+    },
+    {
+      "epoch": 0.42559636501325254,
+      "grad_norm": 13.147758483886719,
+      "learning_rate": 0.0001244996215431009,
+      "loss": 1.1269,
+      "step": 1124
+    },
+    {
+      "epoch": 0.42597500946611133,
+      "grad_norm": 30.5366268157959,
+      "learning_rate": 0.00012438339452540748,
+      "loss": 1.766,
+      "step": 1125
+    },
+    {
+      "epoch": 0.42635365391897007,
+      "grad_norm": 8.803793907165527,
+      "learning_rate": 0.00012426713247612665,
+      "loss": 3.8758,
+      "step": 1126
+    },
+    {
+      "epoch": 0.42673229837182886,
+      "grad_norm": 10.560848236083984,
+      "learning_rate": 0.00012415083556229192,
+      "loss": 3.5995,
+      "step": 1127
+    },
+    {
+      "epoch": 0.42711094282468764,
+      "grad_norm": 11.299087524414062,
+      "learning_rate": 0.00012403450395098695,
+      "loss": 4.2221,
+      "step": 1128
+    },
+    {
+      "epoch": 0.4274895872775464,
+      "grad_norm": 11.33618450164795,
+      "learning_rate": 0.00012391813780934514,
+      "loss": 4.1682,
+      "step": 1129
+    },
+    {
+      "epoch": 0.42786823173040517,
+      "grad_norm": 10.318195343017578,
+      "learning_rate": 0.00012380173730454957,
+      "loss": 3.3889,
+      "step": 1130
+    },
+    {
+      "epoch": 0.4282468761832639,
+      "grad_norm": 11.54907512664795,
+      "learning_rate": 0.00012368530260383268,
+      "loss": 2.8639,
+      "step": 1131
+    },
+    {
+      "epoch": 0.4286255206361227,
+      "grad_norm": 11.327589988708496,
+      "learning_rate": 0.00012356883387447601,
+      "loss": 2.3551,
+      "step": 1132
+    },
+    {
+      "epoch": 0.4290041650889814,
+      "grad_norm": 12.675344467163086,
+      "learning_rate": 0.00012345233128381006,
+      "loss": 3.7048,
+      "step": 1133
+    },
+    {
+      "epoch": 0.4293828095418402,
+      "grad_norm": 10.90146255493164,
+      "learning_rate": 0.00012333579499921392,
+      "loss": 3.0984,
+      "step": 1134
+    },
+    {
+      "epoch": 0.429761453994699,
+      "grad_norm": 13.599529266357422,
+      "learning_rate": 0.00012321922518811508,
+      "loss": 2.9593,
+      "step": 1135
+    },
+    {
+      "epoch": 0.43014009844755774,
+      "grad_norm": 12.997097969055176,
+      "learning_rate": 0.00012310262201798924,
+      "loss": 3.048,
+      "step": 1136
+    },
+    {
+      "epoch": 0.43051874290041653,
+      "grad_norm": 13.863821029663086,
+      "learning_rate": 0.00012298598565636,
+      "loss": 2.9528,
+      "step": 1137
+    },
+    {
+      "epoch": 0.43089738735327526,
+      "grad_norm": 14.177045822143555,
+      "learning_rate": 0.00012286931627079862,
+      "loss": 2.5402,
+      "step": 1138
+    },
+    {
+      "epoch": 0.43127603180613405,
+      "grad_norm": 14.45673942565918,
+      "learning_rate": 0.00012275261402892388,
+      "loss": 2.1941,
+      "step": 1139
+    },
+    {
+      "epoch": 0.4316546762589928,
+      "grad_norm": 16.615707397460938,
+      "learning_rate": 0.0001226358790984017,
+      "loss": 2.7464,
+      "step": 1140
+    },
+    {
+      "epoch": 0.4320333207118516,
+      "grad_norm": 13.864429473876953,
+      "learning_rate": 0.000122519111646945,
+      "loss": 2.384,
+      "step": 1141
+    },
+    {
+      "epoch": 0.4324119651647103,
+      "grad_norm": 15.059038162231445,
+      "learning_rate": 0.00012240231184231336,
+      "loss": 1.735,
+      "step": 1142
+    },
+    {
+      "epoch": 0.4327906096175691,
+      "grad_norm": 15.821595191955566,
+      "learning_rate": 0.00012228547985231297,
+      "loss": 2.953,
+      "step": 1143
+    },
+    {
+      "epoch": 0.4331692540704279,
+      "grad_norm": 13.79995346069336,
+      "learning_rate": 0.00012216861584479608,
+      "loss": 2.3279,
+      "step": 1144
+    },
+    {
+      "epoch": 0.4335478985232866,
+      "grad_norm": 11.45645523071289,
+      "learning_rate": 0.00012205171998766114,
+      "loss": 1.7425,
+      "step": 1145
+    },
+    {
+      "epoch": 0.4339265429761454,
+      "grad_norm": 15.549623489379883,
+      "learning_rate": 0.00012193479244885217,
+      "loss": 2.452,
+      "step": 1146
+    },
+    {
+      "epoch": 0.43430518742900415,
+      "grad_norm": 14.682928085327148,
+      "learning_rate": 0.00012181783339635888,
+      "loss": 2.1395,
+      "step": 1147
+    },
+    {
+      "epoch": 0.43468383188186294,
+      "grad_norm": 19.542850494384766,
+      "learning_rate": 0.00012170084299821609,
+      "loss": 2.4162,
+      "step": 1148
+    },
+    {
+      "epoch": 0.43506247633472167,
+      "grad_norm": 15.998048782348633,
+      "learning_rate": 0.00012158382142250379,
+      "loss": 1.5397,
+      "step": 1149
+    },
+    {
+      "epoch": 0.43544112078758046,
+      "grad_norm": 37.20795822143555,
+      "learning_rate": 0.00012146676883734671,
+      "loss": 3.4346,
+      "step": 1150
+    },
+    {
+      "epoch": 0.43581976524043925,
+      "grad_norm": 8.654630661010742,
+      "learning_rate": 0.00012134968541091405,
+      "loss": 4.2973,
+      "step": 1151
+    },
+    {
+      "epoch": 0.436198409693298,
+      "grad_norm": 9.75950813293457,
+      "learning_rate": 0.0001212325713114195,
+      "loss": 3.3641,
+      "step": 1152
+    },
+    {
+      "epoch": 0.4365770541461568,
+      "grad_norm": 9.88634204864502,
+      "learning_rate": 0.00012111542670712066,
+      "loss": 3.6815,
+      "step": 1153
+    },
+    {
+      "epoch": 0.4369556985990155,
+      "grad_norm": 12.256867408752441,
+      "learning_rate": 0.00012099825176631902,
+      "loss": 3.2275,
+      "step": 1154
+    },
+    {
+      "epoch": 0.4373343430518743,
+      "grad_norm": 12.367258071899414,
+      "learning_rate": 0.00012088104665735964,
+      "loss": 2.9504,
+      "step": 1155
+    },
+    {
+      "epoch": 0.43771298750473303,
+      "grad_norm": 13.042316436767578,
+      "learning_rate": 0.00012076381154863095,
+      "loss": 3.0564,
+      "step": 1156
+    },
+    {
+      "epoch": 0.4380916319575918,
+      "grad_norm": 11.0169677734375,
+      "learning_rate": 0.00012064654660856445,
+      "loss": 3.4256,
+      "step": 1157
+    },
+    {
+      "epoch": 0.4384702764104506,
+      "grad_norm": 11.372369766235352,
+      "learning_rate": 0.0001205292520056345,
+      "loss": 3.5504,
+      "step": 1158
+    },
+    {
+      "epoch": 0.43884892086330934,
+      "grad_norm": 10.504295349121094,
+      "learning_rate": 0.00012041192790835811,
+      "loss": 2.7411,
+      "step": 1159
+    },
+    {
+      "epoch": 0.43922756531616813,
+      "grad_norm": 13.477766036987305,
+      "learning_rate": 0.00012029457448529459,
+      "loss": 2.9257,
+      "step": 1160
+    },
+    {
+      "epoch": 0.43960620976902687,
+      "grad_norm": 12.110424041748047,
+      "learning_rate": 0.00012017719190504551,
+      "loss": 2.8799,
+      "step": 1161
+    },
+    {
+      "epoch": 0.43998485422188566,
+      "grad_norm": 13.188323020935059,
+      "learning_rate": 0.00012005978033625416,
+      "loss": 2.5087,
+      "step": 1162
+    },
+    {
+      "epoch": 0.4403634986747444,
+      "grad_norm": 11.588294982910156,
+      "learning_rate": 0.00011994233994760567,
+      "loss": 2.5272,
+      "step": 1163
+    },
+    {
+      "epoch": 0.4407421431276032,
+      "grad_norm": 15.151694297790527,
+      "learning_rate": 0.00011982487090782638,
+      "loss": 2.7985,
+      "step": 1164
+    },
+    {
+      "epoch": 0.44112078758046197,
+      "grad_norm": 14.004260063171387,
+      "learning_rate": 0.00011970737338568394,
+      "loss": 2.7696,
+      "step": 1165
+    },
+    {
+      "epoch": 0.4414994320333207,
+      "grad_norm": 14.581443786621094,
+      "learning_rate": 0.00011958984754998685,
+      "loss": 2.2614,
+      "step": 1166
+    },
+    {
+      "epoch": 0.4418780764861795,
+      "grad_norm": 12.546298027038574,
+      "learning_rate": 0.00011947229356958434,
+      "loss": 2.3896,
+      "step": 1167
+    },
+    {
+      "epoch": 0.44225672093903823,
+      "grad_norm": 14.990707397460938,
+      "learning_rate": 0.000119354711613366,
+      "loss": 3.1594,
+      "step": 1168
+    },
+    {
+      "epoch": 0.442635365391897,
+      "grad_norm": 14.658981323242188,
+      "learning_rate": 0.00011923710185026169,
+      "loss": 2.4297,
+      "step": 1169
+    },
+    {
+      "epoch": 0.44301400984475575,
+      "grad_norm": 13.724644660949707,
+      "learning_rate": 0.00011911946444924116,
+      "loss": 1.5228,
+      "step": 1170
+    },
+    {
+      "epoch": 0.44339265429761454,
+      "grad_norm": 19.209369659423828,
+      "learning_rate": 0.0001190017995793139,
+      "loss": 3.4329,
+      "step": 1171
+    },
+    {
+      "epoch": 0.44377129875047333,
+      "grad_norm": 21.529495239257812,
+      "learning_rate": 0.00011888410740952887,
+      "loss": 2.5655,
+      "step": 1172
+    },
+    {
+      "epoch": 0.44414994320333206,
+      "grad_norm": 24.351722717285156,
+      "learning_rate": 0.00011876638810897422,
+      "loss": 2.6329,
+      "step": 1173
+    },
+    {
+      "epoch": 0.44452858765619085,
+      "grad_norm": 15.183594703674316,
+      "learning_rate": 0.00011864864184677711,
+      "loss": 0.8859,
+      "step": 1174
+    },
+    {
+      "epoch": 0.4449072321090496,
+      "grad_norm": 13.775147438049316,
+      "learning_rate": 0.00011853086879210342,
+      "loss": 1.3488,
+      "step": 1175
+    },
+    {
+      "epoch": 0.4452858765619084,
+      "grad_norm": 8.975238800048828,
+      "learning_rate": 0.00011841306911415753,
+      "loss": 3.21,
+      "step": 1176
+    },
+    {
+      "epoch": 0.4456645210147671,
+      "grad_norm": 11.082070350646973,
+      "learning_rate": 0.00011829524298218207,
+      "loss": 4.19,
+      "step": 1177
+    },
+    {
+      "epoch": 0.4460431654676259,
+      "grad_norm": 10.536282539367676,
+      "learning_rate": 0.00011817739056545762,
+      "loss": 3.5267,
+      "step": 1178
+    },
+    {
+      "epoch": 0.4464218099204847,
+      "grad_norm": 10.50727367401123,
+      "learning_rate": 0.00011805951203330266,
+      "loss": 3.3532,
+      "step": 1179
+    },
+    {
+      "epoch": 0.4468004543733434,
+      "grad_norm": 10.488901138305664,
+      "learning_rate": 0.00011794160755507304,
+      "loss": 2.9757,
+      "step": 1180
+    },
+    {
+      "epoch": 0.4471790988262022,
+      "grad_norm": 12.007133483886719,
+      "learning_rate": 0.000117823677300162,
+      "loss": 3.0183,
+      "step": 1181
+    },
+    {
+      "epoch": 0.44755774327906095,
+      "grad_norm": 12.38204574584961,
+      "learning_rate": 0.00011770572143799971,
+      "loss": 3.0908,
+      "step": 1182
+    },
+    {
+      "epoch": 0.44793638773191974,
+      "grad_norm": 12.608494758605957,
+      "learning_rate": 0.00011758774013805325,
+      "loss": 3.0191,
+      "step": 1183
+    },
+    {
+      "epoch": 0.4483150321847785,
+      "grad_norm": 10.949199676513672,
+      "learning_rate": 0.00011746973356982614,
+      "loss": 2.5306,
+      "step": 1184
+    },
+    {
+      "epoch": 0.44869367663763726,
+      "grad_norm": 12.805669784545898,
+      "learning_rate": 0.00011735170190285825,
+      "loss": 3.2759,
+      "step": 1185
+    },
+    {
+      "epoch": 0.44907232109049605,
+      "grad_norm": 12.965691566467285,
+      "learning_rate": 0.00011723364530672549,
+      "loss": 3.0626,
+      "step": 1186
+    },
+    {
+      "epoch": 0.4494509655433548,
+      "grad_norm": 11.967156410217285,
+      "learning_rate": 0.00011711556395103964,
+      "loss": 2.4325,
+      "step": 1187
+    },
+    {
+      "epoch": 0.4498296099962136,
+      "grad_norm": 13.925737380981445,
+      "learning_rate": 0.00011699745800544798,
+      "loss": 2.8316,
+      "step": 1188
+    },
+    {
+      "epoch": 0.4502082544490723,
+      "grad_norm": 13.926861763000488,
+      "learning_rate": 0.00011687932763963319,
+      "loss": 3.4606,
+      "step": 1189
+    },
+    {
+      "epoch": 0.4505868989019311,
+      "grad_norm": 13.918458938598633,
+      "learning_rate": 0.00011676117302331291,
+      "loss": 2.5946,
+      "step": 1190
+    },
+    {
+      "epoch": 0.45096554335478983,
+      "grad_norm": 16.527910232543945,
+      "learning_rate": 0.00011664299432623979,
+      "loss": 2.2876,
+      "step": 1191
+    },
+    {
+      "epoch": 0.4513441878076486,
+      "grad_norm": 14.137311935424805,
+      "learning_rate": 0.00011652479171820097,
+      "loss": 2.9587,
+      "step": 1192
+    },
+    {
+      "epoch": 0.45172283226050736,
+      "grad_norm": 17.192485809326172,
+      "learning_rate": 0.00011640656536901796,
+      "loss": 1.5583,
+      "step": 1193
+    },
+    {
+      "epoch": 0.45210147671336615,
+      "grad_norm": 14.512371063232422,
+      "learning_rate": 0.00011628831544854635,
+      "loss": 2.3428,
+      "step": 1194
+    },
+    {
+      "epoch": 0.45248012116622494,
+      "grad_norm": 16.016895294189453,
+      "learning_rate": 0.00011617004212667566,
+      "loss": 2.4906,
+      "step": 1195
+    },
+    {
+      "epoch": 0.45285876561908367,
+      "grad_norm": 13.380924224853516,
+      "learning_rate": 0.000116051745573329,
+      "loss": 1.8266,
+      "step": 1196
+    },
+    {
+      "epoch": 0.45323741007194246,
+      "grad_norm": 12.72845458984375,
+      "learning_rate": 0.00011593342595846288,
+      "loss": 1.166,
+      "step": 1197
+    },
+    {
+      "epoch": 0.4536160545248012,
+      "grad_norm": 14.16887092590332,
+      "learning_rate": 0.00011581508345206689,
+      "loss": 1.3564,
+      "step": 1198
+    },
+    {
+      "epoch": 0.45399469897766,
+      "grad_norm": 28.907073974609375,
+      "learning_rate": 0.0001156967182241635,
+      "loss": 1.5071,
+      "step": 1199
+    },
+    {
+      "epoch": 0.4543733434305187,
+      "grad_norm": 17.37041473388672,
+      "learning_rate": 0.00011557833044480792,
+      "loss": 1.1685,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4547519878833775,
+      "grad_norm": 10.693912506103516,
+      "learning_rate": 0.0001154599202840877,
+      "loss": 3.2915,
+      "step": 1201
+    },
+    {
+      "epoch": 0.4551306323362363,
+      "grad_norm": 13.119062423706055,
+      "learning_rate": 0.0001153414879121225,
+      "loss": 4.6147,
+      "step": 1202
+    },
+    {
+      "epoch": 0.45550927678909503,
+      "grad_norm": 11.448525428771973,
+      "learning_rate": 0.00011522303349906399,
+      "loss": 2.79,
+      "step": 1203
+    },
+    {
+      "epoch": 0.4558879212419538,
+      "grad_norm": 11.742964744567871,
+      "learning_rate": 0.00011510455721509537,
+      "loss": 3.2349,
+      "step": 1204
+    },
+    {
+      "epoch": 0.45626656569481255,
+      "grad_norm": 10.76633358001709,
+      "learning_rate": 0.00011498605923043145,
+      "loss": 3.0203,
+      "step": 1205
+    },
+    {
+      "epoch": 0.45664521014767134,
+      "grad_norm": 11.407468795776367,
+      "learning_rate": 0.00011486753971531801,
+      "loss": 3.6872,
+      "step": 1206
+    },
+    {
+      "epoch": 0.4570238546005301,
+      "grad_norm": 11.357184410095215,
+      "learning_rate": 0.00011474899884003196,
+      "loss": 2.7635,
+      "step": 1207
+    },
+    {
+      "epoch": 0.45740249905338887,
+      "grad_norm": 12.275900840759277,
+      "learning_rate": 0.00011463043677488073,
+      "loss": 2.7735,
+      "step": 1208
+    },
+    {
+      "epoch": 0.45778114350624766,
+      "grad_norm": 12.097725868225098,
+      "learning_rate": 0.0001145118536902023,
+      "loss": 2.7413,
+      "step": 1209
+    },
+    {
+      "epoch": 0.4581597879591064,
+      "grad_norm": 10.203941345214844,
+      "learning_rate": 0.0001143932497563648,
+      "loss": 2.3056,
+      "step": 1210
+    },
+    {
+      "epoch": 0.4585384324119652,
+      "grad_norm": 12.463147163391113,
+      "learning_rate": 0.00011427462514376637,
+      "loss": 3.1588,
+      "step": 1211
+    },
+    {
+      "epoch": 0.4589170768648239,
+      "grad_norm": 10.687355041503906,
+      "learning_rate": 0.00011415598002283474,
+      "loss": 1.4561,
+      "step": 1212
+    },
+    {
+      "epoch": 0.4592957213176827,
+      "grad_norm": 13.218606948852539,
+      "learning_rate": 0.00011403731456402727,
+      "loss": 2.156,
+      "step": 1213
+    },
+    {
+      "epoch": 0.45967436577054144,
+      "grad_norm": 15.726714134216309,
+      "learning_rate": 0.00011391862893783038,
+      "loss": 2.621,
+      "step": 1214
+    },
+    {
+      "epoch": 0.4600530102234002,
+      "grad_norm": 15.450735092163086,
+      "learning_rate": 0.0001137999233147596,
+      "loss": 2.6854,
+      "step": 1215
+    },
+    {
+      "epoch": 0.460431654676259,
+      "grad_norm": 14.271288871765137,
+      "learning_rate": 0.00011368119786535906,
+      "loss": 2.3983,
+      "step": 1216
+    },
+    {
+      "epoch": 0.46081029912911775,
+      "grad_norm": 16.259143829345703,
+      "learning_rate": 0.0001135624527602015,
+      "loss": 3.0149,
+      "step": 1217
+    },
+    {
+      "epoch": 0.46118894358197654,
+      "grad_norm": 21.305139541625977,
+      "learning_rate": 0.00011344368816988779,
+      "loss": 2.5145,
+      "step": 1218
+    },
+    {
+      "epoch": 0.4615675880348353,
+      "grad_norm": 18.001358032226562,
+      "learning_rate": 0.00011332490426504688,
+      "loss": 2.6175,
+      "step": 1219
+    },
+    {
+      "epoch": 0.46194623248769406,
+      "grad_norm": 15.817441940307617,
+      "learning_rate": 0.00011320610121633542,
+      "loss": 2.0215,
+      "step": 1220
+    },
+    {
+      "epoch": 0.4623248769405528,
+      "grad_norm": 18.465803146362305,
+      "learning_rate": 0.00011308727919443756,
+      "loss": 2.2702,
+      "step": 1221
+    },
+    {
+      "epoch": 0.4627035213934116,
+      "grad_norm": 15.902999877929688,
+      "learning_rate": 0.00011296843837006477,
+      "loss": 2.0862,
+      "step": 1222
+    },
+    {
+      "epoch": 0.4630821658462704,
+      "grad_norm": 18.18279457092285,
+      "learning_rate": 0.00011284957891395545,
+      "loss": 1.6971,
+      "step": 1223
+    },
+    {
+      "epoch": 0.4634608102991291,
+      "grad_norm": 20.656322479248047,
+      "learning_rate": 0.00011273070099687482,
+      "loss": 1.8615,
+      "step": 1224
+    },
+    {
+      "epoch": 0.4638394547519879,
+      "grad_norm": 37.89259719848633,
+      "learning_rate": 0.0001126118047896146,
+      "loss": 2.1817,
+      "step": 1225
+    },
+    {
+      "epoch": 0.46421809920484663,
+      "grad_norm": 8.783308982849121,
+      "learning_rate": 0.0001124928904629928,
+      "loss": 3.3508,
+      "step": 1226
+    },
+    {
+      "epoch": 0.4645967436577054,
+      "grad_norm": 12.971296310424805,
+      "learning_rate": 0.0001123739581878535,
+      "loss": 3.7262,
+      "step": 1227
+    },
+    {
+      "epoch": 0.46497538811056416,
+      "grad_norm": 10.869105339050293,
+      "learning_rate": 0.00011225500813506645,
+      "loss": 3.2334,
+      "step": 1228
+    },
+    {
+      "epoch": 0.46535403256342295,
+      "grad_norm": 11.33836555480957,
+      "learning_rate": 0.00011213604047552708,
+      "loss": 3.5119,
+      "step": 1229
+    },
+    {
+      "epoch": 0.46573267701628174,
+      "grad_norm": 10.899227142333984,
+      "learning_rate": 0.00011201705538015604,
+      "loss": 3.5351,
+      "step": 1230
+    },
+    {
+      "epoch": 0.46611132146914047,
+      "grad_norm": 11.528409957885742,
+      "learning_rate": 0.00011189805301989904,
+      "loss": 3.1705,
+      "step": 1231
+    },
+    {
+      "epoch": 0.46648996592199926,
+      "grad_norm": 10.381014823913574,
+      "learning_rate": 0.00011177903356572659,
+      "loss": 1.9777,
+      "step": 1232
+    },
+    {
+      "epoch": 0.466868610374858,
+      "grad_norm": 11.280335426330566,
+      "learning_rate": 0.00011165999718863379,
+      "loss": 2.5228,
+      "step": 1233
+    },
+    {
+      "epoch": 0.4672472548277168,
+      "grad_norm": 14.46865177154541,
+      "learning_rate": 0.00011154094405963996,
+      "loss": 2.5568,
+      "step": 1234
+    },
+    {
+      "epoch": 0.4676258992805755,
+      "grad_norm": 13.52888011932373,
+      "learning_rate": 0.00011142187434978866,
+      "loss": 3.2911,
+      "step": 1235
+    },
+    {
+      "epoch": 0.4680045437334343,
+      "grad_norm": 11.23714828491211,
+      "learning_rate": 0.00011130278823014709,
+      "loss": 2.2005,
+      "step": 1236
+    },
+    {
+      "epoch": 0.4683831881862931,
+      "grad_norm": 12.224804878234863,
+      "learning_rate": 0.00011118368587180614,
+      "loss": 2.2755,
+      "step": 1237
+    },
+    {
+      "epoch": 0.46876183263915183,
+      "grad_norm": 12.343790054321289,
+      "learning_rate": 0.00011106456744587996,
+      "loss": 2.8197,
+      "step": 1238
+    },
+    {
+      "epoch": 0.4691404770920106,
+      "grad_norm": 13.172083854675293,
+      "learning_rate": 0.0001109454331235059,
+      "loss": 2.586,
+      "step": 1239
+    },
+    {
+      "epoch": 0.46951912154486936,
+      "grad_norm": 12.991609573364258,
+      "learning_rate": 0.00011082628307584397,
+      "loss": 2.0318,
+      "step": 1240
+    },
+    {
+      "epoch": 0.46989776599772815,
+      "grad_norm": 13.485008239746094,
+      "learning_rate": 0.00011070711747407694,
+      "loss": 2.2734,
+      "step": 1241
+    },
+    {
+      "epoch": 0.4702764104505869,
+      "grad_norm": 19.911563873291016,
+      "learning_rate": 0.0001105879364894098,
+      "loss": 2.9116,
+      "step": 1242
+    },
+    {
+      "epoch": 0.47065505490344567,
+      "grad_norm": 14.824417114257812,
+      "learning_rate": 0.00011046874029306975,
+      "loss": 2.0742,
+      "step": 1243
+    },
+    {
+      "epoch": 0.4710336993563044,
+      "grad_norm": 17.6142578125,
+      "learning_rate": 0.00011034952905630576,
+      "loss": 2.6475,
+      "step": 1244
+    },
+    {
+      "epoch": 0.4714123438091632,
+      "grad_norm": 13.6873140335083,
+      "learning_rate": 0.00011023030295038846,
+      "loss": 2.1793,
+      "step": 1245
+    },
+    {
+      "epoch": 0.471790988262022,
+      "grad_norm": 15.636033058166504,
+      "learning_rate": 0.0001101110621466098,
+      "loss": 1.6981,
+      "step": 1246
+    },
+    {
+      "epoch": 0.4721696327148807,
+      "grad_norm": 17.11579132080078,
+      "learning_rate": 0.00010999180681628288,
+      "loss": 1.6256,
+      "step": 1247
+    },
+    {
+      "epoch": 0.4725482771677395,
+      "grad_norm": 20.186901092529297,
+      "learning_rate": 0.00010987253713074165,
+      "loss": 2.4091,
+      "step": 1248
+    },
+    {
+      "epoch": 0.47292692162059824,
+      "grad_norm": 15.602944374084473,
+      "learning_rate": 0.00010975325326134071,
+      "loss": 1.8002,
+      "step": 1249
+    },
+    {
+      "epoch": 0.47330556607345703,
+      "grad_norm": 23.223661422729492,
+      "learning_rate": 0.00010963395537945502,
+      "loss": 2.0938,
+      "step": 1250
+    },
+    {
+      "epoch": 0.47368421052631576,
+      "grad_norm": 10.464275360107422,
+      "learning_rate": 0.00010951464365647967,
+      "loss": 4.1863,
+      "step": 1251
+    },
+    {
+      "epoch": 0.47406285497917455,
+      "grad_norm": 10.853160858154297,
+      "learning_rate": 0.00010939531826382963,
+      "loss": 3.6832,
+      "step": 1252
+    },
+    {
+      "epoch": 0.47444149943203334,
+      "grad_norm": 12.23708724975586,
+      "learning_rate": 0.00010927597937293952,
+      "loss": 3.7507,
+      "step": 1253
+    },
+    {
+      "epoch": 0.4748201438848921,
+      "grad_norm": 12.157914161682129,
+      "learning_rate": 0.00010915662715526336,
+      "loss": 2.7929,
+      "step": 1254
+    },
+    {
+      "epoch": 0.47519878833775087,
+      "grad_norm": 14.618999481201172,
+      "learning_rate": 0.00010903726178227432,
+      "loss": 3.9901,
+      "step": 1255
+    },
+    {
+      "epoch": 0.4755774327906096,
+      "grad_norm": 11.460221290588379,
+      "learning_rate": 0.0001089178834254644,
+      "loss": 3.0165,
+      "step": 1256
+    },
+    {
+      "epoch": 0.4759560772434684,
+      "grad_norm": 11.18032455444336,
+      "learning_rate": 0.00010879849225634438,
+      "loss": 1.9716,
+      "step": 1257
+    },
+    {
+      "epoch": 0.4763347216963271,
+      "grad_norm": 11.510719299316406,
+      "learning_rate": 0.00010867908844644335,
+      "loss": 1.7553,
+      "step": 1258
+    },
+    {
+      "epoch": 0.4767133661491859,
+      "grad_norm": 10.82070255279541,
+      "learning_rate": 0.00010855967216730858,
+      "loss": 2.6911,
+      "step": 1259
+    },
+    {
+      "epoch": 0.4770920106020447,
+      "grad_norm": 13.530522346496582,
+      "learning_rate": 0.00010844024359050527,
+      "loss": 2.8952,
+      "step": 1260
+    },
+    {
+      "epoch": 0.47747065505490344,
+      "grad_norm": 10.605006217956543,
+      "learning_rate": 0.0001083208028876163,
+      "loss": 2.2925,
+      "step": 1261
+    },
+    {
+      "epoch": 0.4778492995077622,
+      "grad_norm": 12.863495826721191,
+      "learning_rate": 0.00010820135023024192,
+      "loss": 2.3114,
+      "step": 1262
+    },
+    {
+      "epoch": 0.47822794396062096,
+      "grad_norm": 16.1364688873291,
+      "learning_rate": 0.00010808188578999963,
+      "loss": 3.0539,
+      "step": 1263
+    },
+    {
+      "epoch": 0.47860658841347975,
+      "grad_norm": 12.478103637695312,
+      "learning_rate": 0.00010796240973852376,
+      "loss": 2.0726,
+      "step": 1264
+    },
+    {
+      "epoch": 0.4789852328663385,
+      "grad_norm": 13.423611640930176,
+      "learning_rate": 0.00010784292224746546,
+      "loss": 2.8393,
+      "step": 1265
+    },
+    {
+      "epoch": 0.4793638773191973,
+      "grad_norm": 14.295774459838867,
+      "learning_rate": 0.00010772342348849216,
+      "loss": 2.7654,
+      "step": 1266
+    },
+    {
+      "epoch": 0.47974252177205606,
+      "grad_norm": 15.330755233764648,
+      "learning_rate": 0.00010760391363328762,
+      "loss": 1.9282,
+      "step": 1267
+    },
+    {
+      "epoch": 0.4801211662249148,
+      "grad_norm": 19.332740783691406,
+      "learning_rate": 0.00010748439285355138,
+      "loss": 1.8195,
+      "step": 1268
+    },
+    {
+      "epoch": 0.4804998106777736,
+      "grad_norm": 16.43891143798828,
+      "learning_rate": 0.00010736486132099888,
+      "loss": 2.0598,
+      "step": 1269
+    },
+    {
+      "epoch": 0.4808784551306323,
+      "grad_norm": 12.18430233001709,
+      "learning_rate": 0.00010724531920736086,
+      "loss": 0.99,
+      "step": 1270
+    },
+    {
+      "epoch": 0.4812570995834911,
+      "grad_norm": 15.264763832092285,
+      "learning_rate": 0.00010712576668438323,
+      "loss": 1.8075,
+      "step": 1271
+    },
+    {
+      "epoch": 0.48163574403634984,
+      "grad_norm": 21.91768455505371,
+      "learning_rate": 0.00010700620392382701,
+      "loss": 2.6154,
+      "step": 1272
+    },
+    {
+      "epoch": 0.48201438848920863,
+      "grad_norm": 16.14089012145996,
+      "learning_rate": 0.00010688663109746784,
+      "loss": 1.5317,
+      "step": 1273
+    },
+    {
+      "epoch": 0.4823930329420674,
+      "grad_norm": 32.41860580444336,
+      "learning_rate": 0.00010676704837709576,
+      "loss": 1.8389,
+      "step": 1274
+    },
+    {
+      "epoch": 0.48277167739492616,
+      "grad_norm": 23.59526252746582,
+      "learning_rate": 0.00010664745593451516,
+      "loss": 1.1361,
+      "step": 1275
+    },
+    {
+      "epoch": 0.48315032184778495,
+      "grad_norm": 10.691109657287598,
+      "learning_rate": 0.00010652785394154427,
+      "loss": 3.2863,
+      "step": 1276
+    },
+    {
+      "epoch": 0.4835289663006437,
+      "grad_norm": 12.289042472839355,
+      "learning_rate": 0.00010640824257001516,
+      "loss": 4.0967,
+      "step": 1277
+    },
+    {
+      "epoch": 0.48390761075350247,
+      "grad_norm": 10.609498023986816,
+      "learning_rate": 0.00010628862199177327,
+      "loss": 2.915,
+      "step": 1278
+    },
+    {
+      "epoch": 0.4842862552063612,
+      "grad_norm": 13.162012100219727,
+      "learning_rate": 0.00010616899237867733,
+      "loss": 3.3384,
+      "step": 1279
+    },
+    {
+      "epoch": 0.48466489965922,
+      "grad_norm": 12.458738327026367,
+      "learning_rate": 0.000106049353902599,
+      "loss": 2.8678,
+      "step": 1280
+    },
+    {
+      "epoch": 0.4850435441120788,
+      "grad_norm": 12.008556365966797,
+      "learning_rate": 0.00010592970673542277,
+      "loss": 2.9199,
+      "step": 1281
+    },
+    {
+      "epoch": 0.4854221885649375,
+      "grad_norm": 10.63491153717041,
+      "learning_rate": 0.00010581005104904549,
+      "loss": 2.4852,
+      "step": 1282
+    },
+    {
+      "epoch": 0.4858008330177963,
+      "grad_norm": 10.767313957214355,
+      "learning_rate": 0.00010569038701537633,
+      "loss": 3.4581,
+      "step": 1283
+    },
+    {
+      "epoch": 0.48617947747065504,
+      "grad_norm": 12.88519287109375,
+      "learning_rate": 0.00010557071480633643,
+      "loss": 3.5616,
+      "step": 1284
+    },
+    {
+      "epoch": 0.48655812192351383,
+      "grad_norm": 12.250274658203125,
+      "learning_rate": 0.00010545103459385868,
+      "loss": 2.8215,
+      "step": 1285
+    },
+    {
+      "epoch": 0.48693676637637257,
+      "grad_norm": 12.7329683303833,
+      "learning_rate": 0.00010533134654988746,
+      "loss": 3.5789,
+      "step": 1286
+    },
+    {
+      "epoch": 0.48731541082923135,
+      "grad_norm": 12.87328815460205,
+      "learning_rate": 0.00010521165084637843,
+      "loss": 2.854,
+      "step": 1287
+    },
+    {
+      "epoch": 0.48769405528209014,
+      "grad_norm": 11.388814926147461,
+      "learning_rate": 0.00010509194765529821,
+      "loss": 2.0008,
+      "step": 1288
+    },
+    {
+      "epoch": 0.4880726997349489,
+      "grad_norm": 12.551799774169922,
+      "learning_rate": 0.00010497223714862424,
+      "loss": 2.4604,
+      "step": 1289
+    },
+    {
+      "epoch": 0.48845134418780767,
+      "grad_norm": 10.640294075012207,
+      "learning_rate": 0.00010485251949834436,
+      "loss": 1.6856,
+      "step": 1290
+    },
+    {
+      "epoch": 0.4888299886406664,
+      "grad_norm": 13.196956634521484,
+      "learning_rate": 0.0001047327948764568,
+      "loss": 2.2765,
+      "step": 1291
+    },
+    {
+      "epoch": 0.4892086330935252,
+      "grad_norm": 17.06575584411621,
+      "learning_rate": 0.00010461306345496972,
+      "loss": 2.8379,
+      "step": 1292
+    },
+    {
+      "epoch": 0.4895872775463839,
+      "grad_norm": 17.766448974609375,
+      "learning_rate": 0.00010449332540590114,
+      "loss": 1.885,
+      "step": 1293
+    },
+    {
+      "epoch": 0.4899659219992427,
+      "grad_norm": 12.942706108093262,
+      "learning_rate": 0.00010437358090127847,
+      "loss": 1.6903,
+      "step": 1294
+    },
+    {
+      "epoch": 0.49034456645210145,
+      "grad_norm": 16.92314910888672,
+      "learning_rate": 0.00010425383011313844,
+      "loss": 2.4453,
+      "step": 1295
+    },
+    {
+      "epoch": 0.49072321090496024,
+      "grad_norm": 17.436086654663086,
+      "learning_rate": 0.00010413407321352695,
+      "loss": 1.9032,
+      "step": 1296
+    },
+    {
+      "epoch": 0.49110185535781903,
+      "grad_norm": 18.94797706604004,
+      "learning_rate": 0.00010401431037449847,
+      "loss": 2.0191,
+      "step": 1297
+    },
+    {
+      "epoch": 0.49148049981067776,
+      "grad_norm": 15.610849380493164,
+      "learning_rate": 0.0001038945417681161,
+      "loss": 1.19,
+      "step": 1298
+    },
+    {
+      "epoch": 0.49185914426353655,
+      "grad_norm": 16.951602935791016,
+      "learning_rate": 0.00010377476756645128,
+      "loss": 1.4745,
+      "step": 1299
+    },
+    {
+      "epoch": 0.4922377887163953,
+      "grad_norm": 45.024925231933594,
+      "learning_rate": 0.00010365498794158337,
+      "loss": 3.5771,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4926164331692541,
+      "grad_norm": 10.677453994750977,
+      "learning_rate": 0.00010353520306559963,
+      "loss": 3.5375,
+      "step": 1301
+    },
+    {
+      "epoch": 0.4929950776221128,
+      "grad_norm": 12.31181812286377,
+      "learning_rate": 0.00010341541311059478,
+      "loss": 3.5221,
+      "step": 1302
+    },
+    {
+      "epoch": 0.4933737220749716,
+      "grad_norm": 11.114928245544434,
+      "learning_rate": 0.00010329561824867089,
+      "loss": 2.9916,
+      "step": 1303
+    },
+    {
+      "epoch": 0.4937523665278304,
+      "grad_norm": 14.953704833984375,
+      "learning_rate": 0.00010317581865193704,
+      "loss": 2.4552,
+      "step": 1304
+    },
+    {
+      "epoch": 0.4941310109806891,
+      "grad_norm": 11.37937068939209,
+      "learning_rate": 0.00010305601449250919,
+      "loss": 2.9803,
+      "step": 1305
+    },
+    {
+      "epoch": 0.4945096554335479,
+      "grad_norm": 10.58877944946289,
+      "learning_rate": 0.00010293620594250974,
+      "loss": 2.0205,
+      "step": 1306
+    },
+    {
+      "epoch": 0.49488829988640665,
+      "grad_norm": 11.108804702758789,
+      "learning_rate": 0.00010281639317406752,
+      "loss": 2.4598,
+      "step": 1307
+    },
+    {
+      "epoch": 0.49526694433926544,
+      "grad_norm": 11.565478324890137,
+      "learning_rate": 0.00010269657635931731,
+      "loss": 1.909,
+      "step": 1308
+    },
+    {
+      "epoch": 0.49564558879212417,
+      "grad_norm": 12.14426326751709,
+      "learning_rate": 0.00010257675567039979,
+      "loss": 3.0371,
+      "step": 1309
+    },
+    {
+      "epoch": 0.49602423324498296,
+      "grad_norm": 10.85464096069336,
+      "learning_rate": 0.00010245693127946112,
+      "loss": 2.3844,
+      "step": 1310
+    },
+    {
+      "epoch": 0.49640287769784175,
+      "grad_norm": 11.257962226867676,
+      "learning_rate": 0.0001023371033586529,
+      "loss": 2.1763,
+      "step": 1311
+    },
+    {
+      "epoch": 0.4967815221507005,
+      "grad_norm": 10.673297882080078,
+      "learning_rate": 0.00010221727208013166,
+      "loss": 1.9263,
+      "step": 1312
+    },
+    {
+      "epoch": 0.4971601666035593,
+      "grad_norm": 14.040605545043945,
+      "learning_rate": 0.00010209743761605885,
+      "loss": 2.7561,
+      "step": 1313
+    },
+    {
+      "epoch": 0.497538811056418,
+      "grad_norm": 13.651562690734863,
+      "learning_rate": 0.00010197760013860047,
+      "loss": 2.1574,
+      "step": 1314
+    },
+    {
+      "epoch": 0.4979174555092768,
+      "grad_norm": 13.463566780090332,
+      "learning_rate": 0.00010185775981992689,
+      "loss": 2.1069,
+      "step": 1315
+    },
+    {
+      "epoch": 0.49829609996213553,
+      "grad_norm": 11.810751914978027,
+      "learning_rate": 0.00010173791683221244,
+      "loss": 1.9149,
+      "step": 1316
+    },
+    {
+      "epoch": 0.4986747444149943,
+      "grad_norm": 19.515695571899414,
+      "learning_rate": 0.00010161807134763543,
+      "loss": 3.2127,
+      "step": 1317
+    },
+    {
+      "epoch": 0.4990533888678531,
+      "grad_norm": 19.75203514099121,
+      "learning_rate": 0.00010149822353837768,
+      "loss": 1.3851,
+      "step": 1318
+    },
+    {
+      "epoch": 0.49943203332071184,
+      "grad_norm": 16.31900978088379,
+      "learning_rate": 0.00010137837357662432,
+      "loss": 2.0814,
+      "step": 1319
+    },
+    {
+      "epoch": 0.49981067777357063,
+      "grad_norm": 16.237138748168945,
+      "learning_rate": 0.00010125852163456368,
+      "loss": 2.0635,
+      "step": 1320
+    },
+    {
+      "epoch": 0.5001893222264294,
+      "grad_norm": 17.72742462158203,
+      "learning_rate": 0.00010113866788438684,
+      "loss": 1.084,
+      "step": 1321
+    },
+    {
+      "epoch": 0.5005679666792882,
+      "grad_norm": 15.087898254394531,
+      "learning_rate": 0.00010101881249828748,
+      "loss": 1.5248,
+      "step": 1322
+    },
+    {
+      "epoch": 0.5005679666792882,
+      "eval_loss": 0.27747318148612976,
+      "eval_runtime": 896.6071,
+      "eval_samples_per_second": 4.961,
+      "eval_steps_per_second": 1.24,
+      "step": 1322
     }
   ],
   "logging_steps": 1,
@@ -4669,7 +9304,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 3.718153047487021e+18,
+  "total_flos": 7.436306094974042e+18,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null