diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.2502839833396441, + "epoch": 0.5005679666792882, "eval_steps": 661, - "global_step": 661, + "global_step": 1322, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -4650,6 +4650,4641 @@ "eval_samples_per_second": 4.946, "eval_steps_per_second": 1.237, "step": 661 + }, + { + "epoch": 0.2506626277925028, + "grad_norm": 11.954764366149902, + "learning_rate": 0.0001718247236861852, + "loss": 4.3744, + "step": 662 + }, + { + "epoch": 0.2510412722453616, + "grad_norm": 14.922517776489258, + "learning_rate": 0.0001717412733168675, + "loss": 4.7506, + "step": 663 + }, + { + "epoch": 0.2514199166982204, + "grad_norm": 16.80381965637207, + "learning_rate": 0.00017165771987696698, + "loss": 4.9678, + "step": 664 + }, + { + "epoch": 0.2517985611510791, + "grad_norm": 13.548487663269043, + "learning_rate": 0.00017157406348652463, + "loss": 4.0003, + "step": 665 + }, + { + "epoch": 0.2521772056039379, + "grad_norm": 14.526391983032227, + "learning_rate": 0.00017149030426572953, + "loss": 4.5138, + "step": 666 + }, + { + "epoch": 0.25255585005679665, + "grad_norm": 16.523895263671875, + "learning_rate": 0.00017140644233491837, + "loss": 4.0987, + "step": 667 + }, + { + "epoch": 0.25293449450965544, + "grad_norm": 13.643799781799316, + "learning_rate": 0.00017132247781457557, + "loss": 4.3543, + "step": 668 + }, + { + "epoch": 0.2533131389625142, + "grad_norm": 13.402974128723145, + "learning_rate": 0.00017123841082533275, + "loss": 3.5844, + "step": 669 + }, + { + "epoch": 0.25369178341537296, + "grad_norm": 16.17197036743164, + "learning_rate": 0.00017115424148796883, + "loss": 4.1618, + "step": 670 + }, + { + "epoch": 0.25407042786823175, + "grad_norm": 15.300637245178223, + "learning_rate": 0.00017106996992340983, + "loss": 2.9017, + "step": 671 + }, + { + "epoch": 0.2544490723210905, + "grad_norm": 14.559680938720703, + "learning_rate": 0.00017098559625272852, + "loss": 1.9764, + "step": 672 + }, + { + "epoch": 0.2548277167739493, + "grad_norm": 19.095439910888672, + "learning_rate": 0.00017090112059714446, + "loss": 2.9313, + "step": 673 + }, + { + "epoch": 0.255206361226808, + "grad_norm": 18.04271125793457, + "learning_rate": 0.0001708165430780237, + "loss": 1.6536, + "step": 674 + }, + { + "epoch": 0.2555850056796668, + "grad_norm": 29.377933502197266, + "learning_rate": 0.00017073186381687868, + "loss": 2.5233, + "step": 675 + }, + { + "epoch": 0.25596365013252553, + "grad_norm": 13.520868301391602, + "learning_rate": 0.00017064708293536792, + "loss": 5.1626, + "step": 676 + }, + { + "epoch": 0.2563422945853843, + "grad_norm": 10.009629249572754, + "learning_rate": 0.00017056220055529595, + "loss": 5.3031, + "step": 677 + }, + { + "epoch": 0.2567209390382431, + "grad_norm": 10.07101058959961, + "learning_rate": 0.00017047721679861326, + "loss": 4.0588, + "step": 678 + }, + { + "epoch": 0.25709958349110185, + "grad_norm": 9.826030731201172, + "learning_rate": 0.0001703921317874158, + "loss": 5.5341, + "step": 679 + }, + { + "epoch": 0.25747822794396064, + "grad_norm": 8.8646821975708, + "learning_rate": 0.00017030694564394518, + "loss": 4.0068, + "step": 680 + }, + { + "epoch": 0.25785687239681937, + "grad_norm": 9.862954139709473, + "learning_rate": 0.00017022165849058812, + "loss": 4.4291, + "step": 681 + }, + { + "epoch": 0.25823551684967816, + "grad_norm": 10.417892456054688, + "learning_rate": 0.00017013627044987656, + "loss": 4.2552, + "step": 682 + }, + { + "epoch": 0.2586141613025369, + "grad_norm": 10.884054183959961, + "learning_rate": 0.00017005078164448746, + "loss": 3.9076, + "step": 683 + }, + { + "epoch": 0.2589928057553957, + "grad_norm": 13.036945343017578, + "learning_rate": 0.00016996519219724234, + "loss": 5.0801, + "step": 684 + }, + { + "epoch": 0.2593714502082545, + "grad_norm": 14.050254821777344, + "learning_rate": 0.00016987950223110748, + "loss": 4.3475, + "step": 685 + }, + { + "epoch": 0.2597500946611132, + "grad_norm": 14.108619689941406, + "learning_rate": 0.0001697937118691936, + "loss": 3.5589, + "step": 686 + }, + { + "epoch": 0.260128739113972, + "grad_norm": 15.274951934814453, + "learning_rate": 0.00016970782123475547, + "loss": 4.3379, + "step": 687 + }, + { + "epoch": 0.26050738356683073, + "grad_norm": 16.255102157592773, + "learning_rate": 0.00016962183045119214, + "loss": 4.2574, + "step": 688 + }, + { + "epoch": 0.2608860280196895, + "grad_norm": 13.676608085632324, + "learning_rate": 0.00016953573964204638, + "loss": 4.7991, + "step": 689 + }, + { + "epoch": 0.26126467247254825, + "grad_norm": 16.22185707092285, + "learning_rate": 0.00016944954893100475, + "loss": 3.9156, + "step": 690 + }, + { + "epoch": 0.26164331692540704, + "grad_norm": 11.445201873779297, + "learning_rate": 0.0001693632584418973, + "loss": 3.3147, + "step": 691 + }, + { + "epoch": 0.26202196137826583, + "grad_norm": 12.272850036621094, + "learning_rate": 0.0001692768682986975, + "loss": 3.7517, + "step": 692 + }, + { + "epoch": 0.26240060583112457, + "grad_norm": 13.47509479522705, + "learning_rate": 0.0001691903786255219, + "loss": 3.0187, + "step": 693 + }, + { + "epoch": 0.26277925028398336, + "grad_norm": 12.622843742370605, + "learning_rate": 0.00016910378954663013, + "loss": 3.404, + "step": 694 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 17.651247024536133, + "learning_rate": 0.00016901710118642454, + "loss": 3.9933, + "step": 695 + }, + { + "epoch": 0.2635365391897009, + "grad_norm": 17.895601272583008, + "learning_rate": 0.0001689303136694502, + "loss": 2.997, + "step": 696 + }, + { + "epoch": 0.2639151836425596, + "grad_norm": 17.362606048583984, + "learning_rate": 0.0001688434271203946, + "loss": 3.9413, + "step": 697 + }, + { + "epoch": 0.2642938280954184, + "grad_norm": 18.201135635375977, + "learning_rate": 0.00016875644166408754, + "loss": 3.0196, + "step": 698 + }, + { + "epoch": 0.2646724725482772, + "grad_norm": 34.7093391418457, + "learning_rate": 0.00016866935742550083, + "loss": 3.1038, + "step": 699 + }, + { + "epoch": 0.2650511170011359, + "grad_norm": 31.69573974609375, + "learning_rate": 0.00016858217452974837, + "loss": 3.6338, + "step": 700 + }, + { + "epoch": 0.2654297614539947, + "grad_norm": 8.198531150817871, + "learning_rate": 0.0001684948931020856, + "loss": 5.0433, + "step": 701 + }, + { + "epoch": 0.26580840590685345, + "grad_norm": 8.688183784484863, + "learning_rate": 0.0001684075132679097, + "loss": 5.2619, + "step": 702 + }, + { + "epoch": 0.26618705035971224, + "grad_norm": 11.000321388244629, + "learning_rate": 0.00016832003515275914, + "loss": 4.5282, + "step": 703 + }, + { + "epoch": 0.266565694812571, + "grad_norm": 10.947884559631348, + "learning_rate": 0.00016823245888231356, + "loss": 4.5732, + "step": 704 + }, + { + "epoch": 0.26694433926542976, + "grad_norm": 11.00478744506836, + "learning_rate": 0.0001681447845823937, + "loss": 4.2352, + "step": 705 + }, + { + "epoch": 0.26732298371828855, + "grad_norm": 12.548285484313965, + "learning_rate": 0.00016805701237896105, + "loss": 4.8917, + "step": 706 + }, + { + "epoch": 0.2677016281711473, + "grad_norm": 9.91434097290039, + "learning_rate": 0.00016796914239811786, + "loss": 3.5194, + "step": 707 + }, + { + "epoch": 0.2680802726240061, + "grad_norm": 10.20582389831543, + "learning_rate": 0.00016788117476610677, + "loss": 3.5162, + "step": 708 + }, + { + "epoch": 0.2684589170768648, + "grad_norm": 11.576292991638184, + "learning_rate": 0.00016779310960931073, + "loss": 4.2913, + "step": 709 + }, + { + "epoch": 0.2688375615297236, + "grad_norm": 12.707137107849121, + "learning_rate": 0.0001677049470542529, + "loss": 3.8916, + "step": 710 + }, + { + "epoch": 0.26921620598258234, + "grad_norm": 13.900711059570312, + "learning_rate": 0.00016761668722759622, + "loss": 4.5028, + "step": 711 + }, + { + "epoch": 0.2695948504354411, + "grad_norm": 15.244569778442383, + "learning_rate": 0.0001675283302561435, + "loss": 4.7703, + "step": 712 + }, + { + "epoch": 0.2699734948882999, + "grad_norm": 12.6697998046875, + "learning_rate": 0.00016743987626683703, + "loss": 3.6493, + "step": 713 + }, + { + "epoch": 0.27035213934115865, + "grad_norm": 14.035780906677246, + "learning_rate": 0.00016735132538675854, + "loss": 3.7715, + "step": 714 + }, + { + "epoch": 0.27073078379401744, + "grad_norm": 14.34146785736084, + "learning_rate": 0.00016726267774312898, + "loss": 4.4825, + "step": 715 + }, + { + "epoch": 0.27110942824687617, + "grad_norm": 14.432804107666016, + "learning_rate": 0.00016717393346330828, + "loss": 3.3871, + "step": 716 + }, + { + "epoch": 0.27148807269973496, + "grad_norm": 13.750441551208496, + "learning_rate": 0.0001670850926747952, + "loss": 3.1007, + "step": 717 + }, + { + "epoch": 0.2718667171525937, + "grad_norm": 16.201181411743164, + "learning_rate": 0.00016699615550522717, + "loss": 2.6202, + "step": 718 + }, + { + "epoch": 0.2722453616054525, + "grad_norm": 18.666667938232422, + "learning_rate": 0.0001669071220823801, + "loss": 3.7933, + "step": 719 + }, + { + "epoch": 0.2726240060583112, + "grad_norm": 17.479516983032227, + "learning_rate": 0.0001668179925341682, + "loss": 3.846, + "step": 720 + }, + { + "epoch": 0.27300265051117, + "grad_norm": 21.877872467041016, + "learning_rate": 0.0001667287669886437, + "loss": 3.2863, + "step": 721 + }, + { + "epoch": 0.2733812949640288, + "grad_norm": 18.93062973022461, + "learning_rate": 0.00016663944557399692, + "loss": 2.6771, + "step": 722 + }, + { + "epoch": 0.27375993941688753, + "grad_norm": 17.544601440429688, + "learning_rate": 0.00016655002841855566, + "loss": 2.4239, + "step": 723 + }, + { + "epoch": 0.2741385838697463, + "grad_norm": 27.646818161010742, + "learning_rate": 0.00016646051565078558, + "loss": 2.6222, + "step": 724 + }, + { + "epoch": 0.27451722832260506, + "grad_norm": 34.713478088378906, + "learning_rate": 0.0001663709073992894, + "loss": 2.8671, + "step": 725 + }, + { + "epoch": 0.27489587277546385, + "grad_norm": 8.818018913269043, + "learning_rate": 0.00016628120379280728, + "loss": 4.8852, + "step": 726 + }, + { + "epoch": 0.2752745172283226, + "grad_norm": 9.805505752563477, + "learning_rate": 0.00016619140496021615, + "loss": 4.155, + "step": 727 + }, + { + "epoch": 0.27565316168118137, + "grad_norm": 8.863174438476562, + "learning_rate": 0.00016610151103052995, + "loss": 3.8106, + "step": 728 + }, + { + "epoch": 0.27603180613404016, + "grad_norm": 9.454705238342285, + "learning_rate": 0.00016601152213289913, + "loss": 4.0096, + "step": 729 + }, + { + "epoch": 0.2764104505868989, + "grad_norm": 10.974586486816406, + "learning_rate": 0.00016592143839661057, + "loss": 4.3561, + "step": 730 + }, + { + "epoch": 0.2767890950397577, + "grad_norm": 12.553147315979004, + "learning_rate": 0.0001658312599510875, + "loss": 3.901, + "step": 731 + }, + { + "epoch": 0.2771677394926164, + "grad_norm": 11.170998573303223, + "learning_rate": 0.00016574098692588915, + "loss": 4.3408, + "step": 732 + }, + { + "epoch": 0.2775463839454752, + "grad_norm": 13.828832626342773, + "learning_rate": 0.0001656506194507106, + "loss": 4.7403, + "step": 733 + }, + { + "epoch": 0.27792502839833394, + "grad_norm": 13.855401992797852, + "learning_rate": 0.00016556015765538273, + "loss": 4.6504, + "step": 734 + }, + { + "epoch": 0.27830367285119273, + "grad_norm": 11.40543270111084, + "learning_rate": 0.0001654696016698718, + "loss": 3.4119, + "step": 735 + }, + { + "epoch": 0.2786823173040515, + "grad_norm": 12.30098819732666, + "learning_rate": 0.00016537895162427955, + "loss": 3.635, + "step": 736 + }, + { + "epoch": 0.27906096175691025, + "grad_norm": 12.096563339233398, + "learning_rate": 0.0001652882076488427, + "loss": 4.2051, + "step": 737 + }, + { + "epoch": 0.27943960620976904, + "grad_norm": 11.935840606689453, + "learning_rate": 0.00016519736987393303, + "loss": 3.8025, + "step": 738 + }, + { + "epoch": 0.2798182506626278, + "grad_norm": 13.400490760803223, + "learning_rate": 0.000165106438430057, + "loss": 4.2775, + "step": 739 + }, + { + "epoch": 0.28019689511548657, + "grad_norm": 10.693985939025879, + "learning_rate": 0.00016501541344785572, + "loss": 2.8859, + "step": 740 + }, + { + "epoch": 0.2805755395683453, + "grad_norm": 13.22080135345459, + "learning_rate": 0.0001649242950581046, + "loss": 2.4878, + "step": 741 + }, + { + "epoch": 0.2809541840212041, + "grad_norm": 14.28111743927002, + "learning_rate": 0.00016483308339171335, + "loss": 3.9025, + "step": 742 + }, + { + "epoch": 0.2813328284740629, + "grad_norm": 17.349239349365234, + "learning_rate": 0.0001647417785797256, + "loss": 3.6947, + "step": 743 + }, + { + "epoch": 0.2817114729269216, + "grad_norm": 15.542529106140137, + "learning_rate": 0.0001646503807533189, + "loss": 3.1008, + "step": 744 + }, + { + "epoch": 0.2820901173797804, + "grad_norm": 13.73243522644043, + "learning_rate": 0.0001645588900438043, + "loss": 2.3237, + "step": 745 + }, + { + "epoch": 0.28246876183263914, + "grad_norm": 18.583194732666016, + "learning_rate": 0.0001644673065826264, + "loss": 3.3123, + "step": 746 + }, + { + "epoch": 0.2828474062854979, + "grad_norm": 20.04288673400879, + "learning_rate": 0.00016437563050136303, + "loss": 2.8265, + "step": 747 + }, + { + "epoch": 0.28322605073835666, + "grad_norm": 15.773954391479492, + "learning_rate": 0.00016428386193172506, + "loss": 2.1103, + "step": 748 + }, + { + "epoch": 0.28360469519121545, + "grad_norm": 19.099443435668945, + "learning_rate": 0.0001641920010055563, + "loss": 1.9673, + "step": 749 + }, + { + "epoch": 0.28398333964407424, + "grad_norm": 40.27873611450195, + "learning_rate": 0.00016410004785483316, + "loss": 5.3713, + "step": 750 + }, + { + "epoch": 0.284361984096933, + "grad_norm": 10.12539005279541, + "learning_rate": 0.00016400800261166465, + "loss": 4.9746, + "step": 751 + }, + { + "epoch": 0.28474062854979176, + "grad_norm": 10.15647029876709, + "learning_rate": 0.000163915865408292, + "loss": 4.8416, + "step": 752 + }, + { + "epoch": 0.2851192730026505, + "grad_norm": 9.944365501403809, + "learning_rate": 0.00016382363637708865, + "loss": 4.3926, + "step": 753 + }, + { + "epoch": 0.2854979174555093, + "grad_norm": 9.265400886535645, + "learning_rate": 0.0001637313156505598, + "loss": 3.6671, + "step": 754 + }, + { + "epoch": 0.285876561908368, + "grad_norm": 10.70794677734375, + "learning_rate": 0.00016363890336134262, + "loss": 4.5764, + "step": 755 + }, + { + "epoch": 0.2862552063612268, + "grad_norm": 11.477482795715332, + "learning_rate": 0.00016354639964220568, + "loss": 4.5665, + "step": 756 + }, + { + "epoch": 0.2866338508140856, + "grad_norm": 10.951593399047852, + "learning_rate": 0.0001634538046260489, + "loss": 4.2272, + "step": 757 + }, + { + "epoch": 0.28701249526694433, + "grad_norm": 11.813931465148926, + "learning_rate": 0.00016336111844590345, + "loss": 3.8581, + "step": 758 + }, + { + "epoch": 0.2873911397198031, + "grad_norm": 12.866728782653809, + "learning_rate": 0.0001632683412349314, + "loss": 4.2478, + "step": 759 + }, + { + "epoch": 0.28776978417266186, + "grad_norm": 11.82854175567627, + "learning_rate": 0.00016317547312642562, + "loss": 4.416, + "step": 760 + }, + { + "epoch": 0.28814842862552065, + "grad_norm": 12.294820785522461, + "learning_rate": 0.00016308251425380962, + "loss": 4.3508, + "step": 761 + }, + { + "epoch": 0.2885270730783794, + "grad_norm": 11.736769676208496, + "learning_rate": 0.00016298946475063733, + "loss": 3.5181, + "step": 762 + }, + { + "epoch": 0.28890571753123817, + "grad_norm": 10.93974781036377, + "learning_rate": 0.0001628963247505927, + "loss": 2.8494, + "step": 763 + }, + { + "epoch": 0.28928436198409696, + "grad_norm": 15.365312576293945, + "learning_rate": 0.00016280309438748992, + "loss": 3.8264, + "step": 764 + }, + { + "epoch": 0.2896630064369557, + "grad_norm": 13.349133491516113, + "learning_rate": 0.00016270977379527292, + "loss": 4.0294, + "step": 765 + }, + { + "epoch": 0.2900416508898145, + "grad_norm": 13.878774642944336, + "learning_rate": 0.00016261636310801523, + "loss": 3.6898, + "step": 766 + }, + { + "epoch": 0.2904202953426732, + "grad_norm": 13.974386215209961, + "learning_rate": 0.00016252286245991987, + "loss": 3.1476, + "step": 767 + }, + { + "epoch": 0.290798939795532, + "grad_norm": 12.599011421203613, + "learning_rate": 0.0001624292719853191, + "loss": 3.7, + "step": 768 + }, + { + "epoch": 0.29117758424839074, + "grad_norm": 14.90402603149414, + "learning_rate": 0.00016233559181867414, + "loss": 2.84, + "step": 769 + }, + { + "epoch": 0.29155622870124953, + "grad_norm": 15.247842788696289, + "learning_rate": 0.00016224182209457523, + "loss": 2.9135, + "step": 770 + }, + { + "epoch": 0.29193487315410827, + "grad_norm": 15.978056907653809, + "learning_rate": 0.00016214796294774115, + "loss": 3.8344, + "step": 771 + }, + { + "epoch": 0.29231351760696705, + "grad_norm": 15.876410484313965, + "learning_rate": 0.00016205401451301925, + "loss": 2.1739, + "step": 772 + }, + { + "epoch": 0.29269216205982584, + "grad_norm": 16.77569007873535, + "learning_rate": 0.00016195997692538506, + "loss": 2.1749, + "step": 773 + }, + { + "epoch": 0.2930708065126846, + "grad_norm": 23.31680679321289, + "learning_rate": 0.00016186585031994225, + "loss": 2.8665, + "step": 774 + }, + { + "epoch": 0.29344945096554337, + "grad_norm": 10.028854370117188, + "learning_rate": 0.0001617716348319224, + "loss": 0.8782, + "step": 775 + }, + { + "epoch": 0.2938280954184021, + "grad_norm": 8.688515663146973, + "learning_rate": 0.00016167733059668478, + "loss": 3.93, + "step": 776 + }, + { + "epoch": 0.2942067398712609, + "grad_norm": 9.398271560668945, + "learning_rate": 0.00016158293774971608, + "loss": 4.4695, + "step": 777 + }, + { + "epoch": 0.2945853843241196, + "grad_norm": 10.657846450805664, + "learning_rate": 0.00016148845642663043, + "loss": 4.401, + "step": 778 + }, + { + "epoch": 0.2949640287769784, + "grad_norm": 10.177902221679688, + "learning_rate": 0.000161393886763169, + "loss": 3.8614, + "step": 779 + }, + { + "epoch": 0.2953426732298372, + "grad_norm": 10.739095687866211, + "learning_rate": 0.0001612992288951998, + "loss": 3.9037, + "step": 780 + }, + { + "epoch": 0.29572131768269594, + "grad_norm": 11.997400283813477, + "learning_rate": 0.00016120448295871783, + "loss": 3.6965, + "step": 781 + }, + { + "epoch": 0.29609996213555473, + "grad_norm": 12.047724723815918, + "learning_rate": 0.00016110964908984428, + "loss": 4.1741, + "step": 782 + }, + { + "epoch": 0.29647860658841346, + "grad_norm": 11.252506256103516, + "learning_rate": 0.00016101472742482685, + "loss": 4.2626, + "step": 783 + }, + { + "epoch": 0.29685725104127225, + "grad_norm": 10.244424819946289, + "learning_rate": 0.00016091971810003946, + "loss": 3.8371, + "step": 784 + }, + { + "epoch": 0.297235895494131, + "grad_norm": 11.887914657592773, + "learning_rate": 0.00016082462125198177, + "loss": 3.7736, + "step": 785 + }, + { + "epoch": 0.2976145399469898, + "grad_norm": 11.956177711486816, + "learning_rate": 0.00016072943701727932, + "loss": 4.0997, + "step": 786 + }, + { + "epoch": 0.29799318439984857, + "grad_norm": 11.499533653259277, + "learning_rate": 0.00016063416553268315, + "loss": 3.995, + "step": 787 + }, + { + "epoch": 0.2983718288527073, + "grad_norm": 14.390945434570312, + "learning_rate": 0.00016053880693506968, + "loss": 4.1593, + "step": 788 + }, + { + "epoch": 0.2987504733055661, + "grad_norm": 12.83646297454834, + "learning_rate": 0.00016044336136144044, + "loss": 3.2662, + "step": 789 + }, + { + "epoch": 0.2991291177584248, + "grad_norm": 12.761232376098633, + "learning_rate": 0.00016034782894892198, + "loss": 2.7353, + "step": 790 + }, + { + "epoch": 0.2995077622112836, + "grad_norm": 13.886045455932617, + "learning_rate": 0.00016025220983476555, + "loss": 3.6852, + "step": 791 + }, + { + "epoch": 0.29988640666414235, + "grad_norm": 16.431631088256836, + "learning_rate": 0.00016015650415634704, + "loss": 4.5693, + "step": 792 + }, + { + "epoch": 0.30026505111700114, + "grad_norm": 15.884831428527832, + "learning_rate": 0.00016006071205116657, + "loss": 4.0334, + "step": 793 + }, + { + "epoch": 0.3006436955698599, + "grad_norm": 16.197486877441406, + "learning_rate": 0.00015996483365684862, + "loss": 3.0299, + "step": 794 + }, + { + "epoch": 0.30102234002271866, + "grad_norm": 13.327005386352539, + "learning_rate": 0.00015986886911114145, + "loss": 2.5927, + "step": 795 + }, + { + "epoch": 0.30140098447557745, + "grad_norm": 13.829025268554688, + "learning_rate": 0.00015977281855191725, + "loss": 2.6192, + "step": 796 + }, + { + "epoch": 0.3017796289284362, + "grad_norm": 15.983011245727539, + "learning_rate": 0.00015967668211717167, + "loss": 2.3621, + "step": 797 + }, + { + "epoch": 0.302158273381295, + "grad_norm": 19.83639144897461, + "learning_rate": 0.00015958045994502384, + "loss": 2.7834, + "step": 798 + }, + { + "epoch": 0.3025369178341537, + "grad_norm": 19.925039291381836, + "learning_rate": 0.00015948415217371595, + "loss": 2.8116, + "step": 799 + }, + { + "epoch": 0.3029155622870125, + "grad_norm": 21.867938995361328, + "learning_rate": 0.0001593877589416133, + "loss": 1.7513, + "step": 800 + }, + { + "epoch": 0.3032942067398713, + "grad_norm": 8.560530662536621, + "learning_rate": 0.00015929128038720384, + "loss": 5.1137, + "step": 801 + }, + { + "epoch": 0.30367285119273, + "grad_norm": 8.668681144714355, + "learning_rate": 0.00015919471664909823, + "loss": 3.8616, + "step": 802 + }, + { + "epoch": 0.3040514956455888, + "grad_norm": 10.437018394470215, + "learning_rate": 0.0001590980678660294, + "loss": 3.7993, + "step": 803 + }, + { + "epoch": 0.30443014009844754, + "grad_norm": 10.498896598815918, + "learning_rate": 0.0001590013341768526, + "loss": 3.8712, + "step": 804 + }, + { + "epoch": 0.30480878455130633, + "grad_norm": 9.216273307800293, + "learning_rate": 0.00015890451572054482, + "loss": 4.0495, + "step": 805 + }, + { + "epoch": 0.30518742900416507, + "grad_norm": 10.508468627929688, + "learning_rate": 0.00015880761263620515, + "loss": 3.4153, + "step": 806 + }, + { + "epoch": 0.30556607345702386, + "grad_norm": 13.808286666870117, + "learning_rate": 0.00015871062506305408, + "loss": 3.4353, + "step": 807 + }, + { + "epoch": 0.30594471790988265, + "grad_norm": 12.350955963134766, + "learning_rate": 0.00015861355314043343, + "loss": 3.5035, + "step": 808 + }, + { + "epoch": 0.3063233623627414, + "grad_norm": 12.85565185546875, + "learning_rate": 0.00015851639700780642, + "loss": 3.8184, + "step": 809 + }, + { + "epoch": 0.30670200681560017, + "grad_norm": 13.963553428649902, + "learning_rate": 0.000158419156804757, + "loss": 4.7287, + "step": 810 + }, + { + "epoch": 0.3070806512684589, + "grad_norm": 15.577609062194824, + "learning_rate": 0.0001583218326709901, + "loss": 3.6594, + "step": 811 + }, + { + "epoch": 0.3074592957213177, + "grad_norm": 11.10647201538086, + "learning_rate": 0.00015822442474633115, + "loss": 2.9355, + "step": 812 + }, + { + "epoch": 0.3078379401741764, + "grad_norm": 13.10251522064209, + "learning_rate": 0.00015812693317072596, + "loss": 4.3878, + "step": 813 + }, + { + "epoch": 0.3082165846270352, + "grad_norm": 12.302017211914062, + "learning_rate": 0.00015802935808424055, + "loss": 2.902, + "step": 814 + }, + { + "epoch": 0.308595229079894, + "grad_norm": 13.663749694824219, + "learning_rate": 0.00015793169962706092, + "loss": 2.7841, + "step": 815 + }, + { + "epoch": 0.30897387353275274, + "grad_norm": 13.366521835327148, + "learning_rate": 0.00015783395793949278, + "loss": 3.4101, + "step": 816 + }, + { + "epoch": 0.30935251798561153, + "grad_norm": 16.41577911376953, + "learning_rate": 0.00015773613316196147, + "loss": 3.334, + "step": 817 + }, + { + "epoch": 0.30973116243847026, + "grad_norm": 15.605032920837402, + "learning_rate": 0.0001576382254350118, + "loss": 3.7084, + "step": 818 + }, + { + "epoch": 0.31010980689132905, + "grad_norm": 14.417840003967285, + "learning_rate": 0.00015754023489930754, + "loss": 3.0134, + "step": 819 + }, + { + "epoch": 0.3104884513441878, + "grad_norm": 17.02623176574707, + "learning_rate": 0.00015744216169563164, + "loss": 3.0973, + "step": 820 + }, + { + "epoch": 0.3108670957970466, + "grad_norm": 14.048128128051758, + "learning_rate": 0.00015734400596488567, + "loss": 2.4681, + "step": 821 + }, + { + "epoch": 0.3112457402499053, + "grad_norm": 22.928178787231445, + "learning_rate": 0.00015724576784808986, + "loss": 4.2287, + "step": 822 + }, + { + "epoch": 0.3116243847027641, + "grad_norm": 16.560827255249023, + "learning_rate": 0.00015714744748638278, + "loss": 2.381, + "step": 823 + }, + { + "epoch": 0.3120030291556229, + "grad_norm": 16.747251510620117, + "learning_rate": 0.0001570490450210211, + "loss": 1.6694, + "step": 824 + }, + { + "epoch": 0.3123816736084816, + "grad_norm": 26.673425674438477, + "learning_rate": 0.00015695056059337952, + "loss": 1.5667, + "step": 825 + }, + { + "epoch": 0.3127603180613404, + "grad_norm": 7.984857082366943, + "learning_rate": 0.00015685199434495051, + "loss": 4.4119, + "step": 826 + }, + { + "epoch": 0.31313896251419915, + "grad_norm": 9.042461395263672, + "learning_rate": 0.00015675334641734398, + "loss": 4.3624, + "step": 827 + }, + { + "epoch": 0.31351760696705794, + "grad_norm": 10.055127143859863, + "learning_rate": 0.00015665461695228735, + "loss": 4.276, + "step": 828 + }, + { + "epoch": 0.31389625141991667, + "grad_norm": 9.659919738769531, + "learning_rate": 0.00015655580609162504, + "loss": 3.5357, + "step": 829 + }, + { + "epoch": 0.31427489587277546, + "grad_norm": 10.656012535095215, + "learning_rate": 0.00015645691397731852, + "loss": 4.0171, + "step": 830 + }, + { + "epoch": 0.31465354032563425, + "grad_norm": 11.442161560058594, + "learning_rate": 0.00015635794075144588, + "loss": 3.8396, + "step": 831 + }, + { + "epoch": 0.315032184778493, + "grad_norm": 12.612800598144531, + "learning_rate": 0.00015625888655620187, + "loss": 4.2947, + "step": 832 + }, + { + "epoch": 0.3154108292313518, + "grad_norm": 12.016472816467285, + "learning_rate": 0.00015615975153389746, + "loss": 3.9577, + "step": 833 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 10.963457107543945, + "learning_rate": 0.00015606053582695984, + "loss": 4.1569, + "step": 834 + }, + { + "epoch": 0.3161681181370693, + "grad_norm": 12.133650779724121, + "learning_rate": 0.00015596123957793202, + "loss": 3.681, + "step": 835 + }, + { + "epoch": 0.31654676258992803, + "grad_norm": 12.980992317199707, + "learning_rate": 0.0001558618629294728, + "loss": 3.614, + "step": 836 + }, + { + "epoch": 0.3169254070427868, + "grad_norm": 11.19620132446289, + "learning_rate": 0.0001557624060243565, + "loss": 3.6321, + "step": 837 + }, + { + "epoch": 0.3173040514956456, + "grad_norm": 14.250601768493652, + "learning_rate": 0.00015566286900547266, + "loss": 4.1902, + "step": 838 + }, + { + "epoch": 0.31768269594850435, + "grad_norm": 12.371217727661133, + "learning_rate": 0.000155563252015826, + "loss": 2.7028, + "step": 839 + }, + { + "epoch": 0.31806134040136314, + "grad_norm": 12.687495231628418, + "learning_rate": 0.00015546355519853607, + "loss": 2.4365, + "step": 840 + }, + { + "epoch": 0.31843998485422187, + "grad_norm": 12.307214736938477, + "learning_rate": 0.00015536377869683718, + "loss": 2.7681, + "step": 841 + }, + { + "epoch": 0.31881862930708066, + "grad_norm": 15.518838882446289, + "learning_rate": 0.0001552639226540781, + "loss": 3.1019, + "step": 842 + }, + { + "epoch": 0.3191972737599394, + "grad_norm": 14.274090766906738, + "learning_rate": 0.00015516398721372179, + "loss": 2.8421, + "step": 843 + }, + { + "epoch": 0.3195759182127982, + "grad_norm": 19.139890670776367, + "learning_rate": 0.00015506397251934543, + "loss": 2.5628, + "step": 844 + }, + { + "epoch": 0.31995456266565697, + "grad_norm": 17.884008407592773, + "learning_rate": 0.00015496387871463988, + "loss": 2.3613, + "step": 845 + }, + { + "epoch": 0.3203332071185157, + "grad_norm": 16.46691131591797, + "learning_rate": 0.0001548637059434099, + "loss": 2.4046, + "step": 846 + }, + { + "epoch": 0.3207118515713745, + "grad_norm": 16.158769607543945, + "learning_rate": 0.00015476345434957346, + "loss": 2.9732, + "step": 847 + }, + { + "epoch": 0.32109049602423323, + "grad_norm": 25.788095474243164, + "learning_rate": 0.00015466312407716194, + "loss": 3.0837, + "step": 848 + }, + { + "epoch": 0.321469140477092, + "grad_norm": 27.709606170654297, + "learning_rate": 0.00015456271527031966, + "loss": 2.3595, + "step": 849 + }, + { + "epoch": 0.32184778492995075, + "grad_norm": 27.167621612548828, + "learning_rate": 0.00015446222807330383, + "loss": 2.2286, + "step": 850 + }, + { + "epoch": 0.32222642938280954, + "grad_norm": 8.955855369567871, + "learning_rate": 0.00015436166263048425, + "loss": 4.3385, + "step": 851 + }, + { + "epoch": 0.32260507383566833, + "grad_norm": 8.619714736938477, + "learning_rate": 0.00015426101908634312, + "loss": 3.7368, + "step": 852 + }, + { + "epoch": 0.32298371828852707, + "grad_norm": 9.597879409790039, + "learning_rate": 0.00015416029758547493, + "loss": 3.8133, + "step": 853 + }, + { + "epoch": 0.32336236274138586, + "grad_norm": 10.818007469177246, + "learning_rate": 0.00015405949827258604, + "loss": 4.1761, + "step": 854 + }, + { + "epoch": 0.3237410071942446, + "grad_norm": 10.386642456054688, + "learning_rate": 0.00015395862129249474, + "loss": 3.6592, + "step": 855 + }, + { + "epoch": 0.3241196516471034, + "grad_norm": 11.960341453552246, + "learning_rate": 0.00015385766679013081, + "loss": 3.6471, + "step": 856 + }, + { + "epoch": 0.3244982960999621, + "grad_norm": 13.14782428741455, + "learning_rate": 0.00015375663491053545, + "loss": 3.9707, + "step": 857 + }, + { + "epoch": 0.3248769405528209, + "grad_norm": 12.082589149475098, + "learning_rate": 0.000153655525798861, + "loss": 3.5612, + "step": 858 + }, + { + "epoch": 0.3252555850056797, + "grad_norm": 11.448456764221191, + "learning_rate": 0.00015355433960037077, + "loss": 3.737, + "step": 859 + }, + { + "epoch": 0.3256342294585384, + "grad_norm": 12.987861633300781, + "learning_rate": 0.0001534530764604389, + "loss": 3.8811, + "step": 860 + }, + { + "epoch": 0.3260128739113972, + "grad_norm": 12.712824821472168, + "learning_rate": 0.00015335173652454985, + "loss": 3.5249, + "step": 861 + }, + { + "epoch": 0.32639151836425595, + "grad_norm": 11.121883392333984, + "learning_rate": 0.00015325031993829868, + "loss": 2.6656, + "step": 862 + }, + { + "epoch": 0.32677016281711474, + "grad_norm": 14.241087913513184, + "learning_rate": 0.0001531488268473904, + "loss": 3.9731, + "step": 863 + }, + { + "epoch": 0.3271488072699735, + "grad_norm": 13.581354141235352, + "learning_rate": 0.00015304725739764, + "loss": 3.2629, + "step": 864 + }, + { + "epoch": 0.32752745172283226, + "grad_norm": 15.62415599822998, + "learning_rate": 0.00015294561173497215, + "loss": 3.9048, + "step": 865 + }, + { + "epoch": 0.32790609617569105, + "grad_norm": 12.98635196685791, + "learning_rate": 0.00015284389000542103, + "loss": 2.6195, + "step": 866 + }, + { + "epoch": 0.3282847406285498, + "grad_norm": 15.516901016235352, + "learning_rate": 0.00015274209235513014, + "loss": 3.2572, + "step": 867 + }, + { + "epoch": 0.3286633850814086, + "grad_norm": 13.609155654907227, + "learning_rate": 0.00015264021893035193, + "loss": 2.7172, + "step": 868 + }, + { + "epoch": 0.3290420295342673, + "grad_norm": 15.977977752685547, + "learning_rate": 0.00015253826987744789, + "loss": 3.2585, + "step": 869 + }, + { + "epoch": 0.3294206739871261, + "grad_norm": 14.53819751739502, + "learning_rate": 0.00015243624534288803, + "loss": 2.9884, + "step": 870 + }, + { + "epoch": 0.32979931843998483, + "grad_norm": 18.142704010009766, + "learning_rate": 0.00015233414547325083, + "loss": 3.0888, + "step": 871 + }, + { + "epoch": 0.3301779628928436, + "grad_norm": 17.9478816986084, + "learning_rate": 0.00015223197041522307, + "loss": 2.0567, + "step": 872 + }, + { + "epoch": 0.33055660734570236, + "grad_norm": 15.515186309814453, + "learning_rate": 0.00015212972031559946, + "loss": 2.056, + "step": 873 + }, + { + "epoch": 0.33093525179856115, + "grad_norm": 20.402446746826172, + "learning_rate": 0.00015202739532128265, + "loss": 1.867, + "step": 874 + }, + { + "epoch": 0.33131389625141994, + "grad_norm": 13.986373901367188, + "learning_rate": 0.0001519249955792827, + "loss": 1.5481, + "step": 875 + }, + { + "epoch": 0.33169254070427867, + "grad_norm": 9.035808563232422, + "learning_rate": 0.00015182252123671725, + "loss": 4.4831, + "step": 876 + }, + { + "epoch": 0.33207118515713746, + "grad_norm": 9.396247863769531, + "learning_rate": 0.000151719972440811, + "loss": 4.1913, + "step": 877 + }, + { + "epoch": 0.3324498296099962, + "grad_norm": 10.645395278930664, + "learning_rate": 0.0001516173493388957, + "loss": 4.71, + "step": 878 + }, + { + "epoch": 0.332828474062855, + "grad_norm": 11.150712966918945, + "learning_rate": 0.00015151465207840977, + "loss": 4.2096, + "step": 879 + }, + { + "epoch": 0.3332071185157137, + "grad_norm": 10.260977745056152, + "learning_rate": 0.00015141188080689826, + "loss": 3.1771, + "step": 880 + }, + { + "epoch": 0.3335857629685725, + "grad_norm": 10.818496704101562, + "learning_rate": 0.00015130903567201243, + "loss": 2.9112, + "step": 881 + }, + { + "epoch": 0.3339644074214313, + "grad_norm": 11.379049301147461, + "learning_rate": 0.0001512061168215098, + "loss": 3.8058, + "step": 882 + }, + { + "epoch": 0.33434305187429003, + "grad_norm": 12.107205390930176, + "learning_rate": 0.00015110312440325368, + "loss": 3.271, + "step": 883 + }, + { + "epoch": 0.3347216963271488, + "grad_norm": 12.379898071289062, + "learning_rate": 0.0001510000585652132, + "loss": 2.992, + "step": 884 + }, + { + "epoch": 0.33510034078000756, + "grad_norm": 11.953714370727539, + "learning_rate": 0.00015089691945546283, + "loss": 3.1566, + "step": 885 + }, + { + "epoch": 0.33547898523286634, + "grad_norm": 13.055462837219238, + "learning_rate": 0.00015079370722218243, + "loss": 2.5646, + "step": 886 + }, + { + "epoch": 0.3358576296857251, + "grad_norm": 12.182693481445312, + "learning_rate": 0.00015069042201365683, + "loss": 2.9366, + "step": 887 + }, + { + "epoch": 0.33623627413858387, + "grad_norm": 13.180964469909668, + "learning_rate": 0.00015058706397827573, + "loss": 4.0075, + "step": 888 + }, + { + "epoch": 0.33661491859144266, + "grad_norm": 12.289628982543945, + "learning_rate": 0.0001504836332645335, + "loss": 2.5069, + "step": 889 + }, + { + "epoch": 0.3369935630443014, + "grad_norm": 11.804617881774902, + "learning_rate": 0.00015038013002102892, + "loss": 2.0101, + "step": 890 + }, + { + "epoch": 0.3373722074971602, + "grad_norm": 14.811490058898926, + "learning_rate": 0.00015027655439646488, + "loss": 3.8222, + "step": 891 + }, + { + "epoch": 0.3377508519500189, + "grad_norm": 15.269726753234863, + "learning_rate": 0.00015017290653964835, + "loss": 2.9604, + "step": 892 + }, + { + "epoch": 0.3381294964028777, + "grad_norm": 13.442567825317383, + "learning_rate": 0.0001500691865994901, + "loss": 3.0957, + "step": 893 + }, + { + "epoch": 0.33850814085573644, + "grad_norm": 15.218294143676758, + "learning_rate": 0.00014996539472500437, + "loss": 2.7899, + "step": 894 + }, + { + "epoch": 0.33888678530859523, + "grad_norm": 13.601509094238281, + "learning_rate": 0.00014986153106530883, + "loss": 2.6892, + "step": 895 + }, + { + "epoch": 0.339265429761454, + "grad_norm": 13.653763771057129, + "learning_rate": 0.00014975759576962424, + "loss": 2.2024, + "step": 896 + }, + { + "epoch": 0.33964407421431275, + "grad_norm": 17.351696014404297, + "learning_rate": 0.00014965358898727423, + "loss": 2.777, + "step": 897 + }, + { + "epoch": 0.34002271866717154, + "grad_norm": 32.49483108520508, + "learning_rate": 0.00014954951086768525, + "loss": 2.3369, + "step": 898 + }, + { + "epoch": 0.3404013631200303, + "grad_norm": 37.68558120727539, + "learning_rate": 0.0001494453615603862, + "loss": 3.0663, + "step": 899 + }, + { + "epoch": 0.34078000757288907, + "grad_norm": 27.460304260253906, + "learning_rate": 0.00014934114121500818, + "loss": 2.0837, + "step": 900 + }, + { + "epoch": 0.3411586520257478, + "grad_norm": 19.336395263671875, + "learning_rate": 0.00014923684998128446, + "loss": 4.6271, + "step": 901 + }, + { + "epoch": 0.3415372964786066, + "grad_norm": 11.99440860748291, + "learning_rate": 0.00014913248800905006, + "loss": 4.4893, + "step": 902 + }, + { + "epoch": 0.3419159409314654, + "grad_norm": 10.598093032836914, + "learning_rate": 0.00014902805544824175, + "loss": 3.813, + "step": 903 + }, + { + "epoch": 0.3422945853843241, + "grad_norm": 10.407685279846191, + "learning_rate": 0.00014892355244889752, + "loss": 4.3924, + "step": 904 + }, + { + "epoch": 0.3426732298371829, + "grad_norm": 11.969062805175781, + "learning_rate": 0.0001488189791611568, + "loss": 3.9199, + "step": 905 + }, + { + "epoch": 0.34305187429004164, + "grad_norm": 10.909595489501953, + "learning_rate": 0.00014871433573525976, + "loss": 3.5213, + "step": 906 + }, + { + "epoch": 0.3434305187429004, + "grad_norm": 11.326231956481934, + "learning_rate": 0.00014860962232154755, + "loss": 3.2244, + "step": 907 + }, + { + "epoch": 0.34380916319575916, + "grad_norm": 12.978373527526855, + "learning_rate": 0.00014850483907046175, + "loss": 4.087, + "step": 908 + }, + { + "epoch": 0.34418780764861795, + "grad_norm": 13.51850414276123, + "learning_rate": 0.00014839998613254432, + "loss": 3.7443, + "step": 909 + }, + { + "epoch": 0.34456645210147674, + "grad_norm": 13.952939987182617, + "learning_rate": 0.00014829506365843725, + "loss": 4.2233, + "step": 910 + }, + { + "epoch": 0.3449450965543355, + "grad_norm": 14.313178062438965, + "learning_rate": 0.00014819007179888262, + "loss": 3.744, + "step": 911 + }, + { + "epoch": 0.34532374100719426, + "grad_norm": 13.837858200073242, + "learning_rate": 0.000148085010704722, + "loss": 3.4982, + "step": 912 + }, + { + "epoch": 0.345702385460053, + "grad_norm": 12.670626640319824, + "learning_rate": 0.0001479798805268965, + "loss": 2.5508, + "step": 913 + }, + { + "epoch": 0.3460810299129118, + "grad_norm": 14.74666976928711, + "learning_rate": 0.00014787468141644658, + "loss": 3.6456, + "step": 914 + }, + { + "epoch": 0.3464596743657705, + "grad_norm": 14.362848281860352, + "learning_rate": 0.0001477694135245116, + "loss": 3.3422, + "step": 915 + }, + { + "epoch": 0.3468383188186293, + "grad_norm": 12.029289245605469, + "learning_rate": 0.00014766407700232974, + "loss": 2.7627, + "step": 916 + }, + { + "epoch": 0.3472169632714881, + "grad_norm": 13.28024673461914, + "learning_rate": 0.00014755867200123789, + "loss": 2.4415, + "step": 917 + }, + { + "epoch": 0.34759560772434683, + "grad_norm": 16.25495719909668, + "learning_rate": 0.00014745319867267122, + "loss": 3.8264, + "step": 918 + }, + { + "epoch": 0.3479742521772056, + "grad_norm": 14.264103889465332, + "learning_rate": 0.00014734765716816316, + "loss": 2.3678, + "step": 919 + }, + { + "epoch": 0.34835289663006436, + "grad_norm": 16.4278507232666, + "learning_rate": 0.00014724204763934498, + "loss": 3.2339, + "step": 920 + }, + { + "epoch": 0.34873154108292315, + "grad_norm": 12.346698760986328, + "learning_rate": 0.0001471363702379458, + "loss": 2.3282, + "step": 921 + }, + { + "epoch": 0.3491101855357819, + "grad_norm": 16.423734664916992, + "learning_rate": 0.00014703062511579212, + "loss": 2.2432, + "step": 922 + }, + { + "epoch": 0.34948882998864067, + "grad_norm": 36.795833587646484, + "learning_rate": 0.00014692481242480784, + "loss": 2.8118, + "step": 923 + }, + { + "epoch": 0.34986747444149946, + "grad_norm": 22.425527572631836, + "learning_rate": 0.0001468189323170139, + "loss": 2.0988, + "step": 924 + }, + { + "epoch": 0.3502461188943582, + "grad_norm": 21.815776824951172, + "learning_rate": 0.00014671298494452808, + "loss": 2.1386, + "step": 925 + }, + { + "epoch": 0.350624763347217, + "grad_norm": 9.949638366699219, + "learning_rate": 0.0001466069704595648, + "loss": 4.477, + "step": 926 + }, + { + "epoch": 0.3510034078000757, + "grad_norm": 10.098043441772461, + "learning_rate": 0.000146500889014435, + "loss": 3.9642, + "step": 927 + }, + { + "epoch": 0.3513820522529345, + "grad_norm": 9.761126518249512, + "learning_rate": 0.00014639474076154566, + "loss": 3.7614, + "step": 928 + }, + { + "epoch": 0.35176069670579324, + "grad_norm": 11.026824951171875, + "learning_rate": 0.00014628852585339984, + "loss": 4.2254, + "step": 929 + }, + { + "epoch": 0.35213934115865203, + "grad_norm": 11.74862289428711, + "learning_rate": 0.00014618224444259628, + "loss": 3.1092, + "step": 930 + }, + { + "epoch": 0.35251798561151076, + "grad_norm": 10.165847778320312, + "learning_rate": 0.00014607589668182947, + "loss": 2.6807, + "step": 931 + }, + { + "epoch": 0.35289663006436955, + "grad_norm": 12.149169921875, + "learning_rate": 0.00014596948272388896, + "loss": 2.9791, + "step": 932 + }, + { + "epoch": 0.35327527451722834, + "grad_norm": 12.490134239196777, + "learning_rate": 0.0001458630027216596, + "loss": 3.9789, + "step": 933 + }, + { + "epoch": 0.3536539189700871, + "grad_norm": 13.850975036621094, + "learning_rate": 0.000145756456828121, + "loss": 3.4066, + "step": 934 + }, + { + "epoch": 0.35403256342294587, + "grad_norm": 15.180842399597168, + "learning_rate": 0.00014564984519634754, + "loss": 3.2428, + "step": 935 + }, + { + "epoch": 0.3544112078758046, + "grad_norm": 13.27072811126709, + "learning_rate": 0.00014554316797950797, + "loss": 2.6158, + "step": 936 + }, + { + "epoch": 0.3547898523286634, + "grad_norm": 12.887181282043457, + "learning_rate": 0.0001454364253308653, + "loss": 3.6556, + "step": 937 + }, + { + "epoch": 0.3551684967815221, + "grad_norm": 14.38553237915039, + "learning_rate": 0.00014532961740377652, + "loss": 3.6761, + "step": 938 + }, + { + "epoch": 0.3555471412343809, + "grad_norm": 13.48885726928711, + "learning_rate": 0.00014522274435169245, + "loss": 2.8547, + "step": 939 + }, + { + "epoch": 0.3559257856872397, + "grad_norm": 12.696219444274902, + "learning_rate": 0.00014511580632815742, + "loss": 2.4686, + "step": 940 + }, + { + "epoch": 0.35630443014009844, + "grad_norm": 12.52086067199707, + "learning_rate": 0.00014500880348680917, + "loss": 3.3242, + "step": 941 + }, + { + "epoch": 0.3566830745929572, + "grad_norm": 13.25282096862793, + "learning_rate": 0.00014490173598137845, + "loss": 2.3792, + "step": 942 + }, + { + "epoch": 0.35706171904581596, + "grad_norm": 12.935431480407715, + "learning_rate": 0.0001447946039656891, + "loss": 2.1999, + "step": 943 + }, + { + "epoch": 0.35744036349867475, + "grad_norm": 13.861615180969238, + "learning_rate": 0.00014468740759365743, + "loss": 2.7313, + "step": 944 + }, + { + "epoch": 0.3578190079515335, + "grad_norm": 15.322652816772461, + "learning_rate": 0.00014458014701929239, + "loss": 2.6993, + "step": 945 + }, + { + "epoch": 0.3581976524043923, + "grad_norm": 15.554706573486328, + "learning_rate": 0.00014447282239669502, + "loss": 2.1881, + "step": 946 + }, + { + "epoch": 0.35857629685725106, + "grad_norm": 15.744156837463379, + "learning_rate": 0.0001443654338800585, + "loss": 3.1557, + "step": 947 + }, + { + "epoch": 0.3589549413101098, + "grad_norm": 15.191664695739746, + "learning_rate": 0.00014425798162366775, + "loss": 2.1443, + "step": 948 + }, + { + "epoch": 0.3593335857629686, + "grad_norm": 16.317235946655273, + "learning_rate": 0.00014415046578189928, + "loss": 1.921, + "step": 949 + }, + { + "epoch": 0.3597122302158273, + "grad_norm": 35.329994201660156, + "learning_rate": 0.0001440428865092209, + "loss": 3.1096, + "step": 950 + }, + { + "epoch": 0.3600908746686861, + "grad_norm": 9.379858016967773, + "learning_rate": 0.0001439352439601916, + "loss": 4.936, + "step": 951 + }, + { + "epoch": 0.36046951912154485, + "grad_norm": 10.979476928710938, + "learning_rate": 0.0001438275382894613, + "loss": 3.8354, + "step": 952 + }, + { + "epoch": 0.36084816357440364, + "grad_norm": 10.961803436279297, + "learning_rate": 0.00014371976965177062, + "loss": 3.6228, + "step": 953 + }, + { + "epoch": 0.3612268080272624, + "grad_norm": 11.461506843566895, + "learning_rate": 0.00014361193820195046, + "loss": 4.6714, + "step": 954 + }, + { + "epoch": 0.36160545248012116, + "grad_norm": 11.015750885009766, + "learning_rate": 0.0001435040440949223, + "loss": 3.3826, + "step": 955 + }, + { + "epoch": 0.36198409693297995, + "grad_norm": 10.362982749938965, + "learning_rate": 0.0001433960874856973, + "loss": 3.1965, + "step": 956 + }, + { + "epoch": 0.3623627413858387, + "grad_norm": 11.998297691345215, + "learning_rate": 0.0001432880685293766, + "loss": 3.4358, + "step": 957 + }, + { + "epoch": 0.36274138583869747, + "grad_norm": 12.979171752929688, + "learning_rate": 0.00014317998738115091, + "loss": 2.9082, + "step": 958 + }, + { + "epoch": 0.3631200302915562, + "grad_norm": 15.333057403564453, + "learning_rate": 0.00014307184419630028, + "loss": 3.7046, + "step": 959 + }, + { + "epoch": 0.363498674744415, + "grad_norm": 17.005517959594727, + "learning_rate": 0.0001429636391301938, + "loss": 4.5541, + "step": 960 + }, + { + "epoch": 0.3638773191972738, + "grad_norm": 12.545903205871582, + "learning_rate": 0.00014285537233828954, + "loss": 3.2909, + "step": 961 + }, + { + "epoch": 0.3642559636501325, + "grad_norm": 13.042165756225586, + "learning_rate": 0.00014274704397613426, + "loss": 3.3752, + "step": 962 + }, + { + "epoch": 0.3646346081029913, + "grad_norm": 13.057799339294434, + "learning_rate": 0.00014263865419936316, + "loss": 2.7918, + "step": 963 + }, + { + "epoch": 0.36501325255585004, + "grad_norm": 13.173884391784668, + "learning_rate": 0.00014253020316369968, + "loss": 3.1801, + "step": 964 + }, + { + "epoch": 0.36539189700870883, + "grad_norm": 13.131632804870605, + "learning_rate": 0.00014242169102495527, + "loss": 3.3128, + "step": 965 + }, + { + "epoch": 0.36577054146156757, + "grad_norm": 13.377184867858887, + "learning_rate": 0.0001423131179390291, + "loss": 2.649, + "step": 966 + }, + { + "epoch": 0.36614918591442636, + "grad_norm": 12.528219223022461, + "learning_rate": 0.00014220448406190807, + "loss": 3.169, + "step": 967 + }, + { + "epoch": 0.36652783036728515, + "grad_norm": 13.746808052062988, + "learning_rate": 0.0001420957895496662, + "loss": 2.7259, + "step": 968 + }, + { + "epoch": 0.3669064748201439, + "grad_norm": 21.110719680786133, + "learning_rate": 0.00014198703455846484, + "loss": 3.5514, + "step": 969 + }, + { + "epoch": 0.36728511927300267, + "grad_norm": 12.612068176269531, + "learning_rate": 0.00014187821924455208, + "loss": 2.0534, + "step": 970 + }, + { + "epoch": 0.3676637637258614, + "grad_norm": 20.146154403686523, + "learning_rate": 0.0001417693437642627, + "loss": 2.7005, + "step": 971 + }, + { + "epoch": 0.3680424081787202, + "grad_norm": 13.088459014892578, + "learning_rate": 0.00014166040827401797, + "loss": 1.9876, + "step": 972 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 18.44115447998047, + "learning_rate": 0.00014155141293032536, + "loss": 1.6056, + "step": 973 + }, + { + "epoch": 0.3687996970844377, + "grad_norm": 17.64615249633789, + "learning_rate": 0.0001414423578897783, + "loss": 1.8792, + "step": 974 + }, + { + "epoch": 0.3691783415372965, + "grad_norm": 17.28131103515625, + "learning_rate": 0.00014133324330905603, + "loss": 1.3712, + "step": 975 + }, + { + "epoch": 0.36955698599015524, + "grad_norm": 9.265350341796875, + "learning_rate": 0.0001412240693449233, + "loss": 3.5385, + "step": 976 + }, + { + "epoch": 0.36993563044301403, + "grad_norm": 10.460281372070312, + "learning_rate": 0.00014111483615423018, + "loss": 3.5476, + "step": 977 + }, + { + "epoch": 0.37031427489587276, + "grad_norm": 12.234232902526855, + "learning_rate": 0.00014100554389391182, + "loss": 5.028, + "step": 978 + }, + { + "epoch": 0.37069291934873155, + "grad_norm": 12.799249649047852, + "learning_rate": 0.0001408961927209883, + "loss": 4.389, + "step": 979 + }, + { + "epoch": 0.3710715638015903, + "grad_norm": 10.977117538452148, + "learning_rate": 0.00014078678279256423, + "loss": 3.5701, + "step": 980 + }, + { + "epoch": 0.3714502082544491, + "grad_norm": 11.370275497436523, + "learning_rate": 0.00014067731426582877, + "loss": 3.4377, + "step": 981 + }, + { + "epoch": 0.3718288527073078, + "grad_norm": 10.520308494567871, + "learning_rate": 0.00014056778729805512, + "loss": 3.1299, + "step": 982 + }, + { + "epoch": 0.3722074971601666, + "grad_norm": 12.40962028503418, + "learning_rate": 0.00014045820204660055, + "loss": 3.2693, + "step": 983 + }, + { + "epoch": 0.3725861416130254, + "grad_norm": 11.964371681213379, + "learning_rate": 0.00014034855866890602, + "loss": 3.8952, + "step": 984 + }, + { + "epoch": 0.3729647860658841, + "grad_norm": 12.887282371520996, + "learning_rate": 0.000140238857322496, + "loss": 2.8382, + "step": 985 + }, + { + "epoch": 0.3733434305187429, + "grad_norm": 12.985127449035645, + "learning_rate": 0.0001401290981649783, + "loss": 3.4678, + "step": 986 + }, + { + "epoch": 0.37372207497160165, + "grad_norm": 14.884915351867676, + "learning_rate": 0.0001400192813540437, + "loss": 3.6069, + "step": 987 + }, + { + "epoch": 0.37410071942446044, + "grad_norm": 14.4747953414917, + "learning_rate": 0.00013990940704746585, + "loss": 2.9554, + "step": 988 + }, + { + "epoch": 0.37447936387731917, + "grad_norm": 14.479387283325195, + "learning_rate": 0.00013979947540310102, + "loss": 2.7698, + "step": 989 + }, + { + "epoch": 0.37485800833017796, + "grad_norm": 14.599347114562988, + "learning_rate": 0.00013968948657888788, + "loss": 2.87, + "step": 990 + }, + { + "epoch": 0.37523665278303675, + "grad_norm": 14.958257675170898, + "learning_rate": 0.00013957944073284714, + "loss": 2.8528, + "step": 991 + }, + { + "epoch": 0.3756152972358955, + "grad_norm": 15.495623588562012, + "learning_rate": 0.00013946933802308156, + "loss": 3.7293, + "step": 992 + }, + { + "epoch": 0.3759939416887543, + "grad_norm": 10.040778160095215, + "learning_rate": 0.00013935917860777555, + "loss": 1.5618, + "step": 993 + }, + { + "epoch": 0.376372586141613, + "grad_norm": 15.657940864562988, + "learning_rate": 0.00013924896264519491, + "loss": 2.2425, + "step": 994 + }, + { + "epoch": 0.3767512305944718, + "grad_norm": 13.899797439575195, + "learning_rate": 0.00013913869029368682, + "loss": 2.3471, + "step": 995 + }, + { + "epoch": 0.37712987504733053, + "grad_norm": 14.696097373962402, + "learning_rate": 0.00013902836171167938, + "loss": 2.637, + "step": 996 + }, + { + "epoch": 0.3775085195001893, + "grad_norm": 18.564838409423828, + "learning_rate": 0.00013891797705768155, + "loss": 1.3815, + "step": 997 + }, + { + "epoch": 0.3778871639530481, + "grad_norm": 17.13040542602539, + "learning_rate": 0.00013880753649028274, + "loss": 2.0306, + "step": 998 + }, + { + "epoch": 0.37826580840590684, + "grad_norm": 22.14106559753418, + "learning_rate": 0.00013869704016815276, + "loss": 2.567, + "step": 999 + }, + { + "epoch": 0.37864445285876563, + "grad_norm": 20.40251922607422, + "learning_rate": 0.00013858648825004156, + "loss": 2.1573, + "step": 1000 + }, + { + "epoch": 0.37902309731162437, + "grad_norm": 9.879082679748535, + "learning_rate": 0.00013847588089477888, + "loss": 5.068, + "step": 1001 + }, + { + "epoch": 0.37940174176448316, + "grad_norm": 9.758732795715332, + "learning_rate": 0.00013836521826127412, + "loss": 3.3331, + "step": 1002 + }, + { + "epoch": 0.3797803862173419, + "grad_norm": 10.761160850524902, + "learning_rate": 0.00013825450050851623, + "loss": 3.4942, + "step": 1003 + }, + { + "epoch": 0.3801590306702007, + "grad_norm": 10.23125171661377, + "learning_rate": 0.00013814372779557312, + "loss": 3.689, + "step": 1004 + }, + { + "epoch": 0.38053767512305947, + "grad_norm": 12.541388511657715, + "learning_rate": 0.00013803290028159185, + "loss": 4.2033, + "step": 1005 + }, + { + "epoch": 0.3809163195759182, + "grad_norm": 10.713353157043457, + "learning_rate": 0.00013792201812579816, + "loss": 3.4712, + "step": 1006 + }, + { + "epoch": 0.381294964028777, + "grad_norm": 11.47879695892334, + "learning_rate": 0.00013781108148749625, + "loss": 3.4701, + "step": 1007 + }, + { + "epoch": 0.38167360848163573, + "grad_norm": 10.18622875213623, + "learning_rate": 0.00013770009052606862, + "loss": 2.702, + "step": 1008 + }, + { + "epoch": 0.3820522529344945, + "grad_norm": 15.202455520629883, + "learning_rate": 0.00013758904540097587, + "loss": 2.9407, + "step": 1009 + }, + { + "epoch": 0.38243089738735325, + "grad_norm": 13.018632888793945, + "learning_rate": 0.00013747794627175632, + "loss": 3.8735, + "step": 1010 + }, + { + "epoch": 0.38280954184021204, + "grad_norm": 13.541316986083984, + "learning_rate": 0.00013736679329802594, + "loss": 2.2223, + "step": 1011 + }, + { + "epoch": 0.38318818629307083, + "grad_norm": 14.50750732421875, + "learning_rate": 0.00013725558663947807, + "loss": 3.7973, + "step": 1012 + }, + { + "epoch": 0.38356683074592957, + "grad_norm": 16.93392562866211, + "learning_rate": 0.00013714432645588312, + "loss": 4.062, + "step": 1013 + }, + { + "epoch": 0.38394547519878836, + "grad_norm": 13.330880165100098, + "learning_rate": 0.00013703301290708843, + "loss": 2.7007, + "step": 1014 + }, + { + "epoch": 0.3843241196516471, + "grad_norm": 13.55959701538086, + "learning_rate": 0.00013692164615301808, + "loss": 3.2762, + "step": 1015 + }, + { + "epoch": 0.3847027641045059, + "grad_norm": 13.232234001159668, + "learning_rate": 0.00013681022635367245, + "loss": 2.4535, + "step": 1016 + }, + { + "epoch": 0.3850814085573646, + "grad_norm": 15.532430648803711, + "learning_rate": 0.00013669875366912823, + "loss": 2.5774, + "step": 1017 + }, + { + "epoch": 0.3854600530102234, + "grad_norm": 15.254117965698242, + "learning_rate": 0.00013658722825953806, + "loss": 2.6327, + "step": 1018 + }, + { + "epoch": 0.3858386974630822, + "grad_norm": 16.092777252197266, + "learning_rate": 0.00013647565028513037, + "loss": 2.2312, + "step": 1019 + }, + { + "epoch": 0.3862173419159409, + "grad_norm": 16.16512107849121, + "learning_rate": 0.00013636401990620896, + "loss": 2.8618, + "step": 1020 + }, + { + "epoch": 0.3865959863687997, + "grad_norm": 12.459589958190918, + "learning_rate": 0.00013625233728315318, + "loss": 2.3862, + "step": 1021 + }, + { + "epoch": 0.38697463082165845, + "grad_norm": 14.847145080566406, + "learning_rate": 0.0001361406025764172, + "loss": 1.8623, + "step": 1022 + }, + { + "epoch": 0.38735327527451724, + "grad_norm": 16.524620056152344, + "learning_rate": 0.00013602881594653016, + "loss": 1.6795, + "step": 1023 + }, + { + "epoch": 0.387731919727376, + "grad_norm": 24.25473976135254, + "learning_rate": 0.00013591697755409573, + "loss": 3.2906, + "step": 1024 + }, + { + "epoch": 0.38811056418023476, + "grad_norm": 27.711610794067383, + "learning_rate": 0.0001358050875597919, + "loss": 1.9261, + "step": 1025 + }, + { + "epoch": 0.38848920863309355, + "grad_norm": 9.252058029174805, + "learning_rate": 0.00013569314612437098, + "loss": 4.3016, + "step": 1026 + }, + { + "epoch": 0.3888678530859523, + "grad_norm": 10.211181640625, + "learning_rate": 0.00013558115340865897, + "loss": 4.699, + "step": 1027 + }, + { + "epoch": 0.3892464975388111, + "grad_norm": 9.520386695861816, + "learning_rate": 0.0001354691095735557, + "loss": 3.8331, + "step": 1028 + }, + { + "epoch": 0.3896251419916698, + "grad_norm": 11.07938289642334, + "learning_rate": 0.00013535701478003439, + "loss": 2.8687, + "step": 1029 + }, + { + "epoch": 0.3900037864445286, + "grad_norm": 11.311447143554688, + "learning_rate": 0.0001352448691891414, + "loss": 3.0744, + "step": 1030 + }, + { + "epoch": 0.39038243089738733, + "grad_norm": 11.157035827636719, + "learning_rate": 0.00013513267296199618, + "loss": 3.2869, + "step": 1031 + }, + { + "epoch": 0.3907610753502461, + "grad_norm": 10.669620513916016, + "learning_rate": 0.0001350204262597909, + "loss": 3.5071, + "step": 1032 + }, + { + "epoch": 0.39113971980310486, + "grad_norm": 11.097278594970703, + "learning_rate": 0.00013490812924379022, + "loss": 2.2786, + "step": 1033 + }, + { + "epoch": 0.39151836425596365, + "grad_norm": 12.581409454345703, + "learning_rate": 0.0001347957820753311, + "loss": 3.7062, + "step": 1034 + }, + { + "epoch": 0.39189700870882244, + "grad_norm": 10.81505298614502, + "learning_rate": 0.00013468338491582252, + "loss": 3.2618, + "step": 1035 + }, + { + "epoch": 0.39227565316168117, + "grad_norm": 12.934078216552734, + "learning_rate": 0.00013457093792674537, + "loss": 3.0491, + "step": 1036 + }, + { + "epoch": 0.39265429761453996, + "grad_norm": 12.945857048034668, + "learning_rate": 0.00013445844126965206, + "loss": 2.3032, + "step": 1037 + }, + { + "epoch": 0.3930329420673987, + "grad_norm": 13.578465461730957, + "learning_rate": 0.00013434589510616634, + "loss": 2.4166, + "step": 1038 + }, + { + "epoch": 0.3934115865202575, + "grad_norm": 15.570049285888672, + "learning_rate": 0.00013423329959798315, + "loss": 3.2948, + "step": 1039 + }, + { + "epoch": 0.3937902309731162, + "grad_norm": 15.420329093933105, + "learning_rate": 0.0001341206549068683, + "loss": 3.0431, + "step": 1040 + }, + { + "epoch": 0.394168875425975, + "grad_norm": 16.096786499023438, + "learning_rate": 0.00013400796119465824, + "loss": 2.3038, + "step": 1041 + }, + { + "epoch": 0.3945475198788338, + "grad_norm": 15.989263534545898, + "learning_rate": 0.00013389521862325985, + "loss": 3.3304, + "step": 1042 + }, + { + "epoch": 0.39492616433169253, + "grad_norm": 16.243566513061523, + "learning_rate": 0.00013378242735465022, + "loss": 4.0894, + "step": 1043 + }, + { + "epoch": 0.3953048087845513, + "grad_norm": 13.244513511657715, + "learning_rate": 0.00013366958755087644, + "loss": 2.5639, + "step": 1044 + }, + { + "epoch": 0.39568345323741005, + "grad_norm": 13.560445785522461, + "learning_rate": 0.00013355669937405526, + "loss": 2.6478, + "step": 1045 + }, + { + "epoch": 0.39606209769026884, + "grad_norm": 19.593982696533203, + "learning_rate": 0.00013344376298637294, + "loss": 2.9598, + "step": 1046 + }, + { + "epoch": 0.3964407421431276, + "grad_norm": 13.61347770690918, + "learning_rate": 0.00013333077855008508, + "loss": 2.0055, + "step": 1047 + }, + { + "epoch": 0.39681938659598637, + "grad_norm": 14.087455749511719, + "learning_rate": 0.00013321774622751618, + "loss": 2.1689, + "step": 1048 + }, + { + "epoch": 0.39719803104884516, + "grad_norm": 20.69855499267578, + "learning_rate": 0.0001331046661810597, + "loss": 1.4113, + "step": 1049 + }, + { + "epoch": 0.3975766755017039, + "grad_norm": 34.63194274902344, + "learning_rate": 0.00013299153857317748, + "loss": 2.1471, + "step": 1050 + }, + { + "epoch": 0.3979553199545627, + "grad_norm": 8.219612121582031, + "learning_rate": 0.0001328783635663999, + "loss": 3.5702, + "step": 1051 + }, + { + "epoch": 0.3983339644074214, + "grad_norm": 9.948872566223145, + "learning_rate": 0.00013276514132332521, + "loss": 3.3578, + "step": 1052 + }, + { + "epoch": 0.3987126088602802, + "grad_norm": 11.182106018066406, + "learning_rate": 0.00013265187200661976, + "loss": 3.9353, + "step": 1053 + }, + { + "epoch": 0.39909125331313894, + "grad_norm": 12.669611930847168, + "learning_rate": 0.00013253855577901732, + "loss": 3.9309, + "step": 1054 + }, + { + "epoch": 0.39946989776599773, + "grad_norm": 12.40208625793457, + "learning_rate": 0.0001324251928033192, + "loss": 3.6691, + "step": 1055 + }, + { + "epoch": 0.3998485422188565, + "grad_norm": 10.716998100280762, + "learning_rate": 0.00013231178324239377, + "loss": 3.3575, + "step": 1056 + }, + { + "epoch": 0.40022718667171525, + "grad_norm": 12.901732444763184, + "learning_rate": 0.00013219832725917645, + "loss": 3.5777, + "step": 1057 + }, + { + "epoch": 0.40060583112457404, + "grad_norm": 11.288579940795898, + "learning_rate": 0.00013208482501666924, + "loss": 2.7736, + "step": 1058 + }, + { + "epoch": 0.4009844755774328, + "grad_norm": 11.336037635803223, + "learning_rate": 0.00013197127667794066, + "loss": 3.0309, + "step": 1059 + }, + { + "epoch": 0.40136312003029156, + "grad_norm": 12.790970802307129, + "learning_rate": 0.00013185768240612543, + "loss": 2.9778, + "step": 1060 + }, + { + "epoch": 0.4017417644831503, + "grad_norm": 10.891714096069336, + "learning_rate": 0.0001317440423644243, + "loss": 2.77, + "step": 1061 + }, + { + "epoch": 0.4021204089360091, + "grad_norm": 14.804855346679688, + "learning_rate": 0.00013163035671610374, + "loss": 2.9571, + "step": 1062 + }, + { + "epoch": 0.4024990533888679, + "grad_norm": 16.438711166381836, + "learning_rate": 0.00013151662562449576, + "loss": 3.4882, + "step": 1063 + }, + { + "epoch": 0.4028776978417266, + "grad_norm": 13.646224975585938, + "learning_rate": 0.00013140284925299762, + "loss": 3.3764, + "step": 1064 + }, + { + "epoch": 0.4032563422945854, + "grad_norm": 13.510947227478027, + "learning_rate": 0.00013128902776507172, + "loss": 2.5878, + "step": 1065 + }, + { + "epoch": 0.40363498674744414, + "grad_norm": 14.393485069274902, + "learning_rate": 0.00013117516132424517, + "loss": 3.1052, + "step": 1066 + }, + { + "epoch": 0.4040136312003029, + "grad_norm": 13.308830261230469, + "learning_rate": 0.00013106125009410978, + "loss": 2.3341, + "step": 1067 + }, + { + "epoch": 0.40439227565316166, + "grad_norm": 14.394597053527832, + "learning_rate": 0.0001309472942383216, + "loss": 2.546, + "step": 1068 + }, + { + "epoch": 0.40477092010602045, + "grad_norm": 15.71528434753418, + "learning_rate": 0.0001308332939206009, + "loss": 2.3768, + "step": 1069 + }, + { + "epoch": 0.40514956455887924, + "grad_norm": 14.074331283569336, + "learning_rate": 0.0001307192493047317, + "loss": 2.0017, + "step": 1070 + }, + { + "epoch": 0.405528209011738, + "grad_norm": 14.615304946899414, + "learning_rate": 0.00013060516055456175, + "loss": 1.9632, + "step": 1071 + }, + { + "epoch": 0.40590685346459676, + "grad_norm": 15.81937313079834, + "learning_rate": 0.00013049102783400221, + "loss": 1.5349, + "step": 1072 + }, + { + "epoch": 0.4062854979174555, + "grad_norm": 18.53114891052246, + "learning_rate": 0.00013037685130702742, + "loss": 2.08, + "step": 1073 + }, + { + "epoch": 0.4066641423703143, + "grad_norm": 18.639833450317383, + "learning_rate": 0.0001302626311376746, + "loss": 1.4834, + "step": 1074 + }, + { + "epoch": 0.407042786823173, + "grad_norm": 24.953411102294922, + "learning_rate": 0.00013014836749004367, + "loss": 1.6101, + "step": 1075 + }, + { + "epoch": 0.4074214312760318, + "grad_norm": 7.899003505706787, + "learning_rate": 0.00013003406052829706, + "loss": 3.2613, + "step": 1076 + }, + { + "epoch": 0.4078000757288906, + "grad_norm": 9.678568840026855, + "learning_rate": 0.0001299197104166595, + "loss": 3.4739, + "step": 1077 + }, + { + "epoch": 0.40817872018174933, + "grad_norm": 11.191865921020508, + "learning_rate": 0.0001298053173194175, + "loss": 3.6228, + "step": 1078 + }, + { + "epoch": 0.4085573646346081, + "grad_norm": 11.87701416015625, + "learning_rate": 0.00012969088140091955, + "loss": 3.041, + "step": 1079 + }, + { + "epoch": 0.40893600908746686, + "grad_norm": 12.016412734985352, + "learning_rate": 0.00012957640282557553, + "loss": 3.7958, + "step": 1080 + }, + { + "epoch": 0.40931465354032565, + "grad_norm": 12.6616792678833, + "learning_rate": 0.00012946188175785666, + "loss": 3.2154, + "step": 1081 + }, + { + "epoch": 0.4096932979931844, + "grad_norm": 12.319831848144531, + "learning_rate": 0.00012934731836229514, + "loss": 3.8766, + "step": 1082 + }, + { + "epoch": 0.41007194244604317, + "grad_norm": 12.028902053833008, + "learning_rate": 0.0001292327128034841, + "loss": 2.853, + "step": 1083 + }, + { + "epoch": 0.4104505868989019, + "grad_norm": 11.936314582824707, + "learning_rate": 0.00012911806524607713, + "loss": 3.7024, + "step": 1084 + }, + { + "epoch": 0.4108292313517607, + "grad_norm": 12.414243698120117, + "learning_rate": 0.00012900337585478825, + "loss": 3.3653, + "step": 1085 + }, + { + "epoch": 0.4112078758046195, + "grad_norm": 13.156413078308105, + "learning_rate": 0.0001288886447943915, + "loss": 3.0347, + "step": 1086 + }, + { + "epoch": 0.4115865202574782, + "grad_norm": 12.574990272521973, + "learning_rate": 0.00012877387222972087, + "loss": 2.6169, + "step": 1087 + }, + { + "epoch": 0.411965164710337, + "grad_norm": 17.557424545288086, + "learning_rate": 0.00012865905832566989, + "loss": 3.3377, + "step": 1088 + }, + { + "epoch": 0.41234380916319574, + "grad_norm": 12.320211410522461, + "learning_rate": 0.0001285442032471916, + "loss": 2.6103, + "step": 1089 + }, + { + "epoch": 0.41272245361605453, + "grad_norm": 13.786900520324707, + "learning_rate": 0.00012842930715929802, + "loss": 3.1307, + "step": 1090 + }, + { + "epoch": 0.41310109806891326, + "grad_norm": 16.15777587890625, + "learning_rate": 0.0001283143702270603, + "loss": 3.149, + "step": 1091 + }, + { + "epoch": 0.41347974252177205, + "grad_norm": 15.2261323928833, + "learning_rate": 0.00012819939261560806, + "loss": 1.8673, + "step": 1092 + }, + { + "epoch": 0.41385838697463084, + "grad_norm": 14.948053359985352, + "learning_rate": 0.00012808437449012957, + "loss": 2.8997, + "step": 1093 + }, + { + "epoch": 0.4142370314274896, + "grad_norm": 19.149866104125977, + "learning_rate": 0.00012796931601587113, + "loss": 2.147, + "step": 1094 + }, + { + "epoch": 0.41461567588034837, + "grad_norm": 17.016815185546875, + "learning_rate": 0.0001278542173581371, + "loss": 2.3585, + "step": 1095 + }, + { + "epoch": 0.4149943203332071, + "grad_norm": 16.220598220825195, + "learning_rate": 0.00012773907868228956, + "loss": 2.0916, + "step": 1096 + }, + { + "epoch": 0.4153729647860659, + "grad_norm": 17.185651779174805, + "learning_rate": 0.0001276239001537481, + "loss": 2.3026, + "step": 1097 + }, + { + "epoch": 0.4157516092389246, + "grad_norm": 15.259086608886719, + "learning_rate": 0.0001275086819379895, + "loss": 1.2933, + "step": 1098 + }, + { + "epoch": 0.4161302536917834, + "grad_norm": 24.932565689086914, + "learning_rate": 0.00012739342420054763, + "loss": 1.583, + "step": 1099 + }, + { + "epoch": 0.4165088981446422, + "grad_norm": 26.60433578491211, + "learning_rate": 0.0001272781271070131, + "loss": 2.5607, + "step": 1100 + }, + { + "epoch": 0.41688754259750094, + "grad_norm": 11.937226295471191, + "learning_rate": 0.00012716279082303312, + "loss": 4.2044, + "step": 1101 + }, + { + "epoch": 0.4172661870503597, + "grad_norm": 10.27784252166748, + "learning_rate": 0.0001270474155143111, + "loss": 4.1484, + "step": 1102 + }, + { + "epoch": 0.41764483150321846, + "grad_norm": 9.727765083312988, + "learning_rate": 0.00012693200134660662, + "loss": 3.0787, + "step": 1103 + }, + { + "epoch": 0.41802347595607725, + "grad_norm": 10.214356422424316, + "learning_rate": 0.00012681654848573502, + "loss": 2.7503, + "step": 1104 + }, + { + "epoch": 0.418402120408936, + "grad_norm": 10.071405410766602, + "learning_rate": 0.00012670105709756727, + "loss": 2.8888, + "step": 1105 + }, + { + "epoch": 0.4187807648617948, + "grad_norm": 13.693557739257812, + "learning_rate": 0.00012658552734802963, + "loss": 3.9183, + "step": 1106 + }, + { + "epoch": 0.41915940931465356, + "grad_norm": 10.267026901245117, + "learning_rate": 0.00012646995940310363, + "loss": 2.5214, + "step": 1107 + }, + { + "epoch": 0.4195380537675123, + "grad_norm": 12.434460639953613, + "learning_rate": 0.00012635435342882548, + "loss": 3.0185, + "step": 1108 + }, + { + "epoch": 0.4199166982203711, + "grad_norm": 11.598405838012695, + "learning_rate": 0.00012623870959128615, + "loss": 2.7773, + "step": 1109 + }, + { + "epoch": 0.4202953426732298, + "grad_norm": 14.913825035095215, + "learning_rate": 0.00012612302805663098, + "loss": 3.8533, + "step": 1110 + }, + { + "epoch": 0.4206739871260886, + "grad_norm": 11.085430145263672, + "learning_rate": 0.0001260073089910594, + "loss": 2.6134, + "step": 1111 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 12.33950138092041, + "learning_rate": 0.00012589155256082489, + "loss": 2.9382, + "step": 1112 + }, + { + "epoch": 0.42143127603180613, + "grad_norm": 13.180621147155762, + "learning_rate": 0.00012577575893223456, + "loss": 2.8428, + "step": 1113 + }, + { + "epoch": 0.4218099204846649, + "grad_norm": 15.379983901977539, + "learning_rate": 0.0001256599282716489, + "loss": 2.5916, + "step": 1114 + }, + { + "epoch": 0.42218856493752366, + "grad_norm": 14.148529052734375, + "learning_rate": 0.00012554406074548165, + "loss": 2.5504, + "step": 1115 + }, + { + "epoch": 0.42256720939038245, + "grad_norm": 15.524250030517578, + "learning_rate": 0.00012542815652019952, + "loss": 2.6872, + "step": 1116 + }, + { + "epoch": 0.4229458538432412, + "grad_norm": 13.896522521972656, + "learning_rate": 0.00012531221576232197, + "loss": 2.3257, + "step": 1117 + }, + { + "epoch": 0.42332449829609997, + "grad_norm": 13.984559059143066, + "learning_rate": 0.0001251962386384209, + "loss": 2.2887, + "step": 1118 + }, + { + "epoch": 0.4237031427489587, + "grad_norm": 14.945381164550781, + "learning_rate": 0.00012508022531512047, + "loss": 2.2639, + "step": 1119 + }, + { + "epoch": 0.4240817872018175, + "grad_norm": 14.590594291687012, + "learning_rate": 0.00012496417595909685, + "loss": 2.7817, + "step": 1120 + }, + { + "epoch": 0.4244604316546763, + "grad_norm": 22.159513473510742, + "learning_rate": 0.00012484809073707803, + "loss": 3.3067, + "step": 1121 + }, + { + "epoch": 0.424839076107535, + "grad_norm": 19.108047485351562, + "learning_rate": 0.00012473196981584338, + "loss": 2.6282, + "step": 1122 + }, + { + "epoch": 0.4252177205603938, + "grad_norm": 15.237470626831055, + "learning_rate": 0.00012461581336222378, + "loss": 1.917, + "step": 1123 + }, + { + "epoch": 0.42559636501325254, + "grad_norm": 13.147758483886719, + "learning_rate": 0.0001244996215431009, + "loss": 1.1269, + "step": 1124 + }, + { + "epoch": 0.42597500946611133, + "grad_norm": 30.5366268157959, + "learning_rate": 0.00012438339452540748, + "loss": 1.766, + "step": 1125 + }, + { + "epoch": 0.42635365391897007, + "grad_norm": 8.803793907165527, + "learning_rate": 0.00012426713247612665, + "loss": 3.8758, + "step": 1126 + }, + { + "epoch": 0.42673229837182886, + "grad_norm": 10.560848236083984, + "learning_rate": 0.00012415083556229192, + "loss": 3.5995, + "step": 1127 + }, + { + "epoch": 0.42711094282468764, + "grad_norm": 11.299087524414062, + "learning_rate": 0.00012403450395098695, + "loss": 4.2221, + "step": 1128 + }, + { + "epoch": 0.4274895872775464, + "grad_norm": 11.33618450164795, + "learning_rate": 0.00012391813780934514, + "loss": 4.1682, + "step": 1129 + }, + { + "epoch": 0.42786823173040517, + "grad_norm": 10.318195343017578, + "learning_rate": 0.00012380173730454957, + "loss": 3.3889, + "step": 1130 + }, + { + "epoch": 0.4282468761832639, + "grad_norm": 11.54907512664795, + "learning_rate": 0.00012368530260383268, + "loss": 2.8639, + "step": 1131 + }, + { + "epoch": 0.4286255206361227, + "grad_norm": 11.327589988708496, + "learning_rate": 0.00012356883387447601, + "loss": 2.3551, + "step": 1132 + }, + { + "epoch": 0.4290041650889814, + "grad_norm": 12.675344467163086, + "learning_rate": 0.00012345233128381006, + "loss": 3.7048, + "step": 1133 + }, + { + "epoch": 0.4293828095418402, + "grad_norm": 10.90146255493164, + "learning_rate": 0.00012333579499921392, + "loss": 3.0984, + "step": 1134 + }, + { + "epoch": 0.429761453994699, + "grad_norm": 13.599529266357422, + "learning_rate": 0.00012321922518811508, + "loss": 2.9593, + "step": 1135 + }, + { + "epoch": 0.43014009844755774, + "grad_norm": 12.997097969055176, + "learning_rate": 0.00012310262201798924, + "loss": 3.048, + "step": 1136 + }, + { + "epoch": 0.43051874290041653, + "grad_norm": 13.863821029663086, + "learning_rate": 0.00012298598565636, + "loss": 2.9528, + "step": 1137 + }, + { + "epoch": 0.43089738735327526, + "grad_norm": 14.177045822143555, + "learning_rate": 0.00012286931627079862, + "loss": 2.5402, + "step": 1138 + }, + { + "epoch": 0.43127603180613405, + "grad_norm": 14.45673942565918, + "learning_rate": 0.00012275261402892388, + "loss": 2.1941, + "step": 1139 + }, + { + "epoch": 0.4316546762589928, + "grad_norm": 16.615707397460938, + "learning_rate": 0.0001226358790984017, + "loss": 2.7464, + "step": 1140 + }, + { + "epoch": 0.4320333207118516, + "grad_norm": 13.864429473876953, + "learning_rate": 0.000122519111646945, + "loss": 2.384, + "step": 1141 + }, + { + "epoch": 0.4324119651647103, + "grad_norm": 15.059038162231445, + "learning_rate": 0.00012240231184231336, + "loss": 1.735, + "step": 1142 + }, + { + "epoch": 0.4327906096175691, + "grad_norm": 15.821595191955566, + "learning_rate": 0.00012228547985231297, + "loss": 2.953, + "step": 1143 + }, + { + "epoch": 0.4331692540704279, + "grad_norm": 13.79995346069336, + "learning_rate": 0.00012216861584479608, + "loss": 2.3279, + "step": 1144 + }, + { + "epoch": 0.4335478985232866, + "grad_norm": 11.45645523071289, + "learning_rate": 0.00012205171998766114, + "loss": 1.7425, + "step": 1145 + }, + { + "epoch": 0.4339265429761454, + "grad_norm": 15.549623489379883, + "learning_rate": 0.00012193479244885217, + "loss": 2.452, + "step": 1146 + }, + { + "epoch": 0.43430518742900415, + "grad_norm": 14.682928085327148, + "learning_rate": 0.00012181783339635888, + "loss": 2.1395, + "step": 1147 + }, + { + "epoch": 0.43468383188186294, + "grad_norm": 19.542850494384766, + "learning_rate": 0.00012170084299821609, + "loss": 2.4162, + "step": 1148 + }, + { + "epoch": 0.43506247633472167, + "grad_norm": 15.998048782348633, + "learning_rate": 0.00012158382142250379, + "loss": 1.5397, + "step": 1149 + }, + { + "epoch": 0.43544112078758046, + "grad_norm": 37.20795822143555, + "learning_rate": 0.00012146676883734671, + "loss": 3.4346, + "step": 1150 + }, + { + "epoch": 0.43581976524043925, + "grad_norm": 8.654630661010742, + "learning_rate": 0.00012134968541091405, + "loss": 4.2973, + "step": 1151 + }, + { + "epoch": 0.436198409693298, + "grad_norm": 9.75950813293457, + "learning_rate": 0.0001212325713114195, + "loss": 3.3641, + "step": 1152 + }, + { + "epoch": 0.4365770541461568, + "grad_norm": 9.88634204864502, + "learning_rate": 0.00012111542670712066, + "loss": 3.6815, + "step": 1153 + }, + { + "epoch": 0.4369556985990155, + "grad_norm": 12.256867408752441, + "learning_rate": 0.00012099825176631902, + "loss": 3.2275, + "step": 1154 + }, + { + "epoch": 0.4373343430518743, + "grad_norm": 12.367258071899414, + "learning_rate": 0.00012088104665735964, + "loss": 2.9504, + "step": 1155 + }, + { + "epoch": 0.43771298750473303, + "grad_norm": 13.042316436767578, + "learning_rate": 0.00012076381154863095, + "loss": 3.0564, + "step": 1156 + }, + { + "epoch": 0.4380916319575918, + "grad_norm": 11.0169677734375, + "learning_rate": 0.00012064654660856445, + "loss": 3.4256, + "step": 1157 + }, + { + "epoch": 0.4384702764104506, + "grad_norm": 11.372369766235352, + "learning_rate": 0.0001205292520056345, + "loss": 3.5504, + "step": 1158 + }, + { + "epoch": 0.43884892086330934, + "grad_norm": 10.504295349121094, + "learning_rate": 0.00012041192790835811, + "loss": 2.7411, + "step": 1159 + }, + { + "epoch": 0.43922756531616813, + "grad_norm": 13.477766036987305, + "learning_rate": 0.00012029457448529459, + "loss": 2.9257, + "step": 1160 + }, + { + "epoch": 0.43960620976902687, + "grad_norm": 12.110424041748047, + "learning_rate": 0.00012017719190504551, + "loss": 2.8799, + "step": 1161 + }, + { + "epoch": 0.43998485422188566, + "grad_norm": 13.188323020935059, + "learning_rate": 0.00012005978033625416, + "loss": 2.5087, + "step": 1162 + }, + { + "epoch": 0.4403634986747444, + "grad_norm": 11.588294982910156, + "learning_rate": 0.00011994233994760567, + "loss": 2.5272, + "step": 1163 + }, + { + "epoch": 0.4407421431276032, + "grad_norm": 15.151694297790527, + "learning_rate": 0.00011982487090782638, + "loss": 2.7985, + "step": 1164 + }, + { + "epoch": 0.44112078758046197, + "grad_norm": 14.004260063171387, + "learning_rate": 0.00011970737338568394, + "loss": 2.7696, + "step": 1165 + }, + { + "epoch": 0.4414994320333207, + "grad_norm": 14.581443786621094, + "learning_rate": 0.00011958984754998685, + "loss": 2.2614, + "step": 1166 + }, + { + "epoch": 0.4418780764861795, + "grad_norm": 12.546298027038574, + "learning_rate": 0.00011947229356958434, + "loss": 2.3896, + "step": 1167 + }, + { + "epoch": 0.44225672093903823, + "grad_norm": 14.990707397460938, + "learning_rate": 0.000119354711613366, + "loss": 3.1594, + "step": 1168 + }, + { + "epoch": 0.442635365391897, + "grad_norm": 14.658981323242188, + "learning_rate": 0.00011923710185026169, + "loss": 2.4297, + "step": 1169 + }, + { + "epoch": 0.44301400984475575, + "grad_norm": 13.724644660949707, + "learning_rate": 0.00011911946444924116, + "loss": 1.5228, + "step": 1170 + }, + { + "epoch": 0.44339265429761454, + "grad_norm": 19.209369659423828, + "learning_rate": 0.0001190017995793139, + "loss": 3.4329, + "step": 1171 + }, + { + "epoch": 0.44377129875047333, + "grad_norm": 21.529495239257812, + "learning_rate": 0.00011888410740952887, + "loss": 2.5655, + "step": 1172 + }, + { + "epoch": 0.44414994320333206, + "grad_norm": 24.351722717285156, + "learning_rate": 0.00011876638810897422, + "loss": 2.6329, + "step": 1173 + }, + { + "epoch": 0.44452858765619085, + "grad_norm": 15.183594703674316, + "learning_rate": 0.00011864864184677711, + "loss": 0.8859, + "step": 1174 + }, + { + "epoch": 0.4449072321090496, + "grad_norm": 13.775147438049316, + "learning_rate": 0.00011853086879210342, + "loss": 1.3488, + "step": 1175 + }, + { + "epoch": 0.4452858765619084, + "grad_norm": 8.975238800048828, + "learning_rate": 0.00011841306911415753, + "loss": 3.21, + "step": 1176 + }, + { + "epoch": 0.4456645210147671, + "grad_norm": 11.082070350646973, + "learning_rate": 0.00011829524298218207, + "loss": 4.19, + "step": 1177 + }, + { + "epoch": 0.4460431654676259, + "grad_norm": 10.536282539367676, + "learning_rate": 0.00011817739056545762, + "loss": 3.5267, + "step": 1178 + }, + { + "epoch": 0.4464218099204847, + "grad_norm": 10.50727367401123, + "learning_rate": 0.00011805951203330266, + "loss": 3.3532, + "step": 1179 + }, + { + "epoch": 0.4468004543733434, + "grad_norm": 10.488901138305664, + "learning_rate": 0.00011794160755507304, + "loss": 2.9757, + "step": 1180 + }, + { + "epoch": 0.4471790988262022, + "grad_norm": 12.007133483886719, + "learning_rate": 0.000117823677300162, + "loss": 3.0183, + "step": 1181 + }, + { + "epoch": 0.44755774327906095, + "grad_norm": 12.38204574584961, + "learning_rate": 0.00011770572143799971, + "loss": 3.0908, + "step": 1182 + }, + { + "epoch": 0.44793638773191974, + "grad_norm": 12.608494758605957, + "learning_rate": 0.00011758774013805325, + "loss": 3.0191, + "step": 1183 + }, + { + "epoch": 0.4483150321847785, + "grad_norm": 10.949199676513672, + "learning_rate": 0.00011746973356982614, + "loss": 2.5306, + "step": 1184 + }, + { + "epoch": 0.44869367663763726, + "grad_norm": 12.805669784545898, + "learning_rate": 0.00011735170190285825, + "loss": 3.2759, + "step": 1185 + }, + { + "epoch": 0.44907232109049605, + "grad_norm": 12.965691566467285, + "learning_rate": 0.00011723364530672549, + "loss": 3.0626, + "step": 1186 + }, + { + "epoch": 0.4494509655433548, + "grad_norm": 11.967156410217285, + "learning_rate": 0.00011711556395103964, + "loss": 2.4325, + "step": 1187 + }, + { + "epoch": 0.4498296099962136, + "grad_norm": 13.925737380981445, + "learning_rate": 0.00011699745800544798, + "loss": 2.8316, + "step": 1188 + }, + { + "epoch": 0.4502082544490723, + "grad_norm": 13.926861763000488, + "learning_rate": 0.00011687932763963319, + "loss": 3.4606, + "step": 1189 + }, + { + "epoch": 0.4505868989019311, + "grad_norm": 13.918458938598633, + "learning_rate": 0.00011676117302331291, + "loss": 2.5946, + "step": 1190 + }, + { + "epoch": 0.45096554335478983, + "grad_norm": 16.527910232543945, + "learning_rate": 0.00011664299432623979, + "loss": 2.2876, + "step": 1191 + }, + { + "epoch": 0.4513441878076486, + "grad_norm": 14.137311935424805, + "learning_rate": 0.00011652479171820097, + "loss": 2.9587, + "step": 1192 + }, + { + "epoch": 0.45172283226050736, + "grad_norm": 17.192485809326172, + "learning_rate": 0.00011640656536901796, + "loss": 1.5583, + "step": 1193 + }, + { + "epoch": 0.45210147671336615, + "grad_norm": 14.512371063232422, + "learning_rate": 0.00011628831544854635, + "loss": 2.3428, + "step": 1194 + }, + { + "epoch": 0.45248012116622494, + "grad_norm": 16.016895294189453, + "learning_rate": 0.00011617004212667566, + "loss": 2.4906, + "step": 1195 + }, + { + "epoch": 0.45285876561908367, + "grad_norm": 13.380924224853516, + "learning_rate": 0.000116051745573329, + "loss": 1.8266, + "step": 1196 + }, + { + "epoch": 0.45323741007194246, + "grad_norm": 12.72845458984375, + "learning_rate": 0.00011593342595846288, + "loss": 1.166, + "step": 1197 + }, + { + "epoch": 0.4536160545248012, + "grad_norm": 14.16887092590332, + "learning_rate": 0.00011581508345206689, + "loss": 1.3564, + "step": 1198 + }, + { + "epoch": 0.45399469897766, + "grad_norm": 28.907073974609375, + "learning_rate": 0.0001156967182241635, + "loss": 1.5071, + "step": 1199 + }, + { + "epoch": 0.4543733434305187, + "grad_norm": 17.37041473388672, + "learning_rate": 0.00011557833044480792, + "loss": 1.1685, + "step": 1200 + }, + { + "epoch": 0.4547519878833775, + "grad_norm": 10.693912506103516, + "learning_rate": 0.0001154599202840877, + "loss": 3.2915, + "step": 1201 + }, + { + "epoch": 0.4551306323362363, + "grad_norm": 13.119062423706055, + "learning_rate": 0.0001153414879121225, + "loss": 4.6147, + "step": 1202 + }, + { + "epoch": 0.45550927678909503, + "grad_norm": 11.448525428771973, + "learning_rate": 0.00011522303349906399, + "loss": 2.79, + "step": 1203 + }, + { + "epoch": 0.4558879212419538, + "grad_norm": 11.742964744567871, + "learning_rate": 0.00011510455721509537, + "loss": 3.2349, + "step": 1204 + }, + { + "epoch": 0.45626656569481255, + "grad_norm": 10.76633358001709, + "learning_rate": 0.00011498605923043145, + "loss": 3.0203, + "step": 1205 + }, + { + "epoch": 0.45664521014767134, + "grad_norm": 11.407468795776367, + "learning_rate": 0.00011486753971531801, + "loss": 3.6872, + "step": 1206 + }, + { + "epoch": 0.4570238546005301, + "grad_norm": 11.357184410095215, + "learning_rate": 0.00011474899884003196, + "loss": 2.7635, + "step": 1207 + }, + { + "epoch": 0.45740249905338887, + "grad_norm": 12.275900840759277, + "learning_rate": 0.00011463043677488073, + "loss": 2.7735, + "step": 1208 + }, + { + "epoch": 0.45778114350624766, + "grad_norm": 12.097725868225098, + "learning_rate": 0.0001145118536902023, + "loss": 2.7413, + "step": 1209 + }, + { + "epoch": 0.4581597879591064, + "grad_norm": 10.203941345214844, + "learning_rate": 0.0001143932497563648, + "loss": 2.3056, + "step": 1210 + }, + { + "epoch": 0.4585384324119652, + "grad_norm": 12.463147163391113, + "learning_rate": 0.00011427462514376637, + "loss": 3.1588, + "step": 1211 + }, + { + "epoch": 0.4589170768648239, + "grad_norm": 10.687355041503906, + "learning_rate": 0.00011415598002283474, + "loss": 1.4561, + "step": 1212 + }, + { + "epoch": 0.4592957213176827, + "grad_norm": 13.218606948852539, + "learning_rate": 0.00011403731456402727, + "loss": 2.156, + "step": 1213 + }, + { + "epoch": 0.45967436577054144, + "grad_norm": 15.726714134216309, + "learning_rate": 0.00011391862893783038, + "loss": 2.621, + "step": 1214 + }, + { + "epoch": 0.4600530102234002, + "grad_norm": 15.450735092163086, + "learning_rate": 0.0001137999233147596, + "loss": 2.6854, + "step": 1215 + }, + { + "epoch": 0.460431654676259, + "grad_norm": 14.271288871765137, + "learning_rate": 0.00011368119786535906, + "loss": 2.3983, + "step": 1216 + }, + { + "epoch": 0.46081029912911775, + "grad_norm": 16.259143829345703, + "learning_rate": 0.0001135624527602015, + "loss": 3.0149, + "step": 1217 + }, + { + "epoch": 0.46118894358197654, + "grad_norm": 21.305139541625977, + "learning_rate": 0.00011344368816988779, + "loss": 2.5145, + "step": 1218 + }, + { + "epoch": 0.4615675880348353, + "grad_norm": 18.001358032226562, + "learning_rate": 0.00011332490426504688, + "loss": 2.6175, + "step": 1219 + }, + { + "epoch": 0.46194623248769406, + "grad_norm": 15.817441940307617, + "learning_rate": 0.00011320610121633542, + "loss": 2.0215, + "step": 1220 + }, + { + "epoch": 0.4623248769405528, + "grad_norm": 18.465803146362305, + "learning_rate": 0.00011308727919443756, + "loss": 2.2702, + "step": 1221 + }, + { + "epoch": 0.4627035213934116, + "grad_norm": 15.902999877929688, + "learning_rate": 0.00011296843837006477, + "loss": 2.0862, + "step": 1222 + }, + { + "epoch": 0.4630821658462704, + "grad_norm": 18.18279457092285, + "learning_rate": 0.00011284957891395545, + "loss": 1.6971, + "step": 1223 + }, + { + "epoch": 0.4634608102991291, + "grad_norm": 20.656322479248047, + "learning_rate": 0.00011273070099687482, + "loss": 1.8615, + "step": 1224 + }, + { + "epoch": 0.4638394547519879, + "grad_norm": 37.89259719848633, + "learning_rate": 0.0001126118047896146, + "loss": 2.1817, + "step": 1225 + }, + { + "epoch": 0.46421809920484663, + "grad_norm": 8.783308982849121, + "learning_rate": 0.0001124928904629928, + "loss": 3.3508, + "step": 1226 + }, + { + "epoch": 0.4645967436577054, + "grad_norm": 12.971296310424805, + "learning_rate": 0.0001123739581878535, + "loss": 3.7262, + "step": 1227 + }, + { + "epoch": 0.46497538811056416, + "grad_norm": 10.869105339050293, + "learning_rate": 0.00011225500813506645, + "loss": 3.2334, + "step": 1228 + }, + { + "epoch": 0.46535403256342295, + "grad_norm": 11.33836555480957, + "learning_rate": 0.00011213604047552708, + "loss": 3.5119, + "step": 1229 + }, + { + "epoch": 0.46573267701628174, + "grad_norm": 10.899227142333984, + "learning_rate": 0.00011201705538015604, + "loss": 3.5351, + "step": 1230 + }, + { + "epoch": 0.46611132146914047, + "grad_norm": 11.528409957885742, + "learning_rate": 0.00011189805301989904, + "loss": 3.1705, + "step": 1231 + }, + { + "epoch": 0.46648996592199926, + "grad_norm": 10.381014823913574, + "learning_rate": 0.00011177903356572659, + "loss": 1.9777, + "step": 1232 + }, + { + "epoch": 0.466868610374858, + "grad_norm": 11.280335426330566, + "learning_rate": 0.00011165999718863379, + "loss": 2.5228, + "step": 1233 + }, + { + "epoch": 0.4672472548277168, + "grad_norm": 14.46865177154541, + "learning_rate": 0.00011154094405963996, + "loss": 2.5568, + "step": 1234 + }, + { + "epoch": 0.4676258992805755, + "grad_norm": 13.52888011932373, + "learning_rate": 0.00011142187434978866, + "loss": 3.2911, + "step": 1235 + }, + { + "epoch": 0.4680045437334343, + "grad_norm": 11.23714828491211, + "learning_rate": 0.00011130278823014709, + "loss": 2.2005, + "step": 1236 + }, + { + "epoch": 0.4683831881862931, + "grad_norm": 12.224804878234863, + "learning_rate": 0.00011118368587180614, + "loss": 2.2755, + "step": 1237 + }, + { + "epoch": 0.46876183263915183, + "grad_norm": 12.343790054321289, + "learning_rate": 0.00011106456744587996, + "loss": 2.8197, + "step": 1238 + }, + { + "epoch": 0.4691404770920106, + "grad_norm": 13.172083854675293, + "learning_rate": 0.0001109454331235059, + "loss": 2.586, + "step": 1239 + }, + { + "epoch": 0.46951912154486936, + "grad_norm": 12.991609573364258, + "learning_rate": 0.00011082628307584397, + "loss": 2.0318, + "step": 1240 + }, + { + "epoch": 0.46989776599772815, + "grad_norm": 13.485008239746094, + "learning_rate": 0.00011070711747407694, + "loss": 2.2734, + "step": 1241 + }, + { + "epoch": 0.4702764104505869, + "grad_norm": 19.911563873291016, + "learning_rate": 0.0001105879364894098, + "loss": 2.9116, + "step": 1242 + }, + { + "epoch": 0.47065505490344567, + "grad_norm": 14.824417114257812, + "learning_rate": 0.00011046874029306975, + "loss": 2.0742, + "step": 1243 + }, + { + "epoch": 0.4710336993563044, + "grad_norm": 17.6142578125, + "learning_rate": 0.00011034952905630576, + "loss": 2.6475, + "step": 1244 + }, + { + "epoch": 0.4714123438091632, + "grad_norm": 13.6873140335083, + "learning_rate": 0.00011023030295038846, + "loss": 2.1793, + "step": 1245 + }, + { + "epoch": 0.471790988262022, + "grad_norm": 15.636033058166504, + "learning_rate": 0.0001101110621466098, + "loss": 1.6981, + "step": 1246 + }, + { + "epoch": 0.4721696327148807, + "grad_norm": 17.11579132080078, + "learning_rate": 0.00010999180681628288, + "loss": 1.6256, + "step": 1247 + }, + { + "epoch": 0.4725482771677395, + "grad_norm": 20.186901092529297, + "learning_rate": 0.00010987253713074165, + "loss": 2.4091, + "step": 1248 + }, + { + "epoch": 0.47292692162059824, + "grad_norm": 15.602944374084473, + "learning_rate": 0.00010975325326134071, + "loss": 1.8002, + "step": 1249 + }, + { + "epoch": 0.47330556607345703, + "grad_norm": 23.223661422729492, + "learning_rate": 0.00010963395537945502, + "loss": 2.0938, + "step": 1250 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 10.464275360107422, + "learning_rate": 0.00010951464365647967, + "loss": 4.1863, + "step": 1251 + }, + { + "epoch": 0.47406285497917455, + "grad_norm": 10.853160858154297, + "learning_rate": 0.00010939531826382963, + "loss": 3.6832, + "step": 1252 + }, + { + "epoch": 0.47444149943203334, + "grad_norm": 12.23708724975586, + "learning_rate": 0.00010927597937293952, + "loss": 3.7507, + "step": 1253 + }, + { + "epoch": 0.4748201438848921, + "grad_norm": 12.157914161682129, + "learning_rate": 0.00010915662715526336, + "loss": 2.7929, + "step": 1254 + }, + { + "epoch": 0.47519878833775087, + "grad_norm": 14.618999481201172, + "learning_rate": 0.00010903726178227432, + "loss": 3.9901, + "step": 1255 + }, + { + "epoch": 0.4755774327906096, + "grad_norm": 11.460221290588379, + "learning_rate": 0.0001089178834254644, + "loss": 3.0165, + "step": 1256 + }, + { + "epoch": 0.4759560772434684, + "grad_norm": 11.18032455444336, + "learning_rate": 0.00010879849225634438, + "loss": 1.9716, + "step": 1257 + }, + { + "epoch": 0.4763347216963271, + "grad_norm": 11.510719299316406, + "learning_rate": 0.00010867908844644335, + "loss": 1.7553, + "step": 1258 + }, + { + "epoch": 0.4767133661491859, + "grad_norm": 10.82070255279541, + "learning_rate": 0.00010855967216730858, + "loss": 2.6911, + "step": 1259 + }, + { + "epoch": 0.4770920106020447, + "grad_norm": 13.530522346496582, + "learning_rate": 0.00010844024359050527, + "loss": 2.8952, + "step": 1260 + }, + { + "epoch": 0.47747065505490344, + "grad_norm": 10.605006217956543, + "learning_rate": 0.0001083208028876163, + "loss": 2.2925, + "step": 1261 + }, + { + "epoch": 0.4778492995077622, + "grad_norm": 12.863495826721191, + "learning_rate": 0.00010820135023024192, + "loss": 2.3114, + "step": 1262 + }, + { + "epoch": 0.47822794396062096, + "grad_norm": 16.1364688873291, + "learning_rate": 0.00010808188578999963, + "loss": 3.0539, + "step": 1263 + }, + { + "epoch": 0.47860658841347975, + "grad_norm": 12.478103637695312, + "learning_rate": 0.00010796240973852376, + "loss": 2.0726, + "step": 1264 + }, + { + "epoch": 0.4789852328663385, + "grad_norm": 13.423611640930176, + "learning_rate": 0.00010784292224746546, + "loss": 2.8393, + "step": 1265 + }, + { + "epoch": 0.4793638773191973, + "grad_norm": 14.295774459838867, + "learning_rate": 0.00010772342348849216, + "loss": 2.7654, + "step": 1266 + }, + { + "epoch": 0.47974252177205606, + "grad_norm": 15.330755233764648, + "learning_rate": 0.00010760391363328762, + "loss": 1.9282, + "step": 1267 + }, + { + "epoch": 0.4801211662249148, + "grad_norm": 19.332740783691406, + "learning_rate": 0.00010748439285355138, + "loss": 1.8195, + "step": 1268 + }, + { + "epoch": 0.4804998106777736, + "grad_norm": 16.43891143798828, + "learning_rate": 0.00010736486132099888, + "loss": 2.0598, + "step": 1269 + }, + { + "epoch": 0.4808784551306323, + "grad_norm": 12.18430233001709, + "learning_rate": 0.00010724531920736086, + "loss": 0.99, + "step": 1270 + }, + { + "epoch": 0.4812570995834911, + "grad_norm": 15.264763832092285, + "learning_rate": 0.00010712576668438323, + "loss": 1.8075, + "step": 1271 + }, + { + "epoch": 0.48163574403634984, + "grad_norm": 21.91768455505371, + "learning_rate": 0.00010700620392382701, + "loss": 2.6154, + "step": 1272 + }, + { + "epoch": 0.48201438848920863, + "grad_norm": 16.14089012145996, + "learning_rate": 0.00010688663109746784, + "loss": 1.5317, + "step": 1273 + }, + { + "epoch": 0.4823930329420674, + "grad_norm": 32.41860580444336, + "learning_rate": 0.00010676704837709576, + "loss": 1.8389, + "step": 1274 + }, + { + "epoch": 0.48277167739492616, + "grad_norm": 23.59526252746582, + "learning_rate": 0.00010664745593451516, + "loss": 1.1361, + "step": 1275 + }, + { + "epoch": 0.48315032184778495, + "grad_norm": 10.691109657287598, + "learning_rate": 0.00010652785394154427, + "loss": 3.2863, + "step": 1276 + }, + { + "epoch": 0.4835289663006437, + "grad_norm": 12.289042472839355, + "learning_rate": 0.00010640824257001516, + "loss": 4.0967, + "step": 1277 + }, + { + "epoch": 0.48390761075350247, + "grad_norm": 10.609498023986816, + "learning_rate": 0.00010628862199177327, + "loss": 2.915, + "step": 1278 + }, + { + "epoch": 0.4842862552063612, + "grad_norm": 13.162012100219727, + "learning_rate": 0.00010616899237867733, + "loss": 3.3384, + "step": 1279 + }, + { + "epoch": 0.48466489965922, + "grad_norm": 12.458738327026367, + "learning_rate": 0.000106049353902599, + "loss": 2.8678, + "step": 1280 + }, + { + "epoch": 0.4850435441120788, + "grad_norm": 12.008556365966797, + "learning_rate": 0.00010592970673542277, + "loss": 2.9199, + "step": 1281 + }, + { + "epoch": 0.4854221885649375, + "grad_norm": 10.63491153717041, + "learning_rate": 0.00010581005104904549, + "loss": 2.4852, + "step": 1282 + }, + { + "epoch": 0.4858008330177963, + "grad_norm": 10.767313957214355, + "learning_rate": 0.00010569038701537633, + "loss": 3.4581, + "step": 1283 + }, + { + "epoch": 0.48617947747065504, + "grad_norm": 12.88519287109375, + "learning_rate": 0.00010557071480633643, + "loss": 3.5616, + "step": 1284 + }, + { + "epoch": 0.48655812192351383, + "grad_norm": 12.250274658203125, + "learning_rate": 0.00010545103459385868, + "loss": 2.8215, + "step": 1285 + }, + { + "epoch": 0.48693676637637257, + "grad_norm": 12.7329683303833, + "learning_rate": 0.00010533134654988746, + "loss": 3.5789, + "step": 1286 + }, + { + "epoch": 0.48731541082923135, + "grad_norm": 12.87328815460205, + "learning_rate": 0.00010521165084637843, + "loss": 2.854, + "step": 1287 + }, + { + "epoch": 0.48769405528209014, + "grad_norm": 11.388814926147461, + "learning_rate": 0.00010509194765529821, + "loss": 2.0008, + "step": 1288 + }, + { + "epoch": 0.4880726997349489, + "grad_norm": 12.551799774169922, + "learning_rate": 0.00010497223714862424, + "loss": 2.4604, + "step": 1289 + }, + { + "epoch": 0.48845134418780767, + "grad_norm": 10.640294075012207, + "learning_rate": 0.00010485251949834436, + "loss": 1.6856, + "step": 1290 + }, + { + "epoch": 0.4888299886406664, + "grad_norm": 13.196956634521484, + "learning_rate": 0.0001047327948764568, + "loss": 2.2765, + "step": 1291 + }, + { + "epoch": 0.4892086330935252, + "grad_norm": 17.06575584411621, + "learning_rate": 0.00010461306345496972, + "loss": 2.8379, + "step": 1292 + }, + { + "epoch": 0.4895872775463839, + "grad_norm": 17.766448974609375, + "learning_rate": 0.00010449332540590114, + "loss": 1.885, + "step": 1293 + }, + { + "epoch": 0.4899659219992427, + "grad_norm": 12.942706108093262, + "learning_rate": 0.00010437358090127847, + "loss": 1.6903, + "step": 1294 + }, + { + "epoch": 0.49034456645210145, + "grad_norm": 16.92314910888672, + "learning_rate": 0.00010425383011313844, + "loss": 2.4453, + "step": 1295 + }, + { + "epoch": 0.49072321090496024, + "grad_norm": 17.436086654663086, + "learning_rate": 0.00010413407321352695, + "loss": 1.9032, + "step": 1296 + }, + { + "epoch": 0.49110185535781903, + "grad_norm": 18.94797706604004, + "learning_rate": 0.00010401431037449847, + "loss": 2.0191, + "step": 1297 + }, + { + "epoch": 0.49148049981067776, + "grad_norm": 15.610849380493164, + "learning_rate": 0.0001038945417681161, + "loss": 1.19, + "step": 1298 + }, + { + "epoch": 0.49185914426353655, + "grad_norm": 16.951602935791016, + "learning_rate": 0.00010377476756645128, + "loss": 1.4745, + "step": 1299 + }, + { + "epoch": 0.4922377887163953, + "grad_norm": 45.024925231933594, + "learning_rate": 0.00010365498794158337, + "loss": 3.5771, + "step": 1300 + }, + { + "epoch": 0.4926164331692541, + "grad_norm": 10.677453994750977, + "learning_rate": 0.00010353520306559963, + "loss": 3.5375, + "step": 1301 + }, + { + "epoch": 0.4929950776221128, + "grad_norm": 12.31181812286377, + "learning_rate": 0.00010341541311059478, + "loss": 3.5221, + "step": 1302 + }, + { + "epoch": 0.4933737220749716, + "grad_norm": 11.114928245544434, + "learning_rate": 0.00010329561824867089, + "loss": 2.9916, + "step": 1303 + }, + { + "epoch": 0.4937523665278304, + "grad_norm": 14.953704833984375, + "learning_rate": 0.00010317581865193704, + "loss": 2.4552, + "step": 1304 + }, + { + "epoch": 0.4941310109806891, + "grad_norm": 11.37937068939209, + "learning_rate": 0.00010305601449250919, + "loss": 2.9803, + "step": 1305 + }, + { + "epoch": 0.4945096554335479, + "grad_norm": 10.58877944946289, + "learning_rate": 0.00010293620594250974, + "loss": 2.0205, + "step": 1306 + }, + { + "epoch": 0.49488829988640665, + "grad_norm": 11.108804702758789, + "learning_rate": 0.00010281639317406752, + "loss": 2.4598, + "step": 1307 + }, + { + "epoch": 0.49526694433926544, + "grad_norm": 11.565478324890137, + "learning_rate": 0.00010269657635931731, + "loss": 1.909, + "step": 1308 + }, + { + "epoch": 0.49564558879212417, + "grad_norm": 12.14426326751709, + "learning_rate": 0.00010257675567039979, + "loss": 3.0371, + "step": 1309 + }, + { + "epoch": 0.49602423324498296, + "grad_norm": 10.85464096069336, + "learning_rate": 0.00010245693127946112, + "loss": 2.3844, + "step": 1310 + }, + { + "epoch": 0.49640287769784175, + "grad_norm": 11.257962226867676, + "learning_rate": 0.0001023371033586529, + "loss": 2.1763, + "step": 1311 + }, + { + "epoch": 0.4967815221507005, + "grad_norm": 10.673297882080078, + "learning_rate": 0.00010221727208013166, + "loss": 1.9263, + "step": 1312 + }, + { + "epoch": 0.4971601666035593, + "grad_norm": 14.040605545043945, + "learning_rate": 0.00010209743761605885, + "loss": 2.7561, + "step": 1313 + }, + { + "epoch": 0.497538811056418, + "grad_norm": 13.651562690734863, + "learning_rate": 0.00010197760013860047, + "loss": 2.1574, + "step": 1314 + }, + { + "epoch": 0.4979174555092768, + "grad_norm": 13.463566780090332, + "learning_rate": 0.00010185775981992689, + "loss": 2.1069, + "step": 1315 + }, + { + "epoch": 0.49829609996213553, + "grad_norm": 11.810751914978027, + "learning_rate": 0.00010173791683221244, + "loss": 1.9149, + "step": 1316 + }, + { + "epoch": 0.4986747444149943, + "grad_norm": 19.515695571899414, + "learning_rate": 0.00010161807134763543, + "loss": 3.2127, + "step": 1317 + }, + { + "epoch": 0.4990533888678531, + "grad_norm": 19.75203514099121, + "learning_rate": 0.00010149822353837768, + "loss": 1.3851, + "step": 1318 + }, + { + "epoch": 0.49943203332071184, + "grad_norm": 16.31900978088379, + "learning_rate": 0.00010137837357662432, + "loss": 2.0814, + "step": 1319 + }, + { + "epoch": 0.49981067777357063, + "grad_norm": 16.237138748168945, + "learning_rate": 0.00010125852163456368, + "loss": 2.0635, + "step": 1320 + }, + { + "epoch": 0.5001893222264294, + "grad_norm": 17.72742462158203, + "learning_rate": 0.00010113866788438684, + "loss": 1.084, + "step": 1321 + }, + { + "epoch": 0.5005679666792882, + "grad_norm": 15.087898254394531, + "learning_rate": 0.00010101881249828748, + "loss": 1.5248, + "step": 1322 + }, + { + "epoch": 0.5005679666792882, + "eval_loss": 0.27747318148612976, + "eval_runtime": 896.6071, + "eval_samples_per_second": 4.961, + "eval_steps_per_second": 1.24, + "step": 1322 } ], "logging_steps": 1, @@ -4669,7 +9304,7 @@ "attributes": {} } }, - "total_flos": 3.718153047487021e+18, + "total_flos": 7.436306094974042e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null