{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 661, "global_step": 2641, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003786444528587656, "grad_norm": 10.434814453125, "learning_rate": 1e-05, "loss": 5.8683, "step": 1 }, { "epoch": 0.0003786444528587656, "eval_loss": 0.8921470642089844, "eval_runtime": 901.0053, "eval_samples_per_second": 4.937, "eval_steps_per_second": 1.234, "step": 1 }, { "epoch": 0.0007572889057175312, "grad_norm": 11.7908353805542, "learning_rate": 2e-05, "loss": 6.5457, "step": 2 }, { "epoch": 0.001135933358576297, "grad_norm": 12.84145736694336, "learning_rate": 3e-05, "loss": 6.2297, "step": 3 }, { "epoch": 0.0015145778114350624, "grad_norm": 12.087944984436035, "learning_rate": 4e-05, "loss": 5.6302, "step": 4 }, { "epoch": 0.001893222264293828, "grad_norm": 11.6513090133667, "learning_rate": 5e-05, "loss": 5.7574, "step": 5 }, { "epoch": 0.002271866717152594, "grad_norm": 11.966485977172852, "learning_rate": 6e-05, "loss": 6.4193, "step": 6 }, { "epoch": 0.0026505111700113595, "grad_norm": 14.711249351501465, "learning_rate": 7e-05, "loss": 5.9372, "step": 7 }, { "epoch": 0.003029155622870125, "grad_norm": 14.61629581451416, "learning_rate": 8e-05, "loss": 5.8455, "step": 8 }, { "epoch": 0.0034078000757288905, "grad_norm": 16.477096557617188, "learning_rate": 9e-05, "loss": 6.3195, "step": 9 }, { "epoch": 0.003786444528587656, "grad_norm": 17.702224731445312, "learning_rate": 0.0001, "loss": 6.2648, "step": 10 }, { "epoch": 0.0041650889814464215, "grad_norm": 18.931350708007812, "learning_rate": 0.00011000000000000002, "loss": 5.413, "step": 11 }, { "epoch": 0.004543733434305188, "grad_norm": 18.159408569335938, "learning_rate": 0.00012, "loss": 5.7632, "step": 12 }, { "epoch": 0.004922377887163953, "grad_norm": 15.117518424987793, "learning_rate": 0.00013000000000000002, "loss": 5.1309, "step": 13 }, { "epoch": 0.005301022340022719, "grad_norm": 13.553899765014648, "learning_rate": 0.00014, "loss": 5.5902, "step": 14 }, { "epoch": 0.005679666792881484, "grad_norm": 14.156839370727539, "learning_rate": 0.00015000000000000001, "loss": 5.8427, "step": 15 }, { "epoch": 0.00605831124574025, "grad_norm": 14.818575859069824, "learning_rate": 0.00016, "loss": 5.5943, "step": 16 }, { "epoch": 0.006436955698599016, "grad_norm": 14.836395263671875, "learning_rate": 0.00017, "loss": 5.6252, "step": 17 }, { "epoch": 0.006815600151457781, "grad_norm": 16.203628540039062, "learning_rate": 0.00018, "loss": 5.1571, "step": 18 }, { "epoch": 0.007194244604316547, "grad_norm": 16.7708797454834, "learning_rate": 0.00019, "loss": 5.1433, "step": 19 }, { "epoch": 0.007572889057175312, "grad_norm": 25.078933715820312, "learning_rate": 0.0002, "loss": 5.6986, "step": 20 }, { "epoch": 0.007951533510034078, "grad_norm": 29.939088821411133, "learning_rate": 0.00019999992816507284, "loss": 5.1664, "step": 21 }, { "epoch": 0.008330177962892843, "grad_norm": 18.42095947265625, "learning_rate": 0.0001999997126603945, "loss": 5.1388, "step": 22 }, { "epoch": 0.00870882241575161, "grad_norm": 25.811283111572266, "learning_rate": 0.00019999935348627464, "loss": 5.3499, "step": 23 }, { "epoch": 0.009087466868610375, "grad_norm": 33.81028747558594, "learning_rate": 0.00019999885064322928, "loss": 5.0324, "step": 24 }, { "epoch": 0.00946611132146914, "grad_norm": 40.47433853149414, "learning_rate": 0.00019999820413198083, "loss": 4.3909, "step": 25 }, { "epoch": 0.009844755774327906, "grad_norm": 19.015504837036133, "learning_rate": 0.00019999741395345812, "loss": 6.7567, "step": 26 }, { "epoch": 0.010223400227186671, "grad_norm": 12.075000762939453, "learning_rate": 0.00019999648010879647, "loss": 7.2374, "step": 27 }, { "epoch": 0.010602044680045438, "grad_norm": 12.168185234069824, "learning_rate": 0.00019999540259933745, "loss": 5.8711, "step": 28 }, { "epoch": 0.010980689132904203, "grad_norm": 9.592961311340332, "learning_rate": 0.00019999418142662917, "loss": 5.6626, "step": 29 }, { "epoch": 0.011359333585762969, "grad_norm": 8.446025848388672, "learning_rate": 0.00019999281659242608, "loss": 5.5054, "step": 30 }, { "epoch": 0.011737978038621734, "grad_norm": 8.979402542114258, "learning_rate": 0.000199991308098689, "loss": 5.268, "step": 31 }, { "epoch": 0.0121166224914805, "grad_norm": 11.317891120910645, "learning_rate": 0.00019998965594758523, "loss": 5.7779, "step": 32 }, { "epoch": 0.012495266944339266, "grad_norm": 9.691301345825195, "learning_rate": 0.00019998786014148838, "loss": 5.4439, "step": 33 }, { "epoch": 0.012873911397198031, "grad_norm": 9.59664249420166, "learning_rate": 0.0001999859206829785, "loss": 5.0315, "step": 34 }, { "epoch": 0.013252555850056797, "grad_norm": 12.453145980834961, "learning_rate": 0.000199983837574842, "loss": 5.7834, "step": 35 }, { "epoch": 0.013631200302915562, "grad_norm": 10.473176956176758, "learning_rate": 0.00019998161082007164, "loss": 5.4829, "step": 36 }, { "epoch": 0.014009844755774327, "grad_norm": 9.838224411010742, "learning_rate": 0.0001999792404218667, "loss": 5.0974, "step": 37 }, { "epoch": 0.014388489208633094, "grad_norm": 12.153512954711914, "learning_rate": 0.00019997672638363262, "loss": 5.4264, "step": 38 }, { "epoch": 0.01476713366149186, "grad_norm": 11.956207275390625, "learning_rate": 0.00019997406870898133, "loss": 5.5113, "step": 39 }, { "epoch": 0.015145778114350625, "grad_norm": 12.791664123535156, "learning_rate": 0.00019997126740173114, "loss": 5.2499, "step": 40 }, { "epoch": 0.01552442256720939, "grad_norm": 9.361381530761719, "learning_rate": 0.0001999683224659067, "loss": 4.8344, "step": 41 }, { "epoch": 0.015903067020068155, "grad_norm": 10.928776741027832, "learning_rate": 0.000199965233905739, "loss": 6.2701, "step": 42 }, { "epoch": 0.01628171147292692, "grad_norm": 12.90079402923584, "learning_rate": 0.00019996200172566527, "loss": 5.0639, "step": 43 }, { "epoch": 0.016660355925785686, "grad_norm": 13.444856643676758, "learning_rate": 0.0001999586259303293, "loss": 4.7613, "step": 44 }, { "epoch": 0.01703900037864445, "grad_norm": 13.767925262451172, "learning_rate": 0.00019995510652458105, "loss": 4.0273, "step": 45 }, { "epoch": 0.01741764483150322, "grad_norm": 15.626961708068848, "learning_rate": 0.00019995144351347678, "loss": 4.7746, "step": 46 }, { "epoch": 0.017796289284361985, "grad_norm": 26.331802368164062, "learning_rate": 0.00019994763690227925, "loss": 4.8851, "step": 47 }, { "epoch": 0.01817493373722075, "grad_norm": 22.59031867980957, "learning_rate": 0.0001999436866964573, "loss": 4.2097, "step": 48 }, { "epoch": 0.018553578190079516, "grad_norm": 17.976661682128906, "learning_rate": 0.00019993959290168627, "loss": 2.8742, "step": 49 }, { "epoch": 0.01893222264293828, "grad_norm": 41.0118408203125, "learning_rate": 0.00019993535552384766, "loss": 5.117, "step": 50 }, { "epoch": 0.019310867095797046, "grad_norm": 16.585399627685547, "learning_rate": 0.0001999309745690293, "loss": 6.8806, "step": 51 }, { "epoch": 0.01968951154865581, "grad_norm": 11.035297393798828, "learning_rate": 0.00019992645004352535, "loss": 6.3081, "step": 52 }, { "epoch": 0.020068156001514577, "grad_norm": 8.331199645996094, "learning_rate": 0.00019992178195383614, "loss": 5.6585, "step": 53 }, { "epoch": 0.020446800454373342, "grad_norm": 9.50080394744873, "learning_rate": 0.00019991697030666833, "loss": 6.0226, "step": 54 }, { "epoch": 0.020825444907232107, "grad_norm": 8.85689640045166, "learning_rate": 0.00019991201510893483, "loss": 6.2203, "step": 55 }, { "epoch": 0.021204089360090876, "grad_norm": 10.029090881347656, "learning_rate": 0.00019990691636775473, "loss": 6.0392, "step": 56 }, { "epoch": 0.02158273381294964, "grad_norm": 8.855071067810059, "learning_rate": 0.0001999016740904534, "loss": 5.3339, "step": 57 }, { "epoch": 0.021961378265808407, "grad_norm": 9.614381790161133, "learning_rate": 0.00019989628828456237, "loss": 4.4834, "step": 58 }, { "epoch": 0.022340022718667172, "grad_norm": 11.6102933883667, "learning_rate": 0.00019989075895781948, "loss": 5.65, "step": 59 }, { "epoch": 0.022718667171525937, "grad_norm": 10.178080558776855, "learning_rate": 0.00019988508611816868, "loss": 5.7066, "step": 60 }, { "epoch": 0.023097311624384703, "grad_norm": 10.321159362792969, "learning_rate": 0.00019987926977376014, "loss": 6.1788, "step": 61 }, { "epoch": 0.023475956077243468, "grad_norm": 9.083809852600098, "learning_rate": 0.00019987330993295014, "loss": 4.5461, "step": 62 }, { "epoch": 0.023854600530102233, "grad_norm": 10.712754249572754, "learning_rate": 0.00019986720660430124, "loss": 5.7984, "step": 63 }, { "epoch": 0.024233244982961, "grad_norm": 11.429099082946777, "learning_rate": 0.0001998609597965821, "loss": 5.7685, "step": 64 }, { "epoch": 0.024611889435819764, "grad_norm": 11.242938995361328, "learning_rate": 0.00019985456951876742, "loss": 4.7109, "step": 65 }, { "epoch": 0.024990533888678532, "grad_norm": 13.852066993713379, "learning_rate": 0.00019984803578003817, "loss": 5.1454, "step": 66 }, { "epoch": 0.025369178341537298, "grad_norm": 11.528448104858398, "learning_rate": 0.00019984135858978132, "loss": 4.6155, "step": 67 }, { "epoch": 0.025747822794396063, "grad_norm": 15.846125602722168, "learning_rate": 0.00019983453795759, "loss": 6.0121, "step": 68 }, { "epoch": 0.026126467247254828, "grad_norm": 14.896081924438477, "learning_rate": 0.00019982757389326342, "loss": 4.7377, "step": 69 }, { "epoch": 0.026505111700113593, "grad_norm": 19.171016693115234, "learning_rate": 0.0001998204664068068, "loss": 5.2492, "step": 70 }, { "epoch": 0.02688375615297236, "grad_norm": 17.842302322387695, "learning_rate": 0.0001998132155084315, "loss": 4.3207, "step": 71 }, { "epoch": 0.027262400605831124, "grad_norm": 15.214621543884277, "learning_rate": 0.00019980582120855483, "loss": 5.0446, "step": 72 }, { "epoch": 0.02764104505868989, "grad_norm": 18.836454391479492, "learning_rate": 0.0001997982835178002, "loss": 4.0261, "step": 73 }, { "epoch": 0.028019689511548655, "grad_norm": 37.965946197509766, "learning_rate": 0.00019979060244699698, "loss": 4.8663, "step": 74 }, { "epoch": 0.02839833396440742, "grad_norm": 56.08090591430664, "learning_rate": 0.00019978277800718054, "loss": 6.0144, "step": 75 }, { "epoch": 0.02877697841726619, "grad_norm": 10.698841094970703, "learning_rate": 0.0001997748102095923, "loss": 6.8784, "step": 76 }, { "epoch": 0.029155622870124954, "grad_norm": 9.394986152648926, "learning_rate": 0.00019976669906567954, "loss": 6.6554, "step": 77 }, { "epoch": 0.02953426732298372, "grad_norm": 7.796587944030762, "learning_rate": 0.00019975844458709557, "loss": 6.1919, "step": 78 }, { "epoch": 0.029912911775842484, "grad_norm": 9.376069068908691, "learning_rate": 0.0001997500467856995, "loss": 5.957, "step": 79 }, { "epoch": 0.03029155622870125, "grad_norm": 10.019186973571777, "learning_rate": 0.00019974150567355655, "loss": 6.4973, "step": 80 }, { "epoch": 0.030670200681560015, "grad_norm": 10.175936698913574, "learning_rate": 0.00019973282126293758, "loss": 6.0903, "step": 81 }, { "epoch": 0.03104884513441878, "grad_norm": 9.391570091247559, "learning_rate": 0.00019972399356631964, "loss": 5.2329, "step": 82 }, { "epoch": 0.03142748958727755, "grad_norm": 9.361041069030762, "learning_rate": 0.00019971502259638534, "loss": 6.0391, "step": 83 }, { "epoch": 0.03180613404013631, "grad_norm": 9.759671211242676, "learning_rate": 0.00019970590836602335, "loss": 6.0924, "step": 84 }, { "epoch": 0.03218477849299508, "grad_norm": 9.57455062866211, "learning_rate": 0.000199696650888328, "loss": 5.4275, "step": 85 }, { "epoch": 0.03256342294585384, "grad_norm": 9.738369941711426, "learning_rate": 0.00019968725017659953, "loss": 5.2149, "step": 86 }, { "epoch": 0.03294206739871261, "grad_norm": 10.45261287689209, "learning_rate": 0.00019967770624434387, "loss": 4.5745, "step": 87 }, { "epoch": 0.03332071185157137, "grad_norm": 11.853706359863281, "learning_rate": 0.00019966801910527288, "loss": 5.6621, "step": 88 }, { "epoch": 0.03369935630443014, "grad_norm": 9.945990562438965, "learning_rate": 0.000199658188773304, "loss": 4.975, "step": 89 }, { "epoch": 0.0340780007572889, "grad_norm": 9.144219398498535, "learning_rate": 0.00019964821526256043, "loss": 4.6526, "step": 90 }, { "epoch": 0.03445664521014767, "grad_norm": 10.96102237701416, "learning_rate": 0.00019963809858737115, "loss": 5.5929, "step": 91 }, { "epoch": 0.03483528966300644, "grad_norm": 12.377371788024902, "learning_rate": 0.0001996278387622707, "loss": 5.2634, "step": 92 }, { "epoch": 0.0352139341158652, "grad_norm": 12.394866943359375, "learning_rate": 0.00019961743580199946, "loss": 5.6475, "step": 93 }, { "epoch": 0.03559257856872397, "grad_norm": 15.493999481201172, "learning_rate": 0.00019960688972150327, "loss": 5.0573, "step": 94 }, { "epoch": 0.03597122302158273, "grad_norm": 14.89577865600586, "learning_rate": 0.00019959620053593366, "loss": 4.3286, "step": 95 }, { "epoch": 0.0363498674744415, "grad_norm": 17.289690017700195, "learning_rate": 0.00019958536826064784, "loss": 4.417, "step": 96 }, { "epoch": 0.03672851192730026, "grad_norm": 19.77994155883789, "learning_rate": 0.00019957439291120848, "loss": 5.0353, "step": 97 }, { "epoch": 0.03710715638015903, "grad_norm": 35.42997741699219, "learning_rate": 0.00019956327450338382, "loss": 4.8566, "step": 98 }, { "epoch": 0.03748580083301779, "grad_norm": 34.25278091430664, "learning_rate": 0.00019955201305314768, "loss": 5.0527, "step": 99 }, { "epoch": 0.03786444528587656, "grad_norm": 32.984031677246094, "learning_rate": 0.00019954060857667942, "loss": 4.1489, "step": 100 }, { "epoch": 0.03824308973873533, "grad_norm": 25.177045822143555, "learning_rate": 0.00019952906109036377, "loss": 7.8832, "step": 101 }, { "epoch": 0.03862173419159409, "grad_norm": 11.03814697265625, "learning_rate": 0.00019951737061079102, "loss": 6.4552, "step": 102 }, { "epoch": 0.03900037864445286, "grad_norm": 9.427952766418457, "learning_rate": 0.00019950553715475684, "loss": 6.2599, "step": 103 }, { "epoch": 0.03937902309731162, "grad_norm": 9.926714897155762, "learning_rate": 0.00019949356073926236, "loss": 5.2806, "step": 104 }, { "epoch": 0.03975766755017039, "grad_norm": 9.968575477600098, "learning_rate": 0.00019948144138151407, "loss": 5.6615, "step": 105 }, { "epoch": 0.040136312003029154, "grad_norm": 9.264948844909668, "learning_rate": 0.00019946917909892384, "loss": 5.1696, "step": 106 }, { "epoch": 0.04051495645588792, "grad_norm": 11.23089599609375, "learning_rate": 0.00019945677390910887, "loss": 5.7446, "step": 107 }, { "epoch": 0.040893600908746684, "grad_norm": 12.40099048614502, "learning_rate": 0.0001994442258298917, "loss": 5.274, "step": 108 }, { "epoch": 0.04127224536160545, "grad_norm": 10.000561714172363, "learning_rate": 0.00019943153487930005, "loss": 5.4166, "step": 109 }, { "epoch": 0.041650889814464215, "grad_norm": 9.609817504882812, "learning_rate": 0.00019941870107556713, "loss": 5.4101, "step": 110 }, { "epoch": 0.042029534267322984, "grad_norm": 11.121451377868652, "learning_rate": 0.00019940572443713115, "loss": 5.1717, "step": 111 }, { "epoch": 0.04240817872018175, "grad_norm": 9.745224952697754, "learning_rate": 0.0001993926049826356, "loss": 5.4372, "step": 112 }, { "epoch": 0.042786823173040514, "grad_norm": 10.347518920898438, "learning_rate": 0.00019937934273092932, "loss": 5.8101, "step": 113 }, { "epoch": 0.04316546762589928, "grad_norm": 12.633049011230469, "learning_rate": 0.00019936593770106603, "loss": 5.6172, "step": 114 }, { "epoch": 0.043544112078758045, "grad_norm": 11.7897310256958, "learning_rate": 0.00019935238991230473, "loss": 5.5913, "step": 115 }, { "epoch": 0.04392275653161681, "grad_norm": 11.529661178588867, "learning_rate": 0.0001993386993841096, "loss": 4.7866, "step": 116 }, { "epoch": 0.044301400984475575, "grad_norm": 9.104240417480469, "learning_rate": 0.00019932486613614972, "loss": 4.1991, "step": 117 }, { "epoch": 0.044680045437334344, "grad_norm": 12.118027687072754, "learning_rate": 0.00019931089018829934, "loss": 4.7862, "step": 118 }, { "epoch": 0.045058689890193106, "grad_norm": 12.452719688415527, "learning_rate": 0.00019929677156063766, "loss": 4.1519, "step": 119 }, { "epoch": 0.045437334343051874, "grad_norm": 15.220785140991211, "learning_rate": 0.00019928251027344888, "loss": 4.8224, "step": 120 }, { "epoch": 0.04581597879591064, "grad_norm": 19.72614097595215, "learning_rate": 0.0001992681063472222, "loss": 5.4116, "step": 121 }, { "epoch": 0.046194623248769405, "grad_norm": 15.22668170928955, "learning_rate": 0.00019925355980265176, "loss": 4.1883, "step": 122 }, { "epoch": 0.046573267701628174, "grad_norm": 22.6198673248291, "learning_rate": 0.00019923887066063643, "loss": 4.0129, "step": 123 }, { "epoch": 0.046951912154486936, "grad_norm": 38.672752380371094, "learning_rate": 0.0001992240389422802, "loss": 3.4216, "step": 124 }, { "epoch": 0.047330556607345704, "grad_norm": 32.17597198486328, "learning_rate": 0.00019920906466889174, "loss": 4.9508, "step": 125 }, { "epoch": 0.047709201060204466, "grad_norm": 14.039003372192383, "learning_rate": 0.00019919394786198453, "loss": 6.7088, "step": 126 }, { "epoch": 0.048087845513063235, "grad_norm": 9.275696754455566, "learning_rate": 0.00019917868854327692, "loss": 5.7713, "step": 127 }, { "epoch": 0.048466489965922, "grad_norm": 9.453801155090332, "learning_rate": 0.00019916328673469193, "loss": 5.5684, "step": 128 }, { "epoch": 0.048845134418780765, "grad_norm": 9.081092834472656, "learning_rate": 0.0001991477424583573, "loss": 6.0058, "step": 129 }, { "epoch": 0.04922377887163953, "grad_norm": 7.833642482757568, "learning_rate": 0.00019913205573660552, "loss": 5.4775, "step": 130 }, { "epoch": 0.049602423324498296, "grad_norm": 8.797674179077148, "learning_rate": 0.0001991162265919736, "loss": 6.0334, "step": 131 }, { "epoch": 0.049981067777357065, "grad_norm": 8.712818145751953, "learning_rate": 0.00019910025504720332, "loss": 5.0432, "step": 132 }, { "epoch": 0.050359712230215826, "grad_norm": 10.346916198730469, "learning_rate": 0.00019908414112524092, "loss": 5.2967, "step": 133 }, { "epoch": 0.050738356683074595, "grad_norm": 9.813155174255371, "learning_rate": 0.0001990678848492373, "loss": 5.2965, "step": 134 }, { "epoch": 0.05111700113593336, "grad_norm": 9.53530216217041, "learning_rate": 0.0001990514862425478, "loss": 5.3226, "step": 135 }, { "epoch": 0.051495645588792126, "grad_norm": 9.706903457641602, "learning_rate": 0.00019903494532873226, "loss": 5.1397, "step": 136 }, { "epoch": 0.05187429004165089, "grad_norm": 9.555363655090332, "learning_rate": 0.00019901826213155504, "loss": 4.7094, "step": 137 }, { "epoch": 0.052252934494509656, "grad_norm": 9.73580265045166, "learning_rate": 0.00019900143667498477, "loss": 4.8708, "step": 138 }, { "epoch": 0.05263157894736842, "grad_norm": 12.988117218017578, "learning_rate": 0.0001989844689831947, "loss": 5.4945, "step": 139 }, { "epoch": 0.05301022340022719, "grad_norm": 11.392786979675293, "learning_rate": 0.00019896735908056217, "loss": 4.9868, "step": 140 }, { "epoch": 0.053388867853085956, "grad_norm": 11.049524307250977, "learning_rate": 0.00019895010699166895, "loss": 5.6386, "step": 141 }, { "epoch": 0.05376751230594472, "grad_norm": 15.501945495605469, "learning_rate": 0.0001989327127413012, "loss": 4.9812, "step": 142 }, { "epoch": 0.054146156758803486, "grad_norm": 12.038402557373047, "learning_rate": 0.00019891517635444909, "loss": 4.5501, "step": 143 }, { "epoch": 0.05452480121166225, "grad_norm": 14.716442108154297, "learning_rate": 0.00019889749785630722, "loss": 5.4678, "step": 144 }, { "epoch": 0.05490344566452102, "grad_norm": 14.685711860656738, "learning_rate": 0.00019887967727227418, "loss": 4.0556, "step": 145 }, { "epoch": 0.05528209011737978, "grad_norm": 19.5123291015625, "learning_rate": 0.00019886171462795283, "loss": 4.4198, "step": 146 }, { "epoch": 0.05566073457023855, "grad_norm": 20.309396743774414, "learning_rate": 0.00019884360994915006, "loss": 5.0207, "step": 147 }, { "epoch": 0.05603937902309731, "grad_norm": 18.461915969848633, "learning_rate": 0.00019882536326187685, "loss": 4.3499, "step": 148 }, { "epoch": 0.05641802347595608, "grad_norm": 28.44086265563965, "learning_rate": 0.00019880697459234817, "loss": 3.1848, "step": 149 }, { "epoch": 0.05679666792881484, "grad_norm": 58.63621520996094, "learning_rate": 0.00019878844396698298, "loss": 5.8651, "step": 150 }, { "epoch": 0.05717531238167361, "grad_norm": 13.71030044555664, "learning_rate": 0.00019876977141240426, "loss": 6.241, "step": 151 }, { "epoch": 0.05755395683453238, "grad_norm": 10.938446044921875, "learning_rate": 0.00019875095695543875, "loss": 5.6771, "step": 152 }, { "epoch": 0.05793260128739114, "grad_norm": 10.96714973449707, "learning_rate": 0.00019873200062311725, "loss": 5.2314, "step": 153 }, { "epoch": 0.05831124574024991, "grad_norm": 7.606492519378662, "learning_rate": 0.00019871290244267425, "loss": 5.7249, "step": 154 }, { "epoch": 0.05868989019310867, "grad_norm": 8.784101486206055, "learning_rate": 0.00019869366244154804, "loss": 4.9694, "step": 155 }, { "epoch": 0.05906853464596744, "grad_norm": 11.263976097106934, "learning_rate": 0.00019867428064738077, "loss": 5.5875, "step": 156 }, { "epoch": 0.0594471790988262, "grad_norm": 9.343450546264648, "learning_rate": 0.0001986547570880182, "loss": 6.221, "step": 157 }, { "epoch": 0.05982582355168497, "grad_norm": 9.731782913208008, "learning_rate": 0.00019863509179150984, "loss": 6.2793, "step": 158 }, { "epoch": 0.06020446800454373, "grad_norm": 10.603925704956055, "learning_rate": 0.00019861528478610873, "loss": 5.226, "step": 159 }, { "epoch": 0.0605831124574025, "grad_norm": 8.70156192779541, "learning_rate": 0.00019859533610027162, "loss": 5.7189, "step": 160 }, { "epoch": 0.06096175691026127, "grad_norm": 11.445813179016113, "learning_rate": 0.00019857524576265872, "loss": 5.772, "step": 161 }, { "epoch": 0.06134040136312003, "grad_norm": 9.810565948486328, "learning_rate": 0.0001985550138021338, "loss": 5.2862, "step": 162 }, { "epoch": 0.0617190458159788, "grad_norm": 9.25048828125, "learning_rate": 0.00019853464024776406, "loss": 4.5556, "step": 163 }, { "epoch": 0.06209769026883756, "grad_norm": 9.317825317382812, "learning_rate": 0.00019851412512882023, "loss": 5.3411, "step": 164 }, { "epoch": 0.06247633472169633, "grad_norm": 11.587838172912598, "learning_rate": 0.0001984934684747763, "loss": 5.739, "step": 165 }, { "epoch": 0.0628549791745551, "grad_norm": 12.702302932739258, "learning_rate": 0.00019847267031530965, "loss": 4.9714, "step": 166 }, { "epoch": 0.06323362362741386, "grad_norm": 14.249470710754395, "learning_rate": 0.00019845173068030097, "loss": 4.5709, "step": 167 }, { "epoch": 0.06361226808027262, "grad_norm": 14.03624439239502, "learning_rate": 0.00019843064959983422, "loss": 4.487, "step": 168 }, { "epoch": 0.06399091253313138, "grad_norm": 12.434381484985352, "learning_rate": 0.00019840942710419658, "loss": 4.296, "step": 169 }, { "epoch": 0.06436955698599016, "grad_norm": 15.566539764404297, "learning_rate": 0.00019838806322387828, "loss": 4.1964, "step": 170 }, { "epoch": 0.06474820143884892, "grad_norm": 12.214476585388184, "learning_rate": 0.0001983665579895729, "loss": 3.9038, "step": 171 }, { "epoch": 0.06512684589170768, "grad_norm": 16.57448387145996, "learning_rate": 0.0001983449114321769, "loss": 4.5337, "step": 172 }, { "epoch": 0.06550549034456646, "grad_norm": 18.46966552734375, "learning_rate": 0.0001983231235827899, "loss": 4.4065, "step": 173 }, { "epoch": 0.06588413479742522, "grad_norm": 24.38216781616211, "learning_rate": 0.00019830119447271442, "loss": 2.9628, "step": 174 }, { "epoch": 0.06626277925028398, "grad_norm": 23.528114318847656, "learning_rate": 0.00019827912413345603, "loss": 3.3465, "step": 175 }, { "epoch": 0.06664142370314274, "grad_norm": 10.8902587890625, "learning_rate": 0.00019825691259672313, "loss": 6.1824, "step": 176 }, { "epoch": 0.06702006815600152, "grad_norm": 10.006114959716797, "learning_rate": 0.000198234559894427, "loss": 5.6762, "step": 177 }, { "epoch": 0.06739871260886028, "grad_norm": 9.918802261352539, "learning_rate": 0.00019821206605868174, "loss": 5.5663, "step": 178 }, { "epoch": 0.06777735706171904, "grad_norm": 8.497994422912598, "learning_rate": 0.00019818943112180423, "loss": 5.8234, "step": 179 }, { "epoch": 0.0681560015145778, "grad_norm": 9.795154571533203, "learning_rate": 0.00019816665511631403, "loss": 5.3252, "step": 180 }, { "epoch": 0.06853464596743658, "grad_norm": 11.03689193725586, "learning_rate": 0.0001981437380749334, "loss": 6.0853, "step": 181 }, { "epoch": 0.06891329042029534, "grad_norm": 9.795255661010742, "learning_rate": 0.00019812068003058721, "loss": 5.0421, "step": 182 }, { "epoch": 0.0692919348731541, "grad_norm": 10.504554748535156, "learning_rate": 0.00019809748101640295, "loss": 5.2529, "step": 183 }, { "epoch": 0.06967057932601288, "grad_norm": 9.605035781860352, "learning_rate": 0.0001980741410657106, "loss": 5.0307, "step": 184 }, { "epoch": 0.07004922377887164, "grad_norm": 10.972379684448242, "learning_rate": 0.00019805066021204258, "loss": 5.13, "step": 185 }, { "epoch": 0.0704278682317304, "grad_norm": 10.463446617126465, "learning_rate": 0.00019802703848913384, "loss": 4.6112, "step": 186 }, { "epoch": 0.07080651268458917, "grad_norm": 11.090287208557129, "learning_rate": 0.0001980032759309217, "loss": 5.1514, "step": 187 }, { "epoch": 0.07118515713744794, "grad_norm": 11.830557823181152, "learning_rate": 0.00019797937257154573, "loss": 5.6081, "step": 188 }, { "epoch": 0.0715638015903067, "grad_norm": 10.591259002685547, "learning_rate": 0.00019795532844534792, "loss": 4.729, "step": 189 }, { "epoch": 0.07194244604316546, "grad_norm": 10.960124015808105, "learning_rate": 0.00019793114358687236, "loss": 4.6169, "step": 190 }, { "epoch": 0.07232109049602424, "grad_norm": 11.412923812866211, "learning_rate": 0.00019790681803086548, "loss": 4.6233, "step": 191 }, { "epoch": 0.072699734948883, "grad_norm": 11.271405220031738, "learning_rate": 0.00019788235181227574, "loss": 4.5077, "step": 192 }, { "epoch": 0.07307837940174176, "grad_norm": 11.715191841125488, "learning_rate": 0.00019785774496625366, "loss": 4.5266, "step": 193 }, { "epoch": 0.07345702385460053, "grad_norm": 14.390351295471191, "learning_rate": 0.00019783299752815196, "loss": 5.2515, "step": 194 }, { "epoch": 0.0738356683074593, "grad_norm": 11.806098937988281, "learning_rate": 0.00019780810953352518, "loss": 3.7989, "step": 195 }, { "epoch": 0.07421431276031806, "grad_norm": 13.76820182800293, "learning_rate": 0.00019778308101812988, "loss": 3.8526, "step": 196 }, { "epoch": 0.07459295721317682, "grad_norm": 16.82176399230957, "learning_rate": 0.0001977579120179245, "loss": 4.024, "step": 197 }, { "epoch": 0.07497160166603559, "grad_norm": 27.145509719848633, "learning_rate": 0.0001977326025690693, "loss": 4.9692, "step": 198 }, { "epoch": 0.07535024611889436, "grad_norm": 17.646276473999023, "learning_rate": 0.00019770715270792634, "loss": 2.3489, "step": 199 }, { "epoch": 0.07572889057175312, "grad_norm": 56.70100021362305, "learning_rate": 0.00019768156247105937, "loss": 3.9912, "step": 200 }, { "epoch": 0.07610753502461189, "grad_norm": 9.720958709716797, "learning_rate": 0.0001976558318952339, "loss": 6.4383, "step": 201 }, { "epoch": 0.07648617947747066, "grad_norm": 10.620763778686523, "learning_rate": 0.00019762996101741696, "loss": 7.2243, "step": 202 }, { "epoch": 0.07686482393032942, "grad_norm": 8.535510063171387, "learning_rate": 0.00019760394987477722, "loss": 5.143, "step": 203 }, { "epoch": 0.07724346838318819, "grad_norm": 9.765297889709473, "learning_rate": 0.00019757779850468484, "loss": 5.1503, "step": 204 }, { "epoch": 0.07762211283604695, "grad_norm": 9.695032119750977, "learning_rate": 0.00019755150694471146, "loss": 6.0913, "step": 205 }, { "epoch": 0.07800075728890572, "grad_norm": 8.690482139587402, "learning_rate": 0.00019752507523263015, "loss": 5.1187, "step": 206 }, { "epoch": 0.07837940174176448, "grad_norm": 8.73969554901123, "learning_rate": 0.0001974985034064153, "loss": 5.0969, "step": 207 }, { "epoch": 0.07875804619462325, "grad_norm": 9.594573020935059, "learning_rate": 0.0001974717915042426, "loss": 4.5138, "step": 208 }, { "epoch": 0.07913669064748201, "grad_norm": 10.64561653137207, "learning_rate": 0.00019744493956448897, "loss": 6.0733, "step": 209 }, { "epoch": 0.07951533510034078, "grad_norm": 10.740833282470703, "learning_rate": 0.00019741794762573266, "loss": 4.8035, "step": 210 }, { "epoch": 0.07989397955319955, "grad_norm": 11.910998344421387, "learning_rate": 0.0001973908157267528, "loss": 4.9078, "step": 211 }, { "epoch": 0.08027262400605831, "grad_norm": 10.62619400024414, "learning_rate": 0.00019736354390652988, "loss": 4.7867, "step": 212 }, { "epoch": 0.08065126845891708, "grad_norm": 12.65106201171875, "learning_rate": 0.00019733613220424524, "loss": 4.7825, "step": 213 }, { "epoch": 0.08102991291177584, "grad_norm": 10.566100120544434, "learning_rate": 0.0001973085806592812, "loss": 4.808, "step": 214 }, { "epoch": 0.0814085573646346, "grad_norm": 14.50074291229248, "learning_rate": 0.00019728088931122105, "loss": 5.8235, "step": 215 }, { "epoch": 0.08178720181749337, "grad_norm": 11.592037200927734, "learning_rate": 0.00019725305819984893, "loss": 4.4702, "step": 216 }, { "epoch": 0.08216584627035214, "grad_norm": 11.895447731018066, "learning_rate": 0.00019722508736514974, "loss": 4.6943, "step": 217 }, { "epoch": 0.0825444907232109, "grad_norm": 13.651464462280273, "learning_rate": 0.00019719697684730914, "loss": 4.6499, "step": 218 }, { "epoch": 0.08292313517606967, "grad_norm": 14.508546829223633, "learning_rate": 0.00019716872668671344, "loss": 4.4073, "step": 219 }, { "epoch": 0.08330177962892843, "grad_norm": 12.980317115783691, "learning_rate": 0.00019714033692394965, "loss": 4.389, "step": 220 }, { "epoch": 0.0836804240817872, "grad_norm": 17.773025512695312, "learning_rate": 0.00019711180759980529, "loss": 3.8144, "step": 221 }, { "epoch": 0.08405906853464597, "grad_norm": 16.80002784729004, "learning_rate": 0.00019708313875526834, "loss": 4.2691, "step": 222 }, { "epoch": 0.08443771298750473, "grad_norm": 16.477399826049805, "learning_rate": 0.00019705433043152736, "loss": 3.5554, "step": 223 }, { "epoch": 0.0848163574403635, "grad_norm": 26.655338287353516, "learning_rate": 0.00019702538266997124, "loss": 3.5923, "step": 224 }, { "epoch": 0.08519500189322227, "grad_norm": 28.16261863708496, "learning_rate": 0.0001969962955121891, "loss": 3.247, "step": 225 }, { "epoch": 0.08557364634608103, "grad_norm": 10.550549507141113, "learning_rate": 0.00019696706899997052, "loss": 6.8701, "step": 226 }, { "epoch": 0.08595229079893979, "grad_norm": 8.839621543884277, "learning_rate": 0.0001969377031753051, "loss": 5.5944, "step": 227 }, { "epoch": 0.08633093525179857, "grad_norm": 8.947502136230469, "learning_rate": 0.00019690819808038272, "loss": 5.7622, "step": 228 }, { "epoch": 0.08670957970465733, "grad_norm": 9.0411376953125, "learning_rate": 0.00019687855375759327, "loss": 4.649, "step": 229 }, { "epoch": 0.08708822415751609, "grad_norm": 8.791936874389648, "learning_rate": 0.0001968487702495268, "loss": 5.1811, "step": 230 }, { "epoch": 0.08746686861037486, "grad_norm": 9.556682586669922, "learning_rate": 0.00019681884759897308, "loss": 5.9121, "step": 231 }, { "epoch": 0.08784551306323363, "grad_norm": 10.459571838378906, "learning_rate": 0.00019678878584892208, "loss": 5.6164, "step": 232 }, { "epoch": 0.08822415751609239, "grad_norm": 11.417348861694336, "learning_rate": 0.00019675858504256344, "loss": 4.8234, "step": 233 }, { "epoch": 0.08860280196895115, "grad_norm": 8.151843070983887, "learning_rate": 0.00019672824522328655, "loss": 4.9158, "step": 234 }, { "epoch": 0.08898144642180993, "grad_norm": 10.862298965454102, "learning_rate": 0.00019669776643468066, "loss": 5.3044, "step": 235 }, { "epoch": 0.08936009087466869, "grad_norm": 11.182097434997559, "learning_rate": 0.00019666714872053454, "loss": 5.8071, "step": 236 }, { "epoch": 0.08973873532752745, "grad_norm": 10.572265625, "learning_rate": 0.00019663639212483665, "loss": 4.8596, "step": 237 }, { "epoch": 0.09011737978038621, "grad_norm": 9.833358764648438, "learning_rate": 0.00019660549669177495, "loss": 4.7743, "step": 238 }, { "epoch": 0.09049602423324499, "grad_norm": 10.828356742858887, "learning_rate": 0.00019657446246573685, "loss": 5.5859, "step": 239 }, { "epoch": 0.09087466868610375, "grad_norm": 9.41773796081543, "learning_rate": 0.00019654328949130916, "loss": 4.6524, "step": 240 }, { "epoch": 0.09125331313896251, "grad_norm": 10.468668937683105, "learning_rate": 0.0001965119778132781, "loss": 3.8107, "step": 241 }, { "epoch": 0.09163195759182129, "grad_norm": 13.526198387145996, "learning_rate": 0.00019648052747662907, "loss": 4.77, "step": 242 }, { "epoch": 0.09201060204468005, "grad_norm": 14.636107444763184, "learning_rate": 0.0001964489385265467, "loss": 5.4413, "step": 243 }, { "epoch": 0.09238924649753881, "grad_norm": 13.989765167236328, "learning_rate": 0.00019641721100841487, "loss": 4.2013, "step": 244 }, { "epoch": 0.09276789095039757, "grad_norm": 17.5861873626709, "learning_rate": 0.0001963853449678164, "loss": 3.5504, "step": 245 }, { "epoch": 0.09314653540325635, "grad_norm": 16.095678329467773, "learning_rate": 0.00019635334045053318, "loss": 4.5176, "step": 246 }, { "epoch": 0.09352517985611511, "grad_norm": 19.897119522094727, "learning_rate": 0.00019632119750254606, "loss": 4.4155, "step": 247 }, { "epoch": 0.09390382430897387, "grad_norm": 21.077598571777344, "learning_rate": 0.0001962889161700348, "loss": 4.0351, "step": 248 }, { "epoch": 0.09428246876183263, "grad_norm": 28.60135841369629, "learning_rate": 0.00019625649649937792, "loss": 4.0419, "step": 249 }, { "epoch": 0.09466111321469141, "grad_norm": 48.00245666503906, "learning_rate": 0.00019622393853715265, "loss": 4.1211, "step": 250 }, { "epoch": 0.09503975766755017, "grad_norm": 12.560955047607422, "learning_rate": 0.00019619124233013512, "loss": 6.4683, "step": 251 }, { "epoch": 0.09541840212040893, "grad_norm": 11.030938148498535, "learning_rate": 0.00019615840792529978, "loss": 6.5968, "step": 252 }, { "epoch": 0.09579704657326771, "grad_norm": 8.982504844665527, "learning_rate": 0.00019612543536981982, "loss": 5.2818, "step": 253 }, { "epoch": 0.09617569102612647, "grad_norm": 7.904403209686279, "learning_rate": 0.00019609232471106688, "loss": 5.9209, "step": 254 }, { "epoch": 0.09655433547898523, "grad_norm": 9.775873184204102, "learning_rate": 0.00019605907599661097, "loss": 5.3489, "step": 255 }, { "epoch": 0.096932979931844, "grad_norm": 8.759675979614258, "learning_rate": 0.0001960256892742205, "loss": 4.6493, "step": 256 }, { "epoch": 0.09731162438470277, "grad_norm": 11.45134449005127, "learning_rate": 0.0001959921645918621, "loss": 4.4718, "step": 257 }, { "epoch": 0.09769026883756153, "grad_norm": 10.730209350585938, "learning_rate": 0.0001959585019977006, "loss": 5.1965, "step": 258 }, { "epoch": 0.09806891329042029, "grad_norm": 10.355484962463379, "learning_rate": 0.0001959247015400991, "loss": 4.0992, "step": 259 }, { "epoch": 0.09844755774327905, "grad_norm": 11.505188941955566, "learning_rate": 0.00019589076326761854, "loss": 5.201, "step": 260 }, { "epoch": 0.09882620219613783, "grad_norm": 13.447498321533203, "learning_rate": 0.00019585668722901808, "loss": 6.0457, "step": 261 }, { "epoch": 0.09920484664899659, "grad_norm": 10.8496732711792, "learning_rate": 0.00019582247347325473, "loss": 4.9541, "step": 262 }, { "epoch": 0.09958349110185535, "grad_norm": 10.681647300720215, "learning_rate": 0.00019578812204948328, "loss": 4.8772, "step": 263 }, { "epoch": 0.09996213555471413, "grad_norm": 11.055303573608398, "learning_rate": 0.00019575363300705637, "loss": 4.7443, "step": 264 }, { "epoch": 0.10034078000757289, "grad_norm": 11.89393424987793, "learning_rate": 0.00019571900639552437, "loss": 4.4099, "step": 265 }, { "epoch": 0.10071942446043165, "grad_norm": 11.34334659576416, "learning_rate": 0.0001956842422646353, "loss": 4.9191, "step": 266 }, { "epoch": 0.10109806891329041, "grad_norm": 9.913498878479004, "learning_rate": 0.00019564934066433476, "loss": 3.6103, "step": 267 }, { "epoch": 0.10147671336614919, "grad_norm": 12.267012596130371, "learning_rate": 0.00019561430164476574, "loss": 4.3453, "step": 268 }, { "epoch": 0.10185535781900795, "grad_norm": 10.8731050491333, "learning_rate": 0.00019557912525626885, "loss": 3.7477, "step": 269 }, { "epoch": 0.10223400227186671, "grad_norm": 15.239813804626465, "learning_rate": 0.0001955438115493819, "loss": 4.467, "step": 270 }, { "epoch": 0.10261264672472548, "grad_norm": 17.561635971069336, "learning_rate": 0.00019550836057484003, "loss": 3.9279, "step": 271 }, { "epoch": 0.10299129117758425, "grad_norm": 14.543050765991211, "learning_rate": 0.00019547277238357564, "loss": 3.559, "step": 272 }, { "epoch": 0.10336993563044301, "grad_norm": 14.89653205871582, "learning_rate": 0.0001954370470267182, "loss": 2.3824, "step": 273 }, { "epoch": 0.10374858008330178, "grad_norm": 23.81206703186035, "learning_rate": 0.00019540118455559435, "loss": 3.3979, "step": 274 }, { "epoch": 0.10412722453616055, "grad_norm": 26.980783462524414, "learning_rate": 0.00019536518502172756, "loss": 4.0859, "step": 275 }, { "epoch": 0.10450586898901931, "grad_norm": 13.720477104187012, "learning_rate": 0.00019532904847683832, "loss": 6.7626, "step": 276 }, { "epoch": 0.10488451344187807, "grad_norm": 11.518758773803711, "learning_rate": 0.00019529277497284402, "loss": 5.9555, "step": 277 }, { "epoch": 0.10526315789473684, "grad_norm": 10.95704174041748, "learning_rate": 0.00019525636456185866, "loss": 6.4592, "step": 278 }, { "epoch": 0.10564180234759561, "grad_norm": 10.139583587646484, "learning_rate": 0.0001952198172961931, "loss": 4.9463, "step": 279 }, { "epoch": 0.10602044680045437, "grad_norm": 8.685033798217773, "learning_rate": 0.00019518313322835468, "loss": 4.8444, "step": 280 }, { "epoch": 0.10639909125331314, "grad_norm": 8.559419631958008, "learning_rate": 0.00019514631241104744, "loss": 5.7126, "step": 281 }, { "epoch": 0.10677773570617191, "grad_norm": 9.680383682250977, "learning_rate": 0.0001951093548971717, "loss": 5.0644, "step": 282 }, { "epoch": 0.10715638015903067, "grad_norm": 10.89194393157959, "learning_rate": 0.00019507226073982428, "loss": 4.7752, "step": 283 }, { "epoch": 0.10753502461188943, "grad_norm": 10.400115013122559, "learning_rate": 0.00019503502999229834, "loss": 4.8316, "step": 284 }, { "epoch": 0.1079136690647482, "grad_norm": 11.178241729736328, "learning_rate": 0.0001949976627080832, "loss": 5.167, "step": 285 }, { "epoch": 0.10829231351760697, "grad_norm": 11.463688850402832, "learning_rate": 0.00019496015894086445, "loss": 5.0064, "step": 286 }, { "epoch": 0.10867095797046573, "grad_norm": 12.74399471282959, "learning_rate": 0.00019492251874452364, "loss": 5.8686, "step": 287 }, { "epoch": 0.1090496024233245, "grad_norm": 10.723747253417969, "learning_rate": 0.0001948847421731384, "loss": 4.0739, "step": 288 }, { "epoch": 0.10942824687618326, "grad_norm": 10.196253776550293, "learning_rate": 0.00019484682928098225, "loss": 5.0363, "step": 289 }, { "epoch": 0.10980689132904203, "grad_norm": 14.19273853302002, "learning_rate": 0.00019480878012252464, "loss": 4.8781, "step": 290 }, { "epoch": 0.1101855357819008, "grad_norm": 12.241954803466797, "learning_rate": 0.00019477059475243072, "loss": 5.3741, "step": 291 }, { "epoch": 0.11056418023475956, "grad_norm": 8.766860008239746, "learning_rate": 0.00019473227322556132, "loss": 3.1036, "step": 292 }, { "epoch": 0.11094282468761833, "grad_norm": 13.060121536254883, "learning_rate": 0.00019469381559697295, "loss": 4.9652, "step": 293 }, { "epoch": 0.1113214691404771, "grad_norm": 12.351059913635254, "learning_rate": 0.00019465522192191762, "loss": 4.3625, "step": 294 }, { "epoch": 0.11170011359333586, "grad_norm": 13.481490135192871, "learning_rate": 0.00019461649225584285, "loss": 3.8786, "step": 295 }, { "epoch": 0.11207875804619462, "grad_norm": 14.277658462524414, "learning_rate": 0.00019457762665439144, "loss": 3.3642, "step": 296 }, { "epoch": 0.1124574024990534, "grad_norm": 15.829534530639648, "learning_rate": 0.00019453862517340156, "loss": 3.4833, "step": 297 }, { "epoch": 0.11283604695191216, "grad_norm": 18.78076934814453, "learning_rate": 0.00019449948786890656, "loss": 4.5821, "step": 298 }, { "epoch": 0.11321469140477092, "grad_norm": 19.851030349731445, "learning_rate": 0.000194460214797135, "loss": 3.411, "step": 299 }, { "epoch": 0.11359333585762968, "grad_norm": 38.37712478637695, "learning_rate": 0.00019442080601451042, "loss": 4.709, "step": 300 }, { "epoch": 0.11397198031048845, "grad_norm": 12.775144577026367, "learning_rate": 0.00019438126157765137, "loss": 6.2073, "step": 301 }, { "epoch": 0.11435062476334722, "grad_norm": 12.176517486572266, "learning_rate": 0.00019434158154337127, "loss": 5.3956, "step": 302 }, { "epoch": 0.11472926921620598, "grad_norm": 8.858514785766602, "learning_rate": 0.00019430176596867832, "loss": 4.7154, "step": 303 }, { "epoch": 0.11510791366906475, "grad_norm": 10.479050636291504, "learning_rate": 0.0001942618149107756, "loss": 4.7401, "step": 304 }, { "epoch": 0.11548655812192352, "grad_norm": 8.652934074401855, "learning_rate": 0.00019422172842706065, "loss": 5.2193, "step": 305 }, { "epoch": 0.11586520257478228, "grad_norm": 9.481331825256348, "learning_rate": 0.00019418150657512574, "loss": 4.7876, "step": 306 }, { "epoch": 0.11624384702764104, "grad_norm": 11.53617000579834, "learning_rate": 0.00019414114941275745, "loss": 4.7514, "step": 307 }, { "epoch": 0.11662249148049982, "grad_norm": 9.821549415588379, "learning_rate": 0.00019410065699793693, "loss": 4.9545, "step": 308 }, { "epoch": 0.11700113593335858, "grad_norm": 10.33404541015625, "learning_rate": 0.00019406002938883958, "loss": 4.8945, "step": 309 }, { "epoch": 0.11737978038621734, "grad_norm": 12.466033935546875, "learning_rate": 0.000194019266643835, "loss": 5.2858, "step": 310 }, { "epoch": 0.1177584248390761, "grad_norm": 11.954521179199219, "learning_rate": 0.00019397836882148695, "loss": 5.3408, "step": 311 }, { "epoch": 0.11813706929193488, "grad_norm": 12.428413391113281, "learning_rate": 0.00019393733598055328, "loss": 5.1357, "step": 312 }, { "epoch": 0.11851571374479364, "grad_norm": 11.977699279785156, "learning_rate": 0.00019389616817998582, "loss": 4.8637, "step": 313 }, { "epoch": 0.1188943581976524, "grad_norm": 12.119193077087402, "learning_rate": 0.00019385486547893028, "loss": 4.5933, "step": 314 }, { "epoch": 0.11927300265051118, "grad_norm": 11.337264060974121, "learning_rate": 0.00019381342793672624, "loss": 4.2781, "step": 315 }, { "epoch": 0.11965164710336994, "grad_norm": 10.534137725830078, "learning_rate": 0.00019377185561290689, "loss": 4.0069, "step": 316 }, { "epoch": 0.1200302915562287, "grad_norm": 11.264338493347168, "learning_rate": 0.00019373014856719918, "loss": 4.7428, "step": 317 }, { "epoch": 0.12040893600908746, "grad_norm": 11.423133850097656, "learning_rate": 0.0001936883068595235, "loss": 4.2359, "step": 318 }, { "epoch": 0.12078758046194624, "grad_norm": 14.29877758026123, "learning_rate": 0.00019364633054999383, "loss": 4.4332, "step": 319 }, { "epoch": 0.121166224914805, "grad_norm": 17.930879592895508, "learning_rate": 0.00019360421969891745, "loss": 5.1328, "step": 320 }, { "epoch": 0.12154486936766376, "grad_norm": 14.796630859375, "learning_rate": 0.00019356197436679496, "loss": 2.7526, "step": 321 }, { "epoch": 0.12192351382052254, "grad_norm": 21.729623794555664, "learning_rate": 0.00019351959461432015, "loss": 3.8724, "step": 322 }, { "epoch": 0.1223021582733813, "grad_norm": 28.3909912109375, "learning_rate": 0.00019347708050237997, "loss": 4.0754, "step": 323 }, { "epoch": 0.12268080272624006, "grad_norm": 23.190147399902344, "learning_rate": 0.00019343443209205436, "loss": 3.1569, "step": 324 }, { "epoch": 0.12305944717909882, "grad_norm": 26.0140380859375, "learning_rate": 0.00019339164944461628, "loss": 4.3824, "step": 325 }, { "epoch": 0.1234380916319576, "grad_norm": 9.705833435058594, "learning_rate": 0.00019334873262153143, "loss": 5.5853, "step": 326 }, { "epoch": 0.12381673608481636, "grad_norm": 9.483610153198242, "learning_rate": 0.0001933056816844584, "loss": 6.2104, "step": 327 }, { "epoch": 0.12419538053767512, "grad_norm": 9.369464874267578, "learning_rate": 0.00019326249669524836, "loss": 5.1741, "step": 328 }, { "epoch": 0.12457402499053388, "grad_norm": 8.173629760742188, "learning_rate": 0.0001932191777159452, "loss": 5.8212, "step": 329 }, { "epoch": 0.12495266944339266, "grad_norm": 8.955101013183594, "learning_rate": 0.00019317572480878514, "loss": 4.8004, "step": 330 }, { "epoch": 0.1253313138962514, "grad_norm": 9.041908264160156, "learning_rate": 0.00019313213803619697, "loss": 5.6992, "step": 331 }, { "epoch": 0.1257099583491102, "grad_norm": 10.43190860748291, "learning_rate": 0.00019308841746080172, "loss": 4.9818, "step": 332 }, { "epoch": 0.12608860280196896, "grad_norm": 10.404027938842773, "learning_rate": 0.0001930445631454127, "loss": 5.1713, "step": 333 }, { "epoch": 0.12646724725482772, "grad_norm": 11.429192543029785, "learning_rate": 0.0001930005751530353, "loss": 4.9948, "step": 334 }, { "epoch": 0.12684589170768648, "grad_norm": 11.61748218536377, "learning_rate": 0.00019295645354686704, "loss": 4.8311, "step": 335 }, { "epoch": 0.12722453616054524, "grad_norm": 12.844706535339355, "learning_rate": 0.00019291219839029735, "loss": 5.2656, "step": 336 }, { "epoch": 0.127603180613404, "grad_norm": 11.768765449523926, "learning_rate": 0.00019286780974690754, "loss": 5.8559, "step": 337 }, { "epoch": 0.12798182506626277, "grad_norm": 11.951184272766113, "learning_rate": 0.00019282328768047076, "loss": 5.5301, "step": 338 }, { "epoch": 0.12836046951912156, "grad_norm": 11.188042640686035, "learning_rate": 0.0001927786322549517, "loss": 5.3409, "step": 339 }, { "epoch": 0.12873911397198032, "grad_norm": 12.103419303894043, "learning_rate": 0.00019273384353450687, "loss": 5.1385, "step": 340 }, { "epoch": 0.12911775842483908, "grad_norm": 13.991079330444336, "learning_rate": 0.00019268892158348408, "loss": 4.6209, "step": 341 }, { "epoch": 0.12949640287769784, "grad_norm": 31.928539276123047, "learning_rate": 0.00019264386646642266, "loss": 5.2701, "step": 342 }, { "epoch": 0.1298750473305566, "grad_norm": 12.100227355957031, "learning_rate": 0.00019259867824805317, "loss": 4.0977, "step": 343 }, { "epoch": 0.13025369178341537, "grad_norm": 26.673351287841797, "learning_rate": 0.00019255335699329754, "loss": 4.4181, "step": 344 }, { "epoch": 0.13063233623627413, "grad_norm": 12.389229774475098, "learning_rate": 0.0001925079027672687, "loss": 4.243, "step": 345 }, { "epoch": 0.13101098068913292, "grad_norm": 12.767072677612305, "learning_rate": 0.0001924623156352707, "loss": 4.1363, "step": 346 }, { "epoch": 0.13138962514199168, "grad_norm": 13.129895210266113, "learning_rate": 0.00019241659566279851, "loss": 3.5315, "step": 347 }, { "epoch": 0.13176826959485044, "grad_norm": 19.601654052734375, "learning_rate": 0.00019237074291553793, "loss": 3.4685, "step": 348 }, { "epoch": 0.1321469140477092, "grad_norm": 29.587953567504883, "learning_rate": 0.00019232475745936548, "loss": 3.344, "step": 349 }, { "epoch": 0.13252555850056796, "grad_norm": 19.867443084716797, "learning_rate": 0.00019227863936034848, "loss": 2.5789, "step": 350 }, { "epoch": 0.13290420295342673, "grad_norm": 11.518388748168945, "learning_rate": 0.00019223238868474476, "loss": 5.9315, "step": 351 }, { "epoch": 0.1332828474062855, "grad_norm": 10.65963363647461, "learning_rate": 0.0001921860054990025, "loss": 6.3699, "step": 352 }, { "epoch": 0.13366149185914428, "grad_norm": 11.396893501281738, "learning_rate": 0.0001921394898697604, "loss": 5.3195, "step": 353 }, { "epoch": 0.13404013631200304, "grad_norm": 9.930930137634277, "learning_rate": 0.00019209284186384742, "loss": 5.6726, "step": 354 }, { "epoch": 0.1344187807648618, "grad_norm": 8.916218757629395, "learning_rate": 0.00019204606154828264, "loss": 4.7663, "step": 355 }, { "epoch": 0.13479742521772056, "grad_norm": 9.720449447631836, "learning_rate": 0.00019199914899027532, "loss": 4.8931, "step": 356 }, { "epoch": 0.13517606967057932, "grad_norm": 9.971704483032227, "learning_rate": 0.00019195210425722463, "loss": 5.7539, "step": 357 }, { "epoch": 0.13555471412343809, "grad_norm": 11.575483322143555, "learning_rate": 0.00019190492741671968, "loss": 4.9698, "step": 358 }, { "epoch": 0.13593335857629685, "grad_norm": 10.350571632385254, "learning_rate": 0.00019185761853653935, "loss": 5.9123, "step": 359 }, { "epoch": 0.1363120030291556, "grad_norm": 12.966808319091797, "learning_rate": 0.00019181017768465225, "loss": 4.678, "step": 360 }, { "epoch": 0.1366906474820144, "grad_norm": 11.714643478393555, "learning_rate": 0.0001917626049292166, "loss": 5.3699, "step": 361 }, { "epoch": 0.13706929193487316, "grad_norm": 14.007036209106445, "learning_rate": 0.00019171490033858009, "loss": 5.6013, "step": 362 }, { "epoch": 0.13744793638773192, "grad_norm": 14.195833206176758, "learning_rate": 0.00019166706398127985, "loss": 5.4985, "step": 363 }, { "epoch": 0.13782658084059068, "grad_norm": 11.887788772583008, "learning_rate": 0.0001916190959260423, "loss": 4.0214, "step": 364 }, { "epoch": 0.13820522529344945, "grad_norm": 12.19540023803711, "learning_rate": 0.00019157099624178306, "loss": 4.935, "step": 365 }, { "epoch": 0.1385838697463082, "grad_norm": 12.359858512878418, "learning_rate": 0.0001915227649976069, "loss": 3.9815, "step": 366 }, { "epoch": 0.13896251419916697, "grad_norm": 11.449932098388672, "learning_rate": 0.00019147440226280753, "loss": 4.8552, "step": 367 }, { "epoch": 0.13934115865202576, "grad_norm": 14.259176254272461, "learning_rate": 0.0001914259081068677, "loss": 4.586, "step": 368 }, { "epoch": 0.13971980310488452, "grad_norm": 12.771395683288574, "learning_rate": 0.00019137728259945882, "loss": 4.048, "step": 369 }, { "epoch": 0.14009844755774328, "grad_norm": 15.282382011413574, "learning_rate": 0.00019132852581044114, "loss": 4.755, "step": 370 }, { "epoch": 0.14047709201060204, "grad_norm": 17.148212432861328, "learning_rate": 0.0001912796378098634, "loss": 4.2456, "step": 371 }, { "epoch": 0.1408557364634608, "grad_norm": 16.571382522583008, "learning_rate": 0.00019123061866796302, "loss": 3.6083, "step": 372 }, { "epoch": 0.14123438091631957, "grad_norm": 26.597492218017578, "learning_rate": 0.00019118146845516562, "loss": 2.7945, "step": 373 }, { "epoch": 0.14161302536917833, "grad_norm": 22.75865936279297, "learning_rate": 0.00019113218724208533, "loss": 3.9925, "step": 374 }, { "epoch": 0.14199166982203712, "grad_norm": 30.639041900634766, "learning_rate": 0.00019108277509952433, "loss": 4.4992, "step": 375 }, { "epoch": 0.14237031427489588, "grad_norm": 12.82249927520752, "learning_rate": 0.00019103323209847305, "loss": 5.3655, "step": 376 }, { "epoch": 0.14274895872775464, "grad_norm": 15.458394050598145, "learning_rate": 0.00019098355831010974, "loss": 5.8707, "step": 377 }, { "epoch": 0.1431276031806134, "grad_norm": 13.276158332824707, "learning_rate": 0.00019093375380580075, "loss": 5.574, "step": 378 }, { "epoch": 0.14350624763347217, "grad_norm": 12.430033683776855, "learning_rate": 0.00019088381865710007, "loss": 4.9323, "step": 379 }, { "epoch": 0.14388489208633093, "grad_norm": 10.998027801513672, "learning_rate": 0.0001908337529357495, "loss": 5.8475, "step": 380 }, { "epoch": 0.1442635365391897, "grad_norm": 11.086531639099121, "learning_rate": 0.00019078355671367842, "loss": 5.1857, "step": 381 }, { "epoch": 0.14464218099204848, "grad_norm": 10.455646514892578, "learning_rate": 0.00019073323006300362, "loss": 5.0082, "step": 382 }, { "epoch": 0.14502082544490724, "grad_norm": 13.326669692993164, "learning_rate": 0.00019068277305602936, "loss": 5.4143, "step": 383 }, { "epoch": 0.145399469897766, "grad_norm": 10.433615684509277, "learning_rate": 0.00019063218576524706, "loss": 4.5149, "step": 384 }, { "epoch": 0.14577811435062477, "grad_norm": 11.804021835327148, "learning_rate": 0.00019058146826333552, "loss": 4.7288, "step": 385 }, { "epoch": 0.14615675880348353, "grad_norm": 12.542786598205566, "learning_rate": 0.00019053062062316043, "loss": 5.7741, "step": 386 }, { "epoch": 0.1465354032563423, "grad_norm": 10.54061222076416, "learning_rate": 0.00019047964291777456, "loss": 4.7578, "step": 387 }, { "epoch": 0.14691404770920105, "grad_norm": 12.497821807861328, "learning_rate": 0.0001904285352204175, "loss": 4.6442, "step": 388 }, { "epoch": 0.1472926921620598, "grad_norm": 11.979016304016113, "learning_rate": 0.00019037729760451556, "loss": 3.495, "step": 389 }, { "epoch": 0.1476713366149186, "grad_norm": 13.17095947265625, "learning_rate": 0.0001903259301436818, "loss": 4.0682, "step": 390 }, { "epoch": 0.14804998106777736, "grad_norm": 16.20962905883789, "learning_rate": 0.00019027443291171574, "loss": 5.0638, "step": 391 }, { "epoch": 0.14842862552063613, "grad_norm": 12.081317901611328, "learning_rate": 0.0001902228059826034, "loss": 4.3665, "step": 392 }, { "epoch": 0.1488072699734949, "grad_norm": 15.122051239013672, "learning_rate": 0.0001901710494305171, "loss": 5.2251, "step": 393 }, { "epoch": 0.14918591442635365, "grad_norm": 12.867919921875, "learning_rate": 0.00019011916332981548, "loss": 3.2506, "step": 394 }, { "epoch": 0.1495645588792124, "grad_norm": 14.751724243164062, "learning_rate": 0.00019006714775504307, "loss": 4.2651, "step": 395 }, { "epoch": 0.14994320333207117, "grad_norm": 15.070958137512207, "learning_rate": 0.0001900150027809307, "loss": 3.4987, "step": 396 }, { "epoch": 0.15032184778492996, "grad_norm": 17.090017318725586, "learning_rate": 0.00018996272848239494, "loss": 4.3325, "step": 397 }, { "epoch": 0.15070049223778872, "grad_norm": 21.952957153320312, "learning_rate": 0.0001899103249345382, "loss": 3.5858, "step": 398 }, { "epoch": 0.1510791366906475, "grad_norm": 26.79568099975586, "learning_rate": 0.00018985779221264854, "loss": 4.2193, "step": 399 }, { "epoch": 0.15145778114350625, "grad_norm": 49.78997039794922, "learning_rate": 0.00018980513039219973, "loss": 5.5597, "step": 400 }, { "epoch": 0.151836425596365, "grad_norm": 8.66366195678711, "learning_rate": 0.00018975233954885082, "loss": 5.6595, "step": 401 }, { "epoch": 0.15221507004922377, "grad_norm": 9.13800048828125, "learning_rate": 0.00018969941975844644, "loss": 5.9827, "step": 402 }, { "epoch": 0.15259371450208253, "grad_norm": 9.065213203430176, "learning_rate": 0.00018964637109701636, "loss": 4.9153, "step": 403 }, { "epoch": 0.15297235895494132, "grad_norm": 9.054080963134766, "learning_rate": 0.00018959319364077545, "loss": 5.0867, "step": 404 }, { "epoch": 0.15335100340780008, "grad_norm": 9.656022071838379, "learning_rate": 0.00018953988746612372, "loss": 4.3978, "step": 405 }, { "epoch": 0.15372964786065885, "grad_norm": 9.716662406921387, "learning_rate": 0.00018948645264964609, "loss": 5.8155, "step": 406 }, { "epoch": 0.1541082923135176, "grad_norm": 10.407852172851562, "learning_rate": 0.00018943288926811226, "loss": 4.8687, "step": 407 }, { "epoch": 0.15448693676637637, "grad_norm": 11.507822036743164, "learning_rate": 0.0001893791973984767, "loss": 5.4475, "step": 408 }, { "epoch": 0.15486558121923513, "grad_norm": 10.88329029083252, "learning_rate": 0.0001893253771178784, "loss": 4.3342, "step": 409 }, { "epoch": 0.1552442256720939, "grad_norm": 10.991971015930176, "learning_rate": 0.00018927142850364088, "loss": 5.6984, "step": 410 }, { "epoch": 0.15562287012495266, "grad_norm": 11.597615242004395, "learning_rate": 0.00018921735163327205, "loss": 4.9601, "step": 411 }, { "epoch": 0.15600151457781145, "grad_norm": 9.658888816833496, "learning_rate": 0.0001891631465844641, "loss": 3.6239, "step": 412 }, { "epoch": 0.1563801590306702, "grad_norm": 14.736695289611816, "learning_rate": 0.00018910881343509327, "loss": 5.292, "step": 413 }, { "epoch": 0.15675880348352897, "grad_norm": 12.756696701049805, "learning_rate": 0.00018905435226322, "loss": 4.8917, "step": 414 }, { "epoch": 0.15713744793638773, "grad_norm": 11.116317749023438, "learning_rate": 0.0001889997631470885, "loss": 4.5305, "step": 415 }, { "epoch": 0.1575160923892465, "grad_norm": 14.593806266784668, "learning_rate": 0.0001889450461651269, "loss": 5.3616, "step": 416 }, { "epoch": 0.15789473684210525, "grad_norm": 12.08388614654541, "learning_rate": 0.00018889020139594705, "loss": 3.9476, "step": 417 }, { "epoch": 0.15827338129496402, "grad_norm": 13.409226417541504, "learning_rate": 0.00018883522891834434, "loss": 4.7586, "step": 418 }, { "epoch": 0.1586520257478228, "grad_norm": 15.655117988586426, "learning_rate": 0.00018878012881129758, "loss": 5.0882, "step": 419 }, { "epoch": 0.15903067020068157, "grad_norm": 13.614988327026367, "learning_rate": 0.00018872490115396908, "loss": 4.4696, "step": 420 }, { "epoch": 0.15940931465354033, "grad_norm": 13.35642147064209, "learning_rate": 0.0001886695460257043, "loss": 3.1605, "step": 421 }, { "epoch": 0.1597879591063991, "grad_norm": 21.85063934326172, "learning_rate": 0.0001886140635060319, "loss": 5.235, "step": 422 }, { "epoch": 0.16016660355925785, "grad_norm": 21.174924850463867, "learning_rate": 0.00018855845367466353, "loss": 4.3507, "step": 423 }, { "epoch": 0.16054524801211661, "grad_norm": 17.3688907623291, "learning_rate": 0.00018850271661149376, "loss": 2.2297, "step": 424 }, { "epoch": 0.16092389246497538, "grad_norm": 39.54337692260742, "learning_rate": 0.00018844685239659988, "loss": 2.7965, "step": 425 }, { "epoch": 0.16130253691783417, "grad_norm": 6.966166973114014, "learning_rate": 0.00018839086111024204, "loss": 4.994, "step": 426 }, { "epoch": 0.16168118137069293, "grad_norm": 8.462141036987305, "learning_rate": 0.00018833474283286273, "loss": 5.645, "step": 427 }, { "epoch": 0.1620598258235517, "grad_norm": 9.555349349975586, "learning_rate": 0.00018827849764508706, "loss": 4.6212, "step": 428 }, { "epoch": 0.16243847027641045, "grad_norm": 9.305408477783203, "learning_rate": 0.00018822212562772238, "loss": 4.4289, "step": 429 }, { "epoch": 0.1628171147292692, "grad_norm": 9.897664070129395, "learning_rate": 0.00018816562686175834, "loss": 5.1709, "step": 430 }, { "epoch": 0.16319575918212798, "grad_norm": 10.320230484008789, "learning_rate": 0.0001881090014283666, "loss": 5.0989, "step": 431 }, { "epoch": 0.16357440363498674, "grad_norm": 10.196443557739258, "learning_rate": 0.0001880522494089008, "loss": 5.5656, "step": 432 }, { "epoch": 0.16395304808784553, "grad_norm": 9.999099731445312, "learning_rate": 0.00018799537088489654, "loss": 5.2194, "step": 433 }, { "epoch": 0.1643316925407043, "grad_norm": 10.33040714263916, "learning_rate": 0.0001879383659380711, "loss": 5.2323, "step": 434 }, { "epoch": 0.16471033699356305, "grad_norm": 11.10696792602539, "learning_rate": 0.00018788123465032335, "loss": 4.6551, "step": 435 }, { "epoch": 0.1650889814464218, "grad_norm": 11.029682159423828, "learning_rate": 0.00018782397710373377, "loss": 5.0993, "step": 436 }, { "epoch": 0.16546762589928057, "grad_norm": 9.93604850769043, "learning_rate": 0.00018776659338056427, "loss": 3.7054, "step": 437 }, { "epoch": 0.16584627035213934, "grad_norm": 12.357624053955078, "learning_rate": 0.00018770908356325784, "loss": 4.4637, "step": 438 }, { "epoch": 0.1662249148049981, "grad_norm": 13.088432312011719, "learning_rate": 0.00018765144773443877, "loss": 4.3871, "step": 439 }, { "epoch": 0.16660355925785686, "grad_norm": 11.933065414428711, "learning_rate": 0.00018759368597691243, "loss": 4.3666, "step": 440 }, { "epoch": 0.16698220371071565, "grad_norm": 13.432201385498047, "learning_rate": 0.00018753579837366502, "loss": 3.8948, "step": 441 }, { "epoch": 0.1673608481635744, "grad_norm": 15.856587409973145, "learning_rate": 0.00018747778500786358, "loss": 4.3036, "step": 442 }, { "epoch": 0.16773949261643317, "grad_norm": 16.909637451171875, "learning_rate": 0.00018741964596285583, "loss": 4.5476, "step": 443 }, { "epoch": 0.16811813706929193, "grad_norm": 17.030946731567383, "learning_rate": 0.00018736138132217003, "loss": 3.0895, "step": 444 }, { "epoch": 0.1684967815221507, "grad_norm": 17.348941802978516, "learning_rate": 0.00018730299116951493, "loss": 4.9647, "step": 445 }, { "epoch": 0.16887542597500946, "grad_norm": 14.687129020690918, "learning_rate": 0.00018724447558877958, "loss": 2.7539, "step": 446 }, { "epoch": 0.16925407042786822, "grad_norm": 17.83780288696289, "learning_rate": 0.0001871858346640332, "loss": 3.5633, "step": 447 }, { "epoch": 0.169632714880727, "grad_norm": 30.752113342285156, "learning_rate": 0.00018712706847952515, "loss": 4.545, "step": 448 }, { "epoch": 0.17001135933358577, "grad_norm": 16.965770721435547, "learning_rate": 0.00018706817711968473, "loss": 2.9911, "step": 449 }, { "epoch": 0.17039000378644453, "grad_norm": 32.481101989746094, "learning_rate": 0.00018700916066912102, "loss": 2.4111, "step": 450 }, { "epoch": 0.1707686482393033, "grad_norm": 8.298652648925781, "learning_rate": 0.00018695001921262288, "loss": 6.4388, "step": 451 }, { "epoch": 0.17114729269216206, "grad_norm": 8.251225471496582, "learning_rate": 0.00018689075283515882, "loss": 5.6452, "step": 452 }, { "epoch": 0.17152593714502082, "grad_norm": 8.718252182006836, "learning_rate": 0.0001868313616218767, "loss": 5.2732, "step": 453 }, { "epoch": 0.17190458159787958, "grad_norm": 9.202814102172852, "learning_rate": 0.00018677184565810378, "loss": 5.1873, "step": 454 }, { "epoch": 0.17228322605073837, "grad_norm": 10.298333168029785, "learning_rate": 0.00018671220502934662, "loss": 5.4461, "step": 455 }, { "epoch": 0.17266187050359713, "grad_norm": 9.323832511901855, "learning_rate": 0.00018665243982129076, "loss": 5.1539, "step": 456 }, { "epoch": 0.1730405149564559, "grad_norm": 9.709290504455566, "learning_rate": 0.00018659255011980083, "loss": 4.5545, "step": 457 }, { "epoch": 0.17341915940931465, "grad_norm": 12.013558387756348, "learning_rate": 0.00018653253601092027, "loss": 4.8025, "step": 458 }, { "epoch": 0.17379780386217342, "grad_norm": 10.444851875305176, "learning_rate": 0.00018647239758087122, "loss": 4.3912, "step": 459 }, { "epoch": 0.17417644831503218, "grad_norm": 11.680195808410645, "learning_rate": 0.00018641213491605454, "loss": 4.0869, "step": 460 }, { "epoch": 0.17455509276789094, "grad_norm": 13.132681846618652, "learning_rate": 0.00018635174810304944, "loss": 3.7436, "step": 461 }, { "epoch": 0.17493373722074973, "grad_norm": 11.139749526977539, "learning_rate": 0.00018629123722861365, "loss": 3.2608, "step": 462 }, { "epoch": 0.1753123816736085, "grad_norm": 14.833144187927246, "learning_rate": 0.00018623060237968298, "loss": 4.9499, "step": 463 }, { "epoch": 0.17569102612646725, "grad_norm": 12.821535110473633, "learning_rate": 0.00018616984364337147, "loss": 4.1431, "step": 464 }, { "epoch": 0.17606967057932602, "grad_norm": 12.597731590270996, "learning_rate": 0.00018610896110697112, "loss": 4.4357, "step": 465 }, { "epoch": 0.17644831503218478, "grad_norm": 10.909318923950195, "learning_rate": 0.00018604795485795174, "loss": 4.1517, "step": 466 }, { "epoch": 0.17682695948504354, "grad_norm": 13.418599128723145, "learning_rate": 0.00018598682498396096, "loss": 4.525, "step": 467 }, { "epoch": 0.1772056039379023, "grad_norm": 14.079731941223145, "learning_rate": 0.00018592557157282393, "loss": 4.8267, "step": 468 }, { "epoch": 0.17758424839076106, "grad_norm": 13.757750511169434, "learning_rate": 0.00018586419471254337, "loss": 3.4362, "step": 469 }, { "epoch": 0.17796289284361985, "grad_norm": 14.358112335205078, "learning_rate": 0.00018580269449129934, "loss": 3.3671, "step": 470 }, { "epoch": 0.1783415372964786, "grad_norm": 15.010443687438965, "learning_rate": 0.0001857410709974491, "loss": 2.8819, "step": 471 }, { "epoch": 0.17872018174933738, "grad_norm": 16.135650634765625, "learning_rate": 0.00018567932431952703, "loss": 3.2814, "step": 472 }, { "epoch": 0.17909882620219614, "grad_norm": 17.758377075195312, "learning_rate": 0.00018561745454624448, "loss": 3.9894, "step": 473 }, { "epoch": 0.1794774706550549, "grad_norm": 18.282501220703125, "learning_rate": 0.00018555546176648972, "loss": 2.9159, "step": 474 }, { "epoch": 0.17985611510791366, "grad_norm": 59.88269805908203, "learning_rate": 0.00018549334606932763, "loss": 3.7333, "step": 475 }, { "epoch": 0.18023475956077242, "grad_norm": 8.664223670959473, "learning_rate": 0.00018543110754399975, "loss": 5.3577, "step": 476 }, { "epoch": 0.1806134040136312, "grad_norm": 9.118324279785156, "learning_rate": 0.00018536874627992408, "loss": 5.3931, "step": 477 }, { "epoch": 0.18099204846648997, "grad_norm": 8.87546157836914, "learning_rate": 0.00018530626236669498, "loss": 4.8429, "step": 478 }, { "epoch": 0.18137069291934874, "grad_norm": 9.68464183807373, "learning_rate": 0.00018524365589408297, "loss": 5.464, "step": 479 }, { "epoch": 0.1817493373722075, "grad_norm": 10.263278007507324, "learning_rate": 0.0001851809269520347, "loss": 4.618, "step": 480 }, { "epoch": 0.18212798182506626, "grad_norm": 11.374741554260254, "learning_rate": 0.00018511807563067274, "loss": 5.1748, "step": 481 }, { "epoch": 0.18250662627792502, "grad_norm": 8.904349327087402, "learning_rate": 0.00018505510202029547, "loss": 5.1542, "step": 482 }, { "epoch": 0.18288527073078378, "grad_norm": 9.953553199768066, "learning_rate": 0.00018499200621137701, "loss": 4.3502, "step": 483 }, { "epoch": 0.18326391518364257, "grad_norm": 12.080293655395508, "learning_rate": 0.00018492878829456702, "loss": 4.9385, "step": 484 }, { "epoch": 0.18364255963650133, "grad_norm": 15.62290096282959, "learning_rate": 0.00018486544836069063, "loss": 4.9754, "step": 485 }, { "epoch": 0.1840212040893601, "grad_norm": 15.642521858215332, "learning_rate": 0.00018480198650074812, "loss": 5.3047, "step": 486 }, { "epoch": 0.18439984854221886, "grad_norm": 11.466191291809082, "learning_rate": 0.00018473840280591513, "loss": 3.8782, "step": 487 }, { "epoch": 0.18477849299507762, "grad_norm": 13.736114501953125, "learning_rate": 0.00018467469736754225, "loss": 4.5983, "step": 488 }, { "epoch": 0.18515713744793638, "grad_norm": 13.423456192016602, "learning_rate": 0.00018461087027715498, "loss": 5.3427, "step": 489 }, { "epoch": 0.18553578190079514, "grad_norm": 12.900038719177246, "learning_rate": 0.00018454692162645363, "loss": 4.3188, "step": 490 }, { "epoch": 0.1859144263536539, "grad_norm": 10.984456062316895, "learning_rate": 0.0001844828515073131, "loss": 3.9218, "step": 491 }, { "epoch": 0.1862930708065127, "grad_norm": 14.656373977661133, "learning_rate": 0.00018441866001178285, "loss": 4.7434, "step": 492 }, { "epoch": 0.18667171525937146, "grad_norm": 12.752568244934082, "learning_rate": 0.00018435434723208674, "loss": 4.8496, "step": 493 }, { "epoch": 0.18705035971223022, "grad_norm": 11.184649467468262, "learning_rate": 0.0001842899132606228, "loss": 2.916, "step": 494 }, { "epoch": 0.18742900416508898, "grad_norm": 14.828624725341797, "learning_rate": 0.0001842253581899632, "loss": 3.9765, "step": 495 }, { "epoch": 0.18780764861794774, "grad_norm": 20.33783721923828, "learning_rate": 0.0001841606821128542, "loss": 4.2373, "step": 496 }, { "epoch": 0.1881862930708065, "grad_norm": 15.583966255187988, "learning_rate": 0.0001840958851222158, "loss": 4.2481, "step": 497 }, { "epoch": 0.18856493752366527, "grad_norm": 17.68903160095215, "learning_rate": 0.0001840309673111417, "loss": 2.7633, "step": 498 }, { "epoch": 0.18894358197652406, "grad_norm": 13.663451194763184, "learning_rate": 0.00018396592877289926, "loss": 1.3758, "step": 499 }, { "epoch": 0.18932222642938282, "grad_norm": 20.513288497924805, "learning_rate": 0.00018390076960092926, "loss": 2.9705, "step": 500 }, { "epoch": 0.18970087088224158, "grad_norm": 8.191598892211914, "learning_rate": 0.00018383548988884575, "loss": 5.3532, "step": 501 }, { "epoch": 0.19007951533510034, "grad_norm": 8.629252433776855, "learning_rate": 0.000183770089730436, "loss": 4.9063, "step": 502 }, { "epoch": 0.1904581597879591, "grad_norm": 10.426155090332031, "learning_rate": 0.0001837045692196604, "loss": 5.3042, "step": 503 }, { "epoch": 0.19083680424081786, "grad_norm": 9.645724296569824, "learning_rate": 0.00018363892845065207, "loss": 5.0665, "step": 504 }, { "epoch": 0.19121544869367663, "grad_norm": 13.083929061889648, "learning_rate": 0.00018357316751771704, "loss": 5.6006, "step": 505 }, { "epoch": 0.19159409314653542, "grad_norm": 11.490813255310059, "learning_rate": 0.00018350728651533396, "loss": 5.5563, "step": 506 }, { "epoch": 0.19197273759939418, "grad_norm": 9.692631721496582, "learning_rate": 0.00018344128553815397, "loss": 4.7903, "step": 507 }, { "epoch": 0.19235138205225294, "grad_norm": 10.534561157226562, "learning_rate": 0.0001833751646810006, "loss": 4.777, "step": 508 }, { "epoch": 0.1927300265051117, "grad_norm": 10.0263090133667, "learning_rate": 0.00018330892403886954, "loss": 4.7177, "step": 509 }, { "epoch": 0.19310867095797046, "grad_norm": 10.845623970031738, "learning_rate": 0.00018324256370692867, "loss": 4.4374, "step": 510 }, { "epoch": 0.19348731541082922, "grad_norm": 11.177064895629883, "learning_rate": 0.00018317608378051774, "loss": 4.9647, "step": 511 }, { "epoch": 0.193865959863688, "grad_norm": 13.082832336425781, "learning_rate": 0.00018310948435514842, "loss": 4.6881, "step": 512 }, { "epoch": 0.19424460431654678, "grad_norm": 13.74007797241211, "learning_rate": 0.00018304276552650394, "loss": 4.1824, "step": 513 }, { "epoch": 0.19462324876940554, "grad_norm": 12.836389541625977, "learning_rate": 0.00018297592739043917, "loss": 4.5745, "step": 514 }, { "epoch": 0.1950018932222643, "grad_norm": 13.968857765197754, "learning_rate": 0.00018290897004298037, "loss": 3.7023, "step": 515 }, { "epoch": 0.19538053767512306, "grad_norm": 16.355371475219727, "learning_rate": 0.00018284189358032507, "loss": 4.8914, "step": 516 }, { "epoch": 0.19575918212798182, "grad_norm": 12.298672676086426, "learning_rate": 0.0001827746980988419, "loss": 4.1704, "step": 517 }, { "epoch": 0.19613782658084059, "grad_norm": 12.946645736694336, "learning_rate": 0.00018270738369507056, "loss": 3.8664, "step": 518 }, { "epoch": 0.19651647103369935, "grad_norm": 12.944162368774414, "learning_rate": 0.00018263995046572152, "loss": 4.1455, "step": 519 }, { "epoch": 0.1968951154865581, "grad_norm": 13.202437400817871, "learning_rate": 0.00018257239850767598, "loss": 3.9128, "step": 520 }, { "epoch": 0.1972737599394169, "grad_norm": 15.04906940460205, "learning_rate": 0.00018250472791798576, "loss": 3.9912, "step": 521 }, { "epoch": 0.19765240439227566, "grad_norm": 15.138761520385742, "learning_rate": 0.00018243693879387314, "loss": 2.8543, "step": 522 }, { "epoch": 0.19803104884513442, "grad_norm": 20.136871337890625, "learning_rate": 0.00018236903123273058, "loss": 3.1619, "step": 523 }, { "epoch": 0.19840969329799318, "grad_norm": 17.752416610717773, "learning_rate": 0.00018230100533212084, "loss": 2.2304, "step": 524 }, { "epoch": 0.19878833775085195, "grad_norm": 40.383480072021484, "learning_rate": 0.00018223286118977664, "loss": 3.2921, "step": 525 }, { "epoch": 0.1991669822037107, "grad_norm": 7.091005802154541, "learning_rate": 0.0001821645989036005, "loss": 4.928, "step": 526 }, { "epoch": 0.19954562665656947, "grad_norm": 7.8941569328308105, "learning_rate": 0.00018209621857166475, "loss": 5.0736, "step": 527 }, { "epoch": 0.19992427110942826, "grad_norm": 8.998451232910156, "learning_rate": 0.0001820277202922114, "loss": 5.0461, "step": 528 }, { "epoch": 0.20030291556228702, "grad_norm": 9.947563171386719, "learning_rate": 0.00018195910416365173, "loss": 5.4385, "step": 529 }, { "epoch": 0.20068156001514578, "grad_norm": 9.720036506652832, "learning_rate": 0.00018189037028456653, "loss": 5.0625, "step": 530 }, { "epoch": 0.20106020446800454, "grad_norm": 10.58145523071289, "learning_rate": 0.00018182151875370558, "loss": 4.9718, "step": 531 }, { "epoch": 0.2014388489208633, "grad_norm": 11.047203063964844, "learning_rate": 0.0001817525496699878, "loss": 4.5654, "step": 532 }, { "epoch": 0.20181749337372207, "grad_norm": 10.208518981933594, "learning_rate": 0.00018168346313250097, "loss": 4.5213, "step": 533 }, { "epoch": 0.20219613782658083, "grad_norm": 11.760814666748047, "learning_rate": 0.00018161425924050165, "loss": 4.5826, "step": 534 }, { "epoch": 0.20257478227943962, "grad_norm": 9.820549011230469, "learning_rate": 0.00018154493809341494, "loss": 4.1414, "step": 535 }, { "epoch": 0.20295342673229838, "grad_norm": 9.911032676696777, "learning_rate": 0.00018147549979083443, "loss": 4.395, "step": 536 }, { "epoch": 0.20333207118515714, "grad_norm": 11.928678512573242, "learning_rate": 0.00018140594443252203, "loss": 4.5902, "step": 537 }, { "epoch": 0.2037107156380159, "grad_norm": 11.273340225219727, "learning_rate": 0.00018133627211840784, "loss": 4.099, "step": 538 }, { "epoch": 0.20408936009087467, "grad_norm": 12.365876197814941, "learning_rate": 0.00018126648294858994, "loss": 4.9772, "step": 539 }, { "epoch": 0.20446800454373343, "grad_norm": 13.742144584655762, "learning_rate": 0.00018119657702333436, "loss": 4.0028, "step": 540 }, { "epoch": 0.2048466489965922, "grad_norm": 12.48265266418457, "learning_rate": 0.00018112655444307485, "loss": 4.1452, "step": 541 }, { "epoch": 0.20522529344945095, "grad_norm": 14.862174034118652, "learning_rate": 0.0001810564153084127, "loss": 3.7215, "step": 542 }, { "epoch": 0.20560393790230974, "grad_norm": 14.677619934082031, "learning_rate": 0.00018098615972011675, "loss": 3.5887, "step": 543 }, { "epoch": 0.2059825823551685, "grad_norm": 12.201497077941895, "learning_rate": 0.00018091578777912307, "loss": 2.6873, "step": 544 }, { "epoch": 0.20636122680802726, "grad_norm": 14.59697151184082, "learning_rate": 0.00018084529958653492, "loss": 3.5864, "step": 545 }, { "epoch": 0.20673987126088603, "grad_norm": 26.02948760986328, "learning_rate": 0.00018077469524362263, "loss": 3.8935, "step": 546 }, { "epoch": 0.2071185157137448, "grad_norm": 18.998506546020508, "learning_rate": 0.0001807039748518233, "loss": 2.6402, "step": 547 }, { "epoch": 0.20749716016660355, "grad_norm": 25.073461532592773, "learning_rate": 0.00018063313851274089, "loss": 2.7662, "step": 548 }, { "epoch": 0.2078758046194623, "grad_norm": 23.39455223083496, "learning_rate": 0.00018056218632814575, "loss": 3.1726, "step": 549 }, { "epoch": 0.2082544490723211, "grad_norm": 27.59321403503418, "learning_rate": 0.0001804911183999749, "loss": 2.3217, "step": 550 }, { "epoch": 0.20863309352517986, "grad_norm": 8.587485313415527, "learning_rate": 0.00018041993483033144, "loss": 4.8921, "step": 551 }, { "epoch": 0.20901173797803863, "grad_norm": 10.552020072937012, "learning_rate": 0.00018034863572148475, "loss": 5.781, "step": 552 }, { "epoch": 0.2093903824308974, "grad_norm": 9.02157974243164, "learning_rate": 0.00018027722117587016, "loss": 5.0234, "step": 553 }, { "epoch": 0.20976902688375615, "grad_norm": 10.152100563049316, "learning_rate": 0.00018020569129608883, "loss": 4.6749, "step": 554 }, { "epoch": 0.2101476713366149, "grad_norm": 9.074097633361816, "learning_rate": 0.0001801340461849076, "loss": 4.1354, "step": 555 }, { "epoch": 0.21052631578947367, "grad_norm": 9.53470516204834, "learning_rate": 0.00018006228594525894, "loss": 4.0305, "step": 556 }, { "epoch": 0.21090496024233246, "grad_norm": 11.014595985412598, "learning_rate": 0.00017999041068024064, "loss": 5.5452, "step": 557 }, { "epoch": 0.21128360469519122, "grad_norm": 10.290766716003418, "learning_rate": 0.00017991842049311585, "loss": 4.8177, "step": 558 }, { "epoch": 0.21166224914804999, "grad_norm": 10.907533645629883, "learning_rate": 0.00017984631548731273, "loss": 5.1779, "step": 559 }, { "epoch": 0.21204089360090875, "grad_norm": 10.802809715270996, "learning_rate": 0.00017977409576642444, "loss": 4.1945, "step": 560 }, { "epoch": 0.2124195380537675, "grad_norm": 10.898703575134277, "learning_rate": 0.00017970176143420894, "loss": 4.2533, "step": 561 }, { "epoch": 0.21279818250662627, "grad_norm": 13.079732894897461, "learning_rate": 0.00017962931259458888, "loss": 4.8476, "step": 562 }, { "epoch": 0.21317682695948503, "grad_norm": 12.071998596191406, "learning_rate": 0.00017955674935165138, "loss": 4.632, "step": 563 }, { "epoch": 0.21355547141234382, "grad_norm": 12.636747360229492, "learning_rate": 0.00017948407180964798, "loss": 4.8321, "step": 564 }, { "epoch": 0.21393411586520258, "grad_norm": 14.457389831542969, "learning_rate": 0.00017941128007299434, "loss": 4.3604, "step": 565 }, { "epoch": 0.21431276031806135, "grad_norm": 12.480835914611816, "learning_rate": 0.00017933837424627028, "loss": 3.7171, "step": 566 }, { "epoch": 0.2146914047709201, "grad_norm": 13.800310134887695, "learning_rate": 0.00017926535443421954, "loss": 4.7733, "step": 567 }, { "epoch": 0.21507004922377887, "grad_norm": 14.182160377502441, "learning_rate": 0.00017919222074174948, "loss": 3.6579, "step": 568 }, { "epoch": 0.21544869367663763, "grad_norm": 12.221685409545898, "learning_rate": 0.00017911897327393126, "loss": 3.3661, "step": 569 }, { "epoch": 0.2158273381294964, "grad_norm": 14.561601638793945, "learning_rate": 0.00017904561213599932, "loss": 4.3849, "step": 570 }, { "epoch": 0.21620598258235516, "grad_norm": 15.966811180114746, "learning_rate": 0.0001789721374333516, "loss": 3.6633, "step": 571 }, { "epoch": 0.21658462703521394, "grad_norm": 16.546342849731445, "learning_rate": 0.00017889854927154901, "loss": 2.8848, "step": 572 }, { "epoch": 0.2169632714880727, "grad_norm": 21.277877807617188, "learning_rate": 0.0001788248477563156, "loss": 3.3454, "step": 573 }, { "epoch": 0.21734191594093147, "grad_norm": 30.5257568359375, "learning_rate": 0.00017875103299353824, "loss": 3.1811, "step": 574 }, { "epoch": 0.21772056039379023, "grad_norm": 53.368526458740234, "learning_rate": 0.00017867710508926647, "loss": 3.2357, "step": 575 }, { "epoch": 0.218099204846649, "grad_norm": 8.367820739746094, "learning_rate": 0.0001786030641497124, "loss": 5.0559, "step": 576 }, { "epoch": 0.21847784929950775, "grad_norm": 9.639812469482422, "learning_rate": 0.00017852891028125053, "loss": 5.0726, "step": 577 }, { "epoch": 0.21885649375236652, "grad_norm": 9.26480484008789, "learning_rate": 0.00017845464359041765, "loss": 4.88, "step": 578 }, { "epoch": 0.2192351382052253, "grad_norm": 9.129890441894531, "learning_rate": 0.0001783802641839126, "loss": 3.9438, "step": 579 }, { "epoch": 0.21961378265808407, "grad_norm": 9.979665756225586, "learning_rate": 0.00017830577216859615, "loss": 3.7425, "step": 580 }, { "epoch": 0.21999242711094283, "grad_norm": 9.814806938171387, "learning_rate": 0.00017823116765149086, "loss": 5.3337, "step": 581 }, { "epoch": 0.2203710715638016, "grad_norm": 9.500343322753906, "learning_rate": 0.00017815645073978096, "loss": 4.5891, "step": 582 }, { "epoch": 0.22074971601666035, "grad_norm": 11.922303199768066, "learning_rate": 0.00017808162154081208, "loss": 4.1409, "step": 583 }, { "epoch": 0.22112836046951911, "grad_norm": 13.801411628723145, "learning_rate": 0.00017800668016209128, "loss": 4.7769, "step": 584 }, { "epoch": 0.22150700492237788, "grad_norm": 11.385126113891602, "learning_rate": 0.00017793162671128672, "loss": 3.7034, "step": 585 }, { "epoch": 0.22188564937523667, "grad_norm": 11.122262954711914, "learning_rate": 0.00017785646129622756, "loss": 5.1685, "step": 586 }, { "epoch": 0.22226429382809543, "grad_norm": 10.478163719177246, "learning_rate": 0.00017778118402490383, "loss": 3.5963, "step": 587 }, { "epoch": 0.2226429382809542, "grad_norm": 11.611848831176758, "learning_rate": 0.00017770579500546628, "loss": 4.1824, "step": 588 }, { "epoch": 0.22302158273381295, "grad_norm": 14.5952787399292, "learning_rate": 0.00017763029434622626, "loss": 3.979, "step": 589 }, { "epoch": 0.2234002271866717, "grad_norm": 11.742944717407227, "learning_rate": 0.00017755468215565538, "loss": 4.4411, "step": 590 }, { "epoch": 0.22377887163953047, "grad_norm": 14.67832088470459, "learning_rate": 0.00017747895854238564, "loss": 4.5713, "step": 591 }, { "epoch": 0.22415751609238924, "grad_norm": 12.356891632080078, "learning_rate": 0.00017740312361520897, "loss": 3.7381, "step": 592 }, { "epoch": 0.22453616054524803, "grad_norm": 12.271285057067871, "learning_rate": 0.00017732717748307735, "loss": 3.2774, "step": 593 }, { "epoch": 0.2249148049981068, "grad_norm": 13.871350288391113, "learning_rate": 0.00017725112025510247, "loss": 2.9768, "step": 594 }, { "epoch": 0.22529344945096555, "grad_norm": 16.23955726623535, "learning_rate": 0.0001771749520405556, "loss": 2.9301, "step": 595 }, { "epoch": 0.2256720939038243, "grad_norm": 15.869973182678223, "learning_rate": 0.00017709867294886757, "loss": 3.1639, "step": 596 }, { "epoch": 0.22605073835668307, "grad_norm": 17.27775001525879, "learning_rate": 0.0001770222830896284, "loss": 4.2995, "step": 597 }, { "epoch": 0.22642938280954183, "grad_norm": 18.232769012451172, "learning_rate": 0.00017694578257258727, "loss": 3.2705, "step": 598 }, { "epoch": 0.2268080272624006, "grad_norm": 27.304903030395508, "learning_rate": 0.00017686917150765244, "loss": 3.3116, "step": 599 }, { "epoch": 0.22718667171525936, "grad_norm": 64.07026672363281, "learning_rate": 0.0001767924500048908, "loss": 3.8853, "step": 600 }, { "epoch": 0.22756531616811815, "grad_norm": 7.719545841217041, "learning_rate": 0.00017671561817452812, "loss": 4.9895, "step": 601 }, { "epoch": 0.2279439606209769, "grad_norm": 9.30318546295166, "learning_rate": 0.00017663867612694852, "loss": 6.2789, "step": 602 }, { "epoch": 0.22832260507383567, "grad_norm": 8.493380546569824, "learning_rate": 0.00017656162397269455, "loss": 4.0822, "step": 603 }, { "epoch": 0.22870124952669443, "grad_norm": 10.322513580322266, "learning_rate": 0.0001764844618224669, "loss": 4.2392, "step": 604 }, { "epoch": 0.2290798939795532, "grad_norm": 10.969583511352539, "learning_rate": 0.00017640718978712442, "loss": 4.9037, "step": 605 }, { "epoch": 0.22945853843241196, "grad_norm": 10.155062675476074, "learning_rate": 0.0001763298079776836, "loss": 3.9269, "step": 606 }, { "epoch": 0.22983718288527072, "grad_norm": 11.783178329467773, "learning_rate": 0.00017625231650531884, "loss": 5.0255, "step": 607 }, { "epoch": 0.2302158273381295, "grad_norm": 13.353561401367188, "learning_rate": 0.000176174715481362, "loss": 5.0201, "step": 608 }, { "epoch": 0.23059447179098827, "grad_norm": 14.431788444519043, "learning_rate": 0.0001760970050173024, "loss": 4.1539, "step": 609 }, { "epoch": 0.23097311624384703, "grad_norm": 11.670136451721191, "learning_rate": 0.00017601918522478651, "loss": 4.2111, "step": 610 }, { "epoch": 0.2313517606967058, "grad_norm": 12.365769386291504, "learning_rate": 0.0001759412562156179, "loss": 4.7332, "step": 611 }, { "epoch": 0.23173040514956456, "grad_norm": 11.194862365722656, "learning_rate": 0.00017586321810175712, "loss": 4.0395, "step": 612 }, { "epoch": 0.23210904960242332, "grad_norm": 12.092097282409668, "learning_rate": 0.00017578507099532138, "loss": 3.2322, "step": 613 }, { "epoch": 0.23248769405528208, "grad_norm": 12.20335578918457, "learning_rate": 0.0001757068150085845, "loss": 3.9589, "step": 614 }, { "epoch": 0.23286633850814087, "grad_norm": 14.438755989074707, "learning_rate": 0.00017562845025397678, "loss": 4.2131, "step": 615 }, { "epoch": 0.23324498296099963, "grad_norm": 14.180761337280273, "learning_rate": 0.00017554997684408473, "loss": 3.6403, "step": 616 }, { "epoch": 0.2336236274138584, "grad_norm": 12.253854751586914, "learning_rate": 0.00017547139489165097, "loss": 3.2084, "step": 617 }, { "epoch": 0.23400227186671715, "grad_norm": 13.61539363861084, "learning_rate": 0.0001753927045095741, "loss": 3.9068, "step": 618 }, { "epoch": 0.23438091631957592, "grad_norm": 14.397199630737305, "learning_rate": 0.00017531390581090845, "loss": 3.4509, "step": 619 }, { "epoch": 0.23475956077243468, "grad_norm": 14.57890796661377, "learning_rate": 0.00017523499890886401, "loss": 3.025, "step": 620 }, { "epoch": 0.23513820522529344, "grad_norm": 15.363025665283203, "learning_rate": 0.00017515598391680626, "loss": 3.3226, "step": 621 }, { "epoch": 0.2355168496781522, "grad_norm": 14.477925300598145, "learning_rate": 0.0001750768609482558, "loss": 3.4475, "step": 622 }, { "epoch": 0.235895494131011, "grad_norm": 19.390026092529297, "learning_rate": 0.00017499763011688863, "loss": 4.1527, "step": 623 }, { "epoch": 0.23627413858386975, "grad_norm": 18.3753662109375, "learning_rate": 0.0001749182915365355, "loss": 2.1395, "step": 624 }, { "epoch": 0.23665278303672851, "grad_norm": 25.714628219604492, "learning_rate": 0.000174838845321182, "loss": 3.1295, "step": 625 }, { "epoch": 0.23703142748958728, "grad_norm": 8.490006446838379, "learning_rate": 0.0001747592915849684, "loss": 4.353, "step": 626 }, { "epoch": 0.23741007194244604, "grad_norm": 9.353875160217285, "learning_rate": 0.00017467963044218951, "loss": 4.8076, "step": 627 }, { "epoch": 0.2377887163953048, "grad_norm": 11.424484252929688, "learning_rate": 0.00017459986200729432, "loss": 4.6193, "step": 628 }, { "epoch": 0.23816736084816356, "grad_norm": 9.105878829956055, "learning_rate": 0.00017451998639488606, "loss": 4.1965, "step": 629 }, { "epoch": 0.23854600530102235, "grad_norm": 9.515813827514648, "learning_rate": 0.0001744400037197218, "loss": 4.4786, "step": 630 }, { "epoch": 0.2389246497538811, "grad_norm": 9.369331359863281, "learning_rate": 0.0001743599140967127, "loss": 3.6103, "step": 631 }, { "epoch": 0.23930329420673987, "grad_norm": 11.47270393371582, "learning_rate": 0.00017427971764092328, "loss": 4.4817, "step": 632 }, { "epoch": 0.23968193865959864, "grad_norm": 12.192450523376465, "learning_rate": 0.00017419941446757174, "loss": 4.4893, "step": 633 }, { "epoch": 0.2400605831124574, "grad_norm": 12.567253112792969, "learning_rate": 0.00017411900469202943, "loss": 4.5299, "step": 634 }, { "epoch": 0.24043922756531616, "grad_norm": 14.549039840698242, "learning_rate": 0.0001740384884298211, "loss": 4.4132, "step": 635 }, { "epoch": 0.24081787201817492, "grad_norm": 12.618002891540527, "learning_rate": 0.00017395786579662423, "loss": 3.7163, "step": 636 }, { "epoch": 0.2411965164710337, "grad_norm": 10.437346458435059, "learning_rate": 0.00017387713690826932, "loss": 3.2317, "step": 637 }, { "epoch": 0.24157516092389247, "grad_norm": 13.972203254699707, "learning_rate": 0.00017379630188073941, "loss": 3.9498, "step": 638 }, { "epoch": 0.24195380537675124, "grad_norm": 13.032102584838867, "learning_rate": 0.00017371536083017004, "loss": 3.5371, "step": 639 }, { "epoch": 0.24233244982961, "grad_norm": 14.505037307739258, "learning_rate": 0.00017363431387284914, "loss": 4.754, "step": 640 }, { "epoch": 0.24271109428246876, "grad_norm": 10.701925277709961, "learning_rate": 0.00017355316112521675, "loss": 2.8655, "step": 641 }, { "epoch": 0.24308973873532752, "grad_norm": 12.83195686340332, "learning_rate": 0.00017347190270386488, "loss": 3.7197, "step": 642 }, { "epoch": 0.24346838318818628, "grad_norm": 12.69071102142334, "learning_rate": 0.00017339053872553742, "loss": 2.9367, "step": 643 }, { "epoch": 0.24384702764104507, "grad_norm": 14.964492797851562, "learning_rate": 0.00017330906930712988, "loss": 3.9672, "step": 644 }, { "epoch": 0.24422567209390383, "grad_norm": 14.813679695129395, "learning_rate": 0.0001732274945656892, "loss": 2.1732, "step": 645 }, { "epoch": 0.2446043165467626, "grad_norm": 14.024624824523926, "learning_rate": 0.00017314581461841378, "loss": 3.4523, "step": 646 }, { "epoch": 0.24498296099962136, "grad_norm": 18.821441650390625, "learning_rate": 0.00017306402958265299, "loss": 3.0951, "step": 647 }, { "epoch": 0.24536160545248012, "grad_norm": 17.203479766845703, "learning_rate": 0.0001729821395759073, "loss": 2.7061, "step": 648 }, { "epoch": 0.24574024990533888, "grad_norm": 21.973928451538086, "learning_rate": 0.000172900144715828, "loss": 1.9662, "step": 649 }, { "epoch": 0.24611889435819764, "grad_norm": 24.348388671875, "learning_rate": 0.00017281804512021695, "loss": 2.4137, "step": 650 }, { "epoch": 0.2464975388110564, "grad_norm": 8.82381820678711, "learning_rate": 0.00017273584090702655, "loss": 5.5763, "step": 651 }, { "epoch": 0.2468761832639152, "grad_norm": 8.850475311279297, "learning_rate": 0.00017265353219435943, "loss": 5.0723, "step": 652 }, { "epoch": 0.24725482771677396, "grad_norm": 8.894290924072266, "learning_rate": 0.00017257111910046842, "loss": 4.3985, "step": 653 }, { "epoch": 0.24763347216963272, "grad_norm": 10.235475540161133, "learning_rate": 0.00017248860174375632, "loss": 4.657, "step": 654 }, { "epoch": 0.24801211662249148, "grad_norm": 9.949228286743164, "learning_rate": 0.00017240598024277566, "loss": 3.5698, "step": 655 }, { "epoch": 0.24839076107535024, "grad_norm": 10.317509651184082, "learning_rate": 0.00017232325471622863, "loss": 3.9962, "step": 656 }, { "epoch": 0.248769405528209, "grad_norm": 9.790380477905273, "learning_rate": 0.0001722404252829669, "loss": 3.9819, "step": 657 }, { "epoch": 0.24914804998106777, "grad_norm": 12.6632661819458, "learning_rate": 0.00017215749206199137, "loss": 4.5065, "step": 658 }, { "epoch": 0.24952669443392655, "grad_norm": 12.218596458435059, "learning_rate": 0.00017207445517245212, "loss": 4.2112, "step": 659 }, { "epoch": 0.24990533888678532, "grad_norm": 13.204305648803711, "learning_rate": 0.00017199131473364805, "loss": 4.5837, "step": 660 }, { "epoch": 0.2502839833396441, "grad_norm": 11.75940990447998, "learning_rate": 0.00017190807086502695, "loss": 4.6862, "step": 661 }, { "epoch": 0.2502839833396441, "eval_loss": 0.4858866035938263, "eval_runtime": 899.2669, "eval_samples_per_second": 4.946, "eval_steps_per_second": 1.237, "step": 661 }, { "epoch": 0.2506626277925028, "grad_norm": 11.954764366149902, "learning_rate": 0.0001718247236861852, "loss": 4.3744, "step": 662 }, { "epoch": 0.2510412722453616, "grad_norm": 14.922517776489258, "learning_rate": 0.0001717412733168675, "loss": 4.7506, "step": 663 }, { "epoch": 0.2514199166982204, "grad_norm": 16.80381965637207, "learning_rate": 0.00017165771987696698, "loss": 4.9678, "step": 664 }, { "epoch": 0.2517985611510791, "grad_norm": 13.548487663269043, "learning_rate": 0.00017157406348652463, "loss": 4.0003, "step": 665 }, { "epoch": 0.2521772056039379, "grad_norm": 14.526391983032227, "learning_rate": 0.00017149030426572953, "loss": 4.5138, "step": 666 }, { "epoch": 0.25255585005679665, "grad_norm": 16.523895263671875, "learning_rate": 0.00017140644233491837, "loss": 4.0987, "step": 667 }, { "epoch": 0.25293449450965544, "grad_norm": 13.643799781799316, "learning_rate": 0.00017132247781457557, "loss": 4.3543, "step": 668 }, { "epoch": 0.2533131389625142, "grad_norm": 13.402974128723145, "learning_rate": 0.00017123841082533275, "loss": 3.5844, "step": 669 }, { "epoch": 0.25369178341537296, "grad_norm": 16.17197036743164, "learning_rate": 0.00017115424148796883, "loss": 4.1618, "step": 670 }, { "epoch": 0.25407042786823175, "grad_norm": 15.300637245178223, "learning_rate": 0.00017106996992340983, "loss": 2.9017, "step": 671 }, { "epoch": 0.2544490723210905, "grad_norm": 14.559680938720703, "learning_rate": 0.00017098559625272852, "loss": 1.9764, "step": 672 }, { "epoch": 0.2548277167739493, "grad_norm": 19.095439910888672, "learning_rate": 0.00017090112059714446, "loss": 2.9313, "step": 673 }, { "epoch": 0.255206361226808, "grad_norm": 18.04271125793457, "learning_rate": 0.0001708165430780237, "loss": 1.6536, "step": 674 }, { "epoch": 0.2555850056796668, "grad_norm": 29.377933502197266, "learning_rate": 0.00017073186381687868, "loss": 2.5233, "step": 675 }, { "epoch": 0.25596365013252553, "grad_norm": 13.520868301391602, "learning_rate": 0.00017064708293536792, "loss": 5.1626, "step": 676 }, { "epoch": 0.2563422945853843, "grad_norm": 10.009629249572754, "learning_rate": 0.00017056220055529595, "loss": 5.3031, "step": 677 }, { "epoch": 0.2567209390382431, "grad_norm": 10.07101058959961, "learning_rate": 0.00017047721679861326, "loss": 4.0588, "step": 678 }, { "epoch": 0.25709958349110185, "grad_norm": 9.826030731201172, "learning_rate": 0.0001703921317874158, "loss": 5.5341, "step": 679 }, { "epoch": 0.25747822794396064, "grad_norm": 8.8646821975708, "learning_rate": 0.00017030694564394518, "loss": 4.0068, "step": 680 }, { "epoch": 0.25785687239681937, "grad_norm": 9.862954139709473, "learning_rate": 0.00017022165849058812, "loss": 4.4291, "step": 681 }, { "epoch": 0.25823551684967816, "grad_norm": 10.417892456054688, "learning_rate": 0.00017013627044987656, "loss": 4.2552, "step": 682 }, { "epoch": 0.2586141613025369, "grad_norm": 10.884054183959961, "learning_rate": 0.00017005078164448746, "loss": 3.9076, "step": 683 }, { "epoch": 0.2589928057553957, "grad_norm": 13.036945343017578, "learning_rate": 0.00016996519219724234, "loss": 5.0801, "step": 684 }, { "epoch": 0.2593714502082545, "grad_norm": 14.050254821777344, "learning_rate": 0.00016987950223110748, "loss": 4.3475, "step": 685 }, { "epoch": 0.2597500946611132, "grad_norm": 14.108619689941406, "learning_rate": 0.0001697937118691936, "loss": 3.5589, "step": 686 }, { "epoch": 0.260128739113972, "grad_norm": 15.274951934814453, "learning_rate": 0.00016970782123475547, "loss": 4.3379, "step": 687 }, { "epoch": 0.26050738356683073, "grad_norm": 16.255102157592773, "learning_rate": 0.00016962183045119214, "loss": 4.2574, "step": 688 }, { "epoch": 0.2608860280196895, "grad_norm": 13.676608085632324, "learning_rate": 0.00016953573964204638, "loss": 4.7991, "step": 689 }, { "epoch": 0.26126467247254825, "grad_norm": 16.22185707092285, "learning_rate": 0.00016944954893100475, "loss": 3.9156, "step": 690 }, { "epoch": 0.26164331692540704, "grad_norm": 11.445201873779297, "learning_rate": 0.0001693632584418973, "loss": 3.3147, "step": 691 }, { "epoch": 0.26202196137826583, "grad_norm": 12.272850036621094, "learning_rate": 0.0001692768682986975, "loss": 3.7517, "step": 692 }, { "epoch": 0.26240060583112457, "grad_norm": 13.47509479522705, "learning_rate": 0.0001691903786255219, "loss": 3.0187, "step": 693 }, { "epoch": 0.26277925028398336, "grad_norm": 12.622843742370605, "learning_rate": 0.00016910378954663013, "loss": 3.404, "step": 694 }, { "epoch": 0.2631578947368421, "grad_norm": 17.651247024536133, "learning_rate": 0.00016901710118642454, "loss": 3.9933, "step": 695 }, { "epoch": 0.2635365391897009, "grad_norm": 17.895601272583008, "learning_rate": 0.0001689303136694502, "loss": 2.997, "step": 696 }, { "epoch": 0.2639151836425596, "grad_norm": 17.362606048583984, "learning_rate": 0.0001688434271203946, "loss": 3.9413, "step": 697 }, { "epoch": 0.2642938280954184, "grad_norm": 18.201135635375977, "learning_rate": 0.00016875644166408754, "loss": 3.0196, "step": 698 }, { "epoch": 0.2646724725482772, "grad_norm": 34.7093391418457, "learning_rate": 0.00016866935742550083, "loss": 3.1038, "step": 699 }, { "epoch": 0.2650511170011359, "grad_norm": 31.69573974609375, "learning_rate": 0.00016858217452974837, "loss": 3.6338, "step": 700 }, { "epoch": 0.2654297614539947, "grad_norm": 8.198531150817871, "learning_rate": 0.0001684948931020856, "loss": 5.0433, "step": 701 }, { "epoch": 0.26580840590685345, "grad_norm": 8.688183784484863, "learning_rate": 0.0001684075132679097, "loss": 5.2619, "step": 702 }, { "epoch": 0.26618705035971224, "grad_norm": 11.000321388244629, "learning_rate": 0.00016832003515275914, "loss": 4.5282, "step": 703 }, { "epoch": 0.266565694812571, "grad_norm": 10.947884559631348, "learning_rate": 0.00016823245888231356, "loss": 4.5732, "step": 704 }, { "epoch": 0.26694433926542976, "grad_norm": 11.00478744506836, "learning_rate": 0.0001681447845823937, "loss": 4.2352, "step": 705 }, { "epoch": 0.26732298371828855, "grad_norm": 12.548285484313965, "learning_rate": 0.00016805701237896105, "loss": 4.8917, "step": 706 }, { "epoch": 0.2677016281711473, "grad_norm": 9.91434097290039, "learning_rate": 0.00016796914239811786, "loss": 3.5194, "step": 707 }, { "epoch": 0.2680802726240061, "grad_norm": 10.20582389831543, "learning_rate": 0.00016788117476610677, "loss": 3.5162, "step": 708 }, { "epoch": 0.2684589170768648, "grad_norm": 11.576292991638184, "learning_rate": 0.00016779310960931073, "loss": 4.2913, "step": 709 }, { "epoch": 0.2688375615297236, "grad_norm": 12.707137107849121, "learning_rate": 0.0001677049470542529, "loss": 3.8916, "step": 710 }, { "epoch": 0.26921620598258234, "grad_norm": 13.900711059570312, "learning_rate": 0.00016761668722759622, "loss": 4.5028, "step": 711 }, { "epoch": 0.2695948504354411, "grad_norm": 15.244569778442383, "learning_rate": 0.0001675283302561435, "loss": 4.7703, "step": 712 }, { "epoch": 0.2699734948882999, "grad_norm": 12.6697998046875, "learning_rate": 0.00016743987626683703, "loss": 3.6493, "step": 713 }, { "epoch": 0.27035213934115865, "grad_norm": 14.035780906677246, "learning_rate": 0.00016735132538675854, "loss": 3.7715, "step": 714 }, { "epoch": 0.27073078379401744, "grad_norm": 14.34146785736084, "learning_rate": 0.00016726267774312898, "loss": 4.4825, "step": 715 }, { "epoch": 0.27110942824687617, "grad_norm": 14.432804107666016, "learning_rate": 0.00016717393346330828, "loss": 3.3871, "step": 716 }, { "epoch": 0.27148807269973496, "grad_norm": 13.750441551208496, "learning_rate": 0.0001670850926747952, "loss": 3.1007, "step": 717 }, { "epoch": 0.2718667171525937, "grad_norm": 16.201181411743164, "learning_rate": 0.00016699615550522717, "loss": 2.6202, "step": 718 }, { "epoch": 0.2722453616054525, "grad_norm": 18.666667938232422, "learning_rate": 0.0001669071220823801, "loss": 3.7933, "step": 719 }, { "epoch": 0.2726240060583112, "grad_norm": 17.479516983032227, "learning_rate": 0.0001668179925341682, "loss": 3.846, "step": 720 }, { "epoch": 0.27300265051117, "grad_norm": 21.877872467041016, "learning_rate": 0.0001667287669886437, "loss": 3.2863, "step": 721 }, { "epoch": 0.2733812949640288, "grad_norm": 18.93062973022461, "learning_rate": 0.00016663944557399692, "loss": 2.6771, "step": 722 }, { "epoch": 0.27375993941688753, "grad_norm": 17.544601440429688, "learning_rate": 0.00016655002841855566, "loss": 2.4239, "step": 723 }, { "epoch": 0.2741385838697463, "grad_norm": 27.646818161010742, "learning_rate": 0.00016646051565078558, "loss": 2.6222, "step": 724 }, { "epoch": 0.27451722832260506, "grad_norm": 34.713478088378906, "learning_rate": 0.0001663709073992894, "loss": 2.8671, "step": 725 }, { "epoch": 0.27489587277546385, "grad_norm": 8.818018913269043, "learning_rate": 0.00016628120379280728, "loss": 4.8852, "step": 726 }, { "epoch": 0.2752745172283226, "grad_norm": 9.805505752563477, "learning_rate": 0.00016619140496021615, "loss": 4.155, "step": 727 }, { "epoch": 0.27565316168118137, "grad_norm": 8.863174438476562, "learning_rate": 0.00016610151103052995, "loss": 3.8106, "step": 728 }, { "epoch": 0.27603180613404016, "grad_norm": 9.454705238342285, "learning_rate": 0.00016601152213289913, "loss": 4.0096, "step": 729 }, { "epoch": 0.2764104505868989, "grad_norm": 10.974586486816406, "learning_rate": 0.00016592143839661057, "loss": 4.3561, "step": 730 }, { "epoch": 0.2767890950397577, "grad_norm": 12.553147315979004, "learning_rate": 0.0001658312599510875, "loss": 3.901, "step": 731 }, { "epoch": 0.2771677394926164, "grad_norm": 11.170998573303223, "learning_rate": 0.00016574098692588915, "loss": 4.3408, "step": 732 }, { "epoch": 0.2775463839454752, "grad_norm": 13.828832626342773, "learning_rate": 0.0001656506194507106, "loss": 4.7403, "step": 733 }, { "epoch": 0.27792502839833394, "grad_norm": 13.855401992797852, "learning_rate": 0.00016556015765538273, "loss": 4.6504, "step": 734 }, { "epoch": 0.27830367285119273, "grad_norm": 11.40543270111084, "learning_rate": 0.0001654696016698718, "loss": 3.4119, "step": 735 }, { "epoch": 0.2786823173040515, "grad_norm": 12.30098819732666, "learning_rate": 0.00016537895162427955, "loss": 3.635, "step": 736 }, { "epoch": 0.27906096175691025, "grad_norm": 12.096563339233398, "learning_rate": 0.0001652882076488427, "loss": 4.2051, "step": 737 }, { "epoch": 0.27943960620976904, "grad_norm": 11.935840606689453, "learning_rate": 0.00016519736987393303, "loss": 3.8025, "step": 738 }, { "epoch": 0.2798182506626278, "grad_norm": 13.400490760803223, "learning_rate": 0.000165106438430057, "loss": 4.2775, "step": 739 }, { "epoch": 0.28019689511548657, "grad_norm": 10.693985939025879, "learning_rate": 0.00016501541344785572, "loss": 2.8859, "step": 740 }, { "epoch": 0.2805755395683453, "grad_norm": 13.22080135345459, "learning_rate": 0.0001649242950581046, "loss": 2.4878, "step": 741 }, { "epoch": 0.2809541840212041, "grad_norm": 14.28111743927002, "learning_rate": 0.00016483308339171335, "loss": 3.9025, "step": 742 }, { "epoch": 0.2813328284740629, "grad_norm": 17.349239349365234, "learning_rate": 0.0001647417785797256, "loss": 3.6947, "step": 743 }, { "epoch": 0.2817114729269216, "grad_norm": 15.542529106140137, "learning_rate": 0.0001646503807533189, "loss": 3.1008, "step": 744 }, { "epoch": 0.2820901173797804, "grad_norm": 13.73243522644043, "learning_rate": 0.0001645588900438043, "loss": 2.3237, "step": 745 }, { "epoch": 0.28246876183263914, "grad_norm": 18.583194732666016, "learning_rate": 0.0001644673065826264, "loss": 3.3123, "step": 746 }, { "epoch": 0.2828474062854979, "grad_norm": 20.04288673400879, "learning_rate": 0.00016437563050136303, "loss": 2.8265, "step": 747 }, { "epoch": 0.28322605073835666, "grad_norm": 15.773954391479492, "learning_rate": 0.00016428386193172506, "loss": 2.1103, "step": 748 }, { "epoch": 0.28360469519121545, "grad_norm": 19.099443435668945, "learning_rate": 0.0001641920010055563, "loss": 1.9673, "step": 749 }, { "epoch": 0.28398333964407424, "grad_norm": 40.27873611450195, "learning_rate": 0.00016410004785483316, "loss": 5.3713, "step": 750 }, { "epoch": 0.284361984096933, "grad_norm": 10.12539005279541, "learning_rate": 0.00016400800261166465, "loss": 4.9746, "step": 751 }, { "epoch": 0.28474062854979176, "grad_norm": 10.15647029876709, "learning_rate": 0.000163915865408292, "loss": 4.8416, "step": 752 }, { "epoch": 0.2851192730026505, "grad_norm": 9.944365501403809, "learning_rate": 0.00016382363637708865, "loss": 4.3926, "step": 753 }, { "epoch": 0.2854979174555093, "grad_norm": 9.265400886535645, "learning_rate": 0.0001637313156505598, "loss": 3.6671, "step": 754 }, { "epoch": 0.285876561908368, "grad_norm": 10.70794677734375, "learning_rate": 0.00016363890336134262, "loss": 4.5764, "step": 755 }, { "epoch": 0.2862552063612268, "grad_norm": 11.477482795715332, "learning_rate": 0.00016354639964220568, "loss": 4.5665, "step": 756 }, { "epoch": 0.2866338508140856, "grad_norm": 10.951593399047852, "learning_rate": 0.0001634538046260489, "loss": 4.2272, "step": 757 }, { "epoch": 0.28701249526694433, "grad_norm": 11.813931465148926, "learning_rate": 0.00016336111844590345, "loss": 3.8581, "step": 758 }, { "epoch": 0.2873911397198031, "grad_norm": 12.866728782653809, "learning_rate": 0.0001632683412349314, "loss": 4.2478, "step": 759 }, { "epoch": 0.28776978417266186, "grad_norm": 11.82854175567627, "learning_rate": 0.00016317547312642562, "loss": 4.416, "step": 760 }, { "epoch": 0.28814842862552065, "grad_norm": 12.294820785522461, "learning_rate": 0.00016308251425380962, "loss": 4.3508, "step": 761 }, { "epoch": 0.2885270730783794, "grad_norm": 11.736769676208496, "learning_rate": 0.00016298946475063733, "loss": 3.5181, "step": 762 }, { "epoch": 0.28890571753123817, "grad_norm": 10.93974781036377, "learning_rate": 0.0001628963247505927, "loss": 2.8494, "step": 763 }, { "epoch": 0.28928436198409696, "grad_norm": 15.365312576293945, "learning_rate": 0.00016280309438748992, "loss": 3.8264, "step": 764 }, { "epoch": 0.2896630064369557, "grad_norm": 13.349133491516113, "learning_rate": 0.00016270977379527292, "loss": 4.0294, "step": 765 }, { "epoch": 0.2900416508898145, "grad_norm": 13.878774642944336, "learning_rate": 0.00016261636310801523, "loss": 3.6898, "step": 766 }, { "epoch": 0.2904202953426732, "grad_norm": 13.974386215209961, "learning_rate": 0.00016252286245991987, "loss": 3.1476, "step": 767 }, { "epoch": 0.290798939795532, "grad_norm": 12.599011421203613, "learning_rate": 0.0001624292719853191, "loss": 3.7, "step": 768 }, { "epoch": 0.29117758424839074, "grad_norm": 14.90402603149414, "learning_rate": 0.00016233559181867414, "loss": 2.84, "step": 769 }, { "epoch": 0.29155622870124953, "grad_norm": 15.247842788696289, "learning_rate": 0.00016224182209457523, "loss": 2.9135, "step": 770 }, { "epoch": 0.29193487315410827, "grad_norm": 15.978056907653809, "learning_rate": 0.00016214796294774115, "loss": 3.8344, "step": 771 }, { "epoch": 0.29231351760696705, "grad_norm": 15.876410484313965, "learning_rate": 0.00016205401451301925, "loss": 2.1739, "step": 772 }, { "epoch": 0.29269216205982584, "grad_norm": 16.77569007873535, "learning_rate": 0.00016195997692538506, "loss": 2.1749, "step": 773 }, { "epoch": 0.2930708065126846, "grad_norm": 23.31680679321289, "learning_rate": 0.00016186585031994225, "loss": 2.8665, "step": 774 }, { "epoch": 0.29344945096554337, "grad_norm": 10.028854370117188, "learning_rate": 0.0001617716348319224, "loss": 0.8782, "step": 775 }, { "epoch": 0.2938280954184021, "grad_norm": 8.688515663146973, "learning_rate": 0.00016167733059668478, "loss": 3.93, "step": 776 }, { "epoch": 0.2942067398712609, "grad_norm": 9.398271560668945, "learning_rate": 0.00016158293774971608, "loss": 4.4695, "step": 777 }, { "epoch": 0.2945853843241196, "grad_norm": 10.657846450805664, "learning_rate": 0.00016148845642663043, "loss": 4.401, "step": 778 }, { "epoch": 0.2949640287769784, "grad_norm": 10.177902221679688, "learning_rate": 0.000161393886763169, "loss": 3.8614, "step": 779 }, { "epoch": 0.2953426732298372, "grad_norm": 10.739095687866211, "learning_rate": 0.0001612992288951998, "loss": 3.9037, "step": 780 }, { "epoch": 0.29572131768269594, "grad_norm": 11.997400283813477, "learning_rate": 0.00016120448295871783, "loss": 3.6965, "step": 781 }, { "epoch": 0.29609996213555473, "grad_norm": 12.047724723815918, "learning_rate": 0.00016110964908984428, "loss": 4.1741, "step": 782 }, { "epoch": 0.29647860658841346, "grad_norm": 11.252506256103516, "learning_rate": 0.00016101472742482685, "loss": 4.2626, "step": 783 }, { "epoch": 0.29685725104127225, "grad_norm": 10.244424819946289, "learning_rate": 0.00016091971810003946, "loss": 3.8371, "step": 784 }, { "epoch": 0.297235895494131, "grad_norm": 11.887914657592773, "learning_rate": 0.00016082462125198177, "loss": 3.7736, "step": 785 }, { "epoch": 0.2976145399469898, "grad_norm": 11.956177711486816, "learning_rate": 0.00016072943701727932, "loss": 4.0997, "step": 786 }, { "epoch": 0.29799318439984857, "grad_norm": 11.499533653259277, "learning_rate": 0.00016063416553268315, "loss": 3.995, "step": 787 }, { "epoch": 0.2983718288527073, "grad_norm": 14.390945434570312, "learning_rate": 0.00016053880693506968, "loss": 4.1593, "step": 788 }, { "epoch": 0.2987504733055661, "grad_norm": 12.83646297454834, "learning_rate": 0.00016044336136144044, "loss": 3.2662, "step": 789 }, { "epoch": 0.2991291177584248, "grad_norm": 12.761232376098633, "learning_rate": 0.00016034782894892198, "loss": 2.7353, "step": 790 }, { "epoch": 0.2995077622112836, "grad_norm": 13.886045455932617, "learning_rate": 0.00016025220983476555, "loss": 3.6852, "step": 791 }, { "epoch": 0.29988640666414235, "grad_norm": 16.431631088256836, "learning_rate": 0.00016015650415634704, "loss": 4.5693, "step": 792 }, { "epoch": 0.30026505111700114, "grad_norm": 15.884831428527832, "learning_rate": 0.00016006071205116657, "loss": 4.0334, "step": 793 }, { "epoch": 0.3006436955698599, "grad_norm": 16.197486877441406, "learning_rate": 0.00015996483365684862, "loss": 3.0299, "step": 794 }, { "epoch": 0.30102234002271866, "grad_norm": 13.327005386352539, "learning_rate": 0.00015986886911114145, "loss": 2.5927, "step": 795 }, { "epoch": 0.30140098447557745, "grad_norm": 13.829025268554688, "learning_rate": 0.00015977281855191725, "loss": 2.6192, "step": 796 }, { "epoch": 0.3017796289284362, "grad_norm": 15.983011245727539, "learning_rate": 0.00015967668211717167, "loss": 2.3621, "step": 797 }, { "epoch": 0.302158273381295, "grad_norm": 19.83639144897461, "learning_rate": 0.00015958045994502384, "loss": 2.7834, "step": 798 }, { "epoch": 0.3025369178341537, "grad_norm": 19.925039291381836, "learning_rate": 0.00015948415217371595, "loss": 2.8116, "step": 799 }, { "epoch": 0.3029155622870125, "grad_norm": 21.867938995361328, "learning_rate": 0.0001593877589416133, "loss": 1.7513, "step": 800 }, { "epoch": 0.3032942067398713, "grad_norm": 8.560530662536621, "learning_rate": 0.00015929128038720384, "loss": 5.1137, "step": 801 }, { "epoch": 0.30367285119273, "grad_norm": 8.668681144714355, "learning_rate": 0.00015919471664909823, "loss": 3.8616, "step": 802 }, { "epoch": 0.3040514956455888, "grad_norm": 10.437018394470215, "learning_rate": 0.0001590980678660294, "loss": 3.7993, "step": 803 }, { "epoch": 0.30443014009844754, "grad_norm": 10.498896598815918, "learning_rate": 0.0001590013341768526, "loss": 3.8712, "step": 804 }, { "epoch": 0.30480878455130633, "grad_norm": 9.216273307800293, "learning_rate": 0.00015890451572054482, "loss": 4.0495, "step": 805 }, { "epoch": 0.30518742900416507, "grad_norm": 10.508468627929688, "learning_rate": 0.00015880761263620515, "loss": 3.4153, "step": 806 }, { "epoch": 0.30556607345702386, "grad_norm": 13.808286666870117, "learning_rate": 0.00015871062506305408, "loss": 3.4353, "step": 807 }, { "epoch": 0.30594471790988265, "grad_norm": 12.350955963134766, "learning_rate": 0.00015861355314043343, "loss": 3.5035, "step": 808 }, { "epoch": 0.3063233623627414, "grad_norm": 12.85565185546875, "learning_rate": 0.00015851639700780642, "loss": 3.8184, "step": 809 }, { "epoch": 0.30670200681560017, "grad_norm": 13.963553428649902, "learning_rate": 0.000158419156804757, "loss": 4.7287, "step": 810 }, { "epoch": 0.3070806512684589, "grad_norm": 15.577609062194824, "learning_rate": 0.0001583218326709901, "loss": 3.6594, "step": 811 }, { "epoch": 0.3074592957213177, "grad_norm": 11.10647201538086, "learning_rate": 0.00015822442474633115, "loss": 2.9355, "step": 812 }, { "epoch": 0.3078379401741764, "grad_norm": 13.10251522064209, "learning_rate": 0.00015812693317072596, "loss": 4.3878, "step": 813 }, { "epoch": 0.3082165846270352, "grad_norm": 12.302017211914062, "learning_rate": 0.00015802935808424055, "loss": 2.902, "step": 814 }, { "epoch": 0.308595229079894, "grad_norm": 13.663749694824219, "learning_rate": 0.00015793169962706092, "loss": 2.7841, "step": 815 }, { "epoch": 0.30897387353275274, "grad_norm": 13.366521835327148, "learning_rate": 0.00015783395793949278, "loss": 3.4101, "step": 816 }, { "epoch": 0.30935251798561153, "grad_norm": 16.41577911376953, "learning_rate": 0.00015773613316196147, "loss": 3.334, "step": 817 }, { "epoch": 0.30973116243847026, "grad_norm": 15.605032920837402, "learning_rate": 0.0001576382254350118, "loss": 3.7084, "step": 818 }, { "epoch": 0.31010980689132905, "grad_norm": 14.417840003967285, "learning_rate": 0.00015754023489930754, "loss": 3.0134, "step": 819 }, { "epoch": 0.3104884513441878, "grad_norm": 17.02623176574707, "learning_rate": 0.00015744216169563164, "loss": 3.0973, "step": 820 }, { "epoch": 0.3108670957970466, "grad_norm": 14.048128128051758, "learning_rate": 0.00015734400596488567, "loss": 2.4681, "step": 821 }, { "epoch": 0.3112457402499053, "grad_norm": 22.928178787231445, "learning_rate": 0.00015724576784808986, "loss": 4.2287, "step": 822 }, { "epoch": 0.3116243847027641, "grad_norm": 16.560827255249023, "learning_rate": 0.00015714744748638278, "loss": 2.381, "step": 823 }, { "epoch": 0.3120030291556229, "grad_norm": 16.747251510620117, "learning_rate": 0.0001570490450210211, "loss": 1.6694, "step": 824 }, { "epoch": 0.3123816736084816, "grad_norm": 26.673425674438477, "learning_rate": 0.00015695056059337952, "loss": 1.5667, "step": 825 }, { "epoch": 0.3127603180613404, "grad_norm": 7.984857082366943, "learning_rate": 0.00015685199434495051, "loss": 4.4119, "step": 826 }, { "epoch": 0.31313896251419915, "grad_norm": 9.042461395263672, "learning_rate": 0.00015675334641734398, "loss": 4.3624, "step": 827 }, { "epoch": 0.31351760696705794, "grad_norm": 10.055127143859863, "learning_rate": 0.00015665461695228735, "loss": 4.276, "step": 828 }, { "epoch": 0.31389625141991667, "grad_norm": 9.659919738769531, "learning_rate": 0.00015655580609162504, "loss": 3.5357, "step": 829 }, { "epoch": 0.31427489587277546, "grad_norm": 10.656012535095215, "learning_rate": 0.00015645691397731852, "loss": 4.0171, "step": 830 }, { "epoch": 0.31465354032563425, "grad_norm": 11.442161560058594, "learning_rate": 0.00015635794075144588, "loss": 3.8396, "step": 831 }, { "epoch": 0.315032184778493, "grad_norm": 12.612800598144531, "learning_rate": 0.00015625888655620187, "loss": 4.2947, "step": 832 }, { "epoch": 0.3154108292313518, "grad_norm": 12.016472816467285, "learning_rate": 0.00015615975153389746, "loss": 3.9577, "step": 833 }, { "epoch": 0.3157894736842105, "grad_norm": 10.963457107543945, "learning_rate": 0.00015606053582695984, "loss": 4.1569, "step": 834 }, { "epoch": 0.3161681181370693, "grad_norm": 12.133650779724121, "learning_rate": 0.00015596123957793202, "loss": 3.681, "step": 835 }, { "epoch": 0.31654676258992803, "grad_norm": 12.980992317199707, "learning_rate": 0.0001558618629294728, "loss": 3.614, "step": 836 }, { "epoch": 0.3169254070427868, "grad_norm": 11.19620132446289, "learning_rate": 0.0001557624060243565, "loss": 3.6321, "step": 837 }, { "epoch": 0.3173040514956456, "grad_norm": 14.250601768493652, "learning_rate": 0.00015566286900547266, "loss": 4.1902, "step": 838 }, { "epoch": 0.31768269594850435, "grad_norm": 12.371217727661133, "learning_rate": 0.000155563252015826, "loss": 2.7028, "step": 839 }, { "epoch": 0.31806134040136314, "grad_norm": 12.687495231628418, "learning_rate": 0.00015546355519853607, "loss": 2.4365, "step": 840 }, { "epoch": 0.31843998485422187, "grad_norm": 12.307214736938477, "learning_rate": 0.00015536377869683718, "loss": 2.7681, "step": 841 }, { "epoch": 0.31881862930708066, "grad_norm": 15.518838882446289, "learning_rate": 0.0001552639226540781, "loss": 3.1019, "step": 842 }, { "epoch": 0.3191972737599394, "grad_norm": 14.274090766906738, "learning_rate": 0.00015516398721372179, "loss": 2.8421, "step": 843 }, { "epoch": 0.3195759182127982, "grad_norm": 19.139890670776367, "learning_rate": 0.00015506397251934543, "loss": 2.5628, "step": 844 }, { "epoch": 0.31995456266565697, "grad_norm": 17.884008407592773, "learning_rate": 0.00015496387871463988, "loss": 2.3613, "step": 845 }, { "epoch": 0.3203332071185157, "grad_norm": 16.46691131591797, "learning_rate": 0.0001548637059434099, "loss": 2.4046, "step": 846 }, { "epoch": 0.3207118515713745, "grad_norm": 16.158769607543945, "learning_rate": 0.00015476345434957346, "loss": 2.9732, "step": 847 }, { "epoch": 0.32109049602423323, "grad_norm": 25.788095474243164, "learning_rate": 0.00015466312407716194, "loss": 3.0837, "step": 848 }, { "epoch": 0.321469140477092, "grad_norm": 27.709606170654297, "learning_rate": 0.00015456271527031966, "loss": 2.3595, "step": 849 }, { "epoch": 0.32184778492995075, "grad_norm": 27.167621612548828, "learning_rate": 0.00015446222807330383, "loss": 2.2286, "step": 850 }, { "epoch": 0.32222642938280954, "grad_norm": 8.955855369567871, "learning_rate": 0.00015436166263048425, "loss": 4.3385, "step": 851 }, { "epoch": 0.32260507383566833, "grad_norm": 8.619714736938477, "learning_rate": 0.00015426101908634312, "loss": 3.7368, "step": 852 }, { "epoch": 0.32298371828852707, "grad_norm": 9.597879409790039, "learning_rate": 0.00015416029758547493, "loss": 3.8133, "step": 853 }, { "epoch": 0.32336236274138586, "grad_norm": 10.818007469177246, "learning_rate": 0.00015405949827258604, "loss": 4.1761, "step": 854 }, { "epoch": 0.3237410071942446, "grad_norm": 10.386642456054688, "learning_rate": 0.00015395862129249474, "loss": 3.6592, "step": 855 }, { "epoch": 0.3241196516471034, "grad_norm": 11.960341453552246, "learning_rate": 0.00015385766679013081, "loss": 3.6471, "step": 856 }, { "epoch": 0.3244982960999621, "grad_norm": 13.14782428741455, "learning_rate": 0.00015375663491053545, "loss": 3.9707, "step": 857 }, { "epoch": 0.3248769405528209, "grad_norm": 12.082589149475098, "learning_rate": 0.000153655525798861, "loss": 3.5612, "step": 858 }, { "epoch": 0.3252555850056797, "grad_norm": 11.448456764221191, "learning_rate": 0.00015355433960037077, "loss": 3.737, "step": 859 }, { "epoch": 0.3256342294585384, "grad_norm": 12.987861633300781, "learning_rate": 0.0001534530764604389, "loss": 3.8811, "step": 860 }, { "epoch": 0.3260128739113972, "grad_norm": 12.712824821472168, "learning_rate": 0.00015335173652454985, "loss": 3.5249, "step": 861 }, { "epoch": 0.32639151836425595, "grad_norm": 11.121883392333984, "learning_rate": 0.00015325031993829868, "loss": 2.6656, "step": 862 }, { "epoch": 0.32677016281711474, "grad_norm": 14.241087913513184, "learning_rate": 0.0001531488268473904, "loss": 3.9731, "step": 863 }, { "epoch": 0.3271488072699735, "grad_norm": 13.581354141235352, "learning_rate": 0.00015304725739764, "loss": 3.2629, "step": 864 }, { "epoch": 0.32752745172283226, "grad_norm": 15.62415599822998, "learning_rate": 0.00015294561173497215, "loss": 3.9048, "step": 865 }, { "epoch": 0.32790609617569105, "grad_norm": 12.98635196685791, "learning_rate": 0.00015284389000542103, "loss": 2.6195, "step": 866 }, { "epoch": 0.3282847406285498, "grad_norm": 15.516901016235352, "learning_rate": 0.00015274209235513014, "loss": 3.2572, "step": 867 }, { "epoch": 0.3286633850814086, "grad_norm": 13.609155654907227, "learning_rate": 0.00015264021893035193, "loss": 2.7172, "step": 868 }, { "epoch": 0.3290420295342673, "grad_norm": 15.977977752685547, "learning_rate": 0.00015253826987744789, "loss": 3.2585, "step": 869 }, { "epoch": 0.3294206739871261, "grad_norm": 14.53819751739502, "learning_rate": 0.00015243624534288803, "loss": 2.9884, "step": 870 }, { "epoch": 0.32979931843998483, "grad_norm": 18.142704010009766, "learning_rate": 0.00015233414547325083, "loss": 3.0888, "step": 871 }, { "epoch": 0.3301779628928436, "grad_norm": 17.9478816986084, "learning_rate": 0.00015223197041522307, "loss": 2.0567, "step": 872 }, { "epoch": 0.33055660734570236, "grad_norm": 15.515186309814453, "learning_rate": 0.00015212972031559946, "loss": 2.056, "step": 873 }, { "epoch": 0.33093525179856115, "grad_norm": 20.402446746826172, "learning_rate": 0.00015202739532128265, "loss": 1.867, "step": 874 }, { "epoch": 0.33131389625141994, "grad_norm": 13.986373901367188, "learning_rate": 0.0001519249955792827, "loss": 1.5481, "step": 875 }, { "epoch": 0.33169254070427867, "grad_norm": 9.035808563232422, "learning_rate": 0.00015182252123671725, "loss": 4.4831, "step": 876 }, { "epoch": 0.33207118515713746, "grad_norm": 9.396247863769531, "learning_rate": 0.000151719972440811, "loss": 4.1913, "step": 877 }, { "epoch": 0.3324498296099962, "grad_norm": 10.645395278930664, "learning_rate": 0.0001516173493388957, "loss": 4.71, "step": 878 }, { "epoch": 0.332828474062855, "grad_norm": 11.150712966918945, "learning_rate": 0.00015151465207840977, "loss": 4.2096, "step": 879 }, { "epoch": 0.3332071185157137, "grad_norm": 10.260977745056152, "learning_rate": 0.00015141188080689826, "loss": 3.1771, "step": 880 }, { "epoch": 0.3335857629685725, "grad_norm": 10.818496704101562, "learning_rate": 0.00015130903567201243, "loss": 2.9112, "step": 881 }, { "epoch": 0.3339644074214313, "grad_norm": 11.379049301147461, "learning_rate": 0.0001512061168215098, "loss": 3.8058, "step": 882 }, { "epoch": 0.33434305187429003, "grad_norm": 12.107205390930176, "learning_rate": 0.00015110312440325368, "loss": 3.271, "step": 883 }, { "epoch": 0.3347216963271488, "grad_norm": 12.379898071289062, "learning_rate": 0.0001510000585652132, "loss": 2.992, "step": 884 }, { "epoch": 0.33510034078000756, "grad_norm": 11.953714370727539, "learning_rate": 0.00015089691945546283, "loss": 3.1566, "step": 885 }, { "epoch": 0.33547898523286634, "grad_norm": 13.055462837219238, "learning_rate": 0.00015079370722218243, "loss": 2.5646, "step": 886 }, { "epoch": 0.3358576296857251, "grad_norm": 12.182693481445312, "learning_rate": 0.00015069042201365683, "loss": 2.9366, "step": 887 }, { "epoch": 0.33623627413858387, "grad_norm": 13.180964469909668, "learning_rate": 0.00015058706397827573, "loss": 4.0075, "step": 888 }, { "epoch": 0.33661491859144266, "grad_norm": 12.289628982543945, "learning_rate": 0.0001504836332645335, "loss": 2.5069, "step": 889 }, { "epoch": 0.3369935630443014, "grad_norm": 11.804617881774902, "learning_rate": 0.00015038013002102892, "loss": 2.0101, "step": 890 }, { "epoch": 0.3373722074971602, "grad_norm": 14.811490058898926, "learning_rate": 0.00015027655439646488, "loss": 3.8222, "step": 891 }, { "epoch": 0.3377508519500189, "grad_norm": 15.269726753234863, "learning_rate": 0.00015017290653964835, "loss": 2.9604, "step": 892 }, { "epoch": 0.3381294964028777, "grad_norm": 13.442567825317383, "learning_rate": 0.0001500691865994901, "loss": 3.0957, "step": 893 }, { "epoch": 0.33850814085573644, "grad_norm": 15.218294143676758, "learning_rate": 0.00014996539472500437, "loss": 2.7899, "step": 894 }, { "epoch": 0.33888678530859523, "grad_norm": 13.601509094238281, "learning_rate": 0.00014986153106530883, "loss": 2.6892, "step": 895 }, { "epoch": 0.339265429761454, "grad_norm": 13.653763771057129, "learning_rate": 0.00014975759576962424, "loss": 2.2024, "step": 896 }, { "epoch": 0.33964407421431275, "grad_norm": 17.351696014404297, "learning_rate": 0.00014965358898727423, "loss": 2.777, "step": 897 }, { "epoch": 0.34002271866717154, "grad_norm": 32.49483108520508, "learning_rate": 0.00014954951086768525, "loss": 2.3369, "step": 898 }, { "epoch": 0.3404013631200303, "grad_norm": 37.68558120727539, "learning_rate": 0.0001494453615603862, "loss": 3.0663, "step": 899 }, { "epoch": 0.34078000757288907, "grad_norm": 27.460304260253906, "learning_rate": 0.00014934114121500818, "loss": 2.0837, "step": 900 }, { "epoch": 0.3411586520257478, "grad_norm": 19.336395263671875, "learning_rate": 0.00014923684998128446, "loss": 4.6271, "step": 901 }, { "epoch": 0.3415372964786066, "grad_norm": 11.99440860748291, "learning_rate": 0.00014913248800905006, "loss": 4.4893, "step": 902 }, { "epoch": 0.3419159409314654, "grad_norm": 10.598093032836914, "learning_rate": 0.00014902805544824175, "loss": 3.813, "step": 903 }, { "epoch": 0.3422945853843241, "grad_norm": 10.407685279846191, "learning_rate": 0.00014892355244889752, "loss": 4.3924, "step": 904 }, { "epoch": 0.3426732298371829, "grad_norm": 11.969062805175781, "learning_rate": 0.0001488189791611568, "loss": 3.9199, "step": 905 }, { "epoch": 0.34305187429004164, "grad_norm": 10.909595489501953, "learning_rate": 0.00014871433573525976, "loss": 3.5213, "step": 906 }, { "epoch": 0.3434305187429004, "grad_norm": 11.326231956481934, "learning_rate": 0.00014860962232154755, "loss": 3.2244, "step": 907 }, { "epoch": 0.34380916319575916, "grad_norm": 12.978373527526855, "learning_rate": 0.00014850483907046175, "loss": 4.087, "step": 908 }, { "epoch": 0.34418780764861795, "grad_norm": 13.51850414276123, "learning_rate": 0.00014839998613254432, "loss": 3.7443, "step": 909 }, { "epoch": 0.34456645210147674, "grad_norm": 13.952939987182617, "learning_rate": 0.00014829506365843725, "loss": 4.2233, "step": 910 }, { "epoch": 0.3449450965543355, "grad_norm": 14.313178062438965, "learning_rate": 0.00014819007179888262, "loss": 3.744, "step": 911 }, { "epoch": 0.34532374100719426, "grad_norm": 13.837858200073242, "learning_rate": 0.000148085010704722, "loss": 3.4982, "step": 912 }, { "epoch": 0.345702385460053, "grad_norm": 12.670626640319824, "learning_rate": 0.0001479798805268965, "loss": 2.5508, "step": 913 }, { "epoch": 0.3460810299129118, "grad_norm": 14.74666976928711, "learning_rate": 0.00014787468141644658, "loss": 3.6456, "step": 914 }, { "epoch": 0.3464596743657705, "grad_norm": 14.362848281860352, "learning_rate": 0.0001477694135245116, "loss": 3.3422, "step": 915 }, { "epoch": 0.3468383188186293, "grad_norm": 12.029289245605469, "learning_rate": 0.00014766407700232974, "loss": 2.7627, "step": 916 }, { "epoch": 0.3472169632714881, "grad_norm": 13.28024673461914, "learning_rate": 0.00014755867200123789, "loss": 2.4415, "step": 917 }, { "epoch": 0.34759560772434683, "grad_norm": 16.25495719909668, "learning_rate": 0.00014745319867267122, "loss": 3.8264, "step": 918 }, { "epoch": 0.3479742521772056, "grad_norm": 14.264103889465332, "learning_rate": 0.00014734765716816316, "loss": 2.3678, "step": 919 }, { "epoch": 0.34835289663006436, "grad_norm": 16.4278507232666, "learning_rate": 0.00014724204763934498, "loss": 3.2339, "step": 920 }, { "epoch": 0.34873154108292315, "grad_norm": 12.346698760986328, "learning_rate": 0.0001471363702379458, "loss": 2.3282, "step": 921 }, { "epoch": 0.3491101855357819, "grad_norm": 16.423734664916992, "learning_rate": 0.00014703062511579212, "loss": 2.2432, "step": 922 }, { "epoch": 0.34948882998864067, "grad_norm": 36.795833587646484, "learning_rate": 0.00014692481242480784, "loss": 2.8118, "step": 923 }, { "epoch": 0.34986747444149946, "grad_norm": 22.425527572631836, "learning_rate": 0.0001468189323170139, "loss": 2.0988, "step": 924 }, { "epoch": 0.3502461188943582, "grad_norm": 21.815776824951172, "learning_rate": 0.00014671298494452808, "loss": 2.1386, "step": 925 }, { "epoch": 0.350624763347217, "grad_norm": 9.949638366699219, "learning_rate": 0.0001466069704595648, "loss": 4.477, "step": 926 }, { "epoch": 0.3510034078000757, "grad_norm": 10.098043441772461, "learning_rate": 0.000146500889014435, "loss": 3.9642, "step": 927 }, { "epoch": 0.3513820522529345, "grad_norm": 9.761126518249512, "learning_rate": 0.00014639474076154566, "loss": 3.7614, "step": 928 }, { "epoch": 0.35176069670579324, "grad_norm": 11.026824951171875, "learning_rate": 0.00014628852585339984, "loss": 4.2254, "step": 929 }, { "epoch": 0.35213934115865203, "grad_norm": 11.74862289428711, "learning_rate": 0.00014618224444259628, "loss": 3.1092, "step": 930 }, { "epoch": 0.35251798561151076, "grad_norm": 10.165847778320312, "learning_rate": 0.00014607589668182947, "loss": 2.6807, "step": 931 }, { "epoch": 0.35289663006436955, "grad_norm": 12.149169921875, "learning_rate": 0.00014596948272388896, "loss": 2.9791, "step": 932 }, { "epoch": 0.35327527451722834, "grad_norm": 12.490134239196777, "learning_rate": 0.0001458630027216596, "loss": 3.9789, "step": 933 }, { "epoch": 0.3536539189700871, "grad_norm": 13.850975036621094, "learning_rate": 0.000145756456828121, "loss": 3.4066, "step": 934 }, { "epoch": 0.35403256342294587, "grad_norm": 15.180842399597168, "learning_rate": 0.00014564984519634754, "loss": 3.2428, "step": 935 }, { "epoch": 0.3544112078758046, "grad_norm": 13.27072811126709, "learning_rate": 0.00014554316797950797, "loss": 2.6158, "step": 936 }, { "epoch": 0.3547898523286634, "grad_norm": 12.887181282043457, "learning_rate": 0.0001454364253308653, "loss": 3.6556, "step": 937 }, { "epoch": 0.3551684967815221, "grad_norm": 14.38553237915039, "learning_rate": 0.00014532961740377652, "loss": 3.6761, "step": 938 }, { "epoch": 0.3555471412343809, "grad_norm": 13.48885726928711, "learning_rate": 0.00014522274435169245, "loss": 2.8547, "step": 939 }, { "epoch": 0.3559257856872397, "grad_norm": 12.696219444274902, "learning_rate": 0.00014511580632815742, "loss": 2.4686, "step": 940 }, { "epoch": 0.35630443014009844, "grad_norm": 12.52086067199707, "learning_rate": 0.00014500880348680917, "loss": 3.3242, "step": 941 }, { "epoch": 0.3566830745929572, "grad_norm": 13.25282096862793, "learning_rate": 0.00014490173598137845, "loss": 2.3792, "step": 942 }, { "epoch": 0.35706171904581596, "grad_norm": 12.935431480407715, "learning_rate": 0.0001447946039656891, "loss": 2.1999, "step": 943 }, { "epoch": 0.35744036349867475, "grad_norm": 13.861615180969238, "learning_rate": 0.00014468740759365743, "loss": 2.7313, "step": 944 }, { "epoch": 0.3578190079515335, "grad_norm": 15.322652816772461, "learning_rate": 0.00014458014701929239, "loss": 2.6993, "step": 945 }, { "epoch": 0.3581976524043923, "grad_norm": 15.554706573486328, "learning_rate": 0.00014447282239669502, "loss": 2.1881, "step": 946 }, { "epoch": 0.35857629685725106, "grad_norm": 15.744156837463379, "learning_rate": 0.0001443654338800585, "loss": 3.1557, "step": 947 }, { "epoch": 0.3589549413101098, "grad_norm": 15.191664695739746, "learning_rate": 0.00014425798162366775, "loss": 2.1443, "step": 948 }, { "epoch": 0.3593335857629686, "grad_norm": 16.317235946655273, "learning_rate": 0.00014415046578189928, "loss": 1.921, "step": 949 }, { "epoch": 0.3597122302158273, "grad_norm": 35.329994201660156, "learning_rate": 0.0001440428865092209, "loss": 3.1096, "step": 950 }, { "epoch": 0.3600908746686861, "grad_norm": 9.379858016967773, "learning_rate": 0.0001439352439601916, "loss": 4.936, "step": 951 }, { "epoch": 0.36046951912154485, "grad_norm": 10.979476928710938, "learning_rate": 0.0001438275382894613, "loss": 3.8354, "step": 952 }, { "epoch": 0.36084816357440364, "grad_norm": 10.961803436279297, "learning_rate": 0.00014371976965177062, "loss": 3.6228, "step": 953 }, { "epoch": 0.3612268080272624, "grad_norm": 11.461506843566895, "learning_rate": 0.00014361193820195046, "loss": 4.6714, "step": 954 }, { "epoch": 0.36160545248012116, "grad_norm": 11.015750885009766, "learning_rate": 0.0001435040440949223, "loss": 3.3826, "step": 955 }, { "epoch": 0.36198409693297995, "grad_norm": 10.362982749938965, "learning_rate": 0.0001433960874856973, "loss": 3.1965, "step": 956 }, { "epoch": 0.3623627413858387, "grad_norm": 11.998297691345215, "learning_rate": 0.0001432880685293766, "loss": 3.4358, "step": 957 }, { "epoch": 0.36274138583869747, "grad_norm": 12.979171752929688, "learning_rate": 0.00014317998738115091, "loss": 2.9082, "step": 958 }, { "epoch": 0.3631200302915562, "grad_norm": 15.333057403564453, "learning_rate": 0.00014307184419630028, "loss": 3.7046, "step": 959 }, { "epoch": 0.363498674744415, "grad_norm": 17.005517959594727, "learning_rate": 0.0001429636391301938, "loss": 4.5541, "step": 960 }, { "epoch": 0.3638773191972738, "grad_norm": 12.545903205871582, "learning_rate": 0.00014285537233828954, "loss": 3.2909, "step": 961 }, { "epoch": 0.3642559636501325, "grad_norm": 13.042165756225586, "learning_rate": 0.00014274704397613426, "loss": 3.3752, "step": 962 }, { "epoch": 0.3646346081029913, "grad_norm": 13.057799339294434, "learning_rate": 0.00014263865419936316, "loss": 2.7918, "step": 963 }, { "epoch": 0.36501325255585004, "grad_norm": 13.173884391784668, "learning_rate": 0.00014253020316369968, "loss": 3.1801, "step": 964 }, { "epoch": 0.36539189700870883, "grad_norm": 13.131632804870605, "learning_rate": 0.00014242169102495527, "loss": 3.3128, "step": 965 }, { "epoch": 0.36577054146156757, "grad_norm": 13.377184867858887, "learning_rate": 0.0001423131179390291, "loss": 2.649, "step": 966 }, { "epoch": 0.36614918591442636, "grad_norm": 12.528219223022461, "learning_rate": 0.00014220448406190807, "loss": 3.169, "step": 967 }, { "epoch": 0.36652783036728515, "grad_norm": 13.746808052062988, "learning_rate": 0.0001420957895496662, "loss": 2.7259, "step": 968 }, { "epoch": 0.3669064748201439, "grad_norm": 21.110719680786133, "learning_rate": 0.00014198703455846484, "loss": 3.5514, "step": 969 }, { "epoch": 0.36728511927300267, "grad_norm": 12.612068176269531, "learning_rate": 0.00014187821924455208, "loss": 2.0534, "step": 970 }, { "epoch": 0.3676637637258614, "grad_norm": 20.146154403686523, "learning_rate": 0.0001417693437642627, "loss": 2.7005, "step": 971 }, { "epoch": 0.3680424081787202, "grad_norm": 13.088459014892578, "learning_rate": 0.00014166040827401797, "loss": 1.9876, "step": 972 }, { "epoch": 0.3684210526315789, "grad_norm": 18.44115447998047, "learning_rate": 0.00014155141293032536, "loss": 1.6056, "step": 973 }, { "epoch": 0.3687996970844377, "grad_norm": 17.64615249633789, "learning_rate": 0.0001414423578897783, "loss": 1.8792, "step": 974 }, { "epoch": 0.3691783415372965, "grad_norm": 17.28131103515625, "learning_rate": 0.00014133324330905603, "loss": 1.3712, "step": 975 }, { "epoch": 0.36955698599015524, "grad_norm": 9.265350341796875, "learning_rate": 0.0001412240693449233, "loss": 3.5385, "step": 976 }, { "epoch": 0.36993563044301403, "grad_norm": 10.460281372070312, "learning_rate": 0.00014111483615423018, "loss": 3.5476, "step": 977 }, { "epoch": 0.37031427489587276, "grad_norm": 12.234232902526855, "learning_rate": 0.00014100554389391182, "loss": 5.028, "step": 978 }, { "epoch": 0.37069291934873155, "grad_norm": 12.799249649047852, "learning_rate": 0.0001408961927209883, "loss": 4.389, "step": 979 }, { "epoch": 0.3710715638015903, "grad_norm": 10.977117538452148, "learning_rate": 0.00014078678279256423, "loss": 3.5701, "step": 980 }, { "epoch": 0.3714502082544491, "grad_norm": 11.370275497436523, "learning_rate": 0.00014067731426582877, "loss": 3.4377, "step": 981 }, { "epoch": 0.3718288527073078, "grad_norm": 10.520308494567871, "learning_rate": 0.00014056778729805512, "loss": 3.1299, "step": 982 }, { "epoch": 0.3722074971601666, "grad_norm": 12.40962028503418, "learning_rate": 0.00014045820204660055, "loss": 3.2693, "step": 983 }, { "epoch": 0.3725861416130254, "grad_norm": 11.964371681213379, "learning_rate": 0.00014034855866890602, "loss": 3.8952, "step": 984 }, { "epoch": 0.3729647860658841, "grad_norm": 12.887282371520996, "learning_rate": 0.000140238857322496, "loss": 2.8382, "step": 985 }, { "epoch": 0.3733434305187429, "grad_norm": 12.985127449035645, "learning_rate": 0.0001401290981649783, "loss": 3.4678, "step": 986 }, { "epoch": 0.37372207497160165, "grad_norm": 14.884915351867676, "learning_rate": 0.0001400192813540437, "loss": 3.6069, "step": 987 }, { "epoch": 0.37410071942446044, "grad_norm": 14.4747953414917, "learning_rate": 0.00013990940704746585, "loss": 2.9554, "step": 988 }, { "epoch": 0.37447936387731917, "grad_norm": 14.479387283325195, "learning_rate": 0.00013979947540310102, "loss": 2.7698, "step": 989 }, { "epoch": 0.37485800833017796, "grad_norm": 14.599347114562988, "learning_rate": 0.00013968948657888788, "loss": 2.87, "step": 990 }, { "epoch": 0.37523665278303675, "grad_norm": 14.958257675170898, "learning_rate": 0.00013957944073284714, "loss": 2.8528, "step": 991 }, { "epoch": 0.3756152972358955, "grad_norm": 15.495623588562012, "learning_rate": 0.00013946933802308156, "loss": 3.7293, "step": 992 }, { "epoch": 0.3759939416887543, "grad_norm": 10.040778160095215, "learning_rate": 0.00013935917860777555, "loss": 1.5618, "step": 993 }, { "epoch": 0.376372586141613, "grad_norm": 15.657940864562988, "learning_rate": 0.00013924896264519491, "loss": 2.2425, "step": 994 }, { "epoch": 0.3767512305944718, "grad_norm": 13.899797439575195, "learning_rate": 0.00013913869029368682, "loss": 2.3471, "step": 995 }, { "epoch": 0.37712987504733053, "grad_norm": 14.696097373962402, "learning_rate": 0.00013902836171167938, "loss": 2.637, "step": 996 }, { "epoch": 0.3775085195001893, "grad_norm": 18.564838409423828, "learning_rate": 0.00013891797705768155, "loss": 1.3815, "step": 997 }, { "epoch": 0.3778871639530481, "grad_norm": 17.13040542602539, "learning_rate": 0.00013880753649028274, "loss": 2.0306, "step": 998 }, { "epoch": 0.37826580840590684, "grad_norm": 22.14106559753418, "learning_rate": 0.00013869704016815276, "loss": 2.567, "step": 999 }, { "epoch": 0.37864445285876563, "grad_norm": 20.40251922607422, "learning_rate": 0.00013858648825004156, "loss": 2.1573, "step": 1000 }, { "epoch": 0.37902309731162437, "grad_norm": 9.879082679748535, "learning_rate": 0.00013847588089477888, "loss": 5.068, "step": 1001 }, { "epoch": 0.37940174176448316, "grad_norm": 9.758732795715332, "learning_rate": 0.00013836521826127412, "loss": 3.3331, "step": 1002 }, { "epoch": 0.3797803862173419, "grad_norm": 10.761160850524902, "learning_rate": 0.00013825450050851623, "loss": 3.4942, "step": 1003 }, { "epoch": 0.3801590306702007, "grad_norm": 10.23125171661377, "learning_rate": 0.00013814372779557312, "loss": 3.689, "step": 1004 }, { "epoch": 0.38053767512305947, "grad_norm": 12.541388511657715, "learning_rate": 0.00013803290028159185, "loss": 4.2033, "step": 1005 }, { "epoch": 0.3809163195759182, "grad_norm": 10.713353157043457, "learning_rate": 0.00013792201812579816, "loss": 3.4712, "step": 1006 }, { "epoch": 0.381294964028777, "grad_norm": 11.47879695892334, "learning_rate": 0.00013781108148749625, "loss": 3.4701, "step": 1007 }, { "epoch": 0.38167360848163573, "grad_norm": 10.18622875213623, "learning_rate": 0.00013770009052606862, "loss": 2.702, "step": 1008 }, { "epoch": 0.3820522529344945, "grad_norm": 15.202455520629883, "learning_rate": 0.00013758904540097587, "loss": 2.9407, "step": 1009 }, { "epoch": 0.38243089738735325, "grad_norm": 13.018632888793945, "learning_rate": 0.00013747794627175632, "loss": 3.8735, "step": 1010 }, { "epoch": 0.38280954184021204, "grad_norm": 13.541316986083984, "learning_rate": 0.00013736679329802594, "loss": 2.2223, "step": 1011 }, { "epoch": 0.38318818629307083, "grad_norm": 14.50750732421875, "learning_rate": 0.00013725558663947807, "loss": 3.7973, "step": 1012 }, { "epoch": 0.38356683074592957, "grad_norm": 16.93392562866211, "learning_rate": 0.00013714432645588312, "loss": 4.062, "step": 1013 }, { "epoch": 0.38394547519878836, "grad_norm": 13.330880165100098, "learning_rate": 0.00013703301290708843, "loss": 2.7007, "step": 1014 }, { "epoch": 0.3843241196516471, "grad_norm": 13.55959701538086, "learning_rate": 0.00013692164615301808, "loss": 3.2762, "step": 1015 }, { "epoch": 0.3847027641045059, "grad_norm": 13.232234001159668, "learning_rate": 0.00013681022635367245, "loss": 2.4535, "step": 1016 }, { "epoch": 0.3850814085573646, "grad_norm": 15.532430648803711, "learning_rate": 0.00013669875366912823, "loss": 2.5774, "step": 1017 }, { "epoch": 0.3854600530102234, "grad_norm": 15.254117965698242, "learning_rate": 0.00013658722825953806, "loss": 2.6327, "step": 1018 }, { "epoch": 0.3858386974630822, "grad_norm": 16.092777252197266, "learning_rate": 0.00013647565028513037, "loss": 2.2312, "step": 1019 }, { "epoch": 0.3862173419159409, "grad_norm": 16.16512107849121, "learning_rate": 0.00013636401990620896, "loss": 2.8618, "step": 1020 }, { "epoch": 0.3865959863687997, "grad_norm": 12.459589958190918, "learning_rate": 0.00013625233728315318, "loss": 2.3862, "step": 1021 }, { "epoch": 0.38697463082165845, "grad_norm": 14.847145080566406, "learning_rate": 0.0001361406025764172, "loss": 1.8623, "step": 1022 }, { "epoch": 0.38735327527451724, "grad_norm": 16.524620056152344, "learning_rate": 0.00013602881594653016, "loss": 1.6795, "step": 1023 }, { "epoch": 0.387731919727376, "grad_norm": 24.25473976135254, "learning_rate": 0.00013591697755409573, "loss": 3.2906, "step": 1024 }, { "epoch": 0.38811056418023476, "grad_norm": 27.711610794067383, "learning_rate": 0.0001358050875597919, "loss": 1.9261, "step": 1025 }, { "epoch": 0.38848920863309355, "grad_norm": 9.252058029174805, "learning_rate": 0.00013569314612437098, "loss": 4.3016, "step": 1026 }, { "epoch": 0.3888678530859523, "grad_norm": 10.211181640625, "learning_rate": 0.00013558115340865897, "loss": 4.699, "step": 1027 }, { "epoch": 0.3892464975388111, "grad_norm": 9.520386695861816, "learning_rate": 0.0001354691095735557, "loss": 3.8331, "step": 1028 }, { "epoch": 0.3896251419916698, "grad_norm": 11.07938289642334, "learning_rate": 0.00013535701478003439, "loss": 2.8687, "step": 1029 }, { "epoch": 0.3900037864445286, "grad_norm": 11.311447143554688, "learning_rate": 0.0001352448691891414, "loss": 3.0744, "step": 1030 }, { "epoch": 0.39038243089738733, "grad_norm": 11.157035827636719, "learning_rate": 0.00013513267296199618, "loss": 3.2869, "step": 1031 }, { "epoch": 0.3907610753502461, "grad_norm": 10.669620513916016, "learning_rate": 0.0001350204262597909, "loss": 3.5071, "step": 1032 }, { "epoch": 0.39113971980310486, "grad_norm": 11.097278594970703, "learning_rate": 0.00013490812924379022, "loss": 2.2786, "step": 1033 }, { "epoch": 0.39151836425596365, "grad_norm": 12.581409454345703, "learning_rate": 0.0001347957820753311, "loss": 3.7062, "step": 1034 }, { "epoch": 0.39189700870882244, "grad_norm": 10.81505298614502, "learning_rate": 0.00013468338491582252, "loss": 3.2618, "step": 1035 }, { "epoch": 0.39227565316168117, "grad_norm": 12.934078216552734, "learning_rate": 0.00013457093792674537, "loss": 3.0491, "step": 1036 }, { "epoch": 0.39265429761453996, "grad_norm": 12.945857048034668, "learning_rate": 0.00013445844126965206, "loss": 2.3032, "step": 1037 }, { "epoch": 0.3930329420673987, "grad_norm": 13.578465461730957, "learning_rate": 0.00013434589510616634, "loss": 2.4166, "step": 1038 }, { "epoch": 0.3934115865202575, "grad_norm": 15.570049285888672, "learning_rate": 0.00013423329959798315, "loss": 3.2948, "step": 1039 }, { "epoch": 0.3937902309731162, "grad_norm": 15.420329093933105, "learning_rate": 0.0001341206549068683, "loss": 3.0431, "step": 1040 }, { "epoch": 0.394168875425975, "grad_norm": 16.096786499023438, "learning_rate": 0.00013400796119465824, "loss": 2.3038, "step": 1041 }, { "epoch": 0.3945475198788338, "grad_norm": 15.989263534545898, "learning_rate": 0.00013389521862325985, "loss": 3.3304, "step": 1042 }, { "epoch": 0.39492616433169253, "grad_norm": 16.243566513061523, "learning_rate": 0.00013378242735465022, "loss": 4.0894, "step": 1043 }, { "epoch": 0.3953048087845513, "grad_norm": 13.244513511657715, "learning_rate": 0.00013366958755087644, "loss": 2.5639, "step": 1044 }, { "epoch": 0.39568345323741005, "grad_norm": 13.560445785522461, "learning_rate": 0.00013355669937405526, "loss": 2.6478, "step": 1045 }, { "epoch": 0.39606209769026884, "grad_norm": 19.593982696533203, "learning_rate": 0.00013344376298637294, "loss": 2.9598, "step": 1046 }, { "epoch": 0.3964407421431276, "grad_norm": 13.61347770690918, "learning_rate": 0.00013333077855008508, "loss": 2.0055, "step": 1047 }, { "epoch": 0.39681938659598637, "grad_norm": 14.087455749511719, "learning_rate": 0.00013321774622751618, "loss": 2.1689, "step": 1048 }, { "epoch": 0.39719803104884516, "grad_norm": 20.69855499267578, "learning_rate": 0.0001331046661810597, "loss": 1.4113, "step": 1049 }, { "epoch": 0.3975766755017039, "grad_norm": 34.63194274902344, "learning_rate": 0.00013299153857317748, "loss": 2.1471, "step": 1050 }, { "epoch": 0.3979553199545627, "grad_norm": 8.219612121582031, "learning_rate": 0.0001328783635663999, "loss": 3.5702, "step": 1051 }, { "epoch": 0.3983339644074214, "grad_norm": 9.948872566223145, "learning_rate": 0.00013276514132332521, "loss": 3.3578, "step": 1052 }, { "epoch": 0.3987126088602802, "grad_norm": 11.182106018066406, "learning_rate": 0.00013265187200661976, "loss": 3.9353, "step": 1053 }, { "epoch": 0.39909125331313894, "grad_norm": 12.669611930847168, "learning_rate": 0.00013253855577901732, "loss": 3.9309, "step": 1054 }, { "epoch": 0.39946989776599773, "grad_norm": 12.40208625793457, "learning_rate": 0.0001324251928033192, "loss": 3.6691, "step": 1055 }, { "epoch": 0.3998485422188565, "grad_norm": 10.716998100280762, "learning_rate": 0.00013231178324239377, "loss": 3.3575, "step": 1056 }, { "epoch": 0.40022718667171525, "grad_norm": 12.901732444763184, "learning_rate": 0.00013219832725917645, "loss": 3.5777, "step": 1057 }, { "epoch": 0.40060583112457404, "grad_norm": 11.288579940795898, "learning_rate": 0.00013208482501666924, "loss": 2.7736, "step": 1058 }, { "epoch": 0.4009844755774328, "grad_norm": 11.336037635803223, "learning_rate": 0.00013197127667794066, "loss": 3.0309, "step": 1059 }, { "epoch": 0.40136312003029156, "grad_norm": 12.790970802307129, "learning_rate": 0.00013185768240612543, "loss": 2.9778, "step": 1060 }, { "epoch": 0.4017417644831503, "grad_norm": 10.891714096069336, "learning_rate": 0.0001317440423644243, "loss": 2.77, "step": 1061 }, { "epoch": 0.4021204089360091, "grad_norm": 14.804855346679688, "learning_rate": 0.00013163035671610374, "loss": 2.9571, "step": 1062 }, { "epoch": 0.4024990533888679, "grad_norm": 16.438711166381836, "learning_rate": 0.00013151662562449576, "loss": 3.4882, "step": 1063 }, { "epoch": 0.4028776978417266, "grad_norm": 13.646224975585938, "learning_rate": 0.00013140284925299762, "loss": 3.3764, "step": 1064 }, { "epoch": 0.4032563422945854, "grad_norm": 13.510947227478027, "learning_rate": 0.00013128902776507172, "loss": 2.5878, "step": 1065 }, { "epoch": 0.40363498674744414, "grad_norm": 14.393485069274902, "learning_rate": 0.00013117516132424517, "loss": 3.1052, "step": 1066 }, { "epoch": 0.4040136312003029, "grad_norm": 13.308830261230469, "learning_rate": 0.00013106125009410978, "loss": 2.3341, "step": 1067 }, { "epoch": 0.40439227565316166, "grad_norm": 14.394597053527832, "learning_rate": 0.0001309472942383216, "loss": 2.546, "step": 1068 }, { "epoch": 0.40477092010602045, "grad_norm": 15.71528434753418, "learning_rate": 0.0001308332939206009, "loss": 2.3768, "step": 1069 }, { "epoch": 0.40514956455887924, "grad_norm": 14.074331283569336, "learning_rate": 0.0001307192493047317, "loss": 2.0017, "step": 1070 }, { "epoch": 0.405528209011738, "grad_norm": 14.615304946899414, "learning_rate": 0.00013060516055456175, "loss": 1.9632, "step": 1071 }, { "epoch": 0.40590685346459676, "grad_norm": 15.81937313079834, "learning_rate": 0.00013049102783400221, "loss": 1.5349, "step": 1072 }, { "epoch": 0.4062854979174555, "grad_norm": 18.53114891052246, "learning_rate": 0.00013037685130702742, "loss": 2.08, "step": 1073 }, { "epoch": 0.4066641423703143, "grad_norm": 18.639833450317383, "learning_rate": 0.0001302626311376746, "loss": 1.4834, "step": 1074 }, { "epoch": 0.407042786823173, "grad_norm": 24.953411102294922, "learning_rate": 0.00013014836749004367, "loss": 1.6101, "step": 1075 }, { "epoch": 0.4074214312760318, "grad_norm": 7.899003505706787, "learning_rate": 0.00013003406052829706, "loss": 3.2613, "step": 1076 }, { "epoch": 0.4078000757288906, "grad_norm": 9.678568840026855, "learning_rate": 0.0001299197104166595, "loss": 3.4739, "step": 1077 }, { "epoch": 0.40817872018174933, "grad_norm": 11.191865921020508, "learning_rate": 0.0001298053173194175, "loss": 3.6228, "step": 1078 }, { "epoch": 0.4085573646346081, "grad_norm": 11.87701416015625, "learning_rate": 0.00012969088140091955, "loss": 3.041, "step": 1079 }, { "epoch": 0.40893600908746686, "grad_norm": 12.016412734985352, "learning_rate": 0.00012957640282557553, "loss": 3.7958, "step": 1080 }, { "epoch": 0.40931465354032565, "grad_norm": 12.6616792678833, "learning_rate": 0.00012946188175785666, "loss": 3.2154, "step": 1081 }, { "epoch": 0.4096932979931844, "grad_norm": 12.319831848144531, "learning_rate": 0.00012934731836229514, "loss": 3.8766, "step": 1082 }, { "epoch": 0.41007194244604317, "grad_norm": 12.028902053833008, "learning_rate": 0.0001292327128034841, "loss": 2.853, "step": 1083 }, { "epoch": 0.4104505868989019, "grad_norm": 11.936314582824707, "learning_rate": 0.00012911806524607713, "loss": 3.7024, "step": 1084 }, { "epoch": 0.4108292313517607, "grad_norm": 12.414243698120117, "learning_rate": 0.00012900337585478825, "loss": 3.3653, "step": 1085 }, { "epoch": 0.4112078758046195, "grad_norm": 13.156413078308105, "learning_rate": 0.0001288886447943915, "loss": 3.0347, "step": 1086 }, { "epoch": 0.4115865202574782, "grad_norm": 12.574990272521973, "learning_rate": 0.00012877387222972087, "loss": 2.6169, "step": 1087 }, { "epoch": 0.411965164710337, "grad_norm": 17.557424545288086, "learning_rate": 0.00012865905832566989, "loss": 3.3377, "step": 1088 }, { "epoch": 0.41234380916319574, "grad_norm": 12.320211410522461, "learning_rate": 0.0001285442032471916, "loss": 2.6103, "step": 1089 }, { "epoch": 0.41272245361605453, "grad_norm": 13.786900520324707, "learning_rate": 0.00012842930715929802, "loss": 3.1307, "step": 1090 }, { "epoch": 0.41310109806891326, "grad_norm": 16.15777587890625, "learning_rate": 0.0001283143702270603, "loss": 3.149, "step": 1091 }, { "epoch": 0.41347974252177205, "grad_norm": 15.2261323928833, "learning_rate": 0.00012819939261560806, "loss": 1.8673, "step": 1092 }, { "epoch": 0.41385838697463084, "grad_norm": 14.948053359985352, "learning_rate": 0.00012808437449012957, "loss": 2.8997, "step": 1093 }, { "epoch": 0.4142370314274896, "grad_norm": 19.149866104125977, "learning_rate": 0.00012796931601587113, "loss": 2.147, "step": 1094 }, { "epoch": 0.41461567588034837, "grad_norm": 17.016815185546875, "learning_rate": 0.0001278542173581371, "loss": 2.3585, "step": 1095 }, { "epoch": 0.4149943203332071, "grad_norm": 16.220598220825195, "learning_rate": 0.00012773907868228956, "loss": 2.0916, "step": 1096 }, { "epoch": 0.4153729647860659, "grad_norm": 17.185651779174805, "learning_rate": 0.0001276239001537481, "loss": 2.3026, "step": 1097 }, { "epoch": 0.4157516092389246, "grad_norm": 15.259086608886719, "learning_rate": 0.0001275086819379895, "loss": 1.2933, "step": 1098 }, { "epoch": 0.4161302536917834, "grad_norm": 24.932565689086914, "learning_rate": 0.00012739342420054763, "loss": 1.583, "step": 1099 }, { "epoch": 0.4165088981446422, "grad_norm": 26.60433578491211, "learning_rate": 0.0001272781271070131, "loss": 2.5607, "step": 1100 }, { "epoch": 0.41688754259750094, "grad_norm": 11.937226295471191, "learning_rate": 0.00012716279082303312, "loss": 4.2044, "step": 1101 }, { "epoch": 0.4172661870503597, "grad_norm": 10.27784252166748, "learning_rate": 0.0001270474155143111, "loss": 4.1484, "step": 1102 }, { "epoch": 0.41764483150321846, "grad_norm": 9.727765083312988, "learning_rate": 0.00012693200134660662, "loss": 3.0787, "step": 1103 }, { "epoch": 0.41802347595607725, "grad_norm": 10.214356422424316, "learning_rate": 0.00012681654848573502, "loss": 2.7503, "step": 1104 }, { "epoch": 0.418402120408936, "grad_norm": 10.071405410766602, "learning_rate": 0.00012670105709756727, "loss": 2.8888, "step": 1105 }, { "epoch": 0.4187807648617948, "grad_norm": 13.693557739257812, "learning_rate": 0.00012658552734802963, "loss": 3.9183, "step": 1106 }, { "epoch": 0.41915940931465356, "grad_norm": 10.267026901245117, "learning_rate": 0.00012646995940310363, "loss": 2.5214, "step": 1107 }, { "epoch": 0.4195380537675123, "grad_norm": 12.434460639953613, "learning_rate": 0.00012635435342882548, "loss": 3.0185, "step": 1108 }, { "epoch": 0.4199166982203711, "grad_norm": 11.598405838012695, "learning_rate": 0.00012623870959128615, "loss": 2.7773, "step": 1109 }, { "epoch": 0.4202953426732298, "grad_norm": 14.913825035095215, "learning_rate": 0.00012612302805663098, "loss": 3.8533, "step": 1110 }, { "epoch": 0.4206739871260886, "grad_norm": 11.085430145263672, "learning_rate": 0.0001260073089910594, "loss": 2.6134, "step": 1111 }, { "epoch": 0.42105263157894735, "grad_norm": 12.33950138092041, "learning_rate": 0.00012589155256082489, "loss": 2.9382, "step": 1112 }, { "epoch": 0.42143127603180613, "grad_norm": 13.180621147155762, "learning_rate": 0.00012577575893223456, "loss": 2.8428, "step": 1113 }, { "epoch": 0.4218099204846649, "grad_norm": 15.379983901977539, "learning_rate": 0.0001256599282716489, "loss": 2.5916, "step": 1114 }, { "epoch": 0.42218856493752366, "grad_norm": 14.148529052734375, "learning_rate": 0.00012554406074548165, "loss": 2.5504, "step": 1115 }, { "epoch": 0.42256720939038245, "grad_norm": 15.524250030517578, "learning_rate": 0.00012542815652019952, "loss": 2.6872, "step": 1116 }, { "epoch": 0.4229458538432412, "grad_norm": 13.896522521972656, "learning_rate": 0.00012531221576232197, "loss": 2.3257, "step": 1117 }, { "epoch": 0.42332449829609997, "grad_norm": 13.984559059143066, "learning_rate": 0.0001251962386384209, "loss": 2.2887, "step": 1118 }, { "epoch": 0.4237031427489587, "grad_norm": 14.945381164550781, "learning_rate": 0.00012508022531512047, "loss": 2.2639, "step": 1119 }, { "epoch": 0.4240817872018175, "grad_norm": 14.590594291687012, "learning_rate": 0.00012496417595909685, "loss": 2.7817, "step": 1120 }, { "epoch": 0.4244604316546763, "grad_norm": 22.159513473510742, "learning_rate": 0.00012484809073707803, "loss": 3.3067, "step": 1121 }, { "epoch": 0.424839076107535, "grad_norm": 19.108047485351562, "learning_rate": 0.00012473196981584338, "loss": 2.6282, "step": 1122 }, { "epoch": 0.4252177205603938, "grad_norm": 15.237470626831055, "learning_rate": 0.00012461581336222378, "loss": 1.917, "step": 1123 }, { "epoch": 0.42559636501325254, "grad_norm": 13.147758483886719, "learning_rate": 0.0001244996215431009, "loss": 1.1269, "step": 1124 }, { "epoch": 0.42597500946611133, "grad_norm": 30.5366268157959, "learning_rate": 0.00012438339452540748, "loss": 1.766, "step": 1125 }, { "epoch": 0.42635365391897007, "grad_norm": 8.803793907165527, "learning_rate": 0.00012426713247612665, "loss": 3.8758, "step": 1126 }, { "epoch": 0.42673229837182886, "grad_norm": 10.560848236083984, "learning_rate": 0.00012415083556229192, "loss": 3.5995, "step": 1127 }, { "epoch": 0.42711094282468764, "grad_norm": 11.299087524414062, "learning_rate": 0.00012403450395098695, "loss": 4.2221, "step": 1128 }, { "epoch": 0.4274895872775464, "grad_norm": 11.33618450164795, "learning_rate": 0.00012391813780934514, "loss": 4.1682, "step": 1129 }, { "epoch": 0.42786823173040517, "grad_norm": 10.318195343017578, "learning_rate": 0.00012380173730454957, "loss": 3.3889, "step": 1130 }, { "epoch": 0.4282468761832639, "grad_norm": 11.54907512664795, "learning_rate": 0.00012368530260383268, "loss": 2.8639, "step": 1131 }, { "epoch": 0.4286255206361227, "grad_norm": 11.327589988708496, "learning_rate": 0.00012356883387447601, "loss": 2.3551, "step": 1132 }, { "epoch": 0.4290041650889814, "grad_norm": 12.675344467163086, "learning_rate": 0.00012345233128381006, "loss": 3.7048, "step": 1133 }, { "epoch": 0.4293828095418402, "grad_norm": 10.90146255493164, "learning_rate": 0.00012333579499921392, "loss": 3.0984, "step": 1134 }, { "epoch": 0.429761453994699, "grad_norm": 13.599529266357422, "learning_rate": 0.00012321922518811508, "loss": 2.9593, "step": 1135 }, { "epoch": 0.43014009844755774, "grad_norm": 12.997097969055176, "learning_rate": 0.00012310262201798924, "loss": 3.048, "step": 1136 }, { "epoch": 0.43051874290041653, "grad_norm": 13.863821029663086, "learning_rate": 0.00012298598565636, "loss": 2.9528, "step": 1137 }, { "epoch": 0.43089738735327526, "grad_norm": 14.177045822143555, "learning_rate": 0.00012286931627079862, "loss": 2.5402, "step": 1138 }, { "epoch": 0.43127603180613405, "grad_norm": 14.45673942565918, "learning_rate": 0.00012275261402892388, "loss": 2.1941, "step": 1139 }, { "epoch": 0.4316546762589928, "grad_norm": 16.615707397460938, "learning_rate": 0.0001226358790984017, "loss": 2.7464, "step": 1140 }, { "epoch": 0.4320333207118516, "grad_norm": 13.864429473876953, "learning_rate": 0.000122519111646945, "loss": 2.384, "step": 1141 }, { "epoch": 0.4324119651647103, "grad_norm": 15.059038162231445, "learning_rate": 0.00012240231184231336, "loss": 1.735, "step": 1142 }, { "epoch": 0.4327906096175691, "grad_norm": 15.821595191955566, "learning_rate": 0.00012228547985231297, "loss": 2.953, "step": 1143 }, { "epoch": 0.4331692540704279, "grad_norm": 13.79995346069336, "learning_rate": 0.00012216861584479608, "loss": 2.3279, "step": 1144 }, { "epoch": 0.4335478985232866, "grad_norm": 11.45645523071289, "learning_rate": 0.00012205171998766114, "loss": 1.7425, "step": 1145 }, { "epoch": 0.4339265429761454, "grad_norm": 15.549623489379883, "learning_rate": 0.00012193479244885217, "loss": 2.452, "step": 1146 }, { "epoch": 0.43430518742900415, "grad_norm": 14.682928085327148, "learning_rate": 0.00012181783339635888, "loss": 2.1395, "step": 1147 }, { "epoch": 0.43468383188186294, "grad_norm": 19.542850494384766, "learning_rate": 0.00012170084299821609, "loss": 2.4162, "step": 1148 }, { "epoch": 0.43506247633472167, "grad_norm": 15.998048782348633, "learning_rate": 0.00012158382142250379, "loss": 1.5397, "step": 1149 }, { "epoch": 0.43544112078758046, "grad_norm": 37.20795822143555, "learning_rate": 0.00012146676883734671, "loss": 3.4346, "step": 1150 }, { "epoch": 0.43581976524043925, "grad_norm": 8.654630661010742, "learning_rate": 0.00012134968541091405, "loss": 4.2973, "step": 1151 }, { "epoch": 0.436198409693298, "grad_norm": 9.75950813293457, "learning_rate": 0.0001212325713114195, "loss": 3.3641, "step": 1152 }, { "epoch": 0.4365770541461568, "grad_norm": 9.88634204864502, "learning_rate": 0.00012111542670712066, "loss": 3.6815, "step": 1153 }, { "epoch": 0.4369556985990155, "grad_norm": 12.256867408752441, "learning_rate": 0.00012099825176631902, "loss": 3.2275, "step": 1154 }, { "epoch": 0.4373343430518743, "grad_norm": 12.367258071899414, "learning_rate": 0.00012088104665735964, "loss": 2.9504, "step": 1155 }, { "epoch": 0.43771298750473303, "grad_norm": 13.042316436767578, "learning_rate": 0.00012076381154863095, "loss": 3.0564, "step": 1156 }, { "epoch": 0.4380916319575918, "grad_norm": 11.0169677734375, "learning_rate": 0.00012064654660856445, "loss": 3.4256, "step": 1157 }, { "epoch": 0.4384702764104506, "grad_norm": 11.372369766235352, "learning_rate": 0.0001205292520056345, "loss": 3.5504, "step": 1158 }, { "epoch": 0.43884892086330934, "grad_norm": 10.504295349121094, "learning_rate": 0.00012041192790835811, "loss": 2.7411, "step": 1159 }, { "epoch": 0.43922756531616813, "grad_norm": 13.477766036987305, "learning_rate": 0.00012029457448529459, "loss": 2.9257, "step": 1160 }, { "epoch": 0.43960620976902687, "grad_norm": 12.110424041748047, "learning_rate": 0.00012017719190504551, "loss": 2.8799, "step": 1161 }, { "epoch": 0.43998485422188566, "grad_norm": 13.188323020935059, "learning_rate": 0.00012005978033625416, "loss": 2.5087, "step": 1162 }, { "epoch": 0.4403634986747444, "grad_norm": 11.588294982910156, "learning_rate": 0.00011994233994760567, "loss": 2.5272, "step": 1163 }, { "epoch": 0.4407421431276032, "grad_norm": 15.151694297790527, "learning_rate": 0.00011982487090782638, "loss": 2.7985, "step": 1164 }, { "epoch": 0.44112078758046197, "grad_norm": 14.004260063171387, "learning_rate": 0.00011970737338568394, "loss": 2.7696, "step": 1165 }, { "epoch": 0.4414994320333207, "grad_norm": 14.581443786621094, "learning_rate": 0.00011958984754998685, "loss": 2.2614, "step": 1166 }, { "epoch": 0.4418780764861795, "grad_norm": 12.546298027038574, "learning_rate": 0.00011947229356958434, "loss": 2.3896, "step": 1167 }, { "epoch": 0.44225672093903823, "grad_norm": 14.990707397460938, "learning_rate": 0.000119354711613366, "loss": 3.1594, "step": 1168 }, { "epoch": 0.442635365391897, "grad_norm": 14.658981323242188, "learning_rate": 0.00011923710185026169, "loss": 2.4297, "step": 1169 }, { "epoch": 0.44301400984475575, "grad_norm": 13.724644660949707, "learning_rate": 0.00011911946444924116, "loss": 1.5228, "step": 1170 }, { "epoch": 0.44339265429761454, "grad_norm": 19.209369659423828, "learning_rate": 0.0001190017995793139, "loss": 3.4329, "step": 1171 }, { "epoch": 0.44377129875047333, "grad_norm": 21.529495239257812, "learning_rate": 0.00011888410740952887, "loss": 2.5655, "step": 1172 }, { "epoch": 0.44414994320333206, "grad_norm": 24.351722717285156, "learning_rate": 0.00011876638810897422, "loss": 2.6329, "step": 1173 }, { "epoch": 0.44452858765619085, "grad_norm": 15.183594703674316, "learning_rate": 0.00011864864184677711, "loss": 0.8859, "step": 1174 }, { "epoch": 0.4449072321090496, "grad_norm": 13.775147438049316, "learning_rate": 0.00011853086879210342, "loss": 1.3488, "step": 1175 }, { "epoch": 0.4452858765619084, "grad_norm": 8.975238800048828, "learning_rate": 0.00011841306911415753, "loss": 3.21, "step": 1176 }, { "epoch": 0.4456645210147671, "grad_norm": 11.082070350646973, "learning_rate": 0.00011829524298218207, "loss": 4.19, "step": 1177 }, { "epoch": 0.4460431654676259, "grad_norm": 10.536282539367676, "learning_rate": 0.00011817739056545762, "loss": 3.5267, "step": 1178 }, { "epoch": 0.4464218099204847, "grad_norm": 10.50727367401123, "learning_rate": 0.00011805951203330266, "loss": 3.3532, "step": 1179 }, { "epoch": 0.4468004543733434, "grad_norm": 10.488901138305664, "learning_rate": 0.00011794160755507304, "loss": 2.9757, "step": 1180 }, { "epoch": 0.4471790988262022, "grad_norm": 12.007133483886719, "learning_rate": 0.000117823677300162, "loss": 3.0183, "step": 1181 }, { "epoch": 0.44755774327906095, "grad_norm": 12.38204574584961, "learning_rate": 0.00011770572143799971, "loss": 3.0908, "step": 1182 }, { "epoch": 0.44793638773191974, "grad_norm": 12.608494758605957, "learning_rate": 0.00011758774013805325, "loss": 3.0191, "step": 1183 }, { "epoch": 0.4483150321847785, "grad_norm": 10.949199676513672, "learning_rate": 0.00011746973356982614, "loss": 2.5306, "step": 1184 }, { "epoch": 0.44869367663763726, "grad_norm": 12.805669784545898, "learning_rate": 0.00011735170190285825, "loss": 3.2759, "step": 1185 }, { "epoch": 0.44907232109049605, "grad_norm": 12.965691566467285, "learning_rate": 0.00011723364530672549, "loss": 3.0626, "step": 1186 }, { "epoch": 0.4494509655433548, "grad_norm": 11.967156410217285, "learning_rate": 0.00011711556395103964, "loss": 2.4325, "step": 1187 }, { "epoch": 0.4498296099962136, "grad_norm": 13.925737380981445, "learning_rate": 0.00011699745800544798, "loss": 2.8316, "step": 1188 }, { "epoch": 0.4502082544490723, "grad_norm": 13.926861763000488, "learning_rate": 0.00011687932763963319, "loss": 3.4606, "step": 1189 }, { "epoch": 0.4505868989019311, "grad_norm": 13.918458938598633, "learning_rate": 0.00011676117302331291, "loss": 2.5946, "step": 1190 }, { "epoch": 0.45096554335478983, "grad_norm": 16.527910232543945, "learning_rate": 0.00011664299432623979, "loss": 2.2876, "step": 1191 }, { "epoch": 0.4513441878076486, "grad_norm": 14.137311935424805, "learning_rate": 0.00011652479171820097, "loss": 2.9587, "step": 1192 }, { "epoch": 0.45172283226050736, "grad_norm": 17.192485809326172, "learning_rate": 0.00011640656536901796, "loss": 1.5583, "step": 1193 }, { "epoch": 0.45210147671336615, "grad_norm": 14.512371063232422, "learning_rate": 0.00011628831544854635, "loss": 2.3428, "step": 1194 }, { "epoch": 0.45248012116622494, "grad_norm": 16.016895294189453, "learning_rate": 0.00011617004212667566, "loss": 2.4906, "step": 1195 }, { "epoch": 0.45285876561908367, "grad_norm": 13.380924224853516, "learning_rate": 0.000116051745573329, "loss": 1.8266, "step": 1196 }, { "epoch": 0.45323741007194246, "grad_norm": 12.72845458984375, "learning_rate": 0.00011593342595846288, "loss": 1.166, "step": 1197 }, { "epoch": 0.4536160545248012, "grad_norm": 14.16887092590332, "learning_rate": 0.00011581508345206689, "loss": 1.3564, "step": 1198 }, { "epoch": 0.45399469897766, "grad_norm": 28.907073974609375, "learning_rate": 0.0001156967182241635, "loss": 1.5071, "step": 1199 }, { "epoch": 0.4543733434305187, "grad_norm": 17.37041473388672, "learning_rate": 0.00011557833044480792, "loss": 1.1685, "step": 1200 }, { "epoch": 0.4547519878833775, "grad_norm": 10.693912506103516, "learning_rate": 0.0001154599202840877, "loss": 3.2915, "step": 1201 }, { "epoch": 0.4551306323362363, "grad_norm": 13.119062423706055, "learning_rate": 0.0001153414879121225, "loss": 4.6147, "step": 1202 }, { "epoch": 0.45550927678909503, "grad_norm": 11.448525428771973, "learning_rate": 0.00011522303349906399, "loss": 2.79, "step": 1203 }, { "epoch": 0.4558879212419538, "grad_norm": 11.742964744567871, "learning_rate": 0.00011510455721509537, "loss": 3.2349, "step": 1204 }, { "epoch": 0.45626656569481255, "grad_norm": 10.76633358001709, "learning_rate": 0.00011498605923043145, "loss": 3.0203, "step": 1205 }, { "epoch": 0.45664521014767134, "grad_norm": 11.407468795776367, "learning_rate": 0.00011486753971531801, "loss": 3.6872, "step": 1206 }, { "epoch": 0.4570238546005301, "grad_norm": 11.357184410095215, "learning_rate": 0.00011474899884003196, "loss": 2.7635, "step": 1207 }, { "epoch": 0.45740249905338887, "grad_norm": 12.275900840759277, "learning_rate": 0.00011463043677488073, "loss": 2.7735, "step": 1208 }, { "epoch": 0.45778114350624766, "grad_norm": 12.097725868225098, "learning_rate": 0.0001145118536902023, "loss": 2.7413, "step": 1209 }, { "epoch": 0.4581597879591064, "grad_norm": 10.203941345214844, "learning_rate": 0.0001143932497563648, "loss": 2.3056, "step": 1210 }, { "epoch": 0.4585384324119652, "grad_norm": 12.463147163391113, "learning_rate": 0.00011427462514376637, "loss": 3.1588, "step": 1211 }, { "epoch": 0.4589170768648239, "grad_norm": 10.687355041503906, "learning_rate": 0.00011415598002283474, "loss": 1.4561, "step": 1212 }, { "epoch": 0.4592957213176827, "grad_norm": 13.218606948852539, "learning_rate": 0.00011403731456402727, "loss": 2.156, "step": 1213 }, { "epoch": 0.45967436577054144, "grad_norm": 15.726714134216309, "learning_rate": 0.00011391862893783038, "loss": 2.621, "step": 1214 }, { "epoch": 0.4600530102234002, "grad_norm": 15.450735092163086, "learning_rate": 0.0001137999233147596, "loss": 2.6854, "step": 1215 }, { "epoch": 0.460431654676259, "grad_norm": 14.271288871765137, "learning_rate": 0.00011368119786535906, "loss": 2.3983, "step": 1216 }, { "epoch": 0.46081029912911775, "grad_norm": 16.259143829345703, "learning_rate": 0.0001135624527602015, "loss": 3.0149, "step": 1217 }, { "epoch": 0.46118894358197654, "grad_norm": 21.305139541625977, "learning_rate": 0.00011344368816988779, "loss": 2.5145, "step": 1218 }, { "epoch": 0.4615675880348353, "grad_norm": 18.001358032226562, "learning_rate": 0.00011332490426504688, "loss": 2.6175, "step": 1219 }, { "epoch": 0.46194623248769406, "grad_norm": 15.817441940307617, "learning_rate": 0.00011320610121633542, "loss": 2.0215, "step": 1220 }, { "epoch": 0.4623248769405528, "grad_norm": 18.465803146362305, "learning_rate": 0.00011308727919443756, "loss": 2.2702, "step": 1221 }, { "epoch": 0.4627035213934116, "grad_norm": 15.902999877929688, "learning_rate": 0.00011296843837006477, "loss": 2.0862, "step": 1222 }, { "epoch": 0.4630821658462704, "grad_norm": 18.18279457092285, "learning_rate": 0.00011284957891395545, "loss": 1.6971, "step": 1223 }, { "epoch": 0.4634608102991291, "grad_norm": 20.656322479248047, "learning_rate": 0.00011273070099687482, "loss": 1.8615, "step": 1224 }, { "epoch": 0.4638394547519879, "grad_norm": 37.89259719848633, "learning_rate": 0.0001126118047896146, "loss": 2.1817, "step": 1225 }, { "epoch": 0.46421809920484663, "grad_norm": 8.783308982849121, "learning_rate": 0.0001124928904629928, "loss": 3.3508, "step": 1226 }, { "epoch": 0.4645967436577054, "grad_norm": 12.971296310424805, "learning_rate": 0.0001123739581878535, "loss": 3.7262, "step": 1227 }, { "epoch": 0.46497538811056416, "grad_norm": 10.869105339050293, "learning_rate": 0.00011225500813506645, "loss": 3.2334, "step": 1228 }, { "epoch": 0.46535403256342295, "grad_norm": 11.33836555480957, "learning_rate": 0.00011213604047552708, "loss": 3.5119, "step": 1229 }, { "epoch": 0.46573267701628174, "grad_norm": 10.899227142333984, "learning_rate": 0.00011201705538015604, "loss": 3.5351, "step": 1230 }, { "epoch": 0.46611132146914047, "grad_norm": 11.528409957885742, "learning_rate": 0.00011189805301989904, "loss": 3.1705, "step": 1231 }, { "epoch": 0.46648996592199926, "grad_norm": 10.381014823913574, "learning_rate": 0.00011177903356572659, "loss": 1.9777, "step": 1232 }, { "epoch": 0.466868610374858, "grad_norm": 11.280335426330566, "learning_rate": 0.00011165999718863379, "loss": 2.5228, "step": 1233 }, { "epoch": 0.4672472548277168, "grad_norm": 14.46865177154541, "learning_rate": 0.00011154094405963996, "loss": 2.5568, "step": 1234 }, { "epoch": 0.4676258992805755, "grad_norm": 13.52888011932373, "learning_rate": 0.00011142187434978866, "loss": 3.2911, "step": 1235 }, { "epoch": 0.4680045437334343, "grad_norm": 11.23714828491211, "learning_rate": 0.00011130278823014709, "loss": 2.2005, "step": 1236 }, { "epoch": 0.4683831881862931, "grad_norm": 12.224804878234863, "learning_rate": 0.00011118368587180614, "loss": 2.2755, "step": 1237 }, { "epoch": 0.46876183263915183, "grad_norm": 12.343790054321289, "learning_rate": 0.00011106456744587996, "loss": 2.8197, "step": 1238 }, { "epoch": 0.4691404770920106, "grad_norm": 13.172083854675293, "learning_rate": 0.0001109454331235059, "loss": 2.586, "step": 1239 }, { "epoch": 0.46951912154486936, "grad_norm": 12.991609573364258, "learning_rate": 0.00011082628307584397, "loss": 2.0318, "step": 1240 }, { "epoch": 0.46989776599772815, "grad_norm": 13.485008239746094, "learning_rate": 0.00011070711747407694, "loss": 2.2734, "step": 1241 }, { "epoch": 0.4702764104505869, "grad_norm": 19.911563873291016, "learning_rate": 0.0001105879364894098, "loss": 2.9116, "step": 1242 }, { "epoch": 0.47065505490344567, "grad_norm": 14.824417114257812, "learning_rate": 0.00011046874029306975, "loss": 2.0742, "step": 1243 }, { "epoch": 0.4710336993563044, "grad_norm": 17.6142578125, "learning_rate": 0.00011034952905630576, "loss": 2.6475, "step": 1244 }, { "epoch": 0.4714123438091632, "grad_norm": 13.6873140335083, "learning_rate": 0.00011023030295038846, "loss": 2.1793, "step": 1245 }, { "epoch": 0.471790988262022, "grad_norm": 15.636033058166504, "learning_rate": 0.0001101110621466098, "loss": 1.6981, "step": 1246 }, { "epoch": 0.4721696327148807, "grad_norm": 17.11579132080078, "learning_rate": 0.00010999180681628288, "loss": 1.6256, "step": 1247 }, { "epoch": 0.4725482771677395, "grad_norm": 20.186901092529297, "learning_rate": 0.00010987253713074165, "loss": 2.4091, "step": 1248 }, { "epoch": 0.47292692162059824, "grad_norm": 15.602944374084473, "learning_rate": 0.00010975325326134071, "loss": 1.8002, "step": 1249 }, { "epoch": 0.47330556607345703, "grad_norm": 23.223661422729492, "learning_rate": 0.00010963395537945502, "loss": 2.0938, "step": 1250 }, { "epoch": 0.47368421052631576, "grad_norm": 10.464275360107422, "learning_rate": 0.00010951464365647967, "loss": 4.1863, "step": 1251 }, { "epoch": 0.47406285497917455, "grad_norm": 10.853160858154297, "learning_rate": 0.00010939531826382963, "loss": 3.6832, "step": 1252 }, { "epoch": 0.47444149943203334, "grad_norm": 12.23708724975586, "learning_rate": 0.00010927597937293952, "loss": 3.7507, "step": 1253 }, { "epoch": 0.4748201438848921, "grad_norm": 12.157914161682129, "learning_rate": 0.00010915662715526336, "loss": 2.7929, "step": 1254 }, { "epoch": 0.47519878833775087, "grad_norm": 14.618999481201172, "learning_rate": 0.00010903726178227432, "loss": 3.9901, "step": 1255 }, { "epoch": 0.4755774327906096, "grad_norm": 11.460221290588379, "learning_rate": 0.0001089178834254644, "loss": 3.0165, "step": 1256 }, { "epoch": 0.4759560772434684, "grad_norm": 11.18032455444336, "learning_rate": 0.00010879849225634438, "loss": 1.9716, "step": 1257 }, { "epoch": 0.4763347216963271, "grad_norm": 11.510719299316406, "learning_rate": 0.00010867908844644335, "loss": 1.7553, "step": 1258 }, { "epoch": 0.4767133661491859, "grad_norm": 10.82070255279541, "learning_rate": 0.00010855967216730858, "loss": 2.6911, "step": 1259 }, { "epoch": 0.4770920106020447, "grad_norm": 13.530522346496582, "learning_rate": 0.00010844024359050527, "loss": 2.8952, "step": 1260 }, { "epoch": 0.47747065505490344, "grad_norm": 10.605006217956543, "learning_rate": 0.0001083208028876163, "loss": 2.2925, "step": 1261 }, { "epoch": 0.4778492995077622, "grad_norm": 12.863495826721191, "learning_rate": 0.00010820135023024192, "loss": 2.3114, "step": 1262 }, { "epoch": 0.47822794396062096, "grad_norm": 16.1364688873291, "learning_rate": 0.00010808188578999963, "loss": 3.0539, "step": 1263 }, { "epoch": 0.47860658841347975, "grad_norm": 12.478103637695312, "learning_rate": 0.00010796240973852376, "loss": 2.0726, "step": 1264 }, { "epoch": 0.4789852328663385, "grad_norm": 13.423611640930176, "learning_rate": 0.00010784292224746546, "loss": 2.8393, "step": 1265 }, { "epoch": 0.4793638773191973, "grad_norm": 14.295774459838867, "learning_rate": 0.00010772342348849216, "loss": 2.7654, "step": 1266 }, { "epoch": 0.47974252177205606, "grad_norm": 15.330755233764648, "learning_rate": 0.00010760391363328762, "loss": 1.9282, "step": 1267 }, { "epoch": 0.4801211662249148, "grad_norm": 19.332740783691406, "learning_rate": 0.00010748439285355138, "loss": 1.8195, "step": 1268 }, { "epoch": 0.4804998106777736, "grad_norm": 16.43891143798828, "learning_rate": 0.00010736486132099888, "loss": 2.0598, "step": 1269 }, { "epoch": 0.4808784551306323, "grad_norm": 12.18430233001709, "learning_rate": 0.00010724531920736086, "loss": 0.99, "step": 1270 }, { "epoch": 0.4812570995834911, "grad_norm": 15.264763832092285, "learning_rate": 0.00010712576668438323, "loss": 1.8075, "step": 1271 }, { "epoch": 0.48163574403634984, "grad_norm": 21.91768455505371, "learning_rate": 0.00010700620392382701, "loss": 2.6154, "step": 1272 }, { "epoch": 0.48201438848920863, "grad_norm": 16.14089012145996, "learning_rate": 0.00010688663109746784, "loss": 1.5317, "step": 1273 }, { "epoch": 0.4823930329420674, "grad_norm": 32.41860580444336, "learning_rate": 0.00010676704837709576, "loss": 1.8389, "step": 1274 }, { "epoch": 0.48277167739492616, "grad_norm": 23.59526252746582, "learning_rate": 0.00010664745593451516, "loss": 1.1361, "step": 1275 }, { "epoch": 0.48315032184778495, "grad_norm": 10.691109657287598, "learning_rate": 0.00010652785394154427, "loss": 3.2863, "step": 1276 }, { "epoch": 0.4835289663006437, "grad_norm": 12.289042472839355, "learning_rate": 0.00010640824257001516, "loss": 4.0967, "step": 1277 }, { "epoch": 0.48390761075350247, "grad_norm": 10.609498023986816, "learning_rate": 0.00010628862199177327, "loss": 2.915, "step": 1278 }, { "epoch": 0.4842862552063612, "grad_norm": 13.162012100219727, "learning_rate": 0.00010616899237867733, "loss": 3.3384, "step": 1279 }, { "epoch": 0.48466489965922, "grad_norm": 12.458738327026367, "learning_rate": 0.000106049353902599, "loss": 2.8678, "step": 1280 }, { "epoch": 0.4850435441120788, "grad_norm": 12.008556365966797, "learning_rate": 0.00010592970673542277, "loss": 2.9199, "step": 1281 }, { "epoch": 0.4854221885649375, "grad_norm": 10.63491153717041, "learning_rate": 0.00010581005104904549, "loss": 2.4852, "step": 1282 }, { "epoch": 0.4858008330177963, "grad_norm": 10.767313957214355, "learning_rate": 0.00010569038701537633, "loss": 3.4581, "step": 1283 }, { "epoch": 0.48617947747065504, "grad_norm": 12.88519287109375, "learning_rate": 0.00010557071480633643, "loss": 3.5616, "step": 1284 }, { "epoch": 0.48655812192351383, "grad_norm": 12.250274658203125, "learning_rate": 0.00010545103459385868, "loss": 2.8215, "step": 1285 }, { "epoch": 0.48693676637637257, "grad_norm": 12.7329683303833, "learning_rate": 0.00010533134654988746, "loss": 3.5789, "step": 1286 }, { "epoch": 0.48731541082923135, "grad_norm": 12.87328815460205, "learning_rate": 0.00010521165084637843, "loss": 2.854, "step": 1287 }, { "epoch": 0.48769405528209014, "grad_norm": 11.388814926147461, "learning_rate": 0.00010509194765529821, "loss": 2.0008, "step": 1288 }, { "epoch": 0.4880726997349489, "grad_norm": 12.551799774169922, "learning_rate": 0.00010497223714862424, "loss": 2.4604, "step": 1289 }, { "epoch": 0.48845134418780767, "grad_norm": 10.640294075012207, "learning_rate": 0.00010485251949834436, "loss": 1.6856, "step": 1290 }, { "epoch": 0.4888299886406664, "grad_norm": 13.196956634521484, "learning_rate": 0.0001047327948764568, "loss": 2.2765, "step": 1291 }, { "epoch": 0.4892086330935252, "grad_norm": 17.06575584411621, "learning_rate": 0.00010461306345496972, "loss": 2.8379, "step": 1292 }, { "epoch": 0.4895872775463839, "grad_norm": 17.766448974609375, "learning_rate": 0.00010449332540590114, "loss": 1.885, "step": 1293 }, { "epoch": 0.4899659219992427, "grad_norm": 12.942706108093262, "learning_rate": 0.00010437358090127847, "loss": 1.6903, "step": 1294 }, { "epoch": 0.49034456645210145, "grad_norm": 16.92314910888672, "learning_rate": 0.00010425383011313844, "loss": 2.4453, "step": 1295 }, { "epoch": 0.49072321090496024, "grad_norm": 17.436086654663086, "learning_rate": 0.00010413407321352695, "loss": 1.9032, "step": 1296 }, { "epoch": 0.49110185535781903, "grad_norm": 18.94797706604004, "learning_rate": 0.00010401431037449847, "loss": 2.0191, "step": 1297 }, { "epoch": 0.49148049981067776, "grad_norm": 15.610849380493164, "learning_rate": 0.0001038945417681161, "loss": 1.19, "step": 1298 }, { "epoch": 0.49185914426353655, "grad_norm": 16.951602935791016, "learning_rate": 0.00010377476756645128, "loss": 1.4745, "step": 1299 }, { "epoch": 0.4922377887163953, "grad_norm": 45.024925231933594, "learning_rate": 0.00010365498794158337, "loss": 3.5771, "step": 1300 }, { "epoch": 0.4926164331692541, "grad_norm": 10.677453994750977, "learning_rate": 0.00010353520306559963, "loss": 3.5375, "step": 1301 }, { "epoch": 0.4929950776221128, "grad_norm": 12.31181812286377, "learning_rate": 0.00010341541311059478, "loss": 3.5221, "step": 1302 }, { "epoch": 0.4933737220749716, "grad_norm": 11.114928245544434, "learning_rate": 0.00010329561824867089, "loss": 2.9916, "step": 1303 }, { "epoch": 0.4937523665278304, "grad_norm": 14.953704833984375, "learning_rate": 0.00010317581865193704, "loss": 2.4552, "step": 1304 }, { "epoch": 0.4941310109806891, "grad_norm": 11.37937068939209, "learning_rate": 0.00010305601449250919, "loss": 2.9803, "step": 1305 }, { "epoch": 0.4945096554335479, "grad_norm": 10.58877944946289, "learning_rate": 0.00010293620594250974, "loss": 2.0205, "step": 1306 }, { "epoch": 0.49488829988640665, "grad_norm": 11.108804702758789, "learning_rate": 0.00010281639317406752, "loss": 2.4598, "step": 1307 }, { "epoch": 0.49526694433926544, "grad_norm": 11.565478324890137, "learning_rate": 0.00010269657635931731, "loss": 1.909, "step": 1308 }, { "epoch": 0.49564558879212417, "grad_norm": 12.14426326751709, "learning_rate": 0.00010257675567039979, "loss": 3.0371, "step": 1309 }, { "epoch": 0.49602423324498296, "grad_norm": 10.85464096069336, "learning_rate": 0.00010245693127946112, "loss": 2.3844, "step": 1310 }, { "epoch": 0.49640287769784175, "grad_norm": 11.257962226867676, "learning_rate": 0.0001023371033586529, "loss": 2.1763, "step": 1311 }, { "epoch": 0.4967815221507005, "grad_norm": 10.673297882080078, "learning_rate": 0.00010221727208013166, "loss": 1.9263, "step": 1312 }, { "epoch": 0.4971601666035593, "grad_norm": 14.040605545043945, "learning_rate": 0.00010209743761605885, "loss": 2.7561, "step": 1313 }, { "epoch": 0.497538811056418, "grad_norm": 13.651562690734863, "learning_rate": 0.00010197760013860047, "loss": 2.1574, "step": 1314 }, { "epoch": 0.4979174555092768, "grad_norm": 13.463566780090332, "learning_rate": 0.00010185775981992689, "loss": 2.1069, "step": 1315 }, { "epoch": 0.49829609996213553, "grad_norm": 11.810751914978027, "learning_rate": 0.00010173791683221244, "loss": 1.9149, "step": 1316 }, { "epoch": 0.4986747444149943, "grad_norm": 19.515695571899414, "learning_rate": 0.00010161807134763543, "loss": 3.2127, "step": 1317 }, { "epoch": 0.4990533888678531, "grad_norm": 19.75203514099121, "learning_rate": 0.00010149822353837768, "loss": 1.3851, "step": 1318 }, { "epoch": 0.49943203332071184, "grad_norm": 16.31900978088379, "learning_rate": 0.00010137837357662432, "loss": 2.0814, "step": 1319 }, { "epoch": 0.49981067777357063, "grad_norm": 16.237138748168945, "learning_rate": 0.00010125852163456368, "loss": 2.0635, "step": 1320 }, { "epoch": 0.5001893222264294, "grad_norm": 17.72742462158203, "learning_rate": 0.00010113866788438684, "loss": 1.084, "step": 1321 }, { "epoch": 0.5005679666792882, "grad_norm": 15.087898254394531, "learning_rate": 0.00010101881249828748, "loss": 1.5248, "step": 1322 }, { "epoch": 0.5005679666792882, "eval_loss": 0.27747318148612976, "eval_runtime": 896.6071, "eval_samples_per_second": 4.961, "eval_steps_per_second": 1.24, "step": 1322 }, { "epoch": 0.5009466111321469, "grad_norm": 23.188968658447266, "learning_rate": 0.00010089895564846173, "loss": 1.7491, "step": 1323 }, { "epoch": 0.5013252555850056, "grad_norm": 19.608421325683594, "learning_rate": 0.00010077909750710766, "loss": 1.5236, "step": 1324 }, { "epoch": 0.5017039000378645, "grad_norm": 18.046968460083008, "learning_rate": 0.00010065923824642538, "loss": 1.359, "step": 1325 }, { "epoch": 0.5020825444907232, "grad_norm": 10.796852111816406, "learning_rate": 0.00010053937803861644, "loss": 3.2132, "step": 1326 }, { "epoch": 0.5024611889435819, "grad_norm": 10.203907012939453, "learning_rate": 0.00010041951705588388, "loss": 2.4803, "step": 1327 }, { "epoch": 0.5028398333964408, "grad_norm": 12.127469062805176, "learning_rate": 0.00010029965547043174, "loss": 2.9674, "step": 1328 }, { "epoch": 0.5032184778492995, "grad_norm": 14.912565231323242, "learning_rate": 0.00010017979345446506, "loss": 3.5652, "step": 1329 }, { "epoch": 0.5035971223021583, "grad_norm": 11.530462265014648, "learning_rate": 0.00010005993118018937, "loss": 2.9292, "step": 1330 }, { "epoch": 0.503975766755017, "grad_norm": 12.469483375549316, "learning_rate": 9.994006881981064e-05, "loss": 2.2923, "step": 1331 }, { "epoch": 0.5043544112078758, "grad_norm": 13.778362274169922, "learning_rate": 9.982020654553498e-05, "loss": 2.5739, "step": 1332 }, { "epoch": 0.5047330556607346, "grad_norm": 10.92608642578125, "learning_rate": 9.970034452956826e-05, "loss": 2.2934, "step": 1333 }, { "epoch": 0.5051117001135933, "grad_norm": 12.248685836791992, "learning_rate": 9.958048294411615e-05, "loss": 1.5193, "step": 1334 }, { "epoch": 0.5054903445664521, "grad_norm": 12.408686637878418, "learning_rate": 9.94606219613836e-05, "loss": 2.4581, "step": 1335 }, { "epoch": 0.5058689890193109, "grad_norm": 9.72839641571045, "learning_rate": 9.934076175357467e-05, "loss": 1.5408, "step": 1336 }, { "epoch": 0.5062476334721696, "grad_norm": 13.062264442443848, "learning_rate": 9.922090249289234e-05, "loss": 2.2559, "step": 1337 }, { "epoch": 0.5066262779250283, "grad_norm": 12.8737211227417, "learning_rate": 9.910104435153831e-05, "loss": 1.9116, "step": 1338 }, { "epoch": 0.5070049223778872, "grad_norm": 13.710526466369629, "learning_rate": 9.898118750171254e-05, "loss": 1.9169, "step": 1339 }, { "epoch": 0.5073835668307459, "grad_norm": 13.345243453979492, "learning_rate": 9.886133211561321e-05, "loss": 1.7205, "step": 1340 }, { "epoch": 0.5077622112836047, "grad_norm": 16.378582000732422, "learning_rate": 9.874147836543634e-05, "loss": 2.4532, "step": 1341 }, { "epoch": 0.5081408557364635, "grad_norm": 14.343537330627441, "learning_rate": 9.86216264233757e-05, "loss": 1.9869, "step": 1342 }, { "epoch": 0.5085195001893222, "grad_norm": 13.579465866088867, "learning_rate": 9.850177646162236e-05, "loss": 1.3558, "step": 1343 }, { "epoch": 0.508898144642181, "grad_norm": 13.948097229003906, "learning_rate": 9.83819286523646e-05, "loss": 1.8276, "step": 1344 }, { "epoch": 0.5092767890950397, "grad_norm": 11.791662216186523, "learning_rate": 9.826208316778756e-05, "loss": 1.6347, "step": 1345 }, { "epoch": 0.5096554335478986, "grad_norm": 15.745250701904297, "learning_rate": 9.814224018007315e-05, "loss": 1.7317, "step": 1346 }, { "epoch": 0.5100340780007573, "grad_norm": 13.366907119750977, "learning_rate": 9.802239986139954e-05, "loss": 1.242, "step": 1347 }, { "epoch": 0.510412722453616, "grad_norm": 14.714960098266602, "learning_rate": 9.790256238394117e-05, "loss": 1.2925, "step": 1348 }, { "epoch": 0.5107913669064749, "grad_norm": 21.305391311645508, "learning_rate": 9.778272791986835e-05, "loss": 1.7974, "step": 1349 }, { "epoch": 0.5111700113593336, "grad_norm": 20.41576385498047, "learning_rate": 9.766289664134712e-05, "loss": 1.5471, "step": 1350 }, { "epoch": 0.5115486558121923, "grad_norm": 9.854787826538086, "learning_rate": 9.754306872053889e-05, "loss": 3.6666, "step": 1351 }, { "epoch": 0.5119273002650511, "grad_norm": 13.436993598937988, "learning_rate": 9.742324432960025e-05, "loss": 3.8502, "step": 1352 }, { "epoch": 0.5123059447179099, "grad_norm": 11.966554641723633, "learning_rate": 9.730342364068269e-05, "loss": 2.7924, "step": 1353 }, { "epoch": 0.5126845891707686, "grad_norm": 11.301067352294922, "learning_rate": 9.718360682593249e-05, "loss": 2.8969, "step": 1354 }, { "epoch": 0.5130632336236274, "grad_norm": 12.656258583068848, "learning_rate": 9.706379405749027e-05, "loss": 2.7867, "step": 1355 }, { "epoch": 0.5134418780764862, "grad_norm": 12.093953132629395, "learning_rate": 9.694398550749084e-05, "loss": 2.1031, "step": 1356 }, { "epoch": 0.513820522529345, "grad_norm": 14.962082862854004, "learning_rate": 9.682418134806294e-05, "loss": 3.3662, "step": 1357 }, { "epoch": 0.5141991669822037, "grad_norm": 10.993176460266113, "learning_rate": 9.670438175132913e-05, "loss": 2.5581, "step": 1358 }, { "epoch": 0.5145778114350624, "grad_norm": 12.125143051147461, "learning_rate": 9.658458688940525e-05, "loss": 2.7468, "step": 1359 }, { "epoch": 0.5149564558879213, "grad_norm": 10.39024543762207, "learning_rate": 9.646479693440042e-05, "loss": 2.3151, "step": 1360 }, { "epoch": 0.51533510034078, "grad_norm": 10.704225540161133, "learning_rate": 9.634501205841663e-05, "loss": 1.7652, "step": 1361 }, { "epoch": 0.5157137447936387, "grad_norm": 11.33167552947998, "learning_rate": 9.622523243354873e-05, "loss": 2.4141, "step": 1362 }, { "epoch": 0.5160923892464976, "grad_norm": 12.553760528564453, "learning_rate": 9.61054582318839e-05, "loss": 2.2951, "step": 1363 }, { "epoch": 0.5164710336993563, "grad_norm": 10.5225191116333, "learning_rate": 9.598568962550156e-05, "loss": 1.6068, "step": 1364 }, { "epoch": 0.516849678152215, "grad_norm": 11.86349868774414, "learning_rate": 9.586592678647306e-05, "loss": 2.167, "step": 1365 }, { "epoch": 0.5172283226050738, "grad_norm": 10.950725555419922, "learning_rate": 9.574616988686156e-05, "loss": 1.7917, "step": 1366 }, { "epoch": 0.5176069670579326, "grad_norm": 17.839439392089844, "learning_rate": 9.562641909872157e-05, "loss": 1.8092, "step": 1367 }, { "epoch": 0.5179856115107914, "grad_norm": 15.295711517333984, "learning_rate": 9.55066745940989e-05, "loss": 1.6489, "step": 1368 }, { "epoch": 0.5183642559636501, "grad_norm": 13.916023254394531, "learning_rate": 9.538693654503027e-05, "loss": 1.3487, "step": 1369 }, { "epoch": 0.518742900416509, "grad_norm": 17.014102935791016, "learning_rate": 9.526720512354321e-05, "loss": 2.022, "step": 1370 }, { "epoch": 0.5191215448693677, "grad_norm": 17.49681282043457, "learning_rate": 9.514748050165568e-05, "loss": 1.6427, "step": 1371 }, { "epoch": 0.5195001893222264, "grad_norm": 18.096622467041016, "learning_rate": 9.502776285137582e-05, "loss": 1.9191, "step": 1372 }, { "epoch": 0.5198788337750851, "grad_norm": 17.069625854492188, "learning_rate": 9.49080523447018e-05, "loss": 1.9726, "step": 1373 }, { "epoch": 0.520257478227944, "grad_norm": 29.278518676757812, "learning_rate": 9.478834915362158e-05, "loss": 1.7824, "step": 1374 }, { "epoch": 0.5206361226808027, "grad_norm": 33.05897521972656, "learning_rate": 9.466865345011256e-05, "loss": 0.8399, "step": 1375 }, { "epoch": 0.5210147671336615, "grad_norm": 10.60912799835205, "learning_rate": 9.454896540614137e-05, "loss": 3.4099, "step": 1376 }, { "epoch": 0.5213934115865203, "grad_norm": 11.779023170471191, "learning_rate": 9.442928519366358e-05, "loss": 3.1778, "step": 1377 }, { "epoch": 0.521772056039379, "grad_norm": 11.27571964263916, "learning_rate": 9.43096129846237e-05, "loss": 3.0483, "step": 1378 }, { "epoch": 0.5221507004922378, "grad_norm": 12.95877742767334, "learning_rate": 9.418994895095455e-05, "loss": 3.6757, "step": 1379 }, { "epoch": 0.5225293449450965, "grad_norm": 10.566274642944336, "learning_rate": 9.407029326457727e-05, "loss": 1.876, "step": 1380 }, { "epoch": 0.5229079893979554, "grad_norm": 9.094916343688965, "learning_rate": 9.395064609740098e-05, "loss": 2.1696, "step": 1381 }, { "epoch": 0.5232866338508141, "grad_norm": 11.233878135681152, "learning_rate": 9.383100762132268e-05, "loss": 2.6414, "step": 1382 }, { "epoch": 0.5236652783036728, "grad_norm": 10.057049751281738, "learning_rate": 9.371137800822676e-05, "loss": 2.6053, "step": 1383 }, { "epoch": 0.5240439227565317, "grad_norm": 11.56207275390625, "learning_rate": 9.359175742998487e-05, "loss": 2.5873, "step": 1384 }, { "epoch": 0.5244225672093904, "grad_norm": 11.773988723754883, "learning_rate": 9.347214605845572e-05, "loss": 2.0471, "step": 1385 }, { "epoch": 0.5248012116622491, "grad_norm": 10.886469841003418, "learning_rate": 9.335254406548485e-05, "loss": 2.4144, "step": 1386 }, { "epoch": 0.5251798561151079, "grad_norm": 11.620787620544434, "learning_rate": 9.323295162290426e-05, "loss": 2.5279, "step": 1387 }, { "epoch": 0.5255585005679667, "grad_norm": 11.3408842086792, "learning_rate": 9.311336890253222e-05, "loss": 2.411, "step": 1388 }, { "epoch": 0.5259371450208254, "grad_norm": 10.297867774963379, "learning_rate": 9.2993796076173e-05, "loss": 1.4556, "step": 1389 }, { "epoch": 0.5263157894736842, "grad_norm": 12.511656761169434, "learning_rate": 9.28742333156168e-05, "loss": 2.3806, "step": 1390 }, { "epoch": 0.526694433926543, "grad_norm": 11.93029499053955, "learning_rate": 9.275468079263918e-05, "loss": 1.439, "step": 1391 }, { "epoch": 0.5270730783794018, "grad_norm": 15.259188652038574, "learning_rate": 9.263513867900113e-05, "loss": 2.1696, "step": 1392 }, { "epoch": 0.5274517228322605, "grad_norm": 15.692005157470703, "learning_rate": 9.25156071464486e-05, "loss": 2.3041, "step": 1393 }, { "epoch": 0.5278303672851192, "grad_norm": 17.192949295043945, "learning_rate": 9.239608636671241e-05, "loss": 1.9343, "step": 1394 }, { "epoch": 0.5282090117379781, "grad_norm": 18.41315269470215, "learning_rate": 9.227657651150785e-05, "loss": 2.4181, "step": 1395 }, { "epoch": 0.5285876561908368, "grad_norm": 16.52252197265625, "learning_rate": 9.215707775253459e-05, "loss": 1.3533, "step": 1396 }, { "epoch": 0.5289663006436955, "grad_norm": 17.064762115478516, "learning_rate": 9.203759026147623e-05, "loss": 1.4858, "step": 1397 }, { "epoch": 0.5293449450965544, "grad_norm": 17.58762550354004, "learning_rate": 9.19181142100004e-05, "loss": 1.5188, "step": 1398 }, { "epoch": 0.5297235895494131, "grad_norm": 13.186128616333008, "learning_rate": 9.17986497697581e-05, "loss": 0.8084, "step": 1399 }, { "epoch": 0.5301022340022719, "grad_norm": 14.67434310913086, "learning_rate": 9.167919711238375e-05, "loss": 0.9687, "step": 1400 }, { "epoch": 0.5304808784551306, "grad_norm": 9.912325859069824, "learning_rate": 9.155975640949474e-05, "loss": 3.0574, "step": 1401 }, { "epoch": 0.5308595229079894, "grad_norm": 11.234810829162598, "learning_rate": 9.144032783269145e-05, "loss": 3.4685, "step": 1402 }, { "epoch": 0.5312381673608482, "grad_norm": 10.217662811279297, "learning_rate": 9.132091155355669e-05, "loss": 2.3554, "step": 1403 }, { "epoch": 0.5316168118137069, "grad_norm": 12.113655090332031, "learning_rate": 9.120150774365566e-05, "loss": 2.9232, "step": 1404 }, { "epoch": 0.5319954562665657, "grad_norm": 11.836487770080566, "learning_rate": 9.108211657453561e-05, "loss": 2.3512, "step": 1405 }, { "epoch": 0.5323741007194245, "grad_norm": 12.795507431030273, "learning_rate": 9.09627382177257e-05, "loss": 2.0436, "step": 1406 }, { "epoch": 0.5327527451722832, "grad_norm": 13.872608184814453, "learning_rate": 9.084337284473666e-05, "loss": 2.8823, "step": 1407 }, { "epoch": 0.533131389625142, "grad_norm": 11.411468505859375, "learning_rate": 9.072402062706052e-05, "loss": 2.4558, "step": 1408 }, { "epoch": 0.5335100340780008, "grad_norm": 11.590777397155762, "learning_rate": 9.060468173617037e-05, "loss": 2.7861, "step": 1409 }, { "epoch": 0.5338886785308595, "grad_norm": 12.85844898223877, "learning_rate": 9.048535634352035e-05, "loss": 2.7793, "step": 1410 }, { "epoch": 0.5342673229837183, "grad_norm": 11.9672212600708, "learning_rate": 9.036604462054499e-05, "loss": 2.2917, "step": 1411 }, { "epoch": 0.5346459674365771, "grad_norm": 10.404266357421875, "learning_rate": 9.024674673865931e-05, "loss": 1.8752, "step": 1412 }, { "epoch": 0.5350246118894358, "grad_norm": 13.43481159210205, "learning_rate": 9.012746286925837e-05, "loss": 2.5337, "step": 1413 }, { "epoch": 0.5354032563422946, "grad_norm": 11.364248275756836, "learning_rate": 9.000819318371716e-05, "loss": 1.8052, "step": 1414 }, { "epoch": 0.5357819007951533, "grad_norm": 16.928659439086914, "learning_rate": 8.988893785339023e-05, "loss": 3.3743, "step": 1415 }, { "epoch": 0.5361605452480122, "grad_norm": 13.398554801940918, "learning_rate": 8.976969704961158e-05, "loss": 2.1919, "step": 1416 }, { "epoch": 0.5365391897008709, "grad_norm": 12.915987968444824, "learning_rate": 8.965047094369425e-05, "loss": 1.6691, "step": 1417 }, { "epoch": 0.5369178341537296, "grad_norm": 14.154939651489258, "learning_rate": 8.953125970693027e-05, "loss": 2.0216, "step": 1418 }, { "epoch": 0.5372964786065885, "grad_norm": 12.075900077819824, "learning_rate": 8.941206351059022e-05, "loss": 1.3928, "step": 1419 }, { "epoch": 0.5376751230594472, "grad_norm": 16.365467071533203, "learning_rate": 8.929288252592312e-05, "loss": 1.6521, "step": 1420 }, { "epoch": 0.5380537675123059, "grad_norm": 16.188310623168945, "learning_rate": 8.917371692415604e-05, "loss": 2.1927, "step": 1421 }, { "epoch": 0.5384324119651647, "grad_norm": 18.18952178955078, "learning_rate": 8.905456687649413e-05, "loss": 1.5676, "step": 1422 }, { "epoch": 0.5388110564180235, "grad_norm": 15.597309112548828, "learning_rate": 8.893543255412005e-05, "loss": 1.2254, "step": 1423 }, { "epoch": 0.5391897008708822, "grad_norm": 23.57103729248047, "learning_rate": 8.881631412819391e-05, "loss": 1.8856, "step": 1424 }, { "epoch": 0.539568345323741, "grad_norm": 37.834720611572266, "learning_rate": 8.869721176985292e-05, "loss": 3.2871, "step": 1425 }, { "epoch": 0.5399469897765998, "grad_norm": 10.20251178741455, "learning_rate": 8.857812565021138e-05, "loss": 3.8475, "step": 1426 }, { "epoch": 0.5403256342294586, "grad_norm": 10.686601638793945, "learning_rate": 8.845905594036005e-05, "loss": 3.0081, "step": 1427 }, { "epoch": 0.5407042786823173, "grad_norm": 11.162152290344238, "learning_rate": 8.834000281136628e-05, "loss": 2.5349, "step": 1428 }, { "epoch": 0.541082923135176, "grad_norm": 14.102071762084961, "learning_rate": 8.822096643427342e-05, "loss": 4.0784, "step": 1429 }, { "epoch": 0.5414615675880349, "grad_norm": 13.380843162536621, "learning_rate": 8.810194698010099e-05, "loss": 2.542, "step": 1430 }, { "epoch": 0.5418402120408936, "grad_norm": 11.970414161682129, "learning_rate": 8.798294461984399e-05, "loss": 2.5757, "step": 1431 }, { "epoch": 0.5422188564937523, "grad_norm": 14.05677604675293, "learning_rate": 8.786395952447295e-05, "loss": 2.4476, "step": 1432 }, { "epoch": 0.5425975009466111, "grad_norm": 10.139318466186523, "learning_rate": 8.774499186493355e-05, "loss": 2.0435, "step": 1433 }, { "epoch": 0.5429761453994699, "grad_norm": 11.968459129333496, "learning_rate": 8.762604181214654e-05, "loss": 1.9219, "step": 1434 }, { "epoch": 0.5433547898523287, "grad_norm": 11.334850311279297, "learning_rate": 8.750710953700722e-05, "loss": 1.8097, "step": 1435 }, { "epoch": 0.5437334343051874, "grad_norm": 13.664094924926758, "learning_rate": 8.738819521038544e-05, "loss": 2.3751, "step": 1436 }, { "epoch": 0.5441120787580462, "grad_norm": 11.123648643493652, "learning_rate": 8.726929900312522e-05, "loss": 2.1468, "step": 1437 }, { "epoch": 0.544490723210905, "grad_norm": 13.197842597961426, "learning_rate": 8.715042108604459e-05, "loss": 1.3656, "step": 1438 }, { "epoch": 0.5448693676637637, "grad_norm": 14.880546569824219, "learning_rate": 8.703156162993524e-05, "loss": 2.4551, "step": 1439 }, { "epoch": 0.5452480121166224, "grad_norm": 15.34524154663086, "learning_rate": 8.691272080556245e-05, "loss": 2.3583, "step": 1440 }, { "epoch": 0.5456266565694813, "grad_norm": 15.10900592803955, "learning_rate": 8.67938987836646e-05, "loss": 2.0645, "step": 1441 }, { "epoch": 0.54600530102234, "grad_norm": 11.277461051940918, "learning_rate": 8.667509573495313e-05, "loss": 1.2385, "step": 1442 }, { "epoch": 0.5463839454751988, "grad_norm": 15.505908012390137, "learning_rate": 8.655631183011223e-05, "loss": 2.1701, "step": 1443 }, { "epoch": 0.5467625899280576, "grad_norm": 14.985272407531738, "learning_rate": 8.643754723979855e-05, "loss": 1.8104, "step": 1444 }, { "epoch": 0.5471412343809163, "grad_norm": 13.010786056518555, "learning_rate": 8.631880213464095e-05, "loss": 1.5154, "step": 1445 }, { "epoch": 0.5475198788337751, "grad_norm": 19.537506103515625, "learning_rate": 8.620007668524041e-05, "loss": 1.8402, "step": 1446 }, { "epoch": 0.5478985232866338, "grad_norm": 14.794095039367676, "learning_rate": 8.608137106216963e-05, "loss": 1.4104, "step": 1447 }, { "epoch": 0.5482771677394926, "grad_norm": 15.571112632751465, "learning_rate": 8.596268543597277e-05, "loss": 1.0502, "step": 1448 }, { "epoch": 0.5486558121923514, "grad_norm": 17.556333541870117, "learning_rate": 8.584401997716524e-05, "loss": 1.0116, "step": 1449 }, { "epoch": 0.5490344566452101, "grad_norm": 17.548202514648438, "learning_rate": 8.572537485623366e-05, "loss": 1.6556, "step": 1450 }, { "epoch": 0.549413101098069, "grad_norm": 8.829911231994629, "learning_rate": 8.560675024363521e-05, "loss": 3.0078, "step": 1451 }, { "epoch": 0.5497917455509277, "grad_norm": 10.047151565551758, "learning_rate": 8.548814630979774e-05, "loss": 2.7423, "step": 1452 }, { "epoch": 0.5501703900037864, "grad_norm": 11.719676971435547, "learning_rate": 8.536956322511927e-05, "loss": 2.5747, "step": 1453 }, { "epoch": 0.5505490344566452, "grad_norm": 11.917893409729004, "learning_rate": 8.525100115996806e-05, "loss": 2.9659, "step": 1454 }, { "epoch": 0.550927678909504, "grad_norm": 9.511363983154297, "learning_rate": 8.5132460284682e-05, "loss": 1.793, "step": 1455 }, { "epoch": 0.5513063233623627, "grad_norm": 12.761884689331055, "learning_rate": 8.501394076956862e-05, "loss": 3.1682, "step": 1456 }, { "epoch": 0.5516849678152215, "grad_norm": 11.228672981262207, "learning_rate": 8.489544278490463e-05, "loss": 1.9256, "step": 1457 }, { "epoch": 0.5520636122680803, "grad_norm": 12.580592155456543, "learning_rate": 8.477696650093605e-05, "loss": 2.1031, "step": 1458 }, { "epoch": 0.552442256720939, "grad_norm": 12.473135948181152, "learning_rate": 8.465851208787752e-05, "loss": 1.6606, "step": 1459 }, { "epoch": 0.5528209011737978, "grad_norm": 14.68351936340332, "learning_rate": 8.454007971591234e-05, "loss": 2.2417, "step": 1460 }, { "epoch": 0.5531995456266565, "grad_norm": 10.24385929107666, "learning_rate": 8.442166955519209e-05, "loss": 1.5332, "step": 1461 }, { "epoch": 0.5535781900795154, "grad_norm": 11.53762149810791, "learning_rate": 8.430328177583652e-05, "loss": 1.5537, "step": 1462 }, { "epoch": 0.5539568345323741, "grad_norm": 12.351707458496094, "learning_rate": 8.418491654793314e-05, "loss": 1.409, "step": 1463 }, { "epoch": 0.5543354789852328, "grad_norm": 12.86665153503418, "learning_rate": 8.406657404153716e-05, "loss": 2.2676, "step": 1464 }, { "epoch": 0.5547141234380917, "grad_norm": 13.24296760559082, "learning_rate": 8.394825442667099e-05, "loss": 1.3261, "step": 1465 }, { "epoch": 0.5550927678909504, "grad_norm": 16.09861946105957, "learning_rate": 8.382995787332435e-05, "loss": 1.7134, "step": 1466 }, { "epoch": 0.5554714123438091, "grad_norm": 16.746488571166992, "learning_rate": 8.371168455145369e-05, "loss": 2.2622, "step": 1467 }, { "epoch": 0.5558500567966679, "grad_norm": 16.900218963623047, "learning_rate": 8.359343463098211e-05, "loss": 2.0996, "step": 1468 }, { "epoch": 0.5562287012495267, "grad_norm": 12.299836158752441, "learning_rate": 8.347520828179904e-05, "loss": 1.431, "step": 1469 }, { "epoch": 0.5566073457023855, "grad_norm": 17.264678955078125, "learning_rate": 8.335700567376022e-05, "loss": 1.8366, "step": 1470 }, { "epoch": 0.5569859901552442, "grad_norm": 12.874016761779785, "learning_rate": 8.32388269766871e-05, "loss": 1.3755, "step": 1471 }, { "epoch": 0.557364634608103, "grad_norm": 20.252826690673828, "learning_rate": 8.312067236036686e-05, "loss": 1.6212, "step": 1472 }, { "epoch": 0.5577432790609618, "grad_norm": 22.93166732788086, "learning_rate": 8.300254199455202e-05, "loss": 1.1822, "step": 1473 }, { "epoch": 0.5581219235138205, "grad_norm": 20.5257568359375, "learning_rate": 8.288443604896037e-05, "loss": 1.6172, "step": 1474 }, { "epoch": 0.5585005679666792, "grad_norm": 26.587392807006836, "learning_rate": 8.276635469327453e-05, "loss": 1.2758, "step": 1475 }, { "epoch": 0.5588792124195381, "grad_norm": 9.38240909576416, "learning_rate": 8.264829809714179e-05, "loss": 3.7325, "step": 1476 }, { "epoch": 0.5592578568723968, "grad_norm": 11.468181610107422, "learning_rate": 8.253026643017387e-05, "loss": 2.9115, "step": 1477 }, { "epoch": 0.5596365013252556, "grad_norm": 11.618803977966309, "learning_rate": 8.241225986194678e-05, "loss": 2.391, "step": 1478 }, { "epoch": 0.5600151457781144, "grad_norm": 9.317463874816895, "learning_rate": 8.22942785620003e-05, "loss": 2.0636, "step": 1479 }, { "epoch": 0.5603937902309731, "grad_norm": 12.376789093017578, "learning_rate": 8.217632269983805e-05, "loss": 2.3723, "step": 1480 }, { "epoch": 0.5607724346838319, "grad_norm": 11.691132545471191, "learning_rate": 8.205839244492696e-05, "loss": 2.7211, "step": 1481 }, { "epoch": 0.5611510791366906, "grad_norm": 11.633959770202637, "learning_rate": 8.194048796669735e-05, "loss": 2.0301, "step": 1482 }, { "epoch": 0.5615297235895494, "grad_norm": 14.521055221557617, "learning_rate": 8.18226094345424e-05, "loss": 2.3052, "step": 1483 }, { "epoch": 0.5619083680424082, "grad_norm": 12.353595733642578, "learning_rate": 8.1704757017818e-05, "loss": 2.3252, "step": 1484 }, { "epoch": 0.5622870124952669, "grad_norm": 11.810384750366211, "learning_rate": 8.158693088584249e-05, "loss": 2.1598, "step": 1485 }, { "epoch": 0.5626656569481258, "grad_norm": 12.60114574432373, "learning_rate": 8.146913120789661e-05, "loss": 1.9394, "step": 1486 }, { "epoch": 0.5630443014009845, "grad_norm": 12.273627281188965, "learning_rate": 8.13513581532229e-05, "loss": 2.1709, "step": 1487 }, { "epoch": 0.5634229458538432, "grad_norm": 11.343994140625, "learning_rate": 8.12336118910258e-05, "loss": 1.5691, "step": 1488 }, { "epoch": 0.563801590306702, "grad_norm": 15.734774589538574, "learning_rate": 8.111589259047114e-05, "loss": 2.3682, "step": 1489 }, { "epoch": 0.5641802347595608, "grad_norm": 10.958934783935547, "learning_rate": 8.099820042068611e-05, "loss": 1.8171, "step": 1490 }, { "epoch": 0.5645588792124195, "grad_norm": 13.153830528259277, "learning_rate": 8.088053555075888e-05, "loss": 1.7173, "step": 1491 }, { "epoch": 0.5649375236652783, "grad_norm": 12.882039070129395, "learning_rate": 8.076289814973835e-05, "loss": 1.6263, "step": 1492 }, { "epoch": 0.5653161681181371, "grad_norm": 13.913519859313965, "learning_rate": 8.0645288386634e-05, "loss": 1.6379, "step": 1493 }, { "epoch": 0.5656948125709959, "grad_norm": 14.35728931427002, "learning_rate": 8.052770643041567e-05, "loss": 1.8375, "step": 1494 }, { "epoch": 0.5660734570238546, "grad_norm": 14.613945960998535, "learning_rate": 8.041015245001317e-05, "loss": 1.6447, "step": 1495 }, { "epoch": 0.5664521014767133, "grad_norm": 12.095513343811035, "learning_rate": 8.029262661431611e-05, "loss": 1.1415, "step": 1496 }, { "epoch": 0.5668307459295722, "grad_norm": 12.972216606140137, "learning_rate": 8.017512909217363e-05, "loss": 1.2514, "step": 1497 }, { "epoch": 0.5672093903824309, "grad_norm": 13.936952590942383, "learning_rate": 8.005766005239437e-05, "loss": 0.6841, "step": 1498 }, { "epoch": 0.5675880348352896, "grad_norm": 16.832914352416992, "learning_rate": 7.994021966374585e-05, "loss": 1.1084, "step": 1499 }, { "epoch": 0.5679666792881485, "grad_norm": 18.750106811523438, "learning_rate": 7.982280809495454e-05, "loss": 1.3188, "step": 1500 }, { "epoch": 0.5683453237410072, "grad_norm": 9.141880989074707, "learning_rate": 7.97054255147054e-05, "loss": 2.9659, "step": 1501 }, { "epoch": 0.568723968193866, "grad_norm": 12.191515922546387, "learning_rate": 7.958807209164191e-05, "loss": 4.0187, "step": 1502 }, { "epoch": 0.5691026126467247, "grad_norm": 11.993229866027832, "learning_rate": 7.947074799436551e-05, "loss": 3.2366, "step": 1503 }, { "epoch": 0.5694812570995835, "grad_norm": 10.854785919189453, "learning_rate": 7.935345339143559e-05, "loss": 2.6519, "step": 1504 }, { "epoch": 0.5698599015524423, "grad_norm": 12.944028854370117, "learning_rate": 7.923618845136905e-05, "loss": 2.6492, "step": 1505 }, { "epoch": 0.570238546005301, "grad_norm": 9.960305213928223, "learning_rate": 7.911895334264037e-05, "loss": 1.684, "step": 1506 }, { "epoch": 0.5706171904581598, "grad_norm": 11.914902687072754, "learning_rate": 7.900174823368101e-05, "loss": 2.3815, "step": 1507 }, { "epoch": 0.5709958349110186, "grad_norm": 11.254213333129883, "learning_rate": 7.888457329287937e-05, "loss": 2.1002, "step": 1508 }, { "epoch": 0.5713744793638773, "grad_norm": 11.244025230407715, "learning_rate": 7.876742868858051e-05, "loss": 1.655, "step": 1509 }, { "epoch": 0.571753123816736, "grad_norm": 11.511517524719238, "learning_rate": 7.865031458908596e-05, "loss": 2.0051, "step": 1510 }, { "epoch": 0.5721317682695949, "grad_norm": 11.663280487060547, "learning_rate": 7.853323116265332e-05, "loss": 2.0539, "step": 1511 }, { "epoch": 0.5725104127224536, "grad_norm": 12.518500328063965, "learning_rate": 7.841617857749622e-05, "loss": 1.7307, "step": 1512 }, { "epoch": 0.5728890571753124, "grad_norm": 11.354671478271484, "learning_rate": 7.82991570017839e-05, "loss": 1.709, "step": 1513 }, { "epoch": 0.5732677016281712, "grad_norm": 15.118674278259277, "learning_rate": 7.818216660364115e-05, "loss": 2.3183, "step": 1514 }, { "epoch": 0.5736463460810299, "grad_norm": 12.973912239074707, "learning_rate": 7.806520755114784e-05, "loss": 1.3712, "step": 1515 }, { "epoch": 0.5740249905338887, "grad_norm": 9.847899436950684, "learning_rate": 7.79482800123389e-05, "loss": 0.9702, "step": 1516 }, { "epoch": 0.5744036349867474, "grad_norm": 14.514501571655273, "learning_rate": 7.783138415520391e-05, "loss": 1.8236, "step": 1517 }, { "epoch": 0.5747822794396062, "grad_norm": 17.328170776367188, "learning_rate": 7.771452014768707e-05, "loss": 1.4498, "step": 1518 }, { "epoch": 0.575160923892465, "grad_norm": 17.00506591796875, "learning_rate": 7.759768815768666e-05, "loss": 2.0773, "step": 1519 }, { "epoch": 0.5755395683453237, "grad_norm": 26.234683990478516, "learning_rate": 7.748088835305504e-05, "loss": 2.1469, "step": 1520 }, { "epoch": 0.5759182127981826, "grad_norm": 17.365535736083984, "learning_rate": 7.73641209015983e-05, "loss": 1.283, "step": 1521 }, { "epoch": 0.5762968572510413, "grad_norm": 17.68212890625, "learning_rate": 7.724738597107613e-05, "loss": 1.5122, "step": 1522 }, { "epoch": 0.5766755017039, "grad_norm": 21.17378807067871, "learning_rate": 7.71306837292014e-05, "loss": 1.8685, "step": 1523 }, { "epoch": 0.5770541461567588, "grad_norm": 19.040849685668945, "learning_rate": 7.701401434364004e-05, "loss": 1.5123, "step": 1524 }, { "epoch": 0.5774327906096176, "grad_norm": 29.659151077270508, "learning_rate": 7.689737798201077e-05, "loss": 1.8413, "step": 1525 }, { "epoch": 0.5778114350624763, "grad_norm": 8.76536750793457, "learning_rate": 7.678077481188492e-05, "loss": 2.4995, "step": 1526 }, { "epoch": 0.5781900795153351, "grad_norm": 10.525693893432617, "learning_rate": 7.66642050007861e-05, "loss": 3.0242, "step": 1527 }, { "epoch": 0.5785687239681939, "grad_norm": 12.041314125061035, "learning_rate": 7.654766871618996e-05, "loss": 2.677, "step": 1528 }, { "epoch": 0.5789473684210527, "grad_norm": 12.015291213989258, "learning_rate": 7.6431166125524e-05, "loss": 2.6627, "step": 1529 }, { "epoch": 0.5793260128739114, "grad_norm": 11.851201057434082, "learning_rate": 7.631469739616736e-05, "loss": 2.3649, "step": 1530 }, { "epoch": 0.5797046573267701, "grad_norm": 12.085638999938965, "learning_rate": 7.619826269545047e-05, "loss": 2.0379, "step": 1531 }, { "epoch": 0.580083301779629, "grad_norm": 13.909358978271484, "learning_rate": 7.608186219065491e-05, "loss": 2.3182, "step": 1532 }, { "epoch": 0.5804619462324877, "grad_norm": 11.08741283416748, "learning_rate": 7.596549604901309e-05, "loss": 1.9734, "step": 1533 }, { "epoch": 0.5808405906853464, "grad_norm": 12.980990409851074, "learning_rate": 7.584916443770809e-05, "loss": 1.7537, "step": 1534 }, { "epoch": 0.5812192351382052, "grad_norm": 10.36605167388916, "learning_rate": 7.573286752387339e-05, "loss": 1.6055, "step": 1535 }, { "epoch": 0.581597879591064, "grad_norm": 10.553936004638672, "learning_rate": 7.561660547459254e-05, "loss": 1.097, "step": 1536 }, { "epoch": 0.5819765240439227, "grad_norm": 10.760160446166992, "learning_rate": 7.55003784568991e-05, "loss": 1.4099, "step": 1537 }, { "epoch": 0.5823551684967815, "grad_norm": 10.054794311523438, "learning_rate": 7.538418663777626e-05, "loss": 1.3054, "step": 1538 }, { "epoch": 0.5827338129496403, "grad_norm": 12.116494178771973, "learning_rate": 7.526803018415663e-05, "loss": 1.7737, "step": 1539 }, { "epoch": 0.5831124574024991, "grad_norm": 13.055445671081543, "learning_rate": 7.515190926292202e-05, "loss": 2.8606, "step": 1540 }, { "epoch": 0.5834911018553578, "grad_norm": 14.023700714111328, "learning_rate": 7.503582404090314e-05, "loss": 1.962, "step": 1541 }, { "epoch": 0.5838697463082165, "grad_norm": 10.624642372131348, "learning_rate": 7.491977468487954e-05, "loss": 1.5016, "step": 1542 }, { "epoch": 0.5842483907610754, "grad_norm": 13.45031452178955, "learning_rate": 7.480376136157911e-05, "loss": 0.6894, "step": 1543 }, { "epoch": 0.5846270352139341, "grad_norm": 24.013113021850586, "learning_rate": 7.468778423767806e-05, "loss": 2.5765, "step": 1544 }, { "epoch": 0.5850056796667928, "grad_norm": 13.491368293762207, "learning_rate": 7.457184347980048e-05, "loss": 0.8878, "step": 1545 }, { "epoch": 0.5853843241196517, "grad_norm": 14.426898956298828, "learning_rate": 7.445593925451836e-05, "loss": 1.2205, "step": 1546 }, { "epoch": 0.5857629685725104, "grad_norm": 19.010414123535156, "learning_rate": 7.434007172835113e-05, "loss": 1.7827, "step": 1547 }, { "epoch": 0.5861416130253692, "grad_norm": 15.635154724121094, "learning_rate": 7.422424106776548e-05, "loss": 1.3202, "step": 1548 }, { "epoch": 0.5865202574782279, "grad_norm": 12.9798583984375, "learning_rate": 7.410844743917508e-05, "loss": 0.5754, "step": 1549 }, { "epoch": 0.5868989019310867, "grad_norm": 8.101846694946289, "learning_rate": 7.399269100894061e-05, "loss": 0.4233, "step": 1550 }, { "epoch": 0.5872775463839455, "grad_norm": 9.274727821350098, "learning_rate": 7.387697194336907e-05, "loss": 3.147, "step": 1551 }, { "epoch": 0.5876561908368042, "grad_norm": 13.327276229858398, "learning_rate": 7.37612904087139e-05, "loss": 3.536, "step": 1552 }, { "epoch": 0.588034835289663, "grad_norm": 11.654951095581055, "learning_rate": 7.364564657117452e-05, "loss": 2.1161, "step": 1553 }, { "epoch": 0.5884134797425218, "grad_norm": 11.4492769241333, "learning_rate": 7.353004059689639e-05, "loss": 2.244, "step": 1554 }, { "epoch": 0.5887921241953805, "grad_norm": 11.104554176330566, "learning_rate": 7.341447265197038e-05, "loss": 2.1674, "step": 1555 }, { "epoch": 0.5891707686482393, "grad_norm": 11.553046226501465, "learning_rate": 7.329894290243278e-05, "loss": 2.1246, "step": 1556 }, { "epoch": 0.5895494131010981, "grad_norm": 13.842034339904785, "learning_rate": 7.318345151426502e-05, "loss": 2.2264, "step": 1557 }, { "epoch": 0.5899280575539568, "grad_norm": 16.452533721923828, "learning_rate": 7.306799865339342e-05, "loss": 2.8113, "step": 1558 }, { "epoch": 0.5903067020068156, "grad_norm": 11.77454662322998, "learning_rate": 7.295258448568894e-05, "loss": 1.6987, "step": 1559 }, { "epoch": 0.5906853464596744, "grad_norm": 12.484272956848145, "learning_rate": 7.28372091769669e-05, "loss": 1.822, "step": 1560 }, { "epoch": 0.5910639909125331, "grad_norm": 12.614217758178711, "learning_rate": 7.272187289298689e-05, "loss": 1.9588, "step": 1561 }, { "epoch": 0.5914426353653919, "grad_norm": 12.127996444702148, "learning_rate": 7.260657579945238e-05, "loss": 1.4436, "step": 1562 }, { "epoch": 0.5918212798182506, "grad_norm": 11.5969877243042, "learning_rate": 7.249131806201052e-05, "loss": 1.6575, "step": 1563 }, { "epoch": 0.5921999242711095, "grad_norm": 16.556608200073242, "learning_rate": 7.237609984625194e-05, "loss": 1.2706, "step": 1564 }, { "epoch": 0.5925785687239682, "grad_norm": 14.947774887084961, "learning_rate": 7.226092131771044e-05, "loss": 1.98, "step": 1565 }, { "epoch": 0.5929572131768269, "grad_norm": 11.194940567016602, "learning_rate": 7.214578264186292e-05, "loss": 1.2654, "step": 1566 }, { "epoch": 0.5933358576296858, "grad_norm": 14.525056838989258, "learning_rate": 7.203068398412891e-05, "loss": 1.8403, "step": 1567 }, { "epoch": 0.5937145020825445, "grad_norm": 20.65723419189453, "learning_rate": 7.191562550987048e-05, "loss": 2.7911, "step": 1568 }, { "epoch": 0.5940931465354032, "grad_norm": 15.444211959838867, "learning_rate": 7.180060738439194e-05, "loss": 1.6506, "step": 1569 }, { "epoch": 0.594471790988262, "grad_norm": 12.846336364746094, "learning_rate": 7.168562977293973e-05, "loss": 1.1754, "step": 1570 }, { "epoch": 0.5948504354411208, "grad_norm": 16.229555130004883, "learning_rate": 7.1570692840702e-05, "loss": 1.4905, "step": 1571 }, { "epoch": 0.5952290798939796, "grad_norm": 17.11097526550293, "learning_rate": 7.145579675280846e-05, "loss": 1.4694, "step": 1572 }, { "epoch": 0.5956077243468383, "grad_norm": 20.334720611572266, "learning_rate": 7.134094167433011e-05, "loss": 1.1872, "step": 1573 }, { "epoch": 0.5959863687996971, "grad_norm": 17.798227310180664, "learning_rate": 7.122612777027915e-05, "loss": 0.8351, "step": 1574 }, { "epoch": 0.5963650132525559, "grad_norm": 29.591724395751953, "learning_rate": 7.111135520560852e-05, "loss": 1.2727, "step": 1575 }, { "epoch": 0.5967436577054146, "grad_norm": 8.743229866027832, "learning_rate": 7.09966241452118e-05, "loss": 3.0307, "step": 1576 }, { "epoch": 0.5971223021582733, "grad_norm": 9.908708572387695, "learning_rate": 7.088193475392288e-05, "loss": 2.5491, "step": 1577 }, { "epoch": 0.5975009466111322, "grad_norm": 9.864290237426758, "learning_rate": 7.076728719651593e-05, "loss": 1.7458, "step": 1578 }, { "epoch": 0.5978795910639909, "grad_norm": 12.844700813293457, "learning_rate": 7.065268163770489e-05, "loss": 2.7917, "step": 1579 }, { "epoch": 0.5982582355168496, "grad_norm": 11.376795768737793, "learning_rate": 7.053811824214339e-05, "loss": 1.9335, "step": 1580 }, { "epoch": 0.5986368799697085, "grad_norm": 13.557278633117676, "learning_rate": 7.042359717442448e-05, "loss": 2.0026, "step": 1581 }, { "epoch": 0.5990155244225672, "grad_norm": 12.9830961227417, "learning_rate": 7.030911859908047e-05, "loss": 2.0402, "step": 1582 }, { "epoch": 0.599394168875426, "grad_norm": 13.103957176208496, "learning_rate": 7.019468268058253e-05, "loss": 1.8606, "step": 1583 }, { "epoch": 0.5997728133282847, "grad_norm": 11.742697715759277, "learning_rate": 7.008028958334054e-05, "loss": 1.7867, "step": 1584 }, { "epoch": 0.6001514577811435, "grad_norm": 12.962128639221191, "learning_rate": 6.996593947170292e-05, "loss": 1.8571, "step": 1585 }, { "epoch": 0.6005301022340023, "grad_norm": 13.344432830810547, "learning_rate": 6.985163250995635e-05, "loss": 2.1094, "step": 1586 }, { "epoch": 0.600908746686861, "grad_norm": 14.72805404663086, "learning_rate": 6.973736886232545e-05, "loss": 2.2171, "step": 1587 }, { "epoch": 0.6012873911397199, "grad_norm": 13.182044982910156, "learning_rate": 6.962314869297261e-05, "loss": 1.3178, "step": 1588 }, { "epoch": 0.6016660355925786, "grad_norm": 11.262931823730469, "learning_rate": 6.950897216599778e-05, "loss": 1.7268, "step": 1589 }, { "epoch": 0.6020446800454373, "grad_norm": 12.56017017364502, "learning_rate": 6.939483944543826e-05, "loss": 1.6671, "step": 1590 }, { "epoch": 0.602423324498296, "grad_norm": 14.90925407409668, "learning_rate": 6.928075069526833e-05, "loss": 1.8046, "step": 1591 }, { "epoch": 0.6028019689511549, "grad_norm": 11.18519401550293, "learning_rate": 6.916670607939914e-05, "loss": 1.1406, "step": 1592 }, { "epoch": 0.6031806134040136, "grad_norm": 17.620182037353516, "learning_rate": 6.905270576167839e-05, "loss": 1.4821, "step": 1593 }, { "epoch": 0.6035592578568724, "grad_norm": 12.173837661743164, "learning_rate": 6.893874990589023e-05, "loss": 0.7965, "step": 1594 }, { "epoch": 0.6039379023097312, "grad_norm": 15.003965377807617, "learning_rate": 6.882483867575484e-05, "loss": 1.2245, "step": 1595 }, { "epoch": 0.60431654676259, "grad_norm": 18.290672302246094, "learning_rate": 6.871097223492833e-05, "loss": 1.2314, "step": 1596 }, { "epoch": 0.6046951912154487, "grad_norm": 16.927207946777344, "learning_rate": 6.859715074700239e-05, "loss": 0.8744, "step": 1597 }, { "epoch": 0.6050738356683074, "grad_norm": 15.662732124328613, "learning_rate": 6.848337437550427e-05, "loss": 1.2892, "step": 1598 }, { "epoch": 0.6054524801211663, "grad_norm": 18.691028594970703, "learning_rate": 6.83696432838963e-05, "loss": 0.9354, "step": 1599 }, { "epoch": 0.605831124574025, "grad_norm": 24.3162899017334, "learning_rate": 6.825595763557573e-05, "loss": 0.8494, "step": 1600 }, { "epoch": 0.6062097690268837, "grad_norm": 9.813689231872559, "learning_rate": 6.814231759387457e-05, "loss": 2.7551, "step": 1601 }, { "epoch": 0.6065884134797426, "grad_norm": 11.249338150024414, "learning_rate": 6.802872332205936e-05, "loss": 3.2319, "step": 1602 }, { "epoch": 0.6069670579326013, "grad_norm": 12.754864692687988, "learning_rate": 6.791517498333079e-05, "loss": 3.7526, "step": 1603 }, { "epoch": 0.60734570238546, "grad_norm": 11.166112899780273, "learning_rate": 6.780167274082359e-05, "loss": 2.138, "step": 1604 }, { "epoch": 0.6077243468383188, "grad_norm": 11.834754943847656, "learning_rate": 6.768821675760626e-05, "loss": 2.2302, "step": 1605 }, { "epoch": 0.6081029912911776, "grad_norm": 11.900190353393555, "learning_rate": 6.757480719668086e-05, "loss": 1.8719, "step": 1606 }, { "epoch": 0.6084816357440364, "grad_norm": 11.648653984069824, "learning_rate": 6.746144422098272e-05, "loss": 2.31, "step": 1607 }, { "epoch": 0.6088602801968951, "grad_norm": 12.013163566589355, "learning_rate": 6.734812799338028e-05, "loss": 1.8018, "step": 1608 }, { "epoch": 0.6092389246497539, "grad_norm": 12.163000106811523, "learning_rate": 6.72348586766748e-05, "loss": 1.6066, "step": 1609 }, { "epoch": 0.6096175691026127, "grad_norm": 12.44361686706543, "learning_rate": 6.712163643360014e-05, "loss": 1.7852, "step": 1610 }, { "epoch": 0.6099962135554714, "grad_norm": 12.513566017150879, "learning_rate": 6.700846142682254e-05, "loss": 1.334, "step": 1611 }, { "epoch": 0.6103748580083301, "grad_norm": 10.970904350280762, "learning_rate": 6.689533381894035e-05, "loss": 1.3563, "step": 1612 }, { "epoch": 0.610753502461189, "grad_norm": 14.621386528015137, "learning_rate": 6.678225377248383e-05, "loss": 1.8224, "step": 1613 }, { "epoch": 0.6111321469140477, "grad_norm": 15.46141529083252, "learning_rate": 6.666922144991496e-05, "loss": 2.9525, "step": 1614 }, { "epoch": 0.6115107913669064, "grad_norm": 10.950165748596191, "learning_rate": 6.655623701362709e-05, "loss": 1.0231, "step": 1615 }, { "epoch": 0.6118894358197653, "grad_norm": 15.33579158782959, "learning_rate": 6.644330062594479e-05, "loss": 2.0181, "step": 1616 }, { "epoch": 0.612268080272624, "grad_norm": 15.245741844177246, "learning_rate": 6.633041244912357e-05, "loss": 1.9906, "step": 1617 }, { "epoch": 0.6126467247254828, "grad_norm": 11.654925346374512, "learning_rate": 6.621757264534978e-05, "loss": 0.9377, "step": 1618 }, { "epoch": 0.6130253691783415, "grad_norm": 13.580925941467285, "learning_rate": 6.610478137674018e-05, "loss": 1.2033, "step": 1619 }, { "epoch": 0.6134040136312003, "grad_norm": 14.483481407165527, "learning_rate": 6.59920388053418e-05, "loss": 1.0346, "step": 1620 }, { "epoch": 0.6137826580840591, "grad_norm": 17.45773696899414, "learning_rate": 6.58793450931317e-05, "loss": 1.4839, "step": 1621 }, { "epoch": 0.6141613025369178, "grad_norm": 24.019071578979492, "learning_rate": 6.576670040201685e-05, "loss": 1.1557, "step": 1622 }, { "epoch": 0.6145399469897767, "grad_norm": 17.134916305541992, "learning_rate": 6.565410489383368e-05, "loss": 1.7296, "step": 1623 }, { "epoch": 0.6149185914426354, "grad_norm": 22.194942474365234, "learning_rate": 6.554155873034797e-05, "loss": 1.4728, "step": 1624 }, { "epoch": 0.6152972358954941, "grad_norm": 11.970625877380371, "learning_rate": 6.542906207325463e-05, "loss": 0.5581, "step": 1625 }, { "epoch": 0.6156758803483529, "grad_norm": 10.641767501831055, "learning_rate": 6.531661508417748e-05, "loss": 2.8425, "step": 1626 }, { "epoch": 0.6160545248012117, "grad_norm": 9.416507720947266, "learning_rate": 6.520421792466893e-05, "loss": 2.2395, "step": 1627 }, { "epoch": 0.6164331692540704, "grad_norm": 10.790740013122559, "learning_rate": 6.509187075620982e-05, "loss": 1.7253, "step": 1628 }, { "epoch": 0.6168118137069292, "grad_norm": 11.30124568939209, "learning_rate": 6.497957374020911e-05, "loss": 2.2069, "step": 1629 }, { "epoch": 0.617190458159788, "grad_norm": 10.714532852172852, "learning_rate": 6.486732703800383e-05, "loss": 1.4562, "step": 1630 }, { "epoch": 0.6175691026126467, "grad_norm": 10.571722984313965, "learning_rate": 6.475513081085864e-05, "loss": 1.9811, "step": 1631 }, { "epoch": 0.6179477470655055, "grad_norm": 15.255899429321289, "learning_rate": 6.464298521996565e-05, "loss": 2.4763, "step": 1632 }, { "epoch": 0.6183263915183642, "grad_norm": 15.959975242614746, "learning_rate": 6.45308904264443e-05, "loss": 2.0421, "step": 1633 }, { "epoch": 0.6187050359712231, "grad_norm": 13.930294036865234, "learning_rate": 6.441884659134104e-05, "loss": 2.191, "step": 1634 }, { "epoch": 0.6190836804240818, "grad_norm": 11.18117618560791, "learning_rate": 6.430685387562905e-05, "loss": 1.9175, "step": 1635 }, { "epoch": 0.6194623248769405, "grad_norm": 14.037912368774414, "learning_rate": 6.419491244020812e-05, "loss": 2.3341, "step": 1636 }, { "epoch": 0.6198409693297994, "grad_norm": 11.853575706481934, "learning_rate": 6.40830224459043e-05, "loss": 1.4432, "step": 1637 }, { "epoch": 0.6202196137826581, "grad_norm": 11.784163475036621, "learning_rate": 6.397118405346984e-05, "loss": 1.6225, "step": 1638 }, { "epoch": 0.6205982582355168, "grad_norm": 14.302020072937012, "learning_rate": 6.38593974235828e-05, "loss": 2.014, "step": 1639 }, { "epoch": 0.6209769026883756, "grad_norm": 12.568229675292969, "learning_rate": 6.374766271684685e-05, "loss": 1.5157, "step": 1640 }, { "epoch": 0.6213555471412344, "grad_norm": 15.189311027526855, "learning_rate": 6.363598009379102e-05, "loss": 1.5156, "step": 1641 }, { "epoch": 0.6217341915940932, "grad_norm": 14.21696662902832, "learning_rate": 6.352434971486966e-05, "loss": 1.7828, "step": 1642 }, { "epoch": 0.6221128360469519, "grad_norm": 12.216951370239258, "learning_rate": 6.341277174046196e-05, "loss": 1.2994, "step": 1643 }, { "epoch": 0.6224914804998106, "grad_norm": 15.08211612701416, "learning_rate": 6.330124633087179e-05, "loss": 1.5579, "step": 1644 }, { "epoch": 0.6228701249526695, "grad_norm": 10.246252059936523, "learning_rate": 6.318977364632756e-05, "loss": 1.0659, "step": 1645 }, { "epoch": 0.6232487694055282, "grad_norm": 16.210229873657227, "learning_rate": 6.307835384698194e-05, "loss": 1.1103, "step": 1646 }, { "epoch": 0.6236274138583869, "grad_norm": 10.36191463470459, "learning_rate": 6.296698709291158e-05, "loss": 0.7741, "step": 1647 }, { "epoch": 0.6240060583112458, "grad_norm": 18.22682762145996, "learning_rate": 6.285567354411692e-05, "loss": 1.0443, "step": 1648 }, { "epoch": 0.6243847027641045, "grad_norm": 13.446081161499023, "learning_rate": 6.274441336052195e-05, "loss": 0.5116, "step": 1649 }, { "epoch": 0.6247633472169632, "grad_norm": 13.318770408630371, "learning_rate": 6.263320670197407e-05, "loss": 1.0974, "step": 1650 }, { "epoch": 0.625141991669822, "grad_norm": 9.985925674438477, "learning_rate": 6.25220537282437e-05, "loss": 3.212, "step": 1651 }, { "epoch": 0.6255206361226808, "grad_norm": 10.251411437988281, "learning_rate": 6.241095459902416e-05, "loss": 2.7164, "step": 1652 }, { "epoch": 0.6258992805755396, "grad_norm": 11.226200103759766, "learning_rate": 6.229990947393138e-05, "loss": 2.2586, "step": 1653 }, { "epoch": 0.6262779250283983, "grad_norm": 11.405097007751465, "learning_rate": 6.218891851250376e-05, "loss": 2.1889, "step": 1654 }, { "epoch": 0.6266565694812571, "grad_norm": 10.564579010009766, "learning_rate": 6.207798187420188e-05, "loss": 1.8469, "step": 1655 }, { "epoch": 0.6270352139341159, "grad_norm": 12.711569786071777, "learning_rate": 6.196709971840814e-05, "loss": 1.7636, "step": 1656 }, { "epoch": 0.6274138583869746, "grad_norm": 12.631609916687012, "learning_rate": 6.185627220442688e-05, "loss": 1.7316, "step": 1657 }, { "epoch": 0.6277925028398333, "grad_norm": 13.21821403503418, "learning_rate": 6.17454994914838e-05, "loss": 2.1718, "step": 1658 }, { "epoch": 0.6281711472926922, "grad_norm": 15.37727165222168, "learning_rate": 6.163478173872588e-05, "loss": 1.8102, "step": 1659 }, { "epoch": 0.6285497917455509, "grad_norm": 12.71149730682373, "learning_rate": 6.152411910522117e-05, "loss": 2.1751, "step": 1660 }, { "epoch": 0.6289284361984097, "grad_norm": 14.922344207763672, "learning_rate": 6.141351174995844e-05, "loss": 1.968, "step": 1661 }, { "epoch": 0.6293070806512685, "grad_norm": 15.136613845825195, "learning_rate": 6.130295983184724e-05, "loss": 1.6706, "step": 1662 }, { "epoch": 0.6296857251041272, "grad_norm": 12.078071594238281, "learning_rate": 6.119246350971728e-05, "loss": 1.5198, "step": 1663 }, { "epoch": 0.630064369556986, "grad_norm": 15.14970874786377, "learning_rate": 6.108202294231848e-05, "loss": 1.4467, "step": 1664 }, { "epoch": 0.6304430140098447, "grad_norm": 10.578989028930664, "learning_rate": 6.09716382883206e-05, "loss": 1.1749, "step": 1665 }, { "epoch": 0.6308216584627035, "grad_norm": 11.3906888961792, "learning_rate": 6.0861309706313186e-05, "loss": 0.8337, "step": 1666 }, { "epoch": 0.6312003029155623, "grad_norm": 13.095053672790527, "learning_rate": 6.0751037354805116e-05, "loss": 1.2113, "step": 1667 }, { "epoch": 0.631578947368421, "grad_norm": 12.179159164428711, "learning_rate": 6.064082139222451e-05, "loss": 1.1175, "step": 1668 }, { "epoch": 0.6319575918212799, "grad_norm": 10.358685493469238, "learning_rate": 6.0530661976918436e-05, "loss": 0.8094, "step": 1669 }, { "epoch": 0.6323362362741386, "grad_norm": 14.990509986877441, "learning_rate": 6.042055926715287e-05, "loss": 1.3502, "step": 1670 }, { "epoch": 0.6327148807269973, "grad_norm": 14.088288307189941, "learning_rate": 6.031051342111216e-05, "loss": 0.928, "step": 1671 }, { "epoch": 0.6330935251798561, "grad_norm": 18.399137496948242, "learning_rate": 6.0200524596899e-05, "loss": 1.0594, "step": 1672 }, { "epoch": 0.6334721696327149, "grad_norm": 15.077046394348145, "learning_rate": 6.009059295253414e-05, "loss": 0.8398, "step": 1673 }, { "epoch": 0.6338508140855736, "grad_norm": 17.941368103027344, "learning_rate": 5.998071864595631e-05, "loss": 0.9831, "step": 1674 }, { "epoch": 0.6342294585384324, "grad_norm": 31.952571868896484, "learning_rate": 5.987090183502173e-05, "loss": 1.3572, "step": 1675 }, { "epoch": 0.6346081029912912, "grad_norm": 9.207669258117676, "learning_rate": 5.976114267750402e-05, "loss": 2.6796, "step": 1676 }, { "epoch": 0.63498674744415, "grad_norm": 14.74022102355957, "learning_rate": 5.965144133109401e-05, "loss": 1.9448, "step": 1677 }, { "epoch": 0.6353653918970087, "grad_norm": 9.095487594604492, "learning_rate": 5.9541797953399494e-05, "loss": 1.4069, "step": 1678 }, { "epoch": 0.6357440363498674, "grad_norm": 13.430038452148438, "learning_rate": 5.943221270194492e-05, "loss": 1.8898, "step": 1679 }, { "epoch": 0.6361226808027263, "grad_norm": 11.262763977050781, "learning_rate": 5.9322685734171254e-05, "loss": 1.3069, "step": 1680 }, { "epoch": 0.636501325255585, "grad_norm": 12.138426780700684, "learning_rate": 5.921321720743576e-05, "loss": 1.6482, "step": 1681 }, { "epoch": 0.6368799697084437, "grad_norm": 13.413007736206055, "learning_rate": 5.9103807279011725e-05, "loss": 1.4074, "step": 1682 }, { "epoch": 0.6372586141613026, "grad_norm": 12.61064338684082, "learning_rate": 5.899445610608819e-05, "loss": 1.7473, "step": 1683 }, { "epoch": 0.6376372586141613, "grad_norm": 12.261919975280762, "learning_rate": 5.8885163845769854e-05, "loss": 1.6637, "step": 1684 }, { "epoch": 0.63801590306702, "grad_norm": 11.287856101989746, "learning_rate": 5.8775930655076704e-05, "loss": 1.2522, "step": 1685 }, { "epoch": 0.6383945475198788, "grad_norm": 12.25338363647461, "learning_rate": 5.866675669094398e-05, "loss": 1.5052, "step": 1686 }, { "epoch": 0.6387731919727376, "grad_norm": 15.127142906188965, "learning_rate": 5.855764211022172e-05, "loss": 1.5966, "step": 1687 }, { "epoch": 0.6391518364255964, "grad_norm": 10.839902877807617, "learning_rate": 5.8448587069674666e-05, "loss": 1.1426, "step": 1688 }, { "epoch": 0.6395304808784551, "grad_norm": 10.834675788879395, "learning_rate": 5.833959172598202e-05, "loss": 1.323, "step": 1689 }, { "epoch": 0.6399091253313139, "grad_norm": 10.989094734191895, "learning_rate": 5.823065623573731e-05, "loss": 1.0836, "step": 1690 }, { "epoch": 0.6402877697841727, "grad_norm": 12.263811111450195, "learning_rate": 5.8121780755447966e-05, "loss": 1.0887, "step": 1691 }, { "epoch": 0.6406664142370314, "grad_norm": 13.519360542297363, "learning_rate": 5.8012965441535195e-05, "loss": 1.9619, "step": 1692 }, { "epoch": 0.6410450586898901, "grad_norm": 13.868734359741211, "learning_rate": 5.790421045033378e-05, "loss": 1.3938, "step": 1693 }, { "epoch": 0.641423703142749, "grad_norm": 14.806131362915039, "learning_rate": 5.779551593809196e-05, "loss": 0.9819, "step": 1694 }, { "epoch": 0.6418023475956077, "grad_norm": 10.446589469909668, "learning_rate": 5.768688206097092e-05, "loss": 1.0251, "step": 1695 }, { "epoch": 0.6421809920484665, "grad_norm": 9.428298950195312, "learning_rate": 5.757830897504479e-05, "loss": 0.6983, "step": 1696 }, { "epoch": 0.6425596365013253, "grad_norm": 12.95252799987793, "learning_rate": 5.746979683630033e-05, "loss": 0.9792, "step": 1697 }, { "epoch": 0.642938280954184, "grad_norm": 11.873827934265137, "learning_rate": 5.736134580063686e-05, "loss": 0.5997, "step": 1698 }, { "epoch": 0.6433169254070428, "grad_norm": 16.311582565307617, "learning_rate": 5.725295602386576e-05, "loss": 0.6361, "step": 1699 }, { "epoch": 0.6436955698599015, "grad_norm": 15.930671691894531, "learning_rate": 5.7144627661710496e-05, "loss": 0.5375, "step": 1700 }, { "epoch": 0.6440742143127604, "grad_norm": 9.91962718963623, "learning_rate": 5.7036360869806206e-05, "loss": 2.7529, "step": 1701 }, { "epoch": 0.6444528587656191, "grad_norm": 11.140485763549805, "learning_rate": 5.692815580369972e-05, "loss": 2.7474, "step": 1702 }, { "epoch": 0.6448315032184778, "grad_norm": 13.083551406860352, "learning_rate": 5.682001261884906e-05, "loss": 3.4048, "step": 1703 }, { "epoch": 0.6452101476713367, "grad_norm": 11.796628952026367, "learning_rate": 5.671193147062339e-05, "loss": 2.2006, "step": 1704 }, { "epoch": 0.6455887921241954, "grad_norm": 13.825222969055176, "learning_rate": 5.660391251430268e-05, "loss": 2.5323, "step": 1705 }, { "epoch": 0.6459674365770541, "grad_norm": 11.869791030883789, "learning_rate": 5.64959559050777e-05, "loss": 1.6295, "step": 1706 }, { "epoch": 0.6463460810299129, "grad_norm": 12.862152099609375, "learning_rate": 5.6388061798049516e-05, "loss": 1.5773, "step": 1707 }, { "epoch": 0.6467247254827717, "grad_norm": 12.500361442565918, "learning_rate": 5.6280230348229426e-05, "loss": 1.5375, "step": 1708 }, { "epoch": 0.6471033699356304, "grad_norm": 11.606138229370117, "learning_rate": 5.617246171053867e-05, "loss": 1.8171, "step": 1709 }, { "epoch": 0.6474820143884892, "grad_norm": 12.76290225982666, "learning_rate": 5.60647560398084e-05, "loss": 1.9527, "step": 1710 }, { "epoch": 0.647860658841348, "grad_norm": 13.754075050354004, "learning_rate": 5.5957113490779125e-05, "loss": 1.4713, "step": 1711 }, { "epoch": 0.6482393032942068, "grad_norm": 10.187721252441406, "learning_rate": 5.584953421810075e-05, "loss": 1.1924, "step": 1712 }, { "epoch": 0.6486179477470655, "grad_norm": 12.58003044128418, "learning_rate": 5.574201837633226e-05, "loss": 1.4034, "step": 1713 }, { "epoch": 0.6489965921999242, "grad_norm": 11.447388648986816, "learning_rate": 5.5634566119941523e-05, "loss": 1.379, "step": 1714 }, { "epoch": 0.6493752366527831, "grad_norm": 12.835458755493164, "learning_rate": 5.5527177603305013e-05, "loss": 1.4084, "step": 1715 }, { "epoch": 0.6497538811056418, "grad_norm": 13.401217460632324, "learning_rate": 5.541985298070763e-05, "loss": 1.4964, "step": 1716 }, { "epoch": 0.6501325255585005, "grad_norm": 11.862832069396973, "learning_rate": 5.531259240634259e-05, "loss": 0.8745, "step": 1717 }, { "epoch": 0.6505111700113594, "grad_norm": 13.94647216796875, "learning_rate": 5.520539603431094e-05, "loss": 1.2798, "step": 1718 }, { "epoch": 0.6508898144642181, "grad_norm": 15.898548126220703, "learning_rate": 5.509826401862158e-05, "loss": 1.4354, "step": 1719 }, { "epoch": 0.6512684589170769, "grad_norm": 10.587430953979492, "learning_rate": 5.49911965131909e-05, "loss": 0.9296, "step": 1720 }, { "epoch": 0.6516471033699356, "grad_norm": 15.46255111694336, "learning_rate": 5.4884193671842606e-05, "loss": 0.8751, "step": 1721 }, { "epoch": 0.6520257478227944, "grad_norm": 13.906989097595215, "learning_rate": 5.477725564830758e-05, "loss": 0.8695, "step": 1722 }, { "epoch": 0.6524043922756532, "grad_norm": 15.492587089538574, "learning_rate": 5.467038259622351e-05, "loss": 1.0782, "step": 1723 }, { "epoch": 0.6527830367285119, "grad_norm": 10.68310546875, "learning_rate": 5.4563574669134754e-05, "loss": 0.4139, "step": 1724 }, { "epoch": 0.6531616811813707, "grad_norm": 21.091798782348633, "learning_rate": 5.4456832020492035e-05, "loss": 1.0104, "step": 1725 }, { "epoch": 0.6535403256342295, "grad_norm": 10.046089172363281, "learning_rate": 5.435015480365247e-05, "loss": 2.6476, "step": 1726 }, { "epoch": 0.6539189700870882, "grad_norm": 10.421363830566406, "learning_rate": 5.4243543171879005e-05, "loss": 1.6091, "step": 1727 }, { "epoch": 0.654297614539947, "grad_norm": 11.276755332946777, "learning_rate": 5.413699727834044e-05, "loss": 2.2515, "step": 1728 }, { "epoch": 0.6546762589928058, "grad_norm": 12.087563514709473, "learning_rate": 5.403051727611104e-05, "loss": 1.816, "step": 1729 }, { "epoch": 0.6550549034456645, "grad_norm": 12.18307876586914, "learning_rate": 5.392410331817055e-05, "loss": 1.6672, "step": 1730 }, { "epoch": 0.6554335478985233, "grad_norm": 12.103124618530273, "learning_rate": 5.3817755557403714e-05, "loss": 1.3725, "step": 1731 }, { "epoch": 0.6558121923513821, "grad_norm": 11.449660301208496, "learning_rate": 5.3711474146600225e-05, "loss": 1.243, "step": 1732 }, { "epoch": 0.6561908368042408, "grad_norm": 10.194807052612305, "learning_rate": 5.3605259238454365e-05, "loss": 1.7381, "step": 1733 }, { "epoch": 0.6565694812570996, "grad_norm": 13.64271354675293, "learning_rate": 5.3499110985565014e-05, "loss": 1.7795, "step": 1734 }, { "epoch": 0.6569481257099583, "grad_norm": 12.527607917785645, "learning_rate": 5.339302954043519e-05, "loss": 1.8996, "step": 1735 }, { "epoch": 0.6573267701628172, "grad_norm": 15.393874168395996, "learning_rate": 5.328701505547196e-05, "loss": 1.8334, "step": 1736 }, { "epoch": 0.6577054146156759, "grad_norm": 14.819316864013672, "learning_rate": 5.3181067682986106e-05, "loss": 2.2629, "step": 1737 }, { "epoch": 0.6580840590685346, "grad_norm": 11.10079288482666, "learning_rate": 5.3075187575192164e-05, "loss": 0.888, "step": 1738 }, { "epoch": 0.6584627035213935, "grad_norm": 13.662109375, "learning_rate": 5.29693748842079e-05, "loss": 1.2014, "step": 1739 }, { "epoch": 0.6588413479742522, "grad_norm": 12.86874008178711, "learning_rate": 5.286362976205424e-05, "loss": 1.3894, "step": 1740 }, { "epoch": 0.6592199924271109, "grad_norm": 11.5730562210083, "learning_rate": 5.275795236065501e-05, "loss": 1.5128, "step": 1741 }, { "epoch": 0.6595986368799697, "grad_norm": 13.506361961364746, "learning_rate": 5.2652342831836857e-05, "loss": 1.1636, "step": 1742 }, { "epoch": 0.6599772813328285, "grad_norm": 12.426424026489258, "learning_rate": 5.254680132732879e-05, "loss": 1.0925, "step": 1743 }, { "epoch": 0.6603559257856872, "grad_norm": 19.900741577148438, "learning_rate": 5.244132799876216e-05, "loss": 0.7747, "step": 1744 }, { "epoch": 0.660734570238546, "grad_norm": 12.254927635192871, "learning_rate": 5.233592299767027e-05, "loss": 1.2514, "step": 1745 }, { "epoch": 0.6611132146914047, "grad_norm": 12.052815437316895, "learning_rate": 5.223058647548843e-05, "loss": 0.524, "step": 1746 }, { "epoch": 0.6614918591442636, "grad_norm": 13.00101375579834, "learning_rate": 5.212531858355343e-05, "loss": 0.6209, "step": 1747 }, { "epoch": 0.6618705035971223, "grad_norm": 18.25760269165039, "learning_rate": 5.20201194731035e-05, "loss": 1.6117, "step": 1748 }, { "epoch": 0.662249148049981, "grad_norm": 20.45023536682129, "learning_rate": 5.1914989295278006e-05, "loss": 1.3157, "step": 1749 }, { "epoch": 0.6626277925028399, "grad_norm": 16.092721939086914, "learning_rate": 5.1809928201117385e-05, "loss": 1.1807, "step": 1750 }, { "epoch": 0.6630064369556986, "grad_norm": 11.557684898376465, "learning_rate": 5.170493634156275e-05, "loss": 2.9544, "step": 1751 }, { "epoch": 0.6633850814085573, "grad_norm": 9.875344276428223, "learning_rate": 5.160001386745572e-05, "loss": 2.0698, "step": 1752 }, { "epoch": 0.6637637258614161, "grad_norm": 10.10754108428955, "learning_rate": 5.149516092953823e-05, "loss": 1.5838, "step": 1753 }, { "epoch": 0.6641423703142749, "grad_norm": 11.273519515991211, "learning_rate": 5.139037767845244e-05, "loss": 2.0788, "step": 1754 }, { "epoch": 0.6645210147671337, "grad_norm": 13.150516510009766, "learning_rate": 5.128566426474024e-05, "loss": 1.8676, "step": 1755 }, { "epoch": 0.6648996592199924, "grad_norm": 11.240986824035645, "learning_rate": 5.118102083884324e-05, "loss": 1.9268, "step": 1756 }, { "epoch": 0.6652783036728512, "grad_norm": 10.94487476348877, "learning_rate": 5.1076447551102505e-05, "loss": 1.2648, "step": 1757 }, { "epoch": 0.66565694812571, "grad_norm": 10.50646686553955, "learning_rate": 5.0971944551758264e-05, "loss": 1.3279, "step": 1758 }, { "epoch": 0.6660355925785687, "grad_norm": 15.379045486450195, "learning_rate": 5.086751199094992e-05, "loss": 1.4228, "step": 1759 }, { "epoch": 0.6664142370314274, "grad_norm": 11.095296859741211, "learning_rate": 5.0763150018715544e-05, "loss": 1.226, "step": 1760 }, { "epoch": 0.6667928814842863, "grad_norm": 12.928812026977539, "learning_rate": 5.065885878499184e-05, "loss": 1.7798, "step": 1761 }, { "epoch": 0.667171525937145, "grad_norm": 11.685075759887695, "learning_rate": 5.0554638439613836e-05, "loss": 1.6079, "step": 1762 }, { "epoch": 0.6675501703900037, "grad_norm": 10.985690116882324, "learning_rate": 5.0450489132314784e-05, "loss": 0.7803, "step": 1763 }, { "epoch": 0.6679288148428626, "grad_norm": 11.910584449768066, "learning_rate": 5.034641101272579e-05, "loss": 1.1839, "step": 1764 }, { "epoch": 0.6683074592957213, "grad_norm": 9.787635803222656, "learning_rate": 5.024240423037581e-05, "loss": 0.9764, "step": 1765 }, { "epoch": 0.6686861037485801, "grad_norm": 13.238972663879395, "learning_rate": 5.013846893469121e-05, "loss": 1.3853, "step": 1766 }, { "epoch": 0.6690647482014388, "grad_norm": 11.12163257598877, "learning_rate": 5.003460527499566e-05, "loss": 0.9702, "step": 1767 }, { "epoch": 0.6694433926542976, "grad_norm": 20.462934494018555, "learning_rate": 4.99308134005099e-05, "loss": 1.2393, "step": 1768 }, { "epoch": 0.6698220371071564, "grad_norm": 11.374852180480957, "learning_rate": 4.982709346035165e-05, "loss": 0.756, "step": 1769 }, { "epoch": 0.6702006815600151, "grad_norm": 18.068233489990234, "learning_rate": 4.9723445603535155e-05, "loss": 1.116, "step": 1770 }, { "epoch": 0.670579326012874, "grad_norm": 13.328974723815918, "learning_rate": 4.9619869978971133e-05, "loss": 0.8217, "step": 1771 }, { "epoch": 0.6709579704657327, "grad_norm": 14.381114959716797, "learning_rate": 4.9516366735466503e-05, "loss": 0.5854, "step": 1772 }, { "epoch": 0.6713366149185914, "grad_norm": 23.591033935546875, "learning_rate": 4.941293602172429e-05, "loss": 1.5894, "step": 1773 }, { "epoch": 0.6717152593714502, "grad_norm": 14.45864486694336, "learning_rate": 4.930957798634321e-05, "loss": 1.031, "step": 1774 }, { "epoch": 0.672093903824309, "grad_norm": 29.705875396728516, "learning_rate": 4.920629277781762e-05, "loss": 1.9322, "step": 1775 }, { "epoch": 0.6724725482771677, "grad_norm": 11.356168746948242, "learning_rate": 4.910308054453717e-05, "loss": 3.2243, "step": 1776 }, { "epoch": 0.6728511927300265, "grad_norm": 12.019675254821777, "learning_rate": 4.899994143478682e-05, "loss": 2.687, "step": 1777 }, { "epoch": 0.6732298371828853, "grad_norm": 10.62689208984375, "learning_rate": 4.889687559674634e-05, "loss": 1.5605, "step": 1778 }, { "epoch": 0.673608481635744, "grad_norm": 13.42601490020752, "learning_rate": 4.879388317849025e-05, "loss": 2.2713, "step": 1779 }, { "epoch": 0.6739871260886028, "grad_norm": 11.323573112487793, "learning_rate": 4.8690964327987576e-05, "loss": 1.7842, "step": 1780 }, { "epoch": 0.6743657705414615, "grad_norm": 10.211767196655273, "learning_rate": 4.858811919310177e-05, "loss": 1.4646, "step": 1781 }, { "epoch": 0.6747444149943204, "grad_norm": 9.475430488586426, "learning_rate": 4.848534792159024e-05, "loss": 1.0703, "step": 1782 }, { "epoch": 0.6751230594471791, "grad_norm": 10.751679420471191, "learning_rate": 4.8382650661104326e-05, "loss": 1.0614, "step": 1783 }, { "epoch": 0.6755017039000378, "grad_norm": 11.262096405029297, "learning_rate": 4.828002755918898e-05, "loss": 1.5222, "step": 1784 }, { "epoch": 0.6758803483528967, "grad_norm": 10.757667541503906, "learning_rate": 4.817747876328276e-05, "loss": 1.4398, "step": 1785 }, { "epoch": 0.6762589928057554, "grad_norm": 10.265283584594727, "learning_rate": 4.8075004420717315e-05, "loss": 1.2355, "step": 1786 }, { "epoch": 0.6766376372586141, "grad_norm": 10.750070571899414, "learning_rate": 4.7972604678717404e-05, "loss": 1.4494, "step": 1787 }, { "epoch": 0.6770162817114729, "grad_norm": 10.770772933959961, "learning_rate": 4.787027968440053e-05, "loss": 0.9969, "step": 1788 }, { "epoch": 0.6773949261643317, "grad_norm": 14.513525009155273, "learning_rate": 4.776802958477695e-05, "loss": 1.365, "step": 1789 }, { "epoch": 0.6777735706171905, "grad_norm": 13.11719036102295, "learning_rate": 4.76658545267492e-05, "loss": 0.9921, "step": 1790 }, { "epoch": 0.6781522150700492, "grad_norm": 12.227423667907715, "learning_rate": 4.7563754657112014e-05, "loss": 1.0625, "step": 1791 }, { "epoch": 0.678530859522908, "grad_norm": 12.794466018676758, "learning_rate": 4.746173012255212e-05, "loss": 1.0499, "step": 1792 }, { "epoch": 0.6789095039757668, "grad_norm": 12.062575340270996, "learning_rate": 4.7359781069648065e-05, "loss": 0.955, "step": 1793 }, { "epoch": 0.6792881484286255, "grad_norm": 14.67883014678955, "learning_rate": 4.725790764486988e-05, "loss": 1.0038, "step": 1794 }, { "epoch": 0.6796667928814842, "grad_norm": 12.12856674194336, "learning_rate": 4.715610999457898e-05, "loss": 0.9497, "step": 1795 }, { "epoch": 0.6800454373343431, "grad_norm": 14.579923629760742, "learning_rate": 4.7054388265027836e-05, "loss": 0.9125, "step": 1796 }, { "epoch": 0.6804240817872018, "grad_norm": 14.618803977966309, "learning_rate": 4.695274260236e-05, "loss": 1.2214, "step": 1797 }, { "epoch": 0.6808027262400606, "grad_norm": 14.874523162841797, "learning_rate": 4.68511731526096e-05, "loss": 0.9044, "step": 1798 }, { "epoch": 0.6811813706929194, "grad_norm": 21.41942596435547, "learning_rate": 4.674968006170134e-05, "loss": 0.8181, "step": 1799 }, { "epoch": 0.6815600151457781, "grad_norm": 14.437376976013184, "learning_rate": 4.664826347545013e-05, "loss": 0.6449, "step": 1800 }, { "epoch": 0.6819386595986369, "grad_norm": 9.211664199829102, "learning_rate": 4.6546923539561115e-05, "loss": 2.2155, "step": 1801 }, { "epoch": 0.6823173040514956, "grad_norm": 8.313687324523926, "learning_rate": 4.644566039962921e-05, "loss": 1.5871, "step": 1802 }, { "epoch": 0.6826959485043544, "grad_norm": 11.544638633728027, "learning_rate": 4.634447420113901e-05, "loss": 1.9184, "step": 1803 }, { "epoch": 0.6830745929572132, "grad_norm": 12.808171272277832, "learning_rate": 4.624336508946457e-05, "loss": 2.2559, "step": 1804 }, { "epoch": 0.6834532374100719, "grad_norm": 11.647665977478027, "learning_rate": 4.6142333209869215e-05, "loss": 1.8731, "step": 1805 }, { "epoch": 0.6838318818629308, "grad_norm": 10.710838317871094, "learning_rate": 4.6041378707505265e-05, "loss": 1.0663, "step": 1806 }, { "epoch": 0.6842105263157895, "grad_norm": 11.316000938415527, "learning_rate": 4.5940501727413966e-05, "loss": 1.2883, "step": 1807 }, { "epoch": 0.6845891707686482, "grad_norm": 10.989466667175293, "learning_rate": 4.583970241452511e-05, "loss": 1.2558, "step": 1808 }, { "epoch": 0.684967815221507, "grad_norm": 10.801972389221191, "learning_rate": 4.57389809136569e-05, "loss": 1.229, "step": 1809 }, { "epoch": 0.6853464596743658, "grad_norm": 15.042647361755371, "learning_rate": 4.563833736951581e-05, "loss": 1.2801, "step": 1810 }, { "epoch": 0.6857251041272245, "grad_norm": 11.570405006408691, "learning_rate": 4.553777192669622e-05, "loss": 1.2189, "step": 1811 }, { "epoch": 0.6861037485800833, "grad_norm": 10.531160354614258, "learning_rate": 4.543728472968035e-05, "loss": 1.022, "step": 1812 }, { "epoch": 0.6864823930329421, "grad_norm": 11.157527923583984, "learning_rate": 4.533687592283809e-05, "loss": 1.176, "step": 1813 }, { "epoch": 0.6868610374858009, "grad_norm": 12.036185264587402, "learning_rate": 4.523654565042657e-05, "loss": 0.7022, "step": 1814 }, { "epoch": 0.6872396819386596, "grad_norm": 10.491613388061523, "learning_rate": 4.513629405659014e-05, "loss": 0.8299, "step": 1815 }, { "epoch": 0.6876183263915183, "grad_norm": 11.52048110961914, "learning_rate": 4.503612128536012e-05, "loss": 1.1048, "step": 1816 }, { "epoch": 0.6879969708443772, "grad_norm": 12.521361351013184, "learning_rate": 4.493602748065463e-05, "loss": 0.7601, "step": 1817 }, { "epoch": 0.6883756152972359, "grad_norm": 10.226174354553223, "learning_rate": 4.483601278627825e-05, "loss": 1.0436, "step": 1818 }, { "epoch": 0.6887542597500946, "grad_norm": 13.399140357971191, "learning_rate": 4.4736077345921964e-05, "loss": 0.8807, "step": 1819 }, { "epoch": 0.6891329042029535, "grad_norm": 16.16910171508789, "learning_rate": 4.463622130316283e-05, "loss": 1.2152, "step": 1820 }, { "epoch": 0.6895115486558122, "grad_norm": 10.724682807922363, "learning_rate": 4.453644480146395e-05, "loss": 0.7913, "step": 1821 }, { "epoch": 0.689890193108671, "grad_norm": 14.903891563415527, "learning_rate": 4.443674798417404e-05, "loss": 1.0286, "step": 1822 }, { "epoch": 0.6902688375615297, "grad_norm": 19.939565658569336, "learning_rate": 4.433713099452738e-05, "loss": 1.2353, "step": 1823 }, { "epoch": 0.6906474820143885, "grad_norm": 20.52518081665039, "learning_rate": 4.423759397564352e-05, "loss": 0.4131, "step": 1824 }, { "epoch": 0.6910261264672473, "grad_norm": 19.928573608398438, "learning_rate": 4.413813707052721e-05, "loss": 1.0443, "step": 1825 }, { "epoch": 0.691404770920106, "grad_norm": 10.02829647064209, "learning_rate": 4.4038760422068006e-05, "loss": 2.9946, "step": 1826 }, { "epoch": 0.6917834153729648, "grad_norm": 10.289177894592285, "learning_rate": 4.3939464173040215e-05, "loss": 1.8126, "step": 1827 }, { "epoch": 0.6921620598258236, "grad_norm": 13.08002758026123, "learning_rate": 4.384024846610254e-05, "loss": 2.3466, "step": 1828 }, { "epoch": 0.6925407042786823, "grad_norm": 8.982109069824219, "learning_rate": 4.374111344379815e-05, "loss": 0.9872, "step": 1829 }, { "epoch": 0.692919348731541, "grad_norm": 11.387478828430176, "learning_rate": 4.3642059248554135e-05, "loss": 1.3572, "step": 1830 }, { "epoch": 0.6932979931843999, "grad_norm": 12.528159141540527, "learning_rate": 4.3543086022681525e-05, "loss": 1.605, "step": 1831 }, { "epoch": 0.6936766376372586, "grad_norm": 13.56595516204834, "learning_rate": 4.344419390837495e-05, "loss": 1.7875, "step": 1832 }, { "epoch": 0.6940552820901174, "grad_norm": 9.830058097839355, "learning_rate": 4.334538304771266e-05, "loss": 1.124, "step": 1833 }, { "epoch": 0.6944339265429762, "grad_norm": 11.811656951904297, "learning_rate": 4.3246653582656026e-05, "loss": 1.5946, "step": 1834 }, { "epoch": 0.6948125709958349, "grad_norm": 13.503132820129395, "learning_rate": 4.3148005655049536e-05, "loss": 1.4873, "step": 1835 }, { "epoch": 0.6951912154486937, "grad_norm": 11.535758972167969, "learning_rate": 4.3049439406620485e-05, "loss": 1.4229, "step": 1836 }, { "epoch": 0.6955698599015524, "grad_norm": 11.357640266418457, "learning_rate": 4.295095497897892e-05, "loss": 0.9196, "step": 1837 }, { "epoch": 0.6959485043544112, "grad_norm": 8.721981048583984, "learning_rate": 4.285255251361725e-05, "loss": 0.9061, "step": 1838 }, { "epoch": 0.69632714880727, "grad_norm": 13.046642303466797, "learning_rate": 4.2754232151910154e-05, "loss": 1.0874, "step": 1839 }, { "epoch": 0.6967057932601287, "grad_norm": 11.249588012695312, "learning_rate": 4.265599403511432e-05, "loss": 1.2721, "step": 1840 }, { "epoch": 0.6970844377129876, "grad_norm": 9.732141494750977, "learning_rate": 4.255783830436837e-05, "loss": 1.0042, "step": 1841 }, { "epoch": 0.6974630821658463, "grad_norm": 12.074952125549316, "learning_rate": 4.245976510069246e-05, "loss": 0.855, "step": 1842 }, { "epoch": 0.697841726618705, "grad_norm": 13.875428199768066, "learning_rate": 4.236177456498824e-05, "loss": 1.2586, "step": 1843 }, { "epoch": 0.6982203710715638, "grad_norm": 10.86148452758789, "learning_rate": 4.2263866838038515e-05, "loss": 0.8662, "step": 1844 }, { "epoch": 0.6985990155244226, "grad_norm": 12.373757362365723, "learning_rate": 4.216604206050724e-05, "loss": 1.3314, "step": 1845 }, { "epoch": 0.6989776599772813, "grad_norm": 17.46698760986328, "learning_rate": 4.2068300372939105e-05, "loss": 0.9795, "step": 1846 }, { "epoch": 0.6993563044301401, "grad_norm": 15.95254898071289, "learning_rate": 4.1970641915759466e-05, "loss": 0.7993, "step": 1847 }, { "epoch": 0.6997349488829989, "grad_norm": 9.590779304504395, "learning_rate": 4.187306682927402e-05, "loss": 0.478, "step": 1848 }, { "epoch": 0.7001135933358577, "grad_norm": 16.6131534576416, "learning_rate": 4.177557525366884e-05, "loss": 0.5367, "step": 1849 }, { "epoch": 0.7004922377887164, "grad_norm": 28.021787643432617, "learning_rate": 4.16781673290099e-05, "loss": 1.0093, "step": 1850 }, { "epoch": 0.7008708822415751, "grad_norm": 9.637764930725098, "learning_rate": 4.1580843195243016e-05, "loss": 2.675, "step": 1851 }, { "epoch": 0.701249526694434, "grad_norm": 9.948892593383789, "learning_rate": 4.1483602992193614e-05, "loss": 1.6966, "step": 1852 }, { "epoch": 0.7016281711472927, "grad_norm": 11.320772171020508, "learning_rate": 4.1386446859566575e-05, "loss": 1.7174, "step": 1853 }, { "epoch": 0.7020068156001514, "grad_norm": 10.746500015258789, "learning_rate": 4.1289374936945935e-05, "loss": 1.7516, "step": 1854 }, { "epoch": 0.7023854600530102, "grad_norm": 11.209330558776855, "learning_rate": 4.119238736379485e-05, "loss": 1.4297, "step": 1855 }, { "epoch": 0.702764104505869, "grad_norm": 12.015074729919434, "learning_rate": 4.1095484279455186e-05, "loss": 1.3255, "step": 1856 }, { "epoch": 0.7031427489587277, "grad_norm": 10.18040657043457, "learning_rate": 4.099866582314747e-05, "loss": 1.347, "step": 1857 }, { "epoch": 0.7035213934115865, "grad_norm": 12.753857612609863, "learning_rate": 4.0901932133970624e-05, "loss": 1.6483, "step": 1858 }, { "epoch": 0.7039000378644453, "grad_norm": 10.111617088317871, "learning_rate": 4.080528335090181e-05, "loss": 1.1496, "step": 1859 }, { "epoch": 0.7042786823173041, "grad_norm": 11.63160514831543, "learning_rate": 4.070871961279617e-05, "loss": 1.1372, "step": 1860 }, { "epoch": 0.7046573267701628, "grad_norm": 12.99141788482666, "learning_rate": 4.0612241058386736e-05, "loss": 1.0544, "step": 1861 }, { "epoch": 0.7050359712230215, "grad_norm": 10.836715698242188, "learning_rate": 4.0515847826284073e-05, "loss": 0.9002, "step": 1862 }, { "epoch": 0.7054146156758804, "grad_norm": 12.811058044433594, "learning_rate": 4.0419540054976203e-05, "loss": 1.2274, "step": 1863 }, { "epoch": 0.7057932601287391, "grad_norm": 11.749159812927246, "learning_rate": 4.032331788282833e-05, "loss": 1.1407, "step": 1864 }, { "epoch": 0.7061719045815978, "grad_norm": 16.195268630981445, "learning_rate": 4.0227181448082775e-05, "loss": 1.5495, "step": 1865 }, { "epoch": 0.7065505490344567, "grad_norm": 11.737068176269531, "learning_rate": 4.013113088885857e-05, "loss": 0.7468, "step": 1866 }, { "epoch": 0.7069291934873154, "grad_norm": 12.873053550720215, "learning_rate": 4.003516634315143e-05, "loss": 0.8903, "step": 1867 }, { "epoch": 0.7073078379401742, "grad_norm": 15.100240707397461, "learning_rate": 3.993928794883343e-05, "loss": 1.054, "step": 1868 }, { "epoch": 0.7076864823930329, "grad_norm": 11.190807342529297, "learning_rate": 3.9843495843652986e-05, "loss": 0.8378, "step": 1869 }, { "epoch": 0.7080651268458917, "grad_norm": 14.742902755737305, "learning_rate": 3.974779016523447e-05, "loss": 1.07, "step": 1870 }, { "epoch": 0.7084437712987505, "grad_norm": 14.68301010131836, "learning_rate": 3.965217105107806e-05, "loss": 0.6623, "step": 1871 }, { "epoch": 0.7088224157516092, "grad_norm": 19.6263484954834, "learning_rate": 3.955663863855956e-05, "loss": 1.1637, "step": 1872 }, { "epoch": 0.709201060204468, "grad_norm": 8.338847160339355, "learning_rate": 3.946119306493035e-05, "loss": 0.206, "step": 1873 }, { "epoch": 0.7095797046573268, "grad_norm": 12.545536994934082, "learning_rate": 3.9365834467316874e-05, "loss": 0.4798, "step": 1874 }, { "epoch": 0.7099583491101855, "grad_norm": 23.568941116333008, "learning_rate": 3.9270562982720726e-05, "loss": 0.8148, "step": 1875 }, { "epoch": 0.7103369935630442, "grad_norm": 9.232584953308105, "learning_rate": 3.917537874801824e-05, "loss": 1.8688, "step": 1876 }, { "epoch": 0.7107156380159031, "grad_norm": 11.553984642028809, "learning_rate": 3.908028189996057e-05, "loss": 1.9732, "step": 1877 }, { "epoch": 0.7110942824687618, "grad_norm": 11.267699241638184, "learning_rate": 3.898527257517316e-05, "loss": 1.8366, "step": 1878 }, { "epoch": 0.7114729269216206, "grad_norm": 12.230257034301758, "learning_rate": 3.889035091015577e-05, "loss": 1.6373, "step": 1879 }, { "epoch": 0.7118515713744794, "grad_norm": 10.604666709899902, "learning_rate": 3.87955170412822e-05, "loss": 1.3017, "step": 1880 }, { "epoch": 0.7122302158273381, "grad_norm": 10.961071968078613, "learning_rate": 3.870077110480018e-05, "loss": 1.4312, "step": 1881 }, { "epoch": 0.7126088602801969, "grad_norm": 13.234198570251465, "learning_rate": 3.8606113236831054e-05, "loss": 1.3651, "step": 1882 }, { "epoch": 0.7129875047330556, "grad_norm": 11.261490821838379, "learning_rate": 3.8511543573369616e-05, "loss": 1.4866, "step": 1883 }, { "epoch": 0.7133661491859145, "grad_norm": 11.345348358154297, "learning_rate": 3.841706225028392e-05, "loss": 0.9801, "step": 1884 }, { "epoch": 0.7137447936387732, "grad_norm": 13.572781562805176, "learning_rate": 3.8322669403315246e-05, "loss": 1.1718, "step": 1885 }, { "epoch": 0.7141234380916319, "grad_norm": 11.878171920776367, "learning_rate": 3.822836516807762e-05, "loss": 1.163, "step": 1886 }, { "epoch": 0.7145020825444908, "grad_norm": 12.007922172546387, "learning_rate": 3.8134149680057775e-05, "loss": 1.1351, "step": 1887 }, { "epoch": 0.7148807269973495, "grad_norm": 13.84189510345459, "learning_rate": 3.804002307461495e-05, "loss": 1.2838, "step": 1888 }, { "epoch": 0.7152593714502082, "grad_norm": 9.613104820251465, "learning_rate": 3.794598548698076e-05, "loss": 0.7941, "step": 1889 }, { "epoch": 0.715638015903067, "grad_norm": 15.930800437927246, "learning_rate": 3.785203705225886e-05, "loss": 1.2321, "step": 1890 }, { "epoch": 0.7160166603559258, "grad_norm": 10.889845848083496, "learning_rate": 3.7758177905424794e-05, "loss": 0.924, "step": 1891 }, { "epoch": 0.7163953048087845, "grad_norm": 11.394916534423828, "learning_rate": 3.766440818132586e-05, "loss": 0.898, "step": 1892 }, { "epoch": 0.7167739492616433, "grad_norm": 12.949947357177734, "learning_rate": 3.757072801468092e-05, "loss": 0.9169, "step": 1893 }, { "epoch": 0.7171525937145021, "grad_norm": 11.941484451293945, "learning_rate": 3.747713754008013e-05, "loss": 0.9864, "step": 1894 }, { "epoch": 0.7175312381673609, "grad_norm": 15.34373664855957, "learning_rate": 3.738363689198477e-05, "loss": 0.876, "step": 1895 }, { "epoch": 0.7179098826202196, "grad_norm": 10.606496810913086, "learning_rate": 3.7290226204727066e-05, "loss": 0.5997, "step": 1896 }, { "epoch": 0.7182885270730783, "grad_norm": 11.646666526794434, "learning_rate": 3.7196905612510066e-05, "loss": 0.677, "step": 1897 }, { "epoch": 0.7186671715259372, "grad_norm": 13.844237327575684, "learning_rate": 3.710367524940731e-05, "loss": 0.6662, "step": 1898 }, { "epoch": 0.7190458159787959, "grad_norm": 7.377936363220215, "learning_rate": 3.701053524936271e-05, "loss": 0.263, "step": 1899 }, { "epoch": 0.7194244604316546, "grad_norm": 10.436668395996094, "learning_rate": 3.691748574619038e-05, "loss": 0.5728, "step": 1900 }, { "epoch": 0.7198031048845135, "grad_norm": 9.430604934692383, "learning_rate": 3.6824526873574403e-05, "loss": 2.504, "step": 1901 }, { "epoch": 0.7201817493373722, "grad_norm": 13.745413780212402, "learning_rate": 3.673165876506862e-05, "loss": 2.1829, "step": 1902 }, { "epoch": 0.720560393790231, "grad_norm": 9.768301963806152, "learning_rate": 3.663888155409657e-05, "loss": 1.6069, "step": 1903 }, { "epoch": 0.7209390382430897, "grad_norm": 11.978598594665527, "learning_rate": 3.654619537395112e-05, "loss": 1.9238, "step": 1904 }, { "epoch": 0.7213176826959485, "grad_norm": 11.131295204162598, "learning_rate": 3.645360035779436e-05, "loss": 1.5727, "step": 1905 }, { "epoch": 0.7216963271488073, "grad_norm": 10.731742858886719, "learning_rate": 3.6361096638657396e-05, "loss": 1.2462, "step": 1906 }, { "epoch": 0.722074971601666, "grad_norm": 11.437942504882812, "learning_rate": 3.626868434944023e-05, "loss": 1.1258, "step": 1907 }, { "epoch": 0.7224536160545248, "grad_norm": 13.103399276733398, "learning_rate": 3.617636362291139e-05, "loss": 1.6088, "step": 1908 }, { "epoch": 0.7228322605073836, "grad_norm": 10.438884735107422, "learning_rate": 3.6084134591708007e-05, "loss": 1.2029, "step": 1909 }, { "epoch": 0.7232109049602423, "grad_norm": 11.234047889709473, "learning_rate": 3.5991997388335376e-05, "loss": 0.9769, "step": 1910 }, { "epoch": 0.723589549413101, "grad_norm": 16.857528686523438, "learning_rate": 3.589995214516687e-05, "loss": 0.9308, "step": 1911 }, { "epoch": 0.7239681938659599, "grad_norm": 10.424492835998535, "learning_rate": 3.5807998994443725e-05, "loss": 1.0171, "step": 1912 }, { "epoch": 0.7243468383188186, "grad_norm": 11.165146827697754, "learning_rate": 3.571613806827496e-05, "loss": 0.9691, "step": 1913 }, { "epoch": 0.7247254827716774, "grad_norm": 14.247552871704102, "learning_rate": 3.562436949863702e-05, "loss": 1.3855, "step": 1914 }, { "epoch": 0.7251041272245362, "grad_norm": 20.06420135498047, "learning_rate": 3.5532693417373656e-05, "loss": 0.8901, "step": 1915 }, { "epoch": 0.7254827716773949, "grad_norm": 13.092787742614746, "learning_rate": 3.544110995619573e-05, "loss": 1.186, "step": 1916 }, { "epoch": 0.7258614161302537, "grad_norm": 11.927745819091797, "learning_rate": 3.534961924668113e-05, "loss": 0.7738, "step": 1917 }, { "epoch": 0.7262400605831124, "grad_norm": 12.455108642578125, "learning_rate": 3.525822142027441e-05, "loss": 0.7835, "step": 1918 }, { "epoch": 0.7266187050359713, "grad_norm": 14.28428840637207, "learning_rate": 3.516691660828668e-05, "loss": 0.7465, "step": 1919 }, { "epoch": 0.72699734948883, "grad_norm": 14.431679725646973, "learning_rate": 3.5075704941895404e-05, "loss": 1.0906, "step": 1920 }, { "epoch": 0.7273759939416887, "grad_norm": 11.988348007202148, "learning_rate": 3.498458655214431e-05, "loss": 0.5688, "step": 1921 }, { "epoch": 0.7277546383945476, "grad_norm": 16.642642974853516, "learning_rate": 3.489356156994301e-05, "loss": 1.0525, "step": 1922 }, { "epoch": 0.7281332828474063, "grad_norm": 14.775089263916016, "learning_rate": 3.4802630126067003e-05, "loss": 0.8226, "step": 1923 }, { "epoch": 0.728511927300265, "grad_norm": 12.773204803466797, "learning_rate": 3.471179235115729e-05, "loss": 0.49, "step": 1924 }, { "epoch": 0.7288905717531238, "grad_norm": 25.836416244506836, "learning_rate": 3.4621048375720455e-05, "loss": 1.1491, "step": 1925 }, { "epoch": 0.7292692162059826, "grad_norm": 10.431486129760742, "learning_rate": 3.453039833012819e-05, "loss": 2.532, "step": 1926 }, { "epoch": 0.7296478606588414, "grad_norm": 10.91063404083252, "learning_rate": 3.44398423446173e-05, "loss": 2.2555, "step": 1927 }, { "epoch": 0.7300265051117001, "grad_norm": 12.758307456970215, "learning_rate": 3.4349380549289386e-05, "loss": 2.5956, "step": 1928 }, { "epoch": 0.7304051495645589, "grad_norm": 14.680657386779785, "learning_rate": 3.425901307411087e-05, "loss": 2.2177, "step": 1929 }, { "epoch": 0.7307837940174177, "grad_norm": 10.425040245056152, "learning_rate": 3.416874004891252e-05, "loss": 1.573, "step": 1930 }, { "epoch": 0.7311624384702764, "grad_norm": 10.728297233581543, "learning_rate": 3.4078561603389445e-05, "loss": 1.2583, "step": 1931 }, { "epoch": 0.7315410829231351, "grad_norm": 14.865288734436035, "learning_rate": 3.3988477867100884e-05, "loss": 1.6658, "step": 1932 }, { "epoch": 0.731919727375994, "grad_norm": 10.989377975463867, "learning_rate": 3.389848896947006e-05, "loss": 1.4482, "step": 1933 }, { "epoch": 0.7322983718288527, "grad_norm": 13.664108276367188, "learning_rate": 3.3808595039783863e-05, "loss": 1.2909, "step": 1934 }, { "epoch": 0.7326770162817114, "grad_norm": 16.50423812866211, "learning_rate": 3.371879620719276e-05, "loss": 1.8106, "step": 1935 }, { "epoch": 0.7330556607345703, "grad_norm": 9.069459915161133, "learning_rate": 3.362909260071059e-05, "loss": 0.9974, "step": 1936 }, { "epoch": 0.733434305187429, "grad_norm": 12.695276260375977, "learning_rate": 3.3539484349214434e-05, "loss": 1.4379, "step": 1937 }, { "epoch": 0.7338129496402878, "grad_norm": 12.206181526184082, "learning_rate": 3.344997158144433e-05, "loss": 0.9946, "step": 1938 }, { "epoch": 0.7341915940931465, "grad_norm": 9.503734588623047, "learning_rate": 3.336055442600312e-05, "loss": 0.809, "step": 1939 }, { "epoch": 0.7345702385460053, "grad_norm": 9.835858345031738, "learning_rate": 3.327123301135627e-05, "loss": 0.9489, "step": 1940 }, { "epoch": 0.7349488829988641, "grad_norm": 11.248756408691406, "learning_rate": 3.318200746583182e-05, "loss": 0.9913, "step": 1941 }, { "epoch": 0.7353275274517228, "grad_norm": 11.77039623260498, "learning_rate": 3.3092877917619916e-05, "loss": 0.8691, "step": 1942 }, { "epoch": 0.7357061719045817, "grad_norm": 12.531624794006348, "learning_rate": 3.300384449477286e-05, "loss": 0.8603, "step": 1943 }, { "epoch": 0.7360848163574404, "grad_norm": 10.96230411529541, "learning_rate": 3.29149073252048e-05, "loss": 0.6922, "step": 1944 }, { "epoch": 0.7364634608102991, "grad_norm": 7.6564836502075195, "learning_rate": 3.282606653669174e-05, "loss": 0.4803, "step": 1945 }, { "epoch": 0.7368421052631579, "grad_norm": 16.544363021850586, "learning_rate": 3.273732225687103e-05, "loss": 1.2175, "step": 1946 }, { "epoch": 0.7372207497160167, "grad_norm": 9.774057388305664, "learning_rate": 3.264867461324147e-05, "loss": 0.3528, "step": 1947 }, { "epoch": 0.7375993941688754, "grad_norm": 8.952574729919434, "learning_rate": 3.2560123733163004e-05, "loss": 0.498, "step": 1948 }, { "epoch": 0.7379780386217342, "grad_norm": 16.56863021850586, "learning_rate": 3.2471669743856546e-05, "loss": 0.6109, "step": 1949 }, { "epoch": 0.738356683074593, "grad_norm": 18.624706268310547, "learning_rate": 3.2383312772403786e-05, "loss": 0.5395, "step": 1950 }, { "epoch": 0.7387353275274517, "grad_norm": 9.164006233215332, "learning_rate": 3.229505294574712e-05, "loss": 2.4068, "step": 1951 }, { "epoch": 0.7391139719803105, "grad_norm": 10.482575416564941, "learning_rate": 3.220689039068927e-05, "loss": 1.691, "step": 1952 }, { "epoch": 0.7394926164331692, "grad_norm": 10.619009017944336, "learning_rate": 3.211882523389327e-05, "loss": 1.463, "step": 1953 }, { "epoch": 0.7398712608860281, "grad_norm": 12.097020149230957, "learning_rate": 3.203085760188219e-05, "loss": 1.7848, "step": 1954 }, { "epoch": 0.7402499053388868, "grad_norm": 10.063963890075684, "learning_rate": 3.194298762103899e-05, "loss": 1.3469, "step": 1955 }, { "epoch": 0.7406285497917455, "grad_norm": 9.453032493591309, "learning_rate": 3.185521541760633e-05, "loss": 1.0812, "step": 1956 }, { "epoch": 0.7410071942446043, "grad_norm": 11.660839080810547, "learning_rate": 3.176754111768646e-05, "loss": 1.0648, "step": 1957 }, { "epoch": 0.7413858386974631, "grad_norm": 13.103858947753906, "learning_rate": 3.1679964847240894e-05, "loss": 1.5599, "step": 1958 }, { "epoch": 0.7417644831503218, "grad_norm": 12.977222442626953, "learning_rate": 3.159248673209032e-05, "loss": 0.8717, "step": 1959 }, { "epoch": 0.7421431276031806, "grad_norm": 11.061594009399414, "learning_rate": 3.150510689791439e-05, "loss": 1.1983, "step": 1960 }, { "epoch": 0.7425217720560394, "grad_norm": 12.495918273925781, "learning_rate": 3.141782547025167e-05, "loss": 0.9424, "step": 1961 }, { "epoch": 0.7429004165088982, "grad_norm": 11.077157974243164, "learning_rate": 3.1330642574499205e-05, "loss": 1.2254, "step": 1962 }, { "epoch": 0.7432790609617569, "grad_norm": 13.865285873413086, "learning_rate": 3.124355833591252e-05, "loss": 1.3142, "step": 1963 }, { "epoch": 0.7436577054146156, "grad_norm": 11.34232234954834, "learning_rate": 3.1156572879605426e-05, "loss": 1.073, "step": 1964 }, { "epoch": 0.7440363498674745, "grad_norm": 12.628717422485352, "learning_rate": 3.1069686330549844e-05, "loss": 0.8286, "step": 1965 }, { "epoch": 0.7444149943203332, "grad_norm": 12.249566078186035, "learning_rate": 3.09828988135755e-05, "loss": 0.8404, "step": 1966 }, { "epoch": 0.7447936387731919, "grad_norm": 13.016851425170898, "learning_rate": 3.0896210453369924e-05, "loss": 0.7903, "step": 1967 }, { "epoch": 0.7451722832260508, "grad_norm": 11.74942684173584, "learning_rate": 3.0809621374478106e-05, "loss": 0.6621, "step": 1968 }, { "epoch": 0.7455509276789095, "grad_norm": 12.12247371673584, "learning_rate": 3.072313170130251e-05, "loss": 1.0012, "step": 1969 }, { "epoch": 0.7459295721317682, "grad_norm": 7.160279750823975, "learning_rate": 3.063674155810271e-05, "loss": 0.3738, "step": 1970 }, { "epoch": 0.746308216584627, "grad_norm": 11.304152488708496, "learning_rate": 3.055045106899529e-05, "loss": 0.5173, "step": 1971 }, { "epoch": 0.7466868610374858, "grad_norm": 13.208447456359863, "learning_rate": 3.0464260357953643e-05, "loss": 0.756, "step": 1972 }, { "epoch": 0.7470655054903446, "grad_norm": 16.926542282104492, "learning_rate": 3.0378169548807888e-05, "loss": 0.5975, "step": 1973 }, { "epoch": 0.7474441499432033, "grad_norm": 10.966843605041504, "learning_rate": 3.029217876524455e-05, "loss": 0.461, "step": 1974 }, { "epoch": 0.7478227943960621, "grad_norm": 14.193634033203125, "learning_rate": 3.0206288130806447e-05, "loss": 0.9746, "step": 1975 }, { "epoch": 0.7482014388489209, "grad_norm": 9.383747100830078, "learning_rate": 3.0120497768892507e-05, "loss": 2.0945, "step": 1976 }, { "epoch": 0.7485800833017796, "grad_norm": 11.682758331298828, "learning_rate": 3.003480780275768e-05, "loss": 2.2888, "step": 1977 }, { "epoch": 0.7489587277546383, "grad_norm": 11.035470962524414, "learning_rate": 2.9949218355512586e-05, "loss": 1.5237, "step": 1978 }, { "epoch": 0.7493373722074972, "grad_norm": 12.306325912475586, "learning_rate": 2.9863729550123443e-05, "loss": 1.3262, "step": 1979 }, { "epoch": 0.7497160166603559, "grad_norm": 10.777233123779297, "learning_rate": 2.977834150941189e-05, "loss": 1.7051, "step": 1980 }, { "epoch": 0.7500946611132147, "grad_norm": 12.063309669494629, "learning_rate": 2.969305435605484e-05, "loss": 1.1838, "step": 1981 }, { "epoch": 0.7504733055660735, "grad_norm": 11.307486534118652, "learning_rate": 2.96078682125842e-05, "loss": 1.335, "step": 1982 }, { "epoch": 0.7508519500189322, "grad_norm": 10.423018455505371, "learning_rate": 2.9522783201386774e-05, "loss": 1.0347, "step": 1983 }, { "epoch": 0.7508519500189322, "eval_loss": 0.123275026679039, "eval_runtime": 897.0644, "eval_samples_per_second": 4.958, "eval_steps_per_second": 1.24, "step": 1983 }, { "epoch": 0.751230594471791, "grad_norm": 12.873611450195312, "learning_rate": 2.943779944470404e-05, "loss": 1.5529, "step": 1984 }, { "epoch": 0.7516092389246497, "grad_norm": 11.92525577545166, "learning_rate": 2.9352917064632112e-05, "loss": 0.7601, "step": 1985 }, { "epoch": 0.7519878833775085, "grad_norm": 12.782543182373047, "learning_rate": 2.926813618312134e-05, "loss": 1.086, "step": 1986 }, { "epoch": 0.7523665278303673, "grad_norm": 10.654260635375977, "learning_rate": 2.9183456921976304e-05, "loss": 1.0646, "step": 1987 }, { "epoch": 0.752745172283226, "grad_norm": 11.402060508728027, "learning_rate": 2.909887940285554e-05, "loss": 0.82, "step": 1988 }, { "epoch": 0.7531238167360849, "grad_norm": 15.131863594055176, "learning_rate": 2.901440374727149e-05, "loss": 1.0551, "step": 1989 }, { "epoch": 0.7535024611889436, "grad_norm": 12.485123634338379, "learning_rate": 2.8930030076590198e-05, "loss": 0.7528, "step": 1990 }, { "epoch": 0.7538811056418023, "grad_norm": 7.999950885772705, "learning_rate": 2.8845758512031186e-05, "loss": 0.4973, "step": 1991 }, { "epoch": 0.7542597500946611, "grad_norm": 13.8598051071167, "learning_rate": 2.876158917466726e-05, "loss": 0.8543, "step": 1992 }, { "epoch": 0.7546383945475199, "grad_norm": 13.514735221862793, "learning_rate": 2.867752218542443e-05, "loss": 0.9004, "step": 1993 }, { "epoch": 0.7550170390003786, "grad_norm": 13.104480743408203, "learning_rate": 2.8593557665081616e-05, "loss": 0.8655, "step": 1994 }, { "epoch": 0.7553956834532374, "grad_norm": 13.811482429504395, "learning_rate": 2.8509695734270492e-05, "loss": 0.5821, "step": 1995 }, { "epoch": 0.7557743279060962, "grad_norm": 13.989906311035156, "learning_rate": 2.8425936513475395e-05, "loss": 0.6422, "step": 1996 }, { "epoch": 0.756152972358955, "grad_norm": 10.727309226989746, "learning_rate": 2.834228012303306e-05, "loss": 0.4677, "step": 1997 }, { "epoch": 0.7565316168118137, "grad_norm": 15.157230377197266, "learning_rate": 2.8258726683132474e-05, "loss": 0.8738, "step": 1998 }, { "epoch": 0.7569102612646724, "grad_norm": 17.531665802001953, "learning_rate": 2.8175276313814813e-05, "loss": 0.7335, "step": 1999 }, { "epoch": 0.7572889057175313, "grad_norm": 10.896133422851562, "learning_rate": 2.809192913497306e-05, "loss": 0.5936, "step": 2000 }, { "epoch": 0.75766755017039, "grad_norm": 10.433149337768555, "learning_rate": 2.8008685266351988e-05, "loss": 2.4051, "step": 2001 }, { "epoch": 0.7580461946232487, "grad_norm": 9.504281044006348, "learning_rate": 2.7925544827547933e-05, "loss": 1.7767, "step": 2002 }, { "epoch": 0.7584248390761076, "grad_norm": 10.231252670288086, "learning_rate": 2.7842507938008666e-05, "loss": 1.8186, "step": 2003 }, { "epoch": 0.7588034835289663, "grad_norm": 9.87753677368164, "learning_rate": 2.775957471703311e-05, "loss": 1.2146, "step": 2004 }, { "epoch": 0.759182127981825, "grad_norm": 11.625948905944824, "learning_rate": 2.7676745283771388e-05, "loss": 1.5717, "step": 2005 }, { "epoch": 0.7595607724346838, "grad_norm": 11.169037818908691, "learning_rate": 2.7594019757224364e-05, "loss": 1.3143, "step": 2006 }, { "epoch": 0.7599394168875426, "grad_norm": 9.829212188720703, "learning_rate": 2.7511398256243716e-05, "loss": 0.9765, "step": 2007 }, { "epoch": 0.7603180613404014, "grad_norm": 10.827011108398438, "learning_rate": 2.7428880899531585e-05, "loss": 0.7852, "step": 2008 }, { "epoch": 0.7606967057932601, "grad_norm": 12.02198314666748, "learning_rate": 2.7346467805640585e-05, "loss": 1.0464, "step": 2009 }, { "epoch": 0.7610753502461189, "grad_norm": 12.59335708618164, "learning_rate": 2.7264159092973484e-05, "loss": 0.8952, "step": 2010 }, { "epoch": 0.7614539946989777, "grad_norm": 13.947036743164062, "learning_rate": 2.718195487978308e-05, "loss": 1.4524, "step": 2011 }, { "epoch": 0.7618326391518364, "grad_norm": 12.672119140625, "learning_rate": 2.7099855284172017e-05, "loss": 1.3051, "step": 2012 }, { "epoch": 0.7622112836046951, "grad_norm": 10.371209144592285, "learning_rate": 2.7017860424092712e-05, "loss": 0.7874, "step": 2013 }, { "epoch": 0.762589928057554, "grad_norm": 12.310194969177246, "learning_rate": 2.6935970417347057e-05, "loss": 0.9649, "step": 2014 }, { "epoch": 0.7629685725104127, "grad_norm": 10.61960506439209, "learning_rate": 2.6854185381586273e-05, "loss": 0.8684, "step": 2015 }, { "epoch": 0.7633472169632715, "grad_norm": 14.664178848266602, "learning_rate": 2.6772505434310803e-05, "loss": 1.0722, "step": 2016 }, { "epoch": 0.7637258614161303, "grad_norm": 8.189818382263184, "learning_rate": 2.6690930692870143e-05, "loss": 0.4872, "step": 2017 }, { "epoch": 0.764104505868989, "grad_norm": 11.075437545776367, "learning_rate": 2.6609461274462588e-05, "loss": 0.7672, "step": 2018 }, { "epoch": 0.7644831503218478, "grad_norm": 10.562186241149902, "learning_rate": 2.6528097296135135e-05, "loss": 0.727, "step": 2019 }, { "epoch": 0.7648617947747065, "grad_norm": 13.31433391571045, "learning_rate": 2.6446838874783254e-05, "loss": 0.7146, "step": 2020 }, { "epoch": 0.7652404392275653, "grad_norm": 10.966352462768555, "learning_rate": 2.636568612715087e-05, "loss": 0.3659, "step": 2021 }, { "epoch": 0.7656190836804241, "grad_norm": 23.072904586791992, "learning_rate": 2.6284639169829973e-05, "loss": 0.6862, "step": 2022 }, { "epoch": 0.7659977281332828, "grad_norm": 10.244979858398438, "learning_rate": 2.6203698119260632e-05, "loss": 0.3511, "step": 2023 }, { "epoch": 0.7663763725861417, "grad_norm": 11.128731727600098, "learning_rate": 2.6122863091730686e-05, "loss": 0.329, "step": 2024 }, { "epoch": 0.7667550170390004, "grad_norm": 34.793052673339844, "learning_rate": 2.6042134203375767e-05, "loss": 1.7021, "step": 2025 }, { "epoch": 0.7671336614918591, "grad_norm": 10.744704246520996, "learning_rate": 2.596151157017892e-05, "loss": 2.497, "step": 2026 }, { "epoch": 0.7675123059447179, "grad_norm": 11.219593048095703, "learning_rate": 2.588099530797058e-05, "loss": 2.192, "step": 2027 }, { "epoch": 0.7678909503975767, "grad_norm": 10.790613174438477, "learning_rate": 2.580058553242829e-05, "loss": 1.1766, "step": 2028 }, { "epoch": 0.7682695948504354, "grad_norm": 9.38663101196289, "learning_rate": 2.572028235907673e-05, "loss": 1.0091, "step": 2029 }, { "epoch": 0.7686482393032942, "grad_norm": 12.048510551452637, "learning_rate": 2.5640085903287313e-05, "loss": 1.4117, "step": 2030 }, { "epoch": 0.769026883756153, "grad_norm": 14.081696510314941, "learning_rate": 2.5559996280278196e-05, "loss": 1.3464, "step": 2031 }, { "epoch": 0.7694055282090118, "grad_norm": 12.886058807373047, "learning_rate": 2.548001360511396e-05, "loss": 1.6561, "step": 2032 }, { "epoch": 0.7697841726618705, "grad_norm": 10.942986488342285, "learning_rate": 2.5400137992705686e-05, "loss": 1.01, "step": 2033 }, { "epoch": 0.7701628171147292, "grad_norm": 10.077105522155762, "learning_rate": 2.5320369557810496e-05, "loss": 0.8631, "step": 2034 }, { "epoch": 0.7705414615675881, "grad_norm": 16.22908592224121, "learning_rate": 2.52407084150316e-05, "loss": 2.036, "step": 2035 }, { "epoch": 0.7709201060204468, "grad_norm": 12.806578636169434, "learning_rate": 2.516115467881801e-05, "loss": 1.5009, "step": 2036 }, { "epoch": 0.7712987504733055, "grad_norm": 8.642051696777344, "learning_rate": 2.5081708463464525e-05, "loss": 0.5311, "step": 2037 }, { "epoch": 0.7716773949261644, "grad_norm": 9.904001235961914, "learning_rate": 2.5002369883111375e-05, "loss": 0.8588, "step": 2038 }, { "epoch": 0.7720560393790231, "grad_norm": 11.078864097595215, "learning_rate": 2.492313905174418e-05, "loss": 0.6772, "step": 2039 }, { "epoch": 0.7724346838318819, "grad_norm": 14.320399284362793, "learning_rate": 2.4844016083193745e-05, "loss": 1.6373, "step": 2040 }, { "epoch": 0.7728133282847406, "grad_norm": 12.595783233642578, "learning_rate": 2.4765001091135965e-05, "loss": 0.7032, "step": 2041 }, { "epoch": 0.7731919727375994, "grad_norm": 11.836488723754883, "learning_rate": 2.4686094189091548e-05, "loss": 0.4463, "step": 2042 }, { "epoch": 0.7735706171904582, "grad_norm": 9.597789764404297, "learning_rate": 2.460729549042592e-05, "loss": 0.4927, "step": 2043 }, { "epoch": 0.7739492616433169, "grad_norm": 8.733850479125977, "learning_rate": 2.4528605108349044e-05, "loss": 0.4759, "step": 2044 }, { "epoch": 0.7743279060961757, "grad_norm": 11.219993591308594, "learning_rate": 2.4450023155915304e-05, "loss": 0.7328, "step": 2045 }, { "epoch": 0.7747065505490345, "grad_norm": 27.845932006835938, "learning_rate": 2.4371549746023214e-05, "loss": 1.4339, "step": 2046 }, { "epoch": 0.7750851950018932, "grad_norm": 10.190560340881348, "learning_rate": 2.4293184991415496e-05, "loss": 0.3212, "step": 2047 }, { "epoch": 0.775463839454752, "grad_norm": 16.55055809020996, "learning_rate": 2.4214929004678644e-05, "loss": 0.726, "step": 2048 }, { "epoch": 0.7758424839076108, "grad_norm": 19.725399017333984, "learning_rate": 2.41367818982429e-05, "loss": 0.6447, "step": 2049 }, { "epoch": 0.7762211283604695, "grad_norm": 3.192269802093506, "learning_rate": 2.405874378438212e-05, "loss": 0.1195, "step": 2050 }, { "epoch": 0.7765997728133283, "grad_norm": 9.078378677368164, "learning_rate": 2.3980814775213546e-05, "loss": 2.2485, "step": 2051 }, { "epoch": 0.7769784172661871, "grad_norm": 12.071084976196289, "learning_rate": 2.3902994982697625e-05, "loss": 2.0394, "step": 2052 }, { "epoch": 0.7773570617190458, "grad_norm": 11.375093460083008, "learning_rate": 2.3825284518638026e-05, "loss": 1.6176, "step": 2053 }, { "epoch": 0.7777357061719046, "grad_norm": 10.902304649353027, "learning_rate": 2.3747683494681193e-05, "loss": 1.3598, "step": 2054 }, { "epoch": 0.7781143506247633, "grad_norm": 9.987029075622559, "learning_rate": 2.367019202231644e-05, "loss": 0.9933, "step": 2055 }, { "epoch": 0.7784929950776222, "grad_norm": 12.458052635192871, "learning_rate": 2.3592810212875615e-05, "loss": 1.0712, "step": 2056 }, { "epoch": 0.7788716395304809, "grad_norm": 13.190988540649414, "learning_rate": 2.351553817753309e-05, "loss": 0.9407, "step": 2057 }, { "epoch": 0.7792502839833396, "grad_norm": 13.746674537658691, "learning_rate": 2.3438376027305486e-05, "loss": 1.8128, "step": 2058 }, { "epoch": 0.7796289284361985, "grad_norm": 12.625304222106934, "learning_rate": 2.336132387305152e-05, "loss": 1.0643, "step": 2059 }, { "epoch": 0.7800075728890572, "grad_norm": 9.5991792678833, "learning_rate": 2.32843818254719e-05, "loss": 0.9507, "step": 2060 }, { "epoch": 0.7803862173419159, "grad_norm": 11.39356517791748, "learning_rate": 2.3207549995109213e-05, "loss": 0.8554, "step": 2061 }, { "epoch": 0.7807648617947747, "grad_norm": 10.822458267211914, "learning_rate": 2.3130828492347613e-05, "loss": 0.7323, "step": 2062 }, { "epoch": 0.7811435062476335, "grad_norm": 14.810583114624023, "learning_rate": 2.305421742741275e-05, "loss": 1.0549, "step": 2063 }, { "epoch": 0.7815221507004922, "grad_norm": 14.234593391418457, "learning_rate": 2.2977716910371617e-05, "loss": 0.8004, "step": 2064 }, { "epoch": 0.781900795153351, "grad_norm": 9.462289810180664, "learning_rate": 2.2901327051132436e-05, "loss": 0.6268, "step": 2065 }, { "epoch": 0.7822794396062097, "grad_norm": 9.984630584716797, "learning_rate": 2.2825047959444402e-05, "loss": 0.5759, "step": 2066 }, { "epoch": 0.7826580840590686, "grad_norm": 8.960870742797852, "learning_rate": 2.2748879744897566e-05, "loss": 0.478, "step": 2067 }, { "epoch": 0.7830367285119273, "grad_norm": 12.594315528869629, "learning_rate": 2.2672822516922664e-05, "loss": 0.7478, "step": 2068 }, { "epoch": 0.783415372964786, "grad_norm": 11.013339042663574, "learning_rate": 2.2596876384791044e-05, "loss": 0.4485, "step": 2069 }, { "epoch": 0.7837940174176449, "grad_norm": 14.870506286621094, "learning_rate": 2.25210414576144e-05, "loss": 1.0851, "step": 2070 }, { "epoch": 0.7841726618705036, "grad_norm": 11.4357271194458, "learning_rate": 2.2445317844344648e-05, "loss": 0.6047, "step": 2071 }, { "epoch": 0.7845513063233623, "grad_norm": 14.853757858276367, "learning_rate": 2.2369705653773765e-05, "loss": 0.696, "step": 2072 }, { "epoch": 0.7849299507762211, "grad_norm": 19.04551124572754, "learning_rate": 2.2294204994533728e-05, "loss": 0.5838, "step": 2073 }, { "epoch": 0.7853085952290799, "grad_norm": 21.33306312561035, "learning_rate": 2.2218815975096207e-05, "loss": 0.232, "step": 2074 }, { "epoch": 0.7856872396819387, "grad_norm": 7.215750694274902, "learning_rate": 2.2143538703772493e-05, "loss": 0.3659, "step": 2075 }, { "epoch": 0.7860658841347974, "grad_norm": 10.023611068725586, "learning_rate": 2.2068373288713294e-05, "loss": 1.963, "step": 2076 }, { "epoch": 0.7864445285876562, "grad_norm": 11.151053428649902, "learning_rate": 2.1993319837908722e-05, "loss": 1.8424, "step": 2077 }, { "epoch": 0.786823173040515, "grad_norm": 10.198956489562988, "learning_rate": 2.1918378459187928e-05, "loss": 1.7327, "step": 2078 }, { "epoch": 0.7872018174933737, "grad_norm": 13.809074401855469, "learning_rate": 2.1843549260219075e-05, "loss": 1.5646, "step": 2079 }, { "epoch": 0.7875804619462324, "grad_norm": 10.254709243774414, "learning_rate": 2.176883234850914e-05, "loss": 0.9571, "step": 2080 }, { "epoch": 0.7879591063990913, "grad_norm": 9.651457786560059, "learning_rate": 2.1694227831403868e-05, "loss": 1.0734, "step": 2081 }, { "epoch": 0.78833775085195, "grad_norm": 12.739899635314941, "learning_rate": 2.1619735816087417e-05, "loss": 1.1163, "step": 2082 }, { "epoch": 0.7887163953048087, "grad_norm": 9.453640937805176, "learning_rate": 2.154535640958235e-05, "loss": 0.7136, "step": 2083 }, { "epoch": 0.7890950397576676, "grad_norm": 11.674582481384277, "learning_rate": 2.147108971874946e-05, "loss": 1.197, "step": 2084 }, { "epoch": 0.7894736842105263, "grad_norm": 11.435443878173828, "learning_rate": 2.1396935850287615e-05, "loss": 0.8756, "step": 2085 }, { "epoch": 0.7898523286633851, "grad_norm": 11.762517929077148, "learning_rate": 2.1322894910733547e-05, "loss": 0.8406, "step": 2086 }, { "epoch": 0.7902309731162438, "grad_norm": 11.67284107208252, "learning_rate": 2.1248967006461783e-05, "loss": 1.0792, "step": 2087 }, { "epoch": 0.7906096175691026, "grad_norm": 11.377511978149414, "learning_rate": 2.117515224368438e-05, "loss": 0.8553, "step": 2088 }, { "epoch": 0.7909882620219614, "grad_norm": 13.04310131072998, "learning_rate": 2.110145072845099e-05, "loss": 0.9179, "step": 2089 }, { "epoch": 0.7913669064748201, "grad_norm": 8.919587135314941, "learning_rate": 2.1027862566648425e-05, "loss": 0.415, "step": 2090 }, { "epoch": 0.791745550927679, "grad_norm": 14.16692066192627, "learning_rate": 2.095438786400068e-05, "loss": 0.6586, "step": 2091 }, { "epoch": 0.7921241953805377, "grad_norm": 9.801429748535156, "learning_rate": 2.0881026726068775e-05, "loss": 0.5952, "step": 2092 }, { "epoch": 0.7925028398333964, "grad_norm": 14.237808227539062, "learning_rate": 2.0807779258250537e-05, "loss": 0.5729, "step": 2093 }, { "epoch": 0.7928814842862552, "grad_norm": 10.763073921203613, "learning_rate": 2.073464556578051e-05, "loss": 0.634, "step": 2094 }, { "epoch": 0.793260128739114, "grad_norm": 14.481758117675781, "learning_rate": 2.0661625753729707e-05, "loss": 0.6255, "step": 2095 }, { "epoch": 0.7936387731919727, "grad_norm": 12.738697052001953, "learning_rate": 2.058871992700567e-05, "loss": 0.4656, "step": 2096 }, { "epoch": 0.7940174176448315, "grad_norm": 9.8129243850708, "learning_rate": 2.0515928190352052e-05, "loss": 0.352, "step": 2097 }, { "epoch": 0.7943960620976903, "grad_norm": 7.001706123352051, "learning_rate": 2.0443250648348645e-05, "loss": 0.1885, "step": 2098 }, { "epoch": 0.794774706550549, "grad_norm": 10.005912780761719, "learning_rate": 2.037068740541116e-05, "loss": 0.4027, "step": 2099 }, { "epoch": 0.7951533510034078, "grad_norm": 29.922603607177734, "learning_rate": 2.0298238565791072e-05, "loss": 0.9724, "step": 2100 }, { "epoch": 0.7955319954562665, "grad_norm": 9.894120216369629, "learning_rate": 2.0225904233575586e-05, "loss": 2.2812, "step": 2101 }, { "epoch": 0.7959106399091254, "grad_norm": 10.518170356750488, "learning_rate": 2.0153684512687297e-05, "loss": 1.9124, "step": 2102 }, { "epoch": 0.7962892843619841, "grad_norm": 11.77073860168457, "learning_rate": 2.0081579506884184e-05, "loss": 1.714, "step": 2103 }, { "epoch": 0.7966679288148428, "grad_norm": 10.411770820617676, "learning_rate": 2.0009589319759358e-05, "loss": 1.084, "step": 2104 }, { "epoch": 0.7970465732677017, "grad_norm": 10.7216796875, "learning_rate": 1.9937714054741095e-05, "loss": 1.2028, "step": 2105 }, { "epoch": 0.7974252177205604, "grad_norm": 12.313328742980957, "learning_rate": 1.9865953815092443e-05, "loss": 1.3527, "step": 2106 }, { "epoch": 0.7978038621734191, "grad_norm": 11.44083023071289, "learning_rate": 1.9794308703911223e-05, "loss": 1.308, "step": 2107 }, { "epoch": 0.7981825066262779, "grad_norm": 7.195469856262207, "learning_rate": 1.972277882412985e-05, "loss": 0.6015, "step": 2108 }, { "epoch": 0.7985611510791367, "grad_norm": 9.268226623535156, "learning_rate": 1.965136427851525e-05, "loss": 0.713, "step": 2109 }, { "epoch": 0.7989397955319955, "grad_norm": 11.629197120666504, "learning_rate": 1.958006516966857e-05, "loss": 1.1342, "step": 2110 }, { "epoch": 0.7993184399848542, "grad_norm": 9.284026145935059, "learning_rate": 1.950888160002513e-05, "loss": 0.652, "step": 2111 }, { "epoch": 0.799697084437713, "grad_norm": 11.595834732055664, "learning_rate": 1.9437813671854243e-05, "loss": 0.7744, "step": 2112 }, { "epoch": 0.8000757288905718, "grad_norm": 12.920877456665039, "learning_rate": 1.9366861487259134e-05, "loss": 0.7837, "step": 2113 }, { "epoch": 0.8004543733434305, "grad_norm": 11.606154441833496, "learning_rate": 1.92960251481767e-05, "loss": 0.7631, "step": 2114 }, { "epoch": 0.8008330177962892, "grad_norm": 11.07241153717041, "learning_rate": 1.9225304756377394e-05, "loss": 0.6119, "step": 2115 }, { "epoch": 0.8012116622491481, "grad_norm": 9.720746994018555, "learning_rate": 1.9154700413465077e-05, "loss": 0.5343, "step": 2116 }, { "epoch": 0.8015903067020068, "grad_norm": 13.459938049316406, "learning_rate": 1.9084212220876942e-05, "loss": 0.8311, "step": 2117 }, { "epoch": 0.8019689511548656, "grad_norm": 19.92449188232422, "learning_rate": 1.9013840279883267e-05, "loss": 1.5106, "step": 2118 }, { "epoch": 0.8023475956077244, "grad_norm": 12.470542907714844, "learning_rate": 1.8943584691587313e-05, "loss": 0.5453, "step": 2119 }, { "epoch": 0.8027262400605831, "grad_norm": 11.011085510253906, "learning_rate": 1.887344555692515e-05, "loss": 0.9123, "step": 2120 }, { "epoch": 0.8031048845134419, "grad_norm": 9.609660148620605, "learning_rate": 1.880342297666563e-05, "loss": 0.3907, "step": 2121 }, { "epoch": 0.8034835289663006, "grad_norm": 18.808673858642578, "learning_rate": 1.8733517051410054e-05, "loss": 0.7887, "step": 2122 }, { "epoch": 0.8038621734191594, "grad_norm": 11.743169784545898, "learning_rate": 1.8663727881592176e-05, "loss": 0.6305, "step": 2123 }, { "epoch": 0.8042408178720182, "grad_norm": 15.70638656616211, "learning_rate": 1.8594055567477965e-05, "loss": 0.2316, "step": 2124 }, { "epoch": 0.8046194623248769, "grad_norm": 18.647750854492188, "learning_rate": 1.8524500209165573e-05, "loss": 0.2247, "step": 2125 }, { "epoch": 0.8049981067777358, "grad_norm": 11.47064208984375, "learning_rate": 1.8455061906585068e-05, "loss": 2.2448, "step": 2126 }, { "epoch": 0.8053767512305945, "grad_norm": 11.74728775024414, "learning_rate": 1.838574075949836e-05, "loss": 1.9097, "step": 2127 }, { "epoch": 0.8057553956834532, "grad_norm": 10.916732788085938, "learning_rate": 1.8316536867499013e-05, "loss": 1.5175, "step": 2128 }, { "epoch": 0.806134040136312, "grad_norm": 12.84622573852539, "learning_rate": 1.8247450330012206e-05, "loss": 1.2506, "step": 2129 }, { "epoch": 0.8065126845891708, "grad_norm": 11.161234855651855, "learning_rate": 1.8178481246294433e-05, "loss": 1.2847, "step": 2130 }, { "epoch": 0.8068913290420295, "grad_norm": 11.394064903259277, "learning_rate": 1.8109629715433497e-05, "loss": 1.2467, "step": 2131 }, { "epoch": 0.8072699734948883, "grad_norm": 14.243141174316406, "learning_rate": 1.804089583634825e-05, "loss": 1.1874, "step": 2132 }, { "epoch": 0.8076486179477471, "grad_norm": 9.28836727142334, "learning_rate": 1.7972279707788608e-05, "loss": 0.9387, "step": 2133 }, { "epoch": 0.8080272624006059, "grad_norm": 8.111406326293945, "learning_rate": 1.790378142833524e-05, "loss": 0.538, "step": 2134 }, { "epoch": 0.8084059068534646, "grad_norm": 12.397017478942871, "learning_rate": 1.783540109639953e-05, "loss": 0.8542, "step": 2135 }, { "epoch": 0.8087845513063233, "grad_norm": 13.441441535949707, "learning_rate": 1.776713881022337e-05, "loss": 0.6562, "step": 2136 }, { "epoch": 0.8091631957591822, "grad_norm": 15.309789657592773, "learning_rate": 1.769899466787913e-05, "loss": 1.038, "step": 2137 }, { "epoch": 0.8095418402120409, "grad_norm": 9.063993453979492, "learning_rate": 1.7630968767269396e-05, "loss": 0.6075, "step": 2138 }, { "epoch": 0.8099204846648996, "grad_norm": 13.524983406066895, "learning_rate": 1.7563061206126875e-05, "loss": 1.1014, "step": 2139 }, { "epoch": 0.8102991291177585, "grad_norm": 10.966734886169434, "learning_rate": 1.7495272082014235e-05, "loss": 0.633, "step": 2140 }, { "epoch": 0.8106777735706172, "grad_norm": 13.029152870178223, "learning_rate": 1.742760149232404e-05, "loss": 0.7217, "step": 2141 }, { "epoch": 0.811056418023476, "grad_norm": 10.429238319396973, "learning_rate": 1.736004953427852e-05, "loss": 0.7237, "step": 2142 }, { "epoch": 0.8114350624763347, "grad_norm": 10.06002140045166, "learning_rate": 1.7292616304929454e-05, "loss": 0.5242, "step": 2143 }, { "epoch": 0.8118137069291935, "grad_norm": 12.623034477233887, "learning_rate": 1.7225301901158097e-05, "loss": 0.679, "step": 2144 }, { "epoch": 0.8121923513820523, "grad_norm": 11.331562995910645, "learning_rate": 1.7158106419674956e-05, "loss": 0.5334, "step": 2145 }, { "epoch": 0.812570995834911, "grad_norm": 5.492001533508301, "learning_rate": 1.7091029957019656e-05, "loss": 0.2017, "step": 2146 }, { "epoch": 0.8129496402877698, "grad_norm": 15.369951248168945, "learning_rate": 1.702407260956087e-05, "loss": 0.6085, "step": 2147 }, { "epoch": 0.8133282847406286, "grad_norm": 14.828356742858887, "learning_rate": 1.6957234473496087e-05, "loss": 0.5092, "step": 2148 }, { "epoch": 0.8137069291934873, "grad_norm": 9.543940544128418, "learning_rate": 1.6890515644851612e-05, "loss": 0.283, "step": 2149 }, { "epoch": 0.814085573646346, "grad_norm": 10.726631164550781, "learning_rate": 1.6823916219482273e-05, "loss": 0.5966, "step": 2150 }, { "epoch": 0.8144642180992049, "grad_norm": 8.761448860168457, "learning_rate": 1.6757436293071362e-05, "loss": 1.9309, "step": 2151 }, { "epoch": 0.8148428625520636, "grad_norm": 8.565874099731445, "learning_rate": 1.6691075961130452e-05, "loss": 1.2893, "step": 2152 }, { "epoch": 0.8152215070049224, "grad_norm": 9.82201099395752, "learning_rate": 1.662483531899941e-05, "loss": 1.4838, "step": 2153 }, { "epoch": 0.8156001514577812, "grad_norm": 14.917886734008789, "learning_rate": 1.6558714461846025e-05, "loss": 1.4838, "step": 2154 }, { "epoch": 0.8159787959106399, "grad_norm": 10.092191696166992, "learning_rate": 1.6492713484666057e-05, "loss": 1.0222, "step": 2155 }, { "epoch": 0.8163574403634987, "grad_norm": 10.887097358703613, "learning_rate": 1.6426832482282973e-05, "loss": 1.1809, "step": 2156 }, { "epoch": 0.8167360848163574, "grad_norm": 9.894757270812988, "learning_rate": 1.636107154934796e-05, "loss": 1.2049, "step": 2157 }, { "epoch": 0.8171147292692162, "grad_norm": 12.424516677856445, "learning_rate": 1.629543078033964e-05, "loss": 0.8421, "step": 2158 }, { "epoch": 0.817493373722075, "grad_norm": 8.398805618286133, "learning_rate": 1.622991026956401e-05, "loss": 0.7292, "step": 2159 }, { "epoch": 0.8178720181749337, "grad_norm": 7.698610305786133, "learning_rate": 1.616451011115426e-05, "loss": 0.6156, "step": 2160 }, { "epoch": 0.8182506626277926, "grad_norm": 12.355453491210938, "learning_rate": 1.6099230399070763e-05, "loss": 0.7348, "step": 2161 }, { "epoch": 0.8186293070806513, "grad_norm": 9.122222900390625, "learning_rate": 1.6034071227100755e-05, "loss": 0.6662, "step": 2162 }, { "epoch": 0.81900795153351, "grad_norm": 13.555700302124023, "learning_rate": 1.596903268885832e-05, "loss": 0.7399, "step": 2163 }, { "epoch": 0.8193865959863688, "grad_norm": 10.907391548156738, "learning_rate": 1.5904114877784205e-05, "loss": 0.5784, "step": 2164 }, { "epoch": 0.8197652404392276, "grad_norm": 10.284018516540527, "learning_rate": 1.5839317887145798e-05, "loss": 0.7392, "step": 2165 }, { "epoch": 0.8201438848920863, "grad_norm": 11.968799591064453, "learning_rate": 1.5774641810036793e-05, "loss": 0.776, "step": 2166 }, { "epoch": 0.8205225293449451, "grad_norm": 8.554612159729004, "learning_rate": 1.571008673937724e-05, "loss": 0.3263, "step": 2167 }, { "epoch": 0.8209011737978038, "grad_norm": 12.886919021606445, "learning_rate": 1.5645652767913287e-05, "loss": 0.9284, "step": 2168 }, { "epoch": 0.8212798182506627, "grad_norm": 13.11758041381836, "learning_rate": 1.5581339988217157e-05, "loss": 0.926, "step": 2169 }, { "epoch": 0.8216584627035214, "grad_norm": 7.769589900970459, "learning_rate": 1.5517148492686918e-05, "loss": 0.4201, "step": 2170 }, { "epoch": 0.8220371071563801, "grad_norm": 11.231391906738281, "learning_rate": 1.5453078373546405e-05, "loss": 0.4707, "step": 2171 }, { "epoch": 0.822415751609239, "grad_norm": 10.800396919250488, "learning_rate": 1.538912972284502e-05, "loss": 0.3822, "step": 2172 }, { "epoch": 0.8227943960620977, "grad_norm": 4.893832683563232, "learning_rate": 1.532530263245776e-05, "loss": 0.2115, "step": 2173 }, { "epoch": 0.8231730405149564, "grad_norm": 14.236945152282715, "learning_rate": 1.5261597194084876e-05, "loss": 0.3221, "step": 2174 }, { "epoch": 0.8235516849678152, "grad_norm": 26.767309188842773, "learning_rate": 1.5198013499251895e-05, "loss": 1.1176, "step": 2175 }, { "epoch": 0.823930329420674, "grad_norm": 10.102524757385254, "learning_rate": 1.513455163930938e-05, "loss": 2.0341, "step": 2176 }, { "epoch": 0.8243089738735327, "grad_norm": 10.047316551208496, "learning_rate": 1.5071211705432953e-05, "loss": 1.598, "step": 2177 }, { "epoch": 0.8246876183263915, "grad_norm": 11.753214836120605, "learning_rate": 1.5007993788622977e-05, "loss": 1.2507, "step": 2178 }, { "epoch": 0.8250662627792503, "grad_norm": 9.349088668823242, "learning_rate": 1.4944897979704531e-05, "loss": 1.216, "step": 2179 }, { "epoch": 0.8254449072321091, "grad_norm": 8.784585952758789, "learning_rate": 1.4881924369327261e-05, "loss": 0.7352, "step": 2180 }, { "epoch": 0.8258235516849678, "grad_norm": 11.812366485595703, "learning_rate": 1.4819073047965304e-05, "loss": 0.9288, "step": 2181 }, { "epoch": 0.8262021961378265, "grad_norm": 8.885299682617188, "learning_rate": 1.475634410591703e-05, "loss": 0.8133, "step": 2182 }, { "epoch": 0.8265808405906854, "grad_norm": 14.608964920043945, "learning_rate": 1.4693737633305038e-05, "loss": 1.073, "step": 2183 }, { "epoch": 0.8269594850435441, "grad_norm": 12.234383583068848, "learning_rate": 1.463125372007591e-05, "loss": 1.1562, "step": 2184 }, { "epoch": 0.8273381294964028, "grad_norm": 14.701723098754883, "learning_rate": 1.456889245600026e-05, "loss": 1.4302, "step": 2185 }, { "epoch": 0.8277167739492617, "grad_norm": 8.882140159606934, "learning_rate": 1.4506653930672387e-05, "loss": 0.6384, "step": 2186 }, { "epoch": 0.8280954184021204, "grad_norm": 9.586640357971191, "learning_rate": 1.4444538233510296e-05, "loss": 0.545, "step": 2187 }, { "epoch": 0.8284740628549792, "grad_norm": 8.397727012634277, "learning_rate": 1.4382545453755524e-05, "loss": 0.4565, "step": 2188 }, { "epoch": 0.8288527073078379, "grad_norm": 13.309687614440918, "learning_rate": 1.4320675680472995e-05, "loss": 0.9385, "step": 2189 }, { "epoch": 0.8292313517606967, "grad_norm": 12.719165802001953, "learning_rate": 1.4258929002550925e-05, "loss": 0.8029, "step": 2190 }, { "epoch": 0.8296099962135555, "grad_norm": 11.410460472106934, "learning_rate": 1.4197305508700665e-05, "loss": 0.618, "step": 2191 }, { "epoch": 0.8299886406664142, "grad_norm": 11.324352264404297, "learning_rate": 1.4135805287456638e-05, "loss": 0.543, "step": 2192 }, { "epoch": 0.830367285119273, "grad_norm": 11.747859954833984, "learning_rate": 1.407442842717609e-05, "loss": 0.575, "step": 2193 }, { "epoch": 0.8307459295721318, "grad_norm": 9.637734413146973, "learning_rate": 1.4013175016039082e-05, "loss": 0.3842, "step": 2194 }, { "epoch": 0.8311245740249905, "grad_norm": 10.062605857849121, "learning_rate": 1.3952045142048287e-05, "loss": 0.398, "step": 2195 }, { "epoch": 0.8315032184778492, "grad_norm": 9.133951187133789, "learning_rate": 1.3891038893028897e-05, "loss": 0.3874, "step": 2196 }, { "epoch": 0.8318818629307081, "grad_norm": 11.680546760559082, "learning_rate": 1.3830156356628531e-05, "loss": 0.5123, "step": 2197 }, { "epoch": 0.8322605073835668, "grad_norm": 9.764351844787598, "learning_rate": 1.3769397620317038e-05, "loss": 0.5171, "step": 2198 }, { "epoch": 0.8326391518364256, "grad_norm": 14.688189506530762, "learning_rate": 1.3708762771386386e-05, "loss": 0.2875, "step": 2199 }, { "epoch": 0.8330177962892844, "grad_norm": 25.08920669555664, "learning_rate": 1.364825189695056e-05, "loss": 0.6495, "step": 2200 }, { "epoch": 0.8333964407421431, "grad_norm": 9.983552932739258, "learning_rate": 1.3587865083945483e-05, "loss": 1.9564, "step": 2201 }, { "epoch": 0.8337750851950019, "grad_norm": 10.196476936340332, "learning_rate": 1.3527602419128793e-05, "loss": 1.8545, "step": 2202 }, { "epoch": 0.8341537296478606, "grad_norm": 10.242213249206543, "learning_rate": 1.3467463989079764e-05, "loss": 1.3327, "step": 2203 }, { "epoch": 0.8345323741007195, "grad_norm": 11.082919120788574, "learning_rate": 1.3407449880199175e-05, "loss": 1.2277, "step": 2204 }, { "epoch": 0.8349110185535782, "grad_norm": 14.372567176818848, "learning_rate": 1.334756017870924e-05, "loss": 1.2489, "step": 2205 }, { "epoch": 0.8352896630064369, "grad_norm": 9.300726890563965, "learning_rate": 1.328779497065339e-05, "loss": 0.6838, "step": 2206 }, { "epoch": 0.8356683074592958, "grad_norm": 10.5520658493042, "learning_rate": 1.3228154341896225e-05, "loss": 0.9467, "step": 2207 }, { "epoch": 0.8360469519121545, "grad_norm": 11.126348495483398, "learning_rate": 1.316863837812331e-05, "loss": 0.9089, "step": 2208 }, { "epoch": 0.8364255963650132, "grad_norm": 14.790655136108398, "learning_rate": 1.3109247164841199e-05, "loss": 0.6592, "step": 2209 }, { "epoch": 0.836804240817872, "grad_norm": 10.294766426086426, "learning_rate": 1.3049980787377126e-05, "loss": 0.5883, "step": 2210 }, { "epoch": 0.8371828852707308, "grad_norm": 14.918768882751465, "learning_rate": 1.2990839330879024e-05, "loss": 0.8069, "step": 2211 }, { "epoch": 0.8375615297235895, "grad_norm": 11.903068542480469, "learning_rate": 1.2931822880315303e-05, "loss": 1.0219, "step": 2212 }, { "epoch": 0.8379401741764483, "grad_norm": 11.20407772064209, "learning_rate": 1.2872931520474873e-05, "loss": 0.5456, "step": 2213 }, { "epoch": 0.8383188186293071, "grad_norm": 11.456269264221191, "learning_rate": 1.281416533596682e-05, "loss": 0.5934, "step": 2214 }, { "epoch": 0.8386974630821659, "grad_norm": 13.103646278381348, "learning_rate": 1.2755524411220455e-05, "loss": 1.0718, "step": 2215 }, { "epoch": 0.8390761075350246, "grad_norm": 11.291081428527832, "learning_rate": 1.269700883048508e-05, "loss": 0.7964, "step": 2216 }, { "epoch": 0.8394547519878833, "grad_norm": 9.637993812561035, "learning_rate": 1.2638618677829983e-05, "loss": 0.5809, "step": 2217 }, { "epoch": 0.8398333964407422, "grad_norm": 17.41292953491211, "learning_rate": 1.2580354037144194e-05, "loss": 0.8842, "step": 2218 }, { "epoch": 0.8402120408936009, "grad_norm": 7.992457866668701, "learning_rate": 1.2522214992136449e-05, "loss": 0.3386, "step": 2219 }, { "epoch": 0.8405906853464596, "grad_norm": 8.05163288116455, "learning_rate": 1.2464201626334982e-05, "loss": 0.467, "step": 2220 }, { "epoch": 0.8409693297993185, "grad_norm": 7.449150562286377, "learning_rate": 1.2406314023087584e-05, "loss": 0.3405, "step": 2221 }, { "epoch": 0.8413479742521772, "grad_norm": 8.64903736114502, "learning_rate": 1.2348552265561242e-05, "loss": 0.3131, "step": 2222 }, { "epoch": 0.841726618705036, "grad_norm": 11.01096248626709, "learning_rate": 1.2290916436742205e-05, "loss": 0.377, "step": 2223 }, { "epoch": 0.8421052631578947, "grad_norm": 9.92861270904541, "learning_rate": 1.223340661943576e-05, "loss": 0.2074, "step": 2224 }, { "epoch": 0.8424839076107535, "grad_norm": 8.516009330749512, "learning_rate": 1.2176022896266214e-05, "loss": 0.1966, "step": 2225 }, { "epoch": 0.8428625520636123, "grad_norm": 13.25462818145752, "learning_rate": 1.2118765349676664e-05, "loss": 2.6535, "step": 2226 }, { "epoch": 0.843241196516471, "grad_norm": 12.067037582397461, "learning_rate": 1.206163406192895e-05, "loss": 1.7434, "step": 2227 }, { "epoch": 0.8436198409693298, "grad_norm": 11.304587364196777, "learning_rate": 1.2004629115103471e-05, "loss": 1.6441, "step": 2228 }, { "epoch": 0.8439984854221886, "grad_norm": 9.689388275146484, "learning_rate": 1.1947750591099206e-05, "loss": 0.9401, "step": 2229 }, { "epoch": 0.8443771298750473, "grad_norm": 13.32154655456543, "learning_rate": 1.1890998571633427e-05, "loss": 1.0714, "step": 2230 }, { "epoch": 0.844755774327906, "grad_norm": 11.89297103881836, "learning_rate": 1.1834373138241672e-05, "loss": 1.1308, "step": 2231 }, { "epoch": 0.8451344187807649, "grad_norm": 11.245408058166504, "learning_rate": 1.1777874372277597e-05, "loss": 0.7964, "step": 2232 }, { "epoch": 0.8455130632336236, "grad_norm": 8.075695991516113, "learning_rate": 1.1721502354912939e-05, "loss": 0.5496, "step": 2233 }, { "epoch": 0.8458917076864824, "grad_norm": 13.818008422851562, "learning_rate": 1.1665257167137289e-05, "loss": 0.8236, "step": 2234 }, { "epoch": 0.8462703521393412, "grad_norm": 10.466621398925781, "learning_rate": 1.1609138889757998e-05, "loss": 0.7633, "step": 2235 }, { "epoch": 0.8466489965921999, "grad_norm": 10.655759811401367, "learning_rate": 1.1553147603400139e-05, "loss": 0.7656, "step": 2236 }, { "epoch": 0.8470276410450587, "grad_norm": 11.960763931274414, "learning_rate": 1.1497283388506285e-05, "loss": 0.7411, "step": 2237 }, { "epoch": 0.8474062854979174, "grad_norm": 10.754295349121094, "learning_rate": 1.1441546325336505e-05, "loss": 0.6966, "step": 2238 }, { "epoch": 0.8477849299507763, "grad_norm": 10.447911262512207, "learning_rate": 1.1385936493968108e-05, "loss": 0.5975, "step": 2239 }, { "epoch": 0.848163574403635, "grad_norm": 7.784918308258057, "learning_rate": 1.1330453974295708e-05, "loss": 0.5167, "step": 2240 }, { "epoch": 0.8485422188564937, "grad_norm": 7.070992946624756, "learning_rate": 1.127509884603095e-05, "loss": 0.4106, "step": 2241 }, { "epoch": 0.8489208633093526, "grad_norm": 10.769533157348633, "learning_rate": 1.1219871188702447e-05, "loss": 0.6198, "step": 2242 }, { "epoch": 0.8492995077622113, "grad_norm": 11.375531196594238, "learning_rate": 1.1164771081655712e-05, "loss": 0.4514, "step": 2243 }, { "epoch": 0.84967815221507, "grad_norm": 10.334220886230469, "learning_rate": 1.1109798604052957e-05, "loss": 0.239, "step": 2244 }, { "epoch": 0.8500567966679288, "grad_norm": 5.8680315017700195, "learning_rate": 1.1054953834873095e-05, "loss": 0.1725, "step": 2245 }, { "epoch": 0.8504354411207876, "grad_norm": 11.254388809204102, "learning_rate": 1.1000236852911527e-05, "loss": 0.4195, "step": 2246 }, { "epoch": 0.8508140855736464, "grad_norm": 10.101336479187012, "learning_rate": 1.0945647736780052e-05, "loss": 0.3015, "step": 2247 }, { "epoch": 0.8511927300265051, "grad_norm": 10.285369873046875, "learning_rate": 1.0891186564906742e-05, "loss": 0.2216, "step": 2248 }, { "epoch": 0.8515713744793639, "grad_norm": 10.76842212677002, "learning_rate": 1.083685341553593e-05, "loss": 0.2561, "step": 2249 }, { "epoch": 0.8519500189322227, "grad_norm": 24.11894989013672, "learning_rate": 1.0782648366727965e-05, "loss": 0.5513, "step": 2250 }, { "epoch": 0.8523286633850814, "grad_norm": 11.82591438293457, "learning_rate": 1.072857149635914e-05, "loss": 2.4543, "step": 2251 }, { "epoch": 0.8527073078379401, "grad_norm": 9.538484573364258, "learning_rate": 1.067462288212162e-05, "loss": 1.1095, "step": 2252 }, { "epoch": 0.853085952290799, "grad_norm": 11.511687278747559, "learning_rate": 1.0620802601523316e-05, "loss": 1.5651, "step": 2253 }, { "epoch": 0.8534645967436577, "grad_norm": 11.207715034484863, "learning_rate": 1.0567110731887742e-05, "loss": 0.9555, "step": 2254 }, { "epoch": 0.8538432411965164, "grad_norm": 10.063400268554688, "learning_rate": 1.0513547350353936e-05, "loss": 1.0487, "step": 2255 }, { "epoch": 0.8542218856493753, "grad_norm": 11.091261863708496, "learning_rate": 1.0460112533876287e-05, "loss": 1.0456, "step": 2256 }, { "epoch": 0.854600530102234, "grad_norm": 11.866201400756836, "learning_rate": 1.0406806359224574e-05, "loss": 1.1276, "step": 2257 }, { "epoch": 0.8549791745550928, "grad_norm": 12.267317771911621, "learning_rate": 1.035362890298368e-05, "loss": 0.8773, "step": 2258 }, { "epoch": 0.8553578190079515, "grad_norm": 12.725680351257324, "learning_rate": 1.030058024155357e-05, "loss": 0.8035, "step": 2259 }, { "epoch": 0.8557364634608103, "grad_norm": 11.86599063873291, "learning_rate": 1.0247660451149166e-05, "loss": 0.9379, "step": 2260 }, { "epoch": 0.8561151079136691, "grad_norm": 13.769980430603027, "learning_rate": 1.0194869607800305e-05, "loss": 0.9552, "step": 2261 }, { "epoch": 0.8564937523665278, "grad_norm": 10.183456420898438, "learning_rate": 1.0142207787351465e-05, "loss": 0.7875, "step": 2262 }, { "epoch": 0.8568723968193867, "grad_norm": 13.380812644958496, "learning_rate": 1.0089675065461834e-05, "loss": 0.6164, "step": 2263 }, { "epoch": 0.8572510412722454, "grad_norm": 9.049885749816895, "learning_rate": 1.0037271517605063e-05, "loss": 0.64, "step": 2264 }, { "epoch": 0.8576296857251041, "grad_norm": 12.912697792053223, "learning_rate": 9.984997219069304e-06, "loss": 0.6789, "step": 2265 }, { "epoch": 0.8580083301779629, "grad_norm": 9.801331520080566, "learning_rate": 9.932852244956936e-06, "loss": 0.4367, "step": 2266 }, { "epoch": 0.8583869746308217, "grad_norm": 16.610326766967773, "learning_rate": 9.880836670184567e-06, "loss": 0.6217, "step": 2267 }, { "epoch": 0.8587656190836804, "grad_norm": 10.286986351013184, "learning_rate": 9.828950569482875e-06, "loss": 0.4884, "step": 2268 }, { "epoch": 0.8591442635365392, "grad_norm": 11.326078414916992, "learning_rate": 9.777194017396595e-06, "loss": 0.7175, "step": 2269 }, { "epoch": 0.859522907989398, "grad_norm": 11.866266250610352, "learning_rate": 9.72556708828427e-06, "loss": 0.5466, "step": 2270 }, { "epoch": 0.8599015524422567, "grad_norm": 13.79624080657959, "learning_rate": 9.674069856318224e-06, "loss": 0.5976, "step": 2271 }, { "epoch": 0.8602801968951155, "grad_norm": 8.457572937011719, "learning_rate": 9.622702395484451e-06, "loss": 0.2856, "step": 2272 }, { "epoch": 0.8606588413479742, "grad_norm": 17.572168350219727, "learning_rate": 9.571464779582529e-06, "loss": 0.2946, "step": 2273 }, { "epoch": 0.8610374858008331, "grad_norm": 16.828079223632812, "learning_rate": 9.52035708222545e-06, "loss": 0.7045, "step": 2274 }, { "epoch": 0.8614161302536918, "grad_norm": 19.846851348876953, "learning_rate": 9.469379376839582e-06, "loss": 0.1168, "step": 2275 }, { "epoch": 0.8617947747065505, "grad_norm": 10.008796691894531, "learning_rate": 9.418531736664483e-06, "loss": 2.1273, "step": 2276 }, { "epoch": 0.8621734191594093, "grad_norm": 10.289381980895996, "learning_rate": 9.367814234752937e-06, "loss": 1.7055, "step": 2277 }, { "epoch": 0.8625520636122681, "grad_norm": 11.30695629119873, "learning_rate": 9.31722694397067e-06, "loss": 1.472, "step": 2278 }, { "epoch": 0.8629307080651268, "grad_norm": 8.154560089111328, "learning_rate": 9.266769936996389e-06, "loss": 0.9404, "step": 2279 }, { "epoch": 0.8633093525179856, "grad_norm": 10.423186302185059, "learning_rate": 9.216443286321586e-06, "loss": 1.3779, "step": 2280 }, { "epoch": 0.8636879969708444, "grad_norm": 11.121103286743164, "learning_rate": 9.166247064250477e-06, "loss": 0.5472, "step": 2281 }, { "epoch": 0.8640666414237032, "grad_norm": 9.69433307647705, "learning_rate": 9.116181342899932e-06, "loss": 0.7709, "step": 2282 }, { "epoch": 0.8644452858765619, "grad_norm": 9.098278045654297, "learning_rate": 9.06624619419928e-06, "loss": 0.4906, "step": 2283 }, { "epoch": 0.8648239303294206, "grad_norm": 14.784154891967773, "learning_rate": 9.016441689890286e-06, "loss": 0.8121, "step": 2284 }, { "epoch": 0.8652025747822795, "grad_norm": 9.00545597076416, "learning_rate": 8.966767901527007e-06, "loss": 0.5983, "step": 2285 }, { "epoch": 0.8655812192351382, "grad_norm": 10.185830116271973, "learning_rate": 8.917224900475695e-06, "loss": 0.8618, "step": 2286 }, { "epoch": 0.8659598636879969, "grad_norm": 13.381327629089355, "learning_rate": 8.867812757914694e-06, "loss": 1.0504, "step": 2287 }, { "epoch": 0.8663385081408558, "grad_norm": 14.24845027923584, "learning_rate": 8.818531544834385e-06, "loss": 0.7165, "step": 2288 }, { "epoch": 0.8667171525937145, "grad_norm": 12.008301734924316, "learning_rate": 8.76938133203702e-06, "loss": 0.6329, "step": 2289 }, { "epoch": 0.8670957970465732, "grad_norm": 8.582221031188965, "learning_rate": 8.720362190136611e-06, "loss": 0.5233, "step": 2290 }, { "epoch": 0.867474441499432, "grad_norm": 14.074053764343262, "learning_rate": 8.671474189558903e-06, "loss": 0.8896, "step": 2291 }, { "epoch": 0.8678530859522908, "grad_norm": 8.265854835510254, "learning_rate": 8.622717400541192e-06, "loss": 0.423, "step": 2292 }, { "epoch": 0.8682317304051496, "grad_norm": 14.022979736328125, "learning_rate": 8.57409189313233e-06, "loss": 0.5092, "step": 2293 }, { "epoch": 0.8686103748580083, "grad_norm": 7.6086106300354, "learning_rate": 8.525597737192481e-06, "loss": 0.3162, "step": 2294 }, { "epoch": 0.8689890193108671, "grad_norm": 10.01663875579834, "learning_rate": 8.477235002393147e-06, "loss": 0.363, "step": 2295 }, { "epoch": 0.8693676637637259, "grad_norm": 10.2891206741333, "learning_rate": 8.429003758216959e-06, "loss": 0.4843, "step": 2296 }, { "epoch": 0.8697463082165846, "grad_norm": 10.89184284210205, "learning_rate": 8.380904073957729e-06, "loss": 0.6032, "step": 2297 }, { "epoch": 0.8701249526694433, "grad_norm": 11.962915420532227, "learning_rate": 8.332936018720171e-06, "loss": 0.4728, "step": 2298 }, { "epoch": 0.8705035971223022, "grad_norm": 11.58560562133789, "learning_rate": 8.285099661419926e-06, "loss": 0.4367, "step": 2299 }, { "epoch": 0.8708822415751609, "grad_norm": 18.291141510009766, "learning_rate": 8.237395070783404e-06, "loss": 0.651, "step": 2300 }, { "epoch": 0.8712608860280197, "grad_norm": 11.406949996948242, "learning_rate": 8.189822315347762e-06, "loss": 2.1475, "step": 2301 }, { "epoch": 0.8716395304808785, "grad_norm": 10.053242683410645, "learning_rate": 8.14238146346068e-06, "loss": 1.4275, "step": 2302 }, { "epoch": 0.8720181749337372, "grad_norm": 9.507181167602539, "learning_rate": 8.09507258328036e-06, "loss": 1.1072, "step": 2303 }, { "epoch": 0.872396819386596, "grad_norm": 10.980565071105957, "learning_rate": 8.04789574277538e-06, "loss": 1.0862, "step": 2304 }, { "epoch": 0.8727754638394547, "grad_norm": 10.014975547790527, "learning_rate": 8.000851009724696e-06, "loss": 0.8774, "step": 2305 }, { "epoch": 0.8731541082923135, "grad_norm": 10.634843826293945, "learning_rate": 7.95393845171737e-06, "loss": 0.7953, "step": 2306 }, { "epoch": 0.8735327527451723, "grad_norm": 12.149109840393066, "learning_rate": 7.907158136152604e-06, "loss": 0.8086, "step": 2307 }, { "epoch": 0.873911397198031, "grad_norm": 9.334737777709961, "learning_rate": 7.860510130239607e-06, "loss": 0.5934, "step": 2308 }, { "epoch": 0.8742900416508899, "grad_norm": 13.120880126953125, "learning_rate": 7.813994500997524e-06, "loss": 1.0779, "step": 2309 }, { "epoch": 0.8746686861037486, "grad_norm": 10.691913604736328, "learning_rate": 7.767611315255275e-06, "loss": 0.9073, "step": 2310 }, { "epoch": 0.8750473305566073, "grad_norm": 10.472341537475586, "learning_rate": 7.72136063965152e-06, "loss": 0.4952, "step": 2311 }, { "epoch": 0.8754259750094661, "grad_norm": 10.533929824829102, "learning_rate": 7.67524254063452e-06, "loss": 0.7114, "step": 2312 }, { "epoch": 0.8758046194623249, "grad_norm": 6.5337042808532715, "learning_rate": 7.6292570844621045e-06, "loss": 0.3483, "step": 2313 }, { "epoch": 0.8761832639151836, "grad_norm": 8.839234352111816, "learning_rate": 7.583404337201516e-06, "loss": 0.5599, "step": 2314 }, { "epoch": 0.8765619083680424, "grad_norm": 10.337873458862305, "learning_rate": 7.5376843647293024e-06, "loss": 0.5017, "step": 2315 }, { "epoch": 0.8769405528209012, "grad_norm": 8.032730102539062, "learning_rate": 7.4920972327312875e-06, "loss": 0.4731, "step": 2316 }, { "epoch": 0.87731919727376, "grad_norm": 8.474766731262207, "learning_rate": 7.446643006702469e-06, "loss": 0.3606, "step": 2317 }, { "epoch": 0.8776978417266187, "grad_norm": 10.331032752990723, "learning_rate": 7.4013217519468325e-06, "loss": 0.21, "step": 2318 }, { "epoch": 0.8780764861794774, "grad_norm": 15.5275297164917, "learning_rate": 7.356133533577369e-06, "loss": 0.4189, "step": 2319 }, { "epoch": 0.8784551306323363, "grad_norm": 8.125926971435547, "learning_rate": 7.311078416515926e-06, "loss": 0.1247, "step": 2320 }, { "epoch": 0.878833775085195, "grad_norm": 10.590972900390625, "learning_rate": 7.266156465493124e-06, "loss": 0.3929, "step": 2321 }, { "epoch": 0.8792124195380537, "grad_norm": 6.178790092468262, "learning_rate": 7.221367745048279e-06, "loss": 0.2636, "step": 2322 }, { "epoch": 0.8795910639909126, "grad_norm": 16.32522201538086, "learning_rate": 7.1767123195292666e-06, "loss": 0.4226, "step": 2323 }, { "epoch": 0.8799697084437713, "grad_norm": 7.7940850257873535, "learning_rate": 7.132190253092452e-06, "loss": 0.1687, "step": 2324 }, { "epoch": 0.88034835289663, "grad_norm": 10.466033935546875, "learning_rate": 7.08780160970266e-06, "loss": 0.4764, "step": 2325 }, { "epoch": 0.8807269973494888, "grad_norm": 12.871282577514648, "learning_rate": 7.043546453132977e-06, "loss": 2.4303, "step": 2326 }, { "epoch": 0.8811056418023476, "grad_norm": 11.891122817993164, "learning_rate": 6.99942484696472e-06, "loss": 1.6218, "step": 2327 }, { "epoch": 0.8814842862552064, "grad_norm": 8.916067123413086, "learning_rate": 6.955436854587327e-06, "loss": 0.8128, "step": 2328 }, { "epoch": 0.8818629307080651, "grad_norm": 8.550481796264648, "learning_rate": 6.9115825391982806e-06, "loss": 0.8219, "step": 2329 }, { "epoch": 0.8822415751609239, "grad_norm": 10.76794719696045, "learning_rate": 6.867861963803035e-06, "loss": 0.7949, "step": 2330 }, { "epoch": 0.8826202196137827, "grad_norm": 8.747878074645996, "learning_rate": 6.824275191214868e-06, "loss": 0.7019, "step": 2331 }, { "epoch": 0.8829988640666414, "grad_norm": 11.711833953857422, "learning_rate": 6.780822284054833e-06, "loss": 0.7592, "step": 2332 }, { "epoch": 0.8833775085195001, "grad_norm": 13.456823348999023, "learning_rate": 6.7375033047516464e-06, "loss": 0.9945, "step": 2333 }, { "epoch": 0.883756152972359, "grad_norm": 8.14212703704834, "learning_rate": 6.694318315541637e-06, "loss": 0.6043, "step": 2334 }, { "epoch": 0.8841347974252177, "grad_norm": 8.493631362915039, "learning_rate": 6.651267378468584e-06, "loss": 0.5508, "step": 2335 }, { "epoch": 0.8845134418780765, "grad_norm": 10.33297061920166, "learning_rate": 6.608350555383758e-06, "loss": 0.6154, "step": 2336 }, { "epoch": 0.8848920863309353, "grad_norm": 10.296257019042969, "learning_rate": 6.565567907945658e-06, "loss": 0.6082, "step": 2337 }, { "epoch": 0.885270730783794, "grad_norm": 9.22463321685791, "learning_rate": 6.522919497620073e-06, "loss": 0.6438, "step": 2338 }, { "epoch": 0.8856493752366528, "grad_norm": 11.198843002319336, "learning_rate": 6.480405385679888e-06, "loss": 0.7403, "step": 2339 }, { "epoch": 0.8860280196895115, "grad_norm": 10.24124526977539, "learning_rate": 6.43802563320508e-06, "loss": 0.565, "step": 2340 }, { "epoch": 0.8864066641423703, "grad_norm": 11.018882751464844, "learning_rate": 6.395780301082577e-06, "loss": 0.5413, "step": 2341 }, { "epoch": 0.8867853085952291, "grad_norm": 10.295462608337402, "learning_rate": 6.353669450006194e-06, "loss": 0.4042, "step": 2342 }, { "epoch": 0.8871639530480878, "grad_norm": 13.353784561157227, "learning_rate": 6.3116931404765265e-06, "loss": 0.5982, "step": 2343 }, { "epoch": 0.8875425975009467, "grad_norm": 11.05883502960205, "learning_rate": 6.269851432800855e-06, "loss": 0.5592, "step": 2344 }, { "epoch": 0.8879212419538054, "grad_norm": 14.049494743347168, "learning_rate": 6.228144387093127e-06, "loss": 0.4035, "step": 2345 }, { "epoch": 0.8882998864066641, "grad_norm": 12.806116104125977, "learning_rate": 6.1865720632737875e-06, "loss": 0.514, "step": 2346 }, { "epoch": 0.8886785308595229, "grad_norm": 9.267263412475586, "learning_rate": 6.145134521069729e-06, "loss": 0.5728, "step": 2347 }, { "epoch": 0.8890571753123817, "grad_norm": 9.22109603881836, "learning_rate": 6.103831820014194e-06, "loss": 0.288, "step": 2348 }, { "epoch": 0.8894358197652404, "grad_norm": 11.868034362792969, "learning_rate": 6.062664019446751e-06, "loss": 0.2524, "step": 2349 }, { "epoch": 0.8898144642180992, "grad_norm": 9.9324951171875, "learning_rate": 6.021631178513087e-06, "loss": 0.4626, "step": 2350 }, { "epoch": 0.890193108670958, "grad_norm": 11.608717918395996, "learning_rate": 5.9807333561650355e-06, "loss": 1.7653, "step": 2351 }, { "epoch": 0.8905717531238168, "grad_norm": 10.697403907775879, "learning_rate": 5.939970611160428e-06, "loss": 1.5739, "step": 2352 }, { "epoch": 0.8909503975766755, "grad_norm": 10.519599914550781, "learning_rate": 5.899343002063063e-06, "loss": 1.4995, "step": 2353 }, { "epoch": 0.8913290420295342, "grad_norm": 12.994054794311523, "learning_rate": 5.858850587242559e-06, "loss": 1.0682, "step": 2354 }, { "epoch": 0.8917076864823931, "grad_norm": 12.894742012023926, "learning_rate": 5.818493424874294e-06, "loss": 1.2836, "step": 2355 }, { "epoch": 0.8920863309352518, "grad_norm": 12.090742111206055, "learning_rate": 5.778271572939354e-06, "loss": 1.4098, "step": 2356 }, { "epoch": 0.8924649753881105, "grad_norm": 8.5830717086792, "learning_rate": 5.738185089224424e-06, "loss": 0.6956, "step": 2357 }, { "epoch": 0.8928436198409694, "grad_norm": 9.2327880859375, "learning_rate": 5.698234031321692e-06, "loss": 0.4974, "step": 2358 }, { "epoch": 0.8932222642938281, "grad_norm": 12.713973999023438, "learning_rate": 5.658418456628778e-06, "loss": 0.6973, "step": 2359 }, { "epoch": 0.8936009087466869, "grad_norm": 8.313766479492188, "learning_rate": 5.618738422348646e-06, "loss": 0.4688, "step": 2360 }, { "epoch": 0.8939795531995456, "grad_norm": 11.878710746765137, "learning_rate": 5.579193985489584e-06, "loss": 0.6091, "step": 2361 }, { "epoch": 0.8943581976524044, "grad_norm": 9.968457221984863, "learning_rate": 5.5397852028649996e-06, "loss": 0.7507, "step": 2362 }, { "epoch": 0.8947368421052632, "grad_norm": 9.140604972839355, "learning_rate": 5.500512131093438e-06, "loss": 0.4721, "step": 2363 }, { "epoch": 0.8951154865581219, "grad_norm": 9.671671867370605, "learning_rate": 5.461374826598453e-06, "loss": 0.5719, "step": 2364 }, { "epoch": 0.8954941310109807, "grad_norm": 14.937457084655762, "learning_rate": 5.422373345608578e-06, "loss": 0.5815, "step": 2365 }, { "epoch": 0.8958727754638395, "grad_norm": 12.879511833190918, "learning_rate": 5.383507744157179e-06, "loss": 0.5853, "step": 2366 }, { "epoch": 0.8962514199166982, "grad_norm": 10.898216247558594, "learning_rate": 5.344778078082391e-06, "loss": 0.4625, "step": 2367 }, { "epoch": 0.896630064369557, "grad_norm": 14.941044807434082, "learning_rate": 5.306184403027059e-06, "loss": 0.6827, "step": 2368 }, { "epoch": 0.8970087088224158, "grad_norm": 14.840867042541504, "learning_rate": 5.267726774438697e-06, "loss": 0.5103, "step": 2369 }, { "epoch": 0.8973873532752745, "grad_norm": 4.7114577293396, "learning_rate": 5.229405247569308e-06, "loss": 0.1327, "step": 2370 }, { "epoch": 0.8977659977281333, "grad_norm": 9.676827430725098, "learning_rate": 5.191219877475373e-06, "loss": 0.4447, "step": 2371 }, { "epoch": 0.8981446421809921, "grad_norm": 7.863070964813232, "learning_rate": 5.153170719017741e-06, "loss": 0.2609, "step": 2372 }, { "epoch": 0.8985232866338508, "grad_norm": 8.543060302734375, "learning_rate": 5.115257826861619e-06, "loss": 0.2568, "step": 2373 }, { "epoch": 0.8989019310867096, "grad_norm": 11.265084266662598, "learning_rate": 5.077481255476368e-06, "loss": 0.2466, "step": 2374 }, { "epoch": 0.8992805755395683, "grad_norm": 20.728683471679688, "learning_rate": 5.039841059135553e-06, "loss": 1.1415, "step": 2375 }, { "epoch": 0.8996592199924272, "grad_norm": 10.630395889282227, "learning_rate": 5.002337291916792e-06, "loss": 1.858, "step": 2376 }, { "epoch": 0.9000378644452859, "grad_norm": 10.02651596069336, "learning_rate": 4.9649700077016635e-06, "loss": 1.4943, "step": 2377 }, { "epoch": 0.9004165088981446, "grad_norm": 11.16691780090332, "learning_rate": 4.927739260175735e-06, "loss": 1.3521, "step": 2378 }, { "epoch": 0.9007951533510034, "grad_norm": 10.301544189453125, "learning_rate": 4.8906451028283285e-06, "loss": 0.8017, "step": 2379 }, { "epoch": 0.9011737978038622, "grad_norm": 10.982889175415039, "learning_rate": 4.853687588952594e-06, "loss": 0.7324, "step": 2380 }, { "epoch": 0.9015524422567209, "grad_norm": 10.89743423461914, "learning_rate": 4.816866771645323e-06, "loss": 0.9049, "step": 2381 }, { "epoch": 0.9019310867095797, "grad_norm": 10.184319496154785, "learning_rate": 4.7801827038069234e-06, "loss": 0.7181, "step": 2382 }, { "epoch": 0.9023097311624385, "grad_norm": 7.678170680999756, "learning_rate": 4.7436354381413476e-06, "loss": 0.4933, "step": 2383 }, { "epoch": 0.9026883756152972, "grad_norm": 8.026887893676758, "learning_rate": 4.707225027156015e-06, "loss": 0.5746, "step": 2384 }, { "epoch": 0.903067020068156, "grad_norm": 11.845023155212402, "learning_rate": 4.670951523161693e-06, "loss": 0.537, "step": 2385 }, { "epoch": 0.9034456645210147, "grad_norm": 12.805367469787598, "learning_rate": 4.634814978272473e-06, "loss": 0.6111, "step": 2386 }, { "epoch": 0.9038243089738736, "grad_norm": 10.827343940734863, "learning_rate": 4.598815444405691e-06, "loss": 0.7474, "step": 2387 }, { "epoch": 0.9042029534267323, "grad_norm": 12.786199569702148, "learning_rate": 4.5629529732817864e-06, "loss": 0.8605, "step": 2388 }, { "epoch": 0.904581597879591, "grad_norm": 7.475493907928467, "learning_rate": 4.527227616424368e-06, "loss": 0.3846, "step": 2389 }, { "epoch": 0.9049602423324499, "grad_norm": 16.920345306396484, "learning_rate": 4.491639425159988e-06, "loss": 0.8434, "step": 2390 }, { "epoch": 0.9053388867853086, "grad_norm": 13.1689453125, "learning_rate": 4.4561884506181266e-06, "loss": 0.5591, "step": 2391 }, { "epoch": 0.9057175312381673, "grad_norm": 13.537805557250977, "learning_rate": 4.420874743731163e-06, "loss": 0.6376, "step": 2392 }, { "epoch": 0.9060961756910261, "grad_norm": 12.794946670532227, "learning_rate": 4.385698355234258e-06, "loss": 0.6448, "step": 2393 }, { "epoch": 0.9064748201438849, "grad_norm": 11.212878227233887, "learning_rate": 4.350659335665275e-06, "loss": 0.2639, "step": 2394 }, { "epoch": 0.9068534645967437, "grad_norm": 7.262661933898926, "learning_rate": 4.315757735364712e-06, "loss": 0.3209, "step": 2395 }, { "epoch": 0.9072321090496024, "grad_norm": 8.17447566986084, "learning_rate": 4.280993604475636e-06, "loss": 0.3456, "step": 2396 }, { "epoch": 0.9076107535024612, "grad_norm": 7.121233940124512, "learning_rate": 4.246366992943662e-06, "loss": 0.2374, "step": 2397 }, { "epoch": 0.90798939795532, "grad_norm": 14.229004859924316, "learning_rate": 4.211877950516763e-06, "loss": 0.2939, "step": 2398 }, { "epoch": 0.9083680424081787, "grad_norm": 55.41293716430664, "learning_rate": 4.177526526745301e-06, "loss": 0.5769, "step": 2399 }, { "epoch": 0.9087466868610374, "grad_norm": 10.350135803222656, "learning_rate": 4.143312770981911e-06, "loss": 0.3735, "step": 2400 }, { "epoch": 0.9091253313138963, "grad_norm": 9.91360092163086, "learning_rate": 4.109236732381461e-06, "loss": 1.6225, "step": 2401 }, { "epoch": 0.909503975766755, "grad_norm": 8.888208389282227, "learning_rate": 4.075298459900933e-06, "loss": 1.2812, "step": 2402 }, { "epoch": 0.9098826202196137, "grad_norm": 14.481423377990723, "learning_rate": 4.0414980022994045e-06, "loss": 1.5647, "step": 2403 }, { "epoch": 0.9102612646724726, "grad_norm": 9.676767349243164, "learning_rate": 4.007835408137928e-06, "loss": 0.696, "step": 2404 }, { "epoch": 0.9106399091253313, "grad_norm": 8.741473197937012, "learning_rate": 3.974310725779518e-06, "loss": 0.7849, "step": 2405 }, { "epoch": 0.9110185535781901, "grad_norm": 14.236115455627441, "learning_rate": 3.940924003389046e-06, "loss": 0.877, "step": 2406 }, { "epoch": 0.9113971980310488, "grad_norm": 10.6664400100708, "learning_rate": 3.907675288933144e-06, "loss": 0.6007, "step": 2407 }, { "epoch": 0.9117758424839076, "grad_norm": 9.50271987915039, "learning_rate": 3.874564630180188e-06, "loss": 0.7913, "step": 2408 }, { "epoch": 0.9121544869367664, "grad_norm": 11.143498420715332, "learning_rate": 3.84159207470024e-06, "loss": 0.7398, "step": 2409 }, { "epoch": 0.9125331313896251, "grad_norm": 9.227463722229004, "learning_rate": 3.808757669864904e-06, "loss": 0.7372, "step": 2410 }, { "epoch": 0.912911775842484, "grad_norm": 12.129105567932129, "learning_rate": 3.7760614628473357e-06, "loss": 0.8929, "step": 2411 }, { "epoch": 0.9132904202953427, "grad_norm": 10.593716621398926, "learning_rate": 3.743503500622103e-06, "loss": 0.7059, "step": 2412 }, { "epoch": 0.9136690647482014, "grad_norm": 13.297348976135254, "learning_rate": 3.711083829965212e-06, "loss": 0.5883, "step": 2413 }, { "epoch": 0.9140477092010602, "grad_norm": 8.455639839172363, "learning_rate": 3.678802497453948e-06, "loss": 0.4366, "step": 2414 }, { "epoch": 0.914426353653919, "grad_norm": 8.234000205993652, "learning_rate": 3.6466595494668353e-06, "loss": 0.363, "step": 2415 }, { "epoch": 0.9148049981067777, "grad_norm": 13.38591194152832, "learning_rate": 3.6146550321836116e-06, "loss": 0.5627, "step": 2416 }, { "epoch": 0.9151836425596365, "grad_norm": 12.83345890045166, "learning_rate": 3.58278899158514e-06, "loss": 0.6422, "step": 2417 }, { "epoch": 0.9155622870124953, "grad_norm": 13.449295997619629, "learning_rate": 3.5510614734532876e-06, "loss": 0.6531, "step": 2418 }, { "epoch": 0.915940931465354, "grad_norm": 10.407699584960938, "learning_rate": 3.519472523370948e-06, "loss": 0.6168, "step": 2419 }, { "epoch": 0.9163195759182128, "grad_norm": 17.99662208557129, "learning_rate": 3.4880221867219064e-06, "loss": 0.257, "step": 2420 }, { "epoch": 0.9166982203710715, "grad_norm": 7.612576484680176, "learning_rate": 3.45671050869083e-06, "loss": 0.2634, "step": 2421 }, { "epoch": 0.9170768648239304, "grad_norm": 6.6135358810424805, "learning_rate": 3.425537534263168e-06, "loss": 0.1463, "step": 2422 }, { "epoch": 0.9174555092767891, "grad_norm": 6.440648078918457, "learning_rate": 3.394503308225061e-06, "loss": 0.1608, "step": 2423 }, { "epoch": 0.9178341537296478, "grad_norm": 1.7489157915115356, "learning_rate": 3.363607875163366e-06, "loss": 0.047, "step": 2424 }, { "epoch": 0.9182127981825067, "grad_norm": 30.02290153503418, "learning_rate": 3.3328512794654652e-06, "loss": 0.9788, "step": 2425 }, { "epoch": 0.9185914426353654, "grad_norm": 11.564863204956055, "learning_rate": 3.302233565319357e-06, "loss": 2.0172, "step": 2426 }, { "epoch": 0.9189700870882241, "grad_norm": 9.934062957763672, "learning_rate": 3.2717547767134538e-06, "loss": 1.326, "step": 2427 }, { "epoch": 0.9193487315410829, "grad_norm": 10.045295715332031, "learning_rate": 3.2414149574365836e-06, "loss": 0.9745, "step": 2428 }, { "epoch": 0.9197273759939417, "grad_norm": 11.001640319824219, "learning_rate": 3.2112141510779127e-06, "loss": 1.2666, "step": 2429 }, { "epoch": 0.9201060204468005, "grad_norm": 10.749781608581543, "learning_rate": 3.18115240102691e-06, "loss": 1.0313, "step": 2430 }, { "epoch": 0.9204846648996592, "grad_norm": 11.977824211120605, "learning_rate": 3.151229750473239e-06, "loss": 0.8656, "step": 2431 }, { "epoch": 0.920863309352518, "grad_norm": 11.430237770080566, "learning_rate": 3.1214462424067335e-06, "loss": 1.0509, "step": 2432 }, { "epoch": 0.9212419538053768, "grad_norm": 10.933598518371582, "learning_rate": 3.0918019196173096e-06, "loss": 0.9562, "step": 2433 }, { "epoch": 0.9216205982582355, "grad_norm": 7.044180393218994, "learning_rate": 3.0622968246949213e-06, "loss": 0.5424, "step": 2434 }, { "epoch": 0.9219992427110942, "grad_norm": 14.46313762664795, "learning_rate": 3.0329310000295153e-06, "loss": 0.4784, "step": 2435 }, { "epoch": 0.9223778871639531, "grad_norm": 9.44637393951416, "learning_rate": 3.003704487810899e-06, "loss": 0.649, "step": 2436 }, { "epoch": 0.9227565316168118, "grad_norm": 8.902215003967285, "learning_rate": 2.9746173300287837e-06, "loss": 0.523, "step": 2437 }, { "epoch": 0.9231351760696705, "grad_norm": 8.846084594726562, "learning_rate": 2.945669568472631e-06, "loss": 0.5283, "step": 2438 }, { "epoch": 0.9235138205225294, "grad_norm": 9.392375946044922, "learning_rate": 2.916861244731661e-06, "loss": 0.5592, "step": 2439 }, { "epoch": 0.9238924649753881, "grad_norm": 8.410120964050293, "learning_rate": 2.888192400194745e-06, "loss": 0.4376, "step": 2440 }, { "epoch": 0.9242711094282469, "grad_norm": 9.047958374023438, "learning_rate": 2.8596630760503673e-06, "loss": 0.4578, "step": 2441 }, { "epoch": 0.9246497538811056, "grad_norm": 9.126653671264648, "learning_rate": 2.8312733132865754e-06, "loss": 0.423, "step": 2442 }, { "epoch": 0.9250283983339644, "grad_norm": 6.317657470703125, "learning_rate": 2.803023152690887e-06, "loss": 0.2639, "step": 2443 }, { "epoch": 0.9254070427868232, "grad_norm": 10.31212043762207, "learning_rate": 2.7749126348502684e-06, "loss": 0.3573, "step": 2444 }, { "epoch": 0.9257856872396819, "grad_norm": 15.265264511108398, "learning_rate": 2.7469418001510704e-06, "loss": 0.4853, "step": 2445 }, { "epoch": 0.9261643316925408, "grad_norm": 7.364819526672363, "learning_rate": 2.7191106887789473e-06, "loss": 0.3669, "step": 2446 }, { "epoch": 0.9265429761453995, "grad_norm": 9.149045944213867, "learning_rate": 2.6914193407188146e-06, "loss": 0.1694, "step": 2447 }, { "epoch": 0.9269216205982582, "grad_norm": 6.317681789398193, "learning_rate": 2.663867795754771e-06, "loss": 0.34, "step": 2448 }, { "epoch": 0.927300265051117, "grad_norm": 6.21943998336792, "learning_rate": 2.636456093470119e-06, "loss": 0.272, "step": 2449 }, { "epoch": 0.9276789095039758, "grad_norm": 19.55099868774414, "learning_rate": 2.6091842732472006e-06, "loss": 0.6493, "step": 2450 }, { "epoch": 0.9280575539568345, "grad_norm": 9.162104606628418, "learning_rate": 2.582052374267385e-06, "loss": 1.5711, "step": 2451 }, { "epoch": 0.9284361984096933, "grad_norm": 9.272793769836426, "learning_rate": 2.555060435511025e-06, "loss": 1.2648, "step": 2452 }, { "epoch": 0.9288148428625521, "grad_norm": 12.99858283996582, "learning_rate": 2.5282084957574226e-06, "loss": 1.4216, "step": 2453 }, { "epoch": 0.9291934873154108, "grad_norm": 11.940863609313965, "learning_rate": 2.5014965935847178e-06, "loss": 1.0989, "step": 2454 }, { "epoch": 0.9295721317682696, "grad_norm": 10.320348739624023, "learning_rate": 2.4749247673698573e-06, "loss": 0.743, "step": 2455 }, { "epoch": 0.9299507762211283, "grad_norm": 11.852913856506348, "learning_rate": 2.4484930552885365e-06, "loss": 0.9047, "step": 2456 }, { "epoch": 0.9303294206739872, "grad_norm": 8.977890014648438, "learning_rate": 2.4222014953151686e-06, "loss": 0.7247, "step": 2457 }, { "epoch": 0.9307080651268459, "grad_norm": 10.382013320922852, "learning_rate": 2.396050125222793e-06, "loss": 0.7898, "step": 2458 }, { "epoch": 0.9310867095797046, "grad_norm": 11.049487113952637, "learning_rate": 2.370038982583056e-06, "loss": 0.8068, "step": 2459 }, { "epoch": 0.9314653540325635, "grad_norm": 15.248514175415039, "learning_rate": 2.344168104766109e-06, "loss": 0.7894, "step": 2460 }, { "epoch": 0.9318439984854222, "grad_norm": 13.173613548278809, "learning_rate": 2.3184375289406202e-06, "loss": 1.3524, "step": 2461 }, { "epoch": 0.9322226429382809, "grad_norm": 9.832080841064453, "learning_rate": 2.2928472920736744e-06, "loss": 0.3662, "step": 2462 }, { "epoch": 0.9326012873911397, "grad_norm": 11.172598838806152, "learning_rate": 2.2673974309307066e-06, "loss": 0.5176, "step": 2463 }, { "epoch": 0.9329799318439985, "grad_norm": 8.801765441894531, "learning_rate": 2.2420879820755023e-06, "loss": 0.4289, "step": 2464 }, { "epoch": 0.9333585762968573, "grad_norm": 7.546141624450684, "learning_rate": 2.2169189818701307e-06, "loss": 0.4665, "step": 2465 }, { "epoch": 0.933737220749716, "grad_norm": 11.858968734741211, "learning_rate": 2.191890466474844e-06, "loss": 0.5132, "step": 2466 }, { "epoch": 0.9341158652025748, "grad_norm": 11.472759246826172, "learning_rate": 2.1670024718480675e-06, "loss": 0.3973, "step": 2467 }, { "epoch": 0.9344945096554336, "grad_norm": 11.55582332611084, "learning_rate": 2.1422550337463322e-06, "loss": 0.5675, "step": 2468 }, { "epoch": 0.9348731541082923, "grad_norm": 8.275731086730957, "learning_rate": 2.117648187724286e-06, "loss": 0.3216, "step": 2469 }, { "epoch": 0.935251798561151, "grad_norm": 17.31087875366211, "learning_rate": 2.0931819691345277e-06, "loss": 0.4359, "step": 2470 }, { "epoch": 0.9356304430140099, "grad_norm": 6.022243976593018, "learning_rate": 2.06885641312764e-06, "loss": 0.1946, "step": 2471 }, { "epoch": 0.9360090874668686, "grad_norm": 7.993716239929199, "learning_rate": 2.0446715546521112e-06, "loss": 0.2927, "step": 2472 }, { "epoch": 0.9363877319197274, "grad_norm": 5.820225238800049, "learning_rate": 2.0206274284542804e-06, "loss": 0.1208, "step": 2473 }, { "epoch": 0.9367663763725862, "grad_norm": 10.578958511352539, "learning_rate": 1.9967240690783262e-06, "loss": 0.2527, "step": 2474 }, { "epoch": 0.9371450208254449, "grad_norm": 6.542937755584717, "learning_rate": 1.972961510866178e-06, "loss": 0.3776, "step": 2475 }, { "epoch": 0.9375236652783037, "grad_norm": 9.507660865783691, "learning_rate": 1.9493397879574493e-06, "loss": 1.6838, "step": 2476 }, { "epoch": 0.9379023097311624, "grad_norm": 11.040675163269043, "learning_rate": 1.9258589342894485e-06, "loss": 1.5473, "step": 2477 }, { "epoch": 0.9382809541840212, "grad_norm": 9.63912296295166, "learning_rate": 1.902518983597068e-06, "loss": 1.1266, "step": 2478 }, { "epoch": 0.93865959863688, "grad_norm": 11.154900550842285, "learning_rate": 1.879319969412796e-06, "loss": 1.205, "step": 2479 }, { "epoch": 0.9390382430897387, "grad_norm": 11.759103775024414, "learning_rate": 1.8562619250666047e-06, "loss": 1.2244, "step": 2480 }, { "epoch": 0.9394168875425976, "grad_norm": 10.546182632446289, "learning_rate": 1.8333448836859723e-06, "loss": 0.8522, "step": 2481 }, { "epoch": 0.9397955319954563, "grad_norm": 11.785140991210938, "learning_rate": 1.810568878195773e-06, "loss": 0.8214, "step": 2482 }, { "epoch": 0.940174176448315, "grad_norm": 13.62626838684082, "learning_rate": 1.787933941318265e-06, "loss": 0.9527, "step": 2483 }, { "epoch": 0.9405528209011738, "grad_norm": 12.887011528015137, "learning_rate": 1.7654401055730129e-06, "loss": 1.0685, "step": 2484 }, { "epoch": 0.9409314653540326, "grad_norm": 9.904094696044922, "learning_rate": 1.7430874032768885e-06, "loss": 0.7109, "step": 2485 }, { "epoch": 0.9413101098068913, "grad_norm": 7.248521327972412, "learning_rate": 1.7208758665439917e-06, "loss": 0.3924, "step": 2486 }, { "epoch": 0.9416887542597501, "grad_norm": 9.920915603637695, "learning_rate": 1.6988055272855962e-06, "loss": 0.3695, "step": 2487 }, { "epoch": 0.9420673987126088, "grad_norm": 8.971296310424805, "learning_rate": 1.676876417210127e-06, "loss": 0.3631, "step": 2488 }, { "epoch": 0.9424460431654677, "grad_norm": 15.373361587524414, "learning_rate": 1.6550885678231042e-06, "loss": 0.9075, "step": 2489 }, { "epoch": 0.9428246876183264, "grad_norm": 10.004647254943848, "learning_rate": 1.6334420104271109e-06, "loss": 0.6522, "step": 2490 }, { "epoch": 0.9432033320711851, "grad_norm": 10.934609413146973, "learning_rate": 1.6119367761217142e-06, "loss": 0.4777, "step": 2491 }, { "epoch": 0.943581976524044, "grad_norm": 8.394415855407715, "learning_rate": 1.590572895803455e-06, "loss": 0.2631, "step": 2492 }, { "epoch": 0.9439606209769027, "grad_norm": 11.458495140075684, "learning_rate": 1.569350400165781e-06, "loss": 0.62, "step": 2493 }, { "epoch": 0.9443392654297614, "grad_norm": 15.056721687316895, "learning_rate": 1.548269319699036e-06, "loss": 0.4027, "step": 2494 }, { "epoch": 0.9447179098826202, "grad_norm": 12.485556602478027, "learning_rate": 1.5273296846903707e-06, "loss": 0.3424, "step": 2495 }, { "epoch": 0.945096554335479, "grad_norm": 10.782124519348145, "learning_rate": 1.50653152522372e-06, "loss": 0.4142, "step": 2496 }, { "epoch": 0.9454751987883377, "grad_norm": 7.636911869049072, "learning_rate": 1.4858748711797822e-06, "loss": 0.2571, "step": 2497 }, { "epoch": 0.9458538432411965, "grad_norm": 7.60603666305542, "learning_rate": 1.4653597522359396e-06, "loss": 0.2453, "step": 2498 }, { "epoch": 0.9462324876940553, "grad_norm": 10.03763198852539, "learning_rate": 1.444986197866227e-06, "loss": 0.3472, "step": 2499 }, { "epoch": 0.9466111321469141, "grad_norm": 3.0592634677886963, "learning_rate": 1.424754237341297e-06, "loss": 0.0667, "step": 2500 }, { "epoch": 0.9469897765997728, "grad_norm": 10.650063514709473, "learning_rate": 1.4046638997283978e-06, "loss": 2.1954, "step": 2501 }, { "epoch": 0.9473684210526315, "grad_norm": 8.665024757385254, "learning_rate": 1.3847152138912744e-06, "loss": 1.0272, "step": 2502 }, { "epoch": 0.9477470655054904, "grad_norm": 9.950023651123047, "learning_rate": 1.3649082084901676e-06, "loss": 1.3072, "step": 2503 }, { "epoch": 0.9481257099583491, "grad_norm": 9.413839340209961, "learning_rate": 1.345242911981781e-06, "loss": 0.8727, "step": 2504 }, { "epoch": 0.9485043544112078, "grad_norm": 10.411212921142578, "learning_rate": 1.3257193526192257e-06, "loss": 0.6985, "step": 2505 }, { "epoch": 0.9488829988640667, "grad_norm": 12.88664436340332, "learning_rate": 1.3063375584519532e-06, "loss": 0.6111, "step": 2506 }, { "epoch": 0.9492616433169254, "grad_norm": 9.880784034729004, "learning_rate": 1.2870975573257783e-06, "loss": 0.6718, "step": 2507 }, { "epoch": 0.9496402877697842, "grad_norm": 9.987343788146973, "learning_rate": 1.267999376882767e-06, "loss": 0.6956, "step": 2508 }, { "epoch": 0.9500189322226429, "grad_norm": 10.966734886169434, "learning_rate": 1.2490430445612488e-06, "loss": 0.7292, "step": 2509 }, { "epoch": 0.9503975766755017, "grad_norm": 13.379451751708984, "learning_rate": 1.230228587595772e-06, "loss": 0.527, "step": 2510 }, { "epoch": 0.9507762211283605, "grad_norm": 11.814391136169434, "learning_rate": 1.2115560330170362e-06, "loss": 0.6957, "step": 2511 }, { "epoch": 0.9511548655812192, "grad_norm": 9.619377136230469, "learning_rate": 1.1930254076518488e-06, "loss": 0.753, "step": 2512 }, { "epoch": 0.951533510034078, "grad_norm": 10.99007797241211, "learning_rate": 1.1746367381231582e-06, "loss": 0.463, "step": 2513 }, { "epoch": 0.9519121544869368, "grad_norm": 13.722533226013184, "learning_rate": 1.1563900508499425e-06, "loss": 0.6937, "step": 2514 }, { "epoch": 0.9522907989397955, "grad_norm": 11.671647071838379, "learning_rate": 1.1382853720471764e-06, "loss": 0.4535, "step": 2515 }, { "epoch": 0.9526694433926542, "grad_norm": 7.917880535125732, "learning_rate": 1.1203227277258198e-06, "loss": 0.4509, "step": 2516 }, { "epoch": 0.9530480878455131, "grad_norm": 12.8670072555542, "learning_rate": 1.1025021436927962e-06, "loss": 0.5246, "step": 2517 }, { "epoch": 0.9534267322983718, "grad_norm": 13.827978134155273, "learning_rate": 1.0848236455509031e-06, "loss": 0.5806, "step": 2518 }, { "epoch": 0.9538053767512306, "grad_norm": 12.70480728149414, "learning_rate": 1.0672872586988237e-06, "loss": 0.4097, "step": 2519 }, { "epoch": 0.9541840212040894, "grad_norm": 10.580239295959473, "learning_rate": 1.0498930083310376e-06, "loss": 0.4366, "step": 2520 }, { "epoch": 0.9545626656569481, "grad_norm": 8.138960838317871, "learning_rate": 1.032640919437844e-06, "loss": 0.2703, "step": 2521 }, { "epoch": 0.9549413101098069, "grad_norm": 6.230162620544434, "learning_rate": 1.0155310168053156e-06, "loss": 0.2412, "step": 2522 }, { "epoch": 0.9553199545626656, "grad_norm": 17.800689697265625, "learning_rate": 9.985633250152116e-07, "loss": 0.4268, "step": 2523 }, { "epoch": 0.9556985990155245, "grad_norm": 29.00356674194336, "learning_rate": 9.817378684449763e-07, "loss": 0.2153, "step": 2524 }, { "epoch": 0.9560772434683832, "grad_norm": 14.519379615783691, "learning_rate": 9.6505467126774e-07, "loss": 0.1794, "step": 2525 }, { "epoch": 0.9564558879212419, "grad_norm": 9.758588790893555, "learning_rate": 9.485137574522185e-07, "loss": 1.728, "step": 2526 }, { "epoch": 0.9568345323741008, "grad_norm": 10.424653053283691, "learning_rate": 9.321151507627135e-07, "loss": 1.3501, "step": 2527 }, { "epoch": 0.9572131768269595, "grad_norm": 10.270801544189453, "learning_rate": 9.158588747590902e-07, "loss": 1.3, "step": 2528 }, { "epoch": 0.9575918212798182, "grad_norm": 10.49842643737793, "learning_rate": 8.997449527966994e-07, "loss": 0.8599, "step": 2529 }, { "epoch": 0.957970465732677, "grad_norm": 8.8501615524292, "learning_rate": 8.837734080264116e-07, "loss": 0.7401, "step": 2530 }, { "epoch": 0.9583491101855358, "grad_norm": 10.654547691345215, "learning_rate": 8.679442633945156e-07, "loss": 0.8812, "step": 2531 }, { "epoch": 0.9587277546383945, "grad_norm": 10.941558837890625, "learning_rate": 8.522575416426981e-07, "loss": 1.1451, "step": 2532 }, { "epoch": 0.9591063990912533, "grad_norm": 11.180508613586426, "learning_rate": 8.367132653080867e-07, "loss": 0.774, "step": 2533 }, { "epoch": 0.9594850435441121, "grad_norm": 11.109345436096191, "learning_rate": 8.213114567230951e-07, "loss": 0.5891, "step": 2534 }, { "epoch": 0.9598636879969709, "grad_norm": 10.302473068237305, "learning_rate": 8.060521380154784e-07, "loss": 0.6217, "step": 2535 }, { "epoch": 0.9602423324498296, "grad_norm": 10.956271171569824, "learning_rate": 7.90935331108289e-07, "loss": 0.6269, "step": 2536 }, { "epoch": 0.9606209769026883, "grad_norm": 11.260363578796387, "learning_rate": 7.759610577198206e-07, "loss": 0.5793, "step": 2537 }, { "epoch": 0.9609996213555472, "grad_norm": 14.293161392211914, "learning_rate": 7.611293393635755e-07, "loss": 0.8875, "step": 2538 }, { "epoch": 0.9613782658084059, "grad_norm": 11.707453727722168, "learning_rate": 7.46440197348286e-07, "loss": 0.5792, "step": 2539 }, { "epoch": 0.9617569102612646, "grad_norm": 10.193665504455566, "learning_rate": 7.318936527777931e-07, "loss": 0.4881, "step": 2540 }, { "epoch": 0.9621355547141235, "grad_norm": 9.130789756774902, "learning_rate": 7.174897265511238e-07, "loss": 0.4617, "step": 2541 }, { "epoch": 0.9625141991669822, "grad_norm": 7.929288864135742, "learning_rate": 7.032284393623579e-07, "loss": 0.3009, "step": 2542 }, { "epoch": 0.962892843619841, "grad_norm": 6.23085355758667, "learning_rate": 6.891098117006833e-07, "loss": 0.2789, "step": 2543 }, { "epoch": 0.9632714880726997, "grad_norm": 16.519512176513672, "learning_rate": 6.751338638502858e-07, "loss": 0.558, "step": 2544 }, { "epoch": 0.9636501325255585, "grad_norm": 9.027918815612793, "learning_rate": 6.613006158904145e-07, "loss": 0.3583, "step": 2545 }, { "epoch": 0.9640287769784173, "grad_norm": 9.856430053710938, "learning_rate": 6.476100876952718e-07, "loss": 0.2975, "step": 2546 }, { "epoch": 0.964407421431276, "grad_norm": 10.270342826843262, "learning_rate": 6.340622989340128e-07, "loss": 0.4139, "step": 2547 }, { "epoch": 0.9647860658841348, "grad_norm": 10.692875862121582, "learning_rate": 6.206572690707125e-07, "loss": 0.2542, "step": 2548 }, { "epoch": 0.9651647103369936, "grad_norm": 36.20051574707031, "learning_rate": 6.073950173643873e-07, "loss": 0.6115, "step": 2549 }, { "epoch": 0.9655433547898523, "grad_norm": 26.375398635864258, "learning_rate": 5.942755628688845e-07, "loss": 0.4395, "step": 2550 }, { "epoch": 0.965921999242711, "grad_norm": 10.20322322845459, "learning_rate": 5.812989244328937e-07, "loss": 1.5327, "step": 2551 }, { "epoch": 0.9663006436955699, "grad_norm": 10.076367378234863, "learning_rate": 5.684651206999347e-07, "loss": 1.2404, "step": 2552 }, { "epoch": 0.9666792881484286, "grad_norm": 9.947564125061035, "learning_rate": 5.557741701083363e-07, "loss": 0.924, "step": 2553 }, { "epoch": 0.9670579326012874, "grad_norm": 10.444324493408203, "learning_rate": 5.432260908911358e-07, "loss": 1.1889, "step": 2554 }, { "epoch": 0.9674365770541462, "grad_norm": 9.716720581054688, "learning_rate": 5.308209010761678e-07, "loss": 0.7316, "step": 2555 }, { "epoch": 0.9678152215070049, "grad_norm": 14.292815208435059, "learning_rate": 5.185586184859426e-07, "loss": 0.8652, "step": 2556 }, { "epoch": 0.9681938659598637, "grad_norm": 12.14730453491211, "learning_rate": 5.064392607376567e-07, "loss": 0.7933, "step": 2557 }, { "epoch": 0.9685725104127224, "grad_norm": 9.1866455078125, "learning_rate": 4.94462845243171e-07, "loss": 0.501, "step": 2558 }, { "epoch": 0.9689511548655813, "grad_norm": 12.76934814453125, "learning_rate": 4.826293892089995e-07, "loss": 0.6006, "step": 2559 }, { "epoch": 0.96932979931844, "grad_norm": 13.298689842224121, "learning_rate": 4.709389096362427e-07, "loss": 0.654, "step": 2560 }, { "epoch": 0.9697084437712987, "grad_norm": 12.402892112731934, "learning_rate": 4.593914233205987e-07, "loss": 1.0524, "step": 2561 }, { "epoch": 0.9700870882241576, "grad_norm": 11.499163627624512, "learning_rate": 4.4798694685231903e-07, "loss": 0.5996, "step": 2562 }, { "epoch": 0.9704657326770163, "grad_norm": 8.201879501342773, "learning_rate": 4.367254966161971e-07, "loss": 0.5142, "step": 2563 }, { "epoch": 0.970844377129875, "grad_norm": 14.199493408203125, "learning_rate": 4.2560708879154645e-07, "loss": 0.5288, "step": 2564 }, { "epoch": 0.9712230215827338, "grad_norm": 9.493123054504395, "learning_rate": 4.1463173935216703e-07, "loss": 0.4889, "step": 2565 }, { "epoch": 0.9716016660355926, "grad_norm": 11.741094589233398, "learning_rate": 4.037994640663345e-07, "loss": 0.4841, "step": 2566 }, { "epoch": 0.9719803104884513, "grad_norm": 10.63720417022705, "learning_rate": 3.9311027849674444e-07, "loss": 0.4452, "step": 2567 }, { "epoch": 0.9723589549413101, "grad_norm": 10.38010025024414, "learning_rate": 3.8256419800055675e-07, "loss": 0.4737, "step": 2568 }, { "epoch": 0.9727375993941689, "grad_norm": 12.463581085205078, "learning_rate": 3.721612377292849e-07, "loss": 0.2774, "step": 2569 }, { "epoch": 0.9731162438470277, "grad_norm": 9.494747161865234, "learning_rate": 3.6190141262887333e-07, "loss": 0.2915, "step": 2570 }, { "epoch": 0.9734948882998864, "grad_norm": 11.594443321228027, "learning_rate": 3.517847374395755e-07, "loss": 0.2961, "step": 2571 }, { "epoch": 0.9738735327527451, "grad_norm": 7.852182388305664, "learning_rate": 3.418112266960205e-07, "loss": 0.2635, "step": 2572 }, { "epoch": 0.974252177205604, "grad_norm": 4.673264026641846, "learning_rate": 3.319808947271241e-07, "loss": 0.0947, "step": 2573 }, { "epoch": 0.9746308216584627, "grad_norm": 5.447509288787842, "learning_rate": 3.222937556561223e-07, "loss": 0.0963, "step": 2574 }, { "epoch": 0.9750094661113214, "grad_norm": 4.423956394195557, "learning_rate": 3.127498234005044e-07, "loss": 0.1488, "step": 2575 }, { "epoch": 0.9753881105641803, "grad_norm": 11.625571250915527, "learning_rate": 3.033491116720244e-07, "loss": 2.4858, "step": 2576 }, { "epoch": 0.975766755017039, "grad_norm": 10.204331398010254, "learning_rate": 2.940916339766675e-07, "loss": 1.637, "step": 2577 }, { "epoch": 0.9761453994698978, "grad_norm": 10.439549446105957, "learning_rate": 2.849774036146502e-07, "loss": 1.3357, "step": 2578 }, { "epoch": 0.9765240439227565, "grad_norm": 10.696296691894531, "learning_rate": 2.7600643368036473e-07, "loss": 1.1023, "step": 2579 }, { "epoch": 0.9769026883756153, "grad_norm": 12.158334732055664, "learning_rate": 2.6717873706240125e-07, "loss": 1.1039, "step": 2580 }, { "epoch": 0.9772813328284741, "grad_norm": 11.918547630310059, "learning_rate": 2.5849432644348136e-07, "loss": 0.8639, "step": 2581 }, { "epoch": 0.9776599772813328, "grad_norm": 9.715611457824707, "learning_rate": 2.4995321430050235e-07, "loss": 0.567, "step": 2582 }, { "epoch": 0.9780386217341916, "grad_norm": 11.445382118225098, "learning_rate": 2.415554129044595e-07, "loss": 0.9997, "step": 2583 }, { "epoch": 0.9784172661870504, "grad_norm": 10.35413932800293, "learning_rate": 2.333009343204573e-07, "loss": 0.7591, "step": 2584 }, { "epoch": 0.9787959106399091, "grad_norm": 7.9862165451049805, "learning_rate": 2.2518979040769827e-07, "loss": 0.3323, "step": 2585 }, { "epoch": 0.9791745550927679, "grad_norm": 12.614017486572266, "learning_rate": 2.1722199281944967e-07, "loss": 0.9701, "step": 2586 }, { "epoch": 0.9795531995456267, "grad_norm": 11.604450225830078, "learning_rate": 2.0939755300304342e-07, "loss": 0.8196, "step": 2587 }, { "epoch": 0.9799318439984854, "grad_norm": 12.22137451171875, "learning_rate": 2.0171648219982074e-07, "loss": 0.4567, "step": 2588 }, { "epoch": 0.9803104884513442, "grad_norm": 11.08056926727295, "learning_rate": 1.941787914451876e-07, "loss": 0.7081, "step": 2589 }, { "epoch": 0.9806891329042029, "grad_norm": 8.053107261657715, "learning_rate": 1.8678449156852573e-07, "loss": 0.5096, "step": 2590 }, { "epoch": 0.9810677773570617, "grad_norm": 10.370348930358887, "learning_rate": 1.7953359319320406e-07, "loss": 0.5069, "step": 2591 }, { "epoch": 0.9814464218099205, "grad_norm": 8.252673149108887, "learning_rate": 1.7242610673658954e-07, "loss": 0.2886, "step": 2592 }, { "epoch": 0.9818250662627792, "grad_norm": 14.678871154785156, "learning_rate": 1.6546204240999174e-07, "loss": 0.3913, "step": 2593 }, { "epoch": 0.9822037107156381, "grad_norm": 12.055886268615723, "learning_rate": 1.5864141021868506e-07, "loss": 0.4829, "step": 2594 }, { "epoch": 0.9825823551684968, "grad_norm": 7.955121040344238, "learning_rate": 1.5196421996184207e-07, "loss": 0.3313, "step": 2595 }, { "epoch": 0.9829609996213555, "grad_norm": 6.439930438995361, "learning_rate": 1.4543048123257796e-07, "loss": 0.1825, "step": 2596 }, { "epoch": 0.9833396440742143, "grad_norm": 5.3940582275390625, "learning_rate": 1.3904020341791724e-07, "loss": 0.1774, "step": 2597 }, { "epoch": 0.9837182885270731, "grad_norm": 23.00211524963379, "learning_rate": 1.3279339569874926e-07, "loss": 0.2979, "step": 2598 }, { "epoch": 0.9840969329799318, "grad_norm": 10.200654983520508, "learning_rate": 1.2669006704986164e-07, "loss": 0.3489, "step": 2599 }, { "epoch": 0.9844755774327906, "grad_norm": 12.444687843322754, "learning_rate": 1.2073022623988462e-07, "loss": 0.4883, "step": 2600 }, { "epoch": 0.9848542218856494, "grad_norm": 8.539031982421875, "learning_rate": 1.1491388183133556e-07, "loss": 1.2458, "step": 2601 }, { "epoch": 0.9852328663385082, "grad_norm": 12.276918411254883, "learning_rate": 1.092410421805301e-07, "loss": 1.1199, "step": 2602 }, { "epoch": 0.9856115107913669, "grad_norm": 9.444790840148926, "learning_rate": 1.0371171543763769e-07, "loss": 1.1647, "step": 2603 }, { "epoch": 0.9859901552442256, "grad_norm": 9.76896858215332, "learning_rate": 9.832590954662602e-08, "loss": 0.8912, "step": 2604 }, { "epoch": 0.9863687996970845, "grad_norm": 9.74208927154541, "learning_rate": 9.308363224528327e-08, "loss": 0.8606, "step": 2605 }, { "epoch": 0.9867474441499432, "grad_norm": 8.999712944030762, "learning_rate": 8.798489106517371e-08, "loss": 0.6109, "step": 2606 }, { "epoch": 0.9871260886028019, "grad_norm": 7.842771530151367, "learning_rate": 8.302969333165989e-08, "loss": 0.5287, "step": 2607 }, { "epoch": 0.9875047330556608, "grad_norm": 11.93226432800293, "learning_rate": 7.821804616384709e-08, "loss": 0.5248, "step": 2608 }, { "epoch": 0.9878833775085195, "grad_norm": 14.00164794921875, "learning_rate": 7.354995647465002e-08, "loss": 0.996, "step": 2609 }, { "epoch": 0.9882620219613782, "grad_norm": 9.637811660766602, "learning_rate": 6.90254309706928e-08, "loss": 0.5948, "step": 2610 }, { "epoch": 0.988640666414237, "grad_norm": 10.103828430175781, "learning_rate": 6.464447615235347e-08, "loss": 0.667, "step": 2611 }, { "epoch": 0.9890193108670958, "grad_norm": 8.21738338470459, "learning_rate": 6.04070983137417e-08, "loss": 0.6523, "step": 2612 }, { "epoch": 0.9893979553199546, "grad_norm": 10.094844818115234, "learning_rate": 5.631330354269882e-08, "loss": 0.57, "step": 2613 }, { "epoch": 0.9897765997728133, "grad_norm": 9.283138275146484, "learning_rate": 5.236309772077563e-08, "loss": 0.5518, "step": 2614 }, { "epoch": 0.9901552442256721, "grad_norm": 8.835543632507324, "learning_rate": 4.855648652321021e-08, "loss": 0.3256, "step": 2615 }, { "epoch": 0.9905338886785309, "grad_norm": 6.809988498687744, "learning_rate": 4.4893475418983365e-08, "loss": 0.1993, "step": 2616 }, { "epoch": 0.9909125331313896, "grad_norm": 7.190274238586426, "learning_rate": 4.137406967070767e-08, "loss": 0.3838, "step": 2617 }, { "epoch": 0.9912911775842483, "grad_norm": 13.799723625183105, "learning_rate": 3.799827433472736e-08, "loss": 0.5659, "step": 2618 }, { "epoch": 0.9916698220371072, "grad_norm": 6.342631816864014, "learning_rate": 3.47660942610295e-08, "loss": 0.227, "step": 2619 }, { "epoch": 0.9920484664899659, "grad_norm": 7.028225421905518, "learning_rate": 3.1677534093299545e-08, "loss": 0.2465, "step": 2620 }, { "epoch": 0.9924271109428247, "grad_norm": 9.69013786315918, "learning_rate": 2.873259826885466e-08, "loss": 0.3781, "step": 2621 }, { "epoch": 0.9928057553956835, "grad_norm": 9.710000038146973, "learning_rate": 2.5931291018677086e-08, "loss": 0.2296, "step": 2622 }, { "epoch": 0.9931843998485422, "grad_norm": 11.306065559387207, "learning_rate": 2.3273616367414097e-08, "loss": 0.2997, "step": 2623 }, { "epoch": 0.993563044301401, "grad_norm": 6.006664276123047, "learning_rate": 2.0759578133333623e-08, "loss": 0.1308, "step": 2624 }, { "epoch": 0.9939416887542597, "grad_norm": 6.388942241668701, "learning_rate": 1.8389179928357538e-08, "loss": 0.1969, "step": 2625 }, { "epoch": 0.9943203332071185, "grad_norm": 11.715986251831055, "learning_rate": 1.616242515802835e-08, "loss": 1.8615, "step": 2626 }, { "epoch": 0.9946989776599773, "grad_norm": 9.530550003051758, "learning_rate": 1.4079317021520321e-08, "loss": 1.3649, "step": 2627 }, { "epoch": 0.995077622112836, "grad_norm": 11.392560958862305, "learning_rate": 1.2139858511628356e-08, "loss": 0.9551, "step": 2628 }, { "epoch": 0.9954562665656949, "grad_norm": 11.880497932434082, "learning_rate": 1.0344052414779094e-08, "loss": 0.8825, "step": 2629 }, { "epoch": 0.9958349110185536, "grad_norm": 8.874764442443848, "learning_rate": 8.691901310997619e-09, "loss": 0.5746, "step": 2630 }, { "epoch": 0.9962135554714123, "grad_norm": 10.284222602844238, "learning_rate": 7.1834075739296566e-09, "loss": 0.7843, "step": 2631 }, { "epoch": 0.9965921999242711, "grad_norm": 10.232461929321289, "learning_rate": 5.818573370830471e-09, "loss": 0.7046, "step": 2632 }, { "epoch": 0.9969708443771299, "grad_norm": 11.678064346313477, "learning_rate": 4.597400662553764e-09, "loss": 0.4164, "step": 2633 }, { "epoch": 0.9973494888299886, "grad_norm": 9.27035903930664, "learning_rate": 3.5198912035516727e-09, "loss": 0.3791, "step": 2634 }, { "epoch": 0.9977281332828474, "grad_norm": 12.330588340759277, "learning_rate": 2.586046541874776e-09, "loss": 0.5568, "step": 2635 }, { "epoch": 0.9981067777357062, "grad_norm": 11.334096908569336, "learning_rate": 1.7958680191942911e-09, "loss": 0.5329, "step": 2636 }, { "epoch": 0.998485422188565, "grad_norm": 12.605074882507324, "learning_rate": 1.149356770746568e-09, "loss": 0.4182, "step": 2637 }, { "epoch": 0.9988640666414237, "grad_norm": 7.650578022003174, "learning_rate": 6.465137253663934e-10, "loss": 0.2754, "step": 2638 }, { "epoch": 0.9992427110942824, "grad_norm": 8.941444396972656, "learning_rate": 2.873396055091959e-10, "loss": 0.3273, "step": 2639 }, { "epoch": 0.9996213555471413, "grad_norm": 10.42584228515625, "learning_rate": 7.183492717333096e-11, "loss": 0.1813, "step": 2640 }, { "epoch": 1.0, "grad_norm": 7.037883281707764, "learning_rate": 0.0, "loss": 0.2703, "step": 2641 } ], "logging_steps": 1, "max_steps": 2641, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 661, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.485573706265238e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }