diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,11310 +10,11318 @@ "log_history": [ { "epoch": 0.0001239387742455227, - "grad_norm": 0.6156698132479959, + "grad_norm": 0.36398436718107713, "learning_rate": 2.4783147459727386e-07, - "loss": 4.9844, + "loss": 2.5391, "step": 1 }, { "epoch": 0.0006196938712276135, - "grad_norm": 0.6052976200579151, + "grad_norm": 0.3553971091320693, "learning_rate": 1.2391573729863693e-06, - "loss": 4.7617, + "loss": 2.3916, "step": 5 }, { "epoch": 0.001239387742455227, - "grad_norm": 0.7601239478563812, + "grad_norm": 0.33706223881954883, "learning_rate": 2.4783147459727386e-06, - "loss": 4.8406, + "loss": 2.4906, "step": 10 }, { "epoch": 0.0018590816136828407, - "grad_norm": 0.7526341301126038, + "grad_norm": 0.38128850820849297, "learning_rate": 3.717472118959108e-06, - "loss": 4.8391, + "loss": 2.4898, "step": 15 }, { "epoch": 0.002478775484910454, - "grad_norm": 0.6322948184451165, + "grad_norm": 0.43552764540168315, "learning_rate": 4.956629491945477e-06, - "loss": 4.9266, + "loss": 2.4367, "step": 20 }, { "epoch": 0.003098469356138068, - "grad_norm": 0.6990885844721237, + "grad_norm": 0.30493677951675263, "learning_rate": 6.195786864931847e-06, - "loss": 4.8547, + "loss": 2.4398, "step": 25 }, { "epoch": 0.0037181632273656814, - "grad_norm": 0.7984745761097714, + "grad_norm": 0.40892724849804757, "learning_rate": 7.434944237918216e-06, - "loss": 4.8336, + "loss": 2.3828, "step": 30 }, { "epoch": 0.004337857098593295, - "grad_norm": 0.7814803522780867, + "grad_norm": 0.41597118556401486, "learning_rate": 8.674101610904585e-06, - "loss": 4.8438, + "loss": 2.459, "step": 35 }, { "epoch": 0.004957550969820908, - "grad_norm": 0.7825601910016009, + "grad_norm": 0.34011631695052974, "learning_rate": 9.913258983890955e-06, - "loss": 4.8141, + "loss": 2.3883, "step": 40 }, { "epoch": 0.005577244841048522, - "grad_norm": 0.7686051064734422, + "grad_norm": 0.42036838195034787, "learning_rate": 1.1152416356877324e-05, - "loss": 4.8469, + "loss": 2.407, "step": 45 }, { "epoch": 0.006196938712276136, - "grad_norm": 0.7149556352785319, + "grad_norm": 0.32938017778181394, "learning_rate": 1.2391573729863694e-05, - "loss": 4.8094, + "loss": 2.4359, "step": 50 }, { "epoch": 0.006816632583503749, - "grad_norm": 0.7305536513174533, + "grad_norm": 0.522486908526655, "learning_rate": 1.3630731102850064e-05, - "loss": 4.7266, + "loss": 2.3625, "step": 55 }, { "epoch": 0.007436326454731363, - "grad_norm": 0.8178979347533347, + "grad_norm": 0.39947004413100906, "learning_rate": 1.4869888475836432e-05, - "loss": 4.7031, + "loss": 2.4008, "step": 60 }, { "epoch": 0.008056020325958976, - "grad_norm": 0.8334181887763964, + "grad_norm": 0.4627809567954664, "learning_rate": 1.61090458488228e-05, - "loss": 4.6656, + "loss": 2.3168, "step": 65 }, { "epoch": 0.00867571419718659, - "grad_norm": 0.7695228033324475, + "grad_norm": 0.4876882111255118, "learning_rate": 1.734820322180917e-05, - "loss": 4.7344, + "loss": 2.343, "step": 70 }, { "epoch": 0.009295408068414203, - "grad_norm": 0.8811868823646412, + "grad_norm": 0.3811791732250246, "learning_rate": 1.858736059479554e-05, - "loss": 4.6906, + "loss": 2.3398, "step": 75 }, { "epoch": 0.009915101939641817, - "grad_norm": 1.1199523468579367, + "grad_norm": 0.4376425547982147, "learning_rate": 1.982651796778191e-05, - "loss": 4.5859, + "loss": 2.3344, "step": 80 }, { "epoch": 0.01053479581086943, - "grad_norm": 1.0597694158945299, + "grad_norm": 0.3931646393426777, "learning_rate": 2.106567534076828e-05, - "loss": 4.4813, + "loss": 2.332, "step": 85 }, { "epoch": 0.011154489682097044, - "grad_norm": 0.8117839754264472, + "grad_norm": 0.4461816921220648, "learning_rate": 2.230483271375465e-05, - "loss": 4.5922, + "loss": 2.2609, "step": 90 }, { "epoch": 0.011774183553324657, - "grad_norm": 0.8958360055388476, + "grad_norm": 0.37584050496240057, "learning_rate": 2.3543990086741015e-05, - "loss": 4.4766, + "loss": 2.2234, "step": 95 }, { "epoch": 0.012393877424552271, - "grad_norm": 0.8817057210428043, + "grad_norm": 0.33119569793020404, "learning_rate": 2.4783147459727388e-05, - "loss": 4.5094, + "loss": 2.173, "step": 100 }, { "epoch": 0.013013571295779885, - "grad_norm": 0.537206137804486, + "grad_norm": 0.3705649194998293, "learning_rate": 2.6022304832713758e-05, - "loss": 4.3797, + "loss": 2.0984, "step": 105 }, { "epoch": 0.013633265167007498, - "grad_norm": 0.5300195861022883, + "grad_norm": 0.2504511318591155, "learning_rate": 2.7261462205700128e-05, - "loss": 4.3156, + "loss": 2.109, "step": 110 }, { "epoch": 0.014252959038235112, - "grad_norm": 0.3948064250822983, + "grad_norm": 0.23798871106040215, "learning_rate": 2.8500619578686494e-05, - "loss": 4.257, + "loss": 2.1016, "step": 115 }, { "epoch": 0.014872652909462726, - "grad_norm": 0.364515312322803, + "grad_norm": 0.21122743560448873, "learning_rate": 2.9739776951672864e-05, - "loss": 4.2406, + "loss": 2.1574, "step": 120 }, { "epoch": 0.01549234678069034, - "grad_norm": 0.37164041744407617, + "grad_norm": 0.1813741930176235, "learning_rate": 3.0978934324659233e-05, - "loss": 4.2383, + "loss": 2.1199, "step": 125 }, { "epoch": 0.01611204065191795, - "grad_norm": 0.3245015145961911, + "grad_norm": 0.1990606894765091, "learning_rate": 3.22180916976456e-05, - "loss": 4.1914, + "loss": 2.1242, "step": 130 }, { "epoch": 0.016731734523145567, - "grad_norm": 0.30522684760315766, + "grad_norm": 0.15350817933593797, "learning_rate": 3.345724907063197e-05, - "loss": 4.0875, + "loss": 2.1379, "step": 135 }, { "epoch": 0.01735142839437318, - "grad_norm": 0.35317894181130866, + "grad_norm": 0.15326065617394624, "learning_rate": 3.469640644361834e-05, - "loss": 4.1297, + "loss": 2.1441, "step": 140 }, { "epoch": 0.017971122265600794, - "grad_norm": 0.34626796389259323, + "grad_norm": 0.1630492804177675, "learning_rate": 3.593556381660471e-05, - "loss": 4.1219, + "loss": 2.0914, "step": 145 }, { "epoch": 0.018590816136828406, - "grad_norm": 0.32611609298532446, + "grad_norm": 0.17107386161983282, "learning_rate": 3.717472118959108e-05, - "loss": 4.0687, + "loss": 2.0324, "step": 150 }, { "epoch": 0.01921051000805602, - "grad_norm": 0.2826877224295346, + "grad_norm": 0.16054446888841606, "learning_rate": 3.841387856257745e-05, - "loss": 4.2211, + "loss": 2.1078, "step": 155 }, { "epoch": 0.019830203879283633, - "grad_norm": 0.2887063005587996, + "grad_norm": 0.14956110781178233, "learning_rate": 3.965303593556382e-05, - "loss": 4.0383, + "loss": 2.0621, "step": 160 }, { "epoch": 0.02044989775051125, - "grad_norm": 0.2914285373125258, + "grad_norm": 0.13835441017934677, "learning_rate": 4.0892193308550185e-05, - "loss": 4.107, + "loss": 2.059, "step": 165 }, { "epoch": 0.02106959162173886, - "grad_norm": 0.2565212414056402, + "grad_norm": 0.12666794482225688, "learning_rate": 4.213135068153656e-05, - "loss": 4.1594, + "loss": 2.0848, "step": 170 }, { "epoch": 0.021689285492966476, - "grad_norm": 0.2440231811546915, + "grad_norm": 0.12997209523884481, "learning_rate": 4.337050805452293e-05, - "loss": 4.093, + "loss": 2.0883, "step": 175 }, { "epoch": 0.022308979364194088, - "grad_norm": 0.23545457064479375, + "grad_norm": 0.13662868551878915, "learning_rate": 4.46096654275093e-05, - "loss": 4.1391, + "loss": 2.0496, "step": 180 }, { "epoch": 0.022928673235421703, - "grad_norm": 0.24355219689682756, + "grad_norm": 0.12904027963907533, "learning_rate": 4.5848822800495664e-05, - "loss": 4.1117, + "loss": 2.0391, "step": 185 }, { "epoch": 0.023548367106649315, - "grad_norm": 0.25029203671068706, + "grad_norm": 0.11673326298638313, "learning_rate": 4.708798017348203e-05, - "loss": 4.143, + "loss": 1.9664, "step": 190 }, { "epoch": 0.02416806097787693, - "grad_norm": 0.23034997869099355, + "grad_norm": 0.11261296057950389, "learning_rate": 4.83271375464684e-05, - "loss": 4.0078, + "loss": 2.023, "step": 195 }, { "epoch": 0.024787754849104542, - "grad_norm": 0.22754977237904275, + "grad_norm": 0.12060637138665908, "learning_rate": 4.9566294919454776e-05, - "loss": 4.0586, + "loss": 2.0344, "step": 200 }, { "epoch": 0.025407448720332158, - "grad_norm": 0.21627590215778986, + "grad_norm": 0.11902127583462105, "learning_rate": 5.080545229244115e-05, - "loss": 4.057, + "loss": 2.0461, "step": 205 }, { "epoch": 0.02602714259155977, - "grad_norm": 0.22617748275994737, + "grad_norm": 0.11966181981073709, "learning_rate": 5.2044609665427516e-05, - "loss": 4.0891, + "loss": 2.0465, "step": 210 }, { "epoch": 0.02664683646278738, - "grad_norm": 0.2210255983769268, + "grad_norm": 0.11020817117690564, "learning_rate": 5.328376703841388e-05, - "loss": 4.0289, + "loss": 2.0453, "step": 215 }, { "epoch": 0.027266530334014997, - "grad_norm": 0.23704324491853482, + "grad_norm": 0.11068723686941234, "learning_rate": 5.4522924411400255e-05, - "loss": 4.1234, + "loss": 2.0457, "step": 220 }, { "epoch": 0.02788622420524261, - "grad_norm": 0.21415865510311569, + "grad_norm": 0.10772518213779579, "learning_rate": 5.576208178438662e-05, - "loss": 3.9641, + "loss": 2.0211, "step": 225 }, { "epoch": 0.028505918076470224, - "grad_norm": 0.20869571114446672, + "grad_norm": 0.11617312325311774, "learning_rate": 5.700123915737299e-05, - "loss": 4.0156, + "loss": 2.0199, "step": 230 }, { "epoch": 0.029125611947697836, - "grad_norm": 0.2295558494942413, + "grad_norm": 0.10687892359246541, "learning_rate": 5.8240396530359354e-05, - "loss": 4.0008, + "loss": 2.0168, "step": 235 }, { "epoch": 0.02974530581892545, - "grad_norm": 0.20993210502465767, + "grad_norm": 0.10534898334754005, "learning_rate": 5.947955390334573e-05, - "loss": 4.2164, + "loss": 1.9563, "step": 240 }, { "epoch": 0.030364999690153063, - "grad_norm": 0.22805360919484266, + "grad_norm": 0.11188760033211348, "learning_rate": 6.0718711276332094e-05, - "loss": 4.0617, + "loss": 2.0211, "step": 245 }, { "epoch": 0.03098469356138068, - "grad_norm": 0.21679836800491897, + "grad_norm": 0.11217525164835511, "learning_rate": 6.195786864931847e-05, - "loss": 3.9891, + "loss": 2.0055, "step": 250 }, { "epoch": 0.031604387432608294, - "grad_norm": 0.20555177272110325, + "grad_norm": 0.1123864808685792, "learning_rate": 6.319702602230483e-05, - "loss": 3.9547, + "loss": 2.0078, "step": 255 }, { "epoch": 0.0322240813038359, - "grad_norm": 0.22756620407217162, + "grad_norm": 0.10988735365168292, "learning_rate": 6.44361833952912e-05, - "loss": 4.0219, + "loss": 1.9512, "step": 260 }, { "epoch": 0.03284377517506352, - "grad_norm": 0.21772033945390115, + "grad_norm": 0.11717299164220939, "learning_rate": 6.567534076827757e-05, - "loss": 4.0328, + "loss": 2.0352, "step": 265 }, { "epoch": 0.03346346904629113, - "grad_norm": 0.22974604776495353, + "grad_norm": 0.11127165660011876, "learning_rate": 6.691449814126395e-05, - "loss": 3.8969, + "loss": 2.0063, "step": 270 }, { "epoch": 0.03408316291751875, - "grad_norm": 0.22447015222263075, + "grad_norm": 0.11434862645507657, "learning_rate": 6.815365551425031e-05, - "loss": 3.957, + "loss": 1.8934, "step": 275 }, { "epoch": 0.03470285678874636, - "grad_norm": 0.2126380837798873, + "grad_norm": 0.11029266191053742, "learning_rate": 6.939281288723668e-05, - "loss": 3.9594, + "loss": 1.9535, "step": 280 }, { "epoch": 0.03532255065997397, - "grad_norm": 0.228888586034849, + "grad_norm": 0.1118252814826643, "learning_rate": 7.063197026022306e-05, - "loss": 3.982, + "loss": 2.0055, "step": 285 }, { "epoch": 0.03594224453120159, - "grad_norm": 0.22194460293105397, + "grad_norm": 0.1158412859452615, "learning_rate": 7.187112763320942e-05, - "loss": 4.0539, + "loss": 1.9695, "step": 290 }, { "epoch": 0.0365619384024292, - "grad_norm": 0.22114527588166774, + "grad_norm": 0.10966650026564005, "learning_rate": 7.311028500619579e-05, - "loss": 4.0297, + "loss": 1.9957, "step": 295 }, { "epoch": 0.03718163227365681, - "grad_norm": 0.2253633278202601, + "grad_norm": 0.11337717400230966, "learning_rate": 7.434944237918216e-05, - "loss": 3.9047, + "loss": 1.923, "step": 300 }, { "epoch": 0.03780132614488443, - "grad_norm": 0.21472709323547942, + "grad_norm": 0.10903076207822042, "learning_rate": 7.558859975216854e-05, - "loss": 3.9953, + "loss": 1.9684, "step": 305 }, { "epoch": 0.03842102001611204, - "grad_norm": 0.21240689967710813, + "grad_norm": 0.10798533162202754, "learning_rate": 7.68277571251549e-05, - "loss": 3.9609, + "loss": 1.9453, "step": 310 }, { "epoch": 0.03904071388733965, - "grad_norm": 0.2138973933351751, + "grad_norm": 0.11175148400384678, "learning_rate": 7.806691449814127e-05, - "loss": 3.932, + "loss": 1.9535, "step": 315 }, { "epoch": 0.039660407758567266, - "grad_norm": 0.21095964738376724, + "grad_norm": 0.10703314633558067, "learning_rate": 7.930607187112764e-05, - "loss": 3.9281, + "loss": 1.9563, "step": 320 }, { "epoch": 0.04028010162979488, - "grad_norm": 0.23767132219880815, + "grad_norm": 0.10454396198544379, "learning_rate": 8.0545229244114e-05, - "loss": 3.9227, + "loss": 1.9418, "step": 325 }, { "epoch": 0.0408997955010225, - "grad_norm": 0.21145595313668683, + "grad_norm": 0.10965164879246644, "learning_rate": 8.178438661710037e-05, - "loss": 3.8734, + "loss": 1.9852, "step": 330 }, { "epoch": 0.041519489372250105, - "grad_norm": 0.22018542729191426, + "grad_norm": 0.11046491057161609, "learning_rate": 8.302354399008675e-05, - "loss": 4.0695, + "loss": 1.9461, "step": 335 }, { "epoch": 0.04213918324347772, - "grad_norm": 0.2223712124211028, + "grad_norm": 0.10873277373716497, "learning_rate": 8.426270136307312e-05, - "loss": 3.9984, + "loss": 1.8918, "step": 340 }, { "epoch": 0.042758877114705336, - "grad_norm": 0.22870383890024354, + "grad_norm": 0.11520873752551357, "learning_rate": 8.550185873605948e-05, - "loss": 3.9656, + "loss": 1.9738, "step": 345 }, { "epoch": 0.04337857098593295, - "grad_norm": 0.21918918087091638, + "grad_norm": 0.11319565385145965, "learning_rate": 8.674101610904586e-05, - "loss": 3.9547, + "loss": 1.8652, "step": 350 }, { "epoch": 0.04399826485716056, - "grad_norm": 0.24597902739695576, + "grad_norm": 0.11118195705926588, "learning_rate": 8.798017348203223e-05, - "loss": 3.8828, + "loss": 1.9871, "step": 355 }, { "epoch": 0.044617958728388175, - "grad_norm": 0.23311556933668512, + "grad_norm": 0.11282384360301383, "learning_rate": 8.92193308550186e-05, - "loss": 3.9781, + "loss": 2.0211, "step": 360 }, { "epoch": 0.04523765259961579, - "grad_norm": 0.2335730208221731, + "grad_norm": 0.12014079628091061, "learning_rate": 9.045848822800496e-05, - "loss": 3.932, + "loss": 1.9898, "step": 365 }, { "epoch": 0.045857346470843406, - "grad_norm": 0.22758589754195266, + "grad_norm": 0.12069690241752978, "learning_rate": 9.169764560099133e-05, - "loss": 3.9328, + "loss": 1.9766, "step": 370 }, { "epoch": 0.046477040342071015, - "grad_norm": 0.21887801195676554, + "grad_norm": 0.11312380995936393, "learning_rate": 9.29368029739777e-05, - "loss": 3.8961, + "loss": 1.9609, "step": 375 }, { "epoch": 0.04709673421329863, - "grad_norm": 0.23691795117542777, + "grad_norm": 0.10660555317363303, "learning_rate": 9.417596034696406e-05, - "loss": 3.9922, + "loss": 1.9391, "step": 380 }, { "epoch": 0.047716428084526245, - "grad_norm": 0.22314356605972202, + "grad_norm": 0.11034212285475851, "learning_rate": 9.541511771995044e-05, - "loss": 3.9156, + "loss": 1.9402, "step": 385 }, { "epoch": 0.04833612195575386, - "grad_norm": 0.22883014470157503, + "grad_norm": 0.10700147884403914, "learning_rate": 9.66542750929368e-05, - "loss": 3.6133, + "loss": 1.9547, "step": 390 }, { "epoch": 0.04895581582698147, - "grad_norm": 0.2261978377608961, + "grad_norm": 0.11111784345239176, "learning_rate": 9.789343246592317e-05, - "loss": 4.0008, + "loss": 1.9352, "step": 395 }, { "epoch": 0.049575509698209085, - "grad_norm": 0.23167762207515435, + "grad_norm": 0.1156370481856559, "learning_rate": 9.913258983890955e-05, - "loss": 3.8664, + "loss": 1.9441, "step": 400 }, { "epoch": 0.0501952035694367, - "grad_norm": 0.23913876095025915, + "grad_norm": 0.1138918955121605, "learning_rate": 0.0001003717472118959, - "loss": 3.8797, + "loss": 1.9289, "step": 405 }, { "epoch": 0.050814897440664315, - "grad_norm": 0.21300252960890317, + "grad_norm": 0.11700414573434677, "learning_rate": 0.0001016109045848823, - "loss": 3.9016, + "loss": 1.9359, "step": 410 }, { "epoch": 0.051434591311891924, - "grad_norm": 0.24940978228703184, + "grad_norm": 0.11419554025531545, "learning_rate": 0.00010285006195786867, - "loss": 3.7727, + "loss": 1.9324, "step": 415 }, { "epoch": 0.05205428518311954, - "grad_norm": 0.23257277242036176, + "grad_norm": 0.11511324040424104, "learning_rate": 0.00010408921933085503, - "loss": 3.9555, + "loss": 1.9391, "step": 420 }, { "epoch": 0.052673979054347154, - "grad_norm": 0.22550835138918388, + "grad_norm": 0.11598251416819207, "learning_rate": 0.0001053283767038414, - "loss": 3.8078, + "loss": 1.8605, "step": 425 }, { "epoch": 0.05329367292557476, - "grad_norm": 0.2214992349435567, + "grad_norm": 0.11945825058588103, "learning_rate": 0.00010656753407682776, - "loss": 3.8711, + "loss": 1.9219, "step": 430 }, { "epoch": 0.05391336679680238, - "grad_norm": 0.2385884965327802, + "grad_norm": 0.12186744920368066, "learning_rate": 0.00010780669144981412, - "loss": 3.8664, + "loss": 1.9664, "step": 435 }, { "epoch": 0.054533060668029994, - "grad_norm": 0.2393533535483792, + "grad_norm": 0.11978444755418904, "learning_rate": 0.00010904584882280051, - "loss": 3.9273, + "loss": 1.9582, "step": 440 }, { "epoch": 0.05515275453925761, - "grad_norm": 0.23892895926770114, + "grad_norm": 0.11151838879430113, "learning_rate": 0.00011028500619578688, - "loss": 3.9016, + "loss": 1.9141, "step": 445 }, { "epoch": 0.05577244841048522, - "grad_norm": 0.25293116707319535, + "grad_norm": 0.11753526250283612, "learning_rate": 0.00011152416356877324, - "loss": 3.8977, + "loss": 1.9355, "step": 450 }, { "epoch": 0.05639214228171283, - "grad_norm": 0.2339751975680064, + "grad_norm": 0.12027058315054732, "learning_rate": 0.00011276332094175961, - "loss": 3.8297, + "loss": 1.9434, "step": 455 }, { "epoch": 0.05701183615294045, - "grad_norm": 0.22040331210242547, + "grad_norm": 0.12114137216676991, "learning_rate": 0.00011400247831474598, - "loss": 3.875, + "loss": 1.8844, "step": 460 }, { "epoch": 0.057631530024168064, - "grad_norm": 0.22773047199756685, + "grad_norm": 0.11303473278613534, "learning_rate": 0.00011524163568773234, - "loss": 3.8602, + "loss": 1.9352, "step": 465 }, { "epoch": 0.05825122389539567, - "grad_norm": 0.21974486332656235, + "grad_norm": 0.12224817657479514, "learning_rate": 0.00011648079306071871, - "loss": 3.8344, + "loss": 1.957, "step": 470 }, { "epoch": 0.05887091776662329, - "grad_norm": 0.24683791311372597, + "grad_norm": 0.11169054867070344, "learning_rate": 0.00011771995043370509, - "loss": 3.8414, + "loss": 1.9418, "step": 475 }, { "epoch": 0.0594906116378509, - "grad_norm": 0.22784797807715068, + "grad_norm": 0.11248246802350605, "learning_rate": 0.00011895910780669145, - "loss": 3.7875, + "loss": 1.9105, "step": 480 }, { "epoch": 0.06011030550907852, - "grad_norm": 0.22112074000806836, + "grad_norm": 0.11732018140367607, "learning_rate": 0.00012019826517967782, - "loss": 3.8953, + "loss": 1.9449, "step": 485 }, { "epoch": 0.06072999938030613, - "grad_norm": 0.22438071094190917, + "grad_norm": 0.12552929938535534, "learning_rate": 0.00012143742255266419, - "loss": 4.0109, + "loss": 1.9254, "step": 490 }, { "epoch": 0.06134969325153374, - "grad_norm": 0.24426703173088538, + "grad_norm": 0.11437642863096387, "learning_rate": 0.00012267657992565055, - "loss": 3.8891, + "loss": 1.8961, "step": 495 }, { "epoch": 0.06196938712276136, - "grad_norm": 0.24796530136608588, + "grad_norm": 0.1260600647441986, "learning_rate": 0.00012391573729863693, - "loss": 3.9898, + "loss": 1.8875, "step": 500 }, { "epoch": 0.06258908099398897, - "grad_norm": 0.2504226663983639, + "grad_norm": 0.11848912394998189, "learning_rate": 0.00012515489467162331, - "loss": 3.9109, + "loss": 1.8762, "step": 505 }, { "epoch": 0.06320877486521659, - "grad_norm": 0.24728151359797979, + "grad_norm": 0.12197039580850588, "learning_rate": 0.00012639405204460967, - "loss": 3.7414, + "loss": 1.8574, "step": 510 }, { "epoch": 0.0638284687364442, - "grad_norm": 0.2568748492336261, + "grad_norm": 0.11905836258470184, "learning_rate": 0.00012763320941759605, - "loss": 3.85, + "loss": 1.9691, "step": 515 }, { "epoch": 0.0644481626076718, - "grad_norm": 0.2311918499284196, + "grad_norm": 0.11404810658049691, "learning_rate": 0.0001288723667905824, - "loss": 3.8188, + "loss": 1.9176, "step": 520 }, { "epoch": 0.06506785647889943, - "grad_norm": 0.25292533490820934, + "grad_norm": 0.11921939557929893, "learning_rate": 0.00013011152416356878, - "loss": 3.7531, + "loss": 1.9133, "step": 525 }, { "epoch": 0.06568755035012704, - "grad_norm": 0.23091615388429254, + "grad_norm": 0.1161201458568179, "learning_rate": 0.00013135068153655513, - "loss": 3.818, + "loss": 1.9879, "step": 530 }, { "epoch": 0.06630724422135464, - "grad_norm": 0.2423671174824872, + "grad_norm": 0.12386141326873432, "learning_rate": 0.0001325898389095415, - "loss": 3.9406, + "loss": 1.9461, "step": 535 }, { "epoch": 0.06692693809258227, - "grad_norm": 0.24386526601080538, + "grad_norm": 0.12126274245647896, "learning_rate": 0.0001338289962825279, - "loss": 3.7711, + "loss": 1.9969, "step": 540 }, { "epoch": 0.06754663196380987, - "grad_norm": 0.2436193360083945, + "grad_norm": 0.11855513780674662, "learning_rate": 0.00013506815365551427, - "loss": 3.7484, + "loss": 1.9379, "step": 545 }, { "epoch": 0.0681663258350375, - "grad_norm": 0.24440315661081868, + "grad_norm": 0.12454701841587497, "learning_rate": 0.00013630731102850062, - "loss": 3.8039, + "loss": 1.893, "step": 550 }, { "epoch": 0.0687860197062651, - "grad_norm": 0.24658694689728877, + "grad_norm": 0.13585017663988036, "learning_rate": 0.000137546468401487, - "loss": 3.9062, + "loss": 1.9676, "step": 555 }, { "epoch": 0.06940571357749271, - "grad_norm": 0.24860076091533706, + "grad_norm": 0.12286666020259437, "learning_rate": 0.00013878562577447336, - "loss": 3.9148, + "loss": 1.9734, "step": 560 }, { "epoch": 0.07002540744872034, - "grad_norm": 0.24643140253653378, + "grad_norm": 0.12842932394891735, "learning_rate": 0.00014002478314745974, - "loss": 3.7945, + "loss": 1.9359, "step": 565 }, { "epoch": 0.07064510131994794, - "grad_norm": 0.24897544182458003, + "grad_norm": 0.12453124899243173, "learning_rate": 0.00014126394052044612, - "loss": 3.8984, + "loss": 1.8824, "step": 570 }, { "epoch": 0.07126479519117555, - "grad_norm": 0.2553723832902909, + "grad_norm": 0.12198945086913854, "learning_rate": 0.00014250309789343247, - "loss": 3.9109, + "loss": 1.9359, "step": 575 }, { "epoch": 0.07188448906240318, - "grad_norm": 0.25975637237924604, + "grad_norm": 0.11984988586999516, "learning_rate": 0.00014374225526641885, - "loss": 3.8664, + "loss": 1.9258, "step": 580 }, { "epoch": 0.07250418293363078, - "grad_norm": 0.2436903470260683, + "grad_norm": 0.11931391810243619, "learning_rate": 0.0001449814126394052, - "loss": 3.8789, + "loss": 1.9586, "step": 585 }, { "epoch": 0.0731238768048584, - "grad_norm": 0.24798071594725046, + "grad_norm": 0.1218417705429113, "learning_rate": 0.00014622057001239158, - "loss": 3.8836, + "loss": 1.882, "step": 590 }, { "epoch": 0.07374357067608601, - "grad_norm": 0.2649972707608675, + "grad_norm": 0.12362223035885297, "learning_rate": 0.00014745972738537794, - "loss": 3.7984, + "loss": 1.9336, "step": 595 }, { "epoch": 0.07436326454731362, - "grad_norm": 0.2647217197239789, + "grad_norm": 0.12709267111450237, "learning_rate": 0.00014869888475836432, - "loss": 3.7625, + "loss": 1.9195, "step": 600 }, { "epoch": 0.07498295841854125, - "grad_norm": 0.24856131885729846, + "grad_norm": 0.12352240140821688, "learning_rate": 0.0001499380421313507, - "loss": 3.9086, + "loss": 1.9566, "step": 605 }, { "epoch": 0.07560265228976885, - "grad_norm": 0.2703748729511876, + "grad_norm": 0.11695893050509344, "learning_rate": 0.00015117719950433707, - "loss": 3.8547, + "loss": 1.9055, "step": 610 }, { "epoch": 0.07622234616099646, - "grad_norm": 0.24654173035745675, + "grad_norm": 0.12061523502751974, "learning_rate": 0.00015241635687732343, - "loss": 3.8789, + "loss": 1.8879, "step": 615 }, { "epoch": 0.07684204003222408, - "grad_norm": 0.2692972261734959, + "grad_norm": 0.13093586226295037, "learning_rate": 0.0001536555142503098, - "loss": 3.8945, + "loss": 1.9043, "step": 620 }, { "epoch": 0.0774617339034517, - "grad_norm": 0.27594609338449094, + "grad_norm": 0.12506688122473264, "learning_rate": 0.00015489467162329616, - "loss": 3.8836, + "loss": 1.9359, "step": 625 }, { "epoch": 0.0780814277746793, - "grad_norm": 0.25550613767047675, + "grad_norm": 0.12410187155175814, "learning_rate": 0.00015613382899628254, - "loss": 3.8148, + "loss": 1.9324, "step": 630 }, { "epoch": 0.07870112164590692, - "grad_norm": 0.25399112427320353, + "grad_norm": 0.1272473564630399, "learning_rate": 0.00015737298636926892, - "loss": 3.8023, + "loss": 1.9199, "step": 635 }, { "epoch": 0.07932081551713453, - "grad_norm": 0.26411453154811365, + "grad_norm": 0.13031962614082446, "learning_rate": 0.00015861214374225527, - "loss": 3.8914, + "loss": 1.8988, "step": 640 }, { "epoch": 0.07994050938836215, - "grad_norm": 0.2681270955278968, + "grad_norm": 0.14034276494108713, "learning_rate": 0.00015985130111524165, - "loss": 3.7773, + "loss": 1.9531, "step": 645 }, { "epoch": 0.08056020325958976, - "grad_norm": 0.25446306197588175, + "grad_norm": 0.1240075132613875, "learning_rate": 0.000161090458488228, - "loss": 3.8664, + "loss": 1.8977, "step": 650 }, { "epoch": 0.08117989713081737, - "grad_norm": 0.24587567906822394, + "grad_norm": 0.12295043509444326, "learning_rate": 0.00016232961586121439, - "loss": 3.9359, + "loss": 1.9316, "step": 655 }, { "epoch": 0.081799591002045, - "grad_norm": 0.2418693517722617, + "grad_norm": 0.1307165114400111, "learning_rate": 0.00016356877323420074, - "loss": 3.8328, + "loss": 1.9141, "step": 660 }, { "epoch": 0.0824192848732726, - "grad_norm": 0.2506664149033651, + "grad_norm": 0.12576570368395784, "learning_rate": 0.00016480793060718712, - "loss": 3.8836, + "loss": 1.9578, "step": 665 }, { "epoch": 0.08303897874450021, - "grad_norm": 0.26233517731779643, + "grad_norm": 0.1279849537637525, "learning_rate": 0.0001660470879801735, - "loss": 3.7844, + "loss": 1.8535, "step": 670 }, { "epoch": 0.08365867261572783, - "grad_norm": 0.2677004498186573, + "grad_norm": 0.1277752146528915, "learning_rate": 0.00016728624535315988, - "loss": 3.8125, + "loss": 1.9305, "step": 675 }, { "epoch": 0.08427836648695544, - "grad_norm": 0.2398641506201725, + "grad_norm": 0.13382769963247415, "learning_rate": 0.00016852540272614623, - "loss": 3.8055, + "loss": 1.8758, "step": 680 }, { "epoch": 0.08489806035818306, - "grad_norm": 0.2602146016824983, + "grad_norm": 0.1338785424771461, "learning_rate": 0.00016976456009913258, - "loss": 3.9375, + "loss": 1.8848, "step": 685 }, { "epoch": 0.08551775422941067, - "grad_norm": 0.266805223760884, + "grad_norm": 0.13378100950887833, "learning_rate": 0.00017100371747211896, - "loss": 3.9594, + "loss": 1.9797, "step": 690 }, { "epoch": 0.08613744810063828, - "grad_norm": 0.2500429973976006, + "grad_norm": 0.12759853308304364, "learning_rate": 0.00017224287484510532, - "loss": 3.6695, + "loss": 1.927, "step": 695 }, { "epoch": 0.0867571419718659, - "grad_norm": 0.2603605758250552, + "grad_norm": 0.13003091028679756, "learning_rate": 0.00017348203221809172, - "loss": 3.8656, + "loss": 1.9258, "step": 700 }, { "epoch": 0.08737683584309351, - "grad_norm": 0.2817796768485939, + "grad_norm": 0.1256819224648164, "learning_rate": 0.00017472118959107808, - "loss": 3.7898, + "loss": 1.8852, "step": 705 }, { "epoch": 0.08799652971432112, - "grad_norm": 0.26066467074037064, + "grad_norm": 0.12540250359127755, "learning_rate": 0.00017596034696406446, - "loss": 3.757, + "loss": 1.9125, "step": 710 }, { "epoch": 0.08861622358554874, - "grad_norm": 0.264331246410504, + "grad_norm": 0.1262469778474483, "learning_rate": 0.0001771995043370508, - "loss": 3.8305, + "loss": 1.9309, "step": 715 }, { "epoch": 0.08923591745677635, - "grad_norm": 0.26208453033679957, + "grad_norm": 0.12256263065722443, "learning_rate": 0.0001784386617100372, - "loss": 3.8859, + "loss": 1.8797, "step": 720 }, { "epoch": 0.08985561132800396, - "grad_norm": 0.2608654719771367, + "grad_norm": 0.12933452681614585, "learning_rate": 0.00017967781908302354, - "loss": 3.8828, + "loss": 1.902, "step": 725 }, { "epoch": 0.09047530519923158, - "grad_norm": 0.26424596197452516, + "grad_norm": 0.12866605933389477, "learning_rate": 0.00018091697645600992, - "loss": 3.6992, + "loss": 1.9633, "step": 730 }, { "epoch": 0.09109499907045919, - "grad_norm": 0.2554376625702158, + "grad_norm": 0.11929142114827074, "learning_rate": 0.0001821561338289963, - "loss": 3.9094, + "loss": 1.9008, "step": 735 }, { "epoch": 0.09171469294168681, - "grad_norm": 0.25635680373399056, + "grad_norm": 0.12559495279922095, "learning_rate": 0.00018339529120198265, - "loss": 3.7016, + "loss": 1.898, "step": 740 }, { "epoch": 0.09233438681291442, - "grad_norm": 0.24261097763834139, + "grad_norm": 0.1351676419101842, "learning_rate": 0.00018463444857496903, - "loss": 3.7203, + "loss": 1.9105, "step": 745 }, { "epoch": 0.09295408068414203, - "grad_norm": 0.26970318223299494, + "grad_norm": 0.12357127136653635, "learning_rate": 0.0001858736059479554, - "loss": 3.8492, + "loss": 1.8855, "step": 750 }, { "epoch": 0.09357377455536965, - "grad_norm": 0.30498817244113025, + "grad_norm": 0.1256351267334546, "learning_rate": 0.00018711276332094177, - "loss": 3.9031, + "loss": 1.8832, "step": 755 }, { "epoch": 0.09419346842659726, - "grad_norm": 0.25720828519533, + "grad_norm": 0.1263964187356334, "learning_rate": 0.00018835192069392812, - "loss": 3.8156, + "loss": 1.9742, "step": 760 }, { "epoch": 0.09481316229782487, - "grad_norm": 0.29468196401306584, + "grad_norm": 0.125010680336928, "learning_rate": 0.00018959107806691453, - "loss": 3.7273, + "loss": 1.9094, "step": 765 }, { "epoch": 0.09543285616905249, - "grad_norm": 0.2684612791109344, + "grad_norm": 0.12818194425097587, "learning_rate": 0.00019083023543990088, - "loss": 3.8953, + "loss": 1.9152, "step": 770 }, { "epoch": 0.0960525500402801, - "grad_norm": 0.2667571911932907, + "grad_norm": 0.1340819661726965, "learning_rate": 0.00019206939281288726, - "loss": 3.7086, + "loss": 1.8914, "step": 775 }, { "epoch": 0.09667224391150772, - "grad_norm": 0.2456038107295753, + "grad_norm": 0.12618688913153653, "learning_rate": 0.0001933085501858736, - "loss": 3.8047, + "loss": 1.9062, "step": 780 }, { "epoch": 0.09729193778273533, - "grad_norm": 0.2621406327829109, + "grad_norm": 0.12816103290715988, "learning_rate": 0.00019454770755886, - "loss": 3.6516, + "loss": 1.868, "step": 785 }, { "epoch": 0.09791163165396294, - "grad_norm": 0.2620837502334776, + "grad_norm": 0.1344741373451341, "learning_rate": 0.00019578686493184635, - "loss": 3.8641, + "loss": 1.9312, "step": 790 }, { "epoch": 0.09853132552519056, - "grad_norm": 0.25330053822524057, + "grad_norm": 0.12832878658024055, "learning_rate": 0.00019702602230483272, - "loss": 3.7797, + "loss": 1.8883, "step": 795 }, { "epoch": 0.09915101939641817, - "grad_norm": 0.2529835083853911, + "grad_norm": 0.13311964726869252, "learning_rate": 0.0001982651796778191, - "loss": 3.7844, + "loss": 1.9547, "step": 800 }, { "epoch": 0.09977071326764578, - "grad_norm": 0.27582627311828656, + "grad_norm": 0.12760395325139873, "learning_rate": 0.00019950433705080546, - "loss": 3.8797, + "loss": 1.9055, "step": 805 }, { "epoch": 0.1003904071388734, - "grad_norm": 0.24831818922645793, + "grad_norm": 0.14141498619951107, "learning_rate": 0.00019999991575981254, - "loss": 3.7563, + "loss": 1.8609, "step": 810 }, { "epoch": 0.10101010101010101, - "grad_norm": 0.24819054534630788, + "grad_norm": 0.12959126389640468, "learning_rate": 0.00019999940095918086, - "loss": 3.8828, + "loss": 1.9059, "step": 815 }, { "epoch": 0.10162979488132863, - "grad_norm": 0.28265968045148, + "grad_norm": 0.12650382383931852, "learning_rate": 0.000199998418160428, - "loss": 3.8594, + "loss": 1.9281, "step": 820 }, { "epoch": 0.10224948875255624, - "grad_norm": 0.2733982142303492, + "grad_norm": 0.12221495910665915, "learning_rate": 0.00019999696736815346, - "loss": 3.8406, + "loss": 1.9273, "step": 825 }, { "epoch": 0.10286918262378385, - "grad_norm": 0.29382519994780215, + "grad_norm": 0.1287913010486784, "learning_rate": 0.000199995048589147, - "loss": 3.6219, + "loss": 1.8945, "step": 830 }, { "epoch": 0.10348887649501147, - "grad_norm": 0.2854445476699885, + "grad_norm": 0.1311248902853389, "learning_rate": 0.00019999266183238847, - "loss": 3.7812, + "loss": 1.8992, "step": 835 }, { "epoch": 0.10410857036623908, - "grad_norm": 0.26006686205278173, + "grad_norm": 0.1349754560323098, "learning_rate": 0.00019998980710904794, - "loss": 3.7742, + "loss": 1.9187, "step": 840 }, { "epoch": 0.10472826423746669, - "grad_norm": 0.25601656542470685, + "grad_norm": 0.13835648010235793, "learning_rate": 0.00019998648443248556, - "loss": 3.7938, + "loss": 1.9113, "step": 845 }, { "epoch": 0.10534795810869431, - "grad_norm": 0.23820163447829, + "grad_norm": 0.12332364542564311, "learning_rate": 0.00019998269381825147, - "loss": 3.6883, + "loss": 1.8734, "step": 850 }, { "epoch": 0.10596765197992192, - "grad_norm": 0.24260416655510744, + "grad_norm": 0.13003332916776286, "learning_rate": 0.00019997843528408576, - "loss": 3.7914, + "loss": 1.8773, "step": 855 }, { "epoch": 0.10658734585114953, - "grad_norm": 0.25457045020460534, + "grad_norm": 0.1375059879901149, "learning_rate": 0.00019997370884991842, - "loss": 3.8531, + "loss": 1.9059, "step": 860 }, { "epoch": 0.10720703972237715, - "grad_norm": 0.24724606814852315, + "grad_norm": 0.1248324688392207, "learning_rate": 0.0001999685145378692, - "loss": 3.668, + "loss": 1.8902, "step": 865 }, { "epoch": 0.10782673359360476, - "grad_norm": 0.24533597733438564, + "grad_norm": 0.13819337572100693, "learning_rate": 0.00019996285237224758, - "loss": 3.6961, + "loss": 1.857, "step": 870 }, { "epoch": 0.10844642746483238, - "grad_norm": 0.24640097398276353, + "grad_norm": 0.1311935728170451, "learning_rate": 0.00019995672237955246, - "loss": 3.7758, + "loss": 1.9184, "step": 875 }, { "epoch": 0.10906612133605999, - "grad_norm": 0.2576016534107627, + "grad_norm": 0.12800957952899197, "learning_rate": 0.00019995012458847233, - "loss": 3.8813, + "loss": 1.9172, "step": 880 }, { "epoch": 0.1096858152072876, - "grad_norm": 0.26762877144872255, + "grad_norm": 0.13668641206418503, "learning_rate": 0.00019994305902988488, - "loss": 3.7875, + "loss": 1.8477, "step": 885 }, { "epoch": 0.11030550907851522, - "grad_norm": 0.2828605598611797, + "grad_norm": 0.13119404407174387, "learning_rate": 0.00019993552573685703, - "loss": 3.7727, + "loss": 1.952, "step": 890 }, { "epoch": 0.11092520294974283, - "grad_norm": 0.2530807405713651, + "grad_norm": 0.12829852963816493, "learning_rate": 0.00019992752474464463, - "loss": 3.7836, + "loss": 1.8984, "step": 895 }, { "epoch": 0.11154489682097044, - "grad_norm": 0.24995839965943753, + "grad_norm": 0.13471553863362062, "learning_rate": 0.00019991905609069237, - "loss": 3.7469, + "loss": 1.9227, "step": 900 }, { "epoch": 0.11216459069219806, - "grad_norm": 0.2445922348449931, + "grad_norm": 0.13020408574344738, "learning_rate": 0.0001999101198146337, - "loss": 3.8023, + "loss": 1.8969, "step": 905 }, { "epoch": 0.11278428456342567, - "grad_norm": 0.25886682005652195, + "grad_norm": 0.13094295669195982, "learning_rate": 0.0001999007159582904, - "loss": 3.7359, + "loss": 1.8957, "step": 910 }, { "epoch": 0.11340397843465329, - "grad_norm": 0.2596083110218009, + "grad_norm": 0.1500020819120754, "learning_rate": 0.00019989084456567267, - "loss": 3.9406, + "loss": 1.8844, "step": 915 }, { "epoch": 0.1140236723058809, - "grad_norm": 0.27691730769188033, + "grad_norm": 0.13558321660460868, "learning_rate": 0.00019988050568297866, - "loss": 3.7781, + "loss": 1.909, "step": 920 }, { "epoch": 0.1146433661771085, - "grad_norm": 0.24898319031007313, + "grad_norm": 0.134435880311451, "learning_rate": 0.0001998696993585945, - "loss": 3.7016, + "loss": 1.9082, "step": 925 }, { "epoch": 0.11526306004833613, - "grad_norm": 0.26070691986935773, + "grad_norm": 0.13545407410456206, "learning_rate": 0.00019985842564309382, - "loss": 3.7398, + "loss": 1.9605, "step": 930 }, { "epoch": 0.11588275391956374, - "grad_norm": 0.2680878780430638, + "grad_norm": 0.12233937748981194, "learning_rate": 0.00019984668458923775, - "loss": 3.7141, + "loss": 1.8906, "step": 935 }, { "epoch": 0.11650244779079134, - "grad_norm": 0.26826550613584305, + "grad_norm": 0.12657415594685154, "learning_rate": 0.00019983447625197457, - "loss": 3.7281, + "loss": 1.8859, "step": 940 }, { "epoch": 0.11712214166201897, - "grad_norm": 0.24978764564648406, + "grad_norm": 0.12294879566199171, "learning_rate": 0.0001998218006884393, - "loss": 3.8539, + "loss": 1.9539, "step": 945 }, { "epoch": 0.11774183553324657, - "grad_norm": 0.24266170387051966, + "grad_norm": 0.12925893885107417, "learning_rate": 0.0001998086579579538, - "loss": 3.8109, + "loss": 1.8777, "step": 950 }, { "epoch": 0.11836152940447418, - "grad_norm": 0.2456266645553349, + "grad_norm": 0.12420792452753447, "learning_rate": 0.00019979504812202612, - "loss": 3.8133, + "loss": 1.907, "step": 955 }, { "epoch": 0.1189812232757018, - "grad_norm": 0.257918004226877, + "grad_norm": 0.1341600688997077, "learning_rate": 0.00019978097124435042, - "loss": 3.7789, + "loss": 1.9879, "step": 960 }, { "epoch": 0.11960091714692941, - "grad_norm": 0.25054565001999723, + "grad_norm": 0.12957070872516174, "learning_rate": 0.00019976642739080665, - "loss": 3.8383, + "loss": 1.8902, "step": 965 }, { "epoch": 0.12022061101815704, - "grad_norm": 0.24754360275279963, + "grad_norm": 0.12730629525810772, "learning_rate": 0.00019975141662946014, - "loss": 3.832, + "loss": 1.8961, "step": 970 }, { "epoch": 0.12084030488938464, - "grad_norm": 0.2649378383036184, + "grad_norm": 0.13670751436447726, "learning_rate": 0.0001997359390305614, - "loss": 3.7023, + "loss": 1.8367, "step": 975 }, { "epoch": 0.12145999876061225, - "grad_norm": 0.24620912612433915, + "grad_norm": 0.12099186401629365, "learning_rate": 0.00019971999466654577, - "loss": 3.6609, + "loss": 1.8691, "step": 980 }, { "epoch": 0.12207969263183988, - "grad_norm": 0.2664106735221588, + "grad_norm": 0.12888102009545702, "learning_rate": 0.000199703583612033, - "loss": 3.793, + "loss": 1.8719, "step": 985 }, { "epoch": 0.12269938650306748, - "grad_norm": 0.2525567461568162, + "grad_norm": 0.12535560588591485, "learning_rate": 0.00019968670594382694, - "loss": 3.9133, + "loss": 1.918, "step": 990 }, { "epoch": 0.12331908037429509, - "grad_norm": 0.24961131643058485, + "grad_norm": 0.13738798941417166, "learning_rate": 0.00019966936174091527, - "loss": 3.7922, + "loss": 1.8742, "step": 995 }, { "epoch": 0.12393877424552271, - "grad_norm": 0.25735420432831124, + "grad_norm": 0.12749783789851546, "learning_rate": 0.00019965155108446906, - "loss": 3.8508, + "loss": 1.9262, "step": 1000 }, { "epoch": 0.12455846811675032, - "grad_norm": 0.2465006690918658, + "grad_norm": 0.1297736019911043, "learning_rate": 0.00019963327405784226, - "loss": 3.7047, + "loss": 1.9117, "step": 1005 }, { "epoch": 0.12517816198797793, - "grad_norm": 0.2624010766660747, + "grad_norm": 0.14543122470335054, "learning_rate": 0.0001996145307465716, - "loss": 3.8375, + "loss": 1.9094, "step": 1010 }, { "epoch": 0.12579785585920555, - "grad_norm": 0.2794925439442463, + "grad_norm": 0.12860903642292762, "learning_rate": 0.00019959532123837588, - "loss": 3.8727, + "loss": 1.923, "step": 1015 }, { "epoch": 0.12641754973043318, - "grad_norm": 0.25253463553227007, + "grad_norm": 0.13001166660436106, "learning_rate": 0.00019957564562315583, - "loss": 3.7984, + "loss": 1.8422, "step": 1020 }, { "epoch": 0.12703724360166077, - "grad_norm": 0.26407657263039247, + "grad_norm": 0.12915585032705884, "learning_rate": 0.0001995555039929935, - "loss": 3.7, + "loss": 1.9133, "step": 1025 }, { "epoch": 0.1276569374728884, - "grad_norm": 0.26992370813940614, + "grad_norm": 0.12545620705652993, "learning_rate": 0.00019953489644215186, - "loss": 3.8031, + "loss": 1.9145, "step": 1030 }, { "epoch": 0.12827663134411602, - "grad_norm": 0.2619589866347836, + "grad_norm": 0.1281887633125648, "learning_rate": 0.00019951382306707452, - "loss": 3.8203, + "loss": 1.8559, "step": 1035 }, { "epoch": 0.1288963252153436, - "grad_norm": 0.24801681267457396, + "grad_norm": 0.12889758276242114, "learning_rate": 0.00019949228396638503, - "loss": 3.8016, + "loss": 1.8949, "step": 1040 }, { "epoch": 0.12951601908657123, - "grad_norm": 0.23959097732969808, + "grad_norm": 0.13027383363416042, "learning_rate": 0.00019947027924088656, - "loss": 3.9289, + "loss": 1.875, "step": 1045 }, { "epoch": 0.13013571295779885, - "grad_norm": 0.2644886464951842, + "grad_norm": 0.1289194496997327, "learning_rate": 0.00019944780899356146, - "loss": 3.7023, + "loss": 1.9328, "step": 1050 }, { "epoch": 0.13075540682902645, - "grad_norm": 0.28045523884024126, + "grad_norm": 0.13042414632394062, "learning_rate": 0.00019942487332957066, - "loss": 3.7516, + "loss": 1.8754, "step": 1055 }, { "epoch": 0.13137510070025407, - "grad_norm": 0.2568392250391569, + "grad_norm": 0.1330424701830741, "learning_rate": 0.0001994014723562533, - "loss": 3.7563, + "loss": 1.8813, "step": 1060 }, { "epoch": 0.1319947945714817, - "grad_norm": 0.27666747511265155, + "grad_norm": 0.13484246451599052, "learning_rate": 0.00019937760618312617, - "loss": 3.7469, + "loss": 1.9234, "step": 1065 }, { "epoch": 0.1326144884427093, - "grad_norm": 0.2669419700479829, + "grad_norm": 0.14282459314129015, "learning_rate": 0.00019935327492188315, - "loss": 3.7328, + "loss": 1.9066, "step": 1070 }, { "epoch": 0.1332341823139369, - "grad_norm": 0.2795594584675045, + "grad_norm": 0.1340969880273331, "learning_rate": 0.0001993284786863948, - "loss": 3.7563, + "loss": 1.8687, "step": 1075 }, { "epoch": 0.13385387618516453, - "grad_norm": 0.26410153485873783, + "grad_norm": 0.13321869788563462, "learning_rate": 0.0001993032175927077, - "loss": 3.643, + "loss": 1.9355, "step": 1080 }, { "epoch": 0.13447357005639216, - "grad_norm": 0.28530160898231344, + "grad_norm": 0.1312621201180442, "learning_rate": 0.00019927749175904403, - "loss": 3.6891, + "loss": 1.8875, "step": 1085 }, { "epoch": 0.13509326392761975, - "grad_norm": 0.2540957405520998, + "grad_norm": 0.1336541528152664, "learning_rate": 0.0001992513013058009, - "loss": 3.743, + "loss": 1.8848, "step": 1090 }, { "epoch": 0.13571295779884737, - "grad_norm": 0.2490743424896684, + "grad_norm": 0.13090078215695478, "learning_rate": 0.00019922464635554988, - "loss": 3.7648, + "loss": 1.8434, "step": 1095 }, { "epoch": 0.136332651670075, - "grad_norm": 0.25315072498030244, + "grad_norm": 0.12341244251108834, "learning_rate": 0.0001991975270330364, - "loss": 3.7797, + "loss": 1.8719, "step": 1100 }, { "epoch": 0.1369523455413026, - "grad_norm": 0.26466186854297175, + "grad_norm": 0.13591893507471717, "learning_rate": 0.00019916994346517915, - "loss": 3.6898, + "loss": 1.8703, "step": 1105 }, { "epoch": 0.1375720394125302, - "grad_norm": 0.2615425377093863, + "grad_norm": 0.1288119831415113, "learning_rate": 0.00019914189578106945, - "loss": 3.9242, + "loss": 1.9027, "step": 1110 }, { "epoch": 0.13819173328375783, - "grad_norm": 0.26269293698244284, + "grad_norm": 0.1306962730366003, "learning_rate": 0.00019911338411197075, - "loss": 3.6852, + "loss": 1.9148, "step": 1115 }, { "epoch": 0.13881142715498543, - "grad_norm": 0.250180142337476, + "grad_norm": 0.1293793599928549, "learning_rate": 0.0001990844085913179, - "loss": 3.8563, + "loss": 1.8297, "step": 1120 }, { "epoch": 0.13943112102621305, - "grad_norm": 0.2541871059004602, + "grad_norm": 0.12111724806408185, "learning_rate": 0.00019905496935471658, - "loss": 3.775, + "loss": 1.8953, "step": 1125 }, { "epoch": 0.14005081489744067, - "grad_norm": 0.2679123179600542, + "grad_norm": 0.13310075929391021, "learning_rate": 0.00019902506653994277, - "loss": 3.7586, + "loss": 1.8297, "step": 1130 }, { "epoch": 0.14067050876866827, - "grad_norm": 0.26249500264698533, + "grad_norm": 0.12631873949945813, "learning_rate": 0.00019899470028694185, - "loss": 3.7977, + "loss": 1.8629, "step": 1135 }, { "epoch": 0.1412902026398959, - "grad_norm": 0.2617063111629921, + "grad_norm": 0.1375008660650019, "learning_rate": 0.0001989638707378282, - "loss": 3.7687, + "loss": 1.9668, "step": 1140 }, { "epoch": 0.1419098965111235, - "grad_norm": 0.2458327484673384, + "grad_norm": 0.13120298623344726, "learning_rate": 0.0001989325780368844, - "loss": 3.7109, + "loss": 1.8828, "step": 1145 }, { "epoch": 0.1425295903823511, - "grad_norm": 0.2522314523920583, + "grad_norm": 0.1417726162795728, "learning_rate": 0.00019890082233056054, - "loss": 3.7414, + "loss": 1.9273, "step": 1150 }, { "epoch": 0.14314928425357873, - "grad_norm": 0.24581360314066542, + "grad_norm": 0.13606374663242826, "learning_rate": 0.00019886860376747362, - "loss": 3.8203, + "loss": 1.8746, "step": 1155 }, { "epoch": 0.14376897812480635, - "grad_norm": 0.25279607405695015, + "grad_norm": 0.12914991669138237, "learning_rate": 0.0001988359224984069, - "loss": 3.8406, + "loss": 1.9184, "step": 1160 }, { "epoch": 0.14438867199603395, - "grad_norm": 0.2567164750916268, + "grad_norm": 0.13097438822182214, "learning_rate": 0.0001988027786763089, - "loss": 3.732, + "loss": 1.8188, "step": 1165 }, { "epoch": 0.14500836586726157, - "grad_norm": 0.23997160090056055, + "grad_norm": 0.12995850441011195, "learning_rate": 0.0001987691724562931, - "loss": 3.8172, + "loss": 1.8746, "step": 1170 }, { "epoch": 0.1456280597384892, - "grad_norm": 0.2542739318371719, + "grad_norm": 0.13478414995635102, "learning_rate": 0.00019873510399563688, - "loss": 3.6367, + "loss": 1.9023, "step": 1175 }, { "epoch": 0.1462477536097168, - "grad_norm": 0.2531159941787502, + "grad_norm": 0.1280137392385234, "learning_rate": 0.00019870057345378097, - "loss": 3.8031, + "loss": 1.8648, "step": 1180 }, { "epoch": 0.1468674474809444, - "grad_norm": 0.2788637520889176, + "grad_norm": 0.12710511011056266, "learning_rate": 0.00019866558099232862, - "loss": 3.8148, + "loss": 1.8637, "step": 1185 }, { "epoch": 0.14748714135217203, - "grad_norm": 0.29860991131828724, + "grad_norm": 0.13534609625301908, "learning_rate": 0.00019863012677504485, - "loss": 3.8219, + "loss": 1.8734, "step": 1190 }, { "epoch": 0.14810683522339965, - "grad_norm": 0.2569903027660448, + "grad_norm": 0.13068732229954763, "learning_rate": 0.00019859421096785575, - "loss": 3.8258, + "loss": 1.8891, "step": 1195 }, { "epoch": 0.14872652909462725, - "grad_norm": 0.25592105637718865, + "grad_norm": 0.13802118838387212, "learning_rate": 0.00019855783373884763, - "loss": 3.8234, + "loss": 1.8449, "step": 1200 }, { "epoch": 0.14934622296585487, - "grad_norm": 0.28504647584615034, + "grad_norm": 0.13152850461786877, "learning_rate": 0.00019852099525826628, - "loss": 3.7945, + "loss": 1.9012, "step": 1205 }, { "epoch": 0.1499659168370825, - "grad_norm": 0.2865619721254542, + "grad_norm": 0.13162793815607204, "learning_rate": 0.00019848369569851608, - "loss": 3.8242, + "loss": 1.8387, "step": 1210 }, { "epoch": 0.15058561070831009, - "grad_norm": 0.2551591601723071, + "grad_norm": 0.12825559680706589, "learning_rate": 0.00019844593523415935, - "loss": 3.8141, + "loss": 1.9254, "step": 1215 }, { "epoch": 0.1512053045795377, - "grad_norm": 0.25507234107508003, + "grad_norm": 0.14271049535774913, "learning_rate": 0.00019840771404191538, - "loss": 3.6891, + "loss": 1.8801, "step": 1220 }, { "epoch": 0.15182499845076533, - "grad_norm": 0.24464355974846816, + "grad_norm": 0.12772835947385766, "learning_rate": 0.00019836903230065973, - "loss": 3.8875, + "loss": 1.9152, "step": 1225 }, { "epoch": 0.15244469232199293, - "grad_norm": 0.2628854899768638, + "grad_norm": 0.13346214143251017, "learning_rate": 0.0001983298901914233, - "loss": 3.8266, + "loss": 1.8984, "step": 1230 }, { "epoch": 0.15306438619322055, - "grad_norm": 0.2470165170093542, + "grad_norm": 0.13348957014420162, "learning_rate": 0.00019829028789739156, - "loss": 3.8023, + "loss": 1.884, "step": 1235 }, { "epoch": 0.15368408006444817, - "grad_norm": 0.2641621943386744, + "grad_norm": 0.13030548172513623, "learning_rate": 0.00019825022560390353, - "loss": 3.8789, + "loss": 1.8844, "step": 1240 }, { "epoch": 0.15430377393567576, - "grad_norm": 0.2730690120133877, + "grad_norm": 0.12325474630414128, "learning_rate": 0.00019820970349845117, - "loss": 3.7766, + "loss": 1.916, "step": 1245 }, { "epoch": 0.1549234678069034, - "grad_norm": 0.2590003477418892, + "grad_norm": 0.12371037361630992, "learning_rate": 0.0001981687217706783, - "loss": 3.6641, + "loss": 1.8988, "step": 1250 }, { "epoch": 0.155543161678131, - "grad_norm": 0.2616580067830838, + "grad_norm": 0.12514967560905205, "learning_rate": 0.0001981272806123798, - "loss": 3.7219, + "loss": 1.8445, "step": 1255 }, { "epoch": 0.1561628555493586, - "grad_norm": 0.2714185703992873, + "grad_norm": 0.13355835058145732, "learning_rate": 0.00019808538021750063, - "loss": 3.8219, + "loss": 1.8598, "step": 1260 }, { "epoch": 0.15678254942058623, - "grad_norm": 0.2502930025265547, + "grad_norm": 0.1302860636545541, "learning_rate": 0.00019804302078213506, - "loss": 3.7719, + "loss": 1.9062, "step": 1265 }, { "epoch": 0.15740224329181385, - "grad_norm": 0.26422875872349105, + "grad_norm": 0.12797901973525885, "learning_rate": 0.00019800020250452564, - "loss": 3.7344, + "loss": 1.9098, "step": 1270 }, { "epoch": 0.15802193716304147, - "grad_norm": 0.2692299665019243, + "grad_norm": 0.1308306448611844, "learning_rate": 0.00019795692558506232, - "loss": 3.6547, + "loss": 1.8914, "step": 1275 }, { "epoch": 0.15864163103426906, - "grad_norm": 0.2639854392728509, + "grad_norm": 0.12802955593426202, "learning_rate": 0.0001979131902262814, - "loss": 3.6492, + "loss": 1.8906, "step": 1280 }, { "epoch": 0.1592613249054967, - "grad_norm": 0.27254984637905877, + "grad_norm": 0.12518672264585973, "learning_rate": 0.00019786899663286486, - "loss": 3.6633, + "loss": 1.9082, "step": 1285 }, { "epoch": 0.1598810187767243, - "grad_norm": 0.2513431426248905, + "grad_norm": 0.13434808762447917, "learning_rate": 0.00019782434501163909, - "loss": 3.7039, + "loss": 1.934, "step": 1290 }, { "epoch": 0.1605007126479519, - "grad_norm": 0.25083823338612543, + "grad_norm": 0.13038607351214834, "learning_rate": 0.0001977792355715741, - "loss": 3.875, + "loss": 1.9078, "step": 1295 }, { "epoch": 0.16112040651917953, - "grad_norm": 0.2384372233764081, + "grad_norm": 0.1276058968842769, "learning_rate": 0.00019773366852378246, - "loss": 3.7117, + "loss": 1.8352, "step": 1300 }, { "epoch": 0.16174010039040715, - "grad_norm": 0.2691834755767079, + "grad_norm": 0.141321358957705, "learning_rate": 0.0001976876440815184, - "loss": 3.693, + "loss": 1.8395, "step": 1305 }, { "epoch": 0.16235979426163474, - "grad_norm": 0.2644517005629403, + "grad_norm": 0.12259766843977997, "learning_rate": 0.0001976411624601767, - "loss": 3.8352, + "loss": 1.8359, "step": 1310 }, { "epoch": 0.16297948813286237, - "grad_norm": 0.2629094670093715, + "grad_norm": 0.12782259758660008, "learning_rate": 0.00019759422387729183, - "loss": 3.832, + "loss": 1.8418, "step": 1315 }, { "epoch": 0.16359918200409, - "grad_norm": 0.25827593256160586, + "grad_norm": 0.12829083288045331, "learning_rate": 0.00019754682855253674, - "loss": 3.7898, + "loss": 1.8805, "step": 1320 }, { "epoch": 0.16421887587531758, - "grad_norm": 0.258617601577871, + "grad_norm": 0.12446530921607402, "learning_rate": 0.00019749897670772205, - "loss": 3.7484, + "loss": 1.8352, "step": 1325 }, { "epoch": 0.1648385697465452, - "grad_norm": 0.2541932905242892, + "grad_norm": 0.12468008193096836, "learning_rate": 0.00019745066856679478, - "loss": 3.7734, + "loss": 1.8758, "step": 1330 }, { "epoch": 0.16545826361777283, - "grad_norm": 0.2513321092655162, + "grad_norm": 0.13195885649345943, "learning_rate": 0.0001974019043558375, - "loss": 3.6531, + "loss": 1.8508, "step": 1335 }, { "epoch": 0.16607795748900042, - "grad_norm": 0.28546456311685575, + "grad_norm": 0.12664766926925763, "learning_rate": 0.00019735268430306718, - "loss": 3.6289, + "loss": 1.8418, "step": 1340 }, { "epoch": 0.16669765136022804, - "grad_norm": 0.25516373996645875, + "grad_norm": 0.1332680932050387, "learning_rate": 0.00019730300863883405, - "loss": 3.8406, + "loss": 1.8695, "step": 1345 }, { "epoch": 0.16731734523145567, - "grad_norm": 0.23390699759641462, + "grad_norm": 0.12608647770348977, "learning_rate": 0.0001972528775956208, - "loss": 3.8266, + "loss": 1.8973, "step": 1350 }, { "epoch": 0.16793703910268326, - "grad_norm": 0.2557994609470224, + "grad_norm": 0.1316408940066455, "learning_rate": 0.0001972022914080411, - "loss": 3.8008, + "loss": 1.8328, "step": 1355 }, { "epoch": 0.16855673297391088, - "grad_norm": 0.25385825342982604, + "grad_norm": 0.12013595809919007, "learning_rate": 0.00019715125031283877, - "loss": 3.7539, + "loss": 1.8621, "step": 1360 }, { "epoch": 0.1691764268451385, - "grad_norm": 0.2644407002872055, + "grad_norm": 0.13491567947036534, "learning_rate": 0.00019709975454888662, - "loss": 3.7758, + "loss": 1.841, "step": 1365 }, { "epoch": 0.16979612071636613, - "grad_norm": 0.2460714641028012, + "grad_norm": 0.13054236047944698, "learning_rate": 0.00019704780435718532, - "loss": 3.7328, + "loss": 1.8957, "step": 1370 }, { "epoch": 0.17041581458759372, - "grad_norm": 0.2533400380158052, + "grad_norm": 0.13274138101276614, "learning_rate": 0.00019699539998086223, - "loss": 3.8414, + "loss": 1.8496, "step": 1375 }, { "epoch": 0.17103550845882134, - "grad_norm": 0.27880303783152216, + "grad_norm": 0.12178156573489422, "learning_rate": 0.00019694254166517032, - "loss": 3.7539, + "loss": 1.8836, "step": 1380 }, { "epoch": 0.17165520233004897, - "grad_norm": 0.25638461089130843, + "grad_norm": 0.13438311435319733, "learning_rate": 0.00019688922965748696, - "loss": 3.75, + "loss": 1.898, "step": 1385 }, { "epoch": 0.17227489620127656, - "grad_norm": 0.2702151632755114, + "grad_norm": 0.13378924908569717, "learning_rate": 0.0001968354642073129, - "loss": 3.6875, + "loss": 1.8383, "step": 1390 }, { "epoch": 0.17289459007250418, - "grad_norm": 0.24814604099538068, + "grad_norm": 0.1307401060574418, "learning_rate": 0.00019678124556627094, - "loss": 3.6586, + "loss": 1.8445, "step": 1395 }, { "epoch": 0.1735142839437318, - "grad_norm": 0.2535122565803045, + "grad_norm": 0.13793109916885396, "learning_rate": 0.00019672657398810478, - "loss": 3.6727, + "loss": 1.8559, "step": 1400 }, { "epoch": 0.1741339778149594, - "grad_norm": 0.25247054392702717, + "grad_norm": 0.13413192708976615, "learning_rate": 0.00019667144972867795, - "loss": 3.7258, + "loss": 1.8781, "step": 1405 }, { "epoch": 0.17475367168618702, - "grad_norm": 0.266294228324591, + "grad_norm": 0.13537690686486675, "learning_rate": 0.00019661587304597243, - "loss": 3.7164, + "loss": 1.8824, "step": 1410 }, { "epoch": 0.17537336555741465, - "grad_norm": 0.25031838105186166, + "grad_norm": 0.13031507569408332, "learning_rate": 0.0001965598442000877, - "loss": 3.7711, + "loss": 1.9043, "step": 1415 }, { "epoch": 0.17599305942864224, - "grad_norm": 0.2504169373702073, + "grad_norm": 0.13469238598709168, "learning_rate": 0.0001965033634532392, - "loss": 3.8008, + "loss": 1.8605, "step": 1420 }, { "epoch": 0.17661275329986986, - "grad_norm": 0.26471381309079295, + "grad_norm": 0.12447049113871386, "learning_rate": 0.00019644643106975739, - "loss": 3.657, + "loss": 1.8254, "step": 1425 }, { "epoch": 0.17723244717109748, - "grad_norm": 0.26233759886744334, + "grad_norm": 0.13493098167599366, "learning_rate": 0.00019638904731608637, - "loss": 3.7328, + "loss": 1.9117, "step": 1430 }, { "epoch": 0.17785214104232508, - "grad_norm": 0.25917034112988885, + "grad_norm": 0.12556872950040296, "learning_rate": 0.00019633121246078256, - "loss": 3.8891, + "loss": 1.8539, "step": 1435 }, { "epoch": 0.1784718349135527, - "grad_norm": 0.25024559107597927, + "grad_norm": 0.12732770515159572, "learning_rate": 0.00019627292677451368, - "loss": 3.8781, + "loss": 1.8891, "step": 1440 }, { "epoch": 0.17909152878478032, - "grad_norm": 0.2657054391608054, + "grad_norm": 0.13636272478996864, "learning_rate": 0.00019621419053005726, - "loss": 3.7422, + "loss": 1.882, "step": 1445 }, { "epoch": 0.17971122265600792, - "grad_norm": 0.26565528290139706, + "grad_norm": 0.126334909805653, "learning_rate": 0.00019615500400229946, - "loss": 3.7602, + "loss": 1.8887, "step": 1450 }, { "epoch": 0.18033091652723554, - "grad_norm": 0.25397414953791075, + "grad_norm": 0.13476880789699827, "learning_rate": 0.0001960953674682338, - "loss": 3.8078, + "loss": 1.8449, "step": 1455 }, { "epoch": 0.18095061039846316, - "grad_norm": 0.2586188121736365, + "grad_norm": 0.12610772664657266, "learning_rate": 0.00019603528120695982, - "loss": 3.9047, + "loss": 1.9023, "step": 1460 }, { "epoch": 0.18157030426969079, - "grad_norm": 0.28307612786781294, + "grad_norm": 0.12956314166827756, "learning_rate": 0.00019597474549968173, - "loss": 3.7742, + "loss": 1.8555, "step": 1465 }, { "epoch": 0.18218999814091838, - "grad_norm": 0.25496358354507603, + "grad_norm": 0.11672761587595178, "learning_rate": 0.00019591376062970728, - "loss": 3.7016, + "loss": 1.952, "step": 1470 }, { "epoch": 0.182809692012146, - "grad_norm": 0.2531796270841028, + "grad_norm": 0.1292512982323371, "learning_rate": 0.00019585232688244613, - "loss": 3.6437, + "loss": 1.8727, "step": 1475 }, { "epoch": 0.18342938588337362, - "grad_norm": 0.25281430577583736, + "grad_norm": 0.1309312239173881, "learning_rate": 0.00019579044454540883, - "loss": 3.7797, + "loss": 1.8344, "step": 1480 }, { "epoch": 0.18404907975460122, - "grad_norm": 0.24827004047230228, + "grad_norm": 0.13366728300584968, "learning_rate": 0.0001957281139082053, - "loss": 3.8031, + "loss": 1.9125, "step": 1485 }, { "epoch": 0.18466877362582884, - "grad_norm": 0.2533494089035691, + "grad_norm": 0.12537903174842133, "learning_rate": 0.00019566533526254348, - "loss": 3.6648, + "loss": 1.8684, "step": 1490 }, { "epoch": 0.18528846749705646, - "grad_norm": 0.25713999090986667, + "grad_norm": 0.12392425386470547, "learning_rate": 0.00019560210890222802, - "loss": 3.6875, + "loss": 1.8906, "step": 1495 }, { "epoch": 0.18590816136828406, - "grad_norm": 0.26243921456103164, + "grad_norm": 0.12391896609740323, "learning_rate": 0.00019553843512315887, - "loss": 3.7484, + "loss": 1.9105, "step": 1500 }, { "epoch": 0.18652785523951168, - "grad_norm": 0.26616483181088857, + "grad_norm": 0.12818849494366744, "learning_rate": 0.00019547431422332992, - "loss": 3.6391, + "loss": 1.85, "step": 1505 }, { "epoch": 0.1871475491107393, - "grad_norm": 0.23959324829479736, + "grad_norm": 0.12671159771811846, "learning_rate": 0.00019540974650282756, - "loss": 3.7461, + "loss": 1.916, "step": 1510 }, { "epoch": 0.1877672429819669, - "grad_norm": 0.2470546763118829, + "grad_norm": 0.13333844500063538, "learning_rate": 0.0001953447322638293, - "loss": 3.725, + "loss": 1.8625, "step": 1515 }, { "epoch": 0.18838693685319452, - "grad_norm": 0.256423918618775, + "grad_norm": 0.13840145694731687, "learning_rate": 0.0001952792718106024, - "loss": 3.7383, + "loss": 1.8809, "step": 1520 }, { "epoch": 0.18900663072442214, - "grad_norm": 0.2654600928240678, + "grad_norm": 0.12778029272918287, "learning_rate": 0.00019521336544950238, - "loss": 3.8961, + "loss": 1.8668, "step": 1525 }, { "epoch": 0.18962632459564974, - "grad_norm": 0.26718843333620523, + "grad_norm": 0.12913540845829236, "learning_rate": 0.00019514701348897164, - "loss": 3.743, + "loss": 1.8062, "step": 1530 }, { "epoch": 0.19024601846687736, - "grad_norm": 0.25878122211171267, + "grad_norm": 0.12826357981537254, "learning_rate": 0.00019508021623953795, - "loss": 3.7523, + "loss": 1.918, "step": 1535 }, { "epoch": 0.19086571233810498, - "grad_norm": 0.2598862844221595, + "grad_norm": 0.1342517702736094, "learning_rate": 0.00019501297401381304, - "loss": 3.7656, + "loss": 1.8668, "step": 1540 }, { "epoch": 0.19148540620933258, - "grad_norm": 0.24434694229901105, + "grad_norm": 0.138832925243521, "learning_rate": 0.00019494528712649117, - "loss": 3.7805, + "loss": 1.9117, "step": 1545 }, { "epoch": 0.1921051000805602, - "grad_norm": 0.25796351085501806, + "grad_norm": 0.12937438208518143, "learning_rate": 0.0001948771558943476, - "loss": 3.7406, + "loss": 1.907, "step": 1550 }, { "epoch": 0.19272479395178782, - "grad_norm": 0.24856327668868097, + "grad_norm": 0.1222226036118704, "learning_rate": 0.00019480858063623715, - "loss": 3.8391, + "loss": 1.8738, "step": 1555 }, { "epoch": 0.19334448782301544, - "grad_norm": 0.2692860471247067, + "grad_norm": 0.13553806750268793, "learning_rate": 0.0001947395616730926, - "loss": 3.7516, + "loss": 1.9059, "step": 1560 }, { "epoch": 0.19396418169424304, - "grad_norm": 0.24956285830888045, + "grad_norm": 0.13213241501616083, "learning_rate": 0.00019467009932792336, - "loss": 3.725, + "loss": 1.8684, "step": 1565 }, { "epoch": 0.19458387556547066, - "grad_norm": 0.24405925694161748, + "grad_norm": 0.1277167096208847, "learning_rate": 0.00019460019392581387, - "loss": 3.8078, + "loss": 1.9062, "step": 1570 }, { "epoch": 0.19520356943669828, - "grad_norm": 0.24476026905778137, + "grad_norm": 0.1269571278814333, "learning_rate": 0.00019452984579392205, - "loss": 3.7578, + "loss": 1.8527, "step": 1575 }, { "epoch": 0.19582326330792588, - "grad_norm": 0.25941791216039106, + "grad_norm": 0.12728857278045366, "learning_rate": 0.0001944590552614778, - "loss": 3.8273, + "loss": 1.8555, "step": 1580 }, { "epoch": 0.1964429571791535, - "grad_norm": 0.2605888382676801, + "grad_norm": 0.1290005338315187, "learning_rate": 0.0001943878226597815, - "loss": 3.6281, + "loss": 1.8637, "step": 1585 }, { "epoch": 0.19706265105038112, - "grad_norm": 0.25389288675364113, + "grad_norm": 0.1338620592085664, "learning_rate": 0.0001943161483222023, - "loss": 3.6195, + "loss": 1.85, "step": 1590 }, { "epoch": 0.19768234492160872, - "grad_norm": 0.25255096270670657, + "grad_norm": 0.12707291639830157, "learning_rate": 0.00019424403258417683, - "loss": 3.8656, + "loss": 1.8746, "step": 1595 }, { "epoch": 0.19830203879283634, - "grad_norm": 0.2546545729930429, + "grad_norm": 0.13144019328164225, "learning_rate": 0.00019417147578320744, - "loss": 3.7852, + "loss": 1.8664, "step": 1600 }, { "epoch": 0.19892173266406396, - "grad_norm": 0.25448534902199255, + "grad_norm": 0.12826047192933435, "learning_rate": 0.00019409847825886054, - "loss": 3.6578, + "loss": 1.9234, "step": 1605 }, { "epoch": 0.19954142653529156, - "grad_norm": 0.2531732716191023, + "grad_norm": 0.1397378039359775, "learning_rate": 0.00019402504035276525, - "loss": 3.7539, + "loss": 1.8086, "step": 1610 }, { "epoch": 0.20016112040651918, - "grad_norm": 0.2629784531419189, + "grad_norm": 0.12570861948831283, "learning_rate": 0.00019395116240861172, - "loss": 3.7195, + "loss": 1.8324, "step": 1615 }, { "epoch": 0.2007808142777468, - "grad_norm": 0.25369503528632603, + "grad_norm": 0.12079023395005281, "learning_rate": 0.0001938768447721493, - "loss": 3.7469, + "loss": 1.8758, "step": 1620 }, { "epoch": 0.2014005081489744, - "grad_norm": 0.2645391294956752, + "grad_norm": 0.13715689189513838, "learning_rate": 0.00019380208779118532, - "loss": 3.6891, + "loss": 1.8438, "step": 1625 }, { "epoch": 0.20202020202020202, - "grad_norm": 0.249604741933017, + "grad_norm": 0.12784219253316514, "learning_rate": 0.00019372689181558307, - "loss": 3.6945, + "loss": 1.8254, "step": 1630 }, { "epoch": 0.20263989589142964, - "grad_norm": 0.2612227448118402, + "grad_norm": 0.13127637020426203, "learning_rate": 0.00019365125719726046, - "loss": 3.6328, + "loss": 1.8363, "step": 1635 }, { "epoch": 0.20325958976265726, - "grad_norm": 0.25850843906032656, + "grad_norm": 0.12390585547014671, "learning_rate": 0.00019357518429018815, - "loss": 3.8016, + "loss": 1.8984, "step": 1640 }, { "epoch": 0.20387928363388486, - "grad_norm": 0.2473382579306127, + "grad_norm": 0.1296901168065532, "learning_rate": 0.00019349867345038808, - "loss": 3.8117, + "loss": 1.8891, "step": 1645 }, { "epoch": 0.20449897750511248, - "grad_norm": 0.24637915773390445, + "grad_norm": 0.13009453466712564, "learning_rate": 0.0001934217250359317, - "loss": 3.7609, + "loss": 1.8418, "step": 1650 }, { "epoch": 0.2051186713763401, - "grad_norm": 0.3072935679669988, + "grad_norm": 0.12805361264829143, "learning_rate": 0.0001933443394069383, - "loss": 3.6883, + "loss": 1.8605, "step": 1655 }, { "epoch": 0.2057383652475677, - "grad_norm": 0.250656654674928, + "grad_norm": 0.12965700777684402, "learning_rate": 0.0001932665169255733, - "loss": 3.8211, + "loss": 1.8945, "step": 1660 }, { "epoch": 0.20635805911879532, - "grad_norm": 0.25258094915573104, + "grad_norm": 0.14176151018901473, "learning_rate": 0.00019318825795604667, - "loss": 3.6961, + "loss": 1.868, "step": 1665 }, { "epoch": 0.20697775299002294, - "grad_norm": 0.2593606925045606, + "grad_norm": 0.12783314302778317, "learning_rate": 0.00019310956286461108, - "loss": 3.8031, + "loss": 1.8461, "step": 1670 }, { "epoch": 0.20759744686125053, - "grad_norm": 0.25793981295340673, + "grad_norm": 0.12476969665972379, "learning_rate": 0.00019303043201956033, - "loss": 3.8773, + "loss": 1.8465, "step": 1675 }, { "epoch": 0.20821714073247816, - "grad_norm": 0.2475268229332171, + "grad_norm": 0.13447320778956978, "learning_rate": 0.00019295086579122748, - "loss": 3.6773, + "loss": 1.807, "step": 1680 }, { "epoch": 0.20883683460370578, - "grad_norm": 0.2576357399823311, + "grad_norm": 0.12224616952170654, "learning_rate": 0.0001928708645519832, - "loss": 3.7305, + "loss": 1.8797, "step": 1685 }, { "epoch": 0.20945652847493337, - "grad_norm": 0.27035332613564755, + "grad_norm": 0.14073036790617033, "learning_rate": 0.00019279042867623405, - "loss": 3.7625, + "loss": 1.8734, "step": 1690 }, { "epoch": 0.210076222346161, - "grad_norm": 0.25029387687847293, + "grad_norm": 0.12419689358583988, "learning_rate": 0.00019270955854042065, - "loss": 3.7711, + "loss": 1.8969, "step": 1695 }, { "epoch": 0.21069591621738862, - "grad_norm": 0.26762236137322387, + "grad_norm": 0.1384451198274959, "learning_rate": 0.00019262825452301603, - "loss": 3.8469, + "loss": 1.8703, "step": 1700 }, { "epoch": 0.2113156100886162, - "grad_norm": 0.24285364151377078, + "grad_norm": 0.13754390968700744, "learning_rate": 0.0001925465170045237, - "loss": 3.7234, + "loss": 1.8277, "step": 1705 }, { "epoch": 0.21193530395984383, - "grad_norm": 0.2506313769424627, + "grad_norm": 0.12544899126243475, "learning_rate": 0.00019246434636747603, - "loss": 3.8797, + "loss": 1.8746, "step": 1710 }, { "epoch": 0.21255499783107146, - "grad_norm": 0.2623791318366264, + "grad_norm": 0.1316314602696802, "learning_rate": 0.00019238174299643235, - "loss": 3.6961, + "loss": 1.8535, "step": 1715 }, { "epoch": 0.21317469170229905, - "grad_norm": 0.24286466702261822, + "grad_norm": 0.12691928860824186, "learning_rate": 0.00019229870727797716, - "loss": 3.7289, + "loss": 1.9141, "step": 1720 }, { "epoch": 0.21379438557352667, - "grad_norm": 0.269295012443464, + "grad_norm": 0.13203627480238206, "learning_rate": 0.00019221523960071847, - "loss": 3.7172, + "loss": 1.8492, "step": 1725 }, { "epoch": 0.2144140794447543, - "grad_norm": 0.24914306646362705, + "grad_norm": 0.12505426294779426, "learning_rate": 0.00019213134035528574, - "loss": 3.8211, + "loss": 1.8984, "step": 1730 }, { "epoch": 0.21503377331598192, - "grad_norm": 0.261309598468901, + "grad_norm": 0.12969984327318165, "learning_rate": 0.0001920470099343282, - "loss": 3.7805, + "loss": 1.8727, "step": 1735 }, { "epoch": 0.2156534671872095, - "grad_norm": 0.2633828071876572, + "grad_norm": 0.12717555043591958, "learning_rate": 0.000191962248732513, - "loss": 3.857, + "loss": 1.9051, "step": 1740 }, { "epoch": 0.21627316105843714, - "grad_norm": 0.25552537780686957, + "grad_norm": 0.12782790245258435, "learning_rate": 0.00019187705714652337, - "loss": 3.7883, + "loss": 1.8527, "step": 1745 }, { "epoch": 0.21689285492966476, - "grad_norm": 0.25873934840784607, + "grad_norm": 0.12718190088753764, "learning_rate": 0.00019179143557505676, - "loss": 3.8133, + "loss": 1.8527, "step": 1750 }, { "epoch": 0.21751254880089235, - "grad_norm": 0.25040643355771275, + "grad_norm": 0.12935442052963278, "learning_rate": 0.0001917053844188228, - "loss": 3.7898, + "loss": 1.8668, "step": 1755 }, { "epoch": 0.21813224267211997, - "grad_norm": 0.2385870198485536, + "grad_norm": 0.13483832081677283, "learning_rate": 0.0001916189040805418, - "loss": 3.6844, + "loss": 1.8332, "step": 1760 }, { "epoch": 0.2187519365433476, - "grad_norm": 0.24428707193063878, + "grad_norm": 0.1231867163781486, "learning_rate": 0.0001915319949649425, - "loss": 3.6703, + "loss": 1.8461, "step": 1765 }, { "epoch": 0.2193716304145752, - "grad_norm": 0.25981823425530814, + "grad_norm": 0.1296012114936124, "learning_rate": 0.00019144465747876038, - "loss": 3.7188, + "loss": 1.8633, "step": 1770 }, { "epoch": 0.21999132428580281, - "grad_norm": 0.24977675057584312, + "grad_norm": 0.1358765864573398, "learning_rate": 0.00019135689203073563, - "loss": 3.6984, + "loss": 1.8586, "step": 1775 }, { "epoch": 0.22061101815703044, - "grad_norm": 0.2595528653750561, + "grad_norm": 0.12635502496539475, "learning_rate": 0.00019126869903161146, - "loss": 3.7547, + "loss": 1.9148, "step": 1780 }, { "epoch": 0.22123071202825803, - "grad_norm": 0.26071155800209916, + "grad_norm": 0.1382566677032265, "learning_rate": 0.00019118007889413186, - "loss": 3.7578, + "loss": 1.8082, "step": 1785 }, { "epoch": 0.22185040589948565, - "grad_norm": 0.257088768747436, + "grad_norm": 0.13033747033534399, "learning_rate": 0.00019109103203303988, - "loss": 3.8156, + "loss": 1.8367, "step": 1790 }, { "epoch": 0.22247009977071328, - "grad_norm": 0.25375238741582434, + "grad_norm": 0.12701483391149915, "learning_rate": 0.00019100155886507566, - "loss": 3.6352, + "loss": 1.9246, "step": 1795 }, { "epoch": 0.22308979364194087, - "grad_norm": 0.26492351236454703, + "grad_norm": 0.1364067044564757, "learning_rate": 0.0001909116598089745, - "loss": 3.6828, + "loss": 1.8582, "step": 1800 }, { "epoch": 0.2237094875131685, - "grad_norm": 0.24876024736288194, + "grad_norm": 0.12282125656108547, "learning_rate": 0.00019082133528546476, - "loss": 3.5773, + "loss": 1.8738, "step": 1805 }, { "epoch": 0.22432918138439611, - "grad_norm": 0.2576625159577276, + "grad_norm": 0.1272071393926594, "learning_rate": 0.0001907305857172661, - "loss": 3.7227, + "loss": 1.884, "step": 1810 }, { "epoch": 0.2249488752556237, - "grad_norm": 0.287526982161955, + "grad_norm": 0.12322060913981069, "learning_rate": 0.00019063941152908727, - "loss": 3.7297, + "loss": 1.8641, "step": 1815 }, { "epoch": 0.22556856912685133, - "grad_norm": 0.26483091023621297, + "grad_norm": 0.1269763797987862, "learning_rate": 0.00019054781314762433, - "loss": 3.7305, + "loss": 1.8559, "step": 1820 }, { "epoch": 0.22618826299807895, - "grad_norm": 0.2595249577164607, + "grad_norm": 0.13038449210432226, "learning_rate": 0.0001904557910015586, - "loss": 3.7523, + "loss": 1.8762, "step": 1825 }, { "epoch": 0.22680795686930658, - "grad_norm": 0.2505670142517455, + "grad_norm": 0.12529384355483927, "learning_rate": 0.00019036334552155452, - "loss": 3.6516, + "loss": 1.8332, "step": 1830 }, { "epoch": 0.22742765074053417, - "grad_norm": 0.27127556440251427, + "grad_norm": 0.1262927530224639, "learning_rate": 0.00019027047714025784, - "loss": 3.7352, + "loss": 1.8523, "step": 1835 }, { "epoch": 0.2280473446117618, - "grad_norm": 0.25134431087469267, + "grad_norm": 0.13984844751042014, "learning_rate": 0.0001901771862922934, - "loss": 3.7844, + "loss": 1.8566, "step": 1840 }, { "epoch": 0.22866703848298942, - "grad_norm": 0.2460567514125104, + "grad_norm": 0.14759646104938404, "learning_rate": 0.00019008347341426324, - "loss": 3.6508, + "loss": 1.8492, "step": 1845 }, { "epoch": 0.229286732354217, - "grad_norm": 0.2629987393065304, + "grad_norm": 0.12762255487022825, "learning_rate": 0.0001899893389447445, - "loss": 3.7219, + "loss": 1.8805, "step": 1850 }, { "epoch": 0.22990642622544463, - "grad_norm": 0.27575106848753533, + "grad_norm": 0.13415887670392074, "learning_rate": 0.0001898947833242874, - "loss": 3.7836, + "loss": 1.8461, "step": 1855 }, { "epoch": 0.23052612009667225, - "grad_norm": 0.25662445782308285, + "grad_norm": 0.13815909675804716, "learning_rate": 0.00018979980699541308, - "loss": 3.6367, + "loss": 1.8609, "step": 1860 }, { "epoch": 0.23114581396789985, - "grad_norm": 0.2507993462493921, + "grad_norm": 0.1261638415507045, "learning_rate": 0.00018970441040261165, - "loss": 3.6445, + "loss": 1.8043, "step": 1865 }, { "epoch": 0.23176550783912747, - "grad_norm": 0.2676156725812598, + "grad_norm": 0.132226368455845, "learning_rate": 0.00018960859399234006, - "loss": 3.7938, + "loss": 1.8152, "step": 1870 }, { "epoch": 0.2323852017103551, - "grad_norm": 0.2577237299425109, + "grad_norm": 0.12702999428539313, "learning_rate": 0.00018951235821301995, - "loss": 3.7422, + "loss": 1.8914, "step": 1875 }, { "epoch": 0.2330048955815827, - "grad_norm": 0.26252136602344933, + "grad_norm": 0.13100989208545588, "learning_rate": 0.0001894157035150357, - "loss": 3.6305, + "loss": 1.8391, "step": 1880 }, { "epoch": 0.2336245894528103, - "grad_norm": 0.26133595577475316, + "grad_norm": 0.12474165165129887, "learning_rate": 0.00018931863035073217, - "loss": 3.6617, + "loss": 1.9211, "step": 1885 }, { "epoch": 0.23424428332403793, - "grad_norm": 0.262362699420388, + "grad_norm": 0.12400290821379777, "learning_rate": 0.00018922113917441269, - "loss": 3.7039, + "loss": 1.8531, "step": 1890 }, { "epoch": 0.23486397719526553, - "grad_norm": 0.2657160759884634, + "grad_norm": 0.1293696900950748, "learning_rate": 0.00018912323044233684, - "loss": 3.7758, + "loss": 1.8555, "step": 1895 }, { "epoch": 0.23548367106649315, - "grad_norm": 0.2524857167548615, + "grad_norm": 0.13775339083965105, "learning_rate": 0.00018902490461271843, - "loss": 3.8492, + "loss": 1.877, "step": 1900 }, { "epoch": 0.23610336493772077, - "grad_norm": 0.26495908971079557, + "grad_norm": 0.12991294611318904, "learning_rate": 0.00018892616214572319, - "loss": 3.6422, + "loss": 1.8926, "step": 1905 }, { "epoch": 0.23672305880894837, - "grad_norm": 0.2684330797115416, + "grad_norm": 0.1306998350118421, "learning_rate": 0.0001888270035034668, - "loss": 3.7172, + "loss": 1.8473, "step": 1910 }, { "epoch": 0.237342752680176, - "grad_norm": 0.2683759303138262, + "grad_norm": 0.12748036755001935, "learning_rate": 0.00018872742915001267, - "loss": 3.6922, + "loss": 1.8184, "step": 1915 }, { "epoch": 0.2379624465514036, - "grad_norm": 0.26240769269212194, + "grad_norm": 0.13129037118419162, "learning_rate": 0.00018862743955136966, - "loss": 3.7898, + "loss": 1.8344, "step": 1920 }, { "epoch": 0.23858214042263123, - "grad_norm": 0.26589301276971006, + "grad_norm": 0.13675780584711228, "learning_rate": 0.00018852703517549, - "loss": 3.6891, + "loss": 1.8785, "step": 1925 }, { "epoch": 0.23920183429385883, - "grad_norm": 0.2542964819622325, + "grad_norm": 0.13672370672678413, "learning_rate": 0.00018842621649226712, - "loss": 3.6898, + "loss": 1.8977, "step": 1930 }, { "epoch": 0.23982152816508645, - "grad_norm": 0.2526849705264363, + "grad_norm": 0.136994334104686, "learning_rate": 0.00018832498397353337, - "loss": 3.7172, + "loss": 1.9043, "step": 1935 }, { "epoch": 0.24044122203631407, - "grad_norm": 0.25106965146419435, + "grad_norm": 0.12404741020853537, "learning_rate": 0.0001882233380930579, - "loss": 3.8023, + "loss": 1.9023, "step": 1940 }, { "epoch": 0.24106091590754167, - "grad_norm": 0.2651428117203693, + "grad_norm": 0.14219684197325005, "learning_rate": 0.00018812127932654437, - "loss": 3.7484, + "loss": 1.8391, "step": 1945 }, { "epoch": 0.2416806097787693, - "grad_norm": 0.2760638561741223, + "grad_norm": 0.14076460578083125, "learning_rate": 0.00018801880815162873, - "loss": 3.6172, + "loss": 1.8676, "step": 1950 }, { "epoch": 0.2423003036499969, - "grad_norm": 0.2456224462085717, + "grad_norm": 0.12874788431477818, "learning_rate": 0.00018791592504787704, - "loss": 3.7297, + "loss": 1.8375, "step": 1955 }, { "epoch": 0.2429199975212245, - "grad_norm": 0.2494028881960688, + "grad_norm": 0.12553931062925705, "learning_rate": 0.00018781263049678318, - "loss": 3.6156, + "loss": 1.8508, "step": 1960 }, { "epoch": 0.24353969139245213, - "grad_norm": 0.26095765336800963, + "grad_norm": 0.12091941894161848, "learning_rate": 0.00018770892498176658, - "loss": 3.807, + "loss": 1.9434, "step": 1965 }, { "epoch": 0.24415938526367975, - "grad_norm": 0.2704204880074346, + "grad_norm": 0.1339190592545441, "learning_rate": 0.00018760480898817003, - "loss": 3.6828, + "loss": 1.8926, "step": 1970 }, { "epoch": 0.24477907913490735, - "grad_norm": 0.2801106765194449, + "grad_norm": 0.12907299618607065, "learning_rate": 0.00018750028300325733, - "loss": 3.6758, + "loss": 1.7914, "step": 1975 }, { "epoch": 0.24539877300613497, - "grad_norm": 0.2492976531159216, + "grad_norm": 0.1400210529224134, "learning_rate": 0.0001873953475162111, - "loss": 3.7031, + "loss": 1.8559, "step": 1980 }, { "epoch": 0.2460184668773626, - "grad_norm": 0.24369383072754122, + "grad_norm": 0.1313714701900259, "learning_rate": 0.00018729000301813032, - "loss": 3.7703, + "loss": 1.902, "step": 1985 }, { "epoch": 0.24663816074859019, - "grad_norm": 0.2634660714556242, + "grad_norm": 0.1374832083968447, "learning_rate": 0.00018718425000202826, - "loss": 3.6648, + "loss": 1.8305, "step": 1990 }, { "epoch": 0.2472578546198178, - "grad_norm": 0.2506439015505356, + "grad_norm": 0.1305538539148369, "learning_rate": 0.00018707808896283, - "loss": 3.6344, + "loss": 1.8934, "step": 1995 }, { "epoch": 0.24787754849104543, - "grad_norm": 0.26475470061946976, + "grad_norm": 0.12798042833527448, "learning_rate": 0.00018697152039737018, - "loss": 3.8516, + "loss": 1.798, "step": 2000 }, { "epoch": 0.24849724236227302, - "grad_norm": 0.2519925926674702, + "grad_norm": 0.1224972820994739, "learning_rate": 0.0001868645448043907, - "loss": 3.8375, + "loss": 1.8176, "step": 2005 }, { "epoch": 0.24911693623350065, - "grad_norm": 0.2525671283371183, + "grad_norm": 0.13156854661928893, "learning_rate": 0.00018675716268453827, - "loss": 3.7492, + "loss": 1.8309, "step": 2010 }, { "epoch": 0.24973663010472827, - "grad_norm": 0.2717739214225155, + "grad_norm": 0.1326350510944194, "learning_rate": 0.00018664937454036226, - "loss": 3.6961, + "loss": 1.8426, "step": 2015 }, { "epoch": 0.25035632397595586, - "grad_norm": 0.2652751934837347, + "grad_norm": 0.12724824879009583, "learning_rate": 0.0001865411808763122, - "loss": 3.7086, + "loss": 1.8633, "step": 2020 }, { "epoch": 0.2509760178471835, - "grad_norm": 0.2648807010430378, + "grad_norm": 0.13143276628129275, "learning_rate": 0.00018643258219873534, - "loss": 3.7617, + "loss": 1.8672, "step": 2025 }, { "epoch": 0.2515957117184111, - "grad_norm": 0.24785920340040016, + "grad_norm": 0.12055786233157745, "learning_rate": 0.00018632357901587456, - "loss": 3.7375, + "loss": 1.8992, "step": 2030 }, { "epoch": 0.25221540558963873, - "grad_norm": 0.25091594030972114, + "grad_norm": 0.1283496889470771, "learning_rate": 0.00018621417183786577, - "loss": 3.7328, + "loss": 1.8969, "step": 2035 }, { "epoch": 0.25283509946086635, - "grad_norm": 0.24568898688178742, + "grad_norm": 0.12479918550598595, "learning_rate": 0.00018610436117673555, - "loss": 3.7461, + "loss": 1.8383, "step": 2040 }, { "epoch": 0.2534547933320939, - "grad_norm": 0.2631910743510119, + "grad_norm": 0.12527718321183984, "learning_rate": 0.00018599414754639883, - "loss": 3.6859, + "loss": 1.8961, "step": 2045 }, { "epoch": 0.25407448720332154, - "grad_norm": 0.2579231850021295, + "grad_norm": 0.13056814750458098, "learning_rate": 0.00018588353146265643, - "loss": 3.7602, + "loss": 1.8562, "step": 2050 }, { "epoch": 0.25469418107454916, - "grad_norm": 0.2504974253689712, + "grad_norm": 0.12452625637363586, "learning_rate": 0.0001857725134431926, - "loss": 3.7172, + "loss": 1.8633, "step": 2055 }, { "epoch": 0.2553138749457768, - "grad_norm": 0.2560287024318882, + "grad_norm": 0.12684769404009308, "learning_rate": 0.00018566109400757272, - "loss": 3.7547, + "loss": 1.8727, "step": 2060 }, { "epoch": 0.2559335688170044, - "grad_norm": 0.2607957181258484, + "grad_norm": 0.1313216307636194, "learning_rate": 0.0001855492736772408, - "loss": 3.8188, + "loss": 1.8906, "step": 2065 }, { "epoch": 0.25655326268823203, - "grad_norm": 0.25419874060956393, + "grad_norm": 0.13639962397305716, "learning_rate": 0.00018543705297551698, - "loss": 3.7648, + "loss": 1.9059, "step": 2070 }, { "epoch": 0.25717295655945965, - "grad_norm": 0.2647249642923883, + "grad_norm": 0.13156534728467967, "learning_rate": 0.0001853244324275952, - "loss": 3.6305, + "loss": 1.8484, "step": 2075 }, { "epoch": 0.2577926504306872, - "grad_norm": 0.2560264250612117, + "grad_norm": 0.13081763644721794, "learning_rate": 0.00018521141256054067, - "loss": 3.7625, + "loss": 1.8695, "step": 2080 }, { "epoch": 0.25841234430191484, - "grad_norm": 0.2719021879374889, + "grad_norm": 0.12594234500358284, "learning_rate": 0.00018509799390328742, - "loss": 3.7797, + "loss": 1.8414, "step": 2085 }, { "epoch": 0.25903203817314246, - "grad_norm": 0.26428917216504066, + "grad_norm": 0.13042394051093623, "learning_rate": 0.00018498417698663584, - "loss": 3.7148, + "loss": 1.8094, "step": 2090 }, { "epoch": 0.2596517320443701, - "grad_norm": 0.2513232763080223, + "grad_norm": 0.1281810590087273, "learning_rate": 0.00018486996234325009, - "loss": 3.8078, + "loss": 1.8336, "step": 2095 }, { "epoch": 0.2602714259155977, - "grad_norm": 0.2614505704904821, + "grad_norm": 0.12569535944479185, "learning_rate": 0.00018475535050765577, - "loss": 3.6078, + "loss": 1.8434, "step": 2100 }, { "epoch": 0.26089111978682533, - "grad_norm": 0.27228462221200644, + "grad_norm": 0.12417048765178107, "learning_rate": 0.00018464034201623737, - "loss": 3.7547, + "loss": 1.8211, "step": 2105 }, { "epoch": 0.2615108136580529, - "grad_norm": 0.2505037237378973, + "grad_norm": 0.12883572085670414, "learning_rate": 0.00018452493740723567, - "loss": 3.7414, + "loss": 1.8805, "step": 2110 }, { "epoch": 0.2621305075292805, - "grad_norm": 0.270150970994609, + "grad_norm": 0.12862790707551727, "learning_rate": 0.0001844091372207453, - "loss": 3.7188, + "loss": 1.7957, "step": 2115 }, { "epoch": 0.26275020140050814, - "grad_norm": 0.2594189582815906, + "grad_norm": 0.12715104158430907, "learning_rate": 0.00018429294199871218, - "loss": 3.7172, + "loss": 1.8562, "step": 2120 }, { "epoch": 0.26336989527173577, - "grad_norm": 0.25810176583742167, + "grad_norm": 0.12629988077958154, "learning_rate": 0.0001841763522849311, - "loss": 3.7359, + "loss": 1.825, "step": 2125 }, { "epoch": 0.2639895891429634, - "grad_norm": 0.2650696314876113, + "grad_norm": 0.13297933463768885, "learning_rate": 0.00018405936862504293, - "loss": 3.693, + "loss": 1.8133, "step": 2130 }, { "epoch": 0.264609283014191, - "grad_norm": 0.25786214468577856, + "grad_norm": 0.12653209526655865, "learning_rate": 0.00018394199156653233, - "loss": 3.7633, + "loss": 1.8574, "step": 2135 }, { "epoch": 0.2652289768854186, - "grad_norm": 0.26529708459406043, + "grad_norm": 0.12202510651987598, "learning_rate": 0.00018382422165872498, - "loss": 3.8055, + "loss": 1.8496, "step": 2140 }, { "epoch": 0.2658486707566462, - "grad_norm": 0.25696309007901624, + "grad_norm": 0.1353666668217046, "learning_rate": 0.00018370605945278512, - "loss": 3.6859, + "loss": 1.8637, "step": 2145 }, { "epoch": 0.2664683646278738, - "grad_norm": 0.2503534337459447, + "grad_norm": 0.13805247188442749, "learning_rate": 0.00018358750550171303, - "loss": 3.7547, + "loss": 1.9406, "step": 2150 }, { "epoch": 0.26708805849910144, - "grad_norm": 0.24875491233752253, + "grad_norm": 0.1250178668969665, "learning_rate": 0.00018346856036034225, - "loss": 3.7773, + "loss": 1.8527, "step": 2155 }, { "epoch": 0.26770775237032907, - "grad_norm": 0.2533128310142299, + "grad_norm": 0.12679232416551073, "learning_rate": 0.0001833492245853371, - "loss": 3.7398, + "loss": 1.8559, "step": 2160 }, { "epoch": 0.2683274462415567, - "grad_norm": 0.2539163968924575, + "grad_norm": 0.1363953224589558, "learning_rate": 0.00018322949873519028, - "loss": 3.7344, + "loss": 1.8895, "step": 2165 }, { "epoch": 0.2689471401127843, - "grad_norm": 0.2596999484239305, + "grad_norm": 0.1316621505083288, "learning_rate": 0.00018310938337021967, - "loss": 3.5758, + "loss": 1.8863, "step": 2170 }, { "epoch": 0.2695668339840119, - "grad_norm": 0.2689073376598725, + "grad_norm": 0.13658584100967, "learning_rate": 0.00018298887905256642, - "loss": 3.6281, + "loss": 1.8426, "step": 2175 }, { "epoch": 0.2701865278552395, - "grad_norm": 0.2674798310731981, + "grad_norm": 0.1399720872648625, "learning_rate": 0.00018286798634619178, - "loss": 3.7422, + "loss": 1.8887, "step": 2180 }, { "epoch": 0.2708062217264671, - "grad_norm": 0.24878965043899778, + "grad_norm": 0.13009508757933128, "learning_rate": 0.0001827467058168748, - "loss": 3.6938, + "loss": 1.8621, "step": 2185 }, { "epoch": 0.27142591559769474, - "grad_norm": 0.2564005390096162, + "grad_norm": 0.12921741777040827, "learning_rate": 0.00018262503803220941, - "loss": 3.6469, + "loss": 1.9246, "step": 2190 }, { "epoch": 0.27204560946892237, - "grad_norm": 0.26875929187484876, + "grad_norm": 0.12712470891174327, "learning_rate": 0.00018250298356160203, - "loss": 3.7062, + "loss": 1.8152, "step": 2195 }, { "epoch": 0.27266530334015, - "grad_norm": 0.25702595886613516, + "grad_norm": 0.13590278878639858, "learning_rate": 0.00018238054297626868, - "loss": 3.7711, + "loss": 1.8336, "step": 2200 }, { "epoch": 0.27328499721137756, - "grad_norm": 0.26204309861599234, + "grad_norm": 0.1238425521745261, "learning_rate": 0.0001822577168492324, - "loss": 3.6945, + "loss": 1.8859, "step": 2205 }, { "epoch": 0.2739046910826052, - "grad_norm": 0.25048034699771604, + "grad_norm": 0.11850858633920164, "learning_rate": 0.00018213450575532068, - "loss": 3.7687, + "loss": 1.8977, "step": 2210 }, { "epoch": 0.2745243849538328, - "grad_norm": 0.25907241148512433, + "grad_norm": 0.12913707015807815, "learning_rate": 0.0001820109102711625, - "loss": 3.7227, + "loss": 1.8637, "step": 2215 }, { "epoch": 0.2751440788250604, - "grad_norm": 0.2542048392477533, + "grad_norm": 0.1275936025502557, "learning_rate": 0.00018188693097518589, - "loss": 3.7156, + "loss": 1.8895, "step": 2220 }, { "epoch": 0.27576377269628805, - "grad_norm": 0.25291052345796766, + "grad_norm": 0.1289979754489145, "learning_rate": 0.00018176256844761515, - "loss": 3.6727, + "loss": 1.857, "step": 2225 }, { "epoch": 0.27638346656751567, - "grad_norm": 0.26113718400524455, + "grad_norm": 0.12857925042981372, "learning_rate": 0.000181637823270468, - "loss": 3.693, + "loss": 1.909, "step": 2230 }, { "epoch": 0.27700316043874323, - "grad_norm": 0.2630654914651346, + "grad_norm": 0.13116917461488, "learning_rate": 0.00018151269602755305, - "loss": 3.832, + "loss": 1.8363, "step": 2235 }, { "epoch": 0.27762285430997086, - "grad_norm": 0.27022263122244966, + "grad_norm": 0.12114760763849447, "learning_rate": 0.00018138718730446694, - "loss": 3.7406, + "loss": 1.8156, "step": 2240 }, { "epoch": 0.2782425481811985, - "grad_norm": 0.27804099995875925, + "grad_norm": 0.12970449110445717, "learning_rate": 0.00018126129768859166, - "loss": 3.6273, + "loss": 1.8266, "step": 2245 }, { "epoch": 0.2788622420524261, - "grad_norm": 0.26155058708223855, + "grad_norm": 0.13107665022911177, "learning_rate": 0.0001811350277690918, - "loss": 3.8539, + "loss": 1.8668, "step": 2250 }, { "epoch": 0.2794819359236537, - "grad_norm": 0.2571035053961106, + "grad_norm": 0.13037179328112342, "learning_rate": 0.00018100837813691173, - "loss": 3.8227, + "loss": 1.8992, "step": 2255 }, { "epoch": 0.28010162979488135, - "grad_norm": 0.2532323009912883, + "grad_norm": 0.13261626647936872, "learning_rate": 0.00018088134938477285, - "loss": 3.6812, + "loss": 1.9145, "step": 2260 }, { "epoch": 0.28072132366610897, - "grad_norm": 0.2653512113079037, + "grad_norm": 0.12467462496231817, "learning_rate": 0.00018075394210717097, - "loss": 3.8602, + "loss": 1.9527, "step": 2265 }, { "epoch": 0.28134101753733654, - "grad_norm": 0.2572939728938908, + "grad_norm": 0.1302006394858805, "learning_rate": 0.0001806261569003733, - "loss": 3.7461, + "loss": 1.8801, "step": 2270 }, { "epoch": 0.28196071140856416, - "grad_norm": 0.25590770385874284, + "grad_norm": 0.12656138072020032, "learning_rate": 0.00018049799436241584, - "loss": 3.6844, + "loss": 1.8168, "step": 2275 }, { "epoch": 0.2825804052797918, - "grad_norm": 0.25039800307535715, + "grad_norm": 0.13340657198127945, "learning_rate": 0.00018036945509310035, - "loss": 3.7859, + "loss": 1.8332, "step": 2280 }, { "epoch": 0.2832000991510194, - "grad_norm": 0.2588818018107696, + "grad_norm": 0.14913861128980632, "learning_rate": 0.00018024053969399186, - "loss": 3.7945, + "loss": 1.8199, "step": 2285 }, { "epoch": 0.283819793022247, - "grad_norm": 0.2560361453547467, + "grad_norm": 0.1306108935364312, "learning_rate": 0.00018011124876841564, - "loss": 3.657, + "loss": 1.8574, "step": 2290 }, { "epoch": 0.28443948689347465, - "grad_norm": 0.25369507057586327, + "grad_norm": 0.13401325283879312, "learning_rate": 0.0001799815829214544, - "loss": 3.6859, + "loss": 1.8879, "step": 2295 }, { "epoch": 0.2850591807647022, - "grad_norm": 0.26821575452634666, + "grad_norm": 0.12714018883217135, "learning_rate": 0.00017985154275994546, - "loss": 3.9195, + "loss": 1.843, "step": 2300 }, { "epoch": 0.28567887463592984, - "grad_norm": 0.25981458312044664, + "grad_norm": 0.12920439535349587, "learning_rate": 0.00017972112889247808, - "loss": 3.7891, + "loss": 1.8574, "step": 2305 }, { "epoch": 0.28629856850715746, - "grad_norm": 0.26331362477647047, + "grad_norm": 0.13423426078686024, "learning_rate": 0.00017959034192939027, - "loss": 3.7188, + "loss": 1.7715, "step": 2310 }, { "epoch": 0.2869182623783851, - "grad_norm": 0.2722924028576296, + "grad_norm": 0.13865809582419814, "learning_rate": 0.0001794591824827663, - "loss": 3.6688, + "loss": 1.8223, "step": 2315 }, { "epoch": 0.2875379562496127, - "grad_norm": 0.2514795296292142, + "grad_norm": 0.13168701447577058, "learning_rate": 0.0001793276511664335, - "loss": 3.6875, + "loss": 1.8875, "step": 2320 }, { "epoch": 0.2881576501208403, - "grad_norm": 0.2484928821399613, + "grad_norm": 0.12997910837997945, "learning_rate": 0.00017919574859595977, - "loss": 3.6039, + "loss": 1.8531, "step": 2325 }, { "epoch": 0.2887773439920679, - "grad_norm": 0.2627066856031102, + "grad_norm": 0.1223672846252809, "learning_rate": 0.00017906347538865021, - "loss": 3.6539, + "loss": 1.8539, "step": 2330 }, { "epoch": 0.2893970378632955, - "grad_norm": 0.25896716308452516, + "grad_norm": 0.13376807374029479, "learning_rate": 0.00017893083216354477, - "loss": 3.8016, + "loss": 1.882, "step": 2335 }, { "epoch": 0.29001673173452314, - "grad_norm": 0.260088068114753, + "grad_norm": 0.12472196578748385, "learning_rate": 0.00017879781954141497, - "loss": 3.6742, + "loss": 1.823, "step": 2340 }, { "epoch": 0.29063642560575076, - "grad_norm": 0.254152692296712, + "grad_norm": 0.13189694188445558, "learning_rate": 0.00017866443814476107, - "loss": 3.6477, + "loss": 1.8527, "step": 2345 }, { "epoch": 0.2912561194769784, - "grad_norm": 0.24886258494332086, + "grad_norm": 0.13537569866831578, "learning_rate": 0.0001785306885978092, - "loss": 3.8352, + "loss": 1.8832, "step": 2350 }, { "epoch": 0.291875813348206, - "grad_norm": 0.24656789699910267, + "grad_norm": 0.13295767983771653, "learning_rate": 0.00017839657152650856, - "loss": 3.7727, + "loss": 1.8457, "step": 2355 }, { "epoch": 0.2924955072194336, - "grad_norm": 0.25800157226082343, + "grad_norm": 0.1282927097281891, "learning_rate": 0.00017826208755852827, - "loss": 3.7109, + "loss": 1.8816, "step": 2360 }, { "epoch": 0.2931152010906612, - "grad_norm": 0.2690360891332733, + "grad_norm": 0.13049913149183853, "learning_rate": 0.00017812723732325446, - "loss": 3.7773, + "loss": 1.8641, "step": 2365 }, { "epoch": 0.2937348949618888, - "grad_norm": 0.2463411380990516, + "grad_norm": 0.13433617093166675, "learning_rate": 0.00017799202145178758, - "loss": 3.8328, + "loss": 1.8414, "step": 2370 }, { "epoch": 0.29435458883311644, - "grad_norm": 0.27495830500412854, + "grad_norm": 0.12882036108794354, "learning_rate": 0.00017785644057693913, - "loss": 3.9031, + "loss": 1.8551, "step": 2375 }, { "epoch": 0.29497428270434406, - "grad_norm": 0.2558003349123726, + "grad_norm": 0.12432550105269342, "learning_rate": 0.0001777204953332288, - "loss": 3.6437, + "loss": 1.8871, "step": 2380 }, { "epoch": 0.2955939765755717, - "grad_norm": 0.2765345649233851, + "grad_norm": 0.13582464892038923, "learning_rate": 0.00017758418635688167, - "loss": 3.7281, + "loss": 1.9059, "step": 2385 }, { "epoch": 0.2962136704467993, - "grad_norm": 0.24566185406030525, + "grad_norm": 0.1272933073062253, "learning_rate": 0.00017744751428582496, - "loss": 3.6414, + "loss": 1.8855, "step": 2390 }, { "epoch": 0.29683336431802687, - "grad_norm": 0.2720845656807831, + "grad_norm": 0.13122540435689012, "learning_rate": 0.00017731047975968523, - "loss": 3.6953, + "loss": 1.8566, "step": 2395 }, { "epoch": 0.2974530581892545, - "grad_norm": 0.26681875281668055, + "grad_norm": 0.1303624882568514, "learning_rate": 0.00017717308341978538, - "loss": 3.718, + "loss": 1.8562, "step": 2400 }, { "epoch": 0.2980727520604821, - "grad_norm": 0.2730768460301598, + "grad_norm": 0.12353259441658039, "learning_rate": 0.00017703532590914147, - "loss": 3.6094, + "loss": 1.8684, "step": 2405 }, { "epoch": 0.29869244593170974, - "grad_norm": 0.25665099624288207, + "grad_norm": 0.13170341964131901, "learning_rate": 0.00017689720787245997, - "loss": 3.6328, + "loss": 1.923, "step": 2410 }, { "epoch": 0.29931213980293736, - "grad_norm": 0.2671341617363374, + "grad_norm": 0.12876096829831757, "learning_rate": 0.00017675872995613458, - "loss": 3.6383, + "loss": 1.8668, "step": 2415 }, { "epoch": 0.299931833674165, - "grad_norm": 0.2548902040563108, + "grad_norm": 0.12369005168320343, "learning_rate": 0.0001766198928082432, - "loss": 3.6313, + "loss": 1.8613, "step": 2420 }, { "epoch": 0.30055152754539255, - "grad_norm": 0.25534063192461004, + "grad_norm": 0.13131764221149608, "learning_rate": 0.00017648069707854497, - "loss": 3.6695, + "loss": 1.9008, "step": 2425 }, { "epoch": 0.30117122141662017, - "grad_norm": 0.262691153256819, + "grad_norm": 0.12451191294123531, "learning_rate": 0.0001763411434184772, - "loss": 3.7055, + "loss": 1.8531, "step": 2430 }, { "epoch": 0.3017909152878478, - "grad_norm": 0.27308998925615785, + "grad_norm": 0.13236533602259273, "learning_rate": 0.00017620123248115235, - "loss": 3.6695, + "loss": 1.8691, "step": 2435 }, { "epoch": 0.3024106091590754, - "grad_norm": 0.25813554283407286, + "grad_norm": 0.12496123040012845, "learning_rate": 0.0001760609649213548, - "loss": 3.7078, + "loss": 1.8895, "step": 2440 }, { "epoch": 0.30303030303030304, - "grad_norm": 0.26071668852667024, + "grad_norm": 0.12136460576867035, "learning_rate": 0.00017592034139553812, - "loss": 3.6914, + "loss": 1.9016, "step": 2445 }, { "epoch": 0.30364999690153066, - "grad_norm": 0.25305199056761896, + "grad_norm": 0.12521529870266043, "learning_rate": 0.00017577936256182167, - "loss": 3.7555, + "loss": 1.8586, "step": 2450 }, { "epoch": 0.3042696907727583, - "grad_norm": 0.26384333576048713, + "grad_norm": 0.13202218968445123, "learning_rate": 0.00017563802907998773, - "loss": 3.7148, + "loss": 1.8457, "step": 2455 }, { "epoch": 0.30488938464398585, - "grad_norm": 0.25628783851634435, + "grad_norm": 0.12735675597883855, "learning_rate": 0.00017549634161147823, - "loss": 3.6867, + "loss": 1.8461, "step": 2460 }, { "epoch": 0.3055090785152135, - "grad_norm": 0.2532902126189516, + "grad_norm": 0.12834901220735337, "learning_rate": 0.0001753543008193919, - "loss": 3.6672, + "loss": 1.8824, "step": 2465 }, { "epoch": 0.3061287723864411, - "grad_norm": 0.25920690025946386, + "grad_norm": 0.13076216762403509, "learning_rate": 0.00017521190736848096, - "loss": 3.7102, + "loss": 1.8941, "step": 2470 }, { "epoch": 0.3067484662576687, - "grad_norm": 0.26672975988767644, + "grad_norm": 0.12527567116147303, "learning_rate": 0.00017506916192514801, - "loss": 3.7539, + "loss": 1.8156, "step": 2475 }, { "epoch": 0.30736816012889634, - "grad_norm": 0.26966415309842773, + "grad_norm": 0.12718249741367704, "learning_rate": 0.0001749260651574431, - "loss": 3.725, + "loss": 1.8629, "step": 2480 }, { "epoch": 0.30798785400012396, - "grad_norm": 0.25823480092565315, + "grad_norm": 0.13306132546258587, "learning_rate": 0.00017478261773506043, - "loss": 3.7008, + "loss": 1.8305, "step": 2485 }, { "epoch": 0.30860754787135153, - "grad_norm": 0.26871657326044296, + "grad_norm": 0.1307987467060954, "learning_rate": 0.00017463882032933524, - "loss": 3.643, + "loss": 1.8238, "step": 2490 }, { "epoch": 0.30922724174257915, - "grad_norm": 0.2676289335607888, + "grad_norm": 0.1351453589760986, "learning_rate": 0.00017449467361324076, - "loss": 3.5414, + "loss": 1.8543, "step": 2495 }, { "epoch": 0.3098469356138068, - "grad_norm": 0.27029407180457116, + "grad_norm": 0.12307357041545904, "learning_rate": 0.0001743501782613849, - "loss": 3.6156, + "loss": 1.9164, "step": 2500 }, { "epoch": 0.3104666294850344, - "grad_norm": 0.25225214661766965, + "grad_norm": 0.13149377717893693, "learning_rate": 0.00017420533495000727, - "loss": 3.6461, + "loss": 1.8473, "step": 2505 }, { "epoch": 0.311086323356262, - "grad_norm": 0.2519061042368278, + "grad_norm": 0.12779775983346198, "learning_rate": 0.0001740601443569759, - "loss": 3.7109, + "loss": 1.8031, "step": 2510 }, { "epoch": 0.31170601722748964, - "grad_norm": 0.26008459941014955, + "grad_norm": 0.13210455876193677, "learning_rate": 0.0001739146071617841, - "loss": 3.7523, + "loss": 1.8637, "step": 2515 }, { "epoch": 0.3123257110987172, - "grad_norm": 0.25855244878903594, + "grad_norm": 0.12810367969961917, "learning_rate": 0.0001737687240455473, - "loss": 3.8008, + "loss": 1.8316, "step": 2520 }, { "epoch": 0.31294540496994483, - "grad_norm": 0.25573633147599, + "grad_norm": 0.12704595997205193, "learning_rate": 0.00017362249569099982, - "loss": 3.6852, + "loss": 1.8445, "step": 2525 }, { "epoch": 0.31356509884117245, - "grad_norm": 0.2613543224212082, + "grad_norm": 0.12919348193178157, "learning_rate": 0.00017347592278249175, - "loss": 3.7219, + "loss": 1.8586, "step": 2530 }, { "epoch": 0.3141847927124001, - "grad_norm": 0.26233818596949066, + "grad_norm": 0.13802571807791786, "learning_rate": 0.00017332900600598562, - "loss": 3.7789, + "loss": 1.8703, "step": 2535 }, { "epoch": 0.3148044865836277, - "grad_norm": 0.25769955374979814, + "grad_norm": 0.1353603976296967, "learning_rate": 0.00017318174604905327, - "loss": 3.5703, + "loss": 1.8824, "step": 2540 }, { "epoch": 0.3154241804548553, - "grad_norm": 0.2501072256047701, + "grad_norm": 0.12737035386801074, "learning_rate": 0.00017303414360087278, - "loss": 3.7758, + "loss": 1.8246, "step": 2545 }, { "epoch": 0.31604387432608294, - "grad_norm": 0.26789431822777315, + "grad_norm": 0.13290623068299737, "learning_rate": 0.00017288619935222486, - "loss": 3.8359, + "loss": 1.8527, "step": 2550 }, { "epoch": 0.3166635681973105, - "grad_norm": 0.2523306821617771, + "grad_norm": 0.1207102015079047, "learning_rate": 0.00017273791399548998, - "loss": 3.793, + "loss": 1.8414, "step": 2555 }, { "epoch": 0.31728326206853813, - "grad_norm": 0.29899788219343393, + "grad_norm": 0.13358773743845612, "learning_rate": 0.000172589288224645, - "loss": 3.7398, + "loss": 1.8574, "step": 2560 }, { "epoch": 0.31790295593976575, - "grad_norm": 0.27736503749696745, + "grad_norm": 0.1279169729154266, "learning_rate": 0.00017244032273525995, - "loss": 3.743, + "loss": 1.8414, "step": 2565 }, { "epoch": 0.3185226498109934, - "grad_norm": 0.2624525004569622, + "grad_norm": 0.12718429357614378, "learning_rate": 0.0001722910182244946, - "loss": 3.668, + "loss": 1.8523, "step": 2570 }, { "epoch": 0.319142343682221, - "grad_norm": 0.2754468952649189, + "grad_norm": 0.1288546174064208, "learning_rate": 0.00017214137539109552, - "loss": 3.6688, + "loss": 1.8805, "step": 2575 }, { "epoch": 0.3197620375534486, - "grad_norm": 0.27290988521036397, + "grad_norm": 0.1339746303542721, "learning_rate": 0.0001719913949353925, - "loss": 3.7594, + "loss": 1.8687, "step": 2580 }, { "epoch": 0.3203817314246762, - "grad_norm": 0.2663978562264804, + "grad_norm": 0.12550192433671525, "learning_rate": 0.0001718410775592955, - "loss": 3.6953, + "loss": 1.8957, "step": 2585 }, { "epoch": 0.3210014252959038, - "grad_norm": 0.251095453478846, + "grad_norm": 0.13266972541549396, "learning_rate": 0.00017169042396629117, - "loss": 3.7336, + "loss": 1.8492, "step": 2590 }, { "epoch": 0.32162111916713143, - "grad_norm": 0.2669971493186749, + "grad_norm": 0.14169225665911975, "learning_rate": 0.00017153943486143978, - "loss": 3.6742, + "loss": 1.8582, "step": 2595 }, { "epoch": 0.32224081303835905, - "grad_norm": 0.24850318565449725, + "grad_norm": 0.12830541584647112, "learning_rate": 0.00017138811095137175, - "loss": 3.7211, + "loss": 1.8621, "step": 2600 }, { "epoch": 0.3228605069095867, - "grad_norm": 0.2895715920171382, + "grad_norm": 0.13328089401058096, "learning_rate": 0.0001712364529442843, - "loss": 3.7852, + "loss": 1.8047, "step": 2605 }, { "epoch": 0.3234802007808143, - "grad_norm": 0.27180495114459347, + "grad_norm": 0.12824865870602767, "learning_rate": 0.00017108446154993838, - "loss": 3.6875, + "loss": 1.8648, "step": 2610 }, { "epoch": 0.32409989465204186, - "grad_norm": 0.2548966676504203, + "grad_norm": 0.13486957428477936, "learning_rate": 0.0001709321374796551, - "loss": 3.7875, + "loss": 1.8773, "step": 2615 }, { "epoch": 0.3247195885232695, - "grad_norm": 0.26288507772094294, + "grad_norm": 0.14478150049090685, "learning_rate": 0.00017077948144631248, - "loss": 3.6688, + "loss": 1.8574, "step": 2620 }, { "epoch": 0.3253392823944971, - "grad_norm": 0.26277070377953077, + "grad_norm": 0.13019704674061816, "learning_rate": 0.00017062649416434223, - "loss": 3.6469, + "loss": 1.8359, "step": 2625 }, { "epoch": 0.32595897626572473, - "grad_norm": 0.26365748542058276, + "grad_norm": 0.12921336608460762, "learning_rate": 0.00017047317634972617, - "loss": 3.6617, + "loss": 1.8598, "step": 2630 }, { "epoch": 0.32657867013695235, - "grad_norm": 0.2458080656469107, + "grad_norm": 0.12820404459063692, "learning_rate": 0.00017031952871999315, - "loss": 3.7359, + "loss": 1.925, "step": 2635 }, { "epoch": 0.32719836400818, - "grad_norm": 0.27576035893728557, + "grad_norm": 0.1333050138527027, "learning_rate": 0.0001701655519942155, - "loss": 3.7281, + "loss": 1.8805, "step": 2640 }, { "epoch": 0.3278180578794076, - "grad_norm": 0.2671236481432457, + "grad_norm": 0.12777262066895273, "learning_rate": 0.00017001124689300568, - "loss": 3.7727, + "loss": 1.8207, "step": 2645 }, { "epoch": 0.32843775175063517, - "grad_norm": 0.25646009969395855, + "grad_norm": 0.13548725964588937, "learning_rate": 0.00016985661413851304, - "loss": 3.6984, + "loss": 1.8512, "step": 2650 }, { "epoch": 0.3290574456218628, - "grad_norm": 0.24855468305555162, + "grad_norm": 0.1291350186134146, "learning_rate": 0.00016970165445442023, - "loss": 3.5922, + "loss": 1.8574, "step": 2655 }, { "epoch": 0.3296771394930904, - "grad_norm": 0.2488779303959064, + "grad_norm": 0.13286049954171614, "learning_rate": 0.00016954636856594005, - "loss": 3.7273, + "loss": 1.8398, "step": 2660 }, { "epoch": 0.33029683336431803, - "grad_norm": 0.24991520834091216, + "grad_norm": 0.12250586047506855, "learning_rate": 0.00016939075719981194, - "loss": 3.6844, + "loss": 1.8598, "step": 2665 }, { "epoch": 0.33091652723554565, - "grad_norm": 0.27341253703778134, + "grad_norm": 0.12758848458031308, "learning_rate": 0.00016923482108429844, - "loss": 3.6289, + "loss": 1.8445, "step": 2670 }, { "epoch": 0.3315362211067733, - "grad_norm": 0.25347885179835555, + "grad_norm": 0.13664664659730347, "learning_rate": 0.00016907856094918207, - "loss": 3.7313, + "loss": 1.8059, "step": 2675 }, { "epoch": 0.33215591497800084, - "grad_norm": 0.2664388675272623, + "grad_norm": 0.12746464026705376, "learning_rate": 0.0001689219775257617, - "loss": 3.7398, + "loss": 1.8188, "step": 2680 }, { "epoch": 0.33277560884922847, - "grad_norm": 0.26965614612569255, + "grad_norm": 0.12755329667300946, "learning_rate": 0.00016876507154684918, - "loss": 3.6805, + "loss": 1.8035, "step": 2685 }, { "epoch": 0.3333953027204561, - "grad_norm": 0.25477336727357236, + "grad_norm": 0.1285133337473147, "learning_rate": 0.00016860784374676593, - "loss": 3.7359, + "loss": 1.8609, "step": 2690 }, { "epoch": 0.3340149965916837, - "grad_norm": 0.26828594917552684, + "grad_norm": 0.1313491570500189, "learning_rate": 0.00016845029486133956, - "loss": 3.6789, + "loss": 1.8512, "step": 2695 }, { "epoch": 0.33463469046291133, - "grad_norm": 0.28043441092000476, + "grad_norm": 0.1258582951786215, "learning_rate": 0.00016829242562790026, - "loss": 3.7039, + "loss": 1.8379, "step": 2700 }, { "epoch": 0.33525438433413896, - "grad_norm": 0.2719579591511516, + "grad_norm": 0.12654293645403783, "learning_rate": 0.00016813423678527754, - "loss": 3.7852, + "loss": 1.9211, "step": 2705 }, { "epoch": 0.3358740782053665, - "grad_norm": 0.25604408210266827, + "grad_norm": 0.1430960515630514, "learning_rate": 0.00016797572907379667, - "loss": 3.6742, + "loss": 1.848, "step": 2710 }, { "epoch": 0.33649377207659414, - "grad_norm": 0.26206590705100935, + "grad_norm": 0.13294562149445674, "learning_rate": 0.00016781690323527511, - "loss": 3.8953, + "loss": 1.8676, "step": 2715 }, { "epoch": 0.33711346594782177, - "grad_norm": 0.26396741494967757, + "grad_norm": 0.12823331423568302, "learning_rate": 0.00016765776001301933, - "loss": 3.7289, + "loss": 1.8555, "step": 2720 }, { "epoch": 0.3377331598190494, - "grad_norm": 0.2547598811513136, + "grad_norm": 0.13133147708443205, "learning_rate": 0.00016749830015182107, - "loss": 3.6258, + "loss": 1.9402, "step": 2725 }, { "epoch": 0.338352853690277, - "grad_norm": 0.2741006690732575, + "grad_norm": 0.1307417640679184, "learning_rate": 0.00016733852439795394, - "loss": 3.7898, + "loss": 1.8723, "step": 2730 }, { "epoch": 0.33897254756150463, - "grad_norm": 0.2641777459426088, + "grad_norm": 0.14036159913262772, "learning_rate": 0.00016717843349916994, - "loss": 3.7031, + "loss": 1.8359, "step": 2735 }, { "epoch": 0.33959224143273226, - "grad_norm": 0.2629606866295394, + "grad_norm": 0.1275526648144705, "learning_rate": 0.000167018028204696, - "loss": 3.8133, + "loss": 1.8734, "step": 2740 }, { "epoch": 0.3402119353039598, - "grad_norm": 0.24477609948639015, + "grad_norm": 0.12574598292314898, "learning_rate": 0.0001668573092652303, - "loss": 3.7102, + "loss": 1.8637, "step": 2745 }, { "epoch": 0.34083162917518744, - "grad_norm": 0.27068594003654395, + "grad_norm": 0.13622787470190775, "learning_rate": 0.00016669627743293907, - "loss": 3.682, + "loss": 1.9211, "step": 2750 }, { "epoch": 0.34145132304641507, - "grad_norm": 0.26929411586592, + "grad_norm": 0.13437389620413182, "learning_rate": 0.00016653493346145267, - "loss": 3.6445, + "loss": 1.8262, "step": 2755 }, { "epoch": 0.3420710169176427, - "grad_norm": 0.24939469621006297, + "grad_norm": 0.1344605856943175, "learning_rate": 0.00016637327810586246, - "loss": 3.7125, + "loss": 1.8367, "step": 2760 }, { "epoch": 0.3426907107888703, - "grad_norm": 0.2530396996711045, + "grad_norm": 0.1291930553452629, "learning_rate": 0.00016621131212271695, - "loss": 3.6828, + "loss": 1.8316, "step": 2765 }, { "epoch": 0.34331040466009793, - "grad_norm": 0.25826979485096646, + "grad_norm": 0.1343538598231125, "learning_rate": 0.00016604903627001844, - "loss": 3.718, + "loss": 1.8586, "step": 2770 }, { "epoch": 0.3439300985313255, - "grad_norm": 0.27768697587316615, + "grad_norm": 0.13323136800247856, "learning_rate": 0.00016588645130721948, - "loss": 3.618, + "loss": 1.9086, "step": 2775 }, { "epoch": 0.3445497924025531, - "grad_norm": 0.2650900726296455, + "grad_norm": 0.12809181295437982, "learning_rate": 0.00016572355799521912, - "loss": 3.7219, + "loss": 1.8621, "step": 2780 }, { "epoch": 0.34516948627378075, - "grad_norm": 0.2633177048203003, + "grad_norm": 0.13373391040989493, "learning_rate": 0.0001655603570963596, - "loss": 3.718, + "loss": 1.8402, "step": 2785 }, { "epoch": 0.34578918014500837, - "grad_norm": 0.26296181136909086, + "grad_norm": 0.12823175261000916, "learning_rate": 0.00016539684937442263, - "loss": 3.7039, + "loss": 1.8855, "step": 2790 }, { "epoch": 0.346408874016236, - "grad_norm": 0.2783942226825099, + "grad_norm": 0.12427006053911512, "learning_rate": 0.0001652330355946259, - "loss": 3.6758, + "loss": 1.8035, "step": 2795 }, { "epoch": 0.3470285678874636, - "grad_norm": 0.2600566703174294, + "grad_norm": 0.14309427079315348, "learning_rate": 0.00016506891652361933, - "loss": 3.6969, + "loss": 1.8367, "step": 2800 }, { "epoch": 0.3476482617586912, - "grad_norm": 0.2542741242950709, + "grad_norm": 0.1346672703530673, "learning_rate": 0.0001649044929294818, - "loss": 3.6078, + "loss": 1.8531, "step": 2805 }, { "epoch": 0.3482679556299188, - "grad_norm": 0.25822378785350575, + "grad_norm": 0.1348324178068194, "learning_rate": 0.00016473976558171714, - "loss": 3.7125, + "loss": 1.9059, "step": 2810 }, { "epoch": 0.3488876495011464, - "grad_norm": 0.2641370960337805, + "grad_norm": 0.13127286752655276, "learning_rate": 0.00016457473525125093, - "loss": 3.7141, + "loss": 1.8086, "step": 2815 }, { "epoch": 0.34950734337237405, - "grad_norm": 0.25569765031719666, + "grad_norm": 0.13288592496132326, "learning_rate": 0.00016440940271042663, - "loss": 3.643, + "loss": 1.8898, "step": 2820 }, { "epoch": 0.35012703724360167, - "grad_norm": 0.26808702545434854, + "grad_norm": 0.13569741783533862, "learning_rate": 0.00016424376873300207, - "loss": 3.6211, + "loss": 1.8484, "step": 2825 }, { "epoch": 0.3507467311148293, - "grad_norm": 0.2608273876865326, + "grad_norm": 0.13416342536609563, "learning_rate": 0.00016407783409414577, - "loss": 3.6508, + "loss": 1.8609, "step": 2830 }, { "epoch": 0.3513664249860569, - "grad_norm": 0.2479224348257157, + "grad_norm": 0.1169790399721275, "learning_rate": 0.00016391159957043335, - "loss": 3.6773, + "loss": 1.8531, "step": 2835 }, { "epoch": 0.3519861188572845, - "grad_norm": 0.27099013945725076, + "grad_norm": 0.13616470490760044, "learning_rate": 0.000163745065939844, - "loss": 3.775, + "loss": 1.8703, "step": 2840 }, { "epoch": 0.3526058127285121, - "grad_norm": 0.2678681599061846, + "grad_norm": 0.1253790357970052, "learning_rate": 0.0001635782339817566, - "loss": 3.7352, + "loss": 1.9043, "step": 2845 }, { "epoch": 0.3532255065997397, - "grad_norm": 0.25434553862487636, + "grad_norm": 0.12874313573044202, "learning_rate": 0.00016341110447694624, - "loss": 3.7531, + "loss": 1.8266, "step": 2850 }, { "epoch": 0.35384520047096735, - "grad_norm": 0.2600099088957585, + "grad_norm": 0.1424792287256125, "learning_rate": 0.00016324367820758057, - "loss": 3.7352, + "loss": 1.8273, "step": 2855 }, { "epoch": 0.35446489434219497, - "grad_norm": 0.2576820816746782, + "grad_norm": 0.1274395857428731, "learning_rate": 0.0001630759559572161, - "loss": 3.6844, + "loss": 1.8473, "step": 2860 }, { "epoch": 0.3550845882134226, - "grad_norm": 0.26429572988622685, + "grad_norm": 0.12980584268562773, "learning_rate": 0.00016290793851079447, - "loss": 3.6648, + "loss": 1.8859, "step": 2865 }, { "epoch": 0.35570428208465016, - "grad_norm": 0.25628056770143104, + "grad_norm": 0.1284857003389399, "learning_rate": 0.00016273962665463892, - "loss": 3.768, + "loss": 1.8883, "step": 2870 }, { "epoch": 0.3563239759558778, - "grad_norm": 0.26314053347734023, + "grad_norm": 0.13062335696884655, "learning_rate": 0.00016257102117645048, - "loss": 3.7867, + "loss": 1.8113, "step": 2875 }, { "epoch": 0.3569436698271054, - "grad_norm": 0.26827011010620455, + "grad_norm": 0.13020290307748666, "learning_rate": 0.00016240212286530432, - "loss": 3.5789, + "loss": 1.8633, "step": 2880 }, { "epoch": 0.357563363698333, - "grad_norm": 0.2717920774102298, + "grad_norm": 0.12282658467967292, "learning_rate": 0.00016223293251164616, - "loss": 3.7508, + "loss": 1.8703, "step": 2885 }, { "epoch": 0.35818305756956065, - "grad_norm": 0.2553465718103751, + "grad_norm": 0.12853916417645042, "learning_rate": 0.00016206345090728834, - "loss": 3.7437, + "loss": 1.8379, "step": 2890 }, { "epoch": 0.35880275144078827, - "grad_norm": 0.24526796804421827, + "grad_norm": 0.12994523896895013, "learning_rate": 0.00016189367884540638, - "loss": 3.6945, + "loss": 1.8469, "step": 2895 }, { "epoch": 0.35942244531201584, - "grad_norm": 0.2602330023370647, + "grad_norm": 0.13578425779672484, "learning_rate": 0.00016172361712053513, - "loss": 3.5922, + "loss": 1.8594, "step": 2900 }, { "epoch": 0.36004213918324346, - "grad_norm": 0.25584605115473547, + "grad_norm": 0.1264516379346521, "learning_rate": 0.00016155326652856497, - "loss": 3.775, + "loss": 1.8816, "step": 2905 }, { "epoch": 0.3606618330544711, - "grad_norm": 0.2679851558322471, + "grad_norm": 0.1285061053282456, "learning_rate": 0.0001613826278667383, - "loss": 3.7484, + "loss": 1.7895, "step": 2910 }, { "epoch": 0.3612815269256987, - "grad_norm": 0.2721594838633521, + "grad_norm": 0.13094212766467833, "learning_rate": 0.00016121170193364557, - "loss": 3.632, + "loss": 1.8172, "step": 2915 }, { "epoch": 0.3619012207969263, - "grad_norm": 0.365423422711038, + "grad_norm": 0.12276398604923604, "learning_rate": 0.0001610404895292218, - "loss": 3.7281, + "loss": 1.9117, "step": 2920 }, { "epoch": 0.36252091466815395, - "grad_norm": 0.2633704122091394, + "grad_norm": 0.12739854449016166, "learning_rate": 0.0001608689914547426, - "loss": 3.6812, + "loss": 1.8629, "step": 2925 }, { "epoch": 0.36314060853938157, - "grad_norm": 0.24858833944835435, + "grad_norm": 0.1292586487909269, "learning_rate": 0.00016069720851282052, - "loss": 3.6594, + "loss": 1.8477, "step": 2930 }, { "epoch": 0.36376030241060914, - "grad_norm": 0.2615686620373773, + "grad_norm": 0.12593917875384036, "learning_rate": 0.00016052514150740135, - "loss": 3.7031, + "loss": 1.8938, "step": 2935 }, { "epoch": 0.36437999628183676, - "grad_norm": 0.2608420079149271, + "grad_norm": 0.12784560560432576, "learning_rate": 0.00016035279124376026, - "loss": 3.7375, + "loss": 1.8219, "step": 2940 }, { "epoch": 0.3649996901530644, - "grad_norm": 0.2669515382231752, + "grad_norm": 0.1257259960150117, "learning_rate": 0.00016018015852849806, - "loss": 3.6461, + "loss": 1.8211, "step": 2945 }, { "epoch": 0.365619384024292, - "grad_norm": 0.25530803310408307, + "grad_norm": 0.12929114347135717, "learning_rate": 0.00016000724416953744, - "loss": 3.7367, + "loss": 1.8547, "step": 2950 }, { "epoch": 0.3662390778955196, - "grad_norm": 0.2627912871812224, + "grad_norm": 0.13234287937382339, "learning_rate": 0.00015983404897611928, - "loss": 3.8031, + "loss": 1.8484, "step": 2955 }, { "epoch": 0.36685877176674725, - "grad_norm": 0.25961614988600124, + "grad_norm": 0.1340348171981682, "learning_rate": 0.00015966057375879858, - "loss": 3.7047, + "loss": 1.8441, "step": 2960 }, { "epoch": 0.3674784656379748, - "grad_norm": 0.26065112696557874, + "grad_norm": 0.1331258514431446, "learning_rate": 0.00015948681932944104, - "loss": 3.8273, + "loss": 1.8586, "step": 2965 }, { "epoch": 0.36809815950920244, - "grad_norm": 0.24901377524126064, + "grad_norm": 0.13335124378868662, "learning_rate": 0.00015931278650121897, - "loss": 3.9438, + "loss": 1.866, "step": 2970 }, { "epoch": 0.36871785338043006, - "grad_norm": 0.2632224432964343, + "grad_norm": 0.13250687545966788, "learning_rate": 0.0001591384760886076, - "loss": 3.6617, + "loss": 1.8316, "step": 2975 }, { "epoch": 0.3693375472516577, - "grad_norm": 0.2589867585144316, + "grad_norm": 0.13274506427928803, "learning_rate": 0.00015896388890738127, - "loss": 3.6992, + "loss": 1.8535, "step": 2980 }, { "epoch": 0.3699572411228853, - "grad_norm": 0.25293946941175477, + "grad_norm": 0.13699657491823822, "learning_rate": 0.00015878902577460963, - "loss": 3.7984, + "loss": 1.8355, "step": 2985 }, { "epoch": 0.3705769349941129, - "grad_norm": 0.2551526711728292, + "grad_norm": 0.12986974166640744, "learning_rate": 0.00015861388750865375, - "loss": 3.6258, + "loss": 1.8656, "step": 2990 }, { "epoch": 0.3711966288653405, - "grad_norm": 0.2694067525937876, + "grad_norm": 0.13547504112097924, "learning_rate": 0.00015843847492916235, - "loss": 3.6578, + "loss": 1.859, "step": 2995 }, { "epoch": 0.3718163227365681, - "grad_norm": 0.2799079437013687, + "grad_norm": 0.14199787848571688, "learning_rate": 0.00015826278885706788, - "loss": 3.6336, + "loss": 1.8289, "step": 3000 }, { "epoch": 0.37243601660779574, - "grad_norm": 0.26696040065372506, + "grad_norm": 0.1304953733283427, "learning_rate": 0.0001580868301145828, - "loss": 3.8367, + "loss": 1.857, "step": 3005 }, { "epoch": 0.37305571047902336, - "grad_norm": 0.2748637513729295, + "grad_norm": 0.1448823429952658, "learning_rate": 0.00015791059952519567, - "loss": 3.6117, + "loss": 1.8668, "step": 3010 }, { "epoch": 0.373675404350251, - "grad_norm": 0.2678057810299332, + "grad_norm": 0.1257480778182083, "learning_rate": 0.00015773409791366728, - "loss": 3.7703, + "loss": 1.8605, "step": 3015 }, { "epoch": 0.3742950982214786, - "grad_norm": 0.26522017217912247, + "grad_norm": 0.13028189442894114, "learning_rate": 0.00015755732610602677, - "loss": 3.6922, + "loss": 1.8863, "step": 3020 }, { "epoch": 0.37491479209270623, - "grad_norm": 0.2663224361660835, + "grad_norm": 0.1265454045809503, "learning_rate": 0.00015738028492956786, - "loss": 3.6187, + "loss": 1.8395, "step": 3025 }, { "epoch": 0.3755344859639338, - "grad_norm": 0.27035194720649564, + "grad_norm": 0.13032347867143443, "learning_rate": 0.0001572029752128449, - "loss": 3.8258, + "loss": 1.8648, "step": 3030 }, { "epoch": 0.3761541798351614, - "grad_norm": 0.2838577149104673, + "grad_norm": 0.13110543872248587, "learning_rate": 0.00015702539778566897, - "loss": 3.7875, + "loss": 1.8715, "step": 3035 }, { "epoch": 0.37677387370638904, - "grad_norm": 0.2703312743767914, + "grad_norm": 0.1274284160679105, "learning_rate": 0.0001568475534791041, - "loss": 3.757, + "loss": 1.8086, "step": 3040 }, { "epoch": 0.37739356757761666, - "grad_norm": 0.2560504441017639, + "grad_norm": 0.1398602369199391, "learning_rate": 0.00015666944312546328, - "loss": 3.7008, + "loss": 1.8254, "step": 3045 }, { "epoch": 0.3780132614488443, - "grad_norm": 0.2698560889509554, + "grad_norm": 0.12891041218992944, "learning_rate": 0.0001564910675583046, - "loss": 3.6461, + "loss": 1.8781, "step": 3050 }, { "epoch": 0.3786329553200719, - "grad_norm": 0.28083643078936255, + "grad_norm": 0.1282542155930589, "learning_rate": 0.0001563124276124274, - "loss": 3.6453, + "loss": 1.884, "step": 3055 }, { "epoch": 0.3792526491912995, - "grad_norm": 0.271443414081348, + "grad_norm": 0.1356534667765166, "learning_rate": 0.00015613352412386825, - "loss": 3.6398, + "loss": 1.8012, "step": 3060 }, { "epoch": 0.3798723430625271, - "grad_norm": 0.2838039929744833, + "grad_norm": 0.1305522702140196, "learning_rate": 0.00015595435792989718, - "loss": 3.7586, + "loss": 1.8656, "step": 3065 }, { "epoch": 0.3804920369337547, - "grad_norm": 0.2647322485900605, + "grad_norm": 0.1337186431902072, "learning_rate": 0.0001557749298690135, - "loss": 3.6781, + "loss": 1.8883, "step": 3070 }, { "epoch": 0.38111173080498234, - "grad_norm": 0.2626946352782939, + "grad_norm": 0.1267318699700854, "learning_rate": 0.00015559524078094235, - "loss": 3.5805, + "loss": 1.8426, "step": 3075 }, { "epoch": 0.38173142467620996, - "grad_norm": 0.25933872796885626, + "grad_norm": 0.135229302814105, "learning_rate": 0.00015541529150663022, - "loss": 3.6148, + "loss": 1.8293, "step": 3080 }, { "epoch": 0.3823511185474376, - "grad_norm": 0.2821869561149129, + "grad_norm": 0.1325469745229473, "learning_rate": 0.00015523508288824145, - "loss": 3.6742, + "loss": 1.8598, "step": 3085 }, { "epoch": 0.38297081241866515, - "grad_norm": 0.25882550675825317, + "grad_norm": 0.13377461767007065, "learning_rate": 0.00015505461576915402, - "loss": 3.6023, + "loss": 1.8438, "step": 3090 }, { "epoch": 0.3835905062898928, - "grad_norm": 0.2589531444373828, + "grad_norm": 0.12407978728389873, "learning_rate": 0.00015487389099395565, - "loss": 3.6742, + "loss": 1.8715, "step": 3095 }, { "epoch": 0.3842102001611204, - "grad_norm": 0.26451321091244295, + "grad_norm": 0.1359925759348009, "learning_rate": 0.00015469290940844005, - "loss": 3.5445, + "loss": 1.8883, "step": 3100 }, { "epoch": 0.384829894032348, - "grad_norm": 0.2546537891704748, + "grad_norm": 0.13659768219370053, "learning_rate": 0.00015451167185960267, - "loss": 3.6609, + "loss": 1.8687, "step": 3105 }, { "epoch": 0.38544958790357564, - "grad_norm": 0.25027138697419005, + "grad_norm": 0.13102087732269108, "learning_rate": 0.00015433017919563692, - "loss": 3.5992, + "loss": 1.8902, "step": 3110 }, { "epoch": 0.38606928177480326, - "grad_norm": 0.27380537472291283, + "grad_norm": 0.13153742455134876, "learning_rate": 0.00015414843226593016, - "loss": 3.8633, + "loss": 1.8695, "step": 3115 }, { "epoch": 0.3866889756460309, - "grad_norm": 0.26921886223630787, + "grad_norm": 0.12492614271915935, "learning_rate": 0.0001539664319210597, - "loss": 3.7367, + "loss": 1.8867, "step": 3120 }, { "epoch": 0.38730866951725845, - "grad_norm": 0.25751518580294624, + "grad_norm": 0.1270230322153982, "learning_rate": 0.0001537841790127888, - "loss": 3.6586, + "loss": 1.8914, "step": 3125 }, { "epoch": 0.3879283633884861, - "grad_norm": 0.25605562468731, + "grad_norm": 0.13908099972235655, "learning_rate": 0.00015360167439406274, - "loss": 3.6141, + "loss": 1.8918, "step": 3130 }, { "epoch": 0.3885480572597137, - "grad_norm": 0.26611424342018436, + "grad_norm": 0.13260524566329848, "learning_rate": 0.00015341891891900494, - "loss": 3.6664, + "loss": 1.8469, "step": 3135 }, { "epoch": 0.3891677511309413, - "grad_norm": 0.2550066843855869, + "grad_norm": 0.1324975813379851, "learning_rate": 0.00015323591344291258, - "loss": 3.6484, + "loss": 1.8289, "step": 3140 }, { "epoch": 0.38978744500216894, - "grad_norm": 0.2821207360840596, + "grad_norm": 0.13120666575694298, "learning_rate": 0.00015305265882225303, - "loss": 3.6, + "loss": 1.8547, "step": 3145 }, { "epoch": 0.39040713887339656, - "grad_norm": 0.266631454756881, + "grad_norm": 0.12685427898285384, "learning_rate": 0.00015286915591465969, - "loss": 3.6953, + "loss": 1.8461, "step": 3150 }, { "epoch": 0.39102683274462413, - "grad_norm": 0.2686000416360816, + "grad_norm": 0.13572934465730757, "learning_rate": 0.00015268540557892773, - "loss": 3.6992, + "loss": 1.8562, "step": 3155 }, { "epoch": 0.39164652661585175, - "grad_norm": 0.25655840028409566, + "grad_norm": 0.12897435466478618, "learning_rate": 0.0001525014086750105, - "loss": 3.6492, + "loss": 1.8648, "step": 3160 }, { "epoch": 0.3922662204870794, - "grad_norm": 0.2567247759313816, + "grad_norm": 0.12721217574888402, "learning_rate": 0.0001523171660640152, - "loss": 3.6383, + "loss": 1.85, "step": 3165 }, { "epoch": 0.392885914358307, - "grad_norm": 0.2614801283302848, + "grad_norm": 0.1315866272081269, "learning_rate": 0.00015213267860819896, - "loss": 3.718, + "loss": 1.834, "step": 3170 }, { "epoch": 0.3935056082295346, - "grad_norm": 0.2538151707407498, + "grad_norm": 0.1302044760111287, "learning_rate": 0.00015194794717096475, - "loss": 3.8602, + "loss": 1.8512, "step": 3175 }, { "epoch": 0.39412530210076224, - "grad_norm": 0.2791424481656367, + "grad_norm": 0.13001468678731848, "learning_rate": 0.00015176297261685742, - "loss": 3.6617, + "loss": 1.8543, "step": 3180 }, { "epoch": 0.3947449959719898, - "grad_norm": 0.25798245204294173, + "grad_norm": 0.12952145972803736, "learning_rate": 0.00015157775581155957, - "loss": 3.8, + "loss": 1.8969, "step": 3185 }, { "epoch": 0.39536468984321743, - "grad_norm": 0.2614655828962737, + "grad_norm": 0.1267527829101824, "learning_rate": 0.00015139229762188761, - "loss": 3.6961, + "loss": 1.8113, "step": 3190 }, { "epoch": 0.39598438371444505, - "grad_norm": 0.25516127522728255, + "grad_norm": 0.1300639208984554, "learning_rate": 0.00015120659891578754, - "loss": 3.6828, + "loss": 1.7961, "step": 3195 }, { "epoch": 0.3966040775856727, - "grad_norm": 0.2585870302228633, + "grad_norm": 0.13154316703911634, "learning_rate": 0.00015102066056233104, - "loss": 3.5375, + "loss": 1.809, "step": 3200 }, { "epoch": 0.3972237714569003, - "grad_norm": 0.26672211172503196, + "grad_norm": 0.12568749277869354, "learning_rate": 0.00015083448343171138, - "loss": 3.632, + "loss": 1.8641, "step": 3205 }, { "epoch": 0.3978434653281279, - "grad_norm": 0.2641159714007487, + "grad_norm": 0.1319736744786111, "learning_rate": 0.00015064806839523915, - "loss": 3.7586, + "loss": 1.8223, "step": 3210 }, { "epoch": 0.39846315919935554, - "grad_norm": 0.2546667091800772, + "grad_norm": 0.13834323428246187, "learning_rate": 0.00015046141632533844, - "loss": 3.6641, + "loss": 1.798, "step": 3215 }, { "epoch": 0.3990828530705831, - "grad_norm": 0.25959333523005085, + "grad_norm": 0.1383968813858178, "learning_rate": 0.0001502745280955428, - "loss": 3.6898, + "loss": 1.8828, "step": 3220 }, { "epoch": 0.39970254694181073, - "grad_norm": 0.26928119386383115, + "grad_norm": 0.1350264311456221, "learning_rate": 0.00015008740458049075, - "loss": 3.7297, + "loss": 1.85, "step": 3225 }, { "epoch": 0.40032224081303835, - "grad_norm": 0.26421342010073934, + "grad_norm": 0.12512719310704826, "learning_rate": 0.0001499000466559221, - "loss": 3.7062, + "loss": 1.8148, "step": 3230 }, { "epoch": 0.400941934684266, - "grad_norm": 0.2643356968703811, + "grad_norm": 0.12311809256785136, "learning_rate": 0.0001497124551986737, - "loss": 3.5867, + "loss": 1.8617, "step": 3235 }, { "epoch": 0.4015616285554936, - "grad_norm": 0.26207830122927056, + "grad_norm": 0.13652089005902895, "learning_rate": 0.00014952463108667527, - "loss": 3.6039, + "loss": 1.8809, "step": 3240 }, { "epoch": 0.4021813224267212, - "grad_norm": 0.24755063014948148, + "grad_norm": 0.13149944904761232, "learning_rate": 0.0001493365751989454, - "loss": 3.6055, + "loss": 1.8309, "step": 3245 }, { "epoch": 0.4028010162979488, - "grad_norm": 0.2549356478882979, + "grad_norm": 0.1345131553547909, "learning_rate": 0.0001491482884155874, - "loss": 3.7414, + "loss": 1.8598, "step": 3250 }, { "epoch": 0.4034207101691764, - "grad_norm": 0.25938021841255027, + "grad_norm": 0.1316820704074098, "learning_rate": 0.00014895977161778515, - "loss": 3.6727, + "loss": 1.8113, "step": 3255 }, { "epoch": 0.40404040404040403, - "grad_norm": 0.2749384696162015, + "grad_norm": 0.13378258777396831, "learning_rate": 0.00014877102568779896, - "loss": 3.7242, + "loss": 1.8176, "step": 3260 }, { "epoch": 0.40466009791163166, - "grad_norm": 0.2648786444292922, + "grad_norm": 0.12936930942353125, "learning_rate": 0.00014858205150896161, - "loss": 3.5367, + "loss": 1.8996, "step": 3265 }, { "epoch": 0.4052797917828593, - "grad_norm": 0.26384530229777414, + "grad_norm": 0.12946325447363718, "learning_rate": 0.00014839284996567392, - "loss": 3.7203, + "loss": 1.8574, "step": 3270 }, { "epoch": 0.4058994856540869, - "grad_norm": 0.25117656502349667, + "grad_norm": 0.12501290179364566, "learning_rate": 0.00014820342194340097, - "loss": 3.7609, + "loss": 1.8613, "step": 3275 }, { "epoch": 0.4065191795253145, - "grad_norm": 0.2729406020541514, + "grad_norm": 0.13006013274328393, "learning_rate": 0.00014801376832866754, - "loss": 3.7437, + "loss": 1.7965, "step": 3280 }, { "epoch": 0.4071388733965421, - "grad_norm": 0.2863416580211302, + "grad_norm": 0.1388419140952983, "learning_rate": 0.00014782389000905435, - "loss": 3.6664, + "loss": 1.8965, "step": 3285 }, { "epoch": 0.4077585672677697, - "grad_norm": 0.2698057546507351, + "grad_norm": 0.1324648077294965, "learning_rate": 0.00014763378787319373, - "loss": 3.7109, + "loss": 1.8023, "step": 3290 }, { "epoch": 0.40837826113899733, - "grad_norm": 0.2697403365035814, + "grad_norm": 0.12507324333204756, "learning_rate": 0.00014744346281076536, - "loss": 3.7273, + "loss": 1.8473, "step": 3295 }, { "epoch": 0.40899795501022496, - "grad_norm": 0.28347942725540815, + "grad_norm": 0.13248330177911272, "learning_rate": 0.00014725291571249236, - "loss": 3.5773, + "loss": 1.798, "step": 3300 }, { "epoch": 0.4096176488814526, - "grad_norm": 0.26301691355137335, + "grad_norm": 0.13183523671686795, "learning_rate": 0.00014706214747013685, - "loss": 3.7578, + "loss": 1.9359, "step": 3305 }, { "epoch": 0.4102373427526802, - "grad_norm": 0.26809494981415455, + "grad_norm": 0.1338057384961836, "learning_rate": 0.00014687115897649603, - "loss": 3.6211, + "loss": 1.8891, "step": 3310 }, { "epoch": 0.41085703662390777, - "grad_norm": 0.2723319521596946, + "grad_norm": 0.15985021119124002, "learning_rate": 0.00014667995112539774, - "loss": 3.6117, + "loss": 1.8215, "step": 3315 }, { "epoch": 0.4114767304951354, - "grad_norm": 0.26087202121957026, + "grad_norm": 0.1293260479521013, "learning_rate": 0.00014648852481169658, - "loss": 3.6602, + "loss": 1.8918, "step": 3320 }, { "epoch": 0.412096424366363, - "grad_norm": 0.26358093924993475, + "grad_norm": 0.13146519522588096, "learning_rate": 0.0001462968809312694, - "loss": 3.7313, + "loss": 1.875, "step": 3325 }, { "epoch": 0.41271611823759063, - "grad_norm": 0.2712089109808533, + "grad_norm": 0.1315902361033057, "learning_rate": 0.00014610502038101138, - "loss": 3.6992, + "loss": 1.8781, "step": 3330 }, { "epoch": 0.41333581210881826, - "grad_norm": 0.2590088092278918, + "grad_norm": 0.1333738909869963, "learning_rate": 0.00014591294405883162, - "loss": 3.6703, + "loss": 1.8441, "step": 3335 }, { "epoch": 0.4139555059800459, - "grad_norm": 0.241021196941427, + "grad_norm": 0.12218014895336611, "learning_rate": 0.00014572065286364908, - "loss": 3.8258, + "loss": 1.884, "step": 3340 }, { "epoch": 0.41457519985127345, - "grad_norm": 0.2702742950321888, + "grad_norm": 0.12242530399637636, "learning_rate": 0.00014552814769538844, - "loss": 3.7008, + "loss": 1.8645, "step": 3345 }, { "epoch": 0.41519489372250107, - "grad_norm": 0.27282696995789457, + "grad_norm": 0.13310842863340136, "learning_rate": 0.00014533542945497553, - "loss": 3.5375, + "loss": 1.8629, "step": 3350 }, { "epoch": 0.4158145875937287, - "grad_norm": 0.2573140589414904, + "grad_norm": 0.1284963494667993, "learning_rate": 0.00014514249904433362, - "loss": 3.6344, + "loss": 1.8809, "step": 3355 }, { "epoch": 0.4164342814649563, - "grad_norm": 0.26868082368329343, + "grad_norm": 0.1319653346345284, "learning_rate": 0.0001449493573663787, - "loss": 3.6586, + "loss": 1.9082, "step": 3360 }, { "epoch": 0.41705397533618394, - "grad_norm": 0.26050328505952564, + "grad_norm": 0.14072551811843476, "learning_rate": 0.00014475600532501566, - "loss": 3.7141, + "loss": 1.8258, "step": 3365 }, { "epoch": 0.41767366920741156, - "grad_norm": 0.260282390367127, + "grad_norm": 0.1274232137809557, "learning_rate": 0.00014456244382513386, - "loss": 3.7531, + "loss": 1.8305, "step": 3370 }, { "epoch": 0.4182933630786392, - "grad_norm": 0.2813266769594806, + "grad_norm": 0.12333734269464691, "learning_rate": 0.0001443686737726029, - "loss": 3.6906, + "loss": 1.8648, "step": 3375 }, { "epoch": 0.41891305694986675, - "grad_norm": 0.2739367336724833, + "grad_norm": 0.13623034425385716, "learning_rate": 0.00014417469607426838, - "loss": 3.6008, + "loss": 1.8617, "step": 3380 }, { "epoch": 0.41953275082109437, - "grad_norm": 0.280672946378796, + "grad_norm": 0.13153662546427136, "learning_rate": 0.00014398051163794776, - "loss": 3.6461, + "loss": 1.8262, "step": 3385 }, { "epoch": 0.420152444692322, - "grad_norm": 0.2743306758287759, + "grad_norm": 0.13256507723918945, "learning_rate": 0.000143786121372426, - "loss": 3.7211, + "loss": 1.8586, "step": 3390 }, { "epoch": 0.4207721385635496, - "grad_norm": 0.261594675478806, + "grad_norm": 0.12990196578968077, "learning_rate": 0.00014359152618745132, - "loss": 3.6203, + "loss": 1.8621, "step": 3395 }, { "epoch": 0.42139183243477724, - "grad_norm": 0.2575401667881192, + "grad_norm": 0.13238313238253183, "learning_rate": 0.00014339672699373104, - "loss": 3.7273, + "loss": 1.8941, "step": 3400 }, { "epoch": 0.42201152630600486, - "grad_norm": 0.2850665124359037, + "grad_norm": 0.14263827897990308, "learning_rate": 0.0001432017247029271, - "loss": 3.9281, + "loss": 1.8219, "step": 3405 }, { "epoch": 0.4226312201772324, - "grad_norm": 0.25780272018370204, + "grad_norm": 0.15066475507049107, "learning_rate": 0.00014300652022765207, - "loss": 3.6789, + "loss": 1.8301, "step": 3410 }, { "epoch": 0.42325091404846005, - "grad_norm": 0.2680886719065801, + "grad_norm": 0.13504045941780513, "learning_rate": 0.00014281111448146468, - "loss": 3.6797, + "loss": 1.7824, "step": 3415 }, { "epoch": 0.42387060791968767, - "grad_norm": 0.29366146711228375, + "grad_norm": 0.12845118948413084, "learning_rate": 0.00014261550837886566, - "loss": 3.5805, + "loss": 1.8328, "step": 3420 }, { "epoch": 0.4244903017909153, - "grad_norm": 0.27265368036979465, + "grad_norm": 0.13416853262567752, "learning_rate": 0.00014241970283529338, - "loss": 3.718, + "loss": 1.8379, "step": 3425 }, { "epoch": 0.4251099956621429, - "grad_norm": 0.2462250838243364, + "grad_norm": 0.1275150961925631, "learning_rate": 0.00014222369876711955, - "loss": 3.7469, + "loss": 1.8461, "step": 3430 }, { "epoch": 0.42572968953337054, - "grad_norm": 0.2693760214974965, + "grad_norm": 0.129005270669972, "learning_rate": 0.00014202749709164506, - "loss": 3.6523, + "loss": 1.873, "step": 3435 }, { "epoch": 0.4263493834045981, - "grad_norm": 0.26792153518324946, + "grad_norm": 0.1366842217344907, "learning_rate": 0.00014183109872709557, - "loss": 3.7125, + "loss": 1.8344, "step": 3440 }, { "epoch": 0.4269690772758257, - "grad_norm": 0.26064403191013946, + "grad_norm": 0.1302801253762385, "learning_rate": 0.0001416345045926172, - "loss": 3.6281, + "loss": 1.8301, "step": 3445 }, { "epoch": 0.42758877114705335, - "grad_norm": 0.26654172823695954, + "grad_norm": 0.12938457791256947, "learning_rate": 0.00014143771560827238, - "loss": 3.6727, + "loss": 1.8762, "step": 3450 }, { "epoch": 0.42820846501828097, - "grad_norm": 0.260911428860639, + "grad_norm": 0.1324358723606549, "learning_rate": 0.00014124073269503534, - "loss": 3.7625, + "loss": 1.8738, "step": 3455 }, { "epoch": 0.4288281588895086, - "grad_norm": 0.2651966413063993, + "grad_norm": 0.12690470679362756, "learning_rate": 0.0001410435567747879, - "loss": 3.6906, + "loss": 1.8445, "step": 3460 }, { "epoch": 0.4294478527607362, - "grad_norm": 0.25866632724130795, + "grad_norm": 0.12607166701292938, "learning_rate": 0.00014084618877031524, - "loss": 3.7539, + "loss": 1.8586, "step": 3465 }, { "epoch": 0.43006754663196384, - "grad_norm": 0.2633955145782413, + "grad_norm": 0.1295605508529987, "learning_rate": 0.00014064862960530143, - "loss": 3.6023, + "loss": 1.8449, "step": 3470 }, { "epoch": 0.4306872405031914, - "grad_norm": 0.26034897512497684, + "grad_norm": 0.12894088957046304, "learning_rate": 0.0001404508802043252, - "loss": 3.6977, + "loss": 1.8484, "step": 3475 }, { "epoch": 0.431306934374419, - "grad_norm": 0.2613913190175063, + "grad_norm": 0.1310486419482613, "learning_rate": 0.0001402529414928556, - "loss": 3.7391, + "loss": 1.8367, "step": 3480 }, { "epoch": 0.43192662824564665, - "grad_norm": 0.28135113964539377, + "grad_norm": 0.13439055155902957, "learning_rate": 0.00014005481439724753, - "loss": 3.6219, + "loss": 1.8523, "step": 3485 }, { "epoch": 0.43254632211687427, - "grad_norm": 0.2611757651696741, + "grad_norm": 0.14979693773268604, "learning_rate": 0.00013985649984473773, - "loss": 3.7156, + "loss": 1.8684, "step": 3490 }, { "epoch": 0.4331660159881019, - "grad_norm": 0.24919761728858297, + "grad_norm": 0.12645292055136997, "learning_rate": 0.0001396579987634401, - "loss": 3.6852, + "loss": 1.7848, "step": 3495 }, { "epoch": 0.4337857098593295, - "grad_norm": 0.26626990952610513, + "grad_norm": 0.13147498487604903, "learning_rate": 0.00013945931208234156, - "loss": 3.6867, + "loss": 1.8895, "step": 3500 }, { "epoch": 0.4344054037305571, - "grad_norm": 0.26859713621539, + "grad_norm": 0.13150080296671127, "learning_rate": 0.00013926044073129759, - "loss": 3.7789, + "loss": 1.8254, "step": 3505 }, { "epoch": 0.4350250976017847, - "grad_norm": 0.26988300474797133, + "grad_norm": 0.13683895326823053, "learning_rate": 0.00013906138564102793, - "loss": 3.7164, + "loss": 1.8613, "step": 3510 }, { "epoch": 0.4356447914730123, - "grad_norm": 0.261113153649504, + "grad_norm": 0.12805602955656897, "learning_rate": 0.0001388621477431123, - "loss": 3.6758, + "loss": 1.8676, "step": 3515 }, { "epoch": 0.43626448534423995, - "grad_norm": 0.2552976713312743, + "grad_norm": 0.12813673559403296, "learning_rate": 0.00013866272796998587, - "loss": 3.7383, + "loss": 1.8883, "step": 3520 }, { "epoch": 0.43688417921546757, - "grad_norm": 0.30498123644083946, + "grad_norm": 0.1370489786872922, "learning_rate": 0.00013846312725493504, - "loss": 3.7609, + "loss": 1.9152, "step": 3525 }, { "epoch": 0.4375038730866952, - "grad_norm": 0.2570153268183945, + "grad_norm": 0.13373578100982492, "learning_rate": 0.00013826334653209297, - "loss": 3.6797, + "loss": 1.8449, "step": 3530 }, { "epoch": 0.43812356695792276, - "grad_norm": 0.25538549220186507, + "grad_norm": 0.14141792280370644, "learning_rate": 0.00013806338673643534, - "loss": 3.6398, + "loss": 1.8258, "step": 3535 }, { "epoch": 0.4387432608291504, - "grad_norm": 0.28356474464611625, + "grad_norm": 0.12964895565829695, "learning_rate": 0.00013786324880377576, - "loss": 3.7352, + "loss": 1.8156, "step": 3540 }, { "epoch": 0.439362954700378, - "grad_norm": 0.26457024903444254, + "grad_norm": 0.13419405554541766, "learning_rate": 0.0001376629336707617, - "loss": 3.6758, + "loss": 1.8219, "step": 3545 }, { "epoch": 0.43998264857160563, - "grad_norm": 0.25432212592677816, + "grad_norm": 0.1328791545044252, "learning_rate": 0.0001374624422748698, - "loss": 3.6469, + "loss": 1.873, "step": 3550 }, { "epoch": 0.44060234244283325, - "grad_norm": 0.2734955603605783, + "grad_norm": 0.136160988058161, "learning_rate": 0.00013726177555440164, - "loss": 3.7242, + "loss": 1.8789, "step": 3555 }, { "epoch": 0.4412220363140609, - "grad_norm": 0.27154068833373923, + "grad_norm": 0.1335013252887933, "learning_rate": 0.0001370609344484793, - "loss": 3.8172, + "loss": 1.8137, "step": 3560 }, { "epoch": 0.4418417301852885, - "grad_norm": 0.2715147456646156, + "grad_norm": 0.13713205846127619, "learning_rate": 0.00013685991989704105, - "loss": 3.7508, + "loss": 1.8207, "step": 3565 }, { "epoch": 0.44246142405651606, - "grad_norm": 0.26561938249699224, + "grad_norm": 0.12998754814726307, "learning_rate": 0.00013665873284083685, - "loss": 3.6898, + "loss": 1.8371, "step": 3570 }, { "epoch": 0.4430811179277437, - "grad_norm": 0.25783961144820916, + "grad_norm": 0.13470803808424642, "learning_rate": 0.000136457374221424, - "loss": 3.5945, + "loss": 1.8477, "step": 3575 }, { "epoch": 0.4437008117989713, - "grad_norm": 0.27738329704213377, + "grad_norm": 0.13606447312464934, "learning_rate": 0.00013625584498116262, - "loss": 3.8328, + "loss": 1.8625, "step": 3580 }, { "epoch": 0.44432050567019893, - "grad_norm": 0.2647663920533736, + "grad_norm": 0.1274570535945815, "learning_rate": 0.00013605414606321148, - "loss": 3.693, + "loss": 1.8641, "step": 3585 }, { "epoch": 0.44494019954142655, - "grad_norm": 0.2592544235227693, + "grad_norm": 0.1380534637059273, "learning_rate": 0.0001358522784115234, - "loss": 3.7898, + "loss": 1.8723, "step": 3590 }, { "epoch": 0.4455598934126542, - "grad_norm": 0.2614819616971976, + "grad_norm": 0.13189168748093322, "learning_rate": 0.00013565024297084084, - "loss": 3.6461, + "loss": 1.8891, "step": 3595 }, { "epoch": 0.44617958728388174, - "grad_norm": 0.26575079677829216, + "grad_norm": 0.1343904076364209, "learning_rate": 0.0001354480406866915, - "loss": 3.6117, + "loss": 1.802, "step": 3600 }, { "epoch": 0.44679928115510936, - "grad_norm": 0.2856386573982759, + "grad_norm": 0.13145575069138363, "learning_rate": 0.00013524567250538396, - "loss": 3.6984, + "loss": 1.8598, "step": 3605 }, { "epoch": 0.447418975026337, - "grad_norm": 0.26706444441282795, + "grad_norm": 0.1253081356231435, "learning_rate": 0.00013504313937400317, - "loss": 3.7031, + "loss": 1.8656, "step": 3610 }, { "epoch": 0.4480386688975646, - "grad_norm": 0.2611623259231443, + "grad_norm": 0.1359581987770173, "learning_rate": 0.00013484044224040606, - "loss": 3.6125, + "loss": 1.8406, "step": 3615 }, { "epoch": 0.44865836276879223, - "grad_norm": 0.27732328827061903, + "grad_norm": 0.13280063216880456, "learning_rate": 0.00013463758205321715, - "loss": 3.7352, + "loss": 1.8676, "step": 3620 }, { "epoch": 0.44927805664001985, - "grad_norm": 0.26044312902407674, + "grad_norm": 0.13852509864663975, "learning_rate": 0.0001344345597618239, - "loss": 3.6938, + "loss": 1.8273, "step": 3625 }, { "epoch": 0.4498977505112474, - "grad_norm": 0.2912247141373232, + "grad_norm": 0.13283988137837371, "learning_rate": 0.00013423137631637258, - "loss": 3.6617, + "loss": 1.8582, "step": 3630 }, { "epoch": 0.45051744438247504, - "grad_norm": 0.2571060992437349, + "grad_norm": 0.13375149735488798, "learning_rate": 0.00013402803266776353, - "loss": 3.7484, + "loss": 1.8668, "step": 3635 }, { "epoch": 0.45113713825370266, - "grad_norm": 0.25618111573779184, + "grad_norm": 0.1385343237508899, "learning_rate": 0.00013382452976764693, - "loss": 3.7219, + "loss": 1.8301, "step": 3640 }, { "epoch": 0.4517568321249303, - "grad_norm": 0.27582839571574014, + "grad_norm": 0.1336021388356578, "learning_rate": 0.00013362086856841826, - "loss": 3.6859, + "loss": 1.868, "step": 3645 }, { "epoch": 0.4523765259961579, - "grad_norm": 0.27749394162596547, + "grad_norm": 0.1328856221983309, "learning_rate": 0.0001334170500232138, - "loss": 3.7211, + "loss": 1.8555, "step": 3650 }, { "epoch": 0.45299621986738553, - "grad_norm": 0.2584168295006351, + "grad_norm": 0.1341766922517231, "learning_rate": 0.00013321307508590624, - "loss": 3.6625, + "loss": 1.8523, "step": 3655 }, { "epoch": 0.45361591373861315, - "grad_norm": 0.27693424489312013, + "grad_norm": 0.1306615008434302, "learning_rate": 0.00013300894471110014, - "loss": 3.6625, + "loss": 1.8617, "step": 3660 }, { "epoch": 0.4542356076098407, - "grad_norm": 0.25983758091273174, + "grad_norm": 0.13912806028945696, "learning_rate": 0.00013280465985412757, - "loss": 3.6641, + "loss": 1.8266, "step": 3665 }, { "epoch": 0.45485530148106834, - "grad_norm": 0.26415706763108404, + "grad_norm": 0.12851937389904652, "learning_rate": 0.00013260022147104354, - "loss": 3.6781, + "loss": 1.8516, "step": 3670 }, { "epoch": 0.45547499535229596, - "grad_norm": 0.2719567568985379, + "grad_norm": 0.13943073131832875, "learning_rate": 0.00013239563051862158, - "loss": 3.5859, + "loss": 1.825, "step": 3675 }, { "epoch": 0.4560946892235236, - "grad_norm": 0.26545474762934684, + "grad_norm": 0.13670686199925514, "learning_rate": 0.00013219088795434923, - "loss": 3.8219, + "loss": 1.8387, "step": 3680 }, { "epoch": 0.4567143830947512, - "grad_norm": 0.2708748781556674, + "grad_norm": 0.1356087167140527, "learning_rate": 0.00013198599473642354, - "loss": 3.7672, + "loss": 1.8902, "step": 3685 }, { "epoch": 0.45733407696597883, - "grad_norm": 0.2715721444457544, + "grad_norm": 0.13676670096618956, "learning_rate": 0.00013178095182374676, - "loss": 3.6945, + "loss": 1.7934, "step": 3690 }, { "epoch": 0.4579537708372064, - "grad_norm": 0.2716587540915426, + "grad_norm": 0.13391467393093984, "learning_rate": 0.00013157576017592157, - "loss": 3.682, + "loss": 1.8254, "step": 3695 }, { "epoch": 0.458573464708434, - "grad_norm": 0.2582125776966492, + "grad_norm": 0.13764440897029354, "learning_rate": 0.0001313704207532468, - "loss": 3.6469, + "loss": 1.8453, "step": 3700 }, { "epoch": 0.45919315857966164, - "grad_norm": 0.2589681636647802, + "grad_norm": 0.13160749306039446, "learning_rate": 0.00013116493451671279, - "loss": 3.7172, + "loss": 1.8352, "step": 3705 }, { "epoch": 0.45981285245088926, - "grad_norm": 0.268075424802474, + "grad_norm": 0.12949459213686204, "learning_rate": 0.000130959302427997, - "loss": 3.6781, + "loss": 1.7816, "step": 3710 }, { "epoch": 0.4604325463221169, - "grad_norm": 0.2701875386080495, + "grad_norm": 0.13738089812902068, "learning_rate": 0.00013075352544945966, - "loss": 3.6602, + "loss": 1.8633, "step": 3715 }, { "epoch": 0.4610522401933445, - "grad_norm": 0.2874768630812172, + "grad_norm": 0.1338213874344654, "learning_rate": 0.00013054760454413882, - "loss": 3.632, + "loss": 1.8727, "step": 3720 }, { "epoch": 0.4616719340645721, - "grad_norm": 0.2912578065906555, + "grad_norm": 0.13465341196540553, "learning_rate": 0.00013034154067574622, - "loss": 3.7148, + "loss": 1.8824, "step": 3725 }, { "epoch": 0.4622916279357997, - "grad_norm": 0.2710623247745548, + "grad_norm": 0.1312583538008533, "learning_rate": 0.00013013533480866273, - "loss": 3.7938, + "loss": 1.8133, "step": 3730 }, { "epoch": 0.4629113218070273, - "grad_norm": 0.26344004601414733, + "grad_norm": 0.13231035374316044, "learning_rate": 0.00012992898790793362, - "loss": 3.5797, + "loss": 1.8496, "step": 3735 }, { "epoch": 0.46353101567825494, - "grad_norm": 0.26207544083031686, + "grad_norm": 0.1322425189925765, "learning_rate": 0.00012972250093926436, - "loss": 3.6602, + "loss": 1.8512, "step": 3740 }, { "epoch": 0.46415070954948257, - "grad_norm": 0.2601250259564004, + "grad_norm": 0.12728720946766525, "learning_rate": 0.0001295158748690159, - "loss": 3.6859, + "loss": 1.8441, "step": 3745 }, { "epoch": 0.4647704034207102, - "grad_norm": 0.26401294757239785, + "grad_norm": 0.12825916363660014, "learning_rate": 0.0001293091106642001, - "loss": 3.7633, + "loss": 1.8645, "step": 3750 }, { "epoch": 0.4653900972919378, - "grad_norm": 0.26767917213945047, + "grad_norm": 0.13151182512874157, "learning_rate": 0.00012910220929247538, - "loss": 3.7188, + "loss": 1.8449, "step": 3755 }, { "epoch": 0.4660097911631654, - "grad_norm": 0.258331889535157, + "grad_norm": 0.12577699319145508, "learning_rate": 0.00012889517172214206, - "loss": 3.8008, + "loss": 1.8254, "step": 3760 }, { "epoch": 0.466629485034393, - "grad_norm": 0.2662816231079526, + "grad_norm": 0.13662024127159117, "learning_rate": 0.0001286879989221379, - "loss": 3.5742, + "loss": 1.8336, "step": 3765 }, { "epoch": 0.4672491789056206, - "grad_norm": 0.2678195712749907, + "grad_norm": 0.13740650410044333, "learning_rate": 0.0001284806918620335, - "loss": 3.518, + "loss": 1.8289, "step": 3770 }, { "epoch": 0.46786887277684824, - "grad_norm": 0.26956828977591457, + "grad_norm": 0.12670365930896482, "learning_rate": 0.00012827325151202782, - "loss": 3.6875, + "loss": 1.8402, "step": 3775 }, { "epoch": 0.46848856664807587, - "grad_norm": 0.2632140447372612, + "grad_norm": 0.13596871709025227, "learning_rate": 0.00012806567884294362, - "loss": 3.6727, + "loss": 1.8809, "step": 3780 }, { "epoch": 0.4691082605193035, - "grad_norm": 0.26622278914922576, + "grad_norm": 0.12839958268082616, "learning_rate": 0.00012785797482622294, - "loss": 3.7008, + "loss": 1.8414, "step": 3785 }, { "epoch": 0.46972795439053106, - "grad_norm": 0.25787244958734795, + "grad_norm": 0.13501786379676173, "learning_rate": 0.00012765014043392242, - "loss": 3.7172, + "loss": 1.8508, "step": 3790 }, { "epoch": 0.4703476482617587, - "grad_norm": 0.2628067137621567, + "grad_norm": 0.14517241168125267, "learning_rate": 0.00012744217663870902, - "loss": 3.6273, + "loss": 1.8797, "step": 3795 }, { "epoch": 0.4709673421329863, - "grad_norm": 0.26372203837092634, + "grad_norm": 0.14211490967215193, "learning_rate": 0.00012723408441385521, - "loss": 3.682, + "loss": 1.8613, "step": 3800 }, { "epoch": 0.4715870360042139, - "grad_norm": 0.2824894463232923, + "grad_norm": 0.12861242954967542, "learning_rate": 0.0001270258647332345, - "loss": 3.5516, + "loss": 1.8535, "step": 3805 }, { "epoch": 0.47220672987544154, - "grad_norm": 0.27690777294629687, + "grad_norm": 0.12940845574021922, "learning_rate": 0.00012681751857131693, - "loss": 3.693, + "loss": 1.7973, "step": 3810 }, { "epoch": 0.47282642374666917, - "grad_norm": 0.2819860304503665, + "grad_norm": 0.13214817236795431, "learning_rate": 0.00012660904690316445, - "loss": 3.7016, + "loss": 1.8492, "step": 3815 }, { "epoch": 0.47344611761789673, - "grad_norm": 0.26197810073423866, + "grad_norm": 0.12562431874962993, "learning_rate": 0.00012640045070442643, - "loss": 3.7711, + "loss": 1.8629, "step": 3820 }, { "epoch": 0.47406581148912436, - "grad_norm": 0.27307637087247627, + "grad_norm": 0.1373453111960608, "learning_rate": 0.000126191730951335, - "loss": 3.7219, + "loss": 1.8086, "step": 3825 }, { "epoch": 0.474685505360352, - "grad_norm": 0.28020444831757324, + "grad_norm": 0.12730725941472784, "learning_rate": 0.0001259828886207005, - "loss": 3.6328, + "loss": 1.85, "step": 3830 }, { "epoch": 0.4753051992315796, - "grad_norm": 0.27306251075030796, + "grad_norm": 0.14568143374443207, "learning_rate": 0.00012577392468990695, - "loss": 3.6797, + "loss": 1.9332, "step": 3835 }, { "epoch": 0.4759248931028072, - "grad_norm": 0.25355275857310394, + "grad_norm": 0.1294161453337625, "learning_rate": 0.00012556484013690763, - "loss": 3.6898, + "loss": 1.8328, "step": 3840 }, { "epoch": 0.47654458697403485, - "grad_norm": 0.2597476632713353, + "grad_norm": 0.1360420665603735, "learning_rate": 0.00012535563594022, - "loss": 3.7625, + "loss": 1.8281, "step": 3845 }, { "epoch": 0.47716428084526247, - "grad_norm": 0.2658878725226564, + "grad_norm": 0.13045423595487943, "learning_rate": 0.0001251463130789217, - "loss": 3.6867, + "loss": 1.8367, "step": 3850 }, { "epoch": 0.47778397471649003, - "grad_norm": 0.2605086032950418, + "grad_norm": 0.1293506708386949, "learning_rate": 0.0001249368725326457, - "loss": 3.7094, + "loss": 1.8082, "step": 3855 }, { "epoch": 0.47840366858771766, - "grad_norm": 0.2616579389175633, + "grad_norm": 0.1279784621009452, "learning_rate": 0.00012472731528157563, - "loss": 3.7898, + "loss": 1.843, "step": 3860 }, { "epoch": 0.4790233624589453, - "grad_norm": 0.2685603453669324, + "grad_norm": 0.13460853167180994, "learning_rate": 0.00012451764230644145, - "loss": 3.7, + "loss": 1.893, "step": 3865 }, { "epoch": 0.4796430563301729, - "grad_norm": 0.26551263915399026, + "grad_norm": 0.13623636515106394, "learning_rate": 0.0001243078545885145, - "loss": 3.7445, + "loss": 1.8594, "step": 3870 }, { "epoch": 0.4802627502014005, - "grad_norm": 0.26462924352305306, + "grad_norm": 0.13469126915119556, "learning_rate": 0.00012409795310960333, - "loss": 3.7461, + "loss": 1.8188, "step": 3875 }, { "epoch": 0.48088244407262815, - "grad_norm": 0.2885862355706348, + "grad_norm": 0.14728913841193927, "learning_rate": 0.00012388793885204875, - "loss": 3.7258, + "loss": 1.8496, "step": 3880 }, { "epoch": 0.4815021379438557, - "grad_norm": 0.2572423376845503, + "grad_norm": 0.1329784046218279, "learning_rate": 0.00012367781279871946, - "loss": 3.7477, + "loss": 1.7828, "step": 3885 }, { "epoch": 0.48212183181508333, - "grad_norm": 0.2575035889902655, + "grad_norm": 0.14450915407347387, "learning_rate": 0.00012346757593300733, - "loss": 3.7156, + "loss": 1.8199, "step": 3890 }, { "epoch": 0.48274152568631096, - "grad_norm": 0.26737117219004586, + "grad_norm": 0.13074669708751388, "learning_rate": 0.00012325722923882285, - "loss": 3.7125, + "loss": 1.8406, "step": 3895 }, { "epoch": 0.4833612195575386, - "grad_norm": 0.26778941850758253, + "grad_norm": 0.1320766633941541, "learning_rate": 0.00012304677370059047, - "loss": 3.7703, + "loss": 1.825, "step": 3900 }, { "epoch": 0.4839809134287662, - "grad_norm": 0.2617718410058238, + "grad_norm": 0.13382977679788616, "learning_rate": 0.00012283621030324403, - "loss": 3.6742, + "loss": 1.7426, "step": 3905 }, { "epoch": 0.4846006072999938, - "grad_norm": 0.27110705498999677, + "grad_norm": 0.1300774595042929, "learning_rate": 0.00012262554003222221, - "loss": 3.6625, + "loss": 1.8277, "step": 3910 }, { "epoch": 0.4852203011712214, - "grad_norm": 0.26656137927275286, + "grad_norm": 0.13823641067099687, "learning_rate": 0.00012241476387346386, - "loss": 3.5656, + "loss": 1.8344, "step": 3915 }, { "epoch": 0.485839995042449, - "grad_norm": 0.27327458650598496, + "grad_norm": 0.13309285522719982, "learning_rate": 0.00012220388281340328, - "loss": 3.6492, + "loss": 1.8332, "step": 3920 }, { "epoch": 0.48645968891367664, - "grad_norm": 0.26535475968509176, + "grad_norm": 0.13989344182618982, "learning_rate": 0.00012199289783896582, - "loss": 3.6297, + "loss": 1.8418, "step": 3925 }, { "epoch": 0.48707938278490426, - "grad_norm": 0.27391081465042744, + "grad_norm": 0.13908793914835565, "learning_rate": 0.0001217818099375631, - "loss": 3.75, + "loss": 1.7738, "step": 3930 }, { "epoch": 0.4876990766561319, - "grad_norm": 0.2706935971949415, + "grad_norm": 0.13203859661605985, "learning_rate": 0.00012157062009708847, - "loss": 3.6688, + "loss": 1.8012, "step": 3935 }, { "epoch": 0.4883187705273595, - "grad_norm": 0.26673132822964696, + "grad_norm": 0.1387991780525731, "learning_rate": 0.00012135932930591232, - "loss": 3.7383, + "loss": 1.8258, "step": 3940 }, { "epoch": 0.4889384643985871, - "grad_norm": 0.2587169856596499, + "grad_norm": 0.1272927886215849, "learning_rate": 0.00012114793855287749, - "loss": 3.7258, + "loss": 1.8207, "step": 3945 }, { "epoch": 0.4895581582698147, - "grad_norm": 0.27051352083466496, + "grad_norm": 0.13317685573409976, "learning_rate": 0.00012093644882729473, - "loss": 3.7766, + "loss": 1.8484, "step": 3950 }, { "epoch": 0.4901778521410423, - "grad_norm": 0.2846141346585085, + "grad_norm": 0.13060406777243355, "learning_rate": 0.0001207248611189378, - "loss": 3.7437, + "loss": 1.8062, "step": 3955 }, { "epoch": 0.49079754601226994, - "grad_norm": 0.2901661878259012, + "grad_norm": 0.1323653833316828, "learning_rate": 0.00012051317641803921, - "loss": 3.6555, + "loss": 1.868, "step": 3960 }, { "epoch": 0.49141723988349756, - "grad_norm": 0.25373316084020386, + "grad_norm": 0.13374731314053315, "learning_rate": 0.00012030139571528534, - "loss": 3.6492, + "loss": 1.8027, "step": 3965 }, { "epoch": 0.4920369337547252, - "grad_norm": 0.26797411021921486, + "grad_norm": 0.13117397580253184, "learning_rate": 0.00012008952000181175, - "loss": 3.7234, + "loss": 1.8105, "step": 3970 }, { "epoch": 0.4926566276259528, - "grad_norm": 0.2586687656902996, + "grad_norm": 0.13913815292173712, "learning_rate": 0.0001198775502691988, - "loss": 3.5828, + "loss": 1.9066, "step": 3975 }, { "epoch": 0.49327632149718037, - "grad_norm": 0.2769207433199797, + "grad_norm": 0.13254556995995362, "learning_rate": 0.00011966548750946678, - "loss": 3.693, + "loss": 1.7578, "step": 3980 }, { "epoch": 0.493896015368408, - "grad_norm": 0.2660860997146998, + "grad_norm": 0.1263645566258469, "learning_rate": 0.00011945333271507138, - "loss": 3.6289, + "loss": 1.8551, "step": 3985 }, { "epoch": 0.4945157092396356, - "grad_norm": 0.28692103057195106, + "grad_norm": 0.13848000567316143, "learning_rate": 0.00011924108687889899, - "loss": 3.6469, + "loss": 1.8434, "step": 3990 }, { "epoch": 0.49513540311086324, - "grad_norm": 0.3003655228076814, + "grad_norm": 0.1361787753395237, "learning_rate": 0.00011902875099426207, - "loss": 3.5742, + "loss": 1.8449, "step": 3995 }, { "epoch": 0.49575509698209086, - "grad_norm": 0.26679154810260447, + "grad_norm": 0.12990782377541138, "learning_rate": 0.00011881632605489457, - "loss": 3.732, + "loss": 1.898, "step": 4000 }, { "epoch": 0.4963747908533185, - "grad_norm": 0.2584591602474966, + "grad_norm": 0.13345131711305486, "learning_rate": 0.0001186038130549471, - "loss": 3.6531, + "loss": 1.798, "step": 4005 }, { "epoch": 0.49699448472454605, - "grad_norm": 0.26241847954669006, + "grad_norm": 0.13073403167785114, "learning_rate": 0.00011839121298898253, - "loss": 3.6906, + "loss": 1.9184, "step": 4010 }, { "epoch": 0.49761417859577367, - "grad_norm": 0.2698264200108138, + "grad_norm": 0.128726619507065, "learning_rate": 0.00011817852685197109, - "loss": 3.6711, + "loss": 1.7777, "step": 4015 }, { "epoch": 0.4982338724670013, - "grad_norm": 0.26601297976121263, + "grad_norm": 0.1402678002131599, "learning_rate": 0.00011796575563928591, - "loss": 3.6437, + "loss": 1.8352, "step": 4020 }, { "epoch": 0.4988535663382289, - "grad_norm": 0.2611501124428443, + "grad_norm": 0.1532780370824944, "learning_rate": 0.00011775290034669822, - "loss": 3.6219, + "loss": 1.8375, "step": 4025 }, { "epoch": 0.49947326020945654, - "grad_norm": 0.2740304561773694, + "grad_norm": 0.1428411670169983, "learning_rate": 0.00011753996197037272, - "loss": 3.7164, + "loss": 1.8383, "step": 4030 }, { "epoch": 0.5000929540806841, - "grad_norm": 0.2604912300476886, + "grad_norm": 0.12794764388496246, "learning_rate": 0.00011732694150686301, - "loss": 3.7711, + "loss": 1.8629, "step": 4035 }, { "epoch": 0.5007126479519117, - "grad_norm": 0.2639969283942867, + "grad_norm": 0.12573790381059152, "learning_rate": 0.00011711383995310681, - "loss": 3.6781, + "loss": 1.8379, "step": 4040 }, { "epoch": 0.5013323418231393, - "grad_norm": 0.26395125505247186, + "grad_norm": 0.13444917133351966, "learning_rate": 0.00011690065830642143, - "loss": 3.6398, + "loss": 1.7645, "step": 4045 }, { "epoch": 0.501952035694367, - "grad_norm": 0.26422182168550484, + "grad_norm": 0.14311239318297794, "learning_rate": 0.00011668739756449885, - "loss": 3.7203, + "loss": 1.7723, "step": 4050 }, { "epoch": 0.5025717295655946, - "grad_norm": 0.2617310020917149, + "grad_norm": 0.13642980498274354, "learning_rate": 0.00011647405872540138, - "loss": 3.6469, + "loss": 1.8195, "step": 4055 }, { "epoch": 0.5031914234368222, - "grad_norm": 0.27066930312905935, + "grad_norm": 0.12807575497597842, "learning_rate": 0.00011626064278755673, - "loss": 3.8062, + "loss": 1.7895, "step": 4060 }, { "epoch": 0.5038111173080498, - "grad_norm": 0.27961161598532136, + "grad_norm": 0.1357891940700086, "learning_rate": 0.00011604715074975347, - "loss": 3.6719, + "loss": 1.8449, "step": 4065 }, { "epoch": 0.5044308111792775, - "grad_norm": 0.2666222034516121, + "grad_norm": 0.12954313901516068, "learning_rate": 0.00011583358361113632, - "loss": 3.7719, + "loss": 1.9016, "step": 4070 }, { "epoch": 0.5050505050505051, - "grad_norm": 0.2676080885232622, + "grad_norm": 0.13833393699221658, "learning_rate": 0.00011561994237120148, - "loss": 3.7039, + "loss": 1.8691, "step": 4075 }, { "epoch": 0.5056701989217327, - "grad_norm": 0.26051227668479, + "grad_norm": 0.13271819169332488, "learning_rate": 0.00011540622802979187, - "loss": 3.6602, + "loss": 1.8602, "step": 4080 }, { "epoch": 0.5062898927929603, - "grad_norm": 0.2620984894937847, + "grad_norm": 0.13361629977356454, "learning_rate": 0.00011519244158709263, - "loss": 3.7961, + "loss": 1.8945, "step": 4085 }, { "epoch": 0.5069095866641878, - "grad_norm": 0.27682148528587996, + "grad_norm": 0.13835947674800525, "learning_rate": 0.00011497858404362631, - "loss": 3.7305, + "loss": 1.8945, "step": 4090 }, { "epoch": 0.5075292805354155, - "grad_norm": 0.26332627536351605, + "grad_norm": 0.13507741578680432, "learning_rate": 0.00011476465640024814, - "loss": 3.7531, + "loss": 1.8098, "step": 4095 }, { "epoch": 0.5081489744066431, - "grad_norm": 0.2733755117038867, + "grad_norm": 0.12961267041068253, "learning_rate": 0.00011455065965814148, - "loss": 3.6711, + "loss": 1.8758, "step": 4100 }, { "epoch": 0.5087686682778707, - "grad_norm": 0.2547409812758734, + "grad_norm": 0.1426487693561521, "learning_rate": 0.00011433659481881307, - "loss": 3.8086, + "loss": 1.8414, "step": 4105 }, { "epoch": 0.5093883621490983, - "grad_norm": 0.2571255669278216, + "grad_norm": 0.129533245275186, "learning_rate": 0.00011412246288408835, - "loss": 3.6516, + "loss": 1.8836, "step": 4110 }, { "epoch": 0.510008056020326, - "grad_norm": 0.2854681210247425, + "grad_norm": 0.13192948437358754, "learning_rate": 0.00011390826485610675, - "loss": 3.6461, + "loss": 1.8355, "step": 4115 }, { "epoch": 0.5106277498915536, - "grad_norm": 0.2559172024534188, + "grad_norm": 0.12792800610505192, "learning_rate": 0.000113694001737317, - "loss": 3.7508, + "loss": 1.884, "step": 4120 }, { "epoch": 0.5112474437627812, - "grad_norm": 0.25176439604382256, + "grad_norm": 0.12825945154342566, "learning_rate": 0.00011347967453047248, - "loss": 3.7828, + "loss": 1.8324, "step": 4125 }, { "epoch": 0.5118671376340088, - "grad_norm": 0.2655443073530328, + "grad_norm": 0.12864350361168767, "learning_rate": 0.00011326528423862653, - "loss": 3.782, + "loss": 1.8367, "step": 4130 }, { "epoch": 0.5124868315052364, - "grad_norm": 0.2660618678109569, + "grad_norm": 0.13080630072709626, "learning_rate": 0.00011305083186512765, - "loss": 3.6, + "loss": 1.8031, "step": 4135 }, { "epoch": 0.5131065253764641, - "grad_norm": 0.2566751684212444, + "grad_norm": 0.1334944999274583, "learning_rate": 0.00011283631841361499, - "loss": 3.7523, + "loss": 1.8277, "step": 4140 }, { "epoch": 0.5137262192476917, - "grad_norm": 0.2798916301465522, + "grad_norm": 0.13277800972691645, "learning_rate": 0.00011262174488801349, - "loss": 3.7398, + "loss": 1.868, "step": 4145 }, { "epoch": 0.5143459131189193, - "grad_norm": 0.2631319357716128, + "grad_norm": 0.13163204057888897, "learning_rate": 0.00011240711229252915, - "loss": 3.668, + "loss": 1.827, "step": 4150 }, { "epoch": 0.5149656069901468, - "grad_norm": 0.2573477794528783, + "grad_norm": 0.1294009876519144, "learning_rate": 0.00011219242163164457, - "loss": 3.6672, + "loss": 1.927, "step": 4155 }, { "epoch": 0.5155853008613744, - "grad_norm": 0.2696847838745472, + "grad_norm": 0.13220239163919237, "learning_rate": 0.000111977673910114, - "loss": 3.625, + "loss": 1.9047, "step": 4160 }, { "epoch": 0.5162049947326021, - "grad_norm": 0.25202944696465945, + "grad_norm": 0.132410040296461, "learning_rate": 0.00011176287013295879, - "loss": 3.7016, + "loss": 1.8996, "step": 4165 }, { "epoch": 0.5168246886038297, - "grad_norm": 0.2833311975297903, + "grad_norm": 0.13967410850834808, "learning_rate": 0.0001115480113054626, - "loss": 3.6531, + "loss": 1.8652, "step": 4170 }, { "epoch": 0.5174443824750573, - "grad_norm": 0.25698009648870623, + "grad_norm": 0.13812711555448426, "learning_rate": 0.00011133309843316669, - "loss": 3.5906, + "loss": 1.8859, "step": 4175 }, { "epoch": 0.5180640763462849, - "grad_norm": 0.2628386812811596, + "grad_norm": 0.1370329050816476, "learning_rate": 0.0001111181325218653, - "loss": 3.5805, + "loss": 1.7641, "step": 4180 }, { "epoch": 0.5186837702175126, - "grad_norm": 0.27066938247651806, + "grad_norm": 0.13608145182115125, "learning_rate": 0.00011090311457760094, - "loss": 3.6664, + "loss": 1.8043, "step": 4185 }, { "epoch": 0.5193034640887402, - "grad_norm": 0.27158751271221065, + "grad_norm": 0.134999506527859, "learning_rate": 0.0001106880456066595, - "loss": 3.5961, + "loss": 1.7937, "step": 4190 }, { "epoch": 0.5199231579599678, - "grad_norm": 0.2752422036574746, + "grad_norm": 0.12986685167360099, "learning_rate": 0.00011047292661556581, - "loss": 3.7078, + "loss": 1.8008, "step": 4195 }, { "epoch": 0.5205428518311954, - "grad_norm": 0.2641601154800112, + "grad_norm": 0.1344755084959134, "learning_rate": 0.00011025775861107874, - "loss": 3.7266, + "loss": 1.8262, "step": 4200 }, { "epoch": 0.521162545702423, - "grad_norm": 0.2665093713894038, + "grad_norm": 0.13044935784007144, "learning_rate": 0.00011004254260018648, - "loss": 3.7969, + "loss": 1.8434, "step": 4205 }, { "epoch": 0.5217822395736507, - "grad_norm": 0.27287397250323636, + "grad_norm": 0.13877368447780922, "learning_rate": 0.00010982727959010201, - "loss": 3.7023, + "loss": 1.8852, "step": 4210 }, { "epoch": 0.5224019334448782, - "grad_norm": 0.2550715980357899, + "grad_norm": 0.12973163353638237, "learning_rate": 0.00010961197058825817, - "loss": 3.6227, + "loss": 1.8676, "step": 4215 }, { "epoch": 0.5230216273161058, - "grad_norm": 0.2620907560500356, + "grad_norm": 0.13199230862465572, "learning_rate": 0.00010939661660230309, - "loss": 3.7016, + "loss": 1.8328, "step": 4220 }, { "epoch": 0.5236413211873334, - "grad_norm": 0.26858792866309283, + "grad_norm": 0.13376789718860482, "learning_rate": 0.00010918121864009543, - "loss": 3.7938, + "loss": 1.8305, "step": 4225 }, { "epoch": 0.524261015058561, - "grad_norm": 0.2800359718630766, + "grad_norm": 0.13923517431663307, "learning_rate": 0.00010896577770969964, - "loss": 3.6867, + "loss": 1.8027, "step": 4230 }, { "epoch": 0.5248807089297887, - "grad_norm": 0.2571645347786101, + "grad_norm": 0.13044002358446727, "learning_rate": 0.00010875029481938126, - "loss": 3.6711, + "loss": 1.9016, "step": 4235 }, { "epoch": 0.5255004028010163, - "grad_norm": 0.2608272606052285, + "grad_norm": 0.1299472760176014, "learning_rate": 0.00010853477097760222, - "loss": 3.6078, + "loss": 1.8285, "step": 4240 }, { "epoch": 0.5261200966722439, - "grad_norm": 0.26451451779524215, + "grad_norm": 0.13239585756853023, "learning_rate": 0.0001083192071930161, - "loss": 3.7781, + "loss": 1.8016, "step": 4245 }, { "epoch": 0.5267397905434715, - "grad_norm": 0.26351086041096267, + "grad_norm": 0.12416531787933346, "learning_rate": 0.00010810360447446335, - "loss": 3.7883, + "loss": 1.852, "step": 4250 }, { "epoch": 0.5273594844146992, - "grad_norm": 0.28221768245515083, + "grad_norm": 0.1379243677220307, "learning_rate": 0.00010788796383096676, - "loss": 3.6695, + "loss": 1.8152, "step": 4255 }, { "epoch": 0.5279791782859268, - "grad_norm": 0.2639447901747608, + "grad_norm": 0.12963488868452042, "learning_rate": 0.00010767228627172645, - "loss": 3.8016, + "loss": 1.8582, "step": 4260 }, { "epoch": 0.5285988721571544, - "grad_norm": 0.2679083471470788, + "grad_norm": 0.1328002434282655, "learning_rate": 0.00010745657280611552, - "loss": 3.6914, + "loss": 1.8641, "step": 4265 }, { "epoch": 0.529218566028382, - "grad_norm": 0.27311699346488155, + "grad_norm": 0.13106171306296183, "learning_rate": 0.00010724082444367485, - "loss": 3.7383, + "loss": 1.8445, "step": 4270 }, { "epoch": 0.5298382598996096, - "grad_norm": 0.27104674035190124, + "grad_norm": 0.13741774367232246, "learning_rate": 0.00010702504219410884, - "loss": 3.6758, + "loss": 1.8777, "step": 4275 }, { "epoch": 0.5304579537708372, - "grad_norm": 0.2681807255742581, + "grad_norm": 0.13098908649184046, "learning_rate": 0.00010680922706728041, - "loss": 3.6836, + "loss": 1.7973, "step": 4280 }, { "epoch": 0.5310776476420648, - "grad_norm": 0.2704989786880266, + "grad_norm": 0.13693787562687978, "learning_rate": 0.00010659338007320632, - "loss": 3.6414, + "loss": 1.9187, "step": 4285 }, { "epoch": 0.5316973415132924, - "grad_norm": 0.2656517152994718, + "grad_norm": 0.13261045311512387, "learning_rate": 0.00010637750222205253, - "loss": 3.7219, + "loss": 1.8277, "step": 4290 }, { "epoch": 0.53231703538452, - "grad_norm": 0.27147564166779353, + "grad_norm": 0.1361721967821798, "learning_rate": 0.00010616159452412939, - "loss": 3.6375, + "loss": 1.8301, "step": 4295 }, { "epoch": 0.5329367292557476, - "grad_norm": 0.26355038804511355, + "grad_norm": 0.1330959419108472, "learning_rate": 0.00010594565798988689, - "loss": 3.6742, + "loss": 1.8496, "step": 4300 }, { "epoch": 0.5335564231269753, - "grad_norm": 0.2670813512281771, + "grad_norm": 0.14146841910755462, "learning_rate": 0.00010572969362990998, - "loss": 3.6531, + "loss": 1.8523, "step": 4305 }, { "epoch": 0.5341761169982029, - "grad_norm": 0.26063814209234343, + "grad_norm": 0.1335707575288798, "learning_rate": 0.00010551370245491394, - "loss": 3.6086, + "loss": 1.8457, "step": 4310 }, { "epoch": 0.5347958108694305, - "grad_norm": 0.2844653210765238, + "grad_norm": 0.1337012914277915, "learning_rate": 0.00010529768547573942, - "loss": 3.682, + "loss": 1.8125, "step": 4315 }, { "epoch": 0.5354155047406581, - "grad_norm": 0.26651449146907025, + "grad_norm": 0.13666744181424356, "learning_rate": 0.00010508164370334787, - "loss": 3.7648, + "loss": 1.8461, "step": 4320 }, { "epoch": 0.5360351986118858, - "grad_norm": 0.29026927384527157, + "grad_norm": 0.13958679117203346, "learning_rate": 0.00010486557814881686, - "loss": 3.6891, + "loss": 1.8363, "step": 4325 }, { "epoch": 0.5366548924831134, - "grad_norm": 0.2951899420486929, + "grad_norm": 0.13607366667952386, "learning_rate": 0.00010464948982333504, - "loss": 3.4922, + "loss": 1.8137, "step": 4330 }, { "epoch": 0.537274586354341, - "grad_norm": 0.2681518077012756, + "grad_norm": 0.1350440016774684, "learning_rate": 0.00010443337973819791, - "loss": 3.6727, + "loss": 1.8121, "step": 4335 }, { "epoch": 0.5378942802255686, - "grad_norm": 0.2677952279703973, + "grad_norm": 0.1322464115487655, "learning_rate": 0.00010421724890480258, - "loss": 3.6484, + "loss": 1.8227, "step": 4340 }, { "epoch": 0.5385139740967961, - "grad_norm": 0.26992711864448443, + "grad_norm": 0.13809058095945276, "learning_rate": 0.00010400109833464338, - "loss": 3.6117, + "loss": 1.8594, "step": 4345 }, { "epoch": 0.5391336679680238, - "grad_norm": 0.27337921580576696, + "grad_norm": 0.13052069620811604, "learning_rate": 0.00010378492903930699, - "loss": 3.6383, + "loss": 1.825, "step": 4350 }, { "epoch": 0.5397533618392514, - "grad_norm": 0.2624992989486429, + "grad_norm": 0.14779827571036525, "learning_rate": 0.00010356874203046766, - "loss": 3.6789, + "loss": 1.8391, "step": 4355 }, { "epoch": 0.540373055710479, - "grad_norm": 0.27710421143802777, + "grad_norm": 0.12710789410858492, "learning_rate": 0.00010335253831988267, - "loss": 3.4875, + "loss": 1.8184, "step": 4360 }, { "epoch": 0.5409927495817066, - "grad_norm": 0.2826706906139555, + "grad_norm": 0.1285834215695178, "learning_rate": 0.00010313631891938736, - "loss": 3.6797, + "loss": 1.8633, "step": 4365 }, { "epoch": 0.5416124434529342, - "grad_norm": 0.2707904993711854, + "grad_norm": 0.13350900347998482, "learning_rate": 0.00010292008484089047, - "loss": 3.7258, + "loss": 1.8516, "step": 4370 }, { "epoch": 0.5422321373241619, - "grad_norm": 0.2734811974169758, + "grad_norm": 0.1320142499534893, "learning_rate": 0.0001027038370963695, - "loss": 3.8141, + "loss": 1.8543, "step": 4375 }, { "epoch": 0.5428518311953895, - "grad_norm": 0.26637771423373513, + "grad_norm": 0.13382411859073567, "learning_rate": 0.00010248757669786594, - "loss": 3.7797, + "loss": 1.8438, "step": 4380 }, { "epoch": 0.5434715250666171, - "grad_norm": 0.28036695867937245, + "grad_norm": 0.15096282450997914, "learning_rate": 0.00010227130465748045, - "loss": 3.6367, + "loss": 1.8844, "step": 4385 }, { "epoch": 0.5440912189378447, - "grad_norm": 0.2641150707890887, + "grad_norm": 0.13050426547539765, "learning_rate": 0.00010205502198736816, - "loss": 3.6641, + "loss": 1.7953, "step": 4390 }, { "epoch": 0.5447109128090724, - "grad_norm": 0.2814260608847561, + "grad_norm": 0.12786255907842825, "learning_rate": 0.00010183872969973396, - "loss": 3.7086, + "loss": 1.8191, "step": 4395 }, { "epoch": 0.5453306066803, - "grad_norm": 0.2682954596928813, + "grad_norm": 0.14113604167832816, "learning_rate": 0.00010162242880682776, - "loss": 3.5836, + "loss": 1.8598, "step": 4400 }, { "epoch": 0.5459503005515276, - "grad_norm": 0.27065092520145695, + "grad_norm": 0.1341328866770354, "learning_rate": 0.00010140612032093972, - "loss": 3.675, + "loss": 1.825, "step": 4405 }, { "epoch": 0.5465699944227551, - "grad_norm": 0.2698049008297175, + "grad_norm": 0.13380233821056914, "learning_rate": 0.00010118980525439559, - "loss": 3.6375, + "loss": 1.843, "step": 4410 }, { "epoch": 0.5471896882939827, - "grad_norm": 0.2542929379997657, + "grad_norm": 0.1343984620510971, "learning_rate": 0.00010097348461955186, - "loss": 3.7352, + "loss": 1.7957, "step": 4415 }, { "epoch": 0.5478093821652104, - "grad_norm": 0.2557245988208345, + "grad_norm": 0.1351500470677798, "learning_rate": 0.00010075715942879114, - "loss": 3.5656, + "loss": 1.8129, "step": 4420 }, { "epoch": 0.548429076036438, - "grad_norm": 0.27292418147518904, + "grad_norm": 0.13404154577268396, "learning_rate": 0.00010054083069451728, - "loss": 3.5719, + "loss": 1.8422, "step": 4425 }, { "epoch": 0.5490487699076656, - "grad_norm": 0.26425016532979473, + "grad_norm": 0.13166534889000214, "learning_rate": 0.00010032449942915072, - "loss": 3.6578, + "loss": 1.8156, "step": 4430 }, { "epoch": 0.5496684637788932, - "grad_norm": 0.27604141996830334, + "grad_norm": 0.13506639450706173, "learning_rate": 0.00010010816664512389, - "loss": 3.718, + "loss": 1.8012, "step": 4435 }, { "epoch": 0.5502881576501208, - "grad_norm": 0.289694464188095, + "grad_norm": 0.13182983003356816, "learning_rate": 9.989183335487615e-05, - "loss": 3.7164, + "loss": 1.8676, "step": 4440 }, { "epoch": 0.5509078515213485, - "grad_norm": 0.2600073920423731, + "grad_norm": 0.14142433786552008, "learning_rate": 9.96755005708493e-05, - "loss": 3.6734, + "loss": 1.8723, "step": 4445 }, { "epoch": 0.5515275453925761, - "grad_norm": 0.27299233498222497, + "grad_norm": 0.1334151262150459, "learning_rate": 9.945916930548276e-05, - "loss": 3.6578, + "loss": 1.9223, "step": 4450 }, { "epoch": 0.5521472392638037, - "grad_norm": 0.2825950975586584, + "grad_norm": 0.13047834012682685, "learning_rate": 9.924284057120888e-05, - "loss": 3.6187, + "loss": 1.8035, "step": 4455 }, { "epoch": 0.5527669331350313, - "grad_norm": 0.2690506554715704, + "grad_norm": 0.13551238503921087, "learning_rate": 9.902651538044814e-05, - "loss": 3.6023, + "loss": 1.8793, "step": 4460 }, { "epoch": 0.553386627006259, - "grad_norm": 0.2692063879106752, + "grad_norm": 0.13009663685025677, "learning_rate": 9.88101947456044e-05, - "loss": 3.657, + "loss": 1.8641, "step": 4465 }, { "epoch": 0.5540063208774865, - "grad_norm": 0.2760619915971685, + "grad_norm": 0.1360455529484941, "learning_rate": 9.859387967906033e-05, - "loss": 3.657, + "loss": 1.8391, "step": 4470 }, { "epoch": 0.5546260147487141, - "grad_norm": 0.2753779861348704, + "grad_norm": 0.1289716476183338, "learning_rate": 9.837757119317228e-05, - "loss": 3.6414, + "loss": 1.8742, "step": 4475 }, { "epoch": 0.5552457086199417, - "grad_norm": 0.27011777687757504, + "grad_norm": 0.13275590576368201, "learning_rate": 9.816127030026607e-05, - "loss": 3.8336, + "loss": 1.8613, "step": 4480 }, { "epoch": 0.5558654024911693, - "grad_norm": 0.2720905676563954, + "grad_norm": 0.14348224249868619, "learning_rate": 9.794497801263185e-05, - "loss": 3.6656, + "loss": 1.8391, "step": 4485 }, { "epoch": 0.556485096362397, - "grad_norm": 0.2666666192966718, + "grad_norm": 0.13464069972617826, "learning_rate": 9.772869534251956e-05, - "loss": 3.643, + "loss": 1.827, "step": 4490 }, { "epoch": 0.5571047902336246, - "grad_norm": 0.27442530437873, + "grad_norm": 0.13214411129137058, "learning_rate": 9.751242330213407e-05, - "loss": 3.5914, + "loss": 1.8836, "step": 4495 }, { "epoch": 0.5577244841048522, - "grad_norm": 0.2762518832157627, + "grad_norm": 0.13401560105396315, "learning_rate": 9.729616290363051e-05, - "loss": 3.6383, + "loss": 1.8488, "step": 4500 }, { "epoch": 0.5583441779760798, - "grad_norm": 0.2719838257171385, + "grad_norm": 0.13560522706619668, "learning_rate": 9.707991515910957e-05, - "loss": 3.757, + "loss": 1.8621, "step": 4505 }, { "epoch": 0.5589638718473074, - "grad_norm": 0.2665995820731799, + "grad_norm": 0.1378604398311812, "learning_rate": 9.686368108061268e-05, - "loss": 3.6352, + "loss": 1.9074, "step": 4510 }, { "epoch": 0.5595835657185351, - "grad_norm": 0.2793809708656953, + "grad_norm": 0.12967243553989305, "learning_rate": 9.664746168011734e-05, - "loss": 3.5305, + "loss": 1.8211, "step": 4515 }, { "epoch": 0.5602032595897627, - "grad_norm": 0.27282729383029186, + "grad_norm": 0.1273548051846235, "learning_rate": 9.643125796953233e-05, - "loss": 3.7062, + "loss": 1.8742, "step": 4520 }, { "epoch": 0.5608229534609903, - "grad_norm": 0.26852576393578903, + "grad_norm": 0.13392981702989407, "learning_rate": 9.621507096069304e-05, - "loss": 3.6648, + "loss": 1.8578, "step": 4525 }, { "epoch": 0.5614426473322179, - "grad_norm": 0.27059227818607917, + "grad_norm": 0.13085951763754228, "learning_rate": 9.599890166535665e-05, - "loss": 3.5891, + "loss": 1.8684, "step": 4530 }, { "epoch": 0.5620623412034454, - "grad_norm": 0.26232970416408835, + "grad_norm": 0.14167378893366092, "learning_rate": 9.578275109519744e-05, - "loss": 3.6594, + "loss": 1.893, "step": 4535 }, { "epoch": 0.5626820350746731, - "grad_norm": 0.27443731038719305, + "grad_norm": 0.13395690664276264, "learning_rate": 9.556662026180212e-05, - "loss": 3.5516, + "loss": 1.8559, "step": 4540 }, { "epoch": 0.5633017289459007, - "grad_norm": 0.2618687741186096, + "grad_norm": 0.13785153538830483, "learning_rate": 9.535051017666497e-05, - "loss": 3.6773, + "loss": 1.8742, "step": 4545 }, { "epoch": 0.5639214228171283, - "grad_norm": 0.2684216768813871, + "grad_norm": 0.13326090153428086, "learning_rate": 9.513442185118319e-05, - "loss": 3.7344, + "loss": 1.8723, "step": 4550 }, { "epoch": 0.5645411166883559, - "grad_norm": 0.28067208731324905, + "grad_norm": 0.1325737274991862, "learning_rate": 9.491835629665214e-05, - "loss": 3.6211, + "loss": 1.8371, "step": 4555 }, { "epoch": 0.5651608105595836, - "grad_norm": 0.2670591286203052, + "grad_norm": 0.13484451999521505, "learning_rate": 9.470231452426059e-05, - "loss": 3.5281, + "loss": 1.7844, "step": 4560 }, { "epoch": 0.5657805044308112, - "grad_norm": 0.26120176936520656, + "grad_norm": 0.13667668532140465, "learning_rate": 9.448629754508607e-05, - "loss": 3.7922, + "loss": 1.8711, "step": 4565 }, { "epoch": 0.5664001983020388, - "grad_norm": 0.2653916879077019, + "grad_norm": 0.13989769457594814, "learning_rate": 9.427030637009003e-05, - "loss": 3.7422, + "loss": 1.8859, "step": 4570 }, { "epoch": 0.5670198921732664, - "grad_norm": 0.27047334889618924, + "grad_norm": 0.12883812907785516, "learning_rate": 9.405434201011313e-05, - "loss": 3.6961, + "loss": 1.8988, "step": 4575 }, { "epoch": 0.567639586044494, - "grad_norm": 0.26233721834749346, + "grad_norm": 0.130205864302805, "learning_rate": 9.383840547587064e-05, - "loss": 3.7984, + "loss": 1.8863, "step": 4580 }, { "epoch": 0.5682592799157217, - "grad_norm": 0.2671699568255034, + "grad_norm": 0.1333369490609614, "learning_rate": 9.362249777794749e-05, - "loss": 3.6398, + "loss": 1.8434, "step": 4585 }, { "epoch": 0.5688789737869493, - "grad_norm": 0.3075923924701826, + "grad_norm": 0.13277669705453948, "learning_rate": 9.340661992679369e-05, - "loss": 3.675, + "loss": 1.8246, "step": 4590 }, { "epoch": 0.5694986676581769, - "grad_norm": 0.29037913669304627, + "grad_norm": 0.1388883320912154, "learning_rate": 9.319077293271961e-05, - "loss": 3.7156, + "loss": 1.8754, "step": 4595 }, { "epoch": 0.5701183615294044, - "grad_norm": 0.27370521659421365, + "grad_norm": 0.1320971868111603, "learning_rate": 9.297495780589118e-05, - "loss": 3.5, + "loss": 1.85, "step": 4600 }, { "epoch": 0.570738055400632, - "grad_norm": 0.25765783085364835, + "grad_norm": 0.13017211313722013, "learning_rate": 9.27591755563252e-05, - "loss": 3.5992, + "loss": 1.7656, "step": 4605 }, { "epoch": 0.5713577492718597, - "grad_norm": 0.26943210575764764, + "grad_norm": 0.129888126477682, "learning_rate": 9.254342719388454e-05, - "loss": 3.6242, + "loss": 1.7785, "step": 4610 }, { "epoch": 0.5719774431430873, - "grad_norm": 0.2745748208620268, + "grad_norm": 0.14770097601386062, "learning_rate": 9.232771372827356e-05, - "loss": 3.6227, + "loss": 1.877, "step": 4615 }, { "epoch": 0.5725971370143149, - "grad_norm": 0.26354715484573016, + "grad_norm": 0.1383845014722268, "learning_rate": 9.211203616903328e-05, - "loss": 3.5875, + "loss": 1.8395, "step": 4620 }, { "epoch": 0.5732168308855425, - "grad_norm": 0.2850816926028819, + "grad_norm": 0.12825268242494714, "learning_rate": 9.189639552553667e-05, - "loss": 3.6516, + "loss": 1.898, "step": 4625 }, { "epoch": 0.5738365247567702, - "grad_norm": 0.2727825297247378, + "grad_norm": 0.14095823402273389, "learning_rate": 9.168079280698391e-05, - "loss": 3.7016, + "loss": 1.8254, "step": 4630 }, { "epoch": 0.5744562186279978, - "grad_norm": 0.26533216629165046, + "grad_norm": 0.13138557864881723, "learning_rate": 9.146522902239781e-05, - "loss": 3.6914, + "loss": 1.8512, "step": 4635 }, { "epoch": 0.5750759124992254, - "grad_norm": 0.2759553912069646, + "grad_norm": 0.1275168924292215, "learning_rate": 9.124970518061877e-05, - "loss": 3.5375, + "loss": 1.8637, "step": 4640 }, { "epoch": 0.575695606370453, - "grad_norm": 0.2706374034574014, + "grad_norm": 0.1301661963978751, "learning_rate": 9.103422229030038e-05, - "loss": 3.7633, + "loss": 1.8434, "step": 4645 }, { "epoch": 0.5763153002416807, - "grad_norm": 0.2730578135363883, + "grad_norm": 0.13374425189552858, "learning_rate": 9.081878135990458e-05, - "loss": 3.7047, + "loss": 1.7465, "step": 4650 }, { "epoch": 0.5769349941129083, - "grad_norm": 0.25414420934326987, + "grad_norm": 0.13808982686111124, "learning_rate": 9.06033833976969e-05, - "loss": 3.7484, + "loss": 1.8422, "step": 4655 }, { "epoch": 0.5775546879841358, - "grad_norm": 0.256400849310976, + "grad_norm": 0.13447543310447194, "learning_rate": 9.038802941174187e-05, - "loss": 3.7687, + "loss": 1.7785, "step": 4660 }, { "epoch": 0.5781743818553634, - "grad_norm": 0.29634415084122473, + "grad_norm": 0.13300414692423948, "learning_rate": 9.017272040989804e-05, - "loss": 3.8375, + "loss": 1.8574, "step": 4665 }, { "epoch": 0.578794075726591, - "grad_norm": 0.27794653757814963, + "grad_norm": 0.13096590711343975, "learning_rate": 8.995745739981355e-05, - "loss": 3.7055, + "loss": 1.8609, "step": 4670 }, { "epoch": 0.5794137695978187, - "grad_norm": 0.27219582612115456, + "grad_norm": 0.13820508880678925, "learning_rate": 8.974224138892127e-05, - "loss": 3.807, + "loss": 1.8523, "step": 4675 }, { "epoch": 0.5800334634690463, - "grad_norm": 0.2717849560452208, + "grad_norm": 0.12765070621427313, "learning_rate": 8.952707338443418e-05, - "loss": 3.6008, + "loss": 1.8145, "step": 4680 }, { "epoch": 0.5806531573402739, - "grad_norm": 0.2689613705566208, + "grad_norm": 0.13872971817201188, "learning_rate": 8.931195439334048e-05, - "loss": 3.6508, + "loss": 1.8512, "step": 4685 }, { "epoch": 0.5812728512115015, - "grad_norm": 0.2679893379960508, + "grad_norm": 0.13654187286164757, "learning_rate": 8.90968854223991e-05, - "loss": 3.6219, + "loss": 1.8188, "step": 4690 }, { "epoch": 0.5818925450827291, - "grad_norm": 0.2720978068136715, + "grad_norm": 0.1358565435204054, "learning_rate": 8.888186747813473e-05, - "loss": 3.5727, + "loss": 1.8594, "step": 4695 }, { "epoch": 0.5825122389539568, - "grad_norm": 0.2790232242548787, + "grad_norm": 0.1361212793989369, "learning_rate": 8.866690156683332e-05, - "loss": 3.6594, + "loss": 1.825, "step": 4700 }, { "epoch": 0.5831319328251844, - "grad_norm": 0.2550213840737251, + "grad_norm": 0.1313054079631678, "learning_rate": 8.845198869453742e-05, - "loss": 3.7766, + "loss": 1.8617, "step": 4705 }, { "epoch": 0.583751626696412, - "grad_norm": 0.27335294108007446, + "grad_norm": 0.1340543850970045, "learning_rate": 8.823712986704121e-05, - "loss": 3.6156, + "loss": 1.7836, "step": 4710 }, { "epoch": 0.5843713205676396, - "grad_norm": 0.25931700709755595, + "grad_norm": 0.13217833814152594, "learning_rate": 8.802232608988604e-05, - "loss": 3.7563, + "loss": 1.832, "step": 4715 }, { "epoch": 0.5849910144388673, - "grad_norm": 0.26874870206277024, + "grad_norm": 0.14629328645282433, "learning_rate": 8.780757836835544e-05, - "loss": 3.6766, + "loss": 1.823, "step": 4720 }, { "epoch": 0.5856107083100948, - "grad_norm": 0.2707225242276941, + "grad_norm": 0.13494154906279057, "learning_rate": 8.759288770747087e-05, - "loss": 3.7711, + "loss": 1.8539, "step": 4725 }, { "epoch": 0.5862304021813224, - "grad_norm": 0.2651627038636793, + "grad_norm": 0.14440441221946163, "learning_rate": 8.737825511198654e-05, - "loss": 3.6562, + "loss": 1.8441, "step": 4730 }, { "epoch": 0.58685009605255, - "grad_norm": 0.2785945180431172, + "grad_norm": 0.13863154183602425, "learning_rate": 8.7163681586385e-05, - "loss": 3.6945, + "loss": 1.8242, "step": 4735 }, { "epoch": 0.5874697899237776, - "grad_norm": 0.2653182542622149, + "grad_norm": 0.1318855989670804, "learning_rate": 8.694916813487233e-05, - "loss": 3.632, + "loss": 1.834, "step": 4740 }, { "epoch": 0.5880894837950053, - "grad_norm": 0.26490316164430944, + "grad_norm": 0.13920618174796423, "learning_rate": 8.67347157613735e-05, - "loss": 3.65, + "loss": 1.7969, "step": 4745 }, { "epoch": 0.5887091776662329, - "grad_norm": 0.2709427564963539, + "grad_norm": 0.13767154490310451, "learning_rate": 8.652032546952754e-05, - "loss": 3.693, + "loss": 1.807, "step": 4750 }, { "epoch": 0.5893288715374605, - "grad_norm": 0.2748718169619932, + "grad_norm": 0.1354204158207126, "learning_rate": 8.630599826268303e-05, - "loss": 3.757, + "loss": 1.9137, "step": 4755 }, { "epoch": 0.5899485654086881, - "grad_norm": 0.30416816756324155, + "grad_norm": 0.13637608173257612, "learning_rate": 8.609173514389328e-05, - "loss": 3.618, + "loss": 1.8375, "step": 4760 }, { "epoch": 0.5905682592799157, - "grad_norm": 0.2739395450601053, + "grad_norm": 0.1392456473512201, "learning_rate": 8.587753711591166e-05, - "loss": 3.7812, + "loss": 1.7957, "step": 4765 }, { "epoch": 0.5911879531511434, - "grad_norm": 0.26917898434493065, + "grad_norm": 0.14079378358935743, "learning_rate": 8.566340518118695e-05, - "loss": 3.7789, + "loss": 1.8789, "step": 4770 }, { "epoch": 0.591807647022371, - "grad_norm": 0.26497623397022974, + "grad_norm": 0.1326673419979531, "learning_rate": 8.544934034185854e-05, - "loss": 3.6297, + "loss": 1.8301, "step": 4775 }, { "epoch": 0.5924273408935986, - "grad_norm": 0.257955707351069, + "grad_norm": 0.1324300142166372, "learning_rate": 8.523534359975189e-05, - "loss": 3.6953, + "loss": 1.8129, "step": 4780 }, { "epoch": 0.5930470347648262, - "grad_norm": 0.26226027788634904, + "grad_norm": 0.13436159136571374, "learning_rate": 8.502141595637371e-05, - "loss": 3.6797, + "loss": 1.8234, "step": 4785 }, { "epoch": 0.5936667286360537, - "grad_norm": 0.2695043028263795, + "grad_norm": 0.1340824171889328, "learning_rate": 8.480755841290736e-05, - "loss": 3.6016, + "loss": 1.8719, "step": 4790 }, { "epoch": 0.5942864225072814, - "grad_norm": 0.2710516349098267, + "grad_norm": 0.13942656890716446, "learning_rate": 8.459377197020813e-05, - "loss": 3.7289, + "loss": 1.8418, "step": 4795 }, { "epoch": 0.594906116378509, - "grad_norm": 0.25498187017040835, + "grad_norm": 0.1375776284383645, "learning_rate": 8.438005762879856e-05, - "loss": 3.7164, + "loss": 1.8352, "step": 4800 }, { "epoch": 0.5955258102497366, - "grad_norm": 0.2662837360767922, + "grad_norm": 0.13817749230266843, "learning_rate": 8.416641638886369e-05, - "loss": 3.582, + "loss": 1.8715, "step": 4805 }, { "epoch": 0.5961455041209642, - "grad_norm": 0.2700127101757652, + "grad_norm": 0.13644813853970703, "learning_rate": 8.395284925024654e-05, - "loss": 3.5852, + "loss": 1.8141, "step": 4810 }, { "epoch": 0.5967651979921919, - "grad_norm": 0.2678327716698091, + "grad_norm": 0.14009468198120875, "learning_rate": 8.373935721244329e-05, - "loss": 3.8133, + "loss": 1.8398, "step": 4815 }, { "epoch": 0.5973848918634195, - "grad_norm": 0.26206854136484303, + "grad_norm": 0.13539696078697683, "learning_rate": 8.352594127459865e-05, - "loss": 3.7641, + "loss": 1.7586, "step": 4820 }, { "epoch": 0.5980045857346471, - "grad_norm": 0.27440252125300807, + "grad_norm": 0.1366948032582978, "learning_rate": 8.331260243550119e-05, - "loss": 3.6336, + "loss": 1.8266, "step": 4825 }, { "epoch": 0.5986242796058747, - "grad_norm": 0.27610081992251445, + "grad_norm": 0.12944284870629252, "learning_rate": 8.309934169357862e-05, - "loss": 3.6305, + "loss": 1.8117, "step": 4830 }, { "epoch": 0.5992439734771023, - "grad_norm": 0.27199189429278114, + "grad_norm": 0.13536533441301316, "learning_rate": 8.28861600468932e-05, - "loss": 3.7133, + "loss": 1.7957, "step": 4835 }, { "epoch": 0.59986366734833, - "grad_norm": 0.26713808906578573, + "grad_norm": 0.13946065741059344, "learning_rate": 8.267305849313702e-05, - "loss": 3.85, + "loss": 1.8605, "step": 4840 }, { "epoch": 0.6004833612195576, - "grad_norm": 0.2646602068441273, + "grad_norm": 0.1389191099799506, "learning_rate": 8.246003802962732e-05, - "loss": 3.7367, + "loss": 1.7836, "step": 4845 }, { "epoch": 0.6011030550907851, - "grad_norm": 0.2834428909362969, + "grad_norm": 0.1379459598325412, "learning_rate": 8.224709965330182e-05, - "loss": 3.6453, + "loss": 1.9152, "step": 4850 }, { "epoch": 0.6017227489620127, - "grad_norm": 0.26601716548743887, + "grad_norm": 0.13789049940846776, "learning_rate": 8.203424436071413e-05, - "loss": 3.6734, + "loss": 1.8289, "step": 4855 }, { "epoch": 0.6023424428332403, - "grad_norm": 0.27536953883017307, + "grad_norm": 0.13702308254276288, "learning_rate": 8.182147314802892e-05, - "loss": 3.6914, + "loss": 1.8297, "step": 4860 }, { "epoch": 0.602962136704468, - "grad_norm": 0.2863152536982749, + "grad_norm": 0.13119036631588685, "learning_rate": 8.160878701101751e-05, - "loss": 3.5758, + "loss": 1.8105, "step": 4865 }, { "epoch": 0.6035818305756956, - "grad_norm": 0.2597341825241679, + "grad_norm": 0.14154762542667904, "learning_rate": 8.139618694505292e-05, - "loss": 3.7016, + "loss": 1.7879, "step": 4870 }, { "epoch": 0.6042015244469232, - "grad_norm": 0.2677923848745103, + "grad_norm": 0.1353547074450921, "learning_rate": 8.118367394510544e-05, - "loss": 3.543, + "loss": 1.8203, "step": 4875 }, { "epoch": 0.6048212183181508, - "grad_norm": 0.27300541950447416, + "grad_norm": 0.1268917466267913, "learning_rate": 8.097124900573795e-05, - "loss": 3.6367, + "loss": 1.7988, "step": 4880 }, { "epoch": 0.6054409121893785, - "grad_norm": 0.2775292613482833, + "grad_norm": 0.1307148187671773, "learning_rate": 8.075891312110104e-05, - "loss": 3.6172, + "loss": 1.8664, "step": 4885 }, { "epoch": 0.6060606060606061, - "grad_norm": 0.271824184100202, + "grad_norm": 0.1380952234229187, "learning_rate": 8.054666728492864e-05, - "loss": 3.7484, + "loss": 1.8156, "step": 4890 }, { "epoch": 0.6066802999318337, - "grad_norm": 0.26384159534759966, + "grad_norm": 0.13996339797842827, "learning_rate": 8.033451249053324e-05, - "loss": 3.5281, + "loss": 1.8059, "step": 4895 }, { "epoch": 0.6072999938030613, - "grad_norm": 0.27157171737048585, + "grad_norm": 0.13226203390711466, "learning_rate": 8.01224497308012e-05, - "loss": 3.6742, + "loss": 1.9059, "step": 4900 }, { "epoch": 0.6079196876742889, - "grad_norm": 0.25821685280582, + "grad_norm": 0.13671539535832183, "learning_rate": 7.991047999818825e-05, - "loss": 3.7383, + "loss": 1.8492, "step": 4905 }, { "epoch": 0.6085393815455166, - "grad_norm": 0.2659945632497991, + "grad_norm": 0.13858915396512983, "learning_rate": 7.969860428471472e-05, - "loss": 3.6859, + "loss": 1.8258, "step": 4910 }, { "epoch": 0.6091590754167441, - "grad_norm": 0.2701267645891113, + "grad_norm": 0.13457679501799066, "learning_rate": 7.948682358196081e-05, - "loss": 3.5867, + "loss": 1.8109, "step": 4915 }, { "epoch": 0.6097787692879717, - "grad_norm": 0.2764927840431044, + "grad_norm": 0.13773955152308393, "learning_rate": 7.927513888106222e-05, - "loss": 3.8344, + "loss": 1.868, "step": 4920 }, { "epoch": 0.6103984631591993, - "grad_norm": 0.2714043191480644, + "grad_norm": 0.13608898492240817, "learning_rate": 7.90635511727053e-05, - "loss": 3.6148, + "loss": 1.8598, "step": 4925 }, { "epoch": 0.611018157030427, - "grad_norm": 0.2609093098453354, + "grad_norm": 0.13594465543273473, "learning_rate": 7.88520614471225e-05, - "loss": 3.6852, + "loss": 1.823, "step": 4930 }, { "epoch": 0.6116378509016546, - "grad_norm": 0.2674835719736552, + "grad_norm": 0.13620784093910132, "learning_rate": 7.864067069408773e-05, - "loss": 3.6867, + "loss": 1.8395, "step": 4935 }, { "epoch": 0.6122575447728822, - "grad_norm": 0.26158771055150254, + "grad_norm": 0.13036549008869344, "learning_rate": 7.842937990291157e-05, - "loss": 3.6469, + "loss": 1.7992, "step": 4940 }, { "epoch": 0.6128772386441098, - "grad_norm": 0.26450556985138535, + "grad_norm": 0.14264302125929879, "learning_rate": 7.821819006243691e-05, - "loss": 3.6164, + "loss": 1.8539, "step": 4945 }, { "epoch": 0.6134969325153374, - "grad_norm": 0.25698316515907754, + "grad_norm": 0.14528351932829806, "learning_rate": 7.80071021610342e-05, - "loss": 3.7086, + "loss": 1.9066, "step": 4950 }, { "epoch": 0.6141166263865651, - "grad_norm": 0.27112001318405343, + "grad_norm": 0.13669847537271232, "learning_rate": 7.779611718659674e-05, - "loss": 3.7711, + "loss": 1.8215, "step": 4955 }, { "epoch": 0.6147363202577927, - "grad_norm": 0.26272113575637257, + "grad_norm": 0.13166972553564527, "learning_rate": 7.758523612653617e-05, - "loss": 3.6203, + "loss": 1.7715, "step": 4960 }, { "epoch": 0.6153560141290203, - "grad_norm": 0.267094681815031, + "grad_norm": 0.13422436268794732, "learning_rate": 7.737445996777784e-05, - "loss": 3.7563, + "loss": 1.8004, "step": 4965 }, { "epoch": 0.6159757080002479, - "grad_norm": 0.26865139865456006, + "grad_norm": 0.13621849103473319, "learning_rate": 7.716378969675599e-05, - "loss": 3.6984, + "loss": 1.834, "step": 4970 }, { "epoch": 0.6165954018714755, - "grad_norm": 0.2656146898765198, + "grad_norm": 0.1396825107082279, "learning_rate": 7.695322629940957e-05, - "loss": 3.5805, + "loss": 1.8137, "step": 4975 }, { "epoch": 0.6172150957427031, - "grad_norm": 0.27153591932306037, + "grad_norm": 0.14110952896496498, "learning_rate": 7.674277076117718e-05, - "loss": 3.6555, + "loss": 1.8672, "step": 4980 }, { "epoch": 0.6178347896139307, - "grad_norm": 0.2639320933604655, + "grad_norm": 0.13523516678932618, "learning_rate": 7.653242406699267e-05, - "loss": 3.6633, + "loss": 1.8195, "step": 4985 }, { "epoch": 0.6184544834851583, - "grad_norm": 0.2827465508666598, + "grad_norm": 0.13297527973482937, "learning_rate": 7.632218720128056e-05, - "loss": 3.5836, + "loss": 1.8781, "step": 4990 }, { "epoch": 0.6190741773563859, - "grad_norm": 0.25845031918040845, + "grad_norm": 0.13350350195977134, "learning_rate": 7.611206114795126e-05, - "loss": 3.6258, + "loss": 1.8102, "step": 4995 }, { "epoch": 0.6196938712276135, - "grad_norm": 0.27017652307124396, + "grad_norm": 0.13726854441704386, "learning_rate": 7.590204689039671e-05, - "loss": 3.7398, + "loss": 1.8469, "step": 5000 }, { "epoch": 0.6203135650988412, - "grad_norm": 0.263829705804654, + "grad_norm": 0.12880233502711264, "learning_rate": 7.569214541148551e-05, - "loss": 3.7102, + "loss": 1.8051, "step": 5005 }, { "epoch": 0.6209332589700688, - "grad_norm": 0.272388249038324, + "grad_norm": 0.14314377591100924, "learning_rate": 7.548235769355858e-05, - "loss": 3.7344, + "loss": 1.8531, "step": 5010 }, { "epoch": 0.6215529528412964, - "grad_norm": 0.2740607682775529, + "grad_norm": 0.1398214144314476, "learning_rate": 7.527268471842436e-05, - "loss": 3.6148, + "loss": 1.8633, "step": 5015 }, { "epoch": 0.622172646712524, - "grad_norm": 0.261982458328612, + "grad_norm": 0.1422631562696818, "learning_rate": 7.506312746735432e-05, - "loss": 3.6453, + "loss": 1.8121, "step": 5020 }, { "epoch": 0.6227923405837517, - "grad_norm": 0.30048336140769255, + "grad_norm": 0.13683576892585453, "learning_rate": 7.48536869210783e-05, - "loss": 3.6078, + "loss": 1.8523, "step": 5025 }, { "epoch": 0.6234120344549793, - "grad_norm": 0.2605099143553377, + "grad_norm": 0.13252236443713647, "learning_rate": 7.464436405978002e-05, - "loss": 3.7477, + "loss": 1.8219, "step": 5030 }, { "epoch": 0.6240317283262069, - "grad_norm": 0.2783089384043496, + "grad_norm": 0.13856901697884766, "learning_rate": 7.44351598630924e-05, - "loss": 3.6391, + "loss": 1.8586, "step": 5035 }, { "epoch": 0.6246514221974344, - "grad_norm": 0.269056158083971, + "grad_norm": 0.13467211382407307, "learning_rate": 7.422607531009302e-05, - "loss": 3.7281, + "loss": 1.8246, "step": 5040 }, { "epoch": 0.625271116068662, - "grad_norm": 0.2719744983748762, + "grad_norm": 0.13455069844136516, "learning_rate": 7.401711137929955e-05, - "loss": 3.7648, + "loss": 1.8809, "step": 5045 }, { "epoch": 0.6258908099398897, - "grad_norm": 0.27808306980704667, + "grad_norm": 0.12705255181918326, "learning_rate": 7.380826904866504e-05, - "loss": 3.6828, + "loss": 1.8727, "step": 5050 }, { "epoch": 0.6265105038111173, - "grad_norm": 0.2785298104106332, + "grad_norm": 0.14168378260835907, "learning_rate": 7.35995492955736e-05, - "loss": 3.6023, + "loss": 1.8375, "step": 5055 }, { "epoch": 0.6271301976823449, - "grad_norm": 0.25365798748393725, + "grad_norm": 0.12904228750519411, "learning_rate": 7.339095309683557e-05, - "loss": 3.7969, + "loss": 1.8516, "step": 5060 }, { "epoch": 0.6277498915535725, - "grad_norm": 0.258311153426152, + "grad_norm": 0.12951799383226725, "learning_rate": 7.31824814286831e-05, - "loss": 3.7062, + "loss": 1.827, "step": 5065 }, { "epoch": 0.6283695854248001, - "grad_norm": 0.2681029758178154, + "grad_norm": 0.1425730592753484, "learning_rate": 7.297413526676553e-05, - "loss": 3.6703, + "loss": 1.7684, "step": 5070 }, { "epoch": 0.6289892792960278, - "grad_norm": 0.25425870016804963, + "grad_norm": 0.13509911728359397, "learning_rate": 7.276591558614482e-05, - "loss": 3.7852, + "loss": 1.8562, "step": 5075 }, { "epoch": 0.6296089731672554, - "grad_norm": 0.28200262079953675, + "grad_norm": 0.12664229753196302, "learning_rate": 7.2557823361291e-05, - "loss": 3.7766, + "loss": 1.8227, "step": 5080 }, { "epoch": 0.630228667038483, - "grad_norm": 0.2602859160465504, + "grad_norm": 0.1351445427240646, "learning_rate": 7.23498595660776e-05, - "loss": 3.6867, + "loss": 1.8434, "step": 5085 }, { "epoch": 0.6308483609097106, - "grad_norm": 0.2705199490904154, + "grad_norm": 0.13550229832786473, "learning_rate": 7.21420251737771e-05, - "loss": 3.75, + "loss": 1.8648, "step": 5090 }, { "epoch": 0.6314680547809383, - "grad_norm": 0.25985423252580486, + "grad_norm": 0.13783515484834394, "learning_rate": 7.19343211570564e-05, - "loss": 3.7039, + "loss": 1.8687, "step": 5095 }, { "epoch": 0.6320877486521659, - "grad_norm": 0.26575649360663545, + "grad_norm": 0.13698632676880806, "learning_rate": 7.172674848797219e-05, - "loss": 3.7219, + "loss": 1.7918, "step": 5100 }, { "epoch": 0.6327074425233934, - "grad_norm": 0.269094262766888, + "grad_norm": 0.13469212714793175, "learning_rate": 7.151930813796655e-05, - "loss": 3.6102, + "loss": 1.8109, "step": 5105 }, { "epoch": 0.633327136394621, - "grad_norm": 0.26672713090869327, + "grad_norm": 0.13371221943648842, "learning_rate": 7.131200107786214e-05, - "loss": 3.5633, + "loss": 1.8078, "step": 5110 }, { "epoch": 0.6339468302658486, - "grad_norm": 0.2676506827919143, + "grad_norm": 0.13978406533114363, "learning_rate": 7.110482827785796e-05, - "loss": 3.6844, + "loss": 1.7805, "step": 5115 }, { "epoch": 0.6345665241370763, - "grad_norm": 0.2680216210875524, + "grad_norm": 0.14360037321686958, "learning_rate": 7.089779070752463e-05, - "loss": 3.8469, + "loss": 1.8004, "step": 5120 }, { "epoch": 0.6351862180083039, - "grad_norm": 0.27889275766782334, + "grad_norm": 0.1441559480748562, "learning_rate": 7.069088933579988e-05, - "loss": 3.5789, + "loss": 1.816, "step": 5125 }, { "epoch": 0.6358059118795315, - "grad_norm": 0.27602158130456417, + "grad_norm": 0.13537029010980783, "learning_rate": 7.048412513098412e-05, - "loss": 3.6328, + "loss": 1.8727, "step": 5130 }, { "epoch": 0.6364256057507591, - "grad_norm": 0.27748934261100744, + "grad_norm": 0.1394840279411983, "learning_rate": 7.027749906073564e-05, - "loss": 3.5906, + "loss": 1.7949, "step": 5135 }, { "epoch": 0.6370452996219867, - "grad_norm": 0.2573107306456965, + "grad_norm": 0.14014978043715506, "learning_rate": 7.007101209206639e-05, - "loss": 3.6711, + "loss": 1.8586, "step": 5140 }, { "epoch": 0.6376649934932144, - "grad_norm": 0.2781670907626585, + "grad_norm": 0.13162962388412522, "learning_rate": 6.98646651913373e-05, - "loss": 3.7609, + "loss": 1.8465, "step": 5145 }, { "epoch": 0.638284687364442, - "grad_norm": 0.2871360053272032, + "grad_norm": 0.14382551661597553, "learning_rate": 6.965845932425377e-05, - "loss": 3.6289, + "loss": 1.866, "step": 5150 }, { "epoch": 0.6389043812356696, - "grad_norm": 0.28246011585382963, + "grad_norm": 0.13458729772042977, "learning_rate": 6.945239545586117e-05, - "loss": 3.5617, + "loss": 1.8449, "step": 5155 }, { "epoch": 0.6395240751068972, - "grad_norm": 0.27900694670602694, + "grad_norm": 0.13804106195535212, "learning_rate": 6.924647455054036e-05, - "loss": 3.6117, + "loss": 1.7883, "step": 5160 }, { "epoch": 0.6401437689781249, - "grad_norm": 0.2723062853542401, + "grad_norm": 0.13515409775532647, "learning_rate": 6.9040697572003e-05, - "loss": 3.7, + "loss": 1.8359, "step": 5165 }, { "epoch": 0.6407634628493524, - "grad_norm": 0.2914722506156454, + "grad_norm": 0.13624779550221264, "learning_rate": 6.883506548328724e-05, - "loss": 3.6047, + "loss": 1.8066, "step": 5170 }, { "epoch": 0.64138315672058, - "grad_norm": 0.25979291503704416, + "grad_norm": 0.1404713378731896, "learning_rate": 6.862957924675323e-05, - "loss": 3.6648, + "loss": 1.8582, "step": 5175 }, { "epoch": 0.6420028505918076, - "grad_norm": 0.2759588390942882, + "grad_norm": 0.13353167504613667, "learning_rate": 6.842423982407842e-05, - "loss": 3.8023, + "loss": 1.843, "step": 5180 }, { "epoch": 0.6426225444630352, - "grad_norm": 0.26539685236538607, + "grad_norm": 0.13711887655413635, "learning_rate": 6.821904817625326e-05, - "loss": 3.7352, + "loss": 1.8637, "step": 5185 }, { "epoch": 0.6432422383342629, - "grad_norm": 0.28057212369468476, + "grad_norm": 0.13286338586049254, "learning_rate": 6.801400526357647e-05, - "loss": 3.6742, + "loss": 1.8516, "step": 5190 }, { "epoch": 0.6438619322054905, - "grad_norm": 0.25859286944728005, + "grad_norm": 0.1362928199879506, "learning_rate": 6.78091120456508e-05, - "loss": 3.6766, + "loss": 1.798, "step": 5195 }, { "epoch": 0.6444816260767181, - "grad_norm": 0.2811565662064264, + "grad_norm": 0.13578384732652682, "learning_rate": 6.760436948137843e-05, - "loss": 3.8312, + "loss": 1.8605, "step": 5200 }, { "epoch": 0.6451013199479457, - "grad_norm": 0.2848133060880273, + "grad_norm": 0.13894639333958217, "learning_rate": 6.739977852895647e-05, - "loss": 3.6781, + "loss": 1.8391, "step": 5205 }, { "epoch": 0.6457210138191734, - "grad_norm": 0.26811279612194816, + "grad_norm": 0.1444595514101014, "learning_rate": 6.719534014587243e-05, - "loss": 3.6383, + "loss": 1.8305, "step": 5210 }, { "epoch": 0.646340707690401, - "grad_norm": 0.2690681612279308, + "grad_norm": 0.129775059828516, "learning_rate": 6.699105528889988e-05, - "loss": 3.6758, + "loss": 1.8738, "step": 5215 }, { "epoch": 0.6469604015616286, - "grad_norm": 0.2739057803867215, + "grad_norm": 0.13988858403098833, "learning_rate": 6.678692491409378e-05, - "loss": 3.682, + "loss": 1.8102, "step": 5220 }, { "epoch": 0.6475800954328562, - "grad_norm": 0.2641595804456671, + "grad_norm": 0.1344900153408345, "learning_rate": 6.658294997678621e-05, - "loss": 3.8078, + "loss": 1.8879, "step": 5225 }, { "epoch": 0.6481997893040837, - "grad_norm": 0.25907535741498083, + "grad_norm": 0.13501587523263156, "learning_rate": 6.637913143158175e-05, - "loss": 3.6281, + "loss": 1.823, "step": 5230 }, { "epoch": 0.6488194831753114, - "grad_norm": 0.26304684862546324, + "grad_norm": 0.1322718696914636, "learning_rate": 6.617547023235309e-05, - "loss": 3.7375, + "loss": 1.8813, "step": 5235 }, { "epoch": 0.649439177046539, - "grad_norm": 0.2563673164937869, + "grad_norm": 0.13942300547170608, "learning_rate": 6.597196733223651e-05, - "loss": 3.6063, + "loss": 1.8574, "step": 5240 }, { "epoch": 0.6500588709177666, - "grad_norm": 0.2746685068321447, + "grad_norm": 0.14428546074299917, "learning_rate": 6.576862368362747e-05, - "loss": 3.6219, + "loss": 1.8062, "step": 5245 }, { "epoch": 0.6506785647889942, - "grad_norm": 0.26941330753939924, + "grad_norm": 0.13480317318245413, "learning_rate": 6.556544023817613e-05, - "loss": 3.6773, + "loss": 1.9059, "step": 5250 }, { "epoch": 0.6512982586602218, - "grad_norm": 0.3084803609497407, + "grad_norm": 0.1312291352484845, "learning_rate": 6.536241794678288e-05, - "loss": 3.6852, + "loss": 1.8102, "step": 5255 }, { "epoch": 0.6519179525314495, - "grad_norm": 0.2681737705088487, + "grad_norm": 0.13249440712367896, "learning_rate": 6.515955775959394e-05, - "loss": 3.7117, + "loss": 1.8719, "step": 5260 }, { "epoch": 0.6525376464026771, - "grad_norm": 0.27332131990349773, + "grad_norm": 0.13525668669855212, "learning_rate": 6.495686062599684e-05, - "loss": 3.6141, + "loss": 1.7945, "step": 5265 }, { "epoch": 0.6531573402739047, - "grad_norm": 0.26817048963551954, + "grad_norm": 0.13953894070499534, "learning_rate": 6.475432749461607e-05, - "loss": 3.757, + "loss": 1.8211, "step": 5270 }, { "epoch": 0.6537770341451323, - "grad_norm": 0.2912555620129492, + "grad_norm": 0.13339702671582024, "learning_rate": 6.455195931330855e-05, - "loss": 3.7703, + "loss": 1.8375, "step": 5275 }, { "epoch": 0.65439672801636, - "grad_norm": 0.2715643651310061, + "grad_norm": 0.1308872237787671, "learning_rate": 6.43497570291592e-05, - "loss": 3.6313, + "loss": 1.8645, "step": 5280 }, { "epoch": 0.6550164218875876, - "grad_norm": 0.25897755858739324, + "grad_norm": 0.13823305004742287, "learning_rate": 6.414772158847661e-05, - "loss": 3.7648, + "loss": 1.8766, "step": 5285 }, { "epoch": 0.6556361157588152, - "grad_norm": 0.27462922238791165, + "grad_norm": 0.1399613751358962, "learning_rate": 6.394585393678851e-05, - "loss": 3.5742, + "loss": 1.8031, "step": 5290 }, { "epoch": 0.6562558096300427, - "grad_norm": 0.27731728573308645, + "grad_norm": 0.13564743794929152, "learning_rate": 6.374415501883741e-05, - "loss": 3.6219, + "loss": 1.7996, "step": 5295 }, { "epoch": 0.6568755035012703, - "grad_norm": 0.26603376474150264, + "grad_norm": 0.13777383842407462, "learning_rate": 6.354262577857606e-05, - "loss": 3.7352, + "loss": 1.8645, "step": 5300 }, { "epoch": 0.657495197372498, - "grad_norm": 0.2786861537303143, + "grad_norm": 0.14386060040722395, "learning_rate": 6.334126715916318e-05, - "loss": 3.5867, + "loss": 1.8031, "step": 5305 }, { "epoch": 0.6581148912437256, - "grad_norm": 0.2801961113701643, + "grad_norm": 0.13632842622265176, "learning_rate": 6.314008010295897e-05, - "loss": 3.7102, + "loss": 1.7953, "step": 5310 }, { "epoch": 0.6587345851149532, - "grad_norm": 0.2787675183338051, + "grad_norm": 0.14031049401636042, "learning_rate": 6.293906555152072e-05, - "loss": 3.7641, + "loss": 1.8473, "step": 5315 }, { "epoch": 0.6593542789861808, - "grad_norm": 0.2682963267821453, + "grad_norm": 0.13222425685504624, "learning_rate": 6.273822444559839e-05, - "loss": 3.6117, + "loss": 1.8613, "step": 5320 }, { "epoch": 0.6599739728574084, - "grad_norm": 0.27492368327956523, + "grad_norm": 0.13291257291641337, "learning_rate": 6.253755772513024e-05, - "loss": 3.7219, + "loss": 1.7691, "step": 5325 }, { "epoch": 0.6605936667286361, - "grad_norm": 0.2759166364793097, + "grad_norm": 0.13144689432166498, "learning_rate": 6.233706632923832e-05, - "loss": 3.6437, + "loss": 1.8609, "step": 5330 }, { "epoch": 0.6612133605998637, - "grad_norm": 0.2653509999216259, + "grad_norm": 0.13941925756899518, "learning_rate": 6.213675119622425e-05, - "loss": 3.7406, + "loss": 1.8164, "step": 5335 }, { "epoch": 0.6618330544710913, - "grad_norm": 0.28142671319418383, + "grad_norm": 0.13100150642405495, "learning_rate": 6.19366132635647e-05, - "loss": 3.6461, + "loss": 1.8211, "step": 5340 }, { "epoch": 0.6624527483423189, - "grad_norm": 0.2831668256834555, + "grad_norm": 0.13942514680969523, "learning_rate": 6.173665346790704e-05, - "loss": 3.6984, + "loss": 1.9023, "step": 5345 }, { "epoch": 0.6630724422135466, - "grad_norm": 0.2665448241837328, + "grad_norm": 0.13192303146868606, "learning_rate": 6.153687274506501e-05, - "loss": 3.7305, + "loss": 1.8059, "step": 5350 }, { "epoch": 0.6636921360847742, - "grad_norm": 0.29779814887341804, + "grad_norm": 0.1388421354260116, "learning_rate": 6.133727203001415e-05, - "loss": 3.757, + "loss": 1.8008, "step": 5355 }, { "epoch": 0.6643118299560017, - "grad_norm": 0.28358375762398536, + "grad_norm": 0.1333146563999724, "learning_rate": 6.113785225688772e-05, - "loss": 3.693, + "loss": 1.8508, "step": 5360 }, { "epoch": 0.6649315238272293, - "grad_norm": 0.27688829546795823, + "grad_norm": 0.13829025809259454, "learning_rate": 6.093861435897208e-05, - "loss": 3.6906, + "loss": 1.8555, "step": 5365 }, { "epoch": 0.6655512176984569, - "grad_norm": 0.27159258119460017, + "grad_norm": 0.13128060200833697, "learning_rate": 6.073955926870243e-05, - "loss": 3.6133, + "loss": 1.8176, "step": 5370 }, { "epoch": 0.6661709115696846, - "grad_norm": 0.2706578605527812, + "grad_norm": 0.14008029597105764, "learning_rate": 6.0540687917658445e-05, - "loss": 3.6516, + "loss": 1.7871, "step": 5375 }, { "epoch": 0.6667906054409122, - "grad_norm": 0.272240993622507, + "grad_norm": 0.13309428884917443, "learning_rate": 6.034200123655993e-05, - "loss": 3.7875, + "loss": 1.8508, "step": 5380 }, { "epoch": 0.6674102993121398, - "grad_norm": 0.2691414978331236, + "grad_norm": 0.13778281479016122, "learning_rate": 6.01435001552623e-05, - "loss": 3.707, + "loss": 1.8551, "step": 5385 }, { "epoch": 0.6680299931833674, - "grad_norm": 0.2748226868840819, + "grad_norm": 0.13712329360654402, "learning_rate": 5.9945185602752496e-05, - "loss": 3.7852, + "loss": 1.8121, "step": 5390 }, { "epoch": 0.668649687054595, - "grad_norm": 0.2796562783963983, + "grad_norm": 0.14056090297114665, "learning_rate": 5.974705850714444e-05, - "loss": 3.6086, + "loss": 1.859, "step": 5395 }, { "epoch": 0.6692693809258227, - "grad_norm": 0.2741689382602694, + "grad_norm": 0.1347457309130447, "learning_rate": 5.95491197956748e-05, - "loss": 3.6648, + "loss": 1.8328, "step": 5400 }, { "epoch": 0.6698890747970503, - "grad_norm": 0.2647913630087514, + "grad_norm": 0.1298320820999991, "learning_rate": 5.9351370394698604e-05, - "loss": 3.6727, + "loss": 1.834, "step": 5405 }, { "epoch": 0.6705087686682779, - "grad_norm": 0.26773543962456603, + "grad_norm": 0.13333710741549507, "learning_rate": 5.9153811229684794e-05, - "loss": 3.7328, + "loss": 1.8566, "step": 5410 }, { "epoch": 0.6711284625395055, - "grad_norm": 0.27461350736207607, + "grad_norm": 0.14198552870822626, "learning_rate": 5.895644322521212e-05, - "loss": 3.7398, + "loss": 1.8516, "step": 5415 }, { "epoch": 0.671748156410733, - "grad_norm": 0.261829577537663, + "grad_norm": 0.13856885989348303, "learning_rate": 5.875926730496471e-05, - "loss": 3.7109, + "loss": 1.8672, "step": 5420 }, { "epoch": 0.6723678502819607, - "grad_norm": 0.27459196700473204, + "grad_norm": 0.13510293511167396, "learning_rate": 5.856228439172764e-05, - "loss": 3.7156, + "loss": 1.9039, "step": 5425 }, { "epoch": 0.6729875441531883, - "grad_norm": 0.2671129219642882, + "grad_norm": 0.13788198959803624, "learning_rate": 5.836549540738281e-05, - "loss": 3.6805, + "loss": 1.8254, "step": 5430 }, { "epoch": 0.6736072380244159, - "grad_norm": 0.2720255207257116, + "grad_norm": 0.13159584100189814, "learning_rate": 5.816890127290446e-05, - "loss": 3.7313, + "loss": 1.8254, "step": 5435 }, { "epoch": 0.6742269318956435, - "grad_norm": 0.2779000239190421, + "grad_norm": 0.14690626686001587, "learning_rate": 5.7972502908354954e-05, - "loss": 3.618, + "loss": 1.8363, "step": 5440 }, { "epoch": 0.6748466257668712, - "grad_norm": 0.28122200811943143, + "grad_norm": 0.13059037203311238, "learning_rate": 5.777630123288046e-05, - "loss": 3.7664, + "loss": 1.8855, "step": 5445 }, { "epoch": 0.6754663196380988, - "grad_norm": 0.2816358882326997, + "grad_norm": 0.13878103289346766, "learning_rate": 5.758029716470664e-05, - "loss": 3.6461, + "loss": 1.8887, "step": 5450 }, { "epoch": 0.6760860135093264, - "grad_norm": 0.2772970420175764, + "grad_norm": 0.14633351758116236, "learning_rate": 5.738449162113435e-05, - "loss": 3.6594, + "loss": 1.8047, "step": 5455 }, { "epoch": 0.676705707380554, - "grad_norm": 0.27756919708943467, + "grad_norm": 0.13885286212193357, "learning_rate": 5.7188885518535365e-05, - "loss": 3.7172, + "loss": 1.8578, "step": 5460 }, { "epoch": 0.6773254012517816, - "grad_norm": 0.25998511603464325, + "grad_norm": 0.13247881906826084, "learning_rate": 5.699347977234799e-05, - "loss": 3.5633, + "loss": 1.8477, "step": 5465 }, { "epoch": 0.6779450951230093, - "grad_norm": 0.2774423540805963, + "grad_norm": 0.1368360748545682, "learning_rate": 5.679827529707295e-05, - "loss": 3.693, + "loss": 1.7785, "step": 5470 }, { "epoch": 0.6785647889942369, - "grad_norm": 0.27067129052342487, + "grad_norm": 0.13529477063650758, "learning_rate": 5.660327300626902e-05, - "loss": 3.6727, + "loss": 1.8754, "step": 5475 }, { "epoch": 0.6791844828654645, - "grad_norm": 0.2777602302202561, + "grad_norm": 0.14429065728590754, "learning_rate": 5.640847381254869e-05, - "loss": 3.6094, + "loss": 1.827, "step": 5480 }, { "epoch": 0.679804176736692, - "grad_norm": 0.2767422532288637, + "grad_norm": 0.13699866016353732, "learning_rate": 5.621387862757397e-05, - "loss": 3.6922, + "loss": 1.8109, "step": 5485 }, { "epoch": 0.6804238706079196, - "grad_norm": 0.26569803803864545, + "grad_norm": 0.13888564340023535, "learning_rate": 5.6019488362052255e-05, - "loss": 3.6375, + "loss": 1.8152, "step": 5490 }, { "epoch": 0.6810435644791473, - "grad_norm": 0.2685320593943573, + "grad_norm": 0.13583014916724537, "learning_rate": 5.582530392573164e-05, - "loss": 3.6742, + "loss": 1.8211, "step": 5495 }, { "epoch": 0.6816632583503749, - "grad_norm": 0.2696298711513978, + "grad_norm": 0.13614465884313762, "learning_rate": 5.563132622739713e-05, - "loss": 3.693, + "loss": 1.841, "step": 5500 }, { "epoch": 0.6822829522216025, - "grad_norm": 0.27986715683283847, + "grad_norm": 0.13628815897778207, "learning_rate": 5.5437556174866156e-05, - "loss": 3.6586, + "loss": 1.7945, "step": 5505 }, { "epoch": 0.6829026460928301, - "grad_norm": 0.2846125032247335, + "grad_norm": 0.13622985157113032, "learning_rate": 5.5243994674984345e-05, - "loss": 3.6461, + "loss": 1.8762, "step": 5510 }, { "epoch": 0.6835223399640578, - "grad_norm": 0.2679407955321904, + "grad_norm": 0.13892313184585137, "learning_rate": 5.505064263362136e-05, - "loss": 3.6672, + "loss": 1.8637, "step": 5515 }, { "epoch": 0.6841420338352854, - "grad_norm": 0.27825325641781, + "grad_norm": 0.133688611710239, "learning_rate": 5.485750095566644e-05, - "loss": 3.693, + "loss": 1.9133, "step": 5520 }, { "epoch": 0.684761727706513, - "grad_norm": 0.26863788090470064, + "grad_norm": 0.14133553441876628, "learning_rate": 5.46645705450245e-05, - "loss": 3.6656, + "loss": 1.8555, "step": 5525 }, { "epoch": 0.6853814215777406, - "grad_norm": 0.2635749210177757, + "grad_norm": 0.13349705900448725, "learning_rate": 5.447185230461156e-05, - "loss": 3.807, + "loss": 1.7844, "step": 5530 }, { "epoch": 0.6860011154489682, - "grad_norm": 0.2712057403712623, + "grad_norm": 0.1328239605606481, "learning_rate": 5.427934713635088e-05, - "loss": 3.7086, + "loss": 1.8617, "step": 5535 }, { "epoch": 0.6866208093201959, - "grad_norm": 0.27093839270676695, + "grad_norm": 0.1381643142277639, "learning_rate": 5.4087055941168384e-05, - "loss": 3.7414, + "loss": 1.825, "step": 5540 }, { "epoch": 0.6872405031914235, - "grad_norm": 0.2713959823426367, + "grad_norm": 0.13528845623515193, "learning_rate": 5.389497961898866e-05, - "loss": 3.6578, + "loss": 1.8598, "step": 5545 }, { "epoch": 0.687860197062651, - "grad_norm": 0.28889436188917833, + "grad_norm": 0.12988354878682856, "learning_rate": 5.370311906873062e-05, - "loss": 3.5922, + "loss": 1.8645, "step": 5550 }, { "epoch": 0.6884798909338786, - "grad_norm": 0.27027978550422393, + "grad_norm": 0.1278301642155368, "learning_rate": 5.351147518830345e-05, - "loss": 3.7133, + "loss": 1.8238, "step": 5555 }, { "epoch": 0.6890995848051062, - "grad_norm": 0.26119809455788323, + "grad_norm": 0.1323076496864634, "learning_rate": 5.3320048874602266e-05, - "loss": 3.7734, + "loss": 1.8457, "step": 5560 }, { "epoch": 0.6897192786763339, - "grad_norm": 0.2535064884203829, + "grad_norm": 0.13812568199928557, "learning_rate": 5.3128841023504e-05, - "loss": 3.7375, + "loss": 1.8723, "step": 5565 }, { "epoch": 0.6903389725475615, - "grad_norm": 0.2793430975177648, + "grad_norm": 0.139934405228208, "learning_rate": 5.293785252986321e-05, - "loss": 3.6523, + "loss": 1.8379, "step": 5570 }, { "epoch": 0.6909586664187891, - "grad_norm": 0.27778118811253677, + "grad_norm": 0.13715770643194863, "learning_rate": 5.274708428750765e-05, - "loss": 3.5797, + "loss": 1.7949, "step": 5575 }, { "epoch": 0.6915783602900167, - "grad_norm": 0.270323450033622, + "grad_norm": 0.1298711106881199, "learning_rate": 5.255653718923463e-05, - "loss": 3.6961, + "loss": 1.8375, "step": 5580 }, { "epoch": 0.6921980541612444, - "grad_norm": 0.2653637163384434, + "grad_norm": 0.13859980183407575, "learning_rate": 5.236621212680628e-05, - "loss": 3.7023, + "loss": 1.8297, "step": 5585 }, { "epoch": 0.692817748032472, - "grad_norm": 0.2753589669008408, + "grad_norm": 0.1318933840751103, "learning_rate": 5.217610999094563e-05, - "loss": 3.7102, + "loss": 1.841, "step": 5590 }, { "epoch": 0.6934374419036996, - "grad_norm": 0.29022430253276155, + "grad_norm": 0.14800086449754893, "learning_rate": 5.1986231671332454e-05, - "loss": 3.6742, + "loss": 1.843, "step": 5595 }, { "epoch": 0.6940571357749272, - "grad_norm": 0.27688395623809187, + "grad_norm": 0.1323008676469033, "learning_rate": 5.179657805659908e-05, - "loss": 3.5992, + "loss": 1.8613, "step": 5600 }, { "epoch": 0.6946768296461548, - "grad_norm": 0.26855552012781275, + "grad_norm": 0.13870752821590712, "learning_rate": 5.160715003432608e-05, - "loss": 3.718, + "loss": 1.8664, "step": 5605 }, { "epoch": 0.6952965235173824, - "grad_norm": 0.2605313018963682, + "grad_norm": 0.13883802765782519, "learning_rate": 5.1417948491038416e-05, - "loss": 3.7578, + "loss": 1.8098, "step": 5610 }, { "epoch": 0.69591621738861, - "grad_norm": 0.2664551240286889, + "grad_norm": 0.14505650835163597, "learning_rate": 5.122897431220104e-05, - "loss": 3.7234, + "loss": 1.848, "step": 5615 }, { "epoch": 0.6965359112598376, - "grad_norm": 0.27718395586925093, + "grad_norm": 0.142610612049516, "learning_rate": 5.104022838221487e-05, - "loss": 3.6727, + "loss": 1.7668, "step": 5620 }, { "epoch": 0.6971556051310652, - "grad_norm": 0.2779483743020303, + "grad_norm": 0.13663598600996665, "learning_rate": 5.085171158441261e-05, - "loss": 3.6977, + "loss": 1.8543, "step": 5625 }, { "epoch": 0.6977752990022928, - "grad_norm": 0.2851749067821478, + "grad_norm": 0.14135398916556138, "learning_rate": 5.0663424801054595e-05, - "loss": 3.5562, + "loss": 1.8148, "step": 5630 }, { "epoch": 0.6983949928735205, - "grad_norm": 0.2735140799612901, + "grad_norm": 0.1382568356418709, "learning_rate": 5.047536891332473e-05, - "loss": 3.7687, + "loss": 1.8004, "step": 5635 }, { "epoch": 0.6990146867447481, - "grad_norm": 0.27891630984844623, + "grad_norm": 0.13404657874607154, "learning_rate": 5.0287544801326293e-05, - "loss": 3.6484, + "loss": 1.8324, "step": 5640 }, { "epoch": 0.6996343806159757, - "grad_norm": 0.2798336852376535, + "grad_norm": 0.13905564445774873, "learning_rate": 5.0099953344077885e-05, - "loss": 3.6914, + "loss": 1.8457, "step": 5645 }, { "epoch": 0.7002540744872033, - "grad_norm": 0.28114121108671125, + "grad_norm": 0.1369113594997885, "learning_rate": 4.991259541950924e-05, - "loss": 3.6063, + "loss": 1.8586, "step": 5650 }, { "epoch": 0.700873768358431, - "grad_norm": 0.27673349011973397, + "grad_norm": 0.13278800041844127, "learning_rate": 4.972547190445723e-05, - "loss": 3.7367, + "loss": 1.8719, "step": 5655 }, { "epoch": 0.7014934622296586, - "grad_norm": 0.2662079818812042, + "grad_norm": 0.14229177852733657, "learning_rate": 4.953858367466155e-05, - "loss": 3.6422, + "loss": 1.8949, "step": 5660 }, { "epoch": 0.7021131561008862, - "grad_norm": 0.27756807998773575, + "grad_norm": 0.14078175269659313, "learning_rate": 4.9351931604760907e-05, - "loss": 3.7484, + "loss": 1.8633, "step": 5665 }, { "epoch": 0.7027328499721138, - "grad_norm": 0.27804509429596413, + "grad_norm": 0.1317958863467384, "learning_rate": 4.9165516568288674e-05, - "loss": 3.6445, + "loss": 1.8641, "step": 5670 }, { "epoch": 0.7033525438433413, - "grad_norm": 0.275651335130277, + "grad_norm": 0.13544413510947215, "learning_rate": 4.897933943766897e-05, - "loss": 3.6531, + "loss": 1.816, "step": 5675 }, { "epoch": 0.703972237714569, - "grad_norm": 0.27122700127379756, + "grad_norm": 0.1298237481765858, "learning_rate": 4.879340108421248e-05, - "loss": 3.7055, + "loss": 1.8805, "step": 5680 }, { "epoch": 0.7045919315857966, - "grad_norm": 0.26980941705603506, + "grad_norm": 0.13472711627814213, "learning_rate": 4.8607702378112415e-05, - "loss": 3.7414, + "loss": 1.841, "step": 5685 }, { "epoch": 0.7052116254570242, - "grad_norm": 0.26003012135298476, + "grad_norm": 0.13535347127585984, "learning_rate": 4.842224418844045e-05, - "loss": 3.5023, + "loss": 1.8094, "step": 5690 }, { "epoch": 0.7058313193282518, - "grad_norm": 0.2647503412298433, + "grad_norm": 0.13436886422476685, "learning_rate": 4.823702738314262e-05, - "loss": 3.7102, + "loss": 1.823, "step": 5695 }, { "epoch": 0.7064510131994794, - "grad_norm": 0.2712871372151752, + "grad_norm": 0.14068253039093517, "learning_rate": 4.8052052829035275e-05, - "loss": 3.8047, + "loss": 1.8824, "step": 5700 }, { "epoch": 0.7070707070707071, - "grad_norm": 0.27859849030011535, + "grad_norm": 0.1346479936388671, "learning_rate": 4.7867321391801065e-05, - "loss": 3.682, + "loss": 1.8781, "step": 5705 }, { "epoch": 0.7076904009419347, - "grad_norm": 0.2820394324659631, + "grad_norm": 0.13435815599940173, "learning_rate": 4.768283393598484e-05, - "loss": 3.5344, + "loss": 1.8207, "step": 5710 }, { "epoch": 0.7083100948131623, - "grad_norm": 0.2591429454872442, + "grad_norm": 0.14223328089430068, "learning_rate": 4.749859132498953e-05, - "loss": 3.6195, + "loss": 1.8543, "step": 5715 }, { "epoch": 0.7089297886843899, - "grad_norm": 0.3010598054872879, + "grad_norm": 0.13941692924625854, "learning_rate": 4.73145944210723e-05, - "loss": 3.6203, + "loss": 1.8285, "step": 5720 }, { "epoch": 0.7095494825556176, - "grad_norm": 0.2722081655502301, + "grad_norm": 0.13441710577473923, "learning_rate": 4.713084408534035e-05, - "loss": 3.6688, + "loss": 1.7984, "step": 5725 }, { "epoch": 0.7101691764268452, - "grad_norm": 0.27122867523097466, + "grad_norm": 0.14059083732086994, "learning_rate": 4.6947341177746926e-05, - "loss": 3.7344, + "loss": 1.7797, "step": 5730 }, { "epoch": 0.7107888702980728, - "grad_norm": 0.27824464760436307, + "grad_norm": 0.13080419041067848, "learning_rate": 4.6764086557087406e-05, - "loss": 3.7719, + "loss": 1.8066, "step": 5735 }, { "epoch": 0.7114085641693003, - "grad_norm": 0.27414327511872144, + "grad_norm": 0.14130261841489294, "learning_rate": 4.65810810809951e-05, - "loss": 3.6859, + "loss": 1.8453, "step": 5740 }, { "epoch": 0.7120282580405279, - "grad_norm": 0.27915537657137607, + "grad_norm": 0.14387364542120532, "learning_rate": 4.6398325605937265e-05, - "loss": 3.6406, + "loss": 1.8383, "step": 5745 }, { "epoch": 0.7126479519117556, - "grad_norm": 0.2781760005338474, + "grad_norm": 0.138013788809857, "learning_rate": 4.621582098721124e-05, - "loss": 3.7109, + "loss": 1.8703, "step": 5750 }, { "epoch": 0.7132676457829832, - "grad_norm": 0.279762376148418, + "grad_norm": 0.13149717266725255, "learning_rate": 4.6033568078940345e-05, - "loss": 3.6594, + "loss": 1.8422, "step": 5755 }, { "epoch": 0.7138873396542108, - "grad_norm": 0.2635731530602869, + "grad_norm": 0.14300678827280267, "learning_rate": 4.585156773406986e-05, - "loss": 3.5781, + "loss": 1.8402, "step": 5760 }, { "epoch": 0.7145070335254384, - "grad_norm": 0.2807646872525827, + "grad_norm": 0.1369909491023122, "learning_rate": 4.5669820804363116e-05, - "loss": 3.7719, + "loss": 1.827, "step": 5765 }, { "epoch": 0.715126727396666, - "grad_norm": 0.28003214694151435, + "grad_norm": 0.1381026280394612, "learning_rate": 4.5488328140397364e-05, - "loss": 3.6719, + "loss": 1.8656, "step": 5770 }, { "epoch": 0.7157464212678937, - "grad_norm": 0.2679072939914116, + "grad_norm": 0.13344045108890723, "learning_rate": 4.530709059155994e-05, - "loss": 3.7164, + "loss": 1.8281, "step": 5775 }, { "epoch": 0.7163661151391213, - "grad_norm": 0.26558897424509803, + "grad_norm": 0.14375868703270098, "learning_rate": 4.512610900604434e-05, - "loss": 3.6875, + "loss": 1.8422, "step": 5780 }, { "epoch": 0.7169858090103489, - "grad_norm": 0.2738273465942126, + "grad_norm": 0.1373988139845734, "learning_rate": 4.4945384230846e-05, - "loss": 3.7906, + "loss": 1.8176, "step": 5785 }, { "epoch": 0.7176055028815765, - "grad_norm": 0.27969995555479055, + "grad_norm": 0.13871113648194647, "learning_rate": 4.476491711175854e-05, - "loss": 3.6391, + "loss": 1.8008, "step": 5790 }, { "epoch": 0.7182251967528042, - "grad_norm": 0.25880179682133825, + "grad_norm": 0.1358378570022186, "learning_rate": 4.45847084933698e-05, - "loss": 3.7211, + "loss": 1.8734, "step": 5795 }, { "epoch": 0.7188448906240317, - "grad_norm": 0.25811067963296236, + "grad_norm": 0.1331598981484069, "learning_rate": 4.440475921905768e-05, - "loss": 3.6898, + "loss": 1.7898, "step": 5800 }, { "epoch": 0.7194645844952593, - "grad_norm": 0.27238020603219465, + "grad_norm": 0.1332569979541051, "learning_rate": 4.422507013098651e-05, - "loss": 3.5922, + "loss": 1.8148, "step": 5805 }, { "epoch": 0.7200842783664869, - "grad_norm": 0.2763272460657019, + "grad_norm": 0.1401241076701764, "learning_rate": 4.404564207010288e-05, - "loss": 3.6836, + "loss": 1.8598, "step": 5810 }, { "epoch": 0.7207039722377145, - "grad_norm": 0.27148027454186435, + "grad_norm": 0.13606072750770695, "learning_rate": 4.3866475876131764e-05, - "loss": 3.7961, + "loss": 1.7977, "step": 5815 }, { "epoch": 0.7213236661089422, - "grad_norm": 0.25605901536244036, + "grad_norm": 0.12799996771306443, "learning_rate": 4.3687572387572605e-05, - "loss": 3.7375, + "loss": 1.8617, "step": 5820 }, { "epoch": 0.7219433599801698, - "grad_norm": 0.26265291674637486, + "grad_norm": 0.1392372680256112, "learning_rate": 4.350893244169541e-05, - "loss": 3.7172, + "loss": 1.8543, "step": 5825 }, { "epoch": 0.7225630538513974, - "grad_norm": 0.28286131134250764, + "grad_norm": 0.13632894278294244, "learning_rate": 4.333055687453673e-05, - "loss": 3.557, + "loss": 1.8211, "step": 5830 }, { "epoch": 0.723182747722625, - "grad_norm": 0.27841967359618486, + "grad_norm": 0.14283083422916973, "learning_rate": 4.315244652089592e-05, - "loss": 3.6977, + "loss": 1.7918, "step": 5835 }, { "epoch": 0.7238024415938527, - "grad_norm": 0.2608987536285254, + "grad_norm": 0.14036541324610963, "learning_rate": 4.297460221433104e-05, - "loss": 3.6414, + "loss": 1.8301, "step": 5840 }, { "epoch": 0.7244221354650803, - "grad_norm": 0.2825656150573623, + "grad_norm": 0.13411932671691454, "learning_rate": 4.2797024787155114e-05, - "loss": 3.6492, + "loss": 1.8434, "step": 5845 }, { "epoch": 0.7250418293363079, - "grad_norm": 0.2876457246577729, + "grad_norm": 0.13644407967970396, "learning_rate": 4.2619715070432174e-05, - "loss": 3.7148, + "loss": 1.8492, "step": 5850 }, { "epoch": 0.7256615232075355, - "grad_norm": 0.27880007263838713, + "grad_norm": 0.13314774631995285, "learning_rate": 4.244267389397326e-05, - "loss": 3.6133, + "loss": 1.825, "step": 5855 }, { "epoch": 0.7262812170787631, - "grad_norm": 0.2720496922839693, + "grad_norm": 0.1300981208309135, "learning_rate": 4.226590208633275e-05, - "loss": 3.7305, + "loss": 1.8262, "step": 5860 }, { "epoch": 0.7269009109499907, - "grad_norm": 0.272676520442399, + "grad_norm": 0.14053955987130684, "learning_rate": 4.208940047480434e-05, - "loss": 3.718, + "loss": 1.8016, "step": 5865 }, { "epoch": 0.7275206048212183, - "grad_norm": 0.26497822927460635, + "grad_norm": 0.13840346422495164, "learning_rate": 4.191316988541721e-05, - "loss": 3.632, + "loss": 1.8449, "step": 5870 }, { "epoch": 0.7281402986924459, - "grad_norm": 0.27723156490255046, + "grad_norm": 0.14187233849517403, "learning_rate": 4.173721114293214e-05, - "loss": 3.8031, + "loss": 1.8168, "step": 5875 }, { "epoch": 0.7287599925636735, - "grad_norm": 0.26094788474282904, + "grad_norm": 0.13386643476403284, "learning_rate": 4.156152507083767e-05, - "loss": 3.6758, + "loss": 1.8105, "step": 5880 }, { "epoch": 0.7293796864349011, - "grad_norm": 0.28172055855879213, + "grad_norm": 0.1345475089808684, "learning_rate": 4.1386112491346255e-05, - "loss": 3.743, + "loss": 1.8125, "step": 5885 }, { "epoch": 0.7299993803061288, - "grad_norm": 0.2732741509695242, + "grad_norm": 0.13643936735112303, "learning_rate": 4.121097422539036e-05, - "loss": 3.6992, + "loss": 1.8852, "step": 5890 }, { "epoch": 0.7306190741773564, - "grad_norm": 0.28094804569295945, + "grad_norm": 0.12903168333708062, "learning_rate": 4.1036111092618725e-05, - "loss": 3.5977, + "loss": 1.8113, "step": 5895 }, { "epoch": 0.731238768048584, - "grad_norm": 0.2677662106993077, + "grad_norm": 0.14243171873479077, "learning_rate": 4.0861523911392406e-05, - "loss": 3.6539, + "loss": 1.7836, "step": 5900 }, { "epoch": 0.7318584619198116, - "grad_norm": 0.2697207576664988, + "grad_norm": 0.14039456232514363, "learning_rate": 4.068721349878107e-05, - "loss": 3.5273, + "loss": 1.8797, "step": 5905 }, { "epoch": 0.7324781557910393, - "grad_norm": 0.2707629838608235, + "grad_norm": 0.13211859990032615, "learning_rate": 4.051318067055898e-05, - "loss": 3.632, + "loss": 1.8375, "step": 5910 }, { "epoch": 0.7330978496622669, - "grad_norm": 0.2674857148566874, + "grad_norm": 0.14206687574027346, "learning_rate": 4.033942624120143e-05, - "loss": 3.7164, + "loss": 1.8238, "step": 5915 }, { "epoch": 0.7337175435334945, - "grad_norm": 0.2655492853840789, + "grad_norm": 0.13499243739697084, "learning_rate": 4.0165951023880746e-05, - "loss": 3.782, + "loss": 1.8441, "step": 5920 }, { "epoch": 0.7343372374047221, - "grad_norm": 0.27416095873987867, + "grad_norm": 0.13559241628088486, "learning_rate": 3.999275583046256e-05, - "loss": 3.7094, + "loss": 1.7824, "step": 5925 }, { "epoch": 0.7349569312759496, - "grad_norm": 0.267483841770565, + "grad_norm": 0.14192010466529414, "learning_rate": 3.981984147150196e-05, - "loss": 3.7852, + "loss": 1.8527, "step": 5930 }, { "epoch": 0.7355766251471773, - "grad_norm": 0.2745830184563242, + "grad_norm": 0.1318846856274811, "learning_rate": 3.964720875623976e-05, - "loss": 3.6359, + "loss": 1.8039, "step": 5935 }, { "epoch": 0.7361963190184049, - "grad_norm": 0.2757045668181282, + "grad_norm": 0.13396322343104464, "learning_rate": 3.9474858492598653e-05, - "loss": 3.7078, + "loss": 1.8039, "step": 5940 }, { "epoch": 0.7368160128896325, - "grad_norm": 0.27129935273188194, + "grad_norm": 0.13529179713111514, "learning_rate": 3.930279148717948e-05, - "loss": 3.7453, + "loss": 1.7961, "step": 5945 }, { "epoch": 0.7374357067608601, - "grad_norm": 0.2652954671649276, + "grad_norm": 0.13808062326374956, "learning_rate": 3.913100854525742e-05, - "loss": 3.6898, + "loss": 1.857, "step": 5950 }, { "epoch": 0.7380554006320877, - "grad_norm": 0.2660265511643317, + "grad_norm": 0.14259484377808126, "learning_rate": 3.895951047077821e-05, - "loss": 3.5609, + "loss": 1.8074, "step": 5955 }, { "epoch": 0.7386750945033154, - "grad_norm": 0.2850360173732016, + "grad_norm": 0.14048933061505342, "learning_rate": 3.8788298066354464e-05, - "loss": 3.6125, + "loss": 1.8113, "step": 5960 }, { "epoch": 0.739294788374543, - "grad_norm": 0.2747779595842181, + "grad_norm": 0.1365199990868293, "learning_rate": 3.8617372133261766e-05, - "loss": 3.7523, + "loss": 1.8387, "step": 5965 }, { "epoch": 0.7399144822457706, - "grad_norm": 0.27791872773574944, + "grad_norm": 0.1304295788973774, "learning_rate": 3.844673347143507e-05, - "loss": 3.6641, + "loss": 1.8598, "step": 5970 }, { "epoch": 0.7405341761169982, - "grad_norm": 0.26936680667842383, + "grad_norm": 0.14192605565965738, "learning_rate": 3.827638287946489e-05, - "loss": 3.5922, + "loss": 1.8555, "step": 5975 }, { "epoch": 0.7411538699882259, - "grad_norm": 0.2676597010665189, + "grad_norm": 0.13697173512407929, "learning_rate": 3.8106321154593605e-05, - "loss": 3.6336, + "loss": 1.8055, "step": 5980 }, { "epoch": 0.7417735638594535, - "grad_norm": 0.27102690603506263, + "grad_norm": 0.13183561867638896, "learning_rate": 3.793654909271169e-05, - "loss": 3.6437, + "loss": 1.7809, "step": 5985 }, { "epoch": 0.742393257730681, - "grad_norm": 0.267102384870593, + "grad_norm": 0.14024605247591226, "learning_rate": 3.776706748835388e-05, - "loss": 3.6969, + "loss": 1.8723, "step": 5990 }, { "epoch": 0.7430129516019086, - "grad_norm": 0.2764451949708146, + "grad_norm": 0.14086892950568974, "learning_rate": 3.759787713469569e-05, - "loss": 3.7094, + "loss": 1.8406, "step": 5995 }, { "epoch": 0.7436326454731362, - "grad_norm": 0.27181670294451116, + "grad_norm": 0.14334020779281637, "learning_rate": 3.7428978823549545e-05, - "loss": 3.7445, + "loss": 1.8184, "step": 6000 }, { "epoch": 0.7442523393443639, - "grad_norm": 0.2700157262463554, + "grad_norm": 0.13571907865119778, "learning_rate": 3.726037334536109e-05, - "loss": 3.6367, + "loss": 1.8562, "step": 6005 }, { "epoch": 0.7448720332155915, - "grad_norm": 0.2649639847023256, + "grad_norm": 0.14184536059545447, "learning_rate": 3.709206148920553e-05, - "loss": 3.6531, + "loss": 1.9035, "step": 6010 }, { "epoch": 0.7454917270868191, - "grad_norm": 0.2661623158171092, + "grad_norm": 0.13624732293876154, "learning_rate": 3.692404404278395e-05, - "loss": 3.7008, + "loss": 1.8234, "step": 6015 }, { "epoch": 0.7461114209580467, - "grad_norm": 0.2626176135511345, + "grad_norm": 0.13276441124031632, "learning_rate": 3.675632179241946e-05, - "loss": 3.6617, + "loss": 1.8305, "step": 6020 }, { "epoch": 0.7467311148292743, - "grad_norm": 0.2733085135357621, + "grad_norm": 0.13930808879264342, "learning_rate": 3.658889552305376e-05, - "loss": 3.6055, + "loss": 1.7789, "step": 6025 }, { "epoch": 0.747350808700502, - "grad_norm": 0.2699184735568989, + "grad_norm": 0.13764635628250024, "learning_rate": 3.642176601824341e-05, - "loss": 3.5484, + "loss": 1.8172, "step": 6030 }, { "epoch": 0.7479705025717296, - "grad_norm": 0.2753762018171419, + "grad_norm": 0.13461489368112944, "learning_rate": 3.6254934060156e-05, - "loss": 3.7484, + "loss": 1.8633, "step": 6035 }, { "epoch": 0.7485901964429572, - "grad_norm": 0.26788292526469076, + "grad_norm": 0.13966985047575237, "learning_rate": 3.608840042956666e-05, - "loss": 3.6727, + "loss": 1.8715, "step": 6040 }, { "epoch": 0.7492098903141848, - "grad_norm": 0.2797673176500415, + "grad_norm": 0.13990874030023231, "learning_rate": 3.592216590585427e-05, - "loss": 3.6391, + "loss": 1.8723, "step": 6045 }, { "epoch": 0.7498295841854125, - "grad_norm": 0.26026046751945825, + "grad_norm": 0.14361890024985108, "learning_rate": 3.5756231266997965e-05, - "loss": 3.5984, + "loss": 1.8285, "step": 6050 }, { "epoch": 0.75044927805664, - "grad_norm": 0.26334684058291796, + "grad_norm": 0.14084856041667215, "learning_rate": 3.559059728957338e-05, - "loss": 3.6688, + "loss": 1.8383, "step": 6055 }, { "epoch": 0.7510689719278676, - "grad_norm": 0.2816745822735808, + "grad_norm": 0.13120761896400793, "learning_rate": 3.5425264748749074e-05, - "loss": 3.7313, + "loss": 1.8734, "step": 6060 }, { "epoch": 0.7516886657990952, - "grad_norm": 0.28352625683115584, + "grad_norm": 0.13658964579723912, "learning_rate": 3.5260234418282865e-05, - "loss": 3.6672, + "loss": 1.8109, "step": 6065 }, { "epoch": 0.7523083596703228, - "grad_norm": 0.27308255212562715, + "grad_norm": 0.1344443341684798, "learning_rate": 3.509550707051823e-05, - "loss": 3.5969, + "loss": 1.8398, "step": 6070 }, { "epoch": 0.7529280535415505, - "grad_norm": 0.28045714684906514, + "grad_norm": 0.14012917508847528, "learning_rate": 3.493108347638067e-05, - "loss": 3.7687, + "loss": 1.8484, "step": 6075 }, { "epoch": 0.7535477474127781, - "grad_norm": 0.28038747649942236, + "grad_norm": 0.13379672444004473, "learning_rate": 3.476696440537413e-05, - "loss": 3.6875, + "loss": 1.7824, "step": 6080 }, { "epoch": 0.7541674412840057, - "grad_norm": 0.2757804471211753, + "grad_norm": 0.1434039130004496, "learning_rate": 3.460315062557737e-05, - "loss": 3.6906, + "loss": 1.7945, "step": 6085 }, { "epoch": 0.7547871351552333, - "grad_norm": 0.27039100222498835, + "grad_norm": 0.13398319194874977, "learning_rate": 3.443964290364041e-05, - "loss": 3.6086, + "loss": 1.8031, "step": 6090 }, { "epoch": 0.755406829026461, - "grad_norm": 0.27574840398743766, + "grad_norm": 0.13976767423578978, "learning_rate": 3.4276442004780916e-05, - "loss": 3.6977, + "loss": 1.9234, "step": 6095 }, { "epoch": 0.7560265228976886, - "grad_norm": 0.26908095664325493, + "grad_norm": 0.14215579154672933, "learning_rate": 3.411354869278056e-05, - "loss": 3.7055, + "loss": 1.8199, "step": 6100 }, { "epoch": 0.7566462167689162, - "grad_norm": 0.2871430737464743, + "grad_norm": 0.14305847528188373, "learning_rate": 3.3950963729981565e-05, - "loss": 3.7453, + "loss": 1.8391, "step": 6105 }, { "epoch": 0.7572659106401438, - "grad_norm": 0.27111861031633594, + "grad_norm": 0.13159704286365811, "learning_rate": 3.378868787728308e-05, - "loss": 3.5867, + "loss": 1.8926, "step": 6110 }, { "epoch": 0.7578856045113714, - "grad_norm": 0.276205387609455, + "grad_norm": 0.13826500141162315, "learning_rate": 3.362672189413756e-05, - "loss": 3.7328, + "loss": 1.832, "step": 6115 }, { "epoch": 0.758505298382599, - "grad_norm": 0.26012347649205353, + "grad_norm": 0.13932153252369295, "learning_rate": 3.346506653854734e-05, - "loss": 3.5984, + "loss": 1.8195, "step": 6120 }, { "epoch": 0.7591249922538266, - "grad_norm": 0.29007047304566813, + "grad_norm": 0.1408428855634542, "learning_rate": 3.3303722567060956e-05, - "loss": 3.6797, + "loss": 1.7531, "step": 6125 }, { "epoch": 0.7597446861250542, - "grad_norm": 0.2690965755326681, + "grad_norm": 0.1310748687649542, "learning_rate": 3.31426907347697e-05, - "loss": 3.7141, + "loss": 1.8559, "step": 6130 }, { "epoch": 0.7603643799962818, - "grad_norm": 0.2677627631712178, + "grad_norm": 0.13962627369469138, "learning_rate": 3.2981971795304026e-05, - "loss": 3.6375, + "loss": 1.8816, "step": 6135 }, { "epoch": 0.7609840738675094, - "grad_norm": 0.2661547296225876, + "grad_norm": 0.1341856783680393, "learning_rate": 3.282156650083006e-05, - "loss": 3.6875, + "loss": 1.8637, "step": 6140 }, { "epoch": 0.7616037677387371, - "grad_norm": 0.27694800622542926, + "grad_norm": 0.13593015684887874, "learning_rate": 3.266147560204608e-05, - "loss": 3.6703, + "loss": 1.8684, "step": 6145 }, { "epoch": 0.7622234616099647, - "grad_norm": 0.2744597463214922, + "grad_norm": 0.1416671199439836, "learning_rate": 3.250169984817897e-05, - "loss": 3.7047, + "loss": 1.8223, "step": 6150 }, { "epoch": 0.7628431554811923, - "grad_norm": 0.2760498499832667, + "grad_norm": 0.13085965612004039, "learning_rate": 3.23422399869807e-05, - "loss": 3.7289, + "loss": 1.8141, "step": 6155 }, { "epoch": 0.7634628493524199, - "grad_norm": 0.3144507061509538, + "grad_norm": 0.13830254030035186, "learning_rate": 3.2183096764724915e-05, - "loss": 3.6688, + "loss": 1.8324, "step": 6160 }, { "epoch": 0.7640825432236475, - "grad_norm": 0.2719439357170228, + "grad_norm": 0.13986388265543723, "learning_rate": 3.2024270926203384e-05, - "loss": 3.6875, + "loss": 1.7957, "step": 6165 }, { "epoch": 0.7647022370948752, - "grad_norm": 0.2782653971635696, + "grad_norm": 0.14393270333700767, "learning_rate": 3.1865763214722474e-05, - "loss": 3.8164, + "loss": 1.848, "step": 6170 }, { "epoch": 0.7653219309661028, - "grad_norm": 0.26283063441760457, + "grad_norm": 0.12961867709571917, "learning_rate": 3.1707574372099754e-05, - "loss": 3.7227, + "loss": 1.8449, "step": 6175 }, { "epoch": 0.7659416248373303, - "grad_norm": 0.2640248567397006, + "grad_norm": 0.13331591764693343, "learning_rate": 3.154970513866047e-05, - "loss": 3.7109, + "loss": 1.8234, "step": 6180 }, { "epoch": 0.7665613187085579, - "grad_norm": 0.2821077631231624, + "grad_norm": 0.13854732063824962, "learning_rate": 3.1392156253234086e-05, - "loss": 3.7023, + "loss": 1.8215, "step": 6185 }, { "epoch": 0.7671810125797855, - "grad_norm": 0.2647061292808031, + "grad_norm": 0.13593317269346725, "learning_rate": 3.123492845315086e-05, - "loss": 3.6844, + "loss": 1.8738, "step": 6190 }, { "epoch": 0.7678007064510132, - "grad_norm": 0.25522158800287487, + "grad_norm": 0.1366378836050727, "learning_rate": 3.1078022474238334e-05, - "loss": 3.6484, + "loss": 1.8262, "step": 6195 }, { "epoch": 0.7684204003222408, - "grad_norm": 0.27734156829652845, + "grad_norm": 0.13829425232888887, "learning_rate": 3.092143905081794e-05, - "loss": 3.7102, + "loss": 1.8305, "step": 6200 }, { "epoch": 0.7690400941934684, - "grad_norm": 0.26325851682400353, + "grad_norm": 0.1353286080001516, "learning_rate": 3.07651789157016e-05, - "loss": 3.7055, + "loss": 1.8023, "step": 6205 }, { "epoch": 0.769659788064696, - "grad_norm": 0.27806975585433374, + "grad_norm": 0.13398727438252167, "learning_rate": 3.060924280018811e-05, - "loss": 3.6555, + "loss": 1.7555, "step": 6210 }, { "epoch": 0.7702794819359237, - "grad_norm": 0.2752537883343241, + "grad_norm": 0.14202328524094357, "learning_rate": 3.0453631434059958e-05, - "loss": 3.6266, + "loss": 1.7934, "step": 6215 }, { "epoch": 0.7708991758071513, - "grad_norm": 0.2718545931189434, + "grad_norm": 0.1333220975895458, "learning_rate": 3.0298345545579787e-05, - "loss": 3.6258, + "loss": 1.832, "step": 6220 }, { "epoch": 0.7715188696783789, - "grad_norm": 0.2650532143764226, + "grad_norm": 0.1320352616233952, "learning_rate": 3.0143385861486974e-05, - "loss": 3.5984, + "loss": 1.8793, "step": 6225 }, { "epoch": 0.7721385635496065, - "grad_norm": 0.2748769017897296, + "grad_norm": 0.1492565680814568, "learning_rate": 2.9988753106994306e-05, - "loss": 3.7234, + "loss": 1.8461, "step": 6230 }, { "epoch": 0.7727582574208341, - "grad_norm": 0.28224340503720746, + "grad_norm": 0.1341082625454802, "learning_rate": 2.983444800578452e-05, - "loss": 3.7195, + "loss": 1.8508, "step": 6235 }, { "epoch": 0.7733779512920618, - "grad_norm": 0.27479324651216624, + "grad_norm": 0.1357404576853704, "learning_rate": 2.9680471280006848e-05, - "loss": 3.6797, + "loss": 1.8469, "step": 6240 }, { "epoch": 0.7739976451632893, - "grad_norm": 0.2767199114775174, + "grad_norm": 0.13561038043919632, "learning_rate": 2.9526823650273837e-05, - "loss": 3.6172, + "loss": 1.8293, "step": 6245 }, { "epoch": 0.7746173390345169, - "grad_norm": 0.26562638908851804, + "grad_norm": 0.14624823865011252, "learning_rate": 2.93735058356578e-05, - "loss": 3.6367, + "loss": 1.7902, "step": 6250 }, { "epoch": 0.7752370329057445, - "grad_norm": 0.26872418612230703, + "grad_norm": 0.1361302394715922, "learning_rate": 2.9220518553687526e-05, - "loss": 3.6898, + "loss": 1.8559, "step": 6255 }, { "epoch": 0.7758567267769721, - "grad_norm": 0.27873653338300347, + "grad_norm": 0.14017086141892965, "learning_rate": 2.9067862520344956e-05, - "loss": 3.7484, + "loss": 1.8379, "step": 6260 }, { "epoch": 0.7764764206481998, - "grad_norm": 0.263032563358348, + "grad_norm": 0.1338549821915282, "learning_rate": 2.891553845006165e-05, - "loss": 3.6781, + "loss": 1.8273, "step": 6265 }, { "epoch": 0.7770961145194274, - "grad_norm": 0.2777153283779717, + "grad_norm": 0.14098907928260412, "learning_rate": 2.87635470557157e-05, - "loss": 3.7594, + "loss": 1.8465, "step": 6270 }, { "epoch": 0.777715808390655, - "grad_norm": 0.2702436881365533, + "grad_norm": 0.14080305362194845, "learning_rate": 2.861188904862827e-05, - "loss": 3.7055, + "loss": 1.8457, "step": 6275 }, { "epoch": 0.7783355022618826, - "grad_norm": 0.26759599717089805, + "grad_norm": 0.12929469590085493, "learning_rate": 2.8460565138560212e-05, - "loss": 3.7031, + "loss": 1.8199, "step": 6280 }, { "epoch": 0.7789551961331103, - "grad_norm": 0.26819723161508713, + "grad_norm": 0.13426337891967977, "learning_rate": 2.830957603370883e-05, - "loss": 3.6867, + "loss": 1.8402, "step": 6285 }, { "epoch": 0.7795748900043379, - "grad_norm": 0.28385416525089446, + "grad_norm": 0.13768387356351205, "learning_rate": 2.815892244070455e-05, - "loss": 3.65, + "loss": 1.8809, "step": 6290 }, { "epoch": 0.7801945838755655, - "grad_norm": 0.26991854803623094, + "grad_norm": 0.1357497474261774, "learning_rate": 2.8008605064607528e-05, - "loss": 3.6922, + "loss": 1.8629, "step": 6295 }, { "epoch": 0.7808142777467931, - "grad_norm": 0.2669082534769859, + "grad_norm": 0.1380951933528559, "learning_rate": 2.7858624608904515e-05, - "loss": 3.6484, + "loss": 1.9027, "step": 6300 }, { "epoch": 0.7814339716180208, - "grad_norm": 0.268626321217136, + "grad_norm": 0.14271323304318345, "learning_rate": 2.7708981775505416e-05, - "loss": 3.6273, + "loss": 1.8309, "step": 6305 }, { "epoch": 0.7820536654892483, - "grad_norm": 0.2673209837044881, + "grad_norm": 0.1421418046102633, "learning_rate": 2.755967726474007e-05, - "loss": 3.7453, + "loss": 1.8266, "step": 6310 }, { "epoch": 0.7826733593604759, - "grad_norm": 0.26924472996042415, + "grad_norm": 0.14136246949703826, "learning_rate": 2.741071177535499e-05, - "loss": 3.6211, + "loss": 1.8238, "step": 6315 }, { "epoch": 0.7832930532317035, - "grad_norm": 0.2849096803234581, + "grad_norm": 0.1287667654475233, "learning_rate": 2.7262086004510023e-05, - "loss": 3.6344, + "loss": 1.8285, "step": 6320 }, { "epoch": 0.7839127471029311, - "grad_norm": 0.2685985355407374, + "grad_norm": 0.1363243768230169, "learning_rate": 2.7113800647775156e-05, - "loss": 3.7352, + "loss": 1.7789, "step": 6325 }, { "epoch": 0.7845324409741588, - "grad_norm": 0.2783351123681416, + "grad_norm": 0.1403800799180287, "learning_rate": 2.6965856399127232e-05, - "loss": 3.6258, + "loss": 1.8434, "step": 6330 }, { "epoch": 0.7851521348453864, - "grad_norm": 0.28724851570545096, + "grad_norm": 0.14473756811626665, "learning_rate": 2.6818253950946704e-05, - "loss": 3.6836, + "loss": 1.8477, "step": 6335 }, { "epoch": 0.785771828716614, - "grad_norm": 0.2644860014989463, + "grad_norm": 0.15235633299839338, "learning_rate": 2.6670993994014394e-05, - "loss": 3.6313, + "loss": 1.8754, "step": 6340 }, { "epoch": 0.7863915225878416, - "grad_norm": 0.2781026506109821, + "grad_norm": 0.13349212068682054, "learning_rate": 2.6524077217508292e-05, - "loss": 3.5844, + "loss": 1.8309, "step": 6345 }, { "epoch": 0.7870112164590692, - "grad_norm": 0.28371233138789237, + "grad_norm": 0.14063273951161426, "learning_rate": 2.63775043090002e-05, - "loss": 3.6945, + "loss": 1.8309, "step": 6350 }, { "epoch": 0.7876309103302969, - "grad_norm": 0.2676866471094794, + "grad_norm": 0.13554326916726495, "learning_rate": 2.623127595445274e-05, - "loss": 3.6805, + "loss": 1.7812, "step": 6355 }, { "epoch": 0.7882506042015245, - "grad_norm": 0.2804037276692652, + "grad_norm": 0.14617594465762834, "learning_rate": 2.6085392838215938e-05, - "loss": 3.7062, + "loss": 1.8766, "step": 6360 }, { "epoch": 0.7888702980727521, - "grad_norm": 0.2677893021797037, + "grad_norm": 0.14063474885828448, "learning_rate": 2.5939855643024136e-05, - "loss": 3.6555, + "loss": 1.8336, "step": 6365 }, { "epoch": 0.7894899919439796, - "grad_norm": 0.2726351905018635, + "grad_norm": 0.13631025672810165, "learning_rate": 2.5794665049992762e-05, - "loss": 3.6461, + "loss": 1.7414, "step": 6370 }, { "epoch": 0.7901096858152072, - "grad_norm": 0.2900933246217791, + "grad_norm": 0.13754469382158213, "learning_rate": 2.564982173861512e-05, - "loss": 3.675, + "loss": 1.8371, "step": 6375 }, { "epoch": 0.7907293796864349, - "grad_norm": 0.2838750953944875, + "grad_norm": 0.1373525895674881, "learning_rate": 2.5505326386759254e-05, - "loss": 3.6828, + "loss": 1.8539, "step": 6380 }, { "epoch": 0.7913490735576625, - "grad_norm": 0.2694169455941463, + "grad_norm": 0.1331753501860663, "learning_rate": 2.536117967066476e-05, - "loss": 3.6039, + "loss": 1.7859, "step": 6385 }, { "epoch": 0.7919687674288901, - "grad_norm": 0.26213248615454743, + "grad_norm": 0.1320713578647196, "learning_rate": 2.521738226493957e-05, - "loss": 3.5781, + "loss": 1.8109, "step": 6390 }, { "epoch": 0.7925884613001177, - "grad_norm": 0.260333774889878, + "grad_norm": 0.1383762808515008, "learning_rate": 2.50739348425569e-05, - "loss": 3.6797, + "loss": 1.7887, "step": 6395 }, { "epoch": 0.7932081551713454, - "grad_norm": 0.273921652812745, + "grad_norm": 0.14212639343800593, "learning_rate": 2.4930838074852026e-05, - "loss": 3.6359, + "loss": 1.8687, "step": 6400 }, { "epoch": 0.793827849042573, - "grad_norm": 0.26712516101630807, + "grad_norm": 0.1434562089301648, "learning_rate": 2.47880926315191e-05, - "loss": 3.6078, + "loss": 1.8445, "step": 6405 }, { "epoch": 0.7944475429138006, - "grad_norm": 0.2693988596271278, + "grad_norm": 0.13947244740426135, "learning_rate": 2.4645699180608127e-05, - "loss": 3.5023, + "loss": 1.8414, "step": 6410 }, { "epoch": 0.7950672367850282, - "grad_norm": 0.26822140724293647, + "grad_norm": 0.13237054669581869, "learning_rate": 2.45036583885218e-05, - "loss": 3.6805, + "loss": 1.8285, "step": 6415 }, { "epoch": 0.7956869306562558, - "grad_norm": 0.26510464834957814, + "grad_norm": 0.12680327271666422, "learning_rate": 2.4361970920012313e-05, - "loss": 3.5883, + "loss": 1.9254, "step": 6420 }, { "epoch": 0.7963066245274835, - "grad_norm": 0.27406804031937193, + "grad_norm": 0.1384006795821082, "learning_rate": 2.4220637438178317e-05, - "loss": 3.7289, + "loss": 1.8344, "step": 6425 }, { "epoch": 0.7969263183987111, - "grad_norm": 0.2783616044061502, + "grad_norm": 0.13493566470479693, "learning_rate": 2.4079658604461896e-05, - "loss": 3.6969, + "loss": 1.8164, "step": 6430 }, { "epoch": 0.7975460122699386, - "grad_norm": 0.27499764910548563, + "grad_norm": 0.13904637123993435, "learning_rate": 2.393903507864521e-05, - "loss": 3.75, + "loss": 1.8328, "step": 6435 }, { "epoch": 0.7981657061411662, - "grad_norm": 0.2810591412318894, + "grad_norm": 0.1363832017283468, "learning_rate": 2.3798767518847687e-05, - "loss": 3.5781, + "loss": 1.8594, "step": 6440 }, { "epoch": 0.7987854000123938, - "grad_norm": 0.2896760112930254, + "grad_norm": 0.13774336173857357, "learning_rate": 2.3658856581522804e-05, - "loss": 3.6688, + "loss": 1.8348, "step": 6445 }, { "epoch": 0.7994050938836215, - "grad_norm": 0.27371011149072194, + "grad_norm": 0.13986649111637628, "learning_rate": 2.3519302921455033e-05, - "loss": 3.6703, + "loss": 1.8211, "step": 6450 }, { "epoch": 0.8000247877548491, - "grad_norm": 0.27322521002152894, + "grad_norm": 0.13669443840005838, "learning_rate": 2.338010719175684e-05, - "loss": 3.4781, + "loss": 1.7703, "step": 6455 }, { "epoch": 0.8006444816260767, - "grad_norm": 0.27492595731794456, + "grad_norm": 0.13007801549105374, "learning_rate": 2.324127004386546e-05, - "loss": 3.6719, + "loss": 1.852, "step": 6460 }, { "epoch": 0.8012641754973043, - "grad_norm": 0.269841379201818, + "grad_norm": 0.13317280097098233, "learning_rate": 2.310279212754006e-05, - "loss": 3.6742, + "loss": 1.8387, "step": 6465 }, { "epoch": 0.801883869368532, - "grad_norm": 0.2631112072013383, + "grad_norm": 0.13720871260598752, "learning_rate": 2.296467409085853e-05, - "loss": 3.7, + "loss": 1.8492, "step": 6470 }, { "epoch": 0.8025035632397596, - "grad_norm": 0.26544959609730795, + "grad_norm": 0.13677530269293067, "learning_rate": 2.2826916580214632e-05, - "loss": 3.7, + "loss": 1.7738, "step": 6475 }, { "epoch": 0.8031232571109872, - "grad_norm": 0.27135804528233304, + "grad_norm": 0.1383749917899896, "learning_rate": 2.2689520240314755e-05, - "loss": 3.6781, + "loss": 1.8594, "step": 6480 }, { "epoch": 0.8037429509822148, - "grad_norm": 0.2653641462255707, + "grad_norm": 0.13930908242456305, "learning_rate": 2.2552485714175064e-05, - "loss": 3.6906, + "loss": 1.8461, "step": 6485 }, { "epoch": 0.8043626448534424, - "grad_norm": 0.28084635001022096, + "grad_norm": 0.1323961629115279, "learning_rate": 2.2415813643118356e-05, - "loss": 3.8, + "loss": 1.8254, "step": 6490 }, { "epoch": 0.8049823387246701, - "grad_norm": 0.27853856158817114, + "grad_norm": 0.13640022237848362, "learning_rate": 2.227950466677121e-05, - "loss": 3.5984, + "loss": 1.8008, "step": 6495 }, { "epoch": 0.8056020325958976, - "grad_norm": 0.2799281087638109, + "grad_norm": 0.13393523254678819, "learning_rate": 2.21435594230609e-05, - "loss": 3.6453, + "loss": 1.8148, "step": 6500 }, { "epoch": 0.8062217264671252, - "grad_norm": 0.2864919996155976, + "grad_norm": 0.13856918377591282, "learning_rate": 2.2007978548212425e-05, - "loss": 3.7008, + "loss": 1.7848, "step": 6505 }, { "epoch": 0.8068414203383528, - "grad_norm": 0.2749310670018361, + "grad_norm": 0.1467820578644545, "learning_rate": 2.1872762676745563e-05, - "loss": 3.7477, + "loss": 1.8687, "step": 6510 }, { "epoch": 0.8074611142095804, - "grad_norm": 0.2694571853494997, + "grad_norm": 0.13199784721447735, "learning_rate": 2.1737912441471787e-05, - "loss": 3.7133, + "loss": 1.7953, "step": 6515 }, { "epoch": 0.8080808080808081, - "grad_norm": 0.2618758865915513, + "grad_norm": 0.13521097371996904, "learning_rate": 2.160342847349144e-05, - "loss": 3.6539, + "loss": 1.8305, "step": 6520 }, { "epoch": 0.8087005019520357, - "grad_norm": 0.2733166377034909, + "grad_norm": 0.14340242068765546, "learning_rate": 2.1469311402190794e-05, - "loss": 3.8234, + "loss": 1.8246, "step": 6525 }, { "epoch": 0.8093201958232633, - "grad_norm": 0.26976744475111125, + "grad_norm": 0.13128847998828025, "learning_rate": 2.133556185523895e-05, - "loss": 3.6914, + "loss": 1.8203, "step": 6530 }, { "epoch": 0.8099398896944909, - "grad_norm": 0.2714542565616823, + "grad_norm": 0.13591682961831256, "learning_rate": 2.120218045858503e-05, - "loss": 3.7508, + "loss": 1.8637, "step": 6535 }, { "epoch": 0.8105595835657186, - "grad_norm": 0.2669981069887573, + "grad_norm": 0.14188949741756163, "learning_rate": 2.1069167836455228e-05, - "loss": 3.6297, + "loss": 1.823, "step": 6540 }, { "epoch": 0.8111792774369462, - "grad_norm": 0.2634714817754169, + "grad_norm": 0.14951548768393097, "learning_rate": 2.0936524611349795e-05, - "loss": 3.5695, + "loss": 1.7926, "step": 6545 }, { "epoch": 0.8117989713081738, - "grad_norm": 0.2800560817358675, + "grad_norm": 0.1463305546882131, "learning_rate": 2.080425140404029e-05, - "loss": 3.6023, + "loss": 1.8512, "step": 6550 }, { "epoch": 0.8124186651794014, - "grad_norm": 0.26936666732046705, + "grad_norm": 0.1370880820734347, "learning_rate": 2.0672348833566512e-05, - "loss": 3.7266, + "loss": 1.8703, "step": 6555 }, { "epoch": 0.813038359050629, - "grad_norm": 0.2782298698283211, + "grad_norm": 0.13265798046306984, "learning_rate": 2.0540817517233735e-05, - "loss": 3.7586, + "loss": 1.8309, "step": 6560 }, { "epoch": 0.8136580529218566, - "grad_norm": 0.27492647831446826, + "grad_norm": 0.1298504140699306, "learning_rate": 2.0409658070609738e-05, - "loss": 3.6, + "loss": 1.8246, "step": 6565 }, { "epoch": 0.8142777467930842, - "grad_norm": 0.2675626790996732, + "grad_norm": 0.13644051403236102, "learning_rate": 2.0278871107521936e-05, - "loss": 3.6469, + "loss": 1.8023, "step": 6570 }, { "epoch": 0.8148974406643118, - "grad_norm": 0.27782396820372657, + "grad_norm": 0.1397736858054863, "learning_rate": 2.014845724005453e-05, - "loss": 3.6766, + "loss": 1.8457, "step": 6575 }, { "epoch": 0.8155171345355394, - "grad_norm": 0.26540581302587424, + "grad_norm": 0.13415177608850457, "learning_rate": 2.0018417078545614e-05, - "loss": 3.7437, + "loss": 1.8301, "step": 6580 }, { "epoch": 0.816136828406767, - "grad_norm": 0.270984981257796, + "grad_norm": 0.13204065568378437, "learning_rate": 1.988875123158437e-05, - "loss": 3.6602, + "loss": 1.802, "step": 6585 }, { "epoch": 0.8167565222779947, - "grad_norm": 0.26379727764864, + "grad_norm": 0.1372522407858421, "learning_rate": 1.975946030600814e-05, - "loss": 3.7078, + "loss": 1.7984, "step": 6590 }, { "epoch": 0.8173762161492223, - "grad_norm": 0.28352021928974896, + "grad_norm": 0.1402427316379218, "learning_rate": 1.9630544906899672e-05, - "loss": 3.6172, + "loss": 1.7902, "step": 6595 }, { "epoch": 0.8179959100204499, - "grad_norm": 0.2929059747351139, + "grad_norm": 0.13504947552206878, "learning_rate": 1.9502005637584198e-05, - "loss": 3.7469, + "loss": 1.8594, "step": 6600 }, { "epoch": 0.8186156038916775, - "grad_norm": 0.2621616613487277, + "grad_norm": 0.13671633407154715, "learning_rate": 1.93738430996267e-05, - "loss": 3.6984, + "loss": 1.8473, "step": 6605 }, { "epoch": 0.8192352977629052, - "grad_norm": 0.2543683174187999, + "grad_norm": 0.14505886127883016, "learning_rate": 1.9246057892829038e-05, - "loss": 3.6883, + "loss": 1.8297, "step": 6610 }, { "epoch": 0.8198549916341328, - "grad_norm": 0.2676990427144938, + "grad_norm": 0.13744286662175678, "learning_rate": 1.9118650615227162e-05, - "loss": 3.6273, + "loss": 1.8254, "step": 6615 }, { "epoch": 0.8204746855053604, - "grad_norm": 0.26701918525313256, + "grad_norm": 0.1378170278876299, "learning_rate": 1.8991621863088315e-05, - "loss": 3.7531, + "loss": 1.8301, "step": 6620 }, { "epoch": 0.8210943793765879, - "grad_norm": 0.2735829555403975, + "grad_norm": 0.13425156743572028, "learning_rate": 1.886497223090823e-05, - "loss": 3.6688, + "loss": 1.8391, "step": 6625 }, { "epoch": 0.8217140732478155, - "grad_norm": 0.2586013500555307, + "grad_norm": 0.136925795997487, "learning_rate": 1.8738702311408352e-05, - "loss": 3.8141, + "loss": 1.8523, "step": 6630 }, { "epoch": 0.8223337671190432, - "grad_norm": 0.2682042573349656, + "grad_norm": 0.13555305693474248, "learning_rate": 1.8612812695533077e-05, - "loss": 3.668, + "loss": 1.9012, "step": 6635 }, { "epoch": 0.8229534609902708, - "grad_norm": 0.2798025021737809, + "grad_norm": 0.14021386873564234, "learning_rate": 1.8487303972446966e-05, - "loss": 3.6023, + "loss": 1.8211, "step": 6640 }, { "epoch": 0.8235731548614984, - "grad_norm": 0.2665439914850495, + "grad_norm": 0.13435340169695256, "learning_rate": 1.836217672953201e-05, - "loss": 3.7883, + "loss": 1.8738, "step": 6645 }, { "epoch": 0.824192848732726, - "grad_norm": 0.2782919179548854, + "grad_norm": 0.13825299288157739, "learning_rate": 1.8237431552384887e-05, - "loss": 3.5961, + "loss": 1.8168, "step": 6650 }, { "epoch": 0.8248125426039536, - "grad_norm": 0.2654144345290904, + "grad_norm": 0.1365578497978942, "learning_rate": 1.811306902481412e-05, - "loss": 3.7172, + "loss": 1.7988, "step": 6655 }, { "epoch": 0.8254322364751813, - "grad_norm": 0.27250768303187856, + "grad_norm": 0.14191275010866652, "learning_rate": 1.798908972883754e-05, - "loss": 3.6125, + "loss": 1.8621, "step": 6660 }, { "epoch": 0.8260519303464089, - "grad_norm": 0.2679383566934312, + "grad_norm": 0.14261769291356297, "learning_rate": 1.786549424467936e-05, - "loss": 3.7078, + "loss": 1.95, "step": 6665 }, { "epoch": 0.8266716242176365, - "grad_norm": 0.270235333933691, + "grad_norm": 0.1476188503443379, "learning_rate": 1.7742283150767614e-05, - "loss": 3.6305, + "loss": 1.9102, "step": 6670 }, { "epoch": 0.8272913180888641, - "grad_norm": 0.27558887009789595, + "grad_norm": 0.130356698831072, "learning_rate": 1.7619457023731355e-05, - "loss": 3.6703, + "loss": 1.8477, "step": 6675 }, { "epoch": 0.8279110119600918, - "grad_norm": 0.2683680281474502, + "grad_norm": 0.13967859191640508, "learning_rate": 1.7497016438397984e-05, - "loss": 3.6719, + "loss": 1.8336, "step": 6680 }, { "epoch": 0.8285307058313194, - "grad_norm": 0.2730520742473007, + "grad_norm": 0.1445625801135568, "learning_rate": 1.737496196779059e-05, - "loss": 3.6461, + "loss": 1.7988, "step": 6685 }, { "epoch": 0.8291503997025469, - "grad_norm": 0.2936511481286826, + "grad_norm": 0.13772616272180707, "learning_rate": 1.7253294183125223e-05, - "loss": 3.5742, + "loss": 1.8387, "step": 6690 }, { "epoch": 0.8297700935737745, - "grad_norm": 0.28855084456291186, + "grad_norm": 0.13927654860478775, "learning_rate": 1.7132013653808222e-05, - "loss": 3.6719, + "loss": 1.8344, "step": 6695 }, { "epoch": 0.8303897874450021, - "grad_norm": 0.2597746796894285, + "grad_norm": 0.1367478820289552, "learning_rate": 1.70111209474336e-05, - "loss": 3.6102, + "loss": 1.8711, "step": 6700 }, { "epoch": 0.8310094813162298, - "grad_norm": 0.27814469005937525, + "grad_norm": 0.13778305851562409, "learning_rate": 1.6890616629780364e-05, - "loss": 3.7359, + "loss": 1.8105, "step": 6705 }, { "epoch": 0.8316291751874574, - "grad_norm": 0.277325774566865, + "grad_norm": 0.13937638411283737, "learning_rate": 1.6770501264809778e-05, - "loss": 3.6242, + "loss": 1.8516, "step": 6710 }, { "epoch": 0.832248869058685, - "grad_norm": 0.2693934876663354, + "grad_norm": 0.14142715677631995, "learning_rate": 1.665077541466289e-05, - "loss": 3.7117, + "loss": 1.848, "step": 6715 }, { "epoch": 0.8328685629299126, - "grad_norm": 0.2663304575953115, + "grad_norm": 0.13547775806902937, "learning_rate": 1.6531439639657776e-05, - "loss": 3.7172, + "loss": 1.8512, "step": 6720 }, { "epoch": 0.8334882568011402, - "grad_norm": 0.2680430225306164, + "grad_norm": 0.13259567267028294, "learning_rate": 1.641249449828699e-05, - "loss": 3.7172, + "loss": 1.9055, "step": 6725 }, { "epoch": 0.8341079506723679, - "grad_norm": 0.28908859826508604, + "grad_norm": 0.13430439200938232, "learning_rate": 1.6293940547214905e-05, - "loss": 3.8102, + "loss": 1.743, "step": 6730 }, { "epoch": 0.8347276445435955, - "grad_norm": 0.2723242580613196, + "grad_norm": 0.14323647170257087, "learning_rate": 1.617577834127506e-05, - "loss": 3.7539, + "loss": 1.7785, "step": 6735 }, { "epoch": 0.8353473384148231, - "grad_norm": 0.26396208033874435, + "grad_norm": 0.13887966152843348, "learning_rate": 1.6058008433467698e-05, - "loss": 3.618, + "loss": 1.7848, "step": 6740 }, { "epoch": 0.8359670322860507, - "grad_norm": 0.2690220541626815, + "grad_norm": 0.12926377677093945, "learning_rate": 1.594063137495707e-05, - "loss": 3.807, + "loss": 1.8285, "step": 6745 }, { "epoch": 0.8365867261572784, - "grad_norm": 0.28665826800813404, + "grad_norm": 0.13985627925415461, "learning_rate": 1.582364771506891e-05, - "loss": 3.6242, + "loss": 1.8617, "step": 6750 }, { "epoch": 0.8372064200285059, - "grad_norm": 0.27065792545829087, + "grad_norm": 0.14071537495548447, "learning_rate": 1.570705800128781e-05, - "loss": 3.6313, + "loss": 1.798, "step": 6755 }, { "epoch": 0.8378261138997335, - "grad_norm": 0.2607272849200065, + "grad_norm": 0.14005570372758716, "learning_rate": 1.5590862779254746e-05, - "loss": 3.6539, + "loss": 1.807, "step": 6760 }, { "epoch": 0.8384458077709611, - "grad_norm": 0.2787653779356175, + "grad_norm": 0.1373432585304391, "learning_rate": 1.5475062592764346e-05, - "loss": 3.682, + "loss": 1.8082, "step": 6765 }, { "epoch": 0.8390655016421887, - "grad_norm": 0.27568525546442, + "grad_norm": 0.13648916484535767, "learning_rate": 1.5359657983762632e-05, - "loss": 3.6875, + "loss": 1.8035, "step": 6770 }, { "epoch": 0.8396851955134164, - "grad_norm": 0.2950892796467241, + "grad_norm": 0.13659166896958833, "learning_rate": 1.524464949234422e-05, - "loss": 3.6586, + "loss": 1.8188, "step": 6775 }, { "epoch": 0.840304889384644, - "grad_norm": 0.2764190380094884, + "grad_norm": 0.13285103674527363, "learning_rate": 1.5130037656749918e-05, - "loss": 3.5883, + "loss": 1.7789, "step": 6780 }, { "epoch": 0.8409245832558716, - "grad_norm": 0.26761586390041553, + "grad_norm": 0.13234536077305287, "learning_rate": 1.5015823013364183e-05, - "loss": 3.7297, + "loss": 1.8586, "step": 6785 }, { "epoch": 0.8415442771270992, - "grad_norm": 0.2912541676210491, + "grad_norm": 0.13222815236741142, "learning_rate": 1.4902006096712572e-05, - "loss": 3.5719, + "loss": 1.7836, "step": 6790 }, { "epoch": 0.8421639709983268, - "grad_norm": 0.27254549706733305, + "grad_norm": 0.13495804055854296, "learning_rate": 1.4788587439459323e-05, - "loss": 3.6945, + "loss": 1.8051, "step": 6795 }, { "epoch": 0.8427836648695545, - "grad_norm": 0.2620221582714912, + "grad_norm": 0.13630743372332232, "learning_rate": 1.4675567572404803e-05, - "loss": 3.7812, + "loss": 1.8426, "step": 6800 }, { "epoch": 0.8434033587407821, - "grad_norm": 0.289922770515983, + "grad_norm": 0.13164111897888933, "learning_rate": 1.4562947024483031e-05, - "loss": 3.6508, + "loss": 1.7629, "step": 6805 }, { "epoch": 0.8440230526120097, - "grad_norm": 0.2616919452385473, + "grad_norm": 0.13579891156886684, "learning_rate": 1.4450726322759223e-05, - "loss": 3.6617, + "loss": 1.8094, "step": 6810 }, { "epoch": 0.8446427464832372, - "grad_norm": 0.2847412415269478, + "grad_norm": 0.13553237661955567, "learning_rate": 1.4338905992427287e-05, - "loss": 3.6508, + "loss": 1.8687, "step": 6815 }, { "epoch": 0.8452624403544649, - "grad_norm": 0.2791306174520287, + "grad_norm": 0.14720160991862427, "learning_rate": 1.4227486556807412e-05, - "loss": 3.693, + "loss": 1.8113, "step": 6820 }, { "epoch": 0.8458821342256925, - "grad_norm": 0.27972768023171085, + "grad_norm": 0.1340030456048872, "learning_rate": 1.4116468537343585e-05, - "loss": 3.7508, + "loss": 1.8234, "step": 6825 }, { "epoch": 0.8465018280969201, - "grad_norm": 0.27808447135210174, + "grad_norm": 0.14666047167725307, "learning_rate": 1.4005852453601164e-05, - "loss": 3.6789, + "loss": 1.8621, "step": 6830 }, { "epoch": 0.8471215219681477, - "grad_norm": 0.26347659861052786, + "grad_norm": 0.13668555746075953, "learning_rate": 1.3895638823264446e-05, - "loss": 3.7016, + "loss": 1.848, "step": 6835 }, { "epoch": 0.8477412158393753, - "grad_norm": 0.2860968542159929, + "grad_norm": 0.1339418741254649, "learning_rate": 1.3785828162134252e-05, - "loss": 3.6992, + "loss": 1.834, "step": 6840 }, { "epoch": 0.848360909710603, - "grad_norm": 0.2711258258595594, + "grad_norm": 0.12912937848665926, "learning_rate": 1.367642098412546e-05, - "loss": 3.6148, + "loss": 1.8633, "step": 6845 }, { "epoch": 0.8489806035818306, - "grad_norm": 0.29543579306133466, + "grad_norm": 0.13665464782157216, "learning_rate": 1.3567417801264692e-05, - "loss": 3.6797, + "loss": 1.8273, "step": 6850 }, { "epoch": 0.8496002974530582, - "grad_norm": 0.26902475678532334, + "grad_norm": 0.14072966290231814, "learning_rate": 1.345881912368785e-05, - "loss": 3.6586, + "loss": 1.8637, "step": 6855 }, { "epoch": 0.8502199913242858, - "grad_norm": 0.25812138980263255, + "grad_norm": 0.139383826652858, "learning_rate": 1.3350625459637744e-05, - "loss": 3.7273, + "loss": 1.8113, "step": 6860 }, { "epoch": 0.8508396851955135, - "grad_norm": 0.2830901391479715, + "grad_norm": 0.1347401215990617, "learning_rate": 1.3242837315461732e-05, - "loss": 3.6719, + "loss": 1.8414, "step": 6865 }, { "epoch": 0.8514593790667411, - "grad_norm": 0.29031597061519, + "grad_norm": 0.14070257303503275, "learning_rate": 1.3135455195609325e-05, - "loss": 3.7188, + "loss": 1.8148, "step": 6870 }, { "epoch": 0.8520790729379687, - "grad_norm": 0.2728925825069603, + "grad_norm": 0.1317491498855389, "learning_rate": 1.3028479602629839e-05, - "loss": 3.6187, + "loss": 1.8465, "step": 6875 }, { "epoch": 0.8526987668091962, - "grad_norm": 0.2764576115800626, + "grad_norm": 0.13406734047768462, "learning_rate": 1.292191103717002e-05, - "loss": 3.7492, + "loss": 1.8473, "step": 6880 }, { "epoch": 0.8533184606804238, - "grad_norm": 0.2660636993335465, + "grad_norm": 0.13835653776561033, "learning_rate": 1.281574999797176e-05, - "loss": 3.5781, + "loss": 1.8797, "step": 6885 }, { "epoch": 0.8539381545516515, - "grad_norm": 0.28013707569590557, + "grad_norm": 0.1376875291822998, "learning_rate": 1.2709996981869699e-05, - "loss": 3.8156, + "loss": 1.7785, "step": 6890 }, { "epoch": 0.8545578484228791, - "grad_norm": 0.27399258918575153, + "grad_norm": 0.13107009499549113, "learning_rate": 1.2604652483788948e-05, - "loss": 3.575, + "loss": 1.8418, "step": 6895 }, { "epoch": 0.8551775422941067, - "grad_norm": 0.2731456828634934, + "grad_norm": 0.1439061599075864, "learning_rate": 1.2499716996742694e-05, - "loss": 3.5609, + "loss": 1.8465, "step": 6900 }, { "epoch": 0.8557972361653343, - "grad_norm": 0.26547407841220094, + "grad_norm": 0.13615157558842733, "learning_rate": 1.2395191011829999e-05, - "loss": 3.6492, + "loss": 1.8504, "step": 6905 }, { "epoch": 0.8564169300365619, - "grad_norm": 0.27355370923211936, + "grad_norm": 0.13740008939070456, "learning_rate": 1.2291075018233445e-05, - "loss": 3.6117, + "loss": 1.8633, "step": 6910 }, { "epoch": 0.8570366239077896, - "grad_norm": 0.2722276850451908, + "grad_norm": 0.1416192880003319, "learning_rate": 1.218736950321685e-05, - "loss": 3.7898, + "loss": 1.773, "step": 6915 }, { "epoch": 0.8576563177790172, - "grad_norm": 0.2711882167084163, + "grad_norm": 0.13447661457180524, "learning_rate": 1.2084074952122959e-05, - "loss": 3.6336, + "loss": 1.8652, "step": 6920 }, { "epoch": 0.8582760116502448, - "grad_norm": 0.2657796423409746, + "grad_norm": 0.13856068621293663, "learning_rate": 1.1981191848371287e-05, - "loss": 3.6664, + "loss": 1.8359, "step": 6925 }, { "epoch": 0.8588957055214724, - "grad_norm": 0.2672916194424286, + "grad_norm": 0.13705953632821657, "learning_rate": 1.1878720673455645e-05, - "loss": 3.782, + "loss": 1.8625, "step": 6930 }, { "epoch": 0.8595153993927, - "grad_norm": 0.2775303155331989, + "grad_norm": 0.14394551527944272, "learning_rate": 1.1776661906942099e-05, - "loss": 3.5602, + "loss": 1.8199, "step": 6935 }, { "epoch": 0.8601350932639277, - "grad_norm": 0.28179048470431456, + "grad_norm": 0.14385500400158302, "learning_rate": 1.1675016026466633e-05, - "loss": 3.6703, + "loss": 1.7957, "step": 6940 }, { "epoch": 0.8607547871351552, - "grad_norm": 0.2721522958602641, + "grad_norm": 0.13886377592614751, "learning_rate": 1.1573783507732893e-05, - "loss": 3.5633, + "loss": 1.8582, "step": 6945 }, { "epoch": 0.8613744810063828, - "grad_norm": 0.29373625990302477, + "grad_norm": 0.14377812118726135, "learning_rate": 1.1472964824510035e-05, - "loss": 3.6336, + "loss": 1.8348, "step": 6950 }, { "epoch": 0.8619941748776104, - "grad_norm": 0.28485020077333817, + "grad_norm": 0.13169196792358331, "learning_rate": 1.1372560448630376e-05, - "loss": 3.7281, + "loss": 1.8074, "step": 6955 }, { "epoch": 0.862613868748838, - "grad_norm": 0.29189846663791436, + "grad_norm": 0.13793344384621267, "learning_rate": 1.1272570849987351e-05, - "loss": 3.5758, + "loss": 1.8148, "step": 6960 }, { "epoch": 0.8632335626200657, - "grad_norm": 0.2727089581422032, + "grad_norm": 0.13768118568152948, "learning_rate": 1.1172996496533194e-05, - "loss": 3.5297, + "loss": 1.7746, "step": 6965 }, { "epoch": 0.8638532564912933, - "grad_norm": 0.27053269853938855, + "grad_norm": 0.1372503445366227, "learning_rate": 1.1073837854276826e-05, - "loss": 3.6812, + "loss": 1.8055, "step": 6970 }, { "epoch": 0.8644729503625209, - "grad_norm": 0.270257705336746, + "grad_norm": 0.14304524613047395, "learning_rate": 1.0975095387281587e-05, - "loss": 3.6547, + "loss": 1.8211, "step": 6975 }, { "epoch": 0.8650926442337485, - "grad_norm": 0.2709254834993136, + "grad_norm": 0.1341063071760674, "learning_rate": 1.087676955766318e-05, - "loss": 3.6617, + "loss": 1.8406, "step": 6980 }, { "epoch": 0.8657123381049762, - "grad_norm": 0.26787101687269665, + "grad_norm": 0.1369014351632983, "learning_rate": 1.0778860825587323e-05, - "loss": 3.7102, + "loss": 1.8379, "step": 6985 }, { "epoch": 0.8663320319762038, - "grad_norm": 0.25834763408294725, + "grad_norm": 0.13986771142969012, "learning_rate": 1.0681369649267836e-05, - "loss": 3.5211, + "loss": 1.8508, "step": 6990 }, { "epoch": 0.8669517258474314, - "grad_norm": 0.2772978504813741, + "grad_norm": 0.13871013013154287, "learning_rate": 1.0584296484964318e-05, - "loss": 3.6898, + "loss": 1.8746, "step": 6995 }, { "epoch": 0.867571419718659, - "grad_norm": 0.2672970881451939, + "grad_norm": 0.13536162110331093, "learning_rate": 1.0487641786980063e-05, - "loss": 3.6641, + "loss": 1.8387, "step": 7000 }, { "epoch": 0.8681911135898865, - "grad_norm": 0.27556223642075184, + "grad_norm": 0.13688561948696315, "learning_rate": 1.0391406007659964e-05, - "loss": 3.6203, + "loss": 1.7699, "step": 7005 }, { "epoch": 0.8688108074611142, - "grad_norm": 0.27886193281631777, + "grad_norm": 0.1379553663310307, "learning_rate": 1.0295589597388355e-05, - "loss": 3.6961, + "loss": 1.8297, "step": 7010 }, { "epoch": 0.8694305013323418, - "grad_norm": 0.2730774804147792, + "grad_norm": 0.1351138643848652, "learning_rate": 1.0200193004586922e-05, - "loss": 3.6781, + "loss": 1.8023, "step": 7015 }, { "epoch": 0.8700501952035694, - "grad_norm": 0.28077069068129035, + "grad_norm": 0.1352093710005571, "learning_rate": 1.0105216675712592e-05, - "loss": 3.6867, + "loss": 1.827, "step": 7020 }, { "epoch": 0.870669889074797, - "grad_norm": 0.2703429141608011, + "grad_norm": 0.14066110824426709, "learning_rate": 1.0010661055255488e-05, - "loss": 3.7523, + "loss": 1.8387, "step": 7025 }, { "epoch": 0.8712895829460247, - "grad_norm": 0.2623300718988336, + "grad_norm": 0.14804843977254, "learning_rate": 9.916526585736763e-06, - "loss": 3.5602, + "loss": 1.8551, "step": 7030 }, { "epoch": 0.8719092768172523, - "grad_norm": 0.27726179794151795, + "grad_norm": 0.13708112000990683, "learning_rate": 9.822813707706625e-06, - "loss": 3.5547, + "loss": 1.8391, "step": 7035 }, { "epoch": 0.8725289706884799, - "grad_norm": 0.273938282589832, + "grad_norm": 0.13764098800291605, "learning_rate": 9.729522859742191e-06, - "loss": 3.6773, + "loss": 1.8969, "step": 7040 }, { "epoch": 0.8731486645597075, - "grad_norm": 0.2626253209294718, + "grad_norm": 0.14148033017124853, "learning_rate": 9.636654478445494e-06, - "loss": 3.7633, + "loss": 1.8402, "step": 7045 }, { "epoch": 0.8737683584309351, - "grad_norm": 0.2709833878561481, + "grad_norm": 0.14686796135710695, "learning_rate": 9.544208998441428e-06, - "loss": 3.5719, + "loss": 1.8852, "step": 7050 }, { "epoch": 0.8743880523021628, - "grad_norm": 0.2688365440025024, + "grad_norm": 0.12880894029142761, "learning_rate": 9.452186852375678e-06, - "loss": 3.6688, + "loss": 1.8914, "step": 7055 }, { "epoch": 0.8750077461733904, - "grad_norm": 0.27760423536798684, + "grad_norm": 0.13556387811025775, "learning_rate": 9.360588470912756e-06, - "loss": 3.6711, + "loss": 1.8605, "step": 7060 }, { "epoch": 0.875627440044618, - "grad_norm": 0.2715108252926431, + "grad_norm": 0.142830646840542, "learning_rate": 9.269414282733924e-06, - "loss": 3.6313, + "loss": 1.8789, "step": 7065 }, { "epoch": 0.8762471339158455, - "grad_norm": 0.2820261278373799, + "grad_norm": 0.14113733132481313, "learning_rate": 9.178664714535235e-06, - "loss": 3.6602, + "loss": 1.8367, "step": 7070 }, { "epoch": 0.8768668277870731, - "grad_norm": 0.27157932121191247, + "grad_norm": 0.1380731856123245, "learning_rate": 9.088340191025501e-06, - "loss": 3.5891, + "loss": 1.859, "step": 7075 }, { "epoch": 0.8774865216583008, - "grad_norm": 0.26208545069180406, + "grad_norm": 0.13483398239386993, "learning_rate": 8.998441134924318e-06, - "loss": 3.8312, + "loss": 1.8738, "step": 7080 }, { "epoch": 0.8781062155295284, - "grad_norm": 0.2869454691805871, + "grad_norm": 0.13921045630136186, "learning_rate": 8.908967966960124e-06, - "loss": 3.6219, + "loss": 1.891, "step": 7085 }, { "epoch": 0.878725909400756, - "grad_norm": 0.2774369353780167, + "grad_norm": 0.14820885945750578, "learning_rate": 8.819921105868167e-06, - "loss": 3.7266, + "loss": 1.8176, "step": 7090 }, { "epoch": 0.8793456032719836, - "grad_norm": 0.2741228692101217, + "grad_norm": 0.13074251738222936, "learning_rate": 8.731300968388556e-06, - "loss": 3.7305, + "loss": 1.8359, "step": 7095 }, { "epoch": 0.8799652971432113, - "grad_norm": 0.27480937404345795, + "grad_norm": 0.13913021647845775, "learning_rate": 8.643107969264375e-06, - "loss": 3.6227, + "loss": 1.877, "step": 7100 }, { "epoch": 0.8805849910144389, - "grad_norm": 0.27131019564431835, + "grad_norm": 0.15160693761931768, "learning_rate": 8.555342521239662e-06, - "loss": 3.6844, + "loss": 1.8176, "step": 7105 }, { "epoch": 0.8812046848856665, - "grad_norm": 0.28727731987544974, + "grad_norm": 0.14319289119952988, "learning_rate": 8.468005035057536e-06, - "loss": 3.7648, + "loss": 1.8672, "step": 7110 }, { "epoch": 0.8818243787568941, - "grad_norm": 0.2772638259965115, + "grad_norm": 0.144700531964563, "learning_rate": 8.381095919458226e-06, - "loss": 3.5984, + "loss": 1.8527, "step": 7115 }, { "epoch": 0.8824440726281217, - "grad_norm": 0.2767919200596619, + "grad_norm": 0.14274877382833165, "learning_rate": 8.294615581177223e-06, - "loss": 3.6305, + "loss": 1.8434, "step": 7120 }, { "epoch": 0.8830637664993494, - "grad_norm": 0.275136982546241, + "grad_norm": 0.14240350591746134, "learning_rate": 8.208564424943288e-06, - "loss": 3.6969, + "loss": 1.8141, "step": 7125 }, { "epoch": 0.883683460370577, - "grad_norm": 0.2820723985783487, + "grad_norm": 0.13397820323823306, "learning_rate": 8.122942853476633e-06, - "loss": 3.6609, + "loss": 1.8988, "step": 7130 }, { "epoch": 0.8843031542418045, - "grad_norm": 0.2668391059350675, + "grad_norm": 0.14005322447201954, "learning_rate": 8.037751267487003e-06, - "loss": 3.657, + "loss": 1.8488, "step": 7135 }, { "epoch": 0.8849228481130321, - "grad_norm": 0.2721769752050762, + "grad_norm": 0.13988626846631608, "learning_rate": 7.952990065671817e-06, - "loss": 3.7703, + "loss": 1.8379, "step": 7140 }, { "epoch": 0.8855425419842597, - "grad_norm": 0.26632991478502493, + "grad_norm": 0.13867297243276785, "learning_rate": 7.868659644714294e-06, - "loss": 3.7148, + "loss": 1.8148, "step": 7145 }, { "epoch": 0.8861622358554874, - "grad_norm": 0.27844675439554767, + "grad_norm": 0.13631205393737392, "learning_rate": 7.784760399281554e-06, - "loss": 3.6, + "loss": 1.873, "step": 7150 }, { "epoch": 0.886781929726715, - "grad_norm": 0.27248180350893103, + "grad_norm": 0.13939741587279572, "learning_rate": 7.701292722022846e-06, - "loss": 3.7742, + "loss": 1.8008, "step": 7155 }, { "epoch": 0.8874016235979426, - "grad_norm": 0.29819852968664107, + "grad_norm": 0.13560938210869783, "learning_rate": 7.618257003567675e-06, - "loss": 3.6305, + "loss": 1.8172, "step": 7160 }, { "epoch": 0.8880213174691702, - "grad_norm": 0.2731747004186133, + "grad_norm": 0.13381257956234077, "learning_rate": 7.5356536325239755e-06, - "loss": 3.7445, + "loss": 1.773, "step": 7165 }, { "epoch": 0.8886410113403979, - "grad_norm": 0.2862676566426175, + "grad_norm": 0.1356484036949059, "learning_rate": 7.453482995476291e-06, - "loss": 3.6805, + "loss": 1.8176, "step": 7170 }, { "epoch": 0.8892607052116255, - "grad_norm": 0.27154514041437766, + "grad_norm": 0.13704309120390254, "learning_rate": 7.371745476983982e-06, - "loss": 3.5953, + "loss": 1.7602, "step": 7175 }, { "epoch": 0.8898803990828531, - "grad_norm": 0.271667748213158, + "grad_norm": 0.13713003743053534, "learning_rate": 7.2904414595793556e-06, - "loss": 3.5953, + "loss": 1.8395, "step": 7180 }, { "epoch": 0.8905000929540807, - "grad_norm": 0.25748281840553405, + "grad_norm": 0.13231072977298505, "learning_rate": 7.209571323765973e-06, - "loss": 3.7242, + "loss": 1.7906, "step": 7185 }, { "epoch": 0.8911197868253083, - "grad_norm": 0.27283419697356404, + "grad_norm": 0.1334467482508184, "learning_rate": 7.129135448016821e-06, - "loss": 3.6867, + "loss": 1.8305, "step": 7190 }, { "epoch": 0.8917394806965359, - "grad_norm": 0.2747934640984711, + "grad_norm": 0.1315592276353881, "learning_rate": 7.049134208772545e-06, - "loss": 3.6289, + "loss": 1.8535, "step": 7195 }, { "epoch": 0.8923591745677635, - "grad_norm": 0.2593985093047879, + "grad_norm": 0.13605396800853758, "learning_rate": 6.969567980439706e-06, - "loss": 3.7414, + "loss": 1.8613, "step": 7200 }, { "epoch": 0.8929788684389911, - "grad_norm": 0.27123165084229295, + "grad_norm": 0.13865586242794894, "learning_rate": 6.890437135388939e-06, - "loss": 3.5883, + "loss": 1.8625, "step": 7205 }, { "epoch": 0.8935985623102187, - "grad_norm": 0.284080180234527, + "grad_norm": 0.13560722459607658, "learning_rate": 6.8117420439533615e-06, - "loss": 3.6562, + "loss": 1.8523, "step": 7210 }, { "epoch": 0.8942182561814463, - "grad_norm": 0.26671975217197447, + "grad_norm": 0.13599381595733698, "learning_rate": 6.733483074426716e-06, - "loss": 3.6891, + "loss": 1.8551, "step": 7215 }, { "epoch": 0.894837950052674, - "grad_norm": 0.2781899517093553, + "grad_norm": 0.14264115635666896, "learning_rate": 6.655660593061719e-06, - "loss": 3.5781, + "loss": 1.8402, "step": 7220 }, { "epoch": 0.8954576439239016, - "grad_norm": 0.27489213812878466, + "grad_norm": 0.13748490683095682, "learning_rate": 6.578274964068298e-06, - "loss": 3.6289, + "loss": 1.8867, "step": 7225 }, { "epoch": 0.8960773377951292, - "grad_norm": 0.2711321321359575, + "grad_norm": 0.13058170557642237, "learning_rate": 6.50132654961193e-06, - "loss": 3.7266, + "loss": 1.8395, "step": 7230 }, { "epoch": 0.8966970316663568, - "grad_norm": 0.28198864613751984, + "grad_norm": 0.14230736706254918, "learning_rate": 6.424815709811871e-06, - "loss": 3.7094, + "loss": 1.8887, "step": 7235 }, { "epoch": 0.8973167255375845, - "grad_norm": 0.2769375687826022, + "grad_norm": 0.1407417474259601, "learning_rate": 6.3487428027395715e-06, - "loss": 3.6281, + "loss": 1.7859, "step": 7240 }, { "epoch": 0.8979364194088121, - "grad_norm": 0.2728007227425074, + "grad_norm": 0.13411033665330713, "learning_rate": 6.273108184416943e-06, - "loss": 3.6445, + "loss": 1.8422, "step": 7245 }, { "epoch": 0.8985561132800397, - "grad_norm": 0.27970919433014235, + "grad_norm": 0.13860443760395488, "learning_rate": 6.197912208814694e-06, - "loss": 3.6945, + "loss": 1.8723, "step": 7250 }, { "epoch": 0.8991758071512673, - "grad_norm": 0.28984382958053845, + "grad_norm": 0.1516944965846428, "learning_rate": 6.123155227850708e-06, - "loss": 3.6227, + "loss": 1.8492, "step": 7255 }, { "epoch": 0.8997955010224948, - "grad_norm": 0.26511116487605124, + "grad_norm": 0.13130989891564823, "learning_rate": 6.048837591388301e-06, - "loss": 3.575, + "loss": 1.8359, "step": 7260 }, { "epoch": 0.9004151948937225, - "grad_norm": 0.2591993698926708, + "grad_norm": 0.13496732630623953, "learning_rate": 5.974959647234746e-06, - "loss": 3.6633, + "loss": 1.8262, "step": 7265 }, { "epoch": 0.9010348887649501, - "grad_norm": 0.2631505045251582, + "grad_norm": 0.1398604330990237, "learning_rate": 5.901521741139482e-06, - "loss": 3.7563, + "loss": 1.8078, "step": 7270 }, { "epoch": 0.9016545826361777, - "grad_norm": 0.26728853198677144, + "grad_norm": 0.13392388488639656, "learning_rate": 5.828524216792586e-06, - "loss": 3.6656, + "loss": 1.7805, "step": 7275 }, { "epoch": 0.9022742765074053, - "grad_norm": 0.27823983832771093, + "grad_norm": 0.14015109449246452, "learning_rate": 5.75596741582316e-06, - "loss": 3.593, + "loss": 1.85, "step": 7280 }, { "epoch": 0.902893970378633, - "grad_norm": 0.26263872602397725, + "grad_norm": 0.1366604111163144, "learning_rate": 5.6838516777977135e-06, - "loss": 3.7344, + "loss": 1.8242, "step": 7285 }, { "epoch": 0.9035136642498606, - "grad_norm": 0.2794394137406013, + "grad_norm": 0.13873999414384125, "learning_rate": 5.6121773402185385e-06, - "loss": 3.7031, + "loss": 1.784, "step": 7290 }, { "epoch": 0.9041333581210882, - "grad_norm": 0.2733651674052135, + "grad_norm": 0.12957247714445108, "learning_rate": 5.540944738522203e-06, - "loss": 3.7102, + "loss": 1.8805, "step": 7295 }, { "epoch": 0.9047530519923158, - "grad_norm": 0.27487780611573903, + "grad_norm": 0.1375066888534788, "learning_rate": 5.470154206077949e-06, - "loss": 3.5016, + "loss": 1.8293, "step": 7300 }, { "epoch": 0.9053727458635434, - "grad_norm": 0.27113279352532604, + "grad_norm": 0.14306785500804953, "learning_rate": 5.3998060741861314e-06, - "loss": 3.6992, + "loss": 1.9141, "step": 7305 }, { "epoch": 0.9059924397347711, - "grad_norm": 0.2727201470546044, + "grad_norm": 0.13984693487259178, "learning_rate": 5.329900672076637e-06, - "loss": 3.6477, + "loss": 1.8313, "step": 7310 }, { "epoch": 0.9066121336059987, - "grad_norm": 0.27094646523483445, + "grad_norm": 0.132439352315789, "learning_rate": 5.260438326907413e-06, - "loss": 3.6203, + "loss": 1.8492, "step": 7315 }, { "epoch": 0.9072318274772263, - "grad_norm": 0.2631067653335526, + "grad_norm": 0.13944473906087765, "learning_rate": 5.191419363762873e-06, - "loss": 3.682, + "loss": 1.8215, "step": 7320 }, { "epoch": 0.9078515213484538, - "grad_norm": 0.28831558778089184, + "grad_norm": 0.13074588703804052, "learning_rate": 5.122844105652402e-06, - "loss": 3.6875, + "loss": 1.9125, "step": 7325 }, { "epoch": 0.9084712152196814, - "grad_norm": 0.2801500582422861, + "grad_norm": 0.14840324829722784, "learning_rate": 5.054712873508827e-06, - "loss": 3.8047, + "loss": 1.8781, "step": 7330 }, { "epoch": 0.9090909090909091, - "grad_norm": 0.2723292619898439, + "grad_norm": 0.13734906506051153, "learning_rate": 4.987025986186966e-06, - "loss": 3.7938, + "loss": 1.8535, "step": 7335 }, { "epoch": 0.9097106029621367, - "grad_norm": 0.27867354050039317, + "grad_norm": 0.1390678545194565, "learning_rate": 4.919783760462082e-06, - "loss": 3.5711, + "loss": 1.7563, "step": 7340 }, { "epoch": 0.9103302968333643, - "grad_norm": 0.27019084773565777, + "grad_norm": 0.1345470887887599, "learning_rate": 4.85298651102839e-06, - "loss": 3.7297, + "loss": 1.875, "step": 7345 }, { "epoch": 0.9109499907045919, - "grad_norm": 0.2811769070748566, + "grad_norm": 0.14077957222738707, "learning_rate": 4.786634550497637e-06, - "loss": 3.7406, + "loss": 1.8637, "step": 7350 }, { "epoch": 0.9115696845758195, - "grad_norm": 0.2737420543087705, + "grad_norm": 0.14011299364588545, "learning_rate": 4.720728189397628e-06, - "loss": 3.6914, + "loss": 1.7785, "step": 7355 }, { "epoch": 0.9121893784470472, - "grad_norm": 0.2752784244407715, + "grad_norm": 0.1407516641368298, "learning_rate": 4.655267736170732e-06, - "loss": 3.6805, + "loss": 1.8277, "step": 7360 }, { "epoch": 0.9128090723182748, - "grad_norm": 0.27025007110186383, + "grad_norm": 0.14111369095031798, "learning_rate": 4.5902534971724806e-06, - "loss": 3.8078, + "loss": 1.8016, "step": 7365 }, { "epoch": 0.9134287661895024, - "grad_norm": 0.27393454394470435, + "grad_norm": 0.13620410776159086, "learning_rate": 4.525685776670108e-06, - "loss": 3.6594, + "loss": 1.7734, "step": 7370 }, { "epoch": 0.91404846006073, - "grad_norm": 0.2727325638604466, + "grad_norm": 0.14478479182293405, "learning_rate": 4.46156487684114e-06, - "loss": 3.657, + "loss": 1.8207, "step": 7375 }, { "epoch": 0.9146681539319577, - "grad_norm": 0.27444588062659564, + "grad_norm": 0.1404958421396234, "learning_rate": 4.397891097771989e-06, - "loss": 3.6883, + "loss": 1.8289, "step": 7380 }, { "epoch": 0.9152878478031852, - "grad_norm": 0.2706709319540534, + "grad_norm": 0.13977123482192788, "learning_rate": 4.334664737456539e-06, - "loss": 3.6016, + "loss": 1.8039, "step": 7385 }, { "epoch": 0.9159075416744128, - "grad_norm": 0.27981114336986884, + "grad_norm": 0.1385005141537386, "learning_rate": 4.271886091794719e-06, - "loss": 3.6891, + "loss": 1.8824, "step": 7390 }, { "epoch": 0.9165272355456404, - "grad_norm": 0.28334376858050014, + "grad_norm": 0.1338052818221739, "learning_rate": 4.209555454591197e-06, - "loss": 3.7117, + "loss": 1.7785, "step": 7395 }, { "epoch": 0.917146929416868, - "grad_norm": 0.28061498567931165, + "grad_norm": 0.13855858216006417, "learning_rate": 4.147673117553896e-06, - "loss": 3.5844, + "loss": 1.8516, "step": 7400 }, { "epoch": 0.9177666232880957, - "grad_norm": 0.27056248644458036, + "grad_norm": 0.13709575865497378, "learning_rate": 4.086239370292755e-06, - "loss": 3.675, + "loss": 1.8473, "step": 7405 }, { "epoch": 0.9183863171593233, - "grad_norm": 0.2623174731789405, + "grad_norm": 0.14296857282320086, "learning_rate": 4.025254500318265e-06, - "loss": 3.7977, + "loss": 1.8418, "step": 7410 }, { "epoch": 0.9190060110305509, - "grad_norm": 0.2668902359387414, + "grad_norm": 0.13897241781853462, "learning_rate": 3.964718793040178e-06, - "loss": 3.7039, + "loss": 1.816, "step": 7415 }, { "epoch": 0.9196257049017785, - "grad_norm": 0.2682284186560273, + "grad_norm": 0.1412624511626614, "learning_rate": 3.904632531766195e-06, - "loss": 3.6523, + "loss": 1.7785, "step": 7420 }, { "epoch": 0.9202453987730062, - "grad_norm": 0.2817539903708584, + "grad_norm": 0.14569448140912444, "learning_rate": 3.84499599770054e-06, - "loss": 3.7437, + "loss": 1.8477, "step": 7425 }, { "epoch": 0.9208650926442338, - "grad_norm": 0.28262725368468417, + "grad_norm": 0.13269332412014898, "learning_rate": 3.785809469942758e-06, - "loss": 3.6078, + "loss": 1.8219, "step": 7430 }, { "epoch": 0.9214847865154614, - "grad_norm": 0.2872841560550761, + "grad_norm": 0.14219480383775535, "learning_rate": 3.727073225486344e-06, - "loss": 3.5781, + "loss": 1.8141, "step": 7435 }, { "epoch": 0.922104480386689, - "grad_norm": 0.26171137152403856, + "grad_norm": 0.13581718669588042, "learning_rate": 3.6687875392174665e-06, - "loss": 3.6469, + "loss": 1.8223, "step": 7440 }, { "epoch": 0.9227241742579166, - "grad_norm": 0.2721412298030815, + "grad_norm": 0.1441209126264499, "learning_rate": 3.61095268391366e-06, - "loss": 3.6672, + "loss": 1.832, "step": 7445 }, { "epoch": 0.9233438681291442, - "grad_norm": 0.2689383341336418, + "grad_norm": 0.14114396455281378, "learning_rate": 3.5535689302426236e-06, - "loss": 3.6063, + "loss": 1.8387, "step": 7450 }, { "epoch": 0.9239635620003718, - "grad_norm": 0.27165413446195674, + "grad_norm": 0.1387853984081508, "learning_rate": 3.496636546760812e-06, - "loss": 3.6414, + "loss": 1.7766, "step": 7455 }, { "epoch": 0.9245832558715994, - "grad_norm": 0.29153732834104146, + "grad_norm": 0.1395682007656557, "learning_rate": 3.4401557999123146e-06, - "loss": 3.5938, + "loss": 1.8605, "step": 7460 }, { "epoch": 0.925202949742827, - "grad_norm": 0.26713819753828977, + "grad_norm": 0.13184826376020614, "learning_rate": 3.3841269540275553e-06, - "loss": 3.6484, + "loss": 1.8656, "step": 7465 }, { "epoch": 0.9258226436140546, - "grad_norm": 0.2814855237337874, + "grad_norm": 0.13132758063469174, "learning_rate": 3.3285502713220617e-06, - "loss": 3.6789, + "loss": 1.9137, "step": 7470 }, { "epoch": 0.9264423374852823, - "grad_norm": 0.2743082315201234, + "grad_norm": 0.13838010265846035, "learning_rate": 3.2734260118952307e-06, - "loss": 3.5234, + "loss": 1.8414, "step": 7475 }, { "epoch": 0.9270620313565099, - "grad_norm": 0.26820958735170797, + "grad_norm": 0.13791135376377014, "learning_rate": 3.218754433729065e-06, - "loss": 3.6195, + "loss": 1.8473, "step": 7480 }, { "epoch": 0.9276817252277375, - "grad_norm": 0.28914257172709573, + "grad_norm": 0.13341582163949117, "learning_rate": 3.1645357926870955e-06, - "loss": 3.6086, + "loss": 1.8469, "step": 7485 }, { "epoch": 0.9283014190989651, - "grad_norm": 0.2756007960053005, + "grad_norm": 0.12360153082389551, "learning_rate": 3.110770342513036e-06, - "loss": 3.6508, + "loss": 1.8477, "step": 7490 }, { "epoch": 0.9289211129701928, - "grad_norm": 0.27707405209101954, + "grad_norm": 0.14160081548498316, "learning_rate": 3.057458334829699e-06, - "loss": 3.6102, + "loss": 1.8289, "step": 7495 }, { "epoch": 0.9295408068414204, - "grad_norm": 0.288026781525324, + "grad_norm": 0.13906615538183184, "learning_rate": 3.0046000191377934e-06, - "loss": 3.7766, + "loss": 1.8344, "step": 7500 }, { "epoch": 0.930160500712648, - "grad_norm": 0.2683433788201112, + "grad_norm": 0.13640236310813858, "learning_rate": 2.9521956428146923e-06, - "loss": 3.7789, + "loss": 1.7816, "step": 7505 }, { "epoch": 0.9307801945838756, - "grad_norm": 0.29027153699942254, + "grad_norm": 0.13041948410128965, "learning_rate": 2.9002454511133923e-06, - "loss": 3.6172, + "loss": 1.8438, "step": 7510 }, { "epoch": 0.9313998884551031, - "grad_norm": 0.2667413134023255, + "grad_norm": 0.13181252651115552, "learning_rate": 2.8487496871612453e-06, - "loss": 3.5625, + "loss": 1.8445, "step": 7515 }, { "epoch": 0.9320195823263308, - "grad_norm": 0.2704211917414998, + "grad_norm": 0.13523830165567224, "learning_rate": 2.7977085919589254e-06, - "loss": 3.6969, + "loss": 1.8453, "step": 7520 }, { "epoch": 0.9326392761975584, - "grad_norm": 0.2709804732784805, + "grad_norm": 0.13314991745111548, "learning_rate": 2.7471224043792098e-06, - "loss": 3.6844, + "loss": 1.8387, "step": 7525 }, { "epoch": 0.933258970068786, - "grad_norm": 0.27017634645825134, + "grad_norm": 0.13734870109010086, "learning_rate": 2.6969913611659457e-06, - "loss": 3.6047, + "loss": 1.907, "step": 7530 }, { "epoch": 0.9338786639400136, - "grad_norm": 0.28977647686803293, + "grad_norm": 0.1353394145014779, "learning_rate": 2.6473156969328503e-06, - "loss": 3.6891, + "loss": 1.7922, "step": 7535 }, { "epoch": 0.9344983578112412, - "grad_norm": 0.26221324731263035, + "grad_norm": 0.13982214073105317, "learning_rate": 2.5980956441625236e-06, - "loss": 3.7344, + "loss": 1.8031, "step": 7540 }, { "epoch": 0.9351180516824689, - "grad_norm": 0.27043105541626966, + "grad_norm": 0.13754359803353158, "learning_rate": 2.5493314332052377e-06, - "loss": 3.7297, + "loss": 1.8371, "step": 7545 }, { "epoch": 0.9357377455536965, - "grad_norm": 0.2645303314795488, + "grad_norm": 0.13846264686921053, "learning_rate": 2.501023292277971e-06, - "loss": 3.6734, + "loss": 1.85, "step": 7550 }, { "epoch": 0.9363574394249241, - "grad_norm": 0.2872448404216296, + "grad_norm": 0.13537888334342718, "learning_rate": 2.453171447463265e-06, - "loss": 3.5742, + "loss": 1.8641, "step": 7555 }, { "epoch": 0.9369771332961517, - "grad_norm": 0.28659408753141385, + "grad_norm": 0.14209971212182437, "learning_rate": 2.4057761227081923e-06, - "loss": 3.6891, + "loss": 1.7953, "step": 7560 }, { "epoch": 0.9375968271673794, - "grad_norm": 0.2878587690908836, + "grad_norm": 0.13889335296014865, "learning_rate": 2.358837539823311e-06, - "loss": 3.7313, + "loss": 1.8383, "step": 7565 }, { "epoch": 0.938216521038607, - "grad_norm": 0.2653905160082446, + "grad_norm": 0.13458314070605037, "learning_rate": 2.3123559184816344e-06, - "loss": 3.6633, + "loss": 1.7836, "step": 7570 }, { "epoch": 0.9388362149098345, - "grad_norm": 0.2735739394398778, + "grad_norm": 0.13737327467603194, "learning_rate": 2.2663314762175647e-06, - "loss": 3.8047, + "loss": 1.875, "step": 7575 }, { "epoch": 0.9394559087810621, - "grad_norm": 0.28433844946547354, + "grad_norm": 0.13624035692346814, "learning_rate": 2.2207644284259256e-06, - "loss": 3.6414, + "loss": 1.8383, "step": 7580 }, { "epoch": 0.9400756026522897, - "grad_norm": 0.27475684476595646, + "grad_norm": 0.13373715255768495, "learning_rate": 2.1756549883609313e-06, - "loss": 3.7437, + "loss": 1.8426, "step": 7585 }, { "epoch": 0.9406952965235174, - "grad_norm": 0.2780996306984348, + "grad_norm": 0.1328307541501429, "learning_rate": 2.131003367135154e-06, - "loss": 3.6781, + "loss": 1.8473, "step": 7590 }, { "epoch": 0.941314990394745, - "grad_norm": 0.2583439651780764, + "grad_norm": 0.1365831528128905, "learning_rate": 2.086809773718601e-06, - "loss": 3.675, + "loss": 1.8145, "step": 7595 }, { "epoch": 0.9419346842659726, - "grad_norm": 0.27909927819630376, + "grad_norm": 0.13862135902790484, "learning_rate": 2.0430744149377177e-06, - "loss": 3.6602, + "loss": 1.8508, "step": 7600 }, { "epoch": 0.9425543781372002, - "grad_norm": 0.2840758366639068, + "grad_norm": 0.14064173658635798, "learning_rate": 1.999797495474365e-06, - "loss": 3.768, + "loss": 1.8242, "step": 7605 }, { "epoch": 0.9431740720084278, - "grad_norm": 0.28264929875319833, + "grad_norm": 0.13911440336191405, "learning_rate": 1.9569792178649405e-06, - "loss": 3.6391, + "loss": 1.8699, "step": 7610 }, { "epoch": 0.9437937658796555, - "grad_norm": 0.2699490069013214, + "grad_norm": 0.13490137157085297, "learning_rate": 1.914619782499383e-06, - "loss": 3.7398, + "loss": 1.891, "step": 7615 }, { "epoch": 0.9444134597508831, - "grad_norm": 0.28918121210572356, + "grad_norm": 0.14290819661650486, "learning_rate": 1.8727193876202143e-06, - "loss": 3.5922, + "loss": 1.8387, "step": 7620 }, { "epoch": 0.9450331536221107, - "grad_norm": 0.26447781129623193, + "grad_norm": 0.13470735057128577, "learning_rate": 1.8312782293216979e-06, - "loss": 3.6164, + "loss": 1.834, "step": 7625 }, { "epoch": 0.9456528474933383, - "grad_norm": 0.2655529192807083, + "grad_norm": 0.13573866020248607, "learning_rate": 1.7902965015488381e-06, - "loss": 3.7133, + "loss": 1.8918, "step": 7630 }, { "epoch": 0.946272541364566, - "grad_norm": 0.26970685815626805, + "grad_norm": 0.14082077621818678, "learning_rate": 1.749774396096482e-06, - "loss": 3.7609, + "loss": 1.8762, "step": 7635 }, { "epoch": 0.9468922352357935, - "grad_norm": 0.27989040137666626, + "grad_norm": 0.13993020823079827, "learning_rate": 1.709712102608463e-06, - "loss": 3.6578, + "loss": 1.8129, "step": 7640 }, { "epoch": 0.9475119291070211, - "grad_norm": 0.2768738052873664, + "grad_norm": 0.13932041357054537, "learning_rate": 1.6701098085767031e-06, - "loss": 3.5641, + "loss": 1.8062, "step": 7645 }, { "epoch": 0.9481316229782487, - "grad_norm": 0.26954554955538307, + "grad_norm": 0.13250968165271637, "learning_rate": 1.630967699340269e-06, - "loss": 3.5883, + "loss": 1.8555, "step": 7650 }, { "epoch": 0.9487513168494763, - "grad_norm": 0.2672662342631292, + "grad_norm": 0.1322252771091484, "learning_rate": 1.5922859580846271e-06, - "loss": 3.7078, + "loss": 1.8555, "step": 7655 }, { "epoch": 0.949371010720704, - "grad_norm": 0.2703231564176733, + "grad_norm": 0.14227157372874508, "learning_rate": 1.5540647658406682e-06, - "loss": 3.6016, + "loss": 1.8102, "step": 7660 }, { "epoch": 0.9499907045919316, - "grad_norm": 0.2783617624945699, + "grad_norm": 0.14627459669348514, "learning_rate": 1.5163043014839284e-06, - "loss": 3.5859, + "loss": 1.7781, "step": 7665 }, { "epoch": 0.9506103984631592, - "grad_norm": 0.27960467541396195, + "grad_norm": 0.1365770378516687, "learning_rate": 1.479004741733736e-06, - "loss": 3.7055, + "loss": 1.859, "step": 7670 }, { "epoch": 0.9512300923343868, - "grad_norm": 0.27926562477154315, + "grad_norm": 0.1351641368471888, "learning_rate": 1.4421662611523667e-06, - "loss": 3.675, + "loss": 1.748, "step": 7675 }, { "epoch": 0.9518497862056144, - "grad_norm": 0.2717062431835573, + "grad_norm": 0.14013923132334566, "learning_rate": 1.4057890321442558e-06, - "loss": 3.6898, + "loss": 1.8434, "step": 7680 }, { "epoch": 0.9524694800768421, - "grad_norm": 0.28087453505860577, + "grad_norm": 0.14025363267615307, "learning_rate": 1.3698732249551648e-06, - "loss": 3.7852, + "loss": 1.7656, "step": 7685 }, { "epoch": 0.9530891739480697, - "grad_norm": 0.28704266677304857, + "grad_norm": 0.13360928888669812, "learning_rate": 1.3344190076714059e-06, - "loss": 3.675, + "loss": 1.9168, "step": 7690 }, { "epoch": 0.9537088678192973, - "grad_norm": 0.25837975350935316, + "grad_norm": 0.13388017546109943, "learning_rate": 1.2994265462190513e-06, - "loss": 3.7273, + "loss": 1.7984, "step": 7695 }, { "epoch": 0.9543285616905249, - "grad_norm": 0.2673603885607993, + "grad_norm": 0.1355624389649471, "learning_rate": 1.2648960043631474e-06, - "loss": 3.7102, + "loss": 1.8902, "step": 7700 }, { "epoch": 0.9549482555617524, - "grad_norm": 0.27804443443459514, + "grad_norm": 0.14142442625050944, "learning_rate": 1.230827543706925e-06, - "loss": 3.6562, + "loss": 1.791, "step": 7705 }, { "epoch": 0.9555679494329801, - "grad_norm": 0.26707459118833776, + "grad_norm": 0.13651526693745009, "learning_rate": 1.1972213236911112e-06, - "loss": 3.6453, + "loss": 1.8707, "step": 7710 }, { "epoch": 0.9561876433042077, - "grad_norm": 0.2724309585648748, + "grad_norm": 0.1330434243692908, "learning_rate": 1.1640775015931304e-06, - "loss": 3.5938, + "loss": 1.8598, "step": 7715 }, { "epoch": 0.9568073371754353, - "grad_norm": 0.2717087150472206, + "grad_norm": 0.13280703904278598, "learning_rate": 1.1313962325263717e-06, - "loss": 3.5758, + "loss": 1.8473, "step": 7720 }, { "epoch": 0.9574270310466629, - "grad_norm": 0.27390839096167285, + "grad_norm": 0.1371503651919113, "learning_rate": 1.0991776694394883e-06, - "loss": 3.6859, + "loss": 1.8715, "step": 7725 }, { "epoch": 0.9580467249178906, - "grad_norm": 0.2664273842532675, + "grad_norm": 0.13562632762998766, "learning_rate": 1.0674219631156334e-06, - "loss": 3.6953, + "loss": 1.8465, "step": 7730 }, { "epoch": 0.9586664187891182, - "grad_norm": 0.27668769079201644, + "grad_norm": 0.1359086084771363, "learning_rate": 1.0361292621718145e-06, - "loss": 3.7211, + "loss": 1.9387, "step": 7735 }, { "epoch": 0.9592861126603458, - "grad_norm": 0.29312759026667695, + "grad_norm": 0.13701787879508073, "learning_rate": 1.005299713058161e-06, - "loss": 3.6727, + "loss": 1.8426, "step": 7740 }, { "epoch": 0.9599058065315734, - "grad_norm": 0.2896106625011773, + "grad_norm": 0.13839945104830967, "learning_rate": 9.74933460057248e-07, - "loss": 3.7094, + "loss": 1.8203, "step": 7745 }, { "epoch": 0.960525500402801, - "grad_norm": 0.27674410647842534, + "grad_norm": 0.13677112829600235, "learning_rate": 9.450306452834179e-07, - "loss": 3.6437, + "loss": 1.8578, "step": 7750 }, { "epoch": 0.9611451942740287, - "grad_norm": 0.2911730586333492, + "grad_norm": 0.13877686312026696, "learning_rate": 9.15591408682126e-07, - "loss": 3.5891, + "loss": 1.8062, "step": 7755 }, { "epoch": 0.9617648881452563, - "grad_norm": 0.2793676329228869, + "grad_norm": 0.14210169193275773, "learning_rate": 8.866158880292741e-07, - "loss": 3.6141, + "loss": 1.7824, "step": 7760 }, { "epoch": 0.9623845820164838, - "grad_norm": 0.274996874794654, + "grad_norm": 0.1379664945215972, "learning_rate": 8.581042189305555e-07, - "loss": 3.7687, + "loss": 1.8281, "step": 7765 }, { "epoch": 0.9630042758877114, - "grad_norm": 0.2807429875667943, + "grad_norm": 0.13641761386203363, "learning_rate": 8.300565348208556e-07, - "loss": 3.6562, + "loss": 1.8309, "step": 7770 }, { "epoch": 0.963623969758939, - "grad_norm": 0.2649603137716782, + "grad_norm": 0.13365261844070073, "learning_rate": 8.024729669635967e-07, - "loss": 3.643, + "loss": 1.868, "step": 7775 }, { "epoch": 0.9642436636301667, - "grad_norm": 0.2810592469822913, + "grad_norm": 0.1378426462575117, "learning_rate": 7.753536444501164e-07, - "loss": 3.625, + "loss": 1.7684, "step": 7780 }, { "epoch": 0.9648633575013943, - "grad_norm": 0.27763596036414384, + "grad_norm": 0.1379659044326487, "learning_rate": 7.486986941991125e-07, - "loss": 3.7219, + "loss": 1.7703, "step": 7785 }, { "epoch": 0.9654830513726219, - "grad_norm": 0.2735361381171989, + "grad_norm": 0.13320897754847472, "learning_rate": 7.225082409559881e-07, - "loss": 3.7336, + "loss": 1.8211, "step": 7790 }, { "epoch": 0.9661027452438495, - "grad_norm": 0.2860714882582262, + "grad_norm": 0.13014133781314957, "learning_rate": 6.967824072923068e-07, - "loss": 3.6891, + "loss": 1.7672, "step": 7795 }, { "epoch": 0.9667224391150772, - "grad_norm": 0.25802780361746747, + "grad_norm": 0.13168145205742476, "learning_rate": 6.715213136052056e-07, - "loss": 3.7352, + "loss": 1.8258, "step": 7800 }, { "epoch": 0.9673421329863048, - "grad_norm": 0.2721644334655904, + "grad_norm": 0.1469376174744886, "learning_rate": 6.467250781168499e-07, - "loss": 3.7266, + "loss": 1.8527, "step": 7805 }, { "epoch": 0.9679618268575324, - "grad_norm": 0.2616403656619833, + "grad_norm": 0.13977784865229265, "learning_rate": 6.223938168738341e-07, - "loss": 3.6125, + "loss": 1.8059, "step": 7810 }, { "epoch": 0.96858152072876, - "grad_norm": 0.2837728188131973, + "grad_norm": 0.13532853632359126, "learning_rate": 5.985276437467046e-07, - "loss": 3.7289, + "loss": 1.8426, "step": 7815 }, { "epoch": 0.9692012145999876, - "grad_norm": 0.2674268745522877, + "grad_norm": 0.14519841690310878, "learning_rate": 5.751266704293601e-07, - "loss": 3.6313, + "loss": 1.8184, "step": 7820 }, { "epoch": 0.9698209084712153, - "grad_norm": 0.2746401213632551, + "grad_norm": 0.13321098148161775, "learning_rate": 5.521910064385627e-07, - "loss": 3.6617, + "loss": 1.8414, "step": 7825 }, { "epoch": 0.9704406023424428, - "grad_norm": 0.26258980431856366, + "grad_norm": 0.1353961588033009, "learning_rate": 5.297207591134612e-07, - "loss": 3.693, + "loss": 1.7965, "step": 7830 }, { "epoch": 0.9710602962136704, - "grad_norm": 0.2736638794617298, + "grad_norm": 0.13453437613463365, "learning_rate": 5.077160336149911e-07, - "loss": 3.6656, + "loss": 1.8438, "step": 7835 }, { "epoch": 0.971679990084898, - "grad_norm": 0.2693661535971264, + "grad_norm": 0.14592528106657068, "learning_rate": 4.861769329254862e-07, - "loss": 3.6242, + "loss": 1.8098, "step": 7840 }, { "epoch": 0.9722996839561256, - "grad_norm": 0.2645066499365123, + "grad_norm": 0.14121740990989298, "learning_rate": 4.651035578481344e-07, - "loss": 3.682, + "loss": 1.8746, "step": 7845 }, { "epoch": 0.9729193778273533, - "grad_norm": 0.27512468136313495, + "grad_norm": 0.13805818067090103, "learning_rate": 4.4449600700652296e-07, - "loss": 3.643, + "loss": 1.8492, "step": 7850 }, { "epoch": 0.9735390716985809, - "grad_norm": 0.27388444761756614, + "grad_norm": 0.14604169034139383, "learning_rate": 4.243543768441827e-07, - "loss": 3.7188, + "loss": 1.8508, "step": 7855 }, { "epoch": 0.9741587655698085, - "grad_norm": 0.28910817736216143, + "grad_norm": 0.13383491518372784, "learning_rate": 4.046787616241221e-07, - "loss": 3.6852, + "loss": 1.8828, "step": 7860 }, { "epoch": 0.9747784594410361, - "grad_norm": 0.2926126161949655, + "grad_norm": 0.13449740986423936, "learning_rate": 3.8546925342842764e-07, - "loss": 3.543, + "loss": 1.7391, "step": 7865 }, { "epoch": 0.9753981533122638, - "grad_norm": 0.26509568145736156, + "grad_norm": 0.13967533122339457, "learning_rate": 3.6672594215774183e-07, - "loss": 3.5805, + "loss": 1.8344, "step": 7870 }, { "epoch": 0.9760178471834914, - "grad_norm": 0.27175913423112874, + "grad_norm": 0.14033499838206215, "learning_rate": 3.484489155309523e-07, - "loss": 3.6203, + "loss": 1.8375, "step": 7875 }, { "epoch": 0.976637541054719, - "grad_norm": 0.27950277725425826, + "grad_norm": 0.14348107282049077, "learning_rate": 3.3063825908471456e-07, - "loss": 3.6234, + "loss": 1.8531, "step": 7880 }, { "epoch": 0.9772572349259466, - "grad_norm": 0.2849384937701663, + "grad_norm": 0.13855132244869428, "learning_rate": 3.132940561730524e-07, - "loss": 3.6492, + "loss": 1.7863, "step": 7885 }, { "epoch": 0.9778769287971742, - "grad_norm": 0.27675344143637354, + "grad_norm": 0.13157405016785448, "learning_rate": 2.9641638796701344e-07, - "loss": 3.7031, + "loss": 1.8426, "step": 7890 }, { "epoch": 0.9784966226684018, - "grad_norm": 0.26836533772485915, + "grad_norm": 0.1321435703732912, "learning_rate": 2.800053334542363e-07, - "loss": 3.6266, + "loss": 1.8637, "step": 7895 }, { "epoch": 0.9791163165396294, - "grad_norm": 0.2784555897429475, + "grad_norm": 0.13158992278105788, "learning_rate": 2.6406096943859537e-07, - "loss": 3.7789, + "loss": 1.8746, "step": 7900 }, { "epoch": 0.979736010410857, - "grad_norm": 0.2739776407729332, + "grad_norm": 0.13933541006297465, "learning_rate": 2.485833705398677e-07, - "loss": 3.7125, + "loss": 1.8199, "step": 7905 }, { "epoch": 0.9803557042820846, - "grad_norm": 0.2801988570193097, + "grad_norm": 0.13437527753897568, "learning_rate": 2.3357260919336654e-07, - "loss": 3.6797, + "loss": 1.8555, "step": 7910 }, { "epoch": 0.9809753981533123, - "grad_norm": 0.2817935004020373, + "grad_norm": 0.14403000978113806, "learning_rate": 2.1902875564958624e-07, - "loss": 3.6828, + "loss": 1.8395, "step": 7915 }, { "epoch": 0.9815950920245399, - "grad_norm": 0.26306490781018277, + "grad_norm": 0.13600747653132192, "learning_rate": 2.0495187797390236e-07, - "loss": 3.7039, + "loss": 1.868, "step": 7920 }, { "epoch": 0.9822147858957675, - "grad_norm": 0.2666398441401795, + "grad_norm": 0.1471070831087313, "learning_rate": 1.913420420462164e-07, - "loss": 3.668, + "loss": 1.818, "step": 7925 }, { "epoch": 0.9828344797669951, - "grad_norm": 0.2822222017860193, + "grad_norm": 0.1335371161396645, "learning_rate": 1.7819931156071168e-07, - "loss": 3.6266, + "loss": 1.8746, "step": 7930 }, { "epoch": 0.9834541736382227, - "grad_norm": 0.27776801734518003, + "grad_norm": 0.1368536776343064, "learning_rate": 1.6552374802546454e-07, - "loss": 3.6852, + "loss": 1.8391, "step": 7935 }, { "epoch": 0.9840738675094504, - "grad_norm": 0.26060817961696686, + "grad_norm": 0.1330749589293093, "learning_rate": 1.5331541076225585e-07, - "loss": 3.7227, + "loss": 1.875, "step": 7940 }, { "epoch": 0.984693561380678, - "grad_norm": 0.2675642716698818, + "grad_norm": 0.13468183805269288, "learning_rate": 1.4157435690619337e-07, - "loss": 3.6625, + "loss": 1.7766, "step": 7945 }, { "epoch": 0.9853132552519056, - "grad_norm": 0.27291219918221404, + "grad_norm": 0.13485415942676057, "learning_rate": 1.30300641405523e-07, - "loss": 3.6742, + "loss": 1.857, "step": 7950 }, { "epoch": 0.9859329491231331, - "grad_norm": 0.26586541429232524, + "grad_norm": 0.13908303443528403, "learning_rate": 1.194943170213403e-07, - "loss": 3.5688, + "loss": 1.8359, "step": 7955 }, { "epoch": 0.9865526429943607, - "grad_norm": 0.268823372681042, + "grad_norm": 0.13274714646219282, "learning_rate": 1.0915543432733488e-07, - "loss": 3.7336, + "loss": 1.8434, "step": 7960 }, { "epoch": 0.9871723368655884, - "grad_norm": 0.2681016890128257, + "grad_norm": 0.13810326013237345, "learning_rate": 9.928404170959082e-08, - "loss": 3.5867, + "loss": 1.8164, "step": 7965 }, { "epoch": 0.987792030736816, - "grad_norm": 0.26956499858488664, + "grad_norm": 0.13614189789038067, "learning_rate": 8.988018536630893e-08, - "loss": 3.725, + "loss": 1.8387, "step": 7970 }, { "epoch": 0.9884117246080436, - "grad_norm": 0.28229057560627346, + "grad_norm": 0.13375315569066618, "learning_rate": 8.094390930762919e-08, - "loss": 3.7289, + "loss": 1.8, "step": 7975 }, { "epoch": 0.9890314184792712, - "grad_norm": 0.27614610199741324, + "grad_norm": 0.13158730991646866, "learning_rate": 7.247525535538647e-08, - "loss": 3.6398, + "loss": 1.7934, "step": 7980 }, { "epoch": 0.9896511123504989, - "grad_norm": 0.2803128283949079, + "grad_norm": 0.14477857824803034, "learning_rate": 6.447426314297734e-08, - "loss": 3.7266, + "loss": 1.8332, "step": 7985 }, { "epoch": 0.9902708062217265, - "grad_norm": 0.2618509943839932, + "grad_norm": 0.1342267918315416, "learning_rate": 5.6940970115115785e-08, - "loss": 3.6914, + "loss": 1.8547, "step": 7990 }, { "epoch": 0.9908905000929541, - "grad_norm": 0.2726310143354448, + "grad_norm": 0.13785984492136893, "learning_rate": 4.9875411527677826e-08, - "loss": 3.8883, + "loss": 1.7652, "step": 7995 }, { "epoch": 0.9915101939641817, - "grad_norm": 0.2766308585212035, + "grad_norm": 0.13807203287629677, "learning_rate": 4.327762044755712e-08, - "loss": 3.6102, + "loss": 1.8668, "step": 8000 }, { "epoch": 0.9921298878354093, - "grad_norm": 0.2662764431301861, + "grad_norm": 0.13646195929256758, "learning_rate": 3.714762775245406e-08, - "loss": 3.6828, + "loss": 1.7598, "step": 8005 }, { "epoch": 0.992749581706637, - "grad_norm": 0.2698434445721707, + "grad_norm": 0.13971148375976059, "learning_rate": 3.148546213080916e-08, - "loss": 3.6898, + "loss": 1.8727, "step": 8010 }, { "epoch": 0.9933692755778646, - "grad_norm": 0.29310884330046, + "grad_norm": 0.13334054828935307, "learning_rate": 2.6291150081603212e-08, - "loss": 3.5945, + "loss": 1.8863, "step": 8015 }, { "epoch": 0.9939889694490921, - "grad_norm": 0.2770822794995166, + "grad_norm": 0.14352383701066074, "learning_rate": 2.156471591426845e-08, - "loss": 3.718, + "loss": 1.8934, "step": 8020 }, { "epoch": 0.9946086633203197, - "grad_norm": 0.27590920995954854, + "grad_norm": 0.13909857608391912, "learning_rate": 1.7306181748566463e-08, - "loss": 3.7328, + "loss": 1.8133, "step": 8025 }, { "epoch": 0.9952283571915473, - "grad_norm": 0.2739410257909428, + "grad_norm": 0.13480520184307876, "learning_rate": 1.351556751445493e-08, - "loss": 3.7203, + "loss": 1.8406, "step": 8030 }, { "epoch": 0.995848051062775, - "grad_norm": 0.26566933154278977, + "grad_norm": 0.14279753608828183, "learning_rate": 1.0192890952054334e-08, - "loss": 3.5352, + "loss": 1.8004, "step": 8035 }, { "epoch": 0.9964677449340026, - "grad_norm": 0.2684622940211439, + "grad_norm": 0.14041065956395132, "learning_rate": 7.338167611536939e-09, - "loss": 3.7133, + "loss": 1.7984, "step": 8040 }, { "epoch": 0.9970874388052302, - "grad_norm": 0.27691748975672703, + "grad_norm": 0.1353003051841239, "learning_rate": 4.9514108530157585e-09, - "loss": 3.575, + "loss": 1.7949, "step": 8045 }, { "epoch": 0.9977071326764578, - "grad_norm": 0.27023267310378274, + "grad_norm": 0.14086128743096502, "learning_rate": 3.0326318465334625e-09, - "loss": 3.65, + "loss": 1.8348, "step": 8050 }, { "epoch": 0.9983268265476855, - "grad_norm": 0.27647933459013596, + "grad_norm": 0.1353261527492863, "learning_rate": 1.5818395720068602e-09, - "loss": 3.6539, + "loss": 1.757, "step": 8055 }, { "epoch": 0.9989465204189131, - "grad_norm": 0.273299155688557, + "grad_norm": 0.13528022729742975, "learning_rate": 5.99040819149188e-10, - "loss": 3.5516, + "loss": 1.8582, "step": 8060 }, { "epoch": 0.9995662142901407, - "grad_norm": 0.2738691124969836, + "grad_norm": 0.13580379005193177, "learning_rate": 8.42401874701082e-11, - "loss": 3.5602, + "loss": 1.8289, "step": 8065 }, + { + "epoch": 0.9999380306128772, + "eval_loss": 1.7869144678115845, + "eval_runtime": 146.7975, + "eval_samples_per_second": 97.27, + "eval_steps_per_second": 6.083, + "step": 8068 + }, { "epoch": 0.9999380306128772, "step": 8068, - "total_flos": 4248659495485440.0, - "train_loss": 0.0, - "train_runtime": 0.0126, - "train_samples_per_second": 10236188.723, - "train_steps_per_second": 639727.105 + "total_flos": 4248917998829568.0, + "train_loss": 1.8640377591255577, + "train_runtime": 7899.8856, + "train_samples_per_second": 16.341, + "train_steps_per_second": 1.021 } ], "logging_steps": 5, @@ -11333,7 +11341,7 @@ "attributes": {} } }, - "total_flos": 4248659495485440.0, + "total_flos": 4248917998829568.0, "train_batch_size": 4, "trial_name": null, "trial_params": null