diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,11348 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999380306128772, + "eval_steps": 500, + "global_step": 8068, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001239387742455227, + "grad_norm": 0.3613625061479819, + "learning_rate": 2.4783147459727386e-07, + "loss": 2.5352, + "step": 1 + }, + { + "epoch": 0.0006196938712276135, + "grad_norm": 0.36095038255815987, + "learning_rate": 1.2391573729863693e-06, + "loss": 2.3916, + "step": 5 + }, + { + "epoch": 0.001239387742455227, + "grad_norm": 0.3254584998211072, + "learning_rate": 2.4783147459727386e-06, + "loss": 2.4891, + "step": 10 + }, + { + "epoch": 0.0018590816136828407, + "grad_norm": 0.38126326552507134, + "learning_rate": 3.717472118959108e-06, + "loss": 2.4898, + "step": 15 + }, + { + "epoch": 0.002478775484910454, + "grad_norm": 0.441689451221483, + "learning_rate": 4.956629491945477e-06, + "loss": 2.4367, + "step": 20 + }, + { + "epoch": 0.003098469356138068, + "grad_norm": 0.2973969565575579, + "learning_rate": 6.195786864931847e-06, + "loss": 2.4406, + "step": 25 + }, + { + "epoch": 0.0037181632273656814, + "grad_norm": 0.40935208612020174, + "learning_rate": 7.434944237918216e-06, + "loss": 2.382, + "step": 30 + }, + { + "epoch": 0.004337857098593295, + "grad_norm": 0.4181134678122744, + "learning_rate": 8.674101610904585e-06, + "loss": 2.4586, + "step": 35 + }, + { + "epoch": 0.004957550969820908, + "grad_norm": 0.3374239900963284, + "learning_rate": 9.913258983890955e-06, + "loss": 2.3852, + "step": 40 + }, + { + "epoch": 0.005577244841048522, + "grad_norm": 0.4319325262792448, + "learning_rate": 1.1152416356877324e-05, + "loss": 2.4039, + "step": 45 + }, + { + "epoch": 0.006196938712276136, + "grad_norm": 0.32665555964629, + "learning_rate": 1.2391573729863694e-05, + "loss": 2.4352, + "step": 50 + }, + { + "epoch": 0.006816632583503749, + "grad_norm": 0.5172119594644248, + "learning_rate": 1.3630731102850064e-05, + "loss": 2.3609, + "step": 55 + }, + { + "epoch": 0.007436326454731363, + "grad_norm": 0.4087931864681625, + "learning_rate": 1.4869888475836432e-05, + "loss": 2.4031, + "step": 60 + }, + { + "epoch": 0.008056020325958976, + "grad_norm": 0.46592670985648305, + "learning_rate": 1.61090458488228e-05, + "loss": 2.3156, + "step": 65 + }, + { + "epoch": 0.00867571419718659, + "grad_norm": 0.49541312013878036, + "learning_rate": 1.734820322180917e-05, + "loss": 2.3438, + "step": 70 + }, + { + "epoch": 0.009295408068414203, + "grad_norm": 0.38324458134659845, + "learning_rate": 1.858736059479554e-05, + "loss": 2.3406, + "step": 75 + }, + { + "epoch": 0.009915101939641817, + "grad_norm": 0.43338092650638166, + "learning_rate": 1.982651796778191e-05, + "loss": 2.3375, + "step": 80 + }, + { + "epoch": 0.01053479581086943, + "grad_norm": 0.40416289269468914, + "learning_rate": 2.106567534076828e-05, + "loss": 2.3328, + "step": 85 + }, + { + "epoch": 0.011154489682097044, + "grad_norm": 0.4473494205404002, + "learning_rate": 2.230483271375465e-05, + "loss": 2.2656, + "step": 90 + }, + { + "epoch": 0.011774183553324657, + "grad_norm": 0.3813410762384386, + "learning_rate": 2.3543990086741015e-05, + "loss": 2.2234, + "step": 95 + }, + { + "epoch": 0.012393877424552271, + "grad_norm": 0.3290497483814934, + "learning_rate": 2.4783147459727388e-05, + "loss": 2.1754, + "step": 100 + }, + { + "epoch": 0.013013571295779885, + "grad_norm": 0.3774917329867058, + "learning_rate": 2.6022304832713758e-05, + "loss": 2.0992, + "step": 105 + }, + { + "epoch": 0.013633265167007498, + "grad_norm": 0.24634234770137786, + "learning_rate": 2.7261462205700128e-05, + "loss": 2.1066, + "step": 110 + }, + { + "epoch": 0.014252959038235112, + "grad_norm": 0.2541916772452527, + "learning_rate": 2.8500619578686494e-05, + "loss": 2.1039, + "step": 115 + }, + { + "epoch": 0.014872652909462726, + "grad_norm": 0.21483622246957815, + "learning_rate": 2.9739776951672864e-05, + "loss": 2.1547, + "step": 120 + }, + { + "epoch": 0.01549234678069034, + "grad_norm": 0.1832960445686121, + "learning_rate": 3.0978934324659233e-05, + "loss": 2.1211, + "step": 125 + }, + { + "epoch": 0.01611204065191795, + "grad_norm": 0.2024480547670622, + "learning_rate": 3.22180916976456e-05, + "loss": 2.1246, + "step": 130 + }, + { + "epoch": 0.016731734523145567, + "grad_norm": 0.1587252934099839, + "learning_rate": 3.345724907063197e-05, + "loss": 2.1402, + "step": 135 + }, + { + "epoch": 0.01735142839437318, + "grad_norm": 0.1573425276705792, + "learning_rate": 3.469640644361834e-05, + "loss": 2.1453, + "step": 140 + }, + { + "epoch": 0.017971122265600794, + "grad_norm": 0.1649139918619751, + "learning_rate": 3.593556381660471e-05, + "loss": 2.091, + "step": 145 + }, + { + "epoch": 0.018590816136828406, + "grad_norm": 0.16906893865479186, + "learning_rate": 3.717472118959108e-05, + "loss": 2.0336, + "step": 150 + }, + { + "epoch": 0.01921051000805602, + "grad_norm": 0.16828989999123184, + "learning_rate": 3.841387856257745e-05, + "loss": 2.1066, + "step": 155 + }, + { + "epoch": 0.019830203879283633, + "grad_norm": 0.15086389397017358, + "learning_rate": 3.965303593556382e-05, + "loss": 2.0637, + "step": 160 + }, + { + "epoch": 0.02044989775051125, + "grad_norm": 0.13741000516222265, + "learning_rate": 4.0892193308550185e-05, + "loss": 2.0602, + "step": 165 + }, + { + "epoch": 0.02106959162173886, + "grad_norm": 0.13109937575764552, + "learning_rate": 4.213135068153656e-05, + "loss": 2.0883, + "step": 170 + }, + { + "epoch": 0.021689285492966476, + "grad_norm": 0.13563532496297764, + "learning_rate": 4.337050805452293e-05, + "loss": 2.0859, + "step": 175 + }, + { + "epoch": 0.022308979364194088, + "grad_norm": 0.13613940691719567, + "learning_rate": 4.46096654275093e-05, + "loss": 2.05, + "step": 180 + }, + { + "epoch": 0.022928673235421703, + "grad_norm": 0.12247823742232716, + "learning_rate": 4.5848822800495664e-05, + "loss": 2.041, + "step": 185 + }, + { + "epoch": 0.023548367106649315, + "grad_norm": 0.11943084710129132, + "learning_rate": 4.708798017348203e-05, + "loss": 1.9684, + "step": 190 + }, + { + "epoch": 0.02416806097787693, + "grad_norm": 0.10996794203028536, + "learning_rate": 4.83271375464684e-05, + "loss": 2.0195, + "step": 195 + }, + { + "epoch": 0.024787754849104542, + "grad_norm": 0.11919593904845416, + "learning_rate": 4.9566294919454776e-05, + "loss": 2.0355, + "step": 200 + }, + { + "epoch": 0.025407448720332158, + "grad_norm": 0.13573935802440174, + "learning_rate": 5.080545229244115e-05, + "loss": 2.0465, + "step": 205 + }, + { + "epoch": 0.02602714259155977, + "grad_norm": 0.11810880112190057, + "learning_rate": 5.2044609665427516e-05, + "loss": 2.0449, + "step": 210 + }, + { + "epoch": 0.02664683646278738, + "grad_norm": 0.10714263949710778, + "learning_rate": 5.328376703841388e-05, + "loss": 2.0453, + "step": 215 + }, + { + "epoch": 0.027266530334014997, + "grad_norm": 0.10808893820532955, + "learning_rate": 5.4522924411400255e-05, + "loss": 2.0484, + "step": 220 + }, + { + "epoch": 0.02788622420524261, + "grad_norm": 0.1079631127104769, + "learning_rate": 5.576208178438662e-05, + "loss": 2.0219, + "step": 225 + }, + { + "epoch": 0.028505918076470224, + "grad_norm": 0.115215707179092, + "learning_rate": 5.700123915737299e-05, + "loss": 2.0219, + "step": 230 + }, + { + "epoch": 0.029125611947697836, + "grad_norm": 0.10429743509556648, + "learning_rate": 5.8240396530359354e-05, + "loss": 2.0191, + "step": 235 + }, + { + "epoch": 0.02974530581892545, + "grad_norm": 0.10512419844589924, + "learning_rate": 5.947955390334573e-05, + "loss": 1.9578, + "step": 240 + }, + { + "epoch": 0.030364999690153063, + "grad_norm": 0.10841051120377791, + "learning_rate": 6.0718711276332094e-05, + "loss": 2.023, + "step": 245 + }, + { + "epoch": 0.03098469356138068, + "grad_norm": 0.1123866377946053, + "learning_rate": 6.195786864931847e-05, + "loss": 2.0043, + "step": 250 + }, + { + "epoch": 0.031604387432608294, + "grad_norm": 0.1071143230575679, + "learning_rate": 6.319702602230483e-05, + "loss": 2.0059, + "step": 255 + }, + { + "epoch": 0.0322240813038359, + "grad_norm": 0.11205273394431785, + "learning_rate": 6.44361833952912e-05, + "loss": 1.9516, + "step": 260 + }, + { + "epoch": 0.03284377517506352, + "grad_norm": 0.11124243248908515, + "learning_rate": 6.567534076827757e-05, + "loss": 2.0324, + "step": 265 + }, + { + "epoch": 0.03346346904629113, + "grad_norm": 0.11056302895417636, + "learning_rate": 6.691449814126395e-05, + "loss": 2.0059, + "step": 270 + }, + { + "epoch": 0.03408316291751875, + "grad_norm": 0.1150685621638358, + "learning_rate": 6.815365551425031e-05, + "loss": 1.8949, + "step": 275 + }, + { + "epoch": 0.03470285678874636, + "grad_norm": 0.11053590058509483, + "learning_rate": 6.939281288723668e-05, + "loss": 1.9539, + "step": 280 + }, + { + "epoch": 0.03532255065997397, + "grad_norm": 0.11097452419745213, + "learning_rate": 7.063197026022306e-05, + "loss": 2.0055, + "step": 285 + }, + { + "epoch": 0.03594224453120159, + "grad_norm": 0.10981036645663586, + "learning_rate": 7.187112763320942e-05, + "loss": 1.9684, + "step": 290 + }, + { + "epoch": 0.0365619384024292, + "grad_norm": 0.10851974717472565, + "learning_rate": 7.311028500619579e-05, + "loss": 1.9937, + "step": 295 + }, + { + "epoch": 0.03718163227365681, + "grad_norm": 0.1097063039665451, + "learning_rate": 7.434944237918216e-05, + "loss": 1.9258, + "step": 300 + }, + { + "epoch": 0.03780132614488443, + "grad_norm": 0.11108822223408563, + "learning_rate": 7.558859975216854e-05, + "loss": 1.9703, + "step": 305 + }, + { + "epoch": 0.03842102001611204, + "grad_norm": 0.10760540355469742, + "learning_rate": 7.68277571251549e-05, + "loss": 1.9465, + "step": 310 + }, + { + "epoch": 0.03904071388733965, + "grad_norm": 0.11516090750067655, + "learning_rate": 7.806691449814127e-05, + "loss": 1.952, + "step": 315 + }, + { + "epoch": 0.039660407758567266, + "grad_norm": 0.1043408406511001, + "learning_rate": 7.930607187112764e-05, + "loss": 1.9531, + "step": 320 + }, + { + "epoch": 0.04028010162979488, + "grad_norm": 0.1039519846752159, + "learning_rate": 8.0545229244114e-05, + "loss": 1.9426, + "step": 325 + }, + { + "epoch": 0.0408997955010225, + "grad_norm": 0.10746843855868768, + "learning_rate": 8.178438661710037e-05, + "loss": 1.9855, + "step": 330 + }, + { + "epoch": 0.041519489372250105, + "grad_norm": 0.10777591543631691, + "learning_rate": 8.302354399008675e-05, + "loss": 1.9445, + "step": 335 + }, + { + "epoch": 0.04213918324347772, + "grad_norm": 0.10806752734339195, + "learning_rate": 8.426270136307312e-05, + "loss": 1.8906, + "step": 340 + }, + { + "epoch": 0.042758877114705336, + "grad_norm": 0.11502306811099913, + "learning_rate": 8.550185873605948e-05, + "loss": 1.9727, + "step": 345 + }, + { + "epoch": 0.04337857098593295, + "grad_norm": 0.11448974921102456, + "learning_rate": 8.674101610904586e-05, + "loss": 1.8648, + "step": 350 + }, + { + "epoch": 0.04399826485716056, + "grad_norm": 0.1078095422643582, + "learning_rate": 8.798017348203223e-05, + "loss": 1.9875, + "step": 355 + }, + { + "epoch": 0.044617958728388175, + "grad_norm": 0.11060380654587663, + "learning_rate": 8.92193308550186e-05, + "loss": 2.0211, + "step": 360 + }, + { + "epoch": 0.04523765259961579, + "grad_norm": 0.11613817321861132, + "learning_rate": 9.045848822800496e-05, + "loss": 1.9891, + "step": 365 + }, + { + "epoch": 0.045857346470843406, + "grad_norm": 0.11368583100057289, + "learning_rate": 9.169764560099133e-05, + "loss": 1.9773, + "step": 370 + }, + { + "epoch": 0.046477040342071015, + "grad_norm": 0.10894141405623521, + "learning_rate": 9.29368029739777e-05, + "loss": 1.9566, + "step": 375 + }, + { + "epoch": 0.04709673421329863, + "grad_norm": 0.11421341665297284, + "learning_rate": 9.417596034696406e-05, + "loss": 1.9379, + "step": 380 + }, + { + "epoch": 0.047716428084526245, + "grad_norm": 0.11239189457188502, + "learning_rate": 9.541511771995044e-05, + "loss": 1.9359, + "step": 385 + }, + { + "epoch": 0.04833612195575386, + "grad_norm": 0.10853175700299506, + "learning_rate": 9.66542750929368e-05, + "loss": 1.9539, + "step": 390 + }, + { + "epoch": 0.04895581582698147, + "grad_norm": 0.11310381389332551, + "learning_rate": 9.789343246592317e-05, + "loss": 1.9348, + "step": 395 + }, + { + "epoch": 0.049575509698209085, + "grad_norm": 0.11560816087723923, + "learning_rate": 9.913258983890955e-05, + "loss": 1.9453, + "step": 400 + }, + { + "epoch": 0.0501952035694367, + "grad_norm": 0.12016209733805872, + "learning_rate": 0.0001003717472118959, + "loss": 1.9293, + "step": 405 + }, + { + "epoch": 0.050814897440664315, + "grad_norm": 0.11841169962945568, + "learning_rate": 0.0001016109045848823, + "loss": 1.9348, + "step": 410 + }, + { + "epoch": 0.051434591311891924, + "grad_norm": 0.11168273067829344, + "learning_rate": 0.00010285006195786867, + "loss": 1.932, + "step": 415 + }, + { + "epoch": 0.05205428518311954, + "grad_norm": 0.10983446111870222, + "learning_rate": 0.00010408921933085503, + "loss": 1.9398, + "step": 420 + }, + { + "epoch": 0.052673979054347154, + "grad_norm": 0.11246006030379667, + "learning_rate": 0.0001053283767038414, + "loss": 1.8598, + "step": 425 + }, + { + "epoch": 0.05329367292557476, + "grad_norm": 0.11621584827465425, + "learning_rate": 0.00010656753407682776, + "loss": 1.9211, + "step": 430 + }, + { + "epoch": 0.05391336679680238, + "grad_norm": 0.12034248030098742, + "learning_rate": 0.00010780669144981412, + "loss": 1.966, + "step": 435 + }, + { + "epoch": 0.054533060668029994, + "grad_norm": 0.1160099754241247, + "learning_rate": 0.00010904584882280051, + "loss": 1.9609, + "step": 440 + }, + { + "epoch": 0.05515275453925761, + "grad_norm": 0.11125026431874137, + "learning_rate": 0.00011028500619578688, + "loss": 1.9117, + "step": 445 + }, + { + "epoch": 0.05577244841048522, + "grad_norm": 0.11612202738414985, + "learning_rate": 0.00011152416356877324, + "loss": 1.9352, + "step": 450 + }, + { + "epoch": 0.05639214228171283, + "grad_norm": 0.11452837316949507, + "learning_rate": 0.00011276332094175961, + "loss": 1.9422, + "step": 455 + }, + { + "epoch": 0.05701183615294045, + "grad_norm": 0.12866517250152734, + "learning_rate": 0.00011400247831474598, + "loss": 1.8824, + "step": 460 + }, + { + "epoch": 0.057631530024168064, + "grad_norm": 0.11047960782931952, + "learning_rate": 0.00011524163568773234, + "loss": 1.9336, + "step": 465 + }, + { + "epoch": 0.05825122389539567, + "grad_norm": 0.12040909536957894, + "learning_rate": 0.00011648079306071871, + "loss": 1.9598, + "step": 470 + }, + { + "epoch": 0.05887091776662329, + "grad_norm": 0.11354827146813799, + "learning_rate": 0.00011771995043370509, + "loss": 1.9406, + "step": 475 + }, + { + "epoch": 0.0594906116378509, + "grad_norm": 0.11018041152409878, + "learning_rate": 0.00011895910780669145, + "loss": 1.909, + "step": 480 + }, + { + "epoch": 0.06011030550907852, + "grad_norm": 0.11350276690749768, + "learning_rate": 0.00012019826517967782, + "loss": 1.9453, + "step": 485 + }, + { + "epoch": 0.06072999938030613, + "grad_norm": 0.12128379272584458, + "learning_rate": 0.00012143742255266419, + "loss": 1.9266, + "step": 490 + }, + { + "epoch": 0.06134969325153374, + "grad_norm": 0.11759828192541434, + "learning_rate": 0.00012267657992565055, + "loss": 1.8941, + "step": 495 + }, + { + "epoch": 0.06196938712276136, + "grad_norm": 0.12384786337021533, + "learning_rate": 0.00012391573729863693, + "loss": 1.8883, + "step": 500 + }, + { + "epoch": 0.06258908099398897, + "grad_norm": 0.11752293374203299, + "learning_rate": 0.00012515489467162331, + "loss": 1.8762, + "step": 505 + }, + { + "epoch": 0.06320877486521659, + "grad_norm": 0.11824701347044138, + "learning_rate": 0.00012639405204460967, + "loss": 1.8578, + "step": 510 + }, + { + "epoch": 0.0638284687364442, + "grad_norm": 0.11789835565142269, + "learning_rate": 0.00012763320941759605, + "loss": 1.9703, + "step": 515 + }, + { + "epoch": 0.0644481626076718, + "grad_norm": 0.11561056451604006, + "learning_rate": 0.0001288723667905824, + "loss": 1.9176, + "step": 520 + }, + { + "epoch": 0.06506785647889943, + "grad_norm": 0.12047906524830992, + "learning_rate": 0.00013011152416356878, + "loss": 1.9141, + "step": 525 + }, + { + "epoch": 0.06568755035012704, + "grad_norm": 0.11738452165789834, + "learning_rate": 0.00013135068153655513, + "loss": 1.9887, + "step": 530 + }, + { + "epoch": 0.06630724422135464, + "grad_norm": 0.12704500717051184, + "learning_rate": 0.0001325898389095415, + "loss": 1.9461, + "step": 535 + }, + { + "epoch": 0.06692693809258227, + "grad_norm": 0.12232806111597486, + "learning_rate": 0.0001338289962825279, + "loss": 1.9965, + "step": 540 + }, + { + "epoch": 0.06754663196380987, + "grad_norm": 0.11511330645040316, + "learning_rate": 0.00013506815365551427, + "loss": 1.9387, + "step": 545 + }, + { + "epoch": 0.0681663258350375, + "grad_norm": 0.12623271700622418, + "learning_rate": 0.00013630731102850062, + "loss": 1.8922, + "step": 550 + }, + { + "epoch": 0.0687860197062651, + "grad_norm": 0.1233130933433468, + "learning_rate": 0.000137546468401487, + "loss": 1.9695, + "step": 555 + }, + { + "epoch": 0.06940571357749271, + "grad_norm": 0.12482945990926926, + "learning_rate": 0.00013878562577447336, + "loss": 1.973, + "step": 560 + }, + { + "epoch": 0.07002540744872034, + "grad_norm": 0.12412994649354765, + "learning_rate": 0.00014002478314745974, + "loss": 1.9359, + "step": 565 + }, + { + "epoch": 0.07064510131994794, + "grad_norm": 0.12960183005831585, + "learning_rate": 0.00014126394052044612, + "loss": 1.8832, + "step": 570 + }, + { + "epoch": 0.07126479519117555, + "grad_norm": 0.13101220872323543, + "learning_rate": 0.00014250309789343247, + "loss": 1.9379, + "step": 575 + }, + { + "epoch": 0.07188448906240318, + "grad_norm": 0.11679270737421997, + "learning_rate": 0.00014374225526641885, + "loss": 1.9242, + "step": 580 + }, + { + "epoch": 0.07250418293363078, + "grad_norm": 0.11469156246728562, + "learning_rate": 0.0001449814126394052, + "loss": 1.9559, + "step": 585 + }, + { + "epoch": 0.0731238768048584, + "grad_norm": 0.1190243467694481, + "learning_rate": 0.00014622057001239158, + "loss": 1.8816, + "step": 590 + }, + { + "epoch": 0.07374357067608601, + "grad_norm": 0.12510722707465416, + "learning_rate": 0.00014745972738537794, + "loss": 1.9332, + "step": 595 + }, + { + "epoch": 0.07436326454731362, + "grad_norm": 0.14223237579204956, + "learning_rate": 0.00014869888475836432, + "loss": 1.9219, + "step": 600 + }, + { + "epoch": 0.07498295841854125, + "grad_norm": 0.12710564296924554, + "learning_rate": 0.0001499380421313507, + "loss": 1.9555, + "step": 605 + }, + { + "epoch": 0.07560265228976885, + "grad_norm": 0.11559551956238681, + "learning_rate": 0.00015117719950433707, + "loss": 1.9074, + "step": 610 + }, + { + "epoch": 0.07622234616099646, + "grad_norm": 0.11995055927268913, + "learning_rate": 0.00015241635687732343, + "loss": 1.8871, + "step": 615 + }, + { + "epoch": 0.07684204003222408, + "grad_norm": 0.13173160493796293, + "learning_rate": 0.0001536555142503098, + "loss": 1.9035, + "step": 620 + }, + { + "epoch": 0.0774617339034517, + "grad_norm": 0.1254908751927584, + "learning_rate": 0.00015489467162329616, + "loss": 1.9352, + "step": 625 + }, + { + "epoch": 0.0780814277746793, + "grad_norm": 0.12357179246830041, + "learning_rate": 0.00015613382899628254, + "loss": 1.932, + "step": 630 + }, + { + "epoch": 0.07870112164590692, + "grad_norm": 0.12344265317790121, + "learning_rate": 0.00015737298636926892, + "loss": 1.9211, + "step": 635 + }, + { + "epoch": 0.07932081551713453, + "grad_norm": 0.12852213663167542, + "learning_rate": 0.00015861214374225527, + "loss": 1.9, + "step": 640 + }, + { + "epoch": 0.07994050938836215, + "grad_norm": 0.131328675524144, + "learning_rate": 0.00015985130111524165, + "loss": 1.9531, + "step": 645 + }, + { + "epoch": 0.08056020325958976, + "grad_norm": 0.12494108251782414, + "learning_rate": 0.000161090458488228, + "loss": 1.8996, + "step": 650 + }, + { + "epoch": 0.08117989713081737, + "grad_norm": 0.128320637236905, + "learning_rate": 0.00016232961586121439, + "loss": 1.9301, + "step": 655 + }, + { + "epoch": 0.081799591002045, + "grad_norm": 0.12771710393625474, + "learning_rate": 0.00016356877323420074, + "loss": 1.9137, + "step": 660 + }, + { + "epoch": 0.0824192848732726, + "grad_norm": 0.12694987323527102, + "learning_rate": 0.00016480793060718712, + "loss": 1.9598, + "step": 665 + }, + { + "epoch": 0.08303897874450021, + "grad_norm": 0.13588091244879139, + "learning_rate": 0.0001660470879801735, + "loss": 1.8539, + "step": 670 + }, + { + "epoch": 0.08365867261572783, + "grad_norm": 0.12779362690623974, + "learning_rate": 0.00016728624535315988, + "loss": 1.9277, + "step": 675 + }, + { + "epoch": 0.08427836648695544, + "grad_norm": 0.1325286483243818, + "learning_rate": 0.00016852540272614623, + "loss": 1.8766, + "step": 680 + }, + { + "epoch": 0.08489806035818306, + "grad_norm": 0.127439252199406, + "learning_rate": 0.00016976456009913258, + "loss": 1.8848, + "step": 685 + }, + { + "epoch": 0.08551775422941067, + "grad_norm": 0.1241157391254646, + "learning_rate": 0.00017100371747211896, + "loss": 1.9816, + "step": 690 + }, + { + "epoch": 0.08613744810063828, + "grad_norm": 0.13332515296422381, + "learning_rate": 0.00017224287484510532, + "loss": 1.9262, + "step": 695 + }, + { + "epoch": 0.0867571419718659, + "grad_norm": 0.12252637760350603, + "learning_rate": 0.00017348203221809172, + "loss": 1.9293, + "step": 700 + }, + { + "epoch": 0.08737683584309351, + "grad_norm": 0.12496069484354921, + "learning_rate": 0.00017472118959107808, + "loss": 1.8855, + "step": 705 + }, + { + "epoch": 0.08799652971432112, + "grad_norm": 0.13098416875774188, + "learning_rate": 0.00017596034696406446, + "loss": 1.9121, + "step": 710 + }, + { + "epoch": 0.08861622358554874, + "grad_norm": 0.12342851589611775, + "learning_rate": 0.0001771995043370508, + "loss": 1.9309, + "step": 715 + }, + { + "epoch": 0.08923591745677635, + "grad_norm": 0.12375138351033128, + "learning_rate": 0.0001784386617100372, + "loss": 1.8801, + "step": 720 + }, + { + "epoch": 0.08985561132800396, + "grad_norm": 0.12516445329278314, + "learning_rate": 0.00017967781908302354, + "loss": 1.902, + "step": 725 + }, + { + "epoch": 0.09047530519923158, + "grad_norm": 0.12939315510028193, + "learning_rate": 0.00018091697645600992, + "loss": 1.9633, + "step": 730 + }, + { + "epoch": 0.09109499907045919, + "grad_norm": 0.12962320171696218, + "learning_rate": 0.0001821561338289963, + "loss": 1.9008, + "step": 735 + }, + { + "epoch": 0.09171469294168681, + "grad_norm": 0.12286398386603685, + "learning_rate": 0.00018339529120198265, + "loss": 1.8996, + "step": 740 + }, + { + "epoch": 0.09233438681291442, + "grad_norm": 0.1304419130957006, + "learning_rate": 0.00018463444857496903, + "loss": 1.9105, + "step": 745 + }, + { + "epoch": 0.09295408068414203, + "grad_norm": 0.1283679639528834, + "learning_rate": 0.0001858736059479554, + "loss": 1.8859, + "step": 750 + }, + { + "epoch": 0.09357377455536965, + "grad_norm": 0.12212618023022137, + "learning_rate": 0.00018711276332094177, + "loss": 1.882, + "step": 755 + }, + { + "epoch": 0.09419346842659726, + "grad_norm": 0.12769862702936974, + "learning_rate": 0.00018835192069392812, + "loss": 1.9723, + "step": 760 + }, + { + "epoch": 0.09481316229782487, + "grad_norm": 0.12391205743839302, + "learning_rate": 0.00018959107806691453, + "loss": 1.907, + "step": 765 + }, + { + "epoch": 0.09543285616905249, + "grad_norm": 0.12369280757608463, + "learning_rate": 0.00019083023543990088, + "loss": 1.9145, + "step": 770 + }, + { + "epoch": 0.0960525500402801, + "grad_norm": 0.12396791138067718, + "learning_rate": 0.00019206939281288726, + "loss": 1.8906, + "step": 775 + }, + { + "epoch": 0.09667224391150772, + "grad_norm": 0.12724885128353655, + "learning_rate": 0.0001933085501858736, + "loss": 1.907, + "step": 780 + }, + { + "epoch": 0.09729193778273533, + "grad_norm": 0.12534998156680616, + "learning_rate": 0.00019454770755886, + "loss": 1.8695, + "step": 785 + }, + { + "epoch": 0.09791163165396294, + "grad_norm": 0.13128368147328945, + "learning_rate": 0.00019578686493184635, + "loss": 1.9289, + "step": 790 + }, + { + "epoch": 0.09853132552519056, + "grad_norm": 0.13142114693955176, + "learning_rate": 0.00019702602230483272, + "loss": 1.8863, + "step": 795 + }, + { + "epoch": 0.09915101939641817, + "grad_norm": 0.12804903283927577, + "learning_rate": 0.0001982651796778191, + "loss": 1.9574, + "step": 800 + }, + { + "epoch": 0.09977071326764578, + "grad_norm": 0.12560616934907057, + "learning_rate": 0.00019950433705080546, + "loss": 1.9027, + "step": 805 + }, + { + "epoch": 0.1003904071388734, + "grad_norm": 0.13603145482080747, + "learning_rate": 0.00019999991575981254, + "loss": 1.8602, + "step": 810 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 0.1283260358018925, + "learning_rate": 0.00019999940095918086, + "loss": 1.9051, + "step": 815 + }, + { + "epoch": 0.10162979488132863, + "grad_norm": 0.1293026088162003, + "learning_rate": 0.000199998418160428, + "loss": 1.9285, + "step": 820 + }, + { + "epoch": 0.10224948875255624, + "grad_norm": 0.13075097487753526, + "learning_rate": 0.00019999696736815346, + "loss": 1.9273, + "step": 825 + }, + { + "epoch": 0.10286918262378385, + "grad_norm": 0.12662617461298337, + "learning_rate": 0.000199995048589147, + "loss": 1.8934, + "step": 830 + }, + { + "epoch": 0.10348887649501147, + "grad_norm": 0.13113444016935866, + "learning_rate": 0.00019999266183238847, + "loss": 1.8984, + "step": 835 + }, + { + "epoch": 0.10410857036623908, + "grad_norm": 0.12944550629626753, + "learning_rate": 0.00019998980710904794, + "loss": 1.9191, + "step": 840 + }, + { + "epoch": 0.10472826423746669, + "grad_norm": 0.13398656236337397, + "learning_rate": 0.00019998648443248556, + "loss": 1.9109, + "step": 845 + }, + { + "epoch": 0.10534795810869431, + "grad_norm": 0.1267682688223831, + "learning_rate": 0.00019998269381825147, + "loss": 1.875, + "step": 850 + }, + { + "epoch": 0.10596765197992192, + "grad_norm": 0.12746917546254172, + "learning_rate": 0.00019997843528408576, + "loss": 1.8754, + "step": 855 + }, + { + "epoch": 0.10658734585114953, + "grad_norm": 0.1254555174289125, + "learning_rate": 0.00019997370884991842, + "loss": 1.9078, + "step": 860 + }, + { + "epoch": 0.10720703972237715, + "grad_norm": 0.12777549717459696, + "learning_rate": 0.0001999685145378692, + "loss": 1.8938, + "step": 865 + }, + { + "epoch": 0.10782673359360476, + "grad_norm": 0.14795135942631768, + "learning_rate": 0.00019996285237224758, + "loss": 1.8574, + "step": 870 + }, + { + "epoch": 0.10844642746483238, + "grad_norm": 0.13079486561769416, + "learning_rate": 0.00019995672237955246, + "loss": 1.9207, + "step": 875 + }, + { + "epoch": 0.10906612133605999, + "grad_norm": 0.12773479517789088, + "learning_rate": 0.00019995012458847233, + "loss": 1.9152, + "step": 880 + }, + { + "epoch": 0.1096858152072876, + "grad_norm": 0.14019466780605774, + "learning_rate": 0.00019994305902988488, + "loss": 1.8484, + "step": 885 + }, + { + "epoch": 0.11030550907851522, + "grad_norm": 0.12914830902783628, + "learning_rate": 0.00019993552573685703, + "loss": 1.9531, + "step": 890 + }, + { + "epoch": 0.11092520294974283, + "grad_norm": 0.12289429503961614, + "learning_rate": 0.00019992752474464463, + "loss": 1.8996, + "step": 895 + }, + { + "epoch": 0.11154489682097044, + "grad_norm": 0.13824146618127445, + "learning_rate": 0.00019991905609069237, + "loss": 1.9203, + "step": 900 + }, + { + "epoch": 0.11216459069219806, + "grad_norm": 0.12867807538864207, + "learning_rate": 0.0001999101198146337, + "loss": 1.8961, + "step": 905 + }, + { + "epoch": 0.11278428456342567, + "grad_norm": 0.1284327023064257, + "learning_rate": 0.0001999007159582904, + "loss": 1.8984, + "step": 910 + }, + { + "epoch": 0.11340397843465329, + "grad_norm": 0.13149370957180515, + "learning_rate": 0.00019989084456567267, + "loss": 1.8809, + "step": 915 + }, + { + "epoch": 0.1140236723058809, + "grad_norm": 0.14535516131324824, + "learning_rate": 0.00019988050568297866, + "loss": 1.9066, + "step": 920 + }, + { + "epoch": 0.1146433661771085, + "grad_norm": 0.13159281057665195, + "learning_rate": 0.0001998696993585945, + "loss": 1.9094, + "step": 925 + }, + { + "epoch": 0.11526306004833613, + "grad_norm": 0.1361806065321, + "learning_rate": 0.00019985842564309382, + "loss": 1.9602, + "step": 930 + }, + { + "epoch": 0.11588275391956374, + "grad_norm": 0.12097934827295764, + "learning_rate": 0.00019984668458923775, + "loss": 1.8895, + "step": 935 + }, + { + "epoch": 0.11650244779079134, + "grad_norm": 0.12566168126158242, + "learning_rate": 0.00019983447625197457, + "loss": 1.8875, + "step": 940 + }, + { + "epoch": 0.11712214166201897, + "grad_norm": 0.12430870531788435, + "learning_rate": 0.0001998218006884393, + "loss": 1.9559, + "step": 945 + }, + { + "epoch": 0.11774183553324657, + "grad_norm": 0.12903502537406508, + "learning_rate": 0.0001998086579579538, + "loss": 1.8797, + "step": 950 + }, + { + "epoch": 0.11836152940447418, + "grad_norm": 0.12526103131226338, + "learning_rate": 0.00019979504812202612, + "loss": 1.907, + "step": 955 + }, + { + "epoch": 0.1189812232757018, + "grad_norm": 0.1291422540842116, + "learning_rate": 0.00019978097124435042, + "loss": 1.9887, + "step": 960 + }, + { + "epoch": 0.11960091714692941, + "grad_norm": 0.13690123211378433, + "learning_rate": 0.00019976642739080665, + "loss": 1.8879, + "step": 965 + }, + { + "epoch": 0.12022061101815704, + "grad_norm": 0.13042250114631507, + "learning_rate": 0.00019975141662946014, + "loss": 1.8988, + "step": 970 + }, + { + "epoch": 0.12084030488938464, + "grad_norm": 0.1450939461723542, + "learning_rate": 0.0001997359390305614, + "loss": 1.8363, + "step": 975 + }, + { + "epoch": 0.12145999876061225, + "grad_norm": 0.12494449135760256, + "learning_rate": 0.00019971999466654577, + "loss": 1.8691, + "step": 980 + }, + { + "epoch": 0.12207969263183988, + "grad_norm": 0.13013968478788646, + "learning_rate": 0.000199703583612033, + "loss": 1.8727, + "step": 985 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 0.12026110255667402, + "learning_rate": 0.00019968670594382694, + "loss": 1.9199, + "step": 990 + }, + { + "epoch": 0.12331908037429509, + "grad_norm": 0.13639861474847226, + "learning_rate": 0.00019966936174091527, + "loss": 1.875, + "step": 995 + }, + { + "epoch": 0.12393877424552271, + "grad_norm": 0.1311732320208071, + "learning_rate": 0.00019965155108446906, + "loss": 1.9246, + "step": 1000 + }, + { + "epoch": 0.12455846811675032, + "grad_norm": 0.13525131264110185, + "learning_rate": 0.00019963327405784226, + "loss": 1.9121, + "step": 1005 + }, + { + "epoch": 0.12517816198797793, + "grad_norm": 0.1377173253291631, + "learning_rate": 0.0001996145307465716, + "loss": 1.9074, + "step": 1010 + }, + { + "epoch": 0.12579785585920555, + "grad_norm": 0.12714986650171514, + "learning_rate": 0.00019959532123837588, + "loss": 1.9238, + "step": 1015 + }, + { + "epoch": 0.12641754973043318, + "grad_norm": 0.1316460319074934, + "learning_rate": 0.00019957564562315583, + "loss": 1.8453, + "step": 1020 + }, + { + "epoch": 0.12703724360166077, + "grad_norm": 0.12924564279125103, + "learning_rate": 0.0001995555039929935, + "loss": 1.9152, + "step": 1025 + }, + { + "epoch": 0.1276569374728884, + "grad_norm": 0.12389037014071365, + "learning_rate": 0.00019953489644215186, + "loss": 1.9145, + "step": 1030 + }, + { + "epoch": 0.12827663134411602, + "grad_norm": 0.12731994833926935, + "learning_rate": 0.00019951382306707452, + "loss": 1.8539, + "step": 1035 + }, + { + "epoch": 0.1288963252153436, + "grad_norm": 0.13145951711925763, + "learning_rate": 0.00019949228396638503, + "loss": 1.8926, + "step": 1040 + }, + { + "epoch": 0.12951601908657123, + "grad_norm": 0.1345163191426047, + "learning_rate": 0.00019947027924088656, + "loss": 1.8746, + "step": 1045 + }, + { + "epoch": 0.13013571295779885, + "grad_norm": 0.13149828806285122, + "learning_rate": 0.00019944780899356146, + "loss": 1.9363, + "step": 1050 + }, + { + "epoch": 0.13075540682902645, + "grad_norm": 0.131642891757479, + "learning_rate": 0.00019942487332957066, + "loss": 1.8734, + "step": 1055 + }, + { + "epoch": 0.13137510070025407, + "grad_norm": 0.1357864698335193, + "learning_rate": 0.0001994014723562533, + "loss": 1.8805, + "step": 1060 + }, + { + "epoch": 0.1319947945714817, + "grad_norm": 0.13726338790841353, + "learning_rate": 0.00019937760618312617, + "loss": 1.9238, + "step": 1065 + }, + { + "epoch": 0.1326144884427093, + "grad_norm": 0.13519938007390137, + "learning_rate": 0.00019935327492188315, + "loss": 1.9059, + "step": 1070 + }, + { + "epoch": 0.1332341823139369, + "grad_norm": 0.1324182900169985, + "learning_rate": 0.0001993284786863948, + "loss": 1.8719, + "step": 1075 + }, + { + "epoch": 0.13385387618516453, + "grad_norm": 0.13711559928347855, + "learning_rate": 0.0001993032175927077, + "loss": 1.9367, + "step": 1080 + }, + { + "epoch": 0.13447357005639216, + "grad_norm": 0.1334759916213516, + "learning_rate": 0.00019927749175904403, + "loss": 1.8883, + "step": 1085 + }, + { + "epoch": 0.13509326392761975, + "grad_norm": 0.12943730877342607, + "learning_rate": 0.0001992513013058009, + "loss": 1.8887, + "step": 1090 + }, + { + "epoch": 0.13571295779884737, + "grad_norm": 0.12672939466440764, + "learning_rate": 0.00019922464635554988, + "loss": 1.8449, + "step": 1095 + }, + { + "epoch": 0.136332651670075, + "grad_norm": 0.12225320336434065, + "learning_rate": 0.0001991975270330364, + "loss": 1.8719, + "step": 1100 + }, + { + "epoch": 0.1369523455413026, + "grad_norm": 0.12850014269871904, + "learning_rate": 0.00019916994346517915, + "loss": 1.8727, + "step": 1105 + }, + { + "epoch": 0.1375720394125302, + "grad_norm": 0.13328718338585124, + "learning_rate": 0.00019914189578106945, + "loss": 1.9055, + "step": 1110 + }, + { + "epoch": 0.13819173328375783, + "grad_norm": 0.12984485764601156, + "learning_rate": 0.00019911338411197075, + "loss": 1.9129, + "step": 1115 + }, + { + "epoch": 0.13881142715498543, + "grad_norm": 0.12879734723470498, + "learning_rate": 0.0001990844085913179, + "loss": 1.8289, + "step": 1120 + }, + { + "epoch": 0.13943112102621305, + "grad_norm": 0.12045308358521092, + "learning_rate": 0.00019905496935471658, + "loss": 1.8953, + "step": 1125 + }, + { + "epoch": 0.14005081489744067, + "grad_norm": 0.13681652513593023, + "learning_rate": 0.00019902506653994277, + "loss": 1.8285, + "step": 1130 + }, + { + "epoch": 0.14067050876866827, + "grad_norm": 0.12537351067620614, + "learning_rate": 0.00019899470028694185, + "loss": 1.8625, + "step": 1135 + }, + { + "epoch": 0.1412902026398959, + "grad_norm": 0.13310144010439956, + "learning_rate": 0.0001989638707378282, + "loss": 1.9676, + "step": 1140 + }, + { + "epoch": 0.1419098965111235, + "grad_norm": 0.12651220372995958, + "learning_rate": 0.0001989325780368844, + "loss": 1.8848, + "step": 1145 + }, + { + "epoch": 0.1425295903823511, + "grad_norm": 0.13498557406467906, + "learning_rate": 0.00019890082233056054, + "loss": 1.9281, + "step": 1150 + }, + { + "epoch": 0.14314928425357873, + "grad_norm": 0.13324683156087155, + "learning_rate": 0.00019886860376747362, + "loss": 1.8738, + "step": 1155 + }, + { + "epoch": 0.14376897812480635, + "grad_norm": 0.12727156852136712, + "learning_rate": 0.0001988359224984069, + "loss": 1.9187, + "step": 1160 + }, + { + "epoch": 0.14438867199603395, + "grad_norm": 0.13322593653239903, + "learning_rate": 0.0001988027786763089, + "loss": 1.8191, + "step": 1165 + }, + { + "epoch": 0.14500836586726157, + "grad_norm": 0.13430900015454927, + "learning_rate": 0.0001987691724562931, + "loss": 1.8746, + "step": 1170 + }, + { + "epoch": 0.1456280597384892, + "grad_norm": 0.1335734771928228, + "learning_rate": 0.00019873510399563688, + "loss": 1.9012, + "step": 1175 + }, + { + "epoch": 0.1462477536097168, + "grad_norm": 0.12408030014942514, + "learning_rate": 0.00019870057345378097, + "loss": 1.8648, + "step": 1180 + }, + { + "epoch": 0.1468674474809444, + "grad_norm": 0.11956244949759506, + "learning_rate": 0.00019866558099232862, + "loss": 1.8637, + "step": 1185 + }, + { + "epoch": 0.14748714135217203, + "grad_norm": 0.138357841858996, + "learning_rate": 0.00019863012677504485, + "loss": 1.8734, + "step": 1190 + }, + { + "epoch": 0.14810683522339965, + "grad_norm": 0.12851504413100462, + "learning_rate": 0.00019859421096785575, + "loss": 1.8891, + "step": 1195 + }, + { + "epoch": 0.14872652909462725, + "grad_norm": 0.13319470956489035, + "learning_rate": 0.00019855783373884763, + "loss": 1.8418, + "step": 1200 + }, + { + "epoch": 0.14934622296585487, + "grad_norm": 0.1251908795342198, + "learning_rate": 0.00019852099525826628, + "loss": 1.9004, + "step": 1205 + }, + { + "epoch": 0.1499659168370825, + "grad_norm": 0.12994851392351664, + "learning_rate": 0.00019848369569851608, + "loss": 1.8406, + "step": 1210 + }, + { + "epoch": 0.15058561070831009, + "grad_norm": 0.12421201591064086, + "learning_rate": 0.00019844593523415935, + "loss": 1.927, + "step": 1215 + }, + { + "epoch": 0.1512053045795377, + "grad_norm": 0.1348561729670815, + "learning_rate": 0.00019840771404191538, + "loss": 1.8785, + "step": 1220 + }, + { + "epoch": 0.15182499845076533, + "grad_norm": 0.12902823419358234, + "learning_rate": 0.00019836903230065973, + "loss": 1.9129, + "step": 1225 + }, + { + "epoch": 0.15244469232199293, + "grad_norm": 0.1303696455593782, + "learning_rate": 0.0001983298901914233, + "loss": 1.8965, + "step": 1230 + }, + { + "epoch": 0.15306438619322055, + "grad_norm": 0.12750859082981958, + "learning_rate": 0.00019829028789739156, + "loss": 1.8863, + "step": 1235 + }, + { + "epoch": 0.15368408006444817, + "grad_norm": 0.13301195737140625, + "learning_rate": 0.00019825022560390353, + "loss": 1.8848, + "step": 1240 + }, + { + "epoch": 0.15430377393567576, + "grad_norm": 0.12458777864772885, + "learning_rate": 0.00019820970349845117, + "loss": 1.918, + "step": 1245 + }, + { + "epoch": 0.1549234678069034, + "grad_norm": 0.12157252838516859, + "learning_rate": 0.0001981687217706783, + "loss": 1.9004, + "step": 1250 + }, + { + "epoch": 0.155543161678131, + "grad_norm": 0.12992471411020662, + "learning_rate": 0.0001981272806123798, + "loss": 1.8449, + "step": 1255 + }, + { + "epoch": 0.1561628555493586, + "grad_norm": 0.13490267199807432, + "learning_rate": 0.00019808538021750063, + "loss": 1.8594, + "step": 1260 + }, + { + "epoch": 0.15678254942058623, + "grad_norm": 0.13254926124911628, + "learning_rate": 0.00019804302078213506, + "loss": 1.9066, + "step": 1265 + }, + { + "epoch": 0.15740224329181385, + "grad_norm": 0.13655436155848294, + "learning_rate": 0.00019800020250452564, + "loss": 1.9094, + "step": 1270 + }, + { + "epoch": 0.15802193716304147, + "grad_norm": 0.12605752395912317, + "learning_rate": 0.00019795692558506232, + "loss": 1.8914, + "step": 1275 + }, + { + "epoch": 0.15864163103426906, + "grad_norm": 0.12417347152492184, + "learning_rate": 0.0001979131902262814, + "loss": 1.891, + "step": 1280 + }, + { + "epoch": 0.1592613249054967, + "grad_norm": 0.1230993318638121, + "learning_rate": 0.00019786899663286486, + "loss": 1.9051, + "step": 1285 + }, + { + "epoch": 0.1598810187767243, + "grad_norm": 0.13002111203651795, + "learning_rate": 0.00019782434501163909, + "loss": 1.9371, + "step": 1290 + }, + { + "epoch": 0.1605007126479519, + "grad_norm": 0.12883861862990198, + "learning_rate": 0.0001977792355715741, + "loss": 1.9074, + "step": 1295 + }, + { + "epoch": 0.16112040651917953, + "grad_norm": 0.12844516943687437, + "learning_rate": 0.00019773366852378246, + "loss": 1.8348, + "step": 1300 + }, + { + "epoch": 0.16174010039040715, + "grad_norm": 0.12861625452896877, + "learning_rate": 0.0001976876440815184, + "loss": 1.8395, + "step": 1305 + }, + { + "epoch": 0.16235979426163474, + "grad_norm": 0.12058020178260095, + "learning_rate": 0.0001976411624601767, + "loss": 1.8328, + "step": 1310 + }, + { + "epoch": 0.16297948813286237, + "grad_norm": 0.1266307036564728, + "learning_rate": 0.00019759422387729183, + "loss": 1.8406, + "step": 1315 + }, + { + "epoch": 0.16359918200409, + "grad_norm": 0.1320325104662958, + "learning_rate": 0.00019754682855253674, + "loss": 1.8789, + "step": 1320 + }, + { + "epoch": 0.16421887587531758, + "grad_norm": 0.13663051531666534, + "learning_rate": 0.00019749897670772205, + "loss": 1.8375, + "step": 1325 + }, + { + "epoch": 0.1648385697465452, + "grad_norm": 0.1264957686459983, + "learning_rate": 0.00019745066856679478, + "loss": 1.8758, + "step": 1330 + }, + { + "epoch": 0.16545826361777283, + "grad_norm": 0.13992651333444445, + "learning_rate": 0.0001974019043558375, + "loss": 1.8523, + "step": 1335 + }, + { + "epoch": 0.16607795748900042, + "grad_norm": 0.12649107132742746, + "learning_rate": 0.00019735268430306718, + "loss": 1.8438, + "step": 1340 + }, + { + "epoch": 0.16669765136022804, + "grad_norm": 0.12995820389298687, + "learning_rate": 0.00019730300863883405, + "loss": 1.8723, + "step": 1345 + }, + { + "epoch": 0.16731734523145567, + "grad_norm": 0.12383168873813469, + "learning_rate": 0.0001972528775956208, + "loss": 1.8973, + "step": 1350 + }, + { + "epoch": 0.16793703910268326, + "grad_norm": 0.12991019310770477, + "learning_rate": 0.0001972022914080411, + "loss": 1.8313, + "step": 1355 + }, + { + "epoch": 0.16855673297391088, + "grad_norm": 0.12295152055363918, + "learning_rate": 0.00019715125031283877, + "loss": 1.8652, + "step": 1360 + }, + { + "epoch": 0.1691764268451385, + "grad_norm": 0.13895756409315704, + "learning_rate": 0.00019709975454888662, + "loss": 1.8391, + "step": 1365 + }, + { + "epoch": 0.16979612071636613, + "grad_norm": 0.12962858797129284, + "learning_rate": 0.00019704780435718532, + "loss": 1.8965, + "step": 1370 + }, + { + "epoch": 0.17041581458759372, + "grad_norm": 0.13434107430628525, + "learning_rate": 0.00019699539998086223, + "loss": 1.8488, + "step": 1375 + }, + { + "epoch": 0.17103550845882134, + "grad_norm": 0.12227223666385428, + "learning_rate": 0.00019694254166517032, + "loss": 1.8836, + "step": 1380 + }, + { + "epoch": 0.17165520233004897, + "grad_norm": 0.13008761760930657, + "learning_rate": 0.00019688922965748696, + "loss": 1.8973, + "step": 1385 + }, + { + "epoch": 0.17227489620127656, + "grad_norm": 0.12801298155182156, + "learning_rate": 0.0001968354642073129, + "loss": 1.8383, + "step": 1390 + }, + { + "epoch": 0.17289459007250418, + "grad_norm": 0.1292755368394873, + "learning_rate": 0.00019678124556627094, + "loss": 1.8434, + "step": 1395 + }, + { + "epoch": 0.1735142839437318, + "grad_norm": 0.13946593061603393, + "learning_rate": 0.00019672657398810478, + "loss": 1.8535, + "step": 1400 + }, + { + "epoch": 0.1741339778149594, + "grad_norm": 0.14090247834119649, + "learning_rate": 0.00019667144972867795, + "loss": 1.8801, + "step": 1405 + }, + { + "epoch": 0.17475367168618702, + "grad_norm": 0.13668444514745795, + "learning_rate": 0.00019661587304597243, + "loss": 1.8801, + "step": 1410 + }, + { + "epoch": 0.17537336555741465, + "grad_norm": 0.13208454467125907, + "learning_rate": 0.0001965598442000877, + "loss": 1.9043, + "step": 1415 + }, + { + "epoch": 0.17599305942864224, + "grad_norm": 0.1392934854468196, + "learning_rate": 0.0001965033634532392, + "loss": 1.8613, + "step": 1420 + }, + { + "epoch": 0.17661275329986986, + "grad_norm": 0.12790012697738787, + "learning_rate": 0.00019644643106975739, + "loss": 1.8238, + "step": 1425 + }, + { + "epoch": 0.17723244717109748, + "grad_norm": 0.13787833838331934, + "learning_rate": 0.00019638904731608637, + "loss": 1.9125, + "step": 1430 + }, + { + "epoch": 0.17785214104232508, + "grad_norm": 0.1243448758230821, + "learning_rate": 0.00019633121246078256, + "loss": 1.8512, + "step": 1435 + }, + { + "epoch": 0.1784718349135527, + "grad_norm": 0.12359501590686292, + "learning_rate": 0.00019627292677451368, + "loss": 1.8918, + "step": 1440 + }, + { + "epoch": 0.17909152878478032, + "grad_norm": 0.14192690308588027, + "learning_rate": 0.00019621419053005726, + "loss": 1.8824, + "step": 1445 + }, + { + "epoch": 0.17971122265600792, + "grad_norm": 0.12650409825125716, + "learning_rate": 0.00019615500400229946, + "loss": 1.8875, + "step": 1450 + }, + { + "epoch": 0.18033091652723554, + "grad_norm": 0.130110107018088, + "learning_rate": 0.0001960953674682338, + "loss": 1.8426, + "step": 1455 + }, + { + "epoch": 0.18095061039846316, + "grad_norm": 0.12396235695123667, + "learning_rate": 0.00019603528120695982, + "loss": 1.9012, + "step": 1460 + }, + { + "epoch": 0.18157030426969079, + "grad_norm": 0.12716887462079618, + "learning_rate": 0.00019597474549968173, + "loss": 1.8551, + "step": 1465 + }, + { + "epoch": 0.18218999814091838, + "grad_norm": 0.11791740501594407, + "learning_rate": 0.00019591376062970728, + "loss": 1.9535, + "step": 1470 + }, + { + "epoch": 0.182809692012146, + "grad_norm": 0.1274909732144608, + "learning_rate": 0.00019585232688244613, + "loss": 1.8738, + "step": 1475 + }, + { + "epoch": 0.18342938588337362, + "grad_norm": 0.12950727276315843, + "learning_rate": 0.00019579044454540883, + "loss": 1.8336, + "step": 1480 + }, + { + "epoch": 0.18404907975460122, + "grad_norm": 0.12786571322089985, + "learning_rate": 0.0001957281139082053, + "loss": 1.9125, + "step": 1485 + }, + { + "epoch": 0.18466877362582884, + "grad_norm": 0.1238604246672542, + "learning_rate": 0.00019566533526254348, + "loss": 1.8684, + "step": 1490 + }, + { + "epoch": 0.18528846749705646, + "grad_norm": 0.11970443128542427, + "learning_rate": 0.00019560210890222802, + "loss": 1.8883, + "step": 1495 + }, + { + "epoch": 0.18590816136828406, + "grad_norm": 0.12314025496676621, + "learning_rate": 0.00019553843512315887, + "loss": 1.9094, + "step": 1500 + }, + { + "epoch": 0.18652785523951168, + "grad_norm": 0.12798261977540473, + "learning_rate": 0.00019547431422332992, + "loss": 1.8488, + "step": 1505 + }, + { + "epoch": 0.1871475491107393, + "grad_norm": 0.1286897352380571, + "learning_rate": 0.00019540974650282756, + "loss": 1.9156, + "step": 1510 + }, + { + "epoch": 0.1877672429819669, + "grad_norm": 0.1359355825058481, + "learning_rate": 0.0001953447322638293, + "loss": 1.8629, + "step": 1515 + }, + { + "epoch": 0.18838693685319452, + "grad_norm": 0.13358157574590807, + "learning_rate": 0.0001952792718106024, + "loss": 1.8797, + "step": 1520 + }, + { + "epoch": 0.18900663072442214, + "grad_norm": 0.1284059481658899, + "learning_rate": 0.00019521336544950238, + "loss": 1.8664, + "step": 1525 + }, + { + "epoch": 0.18962632459564974, + "grad_norm": 0.14299111302674886, + "learning_rate": 0.00019514701348897164, + "loss": 1.8043, + "step": 1530 + }, + { + "epoch": 0.19024601846687736, + "grad_norm": 0.13170613006037593, + "learning_rate": 0.00019508021623953795, + "loss": 1.918, + "step": 1535 + }, + { + "epoch": 0.19086571233810498, + "grad_norm": 0.1322137048503741, + "learning_rate": 0.00019501297401381304, + "loss": 1.8687, + "step": 1540 + }, + { + "epoch": 0.19148540620933258, + "grad_norm": 0.1372246664399263, + "learning_rate": 0.00019494528712649117, + "loss": 1.9109, + "step": 1545 + }, + { + "epoch": 0.1921051000805602, + "grad_norm": 0.13020416605079768, + "learning_rate": 0.0001948771558943476, + "loss": 1.907, + "step": 1550 + }, + { + "epoch": 0.19272479395178782, + "grad_norm": 0.12282250558731672, + "learning_rate": 0.00019480858063623715, + "loss": 1.8738, + "step": 1555 + }, + { + "epoch": 0.19334448782301544, + "grad_norm": 0.1347150806147481, + "learning_rate": 0.0001947395616730926, + "loss": 1.907, + "step": 1560 + }, + { + "epoch": 0.19396418169424304, + "grad_norm": 0.12434032734923321, + "learning_rate": 0.00019467009932792336, + "loss": 1.8703, + "step": 1565 + }, + { + "epoch": 0.19458387556547066, + "grad_norm": 0.13012917758896814, + "learning_rate": 0.00019460019392581387, + "loss": 1.9035, + "step": 1570 + }, + { + "epoch": 0.19520356943669828, + "grad_norm": 0.12768898254941669, + "learning_rate": 0.00019452984579392205, + "loss": 1.8516, + "step": 1575 + }, + { + "epoch": 0.19582326330792588, + "grad_norm": 0.12366914376030068, + "learning_rate": 0.0001944590552614778, + "loss": 1.857, + "step": 1580 + }, + { + "epoch": 0.1964429571791535, + "grad_norm": 0.15841193439353185, + "learning_rate": 0.0001943878226597815, + "loss": 1.8621, + "step": 1585 + }, + { + "epoch": 0.19706265105038112, + "grad_norm": 0.13946946613234204, + "learning_rate": 0.0001943161483222023, + "loss": 1.8508, + "step": 1590 + }, + { + "epoch": 0.19768234492160872, + "grad_norm": 0.12879914509204915, + "learning_rate": 0.00019424403258417683, + "loss": 1.875, + "step": 1595 + }, + { + "epoch": 0.19830203879283634, + "grad_norm": 0.13449677568515495, + "learning_rate": 0.00019417147578320744, + "loss": 1.8648, + "step": 1600 + }, + { + "epoch": 0.19892173266406396, + "grad_norm": 0.1251274556275624, + "learning_rate": 0.00019409847825886054, + "loss": 1.923, + "step": 1605 + }, + { + "epoch": 0.19954142653529156, + "grad_norm": 0.13841429870418875, + "learning_rate": 0.00019402504035276525, + "loss": 1.8086, + "step": 1610 + }, + { + "epoch": 0.20016112040651918, + "grad_norm": 0.12356779695871337, + "learning_rate": 0.00019395116240861172, + "loss": 1.8313, + "step": 1615 + }, + { + "epoch": 0.2007808142777468, + "grad_norm": 0.1192001269564105, + "learning_rate": 0.0001938768447721493, + "loss": 1.8734, + "step": 1620 + }, + { + "epoch": 0.2014005081489744, + "grad_norm": 0.12435187316100045, + "learning_rate": 0.00019380208779118532, + "loss": 1.8449, + "step": 1625 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.12735167929794539, + "learning_rate": 0.00019372689181558307, + "loss": 1.8262, + "step": 1630 + }, + { + "epoch": 0.20263989589142964, + "grad_norm": 0.12643283485876602, + "learning_rate": 0.00019365125719726046, + "loss": 1.8355, + "step": 1635 + }, + { + "epoch": 0.20325958976265726, + "grad_norm": 0.13092666514756482, + "learning_rate": 0.00019357518429018815, + "loss": 1.8996, + "step": 1640 + }, + { + "epoch": 0.20387928363388486, + "grad_norm": 0.1277631476280917, + "learning_rate": 0.00019349867345038808, + "loss": 1.8914, + "step": 1645 + }, + { + "epoch": 0.20449897750511248, + "grad_norm": 0.13075728650959278, + "learning_rate": 0.0001934217250359317, + "loss": 1.8426, + "step": 1650 + }, + { + "epoch": 0.2051186713763401, + "grad_norm": 0.12669704558351194, + "learning_rate": 0.0001933443394069383, + "loss": 1.8613, + "step": 1655 + }, + { + "epoch": 0.2057383652475677, + "grad_norm": 0.12867521676946295, + "learning_rate": 0.0001932665169255733, + "loss": 1.8926, + "step": 1660 + }, + { + "epoch": 0.20635805911879532, + "grad_norm": 0.1404983490484146, + "learning_rate": 0.00019318825795604667, + "loss": 1.8676, + "step": 1665 + }, + { + "epoch": 0.20697775299002294, + "grad_norm": 0.12750956916082043, + "learning_rate": 0.00019310956286461108, + "loss": 1.8461, + "step": 1670 + }, + { + "epoch": 0.20759744686125053, + "grad_norm": 0.1272989702864413, + "learning_rate": 0.00019303043201956033, + "loss": 1.8441, + "step": 1675 + }, + { + "epoch": 0.20821714073247816, + "grad_norm": 0.13245347386183784, + "learning_rate": 0.00019295086579122748, + "loss": 1.8059, + "step": 1680 + }, + { + "epoch": 0.20883683460370578, + "grad_norm": 0.11974342684601433, + "learning_rate": 0.0001928708645519832, + "loss": 1.8805, + "step": 1685 + }, + { + "epoch": 0.20945652847493337, + "grad_norm": 0.1373585266936289, + "learning_rate": 0.00019279042867623405, + "loss": 1.875, + "step": 1690 + }, + { + "epoch": 0.210076222346161, + "grad_norm": 0.126610203931597, + "learning_rate": 0.00019270955854042065, + "loss": 1.898, + "step": 1695 + }, + { + "epoch": 0.21069591621738862, + "grad_norm": 0.13468171577740737, + "learning_rate": 0.00019262825452301603, + "loss": 1.8695, + "step": 1700 + }, + { + "epoch": 0.2113156100886162, + "grad_norm": 0.13874720674647448, + "learning_rate": 0.0001925465170045237, + "loss": 1.8281, + "step": 1705 + }, + { + "epoch": 0.21193530395984383, + "grad_norm": 0.12340652395591858, + "learning_rate": 0.00019246434636747603, + "loss": 1.8727, + "step": 1710 + }, + { + "epoch": 0.21255499783107146, + "grad_norm": 0.13391821183419367, + "learning_rate": 0.00019238174299643235, + "loss": 1.8535, + "step": 1715 + }, + { + "epoch": 0.21317469170229905, + "grad_norm": 0.1238891855327304, + "learning_rate": 0.00019229870727797716, + "loss": 1.9117, + "step": 1720 + }, + { + "epoch": 0.21379438557352667, + "grad_norm": 0.13921826355338024, + "learning_rate": 0.00019221523960071847, + "loss": 1.8496, + "step": 1725 + }, + { + "epoch": 0.2144140794447543, + "grad_norm": 0.1296981763357881, + "learning_rate": 0.00019213134035528574, + "loss": 1.9004, + "step": 1730 + }, + { + "epoch": 0.21503377331598192, + "grad_norm": 0.1341944973320657, + "learning_rate": 0.0001920470099343282, + "loss": 1.8727, + "step": 1735 + }, + { + "epoch": 0.2156534671872095, + "grad_norm": 0.12518986519019676, + "learning_rate": 0.000191962248732513, + "loss": 1.9031, + "step": 1740 + }, + { + "epoch": 0.21627316105843714, + "grad_norm": 0.12632835935474943, + "learning_rate": 0.00019187705714652337, + "loss": 1.8535, + "step": 1745 + }, + { + "epoch": 0.21689285492966476, + "grad_norm": 0.1263494950807743, + "learning_rate": 0.00019179143557505676, + "loss": 1.8516, + "step": 1750 + }, + { + "epoch": 0.21751254880089235, + "grad_norm": 0.12443856243939533, + "learning_rate": 0.0001917053844188228, + "loss": 1.8672, + "step": 1755 + }, + { + "epoch": 0.21813224267211997, + "grad_norm": 0.1320392752907634, + "learning_rate": 0.0001916189040805418, + "loss": 1.834, + "step": 1760 + }, + { + "epoch": 0.2187519365433476, + "grad_norm": 0.12493530880754049, + "learning_rate": 0.0001915319949649425, + "loss": 1.8453, + "step": 1765 + }, + { + "epoch": 0.2193716304145752, + "grad_norm": 0.1322646318302995, + "learning_rate": 0.00019144465747876038, + "loss": 1.866, + "step": 1770 + }, + { + "epoch": 0.21999132428580281, + "grad_norm": 0.1349930390610452, + "learning_rate": 0.00019135689203073563, + "loss": 1.857, + "step": 1775 + }, + { + "epoch": 0.22061101815703044, + "grad_norm": 0.12533754824423338, + "learning_rate": 0.00019126869903161146, + "loss": 1.9152, + "step": 1780 + }, + { + "epoch": 0.22123071202825803, + "grad_norm": 0.13663631158953193, + "learning_rate": 0.00019118007889413186, + "loss": 1.8102, + "step": 1785 + }, + { + "epoch": 0.22185040589948565, + "grad_norm": 0.12533002892577624, + "learning_rate": 0.00019109103203303988, + "loss": 1.8387, + "step": 1790 + }, + { + "epoch": 0.22247009977071328, + "grad_norm": 0.1359058002472198, + "learning_rate": 0.00019100155886507566, + "loss": 1.9219, + "step": 1795 + }, + { + "epoch": 0.22308979364194087, + "grad_norm": 0.12945958578973213, + "learning_rate": 0.0001909116598089745, + "loss": 1.8602, + "step": 1800 + }, + { + "epoch": 0.2237094875131685, + "grad_norm": 0.1264797795918434, + "learning_rate": 0.00019082133528546476, + "loss": 1.8727, + "step": 1805 + }, + { + "epoch": 0.22432918138439611, + "grad_norm": 0.12673897565989584, + "learning_rate": 0.0001907305857172661, + "loss": 1.8844, + "step": 1810 + }, + { + "epoch": 0.2249488752556237, + "grad_norm": 0.11996164040647495, + "learning_rate": 0.00019063941152908727, + "loss": 1.8664, + "step": 1815 + }, + { + "epoch": 0.22556856912685133, + "grad_norm": 0.12717207452871931, + "learning_rate": 0.00019054781314762433, + "loss": 1.8543, + "step": 1820 + }, + { + "epoch": 0.22618826299807895, + "grad_norm": 0.12807052491062512, + "learning_rate": 0.0001904557910015586, + "loss": 1.8781, + "step": 1825 + }, + { + "epoch": 0.22680795686930658, + "grad_norm": 0.1310959903315099, + "learning_rate": 0.00019036334552155452, + "loss": 1.8336, + "step": 1830 + }, + { + "epoch": 0.22742765074053417, + "grad_norm": 0.13530374164558082, + "learning_rate": 0.00019027047714025784, + "loss": 1.8531, + "step": 1835 + }, + { + "epoch": 0.2280473446117618, + "grad_norm": 0.13283518160109148, + "learning_rate": 0.0001901771862922934, + "loss": 1.8594, + "step": 1840 + }, + { + "epoch": 0.22866703848298942, + "grad_norm": 0.14381832693100272, + "learning_rate": 0.00019008347341426324, + "loss": 1.8473, + "step": 1845 + }, + { + "epoch": 0.229286732354217, + "grad_norm": 0.12561347944988807, + "learning_rate": 0.0001899893389447445, + "loss": 1.8813, + "step": 1850 + }, + { + "epoch": 0.22990642622544463, + "grad_norm": 0.12885427981187214, + "learning_rate": 0.0001898947833242874, + "loss": 1.8465, + "step": 1855 + }, + { + "epoch": 0.23052612009667225, + "grad_norm": 0.1398342796676745, + "learning_rate": 0.00018979980699541308, + "loss": 1.8602, + "step": 1860 + }, + { + "epoch": 0.23114581396789985, + "grad_norm": 0.12603617420354743, + "learning_rate": 0.00018970441040261165, + "loss": 1.8055, + "step": 1865 + }, + { + "epoch": 0.23176550783912747, + "grad_norm": 0.12826056653704176, + "learning_rate": 0.00018960859399234006, + "loss": 1.8164, + "step": 1870 + }, + { + "epoch": 0.2323852017103551, + "grad_norm": 0.12843392111688107, + "learning_rate": 0.00018951235821301995, + "loss": 1.8934, + "step": 1875 + }, + { + "epoch": 0.2330048955815827, + "grad_norm": 0.12781068150685332, + "learning_rate": 0.0001894157035150357, + "loss": 1.8414, + "step": 1880 + }, + { + "epoch": 0.2336245894528103, + "grad_norm": 0.12383040948770438, + "learning_rate": 0.00018931863035073217, + "loss": 1.9203, + "step": 1885 + }, + { + "epoch": 0.23424428332403793, + "grad_norm": 0.12366748753721642, + "learning_rate": 0.00018922113917441269, + "loss": 1.8516, + "step": 1890 + }, + { + "epoch": 0.23486397719526553, + "grad_norm": 0.1325347292455204, + "learning_rate": 0.00018912323044233684, + "loss": 1.8555, + "step": 1895 + }, + { + "epoch": 0.23548367106649315, + "grad_norm": 0.13177269918492968, + "learning_rate": 0.00018902490461271843, + "loss": 1.8777, + "step": 1900 + }, + { + "epoch": 0.23610336493772077, + "grad_norm": 0.13170172393384472, + "learning_rate": 0.00018892616214572319, + "loss": 1.8934, + "step": 1905 + }, + { + "epoch": 0.23672305880894837, + "grad_norm": 0.14106704771321893, + "learning_rate": 0.0001888270035034668, + "loss": 1.8449, + "step": 1910 + }, + { + "epoch": 0.237342752680176, + "grad_norm": 0.12621612288999542, + "learning_rate": 0.00018872742915001267, + "loss": 1.8184, + "step": 1915 + }, + { + "epoch": 0.2379624465514036, + "grad_norm": 0.13102027385319423, + "learning_rate": 0.00018862743955136966, + "loss": 1.8348, + "step": 1920 + }, + { + "epoch": 0.23858214042263123, + "grad_norm": 0.12924449635578353, + "learning_rate": 0.00018852703517549, + "loss": 1.8789, + "step": 1925 + }, + { + "epoch": 0.23920183429385883, + "grad_norm": 0.137053445230958, + "learning_rate": 0.00018842621649226712, + "loss": 1.8961, + "step": 1930 + }, + { + "epoch": 0.23982152816508645, + "grad_norm": 0.13676238584015243, + "learning_rate": 0.00018832498397353337, + "loss": 1.9055, + "step": 1935 + }, + { + "epoch": 0.24044122203631407, + "grad_norm": 0.12473045166890744, + "learning_rate": 0.0001882233380930579, + "loss": 1.9051, + "step": 1940 + }, + { + "epoch": 0.24106091590754167, + "grad_norm": 0.13737094149660017, + "learning_rate": 0.00018812127932654437, + "loss": 1.8383, + "step": 1945 + }, + { + "epoch": 0.2416806097787693, + "grad_norm": 0.13541763846499583, + "learning_rate": 0.00018801880815162873, + "loss": 1.8684, + "step": 1950 + }, + { + "epoch": 0.2423003036499969, + "grad_norm": 0.12777856593407638, + "learning_rate": 0.00018791592504787704, + "loss": 1.8383, + "step": 1955 + }, + { + "epoch": 0.2429199975212245, + "grad_norm": 0.13129622175885042, + "learning_rate": 0.00018781263049678318, + "loss": 1.8508, + "step": 1960 + }, + { + "epoch": 0.24353969139245213, + "grad_norm": 0.12557864503351404, + "learning_rate": 0.00018770892498176658, + "loss": 1.9406, + "step": 1965 + }, + { + "epoch": 0.24415938526367975, + "grad_norm": 0.13946020278404306, + "learning_rate": 0.00018760480898817003, + "loss": 1.893, + "step": 1970 + }, + { + "epoch": 0.24477907913490735, + "grad_norm": 0.13062544520558794, + "learning_rate": 0.00018750028300325733, + "loss": 1.7953, + "step": 1975 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 0.1335190355390475, + "learning_rate": 0.0001873953475162111, + "loss": 1.8555, + "step": 1980 + }, + { + "epoch": 0.2460184668773626, + "grad_norm": 0.13163131758724117, + "learning_rate": 0.00018729000301813032, + "loss": 1.9035, + "step": 1985 + }, + { + "epoch": 0.24663816074859019, + "grad_norm": 0.1339228643234811, + "learning_rate": 0.00018718425000202826, + "loss": 1.8313, + "step": 1990 + }, + { + "epoch": 0.2472578546198178, + "grad_norm": 0.1306228391266261, + "learning_rate": 0.00018707808896283, + "loss": 1.8941, + "step": 1995 + }, + { + "epoch": 0.24787754849104543, + "grad_norm": 0.12487620123598475, + "learning_rate": 0.00018697152039737018, + "loss": 1.7969, + "step": 2000 + }, + { + "epoch": 0.24849724236227302, + "grad_norm": 0.12511623039704756, + "learning_rate": 0.0001868645448043907, + "loss": 1.8188, + "step": 2005 + }, + { + "epoch": 0.24911693623350065, + "grad_norm": 0.1312732190979278, + "learning_rate": 0.00018675716268453827, + "loss": 1.832, + "step": 2010 + }, + { + "epoch": 0.24973663010472827, + "grad_norm": 0.1297564730601911, + "learning_rate": 0.00018664937454036226, + "loss": 1.8414, + "step": 2015 + }, + { + "epoch": 0.25035632397595586, + "grad_norm": 0.13336748960832828, + "learning_rate": 0.0001865411808763122, + "loss": 1.866, + "step": 2020 + }, + { + "epoch": 0.2509760178471835, + "grad_norm": 0.13371049555198566, + "learning_rate": 0.00018643258219873534, + "loss": 1.8648, + "step": 2025 + }, + { + "epoch": 0.2515957117184111, + "grad_norm": 0.12145110937533733, + "learning_rate": 0.00018632357901587456, + "loss": 1.8977, + "step": 2030 + }, + { + "epoch": 0.25221540558963873, + "grad_norm": 0.1273408283653277, + "learning_rate": 0.00018621417183786577, + "loss": 1.8953, + "step": 2035 + }, + { + "epoch": 0.25283509946086635, + "grad_norm": 0.1290600654953522, + "learning_rate": 0.00018610436117673555, + "loss": 1.8383, + "step": 2040 + }, + { + "epoch": 0.2534547933320939, + "grad_norm": 0.1253131282820343, + "learning_rate": 0.00018599414754639883, + "loss": 1.8941, + "step": 2045 + }, + { + "epoch": 0.25407448720332154, + "grad_norm": 0.1300726181872789, + "learning_rate": 0.00018588353146265643, + "loss": 1.8547, + "step": 2050 + }, + { + "epoch": 0.25469418107454916, + "grad_norm": 0.12572572001230634, + "learning_rate": 0.0001857725134431926, + "loss": 1.8625, + "step": 2055 + }, + { + "epoch": 0.2553138749457768, + "grad_norm": 0.12568185275742066, + "learning_rate": 0.00018566109400757272, + "loss": 1.8734, + "step": 2060 + }, + { + "epoch": 0.2559335688170044, + "grad_norm": 0.13533271397949245, + "learning_rate": 0.0001855492736772408, + "loss": 1.8902, + "step": 2065 + }, + { + "epoch": 0.25655326268823203, + "grad_norm": 0.1327835673505901, + "learning_rate": 0.00018543705297551698, + "loss": 1.9043, + "step": 2070 + }, + { + "epoch": 0.25717295655945965, + "grad_norm": 0.1332233655568078, + "learning_rate": 0.0001853244324275952, + "loss": 1.8496, + "step": 2075 + }, + { + "epoch": 0.2577926504306872, + "grad_norm": 0.12647398734735402, + "learning_rate": 0.00018521141256054067, + "loss": 1.8711, + "step": 2080 + }, + { + "epoch": 0.25841234430191484, + "grad_norm": 0.12370914252810508, + "learning_rate": 0.00018509799390328742, + "loss": 1.8402, + "step": 2085 + }, + { + "epoch": 0.25903203817314246, + "grad_norm": 0.1342541241421312, + "learning_rate": 0.00018498417698663584, + "loss": 1.809, + "step": 2090 + }, + { + "epoch": 0.2596517320443701, + "grad_norm": 0.12945415515878775, + "learning_rate": 0.00018486996234325009, + "loss": 1.8363, + "step": 2095 + }, + { + "epoch": 0.2602714259155977, + "grad_norm": 0.1208865775592005, + "learning_rate": 0.00018475535050765577, + "loss": 1.848, + "step": 2100 + }, + { + "epoch": 0.26089111978682533, + "grad_norm": 0.1227715928980261, + "learning_rate": 0.00018464034201623737, + "loss": 1.8215, + "step": 2105 + }, + { + "epoch": 0.2615108136580529, + "grad_norm": 0.12505462857949878, + "learning_rate": 0.00018452493740723567, + "loss": 1.8801, + "step": 2110 + }, + { + "epoch": 0.2621305075292805, + "grad_norm": 0.12835962262802786, + "learning_rate": 0.0001844091372207453, + "loss": 1.7945, + "step": 2115 + }, + { + "epoch": 0.26275020140050814, + "grad_norm": 0.12508804411904637, + "learning_rate": 0.00018429294199871218, + "loss": 1.8551, + "step": 2120 + }, + { + "epoch": 0.26336989527173577, + "grad_norm": 0.12702387355591513, + "learning_rate": 0.0001841763522849311, + "loss": 1.8238, + "step": 2125 + }, + { + "epoch": 0.2639895891429634, + "grad_norm": 0.1348850938489859, + "learning_rate": 0.00018405936862504293, + "loss": 1.8121, + "step": 2130 + }, + { + "epoch": 0.264609283014191, + "grad_norm": 0.1278684357058116, + "learning_rate": 0.00018394199156653233, + "loss": 1.8574, + "step": 2135 + }, + { + "epoch": 0.2652289768854186, + "grad_norm": 0.12483388812413534, + "learning_rate": 0.00018382422165872498, + "loss": 1.8504, + "step": 2140 + }, + { + "epoch": 0.2658486707566462, + "grad_norm": 0.13515557229656505, + "learning_rate": 0.00018370605945278512, + "loss": 1.8609, + "step": 2145 + }, + { + "epoch": 0.2664683646278738, + "grad_norm": 0.13397417059997252, + "learning_rate": 0.00018358750550171303, + "loss": 1.9426, + "step": 2150 + }, + { + "epoch": 0.26708805849910144, + "grad_norm": 0.12551446472885072, + "learning_rate": 0.00018346856036034225, + "loss": 1.8531, + "step": 2155 + }, + { + "epoch": 0.26770775237032907, + "grad_norm": 0.12574015924446805, + "learning_rate": 0.0001833492245853371, + "loss": 1.857, + "step": 2160 + }, + { + "epoch": 0.2683274462415567, + "grad_norm": 0.1301344380849854, + "learning_rate": 0.00018322949873519028, + "loss": 1.8902, + "step": 2165 + }, + { + "epoch": 0.2689471401127843, + "grad_norm": 0.134107216212603, + "learning_rate": 0.00018310938337021967, + "loss": 1.8844, + "step": 2170 + }, + { + "epoch": 0.2695668339840119, + "grad_norm": 0.13544984849232908, + "learning_rate": 0.00018298887905256642, + "loss": 1.8387, + "step": 2175 + }, + { + "epoch": 0.2701865278552395, + "grad_norm": 0.13740107014915645, + "learning_rate": 0.00018286798634619178, + "loss": 1.8867, + "step": 2180 + }, + { + "epoch": 0.2708062217264671, + "grad_norm": 0.13047734738410044, + "learning_rate": 0.0001827467058168748, + "loss": 1.8621, + "step": 2185 + }, + { + "epoch": 0.27142591559769474, + "grad_norm": 0.1275936167540542, + "learning_rate": 0.00018262503803220941, + "loss": 1.9242, + "step": 2190 + }, + { + "epoch": 0.27204560946892237, + "grad_norm": 0.12739893626922644, + "learning_rate": 0.00018250298356160203, + "loss": 1.8133, + "step": 2195 + }, + { + "epoch": 0.27266530334015, + "grad_norm": 0.13208684150469857, + "learning_rate": 0.00018238054297626868, + "loss": 1.8328, + "step": 2200 + }, + { + "epoch": 0.27328499721137756, + "grad_norm": 0.12642860309275045, + "learning_rate": 0.0001822577168492324, + "loss": 1.8848, + "step": 2205 + }, + { + "epoch": 0.2739046910826052, + "grad_norm": 0.1207837863995249, + "learning_rate": 0.00018213450575532068, + "loss": 1.898, + "step": 2210 + }, + { + "epoch": 0.2745243849538328, + "grad_norm": 0.134513799642859, + "learning_rate": 0.0001820109102711625, + "loss": 1.8652, + "step": 2215 + }, + { + "epoch": 0.2751440788250604, + "grad_norm": 0.12522620018177075, + "learning_rate": 0.00018188693097518589, + "loss": 1.8902, + "step": 2220 + }, + { + "epoch": 0.27576377269628805, + "grad_norm": 0.12565243572374343, + "learning_rate": 0.00018176256844761515, + "loss": 1.8578, + "step": 2225 + }, + { + "epoch": 0.27638346656751567, + "grad_norm": 0.12338799943023959, + "learning_rate": 0.000181637823270468, + "loss": 1.9074, + "step": 2230 + }, + { + "epoch": 0.27700316043874323, + "grad_norm": 0.1313348559311083, + "learning_rate": 0.00018151269602755305, + "loss": 1.8352, + "step": 2235 + }, + { + "epoch": 0.27762285430997086, + "grad_norm": 0.1262516619013633, + "learning_rate": 0.00018138718730446694, + "loss": 1.8164, + "step": 2240 + }, + { + "epoch": 0.2782425481811985, + "grad_norm": 0.13220071119654697, + "learning_rate": 0.00018126129768859166, + "loss": 1.8285, + "step": 2245 + }, + { + "epoch": 0.2788622420524261, + "grad_norm": 0.12746131878131237, + "learning_rate": 0.0001811350277690918, + "loss": 1.8668, + "step": 2250 + }, + { + "epoch": 0.2794819359236537, + "grad_norm": 0.12708251481689406, + "learning_rate": 0.00018100837813691173, + "loss": 1.8996, + "step": 2255 + }, + { + "epoch": 0.28010162979488135, + "grad_norm": 0.13164815119644507, + "learning_rate": 0.00018088134938477285, + "loss": 1.9145, + "step": 2260 + }, + { + "epoch": 0.28072132366610897, + "grad_norm": 0.12382229396871405, + "learning_rate": 0.00018075394210717097, + "loss": 1.9527, + "step": 2265 + }, + { + "epoch": 0.28134101753733654, + "grad_norm": 0.13047982998936075, + "learning_rate": 0.0001806261569003733, + "loss": 1.8805, + "step": 2270 + }, + { + "epoch": 0.28196071140856416, + "grad_norm": 0.12580182752485483, + "learning_rate": 0.00018049799436241584, + "loss": 1.8184, + "step": 2275 + }, + { + "epoch": 0.2825804052797918, + "grad_norm": 0.13179353346365458, + "learning_rate": 0.00018036945509310035, + "loss": 1.8332, + "step": 2280 + }, + { + "epoch": 0.2832000991510194, + "grad_norm": 0.1490150630853317, + "learning_rate": 0.00018024053969399186, + "loss": 1.8203, + "step": 2285 + }, + { + "epoch": 0.283819793022247, + "grad_norm": 0.13210374436560907, + "learning_rate": 0.00018011124876841564, + "loss": 1.8578, + "step": 2290 + }, + { + "epoch": 0.28443948689347465, + "grad_norm": 0.13422338407254142, + "learning_rate": 0.0001799815829214544, + "loss": 1.8871, + "step": 2295 + }, + { + "epoch": 0.2850591807647022, + "grad_norm": 0.1297246090756236, + "learning_rate": 0.00017985154275994546, + "loss": 1.841, + "step": 2300 + }, + { + "epoch": 0.28567887463592984, + "grad_norm": 0.12536111156667865, + "learning_rate": 0.00017972112889247808, + "loss": 1.859, + "step": 2305 + }, + { + "epoch": 0.28629856850715746, + "grad_norm": 0.138340567721797, + "learning_rate": 0.00017959034192939027, + "loss": 1.7711, + "step": 2310 + }, + { + "epoch": 0.2869182623783851, + "grad_norm": 0.13158620923757655, + "learning_rate": 0.0001794591824827663, + "loss": 1.8227, + "step": 2315 + }, + { + "epoch": 0.2875379562496127, + "grad_norm": 0.1292868891126794, + "learning_rate": 0.0001793276511664335, + "loss": 1.8871, + "step": 2320 + }, + { + "epoch": 0.2881576501208403, + "grad_norm": 0.12811495044307233, + "learning_rate": 0.00017919574859595977, + "loss": 1.8543, + "step": 2325 + }, + { + "epoch": 0.2887773439920679, + "grad_norm": 0.12149407958829071, + "learning_rate": 0.00017906347538865021, + "loss": 1.8535, + "step": 2330 + }, + { + "epoch": 0.2893970378632955, + "grad_norm": 0.1317042941594247, + "learning_rate": 0.00017893083216354477, + "loss": 1.8824, + "step": 2335 + }, + { + "epoch": 0.29001673173452314, + "grad_norm": 0.12569760233357527, + "learning_rate": 0.00017879781954141497, + "loss": 1.8234, + "step": 2340 + }, + { + "epoch": 0.29063642560575076, + "grad_norm": 0.13084498370382033, + "learning_rate": 0.00017866443814476107, + "loss": 1.852, + "step": 2345 + }, + { + "epoch": 0.2912561194769784, + "grad_norm": 0.13205282361051077, + "learning_rate": 0.0001785306885978092, + "loss": 1.8832, + "step": 2350 + }, + { + "epoch": 0.291875813348206, + "grad_norm": 0.12919651417069067, + "learning_rate": 0.00017839657152650856, + "loss": 1.8453, + "step": 2355 + }, + { + "epoch": 0.2924955072194336, + "grad_norm": 0.12973043338594262, + "learning_rate": 0.00017826208755852827, + "loss": 1.8797, + "step": 2360 + }, + { + "epoch": 0.2931152010906612, + "grad_norm": 0.13301208195385272, + "learning_rate": 0.00017812723732325446, + "loss": 1.8641, + "step": 2365 + }, + { + "epoch": 0.2937348949618888, + "grad_norm": 0.12949835549240182, + "learning_rate": 0.00017799202145178758, + "loss": 1.8406, + "step": 2370 + }, + { + "epoch": 0.29435458883311644, + "grad_norm": 0.1293697097422646, + "learning_rate": 0.00017785644057693913, + "loss": 1.8559, + "step": 2375 + }, + { + "epoch": 0.29497428270434406, + "grad_norm": 0.1520616458448482, + "learning_rate": 0.0001777204953332288, + "loss": 1.8887, + "step": 2380 + }, + { + "epoch": 0.2955939765755717, + "grad_norm": 0.13380400311187673, + "learning_rate": 0.00017758418635688167, + "loss": 1.9051, + "step": 2385 + }, + { + "epoch": 0.2962136704467993, + "grad_norm": 0.12836449608199532, + "learning_rate": 0.00017744751428582496, + "loss": 1.8859, + "step": 2390 + }, + { + "epoch": 0.29683336431802687, + "grad_norm": 0.13080261256263226, + "learning_rate": 0.00017731047975968523, + "loss": 1.8539, + "step": 2395 + }, + { + "epoch": 0.2974530581892545, + "grad_norm": 0.12988299510465, + "learning_rate": 0.00017717308341978538, + "loss": 1.8566, + "step": 2400 + }, + { + "epoch": 0.2980727520604821, + "grad_norm": 0.125683224518266, + "learning_rate": 0.00017703532590914147, + "loss": 1.8676, + "step": 2405 + }, + { + "epoch": 0.29869244593170974, + "grad_norm": 0.13352496446514742, + "learning_rate": 0.00017689720787245997, + "loss": 1.923, + "step": 2410 + }, + { + "epoch": 0.29931213980293736, + "grad_norm": 0.1293774171094288, + "learning_rate": 0.00017675872995613458, + "loss": 1.868, + "step": 2415 + }, + { + "epoch": 0.299931833674165, + "grad_norm": 0.12527135631615616, + "learning_rate": 0.0001766198928082432, + "loss": 1.8621, + "step": 2420 + }, + { + "epoch": 0.30055152754539255, + "grad_norm": 0.12613717510254913, + "learning_rate": 0.00017648069707854497, + "loss": 1.9, + "step": 2425 + }, + { + "epoch": 0.30117122141662017, + "grad_norm": 0.12391342907752452, + "learning_rate": 0.0001763411434184772, + "loss": 1.8531, + "step": 2430 + }, + { + "epoch": 0.3017909152878478, + "grad_norm": 0.13123817620567785, + "learning_rate": 0.00017620123248115235, + "loss": 1.8664, + "step": 2435 + }, + { + "epoch": 0.3024106091590754, + "grad_norm": 0.12780408489494227, + "learning_rate": 0.0001760609649213548, + "loss": 1.8922, + "step": 2440 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.12077509717420204, + "learning_rate": 0.00017592034139553812, + "loss": 1.9, + "step": 2445 + }, + { + "epoch": 0.30364999690153066, + "grad_norm": 0.12597036196005937, + "learning_rate": 0.00017577936256182167, + "loss": 1.8551, + "step": 2450 + }, + { + "epoch": 0.3042696907727583, + "grad_norm": 0.13100450560697094, + "learning_rate": 0.00017563802907998773, + "loss": 1.8453, + "step": 2455 + }, + { + "epoch": 0.30488938464398585, + "grad_norm": 0.128806698790364, + "learning_rate": 0.00017549634161147823, + "loss": 1.8484, + "step": 2460 + }, + { + "epoch": 0.3055090785152135, + "grad_norm": 0.12628216846498763, + "learning_rate": 0.0001753543008193919, + "loss": 1.882, + "step": 2465 + }, + { + "epoch": 0.3061287723864411, + "grad_norm": 0.12839071253465598, + "learning_rate": 0.00017521190736848096, + "loss": 1.8957, + "step": 2470 + }, + { + "epoch": 0.3067484662576687, + "grad_norm": 0.12646846306076212, + "learning_rate": 0.00017506916192514801, + "loss": 1.8148, + "step": 2475 + }, + { + "epoch": 0.30736816012889634, + "grad_norm": 0.1300463966004593, + "learning_rate": 0.0001749260651574431, + "loss": 1.8625, + "step": 2480 + }, + { + "epoch": 0.30798785400012396, + "grad_norm": 0.13684852530638011, + "learning_rate": 0.00017478261773506043, + "loss": 1.8324, + "step": 2485 + }, + { + "epoch": 0.30860754787135153, + "grad_norm": 0.1333149939890643, + "learning_rate": 0.00017463882032933524, + "loss": 1.8238, + "step": 2490 + }, + { + "epoch": 0.30922724174257915, + "grad_norm": 0.13250928441732007, + "learning_rate": 0.00017449467361324076, + "loss": 1.8562, + "step": 2495 + }, + { + "epoch": 0.3098469356138068, + "grad_norm": 0.12650581345903503, + "learning_rate": 0.0001743501782613849, + "loss": 1.918, + "step": 2500 + }, + { + "epoch": 0.3104666294850344, + "grad_norm": 0.13012679795096624, + "learning_rate": 0.00017420533495000727, + "loss": 1.8469, + "step": 2505 + }, + { + "epoch": 0.311086323356262, + "grad_norm": 0.1297240234416192, + "learning_rate": 0.0001740601443569759, + "loss": 1.8047, + "step": 2510 + }, + { + "epoch": 0.31170601722748964, + "grad_norm": 0.1309817865465321, + "learning_rate": 0.0001739146071617841, + "loss": 1.8641, + "step": 2515 + }, + { + "epoch": 0.3123257110987172, + "grad_norm": 0.12922856903591073, + "learning_rate": 0.0001737687240455473, + "loss": 1.834, + "step": 2520 + }, + { + "epoch": 0.31294540496994483, + "grad_norm": 0.12557065472012063, + "learning_rate": 0.00017362249569099982, + "loss": 1.8438, + "step": 2525 + }, + { + "epoch": 0.31356509884117245, + "grad_norm": 0.13353066027905686, + "learning_rate": 0.00017347592278249175, + "loss": 1.8586, + "step": 2530 + }, + { + "epoch": 0.3141847927124001, + "grad_norm": 0.13316269753092513, + "learning_rate": 0.00017332900600598562, + "loss": 1.8707, + "step": 2535 + }, + { + "epoch": 0.3148044865836277, + "grad_norm": 0.13353296735977976, + "learning_rate": 0.00017318174604905327, + "loss": 1.8836, + "step": 2540 + }, + { + "epoch": 0.3154241804548553, + "grad_norm": 0.12858939196532748, + "learning_rate": 0.00017303414360087278, + "loss": 1.825, + "step": 2545 + }, + { + "epoch": 0.31604387432608294, + "grad_norm": 0.13179975498557261, + "learning_rate": 0.00017288619935222486, + "loss": 1.8516, + "step": 2550 + }, + { + "epoch": 0.3166635681973105, + "grad_norm": 0.12482617396470475, + "learning_rate": 0.00017273791399548998, + "loss": 1.8391, + "step": 2555 + }, + { + "epoch": 0.31728326206853813, + "grad_norm": 0.13139623735524517, + "learning_rate": 0.000172589288224645, + "loss": 1.8605, + "step": 2560 + }, + { + "epoch": 0.31790295593976575, + "grad_norm": 0.1310695530113606, + "learning_rate": 0.00017244032273525995, + "loss": 1.8414, + "step": 2565 + }, + { + "epoch": 0.3185226498109934, + "grad_norm": 0.12710157216789886, + "learning_rate": 0.0001722910182244946, + "loss": 1.8543, + "step": 2570 + }, + { + "epoch": 0.319142343682221, + "grad_norm": 0.12590627027340592, + "learning_rate": 0.00017214137539109552, + "loss": 1.8793, + "step": 2575 + }, + { + "epoch": 0.3197620375534486, + "grad_norm": 0.13436407452579877, + "learning_rate": 0.0001719913949353925, + "loss": 1.8652, + "step": 2580 + }, + { + "epoch": 0.3203817314246762, + "grad_norm": 0.12643462648025297, + "learning_rate": 0.0001718410775592955, + "loss": 1.8992, + "step": 2585 + }, + { + "epoch": 0.3210014252959038, + "grad_norm": 0.1305952405164263, + "learning_rate": 0.00017169042396629117, + "loss": 1.848, + "step": 2590 + }, + { + "epoch": 0.32162111916713143, + "grad_norm": 0.150091829244743, + "learning_rate": 0.00017153943486143978, + "loss": 1.8566, + "step": 2595 + }, + { + "epoch": 0.32224081303835905, + "grad_norm": 0.1254009301014799, + "learning_rate": 0.00017138811095137175, + "loss": 1.8629, + "step": 2600 + }, + { + "epoch": 0.3228605069095867, + "grad_norm": 0.13664056730838736, + "learning_rate": 0.0001712364529442843, + "loss": 1.8066, + "step": 2605 + }, + { + "epoch": 0.3234802007808143, + "grad_norm": 0.1335954880543391, + "learning_rate": 0.00017108446154993838, + "loss": 1.868, + "step": 2610 + }, + { + "epoch": 0.32409989465204186, + "grad_norm": 0.1306781290863923, + "learning_rate": 0.0001709321374796551, + "loss": 1.8758, + "step": 2615 + }, + { + "epoch": 0.3247195885232695, + "grad_norm": 0.14742493532611176, + "learning_rate": 0.00017077948144631248, + "loss": 1.8562, + "step": 2620 + }, + { + "epoch": 0.3253392823944971, + "grad_norm": 0.12846260878216087, + "learning_rate": 0.00017062649416434223, + "loss": 1.8352, + "step": 2625 + }, + { + "epoch": 0.32595897626572473, + "grad_norm": 0.13165013316462518, + "learning_rate": 0.00017047317634972617, + "loss": 1.8586, + "step": 2630 + }, + { + "epoch": 0.32657867013695235, + "grad_norm": 0.13366541465113413, + "learning_rate": 0.00017031952871999315, + "loss": 1.9242, + "step": 2635 + }, + { + "epoch": 0.32719836400818, + "grad_norm": 0.13286362925121087, + "learning_rate": 0.0001701655519942155, + "loss": 1.8793, + "step": 2640 + }, + { + "epoch": 0.3278180578794076, + "grad_norm": 0.13827798198391242, + "learning_rate": 0.00017001124689300568, + "loss": 1.8211, + "step": 2645 + }, + { + "epoch": 0.32843775175063517, + "grad_norm": 0.13443963143632734, + "learning_rate": 0.00016985661413851304, + "loss": 1.8512, + "step": 2650 + }, + { + "epoch": 0.3290574456218628, + "grad_norm": 0.12931297062470157, + "learning_rate": 0.00016970165445442023, + "loss": 1.8586, + "step": 2655 + }, + { + "epoch": 0.3296771394930904, + "grad_norm": 0.12783732546786497, + "learning_rate": 0.00016954636856594005, + "loss": 1.8367, + "step": 2660 + }, + { + "epoch": 0.33029683336431803, + "grad_norm": 0.12212361016917726, + "learning_rate": 0.00016939075719981194, + "loss": 1.8582, + "step": 2665 + }, + { + "epoch": 0.33091652723554565, + "grad_norm": 0.1294327845651695, + "learning_rate": 0.00016923482108429844, + "loss": 1.8465, + "step": 2670 + }, + { + "epoch": 0.3315362211067733, + "grad_norm": 0.13189238283540644, + "learning_rate": 0.00016907856094918207, + "loss": 1.8051, + "step": 2675 + }, + { + "epoch": 0.33215591497800084, + "grad_norm": 0.1276135033870951, + "learning_rate": 0.0001689219775257617, + "loss": 1.8199, + "step": 2680 + }, + { + "epoch": 0.33277560884922847, + "grad_norm": 0.12275325328072717, + "learning_rate": 0.00016876507154684918, + "loss": 1.8047, + "step": 2685 + }, + { + "epoch": 0.3333953027204561, + "grad_norm": 0.13012589500787683, + "learning_rate": 0.00016860784374676593, + "loss": 1.8621, + "step": 2690 + }, + { + "epoch": 0.3340149965916837, + "grad_norm": 0.13152129389683137, + "learning_rate": 0.00016845029486133956, + "loss": 1.8516, + "step": 2695 + }, + { + "epoch": 0.33463469046291133, + "grad_norm": 0.12631329142384912, + "learning_rate": 0.00016829242562790026, + "loss": 1.8387, + "step": 2700 + }, + { + "epoch": 0.33525438433413896, + "grad_norm": 0.13018878722373028, + "learning_rate": 0.00016813423678527754, + "loss": 1.923, + "step": 2705 + }, + { + "epoch": 0.3358740782053665, + "grad_norm": 0.14109498005999074, + "learning_rate": 0.00016797572907379667, + "loss": 1.8469, + "step": 2710 + }, + { + "epoch": 0.33649377207659414, + "grad_norm": 0.13811693482252332, + "learning_rate": 0.00016781690323527511, + "loss": 1.8676, + "step": 2715 + }, + { + "epoch": 0.33711346594782177, + "grad_norm": 0.1291292771360962, + "learning_rate": 0.00016765776001301933, + "loss": 1.8566, + "step": 2720 + }, + { + "epoch": 0.3377331598190494, + "grad_norm": 0.13823691210457728, + "learning_rate": 0.00016749830015182107, + "loss": 1.9406, + "step": 2725 + }, + { + "epoch": 0.338352853690277, + "grad_norm": 0.13089413884817042, + "learning_rate": 0.00016733852439795394, + "loss": 1.8734, + "step": 2730 + }, + { + "epoch": 0.33897254756150463, + "grad_norm": 0.14018900588379418, + "learning_rate": 0.00016717843349916994, + "loss": 1.8359, + "step": 2735 + }, + { + "epoch": 0.33959224143273226, + "grad_norm": 0.12736041535496453, + "learning_rate": 0.000167018028204696, + "loss": 1.8742, + "step": 2740 + }, + { + "epoch": 0.3402119353039598, + "grad_norm": 0.12533414989711167, + "learning_rate": 0.0001668573092652303, + "loss": 1.8648, + "step": 2745 + }, + { + "epoch": 0.34083162917518744, + "grad_norm": 0.13957662916108884, + "learning_rate": 0.00016669627743293907, + "loss": 1.9223, + "step": 2750 + }, + { + "epoch": 0.34145132304641507, + "grad_norm": 0.13354411893886944, + "learning_rate": 0.00016653493346145267, + "loss": 1.8301, + "step": 2755 + }, + { + "epoch": 0.3420710169176427, + "grad_norm": 0.13557566100086996, + "learning_rate": 0.00016637327810586246, + "loss": 1.8387, + "step": 2760 + }, + { + "epoch": 0.3426907107888703, + "grad_norm": 0.12737311407019075, + "learning_rate": 0.00016621131212271695, + "loss": 1.832, + "step": 2765 + }, + { + "epoch": 0.34331040466009793, + "grad_norm": 0.13905113414944018, + "learning_rate": 0.00016604903627001844, + "loss": 1.8586, + "step": 2770 + }, + { + "epoch": 0.3439300985313255, + "grad_norm": 0.13141740905832047, + "learning_rate": 0.00016588645130721948, + "loss": 1.9059, + "step": 2775 + }, + { + "epoch": 0.3445497924025531, + "grad_norm": 0.1292375170284331, + "learning_rate": 0.00016572355799521912, + "loss": 1.8629, + "step": 2780 + }, + { + "epoch": 0.34516948627378075, + "grad_norm": 0.13097804366675025, + "learning_rate": 0.0001655603570963596, + "loss": 1.8402, + "step": 2785 + }, + { + "epoch": 0.34578918014500837, + "grad_norm": 0.12857874757255375, + "learning_rate": 0.00016539684937442263, + "loss": 1.8871, + "step": 2790 + }, + { + "epoch": 0.346408874016236, + "grad_norm": 0.12786886162667369, + "learning_rate": 0.0001652330355946259, + "loss": 1.8047, + "step": 2795 + }, + { + "epoch": 0.3470285678874636, + "grad_norm": 0.143616962351737, + "learning_rate": 0.00016506891652361933, + "loss": 1.8383, + "step": 2800 + }, + { + "epoch": 0.3476482617586912, + "grad_norm": 0.13453451476859302, + "learning_rate": 0.0001649044929294818, + "loss": 1.8516, + "step": 2805 + }, + { + "epoch": 0.3482679556299188, + "grad_norm": 0.13515198281229954, + "learning_rate": 0.00016473976558171714, + "loss": 1.9059, + "step": 2810 + }, + { + "epoch": 0.3488876495011464, + "grad_norm": 0.13811663765869722, + "learning_rate": 0.00016457473525125093, + "loss": 1.8078, + "step": 2815 + }, + { + "epoch": 0.34950734337237405, + "grad_norm": 0.12993511623884146, + "learning_rate": 0.00016440940271042663, + "loss": 1.8887, + "step": 2820 + }, + { + "epoch": 0.35012703724360167, + "grad_norm": 0.1373771927813058, + "learning_rate": 0.00016424376873300207, + "loss": 1.8465, + "step": 2825 + }, + { + "epoch": 0.3507467311148293, + "grad_norm": 0.1330128701703067, + "learning_rate": 0.00016407783409414577, + "loss": 1.8609, + "step": 2830 + }, + { + "epoch": 0.3513664249860569, + "grad_norm": 0.11946071667616472, + "learning_rate": 0.00016391159957043335, + "loss": 1.8535, + "step": 2835 + }, + { + "epoch": 0.3519861188572845, + "grad_norm": 0.1358363297173592, + "learning_rate": 0.000163745065939844, + "loss": 1.8699, + "step": 2840 + }, + { + "epoch": 0.3526058127285121, + "grad_norm": 0.128552153238772, + "learning_rate": 0.0001635782339817566, + "loss": 1.9055, + "step": 2845 + }, + { + "epoch": 0.3532255065997397, + "grad_norm": 0.13106197892945834, + "learning_rate": 0.00016341110447694624, + "loss": 1.827, + "step": 2850 + }, + { + "epoch": 0.35384520047096735, + "grad_norm": 0.14036033306901916, + "learning_rate": 0.00016324367820758057, + "loss": 1.8266, + "step": 2855 + }, + { + "epoch": 0.35446489434219497, + "grad_norm": 0.12890303821028082, + "learning_rate": 0.0001630759559572161, + "loss": 1.8477, + "step": 2860 + }, + { + "epoch": 0.3550845882134226, + "grad_norm": 0.13118051072317582, + "learning_rate": 0.00016290793851079447, + "loss": 1.8879, + "step": 2865 + }, + { + "epoch": 0.35570428208465016, + "grad_norm": 0.12771050153008334, + "learning_rate": 0.00016273962665463892, + "loss": 1.8883, + "step": 2870 + }, + { + "epoch": 0.3563239759558778, + "grad_norm": 0.12557326568723504, + "learning_rate": 0.00016257102117645048, + "loss": 1.8121, + "step": 2875 + }, + { + "epoch": 0.3569436698271054, + "grad_norm": 0.1317338184540393, + "learning_rate": 0.00016240212286530432, + "loss": 1.8645, + "step": 2880 + }, + { + "epoch": 0.357563363698333, + "grad_norm": 0.12301737660160941, + "learning_rate": 0.00016223293251164616, + "loss": 1.8715, + "step": 2885 + }, + { + "epoch": 0.35818305756956065, + "grad_norm": 0.127922605349782, + "learning_rate": 0.00016206345090728834, + "loss": 1.8387, + "step": 2890 + }, + { + "epoch": 0.35880275144078827, + "grad_norm": 0.1294192610668883, + "learning_rate": 0.00016189367884540638, + "loss": 1.848, + "step": 2895 + }, + { + "epoch": 0.35942244531201584, + "grad_norm": 0.13398944968217794, + "learning_rate": 0.00016172361712053513, + "loss": 1.8586, + "step": 2900 + }, + { + "epoch": 0.36004213918324346, + "grad_norm": 0.12899074210444184, + "learning_rate": 0.00016155326652856497, + "loss": 1.8805, + "step": 2905 + }, + { + "epoch": 0.3606618330544711, + "grad_norm": 0.12795186731054056, + "learning_rate": 0.0001613826278667383, + "loss": 1.7922, + "step": 2910 + }, + { + "epoch": 0.3612815269256987, + "grad_norm": 0.1292195667193673, + "learning_rate": 0.00016121170193364557, + "loss": 1.816, + "step": 2915 + }, + { + "epoch": 0.3619012207969263, + "grad_norm": 0.1242942240620901, + "learning_rate": 0.0001610404895292218, + "loss": 1.9125, + "step": 2920 + }, + { + "epoch": 0.36252091466815395, + "grad_norm": 0.13101097061222353, + "learning_rate": 0.0001608689914547426, + "loss": 1.8641, + "step": 2925 + }, + { + "epoch": 0.36314060853938157, + "grad_norm": 0.12617646446429898, + "learning_rate": 0.00016069720851282052, + "loss": 1.8461, + "step": 2930 + }, + { + "epoch": 0.36376030241060914, + "grad_norm": 0.12484378328833615, + "learning_rate": 0.00016052514150740135, + "loss": 1.8918, + "step": 2935 + }, + { + "epoch": 0.36437999628183676, + "grad_norm": 0.128015270056116, + "learning_rate": 0.00016035279124376026, + "loss": 1.8203, + "step": 2940 + }, + { + "epoch": 0.3649996901530644, + "grad_norm": 0.12690248111261712, + "learning_rate": 0.00016018015852849806, + "loss": 1.8211, + "step": 2945 + }, + { + "epoch": 0.365619384024292, + "grad_norm": 0.13279636029030822, + "learning_rate": 0.00016000724416953744, + "loss": 1.8551, + "step": 2950 + }, + { + "epoch": 0.3662390778955196, + "grad_norm": 0.13480615224006273, + "learning_rate": 0.00015983404897611928, + "loss": 1.8484, + "step": 2955 + }, + { + "epoch": 0.36685877176674725, + "grad_norm": 0.12954165767486733, + "learning_rate": 0.00015966057375879858, + "loss": 1.8434, + "step": 2960 + }, + { + "epoch": 0.3674784656379748, + "grad_norm": 0.13031965865973283, + "learning_rate": 0.00015948681932944104, + "loss": 1.8582, + "step": 2965 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 0.13110591784796458, + "learning_rate": 0.00015931278650121897, + "loss": 1.866, + "step": 2970 + }, + { + "epoch": 0.36871785338043006, + "grad_norm": 0.13182197561526388, + "learning_rate": 0.0001591384760886076, + "loss": 1.8301, + "step": 2975 + }, + { + "epoch": 0.3693375472516577, + "grad_norm": 0.13143677143578264, + "learning_rate": 0.00015896388890738127, + "loss": 1.8531, + "step": 2980 + }, + { + "epoch": 0.3699572411228853, + "grad_norm": 0.13715925277059812, + "learning_rate": 0.00015878902577460963, + "loss": 1.8344, + "step": 2985 + }, + { + "epoch": 0.3705769349941129, + "grad_norm": 0.13007938351554157, + "learning_rate": 0.00015861388750865375, + "loss": 1.8637, + "step": 2990 + }, + { + "epoch": 0.3711966288653405, + "grad_norm": 0.13870936181867594, + "learning_rate": 0.00015843847492916235, + "loss": 1.8594, + "step": 2995 + }, + { + "epoch": 0.3718163227365681, + "grad_norm": 0.1338434765895186, + "learning_rate": 0.00015826278885706788, + "loss": 1.8293, + "step": 3000 + }, + { + "epoch": 0.37243601660779574, + "grad_norm": 0.13035091608618987, + "learning_rate": 0.0001580868301145828, + "loss": 1.8566, + "step": 3005 + }, + { + "epoch": 0.37305571047902336, + "grad_norm": 0.14555147420902945, + "learning_rate": 0.00015791059952519567, + "loss": 1.8672, + "step": 3010 + }, + { + "epoch": 0.373675404350251, + "grad_norm": 0.1295782115936669, + "learning_rate": 0.00015773409791366728, + "loss": 1.8598, + "step": 3015 + }, + { + "epoch": 0.3742950982214786, + "grad_norm": 0.13140008433844813, + "learning_rate": 0.00015755732610602677, + "loss": 1.8836, + "step": 3020 + }, + { + "epoch": 0.37491479209270623, + "grad_norm": 0.12659202187281082, + "learning_rate": 0.00015738028492956786, + "loss": 1.8406, + "step": 3025 + }, + { + "epoch": 0.3755344859639338, + "grad_norm": 0.13023530855829013, + "learning_rate": 0.0001572029752128449, + "loss": 1.8645, + "step": 3030 + }, + { + "epoch": 0.3761541798351614, + "grad_norm": 0.13142420798148008, + "learning_rate": 0.00015702539778566897, + "loss": 1.8742, + "step": 3035 + }, + { + "epoch": 0.37677387370638904, + "grad_norm": 0.13011574441351573, + "learning_rate": 0.0001568475534791041, + "loss": 1.8105, + "step": 3040 + }, + { + "epoch": 0.37739356757761666, + "grad_norm": 0.1385944738745723, + "learning_rate": 0.00015666944312546328, + "loss": 1.8242, + "step": 3045 + }, + { + "epoch": 0.3780132614488443, + "grad_norm": 0.13206170515775578, + "learning_rate": 0.0001564910675583046, + "loss": 1.8781, + "step": 3050 + }, + { + "epoch": 0.3786329553200719, + "grad_norm": 0.1272155538752064, + "learning_rate": 0.0001563124276124274, + "loss": 1.8844, + "step": 3055 + }, + { + "epoch": 0.3792526491912995, + "grad_norm": 0.12879360712674345, + "learning_rate": 0.00015613352412386825, + "loss": 1.8016, + "step": 3060 + }, + { + "epoch": 0.3798723430625271, + "grad_norm": 0.13141621693892783, + "learning_rate": 0.00015595435792989718, + "loss": 1.866, + "step": 3065 + }, + { + "epoch": 0.3804920369337547, + "grad_norm": 0.13148734211884924, + "learning_rate": 0.0001557749298690135, + "loss": 1.8859, + "step": 3070 + }, + { + "epoch": 0.38111173080498234, + "grad_norm": 0.1233370862820081, + "learning_rate": 0.00015559524078094235, + "loss": 1.8398, + "step": 3075 + }, + { + "epoch": 0.38173142467620996, + "grad_norm": 0.1343298831070218, + "learning_rate": 0.00015541529150663022, + "loss": 1.8273, + "step": 3080 + }, + { + "epoch": 0.3823511185474376, + "grad_norm": 0.1350754368311016, + "learning_rate": 0.00015523508288824145, + "loss": 1.8598, + "step": 3085 + }, + { + "epoch": 0.38297081241866515, + "grad_norm": 0.13186218005747855, + "learning_rate": 0.00015505461576915402, + "loss": 1.8434, + "step": 3090 + }, + { + "epoch": 0.3835905062898928, + "grad_norm": 0.1256587528049024, + "learning_rate": 0.00015487389099395565, + "loss": 1.868, + "step": 3095 + }, + { + "epoch": 0.3842102001611204, + "grad_norm": 0.13460533409016884, + "learning_rate": 0.00015469290940844005, + "loss": 1.8883, + "step": 3100 + }, + { + "epoch": 0.384829894032348, + "grad_norm": 0.13262872301234474, + "learning_rate": 0.00015451167185960267, + "loss": 1.8703, + "step": 3105 + }, + { + "epoch": 0.38544958790357564, + "grad_norm": 0.1307836877374553, + "learning_rate": 0.00015433017919563692, + "loss": 1.8898, + "step": 3110 + }, + { + "epoch": 0.38606928177480326, + "grad_norm": 0.13241485557865607, + "learning_rate": 0.00015414843226593016, + "loss": 1.8715, + "step": 3115 + }, + { + "epoch": 0.3866889756460309, + "grad_norm": 0.12308084527810047, + "learning_rate": 0.0001539664319210597, + "loss": 1.8891, + "step": 3120 + }, + { + "epoch": 0.38730866951725845, + "grad_norm": 0.12579495949571556, + "learning_rate": 0.0001537841790127888, + "loss": 1.891, + "step": 3125 + }, + { + "epoch": 0.3879283633884861, + "grad_norm": 0.13544825188327536, + "learning_rate": 0.00015360167439406274, + "loss": 1.8902, + "step": 3130 + }, + { + "epoch": 0.3885480572597137, + "grad_norm": 0.1367382298904419, + "learning_rate": 0.00015341891891900494, + "loss": 1.8469, + "step": 3135 + }, + { + "epoch": 0.3891677511309413, + "grad_norm": 0.1281769119603224, + "learning_rate": 0.00015323591344291258, + "loss": 1.8297, + "step": 3140 + }, + { + "epoch": 0.38978744500216894, + "grad_norm": 0.1295407756633459, + "learning_rate": 0.00015305265882225303, + "loss": 1.8543, + "step": 3145 + }, + { + "epoch": 0.39040713887339656, + "grad_norm": 0.12523656961779545, + "learning_rate": 0.00015286915591465969, + "loss": 1.8477, + "step": 3150 + }, + { + "epoch": 0.39102683274462413, + "grad_norm": 0.14659564859622612, + "learning_rate": 0.00015268540557892773, + "loss": 1.8566, + "step": 3155 + }, + { + "epoch": 0.39164652661585175, + "grad_norm": 0.12755016149566428, + "learning_rate": 0.0001525014086750105, + "loss": 1.8641, + "step": 3160 + }, + { + "epoch": 0.3922662204870794, + "grad_norm": 0.12426211762458562, + "learning_rate": 0.0001523171660640152, + "loss": 1.8516, + "step": 3165 + }, + { + "epoch": 0.392885914358307, + "grad_norm": 0.13100509076688815, + "learning_rate": 0.00015213267860819896, + "loss": 1.8355, + "step": 3170 + }, + { + "epoch": 0.3935056082295346, + "grad_norm": 0.13166122634442548, + "learning_rate": 0.00015194794717096475, + "loss": 1.8535, + "step": 3175 + }, + { + "epoch": 0.39412530210076224, + "grad_norm": 0.12851391806205498, + "learning_rate": 0.00015176297261685742, + "loss": 1.8531, + "step": 3180 + }, + { + "epoch": 0.3947449959719898, + "grad_norm": 0.13014329204831934, + "learning_rate": 0.00015157775581155957, + "loss": 1.8949, + "step": 3185 + }, + { + "epoch": 0.39536468984321743, + "grad_norm": 0.12402746941996257, + "learning_rate": 0.00015139229762188761, + "loss": 1.8113, + "step": 3190 + }, + { + "epoch": 0.39598438371444505, + "grad_norm": 0.1340147610045317, + "learning_rate": 0.00015120659891578754, + "loss": 1.7973, + "step": 3195 + }, + { + "epoch": 0.3966040775856727, + "grad_norm": 0.13126263396327228, + "learning_rate": 0.00015102066056233104, + "loss": 1.8086, + "step": 3200 + }, + { + "epoch": 0.3972237714569003, + "grad_norm": 0.1254442119538948, + "learning_rate": 0.00015083448343171138, + "loss": 1.8617, + "step": 3205 + }, + { + "epoch": 0.3978434653281279, + "grad_norm": 0.13193426597795005, + "learning_rate": 0.00015064806839523915, + "loss": 1.8191, + "step": 3210 + }, + { + "epoch": 0.39846315919935554, + "grad_norm": 0.13785220644107396, + "learning_rate": 0.00015046141632533844, + "loss": 1.7977, + "step": 3215 + }, + { + "epoch": 0.3990828530705831, + "grad_norm": 0.13241964464110698, + "learning_rate": 0.0001502745280955428, + "loss": 1.882, + "step": 3220 + }, + { + "epoch": 0.39970254694181073, + "grad_norm": 0.13524242845383544, + "learning_rate": 0.00015008740458049075, + "loss": 1.8496, + "step": 3225 + }, + { + "epoch": 0.40032224081303835, + "grad_norm": 0.1292830611807527, + "learning_rate": 0.0001499000466559221, + "loss": 1.8172, + "step": 3230 + }, + { + "epoch": 0.400941934684266, + "grad_norm": 0.12650175037356148, + "learning_rate": 0.0001497124551986737, + "loss": 1.8625, + "step": 3235 + }, + { + "epoch": 0.4015616285554936, + "grad_norm": 0.13556994409473824, + "learning_rate": 0.00014952463108667527, + "loss": 1.8828, + "step": 3240 + }, + { + "epoch": 0.4021813224267212, + "grad_norm": 0.13166998058062032, + "learning_rate": 0.0001493365751989454, + "loss": 1.832, + "step": 3245 + }, + { + "epoch": 0.4028010162979488, + "grad_norm": 0.13449713435485267, + "learning_rate": 0.0001491482884155874, + "loss": 1.8602, + "step": 3250 + }, + { + "epoch": 0.4034207101691764, + "grad_norm": 0.13389299126436824, + "learning_rate": 0.00014895977161778515, + "loss": 1.8105, + "step": 3255 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.1379880279783267, + "learning_rate": 0.00014877102568779896, + "loss": 1.8164, + "step": 3260 + }, + { + "epoch": 0.40466009791163166, + "grad_norm": 0.13303918044811053, + "learning_rate": 0.00014858205150896161, + "loss": 1.8984, + "step": 3265 + }, + { + "epoch": 0.4052797917828593, + "grad_norm": 0.1321191306570561, + "learning_rate": 0.00014839284996567392, + "loss": 1.8586, + "step": 3270 + }, + { + "epoch": 0.4058994856540869, + "grad_norm": 0.12648828416896452, + "learning_rate": 0.00014820342194340097, + "loss": 1.8602, + "step": 3275 + }, + { + "epoch": 0.4065191795253145, + "grad_norm": 0.1305701561156593, + "learning_rate": 0.00014801376832866754, + "loss": 1.798, + "step": 3280 + }, + { + "epoch": 0.4071388733965421, + "grad_norm": 0.1334941945609644, + "learning_rate": 0.00014782389000905435, + "loss": 1.8973, + "step": 3285 + }, + { + "epoch": 0.4077585672677697, + "grad_norm": 0.13360995175426915, + "learning_rate": 0.00014763378787319373, + "loss": 1.8, + "step": 3290 + }, + { + "epoch": 0.40837826113899733, + "grad_norm": 0.12345991314875522, + "learning_rate": 0.00014744346281076536, + "loss": 1.8465, + "step": 3295 + }, + { + "epoch": 0.40899795501022496, + "grad_norm": 0.13250381935580435, + "learning_rate": 0.00014725291571249236, + "loss": 1.7988, + "step": 3300 + }, + { + "epoch": 0.4096176488814526, + "grad_norm": 0.13402729816174527, + "learning_rate": 0.00014706214747013685, + "loss": 1.9363, + "step": 3305 + }, + { + "epoch": 0.4102373427526802, + "grad_norm": 0.1279554878458354, + "learning_rate": 0.00014687115897649603, + "loss": 1.8898, + "step": 3310 + }, + { + "epoch": 0.41085703662390777, + "grad_norm": 0.1666624811197605, + "learning_rate": 0.00014667995112539774, + "loss": 1.8234, + "step": 3315 + }, + { + "epoch": 0.4114767304951354, + "grad_norm": 0.12776748050466544, + "learning_rate": 0.00014648852481169658, + "loss": 1.893, + "step": 3320 + }, + { + "epoch": 0.412096424366363, + "grad_norm": 0.1286599439516026, + "learning_rate": 0.0001462968809312694, + "loss": 1.8762, + "step": 3325 + }, + { + "epoch": 0.41271611823759063, + "grad_norm": 0.13282507876071478, + "learning_rate": 0.00014610502038101138, + "loss": 1.8781, + "step": 3330 + }, + { + "epoch": 0.41333581210881826, + "grad_norm": 0.1337124937581548, + "learning_rate": 0.00014591294405883162, + "loss": 1.8426, + "step": 3335 + }, + { + "epoch": 0.4139555059800459, + "grad_norm": 0.12243201500863707, + "learning_rate": 0.00014572065286364908, + "loss": 1.8836, + "step": 3340 + }, + { + "epoch": 0.41457519985127345, + "grad_norm": 0.12112561857132004, + "learning_rate": 0.00014552814769538844, + "loss": 1.866, + "step": 3345 + }, + { + "epoch": 0.41519489372250107, + "grad_norm": 0.13221217015109857, + "learning_rate": 0.00014533542945497553, + "loss": 1.8617, + "step": 3350 + }, + { + "epoch": 0.4158145875937287, + "grad_norm": 0.13241339851253814, + "learning_rate": 0.00014514249904433362, + "loss": 1.8809, + "step": 3355 + }, + { + "epoch": 0.4164342814649563, + "grad_norm": 0.13060791556881998, + "learning_rate": 0.0001449493573663787, + "loss": 1.907, + "step": 3360 + }, + { + "epoch": 0.41705397533618394, + "grad_norm": 0.1463492820761344, + "learning_rate": 0.00014475600532501566, + "loss": 1.825, + "step": 3365 + }, + { + "epoch": 0.41767366920741156, + "grad_norm": 0.1267514749396063, + "learning_rate": 0.00014456244382513386, + "loss": 1.8289, + "step": 3370 + }, + { + "epoch": 0.4182933630786392, + "grad_norm": 0.12439164502803565, + "learning_rate": 0.0001443686737726029, + "loss": 1.8645, + "step": 3375 + }, + { + "epoch": 0.41891305694986675, + "grad_norm": 0.13123665423117487, + "learning_rate": 0.00014417469607426838, + "loss": 1.8602, + "step": 3380 + }, + { + "epoch": 0.41953275082109437, + "grad_norm": 0.1986304122646991, + "learning_rate": 0.00014398051163794776, + "loss": 1.8242, + "step": 3385 + }, + { + "epoch": 0.420152444692322, + "grad_norm": 0.14117447083626353, + "learning_rate": 0.000143786121372426, + "loss": 1.8586, + "step": 3390 + }, + { + "epoch": 0.4207721385635496, + "grad_norm": 0.13095017207646675, + "learning_rate": 0.00014359152618745132, + "loss": 1.8605, + "step": 3395 + }, + { + "epoch": 0.42139183243477724, + "grad_norm": 0.13210900899766176, + "learning_rate": 0.00014339672699373104, + "loss": 1.8926, + "step": 3400 + }, + { + "epoch": 0.42201152630600486, + "grad_norm": 0.14521019539611432, + "learning_rate": 0.0001432017247029271, + "loss": 1.8207, + "step": 3405 + }, + { + "epoch": 0.4226312201772324, + "grad_norm": 0.15105206106498664, + "learning_rate": 0.00014300652022765207, + "loss": 1.8301, + "step": 3410 + }, + { + "epoch": 0.42325091404846005, + "grad_norm": 0.12934697800384842, + "learning_rate": 0.00014281111448146468, + "loss": 1.7816, + "step": 3415 + }, + { + "epoch": 0.42387060791968767, + "grad_norm": 0.127108123074777, + "learning_rate": 0.00014261550837886566, + "loss": 1.8328, + "step": 3420 + }, + { + "epoch": 0.4244903017909153, + "grad_norm": 0.13025062170320045, + "learning_rate": 0.00014241970283529338, + "loss": 1.8387, + "step": 3425 + }, + { + "epoch": 0.4251099956621429, + "grad_norm": 0.1365173510804643, + "learning_rate": 0.00014222369876711955, + "loss": 1.8457, + "step": 3430 + }, + { + "epoch": 0.42572968953337054, + "grad_norm": 0.13032412192310092, + "learning_rate": 0.00014202749709164506, + "loss": 1.8723, + "step": 3435 + }, + { + "epoch": 0.4263493834045981, + "grad_norm": 0.13718771041804576, + "learning_rate": 0.00014183109872709557, + "loss": 1.8332, + "step": 3440 + }, + { + "epoch": 0.4269690772758257, + "grad_norm": 0.12990350823599006, + "learning_rate": 0.0001416345045926172, + "loss": 1.8297, + "step": 3445 + }, + { + "epoch": 0.42758877114705335, + "grad_norm": 0.130712651106168, + "learning_rate": 0.00014143771560827238, + "loss": 1.8789, + "step": 3450 + }, + { + "epoch": 0.42820846501828097, + "grad_norm": 0.1300038947077918, + "learning_rate": 0.00014124073269503534, + "loss": 1.873, + "step": 3455 + }, + { + "epoch": 0.4288281588895086, + "grad_norm": 0.12574163306755254, + "learning_rate": 0.0001410435567747879, + "loss": 1.8453, + "step": 3460 + }, + { + "epoch": 0.4294478527607362, + "grad_norm": 0.12626722646867042, + "learning_rate": 0.00014084618877031524, + "loss": 1.8578, + "step": 3465 + }, + { + "epoch": 0.43006754663196384, + "grad_norm": 0.13023525157695576, + "learning_rate": 0.00014064862960530143, + "loss": 1.8457, + "step": 3470 + }, + { + "epoch": 0.4306872405031914, + "grad_norm": 0.12891851891269074, + "learning_rate": 0.0001404508802043252, + "loss": 1.8496, + "step": 3475 + }, + { + "epoch": 0.431306934374419, + "grad_norm": 0.1317883450339982, + "learning_rate": 0.0001402529414928556, + "loss": 1.8367, + "step": 3480 + }, + { + "epoch": 0.43192662824564665, + "grad_norm": 0.1320357092237638, + "learning_rate": 0.00014005481439724753, + "loss": 1.8512, + "step": 3485 + }, + { + "epoch": 0.43254632211687427, + "grad_norm": 0.13106411312502514, + "learning_rate": 0.00013985649984473773, + "loss": 1.8703, + "step": 3490 + }, + { + "epoch": 0.4331660159881019, + "grad_norm": 0.1293849394353448, + "learning_rate": 0.0001396579987634401, + "loss": 1.7855, + "step": 3495 + }, + { + "epoch": 0.4337857098593295, + "grad_norm": 0.13104584752365614, + "learning_rate": 0.00013945931208234156, + "loss": 1.8879, + "step": 3500 + }, + { + "epoch": 0.4344054037305571, + "grad_norm": 0.13185634907081104, + "learning_rate": 0.00013926044073129759, + "loss": 1.827, + "step": 3505 + }, + { + "epoch": 0.4350250976017847, + "grad_norm": 0.13564066779325817, + "learning_rate": 0.00013906138564102793, + "loss": 1.8605, + "step": 3510 + }, + { + "epoch": 0.4356447914730123, + "grad_norm": 0.12676901341611954, + "learning_rate": 0.0001388621477431123, + "loss": 1.8656, + "step": 3515 + }, + { + "epoch": 0.43626448534423995, + "grad_norm": 0.12510362563216565, + "learning_rate": 0.00013866272796998587, + "loss": 1.8875, + "step": 3520 + }, + { + "epoch": 0.43688417921546757, + "grad_norm": 0.13507506019177096, + "learning_rate": 0.00013846312725493504, + "loss": 1.9164, + "step": 3525 + }, + { + "epoch": 0.4375038730866952, + "grad_norm": 0.132093881424524, + "learning_rate": 0.00013826334653209297, + "loss": 1.843, + "step": 3530 + }, + { + "epoch": 0.43812356695792276, + "grad_norm": 0.1421065871061589, + "learning_rate": 0.00013806338673643534, + "loss": 1.8262, + "step": 3535 + }, + { + "epoch": 0.4387432608291504, + "grad_norm": 0.13102399751421243, + "learning_rate": 0.00013786324880377576, + "loss": 1.8156, + "step": 3540 + }, + { + "epoch": 0.439362954700378, + "grad_norm": 0.13358396691833502, + "learning_rate": 0.0001376629336707617, + "loss": 1.8195, + "step": 3545 + }, + { + "epoch": 0.43998264857160563, + "grad_norm": 0.13047350696632068, + "learning_rate": 0.0001374624422748698, + "loss": 1.8738, + "step": 3550 + }, + { + "epoch": 0.44060234244283325, + "grad_norm": 0.13627153690659918, + "learning_rate": 0.00013726177555440164, + "loss": 1.8785, + "step": 3555 + }, + { + "epoch": 0.4412220363140609, + "grad_norm": 0.13540316590544368, + "learning_rate": 0.0001370609344484793, + "loss": 1.8113, + "step": 3560 + }, + { + "epoch": 0.4418417301852885, + "grad_norm": 0.14107784380640415, + "learning_rate": 0.00013685991989704105, + "loss": 1.8199, + "step": 3565 + }, + { + "epoch": 0.44246142405651606, + "grad_norm": 0.12911502937346492, + "learning_rate": 0.00013665873284083685, + "loss": 1.8387, + "step": 3570 + }, + { + "epoch": 0.4430811179277437, + "grad_norm": 0.13109208746838247, + "learning_rate": 0.000136457374221424, + "loss": 1.8465, + "step": 3575 + }, + { + "epoch": 0.4437008117989713, + "grad_norm": 0.1409108873500928, + "learning_rate": 0.00013625584498116262, + "loss": 1.8617, + "step": 3580 + }, + { + "epoch": 0.44432050567019893, + "grad_norm": 0.1296434499076529, + "learning_rate": 0.00013605414606321148, + "loss": 1.8629, + "step": 3585 + }, + { + "epoch": 0.44494019954142655, + "grad_norm": 0.1438673184528456, + "learning_rate": 0.0001358522784115234, + "loss": 1.8715, + "step": 3590 + }, + { + "epoch": 0.4455598934126542, + "grad_norm": 0.13609879637032504, + "learning_rate": 0.00013565024297084084, + "loss": 1.8895, + "step": 3595 + }, + { + "epoch": 0.44617958728388174, + "grad_norm": 0.13036349295369937, + "learning_rate": 0.0001354480406866915, + "loss": 1.8031, + "step": 3600 + }, + { + "epoch": 0.44679928115510936, + "grad_norm": 0.1350939728981667, + "learning_rate": 0.00013524567250538396, + "loss": 1.8594, + "step": 3605 + }, + { + "epoch": 0.447418975026337, + "grad_norm": 0.12714971692456176, + "learning_rate": 0.00013504313937400317, + "loss": 1.8668, + "step": 3610 + }, + { + "epoch": 0.4480386688975646, + "grad_norm": 0.13951214846411, + "learning_rate": 0.00013484044224040606, + "loss": 1.8418, + "step": 3615 + }, + { + "epoch": 0.44865836276879223, + "grad_norm": 0.1386490336407402, + "learning_rate": 0.00013463758205321715, + "loss": 1.8695, + "step": 3620 + }, + { + "epoch": 0.44927805664001985, + "grad_norm": 0.13782618102933727, + "learning_rate": 0.0001344345597618239, + "loss": 1.8273, + "step": 3625 + }, + { + "epoch": 0.4498977505112474, + "grad_norm": 0.13579485790542123, + "learning_rate": 0.00013423137631637258, + "loss": 1.8586, + "step": 3630 + }, + { + "epoch": 0.45051744438247504, + "grad_norm": 0.1300030317668718, + "learning_rate": 0.00013402803266776353, + "loss": 1.8652, + "step": 3635 + }, + { + "epoch": 0.45113713825370266, + "grad_norm": 0.13598316227960938, + "learning_rate": 0.00013382452976764693, + "loss": 1.8309, + "step": 3640 + }, + { + "epoch": 0.4517568321249303, + "grad_norm": 0.12953667957338744, + "learning_rate": 0.00013362086856841826, + "loss": 1.8668, + "step": 3645 + }, + { + "epoch": 0.4523765259961579, + "grad_norm": 0.13451954909685226, + "learning_rate": 0.0001334170500232138, + "loss": 1.8543, + "step": 3650 + }, + { + "epoch": 0.45299621986738553, + "grad_norm": 0.13195858085157425, + "learning_rate": 0.00013321307508590624, + "loss": 1.8504, + "step": 3655 + }, + { + "epoch": 0.45361591373861315, + "grad_norm": 0.1305239663789138, + "learning_rate": 0.00013300894471110014, + "loss": 1.8621, + "step": 3660 + }, + { + "epoch": 0.4542356076098407, + "grad_norm": 0.13635413078375763, + "learning_rate": 0.00013280465985412757, + "loss": 1.827, + "step": 3665 + }, + { + "epoch": 0.45485530148106834, + "grad_norm": 0.1258025644634598, + "learning_rate": 0.00013260022147104354, + "loss": 1.85, + "step": 3670 + }, + { + "epoch": 0.45547499535229596, + "grad_norm": 0.1415524251745301, + "learning_rate": 0.00013239563051862158, + "loss": 1.8246, + "step": 3675 + }, + { + "epoch": 0.4560946892235236, + "grad_norm": 0.13622385001598838, + "learning_rate": 0.00013219088795434923, + "loss": 1.8383, + "step": 3680 + }, + { + "epoch": 0.4567143830947512, + "grad_norm": 0.1396238647704036, + "learning_rate": 0.00013198599473642354, + "loss": 1.8922, + "step": 3685 + }, + { + "epoch": 0.45733407696597883, + "grad_norm": 0.12825728444600357, + "learning_rate": 0.00013178095182374676, + "loss": 1.7937, + "step": 3690 + }, + { + "epoch": 0.4579537708372064, + "grad_norm": 0.14084681880849642, + "learning_rate": 0.00013157576017592157, + "loss": 1.8258, + "step": 3695 + }, + { + "epoch": 0.458573464708434, + "grad_norm": 0.1385483862626899, + "learning_rate": 0.0001313704207532468, + "loss": 1.8488, + "step": 3700 + }, + { + "epoch": 0.45919315857966164, + "grad_norm": 0.1289536639011512, + "learning_rate": 0.00013116493451671279, + "loss": 1.8352, + "step": 3705 + }, + { + "epoch": 0.45981285245088926, + "grad_norm": 0.1289125265280001, + "learning_rate": 0.000130959302427997, + "loss": 1.7816, + "step": 3710 + }, + { + "epoch": 0.4604325463221169, + "grad_norm": 0.13577383918982386, + "learning_rate": 0.00013075352544945966, + "loss": 1.866, + "step": 3715 + }, + { + "epoch": 0.4610522401933445, + "grad_norm": 0.13279564421407647, + "learning_rate": 0.00013054760454413882, + "loss": 1.8707, + "step": 3720 + }, + { + "epoch": 0.4616719340645721, + "grad_norm": 0.13165317327395187, + "learning_rate": 0.00013034154067574622, + "loss": 1.8813, + "step": 3725 + }, + { + "epoch": 0.4622916279357997, + "grad_norm": 0.13474300432005532, + "learning_rate": 0.00013013533480866273, + "loss": 1.8141, + "step": 3730 + }, + { + "epoch": 0.4629113218070273, + "grad_norm": 0.1348771890055806, + "learning_rate": 0.00012992898790793362, + "loss": 1.8488, + "step": 3735 + }, + { + "epoch": 0.46353101567825494, + "grad_norm": 0.13101703748539598, + "learning_rate": 0.00012972250093926436, + "loss": 1.85, + "step": 3740 + }, + { + "epoch": 0.46415070954948257, + "grad_norm": 0.12926638302593185, + "learning_rate": 0.0001295158748690159, + "loss": 1.8449, + "step": 3745 + }, + { + "epoch": 0.4647704034207102, + "grad_norm": 0.12685006085742664, + "learning_rate": 0.0001293091106642001, + "loss": 1.8676, + "step": 3750 + }, + { + "epoch": 0.4653900972919378, + "grad_norm": 0.13335175429016402, + "learning_rate": 0.00012910220929247538, + "loss": 1.8445, + "step": 3755 + }, + { + "epoch": 0.4660097911631654, + "grad_norm": 0.1334428844711798, + "learning_rate": 0.00012889517172214206, + "loss": 1.8258, + "step": 3760 + }, + { + "epoch": 0.466629485034393, + "grad_norm": 0.13235299019438176, + "learning_rate": 0.0001286879989221379, + "loss": 1.8328, + "step": 3765 + }, + { + "epoch": 0.4672491789056206, + "grad_norm": 0.13692059560954664, + "learning_rate": 0.0001284806918620335, + "loss": 1.8285, + "step": 3770 + }, + { + "epoch": 0.46786887277684824, + "grad_norm": 0.1241361680899772, + "learning_rate": 0.00012827325151202782, + "loss": 1.8387, + "step": 3775 + }, + { + "epoch": 0.46848856664807587, + "grad_norm": 0.1339809286185403, + "learning_rate": 0.00012806567884294362, + "loss": 1.8816, + "step": 3780 + }, + { + "epoch": 0.4691082605193035, + "grad_norm": 0.12897521994670574, + "learning_rate": 0.00012785797482622294, + "loss": 1.8398, + "step": 3785 + }, + { + "epoch": 0.46972795439053106, + "grad_norm": 0.13985587399436297, + "learning_rate": 0.00012765014043392242, + "loss": 1.848, + "step": 3790 + }, + { + "epoch": 0.4703476482617587, + "grad_norm": 0.13828870435326854, + "learning_rate": 0.00012744217663870902, + "loss": 1.8797, + "step": 3795 + }, + { + "epoch": 0.4709673421329863, + "grad_norm": 0.138650980685239, + "learning_rate": 0.00012723408441385521, + "loss": 1.8598, + "step": 3800 + }, + { + "epoch": 0.4715870360042139, + "grad_norm": 0.13009664188771436, + "learning_rate": 0.0001270258647332345, + "loss": 1.8523, + "step": 3805 + }, + { + "epoch": 0.47220672987544154, + "grad_norm": 0.129438419879605, + "learning_rate": 0.00012681751857131693, + "loss": 1.8004, + "step": 3810 + }, + { + "epoch": 0.47282642374666917, + "grad_norm": 0.13070922171883706, + "learning_rate": 0.00012660904690316445, + "loss": 1.8504, + "step": 3815 + }, + { + "epoch": 0.47344611761789673, + "grad_norm": 0.12668656962083866, + "learning_rate": 0.00012640045070442643, + "loss": 1.8609, + "step": 3820 + }, + { + "epoch": 0.47406581148912436, + "grad_norm": 0.13741354466112743, + "learning_rate": 0.000126191730951335, + "loss": 1.809, + "step": 3825 + }, + { + "epoch": 0.474685505360352, + "grad_norm": 0.12823342134235233, + "learning_rate": 0.0001259828886207005, + "loss": 1.8492, + "step": 3830 + }, + { + "epoch": 0.4753051992315796, + "grad_norm": 0.1379250744950109, + "learning_rate": 0.00012577392468990695, + "loss": 1.934, + "step": 3835 + }, + { + "epoch": 0.4759248931028072, + "grad_norm": 0.13202143342192066, + "learning_rate": 0.00012556484013690763, + "loss": 1.8352, + "step": 3840 + }, + { + "epoch": 0.47654458697403485, + "grad_norm": 0.13299754683798998, + "learning_rate": 0.00012535563594022, + "loss": 1.8297, + "step": 3845 + }, + { + "epoch": 0.47716428084526247, + "grad_norm": 0.12790876592097397, + "learning_rate": 0.0001251463130789217, + "loss": 1.8383, + "step": 3850 + }, + { + "epoch": 0.47778397471649003, + "grad_norm": 0.12997304481536676, + "learning_rate": 0.0001249368725326457, + "loss": 1.8055, + "step": 3855 + }, + { + "epoch": 0.47840366858771766, + "grad_norm": 0.12972319050241293, + "learning_rate": 0.00012472731528157563, + "loss": 1.8434, + "step": 3860 + }, + { + "epoch": 0.4790233624589453, + "grad_norm": 0.13199195937817895, + "learning_rate": 0.00012451764230644145, + "loss": 1.8934, + "step": 3865 + }, + { + "epoch": 0.4796430563301729, + "grad_norm": 0.13461248829630099, + "learning_rate": 0.0001243078545885145, + "loss": 1.8602, + "step": 3870 + }, + { + "epoch": 0.4802627502014005, + "grad_norm": 0.1355432249144203, + "learning_rate": 0.00012409795310960333, + "loss": 1.8156, + "step": 3875 + }, + { + "epoch": 0.48088244407262815, + "grad_norm": 0.14482071533572835, + "learning_rate": 0.00012388793885204875, + "loss": 1.8492, + "step": 3880 + }, + { + "epoch": 0.4815021379438557, + "grad_norm": 0.13371691055623847, + "learning_rate": 0.00012367781279871946, + "loss": 1.782, + "step": 3885 + }, + { + "epoch": 0.48212183181508333, + "grad_norm": 0.1447992804565308, + "learning_rate": 0.00012346757593300733, + "loss": 1.8227, + "step": 3890 + }, + { + "epoch": 0.48274152568631096, + "grad_norm": 0.12767703586264148, + "learning_rate": 0.00012325722923882285, + "loss": 1.841, + "step": 3895 + }, + { + "epoch": 0.4833612195575386, + "grad_norm": 0.1314306698392859, + "learning_rate": 0.00012304677370059047, + "loss": 1.825, + "step": 3900 + }, + { + "epoch": 0.4839809134287662, + "grad_norm": 0.1345967559038808, + "learning_rate": 0.00012283621030324403, + "loss": 1.7437, + "step": 3905 + }, + { + "epoch": 0.4846006072999938, + "grad_norm": 0.13106442792164216, + "learning_rate": 0.00012262554003222221, + "loss": 1.8285, + "step": 3910 + }, + { + "epoch": 0.4852203011712214, + "grad_norm": 0.1398213559576168, + "learning_rate": 0.00012241476387346386, + "loss": 1.8332, + "step": 3915 + }, + { + "epoch": 0.485839995042449, + "grad_norm": 0.13124036303853287, + "learning_rate": 0.00012220388281340328, + "loss": 1.8324, + "step": 3920 + }, + { + "epoch": 0.48645968891367664, + "grad_norm": 0.1421186518175135, + "learning_rate": 0.00012199289783896582, + "loss": 1.8418, + "step": 3925 + }, + { + "epoch": 0.48707938278490426, + "grad_norm": 0.1364339784527228, + "learning_rate": 0.0001217818099375631, + "loss": 1.775, + "step": 3930 + }, + { + "epoch": 0.4876990766561319, + "grad_norm": 0.1335459935708431, + "learning_rate": 0.00012157062009708847, + "loss": 1.8012, + "step": 3935 + }, + { + "epoch": 0.4883187705273595, + "grad_norm": 0.1359760422124762, + "learning_rate": 0.00012135932930591232, + "loss": 1.8246, + "step": 3940 + }, + { + "epoch": 0.4889384643985871, + "grad_norm": 0.12394543676184719, + "learning_rate": 0.00012114793855287749, + "loss": 1.8207, + "step": 3945 + }, + { + "epoch": 0.4895581582698147, + "grad_norm": 0.1304584073821454, + "learning_rate": 0.00012093644882729473, + "loss": 1.8484, + "step": 3950 + }, + { + "epoch": 0.4901778521410423, + "grad_norm": 0.1277387513791641, + "learning_rate": 0.0001207248611189378, + "loss": 1.8074, + "step": 3955 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 0.13299457188730449, + "learning_rate": 0.00012051317641803921, + "loss": 1.8664, + "step": 3960 + }, + { + "epoch": 0.49141723988349756, + "grad_norm": 0.1342304756979945, + "learning_rate": 0.00012030139571528534, + "loss": 1.8043, + "step": 3965 + }, + { + "epoch": 0.4920369337547252, + "grad_norm": 0.13564000088494185, + "learning_rate": 0.00012008952000181175, + "loss": 1.8113, + "step": 3970 + }, + { + "epoch": 0.4926566276259528, + "grad_norm": 0.1351196466939399, + "learning_rate": 0.0001198775502691988, + "loss": 1.9066, + "step": 3975 + }, + { + "epoch": 0.49327632149718037, + "grad_norm": 0.1357026978518113, + "learning_rate": 0.00011966548750946678, + "loss": 1.7566, + "step": 3980 + }, + { + "epoch": 0.493896015368408, + "grad_norm": 0.13117708579809673, + "learning_rate": 0.00011945333271507138, + "loss": 1.8559, + "step": 3985 + }, + { + "epoch": 0.4945157092396356, + "grad_norm": 0.13842921233959002, + "learning_rate": 0.00011924108687889899, + "loss": 1.8441, + "step": 3990 + }, + { + "epoch": 0.49513540311086324, + "grad_norm": 0.1328649899399259, + "learning_rate": 0.00011902875099426207, + "loss": 1.8445, + "step": 3995 + }, + { + "epoch": 0.49575509698209086, + "grad_norm": 0.13155897661920435, + "learning_rate": 0.00011881632605489457, + "loss": 1.8977, + "step": 4000 + }, + { + "epoch": 0.4963747908533185, + "grad_norm": 0.13187321718339992, + "learning_rate": 0.0001186038130549471, + "loss": 1.7977, + "step": 4005 + }, + { + "epoch": 0.49699448472454605, + "grad_norm": 0.13286908074253748, + "learning_rate": 0.00011839121298898253, + "loss": 1.9191, + "step": 4010 + }, + { + "epoch": 0.49761417859577367, + "grad_norm": 0.13199065193371257, + "learning_rate": 0.00011817852685197109, + "loss": 1.7773, + "step": 4015 + }, + { + "epoch": 0.4982338724670013, + "grad_norm": 0.1373404116593631, + "learning_rate": 0.00011796575563928591, + "loss": 1.8336, + "step": 4020 + }, + { + "epoch": 0.4988535663382289, + "grad_norm": 0.13885642586504665, + "learning_rate": 0.00011775290034669822, + "loss": 1.8387, + "step": 4025 + }, + { + "epoch": 0.49947326020945654, + "grad_norm": 0.14373947655079228, + "learning_rate": 0.00011753996197037272, + "loss": 1.8379, + "step": 4030 + }, + { + "epoch": 0.5000929540806841, + "grad_norm": 0.12835515006238413, + "learning_rate": 0.00011732694150686301, + "loss": 1.8641, + "step": 4035 + }, + { + "epoch": 0.5007126479519117, + "grad_norm": 0.13281937799249963, + "learning_rate": 0.00011711383995310681, + "loss": 1.8371, + "step": 4040 + }, + { + "epoch": 0.5013323418231393, + "grad_norm": 0.134641067216936, + "learning_rate": 0.00011690065830642143, + "loss": 1.766, + "step": 4045 + }, + { + "epoch": 0.501952035694367, + "grad_norm": 0.1406950506844368, + "learning_rate": 0.00011668739756449885, + "loss": 1.7727, + "step": 4050 + }, + { + "epoch": 0.5025717295655946, + "grad_norm": 0.1377858503969937, + "learning_rate": 0.00011647405872540138, + "loss": 1.8195, + "step": 4055 + }, + { + "epoch": 0.5031914234368222, + "grad_norm": 0.13313565675420963, + "learning_rate": 0.00011626064278755673, + "loss": 1.7883, + "step": 4060 + }, + { + "epoch": 0.5038111173080498, + "grad_norm": 0.13459471367809545, + "learning_rate": 0.00011604715074975347, + "loss": 1.8438, + "step": 4065 + }, + { + "epoch": 0.5044308111792775, + "grad_norm": 0.1311158940726712, + "learning_rate": 0.00011583358361113632, + "loss": 1.9012, + "step": 4070 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.13876753398913574, + "learning_rate": 0.00011561994237120148, + "loss": 1.8672, + "step": 4075 + }, + { + "epoch": 0.5056701989217327, + "grad_norm": 0.13183560468669342, + "learning_rate": 0.00011540622802979187, + "loss": 1.8582, + "step": 4080 + }, + { + "epoch": 0.5062898927929603, + "grad_norm": 0.13580340297486979, + "learning_rate": 0.00011519244158709263, + "loss": 1.8945, + "step": 4085 + }, + { + "epoch": 0.5069095866641878, + "grad_norm": 0.13991883408059513, + "learning_rate": 0.00011497858404362631, + "loss": 1.8926, + "step": 4090 + }, + { + "epoch": 0.5075292805354155, + "grad_norm": 0.14012773463184303, + "learning_rate": 0.00011476465640024814, + "loss": 1.8121, + "step": 4095 + }, + { + "epoch": 0.5081489744066431, + "grad_norm": 0.1301910255761767, + "learning_rate": 0.00011455065965814148, + "loss": 1.8758, + "step": 4100 + }, + { + "epoch": 0.5087686682778707, + "grad_norm": 0.13887521410538325, + "learning_rate": 0.00011433659481881307, + "loss": 1.841, + "step": 4105 + }, + { + "epoch": 0.5093883621490983, + "grad_norm": 0.12807030791037602, + "learning_rate": 0.00011412246288408835, + "loss": 1.8855, + "step": 4110 + }, + { + "epoch": 0.510008056020326, + "grad_norm": 0.12976882936743359, + "learning_rate": 0.00011390826485610675, + "loss": 1.8348, + "step": 4115 + }, + { + "epoch": 0.5106277498915536, + "grad_norm": 0.12871810923015659, + "learning_rate": 0.000113694001737317, + "loss": 1.8852, + "step": 4120 + }, + { + "epoch": 0.5112474437627812, + "grad_norm": 0.12717099476381863, + "learning_rate": 0.00011347967453047248, + "loss": 1.8328, + "step": 4125 + }, + { + "epoch": 0.5118671376340088, + "grad_norm": 0.13121903677104316, + "learning_rate": 0.00011326528423862653, + "loss": 1.8375, + "step": 4130 + }, + { + "epoch": 0.5124868315052364, + "grad_norm": 0.12986993222627213, + "learning_rate": 0.00011305083186512765, + "loss": 1.8043, + "step": 4135 + }, + { + "epoch": 0.5131065253764641, + "grad_norm": 0.13292488082615117, + "learning_rate": 0.00011283631841361499, + "loss": 1.8258, + "step": 4140 + }, + { + "epoch": 0.5137262192476917, + "grad_norm": 0.1290602203925276, + "learning_rate": 0.00011262174488801349, + "loss": 1.868, + "step": 4145 + }, + { + "epoch": 0.5143459131189193, + "grad_norm": 0.1340007696562376, + "learning_rate": 0.00011240711229252915, + "loss": 1.8273, + "step": 4150 + }, + { + "epoch": 0.5149656069901468, + "grad_norm": 0.13145813073711832, + "learning_rate": 0.00011219242163164457, + "loss": 1.9297, + "step": 4155 + }, + { + "epoch": 0.5155853008613744, + "grad_norm": 0.1313762201180359, + "learning_rate": 0.000111977673910114, + "loss": 1.9039, + "step": 4160 + }, + { + "epoch": 0.5162049947326021, + "grad_norm": 0.1323692673503486, + "learning_rate": 0.00011176287013295879, + "loss": 1.8992, + "step": 4165 + }, + { + "epoch": 0.5168246886038297, + "grad_norm": 0.14101520024243003, + "learning_rate": 0.0001115480113054626, + "loss": 1.8641, + "step": 4170 + }, + { + "epoch": 0.5174443824750573, + "grad_norm": 0.1351388006249665, + "learning_rate": 0.00011133309843316669, + "loss": 1.8871, + "step": 4175 + }, + { + "epoch": 0.5180640763462849, + "grad_norm": 0.1401631083665635, + "learning_rate": 0.0001111181325218653, + "loss": 1.7629, + "step": 4180 + }, + { + "epoch": 0.5186837702175126, + "grad_norm": 0.1385892991917342, + "learning_rate": 0.00011090311457760094, + "loss": 1.8035, + "step": 4185 + }, + { + "epoch": 0.5193034640887402, + "grad_norm": 0.13371107142455393, + "learning_rate": 0.0001106880456066595, + "loss": 1.7957, + "step": 4190 + }, + { + "epoch": 0.5199231579599678, + "grad_norm": 0.13066975486189664, + "learning_rate": 0.00011047292661556581, + "loss": 1.8027, + "step": 4195 + }, + { + "epoch": 0.5205428518311954, + "grad_norm": 0.1358953784147378, + "learning_rate": 0.00011025775861107874, + "loss": 1.827, + "step": 4200 + }, + { + "epoch": 0.521162545702423, + "grad_norm": 0.1312496989386351, + "learning_rate": 0.00011004254260018648, + "loss": 1.8441, + "step": 4205 + }, + { + "epoch": 0.5217822395736507, + "grad_norm": 0.13783425853932574, + "learning_rate": 0.00010982727959010201, + "loss": 1.8816, + "step": 4210 + }, + { + "epoch": 0.5224019334448782, + "grad_norm": 0.127457472607005, + "learning_rate": 0.00010961197058825817, + "loss": 1.8676, + "step": 4215 + }, + { + "epoch": 0.5230216273161058, + "grad_norm": 0.13820731677300183, + "learning_rate": 0.00010939661660230309, + "loss": 1.8344, + "step": 4220 + }, + { + "epoch": 0.5236413211873334, + "grad_norm": 0.13449834908571123, + "learning_rate": 0.00010918121864009543, + "loss": 1.8309, + "step": 4225 + }, + { + "epoch": 0.524261015058561, + "grad_norm": 0.13724886784231616, + "learning_rate": 0.00010896577770969964, + "loss": 1.8043, + "step": 4230 + }, + { + "epoch": 0.5248807089297887, + "grad_norm": 0.12837832712288905, + "learning_rate": 0.00010875029481938126, + "loss": 1.902, + "step": 4235 + }, + { + "epoch": 0.5255004028010163, + "grad_norm": 0.13135449306647268, + "learning_rate": 0.00010853477097760222, + "loss": 1.8266, + "step": 4240 + }, + { + "epoch": 0.5261200966722439, + "grad_norm": 0.136587596414044, + "learning_rate": 0.0001083192071930161, + "loss": 1.8004, + "step": 4245 + }, + { + "epoch": 0.5267397905434715, + "grad_norm": 0.12616824686941636, + "learning_rate": 0.00010810360447446335, + "loss": 1.8512, + "step": 4250 + }, + { + "epoch": 0.5273594844146992, + "grad_norm": 0.13515955918539224, + "learning_rate": 0.00010788796383096676, + "loss": 1.8156, + "step": 4255 + }, + { + "epoch": 0.5279791782859268, + "grad_norm": 0.13077554655712312, + "learning_rate": 0.00010767228627172645, + "loss": 1.8566, + "step": 4260 + }, + { + "epoch": 0.5285988721571544, + "grad_norm": 0.1311515087885154, + "learning_rate": 0.00010745657280611552, + "loss": 1.8633, + "step": 4265 + }, + { + "epoch": 0.529218566028382, + "grad_norm": 0.13183696062014078, + "learning_rate": 0.00010724082444367485, + "loss": 1.8453, + "step": 4270 + }, + { + "epoch": 0.5298382598996096, + "grad_norm": 0.1326471397034417, + "learning_rate": 0.00010702504219410884, + "loss": 1.8797, + "step": 4275 + }, + { + "epoch": 0.5304579537708372, + "grad_norm": 0.13074802402419886, + "learning_rate": 0.00010680922706728041, + "loss": 1.7977, + "step": 4280 + }, + { + "epoch": 0.5310776476420648, + "grad_norm": 0.13471386870044813, + "learning_rate": 0.00010659338007320632, + "loss": 1.9219, + "step": 4285 + }, + { + "epoch": 0.5316973415132924, + "grad_norm": 0.13101063536055021, + "learning_rate": 0.00010637750222205253, + "loss": 1.8273, + "step": 4290 + }, + { + "epoch": 0.53231703538452, + "grad_norm": 0.1330552224706942, + "learning_rate": 0.00010616159452412939, + "loss": 1.8289, + "step": 4295 + }, + { + "epoch": 0.5329367292557476, + "grad_norm": 0.13232572723510502, + "learning_rate": 0.00010594565798988689, + "loss": 1.8492, + "step": 4300 + }, + { + "epoch": 0.5335564231269753, + "grad_norm": 0.14341248190041536, + "learning_rate": 0.00010572969362990998, + "loss": 1.8523, + "step": 4305 + }, + { + "epoch": 0.5341761169982029, + "grad_norm": 0.1326750594937519, + "learning_rate": 0.00010551370245491394, + "loss": 1.843, + "step": 4310 + }, + { + "epoch": 0.5347958108694305, + "grad_norm": 0.13873579584003962, + "learning_rate": 0.00010529768547573942, + "loss": 1.8121, + "step": 4315 + }, + { + "epoch": 0.5354155047406581, + "grad_norm": 0.13163567087274103, + "learning_rate": 0.00010508164370334787, + "loss": 1.8465, + "step": 4320 + }, + { + "epoch": 0.5360351986118858, + "grad_norm": 0.13968960114860696, + "learning_rate": 0.00010486557814881686, + "loss": 1.8352, + "step": 4325 + }, + { + "epoch": 0.5366548924831134, + "grad_norm": 0.13788540405986352, + "learning_rate": 0.00010464948982333504, + "loss": 1.8113, + "step": 4330 + }, + { + "epoch": 0.537274586354341, + "grad_norm": 0.1369901284801625, + "learning_rate": 0.00010443337973819791, + "loss": 1.8141, + "step": 4335 + }, + { + "epoch": 0.5378942802255686, + "grad_norm": 0.14142406746780717, + "learning_rate": 0.00010421724890480258, + "loss": 1.8234, + "step": 4340 + }, + { + "epoch": 0.5385139740967961, + "grad_norm": 0.14077233215811605, + "learning_rate": 0.00010400109833464338, + "loss": 1.859, + "step": 4345 + }, + { + "epoch": 0.5391336679680238, + "grad_norm": 0.14302584088863268, + "learning_rate": 0.00010378492903930699, + "loss": 1.8238, + "step": 4350 + }, + { + "epoch": 0.5397533618392514, + "grad_norm": 0.14375440559453245, + "learning_rate": 0.00010356874203046766, + "loss": 1.8383, + "step": 4355 + }, + { + "epoch": 0.540373055710479, + "grad_norm": 0.12698327084195696, + "learning_rate": 0.00010335253831988267, + "loss": 1.8203, + "step": 4360 + }, + { + "epoch": 0.5409927495817066, + "grad_norm": 0.12860058438618013, + "learning_rate": 0.00010313631891938736, + "loss": 1.8637, + "step": 4365 + }, + { + "epoch": 0.5416124434529342, + "grad_norm": 0.13436040029788598, + "learning_rate": 0.00010292008484089047, + "loss": 1.8523, + "step": 4370 + }, + { + "epoch": 0.5422321373241619, + "grad_norm": 0.1345634167056628, + "learning_rate": 0.0001027038370963695, + "loss": 1.8539, + "step": 4375 + }, + { + "epoch": 0.5428518311953895, + "grad_norm": 0.13589085190830785, + "learning_rate": 0.00010248757669786594, + "loss": 1.841, + "step": 4380 + }, + { + "epoch": 0.5434715250666171, + "grad_norm": 0.1394154520082444, + "learning_rate": 0.00010227130465748045, + "loss": 1.8855, + "step": 4385 + }, + { + "epoch": 0.5440912189378447, + "grad_norm": 0.13059784478842867, + "learning_rate": 0.00010205502198736816, + "loss": 1.7937, + "step": 4390 + }, + { + "epoch": 0.5447109128090724, + "grad_norm": 0.12746980330896387, + "learning_rate": 0.00010183872969973396, + "loss": 1.8199, + "step": 4395 + }, + { + "epoch": 0.5453306066803, + "grad_norm": 0.1535462716817909, + "learning_rate": 0.00010162242880682776, + "loss": 1.8605, + "step": 4400 + }, + { + "epoch": 0.5459503005515276, + "grad_norm": 0.1342716210660633, + "learning_rate": 0.00010140612032093972, + "loss": 1.8234, + "step": 4405 + }, + { + "epoch": 0.5465699944227551, + "grad_norm": 0.1318603678326961, + "learning_rate": 0.00010118980525439559, + "loss": 1.8414, + "step": 4410 + }, + { + "epoch": 0.5471896882939827, + "grad_norm": 0.1344047381087586, + "learning_rate": 0.00010097348461955186, + "loss": 1.7937, + "step": 4415 + }, + { + "epoch": 0.5478093821652104, + "grad_norm": 0.13155594328043496, + "learning_rate": 0.00010075715942879114, + "loss": 1.8137, + "step": 4420 + }, + { + "epoch": 0.548429076036438, + "grad_norm": 0.13086140046383155, + "learning_rate": 0.00010054083069451728, + "loss": 1.841, + "step": 4425 + }, + { + "epoch": 0.5490487699076656, + "grad_norm": 0.13099764443257447, + "learning_rate": 0.00010032449942915072, + "loss": 1.8141, + "step": 4430 + }, + { + "epoch": 0.5496684637788932, + "grad_norm": 0.14093728038783374, + "learning_rate": 0.00010010816664512389, + "loss": 1.8008, + "step": 4435 + }, + { + "epoch": 0.5502881576501208, + "grad_norm": 0.12948947274284553, + "learning_rate": 9.989183335487615e-05, + "loss": 1.8656, + "step": 4440 + }, + { + "epoch": 0.5509078515213485, + "grad_norm": 0.1346182653084658, + "learning_rate": 9.96755005708493e-05, + "loss": 1.8719, + "step": 4445 + }, + { + "epoch": 0.5515275453925761, + "grad_norm": 0.13528704687502488, + "learning_rate": 9.945916930548276e-05, + "loss": 1.9227, + "step": 4450 + }, + { + "epoch": 0.5521472392638037, + "grad_norm": 0.13495978964006375, + "learning_rate": 9.924284057120888e-05, + "loss": 1.8027, + "step": 4455 + }, + { + "epoch": 0.5527669331350313, + "grad_norm": 0.13224572304109156, + "learning_rate": 9.902651538044814e-05, + "loss": 1.8789, + "step": 4460 + }, + { + "epoch": 0.553386627006259, + "grad_norm": 0.13366684428179468, + "learning_rate": 9.88101947456044e-05, + "loss": 1.8629, + "step": 4465 + }, + { + "epoch": 0.5540063208774865, + "grad_norm": 0.13117562103130084, + "learning_rate": 9.859387967906033e-05, + "loss": 1.8375, + "step": 4470 + }, + { + "epoch": 0.5546260147487141, + "grad_norm": 0.13152848331502962, + "learning_rate": 9.837757119317228e-05, + "loss": 1.8738, + "step": 4475 + }, + { + "epoch": 0.5552457086199417, + "grad_norm": 0.1290209815631564, + "learning_rate": 9.816127030026607e-05, + "loss": 1.8594, + "step": 4480 + }, + { + "epoch": 0.5558654024911693, + "grad_norm": 0.1415132208460771, + "learning_rate": 9.794497801263185e-05, + "loss": 1.8383, + "step": 4485 + }, + { + "epoch": 0.556485096362397, + "grad_norm": 0.13215668855742108, + "learning_rate": 9.772869534251956e-05, + "loss": 1.8262, + "step": 4490 + }, + { + "epoch": 0.5571047902336246, + "grad_norm": 0.13620380571406365, + "learning_rate": 9.751242330213407e-05, + "loss": 1.8852, + "step": 4495 + }, + { + "epoch": 0.5577244841048522, + "grad_norm": 0.1382439074725802, + "learning_rate": 9.729616290363051e-05, + "loss": 1.8473, + "step": 4500 + }, + { + "epoch": 0.5583441779760798, + "grad_norm": 0.13763527468041403, + "learning_rate": 9.707991515910957e-05, + "loss": 1.8625, + "step": 4505 + }, + { + "epoch": 0.5589638718473074, + "grad_norm": 0.1387628426822122, + "learning_rate": 9.686368108061268e-05, + "loss": 1.9066, + "step": 4510 + }, + { + "epoch": 0.5595835657185351, + "grad_norm": 0.1284143925815832, + "learning_rate": 9.664746168011734e-05, + "loss": 1.8227, + "step": 4515 + }, + { + "epoch": 0.5602032595897627, + "grad_norm": 0.13073807322245737, + "learning_rate": 9.643125796953233e-05, + "loss": 1.8734, + "step": 4520 + }, + { + "epoch": 0.5608229534609903, + "grad_norm": 0.13788095056614413, + "learning_rate": 9.621507096069304e-05, + "loss": 1.8582, + "step": 4525 + }, + { + "epoch": 0.5614426473322179, + "grad_norm": 0.13116436340123142, + "learning_rate": 9.599890166535665e-05, + "loss": 1.8668, + "step": 4530 + }, + { + "epoch": 0.5620623412034454, + "grad_norm": 0.1417381732399684, + "learning_rate": 9.578275109519744e-05, + "loss": 1.891, + "step": 4535 + }, + { + "epoch": 0.5626820350746731, + "grad_norm": 0.1378428491993231, + "learning_rate": 9.556662026180212e-05, + "loss": 1.8562, + "step": 4540 + }, + { + "epoch": 0.5633017289459007, + "grad_norm": 0.13590235062753475, + "learning_rate": 9.535051017666497e-05, + "loss": 1.877, + "step": 4545 + }, + { + "epoch": 0.5639214228171283, + "grad_norm": 0.13282424118769054, + "learning_rate": 9.513442185118319e-05, + "loss": 1.8711, + "step": 4550 + }, + { + "epoch": 0.5645411166883559, + "grad_norm": 0.1329650614632228, + "learning_rate": 9.491835629665214e-05, + "loss": 1.8375, + "step": 4555 + }, + { + "epoch": 0.5651608105595836, + "grad_norm": 0.13566530561601037, + "learning_rate": 9.470231452426059e-05, + "loss": 1.7836, + "step": 4560 + }, + { + "epoch": 0.5657805044308112, + "grad_norm": 0.13493729363122003, + "learning_rate": 9.448629754508607e-05, + "loss": 1.8703, + "step": 4565 + }, + { + "epoch": 0.5664001983020388, + "grad_norm": 0.1397619680131406, + "learning_rate": 9.427030637009003e-05, + "loss": 1.8855, + "step": 4570 + }, + { + "epoch": 0.5670198921732664, + "grad_norm": 0.13164387557122, + "learning_rate": 9.405434201011313e-05, + "loss": 1.8969, + "step": 4575 + }, + { + "epoch": 0.567639586044494, + "grad_norm": 0.12900052483979663, + "learning_rate": 9.383840547587064e-05, + "loss": 1.8848, + "step": 4580 + }, + { + "epoch": 0.5682592799157217, + "grad_norm": 0.1345646794503773, + "learning_rate": 9.362249777794749e-05, + "loss": 1.8422, + "step": 4585 + }, + { + "epoch": 0.5688789737869493, + "grad_norm": 0.1334915488354972, + "learning_rate": 9.340661992679369e-05, + "loss": 1.8246, + "step": 4590 + }, + { + "epoch": 0.5694986676581769, + "grad_norm": 0.13798209662252225, + "learning_rate": 9.319077293271961e-05, + "loss": 1.8746, + "step": 4595 + }, + { + "epoch": 0.5701183615294044, + "grad_norm": 0.13590886596850502, + "learning_rate": 9.297495780589118e-05, + "loss": 1.8484, + "step": 4600 + }, + { + "epoch": 0.570738055400632, + "grad_norm": 0.13290713678985736, + "learning_rate": 9.27591755563252e-05, + "loss": 1.766, + "step": 4605 + }, + { + "epoch": 0.5713577492718597, + "grad_norm": 0.12641011719135234, + "learning_rate": 9.254342719388454e-05, + "loss": 1.7781, + "step": 4610 + }, + { + "epoch": 0.5719774431430873, + "grad_norm": 0.14121891454527805, + "learning_rate": 9.232771372827356e-05, + "loss": 1.877, + "step": 4615 + }, + { + "epoch": 0.5725971370143149, + "grad_norm": 0.13878039672119347, + "learning_rate": 9.211203616903328e-05, + "loss": 1.8391, + "step": 4620 + }, + { + "epoch": 0.5732168308855425, + "grad_norm": 0.1266251624297058, + "learning_rate": 9.189639552553667e-05, + "loss": 1.8941, + "step": 4625 + }, + { + "epoch": 0.5738365247567702, + "grad_norm": 0.14463093120155013, + "learning_rate": 9.168079280698391e-05, + "loss": 1.8234, + "step": 4630 + }, + { + "epoch": 0.5744562186279978, + "grad_norm": 0.1322360423319378, + "learning_rate": 9.146522902239781e-05, + "loss": 1.85, + "step": 4635 + }, + { + "epoch": 0.5750759124992254, + "grad_norm": 0.1298579011473491, + "learning_rate": 9.124970518061877e-05, + "loss": 1.8664, + "step": 4640 + }, + { + "epoch": 0.575695606370453, + "grad_norm": 0.13432468619740123, + "learning_rate": 9.103422229030038e-05, + "loss": 1.8441, + "step": 4645 + }, + { + "epoch": 0.5763153002416807, + "grad_norm": 0.13199808903312424, + "learning_rate": 9.081878135990458e-05, + "loss": 1.7465, + "step": 4650 + }, + { + "epoch": 0.5769349941129083, + "grad_norm": 0.1348707902460552, + "learning_rate": 9.06033833976969e-05, + "loss": 1.8445, + "step": 4655 + }, + { + "epoch": 0.5775546879841358, + "grad_norm": 0.1323248488216555, + "learning_rate": 9.038802941174187e-05, + "loss": 1.7797, + "step": 4660 + }, + { + "epoch": 0.5781743818553634, + "grad_norm": 0.13490844471066665, + "learning_rate": 9.017272040989804e-05, + "loss": 1.8605, + "step": 4665 + }, + { + "epoch": 0.578794075726591, + "grad_norm": 0.129326446856862, + "learning_rate": 8.995745739981355e-05, + "loss": 1.8629, + "step": 4670 + }, + { + "epoch": 0.5794137695978187, + "grad_norm": 0.1405323300926802, + "learning_rate": 8.974224138892127e-05, + "loss": 1.852, + "step": 4675 + }, + { + "epoch": 0.5800334634690463, + "grad_norm": 0.13103817592164685, + "learning_rate": 8.952707338443418e-05, + "loss": 1.8113, + "step": 4680 + }, + { + "epoch": 0.5806531573402739, + "grad_norm": 0.1370095616856299, + "learning_rate": 8.931195439334048e-05, + "loss": 1.8484, + "step": 4685 + }, + { + "epoch": 0.5812728512115015, + "grad_norm": 0.13307208775504528, + "learning_rate": 8.90968854223991e-05, + "loss": 1.8207, + "step": 4690 + }, + { + "epoch": 0.5818925450827291, + "grad_norm": 0.13560010132734596, + "learning_rate": 8.888186747813473e-05, + "loss": 1.8574, + "step": 4695 + }, + { + "epoch": 0.5825122389539568, + "grad_norm": 0.13395887302734633, + "learning_rate": 8.866690156683332e-05, + "loss": 1.827, + "step": 4700 + }, + { + "epoch": 0.5831319328251844, + "grad_norm": 0.13429729678044644, + "learning_rate": 8.845198869453742e-05, + "loss": 1.8605, + "step": 4705 + }, + { + "epoch": 0.583751626696412, + "grad_norm": 0.1363221017644863, + "learning_rate": 8.823712986704121e-05, + "loss": 1.7816, + "step": 4710 + }, + { + "epoch": 0.5843713205676396, + "grad_norm": 0.1335714214307853, + "learning_rate": 8.802232608988604e-05, + "loss": 1.8316, + "step": 4715 + }, + { + "epoch": 0.5849910144388673, + "grad_norm": 0.15216425952602824, + "learning_rate": 8.780757836835544e-05, + "loss": 1.8223, + "step": 4720 + }, + { + "epoch": 0.5856107083100948, + "grad_norm": 0.13275202428606528, + "learning_rate": 8.759288770747087e-05, + "loss": 1.8527, + "step": 4725 + }, + { + "epoch": 0.5862304021813224, + "grad_norm": 0.1549023027007513, + "learning_rate": 8.737825511198654e-05, + "loss": 1.8434, + "step": 4730 + }, + { + "epoch": 0.58685009605255, + "grad_norm": 0.14026036781473247, + "learning_rate": 8.7163681586385e-05, + "loss": 1.8254, + "step": 4735 + }, + { + "epoch": 0.5874697899237776, + "grad_norm": 0.13233339755282925, + "learning_rate": 8.694916813487233e-05, + "loss": 1.834, + "step": 4740 + }, + { + "epoch": 0.5880894837950053, + "grad_norm": 0.13863065649650821, + "learning_rate": 8.67347157613735e-05, + "loss": 1.7969, + "step": 4745 + }, + { + "epoch": 0.5887091776662329, + "grad_norm": 0.13446458501695888, + "learning_rate": 8.652032546952754e-05, + "loss": 1.8039, + "step": 4750 + }, + { + "epoch": 0.5893288715374605, + "grad_norm": 0.1349155466617635, + "learning_rate": 8.630599826268303e-05, + "loss": 1.9152, + "step": 4755 + }, + { + "epoch": 0.5899485654086881, + "grad_norm": 0.1364181833568848, + "learning_rate": 8.609173514389328e-05, + "loss": 1.8367, + "step": 4760 + }, + { + "epoch": 0.5905682592799157, + "grad_norm": 0.13658301611381946, + "learning_rate": 8.587753711591166e-05, + "loss": 1.7961, + "step": 4765 + }, + { + "epoch": 0.5911879531511434, + "grad_norm": 0.13948936070832027, + "learning_rate": 8.566340518118695e-05, + "loss": 1.8777, + "step": 4770 + }, + { + "epoch": 0.591807647022371, + "grad_norm": 0.13193053858601933, + "learning_rate": 8.544934034185854e-05, + "loss": 1.8289, + "step": 4775 + }, + { + "epoch": 0.5924273408935986, + "grad_norm": 0.1291550431062787, + "learning_rate": 8.523534359975189e-05, + "loss": 1.8105, + "step": 4780 + }, + { + "epoch": 0.5930470347648262, + "grad_norm": 0.1337244591947349, + "learning_rate": 8.502141595637371e-05, + "loss": 1.8254, + "step": 4785 + }, + { + "epoch": 0.5936667286360537, + "grad_norm": 0.13532142100296568, + "learning_rate": 8.480755841290736e-05, + "loss": 1.8703, + "step": 4790 + }, + { + "epoch": 0.5942864225072814, + "grad_norm": 0.1377470717648544, + "learning_rate": 8.459377197020813e-05, + "loss": 1.8426, + "step": 4795 + }, + { + "epoch": 0.594906116378509, + "grad_norm": 0.13709338754981154, + "learning_rate": 8.438005762879856e-05, + "loss": 1.8359, + "step": 4800 + }, + { + "epoch": 0.5955258102497366, + "grad_norm": 0.13560842681432764, + "learning_rate": 8.416641638886369e-05, + "loss": 1.8703, + "step": 4805 + }, + { + "epoch": 0.5961455041209642, + "grad_norm": 0.1419275087886935, + "learning_rate": 8.395284925024654e-05, + "loss": 1.8141, + "step": 4810 + }, + { + "epoch": 0.5967651979921919, + "grad_norm": 0.14348376755535794, + "learning_rate": 8.373935721244329e-05, + "loss": 1.8414, + "step": 4815 + }, + { + "epoch": 0.5973848918634195, + "grad_norm": 0.13863281114527268, + "learning_rate": 8.352594127459865e-05, + "loss": 1.7605, + "step": 4820 + }, + { + "epoch": 0.5980045857346471, + "grad_norm": 0.1374738138416099, + "learning_rate": 8.331260243550119e-05, + "loss": 1.8258, + "step": 4825 + }, + { + "epoch": 0.5986242796058747, + "grad_norm": 0.13038481110656677, + "learning_rate": 8.309934169357862e-05, + "loss": 1.8102, + "step": 4830 + }, + { + "epoch": 0.5992439734771023, + "grad_norm": 0.13222459650512203, + "learning_rate": 8.28861600468932e-05, + "loss": 1.7926, + "step": 4835 + }, + { + "epoch": 0.59986366734833, + "grad_norm": 0.1341387150049404, + "learning_rate": 8.267305849313702e-05, + "loss": 1.8582, + "step": 4840 + }, + { + "epoch": 0.6004833612195576, + "grad_norm": 0.14125522067847465, + "learning_rate": 8.246003802962732e-05, + "loss": 1.7863, + "step": 4845 + }, + { + "epoch": 0.6011030550907851, + "grad_norm": 0.13606861615713078, + "learning_rate": 8.224709965330182e-05, + "loss": 1.916, + "step": 4850 + }, + { + "epoch": 0.6017227489620127, + "grad_norm": 0.14156145445397822, + "learning_rate": 8.203424436071413e-05, + "loss": 1.8277, + "step": 4855 + }, + { + "epoch": 0.6023424428332403, + "grad_norm": 0.13589787935379263, + "learning_rate": 8.182147314802892e-05, + "loss": 1.8293, + "step": 4860 + }, + { + "epoch": 0.602962136704468, + "grad_norm": 0.13209541273739087, + "learning_rate": 8.160878701101751e-05, + "loss": 1.8113, + "step": 4865 + }, + { + "epoch": 0.6035818305756956, + "grad_norm": 0.1422468671145226, + "learning_rate": 8.139618694505292e-05, + "loss": 1.7863, + "step": 4870 + }, + { + "epoch": 0.6042015244469232, + "grad_norm": 0.13502272532252504, + "learning_rate": 8.118367394510544e-05, + "loss": 1.8195, + "step": 4875 + }, + { + "epoch": 0.6048212183181508, + "grad_norm": 0.1288915778981244, + "learning_rate": 8.097124900573795e-05, + "loss": 1.7996, + "step": 4880 + }, + { + "epoch": 0.6054409121893785, + "grad_norm": 0.13113784023771752, + "learning_rate": 8.075891312110104e-05, + "loss": 1.8668, + "step": 4885 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.13594399497080922, + "learning_rate": 8.054666728492864e-05, + "loss": 1.818, + "step": 4890 + }, + { + "epoch": 0.6066802999318337, + "grad_norm": 0.13803949866445053, + "learning_rate": 8.033451249053324e-05, + "loss": 1.8051, + "step": 4895 + }, + { + "epoch": 0.6072999938030613, + "grad_norm": 0.1319319227151321, + "learning_rate": 8.01224497308012e-05, + "loss": 1.9059, + "step": 4900 + }, + { + "epoch": 0.6079196876742889, + "grad_norm": 0.13577815653327518, + "learning_rate": 7.991047999818825e-05, + "loss": 1.8473, + "step": 4905 + }, + { + "epoch": 0.6085393815455166, + "grad_norm": 0.13549175693333843, + "learning_rate": 7.969860428471472e-05, + "loss": 1.8254, + "step": 4910 + }, + { + "epoch": 0.6091590754167441, + "grad_norm": 0.13452651891535064, + "learning_rate": 7.948682358196081e-05, + "loss": 1.8109, + "step": 4915 + }, + { + "epoch": 0.6097787692879717, + "grad_norm": 0.13296839704090957, + "learning_rate": 7.927513888106222e-05, + "loss": 1.8684, + "step": 4920 + }, + { + "epoch": 0.6103984631591993, + "grad_norm": 0.13298206942158278, + "learning_rate": 7.90635511727053e-05, + "loss": 1.8617, + "step": 4925 + }, + { + "epoch": 0.611018157030427, + "grad_norm": 0.13471862453317543, + "learning_rate": 7.88520614471225e-05, + "loss": 1.8207, + "step": 4930 + }, + { + "epoch": 0.6116378509016546, + "grad_norm": 0.1356902660201222, + "learning_rate": 7.864067069408773e-05, + "loss": 1.841, + "step": 4935 + }, + { + "epoch": 0.6122575447728822, + "grad_norm": 0.13190339436344312, + "learning_rate": 7.842937990291157e-05, + "loss": 1.7984, + "step": 4940 + }, + { + "epoch": 0.6128772386441098, + "grad_norm": 0.13568034294738784, + "learning_rate": 7.821819006243691e-05, + "loss": 1.8543, + "step": 4945 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 0.1417235964582138, + "learning_rate": 7.80071021610342e-05, + "loss": 1.9066, + "step": 4950 + }, + { + "epoch": 0.6141166263865651, + "grad_norm": 0.13549923024658295, + "learning_rate": 7.779611718659674e-05, + "loss": 1.8219, + "step": 4955 + }, + { + "epoch": 0.6147363202577927, + "grad_norm": 0.13335577403941054, + "learning_rate": 7.758523612653617e-05, + "loss": 1.7711, + "step": 4960 + }, + { + "epoch": 0.6153560141290203, + "grad_norm": 0.13770283681260875, + "learning_rate": 7.737445996777784e-05, + "loss": 1.8008, + "step": 4965 + }, + { + "epoch": 0.6159757080002479, + "grad_norm": 0.1350132818669121, + "learning_rate": 7.716378969675599e-05, + "loss": 1.8332, + "step": 4970 + }, + { + "epoch": 0.6165954018714755, + "grad_norm": 0.1379620012305667, + "learning_rate": 7.695322629940957e-05, + "loss": 1.8152, + "step": 4975 + }, + { + "epoch": 0.6172150957427031, + "grad_norm": 0.14223438579861916, + "learning_rate": 7.674277076117718e-05, + "loss": 1.8676, + "step": 4980 + }, + { + "epoch": 0.6178347896139307, + "grad_norm": 0.13631387246079188, + "learning_rate": 7.653242406699267e-05, + "loss": 1.8195, + "step": 4985 + }, + { + "epoch": 0.6184544834851583, + "grad_norm": 0.13139403130867852, + "learning_rate": 7.632218720128056e-05, + "loss": 1.8789, + "step": 4990 + }, + { + "epoch": 0.6190741773563859, + "grad_norm": 0.1333637361950243, + "learning_rate": 7.611206114795126e-05, + "loss": 1.8102, + "step": 4995 + }, + { + "epoch": 0.6196938712276135, + "grad_norm": 0.13724109963380457, + "learning_rate": 7.590204689039671e-05, + "loss": 1.8473, + "step": 5000 + }, + { + "epoch": 0.6203135650988412, + "grad_norm": 0.13124493232609577, + "learning_rate": 7.569214541148551e-05, + "loss": 1.8051, + "step": 5005 + }, + { + "epoch": 0.6209332589700688, + "grad_norm": 0.13887359555527148, + "learning_rate": 7.548235769355858e-05, + "loss": 1.8527, + "step": 5010 + }, + { + "epoch": 0.6215529528412964, + "grad_norm": 0.14358380718074817, + "learning_rate": 7.527268471842436e-05, + "loss": 1.8605, + "step": 5015 + }, + { + "epoch": 0.622172646712524, + "grad_norm": 0.137106352425573, + "learning_rate": 7.506312746735432e-05, + "loss": 1.8133, + "step": 5020 + }, + { + "epoch": 0.6227923405837517, + "grad_norm": 0.1397418008866006, + "learning_rate": 7.48536869210783e-05, + "loss": 1.8531, + "step": 5025 + }, + { + "epoch": 0.6234120344549793, + "grad_norm": 0.1297107756247531, + "learning_rate": 7.464436405978002e-05, + "loss": 1.8203, + "step": 5030 + }, + { + "epoch": 0.6240317283262069, + "grad_norm": 0.13710425450182642, + "learning_rate": 7.44351598630924e-05, + "loss": 1.8586, + "step": 5035 + }, + { + "epoch": 0.6246514221974344, + "grad_norm": 0.1317466014455055, + "learning_rate": 7.422607531009302e-05, + "loss": 1.8246, + "step": 5040 + }, + { + "epoch": 0.625271116068662, + "grad_norm": 0.13346546234173723, + "learning_rate": 7.401711137929955e-05, + "loss": 1.8824, + "step": 5045 + }, + { + "epoch": 0.6258908099398897, + "grad_norm": 0.12791424524198686, + "learning_rate": 7.380826904866504e-05, + "loss": 1.8742, + "step": 5050 + }, + { + "epoch": 0.6265105038111173, + "grad_norm": 0.13961087823065946, + "learning_rate": 7.35995492955736e-05, + "loss": 1.8375, + "step": 5055 + }, + { + "epoch": 0.6271301976823449, + "grad_norm": 0.1294602335761151, + "learning_rate": 7.339095309683557e-05, + "loss": 1.852, + "step": 5060 + }, + { + "epoch": 0.6277498915535725, + "grad_norm": 0.1335668188096444, + "learning_rate": 7.31824814286831e-05, + "loss": 1.8281, + "step": 5065 + }, + { + "epoch": 0.6283695854248001, + "grad_norm": 0.1457801204886514, + "learning_rate": 7.297413526676553e-05, + "loss": 1.7699, + "step": 5070 + }, + { + "epoch": 0.6289892792960278, + "grad_norm": 0.1333202164334989, + "learning_rate": 7.276591558614482e-05, + "loss": 1.8578, + "step": 5075 + }, + { + "epoch": 0.6296089731672554, + "grad_norm": 0.12964753000237725, + "learning_rate": 7.2557823361291e-05, + "loss": 1.8234, + "step": 5080 + }, + { + "epoch": 0.630228667038483, + "grad_norm": 0.1335935100958171, + "learning_rate": 7.23498595660776e-05, + "loss": 1.8457, + "step": 5085 + }, + { + "epoch": 0.6308483609097106, + "grad_norm": 0.13566562420556766, + "learning_rate": 7.21420251737771e-05, + "loss": 1.8629, + "step": 5090 + }, + { + "epoch": 0.6314680547809383, + "grad_norm": 0.1350181379111902, + "learning_rate": 7.19343211570564e-05, + "loss": 1.8711, + "step": 5095 + }, + { + "epoch": 0.6320877486521659, + "grad_norm": 0.13697942997561122, + "learning_rate": 7.172674848797219e-05, + "loss": 1.7922, + "step": 5100 + }, + { + "epoch": 0.6327074425233934, + "grad_norm": 0.14031317331319004, + "learning_rate": 7.151930813796655e-05, + "loss": 1.8117, + "step": 5105 + }, + { + "epoch": 0.633327136394621, + "grad_norm": 0.13755025256595127, + "learning_rate": 7.131200107786214e-05, + "loss": 1.8062, + "step": 5110 + }, + { + "epoch": 0.6339468302658486, + "grad_norm": 0.13894392598493008, + "learning_rate": 7.110482827785796e-05, + "loss": 1.7805, + "step": 5115 + }, + { + "epoch": 0.6345665241370763, + "grad_norm": 0.14482768933598048, + "learning_rate": 7.089779070752463e-05, + "loss": 1.798, + "step": 5120 + }, + { + "epoch": 0.6351862180083039, + "grad_norm": 0.14158917920715833, + "learning_rate": 7.069088933579988e-05, + "loss": 1.8164, + "step": 5125 + }, + { + "epoch": 0.6358059118795315, + "grad_norm": 0.1353622024235782, + "learning_rate": 7.048412513098412e-05, + "loss": 1.8758, + "step": 5130 + }, + { + "epoch": 0.6364256057507591, + "grad_norm": 0.13637740239228766, + "learning_rate": 7.027749906073564e-05, + "loss": 1.7945, + "step": 5135 + }, + { + "epoch": 0.6370452996219867, + "grad_norm": 0.1388858528887692, + "learning_rate": 7.007101209206639e-05, + "loss": 1.8609, + "step": 5140 + }, + { + "epoch": 0.6376649934932144, + "grad_norm": 0.13169802010265266, + "learning_rate": 6.98646651913373e-05, + "loss": 1.8457, + "step": 5145 + }, + { + "epoch": 0.638284687364442, + "grad_norm": 0.1432891542870216, + "learning_rate": 6.965845932425377e-05, + "loss": 1.8676, + "step": 5150 + }, + { + "epoch": 0.6389043812356696, + "grad_norm": 0.13521941306370572, + "learning_rate": 6.945239545586117e-05, + "loss": 1.8477, + "step": 5155 + }, + { + "epoch": 0.6395240751068972, + "grad_norm": 0.13771072724035743, + "learning_rate": 6.924647455054036e-05, + "loss": 1.7895, + "step": 5160 + }, + { + "epoch": 0.6401437689781249, + "grad_norm": 0.135151419363287, + "learning_rate": 6.9040697572003e-05, + "loss": 1.8367, + "step": 5165 + }, + { + "epoch": 0.6407634628493524, + "grad_norm": 0.1384020407161956, + "learning_rate": 6.883506548328724e-05, + "loss": 1.8074, + "step": 5170 + }, + { + "epoch": 0.64138315672058, + "grad_norm": 0.14469575370154378, + "learning_rate": 6.862957924675323e-05, + "loss": 1.8559, + "step": 5175 + }, + { + "epoch": 0.6420028505918076, + "grad_norm": 0.1359705775673227, + "learning_rate": 6.842423982407842e-05, + "loss": 1.8418, + "step": 5180 + }, + { + "epoch": 0.6426225444630352, + "grad_norm": 0.1381154848467315, + "learning_rate": 6.821904817625326e-05, + "loss": 1.8617, + "step": 5185 + }, + { + "epoch": 0.6432422383342629, + "grad_norm": 0.13127462304573068, + "learning_rate": 6.801400526357647e-05, + "loss": 1.8523, + "step": 5190 + }, + { + "epoch": 0.6438619322054905, + "grad_norm": 0.14055038700162212, + "learning_rate": 6.78091120456508e-05, + "loss": 1.7988, + "step": 5195 + }, + { + "epoch": 0.6444816260767181, + "grad_norm": 0.13763358948089932, + "learning_rate": 6.760436948137843e-05, + "loss": 1.8621, + "step": 5200 + }, + { + "epoch": 0.6451013199479457, + "grad_norm": 0.13962503302729862, + "learning_rate": 6.739977852895647e-05, + "loss": 1.8379, + "step": 5205 + }, + { + "epoch": 0.6457210138191734, + "grad_norm": 0.14185188312406627, + "learning_rate": 6.719534014587243e-05, + "loss": 1.8309, + "step": 5210 + }, + { + "epoch": 0.646340707690401, + "grad_norm": 0.13486063326929387, + "learning_rate": 6.699105528889988e-05, + "loss": 1.873, + "step": 5215 + }, + { + "epoch": 0.6469604015616286, + "grad_norm": 0.1362204464921119, + "learning_rate": 6.678692491409378e-05, + "loss": 1.8094, + "step": 5220 + }, + { + "epoch": 0.6475800954328562, + "grad_norm": 0.1336262838647592, + "learning_rate": 6.658294997678621e-05, + "loss": 1.8859, + "step": 5225 + }, + { + "epoch": 0.6481997893040837, + "grad_norm": 0.1426175317216606, + "learning_rate": 6.637913143158175e-05, + "loss": 1.8219, + "step": 5230 + }, + { + "epoch": 0.6488194831753114, + "grad_norm": 0.13139603334052216, + "learning_rate": 6.617547023235309e-05, + "loss": 1.8805, + "step": 5235 + }, + { + "epoch": 0.649439177046539, + "grad_norm": 0.13499721675068688, + "learning_rate": 6.597196733223651e-05, + "loss": 1.8562, + "step": 5240 + }, + { + "epoch": 0.6500588709177666, + "grad_norm": 0.14124375108338427, + "learning_rate": 6.576862368362747e-05, + "loss": 1.8074, + "step": 5245 + }, + { + "epoch": 0.6506785647889942, + "grad_norm": 0.13385751122133496, + "learning_rate": 6.556544023817613e-05, + "loss": 1.9098, + "step": 5250 + }, + { + "epoch": 0.6512982586602218, + "grad_norm": 0.13027632164466119, + "learning_rate": 6.536241794678288e-05, + "loss": 1.8102, + "step": 5255 + }, + { + "epoch": 0.6519179525314495, + "grad_norm": 0.12988529012895247, + "learning_rate": 6.515955775959394e-05, + "loss": 1.8711, + "step": 5260 + }, + { + "epoch": 0.6525376464026771, + "grad_norm": 0.13882339488018036, + "learning_rate": 6.495686062599684e-05, + "loss": 1.7949, + "step": 5265 + }, + { + "epoch": 0.6531573402739047, + "grad_norm": 0.133952007292679, + "learning_rate": 6.475432749461607e-05, + "loss": 1.8203, + "step": 5270 + }, + { + "epoch": 0.6537770341451323, + "grad_norm": 0.1348251697867989, + "learning_rate": 6.455195931330855e-05, + "loss": 1.8379, + "step": 5275 + }, + { + "epoch": 0.65439672801636, + "grad_norm": 0.13015983597998307, + "learning_rate": 6.43497570291592e-05, + "loss": 1.8641, + "step": 5280 + }, + { + "epoch": 0.6550164218875876, + "grad_norm": 0.14151362596069492, + "learning_rate": 6.414772158847661e-05, + "loss": 1.8773, + "step": 5285 + }, + { + "epoch": 0.6556361157588152, + "grad_norm": 0.1392005904812876, + "learning_rate": 6.394585393678851e-05, + "loss": 1.8027, + "step": 5290 + }, + { + "epoch": 0.6562558096300427, + "grad_norm": 0.13728007241822773, + "learning_rate": 6.374415501883741e-05, + "loss": 1.8016, + "step": 5295 + }, + { + "epoch": 0.6568755035012703, + "grad_norm": 0.13755360326233745, + "learning_rate": 6.354262577857606e-05, + "loss": 1.8641, + "step": 5300 + }, + { + "epoch": 0.657495197372498, + "grad_norm": 0.1407832990027108, + "learning_rate": 6.334126715916318e-05, + "loss": 1.8004, + "step": 5305 + }, + { + "epoch": 0.6581148912437256, + "grad_norm": 0.13848872290550496, + "learning_rate": 6.314008010295897e-05, + "loss": 1.7961, + "step": 5310 + }, + { + "epoch": 0.6587345851149532, + "grad_norm": 0.13814509728706104, + "learning_rate": 6.293906555152072e-05, + "loss": 1.8469, + "step": 5315 + }, + { + "epoch": 0.6593542789861808, + "grad_norm": 0.1310131249847111, + "learning_rate": 6.273822444559839e-05, + "loss": 1.8582, + "step": 5320 + }, + { + "epoch": 0.6599739728574084, + "grad_norm": 0.13818644699764637, + "learning_rate": 6.253755772513024e-05, + "loss": 1.7691, + "step": 5325 + }, + { + "epoch": 0.6605936667286361, + "grad_norm": 0.13439005537004714, + "learning_rate": 6.233706632923832e-05, + "loss": 1.8617, + "step": 5330 + }, + { + "epoch": 0.6612133605998637, + "grad_norm": 0.14117004629994614, + "learning_rate": 6.213675119622425e-05, + "loss": 1.8156, + "step": 5335 + }, + { + "epoch": 0.6618330544710913, + "grad_norm": 0.13441504473083238, + "learning_rate": 6.19366132635647e-05, + "loss": 1.8191, + "step": 5340 + }, + { + "epoch": 0.6624527483423189, + "grad_norm": 0.1376510356237994, + "learning_rate": 6.173665346790704e-05, + "loss": 1.9027, + "step": 5345 + }, + { + "epoch": 0.6630724422135466, + "grad_norm": 0.13260573599872516, + "learning_rate": 6.153687274506501e-05, + "loss": 1.8062, + "step": 5350 + }, + { + "epoch": 0.6636921360847742, + "grad_norm": 0.14452496979165114, + "learning_rate": 6.133727203001415e-05, + "loss": 1.8031, + "step": 5355 + }, + { + "epoch": 0.6643118299560017, + "grad_norm": 0.13310538735250482, + "learning_rate": 6.113785225688772e-05, + "loss": 1.8523, + "step": 5360 + }, + { + "epoch": 0.6649315238272293, + "grad_norm": 0.14020085644370098, + "learning_rate": 6.093861435897208e-05, + "loss": 1.8535, + "step": 5365 + }, + { + "epoch": 0.6655512176984569, + "grad_norm": 0.13281311353444464, + "learning_rate": 6.073955926870243e-05, + "loss": 1.8172, + "step": 5370 + }, + { + "epoch": 0.6661709115696846, + "grad_norm": 0.14163086589006268, + "learning_rate": 6.0540687917658445e-05, + "loss": 1.7875, + "step": 5375 + }, + { + "epoch": 0.6667906054409122, + "grad_norm": 0.1311369179551235, + "learning_rate": 6.034200123655993e-05, + "loss": 1.8496, + "step": 5380 + }, + { + "epoch": 0.6674102993121398, + "grad_norm": 0.13835525169206805, + "learning_rate": 6.01435001552623e-05, + "loss": 1.8555, + "step": 5385 + }, + { + "epoch": 0.6680299931833674, + "grad_norm": 0.13402711274212736, + "learning_rate": 5.9945185602752496e-05, + "loss": 1.8148, + "step": 5390 + }, + { + "epoch": 0.668649687054595, + "grad_norm": 0.1406576203494184, + "learning_rate": 5.974705850714444e-05, + "loss": 1.8578, + "step": 5395 + }, + { + "epoch": 0.6692693809258227, + "grad_norm": 0.1334842815628208, + "learning_rate": 5.95491197956748e-05, + "loss": 1.8332, + "step": 5400 + }, + { + "epoch": 0.6698890747970503, + "grad_norm": 0.13248412735805298, + "learning_rate": 5.9351370394698604e-05, + "loss": 1.8344, + "step": 5405 + }, + { + "epoch": 0.6705087686682779, + "grad_norm": 0.13323431139084796, + "learning_rate": 5.9153811229684794e-05, + "loss": 1.8582, + "step": 5410 + }, + { + "epoch": 0.6711284625395055, + "grad_norm": 0.13598450846192175, + "learning_rate": 5.895644322521212e-05, + "loss": 1.8555, + "step": 5415 + }, + { + "epoch": 0.671748156410733, + "grad_norm": 0.13924831817973832, + "learning_rate": 5.875926730496471e-05, + "loss": 1.8695, + "step": 5420 + }, + { + "epoch": 0.6723678502819607, + "grad_norm": 0.13200834015914215, + "learning_rate": 5.856228439172764e-05, + "loss": 1.9035, + "step": 5425 + }, + { + "epoch": 0.6729875441531883, + "grad_norm": 0.13431588152455146, + "learning_rate": 5.836549540738281e-05, + "loss": 1.827, + "step": 5430 + }, + { + "epoch": 0.6736072380244159, + "grad_norm": 0.13074786666140048, + "learning_rate": 5.816890127290446e-05, + "loss": 1.8262, + "step": 5435 + }, + { + "epoch": 0.6742269318956435, + "grad_norm": 0.14226796953838972, + "learning_rate": 5.7972502908354954e-05, + "loss": 1.8348, + "step": 5440 + }, + { + "epoch": 0.6748466257668712, + "grad_norm": 0.13089302513048215, + "learning_rate": 5.777630123288046e-05, + "loss": 1.8863, + "step": 5445 + }, + { + "epoch": 0.6754663196380988, + "grad_norm": 0.13586048592762726, + "learning_rate": 5.758029716470664e-05, + "loss": 1.8871, + "step": 5450 + }, + { + "epoch": 0.6760860135093264, + "grad_norm": 0.1445088669511517, + "learning_rate": 5.738449162113435e-05, + "loss": 1.8043, + "step": 5455 + }, + { + "epoch": 0.676705707380554, + "grad_norm": 0.1396596555264478, + "learning_rate": 5.7188885518535365e-05, + "loss": 1.8555, + "step": 5460 + }, + { + "epoch": 0.6773254012517816, + "grad_norm": 0.1356062175747125, + "learning_rate": 5.699347977234799e-05, + "loss": 1.8465, + "step": 5465 + }, + { + "epoch": 0.6779450951230093, + "grad_norm": 0.14319873842375655, + "learning_rate": 5.679827529707295e-05, + "loss": 1.7801, + "step": 5470 + }, + { + "epoch": 0.6785647889942369, + "grad_norm": 0.13252156474119606, + "learning_rate": 5.660327300626902e-05, + "loss": 1.873, + "step": 5475 + }, + { + "epoch": 0.6791844828654645, + "grad_norm": 0.14296503617575715, + "learning_rate": 5.640847381254869e-05, + "loss": 1.8297, + "step": 5480 + }, + { + "epoch": 0.679804176736692, + "grad_norm": 0.13214280074229467, + "learning_rate": 5.621387862757397e-05, + "loss": 1.8082, + "step": 5485 + }, + { + "epoch": 0.6804238706079196, + "grad_norm": 0.13982102437227564, + "learning_rate": 5.6019488362052255e-05, + "loss": 1.8121, + "step": 5490 + }, + { + "epoch": 0.6810435644791473, + "grad_norm": 0.1354634885782795, + "learning_rate": 5.582530392573164e-05, + "loss": 1.823, + "step": 5495 + }, + { + "epoch": 0.6816632583503749, + "grad_norm": 0.13692962283590215, + "learning_rate": 5.563132622739713e-05, + "loss": 1.8406, + "step": 5500 + }, + { + "epoch": 0.6822829522216025, + "grad_norm": 0.1344469592023015, + "learning_rate": 5.5437556174866156e-05, + "loss": 1.7934, + "step": 5505 + }, + { + "epoch": 0.6829026460928301, + "grad_norm": 0.1360058691875614, + "learning_rate": 5.5243994674984345e-05, + "loss": 1.8754, + "step": 5510 + }, + { + "epoch": 0.6835223399640578, + "grad_norm": 0.14351473803114279, + "learning_rate": 5.505064263362136e-05, + "loss": 1.866, + "step": 5515 + }, + { + "epoch": 0.6841420338352854, + "grad_norm": 0.1322311208239839, + "learning_rate": 5.485750095566644e-05, + "loss": 1.9109, + "step": 5520 + }, + { + "epoch": 0.684761727706513, + "grad_norm": 0.1384464211302108, + "learning_rate": 5.46645705450245e-05, + "loss": 1.8535, + "step": 5525 + }, + { + "epoch": 0.6853814215777406, + "grad_norm": 0.13694935698109853, + "learning_rate": 5.447185230461156e-05, + "loss": 1.784, + "step": 5530 + }, + { + "epoch": 0.6860011154489682, + "grad_norm": 0.13546497838866572, + "learning_rate": 5.427934713635088e-05, + "loss": 1.8625, + "step": 5535 + }, + { + "epoch": 0.6866208093201959, + "grad_norm": 0.1400633356132821, + "learning_rate": 5.4087055941168384e-05, + "loss": 1.8258, + "step": 5540 + }, + { + "epoch": 0.6872405031914235, + "grad_norm": 0.14126924542721914, + "learning_rate": 5.389497961898866e-05, + "loss": 1.8609, + "step": 5545 + }, + { + "epoch": 0.687860197062651, + "grad_norm": 0.1320352339912381, + "learning_rate": 5.370311906873062e-05, + "loss": 1.8625, + "step": 5550 + }, + { + "epoch": 0.6884798909338786, + "grad_norm": 0.13076785413411318, + "learning_rate": 5.351147518830345e-05, + "loss": 1.8238, + "step": 5555 + }, + { + "epoch": 0.6890995848051062, + "grad_norm": 0.13316624015952805, + "learning_rate": 5.3320048874602266e-05, + "loss": 1.8465, + "step": 5560 + }, + { + "epoch": 0.6897192786763339, + "grad_norm": 0.13440609604282788, + "learning_rate": 5.3128841023504e-05, + "loss": 1.8742, + "step": 5565 + }, + { + "epoch": 0.6903389725475615, + "grad_norm": 0.13545277719506474, + "learning_rate": 5.293785252986321e-05, + "loss": 1.8367, + "step": 5570 + }, + { + "epoch": 0.6909586664187891, + "grad_norm": 0.1417270903396745, + "learning_rate": 5.274708428750765e-05, + "loss": 1.7941, + "step": 5575 + }, + { + "epoch": 0.6915783602900167, + "grad_norm": 0.12988305752922388, + "learning_rate": 5.255653718923463e-05, + "loss": 1.8363, + "step": 5580 + }, + { + "epoch": 0.6921980541612444, + "grad_norm": 0.13799787499371047, + "learning_rate": 5.236621212680628e-05, + "loss": 1.8273, + "step": 5585 + }, + { + "epoch": 0.692817748032472, + "grad_norm": 0.13054648174080216, + "learning_rate": 5.217610999094563e-05, + "loss": 1.8426, + "step": 5590 + }, + { + "epoch": 0.6934374419036996, + "grad_norm": 0.1510525206781296, + "learning_rate": 5.1986231671332454e-05, + "loss": 1.8438, + "step": 5595 + }, + { + "epoch": 0.6940571357749272, + "grad_norm": 0.1333950578838988, + "learning_rate": 5.179657805659908e-05, + "loss": 1.8633, + "step": 5600 + }, + { + "epoch": 0.6946768296461548, + "grad_norm": 0.1349388448110292, + "learning_rate": 5.160715003432608e-05, + "loss": 1.8672, + "step": 5605 + }, + { + "epoch": 0.6952965235173824, + "grad_norm": 0.13659731635234268, + "learning_rate": 5.1417948491038416e-05, + "loss": 1.8078, + "step": 5610 + }, + { + "epoch": 0.69591621738861, + "grad_norm": 0.149573593747872, + "learning_rate": 5.122897431220104e-05, + "loss": 1.8465, + "step": 5615 + }, + { + "epoch": 0.6965359112598376, + "grad_norm": 0.14078615205837303, + "learning_rate": 5.104022838221487e-05, + "loss": 1.7676, + "step": 5620 + }, + { + "epoch": 0.6971556051310652, + "grad_norm": 0.1380562039018842, + "learning_rate": 5.085171158441261e-05, + "loss": 1.8547, + "step": 5625 + }, + { + "epoch": 0.6977752990022928, + "grad_norm": 0.15447513080222938, + "learning_rate": 5.0663424801054595e-05, + "loss": 1.8164, + "step": 5630 + }, + { + "epoch": 0.6983949928735205, + "grad_norm": 0.13906912145428393, + "learning_rate": 5.047536891332473e-05, + "loss": 1.8, + "step": 5635 + }, + { + "epoch": 0.6990146867447481, + "grad_norm": 0.13378728455920846, + "learning_rate": 5.0287544801326293e-05, + "loss": 1.8328, + "step": 5640 + }, + { + "epoch": 0.6996343806159757, + "grad_norm": 0.1380817489955438, + "learning_rate": 5.0099953344077885e-05, + "loss": 1.843, + "step": 5645 + }, + { + "epoch": 0.7002540744872033, + "grad_norm": 0.13842527792443207, + "learning_rate": 4.991259541950924e-05, + "loss": 1.8582, + "step": 5650 + }, + { + "epoch": 0.700873768358431, + "grad_norm": 0.13436379086426098, + "learning_rate": 4.972547190445723e-05, + "loss": 1.8711, + "step": 5655 + }, + { + "epoch": 0.7014934622296586, + "grad_norm": 0.145219301193658, + "learning_rate": 4.953858367466155e-05, + "loss": 1.8957, + "step": 5660 + }, + { + "epoch": 0.7021131561008862, + "grad_norm": 0.14332316898614764, + "learning_rate": 4.9351931604760907e-05, + "loss": 1.8605, + "step": 5665 + }, + { + "epoch": 0.7027328499721138, + "grad_norm": 0.1313147508930661, + "learning_rate": 4.9165516568288674e-05, + "loss": 1.8637, + "step": 5670 + }, + { + "epoch": 0.7033525438433413, + "grad_norm": 0.13599383374559293, + "learning_rate": 4.897933943766897e-05, + "loss": 1.8156, + "step": 5675 + }, + { + "epoch": 0.703972237714569, + "grad_norm": 0.1310129091650981, + "learning_rate": 4.879340108421248e-05, + "loss": 1.8793, + "step": 5680 + }, + { + "epoch": 0.7045919315857966, + "grad_norm": 0.1358272525240438, + "learning_rate": 4.8607702378112415e-05, + "loss": 1.8398, + "step": 5685 + }, + { + "epoch": 0.7052116254570242, + "grad_norm": 0.13857163612920126, + "learning_rate": 4.842224418844045e-05, + "loss": 1.8082, + "step": 5690 + }, + { + "epoch": 0.7058313193282518, + "grad_norm": 0.13217816459332826, + "learning_rate": 4.823702738314262e-05, + "loss": 1.8242, + "step": 5695 + }, + { + "epoch": 0.7064510131994794, + "grad_norm": 0.13929364642164002, + "learning_rate": 4.8052052829035275e-05, + "loss": 1.882, + "step": 5700 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.13409665553182737, + "learning_rate": 4.7867321391801065e-05, + "loss": 1.8777, + "step": 5705 + }, + { + "epoch": 0.7076904009419347, + "grad_norm": 0.13751526203517048, + "learning_rate": 4.768283393598484e-05, + "loss": 1.8227, + "step": 5710 + }, + { + "epoch": 0.7083100948131623, + "grad_norm": 0.14070595152235213, + "learning_rate": 4.749859132498953e-05, + "loss": 1.8531, + "step": 5715 + }, + { + "epoch": 0.7089297886843899, + "grad_norm": 0.14049704552673886, + "learning_rate": 4.73145944210723e-05, + "loss": 1.8289, + "step": 5720 + }, + { + "epoch": 0.7095494825556176, + "grad_norm": 0.13542766328955186, + "learning_rate": 4.713084408534035e-05, + "loss": 1.7961, + "step": 5725 + }, + { + "epoch": 0.7101691764268452, + "grad_norm": 0.13784595495438537, + "learning_rate": 4.6947341177746926e-05, + "loss": 1.7789, + "step": 5730 + }, + { + "epoch": 0.7107888702980728, + "grad_norm": 0.1346102286615244, + "learning_rate": 4.6764086557087406e-05, + "loss": 1.8078, + "step": 5735 + }, + { + "epoch": 0.7114085641693003, + "grad_norm": 0.1399061673906458, + "learning_rate": 4.65810810809951e-05, + "loss": 1.8445, + "step": 5740 + }, + { + "epoch": 0.7120282580405279, + "grad_norm": 0.1392295817810214, + "learning_rate": 4.6398325605937265e-05, + "loss": 1.8391, + "step": 5745 + }, + { + "epoch": 0.7126479519117556, + "grad_norm": 0.13422887919186408, + "learning_rate": 4.621582098721124e-05, + "loss": 1.8699, + "step": 5750 + }, + { + "epoch": 0.7132676457829832, + "grad_norm": 0.13029477241952278, + "learning_rate": 4.6033568078940345e-05, + "loss": 1.8422, + "step": 5755 + }, + { + "epoch": 0.7138873396542108, + "grad_norm": 0.13926763308772894, + "learning_rate": 4.585156773406986e-05, + "loss": 1.8414, + "step": 5760 + }, + { + "epoch": 0.7145070335254384, + "grad_norm": 0.13603681047097105, + "learning_rate": 4.5669820804363116e-05, + "loss": 1.8285, + "step": 5765 + }, + { + "epoch": 0.715126727396666, + "grad_norm": 0.13825962890714413, + "learning_rate": 4.5488328140397364e-05, + "loss": 1.8645, + "step": 5770 + }, + { + "epoch": 0.7157464212678937, + "grad_norm": 0.13476991285028014, + "learning_rate": 4.530709059155994e-05, + "loss": 1.8305, + "step": 5775 + }, + { + "epoch": 0.7163661151391213, + "grad_norm": 0.13893798411440325, + "learning_rate": 4.512610900604434e-05, + "loss": 1.8445, + "step": 5780 + }, + { + "epoch": 0.7169858090103489, + "grad_norm": 0.13255106459645197, + "learning_rate": 4.4945384230846e-05, + "loss": 1.8152, + "step": 5785 + }, + { + "epoch": 0.7176055028815765, + "grad_norm": 0.13775635236702505, + "learning_rate": 4.476491711175854e-05, + "loss": 1.8016, + "step": 5790 + }, + { + "epoch": 0.7182251967528042, + "grad_norm": 0.134500964920823, + "learning_rate": 4.45847084933698e-05, + "loss": 1.8723, + "step": 5795 + }, + { + "epoch": 0.7188448906240317, + "grad_norm": 0.13597330490660237, + "learning_rate": 4.440475921905768e-05, + "loss": 1.7895, + "step": 5800 + }, + { + "epoch": 0.7194645844952593, + "grad_norm": 0.1315917432903297, + "learning_rate": 4.422507013098651e-05, + "loss": 1.8133, + "step": 5805 + }, + { + "epoch": 0.7200842783664869, + "grad_norm": 0.13875462075386993, + "learning_rate": 4.404564207010288e-05, + "loss": 1.857, + "step": 5810 + }, + { + "epoch": 0.7207039722377145, + "grad_norm": 0.13431544947110896, + "learning_rate": 4.3866475876131764e-05, + "loss": 1.7969, + "step": 5815 + }, + { + "epoch": 0.7213236661089422, + "grad_norm": 0.13155675895860156, + "learning_rate": 4.3687572387572605e-05, + "loss": 1.8605, + "step": 5820 + }, + { + "epoch": 0.7219433599801698, + "grad_norm": 0.13995415211194065, + "learning_rate": 4.350893244169541e-05, + "loss": 1.8555, + "step": 5825 + }, + { + "epoch": 0.7225630538513974, + "grad_norm": 0.13449577476692642, + "learning_rate": 4.333055687453673e-05, + "loss": 1.8195, + "step": 5830 + }, + { + "epoch": 0.723182747722625, + "grad_norm": 0.14401065172175462, + "learning_rate": 4.315244652089592e-05, + "loss": 1.7914, + "step": 5835 + }, + { + "epoch": 0.7238024415938527, + "grad_norm": 0.1389024704353045, + "learning_rate": 4.297460221433104e-05, + "loss": 1.8316, + "step": 5840 + }, + { + "epoch": 0.7244221354650803, + "grad_norm": 0.13282353788709725, + "learning_rate": 4.2797024787155114e-05, + "loss": 1.8426, + "step": 5845 + }, + { + "epoch": 0.7250418293363079, + "grad_norm": 0.13791586639383402, + "learning_rate": 4.2619715070432174e-05, + "loss": 1.8484, + "step": 5850 + }, + { + "epoch": 0.7256615232075355, + "grad_norm": 0.13459363626623866, + "learning_rate": 4.244267389397326e-05, + "loss": 1.825, + "step": 5855 + }, + { + "epoch": 0.7262812170787631, + "grad_norm": 0.13114527523173383, + "learning_rate": 4.226590208633275e-05, + "loss": 1.827, + "step": 5860 + }, + { + "epoch": 0.7269009109499907, + "grad_norm": 0.13950898565386535, + "learning_rate": 4.208940047480434e-05, + "loss": 1.8, + "step": 5865 + }, + { + "epoch": 0.7275206048212183, + "grad_norm": 0.13754357299749095, + "learning_rate": 4.191316988541721e-05, + "loss": 1.8449, + "step": 5870 + }, + { + "epoch": 0.7281402986924459, + "grad_norm": 0.1412420242297336, + "learning_rate": 4.173721114293214e-05, + "loss": 1.8195, + "step": 5875 + }, + { + "epoch": 0.7287599925636735, + "grad_norm": 0.13460299907847717, + "learning_rate": 4.156152507083767e-05, + "loss": 1.8094, + "step": 5880 + }, + { + "epoch": 0.7293796864349011, + "grad_norm": 0.1335314650244059, + "learning_rate": 4.1386112491346255e-05, + "loss": 1.8129, + "step": 5885 + }, + { + "epoch": 0.7299993803061288, + "grad_norm": 0.13934436501401282, + "learning_rate": 4.121097422539036e-05, + "loss": 1.8836, + "step": 5890 + }, + { + "epoch": 0.7306190741773564, + "grad_norm": 0.13065863803776073, + "learning_rate": 4.1036111092618725e-05, + "loss": 1.8102, + "step": 5895 + }, + { + "epoch": 0.731238768048584, + "grad_norm": 0.139176383450031, + "learning_rate": 4.0861523911392406e-05, + "loss": 1.784, + "step": 5900 + }, + { + "epoch": 0.7318584619198116, + "grad_norm": 0.1382728042720779, + "learning_rate": 4.068721349878107e-05, + "loss": 1.8781, + "step": 5905 + }, + { + "epoch": 0.7324781557910393, + "grad_norm": 0.13071903823760816, + "learning_rate": 4.051318067055898e-05, + "loss": 1.8363, + "step": 5910 + }, + { + "epoch": 0.7330978496622669, + "grad_norm": 0.13148997495963968, + "learning_rate": 4.033942624120143e-05, + "loss": 1.8223, + "step": 5915 + }, + { + "epoch": 0.7337175435334945, + "grad_norm": 0.13306227335689444, + "learning_rate": 4.0165951023880746e-05, + "loss": 1.8449, + "step": 5920 + }, + { + "epoch": 0.7343372374047221, + "grad_norm": 0.13278952164174923, + "learning_rate": 3.999275583046256e-05, + "loss": 1.7824, + "step": 5925 + }, + { + "epoch": 0.7349569312759496, + "grad_norm": 0.14316705856810816, + "learning_rate": 3.981984147150196e-05, + "loss": 1.8535, + "step": 5930 + }, + { + "epoch": 0.7355766251471773, + "grad_norm": 0.13612015046365858, + "learning_rate": 3.964720875623976e-05, + "loss": 1.8055, + "step": 5935 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 0.13203944006386598, + "learning_rate": 3.9474858492598653e-05, + "loss": 1.8031, + "step": 5940 + }, + { + "epoch": 0.7368160128896325, + "grad_norm": 0.1327410609332686, + "learning_rate": 3.930279148717948e-05, + "loss": 1.7957, + "step": 5945 + }, + { + "epoch": 0.7374357067608601, + "grad_norm": 0.1412184409098123, + "learning_rate": 3.913100854525742e-05, + "loss": 1.8562, + "step": 5950 + }, + { + "epoch": 0.7380554006320877, + "grad_norm": 0.14211921706904118, + "learning_rate": 3.895951047077821e-05, + "loss": 1.8051, + "step": 5955 + }, + { + "epoch": 0.7386750945033154, + "grad_norm": 0.14363924491542174, + "learning_rate": 3.8788298066354464e-05, + "loss": 1.8109, + "step": 5960 + }, + { + "epoch": 0.739294788374543, + "grad_norm": 0.13367304925942577, + "learning_rate": 3.8617372133261766e-05, + "loss": 1.8359, + "step": 5965 + }, + { + "epoch": 0.7399144822457706, + "grad_norm": 0.1283371909012973, + "learning_rate": 3.844673347143507e-05, + "loss": 1.8594, + "step": 5970 + }, + { + "epoch": 0.7405341761169982, + "grad_norm": 0.140355325772872, + "learning_rate": 3.827638287946489e-05, + "loss": 1.8551, + "step": 5975 + }, + { + "epoch": 0.7411538699882259, + "grad_norm": 0.13603925926463906, + "learning_rate": 3.8106321154593605e-05, + "loss": 1.8039, + "step": 5980 + }, + { + "epoch": 0.7417735638594535, + "grad_norm": 0.13333303827812676, + "learning_rate": 3.793654909271169e-05, + "loss": 1.7832, + "step": 5985 + }, + { + "epoch": 0.742393257730681, + "grad_norm": 0.14092753476620795, + "learning_rate": 3.776706748835388e-05, + "loss": 1.8727, + "step": 5990 + }, + { + "epoch": 0.7430129516019086, + "grad_norm": 0.13526664213664327, + "learning_rate": 3.759787713469569e-05, + "loss": 1.8406, + "step": 5995 + }, + { + "epoch": 0.7436326454731362, + "grad_norm": 0.13890336096217432, + "learning_rate": 3.7428978823549545e-05, + "loss": 1.8184, + "step": 6000 + }, + { + "epoch": 0.7442523393443639, + "grad_norm": 0.136193007857896, + "learning_rate": 3.726037334536109e-05, + "loss": 1.8562, + "step": 6005 + }, + { + "epoch": 0.7448720332155915, + "grad_norm": 0.14442667788304958, + "learning_rate": 3.709206148920553e-05, + "loss": 1.9062, + "step": 6010 + }, + { + "epoch": 0.7454917270868191, + "grad_norm": 0.13679257439666426, + "learning_rate": 3.692404404278395e-05, + "loss": 1.8246, + "step": 6015 + }, + { + "epoch": 0.7461114209580467, + "grad_norm": 0.13120658103553706, + "learning_rate": 3.675632179241946e-05, + "loss": 1.8305, + "step": 6020 + }, + { + "epoch": 0.7467311148292743, + "grad_norm": 0.14446220470432317, + "learning_rate": 3.658889552305376e-05, + "loss": 1.7777, + "step": 6025 + }, + { + "epoch": 0.747350808700502, + "grad_norm": 0.13770389116571016, + "learning_rate": 3.642176601824341e-05, + "loss": 1.8184, + "step": 6030 + }, + { + "epoch": 0.7479705025717296, + "grad_norm": 0.13267683993530827, + "learning_rate": 3.6254934060156e-05, + "loss": 1.8637, + "step": 6035 + }, + { + "epoch": 0.7485901964429572, + "grad_norm": 0.13804463607426604, + "learning_rate": 3.608840042956666e-05, + "loss": 1.8715, + "step": 6040 + }, + { + "epoch": 0.7492098903141848, + "grad_norm": 0.13803991697296203, + "learning_rate": 3.592216590585427e-05, + "loss": 1.8703, + "step": 6045 + }, + { + "epoch": 0.7498295841854125, + "grad_norm": 0.13771290613002898, + "learning_rate": 3.5756231266997965e-05, + "loss": 1.8266, + "step": 6050 + }, + { + "epoch": 0.75044927805664, + "grad_norm": 0.1377426301197608, + "learning_rate": 3.559059728957338e-05, + "loss": 1.8387, + "step": 6055 + }, + { + "epoch": 0.7510689719278676, + "grad_norm": 0.13142143493494793, + "learning_rate": 3.5425264748749074e-05, + "loss": 1.8754, + "step": 6060 + }, + { + "epoch": 0.7516886657990952, + "grad_norm": 0.13870584512058637, + "learning_rate": 3.5260234418282865e-05, + "loss": 1.8102, + "step": 6065 + }, + { + "epoch": 0.7523083596703228, + "grad_norm": 0.13263561172267807, + "learning_rate": 3.509550707051823e-05, + "loss": 1.8395, + "step": 6070 + }, + { + "epoch": 0.7529280535415505, + "grad_norm": 0.13347345228759588, + "learning_rate": 3.493108347638067e-05, + "loss": 1.8488, + "step": 6075 + }, + { + "epoch": 0.7535477474127781, + "grad_norm": 0.1348178977080782, + "learning_rate": 3.476696440537413e-05, + "loss": 1.7828, + "step": 6080 + }, + { + "epoch": 0.7541674412840057, + "grad_norm": 0.14134614205432544, + "learning_rate": 3.460315062557737e-05, + "loss": 1.793, + "step": 6085 + }, + { + "epoch": 0.7547871351552333, + "grad_norm": 0.1302945011937037, + "learning_rate": 3.443964290364041e-05, + "loss": 1.802, + "step": 6090 + }, + { + "epoch": 0.755406829026461, + "grad_norm": 0.13968371731188192, + "learning_rate": 3.4276442004780916e-05, + "loss": 1.9215, + "step": 6095 + }, + { + "epoch": 0.7560265228976886, + "grad_norm": 0.1455653462790958, + "learning_rate": 3.411354869278056e-05, + "loss": 1.8215, + "step": 6100 + }, + { + "epoch": 0.7566462167689162, + "grad_norm": 0.14760360535248854, + "learning_rate": 3.3950963729981565e-05, + "loss": 1.8395, + "step": 6105 + }, + { + "epoch": 0.7572659106401438, + "grad_norm": 0.13441648798291483, + "learning_rate": 3.378868787728308e-05, + "loss": 1.891, + "step": 6110 + }, + { + "epoch": 0.7578856045113714, + "grad_norm": 0.13945591613635544, + "learning_rate": 3.362672189413756e-05, + "loss": 1.832, + "step": 6115 + }, + { + "epoch": 0.758505298382599, + "grad_norm": 0.1356119340760843, + "learning_rate": 3.346506653854734e-05, + "loss": 1.8215, + "step": 6120 + }, + { + "epoch": 0.7591249922538266, + "grad_norm": 0.14180993594873473, + "learning_rate": 3.3303722567060956e-05, + "loss": 1.7512, + "step": 6125 + }, + { + "epoch": 0.7597446861250542, + "grad_norm": 0.12910186660875014, + "learning_rate": 3.31426907347697e-05, + "loss": 1.8566, + "step": 6130 + }, + { + "epoch": 0.7603643799962818, + "grad_norm": 0.14017170111652089, + "learning_rate": 3.2981971795304026e-05, + "loss": 1.8836, + "step": 6135 + }, + { + "epoch": 0.7609840738675094, + "grad_norm": 0.13260660637237812, + "learning_rate": 3.282156650083006e-05, + "loss": 1.8633, + "step": 6140 + }, + { + "epoch": 0.7616037677387371, + "grad_norm": 0.13413829542520983, + "learning_rate": 3.266147560204608e-05, + "loss": 1.8672, + "step": 6145 + }, + { + "epoch": 0.7622234616099647, + "grad_norm": 0.14500649061949047, + "learning_rate": 3.250169984817897e-05, + "loss": 1.8227, + "step": 6150 + }, + { + "epoch": 0.7628431554811923, + "grad_norm": 0.13143903720477662, + "learning_rate": 3.23422399869807e-05, + "loss": 1.8137, + "step": 6155 + }, + { + "epoch": 0.7634628493524199, + "grad_norm": 0.13853349840431195, + "learning_rate": 3.2183096764724915e-05, + "loss": 1.8305, + "step": 6160 + }, + { + "epoch": 0.7640825432236475, + "grad_norm": 0.13860343379835846, + "learning_rate": 3.2024270926203384e-05, + "loss": 1.7961, + "step": 6165 + }, + { + "epoch": 0.7647022370948752, + "grad_norm": 0.1384361618622998, + "learning_rate": 3.1865763214722474e-05, + "loss": 1.8492, + "step": 6170 + }, + { + "epoch": 0.7653219309661028, + "grad_norm": 0.13199549343512088, + "learning_rate": 3.1707574372099754e-05, + "loss": 1.8434, + "step": 6175 + }, + { + "epoch": 0.7659416248373303, + "grad_norm": 0.13451064848409564, + "learning_rate": 3.154970513866047e-05, + "loss": 1.8215, + "step": 6180 + }, + { + "epoch": 0.7665613187085579, + "grad_norm": 0.1378993997426509, + "learning_rate": 3.1392156253234086e-05, + "loss": 1.8219, + "step": 6185 + }, + { + "epoch": 0.7671810125797855, + "grad_norm": 0.13271589858843205, + "learning_rate": 3.123492845315086e-05, + "loss": 1.8707, + "step": 6190 + }, + { + "epoch": 0.7678007064510132, + "grad_norm": 0.13419734586499776, + "learning_rate": 3.1078022474238334e-05, + "loss": 1.8223, + "step": 6195 + }, + { + "epoch": 0.7684204003222408, + "grad_norm": 0.13926944052109394, + "learning_rate": 3.092143905081794e-05, + "loss": 1.8313, + "step": 6200 + }, + { + "epoch": 0.7690400941934684, + "grad_norm": 0.13678235355553914, + "learning_rate": 3.07651789157016e-05, + "loss": 1.8035, + "step": 6205 + }, + { + "epoch": 0.769659788064696, + "grad_norm": 0.13151996809359814, + "learning_rate": 3.060924280018811e-05, + "loss": 1.7555, + "step": 6210 + }, + { + "epoch": 0.7702794819359237, + "grad_norm": 0.14276236865321887, + "learning_rate": 3.0453631434059958e-05, + "loss": 1.7926, + "step": 6215 + }, + { + "epoch": 0.7708991758071513, + "grad_norm": 0.13592822741431418, + "learning_rate": 3.0298345545579787e-05, + "loss": 1.8324, + "step": 6220 + }, + { + "epoch": 0.7715188696783789, + "grad_norm": 0.15065267997122453, + "learning_rate": 3.0143385861486974e-05, + "loss": 1.8789, + "step": 6225 + }, + { + "epoch": 0.7721385635496065, + "grad_norm": 0.13878430796743169, + "learning_rate": 2.9988753106994306e-05, + "loss": 1.848, + "step": 6230 + }, + { + "epoch": 0.7727582574208341, + "grad_norm": 0.13441719402081825, + "learning_rate": 2.983444800578452e-05, + "loss": 1.8508, + "step": 6235 + }, + { + "epoch": 0.7733779512920618, + "grad_norm": 0.1409089714119921, + "learning_rate": 2.9680471280006848e-05, + "loss": 1.8441, + "step": 6240 + }, + { + "epoch": 0.7739976451632893, + "grad_norm": 0.13250438881223234, + "learning_rate": 2.9526823650273837e-05, + "loss": 1.8266, + "step": 6245 + }, + { + "epoch": 0.7746173390345169, + "grad_norm": 0.14901974178220379, + "learning_rate": 2.93735058356578e-05, + "loss": 1.7891, + "step": 6250 + }, + { + "epoch": 0.7752370329057445, + "grad_norm": 0.13981158606565733, + "learning_rate": 2.9220518553687526e-05, + "loss": 1.8555, + "step": 6255 + }, + { + "epoch": 0.7758567267769721, + "grad_norm": 0.13508067673905183, + "learning_rate": 2.9067862520344956e-05, + "loss": 1.8375, + "step": 6260 + }, + { + "epoch": 0.7764764206481998, + "grad_norm": 0.1329903186719515, + "learning_rate": 2.891553845006165e-05, + "loss": 1.825, + "step": 6265 + }, + { + "epoch": 0.7770961145194274, + "grad_norm": 0.1379727364911777, + "learning_rate": 2.87635470557157e-05, + "loss": 1.8465, + "step": 6270 + }, + { + "epoch": 0.777715808390655, + "grad_norm": 0.1410408068306655, + "learning_rate": 2.861188904862827e-05, + "loss": 1.8449, + "step": 6275 + }, + { + "epoch": 0.7783355022618826, + "grad_norm": 0.1314735833953845, + "learning_rate": 2.8460565138560212e-05, + "loss": 1.8219, + "step": 6280 + }, + { + "epoch": 0.7789551961331103, + "grad_norm": 0.13288247697952463, + "learning_rate": 2.830957603370883e-05, + "loss": 1.8398, + "step": 6285 + }, + { + "epoch": 0.7795748900043379, + "grad_norm": 0.13510806378576828, + "learning_rate": 2.815892244070455e-05, + "loss": 1.8801, + "step": 6290 + }, + { + "epoch": 0.7801945838755655, + "grad_norm": 0.13593787427552473, + "learning_rate": 2.8008605064607528e-05, + "loss": 1.8637, + "step": 6295 + }, + { + "epoch": 0.7808142777467931, + "grad_norm": 0.13570594578366504, + "learning_rate": 2.7858624608904515e-05, + "loss": 1.9043, + "step": 6300 + }, + { + "epoch": 0.7814339716180208, + "grad_norm": 0.13671078512318602, + "learning_rate": 2.7708981775505416e-05, + "loss": 1.8309, + "step": 6305 + }, + { + "epoch": 0.7820536654892483, + "grad_norm": 0.1411639120932341, + "learning_rate": 2.755967726474007e-05, + "loss": 1.8258, + "step": 6310 + }, + { + "epoch": 0.7826733593604759, + "grad_norm": 0.13774898564586507, + "learning_rate": 2.741071177535499e-05, + "loss": 1.8227, + "step": 6315 + }, + { + "epoch": 0.7832930532317035, + "grad_norm": 0.13076963037262845, + "learning_rate": 2.7262086004510023e-05, + "loss": 1.8277, + "step": 6320 + }, + { + "epoch": 0.7839127471029311, + "grad_norm": 0.1343651996214831, + "learning_rate": 2.7113800647775156e-05, + "loss": 1.7766, + "step": 6325 + }, + { + "epoch": 0.7845324409741588, + "grad_norm": 0.13468642096762548, + "learning_rate": 2.6965856399127232e-05, + "loss": 1.8441, + "step": 6330 + }, + { + "epoch": 0.7851521348453864, + "grad_norm": 0.14720986177976159, + "learning_rate": 2.6818253950946704e-05, + "loss": 1.8488, + "step": 6335 + }, + { + "epoch": 0.785771828716614, + "grad_norm": 0.1560079026652874, + "learning_rate": 2.6670993994014394e-05, + "loss": 1.875, + "step": 6340 + }, + { + "epoch": 0.7863915225878416, + "grad_norm": 0.13378317082115668, + "learning_rate": 2.6524077217508292e-05, + "loss": 1.8277, + "step": 6345 + }, + { + "epoch": 0.7870112164590692, + "grad_norm": 0.14218358804125789, + "learning_rate": 2.63775043090002e-05, + "loss": 1.8301, + "step": 6350 + }, + { + "epoch": 0.7876309103302969, + "grad_norm": 0.13527135821037153, + "learning_rate": 2.623127595445274e-05, + "loss": 1.7805, + "step": 6355 + }, + { + "epoch": 0.7882506042015245, + "grad_norm": 0.14734614025952317, + "learning_rate": 2.6085392838215938e-05, + "loss": 1.875, + "step": 6360 + }, + { + "epoch": 0.7888702980727521, + "grad_norm": 0.14019751079756237, + "learning_rate": 2.5939855643024136e-05, + "loss": 1.8344, + "step": 6365 + }, + { + "epoch": 0.7894899919439796, + "grad_norm": 0.1363368429297401, + "learning_rate": 2.5794665049992762e-05, + "loss": 1.7402, + "step": 6370 + }, + { + "epoch": 0.7901096858152072, + "grad_norm": 0.13954747652635188, + "learning_rate": 2.564982173861512e-05, + "loss": 1.8367, + "step": 6375 + }, + { + "epoch": 0.7907293796864349, + "grad_norm": 0.13494365879866646, + "learning_rate": 2.5505326386759254e-05, + "loss": 1.8523, + "step": 6380 + }, + { + "epoch": 0.7913490735576625, + "grad_norm": 0.13314683645391906, + "learning_rate": 2.536117967066476e-05, + "loss": 1.7867, + "step": 6385 + }, + { + "epoch": 0.7919687674288901, + "grad_norm": 0.13327408799707985, + "learning_rate": 2.521738226493957e-05, + "loss": 1.8102, + "step": 6390 + }, + { + "epoch": 0.7925884613001177, + "grad_norm": 0.13992946116216445, + "learning_rate": 2.50739348425569e-05, + "loss": 1.7902, + "step": 6395 + }, + { + "epoch": 0.7932081551713454, + "grad_norm": 0.14100811061457322, + "learning_rate": 2.4930838074852026e-05, + "loss": 1.8656, + "step": 6400 + }, + { + "epoch": 0.793827849042573, + "grad_norm": 0.1474245956678477, + "learning_rate": 2.47880926315191e-05, + "loss": 1.8477, + "step": 6405 + }, + { + "epoch": 0.7944475429138006, + "grad_norm": 0.13533489898855472, + "learning_rate": 2.4645699180608127e-05, + "loss": 1.8414, + "step": 6410 + }, + { + "epoch": 0.7950672367850282, + "grad_norm": 0.136517801601565, + "learning_rate": 2.45036583885218e-05, + "loss": 1.8234, + "step": 6415 + }, + { + "epoch": 0.7956869306562558, + "grad_norm": 0.13002283300770023, + "learning_rate": 2.4361970920012313e-05, + "loss": 1.9266, + "step": 6420 + }, + { + "epoch": 0.7963066245274835, + "grad_norm": 0.1379400020050846, + "learning_rate": 2.4220637438178317e-05, + "loss": 1.8352, + "step": 6425 + }, + { + "epoch": 0.7969263183987111, + "grad_norm": 0.1336448212657819, + "learning_rate": 2.4079658604461896e-05, + "loss": 1.8168, + "step": 6430 + }, + { + "epoch": 0.7975460122699386, + "grad_norm": 0.13573713481362645, + "learning_rate": 2.393903507864521e-05, + "loss": 1.8309, + "step": 6435 + }, + { + "epoch": 0.7981657061411662, + "grad_norm": 0.1386764771891764, + "learning_rate": 2.3798767518847687e-05, + "loss": 1.859, + "step": 6440 + }, + { + "epoch": 0.7987854000123938, + "grad_norm": 0.13800165635093364, + "learning_rate": 2.3658856581522804e-05, + "loss": 1.8344, + "step": 6445 + }, + { + "epoch": 0.7994050938836215, + "grad_norm": 0.14205563126041354, + "learning_rate": 2.3519302921455033e-05, + "loss": 1.8223, + "step": 6450 + }, + { + "epoch": 0.8000247877548491, + "grad_norm": 0.1367572578358323, + "learning_rate": 2.338010719175684e-05, + "loss": 1.7699, + "step": 6455 + }, + { + "epoch": 0.8006444816260767, + "grad_norm": 0.13208109576129815, + "learning_rate": 2.324127004386546e-05, + "loss": 1.8512, + "step": 6460 + }, + { + "epoch": 0.8012641754973043, + "grad_norm": 0.13593091577531968, + "learning_rate": 2.310279212754006e-05, + "loss": 1.8379, + "step": 6465 + }, + { + "epoch": 0.801883869368532, + "grad_norm": 0.1342950750798231, + "learning_rate": 2.296467409085853e-05, + "loss": 1.8473, + "step": 6470 + }, + { + "epoch": 0.8025035632397596, + "grad_norm": 0.14153433233252935, + "learning_rate": 2.2826916580214632e-05, + "loss": 1.775, + "step": 6475 + }, + { + "epoch": 0.8031232571109872, + "grad_norm": 0.1384192064540353, + "learning_rate": 2.2689520240314755e-05, + "loss": 1.8586, + "step": 6480 + }, + { + "epoch": 0.8037429509822148, + "grad_norm": 0.13797923662673559, + "learning_rate": 2.2552485714175064e-05, + "loss": 1.8453, + "step": 6485 + }, + { + "epoch": 0.8043626448534424, + "grad_norm": 0.1318062253579002, + "learning_rate": 2.2415813643118356e-05, + "loss": 1.8266, + "step": 6490 + }, + { + "epoch": 0.8049823387246701, + "grad_norm": 0.1380940923690682, + "learning_rate": 2.227950466677121e-05, + "loss": 1.8016, + "step": 6495 + }, + { + "epoch": 0.8056020325958976, + "grad_norm": 0.13292787617402785, + "learning_rate": 2.21435594230609e-05, + "loss": 1.8113, + "step": 6500 + }, + { + "epoch": 0.8062217264671252, + "grad_norm": 0.14153364728285703, + "learning_rate": 2.2007978548212425e-05, + "loss": 1.7828, + "step": 6505 + }, + { + "epoch": 0.8068414203383528, + "grad_norm": 0.13985017879700073, + "learning_rate": 2.1872762676745563e-05, + "loss": 1.8672, + "step": 6510 + }, + { + "epoch": 0.8074611142095804, + "grad_norm": 0.1316374604121444, + "learning_rate": 2.1737912441471787e-05, + "loss": 1.7961, + "step": 6515 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.13947504872687227, + "learning_rate": 2.160342847349144e-05, + "loss": 1.8305, + "step": 6520 + }, + { + "epoch": 0.8087005019520357, + "grad_norm": 0.1415289846578962, + "learning_rate": 2.1469311402190794e-05, + "loss": 1.8211, + "step": 6525 + }, + { + "epoch": 0.8093201958232633, + "grad_norm": 0.13209481549795832, + "learning_rate": 2.133556185523895e-05, + "loss": 1.8215, + "step": 6530 + }, + { + "epoch": 0.8099398896944909, + "grad_norm": 0.13213273877464646, + "learning_rate": 2.120218045858503e-05, + "loss": 1.8617, + "step": 6535 + }, + { + "epoch": 0.8105595835657186, + "grad_norm": 0.14048998695177167, + "learning_rate": 2.1069167836455228e-05, + "loss": 1.8254, + "step": 6540 + }, + { + "epoch": 0.8111792774369462, + "grad_norm": 0.13984057341213088, + "learning_rate": 2.0936524611349795e-05, + "loss": 1.7922, + "step": 6545 + }, + { + "epoch": 0.8117989713081738, + "grad_norm": 0.14234566575879473, + "learning_rate": 2.080425140404029e-05, + "loss": 1.85, + "step": 6550 + }, + { + "epoch": 0.8124186651794014, + "grad_norm": 0.13619722325941194, + "learning_rate": 2.0672348833566512e-05, + "loss": 1.8723, + "step": 6555 + }, + { + "epoch": 0.813038359050629, + "grad_norm": 0.13354456391525196, + "learning_rate": 2.0540817517233735e-05, + "loss": 1.8309, + "step": 6560 + }, + { + "epoch": 0.8136580529218566, + "grad_norm": 0.1306309793562597, + "learning_rate": 2.0409658070609738e-05, + "loss": 1.8254, + "step": 6565 + }, + { + "epoch": 0.8142777467930842, + "grad_norm": 0.13739179800016613, + "learning_rate": 2.0278871107521936e-05, + "loss": 1.8023, + "step": 6570 + }, + { + "epoch": 0.8148974406643118, + "grad_norm": 0.13666140093469806, + "learning_rate": 2.014845724005453e-05, + "loss": 1.8449, + "step": 6575 + }, + { + "epoch": 0.8155171345355394, + "grad_norm": 0.13040603198376724, + "learning_rate": 2.0018417078545614e-05, + "loss": 1.8324, + "step": 6580 + }, + { + "epoch": 0.816136828406767, + "grad_norm": 0.14022816506048247, + "learning_rate": 1.988875123158437e-05, + "loss": 1.8051, + "step": 6585 + }, + { + "epoch": 0.8167565222779947, + "grad_norm": 0.13382954051159468, + "learning_rate": 1.975946030600814e-05, + "loss": 1.7973, + "step": 6590 + }, + { + "epoch": 0.8173762161492223, + "grad_norm": 0.14241355395415345, + "learning_rate": 1.9630544906899672e-05, + "loss": 1.791, + "step": 6595 + }, + { + "epoch": 0.8179959100204499, + "grad_norm": 0.13466759194537686, + "learning_rate": 1.9502005637584198e-05, + "loss": 1.8574, + "step": 6600 + }, + { + "epoch": 0.8186156038916775, + "grad_norm": 0.13508090700592257, + "learning_rate": 1.93738430996267e-05, + "loss": 1.8477, + "step": 6605 + }, + { + "epoch": 0.8192352977629052, + "grad_norm": 0.14482298397934196, + "learning_rate": 1.9246057892829038e-05, + "loss": 1.8273, + "step": 6610 + }, + { + "epoch": 0.8198549916341328, + "grad_norm": 0.1407416637413334, + "learning_rate": 1.9118650615227162e-05, + "loss": 1.8258, + "step": 6615 + }, + { + "epoch": 0.8204746855053604, + "grad_norm": 0.13884770681711742, + "learning_rate": 1.8991621863088315e-05, + "loss": 1.827, + "step": 6620 + }, + { + "epoch": 0.8210943793765879, + "grad_norm": 0.13406222131993933, + "learning_rate": 1.886497223090823e-05, + "loss": 1.8367, + "step": 6625 + }, + { + "epoch": 0.8217140732478155, + "grad_norm": 0.13345740447218, + "learning_rate": 1.8738702311408352e-05, + "loss": 1.85, + "step": 6630 + }, + { + "epoch": 0.8223337671190432, + "grad_norm": 0.1361039949242507, + "learning_rate": 1.8612812695533077e-05, + "loss": 1.9008, + "step": 6635 + }, + { + "epoch": 0.8229534609902708, + "grad_norm": 0.14532071771167876, + "learning_rate": 1.8487303972446966e-05, + "loss": 1.8203, + "step": 6640 + }, + { + "epoch": 0.8235731548614984, + "grad_norm": 0.13415668268856182, + "learning_rate": 1.836217672953201e-05, + "loss": 1.8738, + "step": 6645 + }, + { + "epoch": 0.824192848732726, + "grad_norm": 0.14031869927136761, + "learning_rate": 1.8237431552384887e-05, + "loss": 1.8176, + "step": 6650 + }, + { + "epoch": 0.8248125426039536, + "grad_norm": 0.13674539846811842, + "learning_rate": 1.811306902481412e-05, + "loss": 1.7992, + "step": 6655 + }, + { + "epoch": 0.8254322364751813, + "grad_norm": 0.14014061563694274, + "learning_rate": 1.798908972883754e-05, + "loss": 1.8609, + "step": 6660 + }, + { + "epoch": 0.8260519303464089, + "grad_norm": 0.14206135339382653, + "learning_rate": 1.786549424467936e-05, + "loss": 1.9484, + "step": 6665 + }, + { + "epoch": 0.8266716242176365, + "grad_norm": 0.14489028988827174, + "learning_rate": 1.7742283150767614e-05, + "loss": 1.9082, + "step": 6670 + }, + { + "epoch": 0.8272913180888641, + "grad_norm": 0.13006211826169928, + "learning_rate": 1.7619457023731355e-05, + "loss": 1.8477, + "step": 6675 + }, + { + "epoch": 0.8279110119600918, + "grad_norm": 0.13916462938836544, + "learning_rate": 1.7497016438397984e-05, + "loss": 1.8336, + "step": 6680 + }, + { + "epoch": 0.8285307058313194, + "grad_norm": 0.13832093116873143, + "learning_rate": 1.737496196779059e-05, + "loss": 1.798, + "step": 6685 + }, + { + "epoch": 0.8291503997025469, + "grad_norm": 0.14176420760152497, + "learning_rate": 1.7253294183125223e-05, + "loss": 1.8395, + "step": 6690 + }, + { + "epoch": 0.8297700935737745, + "grad_norm": 0.14071427615818557, + "learning_rate": 1.7132013653808222e-05, + "loss": 1.8344, + "step": 6695 + }, + { + "epoch": 0.8303897874450021, + "grad_norm": 0.13551316956515488, + "learning_rate": 1.70111209474336e-05, + "loss": 1.8719, + "step": 6700 + }, + { + "epoch": 0.8310094813162298, + "grad_norm": 0.1369378832967966, + "learning_rate": 1.6890616629780364e-05, + "loss": 1.8125, + "step": 6705 + }, + { + "epoch": 0.8316291751874574, + "grad_norm": 0.13670780086941425, + "learning_rate": 1.6770501264809778e-05, + "loss": 1.8508, + "step": 6710 + }, + { + "epoch": 0.832248869058685, + "grad_norm": 0.14168797345046086, + "learning_rate": 1.665077541466289e-05, + "loss": 1.8492, + "step": 6715 + }, + { + "epoch": 0.8328685629299126, + "grad_norm": 0.12899525112771104, + "learning_rate": 1.6531439639657776e-05, + "loss": 1.8504, + "step": 6720 + }, + { + "epoch": 0.8334882568011402, + "grad_norm": 0.13276523815484698, + "learning_rate": 1.641249449828699e-05, + "loss": 1.9055, + "step": 6725 + }, + { + "epoch": 0.8341079506723679, + "grad_norm": 0.13386303228577315, + "learning_rate": 1.6293940547214905e-05, + "loss": 1.7434, + "step": 6730 + }, + { + "epoch": 0.8347276445435955, + "grad_norm": 0.1383082816059096, + "learning_rate": 1.617577834127506e-05, + "loss": 1.7766, + "step": 6735 + }, + { + "epoch": 0.8353473384148231, + "grad_norm": 0.13758595605104323, + "learning_rate": 1.6058008433467698e-05, + "loss": 1.7832, + "step": 6740 + }, + { + "epoch": 0.8359670322860507, + "grad_norm": 0.13142188279747996, + "learning_rate": 1.594063137495707e-05, + "loss": 1.8305, + "step": 6745 + }, + { + "epoch": 0.8365867261572784, + "grad_norm": 0.13834331995085114, + "learning_rate": 1.582364771506891e-05, + "loss": 1.8633, + "step": 6750 + }, + { + "epoch": 0.8372064200285059, + "grad_norm": 0.1426359228453992, + "learning_rate": 1.570705800128781e-05, + "loss": 1.7992, + "step": 6755 + }, + { + "epoch": 0.8378261138997335, + "grad_norm": 0.14010058644661322, + "learning_rate": 1.5590862779254746e-05, + "loss": 1.8043, + "step": 6760 + }, + { + "epoch": 0.8384458077709611, + "grad_norm": 0.1363378846450556, + "learning_rate": 1.5475062592764346e-05, + "loss": 1.8078, + "step": 6765 + }, + { + "epoch": 0.8390655016421887, + "grad_norm": 0.13928139362824718, + "learning_rate": 1.5359657983762632e-05, + "loss": 1.8055, + "step": 6770 + }, + { + "epoch": 0.8396851955134164, + "grad_norm": 0.1372046018249784, + "learning_rate": 1.524464949234422e-05, + "loss": 1.8211, + "step": 6775 + }, + { + "epoch": 0.840304889384644, + "grad_norm": 0.13039972587829407, + "learning_rate": 1.5130037656749918e-05, + "loss": 1.7777, + "step": 6780 + }, + { + "epoch": 0.8409245832558716, + "grad_norm": 0.13925521199854124, + "learning_rate": 1.5015823013364183e-05, + "loss": 1.8609, + "step": 6785 + }, + { + "epoch": 0.8415442771270992, + "grad_norm": 0.1318479986936156, + "learning_rate": 1.4902006096712572e-05, + "loss": 1.7832, + "step": 6790 + }, + { + "epoch": 0.8421639709983268, + "grad_norm": 0.13360512591823362, + "learning_rate": 1.4788587439459323e-05, + "loss": 1.8039, + "step": 6795 + }, + { + "epoch": 0.8427836648695545, + "grad_norm": 0.13712021933762084, + "learning_rate": 1.4675567572404803e-05, + "loss": 1.8434, + "step": 6800 + }, + { + "epoch": 0.8434033587407821, + "grad_norm": 0.13608874848867214, + "learning_rate": 1.4562947024483031e-05, + "loss": 1.7625, + "step": 6805 + }, + { + "epoch": 0.8440230526120097, + "grad_norm": 0.13426634730012743, + "learning_rate": 1.4450726322759223e-05, + "loss": 1.8086, + "step": 6810 + }, + { + "epoch": 0.8446427464832372, + "grad_norm": 0.13382844194945753, + "learning_rate": 1.4338905992427287e-05, + "loss": 1.8695, + "step": 6815 + }, + { + "epoch": 0.8452624403544649, + "grad_norm": 0.14966906376149425, + "learning_rate": 1.4227486556807412e-05, + "loss": 1.8121, + "step": 6820 + }, + { + "epoch": 0.8458821342256925, + "grad_norm": 0.13735951665790422, + "learning_rate": 1.4116468537343585e-05, + "loss": 1.8242, + "step": 6825 + }, + { + "epoch": 0.8465018280969201, + "grad_norm": 0.14377264999175535, + "learning_rate": 1.4005852453601164e-05, + "loss": 1.8648, + "step": 6830 + }, + { + "epoch": 0.8471215219681477, + "grad_norm": 0.1350064432331249, + "learning_rate": 1.3895638823264446e-05, + "loss": 1.8465, + "step": 6835 + }, + { + "epoch": 0.8477412158393753, + "grad_norm": 0.13075932246700298, + "learning_rate": 1.3785828162134252e-05, + "loss": 1.8348, + "step": 6840 + }, + { + "epoch": 0.848360909710603, + "grad_norm": 0.13247777446205836, + "learning_rate": 1.367642098412546e-05, + "loss": 1.8629, + "step": 6845 + }, + { + "epoch": 0.8489806035818306, + "grad_norm": 0.13338219328252035, + "learning_rate": 1.3567417801264692e-05, + "loss": 1.8297, + "step": 6850 + }, + { + "epoch": 0.8496002974530582, + "grad_norm": 0.13787557857168983, + "learning_rate": 1.345881912368785e-05, + "loss": 1.8621, + "step": 6855 + }, + { + "epoch": 0.8502199913242858, + "grad_norm": 0.13952008714498, + "learning_rate": 1.3350625459637744e-05, + "loss": 1.8117, + "step": 6860 + }, + { + "epoch": 0.8508396851955135, + "grad_norm": 0.13937544594263107, + "learning_rate": 1.3242837315461732e-05, + "loss": 1.8418, + "step": 6865 + }, + { + "epoch": 0.8514593790667411, + "grad_norm": 0.13851152521957338, + "learning_rate": 1.3135455195609325e-05, + "loss": 1.8137, + "step": 6870 + }, + { + "epoch": 0.8520790729379687, + "grad_norm": 0.13245292293717914, + "learning_rate": 1.3028479602629839e-05, + "loss": 1.8445, + "step": 6875 + }, + { + "epoch": 0.8526987668091962, + "grad_norm": 0.13176137950037486, + "learning_rate": 1.292191103717002e-05, + "loss": 1.8457, + "step": 6880 + }, + { + "epoch": 0.8533184606804238, + "grad_norm": 0.14274560973973033, + "learning_rate": 1.281574999797176e-05, + "loss": 1.8793, + "step": 6885 + }, + { + "epoch": 0.8539381545516515, + "grad_norm": 0.13669793644265746, + "learning_rate": 1.2709996981869699e-05, + "loss": 1.7793, + "step": 6890 + }, + { + "epoch": 0.8545578484228791, + "grad_norm": 0.13401311583823233, + "learning_rate": 1.2604652483788948e-05, + "loss": 1.841, + "step": 6895 + }, + { + "epoch": 0.8551775422941067, + "grad_norm": 0.14562512309081663, + "learning_rate": 1.2499716996742694e-05, + "loss": 1.8477, + "step": 6900 + }, + { + "epoch": 0.8557972361653343, + "grad_norm": 0.13981374112922862, + "learning_rate": 1.2395191011829999e-05, + "loss": 1.8508, + "step": 6905 + }, + { + "epoch": 0.8564169300365619, + "grad_norm": 0.13795922585737252, + "learning_rate": 1.2291075018233445e-05, + "loss": 1.8637, + "step": 6910 + }, + { + "epoch": 0.8570366239077896, + "grad_norm": 0.13648363243463277, + "learning_rate": 1.218736950321685e-05, + "loss": 1.7707, + "step": 6915 + }, + { + "epoch": 0.8576563177790172, + "grad_norm": 0.13536889404424596, + "learning_rate": 1.2084074952122959e-05, + "loss": 1.8629, + "step": 6920 + }, + { + "epoch": 0.8582760116502448, + "grad_norm": 0.14023215465264866, + "learning_rate": 1.1981191848371287e-05, + "loss": 1.8355, + "step": 6925 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 0.13669990706649254, + "learning_rate": 1.1878720673455645e-05, + "loss": 1.8586, + "step": 6930 + }, + { + "epoch": 0.8595153993927, + "grad_norm": 0.13993547428958963, + "learning_rate": 1.1776661906942099e-05, + "loss": 1.8172, + "step": 6935 + }, + { + "epoch": 0.8601350932639277, + "grad_norm": 0.15053693553773634, + "learning_rate": 1.1675016026466633e-05, + "loss": 1.7953, + "step": 6940 + }, + { + "epoch": 0.8607547871351552, + "grad_norm": 0.1400512786653631, + "learning_rate": 1.1573783507732893e-05, + "loss": 1.8559, + "step": 6945 + }, + { + "epoch": 0.8613744810063828, + "grad_norm": 0.14309923516749187, + "learning_rate": 1.1472964824510035e-05, + "loss": 1.8348, + "step": 6950 + }, + { + "epoch": 0.8619941748776104, + "grad_norm": 0.13484282365125583, + "learning_rate": 1.1372560448630376e-05, + "loss": 1.809, + "step": 6955 + }, + { + "epoch": 0.862613868748838, + "grad_norm": 0.13986355786084606, + "learning_rate": 1.1272570849987351e-05, + "loss": 1.8152, + "step": 6960 + }, + { + "epoch": 0.8632335626200657, + "grad_norm": 0.1378893093004656, + "learning_rate": 1.1172996496533194e-05, + "loss": 1.7738, + "step": 6965 + }, + { + "epoch": 0.8638532564912933, + "grad_norm": 0.1345430991025174, + "learning_rate": 1.1073837854276826e-05, + "loss": 1.8082, + "step": 6970 + }, + { + "epoch": 0.8644729503625209, + "grad_norm": 0.1441895547115774, + "learning_rate": 1.0975095387281587e-05, + "loss": 1.8234, + "step": 6975 + }, + { + "epoch": 0.8650926442337485, + "grad_norm": 0.1374998969278752, + "learning_rate": 1.087676955766318e-05, + "loss": 1.841, + "step": 6980 + }, + { + "epoch": 0.8657123381049762, + "grad_norm": 0.13606561355593746, + "learning_rate": 1.0778860825587323e-05, + "loss": 1.8352, + "step": 6985 + }, + { + "epoch": 0.8663320319762038, + "grad_norm": 0.13749250934423712, + "learning_rate": 1.0681369649267836e-05, + "loss": 1.8496, + "step": 6990 + }, + { + "epoch": 0.8669517258474314, + "grad_norm": 0.1368331262299894, + "learning_rate": 1.0584296484964318e-05, + "loss": 1.8711, + "step": 6995 + }, + { + "epoch": 0.867571419718659, + "grad_norm": 0.13863183942887752, + "learning_rate": 1.0487641786980063e-05, + "loss": 1.8391, + "step": 7000 + }, + { + "epoch": 0.8681911135898865, + "grad_norm": 0.13434259874384205, + "learning_rate": 1.0391406007659964e-05, + "loss": 1.7699, + "step": 7005 + }, + { + "epoch": 0.8688108074611142, + "grad_norm": 0.14095478527168656, + "learning_rate": 1.0295589597388355e-05, + "loss": 1.8336, + "step": 7010 + }, + { + "epoch": 0.8694305013323418, + "grad_norm": 0.1348345066351467, + "learning_rate": 1.0200193004586922e-05, + "loss": 1.802, + "step": 7015 + }, + { + "epoch": 0.8700501952035694, + "grad_norm": 0.13470060368449044, + "learning_rate": 1.0105216675712592e-05, + "loss": 1.8281, + "step": 7020 + }, + { + "epoch": 0.870669889074797, + "grad_norm": 0.1498496809323783, + "learning_rate": 1.0010661055255488e-05, + "loss": 1.8391, + "step": 7025 + }, + { + "epoch": 0.8712895829460247, + "grad_norm": 0.146463161739714, + "learning_rate": 9.916526585736763e-06, + "loss": 1.8535, + "step": 7030 + }, + { + "epoch": 0.8719092768172523, + "grad_norm": 0.14092293708406953, + "learning_rate": 9.822813707706625e-06, + "loss": 1.8406, + "step": 7035 + }, + { + "epoch": 0.8725289706884799, + "grad_norm": 0.14019511041970145, + "learning_rate": 9.729522859742191e-06, + "loss": 1.8938, + "step": 7040 + }, + { + "epoch": 0.8731486645597075, + "grad_norm": 0.1411623147758512, + "learning_rate": 9.636654478445494e-06, + "loss": 1.8387, + "step": 7045 + }, + { + "epoch": 0.8737683584309351, + "grad_norm": 0.1401437249903466, + "learning_rate": 9.544208998441428e-06, + "loss": 1.8867, + "step": 7050 + }, + { + "epoch": 0.8743880523021628, + "grad_norm": 0.13345487580370216, + "learning_rate": 9.452186852375678e-06, + "loss": 1.8891, + "step": 7055 + }, + { + "epoch": 0.8750077461733904, + "grad_norm": 0.13538270401639796, + "learning_rate": 9.360588470912756e-06, + "loss": 1.8609, + "step": 7060 + }, + { + "epoch": 0.875627440044618, + "grad_norm": 0.13939914091565453, + "learning_rate": 9.269414282733924e-06, + "loss": 1.8781, + "step": 7065 + }, + { + "epoch": 0.8762471339158455, + "grad_norm": 0.138068735546144, + "learning_rate": 9.178664714535235e-06, + "loss": 1.8359, + "step": 7070 + }, + { + "epoch": 0.8768668277870731, + "grad_norm": 0.13765544825239814, + "learning_rate": 9.088340191025501e-06, + "loss": 1.8586, + "step": 7075 + }, + { + "epoch": 0.8774865216583008, + "grad_norm": 0.13486521127469114, + "learning_rate": 8.998441134924318e-06, + "loss": 1.8758, + "step": 7080 + }, + { + "epoch": 0.8781062155295284, + "grad_norm": 0.13651144599742507, + "learning_rate": 8.908967966960124e-06, + "loss": 1.8922, + "step": 7085 + }, + { + "epoch": 0.878725909400756, + "grad_norm": 0.1447116043420326, + "learning_rate": 8.819921105868167e-06, + "loss": 1.8168, + "step": 7090 + }, + { + "epoch": 0.8793456032719836, + "grad_norm": 0.12956578472131167, + "learning_rate": 8.731300968388556e-06, + "loss": 1.8375, + "step": 7095 + }, + { + "epoch": 0.8799652971432113, + "grad_norm": 0.14217108418937452, + "learning_rate": 8.643107969264375e-06, + "loss": 1.8793, + "step": 7100 + }, + { + "epoch": 0.8805849910144389, + "grad_norm": 0.1475313460315241, + "learning_rate": 8.555342521239662e-06, + "loss": 1.8191, + "step": 7105 + }, + { + "epoch": 0.8812046848856665, + "grad_norm": 0.14056958492979643, + "learning_rate": 8.468005035057536e-06, + "loss": 1.8664, + "step": 7110 + }, + { + "epoch": 0.8818243787568941, + "grad_norm": 0.14479567172868937, + "learning_rate": 8.381095919458226e-06, + "loss": 1.8539, + "step": 7115 + }, + { + "epoch": 0.8824440726281217, + "grad_norm": 0.14164836437320075, + "learning_rate": 8.294615581177223e-06, + "loss": 1.8445, + "step": 7120 + }, + { + "epoch": 0.8830637664993494, + "grad_norm": 0.14339217053169717, + "learning_rate": 8.208564424943288e-06, + "loss": 1.8113, + "step": 7125 + }, + { + "epoch": 0.883683460370577, + "grad_norm": 0.13508727975928309, + "learning_rate": 8.122942853476633e-06, + "loss": 1.8961, + "step": 7130 + }, + { + "epoch": 0.8843031542418045, + "grad_norm": 0.1431420751411207, + "learning_rate": 8.037751267487003e-06, + "loss": 1.852, + "step": 7135 + }, + { + "epoch": 0.8849228481130321, + "grad_norm": 0.13776837332926897, + "learning_rate": 7.952990065671817e-06, + "loss": 1.8367, + "step": 7140 + }, + { + "epoch": 0.8855425419842597, + "grad_norm": 0.1357035406570165, + "learning_rate": 7.868659644714294e-06, + "loss": 1.8164, + "step": 7145 + }, + { + "epoch": 0.8861622358554874, + "grad_norm": 0.13774817584994603, + "learning_rate": 7.784760399281554e-06, + "loss": 1.8715, + "step": 7150 + }, + { + "epoch": 0.886781929726715, + "grad_norm": 0.13709267085065505, + "learning_rate": 7.701292722022846e-06, + "loss": 1.8035, + "step": 7155 + }, + { + "epoch": 0.8874016235979426, + "grad_norm": 0.1381104044552148, + "learning_rate": 7.618257003567675e-06, + "loss": 1.8172, + "step": 7160 + }, + { + "epoch": 0.8880213174691702, + "grad_norm": 0.13578670341989943, + "learning_rate": 7.5356536325239755e-06, + "loss": 1.7742, + "step": 7165 + }, + { + "epoch": 0.8886410113403979, + "grad_norm": 0.1375594605300216, + "learning_rate": 7.453482995476291e-06, + "loss": 1.8172, + "step": 7170 + }, + { + "epoch": 0.8892607052116255, + "grad_norm": 0.13878029380784612, + "learning_rate": 7.371745476983982e-06, + "loss": 1.7582, + "step": 7175 + }, + { + "epoch": 0.8898803990828531, + "grad_norm": 0.13908494179294276, + "learning_rate": 7.2904414595793556e-06, + "loss": 1.8398, + "step": 7180 + }, + { + "epoch": 0.8905000929540807, + "grad_norm": 0.13339193424993012, + "learning_rate": 7.209571323765973e-06, + "loss": 1.7902, + "step": 7185 + }, + { + "epoch": 0.8911197868253083, + "grad_norm": 0.1331999537324601, + "learning_rate": 7.129135448016821e-06, + "loss": 1.8301, + "step": 7190 + }, + { + "epoch": 0.8917394806965359, + "grad_norm": 0.13428548155659575, + "learning_rate": 7.049134208772545e-06, + "loss": 1.852, + "step": 7195 + }, + { + "epoch": 0.8923591745677635, + "grad_norm": 0.13624592336240599, + "learning_rate": 6.969567980439706e-06, + "loss": 1.8602, + "step": 7200 + }, + { + "epoch": 0.8929788684389911, + "grad_norm": 0.13683342194643683, + "learning_rate": 6.890437135388939e-06, + "loss": 1.8613, + "step": 7205 + }, + { + "epoch": 0.8935985623102187, + "grad_norm": 0.13394086830888138, + "learning_rate": 6.8117420439533615e-06, + "loss": 1.8535, + "step": 7210 + }, + { + "epoch": 0.8942182561814463, + "grad_norm": 0.1308504194307238, + "learning_rate": 6.733483074426716e-06, + "loss": 1.8527, + "step": 7215 + }, + { + "epoch": 0.894837950052674, + "grad_norm": 0.142886229081844, + "learning_rate": 6.655660593061719e-06, + "loss": 1.8387, + "step": 7220 + }, + { + "epoch": 0.8954576439239016, + "grad_norm": 0.1394768164104558, + "learning_rate": 6.578274964068298e-06, + "loss": 1.891, + "step": 7225 + }, + { + "epoch": 0.8960773377951292, + "grad_norm": 0.13125339904213718, + "learning_rate": 6.50132654961193e-06, + "loss": 1.8387, + "step": 7230 + }, + { + "epoch": 0.8966970316663568, + "grad_norm": 0.14041832103312948, + "learning_rate": 6.424815709811871e-06, + "loss": 1.8852, + "step": 7235 + }, + { + "epoch": 0.8973167255375845, + "grad_norm": 0.13965593340949, + "learning_rate": 6.3487428027395715e-06, + "loss": 1.7867, + "step": 7240 + }, + { + "epoch": 0.8979364194088121, + "grad_norm": 0.1326849396556491, + "learning_rate": 6.273108184416943e-06, + "loss": 1.8414, + "step": 7245 + }, + { + "epoch": 0.8985561132800397, + "grad_norm": 0.14154992344541478, + "learning_rate": 6.197912208814694e-06, + "loss": 1.8711, + "step": 7250 + }, + { + "epoch": 0.8991758071512673, + "grad_norm": 0.14900298957887134, + "learning_rate": 6.123155227850708e-06, + "loss": 1.8496, + "step": 7255 + }, + { + "epoch": 0.8997955010224948, + "grad_norm": 0.13378506525283548, + "learning_rate": 6.048837591388301e-06, + "loss": 1.8355, + "step": 7260 + }, + { + "epoch": 0.9004151948937225, + "grad_norm": 0.1364642041434309, + "learning_rate": 5.974959647234746e-06, + "loss": 1.827, + "step": 7265 + }, + { + "epoch": 0.9010348887649501, + "grad_norm": 0.13884954804156385, + "learning_rate": 5.901521741139482e-06, + "loss": 1.8055, + "step": 7270 + }, + { + "epoch": 0.9016545826361777, + "grad_norm": 0.13583763712007216, + "learning_rate": 5.828524216792586e-06, + "loss": 1.7793, + "step": 7275 + }, + { + "epoch": 0.9022742765074053, + "grad_norm": 0.1373270304349077, + "learning_rate": 5.75596741582316e-06, + "loss": 1.8512, + "step": 7280 + }, + { + "epoch": 0.902893970378633, + "grad_norm": 0.1352208909295617, + "learning_rate": 5.6838516777977135e-06, + "loss": 1.8242, + "step": 7285 + }, + { + "epoch": 0.9035136642498606, + "grad_norm": 0.14464708429301287, + "learning_rate": 5.6121773402185385e-06, + "loss": 1.7852, + "step": 7290 + }, + { + "epoch": 0.9041333581210882, + "grad_norm": 0.13241462166706447, + "learning_rate": 5.540944738522203e-06, + "loss": 1.8809, + "step": 7295 + }, + { + "epoch": 0.9047530519923158, + "grad_norm": 0.13280584084305314, + "learning_rate": 5.470154206077949e-06, + "loss": 1.8277, + "step": 7300 + }, + { + "epoch": 0.9053727458635434, + "grad_norm": 0.14278733047702366, + "learning_rate": 5.3998060741861314e-06, + "loss": 1.9148, + "step": 7305 + }, + { + "epoch": 0.9059924397347711, + "grad_norm": 0.14264404944438486, + "learning_rate": 5.329900672076637e-06, + "loss": 1.8313, + "step": 7310 + }, + { + "epoch": 0.9066121336059987, + "grad_norm": 0.13404018286932412, + "learning_rate": 5.260438326907413e-06, + "loss": 1.8496, + "step": 7315 + }, + { + "epoch": 0.9072318274772263, + "grad_norm": 0.139299854893475, + "learning_rate": 5.191419363762873e-06, + "loss": 1.8215, + "step": 7320 + }, + { + "epoch": 0.9078515213484538, + "grad_norm": 0.12938710234881426, + "learning_rate": 5.122844105652402e-06, + "loss": 1.9102, + "step": 7325 + }, + { + "epoch": 0.9084712152196814, + "grad_norm": 0.1431339768847217, + "learning_rate": 5.054712873508827e-06, + "loss": 1.8809, + "step": 7330 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.14185976129658012, + "learning_rate": 4.987025986186966e-06, + "loss": 1.8551, + "step": 7335 + }, + { + "epoch": 0.9097106029621367, + "grad_norm": 0.13984745090181067, + "learning_rate": 4.919783760462082e-06, + "loss": 1.7578, + "step": 7340 + }, + { + "epoch": 0.9103302968333643, + "grad_norm": 0.1316498232846164, + "learning_rate": 4.85298651102839e-06, + "loss": 1.8746, + "step": 7345 + }, + { + "epoch": 0.9109499907045919, + "grad_norm": 0.13622065727170202, + "learning_rate": 4.786634550497637e-06, + "loss": 1.8641, + "step": 7350 + }, + { + "epoch": 0.9115696845758195, + "grad_norm": 0.13872927734480942, + "learning_rate": 4.720728189397628e-06, + "loss": 1.7785, + "step": 7355 + }, + { + "epoch": 0.9121893784470472, + "grad_norm": 0.14013017608713682, + "learning_rate": 4.655267736170732e-06, + "loss": 1.8273, + "step": 7360 + }, + { + "epoch": 0.9128090723182748, + "grad_norm": 0.140766561179319, + "learning_rate": 4.5902534971724806e-06, + "loss": 1.8023, + "step": 7365 + }, + { + "epoch": 0.9134287661895024, + "grad_norm": 0.13550910020269402, + "learning_rate": 4.525685776670108e-06, + "loss": 1.7758, + "step": 7370 + }, + { + "epoch": 0.91404846006073, + "grad_norm": 0.1456560002195272, + "learning_rate": 4.46156487684114e-06, + "loss": 1.8203, + "step": 7375 + }, + { + "epoch": 0.9146681539319577, + "grad_norm": 0.13985502488308021, + "learning_rate": 4.397891097771989e-06, + "loss": 1.8293, + "step": 7380 + }, + { + "epoch": 0.9152878478031852, + "grad_norm": 0.14148217516325698, + "learning_rate": 4.334664737456539e-06, + "loss": 1.8039, + "step": 7385 + }, + { + "epoch": 0.9159075416744128, + "grad_norm": 0.13723569523716944, + "learning_rate": 4.271886091794719e-06, + "loss": 1.8836, + "step": 7390 + }, + { + "epoch": 0.9165272355456404, + "grad_norm": 0.13503953990532955, + "learning_rate": 4.209555454591197e-06, + "loss": 1.7797, + "step": 7395 + }, + { + "epoch": 0.917146929416868, + "grad_norm": 0.13814876703450732, + "learning_rate": 4.147673117553896e-06, + "loss": 1.8531, + "step": 7400 + }, + { + "epoch": 0.9177666232880957, + "grad_norm": 0.13586514530672164, + "learning_rate": 4.086239370292755e-06, + "loss": 1.848, + "step": 7405 + }, + { + "epoch": 0.9183863171593233, + "grad_norm": 0.14182653498050868, + "learning_rate": 4.025254500318265e-06, + "loss": 1.8422, + "step": 7410 + }, + { + "epoch": 0.9190060110305509, + "grad_norm": 0.1388334635973296, + "learning_rate": 3.964718793040178e-06, + "loss": 1.8148, + "step": 7415 + }, + { + "epoch": 0.9196257049017785, + "grad_norm": 0.1423659419451689, + "learning_rate": 3.904632531766195e-06, + "loss": 1.7766, + "step": 7420 + }, + { + "epoch": 0.9202453987730062, + "grad_norm": 0.14335609098468574, + "learning_rate": 3.84499599770054e-06, + "loss": 1.8473, + "step": 7425 + }, + { + "epoch": 0.9208650926442338, + "grad_norm": 0.13248049762452474, + "learning_rate": 3.785809469942758e-06, + "loss": 1.8238, + "step": 7430 + }, + { + "epoch": 0.9214847865154614, + "grad_norm": 0.13729453036726105, + "learning_rate": 3.727073225486344e-06, + "loss": 1.8168, + "step": 7435 + }, + { + "epoch": 0.922104480386689, + "grad_norm": 0.1353112000663283, + "learning_rate": 3.6687875392174665e-06, + "loss": 1.8223, + "step": 7440 + }, + { + "epoch": 0.9227241742579166, + "grad_norm": 0.1447311577090011, + "learning_rate": 3.61095268391366e-06, + "loss": 1.8316, + "step": 7445 + }, + { + "epoch": 0.9233438681291442, + "grad_norm": 0.13228279149862204, + "learning_rate": 3.5535689302426236e-06, + "loss": 1.8383, + "step": 7450 + }, + { + "epoch": 0.9239635620003718, + "grad_norm": 0.13613961418557433, + "learning_rate": 3.496636546760812e-06, + "loss": 1.777, + "step": 7455 + }, + { + "epoch": 0.9245832558715994, + "grad_norm": 0.14235960272198284, + "learning_rate": 3.4401557999123146e-06, + "loss": 1.8613, + "step": 7460 + }, + { + "epoch": 0.925202949742827, + "grad_norm": 0.12842862529934493, + "learning_rate": 3.3841269540275553e-06, + "loss": 1.8629, + "step": 7465 + }, + { + "epoch": 0.9258226436140546, + "grad_norm": 0.13216080090217125, + "learning_rate": 3.3285502713220617e-06, + "loss": 1.9145, + "step": 7470 + }, + { + "epoch": 0.9264423374852823, + "grad_norm": 0.13750703177725587, + "learning_rate": 3.2734260118952307e-06, + "loss": 1.8406, + "step": 7475 + }, + { + "epoch": 0.9270620313565099, + "grad_norm": 0.1362535989445485, + "learning_rate": 3.218754433729065e-06, + "loss": 1.8477, + "step": 7480 + }, + { + "epoch": 0.9276817252277375, + "grad_norm": 0.13293349873817525, + "learning_rate": 3.1645357926870955e-06, + "loss": 1.8473, + "step": 7485 + }, + { + "epoch": 0.9283014190989651, + "grad_norm": 0.12370366878412518, + "learning_rate": 3.110770342513036e-06, + "loss": 1.8441, + "step": 7490 + }, + { + "epoch": 0.9289211129701928, + "grad_norm": 0.14093551922685052, + "learning_rate": 3.057458334829699e-06, + "loss": 1.8262, + "step": 7495 + }, + { + "epoch": 0.9295408068414204, + "grad_norm": 0.14089913168838317, + "learning_rate": 3.0046000191377934e-06, + "loss": 1.8363, + "step": 7500 + }, + { + "epoch": 0.930160500712648, + "grad_norm": 0.13655806820666927, + "learning_rate": 2.9521956428146923e-06, + "loss": 1.7809, + "step": 7505 + }, + { + "epoch": 0.9307801945838756, + "grad_norm": 0.12930406578508882, + "learning_rate": 2.9002454511133923e-06, + "loss": 1.8426, + "step": 7510 + }, + { + "epoch": 0.9313998884551031, + "grad_norm": 0.13223003361326363, + "learning_rate": 2.8487496871612453e-06, + "loss": 1.8457, + "step": 7515 + }, + { + "epoch": 0.9320195823263308, + "grad_norm": 0.13191348370431163, + "learning_rate": 2.7977085919589254e-06, + "loss": 1.8461, + "step": 7520 + }, + { + "epoch": 0.9326392761975584, + "grad_norm": 0.13149475248273712, + "learning_rate": 2.7471224043792098e-06, + "loss": 1.8406, + "step": 7525 + }, + { + "epoch": 0.933258970068786, + "grad_norm": 0.13701793086079692, + "learning_rate": 2.6969913611659457e-06, + "loss": 1.907, + "step": 7530 + }, + { + "epoch": 0.9338786639400136, + "grad_norm": 0.132647067429414, + "learning_rate": 2.6473156969328503e-06, + "loss": 1.7945, + "step": 7535 + }, + { + "epoch": 0.9344983578112412, + "grad_norm": 0.13813325094756712, + "learning_rate": 2.5980956441625236e-06, + "loss": 1.802, + "step": 7540 + }, + { + "epoch": 0.9351180516824689, + "grad_norm": 0.13672748834661805, + "learning_rate": 2.5493314332052377e-06, + "loss": 1.8391, + "step": 7545 + }, + { + "epoch": 0.9357377455536965, + "grad_norm": 0.14163135141821864, + "learning_rate": 2.501023292277971e-06, + "loss": 1.85, + "step": 7550 + }, + { + "epoch": 0.9363574394249241, + "grad_norm": 0.13621074708206138, + "learning_rate": 2.453171447463265e-06, + "loss": 1.8648, + "step": 7555 + }, + { + "epoch": 0.9369771332961517, + "grad_norm": 0.143929080240879, + "learning_rate": 2.4057761227081923e-06, + "loss": 1.793, + "step": 7560 + }, + { + "epoch": 0.9375968271673794, + "grad_norm": 0.13558520445018998, + "learning_rate": 2.358837539823311e-06, + "loss": 1.8395, + "step": 7565 + }, + { + "epoch": 0.938216521038607, + "grad_norm": 0.1321488531469034, + "learning_rate": 2.3123559184816344e-06, + "loss": 1.7812, + "step": 7570 + }, + { + "epoch": 0.9388362149098345, + "grad_norm": 0.13733894017904172, + "learning_rate": 2.2663314762175647e-06, + "loss": 1.8727, + "step": 7575 + }, + { + "epoch": 0.9394559087810621, + "grad_norm": 0.13649001800785526, + "learning_rate": 2.2207644284259256e-06, + "loss": 1.8367, + "step": 7580 + }, + { + "epoch": 0.9400756026522897, + "grad_norm": 0.136966039329494, + "learning_rate": 2.1756549883609313e-06, + "loss": 1.8402, + "step": 7585 + }, + { + "epoch": 0.9406952965235174, + "grad_norm": 0.1323250881965697, + "learning_rate": 2.131003367135154e-06, + "loss": 1.8465, + "step": 7590 + }, + { + "epoch": 0.941314990394745, + "grad_norm": 0.1293883428081351, + "learning_rate": 2.086809773718601e-06, + "loss": 1.8121, + "step": 7595 + }, + { + "epoch": 0.9419346842659726, + "grad_norm": 0.13859211529936802, + "learning_rate": 2.0430744149377177e-06, + "loss": 1.8469, + "step": 7600 + }, + { + "epoch": 0.9425543781372002, + "grad_norm": 0.14096409695978848, + "learning_rate": 1.999797495474365e-06, + "loss": 1.8234, + "step": 7605 + }, + { + "epoch": 0.9431740720084278, + "grad_norm": 0.13640618682039807, + "learning_rate": 1.9569792178649405e-06, + "loss": 1.8687, + "step": 7610 + }, + { + "epoch": 0.9437937658796555, + "grad_norm": 0.12992959333703022, + "learning_rate": 1.914619782499383e-06, + "loss": 1.8938, + "step": 7615 + }, + { + "epoch": 0.9444134597508831, + "grad_norm": 0.14310321729194458, + "learning_rate": 1.8727193876202143e-06, + "loss": 1.8379, + "step": 7620 + }, + { + "epoch": 0.9450331536221107, + "grad_norm": 0.13433525346251443, + "learning_rate": 1.8312782293216979e-06, + "loss": 1.8344, + "step": 7625 + }, + { + "epoch": 0.9456528474933383, + "grad_norm": 0.131679794414834, + "learning_rate": 1.7902965015488381e-06, + "loss": 1.8887, + "step": 7630 + }, + { + "epoch": 0.946272541364566, + "grad_norm": 0.14674405226127388, + "learning_rate": 1.749774396096482e-06, + "loss": 1.8754, + "step": 7635 + }, + { + "epoch": 0.9468922352357935, + "grad_norm": 0.14156250279993213, + "learning_rate": 1.709712102608463e-06, + "loss": 1.8133, + "step": 7640 + }, + { + "epoch": 0.9475119291070211, + "grad_norm": 0.1383482897158417, + "learning_rate": 1.6701098085767031e-06, + "loss": 1.8031, + "step": 7645 + }, + { + "epoch": 0.9481316229782487, + "grad_norm": 0.1314285827216017, + "learning_rate": 1.630967699340269e-06, + "loss": 1.8559, + "step": 7650 + }, + { + "epoch": 0.9487513168494763, + "grad_norm": 0.1311075376517613, + "learning_rate": 1.5922859580846271e-06, + "loss": 1.8535, + "step": 7655 + }, + { + "epoch": 0.949371010720704, + "grad_norm": 0.14104548842656045, + "learning_rate": 1.5540647658406682e-06, + "loss": 1.8109, + "step": 7660 + }, + { + "epoch": 0.9499907045919316, + "grad_norm": 0.147336471013033, + "learning_rate": 1.5163043014839284e-06, + "loss": 1.7785, + "step": 7665 + }, + { + "epoch": 0.9506103984631592, + "grad_norm": 0.13166400385713117, + "learning_rate": 1.479004741733736e-06, + "loss": 1.8582, + "step": 7670 + }, + { + "epoch": 0.9512300923343868, + "grad_norm": 0.13572116993162228, + "learning_rate": 1.4421662611523667e-06, + "loss": 1.7457, + "step": 7675 + }, + { + "epoch": 0.9518497862056144, + "grad_norm": 0.14180501417698368, + "learning_rate": 1.4057890321442558e-06, + "loss": 1.8422, + "step": 7680 + }, + { + "epoch": 0.9524694800768421, + "grad_norm": 0.1401530258048634, + "learning_rate": 1.3698732249551648e-06, + "loss": 1.7668, + "step": 7685 + }, + { + "epoch": 0.9530891739480697, + "grad_norm": 0.13397195109554486, + "learning_rate": 1.3344190076714059e-06, + "loss": 1.9152, + "step": 7690 + }, + { + "epoch": 0.9537088678192973, + "grad_norm": 0.1338896772880402, + "learning_rate": 1.2994265462190513e-06, + "loss": 1.7973, + "step": 7695 + }, + { + "epoch": 0.9543285616905249, + "grad_norm": 0.13683971013494062, + "learning_rate": 1.2648960043631474e-06, + "loss": 1.8883, + "step": 7700 + }, + { + "epoch": 0.9549482555617524, + "grad_norm": 0.1340322150174887, + "learning_rate": 1.230827543706925e-06, + "loss": 1.7891, + "step": 7705 + }, + { + "epoch": 0.9555679494329801, + "grad_norm": 0.13540828994071039, + "learning_rate": 1.1972213236911112e-06, + "loss": 1.8691, + "step": 7710 + }, + { + "epoch": 0.9561876433042077, + "grad_norm": 0.13431224805457315, + "learning_rate": 1.1640775015931304e-06, + "loss": 1.8598, + "step": 7715 + }, + { + "epoch": 0.9568073371754353, + "grad_norm": 0.14041245444776923, + "learning_rate": 1.1313962325263717e-06, + "loss": 1.8453, + "step": 7720 + }, + { + "epoch": 0.9574270310466629, + "grad_norm": 0.13706149413387225, + "learning_rate": 1.0991776694394883e-06, + "loss": 1.8711, + "step": 7725 + }, + { + "epoch": 0.9580467249178906, + "grad_norm": 0.13378885841123092, + "learning_rate": 1.0674219631156334e-06, + "loss": 1.8477, + "step": 7730 + }, + { + "epoch": 0.9586664187891182, + "grad_norm": 0.13369307726742216, + "learning_rate": 1.0361292621718145e-06, + "loss": 1.9383, + "step": 7735 + }, + { + "epoch": 0.9592861126603458, + "grad_norm": 0.13470678685337034, + "learning_rate": 1.005299713058161e-06, + "loss": 1.8449, + "step": 7740 + }, + { + "epoch": 0.9599058065315734, + "grad_norm": 0.13827959853680175, + "learning_rate": 9.74933460057248e-07, + "loss": 1.8227, + "step": 7745 + }, + { + "epoch": 0.960525500402801, + "grad_norm": 0.13861638685269106, + "learning_rate": 9.450306452834179e-07, + "loss": 1.8598, + "step": 7750 + }, + { + "epoch": 0.9611451942740287, + "grad_norm": 0.13966148760312724, + "learning_rate": 9.15591408682126e-07, + "loss": 1.807, + "step": 7755 + }, + { + "epoch": 0.9617648881452563, + "grad_norm": 0.14081009116641513, + "learning_rate": 8.866158880292741e-07, + "loss": 1.782, + "step": 7760 + }, + { + "epoch": 0.9623845820164838, + "grad_norm": 0.14057620583361458, + "learning_rate": 8.581042189305555e-07, + "loss": 1.827, + "step": 7765 + }, + { + "epoch": 0.9630042758877114, + "grad_norm": 0.13937796915185546, + "learning_rate": 8.300565348208556e-07, + "loss": 1.8313, + "step": 7770 + }, + { + "epoch": 0.963623969758939, + "grad_norm": 0.13273514279259374, + "learning_rate": 8.024729669635967e-07, + "loss": 1.868, + "step": 7775 + }, + { + "epoch": 0.9642436636301667, + "grad_norm": 0.13598653532834637, + "learning_rate": 7.753536444501164e-07, + "loss": 1.7715, + "step": 7780 + }, + { + "epoch": 0.9648633575013943, + "grad_norm": 0.13907272985299546, + "learning_rate": 7.486986941991125e-07, + "loss": 1.7699, + "step": 7785 + }, + { + "epoch": 0.9654830513726219, + "grad_norm": 0.13321673642848147, + "learning_rate": 7.225082409559881e-07, + "loss": 1.8223, + "step": 7790 + }, + { + "epoch": 0.9661027452438495, + "grad_norm": 0.12879372395742059, + "learning_rate": 6.967824072923068e-07, + "loss": 1.768, + "step": 7795 + }, + { + "epoch": 0.9667224391150772, + "grad_norm": 0.13586715394976565, + "learning_rate": 6.715213136052056e-07, + "loss": 1.823, + "step": 7800 + }, + { + "epoch": 0.9673421329863048, + "grad_norm": 0.14497724016910185, + "learning_rate": 6.467250781168499e-07, + "loss": 1.8527, + "step": 7805 + }, + { + "epoch": 0.9679618268575324, + "grad_norm": 0.13612908889114356, + "learning_rate": 6.223938168738341e-07, + "loss": 1.8047, + "step": 7810 + }, + { + "epoch": 0.96858152072876, + "grad_norm": 0.1358005796129076, + "learning_rate": 5.985276437467046e-07, + "loss": 1.8422, + "step": 7815 + }, + { + "epoch": 0.9692012145999876, + "grad_norm": 0.14280390232394205, + "learning_rate": 5.751266704293601e-07, + "loss": 1.8168, + "step": 7820 + }, + { + "epoch": 0.9698209084712153, + "grad_norm": 0.13237756581743151, + "learning_rate": 5.521910064385627e-07, + "loss": 1.8402, + "step": 7825 + }, + { + "epoch": 0.9704406023424428, + "grad_norm": 0.13587015108604314, + "learning_rate": 5.297207591134612e-07, + "loss": 1.7957, + "step": 7830 + }, + { + "epoch": 0.9710602962136704, + "grad_norm": 0.13443918341905792, + "learning_rate": 5.077160336149911e-07, + "loss": 1.8457, + "step": 7835 + }, + { + "epoch": 0.971679990084898, + "grad_norm": 0.14765379085939143, + "learning_rate": 4.861769329254862e-07, + "loss": 1.8121, + "step": 7840 + }, + { + "epoch": 0.9722996839561256, + "grad_norm": 0.1394457904355103, + "learning_rate": 4.651035578481344e-07, + "loss": 1.8746, + "step": 7845 + }, + { + "epoch": 0.9729193778273533, + "grad_norm": 0.13752679317753222, + "learning_rate": 4.4449600700652296e-07, + "loss": 1.8461, + "step": 7850 + }, + { + "epoch": 0.9735390716985809, + "grad_norm": 0.14296404692218137, + "learning_rate": 4.243543768441827e-07, + "loss": 1.85, + "step": 7855 + }, + { + "epoch": 0.9741587655698085, + "grad_norm": 0.13087948478602446, + "learning_rate": 4.046787616241221e-07, + "loss": 1.8859, + "step": 7860 + }, + { + "epoch": 0.9747784594410361, + "grad_norm": 0.1344356130652384, + "learning_rate": 3.8546925342842764e-07, + "loss": 1.7363, + "step": 7865 + }, + { + "epoch": 0.9753981533122638, + "grad_norm": 0.1370666125721412, + "learning_rate": 3.6672594215774183e-07, + "loss": 1.8355, + "step": 7870 + }, + { + "epoch": 0.9760178471834914, + "grad_norm": 0.13939543925141024, + "learning_rate": 3.484489155309523e-07, + "loss": 1.8383, + "step": 7875 + }, + { + "epoch": 0.976637541054719, + "grad_norm": 0.13817647293293314, + "learning_rate": 3.3063825908471456e-07, + "loss": 1.8535, + "step": 7880 + }, + { + "epoch": 0.9772572349259466, + "grad_norm": 0.13705290483161534, + "learning_rate": 3.132940561730524e-07, + "loss": 1.7863, + "step": 7885 + }, + { + "epoch": 0.9778769287971742, + "grad_norm": 0.12778167892179987, + "learning_rate": 2.9641638796701344e-07, + "loss": 1.8418, + "step": 7890 + }, + { + "epoch": 0.9784966226684018, + "grad_norm": 0.13146765481563064, + "learning_rate": 2.800053334542363e-07, + "loss": 1.8629, + "step": 7895 + }, + { + "epoch": 0.9791163165396294, + "grad_norm": 0.13236765959057267, + "learning_rate": 2.6406096943859537e-07, + "loss": 1.8734, + "step": 7900 + }, + { + "epoch": 0.979736010410857, + "grad_norm": 0.1353069444117587, + "learning_rate": 2.485833705398677e-07, + "loss": 1.8191, + "step": 7905 + }, + { + "epoch": 0.9803557042820846, + "grad_norm": 0.1332817560129625, + "learning_rate": 2.3357260919336654e-07, + "loss": 1.8543, + "step": 7910 + }, + { + "epoch": 0.9809753981533123, + "grad_norm": 0.1427542111025049, + "learning_rate": 2.1902875564958624e-07, + "loss": 1.8406, + "step": 7915 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 0.13737706138886846, + "learning_rate": 2.0495187797390236e-07, + "loss": 1.8676, + "step": 7920 + }, + { + "epoch": 0.9822147858957675, + "grad_norm": 0.14196006592167704, + "learning_rate": 1.913420420462164e-07, + "loss": 1.8176, + "step": 7925 + }, + { + "epoch": 0.9828344797669951, + "grad_norm": 0.1346411791497383, + "learning_rate": 1.7819931156071168e-07, + "loss": 1.8715, + "step": 7930 + }, + { + "epoch": 0.9834541736382227, + "grad_norm": 0.13353971756702795, + "learning_rate": 1.6552374802546454e-07, + "loss": 1.8387, + "step": 7935 + }, + { + "epoch": 0.9840738675094504, + "grad_norm": 0.13534999592660807, + "learning_rate": 1.5331541076225585e-07, + "loss": 1.8742, + "step": 7940 + }, + { + "epoch": 0.984693561380678, + "grad_norm": 0.1375002640543611, + "learning_rate": 1.4157435690619337e-07, + "loss": 1.7785, + "step": 7945 + }, + { + "epoch": 0.9853132552519056, + "grad_norm": 0.13478477532193128, + "learning_rate": 1.30300641405523e-07, + "loss": 1.8578, + "step": 7950 + }, + { + "epoch": 0.9859329491231331, + "grad_norm": 0.13734004121483884, + "learning_rate": 1.194943170213403e-07, + "loss": 1.8367, + "step": 7955 + }, + { + "epoch": 0.9865526429943607, + "grad_norm": 0.13266755629771385, + "learning_rate": 1.0915543432733488e-07, + "loss": 1.8426, + "step": 7960 + }, + { + "epoch": 0.9871723368655884, + "grad_norm": 0.13522525243545225, + "learning_rate": 9.928404170959082e-08, + "loss": 1.8188, + "step": 7965 + }, + { + "epoch": 0.987792030736816, + "grad_norm": 0.13802548293591557, + "learning_rate": 8.988018536630893e-08, + "loss": 1.8395, + "step": 7970 + }, + { + "epoch": 0.9884117246080436, + "grad_norm": 0.1506854239518021, + "learning_rate": 8.094390930762919e-08, + "loss": 1.7988, + "step": 7975 + }, + { + "epoch": 0.9890314184792712, + "grad_norm": 0.13219180433160424, + "learning_rate": 7.247525535538647e-08, + "loss": 1.7918, + "step": 7980 + }, + { + "epoch": 0.9896511123504989, + "grad_norm": 0.14248580603180083, + "learning_rate": 6.447426314297734e-08, + "loss": 1.832, + "step": 7985 + }, + { + "epoch": 0.9902708062217265, + "grad_norm": 0.1332276335475975, + "learning_rate": 5.6940970115115785e-08, + "loss": 1.8535, + "step": 7990 + }, + { + "epoch": 0.9908905000929541, + "grad_norm": 0.13540820808598406, + "learning_rate": 4.9875411527677826e-08, + "loss": 1.7641, + "step": 7995 + }, + { + "epoch": 0.9915101939641817, + "grad_norm": 0.13658683750615433, + "learning_rate": 4.327762044755712e-08, + "loss": 1.8637, + "step": 8000 + }, + { + "epoch": 0.9921298878354093, + "grad_norm": 0.13582846490136452, + "learning_rate": 3.714762775245406e-08, + "loss": 1.7602, + "step": 8005 + }, + { + "epoch": 0.992749581706637, + "grad_norm": 0.13741100922644967, + "learning_rate": 3.148546213080916e-08, + "loss": 1.8711, + "step": 8010 + }, + { + "epoch": 0.9933692755778646, + "grad_norm": 0.13336480139796178, + "learning_rate": 2.6291150081603212e-08, + "loss": 1.8867, + "step": 8015 + }, + { + "epoch": 0.9939889694490921, + "grad_norm": 0.14304602235765015, + "learning_rate": 2.156471591426845e-08, + "loss": 1.8926, + "step": 8020 + }, + { + "epoch": 0.9946086633203197, + "grad_norm": 0.13292064745465249, + "learning_rate": 1.7306181748566463e-08, + "loss": 1.8152, + "step": 8025 + }, + { + "epoch": 0.9952283571915473, + "grad_norm": 0.1350953647523571, + "learning_rate": 1.351556751445493e-08, + "loss": 1.8414, + "step": 8030 + }, + { + "epoch": 0.995848051062775, + "grad_norm": 0.14228963587482882, + "learning_rate": 1.0192890952054334e-08, + "loss": 1.802, + "step": 8035 + }, + { + "epoch": 0.9964677449340026, + "grad_norm": 0.14132019863120235, + "learning_rate": 7.338167611536939e-09, + "loss": 1.7988, + "step": 8040 + }, + { + "epoch": 0.9970874388052302, + "grad_norm": 0.13716626276276406, + "learning_rate": 4.9514108530157585e-09, + "loss": 1.7957, + "step": 8045 + }, + { + "epoch": 0.9977071326764578, + "grad_norm": 0.14215685681688475, + "learning_rate": 3.0326318465334625e-09, + "loss": 1.8348, + "step": 8050 + }, + { + "epoch": 0.9983268265476855, + "grad_norm": 0.13642854353006256, + "learning_rate": 1.5818395720068602e-09, + "loss": 1.7578, + "step": 8055 + }, + { + "epoch": 0.9989465204189131, + "grad_norm": 0.1342584967064569, + "learning_rate": 5.99040819149188e-10, + "loss": 1.8578, + "step": 8060 + }, + { + "epoch": 0.9995662142901407, + "grad_norm": 0.13380846893093876, + "learning_rate": 8.42401874701082e-11, + "loss": 1.8281, + "step": 8065 + }, + { + "epoch": 0.9999380306128772, + "eval_loss": 1.7868006229400635, + "eval_runtime": 207.8873, + "eval_samples_per_second": 68.686, + "eval_steps_per_second": 4.296, + "step": 8068 + }, + { + "epoch": 0.9999380306128772, + "step": 8068, + "total_flos": 4248917998829568.0, + "train_loss": 1.863956661347298, + "train_runtime": 6909.389, + "train_samples_per_second": 18.684, + "train_steps_per_second": 1.168 + } + ], + "logging_steps": 5, + "max_steps": 8068, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4248917998829568.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}