diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.7508519500189322, + "epoch": 1.0, "eval_steps": 661, - "global_step": 1983, + "global_step": 2641, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -13920,6 +13920,4612 @@ "eval_samples_per_second": 4.958, "eval_steps_per_second": 1.24, "step": 1983 + }, + { + "epoch": 0.751230594471791, + "grad_norm": 12.873611450195312, + "learning_rate": 2.943779944470404e-05, + "loss": 1.5529, + "step": 1984 + }, + { + "epoch": 0.7516092389246497, + "grad_norm": 11.92525577545166, + "learning_rate": 2.9352917064632112e-05, + "loss": 0.7601, + "step": 1985 + }, + { + "epoch": 0.7519878833775085, + "grad_norm": 12.782543182373047, + "learning_rate": 2.926813618312134e-05, + "loss": 1.086, + "step": 1986 + }, + { + "epoch": 0.7523665278303673, + "grad_norm": 10.654260635375977, + "learning_rate": 2.9183456921976304e-05, + "loss": 1.0646, + "step": 1987 + }, + { + "epoch": 0.752745172283226, + "grad_norm": 11.402060508728027, + "learning_rate": 2.909887940285554e-05, + "loss": 0.82, + "step": 1988 + }, + { + "epoch": 0.7531238167360849, + "grad_norm": 15.131863594055176, + "learning_rate": 2.901440374727149e-05, + "loss": 1.0551, + "step": 1989 + }, + { + "epoch": 0.7535024611889436, + "grad_norm": 12.485123634338379, + "learning_rate": 2.8930030076590198e-05, + "loss": 0.7528, + "step": 1990 + }, + { + "epoch": 0.7538811056418023, + "grad_norm": 7.999950885772705, + "learning_rate": 2.8845758512031186e-05, + "loss": 0.4973, + "step": 1991 + }, + { + "epoch": 0.7542597500946611, + "grad_norm": 13.8598051071167, + "learning_rate": 2.876158917466726e-05, + "loss": 0.8543, + "step": 1992 + }, + { + "epoch": 0.7546383945475199, + "grad_norm": 13.514735221862793, + "learning_rate": 2.867752218542443e-05, + "loss": 0.9004, + "step": 1993 + }, + { + "epoch": 0.7550170390003786, + "grad_norm": 13.104480743408203, + "learning_rate": 2.8593557665081616e-05, + "loss": 0.8655, + "step": 1994 + }, + { + "epoch": 0.7553956834532374, + "grad_norm": 13.811482429504395, + "learning_rate": 2.8509695734270492e-05, + "loss": 0.5821, + "step": 1995 + }, + { + "epoch": 0.7557743279060962, + "grad_norm": 13.989906311035156, + "learning_rate": 2.8425936513475395e-05, + "loss": 0.6422, + "step": 1996 + }, + { + "epoch": 0.756152972358955, + "grad_norm": 10.727309226989746, + "learning_rate": 2.834228012303306e-05, + "loss": 0.4677, + "step": 1997 + }, + { + "epoch": 0.7565316168118137, + "grad_norm": 15.157230377197266, + "learning_rate": 2.8258726683132474e-05, + "loss": 0.8738, + "step": 1998 + }, + { + "epoch": 0.7569102612646724, + "grad_norm": 17.531665802001953, + "learning_rate": 2.8175276313814813e-05, + "loss": 0.7335, + "step": 1999 + }, + { + "epoch": 0.7572889057175313, + "grad_norm": 10.896133422851562, + "learning_rate": 2.809192913497306e-05, + "loss": 0.5936, + "step": 2000 + }, + { + "epoch": 0.75766755017039, + "grad_norm": 10.433149337768555, + "learning_rate": 2.8008685266351988e-05, + "loss": 2.4051, + "step": 2001 + }, + { + "epoch": 0.7580461946232487, + "grad_norm": 9.504281044006348, + "learning_rate": 2.7925544827547933e-05, + "loss": 1.7767, + "step": 2002 + }, + { + "epoch": 0.7584248390761076, + "grad_norm": 10.231252670288086, + "learning_rate": 2.7842507938008666e-05, + "loss": 1.8186, + "step": 2003 + }, + { + "epoch": 0.7588034835289663, + "grad_norm": 9.87753677368164, + "learning_rate": 2.775957471703311e-05, + "loss": 1.2146, + "step": 2004 + }, + { + "epoch": 0.759182127981825, + "grad_norm": 11.625948905944824, + "learning_rate": 2.7676745283771388e-05, + "loss": 1.5717, + "step": 2005 + }, + { + "epoch": 0.7595607724346838, + "grad_norm": 11.169037818908691, + "learning_rate": 2.7594019757224364e-05, + "loss": 1.3143, + "step": 2006 + }, + { + "epoch": 0.7599394168875426, + "grad_norm": 9.829212188720703, + "learning_rate": 2.7511398256243716e-05, + "loss": 0.9765, + "step": 2007 + }, + { + "epoch": 0.7603180613404014, + "grad_norm": 10.827011108398438, + "learning_rate": 2.7428880899531585e-05, + "loss": 0.7852, + "step": 2008 + }, + { + "epoch": 0.7606967057932601, + "grad_norm": 12.02198314666748, + "learning_rate": 2.7346467805640585e-05, + "loss": 1.0464, + "step": 2009 + }, + { + "epoch": 0.7610753502461189, + "grad_norm": 12.59335708618164, + "learning_rate": 2.7264159092973484e-05, + "loss": 0.8952, + "step": 2010 + }, + { + "epoch": 0.7614539946989777, + "grad_norm": 13.947036743164062, + "learning_rate": 2.718195487978308e-05, + "loss": 1.4524, + "step": 2011 + }, + { + "epoch": 0.7618326391518364, + "grad_norm": 12.672119140625, + "learning_rate": 2.7099855284172017e-05, + "loss": 1.3051, + "step": 2012 + }, + { + "epoch": 0.7622112836046951, + "grad_norm": 10.371209144592285, + "learning_rate": 2.7017860424092712e-05, + "loss": 0.7874, + "step": 2013 + }, + { + "epoch": 0.762589928057554, + "grad_norm": 12.310194969177246, + "learning_rate": 2.6935970417347057e-05, + "loss": 0.9649, + "step": 2014 + }, + { + "epoch": 0.7629685725104127, + "grad_norm": 10.61960506439209, + "learning_rate": 2.6854185381586273e-05, + "loss": 0.8684, + "step": 2015 + }, + { + "epoch": 0.7633472169632715, + "grad_norm": 14.664178848266602, + "learning_rate": 2.6772505434310803e-05, + "loss": 1.0722, + "step": 2016 + }, + { + "epoch": 0.7637258614161303, + "grad_norm": 8.189818382263184, + "learning_rate": 2.6690930692870143e-05, + "loss": 0.4872, + "step": 2017 + }, + { + "epoch": 0.764104505868989, + "grad_norm": 11.075437545776367, + "learning_rate": 2.6609461274462588e-05, + "loss": 0.7672, + "step": 2018 + }, + { + "epoch": 0.7644831503218478, + "grad_norm": 10.562186241149902, + "learning_rate": 2.6528097296135135e-05, + "loss": 0.727, + "step": 2019 + }, + { + "epoch": 0.7648617947747065, + "grad_norm": 13.31433391571045, + "learning_rate": 2.6446838874783254e-05, + "loss": 0.7146, + "step": 2020 + }, + { + "epoch": 0.7652404392275653, + "grad_norm": 10.966352462768555, + "learning_rate": 2.636568612715087e-05, + "loss": 0.3659, + "step": 2021 + }, + { + "epoch": 0.7656190836804241, + "grad_norm": 23.072904586791992, + "learning_rate": 2.6284639169829973e-05, + "loss": 0.6862, + "step": 2022 + }, + { + "epoch": 0.7659977281332828, + "grad_norm": 10.244979858398438, + "learning_rate": 2.6203698119260632e-05, + "loss": 0.3511, + "step": 2023 + }, + { + "epoch": 0.7663763725861417, + "grad_norm": 11.128731727600098, + "learning_rate": 2.6122863091730686e-05, + "loss": 0.329, + "step": 2024 + }, + { + "epoch": 0.7667550170390004, + "grad_norm": 34.793052673339844, + "learning_rate": 2.6042134203375767e-05, + "loss": 1.7021, + "step": 2025 + }, + { + "epoch": 0.7671336614918591, + "grad_norm": 10.744704246520996, + "learning_rate": 2.596151157017892e-05, + "loss": 2.497, + "step": 2026 + }, + { + "epoch": 0.7675123059447179, + "grad_norm": 11.219593048095703, + "learning_rate": 2.588099530797058e-05, + "loss": 2.192, + "step": 2027 + }, + { + "epoch": 0.7678909503975767, + "grad_norm": 10.790613174438477, + "learning_rate": 2.580058553242829e-05, + "loss": 1.1766, + "step": 2028 + }, + { + "epoch": 0.7682695948504354, + "grad_norm": 9.38663101196289, + "learning_rate": 2.572028235907673e-05, + "loss": 1.0091, + "step": 2029 + }, + { + "epoch": 0.7686482393032942, + "grad_norm": 12.048510551452637, + "learning_rate": 2.5640085903287313e-05, + "loss": 1.4117, + "step": 2030 + }, + { + "epoch": 0.769026883756153, + "grad_norm": 14.081696510314941, + "learning_rate": 2.5559996280278196e-05, + "loss": 1.3464, + "step": 2031 + }, + { + "epoch": 0.7694055282090118, + "grad_norm": 12.886058807373047, + "learning_rate": 2.548001360511396e-05, + "loss": 1.6561, + "step": 2032 + }, + { + "epoch": 0.7697841726618705, + "grad_norm": 10.942986488342285, + "learning_rate": 2.5400137992705686e-05, + "loss": 1.01, + "step": 2033 + }, + { + "epoch": 0.7701628171147292, + "grad_norm": 10.077105522155762, + "learning_rate": 2.5320369557810496e-05, + "loss": 0.8631, + "step": 2034 + }, + { + "epoch": 0.7705414615675881, + "grad_norm": 16.22908592224121, + "learning_rate": 2.52407084150316e-05, + "loss": 2.036, + "step": 2035 + }, + { + "epoch": 0.7709201060204468, + "grad_norm": 12.806578636169434, + "learning_rate": 2.516115467881801e-05, + "loss": 1.5009, + "step": 2036 + }, + { + "epoch": 0.7712987504733055, + "grad_norm": 8.642051696777344, + "learning_rate": 2.5081708463464525e-05, + "loss": 0.5311, + "step": 2037 + }, + { + "epoch": 0.7716773949261644, + "grad_norm": 9.904001235961914, + "learning_rate": 2.5002369883111375e-05, + "loss": 0.8588, + "step": 2038 + }, + { + "epoch": 0.7720560393790231, + "grad_norm": 11.078864097595215, + "learning_rate": 2.492313905174418e-05, + "loss": 0.6772, + "step": 2039 + }, + { + "epoch": 0.7724346838318819, + "grad_norm": 14.320399284362793, + "learning_rate": 2.4844016083193745e-05, + "loss": 1.6373, + "step": 2040 + }, + { + "epoch": 0.7728133282847406, + "grad_norm": 12.595783233642578, + "learning_rate": 2.4765001091135965e-05, + "loss": 0.7032, + "step": 2041 + }, + { + "epoch": 0.7731919727375994, + "grad_norm": 11.836488723754883, + "learning_rate": 2.4686094189091548e-05, + "loss": 0.4463, + "step": 2042 + }, + { + "epoch": 0.7735706171904582, + "grad_norm": 9.597789764404297, + "learning_rate": 2.460729549042592e-05, + "loss": 0.4927, + "step": 2043 + }, + { + "epoch": 0.7739492616433169, + "grad_norm": 8.733850479125977, + "learning_rate": 2.4528605108349044e-05, + "loss": 0.4759, + "step": 2044 + }, + { + "epoch": 0.7743279060961757, + "grad_norm": 11.219993591308594, + "learning_rate": 2.4450023155915304e-05, + "loss": 0.7328, + "step": 2045 + }, + { + "epoch": 0.7747065505490345, + "grad_norm": 27.845932006835938, + "learning_rate": 2.4371549746023214e-05, + "loss": 1.4339, + "step": 2046 + }, + { + "epoch": 0.7750851950018932, + "grad_norm": 10.190560340881348, + "learning_rate": 2.4293184991415496e-05, + "loss": 0.3212, + "step": 2047 + }, + { + "epoch": 0.775463839454752, + "grad_norm": 16.55055809020996, + "learning_rate": 2.4214929004678644e-05, + "loss": 0.726, + "step": 2048 + }, + { + "epoch": 0.7758424839076108, + "grad_norm": 19.725399017333984, + "learning_rate": 2.41367818982429e-05, + "loss": 0.6447, + "step": 2049 + }, + { + "epoch": 0.7762211283604695, + "grad_norm": 3.192269802093506, + "learning_rate": 2.405874378438212e-05, + "loss": 0.1195, + "step": 2050 + }, + { + "epoch": 0.7765997728133283, + "grad_norm": 9.078378677368164, + "learning_rate": 2.3980814775213546e-05, + "loss": 2.2485, + "step": 2051 + }, + { + "epoch": 0.7769784172661871, + "grad_norm": 12.071084976196289, + "learning_rate": 2.3902994982697625e-05, + "loss": 2.0394, + "step": 2052 + }, + { + "epoch": 0.7773570617190458, + "grad_norm": 11.375093460083008, + "learning_rate": 2.3825284518638026e-05, + "loss": 1.6176, + "step": 2053 + }, + { + "epoch": 0.7777357061719046, + "grad_norm": 10.902304649353027, + "learning_rate": 2.3747683494681193e-05, + "loss": 1.3598, + "step": 2054 + }, + { + "epoch": 0.7781143506247633, + "grad_norm": 9.987029075622559, + "learning_rate": 2.367019202231644e-05, + "loss": 0.9933, + "step": 2055 + }, + { + "epoch": 0.7784929950776222, + "grad_norm": 12.458052635192871, + "learning_rate": 2.3592810212875615e-05, + "loss": 1.0712, + "step": 2056 + }, + { + "epoch": 0.7788716395304809, + "grad_norm": 13.190988540649414, + "learning_rate": 2.351553817753309e-05, + "loss": 0.9407, + "step": 2057 + }, + { + "epoch": 0.7792502839833396, + "grad_norm": 13.746674537658691, + "learning_rate": 2.3438376027305486e-05, + "loss": 1.8128, + "step": 2058 + }, + { + "epoch": 0.7796289284361985, + "grad_norm": 12.625304222106934, + "learning_rate": 2.336132387305152e-05, + "loss": 1.0643, + "step": 2059 + }, + { + "epoch": 0.7800075728890572, + "grad_norm": 9.5991792678833, + "learning_rate": 2.32843818254719e-05, + "loss": 0.9507, + "step": 2060 + }, + { + "epoch": 0.7803862173419159, + "grad_norm": 11.39356517791748, + "learning_rate": 2.3207549995109213e-05, + "loss": 0.8554, + "step": 2061 + }, + { + "epoch": 0.7807648617947747, + "grad_norm": 10.822458267211914, + "learning_rate": 2.3130828492347613e-05, + "loss": 0.7323, + "step": 2062 + }, + { + "epoch": 0.7811435062476335, + "grad_norm": 14.810583114624023, + "learning_rate": 2.305421742741275e-05, + "loss": 1.0549, + "step": 2063 + }, + { + "epoch": 0.7815221507004922, + "grad_norm": 14.234593391418457, + "learning_rate": 2.2977716910371617e-05, + "loss": 0.8004, + "step": 2064 + }, + { + "epoch": 0.781900795153351, + "grad_norm": 9.462289810180664, + "learning_rate": 2.2901327051132436e-05, + "loss": 0.6268, + "step": 2065 + }, + { + "epoch": 0.7822794396062097, + "grad_norm": 9.984630584716797, + "learning_rate": 2.2825047959444402e-05, + "loss": 0.5759, + "step": 2066 + }, + { + "epoch": 0.7826580840590686, + "grad_norm": 8.960870742797852, + "learning_rate": 2.2748879744897566e-05, + "loss": 0.478, + "step": 2067 + }, + { + "epoch": 0.7830367285119273, + "grad_norm": 12.594315528869629, + "learning_rate": 2.2672822516922664e-05, + "loss": 0.7478, + "step": 2068 + }, + { + "epoch": 0.783415372964786, + "grad_norm": 11.013339042663574, + "learning_rate": 2.2596876384791044e-05, + "loss": 0.4485, + "step": 2069 + }, + { + "epoch": 0.7837940174176449, + "grad_norm": 14.870506286621094, + "learning_rate": 2.25210414576144e-05, + "loss": 1.0851, + "step": 2070 + }, + { + "epoch": 0.7841726618705036, + "grad_norm": 11.4357271194458, + "learning_rate": 2.2445317844344648e-05, + "loss": 0.6047, + "step": 2071 + }, + { + "epoch": 0.7845513063233623, + "grad_norm": 14.853757858276367, + "learning_rate": 2.2369705653773765e-05, + "loss": 0.696, + "step": 2072 + }, + { + "epoch": 0.7849299507762211, + "grad_norm": 19.04551124572754, + "learning_rate": 2.2294204994533728e-05, + "loss": 0.5838, + "step": 2073 + }, + { + "epoch": 0.7853085952290799, + "grad_norm": 21.33306312561035, + "learning_rate": 2.2218815975096207e-05, + "loss": 0.232, + "step": 2074 + }, + { + "epoch": 0.7856872396819387, + "grad_norm": 7.215750694274902, + "learning_rate": 2.2143538703772493e-05, + "loss": 0.3659, + "step": 2075 + }, + { + "epoch": 0.7860658841347974, + "grad_norm": 10.023611068725586, + "learning_rate": 2.2068373288713294e-05, + "loss": 1.963, + "step": 2076 + }, + { + "epoch": 0.7864445285876562, + "grad_norm": 11.151053428649902, + "learning_rate": 2.1993319837908722e-05, + "loss": 1.8424, + "step": 2077 + }, + { + "epoch": 0.786823173040515, + "grad_norm": 10.198956489562988, + "learning_rate": 2.1918378459187928e-05, + "loss": 1.7327, + "step": 2078 + }, + { + "epoch": 0.7872018174933737, + "grad_norm": 13.809074401855469, + "learning_rate": 2.1843549260219075e-05, + "loss": 1.5646, + "step": 2079 + }, + { + "epoch": 0.7875804619462324, + "grad_norm": 10.254709243774414, + "learning_rate": 2.176883234850914e-05, + "loss": 0.9571, + "step": 2080 + }, + { + "epoch": 0.7879591063990913, + "grad_norm": 9.651457786560059, + "learning_rate": 2.1694227831403868e-05, + "loss": 1.0734, + "step": 2081 + }, + { + "epoch": 0.78833775085195, + "grad_norm": 12.739899635314941, + "learning_rate": 2.1619735816087417e-05, + "loss": 1.1163, + "step": 2082 + }, + { + "epoch": 0.7887163953048087, + "grad_norm": 9.453640937805176, + "learning_rate": 2.154535640958235e-05, + "loss": 0.7136, + "step": 2083 + }, + { + "epoch": 0.7890950397576676, + "grad_norm": 11.674582481384277, + "learning_rate": 2.147108971874946e-05, + "loss": 1.197, + "step": 2084 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 11.435443878173828, + "learning_rate": 2.1396935850287615e-05, + "loss": 0.8756, + "step": 2085 + }, + { + "epoch": 0.7898523286633851, + "grad_norm": 11.762517929077148, + "learning_rate": 2.1322894910733547e-05, + "loss": 0.8406, + "step": 2086 + }, + { + "epoch": 0.7902309731162438, + "grad_norm": 11.67284107208252, + "learning_rate": 2.1248967006461783e-05, + "loss": 1.0792, + "step": 2087 + }, + { + "epoch": 0.7906096175691026, + "grad_norm": 11.377511978149414, + "learning_rate": 2.117515224368438e-05, + "loss": 0.8553, + "step": 2088 + }, + { + "epoch": 0.7909882620219614, + "grad_norm": 13.04310131072998, + "learning_rate": 2.110145072845099e-05, + "loss": 0.9179, + "step": 2089 + }, + { + "epoch": 0.7913669064748201, + "grad_norm": 8.919587135314941, + "learning_rate": 2.1027862566648425e-05, + "loss": 0.415, + "step": 2090 + }, + { + "epoch": 0.791745550927679, + "grad_norm": 14.16692066192627, + "learning_rate": 2.095438786400068e-05, + "loss": 0.6586, + "step": 2091 + }, + { + "epoch": 0.7921241953805377, + "grad_norm": 9.801429748535156, + "learning_rate": 2.0881026726068775e-05, + "loss": 0.5952, + "step": 2092 + }, + { + "epoch": 0.7925028398333964, + "grad_norm": 14.237808227539062, + "learning_rate": 2.0807779258250537e-05, + "loss": 0.5729, + "step": 2093 + }, + { + "epoch": 0.7928814842862552, + "grad_norm": 10.763073921203613, + "learning_rate": 2.073464556578051e-05, + "loss": 0.634, + "step": 2094 + }, + { + "epoch": 0.793260128739114, + "grad_norm": 14.481758117675781, + "learning_rate": 2.0661625753729707e-05, + "loss": 0.6255, + "step": 2095 + }, + { + "epoch": 0.7936387731919727, + "grad_norm": 12.738697052001953, + "learning_rate": 2.058871992700567e-05, + "loss": 0.4656, + "step": 2096 + }, + { + "epoch": 0.7940174176448315, + "grad_norm": 9.8129243850708, + "learning_rate": 2.0515928190352052e-05, + "loss": 0.352, + "step": 2097 + }, + { + "epoch": 0.7943960620976903, + "grad_norm": 7.001706123352051, + "learning_rate": 2.0443250648348645e-05, + "loss": 0.1885, + "step": 2098 + }, + { + "epoch": 0.794774706550549, + "grad_norm": 10.005912780761719, + "learning_rate": 2.037068740541116e-05, + "loss": 0.4027, + "step": 2099 + }, + { + "epoch": 0.7951533510034078, + "grad_norm": 29.922603607177734, + "learning_rate": 2.0298238565791072e-05, + "loss": 0.9724, + "step": 2100 + }, + { + "epoch": 0.7955319954562665, + "grad_norm": 9.894120216369629, + "learning_rate": 2.0225904233575586e-05, + "loss": 2.2812, + "step": 2101 + }, + { + "epoch": 0.7959106399091254, + "grad_norm": 10.518170356750488, + "learning_rate": 2.0153684512687297e-05, + "loss": 1.9124, + "step": 2102 + }, + { + "epoch": 0.7962892843619841, + "grad_norm": 11.77073860168457, + "learning_rate": 2.0081579506884184e-05, + "loss": 1.714, + "step": 2103 + }, + { + "epoch": 0.7966679288148428, + "grad_norm": 10.411770820617676, + "learning_rate": 2.0009589319759358e-05, + "loss": 1.084, + "step": 2104 + }, + { + "epoch": 0.7970465732677017, + "grad_norm": 10.7216796875, + "learning_rate": 1.9937714054741095e-05, + "loss": 1.2028, + "step": 2105 + }, + { + "epoch": 0.7974252177205604, + "grad_norm": 12.313328742980957, + "learning_rate": 1.9865953815092443e-05, + "loss": 1.3527, + "step": 2106 + }, + { + "epoch": 0.7978038621734191, + "grad_norm": 11.44083023071289, + "learning_rate": 1.9794308703911223e-05, + "loss": 1.308, + "step": 2107 + }, + { + "epoch": 0.7981825066262779, + "grad_norm": 7.195469856262207, + "learning_rate": 1.972277882412985e-05, + "loss": 0.6015, + "step": 2108 + }, + { + "epoch": 0.7985611510791367, + "grad_norm": 9.268226623535156, + "learning_rate": 1.965136427851525e-05, + "loss": 0.713, + "step": 2109 + }, + { + "epoch": 0.7989397955319955, + "grad_norm": 11.629197120666504, + "learning_rate": 1.958006516966857e-05, + "loss": 1.1342, + "step": 2110 + }, + { + "epoch": 0.7993184399848542, + "grad_norm": 9.284026145935059, + "learning_rate": 1.950888160002513e-05, + "loss": 0.652, + "step": 2111 + }, + { + "epoch": 0.799697084437713, + "grad_norm": 11.595834732055664, + "learning_rate": 1.9437813671854243e-05, + "loss": 0.7744, + "step": 2112 + }, + { + "epoch": 0.8000757288905718, + "grad_norm": 12.920877456665039, + "learning_rate": 1.9366861487259134e-05, + "loss": 0.7837, + "step": 2113 + }, + { + "epoch": 0.8004543733434305, + "grad_norm": 11.606154441833496, + "learning_rate": 1.92960251481767e-05, + "loss": 0.7631, + "step": 2114 + }, + { + "epoch": 0.8008330177962892, + "grad_norm": 11.07241153717041, + "learning_rate": 1.9225304756377394e-05, + "loss": 0.6119, + "step": 2115 + }, + { + "epoch": 0.8012116622491481, + "grad_norm": 9.720746994018555, + "learning_rate": 1.9154700413465077e-05, + "loss": 0.5343, + "step": 2116 + }, + { + "epoch": 0.8015903067020068, + "grad_norm": 13.459938049316406, + "learning_rate": 1.9084212220876942e-05, + "loss": 0.8311, + "step": 2117 + }, + { + "epoch": 0.8019689511548656, + "grad_norm": 19.92449188232422, + "learning_rate": 1.9013840279883267e-05, + "loss": 1.5106, + "step": 2118 + }, + { + "epoch": 0.8023475956077244, + "grad_norm": 12.470542907714844, + "learning_rate": 1.8943584691587313e-05, + "loss": 0.5453, + "step": 2119 + }, + { + "epoch": 0.8027262400605831, + "grad_norm": 11.011085510253906, + "learning_rate": 1.887344555692515e-05, + "loss": 0.9123, + "step": 2120 + }, + { + "epoch": 0.8031048845134419, + "grad_norm": 9.609660148620605, + "learning_rate": 1.880342297666563e-05, + "loss": 0.3907, + "step": 2121 + }, + { + "epoch": 0.8034835289663006, + "grad_norm": 18.808673858642578, + "learning_rate": 1.8733517051410054e-05, + "loss": 0.7887, + "step": 2122 + }, + { + "epoch": 0.8038621734191594, + "grad_norm": 11.743169784545898, + "learning_rate": 1.8663727881592176e-05, + "loss": 0.6305, + "step": 2123 + }, + { + "epoch": 0.8042408178720182, + "grad_norm": 15.70638656616211, + "learning_rate": 1.8594055567477965e-05, + "loss": 0.2316, + "step": 2124 + }, + { + "epoch": 0.8046194623248769, + "grad_norm": 18.647750854492188, + "learning_rate": 1.8524500209165573e-05, + "loss": 0.2247, + "step": 2125 + }, + { + "epoch": 0.8049981067777358, + "grad_norm": 11.47064208984375, + "learning_rate": 1.8455061906585068e-05, + "loss": 2.2448, + "step": 2126 + }, + { + "epoch": 0.8053767512305945, + "grad_norm": 11.74728775024414, + "learning_rate": 1.838574075949836e-05, + "loss": 1.9097, + "step": 2127 + }, + { + "epoch": 0.8057553956834532, + "grad_norm": 10.916732788085938, + "learning_rate": 1.8316536867499013e-05, + "loss": 1.5175, + "step": 2128 + }, + { + "epoch": 0.806134040136312, + "grad_norm": 12.84622573852539, + "learning_rate": 1.8247450330012206e-05, + "loss": 1.2506, + "step": 2129 + }, + { + "epoch": 0.8065126845891708, + "grad_norm": 11.161234855651855, + "learning_rate": 1.8178481246294433e-05, + "loss": 1.2847, + "step": 2130 + }, + { + "epoch": 0.8068913290420295, + "grad_norm": 11.394064903259277, + "learning_rate": 1.8109629715433497e-05, + "loss": 1.2467, + "step": 2131 + }, + { + "epoch": 0.8072699734948883, + "grad_norm": 14.243141174316406, + "learning_rate": 1.804089583634825e-05, + "loss": 1.1874, + "step": 2132 + }, + { + "epoch": 0.8076486179477471, + "grad_norm": 9.28836727142334, + "learning_rate": 1.7972279707788608e-05, + "loss": 0.9387, + "step": 2133 + }, + { + "epoch": 0.8080272624006059, + "grad_norm": 8.111406326293945, + "learning_rate": 1.790378142833524e-05, + "loss": 0.538, + "step": 2134 + }, + { + "epoch": 0.8084059068534646, + "grad_norm": 12.397017478942871, + "learning_rate": 1.783540109639953e-05, + "loss": 0.8542, + "step": 2135 + }, + { + "epoch": 0.8087845513063233, + "grad_norm": 13.441441535949707, + "learning_rate": 1.776713881022337e-05, + "loss": 0.6562, + "step": 2136 + }, + { + "epoch": 0.8091631957591822, + "grad_norm": 15.309789657592773, + "learning_rate": 1.769899466787913e-05, + "loss": 1.038, + "step": 2137 + }, + { + "epoch": 0.8095418402120409, + "grad_norm": 9.063993453979492, + "learning_rate": 1.7630968767269396e-05, + "loss": 0.6075, + "step": 2138 + }, + { + "epoch": 0.8099204846648996, + "grad_norm": 13.524983406066895, + "learning_rate": 1.7563061206126875e-05, + "loss": 1.1014, + "step": 2139 + }, + { + "epoch": 0.8102991291177585, + "grad_norm": 10.966734886169434, + "learning_rate": 1.7495272082014235e-05, + "loss": 0.633, + "step": 2140 + }, + { + "epoch": 0.8106777735706172, + "grad_norm": 13.029152870178223, + "learning_rate": 1.742760149232404e-05, + "loss": 0.7217, + "step": 2141 + }, + { + "epoch": 0.811056418023476, + "grad_norm": 10.429238319396973, + "learning_rate": 1.736004953427852e-05, + "loss": 0.7237, + "step": 2142 + }, + { + "epoch": 0.8114350624763347, + "grad_norm": 10.06002140045166, + "learning_rate": 1.7292616304929454e-05, + "loss": 0.5242, + "step": 2143 + }, + { + "epoch": 0.8118137069291935, + "grad_norm": 12.623034477233887, + "learning_rate": 1.7225301901158097e-05, + "loss": 0.679, + "step": 2144 + }, + { + "epoch": 0.8121923513820523, + "grad_norm": 11.331562995910645, + "learning_rate": 1.7158106419674956e-05, + "loss": 0.5334, + "step": 2145 + }, + { + "epoch": 0.812570995834911, + "grad_norm": 5.492001533508301, + "learning_rate": 1.7091029957019656e-05, + "loss": 0.2017, + "step": 2146 + }, + { + "epoch": 0.8129496402877698, + "grad_norm": 15.369951248168945, + "learning_rate": 1.702407260956087e-05, + "loss": 0.6085, + "step": 2147 + }, + { + "epoch": 0.8133282847406286, + "grad_norm": 14.828356742858887, + "learning_rate": 1.6957234473496087e-05, + "loss": 0.5092, + "step": 2148 + }, + { + "epoch": 0.8137069291934873, + "grad_norm": 9.543940544128418, + "learning_rate": 1.6890515644851612e-05, + "loss": 0.283, + "step": 2149 + }, + { + "epoch": 0.814085573646346, + "grad_norm": 10.726631164550781, + "learning_rate": 1.6823916219482273e-05, + "loss": 0.5966, + "step": 2150 + }, + { + "epoch": 0.8144642180992049, + "grad_norm": 8.761448860168457, + "learning_rate": 1.6757436293071362e-05, + "loss": 1.9309, + "step": 2151 + }, + { + "epoch": 0.8148428625520636, + "grad_norm": 8.565874099731445, + "learning_rate": 1.6691075961130452e-05, + "loss": 1.2893, + "step": 2152 + }, + { + "epoch": 0.8152215070049224, + "grad_norm": 9.82201099395752, + "learning_rate": 1.662483531899941e-05, + "loss": 1.4838, + "step": 2153 + }, + { + "epoch": 0.8156001514577812, + "grad_norm": 14.917886734008789, + "learning_rate": 1.6558714461846025e-05, + "loss": 1.4838, + "step": 2154 + }, + { + "epoch": 0.8159787959106399, + "grad_norm": 10.092191696166992, + "learning_rate": 1.6492713484666057e-05, + "loss": 1.0222, + "step": 2155 + }, + { + "epoch": 0.8163574403634987, + "grad_norm": 10.887097358703613, + "learning_rate": 1.6426832482282973e-05, + "loss": 1.1809, + "step": 2156 + }, + { + "epoch": 0.8167360848163574, + "grad_norm": 9.894757270812988, + "learning_rate": 1.636107154934796e-05, + "loss": 1.2049, + "step": 2157 + }, + { + "epoch": 0.8171147292692162, + "grad_norm": 12.424516677856445, + "learning_rate": 1.629543078033964e-05, + "loss": 0.8421, + "step": 2158 + }, + { + "epoch": 0.817493373722075, + "grad_norm": 8.398805618286133, + "learning_rate": 1.622991026956401e-05, + "loss": 0.7292, + "step": 2159 + }, + { + "epoch": 0.8178720181749337, + "grad_norm": 7.698610305786133, + "learning_rate": 1.616451011115426e-05, + "loss": 0.6156, + "step": 2160 + }, + { + "epoch": 0.8182506626277926, + "grad_norm": 12.355453491210938, + "learning_rate": 1.6099230399070763e-05, + "loss": 0.7348, + "step": 2161 + }, + { + "epoch": 0.8186293070806513, + "grad_norm": 9.122222900390625, + "learning_rate": 1.6034071227100755e-05, + "loss": 0.6662, + "step": 2162 + }, + { + "epoch": 0.81900795153351, + "grad_norm": 13.555700302124023, + "learning_rate": 1.596903268885832e-05, + "loss": 0.7399, + "step": 2163 + }, + { + "epoch": 0.8193865959863688, + "grad_norm": 10.907391548156738, + "learning_rate": 1.5904114877784205e-05, + "loss": 0.5784, + "step": 2164 + }, + { + "epoch": 0.8197652404392276, + "grad_norm": 10.284018516540527, + "learning_rate": 1.5839317887145798e-05, + "loss": 0.7392, + "step": 2165 + }, + { + "epoch": 0.8201438848920863, + "grad_norm": 11.968799591064453, + "learning_rate": 1.5774641810036793e-05, + "loss": 0.776, + "step": 2166 + }, + { + "epoch": 0.8205225293449451, + "grad_norm": 8.554612159729004, + "learning_rate": 1.571008673937724e-05, + "loss": 0.3263, + "step": 2167 + }, + { + "epoch": 0.8209011737978038, + "grad_norm": 12.886919021606445, + "learning_rate": 1.5645652767913287e-05, + "loss": 0.9284, + "step": 2168 + }, + { + "epoch": 0.8212798182506627, + "grad_norm": 13.11758041381836, + "learning_rate": 1.5581339988217157e-05, + "loss": 0.926, + "step": 2169 + }, + { + "epoch": 0.8216584627035214, + "grad_norm": 7.769589900970459, + "learning_rate": 1.5517148492686918e-05, + "loss": 0.4201, + "step": 2170 + }, + { + "epoch": 0.8220371071563801, + "grad_norm": 11.231391906738281, + "learning_rate": 1.5453078373546405e-05, + "loss": 0.4707, + "step": 2171 + }, + { + "epoch": 0.822415751609239, + "grad_norm": 10.800396919250488, + "learning_rate": 1.538912972284502e-05, + "loss": 0.3822, + "step": 2172 + }, + { + "epoch": 0.8227943960620977, + "grad_norm": 4.893832683563232, + "learning_rate": 1.532530263245776e-05, + "loss": 0.2115, + "step": 2173 + }, + { + "epoch": 0.8231730405149564, + "grad_norm": 14.236945152282715, + "learning_rate": 1.5261597194084876e-05, + "loss": 0.3221, + "step": 2174 + }, + { + "epoch": 0.8235516849678152, + "grad_norm": 26.767309188842773, + "learning_rate": 1.5198013499251895e-05, + "loss": 1.1176, + "step": 2175 + }, + { + "epoch": 0.823930329420674, + "grad_norm": 10.102524757385254, + "learning_rate": 1.513455163930938e-05, + "loss": 2.0341, + "step": 2176 + }, + { + "epoch": 0.8243089738735327, + "grad_norm": 10.047316551208496, + "learning_rate": 1.5071211705432953e-05, + "loss": 1.598, + "step": 2177 + }, + { + "epoch": 0.8246876183263915, + "grad_norm": 11.753214836120605, + "learning_rate": 1.5007993788622977e-05, + "loss": 1.2507, + "step": 2178 + }, + { + "epoch": 0.8250662627792503, + "grad_norm": 9.349088668823242, + "learning_rate": 1.4944897979704531e-05, + "loss": 1.216, + "step": 2179 + }, + { + "epoch": 0.8254449072321091, + "grad_norm": 8.784585952758789, + "learning_rate": 1.4881924369327261e-05, + "loss": 0.7352, + "step": 2180 + }, + { + "epoch": 0.8258235516849678, + "grad_norm": 11.812366485595703, + "learning_rate": 1.4819073047965304e-05, + "loss": 0.9288, + "step": 2181 + }, + { + "epoch": 0.8262021961378265, + "grad_norm": 8.885299682617188, + "learning_rate": 1.475634410591703e-05, + "loss": 0.8133, + "step": 2182 + }, + { + "epoch": 0.8265808405906854, + "grad_norm": 14.608964920043945, + "learning_rate": 1.4693737633305038e-05, + "loss": 1.073, + "step": 2183 + }, + { + "epoch": 0.8269594850435441, + "grad_norm": 12.234383583068848, + "learning_rate": 1.463125372007591e-05, + "loss": 1.1562, + "step": 2184 + }, + { + "epoch": 0.8273381294964028, + "grad_norm": 14.701723098754883, + "learning_rate": 1.456889245600026e-05, + "loss": 1.4302, + "step": 2185 + }, + { + "epoch": 0.8277167739492617, + "grad_norm": 8.882140159606934, + "learning_rate": 1.4506653930672387e-05, + "loss": 0.6384, + "step": 2186 + }, + { + "epoch": 0.8280954184021204, + "grad_norm": 9.586640357971191, + "learning_rate": 1.4444538233510296e-05, + "loss": 0.545, + "step": 2187 + }, + { + "epoch": 0.8284740628549792, + "grad_norm": 8.397727012634277, + "learning_rate": 1.4382545453755524e-05, + "loss": 0.4565, + "step": 2188 + }, + { + "epoch": 0.8288527073078379, + "grad_norm": 13.309687614440918, + "learning_rate": 1.4320675680472995e-05, + "loss": 0.9385, + "step": 2189 + }, + { + "epoch": 0.8292313517606967, + "grad_norm": 12.719165802001953, + "learning_rate": 1.4258929002550925e-05, + "loss": 0.8029, + "step": 2190 + }, + { + "epoch": 0.8296099962135555, + "grad_norm": 11.410460472106934, + "learning_rate": 1.4197305508700665e-05, + "loss": 0.618, + "step": 2191 + }, + { + "epoch": 0.8299886406664142, + "grad_norm": 11.324352264404297, + "learning_rate": 1.4135805287456638e-05, + "loss": 0.543, + "step": 2192 + }, + { + "epoch": 0.830367285119273, + "grad_norm": 11.747859954833984, + "learning_rate": 1.407442842717609e-05, + "loss": 0.575, + "step": 2193 + }, + { + "epoch": 0.8307459295721318, + "grad_norm": 9.637734413146973, + "learning_rate": 1.4013175016039082e-05, + "loss": 0.3842, + "step": 2194 + }, + { + "epoch": 0.8311245740249905, + "grad_norm": 10.062605857849121, + "learning_rate": 1.3952045142048287e-05, + "loss": 0.398, + "step": 2195 + }, + { + "epoch": 0.8315032184778492, + "grad_norm": 9.133951187133789, + "learning_rate": 1.3891038893028897e-05, + "loss": 0.3874, + "step": 2196 + }, + { + "epoch": 0.8318818629307081, + "grad_norm": 11.680546760559082, + "learning_rate": 1.3830156356628531e-05, + "loss": 0.5123, + "step": 2197 + }, + { + "epoch": 0.8322605073835668, + "grad_norm": 9.764351844787598, + "learning_rate": 1.3769397620317038e-05, + "loss": 0.5171, + "step": 2198 + }, + { + "epoch": 0.8326391518364256, + "grad_norm": 14.688189506530762, + "learning_rate": 1.3708762771386386e-05, + "loss": 0.2875, + "step": 2199 + }, + { + "epoch": 0.8330177962892844, + "grad_norm": 25.08920669555664, + "learning_rate": 1.364825189695056e-05, + "loss": 0.6495, + "step": 2200 + }, + { + "epoch": 0.8333964407421431, + "grad_norm": 9.983552932739258, + "learning_rate": 1.3587865083945483e-05, + "loss": 1.9564, + "step": 2201 + }, + { + "epoch": 0.8337750851950019, + "grad_norm": 10.196476936340332, + "learning_rate": 1.3527602419128793e-05, + "loss": 1.8545, + "step": 2202 + }, + { + "epoch": 0.8341537296478606, + "grad_norm": 10.242213249206543, + "learning_rate": 1.3467463989079764e-05, + "loss": 1.3327, + "step": 2203 + }, + { + "epoch": 0.8345323741007195, + "grad_norm": 11.082919120788574, + "learning_rate": 1.3407449880199175e-05, + "loss": 1.2277, + "step": 2204 + }, + { + "epoch": 0.8349110185535782, + "grad_norm": 14.372567176818848, + "learning_rate": 1.334756017870924e-05, + "loss": 1.2489, + "step": 2205 + }, + { + "epoch": 0.8352896630064369, + "grad_norm": 9.300726890563965, + "learning_rate": 1.328779497065339e-05, + "loss": 0.6838, + "step": 2206 + }, + { + "epoch": 0.8356683074592958, + "grad_norm": 10.5520658493042, + "learning_rate": 1.3228154341896225e-05, + "loss": 0.9467, + "step": 2207 + }, + { + "epoch": 0.8360469519121545, + "grad_norm": 11.126348495483398, + "learning_rate": 1.316863837812331e-05, + "loss": 0.9089, + "step": 2208 + }, + { + "epoch": 0.8364255963650132, + "grad_norm": 14.790655136108398, + "learning_rate": 1.3109247164841199e-05, + "loss": 0.6592, + "step": 2209 + }, + { + "epoch": 0.836804240817872, + "grad_norm": 10.294766426086426, + "learning_rate": 1.3049980787377126e-05, + "loss": 0.5883, + "step": 2210 + }, + { + "epoch": 0.8371828852707308, + "grad_norm": 14.918768882751465, + "learning_rate": 1.2990839330879024e-05, + "loss": 0.8069, + "step": 2211 + }, + { + "epoch": 0.8375615297235895, + "grad_norm": 11.903068542480469, + "learning_rate": 1.2931822880315303e-05, + "loss": 1.0219, + "step": 2212 + }, + { + "epoch": 0.8379401741764483, + "grad_norm": 11.20407772064209, + "learning_rate": 1.2872931520474873e-05, + "loss": 0.5456, + "step": 2213 + }, + { + "epoch": 0.8383188186293071, + "grad_norm": 11.456269264221191, + "learning_rate": 1.281416533596682e-05, + "loss": 0.5934, + "step": 2214 + }, + { + "epoch": 0.8386974630821659, + "grad_norm": 13.103646278381348, + "learning_rate": 1.2755524411220455e-05, + "loss": 1.0718, + "step": 2215 + }, + { + "epoch": 0.8390761075350246, + "grad_norm": 11.291081428527832, + "learning_rate": 1.269700883048508e-05, + "loss": 0.7964, + "step": 2216 + }, + { + "epoch": 0.8394547519878833, + "grad_norm": 9.637993812561035, + "learning_rate": 1.2638618677829983e-05, + "loss": 0.5809, + "step": 2217 + }, + { + "epoch": 0.8398333964407422, + "grad_norm": 17.41292953491211, + "learning_rate": 1.2580354037144194e-05, + "loss": 0.8842, + "step": 2218 + }, + { + "epoch": 0.8402120408936009, + "grad_norm": 7.992457866668701, + "learning_rate": 1.2522214992136449e-05, + "loss": 0.3386, + "step": 2219 + }, + { + "epoch": 0.8405906853464596, + "grad_norm": 8.05163288116455, + "learning_rate": 1.2464201626334982e-05, + "loss": 0.467, + "step": 2220 + }, + { + "epoch": 0.8409693297993185, + "grad_norm": 7.449150562286377, + "learning_rate": 1.2406314023087584e-05, + "loss": 0.3405, + "step": 2221 + }, + { + "epoch": 0.8413479742521772, + "grad_norm": 8.64903736114502, + "learning_rate": 1.2348552265561242e-05, + "loss": 0.3131, + "step": 2222 + }, + { + "epoch": 0.841726618705036, + "grad_norm": 11.01096248626709, + "learning_rate": 1.2290916436742205e-05, + "loss": 0.377, + "step": 2223 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 9.92861270904541, + "learning_rate": 1.223340661943576e-05, + "loss": 0.2074, + "step": 2224 + }, + { + "epoch": 0.8424839076107535, + "grad_norm": 8.516009330749512, + "learning_rate": 1.2176022896266214e-05, + "loss": 0.1966, + "step": 2225 + }, + { + "epoch": 0.8428625520636123, + "grad_norm": 13.25462818145752, + "learning_rate": 1.2118765349676664e-05, + "loss": 2.6535, + "step": 2226 + }, + { + "epoch": 0.843241196516471, + "grad_norm": 12.067037582397461, + "learning_rate": 1.206163406192895e-05, + "loss": 1.7434, + "step": 2227 + }, + { + "epoch": 0.8436198409693298, + "grad_norm": 11.304587364196777, + "learning_rate": 1.2004629115103471e-05, + "loss": 1.6441, + "step": 2228 + }, + { + "epoch": 0.8439984854221886, + "grad_norm": 9.689388275146484, + "learning_rate": 1.1947750591099206e-05, + "loss": 0.9401, + "step": 2229 + }, + { + "epoch": 0.8443771298750473, + "grad_norm": 13.32154655456543, + "learning_rate": 1.1890998571633427e-05, + "loss": 1.0714, + "step": 2230 + }, + { + "epoch": 0.844755774327906, + "grad_norm": 11.89297103881836, + "learning_rate": 1.1834373138241672e-05, + "loss": 1.1308, + "step": 2231 + }, + { + "epoch": 0.8451344187807649, + "grad_norm": 11.245408058166504, + "learning_rate": 1.1777874372277597e-05, + "loss": 0.7964, + "step": 2232 + }, + { + "epoch": 0.8455130632336236, + "grad_norm": 8.075695991516113, + "learning_rate": 1.1721502354912939e-05, + "loss": 0.5496, + "step": 2233 + }, + { + "epoch": 0.8458917076864824, + "grad_norm": 13.818008422851562, + "learning_rate": 1.1665257167137289e-05, + "loss": 0.8236, + "step": 2234 + }, + { + "epoch": 0.8462703521393412, + "grad_norm": 10.466621398925781, + "learning_rate": 1.1609138889757998e-05, + "loss": 0.7633, + "step": 2235 + }, + { + "epoch": 0.8466489965921999, + "grad_norm": 10.655759811401367, + "learning_rate": 1.1553147603400139e-05, + "loss": 0.7656, + "step": 2236 + }, + { + "epoch": 0.8470276410450587, + "grad_norm": 11.960763931274414, + "learning_rate": 1.1497283388506285e-05, + "loss": 0.7411, + "step": 2237 + }, + { + "epoch": 0.8474062854979174, + "grad_norm": 10.754295349121094, + "learning_rate": 1.1441546325336505e-05, + "loss": 0.6966, + "step": 2238 + }, + { + "epoch": 0.8477849299507763, + "grad_norm": 10.447911262512207, + "learning_rate": 1.1385936493968108e-05, + "loss": 0.5975, + "step": 2239 + }, + { + "epoch": 0.848163574403635, + "grad_norm": 7.784918308258057, + "learning_rate": 1.1330453974295708e-05, + "loss": 0.5167, + "step": 2240 + }, + { + "epoch": 0.8485422188564937, + "grad_norm": 7.070992946624756, + "learning_rate": 1.127509884603095e-05, + "loss": 0.4106, + "step": 2241 + }, + { + "epoch": 0.8489208633093526, + "grad_norm": 10.769533157348633, + "learning_rate": 1.1219871188702447e-05, + "loss": 0.6198, + "step": 2242 + }, + { + "epoch": 0.8492995077622113, + "grad_norm": 11.375531196594238, + "learning_rate": 1.1164771081655712e-05, + "loss": 0.4514, + "step": 2243 + }, + { + "epoch": 0.84967815221507, + "grad_norm": 10.334220886230469, + "learning_rate": 1.1109798604052957e-05, + "loss": 0.239, + "step": 2244 + }, + { + "epoch": 0.8500567966679288, + "grad_norm": 5.8680315017700195, + "learning_rate": 1.1054953834873095e-05, + "loss": 0.1725, + "step": 2245 + }, + { + "epoch": 0.8504354411207876, + "grad_norm": 11.254388809204102, + "learning_rate": 1.1000236852911527e-05, + "loss": 0.4195, + "step": 2246 + }, + { + "epoch": 0.8508140855736464, + "grad_norm": 10.101336479187012, + "learning_rate": 1.0945647736780052e-05, + "loss": 0.3015, + "step": 2247 + }, + { + "epoch": 0.8511927300265051, + "grad_norm": 10.285369873046875, + "learning_rate": 1.0891186564906742e-05, + "loss": 0.2216, + "step": 2248 + }, + { + "epoch": 0.8515713744793639, + "grad_norm": 10.76842212677002, + "learning_rate": 1.083685341553593e-05, + "loss": 0.2561, + "step": 2249 + }, + { + "epoch": 0.8519500189322227, + "grad_norm": 24.11894989013672, + "learning_rate": 1.0782648366727965e-05, + "loss": 0.5513, + "step": 2250 + }, + { + "epoch": 0.8523286633850814, + "grad_norm": 11.82591438293457, + "learning_rate": 1.072857149635914e-05, + "loss": 2.4543, + "step": 2251 + }, + { + "epoch": 0.8527073078379401, + "grad_norm": 9.538484573364258, + "learning_rate": 1.067462288212162e-05, + "loss": 1.1095, + "step": 2252 + }, + { + "epoch": 0.853085952290799, + "grad_norm": 11.511687278747559, + "learning_rate": 1.0620802601523316e-05, + "loss": 1.5651, + "step": 2253 + }, + { + "epoch": 0.8534645967436577, + "grad_norm": 11.207715034484863, + "learning_rate": 1.0567110731887742e-05, + "loss": 0.9555, + "step": 2254 + }, + { + "epoch": 0.8538432411965164, + "grad_norm": 10.063400268554688, + "learning_rate": 1.0513547350353936e-05, + "loss": 1.0487, + "step": 2255 + }, + { + "epoch": 0.8542218856493753, + "grad_norm": 11.091261863708496, + "learning_rate": 1.0460112533876287e-05, + "loss": 1.0456, + "step": 2256 + }, + { + "epoch": 0.854600530102234, + "grad_norm": 11.866201400756836, + "learning_rate": 1.0406806359224574e-05, + "loss": 1.1276, + "step": 2257 + }, + { + "epoch": 0.8549791745550928, + "grad_norm": 12.267317771911621, + "learning_rate": 1.035362890298368e-05, + "loss": 0.8773, + "step": 2258 + }, + { + "epoch": 0.8553578190079515, + "grad_norm": 12.725680351257324, + "learning_rate": 1.030058024155357e-05, + "loss": 0.8035, + "step": 2259 + }, + { + "epoch": 0.8557364634608103, + "grad_norm": 11.86599063873291, + "learning_rate": 1.0247660451149166e-05, + "loss": 0.9379, + "step": 2260 + }, + { + "epoch": 0.8561151079136691, + "grad_norm": 13.769980430603027, + "learning_rate": 1.0194869607800305e-05, + "loss": 0.9552, + "step": 2261 + }, + { + "epoch": 0.8564937523665278, + "grad_norm": 10.183456420898438, + "learning_rate": 1.0142207787351465e-05, + "loss": 0.7875, + "step": 2262 + }, + { + "epoch": 0.8568723968193867, + "grad_norm": 13.380812644958496, + "learning_rate": 1.0089675065461834e-05, + "loss": 0.6164, + "step": 2263 + }, + { + "epoch": 0.8572510412722454, + "grad_norm": 9.049885749816895, + "learning_rate": 1.0037271517605063e-05, + "loss": 0.64, + "step": 2264 + }, + { + "epoch": 0.8576296857251041, + "grad_norm": 12.912697792053223, + "learning_rate": 9.984997219069304e-06, + "loss": 0.6789, + "step": 2265 + }, + { + "epoch": 0.8580083301779629, + "grad_norm": 9.801331520080566, + "learning_rate": 9.932852244956936e-06, + "loss": 0.4367, + "step": 2266 + }, + { + "epoch": 0.8583869746308217, + "grad_norm": 16.610326766967773, + "learning_rate": 9.880836670184567e-06, + "loss": 0.6217, + "step": 2267 + }, + { + "epoch": 0.8587656190836804, + "grad_norm": 10.286986351013184, + "learning_rate": 9.828950569482875e-06, + "loss": 0.4884, + "step": 2268 + }, + { + "epoch": 0.8591442635365392, + "grad_norm": 11.326078414916992, + "learning_rate": 9.777194017396595e-06, + "loss": 0.7175, + "step": 2269 + }, + { + "epoch": 0.859522907989398, + "grad_norm": 11.866266250610352, + "learning_rate": 9.72556708828427e-06, + "loss": 0.5466, + "step": 2270 + }, + { + "epoch": 0.8599015524422567, + "grad_norm": 13.79624080657959, + "learning_rate": 9.674069856318224e-06, + "loss": 0.5976, + "step": 2271 + }, + { + "epoch": 0.8602801968951155, + "grad_norm": 8.457572937011719, + "learning_rate": 9.622702395484451e-06, + "loss": 0.2856, + "step": 2272 + }, + { + "epoch": 0.8606588413479742, + "grad_norm": 17.572168350219727, + "learning_rate": 9.571464779582529e-06, + "loss": 0.2946, + "step": 2273 + }, + { + "epoch": 0.8610374858008331, + "grad_norm": 16.828079223632812, + "learning_rate": 9.52035708222545e-06, + "loss": 0.7045, + "step": 2274 + }, + { + "epoch": 0.8614161302536918, + "grad_norm": 19.846851348876953, + "learning_rate": 9.469379376839582e-06, + "loss": 0.1168, + "step": 2275 + }, + { + "epoch": 0.8617947747065505, + "grad_norm": 10.008796691894531, + "learning_rate": 9.418531736664483e-06, + "loss": 2.1273, + "step": 2276 + }, + { + "epoch": 0.8621734191594093, + "grad_norm": 10.289381980895996, + "learning_rate": 9.367814234752937e-06, + "loss": 1.7055, + "step": 2277 + }, + { + "epoch": 0.8625520636122681, + "grad_norm": 11.30695629119873, + "learning_rate": 9.31722694397067e-06, + "loss": 1.472, + "step": 2278 + }, + { + "epoch": 0.8629307080651268, + "grad_norm": 8.154560089111328, + "learning_rate": 9.266769936996389e-06, + "loss": 0.9404, + "step": 2279 + }, + { + "epoch": 0.8633093525179856, + "grad_norm": 10.423186302185059, + "learning_rate": 9.216443286321586e-06, + "loss": 1.3779, + "step": 2280 + }, + { + "epoch": 0.8636879969708444, + "grad_norm": 11.121103286743164, + "learning_rate": 9.166247064250477e-06, + "loss": 0.5472, + "step": 2281 + }, + { + "epoch": 0.8640666414237032, + "grad_norm": 9.69433307647705, + "learning_rate": 9.116181342899932e-06, + "loss": 0.7709, + "step": 2282 + }, + { + "epoch": 0.8644452858765619, + "grad_norm": 9.098278045654297, + "learning_rate": 9.06624619419928e-06, + "loss": 0.4906, + "step": 2283 + }, + { + "epoch": 0.8648239303294206, + "grad_norm": 14.784154891967773, + "learning_rate": 9.016441689890286e-06, + "loss": 0.8121, + "step": 2284 + }, + { + "epoch": 0.8652025747822795, + "grad_norm": 9.00545597076416, + "learning_rate": 8.966767901527007e-06, + "loss": 0.5983, + "step": 2285 + }, + { + "epoch": 0.8655812192351382, + "grad_norm": 10.185830116271973, + "learning_rate": 8.917224900475695e-06, + "loss": 0.8618, + "step": 2286 + }, + { + "epoch": 0.8659598636879969, + "grad_norm": 13.381327629089355, + "learning_rate": 8.867812757914694e-06, + "loss": 1.0504, + "step": 2287 + }, + { + "epoch": 0.8663385081408558, + "grad_norm": 14.24845027923584, + "learning_rate": 8.818531544834385e-06, + "loss": 0.7165, + "step": 2288 + }, + { + "epoch": 0.8667171525937145, + "grad_norm": 12.008301734924316, + "learning_rate": 8.76938133203702e-06, + "loss": 0.6329, + "step": 2289 + }, + { + "epoch": 0.8670957970465732, + "grad_norm": 8.582221031188965, + "learning_rate": 8.720362190136611e-06, + "loss": 0.5233, + "step": 2290 + }, + { + "epoch": 0.867474441499432, + "grad_norm": 14.074053764343262, + "learning_rate": 8.671474189558903e-06, + "loss": 0.8896, + "step": 2291 + }, + { + "epoch": 0.8678530859522908, + "grad_norm": 8.265854835510254, + "learning_rate": 8.622717400541192e-06, + "loss": 0.423, + "step": 2292 + }, + { + "epoch": 0.8682317304051496, + "grad_norm": 14.022979736328125, + "learning_rate": 8.57409189313233e-06, + "loss": 0.5092, + "step": 2293 + }, + { + "epoch": 0.8686103748580083, + "grad_norm": 7.6086106300354, + "learning_rate": 8.525597737192481e-06, + "loss": 0.3162, + "step": 2294 + }, + { + "epoch": 0.8689890193108671, + "grad_norm": 10.01663875579834, + "learning_rate": 8.477235002393147e-06, + "loss": 0.363, + "step": 2295 + }, + { + "epoch": 0.8693676637637259, + "grad_norm": 10.2891206741333, + "learning_rate": 8.429003758216959e-06, + "loss": 0.4843, + "step": 2296 + }, + { + "epoch": 0.8697463082165846, + "grad_norm": 10.89184284210205, + "learning_rate": 8.380904073957729e-06, + "loss": 0.6032, + "step": 2297 + }, + { + "epoch": 0.8701249526694433, + "grad_norm": 11.962915420532227, + "learning_rate": 8.332936018720171e-06, + "loss": 0.4728, + "step": 2298 + }, + { + "epoch": 0.8705035971223022, + "grad_norm": 11.58560562133789, + "learning_rate": 8.285099661419926e-06, + "loss": 0.4367, + "step": 2299 + }, + { + "epoch": 0.8708822415751609, + "grad_norm": 18.291141510009766, + "learning_rate": 8.237395070783404e-06, + "loss": 0.651, + "step": 2300 + }, + { + "epoch": 0.8712608860280197, + "grad_norm": 11.406949996948242, + "learning_rate": 8.189822315347762e-06, + "loss": 2.1475, + "step": 2301 + }, + { + "epoch": 0.8716395304808785, + "grad_norm": 10.053242683410645, + "learning_rate": 8.14238146346068e-06, + "loss": 1.4275, + "step": 2302 + }, + { + "epoch": 0.8720181749337372, + "grad_norm": 9.507181167602539, + "learning_rate": 8.09507258328036e-06, + "loss": 1.1072, + "step": 2303 + }, + { + "epoch": 0.872396819386596, + "grad_norm": 10.980565071105957, + "learning_rate": 8.04789574277538e-06, + "loss": 1.0862, + "step": 2304 + }, + { + "epoch": 0.8727754638394547, + "grad_norm": 10.014975547790527, + "learning_rate": 8.000851009724696e-06, + "loss": 0.8774, + "step": 2305 + }, + { + "epoch": 0.8731541082923135, + "grad_norm": 10.634843826293945, + "learning_rate": 7.95393845171737e-06, + "loss": 0.7953, + "step": 2306 + }, + { + "epoch": 0.8735327527451723, + "grad_norm": 12.149109840393066, + "learning_rate": 7.907158136152604e-06, + "loss": 0.8086, + "step": 2307 + }, + { + "epoch": 0.873911397198031, + "grad_norm": 9.334737777709961, + "learning_rate": 7.860510130239607e-06, + "loss": 0.5934, + "step": 2308 + }, + { + "epoch": 0.8742900416508899, + "grad_norm": 13.120880126953125, + "learning_rate": 7.813994500997524e-06, + "loss": 1.0779, + "step": 2309 + }, + { + "epoch": 0.8746686861037486, + "grad_norm": 10.691913604736328, + "learning_rate": 7.767611315255275e-06, + "loss": 0.9073, + "step": 2310 + }, + { + "epoch": 0.8750473305566073, + "grad_norm": 10.472341537475586, + "learning_rate": 7.72136063965152e-06, + "loss": 0.4952, + "step": 2311 + }, + { + "epoch": 0.8754259750094661, + "grad_norm": 10.533929824829102, + "learning_rate": 7.67524254063452e-06, + "loss": 0.7114, + "step": 2312 + }, + { + "epoch": 0.8758046194623249, + "grad_norm": 6.5337042808532715, + "learning_rate": 7.6292570844621045e-06, + "loss": 0.3483, + "step": 2313 + }, + { + "epoch": 0.8761832639151836, + "grad_norm": 8.839234352111816, + "learning_rate": 7.583404337201516e-06, + "loss": 0.5599, + "step": 2314 + }, + { + "epoch": 0.8765619083680424, + "grad_norm": 10.337873458862305, + "learning_rate": 7.5376843647293024e-06, + "loss": 0.5017, + "step": 2315 + }, + { + "epoch": 0.8769405528209012, + "grad_norm": 8.032730102539062, + "learning_rate": 7.4920972327312875e-06, + "loss": 0.4731, + "step": 2316 + }, + { + "epoch": 0.87731919727376, + "grad_norm": 8.474766731262207, + "learning_rate": 7.446643006702469e-06, + "loss": 0.3606, + "step": 2317 + }, + { + "epoch": 0.8776978417266187, + "grad_norm": 10.331032752990723, + "learning_rate": 7.4013217519468325e-06, + "loss": 0.21, + "step": 2318 + }, + { + "epoch": 0.8780764861794774, + "grad_norm": 15.5275297164917, + "learning_rate": 7.356133533577369e-06, + "loss": 0.4189, + "step": 2319 + }, + { + "epoch": 0.8784551306323363, + "grad_norm": 8.125926971435547, + "learning_rate": 7.311078416515926e-06, + "loss": 0.1247, + "step": 2320 + }, + { + "epoch": 0.878833775085195, + "grad_norm": 10.590972900390625, + "learning_rate": 7.266156465493124e-06, + "loss": 0.3929, + "step": 2321 + }, + { + "epoch": 0.8792124195380537, + "grad_norm": 6.178790092468262, + "learning_rate": 7.221367745048279e-06, + "loss": 0.2636, + "step": 2322 + }, + { + "epoch": 0.8795910639909126, + "grad_norm": 16.32522201538086, + "learning_rate": 7.1767123195292666e-06, + "loss": 0.4226, + "step": 2323 + }, + { + "epoch": 0.8799697084437713, + "grad_norm": 7.7940850257873535, + "learning_rate": 7.132190253092452e-06, + "loss": 0.1687, + "step": 2324 + }, + { + "epoch": 0.88034835289663, + "grad_norm": 10.466033935546875, + "learning_rate": 7.08780160970266e-06, + "loss": 0.4764, + "step": 2325 + }, + { + "epoch": 0.8807269973494888, + "grad_norm": 12.871282577514648, + "learning_rate": 7.043546453132977e-06, + "loss": 2.4303, + "step": 2326 + }, + { + "epoch": 0.8811056418023476, + "grad_norm": 11.891122817993164, + "learning_rate": 6.99942484696472e-06, + "loss": 1.6218, + "step": 2327 + }, + { + "epoch": 0.8814842862552064, + "grad_norm": 8.916067123413086, + "learning_rate": 6.955436854587327e-06, + "loss": 0.8128, + "step": 2328 + }, + { + "epoch": 0.8818629307080651, + "grad_norm": 8.550481796264648, + "learning_rate": 6.9115825391982806e-06, + "loss": 0.8219, + "step": 2329 + }, + { + "epoch": 0.8822415751609239, + "grad_norm": 10.76794719696045, + "learning_rate": 6.867861963803035e-06, + "loss": 0.7949, + "step": 2330 + }, + { + "epoch": 0.8826202196137827, + "grad_norm": 8.747878074645996, + "learning_rate": 6.824275191214868e-06, + "loss": 0.7019, + "step": 2331 + }, + { + "epoch": 0.8829988640666414, + "grad_norm": 11.711833953857422, + "learning_rate": 6.780822284054833e-06, + "loss": 0.7592, + "step": 2332 + }, + { + "epoch": 0.8833775085195001, + "grad_norm": 13.456823348999023, + "learning_rate": 6.7375033047516464e-06, + "loss": 0.9945, + "step": 2333 + }, + { + "epoch": 0.883756152972359, + "grad_norm": 8.14212703704834, + "learning_rate": 6.694318315541637e-06, + "loss": 0.6043, + "step": 2334 + }, + { + "epoch": 0.8841347974252177, + "grad_norm": 8.493631362915039, + "learning_rate": 6.651267378468584e-06, + "loss": 0.5508, + "step": 2335 + }, + { + "epoch": 0.8845134418780765, + "grad_norm": 10.33297061920166, + "learning_rate": 6.608350555383758e-06, + "loss": 0.6154, + "step": 2336 + }, + { + "epoch": 0.8848920863309353, + "grad_norm": 10.296257019042969, + "learning_rate": 6.565567907945658e-06, + "loss": 0.6082, + "step": 2337 + }, + { + "epoch": 0.885270730783794, + "grad_norm": 9.22463321685791, + "learning_rate": 6.522919497620073e-06, + "loss": 0.6438, + "step": 2338 + }, + { + "epoch": 0.8856493752366528, + "grad_norm": 11.198843002319336, + "learning_rate": 6.480405385679888e-06, + "loss": 0.7403, + "step": 2339 + }, + { + "epoch": 0.8860280196895115, + "grad_norm": 10.24124526977539, + "learning_rate": 6.43802563320508e-06, + "loss": 0.565, + "step": 2340 + }, + { + "epoch": 0.8864066641423703, + "grad_norm": 11.018882751464844, + "learning_rate": 6.395780301082577e-06, + "loss": 0.5413, + "step": 2341 + }, + { + "epoch": 0.8867853085952291, + "grad_norm": 10.295462608337402, + "learning_rate": 6.353669450006194e-06, + "loss": 0.4042, + "step": 2342 + }, + { + "epoch": 0.8871639530480878, + "grad_norm": 13.353784561157227, + "learning_rate": 6.3116931404765265e-06, + "loss": 0.5982, + "step": 2343 + }, + { + "epoch": 0.8875425975009467, + "grad_norm": 11.05883502960205, + "learning_rate": 6.269851432800855e-06, + "loss": 0.5592, + "step": 2344 + }, + { + "epoch": 0.8879212419538054, + "grad_norm": 14.049494743347168, + "learning_rate": 6.228144387093127e-06, + "loss": 0.4035, + "step": 2345 + }, + { + "epoch": 0.8882998864066641, + "grad_norm": 12.806116104125977, + "learning_rate": 6.1865720632737875e-06, + "loss": 0.514, + "step": 2346 + }, + { + "epoch": 0.8886785308595229, + "grad_norm": 9.267263412475586, + "learning_rate": 6.145134521069729e-06, + "loss": 0.5728, + "step": 2347 + }, + { + "epoch": 0.8890571753123817, + "grad_norm": 9.22109603881836, + "learning_rate": 6.103831820014194e-06, + "loss": 0.288, + "step": 2348 + }, + { + "epoch": 0.8894358197652404, + "grad_norm": 11.868034362792969, + "learning_rate": 6.062664019446751e-06, + "loss": 0.2524, + "step": 2349 + }, + { + "epoch": 0.8898144642180992, + "grad_norm": 9.9324951171875, + "learning_rate": 6.021631178513087e-06, + "loss": 0.4626, + "step": 2350 + }, + { + "epoch": 0.890193108670958, + "grad_norm": 11.608717918395996, + "learning_rate": 5.9807333561650355e-06, + "loss": 1.7653, + "step": 2351 + }, + { + "epoch": 0.8905717531238168, + "grad_norm": 10.697403907775879, + "learning_rate": 5.939970611160428e-06, + "loss": 1.5739, + "step": 2352 + }, + { + "epoch": 0.8909503975766755, + "grad_norm": 10.519599914550781, + "learning_rate": 5.899343002063063e-06, + "loss": 1.4995, + "step": 2353 + }, + { + "epoch": 0.8913290420295342, + "grad_norm": 12.994054794311523, + "learning_rate": 5.858850587242559e-06, + "loss": 1.0682, + "step": 2354 + }, + { + "epoch": 0.8917076864823931, + "grad_norm": 12.894742012023926, + "learning_rate": 5.818493424874294e-06, + "loss": 1.2836, + "step": 2355 + }, + { + "epoch": 0.8920863309352518, + "grad_norm": 12.090742111206055, + "learning_rate": 5.778271572939354e-06, + "loss": 1.4098, + "step": 2356 + }, + { + "epoch": 0.8924649753881105, + "grad_norm": 8.5830717086792, + "learning_rate": 5.738185089224424e-06, + "loss": 0.6956, + "step": 2357 + }, + { + "epoch": 0.8928436198409694, + "grad_norm": 9.2327880859375, + "learning_rate": 5.698234031321692e-06, + "loss": 0.4974, + "step": 2358 + }, + { + "epoch": 0.8932222642938281, + "grad_norm": 12.713973999023438, + "learning_rate": 5.658418456628778e-06, + "loss": 0.6973, + "step": 2359 + }, + { + "epoch": 0.8936009087466869, + "grad_norm": 8.313766479492188, + "learning_rate": 5.618738422348646e-06, + "loss": 0.4688, + "step": 2360 + }, + { + "epoch": 0.8939795531995456, + "grad_norm": 11.878710746765137, + "learning_rate": 5.579193985489584e-06, + "loss": 0.6091, + "step": 2361 + }, + { + "epoch": 0.8943581976524044, + "grad_norm": 9.968457221984863, + "learning_rate": 5.5397852028649996e-06, + "loss": 0.7507, + "step": 2362 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 9.140604972839355, + "learning_rate": 5.500512131093438e-06, + "loss": 0.4721, + "step": 2363 + }, + { + "epoch": 0.8951154865581219, + "grad_norm": 9.671671867370605, + "learning_rate": 5.461374826598453e-06, + "loss": 0.5719, + "step": 2364 + }, + { + "epoch": 0.8954941310109807, + "grad_norm": 14.937457084655762, + "learning_rate": 5.422373345608578e-06, + "loss": 0.5815, + "step": 2365 + }, + { + "epoch": 0.8958727754638395, + "grad_norm": 12.879511833190918, + "learning_rate": 5.383507744157179e-06, + "loss": 0.5853, + "step": 2366 + }, + { + "epoch": 0.8962514199166982, + "grad_norm": 10.898216247558594, + "learning_rate": 5.344778078082391e-06, + "loss": 0.4625, + "step": 2367 + }, + { + "epoch": 0.896630064369557, + "grad_norm": 14.941044807434082, + "learning_rate": 5.306184403027059e-06, + "loss": 0.6827, + "step": 2368 + }, + { + "epoch": 0.8970087088224158, + "grad_norm": 14.840867042541504, + "learning_rate": 5.267726774438697e-06, + "loss": 0.5103, + "step": 2369 + }, + { + "epoch": 0.8973873532752745, + "grad_norm": 4.7114577293396, + "learning_rate": 5.229405247569308e-06, + "loss": 0.1327, + "step": 2370 + }, + { + "epoch": 0.8977659977281333, + "grad_norm": 9.676827430725098, + "learning_rate": 5.191219877475373e-06, + "loss": 0.4447, + "step": 2371 + }, + { + "epoch": 0.8981446421809921, + "grad_norm": 7.863070964813232, + "learning_rate": 5.153170719017741e-06, + "loss": 0.2609, + "step": 2372 + }, + { + "epoch": 0.8985232866338508, + "grad_norm": 8.543060302734375, + "learning_rate": 5.115257826861619e-06, + "loss": 0.2568, + "step": 2373 + }, + { + "epoch": 0.8989019310867096, + "grad_norm": 11.265084266662598, + "learning_rate": 5.077481255476368e-06, + "loss": 0.2466, + "step": 2374 + }, + { + "epoch": 0.8992805755395683, + "grad_norm": 20.728683471679688, + "learning_rate": 5.039841059135553e-06, + "loss": 1.1415, + "step": 2375 + }, + { + "epoch": 0.8996592199924272, + "grad_norm": 10.630395889282227, + "learning_rate": 5.002337291916792e-06, + "loss": 1.858, + "step": 2376 + }, + { + "epoch": 0.9000378644452859, + "grad_norm": 10.02651596069336, + "learning_rate": 4.9649700077016635e-06, + "loss": 1.4943, + "step": 2377 + }, + { + "epoch": 0.9004165088981446, + "grad_norm": 11.16691780090332, + "learning_rate": 4.927739260175735e-06, + "loss": 1.3521, + "step": 2378 + }, + { + "epoch": 0.9007951533510034, + "grad_norm": 10.301544189453125, + "learning_rate": 4.8906451028283285e-06, + "loss": 0.8017, + "step": 2379 + }, + { + "epoch": 0.9011737978038622, + "grad_norm": 10.982889175415039, + "learning_rate": 4.853687588952594e-06, + "loss": 0.7324, + "step": 2380 + }, + { + "epoch": 0.9015524422567209, + "grad_norm": 10.89743423461914, + "learning_rate": 4.816866771645323e-06, + "loss": 0.9049, + "step": 2381 + }, + { + "epoch": 0.9019310867095797, + "grad_norm": 10.184319496154785, + "learning_rate": 4.7801827038069234e-06, + "loss": 0.7181, + "step": 2382 + }, + { + "epoch": 0.9023097311624385, + "grad_norm": 7.678170680999756, + "learning_rate": 4.7436354381413476e-06, + "loss": 0.4933, + "step": 2383 + }, + { + "epoch": 0.9026883756152972, + "grad_norm": 8.026887893676758, + "learning_rate": 4.707225027156015e-06, + "loss": 0.5746, + "step": 2384 + }, + { + "epoch": 0.903067020068156, + "grad_norm": 11.845023155212402, + "learning_rate": 4.670951523161693e-06, + "loss": 0.537, + "step": 2385 + }, + { + "epoch": 0.9034456645210147, + "grad_norm": 12.805367469787598, + "learning_rate": 4.634814978272473e-06, + "loss": 0.6111, + "step": 2386 + }, + { + "epoch": 0.9038243089738736, + "grad_norm": 10.827343940734863, + "learning_rate": 4.598815444405691e-06, + "loss": 0.7474, + "step": 2387 + }, + { + "epoch": 0.9042029534267323, + "grad_norm": 12.786199569702148, + "learning_rate": 4.5629529732817864e-06, + "loss": 0.8605, + "step": 2388 + }, + { + "epoch": 0.904581597879591, + "grad_norm": 7.475493907928467, + "learning_rate": 4.527227616424368e-06, + "loss": 0.3846, + "step": 2389 + }, + { + "epoch": 0.9049602423324499, + "grad_norm": 16.920345306396484, + "learning_rate": 4.491639425159988e-06, + "loss": 0.8434, + "step": 2390 + }, + { + "epoch": 0.9053388867853086, + "grad_norm": 13.1689453125, + "learning_rate": 4.4561884506181266e-06, + "loss": 0.5591, + "step": 2391 + }, + { + "epoch": 0.9057175312381673, + "grad_norm": 13.537805557250977, + "learning_rate": 4.420874743731163e-06, + "loss": 0.6376, + "step": 2392 + }, + { + "epoch": 0.9060961756910261, + "grad_norm": 12.794946670532227, + "learning_rate": 4.385698355234258e-06, + "loss": 0.6448, + "step": 2393 + }, + { + "epoch": 0.9064748201438849, + "grad_norm": 11.212878227233887, + "learning_rate": 4.350659335665275e-06, + "loss": 0.2639, + "step": 2394 + }, + { + "epoch": 0.9068534645967437, + "grad_norm": 7.262661933898926, + "learning_rate": 4.315757735364712e-06, + "loss": 0.3209, + "step": 2395 + }, + { + "epoch": 0.9072321090496024, + "grad_norm": 8.17447566986084, + "learning_rate": 4.280993604475636e-06, + "loss": 0.3456, + "step": 2396 + }, + { + "epoch": 0.9076107535024612, + "grad_norm": 7.121233940124512, + "learning_rate": 4.246366992943662e-06, + "loss": 0.2374, + "step": 2397 + }, + { + "epoch": 0.90798939795532, + "grad_norm": 14.229004859924316, + "learning_rate": 4.211877950516763e-06, + "loss": 0.2939, + "step": 2398 + }, + { + "epoch": 0.9083680424081787, + "grad_norm": 55.41293716430664, + "learning_rate": 4.177526526745301e-06, + "loss": 0.5769, + "step": 2399 + }, + { + "epoch": 0.9087466868610374, + "grad_norm": 10.350135803222656, + "learning_rate": 4.143312770981911e-06, + "loss": 0.3735, + "step": 2400 + }, + { + "epoch": 0.9091253313138963, + "grad_norm": 9.91360092163086, + "learning_rate": 4.109236732381461e-06, + "loss": 1.6225, + "step": 2401 + }, + { + "epoch": 0.909503975766755, + "grad_norm": 8.888208389282227, + "learning_rate": 4.075298459900933e-06, + "loss": 1.2812, + "step": 2402 + }, + { + "epoch": 0.9098826202196137, + "grad_norm": 14.481423377990723, + "learning_rate": 4.0414980022994045e-06, + "loss": 1.5647, + "step": 2403 + }, + { + "epoch": 0.9102612646724726, + "grad_norm": 9.676767349243164, + "learning_rate": 4.007835408137928e-06, + "loss": 0.696, + "step": 2404 + }, + { + "epoch": 0.9106399091253313, + "grad_norm": 8.741473197937012, + "learning_rate": 3.974310725779518e-06, + "loss": 0.7849, + "step": 2405 + }, + { + "epoch": 0.9110185535781901, + "grad_norm": 14.236115455627441, + "learning_rate": 3.940924003389046e-06, + "loss": 0.877, + "step": 2406 + }, + { + "epoch": 0.9113971980310488, + "grad_norm": 10.6664400100708, + "learning_rate": 3.907675288933144e-06, + "loss": 0.6007, + "step": 2407 + }, + { + "epoch": 0.9117758424839076, + "grad_norm": 9.50271987915039, + "learning_rate": 3.874564630180188e-06, + "loss": 0.7913, + "step": 2408 + }, + { + "epoch": 0.9121544869367664, + "grad_norm": 11.143498420715332, + "learning_rate": 3.84159207470024e-06, + "loss": 0.7398, + "step": 2409 + }, + { + "epoch": 0.9125331313896251, + "grad_norm": 9.227463722229004, + "learning_rate": 3.808757669864904e-06, + "loss": 0.7372, + "step": 2410 + }, + { + "epoch": 0.912911775842484, + "grad_norm": 12.129105567932129, + "learning_rate": 3.7760614628473357e-06, + "loss": 0.8929, + "step": 2411 + }, + { + "epoch": 0.9132904202953427, + "grad_norm": 10.593716621398926, + "learning_rate": 3.743503500622103e-06, + "loss": 0.7059, + "step": 2412 + }, + { + "epoch": 0.9136690647482014, + "grad_norm": 13.297348976135254, + "learning_rate": 3.711083829965212e-06, + "loss": 0.5883, + "step": 2413 + }, + { + "epoch": 0.9140477092010602, + "grad_norm": 8.455639839172363, + "learning_rate": 3.678802497453948e-06, + "loss": 0.4366, + "step": 2414 + }, + { + "epoch": 0.914426353653919, + "grad_norm": 8.234000205993652, + "learning_rate": 3.6466595494668353e-06, + "loss": 0.363, + "step": 2415 + }, + { + "epoch": 0.9148049981067777, + "grad_norm": 13.38591194152832, + "learning_rate": 3.6146550321836116e-06, + "loss": 0.5627, + "step": 2416 + }, + { + "epoch": 0.9151836425596365, + "grad_norm": 12.83345890045166, + "learning_rate": 3.58278899158514e-06, + "loss": 0.6422, + "step": 2417 + }, + { + "epoch": 0.9155622870124953, + "grad_norm": 13.449295997619629, + "learning_rate": 3.5510614734532876e-06, + "loss": 0.6531, + "step": 2418 + }, + { + "epoch": 0.915940931465354, + "grad_norm": 10.407699584960938, + "learning_rate": 3.519472523370948e-06, + "loss": 0.6168, + "step": 2419 + }, + { + "epoch": 0.9163195759182128, + "grad_norm": 17.99662208557129, + "learning_rate": 3.4880221867219064e-06, + "loss": 0.257, + "step": 2420 + }, + { + "epoch": 0.9166982203710715, + "grad_norm": 7.612576484680176, + "learning_rate": 3.45671050869083e-06, + "loss": 0.2634, + "step": 2421 + }, + { + "epoch": 0.9170768648239304, + "grad_norm": 6.6135358810424805, + "learning_rate": 3.425537534263168e-06, + "loss": 0.1463, + "step": 2422 + }, + { + "epoch": 0.9174555092767891, + "grad_norm": 6.440648078918457, + "learning_rate": 3.394503308225061e-06, + "loss": 0.1608, + "step": 2423 + }, + { + "epoch": 0.9178341537296478, + "grad_norm": 1.7489157915115356, + "learning_rate": 3.363607875163366e-06, + "loss": 0.047, + "step": 2424 + }, + { + "epoch": 0.9182127981825067, + "grad_norm": 30.02290153503418, + "learning_rate": 3.3328512794654652e-06, + "loss": 0.9788, + "step": 2425 + }, + { + "epoch": 0.9185914426353654, + "grad_norm": 11.564863204956055, + "learning_rate": 3.302233565319357e-06, + "loss": 2.0172, + "step": 2426 + }, + { + "epoch": 0.9189700870882241, + "grad_norm": 9.934062957763672, + "learning_rate": 3.2717547767134538e-06, + "loss": 1.326, + "step": 2427 + }, + { + "epoch": 0.9193487315410829, + "grad_norm": 10.045295715332031, + "learning_rate": 3.2414149574365836e-06, + "loss": 0.9745, + "step": 2428 + }, + { + "epoch": 0.9197273759939417, + "grad_norm": 11.001640319824219, + "learning_rate": 3.2112141510779127e-06, + "loss": 1.2666, + "step": 2429 + }, + { + "epoch": 0.9201060204468005, + "grad_norm": 10.749781608581543, + "learning_rate": 3.18115240102691e-06, + "loss": 1.0313, + "step": 2430 + }, + { + "epoch": 0.9204846648996592, + "grad_norm": 11.977824211120605, + "learning_rate": 3.151229750473239e-06, + "loss": 0.8656, + "step": 2431 + }, + { + "epoch": 0.920863309352518, + "grad_norm": 11.430237770080566, + "learning_rate": 3.1214462424067335e-06, + "loss": 1.0509, + "step": 2432 + }, + { + "epoch": 0.9212419538053768, + "grad_norm": 10.933598518371582, + "learning_rate": 3.0918019196173096e-06, + "loss": 0.9562, + "step": 2433 + }, + { + "epoch": 0.9216205982582355, + "grad_norm": 7.044180393218994, + "learning_rate": 3.0622968246949213e-06, + "loss": 0.5424, + "step": 2434 + }, + { + "epoch": 0.9219992427110942, + "grad_norm": 14.46313762664795, + "learning_rate": 3.0329310000295153e-06, + "loss": 0.4784, + "step": 2435 + }, + { + "epoch": 0.9223778871639531, + "grad_norm": 9.44637393951416, + "learning_rate": 3.003704487810899e-06, + "loss": 0.649, + "step": 2436 + }, + { + "epoch": 0.9227565316168118, + "grad_norm": 8.902215003967285, + "learning_rate": 2.9746173300287837e-06, + "loss": 0.523, + "step": 2437 + }, + { + "epoch": 0.9231351760696705, + "grad_norm": 8.846084594726562, + "learning_rate": 2.945669568472631e-06, + "loss": 0.5283, + "step": 2438 + }, + { + "epoch": 0.9235138205225294, + "grad_norm": 9.392375946044922, + "learning_rate": 2.916861244731661e-06, + "loss": 0.5592, + "step": 2439 + }, + { + "epoch": 0.9238924649753881, + "grad_norm": 8.410120964050293, + "learning_rate": 2.888192400194745e-06, + "loss": 0.4376, + "step": 2440 + }, + { + "epoch": 0.9242711094282469, + "grad_norm": 9.047958374023438, + "learning_rate": 2.8596630760503673e-06, + "loss": 0.4578, + "step": 2441 + }, + { + "epoch": 0.9246497538811056, + "grad_norm": 9.126653671264648, + "learning_rate": 2.8312733132865754e-06, + "loss": 0.423, + "step": 2442 + }, + { + "epoch": 0.9250283983339644, + "grad_norm": 6.317657470703125, + "learning_rate": 2.803023152690887e-06, + "loss": 0.2639, + "step": 2443 + }, + { + "epoch": 0.9254070427868232, + "grad_norm": 10.31212043762207, + "learning_rate": 2.7749126348502684e-06, + "loss": 0.3573, + "step": 2444 + }, + { + "epoch": 0.9257856872396819, + "grad_norm": 15.265264511108398, + "learning_rate": 2.7469418001510704e-06, + "loss": 0.4853, + "step": 2445 + }, + { + "epoch": 0.9261643316925408, + "grad_norm": 7.364819526672363, + "learning_rate": 2.7191106887789473e-06, + "loss": 0.3669, + "step": 2446 + }, + { + "epoch": 0.9265429761453995, + "grad_norm": 9.149045944213867, + "learning_rate": 2.6914193407188146e-06, + "loss": 0.1694, + "step": 2447 + }, + { + "epoch": 0.9269216205982582, + "grad_norm": 6.317681789398193, + "learning_rate": 2.663867795754771e-06, + "loss": 0.34, + "step": 2448 + }, + { + "epoch": 0.927300265051117, + "grad_norm": 6.21943998336792, + "learning_rate": 2.636456093470119e-06, + "loss": 0.272, + "step": 2449 + }, + { + "epoch": 0.9276789095039758, + "grad_norm": 19.55099868774414, + "learning_rate": 2.6091842732472006e-06, + "loss": 0.6493, + "step": 2450 + }, + { + "epoch": 0.9280575539568345, + "grad_norm": 9.162104606628418, + "learning_rate": 2.582052374267385e-06, + "loss": 1.5711, + "step": 2451 + }, + { + "epoch": 0.9284361984096933, + "grad_norm": 9.272793769836426, + "learning_rate": 2.555060435511025e-06, + "loss": 1.2648, + "step": 2452 + }, + { + "epoch": 0.9288148428625521, + "grad_norm": 12.99858283996582, + "learning_rate": 2.5282084957574226e-06, + "loss": 1.4216, + "step": 2453 + }, + { + "epoch": 0.9291934873154108, + "grad_norm": 11.940863609313965, + "learning_rate": 2.5014965935847178e-06, + "loss": 1.0989, + "step": 2454 + }, + { + "epoch": 0.9295721317682696, + "grad_norm": 10.320348739624023, + "learning_rate": 2.4749247673698573e-06, + "loss": 0.743, + "step": 2455 + }, + { + "epoch": 0.9299507762211283, + "grad_norm": 11.852913856506348, + "learning_rate": 2.4484930552885365e-06, + "loss": 0.9047, + "step": 2456 + }, + { + "epoch": 0.9303294206739872, + "grad_norm": 8.977890014648438, + "learning_rate": 2.4222014953151686e-06, + "loss": 0.7247, + "step": 2457 + }, + { + "epoch": 0.9307080651268459, + "grad_norm": 10.382013320922852, + "learning_rate": 2.396050125222793e-06, + "loss": 0.7898, + "step": 2458 + }, + { + "epoch": 0.9310867095797046, + "grad_norm": 11.049487113952637, + "learning_rate": 2.370038982583056e-06, + "loss": 0.8068, + "step": 2459 + }, + { + "epoch": 0.9314653540325635, + "grad_norm": 15.248514175415039, + "learning_rate": 2.344168104766109e-06, + "loss": 0.7894, + "step": 2460 + }, + { + "epoch": 0.9318439984854222, + "grad_norm": 13.173613548278809, + "learning_rate": 2.3184375289406202e-06, + "loss": 1.3524, + "step": 2461 + }, + { + "epoch": 0.9322226429382809, + "grad_norm": 9.832080841064453, + "learning_rate": 2.2928472920736744e-06, + "loss": 0.3662, + "step": 2462 + }, + { + "epoch": 0.9326012873911397, + "grad_norm": 11.172598838806152, + "learning_rate": 2.2673974309307066e-06, + "loss": 0.5176, + "step": 2463 + }, + { + "epoch": 0.9329799318439985, + "grad_norm": 8.801765441894531, + "learning_rate": 2.2420879820755023e-06, + "loss": 0.4289, + "step": 2464 + }, + { + "epoch": 0.9333585762968573, + "grad_norm": 7.546141624450684, + "learning_rate": 2.2169189818701307e-06, + "loss": 0.4665, + "step": 2465 + }, + { + "epoch": 0.933737220749716, + "grad_norm": 11.858968734741211, + "learning_rate": 2.191890466474844e-06, + "loss": 0.5132, + "step": 2466 + }, + { + "epoch": 0.9341158652025748, + "grad_norm": 11.472759246826172, + "learning_rate": 2.1670024718480675e-06, + "loss": 0.3973, + "step": 2467 + }, + { + "epoch": 0.9344945096554336, + "grad_norm": 11.55582332611084, + "learning_rate": 2.1422550337463322e-06, + "loss": 0.5675, + "step": 2468 + }, + { + "epoch": 0.9348731541082923, + "grad_norm": 8.275731086730957, + "learning_rate": 2.117648187724286e-06, + "loss": 0.3216, + "step": 2469 + }, + { + "epoch": 0.935251798561151, + "grad_norm": 17.31087875366211, + "learning_rate": 2.0931819691345277e-06, + "loss": 0.4359, + "step": 2470 + }, + { + "epoch": 0.9356304430140099, + "grad_norm": 6.022243976593018, + "learning_rate": 2.06885641312764e-06, + "loss": 0.1946, + "step": 2471 + }, + { + "epoch": 0.9360090874668686, + "grad_norm": 7.993716239929199, + "learning_rate": 2.0446715546521112e-06, + "loss": 0.2927, + "step": 2472 + }, + { + "epoch": 0.9363877319197274, + "grad_norm": 5.820225238800049, + "learning_rate": 2.0206274284542804e-06, + "loss": 0.1208, + "step": 2473 + }, + { + "epoch": 0.9367663763725862, + "grad_norm": 10.578958511352539, + "learning_rate": 1.9967240690783262e-06, + "loss": 0.2527, + "step": 2474 + }, + { + "epoch": 0.9371450208254449, + "grad_norm": 6.542937755584717, + "learning_rate": 1.972961510866178e-06, + "loss": 0.3776, + "step": 2475 + }, + { + "epoch": 0.9375236652783037, + "grad_norm": 9.507660865783691, + "learning_rate": 1.9493397879574493e-06, + "loss": 1.6838, + "step": 2476 + }, + { + "epoch": 0.9379023097311624, + "grad_norm": 11.040675163269043, + "learning_rate": 1.9258589342894485e-06, + "loss": 1.5473, + "step": 2477 + }, + { + "epoch": 0.9382809541840212, + "grad_norm": 9.63912296295166, + "learning_rate": 1.902518983597068e-06, + "loss": 1.1266, + "step": 2478 + }, + { + "epoch": 0.93865959863688, + "grad_norm": 11.154900550842285, + "learning_rate": 1.879319969412796e-06, + "loss": 1.205, + "step": 2479 + }, + { + "epoch": 0.9390382430897387, + "grad_norm": 11.759103775024414, + "learning_rate": 1.8562619250666047e-06, + "loss": 1.2244, + "step": 2480 + }, + { + "epoch": 0.9394168875425976, + "grad_norm": 10.546182632446289, + "learning_rate": 1.8333448836859723e-06, + "loss": 0.8522, + "step": 2481 + }, + { + "epoch": 0.9397955319954563, + "grad_norm": 11.785140991210938, + "learning_rate": 1.810568878195773e-06, + "loss": 0.8214, + "step": 2482 + }, + { + "epoch": 0.940174176448315, + "grad_norm": 13.62626838684082, + "learning_rate": 1.787933941318265e-06, + "loss": 0.9527, + "step": 2483 + }, + { + "epoch": 0.9405528209011738, + "grad_norm": 12.887011528015137, + "learning_rate": 1.7654401055730129e-06, + "loss": 1.0685, + "step": 2484 + }, + { + "epoch": 0.9409314653540326, + "grad_norm": 9.904094696044922, + "learning_rate": 1.7430874032768885e-06, + "loss": 0.7109, + "step": 2485 + }, + { + "epoch": 0.9413101098068913, + "grad_norm": 7.248521327972412, + "learning_rate": 1.7208758665439917e-06, + "loss": 0.3924, + "step": 2486 + }, + { + "epoch": 0.9416887542597501, + "grad_norm": 9.920915603637695, + "learning_rate": 1.6988055272855962e-06, + "loss": 0.3695, + "step": 2487 + }, + { + "epoch": 0.9420673987126088, + "grad_norm": 8.971296310424805, + "learning_rate": 1.676876417210127e-06, + "loss": 0.3631, + "step": 2488 + }, + { + "epoch": 0.9424460431654677, + "grad_norm": 15.373361587524414, + "learning_rate": 1.6550885678231042e-06, + "loss": 0.9075, + "step": 2489 + }, + { + "epoch": 0.9428246876183264, + "grad_norm": 10.004647254943848, + "learning_rate": 1.6334420104271109e-06, + "loss": 0.6522, + "step": 2490 + }, + { + "epoch": 0.9432033320711851, + "grad_norm": 10.934609413146973, + "learning_rate": 1.6119367761217142e-06, + "loss": 0.4777, + "step": 2491 + }, + { + "epoch": 0.943581976524044, + "grad_norm": 8.394415855407715, + "learning_rate": 1.590572895803455e-06, + "loss": 0.2631, + "step": 2492 + }, + { + "epoch": 0.9439606209769027, + "grad_norm": 11.458495140075684, + "learning_rate": 1.569350400165781e-06, + "loss": 0.62, + "step": 2493 + }, + { + "epoch": 0.9443392654297614, + "grad_norm": 15.056721687316895, + "learning_rate": 1.548269319699036e-06, + "loss": 0.4027, + "step": 2494 + }, + { + "epoch": 0.9447179098826202, + "grad_norm": 12.485556602478027, + "learning_rate": 1.5273296846903707e-06, + "loss": 0.3424, + "step": 2495 + }, + { + "epoch": 0.945096554335479, + "grad_norm": 10.782124519348145, + "learning_rate": 1.50653152522372e-06, + "loss": 0.4142, + "step": 2496 + }, + { + "epoch": 0.9454751987883377, + "grad_norm": 7.636911869049072, + "learning_rate": 1.4858748711797822e-06, + "loss": 0.2571, + "step": 2497 + }, + { + "epoch": 0.9458538432411965, + "grad_norm": 7.60603666305542, + "learning_rate": 1.4653597522359396e-06, + "loss": 0.2453, + "step": 2498 + }, + { + "epoch": 0.9462324876940553, + "grad_norm": 10.03763198852539, + "learning_rate": 1.444986197866227e-06, + "loss": 0.3472, + "step": 2499 + }, + { + "epoch": 0.9466111321469141, + "grad_norm": 3.0592634677886963, + "learning_rate": 1.424754237341297e-06, + "loss": 0.0667, + "step": 2500 + }, + { + "epoch": 0.9469897765997728, + "grad_norm": 10.650063514709473, + "learning_rate": 1.4046638997283978e-06, + "loss": 2.1954, + "step": 2501 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 8.665024757385254, + "learning_rate": 1.3847152138912744e-06, + "loss": 1.0272, + "step": 2502 + }, + { + "epoch": 0.9477470655054904, + "grad_norm": 9.950023651123047, + "learning_rate": 1.3649082084901676e-06, + "loss": 1.3072, + "step": 2503 + }, + { + "epoch": 0.9481257099583491, + "grad_norm": 9.413839340209961, + "learning_rate": 1.345242911981781e-06, + "loss": 0.8727, + "step": 2504 + }, + { + "epoch": 0.9485043544112078, + "grad_norm": 10.411212921142578, + "learning_rate": 1.3257193526192257e-06, + "loss": 0.6985, + "step": 2505 + }, + { + "epoch": 0.9488829988640667, + "grad_norm": 12.88664436340332, + "learning_rate": 1.3063375584519532e-06, + "loss": 0.6111, + "step": 2506 + }, + { + "epoch": 0.9492616433169254, + "grad_norm": 9.880784034729004, + "learning_rate": 1.2870975573257783e-06, + "loss": 0.6718, + "step": 2507 + }, + { + "epoch": 0.9496402877697842, + "grad_norm": 9.987343788146973, + "learning_rate": 1.267999376882767e-06, + "loss": 0.6956, + "step": 2508 + }, + { + "epoch": 0.9500189322226429, + "grad_norm": 10.966734886169434, + "learning_rate": 1.2490430445612488e-06, + "loss": 0.7292, + "step": 2509 + }, + { + "epoch": 0.9503975766755017, + "grad_norm": 13.379451751708984, + "learning_rate": 1.230228587595772e-06, + "loss": 0.527, + "step": 2510 + }, + { + "epoch": 0.9507762211283605, + "grad_norm": 11.814391136169434, + "learning_rate": 1.2115560330170362e-06, + "loss": 0.6957, + "step": 2511 + }, + { + "epoch": 0.9511548655812192, + "grad_norm": 9.619377136230469, + "learning_rate": 1.1930254076518488e-06, + "loss": 0.753, + "step": 2512 + }, + { + "epoch": 0.951533510034078, + "grad_norm": 10.99007797241211, + "learning_rate": 1.1746367381231582e-06, + "loss": 0.463, + "step": 2513 + }, + { + "epoch": 0.9519121544869368, + "grad_norm": 13.722533226013184, + "learning_rate": 1.1563900508499425e-06, + "loss": 0.6937, + "step": 2514 + }, + { + "epoch": 0.9522907989397955, + "grad_norm": 11.671647071838379, + "learning_rate": 1.1382853720471764e-06, + "loss": 0.4535, + "step": 2515 + }, + { + "epoch": 0.9526694433926542, + "grad_norm": 7.917880535125732, + "learning_rate": 1.1203227277258198e-06, + "loss": 0.4509, + "step": 2516 + }, + { + "epoch": 0.9530480878455131, + "grad_norm": 12.8670072555542, + "learning_rate": 1.1025021436927962e-06, + "loss": 0.5246, + "step": 2517 + }, + { + "epoch": 0.9534267322983718, + "grad_norm": 13.827978134155273, + "learning_rate": 1.0848236455509031e-06, + "loss": 0.5806, + "step": 2518 + }, + { + "epoch": 0.9538053767512306, + "grad_norm": 12.70480728149414, + "learning_rate": 1.0672872586988237e-06, + "loss": 0.4097, + "step": 2519 + }, + { + "epoch": 0.9541840212040894, + "grad_norm": 10.580239295959473, + "learning_rate": 1.0498930083310376e-06, + "loss": 0.4366, + "step": 2520 + }, + { + "epoch": 0.9545626656569481, + "grad_norm": 8.138960838317871, + "learning_rate": 1.032640919437844e-06, + "loss": 0.2703, + "step": 2521 + }, + { + "epoch": 0.9549413101098069, + "grad_norm": 6.230162620544434, + "learning_rate": 1.0155310168053156e-06, + "loss": 0.2412, + "step": 2522 + }, + { + "epoch": 0.9553199545626656, + "grad_norm": 17.800689697265625, + "learning_rate": 9.985633250152116e-07, + "loss": 0.4268, + "step": 2523 + }, + { + "epoch": 0.9556985990155245, + "grad_norm": 29.00356674194336, + "learning_rate": 9.817378684449763e-07, + "loss": 0.2153, + "step": 2524 + }, + { + "epoch": 0.9560772434683832, + "grad_norm": 14.519379615783691, + "learning_rate": 9.6505467126774e-07, + "loss": 0.1794, + "step": 2525 + }, + { + "epoch": 0.9564558879212419, + "grad_norm": 9.758588790893555, + "learning_rate": 9.485137574522185e-07, + "loss": 1.728, + "step": 2526 + }, + { + "epoch": 0.9568345323741008, + "grad_norm": 10.424653053283691, + "learning_rate": 9.321151507627135e-07, + "loss": 1.3501, + "step": 2527 + }, + { + "epoch": 0.9572131768269595, + "grad_norm": 10.270801544189453, + "learning_rate": 9.158588747590902e-07, + "loss": 1.3, + "step": 2528 + }, + { + "epoch": 0.9575918212798182, + "grad_norm": 10.49842643737793, + "learning_rate": 8.997449527966994e-07, + "loss": 0.8599, + "step": 2529 + }, + { + "epoch": 0.957970465732677, + "grad_norm": 8.8501615524292, + "learning_rate": 8.837734080264116e-07, + "loss": 0.7401, + "step": 2530 + }, + { + "epoch": 0.9583491101855358, + "grad_norm": 10.654547691345215, + "learning_rate": 8.679442633945156e-07, + "loss": 0.8812, + "step": 2531 + }, + { + "epoch": 0.9587277546383945, + "grad_norm": 10.941558837890625, + "learning_rate": 8.522575416426981e-07, + "loss": 1.1451, + "step": 2532 + }, + { + "epoch": 0.9591063990912533, + "grad_norm": 11.180508613586426, + "learning_rate": 8.367132653080867e-07, + "loss": 0.774, + "step": 2533 + }, + { + "epoch": 0.9594850435441121, + "grad_norm": 11.109345436096191, + "learning_rate": 8.213114567230951e-07, + "loss": 0.5891, + "step": 2534 + }, + { + "epoch": 0.9598636879969709, + "grad_norm": 10.302473068237305, + "learning_rate": 8.060521380154784e-07, + "loss": 0.6217, + "step": 2535 + }, + { + "epoch": 0.9602423324498296, + "grad_norm": 10.956271171569824, + "learning_rate": 7.90935331108289e-07, + "loss": 0.6269, + "step": 2536 + }, + { + "epoch": 0.9606209769026883, + "grad_norm": 11.260363578796387, + "learning_rate": 7.759610577198206e-07, + "loss": 0.5793, + "step": 2537 + }, + { + "epoch": 0.9609996213555472, + "grad_norm": 14.293161392211914, + "learning_rate": 7.611293393635755e-07, + "loss": 0.8875, + "step": 2538 + }, + { + "epoch": 0.9613782658084059, + "grad_norm": 11.707453727722168, + "learning_rate": 7.46440197348286e-07, + "loss": 0.5792, + "step": 2539 + }, + { + "epoch": 0.9617569102612646, + "grad_norm": 10.193665504455566, + "learning_rate": 7.318936527777931e-07, + "loss": 0.4881, + "step": 2540 + }, + { + "epoch": 0.9621355547141235, + "grad_norm": 9.130789756774902, + "learning_rate": 7.174897265511238e-07, + "loss": 0.4617, + "step": 2541 + }, + { + "epoch": 0.9625141991669822, + "grad_norm": 7.929288864135742, + "learning_rate": 7.032284393623579e-07, + "loss": 0.3009, + "step": 2542 + }, + { + "epoch": 0.962892843619841, + "grad_norm": 6.23085355758667, + "learning_rate": 6.891098117006833e-07, + "loss": 0.2789, + "step": 2543 + }, + { + "epoch": 0.9632714880726997, + "grad_norm": 16.519512176513672, + "learning_rate": 6.751338638502858e-07, + "loss": 0.558, + "step": 2544 + }, + { + "epoch": 0.9636501325255585, + "grad_norm": 9.027918815612793, + "learning_rate": 6.613006158904145e-07, + "loss": 0.3583, + "step": 2545 + }, + { + "epoch": 0.9640287769784173, + "grad_norm": 9.856430053710938, + "learning_rate": 6.476100876952718e-07, + "loss": 0.2975, + "step": 2546 + }, + { + "epoch": 0.964407421431276, + "grad_norm": 10.270342826843262, + "learning_rate": 6.340622989340128e-07, + "loss": 0.4139, + "step": 2547 + }, + { + "epoch": 0.9647860658841348, + "grad_norm": 10.692875862121582, + "learning_rate": 6.206572690707125e-07, + "loss": 0.2542, + "step": 2548 + }, + { + "epoch": 0.9651647103369936, + "grad_norm": 36.20051574707031, + "learning_rate": 6.073950173643873e-07, + "loss": 0.6115, + "step": 2549 + }, + { + "epoch": 0.9655433547898523, + "grad_norm": 26.375398635864258, + "learning_rate": 5.942755628688845e-07, + "loss": 0.4395, + "step": 2550 + }, + { + "epoch": 0.965921999242711, + "grad_norm": 10.20322322845459, + "learning_rate": 5.812989244328937e-07, + "loss": 1.5327, + "step": 2551 + }, + { + "epoch": 0.9663006436955699, + "grad_norm": 10.076367378234863, + "learning_rate": 5.684651206999347e-07, + "loss": 1.2404, + "step": 2552 + }, + { + "epoch": 0.9666792881484286, + "grad_norm": 9.947564125061035, + "learning_rate": 5.557741701083363e-07, + "loss": 0.924, + "step": 2553 + }, + { + "epoch": 0.9670579326012874, + "grad_norm": 10.444324493408203, + "learning_rate": 5.432260908911358e-07, + "loss": 1.1889, + "step": 2554 + }, + { + "epoch": 0.9674365770541462, + "grad_norm": 9.716720581054688, + "learning_rate": 5.308209010761678e-07, + "loss": 0.7316, + "step": 2555 + }, + { + "epoch": 0.9678152215070049, + "grad_norm": 14.292815208435059, + "learning_rate": 5.185586184859426e-07, + "loss": 0.8652, + "step": 2556 + }, + { + "epoch": 0.9681938659598637, + "grad_norm": 12.14730453491211, + "learning_rate": 5.064392607376567e-07, + "loss": 0.7933, + "step": 2557 + }, + { + "epoch": 0.9685725104127224, + "grad_norm": 9.1866455078125, + "learning_rate": 4.94462845243171e-07, + "loss": 0.501, + "step": 2558 + }, + { + "epoch": 0.9689511548655813, + "grad_norm": 12.76934814453125, + "learning_rate": 4.826293892089995e-07, + "loss": 0.6006, + "step": 2559 + }, + { + "epoch": 0.96932979931844, + "grad_norm": 13.298689842224121, + "learning_rate": 4.709389096362427e-07, + "loss": 0.654, + "step": 2560 + }, + { + "epoch": 0.9697084437712987, + "grad_norm": 12.402892112731934, + "learning_rate": 4.593914233205987e-07, + "loss": 1.0524, + "step": 2561 + }, + { + "epoch": 0.9700870882241576, + "grad_norm": 11.499163627624512, + "learning_rate": 4.4798694685231903e-07, + "loss": 0.5996, + "step": 2562 + }, + { + "epoch": 0.9704657326770163, + "grad_norm": 8.201879501342773, + "learning_rate": 4.367254966161971e-07, + "loss": 0.5142, + "step": 2563 + }, + { + "epoch": 0.970844377129875, + "grad_norm": 14.199493408203125, + "learning_rate": 4.2560708879154645e-07, + "loss": 0.5288, + "step": 2564 + }, + { + "epoch": 0.9712230215827338, + "grad_norm": 9.493123054504395, + "learning_rate": 4.1463173935216703e-07, + "loss": 0.4889, + "step": 2565 + }, + { + "epoch": 0.9716016660355926, + "grad_norm": 11.741094589233398, + "learning_rate": 4.037994640663345e-07, + "loss": 0.4841, + "step": 2566 + }, + { + "epoch": 0.9719803104884513, + "grad_norm": 10.63720417022705, + "learning_rate": 3.9311027849674444e-07, + "loss": 0.4452, + "step": 2567 + }, + { + "epoch": 0.9723589549413101, + "grad_norm": 10.38010025024414, + "learning_rate": 3.8256419800055675e-07, + "loss": 0.4737, + "step": 2568 + }, + { + "epoch": 0.9727375993941689, + "grad_norm": 12.463581085205078, + "learning_rate": 3.721612377292849e-07, + "loss": 0.2774, + "step": 2569 + }, + { + "epoch": 0.9731162438470277, + "grad_norm": 9.494747161865234, + "learning_rate": 3.6190141262887333e-07, + "loss": 0.2915, + "step": 2570 + }, + { + "epoch": 0.9734948882998864, + "grad_norm": 11.594443321228027, + "learning_rate": 3.517847374395755e-07, + "loss": 0.2961, + "step": 2571 + }, + { + "epoch": 0.9738735327527451, + "grad_norm": 7.852182388305664, + "learning_rate": 3.418112266960205e-07, + "loss": 0.2635, + "step": 2572 + }, + { + "epoch": 0.974252177205604, + "grad_norm": 4.673264026641846, + "learning_rate": 3.319808947271241e-07, + "loss": 0.0947, + "step": 2573 + }, + { + "epoch": 0.9746308216584627, + "grad_norm": 5.447509288787842, + "learning_rate": 3.222937556561223e-07, + "loss": 0.0963, + "step": 2574 + }, + { + "epoch": 0.9750094661113214, + "grad_norm": 4.423956394195557, + "learning_rate": 3.127498234005044e-07, + "loss": 0.1488, + "step": 2575 + }, + { + "epoch": 0.9753881105641803, + "grad_norm": 11.625571250915527, + "learning_rate": 3.033491116720244e-07, + "loss": 2.4858, + "step": 2576 + }, + { + "epoch": 0.975766755017039, + "grad_norm": 10.204331398010254, + "learning_rate": 2.940916339766675e-07, + "loss": 1.637, + "step": 2577 + }, + { + "epoch": 0.9761453994698978, + "grad_norm": 10.439549446105957, + "learning_rate": 2.849774036146502e-07, + "loss": 1.3357, + "step": 2578 + }, + { + "epoch": 0.9765240439227565, + "grad_norm": 10.696296691894531, + "learning_rate": 2.7600643368036473e-07, + "loss": 1.1023, + "step": 2579 + }, + { + "epoch": 0.9769026883756153, + "grad_norm": 12.158334732055664, + "learning_rate": 2.6717873706240125e-07, + "loss": 1.1039, + "step": 2580 + }, + { + "epoch": 0.9772813328284741, + "grad_norm": 11.918547630310059, + "learning_rate": 2.5849432644348136e-07, + "loss": 0.8639, + "step": 2581 + }, + { + "epoch": 0.9776599772813328, + "grad_norm": 9.715611457824707, + "learning_rate": 2.4995321430050235e-07, + "loss": 0.567, + "step": 2582 + }, + { + "epoch": 0.9780386217341916, + "grad_norm": 11.445382118225098, + "learning_rate": 2.415554129044595e-07, + "loss": 0.9997, + "step": 2583 + }, + { + "epoch": 0.9784172661870504, + "grad_norm": 10.35413932800293, + "learning_rate": 2.333009343204573e-07, + "loss": 0.7591, + "step": 2584 + }, + { + "epoch": 0.9787959106399091, + "grad_norm": 7.9862165451049805, + "learning_rate": 2.2518979040769827e-07, + "loss": 0.3323, + "step": 2585 + }, + { + "epoch": 0.9791745550927679, + "grad_norm": 12.614017486572266, + "learning_rate": 2.1722199281944967e-07, + "loss": 0.9701, + "step": 2586 + }, + { + "epoch": 0.9795531995456267, + "grad_norm": 11.604450225830078, + "learning_rate": 2.0939755300304342e-07, + "loss": 0.8196, + "step": 2587 + }, + { + "epoch": 0.9799318439984854, + "grad_norm": 12.22137451171875, + "learning_rate": 2.0171648219982074e-07, + "loss": 0.4567, + "step": 2588 + }, + { + "epoch": 0.9803104884513442, + "grad_norm": 11.08056926727295, + "learning_rate": 1.941787914451876e-07, + "loss": 0.7081, + "step": 2589 + }, + { + "epoch": 0.9806891329042029, + "grad_norm": 8.053107261657715, + "learning_rate": 1.8678449156852573e-07, + "loss": 0.5096, + "step": 2590 + }, + { + "epoch": 0.9810677773570617, + "grad_norm": 10.370348930358887, + "learning_rate": 1.7953359319320406e-07, + "loss": 0.5069, + "step": 2591 + }, + { + "epoch": 0.9814464218099205, + "grad_norm": 8.252673149108887, + "learning_rate": 1.7242610673658954e-07, + "loss": 0.2886, + "step": 2592 + }, + { + "epoch": 0.9818250662627792, + "grad_norm": 14.678871154785156, + "learning_rate": 1.6546204240999174e-07, + "loss": 0.3913, + "step": 2593 + }, + { + "epoch": 0.9822037107156381, + "grad_norm": 12.055886268615723, + "learning_rate": 1.5864141021868506e-07, + "loss": 0.4829, + "step": 2594 + }, + { + "epoch": 0.9825823551684968, + "grad_norm": 7.955121040344238, + "learning_rate": 1.5196421996184207e-07, + "loss": 0.3313, + "step": 2595 + }, + { + "epoch": 0.9829609996213555, + "grad_norm": 6.439930438995361, + "learning_rate": 1.4543048123257796e-07, + "loss": 0.1825, + "step": 2596 + }, + { + "epoch": 0.9833396440742143, + "grad_norm": 5.3940582275390625, + "learning_rate": 1.3904020341791724e-07, + "loss": 0.1774, + "step": 2597 + }, + { + "epoch": 0.9837182885270731, + "grad_norm": 23.00211524963379, + "learning_rate": 1.3279339569874926e-07, + "loss": 0.2979, + "step": 2598 + }, + { + "epoch": 0.9840969329799318, + "grad_norm": 10.200654983520508, + "learning_rate": 1.2669006704986164e-07, + "loss": 0.3489, + "step": 2599 + }, + { + "epoch": 0.9844755774327906, + "grad_norm": 12.444687843322754, + "learning_rate": 1.2073022623988462e-07, + "loss": 0.4883, + "step": 2600 + }, + { + "epoch": 0.9848542218856494, + "grad_norm": 8.539031982421875, + "learning_rate": 1.1491388183133556e-07, + "loss": 1.2458, + "step": 2601 + }, + { + "epoch": 0.9852328663385082, + "grad_norm": 12.276918411254883, + "learning_rate": 1.092410421805301e-07, + "loss": 1.1199, + "step": 2602 + }, + { + "epoch": 0.9856115107913669, + "grad_norm": 9.444790840148926, + "learning_rate": 1.0371171543763769e-07, + "loss": 1.1647, + "step": 2603 + }, + { + "epoch": 0.9859901552442256, + "grad_norm": 9.76896858215332, + "learning_rate": 9.832590954662602e-08, + "loss": 0.8912, + "step": 2604 + }, + { + "epoch": 0.9863687996970845, + "grad_norm": 9.74208927154541, + "learning_rate": 9.308363224528327e-08, + "loss": 0.8606, + "step": 2605 + }, + { + "epoch": 0.9867474441499432, + "grad_norm": 8.999712944030762, + "learning_rate": 8.798489106517371e-08, + "loss": 0.6109, + "step": 2606 + }, + { + "epoch": 0.9871260886028019, + "grad_norm": 7.842771530151367, + "learning_rate": 8.302969333165989e-08, + "loss": 0.5287, + "step": 2607 + }, + { + "epoch": 0.9875047330556608, + "grad_norm": 11.93226432800293, + "learning_rate": 7.821804616384709e-08, + "loss": 0.5248, + "step": 2608 + }, + { + "epoch": 0.9878833775085195, + "grad_norm": 14.00164794921875, + "learning_rate": 7.354995647465002e-08, + "loss": 0.996, + "step": 2609 + }, + { + "epoch": 0.9882620219613782, + "grad_norm": 9.637811660766602, + "learning_rate": 6.90254309706928e-08, + "loss": 0.5948, + "step": 2610 + }, + { + "epoch": 0.988640666414237, + "grad_norm": 10.103828430175781, + "learning_rate": 6.464447615235347e-08, + "loss": 0.667, + "step": 2611 + }, + { + "epoch": 0.9890193108670958, + "grad_norm": 8.21738338470459, + "learning_rate": 6.04070983137417e-08, + "loss": 0.6523, + "step": 2612 + }, + { + "epoch": 0.9893979553199546, + "grad_norm": 10.094844818115234, + "learning_rate": 5.631330354269882e-08, + "loss": 0.57, + "step": 2613 + }, + { + "epoch": 0.9897765997728133, + "grad_norm": 9.283138275146484, + "learning_rate": 5.236309772077563e-08, + "loss": 0.5518, + "step": 2614 + }, + { + "epoch": 0.9901552442256721, + "grad_norm": 8.835543632507324, + "learning_rate": 4.855648652321021e-08, + "loss": 0.3256, + "step": 2615 + }, + { + "epoch": 0.9905338886785309, + "grad_norm": 6.809988498687744, + "learning_rate": 4.4893475418983365e-08, + "loss": 0.1993, + "step": 2616 + }, + { + "epoch": 0.9909125331313896, + "grad_norm": 7.190274238586426, + "learning_rate": 4.137406967070767e-08, + "loss": 0.3838, + "step": 2617 + }, + { + "epoch": 0.9912911775842483, + "grad_norm": 13.799723625183105, + "learning_rate": 3.799827433472736e-08, + "loss": 0.5659, + "step": 2618 + }, + { + "epoch": 0.9916698220371072, + "grad_norm": 6.342631816864014, + "learning_rate": 3.47660942610295e-08, + "loss": 0.227, + "step": 2619 + }, + { + "epoch": 0.9920484664899659, + "grad_norm": 7.028225421905518, + "learning_rate": 3.1677534093299545e-08, + "loss": 0.2465, + "step": 2620 + }, + { + "epoch": 0.9924271109428247, + "grad_norm": 9.69013786315918, + "learning_rate": 2.873259826885466e-08, + "loss": 0.3781, + "step": 2621 + }, + { + "epoch": 0.9928057553956835, + "grad_norm": 9.710000038146973, + "learning_rate": 2.5931291018677086e-08, + "loss": 0.2296, + "step": 2622 + }, + { + "epoch": 0.9931843998485422, + "grad_norm": 11.306065559387207, + "learning_rate": 2.3273616367414097e-08, + "loss": 0.2997, + "step": 2623 + }, + { + "epoch": 0.993563044301401, + "grad_norm": 6.006664276123047, + "learning_rate": 2.0759578133333623e-08, + "loss": 0.1308, + "step": 2624 + }, + { + "epoch": 0.9939416887542597, + "grad_norm": 6.388942241668701, + "learning_rate": 1.8389179928357538e-08, + "loss": 0.1969, + "step": 2625 + }, + { + "epoch": 0.9943203332071185, + "grad_norm": 11.715986251831055, + "learning_rate": 1.616242515802835e-08, + "loss": 1.8615, + "step": 2626 + }, + { + "epoch": 0.9946989776599773, + "grad_norm": 9.530550003051758, + "learning_rate": 1.4079317021520321e-08, + "loss": 1.3649, + "step": 2627 + }, + { + "epoch": 0.995077622112836, + "grad_norm": 11.392560958862305, + "learning_rate": 1.2139858511628356e-08, + "loss": 0.9551, + "step": 2628 + }, + { + "epoch": 0.9954562665656949, + "grad_norm": 11.880497932434082, + "learning_rate": 1.0344052414779094e-08, + "loss": 0.8825, + "step": 2629 + }, + { + "epoch": 0.9958349110185536, + "grad_norm": 8.874764442443848, + "learning_rate": 8.691901310997619e-09, + "loss": 0.5746, + "step": 2630 + }, + { + "epoch": 0.9962135554714123, + "grad_norm": 10.284222602844238, + "learning_rate": 7.1834075739296566e-09, + "loss": 0.7843, + "step": 2631 + }, + { + "epoch": 0.9965921999242711, + "grad_norm": 10.232461929321289, + "learning_rate": 5.818573370830471e-09, + "loss": 0.7046, + "step": 2632 + }, + { + "epoch": 0.9969708443771299, + "grad_norm": 11.678064346313477, + "learning_rate": 4.597400662553764e-09, + "loss": 0.4164, + "step": 2633 + }, + { + "epoch": 0.9973494888299886, + "grad_norm": 9.27035903930664, + "learning_rate": 3.5198912035516727e-09, + "loss": 0.3791, + "step": 2634 + }, + { + "epoch": 0.9977281332828474, + "grad_norm": 12.330588340759277, + "learning_rate": 2.586046541874776e-09, + "loss": 0.5568, + "step": 2635 + }, + { + "epoch": 0.9981067777357062, + "grad_norm": 11.334096908569336, + "learning_rate": 1.7958680191942911e-09, + "loss": 0.5329, + "step": 2636 + }, + { + "epoch": 0.998485422188565, + "grad_norm": 12.605074882507324, + "learning_rate": 1.149356770746568e-09, + "loss": 0.4182, + "step": 2637 + }, + { + "epoch": 0.9988640666414237, + "grad_norm": 7.650578022003174, + "learning_rate": 6.465137253663934e-10, + "loss": 0.2754, + "step": 2638 + }, + { + "epoch": 0.9992427110942824, + "grad_norm": 8.941444396972656, + "learning_rate": 2.873396055091959e-10, + "loss": 0.3273, + "step": 2639 + }, + { + "epoch": 0.9996213555471413, + "grad_norm": 10.42584228515625, + "learning_rate": 7.183492717333096e-11, + "loss": 0.1813, + "step": 2640 + }, + { + "epoch": 1.0, + "grad_norm": 7.037883281707764, + "learning_rate": 0.0, + "loss": 0.2703, + "step": 2641 } ], "logging_steps": 1, @@ -13934,12 +18540,12 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": false + "should_training_stop": true }, "attributes": {} } }, - "total_flos": 1.1154459142461063e+19, + "total_flos": 1.485573706265238e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null