{ "best_metric": 1.3534409999847412, "best_model_checkpoint": "miner_id_24/checkpoint-300", "epoch": 1.9429265330904675, "eval_steps": 25, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0064764217769682254, "grad_norm": 23.97552490234375, "learning_rate": 5.555555555555556e-06, "loss": 100.1855, "step": 1 }, { "epoch": 0.0064764217769682254, "eval_loss": 1.6956336498260498, "eval_runtime": 1.1522, "eval_samples_per_second": 43.396, "eval_steps_per_second": 11.283, "step": 1 }, { "epoch": 0.012952843553936451, "grad_norm": 26.440359115600586, "learning_rate": 1.1111111111111112e-05, "loss": 105.4449, "step": 2 }, { "epoch": 0.019429265330904676, "grad_norm": 25.485361099243164, "learning_rate": 1.6666666666666667e-05, "loss": 108.4078, "step": 3 }, { "epoch": 0.025905687107872902, "grad_norm": 28.422931671142578, "learning_rate": 2.2222222222222223e-05, "loss": 106.2226, "step": 4 }, { "epoch": 0.032382108884841124, "grad_norm": 30.359506607055664, "learning_rate": 2.777777777777778e-05, "loss": 103.6669, "step": 5 }, { "epoch": 0.03885853066180935, "grad_norm": 30.204206466674805, "learning_rate": 3.3333333333333335e-05, "loss": 104.4979, "step": 6 }, { "epoch": 0.045334952438777575, "grad_norm": 27.09565544128418, "learning_rate": 3.888888888888889e-05, "loss": 105.1719, "step": 7 }, { "epoch": 0.051811374215745803, "grad_norm": 25.235811233520508, "learning_rate": 4.4444444444444447e-05, "loss": 103.7075, "step": 8 }, { "epoch": 0.058287795992714025, "grad_norm": 24.163663864135742, "learning_rate": 5e-05, "loss": 104.425, "step": 9 }, { "epoch": 0.06476421776968225, "grad_norm": 23.483850479125977, "learning_rate": 5.555555555555556e-05, "loss": 103.3892, "step": 10 }, { "epoch": 0.07124063954665047, "grad_norm": 23.89116668701172, "learning_rate": 6.111111111111112e-05, "loss": 101.6393, "step": 11 }, { "epoch": 0.0777170613236187, "grad_norm": 25.195266723632812, "learning_rate": 6.666666666666667e-05, "loss": 101.5441, "step": 12 }, { "epoch": 0.08419348310058693, "grad_norm": 22.76078224182129, "learning_rate": 7.222222222222222e-05, "loss": 95.5516, "step": 13 }, { "epoch": 0.09066990487755515, "grad_norm": 11.53641128540039, "learning_rate": 7.777777777777778e-05, "loss": 94.9058, "step": 14 }, { "epoch": 0.09714632665452337, "grad_norm": 11.36025333404541, "learning_rate": 8.333333333333334e-05, "loss": 95.5154, "step": 15 }, { "epoch": 0.10362274843149161, "grad_norm": 12.996624946594238, "learning_rate": 8.888888888888889e-05, "loss": 95.8656, "step": 16 }, { "epoch": 0.11009917020845983, "grad_norm": 13.566268920898438, "learning_rate": 9.444444444444444e-05, "loss": 90.6193, "step": 17 }, { "epoch": 0.11657559198542805, "grad_norm": 10.979094505310059, "learning_rate": 0.0001, "loss": 89.9395, "step": 18 }, { "epoch": 0.12305201376239627, "grad_norm": 9.502087593078613, "learning_rate": 9.999884773765534e-05, "loss": 89.6716, "step": 19 }, { "epoch": 0.1295284355393645, "grad_norm": 9.084819793701172, "learning_rate": 9.999539100963065e-05, "loss": 90.615, "step": 20 }, { "epoch": 0.13600485731633272, "grad_norm": 9.052833557128906, "learning_rate": 9.998962999295068e-05, "loss": 90.5139, "step": 21 }, { "epoch": 0.14248127909330094, "grad_norm": 8.019015312194824, "learning_rate": 9.998156498264669e-05, "loss": 92.0318, "step": 22 }, { "epoch": 0.1489577008702692, "grad_norm": 8.435918807983398, "learning_rate": 9.997119639174122e-05, "loss": 90.2595, "step": 23 }, { "epoch": 0.1554341226472374, "grad_norm": 8.477025985717773, "learning_rate": 9.995852475122702e-05, "loss": 94.2843, "step": 24 }, { "epoch": 0.16191054442420563, "grad_norm": 11.691642761230469, "learning_rate": 9.994355071003984e-05, "loss": 91.224, "step": 25 }, { "epoch": 0.16191054442420563, "eval_loss": 1.406772255897522, "eval_runtime": 1.1519, "eval_samples_per_second": 43.408, "eval_steps_per_second": 11.286, "step": 25 }, { "epoch": 0.16838696620117385, "grad_norm": 19.560291290283203, "learning_rate": 9.992627503502517e-05, "loss": 89.9376, "step": 26 }, { "epoch": 0.17486338797814208, "grad_norm": 15.648947715759277, "learning_rate": 9.990669861089904e-05, "loss": 93.7231, "step": 27 }, { "epoch": 0.1813398097551103, "grad_norm": 9.684481620788574, "learning_rate": 9.988482244020256e-05, "loss": 92.8276, "step": 28 }, { "epoch": 0.18781623153207852, "grad_norm": 7.268749713897705, "learning_rate": 9.986064764325079e-05, "loss": 91.284, "step": 29 }, { "epoch": 0.19429265330904674, "grad_norm": 10.009303092956543, "learning_rate": 9.983417545807521e-05, "loss": 89.6324, "step": 30 }, { "epoch": 0.20076907508601496, "grad_norm": 11.520773887634277, "learning_rate": 9.980540724036031e-05, "loss": 87.147, "step": 31 }, { "epoch": 0.20724549686298321, "grad_norm": 10.741743087768555, "learning_rate": 9.977434446337431e-05, "loss": 86.8421, "step": 32 }, { "epoch": 0.21372191863995144, "grad_norm": 9.010642051696777, "learning_rate": 9.974098871789359e-05, "loss": 89.375, "step": 33 }, { "epoch": 0.22019834041691966, "grad_norm": 7.450740337371826, "learning_rate": 9.970534171212117e-05, "loss": 88.9096, "step": 34 }, { "epoch": 0.22667476219388788, "grad_norm": 5.7022552490234375, "learning_rate": 9.966740527159945e-05, "loss": 90.8134, "step": 35 }, { "epoch": 0.2331511839708561, "grad_norm": 6.012399673461914, "learning_rate": 9.962718133911648e-05, "loss": 89.778, "step": 36 }, { "epoch": 0.23962760574782432, "grad_norm": 7.656160354614258, "learning_rate": 9.958467197460662e-05, "loss": 92.8274, "step": 37 }, { "epoch": 0.24610402752479255, "grad_norm": 14.242902755737305, "learning_rate": 9.953987935504497e-05, "loss": 92.6749, "step": 38 }, { "epoch": 0.25258044930176077, "grad_norm": 18.931732177734375, "learning_rate": 9.949280577433593e-05, "loss": 92.5159, "step": 39 }, { "epoch": 0.259056871078729, "grad_norm": 15.716794967651367, "learning_rate": 9.944345364319571e-05, "loss": 91.8517, "step": 40 }, { "epoch": 0.2655332928556972, "grad_norm": 11.944485664367676, "learning_rate": 9.939182548902883e-05, "loss": 89.6787, "step": 41 }, { "epoch": 0.27200971463266543, "grad_norm": 7.472383499145508, "learning_rate": 9.933792395579877e-05, "loss": 87.6417, "step": 42 }, { "epoch": 0.27848613640963366, "grad_norm": 4.86712121963501, "learning_rate": 9.928175180389254e-05, "loss": 86.1079, "step": 43 }, { "epoch": 0.2849625581866019, "grad_norm": 5.600007057189941, "learning_rate": 9.922331190997922e-05, "loss": 87.5013, "step": 44 }, { "epoch": 0.29143897996357016, "grad_norm": 6.959059238433838, "learning_rate": 9.916260726686278e-05, "loss": 86.7291, "step": 45 }, { "epoch": 0.2979154017405384, "grad_norm": 7.974250316619873, "learning_rate": 9.909964098332879e-05, "loss": 88.1271, "step": 46 }, { "epoch": 0.3043918235175066, "grad_norm": 8.772577285766602, "learning_rate": 9.903441628398511e-05, "loss": 88.9066, "step": 47 }, { "epoch": 0.3108682452944748, "grad_norm": 8.172077178955078, "learning_rate": 9.896693650909686e-05, "loss": 89.6845, "step": 48 }, { "epoch": 0.31734466707144304, "grad_norm": 8.20016860961914, "learning_rate": 9.889720511441532e-05, "loss": 90.427, "step": 49 }, { "epoch": 0.32382108884841126, "grad_norm": 9.877755165100098, "learning_rate": 9.882522567100093e-05, "loss": 94.1133, "step": 50 }, { "epoch": 0.32382108884841126, "eval_loss": 1.3826290369033813, "eval_runtime": 1.1514, "eval_samples_per_second": 43.424, "eval_steps_per_second": 11.29, "step": 50 }, { "epoch": 0.3302975106253795, "grad_norm": 9.74140739440918, "learning_rate": 9.875100186504046e-05, "loss": 86.965, "step": 51 }, { "epoch": 0.3367739324023477, "grad_norm": 11.414432525634766, "learning_rate": 9.867453749765821e-05, "loss": 92.3536, "step": 52 }, { "epoch": 0.34325035417931593, "grad_norm": 10.815231323242188, "learning_rate": 9.859583648472133e-05, "loss": 89.6963, "step": 53 }, { "epoch": 0.34972677595628415, "grad_norm": 8.915569305419922, "learning_rate": 9.851490285663937e-05, "loss": 87.884, "step": 54 }, { "epoch": 0.3562031977332524, "grad_norm": 6.973565578460693, "learning_rate": 9.84317407581577e-05, "loss": 85.9538, "step": 55 }, { "epoch": 0.3626796195102206, "grad_norm": 5.8767828941345215, "learning_rate": 9.834635444814545e-05, "loss": 87.2282, "step": 56 }, { "epoch": 0.3691560412871888, "grad_norm": 5.039903163909912, "learning_rate": 9.825874829937722e-05, "loss": 87.0509, "step": 57 }, { "epoch": 0.37563246306415704, "grad_norm": 5.50130033493042, "learning_rate": 9.816892679830937e-05, "loss": 85.606, "step": 58 }, { "epoch": 0.38210888484112526, "grad_norm": 6.082223892211914, "learning_rate": 9.807689454485e-05, "loss": 87.1984, "step": 59 }, { "epoch": 0.3885853066180935, "grad_norm": 7.246311187744141, "learning_rate": 9.798265625212358e-05, "loss": 89.5389, "step": 60 }, { "epoch": 0.3950617283950617, "grad_norm": 8.20361614227295, "learning_rate": 9.788621674622949e-05, "loss": 89.7561, "step": 61 }, { "epoch": 0.40153815017202993, "grad_norm": 9.213728904724121, "learning_rate": 9.778758096599488e-05, "loss": 91.1433, "step": 62 }, { "epoch": 0.4080145719489982, "grad_norm": 13.030341148376465, "learning_rate": 9.76867539627218e-05, "loss": 91.0905, "step": 63 }, { "epoch": 0.41449099372596643, "grad_norm": 7.345123767852783, "learning_rate": 9.758374089992841e-05, "loss": 90.6507, "step": 64 }, { "epoch": 0.42096741550293465, "grad_norm": 6.850903511047363, "learning_rate": 9.747854705308464e-05, "loss": 91.4556, "step": 65 }, { "epoch": 0.42744383727990287, "grad_norm": 6.114011287689209, "learning_rate": 9.737117780934197e-05, "loss": 90.1023, "step": 66 }, { "epoch": 0.4339202590568711, "grad_norm": 5.3909807205200195, "learning_rate": 9.726163866725763e-05, "loss": 85.6907, "step": 67 }, { "epoch": 0.4403966808338393, "grad_norm": 5.315631866455078, "learning_rate": 9.714993523651283e-05, "loss": 86.3473, "step": 68 }, { "epoch": 0.44687310261080754, "grad_norm": 5.265559673309326, "learning_rate": 9.703607323762569e-05, "loss": 86.3845, "step": 69 }, { "epoch": 0.45334952438777576, "grad_norm": 6.135387897491455, "learning_rate": 9.692005850165816e-05, "loss": 88.6293, "step": 70 }, { "epoch": 0.459825946164744, "grad_norm": 6.263807773590088, "learning_rate": 9.680189696991742e-05, "loss": 88.7029, "step": 71 }, { "epoch": 0.4663023679417122, "grad_norm": 6.212631702423096, "learning_rate": 9.668159469365163e-05, "loss": 89.5084, "step": 72 }, { "epoch": 0.4727787897186804, "grad_norm": 6.447956562042236, "learning_rate": 9.655915783374005e-05, "loss": 89.8585, "step": 73 }, { "epoch": 0.47925521149564865, "grad_norm": 7.500765323638916, "learning_rate": 9.643459266037744e-05, "loss": 91.5395, "step": 74 }, { "epoch": 0.48573163327261687, "grad_norm": 14.504000663757324, "learning_rate": 9.630790555275313e-05, "loss": 93.0197, "step": 75 }, { "epoch": 0.48573163327261687, "eval_loss": 1.37493097782135, "eval_runtime": 1.1523, "eval_samples_per_second": 43.39, "eval_steps_per_second": 11.281, "step": 75 }, { "epoch": 0.4922080550495851, "grad_norm": 9.299421310424805, "learning_rate": 9.617910299872416e-05, "loss": 86.5963, "step": 76 }, { "epoch": 0.4986844768265533, "grad_norm": 9.247332572937012, "learning_rate": 9.604819159448309e-05, "loss": 90.0338, "step": 77 }, { "epoch": 0.5051608986035215, "grad_norm": 8.090618133544922, "learning_rate": 9.591517804422023e-05, "loss": 88.5582, "step": 78 }, { "epoch": 0.5116373203804898, "grad_norm": 6.98775053024292, "learning_rate": 9.578006915978022e-05, "loss": 87.2568, "step": 79 }, { "epoch": 0.518113742157458, "grad_norm": 5.27877950668335, "learning_rate": 9.564287186031333e-05, "loss": 85.4709, "step": 80 }, { "epoch": 0.5245901639344263, "grad_norm": 4.718703746795654, "learning_rate": 9.550359317192096e-05, "loss": 86.4072, "step": 81 }, { "epoch": 0.5310665857113944, "grad_norm": 5.233160972595215, "learning_rate": 9.536224022729591e-05, "loss": 86.6844, "step": 82 }, { "epoch": 0.5375430074883627, "grad_norm": 6.239477157592773, "learning_rate": 9.521882026535708e-05, "loss": 85.8801, "step": 83 }, { "epoch": 0.5440194292653309, "grad_norm": 6.476158142089844, "learning_rate": 9.50733406308788e-05, "loss": 87.4998, "step": 84 }, { "epoch": 0.5504958510422991, "grad_norm": 6.8752570152282715, "learning_rate": 9.492580877411456e-05, "loss": 89.1882, "step": 85 }, { "epoch": 0.5569722728192673, "grad_norm": 7.568345069885254, "learning_rate": 9.477623225041565e-05, "loss": 89.0199, "step": 86 }, { "epoch": 0.5634486945962356, "grad_norm": 7.331943988800049, "learning_rate": 9.462461871984411e-05, "loss": 89.0723, "step": 87 }, { "epoch": 0.5699251163732038, "grad_norm": 9.244547843933105, "learning_rate": 9.447097594678046e-05, "loss": 88.1396, "step": 88 }, { "epoch": 0.576401538150172, "grad_norm": 8.298840522766113, "learning_rate": 9.431531179952613e-05, "loss": 89.0989, "step": 89 }, { "epoch": 0.5828779599271403, "grad_norm": 8.666487693786621, "learning_rate": 9.415763424990047e-05, "loss": 89.389, "step": 90 }, { "epoch": 0.5893543817041085, "grad_norm": 7.654580116271973, "learning_rate": 9.39979513728325e-05, "loss": 87.9425, "step": 91 }, { "epoch": 0.5958308034810768, "grad_norm": 6.478038311004639, "learning_rate": 9.383627134594741e-05, "loss": 86.6396, "step": 92 }, { "epoch": 0.6023072252580449, "grad_norm": 5.588522911071777, "learning_rate": 9.367260244914768e-05, "loss": 85.5872, "step": 93 }, { "epoch": 0.6087836470350132, "grad_norm": 5.041703224182129, "learning_rate": 9.350695306418922e-05, "loss": 86.5651, "step": 94 }, { "epoch": 0.6152600688119814, "grad_norm": 5.039709091186523, "learning_rate": 9.333933167425194e-05, "loss": 87.8268, "step": 95 }, { "epoch": 0.6217364905889496, "grad_norm": 5.965563774108887, "learning_rate": 9.316974686350542e-05, "loss": 87.7339, "step": 96 }, { "epoch": 0.6282129123659178, "grad_norm": 6.65131139755249, "learning_rate": 9.299820731666933e-05, "loss": 85.7635, "step": 97 }, { "epoch": 0.6346893341428861, "grad_norm": 7.395946025848389, "learning_rate": 9.282472181856854e-05, "loss": 90.331, "step": 98 }, { "epoch": 0.6411657559198543, "grad_norm": 7.647375106811523, "learning_rate": 9.264929925368338e-05, "loss": 90.684, "step": 99 }, { "epoch": 0.6476421776968225, "grad_norm": 11.403766632080078, "learning_rate": 9.247194860569454e-05, "loss": 91.5051, "step": 100 }, { "epoch": 0.6476421776968225, "eval_loss": 1.3694149255752563, "eval_runtime": 1.1503, "eval_samples_per_second": 43.467, "eval_steps_per_second": 11.301, "step": 100 }, { "epoch": 0.6541185994737907, "grad_norm": 7.575782775878906, "learning_rate": 9.229267895702307e-05, "loss": 86.2207, "step": 101 }, { "epoch": 0.660595021250759, "grad_norm": 7.333686351776123, "learning_rate": 9.211149948836523e-05, "loss": 89.5888, "step": 102 }, { "epoch": 0.6670714430277271, "grad_norm": 6.610541343688965, "learning_rate": 9.192841947822232e-05, "loss": 89.1431, "step": 103 }, { "epoch": 0.6735478648046954, "grad_norm": 5.832754135131836, "learning_rate": 9.17434483024255e-05, "loss": 86.9471, "step": 104 }, { "epoch": 0.6800242865816636, "grad_norm": 5.11637544631958, "learning_rate": 9.155659543365574e-05, "loss": 85.5211, "step": 105 }, { "epoch": 0.6865007083586319, "grad_norm": 5.000216007232666, "learning_rate": 9.136787044095856e-05, "loss": 84.5617, "step": 106 }, { "epoch": 0.6929771301356, "grad_norm": 5.362670421600342, "learning_rate": 9.117728298925407e-05, "loss": 83.8331, "step": 107 }, { "epoch": 0.6994535519125683, "grad_norm": 5.481074810028076, "learning_rate": 9.0984842838842e-05, "loss": 85.5844, "step": 108 }, { "epoch": 0.7059299736895366, "grad_norm": 5.987509250640869, "learning_rate": 9.079055984490186e-05, "loss": 87.3264, "step": 109 }, { "epoch": 0.7124063954665047, "grad_norm": 6.313266754150391, "learning_rate": 9.059444395698823e-05, "loss": 87.9586, "step": 110 }, { "epoch": 0.718882817243473, "grad_norm": 7.019072532653809, "learning_rate": 9.039650521852124e-05, "loss": 88.2712, "step": 111 }, { "epoch": 0.7253592390204412, "grad_norm": 7.743898391723633, "learning_rate": 9.019675376627223e-05, "loss": 88.4723, "step": 112 }, { "epoch": 0.7318356607974095, "grad_norm": 7.635955333709717, "learning_rate": 8.99951998298446e-05, "loss": 86.6761, "step": 113 }, { "epoch": 0.7383120825743776, "grad_norm": 7.730625629425049, "learning_rate": 8.979185373114996e-05, "loss": 90.206, "step": 114 }, { "epoch": 0.7447885043513459, "grad_norm": 8.310251235961914, "learning_rate": 8.958672588387953e-05, "loss": 91.235, "step": 115 }, { "epoch": 0.7512649261283141, "grad_norm": 7.113434314727783, "learning_rate": 8.937982679297084e-05, "loss": 89.6536, "step": 116 }, { "epoch": 0.7577413479052824, "grad_norm": 5.876744270324707, "learning_rate": 8.917116705406973e-05, "loss": 85.881, "step": 117 }, { "epoch": 0.7642177696822505, "grad_norm": 6.944094181060791, "learning_rate": 8.89607573529878e-05, "loss": 85.536, "step": 118 }, { "epoch": 0.7706941914592188, "grad_norm": 5.213791370391846, "learning_rate": 8.8748608465155e-05, "loss": 86.7831, "step": 119 }, { "epoch": 0.777170613236187, "grad_norm": 5.233664035797119, "learning_rate": 8.853473125506803e-05, "loss": 86.6629, "step": 120 }, { "epoch": 0.7836470350131552, "grad_norm": 5.996572017669678, "learning_rate": 8.831913667573379e-05, "loss": 85.3778, "step": 121 }, { "epoch": 0.7901234567901234, "grad_norm": 6.251316547393799, "learning_rate": 8.810183576810856e-05, "loss": 89.5681, "step": 122 }, { "epoch": 0.7965998785670917, "grad_norm": 6.619566440582275, "learning_rate": 8.788283966053244e-05, "loss": 88.062, "step": 123 }, { "epoch": 0.8030763003440599, "grad_norm": 7.528046131134033, "learning_rate": 8.766215956815959e-05, "loss": 90.3257, "step": 124 }, { "epoch": 0.8095527221210281, "grad_norm": 12.629373550415039, "learning_rate": 8.743980679238385e-05, "loss": 90.4118, "step": 125 }, { "epoch": 0.8095527221210281, "eval_loss": 1.3662638664245605, "eval_runtime": 1.1518, "eval_samples_per_second": 43.412, "eval_steps_per_second": 11.287, "step": 125 }, { "epoch": 0.8160291438979964, "grad_norm": 6.821308135986328, "learning_rate": 8.721579272025989e-05, "loss": 85.6014, "step": 126 }, { "epoch": 0.8225055656749646, "grad_norm": 6.770315647125244, "learning_rate": 8.699012882392018e-05, "loss": 87.5623, "step": 127 }, { "epoch": 0.8289819874519329, "grad_norm": 6.329639911651611, "learning_rate": 8.676282665998736e-05, "loss": 89.1028, "step": 128 }, { "epoch": 0.835458409228901, "grad_norm": 5.401063919067383, "learning_rate": 8.653389786898255e-05, "loss": 85.6681, "step": 129 }, { "epoch": 0.8419348310058693, "grad_norm": 7.508073329925537, "learning_rate": 8.630335417472909e-05, "loss": 85.8936, "step": 130 }, { "epoch": 0.8484112527828375, "grad_norm": 4.998537063598633, "learning_rate": 8.607120738375219e-05, "loss": 86.1916, "step": 131 }, { "epoch": 0.8548876745598057, "grad_norm": 5.240179061889648, "learning_rate": 8.583746938467436e-05, "loss": 86.2778, "step": 132 }, { "epoch": 0.8613640963367739, "grad_norm": 5.5274763107299805, "learning_rate": 8.560215214760647e-05, "loss": 88.0238, "step": 133 }, { "epoch": 0.8678405181137422, "grad_norm": 6.343565940856934, "learning_rate": 8.53652677235348e-05, "loss": 87.5827, "step": 134 }, { "epoch": 0.8743169398907104, "grad_norm": 6.233267784118652, "learning_rate": 8.512682824370386e-05, "loss": 88.3115, "step": 135 }, { "epoch": 0.8807933616676786, "grad_norm": 6.717974662780762, "learning_rate": 8.48868459189952e-05, "loss": 90.1193, "step": 136 }, { "epoch": 0.8872697834446468, "grad_norm": 7.572277069091797, "learning_rate": 8.464533303930195e-05, "loss": 92.2225, "step": 137 }, { "epoch": 0.8937462052216151, "grad_norm": 8.858343124389648, "learning_rate": 8.440230197289955e-05, "loss": 88.4339, "step": 138 }, { "epoch": 0.9002226269985832, "grad_norm": 8.37955093383789, "learning_rate": 8.415776516581229e-05, "loss": 88.8891, "step": 139 }, { "epoch": 0.9066990487755515, "grad_norm": 9.093411445617676, "learning_rate": 8.391173514117591e-05, "loss": 89.8529, "step": 140 }, { "epoch": 0.9131754705525197, "grad_norm": 7.6774516105651855, "learning_rate": 8.366422449859635e-05, "loss": 88.5709, "step": 141 }, { "epoch": 0.919651892329488, "grad_norm": 6.732501029968262, "learning_rate": 8.34152459135044e-05, "loss": 85.4348, "step": 142 }, { "epoch": 0.9261283141064561, "grad_norm": 5.530577659606934, "learning_rate": 8.316481213650668e-05, "loss": 86.0735, "step": 143 }, { "epoch": 0.9326047358834244, "grad_norm": 5.121070861816406, "learning_rate": 8.291293599273253e-05, "loss": 85.602, "step": 144 }, { "epoch": 0.9390811576603927, "grad_norm": 5.192986011505127, "learning_rate": 8.265963038117736e-05, "loss": 86.5851, "step": 145 }, { "epoch": 0.9455575794373609, "grad_norm": 5.9747161865234375, "learning_rate": 8.240490827404196e-05, "loss": 87.7343, "step": 146 }, { "epoch": 0.9520340012143291, "grad_norm": 6.966287136077881, "learning_rate": 8.21487827160682e-05, "loss": 87.5681, "step": 147 }, { "epoch": 0.9585104229912973, "grad_norm": 7.882429122924805, "learning_rate": 8.189126682387103e-05, "loss": 86.6152, "step": 148 }, { "epoch": 0.9649868447682656, "grad_norm": 8.548166275024414, "learning_rate": 8.163237378526669e-05, "loss": 90.6725, "step": 149 }, { "epoch": 0.9714632665452337, "grad_norm": 14.08648681640625, "learning_rate": 8.137211685859739e-05, "loss": 91.253, "step": 150 }, { "epoch": 0.9714632665452337, "eval_loss": 1.3635057210922241, "eval_runtime": 1.1499, "eval_samples_per_second": 43.482, "eval_steps_per_second": 11.305, "step": 150 }, { "epoch": 0.977939688322202, "grad_norm": 6.204248905181885, "learning_rate": 8.111050937205231e-05, "loss": 86.4112, "step": 151 }, { "epoch": 0.9844161100991702, "grad_norm": 5.269873142242432, "learning_rate": 8.084756472298504e-05, "loss": 86.2894, "step": 152 }, { "epoch": 0.9908925318761385, "grad_norm": 5.337291240692139, "learning_rate": 8.05832963772275e-05, "loss": 87.7411, "step": 153 }, { "epoch": 0.9973689536531066, "grad_norm": 5.485300064086914, "learning_rate": 8.031771786840027e-05, "loss": 88.958, "step": 154 }, { "epoch": 1.003845375430075, "grad_norm": 6.286268711090088, "learning_rate": 8.005084279721962e-05, "loss": 87.1405, "step": 155 }, { "epoch": 1.010321797207043, "grad_norm": 7.272406101226807, "learning_rate": 7.978268483080086e-05, "loss": 89.0797, "step": 156 }, { "epoch": 1.0167982189840112, "grad_norm": 5.948840618133545, "learning_rate": 7.951325770195858e-05, "loss": 89.2735, "step": 157 }, { "epoch": 1.0232746407609796, "grad_norm": 5.509191989898682, "learning_rate": 7.924257520850318e-05, "loss": 86.8927, "step": 158 }, { "epoch": 1.0297510625379478, "grad_norm": 5.0609354972839355, "learning_rate": 7.897065121253442e-05, "loss": 84.5423, "step": 159 }, { "epoch": 1.036227484314916, "grad_norm": 5.4893412590026855, "learning_rate": 7.86974996397315e-05, "loss": 83.8795, "step": 160 }, { "epoch": 1.0427039060918841, "grad_norm": 5.744421005249023, "learning_rate": 7.842313447863978e-05, "loss": 85.2153, "step": 161 }, { "epoch": 1.0491803278688525, "grad_norm": 6.274890422821045, "learning_rate": 7.814756977995459e-05, "loss": 84.5926, "step": 162 }, { "epoch": 1.0556567496458207, "grad_norm": 6.743546962738037, "learning_rate": 7.78708196558015e-05, "loss": 84.7119, "step": 163 }, { "epoch": 1.0621331714227888, "grad_norm": 6.911246299743652, "learning_rate": 7.75928982790137e-05, "loss": 86.6985, "step": 164 }, { "epoch": 1.0686095931997572, "grad_norm": 7.222784042358398, "learning_rate": 7.73138198824062e-05, "loss": 86.8881, "step": 165 }, { "epoch": 1.0750860149767254, "grad_norm": 7.774824142456055, "learning_rate": 7.703359875804689e-05, "loss": 89.2754, "step": 166 }, { "epoch": 1.0815624367536936, "grad_norm": 11.337868690490723, "learning_rate": 7.675224925652463e-05, "loss": 89.5731, "step": 167 }, { "epoch": 1.0880388585306617, "grad_norm": 8.670846939086914, "learning_rate": 7.646978578621437e-05, "loss": 86.3168, "step": 168 }, { "epoch": 1.0945152803076301, "grad_norm": 9.115431785583496, "learning_rate": 7.618622281253924e-05, "loss": 88.8576, "step": 169 }, { "epoch": 1.1009917020845983, "grad_norm": 8.850008010864258, "learning_rate": 7.590157485722982e-05, "loss": 89.2506, "step": 170 }, { "epoch": 1.1074681238615665, "grad_norm": 7.5737457275390625, "learning_rate": 7.561585649758028e-05, "loss": 87.3879, "step": 171 }, { "epoch": 1.1139445456385346, "grad_norm": 6.677452087402344, "learning_rate": 7.532908236570209e-05, "loss": 83.5083, "step": 172 }, { "epoch": 1.120420967415503, "grad_norm": 5.898469924926758, "learning_rate": 7.504126714777451e-05, "loss": 84.0906, "step": 173 }, { "epoch": 1.1268973891924712, "grad_norm": 5.492910861968994, "learning_rate": 7.475242558329254e-05, "loss": 85.4018, "step": 174 }, { "epoch": 1.1333738109694393, "grad_norm": 5.900984287261963, "learning_rate": 7.446257246431213e-05, "loss": 84.4847, "step": 175 }, { "epoch": 1.1333738109694393, "eval_loss": 1.3622227907180786, "eval_runtime": 1.1506, "eval_samples_per_second": 43.454, "eval_steps_per_second": 11.298, "step": 175 }, { "epoch": 1.1398502327464075, "grad_norm": 6.4240803718566895, "learning_rate": 7.417172263469256e-05, "loss": 84.0926, "step": 176 }, { "epoch": 1.146326654523376, "grad_norm": 7.57835578918457, "learning_rate": 7.387989098933635e-05, "loss": 86.8981, "step": 177 }, { "epoch": 1.152803076300344, "grad_norm": 8.353677749633789, "learning_rate": 7.358709247342646e-05, "loss": 87.7625, "step": 178 }, { "epoch": 1.1592794980773122, "grad_norm": 10.412530899047852, "learning_rate": 7.329334208166084e-05, "loss": 87.1761, "step": 179 }, { "epoch": 1.1657559198542806, "grad_norm": 9.623198509216309, "learning_rate": 7.299865485748463e-05, "loss": 85.1966, "step": 180 }, { "epoch": 1.1722323416312488, "grad_norm": 6.3871750831604, "learning_rate": 7.270304589231966e-05, "loss": 86.9198, "step": 181 }, { "epoch": 1.178708763408217, "grad_norm": 6.1822919845581055, "learning_rate": 7.24065303247917e-05, "loss": 87.9364, "step": 182 }, { "epoch": 1.1851851851851851, "grad_norm": 5.828580379486084, "learning_rate": 7.21091233399551e-05, "loss": 85.442, "step": 183 }, { "epoch": 1.1916616069621533, "grad_norm": 5.938571453094482, "learning_rate": 7.181084016851518e-05, "loss": 84.8435, "step": 184 }, { "epoch": 1.1981380287391217, "grad_norm": 5.861661434173584, "learning_rate": 7.151169608604823e-05, "loss": 83.805, "step": 185 }, { "epoch": 1.2046144505160898, "grad_norm": 5.848051071166992, "learning_rate": 7.121170641221921e-05, "loss": 82.7687, "step": 186 }, { "epoch": 1.211090872293058, "grad_norm": 6.26589298248291, "learning_rate": 7.091088650999727e-05, "loss": 83.7361, "step": 187 }, { "epoch": 1.2175672940700264, "grad_norm": 6.257192611694336, "learning_rate": 7.060925178486883e-05, "loss": 84.7459, "step": 188 }, { "epoch": 1.2240437158469946, "grad_norm": 6.821758270263672, "learning_rate": 7.030681768404885e-05, "loss": 86.512, "step": 189 }, { "epoch": 1.2305201376239627, "grad_norm": 7.374625205993652, "learning_rate": 7.000359969568959e-05, "loss": 86.6492, "step": 190 }, { "epoch": 1.236996559400931, "grad_norm": 8.044811248779297, "learning_rate": 6.96996133480875e-05, "loss": 90.982, "step": 191 }, { "epoch": 1.2434729811778993, "grad_norm": 9.82665729522705, "learning_rate": 6.9394874208888e-05, "loss": 87.8916, "step": 192 }, { "epoch": 1.2499494029548675, "grad_norm": 8.18597412109375, "learning_rate": 6.908939788428818e-05, "loss": 87.5598, "step": 193 }, { "epoch": 1.2564258247318356, "grad_norm": 8.252458572387695, "learning_rate": 6.878320001823764e-05, "loss": 89.2586, "step": 194 }, { "epoch": 1.262902246508804, "grad_norm": 7.4712815284729, "learning_rate": 6.847629629163734e-05, "loss": 88.1015, "step": 195 }, { "epoch": 1.2693786682857722, "grad_norm": 6.774599075317383, "learning_rate": 6.816870242153649e-05, "loss": 86.0356, "step": 196 }, { "epoch": 1.2758550900627403, "grad_norm": 6.240269660949707, "learning_rate": 6.786043416032772e-05, "loss": 84.1552, "step": 197 }, { "epoch": 1.2823315118397085, "grad_norm": 5.876425743103027, "learning_rate": 6.755150729494033e-05, "loss": 81.8465, "step": 198 }, { "epoch": 1.2888079336166767, "grad_norm": 5.887735366821289, "learning_rate": 6.724193764603185e-05, "loss": 85.5337, "step": 199 }, { "epoch": 1.295284355393645, "grad_norm": 6.710705280303955, "learning_rate": 6.693174106717781e-05, "loss": 83.43, "step": 200 }, { "epoch": 1.295284355393645, "eval_loss": 1.3603808879852295, "eval_runtime": 1.1516, "eval_samples_per_second": 43.416, "eval_steps_per_second": 11.288, "step": 200 }, { "epoch": 1.3017607771706132, "grad_norm": 7.395152568817139, "learning_rate": 6.662093344405984e-05, "loss": 86.9395, "step": 201 }, { "epoch": 1.3082371989475814, "grad_norm": 7.946622371673584, "learning_rate": 6.630953069365224e-05, "loss": 84.3904, "step": 202 }, { "epoch": 1.3147136207245498, "grad_norm": 8.76561164855957, "learning_rate": 6.599754876340666e-05, "loss": 87.8727, "step": 203 }, { "epoch": 1.321190042501518, "grad_norm": 10.111847877502441, "learning_rate": 6.568500363043561e-05, "loss": 88.9806, "step": 204 }, { "epoch": 1.3276664642784861, "grad_norm": 7.7700982093811035, "learning_rate": 6.53719113006941e-05, "loss": 85.9951, "step": 205 }, { "epoch": 1.3341428860554543, "grad_norm": 7.805271148681641, "learning_rate": 6.505828780815993e-05, "loss": 88.9993, "step": 206 }, { "epoch": 1.3406193078324227, "grad_norm": 7.5320258140563965, "learning_rate": 6.474414921401274e-05, "loss": 88.5286, "step": 207 }, { "epoch": 1.3470957296093908, "grad_norm": 11.297571182250977, "learning_rate": 6.442951160581135e-05, "loss": 85.9591, "step": 208 }, { "epoch": 1.353572151386359, "grad_norm": 6.681571006774902, "learning_rate": 6.411439109666985e-05, "loss": 84.0142, "step": 209 }, { "epoch": 1.3600485731633274, "grad_norm": 6.609443187713623, "learning_rate": 6.379880382443258e-05, "loss": 84.1161, "step": 210 }, { "epoch": 1.3665249949402956, "grad_norm": 6.309790134429932, "learning_rate": 6.348276595084753e-05, "loss": 83.4006, "step": 211 }, { "epoch": 1.3730014167172637, "grad_norm": 6.506324768066406, "learning_rate": 6.316629366073878e-05, "loss": 84.3522, "step": 212 }, { "epoch": 1.379477838494232, "grad_norm": 6.594834804534912, "learning_rate": 6.284940316117752e-05, "loss": 83.8696, "step": 213 }, { "epoch": 1.3859542602712, "grad_norm": 7.250606536865234, "learning_rate": 6.253211068065226e-05, "loss": 84.9643, "step": 214 }, { "epoch": 1.3924306820481684, "grad_norm": 8.0385160446167, "learning_rate": 6.221443246823753e-05, "loss": 86.0789, "step": 215 }, { "epoch": 1.3989071038251366, "grad_norm": 8.668357849121094, "learning_rate": 6.189638479276184e-05, "loss": 86.9037, "step": 216 }, { "epoch": 1.4053835256021048, "grad_norm": 12.026268005371094, "learning_rate": 6.157798394197457e-05, "loss": 85.4328, "step": 217 }, { "epoch": 1.4118599473790732, "grad_norm": 7.798686504364014, "learning_rate": 6.125924622171172e-05, "loss": 86.2251, "step": 218 }, { "epoch": 1.4183363691560413, "grad_norm": 7.524980545043945, "learning_rate": 6.094018795506101e-05, "loss": 88.9988, "step": 219 }, { "epoch": 1.4248127909330095, "grad_norm": 7.057831287384033, "learning_rate": 6.062082548152581e-05, "loss": 85.9796, "step": 220 }, { "epoch": 1.4312892127099777, "grad_norm": 6.44320821762085, "learning_rate": 6.030117515618848e-05, "loss": 84.572, "step": 221 }, { "epoch": 1.4377656344869458, "grad_norm": 6.346105098724365, "learning_rate": 5.9981253348872704e-05, "loss": 84.6194, "step": 222 }, { "epoch": 1.4442420562639142, "grad_norm": 6.338534355163574, "learning_rate": 5.966107644330524e-05, "loss": 83.8731, "step": 223 }, { "epoch": 1.4507184780408824, "grad_norm": 6.344674110412598, "learning_rate": 5.934066083627683e-05, "loss": 83.7774, "step": 224 }, { "epoch": 1.4571948998178508, "grad_norm": 7.0422797203063965, "learning_rate": 5.902002293680253e-05, "loss": 86.4043, "step": 225 }, { "epoch": 1.4571948998178508, "eval_loss": 1.3590971231460571, "eval_runtime": 1.1496, "eval_samples_per_second": 43.495, "eval_steps_per_second": 11.309, "step": 225 }, { "epoch": 1.463671321594819, "grad_norm": 7.233029842376709, "learning_rate": 5.869917916528134e-05, "loss": 85.8549, "step": 226 }, { "epoch": 1.470147743371787, "grad_norm": 7.845042705535889, "learning_rate": 5.837814595265534e-05, "loss": 85.77, "step": 227 }, { "epoch": 1.4766241651487553, "grad_norm": 8.548230171203613, "learning_rate": 5.805693973956819e-05, "loss": 87.5861, "step": 228 }, { "epoch": 1.4831005869257234, "grad_norm": 9.914074897766113, "learning_rate": 5.77355769755232e-05, "loss": 87.9156, "step": 229 }, { "epoch": 1.4895770087026918, "grad_norm": 8.572760581970215, "learning_rate": 5.7414074118040863e-05, "loss": 83.0358, "step": 230 }, { "epoch": 1.49605343047966, "grad_norm": 8.278693199157715, "learning_rate": 5.709244763181616e-05, "loss": 87.8157, "step": 231 }, { "epoch": 1.5025298522566282, "grad_norm": 8.25027084350586, "learning_rate": 5.677071398787526e-05, "loss": 88.9732, "step": 232 }, { "epoch": 1.5090062740335966, "grad_norm": 7.392897129058838, "learning_rate": 5.644888966273209e-05, "loss": 86.2674, "step": 233 }, { "epoch": 1.5154826958105647, "grad_norm": 6.923493385314941, "learning_rate": 5.612699113754446e-05, "loss": 84.8558, "step": 234 }, { "epoch": 1.5219591175875329, "grad_norm": 6.476316452026367, "learning_rate": 5.5805034897270144e-05, "loss": 84.0044, "step": 235 }, { "epoch": 1.528435539364501, "grad_norm": 6.528478145599365, "learning_rate": 5.5483037429822585e-05, "loss": 84.6853, "step": 236 }, { "epoch": 1.5349119611414692, "grad_norm": 6.715808391571045, "learning_rate": 5.516101522522651e-05, "loss": 83.4079, "step": 237 }, { "epoch": 1.5413883829184376, "grad_norm": 7.000762939453125, "learning_rate": 5.483898477477349e-05, "loss": 83.7282, "step": 238 }, { "epoch": 1.5478648046954058, "grad_norm": 7.617155075073242, "learning_rate": 5.451696257017742e-05, "loss": 86.2884, "step": 239 }, { "epoch": 1.5543412264723742, "grad_norm": 8.479978561401367, "learning_rate": 5.419496510272985e-05, "loss": 87.1545, "step": 240 }, { "epoch": 1.5608176482493423, "grad_norm": 9.737258911132812, "learning_rate": 5.3873008862455546e-05, "loss": 88.2807, "step": 241 }, { "epoch": 1.5672940700263105, "grad_norm": 13.30903148651123, "learning_rate": 5.355111033726792e-05, "loss": 87.2007, "step": 242 }, { "epoch": 1.5737704918032787, "grad_norm": 7.551815986633301, "learning_rate": 5.3229286012124745e-05, "loss": 86.398, "step": 243 }, { "epoch": 1.5802469135802468, "grad_norm": 7.769435882568359, "learning_rate": 5.2907552368183847e-05, "loss": 87.9565, "step": 244 }, { "epoch": 1.586723335357215, "grad_norm": 7.145780563354492, "learning_rate": 5.258592588195914e-05, "loss": 87.9097, "step": 245 }, { "epoch": 1.5931997571341834, "grad_norm": 6.6358442306518555, "learning_rate": 5.2264423024476816e-05, "loss": 85.2343, "step": 246 }, { "epoch": 1.5996761789111515, "grad_norm": 6.325075626373291, "learning_rate": 5.194306026043181e-05, "loss": 83.2337, "step": 247 }, { "epoch": 1.60615260068812, "grad_norm": 6.256945610046387, "learning_rate": 5.1621854047344665e-05, "loss": 84.8934, "step": 248 }, { "epoch": 1.612629022465088, "grad_norm": 6.676874160766602, "learning_rate": 5.1300820834718664e-05, "loss": 82.5999, "step": 249 }, { "epoch": 1.6191054442420563, "grad_norm": 6.871759414672852, "learning_rate": 5.0979977063197494e-05, "loss": 85.2274, "step": 250 }, { "epoch": 1.6191054442420563, "eval_loss": 1.358162522315979, "eval_runtime": 1.1489, "eval_samples_per_second": 43.519, "eval_steps_per_second": 11.315, "step": 250 }, { "epoch": 1.6255818660190244, "grad_norm": 7.170279026031494, "learning_rate": 5.0659339163723186e-05, "loss": 87.0733, "step": 251 }, { "epoch": 1.6320582877959926, "grad_norm": 7.764693260192871, "learning_rate": 5.0338923556694776e-05, "loss": 87.1822, "step": 252 }, { "epoch": 1.638534709572961, "grad_norm": 9.593485832214355, "learning_rate": 5.0018746651127313e-05, "loss": 86.049, "step": 253 }, { "epoch": 1.6450111313499292, "grad_norm": 10.47472858428955, "learning_rate": 4.969882484381153e-05, "loss": 89.1522, "step": 254 }, { "epoch": 1.6514875531268975, "grad_norm": 9.065328598022461, "learning_rate": 4.937917451847419e-05, "loss": 85.0014, "step": 255 }, { "epoch": 1.6579639749038657, "grad_norm": 8.042845726013184, "learning_rate": 4.9059812044939e-05, "loss": 88.0944, "step": 256 }, { "epoch": 1.6644403966808339, "grad_norm": 7.660902500152588, "learning_rate": 4.8740753778288286e-05, "loss": 87.8028, "step": 257 }, { "epoch": 1.670916818457802, "grad_norm": 7.008611679077148, "learning_rate": 4.842201605802545e-05, "loss": 86.4977, "step": 258 }, { "epoch": 1.6773932402347702, "grad_norm": 6.6476898193359375, "learning_rate": 4.810361520723817e-05, "loss": 83.0529, "step": 259 }, { "epoch": 1.6838696620117384, "grad_norm": 6.531733989715576, "learning_rate": 4.778556753176249e-05, "loss": 81.942, "step": 260 }, { "epoch": 1.6903460837887068, "grad_norm": 6.486162185668945, "learning_rate": 4.746788931934775e-05, "loss": 84.2968, "step": 261 }, { "epoch": 1.696822505565675, "grad_norm": 6.685736656188965, "learning_rate": 4.715059683882248e-05, "loss": 84.2725, "step": 262 }, { "epoch": 1.7032989273426433, "grad_norm": 7.1622538566589355, "learning_rate": 4.6833706339261254e-05, "loss": 84.6905, "step": 263 }, { "epoch": 1.7097753491196115, "grad_norm": 7.275223255157471, "learning_rate": 4.6517234049152484e-05, "loss": 85.658, "step": 264 }, { "epoch": 1.7162517708965797, "grad_norm": 8.095952033996582, "learning_rate": 4.620119617556743e-05, "loss": 86.4005, "step": 265 }, { "epoch": 1.7227281926735478, "grad_norm": 9.836173057556152, "learning_rate": 4.588560890333016e-05, "loss": 84.0733, "step": 266 }, { "epoch": 1.729204614450516, "grad_norm": 13.220867156982422, "learning_rate": 4.557048839418867e-05, "loss": 86.62, "step": 267 }, { "epoch": 1.7356810362274842, "grad_norm": 7.757680892944336, "learning_rate": 4.525585078598726e-05, "loss": 85.4336, "step": 268 }, { "epoch": 1.7421574580044525, "grad_norm": 7.764101028442383, "learning_rate": 4.494171219184008e-05, "loss": 89.0821, "step": 269 }, { "epoch": 1.748633879781421, "grad_norm": 7.054685115814209, "learning_rate": 4.462808869930592e-05, "loss": 87.4714, "step": 270 }, { "epoch": 1.755110301558389, "grad_norm": 6.625744342803955, "learning_rate": 4.4314996369564386e-05, "loss": 85.2647, "step": 271 }, { "epoch": 1.7615867233353573, "grad_norm": 6.45122766494751, "learning_rate": 4.4002451236593335e-05, "loss": 82.98, "step": 272 }, { "epoch": 1.7680631451123254, "grad_norm": 6.553142070770264, "learning_rate": 4.369046930634777e-05, "loss": 85.6206, "step": 273 }, { "epoch": 1.7745395668892936, "grad_norm": 6.669130802154541, "learning_rate": 4.337906655594016e-05, "loss": 84.2493, "step": 274 }, { "epoch": 1.7810159886662618, "grad_norm": 6.9759840965271, "learning_rate": 4.3068258932822214e-05, "loss": 83.7095, "step": 275 }, { "epoch": 1.7810159886662618, "eval_loss": 1.3560975790023804, "eval_runtime": 1.158, "eval_samples_per_second": 43.179, "eval_steps_per_second": 11.227, "step": 275 }, { "epoch": 1.7874924104432302, "grad_norm": 7.41339635848999, "learning_rate": 4.2758062353968166e-05, "loss": 83.5531, "step": 276 }, { "epoch": 1.7939688322201983, "grad_norm": 7.747604846954346, "learning_rate": 4.2448492705059685e-05, "loss": 85.9682, "step": 277 }, { "epoch": 1.8004452539971667, "grad_norm": 8.86090087890625, "learning_rate": 4.21395658396723e-05, "loss": 87.1045, "step": 278 }, { "epoch": 1.8069216757741349, "grad_norm": 10.854351043701172, "learning_rate": 4.183129757846353e-05, "loss": 87.4209, "step": 279 }, { "epoch": 1.813398097551103, "grad_norm": 9.061321258544922, "learning_rate": 4.152370370836268e-05, "loss": 85.7737, "step": 280 }, { "epoch": 1.8198745193280712, "grad_norm": 8.29134464263916, "learning_rate": 4.1216799981762386e-05, "loss": 88.162, "step": 281 }, { "epoch": 1.8263509411050394, "grad_norm": 7.982885837554932, "learning_rate": 4.091060211571185e-05, "loss": 87.455, "step": 282 }, { "epoch": 1.8328273628820075, "grad_norm": 7.286195755004883, "learning_rate": 4.060512579111203e-05, "loss": 87.0443, "step": 283 }, { "epoch": 1.839303784658976, "grad_norm": 7.0868144035339355, "learning_rate": 4.030038665191251e-05, "loss": 82.5468, "step": 284 }, { "epoch": 1.845780206435944, "grad_norm": 6.869359493255615, "learning_rate": 3.999640030431042e-05, "loss": 83.7136, "step": 285 }, { "epoch": 1.8522566282129125, "grad_norm": 6.86782693862915, "learning_rate": 3.969318231595116e-05, "loss": 84.5827, "step": 286 }, { "epoch": 1.8587330499898806, "grad_norm": 6.908777713775635, "learning_rate": 3.939074821513117e-05, "loss": 86.1444, "step": 287 }, { "epoch": 1.8652094717668488, "grad_norm": 7.287711143493652, "learning_rate": 3.908911349000274e-05, "loss": 84.529, "step": 288 }, { "epoch": 1.871685893543817, "grad_norm": 7.556633949279785, "learning_rate": 3.878829358778078e-05, "loss": 87.2159, "step": 289 }, { "epoch": 1.8781623153207851, "grad_norm": 8.383976936340332, "learning_rate": 3.848830391395177e-05, "loss": 84.2072, "step": 290 }, { "epoch": 1.8846387370977535, "grad_norm": 9.594328880310059, "learning_rate": 3.818915983148482e-05, "loss": 88.4876, "step": 291 }, { "epoch": 1.8911151588747217, "grad_norm": 13.546067237854004, "learning_rate": 3.78908766600449e-05, "loss": 85.8718, "step": 292 }, { "epoch": 1.89759158065169, "grad_norm": 10.582322120666504, "learning_rate": 3.7593469675208316e-05, "loss": 85.1854, "step": 293 }, { "epoch": 1.9040680024286583, "grad_norm": 7.625749588012695, "learning_rate": 3.729695410768035e-05, "loss": 88.0194, "step": 294 }, { "epoch": 1.9105444242056264, "grad_norm": 7.433224678039551, "learning_rate": 3.7001345142515385e-05, "loss": 88.7619, "step": 295 }, { "epoch": 1.9170208459825946, "grad_norm": 6.673389434814453, "learning_rate": 3.6706657918339163e-05, "loss": 84.7882, "step": 296 }, { "epoch": 1.9234972677595628, "grad_norm": 6.622186660766602, "learning_rate": 3.641290752657355e-05, "loss": 82.4685, "step": 297 }, { "epoch": 1.929973689536531, "grad_norm": 6.761792182922363, "learning_rate": 3.612010901066366e-05, "loss": 81.4877, "step": 298 }, { "epoch": 1.9364501113134993, "grad_norm": 6.871157646179199, "learning_rate": 3.582827736530745e-05, "loss": 81.5665, "step": 299 }, { "epoch": 1.9429265330904675, "grad_norm": 7.098099708557129, "learning_rate": 3.553742753568789e-05, "loss": 83.9212, "step": 300 }, { "epoch": 1.9429265330904675, "eval_loss": 1.3534409999847412, "eval_runtime": 1.1501, "eval_samples_per_second": 43.474, "eval_steps_per_second": 11.303, "step": 300 } ], "logging_steps": 1, "max_steps": 457, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.560565147225293e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }