diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,68572 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 9790, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00010214504596527068, + "grad_norm": 24.5991329176078, + "learning_rate": 6.802721088435375e-08, + "loss": 2.2848, + "step": 1 + }, + { + "epoch": 0.00020429009193054137, + "grad_norm": 26.689937379942236, + "learning_rate": 1.360544217687075e-07, + "loss": 2.2822, + "step": 2 + }, + { + "epoch": 0.00030643513789581204, + "grad_norm": 29.618260135945295, + "learning_rate": 2.0408163265306121e-07, + "loss": 2.4747, + "step": 3 + }, + { + "epoch": 0.00040858018386108274, + "grad_norm": 30.76652113148423, + "learning_rate": 2.72108843537415e-07, + "loss": 2.565, + "step": 4 + }, + { + "epoch": 0.0005107252298263534, + "grad_norm": 34.77649002482841, + "learning_rate": 3.401360544217688e-07, + "loss": 2.7352, + "step": 5 + }, + { + "epoch": 0.0006128702757916241, + "grad_norm": 26.199640845074015, + "learning_rate": 4.0816326530612243e-07, + "loss": 2.4032, + "step": 6 + }, + { + "epoch": 0.0007150153217568948, + "grad_norm": 25.4498992514559, + "learning_rate": 4.7619047619047623e-07, + "loss": 2.2179, + "step": 7 + }, + { + "epoch": 0.0008171603677221655, + "grad_norm": 26.10129625751608, + "learning_rate": 5.4421768707483e-07, + "loss": 2.23, + "step": 8 + }, + { + "epoch": 0.0009193054136874361, + "grad_norm": 27.916045826055516, + "learning_rate": 6.122448979591837e-07, + "loss": 2.4002, + "step": 9 + }, + { + "epoch": 0.0010214504596527069, + "grad_norm": 22.706817758677374, + "learning_rate": 6.802721088435376e-07, + "loss": 2.2149, + "step": 10 + }, + { + "epoch": 0.0011235955056179776, + "grad_norm": 25.25466775253154, + "learning_rate": 7.482993197278913e-07, + "loss": 2.4988, + "step": 11 + }, + { + "epoch": 0.0012257405515832482, + "grad_norm": 16.502357021610447, + "learning_rate": 8.163265306122449e-07, + "loss": 1.9654, + "step": 12 + }, + { + "epoch": 0.001327885597548519, + "grad_norm": 21.93641937168555, + "learning_rate": 8.843537414965988e-07, + "loss": 2.3907, + "step": 13 + }, + { + "epoch": 0.0014300306435137897, + "grad_norm": 18.19832506133001, + "learning_rate": 9.523809523809525e-07, + "loss": 2.06, + "step": 14 + }, + { + "epoch": 0.0015321756894790602, + "grad_norm": 15.799339815975372, + "learning_rate": 1.0204081632653063e-06, + "loss": 1.9616, + "step": 15 + }, + { + "epoch": 0.001634320735444331, + "grad_norm": 19.08729217369872, + "learning_rate": 1.08843537414966e-06, + "loss": 2.02, + "step": 16 + }, + { + "epoch": 0.0017364657814096017, + "grad_norm": 17.409340578425272, + "learning_rate": 1.1564625850340136e-06, + "loss": 1.8118, + "step": 17 + }, + { + "epoch": 0.0018386108273748722, + "grad_norm": 22.852017868556352, + "learning_rate": 1.2244897959183673e-06, + "loss": 2.1514, + "step": 18 + }, + { + "epoch": 0.001940755873340143, + "grad_norm": 21.746796575398797, + "learning_rate": 1.2925170068027212e-06, + "loss": 2.1073, + "step": 19 + }, + { + "epoch": 0.0020429009193054137, + "grad_norm": 21.970393666076028, + "learning_rate": 1.3605442176870751e-06, + "loss": 2.0132, + "step": 20 + }, + { + "epoch": 0.0021450459652706845, + "grad_norm": 18.47075930471974, + "learning_rate": 1.4285714285714286e-06, + "loss": 1.9016, + "step": 21 + }, + { + "epoch": 0.0022471910112359553, + "grad_norm": 19.13191802744562, + "learning_rate": 1.4965986394557825e-06, + "loss": 1.8869, + "step": 22 + }, + { + "epoch": 0.0023493360572012256, + "grad_norm": 20.980926233635074, + "learning_rate": 1.5646258503401362e-06, + "loss": 1.8411, + "step": 23 + }, + { + "epoch": 0.0024514811031664963, + "grad_norm": 14.495264246481026, + "learning_rate": 1.6326530612244897e-06, + "loss": 1.5245, + "step": 24 + }, + { + "epoch": 0.002553626149131767, + "grad_norm": 13.249080816156338, + "learning_rate": 1.7006802721088436e-06, + "loss": 1.5568, + "step": 25 + }, + { + "epoch": 0.002655771195097038, + "grad_norm": 11.986619676400249, + "learning_rate": 1.7687074829931975e-06, + "loss": 1.4914, + "step": 26 + }, + { + "epoch": 0.0027579162410623086, + "grad_norm": 15.893350275113658, + "learning_rate": 1.8367346938775512e-06, + "loss": 1.3491, + "step": 27 + }, + { + "epoch": 0.0028600612870275793, + "grad_norm": 12.389233496652443, + "learning_rate": 1.904761904761905e-06, + "loss": 1.1885, + "step": 28 + }, + { + "epoch": 0.0029622063329928497, + "grad_norm": 11.068388300477746, + "learning_rate": 1.9727891156462586e-06, + "loss": 1.2589, + "step": 29 + }, + { + "epoch": 0.0030643513789581204, + "grad_norm": 15.458274294530403, + "learning_rate": 2.0408163265306125e-06, + "loss": 1.098, + "step": 30 + }, + { + "epoch": 0.003166496424923391, + "grad_norm": 9.893105406446107, + "learning_rate": 2.1088435374149664e-06, + "loss": 1.1345, + "step": 31 + }, + { + "epoch": 0.003268641470888662, + "grad_norm": 10.238031953467008, + "learning_rate": 2.17687074829932e-06, + "loss": 0.9939, + "step": 32 + }, + { + "epoch": 0.0033707865168539327, + "grad_norm": 7.177534050314678, + "learning_rate": 2.244897959183674e-06, + "loss": 0.9789, + "step": 33 + }, + { + "epoch": 0.0034729315628192034, + "grad_norm": 3.6925445053829624, + "learning_rate": 2.3129251700680273e-06, + "loss": 0.8936, + "step": 34 + }, + { + "epoch": 0.003575076608784474, + "grad_norm": 2.1311857004428254, + "learning_rate": 2.380952380952381e-06, + "loss": 0.8226, + "step": 35 + }, + { + "epoch": 0.0036772216547497445, + "grad_norm": 2.114237694447485, + "learning_rate": 2.4489795918367347e-06, + "loss": 0.8576, + "step": 36 + }, + { + "epoch": 0.0037793667007150152, + "grad_norm": 1.7798161938458779, + "learning_rate": 2.5170068027210886e-06, + "loss": 0.7929, + "step": 37 + }, + { + "epoch": 0.003881511746680286, + "grad_norm": 2.1284386403464612, + "learning_rate": 2.5850340136054425e-06, + "loss": 0.8067, + "step": 38 + }, + { + "epoch": 0.003983656792645556, + "grad_norm": 2.051715887744448, + "learning_rate": 2.6530612244897964e-06, + "loss": 0.8059, + "step": 39 + }, + { + "epoch": 0.0040858018386108275, + "grad_norm": 1.9090586734108757, + "learning_rate": 2.7210884353741503e-06, + "loss": 0.781, + "step": 40 + }, + { + "epoch": 0.004187946884576098, + "grad_norm": 1.930879616738226, + "learning_rate": 2.7891156462585034e-06, + "loss": 0.8795, + "step": 41 + }, + { + "epoch": 0.004290091930541369, + "grad_norm": 1.7311599953935373, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.783, + "step": 42 + }, + { + "epoch": 0.004392236976506639, + "grad_norm": 1.7921573751754891, + "learning_rate": 2.925170068027211e-06, + "loss": 0.8301, + "step": 43 + }, + { + "epoch": 0.0044943820224719105, + "grad_norm": 1.8516372555657379, + "learning_rate": 2.993197278911565e-06, + "loss": 0.8177, + "step": 44 + }, + { + "epoch": 0.004596527068437181, + "grad_norm": 1.8180026983940232, + "learning_rate": 3.0612244897959185e-06, + "loss": 0.8082, + "step": 45 + }, + { + "epoch": 0.004698672114402451, + "grad_norm": 1.787553864501069, + "learning_rate": 3.1292517006802725e-06, + "loss": 0.8534, + "step": 46 + }, + { + "epoch": 0.004800817160367722, + "grad_norm": 1.7674821523573645, + "learning_rate": 3.1972789115646264e-06, + "loss": 0.7113, + "step": 47 + }, + { + "epoch": 0.004902962206332993, + "grad_norm": 1.7673234675114342, + "learning_rate": 3.2653061224489794e-06, + "loss": 0.8221, + "step": 48 + }, + { + "epoch": 0.005005107252298264, + "grad_norm": 1.853059425454463, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.7571, + "step": 49 + }, + { + "epoch": 0.005107252298263534, + "grad_norm": 1.8825858830560922, + "learning_rate": 3.4013605442176872e-06, + "loss": 0.7544, + "step": 50 + }, + { + "epoch": 0.0052093973442288045, + "grad_norm": 1.5495490747161171, + "learning_rate": 3.469387755102041e-06, + "loss": 0.7932, + "step": 51 + }, + { + "epoch": 0.005311542390194076, + "grad_norm": 1.6914705556391993, + "learning_rate": 3.537414965986395e-06, + "loss": 0.769, + "step": 52 + }, + { + "epoch": 0.005413687436159346, + "grad_norm": 1.5809579323730074, + "learning_rate": 3.6054421768707485e-06, + "loss": 0.7488, + "step": 53 + }, + { + "epoch": 0.005515832482124617, + "grad_norm": 1.6351397266977088, + "learning_rate": 3.6734693877551024e-06, + "loss": 0.7445, + "step": 54 + }, + { + "epoch": 0.0056179775280898875, + "grad_norm": 1.7543673403299578, + "learning_rate": 3.7414965986394563e-06, + "loss": 0.8146, + "step": 55 + }, + { + "epoch": 0.005720122574055159, + "grad_norm": 1.6393310890394845, + "learning_rate": 3.80952380952381e-06, + "loss": 0.7224, + "step": 56 + }, + { + "epoch": 0.005822267620020429, + "grad_norm": 1.6193877525173186, + "learning_rate": 3.877551020408164e-06, + "loss": 0.8524, + "step": 57 + }, + { + "epoch": 0.005924412665985699, + "grad_norm": 1.6556719265796729, + "learning_rate": 3.945578231292517e-06, + "loss": 0.7241, + "step": 58 + }, + { + "epoch": 0.0060265577119509705, + "grad_norm": 1.7118024761672823, + "learning_rate": 4.013605442176871e-06, + "loss": 0.8001, + "step": 59 + }, + { + "epoch": 0.006128702757916241, + "grad_norm": 1.7056909426326199, + "learning_rate": 4.081632653061225e-06, + "loss": 0.8776, + "step": 60 + }, + { + "epoch": 0.006230847803881512, + "grad_norm": 1.7769958621411681, + "learning_rate": 4.1496598639455785e-06, + "loss": 0.7095, + "step": 61 + }, + { + "epoch": 0.006332992849846782, + "grad_norm": 1.6667081033813227, + "learning_rate": 4.217687074829933e-06, + "loss": 0.825, + "step": 62 + }, + { + "epoch": 0.0064351378958120535, + "grad_norm": 1.7396099308185704, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.8232, + "step": 63 + }, + { + "epoch": 0.006537282941777324, + "grad_norm": 1.5707682172154676, + "learning_rate": 4.35374149659864e-06, + "loss": 0.6576, + "step": 64 + }, + { + "epoch": 0.006639427987742594, + "grad_norm": 1.6418845282111552, + "learning_rate": 4.421768707482993e-06, + "loss": 0.7043, + "step": 65 + }, + { + "epoch": 0.006741573033707865, + "grad_norm": 1.7075199999313757, + "learning_rate": 4.489795918367348e-06, + "loss": 0.7945, + "step": 66 + }, + { + "epoch": 0.006843718079673136, + "grad_norm": 1.8398294781739963, + "learning_rate": 4.557823129251701e-06, + "loss": 0.787, + "step": 67 + }, + { + "epoch": 0.006945863125638407, + "grad_norm": 1.5084581476623506, + "learning_rate": 4.6258503401360546e-06, + "loss": 0.7204, + "step": 68 + }, + { + "epoch": 0.007048008171603677, + "grad_norm": 1.7077426782576781, + "learning_rate": 4.693877551020409e-06, + "loss": 0.7126, + "step": 69 + }, + { + "epoch": 0.007150153217568948, + "grad_norm": 1.7319515911963492, + "learning_rate": 4.761904761904762e-06, + "loss": 0.774, + "step": 70 + }, + { + "epoch": 0.007252298263534219, + "grad_norm": 1.6110605030554606, + "learning_rate": 4.829931972789116e-06, + "loss": 0.6792, + "step": 71 + }, + { + "epoch": 0.007354443309499489, + "grad_norm": 1.6872445859245582, + "learning_rate": 4.897959183673469e-06, + "loss": 0.6961, + "step": 72 + }, + { + "epoch": 0.00745658835546476, + "grad_norm": 1.8419337432316705, + "learning_rate": 4.965986394557824e-06, + "loss": 0.6634, + "step": 73 + }, + { + "epoch": 0.0075587334014300305, + "grad_norm": 1.6798481460683667, + "learning_rate": 5.034013605442177e-06, + "loss": 0.7768, + "step": 74 + }, + { + "epoch": 0.007660878447395302, + "grad_norm": 1.6100273438676207, + "learning_rate": 5.1020408163265315e-06, + "loss": 0.6997, + "step": 75 + }, + { + "epoch": 0.007763023493360572, + "grad_norm": 1.71066206035823, + "learning_rate": 5.170068027210885e-06, + "loss": 0.692, + "step": 76 + }, + { + "epoch": 0.007865168539325843, + "grad_norm": 1.8157128315419326, + "learning_rate": 5.2380952380952384e-06, + "loss": 0.7918, + "step": 77 + }, + { + "epoch": 0.007967313585291113, + "grad_norm": 1.7095248307867774, + "learning_rate": 5.306122448979593e-06, + "loss": 0.7059, + "step": 78 + }, + { + "epoch": 0.008069458631256384, + "grad_norm": 1.7370650727490233, + "learning_rate": 5.374149659863946e-06, + "loss": 0.6581, + "step": 79 + }, + { + "epoch": 0.008171603677221655, + "grad_norm": 1.5022918246899075, + "learning_rate": 5.442176870748301e-06, + "loss": 0.7065, + "step": 80 + }, + { + "epoch": 0.008273748723186926, + "grad_norm": 1.6772017183621166, + "learning_rate": 5.510204081632653e-06, + "loss": 0.6651, + "step": 81 + }, + { + "epoch": 0.008375893769152196, + "grad_norm": 1.61774747933001, + "learning_rate": 5.578231292517007e-06, + "loss": 0.7368, + "step": 82 + }, + { + "epoch": 0.008478038815117467, + "grad_norm": 1.6658071780723769, + "learning_rate": 5.646258503401361e-06, + "loss": 0.7258, + "step": 83 + }, + { + "epoch": 0.008580183861082738, + "grad_norm": 1.669071710605808, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.7969, + "step": 84 + }, + { + "epoch": 0.008682328907048007, + "grad_norm": 1.8579457330459581, + "learning_rate": 5.782312925170068e-06, + "loss": 0.6907, + "step": 85 + }, + { + "epoch": 0.008784473953013279, + "grad_norm": 1.7176272414291363, + "learning_rate": 5.850340136054422e-06, + "loss": 0.6941, + "step": 86 + }, + { + "epoch": 0.00888661899897855, + "grad_norm": 1.7660467367038595, + "learning_rate": 5.918367346938776e-06, + "loss": 0.7132, + "step": 87 + }, + { + "epoch": 0.008988764044943821, + "grad_norm": 1.638935033482523, + "learning_rate": 5.98639455782313e-06, + "loss": 0.7117, + "step": 88 + }, + { + "epoch": 0.00909090909090909, + "grad_norm": 1.7943910830705998, + "learning_rate": 6.054421768707484e-06, + "loss": 0.8004, + "step": 89 + }, + { + "epoch": 0.009193054136874362, + "grad_norm": 1.858461886121563, + "learning_rate": 6.122448979591837e-06, + "loss": 0.7581, + "step": 90 + }, + { + "epoch": 0.009295199182839633, + "grad_norm": 1.5113421439996582, + "learning_rate": 6.1904761904761914e-06, + "loss": 0.7104, + "step": 91 + }, + { + "epoch": 0.009397344228804902, + "grad_norm": 1.7763599753453199, + "learning_rate": 6.258503401360545e-06, + "loss": 0.8159, + "step": 92 + }, + { + "epoch": 0.009499489274770173, + "grad_norm": 1.5799451481356066, + "learning_rate": 6.326530612244899e-06, + "loss": 0.771, + "step": 93 + }, + { + "epoch": 0.009601634320735445, + "grad_norm": 1.6975121060835323, + "learning_rate": 6.394557823129253e-06, + "loss": 0.7183, + "step": 94 + }, + { + "epoch": 0.009703779366700716, + "grad_norm": 1.645376004221033, + "learning_rate": 6.462585034013606e-06, + "loss": 0.773, + "step": 95 + }, + { + "epoch": 0.009805924412665985, + "grad_norm": 1.7624371076071546, + "learning_rate": 6.530612244897959e-06, + "loss": 0.7485, + "step": 96 + }, + { + "epoch": 0.009908069458631256, + "grad_norm": 1.9495126468948434, + "learning_rate": 6.598639455782313e-06, + "loss": 0.7994, + "step": 97 + }, + { + "epoch": 0.010010214504596528, + "grad_norm": 1.598074382630312, + "learning_rate": 6.666666666666667e-06, + "loss": 0.6719, + "step": 98 + }, + { + "epoch": 0.010112359550561797, + "grad_norm": 1.630640534815859, + "learning_rate": 6.734693877551021e-06, + "loss": 0.7443, + "step": 99 + }, + { + "epoch": 0.010214504596527068, + "grad_norm": 1.675352230442309, + "learning_rate": 6.8027210884353745e-06, + "loss": 0.7456, + "step": 100 + }, + { + "epoch": 0.01031664964249234, + "grad_norm": 1.826682592735814, + "learning_rate": 6.870748299319728e-06, + "loss": 0.6965, + "step": 101 + }, + { + "epoch": 0.010418794688457609, + "grad_norm": 1.8860094246191392, + "learning_rate": 6.938775510204082e-06, + "loss": 0.7326, + "step": 102 + }, + { + "epoch": 0.01052093973442288, + "grad_norm": 1.730954945401936, + "learning_rate": 7.006802721088436e-06, + "loss": 0.7144, + "step": 103 + }, + { + "epoch": 0.010623084780388151, + "grad_norm": 1.5375316324963417, + "learning_rate": 7.07482993197279e-06, + "loss": 0.7126, + "step": 104 + }, + { + "epoch": 0.010725229826353423, + "grad_norm": 1.7738652321511554, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.7301, + "step": 105 + }, + { + "epoch": 0.010827374872318692, + "grad_norm": 1.8410963440617436, + "learning_rate": 7.210884353741497e-06, + "loss": 0.741, + "step": 106 + }, + { + "epoch": 0.010929519918283963, + "grad_norm": 1.798370972457453, + "learning_rate": 7.278911564625851e-06, + "loss": 0.6855, + "step": 107 + }, + { + "epoch": 0.011031664964249234, + "grad_norm": 1.6166116878132524, + "learning_rate": 7.346938775510205e-06, + "loss": 0.7772, + "step": 108 + }, + { + "epoch": 0.011133810010214504, + "grad_norm": 1.9580684840168145, + "learning_rate": 7.414965986394559e-06, + "loss": 0.7244, + "step": 109 + }, + { + "epoch": 0.011235955056179775, + "grad_norm": 1.7065434743561958, + "learning_rate": 7.482993197278913e-06, + "loss": 0.7254, + "step": 110 + }, + { + "epoch": 0.011338100102145046, + "grad_norm": 1.7525889212294594, + "learning_rate": 7.551020408163265e-06, + "loss": 0.8227, + "step": 111 + }, + { + "epoch": 0.011440245148110317, + "grad_norm": 1.9076416714464244, + "learning_rate": 7.61904761904762e-06, + "loss": 0.6655, + "step": 112 + }, + { + "epoch": 0.011542390194075587, + "grad_norm": 1.631377216930173, + "learning_rate": 7.687074829931972e-06, + "loss": 0.734, + "step": 113 + }, + { + "epoch": 0.011644535240040858, + "grad_norm": 1.7763009147340159, + "learning_rate": 7.755102040816327e-06, + "loss": 0.6994, + "step": 114 + }, + { + "epoch": 0.01174668028600613, + "grad_norm": 1.746756564467626, + "learning_rate": 7.823129251700681e-06, + "loss": 0.7995, + "step": 115 + }, + { + "epoch": 0.011848825331971399, + "grad_norm": 1.6623568361016479, + "learning_rate": 7.891156462585034e-06, + "loss": 0.6245, + "step": 116 + }, + { + "epoch": 0.01195097037793667, + "grad_norm": 1.6868704683946245, + "learning_rate": 7.959183673469388e-06, + "loss": 0.7797, + "step": 117 + }, + { + "epoch": 0.012053115423901941, + "grad_norm": 1.5387174275978197, + "learning_rate": 8.027210884353741e-06, + "loss": 0.623, + "step": 118 + }, + { + "epoch": 0.012155260469867212, + "grad_norm": 1.6130834721838176, + "learning_rate": 8.095238095238097e-06, + "loss": 0.6125, + "step": 119 + }, + { + "epoch": 0.012257405515832482, + "grad_norm": 1.591501857340208, + "learning_rate": 8.16326530612245e-06, + "loss": 0.8235, + "step": 120 + }, + { + "epoch": 0.012359550561797753, + "grad_norm": 1.753758701962058, + "learning_rate": 8.231292517006804e-06, + "loss": 0.7735, + "step": 121 + }, + { + "epoch": 0.012461695607763024, + "grad_norm": 1.4108382782919635, + "learning_rate": 8.299319727891157e-06, + "loss": 0.6539, + "step": 122 + }, + { + "epoch": 0.012563840653728293, + "grad_norm": 1.7317979549539477, + "learning_rate": 8.36734693877551e-06, + "loss": 0.7092, + "step": 123 + }, + { + "epoch": 0.012665985699693565, + "grad_norm": 1.7272447316706576, + "learning_rate": 8.435374149659866e-06, + "loss": 0.6365, + "step": 124 + }, + { + "epoch": 0.012768130745658836, + "grad_norm": 1.8271664185653176, + "learning_rate": 8.503401360544217e-06, + "loss": 0.8789, + "step": 125 + }, + { + "epoch": 0.012870275791624107, + "grad_norm": 1.7663221688313868, + "learning_rate": 8.571428571428571e-06, + "loss": 0.673, + "step": 126 + }, + { + "epoch": 0.012972420837589376, + "grad_norm": 1.5353243105307623, + "learning_rate": 8.639455782312926e-06, + "loss": 0.6768, + "step": 127 + }, + { + "epoch": 0.013074565883554648, + "grad_norm": 1.6616810202667323, + "learning_rate": 8.70748299319728e-06, + "loss": 0.8092, + "step": 128 + }, + { + "epoch": 0.013176710929519919, + "grad_norm": 1.575094304505959, + "learning_rate": 8.775510204081633e-06, + "loss": 0.7702, + "step": 129 + }, + { + "epoch": 0.013278855975485188, + "grad_norm": 1.782638617469749, + "learning_rate": 8.843537414965987e-06, + "loss": 0.7127, + "step": 130 + }, + { + "epoch": 0.01338100102145046, + "grad_norm": 1.6846980944597323, + "learning_rate": 8.91156462585034e-06, + "loss": 0.7413, + "step": 131 + }, + { + "epoch": 0.01348314606741573, + "grad_norm": 1.72903262346004, + "learning_rate": 8.979591836734695e-06, + "loss": 0.6658, + "step": 132 + }, + { + "epoch": 0.013585291113381002, + "grad_norm": 1.8581330047604752, + "learning_rate": 9.047619047619049e-06, + "loss": 0.7698, + "step": 133 + }, + { + "epoch": 0.013687436159346271, + "grad_norm": 1.6775962447900343, + "learning_rate": 9.115646258503402e-06, + "loss": 0.7782, + "step": 134 + }, + { + "epoch": 0.013789581205311542, + "grad_norm": 1.782752664617114, + "learning_rate": 9.183673469387756e-06, + "loss": 0.8082, + "step": 135 + }, + { + "epoch": 0.013891726251276814, + "grad_norm": 1.668617754573196, + "learning_rate": 9.251700680272109e-06, + "loss": 0.6672, + "step": 136 + }, + { + "epoch": 0.013993871297242083, + "grad_norm": 1.6118861990864957, + "learning_rate": 9.319727891156464e-06, + "loss": 0.7963, + "step": 137 + }, + { + "epoch": 0.014096016343207354, + "grad_norm": 1.6761421515526789, + "learning_rate": 9.387755102040818e-06, + "loss": 0.7422, + "step": 138 + }, + { + "epoch": 0.014198161389172625, + "grad_norm": 1.726881522958956, + "learning_rate": 9.455782312925171e-06, + "loss": 0.6591, + "step": 139 + }, + { + "epoch": 0.014300306435137897, + "grad_norm": 1.8151721433036694, + "learning_rate": 9.523809523809525e-06, + "loss": 0.81, + "step": 140 + }, + { + "epoch": 0.014402451481103166, + "grad_norm": 1.7914170589563632, + "learning_rate": 9.591836734693878e-06, + "loss": 0.7645, + "step": 141 + }, + { + "epoch": 0.014504596527068437, + "grad_norm": 1.6518787386343419, + "learning_rate": 9.659863945578232e-06, + "loss": 0.7456, + "step": 142 + }, + { + "epoch": 0.014606741573033709, + "grad_norm": 1.659105613563791, + "learning_rate": 9.727891156462585e-06, + "loss": 0.832, + "step": 143 + }, + { + "epoch": 0.014708886618998978, + "grad_norm": 1.6376917775876176, + "learning_rate": 9.795918367346939e-06, + "loss": 0.6964, + "step": 144 + }, + { + "epoch": 0.01481103166496425, + "grad_norm": 1.7268140730240362, + "learning_rate": 9.863945578231294e-06, + "loss": 0.7475, + "step": 145 + }, + { + "epoch": 0.01491317671092952, + "grad_norm": 1.6995295154953174, + "learning_rate": 9.931972789115647e-06, + "loss": 0.7166, + "step": 146 + }, + { + "epoch": 0.01501532175689479, + "grad_norm": 1.5709403314483485, + "learning_rate": 1e-05, + "loss": 0.7314, + "step": 147 + }, + { + "epoch": 0.015117466802860061, + "grad_norm": 1.720495364028145, + "learning_rate": 1.0068027210884354e-05, + "loss": 0.7551, + "step": 148 + }, + { + "epoch": 0.015219611848825332, + "grad_norm": 1.819951190412068, + "learning_rate": 1.0136054421768708e-05, + "loss": 0.7591, + "step": 149 + }, + { + "epoch": 0.015321756894790603, + "grad_norm": 1.6213023995255083, + "learning_rate": 1.0204081632653063e-05, + "loss": 0.6375, + "step": 150 + }, + { + "epoch": 0.015423901940755873, + "grad_norm": 1.8198256408141487, + "learning_rate": 1.0272108843537416e-05, + "loss": 0.8037, + "step": 151 + }, + { + "epoch": 0.015526046986721144, + "grad_norm": 1.717390105568949, + "learning_rate": 1.034013605442177e-05, + "loss": 0.7034, + "step": 152 + }, + { + "epoch": 0.015628192032686415, + "grad_norm": 1.7525426165424476, + "learning_rate": 1.0408163265306123e-05, + "loss": 0.8328, + "step": 153 + }, + { + "epoch": 0.015730337078651686, + "grad_norm": 2.018821327836352, + "learning_rate": 1.0476190476190477e-05, + "loss": 0.8377, + "step": 154 + }, + { + "epoch": 0.015832482124616958, + "grad_norm": 1.9651480001068806, + "learning_rate": 1.0544217687074832e-05, + "loss": 0.7725, + "step": 155 + }, + { + "epoch": 0.015934627170582225, + "grad_norm": 1.7303060951319966, + "learning_rate": 1.0612244897959186e-05, + "loss": 0.6728, + "step": 156 + }, + { + "epoch": 0.016036772216547496, + "grad_norm": 1.832914627252907, + "learning_rate": 1.0680272108843539e-05, + "loss": 0.7327, + "step": 157 + }, + { + "epoch": 0.016138917262512768, + "grad_norm": 1.9952743671628559, + "learning_rate": 1.0748299319727893e-05, + "loss": 0.7062, + "step": 158 + }, + { + "epoch": 0.01624106230847804, + "grad_norm": 1.6901300303721887, + "learning_rate": 1.0816326530612246e-05, + "loss": 0.7215, + "step": 159 + }, + { + "epoch": 0.01634320735444331, + "grad_norm": 1.6882719908313557, + "learning_rate": 1.0884353741496601e-05, + "loss": 0.6899, + "step": 160 + }, + { + "epoch": 0.01644535240040858, + "grad_norm": 1.7126975472492683, + "learning_rate": 1.0952380952380955e-05, + "loss": 0.7668, + "step": 161 + }, + { + "epoch": 0.016547497446373852, + "grad_norm": 1.860833048235229, + "learning_rate": 1.1020408163265306e-05, + "loss": 0.7757, + "step": 162 + }, + { + "epoch": 0.01664964249233912, + "grad_norm": 1.6691646075823658, + "learning_rate": 1.108843537414966e-05, + "loss": 0.6983, + "step": 163 + }, + { + "epoch": 0.01675178753830439, + "grad_norm": 1.6710931737332289, + "learning_rate": 1.1156462585034013e-05, + "loss": 0.703, + "step": 164 + }, + { + "epoch": 0.016853932584269662, + "grad_norm": 1.6028278372345446, + "learning_rate": 1.1224489795918367e-05, + "loss": 0.7139, + "step": 165 + }, + { + "epoch": 0.016956077630234934, + "grad_norm": 1.6871631323522613, + "learning_rate": 1.1292517006802722e-05, + "loss": 0.6787, + "step": 166 + }, + { + "epoch": 0.017058222676200205, + "grad_norm": 1.5901260536870798, + "learning_rate": 1.1360544217687076e-05, + "loss": 0.6613, + "step": 167 + }, + { + "epoch": 0.017160367722165476, + "grad_norm": 1.6000584814482617, + "learning_rate": 1.1428571428571429e-05, + "loss": 0.7295, + "step": 168 + }, + { + "epoch": 0.017262512768130747, + "grad_norm": 1.7022611355821193, + "learning_rate": 1.1496598639455783e-05, + "loss": 0.7237, + "step": 169 + }, + { + "epoch": 0.017364657814096015, + "grad_norm": 1.8639312676359368, + "learning_rate": 1.1564625850340136e-05, + "loss": 0.8106, + "step": 170 + }, + { + "epoch": 0.017466802860061286, + "grad_norm": 1.9157902018907373, + "learning_rate": 1.1632653061224491e-05, + "loss": 0.8267, + "step": 171 + }, + { + "epoch": 0.017568947906026557, + "grad_norm": 1.785882858974327, + "learning_rate": 1.1700680272108845e-05, + "loss": 0.8219, + "step": 172 + }, + { + "epoch": 0.01767109295199183, + "grad_norm": 1.7530962357015458, + "learning_rate": 1.1768707482993198e-05, + "loss": 0.7005, + "step": 173 + }, + { + "epoch": 0.0177732379979571, + "grad_norm": 1.8202815347760355, + "learning_rate": 1.1836734693877552e-05, + "loss": 0.6966, + "step": 174 + }, + { + "epoch": 0.01787538304392237, + "grad_norm": 1.5678831440125445, + "learning_rate": 1.1904761904761905e-05, + "loss": 0.7865, + "step": 175 + }, + { + "epoch": 0.017977528089887642, + "grad_norm": 1.6051735570217414, + "learning_rate": 1.197278911564626e-05, + "loss": 0.8575, + "step": 176 + }, + { + "epoch": 0.01807967313585291, + "grad_norm": 1.6369558411506475, + "learning_rate": 1.2040816326530614e-05, + "loss": 0.674, + "step": 177 + }, + { + "epoch": 0.01818181818181818, + "grad_norm": 1.8105242715857925, + "learning_rate": 1.2108843537414967e-05, + "loss": 0.6989, + "step": 178 + }, + { + "epoch": 0.018283963227783452, + "grad_norm": 1.6483872674217979, + "learning_rate": 1.217687074829932e-05, + "loss": 0.7412, + "step": 179 + }, + { + "epoch": 0.018386108273748723, + "grad_norm": 1.4853293043136457, + "learning_rate": 1.2244897959183674e-05, + "loss": 0.6996, + "step": 180 + }, + { + "epoch": 0.018488253319713994, + "grad_norm": 1.6830465494255156, + "learning_rate": 1.231292517006803e-05, + "loss": 0.7431, + "step": 181 + }, + { + "epoch": 0.018590398365679266, + "grad_norm": 1.6223540718110974, + "learning_rate": 1.2380952380952383e-05, + "loss": 0.6751, + "step": 182 + }, + { + "epoch": 0.018692543411644537, + "grad_norm": 1.7860008803194065, + "learning_rate": 1.2448979591836736e-05, + "loss": 0.8121, + "step": 183 + }, + { + "epoch": 0.018794688457609805, + "grad_norm": 1.7770504613473805, + "learning_rate": 1.251700680272109e-05, + "loss": 0.7278, + "step": 184 + }, + { + "epoch": 0.018896833503575076, + "grad_norm": 1.8001236496603807, + "learning_rate": 1.2585034013605443e-05, + "loss": 0.8101, + "step": 185 + }, + { + "epoch": 0.018998978549540347, + "grad_norm": 1.7187471925775293, + "learning_rate": 1.2653061224489798e-05, + "loss": 0.6679, + "step": 186 + }, + { + "epoch": 0.019101123595505618, + "grad_norm": 1.7432579851728531, + "learning_rate": 1.2721088435374152e-05, + "loss": 0.7274, + "step": 187 + }, + { + "epoch": 0.01920326864147089, + "grad_norm": 1.7177537817958337, + "learning_rate": 1.2789115646258505e-05, + "loss": 0.7787, + "step": 188 + }, + { + "epoch": 0.01930541368743616, + "grad_norm": 1.6834932053378133, + "learning_rate": 1.2857142857142859e-05, + "loss": 0.6854, + "step": 189 + }, + { + "epoch": 0.01940755873340143, + "grad_norm": 1.949245504046552, + "learning_rate": 1.2925170068027212e-05, + "loss": 0.8335, + "step": 190 + }, + { + "epoch": 0.0195097037793667, + "grad_norm": 1.7420522713597348, + "learning_rate": 1.2993197278911568e-05, + "loss": 0.696, + "step": 191 + }, + { + "epoch": 0.01961184882533197, + "grad_norm": 1.7722594170393748, + "learning_rate": 1.3061224489795918e-05, + "loss": 0.7023, + "step": 192 + }, + { + "epoch": 0.019713993871297242, + "grad_norm": 1.632257598748308, + "learning_rate": 1.3129251700680273e-05, + "loss": 0.7893, + "step": 193 + }, + { + "epoch": 0.019816138917262513, + "grad_norm": 1.7162977302301121, + "learning_rate": 1.3197278911564626e-05, + "loss": 0.7816, + "step": 194 + }, + { + "epoch": 0.019918283963227784, + "grad_norm": 1.6428808771496362, + "learning_rate": 1.326530612244898e-05, + "loss": 0.6069, + "step": 195 + }, + { + "epoch": 0.020020429009193055, + "grad_norm": 1.7608765496826089, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.8061, + "step": 196 + }, + { + "epoch": 0.020122574055158327, + "grad_norm": 1.6945246408232142, + "learning_rate": 1.3401360544217687e-05, + "loss": 0.6908, + "step": 197 + }, + { + "epoch": 0.020224719101123594, + "grad_norm": 1.6245687218591212, + "learning_rate": 1.3469387755102042e-05, + "loss": 0.6128, + "step": 198 + }, + { + "epoch": 0.020326864147088865, + "grad_norm": 1.5964224806883631, + "learning_rate": 1.3537414965986395e-05, + "loss": 0.6373, + "step": 199 + }, + { + "epoch": 0.020429009193054137, + "grad_norm": 1.623491593147068, + "learning_rate": 1.3605442176870749e-05, + "loss": 0.7421, + "step": 200 + }, + { + "epoch": 0.020531154239019408, + "grad_norm": 1.5199753628257748, + "learning_rate": 1.3673469387755102e-05, + "loss": 0.5401, + "step": 201 + }, + { + "epoch": 0.02063329928498468, + "grad_norm": 1.6903192779010237, + "learning_rate": 1.3741496598639456e-05, + "loss": 0.7139, + "step": 202 + }, + { + "epoch": 0.02073544433094995, + "grad_norm": 1.6703009895863918, + "learning_rate": 1.3809523809523811e-05, + "loss": 0.805, + "step": 203 + }, + { + "epoch": 0.020837589376915218, + "grad_norm": 1.6561649172999005, + "learning_rate": 1.3877551020408165e-05, + "loss": 0.6521, + "step": 204 + }, + { + "epoch": 0.02093973442288049, + "grad_norm": 1.6374563688361488, + "learning_rate": 1.3945578231292518e-05, + "loss": 0.7981, + "step": 205 + }, + { + "epoch": 0.02104187946884576, + "grad_norm": 1.6011019389196248, + "learning_rate": 1.4013605442176872e-05, + "loss": 0.6574, + "step": 206 + }, + { + "epoch": 0.02114402451481103, + "grad_norm": 1.7318980429273745, + "learning_rate": 1.4081632653061225e-05, + "loss": 0.7204, + "step": 207 + }, + { + "epoch": 0.021246169560776303, + "grad_norm": 1.667116251107914, + "learning_rate": 1.414965986394558e-05, + "loss": 0.7088, + "step": 208 + }, + { + "epoch": 0.021348314606741574, + "grad_norm": 1.7146530136245697, + "learning_rate": 1.4217687074829934e-05, + "loss": 0.7269, + "step": 209 + }, + { + "epoch": 0.021450459652706845, + "grad_norm": 1.7339521332968388, + "learning_rate": 1.4285714285714287e-05, + "loss": 0.5921, + "step": 210 + }, + { + "epoch": 0.021552604698672113, + "grad_norm": 1.7307420670093898, + "learning_rate": 1.435374149659864e-05, + "loss": 0.6899, + "step": 211 + }, + { + "epoch": 0.021654749744637384, + "grad_norm": 1.6792597753169058, + "learning_rate": 1.4421768707482994e-05, + "loss": 0.7691, + "step": 212 + }, + { + "epoch": 0.021756894790602655, + "grad_norm": 1.69983704609484, + "learning_rate": 1.448979591836735e-05, + "loss": 0.6724, + "step": 213 + }, + { + "epoch": 0.021859039836567926, + "grad_norm": 1.6834153652082477, + "learning_rate": 1.4557823129251703e-05, + "loss": 0.7277, + "step": 214 + }, + { + "epoch": 0.021961184882533197, + "grad_norm": 1.6887632288306307, + "learning_rate": 1.4625850340136056e-05, + "loss": 0.737, + "step": 215 + }, + { + "epoch": 0.02206332992849847, + "grad_norm": 1.750575761137874, + "learning_rate": 1.469387755102041e-05, + "loss": 0.5893, + "step": 216 + }, + { + "epoch": 0.02216547497446374, + "grad_norm": 1.760104995544491, + "learning_rate": 1.4761904761904763e-05, + "loss": 0.7191, + "step": 217 + }, + { + "epoch": 0.022267620020429008, + "grad_norm": 1.6966140601261814, + "learning_rate": 1.4829931972789118e-05, + "loss": 0.8299, + "step": 218 + }, + { + "epoch": 0.02236976506639428, + "grad_norm": 1.7096511991105092, + "learning_rate": 1.4897959183673472e-05, + "loss": 0.6451, + "step": 219 + }, + { + "epoch": 0.02247191011235955, + "grad_norm": 1.7601676205959085, + "learning_rate": 1.4965986394557825e-05, + "loss": 0.7569, + "step": 220 + }, + { + "epoch": 0.02257405515832482, + "grad_norm": 1.715999457369528, + "learning_rate": 1.5034013605442177e-05, + "loss": 0.7148, + "step": 221 + }, + { + "epoch": 0.022676200204290092, + "grad_norm": 1.6585584667190563, + "learning_rate": 1.510204081632653e-05, + "loss": 0.7202, + "step": 222 + }, + { + "epoch": 0.022778345250255363, + "grad_norm": 1.656971523509079, + "learning_rate": 1.5170068027210884e-05, + "loss": 0.7582, + "step": 223 + }, + { + "epoch": 0.022880490296220635, + "grad_norm": 1.5368838757342893, + "learning_rate": 1.523809523809524e-05, + "loss": 0.7434, + "step": 224 + }, + { + "epoch": 0.022982635342185902, + "grad_norm": 1.6102753910803371, + "learning_rate": 1.530612244897959e-05, + "loss": 0.6667, + "step": 225 + }, + { + "epoch": 0.023084780388151174, + "grad_norm": 1.7496249149226775, + "learning_rate": 1.5374149659863945e-05, + "loss": 0.7434, + "step": 226 + }, + { + "epoch": 0.023186925434116445, + "grad_norm": 1.7378752250236362, + "learning_rate": 1.54421768707483e-05, + "loss": 0.8048, + "step": 227 + }, + { + "epoch": 0.023289070480081716, + "grad_norm": 1.6995440865015914, + "learning_rate": 1.5510204081632655e-05, + "loss": 0.7731, + "step": 228 + }, + { + "epoch": 0.023391215526046987, + "grad_norm": 1.6948162036390637, + "learning_rate": 1.557823129251701e-05, + "loss": 0.6862, + "step": 229 + }, + { + "epoch": 0.02349336057201226, + "grad_norm": 1.7526251091046414, + "learning_rate": 1.5646258503401362e-05, + "loss": 0.6658, + "step": 230 + }, + { + "epoch": 0.02359550561797753, + "grad_norm": 1.7286133632001344, + "learning_rate": 1.5714285714285715e-05, + "loss": 0.8782, + "step": 231 + }, + { + "epoch": 0.023697650663942797, + "grad_norm": 1.6708372326114151, + "learning_rate": 1.578231292517007e-05, + "loss": 0.7649, + "step": 232 + }, + { + "epoch": 0.02379979570990807, + "grad_norm": 1.7309187424472103, + "learning_rate": 1.5850340136054422e-05, + "loss": 0.7077, + "step": 233 + }, + { + "epoch": 0.02390194075587334, + "grad_norm": 1.7207735528501127, + "learning_rate": 1.5918367346938776e-05, + "loss": 0.6373, + "step": 234 + }, + { + "epoch": 0.02400408580183861, + "grad_norm": 1.542492375282899, + "learning_rate": 1.598639455782313e-05, + "loss": 0.7017, + "step": 235 + }, + { + "epoch": 0.024106230847803882, + "grad_norm": 1.7310693731168392, + "learning_rate": 1.6054421768707483e-05, + "loss": 0.7241, + "step": 236 + }, + { + "epoch": 0.024208375893769153, + "grad_norm": 1.591240757601507, + "learning_rate": 1.612244897959184e-05, + "loss": 0.727, + "step": 237 + }, + { + "epoch": 0.024310520939734424, + "grad_norm": 1.6240608809840893, + "learning_rate": 1.6190476190476193e-05, + "loss": 0.6995, + "step": 238 + }, + { + "epoch": 0.024412665985699692, + "grad_norm": 1.6138243337460614, + "learning_rate": 1.6258503401360547e-05, + "loss": 0.7657, + "step": 239 + }, + { + "epoch": 0.024514811031664963, + "grad_norm": 1.7302630769224912, + "learning_rate": 1.63265306122449e-05, + "loss": 0.7491, + "step": 240 + }, + { + "epoch": 0.024616956077630234, + "grad_norm": 1.8695354004194862, + "learning_rate": 1.6394557823129254e-05, + "loss": 0.7828, + "step": 241 + }, + { + "epoch": 0.024719101123595506, + "grad_norm": 1.8416740418279043, + "learning_rate": 1.6462585034013607e-05, + "loss": 0.8681, + "step": 242 + }, + { + "epoch": 0.024821246169560777, + "grad_norm": 1.4945412066264474, + "learning_rate": 1.653061224489796e-05, + "loss": 0.7292, + "step": 243 + }, + { + "epoch": 0.024923391215526048, + "grad_norm": 1.4726427374583286, + "learning_rate": 1.6598639455782314e-05, + "loss": 0.6267, + "step": 244 + }, + { + "epoch": 0.02502553626149132, + "grad_norm": 1.531520904197372, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.6931, + "step": 245 + }, + { + "epoch": 0.025127681307456587, + "grad_norm": 1.6189735160692857, + "learning_rate": 1.673469387755102e-05, + "loss": 0.6459, + "step": 246 + }, + { + "epoch": 0.025229826353421858, + "grad_norm": 1.7952277677611934, + "learning_rate": 1.6802721088435378e-05, + "loss": 0.7225, + "step": 247 + }, + { + "epoch": 0.02533197139938713, + "grad_norm": 1.4750822617726922, + "learning_rate": 1.687074829931973e-05, + "loss": 0.6693, + "step": 248 + }, + { + "epoch": 0.0254341164453524, + "grad_norm": 1.5025866791924054, + "learning_rate": 1.6938775510204085e-05, + "loss": 0.7338, + "step": 249 + }, + { + "epoch": 0.02553626149131767, + "grad_norm": 1.9184730298445185, + "learning_rate": 1.7006802721088435e-05, + "loss": 0.822, + "step": 250 + }, + { + "epoch": 0.025638406537282943, + "grad_norm": 1.7640638289695652, + "learning_rate": 1.707482993197279e-05, + "loss": 0.7888, + "step": 251 + }, + { + "epoch": 0.025740551583248214, + "grad_norm": 1.6289474440466316, + "learning_rate": 1.7142857142857142e-05, + "loss": 0.645, + "step": 252 + }, + { + "epoch": 0.025842696629213482, + "grad_norm": 1.6168415775559752, + "learning_rate": 1.72108843537415e-05, + "loss": 0.8349, + "step": 253 + }, + { + "epoch": 0.025944841675178753, + "grad_norm": 1.6972890558710467, + "learning_rate": 1.7278911564625852e-05, + "loss": 0.8225, + "step": 254 + }, + { + "epoch": 0.026046986721144024, + "grad_norm": 1.7546501844753204, + "learning_rate": 1.7346938775510206e-05, + "loss": 0.793, + "step": 255 + }, + { + "epoch": 0.026149131767109295, + "grad_norm": 1.605222248842367, + "learning_rate": 1.741496598639456e-05, + "loss": 0.7618, + "step": 256 + }, + { + "epoch": 0.026251276813074566, + "grad_norm": 1.7734820728515044, + "learning_rate": 1.7482993197278913e-05, + "loss": 0.8619, + "step": 257 + }, + { + "epoch": 0.026353421859039838, + "grad_norm": 1.7856967588397747, + "learning_rate": 1.7551020408163266e-05, + "loss": 0.6808, + "step": 258 + }, + { + "epoch": 0.02645556690500511, + "grad_norm": 1.7125105641531968, + "learning_rate": 1.761904761904762e-05, + "loss": 0.7379, + "step": 259 + }, + { + "epoch": 0.026557711950970377, + "grad_norm": 1.747847044271819, + "learning_rate": 1.7687074829931973e-05, + "loss": 0.7189, + "step": 260 + }, + { + "epoch": 0.026659856996935648, + "grad_norm": 1.7306235385439581, + "learning_rate": 1.7755102040816327e-05, + "loss": 0.8412, + "step": 261 + }, + { + "epoch": 0.02676200204290092, + "grad_norm": 1.6081901194935102, + "learning_rate": 1.782312925170068e-05, + "loss": 0.7475, + "step": 262 + }, + { + "epoch": 0.02686414708886619, + "grad_norm": 1.5510312240707456, + "learning_rate": 1.7891156462585037e-05, + "loss": 0.7793, + "step": 263 + }, + { + "epoch": 0.02696629213483146, + "grad_norm": 1.7514148922507933, + "learning_rate": 1.795918367346939e-05, + "loss": 0.7567, + "step": 264 + }, + { + "epoch": 0.027068437180796732, + "grad_norm": 1.6813056729305516, + "learning_rate": 1.8027210884353744e-05, + "loss": 0.7805, + "step": 265 + }, + { + "epoch": 0.027170582226762004, + "grad_norm": 1.636398050011782, + "learning_rate": 1.8095238095238097e-05, + "loss": 0.6795, + "step": 266 + }, + { + "epoch": 0.02727272727272727, + "grad_norm": 1.5779239857805816, + "learning_rate": 1.816326530612245e-05, + "loss": 0.6869, + "step": 267 + }, + { + "epoch": 0.027374872318692543, + "grad_norm": 1.4412803271887495, + "learning_rate": 1.8231292517006804e-05, + "loss": 0.6369, + "step": 268 + }, + { + "epoch": 0.027477017364657814, + "grad_norm": 1.804770986377828, + "learning_rate": 1.8299319727891158e-05, + "loss": 0.8157, + "step": 269 + }, + { + "epoch": 0.027579162410623085, + "grad_norm": 1.668485876468596, + "learning_rate": 1.836734693877551e-05, + "loss": 0.6413, + "step": 270 + }, + { + "epoch": 0.027681307456588356, + "grad_norm": 1.7856336273013087, + "learning_rate": 1.8435374149659865e-05, + "loss": 0.8693, + "step": 271 + }, + { + "epoch": 0.027783452502553627, + "grad_norm": 1.697632519966551, + "learning_rate": 1.8503401360544218e-05, + "loss": 0.7679, + "step": 272 + }, + { + "epoch": 0.0278855975485189, + "grad_norm": 1.7171739215199602, + "learning_rate": 1.8571428571428575e-05, + "loss": 0.778, + "step": 273 + }, + { + "epoch": 0.027987742594484166, + "grad_norm": 1.5938726988377543, + "learning_rate": 1.863945578231293e-05, + "loss": 0.7612, + "step": 274 + }, + { + "epoch": 0.028089887640449437, + "grad_norm": 1.5906655717814715, + "learning_rate": 1.8707482993197282e-05, + "loss": 0.5723, + "step": 275 + }, + { + "epoch": 0.02819203268641471, + "grad_norm": 1.5078696721505913, + "learning_rate": 1.8775510204081636e-05, + "loss": 0.7363, + "step": 276 + }, + { + "epoch": 0.02829417773237998, + "grad_norm": 1.5446916452430135, + "learning_rate": 1.884353741496599e-05, + "loss": 0.7853, + "step": 277 + }, + { + "epoch": 0.02839632277834525, + "grad_norm": 1.708121037225686, + "learning_rate": 1.8911564625850343e-05, + "loss": 0.7252, + "step": 278 + }, + { + "epoch": 0.028498467824310522, + "grad_norm": 1.5934769581812638, + "learning_rate": 1.8979591836734696e-05, + "loss": 0.8717, + "step": 279 + }, + { + "epoch": 0.028600612870275793, + "grad_norm": 1.5107727511597122, + "learning_rate": 1.904761904761905e-05, + "loss": 0.7137, + "step": 280 + }, + { + "epoch": 0.02870275791624106, + "grad_norm": 1.3976840822460448, + "learning_rate": 1.9115646258503403e-05, + "loss": 0.6616, + "step": 281 + }, + { + "epoch": 0.028804902962206332, + "grad_norm": 1.7475390113391824, + "learning_rate": 1.9183673469387756e-05, + "loss": 0.7495, + "step": 282 + }, + { + "epoch": 0.028907048008171603, + "grad_norm": 1.765610249198674, + "learning_rate": 1.925170068027211e-05, + "loss": 0.7275, + "step": 283 + }, + { + "epoch": 0.029009193054136875, + "grad_norm": 1.7270537589170198, + "learning_rate": 1.9319727891156463e-05, + "loss": 0.7495, + "step": 284 + }, + { + "epoch": 0.029111338100102146, + "grad_norm": 1.747335050821353, + "learning_rate": 1.9387755102040817e-05, + "loss": 0.709, + "step": 285 + }, + { + "epoch": 0.029213483146067417, + "grad_norm": 1.5635922172080774, + "learning_rate": 1.945578231292517e-05, + "loss": 0.655, + "step": 286 + }, + { + "epoch": 0.029315628192032685, + "grad_norm": 1.6530065973763257, + "learning_rate": 1.9523809523809524e-05, + "loss": 0.5877, + "step": 287 + }, + { + "epoch": 0.029417773237997956, + "grad_norm": 1.6733903459496355, + "learning_rate": 1.9591836734693877e-05, + "loss": 0.7207, + "step": 288 + }, + { + "epoch": 0.029519918283963227, + "grad_norm": 1.685661854915379, + "learning_rate": 1.965986394557823e-05, + "loss": 0.7612, + "step": 289 + }, + { + "epoch": 0.0296220633299285, + "grad_norm": 1.4862629797845708, + "learning_rate": 1.9727891156462588e-05, + "loss": 0.6418, + "step": 290 + }, + { + "epoch": 0.02972420837589377, + "grad_norm": 1.6535641609527183, + "learning_rate": 1.979591836734694e-05, + "loss": 0.7818, + "step": 291 + }, + { + "epoch": 0.02982635342185904, + "grad_norm": 1.6678727911393412, + "learning_rate": 1.9863945578231295e-05, + "loss": 0.7316, + "step": 292 + }, + { + "epoch": 0.029928498467824312, + "grad_norm": 1.857236685258155, + "learning_rate": 1.9931972789115648e-05, + "loss": 0.7219, + "step": 293 + }, + { + "epoch": 0.03003064351378958, + "grad_norm": 1.7130211183964796, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 294 + }, + { + "epoch": 0.03013278855975485, + "grad_norm": 1.6306750132118761, + "learning_rate": 1.9999999452746773e-05, + "loss": 0.8894, + "step": 295 + }, + { + "epoch": 0.030234933605720122, + "grad_norm": 1.596068821772123, + "learning_rate": 1.999999781098714e-05, + "loss": 0.6652, + "step": 296 + }, + { + "epoch": 0.030337078651685393, + "grad_norm": 1.5023620977647107, + "learning_rate": 1.9999995074721287e-05, + "loss": 0.653, + "step": 297 + }, + { + "epoch": 0.030439223697650664, + "grad_norm": 1.7996223461899603, + "learning_rate": 1.999999124394951e-05, + "loss": 0.7884, + "step": 298 + }, + { + "epoch": 0.030541368743615935, + "grad_norm": 1.5801199353543882, + "learning_rate": 1.9999986318672236e-05, + "loss": 0.764, + "step": 299 + }, + { + "epoch": 0.030643513789581207, + "grad_norm": 1.4371662538136063, + "learning_rate": 1.9999980298889996e-05, + "loss": 0.6438, + "step": 300 + }, + { + "epoch": 0.030745658835546474, + "grad_norm": 1.4869738768014928, + "learning_rate": 1.9999973184603453e-05, + "loss": 0.6223, + "step": 301 + }, + { + "epoch": 0.030847803881511746, + "grad_norm": 1.6940793723174485, + "learning_rate": 1.999996497581338e-05, + "loss": 0.7169, + "step": 302 + }, + { + "epoch": 0.030949948927477017, + "grad_norm": 1.685173831152958, + "learning_rate": 1.9999955672520682e-05, + "loss": 0.6781, + "step": 303 + }, + { + "epoch": 0.031052093973442288, + "grad_norm": 1.7090834649740707, + "learning_rate": 1.9999945274726376e-05, + "loss": 0.7915, + "step": 304 + }, + { + "epoch": 0.03115423901940756, + "grad_norm": 1.671882306345056, + "learning_rate": 1.9999933782431596e-05, + "loss": 0.6592, + "step": 305 + }, + { + "epoch": 0.03125638406537283, + "grad_norm": 1.4801728664351115, + "learning_rate": 1.9999921195637606e-05, + "loss": 0.7148, + "step": 306 + }, + { + "epoch": 0.0313585291113381, + "grad_norm": 1.6220174463697596, + "learning_rate": 1.999990751434578e-05, + "loss": 0.6391, + "step": 307 + }, + { + "epoch": 0.03146067415730337, + "grad_norm": 1.7585727306421493, + "learning_rate": 1.9999892738557615e-05, + "loss": 0.8403, + "step": 308 + }, + { + "epoch": 0.031562819203268644, + "grad_norm": 1.6104542939414912, + "learning_rate": 1.9999876868274727e-05, + "loss": 0.6965, + "step": 309 + }, + { + "epoch": 0.031664964249233915, + "grad_norm": 1.5333296122061046, + "learning_rate": 1.9999859903498856e-05, + "loss": 0.6839, + "step": 310 + }, + { + "epoch": 0.031767109295199186, + "grad_norm": 1.5891278151624393, + "learning_rate": 1.9999841844231857e-05, + "loss": 0.6996, + "step": 311 + }, + { + "epoch": 0.03186925434116445, + "grad_norm": 1.574480556315509, + "learning_rate": 1.9999822690475713e-05, + "loss": 0.7041, + "step": 312 + }, + { + "epoch": 0.03197139938712972, + "grad_norm": 1.591178739260255, + "learning_rate": 1.999980244223251e-05, + "loss": 0.7303, + "step": 313 + }, + { + "epoch": 0.03207354443309499, + "grad_norm": 1.587964682884301, + "learning_rate": 1.9999781099504466e-05, + "loss": 0.6733, + "step": 314 + }, + { + "epoch": 0.032175689479060264, + "grad_norm": 1.524198174484014, + "learning_rate": 1.9999758662293926e-05, + "loss": 0.8069, + "step": 315 + }, + { + "epoch": 0.032277834525025535, + "grad_norm": 1.4841818332002192, + "learning_rate": 1.9999735130603336e-05, + "loss": 0.7504, + "step": 316 + }, + { + "epoch": 0.032379979570990806, + "grad_norm": 1.6605234050501971, + "learning_rate": 1.9999710504435278e-05, + "loss": 0.7957, + "step": 317 + }, + { + "epoch": 0.03248212461695608, + "grad_norm": 1.4966583829015518, + "learning_rate": 1.9999684783792445e-05, + "loss": 0.7275, + "step": 318 + }, + { + "epoch": 0.03258426966292135, + "grad_norm": 1.643817257907392, + "learning_rate": 1.999965796867765e-05, + "loss": 0.7958, + "step": 319 + }, + { + "epoch": 0.03268641470888662, + "grad_norm": 1.5659770135920033, + "learning_rate": 1.999963005909383e-05, + "loss": 0.7684, + "step": 320 + }, + { + "epoch": 0.03278855975485189, + "grad_norm": 1.6574457624811796, + "learning_rate": 1.9999601055044036e-05, + "loss": 0.7444, + "step": 321 + }, + { + "epoch": 0.03289070480081716, + "grad_norm": 1.5399348865853, + "learning_rate": 1.999957095653145e-05, + "loss": 0.8094, + "step": 322 + }, + { + "epoch": 0.032992849846782434, + "grad_norm": 1.5944043156877568, + "learning_rate": 1.9999539763559362e-05, + "loss": 0.8015, + "step": 323 + }, + { + "epoch": 0.033094994892747705, + "grad_norm": 1.7176720338121858, + "learning_rate": 1.999950747613119e-05, + "loss": 0.826, + "step": 324 + }, + { + "epoch": 0.03319713993871297, + "grad_norm": 1.5113923216000331, + "learning_rate": 1.9999474094250457e-05, + "loss": 0.7037, + "step": 325 + }, + { + "epoch": 0.03329928498467824, + "grad_norm": 1.3853075170693228, + "learning_rate": 1.9999439617920825e-05, + "loss": 0.7392, + "step": 326 + }, + { + "epoch": 0.03340143003064351, + "grad_norm": 1.5232185394028046, + "learning_rate": 1.999940404714607e-05, + "loss": 0.7098, + "step": 327 + }, + { + "epoch": 0.03350357507660878, + "grad_norm": 1.8802732147649686, + "learning_rate": 1.999936738193008e-05, + "loss": 0.8102, + "step": 328 + }, + { + "epoch": 0.033605720122574054, + "grad_norm": 1.4225413446281991, + "learning_rate": 1.9999329622276867e-05, + "loss": 0.6202, + "step": 329 + }, + { + "epoch": 0.033707865168539325, + "grad_norm": 1.6435347543779217, + "learning_rate": 1.999929076819057e-05, + "loss": 0.9419, + "step": 330 + }, + { + "epoch": 0.033810010214504596, + "grad_norm": 1.818942270512346, + "learning_rate": 1.9999250819675436e-05, + "loss": 0.7188, + "step": 331 + }, + { + "epoch": 0.03391215526046987, + "grad_norm": 1.629449058835817, + "learning_rate": 1.9999209776735844e-05, + "loss": 0.7595, + "step": 332 + }, + { + "epoch": 0.03401430030643514, + "grad_norm": 1.5463631599213123, + "learning_rate": 1.9999167639376277e-05, + "loss": 0.6287, + "step": 333 + }, + { + "epoch": 0.03411644535240041, + "grad_norm": 1.760281293616789, + "learning_rate": 1.9999124407601353e-05, + "loss": 0.6983, + "step": 334 + }, + { + "epoch": 0.03421859039836568, + "grad_norm": 1.6512443349780666, + "learning_rate": 1.9999080081415802e-05, + "loss": 0.7428, + "step": 335 + }, + { + "epoch": 0.03432073544433095, + "grad_norm": 1.6602032171866863, + "learning_rate": 1.9999034660824476e-05, + "loss": 0.8147, + "step": 336 + }, + { + "epoch": 0.03442288049029622, + "grad_norm": 1.4931542508576237, + "learning_rate": 1.9998988145832348e-05, + "loss": 0.7465, + "step": 337 + }, + { + "epoch": 0.034525025536261494, + "grad_norm": 1.3321594599952813, + "learning_rate": 1.99989405364445e-05, + "loss": 0.6092, + "step": 338 + }, + { + "epoch": 0.03462717058222676, + "grad_norm": 1.493017008015358, + "learning_rate": 1.999889183266616e-05, + "loss": 0.6399, + "step": 339 + }, + { + "epoch": 0.03472931562819203, + "grad_norm": 1.8223996061744088, + "learning_rate": 1.9998842034502644e-05, + "loss": 0.7442, + "step": 340 + }, + { + "epoch": 0.0348314606741573, + "grad_norm": 1.7639551650847805, + "learning_rate": 1.999879114195941e-05, + "loss": 0.7286, + "step": 341 + }, + { + "epoch": 0.03493360572012257, + "grad_norm": 1.531757375530844, + "learning_rate": 1.999873915504202e-05, + "loss": 0.6388, + "step": 342 + }, + { + "epoch": 0.03503575076608784, + "grad_norm": 1.6711013011690217, + "learning_rate": 1.9998686073756174e-05, + "loss": 0.764, + "step": 343 + }, + { + "epoch": 0.035137895812053115, + "grad_norm": 1.599774294140866, + "learning_rate": 1.9998631898107675e-05, + "loss": 0.7366, + "step": 344 + }, + { + "epoch": 0.035240040858018386, + "grad_norm": 1.4249024224943276, + "learning_rate": 1.9998576628102455e-05, + "loss": 0.6737, + "step": 345 + }, + { + "epoch": 0.03534218590398366, + "grad_norm": 1.6793259124814255, + "learning_rate": 1.9998520263746564e-05, + "loss": 0.7076, + "step": 346 + }, + { + "epoch": 0.03544433094994893, + "grad_norm": 1.5590748935269132, + "learning_rate": 1.999846280504617e-05, + "loss": 0.6703, + "step": 347 + }, + { + "epoch": 0.0355464759959142, + "grad_norm": 1.656939834153814, + "learning_rate": 1.999840425200756e-05, + "loss": 0.7481, + "step": 348 + }, + { + "epoch": 0.03564862104187947, + "grad_norm": 1.5175613108654284, + "learning_rate": 1.9998344604637148e-05, + "loss": 0.6894, + "step": 349 + }, + { + "epoch": 0.03575076608784474, + "grad_norm": 1.684746418460725, + "learning_rate": 1.9998283862941457e-05, + "loss": 0.6748, + "step": 350 + }, + { + "epoch": 0.03585291113381001, + "grad_norm": 1.5357017913763469, + "learning_rate": 1.9998222026927138e-05, + "loss": 0.7063, + "step": 351 + }, + { + "epoch": 0.035955056179775284, + "grad_norm": 1.46456112975668, + "learning_rate": 1.9998159096600962e-05, + "loss": 0.7087, + "step": 352 + }, + { + "epoch": 0.03605720122574055, + "grad_norm": 1.6441176951800112, + "learning_rate": 1.9998095071969808e-05, + "loss": 0.8299, + "step": 353 + }, + { + "epoch": 0.03615934627170582, + "grad_norm": 1.6363800478989037, + "learning_rate": 1.9998029953040693e-05, + "loss": 0.7238, + "step": 354 + }, + { + "epoch": 0.03626149131767109, + "grad_norm": 1.6570628364055506, + "learning_rate": 1.9997963739820737e-05, + "loss": 0.7974, + "step": 355 + }, + { + "epoch": 0.03636363636363636, + "grad_norm": 1.5979569956631086, + "learning_rate": 1.9997896432317193e-05, + "loss": 0.7773, + "step": 356 + }, + { + "epoch": 0.03646578140960163, + "grad_norm": 1.6277886279965166, + "learning_rate": 1.999782803053742e-05, + "loss": 0.7895, + "step": 357 + }, + { + "epoch": 0.036567926455566904, + "grad_norm": 1.6179991781659897, + "learning_rate": 1.9997758534488915e-05, + "loss": 0.7239, + "step": 358 + }, + { + "epoch": 0.036670071501532175, + "grad_norm": 1.5977292493497925, + "learning_rate": 1.999768794417928e-05, + "loss": 0.7186, + "step": 359 + }, + { + "epoch": 0.03677221654749745, + "grad_norm": 1.6260876869448038, + "learning_rate": 1.9997616259616236e-05, + "loss": 0.7696, + "step": 360 + }, + { + "epoch": 0.03687436159346272, + "grad_norm": 1.5045152360940806, + "learning_rate": 1.9997543480807635e-05, + "loss": 0.7997, + "step": 361 + }, + { + "epoch": 0.03697650663942799, + "grad_norm": 1.6966463079599634, + "learning_rate": 1.999746960776144e-05, + "loss": 0.837, + "step": 362 + }, + { + "epoch": 0.03707865168539326, + "grad_norm": 1.5346645215134478, + "learning_rate": 1.9997394640485738e-05, + "loss": 0.7537, + "step": 363 + }, + { + "epoch": 0.03718079673135853, + "grad_norm": 1.6332849422753557, + "learning_rate": 1.9997318578988732e-05, + "loss": 0.6529, + "step": 364 + }, + { + "epoch": 0.0372829417773238, + "grad_norm": 1.6062702267766975, + "learning_rate": 1.999724142327875e-05, + "loss": 0.8201, + "step": 365 + }, + { + "epoch": 0.037385086823289074, + "grad_norm": 1.7595573766043457, + "learning_rate": 1.9997163173364233e-05, + "loss": 0.8048, + "step": 366 + }, + { + "epoch": 0.03748723186925434, + "grad_norm": 1.531230432584508, + "learning_rate": 1.9997083829253748e-05, + "loss": 0.6983, + "step": 367 + }, + { + "epoch": 0.03758937691521961, + "grad_norm": 1.7098444699568267, + "learning_rate": 1.999700339095598e-05, + "loss": 0.8021, + "step": 368 + }, + { + "epoch": 0.03769152196118488, + "grad_norm": 1.5761250003328382, + "learning_rate": 1.9996921858479733e-05, + "loss": 0.8436, + "step": 369 + }, + { + "epoch": 0.03779366700715015, + "grad_norm": 1.7226213868594205, + "learning_rate": 1.999683923183393e-05, + "loss": 0.7574, + "step": 370 + }, + { + "epoch": 0.03789581205311542, + "grad_norm": 1.522931463193398, + "learning_rate": 1.9996755511027613e-05, + "loss": 0.7175, + "step": 371 + }, + { + "epoch": 0.037997957099080694, + "grad_norm": 1.738367365517668, + "learning_rate": 1.9996670696069947e-05, + "loss": 0.7346, + "step": 372 + }, + { + "epoch": 0.038100102145045965, + "grad_norm": 1.6328014144266472, + "learning_rate": 1.9996584786970214e-05, + "loss": 0.7633, + "step": 373 + }, + { + "epoch": 0.038202247191011236, + "grad_norm": 1.7662587538174572, + "learning_rate": 1.999649778373782e-05, + "loss": 0.7376, + "step": 374 + }, + { + "epoch": 0.03830439223697651, + "grad_norm": 1.6698967486625416, + "learning_rate": 1.9996409686382278e-05, + "loss": 0.8357, + "step": 375 + }, + { + "epoch": 0.03840653728294178, + "grad_norm": 1.6776205167172553, + "learning_rate": 1.9996320494913245e-05, + "loss": 0.7336, + "step": 376 + }, + { + "epoch": 0.03850868232890705, + "grad_norm": 1.6430954813325385, + "learning_rate": 1.999623020934047e-05, + "loss": 0.8032, + "step": 377 + }, + { + "epoch": 0.03861082737487232, + "grad_norm": 1.654899567519746, + "learning_rate": 1.9996138829673844e-05, + "loss": 0.724, + "step": 378 + }, + { + "epoch": 0.03871297242083759, + "grad_norm": 1.6907175181600396, + "learning_rate": 1.9996046355923365e-05, + "loss": 0.8435, + "step": 379 + }, + { + "epoch": 0.03881511746680286, + "grad_norm": 1.7540290532399327, + "learning_rate": 1.999595278809915e-05, + "loss": 0.6889, + "step": 380 + }, + { + "epoch": 0.03891726251276813, + "grad_norm": 1.5127412714008044, + "learning_rate": 1.999585812621145e-05, + "loss": 0.7169, + "step": 381 + }, + { + "epoch": 0.0390194075587334, + "grad_norm": 1.3896934203025388, + "learning_rate": 1.9995762370270616e-05, + "loss": 0.6606, + "step": 382 + }, + { + "epoch": 0.03912155260469867, + "grad_norm": 1.6430319317316633, + "learning_rate": 1.999566552028713e-05, + "loss": 0.762, + "step": 383 + }, + { + "epoch": 0.03922369765066394, + "grad_norm": 1.8095769884768498, + "learning_rate": 1.99955675762716e-05, + "loss": 0.7503, + "step": 384 + }, + { + "epoch": 0.03932584269662921, + "grad_norm": 1.6134360376707184, + "learning_rate": 1.9995468538234738e-05, + "loss": 0.7931, + "step": 385 + }, + { + "epoch": 0.039427987742594484, + "grad_norm": 1.7125351230052142, + "learning_rate": 1.9995368406187387e-05, + "loss": 0.6807, + "step": 386 + }, + { + "epoch": 0.039530132788559755, + "grad_norm": 1.5854645387766508, + "learning_rate": 1.9995267180140504e-05, + "loss": 0.777, + "step": 387 + }, + { + "epoch": 0.039632277834525026, + "grad_norm": 1.7868588964792638, + "learning_rate": 1.9995164860105176e-05, + "loss": 0.7762, + "step": 388 + }, + { + "epoch": 0.0397344228804903, + "grad_norm": 1.71671415273508, + "learning_rate": 1.9995061446092593e-05, + "loss": 0.7728, + "step": 389 + }, + { + "epoch": 0.03983656792645557, + "grad_norm": 1.4664603461526882, + "learning_rate": 1.9994956938114075e-05, + "loss": 0.6703, + "step": 390 + }, + { + "epoch": 0.03993871297242084, + "grad_norm": 1.376104314039649, + "learning_rate": 1.9994851336181065e-05, + "loss": 0.6903, + "step": 391 + }, + { + "epoch": 0.04004085801838611, + "grad_norm": 1.7120683874024334, + "learning_rate": 1.9994744640305118e-05, + "loss": 0.7492, + "step": 392 + }, + { + "epoch": 0.04014300306435138, + "grad_norm": 1.5266195487385579, + "learning_rate": 1.9994636850497914e-05, + "loss": 0.8543, + "step": 393 + }, + { + "epoch": 0.04024514811031665, + "grad_norm": 1.7370112684271555, + "learning_rate": 1.999452796677125e-05, + "loss": 0.7076, + "step": 394 + }, + { + "epoch": 0.04034729315628192, + "grad_norm": 1.5306005443036559, + "learning_rate": 1.9994417989137042e-05, + "loss": 0.6494, + "step": 395 + }, + { + "epoch": 0.04044943820224719, + "grad_norm": 1.714731082919517, + "learning_rate": 1.9994306917607328e-05, + "loss": 0.8529, + "step": 396 + }, + { + "epoch": 0.04055158324821246, + "grad_norm": 1.653950434390749, + "learning_rate": 1.9994194752194262e-05, + "loss": 0.7136, + "step": 397 + }, + { + "epoch": 0.04065372829417773, + "grad_norm": 1.6357864809981595, + "learning_rate": 1.9994081492910126e-05, + "loss": 0.8166, + "step": 398 + }, + { + "epoch": 0.040755873340143, + "grad_norm": 1.5747167760334395, + "learning_rate": 1.9993967139767313e-05, + "loss": 0.7278, + "step": 399 + }, + { + "epoch": 0.04085801838610827, + "grad_norm": 1.689504266752484, + "learning_rate": 1.999385169277834e-05, + "loss": 0.8446, + "step": 400 + }, + { + "epoch": 0.040960163432073544, + "grad_norm": 1.7690625261396564, + "learning_rate": 1.999373515195584e-05, + "loss": 0.677, + "step": 401 + }, + { + "epoch": 0.041062308478038816, + "grad_norm": 1.5242754554386093, + "learning_rate": 1.9993617517312573e-05, + "loss": 0.6768, + "step": 402 + }, + { + "epoch": 0.04116445352400409, + "grad_norm": 1.4788487504520307, + "learning_rate": 1.999349878886141e-05, + "loss": 0.6219, + "step": 403 + }, + { + "epoch": 0.04126659856996936, + "grad_norm": 1.7810048941677938, + "learning_rate": 1.999337896661535e-05, + "loss": 0.7806, + "step": 404 + }, + { + "epoch": 0.04136874361593463, + "grad_norm": 1.5778067115298302, + "learning_rate": 1.9993258050587503e-05, + "loss": 0.7152, + "step": 405 + }, + { + "epoch": 0.0414708886618999, + "grad_norm": 1.582008540833576, + "learning_rate": 1.999313604079111e-05, + "loss": 0.7615, + "step": 406 + }, + { + "epoch": 0.04157303370786517, + "grad_norm": 1.5850712044121877, + "learning_rate": 1.999301293723952e-05, + "loss": 0.6787, + "step": 407 + }, + { + "epoch": 0.041675178753830436, + "grad_norm": 1.612463597676338, + "learning_rate": 1.9992888739946207e-05, + "loss": 0.6229, + "step": 408 + }, + { + "epoch": 0.04177732379979571, + "grad_norm": 1.5661592926069352, + "learning_rate": 1.9992763448924763e-05, + "loss": 0.7595, + "step": 409 + }, + { + "epoch": 0.04187946884576098, + "grad_norm": 1.6317731553287673, + "learning_rate": 1.9992637064188906e-05, + "loss": 0.836, + "step": 410 + }, + { + "epoch": 0.04198161389172625, + "grad_norm": 1.7424545360235912, + "learning_rate": 1.9992509585752465e-05, + "loss": 0.7934, + "step": 411 + }, + { + "epoch": 0.04208375893769152, + "grad_norm": 1.4864095271972726, + "learning_rate": 1.9992381013629397e-05, + "loss": 0.8132, + "step": 412 + }, + { + "epoch": 0.04218590398365679, + "grad_norm": 1.5790188794772937, + "learning_rate": 1.9992251347833766e-05, + "loss": 0.7902, + "step": 413 + }, + { + "epoch": 0.04228804902962206, + "grad_norm": 1.6618138742842077, + "learning_rate": 1.9992120588379774e-05, + "loss": 0.7934, + "step": 414 + }, + { + "epoch": 0.042390194075587334, + "grad_norm": 1.5066893644067565, + "learning_rate": 1.9991988735281724e-05, + "loss": 0.7341, + "step": 415 + }, + { + "epoch": 0.042492339121552605, + "grad_norm": 1.7165765959051955, + "learning_rate": 1.9991855788554055e-05, + "loss": 0.8411, + "step": 416 + }, + { + "epoch": 0.042594484167517876, + "grad_norm": 1.513931606897242, + "learning_rate": 1.999172174821131e-05, + "loss": 0.7091, + "step": 417 + }, + { + "epoch": 0.04269662921348315, + "grad_norm": 1.4733180904385181, + "learning_rate": 1.9991586614268166e-05, + "loss": 0.7547, + "step": 418 + }, + { + "epoch": 0.04279877425944842, + "grad_norm": 1.6487688037815096, + "learning_rate": 1.999145038673941e-05, + "loss": 0.8272, + "step": 419 + }, + { + "epoch": 0.04290091930541369, + "grad_norm": 1.6313369499823922, + "learning_rate": 1.9991313065639956e-05, + "loss": 0.7994, + "step": 420 + }, + { + "epoch": 0.04300306435137896, + "grad_norm": 1.7312501065629007, + "learning_rate": 1.9991174650984832e-05, + "loss": 0.812, + "step": 421 + }, + { + "epoch": 0.043105209397344225, + "grad_norm": 1.5753455465226176, + "learning_rate": 1.9991035142789187e-05, + "loss": 0.8568, + "step": 422 + }, + { + "epoch": 0.0432073544433095, + "grad_norm": 1.516052496234355, + "learning_rate": 1.999089454106829e-05, + "loss": 0.6674, + "step": 423 + }, + { + "epoch": 0.04330949948927477, + "grad_norm": 1.5456793116039083, + "learning_rate": 1.999075284583753e-05, + "loss": 0.8071, + "step": 424 + }, + { + "epoch": 0.04341164453524004, + "grad_norm": 1.6815766452711434, + "learning_rate": 1.999061005711242e-05, + "loss": 0.7161, + "step": 425 + }, + { + "epoch": 0.04351378958120531, + "grad_norm": 1.6035608458533839, + "learning_rate": 1.999046617490858e-05, + "loss": 0.63, + "step": 426 + }, + { + "epoch": 0.04361593462717058, + "grad_norm": 1.5743440037884069, + "learning_rate": 1.9990321199241765e-05, + "loss": 0.8395, + "step": 427 + }, + { + "epoch": 0.04371807967313585, + "grad_norm": 1.5575052038038233, + "learning_rate": 1.9990175130127837e-05, + "loss": 0.735, + "step": 428 + }, + { + "epoch": 0.043820224719101124, + "grad_norm": 1.5457309060173676, + "learning_rate": 1.999002796758279e-05, + "loss": 0.724, + "step": 429 + }, + { + "epoch": 0.043922369765066395, + "grad_norm": 1.7407269912291348, + "learning_rate": 1.9989879711622726e-05, + "loss": 0.794, + "step": 430 + }, + { + "epoch": 0.044024514811031666, + "grad_norm": 1.6411432528678405, + "learning_rate": 1.9989730362263874e-05, + "loss": 0.7356, + "step": 431 + }, + { + "epoch": 0.04412665985699694, + "grad_norm": 1.6459306563987728, + "learning_rate": 1.998957991952258e-05, + "loss": 0.811, + "step": 432 + }, + { + "epoch": 0.04422880490296221, + "grad_norm": 1.551529819349357, + "learning_rate": 1.998942838341531e-05, + "loss": 0.6689, + "step": 433 + }, + { + "epoch": 0.04433094994892748, + "grad_norm": 1.669418577461631, + "learning_rate": 1.9989275753958646e-05, + "loss": 0.7974, + "step": 434 + }, + { + "epoch": 0.04443309499489275, + "grad_norm": 1.6083260424621268, + "learning_rate": 1.9989122031169303e-05, + "loss": 0.603, + "step": 435 + }, + { + "epoch": 0.044535240040858015, + "grad_norm": 1.730858785645276, + "learning_rate": 1.9988967215064096e-05, + "loss": 0.891, + "step": 436 + }, + { + "epoch": 0.044637385086823286, + "grad_norm": 1.5440714801961926, + "learning_rate": 1.998881130565997e-05, + "loss": 0.6839, + "step": 437 + }, + { + "epoch": 0.04473953013278856, + "grad_norm": 1.5262191281826751, + "learning_rate": 1.9988654302974e-05, + "loss": 0.682, + "step": 438 + }, + { + "epoch": 0.04484167517875383, + "grad_norm": 1.5339464791930282, + "learning_rate": 1.998849620702336e-05, + "loss": 0.6362, + "step": 439 + }, + { + "epoch": 0.0449438202247191, + "grad_norm": 1.5615261336960227, + "learning_rate": 1.9988337017825355e-05, + "loss": 0.8495, + "step": 440 + }, + { + "epoch": 0.04504596527068437, + "grad_norm": 1.5228235326627682, + "learning_rate": 1.9988176735397414e-05, + "loss": 0.8616, + "step": 441 + }, + { + "epoch": 0.04514811031664964, + "grad_norm": 1.565493645775326, + "learning_rate": 1.9988015359757075e-05, + "loss": 0.7127, + "step": 442 + }, + { + "epoch": 0.04525025536261491, + "grad_norm": 1.5632633415007537, + "learning_rate": 1.9987852890922e-05, + "loss": 0.679, + "step": 443 + }, + { + "epoch": 0.045352400408580185, + "grad_norm": 1.4951795322950667, + "learning_rate": 1.9987689328909973e-05, + "loss": 0.7127, + "step": 444 + }, + { + "epoch": 0.045454545454545456, + "grad_norm": 1.6508795094702773, + "learning_rate": 1.9987524673738897e-05, + "loss": 0.8676, + "step": 445 + }, + { + "epoch": 0.04555669050051073, + "grad_norm": 1.5517521677287343, + "learning_rate": 1.998735892542679e-05, + "loss": 0.7402, + "step": 446 + }, + { + "epoch": 0.045658835546476, + "grad_norm": 1.4303236217913078, + "learning_rate": 1.99871920839918e-05, + "loss": 0.7569, + "step": 447 + }, + { + "epoch": 0.04576098059244127, + "grad_norm": 1.6020744350189244, + "learning_rate": 1.9987024149452183e-05, + "loss": 0.7093, + "step": 448 + }, + { + "epoch": 0.04586312563840654, + "grad_norm": 1.5160906930493678, + "learning_rate": 1.998685512182632e-05, + "loss": 0.7835, + "step": 449 + }, + { + "epoch": 0.045965270684371805, + "grad_norm": 1.476476819397115, + "learning_rate": 1.9986685001132712e-05, + "loss": 0.7317, + "step": 450 + }, + { + "epoch": 0.046067415730337076, + "grad_norm": 1.5560080475435927, + "learning_rate": 1.9986513787389977e-05, + "loss": 0.7305, + "step": 451 + }, + { + "epoch": 0.04616956077630235, + "grad_norm": 1.5714651262145016, + "learning_rate": 1.9986341480616856e-05, + "loss": 0.7018, + "step": 452 + }, + { + "epoch": 0.04627170582226762, + "grad_norm": 1.6352297711977473, + "learning_rate": 1.998616808083221e-05, + "loss": 0.7336, + "step": 453 + }, + { + "epoch": 0.04637385086823289, + "grad_norm": 1.4413342018676707, + "learning_rate": 1.998599358805501e-05, + "loss": 0.6515, + "step": 454 + }, + { + "epoch": 0.04647599591419816, + "grad_norm": 1.6903614834734375, + "learning_rate": 1.9985818002304367e-05, + "loss": 0.8132, + "step": 455 + }, + { + "epoch": 0.04657814096016343, + "grad_norm": 1.8258184877594659, + "learning_rate": 1.998564132359949e-05, + "loss": 0.8263, + "step": 456 + }, + { + "epoch": 0.0466802860061287, + "grad_norm": 1.6567282783905968, + "learning_rate": 1.9985463551959715e-05, + "loss": 0.7446, + "step": 457 + }, + { + "epoch": 0.046782431052093974, + "grad_norm": 1.5563465442212512, + "learning_rate": 1.998528468740451e-05, + "loss": 0.5944, + "step": 458 + }, + { + "epoch": 0.046884576098059245, + "grad_norm": 1.546733433320347, + "learning_rate": 1.9985104729953438e-05, + "loss": 0.7259, + "step": 459 + }, + { + "epoch": 0.04698672114402452, + "grad_norm": 1.7834205927440032, + "learning_rate": 1.9984923679626205e-05, + "loss": 0.7441, + "step": 460 + }, + { + "epoch": 0.04708886618998979, + "grad_norm": 1.5980607898721175, + "learning_rate": 1.9984741536442622e-05, + "loss": 0.811, + "step": 461 + }, + { + "epoch": 0.04719101123595506, + "grad_norm": 1.5931779877444234, + "learning_rate": 1.998455830042263e-05, + "loss": 0.7936, + "step": 462 + }, + { + "epoch": 0.04729315628192033, + "grad_norm": 1.6641931540896815, + "learning_rate": 1.998437397158628e-05, + "loss": 0.7834, + "step": 463 + }, + { + "epoch": 0.047395301327885594, + "grad_norm": 1.6176479362590956, + "learning_rate": 1.9984188549953747e-05, + "loss": 0.8281, + "step": 464 + }, + { + "epoch": 0.047497446373850866, + "grad_norm": 1.436029553368375, + "learning_rate": 1.9984002035545327e-05, + "loss": 0.5851, + "step": 465 + }, + { + "epoch": 0.04759959141981614, + "grad_norm": 1.6815793284759746, + "learning_rate": 1.9983814428381433e-05, + "loss": 0.7416, + "step": 466 + }, + { + "epoch": 0.04770173646578141, + "grad_norm": 1.6775855993331013, + "learning_rate": 1.99836257284826e-05, + "loss": 0.7009, + "step": 467 + }, + { + "epoch": 0.04780388151174668, + "grad_norm": 1.6912070041998748, + "learning_rate": 1.9983435935869483e-05, + "loss": 0.7175, + "step": 468 + }, + { + "epoch": 0.04790602655771195, + "grad_norm": 1.5353035836739497, + "learning_rate": 1.998324505056285e-05, + "loss": 0.6991, + "step": 469 + }, + { + "epoch": 0.04800817160367722, + "grad_norm": 1.7177225253697452, + "learning_rate": 1.9983053072583595e-05, + "loss": 0.7749, + "step": 470 + }, + { + "epoch": 0.04811031664964249, + "grad_norm": 1.5518824824200352, + "learning_rate": 1.9982860001952735e-05, + "loss": 0.7881, + "step": 471 + }, + { + "epoch": 0.048212461695607764, + "grad_norm": 1.6189876512945427, + "learning_rate": 1.9982665838691396e-05, + "loss": 0.7223, + "step": 472 + }, + { + "epoch": 0.048314606741573035, + "grad_norm": 1.5862052439655876, + "learning_rate": 1.998247058282083e-05, + "loss": 0.6951, + "step": 473 + }, + { + "epoch": 0.048416751787538306, + "grad_norm": 1.664954892936885, + "learning_rate": 1.9982274234362414e-05, + "loss": 0.7931, + "step": 474 + }, + { + "epoch": 0.04851889683350358, + "grad_norm": 1.713903726048864, + "learning_rate": 1.998207679333763e-05, + "loss": 0.7588, + "step": 475 + }, + { + "epoch": 0.04862104187946885, + "grad_norm": 1.682176230758614, + "learning_rate": 1.9981878259768094e-05, + "loss": 0.8976, + "step": 476 + }, + { + "epoch": 0.04872318692543412, + "grad_norm": 1.9723092918071983, + "learning_rate": 1.998167863367553e-05, + "loss": 0.7619, + "step": 477 + }, + { + "epoch": 0.048825331971399384, + "grad_norm": 1.599296990545799, + "learning_rate": 1.9981477915081794e-05, + "loss": 0.7556, + "step": 478 + }, + { + "epoch": 0.048927477017364655, + "grad_norm": 1.3960674296691826, + "learning_rate": 1.9981276104008848e-05, + "loss": 0.7784, + "step": 479 + }, + { + "epoch": 0.049029622063329927, + "grad_norm": 1.6407098154989128, + "learning_rate": 1.9981073200478787e-05, + "loss": 0.7919, + "step": 480 + }, + { + "epoch": 0.0491317671092952, + "grad_norm": 1.595878767512802, + "learning_rate": 1.9980869204513814e-05, + "loss": 0.781, + "step": 481 + }, + { + "epoch": 0.04923391215526047, + "grad_norm": 1.573015042069417, + "learning_rate": 1.9980664116136255e-05, + "loss": 0.8291, + "step": 482 + }, + { + "epoch": 0.04933605720122574, + "grad_norm": 1.6073490789153841, + "learning_rate": 1.9980457935368565e-05, + "loss": 0.6668, + "step": 483 + }, + { + "epoch": 0.04943820224719101, + "grad_norm": 1.5462474517851519, + "learning_rate": 1.99802506622333e-05, + "loss": 0.6628, + "step": 484 + }, + { + "epoch": 0.04954034729315628, + "grad_norm": 1.6142670346195724, + "learning_rate": 1.9980042296753158e-05, + "loss": 0.7874, + "step": 485 + }, + { + "epoch": 0.049642492339121554, + "grad_norm": 1.6400766316865425, + "learning_rate": 1.9979832838950937e-05, + "loss": 0.747, + "step": 486 + }, + { + "epoch": 0.049744637385086825, + "grad_norm": 1.416262241420444, + "learning_rate": 1.9979622288849563e-05, + "loss": 0.6473, + "step": 487 + }, + { + "epoch": 0.049846782431052096, + "grad_norm": 1.6284847907731343, + "learning_rate": 1.9979410646472084e-05, + "loss": 0.7313, + "step": 488 + }, + { + "epoch": 0.04994892747701737, + "grad_norm": 1.5246099600341796, + "learning_rate": 1.997919791184166e-05, + "loss": 0.6147, + "step": 489 + }, + { + "epoch": 0.05005107252298264, + "grad_norm": 1.5482113841987348, + "learning_rate": 1.9978984084981578e-05, + "loss": 0.7131, + "step": 490 + }, + { + "epoch": 0.0501532175689479, + "grad_norm": 1.5261039055113574, + "learning_rate": 1.997876916591524e-05, + "loss": 0.8526, + "step": 491 + }, + { + "epoch": 0.050255362614913174, + "grad_norm": 1.5850472171550933, + "learning_rate": 1.997855315466617e-05, + "loss": 0.8454, + "step": 492 + }, + { + "epoch": 0.050357507660878445, + "grad_norm": 1.635046304306875, + "learning_rate": 1.9978336051258012e-05, + "loss": 0.7252, + "step": 493 + }, + { + "epoch": 0.050459652706843716, + "grad_norm": 1.8475031041628454, + "learning_rate": 1.9978117855714524e-05, + "loss": 0.8357, + "step": 494 + }, + { + "epoch": 0.05056179775280899, + "grad_norm": 1.5477499977293454, + "learning_rate": 1.9977898568059592e-05, + "loss": 0.7261, + "step": 495 + }, + { + "epoch": 0.05066394279877426, + "grad_norm": 1.5736238463668162, + "learning_rate": 1.9977678188317213e-05, + "loss": 0.7558, + "step": 496 + }, + { + "epoch": 0.05076608784473953, + "grad_norm": 1.750954972848195, + "learning_rate": 1.997745671651151e-05, + "loss": 0.8026, + "step": 497 + }, + { + "epoch": 0.0508682328907048, + "grad_norm": 1.7538040961994257, + "learning_rate": 1.9977234152666723e-05, + "loss": 0.768, + "step": 498 + }, + { + "epoch": 0.05097037793667007, + "grad_norm": 1.49839006573758, + "learning_rate": 1.997701049680721e-05, + "loss": 0.7114, + "step": 499 + }, + { + "epoch": 0.05107252298263534, + "grad_norm": 1.487862582013371, + "learning_rate": 1.997678574895746e-05, + "loss": 0.7208, + "step": 500 + }, + { + "epoch": 0.051174668028600614, + "grad_norm": 1.4400234174710935, + "learning_rate": 1.9976559909142057e-05, + "loss": 0.6979, + "step": 501 + }, + { + "epoch": 0.051276813074565886, + "grad_norm": 1.646071259640487, + "learning_rate": 1.997633297738573e-05, + "loss": 0.7387, + "step": 502 + }, + { + "epoch": 0.05137895812053116, + "grad_norm": 1.5022436848759284, + "learning_rate": 1.997610495371331e-05, + "loss": 0.6309, + "step": 503 + }, + { + "epoch": 0.05148110316649643, + "grad_norm": 1.6163719101583476, + "learning_rate": 1.9975875838149758e-05, + "loss": 0.8308, + "step": 504 + }, + { + "epoch": 0.05158324821246169, + "grad_norm": 1.5272768152282914, + "learning_rate": 1.9975645630720152e-05, + "loss": 0.7628, + "step": 505 + }, + { + "epoch": 0.051685393258426963, + "grad_norm": 1.4855036847637648, + "learning_rate": 1.9975414331449684e-05, + "loss": 0.7701, + "step": 506 + }, + { + "epoch": 0.051787538304392235, + "grad_norm": 1.5871173934837504, + "learning_rate": 1.9975181940363675e-05, + "loss": 0.7707, + "step": 507 + }, + { + "epoch": 0.051889683350357506, + "grad_norm": 1.6252297582884947, + "learning_rate": 1.997494845748756e-05, + "loss": 0.7864, + "step": 508 + }, + { + "epoch": 0.05199182839632278, + "grad_norm": 1.6982028690022921, + "learning_rate": 1.9974713882846885e-05, + "loss": 0.7825, + "step": 509 + }, + { + "epoch": 0.05209397344228805, + "grad_norm": 1.4933623426526994, + "learning_rate": 1.9974478216467333e-05, + "loss": 0.8042, + "step": 510 + }, + { + "epoch": 0.05219611848825332, + "grad_norm": 1.5149481162013883, + "learning_rate": 1.99742414583747e-05, + "loss": 0.7284, + "step": 511 + }, + { + "epoch": 0.05229826353421859, + "grad_norm": 1.631588226488018, + "learning_rate": 1.997400360859489e-05, + "loss": 0.7637, + "step": 512 + }, + { + "epoch": 0.05240040858018386, + "grad_norm": 1.6478162754339205, + "learning_rate": 1.9973764667153944e-05, + "loss": 0.6624, + "step": 513 + }, + { + "epoch": 0.05250255362614913, + "grad_norm": 1.557462708731741, + "learning_rate": 1.9973524634078012e-05, + "loss": 0.7373, + "step": 514 + }, + { + "epoch": 0.052604698672114404, + "grad_norm": 1.636878955660981, + "learning_rate": 1.9973283509393364e-05, + "loss": 0.7685, + "step": 515 + }, + { + "epoch": 0.052706843718079675, + "grad_norm": 1.523844984732845, + "learning_rate": 1.9973041293126392e-05, + "loss": 0.7082, + "step": 516 + }, + { + "epoch": 0.052808988764044947, + "grad_norm": 1.6214690089363601, + "learning_rate": 1.997279798530361e-05, + "loss": 0.8186, + "step": 517 + }, + { + "epoch": 0.05291113381001022, + "grad_norm": 1.5019751681067, + "learning_rate": 1.997255358595164e-05, + "loss": 0.7605, + "step": 518 + }, + { + "epoch": 0.05301327885597548, + "grad_norm": 1.8309363486052423, + "learning_rate": 1.997230809509724e-05, + "loss": 0.7709, + "step": 519 + }, + { + "epoch": 0.05311542390194075, + "grad_norm": 1.572658511634003, + "learning_rate": 1.9972061512767276e-05, + "loss": 0.8028, + "step": 520 + }, + { + "epoch": 0.053217568947906024, + "grad_norm": 1.6805782612053086, + "learning_rate": 1.9971813838988736e-05, + "loss": 0.8838, + "step": 521 + }, + { + "epoch": 0.053319713993871296, + "grad_norm": 1.4313241773408503, + "learning_rate": 1.9971565073788728e-05, + "loss": 0.6039, + "step": 522 + }, + { + "epoch": 0.05342185903983657, + "grad_norm": 1.5878763956757955, + "learning_rate": 1.997131521719448e-05, + "loss": 0.843, + "step": 523 + }, + { + "epoch": 0.05352400408580184, + "grad_norm": 1.5120214772681129, + "learning_rate": 1.9971064269233343e-05, + "loss": 0.6371, + "step": 524 + }, + { + "epoch": 0.05362614913176711, + "grad_norm": 1.4288915245762615, + "learning_rate": 1.9970812229932777e-05, + "loss": 0.7581, + "step": 525 + }, + { + "epoch": 0.05372829417773238, + "grad_norm": 1.5143270955064858, + "learning_rate": 1.997055909932037e-05, + "loss": 0.7561, + "step": 526 + }, + { + "epoch": 0.05383043922369765, + "grad_norm": 1.6536682572378336, + "learning_rate": 1.9970304877423827e-05, + "loss": 0.7383, + "step": 527 + }, + { + "epoch": 0.05393258426966292, + "grad_norm": 1.4776304363949475, + "learning_rate": 1.9970049564270975e-05, + "loss": 0.7054, + "step": 528 + }, + { + "epoch": 0.054034729315628194, + "grad_norm": 1.523494785377469, + "learning_rate": 1.9969793159889758e-05, + "loss": 0.698, + "step": 529 + }, + { + "epoch": 0.054136874361593465, + "grad_norm": 1.620583783092881, + "learning_rate": 1.9969535664308237e-05, + "loss": 0.8122, + "step": 530 + }, + { + "epoch": 0.054239019407558736, + "grad_norm": 1.5368145123426755, + "learning_rate": 1.9969277077554597e-05, + "loss": 0.7826, + "step": 531 + }, + { + "epoch": 0.05434116445352401, + "grad_norm": 1.5291641763068498, + "learning_rate": 1.996901739965714e-05, + "loss": 0.8057, + "step": 532 + }, + { + "epoch": 0.05444330949948927, + "grad_norm": 1.4092524903000994, + "learning_rate": 1.9968756630644287e-05, + "loss": 0.7227, + "step": 533 + }, + { + "epoch": 0.05454545454545454, + "grad_norm": 1.5490642289609866, + "learning_rate": 1.996849477054458e-05, + "loss": 0.607, + "step": 534 + }, + { + "epoch": 0.054647599591419814, + "grad_norm": 1.5506759266480605, + "learning_rate": 1.9968231819386677e-05, + "loss": 0.7096, + "step": 535 + }, + { + "epoch": 0.054749744637385085, + "grad_norm": 1.558974258222842, + "learning_rate": 1.9967967777199366e-05, + "loss": 0.7737, + "step": 536 + }, + { + "epoch": 0.054851889683350356, + "grad_norm": 1.6078716807902478, + "learning_rate": 1.9967702644011538e-05, + "loss": 0.8519, + "step": 537 + }, + { + "epoch": 0.05495403472931563, + "grad_norm": 1.532917311097514, + "learning_rate": 1.996743641985222e-05, + "loss": 0.7292, + "step": 538 + }, + { + "epoch": 0.0550561797752809, + "grad_norm": 1.5539661819233375, + "learning_rate": 1.996716910475054e-05, + "loss": 0.8819, + "step": 539 + }, + { + "epoch": 0.05515832482124617, + "grad_norm": 1.609369477387334, + "learning_rate": 1.9966900698735764e-05, + "loss": 0.779, + "step": 540 + }, + { + "epoch": 0.05526046986721144, + "grad_norm": 2.094831186753633, + "learning_rate": 1.996663120183727e-05, + "loss": 0.7686, + "step": 541 + }, + { + "epoch": 0.05536261491317671, + "grad_norm": 1.5550813485343937, + "learning_rate": 1.996636061408455e-05, + "loss": 0.8563, + "step": 542 + }, + { + "epoch": 0.055464759959141983, + "grad_norm": 1.769620850762701, + "learning_rate": 1.996608893550722e-05, + "loss": 0.837, + "step": 543 + }, + { + "epoch": 0.055566905005107255, + "grad_norm": 1.4468661425150449, + "learning_rate": 1.996581616613502e-05, + "loss": 0.6905, + "step": 544 + }, + { + "epoch": 0.055669050051072526, + "grad_norm": 1.550372147968441, + "learning_rate": 1.99655423059978e-05, + "loss": 0.7644, + "step": 545 + }, + { + "epoch": 0.0557711950970378, + "grad_norm": 1.5846849090314357, + "learning_rate": 1.996526735512554e-05, + "loss": 0.8444, + "step": 546 + }, + { + "epoch": 0.05587334014300306, + "grad_norm": 1.5786291793989105, + "learning_rate": 1.9964991313548326e-05, + "loss": 0.805, + "step": 547 + }, + { + "epoch": 0.05597548518896833, + "grad_norm": 1.511048158131181, + "learning_rate": 1.9964714181296374e-05, + "loss": 0.7388, + "step": 548 + }, + { + "epoch": 0.056077630234933604, + "grad_norm": 1.6033130407298832, + "learning_rate": 1.9964435958400016e-05, + "loss": 0.7402, + "step": 549 + }, + { + "epoch": 0.056179775280898875, + "grad_norm": 1.644533878086931, + "learning_rate": 1.9964156644889707e-05, + "loss": 0.826, + "step": 550 + }, + { + "epoch": 0.056281920326864146, + "grad_norm": 1.5918361700773669, + "learning_rate": 1.9963876240796015e-05, + "loss": 0.802, + "step": 551 + }, + { + "epoch": 0.05638406537282942, + "grad_norm": 1.5934854457855434, + "learning_rate": 1.996359474614963e-05, + "loss": 0.7374, + "step": 552 + }, + { + "epoch": 0.05648621041879469, + "grad_norm": 1.545601552720927, + "learning_rate": 1.9963312160981365e-05, + "loss": 0.7748, + "step": 553 + }, + { + "epoch": 0.05658835546475996, + "grad_norm": 1.531004944122486, + "learning_rate": 1.9963028485322145e-05, + "loss": 0.6628, + "step": 554 + }, + { + "epoch": 0.05669050051072523, + "grad_norm": 1.5441027613915532, + "learning_rate": 1.996274371920302e-05, + "loss": 0.7104, + "step": 555 + }, + { + "epoch": 0.0567926455566905, + "grad_norm": 1.5711363595726566, + "learning_rate": 1.996245786265516e-05, + "loss": 0.8046, + "step": 556 + }, + { + "epoch": 0.05689479060265577, + "grad_norm": 1.6185700584690819, + "learning_rate": 1.9962170915709848e-05, + "loss": 0.7038, + "step": 557 + }, + { + "epoch": 0.056996935648621044, + "grad_norm": 1.6658338054592956, + "learning_rate": 1.9961882878398493e-05, + "loss": 0.7685, + "step": 558 + }, + { + "epoch": 0.057099080694586316, + "grad_norm": 1.6462724659163284, + "learning_rate": 1.9961593750752623e-05, + "loss": 0.7543, + "step": 559 + }, + { + "epoch": 0.05720122574055159, + "grad_norm": 1.3781033539757612, + "learning_rate": 1.9961303532803876e-05, + "loss": 0.8265, + "step": 560 + }, + { + "epoch": 0.05730337078651685, + "grad_norm": 1.7254477092011642, + "learning_rate": 1.996101222458403e-05, + "loss": 0.8623, + "step": 561 + }, + { + "epoch": 0.05740551583248212, + "grad_norm": 1.7514153248574444, + "learning_rate": 1.996071982612495e-05, + "loss": 0.7886, + "step": 562 + }, + { + "epoch": 0.05750766087844739, + "grad_norm": 1.710241000449227, + "learning_rate": 1.996042633745866e-05, + "loss": 0.7457, + "step": 563 + }, + { + "epoch": 0.057609805924412665, + "grad_norm": 1.5905894639715203, + "learning_rate": 1.9960131758617267e-05, + "loss": 0.8155, + "step": 564 + }, + { + "epoch": 0.057711950970377936, + "grad_norm": 1.5460268595231133, + "learning_rate": 1.9959836089633016e-05, + "loss": 0.7128, + "step": 565 + }, + { + "epoch": 0.05781409601634321, + "grad_norm": 1.5691851861406998, + "learning_rate": 1.9959539330538274e-05, + "loss": 0.8104, + "step": 566 + }, + { + "epoch": 0.05791624106230848, + "grad_norm": 1.3747345740424768, + "learning_rate": 1.9959241481365516e-05, + "loss": 0.7899, + "step": 567 + }, + { + "epoch": 0.05801838610827375, + "grad_norm": 1.5894366067354702, + "learning_rate": 1.9958942542147342e-05, + "loss": 0.7379, + "step": 568 + }, + { + "epoch": 0.05812053115423902, + "grad_norm": 1.5656637984176518, + "learning_rate": 1.9958642512916475e-05, + "loss": 0.7532, + "step": 569 + }, + { + "epoch": 0.05822267620020429, + "grad_norm": 1.6208811836271306, + "learning_rate": 1.9958341393705753e-05, + "loss": 0.7773, + "step": 570 + }, + { + "epoch": 0.05832482124616956, + "grad_norm": 1.4510835220131038, + "learning_rate": 1.9958039184548124e-05, + "loss": 0.7486, + "step": 571 + }, + { + "epoch": 0.058426966292134834, + "grad_norm": 1.3265444317940052, + "learning_rate": 1.995773588547668e-05, + "loss": 0.6078, + "step": 572 + }, + { + "epoch": 0.058529111338100105, + "grad_norm": 1.7889459854017975, + "learning_rate": 1.9957431496524603e-05, + "loss": 0.7607, + "step": 573 + }, + { + "epoch": 0.05863125638406537, + "grad_norm": 1.6092639296922533, + "learning_rate": 1.995712601772522e-05, + "loss": 0.8071, + "step": 574 + }, + { + "epoch": 0.05873340143003064, + "grad_norm": 1.5569820386518356, + "learning_rate": 1.995681944911196e-05, + "loss": 0.7053, + "step": 575 + }, + { + "epoch": 0.05883554647599591, + "grad_norm": 1.4914129111365901, + "learning_rate": 1.995651179071838e-05, + "loss": 0.7801, + "step": 576 + }, + { + "epoch": 0.05893769152196118, + "grad_norm": 1.5695007891508992, + "learning_rate": 1.995620304257815e-05, + "loss": 0.6953, + "step": 577 + }, + { + "epoch": 0.059039836567926454, + "grad_norm": 1.4422134060583685, + "learning_rate": 1.9955893204725062e-05, + "loss": 0.7145, + "step": 578 + }, + { + "epoch": 0.059141981613891725, + "grad_norm": 1.6665487085279531, + "learning_rate": 1.9955582277193036e-05, + "loss": 0.7044, + "step": 579 + }, + { + "epoch": 0.059244126659857, + "grad_norm": 1.7181099967750908, + "learning_rate": 1.9955270260016096e-05, + "loss": 0.776, + "step": 580 + }, + { + "epoch": 0.05934627170582227, + "grad_norm": 1.590228139316221, + "learning_rate": 1.995495715322839e-05, + "loss": 0.6424, + "step": 581 + }, + { + "epoch": 0.05944841675178754, + "grad_norm": 1.589196741933278, + "learning_rate": 1.9954642956864198e-05, + "loss": 0.766, + "step": 582 + }, + { + "epoch": 0.05955056179775281, + "grad_norm": 1.6068330611078379, + "learning_rate": 1.9954327670957898e-05, + "loss": 0.8129, + "step": 583 + }, + { + "epoch": 0.05965270684371808, + "grad_norm": 1.5243254401184279, + "learning_rate": 1.9954011295544004e-05, + "loss": 0.8694, + "step": 584 + }, + { + "epoch": 0.05975485188968335, + "grad_norm": 1.5752865210891498, + "learning_rate": 1.9953693830657143e-05, + "loss": 0.748, + "step": 585 + }, + { + "epoch": 0.059856996935648624, + "grad_norm": 1.4165480654836802, + "learning_rate": 1.9953375276332064e-05, + "loss": 0.6782, + "step": 586 + }, + { + "epoch": 0.059959141981613895, + "grad_norm": 1.6993650859970464, + "learning_rate": 1.9953055632603627e-05, + "loss": 0.7063, + "step": 587 + }, + { + "epoch": 0.06006128702757916, + "grad_norm": 1.777596350073618, + "learning_rate": 1.995273489950682e-05, + "loss": 0.7054, + "step": 588 + }, + { + "epoch": 0.06016343207354443, + "grad_norm": 1.6533109003456685, + "learning_rate": 1.995241307707675e-05, + "loss": 0.755, + "step": 589 + }, + { + "epoch": 0.0602655771195097, + "grad_norm": 1.5631367505558615, + "learning_rate": 1.995209016534864e-05, + "loss": 0.7397, + "step": 590 + }, + { + "epoch": 0.06036772216547497, + "grad_norm": 1.64619328578527, + "learning_rate": 1.9951766164357827e-05, + "loss": 0.7568, + "step": 591 + }, + { + "epoch": 0.060469867211440244, + "grad_norm": 1.4740392958486426, + "learning_rate": 1.995144107413978e-05, + "loss": 0.7219, + "step": 592 + }, + { + "epoch": 0.060572012257405515, + "grad_norm": 1.6425538694037616, + "learning_rate": 1.995111489473008e-05, + "loss": 0.8175, + "step": 593 + }, + { + "epoch": 0.060674157303370786, + "grad_norm": 1.6721734378179682, + "learning_rate": 1.9950787626164424e-05, + "loss": 0.6805, + "step": 594 + }, + { + "epoch": 0.06077630234933606, + "grad_norm": 1.6541518838558171, + "learning_rate": 1.9950459268478632e-05, + "loss": 0.8255, + "step": 595 + }, + { + "epoch": 0.06087844739530133, + "grad_norm": 1.5076635760283956, + "learning_rate": 1.9950129821708644e-05, + "loss": 0.7373, + "step": 596 + }, + { + "epoch": 0.0609805924412666, + "grad_norm": 1.4708847455718552, + "learning_rate": 1.994979928589052e-05, + "loss": 0.7017, + "step": 597 + }, + { + "epoch": 0.06108273748723187, + "grad_norm": 1.4735246989363984, + "learning_rate": 1.9949467661060435e-05, + "loss": 0.6551, + "step": 598 + }, + { + "epoch": 0.06118488253319714, + "grad_norm": 1.620560433529078, + "learning_rate": 1.9949134947254687e-05, + "loss": 0.8583, + "step": 599 + }, + { + "epoch": 0.06128702757916241, + "grad_norm": 1.5202001725670053, + "learning_rate": 1.994880114450969e-05, + "loss": 0.7184, + "step": 600 + }, + { + "epoch": 0.061389172625127685, + "grad_norm": 1.6874192179963887, + "learning_rate": 1.9948466252861982e-05, + "loss": 0.6951, + "step": 601 + }, + { + "epoch": 0.06149131767109295, + "grad_norm": 1.4496333113262838, + "learning_rate": 1.9948130272348213e-05, + "loss": 0.6575, + "step": 602 + }, + { + "epoch": 0.06159346271705822, + "grad_norm": 1.5120439459007018, + "learning_rate": 1.9947793203005157e-05, + "loss": 0.768, + "step": 603 + }, + { + "epoch": 0.06169560776302349, + "grad_norm": 1.5308792722786435, + "learning_rate": 1.9947455044869716e-05, + "loss": 0.7208, + "step": 604 + }, + { + "epoch": 0.06179775280898876, + "grad_norm": 1.6111333480085013, + "learning_rate": 1.9947115797978886e-05, + "loss": 0.732, + "step": 605 + }, + { + "epoch": 0.061899897854954034, + "grad_norm": 1.4558751531569698, + "learning_rate": 1.9946775462369806e-05, + "loss": 0.6121, + "step": 606 + }, + { + "epoch": 0.062002042900919305, + "grad_norm": 1.50971799023137, + "learning_rate": 1.9946434038079724e-05, + "loss": 0.6609, + "step": 607 + }, + { + "epoch": 0.062104187946884576, + "grad_norm": 1.4495462026953028, + "learning_rate": 1.9946091525146015e-05, + "loss": 0.6942, + "step": 608 + }, + { + "epoch": 0.06220633299284985, + "grad_norm": 1.9258324900622656, + "learning_rate": 1.994574792360616e-05, + "loss": 0.7457, + "step": 609 + }, + { + "epoch": 0.06230847803881512, + "grad_norm": 1.5265900948180682, + "learning_rate": 1.9945403233497766e-05, + "loss": 0.6708, + "step": 610 + }, + { + "epoch": 0.06241062308478039, + "grad_norm": 1.6015515748634863, + "learning_rate": 1.994505745485857e-05, + "loss": 0.7489, + "step": 611 + }, + { + "epoch": 0.06251276813074566, + "grad_norm": 1.5696067837819851, + "learning_rate": 1.99447105877264e-05, + "loss": 0.7903, + "step": 612 + }, + { + "epoch": 0.06261491317671093, + "grad_norm": 1.5196455189921743, + "learning_rate": 1.994436263213924e-05, + "loss": 0.7553, + "step": 613 + }, + { + "epoch": 0.0627170582226762, + "grad_norm": 1.5666728170951127, + "learning_rate": 1.994401358813516e-05, + "loss": 0.8312, + "step": 614 + }, + { + "epoch": 0.06281920326864147, + "grad_norm": 1.3917764579346945, + "learning_rate": 1.994366345575237e-05, + "loss": 0.7198, + "step": 615 + }, + { + "epoch": 0.06292134831460675, + "grad_norm": 1.5003092381742327, + "learning_rate": 1.9943312235029192e-05, + "loss": 0.8565, + "step": 616 + }, + { + "epoch": 0.06302349336057202, + "grad_norm": 1.6050289324744085, + "learning_rate": 1.9942959926004065e-05, + "loss": 0.7621, + "step": 617 + }, + { + "epoch": 0.06312563840653729, + "grad_norm": 1.4683720072335726, + "learning_rate": 1.9942606528715547e-05, + "loss": 0.6446, + "step": 618 + }, + { + "epoch": 0.06322778345250256, + "grad_norm": 1.634182783379614, + "learning_rate": 1.9942252043202325e-05, + "loss": 0.7343, + "step": 619 + }, + { + "epoch": 0.06332992849846783, + "grad_norm": 1.5269348810724157, + "learning_rate": 1.994189646950319e-05, + "loss": 0.7183, + "step": 620 + }, + { + "epoch": 0.0634320735444331, + "grad_norm": 1.6680220462987225, + "learning_rate": 1.9941539807657064e-05, + "loss": 0.7474, + "step": 621 + }, + { + "epoch": 0.06353421859039837, + "grad_norm": 1.6384141210354473, + "learning_rate": 1.994118205770298e-05, + "loss": 0.7789, + "step": 622 + }, + { + "epoch": 0.06363636363636363, + "grad_norm": 1.6344947011807236, + "learning_rate": 1.9940823219680102e-05, + "loss": 0.7822, + "step": 623 + }, + { + "epoch": 0.0637385086823289, + "grad_norm": 1.5476356635630524, + "learning_rate": 1.99404632936277e-05, + "loss": 0.693, + "step": 624 + }, + { + "epoch": 0.06384065372829417, + "grad_norm": 1.5978943230502578, + "learning_rate": 1.9940102279585164e-05, + "loss": 0.746, + "step": 625 + }, + { + "epoch": 0.06394279877425944, + "grad_norm": 1.5417716602283837, + "learning_rate": 1.9939740177592015e-05, + "loss": 0.7295, + "step": 626 + }, + { + "epoch": 0.06404494382022471, + "grad_norm": 1.637605439818522, + "learning_rate": 1.9939376987687883e-05, + "loss": 0.7761, + "step": 627 + }, + { + "epoch": 0.06414708886618999, + "grad_norm": 1.6837369991094335, + "learning_rate": 1.9939012709912512e-05, + "loss": 0.7225, + "step": 628 + }, + { + "epoch": 0.06424923391215526, + "grad_norm": 1.5839454603856955, + "learning_rate": 1.9938647344305782e-05, + "loss": 0.7269, + "step": 629 + }, + { + "epoch": 0.06435137895812053, + "grad_norm": 1.6013596310684362, + "learning_rate": 1.993828089090768e-05, + "loss": 0.7591, + "step": 630 + }, + { + "epoch": 0.0644535240040858, + "grad_norm": 1.6984502816932658, + "learning_rate": 1.993791334975831e-05, + "loss": 0.8338, + "step": 631 + }, + { + "epoch": 0.06455566905005107, + "grad_norm": 1.5903603697224313, + "learning_rate": 1.9937544720897907e-05, + "loss": 0.8551, + "step": 632 + }, + { + "epoch": 0.06465781409601634, + "grad_norm": 1.7649648243138163, + "learning_rate": 1.9937175004366812e-05, + "loss": 0.8415, + "step": 633 + }, + { + "epoch": 0.06475995914198161, + "grad_norm": 1.5726902291857323, + "learning_rate": 1.9936804200205496e-05, + "loss": 0.7818, + "step": 634 + }, + { + "epoch": 0.06486210418794688, + "grad_norm": 1.674714523420893, + "learning_rate": 1.9936432308454537e-05, + "loss": 0.8311, + "step": 635 + }, + { + "epoch": 0.06496424923391216, + "grad_norm": 1.6467588792381955, + "learning_rate": 1.993605932915464e-05, + "loss": 0.6923, + "step": 636 + }, + { + "epoch": 0.06506639427987743, + "grad_norm": 1.509599656060883, + "learning_rate": 1.9935685262346634e-05, + "loss": 0.7742, + "step": 637 + }, + { + "epoch": 0.0651685393258427, + "grad_norm": 1.4578257392850222, + "learning_rate": 1.9935310108071453e-05, + "loss": 0.6792, + "step": 638 + }, + { + "epoch": 0.06527068437180797, + "grad_norm": 1.7251091820920132, + "learning_rate": 1.9934933866370162e-05, + "loss": 0.9141, + "step": 639 + }, + { + "epoch": 0.06537282941777324, + "grad_norm": 1.7072827191149216, + "learning_rate": 1.9934556537283946e-05, + "loss": 0.7508, + "step": 640 + }, + { + "epoch": 0.06547497446373851, + "grad_norm": 1.8231793662869957, + "learning_rate": 1.9934178120854095e-05, + "loss": 0.8053, + "step": 641 + }, + { + "epoch": 0.06557711950970378, + "grad_norm": 1.301354381951905, + "learning_rate": 1.9933798617122025e-05, + "loss": 0.7137, + "step": 642 + }, + { + "epoch": 0.06567926455566905, + "grad_norm": 1.600505738452158, + "learning_rate": 1.9933418026129286e-05, + "loss": 0.7759, + "step": 643 + }, + { + "epoch": 0.06578140960163432, + "grad_norm": 1.5899448311873305, + "learning_rate": 1.993303634791752e-05, + "loss": 0.8273, + "step": 644 + }, + { + "epoch": 0.0658835546475996, + "grad_norm": 1.6798411521996006, + "learning_rate": 1.9932653582528517e-05, + "loss": 0.7645, + "step": 645 + }, + { + "epoch": 0.06598569969356487, + "grad_norm": 1.7277547870483982, + "learning_rate": 1.9932269730004155e-05, + "loss": 0.7776, + "step": 646 + }, + { + "epoch": 0.06608784473953014, + "grad_norm": 1.5878907315721582, + "learning_rate": 1.9931884790386454e-05, + "loss": 0.8447, + "step": 647 + }, + { + "epoch": 0.06618998978549541, + "grad_norm": 1.6000326237782907, + "learning_rate": 1.9931498763717548e-05, + "loss": 0.7636, + "step": 648 + }, + { + "epoch": 0.06629213483146068, + "grad_norm": 1.6543339803988673, + "learning_rate": 1.9931111650039687e-05, + "loss": 0.6744, + "step": 649 + }, + { + "epoch": 0.06639427987742594, + "grad_norm": 1.472374993391623, + "learning_rate": 1.9930723449395236e-05, + "loss": 0.7685, + "step": 650 + }, + { + "epoch": 0.06649642492339121, + "grad_norm": 1.6415638653209483, + "learning_rate": 1.993033416182669e-05, + "loss": 0.7568, + "step": 651 + }, + { + "epoch": 0.06659856996935648, + "grad_norm": 1.6010789108313617, + "learning_rate": 1.9929943787376652e-05, + "loss": 0.7398, + "step": 652 + }, + { + "epoch": 0.06670071501532175, + "grad_norm": 1.5422819405802102, + "learning_rate": 1.9929552326087856e-05, + "loss": 0.8178, + "step": 653 + }, + { + "epoch": 0.06680286006128702, + "grad_norm": 1.5318593065550377, + "learning_rate": 1.9929159778003137e-05, + "loss": 0.7564, + "step": 654 + }, + { + "epoch": 0.0669050051072523, + "grad_norm": 1.7590596625204797, + "learning_rate": 1.9928766143165466e-05, + "loss": 0.7798, + "step": 655 + }, + { + "epoch": 0.06700715015321757, + "grad_norm": 1.551189663550595, + "learning_rate": 1.992837142161793e-05, + "loss": 0.8, + "step": 656 + }, + { + "epoch": 0.06710929519918284, + "grad_norm": 1.606162706518352, + "learning_rate": 1.992797561340372e-05, + "loss": 0.8859, + "step": 657 + }, + { + "epoch": 0.06721144024514811, + "grad_norm": 1.558381297421778, + "learning_rate": 1.9927578718566173e-05, + "loss": 0.6391, + "step": 658 + }, + { + "epoch": 0.06731358529111338, + "grad_norm": 1.5091285818872757, + "learning_rate": 1.9927180737148718e-05, + "loss": 0.7152, + "step": 659 + }, + { + "epoch": 0.06741573033707865, + "grad_norm": 1.4881232449263553, + "learning_rate": 1.9926781669194916e-05, + "loss": 0.822, + "step": 660 + }, + { + "epoch": 0.06751787538304392, + "grad_norm": 1.4684678201962496, + "learning_rate": 1.992638151474845e-05, + "loss": 0.705, + "step": 661 + }, + { + "epoch": 0.06762002042900919, + "grad_norm": 1.6144178141316767, + "learning_rate": 1.9925980273853108e-05, + "loss": 0.8541, + "step": 662 + }, + { + "epoch": 0.06772216547497446, + "grad_norm": 1.4072855254516001, + "learning_rate": 1.9925577946552815e-05, + "loss": 0.7339, + "step": 663 + }, + { + "epoch": 0.06782431052093973, + "grad_norm": 1.4531206148226108, + "learning_rate": 1.9925174532891604e-05, + "loss": 0.6722, + "step": 664 + }, + { + "epoch": 0.067926455566905, + "grad_norm": 1.6322310197028325, + "learning_rate": 1.992477003291363e-05, + "loss": 0.8147, + "step": 665 + }, + { + "epoch": 0.06802860061287028, + "grad_norm": 1.7031047741715521, + "learning_rate": 1.992436444666316e-05, + "loss": 0.8484, + "step": 666 + }, + { + "epoch": 0.06813074565883555, + "grad_norm": 1.464101499114788, + "learning_rate": 1.992395777418459e-05, + "loss": 0.7673, + "step": 667 + }, + { + "epoch": 0.06823289070480082, + "grad_norm": 1.5529368939218136, + "learning_rate": 1.992355001552243e-05, + "loss": 0.8471, + "step": 668 + }, + { + "epoch": 0.06833503575076609, + "grad_norm": 1.7265945033132117, + "learning_rate": 1.992314117072131e-05, + "loss": 0.7832, + "step": 669 + }, + { + "epoch": 0.06843718079673136, + "grad_norm": 1.6098369488798387, + "learning_rate": 1.9922731239825978e-05, + "loss": 0.8438, + "step": 670 + }, + { + "epoch": 0.06853932584269663, + "grad_norm": 1.557280759518139, + "learning_rate": 1.9922320222881303e-05, + "loss": 0.8054, + "step": 671 + }, + { + "epoch": 0.0686414708886619, + "grad_norm": 1.6145992300166256, + "learning_rate": 1.9921908119932264e-05, + "loss": 0.79, + "step": 672 + }, + { + "epoch": 0.06874361593462718, + "grad_norm": 1.5615568820912313, + "learning_rate": 1.992149493102397e-05, + "loss": 0.7661, + "step": 673 + }, + { + "epoch": 0.06884576098059245, + "grad_norm": 1.4485106388941436, + "learning_rate": 1.9921080656201656e-05, + "loss": 0.7599, + "step": 674 + }, + { + "epoch": 0.06894790602655772, + "grad_norm": 1.5911176880846032, + "learning_rate": 1.9920665295510644e-05, + "loss": 0.7543, + "step": 675 + }, + { + "epoch": 0.06905005107252299, + "grad_norm": 1.4857460673627971, + "learning_rate": 1.992024884899641e-05, + "loss": 0.8055, + "step": 676 + }, + { + "epoch": 0.06915219611848826, + "grad_norm": 1.7098513265507742, + "learning_rate": 1.9919831316704528e-05, + "loss": 0.7858, + "step": 677 + }, + { + "epoch": 0.06925434116445352, + "grad_norm": 1.5768377750627027, + "learning_rate": 1.9919412698680704e-05, + "loss": 0.8942, + "step": 678 + }, + { + "epoch": 0.06935648621041879, + "grad_norm": 1.5544200792278677, + "learning_rate": 1.9918992994970746e-05, + "loss": 0.7406, + "step": 679 + }, + { + "epoch": 0.06945863125638406, + "grad_norm": 1.683868501646567, + "learning_rate": 1.9918572205620598e-05, + "loss": 0.7079, + "step": 680 + }, + { + "epoch": 0.06956077630234933, + "grad_norm": 1.6008296559091235, + "learning_rate": 1.9918150330676314e-05, + "loss": 0.7637, + "step": 681 + }, + { + "epoch": 0.0696629213483146, + "grad_norm": 1.6282029718071185, + "learning_rate": 1.991772737018407e-05, + "loss": 0.7462, + "step": 682 + }, + { + "epoch": 0.06976506639427987, + "grad_norm": 1.5364638687936156, + "learning_rate": 1.9917303324190156e-05, + "loss": 0.7966, + "step": 683 + }, + { + "epoch": 0.06986721144024514, + "grad_norm": 1.5769647562019777, + "learning_rate": 1.9916878192740987e-05, + "loss": 0.7351, + "step": 684 + }, + { + "epoch": 0.06996935648621042, + "grad_norm": 1.572562450978231, + "learning_rate": 1.9916451975883092e-05, + "loss": 0.7753, + "step": 685 + }, + { + "epoch": 0.07007150153217569, + "grad_norm": 1.4796469239160819, + "learning_rate": 1.991602467366312e-05, + "loss": 0.8036, + "step": 686 + }, + { + "epoch": 0.07017364657814096, + "grad_norm": 1.7182920829827204, + "learning_rate": 1.9915596286127843e-05, + "loss": 0.8053, + "step": 687 + }, + { + "epoch": 0.07027579162410623, + "grad_norm": 1.6979670130254412, + "learning_rate": 1.9915166813324145e-05, + "loss": 0.831, + "step": 688 + }, + { + "epoch": 0.0703779366700715, + "grad_norm": 1.5195726888095025, + "learning_rate": 1.9914736255299033e-05, + "loss": 0.6631, + "step": 689 + }, + { + "epoch": 0.07048008171603677, + "grad_norm": 1.5128278782252023, + "learning_rate": 1.9914304612099633e-05, + "loss": 0.6983, + "step": 690 + }, + { + "epoch": 0.07058222676200204, + "grad_norm": 1.4687556704285503, + "learning_rate": 1.9913871883773188e-05, + "loss": 0.6949, + "step": 691 + }, + { + "epoch": 0.07068437180796731, + "grad_norm": 1.3865099430284489, + "learning_rate": 1.9913438070367058e-05, + "loss": 0.7254, + "step": 692 + }, + { + "epoch": 0.07078651685393259, + "grad_norm": 1.596453530090198, + "learning_rate": 1.9913003171928727e-05, + "loss": 0.7507, + "step": 693 + }, + { + "epoch": 0.07088866189989786, + "grad_norm": 1.53493286244014, + "learning_rate": 1.9912567188505796e-05, + "loss": 0.7905, + "step": 694 + }, + { + "epoch": 0.07099080694586313, + "grad_norm": 1.4433145177523503, + "learning_rate": 1.991213012014598e-05, + "loss": 0.7007, + "step": 695 + }, + { + "epoch": 0.0710929519918284, + "grad_norm": 1.5678657067346933, + "learning_rate": 1.9911691966897118e-05, + "loss": 0.8142, + "step": 696 + }, + { + "epoch": 0.07119509703779367, + "grad_norm": 1.4595333474514265, + "learning_rate": 1.9911252728807167e-05, + "loss": 0.8165, + "step": 697 + }, + { + "epoch": 0.07129724208375894, + "grad_norm": 1.5359168885617551, + "learning_rate": 1.99108124059242e-05, + "loss": 0.769, + "step": 698 + }, + { + "epoch": 0.07139938712972421, + "grad_norm": 1.6874209864064396, + "learning_rate": 1.991037099829642e-05, + "loss": 0.7905, + "step": 699 + }, + { + "epoch": 0.07150153217568948, + "grad_norm": 1.6732078623193523, + "learning_rate": 1.9909928505972123e-05, + "loss": 0.7971, + "step": 700 + }, + { + "epoch": 0.07160367722165475, + "grad_norm": 1.8307454778427759, + "learning_rate": 1.9909484928999752e-05, + "loss": 0.7915, + "step": 701 + }, + { + "epoch": 0.07170582226762003, + "grad_norm": 1.3265328922254678, + "learning_rate": 1.990904026742785e-05, + "loss": 0.6835, + "step": 702 + }, + { + "epoch": 0.0718079673135853, + "grad_norm": 1.5616331071149123, + "learning_rate": 1.990859452130509e-05, + "loss": 0.6752, + "step": 703 + }, + { + "epoch": 0.07191011235955057, + "grad_norm": 1.781903509047666, + "learning_rate": 1.990814769068026e-05, + "loss": 0.7371, + "step": 704 + }, + { + "epoch": 0.07201225740551584, + "grad_norm": 1.4997407588919809, + "learning_rate": 1.9907699775602262e-05, + "loss": 0.7359, + "step": 705 + }, + { + "epoch": 0.0721144024514811, + "grad_norm": 1.6027053774428606, + "learning_rate": 1.9907250776120123e-05, + "loss": 0.7568, + "step": 706 + }, + { + "epoch": 0.07221654749744637, + "grad_norm": 1.6407751251477933, + "learning_rate": 1.9906800692282983e-05, + "loss": 0.9391, + "step": 707 + }, + { + "epoch": 0.07231869254341164, + "grad_norm": 1.4735190523413029, + "learning_rate": 1.990634952414011e-05, + "loss": 0.7301, + "step": 708 + }, + { + "epoch": 0.07242083758937691, + "grad_norm": 1.4884076613314179, + "learning_rate": 1.990589727174088e-05, + "loss": 0.7576, + "step": 709 + }, + { + "epoch": 0.07252298263534218, + "grad_norm": 1.3524800634259417, + "learning_rate": 1.990544393513479e-05, + "loss": 0.6862, + "step": 710 + }, + { + "epoch": 0.07262512768130745, + "grad_norm": 1.6463373831241659, + "learning_rate": 1.9904989514371467e-05, + "loss": 0.7797, + "step": 711 + }, + { + "epoch": 0.07272727272727272, + "grad_norm": 1.5462069913992529, + "learning_rate": 1.990453400950064e-05, + "loss": 0.8187, + "step": 712 + }, + { + "epoch": 0.072829417773238, + "grad_norm": 1.699006101490331, + "learning_rate": 1.990407742057217e-05, + "loss": 0.743, + "step": 713 + }, + { + "epoch": 0.07293156281920327, + "grad_norm": 1.6061828755109313, + "learning_rate": 1.9903619747636022e-05, + "loss": 0.7521, + "step": 714 + }, + { + "epoch": 0.07303370786516854, + "grad_norm": 1.6243691110560676, + "learning_rate": 1.99031609907423e-05, + "loss": 0.8141, + "step": 715 + }, + { + "epoch": 0.07313585291113381, + "grad_norm": 1.65902958838249, + "learning_rate": 1.9902701149941204e-05, + "loss": 0.7947, + "step": 716 + }, + { + "epoch": 0.07323799795709908, + "grad_norm": 1.7082412412467654, + "learning_rate": 1.990224022528307e-05, + "loss": 0.8251, + "step": 717 + }, + { + "epoch": 0.07334014300306435, + "grad_norm": 1.6513869806788732, + "learning_rate": 1.9901778216818347e-05, + "loss": 0.7945, + "step": 718 + }, + { + "epoch": 0.07344228804902962, + "grad_norm": 1.4609448055067984, + "learning_rate": 1.99013151245976e-05, + "loss": 0.8098, + "step": 719 + }, + { + "epoch": 0.0735444330949949, + "grad_norm": 1.57467649884625, + "learning_rate": 1.9900850948671515e-05, + "loss": 0.7695, + "step": 720 + }, + { + "epoch": 0.07364657814096016, + "grad_norm": 1.5724447961707912, + "learning_rate": 1.99003856890909e-05, + "loss": 0.7731, + "step": 721 + }, + { + "epoch": 0.07374872318692544, + "grad_norm": 1.4186462889716356, + "learning_rate": 1.989991934590667e-05, + "loss": 0.7015, + "step": 722 + }, + { + "epoch": 0.0738508682328907, + "grad_norm": 1.621292148811876, + "learning_rate": 1.9899451919169875e-05, + "loss": 0.9366, + "step": 723 + }, + { + "epoch": 0.07395301327885598, + "grad_norm": 1.5438176975374884, + "learning_rate": 1.9898983408931668e-05, + "loss": 0.7126, + "step": 724 + }, + { + "epoch": 0.07405515832482125, + "grad_norm": 1.4776689784371302, + "learning_rate": 1.9898513815243338e-05, + "loss": 0.7993, + "step": 725 + }, + { + "epoch": 0.07415730337078652, + "grad_norm": 1.6348592038527079, + "learning_rate": 1.989804313815627e-05, + "loss": 0.8359, + "step": 726 + }, + { + "epoch": 0.07425944841675179, + "grad_norm": 1.5377131113836813, + "learning_rate": 1.9897571377721988e-05, + "loss": 0.7586, + "step": 727 + }, + { + "epoch": 0.07436159346271706, + "grad_norm": 1.586864813250946, + "learning_rate": 1.9897098533992122e-05, + "loss": 0.7267, + "step": 728 + }, + { + "epoch": 0.07446373850868233, + "grad_norm": 1.370270570433998, + "learning_rate": 1.9896624607018427e-05, + "loss": 0.7329, + "step": 729 + }, + { + "epoch": 0.0745658835546476, + "grad_norm": 1.5477175276987212, + "learning_rate": 1.989614959685278e-05, + "loss": 0.7014, + "step": 730 + }, + { + "epoch": 0.07466802860061288, + "grad_norm": 1.6205493800980881, + "learning_rate": 1.989567350354716e-05, + "loss": 0.7644, + "step": 731 + }, + { + "epoch": 0.07477017364657815, + "grad_norm": 1.4524553614458884, + "learning_rate": 1.9895196327153684e-05, + "loss": 0.7321, + "step": 732 + }, + { + "epoch": 0.0748723186925434, + "grad_norm": 1.5151719588795212, + "learning_rate": 1.9894718067724577e-05, + "loss": 0.7411, + "step": 733 + }, + { + "epoch": 0.07497446373850868, + "grad_norm": 1.4766899841660894, + "learning_rate": 1.9894238725312186e-05, + "loss": 0.8123, + "step": 734 + }, + { + "epoch": 0.07507660878447395, + "grad_norm": 1.547990257754269, + "learning_rate": 1.989375829996897e-05, + "loss": 0.7884, + "step": 735 + }, + { + "epoch": 0.07517875383043922, + "grad_norm": 1.5315344486305633, + "learning_rate": 1.989327679174752e-05, + "loss": 0.7777, + "step": 736 + }, + { + "epoch": 0.07528089887640449, + "grad_norm": 1.7029751826087458, + "learning_rate": 1.989279420070053e-05, + "loss": 0.7959, + "step": 737 + }, + { + "epoch": 0.07538304392236976, + "grad_norm": 1.4664468437029052, + "learning_rate": 1.9892310526880822e-05, + "loss": 0.7233, + "step": 738 + }, + { + "epoch": 0.07548518896833503, + "grad_norm": 1.6436581056046273, + "learning_rate": 1.989182577034134e-05, + "loss": 0.791, + "step": 739 + }, + { + "epoch": 0.0755873340143003, + "grad_norm": 1.7126920305247544, + "learning_rate": 1.9891339931135137e-05, + "loss": 0.8633, + "step": 740 + }, + { + "epoch": 0.07568947906026557, + "grad_norm": 1.4813937524265346, + "learning_rate": 1.989085300931538e-05, + "loss": 0.7213, + "step": 741 + }, + { + "epoch": 0.07579162410623085, + "grad_norm": 1.5995606942956262, + "learning_rate": 1.989036500493538e-05, + "loss": 0.7798, + "step": 742 + }, + { + "epoch": 0.07589376915219612, + "grad_norm": 1.5256275244075512, + "learning_rate": 1.9889875918048534e-05, + "loss": 0.7354, + "step": 743 + }, + { + "epoch": 0.07599591419816139, + "grad_norm": 1.518060767587871, + "learning_rate": 1.9889385748708382e-05, + "loss": 0.7424, + "step": 744 + }, + { + "epoch": 0.07609805924412666, + "grad_norm": 1.6176905243628699, + "learning_rate": 1.988889449696857e-05, + "loss": 0.7268, + "step": 745 + }, + { + "epoch": 0.07620020429009193, + "grad_norm": 1.2258361858494313, + "learning_rate": 1.9888402162882867e-05, + "loss": 0.6337, + "step": 746 + }, + { + "epoch": 0.0763023493360572, + "grad_norm": 1.4919956686934734, + "learning_rate": 1.988790874650516e-05, + "loss": 0.6678, + "step": 747 + }, + { + "epoch": 0.07640449438202247, + "grad_norm": 1.502786516150113, + "learning_rate": 1.988741424788945e-05, + "loss": 0.732, + "step": 748 + }, + { + "epoch": 0.07650663942798774, + "grad_norm": 1.602405416370382, + "learning_rate": 1.9886918667089864e-05, + "loss": 0.7905, + "step": 749 + }, + { + "epoch": 0.07660878447395301, + "grad_norm": 1.6610767815020735, + "learning_rate": 1.988642200416064e-05, + "loss": 0.7595, + "step": 750 + }, + { + "epoch": 0.07671092951991829, + "grad_norm": 1.4912374533280652, + "learning_rate": 1.988592425915614e-05, + "loss": 0.7594, + "step": 751 + }, + { + "epoch": 0.07681307456588356, + "grad_norm": 1.4204070325857612, + "learning_rate": 1.9885425432130842e-05, + "loss": 0.6754, + "step": 752 + }, + { + "epoch": 0.07691521961184883, + "grad_norm": 1.5317845821517249, + "learning_rate": 1.9884925523139347e-05, + "loss": 0.6774, + "step": 753 + }, + { + "epoch": 0.0770173646578141, + "grad_norm": 1.5919564661606977, + "learning_rate": 1.9884424532236366e-05, + "loss": 0.7577, + "step": 754 + }, + { + "epoch": 0.07711950970377937, + "grad_norm": 1.6650090508036093, + "learning_rate": 1.9883922459476734e-05, + "loss": 0.7953, + "step": 755 + }, + { + "epoch": 0.07722165474974464, + "grad_norm": 1.4379715547000977, + "learning_rate": 1.98834193049154e-05, + "loss": 0.7142, + "step": 756 + }, + { + "epoch": 0.07732379979570991, + "grad_norm": 1.4366099066907718, + "learning_rate": 1.988291506860744e-05, + "loss": 0.7266, + "step": 757 + }, + { + "epoch": 0.07742594484167518, + "grad_norm": 1.6099513116698367, + "learning_rate": 1.988240975060804e-05, + "loss": 0.7915, + "step": 758 + }, + { + "epoch": 0.07752808988764046, + "grad_norm": 1.5486251855676378, + "learning_rate": 1.9881903350972508e-05, + "loss": 0.8171, + "step": 759 + }, + { + "epoch": 0.07763023493360573, + "grad_norm": 1.4550210812471185, + "learning_rate": 1.9881395869756272e-05, + "loss": 0.6751, + "step": 760 + }, + { + "epoch": 0.07773237997957098, + "grad_norm": 1.4715384750036027, + "learning_rate": 1.988088730701487e-05, + "loss": 0.5825, + "step": 761 + }, + { + "epoch": 0.07783452502553626, + "grad_norm": 1.7486606767552886, + "learning_rate": 1.988037766280397e-05, + "loss": 0.7462, + "step": 762 + }, + { + "epoch": 0.07793667007150153, + "grad_norm": 1.6997136563404789, + "learning_rate": 1.987986693717935e-05, + "loss": 0.7463, + "step": 763 + }, + { + "epoch": 0.0780388151174668, + "grad_norm": 1.6543343326848066, + "learning_rate": 1.9879355130196914e-05, + "loss": 0.8159, + "step": 764 + }, + { + "epoch": 0.07814096016343207, + "grad_norm": 1.6581740189588405, + "learning_rate": 1.9878842241912672e-05, + "loss": 0.75, + "step": 765 + }, + { + "epoch": 0.07824310520939734, + "grad_norm": 1.480553301449181, + "learning_rate": 1.987832827238277e-05, + "loss": 0.7625, + "step": 766 + }, + { + "epoch": 0.07834525025536261, + "grad_norm": 1.6234046309946566, + "learning_rate": 1.987781322166345e-05, + "loss": 0.7769, + "step": 767 + }, + { + "epoch": 0.07844739530132788, + "grad_norm": 1.54715934175039, + "learning_rate": 1.9877297089811095e-05, + "loss": 0.7055, + "step": 768 + }, + { + "epoch": 0.07854954034729315, + "grad_norm": 1.5143382835940802, + "learning_rate": 1.987677987688219e-05, + "loss": 0.908, + "step": 769 + }, + { + "epoch": 0.07865168539325842, + "grad_norm": 1.528983724094074, + "learning_rate": 1.9876261582933348e-05, + "loss": 0.7632, + "step": 770 + }, + { + "epoch": 0.0787538304392237, + "grad_norm": 1.4173996927090302, + "learning_rate": 1.9875742208021292e-05, + "loss": 0.6574, + "step": 771 + }, + { + "epoch": 0.07885597548518897, + "grad_norm": 1.464518733576873, + "learning_rate": 1.9875221752202872e-05, + "loss": 0.6784, + "step": 772 + }, + { + "epoch": 0.07895812053115424, + "grad_norm": 1.3667126138228713, + "learning_rate": 1.9874700215535053e-05, + "loss": 0.7041, + "step": 773 + }, + { + "epoch": 0.07906026557711951, + "grad_norm": 1.583922414261538, + "learning_rate": 1.9874177598074915e-05, + "loss": 0.892, + "step": 774 + }, + { + "epoch": 0.07916241062308478, + "grad_norm": 1.654368210394596, + "learning_rate": 1.9873653899879655e-05, + "loss": 0.8314, + "step": 775 + }, + { + "epoch": 0.07926455566905005, + "grad_norm": 1.455394945038289, + "learning_rate": 1.9873129121006602e-05, + "loss": 0.674, + "step": 776 + }, + { + "epoch": 0.07936670071501532, + "grad_norm": 1.428920027582383, + "learning_rate": 1.9872603261513184e-05, + "loss": 0.6809, + "step": 777 + }, + { + "epoch": 0.0794688457609806, + "grad_norm": 1.3997409338536504, + "learning_rate": 1.9872076321456962e-05, + "loss": 0.7237, + "step": 778 + }, + { + "epoch": 0.07957099080694587, + "grad_norm": 1.4866396602756338, + "learning_rate": 1.987154830089561e-05, + "loss": 0.6898, + "step": 779 + }, + { + "epoch": 0.07967313585291114, + "grad_norm": 1.6736638969346522, + "learning_rate": 1.9871019199886916e-05, + "loss": 0.876, + "step": 780 + }, + { + "epoch": 0.07977528089887641, + "grad_norm": 1.5023448351180575, + "learning_rate": 1.9870489018488793e-05, + "loss": 0.7453, + "step": 781 + }, + { + "epoch": 0.07987742594484168, + "grad_norm": 1.4255061349577813, + "learning_rate": 1.9869957756759273e-05, + "loss": 0.7273, + "step": 782 + }, + { + "epoch": 0.07997957099080695, + "grad_norm": 1.5790262378656115, + "learning_rate": 1.9869425414756498e-05, + "loss": 0.5604, + "step": 783 + }, + { + "epoch": 0.08008171603677222, + "grad_norm": 1.5083152187830278, + "learning_rate": 1.986889199253873e-05, + "loss": 0.7644, + "step": 784 + }, + { + "epoch": 0.08018386108273749, + "grad_norm": 1.5375834404833606, + "learning_rate": 1.9868357490164367e-05, + "loss": 0.7871, + "step": 785 + }, + { + "epoch": 0.08028600612870276, + "grad_norm": 1.6478695551781142, + "learning_rate": 1.9867821907691894e-05, + "loss": 0.771, + "step": 786 + }, + { + "epoch": 0.08038815117466803, + "grad_norm": 1.7135939688805435, + "learning_rate": 1.986728524517994e-05, + "loss": 0.8638, + "step": 787 + }, + { + "epoch": 0.0804902962206333, + "grad_norm": 1.5666635072247177, + "learning_rate": 1.986674750268724e-05, + "loss": 0.7413, + "step": 788 + }, + { + "epoch": 0.08059244126659856, + "grad_norm": 1.5624782421230234, + "learning_rate": 1.9866208680272653e-05, + "loss": 0.7927, + "step": 789 + }, + { + "epoch": 0.08069458631256383, + "grad_norm": 1.417811274465328, + "learning_rate": 1.986566877799515e-05, + "loss": 0.763, + "step": 790 + }, + { + "epoch": 0.0807967313585291, + "grad_norm": 1.5093397415037746, + "learning_rate": 1.9865127795913826e-05, + "loss": 0.7269, + "step": 791 + }, + { + "epoch": 0.08089887640449438, + "grad_norm": 1.5983634921869176, + "learning_rate": 1.986458573408789e-05, + "loss": 0.8614, + "step": 792 + }, + { + "epoch": 0.08100102145045965, + "grad_norm": 1.4699588960762733, + "learning_rate": 1.9864042592576674e-05, + "loss": 0.6837, + "step": 793 + }, + { + "epoch": 0.08110316649642492, + "grad_norm": 1.6686341256597426, + "learning_rate": 1.986349837143962e-05, + "loss": 0.8628, + "step": 794 + }, + { + "epoch": 0.08120531154239019, + "grad_norm": 1.7327450098838917, + "learning_rate": 1.9862953070736298e-05, + "loss": 0.7782, + "step": 795 + }, + { + "epoch": 0.08130745658835546, + "grad_norm": 1.4915645258091934, + "learning_rate": 1.986240669052639e-05, + "loss": 0.7639, + "step": 796 + }, + { + "epoch": 0.08140960163432073, + "grad_norm": 1.682551088860698, + "learning_rate": 1.98618592308697e-05, + "loss": 0.858, + "step": 797 + }, + { + "epoch": 0.081511746680286, + "grad_norm": 1.2839623368381212, + "learning_rate": 1.9861310691826143e-05, + "loss": 0.6939, + "step": 798 + }, + { + "epoch": 0.08161389172625128, + "grad_norm": 1.4278261582709826, + "learning_rate": 1.986076107345576e-05, + "loss": 0.7933, + "step": 799 + }, + { + "epoch": 0.08171603677221655, + "grad_norm": 1.4489440344317464, + "learning_rate": 1.9860210375818707e-05, + "loss": 0.6521, + "step": 800 + }, + { + "epoch": 0.08181818181818182, + "grad_norm": 1.4721306914730188, + "learning_rate": 1.9859658598975257e-05, + "loss": 0.6779, + "step": 801 + }, + { + "epoch": 0.08192032686414709, + "grad_norm": 1.692738841349463, + "learning_rate": 1.9859105742985803e-05, + "loss": 0.8404, + "step": 802 + }, + { + "epoch": 0.08202247191011236, + "grad_norm": 1.5552306293363731, + "learning_rate": 1.9858551807910856e-05, + "loss": 0.7664, + "step": 803 + }, + { + "epoch": 0.08212461695607763, + "grad_norm": 1.7757978074082381, + "learning_rate": 1.9857996793811046e-05, + "loss": 0.8659, + "step": 804 + }, + { + "epoch": 0.0822267620020429, + "grad_norm": 1.360922155263416, + "learning_rate": 1.9857440700747118e-05, + "loss": 0.6373, + "step": 805 + }, + { + "epoch": 0.08232890704800817, + "grad_norm": 1.7338778585528292, + "learning_rate": 1.9856883528779934e-05, + "loss": 0.8424, + "step": 806 + }, + { + "epoch": 0.08243105209397344, + "grad_norm": 1.4599206311539763, + "learning_rate": 1.9856325277970484e-05, + "loss": 0.63, + "step": 807 + }, + { + "epoch": 0.08253319713993872, + "grad_norm": 1.5647437015477845, + "learning_rate": 1.9855765948379858e-05, + "loss": 0.735, + "step": 808 + }, + { + "epoch": 0.08263534218590399, + "grad_norm": 1.4430131020274404, + "learning_rate": 1.9855205540069288e-05, + "loss": 0.7342, + "step": 809 + }, + { + "epoch": 0.08273748723186926, + "grad_norm": 1.6340244378319413, + "learning_rate": 1.9854644053100097e-05, + "loss": 0.8719, + "step": 810 + }, + { + "epoch": 0.08283963227783453, + "grad_norm": 1.4633354893425299, + "learning_rate": 1.985408148753375e-05, + "loss": 0.6136, + "step": 811 + }, + { + "epoch": 0.0829417773237998, + "grad_norm": 1.3820516118314197, + "learning_rate": 1.9853517843431823e-05, + "loss": 0.8004, + "step": 812 + }, + { + "epoch": 0.08304392236976507, + "grad_norm": 1.3954143673228985, + "learning_rate": 1.9852953120855995e-05, + "loss": 0.754, + "step": 813 + }, + { + "epoch": 0.08314606741573034, + "grad_norm": 1.461763770126284, + "learning_rate": 1.9852387319868085e-05, + "loss": 0.7728, + "step": 814 + }, + { + "epoch": 0.08324821246169561, + "grad_norm": 1.5594550676812877, + "learning_rate": 1.9851820440530018e-05, + "loss": 0.7631, + "step": 815 + }, + { + "epoch": 0.08335035750766087, + "grad_norm": 1.4121140136898949, + "learning_rate": 1.9851252482903837e-05, + "loss": 0.706, + "step": 816 + }, + { + "epoch": 0.08345250255362614, + "grad_norm": 1.6768742025591088, + "learning_rate": 1.9850683447051707e-05, + "loss": 0.7596, + "step": 817 + }, + { + "epoch": 0.08355464759959141, + "grad_norm": 1.4840062183728497, + "learning_rate": 1.9850113333035913e-05, + "loss": 0.7564, + "step": 818 + }, + { + "epoch": 0.08365679264555669, + "grad_norm": 1.5074222259461636, + "learning_rate": 1.9849542140918847e-05, + "loss": 0.7703, + "step": 819 + }, + { + "epoch": 0.08375893769152196, + "grad_norm": 1.5065764301217976, + "learning_rate": 1.984896987076303e-05, + "loss": 0.7261, + "step": 820 + }, + { + "epoch": 0.08386108273748723, + "grad_norm": 1.4425233733404113, + "learning_rate": 1.98483965226311e-05, + "loss": 0.7128, + "step": 821 + }, + { + "epoch": 0.0839632277834525, + "grad_norm": 1.4782159146197773, + "learning_rate": 1.9847822096585805e-05, + "loss": 0.7477, + "step": 822 + }, + { + "epoch": 0.08406537282941777, + "grad_norm": 1.5409926640930578, + "learning_rate": 1.9847246592690022e-05, + "loss": 0.7657, + "step": 823 + }, + { + "epoch": 0.08416751787538304, + "grad_norm": 1.4147519294055058, + "learning_rate": 1.9846670011006735e-05, + "loss": 0.6879, + "step": 824 + }, + { + "epoch": 0.08426966292134831, + "grad_norm": 1.4840139264747112, + "learning_rate": 1.9846092351599054e-05, + "loss": 0.6602, + "step": 825 + }, + { + "epoch": 0.08437180796731358, + "grad_norm": 1.7122099507533417, + "learning_rate": 1.9845513614530203e-05, + "loss": 0.8124, + "step": 826 + }, + { + "epoch": 0.08447395301327885, + "grad_norm": 1.7229563796040182, + "learning_rate": 1.9844933799863526e-05, + "loss": 0.7928, + "step": 827 + }, + { + "epoch": 0.08457609805924413, + "grad_norm": 1.4135953103967285, + "learning_rate": 1.9844352907662486e-05, + "loss": 0.7566, + "step": 828 + }, + { + "epoch": 0.0846782431052094, + "grad_norm": 1.6710072642746758, + "learning_rate": 1.9843770937990658e-05, + "loss": 0.768, + "step": 829 + }, + { + "epoch": 0.08478038815117467, + "grad_norm": 1.525259579103551, + "learning_rate": 1.984318789091174e-05, + "loss": 0.7441, + "step": 830 + }, + { + "epoch": 0.08488253319713994, + "grad_norm": 1.4840255779132323, + "learning_rate": 1.984260376648955e-05, + "loss": 0.7261, + "step": 831 + }, + { + "epoch": 0.08498467824310521, + "grad_norm": 1.806837967751418, + "learning_rate": 1.984201856478802e-05, + "loss": 0.7181, + "step": 832 + }, + { + "epoch": 0.08508682328907048, + "grad_norm": 1.631149606829919, + "learning_rate": 1.9841432285871198e-05, + "loss": 0.7397, + "step": 833 + }, + { + "epoch": 0.08518896833503575, + "grad_norm": 1.4496522135636345, + "learning_rate": 1.984084492980325e-05, + "loss": 0.7489, + "step": 834 + }, + { + "epoch": 0.08529111338100102, + "grad_norm": 1.5294238044507162, + "learning_rate": 1.9840256496648468e-05, + "loss": 0.7066, + "step": 835 + }, + { + "epoch": 0.0853932584269663, + "grad_norm": 1.7448167131128816, + "learning_rate": 1.9839666986471256e-05, + "loss": 0.8274, + "step": 836 + }, + { + "epoch": 0.08549540347293157, + "grad_norm": 1.590341784894217, + "learning_rate": 1.9839076399336138e-05, + "loss": 0.8077, + "step": 837 + }, + { + "epoch": 0.08559754851889684, + "grad_norm": 1.5733700494905307, + "learning_rate": 1.983848473530775e-05, + "loss": 0.7376, + "step": 838 + }, + { + "epoch": 0.08569969356486211, + "grad_norm": 1.5794162519850896, + "learning_rate": 1.983789199445085e-05, + "loss": 0.798, + "step": 839 + }, + { + "epoch": 0.08580183861082738, + "grad_norm": 1.6813426649202494, + "learning_rate": 1.9837298176830317e-05, + "loss": 0.7654, + "step": 840 + }, + { + "epoch": 0.08590398365679265, + "grad_norm": 1.7162930259840956, + "learning_rate": 1.9836703282511137e-05, + "loss": 0.8303, + "step": 841 + }, + { + "epoch": 0.08600612870275792, + "grad_norm": 1.4623338096532696, + "learning_rate": 1.9836107311558434e-05, + "loss": 0.7273, + "step": 842 + }, + { + "epoch": 0.0861082737487232, + "grad_norm": 1.5255223717546944, + "learning_rate": 1.9835510264037426e-05, + "loss": 0.7602, + "step": 843 + }, + { + "epoch": 0.08621041879468845, + "grad_norm": 1.6161301368382748, + "learning_rate": 1.983491214001347e-05, + "loss": 0.6849, + "step": 844 + }, + { + "epoch": 0.08631256384065372, + "grad_norm": 1.7714352477479653, + "learning_rate": 1.9834312939552022e-05, + "loss": 0.7908, + "step": 845 + }, + { + "epoch": 0.086414708886619, + "grad_norm": 1.5888815693134404, + "learning_rate": 1.983371266271867e-05, + "loss": 0.749, + "step": 846 + }, + { + "epoch": 0.08651685393258426, + "grad_norm": 1.356413066008791, + "learning_rate": 1.9833111309579112e-05, + "loss": 0.6391, + "step": 847 + }, + { + "epoch": 0.08661899897854954, + "grad_norm": 1.6019716048238979, + "learning_rate": 1.983250888019917e-05, + "loss": 0.7782, + "step": 848 + }, + { + "epoch": 0.0867211440245148, + "grad_norm": 1.5384344743887846, + "learning_rate": 1.983190537464478e-05, + "loss": 0.7433, + "step": 849 + }, + { + "epoch": 0.08682328907048008, + "grad_norm": 1.4962183146123815, + "learning_rate": 1.9831300792981994e-05, + "loss": 0.7768, + "step": 850 + }, + { + "epoch": 0.08692543411644535, + "grad_norm": 1.368787082962646, + "learning_rate": 1.9830695135276982e-05, + "loss": 0.7428, + "step": 851 + }, + { + "epoch": 0.08702757916241062, + "grad_norm": 1.634690284174617, + "learning_rate": 1.9830088401596036e-05, + "loss": 0.6678, + "step": 852 + }, + { + "epoch": 0.08712972420837589, + "grad_norm": 1.6140357483361458, + "learning_rate": 1.9829480592005566e-05, + "loss": 0.6981, + "step": 853 + }, + { + "epoch": 0.08723186925434116, + "grad_norm": 1.4094843826197572, + "learning_rate": 1.9828871706572096e-05, + "loss": 0.6323, + "step": 854 + }, + { + "epoch": 0.08733401430030643, + "grad_norm": 1.686124900906538, + "learning_rate": 1.9828261745362262e-05, + "loss": 0.7347, + "step": 855 + }, + { + "epoch": 0.0874361593462717, + "grad_norm": 1.4063934258439823, + "learning_rate": 1.9827650708442836e-05, + "loss": 0.7882, + "step": 856 + }, + { + "epoch": 0.08753830439223698, + "grad_norm": 1.5190213871997271, + "learning_rate": 1.9827038595880688e-05, + "loss": 0.8105, + "step": 857 + }, + { + "epoch": 0.08764044943820225, + "grad_norm": 1.4380671354280348, + "learning_rate": 1.982642540774281e-05, + "loss": 0.6453, + "step": 858 + }, + { + "epoch": 0.08774259448416752, + "grad_norm": 1.5025846569855732, + "learning_rate": 1.9825811144096333e-05, + "loss": 0.8155, + "step": 859 + }, + { + "epoch": 0.08784473953013279, + "grad_norm": 1.4676118109568428, + "learning_rate": 1.9825195805008476e-05, + "loss": 0.7929, + "step": 860 + }, + { + "epoch": 0.08794688457609806, + "grad_norm": 1.3751748279834934, + "learning_rate": 1.9824579390546586e-05, + "loss": 0.6421, + "step": 861 + }, + { + "epoch": 0.08804902962206333, + "grad_norm": 1.4990570842691318, + "learning_rate": 1.982396190077814e-05, + "loss": 0.7473, + "step": 862 + }, + { + "epoch": 0.0881511746680286, + "grad_norm": 1.3410887995637522, + "learning_rate": 1.982334333577071e-05, + "loss": 0.7659, + "step": 863 + }, + { + "epoch": 0.08825331971399387, + "grad_norm": 1.4572545901732041, + "learning_rate": 1.9822723695592007e-05, + "loss": 0.6999, + "step": 864 + }, + { + "epoch": 0.08835546475995915, + "grad_norm": 1.552223699430407, + "learning_rate": 1.982210298030985e-05, + "loss": 0.7945, + "step": 865 + }, + { + "epoch": 0.08845760980592442, + "grad_norm": 1.6220141237598558, + "learning_rate": 1.9821481189992175e-05, + "loss": 0.7589, + "step": 866 + }, + { + "epoch": 0.08855975485188969, + "grad_norm": 1.6223640833234694, + "learning_rate": 1.982085832470704e-05, + "loss": 0.7834, + "step": 867 + }, + { + "epoch": 0.08866189989785496, + "grad_norm": 1.5556122987309737, + "learning_rate": 1.9820234384522617e-05, + "loss": 0.7302, + "step": 868 + }, + { + "epoch": 0.08876404494382023, + "grad_norm": 1.5130214699337237, + "learning_rate": 1.9819609369507194e-05, + "loss": 0.6756, + "step": 869 + }, + { + "epoch": 0.0888661899897855, + "grad_norm": 1.6280051708419594, + "learning_rate": 1.981898327972918e-05, + "loss": 0.7624, + "step": 870 + }, + { + "epoch": 0.08896833503575077, + "grad_norm": 1.6227536567478529, + "learning_rate": 1.9818356115257104e-05, + "loss": 0.8002, + "step": 871 + }, + { + "epoch": 0.08907048008171603, + "grad_norm": 1.5261325070193603, + "learning_rate": 1.981772787615961e-05, + "loss": 0.8195, + "step": 872 + }, + { + "epoch": 0.0891726251276813, + "grad_norm": 1.7044919112596812, + "learning_rate": 1.9817098562505454e-05, + "loss": 0.7506, + "step": 873 + }, + { + "epoch": 0.08927477017364657, + "grad_norm": 1.6259638475250309, + "learning_rate": 1.981646817436352e-05, + "loss": 0.7411, + "step": 874 + }, + { + "epoch": 0.08937691521961184, + "grad_norm": 1.405259031614385, + "learning_rate": 1.98158367118028e-05, + "loss": 0.7416, + "step": 875 + }, + { + "epoch": 0.08947906026557712, + "grad_norm": 1.6656868407488183, + "learning_rate": 1.981520417489241e-05, + "loss": 0.7209, + "step": 876 + }, + { + "epoch": 0.08958120531154239, + "grad_norm": 1.5492602806531655, + "learning_rate": 1.981457056370158e-05, + "loss": 0.7849, + "step": 877 + }, + { + "epoch": 0.08968335035750766, + "grad_norm": 1.4769806549032172, + "learning_rate": 1.9813935878299663e-05, + "loss": 0.7935, + "step": 878 + }, + { + "epoch": 0.08978549540347293, + "grad_norm": 1.4192083509317202, + "learning_rate": 1.9813300118756125e-05, + "loss": 0.6606, + "step": 879 + }, + { + "epoch": 0.0898876404494382, + "grad_norm": 1.4206634803657674, + "learning_rate": 1.9812663285140546e-05, + "loss": 0.875, + "step": 880 + }, + { + "epoch": 0.08998978549540347, + "grad_norm": 1.6049033800597063, + "learning_rate": 1.9812025377522633e-05, + "loss": 0.8148, + "step": 881 + }, + { + "epoch": 0.09009193054136874, + "grad_norm": 1.7258080579116855, + "learning_rate": 1.9811386395972202e-05, + "loss": 0.8029, + "step": 882 + }, + { + "epoch": 0.09019407558733401, + "grad_norm": 1.421843603111564, + "learning_rate": 1.981074634055919e-05, + "loss": 0.6283, + "step": 883 + }, + { + "epoch": 0.09029622063329928, + "grad_norm": 1.4575145172629398, + "learning_rate": 1.9810105211353656e-05, + "loss": 0.7996, + "step": 884 + }, + { + "epoch": 0.09039836567926456, + "grad_norm": 1.4893253618472304, + "learning_rate": 1.9809463008425765e-05, + "loss": 0.7306, + "step": 885 + }, + { + "epoch": 0.09050051072522983, + "grad_norm": 1.3534431765352035, + "learning_rate": 1.980881973184581e-05, + "loss": 0.6758, + "step": 886 + }, + { + "epoch": 0.0906026557711951, + "grad_norm": 1.4524154318699805, + "learning_rate": 1.98081753816842e-05, + "loss": 0.6753, + "step": 887 + }, + { + "epoch": 0.09070480081716037, + "grad_norm": 1.5538733477037465, + "learning_rate": 1.9807529958011457e-05, + "loss": 0.8651, + "step": 888 + }, + { + "epoch": 0.09080694586312564, + "grad_norm": 1.5585416799150469, + "learning_rate": 1.980688346089822e-05, + "loss": 0.7714, + "step": 889 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 1.5828426151284556, + "learning_rate": 1.9806235890415257e-05, + "loss": 0.7933, + "step": 890 + }, + { + "epoch": 0.09101123595505618, + "grad_norm": 1.6448461278910054, + "learning_rate": 1.9805587246633436e-05, + "loss": 0.7855, + "step": 891 + }, + { + "epoch": 0.09111338100102145, + "grad_norm": 1.530293547554862, + "learning_rate": 1.980493752962376e-05, + "loss": 0.7784, + "step": 892 + }, + { + "epoch": 0.09121552604698673, + "grad_norm": 1.495940270298718, + "learning_rate": 1.9804286739457335e-05, + "loss": 0.7604, + "step": 893 + }, + { + "epoch": 0.091317671092952, + "grad_norm": 1.4913273932204647, + "learning_rate": 1.980363487620539e-05, + "loss": 0.6655, + "step": 894 + }, + { + "epoch": 0.09141981613891727, + "grad_norm": 1.5773076757232762, + "learning_rate": 1.9802981939939276e-05, + "loss": 0.683, + "step": 895 + }, + { + "epoch": 0.09152196118488254, + "grad_norm": 1.5032513749087435, + "learning_rate": 1.9802327930730457e-05, + "loss": 0.715, + "step": 896 + }, + { + "epoch": 0.09162410623084781, + "grad_norm": 1.6378338568172568, + "learning_rate": 1.980167284865051e-05, + "loss": 0.7641, + "step": 897 + }, + { + "epoch": 0.09172625127681308, + "grad_norm": 1.6424016881391832, + "learning_rate": 1.9801016693771137e-05, + "loss": 0.7579, + "step": 898 + }, + { + "epoch": 0.09182839632277834, + "grad_norm": 1.5688594892105001, + "learning_rate": 1.9800359466164154e-05, + "loss": 0.8109, + "step": 899 + }, + { + "epoch": 0.09193054136874361, + "grad_norm": 1.553520275259485, + "learning_rate": 1.9799701165901498e-05, + "loss": 0.7681, + "step": 900 + }, + { + "epoch": 0.09203268641470888, + "grad_norm": 1.597249136006614, + "learning_rate": 1.9799041793055217e-05, + "loss": 0.7311, + "step": 901 + }, + { + "epoch": 0.09213483146067415, + "grad_norm": 1.712686887918941, + "learning_rate": 1.979838134769748e-05, + "loss": 0.6871, + "step": 902 + }, + { + "epoch": 0.09223697650663942, + "grad_norm": 1.4619654615917665, + "learning_rate": 1.9797719829900575e-05, + "loss": 0.7894, + "step": 903 + }, + { + "epoch": 0.0923391215526047, + "grad_norm": 1.539392201239084, + "learning_rate": 1.97970572397369e-05, + "loss": 0.647, + "step": 904 + }, + { + "epoch": 0.09244126659856997, + "grad_norm": 1.634055876247212, + "learning_rate": 1.9796393577278988e-05, + "loss": 0.83, + "step": 905 + }, + { + "epoch": 0.09254341164453524, + "grad_norm": 1.414305344368932, + "learning_rate": 1.9795728842599466e-05, + "loss": 0.6678, + "step": 906 + }, + { + "epoch": 0.09264555669050051, + "grad_norm": 1.7054663304351356, + "learning_rate": 1.979506303577109e-05, + "loss": 0.8213, + "step": 907 + }, + { + "epoch": 0.09274770173646578, + "grad_norm": 1.6482320243261614, + "learning_rate": 1.9794396156866738e-05, + "loss": 0.7433, + "step": 908 + }, + { + "epoch": 0.09284984678243105, + "grad_norm": 1.5641111015995306, + "learning_rate": 1.9793728205959405e-05, + "loss": 0.587, + "step": 909 + }, + { + "epoch": 0.09295199182839632, + "grad_norm": 1.4727159809697772, + "learning_rate": 1.9793059183122184e-05, + "loss": 0.8073, + "step": 910 + }, + { + "epoch": 0.09305413687436159, + "grad_norm": 1.4852320000582167, + "learning_rate": 1.9792389088428316e-05, + "loss": 0.8124, + "step": 911 + }, + { + "epoch": 0.09315628192032686, + "grad_norm": 1.3676632308089758, + "learning_rate": 1.9791717921951126e-05, + "loss": 0.7521, + "step": 912 + }, + { + "epoch": 0.09325842696629214, + "grad_norm": 1.333238580325899, + "learning_rate": 1.979104568376409e-05, + "loss": 0.6769, + "step": 913 + }, + { + "epoch": 0.0933605720122574, + "grad_norm": 1.4487597637598837, + "learning_rate": 1.979037237394078e-05, + "loss": 0.6905, + "step": 914 + }, + { + "epoch": 0.09346271705822268, + "grad_norm": 1.4693204822563655, + "learning_rate": 1.9789697992554884e-05, + "loss": 0.8051, + "step": 915 + }, + { + "epoch": 0.09356486210418795, + "grad_norm": 1.5911308747422055, + "learning_rate": 1.9789022539680215e-05, + "loss": 0.7371, + "step": 916 + }, + { + "epoch": 0.09366700715015322, + "grad_norm": 1.481090729042003, + "learning_rate": 1.978834601539071e-05, + "loss": 0.7916, + "step": 917 + }, + { + "epoch": 0.09376915219611849, + "grad_norm": 1.3255619406382855, + "learning_rate": 1.9787668419760407e-05, + "loss": 0.7514, + "step": 918 + }, + { + "epoch": 0.09387129724208376, + "grad_norm": 1.7346530489362686, + "learning_rate": 1.9786989752863475e-05, + "loss": 0.8005, + "step": 919 + }, + { + "epoch": 0.09397344228804903, + "grad_norm": 1.6633517908047575, + "learning_rate": 1.9786310014774187e-05, + "loss": 0.7166, + "step": 920 + }, + { + "epoch": 0.0940755873340143, + "grad_norm": 1.581306581079274, + "learning_rate": 1.9785629205566946e-05, + "loss": 0.7771, + "step": 921 + }, + { + "epoch": 0.09417773237997958, + "grad_norm": 1.3220695340545767, + "learning_rate": 1.9784947325316267e-05, + "loss": 0.8083, + "step": 922 + }, + { + "epoch": 0.09427987742594485, + "grad_norm": 1.758164691278845, + "learning_rate": 1.978426437409678e-05, + "loss": 0.8166, + "step": 923 + }, + { + "epoch": 0.09438202247191012, + "grad_norm": 1.563338732698904, + "learning_rate": 1.978358035198324e-05, + "loss": 0.8369, + "step": 924 + }, + { + "epoch": 0.09448416751787539, + "grad_norm": 1.4793236281382585, + "learning_rate": 1.9782895259050505e-05, + "loss": 0.7605, + "step": 925 + }, + { + "epoch": 0.09458631256384066, + "grad_norm": 1.5496608138805956, + "learning_rate": 1.9782209095373565e-05, + "loss": 0.8162, + "step": 926 + }, + { + "epoch": 0.09468845760980592, + "grad_norm": 1.448925366912586, + "learning_rate": 1.978152186102752e-05, + "loss": 0.7521, + "step": 927 + }, + { + "epoch": 0.09479060265577119, + "grad_norm": 1.4797535004677953, + "learning_rate": 1.9780833556087588e-05, + "loss": 0.6501, + "step": 928 + }, + { + "epoch": 0.09489274770173646, + "grad_norm": 1.4758269869906364, + "learning_rate": 1.9780144180629103e-05, + "loss": 0.7062, + "step": 929 + }, + { + "epoch": 0.09499489274770173, + "grad_norm": 1.3421072019631117, + "learning_rate": 1.977945373472752e-05, + "loss": 0.6394, + "step": 930 + }, + { + "epoch": 0.095097037793667, + "grad_norm": 1.4751750547065008, + "learning_rate": 1.9778762218458403e-05, + "loss": 0.7302, + "step": 931 + }, + { + "epoch": 0.09519918283963227, + "grad_norm": 1.457181462647776, + "learning_rate": 1.9778069631897446e-05, + "loss": 0.7136, + "step": 932 + }, + { + "epoch": 0.09530132788559754, + "grad_norm": 1.6367787019134947, + "learning_rate": 1.9777375975120453e-05, + "loss": 0.788, + "step": 933 + }, + { + "epoch": 0.09540347293156282, + "grad_norm": 1.5440112199616998, + "learning_rate": 1.9776681248203342e-05, + "loss": 0.7905, + "step": 934 + }, + { + "epoch": 0.09550561797752809, + "grad_norm": 1.482758153559448, + "learning_rate": 1.977598545122215e-05, + "loss": 0.8693, + "step": 935 + }, + { + "epoch": 0.09560776302349336, + "grad_norm": 1.5019975142719908, + "learning_rate": 1.9775288584253034e-05, + "loss": 0.7404, + "step": 936 + }, + { + "epoch": 0.09570990806945863, + "grad_norm": 1.4422841648228601, + "learning_rate": 1.9774590647372267e-05, + "loss": 0.7715, + "step": 937 + }, + { + "epoch": 0.0958120531154239, + "grad_norm": 1.6374522012972488, + "learning_rate": 1.977389164065624e-05, + "loss": 0.8055, + "step": 938 + }, + { + "epoch": 0.09591419816138917, + "grad_norm": 1.6181114850846268, + "learning_rate": 1.9773191564181454e-05, + "loss": 0.884, + "step": 939 + }, + { + "epoch": 0.09601634320735444, + "grad_norm": 1.6861344823706046, + "learning_rate": 1.9772490418024543e-05, + "loss": 0.7129, + "step": 940 + }, + { + "epoch": 0.09611848825331971, + "grad_norm": 1.6726536961960072, + "learning_rate": 1.977178820226224e-05, + "loss": 0.7871, + "step": 941 + }, + { + "epoch": 0.09622063329928499, + "grad_norm": 1.4630568723771824, + "learning_rate": 1.97710849169714e-05, + "loss": 0.7664, + "step": 942 + }, + { + "epoch": 0.09632277834525026, + "grad_norm": 1.4361129616472423, + "learning_rate": 1.9770380562229006e-05, + "loss": 0.635, + "step": 943 + }, + { + "epoch": 0.09642492339121553, + "grad_norm": 1.5865003913190385, + "learning_rate": 1.976967513811215e-05, + "loss": 0.8809, + "step": 944 + }, + { + "epoch": 0.0965270684371808, + "grad_norm": 1.4803357787221834, + "learning_rate": 1.9768968644698036e-05, + "loss": 0.6568, + "step": 945 + }, + { + "epoch": 0.09662921348314607, + "grad_norm": 1.477908993220323, + "learning_rate": 1.976826108206399e-05, + "loss": 0.7369, + "step": 946 + }, + { + "epoch": 0.09673135852911134, + "grad_norm": 1.4587094253692428, + "learning_rate": 1.976755245028746e-05, + "loss": 0.6607, + "step": 947 + }, + { + "epoch": 0.09683350357507661, + "grad_norm": 1.4658564717015132, + "learning_rate": 1.9766842749446004e-05, + "loss": 0.6987, + "step": 948 + }, + { + "epoch": 0.09693564862104188, + "grad_norm": 1.558194164125307, + "learning_rate": 1.97661319796173e-05, + "loss": 0.7635, + "step": 949 + }, + { + "epoch": 0.09703779366700716, + "grad_norm": 1.4704086617906666, + "learning_rate": 1.9765420140879137e-05, + "loss": 0.7255, + "step": 950 + }, + { + "epoch": 0.09713993871297243, + "grad_norm": 1.4730396591420127, + "learning_rate": 1.976470723330943e-05, + "loss": 0.6741, + "step": 951 + }, + { + "epoch": 0.0972420837589377, + "grad_norm": 1.3039910815210418, + "learning_rate": 1.9763993256986215e-05, + "loss": 0.6434, + "step": 952 + }, + { + "epoch": 0.09734422880490297, + "grad_norm": 1.4775103489065533, + "learning_rate": 1.976327821198762e-05, + "loss": 0.6766, + "step": 953 + }, + { + "epoch": 0.09744637385086824, + "grad_norm": 1.5776372083024406, + "learning_rate": 1.9762562098391922e-05, + "loss": 0.7452, + "step": 954 + }, + { + "epoch": 0.0975485188968335, + "grad_norm": 1.5583070802795673, + "learning_rate": 1.9761844916277496e-05, + "loss": 0.8046, + "step": 955 + }, + { + "epoch": 0.09765066394279877, + "grad_norm": 1.7470901466903863, + "learning_rate": 1.9761126665722835e-05, + "loss": 0.7099, + "step": 956 + }, + { + "epoch": 0.09775280898876404, + "grad_norm": 1.482867770568344, + "learning_rate": 1.9760407346806555e-05, + "loss": 0.7122, + "step": 957 + }, + { + "epoch": 0.09785495403472931, + "grad_norm": 1.4378085173546427, + "learning_rate": 1.9759686959607385e-05, + "loss": 0.7045, + "step": 958 + }, + { + "epoch": 0.09795709908069458, + "grad_norm": 1.5122287550337, + "learning_rate": 1.975896550420417e-05, + "loss": 0.7209, + "step": 959 + }, + { + "epoch": 0.09805924412665985, + "grad_norm": 1.7803133458264089, + "learning_rate": 1.9758242980675876e-05, + "loss": 0.7201, + "step": 960 + }, + { + "epoch": 0.09816138917262512, + "grad_norm": 1.4864698119183952, + "learning_rate": 1.9757519389101583e-05, + "loss": 0.7521, + "step": 961 + }, + { + "epoch": 0.0982635342185904, + "grad_norm": 1.5932527645758576, + "learning_rate": 1.975679472956049e-05, + "loss": 0.8326, + "step": 962 + }, + { + "epoch": 0.09836567926455567, + "grad_norm": 1.6543202167305613, + "learning_rate": 1.9756069002131908e-05, + "loss": 0.825, + "step": 963 + }, + { + "epoch": 0.09846782431052094, + "grad_norm": 1.4463693997998088, + "learning_rate": 1.9755342206895274e-05, + "loss": 0.7518, + "step": 964 + }, + { + "epoch": 0.09856996935648621, + "grad_norm": 1.6063518742478968, + "learning_rate": 1.975461434393013e-05, + "loss": 0.7253, + "step": 965 + }, + { + "epoch": 0.09867211440245148, + "grad_norm": 1.4877549812942281, + "learning_rate": 1.9753885413316143e-05, + "loss": 0.7249, + "step": 966 + }, + { + "epoch": 0.09877425944841675, + "grad_norm": 1.3326528855874147, + "learning_rate": 1.9753155415133097e-05, + "loss": 0.7369, + "step": 967 + }, + { + "epoch": 0.09887640449438202, + "grad_norm": 1.4714953511265507, + "learning_rate": 1.9752424349460888e-05, + "loss": 0.7083, + "step": 968 + }, + { + "epoch": 0.0989785495403473, + "grad_norm": 1.4045230827554696, + "learning_rate": 1.9751692216379538e-05, + "loss": 0.7008, + "step": 969 + }, + { + "epoch": 0.09908069458631256, + "grad_norm": 1.41148734085975, + "learning_rate": 1.975095901596917e-05, + "loss": 0.7569, + "step": 970 + }, + { + "epoch": 0.09918283963227784, + "grad_norm": 1.6547030130396438, + "learning_rate": 1.9750224748310036e-05, + "loss": 0.8257, + "step": 971 + }, + { + "epoch": 0.09928498467824311, + "grad_norm": 1.5953106596724391, + "learning_rate": 1.974948941348251e-05, + "loss": 0.7068, + "step": 972 + }, + { + "epoch": 0.09938712972420838, + "grad_norm": 1.6975504063540594, + "learning_rate": 1.9748753011567063e-05, + "loss": 0.7403, + "step": 973 + }, + { + "epoch": 0.09948927477017365, + "grad_norm": 1.6567869110170834, + "learning_rate": 1.9748015542644304e-05, + "loss": 0.7955, + "step": 974 + }, + { + "epoch": 0.09959141981613892, + "grad_norm": 1.4566763449068436, + "learning_rate": 1.9747277006794943e-05, + "loss": 0.675, + "step": 975 + }, + { + "epoch": 0.09969356486210419, + "grad_norm": 1.3840306808623692, + "learning_rate": 1.9746537404099818e-05, + "loss": 0.7338, + "step": 976 + }, + { + "epoch": 0.09979570990806946, + "grad_norm": 1.4520267334727888, + "learning_rate": 1.9745796734639877e-05, + "loss": 0.7519, + "step": 977 + }, + { + "epoch": 0.09989785495403473, + "grad_norm": 1.5906652066312208, + "learning_rate": 1.9745054998496186e-05, + "loss": 0.7921, + "step": 978 + }, + { + "epoch": 0.1, + "grad_norm": 1.8458016554749022, + "learning_rate": 1.974431219574993e-05, + "loss": 0.7262, + "step": 979 + }, + { + "epoch": 0.10010214504596528, + "grad_norm": 1.6046986126533127, + "learning_rate": 1.974356832648241e-05, + "loss": 0.7491, + "step": 980 + }, + { + "epoch": 0.10020429009193055, + "grad_norm": 1.4656463383202505, + "learning_rate": 1.9742823390775038e-05, + "loss": 0.7235, + "step": 981 + }, + { + "epoch": 0.1003064351378958, + "grad_norm": 1.6927113481283556, + "learning_rate": 1.9742077388709354e-05, + "loss": 0.8676, + "step": 982 + }, + { + "epoch": 0.10040858018386108, + "grad_norm": 1.5775193434317438, + "learning_rate": 1.9741330320367003e-05, + "loss": 0.8681, + "step": 983 + }, + { + "epoch": 0.10051072522982635, + "grad_norm": 1.4187466816216623, + "learning_rate": 1.9740582185829758e-05, + "loss": 0.7256, + "step": 984 + }, + { + "epoch": 0.10061287027579162, + "grad_norm": 1.7057356358705964, + "learning_rate": 1.9739832985179496e-05, + "loss": 0.8539, + "step": 985 + }, + { + "epoch": 0.10071501532175689, + "grad_norm": 1.436147638666035, + "learning_rate": 1.9739082718498225e-05, + "loss": 0.7029, + "step": 986 + }, + { + "epoch": 0.10081716036772216, + "grad_norm": 1.466806296811124, + "learning_rate": 1.9738331385868057e-05, + "loss": 0.7136, + "step": 987 + }, + { + "epoch": 0.10091930541368743, + "grad_norm": 1.5674119607445685, + "learning_rate": 1.9737578987371228e-05, + "loss": 0.7591, + "step": 988 + }, + { + "epoch": 0.1010214504596527, + "grad_norm": 1.586697300144906, + "learning_rate": 1.973682552309008e-05, + "loss": 0.8178, + "step": 989 + }, + { + "epoch": 0.10112359550561797, + "grad_norm": 1.5910089426981955, + "learning_rate": 1.9736070993107096e-05, + "loss": 0.7809, + "step": 990 + }, + { + "epoch": 0.10122574055158325, + "grad_norm": 1.405608333447748, + "learning_rate": 1.973531539750485e-05, + "loss": 0.7101, + "step": 991 + }, + { + "epoch": 0.10132788559754852, + "grad_norm": 1.3404674305678805, + "learning_rate": 1.973455873636604e-05, + "loss": 0.7214, + "step": 992 + }, + { + "epoch": 0.10143003064351379, + "grad_norm": 1.3524429057620553, + "learning_rate": 1.9733801009773493e-05, + "loss": 0.6585, + "step": 993 + }, + { + "epoch": 0.10153217568947906, + "grad_norm": 1.4920286250840071, + "learning_rate": 1.9733042217810132e-05, + "loss": 0.7548, + "step": 994 + }, + { + "epoch": 0.10163432073544433, + "grad_norm": 1.4909766479476947, + "learning_rate": 1.9732282360559013e-05, + "loss": 0.6958, + "step": 995 + }, + { + "epoch": 0.1017364657814096, + "grad_norm": 1.4771995706066727, + "learning_rate": 1.9731521438103302e-05, + "loss": 0.6878, + "step": 996 + }, + { + "epoch": 0.10183861082737487, + "grad_norm": 1.4062500744157633, + "learning_rate": 1.9730759450526283e-05, + "loss": 0.6565, + "step": 997 + }, + { + "epoch": 0.10194075587334014, + "grad_norm": 1.5604525269143246, + "learning_rate": 1.9729996397911354e-05, + "loss": 0.8305, + "step": 998 + }, + { + "epoch": 0.10204290091930542, + "grad_norm": 1.5364580628298345, + "learning_rate": 1.9729232280342035e-05, + "loss": 0.7152, + "step": 999 + }, + { + "epoch": 0.10214504596527069, + "grad_norm": 1.4468280092950851, + "learning_rate": 1.9728467097901954e-05, + "loss": 0.7943, + "step": 1000 + }, + { + "epoch": 0.10224719101123596, + "grad_norm": 1.3990501658212111, + "learning_rate": 1.9727700850674868e-05, + "loss": 0.6525, + "step": 1001 + }, + { + "epoch": 0.10234933605720123, + "grad_norm": 1.7582794258530305, + "learning_rate": 1.9726933538744636e-05, + "loss": 0.863, + "step": 1002 + }, + { + "epoch": 0.1024514811031665, + "grad_norm": 1.4979593135081044, + "learning_rate": 1.9726165162195244e-05, + "loss": 0.7641, + "step": 1003 + }, + { + "epoch": 0.10255362614913177, + "grad_norm": 1.633991705835641, + "learning_rate": 1.972539572111079e-05, + "loss": 0.6708, + "step": 1004 + }, + { + "epoch": 0.10265577119509704, + "grad_norm": 1.6315050251407412, + "learning_rate": 1.972462521557549e-05, + "loss": 0.7879, + "step": 1005 + }, + { + "epoch": 0.10275791624106231, + "grad_norm": 1.5218473411352251, + "learning_rate": 1.972385364567368e-05, + "loss": 0.693, + "step": 1006 + }, + { + "epoch": 0.10286006128702758, + "grad_norm": 1.537691951507889, + "learning_rate": 1.9723081011489808e-05, + "loss": 0.7028, + "step": 1007 + }, + { + "epoch": 0.10296220633299286, + "grad_norm": 1.5743260630199554, + "learning_rate": 1.9722307313108434e-05, + "loss": 0.7258, + "step": 1008 + }, + { + "epoch": 0.10306435137895813, + "grad_norm": 1.4768790424166727, + "learning_rate": 1.9721532550614243e-05, + "loss": 0.789, + "step": 1009 + }, + { + "epoch": 0.10316649642492338, + "grad_norm": 1.699258153445073, + "learning_rate": 1.9720756724092033e-05, + "loss": 0.6559, + "step": 1010 + }, + { + "epoch": 0.10326864147088866, + "grad_norm": 1.3808968819690532, + "learning_rate": 1.9719979833626723e-05, + "loss": 0.7556, + "step": 1011 + }, + { + "epoch": 0.10337078651685393, + "grad_norm": 1.4855624790134385, + "learning_rate": 1.971920187930334e-05, + "loss": 0.7814, + "step": 1012 + }, + { + "epoch": 0.1034729315628192, + "grad_norm": 1.5518038113996941, + "learning_rate": 1.9718422861207028e-05, + "loss": 0.725, + "step": 1013 + }, + { + "epoch": 0.10357507660878447, + "grad_norm": 1.6336153863697138, + "learning_rate": 1.9717642779423056e-05, + "loss": 0.9471, + "step": 1014 + }, + { + "epoch": 0.10367722165474974, + "grad_norm": 1.64745504326712, + "learning_rate": 1.9716861634036807e-05, + "loss": 0.7723, + "step": 1015 + }, + { + "epoch": 0.10377936670071501, + "grad_norm": 1.3576936215628461, + "learning_rate": 1.971607942513377e-05, + "loss": 0.6835, + "step": 1016 + }, + { + "epoch": 0.10388151174668028, + "grad_norm": 1.1858525253337149, + "learning_rate": 1.9715296152799563e-05, + "loss": 0.6177, + "step": 1017 + }, + { + "epoch": 0.10398365679264555, + "grad_norm": 1.6705054575036173, + "learning_rate": 1.9714511817119913e-05, + "loss": 0.7585, + "step": 1018 + }, + { + "epoch": 0.10408580183861083, + "grad_norm": 1.7704489777671324, + "learning_rate": 1.9713726418180673e-05, + "loss": 0.6826, + "step": 1019 + }, + { + "epoch": 0.1041879468845761, + "grad_norm": 1.3940675481157598, + "learning_rate": 1.97129399560678e-05, + "loss": 0.8696, + "step": 1020 + }, + { + "epoch": 0.10429009193054137, + "grad_norm": 1.5279804213259633, + "learning_rate": 1.971215243086737e-05, + "loss": 0.7345, + "step": 1021 + }, + { + "epoch": 0.10439223697650664, + "grad_norm": 1.6089632661400797, + "learning_rate": 1.971136384266559e-05, + "loss": 0.799, + "step": 1022 + }, + { + "epoch": 0.10449438202247191, + "grad_norm": 1.4371380693389366, + "learning_rate": 1.9710574191548755e-05, + "loss": 0.7716, + "step": 1023 + }, + { + "epoch": 0.10459652706843718, + "grad_norm": 1.490234195045288, + "learning_rate": 1.97097834776033e-05, + "loss": 0.8002, + "step": 1024 + }, + { + "epoch": 0.10469867211440245, + "grad_norm": 1.5363878532634723, + "learning_rate": 1.9708991700915775e-05, + "loss": 0.6424, + "step": 1025 + }, + { + "epoch": 0.10480081716036772, + "grad_norm": 1.72045974933191, + "learning_rate": 1.9708198861572834e-05, + "loss": 0.6349, + "step": 1026 + }, + { + "epoch": 0.104902962206333, + "grad_norm": 1.5713777215251707, + "learning_rate": 1.9707404959661253e-05, + "loss": 0.6532, + "step": 1027 + }, + { + "epoch": 0.10500510725229827, + "grad_norm": 1.862917125630126, + "learning_rate": 1.970660999526793e-05, + "loss": 0.8655, + "step": 1028 + }, + { + "epoch": 0.10510725229826354, + "grad_norm": 1.6590294079206702, + "learning_rate": 1.9705813968479873e-05, + "loss": 0.711, + "step": 1029 + }, + { + "epoch": 0.10520939734422881, + "grad_norm": 1.538498523148317, + "learning_rate": 1.97050168793842e-05, + "loss": 0.7911, + "step": 1030 + }, + { + "epoch": 0.10531154239019408, + "grad_norm": 1.6016890528590662, + "learning_rate": 1.9704218728068165e-05, + "loss": 0.8423, + "step": 1031 + }, + { + "epoch": 0.10541368743615935, + "grad_norm": 1.7442154812077557, + "learning_rate": 1.970341951461912e-05, + "loss": 0.873, + "step": 1032 + }, + { + "epoch": 0.10551583248212462, + "grad_norm": 1.5684267190963708, + "learning_rate": 1.970261923912454e-05, + "loss": 0.7215, + "step": 1033 + }, + { + "epoch": 0.10561797752808989, + "grad_norm": 1.3995705990407452, + "learning_rate": 1.9701817901672012e-05, + "loss": 0.6237, + "step": 1034 + }, + { + "epoch": 0.10572012257405516, + "grad_norm": 1.447184745783299, + "learning_rate": 1.970101550234925e-05, + "loss": 0.7456, + "step": 1035 + }, + { + "epoch": 0.10582226762002044, + "grad_norm": 1.584305978918886, + "learning_rate": 1.9700212041244075e-05, + "loss": 0.8151, + "step": 1036 + }, + { + "epoch": 0.1059244126659857, + "grad_norm": 1.5659777252377847, + "learning_rate": 1.969940751844442e-05, + "loss": 0.7402, + "step": 1037 + }, + { + "epoch": 0.10602655771195096, + "grad_norm": 1.5132296037226554, + "learning_rate": 1.969860193403835e-05, + "loss": 0.7419, + "step": 1038 + }, + { + "epoch": 0.10612870275791624, + "grad_norm": 1.6292286607140969, + "learning_rate": 1.969779528811403e-05, + "loss": 0.8968, + "step": 1039 + }, + { + "epoch": 0.1062308478038815, + "grad_norm": 1.5212551035984276, + "learning_rate": 1.969698758075975e-05, + "loss": 0.6529, + "step": 1040 + }, + { + "epoch": 0.10633299284984678, + "grad_norm": 1.5870000281839585, + "learning_rate": 1.9696178812063916e-05, + "loss": 0.7447, + "step": 1041 + }, + { + "epoch": 0.10643513789581205, + "grad_norm": 1.4670247471720168, + "learning_rate": 1.9695368982115045e-05, + "loss": 0.7211, + "step": 1042 + }, + { + "epoch": 0.10653728294177732, + "grad_norm": 1.5900334373794982, + "learning_rate": 1.9694558091001775e-05, + "loss": 0.7828, + "step": 1043 + }, + { + "epoch": 0.10663942798774259, + "grad_norm": 1.5754049427546888, + "learning_rate": 1.9693746138812857e-05, + "loss": 0.7867, + "step": 1044 + }, + { + "epoch": 0.10674157303370786, + "grad_norm": 1.662951472327358, + "learning_rate": 1.9692933125637164e-05, + "loss": 0.7096, + "step": 1045 + }, + { + "epoch": 0.10684371807967313, + "grad_norm": 1.4677802436261256, + "learning_rate": 1.9692119051563676e-05, + "loss": 0.7837, + "step": 1046 + }, + { + "epoch": 0.1069458631256384, + "grad_norm": 1.5147859973405229, + "learning_rate": 1.9691303916681496e-05, + "loss": 0.7031, + "step": 1047 + }, + { + "epoch": 0.10704800817160368, + "grad_norm": 1.6365709294718904, + "learning_rate": 1.9690487721079844e-05, + "loss": 0.7939, + "step": 1048 + }, + { + "epoch": 0.10715015321756895, + "grad_norm": 1.4352163050125037, + "learning_rate": 1.9689670464848046e-05, + "loss": 0.7386, + "step": 1049 + }, + { + "epoch": 0.10725229826353422, + "grad_norm": 1.8710853126370701, + "learning_rate": 1.9688852148075558e-05, + "loss": 0.7721, + "step": 1050 + }, + { + "epoch": 0.10735444330949949, + "grad_norm": 1.5375805206814441, + "learning_rate": 1.968803277085194e-05, + "loss": 0.7564, + "step": 1051 + }, + { + "epoch": 0.10745658835546476, + "grad_norm": 1.493112625445584, + "learning_rate": 1.9687212333266878e-05, + "loss": 0.6944, + "step": 1052 + }, + { + "epoch": 0.10755873340143003, + "grad_norm": 1.4681266876362211, + "learning_rate": 1.9686390835410166e-05, + "loss": 0.7386, + "step": 1053 + }, + { + "epoch": 0.1076608784473953, + "grad_norm": 1.504073201526041, + "learning_rate": 1.9685568277371722e-05, + "loss": 0.821, + "step": 1054 + }, + { + "epoch": 0.10776302349336057, + "grad_norm": 1.608016314217692, + "learning_rate": 1.9684744659241567e-05, + "loss": 0.8452, + "step": 1055 + }, + { + "epoch": 0.10786516853932585, + "grad_norm": 1.554589638810335, + "learning_rate": 1.9683919981109855e-05, + "loss": 0.7232, + "step": 1056 + }, + { + "epoch": 0.10796731358529112, + "grad_norm": 1.5565380453279256, + "learning_rate": 1.9683094243066846e-05, + "loss": 0.9074, + "step": 1057 + }, + { + "epoch": 0.10806945863125639, + "grad_norm": 1.4666115680684282, + "learning_rate": 1.9682267445202915e-05, + "loss": 0.7396, + "step": 1058 + }, + { + "epoch": 0.10817160367722166, + "grad_norm": 1.50996832618128, + "learning_rate": 1.9681439587608556e-05, + "loss": 0.8248, + "step": 1059 + }, + { + "epoch": 0.10827374872318693, + "grad_norm": 1.4959611080051747, + "learning_rate": 1.9680610670374377e-05, + "loss": 0.7332, + "step": 1060 + }, + { + "epoch": 0.1083758937691522, + "grad_norm": 1.5105977612121393, + "learning_rate": 1.967978069359111e-05, + "loss": 0.7496, + "step": 1061 + }, + { + "epoch": 0.10847803881511747, + "grad_norm": 1.4872475345984455, + "learning_rate": 1.9678949657349588e-05, + "loss": 0.7721, + "step": 1062 + }, + { + "epoch": 0.10858018386108274, + "grad_norm": 1.6243417248933554, + "learning_rate": 1.9678117561740775e-05, + "loss": 0.6835, + "step": 1063 + }, + { + "epoch": 0.10868232890704801, + "grad_norm": 1.536704023708454, + "learning_rate": 1.967728440685574e-05, + "loss": 0.7915, + "step": 1064 + }, + { + "epoch": 0.10878447395301327, + "grad_norm": 1.5305739582406057, + "learning_rate": 1.9676450192785678e-05, + "loss": 0.8078, + "step": 1065 + }, + { + "epoch": 0.10888661899897854, + "grad_norm": 1.4659650067504544, + "learning_rate": 1.967561491962189e-05, + "loss": 0.7155, + "step": 1066 + }, + { + "epoch": 0.10898876404494381, + "grad_norm": 1.446269152380502, + "learning_rate": 1.9674778587455794e-05, + "loss": 0.6517, + "step": 1067 + }, + { + "epoch": 0.10909090909090909, + "grad_norm": 1.5716526221715839, + "learning_rate": 1.9673941196378934e-05, + "loss": 0.7789, + "step": 1068 + }, + { + "epoch": 0.10919305413687436, + "grad_norm": 1.5825011813906076, + "learning_rate": 1.967310274648296e-05, + "loss": 0.7292, + "step": 1069 + }, + { + "epoch": 0.10929519918283963, + "grad_norm": 1.5485946834973099, + "learning_rate": 1.967226323785964e-05, + "loss": 0.7593, + "step": 1070 + }, + { + "epoch": 0.1093973442288049, + "grad_norm": 1.5215823121876997, + "learning_rate": 1.9671422670600856e-05, + "loss": 0.792, + "step": 1071 + }, + { + "epoch": 0.10949948927477017, + "grad_norm": 1.453396643211816, + "learning_rate": 1.967058104479862e-05, + "loss": 0.747, + "step": 1072 + }, + { + "epoch": 0.10960163432073544, + "grad_norm": 1.4168331086486683, + "learning_rate": 1.9669738360545035e-05, + "loss": 0.7049, + "step": 1073 + }, + { + "epoch": 0.10970377936670071, + "grad_norm": 1.3263318882519275, + "learning_rate": 1.966889461793234e-05, + "loss": 0.7235, + "step": 1074 + }, + { + "epoch": 0.10980592441266598, + "grad_norm": 1.6194805510631765, + "learning_rate": 1.966804981705288e-05, + "loss": 0.7342, + "step": 1075 + }, + { + "epoch": 0.10990806945863126, + "grad_norm": 1.491035625213636, + "learning_rate": 1.9667203957999127e-05, + "loss": 0.716, + "step": 1076 + }, + { + "epoch": 0.11001021450459653, + "grad_norm": 1.5293161670236433, + "learning_rate": 1.9666357040863652e-05, + "loss": 0.6948, + "step": 1077 + }, + { + "epoch": 0.1101123595505618, + "grad_norm": 1.4427104989338841, + "learning_rate": 1.966550906573915e-05, + "loss": 0.7067, + "step": 1078 + }, + { + "epoch": 0.11021450459652707, + "grad_norm": 1.5419103208271614, + "learning_rate": 1.966466003271844e-05, + "loss": 0.7692, + "step": 1079 + }, + { + "epoch": 0.11031664964249234, + "grad_norm": 1.287527417083186, + "learning_rate": 1.966380994189444e-05, + "loss": 0.7177, + "step": 1080 + }, + { + "epoch": 0.11041879468845761, + "grad_norm": 1.5432135302878576, + "learning_rate": 1.9662958793360206e-05, + "loss": 0.7233, + "step": 1081 + }, + { + "epoch": 0.11052093973442288, + "grad_norm": 1.5879548534579644, + "learning_rate": 1.9662106587208886e-05, + "loss": 0.8102, + "step": 1082 + }, + { + "epoch": 0.11062308478038815, + "grad_norm": 1.5714630736277018, + "learning_rate": 1.9661253323533757e-05, + "loss": 0.8048, + "step": 1083 + }, + { + "epoch": 0.11072522982635342, + "grad_norm": 1.4428855909461007, + "learning_rate": 1.966039900242821e-05, + "loss": 0.7014, + "step": 1084 + }, + { + "epoch": 0.1108273748723187, + "grad_norm": 1.6870074077482777, + "learning_rate": 1.965954362398575e-05, + "loss": 0.8031, + "step": 1085 + }, + { + "epoch": 0.11092951991828397, + "grad_norm": 1.5615804599702627, + "learning_rate": 1.96586871883e-05, + "loss": 0.7303, + "step": 1086 + }, + { + "epoch": 0.11103166496424924, + "grad_norm": 1.508248958415601, + "learning_rate": 1.9657829695464698e-05, + "loss": 0.832, + "step": 1087 + }, + { + "epoch": 0.11113381001021451, + "grad_norm": 1.5302067947637197, + "learning_rate": 1.9656971145573697e-05, + "loss": 0.7552, + "step": 1088 + }, + { + "epoch": 0.11123595505617978, + "grad_norm": 1.520783082734965, + "learning_rate": 1.965611153872096e-05, + "loss": 0.7495, + "step": 1089 + }, + { + "epoch": 0.11133810010214505, + "grad_norm": 1.3272489177167366, + "learning_rate": 1.965525087500058e-05, + "loss": 0.6495, + "step": 1090 + }, + { + "epoch": 0.11144024514811032, + "grad_norm": 1.730070806759725, + "learning_rate": 1.9654389154506754e-05, + "loss": 0.759, + "step": 1091 + }, + { + "epoch": 0.1115423901940756, + "grad_norm": 1.4720479151362003, + "learning_rate": 1.9653526377333796e-05, + "loss": 0.6225, + "step": 1092 + }, + { + "epoch": 0.11164453524004085, + "grad_norm": 1.6282340642885975, + "learning_rate": 1.965266254357614e-05, + "loss": 0.8137, + "step": 1093 + }, + { + "epoch": 0.11174668028600612, + "grad_norm": 1.5395414105341485, + "learning_rate": 1.9651797653328332e-05, + "loss": 0.6893, + "step": 1094 + }, + { + "epoch": 0.1118488253319714, + "grad_norm": 1.4591337079925815, + "learning_rate": 1.9650931706685036e-05, + "loss": 0.7059, + "step": 1095 + }, + { + "epoch": 0.11195097037793666, + "grad_norm": 1.5163459799789831, + "learning_rate": 1.965006470374103e-05, + "loss": 0.7266, + "step": 1096 + }, + { + "epoch": 0.11205311542390194, + "grad_norm": 1.56214228902307, + "learning_rate": 1.9649196644591203e-05, + "loss": 0.7501, + "step": 1097 + }, + { + "epoch": 0.11215526046986721, + "grad_norm": 1.6932561748304926, + "learning_rate": 1.9648327529330574e-05, + "loss": 0.7603, + "step": 1098 + }, + { + "epoch": 0.11225740551583248, + "grad_norm": 1.5758000546280584, + "learning_rate": 1.9647457358054258e-05, + "loss": 0.7621, + "step": 1099 + }, + { + "epoch": 0.11235955056179775, + "grad_norm": 1.6192369089675644, + "learning_rate": 1.9646586130857504e-05, + "loss": 0.7703, + "step": 1100 + }, + { + "epoch": 0.11246169560776302, + "grad_norm": 1.6423027363222782, + "learning_rate": 1.9645713847835666e-05, + "loss": 0.7898, + "step": 1101 + }, + { + "epoch": 0.11256384065372829, + "grad_norm": 1.5697964937421576, + "learning_rate": 1.9644840509084218e-05, + "loss": 0.8043, + "step": 1102 + }, + { + "epoch": 0.11266598569969356, + "grad_norm": 1.3761870021062004, + "learning_rate": 1.964396611469874e-05, + "loss": 0.6692, + "step": 1103 + }, + { + "epoch": 0.11276813074565883, + "grad_norm": 1.501636530068321, + "learning_rate": 1.964309066477494e-05, + "loss": 0.7291, + "step": 1104 + }, + { + "epoch": 0.1128702757916241, + "grad_norm": 1.3787217226253703, + "learning_rate": 1.9642214159408637e-05, + "loss": 0.7984, + "step": 1105 + }, + { + "epoch": 0.11297242083758938, + "grad_norm": 1.5683555170323638, + "learning_rate": 1.964133659869577e-05, + "loss": 0.8193, + "step": 1106 + }, + { + "epoch": 0.11307456588355465, + "grad_norm": 1.4264983752144587, + "learning_rate": 1.964045798273238e-05, + "loss": 0.7473, + "step": 1107 + }, + { + "epoch": 0.11317671092951992, + "grad_norm": 1.529251767852379, + "learning_rate": 1.9639578311614633e-05, + "loss": 0.8392, + "step": 1108 + }, + { + "epoch": 0.11327885597548519, + "grad_norm": 1.5241632486928207, + "learning_rate": 1.9638697585438816e-05, + "loss": 0.701, + "step": 1109 + }, + { + "epoch": 0.11338100102145046, + "grad_norm": 1.57816366971916, + "learning_rate": 1.9637815804301315e-05, + "loss": 0.7713, + "step": 1110 + }, + { + "epoch": 0.11348314606741573, + "grad_norm": 1.6561260930902602, + "learning_rate": 1.9636932968298652e-05, + "loss": 0.7932, + "step": 1111 + }, + { + "epoch": 0.113585291113381, + "grad_norm": 1.5135538410194025, + "learning_rate": 1.963604907752745e-05, + "loss": 0.6946, + "step": 1112 + }, + { + "epoch": 0.11368743615934628, + "grad_norm": 1.4696545723569394, + "learning_rate": 1.9635164132084447e-05, + "loss": 0.7785, + "step": 1113 + }, + { + "epoch": 0.11378958120531155, + "grad_norm": 1.711207260627573, + "learning_rate": 1.9634278132066503e-05, + "loss": 0.831, + "step": 1114 + }, + { + "epoch": 0.11389172625127682, + "grad_norm": 1.4100080671510697, + "learning_rate": 1.96333910775706e-05, + "loss": 0.7305, + "step": 1115 + }, + { + "epoch": 0.11399387129724209, + "grad_norm": 1.3506679078293322, + "learning_rate": 1.9632502968693815e-05, + "loss": 0.6626, + "step": 1116 + }, + { + "epoch": 0.11409601634320736, + "grad_norm": 1.5736982046628725, + "learning_rate": 1.9631613805533357e-05, + "loss": 0.8463, + "step": 1117 + }, + { + "epoch": 0.11419816138917263, + "grad_norm": 1.6030400426484313, + "learning_rate": 1.9630723588186544e-05, + "loss": 0.7356, + "step": 1118 + }, + { + "epoch": 0.1143003064351379, + "grad_norm": 1.591432351263304, + "learning_rate": 1.9629832316750814e-05, + "loss": 0.8572, + "step": 1119 + }, + { + "epoch": 0.11440245148110317, + "grad_norm": 1.4525598995329345, + "learning_rate": 1.9628939991323717e-05, + "loss": 0.7119, + "step": 1120 + }, + { + "epoch": 0.11450459652706843, + "grad_norm": 1.4730526995554696, + "learning_rate": 1.9628046612002912e-05, + "loss": 0.7002, + "step": 1121 + }, + { + "epoch": 0.1146067415730337, + "grad_norm": 1.6655097959241574, + "learning_rate": 1.9627152178886192e-05, + "loss": 0.7306, + "step": 1122 + }, + { + "epoch": 0.11470888661899897, + "grad_norm": 1.4487324452504642, + "learning_rate": 1.9626256692071443e-05, + "loss": 0.7038, + "step": 1123 + }, + { + "epoch": 0.11481103166496424, + "grad_norm": 1.459069387552358, + "learning_rate": 1.9625360151656676e-05, + "loss": 0.7114, + "step": 1124 + }, + { + "epoch": 0.11491317671092952, + "grad_norm": 1.4514532613334061, + "learning_rate": 1.9624462557740026e-05, + "loss": 0.7659, + "step": 1125 + }, + { + "epoch": 0.11501532175689479, + "grad_norm": 1.6161659026524873, + "learning_rate": 1.9623563910419725e-05, + "loss": 0.8431, + "step": 1126 + }, + { + "epoch": 0.11511746680286006, + "grad_norm": 1.6585267557248564, + "learning_rate": 1.9622664209794143e-05, + "loss": 0.7978, + "step": 1127 + }, + { + "epoch": 0.11521961184882533, + "grad_norm": 1.5705270438937884, + "learning_rate": 1.9621763455961743e-05, + "loss": 0.8928, + "step": 1128 + }, + { + "epoch": 0.1153217568947906, + "grad_norm": 1.490333260246094, + "learning_rate": 1.9620861649021114e-05, + "loss": 0.7757, + "step": 1129 + }, + { + "epoch": 0.11542390194075587, + "grad_norm": 1.393001386740254, + "learning_rate": 1.9619958789070964e-05, + "loss": 0.7052, + "step": 1130 + }, + { + "epoch": 0.11552604698672114, + "grad_norm": 1.4685504649997314, + "learning_rate": 1.961905487621011e-05, + "loss": 0.7579, + "step": 1131 + }, + { + "epoch": 0.11562819203268641, + "grad_norm": 1.6085513164929615, + "learning_rate": 1.9618149910537486e-05, + "loss": 0.7252, + "step": 1132 + }, + { + "epoch": 0.11573033707865168, + "grad_norm": 1.6250195972146972, + "learning_rate": 1.961724389215214e-05, + "loss": 0.8278, + "step": 1133 + }, + { + "epoch": 0.11583248212461696, + "grad_norm": 1.5661964553561925, + "learning_rate": 1.9616336821153234e-05, + "loss": 0.6943, + "step": 1134 + }, + { + "epoch": 0.11593462717058223, + "grad_norm": 1.4715500789745353, + "learning_rate": 1.961542869764005e-05, + "loss": 0.7396, + "step": 1135 + }, + { + "epoch": 0.1160367722165475, + "grad_norm": 1.579461063679608, + "learning_rate": 1.961451952171199e-05, + "loss": 0.7588, + "step": 1136 + }, + { + "epoch": 0.11613891726251277, + "grad_norm": 1.6259954884744157, + "learning_rate": 1.9613609293468547e-05, + "loss": 0.7895, + "step": 1137 + }, + { + "epoch": 0.11624106230847804, + "grad_norm": 1.6283075636519224, + "learning_rate": 1.961269801300936e-05, + "loss": 0.7949, + "step": 1138 + }, + { + "epoch": 0.11634320735444331, + "grad_norm": 1.4242442476519803, + "learning_rate": 1.9611785680434163e-05, + "loss": 0.6252, + "step": 1139 + }, + { + "epoch": 0.11644535240040858, + "grad_norm": 1.6164926118724838, + "learning_rate": 1.9610872295842817e-05, + "loss": 0.6829, + "step": 1140 + }, + { + "epoch": 0.11654749744637385, + "grad_norm": 1.4699206989738522, + "learning_rate": 1.9609957859335284e-05, + "loss": 0.7251, + "step": 1141 + }, + { + "epoch": 0.11664964249233913, + "grad_norm": 1.6834752269614504, + "learning_rate": 1.960904237101166e-05, + "loss": 0.799, + "step": 1142 + }, + { + "epoch": 0.1167517875383044, + "grad_norm": 1.6881694148931172, + "learning_rate": 1.9608125830972137e-05, + "loss": 0.8069, + "step": 1143 + }, + { + "epoch": 0.11685393258426967, + "grad_norm": 1.5036997167358905, + "learning_rate": 1.9607208239317033e-05, + "loss": 0.8254, + "step": 1144 + }, + { + "epoch": 0.11695607763023494, + "grad_norm": 1.4370888244086095, + "learning_rate": 1.9606289596146778e-05, + "loss": 0.6801, + "step": 1145 + }, + { + "epoch": 0.11705822267620021, + "grad_norm": 1.5769554592067168, + "learning_rate": 1.9605369901561925e-05, + "loss": 0.8396, + "step": 1146 + }, + { + "epoch": 0.11716036772216548, + "grad_norm": 1.472737712399418, + "learning_rate": 1.960444915566313e-05, + "loss": 0.7266, + "step": 1147 + }, + { + "epoch": 0.11726251276813074, + "grad_norm": 1.4592928041257078, + "learning_rate": 1.9603527358551168e-05, + "loss": 0.7671, + "step": 1148 + }, + { + "epoch": 0.11736465781409601, + "grad_norm": 1.6234213775530273, + "learning_rate": 1.960260451032693e-05, + "loss": 0.8097, + "step": 1149 + }, + { + "epoch": 0.11746680286006128, + "grad_norm": 1.411561192994768, + "learning_rate": 1.960168061109143e-05, + "loss": 0.7693, + "step": 1150 + }, + { + "epoch": 0.11756894790602655, + "grad_norm": 1.5513596604042557, + "learning_rate": 1.960075566094578e-05, + "loss": 0.7398, + "step": 1151 + }, + { + "epoch": 0.11767109295199182, + "grad_norm": 1.6576902488653413, + "learning_rate": 1.9599829659991218e-05, + "loss": 0.8122, + "step": 1152 + }, + { + "epoch": 0.1177732379979571, + "grad_norm": 1.2937566604443824, + "learning_rate": 1.9598902608329103e-05, + "loss": 0.6592, + "step": 1153 + }, + { + "epoch": 0.11787538304392237, + "grad_norm": 1.5144225465854024, + "learning_rate": 1.959797450606089e-05, + "loss": 0.6989, + "step": 1154 + }, + { + "epoch": 0.11797752808988764, + "grad_norm": 1.6574589217740479, + "learning_rate": 1.9597045353288168e-05, + "loss": 0.8498, + "step": 1155 + }, + { + "epoch": 0.11807967313585291, + "grad_norm": 1.4082506086397073, + "learning_rate": 1.9596115150112634e-05, + "loss": 0.6985, + "step": 1156 + }, + { + "epoch": 0.11818181818181818, + "grad_norm": 1.508702350477482, + "learning_rate": 1.9595183896636096e-05, + "loss": 0.7123, + "step": 1157 + }, + { + "epoch": 0.11828396322778345, + "grad_norm": 1.5366612361137204, + "learning_rate": 1.959425159296048e-05, + "loss": 0.8071, + "step": 1158 + }, + { + "epoch": 0.11838610827374872, + "grad_norm": 1.420998684150009, + "learning_rate": 1.9593318239187827e-05, + "loss": 0.7303, + "step": 1159 + }, + { + "epoch": 0.118488253319714, + "grad_norm": 1.5498408206922487, + "learning_rate": 1.9592383835420297e-05, + "loss": 0.7587, + "step": 1160 + }, + { + "epoch": 0.11859039836567926, + "grad_norm": 1.469113125338137, + "learning_rate": 1.959144838176016e-05, + "loss": 0.6966, + "step": 1161 + }, + { + "epoch": 0.11869254341164454, + "grad_norm": 1.48914328035654, + "learning_rate": 1.95905118783098e-05, + "loss": 0.7145, + "step": 1162 + }, + { + "epoch": 0.1187946884576098, + "grad_norm": 1.5182866382588345, + "learning_rate": 1.958957432517172e-05, + "loss": 0.8128, + "step": 1163 + }, + { + "epoch": 0.11889683350357508, + "grad_norm": 1.5061765053438796, + "learning_rate": 1.958863572244853e-05, + "loss": 0.6881, + "step": 1164 + }, + { + "epoch": 0.11899897854954035, + "grad_norm": 1.5422965648610834, + "learning_rate": 1.958769607024297e-05, + "loss": 0.7557, + "step": 1165 + }, + { + "epoch": 0.11910112359550562, + "grad_norm": 1.5167992794347664, + "learning_rate": 1.9586755368657877e-05, + "loss": 0.8036, + "step": 1166 + }, + { + "epoch": 0.11920326864147089, + "grad_norm": 1.677729482694591, + "learning_rate": 1.9585813617796216e-05, + "loss": 0.7432, + "step": 1167 + }, + { + "epoch": 0.11930541368743616, + "grad_norm": 1.6046651107365564, + "learning_rate": 1.958487081776106e-05, + "loss": 0.773, + "step": 1168 + }, + { + "epoch": 0.11940755873340143, + "grad_norm": 1.5176283952203122, + "learning_rate": 1.9583926968655605e-05, + "loss": 0.7842, + "step": 1169 + }, + { + "epoch": 0.1195097037793667, + "grad_norm": 1.6493618120879856, + "learning_rate": 1.9582982070583147e-05, + "loss": 0.7523, + "step": 1170 + }, + { + "epoch": 0.11961184882533198, + "grad_norm": 1.5673699702571005, + "learning_rate": 1.9582036123647113e-05, + "loss": 0.7886, + "step": 1171 + }, + { + "epoch": 0.11971399387129725, + "grad_norm": 1.587491673900439, + "learning_rate": 1.958108912795103e-05, + "loss": 0.7634, + "step": 1172 + }, + { + "epoch": 0.11981613891726252, + "grad_norm": 1.515341532472603, + "learning_rate": 1.9580141083598555e-05, + "loss": 0.7304, + "step": 1173 + }, + { + "epoch": 0.11991828396322779, + "grad_norm": 1.5622614848581975, + "learning_rate": 1.9579191990693448e-05, + "loss": 0.7874, + "step": 1174 + }, + { + "epoch": 0.12002042900919306, + "grad_norm": 1.3199339007577178, + "learning_rate": 1.957824184933959e-05, + "loss": 0.6765, + "step": 1175 + }, + { + "epoch": 0.12012257405515832, + "grad_norm": 1.6096869492673378, + "learning_rate": 1.957729065964097e-05, + "loss": 0.7237, + "step": 1176 + }, + { + "epoch": 0.12022471910112359, + "grad_norm": 1.61113041065203, + "learning_rate": 1.9576338421701704e-05, + "loss": 0.8896, + "step": 1177 + }, + { + "epoch": 0.12032686414708886, + "grad_norm": 1.5691180985889015, + "learning_rate": 1.957538513562601e-05, + "loss": 0.7583, + "step": 1178 + }, + { + "epoch": 0.12042900919305413, + "grad_norm": 1.5851408398053064, + "learning_rate": 1.9574430801518224e-05, + "loss": 0.7986, + "step": 1179 + }, + { + "epoch": 0.1205311542390194, + "grad_norm": 1.5157203742961871, + "learning_rate": 1.95734754194828e-05, + "loss": 0.7079, + "step": 1180 + }, + { + "epoch": 0.12063329928498467, + "grad_norm": 1.5218548154218454, + "learning_rate": 1.957251898962431e-05, + "loss": 0.6954, + "step": 1181 + }, + { + "epoch": 0.12073544433094995, + "grad_norm": 1.6277891174330776, + "learning_rate": 1.9571561512047426e-05, + "loss": 0.7921, + "step": 1182 + }, + { + "epoch": 0.12083758937691522, + "grad_norm": 1.3707842287380825, + "learning_rate": 1.9570602986856956e-05, + "loss": 0.7534, + "step": 1183 + }, + { + "epoch": 0.12093973442288049, + "grad_norm": 1.6508759615286497, + "learning_rate": 1.9569643414157804e-05, + "loss": 0.7422, + "step": 1184 + }, + { + "epoch": 0.12104187946884576, + "grad_norm": 1.4064717885342997, + "learning_rate": 1.9568682794055e-05, + "loss": 0.8545, + "step": 1185 + }, + { + "epoch": 0.12114402451481103, + "grad_norm": 1.5297819422412304, + "learning_rate": 1.956772112665368e-05, + "loss": 0.7621, + "step": 1186 + }, + { + "epoch": 0.1212461695607763, + "grad_norm": 1.6584398639779026, + "learning_rate": 1.9566758412059098e-05, + "loss": 0.9568, + "step": 1187 + }, + { + "epoch": 0.12134831460674157, + "grad_norm": 1.5671169981433042, + "learning_rate": 1.956579465037663e-05, + "loss": 0.8319, + "step": 1188 + }, + { + "epoch": 0.12145045965270684, + "grad_norm": 1.5780710279331873, + "learning_rate": 1.9564829841711756e-05, + "loss": 0.6693, + "step": 1189 + }, + { + "epoch": 0.12155260469867211, + "grad_norm": 1.543220695185644, + "learning_rate": 1.956386398617008e-05, + "loss": 0.7546, + "step": 1190 + }, + { + "epoch": 0.12165474974463739, + "grad_norm": 1.513080354208372, + "learning_rate": 1.9562897083857306e-05, + "loss": 0.8116, + "step": 1191 + }, + { + "epoch": 0.12175689479060266, + "grad_norm": 1.4152589378323934, + "learning_rate": 1.956192913487927e-05, + "loss": 0.7406, + "step": 1192 + }, + { + "epoch": 0.12185903983656793, + "grad_norm": 1.502052507526006, + "learning_rate": 1.9560960139341912e-05, + "loss": 0.7904, + "step": 1193 + }, + { + "epoch": 0.1219611848825332, + "grad_norm": 1.4209718536726594, + "learning_rate": 1.955999009735129e-05, + "loss": 0.6884, + "step": 1194 + }, + { + "epoch": 0.12206332992849847, + "grad_norm": 1.487880776322938, + "learning_rate": 1.9559019009013575e-05, + "loss": 0.8, + "step": 1195 + }, + { + "epoch": 0.12216547497446374, + "grad_norm": 1.5596167530552554, + "learning_rate": 1.9558046874435053e-05, + "loss": 0.7751, + "step": 1196 + }, + { + "epoch": 0.12226762002042901, + "grad_norm": 1.595673203573733, + "learning_rate": 1.9557073693722127e-05, + "loss": 0.6974, + "step": 1197 + }, + { + "epoch": 0.12236976506639428, + "grad_norm": 1.5430194348370545, + "learning_rate": 1.955609946698131e-05, + "loss": 0.8015, + "step": 1198 + }, + { + "epoch": 0.12247191011235956, + "grad_norm": 1.5942440693844087, + "learning_rate": 1.9555124194319235e-05, + "loss": 0.6994, + "step": 1199 + }, + { + "epoch": 0.12257405515832483, + "grad_norm": 1.5154002049573354, + "learning_rate": 1.955414787584264e-05, + "loss": 0.82, + "step": 1200 + }, + { + "epoch": 0.1226762002042901, + "grad_norm": 1.4479051219079702, + "learning_rate": 1.955317051165839e-05, + "loss": 0.7366, + "step": 1201 + }, + { + "epoch": 0.12277834525025537, + "grad_norm": 1.5359739446267027, + "learning_rate": 1.955219210187345e-05, + "loss": 0.697, + "step": 1202 + }, + { + "epoch": 0.12288049029622064, + "grad_norm": 1.429308799480471, + "learning_rate": 1.955121264659492e-05, + "loss": 0.6499, + "step": 1203 + }, + { + "epoch": 0.1229826353421859, + "grad_norm": 1.4372434832685075, + "learning_rate": 1.955023214592999e-05, + "loss": 0.6346, + "step": 1204 + }, + { + "epoch": 0.12308478038815117, + "grad_norm": 1.5091235348899132, + "learning_rate": 1.9549250599985982e-05, + "loss": 0.8377, + "step": 1205 + }, + { + "epoch": 0.12318692543411644, + "grad_norm": 1.6379882550955176, + "learning_rate": 1.954826800887033e-05, + "loss": 0.7423, + "step": 1206 + }, + { + "epoch": 0.12328907048008171, + "grad_norm": 1.4341127596488723, + "learning_rate": 1.9547284372690568e-05, + "loss": 0.8556, + "step": 1207 + }, + { + "epoch": 0.12339121552604698, + "grad_norm": 1.480754107242904, + "learning_rate": 1.9546299691554368e-05, + "loss": 0.7573, + "step": 1208 + }, + { + "epoch": 0.12349336057201225, + "grad_norm": 1.6924205463669784, + "learning_rate": 1.95453139655695e-05, + "loss": 0.851, + "step": 1209 + }, + { + "epoch": 0.12359550561797752, + "grad_norm": 1.4876519480318136, + "learning_rate": 1.954432719484385e-05, + "loss": 0.8429, + "step": 1210 + }, + { + "epoch": 0.1236976506639428, + "grad_norm": 1.6435897107326578, + "learning_rate": 1.954333937948542e-05, + "loss": 0.8184, + "step": 1211 + }, + { + "epoch": 0.12379979570990807, + "grad_norm": 1.4616993314083842, + "learning_rate": 1.9542350519602334e-05, + "loss": 0.7895, + "step": 1212 + }, + { + "epoch": 0.12390194075587334, + "grad_norm": 1.6124068707634893, + "learning_rate": 1.9541360615302815e-05, + "loss": 0.8018, + "step": 1213 + }, + { + "epoch": 0.12400408580183861, + "grad_norm": 1.5408240938846258, + "learning_rate": 1.9540369666695213e-05, + "loss": 0.8217, + "step": 1214 + }, + { + "epoch": 0.12410623084780388, + "grad_norm": 1.4938413820397354, + "learning_rate": 1.9539377673887986e-05, + "loss": 0.7922, + "step": 1215 + }, + { + "epoch": 0.12420837589376915, + "grad_norm": 1.477164919560015, + "learning_rate": 1.953838463698971e-05, + "loss": 0.7786, + "step": 1216 + }, + { + "epoch": 0.12431052093973442, + "grad_norm": 1.5690190584247838, + "learning_rate": 1.9537390556109073e-05, + "loss": 0.7951, + "step": 1217 + }, + { + "epoch": 0.1244126659856997, + "grad_norm": 1.451598369396406, + "learning_rate": 1.9536395431354877e-05, + "loss": 0.8184, + "step": 1218 + }, + { + "epoch": 0.12451481103166497, + "grad_norm": 1.5424861400142693, + "learning_rate": 1.9535399262836045e-05, + "loss": 0.7921, + "step": 1219 + }, + { + "epoch": 0.12461695607763024, + "grad_norm": 1.348330119934175, + "learning_rate": 1.9534402050661596e-05, + "loss": 0.6841, + "step": 1220 + }, + { + "epoch": 0.12471910112359551, + "grad_norm": 1.4802448089171993, + "learning_rate": 1.9533403794940685e-05, + "loss": 0.7919, + "step": 1221 + }, + { + "epoch": 0.12482124616956078, + "grad_norm": 1.7236807896478832, + "learning_rate": 1.953240449578257e-05, + "loss": 0.7787, + "step": 1222 + }, + { + "epoch": 0.12492339121552605, + "grad_norm": 1.6457683616895697, + "learning_rate": 1.9531404153296624e-05, + "loss": 0.8624, + "step": 1223 + }, + { + "epoch": 0.12502553626149132, + "grad_norm": 1.5388545431253615, + "learning_rate": 1.9530402767592337e-05, + "loss": 0.7026, + "step": 1224 + }, + { + "epoch": 0.12512768130745658, + "grad_norm": 1.6011602411064116, + "learning_rate": 1.9529400338779304e-05, + "loss": 0.8759, + "step": 1225 + }, + { + "epoch": 0.12522982635342186, + "grad_norm": 1.5443837398441254, + "learning_rate": 1.952839686696725e-05, + "loss": 0.7175, + "step": 1226 + }, + { + "epoch": 0.12533197139938712, + "grad_norm": 1.5533678977592649, + "learning_rate": 1.9527392352266007e-05, + "loss": 0.8163, + "step": 1227 + }, + { + "epoch": 0.1254341164453524, + "grad_norm": 1.5194944724104429, + "learning_rate": 1.9526386794785514e-05, + "loss": 0.7864, + "step": 1228 + }, + { + "epoch": 0.12553626149131766, + "grad_norm": 1.5596736490843033, + "learning_rate": 1.9525380194635825e-05, + "loss": 0.855, + "step": 1229 + }, + { + "epoch": 0.12563840653728295, + "grad_norm": 1.501947991926002, + "learning_rate": 1.9524372551927128e-05, + "loss": 0.7163, + "step": 1230 + }, + { + "epoch": 0.1257405515832482, + "grad_norm": 1.590038770143226, + "learning_rate": 1.9523363866769695e-05, + "loss": 0.9486, + "step": 1231 + }, + { + "epoch": 0.1258426966292135, + "grad_norm": 1.3737947858009558, + "learning_rate": 1.9522354139273937e-05, + "loss": 0.6417, + "step": 1232 + }, + { + "epoch": 0.12594484167517875, + "grad_norm": 1.4189486373843732, + "learning_rate": 1.9521343369550365e-05, + "loss": 0.7811, + "step": 1233 + }, + { + "epoch": 0.12604698672114403, + "grad_norm": 1.6212084434284937, + "learning_rate": 1.9520331557709615e-05, + "loss": 0.7716, + "step": 1234 + }, + { + "epoch": 0.1261491317671093, + "grad_norm": 1.4103584984512114, + "learning_rate": 1.9519318703862418e-05, + "loss": 0.7178, + "step": 1235 + }, + { + "epoch": 0.12625127681307458, + "grad_norm": 1.6483854093234263, + "learning_rate": 1.951830480811964e-05, + "loss": 0.7433, + "step": 1236 + }, + { + "epoch": 0.12635342185903983, + "grad_norm": 1.4243029379299237, + "learning_rate": 1.9517289870592254e-05, + "loss": 0.8385, + "step": 1237 + }, + { + "epoch": 0.12645556690500512, + "grad_norm": 1.5651855621549706, + "learning_rate": 1.9516273891391342e-05, + "loss": 0.7024, + "step": 1238 + }, + { + "epoch": 0.12655771195097038, + "grad_norm": 1.5572859040246025, + "learning_rate": 1.95152568706281e-05, + "loss": 0.7596, + "step": 1239 + }, + { + "epoch": 0.12665985699693566, + "grad_norm": 1.4300605871870788, + "learning_rate": 1.951423880841385e-05, + "loss": 0.7464, + "step": 1240 + }, + { + "epoch": 0.12676200204290092, + "grad_norm": 1.5176634948195888, + "learning_rate": 1.9513219704860016e-05, + "loss": 0.7086, + "step": 1241 + }, + { + "epoch": 0.1268641470888662, + "grad_norm": 1.3279057278393684, + "learning_rate": 1.9512199560078137e-05, + "loss": 0.7435, + "step": 1242 + }, + { + "epoch": 0.12696629213483146, + "grad_norm": 1.6322991409058123, + "learning_rate": 1.951117837417987e-05, + "loss": 0.8146, + "step": 1243 + }, + { + "epoch": 0.12706843718079674, + "grad_norm": 1.4382790233270137, + "learning_rate": 1.9510156147276988e-05, + "loss": 0.8311, + "step": 1244 + }, + { + "epoch": 0.127170582226762, + "grad_norm": 1.4436498383514786, + "learning_rate": 1.950913287948137e-05, + "loss": 0.7165, + "step": 1245 + }, + { + "epoch": 0.12727272727272726, + "grad_norm": 1.527741830480384, + "learning_rate": 1.9508108570905013e-05, + "loss": 0.7783, + "step": 1246 + }, + { + "epoch": 0.12737487231869254, + "grad_norm": 1.6083031572909405, + "learning_rate": 1.950708322166003e-05, + "loss": 0.7499, + "step": 1247 + }, + { + "epoch": 0.1274770173646578, + "grad_norm": 1.505195219056494, + "learning_rate": 1.950605683185865e-05, + "loss": 0.8373, + "step": 1248 + }, + { + "epoch": 0.1275791624106231, + "grad_norm": 1.7304183447760781, + "learning_rate": 1.95050294016132e-05, + "loss": 0.814, + "step": 1249 + }, + { + "epoch": 0.12768130745658834, + "grad_norm": 1.3974639477310231, + "learning_rate": 1.950400093103615e-05, + "loss": 0.741, + "step": 1250 + }, + { + "epoch": 0.12778345250255363, + "grad_norm": 1.3839809198826516, + "learning_rate": 1.9502971420240052e-05, + "loss": 0.6405, + "step": 1251 + }, + { + "epoch": 0.1278855975485189, + "grad_norm": 1.5778035995302413, + "learning_rate": 1.9501940869337595e-05, + "loss": 0.9277, + "step": 1252 + }, + { + "epoch": 0.12798774259448417, + "grad_norm": 1.7643454749799408, + "learning_rate": 1.9500909278441573e-05, + "loss": 0.7097, + "step": 1253 + }, + { + "epoch": 0.12808988764044943, + "grad_norm": 1.426867190810617, + "learning_rate": 1.9499876647664885e-05, + "loss": 0.809, + "step": 1254 + }, + { + "epoch": 0.12819203268641471, + "grad_norm": 1.5393127596372518, + "learning_rate": 1.9498842977120564e-05, + "loss": 0.7591, + "step": 1255 + }, + { + "epoch": 0.12829417773237997, + "grad_norm": 1.576772161834551, + "learning_rate": 1.9497808266921746e-05, + "loss": 0.7773, + "step": 1256 + }, + { + "epoch": 0.12839632277834526, + "grad_norm": 1.534560439883318, + "learning_rate": 1.949677251718167e-05, + "loss": 0.7538, + "step": 1257 + }, + { + "epoch": 0.1284984678243105, + "grad_norm": 1.4807566812345876, + "learning_rate": 1.9495735728013708e-05, + "loss": 0.726, + "step": 1258 + }, + { + "epoch": 0.1286006128702758, + "grad_norm": 1.4769360453666605, + "learning_rate": 1.9494697899531338e-05, + "loss": 0.798, + "step": 1259 + }, + { + "epoch": 0.12870275791624106, + "grad_norm": 1.5131516329856072, + "learning_rate": 1.9493659031848148e-05, + "loss": 0.8166, + "step": 1260 + }, + { + "epoch": 0.12880490296220634, + "grad_norm": 1.523882899949307, + "learning_rate": 1.9492619125077844e-05, + "loss": 0.7228, + "step": 1261 + }, + { + "epoch": 0.1289070480081716, + "grad_norm": 1.4289521472821463, + "learning_rate": 1.9491578179334244e-05, + "loss": 0.7096, + "step": 1262 + }, + { + "epoch": 0.12900919305413688, + "grad_norm": 1.4967584239560239, + "learning_rate": 1.9490536194731276e-05, + "loss": 0.7837, + "step": 1263 + }, + { + "epoch": 0.12911133810010214, + "grad_norm": 1.661852848620628, + "learning_rate": 1.9489493171382993e-05, + "loss": 0.7938, + "step": 1264 + }, + { + "epoch": 0.12921348314606743, + "grad_norm": 1.3594635746150734, + "learning_rate": 1.948844910940355e-05, + "loss": 0.6353, + "step": 1265 + }, + { + "epoch": 0.12931562819203268, + "grad_norm": 1.5197666808732042, + "learning_rate": 1.9487404008907222e-05, + "loss": 0.7502, + "step": 1266 + }, + { + "epoch": 0.12941777323799797, + "grad_norm": 1.554233137007806, + "learning_rate": 1.9486357870008397e-05, + "loss": 0.7949, + "step": 1267 + }, + { + "epoch": 0.12951991828396323, + "grad_norm": 1.4390956841570666, + "learning_rate": 1.9485310692821572e-05, + "loss": 0.6916, + "step": 1268 + }, + { + "epoch": 0.1296220633299285, + "grad_norm": 1.4287033312432231, + "learning_rate": 1.9484262477461365e-05, + "loss": 0.6125, + "step": 1269 + }, + { + "epoch": 0.12972420837589377, + "grad_norm": 1.500760928122116, + "learning_rate": 1.94832132240425e-05, + "loss": 0.7405, + "step": 1270 + }, + { + "epoch": 0.12982635342185905, + "grad_norm": 1.4439762419213706, + "learning_rate": 1.9482162932679824e-05, + "loss": 0.8158, + "step": 1271 + }, + { + "epoch": 0.1299284984678243, + "grad_norm": 1.6065463329205603, + "learning_rate": 1.9481111603488284e-05, + "loss": 0.7566, + "step": 1272 + }, + { + "epoch": 0.13003064351378957, + "grad_norm": 1.3978531502606641, + "learning_rate": 1.948005923658296e-05, + "loss": 0.7509, + "step": 1273 + }, + { + "epoch": 0.13013278855975485, + "grad_norm": 1.4805404323480404, + "learning_rate": 1.9479005832079022e-05, + "loss": 0.7183, + "step": 1274 + }, + { + "epoch": 0.1302349336057201, + "grad_norm": 1.3731977149725076, + "learning_rate": 1.9477951390091772e-05, + "loss": 0.7126, + "step": 1275 + }, + { + "epoch": 0.1303370786516854, + "grad_norm": 1.3545611955295465, + "learning_rate": 1.9476895910736624e-05, + "loss": 0.7196, + "step": 1276 + }, + { + "epoch": 0.13043922369765065, + "grad_norm": 1.4642178006924533, + "learning_rate": 1.9475839394129093e-05, + "loss": 0.8878, + "step": 1277 + }, + { + "epoch": 0.13054136874361594, + "grad_norm": 1.5491755067832083, + "learning_rate": 1.9474781840384816e-05, + "loss": 0.6414, + "step": 1278 + }, + { + "epoch": 0.1306435137895812, + "grad_norm": 1.5179398342410995, + "learning_rate": 1.9473723249619545e-05, + "loss": 0.7288, + "step": 1279 + }, + { + "epoch": 0.13074565883554648, + "grad_norm": 1.4280745019034362, + "learning_rate": 1.9472663621949147e-05, + "loss": 0.677, + "step": 1280 + }, + { + "epoch": 0.13084780388151174, + "grad_norm": 1.4342691281373265, + "learning_rate": 1.947160295748959e-05, + "loss": 0.7647, + "step": 1281 + }, + { + "epoch": 0.13094994892747702, + "grad_norm": 1.459768333429644, + "learning_rate": 1.9470541256356976e-05, + "loss": 0.741, + "step": 1282 + }, + { + "epoch": 0.13105209397344228, + "grad_norm": 1.402060939716417, + "learning_rate": 1.94694785186675e-05, + "loss": 0.862, + "step": 1283 + }, + { + "epoch": 0.13115423901940756, + "grad_norm": 1.5040641667412629, + "learning_rate": 1.946841474453748e-05, + "loss": 0.7683, + "step": 1284 + }, + { + "epoch": 0.13125638406537282, + "grad_norm": 1.5512216794858509, + "learning_rate": 1.9467349934083353e-05, + "loss": 0.7777, + "step": 1285 + }, + { + "epoch": 0.1313585291113381, + "grad_norm": 1.4563363227319008, + "learning_rate": 1.9466284087421657e-05, + "loss": 0.7635, + "step": 1286 + }, + { + "epoch": 0.13146067415730336, + "grad_norm": 1.5866943978222612, + "learning_rate": 1.946521720466905e-05, + "loss": 0.7069, + "step": 1287 + }, + { + "epoch": 0.13156281920326865, + "grad_norm": 1.6021632179471066, + "learning_rate": 1.946414928594231e-05, + "loss": 0.7996, + "step": 1288 + }, + { + "epoch": 0.1316649642492339, + "grad_norm": 1.7024833072792165, + "learning_rate": 1.946308033135831e-05, + "loss": 0.8261, + "step": 1289 + }, + { + "epoch": 0.1317671092951992, + "grad_norm": 1.4611987403682454, + "learning_rate": 1.9462010341034054e-05, + "loss": 0.8057, + "step": 1290 + }, + { + "epoch": 0.13186925434116445, + "grad_norm": 1.5828858730965563, + "learning_rate": 1.9460939315086656e-05, + "loss": 0.6879, + "step": 1291 + }, + { + "epoch": 0.13197139938712973, + "grad_norm": 1.5268272763769084, + "learning_rate": 1.9459867253633336e-05, + "loss": 0.7829, + "step": 1292 + }, + { + "epoch": 0.132073544433095, + "grad_norm": 1.3799394442782487, + "learning_rate": 1.9458794156791434e-05, + "loss": 0.7399, + "step": 1293 + }, + { + "epoch": 0.13217568947906028, + "grad_norm": 1.8226588432193558, + "learning_rate": 1.9457720024678403e-05, + "loss": 0.8049, + "step": 1294 + }, + { + "epoch": 0.13227783452502553, + "grad_norm": 1.5747986639029585, + "learning_rate": 1.94566448574118e-05, + "loss": 0.7791, + "step": 1295 + }, + { + "epoch": 0.13237997957099082, + "grad_norm": 1.5707765502019684, + "learning_rate": 1.9455568655109308e-05, + "loss": 0.6383, + "step": 1296 + }, + { + "epoch": 0.13248212461695608, + "grad_norm": 1.5390867489841076, + "learning_rate": 1.945449141788872e-05, + "loss": 0.7131, + "step": 1297 + }, + { + "epoch": 0.13258426966292136, + "grad_norm": 1.5738936594218587, + "learning_rate": 1.9453413145867935e-05, + "loss": 0.7667, + "step": 1298 + }, + { + "epoch": 0.13268641470888662, + "grad_norm": 1.3888136625789946, + "learning_rate": 1.9452333839164977e-05, + "loss": 0.6533, + "step": 1299 + }, + { + "epoch": 0.13278855975485188, + "grad_norm": 1.4486621457095246, + "learning_rate": 1.945125349789797e-05, + "loss": 0.7136, + "step": 1300 + }, + { + "epoch": 0.13289070480081716, + "grad_norm": 1.5733295922815216, + "learning_rate": 1.9450172122185166e-05, + "loss": 0.6742, + "step": 1301 + }, + { + "epoch": 0.13299284984678242, + "grad_norm": 1.5505307195551454, + "learning_rate": 1.9449089712144912e-05, + "loss": 0.7434, + "step": 1302 + }, + { + "epoch": 0.1330949948927477, + "grad_norm": 1.4918367742236909, + "learning_rate": 1.9448006267895688e-05, + "loss": 0.7375, + "step": 1303 + }, + { + "epoch": 0.13319713993871296, + "grad_norm": 1.6285234979713898, + "learning_rate": 1.9446921789556072e-05, + "loss": 0.8028, + "step": 1304 + }, + { + "epoch": 0.13329928498467825, + "grad_norm": 1.50801812580762, + "learning_rate": 1.9445836277244764e-05, + "loss": 0.7068, + "step": 1305 + }, + { + "epoch": 0.1334014300306435, + "grad_norm": 1.4443984402819352, + "learning_rate": 1.944474973108057e-05, + "loss": 0.7831, + "step": 1306 + }, + { + "epoch": 0.1335035750766088, + "grad_norm": 1.5812596004458372, + "learning_rate": 1.944366215118242e-05, + "loss": 0.8283, + "step": 1307 + }, + { + "epoch": 0.13360572012257405, + "grad_norm": 1.481286361820175, + "learning_rate": 1.9442573537669344e-05, + "loss": 0.7908, + "step": 1308 + }, + { + "epoch": 0.13370786516853933, + "grad_norm": 1.5321188104668377, + "learning_rate": 1.9441483890660494e-05, + "loss": 0.7134, + "step": 1309 + }, + { + "epoch": 0.1338100102145046, + "grad_norm": 1.5282223488236462, + "learning_rate": 1.944039321027513e-05, + "loss": 0.7157, + "step": 1310 + }, + { + "epoch": 0.13391215526046987, + "grad_norm": 1.7117893928552739, + "learning_rate": 1.9439301496632634e-05, + "loss": 0.668, + "step": 1311 + }, + { + "epoch": 0.13401430030643513, + "grad_norm": 1.5003983486835706, + "learning_rate": 1.9438208749852486e-05, + "loss": 0.8136, + "step": 1312 + }, + { + "epoch": 0.13411644535240042, + "grad_norm": 1.6334235620493318, + "learning_rate": 1.9437114970054294e-05, + "loss": 0.8367, + "step": 1313 + }, + { + "epoch": 0.13421859039836567, + "grad_norm": 1.3918477316569242, + "learning_rate": 1.9436020157357772e-05, + "loss": 0.7825, + "step": 1314 + }, + { + "epoch": 0.13432073544433096, + "grad_norm": 1.4074311673112576, + "learning_rate": 1.9434924311882747e-05, + "loss": 0.7197, + "step": 1315 + }, + { + "epoch": 0.13442288049029621, + "grad_norm": 1.4626436779780858, + "learning_rate": 1.943382743374916e-05, + "loss": 0.7666, + "step": 1316 + }, + { + "epoch": 0.1345250255362615, + "grad_norm": 1.393085008920469, + "learning_rate": 1.943272952307707e-05, + "loss": 0.6859, + "step": 1317 + }, + { + "epoch": 0.13462717058222676, + "grad_norm": 1.4130247019420528, + "learning_rate": 1.9431630579986635e-05, + "loss": 0.7929, + "step": 1318 + }, + { + "epoch": 0.13472931562819204, + "grad_norm": 1.483480212650707, + "learning_rate": 1.9430530604598137e-05, + "loss": 0.727, + "step": 1319 + }, + { + "epoch": 0.1348314606741573, + "grad_norm": 1.4358273100426875, + "learning_rate": 1.9429429597031976e-05, + "loss": 0.8062, + "step": 1320 + }, + { + "epoch": 0.13493360572012258, + "grad_norm": 1.3728179486111791, + "learning_rate": 1.942832755740865e-05, + "loss": 0.7409, + "step": 1321 + }, + { + "epoch": 0.13503575076608784, + "grad_norm": 1.544396196573522, + "learning_rate": 1.9427224485848783e-05, + "loss": 0.7213, + "step": 1322 + }, + { + "epoch": 0.13513789581205313, + "grad_norm": 1.7213121530133915, + "learning_rate": 1.9426120382473108e-05, + "loss": 0.8171, + "step": 1323 + }, + { + "epoch": 0.13524004085801838, + "grad_norm": 1.7749093718289375, + "learning_rate": 1.942501524740246e-05, + "loss": 0.7975, + "step": 1324 + }, + { + "epoch": 0.13534218590398367, + "grad_norm": 1.5688829761916672, + "learning_rate": 1.942390908075781e-05, + "loss": 0.8525, + "step": 1325 + }, + { + "epoch": 0.13544433094994893, + "grad_norm": 1.4841413656191014, + "learning_rate": 1.9422801882660223e-05, + "loss": 0.8521, + "step": 1326 + }, + { + "epoch": 0.1355464759959142, + "grad_norm": 1.5283490215530708, + "learning_rate": 1.942169365323088e-05, + "loss": 0.6904, + "step": 1327 + }, + { + "epoch": 0.13564862104187947, + "grad_norm": 1.655275932915154, + "learning_rate": 1.942058439259108e-05, + "loss": 0.8149, + "step": 1328 + }, + { + "epoch": 0.13575076608784473, + "grad_norm": 1.5603683493673701, + "learning_rate": 1.941947410086223e-05, + "loss": 0.6689, + "step": 1329 + }, + { + "epoch": 0.13585291113381, + "grad_norm": 1.6616119726147993, + "learning_rate": 1.9418362778165855e-05, + "loss": 0.7794, + "step": 1330 + }, + { + "epoch": 0.13595505617977527, + "grad_norm": 1.543469652291921, + "learning_rate": 1.9417250424623588e-05, + "loss": 0.6894, + "step": 1331 + }, + { + "epoch": 0.13605720122574055, + "grad_norm": 1.3779853751687607, + "learning_rate": 1.9416137040357176e-05, + "loss": 0.7143, + "step": 1332 + }, + { + "epoch": 0.1361593462717058, + "grad_norm": 1.4305466348146565, + "learning_rate": 1.9415022625488485e-05, + "loss": 0.6136, + "step": 1333 + }, + { + "epoch": 0.1362614913176711, + "grad_norm": 1.413453657167387, + "learning_rate": 1.9413907180139483e-05, + "loss": 0.7399, + "step": 1334 + }, + { + "epoch": 0.13636363636363635, + "grad_norm": 1.4824260310503163, + "learning_rate": 1.9412790704432258e-05, + "loss": 0.8489, + "step": 1335 + }, + { + "epoch": 0.13646578140960164, + "grad_norm": 1.6269049337234491, + "learning_rate": 1.941167319848901e-05, + "loss": 0.7291, + "step": 1336 + }, + { + "epoch": 0.1365679264555669, + "grad_norm": 1.4560968142741078, + "learning_rate": 1.941055466243205e-05, + "loss": 0.6166, + "step": 1337 + }, + { + "epoch": 0.13667007150153218, + "grad_norm": 1.376626204111802, + "learning_rate": 1.94094350963838e-05, + "loss": 0.6606, + "step": 1338 + }, + { + "epoch": 0.13677221654749744, + "grad_norm": 1.561054857458974, + "learning_rate": 1.94083145004668e-05, + "loss": 0.7449, + "step": 1339 + }, + { + "epoch": 0.13687436159346272, + "grad_norm": 1.5790169536603222, + "learning_rate": 1.9407192874803703e-05, + "loss": 0.9003, + "step": 1340 + }, + { + "epoch": 0.13697650663942798, + "grad_norm": 1.4951587843090144, + "learning_rate": 1.9406070219517264e-05, + "loss": 0.6879, + "step": 1341 + }, + { + "epoch": 0.13707865168539327, + "grad_norm": 1.4384364850723763, + "learning_rate": 1.9404946534730365e-05, + "loss": 0.7172, + "step": 1342 + }, + { + "epoch": 0.13718079673135852, + "grad_norm": 1.3699886233438812, + "learning_rate": 1.940382182056599e-05, + "loss": 0.7686, + "step": 1343 + }, + { + "epoch": 0.1372829417773238, + "grad_norm": 1.6379405576048405, + "learning_rate": 1.9402696077147238e-05, + "loss": 0.812, + "step": 1344 + }, + { + "epoch": 0.13738508682328907, + "grad_norm": 1.4284180064912348, + "learning_rate": 1.940156930459733e-05, + "loss": 0.7538, + "step": 1345 + }, + { + "epoch": 0.13748723186925435, + "grad_norm": 1.533963594673483, + "learning_rate": 1.9400441503039586e-05, + "loss": 0.7255, + "step": 1346 + }, + { + "epoch": 0.1375893769152196, + "grad_norm": 1.4263382666907924, + "learning_rate": 1.9399312672597447e-05, + "loss": 0.8372, + "step": 1347 + }, + { + "epoch": 0.1376915219611849, + "grad_norm": 1.5152440970366143, + "learning_rate": 1.939818281339446e-05, + "loss": 0.8038, + "step": 1348 + }, + { + "epoch": 0.13779366700715015, + "grad_norm": 1.508343586759886, + "learning_rate": 1.9397051925554294e-05, + "loss": 0.7741, + "step": 1349 + }, + { + "epoch": 0.13789581205311544, + "grad_norm": 1.3650663556253753, + "learning_rate": 1.9395920009200722e-05, + "loss": 0.876, + "step": 1350 + }, + { + "epoch": 0.1379979570990807, + "grad_norm": 1.5103456966424964, + "learning_rate": 1.939478706445764e-05, + "loss": 0.7709, + "step": 1351 + }, + { + "epoch": 0.13810010214504598, + "grad_norm": 1.5280813303792649, + "learning_rate": 1.939365309144904e-05, + "loss": 0.8724, + "step": 1352 + }, + { + "epoch": 0.13820224719101123, + "grad_norm": 1.4698126131473697, + "learning_rate": 1.939251809029904e-05, + "loss": 0.8719, + "step": 1353 + }, + { + "epoch": 0.13830439223697652, + "grad_norm": 1.5404653841737956, + "learning_rate": 1.9391382061131865e-05, + "loss": 0.7133, + "step": 1354 + }, + { + "epoch": 0.13840653728294178, + "grad_norm": 1.4550072490606445, + "learning_rate": 1.939024500407186e-05, + "loss": 0.6767, + "step": 1355 + }, + { + "epoch": 0.13850868232890703, + "grad_norm": 1.6688748917700813, + "learning_rate": 1.938910691924347e-05, + "loss": 0.7601, + "step": 1356 + }, + { + "epoch": 0.13861082737487232, + "grad_norm": 1.5667379945866036, + "learning_rate": 1.9387967806771263e-05, + "loss": 0.7577, + "step": 1357 + }, + { + "epoch": 0.13871297242083758, + "grad_norm": 1.576333592323303, + "learning_rate": 1.938682766677991e-05, + "loss": 0.7406, + "step": 1358 + }, + { + "epoch": 0.13881511746680286, + "grad_norm": 1.5932437174249479, + "learning_rate": 1.9385686499394208e-05, + "loss": 0.8517, + "step": 1359 + }, + { + "epoch": 0.13891726251276812, + "grad_norm": 1.5736017529706714, + "learning_rate": 1.9384544304739053e-05, + "loss": 0.8212, + "step": 1360 + }, + { + "epoch": 0.1390194075587334, + "grad_norm": 1.41905562640827, + "learning_rate": 1.938340108293946e-05, + "loss": 0.773, + "step": 1361 + }, + { + "epoch": 0.13912155260469866, + "grad_norm": 1.3786796593876844, + "learning_rate": 1.9382256834120562e-05, + "loss": 0.7801, + "step": 1362 + }, + { + "epoch": 0.13922369765066395, + "grad_norm": 1.4530673814590769, + "learning_rate": 1.9381111558407585e-05, + "loss": 0.7972, + "step": 1363 + }, + { + "epoch": 0.1393258426966292, + "grad_norm": 1.513303311858783, + "learning_rate": 1.9379965255925887e-05, + "loss": 0.6332, + "step": 1364 + }, + { + "epoch": 0.1394279877425945, + "grad_norm": 1.8621510199139821, + "learning_rate": 1.9378817926800938e-05, + "loss": 0.8913, + "step": 1365 + }, + { + "epoch": 0.13953013278855975, + "grad_norm": 1.6131804902974567, + "learning_rate": 1.9377669571158302e-05, + "loss": 0.8056, + "step": 1366 + }, + { + "epoch": 0.13963227783452503, + "grad_norm": 1.4439086302212054, + "learning_rate": 1.9376520189123675e-05, + "loss": 0.7196, + "step": 1367 + }, + { + "epoch": 0.1397344228804903, + "grad_norm": 1.4079054076740687, + "learning_rate": 1.937536978082285e-05, + "loss": 0.7259, + "step": 1368 + }, + { + "epoch": 0.13983656792645557, + "grad_norm": 1.504774731438275, + "learning_rate": 1.937421834638175e-05, + "loss": 0.7839, + "step": 1369 + }, + { + "epoch": 0.13993871297242083, + "grad_norm": 1.378707243885884, + "learning_rate": 1.9373065885926396e-05, + "loss": 0.7863, + "step": 1370 + }, + { + "epoch": 0.14004085801838612, + "grad_norm": 1.5639070409621605, + "learning_rate": 1.9371912399582924e-05, + "loss": 0.9166, + "step": 1371 + }, + { + "epoch": 0.14014300306435137, + "grad_norm": 1.4912691865465266, + "learning_rate": 1.9370757887477585e-05, + "loss": 0.6059, + "step": 1372 + }, + { + "epoch": 0.14024514811031666, + "grad_norm": 1.7796257513381397, + "learning_rate": 1.936960234973674e-05, + "loss": 0.8543, + "step": 1373 + }, + { + "epoch": 0.14034729315628192, + "grad_norm": 1.4614805988248598, + "learning_rate": 1.936844578648686e-05, + "loss": 0.7885, + "step": 1374 + }, + { + "epoch": 0.1404494382022472, + "grad_norm": 1.5688929373248865, + "learning_rate": 1.9367288197854544e-05, + "loss": 0.6908, + "step": 1375 + }, + { + "epoch": 0.14055158324821246, + "grad_norm": 1.50045233455611, + "learning_rate": 1.936612958396648e-05, + "loss": 0.8329, + "step": 1376 + }, + { + "epoch": 0.14065372829417774, + "grad_norm": 1.5752476079955477, + "learning_rate": 1.9364969944949482e-05, + "loss": 0.843, + "step": 1377 + }, + { + "epoch": 0.140755873340143, + "grad_norm": 1.5990392987981594, + "learning_rate": 1.936380928093047e-05, + "loss": 0.6926, + "step": 1378 + }, + { + "epoch": 0.14085801838610829, + "grad_norm": 1.4790781526314436, + "learning_rate": 1.9362647592036486e-05, + "loss": 0.7661, + "step": 1379 + }, + { + "epoch": 0.14096016343207354, + "grad_norm": 1.4716674800810408, + "learning_rate": 1.936148487839467e-05, + "loss": 0.7081, + "step": 1380 + }, + { + "epoch": 0.14106230847803883, + "grad_norm": 1.5709982042963817, + "learning_rate": 1.9360321140132293e-05, + "loss": 0.7921, + "step": 1381 + }, + { + "epoch": 0.14116445352400409, + "grad_norm": 1.599873337308863, + "learning_rate": 1.9359156377376714e-05, + "loss": 0.8262, + "step": 1382 + }, + { + "epoch": 0.14126659856996934, + "grad_norm": 1.788449074412186, + "learning_rate": 1.9357990590255424e-05, + "loss": 0.8253, + "step": 1383 + }, + { + "epoch": 0.14136874361593463, + "grad_norm": 1.5913189352470587, + "learning_rate": 1.9356823778896015e-05, + "loss": 0.6756, + "step": 1384 + }, + { + "epoch": 0.14147088866189989, + "grad_norm": 1.5406190115304836, + "learning_rate": 1.93556559434262e-05, + "loss": 0.827, + "step": 1385 + }, + { + "epoch": 0.14157303370786517, + "grad_norm": 1.3037905478178327, + "learning_rate": 1.93544870839738e-05, + "loss": 0.6651, + "step": 1386 + }, + { + "epoch": 0.14167517875383043, + "grad_norm": 1.4623088059288514, + "learning_rate": 1.935331720066674e-05, + "loss": 0.7472, + "step": 1387 + }, + { + "epoch": 0.1417773237997957, + "grad_norm": 1.3607830899730813, + "learning_rate": 1.9352146293633075e-05, + "loss": 0.6945, + "step": 1388 + }, + { + "epoch": 0.14187946884576097, + "grad_norm": 1.5794147762084267, + "learning_rate": 1.9350974363000954e-05, + "loss": 0.7526, + "step": 1389 + }, + { + "epoch": 0.14198161389172625, + "grad_norm": 1.3437609366040257, + "learning_rate": 1.934980140889865e-05, + "loss": 0.6844, + "step": 1390 + }, + { + "epoch": 0.1420837589376915, + "grad_norm": 1.6194175430599684, + "learning_rate": 1.9348627431454535e-05, + "loss": 0.8139, + "step": 1391 + }, + { + "epoch": 0.1421859039836568, + "grad_norm": 1.4677720205491822, + "learning_rate": 1.9347452430797107e-05, + "loss": 0.7707, + "step": 1392 + }, + { + "epoch": 0.14228804902962205, + "grad_norm": 1.5420864374045482, + "learning_rate": 1.9346276407054977e-05, + "loss": 0.8047, + "step": 1393 + }, + { + "epoch": 0.14239019407558734, + "grad_norm": 1.521937007190085, + "learning_rate": 1.9345099360356855e-05, + "loss": 0.768, + "step": 1394 + }, + { + "epoch": 0.1424923391215526, + "grad_norm": 1.4872828146564219, + "learning_rate": 1.9343921290831568e-05, + "loss": 0.8, + "step": 1395 + }, + { + "epoch": 0.14259448416751788, + "grad_norm": 1.6513987228829032, + "learning_rate": 1.934274219860806e-05, + "loss": 0.7522, + "step": 1396 + }, + { + "epoch": 0.14269662921348314, + "grad_norm": 1.4432876187192811, + "learning_rate": 1.934156208381538e-05, + "loss": 0.6924, + "step": 1397 + }, + { + "epoch": 0.14279877425944842, + "grad_norm": 1.506645953117852, + "learning_rate": 1.9340380946582694e-05, + "loss": 0.7886, + "step": 1398 + }, + { + "epoch": 0.14290091930541368, + "grad_norm": 1.4171081837989667, + "learning_rate": 1.9339198787039285e-05, + "loss": 0.7126, + "step": 1399 + }, + { + "epoch": 0.14300306435137897, + "grad_norm": 1.4739622925960878, + "learning_rate": 1.933801560531453e-05, + "loss": 0.7008, + "step": 1400 + }, + { + "epoch": 0.14310520939734422, + "grad_norm": 1.4706000588407875, + "learning_rate": 1.9336831401537933e-05, + "loss": 0.778, + "step": 1401 + }, + { + "epoch": 0.1432073544433095, + "grad_norm": 1.4622166757971402, + "learning_rate": 1.9335646175839108e-05, + "loss": 0.7631, + "step": 1402 + }, + { + "epoch": 0.14330949948927477, + "grad_norm": 1.6228633576062323, + "learning_rate": 1.933445992834778e-05, + "loss": 0.7279, + "step": 1403 + }, + { + "epoch": 0.14341164453524005, + "grad_norm": 1.4940925937386755, + "learning_rate": 1.933327265919378e-05, + "loss": 0.8152, + "step": 1404 + }, + { + "epoch": 0.1435137895812053, + "grad_norm": 1.5523548988134994, + "learning_rate": 1.9332084368507054e-05, + "loss": 0.704, + "step": 1405 + }, + { + "epoch": 0.1436159346271706, + "grad_norm": 1.5344956944348773, + "learning_rate": 1.933089505641767e-05, + "loss": 0.7647, + "step": 1406 + }, + { + "epoch": 0.14371807967313585, + "grad_norm": 1.782563891796052, + "learning_rate": 1.9329704723055794e-05, + "loss": 0.78, + "step": 1407 + }, + { + "epoch": 0.14382022471910114, + "grad_norm": 1.6328428395049432, + "learning_rate": 1.9328513368551705e-05, + "loss": 0.6539, + "step": 1408 + }, + { + "epoch": 0.1439223697650664, + "grad_norm": 1.613057439061014, + "learning_rate": 1.9327320993035798e-05, + "loss": 0.7319, + "step": 1409 + }, + { + "epoch": 0.14402451481103168, + "grad_norm": 1.471923621647136, + "learning_rate": 1.932612759663859e-05, + "loss": 0.6916, + "step": 1410 + }, + { + "epoch": 0.14412665985699694, + "grad_norm": 1.4607931320206955, + "learning_rate": 1.9324933179490685e-05, + "loss": 0.7397, + "step": 1411 + }, + { + "epoch": 0.1442288049029622, + "grad_norm": 1.5368415983405097, + "learning_rate": 1.9323737741722822e-05, + "loss": 0.7576, + "step": 1412 + }, + { + "epoch": 0.14433094994892748, + "grad_norm": 1.5486350482380276, + "learning_rate": 1.9322541283465836e-05, + "loss": 0.7142, + "step": 1413 + }, + { + "epoch": 0.14443309499489274, + "grad_norm": 1.593433715364878, + "learning_rate": 1.9321343804850685e-05, + "loss": 0.7307, + "step": 1414 + }, + { + "epoch": 0.14453524004085802, + "grad_norm": 1.4320179414715932, + "learning_rate": 1.932014530600843e-05, + "loss": 0.7237, + "step": 1415 + }, + { + "epoch": 0.14463738508682328, + "grad_norm": 1.5985084758234467, + "learning_rate": 1.9318945787070254e-05, + "loss": 0.7798, + "step": 1416 + }, + { + "epoch": 0.14473953013278856, + "grad_norm": 1.534389328107597, + "learning_rate": 1.931774524816744e-05, + "loss": 0.8358, + "step": 1417 + }, + { + "epoch": 0.14484167517875382, + "grad_norm": 1.4777602409689825, + "learning_rate": 1.9316543689431386e-05, + "loss": 0.7575, + "step": 1418 + }, + { + "epoch": 0.1449438202247191, + "grad_norm": 1.3380835073849822, + "learning_rate": 1.9315341110993605e-05, + "loss": 0.7664, + "step": 1419 + }, + { + "epoch": 0.14504596527068436, + "grad_norm": 1.5120988912766862, + "learning_rate": 1.9314137512985724e-05, + "loss": 0.7337, + "step": 1420 + }, + { + "epoch": 0.14514811031664965, + "grad_norm": 1.6401398475298732, + "learning_rate": 1.9312932895539475e-05, + "loss": 0.7502, + "step": 1421 + }, + { + "epoch": 0.1452502553626149, + "grad_norm": 1.6279320168785258, + "learning_rate": 1.9311727258786703e-05, + "loss": 0.835, + "step": 1422 + }, + { + "epoch": 0.1453524004085802, + "grad_norm": 1.5352814262650878, + "learning_rate": 1.9310520602859365e-05, + "loss": 0.7525, + "step": 1423 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 1.5431191122346422, + "learning_rate": 1.9309312927889534e-05, + "loss": 0.7357, + "step": 1424 + }, + { + "epoch": 0.14555669050051073, + "grad_norm": 1.4817092929156874, + "learning_rate": 1.9308104234009386e-05, + "loss": 0.6962, + "step": 1425 + }, + { + "epoch": 0.145658835546476, + "grad_norm": 1.421136600226062, + "learning_rate": 1.9306894521351215e-05, + "loss": 0.6703, + "step": 1426 + }, + { + "epoch": 0.14576098059244127, + "grad_norm": 1.3933778803200363, + "learning_rate": 1.930568379004743e-05, + "loss": 0.7926, + "step": 1427 + }, + { + "epoch": 0.14586312563840653, + "grad_norm": 1.5154338291269438, + "learning_rate": 1.9304472040230536e-05, + "loss": 0.7942, + "step": 1428 + }, + { + "epoch": 0.14596527068437182, + "grad_norm": 1.549006847205402, + "learning_rate": 1.9303259272033172e-05, + "loss": 0.7881, + "step": 1429 + }, + { + "epoch": 0.14606741573033707, + "grad_norm": 1.6967940317252428, + "learning_rate": 1.9302045485588067e-05, + "loss": 0.7721, + "step": 1430 + }, + { + "epoch": 0.14616956077630236, + "grad_norm": 1.5144421939877968, + "learning_rate": 1.9300830681028075e-05, + "loss": 0.7273, + "step": 1431 + }, + { + "epoch": 0.14627170582226762, + "grad_norm": 1.6361562885904688, + "learning_rate": 1.9299614858486153e-05, + "loss": 0.7706, + "step": 1432 + }, + { + "epoch": 0.1463738508682329, + "grad_norm": 1.5441568350056314, + "learning_rate": 1.9298398018095378e-05, + "loss": 0.7106, + "step": 1433 + }, + { + "epoch": 0.14647599591419816, + "grad_norm": 1.5046355983891921, + "learning_rate": 1.9297180159988932e-05, + "loss": 0.8757, + "step": 1434 + }, + { + "epoch": 0.14657814096016344, + "grad_norm": 1.5692773136133311, + "learning_rate": 1.929596128430011e-05, + "loss": 0.6381, + "step": 1435 + }, + { + "epoch": 0.1466802860061287, + "grad_norm": 1.346942079068653, + "learning_rate": 1.929474139116232e-05, + "loss": 0.6742, + "step": 1436 + }, + { + "epoch": 0.146782431052094, + "grad_norm": 1.5823780363492548, + "learning_rate": 1.929352048070908e-05, + "loss": 0.8039, + "step": 1437 + }, + { + "epoch": 0.14688457609805924, + "grad_norm": 1.4353076685727753, + "learning_rate": 1.929229855307402e-05, + "loss": 0.7874, + "step": 1438 + }, + { + "epoch": 0.1469867211440245, + "grad_norm": 1.620129456542622, + "learning_rate": 1.9291075608390878e-05, + "loss": 0.8344, + "step": 1439 + }, + { + "epoch": 0.1470888661899898, + "grad_norm": 1.654984107401775, + "learning_rate": 1.928985164679351e-05, + "loss": 0.7368, + "step": 1440 + }, + { + "epoch": 0.14719101123595504, + "grad_norm": 1.707254850164429, + "learning_rate": 1.9288626668415875e-05, + "loss": 0.8333, + "step": 1441 + }, + { + "epoch": 0.14729315628192033, + "grad_norm": 1.521310645851988, + "learning_rate": 1.9287400673392055e-05, + "loss": 0.7475, + "step": 1442 + }, + { + "epoch": 0.1473953013278856, + "grad_norm": 1.6554247954801538, + "learning_rate": 1.9286173661856225e-05, + "loss": 0.759, + "step": 1443 + }, + { + "epoch": 0.14749744637385087, + "grad_norm": 1.361857290535015, + "learning_rate": 1.928494563394269e-05, + "loss": 0.755, + "step": 1444 + }, + { + "epoch": 0.14759959141981613, + "grad_norm": 1.5282863387931205, + "learning_rate": 1.9283716589785853e-05, + "loss": 0.6872, + "step": 1445 + }, + { + "epoch": 0.1477017364657814, + "grad_norm": 1.5782870301353333, + "learning_rate": 1.9282486529520244e-05, + "loss": 0.7992, + "step": 1446 + }, + { + "epoch": 0.14780388151174667, + "grad_norm": 1.383890072208761, + "learning_rate": 1.9281255453280484e-05, + "loss": 0.7153, + "step": 1447 + }, + { + "epoch": 0.14790602655771196, + "grad_norm": 1.3671009102299712, + "learning_rate": 1.9280023361201318e-05, + "loss": 0.7143, + "step": 1448 + }, + { + "epoch": 0.1480081716036772, + "grad_norm": 1.644965230665166, + "learning_rate": 1.92787902534176e-05, + "loss": 0.7854, + "step": 1449 + }, + { + "epoch": 0.1481103166496425, + "grad_norm": 1.5032252828546246, + "learning_rate": 1.9277556130064294e-05, + "loss": 0.7645, + "step": 1450 + }, + { + "epoch": 0.14821246169560776, + "grad_norm": 1.5630553567925038, + "learning_rate": 1.927632099127647e-05, + "loss": 0.7335, + "step": 1451 + }, + { + "epoch": 0.14831460674157304, + "grad_norm": 1.44658191805543, + "learning_rate": 1.9275084837189327e-05, + "loss": 0.7225, + "step": 1452 + }, + { + "epoch": 0.1484167517875383, + "grad_norm": 1.4567876122771806, + "learning_rate": 1.927384766793815e-05, + "loss": 0.767, + "step": 1453 + }, + { + "epoch": 0.14851889683350358, + "grad_norm": 1.594979566644765, + "learning_rate": 1.927260948365836e-05, + "loss": 0.8176, + "step": 1454 + }, + { + "epoch": 0.14862104187946884, + "grad_norm": 1.3493345185355654, + "learning_rate": 1.9271370284485473e-05, + "loss": 0.7765, + "step": 1455 + }, + { + "epoch": 0.14872318692543413, + "grad_norm": 1.5042715552702999, + "learning_rate": 1.9270130070555113e-05, + "loss": 0.7083, + "step": 1456 + }, + { + "epoch": 0.14882533197139938, + "grad_norm": 1.3788728031226123, + "learning_rate": 1.926888884200303e-05, + "loss": 0.6611, + "step": 1457 + }, + { + "epoch": 0.14892747701736467, + "grad_norm": 1.5957093881571296, + "learning_rate": 1.9267646598965072e-05, + "loss": 0.7957, + "step": 1458 + }, + { + "epoch": 0.14902962206332993, + "grad_norm": 1.5570548361848133, + "learning_rate": 1.9266403341577207e-05, + "loss": 0.8656, + "step": 1459 + }, + { + "epoch": 0.1491317671092952, + "grad_norm": 1.542929474556619, + "learning_rate": 1.926515906997551e-05, + "loss": 0.8636, + "step": 1460 + }, + { + "epoch": 0.14923391215526047, + "grad_norm": 1.5859799930217757, + "learning_rate": 1.9263913784296167e-05, + "loss": 0.8741, + "step": 1461 + }, + { + "epoch": 0.14933605720122575, + "grad_norm": 1.545057123754058, + "learning_rate": 1.9262667484675475e-05, + "loss": 0.7846, + "step": 1462 + }, + { + "epoch": 0.149438202247191, + "grad_norm": 1.517605416164882, + "learning_rate": 1.9261420171249845e-05, + "loss": 0.7935, + "step": 1463 + }, + { + "epoch": 0.1495403472931563, + "grad_norm": 1.5915475863525619, + "learning_rate": 1.926017184415579e-05, + "loss": 0.7503, + "step": 1464 + }, + { + "epoch": 0.14964249233912155, + "grad_norm": 1.6578246845499418, + "learning_rate": 1.9258922503529947e-05, + "loss": 0.7272, + "step": 1465 + }, + { + "epoch": 0.1497446373850868, + "grad_norm": 1.4727733271924286, + "learning_rate": 1.925767214950905e-05, + "loss": 0.7806, + "step": 1466 + }, + { + "epoch": 0.1498467824310521, + "grad_norm": 1.5813826776329285, + "learning_rate": 1.9256420782229955e-05, + "loss": 0.7835, + "step": 1467 + }, + { + "epoch": 0.14994892747701735, + "grad_norm": 1.4336086867236078, + "learning_rate": 1.925516840182963e-05, + "loss": 0.7781, + "step": 1468 + }, + { + "epoch": 0.15005107252298264, + "grad_norm": 1.5041070794548579, + "learning_rate": 1.925391500844514e-05, + "loss": 0.7518, + "step": 1469 + }, + { + "epoch": 0.1501532175689479, + "grad_norm": 1.5636170630895998, + "learning_rate": 1.9252660602213673e-05, + "loss": 0.9089, + "step": 1470 + }, + { + "epoch": 0.15025536261491318, + "grad_norm": 1.5084333374279997, + "learning_rate": 1.9251405183272526e-05, + "loss": 0.8935, + "step": 1471 + }, + { + "epoch": 0.15035750766087844, + "grad_norm": 1.6000781055724267, + "learning_rate": 1.9250148751759107e-05, + "loss": 0.8639, + "step": 1472 + }, + { + "epoch": 0.15045965270684372, + "grad_norm": 1.5188856248433633, + "learning_rate": 1.9248891307810926e-05, + "loss": 0.763, + "step": 1473 + }, + { + "epoch": 0.15056179775280898, + "grad_norm": 1.5348044454410106, + "learning_rate": 1.924763285156562e-05, + "loss": 0.68, + "step": 1474 + }, + { + "epoch": 0.15066394279877426, + "grad_norm": 1.5829844872951264, + "learning_rate": 1.9246373383160922e-05, + "loss": 0.7297, + "step": 1475 + }, + { + "epoch": 0.15076608784473952, + "grad_norm": 1.3605914489917799, + "learning_rate": 1.9245112902734684e-05, + "loss": 0.7421, + "step": 1476 + }, + { + "epoch": 0.1508682328907048, + "grad_norm": 1.4888298769889006, + "learning_rate": 1.9243851410424864e-05, + "loss": 0.7598, + "step": 1477 + }, + { + "epoch": 0.15097037793667006, + "grad_norm": 1.5265355255894464, + "learning_rate": 1.9242588906369538e-05, + "loss": 0.7434, + "step": 1478 + }, + { + "epoch": 0.15107252298263535, + "grad_norm": 1.5138511762317632, + "learning_rate": 1.924132539070688e-05, + "loss": 0.8592, + "step": 1479 + }, + { + "epoch": 0.1511746680286006, + "grad_norm": 1.340045628142175, + "learning_rate": 1.924006086357519e-05, + "loss": 0.6999, + "step": 1480 + }, + { + "epoch": 0.1512768130745659, + "grad_norm": 1.4160012428397106, + "learning_rate": 1.9238795325112867e-05, + "loss": 0.6999, + "step": 1481 + }, + { + "epoch": 0.15137895812053115, + "grad_norm": 1.475502651293873, + "learning_rate": 1.9237528775458433e-05, + "loss": 0.8055, + "step": 1482 + }, + { + "epoch": 0.15148110316649643, + "grad_norm": 1.4894674637463519, + "learning_rate": 1.9236261214750497e-05, + "loss": 0.7311, + "step": 1483 + }, + { + "epoch": 0.1515832482124617, + "grad_norm": 1.5462416288328762, + "learning_rate": 1.923499264312781e-05, + "loss": 0.7023, + "step": 1484 + }, + { + "epoch": 0.15168539325842698, + "grad_norm": 1.5578133567163723, + "learning_rate": 1.923372306072921e-05, + "loss": 0.6695, + "step": 1485 + }, + { + "epoch": 0.15178753830439223, + "grad_norm": 1.4336058336887387, + "learning_rate": 1.9232452467693658e-05, + "loss": 0.6998, + "step": 1486 + }, + { + "epoch": 0.15188968335035752, + "grad_norm": 1.602402124303217, + "learning_rate": 1.9231180864160213e-05, + "loss": 0.8109, + "step": 1487 + }, + { + "epoch": 0.15199182839632278, + "grad_norm": 1.3743314215042068, + "learning_rate": 1.922990825026806e-05, + "loss": 0.6227, + "step": 1488 + }, + { + "epoch": 0.15209397344228806, + "grad_norm": 1.5344504938757169, + "learning_rate": 1.9228634626156486e-05, + "loss": 0.8438, + "step": 1489 + }, + { + "epoch": 0.15219611848825332, + "grad_norm": 1.482415502182226, + "learning_rate": 1.9227359991964892e-05, + "loss": 0.7681, + "step": 1490 + }, + { + "epoch": 0.1522982635342186, + "grad_norm": 1.5620482590062323, + "learning_rate": 1.9226084347832784e-05, + "loss": 0.7147, + "step": 1491 + }, + { + "epoch": 0.15240040858018386, + "grad_norm": 1.3812474160284345, + "learning_rate": 1.9224807693899784e-05, + "loss": 0.85, + "step": 1492 + }, + { + "epoch": 0.15250255362614915, + "grad_norm": 1.5578333842420518, + "learning_rate": 1.922353003030562e-05, + "loss": 0.7358, + "step": 1493 + }, + { + "epoch": 0.1526046986721144, + "grad_norm": 1.4892093037434606, + "learning_rate": 1.9222251357190135e-05, + "loss": 0.6208, + "step": 1494 + }, + { + "epoch": 0.15270684371807966, + "grad_norm": 1.4776823028883899, + "learning_rate": 1.922097167469328e-05, + "loss": 0.7417, + "step": 1495 + }, + { + "epoch": 0.15280898876404495, + "grad_norm": 1.5491537611185378, + "learning_rate": 1.921969098295512e-05, + "loss": 0.7588, + "step": 1496 + }, + { + "epoch": 0.1529111338100102, + "grad_norm": 1.4044560034990186, + "learning_rate": 1.9218409282115823e-05, + "loss": 0.7251, + "step": 1497 + }, + { + "epoch": 0.1530132788559755, + "grad_norm": 1.7215135328411288, + "learning_rate": 1.9217126572315677e-05, + "loss": 0.8966, + "step": 1498 + }, + { + "epoch": 0.15311542390194074, + "grad_norm": 1.5991875664048536, + "learning_rate": 1.921584285369507e-05, + "loss": 0.7446, + "step": 1499 + }, + { + "epoch": 0.15321756894790603, + "grad_norm": 1.380693499610882, + "learning_rate": 1.921455812639451e-05, + "loss": 0.7235, + "step": 1500 + }, + { + "epoch": 0.1533197139938713, + "grad_norm": 1.646195549493454, + "learning_rate": 1.9213272390554608e-05, + "loss": 0.7211, + "step": 1501 + }, + { + "epoch": 0.15342185903983657, + "grad_norm": 1.5680388039401867, + "learning_rate": 1.9211985646316094e-05, + "loss": 0.7268, + "step": 1502 + }, + { + "epoch": 0.15352400408580183, + "grad_norm": 1.5685827032315967, + "learning_rate": 1.9210697893819795e-05, + "loss": 0.7977, + "step": 1503 + }, + { + "epoch": 0.15362614913176711, + "grad_norm": 1.5395566516193993, + "learning_rate": 1.9209409133206662e-05, + "loss": 0.7629, + "step": 1504 + }, + { + "epoch": 0.15372829417773237, + "grad_norm": 1.615791092400343, + "learning_rate": 1.920811936461775e-05, + "loss": 0.7218, + "step": 1505 + }, + { + "epoch": 0.15383043922369766, + "grad_norm": 1.6597999014845815, + "learning_rate": 1.9206828588194228e-05, + "loss": 0.7826, + "step": 1506 + }, + { + "epoch": 0.15393258426966291, + "grad_norm": 1.6256963903646897, + "learning_rate": 1.920553680407736e-05, + "loss": 0.7556, + "step": 1507 + }, + { + "epoch": 0.1540347293156282, + "grad_norm": 1.3284838058199782, + "learning_rate": 1.920424401240855e-05, + "loss": 0.6448, + "step": 1508 + }, + { + "epoch": 0.15413687436159346, + "grad_norm": 1.6595027537997935, + "learning_rate": 1.9202950213329282e-05, + "loss": 0.7422, + "step": 1509 + }, + { + "epoch": 0.15423901940755874, + "grad_norm": 1.5061231992012265, + "learning_rate": 1.9201655406981167e-05, + "loss": 0.6989, + "step": 1510 + }, + { + "epoch": 0.154341164453524, + "grad_norm": 1.5240042211537637, + "learning_rate": 1.9200359593505925e-05, + "loss": 0.8295, + "step": 1511 + }, + { + "epoch": 0.15444330949948928, + "grad_norm": 1.6374454769112932, + "learning_rate": 1.9199062773045378e-05, + "loss": 0.7429, + "step": 1512 + }, + { + "epoch": 0.15454545454545454, + "grad_norm": 1.4122754474247312, + "learning_rate": 1.9197764945741467e-05, + "loss": 0.7814, + "step": 1513 + }, + { + "epoch": 0.15464759959141983, + "grad_norm": 1.4822959702434024, + "learning_rate": 1.9196466111736245e-05, + "loss": 0.6865, + "step": 1514 + }, + { + "epoch": 0.15474974463738508, + "grad_norm": 1.5595195495231793, + "learning_rate": 1.919516627117186e-05, + "loss": 0.7121, + "step": 1515 + }, + { + "epoch": 0.15485188968335037, + "grad_norm": 1.3592413632360343, + "learning_rate": 1.919386542419059e-05, + "loss": 0.7111, + "step": 1516 + }, + { + "epoch": 0.15495403472931563, + "grad_norm": 1.5174767150464523, + "learning_rate": 1.9192563570934805e-05, + "loss": 0.7298, + "step": 1517 + }, + { + "epoch": 0.1550561797752809, + "grad_norm": 1.4459753236903439, + "learning_rate": 1.9191260711547003e-05, + "loss": 0.8273, + "step": 1518 + }, + { + "epoch": 0.15515832482124617, + "grad_norm": 1.4302288315410678, + "learning_rate": 1.9189956846169774e-05, + "loss": 0.6923, + "step": 1519 + }, + { + "epoch": 0.15526046986721145, + "grad_norm": 1.5742852400191951, + "learning_rate": 1.918865197494583e-05, + "loss": 0.6542, + "step": 1520 + }, + { + "epoch": 0.1553626149131767, + "grad_norm": 1.4717277874606733, + "learning_rate": 1.9187346098017993e-05, + "loss": 0.8227, + "step": 1521 + }, + { + "epoch": 0.15546475995914197, + "grad_norm": 1.6055055790888269, + "learning_rate": 1.918603921552919e-05, + "loss": 0.8173, + "step": 1522 + }, + { + "epoch": 0.15556690500510725, + "grad_norm": 1.5319015388508856, + "learning_rate": 1.918473132762246e-05, + "loss": 0.6821, + "step": 1523 + }, + { + "epoch": 0.1556690500510725, + "grad_norm": 1.6163640272912092, + "learning_rate": 1.9183422434440953e-05, + "loss": 0.7904, + "step": 1524 + }, + { + "epoch": 0.1557711950970378, + "grad_norm": 1.6624114628896023, + "learning_rate": 1.9182112536127925e-05, + "loss": 0.8284, + "step": 1525 + }, + { + "epoch": 0.15587334014300305, + "grad_norm": 1.3443132516635576, + "learning_rate": 1.918080163282675e-05, + "loss": 0.6762, + "step": 1526 + }, + { + "epoch": 0.15597548518896834, + "grad_norm": 1.5798935339418345, + "learning_rate": 1.91794897246809e-05, + "loss": 0.7859, + "step": 1527 + }, + { + "epoch": 0.1560776302349336, + "grad_norm": 1.5076642313916433, + "learning_rate": 1.917817681183397e-05, + "loss": 0.6371, + "step": 1528 + }, + { + "epoch": 0.15617977528089888, + "grad_norm": 1.5151495360068639, + "learning_rate": 1.917686289442966e-05, + "loss": 0.7391, + "step": 1529 + }, + { + "epoch": 0.15628192032686414, + "grad_norm": 1.4692456111997299, + "learning_rate": 1.917554797261178e-05, + "loss": 0.7334, + "step": 1530 + }, + { + "epoch": 0.15638406537282942, + "grad_norm": 1.5147098174425102, + "learning_rate": 1.9174232046524245e-05, + "loss": 0.7338, + "step": 1531 + }, + { + "epoch": 0.15648621041879468, + "grad_norm": 1.5380149503605998, + "learning_rate": 1.9172915116311083e-05, + "loss": 0.7689, + "step": 1532 + }, + { + "epoch": 0.15658835546475997, + "grad_norm": 1.63547965910899, + "learning_rate": 1.9171597182116434e-05, + "loss": 0.7391, + "step": 1533 + }, + { + "epoch": 0.15669050051072522, + "grad_norm": 1.7614095923960307, + "learning_rate": 1.917027824408455e-05, + "loss": 0.7361, + "step": 1534 + }, + { + "epoch": 0.1567926455566905, + "grad_norm": 1.6000708403381863, + "learning_rate": 1.9168958302359785e-05, + "loss": 0.7581, + "step": 1535 + }, + { + "epoch": 0.15689479060265576, + "grad_norm": 1.4446067445378494, + "learning_rate": 1.9167637357086614e-05, + "loss": 0.723, + "step": 1536 + }, + { + "epoch": 0.15699693564862105, + "grad_norm": 1.4763492302373629, + "learning_rate": 1.9166315408409608e-05, + "loss": 0.7449, + "step": 1537 + }, + { + "epoch": 0.1570990806945863, + "grad_norm": 1.5607560889595977, + "learning_rate": 1.916499245647346e-05, + "loss": 0.6792, + "step": 1538 + }, + { + "epoch": 0.1572012257405516, + "grad_norm": 1.417972093858151, + "learning_rate": 1.9163668501422966e-05, + "loss": 0.6908, + "step": 1539 + }, + { + "epoch": 0.15730337078651685, + "grad_norm": 1.5048630223013457, + "learning_rate": 1.9162343543403032e-05, + "loss": 0.6314, + "step": 1540 + }, + { + "epoch": 0.15740551583248213, + "grad_norm": 1.4791391080528098, + "learning_rate": 1.9161017582558678e-05, + "loss": 0.7655, + "step": 1541 + }, + { + "epoch": 0.1575076608784474, + "grad_norm": 1.3976669928793843, + "learning_rate": 1.9159690619035034e-05, + "loss": 0.7092, + "step": 1542 + }, + { + "epoch": 0.15760980592441268, + "grad_norm": 1.5121925015100246, + "learning_rate": 1.9158362652977332e-05, + "loss": 0.7172, + "step": 1543 + }, + { + "epoch": 0.15771195097037793, + "grad_norm": 1.5281812662543834, + "learning_rate": 1.915703368453092e-05, + "loss": 0.8073, + "step": 1544 + }, + { + "epoch": 0.15781409601634322, + "grad_norm": 1.3753627719149022, + "learning_rate": 1.9155703713841257e-05, + "loss": 0.6972, + "step": 1545 + }, + { + "epoch": 0.15791624106230848, + "grad_norm": 1.4781102825009205, + "learning_rate": 1.9154372741053904e-05, + "loss": 0.7872, + "step": 1546 + }, + { + "epoch": 0.15801838610827376, + "grad_norm": 1.5051689086469504, + "learning_rate": 1.915304076631454e-05, + "loss": 0.6509, + "step": 1547 + }, + { + "epoch": 0.15812053115423902, + "grad_norm": 1.614441452147944, + "learning_rate": 1.9151707789768954e-05, + "loss": 0.8859, + "step": 1548 + }, + { + "epoch": 0.15822267620020428, + "grad_norm": 1.5911981379130813, + "learning_rate": 1.9150373811563038e-05, + "loss": 0.8913, + "step": 1549 + }, + { + "epoch": 0.15832482124616956, + "grad_norm": 1.5358565625034422, + "learning_rate": 1.9149038831842793e-05, + "loss": 0.7916, + "step": 1550 + }, + { + "epoch": 0.15842696629213482, + "grad_norm": 1.3837364232136753, + "learning_rate": 1.9147702850754338e-05, + "loss": 0.6278, + "step": 1551 + }, + { + "epoch": 0.1585291113381001, + "grad_norm": 1.480022898686007, + "learning_rate": 1.9146365868443895e-05, + "loss": 0.7792, + "step": 1552 + }, + { + "epoch": 0.15863125638406536, + "grad_norm": 1.6377686966321894, + "learning_rate": 1.9145027885057802e-05, + "loss": 0.8093, + "step": 1553 + }, + { + "epoch": 0.15873340143003065, + "grad_norm": 1.3155003021820721, + "learning_rate": 1.914368890074249e-05, + "loss": 0.6804, + "step": 1554 + }, + { + "epoch": 0.1588355464759959, + "grad_norm": 1.4193701981551832, + "learning_rate": 1.914234891564453e-05, + "loss": 0.8151, + "step": 1555 + }, + { + "epoch": 0.1589376915219612, + "grad_norm": 1.4887780671077047, + "learning_rate": 1.914100792991057e-05, + "loss": 0.789, + "step": 1556 + }, + { + "epoch": 0.15903983656792645, + "grad_norm": 1.5612249613266986, + "learning_rate": 1.9139665943687386e-05, + "loss": 0.7155, + "step": 1557 + }, + { + "epoch": 0.15914198161389173, + "grad_norm": 1.4921441466224368, + "learning_rate": 1.9138322957121863e-05, + "loss": 0.7558, + "step": 1558 + }, + { + "epoch": 0.159244126659857, + "grad_norm": 1.4604607455546217, + "learning_rate": 1.9136978970360985e-05, + "loss": 0.7232, + "step": 1559 + }, + { + "epoch": 0.15934627170582227, + "grad_norm": 1.47070213255059, + "learning_rate": 1.9135633983551853e-05, + "loss": 0.7714, + "step": 1560 + }, + { + "epoch": 0.15944841675178753, + "grad_norm": 1.592211691002247, + "learning_rate": 1.9134287996841683e-05, + "loss": 0.81, + "step": 1561 + }, + { + "epoch": 0.15955056179775282, + "grad_norm": 1.4377448699102515, + "learning_rate": 1.913294101037779e-05, + "loss": 0.6944, + "step": 1562 + }, + { + "epoch": 0.15965270684371807, + "grad_norm": 1.455325051217147, + "learning_rate": 1.9131593024307602e-05, + "loss": 0.7738, + "step": 1563 + }, + { + "epoch": 0.15975485188968336, + "grad_norm": 1.481435674067964, + "learning_rate": 1.9130244038778658e-05, + "loss": 0.7416, + "step": 1564 + }, + { + "epoch": 0.15985699693564862, + "grad_norm": 1.546720915440481, + "learning_rate": 1.9128894053938603e-05, + "loss": 0.766, + "step": 1565 + }, + { + "epoch": 0.1599591419816139, + "grad_norm": 1.4836755370296506, + "learning_rate": 1.9127543069935198e-05, + "loss": 0.794, + "step": 1566 + }, + { + "epoch": 0.16006128702757916, + "grad_norm": 1.5141990875593103, + "learning_rate": 1.912619108691631e-05, + "loss": 0.7479, + "step": 1567 + }, + { + "epoch": 0.16016343207354444, + "grad_norm": 1.6818059126281242, + "learning_rate": 1.9124838105029904e-05, + "loss": 0.8669, + "step": 1568 + }, + { + "epoch": 0.1602655771195097, + "grad_norm": 1.4416524069653935, + "learning_rate": 1.9123484124424075e-05, + "loss": 0.7633, + "step": 1569 + }, + { + "epoch": 0.16036772216547499, + "grad_norm": 1.4671853293715984, + "learning_rate": 1.9122129145247018e-05, + "loss": 0.6866, + "step": 1570 + }, + { + "epoch": 0.16046986721144024, + "grad_norm": 1.454264529064163, + "learning_rate": 1.9120773167647025e-05, + "loss": 0.7947, + "step": 1571 + }, + { + "epoch": 0.16057201225740553, + "grad_norm": 1.4938966085776046, + "learning_rate": 1.9119416191772524e-05, + "loss": 0.7928, + "step": 1572 + }, + { + "epoch": 0.16067415730337078, + "grad_norm": 1.5570766453747198, + "learning_rate": 1.9118058217772023e-05, + "loss": 0.8438, + "step": 1573 + }, + { + "epoch": 0.16077630234933607, + "grad_norm": 1.4767403309856386, + "learning_rate": 1.9116699245794162e-05, + "loss": 0.8599, + "step": 1574 + }, + { + "epoch": 0.16087844739530133, + "grad_norm": 1.3723243203124202, + "learning_rate": 1.9115339275987678e-05, + "loss": 0.6862, + "step": 1575 + }, + { + "epoch": 0.1609805924412666, + "grad_norm": 1.3514468478506851, + "learning_rate": 1.911397830850142e-05, + "loss": 0.6456, + "step": 1576 + }, + { + "epoch": 0.16108273748723187, + "grad_norm": 1.5004921041519557, + "learning_rate": 1.911261634348435e-05, + "loss": 0.7983, + "step": 1577 + }, + { + "epoch": 0.16118488253319713, + "grad_norm": 1.4344489601454746, + "learning_rate": 1.911125338108553e-05, + "loss": 0.7563, + "step": 1578 + }, + { + "epoch": 0.1612870275791624, + "grad_norm": 1.5290196833232637, + "learning_rate": 1.9109889421454143e-05, + "loss": 0.7134, + "step": 1579 + }, + { + "epoch": 0.16138917262512767, + "grad_norm": 1.4947955057209659, + "learning_rate": 1.9108524464739474e-05, + "loss": 0.8382, + "step": 1580 + }, + { + "epoch": 0.16149131767109295, + "grad_norm": 1.4963982353622198, + "learning_rate": 1.9107158511090916e-05, + "loss": 0.7365, + "step": 1581 + }, + { + "epoch": 0.1615934627170582, + "grad_norm": 1.528699384509226, + "learning_rate": 1.9105791560657977e-05, + "loss": 0.7352, + "step": 1582 + }, + { + "epoch": 0.1616956077630235, + "grad_norm": 1.4260632097602914, + "learning_rate": 1.9104423613590266e-05, + "loss": 0.6898, + "step": 1583 + }, + { + "epoch": 0.16179775280898875, + "grad_norm": 1.6464849610678862, + "learning_rate": 1.910305467003751e-05, + "loss": 0.7349, + "step": 1584 + }, + { + "epoch": 0.16189989785495404, + "grad_norm": 1.4892312291190488, + "learning_rate": 1.9101684730149536e-05, + "loss": 0.6974, + "step": 1585 + }, + { + "epoch": 0.1620020429009193, + "grad_norm": 1.5715725767644013, + "learning_rate": 1.910031379407629e-05, + "loss": 0.648, + "step": 1586 + }, + { + "epoch": 0.16210418794688458, + "grad_norm": 1.5415212504769706, + "learning_rate": 1.9098941861967822e-05, + "loss": 0.7375, + "step": 1587 + }, + { + "epoch": 0.16220633299284984, + "grad_norm": 1.5394846204242916, + "learning_rate": 1.9097568933974283e-05, + "loss": 0.754, + "step": 1588 + }, + { + "epoch": 0.16230847803881512, + "grad_norm": 1.6122890734647868, + "learning_rate": 1.909619501024595e-05, + "loss": 0.7641, + "step": 1589 + }, + { + "epoch": 0.16241062308478038, + "grad_norm": 1.5247764241788768, + "learning_rate": 1.90948200909332e-05, + "loss": 0.8328, + "step": 1590 + }, + { + "epoch": 0.16251276813074567, + "grad_norm": 1.5472102284933233, + "learning_rate": 1.909344417618651e-05, + "loss": 0.7388, + "step": 1591 + }, + { + "epoch": 0.16261491317671092, + "grad_norm": 1.615325720855714, + "learning_rate": 1.909206726615648e-05, + "loss": 0.7925, + "step": 1592 + }, + { + "epoch": 0.1627170582226762, + "grad_norm": 1.6082703249241432, + "learning_rate": 1.9090689360993814e-05, + "loss": 0.8157, + "step": 1593 + }, + { + "epoch": 0.16281920326864147, + "grad_norm": 1.6368737751832967, + "learning_rate": 1.9089310460849323e-05, + "loss": 0.8331, + "step": 1594 + }, + { + "epoch": 0.16292134831460675, + "grad_norm": 1.436017136385229, + "learning_rate": 1.9087930565873933e-05, + "loss": 0.7167, + "step": 1595 + }, + { + "epoch": 0.163023493360572, + "grad_norm": 1.5541491630484596, + "learning_rate": 1.908654967621867e-05, + "loss": 0.7357, + "step": 1596 + }, + { + "epoch": 0.1631256384065373, + "grad_norm": 1.587327283240723, + "learning_rate": 1.9085167792034672e-05, + "loss": 0.7445, + "step": 1597 + }, + { + "epoch": 0.16322778345250255, + "grad_norm": 1.3306423415552662, + "learning_rate": 1.908378491347319e-05, + "loss": 0.7349, + "step": 1598 + }, + { + "epoch": 0.16332992849846784, + "grad_norm": 1.5799631880135903, + "learning_rate": 1.9082401040685583e-05, + "loss": 0.8166, + "step": 1599 + }, + { + "epoch": 0.1634320735444331, + "grad_norm": 1.4433442551386697, + "learning_rate": 1.908101617382331e-05, + "loss": 0.7399, + "step": 1600 + }, + { + "epoch": 0.16353421859039838, + "grad_norm": 1.6791272776834318, + "learning_rate": 1.9079630313037954e-05, + "loss": 0.7634, + "step": 1601 + }, + { + "epoch": 0.16363636363636364, + "grad_norm": 1.4695100875587683, + "learning_rate": 1.9078243458481188e-05, + "loss": 0.6476, + "step": 1602 + }, + { + "epoch": 0.16373850868232892, + "grad_norm": 1.5343137150417558, + "learning_rate": 1.9076855610304817e-05, + "loss": 0.7994, + "step": 1603 + }, + { + "epoch": 0.16384065372829418, + "grad_norm": 1.3965652394025752, + "learning_rate": 1.907546676866073e-05, + "loss": 0.6857, + "step": 1604 + }, + { + "epoch": 0.16394279877425944, + "grad_norm": 1.6047653580553858, + "learning_rate": 1.9074076933700944e-05, + "loss": 0.7206, + "step": 1605 + }, + { + "epoch": 0.16404494382022472, + "grad_norm": 1.48974091554732, + "learning_rate": 1.9072686105577574e-05, + "loss": 0.7064, + "step": 1606 + }, + { + "epoch": 0.16414708886618998, + "grad_norm": 1.5917365928271061, + "learning_rate": 1.907129428444285e-05, + "loss": 0.862, + "step": 1607 + }, + { + "epoch": 0.16424923391215526, + "grad_norm": 1.4433989530460103, + "learning_rate": 1.9069901470449107e-05, + "loss": 0.7456, + "step": 1608 + }, + { + "epoch": 0.16435137895812052, + "grad_norm": 1.4111019303125132, + "learning_rate": 1.9068507663748785e-05, + "loss": 0.749, + "step": 1609 + }, + { + "epoch": 0.1644535240040858, + "grad_norm": 1.4841416599529411, + "learning_rate": 1.906711286449444e-05, + "loss": 0.7686, + "step": 1610 + }, + { + "epoch": 0.16455566905005106, + "grad_norm": 1.3265517359956176, + "learning_rate": 1.9065717072838734e-05, + "loss": 0.6114, + "step": 1611 + }, + { + "epoch": 0.16465781409601635, + "grad_norm": 1.4405609255588618, + "learning_rate": 1.906432028893444e-05, + "loss": 0.625, + "step": 1612 + }, + { + "epoch": 0.1647599591419816, + "grad_norm": 1.3942795525285383, + "learning_rate": 1.9062922512934432e-05, + "loss": 0.7814, + "step": 1613 + }, + { + "epoch": 0.1648621041879469, + "grad_norm": 1.6826560793806584, + "learning_rate": 1.9061523744991698e-05, + "loss": 0.7932, + "step": 1614 + }, + { + "epoch": 0.16496424923391215, + "grad_norm": 1.3958695304113196, + "learning_rate": 1.906012398525934e-05, + "loss": 0.6472, + "step": 1615 + }, + { + "epoch": 0.16506639427987743, + "grad_norm": 1.4853656218634286, + "learning_rate": 1.905872323389055e-05, + "loss": 0.7572, + "step": 1616 + }, + { + "epoch": 0.1651685393258427, + "grad_norm": 1.5614366654222802, + "learning_rate": 1.905732149103866e-05, + "loss": 0.7935, + "step": 1617 + }, + { + "epoch": 0.16527068437180797, + "grad_norm": 1.5425771245399245, + "learning_rate": 1.9055918756857075e-05, + "loss": 0.7606, + "step": 1618 + }, + { + "epoch": 0.16537282941777323, + "grad_norm": 1.473843224768117, + "learning_rate": 1.9054515031499332e-05, + "loss": 0.7424, + "step": 1619 + }, + { + "epoch": 0.16547497446373852, + "grad_norm": 1.6776054540734124, + "learning_rate": 1.9053110315119068e-05, + "loss": 0.6987, + "step": 1620 + }, + { + "epoch": 0.16557711950970377, + "grad_norm": 1.4237983479845966, + "learning_rate": 1.905170460787003e-05, + "loss": 0.7126, + "step": 1621 + }, + { + "epoch": 0.16567926455566906, + "grad_norm": 1.5458059101262946, + "learning_rate": 1.9050297909906077e-05, + "loss": 0.7562, + "step": 1622 + }, + { + "epoch": 0.16578140960163432, + "grad_norm": 1.3695219129668585, + "learning_rate": 1.904889022138117e-05, + "loss": 0.7346, + "step": 1623 + }, + { + "epoch": 0.1658835546475996, + "grad_norm": 1.4675259058785992, + "learning_rate": 1.9047481542449384e-05, + "loss": 0.7778, + "step": 1624 + }, + { + "epoch": 0.16598569969356486, + "grad_norm": 1.3920339858865147, + "learning_rate": 1.9046071873264895e-05, + "loss": 0.6954, + "step": 1625 + }, + { + "epoch": 0.16608784473953014, + "grad_norm": 1.5396115403901631, + "learning_rate": 1.9044661213981994e-05, + "loss": 0.8501, + "step": 1626 + }, + { + "epoch": 0.1661899897854954, + "grad_norm": 1.3931308818360806, + "learning_rate": 1.9043249564755082e-05, + "loss": 0.6029, + "step": 1627 + }, + { + "epoch": 0.1662921348314607, + "grad_norm": 1.539706105291121, + "learning_rate": 1.9041836925738662e-05, + "loss": 0.8168, + "step": 1628 + }, + { + "epoch": 0.16639427987742594, + "grad_norm": 1.4326754397199128, + "learning_rate": 1.9040423297087348e-05, + "loss": 0.8848, + "step": 1629 + }, + { + "epoch": 0.16649642492339123, + "grad_norm": 1.5637292629946007, + "learning_rate": 1.9039008678955864e-05, + "loss": 0.8295, + "step": 1630 + }, + { + "epoch": 0.16659856996935649, + "grad_norm": 1.4435106433332063, + "learning_rate": 1.903759307149904e-05, + "loss": 0.7546, + "step": 1631 + }, + { + "epoch": 0.16670071501532174, + "grad_norm": 1.6650833725835272, + "learning_rate": 1.9036176474871814e-05, + "loss": 0.859, + "step": 1632 + }, + { + "epoch": 0.16680286006128703, + "grad_norm": 1.4482629629279744, + "learning_rate": 1.9034758889229236e-05, + "loss": 0.6713, + "step": 1633 + }, + { + "epoch": 0.16690500510725229, + "grad_norm": 1.4366977213699657, + "learning_rate": 1.903334031472646e-05, + "loss": 0.6919, + "step": 1634 + }, + { + "epoch": 0.16700715015321757, + "grad_norm": 1.5007942142242667, + "learning_rate": 1.903192075151875e-05, + "loss": 0.8195, + "step": 1635 + }, + { + "epoch": 0.16710929519918283, + "grad_norm": 1.542868374325661, + "learning_rate": 1.903050019976148e-05, + "loss": 0.7311, + "step": 1636 + }, + { + "epoch": 0.1672114402451481, + "grad_norm": 1.5816541235893211, + "learning_rate": 1.9029078659610127e-05, + "loss": 0.72, + "step": 1637 + }, + { + "epoch": 0.16731358529111337, + "grad_norm": 1.4812070898416783, + "learning_rate": 1.902765613122028e-05, + "loss": 0.7582, + "step": 1638 + }, + { + "epoch": 0.16741573033707866, + "grad_norm": 1.4969046936565689, + "learning_rate": 1.9026232614747638e-05, + "loss": 0.8215, + "step": 1639 + }, + { + "epoch": 0.1675178753830439, + "grad_norm": 1.5196408117797398, + "learning_rate": 1.9024808110348006e-05, + "loss": 0.7909, + "step": 1640 + }, + { + "epoch": 0.1676200204290092, + "grad_norm": 1.5169849370018311, + "learning_rate": 1.9023382618177292e-05, + "loss": 0.7584, + "step": 1641 + }, + { + "epoch": 0.16772216547497446, + "grad_norm": 1.410582016029108, + "learning_rate": 1.9021956138391524e-05, + "loss": 0.7924, + "step": 1642 + }, + { + "epoch": 0.16782431052093974, + "grad_norm": 1.5233424308517216, + "learning_rate": 1.902052867114683e-05, + "loss": 0.785, + "step": 1643 + }, + { + "epoch": 0.167926455566905, + "grad_norm": 1.4748544634787273, + "learning_rate": 1.901910021659944e-05, + "loss": 0.7925, + "step": 1644 + }, + { + "epoch": 0.16802860061287028, + "grad_norm": 1.516866789335725, + "learning_rate": 1.9017670774905707e-05, + "loss": 0.7976, + "step": 1645 + }, + { + "epoch": 0.16813074565883554, + "grad_norm": 1.544873845940206, + "learning_rate": 1.901624034622208e-05, + "loss": 0.7765, + "step": 1646 + }, + { + "epoch": 0.16823289070480082, + "grad_norm": 1.3202588061972562, + "learning_rate": 1.9014808930705123e-05, + "loss": 0.7309, + "step": 1647 + }, + { + "epoch": 0.16833503575076608, + "grad_norm": 1.3568307837141995, + "learning_rate": 1.9013376528511504e-05, + "loss": 0.7539, + "step": 1648 + }, + { + "epoch": 0.16843718079673137, + "grad_norm": 1.3370193342875056, + "learning_rate": 1.9011943139797998e-05, + "loss": 0.7534, + "step": 1649 + }, + { + "epoch": 0.16853932584269662, + "grad_norm": 1.615637409148707, + "learning_rate": 1.9010508764721496e-05, + "loss": 0.7305, + "step": 1650 + }, + { + "epoch": 0.1686414708886619, + "grad_norm": 1.4473167641327425, + "learning_rate": 1.9009073403438988e-05, + "loss": 0.6791, + "step": 1651 + }, + { + "epoch": 0.16874361593462717, + "grad_norm": 1.4738993515134662, + "learning_rate": 1.9007637056107576e-05, + "loss": 0.7266, + "step": 1652 + }, + { + "epoch": 0.16884576098059245, + "grad_norm": 1.4243258978178772, + "learning_rate": 1.9006199722884465e-05, + "loss": 0.7606, + "step": 1653 + }, + { + "epoch": 0.1689479060265577, + "grad_norm": 1.6226558215822604, + "learning_rate": 1.9004761403926978e-05, + "loss": 0.734, + "step": 1654 + }, + { + "epoch": 0.169050051072523, + "grad_norm": 1.437669269080698, + "learning_rate": 1.9003322099392535e-05, + "loss": 0.7954, + "step": 1655 + }, + { + "epoch": 0.16915219611848825, + "grad_norm": 1.5897118148864038, + "learning_rate": 1.9001881809438677e-05, + "loss": 0.8397, + "step": 1656 + }, + { + "epoch": 0.16925434116445354, + "grad_norm": 1.4546214424000696, + "learning_rate": 1.9000440534223034e-05, + "loss": 0.7838, + "step": 1657 + }, + { + "epoch": 0.1693564862104188, + "grad_norm": 1.433737339585538, + "learning_rate": 1.899899827390336e-05, + "loss": 0.6751, + "step": 1658 + }, + { + "epoch": 0.16945863125638408, + "grad_norm": 1.4775422645084217, + "learning_rate": 1.8997555028637513e-05, + "loss": 0.7986, + "step": 1659 + }, + { + "epoch": 0.16956077630234934, + "grad_norm": 1.547474440855746, + "learning_rate": 1.8996110798583452e-05, + "loss": 0.7999, + "step": 1660 + }, + { + "epoch": 0.1696629213483146, + "grad_norm": 1.5037817648206524, + "learning_rate": 1.8994665583899256e-05, + "loss": 0.8058, + "step": 1661 + }, + { + "epoch": 0.16976506639427988, + "grad_norm": 1.555321904064513, + "learning_rate": 1.89932193847431e-05, + "loss": 0.7712, + "step": 1662 + }, + { + "epoch": 0.16986721144024514, + "grad_norm": 1.484447776692159, + "learning_rate": 1.8991772201273267e-05, + "loss": 0.8166, + "step": 1663 + }, + { + "epoch": 0.16996935648621042, + "grad_norm": 1.3467747148005869, + "learning_rate": 1.899032403364816e-05, + "loss": 0.747, + "step": 1664 + }, + { + "epoch": 0.17007150153217568, + "grad_norm": 1.9864911667709322, + "learning_rate": 1.898887488202628e-05, + "loss": 0.7765, + "step": 1665 + }, + { + "epoch": 0.17017364657814096, + "grad_norm": 1.7092137999296921, + "learning_rate": 1.8987424746566237e-05, + "loss": 0.8286, + "step": 1666 + }, + { + "epoch": 0.17027579162410622, + "grad_norm": 1.5092524002994716, + "learning_rate": 1.8985973627426747e-05, + "loss": 0.8011, + "step": 1667 + }, + { + "epoch": 0.1703779366700715, + "grad_norm": 1.5219706444123955, + "learning_rate": 1.898452152476664e-05, + "loss": 0.6489, + "step": 1668 + }, + { + "epoch": 0.17048008171603676, + "grad_norm": 1.5244824046008114, + "learning_rate": 1.8983068438744846e-05, + "loss": 0.8236, + "step": 1669 + }, + { + "epoch": 0.17058222676200205, + "grad_norm": 1.4897687453708552, + "learning_rate": 1.8981614369520406e-05, + "loss": 0.7701, + "step": 1670 + }, + { + "epoch": 0.1706843718079673, + "grad_norm": 1.38929744303368, + "learning_rate": 1.8980159317252473e-05, + "loss": 0.7358, + "step": 1671 + }, + { + "epoch": 0.1707865168539326, + "grad_norm": 1.3616941405183018, + "learning_rate": 1.8978703282100298e-05, + "loss": 0.6559, + "step": 1672 + }, + { + "epoch": 0.17088866189989785, + "grad_norm": 1.4490928666444185, + "learning_rate": 1.8977246264223252e-05, + "loss": 0.7408, + "step": 1673 + }, + { + "epoch": 0.17099080694586313, + "grad_norm": 1.5254046801576682, + "learning_rate": 1.8975788263780797e-05, + "loss": 0.7193, + "step": 1674 + }, + { + "epoch": 0.1710929519918284, + "grad_norm": 1.401988719623699, + "learning_rate": 1.8974329280932522e-05, + "loss": 0.7341, + "step": 1675 + }, + { + "epoch": 0.17119509703779368, + "grad_norm": 1.370941036548307, + "learning_rate": 1.897286931583811e-05, + "loss": 0.7095, + "step": 1676 + }, + { + "epoch": 0.17129724208375893, + "grad_norm": 1.5717144347486354, + "learning_rate": 1.897140836865735e-05, + "loss": 0.743, + "step": 1677 + }, + { + "epoch": 0.17139938712972422, + "grad_norm": 1.5696759400880376, + "learning_rate": 1.896994643955015e-05, + "loss": 0.7576, + "step": 1678 + }, + { + "epoch": 0.17150153217568948, + "grad_norm": 1.595211228526451, + "learning_rate": 1.8968483528676515e-05, + "loss": 0.6652, + "step": 1679 + }, + { + "epoch": 0.17160367722165476, + "grad_norm": 1.5410575434653095, + "learning_rate": 1.8967019636196565e-05, + "loss": 0.7147, + "step": 1680 + }, + { + "epoch": 0.17170582226762002, + "grad_norm": 1.5802419666414576, + "learning_rate": 1.896555476227052e-05, + "loss": 0.7235, + "step": 1681 + }, + { + "epoch": 0.1718079673135853, + "grad_norm": 1.6342262656995694, + "learning_rate": 1.8964088907058717e-05, + "loss": 0.8215, + "step": 1682 + }, + { + "epoch": 0.17191011235955056, + "grad_norm": 1.461574624337787, + "learning_rate": 1.896262207072159e-05, + "loss": 0.7181, + "step": 1683 + }, + { + "epoch": 0.17201225740551584, + "grad_norm": 1.4962878716478465, + "learning_rate": 1.896115425341969e-05, + "loss": 0.7495, + "step": 1684 + }, + { + "epoch": 0.1721144024514811, + "grad_norm": 1.6056608198479452, + "learning_rate": 1.8959685455313663e-05, + "loss": 0.8148, + "step": 1685 + }, + { + "epoch": 0.1722165474974464, + "grad_norm": 1.4743101807622847, + "learning_rate": 1.8958215676564275e-05, + "loss": 0.8504, + "step": 1686 + }, + { + "epoch": 0.17231869254341164, + "grad_norm": 1.6917360973097622, + "learning_rate": 1.8956744917332394e-05, + "loss": 0.798, + "step": 1687 + }, + { + "epoch": 0.1724208375893769, + "grad_norm": 1.5956456734270041, + "learning_rate": 1.8955273177778996e-05, + "loss": 0.8106, + "step": 1688 + }, + { + "epoch": 0.1725229826353422, + "grad_norm": 1.5494555511141463, + "learning_rate": 1.895380045806516e-05, + "loss": 0.8154, + "step": 1689 + }, + { + "epoch": 0.17262512768130744, + "grad_norm": 1.4902731482896234, + "learning_rate": 1.8952326758352083e-05, + "loss": 0.7682, + "step": 1690 + }, + { + "epoch": 0.17272727272727273, + "grad_norm": 1.6816579178473003, + "learning_rate": 1.8950852078801058e-05, + "loss": 0.7391, + "step": 1691 + }, + { + "epoch": 0.172829417773238, + "grad_norm": 1.4523307965134538, + "learning_rate": 1.8949376419573484e-05, + "loss": 0.7163, + "step": 1692 + }, + { + "epoch": 0.17293156281920327, + "grad_norm": 1.8230617415071149, + "learning_rate": 1.8947899780830884e-05, + "loss": 0.6767, + "step": 1693 + }, + { + "epoch": 0.17303370786516853, + "grad_norm": 1.4893545332384173, + "learning_rate": 1.8946422162734872e-05, + "loss": 0.6906, + "step": 1694 + }, + { + "epoch": 0.17313585291113381, + "grad_norm": 1.6362313700989841, + "learning_rate": 1.8944943565447174e-05, + "loss": 0.7963, + "step": 1695 + }, + { + "epoch": 0.17323799795709907, + "grad_norm": 1.4728666981718914, + "learning_rate": 1.894346398912962e-05, + "loss": 0.7412, + "step": 1696 + }, + { + "epoch": 0.17334014300306436, + "grad_norm": 1.326485111681694, + "learning_rate": 1.894198343394416e-05, + "loss": 0.7192, + "step": 1697 + }, + { + "epoch": 0.1734422880490296, + "grad_norm": 1.3980486604699218, + "learning_rate": 1.894050190005283e-05, + "loss": 0.7413, + "step": 1698 + }, + { + "epoch": 0.1735444330949949, + "grad_norm": 1.5371689175028298, + "learning_rate": 1.8939019387617796e-05, + "loss": 0.7602, + "step": 1699 + }, + { + "epoch": 0.17364657814096016, + "grad_norm": 1.5481695900138497, + "learning_rate": 1.8937535896801312e-05, + "loss": 0.9276, + "step": 1700 + }, + { + "epoch": 0.17374872318692544, + "grad_norm": 1.5351098299443398, + "learning_rate": 1.893605142776575e-05, + "loss": 0.7257, + "step": 1701 + }, + { + "epoch": 0.1738508682328907, + "grad_norm": 1.6029943099877884, + "learning_rate": 1.8934565980673585e-05, + "loss": 0.7507, + "step": 1702 + }, + { + "epoch": 0.17395301327885598, + "grad_norm": 1.5307854744504417, + "learning_rate": 1.8933079555687402e-05, + "loss": 0.7397, + "step": 1703 + }, + { + "epoch": 0.17405515832482124, + "grad_norm": 1.5044294398950129, + "learning_rate": 1.893159215296989e-05, + "loss": 0.7666, + "step": 1704 + }, + { + "epoch": 0.17415730337078653, + "grad_norm": 1.3884329514997193, + "learning_rate": 1.8930103772683846e-05, + "loss": 0.7925, + "step": 1705 + }, + { + "epoch": 0.17425944841675178, + "grad_norm": 1.4064140369115274, + "learning_rate": 1.8928614414992173e-05, + "loss": 0.628, + "step": 1706 + }, + { + "epoch": 0.17436159346271707, + "grad_norm": 1.526064389183152, + "learning_rate": 1.8927124080057884e-05, + "loss": 0.7506, + "step": 1707 + }, + { + "epoch": 0.17446373850868233, + "grad_norm": 1.6120619424463882, + "learning_rate": 1.89256327680441e-05, + "loss": 0.7424, + "step": 1708 + }, + { + "epoch": 0.1745658835546476, + "grad_norm": 1.4316899499049005, + "learning_rate": 1.8924140479114043e-05, + "loss": 0.7658, + "step": 1709 + }, + { + "epoch": 0.17466802860061287, + "grad_norm": 1.49541935024094, + "learning_rate": 1.892264721343104e-05, + "loss": 0.7514, + "step": 1710 + }, + { + "epoch": 0.17477017364657815, + "grad_norm": 1.5955335117602674, + "learning_rate": 1.8921152971158537e-05, + "loss": 0.7183, + "step": 1711 + }, + { + "epoch": 0.1748723186925434, + "grad_norm": 1.6931673273060281, + "learning_rate": 1.891965775246008e-05, + "loss": 0.7434, + "step": 1712 + }, + { + "epoch": 0.1749744637385087, + "grad_norm": 1.6245549274268523, + "learning_rate": 1.8918161557499316e-05, + "loss": 0.7578, + "step": 1713 + }, + { + "epoch": 0.17507660878447395, + "grad_norm": 1.466628674040457, + "learning_rate": 1.8916664386440008e-05, + "loss": 0.8323, + "step": 1714 + }, + { + "epoch": 0.1751787538304392, + "grad_norm": 1.4580569827536138, + "learning_rate": 1.8915166239446024e-05, + "loss": 0.7374, + "step": 1715 + }, + { + "epoch": 0.1752808988764045, + "grad_norm": 1.5609216741974261, + "learning_rate": 1.8913667116681334e-05, + "loss": 0.8141, + "step": 1716 + }, + { + "epoch": 0.17538304392236975, + "grad_norm": 1.360626923508468, + "learning_rate": 1.8912167018310018e-05, + "loss": 0.7411, + "step": 1717 + }, + { + "epoch": 0.17548518896833504, + "grad_norm": 1.4238521757524139, + "learning_rate": 1.8910665944496264e-05, + "loss": 0.678, + "step": 1718 + }, + { + "epoch": 0.1755873340143003, + "grad_norm": 1.4592700507641492, + "learning_rate": 1.8909163895404367e-05, + "loss": 0.6802, + "step": 1719 + }, + { + "epoch": 0.17568947906026558, + "grad_norm": 1.6002946149476283, + "learning_rate": 1.8907660871198725e-05, + "loss": 0.8288, + "step": 1720 + }, + { + "epoch": 0.17579162410623084, + "grad_norm": 1.447649547321314, + "learning_rate": 1.8906156872043846e-05, + "loss": 0.6893, + "step": 1721 + }, + { + "epoch": 0.17589376915219612, + "grad_norm": 1.3471277256394956, + "learning_rate": 1.8904651898104346e-05, + "loss": 0.6609, + "step": 1722 + }, + { + "epoch": 0.17599591419816138, + "grad_norm": 1.632667771473945, + "learning_rate": 1.8903145949544935e-05, + "loss": 0.7966, + "step": 1723 + }, + { + "epoch": 0.17609805924412666, + "grad_norm": 1.3770684692710315, + "learning_rate": 1.8901639026530453e-05, + "loss": 0.8224, + "step": 1724 + }, + { + "epoch": 0.17620020429009192, + "grad_norm": 1.4596555305124719, + "learning_rate": 1.8900131129225827e-05, + "loss": 0.786, + "step": 1725 + }, + { + "epoch": 0.1763023493360572, + "grad_norm": 1.344385851415088, + "learning_rate": 1.88986222577961e-05, + "loss": 0.6983, + "step": 1726 + }, + { + "epoch": 0.17640449438202246, + "grad_norm": 1.645206585394134, + "learning_rate": 1.8897112412406415e-05, + "loss": 0.7984, + "step": 1727 + }, + { + "epoch": 0.17650663942798775, + "grad_norm": 1.4308462162760553, + "learning_rate": 1.889560159322203e-05, + "loss": 0.7816, + "step": 1728 + }, + { + "epoch": 0.176608784473953, + "grad_norm": 1.498765778288011, + "learning_rate": 1.8894089800408302e-05, + "loss": 0.7535, + "step": 1729 + }, + { + "epoch": 0.1767109295199183, + "grad_norm": 1.5602934755552544, + "learning_rate": 1.8892577034130704e-05, + "loss": 0.7742, + "step": 1730 + }, + { + "epoch": 0.17681307456588355, + "grad_norm": 1.3375155651194042, + "learning_rate": 1.8891063294554798e-05, + "loss": 0.7397, + "step": 1731 + }, + { + "epoch": 0.17691521961184883, + "grad_norm": 1.500351151931169, + "learning_rate": 1.888954858184627e-05, + "loss": 0.6575, + "step": 1732 + }, + { + "epoch": 0.1770173646578141, + "grad_norm": 1.4751098173716477, + "learning_rate": 1.888803289617091e-05, + "loss": 0.838, + "step": 1733 + }, + { + "epoch": 0.17711950970377938, + "grad_norm": 1.3106026967193014, + "learning_rate": 1.888651623769461e-05, + "loss": 0.687, + "step": 1734 + }, + { + "epoch": 0.17722165474974463, + "grad_norm": 1.3567574603849475, + "learning_rate": 1.888499860658336e-05, + "loss": 0.6326, + "step": 1735 + }, + { + "epoch": 0.17732379979570992, + "grad_norm": 1.5292129666665577, + "learning_rate": 1.8883480003003272e-05, + "loss": 0.8152, + "step": 1736 + }, + { + "epoch": 0.17742594484167518, + "grad_norm": 1.403810663265192, + "learning_rate": 1.8881960427120562e-05, + "loss": 0.6388, + "step": 1737 + }, + { + "epoch": 0.17752808988764046, + "grad_norm": 1.330945086995526, + "learning_rate": 1.8880439879101543e-05, + "loss": 0.6775, + "step": 1738 + }, + { + "epoch": 0.17763023493360572, + "grad_norm": 1.5331745981873093, + "learning_rate": 1.8878918359112644e-05, + "loss": 0.8849, + "step": 1739 + }, + { + "epoch": 0.177732379979571, + "grad_norm": 1.556113685371182, + "learning_rate": 1.8877395867320392e-05, + "loss": 0.697, + "step": 1740 + }, + { + "epoch": 0.17783452502553626, + "grad_norm": 1.4319459537640808, + "learning_rate": 1.8875872403891425e-05, + "loss": 0.6804, + "step": 1741 + }, + { + "epoch": 0.17793667007150155, + "grad_norm": 1.555378852452123, + "learning_rate": 1.8874347968992493e-05, + "loss": 0.7376, + "step": 1742 + }, + { + "epoch": 0.1780388151174668, + "grad_norm": 1.3664630482299116, + "learning_rate": 1.887282256279044e-05, + "loss": 0.7259, + "step": 1743 + }, + { + "epoch": 0.17814096016343206, + "grad_norm": 1.460144219999322, + "learning_rate": 1.8871296185452225e-05, + "loss": 0.7497, + "step": 1744 + }, + { + "epoch": 0.17824310520939735, + "grad_norm": 1.5321043045469942, + "learning_rate": 1.8869768837144908e-05, + "loss": 0.6947, + "step": 1745 + }, + { + "epoch": 0.1783452502553626, + "grad_norm": 1.5054494438453947, + "learning_rate": 1.8868240518035667e-05, + "loss": 0.7661, + "step": 1746 + }, + { + "epoch": 0.1784473953013279, + "grad_norm": 1.621487456620026, + "learning_rate": 1.8866711228291768e-05, + "loss": 0.7292, + "step": 1747 + }, + { + "epoch": 0.17854954034729315, + "grad_norm": 1.3935445277579703, + "learning_rate": 1.8865180968080595e-05, + "loss": 0.7232, + "step": 1748 + }, + { + "epoch": 0.17865168539325843, + "grad_norm": 1.6799115300525853, + "learning_rate": 1.886364973756964e-05, + "loss": 0.8039, + "step": 1749 + }, + { + "epoch": 0.1787538304392237, + "grad_norm": 1.5013111853513779, + "learning_rate": 1.8862117536926498e-05, + "loss": 0.7039, + "step": 1750 + }, + { + "epoch": 0.17885597548518897, + "grad_norm": 1.527134971761119, + "learning_rate": 1.886058436631886e-05, + "loss": 0.8871, + "step": 1751 + }, + { + "epoch": 0.17895812053115423, + "grad_norm": 1.5475602842355707, + "learning_rate": 1.885905022591454e-05, + "loss": 0.7291, + "step": 1752 + }, + { + "epoch": 0.17906026557711952, + "grad_norm": 1.5281695675969766, + "learning_rate": 1.8857515115881447e-05, + "loss": 0.6369, + "step": 1753 + }, + { + "epoch": 0.17916241062308477, + "grad_norm": 1.3615608515268662, + "learning_rate": 1.8855979036387607e-05, + "loss": 0.7154, + "step": 1754 + }, + { + "epoch": 0.17926455566905006, + "grad_norm": 1.4776607990469777, + "learning_rate": 1.8854441987601137e-05, + "loss": 0.7451, + "step": 1755 + }, + { + "epoch": 0.17936670071501531, + "grad_norm": 1.464285683725378, + "learning_rate": 1.885290396969027e-05, + "loss": 0.7573, + "step": 1756 + }, + { + "epoch": 0.1794688457609806, + "grad_norm": 1.6051134360793224, + "learning_rate": 1.8851364982823342e-05, + "loss": 0.7426, + "step": 1757 + }, + { + "epoch": 0.17957099080694586, + "grad_norm": 1.5387703770992684, + "learning_rate": 1.8849825027168804e-05, + "loss": 0.8334, + "step": 1758 + }, + { + "epoch": 0.17967313585291114, + "grad_norm": 1.549289613041583, + "learning_rate": 1.8848284102895194e-05, + "loss": 0.7647, + "step": 1759 + }, + { + "epoch": 0.1797752808988764, + "grad_norm": 1.3342260548054745, + "learning_rate": 1.8846742210171177e-05, + "loss": 0.6078, + "step": 1760 + }, + { + "epoch": 0.17987742594484168, + "grad_norm": 1.5539474255802859, + "learning_rate": 1.8845199349165505e-05, + "loss": 0.7049, + "step": 1761 + }, + { + "epoch": 0.17997957099080694, + "grad_norm": 1.5160313403182686, + "learning_rate": 1.884365552004705e-05, + "loss": 0.7299, + "step": 1762 + }, + { + "epoch": 0.18008171603677223, + "grad_norm": 1.4075691925509088, + "learning_rate": 1.8842110722984787e-05, + "loss": 0.7894, + "step": 1763 + }, + { + "epoch": 0.18018386108273748, + "grad_norm": 1.6545597835502586, + "learning_rate": 1.884056495814779e-05, + "loss": 0.768, + "step": 1764 + }, + { + "epoch": 0.18028600612870277, + "grad_norm": 1.3552097667371947, + "learning_rate": 1.8839018225705247e-05, + "loss": 0.7508, + "step": 1765 + }, + { + "epoch": 0.18038815117466803, + "grad_norm": 1.6976829293541074, + "learning_rate": 1.883747052582645e-05, + "loss": 0.7671, + "step": 1766 + }, + { + "epoch": 0.1804902962206333, + "grad_norm": 1.4390663098752343, + "learning_rate": 1.8835921858680793e-05, + "loss": 0.9087, + "step": 1767 + }, + { + "epoch": 0.18059244126659857, + "grad_norm": 1.5926655514771286, + "learning_rate": 1.8834372224437782e-05, + "loss": 0.9643, + "step": 1768 + }, + { + "epoch": 0.18069458631256385, + "grad_norm": 1.5124824287204868, + "learning_rate": 1.883282162326702e-05, + "loss": 0.7677, + "step": 1769 + }, + { + "epoch": 0.1807967313585291, + "grad_norm": 1.7516725496348795, + "learning_rate": 1.8831270055338223e-05, + "loss": 0.8012, + "step": 1770 + }, + { + "epoch": 0.18089887640449437, + "grad_norm": 1.5253110669517245, + "learning_rate": 1.8829717520821217e-05, + "loss": 0.8358, + "step": 1771 + }, + { + "epoch": 0.18100102145045965, + "grad_norm": 1.5417836281123047, + "learning_rate": 1.8828164019885923e-05, + "loss": 0.7064, + "step": 1772 + }, + { + "epoch": 0.1811031664964249, + "grad_norm": 1.497711602314893, + "learning_rate": 1.8826609552702373e-05, + "loss": 0.8343, + "step": 1773 + }, + { + "epoch": 0.1812053115423902, + "grad_norm": 1.3855986453864657, + "learning_rate": 1.8825054119440707e-05, + "loss": 0.7801, + "step": 1774 + }, + { + "epoch": 0.18130745658835545, + "grad_norm": 1.5148514645104005, + "learning_rate": 1.8823497720271162e-05, + "loss": 0.7157, + "step": 1775 + }, + { + "epoch": 0.18140960163432074, + "grad_norm": 1.6466224709714379, + "learning_rate": 1.8821940355364094e-05, + "loss": 0.7871, + "step": 1776 + }, + { + "epoch": 0.181511746680286, + "grad_norm": 1.459122073675351, + "learning_rate": 1.882038202488995e-05, + "loss": 0.8131, + "step": 1777 + }, + { + "epoch": 0.18161389172625128, + "grad_norm": 1.478351886327625, + "learning_rate": 1.8818822729019296e-05, + "loss": 0.8378, + "step": 1778 + }, + { + "epoch": 0.18171603677221654, + "grad_norm": 1.4776996874510193, + "learning_rate": 1.88172624679228e-05, + "loss": 0.6862, + "step": 1779 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 1.4358298192796701, + "learning_rate": 1.8815701241771226e-05, + "loss": 0.7026, + "step": 1780 + }, + { + "epoch": 0.18192032686414708, + "grad_norm": 1.424238215107601, + "learning_rate": 1.8814139050735458e-05, + "loss": 0.7794, + "step": 1781 + }, + { + "epoch": 0.18202247191011237, + "grad_norm": 1.5530215080498537, + "learning_rate": 1.8812575894986476e-05, + "loss": 0.7813, + "step": 1782 + }, + { + "epoch": 0.18212461695607762, + "grad_norm": 1.3936397898948503, + "learning_rate": 1.8811011774695368e-05, + "loss": 0.7901, + "step": 1783 + }, + { + "epoch": 0.1822267620020429, + "grad_norm": 1.5462288078001551, + "learning_rate": 1.880944669003333e-05, + "loss": 0.8313, + "step": 1784 + }, + { + "epoch": 0.18232890704800817, + "grad_norm": 1.3908833098633122, + "learning_rate": 1.8807880641171658e-05, + "loss": 0.6667, + "step": 1785 + }, + { + "epoch": 0.18243105209397345, + "grad_norm": 1.5046107400429856, + "learning_rate": 1.880631362828176e-05, + "loss": 0.7726, + "step": 1786 + }, + { + "epoch": 0.1825331971399387, + "grad_norm": 1.481318713359148, + "learning_rate": 1.8804745651535147e-05, + "loss": 0.7834, + "step": 1787 + }, + { + "epoch": 0.182635342185904, + "grad_norm": 1.6737020184541211, + "learning_rate": 1.8803176711103432e-05, + "loss": 0.8335, + "step": 1788 + }, + { + "epoch": 0.18273748723186925, + "grad_norm": 1.4675559585919482, + "learning_rate": 1.8801606807158342e-05, + "loss": 0.7547, + "step": 1789 + }, + { + "epoch": 0.18283963227783454, + "grad_norm": 1.5216604232340878, + "learning_rate": 1.8800035939871697e-05, + "loss": 0.7888, + "step": 1790 + }, + { + "epoch": 0.1829417773237998, + "grad_norm": 1.4887658648595692, + "learning_rate": 1.879846410941543e-05, + "loss": 0.6972, + "step": 1791 + }, + { + "epoch": 0.18304392236976508, + "grad_norm": 1.617558762147985, + "learning_rate": 1.879689131596159e-05, + "loss": 0.7916, + "step": 1792 + }, + { + "epoch": 0.18314606741573033, + "grad_norm": 1.3399225389184626, + "learning_rate": 1.8795317559682305e-05, + "loss": 0.7203, + "step": 1793 + }, + { + "epoch": 0.18324821246169562, + "grad_norm": 1.6207718185361897, + "learning_rate": 1.879374284074983e-05, + "loss": 1.0082, + "step": 1794 + }, + { + "epoch": 0.18335035750766088, + "grad_norm": 1.6288233394742437, + "learning_rate": 1.8792167159336526e-05, + "loss": 0.7659, + "step": 1795 + }, + { + "epoch": 0.18345250255362616, + "grad_norm": 1.4509509474637101, + "learning_rate": 1.8790590515614842e-05, + "loss": 0.707, + "step": 1796 + }, + { + "epoch": 0.18355464759959142, + "grad_norm": 1.383497665963032, + "learning_rate": 1.878901290975735e-05, + "loss": 0.7846, + "step": 1797 + }, + { + "epoch": 0.18365679264555668, + "grad_norm": 1.4421538897054473, + "learning_rate": 1.878743434193671e-05, + "loss": 0.6717, + "step": 1798 + }, + { + "epoch": 0.18375893769152196, + "grad_norm": 1.383079349080227, + "learning_rate": 1.8785854812325704e-05, + "loss": 0.6825, + "step": 1799 + }, + { + "epoch": 0.18386108273748722, + "grad_norm": 1.5161614744753042, + "learning_rate": 1.878427432109722e-05, + "loss": 0.7663, + "step": 1800 + }, + { + "epoch": 0.1839632277834525, + "grad_norm": 1.6012388046505472, + "learning_rate": 1.878269286842423e-05, + "loss": 0.8353, + "step": 1801 + }, + { + "epoch": 0.18406537282941776, + "grad_norm": 1.5088291014967619, + "learning_rate": 1.8781110454479834e-05, + "loss": 0.7301, + "step": 1802 + }, + { + "epoch": 0.18416751787538305, + "grad_norm": 1.5049724087621288, + "learning_rate": 1.8779527079437226e-05, + "loss": 0.7403, + "step": 1803 + }, + { + "epoch": 0.1842696629213483, + "grad_norm": 1.6220299596196548, + "learning_rate": 1.8777942743469705e-05, + "loss": 0.7866, + "step": 1804 + }, + { + "epoch": 0.1843718079673136, + "grad_norm": 1.500484583070211, + "learning_rate": 1.877635744675068e-05, + "loss": 0.7101, + "step": 1805 + }, + { + "epoch": 0.18447395301327885, + "grad_norm": 1.5410474319740113, + "learning_rate": 1.877477118945366e-05, + "loss": 0.732, + "step": 1806 + }, + { + "epoch": 0.18457609805924413, + "grad_norm": 1.5232889682497255, + "learning_rate": 1.8773183971752266e-05, + "loss": 0.7835, + "step": 1807 + }, + { + "epoch": 0.1846782431052094, + "grad_norm": 1.4978308793429884, + "learning_rate": 1.8771595793820218e-05, + "loss": 0.8195, + "step": 1808 + }, + { + "epoch": 0.18478038815117467, + "grad_norm": 1.5009145033963818, + "learning_rate": 1.8770006655831344e-05, + "loss": 0.8184, + "step": 1809 + }, + { + "epoch": 0.18488253319713993, + "grad_norm": 1.345350146097316, + "learning_rate": 1.8768416557959578e-05, + "loss": 0.666, + "step": 1810 + }, + { + "epoch": 0.18498467824310522, + "grad_norm": 1.4638958631785517, + "learning_rate": 1.876682550037895e-05, + "loss": 0.6191, + "step": 1811 + }, + { + "epoch": 0.18508682328907047, + "grad_norm": 1.441706471078032, + "learning_rate": 1.876523348326361e-05, + "loss": 0.8151, + "step": 1812 + }, + { + "epoch": 0.18518896833503576, + "grad_norm": 1.483149396013851, + "learning_rate": 1.8763640506787795e-05, + "loss": 0.7702, + "step": 1813 + }, + { + "epoch": 0.18529111338100102, + "grad_norm": 1.4107501271156073, + "learning_rate": 1.8762046571125873e-05, + "loss": 0.7867, + "step": 1814 + }, + { + "epoch": 0.1853932584269663, + "grad_norm": 1.4906393281742727, + "learning_rate": 1.876045167645229e-05, + "loss": 0.6602, + "step": 1815 + }, + { + "epoch": 0.18549540347293156, + "grad_norm": 1.3885181269134927, + "learning_rate": 1.8758855822941614e-05, + "loss": 0.84, + "step": 1816 + }, + { + "epoch": 0.18559754851889684, + "grad_norm": 1.6826654665163017, + "learning_rate": 1.87572590107685e-05, + "loss": 0.7574, + "step": 1817 + }, + { + "epoch": 0.1856996935648621, + "grad_norm": 1.6679059396292548, + "learning_rate": 1.875566124010774e-05, + "loss": 0.7335, + "step": 1818 + }, + { + "epoch": 0.18580183861082739, + "grad_norm": 1.5083446964707001, + "learning_rate": 1.8754062511134195e-05, + "loss": 0.7384, + "step": 1819 + }, + { + "epoch": 0.18590398365679264, + "grad_norm": 1.4766639320775647, + "learning_rate": 1.8752462824022853e-05, + "loss": 0.8127, + "step": 1820 + }, + { + "epoch": 0.18600612870275793, + "grad_norm": 1.3793029796210652, + "learning_rate": 1.87508621789488e-05, + "loss": 0.7941, + "step": 1821 + }, + { + "epoch": 0.18610827374872319, + "grad_norm": 1.5755246060005315, + "learning_rate": 1.8749260576087227e-05, + "loss": 0.8592, + "step": 1822 + }, + { + "epoch": 0.18621041879468847, + "grad_norm": 1.3192273290851888, + "learning_rate": 1.874765801561343e-05, + "loss": 0.6727, + "step": 1823 + }, + { + "epoch": 0.18631256384065373, + "grad_norm": 1.3903652546849665, + "learning_rate": 1.874605449770282e-05, + "loss": 0.7759, + "step": 1824 + }, + { + "epoch": 0.186414708886619, + "grad_norm": 1.6034452935072083, + "learning_rate": 1.8744450022530883e-05, + "loss": 0.6549, + "step": 1825 + }, + { + "epoch": 0.18651685393258427, + "grad_norm": 1.4175914052444982, + "learning_rate": 1.874284459027325e-05, + "loss": 0.8694, + "step": 1826 + }, + { + "epoch": 0.18661899897854953, + "grad_norm": 1.5122645635509433, + "learning_rate": 1.8741238201105622e-05, + "loss": 0.732, + "step": 1827 + }, + { + "epoch": 0.1867211440245148, + "grad_norm": 1.6021854069114603, + "learning_rate": 1.8739630855203828e-05, + "loss": 0.8332, + "step": 1828 + }, + { + "epoch": 0.18682328907048007, + "grad_norm": 1.5840423856404344, + "learning_rate": 1.873802255274379e-05, + "loss": 0.7127, + "step": 1829 + }, + { + "epoch": 0.18692543411644535, + "grad_norm": 1.7424203973594155, + "learning_rate": 1.8736413293901542e-05, + "loss": 0.6892, + "step": 1830 + }, + { + "epoch": 0.1870275791624106, + "grad_norm": 1.4116257096844307, + "learning_rate": 1.8734803078853208e-05, + "loss": 0.6099, + "step": 1831 + }, + { + "epoch": 0.1871297242083759, + "grad_norm": 1.3619963609424839, + "learning_rate": 1.8733191907775036e-05, + "loss": 0.6665, + "step": 1832 + }, + { + "epoch": 0.18723186925434115, + "grad_norm": 1.486470907959484, + "learning_rate": 1.8731579780843368e-05, + "loss": 0.7367, + "step": 1833 + }, + { + "epoch": 0.18733401430030644, + "grad_norm": 1.6356446898867938, + "learning_rate": 1.8729966698234655e-05, + "loss": 0.7376, + "step": 1834 + }, + { + "epoch": 0.1874361593462717, + "grad_norm": 1.5358069162501742, + "learning_rate": 1.8728352660125438e-05, + "loss": 0.7308, + "step": 1835 + }, + { + "epoch": 0.18753830439223698, + "grad_norm": 1.5514393338682109, + "learning_rate": 1.872673766669239e-05, + "loss": 0.8184, + "step": 1836 + }, + { + "epoch": 0.18764044943820224, + "grad_norm": 1.5460397059521618, + "learning_rate": 1.8725121718112268e-05, + "loss": 0.8031, + "step": 1837 + }, + { + "epoch": 0.18774259448416752, + "grad_norm": 1.5090377119032092, + "learning_rate": 1.872350481456193e-05, + "loss": 0.8639, + "step": 1838 + }, + { + "epoch": 0.18784473953013278, + "grad_norm": 1.4686764996879726, + "learning_rate": 1.8721886956218357e-05, + "loss": 0.7653, + "step": 1839 + }, + { + "epoch": 0.18794688457609807, + "grad_norm": 1.5418366060267479, + "learning_rate": 1.8720268143258625e-05, + "loss": 0.7325, + "step": 1840 + }, + { + "epoch": 0.18804902962206332, + "grad_norm": 1.5963803873062974, + "learning_rate": 1.8718648375859905e-05, + "loss": 0.9121, + "step": 1841 + }, + { + "epoch": 0.1881511746680286, + "grad_norm": 1.3639099727070416, + "learning_rate": 1.8717027654199488e-05, + "loss": 0.7013, + "step": 1842 + }, + { + "epoch": 0.18825331971399387, + "grad_norm": 1.402047728140098, + "learning_rate": 1.871540597845477e-05, + "loss": 0.8189, + "step": 1843 + }, + { + "epoch": 0.18835546475995915, + "grad_norm": 1.6018790761428272, + "learning_rate": 1.8713783348803228e-05, + "loss": 0.8649, + "step": 1844 + }, + { + "epoch": 0.1884576098059244, + "grad_norm": 1.3914856267087141, + "learning_rate": 1.871215976542247e-05, + "loss": 0.6692, + "step": 1845 + }, + { + "epoch": 0.1885597548518897, + "grad_norm": 1.7299364332596034, + "learning_rate": 1.87105352284902e-05, + "loss": 0.7515, + "step": 1846 + }, + { + "epoch": 0.18866189989785495, + "grad_norm": 1.6872798683832921, + "learning_rate": 1.8708909738184217e-05, + "loss": 0.7718, + "step": 1847 + }, + { + "epoch": 0.18876404494382024, + "grad_norm": 1.6181316927710099, + "learning_rate": 1.870728329468244e-05, + "loss": 0.7306, + "step": 1848 + }, + { + "epoch": 0.1888661899897855, + "grad_norm": 1.6245973290106248, + "learning_rate": 1.870565589816288e-05, + "loss": 0.7498, + "step": 1849 + }, + { + "epoch": 0.18896833503575078, + "grad_norm": 1.3817131575806303, + "learning_rate": 1.870402754880366e-05, + "loss": 0.8759, + "step": 1850 + }, + { + "epoch": 0.18907048008171604, + "grad_norm": 1.535542191029484, + "learning_rate": 1.8702398246783e-05, + "loss": 0.7067, + "step": 1851 + }, + { + "epoch": 0.18917262512768132, + "grad_norm": 1.4381358358818255, + "learning_rate": 1.870076799227923e-05, + "loss": 0.7366, + "step": 1852 + }, + { + "epoch": 0.18927477017364658, + "grad_norm": 1.4800359993420105, + "learning_rate": 1.8699136785470776e-05, + "loss": 0.6912, + "step": 1853 + }, + { + "epoch": 0.18937691521961184, + "grad_norm": 1.3828234057439701, + "learning_rate": 1.8697504626536186e-05, + "loss": 0.8071, + "step": 1854 + }, + { + "epoch": 0.18947906026557712, + "grad_norm": 1.4314531742642738, + "learning_rate": 1.8695871515654094e-05, + "loss": 0.7988, + "step": 1855 + }, + { + "epoch": 0.18958120531154238, + "grad_norm": 1.5781007461499097, + "learning_rate": 1.8694237453003245e-05, + "loss": 0.8044, + "step": 1856 + }, + { + "epoch": 0.18968335035750766, + "grad_norm": 1.528131515904029, + "learning_rate": 1.8692602438762494e-05, + "loss": 0.801, + "step": 1857 + }, + { + "epoch": 0.18978549540347292, + "grad_norm": 1.493350536943422, + "learning_rate": 1.8690966473110786e-05, + "loss": 0.819, + "step": 1858 + }, + { + "epoch": 0.1898876404494382, + "grad_norm": 1.3690976206972525, + "learning_rate": 1.868932955622718e-05, + "loss": 0.6622, + "step": 1859 + }, + { + "epoch": 0.18998978549540346, + "grad_norm": 1.4450488388196274, + "learning_rate": 1.8687691688290844e-05, + "loss": 0.6883, + "step": 1860 + }, + { + "epoch": 0.19009193054136875, + "grad_norm": 1.455465197830054, + "learning_rate": 1.868605286948104e-05, + "loss": 0.6401, + "step": 1861 + }, + { + "epoch": 0.190194075587334, + "grad_norm": 1.3850561552178788, + "learning_rate": 1.8684413099977138e-05, + "loss": 0.7819, + "step": 1862 + }, + { + "epoch": 0.1902962206332993, + "grad_norm": 1.4758628906664164, + "learning_rate": 1.8682772379958607e-05, + "loss": 0.8318, + "step": 1863 + }, + { + "epoch": 0.19039836567926455, + "grad_norm": 1.3538602537036406, + "learning_rate": 1.868113070960503e-05, + "loss": 0.7179, + "step": 1864 + }, + { + "epoch": 0.19050051072522983, + "grad_norm": 1.5387815915942704, + "learning_rate": 1.8679488089096092e-05, + "loss": 0.8461, + "step": 1865 + }, + { + "epoch": 0.1906026557711951, + "grad_norm": 1.3976824294560608, + "learning_rate": 1.8677844518611575e-05, + "loss": 0.7678, + "step": 1866 + }, + { + "epoch": 0.19070480081716037, + "grad_norm": 1.5810223584642418, + "learning_rate": 1.8676199998331368e-05, + "loss": 0.6961, + "step": 1867 + }, + { + "epoch": 0.19080694586312563, + "grad_norm": 1.626365738288862, + "learning_rate": 1.8674554528435463e-05, + "loss": 0.7584, + "step": 1868 + }, + { + "epoch": 0.19090909090909092, + "grad_norm": 1.6077810178001672, + "learning_rate": 1.8672908109103963e-05, + "loss": 0.8119, + "step": 1869 + }, + { + "epoch": 0.19101123595505617, + "grad_norm": 1.6741140356003115, + "learning_rate": 1.8671260740517066e-05, + "loss": 0.7464, + "step": 1870 + }, + { + "epoch": 0.19111338100102146, + "grad_norm": 1.2776327613783214, + "learning_rate": 1.8669612422855078e-05, + "loss": 0.6256, + "step": 1871 + }, + { + "epoch": 0.19121552604698672, + "grad_norm": 1.4387947498518552, + "learning_rate": 1.866796315629841e-05, + "loss": 0.6996, + "step": 1872 + }, + { + "epoch": 0.191317671092952, + "grad_norm": 1.5078622680932743, + "learning_rate": 1.8666312941027574e-05, + "loss": 0.732, + "step": 1873 + }, + { + "epoch": 0.19141981613891726, + "grad_norm": 1.4129728956947285, + "learning_rate": 1.8664661777223186e-05, + "loss": 0.8086, + "step": 1874 + }, + { + "epoch": 0.19152196118488254, + "grad_norm": 1.4015431412158756, + "learning_rate": 1.866300966506597e-05, + "loss": 0.7117, + "step": 1875 + }, + { + "epoch": 0.1916241062308478, + "grad_norm": 1.7719634414037608, + "learning_rate": 1.866135660473675e-05, + "loss": 0.7304, + "step": 1876 + }, + { + "epoch": 0.1917262512768131, + "grad_norm": 1.6776031928414037, + "learning_rate": 1.8659702596416453e-05, + "loss": 0.7946, + "step": 1877 + }, + { + "epoch": 0.19182839632277834, + "grad_norm": 1.6375638804262858, + "learning_rate": 1.865804764028611e-05, + "loss": 0.7835, + "step": 1878 + }, + { + "epoch": 0.19193054136874363, + "grad_norm": 1.6179990019852686, + "learning_rate": 1.8656391736526866e-05, + "loss": 0.7849, + "step": 1879 + }, + { + "epoch": 0.1920326864147089, + "grad_norm": 1.460960367095404, + "learning_rate": 1.8654734885319944e-05, + "loss": 0.798, + "step": 1880 + }, + { + "epoch": 0.19213483146067414, + "grad_norm": 1.5828678069230913, + "learning_rate": 1.86530770868467e-05, + "loss": 0.81, + "step": 1881 + }, + { + "epoch": 0.19223697650663943, + "grad_norm": 1.5249023921072191, + "learning_rate": 1.865141834128858e-05, + "loss": 0.8151, + "step": 1882 + }, + { + "epoch": 0.19233912155260469, + "grad_norm": 1.4508863810355825, + "learning_rate": 1.8649758648827133e-05, + "loss": 0.7209, + "step": 1883 + }, + { + "epoch": 0.19244126659856997, + "grad_norm": 1.4243962873432259, + "learning_rate": 1.8648098009644012e-05, + "loss": 0.7746, + "step": 1884 + }, + { + "epoch": 0.19254341164453523, + "grad_norm": 1.4154742707954997, + "learning_rate": 1.864643642392098e-05, + "loss": 0.7587, + "step": 1885 + }, + { + "epoch": 0.1926455566905005, + "grad_norm": 1.482793213357357, + "learning_rate": 1.864477389183989e-05, + "loss": 0.7325, + "step": 1886 + }, + { + "epoch": 0.19274770173646577, + "grad_norm": 1.4006109574226546, + "learning_rate": 1.864311041358272e-05, + "loss": 0.7799, + "step": 1887 + }, + { + "epoch": 0.19284984678243106, + "grad_norm": 1.5423755699242454, + "learning_rate": 1.8641445989331525e-05, + "loss": 0.7521, + "step": 1888 + }, + { + "epoch": 0.1929519918283963, + "grad_norm": 1.5107425400423258, + "learning_rate": 1.8639780619268484e-05, + "loss": 0.6824, + "step": 1889 + }, + { + "epoch": 0.1930541368743616, + "grad_norm": 1.4022406563421497, + "learning_rate": 1.863811430357587e-05, + "loss": 0.7835, + "step": 1890 + }, + { + "epoch": 0.19315628192032686, + "grad_norm": 1.5402200953891934, + "learning_rate": 1.863644704243607e-05, + "loss": 0.6614, + "step": 1891 + }, + { + "epoch": 0.19325842696629214, + "grad_norm": 1.4577731384804729, + "learning_rate": 1.8634778836031558e-05, + "loss": 0.7888, + "step": 1892 + }, + { + "epoch": 0.1933605720122574, + "grad_norm": 1.588695888542344, + "learning_rate": 1.8633109684544924e-05, + "loss": 0.7575, + "step": 1893 + }, + { + "epoch": 0.19346271705822268, + "grad_norm": 1.3729918210137502, + "learning_rate": 1.8631439588158858e-05, + "loss": 0.6792, + "step": 1894 + }, + { + "epoch": 0.19356486210418794, + "grad_norm": 1.411363842431786, + "learning_rate": 1.862976854705615e-05, + "loss": 0.6889, + "step": 1895 + }, + { + "epoch": 0.19366700715015323, + "grad_norm": 1.6070516218488293, + "learning_rate": 1.8628096561419703e-05, + "loss": 0.8046, + "step": 1896 + }, + { + "epoch": 0.19376915219611848, + "grad_norm": 1.4750123144322365, + "learning_rate": 1.862642363143251e-05, + "loss": 0.7965, + "step": 1897 + }, + { + "epoch": 0.19387129724208377, + "grad_norm": 1.5009741606429343, + "learning_rate": 1.862474975727768e-05, + "loss": 0.7776, + "step": 1898 + }, + { + "epoch": 0.19397344228804902, + "grad_norm": 1.5994998080441714, + "learning_rate": 1.8623074939138412e-05, + "loss": 0.831, + "step": 1899 + }, + { + "epoch": 0.1940755873340143, + "grad_norm": 1.424427887050029, + "learning_rate": 1.8621399177198024e-05, + "loss": 0.6619, + "step": 1900 + }, + { + "epoch": 0.19417773237997957, + "grad_norm": 1.3860322049818534, + "learning_rate": 1.8619722471639926e-05, + "loss": 0.7187, + "step": 1901 + }, + { + "epoch": 0.19427987742594485, + "grad_norm": 1.4118451742368041, + "learning_rate": 1.8618044822647632e-05, + "loss": 0.8235, + "step": 1902 + }, + { + "epoch": 0.1943820224719101, + "grad_norm": 1.51959420797935, + "learning_rate": 1.8616366230404766e-05, + "loss": 0.8163, + "step": 1903 + }, + { + "epoch": 0.1944841675178754, + "grad_norm": 1.582372673945881, + "learning_rate": 1.861468669509505e-05, + "loss": 0.8239, + "step": 1904 + }, + { + "epoch": 0.19458631256384065, + "grad_norm": 1.593714968221231, + "learning_rate": 1.8613006216902308e-05, + "loss": 0.8247, + "step": 1905 + }, + { + "epoch": 0.19468845760980594, + "grad_norm": 1.473919090855708, + "learning_rate": 1.861132479601047e-05, + "loss": 0.7108, + "step": 1906 + }, + { + "epoch": 0.1947906026557712, + "grad_norm": 1.4756489829032506, + "learning_rate": 1.8609642432603572e-05, + "loss": 0.6867, + "step": 1907 + }, + { + "epoch": 0.19489274770173648, + "grad_norm": 1.4242614829739424, + "learning_rate": 1.8607959126865745e-05, + "loss": 0.797, + "step": 1908 + }, + { + "epoch": 0.19499489274770174, + "grad_norm": 1.540765670424351, + "learning_rate": 1.860627487898123e-05, + "loss": 0.7588, + "step": 1909 + }, + { + "epoch": 0.195097037793667, + "grad_norm": 1.3859286822537837, + "learning_rate": 1.8604589689134372e-05, + "loss": 0.6985, + "step": 1910 + }, + { + "epoch": 0.19519918283963228, + "grad_norm": 1.4748488944200038, + "learning_rate": 1.8602903557509614e-05, + "loss": 0.7315, + "step": 1911 + }, + { + "epoch": 0.19530132788559754, + "grad_norm": 1.572858652310003, + "learning_rate": 1.86012164842915e-05, + "loss": 0.8717, + "step": 1912 + }, + { + "epoch": 0.19540347293156282, + "grad_norm": 1.4037941666807017, + "learning_rate": 1.8599528469664686e-05, + "loss": 0.753, + "step": 1913 + }, + { + "epoch": 0.19550561797752808, + "grad_norm": 1.3429018085967726, + "learning_rate": 1.8597839513813923e-05, + "loss": 0.7411, + "step": 1914 + }, + { + "epoch": 0.19560776302349336, + "grad_norm": 1.4246578833523384, + "learning_rate": 1.8596149616924074e-05, + "loss": 0.6829, + "step": 1915 + }, + { + "epoch": 0.19570990806945862, + "grad_norm": 1.4512162991090363, + "learning_rate": 1.859445877918009e-05, + "loss": 0.8141, + "step": 1916 + }, + { + "epoch": 0.1958120531154239, + "grad_norm": 1.3049286636486919, + "learning_rate": 1.8592767000767045e-05, + "loss": 0.7225, + "step": 1917 + }, + { + "epoch": 0.19591419816138916, + "grad_norm": 1.5067016238727486, + "learning_rate": 1.85910742818701e-05, + "loss": 0.7098, + "step": 1918 + }, + { + "epoch": 0.19601634320735445, + "grad_norm": 1.3232204956764402, + "learning_rate": 1.858938062267452e-05, + "loss": 0.7405, + "step": 1919 + }, + { + "epoch": 0.1961184882533197, + "grad_norm": 1.6134026411615998, + "learning_rate": 1.8587686023365685e-05, + "loss": 0.8508, + "step": 1920 + }, + { + "epoch": 0.196220633299285, + "grad_norm": 1.4218705507206422, + "learning_rate": 1.8585990484129068e-05, + "loss": 0.6873, + "step": 1921 + }, + { + "epoch": 0.19632277834525025, + "grad_norm": 1.3306354822203843, + "learning_rate": 1.858429400515024e-05, + "loss": 0.7596, + "step": 1922 + }, + { + "epoch": 0.19642492339121553, + "grad_norm": 1.5564872715920985, + "learning_rate": 1.858259658661489e-05, + "loss": 0.7861, + "step": 1923 + }, + { + "epoch": 0.1965270684371808, + "grad_norm": 1.3899564337708528, + "learning_rate": 1.85808982287088e-05, + "loss": 0.6688, + "step": 1924 + }, + { + "epoch": 0.19662921348314608, + "grad_norm": 1.4797554434541693, + "learning_rate": 1.857919893161785e-05, + "loss": 0.7476, + "step": 1925 + }, + { + "epoch": 0.19673135852911133, + "grad_norm": 1.4798779955283625, + "learning_rate": 1.857749869552804e-05, + "loss": 0.6764, + "step": 1926 + }, + { + "epoch": 0.19683350357507662, + "grad_norm": 1.4943403560117658, + "learning_rate": 1.8575797520625455e-05, + "loss": 0.8671, + "step": 1927 + }, + { + "epoch": 0.19693564862104188, + "grad_norm": 1.5565047182305098, + "learning_rate": 1.8574095407096288e-05, + "loss": 0.8176, + "step": 1928 + }, + { + "epoch": 0.19703779366700716, + "grad_norm": 1.2468424023893614, + "learning_rate": 1.857239235512684e-05, + "loss": 0.6553, + "step": 1929 + }, + { + "epoch": 0.19713993871297242, + "grad_norm": 1.496461421038721, + "learning_rate": 1.857068836490351e-05, + "loss": 0.85, + "step": 1930 + }, + { + "epoch": 0.1972420837589377, + "grad_norm": 1.471542687876823, + "learning_rate": 1.85689834366128e-05, + "loss": 0.7993, + "step": 1931 + }, + { + "epoch": 0.19734422880490296, + "grad_norm": 1.5698689898477731, + "learning_rate": 1.856727757044132e-05, + "loss": 0.8024, + "step": 1932 + }, + { + "epoch": 0.19744637385086825, + "grad_norm": 1.3410885483553978, + "learning_rate": 1.8565570766575773e-05, + "loss": 0.7714, + "step": 1933 + }, + { + "epoch": 0.1975485188968335, + "grad_norm": 1.4115390443411366, + "learning_rate": 1.856386302520297e-05, + "loss": 0.8079, + "step": 1934 + }, + { + "epoch": 0.1976506639427988, + "grad_norm": 1.4288246439813557, + "learning_rate": 1.8562154346509826e-05, + "loss": 0.8133, + "step": 1935 + }, + { + "epoch": 0.19775280898876405, + "grad_norm": 1.4887123448310347, + "learning_rate": 1.856044473068336e-05, + "loss": 0.6497, + "step": 1936 + }, + { + "epoch": 0.1978549540347293, + "grad_norm": 1.5025613321783005, + "learning_rate": 1.8558734177910684e-05, + "loss": 0.7289, + "step": 1937 + }, + { + "epoch": 0.1979570990806946, + "grad_norm": 1.5937089680265561, + "learning_rate": 1.8557022688379027e-05, + "loss": 0.7769, + "step": 1938 + }, + { + "epoch": 0.19805924412665984, + "grad_norm": 1.5331409208630988, + "learning_rate": 1.8555310262275704e-05, + "loss": 0.8643, + "step": 1939 + }, + { + "epoch": 0.19816138917262513, + "grad_norm": 1.4937707428868934, + "learning_rate": 1.8553596899788145e-05, + "loss": 0.7264, + "step": 1940 + }, + { + "epoch": 0.1982635342185904, + "grad_norm": 1.501415343683645, + "learning_rate": 1.8551882601103882e-05, + "loss": 0.6718, + "step": 1941 + }, + { + "epoch": 0.19836567926455567, + "grad_norm": 1.3535263529173234, + "learning_rate": 1.8550167366410543e-05, + "loss": 0.7615, + "step": 1942 + }, + { + "epoch": 0.19846782431052093, + "grad_norm": 1.5345621231029483, + "learning_rate": 1.854845119589586e-05, + "loss": 0.6778, + "step": 1943 + }, + { + "epoch": 0.19856996935648621, + "grad_norm": 1.527429128809299, + "learning_rate": 1.854673408974767e-05, + "loss": 0.7269, + "step": 1944 + }, + { + "epoch": 0.19867211440245147, + "grad_norm": 1.5286100575086587, + "learning_rate": 1.8545016048153918e-05, + "loss": 0.7642, + "step": 1945 + }, + { + "epoch": 0.19877425944841676, + "grad_norm": 1.5833916586056414, + "learning_rate": 1.854329707130263e-05, + "loss": 0.7759, + "step": 1946 + }, + { + "epoch": 0.19887640449438201, + "grad_norm": 1.5565213204671897, + "learning_rate": 1.8541577159381964e-05, + "loss": 0.6472, + "step": 1947 + }, + { + "epoch": 0.1989785495403473, + "grad_norm": 1.3948486246636957, + "learning_rate": 1.8539856312580157e-05, + "loss": 0.7066, + "step": 1948 + }, + { + "epoch": 0.19908069458631256, + "grad_norm": 1.5923729631280517, + "learning_rate": 1.853813453108556e-05, + "loss": 0.7033, + "step": 1949 + }, + { + "epoch": 0.19918283963227784, + "grad_norm": 1.5486725694228305, + "learning_rate": 1.853641181508662e-05, + "loss": 0.8215, + "step": 1950 + }, + { + "epoch": 0.1992849846782431, + "grad_norm": 1.494804139996211, + "learning_rate": 1.8534688164771894e-05, + "loss": 0.7935, + "step": 1951 + }, + { + "epoch": 0.19938712972420838, + "grad_norm": 1.4768290753090216, + "learning_rate": 1.8532963580330034e-05, + "loss": 0.7194, + "step": 1952 + }, + { + "epoch": 0.19948927477017364, + "grad_norm": 1.308830779646724, + "learning_rate": 1.8531238061949795e-05, + "loss": 0.6173, + "step": 1953 + }, + { + "epoch": 0.19959141981613893, + "grad_norm": 2.628761458649275, + "learning_rate": 1.8529511609820038e-05, + "loss": 0.6455, + "step": 1954 + }, + { + "epoch": 0.19969356486210418, + "grad_norm": 1.5133101579122967, + "learning_rate": 1.8527784224129724e-05, + "loss": 0.8339, + "step": 1955 + }, + { + "epoch": 0.19979570990806947, + "grad_norm": 1.4048530607589202, + "learning_rate": 1.852605590506792e-05, + "loss": 0.7987, + "step": 1956 + }, + { + "epoch": 0.19989785495403473, + "grad_norm": 1.39960794402212, + "learning_rate": 1.852432665282379e-05, + "loss": 0.7656, + "step": 1957 + }, + { + "epoch": 0.2, + "grad_norm": 1.6460628434217563, + "learning_rate": 1.85225964675866e-05, + "loss": 0.7192, + "step": 1958 + }, + { + "epoch": 0.20010214504596527, + "grad_norm": 1.6355988495570337, + "learning_rate": 1.852086534954572e-05, + "loss": 0.7113, + "step": 1959 + }, + { + "epoch": 0.20020429009193055, + "grad_norm": 1.3818621921642458, + "learning_rate": 1.8519133298890616e-05, + "loss": 0.6766, + "step": 1960 + }, + { + "epoch": 0.2003064351378958, + "grad_norm": 1.4465293767329117, + "learning_rate": 1.8517400315810878e-05, + "loss": 0.8144, + "step": 1961 + }, + { + "epoch": 0.2004085801838611, + "grad_norm": 1.5246992590946782, + "learning_rate": 1.8515666400496164e-05, + "loss": 0.6801, + "step": 1962 + }, + { + "epoch": 0.20051072522982635, + "grad_norm": 1.4651214133224106, + "learning_rate": 1.8513931553136263e-05, + "loss": 0.7262, + "step": 1963 + }, + { + "epoch": 0.2006128702757916, + "grad_norm": 1.5502974719813756, + "learning_rate": 1.8512195773921056e-05, + "loss": 0.74, + "step": 1964 + }, + { + "epoch": 0.2007150153217569, + "grad_norm": 1.4982588487837019, + "learning_rate": 1.851045906304052e-05, + "loss": 0.761, + "step": 1965 + }, + { + "epoch": 0.20081716036772215, + "grad_norm": 1.5258121740907602, + "learning_rate": 1.8508721420684743e-05, + "loss": 0.7761, + "step": 1966 + }, + { + "epoch": 0.20091930541368744, + "grad_norm": 1.5314986028424151, + "learning_rate": 1.8506982847043906e-05, + "loss": 0.7799, + "step": 1967 + }, + { + "epoch": 0.2010214504596527, + "grad_norm": 1.49753058932652, + "learning_rate": 1.8505243342308302e-05, + "loss": 0.7539, + "step": 1968 + }, + { + "epoch": 0.20112359550561798, + "grad_norm": 1.5102195609725788, + "learning_rate": 1.8503502906668318e-05, + "loss": 0.813, + "step": 1969 + }, + { + "epoch": 0.20122574055158324, + "grad_norm": 1.4294148616171312, + "learning_rate": 1.850176154031445e-05, + "loss": 0.8452, + "step": 1970 + }, + { + "epoch": 0.20132788559754852, + "grad_norm": 1.6134728165619554, + "learning_rate": 1.8500019243437287e-05, + "loss": 0.7918, + "step": 1971 + }, + { + "epoch": 0.20143003064351378, + "grad_norm": 1.4888672023968599, + "learning_rate": 1.8498276016227525e-05, + "loss": 0.6697, + "step": 1972 + }, + { + "epoch": 0.20153217568947907, + "grad_norm": 1.4067934388596872, + "learning_rate": 1.849653185887596e-05, + "loss": 0.791, + "step": 1973 + }, + { + "epoch": 0.20163432073544432, + "grad_norm": 1.5579291710654195, + "learning_rate": 1.8494786771573495e-05, + "loss": 0.7772, + "step": 1974 + }, + { + "epoch": 0.2017364657814096, + "grad_norm": 1.493381037018715, + "learning_rate": 1.8493040754511132e-05, + "loss": 0.8466, + "step": 1975 + }, + { + "epoch": 0.20183861082737486, + "grad_norm": 1.5590402182649785, + "learning_rate": 1.8491293807879972e-05, + "loss": 0.7136, + "step": 1976 + }, + { + "epoch": 0.20194075587334015, + "grad_norm": 1.4019830053847468, + "learning_rate": 1.8489545931871214e-05, + "loss": 0.6857, + "step": 1977 + }, + { + "epoch": 0.2020429009193054, + "grad_norm": 1.4784031950862793, + "learning_rate": 1.8487797126676174e-05, + "loss": 0.693, + "step": 1978 + }, + { + "epoch": 0.2021450459652707, + "grad_norm": 1.4839370120001454, + "learning_rate": 1.848604739248625e-05, + "loss": 0.7291, + "step": 1979 + }, + { + "epoch": 0.20224719101123595, + "grad_norm": 1.6531613286945974, + "learning_rate": 1.8484296729492958e-05, + "loss": 0.7761, + "step": 1980 + }, + { + "epoch": 0.20234933605720123, + "grad_norm": 1.4980778473229213, + "learning_rate": 1.8482545137887912e-05, + "loss": 0.7785, + "step": 1981 + }, + { + "epoch": 0.2024514811031665, + "grad_norm": 1.4518042801621294, + "learning_rate": 1.8480792617862816e-05, + "loss": 0.6263, + "step": 1982 + }, + { + "epoch": 0.20255362614913178, + "grad_norm": 1.485100487599264, + "learning_rate": 1.8479039169609488e-05, + "loss": 0.7862, + "step": 1983 + }, + { + "epoch": 0.20265577119509703, + "grad_norm": 1.4308328446326286, + "learning_rate": 1.8477284793319843e-05, + "loss": 0.7139, + "step": 1984 + }, + { + "epoch": 0.20275791624106232, + "grad_norm": 1.328213752396637, + "learning_rate": 1.8475529489185904e-05, + "loss": 0.6588, + "step": 1985 + }, + { + "epoch": 0.20286006128702758, + "grad_norm": 1.6186169013989817, + "learning_rate": 1.8473773257399786e-05, + "loss": 0.7592, + "step": 1986 + }, + { + "epoch": 0.20296220633299286, + "grad_norm": 1.617177365724882, + "learning_rate": 1.847201609815371e-05, + "loss": 0.8496, + "step": 1987 + }, + { + "epoch": 0.20306435137895812, + "grad_norm": 1.6298769061893112, + "learning_rate": 1.8470258011639998e-05, + "loss": 0.8767, + "step": 1988 + }, + { + "epoch": 0.2031664964249234, + "grad_norm": 1.5049975488487637, + "learning_rate": 1.846849899805107e-05, + "loss": 0.7352, + "step": 1989 + }, + { + "epoch": 0.20326864147088866, + "grad_norm": 1.5278642226644494, + "learning_rate": 1.846673905757946e-05, + "loss": 0.6461, + "step": 1990 + }, + { + "epoch": 0.20337078651685395, + "grad_norm": 1.6295635029207831, + "learning_rate": 1.846497819041779e-05, + "loss": 0.8611, + "step": 1991 + }, + { + "epoch": 0.2034729315628192, + "grad_norm": 1.377571698136443, + "learning_rate": 1.8463216396758788e-05, + "loss": 0.6533, + "step": 1992 + }, + { + "epoch": 0.20357507660878446, + "grad_norm": 1.572635835252953, + "learning_rate": 1.8461453676795284e-05, + "loss": 0.7948, + "step": 1993 + }, + { + "epoch": 0.20367722165474975, + "grad_norm": 1.8295475991324666, + "learning_rate": 1.8459690030720206e-05, + "loss": 0.9703, + "step": 1994 + }, + { + "epoch": 0.203779366700715, + "grad_norm": 1.5020685263382414, + "learning_rate": 1.845792545872659e-05, + "loss": 0.7606, + "step": 1995 + }, + { + "epoch": 0.2038815117466803, + "grad_norm": 1.4870910578830265, + "learning_rate": 1.845615996100757e-05, + "loss": 0.7735, + "step": 1996 + }, + { + "epoch": 0.20398365679264555, + "grad_norm": 1.6040578783438644, + "learning_rate": 1.845439353775637e-05, + "loss": 0.8046, + "step": 1997 + }, + { + "epoch": 0.20408580183861083, + "grad_norm": 1.4108683605792747, + "learning_rate": 1.8452626189166345e-05, + "loss": 0.8333, + "step": 1998 + }, + { + "epoch": 0.2041879468845761, + "grad_norm": 1.5799295583486759, + "learning_rate": 1.8450857915430918e-05, + "loss": 0.8773, + "step": 1999 + }, + { + "epoch": 0.20429009193054137, + "grad_norm": 1.5751828695429502, + "learning_rate": 1.8449088716743633e-05, + "loss": 0.7924, + "step": 2000 + }, + { + "epoch": 0.20439223697650663, + "grad_norm": 1.5645695377719366, + "learning_rate": 1.844731859329813e-05, + "loss": 0.6189, + "step": 2001 + }, + { + "epoch": 0.20449438202247192, + "grad_norm": 1.5015370914764101, + "learning_rate": 1.8445547545288146e-05, + "loss": 0.7861, + "step": 2002 + }, + { + "epoch": 0.20459652706843717, + "grad_norm": 1.3632761947355794, + "learning_rate": 1.844377557290753e-05, + "loss": 0.7927, + "step": 2003 + }, + { + "epoch": 0.20469867211440246, + "grad_norm": 1.5717071407541336, + "learning_rate": 1.8442002676350224e-05, + "loss": 0.8023, + "step": 2004 + }, + { + "epoch": 0.20480081716036772, + "grad_norm": 1.5153615319188158, + "learning_rate": 1.844022885581027e-05, + "loss": 0.6348, + "step": 2005 + }, + { + "epoch": 0.204902962206333, + "grad_norm": 1.5194370039645273, + "learning_rate": 1.8438454111481808e-05, + "loss": 0.7191, + "step": 2006 + }, + { + "epoch": 0.20500510725229826, + "grad_norm": 1.3552030344916781, + "learning_rate": 1.84366784435591e-05, + "loss": 0.6666, + "step": 2007 + }, + { + "epoch": 0.20510725229826354, + "grad_norm": 1.5506437213023405, + "learning_rate": 1.8434901852236482e-05, + "loss": 0.7843, + "step": 2008 + }, + { + "epoch": 0.2052093973442288, + "grad_norm": 1.4449150711497472, + "learning_rate": 1.8433124337708404e-05, + "loss": 0.8281, + "step": 2009 + }, + { + "epoch": 0.20531154239019409, + "grad_norm": 1.5864974878969003, + "learning_rate": 1.8431345900169422e-05, + "loss": 0.7422, + "step": 2010 + }, + { + "epoch": 0.20541368743615934, + "grad_norm": 1.4693047016163825, + "learning_rate": 1.8429566539814186e-05, + "loss": 0.7514, + "step": 2011 + }, + { + "epoch": 0.20551583248212463, + "grad_norm": 1.5056618554993821, + "learning_rate": 1.842778625683744e-05, + "loss": 0.7536, + "step": 2012 + }, + { + "epoch": 0.20561797752808988, + "grad_norm": 1.4416577657806737, + "learning_rate": 1.842600505143405e-05, + "loss": 0.6774, + "step": 2013 + }, + { + "epoch": 0.20572012257405517, + "grad_norm": 1.506422446381562, + "learning_rate": 1.8424222923798956e-05, + "loss": 0.7627, + "step": 2014 + }, + { + "epoch": 0.20582226762002043, + "grad_norm": 1.4433964433851796, + "learning_rate": 1.8422439874127226e-05, + "loss": 0.6599, + "step": 2015 + }, + { + "epoch": 0.2059244126659857, + "grad_norm": 1.580549495557907, + "learning_rate": 1.8420655902614007e-05, + "loss": 0.7836, + "step": 2016 + }, + { + "epoch": 0.20602655771195097, + "grad_norm": 1.4897306924484361, + "learning_rate": 1.841887100945456e-05, + "loss": 0.8188, + "step": 2017 + }, + { + "epoch": 0.20612870275791625, + "grad_norm": 1.5047856578520649, + "learning_rate": 1.841708519484424e-05, + "loss": 0.7897, + "step": 2018 + }, + { + "epoch": 0.2062308478038815, + "grad_norm": 1.5094717210603286, + "learning_rate": 1.841529845897851e-05, + "loss": 0.8066, + "step": 2019 + }, + { + "epoch": 0.20633299284984677, + "grad_norm": 1.5899988397134077, + "learning_rate": 1.8413510802052926e-05, + "loss": 0.8315, + "step": 2020 + }, + { + "epoch": 0.20643513789581205, + "grad_norm": 1.5544695230386947, + "learning_rate": 1.8411722224263152e-05, + "loss": 0.8818, + "step": 2021 + }, + { + "epoch": 0.2065372829417773, + "grad_norm": 1.4019880121267256, + "learning_rate": 1.840993272580494e-05, + "loss": 0.692, + "step": 2022 + }, + { + "epoch": 0.2066394279877426, + "grad_norm": 1.423634387031326, + "learning_rate": 1.840814230687416e-05, + "loss": 0.7227, + "step": 2023 + }, + { + "epoch": 0.20674157303370785, + "grad_norm": 1.487160879474208, + "learning_rate": 1.8406350967666774e-05, + "loss": 0.7468, + "step": 2024 + }, + { + "epoch": 0.20684371807967314, + "grad_norm": 1.3286581571720821, + "learning_rate": 1.8404558708378843e-05, + "loss": 0.8188, + "step": 2025 + }, + { + "epoch": 0.2069458631256384, + "grad_norm": 1.5704137611571891, + "learning_rate": 1.8402765529206528e-05, + "loss": 0.777, + "step": 2026 + }, + { + "epoch": 0.20704800817160368, + "grad_norm": 1.6179467516882442, + "learning_rate": 1.84009714303461e-05, + "loss": 0.8077, + "step": 2027 + }, + { + "epoch": 0.20715015321756894, + "grad_norm": 1.3004220069934909, + "learning_rate": 1.839917641199392e-05, + "loss": 0.6739, + "step": 2028 + }, + { + "epoch": 0.20725229826353422, + "grad_norm": 1.500408916975731, + "learning_rate": 1.8397380474346456e-05, + "loss": 0.7216, + "step": 2029 + }, + { + "epoch": 0.20735444330949948, + "grad_norm": 1.4209433267665676, + "learning_rate": 1.839558361760027e-05, + "loss": 0.7866, + "step": 2030 + }, + { + "epoch": 0.20745658835546477, + "grad_norm": 1.5818601032682016, + "learning_rate": 1.839378584195204e-05, + "loss": 0.7316, + "step": 2031 + }, + { + "epoch": 0.20755873340143002, + "grad_norm": 1.3869440640764208, + "learning_rate": 1.839198714759852e-05, + "loss": 0.7804, + "step": 2032 + }, + { + "epoch": 0.2076608784473953, + "grad_norm": 1.6204912852391895, + "learning_rate": 1.839018753473659e-05, + "loss": 0.6945, + "step": 2033 + }, + { + "epoch": 0.20776302349336057, + "grad_norm": 1.468773924920138, + "learning_rate": 1.838838700356321e-05, + "loss": 0.7717, + "step": 2034 + }, + { + "epoch": 0.20786516853932585, + "grad_norm": 1.4269646212277507, + "learning_rate": 1.838658555427545e-05, + "loss": 0.7997, + "step": 2035 + }, + { + "epoch": 0.2079673135852911, + "grad_norm": 1.3739776483999735, + "learning_rate": 1.8384783187070487e-05, + "loss": 0.7342, + "step": 2036 + }, + { + "epoch": 0.2080694586312564, + "grad_norm": 1.3562759997073126, + "learning_rate": 1.838297990214559e-05, + "loss": 0.7419, + "step": 2037 + }, + { + "epoch": 0.20817160367722165, + "grad_norm": 1.5417145194308144, + "learning_rate": 1.838117569969812e-05, + "loss": 0.7188, + "step": 2038 + }, + { + "epoch": 0.20827374872318694, + "grad_norm": 1.3101003277730476, + "learning_rate": 1.837937057992556e-05, + "loss": 0.7758, + "step": 2039 + }, + { + "epoch": 0.2083758937691522, + "grad_norm": 1.4675705583988556, + "learning_rate": 1.8377564543025474e-05, + "loss": 0.6654, + "step": 2040 + }, + { + "epoch": 0.20847803881511748, + "grad_norm": 1.7065416919413976, + "learning_rate": 1.8375757589195532e-05, + "loss": 0.817, + "step": 2041 + }, + { + "epoch": 0.20858018386108274, + "grad_norm": 1.4444808048060602, + "learning_rate": 1.8373949718633518e-05, + "loss": 0.7596, + "step": 2042 + }, + { + "epoch": 0.20868232890704802, + "grad_norm": 1.3794600152584189, + "learning_rate": 1.8372140931537294e-05, + "loss": 0.7312, + "step": 2043 + }, + { + "epoch": 0.20878447395301328, + "grad_norm": 1.571723845117633, + "learning_rate": 1.837033122810483e-05, + "loss": 0.8345, + "step": 2044 + }, + { + "epoch": 0.20888661899897856, + "grad_norm": 1.3139337186162048, + "learning_rate": 1.8368520608534216e-05, + "loss": 0.7829, + "step": 2045 + }, + { + "epoch": 0.20898876404494382, + "grad_norm": 1.4616660447801493, + "learning_rate": 1.8366709073023608e-05, + "loss": 0.7297, + "step": 2046 + }, + { + "epoch": 0.20909090909090908, + "grad_norm": 1.4744081581011423, + "learning_rate": 1.8364896621771287e-05, + "loss": 0.7169, + "step": 2047 + }, + { + "epoch": 0.20919305413687436, + "grad_norm": 1.510535600973433, + "learning_rate": 1.8363083254975627e-05, + "loss": 0.7373, + "step": 2048 + }, + { + "epoch": 0.20929519918283962, + "grad_norm": 1.4004000289526342, + "learning_rate": 1.83612689728351e-05, + "loss": 0.684, + "step": 2049 + }, + { + "epoch": 0.2093973442288049, + "grad_norm": 1.4307149467262503, + "learning_rate": 1.8359453775548283e-05, + "loss": 0.6795, + "step": 2050 + }, + { + "epoch": 0.20949948927477016, + "grad_norm": 1.7064406437417563, + "learning_rate": 1.8357637663313852e-05, + "loss": 0.7316, + "step": 2051 + }, + { + "epoch": 0.20960163432073545, + "grad_norm": 1.541461610898297, + "learning_rate": 1.8355820636330577e-05, + "loss": 0.793, + "step": 2052 + }, + { + "epoch": 0.2097037793667007, + "grad_norm": 1.5889879491363323, + "learning_rate": 1.835400269479733e-05, + "loss": 0.8224, + "step": 2053 + }, + { + "epoch": 0.209805924412666, + "grad_norm": 1.4265537252153697, + "learning_rate": 1.8352183838913097e-05, + "loss": 0.8203, + "step": 2054 + }, + { + "epoch": 0.20990806945863125, + "grad_norm": 1.6309985673485417, + "learning_rate": 1.8350364068876946e-05, + "loss": 0.6241, + "step": 2055 + }, + { + "epoch": 0.21001021450459653, + "grad_norm": 1.407549674262613, + "learning_rate": 1.834854338488805e-05, + "loss": 0.7287, + "step": 2056 + }, + { + "epoch": 0.2101123595505618, + "grad_norm": 1.413590711742134, + "learning_rate": 1.834672178714569e-05, + "loss": 0.8517, + "step": 2057 + }, + { + "epoch": 0.21021450459652707, + "grad_norm": 1.3353719348380286, + "learning_rate": 1.8344899275849237e-05, + "loss": 0.7017, + "step": 2058 + }, + { + "epoch": 0.21031664964249233, + "grad_norm": 1.604792385605004, + "learning_rate": 1.8343075851198163e-05, + "loss": 0.7329, + "step": 2059 + }, + { + "epoch": 0.21041879468845762, + "grad_norm": 1.3924637181215194, + "learning_rate": 1.8341251513392052e-05, + "loss": 0.6772, + "step": 2060 + }, + { + "epoch": 0.21052093973442287, + "grad_norm": 1.5911575409181304, + "learning_rate": 1.8339426262630572e-05, + "loss": 0.7236, + "step": 2061 + }, + { + "epoch": 0.21062308478038816, + "grad_norm": 1.4542117485798298, + "learning_rate": 1.8337600099113496e-05, + "loss": 0.6965, + "step": 2062 + }, + { + "epoch": 0.21072522982635342, + "grad_norm": 1.4101642710176907, + "learning_rate": 1.8335773023040703e-05, + "loss": 0.757, + "step": 2063 + }, + { + "epoch": 0.2108273748723187, + "grad_norm": 1.4393450520729103, + "learning_rate": 1.833394503461217e-05, + "loss": 0.7471, + "step": 2064 + }, + { + "epoch": 0.21092951991828396, + "grad_norm": 1.5948754727264873, + "learning_rate": 1.8332116134027972e-05, + "loss": 0.723, + "step": 2065 + }, + { + "epoch": 0.21103166496424924, + "grad_norm": 1.4668231794758515, + "learning_rate": 1.8330286321488273e-05, + "loss": 0.6852, + "step": 2066 + }, + { + "epoch": 0.2111338100102145, + "grad_norm": 1.51108979980516, + "learning_rate": 1.832845559719336e-05, + "loss": 0.7959, + "step": 2067 + }, + { + "epoch": 0.21123595505617979, + "grad_norm": 1.5123843416906917, + "learning_rate": 1.8326623961343592e-05, + "loss": 0.8118, + "step": 2068 + }, + { + "epoch": 0.21133810010214504, + "grad_norm": 1.6035584156977898, + "learning_rate": 1.8324791414139456e-05, + "loss": 0.828, + "step": 2069 + }, + { + "epoch": 0.21144024514811033, + "grad_norm": 1.3744926444855061, + "learning_rate": 1.8322957955781525e-05, + "loss": 0.7085, + "step": 2070 + }, + { + "epoch": 0.21154239019407559, + "grad_norm": 1.388727978012941, + "learning_rate": 1.8321123586470467e-05, + "loss": 0.6965, + "step": 2071 + }, + { + "epoch": 0.21164453524004087, + "grad_norm": 1.5973752776047894, + "learning_rate": 1.8319288306407053e-05, + "loss": 0.7312, + "step": 2072 + }, + { + "epoch": 0.21174668028600613, + "grad_norm": 1.606928888842806, + "learning_rate": 1.8317452115792163e-05, + "loss": 0.7089, + "step": 2073 + }, + { + "epoch": 0.2118488253319714, + "grad_norm": 1.3935734696945647, + "learning_rate": 1.8315615014826762e-05, + "loss": 0.7905, + "step": 2074 + }, + { + "epoch": 0.21195097037793667, + "grad_norm": 1.6725542655152714, + "learning_rate": 1.8313777003711927e-05, + "loss": 0.755, + "step": 2075 + }, + { + "epoch": 0.21205311542390193, + "grad_norm": 1.6531169744306426, + "learning_rate": 1.8311938082648827e-05, + "loss": 0.7662, + "step": 2076 + }, + { + "epoch": 0.2121552604698672, + "grad_norm": 1.6492877696717072, + "learning_rate": 1.8310098251838735e-05, + "loss": 0.6538, + "step": 2077 + }, + { + "epoch": 0.21225740551583247, + "grad_norm": 1.4807942407080226, + "learning_rate": 1.8308257511483018e-05, + "loss": 0.7229, + "step": 2078 + }, + { + "epoch": 0.21235955056179776, + "grad_norm": 1.581636977746479, + "learning_rate": 1.8306415861783148e-05, + "loss": 0.7848, + "step": 2079 + }, + { + "epoch": 0.212461695607763, + "grad_norm": 1.3967852371062568, + "learning_rate": 1.83045733029407e-05, + "loss": 0.785, + "step": 2080 + }, + { + "epoch": 0.2125638406537283, + "grad_norm": 1.4523256904193949, + "learning_rate": 1.8302729835157335e-05, + "loss": 0.7177, + "step": 2081 + }, + { + "epoch": 0.21266598569969355, + "grad_norm": 1.306571562523013, + "learning_rate": 1.8300885458634824e-05, + "loss": 0.7532, + "step": 2082 + }, + { + "epoch": 0.21276813074565884, + "grad_norm": 1.6133269522473785, + "learning_rate": 1.829904017357504e-05, + "loss": 0.7049, + "step": 2083 + }, + { + "epoch": 0.2128702757916241, + "grad_norm": 1.5373634117186494, + "learning_rate": 1.8297193980179943e-05, + "loss": 0.7188, + "step": 2084 + }, + { + "epoch": 0.21297242083758938, + "grad_norm": 1.3887690358930573, + "learning_rate": 1.8295346878651606e-05, + "loss": 0.6514, + "step": 2085 + }, + { + "epoch": 0.21307456588355464, + "grad_norm": 1.4938789958938496, + "learning_rate": 1.8293498869192198e-05, + "loss": 0.7768, + "step": 2086 + }, + { + "epoch": 0.21317671092951992, + "grad_norm": 1.4819939852680848, + "learning_rate": 1.8291649952003972e-05, + "loss": 0.7177, + "step": 2087 + }, + { + "epoch": 0.21327885597548518, + "grad_norm": 1.5502525209606006, + "learning_rate": 1.8289800127289308e-05, + "loss": 0.7997, + "step": 2088 + }, + { + "epoch": 0.21338100102145047, + "grad_norm": 1.540853794114667, + "learning_rate": 1.828794939525066e-05, + "loss": 0.6858, + "step": 2089 + }, + { + "epoch": 0.21348314606741572, + "grad_norm": 1.5431259225314777, + "learning_rate": 1.82860977560906e-05, + "loss": 0.7772, + "step": 2090 + }, + { + "epoch": 0.213585291113381, + "grad_norm": 1.3593147927026836, + "learning_rate": 1.8284245210011787e-05, + "loss": 0.6476, + "step": 2091 + }, + { + "epoch": 0.21368743615934627, + "grad_norm": 1.788935535671766, + "learning_rate": 1.828239175721698e-05, + "loss": 0.7655, + "step": 2092 + }, + { + "epoch": 0.21378958120531155, + "grad_norm": 1.488968606739639, + "learning_rate": 1.8280537397909048e-05, + "loss": 0.8084, + "step": 2093 + }, + { + "epoch": 0.2138917262512768, + "grad_norm": 1.4409127232229466, + "learning_rate": 1.8278682132290946e-05, + "loss": 0.7598, + "step": 2094 + }, + { + "epoch": 0.2139938712972421, + "grad_norm": 1.5499899735415374, + "learning_rate": 1.8276825960565733e-05, + "loss": 0.8267, + "step": 2095 + }, + { + "epoch": 0.21409601634320735, + "grad_norm": 1.428975859350018, + "learning_rate": 1.8274968882936576e-05, + "loss": 0.6844, + "step": 2096 + }, + { + "epoch": 0.21419816138917264, + "grad_norm": 1.3985035190401145, + "learning_rate": 1.8273110899606727e-05, + "loss": 0.7164, + "step": 2097 + }, + { + "epoch": 0.2143003064351379, + "grad_norm": 1.6055106408312223, + "learning_rate": 1.8271252010779546e-05, + "loss": 0.7214, + "step": 2098 + }, + { + "epoch": 0.21440245148110318, + "grad_norm": 1.6122077685824703, + "learning_rate": 1.8269392216658485e-05, + "loss": 0.732, + "step": 2099 + }, + { + "epoch": 0.21450459652706844, + "grad_norm": 1.5042028174754953, + "learning_rate": 1.8267531517447104e-05, + "loss": 0.8137, + "step": 2100 + }, + { + "epoch": 0.21460674157303372, + "grad_norm": 1.3900401158657485, + "learning_rate": 1.826566991334906e-05, + "loss": 0.7466, + "step": 2101 + }, + { + "epoch": 0.21470888661899898, + "grad_norm": 1.636502160804699, + "learning_rate": 1.82638074045681e-05, + "loss": 0.8565, + "step": 2102 + }, + { + "epoch": 0.21481103166496424, + "grad_norm": 1.3915238660264562, + "learning_rate": 1.8261943991308082e-05, + "loss": 0.6745, + "step": 2103 + }, + { + "epoch": 0.21491317671092952, + "grad_norm": 1.4444631272058663, + "learning_rate": 1.826007967377296e-05, + "loss": 0.8136, + "step": 2104 + }, + { + "epoch": 0.21501532175689478, + "grad_norm": 1.6304712970290334, + "learning_rate": 1.8258214452166773e-05, + "loss": 0.8468, + "step": 2105 + }, + { + "epoch": 0.21511746680286006, + "grad_norm": 1.4084882167106183, + "learning_rate": 1.8256348326693684e-05, + "loss": 0.7414, + "step": 2106 + }, + { + "epoch": 0.21521961184882532, + "grad_norm": 1.7061448383206608, + "learning_rate": 1.8254481297557935e-05, + "loss": 0.7698, + "step": 2107 + }, + { + "epoch": 0.2153217568947906, + "grad_norm": 1.7035587949568587, + "learning_rate": 1.8252613364963875e-05, + "loss": 0.8073, + "step": 2108 + }, + { + "epoch": 0.21542390194075586, + "grad_norm": 1.4553185199455754, + "learning_rate": 1.825074452911595e-05, + "loss": 0.7455, + "step": 2109 + }, + { + "epoch": 0.21552604698672115, + "grad_norm": 1.4525814825844223, + "learning_rate": 1.8248874790218707e-05, + "loss": 0.6685, + "step": 2110 + }, + { + "epoch": 0.2156281920326864, + "grad_norm": 1.4069050366260896, + "learning_rate": 1.824700414847679e-05, + "loss": 0.8173, + "step": 2111 + }, + { + "epoch": 0.2157303370786517, + "grad_norm": 1.3989181068482597, + "learning_rate": 1.8245132604094936e-05, + "loss": 0.6993, + "step": 2112 + }, + { + "epoch": 0.21583248212461695, + "grad_norm": 1.4719824888977369, + "learning_rate": 1.8243260157277995e-05, + "loss": 0.7268, + "step": 2113 + }, + { + "epoch": 0.21593462717058223, + "grad_norm": 1.489918526224448, + "learning_rate": 1.8241386808230903e-05, + "loss": 0.6918, + "step": 2114 + }, + { + "epoch": 0.2160367722165475, + "grad_norm": 1.4178084805745839, + "learning_rate": 1.82395125571587e-05, + "loss": 0.8055, + "step": 2115 + }, + { + "epoch": 0.21613891726251278, + "grad_norm": 1.4900179229326167, + "learning_rate": 1.8237637404266526e-05, + "loss": 0.7398, + "step": 2116 + }, + { + "epoch": 0.21624106230847803, + "grad_norm": 1.4894374211654242, + "learning_rate": 1.8235761349759612e-05, + "loss": 0.8566, + "step": 2117 + }, + { + "epoch": 0.21634320735444332, + "grad_norm": 1.5581749471972128, + "learning_rate": 1.8233884393843298e-05, + "loss": 0.7762, + "step": 2118 + }, + { + "epoch": 0.21644535240040857, + "grad_norm": 1.4518022440033582, + "learning_rate": 1.8232006536723024e-05, + "loss": 0.7583, + "step": 2119 + }, + { + "epoch": 0.21654749744637386, + "grad_norm": 1.4844528573118905, + "learning_rate": 1.823012777860431e-05, + "loss": 0.7391, + "step": 2120 + }, + { + "epoch": 0.21664964249233912, + "grad_norm": 1.7865463264112236, + "learning_rate": 1.8228248119692793e-05, + "loss": 0.8152, + "step": 2121 + }, + { + "epoch": 0.2167517875383044, + "grad_norm": 1.400114069474454, + "learning_rate": 1.8226367560194207e-05, + "loss": 0.709, + "step": 2122 + }, + { + "epoch": 0.21685393258426966, + "grad_norm": 1.4013632179523992, + "learning_rate": 1.8224486100314373e-05, + "loss": 0.7756, + "step": 2123 + }, + { + "epoch": 0.21695607763023494, + "grad_norm": 1.4682817331361064, + "learning_rate": 1.8222603740259225e-05, + "loss": 0.7522, + "step": 2124 + }, + { + "epoch": 0.2170582226762002, + "grad_norm": 1.6217799793622887, + "learning_rate": 1.822072048023478e-05, + "loss": 0.8209, + "step": 2125 + }, + { + "epoch": 0.2171603677221655, + "grad_norm": 1.445991947316743, + "learning_rate": 1.821883632044717e-05, + "loss": 0.7197, + "step": 2126 + }, + { + "epoch": 0.21726251276813074, + "grad_norm": 1.3955558761203861, + "learning_rate": 1.8216951261102617e-05, + "loss": 0.7987, + "step": 2127 + }, + { + "epoch": 0.21736465781409603, + "grad_norm": 1.4410819127944816, + "learning_rate": 1.8215065302407434e-05, + "loss": 0.716, + "step": 2128 + }, + { + "epoch": 0.2174668028600613, + "grad_norm": 1.3906123715857208, + "learning_rate": 1.821317844456805e-05, + "loss": 0.7869, + "step": 2129 + }, + { + "epoch": 0.21756894790602654, + "grad_norm": 1.7597452841216537, + "learning_rate": 1.8211290687790982e-05, + "loss": 0.8277, + "step": 2130 + }, + { + "epoch": 0.21767109295199183, + "grad_norm": 1.684821468580705, + "learning_rate": 1.8209402032282836e-05, + "loss": 0.7409, + "step": 2131 + }, + { + "epoch": 0.2177732379979571, + "grad_norm": 1.6270149238149112, + "learning_rate": 1.8207512478250338e-05, + "loss": 0.8205, + "step": 2132 + }, + { + "epoch": 0.21787538304392237, + "grad_norm": 1.5866400383225503, + "learning_rate": 1.8205622025900296e-05, + "loss": 0.753, + "step": 2133 + }, + { + "epoch": 0.21797752808988763, + "grad_norm": 1.521239109076847, + "learning_rate": 1.8203730675439622e-05, + "loss": 0.8686, + "step": 2134 + }, + { + "epoch": 0.21807967313585291, + "grad_norm": 1.5062432591629473, + "learning_rate": 1.8201838427075323e-05, + "loss": 0.7432, + "step": 2135 + }, + { + "epoch": 0.21818181818181817, + "grad_norm": 1.4352007878742952, + "learning_rate": 1.819994528101451e-05, + "loss": 0.65, + "step": 2136 + }, + { + "epoch": 0.21828396322778346, + "grad_norm": 1.4154865926017972, + "learning_rate": 1.8198051237464393e-05, + "loss": 0.672, + "step": 2137 + }, + { + "epoch": 0.2183861082737487, + "grad_norm": 1.4607241510790332, + "learning_rate": 1.8196156296632263e-05, + "loss": 0.8406, + "step": 2138 + }, + { + "epoch": 0.218488253319714, + "grad_norm": 1.617681807204396, + "learning_rate": 1.8194260458725533e-05, + "loss": 0.7737, + "step": 2139 + }, + { + "epoch": 0.21859039836567926, + "grad_norm": 1.2463906254703652, + "learning_rate": 1.8192363723951705e-05, + "loss": 0.7782, + "step": 2140 + }, + { + "epoch": 0.21869254341164454, + "grad_norm": 1.4870730004981867, + "learning_rate": 1.8190466092518375e-05, + "loss": 0.7454, + "step": 2141 + }, + { + "epoch": 0.2187946884576098, + "grad_norm": 1.5918394593422303, + "learning_rate": 1.8188567564633237e-05, + "loss": 0.7298, + "step": 2142 + }, + { + "epoch": 0.21889683350357508, + "grad_norm": 1.5701632289869287, + "learning_rate": 1.818666814050409e-05, + "loss": 0.7444, + "step": 2143 + }, + { + "epoch": 0.21899897854954034, + "grad_norm": 1.5344081849011026, + "learning_rate": 1.8184767820338825e-05, + "loss": 0.7035, + "step": 2144 + }, + { + "epoch": 0.21910112359550563, + "grad_norm": 1.4940957162966149, + "learning_rate": 1.8182866604345433e-05, + "loss": 0.7385, + "step": 2145 + }, + { + "epoch": 0.21920326864147088, + "grad_norm": 1.6896339591632972, + "learning_rate": 1.8180964492732007e-05, + "loss": 0.6489, + "step": 2146 + }, + { + "epoch": 0.21930541368743617, + "grad_norm": 1.3852214206008133, + "learning_rate": 1.817906148570673e-05, + "loss": 0.704, + "step": 2147 + }, + { + "epoch": 0.21940755873340143, + "grad_norm": 1.553754856312667, + "learning_rate": 1.817715758347789e-05, + "loss": 0.8483, + "step": 2148 + }, + { + "epoch": 0.2195097037793667, + "grad_norm": 1.5647800268835241, + "learning_rate": 1.817525278625387e-05, + "loss": 0.7021, + "step": 2149 + }, + { + "epoch": 0.21961184882533197, + "grad_norm": 1.6297907260472146, + "learning_rate": 1.8173347094243145e-05, + "loss": 0.8895, + "step": 2150 + }, + { + "epoch": 0.21971399387129725, + "grad_norm": 1.3964193726162373, + "learning_rate": 1.8171440507654306e-05, + "loss": 0.6856, + "step": 2151 + }, + { + "epoch": 0.2198161389172625, + "grad_norm": 1.481262439060626, + "learning_rate": 1.8169533026696022e-05, + "loss": 0.805, + "step": 2152 + }, + { + "epoch": 0.2199182839632278, + "grad_norm": 1.4603100794421457, + "learning_rate": 1.816762465157707e-05, + "loss": 0.7575, + "step": 2153 + }, + { + "epoch": 0.22002042900919305, + "grad_norm": 1.489964007505512, + "learning_rate": 1.8165715382506322e-05, + "loss": 0.8119, + "step": 2154 + }, + { + "epoch": 0.22012257405515834, + "grad_norm": 1.5148362329959033, + "learning_rate": 1.816380521969275e-05, + "loss": 0.7456, + "step": 2155 + }, + { + "epoch": 0.2202247191011236, + "grad_norm": 1.4194025002024102, + "learning_rate": 1.8161894163345425e-05, + "loss": 0.6658, + "step": 2156 + }, + { + "epoch": 0.22032686414708888, + "grad_norm": 1.5293083797520495, + "learning_rate": 1.8159982213673507e-05, + "loss": 0.7905, + "step": 2157 + }, + { + "epoch": 0.22042900919305414, + "grad_norm": 1.3632866001313433, + "learning_rate": 1.8158069370886268e-05, + "loss": 0.7123, + "step": 2158 + }, + { + "epoch": 0.2205311542390194, + "grad_norm": 1.6215404876928532, + "learning_rate": 1.815615563519306e-05, + "loss": 0.7296, + "step": 2159 + }, + { + "epoch": 0.22063329928498468, + "grad_norm": 1.4363715359576077, + "learning_rate": 1.815424100680335e-05, + "loss": 0.7896, + "step": 2160 + }, + { + "epoch": 0.22073544433094994, + "grad_norm": 1.4995682322010773, + "learning_rate": 1.8152325485926695e-05, + "loss": 0.7912, + "step": 2161 + }, + { + "epoch": 0.22083758937691522, + "grad_norm": 1.4691670166720254, + "learning_rate": 1.8150409072772747e-05, + "loss": 0.7641, + "step": 2162 + }, + { + "epoch": 0.22093973442288048, + "grad_norm": 1.5246341366930463, + "learning_rate": 1.814849176755126e-05, + "loss": 0.7959, + "step": 2163 + }, + { + "epoch": 0.22104187946884576, + "grad_norm": 1.4281279670554279, + "learning_rate": 1.814657357047209e-05, + "loss": 0.7247, + "step": 2164 + }, + { + "epoch": 0.22114402451481102, + "grad_norm": 1.5555611075939852, + "learning_rate": 1.814465448174517e-05, + "loss": 0.7781, + "step": 2165 + }, + { + "epoch": 0.2212461695607763, + "grad_norm": 1.6561648739818293, + "learning_rate": 1.8142734501580563e-05, + "loss": 0.6929, + "step": 2166 + }, + { + "epoch": 0.22134831460674156, + "grad_norm": 1.494375267982028, + "learning_rate": 1.81408136301884e-05, + "loss": 0.7546, + "step": 2167 + }, + { + "epoch": 0.22145045965270685, + "grad_norm": 1.461359056038184, + "learning_rate": 1.8138891867778924e-05, + "loss": 0.7025, + "step": 2168 + }, + { + "epoch": 0.2215526046986721, + "grad_norm": 1.5122281699532736, + "learning_rate": 1.813696921456248e-05, + "loss": 0.7346, + "step": 2169 + }, + { + "epoch": 0.2216547497446374, + "grad_norm": 1.3092740253981447, + "learning_rate": 1.8135045670749495e-05, + "loss": 0.6549, + "step": 2170 + }, + { + "epoch": 0.22175689479060265, + "grad_norm": 1.509960393811027, + "learning_rate": 1.8133121236550503e-05, + "loss": 0.7185, + "step": 2171 + }, + { + "epoch": 0.22185903983656793, + "grad_norm": 1.452398623609375, + "learning_rate": 1.813119591217614e-05, + "loss": 0.7273, + "step": 2172 + }, + { + "epoch": 0.2219611848825332, + "grad_norm": 1.5024907486125059, + "learning_rate": 1.812926969783713e-05, + "loss": 0.7154, + "step": 2173 + }, + { + "epoch": 0.22206332992849848, + "grad_norm": 1.5458364026483244, + "learning_rate": 1.81273425937443e-05, + "loss": 0.7987, + "step": 2174 + }, + { + "epoch": 0.22216547497446373, + "grad_norm": 1.4018816479705711, + "learning_rate": 1.812541460010857e-05, + "loss": 0.592, + "step": 2175 + }, + { + "epoch": 0.22226762002042902, + "grad_norm": 1.5922269384916061, + "learning_rate": 1.8123485717140966e-05, + "loss": 0.7026, + "step": 2176 + }, + { + "epoch": 0.22236976506639428, + "grad_norm": 1.362015913345122, + "learning_rate": 1.81215559450526e-05, + "loss": 0.6748, + "step": 2177 + }, + { + "epoch": 0.22247191011235956, + "grad_norm": 1.5506780225655552, + "learning_rate": 1.8119625284054688e-05, + "loss": 0.6892, + "step": 2178 + }, + { + "epoch": 0.22257405515832482, + "grad_norm": 1.7001436218866008, + "learning_rate": 1.8117693734358546e-05, + "loss": 0.74, + "step": 2179 + }, + { + "epoch": 0.2226762002042901, + "grad_norm": 1.6096451273504218, + "learning_rate": 1.8115761296175577e-05, + "loss": 0.8141, + "step": 2180 + }, + { + "epoch": 0.22277834525025536, + "grad_norm": 1.3747471864596215, + "learning_rate": 1.8113827969717294e-05, + "loss": 0.6984, + "step": 2181 + }, + { + "epoch": 0.22288049029622065, + "grad_norm": 1.4312242756424636, + "learning_rate": 1.811189375519529e-05, + "loss": 0.7605, + "step": 2182 + }, + { + "epoch": 0.2229826353421859, + "grad_norm": 1.5555113687906015, + "learning_rate": 1.810995865282128e-05, + "loss": 0.87, + "step": 2183 + }, + { + "epoch": 0.2230847803881512, + "grad_norm": 1.388715590990118, + "learning_rate": 1.8108022662807056e-05, + "loss": 0.6701, + "step": 2184 + }, + { + "epoch": 0.22318692543411645, + "grad_norm": 1.5447230233770404, + "learning_rate": 1.810608578536451e-05, + "loss": 0.6952, + "step": 2185 + }, + { + "epoch": 0.2232890704800817, + "grad_norm": 1.534131897551025, + "learning_rate": 1.810414802070564e-05, + "loss": 0.7598, + "step": 2186 + }, + { + "epoch": 0.223391215526047, + "grad_norm": 1.6157871090779623, + "learning_rate": 1.810220936904253e-05, + "loss": 0.8224, + "step": 2187 + }, + { + "epoch": 0.22349336057201225, + "grad_norm": 1.4937764987169526, + "learning_rate": 1.810026983058737e-05, + "loss": 0.8178, + "step": 2188 + }, + { + "epoch": 0.22359550561797753, + "grad_norm": 1.5226654330374452, + "learning_rate": 1.8098329405552445e-05, + "loss": 0.8251, + "step": 2189 + }, + { + "epoch": 0.2236976506639428, + "grad_norm": 1.4318903812085957, + "learning_rate": 1.809638809415014e-05, + "loss": 0.7453, + "step": 2190 + }, + { + "epoch": 0.22379979570990807, + "grad_norm": 1.570671992897667, + "learning_rate": 1.8094445896592917e-05, + "loss": 0.77, + "step": 2191 + }, + { + "epoch": 0.22390194075587333, + "grad_norm": 1.5508999517808788, + "learning_rate": 1.809250281309337e-05, + "loss": 0.7708, + "step": 2192 + }, + { + "epoch": 0.22400408580183861, + "grad_norm": 1.5953760915182307, + "learning_rate": 1.8090558843864157e-05, + "loss": 0.755, + "step": 2193 + }, + { + "epoch": 0.22410623084780387, + "grad_norm": 1.5282893215491953, + "learning_rate": 1.808861398911805e-05, + "loss": 0.8884, + "step": 2194 + }, + { + "epoch": 0.22420837589376916, + "grad_norm": 1.4078613235692472, + "learning_rate": 1.8086668249067917e-05, + "loss": 0.7796, + "step": 2195 + }, + { + "epoch": 0.22431052093973441, + "grad_norm": 1.4086118130852237, + "learning_rate": 1.8084721623926723e-05, + "loss": 0.7473, + "step": 2196 + }, + { + "epoch": 0.2244126659856997, + "grad_norm": 1.5961317142602676, + "learning_rate": 1.808277411390752e-05, + "loss": 0.7479, + "step": 2197 + }, + { + "epoch": 0.22451481103166496, + "grad_norm": 1.3118973019852176, + "learning_rate": 1.8080825719223468e-05, + "loss": 0.7609, + "step": 2198 + }, + { + "epoch": 0.22461695607763024, + "grad_norm": 1.500695164054136, + "learning_rate": 1.8078876440087825e-05, + "loss": 0.8477, + "step": 2199 + }, + { + "epoch": 0.2247191011235955, + "grad_norm": 1.4424821144357522, + "learning_rate": 1.807692627671393e-05, + "loss": 0.7675, + "step": 2200 + }, + { + "epoch": 0.22482124616956078, + "grad_norm": 1.567971020963631, + "learning_rate": 1.8074975229315238e-05, + "loss": 0.7718, + "step": 2201 + }, + { + "epoch": 0.22492339121552604, + "grad_norm": 1.4949306728392664, + "learning_rate": 1.807302329810529e-05, + "loss": 0.7225, + "step": 2202 + }, + { + "epoch": 0.22502553626149133, + "grad_norm": 1.5428241516277315, + "learning_rate": 1.8071070483297726e-05, + "loss": 0.7814, + "step": 2203 + }, + { + "epoch": 0.22512768130745658, + "grad_norm": 1.5234889319730684, + "learning_rate": 1.8069116785106284e-05, + "loss": 0.741, + "step": 2204 + }, + { + "epoch": 0.22522982635342187, + "grad_norm": 1.5758575602871117, + "learning_rate": 1.80671622037448e-05, + "loss": 0.7757, + "step": 2205 + }, + { + "epoch": 0.22533197139938713, + "grad_norm": 1.7847750150565742, + "learning_rate": 1.8065206739427194e-05, + "loss": 0.7292, + "step": 2206 + }, + { + "epoch": 0.2254341164453524, + "grad_norm": 1.5244089832649776, + "learning_rate": 1.80632503923675e-05, + "loss": 0.7869, + "step": 2207 + }, + { + "epoch": 0.22553626149131767, + "grad_norm": 1.5266363300135084, + "learning_rate": 1.8061293162779844e-05, + "loss": 0.678, + "step": 2208 + }, + { + "epoch": 0.22563840653728295, + "grad_norm": 2.016420386121811, + "learning_rate": 1.8059335050878444e-05, + "loss": 0.7701, + "step": 2209 + }, + { + "epoch": 0.2257405515832482, + "grad_norm": 1.513049220012714, + "learning_rate": 1.8057376056877614e-05, + "loss": 0.7414, + "step": 2210 + }, + { + "epoch": 0.2258426966292135, + "grad_norm": 1.3410379113382804, + "learning_rate": 1.8055416180991768e-05, + "loss": 0.6121, + "step": 2211 + }, + { + "epoch": 0.22594484167517875, + "grad_norm": 1.553020154456576, + "learning_rate": 1.805345542343542e-05, + "loss": 0.7998, + "step": 2212 + }, + { + "epoch": 0.226046986721144, + "grad_norm": 1.3773108991464726, + "learning_rate": 1.8051493784423168e-05, + "loss": 0.7759, + "step": 2213 + }, + { + "epoch": 0.2261491317671093, + "grad_norm": 1.474910656165063, + "learning_rate": 1.804953126416972e-05, + "loss": 0.6994, + "step": 2214 + }, + { + "epoch": 0.22625127681307455, + "grad_norm": 1.3407411016043098, + "learning_rate": 1.8047567862889878e-05, + "loss": 0.6962, + "step": 2215 + }, + { + "epoch": 0.22635342185903984, + "grad_norm": 1.729455997868836, + "learning_rate": 1.804560358079853e-05, + "loss": 0.8504, + "step": 2216 + }, + { + "epoch": 0.2264555669050051, + "grad_norm": 1.5392278751420363, + "learning_rate": 1.8043638418110677e-05, + "loss": 0.728, + "step": 2217 + }, + { + "epoch": 0.22655771195097038, + "grad_norm": 1.3839962961302597, + "learning_rate": 1.80416723750414e-05, + "loss": 0.7321, + "step": 2218 + }, + { + "epoch": 0.22665985699693564, + "grad_norm": 1.495738662653201, + "learning_rate": 1.8039705451805883e-05, + "loss": 0.7508, + "step": 2219 + }, + { + "epoch": 0.22676200204290092, + "grad_norm": 1.6459511767404105, + "learning_rate": 1.8037737648619413e-05, + "loss": 0.757, + "step": 2220 + }, + { + "epoch": 0.22686414708886618, + "grad_norm": 1.4631819411596434, + "learning_rate": 1.803576896569736e-05, + "loss": 0.7744, + "step": 2221 + }, + { + "epoch": 0.22696629213483147, + "grad_norm": 1.5913993505268982, + "learning_rate": 1.803379940325521e-05, + "loss": 0.7432, + "step": 2222 + }, + { + "epoch": 0.22706843718079672, + "grad_norm": 1.566916547400519, + "learning_rate": 1.803182896150852e-05, + "loss": 0.8052, + "step": 2223 + }, + { + "epoch": 0.227170582226762, + "grad_norm": 1.5511398490670008, + "learning_rate": 1.8029857640672957e-05, + "loss": 0.8809, + "step": 2224 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 1.3257929635315593, + "learning_rate": 1.8027885440964295e-05, + "loss": 0.7429, + "step": 2225 + }, + { + "epoch": 0.22737487231869255, + "grad_norm": 1.4638744637389853, + "learning_rate": 1.802591236259838e-05, + "loss": 0.6428, + "step": 2226 + }, + { + "epoch": 0.2274770173646578, + "grad_norm": 1.5106584619626535, + "learning_rate": 1.8023938405791173e-05, + "loss": 0.6864, + "step": 2227 + }, + { + "epoch": 0.2275791624106231, + "grad_norm": 1.479067116735724, + "learning_rate": 1.802196357075872e-05, + "loss": 0.8227, + "step": 2228 + }, + { + "epoch": 0.22768130745658835, + "grad_norm": 1.3970166515304214, + "learning_rate": 1.8019987857717178e-05, + "loss": 0.746, + "step": 2229 + }, + { + "epoch": 0.22778345250255363, + "grad_norm": 1.3318998198006518, + "learning_rate": 1.801801126688278e-05, + "loss": 0.6928, + "step": 2230 + }, + { + "epoch": 0.2278855975485189, + "grad_norm": 1.502952300935393, + "learning_rate": 1.801603379847187e-05, + "loss": 0.6767, + "step": 2231 + }, + { + "epoch": 0.22798774259448418, + "grad_norm": 1.5419401587002324, + "learning_rate": 1.801405545270088e-05, + "loss": 0.7674, + "step": 2232 + }, + { + "epoch": 0.22808988764044943, + "grad_norm": 1.5914319304211697, + "learning_rate": 1.8012076229786338e-05, + "loss": 0.7798, + "step": 2233 + }, + { + "epoch": 0.22819203268641472, + "grad_norm": 1.4889831167447773, + "learning_rate": 1.8010096129944884e-05, + "loss": 0.6627, + "step": 2234 + }, + { + "epoch": 0.22829417773237998, + "grad_norm": 1.5239622940090982, + "learning_rate": 1.8008115153393234e-05, + "loss": 0.7905, + "step": 2235 + }, + { + "epoch": 0.22839632277834526, + "grad_norm": 1.4660430251983063, + "learning_rate": 1.8006133300348197e-05, + "loss": 0.7685, + "step": 2236 + }, + { + "epoch": 0.22849846782431052, + "grad_norm": 1.4604166406308885, + "learning_rate": 1.800415057102671e-05, + "loss": 0.8402, + "step": 2237 + }, + { + "epoch": 0.2286006128702758, + "grad_norm": 1.5125380256688954, + "learning_rate": 1.800216696564576e-05, + "loss": 0.6569, + "step": 2238 + }, + { + "epoch": 0.22870275791624106, + "grad_norm": 1.436990583324024, + "learning_rate": 1.8000182484422474e-05, + "loss": 0.739, + "step": 2239 + }, + { + "epoch": 0.22880490296220635, + "grad_norm": 1.5639613248402557, + "learning_rate": 1.7998197127574043e-05, + "loss": 0.7093, + "step": 2240 + }, + { + "epoch": 0.2289070480081716, + "grad_norm": 1.5102232644057905, + "learning_rate": 1.799621089531777e-05, + "loss": 0.7061, + "step": 2241 + }, + { + "epoch": 0.22900919305413686, + "grad_norm": 1.6028482002374724, + "learning_rate": 1.7994223787871048e-05, + "loss": 0.6863, + "step": 2242 + }, + { + "epoch": 0.22911133810010215, + "grad_norm": 1.4837161609534522, + "learning_rate": 1.7992235805451367e-05, + "loss": 0.7504, + "step": 2243 + }, + { + "epoch": 0.2292134831460674, + "grad_norm": 1.439252436000204, + "learning_rate": 1.7990246948276314e-05, + "loss": 0.7402, + "step": 2244 + }, + { + "epoch": 0.2293156281920327, + "grad_norm": 1.548878248967547, + "learning_rate": 1.798825721656357e-05, + "loss": 0.7152, + "step": 2245 + }, + { + "epoch": 0.22941777323799795, + "grad_norm": 1.4043650632998554, + "learning_rate": 1.798626661053091e-05, + "loss": 0.8063, + "step": 2246 + }, + { + "epoch": 0.22951991828396323, + "grad_norm": 1.385090671964129, + "learning_rate": 1.7984275130396214e-05, + "loss": 0.7733, + "step": 2247 + }, + { + "epoch": 0.2296220633299285, + "grad_norm": 1.5282215285001788, + "learning_rate": 1.7982282776377443e-05, + "loss": 0.7861, + "step": 2248 + }, + { + "epoch": 0.22972420837589377, + "grad_norm": 1.575765077090057, + "learning_rate": 1.7980289548692667e-05, + "loss": 0.7618, + "step": 2249 + }, + { + "epoch": 0.22982635342185903, + "grad_norm": 1.4417347604966313, + "learning_rate": 1.7978295447560043e-05, + "loss": 0.7698, + "step": 2250 + }, + { + "epoch": 0.22992849846782432, + "grad_norm": 1.543460035628267, + "learning_rate": 1.797630047319783e-05, + "loss": 0.7873, + "step": 2251 + }, + { + "epoch": 0.23003064351378957, + "grad_norm": 1.542635570381685, + "learning_rate": 1.7974304625824373e-05, + "loss": 0.8224, + "step": 2252 + }, + { + "epoch": 0.23013278855975486, + "grad_norm": 1.3440967047450516, + "learning_rate": 1.7972307905658123e-05, + "loss": 0.6651, + "step": 2253 + }, + { + "epoch": 0.23023493360572012, + "grad_norm": 1.6517787884755837, + "learning_rate": 1.7970310312917624e-05, + "loss": 0.6768, + "step": 2254 + }, + { + "epoch": 0.2303370786516854, + "grad_norm": 1.4379487366888686, + "learning_rate": 1.796831184782151e-05, + "loss": 0.6449, + "step": 2255 + }, + { + "epoch": 0.23043922369765066, + "grad_norm": 1.2802216611143595, + "learning_rate": 1.7966312510588517e-05, + "loss": 0.645, + "step": 2256 + }, + { + "epoch": 0.23054136874361594, + "grad_norm": 1.5774276613648448, + "learning_rate": 1.7964312301437474e-05, + "loss": 0.7485, + "step": 2257 + }, + { + "epoch": 0.2306435137895812, + "grad_norm": 1.5859717961743545, + "learning_rate": 1.7962311220587304e-05, + "loss": 0.849, + "step": 2258 + }, + { + "epoch": 0.23074565883554649, + "grad_norm": 1.440608682898191, + "learning_rate": 1.7960309268257024e-05, + "loss": 0.7194, + "step": 2259 + }, + { + "epoch": 0.23084780388151174, + "grad_norm": 1.418599357047127, + "learning_rate": 1.7958306444665755e-05, + "loss": 0.776, + "step": 2260 + }, + { + "epoch": 0.23094994892747703, + "grad_norm": 1.6003001298581863, + "learning_rate": 1.79563027500327e-05, + "loss": 0.7081, + "step": 2261 + }, + { + "epoch": 0.23105209397344229, + "grad_norm": 1.5156679118681746, + "learning_rate": 1.7954298184577173e-05, + "loss": 0.8683, + "step": 2262 + }, + { + "epoch": 0.23115423901940757, + "grad_norm": 1.5114724498732142, + "learning_rate": 1.795229274851857e-05, + "loss": 0.7587, + "step": 2263 + }, + { + "epoch": 0.23125638406537283, + "grad_norm": 1.559938467075922, + "learning_rate": 1.7950286442076383e-05, + "loss": 0.8863, + "step": 2264 + }, + { + "epoch": 0.2313585291113381, + "grad_norm": 1.3752438335468948, + "learning_rate": 1.7948279265470214e-05, + "loss": 0.6629, + "step": 2265 + }, + { + "epoch": 0.23146067415730337, + "grad_norm": 1.5562406263921862, + "learning_rate": 1.794627121891974e-05, + "loss": 0.7745, + "step": 2266 + }, + { + "epoch": 0.23156281920326865, + "grad_norm": 1.5226662327447575, + "learning_rate": 1.794426230264475e-05, + "loss": 0.9143, + "step": 2267 + }, + { + "epoch": 0.2316649642492339, + "grad_norm": 1.381626866864444, + "learning_rate": 1.7942252516865117e-05, + "loss": 0.67, + "step": 2268 + }, + { + "epoch": 0.23176710929519917, + "grad_norm": 1.4066136790909638, + "learning_rate": 1.7940241861800814e-05, + "loss": 0.7225, + "step": 2269 + }, + { + "epoch": 0.23186925434116445, + "grad_norm": 1.2364683906554712, + "learning_rate": 1.793823033767191e-05, + "loss": 0.6689, + "step": 2270 + }, + { + "epoch": 0.2319713993871297, + "grad_norm": 1.5745694456952424, + "learning_rate": 1.7936217944698566e-05, + "loss": 0.759, + "step": 2271 + }, + { + "epoch": 0.232073544433095, + "grad_norm": 1.5910801588719057, + "learning_rate": 1.793420468310104e-05, + "loss": 0.761, + "step": 2272 + }, + { + "epoch": 0.23217568947906025, + "grad_norm": 1.5634218543931175, + "learning_rate": 1.7932190553099687e-05, + "loss": 0.7146, + "step": 2273 + }, + { + "epoch": 0.23227783452502554, + "grad_norm": 1.445991166567057, + "learning_rate": 1.7930175554914955e-05, + "loss": 0.8187, + "step": 2274 + }, + { + "epoch": 0.2323799795709908, + "grad_norm": 1.613578782633479, + "learning_rate": 1.7928159688767382e-05, + "loss": 0.7272, + "step": 2275 + }, + { + "epoch": 0.23248212461695608, + "grad_norm": 1.4775973486812264, + "learning_rate": 1.792614295487761e-05, + "loss": 0.7676, + "step": 2276 + }, + { + "epoch": 0.23258426966292134, + "grad_norm": 1.4112900050313055, + "learning_rate": 1.792412535346637e-05, + "loss": 0.7633, + "step": 2277 + }, + { + "epoch": 0.23268641470888662, + "grad_norm": 1.5181607025987163, + "learning_rate": 1.7922106884754488e-05, + "loss": 0.7006, + "step": 2278 + }, + { + "epoch": 0.23278855975485188, + "grad_norm": 1.5488822933025903, + "learning_rate": 1.7920087548962893e-05, + "loss": 0.8707, + "step": 2279 + }, + { + "epoch": 0.23289070480081717, + "grad_norm": 1.428331501273192, + "learning_rate": 1.79180673463126e-05, + "loss": 0.7661, + "step": 2280 + }, + { + "epoch": 0.23299284984678242, + "grad_norm": 1.2971838353715497, + "learning_rate": 1.7916046277024716e-05, + "loss": 0.6081, + "step": 2281 + }, + { + "epoch": 0.2330949948927477, + "grad_norm": 1.334761741405998, + "learning_rate": 1.791402434132045e-05, + "loss": 0.6518, + "step": 2282 + }, + { + "epoch": 0.23319713993871297, + "grad_norm": 1.4348481025212665, + "learning_rate": 1.7912001539421115e-05, + "loss": 0.7731, + "step": 2283 + }, + { + "epoch": 0.23329928498467825, + "grad_norm": 1.5022604650174964, + "learning_rate": 1.7909977871548093e-05, + "loss": 0.8634, + "step": 2284 + }, + { + "epoch": 0.2334014300306435, + "grad_norm": 1.495464531484559, + "learning_rate": 1.7907953337922886e-05, + "loss": 0.7308, + "step": 2285 + }, + { + "epoch": 0.2335035750766088, + "grad_norm": 1.6540739460663558, + "learning_rate": 1.7905927938767078e-05, + "loss": 0.8601, + "step": 2286 + }, + { + "epoch": 0.23360572012257405, + "grad_norm": 1.4413272571248266, + "learning_rate": 1.7903901674302346e-05, + "loss": 0.7354, + "step": 2287 + }, + { + "epoch": 0.23370786516853934, + "grad_norm": 1.4878559597083703, + "learning_rate": 1.790187454475047e-05, + "loss": 0.7418, + "step": 2288 + }, + { + "epoch": 0.2338100102145046, + "grad_norm": 1.422485039213835, + "learning_rate": 1.7899846550333318e-05, + "loss": 0.7895, + "step": 2289 + }, + { + "epoch": 0.23391215526046988, + "grad_norm": 1.4873503032403526, + "learning_rate": 1.789781769127286e-05, + "loss": 0.7935, + "step": 2290 + }, + { + "epoch": 0.23401430030643514, + "grad_norm": 1.4863173869585053, + "learning_rate": 1.7895787967791155e-05, + "loss": 0.7646, + "step": 2291 + }, + { + "epoch": 0.23411644535240042, + "grad_norm": 1.6957628828034594, + "learning_rate": 1.7893757380110352e-05, + "loss": 0.695, + "step": 2292 + }, + { + "epoch": 0.23421859039836568, + "grad_norm": 1.4699294198663664, + "learning_rate": 1.7891725928452704e-05, + "loss": 0.8275, + "step": 2293 + }, + { + "epoch": 0.23432073544433096, + "grad_norm": 1.3278889630107682, + "learning_rate": 1.7889693613040556e-05, + "loss": 0.6059, + "step": 2294 + }, + { + "epoch": 0.23442288049029622, + "grad_norm": 1.5211357248757373, + "learning_rate": 1.788766043409634e-05, + "loss": 0.7526, + "step": 2295 + }, + { + "epoch": 0.23452502553626148, + "grad_norm": 1.4327442554026883, + "learning_rate": 1.78856263918426e-05, + "loss": 0.6395, + "step": 2296 + }, + { + "epoch": 0.23462717058222676, + "grad_norm": 1.7133308758286023, + "learning_rate": 1.7883591486501953e-05, + "loss": 0.8447, + "step": 2297 + }, + { + "epoch": 0.23472931562819202, + "grad_norm": 1.519156984194039, + "learning_rate": 1.7881555718297124e-05, + "loss": 0.7098, + "step": 2298 + }, + { + "epoch": 0.2348314606741573, + "grad_norm": 1.3168533700859788, + "learning_rate": 1.7879519087450933e-05, + "loss": 0.6878, + "step": 2299 + }, + { + "epoch": 0.23493360572012256, + "grad_norm": 1.3947351799650776, + "learning_rate": 1.7877481594186285e-05, + "loss": 0.6934, + "step": 2300 + }, + { + "epoch": 0.23503575076608785, + "grad_norm": 1.570408057358551, + "learning_rate": 1.7875443238726186e-05, + "loss": 0.7915, + "step": 2301 + }, + { + "epoch": 0.2351378958120531, + "grad_norm": 1.557453309794812, + "learning_rate": 1.787340402129374e-05, + "loss": 0.7653, + "step": 2302 + }, + { + "epoch": 0.2352400408580184, + "grad_norm": 1.5698234406783456, + "learning_rate": 1.787136394211213e-05, + "loss": 0.8094, + "step": 2303 + }, + { + "epoch": 0.23534218590398365, + "grad_norm": 1.422834337062916, + "learning_rate": 1.7869323001404657e-05, + "loss": 0.7707, + "step": 2304 + }, + { + "epoch": 0.23544433094994893, + "grad_norm": 1.573601072098912, + "learning_rate": 1.7867281199394692e-05, + "loss": 0.7173, + "step": 2305 + }, + { + "epoch": 0.2355464759959142, + "grad_norm": 1.468374520029808, + "learning_rate": 1.786523853630572e-05, + "loss": 0.8133, + "step": 2306 + }, + { + "epoch": 0.23564862104187947, + "grad_norm": 1.3964703783396522, + "learning_rate": 1.7863195012361313e-05, + "loss": 0.8023, + "step": 2307 + }, + { + "epoch": 0.23575076608784473, + "grad_norm": 1.5276908490215264, + "learning_rate": 1.7861150627785124e-05, + "loss": 0.7689, + "step": 2308 + }, + { + "epoch": 0.23585291113381002, + "grad_norm": 1.3439849746765093, + "learning_rate": 1.7859105382800925e-05, + "loss": 0.7287, + "step": 2309 + }, + { + "epoch": 0.23595505617977527, + "grad_norm": 1.6359460599042688, + "learning_rate": 1.7857059277632562e-05, + "loss": 0.8252, + "step": 2310 + }, + { + "epoch": 0.23605720122574056, + "grad_norm": 1.5300452464971621, + "learning_rate": 1.7855012312503984e-05, + "loss": 0.7194, + "step": 2311 + }, + { + "epoch": 0.23615934627170582, + "grad_norm": 1.2742065030103453, + "learning_rate": 1.7852964487639234e-05, + "loss": 0.7506, + "step": 2312 + }, + { + "epoch": 0.2362614913176711, + "grad_norm": 1.4162841743178576, + "learning_rate": 1.785091580326245e-05, + "loss": 0.7138, + "step": 2313 + }, + { + "epoch": 0.23636363636363636, + "grad_norm": 1.4645735268805768, + "learning_rate": 1.7848866259597856e-05, + "loss": 0.7468, + "step": 2314 + }, + { + "epoch": 0.23646578140960164, + "grad_norm": 1.5580889777555762, + "learning_rate": 1.784681585686978e-05, + "loss": 0.8557, + "step": 2315 + }, + { + "epoch": 0.2365679264555669, + "grad_norm": 1.3356311348412555, + "learning_rate": 1.784476459530264e-05, + "loss": 0.6268, + "step": 2316 + }, + { + "epoch": 0.2366700715015322, + "grad_norm": 1.6289846518430162, + "learning_rate": 1.7842712475120944e-05, + "loss": 0.8419, + "step": 2317 + }, + { + "epoch": 0.23677221654749744, + "grad_norm": 1.4988142177307007, + "learning_rate": 1.78406594965493e-05, + "loss": 0.7524, + "step": 2318 + }, + { + "epoch": 0.23687436159346273, + "grad_norm": 1.57127692249938, + "learning_rate": 1.783860565981241e-05, + "loss": 0.9007, + "step": 2319 + }, + { + "epoch": 0.236976506639428, + "grad_norm": 1.4793839755248703, + "learning_rate": 1.7836550965135067e-05, + "loss": 0.7987, + "step": 2320 + }, + { + "epoch": 0.23707865168539327, + "grad_norm": 1.5007739995668181, + "learning_rate": 1.7834495412742157e-05, + "loss": 0.7539, + "step": 2321 + }, + { + "epoch": 0.23718079673135853, + "grad_norm": 1.3279366150972207, + "learning_rate": 1.783243900285866e-05, + "loss": 0.7629, + "step": 2322 + }, + { + "epoch": 0.2372829417773238, + "grad_norm": 1.4215120029599968, + "learning_rate": 1.7830381735709656e-05, + "loss": 0.6693, + "step": 2323 + }, + { + "epoch": 0.23738508682328907, + "grad_norm": 1.4520168704811396, + "learning_rate": 1.7828323611520315e-05, + "loss": 0.7403, + "step": 2324 + }, + { + "epoch": 0.23748723186925433, + "grad_norm": 1.6298136376119485, + "learning_rate": 1.7826264630515894e-05, + "loss": 0.7234, + "step": 2325 + }, + { + "epoch": 0.2375893769152196, + "grad_norm": 1.4576078641144812, + "learning_rate": 1.782420479292175e-05, + "loss": 0.6412, + "step": 2326 + }, + { + "epoch": 0.23769152196118487, + "grad_norm": 1.4276453595865672, + "learning_rate": 1.7822144098963342e-05, + "loss": 0.7456, + "step": 2327 + }, + { + "epoch": 0.23779366700715016, + "grad_norm": 1.5220145879582514, + "learning_rate": 1.7820082548866206e-05, + "loss": 0.7072, + "step": 2328 + }, + { + "epoch": 0.2378958120531154, + "grad_norm": 1.498515970486518, + "learning_rate": 1.7818020142855982e-05, + "loss": 0.6709, + "step": 2329 + }, + { + "epoch": 0.2379979570990807, + "grad_norm": 1.5185129348834838, + "learning_rate": 1.7815956881158404e-05, + "loss": 0.7443, + "step": 2330 + }, + { + "epoch": 0.23810010214504596, + "grad_norm": 1.397257004228778, + "learning_rate": 1.781389276399929e-05, + "loss": 0.6209, + "step": 2331 + }, + { + "epoch": 0.23820224719101124, + "grad_norm": 1.4470808556333505, + "learning_rate": 1.781182779160457e-05, + "loss": 0.7866, + "step": 2332 + }, + { + "epoch": 0.2383043922369765, + "grad_norm": 1.6450900329554976, + "learning_rate": 1.780976196420025e-05, + "loss": 0.7973, + "step": 2333 + }, + { + "epoch": 0.23840653728294178, + "grad_norm": 1.5119696538564331, + "learning_rate": 1.7807695282012436e-05, + "loss": 0.7195, + "step": 2334 + }, + { + "epoch": 0.23850868232890704, + "grad_norm": 1.5274218770247723, + "learning_rate": 1.780562774526733e-05, + "loss": 0.8483, + "step": 2335 + }, + { + "epoch": 0.23861082737487233, + "grad_norm": 1.5547254156086003, + "learning_rate": 1.7803559354191226e-05, + "loss": 0.8588, + "step": 2336 + }, + { + "epoch": 0.23871297242083758, + "grad_norm": 1.452495943115992, + "learning_rate": 1.7801490109010506e-05, + "loss": 0.7402, + "step": 2337 + }, + { + "epoch": 0.23881511746680287, + "grad_norm": 1.5766398233516086, + "learning_rate": 1.7799420009951657e-05, + "loss": 0.7572, + "step": 2338 + }, + { + "epoch": 0.23891726251276812, + "grad_norm": 1.5893488076973672, + "learning_rate": 1.7797349057241244e-05, + "loss": 0.7321, + "step": 2339 + }, + { + "epoch": 0.2390194075587334, + "grad_norm": 1.529135594533242, + "learning_rate": 1.7795277251105942e-05, + "loss": 0.7683, + "step": 2340 + }, + { + "epoch": 0.23912155260469867, + "grad_norm": 1.5571252938205509, + "learning_rate": 1.7793204591772506e-05, + "loss": 0.7857, + "step": 2341 + }, + { + "epoch": 0.23922369765066395, + "grad_norm": 1.4694894733231714, + "learning_rate": 1.7791131079467792e-05, + "loss": 0.7661, + "step": 2342 + }, + { + "epoch": 0.2393258426966292, + "grad_norm": 1.4898869219799205, + "learning_rate": 1.778905671441875e-05, + "loss": 0.7899, + "step": 2343 + }, + { + "epoch": 0.2394279877425945, + "grad_norm": 1.4120202773707085, + "learning_rate": 1.778698149685242e-05, + "loss": 0.7782, + "step": 2344 + }, + { + "epoch": 0.23953013278855975, + "grad_norm": 1.532063777137145, + "learning_rate": 1.7784905426995933e-05, + "loss": 0.8282, + "step": 2345 + }, + { + "epoch": 0.23963227783452504, + "grad_norm": 1.4789475295666736, + "learning_rate": 1.7782828505076516e-05, + "loss": 0.7144, + "step": 2346 + }, + { + "epoch": 0.2397344228804903, + "grad_norm": 1.4529151144171197, + "learning_rate": 1.778075073132149e-05, + "loss": 0.7181, + "step": 2347 + }, + { + "epoch": 0.23983656792645558, + "grad_norm": 1.4809647561212889, + "learning_rate": 1.7778672105958272e-05, + "loss": 0.7772, + "step": 2348 + }, + { + "epoch": 0.23993871297242084, + "grad_norm": 1.5143417817580695, + "learning_rate": 1.7776592629214362e-05, + "loss": 0.7565, + "step": 2349 + }, + { + "epoch": 0.24004085801838612, + "grad_norm": 1.3095464468008544, + "learning_rate": 1.777451230131737e-05, + "loss": 0.6469, + "step": 2350 + }, + { + "epoch": 0.24014300306435138, + "grad_norm": 1.3558318111966303, + "learning_rate": 1.777243112249498e-05, + "loss": 0.6463, + "step": 2351 + }, + { + "epoch": 0.24024514811031664, + "grad_norm": 1.6386088260531508, + "learning_rate": 1.7770349092974984e-05, + "loss": 0.7223, + "step": 2352 + }, + { + "epoch": 0.24034729315628192, + "grad_norm": 1.3108908501220131, + "learning_rate": 1.7768266212985256e-05, + "loss": 0.7616, + "step": 2353 + }, + { + "epoch": 0.24044943820224718, + "grad_norm": 1.5526747792606155, + "learning_rate": 1.7766182482753774e-05, + "loss": 0.7811, + "step": 2354 + }, + { + "epoch": 0.24055158324821246, + "grad_norm": 1.4225897167292565, + "learning_rate": 1.7764097902508604e-05, + "loss": 0.6207, + "step": 2355 + }, + { + "epoch": 0.24065372829417772, + "grad_norm": 1.5294448049579061, + "learning_rate": 1.77620124724779e-05, + "loss": 0.7056, + "step": 2356 + }, + { + "epoch": 0.240755873340143, + "grad_norm": 1.5466285318949227, + "learning_rate": 1.7759926192889918e-05, + "loss": 0.7202, + "step": 2357 + }, + { + "epoch": 0.24085801838610826, + "grad_norm": 1.3081101799256516, + "learning_rate": 1.7757839063972998e-05, + "loss": 0.6699, + "step": 2358 + }, + { + "epoch": 0.24096016343207355, + "grad_norm": 1.5767085023541487, + "learning_rate": 1.775575108595558e-05, + "loss": 0.7507, + "step": 2359 + }, + { + "epoch": 0.2410623084780388, + "grad_norm": 1.4661620916689682, + "learning_rate": 1.7753662259066198e-05, + "loss": 0.7489, + "step": 2360 + }, + { + "epoch": 0.2411644535240041, + "grad_norm": 1.6677185033743713, + "learning_rate": 1.775157258353347e-05, + "loss": 0.8029, + "step": 2361 + }, + { + "epoch": 0.24126659856996935, + "grad_norm": 1.5180751787437001, + "learning_rate": 1.7749482059586112e-05, + "loss": 0.7026, + "step": 2362 + }, + { + "epoch": 0.24136874361593463, + "grad_norm": 1.3320274527157165, + "learning_rate": 1.774739068745294e-05, + "loss": 0.6834, + "step": 2363 + }, + { + "epoch": 0.2414708886618999, + "grad_norm": 1.4041688775491048, + "learning_rate": 1.7745298467362852e-05, + "loss": 0.7352, + "step": 2364 + }, + { + "epoch": 0.24157303370786518, + "grad_norm": 1.480708371358585, + "learning_rate": 1.774320539954484e-05, + "loss": 0.7907, + "step": 2365 + }, + { + "epoch": 0.24167517875383043, + "grad_norm": 1.5085178274648747, + "learning_rate": 1.7741111484227996e-05, + "loss": 0.8016, + "step": 2366 + }, + { + "epoch": 0.24177732379979572, + "grad_norm": 1.4210421800690174, + "learning_rate": 1.7739016721641498e-05, + "loss": 0.6579, + "step": 2367 + }, + { + "epoch": 0.24187946884576098, + "grad_norm": 1.551289673957261, + "learning_rate": 1.773692111201462e-05, + "loss": 0.7459, + "step": 2368 + }, + { + "epoch": 0.24198161389172626, + "grad_norm": 1.464698914506377, + "learning_rate": 1.773482465557673e-05, + "loss": 0.6947, + "step": 2369 + }, + { + "epoch": 0.24208375893769152, + "grad_norm": 1.4269626820144585, + "learning_rate": 1.773272735255728e-05, + "loss": 0.7747, + "step": 2370 + }, + { + "epoch": 0.2421859039836568, + "grad_norm": 1.494376630773842, + "learning_rate": 1.7730629203185825e-05, + "loss": 0.6532, + "step": 2371 + }, + { + "epoch": 0.24228804902962206, + "grad_norm": 1.6400547181897738, + "learning_rate": 1.772853020769201e-05, + "loss": 0.6818, + "step": 2372 + }, + { + "epoch": 0.24239019407558735, + "grad_norm": 1.6105616815516357, + "learning_rate": 1.7726430366305574e-05, + "loss": 0.6569, + "step": 2373 + }, + { + "epoch": 0.2424923391215526, + "grad_norm": 1.4792272073054589, + "learning_rate": 1.772432967925634e-05, + "loss": 0.8057, + "step": 2374 + }, + { + "epoch": 0.2425944841675179, + "grad_norm": 1.4203686544907335, + "learning_rate": 1.7722228146774233e-05, + "loss": 0.8411, + "step": 2375 + }, + { + "epoch": 0.24269662921348314, + "grad_norm": 1.4812832404025646, + "learning_rate": 1.7720125769089262e-05, + "loss": 0.8277, + "step": 2376 + }, + { + "epoch": 0.24279877425944843, + "grad_norm": 1.6338901346513521, + "learning_rate": 1.771802254643154e-05, + "loss": 0.69, + "step": 2377 + }, + { + "epoch": 0.2429009193054137, + "grad_norm": 1.4664711521019633, + "learning_rate": 1.7715918479031267e-05, + "loss": 0.7233, + "step": 2378 + }, + { + "epoch": 0.24300306435137894, + "grad_norm": 1.4304005434065687, + "learning_rate": 1.7713813567118728e-05, + "loss": 0.7249, + "step": 2379 + }, + { + "epoch": 0.24310520939734423, + "grad_norm": 1.5198919714298205, + "learning_rate": 1.7711707810924313e-05, + "loss": 0.74, + "step": 2380 + }, + { + "epoch": 0.2432073544433095, + "grad_norm": 1.3856765039225727, + "learning_rate": 1.7709601210678493e-05, + "loss": 0.8445, + "step": 2381 + }, + { + "epoch": 0.24330949948927477, + "grad_norm": 1.3648789124432636, + "learning_rate": 1.770749376661184e-05, + "loss": 0.7381, + "step": 2382 + }, + { + "epoch": 0.24341164453524003, + "grad_norm": 1.4066400860429227, + "learning_rate": 1.7705385478955014e-05, + "loss": 0.853, + "step": 2383 + }, + { + "epoch": 0.24351378958120531, + "grad_norm": 1.4449899374944386, + "learning_rate": 1.770327634793877e-05, + "loss": 0.7248, + "step": 2384 + }, + { + "epoch": 0.24361593462717057, + "grad_norm": 1.4233804420031348, + "learning_rate": 1.7701166373793955e-05, + "loss": 0.655, + "step": 2385 + }, + { + "epoch": 0.24371807967313586, + "grad_norm": 1.7629905098275387, + "learning_rate": 1.7699055556751502e-05, + "loss": 0.8323, + "step": 2386 + }, + { + "epoch": 0.24382022471910111, + "grad_norm": 1.438484558975726, + "learning_rate": 1.7696943897042444e-05, + "loss": 0.7486, + "step": 2387 + }, + { + "epoch": 0.2439223697650664, + "grad_norm": 1.4987952575529664, + "learning_rate": 1.7694831394897904e-05, + "loss": 0.7808, + "step": 2388 + }, + { + "epoch": 0.24402451481103166, + "grad_norm": 1.4549700150458358, + "learning_rate": 1.7692718050549097e-05, + "loss": 0.7289, + "step": 2389 + }, + { + "epoch": 0.24412665985699694, + "grad_norm": 1.507981833163515, + "learning_rate": 1.769060386422733e-05, + "loss": 0.8092, + "step": 2390 + }, + { + "epoch": 0.2442288049029622, + "grad_norm": 1.5320845843661672, + "learning_rate": 1.7688488836164e-05, + "loss": 0.6939, + "step": 2391 + }, + { + "epoch": 0.24433094994892748, + "grad_norm": 1.5029724377291633, + "learning_rate": 1.7686372966590598e-05, + "loss": 0.7223, + "step": 2392 + }, + { + "epoch": 0.24443309499489274, + "grad_norm": 1.5426524861163202, + "learning_rate": 1.768425625573871e-05, + "loss": 0.6834, + "step": 2393 + }, + { + "epoch": 0.24453524004085803, + "grad_norm": 1.3584803287585723, + "learning_rate": 1.7682138703840014e-05, + "loss": 0.6999, + "step": 2394 + }, + { + "epoch": 0.24463738508682328, + "grad_norm": 1.450883767963888, + "learning_rate": 1.768002031112627e-05, + "loss": 0.7316, + "step": 2395 + }, + { + "epoch": 0.24473953013278857, + "grad_norm": 1.480473785598229, + "learning_rate": 1.767790107782934e-05, + "loss": 0.7828, + "step": 2396 + }, + { + "epoch": 0.24484167517875383, + "grad_norm": 1.6405185914311855, + "learning_rate": 1.7675781004181177e-05, + "loss": 0.8154, + "step": 2397 + }, + { + "epoch": 0.2449438202247191, + "grad_norm": 1.4920254930923424, + "learning_rate": 1.7673660090413825e-05, + "loss": 0.7102, + "step": 2398 + }, + { + "epoch": 0.24504596527068437, + "grad_norm": 1.3355352090232555, + "learning_rate": 1.7671538336759418e-05, + "loss": 0.7116, + "step": 2399 + }, + { + "epoch": 0.24514811031664965, + "grad_norm": 1.5751864691561008, + "learning_rate": 1.766941574345018e-05, + "loss": 0.82, + "step": 2400 + }, + { + "epoch": 0.2452502553626149, + "grad_norm": 1.3606555163608096, + "learning_rate": 1.766729231071844e-05, + "loss": 0.7304, + "step": 2401 + }, + { + "epoch": 0.2453524004085802, + "grad_norm": 1.395238870403432, + "learning_rate": 1.76651680387966e-05, + "loss": 0.7966, + "step": 2402 + }, + { + "epoch": 0.24545454545454545, + "grad_norm": 1.6160266469726008, + "learning_rate": 1.7663042927917165e-05, + "loss": 0.8005, + "step": 2403 + }, + { + "epoch": 0.24555669050051074, + "grad_norm": 1.363140090600953, + "learning_rate": 1.7660916978312733e-05, + "loss": 0.7167, + "step": 2404 + }, + { + "epoch": 0.245658835546476, + "grad_norm": 1.4651707930250255, + "learning_rate": 1.7658790190215987e-05, + "loss": 0.8045, + "step": 2405 + }, + { + "epoch": 0.24576098059244128, + "grad_norm": 1.5965999833709852, + "learning_rate": 1.7656662563859702e-05, + "loss": 0.7401, + "step": 2406 + }, + { + "epoch": 0.24586312563840654, + "grad_norm": 1.8824709320875674, + "learning_rate": 1.765453409947676e-05, + "loss": 0.8479, + "step": 2407 + }, + { + "epoch": 0.2459652706843718, + "grad_norm": 1.5366865341769178, + "learning_rate": 1.7652404797300114e-05, + "loss": 0.6067, + "step": 2408 + }, + { + "epoch": 0.24606741573033708, + "grad_norm": 1.2967898002305431, + "learning_rate": 1.7650274657562815e-05, + "loss": 0.7205, + "step": 2409 + }, + { + "epoch": 0.24616956077630234, + "grad_norm": 1.471879112081625, + "learning_rate": 1.7648143680498012e-05, + "loss": 0.7807, + "step": 2410 + }, + { + "epoch": 0.24627170582226762, + "grad_norm": 1.480730862721299, + "learning_rate": 1.7646011866338946e-05, + "loss": 0.7316, + "step": 2411 + }, + { + "epoch": 0.24637385086823288, + "grad_norm": 1.5574511585398474, + "learning_rate": 1.7643879215318938e-05, + "loss": 0.6395, + "step": 2412 + }, + { + "epoch": 0.24647599591419816, + "grad_norm": 1.4742350271716305, + "learning_rate": 1.764174572767141e-05, + "loss": 0.7625, + "step": 2413 + }, + { + "epoch": 0.24657814096016342, + "grad_norm": 1.5590627604033827, + "learning_rate": 1.7639611403629878e-05, + "loss": 0.8724, + "step": 2414 + }, + { + "epoch": 0.2466802860061287, + "grad_norm": 1.5810763809914643, + "learning_rate": 1.763747624342794e-05, + "loss": 0.7716, + "step": 2415 + }, + { + "epoch": 0.24678243105209396, + "grad_norm": 1.534494376720642, + "learning_rate": 1.76353402472993e-05, + "loss": 0.8068, + "step": 2416 + }, + { + "epoch": 0.24688457609805925, + "grad_norm": 1.4421118626501486, + "learning_rate": 1.7633203415477725e-05, + "loss": 0.7254, + "step": 2417 + }, + { + "epoch": 0.2469867211440245, + "grad_norm": 1.6607433446557796, + "learning_rate": 1.7631065748197113e-05, + "loss": 0.7762, + "step": 2418 + }, + { + "epoch": 0.2470888661899898, + "grad_norm": 1.4768915718985887, + "learning_rate": 1.762892724569142e-05, + "loss": 0.7165, + "step": 2419 + }, + { + "epoch": 0.24719101123595505, + "grad_norm": 1.2793392497238998, + "learning_rate": 1.7626787908194716e-05, + "loss": 0.7239, + "step": 2420 + }, + { + "epoch": 0.24729315628192033, + "grad_norm": 1.4151451700371307, + "learning_rate": 1.7624647735941143e-05, + "loss": 0.6944, + "step": 2421 + }, + { + "epoch": 0.2473953013278856, + "grad_norm": 1.4532080003554693, + "learning_rate": 1.762250672916495e-05, + "loss": 0.8138, + "step": 2422 + }, + { + "epoch": 0.24749744637385088, + "grad_norm": 1.4760857758709363, + "learning_rate": 1.7620364888100475e-05, + "loss": 0.768, + "step": 2423 + }, + { + "epoch": 0.24759959141981613, + "grad_norm": 1.497074189235693, + "learning_rate": 1.7618222212982135e-05, + "loss": 0.7517, + "step": 2424 + }, + { + "epoch": 0.24770173646578142, + "grad_norm": 1.5447997021489788, + "learning_rate": 1.7616078704044454e-05, + "loss": 0.7279, + "step": 2425 + }, + { + "epoch": 0.24780388151174668, + "grad_norm": 1.505465684130928, + "learning_rate": 1.7613934361522035e-05, + "loss": 0.8398, + "step": 2426 + }, + { + "epoch": 0.24790602655771196, + "grad_norm": 1.3911482459316213, + "learning_rate": 1.7611789185649584e-05, + "loss": 0.7155, + "step": 2427 + }, + { + "epoch": 0.24800817160367722, + "grad_norm": 1.4600345477847263, + "learning_rate": 1.7609643176661887e-05, + "loss": 0.7057, + "step": 2428 + }, + { + "epoch": 0.2481103166496425, + "grad_norm": 1.4654499777225087, + "learning_rate": 1.760749633479383e-05, + "loss": 0.7498, + "step": 2429 + }, + { + "epoch": 0.24821246169560776, + "grad_norm": 1.4845547691744059, + "learning_rate": 1.7605348660280384e-05, + "loss": 0.7896, + "step": 2430 + }, + { + "epoch": 0.24831460674157305, + "grad_norm": 1.3791372973965634, + "learning_rate": 1.760320015335661e-05, + "loss": 0.7521, + "step": 2431 + }, + { + "epoch": 0.2484167517875383, + "grad_norm": 1.4443624014500467, + "learning_rate": 1.760105081425767e-05, + "loss": 0.7074, + "step": 2432 + }, + { + "epoch": 0.2485188968335036, + "grad_norm": 1.5242569887136201, + "learning_rate": 1.7598900643218807e-05, + "loss": 0.8413, + "step": 2433 + }, + { + "epoch": 0.24862104187946885, + "grad_norm": 1.424725355238924, + "learning_rate": 1.7596749640475362e-05, + "loss": 0.7151, + "step": 2434 + }, + { + "epoch": 0.2487231869254341, + "grad_norm": 1.5444500017297091, + "learning_rate": 1.7594597806262755e-05, + "loss": 0.8003, + "step": 2435 + }, + { + "epoch": 0.2488253319713994, + "grad_norm": 1.4427705615013624, + "learning_rate": 1.7592445140816515e-05, + "loss": 0.8018, + "step": 2436 + }, + { + "epoch": 0.24892747701736465, + "grad_norm": 1.5563090770848207, + "learning_rate": 1.759029164437225e-05, + "loss": 0.667, + "step": 2437 + }, + { + "epoch": 0.24902962206332993, + "grad_norm": 1.4311162344212127, + "learning_rate": 1.7588137317165656e-05, + "loss": 0.7, + "step": 2438 + }, + { + "epoch": 0.2491317671092952, + "grad_norm": 1.5559287357976523, + "learning_rate": 1.7585982159432534e-05, + "loss": 0.8859, + "step": 2439 + }, + { + "epoch": 0.24923391215526047, + "grad_norm": 1.4447791248480248, + "learning_rate": 1.758382617140876e-05, + "loss": 0.6832, + "step": 2440 + }, + { + "epoch": 0.24933605720122573, + "grad_norm": 1.5569893188803954, + "learning_rate": 1.7581669353330314e-05, + "loss": 0.7232, + "step": 2441 + }, + { + "epoch": 0.24943820224719102, + "grad_norm": 1.6071322627415818, + "learning_rate": 1.757951170543326e-05, + "loss": 0.7723, + "step": 2442 + }, + { + "epoch": 0.24954034729315627, + "grad_norm": 1.3331138364392787, + "learning_rate": 1.7577353227953748e-05, + "loss": 0.727, + "step": 2443 + }, + { + "epoch": 0.24964249233912156, + "grad_norm": 1.4416978045306965, + "learning_rate": 1.7575193921128037e-05, + "loss": 0.8391, + "step": 2444 + }, + { + "epoch": 0.24974463738508682, + "grad_norm": 1.7060462707902826, + "learning_rate": 1.7573033785192454e-05, + "loss": 0.6732, + "step": 2445 + }, + { + "epoch": 0.2498467824310521, + "grad_norm": 1.4897470731280371, + "learning_rate": 1.757087282038343e-05, + "loss": 0.7137, + "step": 2446 + }, + { + "epoch": 0.24994892747701736, + "grad_norm": 1.388446458758296, + "learning_rate": 1.756871102693748e-05, + "loss": 0.7705, + "step": 2447 + }, + { + "epoch": 0.25005107252298264, + "grad_norm": 1.4888803986269923, + "learning_rate": 1.7566548405091223e-05, + "loss": 0.7483, + "step": 2448 + }, + { + "epoch": 0.2501532175689479, + "grad_norm": 1.6574932603710408, + "learning_rate": 1.756438495508135e-05, + "loss": 0.8423, + "step": 2449 + }, + { + "epoch": 0.25025536261491316, + "grad_norm": 1.6806067033295211, + "learning_rate": 1.7562220677144664e-05, + "loss": 0.6819, + "step": 2450 + }, + { + "epoch": 0.25035750766087844, + "grad_norm": 1.410017732171322, + "learning_rate": 1.7560055571518034e-05, + "loss": 0.7458, + "step": 2451 + }, + { + "epoch": 0.2504596527068437, + "grad_norm": 1.7129430986357244, + "learning_rate": 1.755788963843844e-05, + "loss": 0.7836, + "step": 2452 + }, + { + "epoch": 0.250561797752809, + "grad_norm": 1.5937253828973257, + "learning_rate": 1.755572287814294e-05, + "loss": 0.7841, + "step": 2453 + }, + { + "epoch": 0.25066394279877424, + "grad_norm": 1.5264025692588492, + "learning_rate": 1.755355529086869e-05, + "loss": 0.6556, + "step": 2454 + }, + { + "epoch": 0.2507660878447395, + "grad_norm": 1.4927912298365436, + "learning_rate": 1.7551386876852933e-05, + "loss": 0.708, + "step": 2455 + }, + { + "epoch": 0.2508682328907048, + "grad_norm": 1.4801661756968634, + "learning_rate": 1.7549217636333005e-05, + "loss": 0.7421, + "step": 2456 + }, + { + "epoch": 0.2509703779366701, + "grad_norm": 1.4716682950343924, + "learning_rate": 1.7547047569546328e-05, + "loss": 0.7962, + "step": 2457 + }, + { + "epoch": 0.2510725229826353, + "grad_norm": 1.531938351706911, + "learning_rate": 1.7544876676730417e-05, + "loss": 0.7895, + "step": 2458 + }, + { + "epoch": 0.2511746680286006, + "grad_norm": 1.3902575555486032, + "learning_rate": 1.7542704958122882e-05, + "loss": 0.6955, + "step": 2459 + }, + { + "epoch": 0.2512768130745659, + "grad_norm": 1.5196329047156716, + "learning_rate": 1.7540532413961413e-05, + "loss": 0.7982, + "step": 2460 + }, + { + "epoch": 0.2513789581205312, + "grad_norm": 1.541687848125844, + "learning_rate": 1.75383590444838e-05, + "loss": 0.7273, + "step": 2461 + }, + { + "epoch": 0.2514811031664964, + "grad_norm": 1.4611238266361886, + "learning_rate": 1.7536184849927922e-05, + "loss": 0.6934, + "step": 2462 + }, + { + "epoch": 0.2515832482124617, + "grad_norm": 1.3373169727863152, + "learning_rate": 1.7534009830531742e-05, + "loss": 0.7496, + "step": 2463 + }, + { + "epoch": 0.251685393258427, + "grad_norm": 1.6330527183887573, + "learning_rate": 1.7531833986533318e-05, + "loss": 0.9134, + "step": 2464 + }, + { + "epoch": 0.2517875383043922, + "grad_norm": 1.3707177336426284, + "learning_rate": 1.7529657318170798e-05, + "loss": 0.7355, + "step": 2465 + }, + { + "epoch": 0.2518896833503575, + "grad_norm": 1.6421972849918223, + "learning_rate": 1.752747982568242e-05, + "loss": 0.7167, + "step": 2466 + }, + { + "epoch": 0.2519918283963228, + "grad_norm": 1.5093522175105039, + "learning_rate": 1.752530150930651e-05, + "loss": 0.8964, + "step": 2467 + }, + { + "epoch": 0.25209397344228807, + "grad_norm": 1.5119352181772094, + "learning_rate": 1.7523122369281488e-05, + "loss": 0.7688, + "step": 2468 + }, + { + "epoch": 0.2521961184882533, + "grad_norm": 1.1717346113327243, + "learning_rate": 1.7520942405845864e-05, + "loss": 0.5172, + "step": 2469 + }, + { + "epoch": 0.2522982635342186, + "grad_norm": 1.4396644136059134, + "learning_rate": 1.7518761619238234e-05, + "loss": 0.6575, + "step": 2470 + }, + { + "epoch": 0.25240040858018387, + "grad_norm": 1.4496678115893424, + "learning_rate": 1.7516580009697287e-05, + "loss": 0.8468, + "step": 2471 + }, + { + "epoch": 0.25250255362614915, + "grad_norm": 1.5165708480545144, + "learning_rate": 1.7514397577461803e-05, + "loss": 0.6896, + "step": 2472 + }, + { + "epoch": 0.2526046986721144, + "grad_norm": 1.5602780340089109, + "learning_rate": 1.7512214322770646e-05, + "loss": 0.7057, + "step": 2473 + }, + { + "epoch": 0.25270684371807967, + "grad_norm": 1.5192559223739266, + "learning_rate": 1.751003024586278e-05, + "loss": 0.8266, + "step": 2474 + }, + { + "epoch": 0.25280898876404495, + "grad_norm": 1.4647564757868894, + "learning_rate": 1.750784534697725e-05, + "loss": 0.7663, + "step": 2475 + }, + { + "epoch": 0.25291113381001024, + "grad_norm": 1.3355296825148013, + "learning_rate": 1.75056596263532e-05, + "loss": 0.7194, + "step": 2476 + }, + { + "epoch": 0.25301327885597547, + "grad_norm": 1.6077019398415318, + "learning_rate": 1.7503473084229846e-05, + "loss": 0.8057, + "step": 2477 + }, + { + "epoch": 0.25311542390194075, + "grad_norm": 1.33279044991065, + "learning_rate": 1.7501285720846523e-05, + "loss": 0.7312, + "step": 2478 + }, + { + "epoch": 0.25321756894790604, + "grad_norm": 1.6253531646833466, + "learning_rate": 1.749909753644263e-05, + "loss": 0.8413, + "step": 2479 + }, + { + "epoch": 0.2533197139938713, + "grad_norm": 1.5339948129886503, + "learning_rate": 1.7496908531257666e-05, + "loss": 0.805, + "step": 2480 + }, + { + "epoch": 0.25342185903983655, + "grad_norm": 4.492418736885847, + "learning_rate": 1.749471870553122e-05, + "loss": 0.7006, + "step": 2481 + }, + { + "epoch": 0.25352400408580184, + "grad_norm": 1.5480504229660053, + "learning_rate": 1.7492528059502966e-05, + "loss": 0.7453, + "step": 2482 + }, + { + "epoch": 0.2536261491317671, + "grad_norm": 1.4121637706640053, + "learning_rate": 1.749033659341268e-05, + "loss": 0.7526, + "step": 2483 + }, + { + "epoch": 0.2537282941777324, + "grad_norm": 1.460684151437332, + "learning_rate": 1.7488144307500214e-05, + "loss": 0.7504, + "step": 2484 + }, + { + "epoch": 0.25383043922369763, + "grad_norm": 1.566965089252394, + "learning_rate": 1.7485951202005514e-05, + "loss": 0.7228, + "step": 2485 + }, + { + "epoch": 0.2539325842696629, + "grad_norm": 1.4683808569093877, + "learning_rate": 1.7483757277168617e-05, + "loss": 0.9251, + "step": 2486 + }, + { + "epoch": 0.2540347293156282, + "grad_norm": 1.304300866353276, + "learning_rate": 1.748156253322965e-05, + "loss": 0.6368, + "step": 2487 + }, + { + "epoch": 0.2541368743615935, + "grad_norm": 1.3569700616182394, + "learning_rate": 1.7479366970428833e-05, + "loss": 0.6261, + "step": 2488 + }, + { + "epoch": 0.2542390194075587, + "grad_norm": 1.4100159812192496, + "learning_rate": 1.7477170589006468e-05, + "loss": 0.736, + "step": 2489 + }, + { + "epoch": 0.254341164453524, + "grad_norm": 1.453620145974583, + "learning_rate": 1.7474973389202953e-05, + "loss": 0.6989, + "step": 2490 + }, + { + "epoch": 0.2544433094994893, + "grad_norm": 1.4765189719442993, + "learning_rate": 1.747277537125877e-05, + "loss": 0.6685, + "step": 2491 + }, + { + "epoch": 0.2545454545454545, + "grad_norm": 1.7161609927060864, + "learning_rate": 1.7470576535414492e-05, + "loss": 0.7923, + "step": 2492 + }, + { + "epoch": 0.2546475995914198, + "grad_norm": 1.62423539457153, + "learning_rate": 1.746837688191079e-05, + "loss": 0.8328, + "step": 2493 + }, + { + "epoch": 0.2547497446373851, + "grad_norm": 1.5176620634628604, + "learning_rate": 1.746617641098841e-05, + "loss": 0.6953, + "step": 2494 + }, + { + "epoch": 0.2548518896833504, + "grad_norm": 1.4704562717589216, + "learning_rate": 1.74639751228882e-05, + "loss": 0.6608, + "step": 2495 + }, + { + "epoch": 0.2549540347293156, + "grad_norm": 1.5147407390165644, + "learning_rate": 1.746177301785109e-05, + "loss": 0.8198, + "step": 2496 + }, + { + "epoch": 0.2550561797752809, + "grad_norm": 1.5104917487397456, + "learning_rate": 1.74595700961181e-05, + "loss": 0.6858, + "step": 2497 + }, + { + "epoch": 0.2551583248212462, + "grad_norm": 1.4198018408635444, + "learning_rate": 1.7457366357930343e-05, + "loss": 0.8058, + "step": 2498 + }, + { + "epoch": 0.25526046986721146, + "grad_norm": 1.8597769830263262, + "learning_rate": 1.7455161803529025e-05, + "loss": 0.6807, + "step": 2499 + }, + { + "epoch": 0.2553626149131767, + "grad_norm": 1.6423930319217643, + "learning_rate": 1.7452956433155427e-05, + "loss": 0.7743, + "step": 2500 + }, + { + "epoch": 0.255464759959142, + "grad_norm": 1.4570993699965709, + "learning_rate": 1.745075024705093e-05, + "loss": 0.8229, + "step": 2501 + }, + { + "epoch": 0.25556690500510726, + "grad_norm": 1.359563072389055, + "learning_rate": 1.744854324545701e-05, + "loss": 0.6327, + "step": 2502 + }, + { + "epoch": 0.25566905005107254, + "grad_norm": 1.5241561813698241, + "learning_rate": 1.7446335428615217e-05, + "loss": 0.686, + "step": 2503 + }, + { + "epoch": 0.2557711950970378, + "grad_norm": 1.6263860495979683, + "learning_rate": 1.74441267967672e-05, + "loss": 0.7349, + "step": 2504 + }, + { + "epoch": 0.25587334014300306, + "grad_norm": 1.463234343645937, + "learning_rate": 1.74419173501547e-05, + "loss": 0.6576, + "step": 2505 + }, + { + "epoch": 0.25597548518896834, + "grad_norm": 1.3907928379952301, + "learning_rate": 1.743970708901953e-05, + "loss": 0.7154, + "step": 2506 + }, + { + "epoch": 0.25607763023493363, + "grad_norm": 1.2870484780553566, + "learning_rate": 1.743749601360362e-05, + "loss": 0.7361, + "step": 2507 + }, + { + "epoch": 0.25617977528089886, + "grad_norm": 1.6068122594285188, + "learning_rate": 1.743528412414896e-05, + "loss": 0.8294, + "step": 2508 + }, + { + "epoch": 0.25628192032686414, + "grad_norm": 1.469824547644569, + "learning_rate": 1.7433071420897653e-05, + "loss": 0.7936, + "step": 2509 + }, + { + "epoch": 0.25638406537282943, + "grad_norm": 1.4497261693289538, + "learning_rate": 1.7430857904091873e-05, + "loss": 0.7387, + "step": 2510 + }, + { + "epoch": 0.2564862104187947, + "grad_norm": 1.6397995786431518, + "learning_rate": 1.7428643573973895e-05, + "loss": 0.6616, + "step": 2511 + }, + { + "epoch": 0.25658835546475994, + "grad_norm": 1.4034425486798285, + "learning_rate": 1.7426428430786083e-05, + "loss": 0.7603, + "step": 2512 + }, + { + "epoch": 0.25669050051072523, + "grad_norm": 1.6043854670748998, + "learning_rate": 1.7424212474770875e-05, + "loss": 0.7474, + "step": 2513 + }, + { + "epoch": 0.2567926455566905, + "grad_norm": 1.6530455811890246, + "learning_rate": 1.742199570617082e-05, + "loss": 0.8297, + "step": 2514 + }, + { + "epoch": 0.2568947906026558, + "grad_norm": 1.4434252528521987, + "learning_rate": 1.7419778125228538e-05, + "loss": 0.6857, + "step": 2515 + }, + { + "epoch": 0.256996935648621, + "grad_norm": 1.5153853694350543, + "learning_rate": 1.7417559732186747e-05, + "loss": 0.872, + "step": 2516 + }, + { + "epoch": 0.2570990806945863, + "grad_norm": 1.569630800106155, + "learning_rate": 1.7415340527288247e-05, + "loss": 0.9354, + "step": 2517 + }, + { + "epoch": 0.2572012257405516, + "grad_norm": 1.5462828477202166, + "learning_rate": 1.741312051077594e-05, + "loss": 0.8827, + "step": 2518 + }, + { + "epoch": 0.2573033707865168, + "grad_norm": 1.6126111620149348, + "learning_rate": 1.7410899682892802e-05, + "loss": 0.8399, + "step": 2519 + }, + { + "epoch": 0.2574055158324821, + "grad_norm": 1.420590093068615, + "learning_rate": 1.7408678043881905e-05, + "loss": 0.6294, + "step": 2520 + }, + { + "epoch": 0.2575076608784474, + "grad_norm": 1.5996841334286362, + "learning_rate": 1.740645559398641e-05, + "loss": 0.7573, + "step": 2521 + }, + { + "epoch": 0.2576098059244127, + "grad_norm": 1.4190033206039538, + "learning_rate": 1.7404232333449566e-05, + "loss": 0.7002, + "step": 2522 + }, + { + "epoch": 0.2577119509703779, + "grad_norm": 1.6258667617437892, + "learning_rate": 1.7402008262514706e-05, + "loss": 0.8791, + "step": 2523 + }, + { + "epoch": 0.2578140960163432, + "grad_norm": 1.3590453456515896, + "learning_rate": 1.7399783381425264e-05, + "loss": 0.6989, + "step": 2524 + }, + { + "epoch": 0.2579162410623085, + "grad_norm": 1.4830960215986995, + "learning_rate": 1.7397557690424748e-05, + "loss": 0.706, + "step": 2525 + }, + { + "epoch": 0.25801838610827377, + "grad_norm": 1.5192457907690529, + "learning_rate": 1.7395331189756763e-05, + "loss": 0.7171, + "step": 2526 + }, + { + "epoch": 0.258120531154239, + "grad_norm": 1.3622636626355842, + "learning_rate": 1.7393103879665e-05, + "loss": 0.7405, + "step": 2527 + }, + { + "epoch": 0.2582226762002043, + "grad_norm": 1.3863364065659285, + "learning_rate": 1.7390875760393245e-05, + "loss": 0.7437, + "step": 2528 + }, + { + "epoch": 0.25832482124616957, + "grad_norm": 1.6038163906735787, + "learning_rate": 1.7388646832185358e-05, + "loss": 0.8477, + "step": 2529 + }, + { + "epoch": 0.25842696629213485, + "grad_norm": 1.5744136045514094, + "learning_rate": 1.7386417095285308e-05, + "loss": 0.7987, + "step": 2530 + }, + { + "epoch": 0.2585291113381001, + "grad_norm": 1.55256592535351, + "learning_rate": 1.7384186549937124e-05, + "loss": 0.7301, + "step": 2531 + }, + { + "epoch": 0.25863125638406537, + "grad_norm": 1.4174166781686748, + "learning_rate": 1.7381955196384962e-05, + "loss": 0.7502, + "step": 2532 + }, + { + "epoch": 0.25873340143003065, + "grad_norm": 1.469171656177889, + "learning_rate": 1.737972303487303e-05, + "loss": 0.7369, + "step": 2533 + }, + { + "epoch": 0.25883554647599594, + "grad_norm": 1.6808143844062435, + "learning_rate": 1.7377490065645643e-05, + "loss": 0.8497, + "step": 2534 + }, + { + "epoch": 0.25893769152196117, + "grad_norm": 1.437318569631455, + "learning_rate": 1.7375256288947203e-05, + "loss": 0.7336, + "step": 2535 + }, + { + "epoch": 0.25903983656792645, + "grad_norm": 1.4314490253269458, + "learning_rate": 1.7373021705022197e-05, + "loss": 0.6936, + "step": 2536 + }, + { + "epoch": 0.25914198161389174, + "grad_norm": 1.4687293827111565, + "learning_rate": 1.73707863141152e-05, + "loss": 0.6611, + "step": 2537 + }, + { + "epoch": 0.259244126659857, + "grad_norm": 1.4261926985299205, + "learning_rate": 1.736855011647088e-05, + "loss": 0.7058, + "step": 2538 + }, + { + "epoch": 0.25934627170582225, + "grad_norm": 1.3145077188043062, + "learning_rate": 1.7366313112333993e-05, + "loss": 0.6085, + "step": 2539 + }, + { + "epoch": 0.25944841675178754, + "grad_norm": 1.5570003248760094, + "learning_rate": 1.7364075301949374e-05, + "loss": 0.7837, + "step": 2540 + }, + { + "epoch": 0.2595505617977528, + "grad_norm": 1.4893310902576453, + "learning_rate": 1.7361836685561954e-05, + "loss": 0.7056, + "step": 2541 + }, + { + "epoch": 0.2596527068437181, + "grad_norm": 1.5764267476701108, + "learning_rate": 1.735959726341675e-05, + "loss": 0.7784, + "step": 2542 + }, + { + "epoch": 0.25975485188968334, + "grad_norm": 1.4574646136279008, + "learning_rate": 1.7357357035758875e-05, + "loss": 0.6259, + "step": 2543 + }, + { + "epoch": 0.2598569969356486, + "grad_norm": 1.4553885613493278, + "learning_rate": 1.735511600283352e-05, + "loss": 0.7996, + "step": 2544 + }, + { + "epoch": 0.2599591419816139, + "grad_norm": 1.3823775467417545, + "learning_rate": 1.7352874164885964e-05, + "loss": 0.752, + "step": 2545 + }, + { + "epoch": 0.26006128702757914, + "grad_norm": 1.5586763941157193, + "learning_rate": 1.735063152216158e-05, + "loss": 0.7232, + "step": 2546 + }, + { + "epoch": 0.2601634320735444, + "grad_norm": 1.4248645634957147, + "learning_rate": 1.734838807490583e-05, + "loss": 0.7109, + "step": 2547 + }, + { + "epoch": 0.2602655771195097, + "grad_norm": 1.3727171900547974, + "learning_rate": 1.7346143823364252e-05, + "loss": 0.7235, + "step": 2548 + }, + { + "epoch": 0.260367722165475, + "grad_norm": 1.5174154516143281, + "learning_rate": 1.734389876778249e-05, + "loss": 0.7048, + "step": 2549 + }, + { + "epoch": 0.2604698672114402, + "grad_norm": 1.512344286849908, + "learning_rate": 1.734165290840626e-05, + "loss": 0.7521, + "step": 2550 + }, + { + "epoch": 0.2605720122574055, + "grad_norm": 1.372586099134527, + "learning_rate": 1.7339406245481378e-05, + "loss": 0.7023, + "step": 2551 + }, + { + "epoch": 0.2606741573033708, + "grad_norm": 1.4599542776826786, + "learning_rate": 1.7337158779253743e-05, + "loss": 0.7809, + "step": 2552 + }, + { + "epoch": 0.2607763023493361, + "grad_norm": 1.3738173405216818, + "learning_rate": 1.7334910509969335e-05, + "loss": 0.7816, + "step": 2553 + }, + { + "epoch": 0.2608784473953013, + "grad_norm": 1.4101160596615534, + "learning_rate": 1.7332661437874235e-05, + "loss": 0.7248, + "step": 2554 + }, + { + "epoch": 0.2609805924412666, + "grad_norm": 1.5954432771459077, + "learning_rate": 1.73304115632146e-05, + "loss": 0.8169, + "step": 2555 + }, + { + "epoch": 0.2610827374872319, + "grad_norm": 1.433861268340218, + "learning_rate": 1.732816088623669e-05, + "loss": 0.7619, + "step": 2556 + }, + { + "epoch": 0.26118488253319716, + "grad_norm": 1.4602216140665614, + "learning_rate": 1.732590940718683e-05, + "loss": 0.7005, + "step": 2557 + }, + { + "epoch": 0.2612870275791624, + "grad_norm": 1.4914068586334843, + "learning_rate": 1.7323657126311454e-05, + "loss": 0.7101, + "step": 2558 + }, + { + "epoch": 0.2613891726251277, + "grad_norm": 1.5912432249550088, + "learning_rate": 1.7321404043857076e-05, + "loss": 0.7397, + "step": 2559 + }, + { + "epoch": 0.26149131767109296, + "grad_norm": 1.5427259462402998, + "learning_rate": 1.7319150160070292e-05, + "loss": 0.6991, + "step": 2560 + }, + { + "epoch": 0.26159346271705824, + "grad_norm": 1.4267636595591144, + "learning_rate": 1.7316895475197796e-05, + "loss": 0.7526, + "step": 2561 + }, + { + "epoch": 0.2616956077630235, + "grad_norm": 1.5120069113897876, + "learning_rate": 1.7314639989486364e-05, + "loss": 0.6901, + "step": 2562 + }, + { + "epoch": 0.26179775280898876, + "grad_norm": 1.4606811726735793, + "learning_rate": 1.7312383703182857e-05, + "loss": 0.7241, + "step": 2563 + }, + { + "epoch": 0.26189989785495404, + "grad_norm": 1.3575072357885103, + "learning_rate": 1.7310126616534232e-05, + "loss": 0.774, + "step": 2564 + }, + { + "epoch": 0.26200204290091933, + "grad_norm": 1.6002884476183385, + "learning_rate": 1.7307868729787524e-05, + "loss": 0.792, + "step": 2565 + }, + { + "epoch": 0.26210418794688456, + "grad_norm": 1.5225220314153924, + "learning_rate": 1.730561004318986e-05, + "loss": 0.7879, + "step": 2566 + }, + { + "epoch": 0.26220633299284984, + "grad_norm": 1.4257587427082887, + "learning_rate": 1.7303350556988457e-05, + "loss": 0.8273, + "step": 2567 + }, + { + "epoch": 0.26230847803881513, + "grad_norm": 1.4072810140894876, + "learning_rate": 1.7301090271430622e-05, + "loss": 0.8085, + "step": 2568 + }, + { + "epoch": 0.2624106230847804, + "grad_norm": 1.7163092285645056, + "learning_rate": 1.729882918676374e-05, + "loss": 0.869, + "step": 2569 + }, + { + "epoch": 0.26251276813074564, + "grad_norm": 1.6170774137739983, + "learning_rate": 1.729656730323528e-05, + "loss": 0.7057, + "step": 2570 + }, + { + "epoch": 0.26261491317671093, + "grad_norm": 1.4418590387031005, + "learning_rate": 1.729430462109282e-05, + "loss": 0.7984, + "step": 2571 + }, + { + "epoch": 0.2627170582226762, + "grad_norm": 1.4177237403308627, + "learning_rate": 1.7292041140584005e-05, + "loss": 0.6945, + "step": 2572 + }, + { + "epoch": 0.26281920326864144, + "grad_norm": 1.4062452927225306, + "learning_rate": 1.7289776861956576e-05, + "loss": 0.7476, + "step": 2573 + }, + { + "epoch": 0.26292134831460673, + "grad_norm": 1.4449053220202175, + "learning_rate": 1.7287511785458358e-05, + "loss": 0.7807, + "step": 2574 + }, + { + "epoch": 0.263023493360572, + "grad_norm": 1.3600144363567668, + "learning_rate": 1.728524591133727e-05, + "loss": 0.6867, + "step": 2575 + }, + { + "epoch": 0.2631256384065373, + "grad_norm": 1.7884559326480263, + "learning_rate": 1.7282979239841312e-05, + "loss": 0.825, + "step": 2576 + }, + { + "epoch": 0.26322778345250253, + "grad_norm": 1.595011626536254, + "learning_rate": 1.7280711771218564e-05, + "loss": 0.7214, + "step": 2577 + }, + { + "epoch": 0.2633299284984678, + "grad_norm": 1.4815314908237394, + "learning_rate": 1.7278443505717214e-05, + "loss": 0.7101, + "step": 2578 + }, + { + "epoch": 0.2634320735444331, + "grad_norm": 1.548533378429834, + "learning_rate": 1.7276174443585518e-05, + "loss": 0.7109, + "step": 2579 + }, + { + "epoch": 0.2635342185903984, + "grad_norm": 1.5134585592924366, + "learning_rate": 1.727390458507183e-05, + "loss": 0.7409, + "step": 2580 + }, + { + "epoch": 0.2636363636363636, + "grad_norm": 1.5098391169665875, + "learning_rate": 1.7271633930424584e-05, + "loss": 0.6816, + "step": 2581 + }, + { + "epoch": 0.2637385086823289, + "grad_norm": 1.3022167865283196, + "learning_rate": 1.7269362479892304e-05, + "loss": 0.7948, + "step": 2582 + }, + { + "epoch": 0.2638406537282942, + "grad_norm": 1.6514932505623174, + "learning_rate": 1.7267090233723606e-05, + "loss": 0.7503, + "step": 2583 + }, + { + "epoch": 0.26394279877425947, + "grad_norm": 1.4960113748500468, + "learning_rate": 1.7264817192167186e-05, + "loss": 0.7785, + "step": 2584 + }, + { + "epoch": 0.2640449438202247, + "grad_norm": 1.347994350926256, + "learning_rate": 1.7262543355471834e-05, + "loss": 0.7211, + "step": 2585 + }, + { + "epoch": 0.26414708886619, + "grad_norm": 1.3989797706569274, + "learning_rate": 1.7260268723886416e-05, + "loss": 0.5961, + "step": 2586 + }, + { + "epoch": 0.26424923391215527, + "grad_norm": 1.4640591504571363, + "learning_rate": 1.7257993297659897e-05, + "loss": 0.6993, + "step": 2587 + }, + { + "epoch": 0.26435137895812055, + "grad_norm": 1.3710118829241082, + "learning_rate": 1.725571707704132e-05, + "loss": 0.8234, + "step": 2588 + }, + { + "epoch": 0.2644535240040858, + "grad_norm": 1.4887185599174333, + "learning_rate": 1.7253440062279825e-05, + "loss": 0.6585, + "step": 2589 + }, + { + "epoch": 0.26455566905005107, + "grad_norm": 1.4462067989209562, + "learning_rate": 1.7251162253624624e-05, + "loss": 0.6664, + "step": 2590 + }, + { + "epoch": 0.26465781409601635, + "grad_norm": 1.4415258812394312, + "learning_rate": 1.7248883651325033e-05, + "loss": 0.7796, + "step": 2591 + }, + { + "epoch": 0.26475995914198164, + "grad_norm": 1.4975523439984146, + "learning_rate": 1.7246604255630443e-05, + "loss": 0.7454, + "step": 2592 + }, + { + "epoch": 0.26486210418794687, + "grad_norm": 1.405537713584991, + "learning_rate": 1.7244324066790336e-05, + "loss": 0.7534, + "step": 2593 + }, + { + "epoch": 0.26496424923391215, + "grad_norm": 1.3375152933428638, + "learning_rate": 1.7242043085054278e-05, + "loss": 0.6631, + "step": 2594 + }, + { + "epoch": 0.26506639427987744, + "grad_norm": 1.4754694364773744, + "learning_rate": 1.7239761310671923e-05, + "loss": 0.7421, + "step": 2595 + }, + { + "epoch": 0.2651685393258427, + "grad_norm": 1.4324979778724882, + "learning_rate": 1.723747874389302e-05, + "loss": 0.8013, + "step": 2596 + }, + { + "epoch": 0.26527068437180795, + "grad_norm": 1.541976736218398, + "learning_rate": 1.7235195384967388e-05, + "loss": 0.6678, + "step": 2597 + }, + { + "epoch": 0.26537282941777324, + "grad_norm": 1.5804864344753153, + "learning_rate": 1.7232911234144947e-05, + "loss": 0.6474, + "step": 2598 + }, + { + "epoch": 0.2654749744637385, + "grad_norm": 1.647174256251865, + "learning_rate": 1.7230626291675702e-05, + "loss": 0.8284, + "step": 2599 + }, + { + "epoch": 0.26557711950970375, + "grad_norm": 1.6204698327624225, + "learning_rate": 1.7228340557809734e-05, + "loss": 0.8262, + "step": 2600 + }, + { + "epoch": 0.26567926455566904, + "grad_norm": 1.3633740671849883, + "learning_rate": 1.7226054032797223e-05, + "loss": 0.7736, + "step": 2601 + }, + { + "epoch": 0.2657814096016343, + "grad_norm": 1.2677299921300325, + "learning_rate": 1.7223766716888432e-05, + "loss": 0.6868, + "step": 2602 + }, + { + "epoch": 0.2658835546475996, + "grad_norm": 1.4575284242366675, + "learning_rate": 1.7221478610333708e-05, + "loss": 0.7536, + "step": 2603 + }, + { + "epoch": 0.26598569969356484, + "grad_norm": 1.66703457406756, + "learning_rate": 1.7219189713383477e-05, + "loss": 0.8388, + "step": 2604 + }, + { + "epoch": 0.2660878447395301, + "grad_norm": 1.5989751549480957, + "learning_rate": 1.7216900026288272e-05, + "loss": 0.8566, + "step": 2605 + }, + { + "epoch": 0.2661899897854954, + "grad_norm": 1.6842324880039306, + "learning_rate": 1.72146095492987e-05, + "loss": 0.7016, + "step": 2606 + }, + { + "epoch": 0.2662921348314607, + "grad_norm": 1.4566409704351093, + "learning_rate": 1.7212318282665442e-05, + "loss": 0.7062, + "step": 2607 + }, + { + "epoch": 0.2663942798774259, + "grad_norm": 1.429521751121697, + "learning_rate": 1.72100262266393e-05, + "loss": 0.7792, + "step": 2608 + }, + { + "epoch": 0.2664964249233912, + "grad_norm": 1.4589532508338408, + "learning_rate": 1.7207733381471122e-05, + "loss": 0.654, + "step": 2609 + }, + { + "epoch": 0.2665985699693565, + "grad_norm": 1.3996357210499535, + "learning_rate": 1.7205439747411867e-05, + "loss": 0.6873, + "step": 2610 + }, + { + "epoch": 0.2667007150153218, + "grad_norm": 1.4891202337744267, + "learning_rate": 1.720314532471258e-05, + "loss": 0.7736, + "step": 2611 + }, + { + "epoch": 0.266802860061287, + "grad_norm": 1.5208262563013906, + "learning_rate": 1.7200850113624384e-05, + "loss": 0.7282, + "step": 2612 + }, + { + "epoch": 0.2669050051072523, + "grad_norm": 1.3052087803141619, + "learning_rate": 1.719855411439849e-05, + "loss": 0.6592, + "step": 2613 + }, + { + "epoch": 0.2670071501532176, + "grad_norm": 1.472917562630221, + "learning_rate": 1.7196257327286195e-05, + "loss": 0.8273, + "step": 2614 + }, + { + "epoch": 0.26710929519918286, + "grad_norm": 1.4479760356290818, + "learning_rate": 1.7193959752538886e-05, + "loss": 0.7423, + "step": 2615 + }, + { + "epoch": 0.2672114402451481, + "grad_norm": 1.2954904787858723, + "learning_rate": 1.719166139040804e-05, + "loss": 0.6636, + "step": 2616 + }, + { + "epoch": 0.2673135852911134, + "grad_norm": 1.539730300261252, + "learning_rate": 1.7189362241145202e-05, + "loss": 0.6679, + "step": 2617 + }, + { + "epoch": 0.26741573033707866, + "grad_norm": 1.5069173701769514, + "learning_rate": 1.7187062305002025e-05, + "loss": 0.857, + "step": 2618 + }, + { + "epoch": 0.26751787538304395, + "grad_norm": 1.495559316813588, + "learning_rate": 1.7184761582230233e-05, + "loss": 0.6719, + "step": 2619 + }, + { + "epoch": 0.2676200204290092, + "grad_norm": 1.5727692731142129, + "learning_rate": 1.7182460073081644e-05, + "loss": 0.7304, + "step": 2620 + }, + { + "epoch": 0.26772216547497446, + "grad_norm": 1.3280373947043984, + "learning_rate": 1.718015777780816e-05, + "loss": 0.6062, + "step": 2621 + }, + { + "epoch": 0.26782431052093975, + "grad_norm": 1.4141652113642578, + "learning_rate": 1.7177854696661774e-05, + "loss": 0.6872, + "step": 2622 + }, + { + "epoch": 0.26792645556690503, + "grad_norm": 1.6295738361306191, + "learning_rate": 1.7175550829894545e-05, + "loss": 0.6925, + "step": 2623 + }, + { + "epoch": 0.26802860061287026, + "grad_norm": 1.6093269761681401, + "learning_rate": 1.717324617775865e-05, + "loss": 0.8464, + "step": 2624 + }, + { + "epoch": 0.26813074565883555, + "grad_norm": 1.4136816011681543, + "learning_rate": 1.7170940740506318e-05, + "loss": 0.7431, + "step": 2625 + }, + { + "epoch": 0.26823289070480083, + "grad_norm": 1.3353829216752438, + "learning_rate": 1.7168634518389896e-05, + "loss": 0.6679, + "step": 2626 + }, + { + "epoch": 0.2683350357507661, + "grad_norm": 1.3911946856564807, + "learning_rate": 1.7166327511661788e-05, + "loss": 0.7022, + "step": 2627 + }, + { + "epoch": 0.26843718079673135, + "grad_norm": 1.2940846490331492, + "learning_rate": 1.716401972057451e-05, + "loss": 0.6863, + "step": 2628 + }, + { + "epoch": 0.26853932584269663, + "grad_norm": 1.497118255198858, + "learning_rate": 1.716171114538064e-05, + "loss": 0.7402, + "step": 2629 + }, + { + "epoch": 0.2686414708886619, + "grad_norm": 1.5833943878999488, + "learning_rate": 1.7159401786332862e-05, + "loss": 0.7949, + "step": 2630 + }, + { + "epoch": 0.26874361593462714, + "grad_norm": 1.66428270474977, + "learning_rate": 1.7157091643683932e-05, + "loss": 0.7643, + "step": 2631 + }, + { + "epoch": 0.26884576098059243, + "grad_norm": 1.559043388225005, + "learning_rate": 1.7154780717686695e-05, + "loss": 0.8236, + "step": 2632 + }, + { + "epoch": 0.2689479060265577, + "grad_norm": 1.4788791377011152, + "learning_rate": 1.715246900859409e-05, + "loss": 0.8109, + "step": 2633 + }, + { + "epoch": 0.269050051072523, + "grad_norm": 1.5054228764923097, + "learning_rate": 1.7150156516659127e-05, + "loss": 0.6948, + "step": 2634 + }, + { + "epoch": 0.26915219611848823, + "grad_norm": 1.484573646124468, + "learning_rate": 1.7147843242134915e-05, + "loss": 0.6386, + "step": 2635 + }, + { + "epoch": 0.2692543411644535, + "grad_norm": 1.4800926169361108, + "learning_rate": 1.7145529185274644e-05, + "loss": 0.7017, + "step": 2636 + }, + { + "epoch": 0.2693564862104188, + "grad_norm": 1.6286395649407879, + "learning_rate": 1.7143214346331586e-05, + "loss": 0.9433, + "step": 2637 + }, + { + "epoch": 0.2694586312563841, + "grad_norm": 1.7125761880265544, + "learning_rate": 1.71408987255591e-05, + "loss": 0.8322, + "step": 2638 + }, + { + "epoch": 0.2695607763023493, + "grad_norm": 1.573273062007928, + "learning_rate": 1.7138582323210635e-05, + "loss": 0.7032, + "step": 2639 + }, + { + "epoch": 0.2696629213483146, + "grad_norm": 1.4996973945044365, + "learning_rate": 1.7136265139539725e-05, + "loss": 0.7459, + "step": 2640 + }, + { + "epoch": 0.2697650663942799, + "grad_norm": 1.3248726143376297, + "learning_rate": 1.7133947174799984e-05, + "loss": 0.7095, + "step": 2641 + }, + { + "epoch": 0.26986721144024517, + "grad_norm": 1.493091223523191, + "learning_rate": 1.7131628429245117e-05, + "loss": 0.7093, + "step": 2642 + }, + { + "epoch": 0.2699693564862104, + "grad_norm": 1.5553983240600513, + "learning_rate": 1.712930890312891e-05, + "loss": 0.7343, + "step": 2643 + }, + { + "epoch": 0.2700715015321757, + "grad_norm": 1.593448444578975, + "learning_rate": 1.712698859670524e-05, + "loss": 0.8282, + "step": 2644 + }, + { + "epoch": 0.27017364657814097, + "grad_norm": 1.4520453075067568, + "learning_rate": 1.712466751022806e-05, + "loss": 0.7499, + "step": 2645 + }, + { + "epoch": 0.27027579162410625, + "grad_norm": 1.5074051470023935, + "learning_rate": 1.7122345643951418e-05, + "loss": 0.8329, + "step": 2646 + }, + { + "epoch": 0.2703779366700715, + "grad_norm": 1.3920831840632624, + "learning_rate": 1.7120022998129445e-05, + "loss": 0.6624, + "step": 2647 + }, + { + "epoch": 0.27048008171603677, + "grad_norm": 1.5244277965039217, + "learning_rate": 1.7117699573016353e-05, + "loss": 0.8023, + "step": 2648 + }, + { + "epoch": 0.27058222676200205, + "grad_norm": 1.546285309414115, + "learning_rate": 1.7115375368866444e-05, + "loss": 0.7926, + "step": 2649 + }, + { + "epoch": 0.27068437180796734, + "grad_norm": 1.6483171700518657, + "learning_rate": 1.7113050385934107e-05, + "loss": 0.6854, + "step": 2650 + }, + { + "epoch": 0.27078651685393257, + "grad_norm": 1.3923591212877138, + "learning_rate": 1.7110724624473808e-05, + "loss": 0.7254, + "step": 2651 + }, + { + "epoch": 0.27088866189989785, + "grad_norm": 1.4816820257265522, + "learning_rate": 1.71083980847401e-05, + "loss": 0.8438, + "step": 2652 + }, + { + "epoch": 0.27099080694586314, + "grad_norm": 1.4187506873532743, + "learning_rate": 1.7106070766987636e-05, + "loss": 0.6166, + "step": 2653 + }, + { + "epoch": 0.2710929519918284, + "grad_norm": 1.3642969069045667, + "learning_rate": 1.710374267147113e-05, + "loss": 0.8211, + "step": 2654 + }, + { + "epoch": 0.27119509703779365, + "grad_norm": 1.3410571875961252, + "learning_rate": 1.7101413798445404e-05, + "loss": 0.7595, + "step": 2655 + }, + { + "epoch": 0.27129724208375894, + "grad_norm": 1.4583230738658919, + "learning_rate": 1.7099084148165344e-05, + "loss": 0.721, + "step": 2656 + }, + { + "epoch": 0.2713993871297242, + "grad_norm": 1.42047014785119, + "learning_rate": 1.709675372088594e-05, + "loss": 0.7737, + "step": 2657 + }, + { + "epoch": 0.27150153217568945, + "grad_norm": 1.3974156350464202, + "learning_rate": 1.709442251686226e-05, + "loss": 0.7159, + "step": 2658 + }, + { + "epoch": 0.27160367722165474, + "grad_norm": 1.3054891804741153, + "learning_rate": 1.709209053634945e-05, + "loss": 0.6308, + "step": 2659 + }, + { + "epoch": 0.27170582226762, + "grad_norm": 1.420709002055265, + "learning_rate": 1.7089757779602747e-05, + "loss": 0.7175, + "step": 2660 + }, + { + "epoch": 0.2718079673135853, + "grad_norm": 1.5848739371794853, + "learning_rate": 1.7087424246877474e-05, + "loss": 0.7118, + "step": 2661 + }, + { + "epoch": 0.27191011235955054, + "grad_norm": 1.4871111146499125, + "learning_rate": 1.708508993842904e-05, + "loss": 0.8389, + "step": 2662 + }, + { + "epoch": 0.2720122574055158, + "grad_norm": 1.3652945290603586, + "learning_rate": 1.7082754854512932e-05, + "loss": 0.7052, + "step": 2663 + }, + { + "epoch": 0.2721144024514811, + "grad_norm": 1.5253014922218116, + "learning_rate": 1.7080418995384733e-05, + "loss": 0.7414, + "step": 2664 + }, + { + "epoch": 0.2722165474974464, + "grad_norm": 1.4386115755708755, + "learning_rate": 1.70780823613001e-05, + "loss": 0.789, + "step": 2665 + }, + { + "epoch": 0.2723186925434116, + "grad_norm": 1.4781905312151657, + "learning_rate": 1.7075744952514774e-05, + "loss": 0.7083, + "step": 2666 + }, + { + "epoch": 0.2724208375893769, + "grad_norm": 1.417197515595689, + "learning_rate": 1.7073406769284594e-05, + "loss": 0.7992, + "step": 2667 + }, + { + "epoch": 0.2725229826353422, + "grad_norm": 1.6280921591438862, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.7997, + "step": 2668 + }, + { + "epoch": 0.2726251276813075, + "grad_norm": 1.6097314594668686, + "learning_rate": 1.7068728080513417e-05, + "loss": 0.8193, + "step": 2669 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 1.5960715787940696, + "learning_rate": 1.7066387575484502e-05, + "loss": 0.7556, + "step": 2670 + }, + { + "epoch": 0.272829417773238, + "grad_norm": 1.3707855115982637, + "learning_rate": 1.70640462970349e-05, + "loss": 0.7026, + "step": 2671 + }, + { + "epoch": 0.2729315628192033, + "grad_norm": 1.3990838439606879, + "learning_rate": 1.706170424542087e-05, + "loss": 0.6821, + "step": 2672 + }, + { + "epoch": 0.27303370786516856, + "grad_norm": 1.224444433666637, + "learning_rate": 1.7059361420898743e-05, + "loss": 0.6499, + "step": 2673 + }, + { + "epoch": 0.2731358529111338, + "grad_norm": 1.6150672438730833, + "learning_rate": 1.705701782372495e-05, + "loss": 0.8249, + "step": 2674 + }, + { + "epoch": 0.2732379979570991, + "grad_norm": 1.555482420963003, + "learning_rate": 1.7054673454155997e-05, + "loss": 0.7855, + "step": 2675 + }, + { + "epoch": 0.27334014300306436, + "grad_norm": 1.400417303694646, + "learning_rate": 1.7052328312448475e-05, + "loss": 0.7424, + "step": 2676 + }, + { + "epoch": 0.27344228804902965, + "grad_norm": 1.4776822238448912, + "learning_rate": 1.7049982398859065e-05, + "loss": 0.6699, + "step": 2677 + }, + { + "epoch": 0.2735444330949949, + "grad_norm": 1.5096942864103626, + "learning_rate": 1.7047635713644528e-05, + "loss": 0.8032, + "step": 2678 + }, + { + "epoch": 0.27364657814096016, + "grad_norm": 1.4634104240886805, + "learning_rate": 1.7045288257061704e-05, + "loss": 0.704, + "step": 2679 + }, + { + "epoch": 0.27374872318692545, + "grad_norm": 1.5095164623655783, + "learning_rate": 1.7042940029367532e-05, + "loss": 0.8195, + "step": 2680 + }, + { + "epoch": 0.27385086823289073, + "grad_norm": 1.4061515363450419, + "learning_rate": 1.7040591030819022e-05, + "loss": 0.687, + "step": 2681 + }, + { + "epoch": 0.27395301327885596, + "grad_norm": 1.288834386184951, + "learning_rate": 1.703824126167328e-05, + "loss": 0.7711, + "step": 2682 + }, + { + "epoch": 0.27405515832482125, + "grad_norm": 1.4172085059463584, + "learning_rate": 1.7035890722187477e-05, + "loss": 0.7508, + "step": 2683 + }, + { + "epoch": 0.27415730337078653, + "grad_norm": 1.551307937035291, + "learning_rate": 1.7033539412618892e-05, + "loss": 0.7151, + "step": 2684 + }, + { + "epoch": 0.27425944841675176, + "grad_norm": 1.437928189223863, + "learning_rate": 1.7031187333224874e-05, + "loss": 0.7296, + "step": 2685 + }, + { + "epoch": 0.27436159346271705, + "grad_norm": 1.506923697623631, + "learning_rate": 1.702883448426286e-05, + "loss": 0.7073, + "step": 2686 + }, + { + "epoch": 0.27446373850868233, + "grad_norm": 1.3984482978267556, + "learning_rate": 1.702648086599037e-05, + "loss": 0.6967, + "step": 2687 + }, + { + "epoch": 0.2745658835546476, + "grad_norm": 1.7439214007364532, + "learning_rate": 1.702412647866501e-05, + "loss": 0.8031, + "step": 2688 + }, + { + "epoch": 0.27466802860061285, + "grad_norm": 1.4680059828849006, + "learning_rate": 1.7021771322544465e-05, + "loss": 0.7142, + "step": 2689 + }, + { + "epoch": 0.27477017364657813, + "grad_norm": 1.5954742591347104, + "learning_rate": 1.701941539788652e-05, + "loss": 0.8175, + "step": 2690 + }, + { + "epoch": 0.2748723186925434, + "grad_norm": 1.4159407648291686, + "learning_rate": 1.7017058704949017e-05, + "loss": 0.7699, + "step": 2691 + }, + { + "epoch": 0.2749744637385087, + "grad_norm": 1.4131290065180206, + "learning_rate": 1.7014701243989908e-05, + "loss": 0.8113, + "step": 2692 + }, + { + "epoch": 0.27507660878447393, + "grad_norm": 1.3746077997788342, + "learning_rate": 1.7012343015267214e-05, + "loss": 0.6559, + "step": 2693 + }, + { + "epoch": 0.2751787538304392, + "grad_norm": 1.593375505766051, + "learning_rate": 1.700998401903905e-05, + "loss": 0.8184, + "step": 2694 + }, + { + "epoch": 0.2752808988764045, + "grad_norm": 1.3793637290999876, + "learning_rate": 1.70076242555636e-05, + "loss": 0.7029, + "step": 2695 + }, + { + "epoch": 0.2753830439223698, + "grad_norm": 1.295422466292458, + "learning_rate": 1.7005263725099156e-05, + "loss": 0.6995, + "step": 2696 + }, + { + "epoch": 0.275485188968335, + "grad_norm": 1.3863095979657845, + "learning_rate": 1.7002902427904065e-05, + "loss": 0.6945, + "step": 2697 + }, + { + "epoch": 0.2755873340143003, + "grad_norm": 1.438489973693737, + "learning_rate": 1.7000540364236782e-05, + "loss": 0.7397, + "step": 2698 + }, + { + "epoch": 0.2756894790602656, + "grad_norm": 1.653902437420994, + "learning_rate": 1.6998177534355834e-05, + "loss": 0.7013, + "step": 2699 + }, + { + "epoch": 0.27579162410623087, + "grad_norm": 1.5227313030545238, + "learning_rate": 1.699581393851983e-05, + "loss": 0.8071, + "step": 2700 + }, + { + "epoch": 0.2758937691521961, + "grad_norm": 1.5969077189900034, + "learning_rate": 1.6993449576987476e-05, + "loss": 0.7325, + "step": 2701 + }, + { + "epoch": 0.2759959141981614, + "grad_norm": 1.4940562249976852, + "learning_rate": 1.699108445001754e-05, + "loss": 0.7686, + "step": 2702 + }, + { + "epoch": 0.27609805924412667, + "grad_norm": 1.3970824598559681, + "learning_rate": 1.69887185578689e-05, + "loss": 0.7098, + "step": 2703 + }, + { + "epoch": 0.27620020429009196, + "grad_norm": 1.5457069519026894, + "learning_rate": 1.6986351900800495e-05, + "loss": 0.7898, + "step": 2704 + }, + { + "epoch": 0.2763023493360572, + "grad_norm": 1.4859900593318567, + "learning_rate": 1.6983984479071364e-05, + "loss": 0.6659, + "step": 2705 + }, + { + "epoch": 0.27640449438202247, + "grad_norm": 1.4855798408977536, + "learning_rate": 1.6981616292940616e-05, + "loss": 0.7984, + "step": 2706 + }, + { + "epoch": 0.27650663942798775, + "grad_norm": 1.469142541559895, + "learning_rate": 1.6979247342667457e-05, + "loss": 0.8047, + "step": 2707 + }, + { + "epoch": 0.27660878447395304, + "grad_norm": 1.58748545612327, + "learning_rate": 1.6976877628511163e-05, + "loss": 0.7323, + "step": 2708 + }, + { + "epoch": 0.27671092951991827, + "grad_norm": 1.4804273531573104, + "learning_rate": 1.697450715073111e-05, + "loss": 0.7613, + "step": 2709 + }, + { + "epoch": 0.27681307456588355, + "grad_norm": 1.4239506375111246, + "learning_rate": 1.6972135909586742e-05, + "loss": 0.6764, + "step": 2710 + }, + { + "epoch": 0.27691521961184884, + "grad_norm": 1.5947370392060487, + "learning_rate": 1.696976390533759e-05, + "loss": 0.7802, + "step": 2711 + }, + { + "epoch": 0.27701736465781407, + "grad_norm": 1.3708948045360412, + "learning_rate": 1.696739113824328e-05, + "loss": 0.7691, + "step": 2712 + }, + { + "epoch": 0.27711950970377935, + "grad_norm": 1.4575766926668987, + "learning_rate": 1.6965017608563507e-05, + "loss": 0.7276, + "step": 2713 + }, + { + "epoch": 0.27722165474974464, + "grad_norm": 1.4488488809750262, + "learning_rate": 1.696264331655806e-05, + "loss": 0.8104, + "step": 2714 + }, + { + "epoch": 0.2773237997957099, + "grad_norm": 1.3600543908031333, + "learning_rate": 1.6960268262486796e-05, + "loss": 0.7655, + "step": 2715 + }, + { + "epoch": 0.27742594484167515, + "grad_norm": 1.4940221924143973, + "learning_rate": 1.6957892446609682e-05, + "loss": 0.7464, + "step": 2716 + }, + { + "epoch": 0.27752808988764044, + "grad_norm": 1.540959286939293, + "learning_rate": 1.695551586918674e-05, + "loss": 0.8456, + "step": 2717 + }, + { + "epoch": 0.2776302349336057, + "grad_norm": 1.4490763101433775, + "learning_rate": 1.6953138530478093e-05, + "loss": 0.6973, + "step": 2718 + }, + { + "epoch": 0.277732379979571, + "grad_norm": 1.483751867549589, + "learning_rate": 1.695076043074394e-05, + "loss": 0.8316, + "step": 2719 + }, + { + "epoch": 0.27783452502553624, + "grad_norm": 1.4781664239902208, + "learning_rate": 1.694838157024457e-05, + "loss": 0.7753, + "step": 2720 + }, + { + "epoch": 0.2779366700715015, + "grad_norm": 1.717470494869726, + "learning_rate": 1.6946001949240347e-05, + "loss": 0.7369, + "step": 2721 + }, + { + "epoch": 0.2780388151174668, + "grad_norm": 1.5457589847756297, + "learning_rate": 1.694362156799172e-05, + "loss": 0.7292, + "step": 2722 + }, + { + "epoch": 0.2781409601634321, + "grad_norm": 1.4433376967960552, + "learning_rate": 1.694124042675923e-05, + "loss": 0.6502, + "step": 2723 + }, + { + "epoch": 0.2782431052093973, + "grad_norm": 1.5798180280547671, + "learning_rate": 1.6938858525803488e-05, + "loss": 0.734, + "step": 2724 + }, + { + "epoch": 0.2783452502553626, + "grad_norm": 1.597946201710472, + "learning_rate": 1.69364758653852e-05, + "loss": 0.8071, + "step": 2725 + }, + { + "epoch": 0.2784473953013279, + "grad_norm": 1.4802526806978218, + "learning_rate": 1.6934092445765145e-05, + "loss": 0.6918, + "step": 2726 + }, + { + "epoch": 0.2785495403472932, + "grad_norm": 1.4752357688489037, + "learning_rate": 1.693170826720419e-05, + "loss": 0.835, + "step": 2727 + }, + { + "epoch": 0.2786516853932584, + "grad_norm": 1.514618837048056, + "learning_rate": 1.692932332996329e-05, + "loss": 0.716, + "step": 2728 + }, + { + "epoch": 0.2787538304392237, + "grad_norm": 1.393579411627894, + "learning_rate": 1.6926937634303472e-05, + "loss": 0.7181, + "step": 2729 + }, + { + "epoch": 0.278855975485189, + "grad_norm": 1.4781618717206944, + "learning_rate": 1.6924551180485858e-05, + "loss": 0.7838, + "step": 2730 + }, + { + "epoch": 0.27895812053115426, + "grad_norm": 1.3615241376444494, + "learning_rate": 1.6922163968771644e-05, + "loss": 0.7139, + "step": 2731 + }, + { + "epoch": 0.2790602655771195, + "grad_norm": 1.5784290867441937, + "learning_rate": 1.6919775999422108e-05, + "loss": 0.8609, + "step": 2732 + }, + { + "epoch": 0.2791624106230848, + "grad_norm": 1.530901473593101, + "learning_rate": 1.6917387272698618e-05, + "loss": 0.6067, + "step": 2733 + }, + { + "epoch": 0.27926455566905006, + "grad_norm": 1.8603584015824977, + "learning_rate": 1.6914997788862622e-05, + "loss": 0.7243, + "step": 2734 + }, + { + "epoch": 0.27936670071501535, + "grad_norm": 1.374737236816398, + "learning_rate": 1.691260754817565e-05, + "loss": 0.6409, + "step": 2735 + }, + { + "epoch": 0.2794688457609806, + "grad_norm": 1.4264357872517461, + "learning_rate": 1.691021655089932e-05, + "loss": 0.7245, + "step": 2736 + }, + { + "epoch": 0.27957099080694586, + "grad_norm": 1.6736113763237774, + "learning_rate": 1.690782479729532e-05, + "loss": 0.6807, + "step": 2737 + }, + { + "epoch": 0.27967313585291115, + "grad_norm": 1.3318814019281573, + "learning_rate": 1.6905432287625433e-05, + "loss": 0.6771, + "step": 2738 + }, + { + "epoch": 0.2797752808988764, + "grad_norm": 1.3191784959238555, + "learning_rate": 1.690303902215152e-05, + "loss": 0.7805, + "step": 2739 + }, + { + "epoch": 0.27987742594484166, + "grad_norm": 1.3930927106326545, + "learning_rate": 1.690064500113553e-05, + "loss": 0.6925, + "step": 2740 + }, + { + "epoch": 0.27997957099080695, + "grad_norm": 1.4133192311260205, + "learning_rate": 1.6898250224839485e-05, + "loss": 0.6053, + "step": 2741 + }, + { + "epoch": 0.28008171603677223, + "grad_norm": 1.5948243352157303, + "learning_rate": 1.6895854693525494e-05, + "loss": 0.6746, + "step": 2742 + }, + { + "epoch": 0.28018386108273746, + "grad_norm": 1.470731987805263, + "learning_rate": 1.6893458407455752e-05, + "loss": 0.7222, + "step": 2743 + }, + { + "epoch": 0.28028600612870275, + "grad_norm": 1.5703532000444576, + "learning_rate": 1.6891061366892533e-05, + "loss": 0.7096, + "step": 2744 + }, + { + "epoch": 0.28038815117466803, + "grad_norm": 1.4130981859295657, + "learning_rate": 1.6888663572098198e-05, + "loss": 0.7633, + "step": 2745 + }, + { + "epoch": 0.2804902962206333, + "grad_norm": 1.5002437232087638, + "learning_rate": 1.688626502333518e-05, + "loss": 0.7772, + "step": 2746 + }, + { + "epoch": 0.28059244126659855, + "grad_norm": 1.6962806075028674, + "learning_rate": 1.6883865720866008e-05, + "loss": 0.7236, + "step": 2747 + }, + { + "epoch": 0.28069458631256383, + "grad_norm": 1.4650366414531233, + "learning_rate": 1.6881465664953286e-05, + "loss": 0.7184, + "step": 2748 + }, + { + "epoch": 0.2807967313585291, + "grad_norm": 1.4770889921460757, + "learning_rate": 1.6879064855859702e-05, + "loss": 0.786, + "step": 2749 + }, + { + "epoch": 0.2808988764044944, + "grad_norm": 1.5048470136593244, + "learning_rate": 1.6876663293848024e-05, + "loss": 0.7877, + "step": 2750 + }, + { + "epoch": 0.28100102145045963, + "grad_norm": 1.4932386816512246, + "learning_rate": 1.6874260979181105e-05, + "loss": 0.727, + "step": 2751 + }, + { + "epoch": 0.2811031664964249, + "grad_norm": 1.5782043999091189, + "learning_rate": 1.6871857912121882e-05, + "loss": 0.8501, + "step": 2752 + }, + { + "epoch": 0.2812053115423902, + "grad_norm": 1.4413373233862905, + "learning_rate": 1.6869454092933368e-05, + "loss": 0.8206, + "step": 2753 + }, + { + "epoch": 0.2813074565883555, + "grad_norm": 1.550982880637789, + "learning_rate": 1.686704952187867e-05, + "loss": 0.7871, + "step": 2754 + }, + { + "epoch": 0.2814096016343207, + "grad_norm": 1.4327917308453353, + "learning_rate": 1.686464419922096e-05, + "loss": 0.7159, + "step": 2755 + }, + { + "epoch": 0.281511746680286, + "grad_norm": 1.599050719649285, + "learning_rate": 1.686223812522351e-05, + "loss": 0.8334, + "step": 2756 + }, + { + "epoch": 0.2816138917262513, + "grad_norm": 1.4682349846466691, + "learning_rate": 1.6859831300149664e-05, + "loss": 0.7098, + "step": 2757 + }, + { + "epoch": 0.28171603677221657, + "grad_norm": 1.490262393051143, + "learning_rate": 1.685742372426285e-05, + "loss": 0.752, + "step": 2758 + }, + { + "epoch": 0.2818181818181818, + "grad_norm": 1.3606932546575097, + "learning_rate": 1.685501539782658e-05, + "loss": 0.8104, + "step": 2759 + }, + { + "epoch": 0.2819203268641471, + "grad_norm": 1.5520105288143495, + "learning_rate": 1.6852606321104442e-05, + "loss": 0.6835, + "step": 2760 + }, + { + "epoch": 0.28202247191011237, + "grad_norm": 1.5069558708340467, + "learning_rate": 1.6850196494360116e-05, + "loss": 0.7498, + "step": 2761 + }, + { + "epoch": 0.28212461695607766, + "grad_norm": 1.5041780553464301, + "learning_rate": 1.684778591785736e-05, + "loss": 0.7839, + "step": 2762 + }, + { + "epoch": 0.2822267620020429, + "grad_norm": 1.607590745019505, + "learning_rate": 1.6845374591860005e-05, + "loss": 0.774, + "step": 2763 + }, + { + "epoch": 0.28232890704800817, + "grad_norm": 1.5616933909339754, + "learning_rate": 1.6842962516631985e-05, + "loss": 0.7749, + "step": 2764 + }, + { + "epoch": 0.28243105209397346, + "grad_norm": 1.4241140101972511, + "learning_rate": 1.6840549692437295e-05, + "loss": 0.7377, + "step": 2765 + }, + { + "epoch": 0.2825331971399387, + "grad_norm": 1.612642885127295, + "learning_rate": 1.6838136119540024e-05, + "loss": 0.6413, + "step": 2766 + }, + { + "epoch": 0.28263534218590397, + "grad_norm": 1.491721739930825, + "learning_rate": 1.6835721798204333e-05, + "loss": 0.6679, + "step": 2767 + }, + { + "epoch": 0.28273748723186926, + "grad_norm": 1.5983074541336582, + "learning_rate": 1.6833306728694476e-05, + "loss": 0.8002, + "step": 2768 + }, + { + "epoch": 0.28283963227783454, + "grad_norm": 1.7189214631960457, + "learning_rate": 1.683089091127478e-05, + "loss": 0.7432, + "step": 2769 + }, + { + "epoch": 0.28294177732379977, + "grad_norm": 1.3779697795289045, + "learning_rate": 1.6828474346209662e-05, + "loss": 0.7266, + "step": 2770 + }, + { + "epoch": 0.28304392236976506, + "grad_norm": 1.577932266003647, + "learning_rate": 1.6826057033763618e-05, + "loss": 0.8496, + "step": 2771 + }, + { + "epoch": 0.28314606741573034, + "grad_norm": 1.4520125099088317, + "learning_rate": 1.682363897420122e-05, + "loss": 0.7282, + "step": 2772 + }, + { + "epoch": 0.2832482124616956, + "grad_norm": 1.4909023084109942, + "learning_rate": 1.6821220167787126e-05, + "loss": 0.6602, + "step": 2773 + }, + { + "epoch": 0.28335035750766085, + "grad_norm": 1.4305616272001505, + "learning_rate": 1.6818800614786077e-05, + "loss": 0.7517, + "step": 2774 + }, + { + "epoch": 0.28345250255362614, + "grad_norm": 1.4998628821363957, + "learning_rate": 1.6816380315462895e-05, + "loss": 0.7301, + "step": 2775 + }, + { + "epoch": 0.2835546475995914, + "grad_norm": 1.5605652540760686, + "learning_rate": 1.6813959270082486e-05, + "loss": 0.8421, + "step": 2776 + }, + { + "epoch": 0.2836567926455567, + "grad_norm": 1.4986014312341027, + "learning_rate": 1.681153747890983e-05, + "loss": 0.7782, + "step": 2777 + }, + { + "epoch": 0.28375893769152194, + "grad_norm": 1.5154072467185085, + "learning_rate": 1.6809114942209997e-05, + "loss": 0.7447, + "step": 2778 + }, + { + "epoch": 0.2838610827374872, + "grad_norm": 1.399010552162622, + "learning_rate": 1.6806691660248137e-05, + "loss": 0.694, + "step": 2779 + }, + { + "epoch": 0.2839632277834525, + "grad_norm": 1.6053481361047428, + "learning_rate": 1.6804267633289476e-05, + "loss": 0.5971, + "step": 2780 + }, + { + "epoch": 0.2840653728294178, + "grad_norm": 1.5852817182313759, + "learning_rate": 1.6801842861599326e-05, + "loss": 0.8626, + "step": 2781 + }, + { + "epoch": 0.284167517875383, + "grad_norm": 1.6499951491544196, + "learning_rate": 1.679941734544308e-05, + "loss": 0.6536, + "step": 2782 + }, + { + "epoch": 0.2842696629213483, + "grad_norm": 1.510953994438809, + "learning_rate": 1.6796991085086212e-05, + "loss": 0.7424, + "step": 2783 + }, + { + "epoch": 0.2843718079673136, + "grad_norm": 1.4623172630468966, + "learning_rate": 1.6794564080794277e-05, + "loss": 0.8547, + "step": 2784 + }, + { + "epoch": 0.2844739530132789, + "grad_norm": 1.2099953594852613, + "learning_rate": 1.6792136332832916e-05, + "loss": 0.7187, + "step": 2785 + }, + { + "epoch": 0.2845760980592441, + "grad_norm": 1.3752156860319218, + "learning_rate": 1.6789707841467845e-05, + "loss": 0.6105, + "step": 2786 + }, + { + "epoch": 0.2846782431052094, + "grad_norm": 1.499864856012898, + "learning_rate": 1.678727860696486e-05, + "loss": 0.7716, + "step": 2787 + }, + { + "epoch": 0.2847803881511747, + "grad_norm": 1.5226392424192683, + "learning_rate": 1.6784848629589852e-05, + "loss": 0.7838, + "step": 2788 + }, + { + "epoch": 0.28488253319713996, + "grad_norm": 1.4383104587668107, + "learning_rate": 1.6782417909608777e-05, + "loss": 0.6304, + "step": 2789 + }, + { + "epoch": 0.2849846782431052, + "grad_norm": 1.4503944562449598, + "learning_rate": 1.6779986447287678e-05, + "loss": 0.772, + "step": 2790 + }, + { + "epoch": 0.2850868232890705, + "grad_norm": 1.3926352495480518, + "learning_rate": 1.677755424289268e-05, + "loss": 0.7395, + "step": 2791 + }, + { + "epoch": 0.28518896833503576, + "grad_norm": 1.5801671649376114, + "learning_rate": 1.6775121296689992e-05, + "loss": 0.8078, + "step": 2792 + }, + { + "epoch": 0.28529111338100105, + "grad_norm": 1.4833249963724287, + "learning_rate": 1.6772687608945905e-05, + "loss": 0.7824, + "step": 2793 + }, + { + "epoch": 0.2853932584269663, + "grad_norm": 1.3790783660149224, + "learning_rate": 1.6770253179926782e-05, + "loss": 0.7207, + "step": 2794 + }, + { + "epoch": 0.28549540347293156, + "grad_norm": 1.6019423520873124, + "learning_rate": 1.676781800989907e-05, + "loss": 0.7156, + "step": 2795 + }, + { + "epoch": 0.28559754851889685, + "grad_norm": 1.3761208340859388, + "learning_rate": 1.6765382099129307e-05, + "loss": 0.6948, + "step": 2796 + }, + { + "epoch": 0.2856996935648621, + "grad_norm": 1.6908249161054503, + "learning_rate": 1.6762945447884105e-05, + "loss": 0.9073, + "step": 2797 + }, + { + "epoch": 0.28580183861082736, + "grad_norm": 1.4317232212413435, + "learning_rate": 1.676050805643015e-05, + "loss": 0.6277, + "step": 2798 + }, + { + "epoch": 0.28590398365679265, + "grad_norm": 1.6827361390946232, + "learning_rate": 1.6758069925034222e-05, + "loss": 0.7688, + "step": 2799 + }, + { + "epoch": 0.28600612870275793, + "grad_norm": 1.3534526114191314, + "learning_rate": 1.6755631053963176e-05, + "loss": 0.7132, + "step": 2800 + }, + { + "epoch": 0.28610827374872316, + "grad_norm": 1.4179859355922673, + "learning_rate": 1.6753191443483943e-05, + "loss": 0.627, + "step": 2801 + }, + { + "epoch": 0.28621041879468845, + "grad_norm": 1.4546346739533043, + "learning_rate": 1.6750751093863548e-05, + "loss": 0.7547, + "step": 2802 + }, + { + "epoch": 0.28631256384065373, + "grad_norm": 1.434871454560573, + "learning_rate": 1.6748310005369082e-05, + "loss": 0.7767, + "step": 2803 + }, + { + "epoch": 0.286414708886619, + "grad_norm": 1.5766168604663289, + "learning_rate": 1.6745868178267724e-05, + "loss": 0.7929, + "step": 2804 + }, + { + "epoch": 0.28651685393258425, + "grad_norm": 1.5406590020667785, + "learning_rate": 1.674342561282674e-05, + "loss": 0.7354, + "step": 2805 + }, + { + "epoch": 0.28661899897854953, + "grad_norm": 1.5162257771877374, + "learning_rate": 1.674098230931346e-05, + "loss": 0.848, + "step": 2806 + }, + { + "epoch": 0.2867211440245148, + "grad_norm": 1.616532947570184, + "learning_rate": 1.6738538267995315e-05, + "loss": 0.7164, + "step": 2807 + }, + { + "epoch": 0.2868232890704801, + "grad_norm": 1.2143075154378131, + "learning_rate": 1.67360934891398e-05, + "loss": 0.6611, + "step": 2808 + }, + { + "epoch": 0.28692543411644533, + "grad_norm": 1.514482053352889, + "learning_rate": 1.6733647973014503e-05, + "loss": 0.6677, + "step": 2809 + }, + { + "epoch": 0.2870275791624106, + "grad_norm": 1.4564734791511453, + "learning_rate": 1.6731201719887087e-05, + "loss": 0.7029, + "step": 2810 + }, + { + "epoch": 0.2871297242083759, + "grad_norm": 1.5047961948856832, + "learning_rate": 1.672875473002529e-05, + "loss": 0.7842, + "step": 2811 + }, + { + "epoch": 0.2872318692543412, + "grad_norm": 1.5307217867772727, + "learning_rate": 1.672630700369694e-05, + "loss": 0.7329, + "step": 2812 + }, + { + "epoch": 0.2873340143003064, + "grad_norm": 1.3558646712150189, + "learning_rate": 1.6723858541169946e-05, + "loss": 0.793, + "step": 2813 + }, + { + "epoch": 0.2874361593462717, + "grad_norm": 1.4610971768613235, + "learning_rate": 1.672140934271229e-05, + "loss": 0.748, + "step": 2814 + }, + { + "epoch": 0.287538304392237, + "grad_norm": 1.5049190345467856, + "learning_rate": 1.6718959408592036e-05, + "loss": 0.7263, + "step": 2815 + }, + { + "epoch": 0.2876404494382023, + "grad_norm": 1.419989670804442, + "learning_rate": 1.6716508739077335e-05, + "loss": 0.7889, + "step": 2816 + }, + { + "epoch": 0.2877425944841675, + "grad_norm": 1.3592013552854598, + "learning_rate": 1.6714057334436416e-05, + "loss": 0.6861, + "step": 2817 + }, + { + "epoch": 0.2878447395301328, + "grad_norm": 1.56967823676569, + "learning_rate": 1.671160519493758e-05, + "loss": 0.7581, + "step": 2818 + }, + { + "epoch": 0.28794688457609807, + "grad_norm": 1.5269493584796219, + "learning_rate": 1.670915232084922e-05, + "loss": 0.7908, + "step": 2819 + }, + { + "epoch": 0.28804902962206336, + "grad_norm": 1.5289571934557498, + "learning_rate": 1.6706698712439807e-05, + "loss": 0.7294, + "step": 2820 + }, + { + "epoch": 0.2881511746680286, + "grad_norm": 1.8668822609717257, + "learning_rate": 1.6704244369977885e-05, + "loss": 0.8379, + "step": 2821 + }, + { + "epoch": 0.28825331971399387, + "grad_norm": 1.5497375525912822, + "learning_rate": 1.6701789293732083e-05, + "loss": 0.7145, + "step": 2822 + }, + { + "epoch": 0.28835546475995916, + "grad_norm": 1.4010942243117177, + "learning_rate": 1.669933348397111e-05, + "loss": 0.6806, + "step": 2823 + }, + { + "epoch": 0.2884576098059244, + "grad_norm": 1.4415451416714418, + "learning_rate": 1.6696876940963765e-05, + "loss": 0.6197, + "step": 2824 + }, + { + "epoch": 0.28855975485188967, + "grad_norm": 1.3158274213835415, + "learning_rate": 1.6694419664978912e-05, + "loss": 0.6913, + "step": 2825 + }, + { + "epoch": 0.28866189989785496, + "grad_norm": 1.5253776230452414, + "learning_rate": 1.66919616562855e-05, + "loss": 0.8631, + "step": 2826 + }, + { + "epoch": 0.28876404494382024, + "grad_norm": 1.4430692042349824, + "learning_rate": 1.6689502915152562e-05, + "loss": 0.7671, + "step": 2827 + }, + { + "epoch": 0.28886618998978547, + "grad_norm": 1.6100101822440054, + "learning_rate": 1.6687043441849206e-05, + "loss": 0.7941, + "step": 2828 + }, + { + "epoch": 0.28896833503575076, + "grad_norm": 1.5831921937611138, + "learning_rate": 1.6684583236644627e-05, + "loss": 0.7736, + "step": 2829 + }, + { + "epoch": 0.28907048008171604, + "grad_norm": 1.5337898189615202, + "learning_rate": 1.6682122299808092e-05, + "loss": 0.7236, + "step": 2830 + }, + { + "epoch": 0.2891726251276813, + "grad_norm": 1.48549946348166, + "learning_rate": 1.6679660631608955e-05, + "loss": 0.7448, + "step": 2831 + }, + { + "epoch": 0.28927477017364656, + "grad_norm": 1.5325980198451972, + "learning_rate": 1.6677198232316646e-05, + "loss": 0.8228, + "step": 2832 + }, + { + "epoch": 0.28937691521961184, + "grad_norm": 1.485924272785635, + "learning_rate": 1.6674735102200675e-05, + "loss": 0.673, + "step": 2833 + }, + { + "epoch": 0.2894790602655771, + "grad_norm": 1.3985405167411264, + "learning_rate": 1.667227124153064e-05, + "loss": 0.8041, + "step": 2834 + }, + { + "epoch": 0.2895812053115424, + "grad_norm": 1.4530551162690877, + "learning_rate": 1.66698066505762e-05, + "loss": 0.693, + "step": 2835 + }, + { + "epoch": 0.28968335035750764, + "grad_norm": 1.4663825897174396, + "learning_rate": 1.6667341329607118e-05, + "loss": 0.6589, + "step": 2836 + }, + { + "epoch": 0.2897854954034729, + "grad_norm": 1.600620126936236, + "learning_rate": 1.6664875278893216e-05, + "loss": 0.7195, + "step": 2837 + }, + { + "epoch": 0.2898876404494382, + "grad_norm": 1.4188925545365707, + "learning_rate": 1.666240849870441e-05, + "loss": 0.6847, + "step": 2838 + }, + { + "epoch": 0.2899897854954035, + "grad_norm": 1.6044867972793402, + "learning_rate": 1.665994098931069e-05, + "loss": 0.8431, + "step": 2839 + }, + { + "epoch": 0.2900919305413687, + "grad_norm": 1.4330934703744602, + "learning_rate": 1.6657472750982126e-05, + "loss": 0.765, + "step": 2840 + }, + { + "epoch": 0.290194075587334, + "grad_norm": 1.4179666155909492, + "learning_rate": 1.6655003783988868e-05, + "loss": 0.7772, + "step": 2841 + }, + { + "epoch": 0.2902962206332993, + "grad_norm": 1.504545090179601, + "learning_rate": 1.6652534088601147e-05, + "loss": 0.7939, + "step": 2842 + }, + { + "epoch": 0.2903983656792646, + "grad_norm": 1.4304168775649606, + "learning_rate": 1.6650063665089268e-05, + "loss": 0.7651, + "step": 2843 + }, + { + "epoch": 0.2905005107252298, + "grad_norm": 1.430929012414795, + "learning_rate": 1.6647592513723627e-05, + "loss": 0.6768, + "step": 2844 + }, + { + "epoch": 0.2906026557711951, + "grad_norm": 1.2893322075298452, + "learning_rate": 1.6645120634774692e-05, + "loss": 0.6647, + "step": 2845 + }, + { + "epoch": 0.2907048008171604, + "grad_norm": 1.5628170490246414, + "learning_rate": 1.664264802851301e-05, + "loss": 0.7141, + "step": 2846 + }, + { + "epoch": 0.29080694586312567, + "grad_norm": 1.346970084072366, + "learning_rate": 1.664017469520921e-05, + "loss": 0.6637, + "step": 2847 + }, + { + "epoch": 0.2909090909090909, + "grad_norm": 1.4616828889998055, + "learning_rate": 1.6637700635133996e-05, + "loss": 0.7907, + "step": 2848 + }, + { + "epoch": 0.2910112359550562, + "grad_norm": 1.475142222493101, + "learning_rate": 1.663522584855816e-05, + "loss": 0.7789, + "step": 2849 + }, + { + "epoch": 0.29111338100102147, + "grad_norm": 1.4167016524832552, + "learning_rate": 1.663275033575257e-05, + "loss": 0.652, + "step": 2850 + }, + { + "epoch": 0.2912155260469867, + "grad_norm": 1.4353201818073666, + "learning_rate": 1.6630274096988167e-05, + "loss": 0.7383, + "step": 2851 + }, + { + "epoch": 0.291317671092952, + "grad_norm": 1.5940225832741983, + "learning_rate": 1.6627797132535983e-05, + "loss": 0.779, + "step": 2852 + }, + { + "epoch": 0.29141981613891726, + "grad_norm": 1.5541596991353728, + "learning_rate": 1.662531944266712e-05, + "loss": 0.7335, + "step": 2853 + }, + { + "epoch": 0.29152196118488255, + "grad_norm": 1.5354036339263717, + "learning_rate": 1.6622841027652764e-05, + "loss": 0.8779, + "step": 2854 + }, + { + "epoch": 0.2916241062308478, + "grad_norm": 1.4311257985244856, + "learning_rate": 1.6620361887764178e-05, + "loss": 0.6242, + "step": 2855 + }, + { + "epoch": 0.29172625127681306, + "grad_norm": 1.5221237709693678, + "learning_rate": 1.6617882023272708e-05, + "loss": 0.8523, + "step": 2856 + }, + { + "epoch": 0.29182839632277835, + "grad_norm": 1.5549215781204933, + "learning_rate": 1.6615401434449775e-05, + "loss": 0.7678, + "step": 2857 + }, + { + "epoch": 0.29193054136874363, + "grad_norm": 1.4353851932861221, + "learning_rate": 1.6612920121566876e-05, + "loss": 0.7897, + "step": 2858 + }, + { + "epoch": 0.29203268641470886, + "grad_norm": 1.4366036317612432, + "learning_rate": 1.6610438084895602e-05, + "loss": 0.6792, + "step": 2859 + }, + { + "epoch": 0.29213483146067415, + "grad_norm": 1.3971259359523729, + "learning_rate": 1.6607955324707608e-05, + "loss": 0.7179, + "step": 2860 + }, + { + "epoch": 0.29223697650663943, + "grad_norm": 1.5717274838097084, + "learning_rate": 1.6605471841274634e-05, + "loss": 0.7502, + "step": 2861 + }, + { + "epoch": 0.2923391215526047, + "grad_norm": 1.5162169985543725, + "learning_rate": 1.66029876348685e-05, + "loss": 0.8161, + "step": 2862 + }, + { + "epoch": 0.29244126659856995, + "grad_norm": 1.5004810658208525, + "learning_rate": 1.6600502705761103e-05, + "loss": 0.7393, + "step": 2863 + }, + { + "epoch": 0.29254341164453523, + "grad_norm": 1.3031910464098717, + "learning_rate": 1.659801705422442e-05, + "loss": 0.6812, + "step": 2864 + }, + { + "epoch": 0.2926455566905005, + "grad_norm": 1.3906044798076644, + "learning_rate": 1.6595530680530504e-05, + "loss": 0.7528, + "step": 2865 + }, + { + "epoch": 0.2927477017364658, + "grad_norm": 1.4079983387515254, + "learning_rate": 1.6593043584951497e-05, + "loss": 0.8166, + "step": 2866 + }, + { + "epoch": 0.29284984678243103, + "grad_norm": 1.5311361489857431, + "learning_rate": 1.659055576775961e-05, + "loss": 0.6136, + "step": 2867 + }, + { + "epoch": 0.2929519918283963, + "grad_norm": 1.4206634724934915, + "learning_rate": 1.6588067229227137e-05, + "loss": 0.6952, + "step": 2868 + }, + { + "epoch": 0.2930541368743616, + "grad_norm": 1.6360590197487377, + "learning_rate": 1.658557796962645e-05, + "loss": 0.7068, + "step": 2869 + }, + { + "epoch": 0.2931562819203269, + "grad_norm": 1.3888543748557232, + "learning_rate": 1.6583087989229997e-05, + "loss": 0.7586, + "step": 2870 + }, + { + "epoch": 0.2932584269662921, + "grad_norm": 1.5474050189858701, + "learning_rate": 1.658059728831031e-05, + "loss": 0.7932, + "step": 2871 + }, + { + "epoch": 0.2933605720122574, + "grad_norm": 1.3922608176119244, + "learning_rate": 1.657810586714e-05, + "loss": 0.6973, + "step": 2872 + }, + { + "epoch": 0.2934627170582227, + "grad_norm": 1.4537355441413866, + "learning_rate": 1.6575613725991753e-05, + "loss": 0.8902, + "step": 2873 + }, + { + "epoch": 0.293564862104188, + "grad_norm": 1.574747421874692, + "learning_rate": 1.657312086513833e-05, + "loss": 0.8187, + "step": 2874 + }, + { + "epoch": 0.2936670071501532, + "grad_norm": 1.5186059133226095, + "learning_rate": 1.657062728485259e-05, + "loss": 0.8286, + "step": 2875 + }, + { + "epoch": 0.2937691521961185, + "grad_norm": 1.513854892585937, + "learning_rate": 1.6568132985407444e-05, + "loss": 0.8191, + "step": 2876 + }, + { + "epoch": 0.2938712972420838, + "grad_norm": 1.517462268187125, + "learning_rate": 1.65656379670759e-05, + "loss": 0.7219, + "step": 2877 + }, + { + "epoch": 0.293973442288049, + "grad_norm": 1.3258942406376375, + "learning_rate": 1.656314223013104e-05, + "loss": 0.7145, + "step": 2878 + }, + { + "epoch": 0.2940755873340143, + "grad_norm": 1.3961400543503997, + "learning_rate": 1.656064577484602e-05, + "loss": 0.6767, + "step": 2879 + }, + { + "epoch": 0.2941777323799796, + "grad_norm": 1.465867172098062, + "learning_rate": 1.6558148601494082e-05, + "loss": 0.7762, + "step": 2880 + }, + { + "epoch": 0.29427987742594486, + "grad_norm": 1.330529008426249, + "learning_rate": 1.6555650710348543e-05, + "loss": 0.7321, + "step": 2881 + }, + { + "epoch": 0.2943820224719101, + "grad_norm": 1.419509622184691, + "learning_rate": 1.6553152101682797e-05, + "loss": 0.6949, + "step": 2882 + }, + { + "epoch": 0.2944841675178754, + "grad_norm": 1.4367340656489234, + "learning_rate": 1.655065277577032e-05, + "loss": 0.7875, + "step": 2883 + }, + { + "epoch": 0.29458631256384066, + "grad_norm": 1.3834987931234641, + "learning_rate": 1.6548152732884664e-05, + "loss": 0.8609, + "step": 2884 + }, + { + "epoch": 0.29468845760980594, + "grad_norm": 1.3949974541994266, + "learning_rate": 1.654565197329946e-05, + "loss": 0.7451, + "step": 2885 + }, + { + "epoch": 0.2947906026557712, + "grad_norm": 1.3476796189888747, + "learning_rate": 1.6543150497288426e-05, + "loss": 0.7388, + "step": 2886 + }, + { + "epoch": 0.29489274770173646, + "grad_norm": 1.4456774935562005, + "learning_rate": 1.6540648305125334e-05, + "loss": 0.674, + "step": 2887 + }, + { + "epoch": 0.29499489274770174, + "grad_norm": 1.4494321871032114, + "learning_rate": 1.653814539708406e-05, + "loss": 0.7845, + "step": 2888 + }, + { + "epoch": 0.295097037793667, + "grad_norm": 1.4666687648116359, + "learning_rate": 1.653564177343855e-05, + "loss": 0.7289, + "step": 2889 + }, + { + "epoch": 0.29519918283963226, + "grad_norm": 1.470239545966603, + "learning_rate": 1.6533137434462827e-05, + "loss": 0.7662, + "step": 2890 + }, + { + "epoch": 0.29530132788559754, + "grad_norm": 1.5251565658143804, + "learning_rate": 1.6530632380430992e-05, + "loss": 0.7309, + "step": 2891 + }, + { + "epoch": 0.2954034729315628, + "grad_norm": 1.5111177474206396, + "learning_rate": 1.652812661161722e-05, + "loss": 0.7451, + "step": 2892 + }, + { + "epoch": 0.2955056179775281, + "grad_norm": 1.415504315145862, + "learning_rate": 1.6525620128295772e-05, + "loss": 0.764, + "step": 2893 + }, + { + "epoch": 0.29560776302349334, + "grad_norm": 1.4078136694284162, + "learning_rate": 1.652311293074099e-05, + "loss": 0.8005, + "step": 2894 + }, + { + "epoch": 0.2957099080694586, + "grad_norm": 1.584088411795648, + "learning_rate": 1.6520605019227275e-05, + "loss": 0.631, + "step": 2895 + }, + { + "epoch": 0.2958120531154239, + "grad_norm": 1.5858650341567242, + "learning_rate": 1.6518096394029132e-05, + "loss": 0.7102, + "step": 2896 + }, + { + "epoch": 0.2959141981613892, + "grad_norm": 1.5211052965394134, + "learning_rate": 1.6515587055421128e-05, + "loss": 0.7589, + "step": 2897 + }, + { + "epoch": 0.2960163432073544, + "grad_norm": 1.6303217709368616, + "learning_rate": 1.6513077003677912e-05, + "loss": 0.8261, + "step": 2898 + }, + { + "epoch": 0.2961184882533197, + "grad_norm": 1.414917464994601, + "learning_rate": 1.651056623907421e-05, + "loss": 0.6939, + "step": 2899 + }, + { + "epoch": 0.296220633299285, + "grad_norm": 1.4411507421356449, + "learning_rate": 1.6508054761884828e-05, + "loss": 0.7422, + "step": 2900 + }, + { + "epoch": 0.2963227783452503, + "grad_norm": 1.4247201475106037, + "learning_rate": 1.6505542572384643e-05, + "loss": 0.6344, + "step": 2901 + }, + { + "epoch": 0.2964249233912155, + "grad_norm": 1.5294782380518392, + "learning_rate": 1.6503029670848624e-05, + "loss": 0.7614, + "step": 2902 + }, + { + "epoch": 0.2965270684371808, + "grad_norm": 1.6520815281035013, + "learning_rate": 1.6500516057551802e-05, + "loss": 0.761, + "step": 2903 + }, + { + "epoch": 0.2966292134831461, + "grad_norm": 1.4375623372915718, + "learning_rate": 1.6498001732769303e-05, + "loss": 0.715, + "step": 2904 + }, + { + "epoch": 0.2967313585291113, + "grad_norm": 1.5830050638800264, + "learning_rate": 1.6495486696776312e-05, + "loss": 0.7341, + "step": 2905 + }, + { + "epoch": 0.2968335035750766, + "grad_norm": 1.5135091164641334, + "learning_rate": 1.6492970949848108e-05, + "loss": 0.7902, + "step": 2906 + }, + { + "epoch": 0.2969356486210419, + "grad_norm": 1.3243270245147984, + "learning_rate": 1.6490454492260036e-05, + "loss": 0.6196, + "step": 2907 + }, + { + "epoch": 0.29703779366700717, + "grad_norm": 1.689875193454914, + "learning_rate": 1.648793732428753e-05, + "loss": 0.8381, + "step": 2908 + }, + { + "epoch": 0.2971399387129724, + "grad_norm": 1.4909728774563242, + "learning_rate": 1.648541944620609e-05, + "loss": 0.753, + "step": 2909 + }, + { + "epoch": 0.2972420837589377, + "grad_norm": 1.3620683061481003, + "learning_rate": 1.64829008582913e-05, + "loss": 0.7772, + "step": 2910 + }, + { + "epoch": 0.29734422880490297, + "grad_norm": 1.3653853428049207, + "learning_rate": 1.6480381560818824e-05, + "loss": 0.6945, + "step": 2911 + }, + { + "epoch": 0.29744637385086825, + "grad_norm": 1.3394932694923543, + "learning_rate": 1.6477861554064397e-05, + "loss": 0.6226, + "step": 2912 + }, + { + "epoch": 0.2975485188968335, + "grad_norm": 1.4128449121502766, + "learning_rate": 1.6475340838303843e-05, + "loss": 0.7495, + "step": 2913 + }, + { + "epoch": 0.29765066394279877, + "grad_norm": 1.480706003822705, + "learning_rate": 1.6472819413813045e-05, + "loss": 0.7392, + "step": 2914 + }, + { + "epoch": 0.29775280898876405, + "grad_norm": 1.62549317355222, + "learning_rate": 1.6470297280867983e-05, + "loss": 0.7241, + "step": 2915 + }, + { + "epoch": 0.29785495403472934, + "grad_norm": 1.3821823662257937, + "learning_rate": 1.6467774439744704e-05, + "loss": 0.7084, + "step": 2916 + }, + { + "epoch": 0.29795709908069457, + "grad_norm": 1.531755530094324, + "learning_rate": 1.6465250890719335e-05, + "loss": 0.7546, + "step": 2917 + }, + { + "epoch": 0.29805924412665985, + "grad_norm": 1.5087001997849436, + "learning_rate": 1.6462726634068077e-05, + "loss": 0.7577, + "step": 2918 + }, + { + "epoch": 0.29816138917262514, + "grad_norm": 1.5390813658444604, + "learning_rate": 1.646020167006721e-05, + "loss": 0.6994, + "step": 2919 + }, + { + "epoch": 0.2982635342185904, + "grad_norm": 1.3335420469670134, + "learning_rate": 1.6457675998993102e-05, + "loss": 0.6778, + "step": 2920 + }, + { + "epoch": 0.29836567926455565, + "grad_norm": 1.4470621367305514, + "learning_rate": 1.645514962112218e-05, + "loss": 0.7507, + "step": 2921 + }, + { + "epoch": 0.29846782431052093, + "grad_norm": 1.4432926123560361, + "learning_rate": 1.6452622536730968e-05, + "loss": 0.7649, + "step": 2922 + }, + { + "epoch": 0.2985699693564862, + "grad_norm": 1.4303968036103398, + "learning_rate": 1.6450094746096043e-05, + "loss": 0.6432, + "step": 2923 + }, + { + "epoch": 0.2986721144024515, + "grad_norm": 1.6533297432775904, + "learning_rate": 1.644756624949408e-05, + "loss": 0.7014, + "step": 2924 + }, + { + "epoch": 0.29877425944841673, + "grad_norm": 1.4914067670574191, + "learning_rate": 1.644503704720183e-05, + "loss": 0.7978, + "step": 2925 + }, + { + "epoch": 0.298876404494382, + "grad_norm": 1.5182971302416068, + "learning_rate": 1.644250713949611e-05, + "loss": 0.7905, + "step": 2926 + }, + { + "epoch": 0.2989785495403473, + "grad_norm": 1.5203266543109104, + "learning_rate": 1.643997652665382e-05, + "loss": 0.6518, + "step": 2927 + }, + { + "epoch": 0.2990806945863126, + "grad_norm": 1.3971185682428557, + "learning_rate": 1.643744520895194e-05, + "loss": 0.8061, + "step": 2928 + }, + { + "epoch": 0.2991828396322778, + "grad_norm": 1.487988272554379, + "learning_rate": 1.643491318666752e-05, + "loss": 0.7341, + "step": 2929 + }, + { + "epoch": 0.2992849846782431, + "grad_norm": 1.4657399898270884, + "learning_rate": 1.64323804600777e-05, + "loss": 0.7759, + "step": 2930 + }, + { + "epoch": 0.2993871297242084, + "grad_norm": 1.5206389787821928, + "learning_rate": 1.6429847029459676e-05, + "loss": 0.7856, + "step": 2931 + }, + { + "epoch": 0.2994892747701736, + "grad_norm": 1.515864313254794, + "learning_rate": 1.6427312895090744e-05, + "loss": 0.68, + "step": 2932 + }, + { + "epoch": 0.2995914198161389, + "grad_norm": 1.5513670496790655, + "learning_rate": 1.6424778057248262e-05, + "loss": 0.7494, + "step": 2933 + }, + { + "epoch": 0.2996935648621042, + "grad_norm": 3.664115278414078, + "learning_rate": 1.6422242516209673e-05, + "loss": 0.7683, + "step": 2934 + }, + { + "epoch": 0.2997957099080695, + "grad_norm": 1.4280942068603537, + "learning_rate": 1.641970627225249e-05, + "loss": 0.7742, + "step": 2935 + }, + { + "epoch": 0.2998978549540347, + "grad_norm": 1.5342558870774012, + "learning_rate": 1.6417169325654306e-05, + "loss": 0.617, + "step": 2936 + }, + { + "epoch": 0.3, + "grad_norm": 1.4298730766335166, + "learning_rate": 1.6414631676692794e-05, + "loss": 0.7013, + "step": 2937 + }, + { + "epoch": 0.3001021450459653, + "grad_norm": 1.5346043016438722, + "learning_rate": 1.6412093325645702e-05, + "loss": 0.7234, + "step": 2938 + }, + { + "epoch": 0.30020429009193056, + "grad_norm": 1.4712538220212281, + "learning_rate": 1.640955427279085e-05, + "loss": 0.652, + "step": 2939 + }, + { + "epoch": 0.3003064351378958, + "grad_norm": 1.5475529667094963, + "learning_rate": 1.6407014518406145e-05, + "loss": 0.6731, + "step": 2940 + }, + { + "epoch": 0.3004085801838611, + "grad_norm": 1.5750543388915255, + "learning_rate": 1.6404474062769557e-05, + "loss": 0.7743, + "step": 2941 + }, + { + "epoch": 0.30051072522982636, + "grad_norm": 1.5187609471980226, + "learning_rate": 1.640193290615915e-05, + "loss": 0.8586, + "step": 2942 + }, + { + "epoch": 0.30061287027579164, + "grad_norm": 1.3686523624579994, + "learning_rate": 1.639939104885305e-05, + "loss": 0.7381, + "step": 2943 + }, + { + "epoch": 0.3007150153217569, + "grad_norm": 1.5309711593386164, + "learning_rate": 1.6396848491129462e-05, + "loss": 0.8211, + "step": 2944 + }, + { + "epoch": 0.30081716036772216, + "grad_norm": 1.4832227316100668, + "learning_rate": 1.6394305233266674e-05, + "loss": 0.8156, + "step": 2945 + }, + { + "epoch": 0.30091930541368744, + "grad_norm": 1.4688256340602475, + "learning_rate": 1.639176127554305e-05, + "loss": 0.7078, + "step": 2946 + }, + { + "epoch": 0.30102145045965273, + "grad_norm": 1.610395113232806, + "learning_rate": 1.638921661823702e-05, + "loss": 0.7888, + "step": 2947 + }, + { + "epoch": 0.30112359550561796, + "grad_norm": 1.3610819588055967, + "learning_rate": 1.638667126162711e-05, + "loss": 0.7219, + "step": 2948 + }, + { + "epoch": 0.30122574055158324, + "grad_norm": 1.4864985365765986, + "learning_rate": 1.63841252059919e-05, + "loss": 0.6854, + "step": 2949 + }, + { + "epoch": 0.30132788559754853, + "grad_norm": 1.4277915296352843, + "learning_rate": 1.6381578451610062e-05, + "loss": 0.7831, + "step": 2950 + }, + { + "epoch": 0.3014300306435138, + "grad_norm": 1.526164367582635, + "learning_rate": 1.637903099876034e-05, + "loss": 0.7593, + "step": 2951 + }, + { + "epoch": 0.30153217568947904, + "grad_norm": 2.09754533939028, + "learning_rate": 1.6376482847721553e-05, + "loss": 0.6572, + "step": 2952 + }, + { + "epoch": 0.30163432073544433, + "grad_norm": 1.4758808431190351, + "learning_rate": 1.6373933998772597e-05, + "loss": 0.6381, + "step": 2953 + }, + { + "epoch": 0.3017364657814096, + "grad_norm": 1.4599549892955195, + "learning_rate": 1.637138445219245e-05, + "loss": 0.7031, + "step": 2954 + }, + { + "epoch": 0.3018386108273749, + "grad_norm": 1.6082262121873285, + "learning_rate": 1.6368834208260155e-05, + "loss": 0.6932, + "step": 2955 + }, + { + "epoch": 0.3019407558733401, + "grad_norm": 1.5667207671001133, + "learning_rate": 1.6366283267254842e-05, + "loss": 0.6257, + "step": 2956 + }, + { + "epoch": 0.3020429009193054, + "grad_norm": 1.6467063384506067, + "learning_rate": 1.636373162945571e-05, + "loss": 0.8482, + "step": 2957 + }, + { + "epoch": 0.3021450459652707, + "grad_norm": 1.4985411284293428, + "learning_rate": 1.636117929514205e-05, + "loss": 0.7767, + "step": 2958 + }, + { + "epoch": 0.302247191011236, + "grad_norm": 1.6396412584855686, + "learning_rate": 1.6358626264593195e-05, + "loss": 0.6589, + "step": 2959 + }, + { + "epoch": 0.3023493360572012, + "grad_norm": 1.343234381827057, + "learning_rate": 1.635607253808859e-05, + "loss": 0.6706, + "step": 2960 + }, + { + "epoch": 0.3024514811031665, + "grad_norm": 1.5207641040280568, + "learning_rate": 1.6353518115907742e-05, + "loss": 0.7944, + "step": 2961 + }, + { + "epoch": 0.3025536261491318, + "grad_norm": 1.4443279280110644, + "learning_rate": 1.6350962998330232e-05, + "loss": 0.6582, + "step": 2962 + }, + { + "epoch": 0.302655771195097, + "grad_norm": 1.418962614044098, + "learning_rate": 1.6348407185635714e-05, + "loss": 0.783, + "step": 2963 + }, + { + "epoch": 0.3027579162410623, + "grad_norm": 1.5383864193453312, + "learning_rate": 1.634585067810393e-05, + "loss": 0.7429, + "step": 2964 + }, + { + "epoch": 0.3028600612870276, + "grad_norm": 1.4980263988136335, + "learning_rate": 1.634329347601469e-05, + "loss": 0.7364, + "step": 2965 + }, + { + "epoch": 0.30296220633299287, + "grad_norm": 1.5604354609325344, + "learning_rate": 1.634073557964788e-05, + "loss": 0.8035, + "step": 2966 + }, + { + "epoch": 0.3030643513789581, + "grad_norm": 1.4885223804742989, + "learning_rate": 1.6338176989283464e-05, + "loss": 0.6796, + "step": 2967 + }, + { + "epoch": 0.3031664964249234, + "grad_norm": 1.4462283813799703, + "learning_rate": 1.633561770520148e-05, + "loss": 0.7429, + "step": 2968 + }, + { + "epoch": 0.30326864147088867, + "grad_norm": 1.4203512695129104, + "learning_rate": 1.6333057727682048e-05, + "loss": 0.7339, + "step": 2969 + }, + { + "epoch": 0.30337078651685395, + "grad_norm": 1.637186513350223, + "learning_rate": 1.6330497057005355e-05, + "loss": 0.8423, + "step": 2970 + }, + { + "epoch": 0.3034729315628192, + "grad_norm": 1.5181812093359124, + "learning_rate": 1.632793569345167e-05, + "loss": 0.8474, + "step": 2971 + }, + { + "epoch": 0.30357507660878447, + "grad_norm": 1.6511547667884172, + "learning_rate": 1.6325373637301332e-05, + "loss": 0.7353, + "step": 2972 + }, + { + "epoch": 0.30367722165474975, + "grad_norm": 1.4546755011640309, + "learning_rate": 1.6322810888834765e-05, + "loss": 0.7728, + "step": 2973 + }, + { + "epoch": 0.30377936670071504, + "grad_norm": 1.3495997974083354, + "learning_rate": 1.6320247448332464e-05, + "loss": 0.7342, + "step": 2974 + }, + { + "epoch": 0.30388151174668027, + "grad_norm": 1.6508505696324733, + "learning_rate": 1.631768331607499e-05, + "loss": 0.7263, + "step": 2975 + }, + { + "epoch": 0.30398365679264555, + "grad_norm": 1.4621332829880127, + "learning_rate": 1.6315118492343e-05, + "loss": 0.6282, + "step": 2976 + }, + { + "epoch": 0.30408580183861084, + "grad_norm": 1.3829854064490696, + "learning_rate": 1.631255297741721e-05, + "loss": 0.6467, + "step": 2977 + }, + { + "epoch": 0.3041879468845761, + "grad_norm": 1.404607283843521, + "learning_rate": 1.6309986771578415e-05, + "loss": 0.626, + "step": 2978 + }, + { + "epoch": 0.30429009193054135, + "grad_norm": 1.3418539116756683, + "learning_rate": 1.6307419875107496e-05, + "loss": 0.7031, + "step": 2979 + }, + { + "epoch": 0.30439223697650664, + "grad_norm": 1.4783172199146684, + "learning_rate": 1.6304852288285393e-05, + "loss": 0.7902, + "step": 2980 + }, + { + "epoch": 0.3044943820224719, + "grad_norm": 1.3635804445266602, + "learning_rate": 1.6302284011393136e-05, + "loss": 0.674, + "step": 2981 + }, + { + "epoch": 0.3045965270684372, + "grad_norm": 1.4085882732638786, + "learning_rate": 1.629971504471182e-05, + "loss": 0.7481, + "step": 2982 + }, + { + "epoch": 0.30469867211440244, + "grad_norm": 1.6269960130940964, + "learning_rate": 1.6297145388522625e-05, + "loss": 0.7944, + "step": 2983 + }, + { + "epoch": 0.3048008171603677, + "grad_norm": 1.3859331620215272, + "learning_rate": 1.6294575043106794e-05, + "loss": 0.7034, + "step": 2984 + }, + { + "epoch": 0.304902962206333, + "grad_norm": 1.5176137032442059, + "learning_rate": 1.629200400874566e-05, + "loss": 0.86, + "step": 2985 + }, + { + "epoch": 0.3050051072522983, + "grad_norm": 1.6833596373373285, + "learning_rate": 1.6289432285720623e-05, + "loss": 0.8431, + "step": 2986 + }, + { + "epoch": 0.3051072522982635, + "grad_norm": 1.5367363345133567, + "learning_rate": 1.628685987431316e-05, + "loss": 0.741, + "step": 2987 + }, + { + "epoch": 0.3052093973442288, + "grad_norm": 1.5686612982424375, + "learning_rate": 1.628428677480482e-05, + "loss": 0.7529, + "step": 2988 + }, + { + "epoch": 0.3053115423901941, + "grad_norm": 1.510191242769554, + "learning_rate": 1.628171298747723e-05, + "loss": 0.7072, + "step": 2989 + }, + { + "epoch": 0.3054136874361593, + "grad_norm": 1.5263761390227724, + "learning_rate": 1.62791385126121e-05, + "loss": 0.6769, + "step": 2990 + }, + { + "epoch": 0.3055158324821246, + "grad_norm": 1.5666469252613624, + "learning_rate": 1.62765633504912e-05, + "loss": 0.7939, + "step": 2991 + }, + { + "epoch": 0.3056179775280899, + "grad_norm": 1.4969327017590897, + "learning_rate": 1.6273987501396388e-05, + "loss": 0.7237, + "step": 2992 + }, + { + "epoch": 0.3057201225740552, + "grad_norm": 1.4968080794642804, + "learning_rate": 1.627141096560959e-05, + "loss": 0.7024, + "step": 2993 + }, + { + "epoch": 0.3058222676200204, + "grad_norm": 1.3776636117539025, + "learning_rate": 1.626883374341281e-05, + "loss": 0.7211, + "step": 2994 + }, + { + "epoch": 0.3059244126659857, + "grad_norm": 1.5870745275850817, + "learning_rate": 1.6266255835088123e-05, + "loss": 0.7908, + "step": 2995 + }, + { + "epoch": 0.306026557711951, + "grad_norm": 1.6836189174380305, + "learning_rate": 1.626367724091769e-05, + "loss": 0.7055, + "step": 2996 + }, + { + "epoch": 0.30612870275791626, + "grad_norm": 1.3947974755432426, + "learning_rate": 1.6261097961183736e-05, + "loss": 0.7129, + "step": 2997 + }, + { + "epoch": 0.3062308478038815, + "grad_norm": 1.3550227539447222, + "learning_rate": 1.6258517996168565e-05, + "loss": 0.7175, + "step": 2998 + }, + { + "epoch": 0.3063329928498468, + "grad_norm": 1.426185052638176, + "learning_rate": 1.6255937346154555e-05, + "loss": 0.8229, + "step": 2999 + }, + { + "epoch": 0.30643513789581206, + "grad_norm": 1.512170598824874, + "learning_rate": 1.625335601142416e-05, + "loss": 0.7188, + "step": 3000 + }, + { + "epoch": 0.30653728294177734, + "grad_norm": 1.6163562700855492, + "learning_rate": 1.625077399225991e-05, + "loss": 0.7765, + "step": 3001 + }, + { + "epoch": 0.3066394279877426, + "grad_norm": 1.3935909309383394, + "learning_rate": 1.624819128894441e-05, + "loss": 0.723, + "step": 3002 + }, + { + "epoch": 0.30674157303370786, + "grad_norm": 1.4123189558202773, + "learning_rate": 1.6245607901760334e-05, + "loss": 0.6323, + "step": 3003 + }, + { + "epoch": 0.30684371807967314, + "grad_norm": 1.4388189324756848, + "learning_rate": 1.6243023830990438e-05, + "loss": 0.6966, + "step": 3004 + }, + { + "epoch": 0.30694586312563843, + "grad_norm": 1.4993678765796332, + "learning_rate": 1.6240439076917552e-05, + "loss": 0.8046, + "step": 3005 + }, + { + "epoch": 0.30704800817160366, + "grad_norm": 1.4645296271586108, + "learning_rate": 1.6237853639824576e-05, + "loss": 0.7098, + "step": 3006 + }, + { + "epoch": 0.30715015321756894, + "grad_norm": 1.4900076437308027, + "learning_rate": 1.6235267519994485e-05, + "loss": 0.6922, + "step": 3007 + }, + { + "epoch": 0.30725229826353423, + "grad_norm": 1.4743236454455042, + "learning_rate": 1.6232680717710342e-05, + "loss": 0.7221, + "step": 3008 + }, + { + "epoch": 0.3073544433094995, + "grad_norm": 1.4861671267874368, + "learning_rate": 1.6230093233255265e-05, + "loss": 0.7873, + "step": 3009 + }, + { + "epoch": 0.30745658835546474, + "grad_norm": 1.5262693390613993, + "learning_rate": 1.622750506691246e-05, + "loss": 0.7265, + "step": 3010 + }, + { + "epoch": 0.30755873340143003, + "grad_norm": 1.332127647759973, + "learning_rate": 1.6224916218965198e-05, + "loss": 0.6794, + "step": 3011 + }, + { + "epoch": 0.3076608784473953, + "grad_norm": 1.6221497597829193, + "learning_rate": 1.6222326689696838e-05, + "loss": 0.8461, + "step": 3012 + }, + { + "epoch": 0.3077630234933606, + "grad_norm": 1.40873259258424, + "learning_rate": 1.62197364793908e-05, + "loss": 0.7601, + "step": 3013 + }, + { + "epoch": 0.30786516853932583, + "grad_norm": 1.326302968075467, + "learning_rate": 1.6217145588330587e-05, + "loss": 0.663, + "step": 3014 + }, + { + "epoch": 0.3079673135852911, + "grad_norm": 1.4398039992373803, + "learning_rate": 1.621455401679977e-05, + "loss": 0.7456, + "step": 3015 + }, + { + "epoch": 0.3080694586312564, + "grad_norm": 1.3998465509450893, + "learning_rate": 1.6211961765082e-05, + "loss": 0.697, + "step": 3016 + }, + { + "epoch": 0.30817160367722163, + "grad_norm": 1.4994353348988576, + "learning_rate": 1.6209368833461006e-05, + "loss": 0.7334, + "step": 3017 + }, + { + "epoch": 0.3082737487231869, + "grad_norm": 1.5527164831227749, + "learning_rate": 1.6206775222220578e-05, + "loss": 0.7359, + "step": 3018 + }, + { + "epoch": 0.3083758937691522, + "grad_norm": 1.3536278539386644, + "learning_rate": 1.620418093164459e-05, + "loss": 0.652, + "step": 3019 + }, + { + "epoch": 0.3084780388151175, + "grad_norm": 1.3253847080521448, + "learning_rate": 1.6201585962016995e-05, + "loss": 0.7919, + "step": 3020 + }, + { + "epoch": 0.3085801838610827, + "grad_norm": 1.409819090707468, + "learning_rate": 1.619899031362181e-05, + "loss": 0.6603, + "step": 3021 + }, + { + "epoch": 0.308682328907048, + "grad_norm": 1.5450983462175814, + "learning_rate": 1.619639398674313e-05, + "loss": 0.646, + "step": 3022 + }, + { + "epoch": 0.3087844739530133, + "grad_norm": 1.484686154347848, + "learning_rate": 1.619379698166512e-05, + "loss": 0.685, + "step": 3023 + }, + { + "epoch": 0.30888661899897857, + "grad_norm": 1.3334198958772887, + "learning_rate": 1.6191199298672032e-05, + "loss": 0.7167, + "step": 3024 + }, + { + "epoch": 0.3089887640449438, + "grad_norm": 1.6874450681500093, + "learning_rate": 1.6188600938048185e-05, + "loss": 0.7868, + "step": 3025 + }, + { + "epoch": 0.3090909090909091, + "grad_norm": 1.4703910097891506, + "learning_rate": 1.6186001900077962e-05, + "loss": 0.7859, + "step": 3026 + }, + { + "epoch": 0.30919305413687437, + "grad_norm": 1.431432071895974, + "learning_rate": 1.6183402185045833e-05, + "loss": 0.7748, + "step": 3027 + }, + { + "epoch": 0.30929519918283965, + "grad_norm": 1.5968137604601693, + "learning_rate": 1.6180801793236342e-05, + "loss": 0.7329, + "step": 3028 + }, + { + "epoch": 0.3093973442288049, + "grad_norm": 1.630846402382367, + "learning_rate": 1.61782007249341e-05, + "loss": 0.8414, + "step": 3029 + }, + { + "epoch": 0.30949948927477017, + "grad_norm": 1.6655821816248195, + "learning_rate": 1.61755989804238e-05, + "loss": 0.7824, + "step": 3030 + }, + { + "epoch": 0.30960163432073545, + "grad_norm": 1.687518245748029, + "learning_rate": 1.6172996559990197e-05, + "loss": 0.7468, + "step": 3031 + }, + { + "epoch": 0.30970377936670074, + "grad_norm": 1.3909140287869024, + "learning_rate": 1.6170393463918137e-05, + "loss": 0.767, + "step": 3032 + }, + { + "epoch": 0.30980592441266597, + "grad_norm": 1.3877852826241506, + "learning_rate": 1.6167789692492522e-05, + "loss": 0.7865, + "step": 3033 + }, + { + "epoch": 0.30990806945863125, + "grad_norm": 1.4526652866708634, + "learning_rate": 1.6165185245998346e-05, + "loss": 0.7429, + "step": 3034 + }, + { + "epoch": 0.31001021450459654, + "grad_norm": 1.3996391509884138, + "learning_rate": 1.6162580124720653e-05, + "loss": 0.729, + "step": 3035 + }, + { + "epoch": 0.3101123595505618, + "grad_norm": 1.5455310168130485, + "learning_rate": 1.615997432894459e-05, + "loss": 0.7655, + "step": 3036 + }, + { + "epoch": 0.31021450459652705, + "grad_norm": 1.460210174628382, + "learning_rate": 1.6157367858955358e-05, + "loss": 0.7029, + "step": 3037 + }, + { + "epoch": 0.31031664964249234, + "grad_norm": 1.4486431775649355, + "learning_rate": 1.615476071503823e-05, + "loss": 0.7245, + "step": 3038 + }, + { + "epoch": 0.3104187946884576, + "grad_norm": 1.4501729806051957, + "learning_rate": 1.615215289747857e-05, + "loss": 0.8189, + "step": 3039 + }, + { + "epoch": 0.3105209397344229, + "grad_norm": 1.3530291419608678, + "learning_rate": 1.6149544406561797e-05, + "loss": 0.6959, + "step": 3040 + }, + { + "epoch": 0.31062308478038814, + "grad_norm": 1.4748528992186238, + "learning_rate": 1.614693524257342e-05, + "loss": 0.696, + "step": 3041 + }, + { + "epoch": 0.3107252298263534, + "grad_norm": 1.569515928356824, + "learning_rate": 1.614432540579901e-05, + "loss": 0.6905, + "step": 3042 + }, + { + "epoch": 0.3108273748723187, + "grad_norm": 1.3845131031411568, + "learning_rate": 1.6141714896524208e-05, + "loss": 0.7513, + "step": 3043 + }, + { + "epoch": 0.31092951991828394, + "grad_norm": 1.4869464889987924, + "learning_rate": 1.6139103715034746e-05, + "loss": 0.6353, + "step": 3044 + }, + { + "epoch": 0.3110316649642492, + "grad_norm": 1.48500908825774, + "learning_rate": 1.6136491861616414e-05, + "loss": 0.7292, + "step": 3045 + }, + { + "epoch": 0.3111338100102145, + "grad_norm": 1.537539942001847, + "learning_rate": 1.6133879336555085e-05, + "loss": 0.7759, + "step": 3046 + }, + { + "epoch": 0.3112359550561798, + "grad_norm": 1.5202636033182637, + "learning_rate": 1.61312661401367e-05, + "loss": 0.7287, + "step": 3047 + }, + { + "epoch": 0.311338100102145, + "grad_norm": 1.4807668749227374, + "learning_rate": 1.6128652272647274e-05, + "loss": 0.7319, + "step": 3048 + }, + { + "epoch": 0.3114402451481103, + "grad_norm": 1.430217392925012, + "learning_rate": 1.61260377343729e-05, + "loss": 0.7888, + "step": 3049 + }, + { + "epoch": 0.3115423901940756, + "grad_norm": 1.7903840588151976, + "learning_rate": 1.6123422525599735e-05, + "loss": 0.8025, + "step": 3050 + }, + { + "epoch": 0.3116445352400409, + "grad_norm": 1.6264293020015002, + "learning_rate": 1.6120806646614018e-05, + "loss": 0.7091, + "step": 3051 + }, + { + "epoch": 0.3117466802860061, + "grad_norm": 1.4875781678219508, + "learning_rate": 1.611819009770206e-05, + "loss": 0.701, + "step": 3052 + }, + { + "epoch": 0.3118488253319714, + "grad_norm": 1.4465137806625734, + "learning_rate": 1.6115572879150243e-05, + "loss": 0.6744, + "step": 3053 + }, + { + "epoch": 0.3119509703779367, + "grad_norm": 1.2894489678481411, + "learning_rate": 1.6112954991245023e-05, + "loss": 0.7584, + "step": 3054 + }, + { + "epoch": 0.31205311542390196, + "grad_norm": 1.2740205157182116, + "learning_rate": 1.6110336434272927e-05, + "loss": 0.7173, + "step": 3055 + }, + { + "epoch": 0.3121552604698672, + "grad_norm": 1.4450953260891202, + "learning_rate": 1.6107717208520563e-05, + "loss": 0.6146, + "step": 3056 + }, + { + "epoch": 0.3122574055158325, + "grad_norm": 1.4179392588351547, + "learning_rate": 1.6105097314274605e-05, + "loss": 0.7363, + "step": 3057 + }, + { + "epoch": 0.31235955056179776, + "grad_norm": 1.4884520098184097, + "learning_rate": 1.6102476751821804e-05, + "loss": 0.7006, + "step": 3058 + }, + { + "epoch": 0.31246169560776305, + "grad_norm": 1.5072974325170947, + "learning_rate": 1.6099855521448975e-05, + "loss": 0.6804, + "step": 3059 + }, + { + "epoch": 0.3125638406537283, + "grad_norm": 1.5448349331432443, + "learning_rate": 1.609723362344302e-05, + "loss": 0.7348, + "step": 3060 + }, + { + "epoch": 0.31266598569969356, + "grad_norm": 1.2587571267707682, + "learning_rate": 1.6094611058090905e-05, + "loss": 0.591, + "step": 3061 + }, + { + "epoch": 0.31276813074565885, + "grad_norm": 1.4009027161346672, + "learning_rate": 1.6091987825679672e-05, + "loss": 0.7295, + "step": 3062 + }, + { + "epoch": 0.31287027579162413, + "grad_norm": 1.4802466221493724, + "learning_rate": 1.6089363926496436e-05, + "loss": 0.7569, + "step": 3063 + }, + { + "epoch": 0.31297242083758936, + "grad_norm": 1.4710691017784918, + "learning_rate": 1.6086739360828385e-05, + "loss": 0.7824, + "step": 3064 + }, + { + "epoch": 0.31307456588355465, + "grad_norm": 1.3428302905445242, + "learning_rate": 1.608411412896278e-05, + "loss": 0.8099, + "step": 3065 + }, + { + "epoch": 0.31317671092951993, + "grad_norm": 1.4396268825799485, + "learning_rate": 1.608148823118695e-05, + "loss": 0.7202, + "step": 3066 + }, + { + "epoch": 0.3132788559754852, + "grad_norm": 1.4235875429270053, + "learning_rate": 1.6078861667788307e-05, + "loss": 0.7336, + "step": 3067 + }, + { + "epoch": 0.31338100102145044, + "grad_norm": 1.4175319425862563, + "learning_rate": 1.607623443905432e-05, + "loss": 0.749, + "step": 3068 + }, + { + "epoch": 0.31348314606741573, + "grad_norm": 1.298328559908578, + "learning_rate": 1.6073606545272555e-05, + "loss": 0.7409, + "step": 3069 + }, + { + "epoch": 0.313585291113381, + "grad_norm": 1.5564802144213863, + "learning_rate": 1.6070977986730625e-05, + "loss": 0.6281, + "step": 3070 + }, + { + "epoch": 0.31368743615934624, + "grad_norm": 1.5205225055890226, + "learning_rate": 1.6068348763716237e-05, + "loss": 0.7095, + "step": 3071 + }, + { + "epoch": 0.31378958120531153, + "grad_norm": 1.3776727449209483, + "learning_rate": 1.606571887651715e-05, + "loss": 0.7872, + "step": 3072 + }, + { + "epoch": 0.3138917262512768, + "grad_norm": 1.3588663605504723, + "learning_rate": 1.6063088325421218e-05, + "loss": 0.6551, + "step": 3073 + }, + { + "epoch": 0.3139938712972421, + "grad_norm": 1.517960542665752, + "learning_rate": 1.6060457110716346e-05, + "loss": 0.7017, + "step": 3074 + }, + { + "epoch": 0.31409601634320733, + "grad_norm": 1.6318436163581094, + "learning_rate": 1.6057825232690538e-05, + "loss": 0.8613, + "step": 3075 + }, + { + "epoch": 0.3141981613891726, + "grad_norm": 1.5475269555410534, + "learning_rate": 1.605519269163183e-05, + "loss": 0.7734, + "step": 3076 + }, + { + "epoch": 0.3143003064351379, + "grad_norm": 1.4042244684580985, + "learning_rate": 1.6052559487828382e-05, + "loss": 0.7653, + "step": 3077 + }, + { + "epoch": 0.3144024514811032, + "grad_norm": 1.5003909503481574, + "learning_rate": 1.6049925621568384e-05, + "loss": 0.7562, + "step": 3078 + }, + { + "epoch": 0.3145045965270684, + "grad_norm": 1.3464103901206892, + "learning_rate": 1.6047291093140116e-05, + "loss": 0.708, + "step": 3079 + }, + { + "epoch": 0.3146067415730337, + "grad_norm": 1.564135381951507, + "learning_rate": 1.604465590283193e-05, + "loss": 0.7882, + "step": 3080 + }, + { + "epoch": 0.314708886618999, + "grad_norm": 1.5776893622208517, + "learning_rate": 1.604202005093225e-05, + "loss": 0.6244, + "step": 3081 + }, + { + "epoch": 0.31481103166496427, + "grad_norm": 1.6314371353896648, + "learning_rate": 1.6039383537729577e-05, + "loss": 0.7916, + "step": 3082 + }, + { + "epoch": 0.3149131767109295, + "grad_norm": 1.5453943080862158, + "learning_rate": 1.603674636351247e-05, + "loss": 0.7008, + "step": 3083 + }, + { + "epoch": 0.3150153217568948, + "grad_norm": 1.3540603718937378, + "learning_rate": 1.6034108528569574e-05, + "loss": 0.7427, + "step": 3084 + }, + { + "epoch": 0.31511746680286007, + "grad_norm": 1.44405631108969, + "learning_rate": 1.60314700331896e-05, + "loss": 0.724, + "step": 3085 + }, + { + "epoch": 0.31521961184882535, + "grad_norm": 1.5229590803299635, + "learning_rate": 1.6028830877661334e-05, + "loss": 0.7232, + "step": 3086 + }, + { + "epoch": 0.3153217568947906, + "grad_norm": 1.4930109491698582, + "learning_rate": 1.6026191062273636e-05, + "loss": 0.7456, + "step": 3087 + }, + { + "epoch": 0.31542390194075587, + "grad_norm": 1.4591704762135231, + "learning_rate": 1.602355058731543e-05, + "loss": 0.6834, + "step": 3088 + }, + { + "epoch": 0.31552604698672115, + "grad_norm": 1.3818664334612503, + "learning_rate": 1.602090945307572e-05, + "loss": 0.7314, + "step": 3089 + }, + { + "epoch": 0.31562819203268644, + "grad_norm": 1.4070299100122066, + "learning_rate": 1.6018267659843584e-05, + "loss": 0.7394, + "step": 3090 + }, + { + "epoch": 0.31573033707865167, + "grad_norm": 1.6474723453163884, + "learning_rate": 1.6015625207908162e-05, + "loss": 0.7134, + "step": 3091 + }, + { + "epoch": 0.31583248212461695, + "grad_norm": 1.6297677820582663, + "learning_rate": 1.6012982097558675e-05, + "loss": 0.7906, + "step": 3092 + }, + { + "epoch": 0.31593462717058224, + "grad_norm": 1.5734812516448629, + "learning_rate": 1.601033832908441e-05, + "loss": 0.7983, + "step": 3093 + }, + { + "epoch": 0.3160367722165475, + "grad_norm": 1.5401753830626395, + "learning_rate": 1.6007693902774735e-05, + "loss": 0.7228, + "step": 3094 + }, + { + "epoch": 0.31613891726251275, + "grad_norm": 1.490498790217978, + "learning_rate": 1.600504881891908e-05, + "loss": 0.7451, + "step": 3095 + }, + { + "epoch": 0.31624106230847804, + "grad_norm": 1.582059730670658, + "learning_rate": 1.6002403077806952e-05, + "loss": 0.7293, + "step": 3096 + }, + { + "epoch": 0.3163432073544433, + "grad_norm": 1.374902956599118, + "learning_rate": 1.599975667972793e-05, + "loss": 0.6923, + "step": 3097 + }, + { + "epoch": 0.31644535240040855, + "grad_norm": 1.3376025654143355, + "learning_rate": 1.599710962497166e-05, + "loss": 0.7181, + "step": 3098 + }, + { + "epoch": 0.31654749744637384, + "grad_norm": 1.5256944847953056, + "learning_rate": 1.599446191382787e-05, + "loss": 0.7699, + "step": 3099 + }, + { + "epoch": 0.3166496424923391, + "grad_norm": 1.4698314193472413, + "learning_rate": 1.5991813546586346e-05, + "loss": 0.7787, + "step": 3100 + }, + { + "epoch": 0.3167517875383044, + "grad_norm": 1.5174651017065346, + "learning_rate": 1.5989164523536964e-05, + "loss": 0.7487, + "step": 3101 + }, + { + "epoch": 0.31685393258426964, + "grad_norm": 1.48289533624744, + "learning_rate": 1.5986514844969655e-05, + "loss": 0.6847, + "step": 3102 + }, + { + "epoch": 0.3169560776302349, + "grad_norm": 1.5531717099186495, + "learning_rate": 1.5983864511174425e-05, + "loss": 0.6584, + "step": 3103 + }, + { + "epoch": 0.3170582226762002, + "grad_norm": 1.6455938973451356, + "learning_rate": 1.5981213522441358e-05, + "loss": 0.7722, + "step": 3104 + }, + { + "epoch": 0.3171603677221655, + "grad_norm": 1.6543100617449766, + "learning_rate": 1.5978561879060608e-05, + "loss": 0.7815, + "step": 3105 + }, + { + "epoch": 0.3172625127681307, + "grad_norm": 1.4813002408413227, + "learning_rate": 1.59759095813224e-05, + "loss": 0.7035, + "step": 3106 + }, + { + "epoch": 0.317364657814096, + "grad_norm": 1.4549448443312227, + "learning_rate": 1.5973256629517026e-05, + "loss": 0.7501, + "step": 3107 + }, + { + "epoch": 0.3174668028600613, + "grad_norm": 1.2605158717334182, + "learning_rate": 1.597060302393485e-05, + "loss": 0.7577, + "step": 3108 + }, + { + "epoch": 0.3175689479060266, + "grad_norm": 1.3808301198882824, + "learning_rate": 1.5967948764866324e-05, + "loss": 0.78, + "step": 3109 + }, + { + "epoch": 0.3176710929519918, + "grad_norm": 1.7306359890667031, + "learning_rate": 1.5965293852601944e-05, + "loss": 0.7736, + "step": 3110 + }, + { + "epoch": 0.3177732379979571, + "grad_norm": 1.4330875879895189, + "learning_rate": 1.59626382874323e-05, + "loss": 0.7955, + "step": 3111 + }, + { + "epoch": 0.3178753830439224, + "grad_norm": 1.5819126773600796, + "learning_rate": 1.595998206964804e-05, + "loss": 0.7288, + "step": 3112 + }, + { + "epoch": 0.31797752808988766, + "grad_norm": 1.5042557934951588, + "learning_rate": 1.5957325199539894e-05, + "loss": 0.7439, + "step": 3113 + }, + { + "epoch": 0.3180796731358529, + "grad_norm": 1.4669119195743343, + "learning_rate": 1.5954667677398656e-05, + "loss": 0.735, + "step": 3114 + }, + { + "epoch": 0.3181818181818182, + "grad_norm": 1.4778851228347591, + "learning_rate": 1.5952009503515195e-05, + "loss": 0.7017, + "step": 3115 + }, + { + "epoch": 0.31828396322778346, + "grad_norm": 1.5231365885489327, + "learning_rate": 1.5949350678180446e-05, + "loss": 0.7967, + "step": 3116 + }, + { + "epoch": 0.31838610827374875, + "grad_norm": 1.366553037593949, + "learning_rate": 1.594669120168542e-05, + "loss": 0.6538, + "step": 3117 + }, + { + "epoch": 0.318488253319714, + "grad_norm": 1.4441295369191471, + "learning_rate": 1.5944031074321205e-05, + "loss": 0.7573, + "step": 3118 + }, + { + "epoch": 0.31859039836567926, + "grad_norm": 1.4822607635545413, + "learning_rate": 1.5941370296378943e-05, + "loss": 0.708, + "step": 3119 + }, + { + "epoch": 0.31869254341164455, + "grad_norm": 1.4286238142859087, + "learning_rate": 1.5938708868149867e-05, + "loss": 0.697, + "step": 3120 + }, + { + "epoch": 0.31879468845760983, + "grad_norm": 1.3537816954976416, + "learning_rate": 1.5936046789925268e-05, + "loss": 0.6319, + "step": 3121 + }, + { + "epoch": 0.31889683350357506, + "grad_norm": 1.6901698259549744, + "learning_rate": 1.5933384061996515e-05, + "loss": 0.7934, + "step": 3122 + }, + { + "epoch": 0.31899897854954035, + "grad_norm": 1.3963072846062035, + "learning_rate": 1.593072068465504e-05, + "loss": 0.6693, + "step": 3123 + }, + { + "epoch": 0.31910112359550563, + "grad_norm": 1.4364062624366685, + "learning_rate": 1.5928056658192353e-05, + "loss": 0.815, + "step": 3124 + }, + { + "epoch": 0.3192032686414709, + "grad_norm": 1.4041114324768975, + "learning_rate": 1.5925391982900038e-05, + "loss": 0.7075, + "step": 3125 + }, + { + "epoch": 0.31930541368743615, + "grad_norm": 1.457495197868525, + "learning_rate": 1.592272665906974e-05, + "loss": 0.7757, + "step": 3126 + }, + { + "epoch": 0.31940755873340143, + "grad_norm": 1.5165108904143538, + "learning_rate": 1.5920060686993184e-05, + "loss": 0.7771, + "step": 3127 + }, + { + "epoch": 0.3195097037793667, + "grad_norm": 1.4989976046027713, + "learning_rate": 1.591739406696216e-05, + "loss": 0.7439, + "step": 3128 + }, + { + "epoch": 0.31961184882533195, + "grad_norm": 1.5262232841654173, + "learning_rate": 1.5914726799268532e-05, + "loss": 0.6633, + "step": 3129 + }, + { + "epoch": 0.31971399387129723, + "grad_norm": 1.4092019469654895, + "learning_rate": 1.5912058884204234e-05, + "loss": 0.7546, + "step": 3130 + }, + { + "epoch": 0.3198161389172625, + "grad_norm": 1.4703629863413072, + "learning_rate": 1.5909390322061273e-05, + "loss": 0.819, + "step": 3131 + }, + { + "epoch": 0.3199182839632278, + "grad_norm": 1.447766714788458, + "learning_rate": 1.590672111313172e-05, + "loss": 0.7853, + "step": 3132 + }, + { + "epoch": 0.32002042900919303, + "grad_norm": 1.2620085582897298, + "learning_rate": 1.590405125770773e-05, + "loss": 0.6594, + "step": 3133 + }, + { + "epoch": 0.3201225740551583, + "grad_norm": 1.4127196902528627, + "learning_rate": 1.5901380756081516e-05, + "loss": 0.7225, + "step": 3134 + }, + { + "epoch": 0.3202247191011236, + "grad_norm": 1.5254939412977586, + "learning_rate": 1.5898709608545358e-05, + "loss": 0.707, + "step": 3135 + }, + { + "epoch": 0.3203268641470889, + "grad_norm": 1.4381238734409696, + "learning_rate": 1.5896037815391628e-05, + "loss": 0.7337, + "step": 3136 + }, + { + "epoch": 0.3204290091930541, + "grad_norm": 1.4088205886503185, + "learning_rate": 1.589336537691275e-05, + "loss": 0.6336, + "step": 3137 + }, + { + "epoch": 0.3205311542390194, + "grad_norm": 1.5079191076378837, + "learning_rate": 1.5890692293401223e-05, + "loss": 0.7211, + "step": 3138 + }, + { + "epoch": 0.3206332992849847, + "grad_norm": 1.4218699664081809, + "learning_rate": 1.5888018565149615e-05, + "loss": 0.7349, + "step": 3139 + }, + { + "epoch": 0.32073544433094997, + "grad_norm": 1.5307806227362002, + "learning_rate": 1.5885344192450577e-05, + "loss": 0.7286, + "step": 3140 + }, + { + "epoch": 0.3208375893769152, + "grad_norm": 1.512777391543793, + "learning_rate": 1.588266917559681e-05, + "loss": 0.9063, + "step": 3141 + }, + { + "epoch": 0.3209397344228805, + "grad_norm": 1.3185820465752274, + "learning_rate": 1.58799935148811e-05, + "loss": 0.6893, + "step": 3142 + }, + { + "epoch": 0.32104187946884577, + "grad_norm": 1.614629945443676, + "learning_rate": 1.5877317210596305e-05, + "loss": 0.7478, + "step": 3143 + }, + { + "epoch": 0.32114402451481106, + "grad_norm": 1.4018372136618655, + "learning_rate": 1.5874640263035343e-05, + "loss": 0.717, + "step": 3144 + }, + { + "epoch": 0.3212461695607763, + "grad_norm": 1.4789247198622901, + "learning_rate": 1.587196267249121e-05, + "loss": 0.7336, + "step": 3145 + }, + { + "epoch": 0.32134831460674157, + "grad_norm": 1.3913728821159912, + "learning_rate": 1.5869284439256965e-05, + "loss": 0.6261, + "step": 3146 + }, + { + "epoch": 0.32145045965270685, + "grad_norm": 1.3837854021565528, + "learning_rate": 1.586660556362575e-05, + "loss": 0.7622, + "step": 3147 + }, + { + "epoch": 0.32155260469867214, + "grad_norm": 1.3077053121375404, + "learning_rate": 1.586392604589076e-05, + "loss": 0.6457, + "step": 3148 + }, + { + "epoch": 0.32165474974463737, + "grad_norm": 1.4909438059921996, + "learning_rate": 1.586124588634528e-05, + "loss": 0.7323, + "step": 3149 + }, + { + "epoch": 0.32175689479060265, + "grad_norm": 1.6009696292917666, + "learning_rate": 1.585856508528265e-05, + "loss": 0.7531, + "step": 3150 + }, + { + "epoch": 0.32185903983656794, + "grad_norm": 1.615000614156137, + "learning_rate": 1.585588364299629e-05, + "loss": 0.754, + "step": 3151 + }, + { + "epoch": 0.3219611848825332, + "grad_norm": 1.644114762029356, + "learning_rate": 1.585320155977968e-05, + "loss": 0.7645, + "step": 3152 + }, + { + "epoch": 0.32206332992849845, + "grad_norm": 1.4614037724000324, + "learning_rate": 1.5850518835926373e-05, + "loss": 0.7749, + "step": 3153 + }, + { + "epoch": 0.32216547497446374, + "grad_norm": 1.4831604856505511, + "learning_rate": 1.5847835471730002e-05, + "loss": 0.6788, + "step": 3154 + }, + { + "epoch": 0.322267620020429, + "grad_norm": 1.5093990636490668, + "learning_rate": 1.5845151467484265e-05, + "loss": 0.8108, + "step": 3155 + }, + { + "epoch": 0.32236976506639425, + "grad_norm": 1.381916304040464, + "learning_rate": 1.5842466823482917e-05, + "loss": 0.7196, + "step": 3156 + }, + { + "epoch": 0.32247191011235954, + "grad_norm": 1.4095040515783297, + "learning_rate": 1.5839781540019803e-05, + "loss": 0.7394, + "step": 3157 + }, + { + "epoch": 0.3225740551583248, + "grad_norm": 1.7946142818552229, + "learning_rate": 1.5837095617388828e-05, + "loss": 0.6953, + "step": 3158 + }, + { + "epoch": 0.3226762002042901, + "grad_norm": 1.801415091775609, + "learning_rate": 1.5834409055883964e-05, + "loss": 0.9012, + "step": 3159 + }, + { + "epoch": 0.32277834525025534, + "grad_norm": 1.5968695004440543, + "learning_rate": 1.5831721855799257e-05, + "loss": 0.8781, + "step": 3160 + }, + { + "epoch": 0.3228804902962206, + "grad_norm": 1.3775667428999565, + "learning_rate": 1.582903401742883e-05, + "loss": 0.7462, + "step": 3161 + }, + { + "epoch": 0.3229826353421859, + "grad_norm": 1.3513245425781428, + "learning_rate": 1.582634554106686e-05, + "loss": 0.5911, + "step": 3162 + }, + { + "epoch": 0.3230847803881512, + "grad_norm": 1.5372303101442195, + "learning_rate": 1.582365642700761e-05, + "loss": 0.7475, + "step": 3163 + }, + { + "epoch": 0.3231869254341164, + "grad_norm": 1.4062154194413885, + "learning_rate": 1.58209666755454e-05, + "loss": 0.6229, + "step": 3164 + }, + { + "epoch": 0.3232890704800817, + "grad_norm": 1.5092287280348367, + "learning_rate": 1.581827628697463e-05, + "loss": 0.6711, + "step": 3165 + }, + { + "epoch": 0.323391215526047, + "grad_norm": 1.4453441331123345, + "learning_rate": 1.5815585261589755e-05, + "loss": 0.7218, + "step": 3166 + }, + { + "epoch": 0.3234933605720123, + "grad_norm": 1.6067039256901203, + "learning_rate": 1.581289359968532e-05, + "loss": 0.7231, + "step": 3167 + }, + { + "epoch": 0.3235955056179775, + "grad_norm": 1.2805422802578352, + "learning_rate": 1.5810201301555922e-05, + "loss": 0.6406, + "step": 3168 + }, + { + "epoch": 0.3236976506639428, + "grad_norm": 1.4944750854197961, + "learning_rate": 1.5807508367496238e-05, + "loss": 0.7755, + "step": 3169 + }, + { + "epoch": 0.3237997957099081, + "grad_norm": 1.439145131178109, + "learning_rate": 1.5804814797801014e-05, + "loss": 0.7648, + "step": 3170 + }, + { + "epoch": 0.32390194075587336, + "grad_norm": 1.490692409040485, + "learning_rate": 1.5802120592765055e-05, + "loss": 0.7744, + "step": 3171 + }, + { + "epoch": 0.3240040858018386, + "grad_norm": 1.4527040626391075, + "learning_rate": 1.579942575268325e-05, + "loss": 0.72, + "step": 3172 + }, + { + "epoch": 0.3241062308478039, + "grad_norm": 1.5082261146308416, + "learning_rate": 1.5796730277850554e-05, + "loss": 0.6905, + "step": 3173 + }, + { + "epoch": 0.32420837589376916, + "grad_norm": 1.488172649732497, + "learning_rate": 1.5794034168561984e-05, + "loss": 0.7455, + "step": 3174 + }, + { + "epoch": 0.32431052093973445, + "grad_norm": 1.4513689627174102, + "learning_rate": 1.5791337425112626e-05, + "loss": 0.7022, + "step": 3175 + }, + { + "epoch": 0.3244126659856997, + "grad_norm": 1.4647076374182229, + "learning_rate": 1.5788640047797645e-05, + "loss": 0.7983, + "step": 3176 + }, + { + "epoch": 0.32451481103166496, + "grad_norm": 1.5571821743789693, + "learning_rate": 1.5785942036912275e-05, + "loss": 0.7128, + "step": 3177 + }, + { + "epoch": 0.32461695607763025, + "grad_norm": 1.5453019596632513, + "learning_rate": 1.5783243392751806e-05, + "loss": 0.7323, + "step": 3178 + }, + { + "epoch": 0.32471910112359553, + "grad_norm": 1.6056266796390353, + "learning_rate": 1.5780544115611615e-05, + "loss": 0.7751, + "step": 3179 + }, + { + "epoch": 0.32482124616956076, + "grad_norm": 1.4771976341250417, + "learning_rate": 1.5777844205787133e-05, + "loss": 0.7415, + "step": 3180 + }, + { + "epoch": 0.32492339121552605, + "grad_norm": 1.5650452541116633, + "learning_rate": 1.577514366357387e-05, + "loss": 0.7624, + "step": 3181 + }, + { + "epoch": 0.32502553626149133, + "grad_norm": 1.4604677141952278, + "learning_rate": 1.5772442489267406e-05, + "loss": 0.6682, + "step": 3182 + }, + { + "epoch": 0.32512768130745656, + "grad_norm": 1.515884128103966, + "learning_rate": 1.576974068316338e-05, + "loss": 0.7131, + "step": 3183 + }, + { + "epoch": 0.32522982635342185, + "grad_norm": 1.4612145656279392, + "learning_rate": 1.5767038245557505e-05, + "loss": 0.7247, + "step": 3184 + }, + { + "epoch": 0.32533197139938713, + "grad_norm": 1.3295340565880491, + "learning_rate": 1.576433517674557e-05, + "loss": 0.7107, + "step": 3185 + }, + { + "epoch": 0.3254341164453524, + "grad_norm": 1.4142487280569698, + "learning_rate": 1.5761631477023426e-05, + "loss": 0.7974, + "step": 3186 + }, + { + "epoch": 0.32553626149131765, + "grad_norm": 1.379941691974214, + "learning_rate": 1.5758927146686997e-05, + "loss": 0.5921, + "step": 3187 + }, + { + "epoch": 0.32563840653728293, + "grad_norm": 1.4172497954989012, + "learning_rate": 1.5756222186032268e-05, + "loss": 0.7809, + "step": 3188 + }, + { + "epoch": 0.3257405515832482, + "grad_norm": 1.5045828061375472, + "learning_rate": 1.5753516595355303e-05, + "loss": 0.8268, + "step": 3189 + }, + { + "epoch": 0.3258426966292135, + "grad_norm": 1.59257601836005, + "learning_rate": 1.575081037495223e-05, + "loss": 0.7619, + "step": 3190 + }, + { + "epoch": 0.32594484167517873, + "grad_norm": 1.402149768876966, + "learning_rate": 1.5748103525119245e-05, + "loss": 0.7387, + "step": 3191 + }, + { + "epoch": 0.326046986721144, + "grad_norm": 1.4853098463676446, + "learning_rate": 1.5745396046152612e-05, + "loss": 0.7495, + "step": 3192 + }, + { + "epoch": 0.3261491317671093, + "grad_norm": 1.4255981624160279, + "learning_rate": 1.5742687938348674e-05, + "loss": 0.6476, + "step": 3193 + }, + { + "epoch": 0.3262512768130746, + "grad_norm": 1.4539988230119991, + "learning_rate": 1.573997920200383e-05, + "loss": 0.8233, + "step": 3194 + }, + { + "epoch": 0.3263534218590398, + "grad_norm": 1.6198400467018736, + "learning_rate": 1.5737269837414554e-05, + "loss": 0.742, + "step": 3195 + }, + { + "epoch": 0.3264555669050051, + "grad_norm": 1.5534621823727732, + "learning_rate": 1.5734559844877385e-05, + "loss": 0.6889, + "step": 3196 + }, + { + "epoch": 0.3265577119509704, + "grad_norm": 1.3695887739846708, + "learning_rate": 1.573184922468894e-05, + "loss": 0.7033, + "step": 3197 + }, + { + "epoch": 0.32665985699693567, + "grad_norm": 1.5309164790822922, + "learning_rate": 1.5729137977145895e-05, + "loss": 0.6969, + "step": 3198 + }, + { + "epoch": 0.3267620020429009, + "grad_norm": 1.8111785168776477, + "learning_rate": 1.5726426102544994e-05, + "loss": 0.6838, + "step": 3199 + }, + { + "epoch": 0.3268641470888662, + "grad_norm": 1.4252885476313923, + "learning_rate": 1.5723713601183056e-05, + "loss": 0.726, + "step": 3200 + }, + { + "epoch": 0.32696629213483147, + "grad_norm": 1.6040566064955546, + "learning_rate": 1.5721000473356967e-05, + "loss": 0.7839, + "step": 3201 + }, + { + "epoch": 0.32706843718079676, + "grad_norm": 1.7360121551456873, + "learning_rate": 1.571828671936368e-05, + "loss": 0.7981, + "step": 3202 + }, + { + "epoch": 0.327170582226762, + "grad_norm": 1.4428765490399944, + "learning_rate": 1.5715572339500217e-05, + "loss": 0.7501, + "step": 3203 + }, + { + "epoch": 0.32727272727272727, + "grad_norm": 1.2929382342232392, + "learning_rate": 1.5712857334063668e-05, + "loss": 0.6532, + "step": 3204 + }, + { + "epoch": 0.32737487231869256, + "grad_norm": 1.376749520144248, + "learning_rate": 1.5710141703351195e-05, + "loss": 0.665, + "step": 3205 + }, + { + "epoch": 0.32747701736465784, + "grad_norm": 1.504068715685511, + "learning_rate": 1.570742544766002e-05, + "loss": 0.7442, + "step": 3206 + }, + { + "epoch": 0.32757916241062307, + "grad_norm": 1.3437591305543541, + "learning_rate": 1.5704708567287442e-05, + "loss": 0.7324, + "step": 3207 + }, + { + "epoch": 0.32768130745658836, + "grad_norm": 1.6581196350180274, + "learning_rate": 1.570199106253083e-05, + "loss": 0.7784, + "step": 3208 + }, + { + "epoch": 0.32778345250255364, + "grad_norm": 1.5000147236590053, + "learning_rate": 1.569927293368761e-05, + "loss": 0.7552, + "step": 3209 + }, + { + "epoch": 0.32788559754851887, + "grad_norm": 1.4104842674995575, + "learning_rate": 1.5696554181055287e-05, + "loss": 0.6991, + "step": 3210 + }, + { + "epoch": 0.32798774259448416, + "grad_norm": 1.4944582596542075, + "learning_rate": 1.5693834804931424e-05, + "loss": 0.7273, + "step": 3211 + }, + { + "epoch": 0.32808988764044944, + "grad_norm": 1.4642661036929416, + "learning_rate": 1.5691114805613668e-05, + "loss": 0.8251, + "step": 3212 + }, + { + "epoch": 0.3281920326864147, + "grad_norm": 1.4080630930759865, + "learning_rate": 1.5688394183399717e-05, + "loss": 0.8083, + "step": 3213 + }, + { + "epoch": 0.32829417773237995, + "grad_norm": 1.5462047125626246, + "learning_rate": 1.5685672938587347e-05, + "loss": 0.827, + "step": 3214 + }, + { + "epoch": 0.32839632277834524, + "grad_norm": 1.4190457700193895, + "learning_rate": 1.56829510714744e-05, + "loss": 0.6672, + "step": 3215 + }, + { + "epoch": 0.3284984678243105, + "grad_norm": 1.4664530886055338, + "learning_rate": 1.5680228582358786e-05, + "loss": 0.7056, + "step": 3216 + }, + { + "epoch": 0.3286006128702758, + "grad_norm": 1.4884559542880456, + "learning_rate": 1.567750547153849e-05, + "loss": 0.6796, + "step": 3217 + }, + { + "epoch": 0.32870275791624104, + "grad_norm": 1.5411691071699969, + "learning_rate": 1.5674781739311545e-05, + "loss": 0.8391, + "step": 3218 + }, + { + "epoch": 0.3288049029622063, + "grad_norm": 1.4377979561367276, + "learning_rate": 1.5672057385976076e-05, + "loss": 0.6357, + "step": 3219 + }, + { + "epoch": 0.3289070480081716, + "grad_norm": 1.417739240547038, + "learning_rate": 1.5669332411830258e-05, + "loss": 0.6553, + "step": 3220 + }, + { + "epoch": 0.3290091930541369, + "grad_norm": 1.4295232627732297, + "learning_rate": 1.566660681717235e-05, + "loss": 0.7158, + "step": 3221 + }, + { + "epoch": 0.3291113381001021, + "grad_norm": 1.3991895310723974, + "learning_rate": 1.566388060230066e-05, + "loss": 0.6807, + "step": 3222 + }, + { + "epoch": 0.3292134831460674, + "grad_norm": 1.4739461852517082, + "learning_rate": 1.5661153767513582e-05, + "loss": 0.8427, + "step": 3223 + }, + { + "epoch": 0.3293156281920327, + "grad_norm": 1.5325490107618203, + "learning_rate": 1.565842631310956e-05, + "loss": 0.746, + "step": 3224 + }, + { + "epoch": 0.329417773237998, + "grad_norm": 1.4394424834094535, + "learning_rate": 1.5655698239387128e-05, + "loss": 0.7761, + "step": 3225 + }, + { + "epoch": 0.3295199182839632, + "grad_norm": 1.4830921796079124, + "learning_rate": 1.5652969546644872e-05, + "loss": 0.8025, + "step": 3226 + }, + { + "epoch": 0.3296220633299285, + "grad_norm": 1.4858218100554632, + "learning_rate": 1.5650240235181443e-05, + "loss": 0.6648, + "step": 3227 + }, + { + "epoch": 0.3297242083758938, + "grad_norm": 1.4170608906685958, + "learning_rate": 1.564751030529557e-05, + "loss": 0.7561, + "step": 3228 + }, + { + "epoch": 0.32982635342185906, + "grad_norm": 1.5587312785237493, + "learning_rate": 1.5644779757286045e-05, + "loss": 0.7219, + "step": 3229 + }, + { + "epoch": 0.3299284984678243, + "grad_norm": 1.5105058862432446, + "learning_rate": 1.564204859145173e-05, + "loss": 0.6899, + "step": 3230 + }, + { + "epoch": 0.3300306435137896, + "grad_norm": 1.3537566807077885, + "learning_rate": 1.563931680809155e-05, + "loss": 0.7471, + "step": 3231 + }, + { + "epoch": 0.33013278855975486, + "grad_norm": 1.5907046914699265, + "learning_rate": 1.5636584407504503e-05, + "loss": 0.7846, + "step": 3232 + }, + { + "epoch": 0.33023493360572015, + "grad_norm": 1.5175111835997452, + "learning_rate": 1.563385138998965e-05, + "loss": 0.6581, + "step": 3233 + }, + { + "epoch": 0.3303370786516854, + "grad_norm": 1.4820670821889848, + "learning_rate": 1.5631117755846124e-05, + "loss": 0.6944, + "step": 3234 + }, + { + "epoch": 0.33043922369765066, + "grad_norm": 1.475688515608224, + "learning_rate": 1.562838350537312e-05, + "loss": 0.7611, + "step": 3235 + }, + { + "epoch": 0.33054136874361595, + "grad_norm": 1.4861096018849942, + "learning_rate": 1.5625648638869907e-05, + "loss": 0.7882, + "step": 3236 + }, + { + "epoch": 0.3306435137895812, + "grad_norm": 1.6703513440105708, + "learning_rate": 1.5622913156635814e-05, + "loss": 0.7852, + "step": 3237 + }, + { + "epoch": 0.33074565883554646, + "grad_norm": 1.4250526659507556, + "learning_rate": 1.562017705897024e-05, + "loss": 0.7116, + "step": 3238 + }, + { + "epoch": 0.33084780388151175, + "grad_norm": 1.4899933030397654, + "learning_rate": 1.5617440346172662e-05, + "loss": 0.8083, + "step": 3239 + }, + { + "epoch": 0.33094994892747703, + "grad_norm": 1.5058450739569742, + "learning_rate": 1.5614703018542605e-05, + "loss": 0.7349, + "step": 3240 + }, + { + "epoch": 0.33105209397344226, + "grad_norm": 1.5296706420979667, + "learning_rate": 1.5611965076379675e-05, + "loss": 0.8003, + "step": 3241 + }, + { + "epoch": 0.33115423901940755, + "grad_norm": 1.389387504490224, + "learning_rate": 1.5609226519983542e-05, + "loss": 0.7483, + "step": 3242 + }, + { + "epoch": 0.33125638406537283, + "grad_norm": 1.4157852037769132, + "learning_rate": 1.5606487349653945e-05, + "loss": 0.747, + "step": 3243 + }, + { + "epoch": 0.3313585291113381, + "grad_norm": 1.3594504359038992, + "learning_rate": 1.5603747565690682e-05, + "loss": 0.7737, + "step": 3244 + }, + { + "epoch": 0.33146067415730335, + "grad_norm": 1.4805687007310917, + "learning_rate": 1.560100716839363e-05, + "loss": 0.7246, + "step": 3245 + }, + { + "epoch": 0.33156281920326863, + "grad_norm": 1.4705456276609332, + "learning_rate": 1.5598266158062724e-05, + "loss": 0.78, + "step": 3246 + }, + { + "epoch": 0.3316649642492339, + "grad_norm": 1.4741283280059332, + "learning_rate": 1.559552453499797e-05, + "loss": 0.7354, + "step": 3247 + }, + { + "epoch": 0.3317671092951992, + "grad_norm": 1.582734547492715, + "learning_rate": 1.5592782299499437e-05, + "loss": 0.7321, + "step": 3248 + }, + { + "epoch": 0.33186925434116443, + "grad_norm": 1.420538424313716, + "learning_rate": 1.559003945186727e-05, + "loss": 0.7086, + "step": 3249 + }, + { + "epoch": 0.3319713993871297, + "grad_norm": 1.3546936074253384, + "learning_rate": 1.5587295992401675e-05, + "loss": 0.7241, + "step": 3250 + }, + { + "epoch": 0.332073544433095, + "grad_norm": 1.4814248709663673, + "learning_rate": 1.558455192140292e-05, + "loss": 0.7578, + "step": 3251 + }, + { + "epoch": 0.3321756894790603, + "grad_norm": 1.482691396240518, + "learning_rate": 1.5581807239171353e-05, + "loss": 0.7862, + "step": 3252 + }, + { + "epoch": 0.3322778345250255, + "grad_norm": 1.5750308813678915, + "learning_rate": 1.557906194600738e-05, + "loss": 0.8405, + "step": 3253 + }, + { + "epoch": 0.3323799795709908, + "grad_norm": 1.5687637368943848, + "learning_rate": 1.557631604221147e-05, + "loss": 0.8054, + "step": 3254 + }, + { + "epoch": 0.3324821246169561, + "grad_norm": 1.3853995155285455, + "learning_rate": 1.5573569528084163e-05, + "loss": 0.6503, + "step": 3255 + }, + { + "epoch": 0.3325842696629214, + "grad_norm": 1.4712951004815336, + "learning_rate": 1.5570822403926072e-05, + "loss": 0.7611, + "step": 3256 + }, + { + "epoch": 0.3326864147088866, + "grad_norm": 1.3936765100370476, + "learning_rate": 1.556807467003787e-05, + "loss": 0.6976, + "step": 3257 + }, + { + "epoch": 0.3327885597548519, + "grad_norm": 1.3901479111218875, + "learning_rate": 1.55653263267203e-05, + "loss": 0.6964, + "step": 3258 + }, + { + "epoch": 0.33289070480081717, + "grad_norm": 1.4568241506181387, + "learning_rate": 1.5562577374274165e-05, + "loss": 0.7547, + "step": 3259 + }, + { + "epoch": 0.33299284984678246, + "grad_norm": 1.4273555373360076, + "learning_rate": 1.5559827813000343e-05, + "loss": 0.7569, + "step": 3260 + }, + { + "epoch": 0.3330949948927477, + "grad_norm": 1.3646348030683986, + "learning_rate": 1.5557077643199775e-05, + "loss": 0.6259, + "step": 3261 + }, + { + "epoch": 0.33319713993871297, + "grad_norm": 1.4270874474809343, + "learning_rate": 1.5554326865173468e-05, + "loss": 0.7927, + "step": 3262 + }, + { + "epoch": 0.33329928498467826, + "grad_norm": 1.4487679303899983, + "learning_rate": 1.5551575479222497e-05, + "loss": 0.6719, + "step": 3263 + }, + { + "epoch": 0.3334014300306435, + "grad_norm": 1.4209759881235449, + "learning_rate": 1.5548823485648003e-05, + "loss": 0.6819, + "step": 3264 + }, + { + "epoch": 0.33350357507660877, + "grad_norm": 1.355652353074514, + "learning_rate": 1.5546070884751197e-05, + "loss": 0.6353, + "step": 3265 + }, + { + "epoch": 0.33360572012257406, + "grad_norm": 1.5064286591210039, + "learning_rate": 1.5543317676833346e-05, + "loss": 0.7923, + "step": 3266 + }, + { + "epoch": 0.33370786516853934, + "grad_norm": 1.4405624564922124, + "learning_rate": 1.5540563862195796e-05, + "loss": 0.6525, + "step": 3267 + }, + { + "epoch": 0.33381001021450457, + "grad_norm": 1.3540593012084174, + "learning_rate": 1.553780944113995e-05, + "loss": 0.7083, + "step": 3268 + }, + { + "epoch": 0.33391215526046986, + "grad_norm": 1.5304081513299377, + "learning_rate": 1.5535054413967282e-05, + "loss": 0.7905, + "step": 3269 + }, + { + "epoch": 0.33401430030643514, + "grad_norm": 1.3942962524055378, + "learning_rate": 1.5532298780979334e-05, + "loss": 0.776, + "step": 3270 + }, + { + "epoch": 0.3341164453524004, + "grad_norm": 1.5065332611311364, + "learning_rate": 1.552954254247771e-05, + "loss": 0.6446, + "step": 3271 + }, + { + "epoch": 0.33421859039836566, + "grad_norm": 1.3934988741390135, + "learning_rate": 1.5526785698764083e-05, + "loss": 0.7259, + "step": 3272 + }, + { + "epoch": 0.33432073544433094, + "grad_norm": 1.621800375156446, + "learning_rate": 1.552402825014019e-05, + "loss": 0.7743, + "step": 3273 + }, + { + "epoch": 0.3344228804902962, + "grad_norm": 1.5317575709755615, + "learning_rate": 1.5521270196907833e-05, + "loss": 0.7385, + "step": 3274 + }, + { + "epoch": 0.3345250255362615, + "grad_norm": 1.3858287353078322, + "learning_rate": 1.5518511539368887e-05, + "loss": 0.6952, + "step": 3275 + }, + { + "epoch": 0.33462717058222674, + "grad_norm": 1.3232904159298622, + "learning_rate": 1.551575227782529e-05, + "loss": 0.6959, + "step": 3276 + }, + { + "epoch": 0.334729315628192, + "grad_norm": 1.3327894417159267, + "learning_rate": 1.551299241257904e-05, + "loss": 0.6131, + "step": 3277 + }, + { + "epoch": 0.3348314606741573, + "grad_norm": 1.4482016217809337, + "learning_rate": 1.551023194393221e-05, + "loss": 0.8464, + "step": 3278 + }, + { + "epoch": 0.3349336057201226, + "grad_norm": 1.5305604344737402, + "learning_rate": 1.5507470872186937e-05, + "loss": 0.82, + "step": 3279 + }, + { + "epoch": 0.3350357507660878, + "grad_norm": 1.5473837823763479, + "learning_rate": 1.550470919764541e-05, + "loss": 0.6676, + "step": 3280 + }, + { + "epoch": 0.3351378958120531, + "grad_norm": 1.4840177997712543, + "learning_rate": 1.5501946920609913e-05, + "loss": 0.7853, + "step": 3281 + }, + { + "epoch": 0.3352400408580184, + "grad_norm": 1.4862972202392228, + "learning_rate": 1.5499184041382766e-05, + "loss": 0.7284, + "step": 3282 + }, + { + "epoch": 0.3353421859039837, + "grad_norm": 1.389601148518838, + "learning_rate": 1.5496420560266376e-05, + "loss": 0.7483, + "step": 3283 + }, + { + "epoch": 0.3354443309499489, + "grad_norm": 1.554107948782582, + "learning_rate": 1.54936564775632e-05, + "loss": 0.8009, + "step": 3284 + }, + { + "epoch": 0.3355464759959142, + "grad_norm": 1.540788565703329, + "learning_rate": 1.5490891793575776e-05, + "loss": 0.7022, + "step": 3285 + }, + { + "epoch": 0.3356486210418795, + "grad_norm": 1.3546561314558745, + "learning_rate": 1.5488126508606703e-05, + "loss": 0.6079, + "step": 3286 + }, + { + "epoch": 0.33575076608784477, + "grad_norm": 1.624089662665287, + "learning_rate": 1.548536062295863e-05, + "loss": 0.7486, + "step": 3287 + }, + { + "epoch": 0.33585291113381, + "grad_norm": 1.7169547667715563, + "learning_rate": 1.5482594136934294e-05, + "loss": 0.7682, + "step": 3288 + }, + { + "epoch": 0.3359550561797753, + "grad_norm": 1.5872660302032644, + "learning_rate": 1.5479827050836493e-05, + "loss": 0.7647, + "step": 3289 + }, + { + "epoch": 0.33605720122574056, + "grad_norm": 1.463373074227921, + "learning_rate": 1.5477059364968075e-05, + "loss": 0.7035, + "step": 3290 + }, + { + "epoch": 0.3361593462717058, + "grad_norm": 1.5063153124677793, + "learning_rate": 1.5474291079631974e-05, + "loss": 0.717, + "step": 3291 + }, + { + "epoch": 0.3362614913176711, + "grad_norm": 1.6154186070418297, + "learning_rate": 1.5471522195131176e-05, + "loss": 0.7438, + "step": 3292 + }, + { + "epoch": 0.33636363636363636, + "grad_norm": 1.5740479828427727, + "learning_rate": 1.5468752711768737e-05, + "loss": 0.8456, + "step": 3293 + }, + { + "epoch": 0.33646578140960165, + "grad_norm": 1.6122946783515235, + "learning_rate": 1.5465982629847785e-05, + "loss": 0.8598, + "step": 3294 + }, + { + "epoch": 0.3365679264555669, + "grad_norm": 1.5752251537811204, + "learning_rate": 1.54632119496715e-05, + "loss": 0.7607, + "step": 3295 + }, + { + "epoch": 0.33667007150153216, + "grad_norm": 1.51489808651215, + "learning_rate": 1.5460440671543135e-05, + "loss": 0.7611, + "step": 3296 + }, + { + "epoch": 0.33677221654749745, + "grad_norm": 1.3844384735656063, + "learning_rate": 1.5457668795766016e-05, + "loss": 0.647, + "step": 3297 + }, + { + "epoch": 0.33687436159346273, + "grad_norm": 1.404439985078323, + "learning_rate": 1.5454896322643516e-05, + "loss": 0.7321, + "step": 3298 + }, + { + "epoch": 0.33697650663942796, + "grad_norm": 1.3924645822976185, + "learning_rate": 1.5452123252479092e-05, + "loss": 0.6265, + "step": 3299 + }, + { + "epoch": 0.33707865168539325, + "grad_norm": 1.3772933745027827, + "learning_rate": 1.5449349585576254e-05, + "loss": 0.7704, + "step": 3300 + }, + { + "epoch": 0.33718079673135853, + "grad_norm": 1.4226809755292762, + "learning_rate": 1.5446575322238584e-05, + "loss": 0.7358, + "step": 3301 + }, + { + "epoch": 0.3372829417773238, + "grad_norm": 1.3402406284942294, + "learning_rate": 1.5443800462769728e-05, + "loss": 0.6951, + "step": 3302 + }, + { + "epoch": 0.33738508682328905, + "grad_norm": 1.53859699252445, + "learning_rate": 1.5441025007473394e-05, + "loss": 0.7451, + "step": 3303 + }, + { + "epoch": 0.33748723186925433, + "grad_norm": 1.5790097234292393, + "learning_rate": 1.543824895665335e-05, + "loss": 0.7098, + "step": 3304 + }, + { + "epoch": 0.3375893769152196, + "grad_norm": 1.3331208242031245, + "learning_rate": 1.543547231061345e-05, + "loss": 0.6585, + "step": 3305 + }, + { + "epoch": 0.3376915219611849, + "grad_norm": 1.60704440583945, + "learning_rate": 1.5432695069657596e-05, + "loss": 0.8641, + "step": 3306 + }, + { + "epoch": 0.33779366700715013, + "grad_norm": 1.4445083378762913, + "learning_rate": 1.5429917234089758e-05, + "loss": 0.653, + "step": 3307 + }, + { + "epoch": 0.3378958120531154, + "grad_norm": 1.3998612812053595, + "learning_rate": 1.5427138804213962e-05, + "loss": 0.7616, + "step": 3308 + }, + { + "epoch": 0.3379979570990807, + "grad_norm": 1.5914737369902654, + "learning_rate": 1.5424359780334326e-05, + "loss": 0.7901, + "step": 3309 + }, + { + "epoch": 0.338100102145046, + "grad_norm": 1.4992085523670196, + "learning_rate": 1.5421580162755003e-05, + "loss": 0.7541, + "step": 3310 + }, + { + "epoch": 0.3382022471910112, + "grad_norm": 1.5175183486489991, + "learning_rate": 1.541879995178023e-05, + "loss": 0.7468, + "step": 3311 + }, + { + "epoch": 0.3383043922369765, + "grad_norm": 1.5274448962791183, + "learning_rate": 1.54160191477143e-05, + "loss": 0.6931, + "step": 3312 + }, + { + "epoch": 0.3384065372829418, + "grad_norm": 1.44161675607315, + "learning_rate": 1.541323775086158e-05, + "loss": 0.6184, + "step": 3313 + }, + { + "epoch": 0.3385086823289071, + "grad_norm": 1.4060724041997152, + "learning_rate": 1.5410455761526484e-05, + "loss": 0.7778, + "step": 3314 + }, + { + "epoch": 0.3386108273748723, + "grad_norm": 1.4299226949615216, + "learning_rate": 1.5407673180013513e-05, + "loss": 0.7344, + "step": 3315 + }, + { + "epoch": 0.3387129724208376, + "grad_norm": 1.4765630577339524, + "learning_rate": 1.5404890006627214e-05, + "loss": 0.8285, + "step": 3316 + }, + { + "epoch": 0.3388151174668029, + "grad_norm": 1.1260006448172057, + "learning_rate": 1.5402106241672218e-05, + "loss": 0.5868, + "step": 3317 + }, + { + "epoch": 0.33891726251276816, + "grad_norm": 1.5121913297871572, + "learning_rate": 1.5399321885453204e-05, + "loss": 0.717, + "step": 3318 + }, + { + "epoch": 0.3390194075587334, + "grad_norm": 1.3312734342530954, + "learning_rate": 1.5396536938274915e-05, + "loss": 0.7897, + "step": 3319 + }, + { + "epoch": 0.3391215526046987, + "grad_norm": 1.48292191731946, + "learning_rate": 1.5393751400442176e-05, + "loss": 0.7083, + "step": 3320 + }, + { + "epoch": 0.33922369765066396, + "grad_norm": 1.4965597775256376, + "learning_rate": 1.539096527225986e-05, + "loss": 0.7132, + "step": 3321 + }, + { + "epoch": 0.3393258426966292, + "grad_norm": 1.4123325764417054, + "learning_rate": 1.538817855403291e-05, + "loss": 0.7032, + "step": 3322 + }, + { + "epoch": 0.3394279877425945, + "grad_norm": 1.6268857247240145, + "learning_rate": 1.538539124606634e-05, + "loss": 0.7439, + "step": 3323 + }, + { + "epoch": 0.33953013278855976, + "grad_norm": 1.4891674071196392, + "learning_rate": 1.5382603348665215e-05, + "loss": 0.7959, + "step": 3324 + }, + { + "epoch": 0.33963227783452504, + "grad_norm": 1.463889003517186, + "learning_rate": 1.5379814862134677e-05, + "loss": 0.721, + "step": 3325 + }, + { + "epoch": 0.33973442288049027, + "grad_norm": 1.285506973442181, + "learning_rate": 1.537702578677993e-05, + "loss": 0.6956, + "step": 3326 + }, + { + "epoch": 0.33983656792645556, + "grad_norm": 1.5110286865269755, + "learning_rate": 1.5374236122906233e-05, + "loss": 0.754, + "step": 3327 + }, + { + "epoch": 0.33993871297242084, + "grad_norm": 1.4863904476908825, + "learning_rate": 1.537144587081892e-05, + "loss": 0.6914, + "step": 3328 + }, + { + "epoch": 0.3400408580183861, + "grad_norm": 1.5256055667223307, + "learning_rate": 1.5368655030823388e-05, + "loss": 0.7439, + "step": 3329 + }, + { + "epoch": 0.34014300306435136, + "grad_norm": 1.4284593196147581, + "learning_rate": 1.536586360322509e-05, + "loss": 0.6974, + "step": 3330 + }, + { + "epoch": 0.34024514811031664, + "grad_norm": 1.5563762394161158, + "learning_rate": 1.536307158832956e-05, + "loss": 0.7644, + "step": 3331 + }, + { + "epoch": 0.3403472931562819, + "grad_norm": 1.256478386698582, + "learning_rate": 1.5360278986442376e-05, + "loss": 0.6987, + "step": 3332 + }, + { + "epoch": 0.3404494382022472, + "grad_norm": 1.518868212352236, + "learning_rate": 1.5357485797869192e-05, + "loss": 0.7461, + "step": 3333 + }, + { + "epoch": 0.34055158324821244, + "grad_norm": 1.5343462715601681, + "learning_rate": 1.5354692022915733e-05, + "loss": 0.7902, + "step": 3334 + }, + { + "epoch": 0.3406537282941777, + "grad_norm": 1.3447880836313535, + "learning_rate": 1.535189766188777e-05, + "loss": 0.7819, + "step": 3335 + }, + { + "epoch": 0.340755873340143, + "grad_norm": 1.5495100692126673, + "learning_rate": 1.5349102715091144e-05, + "loss": 0.7107, + "step": 3336 + }, + { + "epoch": 0.3408580183861083, + "grad_norm": 1.3718131329451053, + "learning_rate": 1.5346307182831775e-05, + "loss": 0.7419, + "step": 3337 + }, + { + "epoch": 0.3409601634320735, + "grad_norm": 1.4017797398540883, + "learning_rate": 1.534351106541563e-05, + "loss": 0.7271, + "step": 3338 + }, + { + "epoch": 0.3410623084780388, + "grad_norm": 1.4604436569380952, + "learning_rate": 1.5340714363148746e-05, + "loss": 0.7384, + "step": 3339 + }, + { + "epoch": 0.3411644535240041, + "grad_norm": 1.4253974854376215, + "learning_rate": 1.5337917076337222e-05, + "loss": 0.7415, + "step": 3340 + }, + { + "epoch": 0.3412665985699694, + "grad_norm": 1.5726795663048634, + "learning_rate": 1.533511920528723e-05, + "loss": 0.7816, + "step": 3341 + }, + { + "epoch": 0.3413687436159346, + "grad_norm": 1.4958733996899503, + "learning_rate": 1.5332320750304994e-05, + "loss": 0.8024, + "step": 3342 + }, + { + "epoch": 0.3414708886618999, + "grad_norm": 1.3021417431399696, + "learning_rate": 1.5329521711696805e-05, + "loss": 0.6668, + "step": 3343 + }, + { + "epoch": 0.3415730337078652, + "grad_norm": 1.465441727287025, + "learning_rate": 1.532672208976902e-05, + "loss": 0.9229, + "step": 3344 + }, + { + "epoch": 0.34167517875383047, + "grad_norm": 1.553032757363597, + "learning_rate": 1.532392188482806e-05, + "loss": 0.7421, + "step": 3345 + }, + { + "epoch": 0.3417773237997957, + "grad_norm": 1.66435108372262, + "learning_rate": 1.5321121097180414e-05, + "loss": 0.6508, + "step": 3346 + }, + { + "epoch": 0.341879468845761, + "grad_norm": 1.365435676369416, + "learning_rate": 1.5318319727132625e-05, + "loss": 0.782, + "step": 3347 + }, + { + "epoch": 0.34198161389172627, + "grad_norm": 1.445110387830733, + "learning_rate": 1.5315517774991303e-05, + "loss": 0.8075, + "step": 3348 + }, + { + "epoch": 0.3420837589376915, + "grad_norm": 1.3394268781671446, + "learning_rate": 1.5312715241063128e-05, + "loss": 0.7988, + "step": 3349 + }, + { + "epoch": 0.3421859039836568, + "grad_norm": 1.4619371548541915, + "learning_rate": 1.530991212565484e-05, + "loss": 0.7783, + "step": 3350 + }, + { + "epoch": 0.34228804902962207, + "grad_norm": 1.4910610273297067, + "learning_rate": 1.5307108429073237e-05, + "loss": 0.7818, + "step": 3351 + }, + { + "epoch": 0.34239019407558735, + "grad_norm": 1.3957346663597578, + "learning_rate": 1.5304304151625185e-05, + "loss": 0.723, + "step": 3352 + }, + { + "epoch": 0.3424923391215526, + "grad_norm": 1.57406759961631, + "learning_rate": 1.530149929361762e-05, + "loss": 0.6834, + "step": 3353 + }, + { + "epoch": 0.34259448416751787, + "grad_norm": 1.5363157234836475, + "learning_rate": 1.529869385535753e-05, + "loss": 0.7606, + "step": 3354 + }, + { + "epoch": 0.34269662921348315, + "grad_norm": 1.4365716888318951, + "learning_rate": 1.5295887837151977e-05, + "loss": 0.7621, + "step": 3355 + }, + { + "epoch": 0.34279877425944844, + "grad_norm": 1.313998023706264, + "learning_rate": 1.5293081239308074e-05, + "loss": 0.6554, + "step": 3356 + }, + { + "epoch": 0.34290091930541367, + "grad_norm": 1.6567305420412044, + "learning_rate": 1.5290274062133015e-05, + "loss": 0.901, + "step": 3357 + }, + { + "epoch": 0.34300306435137895, + "grad_norm": 1.5225169070614142, + "learning_rate": 1.528746630593404e-05, + "loss": 0.7319, + "step": 3358 + }, + { + "epoch": 0.34310520939734424, + "grad_norm": 1.3491875636552484, + "learning_rate": 1.528465797101846e-05, + "loss": 0.7435, + "step": 3359 + }, + { + "epoch": 0.3432073544433095, + "grad_norm": 1.476300789979567, + "learning_rate": 1.528184905769365e-05, + "loss": 0.7791, + "step": 3360 + }, + { + "epoch": 0.34330949948927475, + "grad_norm": 1.5605263918426975, + "learning_rate": 1.527903956626705e-05, + "loss": 0.7029, + "step": 3361 + }, + { + "epoch": 0.34341164453524003, + "grad_norm": 1.3666902001507952, + "learning_rate": 1.527622949704616e-05, + "loss": 0.6527, + "step": 3362 + }, + { + "epoch": 0.3435137895812053, + "grad_norm": 1.3029056227785296, + "learning_rate": 1.5273418850338542e-05, + "loss": 0.688, + "step": 3363 + }, + { + "epoch": 0.3436159346271706, + "grad_norm": 1.3867867853246734, + "learning_rate": 1.527060762645182e-05, + "loss": 0.7276, + "step": 3364 + }, + { + "epoch": 0.34371807967313583, + "grad_norm": 1.4936931658326622, + "learning_rate": 1.5267795825693693e-05, + "loss": 0.8404, + "step": 3365 + }, + { + "epoch": 0.3438202247191011, + "grad_norm": 1.4503041754999721, + "learning_rate": 1.5264983448371907e-05, + "loss": 0.6835, + "step": 3366 + }, + { + "epoch": 0.3439223697650664, + "grad_norm": 1.436405493899439, + "learning_rate": 1.526217049479428e-05, + "loss": 0.7392, + "step": 3367 + }, + { + "epoch": 0.3440245148110317, + "grad_norm": 1.6534927222554228, + "learning_rate": 1.5259356965268695e-05, + "loss": 0.8119, + "step": 3368 + }, + { + "epoch": 0.3441266598569969, + "grad_norm": 1.5312410489326242, + "learning_rate": 1.5256542860103091e-05, + "loss": 0.811, + "step": 3369 + }, + { + "epoch": 0.3442288049029622, + "grad_norm": 1.5191926596049206, + "learning_rate": 1.5253728179605479e-05, + "loss": 0.846, + "step": 3370 + }, + { + "epoch": 0.3443309499489275, + "grad_norm": 1.4769420038099945, + "learning_rate": 1.5250912924083915e-05, + "loss": 0.8066, + "step": 3371 + }, + { + "epoch": 0.3444330949948928, + "grad_norm": 1.4546985029209918, + "learning_rate": 1.5248097093846545e-05, + "loss": 0.79, + "step": 3372 + }, + { + "epoch": 0.344535240040858, + "grad_norm": 1.3897632115074896, + "learning_rate": 1.5245280689201556e-05, + "loss": 0.815, + "step": 3373 + }, + { + "epoch": 0.3446373850868233, + "grad_norm": 1.4671974502722975, + "learning_rate": 1.5242463710457206e-05, + "loss": 0.7651, + "step": 3374 + }, + { + "epoch": 0.3447395301327886, + "grad_norm": 1.6341393402331448, + "learning_rate": 1.5239646157921817e-05, + "loss": 0.7912, + "step": 3375 + }, + { + "epoch": 0.3448416751787538, + "grad_norm": 1.4799974358657892, + "learning_rate": 1.523682803190377e-05, + "loss": 0.7871, + "step": 3376 + }, + { + "epoch": 0.3449438202247191, + "grad_norm": 1.4650289632212923, + "learning_rate": 1.5234009332711512e-05, + "loss": 0.6692, + "step": 3377 + }, + { + "epoch": 0.3450459652706844, + "grad_norm": 1.51744225649683, + "learning_rate": 1.523119006065355e-05, + "loss": 0.8473, + "step": 3378 + }, + { + "epoch": 0.34514811031664966, + "grad_norm": 1.3094245334154937, + "learning_rate": 1.5228370216038455e-05, + "loss": 0.7104, + "step": 3379 + }, + { + "epoch": 0.3452502553626149, + "grad_norm": 1.5442241536259158, + "learning_rate": 1.5225549799174863e-05, + "loss": 0.7623, + "step": 3380 + }, + { + "epoch": 0.3453524004085802, + "grad_norm": 1.4530108725648814, + "learning_rate": 1.522272881037147e-05, + "loss": 0.7884, + "step": 3381 + }, + { + "epoch": 0.34545454545454546, + "grad_norm": 1.5058314385062694, + "learning_rate": 1.5219907249937036e-05, + "loss": 0.7337, + "step": 3382 + }, + { + "epoch": 0.34555669050051074, + "grad_norm": 1.4492761932463916, + "learning_rate": 1.5217085118180377e-05, + "loss": 0.827, + "step": 3383 + }, + { + "epoch": 0.345658835546476, + "grad_norm": 1.470222863519962, + "learning_rate": 1.5214262415410384e-05, + "loss": 0.7552, + "step": 3384 + }, + { + "epoch": 0.34576098059244126, + "grad_norm": 1.5634339613105022, + "learning_rate": 1.5211439141936e-05, + "loss": 0.7548, + "step": 3385 + }, + { + "epoch": 0.34586312563840654, + "grad_norm": 1.3837891548369847, + "learning_rate": 1.5208615298066237e-05, + "loss": 0.7685, + "step": 3386 + }, + { + "epoch": 0.34596527068437183, + "grad_norm": 1.3664539290780435, + "learning_rate": 1.5205790884110161e-05, + "loss": 0.71, + "step": 3387 + }, + { + "epoch": 0.34606741573033706, + "grad_norm": 1.5372413174893773, + "learning_rate": 1.520296590037691e-05, + "loss": 0.7299, + "step": 3388 + }, + { + "epoch": 0.34616956077630234, + "grad_norm": 1.319893487694971, + "learning_rate": 1.5200140347175683e-05, + "loss": 0.7022, + "step": 3389 + }, + { + "epoch": 0.34627170582226763, + "grad_norm": 1.4939101597645068, + "learning_rate": 1.5197314224815732e-05, + "loss": 0.7206, + "step": 3390 + }, + { + "epoch": 0.3463738508682329, + "grad_norm": 1.4616953213066424, + "learning_rate": 1.5194487533606382e-05, + "loss": 0.7478, + "step": 3391 + }, + { + "epoch": 0.34647599591419814, + "grad_norm": 1.3979791502407504, + "learning_rate": 1.5191660273857013e-05, + "loss": 0.7256, + "step": 3392 + }, + { + "epoch": 0.3465781409601634, + "grad_norm": 1.5292771036050423, + "learning_rate": 1.5188832445877075e-05, + "loss": 0.7095, + "step": 3393 + }, + { + "epoch": 0.3466802860061287, + "grad_norm": 1.2429152568794204, + "learning_rate": 1.5186004049976075e-05, + "loss": 0.7607, + "step": 3394 + }, + { + "epoch": 0.346782431052094, + "grad_norm": 1.4826450261413437, + "learning_rate": 1.5183175086463577e-05, + "loss": 0.7012, + "step": 3395 + }, + { + "epoch": 0.3468845760980592, + "grad_norm": 1.351328627507031, + "learning_rate": 1.5180345555649221e-05, + "loss": 0.6151, + "step": 3396 + }, + { + "epoch": 0.3469867211440245, + "grad_norm": 1.5180780738584176, + "learning_rate": 1.5177515457842695e-05, + "loss": 0.7667, + "step": 3397 + }, + { + "epoch": 0.3470888661899898, + "grad_norm": 1.4311420068513432, + "learning_rate": 1.517468479335376e-05, + "loss": 0.6755, + "step": 3398 + }, + { + "epoch": 0.3471910112359551, + "grad_norm": 1.5549500688027087, + "learning_rate": 1.517185356249223e-05, + "loss": 0.7548, + "step": 3399 + }, + { + "epoch": 0.3472931562819203, + "grad_norm": 1.4420062826177098, + "learning_rate": 1.5169021765567982e-05, + "loss": 0.7128, + "step": 3400 + }, + { + "epoch": 0.3473953013278856, + "grad_norm": 1.57363409920895, + "learning_rate": 1.5166189402890964e-05, + "loss": 0.7371, + "step": 3401 + }, + { + "epoch": 0.3474974463738509, + "grad_norm": 1.4636790509833857, + "learning_rate": 1.516335647477118e-05, + "loss": 0.7682, + "step": 3402 + }, + { + "epoch": 0.3475995914198161, + "grad_norm": 1.398249307043079, + "learning_rate": 1.5160522981518693e-05, + "loss": 0.7503, + "step": 3403 + }, + { + "epoch": 0.3477017364657814, + "grad_norm": 1.3814416886241143, + "learning_rate": 1.5157688923443631e-05, + "loss": 0.6841, + "step": 3404 + }, + { + "epoch": 0.3478038815117467, + "grad_norm": 1.4062328275645273, + "learning_rate": 1.5154854300856183e-05, + "loss": 0.7348, + "step": 3405 + }, + { + "epoch": 0.34790602655771197, + "grad_norm": 1.4644360574743611, + "learning_rate": 1.5152019114066607e-05, + "loss": 0.6388, + "step": 3406 + }, + { + "epoch": 0.3480081716036772, + "grad_norm": 1.4108025819183712, + "learning_rate": 1.5149183363385204e-05, + "loss": 0.7791, + "step": 3407 + }, + { + "epoch": 0.3481103166496425, + "grad_norm": 1.324044635313514, + "learning_rate": 1.5146347049122359e-05, + "loss": 0.7488, + "step": 3408 + }, + { + "epoch": 0.34821246169560777, + "grad_norm": 1.2990363260539666, + "learning_rate": 1.5143510171588503e-05, + "loss": 0.5292, + "step": 3409 + }, + { + "epoch": 0.34831460674157305, + "grad_norm": 1.3252633140563592, + "learning_rate": 1.5140672731094132e-05, + "loss": 0.6736, + "step": 3410 + }, + { + "epoch": 0.3484167517875383, + "grad_norm": 1.5846164995459888, + "learning_rate": 1.5137834727949816e-05, + "loss": 0.7278, + "step": 3411 + }, + { + "epoch": 0.34851889683350357, + "grad_norm": 1.556090445659398, + "learning_rate": 1.5134996162466165e-05, + "loss": 0.6273, + "step": 3412 + }, + { + "epoch": 0.34862104187946885, + "grad_norm": 1.612785848448576, + "learning_rate": 1.5132157034953868e-05, + "loss": 0.7402, + "step": 3413 + }, + { + "epoch": 0.34872318692543414, + "grad_norm": 1.6135834246552838, + "learning_rate": 1.5129317345723666e-05, + "loss": 0.7582, + "step": 3414 + }, + { + "epoch": 0.34882533197139937, + "grad_norm": 1.4487798633456597, + "learning_rate": 1.5126477095086369e-05, + "loss": 0.6119, + "step": 3415 + }, + { + "epoch": 0.34892747701736465, + "grad_norm": 1.430202752605433, + "learning_rate": 1.512363628335284e-05, + "loss": 0.7548, + "step": 3416 + }, + { + "epoch": 0.34902962206332994, + "grad_norm": 1.4468497281872048, + "learning_rate": 1.5120794910834011e-05, + "loss": 0.8076, + "step": 3417 + }, + { + "epoch": 0.3491317671092952, + "grad_norm": 1.352474348751627, + "learning_rate": 1.511795297784087e-05, + "loss": 0.7886, + "step": 3418 + }, + { + "epoch": 0.34923391215526045, + "grad_norm": 1.4771702269152291, + "learning_rate": 1.5115110484684468e-05, + "loss": 0.7286, + "step": 3419 + }, + { + "epoch": 0.34933605720122574, + "grad_norm": 1.546063194323747, + "learning_rate": 1.5112267431675917e-05, + "loss": 0.6583, + "step": 3420 + }, + { + "epoch": 0.349438202247191, + "grad_norm": 1.5270215823614932, + "learning_rate": 1.51094238191264e-05, + "loss": 0.7424, + "step": 3421 + }, + { + "epoch": 0.3495403472931563, + "grad_norm": 1.4887727674525129, + "learning_rate": 1.5106579647347137e-05, + "loss": 0.7102, + "step": 3422 + }, + { + "epoch": 0.34964249233912154, + "grad_norm": 1.4410281802057714, + "learning_rate": 1.5103734916649435e-05, + "loss": 0.8522, + "step": 3423 + }, + { + "epoch": 0.3497446373850868, + "grad_norm": 1.3465310216277304, + "learning_rate": 1.510088962734465e-05, + "loss": 0.5364, + "step": 3424 + }, + { + "epoch": 0.3498467824310521, + "grad_norm": 1.35223319576278, + "learning_rate": 1.5098043779744199e-05, + "loss": 0.6654, + "step": 3425 + }, + { + "epoch": 0.3499489274770174, + "grad_norm": 1.5292233619163258, + "learning_rate": 1.5095197374159563e-05, + "loss": 0.6185, + "step": 3426 + }, + { + "epoch": 0.3500510725229826, + "grad_norm": 1.3571795257377226, + "learning_rate": 1.509235041090228e-05, + "loss": 0.8161, + "step": 3427 + }, + { + "epoch": 0.3501532175689479, + "grad_norm": 1.4044316185486687, + "learning_rate": 1.5089502890283956e-05, + "loss": 0.6069, + "step": 3428 + }, + { + "epoch": 0.3502553626149132, + "grad_norm": 1.4672762982420333, + "learning_rate": 1.5086654812616252e-05, + "loss": 0.7699, + "step": 3429 + }, + { + "epoch": 0.3503575076608784, + "grad_norm": 1.530689324186679, + "learning_rate": 1.5083806178210896e-05, + "loss": 0.7831, + "step": 3430 + }, + { + "epoch": 0.3504596527068437, + "grad_norm": 1.492672955090508, + "learning_rate": 1.5080956987379667e-05, + "loss": 0.764, + "step": 3431 + }, + { + "epoch": 0.350561797752809, + "grad_norm": 1.4420435803893972, + "learning_rate": 1.507810724043441e-05, + "loss": 0.6622, + "step": 3432 + }, + { + "epoch": 0.3506639427987743, + "grad_norm": 1.4629403640541534, + "learning_rate": 1.5075256937687037e-05, + "loss": 0.7666, + "step": 3433 + }, + { + "epoch": 0.3507660878447395, + "grad_norm": 1.3296728703171432, + "learning_rate": 1.5072406079449513e-05, + "loss": 0.6275, + "step": 3434 + }, + { + "epoch": 0.3508682328907048, + "grad_norm": 1.5129103232056698, + "learning_rate": 1.5069554666033868e-05, + "loss": 0.7632, + "step": 3435 + }, + { + "epoch": 0.3509703779366701, + "grad_norm": 1.3253104284979316, + "learning_rate": 1.5066702697752189e-05, + "loss": 0.6696, + "step": 3436 + }, + { + "epoch": 0.35107252298263536, + "grad_norm": 1.3599403749533974, + "learning_rate": 1.5063850174916623e-05, + "loss": 0.6437, + "step": 3437 + }, + { + "epoch": 0.3511746680286006, + "grad_norm": 1.4580357630489664, + "learning_rate": 1.5060997097839387e-05, + "loss": 0.7275, + "step": 3438 + }, + { + "epoch": 0.3512768130745659, + "grad_norm": 1.4802066664080362, + "learning_rate": 1.5058143466832746e-05, + "loss": 0.8175, + "step": 3439 + }, + { + "epoch": 0.35137895812053116, + "grad_norm": 1.4531321987947696, + "learning_rate": 1.5055289282209038e-05, + "loss": 0.7293, + "step": 3440 + }, + { + "epoch": 0.35148110316649644, + "grad_norm": 1.5941396051836583, + "learning_rate": 1.5052434544280653e-05, + "loss": 0.697, + "step": 3441 + }, + { + "epoch": 0.3515832482124617, + "grad_norm": 1.4136744475621397, + "learning_rate": 1.5049579253360036e-05, + "loss": 0.6729, + "step": 3442 + }, + { + "epoch": 0.35168539325842696, + "grad_norm": 1.546678772075708, + "learning_rate": 1.5046723409759714e-05, + "loss": 0.8121, + "step": 3443 + }, + { + "epoch": 0.35178753830439224, + "grad_norm": 1.363543008520354, + "learning_rate": 1.5043867013792247e-05, + "loss": 0.6115, + "step": 3444 + }, + { + "epoch": 0.35188968335035753, + "grad_norm": 1.5075641164706608, + "learning_rate": 1.5041010065770283e-05, + "loss": 0.7689, + "step": 3445 + }, + { + "epoch": 0.35199182839632276, + "grad_norm": 1.4259825773755006, + "learning_rate": 1.5038152566006509e-05, + "loss": 0.7309, + "step": 3446 + }, + { + "epoch": 0.35209397344228804, + "grad_norm": 1.508435323702614, + "learning_rate": 1.5035294514813683e-05, + "loss": 0.7875, + "step": 3447 + }, + { + "epoch": 0.35219611848825333, + "grad_norm": 1.449852434100444, + "learning_rate": 1.5032435912504614e-05, + "loss": 0.7518, + "step": 3448 + }, + { + "epoch": 0.3522982635342186, + "grad_norm": 1.3928249433419313, + "learning_rate": 1.5029576759392188e-05, + "loss": 0.7527, + "step": 3449 + }, + { + "epoch": 0.35240040858018384, + "grad_norm": 1.3565495072430203, + "learning_rate": 1.5026717055789335e-05, + "loss": 0.6281, + "step": 3450 + }, + { + "epoch": 0.35250255362614913, + "grad_norm": 1.4871685648001876, + "learning_rate": 1.5023856802009051e-05, + "loss": 0.66, + "step": 3451 + }, + { + "epoch": 0.3526046986721144, + "grad_norm": 1.3942758923372949, + "learning_rate": 1.5020995998364396e-05, + "loss": 0.7584, + "step": 3452 + }, + { + "epoch": 0.3527068437180797, + "grad_norm": 1.462698950660686, + "learning_rate": 1.5018134645168485e-05, + "loss": 0.7722, + "step": 3453 + }, + { + "epoch": 0.35280898876404493, + "grad_norm": 1.4361997782887943, + "learning_rate": 1.5015272742734492e-05, + "loss": 0.6669, + "step": 3454 + }, + { + "epoch": 0.3529111338100102, + "grad_norm": 1.4663552686239123, + "learning_rate": 1.501241029137566e-05, + "loss": 0.7998, + "step": 3455 + }, + { + "epoch": 0.3530132788559755, + "grad_norm": 1.2869539340291634, + "learning_rate": 1.5009547291405281e-05, + "loss": 0.6173, + "step": 3456 + }, + { + "epoch": 0.35311542390194073, + "grad_norm": 1.4809190532159078, + "learning_rate": 1.5006683743136718e-05, + "loss": 0.748, + "step": 3457 + }, + { + "epoch": 0.353217568947906, + "grad_norm": 1.3587705125384204, + "learning_rate": 1.5003819646883382e-05, + "loss": 0.7565, + "step": 3458 + }, + { + "epoch": 0.3533197139938713, + "grad_norm": 1.4941894834093674, + "learning_rate": 1.5000955002958755e-05, + "loss": 0.7689, + "step": 3459 + }, + { + "epoch": 0.3534218590398366, + "grad_norm": 1.4420640472446187, + "learning_rate": 1.4998089811676369e-05, + "loss": 0.6285, + "step": 3460 + }, + { + "epoch": 0.3535240040858018, + "grad_norm": 1.4642921926943364, + "learning_rate": 1.4995224073349823e-05, + "loss": 0.7587, + "step": 3461 + }, + { + "epoch": 0.3536261491317671, + "grad_norm": 1.4832647330136208, + "learning_rate": 1.4992357788292777e-05, + "loss": 0.6677, + "step": 3462 + }, + { + "epoch": 0.3537282941777324, + "grad_norm": 1.3651078885340238, + "learning_rate": 1.4989490956818946e-05, + "loss": 0.6582, + "step": 3463 + }, + { + "epoch": 0.35383043922369767, + "grad_norm": 1.5129772279950182, + "learning_rate": 1.4986623579242108e-05, + "loss": 0.7867, + "step": 3464 + }, + { + "epoch": 0.3539325842696629, + "grad_norm": 1.4015547421215944, + "learning_rate": 1.4983755655876094e-05, + "loss": 0.7968, + "step": 3465 + }, + { + "epoch": 0.3540347293156282, + "grad_norm": 1.5685636800194525, + "learning_rate": 1.4980887187034802e-05, + "loss": 0.755, + "step": 3466 + }, + { + "epoch": 0.35413687436159347, + "grad_norm": 1.5158432370555537, + "learning_rate": 1.4978018173032194e-05, + "loss": 0.72, + "step": 3467 + }, + { + "epoch": 0.35423901940755875, + "grad_norm": 1.4901399747260724, + "learning_rate": 1.4975148614182278e-05, + "loss": 0.7535, + "step": 3468 + }, + { + "epoch": 0.354341164453524, + "grad_norm": 1.5628745024265729, + "learning_rate": 1.4972278510799132e-05, + "loss": 0.7556, + "step": 3469 + }, + { + "epoch": 0.35444330949948927, + "grad_norm": 1.5339507765919724, + "learning_rate": 1.4969407863196892e-05, + "loss": 0.7727, + "step": 3470 + }, + { + "epoch": 0.35454545454545455, + "grad_norm": 1.5089756292586483, + "learning_rate": 1.496653667168975e-05, + "loss": 0.7992, + "step": 3471 + }, + { + "epoch": 0.35464759959141984, + "grad_norm": 1.4500057358146876, + "learning_rate": 1.496366493659196e-05, + "loss": 0.7376, + "step": 3472 + }, + { + "epoch": 0.35474974463738507, + "grad_norm": 1.4892080436987092, + "learning_rate": 1.4960792658217833e-05, + "loss": 0.7403, + "step": 3473 + }, + { + "epoch": 0.35485188968335035, + "grad_norm": 1.5421938469781589, + "learning_rate": 1.4957919836881749e-05, + "loss": 0.6524, + "step": 3474 + }, + { + "epoch": 0.35495403472931564, + "grad_norm": 1.497063599842979, + "learning_rate": 1.4955046472898136e-05, + "loss": 0.7861, + "step": 3475 + }, + { + "epoch": 0.3550561797752809, + "grad_norm": 1.3199440293674702, + "learning_rate": 1.4952172566581483e-05, + "loss": 0.7108, + "step": 3476 + }, + { + "epoch": 0.35515832482124615, + "grad_norm": 1.3792026544676559, + "learning_rate": 1.494929811824634e-05, + "loss": 0.6613, + "step": 3477 + }, + { + "epoch": 0.35526046986721144, + "grad_norm": 1.5444835796379772, + "learning_rate": 1.4946423128207323e-05, + "loss": 0.8557, + "step": 3478 + }, + { + "epoch": 0.3553626149131767, + "grad_norm": 1.5221103489030912, + "learning_rate": 1.4943547596779101e-05, + "loss": 0.7461, + "step": 3479 + }, + { + "epoch": 0.355464759959142, + "grad_norm": 1.382138657837125, + "learning_rate": 1.4940671524276397e-05, + "loss": 0.7458, + "step": 3480 + }, + { + "epoch": 0.35556690500510724, + "grad_norm": 1.3067897680407983, + "learning_rate": 1.4937794911014006e-05, + "loss": 0.7342, + "step": 3481 + }, + { + "epoch": 0.3556690500510725, + "grad_norm": 1.450379527853015, + "learning_rate": 1.493491775730677e-05, + "loss": 0.7262, + "step": 3482 + }, + { + "epoch": 0.3557711950970378, + "grad_norm": 1.5820857410791551, + "learning_rate": 1.4932040063469596e-05, + "loss": 0.7621, + "step": 3483 + }, + { + "epoch": 0.3558733401430031, + "grad_norm": 1.595097436051451, + "learning_rate": 1.4929161829817456e-05, + "loss": 0.862, + "step": 3484 + }, + { + "epoch": 0.3559754851889683, + "grad_norm": 1.433964036543034, + "learning_rate": 1.4926283056665366e-05, + "loss": 0.6741, + "step": 3485 + }, + { + "epoch": 0.3560776302349336, + "grad_norm": 1.5682518553936355, + "learning_rate": 1.4923403744328408e-05, + "loss": 0.7717, + "step": 3486 + }, + { + "epoch": 0.3561797752808989, + "grad_norm": 1.5500563506333622, + "learning_rate": 1.4920523893121735e-05, + "loss": 0.6815, + "step": 3487 + }, + { + "epoch": 0.3562819203268641, + "grad_norm": 1.3639794747253813, + "learning_rate": 1.4917643503360539e-05, + "loss": 0.6409, + "step": 3488 + }, + { + "epoch": 0.3563840653728294, + "grad_norm": 1.4646025986180748, + "learning_rate": 1.4914762575360087e-05, + "loss": 0.7432, + "step": 3489 + }, + { + "epoch": 0.3564862104187947, + "grad_norm": 1.4506767383957861, + "learning_rate": 1.4911881109435693e-05, + "loss": 0.7821, + "step": 3490 + }, + { + "epoch": 0.35658835546476, + "grad_norm": 1.3308058075038096, + "learning_rate": 1.4908999105902742e-05, + "loss": 0.7471, + "step": 3491 + }, + { + "epoch": 0.3566905005107252, + "grad_norm": 1.5030790935971772, + "learning_rate": 1.4906116565076667e-05, + "loss": 0.8164, + "step": 3492 + }, + { + "epoch": 0.3567926455566905, + "grad_norm": 1.36923071995333, + "learning_rate": 1.4903233487272959e-05, + "loss": 0.6974, + "step": 3493 + }, + { + "epoch": 0.3568947906026558, + "grad_norm": 1.4601808354390002, + "learning_rate": 1.4900349872807183e-05, + "loss": 0.762, + "step": 3494 + }, + { + "epoch": 0.35699693564862106, + "grad_norm": 1.5543228749845361, + "learning_rate": 1.4897465721994943e-05, + "loss": 0.7681, + "step": 3495 + }, + { + "epoch": 0.3570990806945863, + "grad_norm": 1.6335611690412835, + "learning_rate": 1.4894581035151921e-05, + "loss": 0.791, + "step": 3496 + }, + { + "epoch": 0.3572012257405516, + "grad_norm": 1.5247387578776723, + "learning_rate": 1.4891695812593838e-05, + "loss": 0.711, + "step": 3497 + }, + { + "epoch": 0.35730337078651686, + "grad_norm": 1.4405323528418739, + "learning_rate": 1.4888810054636491e-05, + "loss": 0.7896, + "step": 3498 + }, + { + "epoch": 0.35740551583248215, + "grad_norm": 1.533419544009103, + "learning_rate": 1.4885923761595724e-05, + "loss": 0.7727, + "step": 3499 + }, + { + "epoch": 0.3575076608784474, + "grad_norm": 1.4683954123028158, + "learning_rate": 1.4883036933787446e-05, + "loss": 0.7563, + "step": 3500 + }, + { + "epoch": 0.35760980592441266, + "grad_norm": 1.4881631049242774, + "learning_rate": 1.4880149571527616e-05, + "loss": 0.7897, + "step": 3501 + }, + { + "epoch": 0.35771195097037795, + "grad_norm": 1.3734756063283173, + "learning_rate": 1.4877261675132267e-05, + "loss": 0.6308, + "step": 3502 + }, + { + "epoch": 0.35781409601634323, + "grad_norm": 1.3623119703365125, + "learning_rate": 1.4874373244917473e-05, + "loss": 0.6912, + "step": 3503 + }, + { + "epoch": 0.35791624106230846, + "grad_norm": 1.4789831179889628, + "learning_rate": 1.4871484281199381e-05, + "loss": 0.8306, + "step": 3504 + }, + { + "epoch": 0.35801838610827375, + "grad_norm": 1.4084139593351896, + "learning_rate": 1.4868594784294183e-05, + "loss": 0.7265, + "step": 3505 + }, + { + "epoch": 0.35812053115423903, + "grad_norm": 1.2346101749999052, + "learning_rate": 1.4865704754518144e-05, + "loss": 0.6024, + "step": 3506 + }, + { + "epoch": 0.3582226762002043, + "grad_norm": 1.4930524368727602, + "learning_rate": 1.4862814192187575e-05, + "loss": 0.8381, + "step": 3507 + }, + { + "epoch": 0.35832482124616954, + "grad_norm": 1.482995172744597, + "learning_rate": 1.4859923097618854e-05, + "loss": 0.7924, + "step": 3508 + }, + { + "epoch": 0.35842696629213483, + "grad_norm": 1.5561701857128722, + "learning_rate": 1.4857031471128407e-05, + "loss": 0.8678, + "step": 3509 + }, + { + "epoch": 0.3585291113381001, + "grad_norm": 1.476051750138663, + "learning_rate": 1.4854139313032727e-05, + "loss": 0.6984, + "step": 3510 + }, + { + "epoch": 0.3586312563840654, + "grad_norm": 1.3657890636717835, + "learning_rate": 1.4851246623648364e-05, + "loss": 0.699, + "step": 3511 + }, + { + "epoch": 0.35873340143003063, + "grad_norm": 1.5620935653976191, + "learning_rate": 1.4848353403291924e-05, + "loss": 0.7332, + "step": 3512 + }, + { + "epoch": 0.3588355464759959, + "grad_norm": 1.4404267957674066, + "learning_rate": 1.4845459652280069e-05, + "loss": 0.7652, + "step": 3513 + }, + { + "epoch": 0.3589376915219612, + "grad_norm": 1.4888897071950422, + "learning_rate": 1.4842565370929528e-05, + "loss": 0.7558, + "step": 3514 + }, + { + "epoch": 0.35903983656792643, + "grad_norm": 1.4881428369182534, + "learning_rate": 1.4839670559557076e-05, + "loss": 0.7267, + "step": 3515 + }, + { + "epoch": 0.3591419816138917, + "grad_norm": 1.4926361925075338, + "learning_rate": 1.4836775218479558e-05, + "loss": 0.7338, + "step": 3516 + }, + { + "epoch": 0.359244126659857, + "grad_norm": 1.5546982153545426, + "learning_rate": 1.4833879348013862e-05, + "loss": 0.8417, + "step": 3517 + }, + { + "epoch": 0.3593462717058223, + "grad_norm": 1.2873060988536296, + "learning_rate": 1.483098294847695e-05, + "loss": 0.6561, + "step": 3518 + }, + { + "epoch": 0.3594484167517875, + "grad_norm": 1.4805183222159815, + "learning_rate": 1.4828086020185837e-05, + "loss": 0.7404, + "step": 3519 + }, + { + "epoch": 0.3595505617977528, + "grad_norm": 1.3884670157696586, + "learning_rate": 1.4825188563457586e-05, + "loss": 0.7653, + "step": 3520 + }, + { + "epoch": 0.3596527068437181, + "grad_norm": 1.3166762654571889, + "learning_rate": 1.4822290578609329e-05, + "loss": 0.7986, + "step": 3521 + }, + { + "epoch": 0.35975485188968337, + "grad_norm": 1.4960164084815841, + "learning_rate": 1.4819392065958253e-05, + "loss": 0.7157, + "step": 3522 + }, + { + "epoch": 0.3598569969356486, + "grad_norm": 1.4307481875637516, + "learning_rate": 1.4816493025821603e-05, + "loss": 0.6549, + "step": 3523 + }, + { + "epoch": 0.3599591419816139, + "grad_norm": 1.315270486104617, + "learning_rate": 1.4813593458516677e-05, + "loss": 0.6173, + "step": 3524 + }, + { + "epoch": 0.36006128702757917, + "grad_norm": 1.4996307917914726, + "learning_rate": 1.4810693364360839e-05, + "loss": 0.7079, + "step": 3525 + }, + { + "epoch": 0.36016343207354445, + "grad_norm": 1.6254771963374428, + "learning_rate": 1.4807792743671504e-05, + "loss": 0.7053, + "step": 3526 + }, + { + "epoch": 0.3602655771195097, + "grad_norm": 1.4042117546019866, + "learning_rate": 1.480489159676615e-05, + "loss": 0.812, + "step": 3527 + }, + { + "epoch": 0.36036772216547497, + "grad_norm": 1.4381609526182655, + "learning_rate": 1.4801989923962304e-05, + "loss": 0.7484, + "step": 3528 + }, + { + "epoch": 0.36046986721144025, + "grad_norm": 1.4427478658430821, + "learning_rate": 1.4799087725577557e-05, + "loss": 0.7837, + "step": 3529 + }, + { + "epoch": 0.36057201225740554, + "grad_norm": 1.4767667404789748, + "learning_rate": 1.4796185001929558e-05, + "loss": 0.7183, + "step": 3530 + }, + { + "epoch": 0.36067415730337077, + "grad_norm": 1.5139926169321702, + "learning_rate": 1.4793281753336013e-05, + "loss": 0.7475, + "step": 3531 + }, + { + "epoch": 0.36077630234933605, + "grad_norm": 1.2692024736879741, + "learning_rate": 1.4790377980114682e-05, + "loss": 0.6582, + "step": 3532 + }, + { + "epoch": 0.36087844739530134, + "grad_norm": 1.5150379757787118, + "learning_rate": 1.4787473682583384e-05, + "loss": 0.7402, + "step": 3533 + }, + { + "epoch": 0.3609805924412666, + "grad_norm": 1.4497426092568104, + "learning_rate": 1.478456886106e-05, + "loss": 0.8044, + "step": 3534 + }, + { + "epoch": 0.36108273748723185, + "grad_norm": 1.518901540540455, + "learning_rate": 1.4781663515862465e-05, + "loss": 0.791, + "step": 3535 + }, + { + "epoch": 0.36118488253319714, + "grad_norm": 1.5533440110834615, + "learning_rate": 1.477875764730877e-05, + "loss": 0.759, + "step": 3536 + }, + { + "epoch": 0.3612870275791624, + "grad_norm": 1.4167996124687194, + "learning_rate": 1.4775851255716958e-05, + "loss": 0.7571, + "step": 3537 + }, + { + "epoch": 0.3613891726251277, + "grad_norm": 1.5586093911179746, + "learning_rate": 1.4772944341405145e-05, + "loss": 0.7743, + "step": 3538 + }, + { + "epoch": 0.36149131767109294, + "grad_norm": 1.4781070385699004, + "learning_rate": 1.4770036904691487e-05, + "loss": 0.6837, + "step": 3539 + }, + { + "epoch": 0.3615934627170582, + "grad_norm": 1.4783462457769279, + "learning_rate": 1.4767128945894211e-05, + "loss": 0.7563, + "step": 3540 + }, + { + "epoch": 0.3616956077630235, + "grad_norm": 1.4093634812298927, + "learning_rate": 1.476422046533159e-05, + "loss": 0.7297, + "step": 3541 + }, + { + "epoch": 0.36179775280898874, + "grad_norm": 1.3186063401576988, + "learning_rate": 1.4761311463321959e-05, + "loss": 0.6499, + "step": 3542 + }, + { + "epoch": 0.361899897854954, + "grad_norm": 1.4834311932183237, + "learning_rate": 1.4758401940183715e-05, + "loss": 0.7136, + "step": 3543 + }, + { + "epoch": 0.3620020429009193, + "grad_norm": 1.5848396428660996, + "learning_rate": 1.4755491896235304e-05, + "loss": 0.7469, + "step": 3544 + }, + { + "epoch": 0.3621041879468846, + "grad_norm": 1.4501279764240527, + "learning_rate": 1.4752581331795233e-05, + "loss": 0.7052, + "step": 3545 + }, + { + "epoch": 0.3622063329928498, + "grad_norm": 1.5569997587194149, + "learning_rate": 1.4749670247182064e-05, + "loss": 0.7531, + "step": 3546 + }, + { + "epoch": 0.3623084780388151, + "grad_norm": 1.6030123379742232, + "learning_rate": 1.4746758642714415e-05, + "loss": 0.6892, + "step": 3547 + }, + { + "epoch": 0.3624106230847804, + "grad_norm": 1.5653356367269906, + "learning_rate": 1.4743846518710971e-05, + "loss": 0.7479, + "step": 3548 + }, + { + "epoch": 0.3625127681307457, + "grad_norm": 1.3774972060344115, + "learning_rate": 1.4740933875490456e-05, + "loss": 0.814, + "step": 3549 + }, + { + "epoch": 0.3626149131767109, + "grad_norm": 1.3085954413459302, + "learning_rate": 1.4738020713371668e-05, + "loss": 0.8017, + "step": 3550 + }, + { + "epoch": 0.3627170582226762, + "grad_norm": 1.52505904006268, + "learning_rate": 1.473510703267345e-05, + "loss": 0.7377, + "step": 3551 + }, + { + "epoch": 0.3628192032686415, + "grad_norm": 1.4460463479772836, + "learning_rate": 1.4732192833714712e-05, + "loss": 0.653, + "step": 3552 + }, + { + "epoch": 0.36292134831460676, + "grad_norm": 1.597986277757192, + "learning_rate": 1.4729278116814406e-05, + "loss": 0.752, + "step": 3553 + }, + { + "epoch": 0.363023493360572, + "grad_norm": 1.3403146746156354, + "learning_rate": 1.4726362882291555e-05, + "loss": 0.6718, + "step": 3554 + }, + { + "epoch": 0.3631256384065373, + "grad_norm": 1.3829788518123365, + "learning_rate": 1.4723447130465236e-05, + "loss": 0.7344, + "step": 3555 + }, + { + "epoch": 0.36322778345250256, + "grad_norm": 1.602423752132864, + "learning_rate": 1.4720530861654577e-05, + "loss": 0.7567, + "step": 3556 + }, + { + "epoch": 0.36332992849846785, + "grad_norm": 1.5575462470250936, + "learning_rate": 1.4717614076178761e-05, + "loss": 0.8836, + "step": 3557 + }, + { + "epoch": 0.3634320735444331, + "grad_norm": 1.4794524056913083, + "learning_rate": 1.471469677435704e-05, + "loss": 0.7257, + "step": 3558 + }, + { + "epoch": 0.36353421859039836, + "grad_norm": 1.3914877391548583, + "learning_rate": 1.4711778956508708e-05, + "loss": 0.8264, + "step": 3559 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 1.395574635582073, + "learning_rate": 1.4708860622953132e-05, + "loss": 0.7423, + "step": 3560 + }, + { + "epoch": 0.36373850868232893, + "grad_norm": 1.530605813155384, + "learning_rate": 1.4705941774009708e-05, + "loss": 0.879, + "step": 3561 + }, + { + "epoch": 0.36384065372829416, + "grad_norm": 1.4115340550072433, + "learning_rate": 1.4703022409997924e-05, + "loss": 0.6706, + "step": 3562 + }, + { + "epoch": 0.36394279877425945, + "grad_norm": 1.5215885550646886, + "learning_rate": 1.47001025312373e-05, + "loss": 0.804, + "step": 3563 + }, + { + "epoch": 0.36404494382022473, + "grad_norm": 1.5038036353148072, + "learning_rate": 1.4697182138047412e-05, + "loss": 0.7966, + "step": 3564 + }, + { + "epoch": 0.36414708886619, + "grad_norm": 1.407248670395217, + "learning_rate": 1.4694261230747903e-05, + "loss": 0.6933, + "step": 3565 + }, + { + "epoch": 0.36424923391215525, + "grad_norm": 1.498742470689245, + "learning_rate": 1.4691339809658473e-05, + "loss": 0.7316, + "step": 3566 + }, + { + "epoch": 0.36435137895812053, + "grad_norm": 1.35984646029236, + "learning_rate": 1.468841787509887e-05, + "loss": 0.7201, + "step": 3567 + }, + { + "epoch": 0.3644535240040858, + "grad_norm": 1.350166528795013, + "learning_rate": 1.4685495427388903e-05, + "loss": 0.8239, + "step": 3568 + }, + { + "epoch": 0.36455566905005105, + "grad_norm": 1.4629909454744277, + "learning_rate": 1.468257246684843e-05, + "loss": 0.85, + "step": 3569 + }, + { + "epoch": 0.36465781409601633, + "grad_norm": 1.532535643249225, + "learning_rate": 1.467964899379738e-05, + "loss": 0.8106, + "step": 3570 + }, + { + "epoch": 0.3647599591419816, + "grad_norm": 1.5350234660708026, + "learning_rate": 1.4676725008555719e-05, + "loss": 0.7743, + "step": 3571 + }, + { + "epoch": 0.3648621041879469, + "grad_norm": 1.5712565211612515, + "learning_rate": 1.4673800511443488e-05, + "loss": 0.7415, + "step": 3572 + }, + { + "epoch": 0.36496424923391213, + "grad_norm": 1.4503835070169362, + "learning_rate": 1.467087550278077e-05, + "loss": 0.7637, + "step": 3573 + }, + { + "epoch": 0.3650663942798774, + "grad_norm": 1.245451434261055, + "learning_rate": 1.4667949982887711e-05, + "loss": 0.6575, + "step": 3574 + }, + { + "epoch": 0.3651685393258427, + "grad_norm": 1.5660989426379437, + "learning_rate": 1.466502395208451e-05, + "loss": 0.7802, + "step": 3575 + }, + { + "epoch": 0.365270684371808, + "grad_norm": 1.5882683599553815, + "learning_rate": 1.4662097410691422e-05, + "loss": 0.7925, + "step": 3576 + }, + { + "epoch": 0.3653728294177732, + "grad_norm": 1.4219513286888783, + "learning_rate": 1.4659170359028763e-05, + "loss": 0.7209, + "step": 3577 + }, + { + "epoch": 0.3654749744637385, + "grad_norm": 1.2313316789307804, + "learning_rate": 1.4656242797416895e-05, + "loss": 0.6937, + "step": 3578 + }, + { + "epoch": 0.3655771195097038, + "grad_norm": 1.539023476740282, + "learning_rate": 1.4653314726176249e-05, + "loss": 0.7205, + "step": 3579 + }, + { + "epoch": 0.36567926455566907, + "grad_norm": 1.4615364153418366, + "learning_rate": 1.4650386145627298e-05, + "loss": 0.8191, + "step": 3580 + }, + { + "epoch": 0.3657814096016343, + "grad_norm": 1.523091558614056, + "learning_rate": 1.4647457056090575e-05, + "loss": 0.7378, + "step": 3581 + }, + { + "epoch": 0.3658835546475996, + "grad_norm": 1.4125594920393905, + "learning_rate": 1.464452745788668e-05, + "loss": 0.6643, + "step": 3582 + }, + { + "epoch": 0.36598569969356487, + "grad_norm": 1.5445807858286138, + "learning_rate": 1.464159735133625e-05, + "loss": 0.7748, + "step": 3583 + }, + { + "epoch": 0.36608784473953015, + "grad_norm": 1.4142546214682299, + "learning_rate": 1.4638666736759992e-05, + "loss": 0.7141, + "step": 3584 + }, + { + "epoch": 0.3661899897854954, + "grad_norm": 1.5960918702239622, + "learning_rate": 1.4635735614478663e-05, + "loss": 0.7577, + "step": 3585 + }, + { + "epoch": 0.36629213483146067, + "grad_norm": 1.6719239579670662, + "learning_rate": 1.4632803984813076e-05, + "loss": 0.8539, + "step": 3586 + }, + { + "epoch": 0.36639427987742595, + "grad_norm": 1.4515077378926515, + "learning_rate": 1.4629871848084101e-05, + "loss": 0.7558, + "step": 3587 + }, + { + "epoch": 0.36649642492339124, + "grad_norm": 1.3695855978324316, + "learning_rate": 1.4626939204612657e-05, + "loss": 0.7564, + "step": 3588 + }, + { + "epoch": 0.36659856996935647, + "grad_norm": 1.4320087213060648, + "learning_rate": 1.4624006054719733e-05, + "loss": 0.6628, + "step": 3589 + }, + { + "epoch": 0.36670071501532175, + "grad_norm": 1.4735600735206476, + "learning_rate": 1.4621072398726357e-05, + "loss": 0.7483, + "step": 3590 + }, + { + "epoch": 0.36680286006128704, + "grad_norm": 1.413726306613324, + "learning_rate": 1.461813823695362e-05, + "loss": 0.7098, + "step": 3591 + }, + { + "epoch": 0.3669050051072523, + "grad_norm": 1.3723791641071754, + "learning_rate": 1.4615203569722672e-05, + "loss": 0.6798, + "step": 3592 + }, + { + "epoch": 0.36700715015321755, + "grad_norm": 1.3787401723519346, + "learning_rate": 1.4612268397354706e-05, + "loss": 0.7892, + "step": 3593 + }, + { + "epoch": 0.36710929519918284, + "grad_norm": 1.43741502588015, + "learning_rate": 1.4609332720170988e-05, + "loss": 0.6665, + "step": 3594 + }, + { + "epoch": 0.3672114402451481, + "grad_norm": 1.3497835651672883, + "learning_rate": 1.4606396538492824e-05, + "loss": 0.6772, + "step": 3595 + }, + { + "epoch": 0.36731358529111335, + "grad_norm": 1.2802686187741845, + "learning_rate": 1.4603459852641586e-05, + "loss": 0.6644, + "step": 3596 + }, + { + "epoch": 0.36741573033707864, + "grad_norm": 1.3846898101306624, + "learning_rate": 1.460052266293869e-05, + "loss": 0.6732, + "step": 3597 + }, + { + "epoch": 0.3675178753830439, + "grad_norm": 1.4850561884244704, + "learning_rate": 1.4597584969705616e-05, + "loss": 0.6702, + "step": 3598 + }, + { + "epoch": 0.3676200204290092, + "grad_norm": 1.6417667206036424, + "learning_rate": 1.45946467732639e-05, + "loss": 0.7736, + "step": 3599 + }, + { + "epoch": 0.36772216547497444, + "grad_norm": 1.508648257280374, + "learning_rate": 1.4591708073935122e-05, + "loss": 0.7441, + "step": 3600 + }, + { + "epoch": 0.3678243105209397, + "grad_norm": 1.5581586534899627, + "learning_rate": 1.458876887204093e-05, + "loss": 0.8311, + "step": 3601 + }, + { + "epoch": 0.367926455566905, + "grad_norm": 1.3647480128365639, + "learning_rate": 1.4585829167903022e-05, + "loss": 0.7277, + "step": 3602 + }, + { + "epoch": 0.3680286006128703, + "grad_norm": 1.3015608574213278, + "learning_rate": 1.4582888961843147e-05, + "loss": 0.8404, + "step": 3603 + }, + { + "epoch": 0.3681307456588355, + "grad_norm": 1.5796521861656745, + "learning_rate": 1.4579948254183116e-05, + "loss": 0.7277, + "step": 3604 + }, + { + "epoch": 0.3682328907048008, + "grad_norm": 1.4794105921201846, + "learning_rate": 1.4577007045244787e-05, + "loss": 0.794, + "step": 3605 + }, + { + "epoch": 0.3683350357507661, + "grad_norm": 1.340764716127591, + "learning_rate": 1.457406533535008e-05, + "loss": 0.6224, + "step": 3606 + }, + { + "epoch": 0.3684371807967314, + "grad_norm": 1.4284995813516426, + "learning_rate": 1.4571123124820969e-05, + "loss": 0.7437, + "step": 3607 + }, + { + "epoch": 0.3685393258426966, + "grad_norm": 1.4187360706387864, + "learning_rate": 1.4568180413979478e-05, + "loss": 0.7095, + "step": 3608 + }, + { + "epoch": 0.3686414708886619, + "grad_norm": 1.4317089037253494, + "learning_rate": 1.4565237203147688e-05, + "loss": 0.752, + "step": 3609 + }, + { + "epoch": 0.3687436159346272, + "grad_norm": 1.5682739572886992, + "learning_rate": 1.4562293492647737e-05, + "loss": 0.7201, + "step": 3610 + }, + { + "epoch": 0.36884576098059246, + "grad_norm": 1.2986345471611553, + "learning_rate": 1.4559349282801818e-05, + "loss": 0.5799, + "step": 3611 + }, + { + "epoch": 0.3689479060265577, + "grad_norm": 1.5179461423012208, + "learning_rate": 1.455640457393217e-05, + "loss": 0.8102, + "step": 3612 + }, + { + "epoch": 0.369050051072523, + "grad_norm": 1.7483568923034332, + "learning_rate": 1.45534593663611e-05, + "loss": 0.7897, + "step": 3613 + }, + { + "epoch": 0.36915219611848826, + "grad_norm": 1.577203999248307, + "learning_rate": 1.4550513660410962e-05, + "loss": 0.7017, + "step": 3614 + }, + { + "epoch": 0.36925434116445355, + "grad_norm": 1.543514234227448, + "learning_rate": 1.4547567456404162e-05, + "loss": 0.7633, + "step": 3615 + }, + { + "epoch": 0.3693564862104188, + "grad_norm": 1.5495580145609447, + "learning_rate": 1.4544620754663165e-05, + "loss": 0.6678, + "step": 3616 + }, + { + "epoch": 0.36945863125638406, + "grad_norm": 1.3462967627681197, + "learning_rate": 1.4541673555510491e-05, + "loss": 0.7039, + "step": 3617 + }, + { + "epoch": 0.36956077630234935, + "grad_norm": 1.709141253043606, + "learning_rate": 1.4538725859268711e-05, + "loss": 0.7238, + "step": 3618 + }, + { + "epoch": 0.36966292134831463, + "grad_norm": 1.4772003166161658, + "learning_rate": 1.4535777666260456e-05, + "loss": 0.7611, + "step": 3619 + }, + { + "epoch": 0.36976506639427986, + "grad_norm": 1.5657073474429466, + "learning_rate": 1.4532828976808404e-05, + "loss": 0.7501, + "step": 3620 + }, + { + "epoch": 0.36986721144024515, + "grad_norm": 1.638456528154201, + "learning_rate": 1.4529879791235291e-05, + "loss": 0.9016, + "step": 3621 + }, + { + "epoch": 0.36996935648621043, + "grad_norm": 1.389396222147818, + "learning_rate": 1.4526930109863905e-05, + "loss": 0.711, + "step": 3622 + }, + { + "epoch": 0.37007150153217566, + "grad_norm": 1.4731595081452353, + "learning_rate": 1.4523979933017101e-05, + "loss": 0.7118, + "step": 3623 + }, + { + "epoch": 0.37017364657814095, + "grad_norm": 1.5588683015656337, + "learning_rate": 1.4521029261017765e-05, + "loss": 0.7637, + "step": 3624 + }, + { + "epoch": 0.37027579162410623, + "grad_norm": 1.3652004998121787, + "learning_rate": 1.4518078094188856e-05, + "loss": 0.6267, + "step": 3625 + }, + { + "epoch": 0.3703779366700715, + "grad_norm": 1.4836445625923325, + "learning_rate": 1.4515126432853384e-05, + "loss": 0.6966, + "step": 3626 + }, + { + "epoch": 0.37048008171603675, + "grad_norm": 1.6489376444744281, + "learning_rate": 1.4512174277334403e-05, + "loss": 0.865, + "step": 3627 + }, + { + "epoch": 0.37058222676200203, + "grad_norm": 1.5178896456982356, + "learning_rate": 1.4509221627955037e-05, + "loss": 0.7282, + "step": 3628 + }, + { + "epoch": 0.3706843718079673, + "grad_norm": 1.503620313065253, + "learning_rate": 1.4506268485038445e-05, + "loss": 0.8446, + "step": 3629 + }, + { + "epoch": 0.3707865168539326, + "grad_norm": 1.4410003223255292, + "learning_rate": 1.4503314848907857e-05, + "loss": 0.7576, + "step": 3630 + }, + { + "epoch": 0.37088866189989783, + "grad_norm": 1.3713676399909998, + "learning_rate": 1.4500360719886551e-05, + "loss": 0.6399, + "step": 3631 + }, + { + "epoch": 0.3709908069458631, + "grad_norm": 1.5353132471547601, + "learning_rate": 1.4497406098297858e-05, + "loss": 0.7025, + "step": 3632 + }, + { + "epoch": 0.3710929519918284, + "grad_norm": 1.5221069189221201, + "learning_rate": 1.4494450984465156e-05, + "loss": 0.7279, + "step": 3633 + }, + { + "epoch": 0.3711950970377937, + "grad_norm": 1.5885983348957806, + "learning_rate": 1.4491495378711895e-05, + "loss": 0.858, + "step": 3634 + }, + { + "epoch": 0.3712972420837589, + "grad_norm": 1.3536907145924706, + "learning_rate": 1.4488539281361562e-05, + "loss": 0.6201, + "step": 3635 + }, + { + "epoch": 0.3713993871297242, + "grad_norm": 1.3276582062833509, + "learning_rate": 1.4485582692737705e-05, + "loss": 0.7055, + "step": 3636 + }, + { + "epoch": 0.3715015321756895, + "grad_norm": 1.4870232981047469, + "learning_rate": 1.4482625613163921e-05, + "loss": 0.8094, + "step": 3637 + }, + { + "epoch": 0.37160367722165477, + "grad_norm": 1.2694764332601938, + "learning_rate": 1.447966804296387e-05, + "loss": 0.6804, + "step": 3638 + }, + { + "epoch": 0.37170582226762, + "grad_norm": 1.5513568999490308, + "learning_rate": 1.4476709982461258e-05, + "loss": 0.8177, + "step": 3639 + }, + { + "epoch": 0.3718079673135853, + "grad_norm": 1.508325105598013, + "learning_rate": 1.4473751431979845e-05, + "loss": 0.7769, + "step": 3640 + }, + { + "epoch": 0.37191011235955057, + "grad_norm": 1.4247597802181968, + "learning_rate": 1.4470792391843449e-05, + "loss": 0.6332, + "step": 3641 + }, + { + "epoch": 0.37201225740551586, + "grad_norm": 1.5244592734311038, + "learning_rate": 1.4467832862375934e-05, + "loss": 0.7402, + "step": 3642 + }, + { + "epoch": 0.3721144024514811, + "grad_norm": 1.54550819124998, + "learning_rate": 1.446487284390123e-05, + "loss": 0.8492, + "step": 3643 + }, + { + "epoch": 0.37221654749744637, + "grad_norm": 1.3368328148037967, + "learning_rate": 1.4461912336743304e-05, + "loss": 0.6975, + "step": 3644 + }, + { + "epoch": 0.37231869254341166, + "grad_norm": 1.4451872842812368, + "learning_rate": 1.4458951341226192e-05, + "loss": 0.7298, + "step": 3645 + }, + { + "epoch": 0.37242083758937694, + "grad_norm": 1.4005427107381334, + "learning_rate": 1.4455989857673978e-05, + "loss": 0.7656, + "step": 3646 + }, + { + "epoch": 0.37252298263534217, + "grad_norm": 1.5103998740457987, + "learning_rate": 1.4453027886410792e-05, + "loss": 0.788, + "step": 3647 + }, + { + "epoch": 0.37262512768130746, + "grad_norm": 1.528519627260269, + "learning_rate": 1.4450065427760827e-05, + "loss": 0.6873, + "step": 3648 + }, + { + "epoch": 0.37272727272727274, + "grad_norm": 1.4177724285951554, + "learning_rate": 1.4447102482048324e-05, + "loss": 0.7027, + "step": 3649 + }, + { + "epoch": 0.372829417773238, + "grad_norm": 1.4671763159742377, + "learning_rate": 1.4444139049597583e-05, + "loss": 0.8318, + "step": 3650 + }, + { + "epoch": 0.37293156281920326, + "grad_norm": 1.4804035151544732, + "learning_rate": 1.4441175130732952e-05, + "loss": 0.7582, + "step": 3651 + }, + { + "epoch": 0.37303370786516854, + "grad_norm": 1.6098988641694498, + "learning_rate": 1.4438210725778833e-05, + "loss": 0.7635, + "step": 3652 + }, + { + "epoch": 0.3731358529111338, + "grad_norm": 1.2607461152484556, + "learning_rate": 1.4435245835059684e-05, + "loss": 0.81, + "step": 3653 + }, + { + "epoch": 0.37323799795709905, + "grad_norm": 1.310994512230082, + "learning_rate": 1.443228045890001e-05, + "loss": 0.6724, + "step": 3654 + }, + { + "epoch": 0.37334014300306434, + "grad_norm": 1.425602610155195, + "learning_rate": 1.442931459762438e-05, + "loss": 0.7376, + "step": 3655 + }, + { + "epoch": 0.3734422880490296, + "grad_norm": 1.5197204592130922, + "learning_rate": 1.4426348251557402e-05, + "loss": 0.7371, + "step": 3656 + }, + { + "epoch": 0.3735444330949949, + "grad_norm": 1.4743155850125698, + "learning_rate": 1.442338142102375e-05, + "loss": 0.7005, + "step": 3657 + }, + { + "epoch": 0.37364657814096014, + "grad_norm": 1.418190801576121, + "learning_rate": 1.4420414106348144e-05, + "loss": 0.8089, + "step": 3658 + }, + { + "epoch": 0.3737487231869254, + "grad_norm": 1.45883140527573, + "learning_rate": 1.4417446307855356e-05, + "loss": 0.693, + "step": 3659 + }, + { + "epoch": 0.3738508682328907, + "grad_norm": 1.467968645465181, + "learning_rate": 1.4414478025870218e-05, + "loss": 0.6693, + "step": 3660 + }, + { + "epoch": 0.373953013278856, + "grad_norm": 1.5287750918175633, + "learning_rate": 1.4411509260717607e-05, + "loss": 0.7814, + "step": 3661 + }, + { + "epoch": 0.3740551583248212, + "grad_norm": 1.5025478379114838, + "learning_rate": 1.4408540012722456e-05, + "loss": 0.6616, + "step": 3662 + }, + { + "epoch": 0.3741573033707865, + "grad_norm": 1.5146175780449924, + "learning_rate": 1.4405570282209756e-05, + "loss": 0.8505, + "step": 3663 + }, + { + "epoch": 0.3742594484167518, + "grad_norm": 1.3314386004605876, + "learning_rate": 1.4402600069504537e-05, + "loss": 0.7081, + "step": 3664 + }, + { + "epoch": 0.3743615934627171, + "grad_norm": 1.49247692578184, + "learning_rate": 1.4399629374931898e-05, + "loss": 0.8108, + "step": 3665 + }, + { + "epoch": 0.3744637385086823, + "grad_norm": 1.4406517821466518, + "learning_rate": 1.4396658198816982e-05, + "loss": 0.7396, + "step": 3666 + }, + { + "epoch": 0.3745658835546476, + "grad_norm": 1.2781214740996365, + "learning_rate": 1.4393686541484986e-05, + "loss": 0.7179, + "step": 3667 + }, + { + "epoch": 0.3746680286006129, + "grad_norm": 1.565929302486357, + "learning_rate": 1.4390714403261159e-05, + "loss": 0.7522, + "step": 3668 + }, + { + "epoch": 0.37477017364657816, + "grad_norm": 1.4080543472205684, + "learning_rate": 1.43877417844708e-05, + "loss": 0.7937, + "step": 3669 + }, + { + "epoch": 0.3748723186925434, + "grad_norm": 1.5997233268125852, + "learning_rate": 1.4384768685439274e-05, + "loss": 0.8142, + "step": 3670 + }, + { + "epoch": 0.3749744637385087, + "grad_norm": 1.400126980126301, + "learning_rate": 1.438179510649198e-05, + "loss": 0.7449, + "step": 3671 + }, + { + "epoch": 0.37507660878447396, + "grad_norm": 1.4296666331125398, + "learning_rate": 1.4378821047954382e-05, + "loss": 0.8201, + "step": 3672 + }, + { + "epoch": 0.37517875383043925, + "grad_norm": 1.3828789512666668, + "learning_rate": 1.4375846510151989e-05, + "loss": 0.7891, + "step": 3673 + }, + { + "epoch": 0.3752808988764045, + "grad_norm": 1.4930069049360375, + "learning_rate": 1.4372871493410368e-05, + "loss": 0.6832, + "step": 3674 + }, + { + "epoch": 0.37538304392236976, + "grad_norm": 1.3683770365734156, + "learning_rate": 1.436989599805514e-05, + "loss": 0.7581, + "step": 3675 + }, + { + "epoch": 0.37548518896833505, + "grad_norm": 1.4842508517401583, + "learning_rate": 1.4366920024411968e-05, + "loss": 0.713, + "step": 3676 + }, + { + "epoch": 0.37558733401430033, + "grad_norm": 1.4433914854046919, + "learning_rate": 1.4363943572806579e-05, + "loss": 0.6982, + "step": 3677 + }, + { + "epoch": 0.37568947906026556, + "grad_norm": 1.3743766785039202, + "learning_rate": 1.4360966643564746e-05, + "loss": 0.7392, + "step": 3678 + }, + { + "epoch": 0.37579162410623085, + "grad_norm": 1.4702753621169795, + "learning_rate": 1.43579892370123e-05, + "loss": 0.7977, + "step": 3679 + }, + { + "epoch": 0.37589376915219613, + "grad_norm": 1.6260752696094032, + "learning_rate": 1.4355011353475115e-05, + "loss": 0.762, + "step": 3680 + }, + { + "epoch": 0.37599591419816136, + "grad_norm": 1.5748777799045441, + "learning_rate": 1.435203299327912e-05, + "loss": 0.7215, + "step": 3681 + }, + { + "epoch": 0.37609805924412665, + "grad_norm": 1.4190204929655832, + "learning_rate": 1.4349054156750303e-05, + "loss": 0.6805, + "step": 3682 + }, + { + "epoch": 0.37620020429009193, + "grad_norm": 1.384949434398776, + "learning_rate": 1.43460748442147e-05, + "loss": 0.6795, + "step": 3683 + }, + { + "epoch": 0.3763023493360572, + "grad_norm": 1.4736325322077315, + "learning_rate": 1.43430950559984e-05, + "loss": 0.7586, + "step": 3684 + }, + { + "epoch": 0.37640449438202245, + "grad_norm": 1.4384662897864549, + "learning_rate": 1.4340114792427535e-05, + "loss": 0.7204, + "step": 3685 + }, + { + "epoch": 0.37650663942798773, + "grad_norm": 1.4445058368114152, + "learning_rate": 1.4337134053828305e-05, + "loss": 0.7119, + "step": 3686 + }, + { + "epoch": 0.376608784473953, + "grad_norm": 1.5585012024048948, + "learning_rate": 1.4334152840526951e-05, + "loss": 0.7159, + "step": 3687 + }, + { + "epoch": 0.3767109295199183, + "grad_norm": 1.39463092304519, + "learning_rate": 1.4331171152849769e-05, + "loss": 0.7594, + "step": 3688 + }, + { + "epoch": 0.37681307456588353, + "grad_norm": 1.6271617254435795, + "learning_rate": 1.4328188991123103e-05, + "loss": 0.7371, + "step": 3689 + }, + { + "epoch": 0.3769152196118488, + "grad_norm": 1.4373512172275276, + "learning_rate": 1.4325206355673357e-05, + "loss": 0.6563, + "step": 3690 + }, + { + "epoch": 0.3770173646578141, + "grad_norm": 1.4965718250069318, + "learning_rate": 1.432222324682698e-05, + "loss": 0.671, + "step": 3691 + }, + { + "epoch": 0.3771195097037794, + "grad_norm": 1.5189566042131477, + "learning_rate": 1.431923966491048e-05, + "loss": 0.742, + "step": 3692 + }, + { + "epoch": 0.3772216547497446, + "grad_norm": 1.5988516777730521, + "learning_rate": 1.4316255610250402e-05, + "loss": 0.9064, + "step": 3693 + }, + { + "epoch": 0.3773237997957099, + "grad_norm": 1.4724745399528418, + "learning_rate": 1.4313271083173363e-05, + "loss": 0.733, + "step": 3694 + }, + { + "epoch": 0.3774259448416752, + "grad_norm": 1.667581844666148, + "learning_rate": 1.4310286084006015e-05, + "loss": 0.7644, + "step": 3695 + }, + { + "epoch": 0.3775280898876405, + "grad_norm": 1.5978535770090205, + "learning_rate": 1.4307300613075072e-05, + "loss": 0.7012, + "step": 3696 + }, + { + "epoch": 0.3776302349336057, + "grad_norm": 1.3162020021484282, + "learning_rate": 1.4304314670707292e-05, + "loss": 0.7035, + "step": 3697 + }, + { + "epoch": 0.377732379979571, + "grad_norm": 1.4983221085175242, + "learning_rate": 1.4301328257229494e-05, + "loss": 0.7335, + "step": 3698 + }, + { + "epoch": 0.37783452502553627, + "grad_norm": 1.3503137815561121, + "learning_rate": 1.4298341372968538e-05, + "loss": 0.6871, + "step": 3699 + }, + { + "epoch": 0.37793667007150156, + "grad_norm": 1.4911447894232301, + "learning_rate": 1.4295354018251342e-05, + "loss": 0.6425, + "step": 3700 + }, + { + "epoch": 0.3780388151174668, + "grad_norm": 1.5193988873355548, + "learning_rate": 1.429236619340487e-05, + "loss": 0.8458, + "step": 3701 + }, + { + "epoch": 0.37814096016343207, + "grad_norm": 1.4355064530714263, + "learning_rate": 1.428937789875615e-05, + "loss": 0.7544, + "step": 3702 + }, + { + "epoch": 0.37824310520939736, + "grad_norm": 1.6579765946523064, + "learning_rate": 1.4286389134632244e-05, + "loss": 0.7322, + "step": 3703 + }, + { + "epoch": 0.37834525025536264, + "grad_norm": 1.4106138978328489, + "learning_rate": 1.428339990136028e-05, + "loss": 0.7751, + "step": 3704 + }, + { + "epoch": 0.37844739530132787, + "grad_norm": 1.5234619484207599, + "learning_rate": 1.4280410199267427e-05, + "loss": 0.78, + "step": 3705 + }, + { + "epoch": 0.37854954034729316, + "grad_norm": 1.2624715661817962, + "learning_rate": 1.4277420028680913e-05, + "loss": 0.6238, + "step": 3706 + }, + { + "epoch": 0.37865168539325844, + "grad_norm": 1.4745822614060566, + "learning_rate": 1.4274429389928015e-05, + "loss": 0.7816, + "step": 3707 + }, + { + "epoch": 0.37875383043922367, + "grad_norm": 1.4355075811812978, + "learning_rate": 1.4271438283336057e-05, + "loss": 0.693, + "step": 3708 + }, + { + "epoch": 0.37885597548518896, + "grad_norm": 1.4016615716543896, + "learning_rate": 1.4268446709232418e-05, + "loss": 0.7281, + "step": 3709 + }, + { + "epoch": 0.37895812053115424, + "grad_norm": 1.5860334399469556, + "learning_rate": 1.4265454667944529e-05, + "loss": 0.804, + "step": 3710 + }, + { + "epoch": 0.3790602655771195, + "grad_norm": 1.5650443676433952, + "learning_rate": 1.4262462159799874e-05, + "loss": 0.8108, + "step": 3711 + }, + { + "epoch": 0.37916241062308476, + "grad_norm": 1.5100485779065302, + "learning_rate": 1.4259469185125977e-05, + "loss": 0.7249, + "step": 3712 + }, + { + "epoch": 0.37926455566905004, + "grad_norm": 1.4061241920262901, + "learning_rate": 1.425647574425043e-05, + "loss": 0.6421, + "step": 3713 + }, + { + "epoch": 0.3793667007150153, + "grad_norm": 1.4017877872572864, + "learning_rate": 1.4253481837500862e-05, + "loss": 0.7803, + "step": 3714 + }, + { + "epoch": 0.3794688457609806, + "grad_norm": 1.380132325265422, + "learning_rate": 1.4250487465204958e-05, + "loss": 0.7704, + "step": 3715 + }, + { + "epoch": 0.37957099080694584, + "grad_norm": 1.4108780735824247, + "learning_rate": 1.4247492627690456e-05, + "loss": 0.7464, + "step": 3716 + }, + { + "epoch": 0.3796731358529111, + "grad_norm": 1.5049028566881042, + "learning_rate": 1.424449732528514e-05, + "loss": 0.7133, + "step": 3717 + }, + { + "epoch": 0.3797752808988764, + "grad_norm": 1.493439067978785, + "learning_rate": 1.424150155831685e-05, + "loss": 0.7596, + "step": 3718 + }, + { + "epoch": 0.3798774259448417, + "grad_norm": 1.4141203789591381, + "learning_rate": 1.4238505327113475e-05, + "loss": 0.7841, + "step": 3719 + }, + { + "epoch": 0.3799795709908069, + "grad_norm": 1.5226858009081776, + "learning_rate": 1.4235508632002952e-05, + "loss": 0.7644, + "step": 3720 + }, + { + "epoch": 0.3800817160367722, + "grad_norm": 1.4257253786178932, + "learning_rate": 1.4232511473313273e-05, + "loss": 0.7498, + "step": 3721 + }, + { + "epoch": 0.3801838610827375, + "grad_norm": 1.3304964414600713, + "learning_rate": 1.4229513851372479e-05, + "loss": 0.6852, + "step": 3722 + }, + { + "epoch": 0.3802860061287028, + "grad_norm": 1.4914631982178206, + "learning_rate": 1.4226515766508662e-05, + "loss": 0.7117, + "step": 3723 + }, + { + "epoch": 0.380388151174668, + "grad_norm": 1.3990165680896844, + "learning_rate": 1.4223517219049964e-05, + "loss": 0.6958, + "step": 3724 + }, + { + "epoch": 0.3804902962206333, + "grad_norm": 1.4560786132387258, + "learning_rate": 1.4220518209324574e-05, + "loss": 0.7568, + "step": 3725 + }, + { + "epoch": 0.3805924412665986, + "grad_norm": 1.5442021922467273, + "learning_rate": 1.4217518737660743e-05, + "loss": 0.7148, + "step": 3726 + }, + { + "epoch": 0.38069458631256387, + "grad_norm": 1.468354984985535, + "learning_rate": 1.4214518804386761e-05, + "loss": 0.7848, + "step": 3727 + }, + { + "epoch": 0.3807967313585291, + "grad_norm": 1.5635493779056633, + "learning_rate": 1.4211518409830973e-05, + "loss": 0.8243, + "step": 3728 + }, + { + "epoch": 0.3808988764044944, + "grad_norm": 1.564700102969133, + "learning_rate": 1.4208517554321772e-05, + "loss": 0.7712, + "step": 3729 + }, + { + "epoch": 0.38100102145045966, + "grad_norm": 1.431261789372016, + "learning_rate": 1.4205516238187606e-05, + "loss": 0.7831, + "step": 3730 + }, + { + "epoch": 0.38110316649642495, + "grad_norm": 1.7771771344287863, + "learning_rate": 1.4202514461756974e-05, + "loss": 0.7545, + "step": 3731 + }, + { + "epoch": 0.3812053115423902, + "grad_norm": 1.2728338330729665, + "learning_rate": 1.4199512225358416e-05, + "loss": 0.7209, + "step": 3732 + }, + { + "epoch": 0.38130745658835546, + "grad_norm": 1.4464226967437996, + "learning_rate": 1.419650952932053e-05, + "loss": 0.6923, + "step": 3733 + }, + { + "epoch": 0.38140960163432075, + "grad_norm": 1.605981382305307, + "learning_rate": 1.4193506373971968e-05, + "loss": 0.7107, + "step": 3734 + }, + { + "epoch": 0.381511746680286, + "grad_norm": 1.6557637613588452, + "learning_rate": 1.4190502759641422e-05, + "loss": 0.7477, + "step": 3735 + }, + { + "epoch": 0.38161389172625126, + "grad_norm": 1.4419692472749346, + "learning_rate": 1.4187498686657644e-05, + "loss": 0.7653, + "step": 3736 + }, + { + "epoch": 0.38171603677221655, + "grad_norm": 1.6109487716279298, + "learning_rate": 1.4184494155349424e-05, + "loss": 0.7467, + "step": 3737 + }, + { + "epoch": 0.38181818181818183, + "grad_norm": 1.4581239679879419, + "learning_rate": 1.4181489166045622e-05, + "loss": 0.6508, + "step": 3738 + }, + { + "epoch": 0.38192032686414706, + "grad_norm": 1.405300183414637, + "learning_rate": 1.4178483719075124e-05, + "loss": 0.697, + "step": 3739 + }, + { + "epoch": 0.38202247191011235, + "grad_norm": 1.515876877148843, + "learning_rate": 1.4175477814766888e-05, + "loss": 0.7262, + "step": 3740 + }, + { + "epoch": 0.38212461695607763, + "grad_norm": 1.4641851800028836, + "learning_rate": 1.4172471453449902e-05, + "loss": 0.7379, + "step": 3741 + }, + { + "epoch": 0.3822267620020429, + "grad_norm": 1.459256633376696, + "learning_rate": 1.4169464635453223e-05, + "loss": 0.7473, + "step": 3742 + }, + { + "epoch": 0.38232890704800815, + "grad_norm": 1.543179187681333, + "learning_rate": 1.4166457361105947e-05, + "loss": 0.6657, + "step": 3743 + }, + { + "epoch": 0.38243105209397343, + "grad_norm": 1.5318158907408823, + "learning_rate": 1.4163449630737219e-05, + "loss": 0.7223, + "step": 3744 + }, + { + "epoch": 0.3825331971399387, + "grad_norm": 1.4931845726201447, + "learning_rate": 1.4160441444676239e-05, + "loss": 0.8146, + "step": 3745 + }, + { + "epoch": 0.382635342185904, + "grad_norm": 1.373586899921609, + "learning_rate": 1.4157432803252256e-05, + "loss": 0.7108, + "step": 3746 + }, + { + "epoch": 0.38273748723186923, + "grad_norm": 1.3674300852249217, + "learning_rate": 1.4154423706794565e-05, + "loss": 0.6478, + "step": 3747 + }, + { + "epoch": 0.3828396322778345, + "grad_norm": 1.386548544355511, + "learning_rate": 1.4151414155632517e-05, + "loss": 0.6511, + "step": 3748 + }, + { + "epoch": 0.3829417773237998, + "grad_norm": 1.3586852007376644, + "learning_rate": 1.4148404150095503e-05, + "loss": 0.6245, + "step": 3749 + }, + { + "epoch": 0.3830439223697651, + "grad_norm": 1.4769244178200376, + "learning_rate": 1.414539369051298e-05, + "loss": 0.6111, + "step": 3750 + }, + { + "epoch": 0.3831460674157303, + "grad_norm": 1.6297641512463896, + "learning_rate": 1.4142382777214438e-05, + "loss": 0.7673, + "step": 3751 + }, + { + "epoch": 0.3832482124616956, + "grad_norm": 1.7546460296680568, + "learning_rate": 1.4139371410529425e-05, + "loss": 0.7863, + "step": 3752 + }, + { + "epoch": 0.3833503575076609, + "grad_norm": 1.442053202609649, + "learning_rate": 1.4136359590787534e-05, + "loss": 0.7213, + "step": 3753 + }, + { + "epoch": 0.3834525025536262, + "grad_norm": 1.6829229290533938, + "learning_rate": 1.4133347318318416e-05, + "loss": 0.7498, + "step": 3754 + }, + { + "epoch": 0.3835546475995914, + "grad_norm": 1.314189114712145, + "learning_rate": 1.4130334593451763e-05, + "loss": 0.6791, + "step": 3755 + }, + { + "epoch": 0.3836567926455567, + "grad_norm": 1.4865539650770723, + "learning_rate": 1.4127321416517319e-05, + "loss": 0.826, + "step": 3756 + }, + { + "epoch": 0.383758937691522, + "grad_norm": 1.4364678363513173, + "learning_rate": 1.4124307787844879e-05, + "loss": 0.8683, + "step": 3757 + }, + { + "epoch": 0.38386108273748726, + "grad_norm": 1.5967542217465929, + "learning_rate": 1.412129370776429e-05, + "loss": 0.765, + "step": 3758 + }, + { + "epoch": 0.3839632277834525, + "grad_norm": 1.5899493539223315, + "learning_rate": 1.4118279176605439e-05, + "loss": 0.732, + "step": 3759 + }, + { + "epoch": 0.3840653728294178, + "grad_norm": 1.5266697886543328, + "learning_rate": 1.4115264194698275e-05, + "loss": 0.7243, + "step": 3760 + }, + { + "epoch": 0.38416751787538306, + "grad_norm": 1.47361758322714, + "learning_rate": 1.4112248762372782e-05, + "loss": 0.7139, + "step": 3761 + }, + { + "epoch": 0.3842696629213483, + "grad_norm": 1.3812918779790728, + "learning_rate": 1.4109232879959008e-05, + "loss": 0.7044, + "step": 3762 + }, + { + "epoch": 0.3843718079673136, + "grad_norm": 1.5062525420602184, + "learning_rate": 1.410621654778704e-05, + "loss": 0.6855, + "step": 3763 + }, + { + "epoch": 0.38447395301327886, + "grad_norm": 1.5263955181166646, + "learning_rate": 1.4103199766187015e-05, + "loss": 0.6596, + "step": 3764 + }, + { + "epoch": 0.38457609805924414, + "grad_norm": 1.4341766171056096, + "learning_rate": 1.4100182535489127e-05, + "loss": 0.6153, + "step": 3765 + }, + { + "epoch": 0.38467824310520937, + "grad_norm": 1.504410121728268, + "learning_rate": 1.409716485602361e-05, + "loss": 0.7123, + "step": 3766 + }, + { + "epoch": 0.38478038815117466, + "grad_norm": 1.5097753849551514, + "learning_rate": 1.4094146728120755e-05, + "loss": 0.6825, + "step": 3767 + }, + { + "epoch": 0.38488253319713994, + "grad_norm": 1.2836896028166562, + "learning_rate": 1.4091128152110896e-05, + "loss": 0.6557, + "step": 3768 + }, + { + "epoch": 0.3849846782431052, + "grad_norm": 1.4172337011280671, + "learning_rate": 1.4088109128324412e-05, + "loss": 0.7636, + "step": 3769 + }, + { + "epoch": 0.38508682328907046, + "grad_norm": 1.4493320130582017, + "learning_rate": 1.4085089657091748e-05, + "loss": 0.7121, + "step": 3770 + }, + { + "epoch": 0.38518896833503574, + "grad_norm": 1.4293336287887488, + "learning_rate": 1.4082069738743379e-05, + "loss": 0.6615, + "step": 3771 + }, + { + "epoch": 0.385291113381001, + "grad_norm": 1.561262664771609, + "learning_rate": 1.407904937360984e-05, + "loss": 0.7523, + "step": 3772 + }, + { + "epoch": 0.3853932584269663, + "grad_norm": 1.5752742650109106, + "learning_rate": 1.4076028562021712e-05, + "loss": 0.6941, + "step": 3773 + }, + { + "epoch": 0.38549540347293154, + "grad_norm": 1.6157365084836919, + "learning_rate": 1.4073007304309625e-05, + "loss": 0.7265, + "step": 3774 + }, + { + "epoch": 0.3855975485188968, + "grad_norm": 1.2843072739037762, + "learning_rate": 1.4069985600804259e-05, + "loss": 0.6216, + "step": 3775 + }, + { + "epoch": 0.3856996935648621, + "grad_norm": 1.3638624381058624, + "learning_rate": 1.4066963451836336e-05, + "loss": 0.7802, + "step": 3776 + }, + { + "epoch": 0.3858018386108274, + "grad_norm": 1.4913842153789156, + "learning_rate": 1.4063940857736635e-05, + "loss": 0.67, + "step": 3777 + }, + { + "epoch": 0.3859039836567926, + "grad_norm": 1.4256038551502161, + "learning_rate": 1.4060917818835984e-05, + "loss": 0.682, + "step": 3778 + }, + { + "epoch": 0.3860061287027579, + "grad_norm": 1.3237038158235426, + "learning_rate": 1.4057894335465254e-05, + "loss": 0.6387, + "step": 3779 + }, + { + "epoch": 0.3861082737487232, + "grad_norm": 1.5701135455518098, + "learning_rate": 1.4054870407955368e-05, + "loss": 0.7722, + "step": 3780 + }, + { + "epoch": 0.3862104187946885, + "grad_norm": 1.3586204222088831, + "learning_rate": 1.4051846036637291e-05, + "loss": 0.718, + "step": 3781 + }, + { + "epoch": 0.3863125638406537, + "grad_norm": 1.4477910154981761, + "learning_rate": 1.4048821221842053e-05, + "loss": 0.6886, + "step": 3782 + }, + { + "epoch": 0.386414708886619, + "grad_norm": 1.3975691930435843, + "learning_rate": 1.4045795963900712e-05, + "loss": 0.7267, + "step": 3783 + }, + { + "epoch": 0.3865168539325843, + "grad_norm": 1.5426801824516347, + "learning_rate": 1.4042770263144394e-05, + "loss": 0.7604, + "step": 3784 + }, + { + "epoch": 0.38661899897854957, + "grad_norm": 1.3444052619075637, + "learning_rate": 1.4039744119904255e-05, + "loss": 0.6736, + "step": 3785 + }, + { + "epoch": 0.3867211440245148, + "grad_norm": 1.4464471907102965, + "learning_rate": 1.4036717534511512e-05, + "loss": 0.7979, + "step": 3786 + }, + { + "epoch": 0.3868232890704801, + "grad_norm": 1.5506339656209958, + "learning_rate": 1.4033690507297431e-05, + "loss": 0.7148, + "step": 3787 + }, + { + "epoch": 0.38692543411644537, + "grad_norm": 1.3616792913693565, + "learning_rate": 1.4030663038593313e-05, + "loss": 0.5903, + "step": 3788 + }, + { + "epoch": 0.3870275791624106, + "grad_norm": 1.477234958466394, + "learning_rate": 1.4027635128730524e-05, + "loss": 0.7175, + "step": 3789 + }, + { + "epoch": 0.3871297242083759, + "grad_norm": 1.483024065976932, + "learning_rate": 1.4024606778040468e-05, + "loss": 0.8525, + "step": 3790 + }, + { + "epoch": 0.38723186925434117, + "grad_norm": 1.4399692570865557, + "learning_rate": 1.4021577986854597e-05, + "loss": 0.6785, + "step": 3791 + }, + { + "epoch": 0.38733401430030645, + "grad_norm": 1.543952091241829, + "learning_rate": 1.401854875550442e-05, + "loss": 0.7644, + "step": 3792 + }, + { + "epoch": 0.3874361593462717, + "grad_norm": 1.367701794259396, + "learning_rate": 1.4015519084321483e-05, + "loss": 0.6241, + "step": 3793 + }, + { + "epoch": 0.38753830439223697, + "grad_norm": 1.4167830025298678, + "learning_rate": 1.401248897363739e-05, + "loss": 0.7812, + "step": 3794 + }, + { + "epoch": 0.38764044943820225, + "grad_norm": 1.5361223845566874, + "learning_rate": 1.4009458423783786e-05, + "loss": 0.7346, + "step": 3795 + }, + { + "epoch": 0.38774259448416754, + "grad_norm": 1.4929830568668665, + "learning_rate": 1.4006427435092367e-05, + "loss": 0.8091, + "step": 3796 + }, + { + "epoch": 0.38784473953013276, + "grad_norm": 1.4505281031501869, + "learning_rate": 1.4003396007894877e-05, + "loss": 0.6086, + "step": 3797 + }, + { + "epoch": 0.38794688457609805, + "grad_norm": 1.3364063002693791, + "learning_rate": 1.4000364142523104e-05, + "loss": 0.5811, + "step": 3798 + }, + { + "epoch": 0.38804902962206334, + "grad_norm": 1.471310892213482, + "learning_rate": 1.3997331839308897e-05, + "loss": 0.8029, + "step": 3799 + }, + { + "epoch": 0.3881511746680286, + "grad_norm": 1.4016025835588313, + "learning_rate": 1.3994299098584132e-05, + "loss": 0.6538, + "step": 3800 + }, + { + "epoch": 0.38825331971399385, + "grad_norm": 1.4803810223368623, + "learning_rate": 1.3991265920680755e-05, + "loss": 0.6896, + "step": 3801 + }, + { + "epoch": 0.38835546475995913, + "grad_norm": 1.3342000455648262, + "learning_rate": 1.3988232305930742e-05, + "loss": 0.7344, + "step": 3802 + }, + { + "epoch": 0.3884576098059244, + "grad_norm": 1.4028968859425381, + "learning_rate": 1.3985198254666123e-05, + "loss": 0.7669, + "step": 3803 + }, + { + "epoch": 0.3885597548518897, + "grad_norm": 1.4105008365856497, + "learning_rate": 1.3982163767218988e-05, + "loss": 0.7082, + "step": 3804 + }, + { + "epoch": 0.38866189989785493, + "grad_norm": 1.4716133457961391, + "learning_rate": 1.397912884392145e-05, + "loss": 0.6897, + "step": 3805 + }, + { + "epoch": 0.3887640449438202, + "grad_norm": 1.4736651135307453, + "learning_rate": 1.397609348510569e-05, + "loss": 0.6549, + "step": 3806 + }, + { + "epoch": 0.3888661899897855, + "grad_norm": 1.3449189314402574, + "learning_rate": 1.397305769110393e-05, + "loss": 0.7241, + "step": 3807 + }, + { + "epoch": 0.3889683350357508, + "grad_norm": 1.560656152061021, + "learning_rate": 1.3970021462248438e-05, + "loss": 0.685, + "step": 3808 + }, + { + "epoch": 0.389070480081716, + "grad_norm": 1.5500911803409483, + "learning_rate": 1.3966984798871533e-05, + "loss": 0.7301, + "step": 3809 + }, + { + "epoch": 0.3891726251276813, + "grad_norm": 1.5774516460888948, + "learning_rate": 1.3963947701305576e-05, + "loss": 0.8131, + "step": 3810 + }, + { + "epoch": 0.3892747701736466, + "grad_norm": 1.4057606200694768, + "learning_rate": 1.3960910169882986e-05, + "loss": 0.7497, + "step": 3811 + }, + { + "epoch": 0.3893769152196119, + "grad_norm": 1.6710326193218412, + "learning_rate": 1.3957872204936217e-05, + "loss": 0.7145, + "step": 3812 + }, + { + "epoch": 0.3894790602655771, + "grad_norm": 1.5103702633037224, + "learning_rate": 1.3954833806797777e-05, + "loss": 0.7026, + "step": 3813 + }, + { + "epoch": 0.3895812053115424, + "grad_norm": 1.4687401064318544, + "learning_rate": 1.3951794975800223e-05, + "loss": 0.7619, + "step": 3814 + }, + { + "epoch": 0.3896833503575077, + "grad_norm": 1.3630355205105802, + "learning_rate": 1.3948755712276156e-05, + "loss": 0.7237, + "step": 3815 + }, + { + "epoch": 0.38978549540347296, + "grad_norm": 1.415145336045743, + "learning_rate": 1.3945716016558227e-05, + "loss": 0.6942, + "step": 3816 + }, + { + "epoch": 0.3898876404494382, + "grad_norm": 1.492937099869908, + "learning_rate": 1.3942675888979126e-05, + "loss": 0.6405, + "step": 3817 + }, + { + "epoch": 0.3899897854954035, + "grad_norm": 1.447914265191576, + "learning_rate": 1.3939635329871606e-05, + "loss": 0.7372, + "step": 3818 + }, + { + "epoch": 0.39009193054136876, + "grad_norm": 1.582482153145002, + "learning_rate": 1.3936594339568453e-05, + "loss": 0.7709, + "step": 3819 + }, + { + "epoch": 0.390194075587334, + "grad_norm": 1.4372404652014534, + "learning_rate": 1.3933552918402504e-05, + "loss": 0.7453, + "step": 3820 + }, + { + "epoch": 0.3902962206332993, + "grad_norm": 1.5969613261130475, + "learning_rate": 1.3930511066706647e-05, + "loss": 0.6739, + "step": 3821 + }, + { + "epoch": 0.39039836567926456, + "grad_norm": 1.2944315027282458, + "learning_rate": 1.3927468784813816e-05, + "loss": 0.6752, + "step": 3822 + }, + { + "epoch": 0.39050051072522984, + "grad_norm": 1.4333652858388763, + "learning_rate": 1.3924426073056988e-05, + "loss": 0.6743, + "step": 3823 + }, + { + "epoch": 0.3906026557711951, + "grad_norm": 1.6869741803673837, + "learning_rate": 1.3921382931769193e-05, + "loss": 0.755, + "step": 3824 + }, + { + "epoch": 0.39070480081716036, + "grad_norm": 1.5869567302424308, + "learning_rate": 1.3918339361283498e-05, + "loss": 0.7461, + "step": 3825 + }, + { + "epoch": 0.39080694586312564, + "grad_norm": 1.4032276389396365, + "learning_rate": 1.391529536193303e-05, + "loss": 0.8414, + "step": 3826 + }, + { + "epoch": 0.39090909090909093, + "grad_norm": 1.5070123528417991, + "learning_rate": 1.3912250934050955e-05, + "loss": 0.7143, + "step": 3827 + }, + { + "epoch": 0.39101123595505616, + "grad_norm": 1.4976870547125534, + "learning_rate": 1.390920607797049e-05, + "loss": 0.7817, + "step": 3828 + }, + { + "epoch": 0.39111338100102144, + "grad_norm": 1.5725928830087903, + "learning_rate": 1.3906160794024892e-05, + "loss": 0.9126, + "step": 3829 + }, + { + "epoch": 0.39121552604698673, + "grad_norm": 1.5718699984987594, + "learning_rate": 1.390311508254747e-05, + "loss": 0.6704, + "step": 3830 + }, + { + "epoch": 0.391317671092952, + "grad_norm": 1.4603811231893467, + "learning_rate": 1.3900068943871585e-05, + "loss": 0.7165, + "step": 3831 + }, + { + "epoch": 0.39141981613891724, + "grad_norm": 1.7015118894806842, + "learning_rate": 1.3897022378330631e-05, + "loss": 0.8638, + "step": 3832 + }, + { + "epoch": 0.3915219611848825, + "grad_norm": 1.4157809966635893, + "learning_rate": 1.389397538625806e-05, + "loss": 0.7468, + "step": 3833 + }, + { + "epoch": 0.3916241062308478, + "grad_norm": 1.3652585344398818, + "learning_rate": 1.3890927967987368e-05, + "loss": 0.5949, + "step": 3834 + }, + { + "epoch": 0.3917262512768131, + "grad_norm": 1.3374366716300028, + "learning_rate": 1.3887880123852097e-05, + "loss": 0.7264, + "step": 3835 + }, + { + "epoch": 0.3918283963227783, + "grad_norm": 1.2945434004372032, + "learning_rate": 1.3884831854185833e-05, + "loss": 0.582, + "step": 3836 + }, + { + "epoch": 0.3919305413687436, + "grad_norm": 1.5209896190377057, + "learning_rate": 1.3881783159322212e-05, + "loss": 0.7425, + "step": 3837 + }, + { + "epoch": 0.3920326864147089, + "grad_norm": 1.4302410886237777, + "learning_rate": 1.3878734039594919e-05, + "loss": 0.7638, + "step": 3838 + }, + { + "epoch": 0.3921348314606742, + "grad_norm": 1.3544835154354662, + "learning_rate": 1.3875684495337677e-05, + "loss": 0.6926, + "step": 3839 + }, + { + "epoch": 0.3922369765066394, + "grad_norm": 1.5641842309190743, + "learning_rate": 1.3872634526884263e-05, + "loss": 0.7424, + "step": 3840 + }, + { + "epoch": 0.3923391215526047, + "grad_norm": 1.428636050913717, + "learning_rate": 1.3869584134568498e-05, + "loss": 0.675, + "step": 3841 + }, + { + "epoch": 0.39244126659857, + "grad_norm": 1.5404072261839439, + "learning_rate": 1.3866533318724251e-05, + "loss": 0.7564, + "step": 3842 + }, + { + "epoch": 0.39254341164453527, + "grad_norm": 1.5180082883217445, + "learning_rate": 1.3863482079685434e-05, + "loss": 0.7947, + "step": 3843 + }, + { + "epoch": 0.3926455566905005, + "grad_norm": 1.533991228673674, + "learning_rate": 1.3860430417786007e-05, + "loss": 0.8351, + "step": 3844 + }, + { + "epoch": 0.3927477017364658, + "grad_norm": 1.3786754232558622, + "learning_rate": 1.3857378333359974e-05, + "loss": 0.7524, + "step": 3845 + }, + { + "epoch": 0.39284984678243107, + "grad_norm": 1.4671042680694282, + "learning_rate": 1.3854325826741394e-05, + "loss": 0.6452, + "step": 3846 + }, + { + "epoch": 0.3929519918283963, + "grad_norm": 1.459733720941895, + "learning_rate": 1.385127289826436e-05, + "loss": 0.8106, + "step": 3847 + }, + { + "epoch": 0.3930541368743616, + "grad_norm": 1.419738861353782, + "learning_rate": 1.384821954826302e-05, + "loss": 0.6574, + "step": 3848 + }, + { + "epoch": 0.39315628192032687, + "grad_norm": 1.4382932929675243, + "learning_rate": 1.3845165777071563e-05, + "loss": 0.8002, + "step": 3849 + }, + { + "epoch": 0.39325842696629215, + "grad_norm": 1.498302725620065, + "learning_rate": 1.3842111585024228e-05, + "loss": 0.8133, + "step": 3850 + }, + { + "epoch": 0.3933605720122574, + "grad_norm": 1.3826833795166764, + "learning_rate": 1.3839056972455298e-05, + "loss": 0.7271, + "step": 3851 + }, + { + "epoch": 0.39346271705822267, + "grad_norm": 1.5736391347419851, + "learning_rate": 1.3836001939699103e-05, + "loss": 0.7228, + "step": 3852 + }, + { + "epoch": 0.39356486210418795, + "grad_norm": 1.4211492796325893, + "learning_rate": 1.3832946487090013e-05, + "loss": 0.7225, + "step": 3853 + }, + { + "epoch": 0.39366700715015324, + "grad_norm": 1.4849278164492081, + "learning_rate": 1.3829890614962458e-05, + "loss": 0.7395, + "step": 3854 + }, + { + "epoch": 0.39376915219611847, + "grad_norm": 1.4827839462755892, + "learning_rate": 1.3826834323650899e-05, + "loss": 0.8246, + "step": 3855 + }, + { + "epoch": 0.39387129724208375, + "grad_norm": 1.4550382409605542, + "learning_rate": 1.3823777613489853e-05, + "loss": 0.8323, + "step": 3856 + }, + { + "epoch": 0.39397344228804904, + "grad_norm": 1.2993347507634034, + "learning_rate": 1.3820720484813874e-05, + "loss": 0.6335, + "step": 3857 + }, + { + "epoch": 0.3940755873340143, + "grad_norm": 1.6127229718692306, + "learning_rate": 1.381766293795757e-05, + "loss": 0.7798, + "step": 3858 + }, + { + "epoch": 0.39417773237997955, + "grad_norm": 1.4971257082698561, + "learning_rate": 1.381460497325559e-05, + "loss": 0.8368, + "step": 3859 + }, + { + "epoch": 0.39427987742594484, + "grad_norm": 1.4936932312978695, + "learning_rate": 1.3811546591042632e-05, + "loss": 0.6884, + "step": 3860 + }, + { + "epoch": 0.3943820224719101, + "grad_norm": 1.447778982344869, + "learning_rate": 1.3808487791653438e-05, + "loss": 0.7363, + "step": 3861 + }, + { + "epoch": 0.3944841675178754, + "grad_norm": 1.4496123710529063, + "learning_rate": 1.3805428575422795e-05, + "loss": 0.6958, + "step": 3862 + }, + { + "epoch": 0.39458631256384064, + "grad_norm": 1.5031828740202973, + "learning_rate": 1.3802368942685536e-05, + "loss": 0.7102, + "step": 3863 + }, + { + "epoch": 0.3946884576098059, + "grad_norm": 1.4712468047199487, + "learning_rate": 1.3799308893776537e-05, + "loss": 0.5528, + "step": 3864 + }, + { + "epoch": 0.3947906026557712, + "grad_norm": 1.575318837743212, + "learning_rate": 1.3796248429030727e-05, + "loss": 0.8104, + "step": 3865 + }, + { + "epoch": 0.3948927477017365, + "grad_norm": 1.6240248308077432, + "learning_rate": 1.3793187548783073e-05, + "loss": 0.7764, + "step": 3866 + }, + { + "epoch": 0.3949948927477017, + "grad_norm": 1.5629759436729338, + "learning_rate": 1.379012625336859e-05, + "loss": 0.6918, + "step": 3867 + }, + { + "epoch": 0.395097037793667, + "grad_norm": 1.4860945088833404, + "learning_rate": 1.3787064543122344e-05, + "loss": 0.7773, + "step": 3868 + }, + { + "epoch": 0.3951991828396323, + "grad_norm": 1.3403475529031001, + "learning_rate": 1.3784002418379432e-05, + "loss": 0.8167, + "step": 3869 + }, + { + "epoch": 0.3953013278855976, + "grad_norm": 1.3831819229421647, + "learning_rate": 1.3780939879475013e-05, + "loss": 0.7459, + "step": 3870 + }, + { + "epoch": 0.3954034729315628, + "grad_norm": 1.6226970679988384, + "learning_rate": 1.3777876926744279e-05, + "loss": 0.851, + "step": 3871 + }, + { + "epoch": 0.3955056179775281, + "grad_norm": 1.421789867750657, + "learning_rate": 1.3774813560522477e-05, + "loss": 0.6483, + "step": 3872 + }, + { + "epoch": 0.3956077630234934, + "grad_norm": 1.3039819223380655, + "learning_rate": 1.3771749781144893e-05, + "loss": 0.6996, + "step": 3873 + }, + { + "epoch": 0.3957099080694586, + "grad_norm": 1.5443144584540511, + "learning_rate": 1.3768685588946855e-05, + "loss": 0.7145, + "step": 3874 + }, + { + "epoch": 0.3958120531154239, + "grad_norm": 1.5121372025367539, + "learning_rate": 1.3765620984263747e-05, + "loss": 0.7554, + "step": 3875 + }, + { + "epoch": 0.3959141981613892, + "grad_norm": 1.5581868135274402, + "learning_rate": 1.3762555967430988e-05, + "loss": 0.8459, + "step": 3876 + }, + { + "epoch": 0.39601634320735446, + "grad_norm": 1.4945122856245026, + "learning_rate": 1.3759490538784051e-05, + "loss": 0.6874, + "step": 3877 + }, + { + "epoch": 0.3961184882533197, + "grad_norm": 1.3293848597846276, + "learning_rate": 1.3756424698658442e-05, + "loss": 0.6224, + "step": 3878 + }, + { + "epoch": 0.396220633299285, + "grad_norm": 1.444593536172488, + "learning_rate": 1.3753358447389722e-05, + "loss": 0.7244, + "step": 3879 + }, + { + "epoch": 0.39632277834525026, + "grad_norm": 1.4577429935459794, + "learning_rate": 1.3750291785313498e-05, + "loss": 0.7573, + "step": 3880 + }, + { + "epoch": 0.39642492339121554, + "grad_norm": 1.4053607183030763, + "learning_rate": 1.3747224712765413e-05, + "loss": 0.7458, + "step": 3881 + }, + { + "epoch": 0.3965270684371808, + "grad_norm": 1.5280314723356028, + "learning_rate": 1.374415723008116e-05, + "loss": 0.7045, + "step": 3882 + }, + { + "epoch": 0.39662921348314606, + "grad_norm": 1.4203749531374863, + "learning_rate": 1.3741089337596485e-05, + "loss": 0.7055, + "step": 3883 + }, + { + "epoch": 0.39673135852911134, + "grad_norm": 1.511675214775165, + "learning_rate": 1.3738021035647162e-05, + "loss": 0.7812, + "step": 3884 + }, + { + "epoch": 0.39683350357507663, + "grad_norm": 1.4393952808548964, + "learning_rate": 1.3734952324569022e-05, + "loss": 0.7806, + "step": 3885 + }, + { + "epoch": 0.39693564862104186, + "grad_norm": 1.5449155181200394, + "learning_rate": 1.3731883204697933e-05, + "loss": 0.7001, + "step": 3886 + }, + { + "epoch": 0.39703779366700714, + "grad_norm": 1.4263016552807994, + "learning_rate": 1.3728813676369824e-05, + "loss": 0.6522, + "step": 3887 + }, + { + "epoch": 0.39713993871297243, + "grad_norm": 1.574472780914036, + "learning_rate": 1.3725743739920643e-05, + "loss": 0.7653, + "step": 3888 + }, + { + "epoch": 0.3972420837589377, + "grad_norm": 1.3321799224685371, + "learning_rate": 1.3722673395686403e-05, + "loss": 0.693, + "step": 3889 + }, + { + "epoch": 0.39734422880490294, + "grad_norm": 1.5680343527042515, + "learning_rate": 1.3719602644003157e-05, + "loss": 0.8382, + "step": 3890 + }, + { + "epoch": 0.39744637385086823, + "grad_norm": 1.4625928324280115, + "learning_rate": 1.3716531485206996e-05, + "loss": 0.737, + "step": 3891 + }, + { + "epoch": 0.3975485188968335, + "grad_norm": 1.377147868448086, + "learning_rate": 1.3713459919634065e-05, + "loss": 0.7215, + "step": 3892 + }, + { + "epoch": 0.3976506639427988, + "grad_norm": 1.443674178481787, + "learning_rate": 1.3710387947620545e-05, + "loss": 0.7047, + "step": 3893 + }, + { + "epoch": 0.39775280898876403, + "grad_norm": 1.3803633943725935, + "learning_rate": 1.3707315569502666e-05, + "loss": 0.6387, + "step": 3894 + }, + { + "epoch": 0.3978549540347293, + "grad_norm": 1.4414392107718101, + "learning_rate": 1.3704242785616706e-05, + "loss": 0.8225, + "step": 3895 + }, + { + "epoch": 0.3979570990806946, + "grad_norm": 1.3682923726441567, + "learning_rate": 1.3701169596298978e-05, + "loss": 0.624, + "step": 3896 + }, + { + "epoch": 0.3980592441266599, + "grad_norm": 1.4218465584577573, + "learning_rate": 1.3698096001885847e-05, + "loss": 0.7508, + "step": 3897 + }, + { + "epoch": 0.3981613891726251, + "grad_norm": 1.4811466445923365, + "learning_rate": 1.3695022002713718e-05, + "loss": 0.7379, + "step": 3898 + }, + { + "epoch": 0.3982635342185904, + "grad_norm": 1.405100123345751, + "learning_rate": 1.3691947599119045e-05, + "loss": 0.7282, + "step": 3899 + }, + { + "epoch": 0.3983656792645557, + "grad_norm": 1.3975491629649694, + "learning_rate": 1.3688872791438321e-05, + "loss": 0.7171, + "step": 3900 + }, + { + "epoch": 0.3984678243105209, + "grad_norm": 1.528797309142376, + "learning_rate": 1.368579758000809e-05, + "loss": 0.6896, + "step": 3901 + }, + { + "epoch": 0.3985699693564862, + "grad_norm": 1.511922383903831, + "learning_rate": 1.3682721965164927e-05, + "loss": 0.7519, + "step": 3902 + }, + { + "epoch": 0.3986721144024515, + "grad_norm": 1.5482907535340173, + "learning_rate": 1.3679645947245468e-05, + "loss": 0.7152, + "step": 3903 + }, + { + "epoch": 0.39877425944841677, + "grad_norm": 1.4637116582041945, + "learning_rate": 1.3676569526586383e-05, + "loss": 0.7795, + "step": 3904 + }, + { + "epoch": 0.398876404494382, + "grad_norm": 1.6253152936574335, + "learning_rate": 1.3673492703524387e-05, + "loss": 0.7342, + "step": 3905 + }, + { + "epoch": 0.3989785495403473, + "grad_norm": 1.4816691856353705, + "learning_rate": 1.3670415478396241e-05, + "loss": 0.6209, + "step": 3906 + }, + { + "epoch": 0.39908069458631257, + "grad_norm": 1.604340339585944, + "learning_rate": 1.3667337851538753e-05, + "loss": 0.7176, + "step": 3907 + }, + { + "epoch": 0.39918283963227785, + "grad_norm": 1.4886907759318158, + "learning_rate": 1.3664259823288764e-05, + "loss": 0.6721, + "step": 3908 + }, + { + "epoch": 0.3992849846782431, + "grad_norm": 1.4849185479657037, + "learning_rate": 1.3661181393983171e-05, + "loss": 0.7908, + "step": 3909 + }, + { + "epoch": 0.39938712972420837, + "grad_norm": 1.5228324369368267, + "learning_rate": 1.365810256395891e-05, + "loss": 0.6371, + "step": 3910 + }, + { + "epoch": 0.39948927477017365, + "grad_norm": 1.360855616566488, + "learning_rate": 1.3655023333552957e-05, + "loss": 0.7231, + "step": 3911 + }, + { + "epoch": 0.39959141981613894, + "grad_norm": 1.385931559379355, + "learning_rate": 1.3651943703102344e-05, + "loss": 0.7833, + "step": 3912 + }, + { + "epoch": 0.39969356486210417, + "grad_norm": 1.5691124569216457, + "learning_rate": 1.3648863672944129e-05, + "loss": 0.846, + "step": 3913 + }, + { + "epoch": 0.39979570990806945, + "grad_norm": 1.5164288859050117, + "learning_rate": 1.3645783243415427e-05, + "loss": 0.6665, + "step": 3914 + }, + { + "epoch": 0.39989785495403474, + "grad_norm": 1.4868281796022975, + "learning_rate": 1.3642702414853395e-05, + "loss": 0.6937, + "step": 3915 + }, + { + "epoch": 0.4, + "grad_norm": 1.5487084566709683, + "learning_rate": 1.3639621187595231e-05, + "loss": 0.656, + "step": 3916 + }, + { + "epoch": 0.40010214504596525, + "grad_norm": 1.3863966900893654, + "learning_rate": 1.3636539561978177e-05, + "loss": 0.704, + "step": 3917 + }, + { + "epoch": 0.40020429009193054, + "grad_norm": 1.35704258014032, + "learning_rate": 1.3633457538339514e-05, + "loss": 0.7869, + "step": 3918 + }, + { + "epoch": 0.4003064351378958, + "grad_norm": 1.4464960838959504, + "learning_rate": 1.3630375117016581e-05, + "loss": 0.8135, + "step": 3919 + }, + { + "epoch": 0.4004085801838611, + "grad_norm": 1.4228264552800438, + "learning_rate": 1.3627292298346745e-05, + "loss": 0.7292, + "step": 3920 + }, + { + "epoch": 0.40051072522982634, + "grad_norm": 1.458528323337574, + "learning_rate": 1.3624209082667421e-05, + "loss": 0.7324, + "step": 3921 + }, + { + "epoch": 0.4006128702757916, + "grad_norm": 1.7129037280627757, + "learning_rate": 1.3621125470316075e-05, + "loss": 0.7734, + "step": 3922 + }, + { + "epoch": 0.4007150153217569, + "grad_norm": 1.5492497224615298, + "learning_rate": 1.3618041461630203e-05, + "loss": 0.7483, + "step": 3923 + }, + { + "epoch": 0.4008171603677222, + "grad_norm": 1.4286927839640557, + "learning_rate": 1.3614957056947358e-05, + "loss": 0.6844, + "step": 3924 + }, + { + "epoch": 0.4009193054136874, + "grad_norm": 1.3623250262125992, + "learning_rate": 1.3611872256605126e-05, + "loss": 0.7252, + "step": 3925 + }, + { + "epoch": 0.4010214504596527, + "grad_norm": 1.268783404075245, + "learning_rate": 1.3608787060941143e-05, + "loss": 0.6798, + "step": 3926 + }, + { + "epoch": 0.401123595505618, + "grad_norm": 1.3210037544716697, + "learning_rate": 1.3605701470293084e-05, + "loss": 0.7512, + "step": 3927 + }, + { + "epoch": 0.4012257405515832, + "grad_norm": 1.3830901556247912, + "learning_rate": 1.3602615484998669e-05, + "loss": 0.6789, + "step": 3928 + }, + { + "epoch": 0.4013278855975485, + "grad_norm": 1.3599463721294547, + "learning_rate": 1.3599529105395664e-05, + "loss": 0.6441, + "step": 3929 + }, + { + "epoch": 0.4014300306435138, + "grad_norm": 1.3537263893132887, + "learning_rate": 1.3596442331821868e-05, + "loss": 0.7625, + "step": 3930 + }, + { + "epoch": 0.4015321756894791, + "grad_norm": 1.4744110922660467, + "learning_rate": 1.3593355164615139e-05, + "loss": 0.755, + "step": 3931 + }, + { + "epoch": 0.4016343207354443, + "grad_norm": 1.4421667567694034, + "learning_rate": 1.3590267604113363e-05, + "loss": 0.829, + "step": 3932 + }, + { + "epoch": 0.4017364657814096, + "grad_norm": 1.4935713368118413, + "learning_rate": 1.3587179650654483e-05, + "loss": 0.6835, + "step": 3933 + }, + { + "epoch": 0.4018386108273749, + "grad_norm": 1.5524718741980919, + "learning_rate": 1.3584091304576468e-05, + "loss": 0.7731, + "step": 3934 + }, + { + "epoch": 0.40194075587334016, + "grad_norm": 1.4621447786162751, + "learning_rate": 1.3581002566217346e-05, + "loss": 0.7162, + "step": 3935 + }, + { + "epoch": 0.4020429009193054, + "grad_norm": 1.3211761698932112, + "learning_rate": 1.3577913435915179e-05, + "loss": 0.7729, + "step": 3936 + }, + { + "epoch": 0.4021450459652707, + "grad_norm": 1.347435497718406, + "learning_rate": 1.3574823914008075e-05, + "loss": 0.6075, + "step": 3937 + }, + { + "epoch": 0.40224719101123596, + "grad_norm": 1.4375418782120868, + "learning_rate": 1.3571734000834184e-05, + "loss": 0.7448, + "step": 3938 + }, + { + "epoch": 0.40234933605720125, + "grad_norm": 1.5612880733078671, + "learning_rate": 1.3568643696731701e-05, + "loss": 0.7331, + "step": 3939 + }, + { + "epoch": 0.4024514811031665, + "grad_norm": 1.2174957421342552, + "learning_rate": 1.3565553002038857e-05, + "loss": 0.6215, + "step": 3940 + }, + { + "epoch": 0.40255362614913176, + "grad_norm": 1.40596212385612, + "learning_rate": 1.3562461917093933e-05, + "loss": 0.6735, + "step": 3941 + }, + { + "epoch": 0.40265577119509705, + "grad_norm": 1.5559281528455997, + "learning_rate": 1.3559370442235248e-05, + "loss": 0.869, + "step": 3942 + }, + { + "epoch": 0.40275791624106233, + "grad_norm": 1.4389679535163094, + "learning_rate": 1.3556278577801174e-05, + "loss": 0.6565, + "step": 3943 + }, + { + "epoch": 0.40286006128702756, + "grad_norm": 1.30427387696139, + "learning_rate": 1.3553186324130113e-05, + "loss": 0.6137, + "step": 3944 + }, + { + "epoch": 0.40296220633299284, + "grad_norm": 1.5912372970477873, + "learning_rate": 1.355009368156051e-05, + "loss": 0.8153, + "step": 3945 + }, + { + "epoch": 0.40306435137895813, + "grad_norm": 1.4314962657769297, + "learning_rate": 1.354700065043086e-05, + "loss": 0.6701, + "step": 3946 + }, + { + "epoch": 0.4031664964249234, + "grad_norm": 1.3398392432440458, + "learning_rate": 1.3543907231079695e-05, + "loss": 0.6382, + "step": 3947 + }, + { + "epoch": 0.40326864147088864, + "grad_norm": 1.408398024456932, + "learning_rate": 1.3540813423845598e-05, + "loss": 0.701, + "step": 3948 + }, + { + "epoch": 0.40337078651685393, + "grad_norm": 1.430391931582551, + "learning_rate": 1.3537719229067182e-05, + "loss": 0.6059, + "step": 3949 + }, + { + "epoch": 0.4034729315628192, + "grad_norm": 1.40930955091006, + "learning_rate": 1.353462464708311e-05, + "loss": 0.6494, + "step": 3950 + }, + { + "epoch": 0.4035750766087845, + "grad_norm": 1.707247250730458, + "learning_rate": 1.353152967823209e-05, + "loss": 0.699, + "step": 3951 + }, + { + "epoch": 0.40367722165474973, + "grad_norm": 1.7545118283765881, + "learning_rate": 1.352843432285286e-05, + "loss": 0.7723, + "step": 3952 + }, + { + "epoch": 0.403779366700715, + "grad_norm": 1.2924869944767148, + "learning_rate": 1.3525338581284217e-05, + "loss": 0.6966, + "step": 3953 + }, + { + "epoch": 0.4038815117466803, + "grad_norm": 1.4809601285026897, + "learning_rate": 1.3522242453864989e-05, + "loss": 0.8494, + "step": 3954 + }, + { + "epoch": 0.40398365679264553, + "grad_norm": 1.6202538099161883, + "learning_rate": 1.3519145940934046e-05, + "loss": 0.7635, + "step": 3955 + }, + { + "epoch": 0.4040858018386108, + "grad_norm": 1.4840210725938705, + "learning_rate": 1.3516049042830309e-05, + "loss": 0.6187, + "step": 3956 + }, + { + "epoch": 0.4041879468845761, + "grad_norm": 1.4016511165760077, + "learning_rate": 1.3512951759892732e-05, + "loss": 0.6902, + "step": 3957 + }, + { + "epoch": 0.4042900919305414, + "grad_norm": 1.4490165364294925, + "learning_rate": 1.3509854092460312e-05, + "loss": 0.8288, + "step": 3958 + }, + { + "epoch": 0.4043922369765066, + "grad_norm": 1.5945364482774207, + "learning_rate": 1.3506756040872098e-05, + "loss": 0.8579, + "step": 3959 + }, + { + "epoch": 0.4044943820224719, + "grad_norm": 1.6134813830821937, + "learning_rate": 1.3503657605467169e-05, + "loss": 0.7971, + "step": 3960 + }, + { + "epoch": 0.4045965270684372, + "grad_norm": 1.4130723425582792, + "learning_rate": 1.3500558786584652e-05, + "loss": 0.7422, + "step": 3961 + }, + { + "epoch": 0.40469867211440247, + "grad_norm": 1.3897062579924135, + "learning_rate": 1.349745958456371e-05, + "loss": 0.6762, + "step": 3962 + }, + { + "epoch": 0.4048008171603677, + "grad_norm": 1.4443174499233737, + "learning_rate": 1.349435999974356e-05, + "loss": 0.7461, + "step": 3963 + }, + { + "epoch": 0.404902962206333, + "grad_norm": 1.3455979722265679, + "learning_rate": 1.349126003246345e-05, + "loss": 0.6902, + "step": 3964 + }, + { + "epoch": 0.40500510725229827, + "grad_norm": 1.4714956855903425, + "learning_rate": 1.3488159683062676e-05, + "loss": 0.792, + "step": 3965 + }, + { + "epoch": 0.40510725229826355, + "grad_norm": 1.3750750688392048, + "learning_rate": 1.3485058951880567e-05, + "loss": 0.7326, + "step": 3966 + }, + { + "epoch": 0.4052093973442288, + "grad_norm": 1.4790583048513137, + "learning_rate": 1.3481957839256507e-05, + "loss": 0.6843, + "step": 3967 + }, + { + "epoch": 0.40531154239019407, + "grad_norm": 1.5837593203258813, + "learning_rate": 1.3478856345529912e-05, + "loss": 0.8519, + "step": 3968 + }, + { + "epoch": 0.40541368743615935, + "grad_norm": 1.5041052195877496, + "learning_rate": 1.3475754471040241e-05, + "loss": 0.7565, + "step": 3969 + }, + { + "epoch": 0.40551583248212464, + "grad_norm": 1.410686834330223, + "learning_rate": 1.3472652216126995e-05, + "loss": 0.6298, + "step": 3970 + }, + { + "epoch": 0.40561797752808987, + "grad_norm": 1.3890734150718758, + "learning_rate": 1.3469549581129726e-05, + "loss": 0.7079, + "step": 3971 + }, + { + "epoch": 0.40572012257405515, + "grad_norm": 1.4950766854091868, + "learning_rate": 1.3466446566388009e-05, + "loss": 0.8109, + "step": 3972 + }, + { + "epoch": 0.40582226762002044, + "grad_norm": 1.3807013259326402, + "learning_rate": 1.3463343172241481e-05, + "loss": 0.7695, + "step": 3973 + }, + { + "epoch": 0.4059244126659857, + "grad_norm": 1.6045034408931116, + "learning_rate": 1.3460239399029797e-05, + "loss": 0.8856, + "step": 3974 + }, + { + "epoch": 0.40602655771195095, + "grad_norm": 1.566836627955001, + "learning_rate": 1.3457135247092681e-05, + "loss": 0.7502, + "step": 3975 + }, + { + "epoch": 0.40612870275791624, + "grad_norm": 1.5340444028732412, + "learning_rate": 1.3454030716769877e-05, + "loss": 0.8823, + "step": 3976 + }, + { + "epoch": 0.4062308478038815, + "grad_norm": 1.7764155344383277, + "learning_rate": 1.3450925808401183e-05, + "loss": 0.7316, + "step": 3977 + }, + { + "epoch": 0.4063329928498468, + "grad_norm": 1.315757738793011, + "learning_rate": 1.3447820522326424e-05, + "loss": 0.6989, + "step": 3978 + }, + { + "epoch": 0.40643513789581204, + "grad_norm": 1.4030004981576072, + "learning_rate": 1.3444714858885483e-05, + "loss": 0.6856, + "step": 3979 + }, + { + "epoch": 0.4065372829417773, + "grad_norm": 1.3876633012636734, + "learning_rate": 1.3441608818418279e-05, + "loss": 0.6768, + "step": 3980 + }, + { + "epoch": 0.4066394279877426, + "grad_norm": 1.421451630320385, + "learning_rate": 1.3438502401264761e-05, + "loss": 0.673, + "step": 3981 + }, + { + "epoch": 0.4067415730337079, + "grad_norm": 1.4119383108502743, + "learning_rate": 1.3435395607764937e-05, + "loss": 0.6616, + "step": 3982 + }, + { + "epoch": 0.4068437180796731, + "grad_norm": 1.202245556386396, + "learning_rate": 1.3432288438258842e-05, + "loss": 0.7176, + "step": 3983 + }, + { + "epoch": 0.4069458631256384, + "grad_norm": 1.6158847333721884, + "learning_rate": 1.3429180893086563e-05, + "loss": 0.7897, + "step": 3984 + }, + { + "epoch": 0.4070480081716037, + "grad_norm": 1.5983544361641924, + "learning_rate": 1.3426072972588218e-05, + "loss": 0.7201, + "step": 3985 + }, + { + "epoch": 0.4071501532175689, + "grad_norm": 1.393940110995502, + "learning_rate": 1.3422964677103969e-05, + "loss": 0.6928, + "step": 3986 + }, + { + "epoch": 0.4072522982635342, + "grad_norm": 1.4549544872447078, + "learning_rate": 1.341985600697403e-05, + "loss": 0.824, + "step": 3987 + }, + { + "epoch": 0.4073544433094995, + "grad_norm": 1.4000378655231784, + "learning_rate": 1.341674696253864e-05, + "loss": 0.7821, + "step": 3988 + }, + { + "epoch": 0.4074565883554648, + "grad_norm": 1.318623792533593, + "learning_rate": 1.3413637544138088e-05, + "loss": 0.6568, + "step": 3989 + }, + { + "epoch": 0.40755873340143, + "grad_norm": 1.4527997464739035, + "learning_rate": 1.3410527752112699e-05, + "loss": 0.6684, + "step": 3990 + }, + { + "epoch": 0.4076608784473953, + "grad_norm": 1.423173170425724, + "learning_rate": 1.3407417586802845e-05, + "loss": 0.7722, + "step": 3991 + }, + { + "epoch": 0.4077630234933606, + "grad_norm": 1.4800233853858624, + "learning_rate": 1.3404307048548934e-05, + "loss": 0.6447, + "step": 3992 + }, + { + "epoch": 0.40786516853932586, + "grad_norm": 1.6048929331152626, + "learning_rate": 1.340119613769142e-05, + "loss": 0.8272, + "step": 3993 + }, + { + "epoch": 0.4079673135852911, + "grad_norm": 1.5216292578030324, + "learning_rate": 1.3398084854570788e-05, + "loss": 0.6717, + "step": 3994 + }, + { + "epoch": 0.4080694586312564, + "grad_norm": 1.3884734952559645, + "learning_rate": 1.3394973199527575e-05, + "loss": 0.6891, + "step": 3995 + }, + { + "epoch": 0.40817160367722166, + "grad_norm": 1.4620912276892837, + "learning_rate": 1.339186117290235e-05, + "loss": 0.711, + "step": 3996 + }, + { + "epoch": 0.40827374872318695, + "grad_norm": 1.6461523101663367, + "learning_rate": 1.3388748775035732e-05, + "loss": 0.7496, + "step": 3997 + }, + { + "epoch": 0.4083758937691522, + "grad_norm": 1.4212794504962631, + "learning_rate": 1.3385636006268367e-05, + "loss": 0.6946, + "step": 3998 + }, + { + "epoch": 0.40847803881511746, + "grad_norm": 1.5977174117370683, + "learning_rate": 1.3382522866940955e-05, + "loss": 0.7254, + "step": 3999 + }, + { + "epoch": 0.40858018386108275, + "grad_norm": 1.4268446372800223, + "learning_rate": 1.3379409357394231e-05, + "loss": 0.6467, + "step": 4000 + }, + { + "epoch": 0.40868232890704803, + "grad_norm": 1.3955436396136944, + "learning_rate": 1.3376295477968968e-05, + "loss": 0.7009, + "step": 4001 + }, + { + "epoch": 0.40878447395301326, + "grad_norm": 1.4970253741797461, + "learning_rate": 1.3373181229005985e-05, + "loss": 0.7233, + "step": 4002 + }, + { + "epoch": 0.40888661899897855, + "grad_norm": 1.6427893487077017, + "learning_rate": 1.3370066610846136e-05, + "loss": 0.7567, + "step": 4003 + }, + { + "epoch": 0.40898876404494383, + "grad_norm": 1.6248064347821283, + "learning_rate": 1.336695162383032e-05, + "loss": 0.8051, + "step": 4004 + }, + { + "epoch": 0.4090909090909091, + "grad_norm": 1.2635339919605855, + "learning_rate": 1.3363836268299472e-05, + "loss": 0.6449, + "step": 4005 + }, + { + "epoch": 0.40919305413687435, + "grad_norm": 1.5407783019862278, + "learning_rate": 1.3360720544594572e-05, + "loss": 0.7464, + "step": 4006 + }, + { + "epoch": 0.40929519918283963, + "grad_norm": 1.429155086244256, + "learning_rate": 1.3357604453056636e-05, + "loss": 0.8006, + "step": 4007 + }, + { + "epoch": 0.4093973442288049, + "grad_norm": 1.4740954139759064, + "learning_rate": 1.3354487994026726e-05, + "loss": 0.6859, + "step": 4008 + }, + { + "epoch": 0.4094994892747702, + "grad_norm": 1.4983981257284926, + "learning_rate": 1.3351371167845938e-05, + "loss": 0.6519, + "step": 4009 + }, + { + "epoch": 0.40960163432073543, + "grad_norm": 1.4467496037837162, + "learning_rate": 1.3348253974855407e-05, + "loss": 0.6773, + "step": 4010 + }, + { + "epoch": 0.4097037793667007, + "grad_norm": 1.464234118623483, + "learning_rate": 1.3345136415396317e-05, + "loss": 0.7006, + "step": 4011 + }, + { + "epoch": 0.409805924412666, + "grad_norm": 1.617996398857369, + "learning_rate": 1.3342018489809885e-05, + "loss": 0.7233, + "step": 4012 + }, + { + "epoch": 0.40990806945863123, + "grad_norm": 1.485882012502083, + "learning_rate": 1.333890019843737e-05, + "loss": 0.6574, + "step": 4013 + }, + { + "epoch": 0.4100102145045965, + "grad_norm": 1.3351104660967639, + "learning_rate": 1.333578154162007e-05, + "loss": 0.666, + "step": 4014 + }, + { + "epoch": 0.4101123595505618, + "grad_norm": 1.4480156211530573, + "learning_rate": 1.3332662519699326e-05, + "loss": 0.7046, + "step": 4015 + }, + { + "epoch": 0.4102145045965271, + "grad_norm": 1.4492612273184573, + "learning_rate": 1.3329543133016519e-05, + "loss": 0.7284, + "step": 4016 + }, + { + "epoch": 0.4103166496424923, + "grad_norm": 1.6242313864696112, + "learning_rate": 1.3326423381913061e-05, + "loss": 0.7703, + "step": 4017 + }, + { + "epoch": 0.4104187946884576, + "grad_norm": 1.5514048505913542, + "learning_rate": 1.3323303266730414e-05, + "loss": 0.6317, + "step": 4018 + }, + { + "epoch": 0.4105209397344229, + "grad_norm": 1.369421139176379, + "learning_rate": 1.3320182787810081e-05, + "loss": 0.7265, + "step": 4019 + }, + { + "epoch": 0.41062308478038817, + "grad_norm": 1.4230252208871446, + "learning_rate": 1.3317061945493595e-05, + "loss": 0.7471, + "step": 4020 + }, + { + "epoch": 0.4107252298263534, + "grad_norm": 1.704884280639212, + "learning_rate": 1.3313940740122535e-05, + "loss": 0.7548, + "step": 4021 + }, + { + "epoch": 0.4108273748723187, + "grad_norm": 1.4497886501886585, + "learning_rate": 1.331081917203852e-05, + "loss": 0.7968, + "step": 4022 + }, + { + "epoch": 0.41092951991828397, + "grad_norm": 1.574166629314942, + "learning_rate": 1.3307697241583209e-05, + "loss": 0.683, + "step": 4023 + }, + { + "epoch": 0.41103166496424925, + "grad_norm": 1.4704711616671917, + "learning_rate": 1.3304574949098298e-05, + "loss": 0.7706, + "step": 4024 + }, + { + "epoch": 0.4111338100102145, + "grad_norm": 1.4982989974075311, + "learning_rate": 1.3301452294925524e-05, + "loss": 0.6384, + "step": 4025 + }, + { + "epoch": 0.41123595505617977, + "grad_norm": 1.5614672109440955, + "learning_rate": 1.329832927940666e-05, + "loss": 0.809, + "step": 4026 + }, + { + "epoch": 0.41133810010214505, + "grad_norm": 1.517092556989473, + "learning_rate": 1.329520590288353e-05, + "loss": 0.7272, + "step": 4027 + }, + { + "epoch": 0.41144024514811034, + "grad_norm": 1.3980918942386242, + "learning_rate": 1.3292082165697981e-05, + "loss": 0.6642, + "step": 4028 + }, + { + "epoch": 0.41154239019407557, + "grad_norm": 1.4862855714792382, + "learning_rate": 1.3288958068191915e-05, + "loss": 0.6683, + "step": 4029 + }, + { + "epoch": 0.41164453524004085, + "grad_norm": 1.4313906118644868, + "learning_rate": 1.328583361070726e-05, + "loss": 0.7199, + "step": 4030 + }, + { + "epoch": 0.41174668028600614, + "grad_norm": 1.4230495103718672, + "learning_rate": 1.3282708793585996e-05, + "loss": 0.7442, + "step": 4031 + }, + { + "epoch": 0.4118488253319714, + "grad_norm": 1.5117735105782308, + "learning_rate": 1.3279583617170136e-05, + "loss": 0.7102, + "step": 4032 + }, + { + "epoch": 0.41195097037793665, + "grad_norm": 1.637452319174453, + "learning_rate": 1.3276458081801727e-05, + "loss": 0.8491, + "step": 4033 + }, + { + "epoch": 0.41205311542390194, + "grad_norm": 1.4813784444032168, + "learning_rate": 1.3273332187822862e-05, + "loss": 0.8105, + "step": 4034 + }, + { + "epoch": 0.4121552604698672, + "grad_norm": 1.5295130979732519, + "learning_rate": 1.3270205935575677e-05, + "loss": 0.8018, + "step": 4035 + }, + { + "epoch": 0.4122574055158325, + "grad_norm": 1.4692277187154772, + "learning_rate": 1.3267079325402341e-05, + "loss": 0.8603, + "step": 4036 + }, + { + "epoch": 0.41235955056179774, + "grad_norm": 1.4965906261863555, + "learning_rate": 1.326395235764506e-05, + "loss": 0.7362, + "step": 4037 + }, + { + "epoch": 0.412461695607763, + "grad_norm": 1.6077418685398024, + "learning_rate": 1.3260825032646083e-05, + "loss": 0.78, + "step": 4038 + }, + { + "epoch": 0.4125638406537283, + "grad_norm": 1.5059544814678323, + "learning_rate": 1.3257697350747702e-05, + "loss": 0.7208, + "step": 4039 + }, + { + "epoch": 0.41266598569969354, + "grad_norm": 1.6701202522057155, + "learning_rate": 1.325456931229224e-05, + "loss": 0.7448, + "step": 4040 + }, + { + "epoch": 0.4127681307456588, + "grad_norm": 1.4841410210905146, + "learning_rate": 1.3251440917622067e-05, + "loss": 0.7369, + "step": 4041 + }, + { + "epoch": 0.4128702757916241, + "grad_norm": 1.5065484493022716, + "learning_rate": 1.3248312167079583e-05, + "loss": 0.6968, + "step": 4042 + }, + { + "epoch": 0.4129724208375894, + "grad_norm": 1.4774057288347526, + "learning_rate": 1.324518306100723e-05, + "loss": 0.7776, + "step": 4043 + }, + { + "epoch": 0.4130745658835546, + "grad_norm": 1.5848087520484264, + "learning_rate": 1.32420535997475e-05, + "loss": 0.8316, + "step": 4044 + }, + { + "epoch": 0.4131767109295199, + "grad_norm": 1.4537341024289954, + "learning_rate": 1.3238923783642905e-05, + "loss": 0.7916, + "step": 4045 + }, + { + "epoch": 0.4132788559754852, + "grad_norm": 1.505996164577858, + "learning_rate": 1.323579361303601e-05, + "loss": 0.6997, + "step": 4046 + }, + { + "epoch": 0.4133810010214505, + "grad_norm": 1.525822819697486, + "learning_rate": 1.3232663088269414e-05, + "loss": 0.6872, + "step": 4047 + }, + { + "epoch": 0.4134831460674157, + "grad_norm": 1.4401478573283029, + "learning_rate": 1.3229532209685756e-05, + "loss": 0.6956, + "step": 4048 + }, + { + "epoch": 0.413585291113381, + "grad_norm": 1.3974748623576538, + "learning_rate": 1.3226400977627709e-05, + "loss": 0.702, + "step": 4049 + }, + { + "epoch": 0.4136874361593463, + "grad_norm": 2.4503745115723503, + "learning_rate": 1.3223269392437989e-05, + "loss": 0.7215, + "step": 4050 + }, + { + "epoch": 0.41378958120531156, + "grad_norm": 1.4791876318802741, + "learning_rate": 1.3220137454459357e-05, + "loss": 0.7362, + "step": 4051 + }, + { + "epoch": 0.4138917262512768, + "grad_norm": 1.5036532983298305, + "learning_rate": 1.3217005164034596e-05, + "loss": 0.8564, + "step": 4052 + }, + { + "epoch": 0.4139938712972421, + "grad_norm": 1.5496950064737762, + "learning_rate": 1.3213872521506543e-05, + "loss": 0.7661, + "step": 4053 + }, + { + "epoch": 0.41409601634320736, + "grad_norm": 1.5364254294810693, + "learning_rate": 1.3210739527218064e-05, + "loss": 0.8544, + "step": 4054 + }, + { + "epoch": 0.41419816138917265, + "grad_norm": 1.3950736600627378, + "learning_rate": 1.320760618151207e-05, + "loss": 0.6737, + "step": 4055 + }, + { + "epoch": 0.4143003064351379, + "grad_norm": 1.382637759625659, + "learning_rate": 1.3204472484731508e-05, + "loss": 0.595, + "step": 4056 + }, + { + "epoch": 0.41440245148110316, + "grad_norm": 1.4614796348015409, + "learning_rate": 1.3201338437219362e-05, + "loss": 0.7135, + "step": 4057 + }, + { + "epoch": 0.41450459652706845, + "grad_norm": 1.4128923217460214, + "learning_rate": 1.3198204039318654e-05, + "loss": 0.6091, + "step": 4058 + }, + { + "epoch": 0.41460674157303373, + "grad_norm": 1.4079889623854227, + "learning_rate": 1.3195069291372451e-05, + "loss": 0.7852, + "step": 4059 + }, + { + "epoch": 0.41470888661899896, + "grad_norm": 1.3788582400195226, + "learning_rate": 1.3191934193723848e-05, + "loss": 0.7591, + "step": 4060 + }, + { + "epoch": 0.41481103166496425, + "grad_norm": 1.4839946955713155, + "learning_rate": 1.3188798746715985e-05, + "loss": 0.784, + "step": 4061 + }, + { + "epoch": 0.41491317671092953, + "grad_norm": 1.4310828363512647, + "learning_rate": 1.3185662950692036e-05, + "loss": 0.7725, + "step": 4062 + }, + { + "epoch": 0.4150153217568948, + "grad_norm": 1.4338389284971307, + "learning_rate": 1.3182526805995223e-05, + "loss": 0.7388, + "step": 4063 + }, + { + "epoch": 0.41511746680286005, + "grad_norm": 1.333643721706044, + "learning_rate": 1.3179390312968793e-05, + "loss": 0.6143, + "step": 4064 + }, + { + "epoch": 0.41521961184882533, + "grad_norm": 1.4905184779583718, + "learning_rate": 1.3176253471956043e-05, + "loss": 0.6835, + "step": 4065 + }, + { + "epoch": 0.4153217568947906, + "grad_norm": 1.3421428373345718, + "learning_rate": 1.3173116283300293e-05, + "loss": 0.7133, + "step": 4066 + }, + { + "epoch": 0.41542390194075585, + "grad_norm": 1.4617748488641993, + "learning_rate": 1.3169978747344919e-05, + "loss": 0.6358, + "step": 4067 + }, + { + "epoch": 0.41552604698672113, + "grad_norm": 1.2958353501005206, + "learning_rate": 1.3166840864433322e-05, + "loss": 0.6483, + "step": 4068 + }, + { + "epoch": 0.4156281920326864, + "grad_norm": 1.380326311400662, + "learning_rate": 1.3163702634908946e-05, + "loss": 0.6627, + "step": 4069 + }, + { + "epoch": 0.4157303370786517, + "grad_norm": 1.4701725629732483, + "learning_rate": 1.316056405911527e-05, + "loss": 0.7819, + "step": 4070 + }, + { + "epoch": 0.41583248212461693, + "grad_norm": 1.3946076579594058, + "learning_rate": 1.315742513739582e-05, + "loss": 0.7495, + "step": 4071 + }, + { + "epoch": 0.4159346271705822, + "grad_norm": 1.3511651739021993, + "learning_rate": 1.3154285870094147e-05, + "loss": 0.6703, + "step": 4072 + }, + { + "epoch": 0.4160367722165475, + "grad_norm": 1.6026528476763535, + "learning_rate": 1.315114625755385e-05, + "loss": 0.8442, + "step": 4073 + }, + { + "epoch": 0.4161389172625128, + "grad_norm": 1.5981961137947693, + "learning_rate": 1.3148006300118554e-05, + "loss": 0.7094, + "step": 4074 + }, + { + "epoch": 0.416241062308478, + "grad_norm": 1.491200505025334, + "learning_rate": 1.3144865998131939e-05, + "loss": 0.7596, + "step": 4075 + }, + { + "epoch": 0.4163432073544433, + "grad_norm": 1.4710762972812466, + "learning_rate": 1.3141725351937709e-05, + "loss": 0.7569, + "step": 4076 + }, + { + "epoch": 0.4164453524004086, + "grad_norm": 1.7610737542326798, + "learning_rate": 1.3138584361879607e-05, + "loss": 0.7357, + "step": 4077 + }, + { + "epoch": 0.41654749744637387, + "grad_norm": 1.5061838862324517, + "learning_rate": 1.313544302830142e-05, + "loss": 0.7304, + "step": 4078 + }, + { + "epoch": 0.4166496424923391, + "grad_norm": 1.6595599660071645, + "learning_rate": 1.3132301351546968e-05, + "loss": 0.6979, + "step": 4079 + }, + { + "epoch": 0.4167517875383044, + "grad_norm": 1.3876253947644883, + "learning_rate": 1.3129159331960109e-05, + "loss": 0.5958, + "step": 4080 + }, + { + "epoch": 0.41685393258426967, + "grad_norm": 1.494077824775189, + "learning_rate": 1.3126016969884739e-05, + "loss": 0.7525, + "step": 4081 + }, + { + "epoch": 0.41695607763023496, + "grad_norm": 1.359325684243245, + "learning_rate": 1.312287426566479e-05, + "loss": 0.6103, + "step": 4082 + }, + { + "epoch": 0.4170582226762002, + "grad_norm": 1.2950168283734158, + "learning_rate": 1.3119731219644238e-05, + "loss": 0.6323, + "step": 4083 + }, + { + "epoch": 0.41716036772216547, + "grad_norm": 1.5200803771793918, + "learning_rate": 1.3116587832167089e-05, + "loss": 0.8037, + "step": 4084 + }, + { + "epoch": 0.41726251276813076, + "grad_norm": 1.397554587784112, + "learning_rate": 1.3113444103577387e-05, + "loss": 0.6801, + "step": 4085 + }, + { + "epoch": 0.41736465781409604, + "grad_norm": 1.4499611613835213, + "learning_rate": 1.3110300034219217e-05, + "loss": 0.6486, + "step": 4086 + }, + { + "epoch": 0.41746680286006127, + "grad_norm": 1.611375214389681, + "learning_rate": 1.3107155624436696e-05, + "loss": 0.7909, + "step": 4087 + }, + { + "epoch": 0.41756894790602656, + "grad_norm": 1.4177099615406614, + "learning_rate": 1.3104010874573987e-05, + "loss": 0.6209, + "step": 4088 + }, + { + "epoch": 0.41767109295199184, + "grad_norm": 1.5614550261294569, + "learning_rate": 1.3100865784975281e-05, + "loss": 0.7607, + "step": 4089 + }, + { + "epoch": 0.4177732379979571, + "grad_norm": 1.4913476704737025, + "learning_rate": 1.3097720355984812e-05, + "loss": 0.7222, + "step": 4090 + }, + { + "epoch": 0.41787538304392235, + "grad_norm": 1.2766152094934622, + "learning_rate": 1.3094574587946847e-05, + "loss": 0.6318, + "step": 4091 + }, + { + "epoch": 0.41797752808988764, + "grad_norm": 1.5814835836471746, + "learning_rate": 1.3091428481205697e-05, + "loss": 0.7422, + "step": 4092 + }, + { + "epoch": 0.4180796731358529, + "grad_norm": 1.5392623197466557, + "learning_rate": 1.3088282036105701e-05, + "loss": 0.7404, + "step": 4093 + }, + { + "epoch": 0.41818181818181815, + "grad_norm": 1.3660922183503126, + "learning_rate": 1.3085135252991238e-05, + "loss": 0.6749, + "step": 4094 + }, + { + "epoch": 0.41828396322778344, + "grad_norm": 1.500850860237259, + "learning_rate": 1.3081988132206735e-05, + "loss": 0.8192, + "step": 4095 + }, + { + "epoch": 0.4183861082737487, + "grad_norm": 1.5414610143151695, + "learning_rate": 1.3078840674096636e-05, + "loss": 0.8525, + "step": 4096 + }, + { + "epoch": 0.418488253319714, + "grad_norm": 1.6335581950671578, + "learning_rate": 1.3075692879005436e-05, + "loss": 0.7956, + "step": 4097 + }, + { + "epoch": 0.41859039836567924, + "grad_norm": 1.325549804648626, + "learning_rate": 1.3072544747277663e-05, + "loss": 0.603, + "step": 4098 + }, + { + "epoch": 0.4186925434116445, + "grad_norm": 1.4895319153991593, + "learning_rate": 1.3069396279257882e-05, + "loss": 0.7239, + "step": 4099 + }, + { + "epoch": 0.4187946884576098, + "grad_norm": 1.4306054853927743, + "learning_rate": 1.3066247475290696e-05, + "loss": 0.7257, + "step": 4100 + }, + { + "epoch": 0.4188968335035751, + "grad_norm": 1.2257462647611135, + "learning_rate": 1.3063098335720743e-05, + "loss": 0.6458, + "step": 4101 + }, + { + "epoch": 0.4189989785495403, + "grad_norm": 1.3818102701848796, + "learning_rate": 1.3059948860892696e-05, + "loss": 0.7495, + "step": 4102 + }, + { + "epoch": 0.4191011235955056, + "grad_norm": 1.5467326414267168, + "learning_rate": 1.305679905115127e-05, + "loss": 0.6762, + "step": 4103 + }, + { + "epoch": 0.4192032686414709, + "grad_norm": 1.3934748653316449, + "learning_rate": 1.3053648906841216e-05, + "loss": 0.7351, + "step": 4104 + }, + { + "epoch": 0.4193054136874362, + "grad_norm": 1.6369496700388027, + "learning_rate": 1.3050498428307315e-05, + "loss": 0.8069, + "step": 4105 + }, + { + "epoch": 0.4194075587334014, + "grad_norm": 1.4390088599382682, + "learning_rate": 1.3047347615894386e-05, + "loss": 0.8243, + "step": 4106 + }, + { + "epoch": 0.4195097037793667, + "grad_norm": 1.572644150538147, + "learning_rate": 1.3044196469947296e-05, + "loss": 0.7333, + "step": 4107 + }, + { + "epoch": 0.419611848825332, + "grad_norm": 1.456051791935575, + "learning_rate": 1.3041044990810933e-05, + "loss": 0.6258, + "step": 4108 + }, + { + "epoch": 0.41971399387129726, + "grad_norm": 1.454781171265891, + "learning_rate": 1.3037893178830234e-05, + "loss": 0.643, + "step": 4109 + }, + { + "epoch": 0.4198161389172625, + "grad_norm": 1.7481206820761466, + "learning_rate": 1.3034741034350162e-05, + "loss": 0.8228, + "step": 4110 + }, + { + "epoch": 0.4199182839632278, + "grad_norm": 1.6683891688452452, + "learning_rate": 1.3031588557715721e-05, + "loss": 0.7802, + "step": 4111 + }, + { + "epoch": 0.42002042900919306, + "grad_norm": 1.469212460184517, + "learning_rate": 1.302843574927196e-05, + "loss": 0.613, + "step": 4112 + }, + { + "epoch": 0.42012257405515835, + "grad_norm": 1.2670444782115955, + "learning_rate": 1.3025282609363943e-05, + "loss": 0.7991, + "step": 4113 + }, + { + "epoch": 0.4202247191011236, + "grad_norm": 1.4568412502466304, + "learning_rate": 1.3022129138336792e-05, + "loss": 0.7549, + "step": 4114 + }, + { + "epoch": 0.42032686414708886, + "grad_norm": 1.6850376317991482, + "learning_rate": 1.3018975336535658e-05, + "loss": 0.7966, + "step": 4115 + }, + { + "epoch": 0.42042900919305415, + "grad_norm": 1.3926599918040983, + "learning_rate": 1.3015821204305716e-05, + "loss": 0.6788, + "step": 4116 + }, + { + "epoch": 0.42053115423901943, + "grad_norm": 1.5124648792164923, + "learning_rate": 1.3012666741992202e-05, + "loss": 0.7618, + "step": 4117 + }, + { + "epoch": 0.42063329928498466, + "grad_norm": 1.325928373671755, + "learning_rate": 1.3009511949940359e-05, + "loss": 0.7059, + "step": 4118 + }, + { + "epoch": 0.42073544433094995, + "grad_norm": 1.528607455210471, + "learning_rate": 1.3006356828495495e-05, + "loss": 0.9157, + "step": 4119 + }, + { + "epoch": 0.42083758937691523, + "grad_norm": 1.3301703344120657, + "learning_rate": 1.3003201378002929e-05, + "loss": 0.7041, + "step": 4120 + }, + { + "epoch": 0.42093973442288046, + "grad_norm": 1.3850296142170697, + "learning_rate": 1.3000045598808035e-05, + "loss": 0.7068, + "step": 4121 + }, + { + "epoch": 0.42104187946884575, + "grad_norm": 1.4007606860416288, + "learning_rate": 1.299688949125621e-05, + "loss": 0.7533, + "step": 4122 + }, + { + "epoch": 0.42114402451481103, + "grad_norm": 1.5627785692444696, + "learning_rate": 1.2993733055692897e-05, + "loss": 0.7628, + "step": 4123 + }, + { + "epoch": 0.4212461695607763, + "grad_norm": 1.2754867209599365, + "learning_rate": 1.2990576292463563e-05, + "loss": 0.7234, + "step": 4124 + }, + { + "epoch": 0.42134831460674155, + "grad_norm": 1.6681084506778683, + "learning_rate": 1.2987419201913724e-05, + "loss": 0.7951, + "step": 4125 + }, + { + "epoch": 0.42145045965270683, + "grad_norm": 1.4505808235332542, + "learning_rate": 1.2984261784388923e-05, + "loss": 0.7359, + "step": 4126 + }, + { + "epoch": 0.4215526046986721, + "grad_norm": 1.4705792051374922, + "learning_rate": 1.2981104040234742e-05, + "loss": 0.7951, + "step": 4127 + }, + { + "epoch": 0.4216547497446374, + "grad_norm": 1.3701889891001708, + "learning_rate": 1.2977945969796796e-05, + "loss": 0.631, + "step": 4128 + }, + { + "epoch": 0.42175689479060263, + "grad_norm": 1.4463389266616375, + "learning_rate": 1.2974787573420744e-05, + "loss": 0.6531, + "step": 4129 + }, + { + "epoch": 0.4218590398365679, + "grad_norm": 1.4530120591992444, + "learning_rate": 1.2971628851452263e-05, + "loss": 0.7059, + "step": 4130 + }, + { + "epoch": 0.4219611848825332, + "grad_norm": 1.5219217795240638, + "learning_rate": 1.2968469804237088e-05, + "loss": 0.6238, + "step": 4131 + }, + { + "epoch": 0.4220633299284985, + "grad_norm": 1.5017364988446407, + "learning_rate": 1.2965310432120978e-05, + "loss": 0.7114, + "step": 4132 + }, + { + "epoch": 0.4221654749744637, + "grad_norm": 1.4389094641372986, + "learning_rate": 1.2962150735449724e-05, + "loss": 0.722, + "step": 4133 + }, + { + "epoch": 0.422267620020429, + "grad_norm": 1.4698535732220495, + "learning_rate": 1.2958990714569154e-05, + "loss": 0.7758, + "step": 4134 + }, + { + "epoch": 0.4223697650663943, + "grad_norm": 1.4454234873644298, + "learning_rate": 1.2955830369825141e-05, + "loss": 0.8351, + "step": 4135 + }, + { + "epoch": 0.42247191011235957, + "grad_norm": 1.344756066930458, + "learning_rate": 1.2952669701563588e-05, + "loss": 0.7998, + "step": 4136 + }, + { + "epoch": 0.4225740551583248, + "grad_norm": 1.418681941640728, + "learning_rate": 1.2949508710130423e-05, + "loss": 0.7328, + "step": 4137 + }, + { + "epoch": 0.4226762002042901, + "grad_norm": 1.4906992595098487, + "learning_rate": 1.2946347395871626e-05, + "loss": 0.7859, + "step": 4138 + }, + { + "epoch": 0.42277834525025537, + "grad_norm": 1.4599196904873275, + "learning_rate": 1.2943185759133203e-05, + "loss": 0.6149, + "step": 4139 + }, + { + "epoch": 0.42288049029622066, + "grad_norm": 1.5352293689277754, + "learning_rate": 1.2940023800261197e-05, + "loss": 0.6974, + "step": 4140 + }, + { + "epoch": 0.4229826353421859, + "grad_norm": 1.4747391792025863, + "learning_rate": 1.2936861519601689e-05, + "loss": 0.692, + "step": 4141 + }, + { + "epoch": 0.42308478038815117, + "grad_norm": 1.592182718455755, + "learning_rate": 1.2933698917500788e-05, + "loss": 0.6323, + "step": 4142 + }, + { + "epoch": 0.42318692543411646, + "grad_norm": 1.4931453626253548, + "learning_rate": 1.2930535994304643e-05, + "loss": 0.7523, + "step": 4143 + }, + { + "epoch": 0.42328907048008174, + "grad_norm": 1.5729729186806827, + "learning_rate": 1.2927372750359443e-05, + "loss": 0.7543, + "step": 4144 + }, + { + "epoch": 0.42339121552604697, + "grad_norm": 1.3550745282975585, + "learning_rate": 1.2924209186011405e-05, + "loss": 0.6482, + "step": 4145 + }, + { + "epoch": 0.42349336057201226, + "grad_norm": 1.5746811100354057, + "learning_rate": 1.2921045301606777e-05, + "loss": 0.83, + "step": 4146 + }, + { + "epoch": 0.42359550561797754, + "grad_norm": 1.4042866430321983, + "learning_rate": 1.2917881097491858e-05, + "loss": 0.7683, + "step": 4147 + }, + { + "epoch": 0.4236976506639428, + "grad_norm": 1.441740187024938, + "learning_rate": 1.2914716574012968e-05, + "loss": 0.7477, + "step": 4148 + }, + { + "epoch": 0.42379979570990806, + "grad_norm": 1.5703460896779449, + "learning_rate": 1.2911551731516467e-05, + "loss": 0.7446, + "step": 4149 + }, + { + "epoch": 0.42390194075587334, + "grad_norm": 1.4185097964926956, + "learning_rate": 1.290838657034874e-05, + "loss": 0.7464, + "step": 4150 + }, + { + "epoch": 0.4240040858018386, + "grad_norm": 1.4990641399195994, + "learning_rate": 1.2905221090856232e-05, + "loss": 0.6285, + "step": 4151 + }, + { + "epoch": 0.42410623084780386, + "grad_norm": 1.529098470574368, + "learning_rate": 1.2902055293385396e-05, + "loss": 0.8503, + "step": 4152 + }, + { + "epoch": 0.42420837589376914, + "grad_norm": 1.5725713886116188, + "learning_rate": 1.2898889178282733e-05, + "loss": 0.7078, + "step": 4153 + }, + { + "epoch": 0.4243105209397344, + "grad_norm": 1.5295258079947283, + "learning_rate": 1.2895722745894777e-05, + "loss": 0.7121, + "step": 4154 + }, + { + "epoch": 0.4244126659856997, + "grad_norm": 1.5095825940448466, + "learning_rate": 1.2892555996568094e-05, + "loss": 0.768, + "step": 4155 + }, + { + "epoch": 0.42451481103166494, + "grad_norm": 1.5112433002257941, + "learning_rate": 1.2889388930649291e-05, + "loss": 0.7579, + "step": 4156 + }, + { + "epoch": 0.4246169560776302, + "grad_norm": 1.514226228406768, + "learning_rate": 1.2886221548485e-05, + "loss": 0.7692, + "step": 4157 + }, + { + "epoch": 0.4247191011235955, + "grad_norm": 1.4757653130565505, + "learning_rate": 1.2883053850421899e-05, + "loss": 0.7661, + "step": 4158 + }, + { + "epoch": 0.4248212461695608, + "grad_norm": 1.357419997721736, + "learning_rate": 1.2879885836806689e-05, + "loss": 0.6896, + "step": 4159 + }, + { + "epoch": 0.424923391215526, + "grad_norm": 1.3972495833831715, + "learning_rate": 1.2876717507986114e-05, + "loss": 0.7992, + "step": 4160 + }, + { + "epoch": 0.4250255362614913, + "grad_norm": 1.4475067719021075, + "learning_rate": 1.287354886430695e-05, + "loss": 0.6546, + "step": 4161 + }, + { + "epoch": 0.4251276813074566, + "grad_norm": 1.5324668861592707, + "learning_rate": 1.2870379906116005e-05, + "loss": 0.8312, + "step": 4162 + }, + { + "epoch": 0.4252298263534219, + "grad_norm": 1.429368183012482, + "learning_rate": 1.2867210633760126e-05, + "loss": 0.7044, + "step": 4163 + }, + { + "epoch": 0.4253319713993871, + "grad_norm": 1.4798231709875453, + "learning_rate": 1.2864041047586189e-05, + "loss": 0.6841, + "step": 4164 + }, + { + "epoch": 0.4254341164453524, + "grad_norm": 1.5703651989985423, + "learning_rate": 1.2860871147941109e-05, + "loss": 0.6238, + "step": 4165 + }, + { + "epoch": 0.4255362614913177, + "grad_norm": 1.27730640312045, + "learning_rate": 1.2857700935171835e-05, + "loss": 0.7165, + "step": 4166 + }, + { + "epoch": 0.42563840653728297, + "grad_norm": 1.3485400667462677, + "learning_rate": 1.2854530409625346e-05, + "loss": 0.7165, + "step": 4167 + }, + { + "epoch": 0.4257405515832482, + "grad_norm": 1.5053817130801825, + "learning_rate": 1.285135957164866e-05, + "loss": 0.8391, + "step": 4168 + }, + { + "epoch": 0.4258426966292135, + "grad_norm": 1.373072070707707, + "learning_rate": 1.2848188421588827e-05, + "loss": 0.7164, + "step": 4169 + }, + { + "epoch": 0.42594484167517876, + "grad_norm": 1.5248536954527256, + "learning_rate": 1.2845016959792931e-05, + "loss": 0.7186, + "step": 4170 + }, + { + "epoch": 0.42604698672114405, + "grad_norm": 1.4049527114111642, + "learning_rate": 1.284184518660809e-05, + "loss": 0.6818, + "step": 4171 + }, + { + "epoch": 0.4261491317671093, + "grad_norm": 1.3851174270655728, + "learning_rate": 1.2838673102381458e-05, + "loss": 0.694, + "step": 4172 + }, + { + "epoch": 0.42625127681307456, + "grad_norm": 1.5562554404716402, + "learning_rate": 1.2835500707460223e-05, + "loss": 0.6424, + "step": 4173 + }, + { + "epoch": 0.42635342185903985, + "grad_norm": 1.4227628680922861, + "learning_rate": 1.2832328002191599e-05, + "loss": 0.7868, + "step": 4174 + }, + { + "epoch": 0.42645556690500513, + "grad_norm": 1.4643328018310775, + "learning_rate": 1.2829154986922847e-05, + "loss": 0.6896, + "step": 4175 + }, + { + "epoch": 0.42655771195097036, + "grad_norm": 1.4745459924297115, + "learning_rate": 1.2825981662001256e-05, + "loss": 0.7656, + "step": 4176 + }, + { + "epoch": 0.42665985699693565, + "grad_norm": 1.5029292321623076, + "learning_rate": 1.2822808027774143e-05, + "loss": 0.7277, + "step": 4177 + }, + { + "epoch": 0.42676200204290093, + "grad_norm": 1.4823195118611412, + "learning_rate": 1.281963408458887e-05, + "loss": 0.7652, + "step": 4178 + }, + { + "epoch": 0.42686414708886616, + "grad_norm": 1.3853250689794836, + "learning_rate": 1.2816459832792822e-05, + "loss": 0.75, + "step": 4179 + }, + { + "epoch": 0.42696629213483145, + "grad_norm": 1.7521923192322615, + "learning_rate": 1.2813285272733429e-05, + "loss": 0.7804, + "step": 4180 + }, + { + "epoch": 0.42706843718079673, + "grad_norm": 1.4189134851768206, + "learning_rate": 1.2810110404758143e-05, + "loss": 0.752, + "step": 4181 + }, + { + "epoch": 0.427170582226762, + "grad_norm": 1.5064516408262625, + "learning_rate": 1.2806935229214456e-05, + "loss": 0.7408, + "step": 4182 + }, + { + "epoch": 0.42727272727272725, + "grad_norm": 1.5434466837584009, + "learning_rate": 1.28037597464499e-05, + "loss": 0.7083, + "step": 4183 + }, + { + "epoch": 0.42737487231869253, + "grad_norm": 1.4315682255893107, + "learning_rate": 1.2800583956812025e-05, + "loss": 0.7168, + "step": 4184 + }, + { + "epoch": 0.4274770173646578, + "grad_norm": 1.551951380065014, + "learning_rate": 1.2797407860648427e-05, + "loss": 0.6986, + "step": 4185 + }, + { + "epoch": 0.4275791624106231, + "grad_norm": 1.4989583080589504, + "learning_rate": 1.2794231458306732e-05, + "loss": 0.8221, + "step": 4186 + }, + { + "epoch": 0.42768130745658833, + "grad_norm": 1.4211755988326256, + "learning_rate": 1.2791054750134597e-05, + "loss": 0.7164, + "step": 4187 + }, + { + "epoch": 0.4277834525025536, + "grad_norm": 1.4687098144890882, + "learning_rate": 1.2787877736479719e-05, + "loss": 0.6884, + "step": 4188 + }, + { + "epoch": 0.4278855975485189, + "grad_norm": 1.434373871437835, + "learning_rate": 1.2784700417689817e-05, + "loss": 0.706, + "step": 4189 + }, + { + "epoch": 0.4279877425944842, + "grad_norm": 1.4761514426845452, + "learning_rate": 1.2781522794112658e-05, + "loss": 0.7788, + "step": 4190 + }, + { + "epoch": 0.4280898876404494, + "grad_norm": 1.5371224217495576, + "learning_rate": 1.2778344866096032e-05, + "loss": 0.7553, + "step": 4191 + }, + { + "epoch": 0.4281920326864147, + "grad_norm": 1.505334300874722, + "learning_rate": 1.2775166633987765e-05, + "loss": 0.6602, + "step": 4192 + }, + { + "epoch": 0.42829417773238, + "grad_norm": 1.3980104338401964, + "learning_rate": 1.2771988098135719e-05, + "loss": 0.7006, + "step": 4193 + }, + { + "epoch": 0.4283963227783453, + "grad_norm": 1.383802255642878, + "learning_rate": 1.276880925888778e-05, + "loss": 0.7447, + "step": 4194 + }, + { + "epoch": 0.4284984678243105, + "grad_norm": 1.4202791704004998, + "learning_rate": 1.2765630116591884e-05, + "loss": 0.6968, + "step": 4195 + }, + { + "epoch": 0.4286006128702758, + "grad_norm": 1.4590125606068858, + "learning_rate": 1.2762450671595983e-05, + "loss": 0.7108, + "step": 4196 + }, + { + "epoch": 0.4287027579162411, + "grad_norm": 1.4020338981341036, + "learning_rate": 1.275927092424807e-05, + "loss": 0.6834, + "step": 4197 + }, + { + "epoch": 0.42880490296220636, + "grad_norm": 1.5133927724807894, + "learning_rate": 1.2756090874896171e-05, + "loss": 0.7314, + "step": 4198 + }, + { + "epoch": 0.4289070480081716, + "grad_norm": 1.4240112947773271, + "learning_rate": 1.2752910523888347e-05, + "loss": 0.6645, + "step": 4199 + }, + { + "epoch": 0.4290091930541369, + "grad_norm": 1.4354208813429918, + "learning_rate": 1.274972987157269e-05, + "loss": 0.7336, + "step": 4200 + }, + { + "epoch": 0.42911133810010216, + "grad_norm": 1.4461515262142526, + "learning_rate": 1.2746548918297318e-05, + "loss": 0.6986, + "step": 4201 + }, + { + "epoch": 0.42921348314606744, + "grad_norm": 1.4815422836040038, + "learning_rate": 1.2743367664410391e-05, + "loss": 0.7797, + "step": 4202 + }, + { + "epoch": 0.42931562819203267, + "grad_norm": 1.8068029698968613, + "learning_rate": 1.2740186110260104e-05, + "loss": 0.7516, + "step": 4203 + }, + { + "epoch": 0.42941777323799796, + "grad_norm": 1.5583473027966743, + "learning_rate": 1.2737004256194676e-05, + "loss": 0.7247, + "step": 4204 + }, + { + "epoch": 0.42951991828396324, + "grad_norm": 1.5807737796431178, + "learning_rate": 1.2733822102562366e-05, + "loss": 0.7589, + "step": 4205 + }, + { + "epoch": 0.42962206332992847, + "grad_norm": 1.6256543319508912, + "learning_rate": 1.2730639649711453e-05, + "loss": 0.7787, + "step": 4206 + }, + { + "epoch": 0.42972420837589376, + "grad_norm": 1.5856240579972127, + "learning_rate": 1.2727456897990276e-05, + "loss": 0.7561, + "step": 4207 + }, + { + "epoch": 0.42982635342185904, + "grad_norm": 1.3395595041880035, + "learning_rate": 1.2724273847747173e-05, + "loss": 0.6978, + "step": 4208 + }, + { + "epoch": 0.4299284984678243, + "grad_norm": 1.421668395737883, + "learning_rate": 1.2721090499330542e-05, + "loss": 0.7474, + "step": 4209 + }, + { + "epoch": 0.43003064351378956, + "grad_norm": 1.4186737209031755, + "learning_rate": 1.2717906853088793e-05, + "loss": 0.7714, + "step": 4210 + }, + { + "epoch": 0.43013278855975484, + "grad_norm": 1.531953079437075, + "learning_rate": 1.2714722909370383e-05, + "loss": 0.7227, + "step": 4211 + }, + { + "epoch": 0.4302349336057201, + "grad_norm": 1.541353837000716, + "learning_rate": 1.2711538668523802e-05, + "loss": 0.7264, + "step": 4212 + }, + { + "epoch": 0.4303370786516854, + "grad_norm": 1.5580233017897014, + "learning_rate": 1.2708354130897555e-05, + "loss": 0.7778, + "step": 4213 + }, + { + "epoch": 0.43043922369765064, + "grad_norm": 1.4851710871351311, + "learning_rate": 1.2705169296840203e-05, + "loss": 0.7109, + "step": 4214 + }, + { + "epoch": 0.4305413687436159, + "grad_norm": 1.383274129234747, + "learning_rate": 1.2701984166700324e-05, + "loss": 0.6398, + "step": 4215 + }, + { + "epoch": 0.4306435137895812, + "grad_norm": 1.4384865579196828, + "learning_rate": 1.2698798740826531e-05, + "loss": 0.7613, + "step": 4216 + }, + { + "epoch": 0.4307456588355465, + "grad_norm": 1.5110176748999187, + "learning_rate": 1.2695613019567472e-05, + "loss": 0.7787, + "step": 4217 + }, + { + "epoch": 0.4308478038815117, + "grad_norm": 1.490556998129876, + "learning_rate": 1.2692427003271823e-05, + "loss": 0.7777, + "step": 4218 + }, + { + "epoch": 0.430949948927477, + "grad_norm": 1.7190476961541812, + "learning_rate": 1.2689240692288305e-05, + "loss": 0.692, + "step": 4219 + }, + { + "epoch": 0.4310520939734423, + "grad_norm": 1.3351591805615528, + "learning_rate": 1.2686054086965653e-05, + "loss": 0.6479, + "step": 4220 + }, + { + "epoch": 0.4311542390194076, + "grad_norm": 1.2994211639198199, + "learning_rate": 1.2682867187652645e-05, + "loss": 0.6713, + "step": 4221 + }, + { + "epoch": 0.4312563840653728, + "grad_norm": 1.6173038652989569, + "learning_rate": 1.267967999469809e-05, + "loss": 0.7172, + "step": 4222 + }, + { + "epoch": 0.4313585291113381, + "grad_norm": 1.4013839991973194, + "learning_rate": 1.267649250845083e-05, + "loss": 0.7636, + "step": 4223 + }, + { + "epoch": 0.4314606741573034, + "grad_norm": 1.4726324122435535, + "learning_rate": 1.2673304729259737e-05, + "loss": 0.7195, + "step": 4224 + }, + { + "epoch": 0.43156281920326867, + "grad_norm": 1.305989334081915, + "learning_rate": 1.267011665747371e-05, + "loss": 0.6655, + "step": 4225 + }, + { + "epoch": 0.4316649642492339, + "grad_norm": 1.4848371949272288, + "learning_rate": 1.2666928293441692e-05, + "loss": 0.794, + "step": 4226 + }, + { + "epoch": 0.4317671092951992, + "grad_norm": 1.5279658758639154, + "learning_rate": 1.2663739637512648e-05, + "loss": 0.7679, + "step": 4227 + }, + { + "epoch": 0.43186925434116447, + "grad_norm": 1.4891591759877514, + "learning_rate": 1.2660550690035582e-05, + "loss": 0.6606, + "step": 4228 + }, + { + "epoch": 0.43197139938712975, + "grad_norm": 1.560479938170031, + "learning_rate": 1.2657361451359524e-05, + "loss": 0.7087, + "step": 4229 + }, + { + "epoch": 0.432073544433095, + "grad_norm": 1.446457484049207, + "learning_rate": 1.2654171921833536e-05, + "loss": 0.6367, + "step": 4230 + }, + { + "epoch": 0.43217568947906027, + "grad_norm": 1.4640792583203828, + "learning_rate": 1.2650982101806717e-05, + "loss": 0.7146, + "step": 4231 + }, + { + "epoch": 0.43227783452502555, + "grad_norm": 1.305597056616421, + "learning_rate": 1.2647791991628195e-05, + "loss": 0.735, + "step": 4232 + }, + { + "epoch": 0.4323799795709908, + "grad_norm": 1.6604632371241312, + "learning_rate": 1.2644601591647127e-05, + "loss": 0.7913, + "step": 4233 + }, + { + "epoch": 0.43248212461695607, + "grad_norm": 1.3623833546588544, + "learning_rate": 1.2641410902212707e-05, + "loss": 0.6871, + "step": 4234 + }, + { + "epoch": 0.43258426966292135, + "grad_norm": 1.4119739414357706, + "learning_rate": 1.2638219923674158e-05, + "loss": 0.7034, + "step": 4235 + }, + { + "epoch": 0.43268641470888664, + "grad_norm": 1.3472515115276802, + "learning_rate": 1.2635028656380735e-05, + "loss": 0.7406, + "step": 4236 + }, + { + "epoch": 0.43278855975485186, + "grad_norm": 1.4665677431941686, + "learning_rate": 1.2631837100681724e-05, + "loss": 0.7524, + "step": 4237 + }, + { + "epoch": 0.43289070480081715, + "grad_norm": 1.5849462424163931, + "learning_rate": 1.2628645256926438e-05, + "loss": 0.7419, + "step": 4238 + }, + { + "epoch": 0.43299284984678243, + "grad_norm": 1.5234466519793624, + "learning_rate": 1.262545312546423e-05, + "loss": 0.7857, + "step": 4239 + }, + { + "epoch": 0.4330949948927477, + "grad_norm": 1.4564028783733853, + "learning_rate": 1.2622260706644482e-05, + "loss": 0.6648, + "step": 4240 + }, + { + "epoch": 0.43319713993871295, + "grad_norm": 1.4957898544082597, + "learning_rate": 1.261906800081661e-05, + "loss": 0.7938, + "step": 4241 + }, + { + "epoch": 0.43329928498467823, + "grad_norm": 1.3152159561122425, + "learning_rate": 1.261587500833005e-05, + "loss": 0.7076, + "step": 4242 + }, + { + "epoch": 0.4334014300306435, + "grad_norm": 1.4533069707570487, + "learning_rate": 1.2612681729534277e-05, + "loss": 0.7397, + "step": 4243 + }, + { + "epoch": 0.4335035750766088, + "grad_norm": 1.5302892140573456, + "learning_rate": 1.2609488164778805e-05, + "loss": 0.7107, + "step": 4244 + }, + { + "epoch": 0.43360572012257403, + "grad_norm": 1.4286947601473468, + "learning_rate": 1.2606294314413169e-05, + "loss": 0.7125, + "step": 4245 + }, + { + "epoch": 0.4337078651685393, + "grad_norm": 1.5247547188056398, + "learning_rate": 1.2603100178786928e-05, + "loss": 0.7492, + "step": 4246 + }, + { + "epoch": 0.4338100102145046, + "grad_norm": 1.3594453166938771, + "learning_rate": 1.25999057582497e-05, + "loss": 0.6974, + "step": 4247 + }, + { + "epoch": 0.4339121552604699, + "grad_norm": 1.5915995566954642, + "learning_rate": 1.2596711053151103e-05, + "loss": 0.8248, + "step": 4248 + }, + { + "epoch": 0.4340143003064351, + "grad_norm": 1.4011718665018413, + "learning_rate": 1.2593516063840805e-05, + "loss": 0.6668, + "step": 4249 + }, + { + "epoch": 0.4341164453524004, + "grad_norm": 1.6427006796073262, + "learning_rate": 1.2590320790668493e-05, + "loss": 0.6919, + "step": 4250 + }, + { + "epoch": 0.4342185903983657, + "grad_norm": 1.4392778565463424, + "learning_rate": 1.25871252339839e-05, + "loss": 0.7708, + "step": 4251 + }, + { + "epoch": 0.434320735444331, + "grad_norm": 1.4867793635511608, + "learning_rate": 1.2583929394136783e-05, + "loss": 0.7478, + "step": 4252 + }, + { + "epoch": 0.4344228804902962, + "grad_norm": 1.4906834013008994, + "learning_rate": 1.258073327147692e-05, + "loss": 0.7746, + "step": 4253 + }, + { + "epoch": 0.4345250255362615, + "grad_norm": 1.5456711155769758, + "learning_rate": 1.2577536866354136e-05, + "loss": 0.6246, + "step": 4254 + }, + { + "epoch": 0.4346271705822268, + "grad_norm": 1.5987569158121842, + "learning_rate": 1.2574340179118271e-05, + "loss": 0.7534, + "step": 4255 + }, + { + "epoch": 0.43472931562819206, + "grad_norm": 1.5285868518881474, + "learning_rate": 1.2571143210119216e-05, + "loss": 0.7932, + "step": 4256 + }, + { + "epoch": 0.4348314606741573, + "grad_norm": 1.4742076762058434, + "learning_rate": 1.2567945959706873e-05, + "loss": 0.7, + "step": 4257 + }, + { + "epoch": 0.4349336057201226, + "grad_norm": 1.4508402517673071, + "learning_rate": 1.2564748428231186e-05, + "loss": 0.7187, + "step": 4258 + }, + { + "epoch": 0.43503575076608786, + "grad_norm": 1.4697177232710263, + "learning_rate": 1.2561550616042126e-05, + "loss": 0.7721, + "step": 4259 + }, + { + "epoch": 0.4351378958120531, + "grad_norm": 1.4806895140853427, + "learning_rate": 1.2558352523489696e-05, + "loss": 0.8045, + "step": 4260 + }, + { + "epoch": 0.4352400408580184, + "grad_norm": 1.4513621245151793, + "learning_rate": 1.255515415092393e-05, + "loss": 0.6891, + "step": 4261 + }, + { + "epoch": 0.43534218590398366, + "grad_norm": 1.415982889032856, + "learning_rate": 1.2551955498694893e-05, + "loss": 0.755, + "step": 4262 + }, + { + "epoch": 0.43544433094994894, + "grad_norm": 1.3861999338764512, + "learning_rate": 1.2548756567152674e-05, + "loss": 0.7952, + "step": 4263 + }, + { + "epoch": 0.4355464759959142, + "grad_norm": 1.524254702554869, + "learning_rate": 1.2545557356647405e-05, + "loss": 0.7251, + "step": 4264 + }, + { + "epoch": 0.43564862104187946, + "grad_norm": 1.6317547648668884, + "learning_rate": 1.2542357867529236e-05, + "loss": 0.7831, + "step": 4265 + }, + { + "epoch": 0.43575076608784474, + "grad_norm": 1.647448086376945, + "learning_rate": 1.2539158100148358e-05, + "loss": 0.7037, + "step": 4266 + }, + { + "epoch": 0.43585291113381003, + "grad_norm": 1.4759070532949015, + "learning_rate": 1.2535958054854984e-05, + "loss": 0.7594, + "step": 4267 + }, + { + "epoch": 0.43595505617977526, + "grad_norm": 1.2660794496718846, + "learning_rate": 1.2532757731999365e-05, + "loss": 0.6832, + "step": 4268 + }, + { + "epoch": 0.43605720122574054, + "grad_norm": 1.4173481738742841, + "learning_rate": 1.2529557131931773e-05, + "loss": 0.6088, + "step": 4269 + }, + { + "epoch": 0.43615934627170583, + "grad_norm": 1.580060537762344, + "learning_rate": 1.252635625500252e-05, + "loss": 0.7747, + "step": 4270 + }, + { + "epoch": 0.4362614913176711, + "grad_norm": 1.6104125873782262, + "learning_rate": 1.2523155101561943e-05, + "loss": 0.7463, + "step": 4271 + }, + { + "epoch": 0.43636363636363634, + "grad_norm": 1.3927999737394492, + "learning_rate": 1.2519953671960407e-05, + "loss": 0.6961, + "step": 4272 + }, + { + "epoch": 0.4364657814096016, + "grad_norm": 1.551204619477217, + "learning_rate": 1.251675196654832e-05, + "loss": 0.7178, + "step": 4273 + }, + { + "epoch": 0.4365679264555669, + "grad_norm": 1.424812835566812, + "learning_rate": 1.2513549985676098e-05, + "loss": 0.7082, + "step": 4274 + }, + { + "epoch": 0.4366700715015322, + "grad_norm": 1.5665642202278396, + "learning_rate": 1.2510347729694208e-05, + "loss": 0.686, + "step": 4275 + }, + { + "epoch": 0.4367722165474974, + "grad_norm": 1.4117778375924306, + "learning_rate": 1.2507145198953139e-05, + "loss": 0.7138, + "step": 4276 + }, + { + "epoch": 0.4368743615934627, + "grad_norm": 1.4872404928955392, + "learning_rate": 1.2503942393803405e-05, + "loss": 0.6918, + "step": 4277 + }, + { + "epoch": 0.436976506639428, + "grad_norm": 1.4129641957705419, + "learning_rate": 1.2500739314595562e-05, + "loss": 0.69, + "step": 4278 + }, + { + "epoch": 0.4370786516853933, + "grad_norm": 1.5922274940638164, + "learning_rate": 1.249753596168018e-05, + "loss": 0.7062, + "step": 4279 + }, + { + "epoch": 0.4371807967313585, + "grad_norm": 1.4689943534505607, + "learning_rate": 1.2494332335407879e-05, + "loss": 0.7371, + "step": 4280 + }, + { + "epoch": 0.4372829417773238, + "grad_norm": 1.3529574828601063, + "learning_rate": 1.2491128436129292e-05, + "loss": 0.6346, + "step": 4281 + }, + { + "epoch": 0.4373850868232891, + "grad_norm": 1.4052686012092208, + "learning_rate": 1.2487924264195084e-05, + "loss": 0.725, + "step": 4282 + }, + { + "epoch": 0.43748723186925437, + "grad_norm": 1.756972850170746, + "learning_rate": 1.248471981995596e-05, + "loss": 0.7604, + "step": 4283 + }, + { + "epoch": 0.4375893769152196, + "grad_norm": 1.5635699228613305, + "learning_rate": 1.2481515103762644e-05, + "loss": 0.8931, + "step": 4284 + }, + { + "epoch": 0.4376915219611849, + "grad_norm": 1.6618465365811392, + "learning_rate": 1.2478310115965901e-05, + "loss": 0.6541, + "step": 4285 + }, + { + "epoch": 0.43779366700715017, + "grad_norm": 1.5157149535528396, + "learning_rate": 1.2475104856916512e-05, + "loss": 0.7138, + "step": 4286 + }, + { + "epoch": 0.4378958120531154, + "grad_norm": 1.5692036851755695, + "learning_rate": 1.2471899326965298e-05, + "loss": 0.8129, + "step": 4287 + }, + { + "epoch": 0.4379979570990807, + "grad_norm": 1.5866358561429488, + "learning_rate": 1.2468693526463107e-05, + "loss": 0.749, + "step": 4288 + }, + { + "epoch": 0.43810010214504597, + "grad_norm": 1.469598574919986, + "learning_rate": 1.2465487455760811e-05, + "loss": 0.7657, + "step": 4289 + }, + { + "epoch": 0.43820224719101125, + "grad_norm": 1.4751974073668996, + "learning_rate": 1.2462281115209324e-05, + "loss": 0.701, + "step": 4290 + }, + { + "epoch": 0.4383043922369765, + "grad_norm": 1.5118602405768673, + "learning_rate": 1.2459074505159577e-05, + "loss": 0.7591, + "step": 4291 + }, + { + "epoch": 0.43840653728294177, + "grad_norm": 1.386400343417803, + "learning_rate": 1.2455867625962534e-05, + "loss": 0.7637, + "step": 4292 + }, + { + "epoch": 0.43850868232890705, + "grad_norm": 1.438132032564504, + "learning_rate": 1.2452660477969197e-05, + "loss": 0.7462, + "step": 4293 + }, + { + "epoch": 0.43861082737487234, + "grad_norm": 1.578058093343192, + "learning_rate": 1.244945306153058e-05, + "loss": 0.7451, + "step": 4294 + }, + { + "epoch": 0.43871297242083757, + "grad_norm": 1.4165549492823388, + "learning_rate": 1.2446245376997747e-05, + "loss": 0.7411, + "step": 4295 + }, + { + "epoch": 0.43881511746680285, + "grad_norm": 1.6445460778641914, + "learning_rate": 1.2443037424721775e-05, + "loss": 0.8098, + "step": 4296 + }, + { + "epoch": 0.43891726251276814, + "grad_norm": 1.5080588856632076, + "learning_rate": 1.2439829205053781e-05, + "loss": 0.6541, + "step": 4297 + }, + { + "epoch": 0.4390194075587334, + "grad_norm": 1.5216849013003038, + "learning_rate": 1.2436620718344906e-05, + "loss": 0.6985, + "step": 4298 + }, + { + "epoch": 0.43912155260469865, + "grad_norm": 1.5243592312663756, + "learning_rate": 1.2433411964946314e-05, + "loss": 0.6725, + "step": 4299 + }, + { + "epoch": 0.43922369765066394, + "grad_norm": 1.3153444437401756, + "learning_rate": 1.2430202945209213e-05, + "loss": 0.6508, + "step": 4300 + }, + { + "epoch": 0.4393258426966292, + "grad_norm": 1.322529874816043, + "learning_rate": 1.2426993659484827e-05, + "loss": 0.7185, + "step": 4301 + }, + { + "epoch": 0.4394279877425945, + "grad_norm": 1.3363266600642454, + "learning_rate": 1.2423784108124422e-05, + "loss": 0.6425, + "step": 4302 + }, + { + "epoch": 0.43953013278855974, + "grad_norm": 1.5189178699273036, + "learning_rate": 1.2420574291479275e-05, + "loss": 0.5936, + "step": 4303 + }, + { + "epoch": 0.439632277834525, + "grad_norm": 1.5499711754372096, + "learning_rate": 1.2417364209900711e-05, + "loss": 0.7395, + "step": 4304 + }, + { + "epoch": 0.4397344228804903, + "grad_norm": 1.5058191572520587, + "learning_rate": 1.2414153863740073e-05, + "loss": 0.8413, + "step": 4305 + }, + { + "epoch": 0.4398365679264556, + "grad_norm": 1.3766694861285038, + "learning_rate": 1.2410943253348733e-05, + "loss": 0.6354, + "step": 4306 + }, + { + "epoch": 0.4399387129724208, + "grad_norm": 1.512251402805886, + "learning_rate": 1.2407732379078095e-05, + "loss": 0.7682, + "step": 4307 + }, + { + "epoch": 0.4400408580183861, + "grad_norm": 1.2428102591146084, + "learning_rate": 1.2404521241279595e-05, + "loss": 0.6779, + "step": 4308 + }, + { + "epoch": 0.4401430030643514, + "grad_norm": 1.7695786688638393, + "learning_rate": 1.2401309840304689e-05, + "loss": 0.8618, + "step": 4309 + }, + { + "epoch": 0.4402451481103167, + "grad_norm": 1.4813845762135314, + "learning_rate": 1.2398098176504873e-05, + "loss": 0.7545, + "step": 4310 + }, + { + "epoch": 0.4403472931562819, + "grad_norm": 1.4063653331311956, + "learning_rate": 1.239488625023166e-05, + "loss": 0.622, + "step": 4311 + }, + { + "epoch": 0.4404494382022472, + "grad_norm": 1.3263698740502583, + "learning_rate": 1.2391674061836601e-05, + "loss": 0.6706, + "step": 4312 + }, + { + "epoch": 0.4405515832482125, + "grad_norm": 1.4644829744161356, + "learning_rate": 1.238846161167127e-05, + "loss": 0.6802, + "step": 4313 + }, + { + "epoch": 0.44065372829417776, + "grad_norm": 1.4676019613216038, + "learning_rate": 1.2385248900087272e-05, + "loss": 0.7248, + "step": 4314 + }, + { + "epoch": 0.440755873340143, + "grad_norm": 1.4555216495576928, + "learning_rate": 1.2382035927436242e-05, + "loss": 0.6482, + "step": 4315 + }, + { + "epoch": 0.4408580183861083, + "grad_norm": 1.542980701200706, + "learning_rate": 1.2378822694069838e-05, + "loss": 0.8055, + "step": 4316 + }, + { + "epoch": 0.44096016343207356, + "grad_norm": 1.5247808891023396, + "learning_rate": 1.2375609200339757e-05, + "loss": 0.6696, + "step": 4317 + }, + { + "epoch": 0.4410623084780388, + "grad_norm": 1.4698196529612217, + "learning_rate": 1.2372395446597711e-05, + "loss": 0.8496, + "step": 4318 + }, + { + "epoch": 0.4411644535240041, + "grad_norm": 1.3867930671224251, + "learning_rate": 1.2369181433195451e-05, + "loss": 0.7064, + "step": 4319 + }, + { + "epoch": 0.44126659856996936, + "grad_norm": 1.633360646761823, + "learning_rate": 1.2365967160484755e-05, + "loss": 0.8671, + "step": 4320 + }, + { + "epoch": 0.44136874361593464, + "grad_norm": 1.5241977124885135, + "learning_rate": 1.2362752628817423e-05, + "loss": 0.7087, + "step": 4321 + }, + { + "epoch": 0.4414708886618999, + "grad_norm": 1.4411299567597549, + "learning_rate": 1.235953783854529e-05, + "loss": 0.6368, + "step": 4322 + }, + { + "epoch": 0.44157303370786516, + "grad_norm": 1.5709462306510522, + "learning_rate": 1.2356322790020214e-05, + "loss": 0.732, + "step": 4323 + }, + { + "epoch": 0.44167517875383044, + "grad_norm": 1.4522546422766027, + "learning_rate": 1.2353107483594089e-05, + "loss": 0.7425, + "step": 4324 + }, + { + "epoch": 0.44177732379979573, + "grad_norm": 1.4020149706744478, + "learning_rate": 1.234989191961883e-05, + "loss": 0.6809, + "step": 4325 + }, + { + "epoch": 0.44187946884576096, + "grad_norm": 1.5624484753992463, + "learning_rate": 1.234667609844638e-05, + "loss": 0.7472, + "step": 4326 + }, + { + "epoch": 0.44198161389172624, + "grad_norm": 1.5846059399937262, + "learning_rate": 1.2343460020428715e-05, + "loss": 0.7475, + "step": 4327 + }, + { + "epoch": 0.44208375893769153, + "grad_norm": 1.4182512117393002, + "learning_rate": 1.2340243685917836e-05, + "loss": 0.7199, + "step": 4328 + }, + { + "epoch": 0.4421859039836568, + "grad_norm": 1.6064853051905523, + "learning_rate": 1.2337027095265775e-05, + "loss": 0.7312, + "step": 4329 + }, + { + "epoch": 0.44228804902962204, + "grad_norm": 1.5858264427646682, + "learning_rate": 1.2333810248824588e-05, + "loss": 0.8269, + "step": 4330 + }, + { + "epoch": 0.44239019407558733, + "grad_norm": 1.6260113523230295, + "learning_rate": 1.2330593146946362e-05, + "loss": 0.7912, + "step": 4331 + }, + { + "epoch": 0.4424923391215526, + "grad_norm": 1.5447404557460256, + "learning_rate": 1.232737578998321e-05, + "loss": 0.7347, + "step": 4332 + }, + { + "epoch": 0.4425944841675179, + "grad_norm": 1.4230145103673555, + "learning_rate": 1.2324158178287274e-05, + "loss": 0.7982, + "step": 4333 + }, + { + "epoch": 0.44269662921348313, + "grad_norm": 1.4613193056367413, + "learning_rate": 1.2320940312210725e-05, + "loss": 0.6198, + "step": 4334 + }, + { + "epoch": 0.4427987742594484, + "grad_norm": 1.404732768170166, + "learning_rate": 1.2317722192105757e-05, + "loss": 0.6204, + "step": 4335 + }, + { + "epoch": 0.4429009193054137, + "grad_norm": 1.398786951959995, + "learning_rate": 1.2314503818324597e-05, + "loss": 0.6363, + "step": 4336 + }, + { + "epoch": 0.443003064351379, + "grad_norm": 1.3095174777510055, + "learning_rate": 1.2311285191219501e-05, + "loss": 0.6024, + "step": 4337 + }, + { + "epoch": 0.4431052093973442, + "grad_norm": 1.5102474032263602, + "learning_rate": 1.2308066311142747e-05, + "loss": 0.8179, + "step": 4338 + }, + { + "epoch": 0.4432073544433095, + "grad_norm": 1.426766688907201, + "learning_rate": 1.2304847178446643e-05, + "loss": 0.7161, + "step": 4339 + }, + { + "epoch": 0.4433094994892748, + "grad_norm": 1.4421464549915892, + "learning_rate": 1.2301627793483527e-05, + "loss": 0.7611, + "step": 4340 + }, + { + "epoch": 0.44341164453524007, + "grad_norm": 1.5614923544265915, + "learning_rate": 1.2298408156605763e-05, + "loss": 0.7406, + "step": 4341 + }, + { + "epoch": 0.4435137895812053, + "grad_norm": 1.578988848333134, + "learning_rate": 1.2295188268165743e-05, + "loss": 0.7601, + "step": 4342 + }, + { + "epoch": 0.4436159346271706, + "grad_norm": 1.375854058276013, + "learning_rate": 1.2291968128515878e-05, + "loss": 0.7274, + "step": 4343 + }, + { + "epoch": 0.44371807967313587, + "grad_norm": 1.4424386116312715, + "learning_rate": 1.2288747738008626e-05, + "loss": 0.7352, + "step": 4344 + }, + { + "epoch": 0.4438202247191011, + "grad_norm": 1.4568709246195262, + "learning_rate": 1.2285527096996455e-05, + "loss": 0.7474, + "step": 4345 + }, + { + "epoch": 0.4439223697650664, + "grad_norm": 1.4114088741971806, + "learning_rate": 1.2282306205831866e-05, + "loss": 0.7832, + "step": 4346 + }, + { + "epoch": 0.44402451481103167, + "grad_norm": 1.6934672679577285, + "learning_rate": 1.2279085064867387e-05, + "loss": 0.6854, + "step": 4347 + }, + { + "epoch": 0.44412665985699695, + "grad_norm": 1.3919749285446825, + "learning_rate": 1.2275863674455576e-05, + "loss": 0.7036, + "step": 4348 + }, + { + "epoch": 0.4442288049029622, + "grad_norm": 1.3783320978581106, + "learning_rate": 1.2272642034949013e-05, + "loss": 0.6925, + "step": 4349 + }, + { + "epoch": 0.44433094994892747, + "grad_norm": 1.4304731140633082, + "learning_rate": 1.2269420146700312e-05, + "loss": 0.7424, + "step": 4350 + }, + { + "epoch": 0.44443309499489275, + "grad_norm": 1.4204502808265755, + "learning_rate": 1.2266198010062112e-05, + "loss": 0.6975, + "step": 4351 + }, + { + "epoch": 0.44453524004085804, + "grad_norm": 1.5211648061553873, + "learning_rate": 1.2262975625387074e-05, + "loss": 0.7859, + "step": 4352 + }, + { + "epoch": 0.44463738508682327, + "grad_norm": 1.43845791217103, + "learning_rate": 1.2259752993027893e-05, + "loss": 0.7281, + "step": 4353 + }, + { + "epoch": 0.44473953013278855, + "grad_norm": 1.3810689485156413, + "learning_rate": 1.2256530113337287e-05, + "loss": 0.7739, + "step": 4354 + }, + { + "epoch": 0.44484167517875384, + "grad_norm": 1.4576886326680158, + "learning_rate": 1.2253306986667999e-05, + "loss": 0.7645, + "step": 4355 + }, + { + "epoch": 0.4449438202247191, + "grad_norm": 1.3302933881990837, + "learning_rate": 1.2250083613372807e-05, + "loss": 0.7041, + "step": 4356 + }, + { + "epoch": 0.44504596527068435, + "grad_norm": 1.4774879036721376, + "learning_rate": 1.224685999380451e-05, + "loss": 0.7488, + "step": 4357 + }, + { + "epoch": 0.44514811031664964, + "grad_norm": 1.4832276230968722, + "learning_rate": 1.224363612831594e-05, + "loss": 0.6589, + "step": 4358 + }, + { + "epoch": 0.4452502553626149, + "grad_norm": 1.473863136827675, + "learning_rate": 1.224041201725994e-05, + "loss": 0.7756, + "step": 4359 + }, + { + "epoch": 0.4453524004085802, + "grad_norm": 1.500343603157219, + "learning_rate": 1.2237187660989396e-05, + "loss": 0.7811, + "step": 4360 + }, + { + "epoch": 0.44545454545454544, + "grad_norm": 1.3326896825393737, + "learning_rate": 1.2233963059857222e-05, + "loss": 0.7125, + "step": 4361 + }, + { + "epoch": 0.4455566905005107, + "grad_norm": 1.5108023488563913, + "learning_rate": 1.2230738214216344e-05, + "loss": 0.6309, + "step": 4362 + }, + { + "epoch": 0.445658835546476, + "grad_norm": 1.4666688851993972, + "learning_rate": 1.222751312441973e-05, + "loss": 0.6696, + "step": 4363 + }, + { + "epoch": 0.4457609805924413, + "grad_norm": 1.5359144566587273, + "learning_rate": 1.2224287790820367e-05, + "loss": 0.7074, + "step": 4364 + }, + { + "epoch": 0.4458631256384065, + "grad_norm": 1.5298543394776472, + "learning_rate": 1.2221062213771261e-05, + "loss": 0.7, + "step": 4365 + }, + { + "epoch": 0.4459652706843718, + "grad_norm": 1.3351588777663383, + "learning_rate": 1.2217836393625469e-05, + "loss": 0.7323, + "step": 4366 + }, + { + "epoch": 0.4460674157303371, + "grad_norm": 1.3951986652096973, + "learning_rate": 1.2214610330736043e-05, + "loss": 0.8777, + "step": 4367 + }, + { + "epoch": 0.4461695607763024, + "grad_norm": 1.5284464624583667, + "learning_rate": 1.2211384025456092e-05, + "loss": 0.7218, + "step": 4368 + }, + { + "epoch": 0.4462717058222676, + "grad_norm": 1.397552943358692, + "learning_rate": 1.2208157478138728e-05, + "loss": 0.7657, + "step": 4369 + }, + { + "epoch": 0.4463738508682329, + "grad_norm": 1.4253191380478456, + "learning_rate": 1.2204930689137103e-05, + "loss": 0.7466, + "step": 4370 + }, + { + "epoch": 0.4464759959141982, + "grad_norm": 1.699449729249936, + "learning_rate": 1.2201703658804386e-05, + "loss": 0.7295, + "step": 4371 + }, + { + "epoch": 0.4465781409601634, + "grad_norm": 1.4557497957359848, + "learning_rate": 1.2198476387493783e-05, + "loss": 0.7694, + "step": 4372 + }, + { + "epoch": 0.4466802860061287, + "grad_norm": 1.5786541820164806, + "learning_rate": 1.2195248875558521e-05, + "loss": 0.7621, + "step": 4373 + }, + { + "epoch": 0.446782431052094, + "grad_norm": 1.4830492297264768, + "learning_rate": 1.2192021123351846e-05, + "loss": 0.7227, + "step": 4374 + }, + { + "epoch": 0.44688457609805926, + "grad_norm": 1.4370125807475895, + "learning_rate": 1.2188793131227049e-05, + "loss": 0.7343, + "step": 4375 + }, + { + "epoch": 0.4469867211440245, + "grad_norm": 1.4686824926980728, + "learning_rate": 1.2185564899537425e-05, + "loss": 0.735, + "step": 4376 + }, + { + "epoch": 0.4470888661899898, + "grad_norm": 1.3952056934191304, + "learning_rate": 1.2182336428636314e-05, + "loss": 0.7218, + "step": 4377 + }, + { + "epoch": 0.44719101123595506, + "grad_norm": 1.4037639559460904, + "learning_rate": 1.217910771887707e-05, + "loss": 0.6432, + "step": 4378 + }, + { + "epoch": 0.44729315628192035, + "grad_norm": 1.5182739448173013, + "learning_rate": 1.2175878770613077e-05, + "loss": 0.6754, + "step": 4379 + }, + { + "epoch": 0.4473953013278856, + "grad_norm": 1.4535938967674906, + "learning_rate": 1.2172649584197746e-05, + "loss": 0.7118, + "step": 4380 + }, + { + "epoch": 0.44749744637385086, + "grad_norm": 1.4896893676586502, + "learning_rate": 1.2169420159984517e-05, + "loss": 0.7093, + "step": 4381 + }, + { + "epoch": 0.44759959141981615, + "grad_norm": 1.321420613608895, + "learning_rate": 1.2166190498326849e-05, + "loss": 0.663, + "step": 4382 + }, + { + "epoch": 0.44770173646578143, + "grad_norm": 1.437288504541632, + "learning_rate": 1.216296059957823e-05, + "loss": 0.741, + "step": 4383 + }, + { + "epoch": 0.44780388151174666, + "grad_norm": 1.5730459794623028, + "learning_rate": 1.2159730464092176e-05, + "loss": 0.7918, + "step": 4384 + }, + { + "epoch": 0.44790602655771194, + "grad_norm": 1.4530273916342542, + "learning_rate": 1.215650009222223e-05, + "loss": 0.7308, + "step": 4385 + }, + { + "epoch": 0.44800817160367723, + "grad_norm": 1.4207751415847125, + "learning_rate": 1.2153269484321956e-05, + "loss": 0.7016, + "step": 4386 + }, + { + "epoch": 0.4481103166496425, + "grad_norm": 1.4770457015481053, + "learning_rate": 1.2150038640744942e-05, + "loss": 0.6615, + "step": 4387 + }, + { + "epoch": 0.44821246169560774, + "grad_norm": 1.2838986437172024, + "learning_rate": 1.214680756184481e-05, + "loss": 0.7217, + "step": 4388 + }, + { + "epoch": 0.44831460674157303, + "grad_norm": 1.451117080800461, + "learning_rate": 1.2143576247975207e-05, + "loss": 0.8044, + "step": 4389 + }, + { + "epoch": 0.4484167517875383, + "grad_norm": 1.516318690198013, + "learning_rate": 1.2140344699489796e-05, + "loss": 0.8021, + "step": 4390 + }, + { + "epoch": 0.4485188968335036, + "grad_norm": 1.6220569601971606, + "learning_rate": 1.2137112916742275e-05, + "loss": 0.7667, + "step": 4391 + }, + { + "epoch": 0.44862104187946883, + "grad_norm": 1.5677845537498745, + "learning_rate": 1.2133880900086364e-05, + "loss": 0.829, + "step": 4392 + }, + { + "epoch": 0.4487231869254341, + "grad_norm": 1.4765287355672096, + "learning_rate": 1.2130648649875812e-05, + "loss": 0.7262, + "step": 4393 + }, + { + "epoch": 0.4488253319713994, + "grad_norm": 1.5058052829512634, + "learning_rate": 1.2127416166464387e-05, + "loss": 0.7776, + "step": 4394 + }, + { + "epoch": 0.4489274770173647, + "grad_norm": 1.3821313019612402, + "learning_rate": 1.2124183450205886e-05, + "loss": 0.6601, + "step": 4395 + }, + { + "epoch": 0.4490296220633299, + "grad_norm": 1.4557119782506014, + "learning_rate": 1.2120950501454138e-05, + "loss": 0.6493, + "step": 4396 + }, + { + "epoch": 0.4491317671092952, + "grad_norm": 1.5664360796236632, + "learning_rate": 1.2117717320562986e-05, + "loss": 0.821, + "step": 4397 + }, + { + "epoch": 0.4492339121552605, + "grad_norm": 1.650589519160757, + "learning_rate": 1.2114483907886307e-05, + "loss": 0.7134, + "step": 4398 + }, + { + "epoch": 0.4493360572012257, + "grad_norm": 1.3733277789530847, + "learning_rate": 1.2111250263777991e-05, + "loss": 0.7191, + "step": 4399 + }, + { + "epoch": 0.449438202247191, + "grad_norm": 1.4600761141790548, + "learning_rate": 1.2108016388591976e-05, + "loss": 0.7335, + "step": 4400 + }, + { + "epoch": 0.4495403472931563, + "grad_norm": 1.4035557072562581, + "learning_rate": 1.2104782282682203e-05, + "loss": 0.8445, + "step": 4401 + }, + { + "epoch": 0.44964249233912157, + "grad_norm": 1.4187037891871126, + "learning_rate": 1.2101547946402653e-05, + "loss": 0.7001, + "step": 4402 + }, + { + "epoch": 0.4497446373850868, + "grad_norm": 1.4077158538699681, + "learning_rate": 1.2098313380107319e-05, + "loss": 0.7009, + "step": 4403 + }, + { + "epoch": 0.4498467824310521, + "grad_norm": 1.4544018778405412, + "learning_rate": 1.2095078584150228e-05, + "loss": 0.6942, + "step": 4404 + }, + { + "epoch": 0.44994892747701737, + "grad_norm": 1.4039242708460282, + "learning_rate": 1.2091843558885436e-05, + "loss": 0.6463, + "step": 4405 + }, + { + "epoch": 0.45005107252298265, + "grad_norm": 1.47201556512031, + "learning_rate": 1.2088608304667014e-05, + "loss": 0.7531, + "step": 4406 + }, + { + "epoch": 0.4501532175689479, + "grad_norm": 1.399603494788756, + "learning_rate": 1.2085372821849063e-05, + "loss": 0.7037, + "step": 4407 + }, + { + "epoch": 0.45025536261491317, + "grad_norm": 1.4921025181436893, + "learning_rate": 1.2082137110785713e-05, + "loss": 0.6334, + "step": 4408 + }, + { + "epoch": 0.45035750766087845, + "grad_norm": 1.4480639255419936, + "learning_rate": 1.2078901171831106e-05, + "loss": 0.698, + "step": 4409 + }, + { + "epoch": 0.45045965270684374, + "grad_norm": 1.4376241586795264, + "learning_rate": 1.2075665005339426e-05, + "loss": 0.6242, + "step": 4410 + }, + { + "epoch": 0.45056179775280897, + "grad_norm": 1.443282262100317, + "learning_rate": 1.2072428611664864e-05, + "loss": 0.6801, + "step": 4411 + }, + { + "epoch": 0.45066394279877425, + "grad_norm": 1.39450885496504, + "learning_rate": 1.2069191991161658e-05, + "loss": 0.7029, + "step": 4412 + }, + { + "epoch": 0.45076608784473954, + "grad_norm": 1.599144149457271, + "learning_rate": 1.2065955144184052e-05, + "loss": 0.7726, + "step": 4413 + }, + { + "epoch": 0.4508682328907048, + "grad_norm": 1.373473877396574, + "learning_rate": 1.2062718071086317e-05, + "loss": 0.6232, + "step": 4414 + }, + { + "epoch": 0.45097037793667005, + "grad_norm": 1.5095594440109863, + "learning_rate": 1.2059480772222756e-05, + "loss": 0.6713, + "step": 4415 + }, + { + "epoch": 0.45107252298263534, + "grad_norm": 1.44031859690779, + "learning_rate": 1.2056243247947697e-05, + "loss": 0.6613, + "step": 4416 + }, + { + "epoch": 0.4511746680286006, + "grad_norm": 1.3256904663519213, + "learning_rate": 1.2053005498615484e-05, + "loss": 0.753, + "step": 4417 + }, + { + "epoch": 0.4512768130745659, + "grad_norm": 1.3994283577238384, + "learning_rate": 1.2049767524580493e-05, + "loss": 0.8205, + "step": 4418 + }, + { + "epoch": 0.45137895812053114, + "grad_norm": 1.556199031453398, + "learning_rate": 1.2046529326197123e-05, + "loss": 0.7343, + "step": 4419 + }, + { + "epoch": 0.4514811031664964, + "grad_norm": 1.350704694924575, + "learning_rate": 1.2043290903819796e-05, + "loss": 0.7549, + "step": 4420 + }, + { + "epoch": 0.4515832482124617, + "grad_norm": 1.525958803218305, + "learning_rate": 1.2040052257802959e-05, + "loss": 0.7794, + "step": 4421 + }, + { + "epoch": 0.451685393258427, + "grad_norm": 1.470411451723171, + "learning_rate": 1.2036813388501086e-05, + "loss": 0.6812, + "step": 4422 + }, + { + "epoch": 0.4517875383043922, + "grad_norm": 1.416199166670464, + "learning_rate": 1.2033574296268669e-05, + "loss": 0.6105, + "step": 4423 + }, + { + "epoch": 0.4518896833503575, + "grad_norm": 1.540217586109598, + "learning_rate": 1.2030334981460232e-05, + "loss": 0.865, + "step": 4424 + }, + { + "epoch": 0.4519918283963228, + "grad_norm": 1.3906921001109036, + "learning_rate": 1.2027095444430322e-05, + "loss": 0.69, + "step": 4425 + }, + { + "epoch": 0.452093973442288, + "grad_norm": 1.3807130721367225, + "learning_rate": 1.20238556855335e-05, + "loss": 0.7883, + "step": 4426 + }, + { + "epoch": 0.4521961184882533, + "grad_norm": 1.5510032943363248, + "learning_rate": 1.202061570512437e-05, + "loss": 0.6689, + "step": 4427 + }, + { + "epoch": 0.4522982635342186, + "grad_norm": 1.363118110938814, + "learning_rate": 1.2017375503557544e-05, + "loss": 0.7374, + "step": 4428 + }, + { + "epoch": 0.4524004085801839, + "grad_norm": 1.4357667445835383, + "learning_rate": 1.201413508118767e-05, + "loss": 0.733, + "step": 4429 + }, + { + "epoch": 0.4525025536261491, + "grad_norm": 1.3108820213726866, + "learning_rate": 1.2010894438369405e-05, + "loss": 0.765, + "step": 4430 + }, + { + "epoch": 0.4526046986721144, + "grad_norm": 1.535544868946225, + "learning_rate": 1.2007653575457445e-05, + "loss": 0.741, + "step": 4431 + }, + { + "epoch": 0.4527068437180797, + "grad_norm": 1.5286731558569973, + "learning_rate": 1.2004412492806507e-05, + "loss": 0.8003, + "step": 4432 + }, + { + "epoch": 0.45280898876404496, + "grad_norm": 1.4730489224638963, + "learning_rate": 1.2001171190771325e-05, + "loss": 0.6618, + "step": 4433 + }, + { + "epoch": 0.4529111338100102, + "grad_norm": 1.453915623716457, + "learning_rate": 1.1997929669706664e-05, + "loss": 0.71, + "step": 4434 + }, + { + "epoch": 0.4530132788559755, + "grad_norm": 1.7679577552373162, + "learning_rate": 1.199468792996731e-05, + "loss": 0.8842, + "step": 4435 + }, + { + "epoch": 0.45311542390194076, + "grad_norm": 1.3727079808498286, + "learning_rate": 1.1991445971908072e-05, + "loss": 0.7056, + "step": 4436 + }, + { + "epoch": 0.45321756894790605, + "grad_norm": 1.4074743546290096, + "learning_rate": 1.1988203795883787e-05, + "loss": 0.5967, + "step": 4437 + }, + { + "epoch": 0.4533197139938713, + "grad_norm": 1.5073829520534248, + "learning_rate": 1.198496140224931e-05, + "loss": 0.6979, + "step": 4438 + }, + { + "epoch": 0.45342185903983656, + "grad_norm": 1.429241194942722, + "learning_rate": 1.1981718791359527e-05, + "loss": 0.7593, + "step": 4439 + }, + { + "epoch": 0.45352400408580185, + "grad_norm": 1.4546143175747257, + "learning_rate": 1.1978475963569343e-05, + "loss": 0.82, + "step": 4440 + }, + { + "epoch": 0.45362614913176713, + "grad_norm": 1.7785285644971274, + "learning_rate": 1.1975232919233684e-05, + "loss": 0.6882, + "step": 4441 + }, + { + "epoch": 0.45372829417773236, + "grad_norm": 1.4970466542597403, + "learning_rate": 1.1971989658707507e-05, + "loss": 0.7195, + "step": 4442 + }, + { + "epoch": 0.45383043922369765, + "grad_norm": 1.4054963498339679, + "learning_rate": 1.1968746182345785e-05, + "loss": 0.7519, + "step": 4443 + }, + { + "epoch": 0.45393258426966293, + "grad_norm": 1.5892193400696613, + "learning_rate": 1.1965502490503525e-05, + "loss": 0.7515, + "step": 4444 + }, + { + "epoch": 0.4540347293156282, + "grad_norm": 1.4682562435517577, + "learning_rate": 1.1962258583535747e-05, + "loss": 0.6774, + "step": 4445 + }, + { + "epoch": 0.45413687436159345, + "grad_norm": 1.5021529017127018, + "learning_rate": 1.1959014461797498e-05, + "loss": 0.692, + "step": 4446 + }, + { + "epoch": 0.45423901940755873, + "grad_norm": 1.3661682369665527, + "learning_rate": 1.195577012564385e-05, + "loss": 0.763, + "step": 4447 + }, + { + "epoch": 0.454341164453524, + "grad_norm": 1.4152040630401885, + "learning_rate": 1.19525255754299e-05, + "loss": 0.7232, + "step": 4448 + }, + { + "epoch": 0.4544433094994893, + "grad_norm": 1.7021936132141924, + "learning_rate": 1.1949280811510763e-05, + "loss": 0.75, + "step": 4449 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 1.3976685473117325, + "learning_rate": 1.1946035834241582e-05, + "loss": 0.7018, + "step": 4450 + }, + { + "epoch": 0.4546475995914198, + "grad_norm": 1.3638514612073973, + "learning_rate": 1.1942790643977523e-05, + "loss": 0.7236, + "step": 4451 + }, + { + "epoch": 0.4547497446373851, + "grad_norm": 1.5083968539835473, + "learning_rate": 1.1939545241073774e-05, + "loss": 0.793, + "step": 4452 + }, + { + "epoch": 0.45485188968335033, + "grad_norm": 1.3083226578590414, + "learning_rate": 1.1936299625885542e-05, + "loss": 0.6711, + "step": 4453 + }, + { + "epoch": 0.4549540347293156, + "grad_norm": 1.4819100042807536, + "learning_rate": 1.1933053798768065e-05, + "loss": 0.674, + "step": 4454 + }, + { + "epoch": 0.4550561797752809, + "grad_norm": 1.494768994864976, + "learning_rate": 1.1929807760076599e-05, + "loss": 0.7685, + "step": 4455 + }, + { + "epoch": 0.4551583248212462, + "grad_norm": 1.5428293099845931, + "learning_rate": 1.1926561510166432e-05, + "loss": 0.683, + "step": 4456 + }, + { + "epoch": 0.4552604698672114, + "grad_norm": 1.5218773399898289, + "learning_rate": 1.1923315049392859e-05, + "loss": 0.8571, + "step": 4457 + }, + { + "epoch": 0.4553626149131767, + "grad_norm": 1.3456395891463317, + "learning_rate": 1.1920068378111214e-05, + "loss": 0.6701, + "step": 4458 + }, + { + "epoch": 0.455464759959142, + "grad_norm": 1.3364270175490052, + "learning_rate": 1.1916821496676842e-05, + "loss": 0.6399, + "step": 4459 + }, + { + "epoch": 0.45556690500510727, + "grad_norm": 1.3936767358891256, + "learning_rate": 1.191357440544512e-05, + "loss": 0.7822, + "step": 4460 + }, + { + "epoch": 0.4556690500510725, + "grad_norm": 1.4175869051409595, + "learning_rate": 1.1910327104771444e-05, + "loss": 0.7581, + "step": 4461 + }, + { + "epoch": 0.4557711950970378, + "grad_norm": 1.5152997363576073, + "learning_rate": 1.190707959501123e-05, + "loss": 0.6552, + "step": 4462 + }, + { + "epoch": 0.45587334014300307, + "grad_norm": 1.4616797198097955, + "learning_rate": 1.1903831876519925e-05, + "loss": 0.7458, + "step": 4463 + }, + { + "epoch": 0.45597548518896835, + "grad_norm": 1.4448445647926547, + "learning_rate": 1.190058394965299e-05, + "loss": 0.6674, + "step": 4464 + }, + { + "epoch": 0.4560776302349336, + "grad_norm": 1.4918605862769925, + "learning_rate": 1.1897335814765913e-05, + "loss": 0.7428, + "step": 4465 + }, + { + "epoch": 0.45617977528089887, + "grad_norm": 1.408609158388085, + "learning_rate": 1.1894087472214207e-05, + "loss": 0.7152, + "step": 4466 + }, + { + "epoch": 0.45628192032686415, + "grad_norm": 1.4557950591250353, + "learning_rate": 1.1890838922353401e-05, + "loss": 0.6463, + "step": 4467 + }, + { + "epoch": 0.45638406537282944, + "grad_norm": 1.3917652304791865, + "learning_rate": 1.1887590165539053e-05, + "loss": 0.6889, + "step": 4468 + }, + { + "epoch": 0.45648621041879467, + "grad_norm": 1.298569340387807, + "learning_rate": 1.1884341202126745e-05, + "loss": 0.704, + "step": 4469 + }, + { + "epoch": 0.45658835546475995, + "grad_norm": 1.3493372771975554, + "learning_rate": 1.1881092032472072e-05, + "loss": 0.5985, + "step": 4470 + }, + { + "epoch": 0.45669050051072524, + "grad_norm": 1.4769110478431835, + "learning_rate": 1.1877842656930661e-05, + "loss": 0.7733, + "step": 4471 + }, + { + "epoch": 0.4567926455566905, + "grad_norm": 1.3819348357981414, + "learning_rate": 1.1874593075858159e-05, + "loss": 0.6765, + "step": 4472 + }, + { + "epoch": 0.45689479060265575, + "grad_norm": 1.467529481900932, + "learning_rate": 1.1871343289610233e-05, + "loss": 0.7215, + "step": 4473 + }, + { + "epoch": 0.45699693564862104, + "grad_norm": 1.3055325672947433, + "learning_rate": 1.1868093298542576e-05, + "loss": 0.7102, + "step": 4474 + }, + { + "epoch": 0.4570990806945863, + "grad_norm": 1.5932109654979698, + "learning_rate": 1.1864843103010898e-05, + "loss": 0.7304, + "step": 4475 + }, + { + "epoch": 0.4572012257405516, + "grad_norm": 1.5430787915665027, + "learning_rate": 1.1861592703370942e-05, + "loss": 0.7336, + "step": 4476 + }, + { + "epoch": 0.45730337078651684, + "grad_norm": 1.5141796117121165, + "learning_rate": 1.1858342099978458e-05, + "loss": 0.7576, + "step": 4477 + }, + { + "epoch": 0.4574055158324821, + "grad_norm": 1.4057089567082752, + "learning_rate": 1.1855091293189233e-05, + "loss": 0.7269, + "step": 4478 + }, + { + "epoch": 0.4575076608784474, + "grad_norm": 1.4773961845811605, + "learning_rate": 1.1851840283359067e-05, + "loss": 0.7183, + "step": 4479 + }, + { + "epoch": 0.4576098059244127, + "grad_norm": 1.465560342917277, + "learning_rate": 1.1848589070843783e-05, + "loss": 0.6968, + "step": 4480 + }, + { + "epoch": 0.4577119509703779, + "grad_norm": 1.3990218642663586, + "learning_rate": 1.1845337655999234e-05, + "loss": 0.773, + "step": 4481 + }, + { + "epoch": 0.4578140960163432, + "grad_norm": 1.4049164043782585, + "learning_rate": 1.1842086039181284e-05, + "loss": 0.732, + "step": 4482 + }, + { + "epoch": 0.4579162410623085, + "grad_norm": 1.5324372564910247, + "learning_rate": 1.1838834220745828e-05, + "loss": 0.8042, + "step": 4483 + }, + { + "epoch": 0.4580183861082737, + "grad_norm": 1.4241554532999965, + "learning_rate": 1.1835582201048777e-05, + "loss": 0.7165, + "step": 4484 + }, + { + "epoch": 0.458120531154239, + "grad_norm": 1.4764995078289065, + "learning_rate": 1.183232998044607e-05, + "loss": 0.6513, + "step": 4485 + }, + { + "epoch": 0.4582226762002043, + "grad_norm": 1.3793746714274664, + "learning_rate": 1.1829077559293665e-05, + "loss": 0.7562, + "step": 4486 + }, + { + "epoch": 0.4583248212461696, + "grad_norm": 1.4055444718881402, + "learning_rate": 1.1825824937947531e-05, + "loss": 0.6216, + "step": 4487 + }, + { + "epoch": 0.4584269662921348, + "grad_norm": 1.5710093242195102, + "learning_rate": 1.1822572116763686e-05, + "loss": 0.7807, + "step": 4488 + }, + { + "epoch": 0.4585291113381001, + "grad_norm": 1.5918105120579595, + "learning_rate": 1.1819319096098143e-05, + "loss": 0.7835, + "step": 4489 + }, + { + "epoch": 0.4586312563840654, + "grad_norm": 1.4985010192757853, + "learning_rate": 1.1816065876306951e-05, + "loss": 0.8001, + "step": 4490 + }, + { + "epoch": 0.45873340143003066, + "grad_norm": 1.4315793498288834, + "learning_rate": 1.1812812457746172e-05, + "loss": 0.7355, + "step": 4491 + }, + { + "epoch": 0.4588355464759959, + "grad_norm": 1.3918137340073968, + "learning_rate": 1.18095588407719e-05, + "loss": 0.745, + "step": 4492 + }, + { + "epoch": 0.4589376915219612, + "grad_norm": 1.4100405651430146, + "learning_rate": 1.1806305025740245e-05, + "loss": 0.7515, + "step": 4493 + }, + { + "epoch": 0.45903983656792646, + "grad_norm": 1.584976641672956, + "learning_rate": 1.1803051013007336e-05, + "loss": 0.8127, + "step": 4494 + }, + { + "epoch": 0.45914198161389175, + "grad_norm": 1.344524049078553, + "learning_rate": 1.1799796802929328e-05, + "loss": 0.5779, + "step": 4495 + }, + { + "epoch": 0.459244126659857, + "grad_norm": 1.5544231512859255, + "learning_rate": 1.1796542395862401e-05, + "loss": 0.7729, + "step": 4496 + }, + { + "epoch": 0.45934627170582226, + "grad_norm": 1.44060791549229, + "learning_rate": 1.1793287792162746e-05, + "loss": 0.7558, + "step": 4497 + }, + { + "epoch": 0.45944841675178755, + "grad_norm": 1.2906737439107772, + "learning_rate": 1.1790032992186584e-05, + "loss": 0.5977, + "step": 4498 + }, + { + "epoch": 0.45955056179775283, + "grad_norm": 1.4899824954260785, + "learning_rate": 1.178677799629015e-05, + "loss": 0.7043, + "step": 4499 + }, + { + "epoch": 0.45965270684371806, + "grad_norm": 1.5838750320172272, + "learning_rate": 1.1783522804829714e-05, + "loss": 0.7291, + "step": 4500 + }, + { + "epoch": 0.45975485188968335, + "grad_norm": 1.421561941919404, + "learning_rate": 1.1780267418161554e-05, + "loss": 0.7337, + "step": 4501 + }, + { + "epoch": 0.45985699693564863, + "grad_norm": 1.5156148111440408, + "learning_rate": 1.1777011836641978e-05, + "loss": 0.6222, + "step": 4502 + }, + { + "epoch": 0.4599591419816139, + "grad_norm": 1.5774293563327493, + "learning_rate": 1.1773756060627303e-05, + "loss": 0.8313, + "step": 4503 + }, + { + "epoch": 0.46006128702757915, + "grad_norm": 1.3139766439388563, + "learning_rate": 1.1770500090473885e-05, + "loss": 0.6694, + "step": 4504 + }, + { + "epoch": 0.46016343207354443, + "grad_norm": 1.4742926552052202, + "learning_rate": 1.1767243926538088e-05, + "loss": 0.7518, + "step": 4505 + }, + { + "epoch": 0.4602655771195097, + "grad_norm": 1.5176206140090431, + "learning_rate": 1.17639875691763e-05, + "loss": 0.681, + "step": 4506 + }, + { + "epoch": 0.460367722165475, + "grad_norm": 1.422717750612694, + "learning_rate": 1.1760731018744933e-05, + "loss": 0.6845, + "step": 4507 + }, + { + "epoch": 0.46046986721144023, + "grad_norm": 1.3533135422358182, + "learning_rate": 1.175747427560042e-05, + "loss": 0.7197, + "step": 4508 + }, + { + "epoch": 0.4605720122574055, + "grad_norm": 1.415120636265506, + "learning_rate": 1.175421734009921e-05, + "loss": 0.67, + "step": 4509 + }, + { + "epoch": 0.4606741573033708, + "grad_norm": 1.3429640427349085, + "learning_rate": 1.1750960212597779e-05, + "loss": 0.8117, + "step": 4510 + }, + { + "epoch": 0.46077630234933603, + "grad_norm": 1.52927151973224, + "learning_rate": 1.1747702893452621e-05, + "loss": 0.7561, + "step": 4511 + }, + { + "epoch": 0.4608784473953013, + "grad_norm": 1.4949545652597274, + "learning_rate": 1.1744445383020254e-05, + "loss": 0.6854, + "step": 4512 + }, + { + "epoch": 0.4609805924412666, + "grad_norm": 1.3517988707324138, + "learning_rate": 1.1741187681657213e-05, + "loss": 0.8343, + "step": 4513 + }, + { + "epoch": 0.4610827374872319, + "grad_norm": 1.5538074149780177, + "learning_rate": 1.1737929789720055e-05, + "loss": 0.8063, + "step": 4514 + }, + { + "epoch": 0.4611848825331971, + "grad_norm": 1.4282948544756653, + "learning_rate": 1.1734671707565358e-05, + "loss": 0.8012, + "step": 4515 + }, + { + "epoch": 0.4612870275791624, + "grad_norm": 1.452901576690905, + "learning_rate": 1.1731413435549718e-05, + "loss": 0.7044, + "step": 4516 + }, + { + "epoch": 0.4613891726251277, + "grad_norm": 1.5156278544007429, + "learning_rate": 1.1728154974029766e-05, + "loss": 0.7288, + "step": 4517 + }, + { + "epoch": 0.46149131767109297, + "grad_norm": 1.394450522609207, + "learning_rate": 1.172489632336213e-05, + "loss": 0.7078, + "step": 4518 + }, + { + "epoch": 0.4615934627170582, + "grad_norm": 1.391370479248982, + "learning_rate": 1.1721637483903478e-05, + "loss": 0.7446, + "step": 4519 + }, + { + "epoch": 0.4616956077630235, + "grad_norm": 1.4870436954173325, + "learning_rate": 1.1718378456010495e-05, + "loss": 0.6561, + "step": 4520 + }, + { + "epoch": 0.46179775280898877, + "grad_norm": 1.3112354748647896, + "learning_rate": 1.1715119240039877e-05, + "loss": 0.6363, + "step": 4521 + }, + { + "epoch": 0.46189989785495406, + "grad_norm": 1.4086864847352285, + "learning_rate": 1.171185983634835e-05, + "loss": 0.6854, + "step": 4522 + }, + { + "epoch": 0.4620020429009193, + "grad_norm": 1.284031270230089, + "learning_rate": 1.1708600245292656e-05, + "loss": 0.6077, + "step": 4523 + }, + { + "epoch": 0.46210418794688457, + "grad_norm": 1.637898424139519, + "learning_rate": 1.1705340467229564e-05, + "loss": 0.7875, + "step": 4524 + }, + { + "epoch": 0.46220633299284986, + "grad_norm": 1.5078505965161635, + "learning_rate": 1.1702080502515855e-05, + "loss": 0.6618, + "step": 4525 + }, + { + "epoch": 0.46230847803881514, + "grad_norm": 1.3516476396439767, + "learning_rate": 1.1698820351508336e-05, + "loss": 0.6934, + "step": 4526 + }, + { + "epoch": 0.46241062308478037, + "grad_norm": 1.4097216494224978, + "learning_rate": 1.1695560014563831e-05, + "loss": 0.716, + "step": 4527 + }, + { + "epoch": 0.46251276813074566, + "grad_norm": 1.541281499356308, + "learning_rate": 1.1692299492039188e-05, + "loss": 0.8164, + "step": 4528 + }, + { + "epoch": 0.46261491317671094, + "grad_norm": 1.5537444997547776, + "learning_rate": 1.168903878429127e-05, + "loss": 0.7481, + "step": 4529 + }, + { + "epoch": 0.4627170582226762, + "grad_norm": 1.4152725726478879, + "learning_rate": 1.168577789167697e-05, + "loss": 0.7678, + "step": 4530 + }, + { + "epoch": 0.46281920326864145, + "grad_norm": 1.3947114030805299, + "learning_rate": 1.1682516814553187e-05, + "loss": 0.7577, + "step": 4531 + }, + { + "epoch": 0.46292134831460674, + "grad_norm": 1.4001366349031497, + "learning_rate": 1.1679255553276853e-05, + "loss": 0.7626, + "step": 4532 + }, + { + "epoch": 0.463023493360572, + "grad_norm": 1.8663926057054372, + "learning_rate": 1.1675994108204913e-05, + "loss": 0.7736, + "step": 4533 + }, + { + "epoch": 0.4631256384065373, + "grad_norm": 1.5748421002904427, + "learning_rate": 1.1672732479694338e-05, + "loss": 0.8666, + "step": 4534 + }, + { + "epoch": 0.46322778345250254, + "grad_norm": 1.602035825189538, + "learning_rate": 1.1669470668102108e-05, + "loss": 0.6936, + "step": 4535 + }, + { + "epoch": 0.4633299284984678, + "grad_norm": 1.3340629493414446, + "learning_rate": 1.1666208673785235e-05, + "loss": 0.6729, + "step": 4536 + }, + { + "epoch": 0.4634320735444331, + "grad_norm": 1.5561068922055945, + "learning_rate": 1.1662946497100749e-05, + "loss": 0.6787, + "step": 4537 + }, + { + "epoch": 0.46353421859039834, + "grad_norm": 1.5118687406868672, + "learning_rate": 1.1659684138405694e-05, + "loss": 0.7221, + "step": 4538 + }, + { + "epoch": 0.4636363636363636, + "grad_norm": 1.4519085162765155, + "learning_rate": 1.1656421598057135e-05, + "loss": 0.7615, + "step": 4539 + }, + { + "epoch": 0.4637385086823289, + "grad_norm": 1.3996703743327186, + "learning_rate": 1.1653158876412167e-05, + "loss": 0.772, + "step": 4540 + }, + { + "epoch": 0.4638406537282942, + "grad_norm": 1.424905734581487, + "learning_rate": 1.1649895973827887e-05, + "loss": 0.7325, + "step": 4541 + }, + { + "epoch": 0.4639427987742594, + "grad_norm": 1.3153285306686104, + "learning_rate": 1.1646632890661431e-05, + "loss": 0.6948, + "step": 4542 + }, + { + "epoch": 0.4640449438202247, + "grad_norm": 1.4792997454565426, + "learning_rate": 1.1643369627269934e-05, + "loss": 0.8275, + "step": 4543 + }, + { + "epoch": 0.46414708886619, + "grad_norm": 1.4014362336860784, + "learning_rate": 1.1640106184010578e-05, + "loss": 0.7681, + "step": 4544 + }, + { + "epoch": 0.4642492339121553, + "grad_norm": 1.3017855283276736, + "learning_rate": 1.1636842561240536e-05, + "loss": 0.7001, + "step": 4545 + }, + { + "epoch": 0.4643513789581205, + "grad_norm": 1.3453803307719, + "learning_rate": 1.1633578759317019e-05, + "loss": 0.6569, + "step": 4546 + }, + { + "epoch": 0.4644535240040858, + "grad_norm": 1.3712654405117655, + "learning_rate": 1.1630314778597252e-05, + "loss": 0.6077, + "step": 4547 + }, + { + "epoch": 0.4645556690500511, + "grad_norm": 1.5268651178149601, + "learning_rate": 1.1627050619438476e-05, + "loss": 0.7634, + "step": 4548 + }, + { + "epoch": 0.46465781409601636, + "grad_norm": 1.590761155513556, + "learning_rate": 1.1623786282197961e-05, + "loss": 0.7091, + "step": 4549 + }, + { + "epoch": 0.4647599591419816, + "grad_norm": 1.3402695786937597, + "learning_rate": 1.1620521767232988e-05, + "loss": 0.7149, + "step": 4550 + }, + { + "epoch": 0.4648621041879469, + "grad_norm": 1.4925413074760292, + "learning_rate": 1.161725707490086e-05, + "loss": 0.768, + "step": 4551 + }, + { + "epoch": 0.46496424923391216, + "grad_norm": 1.3656067482389571, + "learning_rate": 1.1613992205558903e-05, + "loss": 0.6383, + "step": 4552 + }, + { + "epoch": 0.46506639427987745, + "grad_norm": 1.4362685564863855, + "learning_rate": 1.1610727159564454e-05, + "loss": 0.7115, + "step": 4553 + }, + { + "epoch": 0.4651685393258427, + "grad_norm": 1.5185036120952167, + "learning_rate": 1.160746193727488e-05, + "loss": 0.7549, + "step": 4554 + }, + { + "epoch": 0.46527068437180796, + "grad_norm": 1.4029603369476822, + "learning_rate": 1.1604196539047552e-05, + "loss": 0.7091, + "step": 4555 + }, + { + "epoch": 0.46537282941777325, + "grad_norm": 1.537030478497276, + "learning_rate": 1.1600930965239883e-05, + "loss": 0.7574, + "step": 4556 + }, + { + "epoch": 0.46547497446373853, + "grad_norm": 1.5758924440498452, + "learning_rate": 1.1597665216209281e-05, + "loss": 0.7917, + "step": 4557 + }, + { + "epoch": 0.46557711950970376, + "grad_norm": 1.5862278785151886, + "learning_rate": 1.1594399292313192e-05, + "loss": 0.8666, + "step": 4558 + }, + { + "epoch": 0.46567926455566905, + "grad_norm": 1.3865680057483374, + "learning_rate": 1.1591133193909067e-05, + "loss": 0.8053, + "step": 4559 + }, + { + "epoch": 0.46578140960163433, + "grad_norm": 1.382658973465373, + "learning_rate": 1.1587866921354388e-05, + "loss": 0.7276, + "step": 4560 + }, + { + "epoch": 0.4658835546475996, + "grad_norm": 1.3851150416908025, + "learning_rate": 1.1584600475006649e-05, + "loss": 0.6589, + "step": 4561 + }, + { + "epoch": 0.46598569969356485, + "grad_norm": 1.3623752690347581, + "learning_rate": 1.1581333855223362e-05, + "loss": 0.592, + "step": 4562 + }, + { + "epoch": 0.46608784473953013, + "grad_norm": 1.3031010323044558, + "learning_rate": 1.1578067062362064e-05, + "loss": 0.6313, + "step": 4563 + }, + { + "epoch": 0.4661899897854954, + "grad_norm": 1.5821242558541917, + "learning_rate": 1.1574800096780307e-05, + "loss": 0.7556, + "step": 4564 + }, + { + "epoch": 0.46629213483146065, + "grad_norm": 1.3076552958323757, + "learning_rate": 1.1571532958835664e-05, + "loss": 0.7504, + "step": 4565 + }, + { + "epoch": 0.46639427987742593, + "grad_norm": 1.495623075220918, + "learning_rate": 1.1568265648885722e-05, + "loss": 0.6968, + "step": 4566 + }, + { + "epoch": 0.4664964249233912, + "grad_norm": 1.4119164980792056, + "learning_rate": 1.1564998167288089e-05, + "loss": 0.7125, + "step": 4567 + }, + { + "epoch": 0.4665985699693565, + "grad_norm": 1.4671391997642753, + "learning_rate": 1.1561730514400395e-05, + "loss": 0.8384, + "step": 4568 + }, + { + "epoch": 0.46670071501532173, + "grad_norm": 1.422650020680028, + "learning_rate": 1.1558462690580292e-05, + "loss": 0.7884, + "step": 4569 + }, + { + "epoch": 0.466802860061287, + "grad_norm": 1.4682432490143777, + "learning_rate": 1.1555194696185437e-05, + "loss": 0.744, + "step": 4570 + }, + { + "epoch": 0.4669050051072523, + "grad_norm": 1.34101269903494, + "learning_rate": 1.1551926531573517e-05, + "loss": 0.6566, + "step": 4571 + }, + { + "epoch": 0.4670071501532176, + "grad_norm": 1.377863205523952, + "learning_rate": 1.1548658197102236e-05, + "loss": 0.7059, + "step": 4572 + }, + { + "epoch": 0.4671092951991828, + "grad_norm": 1.4150997833577, + "learning_rate": 1.1545389693129318e-05, + "loss": 0.8529, + "step": 4573 + }, + { + "epoch": 0.4672114402451481, + "grad_norm": 1.5751511378322125, + "learning_rate": 1.1542121020012498e-05, + "loss": 0.743, + "step": 4574 + }, + { + "epoch": 0.4673135852911134, + "grad_norm": 1.4587180518501928, + "learning_rate": 1.1538852178109532e-05, + "loss": 0.6624, + "step": 4575 + }, + { + "epoch": 0.46741573033707867, + "grad_norm": 1.513114376594062, + "learning_rate": 1.1535583167778206e-05, + "loss": 0.6995, + "step": 4576 + }, + { + "epoch": 0.4675178753830439, + "grad_norm": 1.3959257434612828, + "learning_rate": 1.1532313989376309e-05, + "loss": 0.6748, + "step": 4577 + }, + { + "epoch": 0.4676200204290092, + "grad_norm": 1.4624441010712579, + "learning_rate": 1.1529044643261655e-05, + "loss": 0.7235, + "step": 4578 + }, + { + "epoch": 0.46772216547497447, + "grad_norm": 1.479573579565105, + "learning_rate": 1.1525775129792079e-05, + "loss": 0.6625, + "step": 4579 + }, + { + "epoch": 0.46782431052093976, + "grad_norm": 1.468245640853636, + "learning_rate": 1.1522505449325426e-05, + "loss": 0.6372, + "step": 4580 + }, + { + "epoch": 0.467926455566905, + "grad_norm": 1.4809584094673238, + "learning_rate": 1.1519235602219569e-05, + "loss": 0.6514, + "step": 4581 + }, + { + "epoch": 0.46802860061287027, + "grad_norm": 1.3851591922369217, + "learning_rate": 1.1515965588832394e-05, + "loss": 0.8244, + "step": 4582 + }, + { + "epoch": 0.46813074565883556, + "grad_norm": 1.449227194672402, + "learning_rate": 1.1512695409521806e-05, + "loss": 0.7521, + "step": 4583 + }, + { + "epoch": 0.46823289070480084, + "grad_norm": 1.335402374116199, + "learning_rate": 1.150942506464573e-05, + "loss": 0.6609, + "step": 4584 + }, + { + "epoch": 0.46833503575076607, + "grad_norm": 1.4657570501899861, + "learning_rate": 1.15061545545621e-05, + "loss": 0.6737, + "step": 4585 + }, + { + "epoch": 0.46843718079673136, + "grad_norm": 1.3749899429581807, + "learning_rate": 1.1502883879628887e-05, + "loss": 0.6967, + "step": 4586 + }, + { + "epoch": 0.46853932584269664, + "grad_norm": 1.496094852758198, + "learning_rate": 1.1499613040204058e-05, + "loss": 0.713, + "step": 4587 + }, + { + "epoch": 0.4686414708886619, + "grad_norm": 1.4028827891639952, + "learning_rate": 1.1496342036645615e-05, + "loss": 0.6899, + "step": 4588 + }, + { + "epoch": 0.46874361593462716, + "grad_norm": 1.6237822421921642, + "learning_rate": 1.1493070869311569e-05, + "loss": 0.7004, + "step": 4589 + }, + { + "epoch": 0.46884576098059244, + "grad_norm": 1.5857557925370613, + "learning_rate": 1.1489799538559953e-05, + "loss": 0.705, + "step": 4590 + }, + { + "epoch": 0.4689479060265577, + "grad_norm": 1.3767669386995813, + "learning_rate": 1.1486528044748814e-05, + "loss": 0.7151, + "step": 4591 + }, + { + "epoch": 0.46905005107252296, + "grad_norm": 1.414581024929639, + "learning_rate": 1.1483256388236218e-05, + "loss": 0.7034, + "step": 4592 + }, + { + "epoch": 0.46915219611848824, + "grad_norm": 1.5383110942917957, + "learning_rate": 1.1479984569380256e-05, + "loss": 0.782, + "step": 4593 + }, + { + "epoch": 0.4692543411644535, + "grad_norm": 1.3752950914580033, + "learning_rate": 1.1476712588539023e-05, + "loss": 0.7254, + "step": 4594 + }, + { + "epoch": 0.4693564862104188, + "grad_norm": 1.4533411182031981, + "learning_rate": 1.1473440446070646e-05, + "loss": 0.6597, + "step": 4595 + }, + { + "epoch": 0.46945863125638404, + "grad_norm": 1.4501212256952631, + "learning_rate": 1.147016814233326e-05, + "loss": 0.6747, + "step": 4596 + }, + { + "epoch": 0.4695607763023493, + "grad_norm": 1.3847479746946372, + "learning_rate": 1.146689567768502e-05, + "loss": 0.7018, + "step": 4597 + }, + { + "epoch": 0.4696629213483146, + "grad_norm": 1.4891741056924936, + "learning_rate": 1.14636230524841e-05, + "loss": 0.8504, + "step": 4598 + }, + { + "epoch": 0.4697650663942799, + "grad_norm": 1.500441230228755, + "learning_rate": 1.1460350267088688e-05, + "loss": 0.7318, + "step": 4599 + }, + { + "epoch": 0.4698672114402451, + "grad_norm": 1.3831274770140913, + "learning_rate": 1.1457077321857002e-05, + "loss": 0.6675, + "step": 4600 + }, + { + "epoch": 0.4699693564862104, + "grad_norm": 1.4479169600078448, + "learning_rate": 1.145380421714726e-05, + "loss": 0.7732, + "step": 4601 + }, + { + "epoch": 0.4700715015321757, + "grad_norm": 1.3560605740731997, + "learning_rate": 1.1450530953317705e-05, + "loss": 0.7227, + "step": 4602 + }, + { + "epoch": 0.470173646578141, + "grad_norm": 1.5640022945170355, + "learning_rate": 1.1447257530726601e-05, + "loss": 0.7384, + "step": 4603 + }, + { + "epoch": 0.4702757916241062, + "grad_norm": 1.4872837492721065, + "learning_rate": 1.1443983949732225e-05, + "loss": 0.677, + "step": 4604 + }, + { + "epoch": 0.4703779366700715, + "grad_norm": 1.3748553106857189, + "learning_rate": 1.1440710210692874e-05, + "loss": 0.6987, + "step": 4605 + }, + { + "epoch": 0.4704800817160368, + "grad_norm": 1.4568476927107372, + "learning_rate": 1.1437436313966857e-05, + "loss": 0.6881, + "step": 4606 + }, + { + "epoch": 0.47058222676200206, + "grad_norm": 1.5634136743519704, + "learning_rate": 1.143416225991251e-05, + "loss": 0.6742, + "step": 4607 + }, + { + "epoch": 0.4706843718079673, + "grad_norm": 1.4234673549374743, + "learning_rate": 1.1430888048888175e-05, + "loss": 0.6917, + "step": 4608 + }, + { + "epoch": 0.4707865168539326, + "grad_norm": 1.3922306134715365, + "learning_rate": 1.1427613681252219e-05, + "loss": 0.6853, + "step": 4609 + }, + { + "epoch": 0.47088866189989786, + "grad_norm": 1.331213317427588, + "learning_rate": 1.1424339157363024e-05, + "loss": 0.6816, + "step": 4610 + }, + { + "epoch": 0.47099080694586315, + "grad_norm": 1.458589449098392, + "learning_rate": 1.1421064477578986e-05, + "loss": 0.681, + "step": 4611 + }, + { + "epoch": 0.4710929519918284, + "grad_norm": 1.4900377014875879, + "learning_rate": 1.1417789642258523e-05, + "loss": 0.7096, + "step": 4612 + }, + { + "epoch": 0.47119509703779366, + "grad_norm": 1.380239925393662, + "learning_rate": 1.1414514651760071e-05, + "loss": 0.6124, + "step": 4613 + }, + { + "epoch": 0.47129724208375895, + "grad_norm": 1.496306443905916, + "learning_rate": 1.1411239506442073e-05, + "loss": 0.7454, + "step": 4614 + }, + { + "epoch": 0.47139938712972423, + "grad_norm": 1.6019461587359385, + "learning_rate": 1.1407964206663e-05, + "loss": 0.6258, + "step": 4615 + }, + { + "epoch": 0.47150153217568946, + "grad_norm": 1.498099717141024, + "learning_rate": 1.1404688752781335e-05, + "loss": 0.7338, + "step": 4616 + }, + { + "epoch": 0.47160367722165475, + "grad_norm": 1.3563410442073103, + "learning_rate": 1.140141314515558e-05, + "loss": 0.6424, + "step": 4617 + }, + { + "epoch": 0.47170582226762003, + "grad_norm": 1.3693588691610963, + "learning_rate": 1.1398137384144253e-05, + "loss": 0.6434, + "step": 4618 + }, + { + "epoch": 0.47180796731358526, + "grad_norm": 1.4005606658076981, + "learning_rate": 1.1394861470105878e-05, + "loss": 0.6191, + "step": 4619 + }, + { + "epoch": 0.47191011235955055, + "grad_norm": 1.296358070109126, + "learning_rate": 1.139158540339902e-05, + "loss": 0.804, + "step": 4620 + }, + { + "epoch": 0.47201225740551583, + "grad_norm": 1.49758334155772, + "learning_rate": 1.1388309184382237e-05, + "loss": 0.6821, + "step": 4621 + }, + { + "epoch": 0.4721144024514811, + "grad_norm": 1.5757060269443182, + "learning_rate": 1.1385032813414121e-05, + "loss": 0.8261, + "step": 4622 + }, + { + "epoch": 0.47221654749744635, + "grad_norm": 1.6149679206737317, + "learning_rate": 1.1381756290853267e-05, + "loss": 0.7312, + "step": 4623 + }, + { + "epoch": 0.47231869254341163, + "grad_norm": 1.5887841295070801, + "learning_rate": 1.1378479617058293e-05, + "loss": 0.7529, + "step": 4624 + }, + { + "epoch": 0.4724208375893769, + "grad_norm": 1.428688693816069, + "learning_rate": 1.1375202792387836e-05, + "loss": 0.7197, + "step": 4625 + }, + { + "epoch": 0.4725229826353422, + "grad_norm": 1.412872508421381, + "learning_rate": 1.1371925817200544e-05, + "loss": 0.6769, + "step": 4626 + }, + { + "epoch": 0.47262512768130743, + "grad_norm": 1.382736490786676, + "learning_rate": 1.1368648691855084e-05, + "loss": 0.7287, + "step": 4627 + }, + { + "epoch": 0.4727272727272727, + "grad_norm": 1.4971349034599082, + "learning_rate": 1.1365371416710142e-05, + "loss": 0.7456, + "step": 4628 + }, + { + "epoch": 0.472829417773238, + "grad_norm": 1.4842805375229045, + "learning_rate": 1.1362093992124416e-05, + "loss": 0.714, + "step": 4629 + }, + { + "epoch": 0.4729315628192033, + "grad_norm": 1.4374293177193989, + "learning_rate": 1.1358816418456625e-05, + "loss": 0.6519, + "step": 4630 + }, + { + "epoch": 0.4730337078651685, + "grad_norm": 1.3576375338439068, + "learning_rate": 1.1355538696065491e-05, + "loss": 0.6827, + "step": 4631 + }, + { + "epoch": 0.4731358529111338, + "grad_norm": 1.3925611650479424, + "learning_rate": 1.135226082530978e-05, + "loss": 0.7307, + "step": 4632 + }, + { + "epoch": 0.4732379979570991, + "grad_norm": 1.4609066009779765, + "learning_rate": 1.1348982806548242e-05, + "loss": 0.7623, + "step": 4633 + }, + { + "epoch": 0.4733401430030644, + "grad_norm": 1.4176078667902363, + "learning_rate": 1.1345704640139668e-05, + "loss": 0.7382, + "step": 4634 + }, + { + "epoch": 0.4734422880490296, + "grad_norm": 1.4160934666837424, + "learning_rate": 1.134242632644285e-05, + "loss": 0.7484, + "step": 4635 + }, + { + "epoch": 0.4735444330949949, + "grad_norm": 1.4242787808837387, + "learning_rate": 1.1339147865816602e-05, + "loss": 0.7462, + "step": 4636 + }, + { + "epoch": 0.4736465781409602, + "grad_norm": 1.3488103241888663, + "learning_rate": 1.1335869258619758e-05, + "loss": 0.5837, + "step": 4637 + }, + { + "epoch": 0.47374872318692546, + "grad_norm": 1.478928467679866, + "learning_rate": 1.1332590505211158e-05, + "loss": 0.7891, + "step": 4638 + }, + { + "epoch": 0.4738508682328907, + "grad_norm": 1.466070668525409, + "learning_rate": 1.1329311605949666e-05, + "loss": 0.7601, + "step": 4639 + }, + { + "epoch": 0.473953013278856, + "grad_norm": 1.6708384094901065, + "learning_rate": 1.1326032561194163e-05, + "loss": 0.648, + "step": 4640 + }, + { + "epoch": 0.47405515832482126, + "grad_norm": 1.4545883677528972, + "learning_rate": 1.1322753371303536e-05, + "loss": 0.6773, + "step": 4641 + }, + { + "epoch": 0.47415730337078654, + "grad_norm": 1.4801451294868049, + "learning_rate": 1.1319474036636702e-05, + "loss": 0.6558, + "step": 4642 + }, + { + "epoch": 0.47425944841675177, + "grad_norm": 1.4562705333935624, + "learning_rate": 1.1316194557552575e-05, + "loss": 0.7484, + "step": 4643 + }, + { + "epoch": 0.47436159346271706, + "grad_norm": 1.4725518306596883, + "learning_rate": 1.1312914934410111e-05, + "loss": 0.7568, + "step": 4644 + }, + { + "epoch": 0.47446373850868234, + "grad_norm": 1.471184976628405, + "learning_rate": 1.1309635167568259e-05, + "loss": 0.725, + "step": 4645 + }, + { + "epoch": 0.4745658835546476, + "grad_norm": 1.3433768566020603, + "learning_rate": 1.1306355257385987e-05, + "loss": 0.6963, + "step": 4646 + }, + { + "epoch": 0.47466802860061286, + "grad_norm": 1.575069507568577, + "learning_rate": 1.1303075204222292e-05, + "loss": 0.8487, + "step": 4647 + }, + { + "epoch": 0.47477017364657814, + "grad_norm": 1.4967242299854873, + "learning_rate": 1.1299795008436168e-05, + "loss": 0.7634, + "step": 4648 + }, + { + "epoch": 0.4748723186925434, + "grad_norm": 1.3849950573502745, + "learning_rate": 1.1296514670386646e-05, + "loss": 0.6624, + "step": 4649 + }, + { + "epoch": 0.47497446373850866, + "grad_norm": 1.5094100383409006, + "learning_rate": 1.1293234190432753e-05, + "loss": 0.7114, + "step": 4650 + }, + { + "epoch": 0.47507660878447394, + "grad_norm": 1.3847510912887924, + "learning_rate": 1.1289953568933545e-05, + "loss": 0.7251, + "step": 4651 + }, + { + "epoch": 0.4751787538304392, + "grad_norm": 1.4199815336506274, + "learning_rate": 1.1286672806248082e-05, + "loss": 0.7147, + "step": 4652 + }, + { + "epoch": 0.4752808988764045, + "grad_norm": 1.5358473275215399, + "learning_rate": 1.1283391902735451e-05, + "loss": 0.7356, + "step": 4653 + }, + { + "epoch": 0.47538304392236974, + "grad_norm": 1.6028311372699997, + "learning_rate": 1.128011085875475e-05, + "loss": 0.7518, + "step": 4654 + }, + { + "epoch": 0.475485188968335, + "grad_norm": 1.4757244276681047, + "learning_rate": 1.1276829674665084e-05, + "loss": 0.7508, + "step": 4655 + }, + { + "epoch": 0.4755873340143003, + "grad_norm": 1.6398672407581447, + "learning_rate": 1.1273548350825584e-05, + "loss": 0.8111, + "step": 4656 + }, + { + "epoch": 0.4756894790602656, + "grad_norm": 1.3283907493780107, + "learning_rate": 1.12702668875954e-05, + "loss": 0.6821, + "step": 4657 + }, + { + "epoch": 0.4757916241062308, + "grad_norm": 1.4477970140519387, + "learning_rate": 1.126698528533368e-05, + "loss": 0.6846, + "step": 4658 + }, + { + "epoch": 0.4758937691521961, + "grad_norm": 1.5767618769337386, + "learning_rate": 1.1263703544399605e-05, + "loss": 0.6887, + "step": 4659 + }, + { + "epoch": 0.4759959141981614, + "grad_norm": 1.551097458156733, + "learning_rate": 1.1260421665152357e-05, + "loss": 0.7457, + "step": 4660 + }, + { + "epoch": 0.4760980592441267, + "grad_norm": 1.5490172537607214, + "learning_rate": 1.1257139647951146e-05, + "loss": 0.7408, + "step": 4661 + }, + { + "epoch": 0.4762002042900919, + "grad_norm": 1.488877484561567, + "learning_rate": 1.1253857493155189e-05, + "loss": 0.7016, + "step": 4662 + }, + { + "epoch": 0.4763023493360572, + "grad_norm": 1.4903238794910556, + "learning_rate": 1.1250575201123716e-05, + "loss": 0.6751, + "step": 4663 + }, + { + "epoch": 0.4764044943820225, + "grad_norm": 1.3887470103732797, + "learning_rate": 1.124729277221598e-05, + "loss": 0.6599, + "step": 4664 + }, + { + "epoch": 0.47650663942798777, + "grad_norm": 1.3380593264598162, + "learning_rate": 1.1244010206791244e-05, + "loss": 0.7595, + "step": 4665 + }, + { + "epoch": 0.476608784473953, + "grad_norm": 1.3504635421716253, + "learning_rate": 1.1240727505208793e-05, + "loss": 0.73, + "step": 4666 + }, + { + "epoch": 0.4767109295199183, + "grad_norm": 1.3045087532533222, + "learning_rate": 1.1237444667827908e-05, + "loss": 0.7515, + "step": 4667 + }, + { + "epoch": 0.47681307456588357, + "grad_norm": 1.4054401946995192, + "learning_rate": 1.1234161695007906e-05, + "loss": 0.7107, + "step": 4668 + }, + { + "epoch": 0.47691521961184885, + "grad_norm": 1.3447237163534818, + "learning_rate": 1.1230878587108112e-05, + "loss": 0.708, + "step": 4669 + }, + { + "epoch": 0.4770173646578141, + "grad_norm": 1.3755045437647198, + "learning_rate": 1.1227595344487861e-05, + "loss": 0.7086, + "step": 4670 + }, + { + "epoch": 0.47711950970377937, + "grad_norm": 1.3517475051221437, + "learning_rate": 1.1224311967506505e-05, + "loss": 0.7163, + "step": 4671 + }, + { + "epoch": 0.47722165474974465, + "grad_norm": 1.4105765737950693, + "learning_rate": 1.1221028456523413e-05, + "loss": 0.6783, + "step": 4672 + }, + { + "epoch": 0.47732379979570994, + "grad_norm": 1.613342143842825, + "learning_rate": 1.1217744811897966e-05, + "loss": 0.7139, + "step": 4673 + }, + { + "epoch": 0.47742594484167516, + "grad_norm": 1.5596618377009908, + "learning_rate": 1.1214461033989566e-05, + "loss": 0.7693, + "step": 4674 + }, + { + "epoch": 0.47752808988764045, + "grad_norm": 1.3792060089492475, + "learning_rate": 1.1211177123157619e-05, + "loss": 0.7617, + "step": 4675 + }, + { + "epoch": 0.47763023493360574, + "grad_norm": 1.4387718634765345, + "learning_rate": 1.1207893079761551e-05, + "loss": 0.8137, + "step": 4676 + }, + { + "epoch": 0.47773237997957096, + "grad_norm": 1.231978076444747, + "learning_rate": 1.1204608904160808e-05, + "loss": 0.6823, + "step": 4677 + }, + { + "epoch": 0.47783452502553625, + "grad_norm": 1.4556551950995793, + "learning_rate": 1.1201324596714845e-05, + "loss": 0.7209, + "step": 4678 + }, + { + "epoch": 0.47793667007150153, + "grad_norm": 1.4782081118904289, + "learning_rate": 1.1198040157783125e-05, + "loss": 0.7531, + "step": 4679 + }, + { + "epoch": 0.4780388151174668, + "grad_norm": 1.4649234984523267, + "learning_rate": 1.1194755587725134e-05, + "loss": 0.7274, + "step": 4680 + }, + { + "epoch": 0.47814096016343205, + "grad_norm": 1.4106938227713668, + "learning_rate": 1.1191470886900378e-05, + "loss": 0.6623, + "step": 4681 + }, + { + "epoch": 0.47824310520939733, + "grad_norm": 1.3474231419601228, + "learning_rate": 1.1188186055668358e-05, + "loss": 0.6413, + "step": 4682 + }, + { + "epoch": 0.4783452502553626, + "grad_norm": 1.359182306055146, + "learning_rate": 1.118490109438861e-05, + "loss": 0.6482, + "step": 4683 + }, + { + "epoch": 0.4784473953013279, + "grad_norm": 1.3975563749228346, + "learning_rate": 1.118161600342067e-05, + "loss": 0.6667, + "step": 4684 + }, + { + "epoch": 0.47854954034729313, + "grad_norm": 1.4644867152855252, + "learning_rate": 1.1178330783124094e-05, + "loss": 0.7048, + "step": 4685 + }, + { + "epoch": 0.4786516853932584, + "grad_norm": 1.5076206982144373, + "learning_rate": 1.1175045433858457e-05, + "loss": 0.7019, + "step": 4686 + }, + { + "epoch": 0.4787538304392237, + "grad_norm": 1.486235354665212, + "learning_rate": 1.1171759955983332e-05, + "loss": 0.7228, + "step": 4687 + }, + { + "epoch": 0.478855975485189, + "grad_norm": 1.4555598177593088, + "learning_rate": 1.1168474349858325e-05, + "loss": 0.7937, + "step": 4688 + }, + { + "epoch": 0.4789581205311542, + "grad_norm": 1.6350161323704449, + "learning_rate": 1.1165188615843048e-05, + "loss": 0.8726, + "step": 4689 + }, + { + "epoch": 0.4790602655771195, + "grad_norm": 1.564789595843301, + "learning_rate": 1.116190275429712e-05, + "loss": 0.8349, + "step": 4690 + }, + { + "epoch": 0.4791624106230848, + "grad_norm": 1.4402449148628527, + "learning_rate": 1.115861676558019e-05, + "loss": 0.6312, + "step": 4691 + }, + { + "epoch": 0.4792645556690501, + "grad_norm": 1.4385768883746815, + "learning_rate": 1.1155330650051902e-05, + "loss": 0.6438, + "step": 4692 + }, + { + "epoch": 0.4793667007150153, + "grad_norm": 1.6355445470466625, + "learning_rate": 1.1152044408071929e-05, + "loss": 0.7258, + "step": 4693 + }, + { + "epoch": 0.4794688457609806, + "grad_norm": 1.4723669392423124, + "learning_rate": 1.1148758039999951e-05, + "loss": 0.754, + "step": 4694 + }, + { + "epoch": 0.4795709908069459, + "grad_norm": 1.3979130111637978, + "learning_rate": 1.1145471546195665e-05, + "loss": 0.6319, + "step": 4695 + }, + { + "epoch": 0.47967313585291116, + "grad_norm": 1.346603391231298, + "learning_rate": 1.1142184927018778e-05, + "loss": 0.7508, + "step": 4696 + }, + { + "epoch": 0.4797752808988764, + "grad_norm": 1.4390226762627387, + "learning_rate": 1.113889818282901e-05, + "loss": 0.8709, + "step": 4697 + }, + { + "epoch": 0.4798774259448417, + "grad_norm": 1.338201967658271, + "learning_rate": 1.1135611313986103e-05, + "loss": 0.6627, + "step": 4698 + }, + { + "epoch": 0.47997957099080696, + "grad_norm": 1.337653789721875, + "learning_rate": 1.1132324320849803e-05, + "loss": 0.6441, + "step": 4699 + }, + { + "epoch": 0.48008171603677224, + "grad_norm": 1.4446213137373256, + "learning_rate": 1.1129037203779873e-05, + "loss": 0.6464, + "step": 4700 + }, + { + "epoch": 0.4801838610827375, + "grad_norm": 1.4565199164257723, + "learning_rate": 1.1125749963136097e-05, + "loss": 0.7344, + "step": 4701 + }, + { + "epoch": 0.48028600612870276, + "grad_norm": 1.1738935492007405, + "learning_rate": 1.1122462599278255e-05, + "loss": 0.747, + "step": 4702 + }, + { + "epoch": 0.48038815117466804, + "grad_norm": 1.4301540878668242, + "learning_rate": 1.1119175112566159e-05, + "loss": 0.734, + "step": 4703 + }, + { + "epoch": 0.4804902962206333, + "grad_norm": 1.3730090736621727, + "learning_rate": 1.111588750335962e-05, + "loss": 0.7, + "step": 4704 + }, + { + "epoch": 0.48059244126659856, + "grad_norm": 1.3630316007743029, + "learning_rate": 1.1112599772018476e-05, + "loss": 0.5666, + "step": 4705 + }, + { + "epoch": 0.48069458631256384, + "grad_norm": 1.4332445928303563, + "learning_rate": 1.110931191890257e-05, + "loss": 0.6576, + "step": 4706 + }, + { + "epoch": 0.48079673135852913, + "grad_norm": 1.4427638806608467, + "learning_rate": 1.1106023944371754e-05, + "loss": 0.6496, + "step": 4707 + }, + { + "epoch": 0.48089887640449436, + "grad_norm": 1.441314930530969, + "learning_rate": 1.1102735848785901e-05, + "loss": 0.7646, + "step": 4708 + }, + { + "epoch": 0.48100102145045964, + "grad_norm": 1.4011243402104627, + "learning_rate": 1.10994476325049e-05, + "loss": 0.6796, + "step": 4709 + }, + { + "epoch": 0.4811031664964249, + "grad_norm": 1.5332078680310863, + "learning_rate": 1.1096159295888646e-05, + "loss": 0.6752, + "step": 4710 + }, + { + "epoch": 0.4812053115423902, + "grad_norm": 1.339275016041277, + "learning_rate": 1.1092870839297048e-05, + "loss": 0.7027, + "step": 4711 + }, + { + "epoch": 0.48130745658835544, + "grad_norm": 1.5286070977781736, + "learning_rate": 1.1089582263090031e-05, + "loss": 0.6976, + "step": 4712 + }, + { + "epoch": 0.4814096016343207, + "grad_norm": 1.3916486434080306, + "learning_rate": 1.108629356762753e-05, + "loss": 0.7242, + "step": 4713 + }, + { + "epoch": 0.481511746680286, + "grad_norm": 1.4547589553747535, + "learning_rate": 1.1083004753269498e-05, + "loss": 0.6791, + "step": 4714 + }, + { + "epoch": 0.4816138917262513, + "grad_norm": 1.3247269001281308, + "learning_rate": 1.1079715820375896e-05, + "loss": 0.6979, + "step": 4715 + }, + { + "epoch": 0.4817160367722165, + "grad_norm": 1.5189073367249037, + "learning_rate": 1.10764267693067e-05, + "loss": 0.7847, + "step": 4716 + }, + { + "epoch": 0.4818181818181818, + "grad_norm": 1.4536075292173836, + "learning_rate": 1.1073137600421895e-05, + "loss": 0.7313, + "step": 4717 + }, + { + "epoch": 0.4819203268641471, + "grad_norm": 1.5215812614239415, + "learning_rate": 1.106984831408149e-05, + "loss": 0.7802, + "step": 4718 + }, + { + "epoch": 0.4820224719101124, + "grad_norm": 1.45199619357555, + "learning_rate": 1.1066558910645494e-05, + "loss": 0.688, + "step": 4719 + }, + { + "epoch": 0.4821246169560776, + "grad_norm": 1.449925752627741, + "learning_rate": 1.1063269390473937e-05, + "loss": 0.7402, + "step": 4720 + }, + { + "epoch": 0.4822267620020429, + "grad_norm": 1.3700791134833505, + "learning_rate": 1.1059979753926857e-05, + "loss": 0.6968, + "step": 4721 + }, + { + "epoch": 0.4823289070480082, + "grad_norm": 1.6700825120330365, + "learning_rate": 1.1056690001364312e-05, + "loss": 0.7011, + "step": 4722 + }, + { + "epoch": 0.48243105209397347, + "grad_norm": 1.452173014140861, + "learning_rate": 1.105340013314636e-05, + "loss": 0.7881, + "step": 4723 + }, + { + "epoch": 0.4825331971399387, + "grad_norm": 1.7102688780087025, + "learning_rate": 1.1050110149633081e-05, + "loss": 0.8457, + "step": 4724 + }, + { + "epoch": 0.482635342185904, + "grad_norm": 1.4183807764377703, + "learning_rate": 1.1046820051184572e-05, + "loss": 0.5645, + "step": 4725 + }, + { + "epoch": 0.48273748723186927, + "grad_norm": 1.5433763726676624, + "learning_rate": 1.104352983816093e-05, + "loss": 0.7098, + "step": 4726 + }, + { + "epoch": 0.48283963227783455, + "grad_norm": 1.530437238645394, + "learning_rate": 1.1040239510922274e-05, + "loss": 0.6483, + "step": 4727 + }, + { + "epoch": 0.4829417773237998, + "grad_norm": 1.281583461424658, + "learning_rate": 1.1036949069828732e-05, + "loss": 0.6891, + "step": 4728 + }, + { + "epoch": 0.48304392236976507, + "grad_norm": 1.581099491532157, + "learning_rate": 1.1033658515240442e-05, + "loss": 0.7577, + "step": 4729 + }, + { + "epoch": 0.48314606741573035, + "grad_norm": 1.3404123409199826, + "learning_rate": 1.1030367847517562e-05, + "loss": 0.6993, + "step": 4730 + }, + { + "epoch": 0.4832482124616956, + "grad_norm": 1.447097435906328, + "learning_rate": 1.1027077067020256e-05, + "loss": 0.7, + "step": 4731 + }, + { + "epoch": 0.48335035750766087, + "grad_norm": 1.522689313433083, + "learning_rate": 1.1023786174108697e-05, + "loss": 0.7008, + "step": 4732 + }, + { + "epoch": 0.48345250255362615, + "grad_norm": 1.2472719613848893, + "learning_rate": 1.1020495169143085e-05, + "loss": 0.5715, + "step": 4733 + }, + { + "epoch": 0.48355464759959144, + "grad_norm": 1.4343415549086316, + "learning_rate": 1.1017204052483614e-05, + "loss": 0.6811, + "step": 4734 + }, + { + "epoch": 0.48365679264555667, + "grad_norm": 1.4160243811006035, + "learning_rate": 1.1013912824490505e-05, + "loss": 0.6955, + "step": 4735 + }, + { + "epoch": 0.48375893769152195, + "grad_norm": 1.4087088736368545, + "learning_rate": 1.1010621485523976e-05, + "loss": 0.7517, + "step": 4736 + }, + { + "epoch": 0.48386108273748724, + "grad_norm": 1.5792861609406652, + "learning_rate": 1.100733003594428e-05, + "loss": 0.8144, + "step": 4737 + }, + { + "epoch": 0.4839632277834525, + "grad_norm": 1.4804956668244074, + "learning_rate": 1.1004038476111655e-05, + "loss": 0.7572, + "step": 4738 + }, + { + "epoch": 0.48406537282941775, + "grad_norm": 1.497378797266265, + "learning_rate": 1.1000746806386376e-05, + "loss": 0.6738, + "step": 4739 + }, + { + "epoch": 0.48416751787538304, + "grad_norm": 1.664610861877691, + "learning_rate": 1.0997455027128708e-05, + "loss": 0.7767, + "step": 4740 + }, + { + "epoch": 0.4842696629213483, + "grad_norm": 1.5594397513256653, + "learning_rate": 1.0994163138698944e-05, + "loss": 0.6548, + "step": 4741 + }, + { + "epoch": 0.4843718079673136, + "grad_norm": 1.3932185142744336, + "learning_rate": 1.0990871141457383e-05, + "loss": 0.6795, + "step": 4742 + }, + { + "epoch": 0.48447395301327884, + "grad_norm": 1.5340790775083974, + "learning_rate": 1.0987579035764334e-05, + "loss": 0.745, + "step": 4743 + }, + { + "epoch": 0.4845760980592441, + "grad_norm": 1.5184643039088583, + "learning_rate": 1.098428682198012e-05, + "loss": 0.6914, + "step": 4744 + }, + { + "epoch": 0.4846782431052094, + "grad_norm": 1.534369240790998, + "learning_rate": 1.0980994500465082e-05, + "loss": 0.7083, + "step": 4745 + }, + { + "epoch": 0.4847803881511747, + "grad_norm": 1.375709347008455, + "learning_rate": 1.097770207157956e-05, + "loss": 0.615, + "step": 4746 + }, + { + "epoch": 0.4848825331971399, + "grad_norm": 1.5124341747214853, + "learning_rate": 1.0974409535683915e-05, + "loss": 0.7781, + "step": 4747 + }, + { + "epoch": 0.4849846782431052, + "grad_norm": 1.4739504895528295, + "learning_rate": 1.0971116893138514e-05, + "loss": 0.7429, + "step": 4748 + }, + { + "epoch": 0.4850868232890705, + "grad_norm": 1.469761192613227, + "learning_rate": 1.0967824144303744e-05, + "loss": 0.7064, + "step": 4749 + }, + { + "epoch": 0.4851889683350358, + "grad_norm": 1.3837198078945248, + "learning_rate": 1.0964531289539996e-05, + "loss": 0.8201, + "step": 4750 + }, + { + "epoch": 0.485291113381001, + "grad_norm": 1.6432937371759888, + "learning_rate": 1.0961238329207674e-05, + "loss": 0.6821, + "step": 4751 + }, + { + "epoch": 0.4853932584269663, + "grad_norm": 1.4232015699449252, + "learning_rate": 1.0957945263667198e-05, + "loss": 0.7385, + "step": 4752 + }, + { + "epoch": 0.4854954034729316, + "grad_norm": 1.5591136740028924, + "learning_rate": 1.0954652093278992e-05, + "loss": 0.6357, + "step": 4753 + }, + { + "epoch": 0.48559754851889686, + "grad_norm": 1.6090627757684384, + "learning_rate": 1.09513588184035e-05, + "loss": 0.8706, + "step": 4754 + }, + { + "epoch": 0.4856996935648621, + "grad_norm": 1.2824965117402605, + "learning_rate": 1.0948065439401167e-05, + "loss": 0.7944, + "step": 4755 + }, + { + "epoch": 0.4858018386108274, + "grad_norm": 1.5222870121338883, + "learning_rate": 1.0944771956632461e-05, + "loss": 0.6711, + "step": 4756 + }, + { + "epoch": 0.48590398365679266, + "grad_norm": 1.4931651566811324, + "learning_rate": 1.0941478370457857e-05, + "loss": 0.7286, + "step": 4757 + }, + { + "epoch": 0.4860061287027579, + "grad_norm": 1.41385675466694, + "learning_rate": 1.0938184681237833e-05, + "loss": 0.6811, + "step": 4758 + }, + { + "epoch": 0.4861082737487232, + "grad_norm": 1.4487110011342361, + "learning_rate": 1.0934890889332892e-05, + "loss": 0.7063, + "step": 4759 + }, + { + "epoch": 0.48621041879468846, + "grad_norm": 1.560260887203428, + "learning_rate": 1.0931596995103537e-05, + "loss": 0.7352, + "step": 4760 + }, + { + "epoch": 0.48631256384065374, + "grad_norm": 1.4236950726441457, + "learning_rate": 1.092830299891029e-05, + "loss": 0.7215, + "step": 4761 + }, + { + "epoch": 0.486414708886619, + "grad_norm": 1.43817469110565, + "learning_rate": 1.092500890111368e-05, + "loss": 0.7674, + "step": 4762 + }, + { + "epoch": 0.48651685393258426, + "grad_norm": 1.4901566016402077, + "learning_rate": 1.0921714702074247e-05, + "loss": 0.7956, + "step": 4763 + }, + { + "epoch": 0.48661899897854954, + "grad_norm": 1.5059899075064886, + "learning_rate": 1.0918420402152546e-05, + "loss": 0.7749, + "step": 4764 + }, + { + "epoch": 0.48672114402451483, + "grad_norm": 1.3965853583805192, + "learning_rate": 1.0915126001709136e-05, + "loss": 0.6573, + "step": 4765 + }, + { + "epoch": 0.48682328907048006, + "grad_norm": 1.5938716200642589, + "learning_rate": 1.0911831501104598e-05, + "loss": 0.8367, + "step": 4766 + }, + { + "epoch": 0.48692543411644534, + "grad_norm": 1.3263109440622418, + "learning_rate": 1.090853690069951e-05, + "loss": 0.6874, + "step": 4767 + }, + { + "epoch": 0.48702757916241063, + "grad_norm": 1.478747216684893, + "learning_rate": 1.0905242200854472e-05, + "loss": 0.6745, + "step": 4768 + }, + { + "epoch": 0.4871297242083759, + "grad_norm": 1.377958581345215, + "learning_rate": 1.0901947401930091e-05, + "loss": 0.754, + "step": 4769 + }, + { + "epoch": 0.48723186925434114, + "grad_norm": 1.3274494273208592, + "learning_rate": 1.0898652504286982e-05, + "loss": 0.6681, + "step": 4770 + }, + { + "epoch": 0.48733401430030643, + "grad_norm": 1.588452944987468, + "learning_rate": 1.0895357508285779e-05, + "loss": 0.6956, + "step": 4771 + }, + { + "epoch": 0.4874361593462717, + "grad_norm": 1.4176444713663592, + "learning_rate": 1.0892062414287118e-05, + "loss": 0.6232, + "step": 4772 + }, + { + "epoch": 0.487538304392237, + "grad_norm": 1.3562293899930955, + "learning_rate": 1.0888767222651646e-05, + "loss": 0.7467, + "step": 4773 + }, + { + "epoch": 0.48764044943820223, + "grad_norm": 1.4366110614247176, + "learning_rate": 1.088547193374003e-05, + "loss": 0.6906, + "step": 4774 + }, + { + "epoch": 0.4877425944841675, + "grad_norm": 1.5247542758365578, + "learning_rate": 1.0882176547912937e-05, + "loss": 0.7729, + "step": 4775 + }, + { + "epoch": 0.4878447395301328, + "grad_norm": 1.415319457842646, + "learning_rate": 1.0878881065531051e-05, + "loss": 0.7112, + "step": 4776 + }, + { + "epoch": 0.4879468845760981, + "grad_norm": 1.6514315506061399, + "learning_rate": 1.0875585486955068e-05, + "loss": 0.7946, + "step": 4777 + }, + { + "epoch": 0.4880490296220633, + "grad_norm": 1.4956595068805882, + "learning_rate": 1.0872289812545685e-05, + "loss": 0.738, + "step": 4778 + }, + { + "epoch": 0.4881511746680286, + "grad_norm": 1.468818331164324, + "learning_rate": 1.0868994042663619e-05, + "loss": 0.8096, + "step": 4779 + }, + { + "epoch": 0.4882533197139939, + "grad_norm": 1.4975732916658322, + "learning_rate": 1.086569817766959e-05, + "loss": 0.6452, + "step": 4780 + }, + { + "epoch": 0.48835546475995917, + "grad_norm": 1.4818801455518247, + "learning_rate": 1.0862402217924342e-05, + "loss": 0.774, + "step": 4781 + }, + { + "epoch": 0.4884576098059244, + "grad_norm": 1.4038656981792568, + "learning_rate": 1.0859106163788608e-05, + "loss": 0.6904, + "step": 4782 + }, + { + "epoch": 0.4885597548518897, + "grad_norm": 1.439198575987848, + "learning_rate": 1.0855810015623156e-05, + "loss": 0.7551, + "step": 4783 + }, + { + "epoch": 0.48866189989785497, + "grad_norm": 1.5777882423507272, + "learning_rate": 1.085251377378874e-05, + "loss": 0.6802, + "step": 4784 + }, + { + "epoch": 0.4887640449438202, + "grad_norm": 1.4210039461625712, + "learning_rate": 1.0849217438646143e-05, + "loss": 0.7983, + "step": 4785 + }, + { + "epoch": 0.4888661899897855, + "grad_norm": 1.4216855239283253, + "learning_rate": 1.0845921010556148e-05, + "loss": 0.8124, + "step": 4786 + }, + { + "epoch": 0.48896833503575077, + "grad_norm": 1.5058742549350697, + "learning_rate": 1.0842624489879553e-05, + "loss": 0.8104, + "step": 4787 + }, + { + "epoch": 0.48907048008171605, + "grad_norm": 1.5358103262303238, + "learning_rate": 1.083932787697716e-05, + "loss": 0.7297, + "step": 4788 + }, + { + "epoch": 0.4891726251276813, + "grad_norm": 1.4314046155395013, + "learning_rate": 1.0836031172209792e-05, + "loss": 0.7326, + "step": 4789 + }, + { + "epoch": 0.48927477017364657, + "grad_norm": 1.7659487389672255, + "learning_rate": 1.0832734375938269e-05, + "loss": 0.6802, + "step": 4790 + }, + { + "epoch": 0.48937691521961185, + "grad_norm": 1.5208691604189406, + "learning_rate": 1.0829437488523433e-05, + "loss": 0.7457, + "step": 4791 + }, + { + "epoch": 0.48947906026557714, + "grad_norm": 1.3267972839509876, + "learning_rate": 1.0826140510326127e-05, + "loss": 0.6537, + "step": 4792 + }, + { + "epoch": 0.48958120531154237, + "grad_norm": 1.5645050670611527, + "learning_rate": 1.082284344170721e-05, + "loss": 0.7847, + "step": 4793 + }, + { + "epoch": 0.48968335035750765, + "grad_norm": 1.4751413112048315, + "learning_rate": 1.0819546283027544e-05, + "loss": 0.6565, + "step": 4794 + }, + { + "epoch": 0.48978549540347294, + "grad_norm": 1.5107509457948471, + "learning_rate": 1.081624903464801e-05, + "loss": 0.7759, + "step": 4795 + }, + { + "epoch": 0.4898876404494382, + "grad_norm": 1.4456766290323204, + "learning_rate": 1.081295169692949e-05, + "loss": 0.8139, + "step": 4796 + }, + { + "epoch": 0.48998978549540345, + "grad_norm": 1.5104576654915762, + "learning_rate": 1.080965427023288e-05, + "loss": 0.6609, + "step": 4797 + }, + { + "epoch": 0.49009193054136874, + "grad_norm": 1.3609813645018798, + "learning_rate": 1.0806356754919092e-05, + "loss": 0.6238, + "step": 4798 + }, + { + "epoch": 0.490194075587334, + "grad_norm": 1.3744872530313812, + "learning_rate": 1.0803059151349034e-05, + "loss": 0.6872, + "step": 4799 + }, + { + "epoch": 0.4902962206332993, + "grad_norm": 1.445958951773182, + "learning_rate": 1.0799761459883631e-05, + "loss": 0.7646, + "step": 4800 + }, + { + "epoch": 0.49039836567926454, + "grad_norm": 1.4104058097550571, + "learning_rate": 1.0796463680883822e-05, + "loss": 0.7003, + "step": 4801 + }, + { + "epoch": 0.4905005107252298, + "grad_norm": 1.4669422083785768, + "learning_rate": 1.0793165814710547e-05, + "loss": 0.654, + "step": 4802 + }, + { + "epoch": 0.4906026557711951, + "grad_norm": 1.6136921348822473, + "learning_rate": 1.0789867861724764e-05, + "loss": 0.8445, + "step": 4803 + }, + { + "epoch": 0.4907048008171604, + "grad_norm": 1.4706167800385206, + "learning_rate": 1.078656982228743e-05, + "loss": 0.7202, + "step": 4804 + }, + { + "epoch": 0.4908069458631256, + "grad_norm": 1.436602342849144, + "learning_rate": 1.078327169675952e-05, + "loss": 0.6466, + "step": 4805 + }, + { + "epoch": 0.4909090909090909, + "grad_norm": 1.347962570721208, + "learning_rate": 1.077997348550202e-05, + "loss": 0.5594, + "step": 4806 + }, + { + "epoch": 0.4910112359550562, + "grad_norm": 1.5684552429194407, + "learning_rate": 1.0776675188875916e-05, + "loss": 0.8123, + "step": 4807 + }, + { + "epoch": 0.4911133810010215, + "grad_norm": 1.474407940713154, + "learning_rate": 1.0773376807242211e-05, + "loss": 0.6448, + "step": 4808 + }, + { + "epoch": 0.4912155260469867, + "grad_norm": 1.4279886806824929, + "learning_rate": 1.0770078340961915e-05, + "loss": 0.6793, + "step": 4809 + }, + { + "epoch": 0.491317671092952, + "grad_norm": 1.5491650376005763, + "learning_rate": 1.076677979039605e-05, + "loss": 0.6706, + "step": 4810 + }, + { + "epoch": 0.4914198161389173, + "grad_norm": 1.475385142587922, + "learning_rate": 1.0763481155905637e-05, + "loss": 0.7756, + "step": 4811 + }, + { + "epoch": 0.49152196118488256, + "grad_norm": 1.37668490961059, + "learning_rate": 1.0760182437851718e-05, + "loss": 0.6976, + "step": 4812 + }, + { + "epoch": 0.4916241062308478, + "grad_norm": 1.292421018288612, + "learning_rate": 1.0756883636595344e-05, + "loss": 0.675, + "step": 4813 + }, + { + "epoch": 0.4917262512768131, + "grad_norm": 1.7145517074301948, + "learning_rate": 1.0753584752497566e-05, + "loss": 0.7903, + "step": 4814 + }, + { + "epoch": 0.49182839632277836, + "grad_norm": 1.3046012698934726, + "learning_rate": 1.0750285785919449e-05, + "loss": 0.5661, + "step": 4815 + }, + { + "epoch": 0.4919305413687436, + "grad_norm": 1.3200863451132492, + "learning_rate": 1.0746986737222067e-05, + "loss": 0.6726, + "step": 4816 + }, + { + "epoch": 0.4920326864147089, + "grad_norm": 1.4486284752239376, + "learning_rate": 1.0743687606766505e-05, + "loss": 0.7016, + "step": 4817 + }, + { + "epoch": 0.49213483146067416, + "grad_norm": 1.513097406772954, + "learning_rate": 1.0740388394913855e-05, + "loss": 0.7805, + "step": 4818 + }, + { + "epoch": 0.49223697650663945, + "grad_norm": 1.3646155037894567, + "learning_rate": 1.0737089102025216e-05, + "loss": 0.6375, + "step": 4819 + }, + { + "epoch": 0.4923391215526047, + "grad_norm": 1.3405871268165823, + "learning_rate": 1.0733789728461696e-05, + "loss": 0.6672, + "step": 4820 + }, + { + "epoch": 0.49244126659856996, + "grad_norm": 1.509606979433828, + "learning_rate": 1.073049027458442e-05, + "loss": 0.8514, + "step": 4821 + }, + { + "epoch": 0.49254341164453525, + "grad_norm": 1.4197579308679416, + "learning_rate": 1.072719074075451e-05, + "loss": 0.723, + "step": 4822 + }, + { + "epoch": 0.49264555669050053, + "grad_norm": 1.4061805051641842, + "learning_rate": 1.0723891127333104e-05, + "loss": 0.7146, + "step": 4823 + }, + { + "epoch": 0.49274770173646576, + "grad_norm": 1.414937495617106, + "learning_rate": 1.0720591434681343e-05, + "loss": 0.6488, + "step": 4824 + }, + { + "epoch": 0.49284984678243104, + "grad_norm": 1.5583678747805858, + "learning_rate": 1.0717291663160387e-05, + "loss": 0.8501, + "step": 4825 + }, + { + "epoch": 0.49295199182839633, + "grad_norm": 1.50701318068142, + "learning_rate": 1.0713991813131395e-05, + "loss": 0.7165, + "step": 4826 + }, + { + "epoch": 0.4930541368743616, + "grad_norm": 1.4606468327034636, + "learning_rate": 1.071069188495554e-05, + "loss": 0.6797, + "step": 4827 + }, + { + "epoch": 0.49315628192032684, + "grad_norm": 1.592801392103572, + "learning_rate": 1.0707391878993996e-05, + "loss": 0.8635, + "step": 4828 + }, + { + "epoch": 0.49325842696629213, + "grad_norm": 1.4870351175924856, + "learning_rate": 1.0704091795607954e-05, + "loss": 0.7054, + "step": 4829 + }, + { + "epoch": 0.4933605720122574, + "grad_norm": 1.4574312302557222, + "learning_rate": 1.0700791635158612e-05, + "loss": 0.6895, + "step": 4830 + }, + { + "epoch": 0.4934627170582227, + "grad_norm": 1.4522327184082269, + "learning_rate": 1.069749139800717e-05, + "loss": 0.8333, + "step": 4831 + }, + { + "epoch": 0.49356486210418793, + "grad_norm": 1.4829924388969178, + "learning_rate": 1.0694191084514844e-05, + "loss": 0.6646, + "step": 4832 + }, + { + "epoch": 0.4936670071501532, + "grad_norm": 1.4419217783419047, + "learning_rate": 1.0690890695042857e-05, + "loss": 0.6488, + "step": 4833 + }, + { + "epoch": 0.4937691521961185, + "grad_norm": 1.512176548555915, + "learning_rate": 1.0687590229952435e-05, + "loss": 0.7513, + "step": 4834 + }, + { + "epoch": 0.4938712972420838, + "grad_norm": 1.1797720064519392, + "learning_rate": 1.0684289689604824e-05, + "loss": 0.6561, + "step": 4835 + }, + { + "epoch": 0.493973442288049, + "grad_norm": 1.4523244156872483, + "learning_rate": 1.0680989074361254e-05, + "loss": 0.6626, + "step": 4836 + }, + { + "epoch": 0.4940755873340143, + "grad_norm": 1.45766887313778, + "learning_rate": 1.0677688384582999e-05, + "loss": 0.7549, + "step": 4837 + }, + { + "epoch": 0.4941777323799796, + "grad_norm": 1.2708216693153007, + "learning_rate": 1.0674387620631308e-05, + "loss": 0.6763, + "step": 4838 + }, + { + "epoch": 0.49427987742594487, + "grad_norm": 1.5256261761398195, + "learning_rate": 1.0671086782867459e-05, + "loss": 0.7168, + "step": 4839 + }, + { + "epoch": 0.4943820224719101, + "grad_norm": 1.3667434140510348, + "learning_rate": 1.0667785871652724e-05, + "loss": 0.6798, + "step": 4840 + }, + { + "epoch": 0.4944841675178754, + "grad_norm": 1.4370871043503535, + "learning_rate": 1.0664484887348396e-05, + "loss": 0.7291, + "step": 4841 + }, + { + "epoch": 0.49458631256384067, + "grad_norm": 1.4938763119935077, + "learning_rate": 1.0661183830315772e-05, + "loss": 0.7267, + "step": 4842 + }, + { + "epoch": 0.4946884576098059, + "grad_norm": 1.4041539157504446, + "learning_rate": 1.0657882700916144e-05, + "loss": 0.7428, + "step": 4843 + }, + { + "epoch": 0.4947906026557712, + "grad_norm": 1.562294603340939, + "learning_rate": 1.0654581499510832e-05, + "loss": 0.8075, + "step": 4844 + }, + { + "epoch": 0.49489274770173647, + "grad_norm": 1.3141668379802414, + "learning_rate": 1.0651280226461154e-05, + "loss": 0.6392, + "step": 4845 + }, + { + "epoch": 0.49499489274770175, + "grad_norm": 1.6195687250195019, + "learning_rate": 1.0647978882128431e-05, + "loss": 0.7879, + "step": 4846 + }, + { + "epoch": 0.495097037793667, + "grad_norm": 1.4723794365890113, + "learning_rate": 1.0644677466874005e-05, + "loss": 0.6849, + "step": 4847 + }, + { + "epoch": 0.49519918283963227, + "grad_norm": 1.5091505865627393, + "learning_rate": 1.064137598105921e-05, + "loss": 0.7203, + "step": 4848 + }, + { + "epoch": 0.49530132788559755, + "grad_norm": 1.3971572198802125, + "learning_rate": 1.06380744250454e-05, + "loss": 0.6774, + "step": 4849 + }, + { + "epoch": 0.49540347293156284, + "grad_norm": 1.3490066411686468, + "learning_rate": 1.0634772799193933e-05, + "loss": 0.6426, + "step": 4850 + }, + { + "epoch": 0.49550561797752807, + "grad_norm": 1.6673520078428556, + "learning_rate": 1.063147110386617e-05, + "loss": 0.7834, + "step": 4851 + }, + { + "epoch": 0.49560776302349335, + "grad_norm": 1.56732445270385, + "learning_rate": 1.0628169339423491e-05, + "loss": 0.8905, + "step": 4852 + }, + { + "epoch": 0.49570990806945864, + "grad_norm": 1.4549935971710761, + "learning_rate": 1.0624867506227268e-05, + "loss": 0.5801, + "step": 4853 + }, + { + "epoch": 0.4958120531154239, + "grad_norm": 1.388283850227532, + "learning_rate": 1.0621565604638897e-05, + "loss": 0.6584, + "step": 4854 + }, + { + "epoch": 0.49591419816138915, + "grad_norm": 1.5954216514003943, + "learning_rate": 1.0618263635019765e-05, + "loss": 0.7201, + "step": 4855 + }, + { + "epoch": 0.49601634320735444, + "grad_norm": 1.6140987739629562, + "learning_rate": 1.0614961597731279e-05, + "loss": 0.6137, + "step": 4856 + }, + { + "epoch": 0.4961184882533197, + "grad_norm": 1.6153877511894505, + "learning_rate": 1.0611659493134852e-05, + "loss": 0.688, + "step": 4857 + }, + { + "epoch": 0.496220633299285, + "grad_norm": 1.4229766413049774, + "learning_rate": 1.0608357321591895e-05, + "loss": 0.7657, + "step": 4858 + }, + { + "epoch": 0.49632277834525024, + "grad_norm": 1.4686393847255665, + "learning_rate": 1.060505508346384e-05, + "loss": 0.6999, + "step": 4859 + }, + { + "epoch": 0.4964249233912155, + "grad_norm": 1.4235236145380288, + "learning_rate": 1.0601752779112114e-05, + "loss": 0.7045, + "step": 4860 + }, + { + "epoch": 0.4965270684371808, + "grad_norm": 1.6074636774244668, + "learning_rate": 1.0598450408898154e-05, + "loss": 0.8177, + "step": 4861 + }, + { + "epoch": 0.4966292134831461, + "grad_norm": 1.5537396838109478, + "learning_rate": 1.0595147973183416e-05, + "loss": 0.6558, + "step": 4862 + }, + { + "epoch": 0.4967313585291113, + "grad_norm": 1.58141447185381, + "learning_rate": 1.0591845472329341e-05, + "loss": 0.7581, + "step": 4863 + }, + { + "epoch": 0.4968335035750766, + "grad_norm": 1.5924444968259097, + "learning_rate": 1.0588542906697401e-05, + "loss": 0.7916, + "step": 4864 + }, + { + "epoch": 0.4969356486210419, + "grad_norm": 1.4028515150428233, + "learning_rate": 1.0585240276649056e-05, + "loss": 0.634, + "step": 4865 + }, + { + "epoch": 0.4970377936670072, + "grad_norm": 1.2881391287383237, + "learning_rate": 1.0581937582545789e-05, + "loss": 0.6299, + "step": 4866 + }, + { + "epoch": 0.4971399387129724, + "grad_norm": 1.5596498619421684, + "learning_rate": 1.0578634824749076e-05, + "loss": 0.8019, + "step": 4867 + }, + { + "epoch": 0.4972420837589377, + "grad_norm": 1.52784809772862, + "learning_rate": 1.0575332003620406e-05, + "loss": 0.7828, + "step": 4868 + }, + { + "epoch": 0.497344228804903, + "grad_norm": 1.4048209191776697, + "learning_rate": 1.057202911952128e-05, + "loss": 0.661, + "step": 4869 + }, + { + "epoch": 0.4974463738508682, + "grad_norm": 1.3789684385809347, + "learning_rate": 1.0568726172813192e-05, + "loss": 0.595, + "step": 4870 + }, + { + "epoch": 0.4975485188968335, + "grad_norm": 1.7904796960301383, + "learning_rate": 1.0565423163857665e-05, + "loss": 0.6617, + "step": 4871 + }, + { + "epoch": 0.4976506639427988, + "grad_norm": 1.4610251813782338, + "learning_rate": 1.05621200930162e-05, + "loss": 0.6548, + "step": 4872 + }, + { + "epoch": 0.49775280898876406, + "grad_norm": 1.5264166869604894, + "learning_rate": 1.0558816960650328e-05, + "loss": 0.7879, + "step": 4873 + }, + { + "epoch": 0.4978549540347293, + "grad_norm": 1.6143699378092482, + "learning_rate": 1.0555513767121584e-05, + "loss": 0.5551, + "step": 4874 + }, + { + "epoch": 0.4979570990806946, + "grad_norm": 1.4184933991594126, + "learning_rate": 1.0552210512791494e-05, + "loss": 0.707, + "step": 4875 + }, + { + "epoch": 0.49805924412665986, + "grad_norm": 1.5794379866408286, + "learning_rate": 1.054890719802161e-05, + "loss": 0.7828, + "step": 4876 + }, + { + "epoch": 0.49816138917262515, + "grad_norm": 1.4793178794299249, + "learning_rate": 1.0545603823173479e-05, + "loss": 0.6561, + "step": 4877 + }, + { + "epoch": 0.4982635342185904, + "grad_norm": 1.505283604688199, + "learning_rate": 1.0542300388608652e-05, + "loss": 0.688, + "step": 4878 + }, + { + "epoch": 0.49836567926455566, + "grad_norm": 1.426099532389924, + "learning_rate": 1.0538996894688702e-05, + "loss": 0.7356, + "step": 4879 + }, + { + "epoch": 0.49846782431052095, + "grad_norm": 1.5312345109031216, + "learning_rate": 1.0535693341775191e-05, + "loss": 0.6673, + "step": 4880 + }, + { + "epoch": 0.49856996935648623, + "grad_norm": 1.4687782552468387, + "learning_rate": 1.05323897302297e-05, + "loss": 0.6117, + "step": 4881 + }, + { + "epoch": 0.49867211440245146, + "grad_norm": 1.474759458799634, + "learning_rate": 1.0529086060413807e-05, + "loss": 0.6331, + "step": 4882 + }, + { + "epoch": 0.49877425944841675, + "grad_norm": 1.4482382064876158, + "learning_rate": 1.0525782332689103e-05, + "loss": 0.8164, + "step": 4883 + }, + { + "epoch": 0.49887640449438203, + "grad_norm": 1.3576308022243069, + "learning_rate": 1.0522478547417183e-05, + "loss": 0.654, + "step": 4884 + }, + { + "epoch": 0.4989785495403473, + "grad_norm": 1.5239224402985214, + "learning_rate": 1.051917470495965e-05, + "loss": 0.7367, + "step": 4885 + }, + { + "epoch": 0.49908069458631255, + "grad_norm": 1.432211060367834, + "learning_rate": 1.051587080567811e-05, + "loss": 0.6996, + "step": 4886 + }, + { + "epoch": 0.49918283963227783, + "grad_norm": 1.3262176980758462, + "learning_rate": 1.0512566849934174e-05, + "loss": 0.7014, + "step": 4887 + }, + { + "epoch": 0.4992849846782431, + "grad_norm": 1.4379308482825146, + "learning_rate": 1.0509262838089467e-05, + "loss": 0.6592, + "step": 4888 + }, + { + "epoch": 0.4993871297242084, + "grad_norm": 1.3989492676618582, + "learning_rate": 1.0505958770505615e-05, + "loss": 0.7151, + "step": 4889 + }, + { + "epoch": 0.49948927477017363, + "grad_norm": 1.4860482507383963, + "learning_rate": 1.0502654647544246e-05, + "loss": 0.8067, + "step": 4890 + }, + { + "epoch": 0.4995914198161389, + "grad_norm": 1.4374733133306616, + "learning_rate": 1.0499350469567005e-05, + "loss": 0.7365, + "step": 4891 + }, + { + "epoch": 0.4996935648621042, + "grad_norm": 1.4646893849155718, + "learning_rate": 1.0496046236935529e-05, + "loss": 0.7654, + "step": 4892 + }, + { + "epoch": 0.4997957099080695, + "grad_norm": 1.5992299689918235, + "learning_rate": 1.0492741950011472e-05, + "loss": 0.641, + "step": 4893 + }, + { + "epoch": 0.4998978549540347, + "grad_norm": 1.5042865341850593, + "learning_rate": 1.0489437609156491e-05, + "loss": 0.6639, + "step": 4894 + }, + { + "epoch": 0.5, + "grad_norm": 1.4546188010857557, + "learning_rate": 1.0486133214732249e-05, + "loss": 0.7501, + "step": 4895 + }, + { + "epoch": 0.5001021450459653, + "grad_norm": 1.5152444667622122, + "learning_rate": 1.0482828767100409e-05, + "loss": 0.6712, + "step": 4896 + }, + { + "epoch": 0.5002042900919306, + "grad_norm": 1.544586856066606, + "learning_rate": 1.0479524266622649e-05, + "loss": 0.7819, + "step": 4897 + }, + { + "epoch": 0.5003064351378959, + "grad_norm": 1.3279587256301781, + "learning_rate": 1.047621971366065e-05, + "loss": 0.6556, + "step": 4898 + }, + { + "epoch": 0.500408580183861, + "grad_norm": 1.5120638638889898, + "learning_rate": 1.0472915108576095e-05, + "loss": 0.7279, + "step": 4899 + }, + { + "epoch": 0.5005107252298263, + "grad_norm": 1.4673202849906533, + "learning_rate": 1.0469610451730676e-05, + "loss": 0.6355, + "step": 4900 + }, + { + "epoch": 0.5006128702757916, + "grad_norm": 1.3106722606890313, + "learning_rate": 1.0466305743486092e-05, + "loss": 0.7234, + "step": 4901 + }, + { + "epoch": 0.5007150153217569, + "grad_norm": 1.379474125411674, + "learning_rate": 1.0463000984204039e-05, + "loss": 0.6966, + "step": 4902 + }, + { + "epoch": 0.5008171603677222, + "grad_norm": 1.451572504783514, + "learning_rate": 1.0459696174246232e-05, + "loss": 0.7035, + "step": 4903 + }, + { + "epoch": 0.5009193054136875, + "grad_norm": 1.4451819948964828, + "learning_rate": 1.045639131397438e-05, + "loss": 0.6769, + "step": 4904 + }, + { + "epoch": 0.5010214504596527, + "grad_norm": 1.3123760456294886, + "learning_rate": 1.0453086403750203e-05, + "loss": 0.6675, + "step": 4905 + }, + { + "epoch": 0.501123595505618, + "grad_norm": 1.5329469025217979, + "learning_rate": 1.044978144393543e-05, + "loss": 0.7072, + "step": 4906 + }, + { + "epoch": 0.5012257405515832, + "grad_norm": 1.3762813283503292, + "learning_rate": 1.0446476434891786e-05, + "loss": 0.7396, + "step": 4907 + }, + { + "epoch": 0.5013278855975485, + "grad_norm": 1.468183756462138, + "learning_rate": 1.0443171376981004e-05, + "loss": 0.6932, + "step": 4908 + }, + { + "epoch": 0.5014300306435138, + "grad_norm": 1.490695622732649, + "learning_rate": 1.043986627056483e-05, + "loss": 0.866, + "step": 4909 + }, + { + "epoch": 0.501532175689479, + "grad_norm": 1.3494003225416509, + "learning_rate": 1.0436561116005012e-05, + "loss": 0.7159, + "step": 4910 + }, + { + "epoch": 0.5016343207354443, + "grad_norm": 1.5782833684893363, + "learning_rate": 1.0433255913663299e-05, + "loss": 0.7814, + "step": 4911 + }, + { + "epoch": 0.5017364657814096, + "grad_norm": 1.4559434365130153, + "learning_rate": 1.042995066390144e-05, + "loss": 0.7889, + "step": 4912 + }, + { + "epoch": 0.5018386108273749, + "grad_norm": 1.3447130881204652, + "learning_rate": 1.0426645367081207e-05, + "loss": 0.6462, + "step": 4913 + }, + { + "epoch": 0.5019407558733402, + "grad_norm": 1.4867002540751066, + "learning_rate": 1.0423340023564362e-05, + "loss": 0.7794, + "step": 4914 + }, + { + "epoch": 0.5020429009193054, + "grad_norm": 1.3777104723035158, + "learning_rate": 1.0420034633712678e-05, + "loss": 0.5601, + "step": 4915 + }, + { + "epoch": 0.5021450459652707, + "grad_norm": 1.6107935028423546, + "learning_rate": 1.041672919788793e-05, + "loss": 0.7419, + "step": 4916 + }, + { + "epoch": 0.5022471910112359, + "grad_norm": 1.3720817188994545, + "learning_rate": 1.0413423716451904e-05, + "loss": 0.84, + "step": 4917 + }, + { + "epoch": 0.5023493360572012, + "grad_norm": 1.3850543537431321, + "learning_rate": 1.0410118189766386e-05, + "loss": 0.7248, + "step": 4918 + }, + { + "epoch": 0.5024514811031665, + "grad_norm": 1.4342716840740544, + "learning_rate": 1.0406812618193167e-05, + "loss": 0.772, + "step": 4919 + }, + { + "epoch": 0.5025536261491318, + "grad_norm": 1.4507620267116055, + "learning_rate": 1.0403507002094042e-05, + "loss": 0.7656, + "step": 4920 + }, + { + "epoch": 0.5026557711950971, + "grad_norm": 1.6435212028974366, + "learning_rate": 1.0400201341830819e-05, + "loss": 0.7457, + "step": 4921 + }, + { + "epoch": 0.5027579162410624, + "grad_norm": 1.3675105690159028, + "learning_rate": 1.0396895637765296e-05, + "loss": 0.6451, + "step": 4922 + }, + { + "epoch": 0.5028600612870275, + "grad_norm": 1.433988583491808, + "learning_rate": 1.0393589890259293e-05, + "loss": 0.7845, + "step": 4923 + }, + { + "epoch": 0.5029622063329928, + "grad_norm": 1.5062193639595813, + "learning_rate": 1.0390284099674616e-05, + "loss": 0.7176, + "step": 4924 + }, + { + "epoch": 0.5030643513789581, + "grad_norm": 1.3570578777574203, + "learning_rate": 1.03869782663731e-05, + "loss": 0.692, + "step": 4925 + }, + { + "epoch": 0.5031664964249234, + "grad_norm": 1.4613805145147096, + "learning_rate": 1.0383672390716558e-05, + "loss": 0.6892, + "step": 4926 + }, + { + "epoch": 0.5032686414708887, + "grad_norm": 1.5143175154008095, + "learning_rate": 1.0380366473066827e-05, + "loss": 0.6788, + "step": 4927 + }, + { + "epoch": 0.503370786516854, + "grad_norm": 1.675952757098426, + "learning_rate": 1.0377060513785737e-05, + "loss": 0.809, + "step": 4928 + }, + { + "epoch": 0.5034729315628192, + "grad_norm": 1.4993703810925254, + "learning_rate": 1.0373754513235133e-05, + "loss": 0.6875, + "step": 4929 + }, + { + "epoch": 0.5035750766087844, + "grad_norm": 1.5802161628037208, + "learning_rate": 1.0370448471776855e-05, + "loss": 0.7455, + "step": 4930 + }, + { + "epoch": 0.5036772216547497, + "grad_norm": 1.603586527790416, + "learning_rate": 1.036714238977275e-05, + "loss": 0.7098, + "step": 4931 + }, + { + "epoch": 0.503779366700715, + "grad_norm": 1.2619364110867464, + "learning_rate": 1.0363836267584676e-05, + "loss": 0.6646, + "step": 4932 + }, + { + "epoch": 0.5038815117466803, + "grad_norm": 1.5173364633899913, + "learning_rate": 1.0360530105574489e-05, + "loss": 0.7818, + "step": 4933 + }, + { + "epoch": 0.5039836567926456, + "grad_norm": 1.4656151646868276, + "learning_rate": 1.0357223904104046e-05, + "loss": 0.7426, + "step": 4934 + }, + { + "epoch": 0.5040858018386108, + "grad_norm": 1.4756222970278228, + "learning_rate": 1.0353917663535218e-05, + "loss": 0.7553, + "step": 4935 + }, + { + "epoch": 0.5041879468845761, + "grad_norm": 1.5365094330757276, + "learning_rate": 1.035061138422987e-05, + "loss": 0.7008, + "step": 4936 + }, + { + "epoch": 0.5042900919305414, + "grad_norm": 1.5305251066473442, + "learning_rate": 1.0347305066549881e-05, + "loss": 0.6681, + "step": 4937 + }, + { + "epoch": 0.5043922369765066, + "grad_norm": 1.5394322978267003, + "learning_rate": 1.0343998710857133e-05, + "loss": 0.729, + "step": 4938 + }, + { + "epoch": 0.5044943820224719, + "grad_norm": 1.3518139424256863, + "learning_rate": 1.0340692317513496e-05, + "loss": 0.6911, + "step": 4939 + }, + { + "epoch": 0.5045965270684372, + "grad_norm": 1.4602378033555292, + "learning_rate": 1.0337385886880868e-05, + "loss": 0.7311, + "step": 4940 + }, + { + "epoch": 0.5046986721144024, + "grad_norm": 1.3388402677704312, + "learning_rate": 1.0334079419321137e-05, + "loss": 0.6621, + "step": 4941 + }, + { + "epoch": 0.5048008171603677, + "grad_norm": 1.4917349926698051, + "learning_rate": 1.0330772915196199e-05, + "loss": 0.7844, + "step": 4942 + }, + { + "epoch": 0.504902962206333, + "grad_norm": 1.375921210815757, + "learning_rate": 1.0327466374867949e-05, + "loss": 0.6364, + "step": 4943 + }, + { + "epoch": 0.5050051072522983, + "grad_norm": 1.4588566545498547, + "learning_rate": 1.0324159798698294e-05, + "loss": 0.7213, + "step": 4944 + }, + { + "epoch": 0.5051072522982636, + "grad_norm": 1.5953367300601635, + "learning_rate": 1.032085318704914e-05, + "loss": 0.6758, + "step": 4945 + }, + { + "epoch": 0.5052093973442288, + "grad_norm": 1.5323012582712892, + "learning_rate": 1.0317546540282396e-05, + "loss": 0.8859, + "step": 4946 + }, + { + "epoch": 0.505311542390194, + "grad_norm": 1.4655779901171861, + "learning_rate": 1.031423985875998e-05, + "loss": 0.7718, + "step": 4947 + }, + { + "epoch": 0.5054136874361593, + "grad_norm": 1.5301276525679626, + "learning_rate": 1.0310933142843809e-05, + "loss": 0.7237, + "step": 4948 + }, + { + "epoch": 0.5055158324821246, + "grad_norm": 1.444305828810855, + "learning_rate": 1.0307626392895803e-05, + "loss": 0.8306, + "step": 4949 + }, + { + "epoch": 0.5056179775280899, + "grad_norm": 1.4782140246571118, + "learning_rate": 1.0304319609277888e-05, + "loss": 0.7849, + "step": 4950 + }, + { + "epoch": 0.5057201225740552, + "grad_norm": 1.3147184150489735, + "learning_rate": 1.0301012792351996e-05, + "loss": 0.5421, + "step": 4951 + }, + { + "epoch": 0.5058222676200205, + "grad_norm": 1.4724871708066771, + "learning_rate": 1.029770594248006e-05, + "loss": 0.8173, + "step": 4952 + }, + { + "epoch": 0.5059244126659856, + "grad_norm": 1.512684641290524, + "learning_rate": 1.0294399060024016e-05, + "loss": 0.7175, + "step": 4953 + }, + { + "epoch": 0.5060265577119509, + "grad_norm": 1.2904526686948803, + "learning_rate": 1.0291092145345807e-05, + "loss": 0.6908, + "step": 4954 + }, + { + "epoch": 0.5061287027579162, + "grad_norm": 1.4854167124457194, + "learning_rate": 1.0287785198807375e-05, + "loss": 0.715, + "step": 4955 + }, + { + "epoch": 0.5062308478038815, + "grad_norm": 1.4629714050175426, + "learning_rate": 1.028447822077066e-05, + "loss": 0.7592, + "step": 4956 + }, + { + "epoch": 0.5063329928498468, + "grad_norm": 1.5046595661930975, + "learning_rate": 1.0281171211597627e-05, + "loss": 0.7038, + "step": 4957 + }, + { + "epoch": 0.5064351378958121, + "grad_norm": 1.4931251335015692, + "learning_rate": 1.027786417165022e-05, + "loss": 0.7657, + "step": 4958 + }, + { + "epoch": 0.5065372829417774, + "grad_norm": 1.663608267459015, + "learning_rate": 1.0274557101290401e-05, + "loss": 0.7124, + "step": 4959 + }, + { + "epoch": 0.5066394279877426, + "grad_norm": 1.4416808762325581, + "learning_rate": 1.027125000088013e-05, + "loss": 0.7471, + "step": 4960 + }, + { + "epoch": 0.5067415730337078, + "grad_norm": 1.4715377292282, + "learning_rate": 1.026794287078137e-05, + "loss": 0.6257, + "step": 4961 + }, + { + "epoch": 0.5068437180796731, + "grad_norm": 1.4012413112055653, + "learning_rate": 1.0264635711356093e-05, + "loss": 0.6328, + "step": 4962 + }, + { + "epoch": 0.5069458631256384, + "grad_norm": 1.5136684014549189, + "learning_rate": 1.026132852296626e-05, + "loss": 0.7595, + "step": 4963 + }, + { + "epoch": 0.5070480081716037, + "grad_norm": 1.5072325247604184, + "learning_rate": 1.0258021305973855e-05, + "loss": 0.6832, + "step": 4964 + }, + { + "epoch": 0.507150153217569, + "grad_norm": 1.5772436889389359, + "learning_rate": 1.0254714060740853e-05, + "loss": 0.8105, + "step": 4965 + }, + { + "epoch": 0.5072522982635342, + "grad_norm": 1.541917004939391, + "learning_rate": 1.0251406787629232e-05, + "loss": 0.7555, + "step": 4966 + }, + { + "epoch": 0.5073544433094995, + "grad_norm": 1.5016372865708516, + "learning_rate": 1.0248099487000975e-05, + "loss": 0.6747, + "step": 4967 + }, + { + "epoch": 0.5074565883554648, + "grad_norm": 1.5279678030568928, + "learning_rate": 1.0244792159218066e-05, + "loss": 0.7458, + "step": 4968 + }, + { + "epoch": 0.50755873340143, + "grad_norm": 1.4293467953367496, + "learning_rate": 1.02414848046425e-05, + "loss": 0.7063, + "step": 4969 + }, + { + "epoch": 0.5076608784473953, + "grad_norm": 1.5631387792664038, + "learning_rate": 1.0238177423636266e-05, + "loss": 0.6517, + "step": 4970 + }, + { + "epoch": 0.5077630234933606, + "grad_norm": 1.4832788572734437, + "learning_rate": 1.023487001656136e-05, + "loss": 0.7222, + "step": 4971 + }, + { + "epoch": 0.5078651685393258, + "grad_norm": 1.4881679059526838, + "learning_rate": 1.0231562583779778e-05, + "loss": 0.725, + "step": 4972 + }, + { + "epoch": 0.5079673135852911, + "grad_norm": 1.3904776509961085, + "learning_rate": 1.022825512565352e-05, + "loss": 0.6542, + "step": 4973 + }, + { + "epoch": 0.5080694586312564, + "grad_norm": 1.4455967349824477, + "learning_rate": 1.0224947642544594e-05, + "loss": 0.6871, + "step": 4974 + }, + { + "epoch": 0.5081716036772217, + "grad_norm": 1.355783401547785, + "learning_rate": 1.0221640134815e-05, + "loss": 0.7243, + "step": 4975 + }, + { + "epoch": 0.508273748723187, + "grad_norm": 1.4043945277991996, + "learning_rate": 1.0218332602826751e-05, + "loss": 0.7347, + "step": 4976 + }, + { + "epoch": 0.5083758937691522, + "grad_norm": 1.4324937209978437, + "learning_rate": 1.021502504694186e-05, + "loss": 0.6995, + "step": 4977 + }, + { + "epoch": 0.5084780388151174, + "grad_norm": 1.4246427234437187, + "learning_rate": 1.0211717467522335e-05, + "loss": 0.7015, + "step": 4978 + }, + { + "epoch": 0.5085801838610827, + "grad_norm": 1.552543933066069, + "learning_rate": 1.02084098649302e-05, + "loss": 0.7515, + "step": 4979 + }, + { + "epoch": 0.508682328907048, + "grad_norm": 1.563332453309511, + "learning_rate": 1.0205102239527467e-05, + "loss": 0.7735, + "step": 4980 + }, + { + "epoch": 0.5087844739530133, + "grad_norm": 1.6258730233149554, + "learning_rate": 1.0201794591676164e-05, + "loss": 0.7516, + "step": 4981 + }, + { + "epoch": 0.5088866189989786, + "grad_norm": 1.3079701718078196, + "learning_rate": 1.0198486921738313e-05, + "loss": 0.5923, + "step": 4982 + }, + { + "epoch": 0.5089887640449439, + "grad_norm": 1.3943874781658374, + "learning_rate": 1.0195179230075937e-05, + "loss": 0.687, + "step": 4983 + }, + { + "epoch": 0.509090909090909, + "grad_norm": 1.33725374530559, + "learning_rate": 1.0191871517051072e-05, + "loss": 0.7005, + "step": 4984 + }, + { + "epoch": 0.5091930541368743, + "grad_norm": 1.4763386045288456, + "learning_rate": 1.0188563783025742e-05, + "loss": 0.7885, + "step": 4985 + }, + { + "epoch": 0.5092951991828396, + "grad_norm": 1.298126020394323, + "learning_rate": 1.0185256028361987e-05, + "loss": 0.6568, + "step": 4986 + }, + { + "epoch": 0.5093973442288049, + "grad_norm": 1.4426594538511717, + "learning_rate": 1.0181948253421839e-05, + "loss": 0.6926, + "step": 4987 + }, + { + "epoch": 0.5094994892747702, + "grad_norm": 1.3713714818637979, + "learning_rate": 1.0178640458567334e-05, + "loss": 0.7149, + "step": 4988 + }, + { + "epoch": 0.5096016343207355, + "grad_norm": 1.593644299862376, + "learning_rate": 1.0175332644160521e-05, + "loss": 0.7229, + "step": 4989 + }, + { + "epoch": 0.5097037793667007, + "grad_norm": 1.4474671841797542, + "learning_rate": 1.0172024810563435e-05, + "loss": 0.7814, + "step": 4990 + }, + { + "epoch": 0.509805924412666, + "grad_norm": 1.5879352394758206, + "learning_rate": 1.016871695813812e-05, + "loss": 0.809, + "step": 4991 + }, + { + "epoch": 0.5099080694586312, + "grad_norm": 1.5684126035661095, + "learning_rate": 1.0165409087246627e-05, + "loss": 0.6729, + "step": 4992 + }, + { + "epoch": 0.5100102145045965, + "grad_norm": 1.4508443582389976, + "learning_rate": 1.0162101198251002e-05, + "loss": 0.7103, + "step": 4993 + }, + { + "epoch": 0.5101123595505618, + "grad_norm": 1.5391466259624145, + "learning_rate": 1.0158793291513296e-05, + "loss": 0.6897, + "step": 4994 + }, + { + "epoch": 0.5102145045965271, + "grad_norm": 1.5408442295054745, + "learning_rate": 1.015548536739556e-05, + "loss": 0.645, + "step": 4995 + }, + { + "epoch": 0.5103166496424923, + "grad_norm": 1.4203063400317217, + "learning_rate": 1.0152177426259852e-05, + "loss": 0.7228, + "step": 4996 + }, + { + "epoch": 0.5104187946884576, + "grad_norm": 1.5422079458619808, + "learning_rate": 1.0148869468468225e-05, + "loss": 0.7558, + "step": 4997 + }, + { + "epoch": 0.5105209397344229, + "grad_norm": 1.5420226479208468, + "learning_rate": 1.0145561494382743e-05, + "loss": 0.7381, + "step": 4998 + }, + { + "epoch": 0.5106230847803882, + "grad_norm": 1.4938624075509517, + "learning_rate": 1.0142253504365458e-05, + "loss": 0.6824, + "step": 4999 + }, + { + "epoch": 0.5107252298263534, + "grad_norm": 1.661207342991121, + "learning_rate": 1.0138945498778433e-05, + "loss": 0.7235, + "step": 5000 + }, + { + "epoch": 0.5108273748723187, + "grad_norm": 1.3066948050772274, + "learning_rate": 1.0135637477983738e-05, + "loss": 0.6311, + "step": 5001 + }, + { + "epoch": 0.510929519918284, + "grad_norm": 1.471095721592753, + "learning_rate": 1.013232944234343e-05, + "loss": 0.7429, + "step": 5002 + }, + { + "epoch": 0.5110316649642492, + "grad_norm": 1.414695509550472, + "learning_rate": 1.012902139221958e-05, + "loss": 0.6409, + "step": 5003 + }, + { + "epoch": 0.5111338100102145, + "grad_norm": 1.4341900720333762, + "learning_rate": 1.0125713327974253e-05, + "loss": 0.7215, + "step": 5004 + }, + { + "epoch": 0.5112359550561798, + "grad_norm": 1.5705947198090602, + "learning_rate": 1.0122405249969525e-05, + "loss": 0.7216, + "step": 5005 + }, + { + "epoch": 0.5113381001021451, + "grad_norm": 1.343164978296261, + "learning_rate": 1.011909715856746e-05, + "loss": 0.7009, + "step": 5006 + }, + { + "epoch": 0.5114402451481103, + "grad_norm": 1.487391374263533, + "learning_rate": 1.0115789054130136e-05, + "loss": 0.6737, + "step": 5007 + }, + { + "epoch": 0.5115423901940755, + "grad_norm": 1.6439198424027581, + "learning_rate": 1.0112480937019624e-05, + "loss": 0.7974, + "step": 5008 + }, + { + "epoch": 0.5116445352400408, + "grad_norm": 1.378566787593172, + "learning_rate": 1.0109172807598005e-05, + "loss": 0.6539, + "step": 5009 + }, + { + "epoch": 0.5117466802860061, + "grad_norm": 1.5869236655144923, + "learning_rate": 1.0105864666227345e-05, + "loss": 0.6963, + "step": 5010 + }, + { + "epoch": 0.5118488253319714, + "grad_norm": 1.5379285055702219, + "learning_rate": 1.0102556513269735e-05, + "loss": 0.8451, + "step": 5011 + }, + { + "epoch": 0.5119509703779367, + "grad_norm": 1.3723288999572696, + "learning_rate": 1.0099248349087242e-05, + "loss": 0.6843, + "step": 5012 + }, + { + "epoch": 0.512053115423902, + "grad_norm": 1.3878120551259336, + "learning_rate": 1.0095940174041959e-05, + "loss": 0.6514, + "step": 5013 + }, + { + "epoch": 0.5121552604698673, + "grad_norm": 1.4770768181131304, + "learning_rate": 1.0092631988495957e-05, + "loss": 0.7153, + "step": 5014 + }, + { + "epoch": 0.5122574055158324, + "grad_norm": 1.5242024814533823, + "learning_rate": 1.0089323792811329e-05, + "loss": 0.778, + "step": 5015 + }, + { + "epoch": 0.5123595505617977, + "grad_norm": 1.5194101112736569, + "learning_rate": 1.0086015587350151e-05, + "loss": 0.6832, + "step": 5016 + }, + { + "epoch": 0.512461695607763, + "grad_norm": 1.4404030368731784, + "learning_rate": 1.0082707372474512e-05, + "loss": 0.7597, + "step": 5017 + }, + { + "epoch": 0.5125638406537283, + "grad_norm": 1.5012777644700137, + "learning_rate": 1.0079399148546498e-05, + "loss": 0.7031, + "step": 5018 + }, + { + "epoch": 0.5126659856996936, + "grad_norm": 1.5597456050725038, + "learning_rate": 1.0076090915928194e-05, + "loss": 0.7478, + "step": 5019 + }, + { + "epoch": 0.5127681307456589, + "grad_norm": 1.355423371053339, + "learning_rate": 1.007278267498169e-05, + "loss": 0.6733, + "step": 5020 + }, + { + "epoch": 0.5128702757916241, + "grad_norm": 1.6261570273539918, + "learning_rate": 1.006947442606908e-05, + "loss": 0.7526, + "step": 5021 + }, + { + "epoch": 0.5129724208375894, + "grad_norm": 1.5866921582001485, + "learning_rate": 1.0066166169552444e-05, + "loss": 0.7462, + "step": 5022 + }, + { + "epoch": 0.5130745658835546, + "grad_norm": 1.411489396812108, + "learning_rate": 1.0062857905793883e-05, + "loss": 0.6992, + "step": 5023 + }, + { + "epoch": 0.5131767109295199, + "grad_norm": 1.526969075607703, + "learning_rate": 1.0059549635155477e-05, + "loss": 0.6917, + "step": 5024 + }, + { + "epoch": 0.5132788559754852, + "grad_norm": 1.5264803022624562, + "learning_rate": 1.005624135799933e-05, + "loss": 0.6993, + "step": 5025 + }, + { + "epoch": 0.5133810010214505, + "grad_norm": 1.5105370698861917, + "learning_rate": 1.005293307468753e-05, + "loss": 0.7261, + "step": 5026 + }, + { + "epoch": 0.5134831460674157, + "grad_norm": 1.654678278100486, + "learning_rate": 1.0049624785582169e-05, + "loss": 0.7391, + "step": 5027 + }, + { + "epoch": 0.513585291113381, + "grad_norm": 1.5998962492844213, + "learning_rate": 1.0046316491045343e-05, + "loss": 0.829, + "step": 5028 + }, + { + "epoch": 0.5136874361593463, + "grad_norm": 1.4427022931432851, + "learning_rate": 1.0043008191439147e-05, + "loss": 0.6785, + "step": 5029 + }, + { + "epoch": 0.5137895812053116, + "grad_norm": 1.4729968855420275, + "learning_rate": 1.0039699887125678e-05, + "loss": 0.758, + "step": 5030 + }, + { + "epoch": 0.5138917262512768, + "grad_norm": 1.3596378548166397, + "learning_rate": 1.0036391578467031e-05, + "loss": 0.7359, + "step": 5031 + }, + { + "epoch": 0.513993871297242, + "grad_norm": 1.6382597227279492, + "learning_rate": 1.0033083265825301e-05, + "loss": 0.7313, + "step": 5032 + }, + { + "epoch": 0.5140960163432073, + "grad_norm": 1.5056283480443384, + "learning_rate": 1.0029774949562588e-05, + "loss": 0.7083, + "step": 5033 + }, + { + "epoch": 0.5141981613891726, + "grad_norm": 1.516225681957321, + "learning_rate": 1.0026466630040984e-05, + "loss": 0.7495, + "step": 5034 + }, + { + "epoch": 0.5143003064351379, + "grad_norm": 1.5442895582633052, + "learning_rate": 1.0023158307622594e-05, + "loss": 0.7819, + "step": 5035 + }, + { + "epoch": 0.5144024514811032, + "grad_norm": 1.3092253371373535, + "learning_rate": 1.001984998266951e-05, + "loss": 0.736, + "step": 5036 + }, + { + "epoch": 0.5145045965270685, + "grad_norm": 1.5615601268186117, + "learning_rate": 1.0016541655543833e-05, + "loss": 0.6946, + "step": 5037 + }, + { + "epoch": 0.5146067415730337, + "grad_norm": 1.4313658374837899, + "learning_rate": 1.001323332660766e-05, + "loss": 0.6383, + "step": 5038 + }, + { + "epoch": 0.5147088866189989, + "grad_norm": 1.466079894321012, + "learning_rate": 1.0009924996223093e-05, + "loss": 0.6764, + "step": 5039 + }, + { + "epoch": 0.5148110316649642, + "grad_norm": 1.5032568986496178, + "learning_rate": 1.0006616664752227e-05, + "loss": 0.737, + "step": 5040 + }, + { + "epoch": 0.5149131767109295, + "grad_norm": 1.4381561145015749, + "learning_rate": 1.0003308332557163e-05, + "loss": 0.6853, + "step": 5041 + }, + { + "epoch": 0.5150153217568948, + "grad_norm": 1.514054188961378, + "learning_rate": 1e-05, + "loss": 0.7041, + "step": 5042 + }, + { + "epoch": 0.5151174668028601, + "grad_norm": 1.5407023890316274, + "learning_rate": 9.99669166744284e-06, + "loss": 0.7084, + "step": 5043 + }, + { + "epoch": 0.5152196118488254, + "grad_norm": 1.4027229541091284, + "learning_rate": 9.993383335247777e-06, + "loss": 0.6677, + "step": 5044 + }, + { + "epoch": 0.5153217568947907, + "grad_norm": 1.5254464612568175, + "learning_rate": 9.990075003776913e-06, + "loss": 0.683, + "step": 5045 + }, + { + "epoch": 0.5154239019407558, + "grad_norm": 1.6714994435999713, + "learning_rate": 9.986766673392344e-06, + "loss": 0.6915, + "step": 5046 + }, + { + "epoch": 0.5155260469867211, + "grad_norm": 1.5161590193303969, + "learning_rate": 9.983458344456169e-06, + "loss": 0.7425, + "step": 5047 + }, + { + "epoch": 0.5156281920326864, + "grad_norm": 1.5545020095786672, + "learning_rate": 9.980150017330494e-06, + "loss": 0.7354, + "step": 5048 + }, + { + "epoch": 0.5157303370786517, + "grad_norm": 1.4609707613585636, + "learning_rate": 9.976841692377409e-06, + "loss": 0.6889, + "step": 5049 + }, + { + "epoch": 0.515832482124617, + "grad_norm": 1.4382173263031521, + "learning_rate": 9.973533369959018e-06, + "loss": 0.703, + "step": 5050 + }, + { + "epoch": 0.5159346271705822, + "grad_norm": 1.5368490777179438, + "learning_rate": 9.970225050437417e-06, + "loss": 0.7613, + "step": 5051 + }, + { + "epoch": 0.5160367722165475, + "grad_norm": 1.33728049232719, + "learning_rate": 9.966916734174702e-06, + "loss": 0.7442, + "step": 5052 + }, + { + "epoch": 0.5161389172625128, + "grad_norm": 1.5343228406577771, + "learning_rate": 9.963608421532972e-06, + "loss": 0.6912, + "step": 5053 + }, + { + "epoch": 0.516241062308478, + "grad_norm": 1.4562095962932375, + "learning_rate": 9.960300112874327e-06, + "loss": 0.819, + "step": 5054 + }, + { + "epoch": 0.5163432073544433, + "grad_norm": 1.3758092454252902, + "learning_rate": 9.956991808560855e-06, + "loss": 0.6927, + "step": 5055 + }, + { + "epoch": 0.5164453524004086, + "grad_norm": 1.461762282121934, + "learning_rate": 9.953683508954659e-06, + "loss": 0.7272, + "step": 5056 + }, + { + "epoch": 0.5165474974463738, + "grad_norm": 1.4588567569296538, + "learning_rate": 9.950375214417833e-06, + "loss": 0.6181, + "step": 5057 + }, + { + "epoch": 0.5166496424923391, + "grad_norm": 1.509143891973364, + "learning_rate": 9.947066925312472e-06, + "loss": 0.8049, + "step": 5058 + }, + { + "epoch": 0.5167517875383044, + "grad_norm": 1.472274964002463, + "learning_rate": 9.943758642000673e-06, + "loss": 0.7034, + "step": 5059 + }, + { + "epoch": 0.5168539325842697, + "grad_norm": 1.5356466968300069, + "learning_rate": 9.940450364844525e-06, + "loss": 0.7741, + "step": 5060 + }, + { + "epoch": 0.5169560776302349, + "grad_norm": 1.6163531903171955, + "learning_rate": 9.937142094206122e-06, + "loss": 0.8499, + "step": 5061 + }, + { + "epoch": 0.5170582226762002, + "grad_norm": 1.3900653036177841, + "learning_rate": 9.93383383044756e-06, + "loss": 0.6009, + "step": 5062 + }, + { + "epoch": 0.5171603677221654, + "grad_norm": 1.4816395478688142, + "learning_rate": 9.930525573930922e-06, + "loss": 0.7708, + "step": 5063 + }, + { + "epoch": 0.5172625127681307, + "grad_norm": 1.3870062707823916, + "learning_rate": 9.927217325018309e-06, + "loss": 0.6804, + "step": 5064 + }, + { + "epoch": 0.517364657814096, + "grad_norm": 1.4878997443143618, + "learning_rate": 9.923909084071808e-06, + "loss": 0.6627, + "step": 5065 + }, + { + "epoch": 0.5174668028600613, + "grad_norm": 1.3972291301998636, + "learning_rate": 9.920600851453505e-06, + "loss": 0.7243, + "step": 5066 + }, + { + "epoch": 0.5175689479060266, + "grad_norm": 1.5377944516763358, + "learning_rate": 9.917292627525493e-06, + "loss": 0.6708, + "step": 5067 + }, + { + "epoch": 0.5176710929519919, + "grad_norm": 1.4612159782052174, + "learning_rate": 9.913984412649852e-06, + "loss": 0.642, + "step": 5068 + }, + { + "epoch": 0.517773237997957, + "grad_norm": 1.4930830348889437, + "learning_rate": 9.910676207188676e-06, + "loss": 0.6929, + "step": 5069 + }, + { + "epoch": 0.5178753830439223, + "grad_norm": 1.6392874611336676, + "learning_rate": 9.907368011504044e-06, + "loss": 0.7594, + "step": 5070 + }, + { + "epoch": 0.5179775280898876, + "grad_norm": 1.3046372773311206, + "learning_rate": 9.904059825958043e-06, + "loss": 0.7213, + "step": 5071 + }, + { + "epoch": 0.5180796731358529, + "grad_norm": 1.5750731969422274, + "learning_rate": 9.90075165091276e-06, + "loss": 0.7149, + "step": 5072 + }, + { + "epoch": 0.5181818181818182, + "grad_norm": 1.4760942067561031, + "learning_rate": 9.897443486730268e-06, + "loss": 0.7305, + "step": 5073 + }, + { + "epoch": 0.5182839632277835, + "grad_norm": 1.399452614218617, + "learning_rate": 9.894135333772657e-06, + "loss": 0.6304, + "step": 5074 + }, + { + "epoch": 0.5183861082737488, + "grad_norm": 1.606809156412251, + "learning_rate": 9.890827192402e-06, + "loss": 0.7522, + "step": 5075 + }, + { + "epoch": 0.518488253319714, + "grad_norm": 1.368347899673904, + "learning_rate": 9.88751906298038e-06, + "loss": 0.7237, + "step": 5076 + }, + { + "epoch": 0.5185903983656792, + "grad_norm": 1.4199384682758185, + "learning_rate": 9.88421094586987e-06, + "loss": 0.8127, + "step": 5077 + }, + { + "epoch": 0.5186925434116445, + "grad_norm": 1.4552173781406839, + "learning_rate": 9.880902841432544e-06, + "loss": 0.6946, + "step": 5078 + }, + { + "epoch": 0.5187946884576098, + "grad_norm": 1.4544203174762038, + "learning_rate": 9.877594750030477e-06, + "loss": 0.6478, + "step": 5079 + }, + { + "epoch": 0.5188968335035751, + "grad_norm": 1.5768149025467835, + "learning_rate": 9.874286672025749e-06, + "loss": 0.7469, + "step": 5080 + }, + { + "epoch": 0.5189989785495404, + "grad_norm": 1.44510525526849, + "learning_rate": 9.870978607780423e-06, + "loss": 0.7796, + "step": 5081 + }, + { + "epoch": 0.5191011235955056, + "grad_norm": 1.5508701880248372, + "learning_rate": 9.867670557656575e-06, + "loss": 0.6764, + "step": 5082 + }, + { + "epoch": 0.5192032686414709, + "grad_norm": 1.1860944094652088, + "learning_rate": 9.864362522016266e-06, + "loss": 0.6131, + "step": 5083 + }, + { + "epoch": 0.5193054136874362, + "grad_norm": 1.3638236151727117, + "learning_rate": 9.861054501221569e-06, + "loss": 0.6832, + "step": 5084 + }, + { + "epoch": 0.5194075587334014, + "grad_norm": 1.4411377216565255, + "learning_rate": 9.857746495634547e-06, + "loss": 0.7492, + "step": 5085 + }, + { + "epoch": 0.5195097037793667, + "grad_norm": 1.6104760989757423, + "learning_rate": 9.854438505617264e-06, + "loss": 0.6986, + "step": 5086 + }, + { + "epoch": 0.519611848825332, + "grad_norm": 1.538034773615667, + "learning_rate": 9.851130531531775e-06, + "loss": 0.7912, + "step": 5087 + }, + { + "epoch": 0.5197139938712972, + "grad_norm": 1.4058910173464245, + "learning_rate": 9.847822573740148e-06, + "loss": 0.7131, + "step": 5088 + }, + { + "epoch": 0.5198161389172625, + "grad_norm": 1.4674621166643893, + "learning_rate": 9.844514632604441e-06, + "loss": 0.6725, + "step": 5089 + }, + { + "epoch": 0.5199182839632278, + "grad_norm": 1.4618057673559015, + "learning_rate": 9.841206708486705e-06, + "loss": 0.5635, + "step": 5090 + }, + { + "epoch": 0.5200204290091931, + "grad_norm": 1.5492083622202624, + "learning_rate": 9.837898801749001e-06, + "loss": 0.6652, + "step": 5091 + }, + { + "epoch": 0.5201225740551583, + "grad_norm": 1.46502543553936, + "learning_rate": 9.834590912753376e-06, + "loss": 0.7195, + "step": 5092 + }, + { + "epoch": 0.5202247191011236, + "grad_norm": 1.489313570097098, + "learning_rate": 9.831283041861883e-06, + "loss": 0.7057, + "step": 5093 + }, + { + "epoch": 0.5203268641470888, + "grad_norm": 1.440211829884361, + "learning_rate": 9.827975189436572e-06, + "loss": 0.7159, + "step": 5094 + }, + { + "epoch": 0.5204290091930541, + "grad_norm": 1.6593988218012934, + "learning_rate": 9.824667355839479e-06, + "loss": 0.7569, + "step": 5095 + }, + { + "epoch": 0.5205311542390194, + "grad_norm": 1.4306533723228798, + "learning_rate": 9.821359541432664e-06, + "loss": 0.7001, + "step": 5096 + }, + { + "epoch": 0.5206332992849847, + "grad_norm": 1.598567843413237, + "learning_rate": 9.818051746578165e-06, + "loss": 0.7744, + "step": 5097 + }, + { + "epoch": 0.52073544433095, + "grad_norm": 1.4634821203353316, + "learning_rate": 9.814743971638016e-06, + "loss": 0.7436, + "step": 5098 + }, + { + "epoch": 0.5208375893769153, + "grad_norm": 1.4082152457858663, + "learning_rate": 9.81143621697426e-06, + "loss": 0.7359, + "step": 5099 + }, + { + "epoch": 0.5209397344228804, + "grad_norm": 1.6112140055927395, + "learning_rate": 9.808128482948932e-06, + "loss": 0.7314, + "step": 5100 + }, + { + "epoch": 0.5210418794688457, + "grad_norm": 1.4608811343047152, + "learning_rate": 9.804820769924066e-06, + "loss": 0.7638, + "step": 5101 + }, + { + "epoch": 0.521144024514811, + "grad_norm": 1.4543920651397524, + "learning_rate": 9.801513078261692e-06, + "loss": 0.7281, + "step": 5102 + }, + { + "epoch": 0.5212461695607763, + "grad_norm": 1.4715674723462617, + "learning_rate": 9.798205408323836e-06, + "loss": 0.7847, + "step": 5103 + }, + { + "epoch": 0.5213483146067416, + "grad_norm": 1.3786978048574219, + "learning_rate": 9.794897760472533e-06, + "loss": 0.6709, + "step": 5104 + }, + { + "epoch": 0.5214504596527069, + "grad_norm": 1.5521306073102712, + "learning_rate": 9.791590135069802e-06, + "loss": 0.7853, + "step": 5105 + }, + { + "epoch": 0.5215526046986722, + "grad_norm": 1.5015215760675327, + "learning_rate": 9.788282532477667e-06, + "loss": 0.781, + "step": 5106 + }, + { + "epoch": 0.5216547497446374, + "grad_norm": 1.395912833914741, + "learning_rate": 9.784974953058142e-06, + "loss": 0.7016, + "step": 5107 + }, + { + "epoch": 0.5217568947906026, + "grad_norm": 1.7594742889275699, + "learning_rate": 9.78166739717325e-06, + "loss": 0.7252, + "step": 5108 + }, + { + "epoch": 0.5218590398365679, + "grad_norm": 1.6559845561480964, + "learning_rate": 9.778359865185003e-06, + "loss": 0.7224, + "step": 5109 + }, + { + "epoch": 0.5219611848825332, + "grad_norm": 1.4170131301667364, + "learning_rate": 9.775052357455411e-06, + "loss": 0.7269, + "step": 5110 + }, + { + "epoch": 0.5220633299284985, + "grad_norm": 1.4962461688736937, + "learning_rate": 9.77174487434648e-06, + "loss": 0.6251, + "step": 5111 + }, + { + "epoch": 0.5221654749744638, + "grad_norm": 1.4369182092448913, + "learning_rate": 9.768437416220224e-06, + "loss": 0.7718, + "step": 5112 + }, + { + "epoch": 0.522267620020429, + "grad_norm": 1.3128302729015586, + "learning_rate": 9.765129983438642e-06, + "loss": 0.657, + "step": 5113 + }, + { + "epoch": 0.5223697650663943, + "grad_norm": 1.3797081584161954, + "learning_rate": 9.761822576363737e-06, + "loss": 0.6988, + "step": 5114 + }, + { + "epoch": 0.5224719101123596, + "grad_norm": 1.458159366944201, + "learning_rate": 9.758515195357501e-06, + "loss": 0.6998, + "step": 5115 + }, + { + "epoch": 0.5225740551583248, + "grad_norm": 1.5475287283222443, + "learning_rate": 9.755207840781937e-06, + "loss": 0.798, + "step": 5116 + }, + { + "epoch": 0.5226762002042901, + "grad_norm": 1.3773093236034852, + "learning_rate": 9.75190051299903e-06, + "loss": 0.7541, + "step": 5117 + }, + { + "epoch": 0.5227783452502553, + "grad_norm": 1.4254443804497317, + "learning_rate": 9.748593212370773e-06, + "loss": 0.6485, + "step": 5118 + }, + { + "epoch": 0.5228804902962206, + "grad_norm": 1.4301000627484601, + "learning_rate": 9.745285939259148e-06, + "loss": 0.7523, + "step": 5119 + }, + { + "epoch": 0.5229826353421859, + "grad_norm": 1.4920997205350683, + "learning_rate": 9.741978694026145e-06, + "loss": 0.7864, + "step": 5120 + }, + { + "epoch": 0.5230847803881512, + "grad_norm": 1.5106081143517995, + "learning_rate": 9.738671477033741e-06, + "loss": 0.8059, + "step": 5121 + }, + { + "epoch": 0.5231869254341165, + "grad_norm": 1.4139685284321442, + "learning_rate": 9.735364288643912e-06, + "loss": 0.7168, + "step": 5122 + }, + { + "epoch": 0.5232890704800817, + "grad_norm": 1.426102819257782, + "learning_rate": 9.732057129218634e-06, + "loss": 0.655, + "step": 5123 + }, + { + "epoch": 0.523391215526047, + "grad_norm": 1.4165945918142098, + "learning_rate": 9.728749999119872e-06, + "loss": 0.7276, + "step": 5124 + }, + { + "epoch": 0.5234933605720122, + "grad_norm": 1.5830020601416201, + "learning_rate": 9.725442898709604e-06, + "loss": 0.6914, + "step": 5125 + }, + { + "epoch": 0.5235955056179775, + "grad_norm": 1.5743064718683806, + "learning_rate": 9.722135828349784e-06, + "loss": 0.7779, + "step": 5126 + }, + { + "epoch": 0.5236976506639428, + "grad_norm": 1.5482270638775972, + "learning_rate": 9.718828788402374e-06, + "loss": 0.7639, + "step": 5127 + }, + { + "epoch": 0.5237997957099081, + "grad_norm": 1.453334098293946, + "learning_rate": 9.715521779229342e-06, + "loss": 0.6977, + "step": 5128 + }, + { + "epoch": 0.5239019407558734, + "grad_norm": 1.5625917186930427, + "learning_rate": 9.71221480119263e-06, + "loss": 0.7718, + "step": 5129 + }, + { + "epoch": 0.5240040858018387, + "grad_norm": 1.4521412400899234, + "learning_rate": 9.708907854654198e-06, + "loss": 0.6889, + "step": 5130 + }, + { + "epoch": 0.5241062308478038, + "grad_norm": 1.398189493511134, + "learning_rate": 9.705600939975988e-06, + "loss": 0.7046, + "step": 5131 + }, + { + "epoch": 0.5242083758937691, + "grad_norm": 1.506521122546969, + "learning_rate": 9.702294057519943e-06, + "loss": 0.7626, + "step": 5132 + }, + { + "epoch": 0.5243105209397344, + "grad_norm": 1.6063259208396754, + "learning_rate": 9.698987207648009e-06, + "loss": 0.715, + "step": 5133 + }, + { + "epoch": 0.5244126659856997, + "grad_norm": 1.4726721475135285, + "learning_rate": 9.695680390722117e-06, + "loss": 0.6684, + "step": 5134 + }, + { + "epoch": 0.524514811031665, + "grad_norm": 1.3268591401007739, + "learning_rate": 9.692373607104199e-06, + "loss": 0.7215, + "step": 5135 + }, + { + "epoch": 0.5246169560776303, + "grad_norm": 1.472933904520735, + "learning_rate": 9.689066857156196e-06, + "loss": 0.6952, + "step": 5136 + }, + { + "epoch": 0.5247191011235955, + "grad_norm": 1.4285027928044267, + "learning_rate": 9.685760141240022e-06, + "loss": 0.6048, + "step": 5137 + }, + { + "epoch": 0.5248212461695608, + "grad_norm": 1.506939851132931, + "learning_rate": 9.682453459717607e-06, + "loss": 0.765, + "step": 5138 + }, + { + "epoch": 0.524923391215526, + "grad_norm": 1.6195691401487944, + "learning_rate": 9.679146812950863e-06, + "loss": 0.7709, + "step": 5139 + }, + { + "epoch": 0.5250255362614913, + "grad_norm": 1.407943020312896, + "learning_rate": 9.675840201301709e-06, + "loss": 0.7129, + "step": 5140 + }, + { + "epoch": 0.5251276813074566, + "grad_norm": 1.4612211684616374, + "learning_rate": 9.672533625132053e-06, + "loss": 0.7242, + "step": 5141 + }, + { + "epoch": 0.5252298263534219, + "grad_norm": 1.5344220960942356, + "learning_rate": 9.669227084803806e-06, + "loss": 0.7248, + "step": 5142 + }, + { + "epoch": 0.5253319713993871, + "grad_norm": 1.491028807803594, + "learning_rate": 9.665920580678863e-06, + "loss": 0.6781, + "step": 5143 + }, + { + "epoch": 0.5254341164453524, + "grad_norm": 1.652221921116902, + "learning_rate": 9.662614113119132e-06, + "loss": 0.8428, + "step": 5144 + }, + { + "epoch": 0.5255362614913177, + "grad_norm": 1.3219219861732163, + "learning_rate": 9.659307682486506e-06, + "loss": 0.6062, + "step": 5145 + }, + { + "epoch": 0.5256384065372829, + "grad_norm": 1.5840142994394426, + "learning_rate": 9.656001289142872e-06, + "loss": 0.6878, + "step": 5146 + }, + { + "epoch": 0.5257405515832482, + "grad_norm": 1.3400159775019036, + "learning_rate": 9.65269493345012e-06, + "loss": 0.6692, + "step": 5147 + }, + { + "epoch": 0.5258426966292135, + "grad_norm": 1.6175365239293367, + "learning_rate": 9.649388615770134e-06, + "loss": 0.7748, + "step": 5148 + }, + { + "epoch": 0.5259448416751787, + "grad_norm": 1.4617545388348845, + "learning_rate": 9.646082336464787e-06, + "loss": 0.6759, + "step": 5149 + }, + { + "epoch": 0.526046986721144, + "grad_norm": 1.3685413776591426, + "learning_rate": 9.642776095895959e-06, + "loss": 0.6313, + "step": 5150 + }, + { + "epoch": 0.5261491317671093, + "grad_norm": 1.5030004027073611, + "learning_rate": 9.639469894425515e-06, + "loss": 0.6903, + "step": 5151 + }, + { + "epoch": 0.5262512768130746, + "grad_norm": 1.5636079019727214, + "learning_rate": 9.636163732415325e-06, + "loss": 0.708, + "step": 5152 + }, + { + "epoch": 0.5263534218590399, + "grad_norm": 1.5003921346173024, + "learning_rate": 9.632857610227251e-06, + "loss": 0.6825, + "step": 5153 + }, + { + "epoch": 0.5264555669050051, + "grad_norm": 1.482840292955541, + "learning_rate": 9.629551528223147e-06, + "loss": 0.7052, + "step": 5154 + }, + { + "epoch": 0.5265577119509703, + "grad_norm": 1.480367382004129, + "learning_rate": 9.626245486764872e-06, + "loss": 0.7671, + "step": 5155 + }, + { + "epoch": 0.5266598569969356, + "grad_norm": 1.4598999597691995, + "learning_rate": 9.622939486214265e-06, + "loss": 0.7796, + "step": 5156 + }, + { + "epoch": 0.5267620020429009, + "grad_norm": 1.4526246243513843, + "learning_rate": 9.619633526933178e-06, + "loss": 0.7693, + "step": 5157 + }, + { + "epoch": 0.5268641470888662, + "grad_norm": 1.483152338222519, + "learning_rate": 9.616327609283445e-06, + "loss": 0.7343, + "step": 5158 + }, + { + "epoch": 0.5269662921348315, + "grad_norm": 1.4935644627300313, + "learning_rate": 9.613021733626901e-06, + "loss": 0.7264, + "step": 5159 + }, + { + "epoch": 0.5270684371807968, + "grad_norm": 1.4892375072301431, + "learning_rate": 9.609715900325382e-06, + "loss": 0.7474, + "step": 5160 + }, + { + "epoch": 0.527170582226762, + "grad_norm": 1.700501327049568, + "learning_rate": 9.606410109740708e-06, + "loss": 0.7129, + "step": 5161 + }, + { + "epoch": 0.5272727272727272, + "grad_norm": 1.4575998933806678, + "learning_rate": 9.603104362234706e-06, + "loss": 0.7091, + "step": 5162 + }, + { + "epoch": 0.5273748723186925, + "grad_norm": 1.5009987290615843, + "learning_rate": 9.599798658169183e-06, + "loss": 0.6751, + "step": 5163 + }, + { + "epoch": 0.5274770173646578, + "grad_norm": 1.5089684752005672, + "learning_rate": 9.59649299790596e-06, + "loss": 0.636, + "step": 5164 + }, + { + "epoch": 0.5275791624106231, + "grad_norm": 1.4323652624834637, + "learning_rate": 9.593187381806836e-06, + "loss": 0.6815, + "step": 5165 + }, + { + "epoch": 0.5276813074565884, + "grad_norm": 1.5379467146294474, + "learning_rate": 9.589881810233617e-06, + "loss": 0.6455, + "step": 5166 + }, + { + "epoch": 0.5277834525025537, + "grad_norm": 1.4597742437757253, + "learning_rate": 9.586576283548094e-06, + "loss": 0.6074, + "step": 5167 + }, + { + "epoch": 0.5278855975485189, + "grad_norm": 1.359526809507787, + "learning_rate": 9.583270802112071e-06, + "loss": 0.7651, + "step": 5168 + }, + { + "epoch": 0.5279877425944842, + "grad_norm": 1.4085023448290075, + "learning_rate": 9.579965366287323e-06, + "loss": 0.653, + "step": 5169 + }, + { + "epoch": 0.5280898876404494, + "grad_norm": 1.3708279307672784, + "learning_rate": 9.576659976435642e-06, + "loss": 0.702, + "step": 5170 + }, + { + "epoch": 0.5281920326864147, + "grad_norm": 1.4952205593909023, + "learning_rate": 9.573354632918795e-06, + "loss": 0.6497, + "step": 5171 + }, + { + "epoch": 0.52829417773238, + "grad_norm": 1.3282727066725983, + "learning_rate": 9.570049336098564e-06, + "loss": 0.6484, + "step": 5172 + }, + { + "epoch": 0.5283963227783453, + "grad_norm": 1.5235616748837848, + "learning_rate": 9.566744086336706e-06, + "loss": 0.685, + "step": 5173 + }, + { + "epoch": 0.5284984678243105, + "grad_norm": 1.3512743226983603, + "learning_rate": 9.563438883994991e-06, + "loss": 0.6675, + "step": 5174 + }, + { + "epoch": 0.5286006128702758, + "grad_norm": 1.4272422143408126, + "learning_rate": 9.560133729435169e-06, + "loss": 0.7023, + "step": 5175 + }, + { + "epoch": 0.5287027579162411, + "grad_norm": 1.6387202021569567, + "learning_rate": 9.556828623018995e-06, + "loss": 0.8347, + "step": 5176 + }, + { + "epoch": 0.5288049029622063, + "grad_norm": 1.4893469388334897, + "learning_rate": 9.553523565108218e-06, + "loss": 0.7538, + "step": 5177 + }, + { + "epoch": 0.5289070480081716, + "grad_norm": 1.4249779120486283, + "learning_rate": 9.550218556064571e-06, + "loss": 0.7235, + "step": 5178 + }, + { + "epoch": 0.5290091930541369, + "grad_norm": 1.4211583157086818, + "learning_rate": 9.546913596249799e-06, + "loss": 0.7038, + "step": 5179 + }, + { + "epoch": 0.5291113381001021, + "grad_norm": 1.5960536678655683, + "learning_rate": 9.543608686025623e-06, + "loss": 0.7541, + "step": 5180 + }, + { + "epoch": 0.5292134831460674, + "grad_norm": 1.2827435194768086, + "learning_rate": 9.540303825753773e-06, + "loss": 0.6395, + "step": 5181 + }, + { + "epoch": 0.5293156281920327, + "grad_norm": 1.5285437181298687, + "learning_rate": 9.536999015795964e-06, + "loss": 0.6846, + "step": 5182 + }, + { + "epoch": 0.529417773237998, + "grad_norm": 1.6057062622426963, + "learning_rate": 9.53369425651391e-06, + "loss": 0.7851, + "step": 5183 + }, + { + "epoch": 0.5295199182839633, + "grad_norm": 1.423785849295611, + "learning_rate": 9.530389548269325e-06, + "loss": 0.7498, + "step": 5184 + }, + { + "epoch": 0.5296220633299284, + "grad_norm": 1.3436873685320918, + "learning_rate": 9.527084891423908e-06, + "loss": 0.6765, + "step": 5185 + }, + { + "epoch": 0.5297242083758937, + "grad_norm": 1.4496191456932257, + "learning_rate": 9.523780286339353e-06, + "loss": 0.7391, + "step": 5186 + }, + { + "epoch": 0.529826353421859, + "grad_norm": 1.6439732869842854, + "learning_rate": 9.520475733377355e-06, + "loss": 0.7116, + "step": 5187 + }, + { + "epoch": 0.5299284984678243, + "grad_norm": 1.545366081554161, + "learning_rate": 9.517171232899595e-06, + "loss": 0.739, + "step": 5188 + }, + { + "epoch": 0.5300306435137896, + "grad_norm": 1.4986960578817448, + "learning_rate": 9.513866785267758e-06, + "loss": 0.7468, + "step": 5189 + }, + { + "epoch": 0.5301327885597549, + "grad_norm": 1.7446155275946975, + "learning_rate": 9.510562390843514e-06, + "loss": 0.7508, + "step": 5190 + }, + { + "epoch": 0.5302349336057202, + "grad_norm": 1.4896490666947961, + "learning_rate": 9.50725804998853e-06, + "loss": 0.7514, + "step": 5191 + }, + { + "epoch": 0.5303370786516854, + "grad_norm": 1.525841631588098, + "learning_rate": 9.503953763064475e-06, + "loss": 0.6952, + "step": 5192 + }, + { + "epoch": 0.5304392236976506, + "grad_norm": 1.5430319354369864, + "learning_rate": 9.500649530432998e-06, + "loss": 0.6683, + "step": 5193 + }, + { + "epoch": 0.5305413687436159, + "grad_norm": 1.4622079798359395, + "learning_rate": 9.497345352455757e-06, + "loss": 0.6864, + "step": 5194 + }, + { + "epoch": 0.5306435137895812, + "grad_norm": 1.410600549756591, + "learning_rate": 9.494041229494388e-06, + "loss": 0.7752, + "step": 5195 + }, + { + "epoch": 0.5307456588355465, + "grad_norm": 1.5123284249259115, + "learning_rate": 9.490737161910536e-06, + "loss": 0.6101, + "step": 5196 + }, + { + "epoch": 0.5308478038815118, + "grad_norm": 1.5394335469023082, + "learning_rate": 9.487433150065827e-06, + "loss": 0.7096, + "step": 5197 + }, + { + "epoch": 0.530949948927477, + "grad_norm": 1.4027111356414252, + "learning_rate": 9.484129194321896e-06, + "loss": 0.7492, + "step": 5198 + }, + { + "epoch": 0.5310520939734423, + "grad_norm": 1.5076294639375596, + "learning_rate": 9.480825295040352e-06, + "loss": 0.749, + "step": 5199 + }, + { + "epoch": 0.5311542390194075, + "grad_norm": 1.5401512574061447, + "learning_rate": 9.477521452582817e-06, + "loss": 0.8157, + "step": 5200 + }, + { + "epoch": 0.5312563840653728, + "grad_norm": 1.7117409286432816, + "learning_rate": 9.474217667310899e-06, + "loss": 0.7423, + "step": 5201 + }, + { + "epoch": 0.5313585291113381, + "grad_norm": 1.4661050105078965, + "learning_rate": 9.470913939586198e-06, + "loss": 0.6627, + "step": 5202 + }, + { + "epoch": 0.5314606741573034, + "grad_norm": 1.401855504032334, + "learning_rate": 9.467610269770305e-06, + "loss": 0.6642, + "step": 5203 + }, + { + "epoch": 0.5315628192032686, + "grad_norm": 1.2906304813753924, + "learning_rate": 9.464306658224814e-06, + "loss": 0.67, + "step": 5204 + }, + { + "epoch": 0.5316649642492339, + "grad_norm": 1.4643820762637774, + "learning_rate": 9.461003105311301e-06, + "loss": 0.664, + "step": 5205 + }, + { + "epoch": 0.5317671092951992, + "grad_norm": 1.3580602625602194, + "learning_rate": 9.457699611391353e-06, + "loss": 0.6015, + "step": 5206 + }, + { + "epoch": 0.5318692543411645, + "grad_norm": 1.4537758891906634, + "learning_rate": 9.454396176826526e-06, + "loss": 0.7009, + "step": 5207 + }, + { + "epoch": 0.5319713993871297, + "grad_norm": 1.5132973527389146, + "learning_rate": 9.451092801978392e-06, + "loss": 0.7353, + "step": 5208 + }, + { + "epoch": 0.532073544433095, + "grad_norm": 1.5687592504040881, + "learning_rate": 9.447789487208507e-06, + "loss": 0.7545, + "step": 5209 + }, + { + "epoch": 0.5321756894790602, + "grad_norm": 1.5998407060957576, + "learning_rate": 9.44448623287842e-06, + "loss": 0.7174, + "step": 5210 + }, + { + "epoch": 0.5322778345250255, + "grad_norm": 1.3080205329773444, + "learning_rate": 9.441183039349673e-06, + "loss": 0.6654, + "step": 5211 + }, + { + "epoch": 0.5323799795709908, + "grad_norm": 1.5639283786139195, + "learning_rate": 9.437879906983802e-06, + "loss": 0.7279, + "step": 5212 + }, + { + "epoch": 0.5324821246169561, + "grad_norm": 1.357514135423471, + "learning_rate": 9.434576836142342e-06, + "loss": 0.6818, + "step": 5213 + }, + { + "epoch": 0.5325842696629214, + "grad_norm": 1.4100995588972292, + "learning_rate": 9.43127382718681e-06, + "loss": 0.7335, + "step": 5214 + }, + { + "epoch": 0.5326864147088867, + "grad_norm": 1.568960220925224, + "learning_rate": 9.427970880478722e-06, + "loss": 0.7584, + "step": 5215 + }, + { + "epoch": 0.5327885597548518, + "grad_norm": 1.5011030885054004, + "learning_rate": 9.424667996379595e-06, + "loss": 0.7565, + "step": 5216 + }, + { + "epoch": 0.5328907048008171, + "grad_norm": 1.5894263004873823, + "learning_rate": 9.421365175250926e-06, + "loss": 0.7215, + "step": 5217 + }, + { + "epoch": 0.5329928498467824, + "grad_norm": 1.7578176063141788, + "learning_rate": 9.418062417454213e-06, + "loss": 0.7107, + "step": 5218 + }, + { + "epoch": 0.5330949948927477, + "grad_norm": 1.440778242320715, + "learning_rate": 9.414759723350946e-06, + "loss": 0.7162, + "step": 5219 + }, + { + "epoch": 0.533197139938713, + "grad_norm": 1.5198544947260326, + "learning_rate": 9.411457093302602e-06, + "loss": 0.6941, + "step": 5220 + }, + { + "epoch": 0.5332992849846783, + "grad_norm": 1.5739514159496326, + "learning_rate": 9.408154527670664e-06, + "loss": 0.7206, + "step": 5221 + }, + { + "epoch": 0.5334014300306436, + "grad_norm": 1.5434643563777037, + "learning_rate": 9.404852026816591e-06, + "loss": 0.7277, + "step": 5222 + }, + { + "epoch": 0.5335035750766088, + "grad_norm": 1.3102973594194431, + "learning_rate": 9.401549591101846e-06, + "loss": 0.6163, + "step": 5223 + }, + { + "epoch": 0.533605720122574, + "grad_norm": 1.582047805245025, + "learning_rate": 9.398247220887891e-06, + "loss": 0.714, + "step": 5224 + }, + { + "epoch": 0.5337078651685393, + "grad_norm": 1.8185450745973895, + "learning_rate": 9.394944916536162e-06, + "loss": 0.7698, + "step": 5225 + }, + { + "epoch": 0.5338100102145046, + "grad_norm": 1.553943699560829, + "learning_rate": 9.391642678408106e-06, + "loss": 0.7408, + "step": 5226 + }, + { + "epoch": 0.5339121552604699, + "grad_norm": 1.4702776391165397, + "learning_rate": 9.38834050686515e-06, + "loss": 0.7308, + "step": 5227 + }, + { + "epoch": 0.5340143003064352, + "grad_norm": 1.5622365292883602, + "learning_rate": 9.385038402268723e-06, + "loss": 0.7105, + "step": 5228 + }, + { + "epoch": 0.5341164453524004, + "grad_norm": 1.332354262114744, + "learning_rate": 9.381736364980238e-06, + "loss": 0.603, + "step": 5229 + }, + { + "epoch": 0.5342185903983657, + "grad_norm": 1.5320921500301965, + "learning_rate": 9.37843439536111e-06, + "loss": 0.7429, + "step": 5230 + }, + { + "epoch": 0.5343207354443309, + "grad_norm": 1.3837328853944448, + "learning_rate": 9.375132493772732e-06, + "loss": 0.733, + "step": 5231 + }, + { + "epoch": 0.5344228804902962, + "grad_norm": 1.5242104816405373, + "learning_rate": 9.37183066057651e-06, + "loss": 0.6769, + "step": 5232 + }, + { + "epoch": 0.5345250255362615, + "grad_norm": 1.5727672737205118, + "learning_rate": 9.368528896133831e-06, + "loss": 0.5786, + "step": 5233 + }, + { + "epoch": 0.5346271705822268, + "grad_norm": 1.4921052624260152, + "learning_rate": 9.365227200806069e-06, + "loss": 0.7184, + "step": 5234 + }, + { + "epoch": 0.534729315628192, + "grad_norm": 1.4130026584586142, + "learning_rate": 9.361925574954603e-06, + "loss": 0.6841, + "step": 5235 + }, + { + "epoch": 0.5348314606741573, + "grad_norm": 1.365519194615162, + "learning_rate": 9.358624018940795e-06, + "loss": 0.6369, + "step": 5236 + }, + { + "epoch": 0.5349336057201226, + "grad_norm": 1.5666966599637793, + "learning_rate": 9.355322533126002e-06, + "loss": 0.7645, + "step": 5237 + }, + { + "epoch": 0.5350357507660879, + "grad_norm": 1.3232513646076036, + "learning_rate": 9.352021117871574e-06, + "loss": 0.7439, + "step": 5238 + }, + { + "epoch": 0.5351378958120531, + "grad_norm": 1.3245689427114669, + "learning_rate": 9.348719773538849e-06, + "loss": 0.6858, + "step": 5239 + }, + { + "epoch": 0.5352400408580184, + "grad_norm": 1.4605304969591744, + "learning_rate": 9.34541850048917e-06, + "loss": 0.6459, + "step": 5240 + }, + { + "epoch": 0.5353421859039836, + "grad_norm": 1.4476566910186557, + "learning_rate": 9.342117299083859e-06, + "loss": 0.6964, + "step": 5241 + }, + { + "epoch": 0.5354443309499489, + "grad_norm": 1.4907526958267718, + "learning_rate": 9.338816169684231e-06, + "loss": 0.742, + "step": 5242 + }, + { + "epoch": 0.5355464759959142, + "grad_norm": 1.4266270314557523, + "learning_rate": 9.335515112651606e-06, + "loss": 0.5986, + "step": 5243 + }, + { + "epoch": 0.5356486210418795, + "grad_norm": 1.5307428758134793, + "learning_rate": 9.332214128347277e-06, + "loss": 0.7907, + "step": 5244 + }, + { + "epoch": 0.5357507660878448, + "grad_norm": 1.4544659809004, + "learning_rate": 9.328913217132546e-06, + "loss": 0.7315, + "step": 5245 + }, + { + "epoch": 0.5358529111338101, + "grad_norm": 1.5311439433339236, + "learning_rate": 9.325612379368695e-06, + "loss": 0.704, + "step": 5246 + }, + { + "epoch": 0.5359550561797752, + "grad_norm": 1.46492126109438, + "learning_rate": 9.322311615417003e-06, + "loss": 0.6895, + "step": 5247 + }, + { + "epoch": 0.5360572012257405, + "grad_norm": 1.2532498667894556, + "learning_rate": 9.319010925638746e-06, + "loss": 0.6569, + "step": 5248 + }, + { + "epoch": 0.5361593462717058, + "grad_norm": 1.5057032932249226, + "learning_rate": 9.315710310395181e-06, + "loss": 0.689, + "step": 5249 + }, + { + "epoch": 0.5362614913176711, + "grad_norm": 1.43922695177929, + "learning_rate": 9.312409770047566e-06, + "loss": 0.7244, + "step": 5250 + }, + { + "epoch": 0.5363636363636364, + "grad_norm": 1.4659410388318872, + "learning_rate": 9.309109304957145e-06, + "loss": 0.7128, + "step": 5251 + }, + { + "epoch": 0.5364657814096017, + "grad_norm": 1.336241602958345, + "learning_rate": 9.30580891548516e-06, + "loss": 0.6475, + "step": 5252 + }, + { + "epoch": 0.536567926455567, + "grad_norm": 1.3401891670552928, + "learning_rate": 9.302508601992834e-06, + "loss": 0.7022, + "step": 5253 + }, + { + "epoch": 0.5366700715015322, + "grad_norm": 1.50850922143205, + "learning_rate": 9.299208364841394e-06, + "loss": 0.7908, + "step": 5254 + }, + { + "epoch": 0.5367722165474974, + "grad_norm": 1.5296596183193492, + "learning_rate": 9.295908204392047e-06, + "loss": 0.6882, + "step": 5255 + }, + { + "epoch": 0.5368743615934627, + "grad_norm": 1.6002141505999796, + "learning_rate": 9.292608121006009e-06, + "loss": 0.67, + "step": 5256 + }, + { + "epoch": 0.536976506639428, + "grad_norm": 1.3735472611573007, + "learning_rate": 9.289308115044463e-06, + "loss": 0.6693, + "step": 5257 + }, + { + "epoch": 0.5370786516853933, + "grad_norm": 1.3894506834816651, + "learning_rate": 9.286008186868606e-06, + "loss": 0.6783, + "step": 5258 + }, + { + "epoch": 0.5371807967313585, + "grad_norm": 1.306144968789425, + "learning_rate": 9.282708336839614e-06, + "loss": 0.5567, + "step": 5259 + }, + { + "epoch": 0.5372829417773238, + "grad_norm": 1.3103043955413112, + "learning_rate": 9.279408565318659e-06, + "loss": 0.6806, + "step": 5260 + }, + { + "epoch": 0.5373850868232891, + "grad_norm": 1.6535123776007974, + "learning_rate": 9.276108872666901e-06, + "loss": 0.7957, + "step": 5261 + }, + { + "epoch": 0.5374872318692543, + "grad_norm": 1.5252777916727294, + "learning_rate": 9.272809259245496e-06, + "loss": 0.8554, + "step": 5262 + }, + { + "epoch": 0.5375893769152196, + "grad_norm": 1.4153971798648703, + "learning_rate": 9.269509725415584e-06, + "loss": 0.7413, + "step": 5263 + }, + { + "epoch": 0.5376915219611849, + "grad_norm": 1.4794298137426707, + "learning_rate": 9.266210271538306e-06, + "loss": 0.7684, + "step": 5264 + }, + { + "epoch": 0.5377936670071501, + "grad_norm": 1.519620646630575, + "learning_rate": 9.262910897974789e-06, + "loss": 0.7292, + "step": 5265 + }, + { + "epoch": 0.5378958120531154, + "grad_norm": 1.6542631258073068, + "learning_rate": 9.259611605086148e-06, + "loss": 0.6751, + "step": 5266 + }, + { + "epoch": 0.5379979570990807, + "grad_norm": 1.3773974798268631, + "learning_rate": 9.256312393233498e-06, + "loss": 0.7119, + "step": 5267 + }, + { + "epoch": 0.538100102145046, + "grad_norm": 1.4379207729597026, + "learning_rate": 9.253013262777936e-06, + "loss": 0.69, + "step": 5268 + }, + { + "epoch": 0.5382022471910113, + "grad_norm": 1.4148171001847447, + "learning_rate": 9.249714214080556e-06, + "loss": 0.7604, + "step": 5269 + }, + { + "epoch": 0.5383043922369765, + "grad_norm": 1.325100154473844, + "learning_rate": 9.246415247502439e-06, + "loss": 0.7376, + "step": 5270 + }, + { + "epoch": 0.5384065372829417, + "grad_norm": 1.5177571170075688, + "learning_rate": 9.243116363404656e-06, + "loss": 0.696, + "step": 5271 + }, + { + "epoch": 0.538508682328907, + "grad_norm": 1.3555489570734756, + "learning_rate": 9.239817562148282e-06, + "loss": 0.6245, + "step": 5272 + }, + { + "epoch": 0.5386108273748723, + "grad_norm": 1.5472753084386863, + "learning_rate": 9.236518844094366e-06, + "loss": 0.828, + "step": 5273 + }, + { + "epoch": 0.5387129724208376, + "grad_norm": 1.4827026082201882, + "learning_rate": 9.233220209603954e-06, + "loss": 0.7435, + "step": 5274 + }, + { + "epoch": 0.5388151174668029, + "grad_norm": 1.4401966727546343, + "learning_rate": 9.229921659038088e-06, + "loss": 0.7402, + "step": 5275 + }, + { + "epoch": 0.5389172625127682, + "grad_norm": 1.508204883055981, + "learning_rate": 9.22662319275779e-06, + "loss": 0.6799, + "step": 5276 + }, + { + "epoch": 0.5390194075587335, + "grad_norm": 1.5978317798229795, + "learning_rate": 9.223324811124088e-06, + "loss": 0.7384, + "step": 5277 + }, + { + "epoch": 0.5391215526046986, + "grad_norm": 1.402087605817577, + "learning_rate": 9.220026514497984e-06, + "loss": 0.6534, + "step": 5278 + }, + { + "epoch": 0.5392236976506639, + "grad_norm": 1.3809946173019199, + "learning_rate": 9.21672830324048e-06, + "loss": 0.5954, + "step": 5279 + }, + { + "epoch": 0.5393258426966292, + "grad_norm": 1.6411917445001991, + "learning_rate": 9.213430177712574e-06, + "loss": 0.6867, + "step": 5280 + }, + { + "epoch": 0.5394279877425945, + "grad_norm": 1.520897593261886, + "learning_rate": 9.21013213827524e-06, + "loss": 0.7898, + "step": 5281 + }, + { + "epoch": 0.5395301327885598, + "grad_norm": 1.4964203013563593, + "learning_rate": 9.206834185289454e-06, + "loss": 0.7174, + "step": 5282 + }, + { + "epoch": 0.539632277834525, + "grad_norm": 1.531332446511657, + "learning_rate": 9.203536319116181e-06, + "loss": 0.7402, + "step": 5283 + }, + { + "epoch": 0.5397344228804903, + "grad_norm": 1.6615418227398808, + "learning_rate": 9.200238540116372e-06, + "loss": 0.7164, + "step": 5284 + }, + { + "epoch": 0.5398365679264555, + "grad_norm": 1.525332902436478, + "learning_rate": 9.196940848650971e-06, + "loss": 0.6072, + "step": 5285 + }, + { + "epoch": 0.5399387129724208, + "grad_norm": 1.6139711229075728, + "learning_rate": 9.193643245080913e-06, + "loss": 0.7862, + "step": 5286 + }, + { + "epoch": 0.5400408580183861, + "grad_norm": 1.451061318556362, + "learning_rate": 9.19034572976712e-06, + "loss": 0.6612, + "step": 5287 + }, + { + "epoch": 0.5401430030643514, + "grad_norm": 1.5857671251795262, + "learning_rate": 9.187048303070511e-06, + "loss": 0.7574, + "step": 5288 + }, + { + "epoch": 0.5402451481103167, + "grad_norm": 1.561709585188985, + "learning_rate": 9.183750965351993e-06, + "loss": 0.8005, + "step": 5289 + }, + { + "epoch": 0.5403472931562819, + "grad_norm": 1.3759356342814062, + "learning_rate": 9.180453716972457e-06, + "loss": 0.6906, + "step": 5290 + }, + { + "epoch": 0.5404494382022472, + "grad_norm": 1.3853215319387142, + "learning_rate": 9.177156558292794e-06, + "loss": 0.69, + "step": 5291 + }, + { + "epoch": 0.5405515832482125, + "grad_norm": 1.3870361073436825, + "learning_rate": 9.173859489673877e-06, + "loss": 0.7393, + "step": 5292 + }, + { + "epoch": 0.5406537282941777, + "grad_norm": 1.4140309681807364, + "learning_rate": 9.17056251147657e-06, + "loss": 0.7551, + "step": 5293 + }, + { + "epoch": 0.540755873340143, + "grad_norm": 1.4674334304973493, + "learning_rate": 9.167265624061734e-06, + "loss": 0.742, + "step": 5294 + }, + { + "epoch": 0.5408580183861083, + "grad_norm": 1.3562881998876646, + "learning_rate": 9.16396882779021e-06, + "loss": 0.6339, + "step": 5295 + }, + { + "epoch": 0.5409601634320735, + "grad_norm": 1.4859463329125158, + "learning_rate": 9.160672123022842e-06, + "loss": 0.751, + "step": 5296 + }, + { + "epoch": 0.5410623084780388, + "grad_norm": 1.466658939442774, + "learning_rate": 9.15737551012045e-06, + "loss": 0.7923, + "step": 5297 + }, + { + "epoch": 0.5411644535240041, + "grad_norm": 1.5990995821455012, + "learning_rate": 9.154078989443854e-06, + "loss": 0.6984, + "step": 5298 + }, + { + "epoch": 0.5412665985699694, + "grad_norm": 1.5890066752397045, + "learning_rate": 9.15078256135386e-06, + "loss": 0.6756, + "step": 5299 + }, + { + "epoch": 0.5413687436159347, + "grad_norm": 1.4800093375485848, + "learning_rate": 9.147486226211262e-06, + "loss": 0.7975, + "step": 5300 + }, + { + "epoch": 0.5414708886618999, + "grad_norm": 1.4305514607377332, + "learning_rate": 9.144189984376849e-06, + "loss": 0.7698, + "step": 5301 + }, + { + "epoch": 0.5415730337078651, + "grad_norm": 1.5629233271080127, + "learning_rate": 9.140893836211393e-06, + "loss": 0.7377, + "step": 5302 + }, + { + "epoch": 0.5416751787538304, + "grad_norm": 1.5762054273293127, + "learning_rate": 9.13759778207566e-06, + "loss": 0.8224, + "step": 5303 + }, + { + "epoch": 0.5417773237997957, + "grad_norm": 1.436590857879403, + "learning_rate": 9.13430182233041e-06, + "loss": 0.7729, + "step": 5304 + }, + { + "epoch": 0.541879468845761, + "grad_norm": 1.3800295467987769, + "learning_rate": 9.131005957336385e-06, + "loss": 0.7499, + "step": 5305 + }, + { + "epoch": 0.5419816138917263, + "grad_norm": 1.4531538739947707, + "learning_rate": 9.12771018745432e-06, + "loss": 0.7126, + "step": 5306 + }, + { + "epoch": 0.5420837589376916, + "grad_norm": 1.2574470937337408, + "learning_rate": 9.124414513044936e-06, + "loss": 0.5587, + "step": 5307 + }, + { + "epoch": 0.5421859039836568, + "grad_norm": 1.4843886595211573, + "learning_rate": 9.12111893446895e-06, + "loss": 0.7022, + "step": 5308 + }, + { + "epoch": 0.542288049029622, + "grad_norm": 1.4507226920181049, + "learning_rate": 9.117823452087068e-06, + "loss": 0.8284, + "step": 5309 + }, + { + "epoch": 0.5423901940755873, + "grad_norm": 1.4267470637068462, + "learning_rate": 9.114528066259976e-06, + "loss": 0.6566, + "step": 5310 + }, + { + "epoch": 0.5424923391215526, + "grad_norm": 1.4797203656055982, + "learning_rate": 9.111232777348356e-06, + "loss": 0.7206, + "step": 5311 + }, + { + "epoch": 0.5425944841675179, + "grad_norm": 1.5215950085698822, + "learning_rate": 9.107937585712887e-06, + "loss": 0.786, + "step": 5312 + }, + { + "epoch": 0.5426966292134832, + "grad_norm": 1.5107304338104994, + "learning_rate": 9.104642491714224e-06, + "loss": 0.6554, + "step": 5313 + }, + { + "epoch": 0.5427987742594484, + "grad_norm": 1.520630747917681, + "learning_rate": 9.10134749571302e-06, + "loss": 0.6564, + "step": 5314 + }, + { + "epoch": 0.5429009193054137, + "grad_norm": 1.445699678559169, + "learning_rate": 9.098052598069912e-06, + "loss": 0.6431, + "step": 5315 + }, + { + "epoch": 0.5430030643513789, + "grad_norm": 1.501999467823678, + "learning_rate": 9.094757799145532e-06, + "loss": 0.7194, + "step": 5316 + }, + { + "epoch": 0.5431052093973442, + "grad_norm": 1.4451666524407356, + "learning_rate": 9.091463099300494e-06, + "loss": 0.7128, + "step": 5317 + }, + { + "epoch": 0.5432073544433095, + "grad_norm": 1.5725513348734697, + "learning_rate": 9.088168498895407e-06, + "loss": 0.7077, + "step": 5318 + }, + { + "epoch": 0.5433094994892748, + "grad_norm": 1.451321436949179, + "learning_rate": 9.084873998290864e-06, + "loss": 0.7615, + "step": 5319 + }, + { + "epoch": 0.54341164453524, + "grad_norm": 1.5582671863331368, + "learning_rate": 9.081579597847456e-06, + "loss": 0.7957, + "step": 5320 + }, + { + "epoch": 0.5435137895812053, + "grad_norm": 1.4121393335382328, + "learning_rate": 9.078285297925755e-06, + "loss": 0.6675, + "step": 5321 + }, + { + "epoch": 0.5436159346271706, + "grad_norm": 1.449586993042086, + "learning_rate": 9.074991098886323e-06, + "loss": 0.7449, + "step": 5322 + }, + { + "epoch": 0.5437180796731359, + "grad_norm": 1.4462893785550492, + "learning_rate": 9.071697001089715e-06, + "loss": 0.6644, + "step": 5323 + }, + { + "epoch": 0.5438202247191011, + "grad_norm": 1.5100590038502006, + "learning_rate": 9.068403004896466e-06, + "loss": 0.7042, + "step": 5324 + }, + { + "epoch": 0.5439223697650664, + "grad_norm": 1.4309243720156277, + "learning_rate": 9.065109110667115e-06, + "loss": 0.5981, + "step": 5325 + }, + { + "epoch": 0.5440245148110316, + "grad_norm": 1.6063071330138892, + "learning_rate": 9.061815318762172e-06, + "loss": 0.8174, + "step": 5326 + }, + { + "epoch": 0.5441266598569969, + "grad_norm": 1.5566826578911757, + "learning_rate": 9.058521629542148e-06, + "loss": 0.678, + "step": 5327 + }, + { + "epoch": 0.5442288049029622, + "grad_norm": 1.4681254846370988, + "learning_rate": 9.055228043367539e-06, + "loss": 0.6948, + "step": 5328 + }, + { + "epoch": 0.5443309499489275, + "grad_norm": 1.4871040856778386, + "learning_rate": 9.051934560598836e-06, + "loss": 0.7453, + "step": 5329 + }, + { + "epoch": 0.5444330949948928, + "grad_norm": 1.5811952845783739, + "learning_rate": 9.048641181596504e-06, + "loss": 0.7559, + "step": 5330 + }, + { + "epoch": 0.5445352400408581, + "grad_norm": 1.453350051115602, + "learning_rate": 9.045347906721011e-06, + "loss": 0.817, + "step": 5331 + }, + { + "epoch": 0.5446373850868232, + "grad_norm": 1.4795550383718767, + "learning_rate": 9.042054736332806e-06, + "loss": 0.7452, + "step": 5332 + }, + { + "epoch": 0.5447395301327885, + "grad_norm": 1.4157521258886236, + "learning_rate": 9.03876167079233e-06, + "loss": 0.751, + "step": 5333 + }, + { + "epoch": 0.5448416751787538, + "grad_norm": 1.4382263368016583, + "learning_rate": 9.035468710460007e-06, + "loss": 0.7434, + "step": 5334 + }, + { + "epoch": 0.5449438202247191, + "grad_norm": 1.4580716073026145, + "learning_rate": 9.032175855696258e-06, + "loss": 0.6604, + "step": 5335 + }, + { + "epoch": 0.5450459652706844, + "grad_norm": 1.3815795263848616, + "learning_rate": 9.028883106861488e-06, + "loss": 0.7993, + "step": 5336 + }, + { + "epoch": 0.5451481103166497, + "grad_norm": 1.5790148880070334, + "learning_rate": 9.025590464316087e-06, + "loss": 0.7055, + "step": 5337 + }, + { + "epoch": 0.545250255362615, + "grad_norm": 1.2921656316065682, + "learning_rate": 9.022297928420444e-06, + "loss": 0.7469, + "step": 5338 + }, + { + "epoch": 0.5453524004085801, + "grad_norm": 1.5872318292370122, + "learning_rate": 9.019005499534921e-06, + "loss": 0.6628, + "step": 5339 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 1.4682122446890498, + "learning_rate": 9.01571317801988e-06, + "loss": 0.7187, + "step": 5340 + }, + { + "epoch": 0.5455566905005107, + "grad_norm": 1.4298051684777104, + "learning_rate": 9.012420964235668e-06, + "loss": 0.7633, + "step": 5341 + }, + { + "epoch": 0.545658835546476, + "grad_norm": 1.50302296413129, + "learning_rate": 9.009128858542622e-06, + "loss": 0.7363, + "step": 5342 + }, + { + "epoch": 0.5457609805924413, + "grad_norm": 1.3826266483155756, + "learning_rate": 9.005836861301058e-06, + "loss": 0.7217, + "step": 5343 + }, + { + "epoch": 0.5458631256384066, + "grad_norm": 1.4075502297413032, + "learning_rate": 9.002544972871292e-06, + "loss": 0.7515, + "step": 5344 + }, + { + "epoch": 0.5459652706843718, + "grad_norm": 1.4235299753942443, + "learning_rate": 8.999253193613627e-06, + "loss": 0.6235, + "step": 5345 + }, + { + "epoch": 0.5460674157303371, + "grad_norm": 1.4251626010458078, + "learning_rate": 8.995961523888346e-06, + "loss": 0.7342, + "step": 5346 + }, + { + "epoch": 0.5461695607763023, + "grad_norm": 1.4968845427203312, + "learning_rate": 8.992669964055724e-06, + "loss": 0.7633, + "step": 5347 + }, + { + "epoch": 0.5462717058222676, + "grad_norm": 1.4716207391979643, + "learning_rate": 8.989378514476025e-06, + "loss": 0.673, + "step": 5348 + }, + { + "epoch": 0.5463738508682329, + "grad_norm": 1.3852885529936758, + "learning_rate": 8.986087175509502e-06, + "loss": 0.6826, + "step": 5349 + }, + { + "epoch": 0.5464759959141982, + "grad_norm": 1.5653429152103706, + "learning_rate": 8.982795947516392e-06, + "loss": 0.789, + "step": 5350 + }, + { + "epoch": 0.5465781409601634, + "grad_norm": 1.4859779881279889, + "learning_rate": 8.979504830856919e-06, + "loss": 0.7388, + "step": 5351 + }, + { + "epoch": 0.5466802860061287, + "grad_norm": 1.5773763624722752, + "learning_rate": 8.976213825891304e-06, + "loss": 0.6827, + "step": 5352 + }, + { + "epoch": 0.546782431052094, + "grad_norm": 1.603407303018258, + "learning_rate": 8.97292293297975e-06, + "loss": 0.8171, + "step": 5353 + }, + { + "epoch": 0.5468845760980593, + "grad_norm": 1.4925773215730558, + "learning_rate": 8.96963215248244e-06, + "loss": 0.708, + "step": 5354 + }, + { + "epoch": 0.5469867211440245, + "grad_norm": 1.492018056543209, + "learning_rate": 8.966341484759561e-06, + "loss": 0.6226, + "step": 5355 + }, + { + "epoch": 0.5470888661899898, + "grad_norm": 1.6431137419227642, + "learning_rate": 8.963050930171271e-06, + "loss": 0.7496, + "step": 5356 + }, + { + "epoch": 0.547191011235955, + "grad_norm": 1.4791372078429992, + "learning_rate": 8.959760489077729e-06, + "loss": 0.6192, + "step": 5357 + }, + { + "epoch": 0.5472931562819203, + "grad_norm": 1.4890416899484047, + "learning_rate": 8.956470161839073e-06, + "loss": 0.7928, + "step": 5358 + }, + { + "epoch": 0.5473953013278856, + "grad_norm": 1.4434292697286835, + "learning_rate": 8.953179948815428e-06, + "loss": 0.6171, + "step": 5359 + }, + { + "epoch": 0.5474974463738509, + "grad_norm": 1.415645205441737, + "learning_rate": 8.949889850366918e-06, + "loss": 0.6723, + "step": 5360 + }, + { + "epoch": 0.5475995914198162, + "grad_norm": 1.448894353315161, + "learning_rate": 8.94659986685364e-06, + "loss": 0.6791, + "step": 5361 + }, + { + "epoch": 0.5477017364657815, + "grad_norm": 1.3656937207936726, + "learning_rate": 8.943309998635693e-06, + "loss": 0.7261, + "step": 5362 + }, + { + "epoch": 0.5478038815117466, + "grad_norm": 1.5996651055356172, + "learning_rate": 8.940020246073146e-06, + "loss": 0.7881, + "step": 5363 + }, + { + "epoch": 0.5479060265577119, + "grad_norm": 1.359052456741216, + "learning_rate": 8.936730609526065e-06, + "loss": 0.7476, + "step": 5364 + }, + { + "epoch": 0.5480081716036772, + "grad_norm": 1.4059140357592597, + "learning_rate": 8.93344108935451e-06, + "loss": 0.7359, + "step": 5365 + }, + { + "epoch": 0.5481103166496425, + "grad_norm": 1.556804157242638, + "learning_rate": 8.930151685918515e-06, + "loss": 0.8106, + "step": 5366 + }, + { + "epoch": 0.5482124616956078, + "grad_norm": 1.4906055103333997, + "learning_rate": 8.926862399578105e-06, + "loss": 0.682, + "step": 5367 + }, + { + "epoch": 0.5483146067415731, + "grad_norm": 1.4132452878515613, + "learning_rate": 8.923573230693304e-06, + "loss": 0.7147, + "step": 5368 + }, + { + "epoch": 0.5484167517875383, + "grad_norm": 1.4030899147833644, + "learning_rate": 8.920284179624107e-06, + "loss": 0.6427, + "step": 5369 + }, + { + "epoch": 0.5485188968335035, + "grad_norm": 1.512310965118413, + "learning_rate": 8.916995246730505e-06, + "loss": 0.8136, + "step": 5370 + }, + { + "epoch": 0.5486210418794688, + "grad_norm": 1.5233682828546162, + "learning_rate": 8.913706432372471e-06, + "loss": 0.6877, + "step": 5371 + }, + { + "epoch": 0.5487231869254341, + "grad_norm": 1.578424541584731, + "learning_rate": 8.910417736909974e-06, + "loss": 0.7004, + "step": 5372 + }, + { + "epoch": 0.5488253319713994, + "grad_norm": 1.3677503555378996, + "learning_rate": 8.907129160702954e-06, + "loss": 0.7669, + "step": 5373 + }, + { + "epoch": 0.5489274770173647, + "grad_norm": 1.4370148136214547, + "learning_rate": 8.903840704111357e-06, + "loss": 0.6979, + "step": 5374 + }, + { + "epoch": 0.54902962206333, + "grad_norm": 1.5042638915499635, + "learning_rate": 8.9005523674951e-06, + "loss": 0.7847, + "step": 5375 + }, + { + "epoch": 0.5491317671092952, + "grad_norm": 1.544319530070016, + "learning_rate": 8.897264151214097e-06, + "loss": 0.7424, + "step": 5376 + }, + { + "epoch": 0.5492339121552605, + "grad_norm": 1.549438778384867, + "learning_rate": 8.89397605562825e-06, + "loss": 0.6311, + "step": 5377 + }, + { + "epoch": 0.5493360572012257, + "grad_norm": 1.4177978825614934, + "learning_rate": 8.890688081097433e-06, + "loss": 0.67, + "step": 5378 + }, + { + "epoch": 0.549438202247191, + "grad_norm": 1.4395209406122906, + "learning_rate": 8.887400227981526e-06, + "loss": 0.7183, + "step": 5379 + }, + { + "epoch": 0.5495403472931563, + "grad_norm": 1.4616314817419782, + "learning_rate": 8.884112496640384e-06, + "loss": 0.7311, + "step": 5380 + }, + { + "epoch": 0.5496424923391215, + "grad_norm": 1.4454530093057807, + "learning_rate": 8.880824887433846e-06, + "loss": 0.6944, + "step": 5381 + }, + { + "epoch": 0.5497446373850868, + "grad_norm": 1.5396149169460862, + "learning_rate": 8.87753740072175e-06, + "loss": 0.6707, + "step": 5382 + }, + { + "epoch": 0.5498467824310521, + "grad_norm": 1.5909855895748617, + "learning_rate": 8.874250036863908e-06, + "loss": 0.6597, + "step": 5383 + }, + { + "epoch": 0.5499489274770174, + "grad_norm": 1.3443167615110867, + "learning_rate": 8.870962796220126e-06, + "loss": 0.6405, + "step": 5384 + }, + { + "epoch": 0.5500510725229827, + "grad_norm": 1.5852993308897236, + "learning_rate": 8.867675679150199e-06, + "loss": 0.7639, + "step": 5385 + }, + { + "epoch": 0.5501532175689479, + "grad_norm": 1.3942173580257833, + "learning_rate": 8.864388686013898e-06, + "loss": 0.6952, + "step": 5386 + }, + { + "epoch": 0.5502553626149131, + "grad_norm": 1.233334244837427, + "learning_rate": 8.861101817170992e-06, + "loss": 0.5944, + "step": 5387 + }, + { + "epoch": 0.5503575076608784, + "grad_norm": 1.360716004687835, + "learning_rate": 8.857815072981226e-06, + "loss": 0.6664, + "step": 5388 + }, + { + "epoch": 0.5504596527068437, + "grad_norm": 1.5082396249771426, + "learning_rate": 8.85452845380434e-06, + "loss": 0.7253, + "step": 5389 + }, + { + "epoch": 0.550561797752809, + "grad_norm": 1.4087227583963116, + "learning_rate": 8.851241960000052e-06, + "loss": 0.5674, + "step": 5390 + }, + { + "epoch": 0.5506639427987743, + "grad_norm": 1.57716737928456, + "learning_rate": 8.847955591928071e-06, + "loss": 0.8226, + "step": 5391 + }, + { + "epoch": 0.5507660878447396, + "grad_norm": 1.4750059229405859, + "learning_rate": 8.8446693499481e-06, + "loss": 0.6505, + "step": 5392 + }, + { + "epoch": 0.5508682328907047, + "grad_norm": 1.497572334655076, + "learning_rate": 8.841383234419814e-06, + "loss": 0.7789, + "step": 5393 + }, + { + "epoch": 0.55097037793667, + "grad_norm": 1.4985414622726312, + "learning_rate": 8.838097245702882e-06, + "loss": 0.7381, + "step": 5394 + }, + { + "epoch": 0.5510725229826353, + "grad_norm": 1.4572795778242322, + "learning_rate": 8.834811384156956e-06, + "loss": 0.6933, + "step": 5395 + }, + { + "epoch": 0.5511746680286006, + "grad_norm": 1.5379611256922758, + "learning_rate": 8.831525650141679e-06, + "loss": 0.7658, + "step": 5396 + }, + { + "epoch": 0.5512768130745659, + "grad_norm": 1.4225258726730035, + "learning_rate": 8.828240044016673e-06, + "loss": 0.6638, + "step": 5397 + }, + { + "epoch": 0.5513789581205312, + "grad_norm": 1.408941937092425, + "learning_rate": 8.82495456614155e-06, + "loss": 0.7282, + "step": 5398 + }, + { + "epoch": 0.5514811031664965, + "grad_norm": 1.5617833393287068, + "learning_rate": 8.821669216875907e-06, + "loss": 0.8289, + "step": 5399 + }, + { + "epoch": 0.5515832482124617, + "grad_norm": 1.4591657624697218, + "learning_rate": 8.818383996579333e-06, + "loss": 0.7902, + "step": 5400 + }, + { + "epoch": 0.5516853932584269, + "grad_norm": 1.3360872473629597, + "learning_rate": 8.815098905611394e-06, + "loss": 0.7384, + "step": 5401 + }, + { + "epoch": 0.5517875383043922, + "grad_norm": 1.6297580651485546, + "learning_rate": 8.811813944331645e-06, + "loss": 0.6578, + "step": 5402 + }, + { + "epoch": 0.5518896833503575, + "grad_norm": 1.322814968956624, + "learning_rate": 8.808529113099627e-06, + "loss": 0.6966, + "step": 5403 + }, + { + "epoch": 0.5519918283963228, + "grad_norm": 1.6136283568268253, + "learning_rate": 8.805244412274868e-06, + "loss": 0.6758, + "step": 5404 + }, + { + "epoch": 0.552093973442288, + "grad_norm": 1.3573620005413962, + "learning_rate": 8.801959842216878e-06, + "loss": 0.7232, + "step": 5405 + }, + { + "epoch": 0.5521961184882533, + "grad_norm": 1.3281703332363635, + "learning_rate": 8.79867540328516e-06, + "loss": 0.6728, + "step": 5406 + }, + { + "epoch": 0.5522982635342186, + "grad_norm": 1.3938561424786702, + "learning_rate": 8.795391095839192e-06, + "loss": 0.7659, + "step": 5407 + }, + { + "epoch": 0.5524004085801839, + "grad_norm": 1.3182967879684218, + "learning_rate": 8.792106920238447e-06, + "loss": 0.6943, + "step": 5408 + }, + { + "epoch": 0.5525025536261491, + "grad_norm": 1.5843658315144558, + "learning_rate": 8.788822876842384e-06, + "loss": 0.7033, + "step": 5409 + }, + { + "epoch": 0.5526046986721144, + "grad_norm": 1.3822059379086615, + "learning_rate": 8.785538966010437e-06, + "loss": 0.6363, + "step": 5410 + }, + { + "epoch": 0.5527068437180797, + "grad_norm": 1.3541830578460525, + "learning_rate": 8.782255188102037e-06, + "loss": 0.6685, + "step": 5411 + }, + { + "epoch": 0.5528089887640449, + "grad_norm": 1.527147474143802, + "learning_rate": 8.77897154347659e-06, + "loss": 0.7371, + "step": 5412 + }, + { + "epoch": 0.5529111338100102, + "grad_norm": 1.5212382176550752, + "learning_rate": 8.7756880324935e-06, + "loss": 0.7032, + "step": 5413 + }, + { + "epoch": 0.5530132788559755, + "grad_norm": 1.4912726537488599, + "learning_rate": 8.772404655512145e-06, + "loss": 0.7179, + "step": 5414 + }, + { + "epoch": 0.5531154239019408, + "grad_norm": 1.3226093351288812, + "learning_rate": 8.769121412891888e-06, + "loss": 0.7762, + "step": 5415 + }, + { + "epoch": 0.5532175689479061, + "grad_norm": 1.402450990480852, + "learning_rate": 8.765838304992094e-06, + "loss": 0.7697, + "step": 5416 + }, + { + "epoch": 0.5533197139938713, + "grad_norm": 1.5064101591621921, + "learning_rate": 8.762555332172095e-06, + "loss": 0.7789, + "step": 5417 + }, + { + "epoch": 0.5534218590398365, + "grad_norm": 1.3848635611319071, + "learning_rate": 8.75927249479121e-06, + "loss": 0.6135, + "step": 5418 + }, + { + "epoch": 0.5535240040858018, + "grad_norm": 1.3957637655466684, + "learning_rate": 8.755989793208757e-06, + "loss": 0.6932, + "step": 5419 + }, + { + "epoch": 0.5536261491317671, + "grad_norm": 1.3984161187758415, + "learning_rate": 8.752707227784021e-06, + "loss": 0.6831, + "step": 5420 + }, + { + "epoch": 0.5537282941777324, + "grad_norm": 1.3712990644502543, + "learning_rate": 8.749424798876289e-06, + "loss": 0.6204, + "step": 5421 + }, + { + "epoch": 0.5538304392236977, + "grad_norm": 1.3868031620723829, + "learning_rate": 8.746142506844816e-06, + "loss": 0.7131, + "step": 5422 + }, + { + "epoch": 0.553932584269663, + "grad_norm": 1.3183002560168648, + "learning_rate": 8.742860352048854e-06, + "loss": 0.7191, + "step": 5423 + }, + { + "epoch": 0.5540347293156281, + "grad_norm": 1.5646449904463107, + "learning_rate": 8.739578334847645e-06, + "loss": 0.6902, + "step": 5424 + }, + { + "epoch": 0.5541368743615934, + "grad_norm": 1.45429524617401, + "learning_rate": 8.736296455600396e-06, + "loss": 0.7357, + "step": 5425 + }, + { + "epoch": 0.5542390194075587, + "grad_norm": 1.4197719026293592, + "learning_rate": 8.733014714666322e-06, + "loss": 0.669, + "step": 5426 + }, + { + "epoch": 0.554341164453524, + "grad_norm": 1.421260438089345, + "learning_rate": 8.729733112404603e-06, + "loss": 0.6578, + "step": 5427 + }, + { + "epoch": 0.5544433094994893, + "grad_norm": 1.5623988841892025, + "learning_rate": 8.726451649174417e-06, + "loss": 0.7682, + "step": 5428 + }, + { + "epoch": 0.5545454545454546, + "grad_norm": 1.5197618045307155, + "learning_rate": 8.72317032533492e-06, + "loss": 0.72, + "step": 5429 + }, + { + "epoch": 0.5546475995914198, + "grad_norm": 1.528543427900039, + "learning_rate": 8.719889141245255e-06, + "loss": 0.7179, + "step": 5430 + }, + { + "epoch": 0.5547497446373851, + "grad_norm": 1.5208838908753068, + "learning_rate": 8.71660809726455e-06, + "loss": 0.7673, + "step": 5431 + }, + { + "epoch": 0.5548518896833503, + "grad_norm": 1.4274537855602016, + "learning_rate": 8.713327193751918e-06, + "loss": 0.6853, + "step": 5432 + }, + { + "epoch": 0.5549540347293156, + "grad_norm": 1.685769011154593, + "learning_rate": 8.710046431066458e-06, + "loss": 0.7788, + "step": 5433 + }, + { + "epoch": 0.5550561797752809, + "grad_norm": 1.405275362485287, + "learning_rate": 8.70676580956725e-06, + "loss": 0.8027, + "step": 5434 + }, + { + "epoch": 0.5551583248212462, + "grad_norm": 1.5163393188140282, + "learning_rate": 8.703485329613357e-06, + "loss": 0.7, + "step": 5435 + }, + { + "epoch": 0.5552604698672114, + "grad_norm": 1.3829420905283298, + "learning_rate": 8.700204991563835e-06, + "loss": 0.6525, + "step": 5436 + }, + { + "epoch": 0.5553626149131767, + "grad_norm": 1.4390024847657423, + "learning_rate": 8.696924795777715e-06, + "loss": 0.7738, + "step": 5437 + }, + { + "epoch": 0.555464759959142, + "grad_norm": 1.3109865126460842, + "learning_rate": 8.693644742614018e-06, + "loss": 0.5716, + "step": 5438 + }, + { + "epoch": 0.5555669050051073, + "grad_norm": 1.3243140838657725, + "learning_rate": 8.690364832431748e-06, + "loss": 0.6979, + "step": 5439 + }, + { + "epoch": 0.5556690500510725, + "grad_norm": 1.3421238460418488, + "learning_rate": 8.68708506558989e-06, + "loss": 0.6797, + "step": 5440 + }, + { + "epoch": 0.5557711950970378, + "grad_norm": 1.3511069491946468, + "learning_rate": 8.683805442447425e-06, + "loss": 0.6522, + "step": 5441 + }, + { + "epoch": 0.555873340143003, + "grad_norm": 1.243650514447468, + "learning_rate": 8.680525963363301e-06, + "loss": 0.6051, + "step": 5442 + }, + { + "epoch": 0.5559754851889683, + "grad_norm": 1.4258369976754803, + "learning_rate": 8.677246628696466e-06, + "loss": 0.6891, + "step": 5443 + }, + { + "epoch": 0.5560776302349336, + "grad_norm": 1.4255802384811038, + "learning_rate": 8.673967438805838e-06, + "loss": 0.5942, + "step": 5444 + }, + { + "epoch": 0.5561797752808989, + "grad_norm": 1.4796087527682718, + "learning_rate": 8.670688394050336e-06, + "loss": 0.6625, + "step": 5445 + }, + { + "epoch": 0.5562819203268642, + "grad_norm": 1.3559543353005912, + "learning_rate": 8.667409494788844e-06, + "loss": 0.7788, + "step": 5446 + }, + { + "epoch": 0.5563840653728295, + "grad_norm": 1.4652009230750216, + "learning_rate": 8.664130741380247e-06, + "loss": 0.681, + "step": 5447 + }, + { + "epoch": 0.5564862104187946, + "grad_norm": 1.3769266874580246, + "learning_rate": 8.660852134183398e-06, + "loss": 0.6092, + "step": 5448 + }, + { + "epoch": 0.5565883554647599, + "grad_norm": 1.5053238975994778, + "learning_rate": 8.657573673557152e-06, + "loss": 0.7319, + "step": 5449 + }, + { + "epoch": 0.5566905005107252, + "grad_norm": 1.6280561634994386, + "learning_rate": 8.654295359860334e-06, + "loss": 0.7001, + "step": 5450 + }, + { + "epoch": 0.5567926455566905, + "grad_norm": 1.365606013798107, + "learning_rate": 8.65101719345176e-06, + "loss": 0.7117, + "step": 5451 + }, + { + "epoch": 0.5568947906026558, + "grad_norm": 1.528620140340669, + "learning_rate": 8.647739174690224e-06, + "loss": 0.7467, + "step": 5452 + }, + { + "epoch": 0.5569969356486211, + "grad_norm": 1.3581623356761632, + "learning_rate": 8.64446130393451e-06, + "loss": 0.6924, + "step": 5453 + }, + { + "epoch": 0.5570990806945864, + "grad_norm": 1.4831497456138345, + "learning_rate": 8.641183581543382e-06, + "loss": 0.6535, + "step": 5454 + }, + { + "epoch": 0.5572012257405515, + "grad_norm": 1.6280099037221463, + "learning_rate": 8.63790600787559e-06, + "loss": 0.7211, + "step": 5455 + }, + { + "epoch": 0.5573033707865168, + "grad_norm": 1.3588971234407978, + "learning_rate": 8.634628583289861e-06, + "loss": 0.6252, + "step": 5456 + }, + { + "epoch": 0.5574055158324821, + "grad_norm": 1.6751376987740338, + "learning_rate": 8.631351308144916e-06, + "loss": 0.693, + "step": 5457 + }, + { + "epoch": 0.5575076608784474, + "grad_norm": 1.464127105438117, + "learning_rate": 8.628074182799458e-06, + "loss": 0.6311, + "step": 5458 + }, + { + "epoch": 0.5576098059244127, + "grad_norm": 1.573566640975009, + "learning_rate": 8.624797207612166e-06, + "loss": 0.7222, + "step": 5459 + }, + { + "epoch": 0.557711950970378, + "grad_norm": 1.6102868761297027, + "learning_rate": 8.62152038294171e-06, + "loss": 0.727, + "step": 5460 + }, + { + "epoch": 0.5578140960163432, + "grad_norm": 1.345739520910247, + "learning_rate": 8.618243709146737e-06, + "loss": 0.6615, + "step": 5461 + }, + { + "epoch": 0.5579162410623085, + "grad_norm": 1.464769936734692, + "learning_rate": 8.614967186585882e-06, + "loss": 0.7667, + "step": 5462 + }, + { + "epoch": 0.5580183861082737, + "grad_norm": 1.4420039795719477, + "learning_rate": 8.611690815617764e-06, + "loss": 0.7675, + "step": 5463 + }, + { + "epoch": 0.558120531154239, + "grad_norm": 1.9194002966031567, + "learning_rate": 8.60841459660098e-06, + "loss": 0.6199, + "step": 5464 + }, + { + "epoch": 0.5582226762002043, + "grad_norm": 1.5541804811824884, + "learning_rate": 8.605138529894122e-06, + "loss": 0.7915, + "step": 5465 + }, + { + "epoch": 0.5583248212461696, + "grad_norm": 1.4434386313334997, + "learning_rate": 8.601862615855752e-06, + "loss": 0.6898, + "step": 5466 + }, + { + "epoch": 0.5584269662921348, + "grad_norm": 1.4632940856441208, + "learning_rate": 8.598586854844422e-06, + "loss": 0.7327, + "step": 5467 + }, + { + "epoch": 0.5585291113381001, + "grad_norm": 1.4861320118181407, + "learning_rate": 8.595311247218667e-06, + "loss": 0.6306, + "step": 5468 + }, + { + "epoch": 0.5586312563840654, + "grad_norm": 1.3922441489611643, + "learning_rate": 8.592035793337002e-06, + "loss": 0.7341, + "step": 5469 + }, + { + "epoch": 0.5587334014300307, + "grad_norm": 1.5279809241627287, + "learning_rate": 8.588760493557932e-06, + "loss": 0.7231, + "step": 5470 + }, + { + "epoch": 0.5588355464759959, + "grad_norm": 1.5437193032512637, + "learning_rate": 8.585485348239934e-06, + "loss": 0.6459, + "step": 5471 + }, + { + "epoch": 0.5589376915219612, + "grad_norm": 1.4370671776973034, + "learning_rate": 8.582210357741476e-06, + "loss": 0.8095, + "step": 5472 + }, + { + "epoch": 0.5590398365679264, + "grad_norm": 1.421953655487678, + "learning_rate": 8.578935522421015e-06, + "loss": 0.6978, + "step": 5473 + }, + { + "epoch": 0.5591419816138917, + "grad_norm": 1.4793207270143711, + "learning_rate": 8.575660842636979e-06, + "loss": 0.7619, + "step": 5474 + }, + { + "epoch": 0.559244126659857, + "grad_norm": 1.406622688147467, + "learning_rate": 8.572386318747784e-06, + "loss": 0.6276, + "step": 5475 + }, + { + "epoch": 0.5593462717058223, + "grad_norm": 1.4989869654068735, + "learning_rate": 8.569111951111828e-06, + "loss": 0.7373, + "step": 5476 + }, + { + "epoch": 0.5594484167517876, + "grad_norm": 1.4979384028884712, + "learning_rate": 8.565837740087495e-06, + "loss": 0.6497, + "step": 5477 + }, + { + "epoch": 0.5595505617977528, + "grad_norm": 1.3751450191665806, + "learning_rate": 8.562563686033145e-06, + "loss": 0.6791, + "step": 5478 + }, + { + "epoch": 0.559652706843718, + "grad_norm": 1.4237708214715832, + "learning_rate": 8.559289789307131e-06, + "loss": 0.7264, + "step": 5479 + }, + { + "epoch": 0.5597548518896833, + "grad_norm": 1.415150732266702, + "learning_rate": 8.556016050267776e-06, + "loss": 0.7407, + "step": 5480 + }, + { + "epoch": 0.5598569969356486, + "grad_norm": 1.5448474830362244, + "learning_rate": 8.5527424692734e-06, + "loss": 0.7058, + "step": 5481 + }, + { + "epoch": 0.5599591419816139, + "grad_norm": 1.5640862993742257, + "learning_rate": 8.549469046682297e-06, + "loss": 0.7053, + "step": 5482 + }, + { + "epoch": 0.5600612870275792, + "grad_norm": 1.3953667688424205, + "learning_rate": 8.546195782852743e-06, + "loss": 0.739, + "step": 5483 + }, + { + "epoch": 0.5601634320735445, + "grad_norm": 1.4822673817737033, + "learning_rate": 8.542922678143001e-06, + "loss": 0.72, + "step": 5484 + }, + { + "epoch": 0.5602655771195098, + "grad_norm": 1.546441801790899, + "learning_rate": 8.539649732911315e-06, + "loss": 0.7618, + "step": 5485 + }, + { + "epoch": 0.5603677221654749, + "grad_norm": 1.4847525772552743, + "learning_rate": 8.536376947515905e-06, + "loss": 0.663, + "step": 5486 + }, + { + "epoch": 0.5604698672114402, + "grad_norm": 1.4454512965490551, + "learning_rate": 8.533104322314987e-06, + "loss": 0.7705, + "step": 5487 + }, + { + "epoch": 0.5605720122574055, + "grad_norm": 1.3872770038146887, + "learning_rate": 8.529831857666744e-06, + "loss": 0.6615, + "step": 5488 + }, + { + "epoch": 0.5606741573033708, + "grad_norm": 1.4999023399938949, + "learning_rate": 8.526559553929356e-06, + "loss": 0.7331, + "step": 5489 + }, + { + "epoch": 0.5607763023493361, + "grad_norm": 1.5416529430314083, + "learning_rate": 8.523287411460979e-06, + "loss": 0.6665, + "step": 5490 + }, + { + "epoch": 0.5608784473953013, + "grad_norm": 1.2950520231324016, + "learning_rate": 8.520015430619747e-06, + "loss": 0.6482, + "step": 5491 + }, + { + "epoch": 0.5609805924412666, + "grad_norm": 1.3550668051513932, + "learning_rate": 8.516743611763783e-06, + "loss": 0.6924, + "step": 5492 + }, + { + "epoch": 0.5610827374872319, + "grad_norm": 1.5292348681204713, + "learning_rate": 8.51347195525119e-06, + "loss": 0.7744, + "step": 5493 + }, + { + "epoch": 0.5611848825331971, + "grad_norm": 1.353489747020099, + "learning_rate": 8.510200461440052e-06, + "loss": 0.6871, + "step": 5494 + }, + { + "epoch": 0.5612870275791624, + "grad_norm": 1.4846507082997045, + "learning_rate": 8.506929130688433e-06, + "loss": 0.7412, + "step": 5495 + }, + { + "epoch": 0.5613891726251277, + "grad_norm": 1.333680929176951, + "learning_rate": 8.503657963354385e-06, + "loss": 0.8209, + "step": 5496 + }, + { + "epoch": 0.561491317671093, + "grad_norm": 1.5625060145019278, + "learning_rate": 8.500386959795944e-06, + "loss": 0.7314, + "step": 5497 + }, + { + "epoch": 0.5615934627170582, + "grad_norm": 1.5551643447656638, + "learning_rate": 8.497116120371114e-06, + "loss": 0.7978, + "step": 5498 + }, + { + "epoch": 0.5616956077630235, + "grad_norm": 1.5075140229202546, + "learning_rate": 8.493845445437901e-06, + "loss": 0.6843, + "step": 5499 + }, + { + "epoch": 0.5617977528089888, + "grad_norm": 1.539760279268995, + "learning_rate": 8.490574935354274e-06, + "loss": 0.7533, + "step": 5500 + }, + { + "epoch": 0.5618998978549541, + "grad_norm": 1.3792139374035617, + "learning_rate": 8.487304590478197e-06, + "loss": 0.5998, + "step": 5501 + }, + { + "epoch": 0.5620020429009193, + "grad_norm": 1.5015004024018868, + "learning_rate": 8.484034411167611e-06, + "loss": 0.7739, + "step": 5502 + }, + { + "epoch": 0.5621041879468845, + "grad_norm": 1.4648735723152027, + "learning_rate": 8.480764397780435e-06, + "loss": 0.7227, + "step": 5503 + }, + { + "epoch": 0.5622063329928498, + "grad_norm": 1.4295299318741348, + "learning_rate": 8.477494550674576e-06, + "loss": 0.6866, + "step": 5504 + }, + { + "epoch": 0.5623084780388151, + "grad_norm": 1.449077699015867, + "learning_rate": 8.474224870207926e-06, + "loss": 0.742, + "step": 5505 + }, + { + "epoch": 0.5624106230847804, + "grad_norm": 1.4739607128657262, + "learning_rate": 8.470955356738347e-06, + "loss": 0.7125, + "step": 5506 + }, + { + "epoch": 0.5625127681307457, + "grad_norm": 1.4406486107354575, + "learning_rate": 8.467686010623694e-06, + "loss": 0.6818, + "step": 5507 + }, + { + "epoch": 0.562614913176711, + "grad_norm": 1.4616967279507533, + "learning_rate": 8.464416832221797e-06, + "loss": 0.7278, + "step": 5508 + }, + { + "epoch": 0.5627170582226761, + "grad_norm": 1.3173178707164772, + "learning_rate": 8.46114782189047e-06, + "loss": 0.7004, + "step": 5509 + }, + { + "epoch": 0.5628192032686414, + "grad_norm": 1.545110222622579, + "learning_rate": 8.457878979987507e-06, + "loss": 0.7876, + "step": 5510 + }, + { + "epoch": 0.5629213483146067, + "grad_norm": 1.450836591062816, + "learning_rate": 8.454610306870688e-06, + "loss": 0.6435, + "step": 5511 + }, + { + "epoch": 0.563023493360572, + "grad_norm": 1.4712818830344108, + "learning_rate": 8.451341802897764e-06, + "loss": 0.8102, + "step": 5512 + }, + { + "epoch": 0.5631256384065373, + "grad_norm": 1.5437649731579361, + "learning_rate": 8.448073468426483e-06, + "loss": 0.6643, + "step": 5513 + }, + { + "epoch": 0.5632277834525026, + "grad_norm": 1.681499234085543, + "learning_rate": 8.444805303814566e-06, + "loss": 0.6865, + "step": 5514 + }, + { + "epoch": 0.5633299284984679, + "grad_norm": 1.4579460375539743, + "learning_rate": 8.441537309419713e-06, + "loss": 0.7953, + "step": 5515 + }, + { + "epoch": 0.5634320735444331, + "grad_norm": 1.602627341735896, + "learning_rate": 8.438269485599606e-06, + "loss": 0.7554, + "step": 5516 + }, + { + "epoch": 0.5635342185903983, + "grad_norm": 1.5134446688913048, + "learning_rate": 8.435001832711915e-06, + "loss": 0.602, + "step": 5517 + }, + { + "epoch": 0.5636363636363636, + "grad_norm": 1.6286407042271358, + "learning_rate": 8.431734351114285e-06, + "loss": 0.7646, + "step": 5518 + }, + { + "epoch": 0.5637385086823289, + "grad_norm": 1.4950181869379995, + "learning_rate": 8.428467041164341e-06, + "loss": 0.6759, + "step": 5519 + }, + { + "epoch": 0.5638406537282942, + "grad_norm": 1.3760183421170613, + "learning_rate": 8.425199903219693e-06, + "loss": 0.7645, + "step": 5520 + }, + { + "epoch": 0.5639427987742595, + "grad_norm": 1.4446162890346474, + "learning_rate": 8.421932937637936e-06, + "loss": 0.7053, + "step": 5521 + }, + { + "epoch": 0.5640449438202247, + "grad_norm": 1.4756902464276427, + "learning_rate": 8.41866614477664e-06, + "loss": 0.6366, + "step": 5522 + }, + { + "epoch": 0.56414708886619, + "grad_norm": 1.3881349447001143, + "learning_rate": 8.415399524993355e-06, + "loss": 0.6912, + "step": 5523 + }, + { + "epoch": 0.5642492339121553, + "grad_norm": 1.5122236258105692, + "learning_rate": 8.412133078645616e-06, + "loss": 0.7325, + "step": 5524 + }, + { + "epoch": 0.5643513789581205, + "grad_norm": 1.452961049797863, + "learning_rate": 8.408866806090936e-06, + "loss": 0.7303, + "step": 5525 + }, + { + "epoch": 0.5644535240040858, + "grad_norm": 1.263921346814425, + "learning_rate": 8.405600707686815e-06, + "loss": 0.6591, + "step": 5526 + }, + { + "epoch": 0.5645556690500511, + "grad_norm": 1.6093603868663224, + "learning_rate": 8.402334783790722e-06, + "loss": 0.7768, + "step": 5527 + }, + { + "epoch": 0.5646578140960163, + "grad_norm": 1.4359749128475645, + "learning_rate": 8.399069034760119e-06, + "loss": 0.7257, + "step": 5528 + }, + { + "epoch": 0.5647599591419816, + "grad_norm": 1.4561805674047275, + "learning_rate": 8.395803460952448e-06, + "loss": 0.7267, + "step": 5529 + }, + { + "epoch": 0.5648621041879469, + "grad_norm": 1.3748865796071352, + "learning_rate": 8.392538062725123e-06, + "loss": 0.6886, + "step": 5530 + }, + { + "epoch": 0.5649642492339122, + "grad_norm": 1.6115678313747757, + "learning_rate": 8.389272840435548e-06, + "loss": 0.7802, + "step": 5531 + }, + { + "epoch": 0.5650663942798774, + "grad_norm": 1.5146207628384138, + "learning_rate": 8.386007794441098e-06, + "loss": 0.7429, + "step": 5532 + }, + { + "epoch": 0.5651685393258427, + "grad_norm": 1.5020452005872036, + "learning_rate": 8.382742925099141e-06, + "loss": 0.7101, + "step": 5533 + }, + { + "epoch": 0.5652706843718079, + "grad_norm": 1.4195709083650634, + "learning_rate": 8.379478232767014e-06, + "loss": 0.5951, + "step": 5534 + }, + { + "epoch": 0.5653728294177732, + "grad_norm": 1.4541113404505992, + "learning_rate": 8.376213717802042e-06, + "loss": 0.709, + "step": 5535 + }, + { + "epoch": 0.5654749744637385, + "grad_norm": 1.2191889759574968, + "learning_rate": 8.372949380561523e-06, + "loss": 0.8121, + "step": 5536 + }, + { + "epoch": 0.5655771195097038, + "grad_norm": 1.3374943295366444, + "learning_rate": 8.36968522140275e-06, + "loss": 0.7005, + "step": 5537 + }, + { + "epoch": 0.5656792645556691, + "grad_norm": 1.339263273006483, + "learning_rate": 8.366421240682983e-06, + "loss": 0.6471, + "step": 5538 + }, + { + "epoch": 0.5657814096016344, + "grad_norm": 1.5284134081229392, + "learning_rate": 8.363157438759469e-06, + "loss": 0.7397, + "step": 5539 + }, + { + "epoch": 0.5658835546475995, + "grad_norm": 1.4117463498978489, + "learning_rate": 8.359893815989425e-06, + "loss": 0.6237, + "step": 5540 + }, + { + "epoch": 0.5659856996935648, + "grad_norm": 1.4771856313297753, + "learning_rate": 8.356630372730068e-06, + "loss": 0.7695, + "step": 5541 + }, + { + "epoch": 0.5660878447395301, + "grad_norm": 1.4193530252703073, + "learning_rate": 8.353367109338576e-06, + "loss": 0.753, + "step": 5542 + }, + { + "epoch": 0.5661899897854954, + "grad_norm": 1.5988505931947978, + "learning_rate": 8.350104026172118e-06, + "loss": 0.6664, + "step": 5543 + }, + { + "epoch": 0.5662921348314607, + "grad_norm": 1.2970332457595837, + "learning_rate": 8.346841123587836e-06, + "loss": 0.5533, + "step": 5544 + }, + { + "epoch": 0.566394279877426, + "grad_norm": 1.3558863327370871, + "learning_rate": 8.343578401942865e-06, + "loss": 0.7161, + "step": 5545 + }, + { + "epoch": 0.5664964249233913, + "grad_norm": 1.2766845592774723, + "learning_rate": 8.340315861594309e-06, + "loss": 0.6098, + "step": 5546 + }, + { + "epoch": 0.5665985699693565, + "grad_norm": 1.494831160185546, + "learning_rate": 8.337053502899253e-06, + "loss": 0.7157, + "step": 5547 + }, + { + "epoch": 0.5667007150153217, + "grad_norm": 1.3988168347511352, + "learning_rate": 8.333791326214767e-06, + "loss": 0.6124, + "step": 5548 + }, + { + "epoch": 0.566802860061287, + "grad_norm": 1.4021994741444659, + "learning_rate": 8.330529331897895e-06, + "loss": 0.6938, + "step": 5549 + }, + { + "epoch": 0.5669050051072523, + "grad_norm": 1.4701462153986604, + "learning_rate": 8.327267520305669e-06, + "loss": 0.7549, + "step": 5550 + }, + { + "epoch": 0.5670071501532176, + "grad_norm": 1.301337063550517, + "learning_rate": 8.32400589179509e-06, + "loss": 0.601, + "step": 5551 + }, + { + "epoch": 0.5671092951991829, + "grad_norm": 1.5803205715035384, + "learning_rate": 8.320744446723149e-06, + "loss": 0.7899, + "step": 5552 + }, + { + "epoch": 0.5672114402451481, + "grad_norm": 1.332096685267543, + "learning_rate": 8.317483185446815e-06, + "loss": 0.6942, + "step": 5553 + }, + { + "epoch": 0.5673135852911134, + "grad_norm": 1.4207245341357648, + "learning_rate": 8.314222108323033e-06, + "loss": 0.6795, + "step": 5554 + }, + { + "epoch": 0.5674157303370787, + "grad_norm": 1.4870848608321372, + "learning_rate": 8.310961215708731e-06, + "loss": 0.7063, + "step": 5555 + }, + { + "epoch": 0.5675178753830439, + "grad_norm": 1.4859237206022573, + "learning_rate": 8.307700507960817e-06, + "loss": 0.6861, + "step": 5556 + }, + { + "epoch": 0.5676200204290092, + "grad_norm": 1.5191927850781612, + "learning_rate": 8.304439985436172e-06, + "loss": 0.5831, + "step": 5557 + }, + { + "epoch": 0.5677221654749744, + "grad_norm": 1.4209278700463186, + "learning_rate": 8.301179648491669e-06, + "loss": 0.6666, + "step": 5558 + }, + { + "epoch": 0.5678243105209397, + "grad_norm": 1.4714484289948508, + "learning_rate": 8.297919497484148e-06, + "loss": 0.781, + "step": 5559 + }, + { + "epoch": 0.567926455566905, + "grad_norm": 1.3288912110969469, + "learning_rate": 8.294659532770437e-06, + "loss": 0.6737, + "step": 5560 + }, + { + "epoch": 0.5680286006128703, + "grad_norm": 1.419918056426505, + "learning_rate": 8.291399754707346e-06, + "loss": 0.7343, + "step": 5561 + }, + { + "epoch": 0.5681307456588356, + "grad_norm": 1.4191260281212257, + "learning_rate": 8.288140163651652e-06, + "loss": 0.7239, + "step": 5562 + }, + { + "epoch": 0.5682328907048008, + "grad_norm": 1.4969376217971684, + "learning_rate": 8.284880759960128e-06, + "loss": 0.7231, + "step": 5563 + }, + { + "epoch": 0.568335035750766, + "grad_norm": 1.4014358980120416, + "learning_rate": 8.281621543989508e-06, + "loss": 0.6127, + "step": 5564 + }, + { + "epoch": 0.5684371807967313, + "grad_norm": 1.4588840308043796, + "learning_rate": 8.278362516096524e-06, + "loss": 0.67, + "step": 5565 + }, + { + "epoch": 0.5685393258426966, + "grad_norm": 1.534157437069656, + "learning_rate": 8.275103676637872e-06, + "loss": 0.7857, + "step": 5566 + }, + { + "epoch": 0.5686414708886619, + "grad_norm": 1.443239945167279, + "learning_rate": 8.27184502597024e-06, + "loss": 0.7843, + "step": 5567 + }, + { + "epoch": 0.5687436159346272, + "grad_norm": 1.4194112044353462, + "learning_rate": 8.268586564450282e-06, + "loss": 0.7269, + "step": 5568 + }, + { + "epoch": 0.5688457609805925, + "grad_norm": 1.2209366142601779, + "learning_rate": 8.265328292434644e-06, + "loss": 0.5803, + "step": 5569 + }, + { + "epoch": 0.5689479060265578, + "grad_norm": 1.2860033587050994, + "learning_rate": 8.262070210279949e-06, + "loss": 0.6097, + "step": 5570 + }, + { + "epoch": 0.5690500510725229, + "grad_norm": 1.5168856969051356, + "learning_rate": 8.258812318342789e-06, + "loss": 0.7797, + "step": 5571 + }, + { + "epoch": 0.5691521961184882, + "grad_norm": 1.4309638441955725, + "learning_rate": 8.255554616979748e-06, + "loss": 0.581, + "step": 5572 + }, + { + "epoch": 0.5692543411644535, + "grad_norm": 1.556464091114335, + "learning_rate": 8.252297106547382e-06, + "loss": 0.7073, + "step": 5573 + }, + { + "epoch": 0.5693564862104188, + "grad_norm": 1.2960146281569849, + "learning_rate": 8.249039787402224e-06, + "loss": 0.6786, + "step": 5574 + }, + { + "epoch": 0.5694586312563841, + "grad_norm": 1.3832064420949621, + "learning_rate": 8.245782659900796e-06, + "loss": 0.6894, + "step": 5575 + }, + { + "epoch": 0.5695607763023494, + "grad_norm": 1.4977078386221925, + "learning_rate": 8.242525724399584e-06, + "loss": 0.7585, + "step": 5576 + }, + { + "epoch": 0.5696629213483146, + "grad_norm": 1.5602924146920711, + "learning_rate": 8.239268981255069e-06, + "loss": 0.7245, + "step": 5577 + }, + { + "epoch": 0.5697650663942799, + "grad_norm": 1.3789628662314972, + "learning_rate": 8.236012430823703e-06, + "loss": 0.5995, + "step": 5578 + }, + { + "epoch": 0.5698672114402451, + "grad_norm": 1.3379489313129043, + "learning_rate": 8.232756073461915e-06, + "loss": 0.5899, + "step": 5579 + }, + { + "epoch": 0.5699693564862104, + "grad_norm": 1.3440329832094646, + "learning_rate": 8.229499909526117e-06, + "loss": 0.6151, + "step": 5580 + }, + { + "epoch": 0.5700715015321757, + "grad_norm": 1.6416064035886635, + "learning_rate": 8.226243939372698e-06, + "loss": 0.7515, + "step": 5581 + }, + { + "epoch": 0.570173646578141, + "grad_norm": 1.5116545250869746, + "learning_rate": 8.222988163358028e-06, + "loss": 0.6849, + "step": 5582 + }, + { + "epoch": 0.5702757916241062, + "grad_norm": 1.6471497263817951, + "learning_rate": 8.219732581838447e-06, + "loss": 0.7142, + "step": 5583 + }, + { + "epoch": 0.5703779366700715, + "grad_norm": 1.537439691228659, + "learning_rate": 8.216477195170285e-06, + "loss": 0.7651, + "step": 5584 + }, + { + "epoch": 0.5704800817160368, + "grad_norm": 1.4430194669165006, + "learning_rate": 8.21322200370985e-06, + "loss": 0.6656, + "step": 5585 + }, + { + "epoch": 0.5705822267620021, + "grad_norm": 1.4725616895800748, + "learning_rate": 8.209967007813419e-06, + "loss": 0.6001, + "step": 5586 + }, + { + "epoch": 0.5706843718079673, + "grad_norm": 1.4823640667033064, + "learning_rate": 8.206712207837257e-06, + "loss": 0.8307, + "step": 5587 + }, + { + "epoch": 0.5707865168539326, + "grad_norm": 1.537808750675819, + "learning_rate": 8.2034576041376e-06, + "loss": 0.8112, + "step": 5588 + }, + { + "epoch": 0.5708886618998978, + "grad_norm": 1.4653883943868213, + "learning_rate": 8.200203197070673e-06, + "loss": 0.7844, + "step": 5589 + }, + { + "epoch": 0.5709908069458631, + "grad_norm": 1.5254797649357665, + "learning_rate": 8.196948986992667e-06, + "loss": 0.7117, + "step": 5590 + }, + { + "epoch": 0.5710929519918284, + "grad_norm": 1.6050926131275574, + "learning_rate": 8.193694974259759e-06, + "loss": 0.7696, + "step": 5591 + }, + { + "epoch": 0.5711950970377937, + "grad_norm": 1.3376944891028861, + "learning_rate": 8.1904411592281e-06, + "loss": 0.6248, + "step": 5592 + }, + { + "epoch": 0.571297242083759, + "grad_norm": 1.5811762962144547, + "learning_rate": 8.18718754225383e-06, + "loss": 0.6773, + "step": 5593 + }, + { + "epoch": 0.5713993871297242, + "grad_norm": 1.578578112276457, + "learning_rate": 8.183934123693052e-06, + "loss": 0.7359, + "step": 5594 + }, + { + "epoch": 0.5715015321756894, + "grad_norm": 1.4489308227038415, + "learning_rate": 8.18068090390186e-06, + "loss": 0.7416, + "step": 5595 + }, + { + "epoch": 0.5716036772216547, + "grad_norm": 1.416339811888468, + "learning_rate": 8.177427883236316e-06, + "loss": 0.613, + "step": 5596 + }, + { + "epoch": 0.57170582226762, + "grad_norm": 1.456391453880688, + "learning_rate": 8.17417506205247e-06, + "loss": 0.6652, + "step": 5597 + }, + { + "epoch": 0.5718079673135853, + "grad_norm": 1.610523099205285, + "learning_rate": 8.170922440706342e-06, + "loss": 0.6861, + "step": 5598 + }, + { + "epoch": 0.5719101123595506, + "grad_norm": 1.4646832894356274, + "learning_rate": 8.167670019553934e-06, + "loss": 0.6328, + "step": 5599 + }, + { + "epoch": 0.5720122574055159, + "grad_norm": 1.533819052881376, + "learning_rate": 8.164417798951224e-06, + "loss": 0.6548, + "step": 5600 + }, + { + "epoch": 0.5721144024514812, + "grad_norm": 1.6023117174354504, + "learning_rate": 8.161165779254174e-06, + "loss": 0.732, + "step": 5601 + }, + { + "epoch": 0.5722165474974463, + "grad_norm": 1.5340524639347695, + "learning_rate": 8.15791396081872e-06, + "loss": 0.7461, + "step": 5602 + }, + { + "epoch": 0.5723186925434116, + "grad_norm": 1.4974045811774361, + "learning_rate": 8.154662344000769e-06, + "loss": 0.7844, + "step": 5603 + }, + { + "epoch": 0.5724208375893769, + "grad_norm": 1.37114688318297, + "learning_rate": 8.15141092915622e-06, + "loss": 0.7016, + "step": 5604 + }, + { + "epoch": 0.5725229826353422, + "grad_norm": 1.3934491876109605, + "learning_rate": 8.148159716640938e-06, + "loss": 0.6403, + "step": 5605 + }, + { + "epoch": 0.5726251276813075, + "grad_norm": 1.5270139773915146, + "learning_rate": 8.144908706810772e-06, + "loss": 0.6498, + "step": 5606 + }, + { + "epoch": 0.5727272727272728, + "grad_norm": 1.5281764668417457, + "learning_rate": 8.141657900021544e-06, + "loss": 0.7074, + "step": 5607 + }, + { + "epoch": 0.572829417773238, + "grad_norm": 1.4273756275210643, + "learning_rate": 8.13840729662906e-06, + "loss": 0.7398, + "step": 5608 + }, + { + "epoch": 0.5729315628192033, + "grad_norm": 1.5579555294669083, + "learning_rate": 8.135156896989103e-06, + "loss": 0.6671, + "step": 5609 + }, + { + "epoch": 0.5730337078651685, + "grad_norm": 1.517223092753926, + "learning_rate": 8.131906701457427e-06, + "loss": 0.5893, + "step": 5610 + }, + { + "epoch": 0.5731358529111338, + "grad_norm": 1.4562633032354642, + "learning_rate": 8.128656710389769e-06, + "loss": 0.6092, + "step": 5611 + }, + { + "epoch": 0.5732379979570991, + "grad_norm": 1.589556494506836, + "learning_rate": 8.125406924141846e-06, + "loss": 0.7022, + "step": 5612 + }, + { + "epoch": 0.5733401430030644, + "grad_norm": 1.4223103158637547, + "learning_rate": 8.12215734306934e-06, + "loss": 0.807, + "step": 5613 + }, + { + "epoch": 0.5734422880490296, + "grad_norm": 1.436318568788936, + "learning_rate": 8.118907967527933e-06, + "loss": 0.7285, + "step": 5614 + }, + { + "epoch": 0.5735444330949949, + "grad_norm": 1.4105874159738752, + "learning_rate": 8.11565879787326e-06, + "loss": 0.6907, + "step": 5615 + }, + { + "epoch": 0.5736465781409602, + "grad_norm": 1.2552510874679075, + "learning_rate": 8.112409834460949e-06, + "loss": 0.6898, + "step": 5616 + }, + { + "epoch": 0.5737487231869254, + "grad_norm": 1.5381584408876339, + "learning_rate": 8.109161077646602e-06, + "loss": 0.7167, + "step": 5617 + }, + { + "epoch": 0.5738508682328907, + "grad_norm": 1.5725350551623212, + "learning_rate": 8.105912527785797e-06, + "loss": 0.7369, + "step": 5618 + }, + { + "epoch": 0.573953013278856, + "grad_norm": 1.5268710241750048, + "learning_rate": 8.10266418523409e-06, + "loss": 0.7832, + "step": 5619 + }, + { + "epoch": 0.5740551583248212, + "grad_norm": 1.4219351563476499, + "learning_rate": 8.099416050347013e-06, + "loss": 0.7062, + "step": 5620 + }, + { + "epoch": 0.5741573033707865, + "grad_norm": 1.582483450045852, + "learning_rate": 8.09616812348008e-06, + "loss": 0.7934, + "step": 5621 + }, + { + "epoch": 0.5742594484167518, + "grad_norm": 1.5381008434198968, + "learning_rate": 8.092920404988771e-06, + "loss": 0.8042, + "step": 5622 + }, + { + "epoch": 0.5743615934627171, + "grad_norm": 1.428464280513046, + "learning_rate": 8.08967289522856e-06, + "loss": 0.7316, + "step": 5623 + }, + { + "epoch": 0.5744637385086824, + "grad_norm": 1.4431869477896824, + "learning_rate": 8.086425594554882e-06, + "loss": 0.6957, + "step": 5624 + }, + { + "epoch": 0.5745658835546475, + "grad_norm": 1.4477018724438273, + "learning_rate": 8.083178503323156e-06, + "loss": 0.6335, + "step": 5625 + }, + { + "epoch": 0.5746680286006128, + "grad_norm": 1.2804855625032063, + "learning_rate": 8.079931621888788e-06, + "loss": 0.669, + "step": 5626 + }, + { + "epoch": 0.5747701736465781, + "grad_norm": 1.2175355735284705, + "learning_rate": 8.076684950607143e-06, + "loss": 0.5338, + "step": 5627 + }, + { + "epoch": 0.5748723186925434, + "grad_norm": 1.4883113317634589, + "learning_rate": 8.073438489833572e-06, + "loss": 0.6681, + "step": 5628 + }, + { + "epoch": 0.5749744637385087, + "grad_norm": 1.3559301533091768, + "learning_rate": 8.070192239923403e-06, + "loss": 0.7148, + "step": 5629 + }, + { + "epoch": 0.575076608784474, + "grad_norm": 1.404713633374025, + "learning_rate": 8.06694620123194e-06, + "loss": 0.6721, + "step": 5630 + }, + { + "epoch": 0.5751787538304393, + "grad_norm": 1.317258386008676, + "learning_rate": 8.063700374114465e-06, + "loss": 0.5983, + "step": 5631 + }, + { + "epoch": 0.5752808988764045, + "grad_norm": 1.4909351428117925, + "learning_rate": 8.060454758926231e-06, + "loss": 0.7412, + "step": 5632 + }, + { + "epoch": 0.5753830439223697, + "grad_norm": 1.3777339864925924, + "learning_rate": 8.057209356022479e-06, + "loss": 0.7076, + "step": 5633 + }, + { + "epoch": 0.575485188968335, + "grad_norm": 1.5536966310960205, + "learning_rate": 8.05396416575842e-06, + "loss": 0.784, + "step": 5634 + }, + { + "epoch": 0.5755873340143003, + "grad_norm": 1.4816599492126254, + "learning_rate": 8.050719188489238e-06, + "loss": 0.693, + "step": 5635 + }, + { + "epoch": 0.5756894790602656, + "grad_norm": 1.4859062637553826, + "learning_rate": 8.047474424570102e-06, + "loss": 0.7171, + "step": 5636 + }, + { + "epoch": 0.5757916241062309, + "grad_norm": 1.3286036844400726, + "learning_rate": 8.044229874356153e-06, + "loss": 0.6409, + "step": 5637 + }, + { + "epoch": 0.5758937691521961, + "grad_norm": 1.5573625662073798, + "learning_rate": 8.040985538202506e-06, + "loss": 0.7284, + "step": 5638 + }, + { + "epoch": 0.5759959141981614, + "grad_norm": 1.5523779803189943, + "learning_rate": 8.037741416464258e-06, + "loss": 0.7295, + "step": 5639 + }, + { + "epoch": 0.5760980592441267, + "grad_norm": 1.2871225483809485, + "learning_rate": 8.034497509496477e-06, + "loss": 0.69, + "step": 5640 + }, + { + "epoch": 0.5762002042900919, + "grad_norm": 1.4060997871871321, + "learning_rate": 8.031253817654216e-06, + "loss": 0.7999, + "step": 5641 + }, + { + "epoch": 0.5763023493360572, + "grad_norm": 1.4468961881177353, + "learning_rate": 8.028010341292496e-06, + "loss": 0.6694, + "step": 5642 + }, + { + "epoch": 0.5764044943820225, + "grad_norm": 1.6859879145002146, + "learning_rate": 8.02476708076632e-06, + "loss": 0.7437, + "step": 5643 + }, + { + "epoch": 0.5765066394279877, + "grad_norm": 1.539003912969649, + "learning_rate": 8.021524036430662e-06, + "loss": 0.7777, + "step": 5644 + }, + { + "epoch": 0.576608784473953, + "grad_norm": 1.332350836527585, + "learning_rate": 8.018281208640477e-06, + "loss": 0.6686, + "step": 5645 + }, + { + "epoch": 0.5767109295199183, + "grad_norm": 1.4736025610737644, + "learning_rate": 8.015038597750694e-06, + "loss": 0.6822, + "step": 5646 + }, + { + "epoch": 0.5768130745658836, + "grad_norm": 1.4286372783490593, + "learning_rate": 8.011796204116218e-06, + "loss": 0.6398, + "step": 5647 + }, + { + "epoch": 0.5769152196118488, + "grad_norm": 1.4752399836300267, + "learning_rate": 8.00855402809193e-06, + "loss": 0.6808, + "step": 5648 + }, + { + "epoch": 0.5770173646578141, + "grad_norm": 1.460300262926344, + "learning_rate": 8.005312070032693e-06, + "loss": 0.746, + "step": 5649 + }, + { + "epoch": 0.5771195097037793, + "grad_norm": 1.6087117566746612, + "learning_rate": 8.002070330293337e-06, + "loss": 0.7085, + "step": 5650 + }, + { + "epoch": 0.5772216547497446, + "grad_norm": 1.3668984543984017, + "learning_rate": 7.998828809228678e-06, + "loss": 0.6678, + "step": 5651 + }, + { + "epoch": 0.5773237997957099, + "grad_norm": 1.4319599078008105, + "learning_rate": 7.995587507193494e-06, + "loss": 0.6641, + "step": 5652 + }, + { + "epoch": 0.5774259448416752, + "grad_norm": 1.4645757724847759, + "learning_rate": 7.992346424542556e-06, + "loss": 0.6628, + "step": 5653 + }, + { + "epoch": 0.5775280898876405, + "grad_norm": 1.4685932317783474, + "learning_rate": 7.989105561630598e-06, + "loss": 0.7543, + "step": 5654 + }, + { + "epoch": 0.5776302349336058, + "grad_norm": 1.4580521186699307, + "learning_rate": 7.985864918812336e-06, + "loss": 0.6402, + "step": 5655 + }, + { + "epoch": 0.5777323799795709, + "grad_norm": 1.4889346291318515, + "learning_rate": 7.982624496442456e-06, + "loss": 0.6625, + "step": 5656 + }, + { + "epoch": 0.5778345250255362, + "grad_norm": 1.6421527768513642, + "learning_rate": 7.97938429487563e-06, + "loss": 0.7435, + "step": 5657 + }, + { + "epoch": 0.5779366700715015, + "grad_norm": 1.5417131501484904, + "learning_rate": 7.976144314466501e-06, + "loss": 0.6393, + "step": 5658 + }, + { + "epoch": 0.5780388151174668, + "grad_norm": 1.3807027274192551, + "learning_rate": 7.972904555569683e-06, + "loss": 0.7074, + "step": 5659 + }, + { + "epoch": 0.5781409601634321, + "grad_norm": 1.3511251046107053, + "learning_rate": 7.969665018539771e-06, + "loss": 0.6119, + "step": 5660 + }, + { + "epoch": 0.5782431052093974, + "grad_norm": 1.7011235081288032, + "learning_rate": 7.966425703731334e-06, + "loss": 0.9171, + "step": 5661 + }, + { + "epoch": 0.5783452502553627, + "grad_norm": 1.5274559972007, + "learning_rate": 7.96318661149892e-06, + "loss": 0.7116, + "step": 5662 + }, + { + "epoch": 0.5784473953013279, + "grad_norm": 1.466882545657048, + "learning_rate": 7.959947742197047e-06, + "loss": 0.7302, + "step": 5663 + }, + { + "epoch": 0.5785495403472931, + "grad_norm": 1.366570514116782, + "learning_rate": 7.956709096180206e-06, + "loss": 0.6748, + "step": 5664 + }, + { + "epoch": 0.5786516853932584, + "grad_norm": 1.647443402906696, + "learning_rate": 7.953470673802879e-06, + "loss": 0.6963, + "step": 5665 + }, + { + "epoch": 0.5787538304392237, + "grad_norm": 1.5027295314178175, + "learning_rate": 7.95023247541951e-06, + "loss": 0.7173, + "step": 5666 + }, + { + "epoch": 0.578855975485189, + "grad_norm": 1.5274770136651497, + "learning_rate": 7.946994501384518e-06, + "loss": 0.6343, + "step": 5667 + }, + { + "epoch": 0.5789581205311543, + "grad_norm": 1.549228383719072, + "learning_rate": 7.943756752052307e-06, + "loss": 0.7185, + "step": 5668 + }, + { + "epoch": 0.5790602655771195, + "grad_norm": 1.4127040833564875, + "learning_rate": 7.940519227777246e-06, + "loss": 0.7594, + "step": 5669 + }, + { + "epoch": 0.5791624106230848, + "grad_norm": 1.5320751243211286, + "learning_rate": 7.937281928913688e-06, + "loss": 0.6973, + "step": 5670 + }, + { + "epoch": 0.57926455566905, + "grad_norm": 1.561218651689877, + "learning_rate": 7.934044855815955e-06, + "loss": 0.7799, + "step": 5671 + }, + { + "epoch": 0.5793667007150153, + "grad_norm": 1.51958735641364, + "learning_rate": 7.930808008838342e-06, + "loss": 0.61, + "step": 5672 + }, + { + "epoch": 0.5794688457609806, + "grad_norm": 1.5703503778999657, + "learning_rate": 7.927571388335135e-06, + "loss": 0.7601, + "step": 5673 + }, + { + "epoch": 0.5795709908069459, + "grad_norm": 1.4492185089026643, + "learning_rate": 7.924334994660577e-06, + "loss": 0.5953, + "step": 5674 + }, + { + "epoch": 0.5796731358529111, + "grad_norm": 1.3981217725898551, + "learning_rate": 7.921098828168897e-06, + "loss": 0.7107, + "step": 5675 + }, + { + "epoch": 0.5797752808988764, + "grad_norm": 1.5133735062563882, + "learning_rate": 7.917862889214292e-06, + "loss": 0.6741, + "step": 5676 + }, + { + "epoch": 0.5798774259448417, + "grad_norm": 1.4447121674112442, + "learning_rate": 7.91462717815094e-06, + "loss": 0.6765, + "step": 5677 + }, + { + "epoch": 0.579979570990807, + "grad_norm": 1.570056760420568, + "learning_rate": 7.911391695332988e-06, + "loss": 0.8258, + "step": 5678 + }, + { + "epoch": 0.5800817160367722, + "grad_norm": 1.3898283333925954, + "learning_rate": 7.908156441114567e-06, + "loss": 0.7303, + "step": 5679 + }, + { + "epoch": 0.5801838610827375, + "grad_norm": 1.6167578452871458, + "learning_rate": 7.90492141584977e-06, + "loss": 0.7587, + "step": 5680 + }, + { + "epoch": 0.5802860061287027, + "grad_norm": 1.372384946572026, + "learning_rate": 7.901686619892685e-06, + "loss": 0.585, + "step": 5681 + }, + { + "epoch": 0.580388151174668, + "grad_norm": 1.5623582957132773, + "learning_rate": 7.898452053597349e-06, + "loss": 0.71, + "step": 5682 + }, + { + "epoch": 0.5804902962206333, + "grad_norm": 1.4270688781977934, + "learning_rate": 7.895217717317798e-06, + "loss": 0.673, + "step": 5683 + }, + { + "epoch": 0.5805924412665986, + "grad_norm": 1.464707999415032, + "learning_rate": 7.891983611408026e-06, + "loss": 0.7398, + "step": 5684 + }, + { + "epoch": 0.5806945863125639, + "grad_norm": 1.4235345217358386, + "learning_rate": 7.88874973622201e-06, + "loss": 0.778, + "step": 5685 + }, + { + "epoch": 0.5807967313585292, + "grad_norm": 1.5200852314676636, + "learning_rate": 7.8855160921137e-06, + "loss": 0.7258, + "step": 5686 + }, + { + "epoch": 0.5808988764044943, + "grad_norm": 1.3818079001734629, + "learning_rate": 7.88228267943702e-06, + "loss": 0.5681, + "step": 5687 + }, + { + "epoch": 0.5810010214504596, + "grad_norm": 1.493462708442748, + "learning_rate": 7.879049498545864e-06, + "loss": 0.7186, + "step": 5688 + }, + { + "epoch": 0.5811031664964249, + "grad_norm": 1.5894152323926831, + "learning_rate": 7.875816549794113e-06, + "loss": 0.7169, + "step": 5689 + }, + { + "epoch": 0.5812053115423902, + "grad_norm": 1.5425002409969844, + "learning_rate": 7.872583833535616e-06, + "loss": 0.8022, + "step": 5690 + }, + { + "epoch": 0.5813074565883555, + "grad_norm": 1.474689486914503, + "learning_rate": 7.86935135012419e-06, + "loss": 0.7248, + "step": 5691 + }, + { + "epoch": 0.5814096016343208, + "grad_norm": 1.402083891475651, + "learning_rate": 7.866119099913638e-06, + "loss": 0.5782, + "step": 5692 + }, + { + "epoch": 0.581511746680286, + "grad_norm": 1.4577953624324582, + "learning_rate": 7.862887083257728e-06, + "loss": 0.6844, + "step": 5693 + }, + { + "epoch": 0.5816138917262513, + "grad_norm": 1.4292870536491014, + "learning_rate": 7.859655300510209e-06, + "loss": 0.7003, + "step": 5694 + }, + { + "epoch": 0.5817160367722165, + "grad_norm": 1.44065877597086, + "learning_rate": 7.856423752024798e-06, + "loss": 0.6851, + "step": 5695 + }, + { + "epoch": 0.5818181818181818, + "grad_norm": 1.3317982467231464, + "learning_rate": 7.85319243815519e-06, + "loss": 0.6063, + "step": 5696 + }, + { + "epoch": 0.5819203268641471, + "grad_norm": 1.50295511294295, + "learning_rate": 7.84996135925506e-06, + "loss": 0.7441, + "step": 5697 + }, + { + "epoch": 0.5820224719101124, + "grad_norm": 1.3623730636270392, + "learning_rate": 7.846730515678047e-06, + "loss": 0.7867, + "step": 5698 + }, + { + "epoch": 0.5821246169560776, + "grad_norm": 1.375922109924535, + "learning_rate": 7.843499907777772e-06, + "loss": 0.551, + "step": 5699 + }, + { + "epoch": 0.5822267620020429, + "grad_norm": 1.4527912164592507, + "learning_rate": 7.840269535907826e-06, + "loss": 0.7223, + "step": 5700 + }, + { + "epoch": 0.5823289070480082, + "grad_norm": 1.4786742798360644, + "learning_rate": 7.837039400421773e-06, + "loss": 0.71, + "step": 5701 + }, + { + "epoch": 0.5824310520939734, + "grad_norm": 1.3067269156803258, + "learning_rate": 7.833809501673155e-06, + "loss": 0.6546, + "step": 5702 + }, + { + "epoch": 0.5825331971399387, + "grad_norm": 1.3647659121303959, + "learning_rate": 7.830579840015486e-06, + "loss": 0.6541, + "step": 5703 + }, + { + "epoch": 0.582635342185904, + "grad_norm": 1.4445183638176506, + "learning_rate": 7.827350415802254e-06, + "loss": 0.7103, + "step": 5704 + }, + { + "epoch": 0.5827374872318692, + "grad_norm": 1.4962103606159176, + "learning_rate": 7.824121229386925e-06, + "loss": 0.7842, + "step": 5705 + }, + { + "epoch": 0.5828396322778345, + "grad_norm": 1.4182254129509393, + "learning_rate": 7.820892281122932e-06, + "loss": 0.6714, + "step": 5706 + }, + { + "epoch": 0.5829417773237998, + "grad_norm": 1.4270658988553722, + "learning_rate": 7.81766357136369e-06, + "loss": 0.6168, + "step": 5707 + }, + { + "epoch": 0.5830439223697651, + "grad_norm": 1.3658979392186075, + "learning_rate": 7.814435100462576e-06, + "loss": 0.6913, + "step": 5708 + }, + { + "epoch": 0.5831460674157304, + "grad_norm": 1.574813197761062, + "learning_rate": 7.811206868772956e-06, + "loss": 0.6508, + "step": 5709 + }, + { + "epoch": 0.5832482124616956, + "grad_norm": 1.3890903426933534, + "learning_rate": 7.807978876648155e-06, + "loss": 0.662, + "step": 5710 + }, + { + "epoch": 0.5833503575076608, + "grad_norm": 1.288814289744943, + "learning_rate": 7.804751124441486e-06, + "loss": 0.7052, + "step": 5711 + }, + { + "epoch": 0.5834525025536261, + "grad_norm": 1.424970927454983, + "learning_rate": 7.801523612506219e-06, + "loss": 0.62, + "step": 5712 + }, + { + "epoch": 0.5835546475995914, + "grad_norm": 1.3897015927427192, + "learning_rate": 7.798296341195615e-06, + "loss": 0.7511, + "step": 5713 + }, + { + "epoch": 0.5836567926455567, + "grad_norm": 1.4849355068113859, + "learning_rate": 7.7950693108629e-06, + "loss": 0.7324, + "step": 5714 + }, + { + "epoch": 0.583758937691522, + "grad_norm": 1.4632220767221882, + "learning_rate": 7.791842521861273e-06, + "loss": 0.6898, + "step": 5715 + }, + { + "epoch": 0.5838610827374873, + "grad_norm": 1.486605349852115, + "learning_rate": 7.788615974543911e-06, + "loss": 0.6347, + "step": 5716 + }, + { + "epoch": 0.5839632277834526, + "grad_norm": 1.4947044668385794, + "learning_rate": 7.785389669263959e-06, + "loss": 0.7894, + "step": 5717 + }, + { + "epoch": 0.5840653728294177, + "grad_norm": 1.418817451071997, + "learning_rate": 7.782163606374536e-06, + "loss": 0.7046, + "step": 5718 + }, + { + "epoch": 0.584167517875383, + "grad_norm": 1.477788347568906, + "learning_rate": 7.778937786228742e-06, + "loss": 0.7997, + "step": 5719 + }, + { + "epoch": 0.5842696629213483, + "grad_norm": 1.5241703170172283, + "learning_rate": 7.775712209179638e-06, + "loss": 0.618, + "step": 5720 + }, + { + "epoch": 0.5843718079673136, + "grad_norm": 1.3768086295740194, + "learning_rate": 7.772486875580272e-06, + "loss": 0.7101, + "step": 5721 + }, + { + "epoch": 0.5844739530132789, + "grad_norm": 1.565074841950299, + "learning_rate": 7.769261785783658e-06, + "loss": 0.6457, + "step": 5722 + }, + { + "epoch": 0.5845760980592442, + "grad_norm": 1.5386399844653846, + "learning_rate": 7.76603694014278e-06, + "loss": 0.7032, + "step": 5723 + }, + { + "epoch": 0.5846782431052094, + "grad_norm": 1.4017717932230207, + "learning_rate": 7.762812339010606e-06, + "loss": 0.6271, + "step": 5724 + }, + { + "epoch": 0.5847803881511746, + "grad_norm": 1.4419555308996117, + "learning_rate": 7.759587982740064e-06, + "loss": 0.7152, + "step": 5725 + }, + { + "epoch": 0.5848825331971399, + "grad_norm": 1.6202475039226216, + "learning_rate": 7.756363871684067e-06, + "loss": 0.6838, + "step": 5726 + }, + { + "epoch": 0.5849846782431052, + "grad_norm": 1.5656159122736155, + "learning_rate": 7.753140006195492e-06, + "loss": 0.7997, + "step": 5727 + }, + { + "epoch": 0.5850868232890705, + "grad_norm": 1.4247788608531322, + "learning_rate": 7.749916386627192e-06, + "loss": 0.6514, + "step": 5728 + }, + { + "epoch": 0.5851889683350358, + "grad_norm": 1.4193072615235633, + "learning_rate": 7.746693013332003e-06, + "loss": 0.6873, + "step": 5729 + }, + { + "epoch": 0.585291113381001, + "grad_norm": 1.4838095393907869, + "learning_rate": 7.743469886662715e-06, + "loss": 0.6546, + "step": 5730 + }, + { + "epoch": 0.5853932584269663, + "grad_norm": 1.4479339088576935, + "learning_rate": 7.74024700697211e-06, + "loss": 0.7141, + "step": 5731 + }, + { + "epoch": 0.5854954034729316, + "grad_norm": 1.363378970493064, + "learning_rate": 7.737024374612928e-06, + "loss": 0.6987, + "step": 5732 + }, + { + "epoch": 0.5855975485188968, + "grad_norm": 1.427744650188317, + "learning_rate": 7.733801989937892e-06, + "loss": 0.6942, + "step": 5733 + }, + { + "epoch": 0.5856996935648621, + "grad_norm": 1.4391906172074227, + "learning_rate": 7.730579853299691e-06, + "loss": 0.7181, + "step": 5734 + }, + { + "epoch": 0.5858018386108274, + "grad_norm": 1.4363305326814972, + "learning_rate": 7.72735796505099e-06, + "loss": 0.7135, + "step": 5735 + }, + { + "epoch": 0.5859039836567926, + "grad_norm": 1.46357004625461, + "learning_rate": 7.724136325544426e-06, + "loss": 0.6862, + "step": 5736 + }, + { + "epoch": 0.5860061287027579, + "grad_norm": 1.2960813380500738, + "learning_rate": 7.720914935132618e-06, + "loss": 0.7029, + "step": 5737 + }, + { + "epoch": 0.5861082737487232, + "grad_norm": 1.5072538710238959, + "learning_rate": 7.717693794168137e-06, + "loss": 0.7662, + "step": 5738 + }, + { + "epoch": 0.5862104187946885, + "grad_norm": 1.5029503113115303, + "learning_rate": 7.71447290300355e-06, + "loss": 0.7585, + "step": 5739 + }, + { + "epoch": 0.5863125638406538, + "grad_norm": 1.4023798565232715, + "learning_rate": 7.711252261991376e-06, + "loss": 0.6887, + "step": 5740 + }, + { + "epoch": 0.586414708886619, + "grad_norm": 1.3173696103105035, + "learning_rate": 7.708031871484123e-06, + "loss": 0.7161, + "step": 5741 + }, + { + "epoch": 0.5865168539325842, + "grad_norm": 1.42169585958349, + "learning_rate": 7.70481173183426e-06, + "loss": 0.7269, + "step": 5742 + }, + { + "epoch": 0.5866189989785495, + "grad_norm": 1.4560680288781076, + "learning_rate": 7.70159184339424e-06, + "loss": 0.741, + "step": 5743 + }, + { + "epoch": 0.5867211440245148, + "grad_norm": 1.4816116590415376, + "learning_rate": 7.698372206516472e-06, + "loss": 0.7315, + "step": 5744 + }, + { + "epoch": 0.5868232890704801, + "grad_norm": 1.4312738855062197, + "learning_rate": 7.695152821553355e-06, + "loss": 0.7005, + "step": 5745 + }, + { + "epoch": 0.5869254341164454, + "grad_norm": 1.4659484310979962, + "learning_rate": 7.691933688857254e-06, + "loss": 0.774, + "step": 5746 + }, + { + "epoch": 0.5870275791624107, + "grad_norm": 1.367307255945504, + "learning_rate": 7.6887148087805e-06, + "loss": 0.6445, + "step": 5747 + }, + { + "epoch": 0.587129724208376, + "grad_norm": 1.5573391282646336, + "learning_rate": 7.685496181675405e-06, + "loss": 0.7286, + "step": 5748 + }, + { + "epoch": 0.5872318692543411, + "grad_norm": 1.388106700766112, + "learning_rate": 7.682277807894246e-06, + "loss": 0.5863, + "step": 5749 + }, + { + "epoch": 0.5873340143003064, + "grad_norm": 1.4125534021288657, + "learning_rate": 7.679059687789281e-06, + "loss": 0.6085, + "step": 5750 + }, + { + "epoch": 0.5874361593462717, + "grad_norm": 1.48549400081952, + "learning_rate": 7.675841821712731e-06, + "loss": 0.7007, + "step": 5751 + }, + { + "epoch": 0.587538304392237, + "grad_norm": 1.431591080380305, + "learning_rate": 7.672624210016792e-06, + "loss": 0.7538, + "step": 5752 + }, + { + "epoch": 0.5876404494382023, + "grad_norm": 1.5095338713887558, + "learning_rate": 7.66940685305364e-06, + "loss": 0.7852, + "step": 5753 + }, + { + "epoch": 0.5877425944841675, + "grad_norm": 1.5579880489233626, + "learning_rate": 7.666189751175414e-06, + "loss": 0.6927, + "step": 5754 + }, + { + "epoch": 0.5878447395301328, + "grad_norm": 1.4907030530031022, + "learning_rate": 7.662972904734227e-06, + "loss": 0.6848, + "step": 5755 + }, + { + "epoch": 0.587946884576098, + "grad_norm": 1.4234210016742586, + "learning_rate": 7.659756314082167e-06, + "loss": 0.7078, + "step": 5756 + }, + { + "epoch": 0.5880490296220633, + "grad_norm": 1.5315874608161262, + "learning_rate": 7.65653997957129e-06, + "loss": 0.7393, + "step": 5757 + }, + { + "epoch": 0.5881511746680286, + "grad_norm": 1.2921813577920969, + "learning_rate": 7.653323901553625e-06, + "loss": 0.6086, + "step": 5758 + }, + { + "epoch": 0.5882533197139939, + "grad_norm": 1.5021323272474654, + "learning_rate": 7.650108080381175e-06, + "loss": 0.7231, + "step": 5759 + }, + { + "epoch": 0.5883554647599591, + "grad_norm": 1.3350425349220059, + "learning_rate": 7.646892516405911e-06, + "loss": 0.7548, + "step": 5760 + }, + { + "epoch": 0.5884576098059244, + "grad_norm": 1.3695159320305166, + "learning_rate": 7.643677209979788e-06, + "loss": 0.6397, + "step": 5761 + }, + { + "epoch": 0.5885597548518897, + "grad_norm": 1.407791823542624, + "learning_rate": 7.640462161454712e-06, + "loss": 0.7314, + "step": 5762 + }, + { + "epoch": 0.588661899897855, + "grad_norm": 1.4864092848524708, + "learning_rate": 7.637247371182579e-06, + "loss": 0.7112, + "step": 5763 + }, + { + "epoch": 0.5887640449438202, + "grad_norm": 1.406465018365304, + "learning_rate": 7.634032839515246e-06, + "loss": 0.6085, + "step": 5764 + }, + { + "epoch": 0.5888661899897855, + "grad_norm": 1.548813137918193, + "learning_rate": 7.63081856680455e-06, + "loss": 0.7779, + "step": 5765 + }, + { + "epoch": 0.5889683350357507, + "grad_norm": 1.4307257018883277, + "learning_rate": 7.627604553402291e-06, + "loss": 0.6931, + "step": 5766 + }, + { + "epoch": 0.589070480081716, + "grad_norm": 1.483409059446225, + "learning_rate": 7.624390799660248e-06, + "loss": 0.7135, + "step": 5767 + }, + { + "epoch": 0.5891726251276813, + "grad_norm": 1.5442503422502547, + "learning_rate": 7.621177305930162e-06, + "loss": 0.7104, + "step": 5768 + }, + { + "epoch": 0.5892747701736466, + "grad_norm": 1.4097107421758208, + "learning_rate": 7.6179640725637596e-06, + "loss": 0.7218, + "step": 5769 + }, + { + "epoch": 0.5893769152196119, + "grad_norm": 1.5448612925622855, + "learning_rate": 7.61475109991273e-06, + "loss": 0.7752, + "step": 5770 + }, + { + "epoch": 0.5894790602655772, + "grad_norm": 1.5660689682465545, + "learning_rate": 7.611538388328734e-06, + "loss": 0.7952, + "step": 5771 + }, + { + "epoch": 0.5895812053115423, + "grad_norm": 1.5737212085221757, + "learning_rate": 7.608325938163402e-06, + "loss": 0.7108, + "step": 5772 + }, + { + "epoch": 0.5896833503575076, + "grad_norm": 1.675708624857963, + "learning_rate": 7.605113749768344e-06, + "loss": 0.7589, + "step": 5773 + }, + { + "epoch": 0.5897854954034729, + "grad_norm": 1.4736430180881663, + "learning_rate": 7.60190182349513e-06, + "loss": 0.7185, + "step": 5774 + }, + { + "epoch": 0.5898876404494382, + "grad_norm": 1.479642262455952, + "learning_rate": 7.598690159695314e-06, + "loss": 0.6768, + "step": 5775 + }, + { + "epoch": 0.5899897854954035, + "grad_norm": 1.5333942808553918, + "learning_rate": 7.595478758720407e-06, + "loss": 0.7368, + "step": 5776 + }, + { + "epoch": 0.5900919305413688, + "grad_norm": 1.5949677671150675, + "learning_rate": 7.5922676209219056e-06, + "loss": 0.6986, + "step": 5777 + }, + { + "epoch": 0.590194075587334, + "grad_norm": 1.4054217225454153, + "learning_rate": 7.589056746651271e-06, + "loss": 0.5658, + "step": 5778 + }, + { + "epoch": 0.5902962206332993, + "grad_norm": 1.4962813120257945, + "learning_rate": 7.5858461362599315e-06, + "loss": 0.6529, + "step": 5779 + }, + { + "epoch": 0.5903983656792645, + "grad_norm": 1.4521759411565687, + "learning_rate": 7.582635790099293e-06, + "loss": 0.7192, + "step": 5780 + }, + { + "epoch": 0.5905005107252298, + "grad_norm": 1.4617757146894208, + "learning_rate": 7.5794257085207265e-06, + "loss": 0.6485, + "step": 5781 + }, + { + "epoch": 0.5906026557711951, + "grad_norm": 1.3946528316804043, + "learning_rate": 7.5762158918755844e-06, + "loss": 0.6889, + "step": 5782 + }, + { + "epoch": 0.5907048008171604, + "grad_norm": 1.7392021232281059, + "learning_rate": 7.5730063405151755e-06, + "loss": 0.7305, + "step": 5783 + }, + { + "epoch": 0.5908069458631257, + "grad_norm": 1.476686831978223, + "learning_rate": 7.569797054790789e-06, + "loss": 0.802, + "step": 5784 + }, + { + "epoch": 0.5909090909090909, + "grad_norm": 1.5975339102967892, + "learning_rate": 7.566588035053688e-06, + "loss": 0.7294, + "step": 5785 + }, + { + "epoch": 0.5910112359550562, + "grad_norm": 1.5180944667469816, + "learning_rate": 7.563379281655098e-06, + "loss": 0.6873, + "step": 5786 + }, + { + "epoch": 0.5911133810010214, + "grad_norm": 1.3952954998488591, + "learning_rate": 7.560170794946221e-06, + "loss": 0.6587, + "step": 5787 + }, + { + "epoch": 0.5912155260469867, + "grad_norm": 2.5466930445511706, + "learning_rate": 7.5569625752782276e-06, + "loss": 0.7983, + "step": 5788 + }, + { + "epoch": 0.591317671092952, + "grad_norm": 1.5079319659720216, + "learning_rate": 7.553754623002256e-06, + "loss": 0.7015, + "step": 5789 + }, + { + "epoch": 0.5914198161389173, + "grad_norm": 1.2701662170402548, + "learning_rate": 7.550546938469424e-06, + "loss": 0.6452, + "step": 5790 + }, + { + "epoch": 0.5915219611848825, + "grad_norm": 1.4320667155022613, + "learning_rate": 7.54733952203081e-06, + "loss": 0.628, + "step": 5791 + }, + { + "epoch": 0.5916241062308478, + "grad_norm": 1.3872358715113562, + "learning_rate": 7.544132374037467e-06, + "loss": 0.6578, + "step": 5792 + }, + { + "epoch": 0.5917262512768131, + "grad_norm": 1.562221409030555, + "learning_rate": 7.540925494840427e-06, + "loss": 0.7384, + "step": 5793 + }, + { + "epoch": 0.5918283963227784, + "grad_norm": 1.3129676870599571, + "learning_rate": 7.537718884790679e-06, + "loss": 0.6119, + "step": 5794 + }, + { + "epoch": 0.5919305413687436, + "grad_norm": 1.5074772944146426, + "learning_rate": 7.534512544239192e-06, + "loss": 0.6569, + "step": 5795 + }, + { + "epoch": 0.5920326864147089, + "grad_norm": 1.6234323581300998, + "learning_rate": 7.531306473536897e-06, + "loss": 0.6378, + "step": 5796 + }, + { + "epoch": 0.5921348314606741, + "grad_norm": 1.4546561105750946, + "learning_rate": 7.5281006730347065e-06, + "loss": 0.7374, + "step": 5797 + }, + { + "epoch": 0.5922369765066394, + "grad_norm": 1.5629803179138624, + "learning_rate": 7.524895143083491e-06, + "loss": 0.7448, + "step": 5798 + }, + { + "epoch": 0.5923391215526047, + "grad_norm": 1.491121890942575, + "learning_rate": 7.521689884034104e-06, + "loss": 0.6947, + "step": 5799 + }, + { + "epoch": 0.59244126659857, + "grad_norm": 1.4620494522018919, + "learning_rate": 7.518484896237356e-06, + "loss": 0.6123, + "step": 5800 + }, + { + "epoch": 0.5925434116445353, + "grad_norm": 1.5181010734176568, + "learning_rate": 7.515280180044041e-06, + "loss": 0.7676, + "step": 5801 + }, + { + "epoch": 0.5926455566905006, + "grad_norm": 1.5533313086862486, + "learning_rate": 7.512075735804919e-06, + "loss": 0.7942, + "step": 5802 + }, + { + "epoch": 0.5927477017364657, + "grad_norm": 1.2923887603955424, + "learning_rate": 7.508871563870712e-06, + "loss": 0.5818, + "step": 5803 + }, + { + "epoch": 0.592849846782431, + "grad_norm": 1.4557670720280915, + "learning_rate": 7.505667664592125e-06, + "loss": 0.7453, + "step": 5804 + }, + { + "epoch": 0.5929519918283963, + "grad_norm": 1.4093005374900336, + "learning_rate": 7.502464038319822e-06, + "loss": 0.6805, + "step": 5805 + }, + { + "epoch": 0.5930541368743616, + "grad_norm": 1.4220418970001532, + "learning_rate": 7.499260685404443e-06, + "loss": 0.7014, + "step": 5806 + }, + { + "epoch": 0.5931562819203269, + "grad_norm": 1.502801408614704, + "learning_rate": 7.496057606196599e-06, + "loss": 0.6548, + "step": 5807 + }, + { + "epoch": 0.5932584269662922, + "grad_norm": 1.4601744284068825, + "learning_rate": 7.4928548010468635e-06, + "loss": 0.6828, + "step": 5808 + }, + { + "epoch": 0.5933605720122574, + "grad_norm": 1.4496218637181033, + "learning_rate": 7.489652270305792e-06, + "loss": 0.6693, + "step": 5809 + }, + { + "epoch": 0.5934627170582226, + "grad_norm": 1.5806909622590437, + "learning_rate": 7.486450014323905e-06, + "loss": 0.7726, + "step": 5810 + }, + { + "epoch": 0.5935648621041879, + "grad_norm": 1.3492198744988257, + "learning_rate": 7.483248033451684e-06, + "loss": 0.7009, + "step": 5811 + }, + { + "epoch": 0.5936670071501532, + "grad_norm": 1.5419031075159988, + "learning_rate": 7.4800463280395944e-06, + "loss": 0.724, + "step": 5812 + }, + { + "epoch": 0.5937691521961185, + "grad_norm": 1.406955428244615, + "learning_rate": 7.47684489843806e-06, + "loss": 0.639, + "step": 5813 + }, + { + "epoch": 0.5938712972420838, + "grad_norm": 1.4551558255379338, + "learning_rate": 7.473643744997483e-06, + "loss": 0.7321, + "step": 5814 + }, + { + "epoch": 0.593973442288049, + "grad_norm": 1.491777370028234, + "learning_rate": 7.470442868068231e-06, + "loss": 0.7593, + "step": 5815 + }, + { + "epoch": 0.5940755873340143, + "grad_norm": 1.4138634018268215, + "learning_rate": 7.467242268000636e-06, + "loss": 0.7877, + "step": 5816 + }, + { + "epoch": 0.5941777323799796, + "grad_norm": 1.4756494480856852, + "learning_rate": 7.464041945145017e-06, + "loss": 0.708, + "step": 5817 + }, + { + "epoch": 0.5942798774259448, + "grad_norm": 1.5069085261007462, + "learning_rate": 7.460841899851643e-06, + "loss": 0.7937, + "step": 5818 + }, + { + "epoch": 0.5943820224719101, + "grad_norm": 1.4658054590771001, + "learning_rate": 7.457642132470766e-06, + "loss": 0.7068, + "step": 5819 + }, + { + "epoch": 0.5944841675178754, + "grad_norm": 1.4350850426497235, + "learning_rate": 7.454442643352599e-06, + "loss": 0.7412, + "step": 5820 + }, + { + "epoch": 0.5945863125638406, + "grad_norm": 1.695287321265646, + "learning_rate": 7.45124343284733e-06, + "loss": 0.6436, + "step": 5821 + }, + { + "epoch": 0.5946884576098059, + "grad_norm": 1.5581514956614742, + "learning_rate": 7.448044501305114e-06, + "loss": 0.6489, + "step": 5822 + }, + { + "epoch": 0.5947906026557712, + "grad_norm": 1.456584404414506, + "learning_rate": 7.444845849076075e-06, + "loss": 0.7394, + "step": 5823 + }, + { + "epoch": 0.5948927477017365, + "grad_norm": 1.5271384689288559, + "learning_rate": 7.441647476510305e-06, + "loss": 0.7278, + "step": 5824 + }, + { + "epoch": 0.5949948927477018, + "grad_norm": 1.3904740897323202, + "learning_rate": 7.438449383957877e-06, + "loss": 0.7813, + "step": 5825 + }, + { + "epoch": 0.595097037793667, + "grad_norm": 1.5797853566570927, + "learning_rate": 7.435251571768817e-06, + "loss": 0.8489, + "step": 5826 + }, + { + "epoch": 0.5951991828396322, + "grad_norm": 1.4532963611050633, + "learning_rate": 7.432054040293131e-06, + "loss": 0.6717, + "step": 5827 + }, + { + "epoch": 0.5953013278855975, + "grad_norm": 1.3227154822064804, + "learning_rate": 7.428856789880787e-06, + "loss": 0.6837, + "step": 5828 + }, + { + "epoch": 0.5954034729315628, + "grad_norm": 1.3784683925882522, + "learning_rate": 7.425659820881732e-06, + "loss": 0.6048, + "step": 5829 + }, + { + "epoch": 0.5955056179775281, + "grad_norm": 1.4804097094920576, + "learning_rate": 7.42246313364587e-06, + "loss": 0.7372, + "step": 5830 + }, + { + "epoch": 0.5956077630234934, + "grad_norm": 1.4097054303879506, + "learning_rate": 7.419266728523084e-06, + "loss": 0.721, + "step": 5831 + }, + { + "epoch": 0.5957099080694587, + "grad_norm": 1.4773703195568362, + "learning_rate": 7.416070605863219e-06, + "loss": 0.6545, + "step": 5832 + }, + { + "epoch": 0.595812053115424, + "grad_norm": 1.6724671679688483, + "learning_rate": 7.412874766016099e-06, + "loss": 0.7634, + "step": 5833 + }, + { + "epoch": 0.5959141981613891, + "grad_norm": 1.5530302740637776, + "learning_rate": 7.409679209331507e-06, + "loss": 0.7557, + "step": 5834 + }, + { + "epoch": 0.5960163432073544, + "grad_norm": 1.5128275772744901, + "learning_rate": 7.406483936159199e-06, + "loss": 0.6323, + "step": 5835 + }, + { + "epoch": 0.5961184882533197, + "grad_norm": 1.4496759319084689, + "learning_rate": 7.403288946848901e-06, + "loss": 0.7238, + "step": 5836 + }, + { + "epoch": 0.596220633299285, + "grad_norm": 1.3898133737338034, + "learning_rate": 7.400094241750303e-06, + "loss": 0.7051, + "step": 5837 + }, + { + "epoch": 0.5963227783452503, + "grad_norm": 1.5120177333362457, + "learning_rate": 7.396899821213072e-06, + "loss": 0.6855, + "step": 5838 + }, + { + "epoch": 0.5964249233912156, + "grad_norm": 1.450229976850798, + "learning_rate": 7.393705685586838e-06, + "loss": 0.666, + "step": 5839 + }, + { + "epoch": 0.5965270684371808, + "grad_norm": 1.4057197508075616, + "learning_rate": 7.390511835221199e-06, + "loss": 0.7484, + "step": 5840 + }, + { + "epoch": 0.596629213483146, + "grad_norm": 1.5479776705699844, + "learning_rate": 7.387318270465722e-06, + "loss": 0.733, + "step": 5841 + }, + { + "epoch": 0.5967313585291113, + "grad_norm": 1.4607955555803671, + "learning_rate": 7.384124991669954e-06, + "loss": 0.6595, + "step": 5842 + }, + { + "epoch": 0.5968335035750766, + "grad_norm": 1.3757148862539543, + "learning_rate": 7.380931999183394e-06, + "loss": 0.5806, + "step": 5843 + }, + { + "epoch": 0.5969356486210419, + "grad_norm": 1.564140087412455, + "learning_rate": 7.377739293355519e-06, + "loss": 0.7638, + "step": 5844 + }, + { + "epoch": 0.5970377936670072, + "grad_norm": 1.4236940859507847, + "learning_rate": 7.374546874535771e-06, + "loss": 0.7112, + "step": 5845 + }, + { + "epoch": 0.5971399387129724, + "grad_norm": 1.4786527251152497, + "learning_rate": 7.371354743073567e-06, + "loss": 0.6516, + "step": 5846 + }, + { + "epoch": 0.5972420837589377, + "grad_norm": 1.480869051169739, + "learning_rate": 7.368162899318282e-06, + "loss": 0.6439, + "step": 5847 + }, + { + "epoch": 0.597344228804903, + "grad_norm": 1.426462840529589, + "learning_rate": 7.36497134361927e-06, + "loss": 0.7216, + "step": 5848 + }, + { + "epoch": 0.5974463738508682, + "grad_norm": 1.2904536840636613, + "learning_rate": 7.361780076325841e-06, + "loss": 0.6744, + "step": 5849 + }, + { + "epoch": 0.5975485188968335, + "grad_norm": 1.538850862338203, + "learning_rate": 7.3585890977872924e-06, + "loss": 0.6295, + "step": 5850 + }, + { + "epoch": 0.5976506639427988, + "grad_norm": 1.7105226628936505, + "learning_rate": 7.355398408352874e-06, + "loss": 0.6409, + "step": 5851 + }, + { + "epoch": 0.597752808988764, + "grad_norm": 1.5491906068221017, + "learning_rate": 7.352208008371808e-06, + "loss": 0.6745, + "step": 5852 + }, + { + "epoch": 0.5978549540347293, + "grad_norm": 1.4498433771474881, + "learning_rate": 7.349017898193286e-06, + "loss": 0.6423, + "step": 5853 + }, + { + "epoch": 0.5979570990806946, + "grad_norm": 1.3399637374443385, + "learning_rate": 7.345828078166466e-06, + "loss": 0.678, + "step": 5854 + }, + { + "epoch": 0.5980592441266599, + "grad_norm": 1.370888361134912, + "learning_rate": 7.342638548640482e-06, + "loss": 0.7288, + "step": 5855 + }, + { + "epoch": 0.5981613891726252, + "grad_norm": 1.3007171741981092, + "learning_rate": 7.339449309964424e-06, + "loss": 0.6071, + "step": 5856 + }, + { + "epoch": 0.5982635342185904, + "grad_norm": 1.4352387008486323, + "learning_rate": 7.336260362487351e-06, + "loss": 0.6248, + "step": 5857 + }, + { + "epoch": 0.5983656792645556, + "grad_norm": 1.4117526808498664, + "learning_rate": 7.3330717065583105e-06, + "loss": 0.5995, + "step": 5858 + }, + { + "epoch": 0.5984678243105209, + "grad_norm": 1.4328852879476324, + "learning_rate": 7.329883342526293e-06, + "loss": 0.7509, + "step": 5859 + }, + { + "epoch": 0.5985699693564862, + "grad_norm": 1.6384448766209287, + "learning_rate": 7.3266952707402675e-06, + "loss": 0.6928, + "step": 5860 + }, + { + "epoch": 0.5986721144024515, + "grad_norm": 1.3871534334114641, + "learning_rate": 7.323507491549173e-06, + "loss": 0.5918, + "step": 5861 + }, + { + "epoch": 0.5987742594484168, + "grad_norm": 1.5034207825241412, + "learning_rate": 7.320320005301911e-06, + "loss": 0.7017, + "step": 5862 + }, + { + "epoch": 0.5988764044943821, + "grad_norm": 1.5614707879590108, + "learning_rate": 7.317132812347359e-06, + "loss": 0.766, + "step": 5863 + }, + { + "epoch": 0.5989785495403472, + "grad_norm": 1.4148598715368401, + "learning_rate": 7.31394591303435e-06, + "loss": 0.6969, + "step": 5864 + }, + { + "epoch": 0.5990806945863125, + "grad_norm": 1.5643572833666173, + "learning_rate": 7.310759307711697e-06, + "loss": 0.7568, + "step": 5865 + }, + { + "epoch": 0.5991828396322778, + "grad_norm": 1.5113285530337168, + "learning_rate": 7.307572996728177e-06, + "loss": 0.6829, + "step": 5866 + }, + { + "epoch": 0.5992849846782431, + "grad_norm": 1.4630314000231337, + "learning_rate": 7.30438698043253e-06, + "loss": 0.6925, + "step": 5867 + }, + { + "epoch": 0.5993871297242084, + "grad_norm": 1.4150843070869774, + "learning_rate": 7.301201259173474e-06, + "loss": 0.7361, + "step": 5868 + }, + { + "epoch": 0.5994892747701737, + "grad_norm": 1.664065993774351, + "learning_rate": 7.298015833299679e-06, + "loss": 0.7802, + "step": 5869 + }, + { + "epoch": 0.599591419816139, + "grad_norm": 1.6287793720113375, + "learning_rate": 7.2948307031598005e-06, + "loss": 0.7481, + "step": 5870 + }, + { + "epoch": 0.5996935648621042, + "grad_norm": 1.6506500561459407, + "learning_rate": 7.291645869102446e-06, + "loss": 0.847, + "step": 5871 + }, + { + "epoch": 0.5997957099080694, + "grad_norm": 1.4595888537852602, + "learning_rate": 7.288461331476205e-06, + "loss": 0.8376, + "step": 5872 + }, + { + "epoch": 0.5998978549540347, + "grad_norm": 1.343409059190042, + "learning_rate": 7.285277090629617e-06, + "loss": 0.7081, + "step": 5873 + }, + { + "epoch": 0.6, + "grad_norm": 1.5003265921407023, + "learning_rate": 7.282093146911208e-06, + "loss": 0.7311, + "step": 5874 + }, + { + "epoch": 0.6001021450459653, + "grad_norm": 1.4733068011069301, + "learning_rate": 7.278909500669462e-06, + "loss": 0.6455, + "step": 5875 + }, + { + "epoch": 0.6002042900919305, + "grad_norm": 1.3783274711739495, + "learning_rate": 7.27572615225283e-06, + "loss": 0.7031, + "step": 5876 + }, + { + "epoch": 0.6003064351378958, + "grad_norm": 1.435450382534867, + "learning_rate": 7.272543102009728e-06, + "loss": 0.7771, + "step": 5877 + }, + { + "epoch": 0.6004085801838611, + "grad_norm": 1.3584899451012589, + "learning_rate": 7.269360350288548e-06, + "loss": 0.6913, + "step": 5878 + }, + { + "epoch": 0.6005107252298264, + "grad_norm": 1.421750549692087, + "learning_rate": 7.26617789743764e-06, + "loss": 0.6962, + "step": 5879 + }, + { + "epoch": 0.6006128702757916, + "grad_norm": 1.3129733482632258, + "learning_rate": 7.2629957438053285e-06, + "loss": 0.6996, + "step": 5880 + }, + { + "epoch": 0.6007150153217569, + "grad_norm": 1.4372971112290582, + "learning_rate": 7.259813889739897e-06, + "loss": 0.6093, + "step": 5881 + }, + { + "epoch": 0.6008171603677221, + "grad_norm": 1.493073979194371, + "learning_rate": 7.256632335589609e-06, + "loss": 0.7358, + "step": 5882 + }, + { + "epoch": 0.6009193054136874, + "grad_norm": 1.5282753675722793, + "learning_rate": 7.253451081702686e-06, + "loss": 0.7078, + "step": 5883 + }, + { + "epoch": 0.6010214504596527, + "grad_norm": 1.4899207091470166, + "learning_rate": 7.2502701284273145e-06, + "loss": 0.7203, + "step": 5884 + }, + { + "epoch": 0.601123595505618, + "grad_norm": 1.403918949914807, + "learning_rate": 7.247089476111655e-06, + "loss": 0.7449, + "step": 5885 + }, + { + "epoch": 0.6012257405515833, + "grad_norm": 1.334592331410426, + "learning_rate": 7.2439091251038295e-06, + "loss": 0.5974, + "step": 5886 + }, + { + "epoch": 0.6013278855975486, + "grad_norm": 1.334817126493994, + "learning_rate": 7.2407290757519345e-06, + "loss": 0.6539, + "step": 5887 + }, + { + "epoch": 0.6014300306435137, + "grad_norm": 1.5259312321091565, + "learning_rate": 7.237549328404021e-06, + "loss": 0.7872, + "step": 5888 + }, + { + "epoch": 0.601532175689479, + "grad_norm": 1.3974426306369905, + "learning_rate": 7.234369883408116e-06, + "loss": 0.6334, + "step": 5889 + }, + { + "epoch": 0.6016343207354443, + "grad_norm": 1.4541248673278022, + "learning_rate": 7.23119074111222e-06, + "loss": 0.7618, + "step": 5890 + }, + { + "epoch": 0.6017364657814096, + "grad_norm": 1.4514054763066233, + "learning_rate": 7.228011901864283e-06, + "loss": 0.6537, + "step": 5891 + }, + { + "epoch": 0.6018386108273749, + "grad_norm": 1.409127258091539, + "learning_rate": 7.224833366012236e-06, + "loss": 0.6423, + "step": 5892 + }, + { + "epoch": 0.6019407558733402, + "grad_norm": 1.5697394140134555, + "learning_rate": 7.221655133903971e-06, + "loss": 0.7172, + "step": 5893 + }, + { + "epoch": 0.6020429009193055, + "grad_norm": 1.552407886324837, + "learning_rate": 7.218477205887344e-06, + "loss": 0.6877, + "step": 5894 + }, + { + "epoch": 0.6021450459652706, + "grad_norm": 1.5063556753206147, + "learning_rate": 7.215299582310187e-06, + "loss": 0.7238, + "step": 5895 + }, + { + "epoch": 0.6022471910112359, + "grad_norm": 1.3751718840272076, + "learning_rate": 7.212122263520287e-06, + "loss": 0.665, + "step": 5896 + }, + { + "epoch": 0.6023493360572012, + "grad_norm": 1.4101370274030278, + "learning_rate": 7.208945249865404e-06, + "loss": 0.6644, + "step": 5897 + }, + { + "epoch": 0.6024514811031665, + "grad_norm": 1.4362162088640555, + "learning_rate": 7.205768541693271e-06, + "loss": 0.6734, + "step": 5898 + }, + { + "epoch": 0.6025536261491318, + "grad_norm": 1.3280863216388417, + "learning_rate": 7.202592139351574e-06, + "loss": 0.623, + "step": 5899 + }, + { + "epoch": 0.6026557711950971, + "grad_norm": 1.4321336287865873, + "learning_rate": 7.199416043187978e-06, + "loss": 0.6589, + "step": 5900 + }, + { + "epoch": 0.6027579162410623, + "grad_norm": 1.5841810161295078, + "learning_rate": 7.196240253550102e-06, + "loss": 0.7485, + "step": 5901 + }, + { + "epoch": 0.6028600612870276, + "grad_norm": 1.4603486447179619, + "learning_rate": 7.193064770785545e-06, + "loss": 0.6523, + "step": 5902 + }, + { + "epoch": 0.6029622063329928, + "grad_norm": 1.4129413337559316, + "learning_rate": 7.189889595241859e-06, + "loss": 0.6737, + "step": 5903 + }, + { + "epoch": 0.6030643513789581, + "grad_norm": 1.342943263142394, + "learning_rate": 7.186714727266575e-06, + "loss": 0.7003, + "step": 5904 + }, + { + "epoch": 0.6031664964249234, + "grad_norm": 1.345900513431136, + "learning_rate": 7.183540167207179e-06, + "loss": 0.6616, + "step": 5905 + }, + { + "epoch": 0.6032686414708887, + "grad_norm": 1.5392394438778376, + "learning_rate": 7.180365915411131e-06, + "loss": 0.7534, + "step": 5906 + }, + { + "epoch": 0.6033707865168539, + "grad_norm": 1.4282662313286736, + "learning_rate": 7.1771919722258586e-06, + "loss": 0.586, + "step": 5907 + }, + { + "epoch": 0.6034729315628192, + "grad_norm": 1.4230758702527313, + "learning_rate": 7.174018337998747e-06, + "loss": 0.5617, + "step": 5908 + }, + { + "epoch": 0.6035750766087845, + "grad_norm": 1.4602701258456388, + "learning_rate": 7.170845013077156e-06, + "loss": 0.5712, + "step": 5909 + }, + { + "epoch": 0.6036772216547498, + "grad_norm": 1.3836139050315057, + "learning_rate": 7.167671997808406e-06, + "loss": 0.7285, + "step": 5910 + }, + { + "epoch": 0.603779366700715, + "grad_norm": 1.3636126912004216, + "learning_rate": 7.164499292539783e-06, + "loss": 0.6279, + "step": 5911 + }, + { + "epoch": 0.6038815117466803, + "grad_norm": 1.3978859328454079, + "learning_rate": 7.161326897618547e-06, + "loss": 0.6997, + "step": 5912 + }, + { + "epoch": 0.6039836567926455, + "grad_norm": 1.4979157354049246, + "learning_rate": 7.158154813391911e-06, + "loss": 0.7615, + "step": 5913 + }, + { + "epoch": 0.6040858018386108, + "grad_norm": 1.4759052852895458, + "learning_rate": 7.154983040207071e-06, + "loss": 0.6707, + "step": 5914 + }, + { + "epoch": 0.6041879468845761, + "grad_norm": 1.4444922606576425, + "learning_rate": 7.151811578411176e-06, + "loss": 0.7194, + "step": 5915 + }, + { + "epoch": 0.6042900919305414, + "grad_norm": 1.5180061079976073, + "learning_rate": 7.148640428351341e-06, + "loss": 0.6778, + "step": 5916 + }, + { + "epoch": 0.6043922369765067, + "grad_norm": 1.6038497285708615, + "learning_rate": 7.1454695903746575e-06, + "loss": 0.8262, + "step": 5917 + }, + { + "epoch": 0.604494382022472, + "grad_norm": 1.4051256729905608, + "learning_rate": 7.142299064828169e-06, + "loss": 0.7348, + "step": 5918 + }, + { + "epoch": 0.6045965270684371, + "grad_norm": 1.4003392105507957, + "learning_rate": 7.139128852058894e-06, + "loss": 0.6837, + "step": 5919 + }, + { + "epoch": 0.6046986721144024, + "grad_norm": 1.5345839514618114, + "learning_rate": 7.135958952413815e-06, + "loss": 0.7187, + "step": 5920 + }, + { + "epoch": 0.6048008171603677, + "grad_norm": 1.47841244282769, + "learning_rate": 7.132789366239876e-06, + "loss": 0.6616, + "step": 5921 + }, + { + "epoch": 0.604902962206333, + "grad_norm": 1.4603720587270017, + "learning_rate": 7.129620093883998e-06, + "loss": 0.7182, + "step": 5922 + }, + { + "epoch": 0.6050051072522983, + "grad_norm": 1.4935846022160644, + "learning_rate": 7.126451135693052e-06, + "loss": 0.7415, + "step": 5923 + }, + { + "epoch": 0.6051072522982636, + "grad_norm": 1.5549311554787175, + "learning_rate": 7.1232824920138884e-06, + "loss": 0.7075, + "step": 5924 + }, + { + "epoch": 0.6052093973442288, + "grad_norm": 1.2929420000903817, + "learning_rate": 7.120114163193312e-06, + "loss": 0.6061, + "step": 5925 + }, + { + "epoch": 0.605311542390194, + "grad_norm": 1.242069455290526, + "learning_rate": 7.116946149578106e-06, + "loss": 0.6061, + "step": 5926 + }, + { + "epoch": 0.6054136874361593, + "grad_norm": 1.4296596349888733, + "learning_rate": 7.113778451515003e-06, + "loss": 0.6942, + "step": 5927 + }, + { + "epoch": 0.6055158324821246, + "grad_norm": 1.4499425587375494, + "learning_rate": 7.110611069350713e-06, + "loss": 0.7488, + "step": 5928 + }, + { + "epoch": 0.6056179775280899, + "grad_norm": 1.480204362686207, + "learning_rate": 7.107444003431906e-06, + "loss": 0.665, + "step": 5929 + }, + { + "epoch": 0.6057201225740552, + "grad_norm": 1.6288706572593974, + "learning_rate": 7.104277254105225e-06, + "loss": 0.7617, + "step": 5930 + }, + { + "epoch": 0.6058222676200204, + "grad_norm": 1.4117823480389378, + "learning_rate": 7.101110821717267e-06, + "loss": 0.7345, + "step": 5931 + }, + { + "epoch": 0.6059244126659857, + "grad_norm": 1.374198163451614, + "learning_rate": 7.097944706614607e-06, + "loss": 0.6384, + "step": 5932 + }, + { + "epoch": 0.606026557711951, + "grad_norm": 2.150011834438242, + "learning_rate": 7.09477890914377e-06, + "loss": 0.6861, + "step": 5933 + }, + { + "epoch": 0.6061287027579162, + "grad_norm": 1.365551514911846, + "learning_rate": 7.09161342965126e-06, + "loss": 0.5959, + "step": 5934 + }, + { + "epoch": 0.6062308478038815, + "grad_norm": 1.5081651464651011, + "learning_rate": 7.088448268483539e-06, + "loss": 0.6964, + "step": 5935 + }, + { + "epoch": 0.6063329928498468, + "grad_norm": 1.4641862085607433, + "learning_rate": 7.085283425987037e-06, + "loss": 0.7471, + "step": 5936 + }, + { + "epoch": 0.606435137895812, + "grad_norm": 1.423325550546707, + "learning_rate": 7.082118902508142e-06, + "loss": 0.7259, + "step": 5937 + }, + { + "epoch": 0.6065372829417773, + "grad_norm": 1.4216140331174967, + "learning_rate": 7.0789546983932224e-06, + "loss": 0.6027, + "step": 5938 + }, + { + "epoch": 0.6066394279877426, + "grad_norm": 1.5072423510122697, + "learning_rate": 7.075790813988599e-06, + "loss": 0.7399, + "step": 5939 + }, + { + "epoch": 0.6067415730337079, + "grad_norm": 1.482619911549146, + "learning_rate": 7.072627249640559e-06, + "loss": 0.7052, + "step": 5940 + }, + { + "epoch": 0.6068437180796732, + "grad_norm": 1.3255449778024178, + "learning_rate": 7.069464005695359e-06, + "loss": 0.6583, + "step": 5941 + }, + { + "epoch": 0.6069458631256384, + "grad_norm": 1.4585769395036152, + "learning_rate": 7.0663010824992166e-06, + "loss": 0.7185, + "step": 5942 + }, + { + "epoch": 0.6070480081716036, + "grad_norm": 1.4400790221792514, + "learning_rate": 7.063138480398316e-06, + "loss": 0.6996, + "step": 5943 + }, + { + "epoch": 0.6071501532175689, + "grad_norm": 1.6113066803988352, + "learning_rate": 7.059976199738805e-06, + "loss": 0.6919, + "step": 5944 + }, + { + "epoch": 0.6072522982635342, + "grad_norm": 1.4169487563860494, + "learning_rate": 7.056814240866796e-06, + "loss": 0.7229, + "step": 5945 + }, + { + "epoch": 0.6073544433094995, + "grad_norm": 1.3642339601865991, + "learning_rate": 7.053652604128376e-06, + "loss": 0.7745, + "step": 5946 + }, + { + "epoch": 0.6074565883554648, + "grad_norm": 1.5863125276646959, + "learning_rate": 7.05049128986958e-06, + "loss": 0.7111, + "step": 5947 + }, + { + "epoch": 0.6075587334014301, + "grad_norm": 1.4413117639999633, + "learning_rate": 7.047330298436417e-06, + "loss": 0.7088, + "step": 5948 + }, + { + "epoch": 0.6076608784473952, + "grad_norm": 1.4887816423957854, + "learning_rate": 7.044169630174862e-06, + "loss": 0.7436, + "step": 5949 + }, + { + "epoch": 0.6077630234933605, + "grad_norm": 1.4203235773642278, + "learning_rate": 7.041009285430849e-06, + "loss": 0.653, + "step": 5950 + }, + { + "epoch": 0.6078651685393258, + "grad_norm": 1.4360534734724038, + "learning_rate": 7.037849264550282e-06, + "loss": 0.7616, + "step": 5951 + }, + { + "epoch": 0.6079673135852911, + "grad_norm": 1.4449357041843944, + "learning_rate": 7.034689567879026e-06, + "loss": 0.8129, + "step": 5952 + }, + { + "epoch": 0.6080694586312564, + "grad_norm": 1.8014754582260786, + "learning_rate": 7.03153019576291e-06, + "loss": 0.7532, + "step": 5953 + }, + { + "epoch": 0.6081716036772217, + "grad_norm": 1.537709437039263, + "learning_rate": 7.028371148547737e-06, + "loss": 0.6204, + "step": 5954 + }, + { + "epoch": 0.608273748723187, + "grad_norm": 1.4367170795663489, + "learning_rate": 7.02521242657926e-06, + "loss": 0.6748, + "step": 5955 + }, + { + "epoch": 0.6083758937691522, + "grad_norm": 1.3520404608469325, + "learning_rate": 7.022054030203206e-06, + "loss": 0.6715, + "step": 5956 + }, + { + "epoch": 0.6084780388151174, + "grad_norm": 1.5990293519928394, + "learning_rate": 7.01889595976526e-06, + "loss": 0.7127, + "step": 5957 + }, + { + "epoch": 0.6085801838610827, + "grad_norm": 1.4299562860740695, + "learning_rate": 7.01573821561108e-06, + "loss": 0.6813, + "step": 5958 + }, + { + "epoch": 0.608682328907048, + "grad_norm": 1.5606205131949153, + "learning_rate": 7.012580798086278e-06, + "loss": 0.6967, + "step": 5959 + }, + { + "epoch": 0.6087844739530133, + "grad_norm": 1.6765703956489642, + "learning_rate": 7.00942370753644e-06, + "loss": 0.7797, + "step": 5960 + }, + { + "epoch": 0.6088866189989786, + "grad_norm": 1.519227804228802, + "learning_rate": 7.006266944307105e-06, + "loss": 0.7846, + "step": 5961 + }, + { + "epoch": 0.6089887640449438, + "grad_norm": 1.5044952396912175, + "learning_rate": 7.003110508743789e-06, + "loss": 0.7219, + "step": 5962 + }, + { + "epoch": 0.6090909090909091, + "grad_norm": 1.3218294881781498, + "learning_rate": 6.9999544011919664e-06, + "loss": 0.6193, + "step": 5963 + }, + { + "epoch": 0.6091930541368744, + "grad_norm": 1.5291194366283811, + "learning_rate": 6.996798621997074e-06, + "loss": 0.7157, + "step": 5964 + }, + { + "epoch": 0.6092951991828396, + "grad_norm": 1.4854553444988114, + "learning_rate": 6.993643171504509e-06, + "loss": 0.7228, + "step": 5965 + }, + { + "epoch": 0.6093973442288049, + "grad_norm": 1.5204966632121497, + "learning_rate": 6.990488050059644e-06, + "loss": 0.7149, + "step": 5966 + }, + { + "epoch": 0.6094994892747702, + "grad_norm": 1.6388165026381813, + "learning_rate": 6.987333258007805e-06, + "loss": 0.8201, + "step": 5967 + }, + { + "epoch": 0.6096016343207354, + "grad_norm": 1.383754035183041, + "learning_rate": 6.984178795694288e-06, + "loss": 0.7046, + "step": 5968 + }, + { + "epoch": 0.6097037793667007, + "grad_norm": 1.500056803110432, + "learning_rate": 6.981024663464347e-06, + "loss": 0.7486, + "step": 5969 + }, + { + "epoch": 0.609805924412666, + "grad_norm": 1.378524200367225, + "learning_rate": 6.977870861663209e-06, + "loss": 0.649, + "step": 5970 + }, + { + "epoch": 0.6099080694586313, + "grad_norm": 1.5246033586121945, + "learning_rate": 6.974717390636059e-06, + "loss": 0.7265, + "step": 5971 + }, + { + "epoch": 0.6100102145045966, + "grad_norm": 1.50034403507857, + "learning_rate": 6.971564250728044e-06, + "loss": 0.6507, + "step": 5972 + }, + { + "epoch": 0.6101123595505618, + "grad_norm": 1.3708884567524493, + "learning_rate": 6.968411442284279e-06, + "loss": 0.7255, + "step": 5973 + }, + { + "epoch": 0.610214504596527, + "grad_norm": 1.5479899644877473, + "learning_rate": 6.965258965649841e-06, + "loss": 0.7221, + "step": 5974 + }, + { + "epoch": 0.6103166496424923, + "grad_norm": 1.412269002629425, + "learning_rate": 6.96210682116977e-06, + "loss": 0.6496, + "step": 5975 + }, + { + "epoch": 0.6104187946884576, + "grad_norm": 1.462584429374096, + "learning_rate": 6.958955009189069e-06, + "loss": 0.7176, + "step": 5976 + }, + { + "epoch": 0.6105209397344229, + "grad_norm": 1.4649820438546874, + "learning_rate": 6.955803530052705e-06, + "loss": 0.8149, + "step": 5977 + }, + { + "epoch": 0.6106230847803882, + "grad_norm": 1.6030227392476382, + "learning_rate": 6.952652384105614e-06, + "loss": 0.7472, + "step": 5978 + }, + { + "epoch": 0.6107252298263535, + "grad_norm": 1.5979450842648746, + "learning_rate": 6.9495015716926875e-06, + "loss": 0.7451, + "step": 5979 + }, + { + "epoch": 0.6108273748723186, + "grad_norm": 1.5256350986575982, + "learning_rate": 6.946351093158788e-06, + "loss": 0.6909, + "step": 5980 + }, + { + "epoch": 0.6109295199182839, + "grad_norm": 1.5821816437332996, + "learning_rate": 6.943200948848732e-06, + "loss": 0.7128, + "step": 5981 + }, + { + "epoch": 0.6110316649642492, + "grad_norm": 1.4049180638821632, + "learning_rate": 6.940051139107307e-06, + "loss": 0.7159, + "step": 5982 + }, + { + "epoch": 0.6111338100102145, + "grad_norm": 1.489407985814746, + "learning_rate": 6.936901664279263e-06, + "loss": 0.8012, + "step": 5983 + }, + { + "epoch": 0.6112359550561798, + "grad_norm": 1.479161321048802, + "learning_rate": 6.933752524709308e-06, + "loss": 0.7092, + "step": 5984 + }, + { + "epoch": 0.6113381001021451, + "grad_norm": 1.5259428801694348, + "learning_rate": 6.9306037207421196e-06, + "loss": 0.6165, + "step": 5985 + }, + { + "epoch": 0.6114402451481104, + "grad_norm": 1.451341130766712, + "learning_rate": 6.92745525272234e-06, + "loss": 0.6468, + "step": 5986 + }, + { + "epoch": 0.6115423901940756, + "grad_norm": 1.4186920855738687, + "learning_rate": 6.924307120994567e-06, + "loss": 0.6673, + "step": 5987 + }, + { + "epoch": 0.6116445352400408, + "grad_norm": 1.5723166087547982, + "learning_rate": 6.921159325903368e-06, + "loss": 0.6815, + "step": 5988 + }, + { + "epoch": 0.6117466802860061, + "grad_norm": 1.5937039163973152, + "learning_rate": 6.918011867793268e-06, + "loss": 0.7112, + "step": 5989 + }, + { + "epoch": 0.6118488253319714, + "grad_norm": 1.753364603103956, + "learning_rate": 6.914864747008762e-06, + "loss": 0.579, + "step": 5990 + }, + { + "epoch": 0.6119509703779367, + "grad_norm": 1.4531015053929697, + "learning_rate": 6.911717963894302e-06, + "loss": 0.7525, + "step": 5991 + }, + { + "epoch": 0.612053115423902, + "grad_norm": 1.4787407748575119, + "learning_rate": 6.908571518794307e-06, + "loss": 0.6808, + "step": 5992 + }, + { + "epoch": 0.6121552604698672, + "grad_norm": 1.5117284380836764, + "learning_rate": 6.9054254120531524e-06, + "loss": 0.6392, + "step": 5993 + }, + { + "epoch": 0.6122574055158325, + "grad_norm": 1.4821468687066737, + "learning_rate": 6.902279644015188e-06, + "loss": 0.7268, + "step": 5994 + }, + { + "epoch": 0.6123595505617978, + "grad_norm": 1.6372951399038573, + "learning_rate": 6.8991342150247205e-06, + "loss": 0.6142, + "step": 5995 + }, + { + "epoch": 0.612461695607763, + "grad_norm": 1.4764270552280503, + "learning_rate": 6.895989125426014e-06, + "loss": 0.7137, + "step": 5996 + }, + { + "epoch": 0.6125638406537283, + "grad_norm": 1.347014414032496, + "learning_rate": 6.892844375563308e-06, + "loss": 0.5718, + "step": 5997 + }, + { + "epoch": 0.6126659856996935, + "grad_norm": 1.397411636577818, + "learning_rate": 6.889699965780787e-06, + "loss": 0.6611, + "step": 5998 + }, + { + "epoch": 0.6127681307456588, + "grad_norm": 1.450436577550693, + "learning_rate": 6.886555896422617e-06, + "loss": 0.6935, + "step": 5999 + }, + { + "epoch": 0.6128702757916241, + "grad_norm": 1.480619808957545, + "learning_rate": 6.883412167832917e-06, + "loss": 0.6834, + "step": 6000 + }, + { + "epoch": 0.6129724208375894, + "grad_norm": 1.3228613907663964, + "learning_rate": 6.880268780355763e-06, + "loss": 0.6882, + "step": 6001 + }, + { + "epoch": 0.6130745658835547, + "grad_norm": 1.466578899662363, + "learning_rate": 6.87712573433521e-06, + "loss": 0.6492, + "step": 6002 + }, + { + "epoch": 0.6131767109295199, + "grad_norm": 1.5822021029167004, + "learning_rate": 6.873983030115265e-06, + "loss": 0.8076, + "step": 6003 + }, + { + "epoch": 0.6132788559754851, + "grad_norm": 1.6039432440574921, + "learning_rate": 6.8708406680398945e-06, + "loss": 0.7018, + "step": 6004 + }, + { + "epoch": 0.6133810010214504, + "grad_norm": 1.564217085038011, + "learning_rate": 6.867698648453036e-06, + "loss": 0.7354, + "step": 6005 + }, + { + "epoch": 0.6134831460674157, + "grad_norm": 1.580602857393017, + "learning_rate": 6.864556971698584e-06, + "loss": 0.7381, + "step": 6006 + }, + { + "epoch": 0.613585291113381, + "grad_norm": 1.7003034464864017, + "learning_rate": 6.861415638120397e-06, + "loss": 0.7432, + "step": 6007 + }, + { + "epoch": 0.6136874361593463, + "grad_norm": 1.5240361507996587, + "learning_rate": 6.8582746480622954e-06, + "loss": 0.7169, + "step": 6008 + }, + { + "epoch": 0.6137895812053116, + "grad_norm": 1.4840637097960456, + "learning_rate": 6.855134001868061e-06, + "loss": 0.6734, + "step": 6009 + }, + { + "epoch": 0.6138917262512769, + "grad_norm": 1.4413475462488778, + "learning_rate": 6.851993699881446e-06, + "loss": 0.7434, + "step": 6010 + }, + { + "epoch": 0.613993871297242, + "grad_norm": 1.3617512119022661, + "learning_rate": 6.848853742446153e-06, + "loss": 0.7777, + "step": 6011 + }, + { + "epoch": 0.6140960163432073, + "grad_norm": 1.5202258508681286, + "learning_rate": 6.845714129905855e-06, + "loss": 0.6412, + "step": 6012 + }, + { + "epoch": 0.6141981613891726, + "grad_norm": 1.4433463303460763, + "learning_rate": 6.842574862604181e-06, + "loss": 0.715, + "step": 6013 + }, + { + "epoch": 0.6143003064351379, + "grad_norm": 1.5720118182756222, + "learning_rate": 6.8394359408847315e-06, + "loss": 0.7436, + "step": 6014 + }, + { + "epoch": 0.6144024514811032, + "grad_norm": 1.4734469310118399, + "learning_rate": 6.836297365091058e-06, + "loss": 0.7534, + "step": 6015 + }, + { + "epoch": 0.6145045965270685, + "grad_norm": 1.6581505768586497, + "learning_rate": 6.833159135566683e-06, + "loss": 0.6505, + "step": 6016 + }, + { + "epoch": 0.6146067415730337, + "grad_norm": 1.5623848571026056, + "learning_rate": 6.830021252655084e-06, + "loss": 0.8233, + "step": 6017 + }, + { + "epoch": 0.614708886618999, + "grad_norm": 1.5556822704679165, + "learning_rate": 6.826883716699711e-06, + "loss": 0.7013, + "step": 6018 + }, + { + "epoch": 0.6148110316649642, + "grad_norm": 1.4043130206828793, + "learning_rate": 6.823746528043962e-06, + "loss": 0.6657, + "step": 6019 + }, + { + "epoch": 0.6149131767109295, + "grad_norm": 1.535910934451778, + "learning_rate": 6.82060968703121e-06, + "loss": 0.7036, + "step": 6020 + }, + { + "epoch": 0.6150153217568948, + "grad_norm": 1.5058246996551714, + "learning_rate": 6.81747319400478e-06, + "loss": 0.763, + "step": 6021 + }, + { + "epoch": 0.6151174668028601, + "grad_norm": 1.515266693349947, + "learning_rate": 6.814337049307966e-06, + "loss": 0.7719, + "step": 6022 + }, + { + "epoch": 0.6152196118488253, + "grad_norm": 1.6934495835496437, + "learning_rate": 6.81120125328402e-06, + "loss": 0.723, + "step": 6023 + }, + { + "epoch": 0.6153217568947906, + "grad_norm": 1.4364644137202367, + "learning_rate": 6.808065806276158e-06, + "loss": 0.6603, + "step": 6024 + }, + { + "epoch": 0.6154239019407559, + "grad_norm": 1.4582650876582257, + "learning_rate": 6.804930708627553e-06, + "loss": 0.6027, + "step": 6025 + }, + { + "epoch": 0.6155260469867212, + "grad_norm": 1.41261136126919, + "learning_rate": 6.801795960681348e-06, + "loss": 0.6802, + "step": 6026 + }, + { + "epoch": 0.6156281920326864, + "grad_norm": 1.4778627832345463, + "learning_rate": 6.798661562780642e-06, + "loss": 0.7586, + "step": 6027 + }, + { + "epoch": 0.6157303370786517, + "grad_norm": 1.4136594091595078, + "learning_rate": 6.795527515268494e-06, + "loss": 0.656, + "step": 6028 + }, + { + "epoch": 0.6158324821246169, + "grad_norm": 1.5071295563302154, + "learning_rate": 6.792393818487933e-06, + "loss": 0.68, + "step": 6029 + }, + { + "epoch": 0.6159346271705822, + "grad_norm": 1.4904188938039284, + "learning_rate": 6.78926047278194e-06, + "loss": 0.6769, + "step": 6030 + }, + { + "epoch": 0.6160367722165475, + "grad_norm": 1.5816343283680228, + "learning_rate": 6.786127478493463e-06, + "loss": 0.7574, + "step": 6031 + }, + { + "epoch": 0.6161389172625128, + "grad_norm": 1.4066042305488502, + "learning_rate": 6.7829948359654085e-06, + "loss": 0.7378, + "step": 6032 + }, + { + "epoch": 0.6162410623084781, + "grad_norm": 1.3822098245855512, + "learning_rate": 6.779862545540645e-06, + "loss": 0.6457, + "step": 6033 + }, + { + "epoch": 0.6163432073544433, + "grad_norm": 1.4866280478571343, + "learning_rate": 6.776730607562011e-06, + "loss": 0.7453, + "step": 6034 + }, + { + "epoch": 0.6164453524004085, + "grad_norm": 1.4304136550420243, + "learning_rate": 6.7735990223722946e-06, + "loss": 0.6577, + "step": 6035 + }, + { + "epoch": 0.6165474974463738, + "grad_norm": 1.3540797154977005, + "learning_rate": 6.770467790314248e-06, + "loss": 0.6715, + "step": 6036 + }, + { + "epoch": 0.6166496424923391, + "grad_norm": 1.3863200111804435, + "learning_rate": 6.767336911730589e-06, + "loss": 0.6749, + "step": 6037 + }, + { + "epoch": 0.6167517875383044, + "grad_norm": 1.5307327834890467, + "learning_rate": 6.764206386963992e-06, + "loss": 0.7428, + "step": 6038 + }, + { + "epoch": 0.6168539325842697, + "grad_norm": 1.492832825968607, + "learning_rate": 6.761076216357099e-06, + "loss": 0.6406, + "step": 6039 + }, + { + "epoch": 0.616956077630235, + "grad_norm": 1.95631672833207, + "learning_rate": 6.7579464002525055e-06, + "loss": 0.7984, + "step": 6040 + }, + { + "epoch": 0.6170582226762003, + "grad_norm": 1.4105278933788215, + "learning_rate": 6.7548169389927696e-06, + "loss": 0.688, + "step": 6041 + }, + { + "epoch": 0.6171603677221654, + "grad_norm": 1.4704117349641852, + "learning_rate": 6.7516878329204216e-06, + "loss": 0.7668, + "step": 6042 + }, + { + "epoch": 0.6172625127681307, + "grad_norm": 1.4288420107464077, + "learning_rate": 6.7485590823779364e-06, + "loss": 0.757, + "step": 6043 + }, + { + "epoch": 0.617364657814096, + "grad_norm": 1.4247579470447873, + "learning_rate": 6.745430687707761e-06, + "loss": 0.6385, + "step": 6044 + }, + { + "epoch": 0.6174668028600613, + "grad_norm": 1.5092677217279975, + "learning_rate": 6.742302649252299e-06, + "loss": 0.6657, + "step": 6045 + }, + { + "epoch": 0.6175689479060266, + "grad_norm": 1.41659047436331, + "learning_rate": 6.73917496735392e-06, + "loss": 0.6909, + "step": 6046 + }, + { + "epoch": 0.6176710929519919, + "grad_norm": 1.5660725850153576, + "learning_rate": 6.7360476423549434e-06, + "loss": 0.7386, + "step": 6047 + }, + { + "epoch": 0.6177732379979571, + "grad_norm": 1.4524900168355066, + "learning_rate": 6.732920674597665e-06, + "loss": 0.6381, + "step": 6048 + }, + { + "epoch": 0.6178753830439224, + "grad_norm": 1.4670458519782945, + "learning_rate": 6.7297940644243245e-06, + "loss": 0.6669, + "step": 6049 + }, + { + "epoch": 0.6179775280898876, + "grad_norm": 1.376312966653919, + "learning_rate": 6.7266678121771375e-06, + "loss": 0.6741, + "step": 6050 + }, + { + "epoch": 0.6180796731358529, + "grad_norm": 1.45597254893423, + "learning_rate": 6.723541918198278e-06, + "loss": 0.6212, + "step": 6051 + }, + { + "epoch": 0.6181818181818182, + "grad_norm": 1.4291707979706016, + "learning_rate": 6.720416382829867e-06, + "loss": 0.6923, + "step": 6052 + }, + { + "epoch": 0.6182839632277835, + "grad_norm": 1.4371727848437101, + "learning_rate": 6.7172912064140065e-06, + "loss": 0.7665, + "step": 6053 + }, + { + "epoch": 0.6183861082737487, + "grad_norm": 1.6518260307164343, + "learning_rate": 6.714166389292743e-06, + "loss": 0.7503, + "step": 6054 + }, + { + "epoch": 0.618488253319714, + "grad_norm": 1.499842965752922, + "learning_rate": 6.7110419318080884e-06, + "loss": 0.8056, + "step": 6055 + }, + { + "epoch": 0.6185903983656793, + "grad_norm": 1.4816027524854394, + "learning_rate": 6.707917834302024e-06, + "loss": 0.8149, + "step": 6056 + }, + { + "epoch": 0.6186925434116445, + "grad_norm": 1.5426118667475097, + "learning_rate": 6.704794097116474e-06, + "loss": 0.6889, + "step": 6057 + }, + { + "epoch": 0.6187946884576098, + "grad_norm": 1.4294025819458678, + "learning_rate": 6.701670720593341e-06, + "loss": 0.6921, + "step": 6058 + }, + { + "epoch": 0.618896833503575, + "grad_norm": 1.5660300238482336, + "learning_rate": 6.6985477050744805e-06, + "loss": 0.7242, + "step": 6059 + }, + { + "epoch": 0.6189989785495403, + "grad_norm": 1.3925819059229059, + "learning_rate": 6.695425050901705e-06, + "loss": 0.5613, + "step": 6060 + }, + { + "epoch": 0.6191011235955056, + "grad_norm": 1.4295065181841535, + "learning_rate": 6.692302758416795e-06, + "loss": 0.6763, + "step": 6061 + }, + { + "epoch": 0.6192032686414709, + "grad_norm": 1.5153497676290344, + "learning_rate": 6.689180827961481e-06, + "loss": 0.663, + "step": 6062 + }, + { + "epoch": 0.6193054136874362, + "grad_norm": 1.500692153504031, + "learning_rate": 6.6860592598774685e-06, + "loss": 0.5768, + "step": 6063 + }, + { + "epoch": 0.6194075587334015, + "grad_norm": 1.4361191982467998, + "learning_rate": 6.682938054506409e-06, + "loss": 0.7274, + "step": 6064 + }, + { + "epoch": 0.6195097037793666, + "grad_norm": 1.4775141073975566, + "learning_rate": 6.679817212189921e-06, + "loss": 0.7486, + "step": 6065 + }, + { + "epoch": 0.6196118488253319, + "grad_norm": 1.506458475723345, + "learning_rate": 6.676696733269588e-06, + "loss": 0.7372, + "step": 6066 + }, + { + "epoch": 0.6197139938712972, + "grad_norm": 1.460369331967241, + "learning_rate": 6.673576618086941e-06, + "loss": 0.5962, + "step": 6067 + }, + { + "epoch": 0.6198161389172625, + "grad_norm": 1.2949803696878728, + "learning_rate": 6.670456866983486e-06, + "loss": 0.564, + "step": 6068 + }, + { + "epoch": 0.6199182839632278, + "grad_norm": 1.758863773646398, + "learning_rate": 6.667337480300675e-06, + "loss": 0.738, + "step": 6069 + }, + { + "epoch": 0.6200204290091931, + "grad_norm": 1.5500809275919858, + "learning_rate": 6.664218458379933e-06, + "loss": 0.6816, + "step": 6070 + }, + { + "epoch": 0.6201225740551584, + "grad_norm": 1.4066602194317719, + "learning_rate": 6.661099801562636e-06, + "loss": 0.6525, + "step": 6071 + }, + { + "epoch": 0.6202247191011236, + "grad_norm": 1.3009317908415803, + "learning_rate": 6.657981510190119e-06, + "loss": 0.7116, + "step": 6072 + }, + { + "epoch": 0.6203268641470888, + "grad_norm": 1.575424484722834, + "learning_rate": 6.654863584603684e-06, + "loss": 0.7053, + "step": 6073 + }, + { + "epoch": 0.6204290091930541, + "grad_norm": 1.3368640242122787, + "learning_rate": 6.651746025144597e-06, + "loss": 0.6669, + "step": 6074 + }, + { + "epoch": 0.6205311542390194, + "grad_norm": 1.4088063879810369, + "learning_rate": 6.648628832154066e-06, + "loss": 0.7192, + "step": 6075 + }, + { + "epoch": 0.6206332992849847, + "grad_norm": 1.435331346167685, + "learning_rate": 6.645512005973278e-06, + "loss": 0.6027, + "step": 6076 + }, + { + "epoch": 0.62073544433095, + "grad_norm": 1.4768219350941822, + "learning_rate": 6.642395546943365e-06, + "loss": 0.7019, + "step": 6077 + }, + { + "epoch": 0.6208375893769152, + "grad_norm": 1.543670492926742, + "learning_rate": 6.639279455405432e-06, + "loss": 0.6407, + "step": 6078 + }, + { + "epoch": 0.6209397344228805, + "grad_norm": 1.688668873676125, + "learning_rate": 6.636163731700531e-06, + "loss": 0.6674, + "step": 6079 + }, + { + "epoch": 0.6210418794688458, + "grad_norm": 1.3895951000534057, + "learning_rate": 6.633048376169685e-06, + "loss": 0.5355, + "step": 6080 + }, + { + "epoch": 0.621144024514811, + "grad_norm": 1.5113002930997002, + "learning_rate": 6.629933389153867e-06, + "loss": 0.7938, + "step": 6081 + }, + { + "epoch": 0.6212461695607763, + "grad_norm": 1.537745442511612, + "learning_rate": 6.626818770994017e-06, + "loss": 0.682, + "step": 6082 + }, + { + "epoch": 0.6213483146067416, + "grad_norm": 1.7132838972110638, + "learning_rate": 6.6237045220310335e-06, + "loss": 0.7049, + "step": 6083 + }, + { + "epoch": 0.6214504596527068, + "grad_norm": 1.4599340642780694, + "learning_rate": 6.620590642605771e-06, + "loss": 0.6635, + "step": 6084 + }, + { + "epoch": 0.6215526046986721, + "grad_norm": 1.5891035170209347, + "learning_rate": 6.617477133059048e-06, + "loss": 0.7097, + "step": 6085 + }, + { + "epoch": 0.6216547497446374, + "grad_norm": 1.6559760622480633, + "learning_rate": 6.6143639937316364e-06, + "loss": 0.742, + "step": 6086 + }, + { + "epoch": 0.6217568947906027, + "grad_norm": 1.5919220585917173, + "learning_rate": 6.611251224964273e-06, + "loss": 0.6516, + "step": 6087 + }, + { + "epoch": 0.6218590398365679, + "grad_norm": 1.4540915195089683, + "learning_rate": 6.608138827097654e-06, + "loss": 0.7436, + "step": 6088 + }, + { + "epoch": 0.6219611848825332, + "grad_norm": 1.4878140228498602, + "learning_rate": 6.6050268004724285e-06, + "loss": 0.6584, + "step": 6089 + }, + { + "epoch": 0.6220633299284984, + "grad_norm": 1.5403716067395332, + "learning_rate": 6.601915145429214e-06, + "loss": 0.7143, + "step": 6090 + }, + { + "epoch": 0.6221654749744637, + "grad_norm": 1.4553378596096787, + "learning_rate": 6.598803862308585e-06, + "loss": 0.7485, + "step": 6091 + }, + { + "epoch": 0.622267620020429, + "grad_norm": 1.4094091074894952, + "learning_rate": 6.595692951451067e-06, + "loss": 0.6686, + "step": 6092 + }, + { + "epoch": 0.6223697650663943, + "grad_norm": 1.6500652180930637, + "learning_rate": 6.5925824131971595e-06, + "loss": 0.676, + "step": 6093 + }, + { + "epoch": 0.6224719101123596, + "grad_norm": 1.3390705563168666, + "learning_rate": 6.589472247887305e-06, + "loss": 0.5841, + "step": 6094 + }, + { + "epoch": 0.6225740551583249, + "grad_norm": 1.49096187117337, + "learning_rate": 6.586362455861918e-06, + "loss": 0.7096, + "step": 6095 + }, + { + "epoch": 0.62267620020429, + "grad_norm": 1.4917426621288992, + "learning_rate": 6.5832530374613635e-06, + "loss": 0.6601, + "step": 6096 + }, + { + "epoch": 0.6227783452502553, + "grad_norm": 1.6903309468479464, + "learning_rate": 6.5801439930259714e-06, + "loss": 0.7086, + "step": 6097 + }, + { + "epoch": 0.6228804902962206, + "grad_norm": 1.5570198304384162, + "learning_rate": 6.57703532289603e-06, + "loss": 0.7381, + "step": 6098 + }, + { + "epoch": 0.6229826353421859, + "grad_norm": 1.54621313987679, + "learning_rate": 6.573927027411784e-06, + "loss": 0.6884, + "step": 6099 + }, + { + "epoch": 0.6230847803881512, + "grad_norm": 1.3510007195205616, + "learning_rate": 6.570819106913441e-06, + "loss": 0.6559, + "step": 6100 + }, + { + "epoch": 0.6231869254341165, + "grad_norm": 1.4204563608987426, + "learning_rate": 6.5677115617411595e-06, + "loss": 0.6985, + "step": 6101 + }, + { + "epoch": 0.6232890704800818, + "grad_norm": 1.496205072591101, + "learning_rate": 6.5646043922350665e-06, + "loss": 0.7232, + "step": 6102 + }, + { + "epoch": 0.623391215526047, + "grad_norm": 1.3192881547599211, + "learning_rate": 6.5614975987352404e-06, + "loss": 0.7113, + "step": 6103 + }, + { + "epoch": 0.6234933605720122, + "grad_norm": 1.4570154971656228, + "learning_rate": 6.558391181581727e-06, + "loss": 0.6575, + "step": 6104 + }, + { + "epoch": 0.6235955056179775, + "grad_norm": 1.3962310531998048, + "learning_rate": 6.555285141114515e-06, + "loss": 0.7013, + "step": 6105 + }, + { + "epoch": 0.6236976506639428, + "grad_norm": 1.522001929479248, + "learning_rate": 6.552179477673577e-06, + "loss": 0.6985, + "step": 6106 + }, + { + "epoch": 0.6237997957099081, + "grad_norm": 1.4548704517087414, + "learning_rate": 6.549074191598821e-06, + "loss": 0.728, + "step": 6107 + }, + { + "epoch": 0.6239019407558734, + "grad_norm": 1.388947331995313, + "learning_rate": 6.545969283230125e-06, + "loss": 0.6588, + "step": 6108 + }, + { + "epoch": 0.6240040858018386, + "grad_norm": 1.5921301098645846, + "learning_rate": 6.542864752907321e-06, + "loss": 0.7473, + "step": 6109 + }, + { + "epoch": 0.6241062308478039, + "grad_norm": 1.5357758512747757, + "learning_rate": 6.539760600970205e-06, + "loss": 0.7124, + "step": 6110 + }, + { + "epoch": 0.6242083758937692, + "grad_norm": 1.5694724590235667, + "learning_rate": 6.5366568277585254e-06, + "loss": 0.6955, + "step": 6111 + }, + { + "epoch": 0.6243105209397344, + "grad_norm": 1.556913179252182, + "learning_rate": 6.533553433611994e-06, + "loss": 0.7037, + "step": 6112 + }, + { + "epoch": 0.6244126659856997, + "grad_norm": 1.5414589304335253, + "learning_rate": 6.530450418870278e-06, + "loss": 0.6921, + "step": 6113 + }, + { + "epoch": 0.624514811031665, + "grad_norm": 1.4780229950873949, + "learning_rate": 6.527347783873003e-06, + "loss": 0.6105, + "step": 6114 + }, + { + "epoch": 0.6246169560776302, + "grad_norm": 1.5167098436271296, + "learning_rate": 6.524245528959763e-06, + "loss": 0.702, + "step": 6115 + }, + { + "epoch": 0.6247191011235955, + "grad_norm": 1.412337467423917, + "learning_rate": 6.521143654470091e-06, + "loss": 0.731, + "step": 6116 + }, + { + "epoch": 0.6248212461695608, + "grad_norm": 1.4414654883539222, + "learning_rate": 6.5180421607434964e-06, + "loss": 0.7116, + "step": 6117 + }, + { + "epoch": 0.6249233912155261, + "grad_norm": 1.5384967019497453, + "learning_rate": 6.514941048119434e-06, + "loss": 0.7711, + "step": 6118 + }, + { + "epoch": 0.6250255362614913, + "grad_norm": 1.6264825953342996, + "learning_rate": 6.511840316937329e-06, + "loss": 0.7375, + "step": 6119 + }, + { + "epoch": 0.6251276813074566, + "grad_norm": 1.544268452745341, + "learning_rate": 6.508739967536553e-06, + "loss": 0.7036, + "step": 6120 + }, + { + "epoch": 0.6252298263534218, + "grad_norm": 1.488165588337334, + "learning_rate": 6.505640000256439e-06, + "loss": 0.7173, + "step": 6121 + }, + { + "epoch": 0.6253319713993871, + "grad_norm": 1.390294984734998, + "learning_rate": 6.50254041543629e-06, + "loss": 0.7382, + "step": 6122 + }, + { + "epoch": 0.6254341164453524, + "grad_norm": 1.4789159006002561, + "learning_rate": 6.4994412134153505e-06, + "loss": 0.6836, + "step": 6123 + }, + { + "epoch": 0.6255362614913177, + "grad_norm": 1.4985027068779089, + "learning_rate": 6.496342394532834e-06, + "loss": 0.652, + "step": 6124 + }, + { + "epoch": 0.625638406537283, + "grad_norm": 1.5859858167308005, + "learning_rate": 6.493243959127906e-06, + "loss": 0.7614, + "step": 6125 + }, + { + "epoch": 0.6257405515832483, + "grad_norm": 1.7279997497800577, + "learning_rate": 6.490145907539689e-06, + "loss": 0.787, + "step": 6126 + }, + { + "epoch": 0.6258426966292134, + "grad_norm": 1.519391082354374, + "learning_rate": 6.487048240107275e-06, + "loss": 0.6974, + "step": 6127 + }, + { + "epoch": 0.6259448416751787, + "grad_norm": 1.540289547940538, + "learning_rate": 6.483950957169695e-06, + "loss": 0.757, + "step": 6128 + }, + { + "epoch": 0.626046986721144, + "grad_norm": 1.4242659175660761, + "learning_rate": 6.4808540590659555e-06, + "loss": 0.6387, + "step": 6129 + }, + { + "epoch": 0.6261491317671093, + "grad_norm": 1.4959259914799132, + "learning_rate": 6.477757546135015e-06, + "loss": 0.6499, + "step": 6130 + }, + { + "epoch": 0.6262512768130746, + "grad_norm": 1.530240054016163, + "learning_rate": 6.474661418715784e-06, + "loss": 0.6252, + "step": 6131 + }, + { + "epoch": 0.6263534218590399, + "grad_norm": 1.5543998527613867, + "learning_rate": 6.471565677147142e-06, + "loss": 0.6421, + "step": 6132 + }, + { + "epoch": 0.6264555669050051, + "grad_norm": 1.3142449970090542, + "learning_rate": 6.468470321767914e-06, + "loss": 0.7445, + "step": 6133 + }, + { + "epoch": 0.6265577119509704, + "grad_norm": 1.4687246598647847, + "learning_rate": 6.465375352916894e-06, + "loss": 0.6106, + "step": 6134 + }, + { + "epoch": 0.6266598569969356, + "grad_norm": 1.490175162611309, + "learning_rate": 6.4622807709328216e-06, + "loss": 0.791, + "step": 6135 + }, + { + "epoch": 0.6267620020429009, + "grad_norm": 1.3795828066046198, + "learning_rate": 6.4591865761544085e-06, + "loss": 0.6047, + "step": 6136 + }, + { + "epoch": 0.6268641470888662, + "grad_norm": 1.6217155999779487, + "learning_rate": 6.4560927689203055e-06, + "loss": 0.6734, + "step": 6137 + }, + { + "epoch": 0.6269662921348315, + "grad_norm": 1.548370911051616, + "learning_rate": 6.4529993495691426e-06, + "loss": 0.6208, + "step": 6138 + }, + { + "epoch": 0.6270684371807967, + "grad_norm": 1.415130929828379, + "learning_rate": 6.449906318439494e-06, + "loss": 0.5849, + "step": 6139 + }, + { + "epoch": 0.627170582226762, + "grad_norm": 1.415237121981588, + "learning_rate": 6.44681367586989e-06, + "loss": 0.7854, + "step": 6140 + }, + { + "epoch": 0.6272727272727273, + "grad_norm": 1.4886019830601929, + "learning_rate": 6.443721422198827e-06, + "loss": 0.6364, + "step": 6141 + }, + { + "epoch": 0.6273748723186925, + "grad_norm": 1.5229314779038945, + "learning_rate": 6.440629557764752e-06, + "loss": 0.8162, + "step": 6142 + }, + { + "epoch": 0.6274770173646578, + "grad_norm": 1.2937330380822516, + "learning_rate": 6.437538082906071e-06, + "loss": 0.5845, + "step": 6143 + }, + { + "epoch": 0.6275791624106231, + "grad_norm": 1.368139203411507, + "learning_rate": 6.434446997961149e-06, + "loss": 0.686, + "step": 6144 + }, + { + "epoch": 0.6276813074565883, + "grad_norm": 1.5627474790929743, + "learning_rate": 6.431356303268303e-06, + "loss": 0.6666, + "step": 6145 + }, + { + "epoch": 0.6277834525025536, + "grad_norm": 1.39128874288251, + "learning_rate": 6.428265999165817e-06, + "loss": 0.6475, + "step": 6146 + }, + { + "epoch": 0.6278855975485189, + "grad_norm": 1.535923035945346, + "learning_rate": 6.4251760859919275e-06, + "loss": 0.7689, + "step": 6147 + }, + { + "epoch": 0.6279877425944842, + "grad_norm": 1.4480834384293195, + "learning_rate": 6.422086564084823e-06, + "loss": 0.787, + "step": 6148 + }, + { + "epoch": 0.6280898876404495, + "grad_norm": 1.5499550928607904, + "learning_rate": 6.418997433782657e-06, + "loss": 0.6551, + "step": 6149 + }, + { + "epoch": 0.6281920326864147, + "grad_norm": 1.4702956737794057, + "learning_rate": 6.415908695423534e-06, + "loss": 0.7196, + "step": 6150 + }, + { + "epoch": 0.6282941777323799, + "grad_norm": 1.534264929854412, + "learning_rate": 6.412820349345522e-06, + "loss": 0.7261, + "step": 6151 + }, + { + "epoch": 0.6283963227783452, + "grad_norm": 1.5514897825533918, + "learning_rate": 6.409732395886639e-06, + "loss": 0.6872, + "step": 6152 + }, + { + "epoch": 0.6284984678243105, + "grad_norm": 1.5145864060263614, + "learning_rate": 6.406644835384861e-06, + "loss": 0.6604, + "step": 6153 + }, + { + "epoch": 0.6286006128702758, + "grad_norm": 1.405545308550319, + "learning_rate": 6.403557668178133e-06, + "loss": 0.6975, + "step": 6154 + }, + { + "epoch": 0.6287027579162411, + "grad_norm": 1.409986528115615, + "learning_rate": 6.400470894604339e-06, + "loss": 0.7668, + "step": 6155 + }, + { + "epoch": 0.6288049029622064, + "grad_norm": 1.4051823487323436, + "learning_rate": 6.397384515001335e-06, + "loss": 0.676, + "step": 6156 + }, + { + "epoch": 0.6289070480081717, + "grad_norm": 1.5031621105998738, + "learning_rate": 6.394298529706919e-06, + "loss": 0.7039, + "step": 6157 + }, + { + "epoch": 0.6290091930541368, + "grad_norm": 1.4679787605986283, + "learning_rate": 6.3912129390588615e-06, + "loss": 0.6971, + "step": 6158 + }, + { + "epoch": 0.6291113381001021, + "grad_norm": 1.655866605465299, + "learning_rate": 6.3881277433948785e-06, + "loss": 0.7522, + "step": 6159 + }, + { + "epoch": 0.6292134831460674, + "grad_norm": 1.4169112026998378, + "learning_rate": 6.385042943052648e-06, + "loss": 0.6098, + "step": 6160 + }, + { + "epoch": 0.6293156281920327, + "grad_norm": 1.5088972911438008, + "learning_rate": 6.381958538369799e-06, + "loss": 0.6875, + "step": 6161 + }, + { + "epoch": 0.629417773237998, + "grad_norm": 1.393540763856964, + "learning_rate": 6.378874529683929e-06, + "loss": 0.6638, + "step": 6162 + }, + { + "epoch": 0.6295199182839633, + "grad_norm": 1.4814868758339288, + "learning_rate": 6.375790917332581e-06, + "loss": 0.7373, + "step": 6163 + }, + { + "epoch": 0.6296220633299285, + "grad_norm": 1.4958972215313535, + "learning_rate": 6.3727077016532605e-06, + "loss": 0.7292, + "step": 6164 + }, + { + "epoch": 0.6297242083758938, + "grad_norm": 1.382691756925648, + "learning_rate": 6.3696248829834216e-06, + "loss": 0.7479, + "step": 6165 + }, + { + "epoch": 0.629826353421859, + "grad_norm": 1.4689183306559397, + "learning_rate": 6.366542461660487e-06, + "loss": 0.7133, + "step": 6166 + }, + { + "epoch": 0.6299284984678243, + "grad_norm": 1.6578691864028614, + "learning_rate": 6.363460438021828e-06, + "loss": 0.5832, + "step": 6167 + }, + { + "epoch": 0.6300306435137896, + "grad_norm": 1.4716762140685307, + "learning_rate": 6.360378812404774e-06, + "loss": 0.7267, + "step": 6168 + }, + { + "epoch": 0.6301327885597549, + "grad_norm": 1.352135729165762, + "learning_rate": 6.357297585146607e-06, + "loss": 0.6907, + "step": 6169 + }, + { + "epoch": 0.6302349336057201, + "grad_norm": 1.5351627885458556, + "learning_rate": 6.354216756584573e-06, + "loss": 0.5625, + "step": 6170 + }, + { + "epoch": 0.6303370786516854, + "grad_norm": 1.5559090636427537, + "learning_rate": 6.351136327055875e-06, + "loss": 0.7687, + "step": 6171 + }, + { + "epoch": 0.6304392236976507, + "grad_norm": 1.4939070298327277, + "learning_rate": 6.34805629689766e-06, + "loss": 0.7567, + "step": 6172 + }, + { + "epoch": 0.6305413687436159, + "grad_norm": 1.4624280100821394, + "learning_rate": 6.344976666447045e-06, + "loss": 0.7369, + "step": 6173 + }, + { + "epoch": 0.6306435137895812, + "grad_norm": 1.4996062015482285, + "learning_rate": 6.341897436041094e-06, + "loss": 0.7253, + "step": 6174 + }, + { + "epoch": 0.6307456588355465, + "grad_norm": 1.4303744556780493, + "learning_rate": 6.338818606016832e-06, + "loss": 0.6669, + "step": 6175 + }, + { + "epoch": 0.6308478038815117, + "grad_norm": 1.3145261755249669, + "learning_rate": 6.335740176711241e-06, + "loss": 0.6468, + "step": 6176 + }, + { + "epoch": 0.630949948927477, + "grad_norm": 1.2739431330657027, + "learning_rate": 6.332662148461248e-06, + "loss": 0.6335, + "step": 6177 + }, + { + "epoch": 0.6310520939734423, + "grad_norm": 1.5297459910209643, + "learning_rate": 6.32958452160376e-06, + "loss": 0.6628, + "step": 6178 + }, + { + "epoch": 0.6311542390194076, + "grad_norm": 1.440386659421521, + "learning_rate": 6.3265072964756155e-06, + "loss": 0.7348, + "step": 6179 + }, + { + "epoch": 0.6312563840653729, + "grad_norm": 1.4782781421664306, + "learning_rate": 6.32343047341362e-06, + "loss": 0.6466, + "step": 6180 + }, + { + "epoch": 0.631358529111338, + "grad_norm": 1.392628288008539, + "learning_rate": 6.320354052754536e-06, + "loss": 0.7182, + "step": 6181 + }, + { + "epoch": 0.6314606741573033, + "grad_norm": 1.4904357863526596, + "learning_rate": 6.317278034835077e-06, + "loss": 0.8133, + "step": 6182 + }, + { + "epoch": 0.6315628192032686, + "grad_norm": 1.2716580098208816, + "learning_rate": 6.314202419991917e-06, + "loss": 0.6298, + "step": 6183 + }, + { + "epoch": 0.6316649642492339, + "grad_norm": 1.5929248804767717, + "learning_rate": 6.311127208561683e-06, + "loss": 0.7973, + "step": 6184 + }, + { + "epoch": 0.6317671092951992, + "grad_norm": 1.5075047855181032, + "learning_rate": 6.308052400880955e-06, + "loss": 0.6926, + "step": 6185 + }, + { + "epoch": 0.6318692543411645, + "grad_norm": 1.4485528969026127, + "learning_rate": 6.304977997286283e-06, + "loss": 0.7937, + "step": 6186 + }, + { + "epoch": 0.6319713993871298, + "grad_norm": 1.5189585560008951, + "learning_rate": 6.301903998114154e-06, + "loss": 0.6726, + "step": 6187 + }, + { + "epoch": 0.632073544433095, + "grad_norm": 1.4535747623387782, + "learning_rate": 6.298830403701024e-06, + "loss": 0.6461, + "step": 6188 + }, + { + "epoch": 0.6321756894790602, + "grad_norm": 1.4189325564748962, + "learning_rate": 6.295757214383296e-06, + "loss": 0.8286, + "step": 6189 + }, + { + "epoch": 0.6322778345250255, + "grad_norm": 1.4963841322450242, + "learning_rate": 6.292684430497336e-06, + "loss": 0.7421, + "step": 6190 + }, + { + "epoch": 0.6323799795709908, + "grad_norm": 1.4279436184550074, + "learning_rate": 6.289612052379457e-06, + "loss": 0.7743, + "step": 6191 + }, + { + "epoch": 0.6324821246169561, + "grad_norm": 1.4492016880850955, + "learning_rate": 6.286540080365939e-06, + "loss": 0.7429, + "step": 6192 + }, + { + "epoch": 0.6325842696629214, + "grad_norm": 1.495249373750305, + "learning_rate": 6.283468514793005e-06, + "loss": 0.7534, + "step": 6193 + }, + { + "epoch": 0.6326864147088866, + "grad_norm": 1.5968758595614805, + "learning_rate": 6.280397355996843e-06, + "loss": 0.8035, + "step": 6194 + }, + { + "epoch": 0.6327885597548519, + "grad_norm": 1.4433902304256785, + "learning_rate": 6.277326604313598e-06, + "loss": 0.8289, + "step": 6195 + }, + { + "epoch": 0.6328907048008171, + "grad_norm": 1.4080068376210237, + "learning_rate": 6.2742562600793614e-06, + "loss": 0.7034, + "step": 6196 + }, + { + "epoch": 0.6329928498467824, + "grad_norm": 1.4334213083420997, + "learning_rate": 6.2711863236301805e-06, + "loss": 0.7401, + "step": 6197 + }, + { + "epoch": 0.6330949948927477, + "grad_norm": 1.4977396009773944, + "learning_rate": 6.268116795302068e-06, + "loss": 0.7156, + "step": 6198 + }, + { + "epoch": 0.633197139938713, + "grad_norm": 1.5021139954309746, + "learning_rate": 6.265047675430982e-06, + "loss": 0.728, + "step": 6199 + }, + { + "epoch": 0.6332992849846782, + "grad_norm": 1.5336938535119229, + "learning_rate": 6.2619789643528424e-06, + "loss": 0.8164, + "step": 6200 + }, + { + "epoch": 0.6334014300306435, + "grad_norm": 1.356242017092509, + "learning_rate": 6.258910662403517e-06, + "loss": 0.653, + "step": 6201 + }, + { + "epoch": 0.6335035750766088, + "grad_norm": 1.3265460923211345, + "learning_rate": 6.255842769918838e-06, + "loss": 0.7114, + "step": 6202 + }, + { + "epoch": 0.6336057201225741, + "grad_norm": 1.3879826733753458, + "learning_rate": 6.2527752872345895e-06, + "loss": 0.6709, + "step": 6203 + }, + { + "epoch": 0.6337078651685393, + "grad_norm": 1.4680080751645839, + "learning_rate": 6.249708214686505e-06, + "loss": 0.7722, + "step": 6204 + }, + { + "epoch": 0.6338100102145046, + "grad_norm": 1.5092491757988253, + "learning_rate": 6.24664155261028e-06, + "loss": 0.7145, + "step": 6205 + }, + { + "epoch": 0.6339121552604698, + "grad_norm": 1.578997497195343, + "learning_rate": 6.243575301341561e-06, + "loss": 0.7269, + "step": 6206 + }, + { + "epoch": 0.6340143003064351, + "grad_norm": 1.4318419337598796, + "learning_rate": 6.2405094612159546e-06, + "loss": 0.6452, + "step": 6207 + }, + { + "epoch": 0.6341164453524004, + "grad_norm": 1.4816845059788193, + "learning_rate": 6.237444032569013e-06, + "loss": 0.6853, + "step": 6208 + }, + { + "epoch": 0.6342185903983657, + "grad_norm": 1.5123491478029518, + "learning_rate": 6.234379015736253e-06, + "loss": 0.6361, + "step": 6209 + }, + { + "epoch": 0.634320735444331, + "grad_norm": 1.452998072193306, + "learning_rate": 6.231314411053147e-06, + "loss": 0.6088, + "step": 6210 + }, + { + "epoch": 0.6344228804902963, + "grad_norm": 1.4471759682705987, + "learning_rate": 6.22825021885511e-06, + "loss": 0.7437, + "step": 6211 + }, + { + "epoch": 0.6345250255362614, + "grad_norm": 1.4885112002115297, + "learning_rate": 6.225186439477524e-06, + "loss": 0.6542, + "step": 6212 + }, + { + "epoch": 0.6346271705822267, + "grad_norm": 1.4219888059498778, + "learning_rate": 6.222123073255724e-06, + "loss": 0.6043, + "step": 6213 + }, + { + "epoch": 0.634729315628192, + "grad_norm": 1.4993694308449521, + "learning_rate": 6.2190601205249915e-06, + "loss": 0.6974, + "step": 6214 + }, + { + "epoch": 0.6348314606741573, + "grad_norm": 1.4653977402395704, + "learning_rate": 6.215997581620572e-06, + "loss": 0.651, + "step": 6215 + }, + { + "epoch": 0.6349336057201226, + "grad_norm": 1.4504700293284152, + "learning_rate": 6.212935456877663e-06, + "loss": 0.6603, + "step": 6216 + }, + { + "epoch": 0.6350357507660879, + "grad_norm": 1.31453360826875, + "learning_rate": 6.20987374663141e-06, + "loss": 0.577, + "step": 6217 + }, + { + "epoch": 0.6351378958120532, + "grad_norm": 1.5966027384079478, + "learning_rate": 6.20681245121693e-06, + "loss": 0.641, + "step": 6218 + }, + { + "epoch": 0.6352400408580184, + "grad_norm": 1.4582790796757519, + "learning_rate": 6.203751570969275e-06, + "loss": 0.6733, + "step": 6219 + }, + { + "epoch": 0.6353421859039836, + "grad_norm": 1.462234736454773, + "learning_rate": 6.200691106223466e-06, + "loss": 0.7, + "step": 6220 + }, + { + "epoch": 0.6354443309499489, + "grad_norm": 1.4057295654477358, + "learning_rate": 6.197631057314468e-06, + "loss": 0.6323, + "step": 6221 + }, + { + "epoch": 0.6355464759959142, + "grad_norm": 1.5233682182128938, + "learning_rate": 6.194571424577209e-06, + "loss": 0.8243, + "step": 6222 + }, + { + "epoch": 0.6356486210418795, + "grad_norm": 1.4220250645864718, + "learning_rate": 6.191512208346564e-06, + "loss": 0.7041, + "step": 6223 + }, + { + "epoch": 0.6357507660878448, + "grad_norm": 1.6270115198349078, + "learning_rate": 6.188453408957371e-06, + "loss": 0.737, + "step": 6224 + }, + { + "epoch": 0.63585291113381, + "grad_norm": 1.549927916578951, + "learning_rate": 6.1853950267444095e-06, + "loss": 0.6851, + "step": 6225 + }, + { + "epoch": 0.6359550561797753, + "grad_norm": 1.4296112451074776, + "learning_rate": 6.182337062042431e-06, + "loss": 0.6015, + "step": 6226 + }, + { + "epoch": 0.6360572012257405, + "grad_norm": 1.4719461868049277, + "learning_rate": 6.1792795151861285e-06, + "loss": 0.6562, + "step": 6227 + }, + { + "epoch": 0.6361593462717058, + "grad_norm": 1.3184197167047926, + "learning_rate": 6.17622238651015e-06, + "loss": 0.6735, + "step": 6228 + }, + { + "epoch": 0.6362614913176711, + "grad_norm": 1.4591541413720452, + "learning_rate": 6.173165676349103e-06, + "loss": 0.6164, + "step": 6229 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 1.4203075982165287, + "learning_rate": 6.170109385037546e-06, + "loss": 0.7087, + "step": 6230 + }, + { + "epoch": 0.6364657814096016, + "grad_norm": 1.359520142387967, + "learning_rate": 6.167053512909988e-06, + "loss": 0.7115, + "step": 6231 + }, + { + "epoch": 0.6365679264555669, + "grad_norm": 1.3808068490156395, + "learning_rate": 6.163998060300904e-06, + "loss": 0.6391, + "step": 6232 + }, + { + "epoch": 0.6366700715015322, + "grad_norm": 1.405883583688255, + "learning_rate": 6.160943027544706e-06, + "loss": 0.6704, + "step": 6233 + }, + { + "epoch": 0.6367722165474975, + "grad_norm": 1.3646605696719372, + "learning_rate": 6.157888414975773e-06, + "loss": 0.6202, + "step": 6234 + }, + { + "epoch": 0.6368743615934627, + "grad_norm": 1.5158929539816646, + "learning_rate": 6.154834222928439e-06, + "loss": 0.7368, + "step": 6235 + }, + { + "epoch": 0.636976506639428, + "grad_norm": 1.6604850432086664, + "learning_rate": 6.151780451736983e-06, + "loss": 0.6779, + "step": 6236 + }, + { + "epoch": 0.6370786516853932, + "grad_norm": 1.574391380965864, + "learning_rate": 6.148727101735643e-06, + "loss": 0.76, + "step": 6237 + }, + { + "epoch": 0.6371807967313585, + "grad_norm": 1.7394989648284243, + "learning_rate": 6.145674173258609e-06, + "loss": 0.7345, + "step": 6238 + }, + { + "epoch": 0.6372829417773238, + "grad_norm": 1.4628235786694361, + "learning_rate": 6.142621666640028e-06, + "loss": 0.7739, + "step": 6239 + }, + { + "epoch": 0.6373850868232891, + "grad_norm": 1.5121468234428965, + "learning_rate": 6.139569582213997e-06, + "loss": 0.7746, + "step": 6240 + }, + { + "epoch": 0.6374872318692544, + "grad_norm": 1.3744367591821849, + "learning_rate": 6.1365179203145705e-06, + "loss": 0.6738, + "step": 6241 + }, + { + "epoch": 0.6375893769152197, + "grad_norm": 1.5669755563045646, + "learning_rate": 6.13346668127575e-06, + "loss": 0.6892, + "step": 6242 + }, + { + "epoch": 0.6376915219611848, + "grad_norm": 1.4568635355387474, + "learning_rate": 6.1304158654315015e-06, + "loss": 0.8537, + "step": 6243 + }, + { + "epoch": 0.6377936670071501, + "grad_norm": 1.2602954399512702, + "learning_rate": 6.127365473115738e-06, + "loss": 0.612, + "step": 6244 + }, + { + "epoch": 0.6378958120531154, + "grad_norm": 1.4469408580318752, + "learning_rate": 6.124315504662325e-06, + "loss": 0.7272, + "step": 6245 + }, + { + "epoch": 0.6379979570990807, + "grad_norm": 1.448084307636434, + "learning_rate": 6.121265960405085e-06, + "loss": 0.7752, + "step": 6246 + }, + { + "epoch": 0.638100102145046, + "grad_norm": 1.3914522330353567, + "learning_rate": 6.118216840677791e-06, + "loss": 0.7579, + "step": 6247 + }, + { + "epoch": 0.6382022471910113, + "grad_norm": 1.5852464799926391, + "learning_rate": 6.11516814581417e-06, + "loss": 0.7772, + "step": 6248 + }, + { + "epoch": 0.6383043922369765, + "grad_norm": 1.5236108392706704, + "learning_rate": 6.112119876147908e-06, + "loss": 0.7219, + "step": 6249 + }, + { + "epoch": 0.6384065372829418, + "grad_norm": 1.5016271098897103, + "learning_rate": 6.1090720320126325e-06, + "loss": 0.7993, + "step": 6250 + }, + { + "epoch": 0.638508682328907, + "grad_norm": 1.4939538218284871, + "learning_rate": 6.106024613741941e-06, + "loss": 0.6716, + "step": 6251 + }, + { + "epoch": 0.6386108273748723, + "grad_norm": 1.7935364035761887, + "learning_rate": 6.102977621669371e-06, + "loss": 0.7911, + "step": 6252 + }, + { + "epoch": 0.6387129724208376, + "grad_norm": 1.5198210915670876, + "learning_rate": 6.099931056128418e-06, + "loss": 0.5812, + "step": 6253 + }, + { + "epoch": 0.6388151174668029, + "grad_norm": 1.4867267152849766, + "learning_rate": 6.096884917452531e-06, + "loss": 0.739, + "step": 6254 + }, + { + "epoch": 0.6389172625127681, + "grad_norm": 1.4288751536856423, + "learning_rate": 6.093839205975111e-06, + "loss": 0.689, + "step": 6255 + }, + { + "epoch": 0.6390194075587334, + "grad_norm": 1.3179589640284959, + "learning_rate": 6.090793922029514e-06, + "loss": 0.6719, + "step": 6256 + }, + { + "epoch": 0.6391215526046987, + "grad_norm": 1.5559147156948698, + "learning_rate": 6.087749065949047e-06, + "loss": 0.6956, + "step": 6257 + }, + { + "epoch": 0.6392236976506639, + "grad_norm": 1.5465574184309092, + "learning_rate": 6.08470463806697e-06, + "loss": 0.7879, + "step": 6258 + }, + { + "epoch": 0.6393258426966292, + "grad_norm": 1.4421781530341753, + "learning_rate": 6.081660638716505e-06, + "loss": 0.5971, + "step": 6259 + }, + { + "epoch": 0.6394279877425945, + "grad_norm": 1.4028764383352599, + "learning_rate": 6.0786170682308125e-06, + "loss": 0.6198, + "step": 6260 + }, + { + "epoch": 0.6395301327885597, + "grad_norm": 1.496240994624535, + "learning_rate": 6.075573926943016e-06, + "loss": 0.6857, + "step": 6261 + }, + { + "epoch": 0.639632277834525, + "grad_norm": 1.5726742847144117, + "learning_rate": 6.072531215186187e-06, + "loss": 0.6352, + "step": 6262 + }, + { + "epoch": 0.6397344228804903, + "grad_norm": 1.5342798755293476, + "learning_rate": 6.069488933293357e-06, + "loss": 0.7799, + "step": 6263 + }, + { + "epoch": 0.6398365679264556, + "grad_norm": 1.466962476830368, + "learning_rate": 6.066447081597502e-06, + "loss": 0.5925, + "step": 6264 + }, + { + "epoch": 0.6399387129724209, + "grad_norm": 1.516820858944803, + "learning_rate": 6.063405660431553e-06, + "loss": 0.7411, + "step": 6265 + }, + { + "epoch": 0.6400408580183861, + "grad_norm": 1.5568608146490341, + "learning_rate": 6.060364670128396e-06, + "loss": 0.7675, + "step": 6266 + }, + { + "epoch": 0.6401430030643513, + "grad_norm": 1.437974419050373, + "learning_rate": 6.0573241110208755e-06, + "loss": 0.6998, + "step": 6267 + }, + { + "epoch": 0.6402451481103166, + "grad_norm": 1.4162603119460129, + "learning_rate": 6.054283983441776e-06, + "loss": 0.6653, + "step": 6268 + }, + { + "epoch": 0.6403472931562819, + "grad_norm": 1.277162464892732, + "learning_rate": 6.051244287723846e-06, + "loss": 0.5971, + "step": 6269 + }, + { + "epoch": 0.6404494382022472, + "grad_norm": 1.220602949708139, + "learning_rate": 6.048205024199778e-06, + "loss": 0.7208, + "step": 6270 + }, + { + "epoch": 0.6405515832482125, + "grad_norm": 1.3681695386784096, + "learning_rate": 6.0451661932022255e-06, + "loss": 0.7108, + "step": 6271 + }, + { + "epoch": 0.6406537282941778, + "grad_norm": 1.658182041622724, + "learning_rate": 6.042127795063786e-06, + "loss": 0.6594, + "step": 6272 + }, + { + "epoch": 0.6407558733401431, + "grad_norm": 1.577463958203946, + "learning_rate": 6.0390898301170185e-06, + "loss": 0.7456, + "step": 6273 + }, + { + "epoch": 0.6408580183861082, + "grad_norm": 1.3223477041156972, + "learning_rate": 6.036052298694424e-06, + "loss": 0.6867, + "step": 6274 + }, + { + "epoch": 0.6409601634320735, + "grad_norm": 1.5616895836045444, + "learning_rate": 6.0330152011284696e-06, + "loss": 0.7719, + "step": 6275 + }, + { + "epoch": 0.6410623084780388, + "grad_norm": 1.467252891058547, + "learning_rate": 6.029978537751564e-06, + "loss": 0.6348, + "step": 6276 + }, + { + "epoch": 0.6411644535240041, + "grad_norm": 1.469235519563346, + "learning_rate": 6.026942308896073e-06, + "loss": 0.6996, + "step": 6277 + }, + { + "epoch": 0.6412665985699694, + "grad_norm": 1.7020674155693478, + "learning_rate": 6.023906514894313e-06, + "loss": 0.7073, + "step": 6278 + }, + { + "epoch": 0.6413687436159347, + "grad_norm": 1.5687023916689578, + "learning_rate": 6.020871156078554e-06, + "loss": 0.7302, + "step": 6279 + }, + { + "epoch": 0.6414708886618999, + "grad_norm": 1.515846285703917, + "learning_rate": 6.017836232781018e-06, + "loss": 0.5688, + "step": 6280 + }, + { + "epoch": 0.6415730337078651, + "grad_norm": 1.5468003853842678, + "learning_rate": 6.01480174533388e-06, + "loss": 0.6731, + "step": 6281 + }, + { + "epoch": 0.6416751787538304, + "grad_norm": 1.3861808635196455, + "learning_rate": 6.011767694069259e-06, + "loss": 0.7177, + "step": 6282 + }, + { + "epoch": 0.6417773237997957, + "grad_norm": 1.4517288074656356, + "learning_rate": 6.008734079319247e-06, + "loss": 0.6446, + "step": 6283 + }, + { + "epoch": 0.641879468845761, + "grad_norm": 1.4919906162776033, + "learning_rate": 6.005700901415869e-06, + "loss": 0.7099, + "step": 6284 + }, + { + "epoch": 0.6419816138917263, + "grad_norm": 1.4612823533062915, + "learning_rate": 6.002668160691107e-06, + "loss": 0.6198, + "step": 6285 + }, + { + "epoch": 0.6420837589376915, + "grad_norm": 1.5320872605567237, + "learning_rate": 5.999635857476897e-06, + "loss": 0.809, + "step": 6286 + }, + { + "epoch": 0.6421859039836568, + "grad_norm": 1.4252250187186526, + "learning_rate": 5.996603992105128e-06, + "loss": 0.7124, + "step": 6287 + }, + { + "epoch": 0.6422880490296221, + "grad_norm": 1.479817849230376, + "learning_rate": 5.993572564907637e-06, + "loss": 0.6795, + "step": 6288 + }, + { + "epoch": 0.6423901940755873, + "grad_norm": 1.4328461340662038, + "learning_rate": 5.9905415762162176e-06, + "loss": 0.5986, + "step": 6289 + }, + { + "epoch": 0.6424923391215526, + "grad_norm": 1.6153171128679744, + "learning_rate": 5.987511026362611e-06, + "loss": 0.6841, + "step": 6290 + }, + { + "epoch": 0.6425944841675179, + "grad_norm": 1.474212562965794, + "learning_rate": 5.984480915678519e-06, + "loss": 0.7023, + "step": 6291 + }, + { + "epoch": 0.6426966292134831, + "grad_norm": 1.4628158184884703, + "learning_rate": 5.981451244495582e-06, + "loss": 0.8259, + "step": 6292 + }, + { + "epoch": 0.6427987742594484, + "grad_norm": 1.415954260290507, + "learning_rate": 5.978422013145406e-06, + "loss": 0.6458, + "step": 6293 + }, + { + "epoch": 0.6429009193054137, + "grad_norm": 1.482107150450611, + "learning_rate": 5.975393221959535e-06, + "loss": 0.684, + "step": 6294 + }, + { + "epoch": 0.643003064351379, + "grad_norm": 1.4097967647962504, + "learning_rate": 5.97236487126948e-06, + "loss": 0.7265, + "step": 6295 + }, + { + "epoch": 0.6431052093973443, + "grad_norm": 1.6066437209036217, + "learning_rate": 5.969336961406689e-06, + "loss": 0.7774, + "step": 6296 + }, + { + "epoch": 0.6432073544433095, + "grad_norm": 1.3396529778686272, + "learning_rate": 5.966309492702574e-06, + "loss": 0.6666, + "step": 6297 + }, + { + "epoch": 0.6433094994892747, + "grad_norm": 1.6611679074904009, + "learning_rate": 5.963282465488488e-06, + "loss": 0.7887, + "step": 6298 + }, + { + "epoch": 0.64341164453524, + "grad_norm": 1.6416909516868807, + "learning_rate": 5.960255880095746e-06, + "loss": 0.8088, + "step": 6299 + }, + { + "epoch": 0.6435137895812053, + "grad_norm": 1.409404647165669, + "learning_rate": 5.957229736855609e-06, + "loss": 0.7443, + "step": 6300 + }, + { + "epoch": 0.6436159346271706, + "grad_norm": 1.4038165064767516, + "learning_rate": 5.9542040360992895e-06, + "loss": 0.7249, + "step": 6301 + }, + { + "epoch": 0.6437180796731359, + "grad_norm": 1.4505670644750825, + "learning_rate": 5.95117877815795e-06, + "loss": 0.7171, + "step": 6302 + }, + { + "epoch": 0.6438202247191012, + "grad_norm": 1.424467231678113, + "learning_rate": 5.948153963362711e-06, + "loss": 0.5964, + "step": 6303 + }, + { + "epoch": 0.6439223697650664, + "grad_norm": 1.6934730700868863, + "learning_rate": 5.945129592044638e-06, + "loss": 0.7947, + "step": 6304 + }, + { + "epoch": 0.6440245148110316, + "grad_norm": 1.5691252449151116, + "learning_rate": 5.942105664534752e-06, + "loss": 0.7513, + "step": 6305 + }, + { + "epoch": 0.6441266598569969, + "grad_norm": 1.5759238010677883, + "learning_rate": 5.939082181164018e-06, + "loss": 0.7166, + "step": 6306 + }, + { + "epoch": 0.6442288049029622, + "grad_norm": 1.6203843652502852, + "learning_rate": 5.9360591422633654e-06, + "loss": 0.66, + "step": 6307 + }, + { + "epoch": 0.6443309499489275, + "grad_norm": 1.467542102367679, + "learning_rate": 5.933036548163668e-06, + "loss": 0.7398, + "step": 6308 + }, + { + "epoch": 0.6444330949948928, + "grad_norm": 1.371049671255989, + "learning_rate": 5.9300143991957445e-06, + "loss": 0.6845, + "step": 6309 + }, + { + "epoch": 0.644535240040858, + "grad_norm": 1.3700813356801136, + "learning_rate": 5.926992695690378e-06, + "loss": 0.6713, + "step": 6310 + }, + { + "epoch": 0.6446373850868233, + "grad_norm": 1.4485422707085727, + "learning_rate": 5.92397143797829e-06, + "loss": 0.7022, + "step": 6311 + }, + { + "epoch": 0.6447395301327885, + "grad_norm": 1.4308039619657558, + "learning_rate": 5.920950626390163e-06, + "loss": 0.6391, + "step": 6312 + }, + { + "epoch": 0.6448416751787538, + "grad_norm": 1.481273965125313, + "learning_rate": 5.917930261256624e-06, + "loss": 0.6249, + "step": 6313 + }, + { + "epoch": 0.6449438202247191, + "grad_norm": 1.594762582758169, + "learning_rate": 5.9149103429082535e-06, + "loss": 0.8109, + "step": 6314 + }, + { + "epoch": 0.6450459652706844, + "grad_norm": 1.544382420259511, + "learning_rate": 5.91189087167559e-06, + "loss": 0.6977, + "step": 6315 + }, + { + "epoch": 0.6451481103166496, + "grad_norm": 1.5624957361892107, + "learning_rate": 5.908871847889108e-06, + "loss": 0.7703, + "step": 6316 + }, + { + "epoch": 0.6452502553626149, + "grad_norm": 1.3960109333197803, + "learning_rate": 5.905853271879248e-06, + "loss": 0.6361, + "step": 6317 + }, + { + "epoch": 0.6453524004085802, + "grad_norm": 1.4233643441313804, + "learning_rate": 5.902835143976393e-06, + "loss": 0.7047, + "step": 6318 + }, + { + "epoch": 0.6454545454545455, + "grad_norm": 1.3246914761704722, + "learning_rate": 5.8998174645108766e-06, + "loss": 0.6961, + "step": 6319 + }, + { + "epoch": 0.6455566905005107, + "grad_norm": 1.3027690246627308, + "learning_rate": 5.896800233812989e-06, + "loss": 0.7153, + "step": 6320 + }, + { + "epoch": 0.645658835546476, + "grad_norm": 1.311259140583127, + "learning_rate": 5.893783452212965e-06, + "loss": 0.7258, + "step": 6321 + }, + { + "epoch": 0.6457609805924412, + "grad_norm": 1.3147488868040462, + "learning_rate": 5.8907671200409944e-06, + "loss": 0.7149, + "step": 6322 + }, + { + "epoch": 0.6458631256384065, + "grad_norm": 1.4902720867640786, + "learning_rate": 5.887751237627219e-06, + "loss": 0.6608, + "step": 6323 + }, + { + "epoch": 0.6459652706843718, + "grad_norm": 1.542438356662139, + "learning_rate": 5.884735805301729e-06, + "loss": 0.83, + "step": 6324 + }, + { + "epoch": 0.6460674157303371, + "grad_norm": 1.4886161965657752, + "learning_rate": 5.881720823394563e-06, + "loss": 0.7997, + "step": 6325 + }, + { + "epoch": 0.6461695607763024, + "grad_norm": 1.5876435627495205, + "learning_rate": 5.878706292235712e-06, + "loss": 0.7174, + "step": 6326 + }, + { + "epoch": 0.6462717058222677, + "grad_norm": 1.4868227272801122, + "learning_rate": 5.875692212155123e-06, + "loss": 0.7195, + "step": 6327 + }, + { + "epoch": 0.6463738508682328, + "grad_norm": 1.4926683500799103, + "learning_rate": 5.872678583482684e-06, + "loss": 0.6917, + "step": 6328 + }, + { + "epoch": 0.6464759959141981, + "grad_norm": 1.5116797716265975, + "learning_rate": 5.869665406548242e-06, + "loss": 0.7352, + "step": 6329 + }, + { + "epoch": 0.6465781409601634, + "grad_norm": 1.3896217737963354, + "learning_rate": 5.866652681681586e-06, + "loss": 0.5781, + "step": 6330 + }, + { + "epoch": 0.6466802860061287, + "grad_norm": 1.56993081915318, + "learning_rate": 5.863640409212467e-06, + "loss": 0.7232, + "step": 6331 + }, + { + "epoch": 0.646782431052094, + "grad_norm": 1.3129115209525009, + "learning_rate": 5.860628589470578e-06, + "loss": 0.6601, + "step": 6332 + }, + { + "epoch": 0.6468845760980593, + "grad_norm": 1.5171361512116293, + "learning_rate": 5.857617222785563e-06, + "loss": 0.6758, + "step": 6333 + }, + { + "epoch": 0.6469867211440246, + "grad_norm": 1.2263135646165713, + "learning_rate": 5.854606309487023e-06, + "loss": 0.6683, + "step": 6334 + }, + { + "epoch": 0.6470888661899897, + "grad_norm": 1.486673194419447, + "learning_rate": 5.851595849904499e-06, + "loss": 0.8071, + "step": 6335 + }, + { + "epoch": 0.647191011235955, + "grad_norm": 1.3338496817668883, + "learning_rate": 5.848585844367487e-06, + "loss": 0.7452, + "step": 6336 + }, + { + "epoch": 0.6472931562819203, + "grad_norm": 1.441407239701411, + "learning_rate": 5.845576293205439e-06, + "loss": 0.6698, + "step": 6337 + }, + { + "epoch": 0.6473953013278856, + "grad_norm": 1.3852217401224993, + "learning_rate": 5.842567196747747e-06, + "loss": 0.7051, + "step": 6338 + }, + { + "epoch": 0.6474974463738509, + "grad_norm": 1.6108619144981824, + "learning_rate": 5.839558555323764e-06, + "loss": 0.7733, + "step": 6339 + }, + { + "epoch": 0.6475995914198162, + "grad_norm": 1.4394800132962888, + "learning_rate": 5.836550369262783e-06, + "loss": 0.7789, + "step": 6340 + }, + { + "epoch": 0.6477017364657814, + "grad_norm": 1.3628402270894475, + "learning_rate": 5.833542638894056e-06, + "loss": 0.6488, + "step": 6341 + }, + { + "epoch": 0.6478038815117467, + "grad_norm": 1.509343726814961, + "learning_rate": 5.830535364546779e-06, + "loss": 0.6341, + "step": 6342 + }, + { + "epoch": 0.6479060265577119, + "grad_norm": 1.5954579259879267, + "learning_rate": 5.827528546550101e-06, + "loss": 0.7731, + "step": 6343 + }, + { + "epoch": 0.6480081716036772, + "grad_norm": 1.4775873272210471, + "learning_rate": 5.824522185233115e-06, + "loss": 0.5926, + "step": 6344 + }, + { + "epoch": 0.6481103166496425, + "grad_norm": 1.4302511416703345, + "learning_rate": 5.8215162809248785e-06, + "loss": 0.7, + "step": 6345 + }, + { + "epoch": 0.6482124616956078, + "grad_norm": 1.4511007000605698, + "learning_rate": 5.818510833954379e-06, + "loss": 0.7171, + "step": 6346 + }, + { + "epoch": 0.648314606741573, + "grad_norm": 1.475167437030065, + "learning_rate": 5.815505844650576e-06, + "loss": 0.7625, + "step": 6347 + }, + { + "epoch": 0.6484167517875383, + "grad_norm": 1.4842598560934537, + "learning_rate": 5.81250131334236e-06, + "loss": 0.6768, + "step": 6348 + }, + { + "epoch": 0.6485188968335036, + "grad_norm": 1.6331893348035946, + "learning_rate": 5.809497240358578e-06, + "loss": 0.739, + "step": 6349 + }, + { + "epoch": 0.6486210418794689, + "grad_norm": 1.3449957211538792, + "learning_rate": 5.806493626028033e-06, + "loss": 0.6524, + "step": 6350 + }, + { + "epoch": 0.6487231869254341, + "grad_norm": 1.4835045860482052, + "learning_rate": 5.803490470679473e-06, + "loss": 0.6641, + "step": 6351 + }, + { + "epoch": 0.6488253319713994, + "grad_norm": 1.5332059158632318, + "learning_rate": 5.80048777464159e-06, + "loss": 0.7534, + "step": 6352 + }, + { + "epoch": 0.6489274770173646, + "grad_norm": 1.5139463503928976, + "learning_rate": 5.797485538243034e-06, + "loss": 0.6745, + "step": 6353 + }, + { + "epoch": 0.6490296220633299, + "grad_norm": 1.5535727612526296, + "learning_rate": 5.794483761812393e-06, + "loss": 0.6875, + "step": 6354 + }, + { + "epoch": 0.6491317671092952, + "grad_norm": 1.3930562914335123, + "learning_rate": 5.7914824456782296e-06, + "loss": 0.6241, + "step": 6355 + }, + { + "epoch": 0.6492339121552605, + "grad_norm": 1.503355955866966, + "learning_rate": 5.7884815901690306e-06, + "loss": 0.7057, + "step": 6356 + }, + { + "epoch": 0.6493360572012258, + "grad_norm": 1.678559934401488, + "learning_rate": 5.785481195613243e-06, + "loss": 0.8101, + "step": 6357 + }, + { + "epoch": 0.6494382022471911, + "grad_norm": 1.4271361015473665, + "learning_rate": 5.782481262339261e-06, + "loss": 0.6927, + "step": 6358 + }, + { + "epoch": 0.6495403472931562, + "grad_norm": 1.4001833232568726, + "learning_rate": 5.779481790675426e-06, + "loss": 0.5893, + "step": 6359 + }, + { + "epoch": 0.6496424923391215, + "grad_norm": 1.4763191347456233, + "learning_rate": 5.776482780950041e-06, + "loss": 0.688, + "step": 6360 + }, + { + "epoch": 0.6497446373850868, + "grad_norm": 1.3113638951517055, + "learning_rate": 5.773484233491342e-06, + "loss": 0.6988, + "step": 6361 + }, + { + "epoch": 0.6498467824310521, + "grad_norm": 1.3460519984892971, + "learning_rate": 5.770486148627523e-06, + "loss": 0.6623, + "step": 6362 + }, + { + "epoch": 0.6499489274770174, + "grad_norm": 1.5451060793849392, + "learning_rate": 5.76748852668673e-06, + "loss": 0.765, + "step": 6363 + }, + { + "epoch": 0.6500510725229827, + "grad_norm": 1.4721720706511154, + "learning_rate": 5.764491367997049e-06, + "loss": 0.7641, + "step": 6364 + }, + { + "epoch": 0.650153217568948, + "grad_norm": 1.462312018560201, + "learning_rate": 5.7614946728865275e-06, + "loss": 0.6499, + "step": 6365 + }, + { + "epoch": 0.6502553626149131, + "grad_norm": 1.5262239818356969, + "learning_rate": 5.758498441683154e-06, + "loss": 0.7359, + "step": 6366 + }, + { + "epoch": 0.6503575076608784, + "grad_norm": 1.40463504869887, + "learning_rate": 5.755502674714865e-06, + "loss": 0.801, + "step": 6367 + }, + { + "epoch": 0.6504596527068437, + "grad_norm": 1.3855909996868994, + "learning_rate": 5.752507372309546e-06, + "loss": 0.7379, + "step": 6368 + }, + { + "epoch": 0.650561797752809, + "grad_norm": 1.2718627967339275, + "learning_rate": 5.749512534795044e-06, + "loss": 0.6066, + "step": 6369 + }, + { + "epoch": 0.6506639427987743, + "grad_norm": 1.4181372334478919, + "learning_rate": 5.7465181624991374e-06, + "loss": 0.8462, + "step": 6370 + }, + { + "epoch": 0.6507660878447395, + "grad_norm": 1.5282786810820324, + "learning_rate": 5.7435242557495705e-06, + "loss": 0.727, + "step": 6371 + }, + { + "epoch": 0.6508682328907048, + "grad_norm": 1.5203966197020127, + "learning_rate": 5.740530814874023e-06, + "loss": 0.7547, + "step": 6372 + }, + { + "epoch": 0.6509703779366701, + "grad_norm": 1.4331867228001234, + "learning_rate": 5.737537840200132e-06, + "loss": 0.6602, + "step": 6373 + }, + { + "epoch": 0.6510725229826353, + "grad_norm": 1.59507792623401, + "learning_rate": 5.734545332055471e-06, + "loss": 0.7751, + "step": 6374 + }, + { + "epoch": 0.6511746680286006, + "grad_norm": 1.4134453399559137, + "learning_rate": 5.731553290767584e-06, + "loss": 0.6088, + "step": 6375 + }, + { + "epoch": 0.6512768130745659, + "grad_norm": 1.5010996630166384, + "learning_rate": 5.728561716663949e-06, + "loss": 0.6873, + "step": 6376 + }, + { + "epoch": 0.6513789581205311, + "grad_norm": 1.4138872034974663, + "learning_rate": 5.7255706100719925e-06, + "loss": 0.6504, + "step": 6377 + }, + { + "epoch": 0.6514811031664964, + "grad_norm": 1.5847195954841544, + "learning_rate": 5.722579971319091e-06, + "loss": 0.7621, + "step": 6378 + }, + { + "epoch": 0.6515832482124617, + "grad_norm": 1.581836987343748, + "learning_rate": 5.719589800732575e-06, + "loss": 0.7084, + "step": 6379 + }, + { + "epoch": 0.651685393258427, + "grad_norm": 1.4856061002120062, + "learning_rate": 5.716600098639724e-06, + "loss": 0.7705, + "step": 6380 + }, + { + "epoch": 0.6517875383043923, + "grad_norm": 1.478977177570494, + "learning_rate": 5.71361086536776e-06, + "loss": 0.712, + "step": 6381 + }, + { + "epoch": 0.6518896833503575, + "grad_norm": 1.4958918162723975, + "learning_rate": 5.710622101243857e-06, + "loss": 0.6564, + "step": 6382 + }, + { + "epoch": 0.6519918283963227, + "grad_norm": 1.3043031060841932, + "learning_rate": 5.70763380659513e-06, + "loss": 0.7272, + "step": 6383 + }, + { + "epoch": 0.652093973442288, + "grad_norm": 1.6663607740503847, + "learning_rate": 5.704645981748662e-06, + "loss": 0.7739, + "step": 6384 + }, + { + "epoch": 0.6521961184882533, + "grad_norm": 1.572339684632511, + "learning_rate": 5.701658627031466e-06, + "loss": 0.7304, + "step": 6385 + }, + { + "epoch": 0.6522982635342186, + "grad_norm": 1.4765424427825464, + "learning_rate": 5.698671742770507e-06, + "loss": 0.6868, + "step": 6386 + }, + { + "epoch": 0.6524004085801839, + "grad_norm": 1.5123910337252047, + "learning_rate": 5.695685329292708e-06, + "loss": 0.7538, + "step": 6387 + }, + { + "epoch": 0.6525025536261492, + "grad_norm": 1.4264475689472662, + "learning_rate": 5.692699386924927e-06, + "loss": 0.6709, + "step": 6388 + }, + { + "epoch": 0.6526046986721143, + "grad_norm": 1.47220997476032, + "learning_rate": 5.689713915993985e-06, + "loss": 0.7414, + "step": 6389 + }, + { + "epoch": 0.6527068437180796, + "grad_norm": 1.563475522528441, + "learning_rate": 5.68672891682664e-06, + "loss": 0.6984, + "step": 6390 + }, + { + "epoch": 0.6528089887640449, + "grad_norm": 1.5787700972102408, + "learning_rate": 5.683744389749602e-06, + "loss": 0.6489, + "step": 6391 + }, + { + "epoch": 0.6529111338100102, + "grad_norm": 1.4193504468392486, + "learning_rate": 5.680760335089528e-06, + "loss": 0.7097, + "step": 6392 + }, + { + "epoch": 0.6530132788559755, + "grad_norm": 1.2411623726872238, + "learning_rate": 5.6777767531730225e-06, + "loss": 0.6962, + "step": 6393 + }, + { + "epoch": 0.6531154239019408, + "grad_norm": 1.4091315428955322, + "learning_rate": 5.6747936443266435e-06, + "loss": 0.6075, + "step": 6394 + }, + { + "epoch": 0.6532175689479061, + "grad_norm": 1.4631515867390752, + "learning_rate": 5.6718110088769e-06, + "loss": 0.7178, + "step": 6395 + }, + { + "epoch": 0.6533197139938713, + "grad_norm": 1.2854831506396007, + "learning_rate": 5.6688288471502365e-06, + "loss": 0.6645, + "step": 6396 + }, + { + "epoch": 0.6534218590398365, + "grad_norm": 1.4117525668697681, + "learning_rate": 5.665847159473053e-06, + "loss": 0.6162, + "step": 6397 + }, + { + "epoch": 0.6535240040858018, + "grad_norm": 1.4588081405168964, + "learning_rate": 5.6628659461716965e-06, + "loss": 0.7265, + "step": 6398 + }, + { + "epoch": 0.6536261491317671, + "grad_norm": 1.5537495002960608, + "learning_rate": 5.659885207572466e-06, + "loss": 0.7343, + "step": 6399 + }, + { + "epoch": 0.6537282941777324, + "grad_norm": 1.6106729023277113, + "learning_rate": 5.656904944001607e-06, + "loss": 0.6697, + "step": 6400 + }, + { + "epoch": 0.6538304392236977, + "grad_norm": 1.5463585793752392, + "learning_rate": 5.653925155785305e-06, + "loss": 0.7391, + "step": 6401 + }, + { + "epoch": 0.6539325842696629, + "grad_norm": 1.399071494854687, + "learning_rate": 5.6509458432496985e-06, + "loss": 0.6772, + "step": 6402 + }, + { + "epoch": 0.6540347293156282, + "grad_norm": 1.4919613571178558, + "learning_rate": 5.64796700672088e-06, + "loss": 0.7101, + "step": 6403 + }, + { + "epoch": 0.6541368743615935, + "grad_norm": 1.4797670762771453, + "learning_rate": 5.644988646524889e-06, + "loss": 0.6975, + "step": 6404 + }, + { + "epoch": 0.6542390194075587, + "grad_norm": 1.5498117775501439, + "learning_rate": 5.642010762987704e-06, + "loss": 0.6865, + "step": 6405 + }, + { + "epoch": 0.654341164453524, + "grad_norm": 1.4989140822337448, + "learning_rate": 5.639033356435257e-06, + "loss": 0.6455, + "step": 6406 + }, + { + "epoch": 0.6544433094994893, + "grad_norm": 1.4019225799138728, + "learning_rate": 5.636056427193426e-06, + "loss": 0.6001, + "step": 6407 + }, + { + "epoch": 0.6545454545454545, + "grad_norm": 1.320388791277678, + "learning_rate": 5.633079975588035e-06, + "loss": 0.7011, + "step": 6408 + }, + { + "epoch": 0.6546475995914198, + "grad_norm": 1.644115941851424, + "learning_rate": 5.630104001944865e-06, + "loss": 0.6918, + "step": 6409 + }, + { + "epoch": 0.6547497446373851, + "grad_norm": 1.6101219848444763, + "learning_rate": 5.627128506589633e-06, + "loss": 0.6704, + "step": 6410 + }, + { + "epoch": 0.6548518896833504, + "grad_norm": 1.6161133579459637, + "learning_rate": 5.624153489848014e-06, + "loss": 0.7291, + "step": 6411 + }, + { + "epoch": 0.6549540347293157, + "grad_norm": 1.5440675157851353, + "learning_rate": 5.621178952045622e-06, + "loss": 0.6293, + "step": 6412 + }, + { + "epoch": 0.6550561797752809, + "grad_norm": 1.7283246226720521, + "learning_rate": 5.618204893508021e-06, + "loss": 0.697, + "step": 6413 + }, + { + "epoch": 0.6551583248212461, + "grad_norm": 1.3559492819230137, + "learning_rate": 5.615231314560727e-06, + "loss": 0.5894, + "step": 6414 + }, + { + "epoch": 0.6552604698672114, + "grad_norm": 1.5935615385559916, + "learning_rate": 5.6122582155292e-06, + "loss": 0.6376, + "step": 6415 + }, + { + "epoch": 0.6553626149131767, + "grad_norm": 1.538253692371307, + "learning_rate": 5.609285596738847e-06, + "loss": 0.6759, + "step": 6416 + }, + { + "epoch": 0.655464759959142, + "grad_norm": 1.4012332649924164, + "learning_rate": 5.606313458515017e-06, + "loss": 0.6324, + "step": 6417 + }, + { + "epoch": 0.6555669050051073, + "grad_norm": 1.336031351324118, + "learning_rate": 5.603341801183017e-06, + "loss": 0.5308, + "step": 6418 + }, + { + "epoch": 0.6556690500510726, + "grad_norm": 1.492685158102753, + "learning_rate": 5.600370625068103e-06, + "loss": 0.6106, + "step": 6419 + }, + { + "epoch": 0.6557711950970377, + "grad_norm": 1.4597778620592337, + "learning_rate": 5.597399930495466e-06, + "loss": 0.795, + "step": 6420 + }, + { + "epoch": 0.655873340143003, + "grad_norm": 1.4632272190260227, + "learning_rate": 5.594429717790251e-06, + "loss": 0.613, + "step": 6421 + }, + { + "epoch": 0.6559754851889683, + "grad_norm": 1.5499079670946054, + "learning_rate": 5.591459987277545e-06, + "loss": 0.7384, + "step": 6422 + }, + { + "epoch": 0.6560776302349336, + "grad_norm": 1.3879338002574035, + "learning_rate": 5.588490739282396e-06, + "loss": 0.6619, + "step": 6423 + }, + { + "epoch": 0.6561797752808989, + "grad_norm": 1.3718112923058157, + "learning_rate": 5.585521974129786e-06, + "loss": 0.6934, + "step": 6424 + }, + { + "epoch": 0.6562819203268642, + "grad_norm": 1.4101973443413434, + "learning_rate": 5.582553692144648e-06, + "loss": 0.7318, + "step": 6425 + }, + { + "epoch": 0.6563840653728295, + "grad_norm": 1.466562194126063, + "learning_rate": 5.57958589365186e-06, + "loss": 0.6801, + "step": 6426 + }, + { + "epoch": 0.6564862104187947, + "grad_norm": 1.3753948035749377, + "learning_rate": 5.576618578976254e-06, + "loss": 0.6615, + "step": 6427 + }, + { + "epoch": 0.6565883554647599, + "grad_norm": 1.4559602828184697, + "learning_rate": 5.573651748442599e-06, + "loss": 0.7807, + "step": 6428 + }, + { + "epoch": 0.6566905005107252, + "grad_norm": 1.4531113426015971, + "learning_rate": 5.570685402375623e-06, + "loss": 0.8324, + "step": 6429 + }, + { + "epoch": 0.6567926455566905, + "grad_norm": 1.5327802305331653, + "learning_rate": 5.567719541099992e-06, + "loss": 0.7614, + "step": 6430 + }, + { + "epoch": 0.6568947906026558, + "grad_norm": 1.3291991390675684, + "learning_rate": 5.564754164940322e-06, + "loss": 0.6325, + "step": 6431 + }, + { + "epoch": 0.656996935648621, + "grad_norm": 1.617498900406194, + "learning_rate": 5.5617892742211685e-06, + "loss": 0.7064, + "step": 6432 + }, + { + "epoch": 0.6570990806945863, + "grad_norm": 1.4534551761199075, + "learning_rate": 5.558824869267051e-06, + "loss": 0.669, + "step": 6433 + }, + { + "epoch": 0.6572012257405516, + "grad_norm": 1.3924221299839745, + "learning_rate": 5.555860950402417e-06, + "loss": 0.6807, + "step": 6434 + }, + { + "epoch": 0.6573033707865169, + "grad_norm": 1.4733493077412563, + "learning_rate": 5.552897517951678e-06, + "loss": 0.709, + "step": 6435 + }, + { + "epoch": 0.6574055158324821, + "grad_norm": 1.494083039848911, + "learning_rate": 5.549934572239177e-06, + "loss": 0.7293, + "step": 6436 + }, + { + "epoch": 0.6575076608784474, + "grad_norm": 1.4418543852146206, + "learning_rate": 5.546972113589211e-06, + "loss": 0.6049, + "step": 6437 + }, + { + "epoch": 0.6576098059244126, + "grad_norm": 1.4503013134343983, + "learning_rate": 5.544010142326026e-06, + "loss": 0.7854, + "step": 6438 + }, + { + "epoch": 0.6577119509703779, + "grad_norm": 1.620662963952648, + "learning_rate": 5.5410486587738096e-06, + "loss": 0.861, + "step": 6439 + }, + { + "epoch": 0.6578140960163432, + "grad_norm": 1.320980141073788, + "learning_rate": 5.5380876632566995e-06, + "loss": 0.6461, + "step": 6440 + }, + { + "epoch": 0.6579162410623085, + "grad_norm": 1.4444173840822545, + "learning_rate": 5.535127156098776e-06, + "loss": 0.5556, + "step": 6441 + }, + { + "epoch": 0.6580183861082738, + "grad_norm": 1.424762596957635, + "learning_rate": 5.532167137624064e-06, + "loss": 0.7649, + "step": 6442 + }, + { + "epoch": 0.6581205311542391, + "grad_norm": 1.443251849291834, + "learning_rate": 5.529207608156553e-06, + "loss": 0.6466, + "step": 6443 + }, + { + "epoch": 0.6582226762002042, + "grad_norm": 1.4949742404987731, + "learning_rate": 5.526248568020156e-06, + "loss": 0.7689, + "step": 6444 + }, + { + "epoch": 0.6583248212461695, + "grad_norm": 1.5021058512750207, + "learning_rate": 5.523290017538745e-06, + "loss": 0.7298, + "step": 6445 + }, + { + "epoch": 0.6584269662921348, + "grad_norm": 1.297823489032694, + "learning_rate": 5.520331957036134e-06, + "loss": 0.5758, + "step": 6446 + }, + { + "epoch": 0.6585291113381001, + "grad_norm": 1.2451483847653233, + "learning_rate": 5.517374386836081e-06, + "loss": 0.4773, + "step": 6447 + }, + { + "epoch": 0.6586312563840654, + "grad_norm": 1.4070563818357695, + "learning_rate": 5.5144173072623e-06, + "loss": 0.6623, + "step": 6448 + }, + { + "epoch": 0.6587334014300307, + "grad_norm": 1.5243881935184953, + "learning_rate": 5.511460718638444e-06, + "loss": 0.7343, + "step": 6449 + }, + { + "epoch": 0.658835546475996, + "grad_norm": 1.667082783672821, + "learning_rate": 5.508504621288107e-06, + "loss": 0.7512, + "step": 6450 + }, + { + "epoch": 0.6589376915219611, + "grad_norm": 1.7212574845060382, + "learning_rate": 5.505549015534846e-06, + "loss": 0.841, + "step": 6451 + }, + { + "epoch": 0.6590398365679264, + "grad_norm": 1.378710960646386, + "learning_rate": 5.502593901702145e-06, + "loss": 0.7135, + "step": 6452 + }, + { + "epoch": 0.6591419816138917, + "grad_norm": 1.4052582965138933, + "learning_rate": 5.499639280113452e-06, + "loss": 0.656, + "step": 6453 + }, + { + "epoch": 0.659244126659857, + "grad_norm": 1.4718936841890282, + "learning_rate": 5.496685151092145e-06, + "loss": 0.6765, + "step": 6454 + }, + { + "epoch": 0.6593462717058223, + "grad_norm": 1.407794575075968, + "learning_rate": 5.49373151496156e-06, + "loss": 0.5649, + "step": 6455 + }, + { + "epoch": 0.6594484167517876, + "grad_norm": 1.5143000127292756, + "learning_rate": 5.490778372044967e-06, + "loss": 0.7044, + "step": 6456 + }, + { + "epoch": 0.6595505617977528, + "grad_norm": 1.5182534188864187, + "learning_rate": 5.487825722665599e-06, + "loss": 0.6589, + "step": 6457 + }, + { + "epoch": 0.6596527068437181, + "grad_norm": 1.4667008927160596, + "learning_rate": 5.484873567146617e-06, + "loss": 0.7224, + "step": 6458 + }, + { + "epoch": 0.6597548518896833, + "grad_norm": 1.5326465357974048, + "learning_rate": 5.481921905811144e-06, + "loss": 0.7052, + "step": 6459 + }, + { + "epoch": 0.6598569969356486, + "grad_norm": 1.6814841682182398, + "learning_rate": 5.478970738982236e-06, + "loss": 0.769, + "step": 6460 + }, + { + "epoch": 0.6599591419816139, + "grad_norm": 1.4830018545780765, + "learning_rate": 5.476020066982903e-06, + "loss": 0.6657, + "step": 6461 + }, + { + "epoch": 0.6600612870275792, + "grad_norm": 1.3549759526147498, + "learning_rate": 5.473069890136094e-06, + "loss": 0.7184, + "step": 6462 + }, + { + "epoch": 0.6601634320735444, + "grad_norm": 1.438517311815641, + "learning_rate": 5.470120208764713e-06, + "loss": 0.7584, + "step": 6463 + }, + { + "epoch": 0.6602655771195097, + "grad_norm": 1.3336390518437293, + "learning_rate": 5.467171023191601e-06, + "loss": 0.6437, + "step": 6464 + }, + { + "epoch": 0.660367722165475, + "grad_norm": 1.349422722425278, + "learning_rate": 5.464222333739551e-06, + "loss": 0.6402, + "step": 6465 + }, + { + "epoch": 0.6604698672114403, + "grad_norm": 1.4738551892646254, + "learning_rate": 5.4612741407312905e-06, + "loss": 0.6567, + "step": 6466 + }, + { + "epoch": 0.6605720122574055, + "grad_norm": 1.5419523999833575, + "learning_rate": 5.458326444489509e-06, + "loss": 0.7172, + "step": 6467 + }, + { + "epoch": 0.6606741573033708, + "grad_norm": 1.2470581652385127, + "learning_rate": 5.455379245336836e-06, + "loss": 0.5738, + "step": 6468 + }, + { + "epoch": 0.660776302349336, + "grad_norm": 1.3619156841412574, + "learning_rate": 5.4524325435958424e-06, + "loss": 0.6338, + "step": 6469 + }, + { + "epoch": 0.6608784473953013, + "grad_norm": 1.473565004636987, + "learning_rate": 5.4494863395890426e-06, + "loss": 0.6713, + "step": 6470 + }, + { + "epoch": 0.6609805924412666, + "grad_norm": 1.563945853726889, + "learning_rate": 5.4465406336389015e-06, + "loss": 0.7009, + "step": 6471 + }, + { + "epoch": 0.6610827374872319, + "grad_norm": 1.612131754723165, + "learning_rate": 5.443595426067831e-06, + "loss": 0.7489, + "step": 6472 + }, + { + "epoch": 0.6611848825331972, + "grad_norm": 1.3962362114749298, + "learning_rate": 5.440650717198188e-06, + "loss": 0.7396, + "step": 6473 + }, + { + "epoch": 0.6612870275791624, + "grad_norm": 1.3733358544311225, + "learning_rate": 5.437706507352264e-06, + "loss": 0.6004, + "step": 6474 + }, + { + "epoch": 0.6613891726251276, + "grad_norm": 1.4649064634105868, + "learning_rate": 5.434762796852315e-06, + "loss": 0.6832, + "step": 6475 + }, + { + "epoch": 0.6614913176710929, + "grad_norm": 1.434765150725726, + "learning_rate": 5.431819586020523e-06, + "loss": 0.6423, + "step": 6476 + }, + { + "epoch": 0.6615934627170582, + "grad_norm": 1.3863263050926706, + "learning_rate": 5.428876875179032e-06, + "loss": 0.6334, + "step": 6477 + }, + { + "epoch": 0.6616956077630235, + "grad_norm": 1.6001201553599602, + "learning_rate": 5.425934664649921e-06, + "loss": 0.8054, + "step": 6478 + }, + { + "epoch": 0.6617977528089888, + "grad_norm": 1.578634973435631, + "learning_rate": 5.422992954755217e-06, + "loss": 0.7148, + "step": 6479 + }, + { + "epoch": 0.6618998978549541, + "grad_norm": 1.6205820698479412, + "learning_rate": 5.42005174581689e-06, + "loss": 0.7558, + "step": 6480 + }, + { + "epoch": 0.6620020429009194, + "grad_norm": 1.360396711234745, + "learning_rate": 5.417111038156855e-06, + "loss": 0.6639, + "step": 6481 + }, + { + "epoch": 0.6621041879468845, + "grad_norm": 1.3258747500264592, + "learning_rate": 5.414170832096979e-06, + "loss": 0.7072, + "step": 6482 + }, + { + "epoch": 0.6622063329928498, + "grad_norm": 1.5084639101310708, + "learning_rate": 5.411231127959072e-06, + "loss": 0.7023, + "step": 6483 + }, + { + "epoch": 0.6623084780388151, + "grad_norm": 1.4111645373653259, + "learning_rate": 5.40829192606488e-06, + "loss": 0.7475, + "step": 6484 + }, + { + "epoch": 0.6624106230847804, + "grad_norm": 1.5271301900245238, + "learning_rate": 5.405353226736105e-06, + "loss": 0.6814, + "step": 6485 + }, + { + "epoch": 0.6625127681307457, + "grad_norm": 1.4094411106383697, + "learning_rate": 5.4024150302943844e-06, + "loss": 0.6854, + "step": 6486 + }, + { + "epoch": 0.662614913176711, + "grad_norm": 1.4444107854209403, + "learning_rate": 5.399477337061313e-06, + "loss": 0.7403, + "step": 6487 + }, + { + "epoch": 0.6627170582226762, + "grad_norm": 1.4123421999324308, + "learning_rate": 5.3965401473584186e-06, + "loss": 0.6387, + "step": 6488 + }, + { + "epoch": 0.6628192032686415, + "grad_norm": 1.5440964240956108, + "learning_rate": 5.39360346150718e-06, + "loss": 0.7416, + "step": 6489 + }, + { + "epoch": 0.6629213483146067, + "grad_norm": 1.5282751224424966, + "learning_rate": 5.3906672798290135e-06, + "loss": 0.717, + "step": 6490 + }, + { + "epoch": 0.663023493360572, + "grad_norm": 1.1771836562756872, + "learning_rate": 5.3877316026452944e-06, + "loss": 0.6176, + "step": 6491 + }, + { + "epoch": 0.6631256384065373, + "grad_norm": 1.5891312320059654, + "learning_rate": 5.384796430277333e-06, + "loss": 0.638, + "step": 6492 + }, + { + "epoch": 0.6632277834525026, + "grad_norm": 1.485319485215655, + "learning_rate": 5.381861763046383e-06, + "loss": 0.7164, + "step": 6493 + }, + { + "epoch": 0.6633299284984678, + "grad_norm": 1.4032204114898663, + "learning_rate": 5.378927601273648e-06, + "loss": 0.6983, + "step": 6494 + }, + { + "epoch": 0.6634320735444331, + "grad_norm": 1.3612329496393076, + "learning_rate": 5.375993945280273e-06, + "loss": 0.7325, + "step": 6495 + }, + { + "epoch": 0.6635342185903984, + "grad_norm": 1.5981859503469504, + "learning_rate": 5.3730607953873436e-06, + "loss": 0.7693, + "step": 6496 + }, + { + "epoch": 0.6636363636363637, + "grad_norm": 1.5333443813825893, + "learning_rate": 5.370128151915903e-06, + "loss": 0.6943, + "step": 6497 + }, + { + "epoch": 0.6637385086823289, + "grad_norm": 1.6257901390006935, + "learning_rate": 5.367196015186924e-06, + "loss": 0.7508, + "step": 6498 + }, + { + "epoch": 0.6638406537282941, + "grad_norm": 1.445519200534978, + "learning_rate": 5.36426438552134e-06, + "loss": 0.684, + "step": 6499 + }, + { + "epoch": 0.6639427987742594, + "grad_norm": 1.4069808746085324, + "learning_rate": 5.361333263240012e-06, + "loss": 0.658, + "step": 6500 + }, + { + "epoch": 0.6640449438202247, + "grad_norm": 1.4878350649869472, + "learning_rate": 5.358402648663752e-06, + "loss": 0.7193, + "step": 6501 + }, + { + "epoch": 0.66414708886619, + "grad_norm": 1.4000396055031883, + "learning_rate": 5.355472542113325e-06, + "loss": 0.5534, + "step": 6502 + }, + { + "epoch": 0.6642492339121553, + "grad_norm": 1.4459998675335881, + "learning_rate": 5.3525429439094275e-06, + "loss": 0.6397, + "step": 6503 + }, + { + "epoch": 0.6643513789581206, + "grad_norm": 1.5814478450083063, + "learning_rate": 5.34961385437271e-06, + "loss": 0.8189, + "step": 6504 + }, + { + "epoch": 0.6644535240040857, + "grad_norm": 1.5744139873093734, + "learning_rate": 5.346685273823755e-06, + "loss": 0.733, + "step": 6505 + }, + { + "epoch": 0.664555669050051, + "grad_norm": 1.5684568120459548, + "learning_rate": 5.343757202583104e-06, + "loss": 0.7395, + "step": 6506 + }, + { + "epoch": 0.6646578140960163, + "grad_norm": 1.500171207711971, + "learning_rate": 5.340829640971239e-06, + "loss": 0.6649, + "step": 6507 + }, + { + "epoch": 0.6647599591419816, + "grad_norm": 1.4138434026554247, + "learning_rate": 5.337902589308579e-06, + "loss": 0.6882, + "step": 6508 + }, + { + "epoch": 0.6648621041879469, + "grad_norm": 1.4934437149639046, + "learning_rate": 5.334976047915494e-06, + "loss": 0.7525, + "step": 6509 + }, + { + "epoch": 0.6649642492339122, + "grad_norm": 1.5406119987203275, + "learning_rate": 5.332050017112292e-06, + "loss": 0.7706, + "step": 6510 + }, + { + "epoch": 0.6650663942798775, + "grad_norm": 1.5439920515651857, + "learning_rate": 5.329124497219232e-06, + "loss": 0.7902, + "step": 6511 + }, + { + "epoch": 0.6651685393258427, + "grad_norm": 1.6255241994993048, + "learning_rate": 5.326199488556516e-06, + "loss": 0.5789, + "step": 6512 + }, + { + "epoch": 0.6652706843718079, + "grad_norm": 1.6199711517177522, + "learning_rate": 5.323274991444285e-06, + "loss": 0.7271, + "step": 6513 + }, + { + "epoch": 0.6653728294177732, + "grad_norm": 1.686948899869703, + "learning_rate": 5.320351006202624e-06, + "loss": 0.6075, + "step": 6514 + }, + { + "epoch": 0.6654749744637385, + "grad_norm": 1.6663570823939364, + "learning_rate": 5.317427533151572e-06, + "loss": 0.6938, + "step": 6515 + }, + { + "epoch": 0.6655771195097038, + "grad_norm": 1.638513014635251, + "learning_rate": 5.3145045726110984e-06, + "loss": 0.7046, + "step": 6516 + }, + { + "epoch": 0.6656792645556691, + "grad_norm": 1.5384884184510565, + "learning_rate": 5.311582124901131e-06, + "loss": 0.7302, + "step": 6517 + }, + { + "epoch": 0.6657814096016343, + "grad_norm": 1.5998308938073411, + "learning_rate": 5.308660190341528e-06, + "loss": 0.7253, + "step": 6518 + }, + { + "epoch": 0.6658835546475996, + "grad_norm": 1.4463100253888226, + "learning_rate": 5.3057387692521e-06, + "loss": 0.6748, + "step": 6519 + }, + { + "epoch": 0.6659856996935649, + "grad_norm": 1.4107388180518095, + "learning_rate": 5.302817861952592e-06, + "loss": 0.5955, + "step": 6520 + }, + { + "epoch": 0.6660878447395301, + "grad_norm": 1.4218335993768003, + "learning_rate": 5.299897468762707e-06, + "loss": 0.6951, + "step": 6521 + }, + { + "epoch": 0.6661899897854954, + "grad_norm": 1.4308701748717412, + "learning_rate": 5.296977590002077e-06, + "loss": 0.668, + "step": 6522 + }, + { + "epoch": 0.6662921348314607, + "grad_norm": 1.4805881925189268, + "learning_rate": 5.294058225990292e-06, + "loss": 0.5968, + "step": 6523 + }, + { + "epoch": 0.6663942798774259, + "grad_norm": 1.3493272474950293, + "learning_rate": 5.291139377046874e-06, + "loss": 0.6837, + "step": 6524 + }, + { + "epoch": 0.6664964249233912, + "grad_norm": 1.563174685269643, + "learning_rate": 5.288221043491291e-06, + "loss": 0.7525, + "step": 6525 + }, + { + "epoch": 0.6665985699693565, + "grad_norm": 1.439215358013041, + "learning_rate": 5.285303225642962e-06, + "loss": 0.6924, + "step": 6526 + }, + { + "epoch": 0.6667007150153218, + "grad_norm": 1.5090365411301374, + "learning_rate": 5.282385923821242e-06, + "loss": 0.7549, + "step": 6527 + }, + { + "epoch": 0.666802860061287, + "grad_norm": 1.5130527851200468, + "learning_rate": 5.27946913834543e-06, + "loss": 0.7345, + "step": 6528 + }, + { + "epoch": 0.6669050051072523, + "grad_norm": 1.5686150717105887, + "learning_rate": 5.276552869534765e-06, + "loss": 0.7035, + "step": 6529 + }, + { + "epoch": 0.6670071501532175, + "grad_norm": 1.2694511967841604, + "learning_rate": 5.273637117708444e-06, + "loss": 0.6929, + "step": 6530 + }, + { + "epoch": 0.6671092951991828, + "grad_norm": 1.4093188713704887, + "learning_rate": 5.270721883185594e-06, + "loss": 0.7178, + "step": 6531 + }, + { + "epoch": 0.6672114402451481, + "grad_norm": 1.5029835971487457, + "learning_rate": 5.267807166285292e-06, + "loss": 0.7558, + "step": 6532 + }, + { + "epoch": 0.6673135852911134, + "grad_norm": 1.4367556022956347, + "learning_rate": 5.264892967326552e-06, + "loss": 0.6893, + "step": 6533 + }, + { + "epoch": 0.6674157303370787, + "grad_norm": 1.5494496929827504, + "learning_rate": 5.261979286628336e-06, + "loss": 0.7603, + "step": 6534 + }, + { + "epoch": 0.667517875383044, + "grad_norm": 1.5322589830504494, + "learning_rate": 5.259066124509545e-06, + "loss": 0.7354, + "step": 6535 + }, + { + "epoch": 0.6676200204290091, + "grad_norm": 1.496147167283038, + "learning_rate": 5.256153481289034e-06, + "loss": 0.6617, + "step": 6536 + }, + { + "epoch": 0.6677221654749744, + "grad_norm": 1.3786676288102002, + "learning_rate": 5.253241357285588e-06, + "loss": 0.7243, + "step": 6537 + }, + { + "epoch": 0.6678243105209397, + "grad_norm": 1.488728077548782, + "learning_rate": 5.25032975281794e-06, + "loss": 0.6611, + "step": 6538 + }, + { + "epoch": 0.667926455566905, + "grad_norm": 1.4599517840004244, + "learning_rate": 5.247418668204771e-06, + "loss": 0.6775, + "step": 6539 + }, + { + "epoch": 0.6680286006128703, + "grad_norm": 1.4195890697553577, + "learning_rate": 5.244508103764696e-06, + "loss": 0.6637, + "step": 6540 + }, + { + "epoch": 0.6681307456588356, + "grad_norm": 1.4687881668013583, + "learning_rate": 5.241598059816286e-06, + "loss": 0.6672, + "step": 6541 + }, + { + "epoch": 0.6682328907048009, + "grad_norm": 1.4630041184733678, + "learning_rate": 5.2386885366780425e-06, + "loss": 0.6072, + "step": 6542 + }, + { + "epoch": 0.6683350357507661, + "grad_norm": 1.5330979327950098, + "learning_rate": 5.2357795346684145e-06, + "loss": 0.7786, + "step": 6543 + }, + { + "epoch": 0.6684371807967313, + "grad_norm": 1.530092992825617, + "learning_rate": 5.2328710541057924e-06, + "loss": 0.7351, + "step": 6544 + }, + { + "epoch": 0.6685393258426966, + "grad_norm": 1.50290595964251, + "learning_rate": 5.229963095308516e-06, + "loss": 0.7557, + "step": 6545 + }, + { + "epoch": 0.6686414708886619, + "grad_norm": 1.613286057290504, + "learning_rate": 5.227055658594856e-06, + "loss": 0.7371, + "step": 6546 + }, + { + "epoch": 0.6687436159346272, + "grad_norm": 1.4414580684985665, + "learning_rate": 5.2241487442830414e-06, + "loss": 0.7132, + "step": 6547 + }, + { + "epoch": 0.6688457609805925, + "grad_norm": 1.434827594414842, + "learning_rate": 5.221242352691235e-06, + "loss": 0.6117, + "step": 6548 + }, + { + "epoch": 0.6689479060265577, + "grad_norm": 1.5834954720822438, + "learning_rate": 5.218336484137538e-06, + "loss": 0.5977, + "step": 6549 + }, + { + "epoch": 0.669050051072523, + "grad_norm": 1.416561166848323, + "learning_rate": 5.215431138939999e-06, + "loss": 0.6967, + "step": 6550 + }, + { + "epoch": 0.6691521961184883, + "grad_norm": 1.4934765919504858, + "learning_rate": 5.2125263174166175e-06, + "loss": 0.694, + "step": 6551 + }, + { + "epoch": 0.6692543411644535, + "grad_norm": 1.4895070249210873, + "learning_rate": 5.2096220198853235e-06, + "loss": 0.5782, + "step": 6552 + }, + { + "epoch": 0.6693564862104188, + "grad_norm": 1.4889361345992935, + "learning_rate": 5.206718246663995e-06, + "loss": 0.7017, + "step": 6553 + }, + { + "epoch": 0.669458631256384, + "grad_norm": 1.5467146273376797, + "learning_rate": 5.2038149980704465e-06, + "loss": 0.7144, + "step": 6554 + }, + { + "epoch": 0.6695607763023493, + "grad_norm": 1.4741384646996376, + "learning_rate": 5.200912274422445e-06, + "loss": 0.5475, + "step": 6555 + }, + { + "epoch": 0.6696629213483146, + "grad_norm": 1.3351884568966441, + "learning_rate": 5.1980100760377e-06, + "loss": 0.547, + "step": 6556 + }, + { + "epoch": 0.6697650663942799, + "grad_norm": 1.3939121797880538, + "learning_rate": 5.195108403233855e-06, + "loss": 0.7277, + "step": 6557 + }, + { + "epoch": 0.6698672114402452, + "grad_norm": 1.62938894276569, + "learning_rate": 5.192207256328499e-06, + "loss": 0.7715, + "step": 6558 + }, + { + "epoch": 0.6699693564862104, + "grad_norm": 1.4398936694352984, + "learning_rate": 5.189306635639161e-06, + "loss": 0.7604, + "step": 6559 + }, + { + "epoch": 0.6700715015321757, + "grad_norm": 1.6527054972011854, + "learning_rate": 5.1864065414833245e-06, + "loss": 0.7469, + "step": 6560 + }, + { + "epoch": 0.6701736465781409, + "grad_norm": 1.5103351826028084, + "learning_rate": 5.183506974178401e-06, + "loss": 0.7293, + "step": 6561 + }, + { + "epoch": 0.6702757916241062, + "grad_norm": 1.4535778761489895, + "learning_rate": 5.180607934041748e-06, + "loss": 0.6732, + "step": 6562 + }, + { + "epoch": 0.6703779366700715, + "grad_norm": 1.679899997469296, + "learning_rate": 5.177709421390673e-06, + "loss": 0.7865, + "step": 6563 + }, + { + "epoch": 0.6704800817160368, + "grad_norm": 1.6202467052195313, + "learning_rate": 5.174811436542415e-06, + "loss": 0.8204, + "step": 6564 + }, + { + "epoch": 0.6705822267620021, + "grad_norm": 1.5247507487009946, + "learning_rate": 5.171913979814167e-06, + "loss": 0.683, + "step": 6565 + }, + { + "epoch": 0.6706843718079674, + "grad_norm": 1.3933838195908217, + "learning_rate": 5.1690170515230506e-06, + "loss": 0.7376, + "step": 6566 + }, + { + "epoch": 0.6707865168539325, + "grad_norm": 1.509413770690142, + "learning_rate": 5.166120651986142e-06, + "loss": 0.6444, + "step": 6567 + }, + { + "epoch": 0.6708886618998978, + "grad_norm": 1.5152130706510722, + "learning_rate": 5.163224781520451e-06, + "loss": 0.6644, + "step": 6568 + }, + { + "epoch": 0.6709908069458631, + "grad_norm": 1.37562656334029, + "learning_rate": 5.160329440442926e-06, + "loss": 0.5414, + "step": 6569 + }, + { + "epoch": 0.6710929519918284, + "grad_norm": 1.4609110125963283, + "learning_rate": 5.157434629070472e-06, + "loss": 0.7248, + "step": 6570 + }, + { + "epoch": 0.6711950970377937, + "grad_norm": 1.4048633231392489, + "learning_rate": 5.154540347719932e-06, + "loss": 0.65, + "step": 6571 + }, + { + "epoch": 0.671297242083759, + "grad_norm": 1.3271921421702595, + "learning_rate": 5.15164659670808e-06, + "loss": 0.6715, + "step": 6572 + }, + { + "epoch": 0.6713993871297242, + "grad_norm": 1.45304237135027, + "learning_rate": 5.148753376351641e-06, + "loss": 0.6591, + "step": 6573 + }, + { + "epoch": 0.6715015321756895, + "grad_norm": 1.57070151202486, + "learning_rate": 5.145860686967274e-06, + "loss": 0.6569, + "step": 6574 + }, + { + "epoch": 0.6716036772216547, + "grad_norm": 1.3965757483142136, + "learning_rate": 5.142968528871597e-06, + "loss": 0.7426, + "step": 6575 + }, + { + "epoch": 0.67170582226762, + "grad_norm": 1.626117468109773, + "learning_rate": 5.14007690238115e-06, + "loss": 0.8063, + "step": 6576 + }, + { + "epoch": 0.6718079673135853, + "grad_norm": 1.4695694723975783, + "learning_rate": 5.137185807812428e-06, + "loss": 0.7343, + "step": 6577 + }, + { + "epoch": 0.6719101123595506, + "grad_norm": 1.3639947068452039, + "learning_rate": 5.134295245481857e-06, + "loss": 0.6953, + "step": 6578 + }, + { + "epoch": 0.6720122574055158, + "grad_norm": 1.4759568290668346, + "learning_rate": 5.1314052157058144e-06, + "loss": 0.6707, + "step": 6579 + }, + { + "epoch": 0.6721144024514811, + "grad_norm": 1.428728791074938, + "learning_rate": 5.128515718800622e-06, + "loss": 0.6727, + "step": 6580 + }, + { + "epoch": 0.6722165474974464, + "grad_norm": 1.3494917940459454, + "learning_rate": 5.125626755082529e-06, + "loss": 0.6655, + "step": 6581 + }, + { + "epoch": 0.6723186925434116, + "grad_norm": 1.465630613249248, + "learning_rate": 5.122738324867738e-06, + "loss": 0.6034, + "step": 6582 + }, + { + "epoch": 0.6724208375893769, + "grad_norm": 1.5454673622988147, + "learning_rate": 5.119850428472389e-06, + "loss": 0.7355, + "step": 6583 + }, + { + "epoch": 0.6725229826353422, + "grad_norm": 1.3711412106030618, + "learning_rate": 5.1169630662125595e-06, + "loss": 0.5643, + "step": 6584 + }, + { + "epoch": 0.6726251276813074, + "grad_norm": 1.5420577678172087, + "learning_rate": 5.11407623840428e-06, + "loss": 0.6618, + "step": 6585 + }, + { + "epoch": 0.6727272727272727, + "grad_norm": 1.510257622594131, + "learning_rate": 5.111189945363511e-06, + "loss": 0.6347, + "step": 6586 + }, + { + "epoch": 0.672829417773238, + "grad_norm": 1.3824948577907834, + "learning_rate": 5.108304187406163e-06, + "loss": 0.7013, + "step": 6587 + }, + { + "epoch": 0.6729315628192033, + "grad_norm": 1.4832340409581468, + "learning_rate": 5.105418964848083e-06, + "loss": 0.7476, + "step": 6588 + }, + { + "epoch": 0.6730337078651686, + "grad_norm": 1.5023182187315376, + "learning_rate": 5.102534278005056e-06, + "loss": 0.7392, + "step": 6589 + }, + { + "epoch": 0.6731358529111338, + "grad_norm": 1.3159532220618566, + "learning_rate": 5.09965012719282e-06, + "loss": 0.6357, + "step": 6590 + }, + { + "epoch": 0.673237997957099, + "grad_norm": 1.702468417858951, + "learning_rate": 5.096766512727043e-06, + "loss": 0.6953, + "step": 6591 + }, + { + "epoch": 0.6733401430030643, + "grad_norm": 1.5434887732215472, + "learning_rate": 5.09388343492334e-06, + "loss": 0.7115, + "step": 6592 + }, + { + "epoch": 0.6734422880490296, + "grad_norm": 1.4608055714610884, + "learning_rate": 5.091000894097261e-06, + "loss": 0.7106, + "step": 6593 + }, + { + "epoch": 0.6735444330949949, + "grad_norm": 1.5683751619969664, + "learning_rate": 5.088118890564305e-06, + "loss": 0.7356, + "step": 6594 + }, + { + "epoch": 0.6736465781409602, + "grad_norm": 1.5817571525844558, + "learning_rate": 5.085237424639915e-06, + "loss": 0.7321, + "step": 6595 + }, + { + "epoch": 0.6737487231869255, + "grad_norm": 1.51906903190527, + "learning_rate": 5.082356496639462e-06, + "loss": 0.6924, + "step": 6596 + }, + { + "epoch": 0.6738508682328908, + "grad_norm": 1.3989166392097978, + "learning_rate": 5.07947610687827e-06, + "loss": 0.6758, + "step": 6597 + }, + { + "epoch": 0.6739530132788559, + "grad_norm": 1.559384315354782, + "learning_rate": 5.0765962556715916e-06, + "loss": 0.7788, + "step": 6598 + }, + { + "epoch": 0.6740551583248212, + "grad_norm": 1.5233535305228334, + "learning_rate": 5.0737169433346385e-06, + "loss": 0.7204, + "step": 6599 + }, + { + "epoch": 0.6741573033707865, + "grad_norm": 1.4664687101172342, + "learning_rate": 5.070838170182549e-06, + "loss": 0.6751, + "step": 6600 + }, + { + "epoch": 0.6742594484167518, + "grad_norm": 1.5519747408427538, + "learning_rate": 5.067959936530407e-06, + "loss": 0.7508, + "step": 6601 + }, + { + "epoch": 0.6743615934627171, + "grad_norm": 1.5653169292582163, + "learning_rate": 5.065082242693231e-06, + "loss": 0.7521, + "step": 6602 + }, + { + "epoch": 0.6744637385086824, + "grad_norm": 1.455583437509877, + "learning_rate": 5.0622050889859966e-06, + "loss": 0.705, + "step": 6603 + }, + { + "epoch": 0.6745658835546476, + "grad_norm": 1.5595628588487156, + "learning_rate": 5.059328475723602e-06, + "loss": 0.7317, + "step": 6604 + }, + { + "epoch": 0.6746680286006129, + "grad_norm": 1.4946461941335332, + "learning_rate": 5.056452403220902e-06, + "loss": 0.8592, + "step": 6605 + }, + { + "epoch": 0.6747701736465781, + "grad_norm": 1.4257444736154308, + "learning_rate": 5.053576871792678e-06, + "loss": 0.7127, + "step": 6606 + }, + { + "epoch": 0.6748723186925434, + "grad_norm": 1.434121148425506, + "learning_rate": 5.050701881753664e-06, + "loss": 0.6463, + "step": 6607 + }, + { + "epoch": 0.6749744637385087, + "grad_norm": 1.4849004445757477, + "learning_rate": 5.047827433418522e-06, + "loss": 0.6639, + "step": 6608 + }, + { + "epoch": 0.675076608784474, + "grad_norm": 1.6475417733550277, + "learning_rate": 5.04495352710187e-06, + "loss": 0.6994, + "step": 6609 + }, + { + "epoch": 0.6751787538304392, + "grad_norm": 1.298612033368378, + "learning_rate": 5.042080163118252e-06, + "loss": 0.7191, + "step": 6610 + }, + { + "epoch": 0.6752808988764045, + "grad_norm": 1.3795673728978026, + "learning_rate": 5.039207341782167e-06, + "loss": 0.6393, + "step": 6611 + }, + { + "epoch": 0.6753830439223698, + "grad_norm": 1.4168220936444358, + "learning_rate": 5.036335063408044e-06, + "loss": 0.606, + "step": 6612 + }, + { + "epoch": 0.675485188968335, + "grad_norm": 1.5760137326955517, + "learning_rate": 5.033463328310253e-06, + "loss": 0.8183, + "step": 6613 + }, + { + "epoch": 0.6755873340143003, + "grad_norm": 1.4497956837022048, + "learning_rate": 5.030592136803112e-06, + "loss": 0.6718, + "step": 6614 + }, + { + "epoch": 0.6756894790602656, + "grad_norm": 1.4444671501142599, + "learning_rate": 5.027721489200872e-06, + "loss": 0.6991, + "step": 6615 + }, + { + "epoch": 0.6757916241062308, + "grad_norm": 1.392913015668251, + "learning_rate": 5.0248513858177266e-06, + "loss": 0.5669, + "step": 6616 + }, + { + "epoch": 0.6758937691521961, + "grad_norm": 1.5204553661403344, + "learning_rate": 5.021981826967809e-06, + "loss": 0.7106, + "step": 6617 + }, + { + "epoch": 0.6759959141981614, + "grad_norm": 1.466928484954999, + "learning_rate": 5.019112812965197e-06, + "loss": 0.7747, + "step": 6618 + }, + { + "epoch": 0.6760980592441267, + "grad_norm": 1.5203976582229837, + "learning_rate": 5.0162443441239085e-06, + "loss": 0.7392, + "step": 6619 + }, + { + "epoch": 0.676200204290092, + "grad_norm": 1.4580340103316611, + "learning_rate": 5.013376420757896e-06, + "loss": 0.6529, + "step": 6620 + }, + { + "epoch": 0.6763023493360572, + "grad_norm": 1.456013509016094, + "learning_rate": 5.010509043181058e-06, + "loss": 0.5973, + "step": 6621 + }, + { + "epoch": 0.6764044943820224, + "grad_norm": 1.5420054922467161, + "learning_rate": 5.007642211707227e-06, + "loss": 0.6645, + "step": 6622 + }, + { + "epoch": 0.6765066394279877, + "grad_norm": 1.5268921001182891, + "learning_rate": 5.004775926650178e-06, + "loss": 0.7764, + "step": 6623 + }, + { + "epoch": 0.676608784473953, + "grad_norm": 1.4839674977623316, + "learning_rate": 5.001910188323636e-06, + "loss": 0.7424, + "step": 6624 + }, + { + "epoch": 0.6767109295199183, + "grad_norm": 1.57778982725158, + "learning_rate": 4.999044997041252e-06, + "loss": 0.7599, + "step": 6625 + }, + { + "epoch": 0.6768130745658836, + "grad_norm": 1.5131135733668533, + "learning_rate": 4.996180353116623e-06, + "loss": 0.7231, + "step": 6626 + }, + { + "epoch": 0.6769152196118489, + "grad_norm": 1.5434461817148768, + "learning_rate": 4.993316256863286e-06, + "loss": 0.76, + "step": 6627 + }, + { + "epoch": 0.6770173646578141, + "grad_norm": 1.3926953451505821, + "learning_rate": 4.990452708594718e-06, + "loss": 0.7106, + "step": 6628 + }, + { + "epoch": 0.6771195097037793, + "grad_norm": 1.5281744489409315, + "learning_rate": 4.987589708624341e-06, + "loss": 0.7454, + "step": 6629 + }, + { + "epoch": 0.6772216547497446, + "grad_norm": 1.7266306780241274, + "learning_rate": 4.984727257265509e-06, + "loss": 0.7728, + "step": 6630 + }, + { + "epoch": 0.6773237997957099, + "grad_norm": 1.2644585881055908, + "learning_rate": 4.9818653548315195e-06, + "loss": 0.5821, + "step": 6631 + }, + { + "epoch": 0.6774259448416752, + "grad_norm": 1.40328094084618, + "learning_rate": 4.979004001635606e-06, + "loss": 0.7213, + "step": 6632 + }, + { + "epoch": 0.6775280898876405, + "grad_norm": 1.4476314247451192, + "learning_rate": 4.976143197990951e-06, + "loss": 0.8015, + "step": 6633 + }, + { + "epoch": 0.6776302349336057, + "grad_norm": 1.4252608120792556, + "learning_rate": 4.973282944210669e-06, + "loss": 0.7014, + "step": 6634 + }, + { + "epoch": 0.677732379979571, + "grad_norm": 1.4685408326294858, + "learning_rate": 4.970423240607813e-06, + "loss": 0.6852, + "step": 6635 + }, + { + "epoch": 0.6778345250255363, + "grad_norm": 1.5352684984438767, + "learning_rate": 4.967564087495387e-06, + "loss": 0.583, + "step": 6636 + }, + { + "epoch": 0.6779366700715015, + "grad_norm": 1.5257333448327395, + "learning_rate": 4.964705485186322e-06, + "loss": 0.744, + "step": 6637 + }, + { + "epoch": 0.6780388151174668, + "grad_norm": 1.6406488744826682, + "learning_rate": 4.961847433993492e-06, + "loss": 0.7701, + "step": 6638 + }, + { + "epoch": 0.6781409601634321, + "grad_norm": 1.3964741829908782, + "learning_rate": 4.958989934229719e-06, + "loss": 0.6519, + "step": 6639 + }, + { + "epoch": 0.6782431052093973, + "grad_norm": 1.4418400734239152, + "learning_rate": 4.956132986207754e-06, + "loss": 0.6461, + "step": 6640 + }, + { + "epoch": 0.6783452502553626, + "grad_norm": 1.4194208621196294, + "learning_rate": 4.953276590240293e-06, + "loss": 0.7303, + "step": 6641 + }, + { + "epoch": 0.6784473953013279, + "grad_norm": 1.4864260105518399, + "learning_rate": 4.950420746639965e-06, + "loss": 0.6842, + "step": 6642 + }, + { + "epoch": 0.6785495403472932, + "grad_norm": 1.4165569700189224, + "learning_rate": 4.94756545571935e-06, + "loss": 0.7414, + "step": 6643 + }, + { + "epoch": 0.6786516853932584, + "grad_norm": 1.6240892196511818, + "learning_rate": 4.944710717790964e-06, + "loss": 0.8087, + "step": 6644 + }, + { + "epoch": 0.6787538304392237, + "grad_norm": 1.228910125760652, + "learning_rate": 4.941856533167255e-06, + "loss": 0.6193, + "step": 6645 + }, + { + "epoch": 0.678855975485189, + "grad_norm": 1.6021855448018207, + "learning_rate": 4.939002902160617e-06, + "loss": 0.725, + "step": 6646 + }, + { + "epoch": 0.6789581205311542, + "grad_norm": 1.496872619388595, + "learning_rate": 4.9361498250833775e-06, + "loss": 0.8093, + "step": 6647 + }, + { + "epoch": 0.6790602655771195, + "grad_norm": 1.4258167654538545, + "learning_rate": 4.933297302247815e-06, + "loss": 0.6564, + "step": 6648 + }, + { + "epoch": 0.6791624106230848, + "grad_norm": 1.424649667091576, + "learning_rate": 4.930445333966136e-06, + "loss": 0.7106, + "step": 6649 + }, + { + "epoch": 0.6792645556690501, + "grad_norm": 1.5427968334313757, + "learning_rate": 4.927593920550491e-06, + "loss": 0.759, + "step": 6650 + }, + { + "epoch": 0.6793667007150154, + "grad_norm": 1.5581699739933221, + "learning_rate": 4.9247430623129645e-06, + "loss": 0.7082, + "step": 6651 + }, + { + "epoch": 0.6794688457609805, + "grad_norm": 1.4596845818509856, + "learning_rate": 4.921892759565589e-06, + "loss": 0.7917, + "step": 6652 + }, + { + "epoch": 0.6795709908069458, + "grad_norm": 1.6775701120037028, + "learning_rate": 4.919043012620336e-06, + "loss": 0.75, + "step": 6653 + }, + { + "epoch": 0.6796731358529111, + "grad_norm": 1.3735870159030041, + "learning_rate": 4.916193821789107e-06, + "loss": 0.6587, + "step": 6654 + }, + { + "epoch": 0.6797752808988764, + "grad_norm": 1.549705494592493, + "learning_rate": 4.91334518738375e-06, + "loss": 0.6841, + "step": 6655 + }, + { + "epoch": 0.6798774259448417, + "grad_norm": 1.4305981604508688, + "learning_rate": 4.910497109716048e-06, + "loss": 0.7037, + "step": 6656 + }, + { + "epoch": 0.679979570990807, + "grad_norm": 1.6312959348527254, + "learning_rate": 4.907649589097722e-06, + "loss": 0.6718, + "step": 6657 + }, + { + "epoch": 0.6800817160367723, + "grad_norm": 1.5311068184205798, + "learning_rate": 4.904802625840441e-06, + "loss": 0.6918, + "step": 6658 + }, + { + "epoch": 0.6801838610827375, + "grad_norm": 1.3627896636893202, + "learning_rate": 4.901956220255802e-06, + "loss": 0.7811, + "step": 6659 + }, + { + "epoch": 0.6802860061287027, + "grad_norm": 1.4420947546147513, + "learning_rate": 4.899110372655352e-06, + "loss": 0.6849, + "step": 6660 + }, + { + "epoch": 0.680388151174668, + "grad_norm": 1.3150617819247594, + "learning_rate": 4.896265083350568e-06, + "loss": 0.662, + "step": 6661 + }, + { + "epoch": 0.6804902962206333, + "grad_norm": 1.462922858278926, + "learning_rate": 4.893420352652863e-06, + "loss": 0.6594, + "step": 6662 + }, + { + "epoch": 0.6805924412665986, + "grad_norm": 1.460430423691327, + "learning_rate": 4.890576180873605e-06, + "loss": 0.7559, + "step": 6663 + }, + { + "epoch": 0.6806945863125639, + "grad_norm": 1.59406105858736, + "learning_rate": 4.887732568324084e-06, + "loss": 0.6935, + "step": 6664 + }, + { + "epoch": 0.6807967313585291, + "grad_norm": 1.5703209928963187, + "learning_rate": 4.884889515315537e-06, + "loss": 0.7929, + "step": 6665 + }, + { + "epoch": 0.6808988764044944, + "grad_norm": 1.645252523846712, + "learning_rate": 4.8820470221591335e-06, + "loss": 0.7196, + "step": 6666 + }, + { + "epoch": 0.6810010214504596, + "grad_norm": 1.5532248594509683, + "learning_rate": 4.87920508916599e-06, + "loss": 0.731, + "step": 6667 + }, + { + "epoch": 0.6811031664964249, + "grad_norm": 1.4603258316768737, + "learning_rate": 4.876363716647161e-06, + "loss": 0.6922, + "step": 6668 + }, + { + "epoch": 0.6812053115423902, + "grad_norm": 1.5878906931259673, + "learning_rate": 4.8735229049136335e-06, + "loss": 0.6597, + "step": 6669 + }, + { + "epoch": 0.6813074565883555, + "grad_norm": 1.620717516313211, + "learning_rate": 4.870682654276337e-06, + "loss": 0.7801, + "step": 6670 + }, + { + "epoch": 0.6814096016343207, + "grad_norm": 1.570211284718985, + "learning_rate": 4.867842965046134e-06, + "loss": 0.7025, + "step": 6671 + }, + { + "epoch": 0.681511746680286, + "grad_norm": 1.408406329301186, + "learning_rate": 4.865003837533837e-06, + "loss": 0.6692, + "step": 6672 + }, + { + "epoch": 0.6816138917262513, + "grad_norm": 1.5196156166042094, + "learning_rate": 4.8621652720501884e-06, + "loss": 0.6758, + "step": 6673 + }, + { + "epoch": 0.6817160367722166, + "grad_norm": 1.439353720681077, + "learning_rate": 4.859327268905871e-06, + "loss": 0.7435, + "step": 6674 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 1.4984924473539427, + "learning_rate": 4.8564898284115e-06, + "loss": 0.6777, + "step": 6675 + }, + { + "epoch": 0.681920326864147, + "grad_norm": 1.4387357897987425, + "learning_rate": 4.853652950877645e-06, + "loss": 0.7449, + "step": 6676 + }, + { + "epoch": 0.6820224719101123, + "grad_norm": 1.4636392795403104, + "learning_rate": 4.850816636614797e-06, + "loss": 0.6896, + "step": 6677 + }, + { + "epoch": 0.6821246169560776, + "grad_norm": 1.2668154360523531, + "learning_rate": 4.8479808859333964e-06, + "loss": 0.6283, + "step": 6678 + }, + { + "epoch": 0.6822267620020429, + "grad_norm": 1.470623676503276, + "learning_rate": 4.845145699143818e-06, + "loss": 0.6737, + "step": 6679 + }, + { + "epoch": 0.6823289070480082, + "grad_norm": 1.5385561183222132, + "learning_rate": 4.842311076556373e-06, + "loss": 0.7029, + "step": 6680 + }, + { + "epoch": 0.6824310520939735, + "grad_norm": 1.5013611336741748, + "learning_rate": 4.839477018481309e-06, + "loss": 0.7339, + "step": 6681 + }, + { + "epoch": 0.6825331971399388, + "grad_norm": 1.3831685164489391, + "learning_rate": 4.836643525228822e-06, + "loss": 0.6541, + "step": 6682 + }, + { + "epoch": 0.6826353421859039, + "grad_norm": 1.8112592003340724, + "learning_rate": 4.833810597109036e-06, + "loss": 0.7911, + "step": 6683 + }, + { + "epoch": 0.6827374872318692, + "grad_norm": 1.5188390671802119, + "learning_rate": 4.83097823443202e-06, + "loss": 0.6318, + "step": 6684 + }, + { + "epoch": 0.6828396322778345, + "grad_norm": 1.47318579445148, + "learning_rate": 4.828146437507776e-06, + "loss": 0.7134, + "step": 6685 + }, + { + "epoch": 0.6829417773237998, + "grad_norm": 1.406808962296762, + "learning_rate": 4.825315206646242e-06, + "loss": 0.6433, + "step": 6686 + }, + { + "epoch": 0.6830439223697651, + "grad_norm": 1.498110994291386, + "learning_rate": 4.822484542157306e-06, + "loss": 0.6846, + "step": 6687 + }, + { + "epoch": 0.6831460674157304, + "grad_norm": 1.4645350879203618, + "learning_rate": 4.819654444350782e-06, + "loss": 0.6449, + "step": 6688 + }, + { + "epoch": 0.6832482124616956, + "grad_norm": 1.5929618040681501, + "learning_rate": 4.816824913536425e-06, + "loss": 0.6954, + "step": 6689 + }, + { + "epoch": 0.6833503575076609, + "grad_norm": 1.5178842337698355, + "learning_rate": 4.813995950023932e-06, + "loss": 0.711, + "step": 6690 + }, + { + "epoch": 0.6834525025536261, + "grad_norm": 1.5036647118132327, + "learning_rate": 4.811167554122928e-06, + "loss": 0.5943, + "step": 6691 + }, + { + "epoch": 0.6835546475995914, + "grad_norm": 1.4356735651208241, + "learning_rate": 4.8083397261429865e-06, + "loss": 0.743, + "step": 6692 + }, + { + "epoch": 0.6836567926455567, + "grad_norm": 1.6297738158493271, + "learning_rate": 4.805512466393621e-06, + "loss": 0.7049, + "step": 6693 + }, + { + "epoch": 0.683758937691522, + "grad_norm": 1.4861232710480963, + "learning_rate": 4.802685775184272e-06, + "loss": 0.6331, + "step": 6694 + }, + { + "epoch": 0.6838610827374872, + "grad_norm": 1.6436316159506252, + "learning_rate": 4.799859652824323e-06, + "loss": 0.8186, + "step": 6695 + }, + { + "epoch": 0.6839632277834525, + "grad_norm": 1.4400975364979185, + "learning_rate": 4.797034099623091e-06, + "loss": 0.6597, + "step": 6696 + }, + { + "epoch": 0.6840653728294178, + "grad_norm": 1.357322197603183, + "learning_rate": 4.794209115889843e-06, + "loss": 0.7439, + "step": 6697 + }, + { + "epoch": 0.684167517875383, + "grad_norm": 1.370924954165412, + "learning_rate": 4.791384701933769e-06, + "loss": 0.6024, + "step": 6698 + }, + { + "epoch": 0.6842696629213483, + "grad_norm": 1.505675658507532, + "learning_rate": 4.788560858064002e-06, + "loss": 0.6839, + "step": 6699 + }, + { + "epoch": 0.6843718079673136, + "grad_norm": 1.5510826159942197, + "learning_rate": 4.785737584589619e-06, + "loss": 0.682, + "step": 6700 + }, + { + "epoch": 0.6844739530132788, + "grad_norm": 1.6365368398842248, + "learning_rate": 4.782914881819623e-06, + "loss": 0.7179, + "step": 6701 + }, + { + "epoch": 0.6845760980592441, + "grad_norm": 1.5877252611737447, + "learning_rate": 4.780092750062967e-06, + "loss": 0.699, + "step": 6702 + }, + { + "epoch": 0.6846782431052094, + "grad_norm": 1.584563810122719, + "learning_rate": 4.777271189628533e-06, + "loss": 0.7084, + "step": 6703 + }, + { + "epoch": 0.6847803881511747, + "grad_norm": 1.503604752418825, + "learning_rate": 4.77445020082514e-06, + "loss": 0.7073, + "step": 6704 + }, + { + "epoch": 0.68488253319714, + "grad_norm": 1.4559730071882393, + "learning_rate": 4.771629783961547e-06, + "loss": 0.6774, + "step": 6705 + }, + { + "epoch": 0.6849846782431052, + "grad_norm": 1.4530798548958979, + "learning_rate": 4.768809939346454e-06, + "loss": 0.7003, + "step": 6706 + }, + { + "epoch": 0.6850868232890704, + "grad_norm": 1.496299529952143, + "learning_rate": 4.765990667288489e-06, + "loss": 0.6538, + "step": 6707 + }, + { + "epoch": 0.6851889683350357, + "grad_norm": 1.527692864965651, + "learning_rate": 4.763171968096233e-06, + "loss": 0.738, + "step": 6708 + }, + { + "epoch": 0.685291113381001, + "grad_norm": 1.3640835582031088, + "learning_rate": 4.760353842078187e-06, + "loss": 0.6269, + "step": 6709 + }, + { + "epoch": 0.6853932584269663, + "grad_norm": 1.6351615653899563, + "learning_rate": 4.757536289542798e-06, + "loss": 0.6922, + "step": 6710 + }, + { + "epoch": 0.6854954034729316, + "grad_norm": 1.6325769371838277, + "learning_rate": 4.754719310798446e-06, + "loss": 0.7335, + "step": 6711 + }, + { + "epoch": 0.6855975485188969, + "grad_norm": 1.547190540641468, + "learning_rate": 4.751902906153458e-06, + "loss": 0.657, + "step": 6712 + }, + { + "epoch": 0.6856996935648622, + "grad_norm": 1.4775470590933055, + "learning_rate": 4.749087075916088e-06, + "loss": 0.6832, + "step": 6713 + }, + { + "epoch": 0.6858018386108273, + "grad_norm": 1.3931627128624873, + "learning_rate": 4.746271820394529e-06, + "loss": 0.6831, + "step": 6714 + }, + { + "epoch": 0.6859039836567926, + "grad_norm": 1.9562594884170799, + "learning_rate": 4.7434571398969105e-06, + "loss": 0.7254, + "step": 6715 + }, + { + "epoch": 0.6860061287027579, + "grad_norm": 1.3596156401056345, + "learning_rate": 4.7406430347313045e-06, + "loss": 0.5291, + "step": 6716 + }, + { + "epoch": 0.6861082737487232, + "grad_norm": 1.460419798711219, + "learning_rate": 4.73782950520572e-06, + "loss": 0.7014, + "step": 6717 + }, + { + "epoch": 0.6862104187946885, + "grad_norm": 1.5593519326529974, + "learning_rate": 4.735016551628095e-06, + "loss": 0.8455, + "step": 6718 + }, + { + "epoch": 0.6863125638406538, + "grad_norm": 1.5513534553894874, + "learning_rate": 4.732204174306312e-06, + "loss": 0.7057, + "step": 6719 + }, + { + "epoch": 0.686414708886619, + "grad_norm": 1.396672613190142, + "learning_rate": 4.7293923735481796e-06, + "loss": 0.6862, + "step": 6720 + }, + { + "epoch": 0.6865168539325842, + "grad_norm": 1.5772073339324972, + "learning_rate": 4.726581149661462e-06, + "loss": 0.7344, + "step": 6721 + }, + { + "epoch": 0.6866189989785495, + "grad_norm": 1.7539862686874437, + "learning_rate": 4.723770502953845e-06, + "loss": 0.7465, + "step": 6722 + }, + { + "epoch": 0.6867211440245148, + "grad_norm": 1.2399727676892098, + "learning_rate": 4.7209604337329505e-06, + "loss": 0.5424, + "step": 6723 + }, + { + "epoch": 0.6868232890704801, + "grad_norm": 1.5313293719636298, + "learning_rate": 4.7181509423063525e-06, + "loss": 0.7396, + "step": 6724 + }, + { + "epoch": 0.6869254341164454, + "grad_norm": 1.433112636658535, + "learning_rate": 4.715342028981541e-06, + "loss": 0.6887, + "step": 6725 + }, + { + "epoch": 0.6870275791624106, + "grad_norm": 1.440273819299479, + "learning_rate": 4.712533694065964e-06, + "loss": 0.7249, + "step": 6726 + }, + { + "epoch": 0.6871297242083759, + "grad_norm": 1.4711643861067927, + "learning_rate": 4.709725937866989e-06, + "loss": 0.7243, + "step": 6727 + }, + { + "epoch": 0.6872318692543412, + "grad_norm": 1.4802606396596845, + "learning_rate": 4.7069187606919286e-06, + "loss": 0.7458, + "step": 6728 + }, + { + "epoch": 0.6873340143003064, + "grad_norm": 1.2461919690492955, + "learning_rate": 4.704112162848029e-06, + "loss": 0.6586, + "step": 6729 + }, + { + "epoch": 0.6874361593462717, + "grad_norm": 1.4940429570595293, + "learning_rate": 4.701306144642472e-06, + "loss": 0.6583, + "step": 6730 + }, + { + "epoch": 0.687538304392237, + "grad_norm": 1.4740443164617025, + "learning_rate": 4.698500706382381e-06, + "loss": 0.7566, + "step": 6731 + }, + { + "epoch": 0.6876404494382022, + "grad_norm": 1.6399334052662342, + "learning_rate": 4.695695848374816e-06, + "loss": 0.7727, + "step": 6732 + }, + { + "epoch": 0.6877425944841675, + "grad_norm": 1.3842060966011773, + "learning_rate": 4.692891570926768e-06, + "loss": 0.6256, + "step": 6733 + }, + { + "epoch": 0.6878447395301328, + "grad_norm": 1.53419332995097, + "learning_rate": 4.690087874345165e-06, + "loss": 0.6802, + "step": 6734 + }, + { + "epoch": 0.6879468845760981, + "grad_norm": 1.475943024993585, + "learning_rate": 4.687284758936872e-06, + "loss": 0.6802, + "step": 6735 + }, + { + "epoch": 0.6880490296220634, + "grad_norm": 1.4953806841439174, + "learning_rate": 4.684482225008698e-06, + "loss": 0.67, + "step": 6736 + }, + { + "epoch": 0.6881511746680286, + "grad_norm": 1.505365778003549, + "learning_rate": 4.6816802728673795e-06, + "loss": 0.7716, + "step": 6737 + }, + { + "epoch": 0.6882533197139938, + "grad_norm": 1.5428755220029613, + "learning_rate": 4.67887890281959e-06, + "loss": 0.6938, + "step": 6738 + }, + { + "epoch": 0.6883554647599591, + "grad_norm": 1.329703349164968, + "learning_rate": 4.67607811517194e-06, + "loss": 0.7307, + "step": 6739 + }, + { + "epoch": 0.6884576098059244, + "grad_norm": 1.5220102618993503, + "learning_rate": 4.67327791023098e-06, + "loss": 0.8349, + "step": 6740 + }, + { + "epoch": 0.6885597548518897, + "grad_norm": 1.4787125015148104, + "learning_rate": 4.670478288303198e-06, + "loss": 0.6891, + "step": 6741 + }, + { + "epoch": 0.688661899897855, + "grad_norm": 1.6445839132738922, + "learning_rate": 4.66767924969501e-06, + "loss": 0.6832, + "step": 6742 + }, + { + "epoch": 0.6887640449438203, + "grad_norm": 1.5876279047181747, + "learning_rate": 4.664880794712773e-06, + "loss": 0.7316, + "step": 6743 + }, + { + "epoch": 0.6888661899897855, + "grad_norm": 1.3801947009980542, + "learning_rate": 4.662082923662779e-06, + "loss": 0.729, + "step": 6744 + }, + { + "epoch": 0.6889683350357507, + "grad_norm": 1.585264591113842, + "learning_rate": 4.659285636851256e-06, + "loss": 0.7225, + "step": 6745 + }, + { + "epoch": 0.689070480081716, + "grad_norm": 1.535521748135794, + "learning_rate": 4.656488934584373e-06, + "loss": 0.713, + "step": 6746 + }, + { + "epoch": 0.6891726251276813, + "grad_norm": 1.3766737283308377, + "learning_rate": 4.653692817168226e-06, + "loss": 0.6324, + "step": 6747 + }, + { + "epoch": 0.6892747701736466, + "grad_norm": 1.574540104804747, + "learning_rate": 4.6508972849088576e-06, + "loss": 0.7868, + "step": 6748 + }, + { + "epoch": 0.6893769152196119, + "grad_norm": 1.6123208145547114, + "learning_rate": 4.648102338112237e-06, + "loss": 0.643, + "step": 6749 + }, + { + "epoch": 0.6894790602655771, + "grad_norm": 1.4709619458645418, + "learning_rate": 4.6453079770842695e-06, + "loss": 0.7227, + "step": 6750 + }, + { + "epoch": 0.6895812053115424, + "grad_norm": 1.4740964904163405, + "learning_rate": 4.642514202130808e-06, + "loss": 0.6539, + "step": 6751 + }, + { + "epoch": 0.6896833503575076, + "grad_norm": 1.4854651757543027, + "learning_rate": 4.639721013557628e-06, + "loss": 0.7845, + "step": 6752 + }, + { + "epoch": 0.6897854954034729, + "grad_norm": 1.470395882509057, + "learning_rate": 4.636928411670445e-06, + "loss": 0.7604, + "step": 6753 + }, + { + "epoch": 0.6898876404494382, + "grad_norm": 1.3619495547676008, + "learning_rate": 4.6341363967749095e-06, + "loss": 0.7, + "step": 6754 + }, + { + "epoch": 0.6899897854954035, + "grad_norm": 1.5371267942225482, + "learning_rate": 4.631344969176613e-06, + "loss": 0.7035, + "step": 6755 + }, + { + "epoch": 0.6900919305413687, + "grad_norm": 1.5499009757428495, + "learning_rate": 4.628554129181081e-06, + "loss": 0.6739, + "step": 6756 + }, + { + "epoch": 0.690194075587334, + "grad_norm": 1.551004136175099, + "learning_rate": 4.625763877093771e-06, + "loss": 0.7334, + "step": 6757 + }, + { + "epoch": 0.6902962206332993, + "grad_norm": 1.5304277111028575, + "learning_rate": 4.622974213220075e-06, + "loss": 0.8103, + "step": 6758 + }, + { + "epoch": 0.6903983656792646, + "grad_norm": 1.4824124887067411, + "learning_rate": 4.620185137865323e-06, + "loss": 0.7334, + "step": 6759 + }, + { + "epoch": 0.6905005107252298, + "grad_norm": 1.4368503232086713, + "learning_rate": 4.617396651334787e-06, + "loss": 0.7893, + "step": 6760 + }, + { + "epoch": 0.6906026557711951, + "grad_norm": 1.5745773273697659, + "learning_rate": 4.614608753933665e-06, + "loss": 0.6698, + "step": 6761 + }, + { + "epoch": 0.6907048008171603, + "grad_norm": 1.5336579134333808, + "learning_rate": 4.611821445967094e-06, + "loss": 0.6801, + "step": 6762 + }, + { + "epoch": 0.6908069458631256, + "grad_norm": 1.5908909391837565, + "learning_rate": 4.609034727740144e-06, + "loss": 0.6461, + "step": 6763 + }, + { + "epoch": 0.6909090909090909, + "grad_norm": 1.5035990021692525, + "learning_rate": 4.606248599557829e-06, + "loss": 0.8033, + "step": 6764 + }, + { + "epoch": 0.6910112359550562, + "grad_norm": 1.4000414953588973, + "learning_rate": 4.603463061725086e-06, + "loss": 0.6628, + "step": 6765 + }, + { + "epoch": 0.6911133810010215, + "grad_norm": 1.5166798619516466, + "learning_rate": 4.600678114546802e-06, + "loss": 0.6355, + "step": 6766 + }, + { + "epoch": 0.6912155260469868, + "grad_norm": 1.3870179082346747, + "learning_rate": 4.597893758327786e-06, + "loss": 0.7154, + "step": 6767 + }, + { + "epoch": 0.691317671092952, + "grad_norm": 1.483211684229176, + "learning_rate": 4.595109993372788e-06, + "loss": 0.5908, + "step": 6768 + }, + { + "epoch": 0.6914198161389172, + "grad_norm": 1.554261259595864, + "learning_rate": 4.59232681998649e-06, + "loss": 0.6731, + "step": 6769 + }, + { + "epoch": 0.6915219611848825, + "grad_norm": 1.6506411859584873, + "learning_rate": 4.58954423847352e-06, + "loss": 0.5771, + "step": 6770 + }, + { + "epoch": 0.6916241062308478, + "grad_norm": 1.5024146222286596, + "learning_rate": 4.5867622491384234e-06, + "loss": 0.6638, + "step": 6771 + }, + { + "epoch": 0.6917262512768131, + "grad_norm": 1.519873280517956, + "learning_rate": 4.583980852285701e-06, + "loss": 0.695, + "step": 6772 + }, + { + "epoch": 0.6918283963227784, + "grad_norm": 1.543278638382172, + "learning_rate": 4.5812000482197725e-06, + "loss": 0.7164, + "step": 6773 + }, + { + "epoch": 0.6919305413687437, + "grad_norm": 1.7728598853237947, + "learning_rate": 4.5784198372449974e-06, + "loss": 0.7695, + "step": 6774 + }, + { + "epoch": 0.6920326864147089, + "grad_norm": 1.5863337853201025, + "learning_rate": 4.575640219665676e-06, + "loss": 0.7183, + "step": 6775 + }, + { + "epoch": 0.6921348314606741, + "grad_norm": 1.369405799073389, + "learning_rate": 4.572861195786038e-06, + "loss": 0.682, + "step": 6776 + }, + { + "epoch": 0.6922369765066394, + "grad_norm": 1.5079931583850443, + "learning_rate": 4.5700827659102484e-06, + "loss": 0.6408, + "step": 6777 + }, + { + "epoch": 0.6923391215526047, + "grad_norm": 1.548565726644308, + "learning_rate": 4.567304930342407e-06, + "loss": 0.7144, + "step": 6778 + }, + { + "epoch": 0.69244126659857, + "grad_norm": 1.5469566630761717, + "learning_rate": 4.564527689386545e-06, + "loss": 0.7029, + "step": 6779 + }, + { + "epoch": 0.6925434116445353, + "grad_norm": 1.4339483761466318, + "learning_rate": 4.561751043346647e-06, + "loss": 0.7767, + "step": 6780 + }, + { + "epoch": 0.6926455566905005, + "grad_norm": 1.457798586439289, + "learning_rate": 4.55897499252661e-06, + "loss": 0.6564, + "step": 6781 + }, + { + "epoch": 0.6927477017364658, + "grad_norm": 1.2877988227655481, + "learning_rate": 4.556199537230276e-06, + "loss": 0.6737, + "step": 6782 + }, + { + "epoch": 0.692849846782431, + "grad_norm": 1.332956315392737, + "learning_rate": 4.553424677761419e-06, + "loss": 0.6441, + "step": 6783 + }, + { + "epoch": 0.6929519918283963, + "grad_norm": 1.5229127198105892, + "learning_rate": 4.550650414423747e-06, + "loss": 0.7253, + "step": 6784 + }, + { + "epoch": 0.6930541368743616, + "grad_norm": 1.3709148348665277, + "learning_rate": 4.547876747520911e-06, + "loss": 0.6405, + "step": 6785 + }, + { + "epoch": 0.6931562819203269, + "grad_norm": 1.5462928232319892, + "learning_rate": 4.545103677356489e-06, + "loss": 0.7494, + "step": 6786 + }, + { + "epoch": 0.6932584269662921, + "grad_norm": 1.389197771720662, + "learning_rate": 4.5423312042339876e-06, + "loss": 0.6425, + "step": 6787 + }, + { + "epoch": 0.6933605720122574, + "grad_norm": 1.4993160423413212, + "learning_rate": 4.539559328456867e-06, + "loss": 0.6509, + "step": 6788 + }, + { + "epoch": 0.6934627170582227, + "grad_norm": 1.5994802172836868, + "learning_rate": 4.536788050328502e-06, + "loss": 0.6251, + "step": 6789 + }, + { + "epoch": 0.693564862104188, + "grad_norm": 1.6486208739643695, + "learning_rate": 4.534017370152218e-06, + "loss": 0.7125, + "step": 6790 + }, + { + "epoch": 0.6936670071501532, + "grad_norm": 1.4943118369009607, + "learning_rate": 4.531247288231265e-06, + "loss": 0.675, + "step": 6791 + }, + { + "epoch": 0.6937691521961185, + "grad_norm": 1.548715625290685, + "learning_rate": 4.528477804868829e-06, + "loss": 0.667, + "step": 6792 + }, + { + "epoch": 0.6938712972420837, + "grad_norm": 1.4068601735992454, + "learning_rate": 4.525708920368029e-06, + "loss": 0.6167, + "step": 6793 + }, + { + "epoch": 0.693973442288049, + "grad_norm": 1.576602950420474, + "learning_rate": 4.5229406350319285e-06, + "loss": 0.7461, + "step": 6794 + }, + { + "epoch": 0.6940755873340143, + "grad_norm": 1.4440606292743543, + "learning_rate": 4.520172949163509e-06, + "loss": 0.6371, + "step": 6795 + }, + { + "epoch": 0.6941777323799796, + "grad_norm": 1.6045921964400394, + "learning_rate": 4.517405863065706e-06, + "loss": 0.7217, + "step": 6796 + }, + { + "epoch": 0.6942798774259449, + "grad_norm": 1.5416770513302898, + "learning_rate": 4.5146393770413724e-06, + "loss": 0.6997, + "step": 6797 + }, + { + "epoch": 0.6943820224719102, + "grad_norm": 1.3208797193866764, + "learning_rate": 4.511873491393304e-06, + "loss": 0.6503, + "step": 6798 + }, + { + "epoch": 0.6944841675178753, + "grad_norm": 1.5476251422551661, + "learning_rate": 4.509108206424223e-06, + "loss": 0.7272, + "step": 6799 + }, + { + "epoch": 0.6945863125638406, + "grad_norm": 1.4396853103810245, + "learning_rate": 4.5063435224368e-06, + "loss": 0.5444, + "step": 6800 + }, + { + "epoch": 0.6946884576098059, + "grad_norm": 1.4631912127787892, + "learning_rate": 4.503579439733629e-06, + "loss": 0.6514, + "step": 6801 + }, + { + "epoch": 0.6947906026557712, + "grad_norm": 1.58518884301646, + "learning_rate": 4.500815958617239e-06, + "loss": 0.7738, + "step": 6802 + }, + { + "epoch": 0.6948927477017365, + "grad_norm": 1.3724546226795158, + "learning_rate": 4.498053079390091e-06, + "loss": 0.6166, + "step": 6803 + }, + { + "epoch": 0.6949948927477018, + "grad_norm": 1.5630562975742832, + "learning_rate": 4.495290802354589e-06, + "loss": 0.6906, + "step": 6804 + }, + { + "epoch": 0.695097037793667, + "grad_norm": 1.5008174847983093, + "learning_rate": 4.492529127813068e-06, + "loss": 0.6745, + "step": 6805 + }, + { + "epoch": 0.6951991828396322, + "grad_norm": 1.4605168782645175, + "learning_rate": 4.489768056067792e-06, + "loss": 0.6766, + "step": 6806 + }, + { + "epoch": 0.6953013278855975, + "grad_norm": 1.50560593680736, + "learning_rate": 4.487007587420963e-06, + "loss": 0.7417, + "step": 6807 + }, + { + "epoch": 0.6954034729315628, + "grad_norm": 1.5373706522338397, + "learning_rate": 4.484247722174712e-06, + "loss": 0.7043, + "step": 6808 + }, + { + "epoch": 0.6955056179775281, + "grad_norm": 1.531580383076186, + "learning_rate": 4.481488460631115e-06, + "loss": 0.7957, + "step": 6809 + }, + { + "epoch": 0.6956077630234934, + "grad_norm": 1.3779029730932686, + "learning_rate": 4.478729803092171e-06, + "loss": 0.6066, + "step": 6810 + }, + { + "epoch": 0.6957099080694586, + "grad_norm": 2.4343603660073705, + "learning_rate": 4.475971749859813e-06, + "loss": 0.7859, + "step": 6811 + }, + { + "epoch": 0.6958120531154239, + "grad_norm": 1.4829636500444363, + "learning_rate": 4.473214301235921e-06, + "loss": 0.6414, + "step": 6812 + }, + { + "epoch": 0.6959141981613892, + "grad_norm": 1.5804713553243892, + "learning_rate": 4.470457457522291e-06, + "loss": 0.6887, + "step": 6813 + }, + { + "epoch": 0.6960163432073544, + "grad_norm": 1.3389988806746864, + "learning_rate": 4.467701219020667e-06, + "loss": 0.6037, + "step": 6814 + }, + { + "epoch": 0.6961184882533197, + "grad_norm": 1.6110851358640677, + "learning_rate": 4.464945586032719e-06, + "loss": 0.7449, + "step": 6815 + }, + { + "epoch": 0.696220633299285, + "grad_norm": 1.4116805551395035, + "learning_rate": 4.462190558860054e-06, + "loss": 0.6191, + "step": 6816 + }, + { + "epoch": 0.6963227783452502, + "grad_norm": 1.5306103476090789, + "learning_rate": 4.45943613780421e-06, + "loss": 0.7304, + "step": 6817 + }, + { + "epoch": 0.6964249233912155, + "grad_norm": 1.4304422432659412, + "learning_rate": 4.456682323166656e-06, + "loss": 0.6608, + "step": 6818 + }, + { + "epoch": 0.6965270684371808, + "grad_norm": 1.387485509871996, + "learning_rate": 4.453929115248803e-06, + "loss": 0.6742, + "step": 6819 + }, + { + "epoch": 0.6966292134831461, + "grad_norm": 1.4655215858268449, + "learning_rate": 4.4511765143519955e-06, + "loss": 0.6303, + "step": 6820 + }, + { + "epoch": 0.6967313585291114, + "grad_norm": 1.3684655845039935, + "learning_rate": 4.448424520777503e-06, + "loss": 0.629, + "step": 6821 + }, + { + "epoch": 0.6968335035750766, + "grad_norm": 1.50780324349912, + "learning_rate": 4.4456731348265345e-06, + "loss": 0.8013, + "step": 6822 + }, + { + "epoch": 0.6969356486210418, + "grad_norm": 1.6324324350891788, + "learning_rate": 4.442922356800226e-06, + "loss": 0.7644, + "step": 6823 + }, + { + "epoch": 0.6970377936670071, + "grad_norm": 1.4603877755238088, + "learning_rate": 4.44017218699966e-06, + "loss": 0.7205, + "step": 6824 + }, + { + "epoch": 0.6971399387129724, + "grad_norm": 1.5417780828124856, + "learning_rate": 4.437422625725839e-06, + "loss": 0.8077, + "step": 6825 + }, + { + "epoch": 0.6972420837589377, + "grad_norm": 1.3933072464651688, + "learning_rate": 4.434673673279707e-06, + "loss": 0.6998, + "step": 6826 + }, + { + "epoch": 0.697344228804903, + "grad_norm": 1.508865392691754, + "learning_rate": 4.431925329962133e-06, + "loss": 0.7974, + "step": 6827 + }, + { + "epoch": 0.6974463738508683, + "grad_norm": 1.5528196401118837, + "learning_rate": 4.429177596073928e-06, + "loss": 0.6934, + "step": 6828 + }, + { + "epoch": 0.6975485188968336, + "grad_norm": 1.4996057078826062, + "learning_rate": 4.426430471915839e-06, + "loss": 0.7067, + "step": 6829 + }, + { + "epoch": 0.6976506639427987, + "grad_norm": 1.4715660573342602, + "learning_rate": 4.423683957788535e-06, + "loss": 0.7131, + "step": 6830 + }, + { + "epoch": 0.697752808988764, + "grad_norm": 1.4553917884317094, + "learning_rate": 4.420938053992625e-06, + "loss": 0.605, + "step": 6831 + }, + { + "epoch": 0.6978549540347293, + "grad_norm": 1.5675002383686016, + "learning_rate": 4.41819276082865e-06, + "loss": 0.7439, + "step": 6832 + }, + { + "epoch": 0.6979570990806946, + "grad_norm": 1.423942422371826, + "learning_rate": 4.41544807859708e-06, + "loss": 0.7102, + "step": 6833 + }, + { + "epoch": 0.6980592441266599, + "grad_norm": 1.4989983801327054, + "learning_rate": 4.412704007598329e-06, + "loss": 0.6979, + "step": 6834 + }, + { + "epoch": 0.6981613891726252, + "grad_norm": 1.6122564019126269, + "learning_rate": 4.40996054813273e-06, + "loss": 0.8116, + "step": 6835 + }, + { + "epoch": 0.6982635342185904, + "grad_norm": 1.5216957910952824, + "learning_rate": 4.407217700500565e-06, + "loss": 0.7415, + "step": 6836 + }, + { + "epoch": 0.6983656792645556, + "grad_norm": 1.6023715853634912, + "learning_rate": 4.4044754650020364e-06, + "loss": 0.6932, + "step": 6837 + }, + { + "epoch": 0.6984678243105209, + "grad_norm": 1.6164665136924132, + "learning_rate": 4.401733841937279e-06, + "loss": 0.6726, + "step": 6838 + }, + { + "epoch": 0.6985699693564862, + "grad_norm": 1.37887163323791, + "learning_rate": 4.398992831606374e-06, + "loss": 0.6674, + "step": 6839 + }, + { + "epoch": 0.6986721144024515, + "grad_norm": 1.4392720418203955, + "learning_rate": 4.396252434309322e-06, + "loss": 0.7381, + "step": 6840 + }, + { + "epoch": 0.6987742594484168, + "grad_norm": 1.4570259042514484, + "learning_rate": 4.3935126503460615e-06, + "loss": 0.6417, + "step": 6841 + }, + { + "epoch": 0.698876404494382, + "grad_norm": 1.5996022268624903, + "learning_rate": 4.3907734800164594e-06, + "loss": 0.7228, + "step": 6842 + }, + { + "epoch": 0.6989785495403473, + "grad_norm": 1.5816727774641341, + "learning_rate": 4.3880349236203245e-06, + "loss": 0.7383, + "step": 6843 + }, + { + "epoch": 0.6990806945863126, + "grad_norm": 1.512896962122419, + "learning_rate": 4.385296981457396e-06, + "loss": 0.6839, + "step": 6844 + }, + { + "epoch": 0.6991828396322778, + "grad_norm": 1.4808097566520873, + "learning_rate": 4.382559653827342e-06, + "loss": 0.687, + "step": 6845 + }, + { + "epoch": 0.6992849846782431, + "grad_norm": 1.4820606024713598, + "learning_rate": 4.379822941029762e-06, + "loss": 0.6584, + "step": 6846 + }, + { + "epoch": 0.6993871297242084, + "grad_norm": 1.4276576022115857, + "learning_rate": 4.377086843364189e-06, + "loss": 0.6382, + "step": 6847 + }, + { + "epoch": 0.6994892747701736, + "grad_norm": 1.298809824069812, + "learning_rate": 4.374351361130097e-06, + "loss": 0.5897, + "step": 6848 + }, + { + "epoch": 0.6995914198161389, + "grad_norm": 1.4530968951386005, + "learning_rate": 4.371616494626884e-06, + "loss": 0.6452, + "step": 6849 + }, + { + "epoch": 0.6996935648621042, + "grad_norm": 1.5924035859429548, + "learning_rate": 4.3688822441538815e-06, + "loss": 0.7587, + "step": 6850 + }, + { + "epoch": 0.6997957099080695, + "grad_norm": 1.5350968776949536, + "learning_rate": 4.366148610010352e-06, + "loss": 0.6546, + "step": 6851 + }, + { + "epoch": 0.6998978549540348, + "grad_norm": 1.4959940604051245, + "learning_rate": 4.3634155924955e-06, + "loss": 0.6761, + "step": 6852 + }, + { + "epoch": 0.7, + "grad_norm": 1.3715700521176932, + "learning_rate": 4.360683191908451e-06, + "loss": 0.6209, + "step": 6853 + }, + { + "epoch": 0.7001021450459652, + "grad_norm": 1.6486401280220049, + "learning_rate": 4.357951408548272e-06, + "loss": 0.8114, + "step": 6854 + }, + { + "epoch": 0.7002042900919305, + "grad_norm": 1.5473517479074097, + "learning_rate": 4.355220242713958e-06, + "loss": 0.6874, + "step": 6855 + }, + { + "epoch": 0.7003064351378958, + "grad_norm": 1.4934656023474913, + "learning_rate": 4.3524896947044345e-06, + "loss": 0.6962, + "step": 6856 + }, + { + "epoch": 0.7004085801838611, + "grad_norm": 1.5908546904137781, + "learning_rate": 4.3497597648185595e-06, + "loss": 0.6728, + "step": 6857 + }, + { + "epoch": 0.7005107252298264, + "grad_norm": 1.4895237427281691, + "learning_rate": 4.347030453355132e-06, + "loss": 0.7172, + "step": 6858 + }, + { + "epoch": 0.7006128702757917, + "grad_norm": 1.4994178324754317, + "learning_rate": 4.344301760612871e-06, + "loss": 0.6547, + "step": 6859 + }, + { + "epoch": 0.7007150153217568, + "grad_norm": 1.6105977295987985, + "learning_rate": 4.341573686890439e-06, + "loss": 0.7568, + "step": 6860 + }, + { + "epoch": 0.7008171603677221, + "grad_norm": 1.435604422655134, + "learning_rate": 4.338846232486423e-06, + "loss": 0.8171, + "step": 6861 + }, + { + "epoch": 0.7009193054136874, + "grad_norm": 1.4530488259824883, + "learning_rate": 4.336119397699341e-06, + "loss": 0.7082, + "step": 6862 + }, + { + "epoch": 0.7010214504596527, + "grad_norm": 1.522581258810552, + "learning_rate": 4.3333931828276545e-06, + "loss": 0.7314, + "step": 6863 + }, + { + "epoch": 0.701123595505618, + "grad_norm": 1.465312359074145, + "learning_rate": 4.330667588169743e-06, + "loss": 0.6522, + "step": 6864 + }, + { + "epoch": 0.7012257405515833, + "grad_norm": 1.5613916215177015, + "learning_rate": 4.327942614023929e-06, + "loss": 0.6868, + "step": 6865 + }, + { + "epoch": 0.7013278855975486, + "grad_norm": 1.50412780534582, + "learning_rate": 4.325218260688461e-06, + "loss": 0.7757, + "step": 6866 + }, + { + "epoch": 0.7014300306435138, + "grad_norm": 1.4053414029349975, + "learning_rate": 4.322494528461512e-06, + "loss": 0.6582, + "step": 6867 + }, + { + "epoch": 0.701532175689479, + "grad_norm": 1.3978757912291284, + "learning_rate": 4.319771417641212e-06, + "loss": 0.7862, + "step": 6868 + }, + { + "epoch": 0.7016343207354443, + "grad_norm": 1.3690172022526992, + "learning_rate": 4.317048928525601e-06, + "loss": 0.7281, + "step": 6869 + }, + { + "epoch": 0.7017364657814096, + "grad_norm": 1.6706524631067317, + "learning_rate": 4.314327061412657e-06, + "loss": 0.7186, + "step": 6870 + }, + { + "epoch": 0.7018386108273749, + "grad_norm": 1.3303549259325627, + "learning_rate": 4.311605816600288e-06, + "loss": 0.7078, + "step": 6871 + }, + { + "epoch": 0.7019407558733401, + "grad_norm": 1.3520839907710007, + "learning_rate": 4.308885194386335e-06, + "loss": 0.4908, + "step": 6872 + }, + { + "epoch": 0.7020429009193054, + "grad_norm": 1.5570065059996687, + "learning_rate": 4.306165195068578e-06, + "loss": 0.7567, + "step": 6873 + }, + { + "epoch": 0.7021450459652707, + "grad_norm": 1.3878115998411766, + "learning_rate": 4.303445818944718e-06, + "loss": 0.6341, + "step": 6874 + }, + { + "epoch": 0.702247191011236, + "grad_norm": 1.514493281075276, + "learning_rate": 4.3007270663123916e-06, + "loss": 0.7526, + "step": 6875 + }, + { + "epoch": 0.7023493360572012, + "grad_norm": 1.388361325332902, + "learning_rate": 4.298008937469172e-06, + "loss": 0.7016, + "step": 6876 + }, + { + "epoch": 0.7024514811031665, + "grad_norm": 1.4907063128873075, + "learning_rate": 4.295291432712556e-06, + "loss": 0.7577, + "step": 6877 + }, + { + "epoch": 0.7025536261491317, + "grad_norm": 1.5049483647748787, + "learning_rate": 4.292574552339981e-06, + "loss": 0.7872, + "step": 6878 + }, + { + "epoch": 0.702655771195097, + "grad_norm": 1.6883457230224006, + "learning_rate": 4.289858296648809e-06, + "loss": 0.7188, + "step": 6879 + }, + { + "epoch": 0.7027579162410623, + "grad_norm": 1.5472129190231299, + "learning_rate": 4.287142665936336e-06, + "loss": 0.7458, + "step": 6880 + }, + { + "epoch": 0.7028600612870276, + "grad_norm": 1.5132961111446008, + "learning_rate": 4.284427660499786e-06, + "loss": 0.6451, + "step": 6881 + }, + { + "epoch": 0.7029622063329929, + "grad_norm": 1.5492297674414648, + "learning_rate": 4.281713280636324e-06, + "loss": 0.6612, + "step": 6882 + }, + { + "epoch": 0.7030643513789582, + "grad_norm": 1.3573816775271157, + "learning_rate": 4.278999526643034e-06, + "loss": 0.6041, + "step": 6883 + }, + { + "epoch": 0.7031664964249233, + "grad_norm": 1.4751164839055932, + "learning_rate": 4.276286398816946e-06, + "loss": 0.6709, + "step": 6884 + }, + { + "epoch": 0.7032686414708886, + "grad_norm": 1.5169584769255473, + "learning_rate": 4.27357389745501e-06, + "loss": 0.7304, + "step": 6885 + }, + { + "epoch": 0.7033707865168539, + "grad_norm": 1.342609572756083, + "learning_rate": 4.2708620228541105e-06, + "loss": 0.6573, + "step": 6886 + }, + { + "epoch": 0.7034729315628192, + "grad_norm": 1.4762555109216227, + "learning_rate": 4.268150775311061e-06, + "loss": 0.6281, + "step": 6887 + }, + { + "epoch": 0.7035750766087845, + "grad_norm": 1.3985719796100031, + "learning_rate": 4.2654401551226156e-06, + "loss": 0.6348, + "step": 6888 + }, + { + "epoch": 0.7036772216547498, + "grad_norm": 1.4927722265263943, + "learning_rate": 4.26273016258545e-06, + "loss": 0.6519, + "step": 6889 + }, + { + "epoch": 0.7037793667007151, + "grad_norm": 1.544805630759808, + "learning_rate": 4.260020797996175e-06, + "loss": 0.7031, + "step": 6890 + }, + { + "epoch": 0.7038815117466802, + "grad_norm": 1.4298455386768272, + "learning_rate": 4.257312061651329e-06, + "loss": 0.7162, + "step": 6891 + }, + { + "epoch": 0.7039836567926455, + "grad_norm": 1.4985224227709653, + "learning_rate": 4.254603953847387e-06, + "loss": 0.6963, + "step": 6892 + }, + { + "epoch": 0.7040858018386108, + "grad_norm": 1.3714373532174218, + "learning_rate": 4.251896474880758e-06, + "loss": 0.7687, + "step": 6893 + }, + { + "epoch": 0.7041879468845761, + "grad_norm": 1.615170701940534, + "learning_rate": 4.249189625047773e-06, + "loss": 0.7103, + "step": 6894 + }, + { + "epoch": 0.7042900919305414, + "grad_norm": 1.475600933192139, + "learning_rate": 4.246483404644702e-06, + "loss": 0.6687, + "step": 6895 + }, + { + "epoch": 0.7043922369765067, + "grad_norm": 1.4405323194950397, + "learning_rate": 4.243777813967733e-06, + "loss": 0.6387, + "step": 6896 + }, + { + "epoch": 0.7044943820224719, + "grad_norm": 1.2871270164363433, + "learning_rate": 4.241072853313006e-06, + "loss": 0.6883, + "step": 6897 + }, + { + "epoch": 0.7045965270684372, + "grad_norm": 1.4923302086763857, + "learning_rate": 4.2383685229765755e-06, + "loss": 0.6945, + "step": 6898 + }, + { + "epoch": 0.7046986721144024, + "grad_norm": 1.5667679334535802, + "learning_rate": 4.235664823254431e-06, + "loss": 0.7438, + "step": 6899 + }, + { + "epoch": 0.7048008171603677, + "grad_norm": 1.6150210127153137, + "learning_rate": 4.2329617544424976e-06, + "loss": 0.7016, + "step": 6900 + }, + { + "epoch": 0.704902962206333, + "grad_norm": 1.3653693254244708, + "learning_rate": 4.230259316836622e-06, + "loss": 0.651, + "step": 6901 + }, + { + "epoch": 0.7050051072522983, + "grad_norm": 1.518331945787565, + "learning_rate": 4.2275575107325965e-06, + "loss": 0.6647, + "step": 6902 + }, + { + "epoch": 0.7051072522982635, + "grad_norm": 1.4338161491876356, + "learning_rate": 4.22485633642613e-06, + "loss": 0.728, + "step": 6903 + }, + { + "epoch": 0.7052093973442288, + "grad_norm": 1.2339149468194266, + "learning_rate": 4.22215579421287e-06, + "loss": 0.6812, + "step": 6904 + }, + { + "epoch": 0.7053115423901941, + "grad_norm": 1.3312497716484102, + "learning_rate": 4.219455884388391e-06, + "loss": 0.7013, + "step": 6905 + }, + { + "epoch": 0.7054136874361594, + "grad_norm": 1.3893373807235514, + "learning_rate": 4.216756607248197e-06, + "loss": 0.6227, + "step": 6906 + }, + { + "epoch": 0.7055158324821246, + "grad_norm": 1.4223983699624907, + "learning_rate": 4.214057963087727e-06, + "loss": 0.6653, + "step": 6907 + }, + { + "epoch": 0.7056179775280899, + "grad_norm": 1.3985984317691704, + "learning_rate": 4.211359952202357e-06, + "loss": 0.7557, + "step": 6908 + }, + { + "epoch": 0.7057201225740551, + "grad_norm": 1.554109838648449, + "learning_rate": 4.208662574887379e-06, + "loss": 0.703, + "step": 6909 + }, + { + "epoch": 0.7058222676200204, + "grad_norm": 1.2245517088798967, + "learning_rate": 4.205965831438024e-06, + "loss": 0.6619, + "step": 6910 + }, + { + "epoch": 0.7059244126659857, + "grad_norm": 1.7681143736486562, + "learning_rate": 4.203269722149448e-06, + "loss": 0.6726, + "step": 6911 + }, + { + "epoch": 0.706026557711951, + "grad_norm": 1.4262529315357486, + "learning_rate": 4.20057424731675e-06, + "loss": 0.6832, + "step": 6912 + }, + { + "epoch": 0.7061287027579163, + "grad_norm": 1.4871599430438882, + "learning_rate": 4.197879407234947e-06, + "loss": 0.6949, + "step": 6913 + }, + { + "epoch": 0.7062308478038815, + "grad_norm": 1.627684395736922, + "learning_rate": 4.195185202198992e-06, + "loss": 0.8037, + "step": 6914 + }, + { + "epoch": 0.7063329928498467, + "grad_norm": 1.3533222196916344, + "learning_rate": 4.192491632503765e-06, + "loss": 0.753, + "step": 6915 + }, + { + "epoch": 0.706435137895812, + "grad_norm": 1.56137148764183, + "learning_rate": 4.1897986984440784e-06, + "loss": 0.7667, + "step": 6916 + }, + { + "epoch": 0.7065372829417773, + "grad_norm": 1.4538822772656388, + "learning_rate": 4.187106400314683e-06, + "loss": 0.5975, + "step": 6917 + }, + { + "epoch": 0.7066394279877426, + "grad_norm": 1.4345386327078853, + "learning_rate": 4.1844147384102486e-06, + "loss": 0.6025, + "step": 6918 + }, + { + "epoch": 0.7067415730337079, + "grad_norm": 1.48088715885521, + "learning_rate": 4.181723713025376e-06, + "loss": 0.7334, + "step": 6919 + }, + { + "epoch": 0.7068437180796732, + "grad_norm": 1.485899176001851, + "learning_rate": 4.179033324454605e-06, + "loss": 0.6834, + "step": 6920 + }, + { + "epoch": 0.7069458631256385, + "grad_norm": 1.5269414957077465, + "learning_rate": 4.176343572992392e-06, + "loss": 0.8186, + "step": 6921 + }, + { + "epoch": 0.7070480081716036, + "grad_norm": 1.4123607121690231, + "learning_rate": 4.173654458933141e-06, + "loss": 0.7038, + "step": 6922 + }, + { + "epoch": 0.7071501532175689, + "grad_norm": 1.4741140082472883, + "learning_rate": 4.170965982571171e-06, + "loss": 0.6858, + "step": 6923 + }, + { + "epoch": 0.7072522982635342, + "grad_norm": 1.4369479066319717, + "learning_rate": 4.168278144200743e-06, + "loss": 0.7296, + "step": 6924 + }, + { + "epoch": 0.7073544433094995, + "grad_norm": 1.3466378817008398, + "learning_rate": 4.16559094411604e-06, + "loss": 0.6677, + "step": 6925 + }, + { + "epoch": 0.7074565883554648, + "grad_norm": 1.4729974792095448, + "learning_rate": 4.1629043826111745e-06, + "loss": 0.6488, + "step": 6926 + }, + { + "epoch": 0.70755873340143, + "grad_norm": 1.656646769248252, + "learning_rate": 4.1602184599802e-06, + "loss": 0.6709, + "step": 6927 + }, + { + "epoch": 0.7076608784473953, + "grad_norm": 1.3342365848423112, + "learning_rate": 4.157533176517087e-06, + "loss": 0.6062, + "step": 6928 + }, + { + "epoch": 0.7077630234933606, + "grad_norm": 1.5572655294927515, + "learning_rate": 4.154848532515743e-06, + "loss": 0.6751, + "step": 6929 + }, + { + "epoch": 0.7078651685393258, + "grad_norm": 1.4551539307134356, + "learning_rate": 4.152164528269999e-06, + "loss": 0.6958, + "step": 6930 + }, + { + "epoch": 0.7079673135852911, + "grad_norm": 1.5365208776514503, + "learning_rate": 4.149481164073626e-06, + "loss": 0.6269, + "step": 6931 + }, + { + "epoch": 0.7080694586312564, + "grad_norm": 1.6637751151269546, + "learning_rate": 4.146798440220324e-06, + "loss": 0.7469, + "step": 6932 + }, + { + "epoch": 0.7081716036772217, + "grad_norm": 1.3399627605260522, + "learning_rate": 4.144116357003713e-06, + "loss": 0.638, + "step": 6933 + }, + { + "epoch": 0.7082737487231869, + "grad_norm": 1.4809522081192772, + "learning_rate": 4.1414349147173514e-06, + "loss": 0.6571, + "step": 6934 + }, + { + "epoch": 0.7083758937691522, + "grad_norm": 1.5454271114375617, + "learning_rate": 4.138754113654719e-06, + "loss": 0.695, + "step": 6935 + }, + { + "epoch": 0.7084780388151175, + "grad_norm": 1.567514451263671, + "learning_rate": 4.1360739541092396e-06, + "loss": 0.7454, + "step": 6936 + }, + { + "epoch": 0.7085801838610828, + "grad_norm": 1.3656927578039195, + "learning_rate": 4.133394436374255e-06, + "loss": 0.6397, + "step": 6937 + }, + { + "epoch": 0.708682328907048, + "grad_norm": 1.5548160918283194, + "learning_rate": 4.130715560743039e-06, + "loss": 0.7252, + "step": 6938 + }, + { + "epoch": 0.7087844739530132, + "grad_norm": 1.5319142486253907, + "learning_rate": 4.128037327508794e-06, + "loss": 0.6388, + "step": 6939 + }, + { + "epoch": 0.7088866189989785, + "grad_norm": 1.5094076415842528, + "learning_rate": 4.12535973696466e-06, + "loss": 0.6951, + "step": 6940 + }, + { + "epoch": 0.7089887640449438, + "grad_norm": 1.3711593003138105, + "learning_rate": 4.122682789403695e-06, + "loss": 0.7026, + "step": 6941 + }, + { + "epoch": 0.7090909090909091, + "grad_norm": 1.5301888201338882, + "learning_rate": 4.120006485118901e-06, + "loss": 0.7315, + "step": 6942 + }, + { + "epoch": 0.7091930541368744, + "grad_norm": 1.5458233754864787, + "learning_rate": 4.117330824403194e-06, + "loss": 0.6448, + "step": 6943 + }, + { + "epoch": 0.7092951991828397, + "grad_norm": 1.58428243301101, + "learning_rate": 4.114655807549429e-06, + "loss": 0.7707, + "step": 6944 + }, + { + "epoch": 0.7093973442288048, + "grad_norm": 1.4589447303945489, + "learning_rate": 4.111981434850386e-06, + "loss": 0.7338, + "step": 6945 + }, + { + "epoch": 0.7094994892747701, + "grad_norm": 1.3573813231157297, + "learning_rate": 4.1093077065987816e-06, + "loss": 0.594, + "step": 6946 + }, + { + "epoch": 0.7096016343207354, + "grad_norm": 1.4929450602905736, + "learning_rate": 4.1066346230872526e-06, + "loss": 0.6907, + "step": 6947 + }, + { + "epoch": 0.7097037793667007, + "grad_norm": 1.4765637889977614, + "learning_rate": 4.103962184608373e-06, + "loss": 0.707, + "step": 6948 + }, + { + "epoch": 0.709805924412666, + "grad_norm": 1.4266829655229054, + "learning_rate": 4.101290391454643e-06, + "loss": 0.7152, + "step": 6949 + }, + { + "epoch": 0.7099080694586313, + "grad_norm": 1.3393810861270083, + "learning_rate": 4.098619243918487e-06, + "loss": 0.7704, + "step": 6950 + }, + { + "epoch": 0.7100102145045966, + "grad_norm": 1.4913386915484104, + "learning_rate": 4.095948742292271e-06, + "loss": 0.7648, + "step": 6951 + }, + { + "epoch": 0.7101123595505618, + "grad_norm": 1.4933262446769613, + "learning_rate": 4.09327888686828e-06, + "loss": 0.7223, + "step": 6952 + }, + { + "epoch": 0.710214504596527, + "grad_norm": 1.510729726440067, + "learning_rate": 4.090609677938731e-06, + "loss": 0.6809, + "step": 6953 + }, + { + "epoch": 0.7103166496424923, + "grad_norm": 1.4359945221295232, + "learning_rate": 4.087941115795767e-06, + "loss": 0.6382, + "step": 6954 + }, + { + "epoch": 0.7104187946884576, + "grad_norm": 1.586910192518934, + "learning_rate": 4.085273200731468e-06, + "loss": 0.6279, + "step": 6955 + }, + { + "epoch": 0.7105209397344229, + "grad_norm": 1.580053122296865, + "learning_rate": 4.082605933037841e-06, + "loss": 0.7024, + "step": 6956 + }, + { + "epoch": 0.7106230847803882, + "grad_norm": 1.4279719561691568, + "learning_rate": 4.079939313006819e-06, + "loss": 0.7049, + "step": 6957 + }, + { + "epoch": 0.7107252298263534, + "grad_norm": 1.4984985937706568, + "learning_rate": 4.0772733409302634e-06, + "loss": 0.7153, + "step": 6958 + }, + { + "epoch": 0.7108273748723187, + "grad_norm": 1.512320694524567, + "learning_rate": 4.0746080170999665e-06, + "loss": 0.7081, + "step": 6959 + }, + { + "epoch": 0.710929519918284, + "grad_norm": 1.365056956827684, + "learning_rate": 4.071943341807648e-06, + "loss": 0.72, + "step": 6960 + }, + { + "epoch": 0.7110316649642492, + "grad_norm": 1.5510494384170725, + "learning_rate": 4.069279315344964e-06, + "loss": 0.7533, + "step": 6961 + }, + { + "epoch": 0.7111338100102145, + "grad_norm": 1.544704850899328, + "learning_rate": 4.066615938003491e-06, + "loss": 0.6952, + "step": 6962 + }, + { + "epoch": 0.7112359550561798, + "grad_norm": 1.4054277916532003, + "learning_rate": 4.063953210074733e-06, + "loss": 0.6552, + "step": 6963 + }, + { + "epoch": 0.711338100102145, + "grad_norm": 1.5535030525651847, + "learning_rate": 4.0612911318501345e-06, + "loss": 0.6126, + "step": 6964 + }, + { + "epoch": 0.7114402451481103, + "grad_norm": 1.5182672575597311, + "learning_rate": 4.058629703621055e-06, + "loss": 0.6508, + "step": 6965 + }, + { + "epoch": 0.7115423901940756, + "grad_norm": 1.4253831232899727, + "learning_rate": 4.055968925678797e-06, + "loss": 0.6922, + "step": 6966 + }, + { + "epoch": 0.7116445352400409, + "grad_norm": 1.4661668455035148, + "learning_rate": 4.05330879831458e-06, + "loss": 0.7007, + "step": 6967 + }, + { + "epoch": 0.7117466802860062, + "grad_norm": 1.5024060149316576, + "learning_rate": 4.050649321819559e-06, + "loss": 0.7593, + "step": 6968 + }, + { + "epoch": 0.7118488253319714, + "grad_norm": 1.546589826827416, + "learning_rate": 4.047990496484808e-06, + "loss": 0.7128, + "step": 6969 + }, + { + "epoch": 0.7119509703779366, + "grad_norm": 1.4341931944088089, + "learning_rate": 4.045332322601346e-06, + "loss": 0.6251, + "step": 6970 + }, + { + "epoch": 0.7120531154239019, + "grad_norm": 1.6632871188263734, + "learning_rate": 4.042674800460106e-06, + "loss": 0.7633, + "step": 6971 + }, + { + "epoch": 0.7121552604698672, + "grad_norm": 1.2719144549393413, + "learning_rate": 4.0400179303519616e-06, + "loss": 0.7222, + "step": 6972 + }, + { + "epoch": 0.7122574055158325, + "grad_norm": 1.3800349891909651, + "learning_rate": 4.037361712567705e-06, + "loss": 0.6266, + "step": 6973 + }, + { + "epoch": 0.7123595505617978, + "grad_norm": 1.504362942420356, + "learning_rate": 4.034706147398061e-06, + "loss": 0.6854, + "step": 6974 + }, + { + "epoch": 0.7124616956077631, + "grad_norm": 1.3387059733069222, + "learning_rate": 4.0320512351336806e-06, + "loss": 0.6917, + "step": 6975 + }, + { + "epoch": 0.7125638406537282, + "grad_norm": 1.4098029850031484, + "learning_rate": 4.029396976065151e-06, + "loss": 0.6511, + "step": 6976 + }, + { + "epoch": 0.7126659856996935, + "grad_norm": 1.6626313742270857, + "learning_rate": 4.026743370482979e-06, + "loss": 0.6912, + "step": 6977 + }, + { + "epoch": 0.7127681307456588, + "grad_norm": 1.3152340121133819, + "learning_rate": 4.024090418677607e-06, + "loss": 0.5649, + "step": 6978 + }, + { + "epoch": 0.7128702757916241, + "grad_norm": 1.5206404099244941, + "learning_rate": 4.021438120939394e-06, + "loss": 0.6973, + "step": 6979 + }, + { + "epoch": 0.7129724208375894, + "grad_norm": 1.5027527665548193, + "learning_rate": 4.0187864775586415e-06, + "loss": 0.7196, + "step": 6980 + }, + { + "epoch": 0.7130745658835547, + "grad_norm": 1.4543889960971648, + "learning_rate": 4.016135488825576e-06, + "loss": 0.7567, + "step": 6981 + }, + { + "epoch": 0.71317671092952, + "grad_norm": 1.457362973335699, + "learning_rate": 4.01348515503035e-06, + "loss": 0.5176, + "step": 6982 + }, + { + "epoch": 0.7132788559754852, + "grad_norm": 1.514257364983238, + "learning_rate": 4.01083547646304e-06, + "loss": 0.6429, + "step": 6983 + }, + { + "epoch": 0.7133810010214504, + "grad_norm": 1.5201850988807657, + "learning_rate": 4.008186453413653e-06, + "loss": 0.7623, + "step": 6984 + }, + { + "epoch": 0.7134831460674157, + "grad_norm": 1.4757826354680996, + "learning_rate": 4.0055380861721335e-06, + "loss": 0.5967, + "step": 6985 + }, + { + "epoch": 0.713585291113381, + "grad_norm": 1.4068852740322775, + "learning_rate": 4.002890375028343e-06, + "loss": 0.6404, + "step": 6986 + }, + { + "epoch": 0.7136874361593463, + "grad_norm": 1.613139122710717, + "learning_rate": 4.000243320272073e-06, + "loss": 0.7457, + "step": 6987 + }, + { + "epoch": 0.7137895812053116, + "grad_norm": 1.4987010532850589, + "learning_rate": 3.997596922193051e-06, + "loss": 0.725, + "step": 6988 + }, + { + "epoch": 0.7138917262512768, + "grad_norm": 1.4925355664152664, + "learning_rate": 3.99495118108092e-06, + "loss": 0.6383, + "step": 6989 + }, + { + "epoch": 0.7139938712972421, + "grad_norm": 1.5000109920763496, + "learning_rate": 3.992306097225266e-06, + "loss": 0.752, + "step": 6990 + }, + { + "epoch": 0.7140960163432074, + "grad_norm": 1.5989585905451302, + "learning_rate": 3.989661670915591e-06, + "loss": 0.7079, + "step": 6991 + }, + { + "epoch": 0.7141981613891726, + "grad_norm": 1.3948397128365424, + "learning_rate": 3.987017902441329e-06, + "loss": 0.6121, + "step": 6992 + }, + { + "epoch": 0.7143003064351379, + "grad_norm": 1.3696154895201689, + "learning_rate": 3.984374792091843e-06, + "loss": 0.5742, + "step": 6993 + }, + { + "epoch": 0.7144024514811032, + "grad_norm": 1.5097901420389772, + "learning_rate": 3.981732340156419e-06, + "loss": 0.6033, + "step": 6994 + }, + { + "epoch": 0.7145045965270684, + "grad_norm": 1.5133340711156522, + "learning_rate": 3.979090546924278e-06, + "loss": 0.7838, + "step": 6995 + }, + { + "epoch": 0.7146067415730337, + "grad_norm": 1.434584662011082, + "learning_rate": 3.976449412684571e-06, + "loss": 0.6877, + "step": 6996 + }, + { + "epoch": 0.714708886618999, + "grad_norm": 1.3855418475958892, + "learning_rate": 3.973808937726368e-06, + "loss": 0.6551, + "step": 6997 + }, + { + "epoch": 0.7148110316649643, + "grad_norm": 1.4332538025688333, + "learning_rate": 3.971169122338668e-06, + "loss": 0.761, + "step": 6998 + }, + { + "epoch": 0.7149131767109295, + "grad_norm": 1.5403945880685816, + "learning_rate": 3.968529966810402e-06, + "loss": 0.6885, + "step": 6999 + }, + { + "epoch": 0.7150153217568948, + "grad_norm": 1.544246550440396, + "learning_rate": 3.965891471430429e-06, + "loss": 0.6154, + "step": 7000 + }, + { + "epoch": 0.71511746680286, + "grad_norm": 1.6109009217632133, + "learning_rate": 3.963253636487534e-06, + "loss": 0.6905, + "step": 7001 + }, + { + "epoch": 0.7152196118488253, + "grad_norm": 1.595923082487574, + "learning_rate": 3.960616462270429e-06, + "loss": 0.6738, + "step": 7002 + }, + { + "epoch": 0.7153217568947906, + "grad_norm": 1.448628712174153, + "learning_rate": 3.957979949067751e-06, + "loss": 0.5547, + "step": 7003 + }, + { + "epoch": 0.7154239019407559, + "grad_norm": 1.5690766306953075, + "learning_rate": 3.95534409716807e-06, + "loss": 0.7901, + "step": 7004 + }, + { + "epoch": 0.7155260469867212, + "grad_norm": 1.5334029487229923, + "learning_rate": 3.952708906859887e-06, + "loss": 0.6771, + "step": 7005 + }, + { + "epoch": 0.7156281920326865, + "grad_norm": 1.4864462986841054, + "learning_rate": 3.9500743784316206e-06, + "loss": 0.717, + "step": 7006 + }, + { + "epoch": 0.7157303370786516, + "grad_norm": 1.6609678120529443, + "learning_rate": 3.947440512171623e-06, + "loss": 0.72, + "step": 7007 + }, + { + "epoch": 0.7158324821246169, + "grad_norm": 1.320077026184007, + "learning_rate": 3.9448073083681685e-06, + "loss": 0.5906, + "step": 7008 + }, + { + "epoch": 0.7159346271705822, + "grad_norm": 1.6223332202334915, + "learning_rate": 3.942174767309469e-06, + "loss": 0.7974, + "step": 7009 + }, + { + "epoch": 0.7160367722165475, + "grad_norm": 1.6065971506850496, + "learning_rate": 3.9395428892836554e-06, + "loss": 0.6773, + "step": 7010 + }, + { + "epoch": 0.7161389172625128, + "grad_norm": 1.3829526396625, + "learning_rate": 3.936911674578785e-06, + "loss": 0.6503, + "step": 7011 + }, + { + "epoch": 0.7162410623084781, + "grad_norm": 1.4655438965843948, + "learning_rate": 3.934281123482851e-06, + "loss": 0.6437, + "step": 7012 + }, + { + "epoch": 0.7163432073544433, + "grad_norm": 1.303770766939529, + "learning_rate": 3.931651236283769e-06, + "loss": 0.585, + "step": 7013 + }, + { + "epoch": 0.7164453524004086, + "grad_norm": 1.5095490572184738, + "learning_rate": 3.929022013269376e-06, + "loss": 0.7376, + "step": 7014 + }, + { + "epoch": 0.7165474974463738, + "grad_norm": 1.522119712747717, + "learning_rate": 3.926393454727448e-06, + "loss": 0.6947, + "step": 7015 + }, + { + "epoch": 0.7166496424923391, + "grad_norm": 1.475139075591249, + "learning_rate": 3.923765560945683e-06, + "loss": 0.7413, + "step": 7016 + }, + { + "epoch": 0.7167517875383044, + "grad_norm": 1.590885631802265, + "learning_rate": 3.9211383322117e-06, + "loss": 0.7123, + "step": 7017 + }, + { + "epoch": 0.7168539325842697, + "grad_norm": 1.6443483748199068, + "learning_rate": 3.918511768813053e-06, + "loss": 0.7947, + "step": 7018 + }, + { + "epoch": 0.716956077630235, + "grad_norm": 1.4083675300888494, + "learning_rate": 3.9158858710372205e-06, + "loss": 0.697, + "step": 7019 + }, + { + "epoch": 0.7170582226762002, + "grad_norm": 1.4583557746783096, + "learning_rate": 3.913260639171614e-06, + "loss": 0.7127, + "step": 7020 + }, + { + "epoch": 0.7171603677221655, + "grad_norm": 1.5971502915000542, + "learning_rate": 3.910636073503564e-06, + "loss": 0.7369, + "step": 7021 + }, + { + "epoch": 0.7172625127681308, + "grad_norm": 1.414205804962322, + "learning_rate": 3.908012174320329e-06, + "loss": 0.6655, + "step": 7022 + }, + { + "epoch": 0.717364657814096, + "grad_norm": 1.404453918439644, + "learning_rate": 3.905388941909095e-06, + "loss": 0.5964, + "step": 7023 + }, + { + "epoch": 0.7174668028600613, + "grad_norm": 1.4673374134692105, + "learning_rate": 3.902766376556982e-06, + "loss": 0.6769, + "step": 7024 + }, + { + "epoch": 0.7175689479060265, + "grad_norm": 1.404688132525737, + "learning_rate": 3.900144478551028e-06, + "loss": 0.7689, + "step": 7025 + }, + { + "epoch": 0.7176710929519918, + "grad_norm": 1.4072704858483545, + "learning_rate": 3.897523248178202e-06, + "loss": 0.6927, + "step": 7026 + }, + { + "epoch": 0.7177732379979571, + "grad_norm": 1.444236287226445, + "learning_rate": 3.894902685725399e-06, + "loss": 0.6627, + "step": 7027 + }, + { + "epoch": 0.7178753830439224, + "grad_norm": 1.435534095170892, + "learning_rate": 3.892282791479438e-06, + "loss": 0.7429, + "step": 7028 + }, + { + "epoch": 0.7179775280898877, + "grad_norm": 1.4310946315005222, + "learning_rate": 3.889663565727072e-06, + "loss": 0.6564, + "step": 7029 + }, + { + "epoch": 0.7180796731358529, + "grad_norm": 1.6263096407519817, + "learning_rate": 3.887045008754981e-06, + "loss": 0.7515, + "step": 7030 + }, + { + "epoch": 0.7181818181818181, + "grad_norm": 1.4223494455700318, + "learning_rate": 3.884427120849761e-06, + "loss": 0.674, + "step": 7031 + }, + { + "epoch": 0.7182839632277834, + "grad_norm": 1.3829281867394458, + "learning_rate": 3.881809902297945e-06, + "loss": 0.7106, + "step": 7032 + }, + { + "epoch": 0.7183861082737487, + "grad_norm": 1.5425671289927854, + "learning_rate": 3.879193353385985e-06, + "loss": 0.7882, + "step": 7033 + }, + { + "epoch": 0.718488253319714, + "grad_norm": 1.478219515082956, + "learning_rate": 3.876577474400269e-06, + "loss": 0.7238, + "step": 7034 + }, + { + "epoch": 0.7185903983656793, + "grad_norm": 1.5372798387160067, + "learning_rate": 3.873962265627106e-06, + "loss": 0.7272, + "step": 7035 + }, + { + "epoch": 0.7186925434116446, + "grad_norm": 1.4976429074576547, + "learning_rate": 3.871347727352727e-06, + "loss": 0.7289, + "step": 7036 + }, + { + "epoch": 0.7187946884576099, + "grad_norm": 1.552556400668667, + "learning_rate": 3.868733859863302e-06, + "loss": 0.6202, + "step": 7037 + }, + { + "epoch": 0.718896833503575, + "grad_norm": 1.5312789556181494, + "learning_rate": 3.8661206634449145e-06, + "loss": 0.7477, + "step": 7038 + }, + { + "epoch": 0.7189989785495403, + "grad_norm": 1.4448015782734187, + "learning_rate": 3.863508138383587e-06, + "loss": 0.6774, + "step": 7039 + }, + { + "epoch": 0.7191011235955056, + "grad_norm": 1.461959968335165, + "learning_rate": 3.860896284965258e-06, + "loss": 0.6163, + "step": 7040 + }, + { + "epoch": 0.7192032686414709, + "grad_norm": 1.4815567449589533, + "learning_rate": 3.858285103475796e-06, + "loss": 0.5988, + "step": 7041 + }, + { + "epoch": 0.7193054136874362, + "grad_norm": 1.5629535533684802, + "learning_rate": 3.855674594200995e-06, + "loss": 0.7161, + "step": 7042 + }, + { + "epoch": 0.7194075587334015, + "grad_norm": 1.3457463589579566, + "learning_rate": 3.853064757426583e-06, + "loss": 0.6995, + "step": 7043 + }, + { + "epoch": 0.7195097037793667, + "grad_norm": 1.5584706421016177, + "learning_rate": 3.8504555934382015e-06, + "loss": 0.68, + "step": 7044 + }, + { + "epoch": 0.719611848825332, + "grad_norm": 1.4662350746784676, + "learning_rate": 3.847847102521432e-06, + "loss": 0.658, + "step": 7045 + }, + { + "epoch": 0.7197139938712972, + "grad_norm": 1.4188881366868653, + "learning_rate": 3.845239284961772e-06, + "loss": 0.6267, + "step": 7046 + }, + { + "epoch": 0.7198161389172625, + "grad_norm": 1.496800561754512, + "learning_rate": 3.842632141044648e-06, + "loss": 0.7227, + "step": 7047 + }, + { + "epoch": 0.7199182839632278, + "grad_norm": 1.3977356849715123, + "learning_rate": 3.8400256710554105e-06, + "loss": 0.6527, + "step": 7048 + }, + { + "epoch": 0.720020429009193, + "grad_norm": 1.5639474526378097, + "learning_rate": 3.8374198752793465e-06, + "loss": 0.6285, + "step": 7049 + }, + { + "epoch": 0.7201225740551583, + "grad_norm": 1.5188239094557237, + "learning_rate": 3.83481475400166e-06, + "loss": 0.73, + "step": 7050 + }, + { + "epoch": 0.7202247191011236, + "grad_norm": 1.5438991487309413, + "learning_rate": 3.832210307507481e-06, + "loss": 0.6782, + "step": 7051 + }, + { + "epoch": 0.7203268641470889, + "grad_norm": 1.450365324602008, + "learning_rate": 3.8296065360818654e-06, + "loss": 0.6902, + "step": 7052 + }, + { + "epoch": 0.7204290091930541, + "grad_norm": 1.5775740274142305, + "learning_rate": 3.827003440009801e-06, + "loss": 0.7509, + "step": 7053 + }, + { + "epoch": 0.7205311542390194, + "grad_norm": 1.5936993252326161, + "learning_rate": 3.824401019576202e-06, + "loss": 0.7231, + "step": 7054 + }, + { + "epoch": 0.7206332992849847, + "grad_norm": 1.4052472018197752, + "learning_rate": 3.821799275065901e-06, + "loss": 0.6809, + "step": 7055 + }, + { + "epoch": 0.7207354443309499, + "grad_norm": 1.6340562175740863, + "learning_rate": 3.819198206763662e-06, + "loss": 0.6778, + "step": 7056 + }, + { + "epoch": 0.7208375893769152, + "grad_norm": 1.5534429790957294, + "learning_rate": 3.816597814954168e-06, + "loss": 0.7628, + "step": 7057 + }, + { + "epoch": 0.7209397344228805, + "grad_norm": 1.3576150178355606, + "learning_rate": 3.8139980999220426e-06, + "loss": 0.5862, + "step": 7058 + }, + { + "epoch": 0.7210418794688458, + "grad_norm": 1.3692708540587757, + "learning_rate": 3.8113990619518214e-06, + "loss": 0.6034, + "step": 7059 + }, + { + "epoch": 0.7211440245148111, + "grad_norm": 1.5062612610466932, + "learning_rate": 3.808800701327967e-06, + "loss": 0.6191, + "step": 7060 + }, + { + "epoch": 0.7212461695607763, + "grad_norm": 1.481839888144394, + "learning_rate": 3.8062030183348806e-06, + "loss": 0.7233, + "step": 7061 + }, + { + "epoch": 0.7213483146067415, + "grad_norm": 1.479973312481816, + "learning_rate": 3.8036060132568708e-06, + "loss": 0.6779, + "step": 7062 + }, + { + "epoch": 0.7214504596527068, + "grad_norm": 1.5202882822887256, + "learning_rate": 3.8010096863781908e-06, + "loss": 0.6729, + "step": 7063 + }, + { + "epoch": 0.7215526046986721, + "grad_norm": 1.6327802284402848, + "learning_rate": 3.798414037983005e-06, + "loss": 0.6329, + "step": 7064 + }, + { + "epoch": 0.7216547497446374, + "grad_norm": 1.5073896989004747, + "learning_rate": 3.79581906835541e-06, + "loss": 0.6826, + "step": 7065 + }, + { + "epoch": 0.7217568947906027, + "grad_norm": 1.4239633388829123, + "learning_rate": 3.793224777779426e-06, + "loss": 0.7346, + "step": 7066 + }, + { + "epoch": 0.721859039836568, + "grad_norm": 1.4009700114636707, + "learning_rate": 3.790631166538997e-06, + "loss": 0.72, + "step": 7067 + }, + { + "epoch": 0.7219611848825332, + "grad_norm": 1.4674404835529677, + "learning_rate": 3.7880382349179978e-06, + "loss": 0.6512, + "step": 7068 + }, + { + "epoch": 0.7220633299284984, + "grad_norm": 1.5536633851504968, + "learning_rate": 3.7854459832002312e-06, + "loss": 0.5836, + "step": 7069 + }, + { + "epoch": 0.7221654749744637, + "grad_norm": 1.4578115200460853, + "learning_rate": 3.782854411669418e-06, + "loss": 0.7717, + "step": 7070 + }, + { + "epoch": 0.722267620020429, + "grad_norm": 1.420378745929112, + "learning_rate": 3.780263520609204e-06, + "loss": 0.5877, + "step": 7071 + }, + { + "epoch": 0.7223697650663943, + "grad_norm": 1.6933517286350868, + "learning_rate": 3.777673310303164e-06, + "loss": 0.7097, + "step": 7072 + }, + { + "epoch": 0.7224719101123596, + "grad_norm": 1.4256525968299967, + "learning_rate": 3.775083781034804e-06, + "loss": 0.6869, + "step": 7073 + }, + { + "epoch": 0.7225740551583248, + "grad_norm": 1.5198555916083616, + "learning_rate": 3.7724949330875447e-06, + "loss": 0.6888, + "step": 7074 + }, + { + "epoch": 0.7226762002042901, + "grad_norm": 1.4938551964791031, + "learning_rate": 3.7699067667447396e-06, + "loss": 0.7049, + "step": 7075 + }, + { + "epoch": 0.7227783452502554, + "grad_norm": 1.4123405504628173, + "learning_rate": 3.767319282289661e-06, + "loss": 0.6369, + "step": 7076 + }, + { + "epoch": 0.7228804902962206, + "grad_norm": 1.6328955447232858, + "learning_rate": 3.764732480005513e-06, + "loss": 0.6459, + "step": 7077 + }, + { + "epoch": 0.7229826353421859, + "grad_norm": 1.4193681387451087, + "learning_rate": 3.7621463601754273e-06, + "loss": 0.7052, + "step": 7078 + }, + { + "epoch": 0.7230847803881512, + "grad_norm": 1.4869512090919346, + "learning_rate": 3.7595609230824525e-06, + "loss": 0.649, + "step": 7079 + }, + { + "epoch": 0.7231869254341164, + "grad_norm": 1.5796632070595396, + "learning_rate": 3.7569761690095664e-06, + "loss": 0.7175, + "step": 7080 + }, + { + "epoch": 0.7232890704800817, + "grad_norm": 1.7077753631680226, + "learning_rate": 3.754392098239672e-06, + "loss": 0.6698, + "step": 7081 + }, + { + "epoch": 0.723391215526047, + "grad_norm": 1.45969217145936, + "learning_rate": 3.7518087110555943e-06, + "loss": 0.6231, + "step": 7082 + }, + { + "epoch": 0.7234933605720123, + "grad_norm": 1.4122795067251015, + "learning_rate": 3.7492260077400934e-06, + "loss": 0.6069, + "step": 7083 + }, + { + "epoch": 0.7235955056179775, + "grad_norm": 1.5040644557105953, + "learning_rate": 3.746643988575841e-06, + "loss": 0.6834, + "step": 7084 + }, + { + "epoch": 0.7236976506639428, + "grad_norm": 1.532657976039376, + "learning_rate": 3.7440626538454484e-06, + "loss": 0.7281, + "step": 7085 + }, + { + "epoch": 0.723799795709908, + "grad_norm": 1.4650438947348665, + "learning_rate": 3.741482003831439e-06, + "loss": 0.6559, + "step": 7086 + }, + { + "epoch": 0.7239019407558733, + "grad_norm": 1.4995369927961768, + "learning_rate": 3.7389020388162656e-06, + "loss": 0.6265, + "step": 7087 + }, + { + "epoch": 0.7240040858018386, + "grad_norm": 1.5606598861071947, + "learning_rate": 3.7363227590823115e-06, + "loss": 0.6614, + "step": 7088 + }, + { + "epoch": 0.7241062308478039, + "grad_norm": 1.5002165668915637, + "learning_rate": 3.733744164911879e-06, + "loss": 0.6644, + "step": 7089 + }, + { + "epoch": 0.7242083758937692, + "grad_norm": 1.3573597979686727, + "learning_rate": 3.7311662565871966e-06, + "loss": 0.6999, + "step": 7090 + }, + { + "epoch": 0.7243105209397345, + "grad_norm": 1.5651019121910072, + "learning_rate": 3.728589034390413e-06, + "loss": 0.6785, + "step": 7091 + }, + { + "epoch": 0.7244126659856996, + "grad_norm": 1.352499112302756, + "learning_rate": 3.7260124986036116e-06, + "loss": 0.7253, + "step": 7092 + }, + { + "epoch": 0.7245148110316649, + "grad_norm": 1.4814936750694578, + "learning_rate": 3.7234366495088005e-06, + "loss": 0.6392, + "step": 7093 + }, + { + "epoch": 0.7246169560776302, + "grad_norm": 1.4845030672235702, + "learning_rate": 3.7208614873879013e-06, + "loss": 0.7076, + "step": 7094 + }, + { + "epoch": 0.7247191011235955, + "grad_norm": 1.580611567656964, + "learning_rate": 3.718287012522771e-06, + "loss": 0.7371, + "step": 7095 + }, + { + "epoch": 0.7248212461695608, + "grad_norm": 1.3830700958636837, + "learning_rate": 3.7157132251951812e-06, + "loss": 0.671, + "step": 7096 + }, + { + "epoch": 0.7249233912155261, + "grad_norm": 1.6113487156070951, + "learning_rate": 3.7131401256868428e-06, + "loss": 0.7594, + "step": 7097 + }, + { + "epoch": 0.7250255362614914, + "grad_norm": 1.3592419887689868, + "learning_rate": 3.7105677142793795e-06, + "loss": 0.6995, + "step": 7098 + }, + { + "epoch": 0.7251276813074566, + "grad_norm": 1.3236772279693776, + "learning_rate": 3.7079959912543427e-06, + "loss": 0.6768, + "step": 7099 + }, + { + "epoch": 0.7252298263534218, + "grad_norm": 1.3300000601187854, + "learning_rate": 3.7054249568932077e-06, + "loss": 0.5502, + "step": 7100 + }, + { + "epoch": 0.7253319713993871, + "grad_norm": 1.467577643493302, + "learning_rate": 3.70285461147738e-06, + "loss": 0.6946, + "step": 7101 + }, + { + "epoch": 0.7254341164453524, + "grad_norm": 1.4494053819354962, + "learning_rate": 3.7002849552881815e-06, + "loss": 0.7189, + "step": 7102 + }, + { + "epoch": 0.7255362614913177, + "grad_norm": 1.5581260701782937, + "learning_rate": 3.697715988606867e-06, + "loss": 0.729, + "step": 7103 + }, + { + "epoch": 0.725638406537283, + "grad_norm": 1.5866894844431803, + "learning_rate": 3.6951477117146107e-06, + "loss": 0.7443, + "step": 7104 + }, + { + "epoch": 0.7257405515832482, + "grad_norm": 1.6474687016486003, + "learning_rate": 3.6925801248925096e-06, + "loss": 0.8058, + "step": 7105 + }, + { + "epoch": 0.7258426966292135, + "grad_norm": 1.4128404825486578, + "learning_rate": 3.690013228421586e-06, + "loss": 0.6532, + "step": 7106 + }, + { + "epoch": 0.7259448416751788, + "grad_norm": 1.6705538661096744, + "learning_rate": 3.687447022582794e-06, + "loss": 0.6811, + "step": 7107 + }, + { + "epoch": 0.726046986721144, + "grad_norm": 1.4967103316949135, + "learning_rate": 3.684881507657001e-06, + "loss": 0.6834, + "step": 7108 + }, + { + "epoch": 0.7261491317671093, + "grad_norm": 1.6543726034663198, + "learning_rate": 3.682316683925011e-06, + "loss": 0.8138, + "step": 7109 + }, + { + "epoch": 0.7262512768130746, + "grad_norm": 1.5044237801891542, + "learning_rate": 3.6797525516675414e-06, + "loss": 0.6637, + "step": 7110 + }, + { + "epoch": 0.7263534218590398, + "grad_norm": 1.3778905832099217, + "learning_rate": 3.6771891111652347e-06, + "loss": 0.5826, + "step": 7111 + }, + { + "epoch": 0.7264555669050051, + "grad_norm": 1.4630210990796815, + "learning_rate": 3.674626362698668e-06, + "loss": 0.7679, + "step": 7112 + }, + { + "epoch": 0.7265577119509704, + "grad_norm": 1.5365308947751668, + "learning_rate": 3.672064306548333e-06, + "loss": 0.7601, + "step": 7113 + }, + { + "epoch": 0.7266598569969357, + "grad_norm": 1.449295538844758, + "learning_rate": 3.6695029429946484e-06, + "loss": 0.7215, + "step": 7114 + }, + { + "epoch": 0.7267620020429009, + "grad_norm": 1.6272325789129607, + "learning_rate": 3.666942272317956e-06, + "loss": 0.7152, + "step": 7115 + }, + { + "epoch": 0.7268641470888662, + "grad_norm": 1.486904217626116, + "learning_rate": 3.6643822947985208e-06, + "loss": 0.6623, + "step": 7116 + }, + { + "epoch": 0.7269662921348314, + "grad_norm": 1.4123845721614765, + "learning_rate": 3.6618230107165366e-06, + "loss": 0.5766, + "step": 7117 + }, + { + "epoch": 0.7270684371807967, + "grad_norm": 1.3959585615359724, + "learning_rate": 3.659264420352122e-06, + "loss": 0.6357, + "step": 7118 + }, + { + "epoch": 0.727170582226762, + "grad_norm": 1.4126078328443816, + "learning_rate": 3.656706523985313e-06, + "loss": 0.6886, + "step": 7119 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 1.6406001983046046, + "learning_rate": 3.654149321896073e-06, + "loss": 0.7197, + "step": 7120 + }, + { + "epoch": 0.7273748723186926, + "grad_norm": 1.3952546470205065, + "learning_rate": 3.6515928143642876e-06, + "loss": 0.652, + "step": 7121 + }, + { + "epoch": 0.7274770173646579, + "grad_norm": 1.3925626880905309, + "learning_rate": 3.649037001669773e-06, + "loss": 0.5609, + "step": 7122 + }, + { + "epoch": 0.727579162410623, + "grad_norm": 1.4561756854550991, + "learning_rate": 3.6464818840922623e-06, + "loss": 0.6818, + "step": 7123 + }, + { + "epoch": 0.7276813074565883, + "grad_norm": 1.495220544409117, + "learning_rate": 3.6439274619114095e-06, + "loss": 0.7383, + "step": 7124 + }, + { + "epoch": 0.7277834525025536, + "grad_norm": 1.48774184775132, + "learning_rate": 3.6413737354068067e-06, + "loss": 0.7774, + "step": 7125 + }, + { + "epoch": 0.7278855975485189, + "grad_norm": 1.624373724228595, + "learning_rate": 3.6388207048579537e-06, + "loss": 0.6348, + "step": 7126 + }, + { + "epoch": 0.7279877425944842, + "grad_norm": 1.6616488704099306, + "learning_rate": 3.636268370544288e-06, + "loss": 0.7529, + "step": 7127 + }, + { + "epoch": 0.7280898876404495, + "grad_norm": 1.3931581297431792, + "learning_rate": 3.6337167327451596e-06, + "loss": 0.6658, + "step": 7128 + }, + { + "epoch": 0.7281920326864147, + "grad_norm": 1.4688661290422278, + "learning_rate": 3.631165791739849e-06, + "loss": 0.689, + "step": 7129 + }, + { + "epoch": 0.72829417773238, + "grad_norm": 1.4982968394553628, + "learning_rate": 3.6286155478075536e-06, + "loss": 0.6903, + "step": 7130 + }, + { + "epoch": 0.7283963227783452, + "grad_norm": 1.4409055752172903, + "learning_rate": 3.626066001227405e-06, + "loss": 0.6621, + "step": 7131 + }, + { + "epoch": 0.7284984678243105, + "grad_norm": 1.4421167579036958, + "learning_rate": 3.6235171522784495e-06, + "loss": 0.7708, + "step": 7132 + }, + { + "epoch": 0.7286006128702758, + "grad_norm": 1.3512700613370983, + "learning_rate": 3.6209690012396636e-06, + "loss": 0.7661, + "step": 7133 + }, + { + "epoch": 0.7287027579162411, + "grad_norm": 1.546624688665969, + "learning_rate": 3.618421548389942e-06, + "loss": 0.6801, + "step": 7134 + }, + { + "epoch": 0.7288049029622063, + "grad_norm": 1.4474796972220918, + "learning_rate": 3.615874794008105e-06, + "loss": 0.7565, + "step": 7135 + }, + { + "epoch": 0.7289070480081716, + "grad_norm": 1.5045904675792385, + "learning_rate": 3.613328738372893e-06, + "loss": 0.7765, + "step": 7136 + }, + { + "epoch": 0.7290091930541369, + "grad_norm": 1.3356333399849432, + "learning_rate": 3.61078338176298e-06, + "loss": 0.6626, + "step": 7137 + }, + { + "epoch": 0.7291113381001021, + "grad_norm": 1.5806846091549118, + "learning_rate": 3.608238724456954e-06, + "loss": 0.5791, + "step": 7138 + }, + { + "epoch": 0.7292134831460674, + "grad_norm": 1.5623368907765467, + "learning_rate": 3.6056947667333297e-06, + "loss": 0.7334, + "step": 7139 + }, + { + "epoch": 0.7293156281920327, + "grad_norm": 1.4933888209447022, + "learning_rate": 3.6031515088705406e-06, + "loss": 0.7063, + "step": 7140 + }, + { + "epoch": 0.729417773237998, + "grad_norm": 1.579515555301018, + "learning_rate": 3.600608951146952e-06, + "loss": 0.679, + "step": 7141 + }, + { + "epoch": 0.7295199182839632, + "grad_norm": 1.4636003323639317, + "learning_rate": 3.598067093840851e-06, + "loss": 0.5558, + "step": 7142 + }, + { + "epoch": 0.7296220633299285, + "grad_norm": 1.5299345311274193, + "learning_rate": 3.595525937230444e-06, + "loss": 0.6927, + "step": 7143 + }, + { + "epoch": 0.7297242083758938, + "grad_norm": 1.6358456360435245, + "learning_rate": 3.59298548159386e-06, + "loss": 0.7249, + "step": 7144 + }, + { + "epoch": 0.7298263534218591, + "grad_norm": 1.471630059044617, + "learning_rate": 3.590445727209151e-06, + "loss": 0.571, + "step": 7145 + }, + { + "epoch": 0.7299284984678243, + "grad_norm": 1.589561516493663, + "learning_rate": 3.5879066743543023e-06, + "loss": 0.7061, + "step": 7146 + }, + { + "epoch": 0.7300306435137895, + "grad_norm": 1.5118799599283819, + "learning_rate": 3.58536832330721e-06, + "loss": 0.7199, + "step": 7147 + }, + { + "epoch": 0.7301327885597548, + "grad_norm": 1.7172277757291687, + "learning_rate": 3.5828306743456965e-06, + "loss": 0.6499, + "step": 7148 + }, + { + "epoch": 0.7302349336057201, + "grad_norm": 1.4518023115557317, + "learning_rate": 3.5802937277475147e-06, + "loss": 0.6272, + "step": 7149 + }, + { + "epoch": 0.7303370786516854, + "grad_norm": 1.4584611492893302, + "learning_rate": 3.5777574837903295e-06, + "loss": 0.6615, + "step": 7150 + }, + { + "epoch": 0.7304392236976507, + "grad_norm": 1.6160825393862535, + "learning_rate": 3.5752219427517386e-06, + "loss": 0.7541, + "step": 7151 + }, + { + "epoch": 0.730541368743616, + "grad_norm": 1.3967771381085305, + "learning_rate": 3.5726871049092593e-06, + "loss": 0.6769, + "step": 7152 + }, + { + "epoch": 0.7306435137895813, + "grad_norm": 1.4304688706193205, + "learning_rate": 3.570152970540327e-06, + "loss": 0.6651, + "step": 7153 + }, + { + "epoch": 0.7307456588355464, + "grad_norm": 1.5314516705188512, + "learning_rate": 3.567619539922307e-06, + "loss": 0.6655, + "step": 7154 + }, + { + "epoch": 0.7308478038815117, + "grad_norm": 1.5354712002342537, + "learning_rate": 3.565086813332481e-06, + "loss": 0.7095, + "step": 7155 + }, + { + "epoch": 0.730949948927477, + "grad_norm": 1.4956568529942784, + "learning_rate": 3.5625547910480607e-06, + "loss": 0.6174, + "step": 7156 + }, + { + "epoch": 0.7310520939734423, + "grad_norm": 1.2963747770743712, + "learning_rate": 3.5600234733461812e-06, + "loss": 0.5643, + "step": 7157 + }, + { + "epoch": 0.7311542390194076, + "grad_norm": 1.5265117940517414, + "learning_rate": 3.557492860503893e-06, + "loss": 0.6967, + "step": 7158 + }, + { + "epoch": 0.7312563840653729, + "grad_norm": 1.445588013400257, + "learning_rate": 3.5549629527981733e-06, + "loss": 0.7311, + "step": 7159 + }, + { + "epoch": 0.7313585291113381, + "grad_norm": 1.4786910753124427, + "learning_rate": 3.552433750505919e-06, + "loss": 0.7194, + "step": 7160 + }, + { + "epoch": 0.7314606741573034, + "grad_norm": 1.4029233753362451, + "learning_rate": 3.5499052539039603e-06, + "loss": 0.7547, + "step": 7161 + }, + { + "epoch": 0.7315628192032686, + "grad_norm": 1.467078022122068, + "learning_rate": 3.5473774632690395e-06, + "loss": 0.6511, + "step": 7162 + }, + { + "epoch": 0.7316649642492339, + "grad_norm": 1.4830073117511937, + "learning_rate": 3.5448503788778234e-06, + "loss": 0.5864, + "step": 7163 + }, + { + "epoch": 0.7317671092951992, + "grad_norm": 1.511643053701804, + "learning_rate": 3.5423240010069004e-06, + "loss": 0.6341, + "step": 7164 + }, + { + "epoch": 0.7318692543411645, + "grad_norm": 1.604933532719099, + "learning_rate": 3.5397983299327876e-06, + "loss": 0.7123, + "step": 7165 + }, + { + "epoch": 0.7319713993871297, + "grad_norm": 1.3599871792762637, + "learning_rate": 3.537273365931926e-06, + "loss": 0.767, + "step": 7166 + }, + { + "epoch": 0.732073544433095, + "grad_norm": 1.5201300117831562, + "learning_rate": 3.5347491092806686e-06, + "loss": 0.6454, + "step": 7167 + }, + { + "epoch": 0.7321756894790603, + "grad_norm": 1.5895356066540456, + "learning_rate": 3.532225560255298e-06, + "loss": 0.7863, + "step": 7168 + }, + { + "epoch": 0.7322778345250255, + "grad_norm": 1.461497461510232, + "learning_rate": 3.52970271913202e-06, + "loss": 0.7136, + "step": 7169 + }, + { + "epoch": 0.7323799795709908, + "grad_norm": 1.452832453064315, + "learning_rate": 3.527180586186956e-06, + "loss": 0.622, + "step": 7170 + }, + { + "epoch": 0.732482124616956, + "grad_norm": 1.5542932723616634, + "learning_rate": 3.524659161696161e-06, + "loss": 0.6089, + "step": 7171 + }, + { + "epoch": 0.7325842696629213, + "grad_norm": 1.5202432629788885, + "learning_rate": 3.5221384459356022e-06, + "loss": 0.7928, + "step": 7172 + }, + { + "epoch": 0.7326864147088866, + "grad_norm": 1.5314338168712633, + "learning_rate": 3.5196184391811785e-06, + "loss": 0.6573, + "step": 7173 + }, + { + "epoch": 0.7327885597548519, + "grad_norm": 1.4972369643738652, + "learning_rate": 3.517099141708703e-06, + "loss": 0.6848, + "step": 7174 + }, + { + "epoch": 0.7328907048008172, + "grad_norm": 1.4640711603988523, + "learning_rate": 3.5145805537939124e-06, + "loss": 0.7297, + "step": 7175 + }, + { + "epoch": 0.7329928498467825, + "grad_norm": 1.5985663475466105, + "learning_rate": 3.512062675712474e-06, + "loss": 0.7752, + "step": 7176 + }, + { + "epoch": 0.7330949948927477, + "grad_norm": 1.4900808781794106, + "learning_rate": 3.5095455077399663e-06, + "loss": 0.6853, + "step": 7177 + }, + { + "epoch": 0.7331971399387129, + "grad_norm": 1.4542986060158645, + "learning_rate": 3.5070290501518978e-06, + "loss": 0.7126, + "step": 7178 + }, + { + "epoch": 0.7332992849846782, + "grad_norm": 1.5232548757198607, + "learning_rate": 3.50451330322369e-06, + "loss": 0.6991, + "step": 7179 + }, + { + "epoch": 0.7334014300306435, + "grad_norm": 1.512530032272857, + "learning_rate": 3.5019982672306986e-06, + "loss": 0.6829, + "step": 7180 + }, + { + "epoch": 0.7335035750766088, + "grad_norm": 1.51292559122578, + "learning_rate": 3.4994839424481974e-06, + "loss": 0.6688, + "step": 7181 + }, + { + "epoch": 0.7336057201225741, + "grad_norm": 1.59561868685321, + "learning_rate": 3.49697032915138e-06, + "loss": 0.7978, + "step": 7182 + }, + { + "epoch": 0.7337078651685394, + "grad_norm": 1.4013467532466148, + "learning_rate": 3.494457427615361e-06, + "loss": 0.6973, + "step": 7183 + }, + { + "epoch": 0.7338100102145046, + "grad_norm": 1.4714948005857993, + "learning_rate": 3.4919452381151753e-06, + "loss": 0.6588, + "step": 7184 + }, + { + "epoch": 0.7339121552604698, + "grad_norm": 1.3678885165041412, + "learning_rate": 3.4894337609257923e-06, + "loss": 0.5918, + "step": 7185 + }, + { + "epoch": 0.7340143003064351, + "grad_norm": 1.3923041447997604, + "learning_rate": 3.4869229963220906e-06, + "loss": 0.5693, + "step": 7186 + }, + { + "epoch": 0.7341164453524004, + "grad_norm": 1.345741310302056, + "learning_rate": 3.4844129445788754e-06, + "loss": 0.6793, + "step": 7187 + }, + { + "epoch": 0.7342185903983657, + "grad_norm": 1.4503587892912575, + "learning_rate": 3.4819036059708687e-06, + "loss": 0.6342, + "step": 7188 + }, + { + "epoch": 0.734320735444331, + "grad_norm": 1.569453285302973, + "learning_rate": 3.4793949807727267e-06, + "loss": 0.7437, + "step": 7189 + }, + { + "epoch": 0.7344228804902962, + "grad_norm": 1.519197692524186, + "learning_rate": 3.476887069259015e-06, + "loss": 0.6797, + "step": 7190 + }, + { + "epoch": 0.7345250255362615, + "grad_norm": 1.4453106322507356, + "learning_rate": 3.47437987170423e-06, + "loss": 0.7073, + "step": 7191 + }, + { + "epoch": 0.7346271705822267, + "grad_norm": 1.4883022686074887, + "learning_rate": 3.471873388382785e-06, + "loss": 0.7629, + "step": 7192 + }, + { + "epoch": 0.734729315628192, + "grad_norm": 1.3957535034379795, + "learning_rate": 3.4693676195690153e-06, + "loss": 0.6807, + "step": 7193 + }, + { + "epoch": 0.7348314606741573, + "grad_norm": 1.5067281858837211, + "learning_rate": 3.4668625655371746e-06, + "loss": 0.8111, + "step": 7194 + }, + { + "epoch": 0.7349336057201226, + "grad_norm": 1.427991742552689, + "learning_rate": 3.4643582265614517e-06, + "loss": 0.6167, + "step": 7195 + }, + { + "epoch": 0.7350357507660878, + "grad_norm": 1.4159163593659914, + "learning_rate": 3.4618546029159396e-06, + "loss": 0.5929, + "step": 7196 + }, + { + "epoch": 0.7351378958120531, + "grad_norm": 1.6577175125691186, + "learning_rate": 3.4593516948746684e-06, + "loss": 0.7623, + "step": 7197 + }, + { + "epoch": 0.7352400408580184, + "grad_norm": 1.495075436371532, + "learning_rate": 3.45684950271158e-06, + "loss": 0.7823, + "step": 7198 + }, + { + "epoch": 0.7353421859039837, + "grad_norm": 1.5950963923451145, + "learning_rate": 3.4543480267005382e-06, + "loss": 0.696, + "step": 7199 + }, + { + "epoch": 0.7354443309499489, + "grad_norm": 1.458427327218405, + "learning_rate": 3.451847267115337e-06, + "loss": 0.6556, + "step": 7200 + }, + { + "epoch": 0.7355464759959142, + "grad_norm": 1.4449637934544628, + "learning_rate": 3.4493472242296822e-06, + "loss": 0.7478, + "step": 7201 + }, + { + "epoch": 0.7356486210418794, + "grad_norm": 1.5049990469872028, + "learning_rate": 3.446847898317207e-06, + "loss": 0.632, + "step": 7202 + }, + { + "epoch": 0.7357507660878447, + "grad_norm": 1.449369441550173, + "learning_rate": 3.444349289651463e-06, + "loss": 0.6109, + "step": 7203 + }, + { + "epoch": 0.73585291113381, + "grad_norm": 1.3839527223524368, + "learning_rate": 3.4418513985059177e-06, + "loss": 0.6787, + "step": 7204 + }, + { + "epoch": 0.7359550561797753, + "grad_norm": 1.431790013002358, + "learning_rate": 3.439354225153981e-06, + "loss": 0.7069, + "step": 7205 + }, + { + "epoch": 0.7360572012257406, + "grad_norm": 1.4624326882996626, + "learning_rate": 3.436857769868963e-06, + "loss": 0.6592, + "step": 7206 + }, + { + "epoch": 0.7361593462717059, + "grad_norm": 1.5310688826032746, + "learning_rate": 3.4343620329241032e-06, + "loss": 0.6343, + "step": 7207 + }, + { + "epoch": 0.736261491317671, + "grad_norm": 1.4627653718402227, + "learning_rate": 3.4318670145925602e-06, + "loss": 0.5845, + "step": 7208 + }, + { + "epoch": 0.7363636363636363, + "grad_norm": 1.4752384829460108, + "learning_rate": 3.429372715147412e-06, + "loss": 0.6331, + "step": 7209 + }, + { + "epoch": 0.7364657814096016, + "grad_norm": 1.5428244207008903, + "learning_rate": 3.4268791348616693e-06, + "loss": 0.7231, + "step": 7210 + }, + { + "epoch": 0.7365679264555669, + "grad_norm": 1.5561949164468831, + "learning_rate": 3.4243862740082524e-06, + "loss": 0.719, + "step": 7211 + }, + { + "epoch": 0.7366700715015322, + "grad_norm": 1.6050505235409525, + "learning_rate": 3.421894132860002e-06, + "loss": 0.7217, + "step": 7212 + }, + { + "epoch": 0.7367722165474975, + "grad_norm": 1.4638893459042792, + "learning_rate": 3.4194027116896924e-06, + "loss": 0.6663, + "step": 7213 + }, + { + "epoch": 0.7368743615934628, + "grad_norm": 1.6646856601349274, + "learning_rate": 3.416912010770005e-06, + "loss": 0.7543, + "step": 7214 + }, + { + "epoch": 0.736976506639428, + "grad_norm": 1.449352196068496, + "learning_rate": 3.4144220303735533e-06, + "loss": 0.6968, + "step": 7215 + }, + { + "epoch": 0.7370786516853932, + "grad_norm": 1.4720725308435278, + "learning_rate": 3.4119327707728654e-06, + "loss": 0.7161, + "step": 7216 + }, + { + "epoch": 0.7371807967313585, + "grad_norm": 1.550236175244078, + "learning_rate": 3.4094442322403933e-06, + "loss": 0.6046, + "step": 7217 + }, + { + "epoch": 0.7372829417773238, + "grad_norm": 1.5106948286610038, + "learning_rate": 3.4069564150485034e-06, + "loss": 0.7142, + "step": 7218 + }, + { + "epoch": 0.7373850868232891, + "grad_norm": 1.4180692817344909, + "learning_rate": 3.4044693194694976e-06, + "loss": 0.6724, + "step": 7219 + }, + { + "epoch": 0.7374872318692544, + "grad_norm": 1.5140786796032497, + "learning_rate": 3.401982945775583e-06, + "loss": 0.6531, + "step": 7220 + }, + { + "epoch": 0.7375893769152196, + "grad_norm": 1.473493152878974, + "learning_rate": 3.3994972942389005e-06, + "loss": 0.6631, + "step": 7221 + }, + { + "epoch": 0.7376915219611849, + "grad_norm": 1.375794971763795, + "learning_rate": 3.3970123651315045e-06, + "loss": 0.6411, + "step": 7222 + }, + { + "epoch": 0.7377936670071501, + "grad_norm": 1.270011343452332, + "learning_rate": 3.3945281587253708e-06, + "loss": 0.6038, + "step": 7223 + }, + { + "epoch": 0.7378958120531154, + "grad_norm": 1.4326292711052482, + "learning_rate": 3.392044675292394e-06, + "loss": 0.6222, + "step": 7224 + }, + { + "epoch": 0.7379979570990807, + "grad_norm": 1.4776069485012797, + "learning_rate": 3.3895619151044003e-06, + "loss": 0.7445, + "step": 7225 + }, + { + "epoch": 0.738100102145046, + "grad_norm": 1.4807192178262323, + "learning_rate": 3.387079878433126e-06, + "loss": 0.6391, + "step": 7226 + }, + { + "epoch": 0.7382022471910112, + "grad_norm": 1.5574698102052305, + "learning_rate": 3.3845985655502313e-06, + "loss": 0.6874, + "step": 7227 + }, + { + "epoch": 0.7383043922369765, + "grad_norm": 1.3324613031301005, + "learning_rate": 3.382117976727295e-06, + "loss": 0.6133, + "step": 7228 + }, + { + "epoch": 0.7384065372829418, + "grad_norm": 1.5188391532322971, + "learning_rate": 3.379638112235821e-06, + "loss": 0.7132, + "step": 7229 + }, + { + "epoch": 0.7385086823289071, + "grad_norm": 1.3869807491225528, + "learning_rate": 3.3771589723472364e-06, + "loss": 0.6558, + "step": 7230 + }, + { + "epoch": 0.7386108273748723, + "grad_norm": 1.4928145566997701, + "learning_rate": 3.3746805573328824e-06, + "loss": 0.5816, + "step": 7231 + }, + { + "epoch": 0.7387129724208376, + "grad_norm": 1.4688092257406549, + "learning_rate": 3.3722028674640207e-06, + "loss": 0.7525, + "step": 7232 + }, + { + "epoch": 0.7388151174668028, + "grad_norm": 1.4174700517716656, + "learning_rate": 3.3697259030118336e-06, + "loss": 0.6307, + "step": 7233 + }, + { + "epoch": 0.7389172625127681, + "grad_norm": 1.4872439760387615, + "learning_rate": 3.367249664247434e-06, + "loss": 0.752, + "step": 7234 + }, + { + "epoch": 0.7390194075587334, + "grad_norm": 1.458537152772094, + "learning_rate": 3.364774151441844e-06, + "loss": 0.6458, + "step": 7235 + }, + { + "epoch": 0.7391215526046987, + "grad_norm": 1.5278031342504776, + "learning_rate": 3.3622993648660063e-06, + "loss": 0.6955, + "step": 7236 + }, + { + "epoch": 0.739223697650664, + "grad_norm": 1.5433173660474409, + "learning_rate": 3.3598253047907958e-06, + "loss": 0.7985, + "step": 7237 + }, + { + "epoch": 0.7393258426966293, + "grad_norm": 1.4967563310210548, + "learning_rate": 3.3573519714869916e-06, + "loss": 0.6981, + "step": 7238 + }, + { + "epoch": 0.7394279877425944, + "grad_norm": 1.4624890046214614, + "learning_rate": 3.3548793652253098e-06, + "loss": 0.6894, + "step": 7239 + }, + { + "epoch": 0.7395301327885597, + "grad_norm": 1.60549234977735, + "learning_rate": 3.3524074862763743e-06, + "loss": 0.7191, + "step": 7240 + }, + { + "epoch": 0.739632277834525, + "grad_norm": 1.625812735779238, + "learning_rate": 3.349936334910735e-06, + "loss": 0.6513, + "step": 7241 + }, + { + "epoch": 0.7397344228804903, + "grad_norm": 1.5059778448795302, + "learning_rate": 3.3474659113988596e-06, + "loss": 0.6758, + "step": 7242 + }, + { + "epoch": 0.7398365679264556, + "grad_norm": 1.4672863320748002, + "learning_rate": 3.344996216011135e-06, + "loss": 0.7758, + "step": 7243 + }, + { + "epoch": 0.7399387129724209, + "grad_norm": 1.3542951773587226, + "learning_rate": 3.342527249017875e-06, + "loss": 0.7247, + "step": 7244 + }, + { + "epoch": 0.7400408580183861, + "grad_norm": 1.551796242630123, + "learning_rate": 3.3400590106893118e-06, + "loss": 0.7712, + "step": 7245 + }, + { + "epoch": 0.7401430030643513, + "grad_norm": 1.358722244171248, + "learning_rate": 3.3375915012955916e-06, + "loss": 0.6043, + "step": 7246 + }, + { + "epoch": 0.7402451481103166, + "grad_norm": 1.54651746690138, + "learning_rate": 3.3351247211067874e-06, + "loss": 0.6831, + "step": 7247 + }, + { + "epoch": 0.7403472931562819, + "grad_norm": 1.352675279889106, + "learning_rate": 3.3326586703928853e-06, + "loss": 0.7196, + "step": 7248 + }, + { + "epoch": 0.7404494382022472, + "grad_norm": 1.551020232396347, + "learning_rate": 3.3301933494238013e-06, + "loss": 0.7378, + "step": 7249 + }, + { + "epoch": 0.7405515832482125, + "grad_norm": 1.7399529116090866, + "learning_rate": 3.327728758469366e-06, + "loss": 0.8401, + "step": 7250 + }, + { + "epoch": 0.7406537282941777, + "grad_norm": 1.4847729055160444, + "learning_rate": 3.3252648977993287e-06, + "loss": 0.6599, + "step": 7251 + }, + { + "epoch": 0.740755873340143, + "grad_norm": 1.4652868008733393, + "learning_rate": 3.322801767683357e-06, + "loss": 0.6007, + "step": 7252 + }, + { + "epoch": 0.7408580183861083, + "grad_norm": 1.574100038754834, + "learning_rate": 3.3203393683910458e-06, + "loss": 0.6654, + "step": 7253 + }, + { + "epoch": 0.7409601634320735, + "grad_norm": 1.4632781804503752, + "learning_rate": 3.3178777001919093e-06, + "loss": 0.6738, + "step": 7254 + }, + { + "epoch": 0.7410623084780388, + "grad_norm": 1.5213587181308414, + "learning_rate": 3.315416763355377e-06, + "loss": 0.6601, + "step": 7255 + }, + { + "epoch": 0.7411644535240041, + "grad_norm": 1.513228078561968, + "learning_rate": 3.3129565581507973e-06, + "loss": 0.7303, + "step": 7256 + }, + { + "epoch": 0.7412665985699693, + "grad_norm": 1.4967698844966448, + "learning_rate": 3.3104970848474437e-06, + "loss": 0.6814, + "step": 7257 + }, + { + "epoch": 0.7413687436159346, + "grad_norm": 1.4222064201541127, + "learning_rate": 3.3080383437145026e-06, + "loss": 0.7067, + "step": 7258 + }, + { + "epoch": 0.7414708886618999, + "grad_norm": 1.490917076007063, + "learning_rate": 3.305580335021091e-06, + "loss": 0.6712, + "step": 7259 + }, + { + "epoch": 0.7415730337078652, + "grad_norm": 1.390164529893434, + "learning_rate": 3.303123059036234e-06, + "loss": 0.6398, + "step": 7260 + }, + { + "epoch": 0.7416751787538305, + "grad_norm": 1.5219966094539366, + "learning_rate": 3.3006665160288886e-06, + "loss": 0.724, + "step": 7261 + }, + { + "epoch": 0.7417773237997957, + "grad_norm": 1.4100850582186577, + "learning_rate": 3.2982107062679213e-06, + "loss": 0.5999, + "step": 7262 + }, + { + "epoch": 0.741879468845761, + "grad_norm": 1.2663477616788879, + "learning_rate": 3.295755630022118e-06, + "loss": 0.6376, + "step": 7263 + }, + { + "epoch": 0.7419816138917262, + "grad_norm": 1.4460492280077328, + "learning_rate": 3.2933012875601967e-06, + "loss": 0.6815, + "step": 7264 + }, + { + "epoch": 0.7420837589376915, + "grad_norm": 1.4435069223224857, + "learning_rate": 3.2908476791507826e-06, + "loss": 0.6855, + "step": 7265 + }, + { + "epoch": 0.7421859039836568, + "grad_norm": 1.3340386701722808, + "learning_rate": 3.2883948050624236e-06, + "loss": 0.6297, + "step": 7266 + }, + { + "epoch": 0.7422880490296221, + "grad_norm": 1.4474932404420011, + "learning_rate": 3.285942665563587e-06, + "loss": 0.6878, + "step": 7267 + }, + { + "epoch": 0.7423901940755874, + "grad_norm": 1.6095537947457488, + "learning_rate": 3.2834912609226633e-06, + "loss": 0.6765, + "step": 7268 + }, + { + "epoch": 0.7424923391215527, + "grad_norm": 1.3510676801884707, + "learning_rate": 3.2810405914079645e-06, + "loss": 0.6259, + "step": 7269 + }, + { + "epoch": 0.7425944841675178, + "grad_norm": 1.4110180195028938, + "learning_rate": 3.2785906572877135e-06, + "loss": 0.7216, + "step": 7270 + }, + { + "epoch": 0.7426966292134831, + "grad_norm": 1.4087824912598197, + "learning_rate": 3.276141458830057e-06, + "loss": 0.6014, + "step": 7271 + }, + { + "epoch": 0.7427987742594484, + "grad_norm": 1.3623740061496348, + "learning_rate": 3.2736929963030596e-06, + "loss": 0.5993, + "step": 7272 + }, + { + "epoch": 0.7429009193054137, + "grad_norm": 1.38695352408741, + "learning_rate": 3.271245269974712e-06, + "loss": 0.6104, + "step": 7273 + }, + { + "epoch": 0.743003064351379, + "grad_norm": 1.43493684614806, + "learning_rate": 3.268798280112917e-06, + "loss": 0.6218, + "step": 7274 + }, + { + "epoch": 0.7431052093973443, + "grad_norm": 1.5346958965523905, + "learning_rate": 3.2663520269855e-06, + "loss": 0.7025, + "step": 7275 + }, + { + "epoch": 0.7432073544433095, + "grad_norm": 1.5065464684883334, + "learning_rate": 3.2639065108601995e-06, + "loss": 0.6544, + "step": 7276 + }, + { + "epoch": 0.7433094994892747, + "grad_norm": 1.5840414921562407, + "learning_rate": 3.261461732004688e-06, + "loss": 0.7125, + "step": 7277 + }, + { + "epoch": 0.74341164453524, + "grad_norm": 1.430433024442273, + "learning_rate": 3.25901769068654e-06, + "loss": 0.643, + "step": 7278 + }, + { + "epoch": 0.7435137895812053, + "grad_norm": 1.5975369393170706, + "learning_rate": 3.2565743871732634e-06, + "loss": 0.711, + "step": 7279 + }, + { + "epoch": 0.7436159346271706, + "grad_norm": 1.5894228550897576, + "learning_rate": 3.2541318217322782e-06, + "loss": 0.6307, + "step": 7280 + }, + { + "epoch": 0.7437180796731359, + "grad_norm": 1.5935119865717782, + "learning_rate": 3.251689994630923e-06, + "loss": 0.6238, + "step": 7281 + }, + { + "epoch": 0.7438202247191011, + "grad_norm": 1.5279066014904128, + "learning_rate": 3.249248906136454e-06, + "loss": 0.7027, + "step": 7282 + }, + { + "epoch": 0.7439223697650664, + "grad_norm": 1.4985212237329442, + "learning_rate": 3.246808556516058e-06, + "loss": 0.628, + "step": 7283 + }, + { + "epoch": 0.7440245148110317, + "grad_norm": 1.612355435294389, + "learning_rate": 3.2443689460368256e-06, + "loss": 0.6772, + "step": 7284 + }, + { + "epoch": 0.7441266598569969, + "grad_norm": 1.3842280240303255, + "learning_rate": 3.2419300749657788e-06, + "loss": 0.6964, + "step": 7285 + }, + { + "epoch": 0.7442288049029622, + "grad_norm": 1.5756630060754682, + "learning_rate": 3.2394919435698526e-06, + "loss": 0.8375, + "step": 7286 + }, + { + "epoch": 0.7443309499489275, + "grad_norm": 1.5380175848027595, + "learning_rate": 3.2370545521158968e-06, + "loss": 0.608, + "step": 7287 + }, + { + "epoch": 0.7444330949948927, + "grad_norm": 1.5228010398601513, + "learning_rate": 3.2346179008706936e-06, + "loss": 0.7184, + "step": 7288 + }, + { + "epoch": 0.744535240040858, + "grad_norm": 1.45623032112833, + "learning_rate": 3.2321819901009323e-06, + "loss": 0.625, + "step": 7289 + }, + { + "epoch": 0.7446373850868233, + "grad_norm": 1.4247644509637543, + "learning_rate": 3.229746820073224e-06, + "loss": 0.6352, + "step": 7290 + }, + { + "epoch": 0.7447395301327886, + "grad_norm": 1.4712841668249657, + "learning_rate": 3.2273123910541006e-06, + "loss": 0.609, + "step": 7291 + }, + { + "epoch": 0.7448416751787539, + "grad_norm": 1.5116294547848812, + "learning_rate": 3.2248787033100058e-06, + "loss": 0.802, + "step": 7292 + }, + { + "epoch": 0.744943820224719, + "grad_norm": 1.6310981821690147, + "learning_rate": 3.2224457571073196e-06, + "loss": 0.7314, + "step": 7293 + }, + { + "epoch": 0.7450459652706843, + "grad_norm": 1.497685927433057, + "learning_rate": 3.2200135527123256e-06, + "loss": 0.6653, + "step": 7294 + }, + { + "epoch": 0.7451481103166496, + "grad_norm": 1.5585276616226476, + "learning_rate": 3.217582090391228e-06, + "loss": 0.6928, + "step": 7295 + }, + { + "epoch": 0.7452502553626149, + "grad_norm": 1.6122584920978522, + "learning_rate": 3.215151370410152e-06, + "loss": 0.7437, + "step": 7296 + }, + { + "epoch": 0.7453524004085802, + "grad_norm": 1.4281443866998378, + "learning_rate": 3.2127213930351398e-06, + "loss": 0.7053, + "step": 7297 + }, + { + "epoch": 0.7454545454545455, + "grad_norm": 1.6461917705663103, + "learning_rate": 3.2102921585321587e-06, + "loss": 0.7238, + "step": 7298 + }, + { + "epoch": 0.7455566905005108, + "grad_norm": 1.4923356043669864, + "learning_rate": 3.207863667167088e-06, + "loss": 0.6422, + "step": 7299 + }, + { + "epoch": 0.745658835546476, + "grad_norm": 1.58967118620895, + "learning_rate": 3.2054359192057238e-06, + "loss": 0.6988, + "step": 7300 + }, + { + "epoch": 0.7457609805924412, + "grad_norm": 1.526901592081634, + "learning_rate": 3.2030089149137923e-06, + "loss": 0.7652, + "step": 7301 + }, + { + "epoch": 0.7458631256384065, + "grad_norm": 1.515780907718703, + "learning_rate": 3.200582654556922e-06, + "loss": 0.6502, + "step": 7302 + }, + { + "epoch": 0.7459652706843718, + "grad_norm": 1.5548364514754278, + "learning_rate": 3.198157138400677e-06, + "loss": 0.6859, + "step": 7303 + }, + { + "epoch": 0.7460674157303371, + "grad_norm": 1.6106308380089376, + "learning_rate": 3.1957323667105277e-06, + "loss": 0.7171, + "step": 7304 + }, + { + "epoch": 0.7461695607763024, + "grad_norm": 1.4965631492150013, + "learning_rate": 3.193308339751866e-06, + "loss": 0.7202, + "step": 7305 + }, + { + "epoch": 0.7462717058222677, + "grad_norm": 1.5051339652885638, + "learning_rate": 3.190885057790002e-06, + "loss": 0.7318, + "step": 7306 + }, + { + "epoch": 0.7463738508682329, + "grad_norm": 1.477044628838819, + "learning_rate": 3.188462521090171e-06, + "loss": 0.6794, + "step": 7307 + }, + { + "epoch": 0.7464759959141981, + "grad_norm": 1.3917881719476153, + "learning_rate": 3.1860407299175145e-06, + "loss": 0.7053, + "step": 7308 + }, + { + "epoch": 0.7465781409601634, + "grad_norm": 1.4609751502695738, + "learning_rate": 3.183619684537106e-06, + "loss": 0.7413, + "step": 7309 + }, + { + "epoch": 0.7466802860061287, + "grad_norm": 1.3920167427784755, + "learning_rate": 3.1811993852139257e-06, + "loss": 0.6667, + "step": 7310 + }, + { + "epoch": 0.746782431052094, + "grad_norm": 1.442138760045788, + "learning_rate": 3.1787798322128794e-06, + "loss": 0.658, + "step": 7311 + }, + { + "epoch": 0.7468845760980592, + "grad_norm": 1.4984865352180206, + "learning_rate": 3.1763610257987844e-06, + "loss": 0.7049, + "step": 7312 + }, + { + "epoch": 0.7469867211440245, + "grad_norm": 1.491174203291151, + "learning_rate": 3.173942966236386e-06, + "loss": 0.7258, + "step": 7313 + }, + { + "epoch": 0.7470888661899898, + "grad_norm": 1.5426439857598107, + "learning_rate": 3.1715256537903404e-06, + "loss": 0.5599, + "step": 7314 + }, + { + "epoch": 0.7471910112359551, + "grad_norm": 1.4738893994846944, + "learning_rate": 3.169109088725224e-06, + "loss": 0.5835, + "step": 7315 + }, + { + "epoch": 0.7472931562819203, + "grad_norm": 1.4424190877182772, + "learning_rate": 3.1666932713055285e-06, + "loss": 0.5675, + "step": 7316 + }, + { + "epoch": 0.7473953013278856, + "grad_norm": 1.3693468486433427, + "learning_rate": 3.1642782017956684e-06, + "loss": 0.6681, + "step": 7317 + }, + { + "epoch": 0.7474974463738508, + "grad_norm": 1.5483712880270935, + "learning_rate": 3.16186388045998e-06, + "loss": 0.7208, + "step": 7318 + }, + { + "epoch": 0.7475995914198161, + "grad_norm": 1.4007978433216566, + "learning_rate": 3.159450307562707e-06, + "loss": 0.5058, + "step": 7319 + }, + { + "epoch": 0.7477017364657814, + "grad_norm": 1.4125182555935674, + "learning_rate": 3.1570374833680173e-06, + "loss": 0.6628, + "step": 7320 + }, + { + "epoch": 0.7478038815117467, + "grad_norm": 1.4670233854278145, + "learning_rate": 3.154625408139993e-06, + "loss": 0.5997, + "step": 7321 + }, + { + "epoch": 0.747906026557712, + "grad_norm": 1.5112228224401663, + "learning_rate": 3.152214082142644e-06, + "loss": 0.7062, + "step": 7322 + }, + { + "epoch": 0.7480081716036773, + "grad_norm": 1.5362015298202545, + "learning_rate": 3.149803505639888e-06, + "loss": 0.6824, + "step": 7323 + }, + { + "epoch": 0.7481103166496424, + "grad_norm": 1.4122900656734656, + "learning_rate": 3.1473936788955606e-06, + "loss": 0.6665, + "step": 7324 + }, + { + "epoch": 0.7482124616956077, + "grad_norm": 1.4945991531306546, + "learning_rate": 3.1449846021734256e-06, + "loss": 0.6677, + "step": 7325 + }, + { + "epoch": 0.748314606741573, + "grad_norm": 1.574172493101341, + "learning_rate": 3.1425762757371514e-06, + "loss": 0.6022, + "step": 7326 + }, + { + "epoch": 0.7484167517875383, + "grad_norm": 1.4955029150719963, + "learning_rate": 3.1401686998503377e-06, + "loss": 0.7323, + "step": 7327 + }, + { + "epoch": 0.7485188968335036, + "grad_norm": 1.515072363249974, + "learning_rate": 3.1377618747764914e-06, + "loss": 0.6286, + "step": 7328 + }, + { + "epoch": 0.7486210418794689, + "grad_norm": 1.462095536505743, + "learning_rate": 3.135355800779042e-06, + "loss": 0.6922, + "step": 7329 + }, + { + "epoch": 0.7487231869254342, + "grad_norm": 1.3026849421733193, + "learning_rate": 3.132950478121336e-06, + "loss": 0.5579, + "step": 7330 + }, + { + "epoch": 0.7488253319713993, + "grad_norm": 1.3130148444370955, + "learning_rate": 3.1305459070666324e-06, + "loss": 0.623, + "step": 7331 + }, + { + "epoch": 0.7489274770173646, + "grad_norm": 1.4763716259279873, + "learning_rate": 3.128142087878118e-06, + "loss": 0.7339, + "step": 7332 + }, + { + "epoch": 0.7490296220633299, + "grad_norm": 1.5549038305097915, + "learning_rate": 3.1257390208188954e-06, + "loss": 0.7197, + "step": 7333 + }, + { + "epoch": 0.7491317671092952, + "grad_norm": 1.5393353675810355, + "learning_rate": 3.1233367061519782e-06, + "loss": 0.658, + "step": 7334 + }, + { + "epoch": 0.7492339121552605, + "grad_norm": 1.4142178875609257, + "learning_rate": 3.1209351441403013e-06, + "loss": 0.6252, + "step": 7335 + }, + { + "epoch": 0.7493360572012258, + "grad_norm": 1.3927300083760394, + "learning_rate": 3.1185343350467135e-06, + "loss": 0.6814, + "step": 7336 + }, + { + "epoch": 0.749438202247191, + "grad_norm": 1.432365673442275, + "learning_rate": 3.116134279133992e-06, + "loss": 0.7229, + "step": 7337 + }, + { + "epoch": 0.7495403472931563, + "grad_norm": 1.470825030166371, + "learning_rate": 3.1137349766648215e-06, + "loss": 0.5936, + "step": 7338 + }, + { + "epoch": 0.7496424923391215, + "grad_norm": 1.3562483228082394, + "learning_rate": 3.1113364279018075e-06, + "loss": 0.6443, + "step": 7339 + }, + { + "epoch": 0.7497446373850868, + "grad_norm": 1.5846451805343864, + "learning_rate": 3.108938633107469e-06, + "loss": 0.7362, + "step": 7340 + }, + { + "epoch": 0.7498467824310521, + "grad_norm": 1.4437187461839855, + "learning_rate": 3.1065415925442487e-06, + "loss": 0.7117, + "step": 7341 + }, + { + "epoch": 0.7499489274770174, + "grad_norm": 1.5059706587337267, + "learning_rate": 3.1041453064745073e-06, + "loss": 0.6752, + "step": 7342 + }, + { + "epoch": 0.7500510725229826, + "grad_norm": 1.5932847338564633, + "learning_rate": 3.1017497751605184e-06, + "loss": 0.7279, + "step": 7343 + }, + { + "epoch": 0.7501532175689479, + "grad_norm": 1.3606241091957954, + "learning_rate": 3.0993549988644733e-06, + "loss": 0.6704, + "step": 7344 + }, + { + "epoch": 0.7502553626149132, + "grad_norm": 1.4568792028542599, + "learning_rate": 3.096960977848482e-06, + "loss": 0.7453, + "step": 7345 + }, + { + "epoch": 0.7503575076608785, + "grad_norm": 1.5080967527396887, + "learning_rate": 3.0945677123745687e-06, + "loss": 0.6659, + "step": 7346 + }, + { + "epoch": 0.7504596527068437, + "grad_norm": 1.4384596193466221, + "learning_rate": 3.092175202704684e-06, + "loss": 0.7362, + "step": 7347 + }, + { + "epoch": 0.750561797752809, + "grad_norm": 1.3483693961574803, + "learning_rate": 3.0897834491006818e-06, + "loss": 0.6838, + "step": 7348 + }, + { + "epoch": 0.7506639427987742, + "grad_norm": 1.5851545150928008, + "learning_rate": 3.0873924518243504e-06, + "loss": 0.8075, + "step": 7349 + }, + { + "epoch": 0.7507660878447395, + "grad_norm": 1.3054090809092345, + "learning_rate": 3.08500221113738e-06, + "loss": 0.7067, + "step": 7350 + }, + { + "epoch": 0.7508682328907048, + "grad_norm": 1.4891763215503115, + "learning_rate": 3.082612727301383e-06, + "loss": 0.7387, + "step": 7351 + }, + { + "epoch": 0.7509703779366701, + "grad_norm": 1.486049117147843, + "learning_rate": 3.080224000577895e-06, + "loss": 0.6871, + "step": 7352 + }, + { + "epoch": 0.7510725229826354, + "grad_norm": 1.4085850780168814, + "learning_rate": 3.0778360312283617e-06, + "loss": 0.6649, + "step": 7353 + }, + { + "epoch": 0.7511746680286007, + "grad_norm": 1.4818951099275164, + "learning_rate": 3.0754488195141464e-06, + "loss": 0.7912, + "step": 7354 + }, + { + "epoch": 0.7512768130745658, + "grad_norm": 1.396184903660817, + "learning_rate": 3.0730623656965288e-06, + "loss": 0.6842, + "step": 7355 + }, + { + "epoch": 0.7513789581205311, + "grad_norm": 1.5695093846584869, + "learning_rate": 3.0706766700367095e-06, + "loss": 0.7839, + "step": 7356 + }, + { + "epoch": 0.7514811031664964, + "grad_norm": 1.3078015936791683, + "learning_rate": 3.0682917327958095e-06, + "loss": 0.6343, + "step": 7357 + }, + { + "epoch": 0.7515832482124617, + "grad_norm": 1.5495646021413993, + "learning_rate": 3.0659075542348583e-06, + "loss": 0.6841, + "step": 7358 + }, + { + "epoch": 0.751685393258427, + "grad_norm": 1.530334184665937, + "learning_rate": 3.063524134614805e-06, + "loss": 0.7695, + "step": 7359 + }, + { + "epoch": 0.7517875383043923, + "grad_norm": 1.6010646702197036, + "learning_rate": 3.061141474196513e-06, + "loss": 0.6714, + "step": 7360 + }, + { + "epoch": 0.7518896833503576, + "grad_norm": 1.5282134803130527, + "learning_rate": 3.058759573240774e-06, + "loss": 0.6667, + "step": 7361 + }, + { + "epoch": 0.7519918283963227, + "grad_norm": 1.5222213239995666, + "learning_rate": 3.0563784320082833e-06, + "loss": 0.7305, + "step": 7362 + }, + { + "epoch": 0.752093973442288, + "grad_norm": 1.486241774941304, + "learning_rate": 3.0539980507596588e-06, + "loss": 0.6444, + "step": 7363 + }, + { + "epoch": 0.7521961184882533, + "grad_norm": 1.5667639932440547, + "learning_rate": 3.051618429755433e-06, + "loss": 0.817, + "step": 7364 + }, + { + "epoch": 0.7522982635342186, + "grad_norm": 1.398122195975387, + "learning_rate": 3.049239569256063e-06, + "loss": 0.6346, + "step": 7365 + }, + { + "epoch": 0.7524004085801839, + "grad_norm": 1.4643337629675643, + "learning_rate": 3.046861469521909e-06, + "loss": 0.7417, + "step": 7366 + }, + { + "epoch": 0.7525025536261492, + "grad_norm": 1.3961404005152431, + "learning_rate": 3.0444841308132635e-06, + "loss": 0.588, + "step": 7367 + }, + { + "epoch": 0.7526046986721144, + "grad_norm": 1.4747741193082498, + "learning_rate": 3.042107553390323e-06, + "loss": 0.6993, + "step": 7368 + }, + { + "epoch": 0.7527068437180797, + "grad_norm": 1.5891928412582947, + "learning_rate": 3.0397317375132064e-06, + "loss": 0.7577, + "step": 7369 + }, + { + "epoch": 0.7528089887640449, + "grad_norm": 1.4921189015476584, + "learning_rate": 3.0373566834419445e-06, + "loss": 0.6711, + "step": 7370 + }, + { + "epoch": 0.7529111338100102, + "grad_norm": 1.2600322502848067, + "learning_rate": 3.034982391436495e-06, + "loss": 0.6099, + "step": 7371 + }, + { + "epoch": 0.7530132788559755, + "grad_norm": 1.4885130462834362, + "learning_rate": 3.0326088617567204e-06, + "loss": 0.6047, + "step": 7372 + }, + { + "epoch": 0.7531154239019408, + "grad_norm": 1.5156072995660586, + "learning_rate": 3.03023609466241e-06, + "loss": 0.7578, + "step": 7373 + }, + { + "epoch": 0.753217568947906, + "grad_norm": 1.6085434385377166, + "learning_rate": 3.027864090413263e-06, + "loss": 0.724, + "step": 7374 + }, + { + "epoch": 0.7533197139938713, + "grad_norm": 1.5651032270204697, + "learning_rate": 3.0254928492688905e-06, + "loss": 0.7588, + "step": 7375 + }, + { + "epoch": 0.7534218590398366, + "grad_norm": 1.4929193041921875, + "learning_rate": 3.023122371488837e-06, + "loss": 0.6857, + "step": 7376 + }, + { + "epoch": 0.7535240040858019, + "grad_norm": 1.4130015211760834, + "learning_rate": 3.0207526573325473e-06, + "loss": 0.7327, + "step": 7377 + }, + { + "epoch": 0.7536261491317671, + "grad_norm": 1.6705119570409108, + "learning_rate": 3.018383707059388e-06, + "loss": 0.7442, + "step": 7378 + }, + { + "epoch": 0.7537282941777323, + "grad_norm": 1.5065922576770472, + "learning_rate": 3.016015520928639e-06, + "loss": 0.7277, + "step": 7379 + }, + { + "epoch": 0.7538304392236976, + "grad_norm": 1.5787258215061597, + "learning_rate": 3.013648099199504e-06, + "loss": 0.7471, + "step": 7380 + }, + { + "epoch": 0.7539325842696629, + "grad_norm": 1.430279911098824, + "learning_rate": 3.011281442131102e-06, + "loss": 0.7229, + "step": 7381 + }, + { + "epoch": 0.7540347293156282, + "grad_norm": 1.544452746334185, + "learning_rate": 3.008915549982461e-06, + "loss": 0.6635, + "step": 7382 + }, + { + "epoch": 0.7541368743615935, + "grad_norm": 1.3988708005590775, + "learning_rate": 3.0065504230125297e-06, + "loss": 0.6076, + "step": 7383 + }, + { + "epoch": 0.7542390194075588, + "grad_norm": 1.582997856886227, + "learning_rate": 3.0041860614801734e-06, + "loss": 0.7025, + "step": 7384 + }, + { + "epoch": 0.754341164453524, + "grad_norm": 1.6702792905658461, + "learning_rate": 3.0018224656441684e-06, + "loss": 0.6837, + "step": 7385 + }, + { + "epoch": 0.7544433094994892, + "grad_norm": 1.50883689100958, + "learning_rate": 2.99945963576322e-06, + "loss": 0.7914, + "step": 7386 + }, + { + "epoch": 0.7545454545454545, + "grad_norm": 1.5125548351810687, + "learning_rate": 2.9970975720959372e-06, + "loss": 0.7445, + "step": 7387 + }, + { + "epoch": 0.7546475995914198, + "grad_norm": 1.538047176267832, + "learning_rate": 2.994736274900847e-06, + "loss": 0.623, + "step": 7388 + }, + { + "epoch": 0.7547497446373851, + "grad_norm": 1.4895210683350129, + "learning_rate": 2.9923757444364e-06, + "loss": 0.6346, + "step": 7389 + }, + { + "epoch": 0.7548518896833504, + "grad_norm": 1.502613610771345, + "learning_rate": 2.990015980960952e-06, + "loss": 0.6239, + "step": 7390 + }, + { + "epoch": 0.7549540347293157, + "grad_norm": 1.5176719967855912, + "learning_rate": 2.9876569847327873e-06, + "loss": 0.6667, + "step": 7391 + }, + { + "epoch": 0.755056179775281, + "grad_norm": 1.6275282986861968, + "learning_rate": 2.9852987560100955e-06, + "loss": 0.7237, + "step": 7392 + }, + { + "epoch": 0.7551583248212461, + "grad_norm": 1.6384351087907165, + "learning_rate": 2.9829412950509874e-06, + "loss": 0.7894, + "step": 7393 + }, + { + "epoch": 0.7552604698672114, + "grad_norm": 1.46122284759592, + "learning_rate": 2.9805846021134856e-06, + "loss": 0.6324, + "step": 7394 + }, + { + "epoch": 0.7553626149131767, + "grad_norm": 1.663885318224284, + "learning_rate": 2.9782286774555367e-06, + "loss": 0.721, + "step": 7395 + }, + { + "epoch": 0.755464759959142, + "grad_norm": 1.5699906893194122, + "learning_rate": 2.975873521334993e-06, + "loss": 0.7131, + "step": 7396 + }, + { + "epoch": 0.7555669050051073, + "grad_norm": 1.5305018301446212, + "learning_rate": 2.9735191340096335e-06, + "loss": 0.6917, + "step": 7397 + }, + { + "epoch": 0.7556690500510725, + "grad_norm": 1.4032371257552516, + "learning_rate": 2.9711655157371444e-06, + "loss": 0.7249, + "step": 7398 + }, + { + "epoch": 0.7557711950970378, + "grad_norm": 1.5201139342252188, + "learning_rate": 2.9688126667751303e-06, + "loss": 0.6589, + "step": 7399 + }, + { + "epoch": 0.7558733401430031, + "grad_norm": 1.500631948926616, + "learning_rate": 2.9664605873811104e-06, + "loss": 0.7535, + "step": 7400 + }, + { + "epoch": 0.7559754851889683, + "grad_norm": 1.5108203604305874, + "learning_rate": 2.964109277812526e-06, + "loss": 0.6811, + "step": 7401 + }, + { + "epoch": 0.7560776302349336, + "grad_norm": 1.3680764431558419, + "learning_rate": 2.9617587383267266e-06, + "loss": 0.6151, + "step": 7402 + }, + { + "epoch": 0.7561797752808989, + "grad_norm": 1.5882061152168516, + "learning_rate": 2.959408969180981e-06, + "loss": 0.7107, + "step": 7403 + }, + { + "epoch": 0.7562819203268641, + "grad_norm": 1.5603439801066015, + "learning_rate": 2.95705997063247e-06, + "loss": 0.7559, + "step": 7404 + }, + { + "epoch": 0.7563840653728294, + "grad_norm": 1.6739827262326474, + "learning_rate": 2.9547117429382955e-06, + "loss": 0.7459, + "step": 7405 + }, + { + "epoch": 0.7564862104187947, + "grad_norm": 1.5155734596075967, + "learning_rate": 2.952364286355475e-06, + "loss": 0.6424, + "step": 7406 + }, + { + "epoch": 0.75658835546476, + "grad_norm": 1.3215348387500192, + "learning_rate": 2.9500176011409365e-06, + "loss": 0.577, + "step": 7407 + }, + { + "epoch": 0.7566905005107253, + "grad_norm": 1.5166281698816015, + "learning_rate": 2.9476716875515265e-06, + "loss": 0.6843, + "step": 7408 + }, + { + "epoch": 0.7567926455566905, + "grad_norm": 1.3937676171948743, + "learning_rate": 2.945326545844004e-06, + "loss": 0.6411, + "step": 7409 + }, + { + "epoch": 0.7568947906026557, + "grad_norm": 1.6200071846609922, + "learning_rate": 2.942982176275052e-06, + "loss": 0.6305, + "step": 7410 + }, + { + "epoch": 0.756996935648621, + "grad_norm": 1.4115235226309217, + "learning_rate": 2.9406385791012604e-06, + "loss": 0.6409, + "step": 7411 + }, + { + "epoch": 0.7570990806945863, + "grad_norm": 1.4044946628605268, + "learning_rate": 2.9382957545791333e-06, + "loss": 0.7496, + "step": 7412 + }, + { + "epoch": 0.7572012257405516, + "grad_norm": 1.6162198970035375, + "learning_rate": 2.935953702965102e-06, + "loss": 0.6873, + "step": 7413 + }, + { + "epoch": 0.7573033707865169, + "grad_norm": 1.431189574773295, + "learning_rate": 2.9336124245154995e-06, + "loss": 0.6205, + "step": 7414 + }, + { + "epoch": 0.7574055158324822, + "grad_norm": 1.482056556253484, + "learning_rate": 2.9312719194865845e-06, + "loss": 0.7205, + "step": 7415 + }, + { + "epoch": 0.7575076608784473, + "grad_norm": 1.805903595230955, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.7599, + "step": 7416 + }, + { + "epoch": 0.7576098059244126, + "grad_norm": 1.3284276573168574, + "learning_rate": 2.9265932307154064e-06, + "loss": 0.6073, + "step": 7417 + }, + { + "epoch": 0.7577119509703779, + "grad_norm": 1.3868868341373433, + "learning_rate": 2.9242550474852294e-06, + "loss": 0.7313, + "step": 7418 + }, + { + "epoch": 0.7578140960163432, + "grad_norm": 1.4549558391544015, + "learning_rate": 2.9219176386999048e-06, + "loss": 0.6481, + "step": 7419 + }, + { + "epoch": 0.7579162410623085, + "grad_norm": 1.3727739847972151, + "learning_rate": 2.9195810046152717e-06, + "loss": 0.7543, + "step": 7420 + }, + { + "epoch": 0.7580183861082738, + "grad_norm": 1.513107430084354, + "learning_rate": 2.917245145487069e-06, + "loss": 0.6979, + "step": 7421 + }, + { + "epoch": 0.758120531154239, + "grad_norm": 1.4864682381043417, + "learning_rate": 2.9149100615709635e-06, + "loss": 0.7042, + "step": 7422 + }, + { + "epoch": 0.7582226762002043, + "grad_norm": 1.3509754291362757, + "learning_rate": 2.9125757531225296e-06, + "loss": 0.6282, + "step": 7423 + }, + { + "epoch": 0.7583248212461695, + "grad_norm": 1.568394017620605, + "learning_rate": 2.9102422203972546e-06, + "loss": 0.6835, + "step": 7424 + }, + { + "epoch": 0.7584269662921348, + "grad_norm": 1.3114329100927211, + "learning_rate": 2.9079094636505533e-06, + "loss": 0.584, + "step": 7425 + }, + { + "epoch": 0.7585291113381001, + "grad_norm": 1.5630330466552067, + "learning_rate": 2.9055774831377436e-06, + "loss": 0.7948, + "step": 7426 + }, + { + "epoch": 0.7586312563840654, + "grad_norm": 1.4368909478942313, + "learning_rate": 2.9032462791140613e-06, + "loss": 0.6769, + "step": 7427 + }, + { + "epoch": 0.7587334014300307, + "grad_norm": 1.452773688240602, + "learning_rate": 2.9009158518346557e-06, + "loss": 0.6853, + "step": 7428 + }, + { + "epoch": 0.7588355464759959, + "grad_norm": 1.512828472508022, + "learning_rate": 2.8985862015545973e-06, + "loss": 0.7146, + "step": 7429 + }, + { + "epoch": 0.7589376915219612, + "grad_norm": 1.4306841096048442, + "learning_rate": 2.89625732852887e-06, + "loss": 0.6882, + "step": 7430 + }, + { + "epoch": 0.7590398365679265, + "grad_norm": 1.5306789506679488, + "learning_rate": 2.893929233012367e-06, + "loss": 0.594, + "step": 7431 + }, + { + "epoch": 0.7591419816138917, + "grad_norm": 1.3675652168530181, + "learning_rate": 2.8916019152599017e-06, + "loss": 0.6409, + "step": 7432 + }, + { + "epoch": 0.759244126659857, + "grad_norm": 1.5936042672261972, + "learning_rate": 2.889275375526196e-06, + "loss": 0.7151, + "step": 7433 + }, + { + "epoch": 0.7593462717058223, + "grad_norm": 1.5382525355908887, + "learning_rate": 2.886949614065897e-06, + "loss": 0.6944, + "step": 7434 + }, + { + "epoch": 0.7594484167517875, + "grad_norm": 1.497449052729323, + "learning_rate": 2.884624631133559e-06, + "loss": 0.763, + "step": 7435 + }, + { + "epoch": 0.7595505617977528, + "grad_norm": 1.5548274775949733, + "learning_rate": 2.8823004269836517e-06, + "loss": 0.6861, + "step": 7436 + }, + { + "epoch": 0.7596527068437181, + "grad_norm": 1.4865981173243363, + "learning_rate": 2.8799770018705587e-06, + "loss": 0.7088, + "step": 7437 + }, + { + "epoch": 0.7597548518896834, + "grad_norm": 1.4000335774005424, + "learning_rate": 2.877654356048586e-06, + "loss": 0.7364, + "step": 7438 + }, + { + "epoch": 0.7598569969356487, + "grad_norm": 1.5071074989111395, + "learning_rate": 2.8753324897719425e-06, + "loss": 0.7147, + "step": 7439 + }, + { + "epoch": 0.7599591419816139, + "grad_norm": 1.4078020191031093, + "learning_rate": 2.8730114032947643e-06, + "loss": 0.6415, + "step": 7440 + }, + { + "epoch": 0.7600612870275791, + "grad_norm": 1.5260236379706307, + "learning_rate": 2.8706910968710923e-06, + "loss": 0.5871, + "step": 7441 + }, + { + "epoch": 0.7601634320735444, + "grad_norm": 1.3698127996229998, + "learning_rate": 2.8683715707548863e-06, + "loss": 0.6525, + "step": 7442 + }, + { + "epoch": 0.7602655771195097, + "grad_norm": 1.4774106970744387, + "learning_rate": 2.8660528252000165e-06, + "loss": 0.6196, + "step": 7443 + }, + { + "epoch": 0.760367722165475, + "grad_norm": 1.431585655150089, + "learning_rate": 2.8637348604602765e-06, + "loss": 0.7249, + "step": 7444 + }, + { + "epoch": 0.7604698672114403, + "grad_norm": 1.4239611399007868, + "learning_rate": 2.8614176767893644e-06, + "loss": 0.6432, + "step": 7445 + }, + { + "epoch": 0.7605720122574056, + "grad_norm": 1.6015901888784416, + "learning_rate": 2.859101274440902e-06, + "loss": 0.6907, + "step": 7446 + }, + { + "epoch": 0.7606741573033707, + "grad_norm": 1.5402087867669139, + "learning_rate": 2.856785653668419e-06, + "loss": 0.6794, + "step": 7447 + }, + { + "epoch": 0.760776302349336, + "grad_norm": 1.4990709112836973, + "learning_rate": 2.8544708147253585e-06, + "loss": 0.6723, + "step": 7448 + }, + { + "epoch": 0.7608784473953013, + "grad_norm": 1.4290053785281838, + "learning_rate": 2.8521567578650867e-06, + "loss": 0.6772, + "step": 7449 + }, + { + "epoch": 0.7609805924412666, + "grad_norm": 1.3486039730762873, + "learning_rate": 2.8498434833408762e-06, + "loss": 0.6593, + "step": 7450 + }, + { + "epoch": 0.7610827374872319, + "grad_norm": 1.6297030504294905, + "learning_rate": 2.8475309914059157e-06, + "loss": 0.6684, + "step": 7451 + }, + { + "epoch": 0.7611848825331972, + "grad_norm": 1.5306049430840014, + "learning_rate": 2.8452192823133096e-06, + "loss": 0.7069, + "step": 7452 + }, + { + "epoch": 0.7612870275791624, + "grad_norm": 1.409766100219532, + "learning_rate": 2.8429083563160718e-06, + "loss": 0.6406, + "step": 7453 + }, + { + "epoch": 0.7613891726251277, + "grad_norm": 1.367544172775995, + "learning_rate": 2.8405982136671394e-06, + "loss": 0.6512, + "step": 7454 + }, + { + "epoch": 0.7614913176710929, + "grad_norm": 1.5869502903828088, + "learning_rate": 2.838288854619361e-06, + "loss": 0.77, + "step": 7455 + }, + { + "epoch": 0.7615934627170582, + "grad_norm": 1.4597399174442398, + "learning_rate": 2.835980279425494e-06, + "loss": 0.6743, + "step": 7456 + }, + { + "epoch": 0.7616956077630235, + "grad_norm": 1.428650917932562, + "learning_rate": 2.8336724883382137e-06, + "loss": 0.5289, + "step": 7457 + }, + { + "epoch": 0.7617977528089888, + "grad_norm": 1.3248505602502982, + "learning_rate": 2.831365481610108e-06, + "loss": 0.6505, + "step": 7458 + }, + { + "epoch": 0.761899897854954, + "grad_norm": 1.4151535911990676, + "learning_rate": 2.8290592594936837e-06, + "loss": 0.7428, + "step": 7459 + }, + { + "epoch": 0.7620020429009193, + "grad_norm": 1.4835404460932058, + "learning_rate": 2.826753822241356e-06, + "loss": 0.6555, + "step": 7460 + }, + { + "epoch": 0.7621041879468846, + "grad_norm": 1.6097453819487826, + "learning_rate": 2.8244491701054555e-06, + "loss": 0.8062, + "step": 7461 + }, + { + "epoch": 0.7622063329928499, + "grad_norm": 1.5719075408084238, + "learning_rate": 2.8221453033382306e-06, + "loss": 0.7377, + "step": 7462 + }, + { + "epoch": 0.7623084780388151, + "grad_norm": 1.4120764478716656, + "learning_rate": 2.8198422221918387e-06, + "loss": 0.6108, + "step": 7463 + }, + { + "epoch": 0.7624106230847804, + "grad_norm": 1.6434040750966188, + "learning_rate": 2.8175399269183556e-06, + "loss": 0.6043, + "step": 7464 + }, + { + "epoch": 0.7625127681307456, + "grad_norm": 1.4799989995851495, + "learning_rate": 2.815238417769769e-06, + "loss": 0.6473, + "step": 7465 + }, + { + "epoch": 0.7626149131767109, + "grad_norm": 1.3046272786143347, + "learning_rate": 2.8129376949979805e-06, + "loss": 0.5871, + "step": 7466 + }, + { + "epoch": 0.7627170582226762, + "grad_norm": 1.4752261876779604, + "learning_rate": 2.8106377588547996e-06, + "loss": 0.6863, + "step": 7467 + }, + { + "epoch": 0.7628192032686415, + "grad_norm": 1.4929004380384108, + "learning_rate": 2.808338609591965e-06, + "loss": 0.6802, + "step": 7468 + }, + { + "epoch": 0.7629213483146068, + "grad_norm": 1.373773440059979, + "learning_rate": 2.8060402474611128e-06, + "loss": 0.733, + "step": 7469 + }, + { + "epoch": 0.763023493360572, + "grad_norm": 1.4713833482270784, + "learning_rate": 2.803742672713807e-06, + "loss": 0.6507, + "step": 7470 + }, + { + "epoch": 0.7631256384065372, + "grad_norm": 1.4034397487136077, + "learning_rate": 2.801445885601515e-06, + "loss": 0.7204, + "step": 7471 + }, + { + "epoch": 0.7632277834525025, + "grad_norm": 1.5504528216279785, + "learning_rate": 2.7991498863756205e-06, + "loss": 0.7103, + "step": 7472 + }, + { + "epoch": 0.7633299284984678, + "grad_norm": 1.5087620857803632, + "learning_rate": 2.7968546752874214e-06, + "loss": 0.6467, + "step": 7473 + }, + { + "epoch": 0.7634320735444331, + "grad_norm": 1.563373337757284, + "learning_rate": 2.7945602525881345e-06, + "loss": 0.7126, + "step": 7474 + }, + { + "epoch": 0.7635342185903984, + "grad_norm": 1.4671772582514293, + "learning_rate": 2.7922666185288837e-06, + "loss": 0.7377, + "step": 7475 + }, + { + "epoch": 0.7636363636363637, + "grad_norm": 1.3530425895537521, + "learning_rate": 2.789973773360708e-06, + "loss": 0.7174, + "step": 7476 + }, + { + "epoch": 0.763738508682329, + "grad_norm": 1.4603876322373197, + "learning_rate": 2.7876817173345573e-06, + "loss": 0.6659, + "step": 7477 + }, + { + "epoch": 0.7638406537282941, + "grad_norm": 1.5486962596843985, + "learning_rate": 2.785390450701303e-06, + "loss": 0.6162, + "step": 7478 + }, + { + "epoch": 0.7639427987742594, + "grad_norm": 1.5618213122041809, + "learning_rate": 2.783099973711728e-06, + "loss": 0.7013, + "step": 7479 + }, + { + "epoch": 0.7640449438202247, + "grad_norm": 1.4928805565947687, + "learning_rate": 2.7808102866165243e-06, + "loss": 0.6871, + "step": 7480 + }, + { + "epoch": 0.76414708886619, + "grad_norm": 1.4876570883563174, + "learning_rate": 2.7785213896662987e-06, + "loss": 0.7634, + "step": 7481 + }, + { + "epoch": 0.7642492339121553, + "grad_norm": 1.5855894388454, + "learning_rate": 2.776233283111569e-06, + "loss": 0.7416, + "step": 7482 + }, + { + "epoch": 0.7643513789581206, + "grad_norm": 1.522498379785394, + "learning_rate": 2.773945967202777e-06, + "loss": 0.7315, + "step": 7483 + }, + { + "epoch": 0.7644535240040858, + "grad_norm": 1.374140706227445, + "learning_rate": 2.7716594421902674e-06, + "loss": 0.6875, + "step": 7484 + }, + { + "epoch": 0.7645556690500511, + "grad_norm": 1.5176072044157276, + "learning_rate": 2.7693737083243e-06, + "loss": 0.6176, + "step": 7485 + }, + { + "epoch": 0.7646578140960163, + "grad_norm": 1.465286480544789, + "learning_rate": 2.767088765855054e-06, + "loss": 0.7252, + "step": 7486 + }, + { + "epoch": 0.7647599591419816, + "grad_norm": 1.4324917418480216, + "learning_rate": 2.7648046150326113e-06, + "loss": 0.6059, + "step": 7487 + }, + { + "epoch": 0.7648621041879469, + "grad_norm": 1.5877790408839978, + "learning_rate": 2.7625212561069826e-06, + "loss": 0.7645, + "step": 7488 + }, + { + "epoch": 0.7649642492339122, + "grad_norm": 1.4613514662155962, + "learning_rate": 2.7602386893280786e-06, + "loss": 0.6777, + "step": 7489 + }, + { + "epoch": 0.7650663942798774, + "grad_norm": 1.4025383232396098, + "learning_rate": 2.7579569149457266e-06, + "loss": 0.7007, + "step": 7490 + }, + { + "epoch": 0.7651685393258427, + "grad_norm": 1.4609774678610743, + "learning_rate": 2.7556759332096694e-06, + "loss": 0.7087, + "step": 7491 + }, + { + "epoch": 0.765270684371808, + "grad_norm": 1.448706019032512, + "learning_rate": 2.753395744369559e-06, + "loss": 0.6244, + "step": 7492 + }, + { + "epoch": 0.7653728294177733, + "grad_norm": 1.3290411468165644, + "learning_rate": 2.751116348674967e-06, + "loss": 0.604, + "step": 7493 + }, + { + "epoch": 0.7654749744637385, + "grad_norm": 1.317100743286196, + "learning_rate": 2.7488377463753755e-06, + "loss": 0.661, + "step": 7494 + }, + { + "epoch": 0.7655771195097038, + "grad_norm": 1.4570245621552407, + "learning_rate": 2.746559937720179e-06, + "loss": 0.7588, + "step": 7495 + }, + { + "epoch": 0.765679264555669, + "grad_norm": 1.4616013657384508, + "learning_rate": 2.744282922958683e-06, + "loss": 0.6718, + "step": 7496 + }, + { + "epoch": 0.7657814096016343, + "grad_norm": 1.6002785039196028, + "learning_rate": 2.7420067023401055e-06, + "loss": 0.6644, + "step": 7497 + }, + { + "epoch": 0.7658835546475996, + "grad_norm": 1.5499481201136127, + "learning_rate": 2.7397312761135864e-06, + "loss": 0.7273, + "step": 7498 + }, + { + "epoch": 0.7659856996935649, + "grad_norm": 1.4890135175877488, + "learning_rate": 2.7374566445281715e-06, + "loss": 0.651, + "step": 7499 + }, + { + "epoch": 0.7660878447395302, + "grad_norm": 1.397372249896923, + "learning_rate": 2.735182807832818e-06, + "loss": 0.6331, + "step": 7500 + }, + { + "epoch": 0.7661899897854954, + "grad_norm": 1.406717338735443, + "learning_rate": 2.732909766276396e-06, + "loss": 0.6921, + "step": 7501 + }, + { + "epoch": 0.7662921348314606, + "grad_norm": 1.4415220672406783, + "learning_rate": 2.7306375201076963e-06, + "loss": 0.6378, + "step": 7502 + }, + { + "epoch": 0.7663942798774259, + "grad_norm": 1.5594160374417692, + "learning_rate": 2.72836606957542e-06, + "loss": 0.7122, + "step": 7503 + }, + { + "epoch": 0.7664964249233912, + "grad_norm": 1.4497392599049908, + "learning_rate": 2.726095414928175e-06, + "loss": 0.6539, + "step": 7504 + }, + { + "epoch": 0.7665985699693565, + "grad_norm": 1.529216016344709, + "learning_rate": 2.7238255564144854e-06, + "loss": 0.5987, + "step": 7505 + }, + { + "epoch": 0.7667007150153218, + "grad_norm": 1.482709032801827, + "learning_rate": 2.721556494282791e-06, + "loss": 0.6588, + "step": 7506 + }, + { + "epoch": 0.7668028600612871, + "grad_norm": 1.4342341980255233, + "learning_rate": 2.719288228781437e-06, + "loss": 0.6514, + "step": 7507 + }, + { + "epoch": 0.7669050051072523, + "grad_norm": 1.5900169066365148, + "learning_rate": 2.717020760158694e-06, + "loss": 0.8167, + "step": 7508 + }, + { + "epoch": 0.7670071501532175, + "grad_norm": 1.4868839873783102, + "learning_rate": 2.714754088662731e-06, + "loss": 0.7151, + "step": 7509 + }, + { + "epoch": 0.7671092951991828, + "grad_norm": 1.5472925782318938, + "learning_rate": 2.712488214541642e-06, + "loss": 0.753, + "step": 7510 + }, + { + "epoch": 0.7672114402451481, + "grad_norm": 1.5169657050369079, + "learning_rate": 2.7102231380434276e-06, + "loss": 0.6872, + "step": 7511 + }, + { + "epoch": 0.7673135852911134, + "grad_norm": 1.4831163029773318, + "learning_rate": 2.7079588594159966e-06, + "loss": 0.6376, + "step": 7512 + }, + { + "epoch": 0.7674157303370787, + "grad_norm": 1.5441842632365448, + "learning_rate": 2.7056953789071826e-06, + "loss": 0.6884, + "step": 7513 + }, + { + "epoch": 0.767517875383044, + "grad_norm": 1.5903428649550728, + "learning_rate": 2.7034326967647228e-06, + "loss": 0.731, + "step": 7514 + }, + { + "epoch": 0.7676200204290092, + "grad_norm": 1.5789230944885124, + "learning_rate": 2.701170813236268e-06, + "loss": 0.7097, + "step": 7515 + }, + { + "epoch": 0.7677221654749745, + "grad_norm": 1.5352716098611365, + "learning_rate": 2.698909728569381e-06, + "loss": 0.7075, + "step": 7516 + }, + { + "epoch": 0.7678243105209397, + "grad_norm": 1.3717757136103805, + "learning_rate": 2.696649443011541e-06, + "loss": 0.5222, + "step": 7517 + }, + { + "epoch": 0.767926455566905, + "grad_norm": 1.5008977627382827, + "learning_rate": 2.6943899568101404e-06, + "loss": 0.7349, + "step": 7518 + }, + { + "epoch": 0.7680286006128703, + "grad_norm": 1.4578786805285304, + "learning_rate": 2.6921312702124792e-06, + "loss": 0.7027, + "step": 7519 + }, + { + "epoch": 0.7681307456588355, + "grad_norm": 1.464822780206585, + "learning_rate": 2.6898733834657732e-06, + "loss": 0.648, + "step": 7520 + }, + { + "epoch": 0.7682328907048008, + "grad_norm": 1.614545270250516, + "learning_rate": 2.687616296817144e-06, + "loss": 0.7547, + "step": 7521 + }, + { + "epoch": 0.7683350357507661, + "grad_norm": 1.412759199973982, + "learning_rate": 2.6853600105136392e-06, + "loss": 0.7793, + "step": 7522 + }, + { + "epoch": 0.7684371807967314, + "grad_norm": 1.5145375536468386, + "learning_rate": 2.6831045248022068e-06, + "loss": 0.5748, + "step": 7523 + }, + { + "epoch": 0.7685393258426966, + "grad_norm": 1.357646530383558, + "learning_rate": 2.6808498399297113e-06, + "loss": 0.649, + "step": 7524 + }, + { + "epoch": 0.7686414708886619, + "grad_norm": 1.340401378469464, + "learning_rate": 2.6785959561429264e-06, + "loss": 0.6371, + "step": 7525 + }, + { + "epoch": 0.7687436159346271, + "grad_norm": 1.5109008779893367, + "learning_rate": 2.6763428736885477e-06, + "loss": 0.749, + "step": 7526 + }, + { + "epoch": 0.7688457609805924, + "grad_norm": 1.4268138130774322, + "learning_rate": 2.6740905928131712e-06, + "loss": 0.6787, + "step": 7527 + }, + { + "epoch": 0.7689479060265577, + "grad_norm": 1.3867794435906604, + "learning_rate": 2.6718391137633138e-06, + "loss": 0.7069, + "step": 7528 + }, + { + "epoch": 0.769050051072523, + "grad_norm": 1.4807100385710656, + "learning_rate": 2.669588436785401e-06, + "loss": 0.5362, + "step": 7529 + }, + { + "epoch": 0.7691521961184883, + "grad_norm": 1.497960398518875, + "learning_rate": 2.6673385621257698e-06, + "loss": 0.7432, + "step": 7530 + }, + { + "epoch": 0.7692543411644536, + "grad_norm": 1.372783096873257, + "learning_rate": 2.6650894900306667e-06, + "loss": 0.6242, + "step": 7531 + }, + { + "epoch": 0.7693564862104187, + "grad_norm": 1.6108390917590352, + "learning_rate": 2.6628412207462616e-06, + "loss": 0.6651, + "step": 7532 + }, + { + "epoch": 0.769458631256384, + "grad_norm": 1.4706447661449698, + "learning_rate": 2.660593754518622e-06, + "loss": 0.5825, + "step": 7533 + }, + { + "epoch": 0.7695607763023493, + "grad_norm": 1.5474869320658484, + "learning_rate": 2.6583470915937403e-06, + "loss": 0.7041, + "step": 7534 + }, + { + "epoch": 0.7696629213483146, + "grad_norm": 1.4532029343508164, + "learning_rate": 2.656101232217514e-06, + "loss": 0.6687, + "step": 7535 + }, + { + "epoch": 0.7697650663942799, + "grad_norm": 1.427826472608391, + "learning_rate": 2.6538561766357486e-06, + "loss": 0.7307, + "step": 7536 + }, + { + "epoch": 0.7698672114402452, + "grad_norm": 1.449006821514821, + "learning_rate": 2.651611925094174e-06, + "loss": 0.6744, + "step": 7537 + }, + { + "epoch": 0.7699693564862105, + "grad_norm": 1.564866030599005, + "learning_rate": 2.649368477838422e-06, + "loss": 0.5883, + "step": 7538 + }, + { + "epoch": 0.7700715015321757, + "grad_norm": 1.5395065801551826, + "learning_rate": 2.6471258351140393e-06, + "loss": 0.5955, + "step": 7539 + }, + { + "epoch": 0.7701736465781409, + "grad_norm": 1.6246055267562316, + "learning_rate": 2.6448839971664853e-06, + "loss": 0.6826, + "step": 7540 + }, + { + "epoch": 0.7702757916241062, + "grad_norm": 1.5117148746997864, + "learning_rate": 2.6426429642411235e-06, + "loss": 0.6492, + "step": 7541 + }, + { + "epoch": 0.7703779366700715, + "grad_norm": 1.5138880704219884, + "learning_rate": 2.6404027365832473e-06, + "loss": 0.7324, + "step": 7542 + }, + { + "epoch": 0.7704800817160368, + "grad_norm": 1.4896248288183176, + "learning_rate": 2.638163314438048e-06, + "loss": 0.6586, + "step": 7543 + }, + { + "epoch": 0.770582226762002, + "grad_norm": 1.4400288205613612, + "learning_rate": 2.6359246980506293e-06, + "loss": 0.646, + "step": 7544 + }, + { + "epoch": 0.7706843718079673, + "grad_norm": 1.4597150643872812, + "learning_rate": 2.6336868876660104e-06, + "loss": 0.6293, + "step": 7545 + }, + { + "epoch": 0.7707865168539326, + "grad_norm": 1.510403378345157, + "learning_rate": 2.631449883529119e-06, + "loss": 0.6754, + "step": 7546 + }, + { + "epoch": 0.7708886618998979, + "grad_norm": 1.5858149207337373, + "learning_rate": 2.6292136858848006e-06, + "loss": 0.7348, + "step": 7547 + }, + { + "epoch": 0.7709908069458631, + "grad_norm": 1.4434889761626102, + "learning_rate": 2.6269782949778066e-06, + "loss": 0.7862, + "step": 7548 + }, + { + "epoch": 0.7710929519918284, + "grad_norm": 1.5675751178491457, + "learning_rate": 2.6247437110527984e-06, + "loss": 0.7084, + "step": 7549 + }, + { + "epoch": 0.7711950970377937, + "grad_norm": 1.4935930227943943, + "learning_rate": 2.6225099343543593e-06, + "loss": 0.6616, + "step": 7550 + }, + { + "epoch": 0.7712972420837589, + "grad_norm": 1.5252268690979696, + "learning_rate": 2.620276965126971e-06, + "loss": 0.6467, + "step": 7551 + }, + { + "epoch": 0.7713993871297242, + "grad_norm": 1.3865577879508761, + "learning_rate": 2.618044803615041e-06, + "loss": 0.6129, + "step": 7552 + }, + { + "epoch": 0.7715015321756895, + "grad_norm": 1.3724344870142149, + "learning_rate": 2.615813450062875e-06, + "loss": 0.5971, + "step": 7553 + }, + { + "epoch": 0.7716036772216548, + "grad_norm": 1.4986142043516903, + "learning_rate": 2.613582904714699e-06, + "loss": 0.641, + "step": 7554 + }, + { + "epoch": 0.77170582226762, + "grad_norm": 1.5344258851324915, + "learning_rate": 2.611353167814643e-06, + "loss": 0.7846, + "step": 7555 + }, + { + "epoch": 0.7718079673135853, + "grad_norm": 1.3651701702339099, + "learning_rate": 2.6091242396067586e-06, + "loss": 0.5861, + "step": 7556 + }, + { + "epoch": 0.7719101123595505, + "grad_norm": 1.5376752242556553, + "learning_rate": 2.6068961203349997e-06, + "loss": 0.6837, + "step": 7557 + }, + { + "epoch": 0.7720122574055158, + "grad_norm": 1.5233326085928105, + "learning_rate": 2.604668810243238e-06, + "loss": 0.674, + "step": 7558 + }, + { + "epoch": 0.7721144024514811, + "grad_norm": 1.7229607075030353, + "learning_rate": 2.6024423095752547e-06, + "loss": 0.7216, + "step": 7559 + }, + { + "epoch": 0.7722165474974464, + "grad_norm": 1.6084217351509287, + "learning_rate": 2.6002166185747403e-06, + "loss": 0.6364, + "step": 7560 + }, + { + "epoch": 0.7723186925434117, + "grad_norm": 1.4218962991240265, + "learning_rate": 2.5979917374852935e-06, + "loss": 0.6961, + "step": 7561 + }, + { + "epoch": 0.772420837589377, + "grad_norm": 1.4656733956316854, + "learning_rate": 2.595767666550437e-06, + "loss": 0.5589, + "step": 7562 + }, + { + "epoch": 0.7725229826353421, + "grad_norm": 1.528048160897142, + "learning_rate": 2.5935444060135938e-06, + "loss": 0.6968, + "step": 7563 + }, + { + "epoch": 0.7726251276813074, + "grad_norm": 1.482536158161189, + "learning_rate": 2.591321956118099e-06, + "loss": 0.6833, + "step": 7564 + }, + { + "epoch": 0.7727272727272727, + "grad_norm": 1.6896649908100947, + "learning_rate": 2.5891003171072014e-06, + "loss": 0.7131, + "step": 7565 + }, + { + "epoch": 0.772829417773238, + "grad_norm": 1.4242507648558607, + "learning_rate": 2.586879489224061e-06, + "loss": 0.7124, + "step": 7566 + }, + { + "epoch": 0.7729315628192033, + "grad_norm": 1.6088421900468373, + "learning_rate": 2.5846594727117537e-06, + "loss": 0.6358, + "step": 7567 + }, + { + "epoch": 0.7730337078651686, + "grad_norm": 1.5067738200233267, + "learning_rate": 2.5824402678132576e-06, + "loss": 0.7074, + "step": 7568 + }, + { + "epoch": 0.7731358529111338, + "grad_norm": 1.416804467247809, + "learning_rate": 2.580221874771467e-06, + "loss": 0.6683, + "step": 7569 + }, + { + "epoch": 0.7732379979570991, + "grad_norm": 1.5147699063257296, + "learning_rate": 2.5780042938291817e-06, + "loss": 0.7125, + "step": 7570 + }, + { + "epoch": 0.7733401430030643, + "grad_norm": 1.3546254665536068, + "learning_rate": 2.5757875252291266e-06, + "loss": 0.5713, + "step": 7571 + }, + { + "epoch": 0.7734422880490296, + "grad_norm": 1.5794784190956486, + "learning_rate": 2.573571569213922e-06, + "loss": 0.6644, + "step": 7572 + }, + { + "epoch": 0.7735444330949949, + "grad_norm": 1.5533172552031582, + "learning_rate": 2.571356426026105e-06, + "loss": 0.6402, + "step": 7573 + }, + { + "epoch": 0.7736465781409602, + "grad_norm": 1.554171903370183, + "learning_rate": 2.5691420959081295e-06, + "loss": 0.7905, + "step": 7574 + }, + { + "epoch": 0.7737487231869254, + "grad_norm": 1.4843270149620513, + "learning_rate": 2.566928579102349e-06, + "loss": 0.6466, + "step": 7575 + }, + { + "epoch": 0.7738508682328907, + "grad_norm": 1.7730460244938047, + "learning_rate": 2.5647158758510414e-06, + "loss": 0.7165, + "step": 7576 + }, + { + "epoch": 0.773953013278856, + "grad_norm": 1.5373337920144932, + "learning_rate": 2.562503986396385e-06, + "loss": 0.6663, + "step": 7577 + }, + { + "epoch": 0.7740551583248212, + "grad_norm": 1.5770840037033396, + "learning_rate": 2.5602929109804717e-06, + "loss": 0.7249, + "step": 7578 + }, + { + "epoch": 0.7741573033707865, + "grad_norm": 1.3823815536036115, + "learning_rate": 2.558082649845307e-06, + "loss": 0.6508, + "step": 7579 + }, + { + "epoch": 0.7742594484167518, + "grad_norm": 1.608982361327682, + "learning_rate": 2.5558732032328013e-06, + "loss": 0.73, + "step": 7580 + }, + { + "epoch": 0.774361593462717, + "grad_norm": 1.4351901003319678, + "learning_rate": 2.553664571384783e-06, + "loss": 0.6713, + "step": 7581 + }, + { + "epoch": 0.7744637385086823, + "grad_norm": 1.4472968123956922, + "learning_rate": 2.5514567545429914e-06, + "loss": 0.7065, + "step": 7582 + }, + { + "epoch": 0.7745658835546476, + "grad_norm": 1.5267743672428034, + "learning_rate": 2.54924975294907e-06, + "loss": 0.6673, + "step": 7583 + }, + { + "epoch": 0.7746680286006129, + "grad_norm": 1.6842849837530693, + "learning_rate": 2.547043566844577e-06, + "loss": 0.6996, + "step": 7584 + }, + { + "epoch": 0.7747701736465782, + "grad_norm": 1.5499116395696109, + "learning_rate": 2.5448381964709777e-06, + "loss": 0.8077, + "step": 7585 + }, + { + "epoch": 0.7748723186925434, + "grad_norm": 1.529752465699179, + "learning_rate": 2.5426336420696586e-06, + "loss": 0.7524, + "step": 7586 + }, + { + "epoch": 0.7749744637385086, + "grad_norm": 1.5834104831618432, + "learning_rate": 2.5404299038819036e-06, + "loss": 0.7769, + "step": 7587 + }, + { + "epoch": 0.7750766087844739, + "grad_norm": 1.5286150675161132, + "learning_rate": 2.538226982148917e-06, + "loss": 0.6937, + "step": 7588 + }, + { + "epoch": 0.7751787538304392, + "grad_norm": 1.469695878784593, + "learning_rate": 2.5360248771118036e-06, + "loss": 0.7045, + "step": 7589 + }, + { + "epoch": 0.7752808988764045, + "grad_norm": 1.6513951261113482, + "learning_rate": 2.5338235890115905e-06, + "loss": 0.7059, + "step": 7590 + }, + { + "epoch": 0.7753830439223698, + "grad_norm": 1.4783175317189345, + "learning_rate": 2.5316231180892127e-06, + "loss": 0.5918, + "step": 7591 + }, + { + "epoch": 0.7754851889683351, + "grad_norm": 1.338950286394361, + "learning_rate": 2.529423464585509e-06, + "loss": 0.5098, + "step": 7592 + }, + { + "epoch": 0.7755873340143004, + "grad_norm": 1.406579206030611, + "learning_rate": 2.527224628741234e-06, + "loss": 0.6693, + "step": 7593 + }, + { + "epoch": 0.7756894790602655, + "grad_norm": 1.337700529423796, + "learning_rate": 2.525026610797051e-06, + "loss": 0.652, + "step": 7594 + }, + { + "epoch": 0.7757916241062308, + "grad_norm": 1.4092099099407729, + "learning_rate": 2.5228294109935323e-06, + "loss": 0.6822, + "step": 7595 + }, + { + "epoch": 0.7758937691521961, + "grad_norm": 1.4722013992232574, + "learning_rate": 2.520633029571169e-06, + "loss": 0.6622, + "step": 7596 + }, + { + "epoch": 0.7759959141981614, + "grad_norm": 1.4936439600304174, + "learning_rate": 2.5184374667703494e-06, + "loss": 0.7578, + "step": 7597 + }, + { + "epoch": 0.7760980592441267, + "grad_norm": 1.6265080971670343, + "learning_rate": 2.5162427228313856e-06, + "loss": 0.7249, + "step": 7598 + }, + { + "epoch": 0.776200204290092, + "grad_norm": 1.5547549053836527, + "learning_rate": 2.5140487979944907e-06, + "loss": 0.6888, + "step": 7599 + }, + { + "epoch": 0.7763023493360572, + "grad_norm": 1.5441945973243612, + "learning_rate": 2.5118556924997882e-06, + "loss": 0.7252, + "step": 7600 + }, + { + "epoch": 0.7764044943820225, + "grad_norm": 1.4859112984741925, + "learning_rate": 2.5096634065873215e-06, + "loss": 0.701, + "step": 7601 + }, + { + "epoch": 0.7765066394279877, + "grad_norm": 1.6939475727356772, + "learning_rate": 2.507471940497035e-06, + "loss": 0.6758, + "step": 7602 + }, + { + "epoch": 0.776608784473953, + "grad_norm": 1.4976764380426018, + "learning_rate": 2.5052812944687854e-06, + "loss": 0.7398, + "step": 7603 + }, + { + "epoch": 0.7767109295199183, + "grad_norm": 1.3857948933455269, + "learning_rate": 2.503091468742337e-06, + "loss": 0.6992, + "step": 7604 + }, + { + "epoch": 0.7768130745658836, + "grad_norm": 1.4347429297035994, + "learning_rate": 2.50090246355737e-06, + "loss": 0.6946, + "step": 7605 + }, + { + "epoch": 0.7769152196118488, + "grad_norm": 1.502427772819888, + "learning_rate": 2.498714279153477e-06, + "loss": 0.6293, + "step": 7606 + }, + { + "epoch": 0.7770173646578141, + "grad_norm": 1.4880544900763362, + "learning_rate": 2.4965269157701533e-06, + "loss": 0.7517, + "step": 7607 + }, + { + "epoch": 0.7771195097037794, + "grad_norm": 1.4335461118431672, + "learning_rate": 2.494340373646805e-06, + "loss": 0.6687, + "step": 7608 + }, + { + "epoch": 0.7772216547497446, + "grad_norm": 1.6035790374291685, + "learning_rate": 2.4921546530227515e-06, + "loss": 0.6078, + "step": 7609 + }, + { + "epoch": 0.7773237997957099, + "grad_norm": 1.488469466015601, + "learning_rate": 2.4899697541372224e-06, + "loss": 0.6302, + "step": 7610 + }, + { + "epoch": 0.7774259448416752, + "grad_norm": 1.5170960014944703, + "learning_rate": 2.487785677229357e-06, + "loss": 0.7064, + "step": 7611 + }, + { + "epoch": 0.7775280898876404, + "grad_norm": 1.4380095436649278, + "learning_rate": 2.4856024225382027e-06, + "loss": 0.6981, + "step": 7612 + }, + { + "epoch": 0.7776302349336057, + "grad_norm": 1.4389450598710658, + "learning_rate": 2.4834199903027157e-06, + "loss": 0.5713, + "step": 7613 + }, + { + "epoch": 0.777732379979571, + "grad_norm": 1.6116819205263988, + "learning_rate": 2.481238380761769e-06, + "loss": 0.6738, + "step": 7614 + }, + { + "epoch": 0.7778345250255363, + "grad_norm": 1.6546075890814558, + "learning_rate": 2.4790575941541374e-06, + "loss": 0.7098, + "step": 7615 + }, + { + "epoch": 0.7779366700715016, + "grad_norm": 1.5638639184473528, + "learning_rate": 2.476877630718514e-06, + "loss": 0.7438, + "step": 7616 + }, + { + "epoch": 0.7780388151174668, + "grad_norm": 1.33827279086134, + "learning_rate": 2.4746984906934934e-06, + "loss": 0.717, + "step": 7617 + }, + { + "epoch": 0.778140960163432, + "grad_norm": 1.4094009629834923, + "learning_rate": 2.4725201743175854e-06, + "loss": 0.6304, + "step": 7618 + }, + { + "epoch": 0.7782431052093973, + "grad_norm": 1.7314463038745098, + "learning_rate": 2.4703426818292055e-06, + "loss": 0.6905, + "step": 7619 + }, + { + "epoch": 0.7783452502553626, + "grad_norm": 1.5702428441509972, + "learning_rate": 2.468166013466686e-06, + "loss": 0.7202, + "step": 7620 + }, + { + "epoch": 0.7784473953013279, + "grad_norm": 1.6683688231958707, + "learning_rate": 2.4659901694682597e-06, + "loss": 0.6697, + "step": 7621 + }, + { + "epoch": 0.7785495403472932, + "grad_norm": 1.388103610440498, + "learning_rate": 2.463815150072081e-06, + "loss": 0.6895, + "step": 7622 + }, + { + "epoch": 0.7786516853932585, + "grad_norm": 1.487254299221795, + "learning_rate": 2.4616409555162012e-06, + "loss": 0.6981, + "step": 7623 + }, + { + "epoch": 0.7787538304392237, + "grad_norm": 1.515056821045475, + "learning_rate": 2.4594675860385873e-06, + "loss": 0.6662, + "step": 7624 + }, + { + "epoch": 0.7788559754851889, + "grad_norm": 1.4267759597723066, + "learning_rate": 2.457295041877121e-06, + "loss": 0.6674, + "step": 7625 + }, + { + "epoch": 0.7789581205311542, + "grad_norm": 1.3765671190295736, + "learning_rate": 2.455123323269586e-06, + "loss": 0.6682, + "step": 7626 + }, + { + "epoch": 0.7790602655771195, + "grad_norm": 1.3672344889990873, + "learning_rate": 2.452952430453677e-06, + "loss": 0.7113, + "step": 7627 + }, + { + "epoch": 0.7791624106230848, + "grad_norm": 1.540548363282977, + "learning_rate": 2.4507823636670016e-06, + "loss": 0.7496, + "step": 7628 + }, + { + "epoch": 0.7792645556690501, + "grad_norm": 1.55100357333724, + "learning_rate": 2.4486131231470665e-06, + "loss": 0.6543, + "step": 7629 + }, + { + "epoch": 0.7793667007150153, + "grad_norm": 1.4784275752553009, + "learning_rate": 2.4464447091313103e-06, + "loss": 0.6619, + "step": 7630 + }, + { + "epoch": 0.7794688457609806, + "grad_norm": 1.4783956474879465, + "learning_rate": 2.4442771218570618e-06, + "loss": 0.675, + "step": 7631 + }, + { + "epoch": 0.7795709908069459, + "grad_norm": 1.4683689840493102, + "learning_rate": 2.4421103615615626e-06, + "loss": 0.6474, + "step": 7632 + }, + { + "epoch": 0.7796731358529111, + "grad_norm": 1.7554036087838423, + "learning_rate": 2.4399444284819685e-06, + "loss": 0.6144, + "step": 7633 + }, + { + "epoch": 0.7797752808988764, + "grad_norm": 1.3290933715433075, + "learning_rate": 2.437779322855337e-06, + "loss": 0.5352, + "step": 7634 + }, + { + "epoch": 0.7798774259448417, + "grad_norm": 1.6188972321603414, + "learning_rate": 2.4356150449186487e-06, + "loss": 0.6766, + "step": 7635 + }, + { + "epoch": 0.779979570990807, + "grad_norm": 1.4314166944686777, + "learning_rate": 2.43345159490878e-06, + "loss": 0.6879, + "step": 7636 + }, + { + "epoch": 0.7800817160367722, + "grad_norm": 1.6147653985890673, + "learning_rate": 2.43128897306252e-06, + "loss": 0.7007, + "step": 7637 + }, + { + "epoch": 0.7801838610827375, + "grad_norm": 1.447919503119416, + "learning_rate": 2.429127179616575e-06, + "loss": 0.6918, + "step": 7638 + }, + { + "epoch": 0.7802860061287028, + "grad_norm": 1.3470491877991122, + "learning_rate": 2.426966214807549e-06, + "loss": 0.63, + "step": 7639 + }, + { + "epoch": 0.780388151174668, + "grad_norm": 1.3820116857530786, + "learning_rate": 2.424806078871966e-06, + "loss": 0.7285, + "step": 7640 + }, + { + "epoch": 0.7804902962206333, + "grad_norm": 1.603025158894778, + "learning_rate": 2.422646772046252e-06, + "loss": 0.7288, + "step": 7641 + }, + { + "epoch": 0.7805924412665985, + "grad_norm": 1.5172059781463059, + "learning_rate": 2.420488294566745e-06, + "loss": 0.6552, + "step": 7642 + }, + { + "epoch": 0.7806945863125638, + "grad_norm": 1.477034566542824, + "learning_rate": 2.4183306466696877e-06, + "loss": 0.661, + "step": 7643 + }, + { + "epoch": 0.7807967313585291, + "grad_norm": 1.3917112776727087, + "learning_rate": 2.4161738285912427e-06, + "loss": 0.7714, + "step": 7644 + }, + { + "epoch": 0.7808988764044944, + "grad_norm": 1.4742150368416518, + "learning_rate": 2.4140178405674685e-06, + "loss": 0.6272, + "step": 7645 + }, + { + "epoch": 0.7810010214504597, + "grad_norm": 1.4376822368932918, + "learning_rate": 2.411862682834346e-06, + "loss": 0.6434, + "step": 7646 + }, + { + "epoch": 0.781103166496425, + "grad_norm": 1.5077904883888897, + "learning_rate": 2.4097083556277555e-06, + "loss": 0.7089, + "step": 7647 + }, + { + "epoch": 0.7812053115423901, + "grad_norm": 1.4974472962238985, + "learning_rate": 2.4075548591834897e-06, + "loss": 0.5656, + "step": 7648 + }, + { + "epoch": 0.7813074565883554, + "grad_norm": 1.445861665764525, + "learning_rate": 2.405402193737246e-06, + "loss": 0.7199, + "step": 7649 + }, + { + "epoch": 0.7814096016343207, + "grad_norm": 1.312592755924976, + "learning_rate": 2.4032503595246437e-06, + "loss": 0.7204, + "step": 7650 + }, + { + "epoch": 0.781511746680286, + "grad_norm": 1.577097964607172, + "learning_rate": 2.4010993567811956e-06, + "loss": 0.692, + "step": 7651 + }, + { + "epoch": 0.7816138917262513, + "grad_norm": 1.5185926617116954, + "learning_rate": 2.398949185742334e-06, + "loss": 0.6134, + "step": 7652 + }, + { + "epoch": 0.7817160367722166, + "grad_norm": 1.5451330602267526, + "learning_rate": 2.3967998466433916e-06, + "loss": 0.6931, + "step": 7653 + }, + { + "epoch": 0.7818181818181819, + "grad_norm": 1.318621785866221, + "learning_rate": 2.394651339719618e-06, + "loss": 0.6825, + "step": 7654 + }, + { + "epoch": 0.7819203268641471, + "grad_norm": 1.4521904128742185, + "learning_rate": 2.3925036652061717e-06, + "loss": 0.5794, + "step": 7655 + }, + { + "epoch": 0.7820224719101123, + "grad_norm": 1.532754558659476, + "learning_rate": 2.3903568233381146e-06, + "loss": 0.7097, + "step": 7656 + }, + { + "epoch": 0.7821246169560776, + "grad_norm": 1.4848551096716975, + "learning_rate": 2.388210814350419e-06, + "loss": 0.7281, + "step": 7657 + }, + { + "epoch": 0.7822267620020429, + "grad_norm": 1.5240290141762678, + "learning_rate": 2.386065638477966e-06, + "loss": 0.7469, + "step": 7658 + }, + { + "epoch": 0.7823289070480082, + "grad_norm": 1.5137068931192816, + "learning_rate": 2.38392129595555e-06, + "loss": 0.6739, + "step": 7659 + }, + { + "epoch": 0.7824310520939735, + "grad_norm": 1.4835196680465237, + "learning_rate": 2.3817777870178692e-06, + "loss": 0.7589, + "step": 7660 + }, + { + "epoch": 0.7825331971399387, + "grad_norm": 1.4313653341396373, + "learning_rate": 2.3796351118995287e-06, + "loss": 0.6427, + "step": 7661 + }, + { + "epoch": 0.782635342185904, + "grad_norm": 1.6172344529486073, + "learning_rate": 2.377493270835051e-06, + "loss": 0.7277, + "step": 7662 + }, + { + "epoch": 0.7827374872318692, + "grad_norm": 1.3378542726162284, + "learning_rate": 2.3753522640588567e-06, + "loss": 0.6077, + "step": 7663 + }, + { + "epoch": 0.7828396322778345, + "grad_norm": 1.6194442006199115, + "learning_rate": 2.373212091805287e-06, + "loss": 0.7198, + "step": 7664 + }, + { + "epoch": 0.7829417773237998, + "grad_norm": 1.5822903909886965, + "learning_rate": 2.371072754308581e-06, + "loss": 0.7089, + "step": 7665 + }, + { + "epoch": 0.783043922369765, + "grad_norm": 1.4514494964980613, + "learning_rate": 2.36893425180289e-06, + "loss": 0.6891, + "step": 7666 + }, + { + "epoch": 0.7831460674157303, + "grad_norm": 1.508272499985356, + "learning_rate": 2.3667965845222774e-06, + "loss": 0.6542, + "step": 7667 + }, + { + "epoch": 0.7832482124616956, + "grad_norm": 1.5027684876561473, + "learning_rate": 2.364659752700705e-06, + "loss": 0.6781, + "step": 7668 + }, + { + "epoch": 0.7833503575076609, + "grad_norm": 1.7554138002883881, + "learning_rate": 2.362523756572058e-06, + "loss": 0.7918, + "step": 7669 + }, + { + "epoch": 0.7834525025536262, + "grad_norm": 1.4273235942365043, + "learning_rate": 2.3603885963701225e-06, + "loss": 0.681, + "step": 7670 + }, + { + "epoch": 0.7835546475995914, + "grad_norm": 1.523686588231696, + "learning_rate": 2.3582542723285904e-06, + "loss": 0.6414, + "step": 7671 + }, + { + "epoch": 0.7836567926455567, + "grad_norm": 1.5645708272699634, + "learning_rate": 2.356120784681065e-06, + "loss": 0.7478, + "step": 7672 + }, + { + "epoch": 0.7837589376915219, + "grad_norm": 1.5715813585413854, + "learning_rate": 2.353988133661056e-06, + "loss": 0.8196, + "step": 7673 + }, + { + "epoch": 0.7838610827374872, + "grad_norm": 1.4354206242403733, + "learning_rate": 2.3518563195019893e-06, + "loss": 0.6108, + "step": 7674 + }, + { + "epoch": 0.7839632277834525, + "grad_norm": 1.386126888076502, + "learning_rate": 2.3497253424371892e-06, + "loss": 0.6445, + "step": 7675 + }, + { + "epoch": 0.7840653728294178, + "grad_norm": 1.4041361724344759, + "learning_rate": 2.3475952026998927e-06, + "loss": 0.6445, + "step": 7676 + }, + { + "epoch": 0.7841675178753831, + "grad_norm": 1.6572468984466966, + "learning_rate": 2.3454659005232425e-06, + "loss": 0.745, + "step": 7677 + }, + { + "epoch": 0.7842696629213484, + "grad_norm": 1.5299766591611008, + "learning_rate": 2.343337436140295e-06, + "loss": 0.6797, + "step": 7678 + }, + { + "epoch": 0.7843718079673135, + "grad_norm": 1.376831705725251, + "learning_rate": 2.3412098097840154e-06, + "loss": 0.6109, + "step": 7679 + }, + { + "epoch": 0.7844739530132788, + "grad_norm": 1.4169714742399724, + "learning_rate": 2.3390830216872697e-06, + "loss": 0.6675, + "step": 7680 + }, + { + "epoch": 0.7845760980592441, + "grad_norm": 1.3636722903648457, + "learning_rate": 2.3369570720828372e-06, + "loss": 0.6785, + "step": 7681 + }, + { + "epoch": 0.7846782431052094, + "grad_norm": 1.4925143166837687, + "learning_rate": 2.3348319612034042e-06, + "loss": 0.836, + "step": 7682 + }, + { + "epoch": 0.7847803881511747, + "grad_norm": 1.3346006933195858, + "learning_rate": 2.3327076892815626e-06, + "loss": 0.6976, + "step": 7683 + }, + { + "epoch": 0.78488253319714, + "grad_norm": 1.4023282393407008, + "learning_rate": 2.3305842565498203e-06, + "loss": 0.6069, + "step": 7684 + }, + { + "epoch": 0.7849846782431052, + "grad_norm": 1.5711373378897706, + "learning_rate": 2.3284616632405842e-06, + "loss": 0.7301, + "step": 7685 + }, + { + "epoch": 0.7850868232890705, + "grad_norm": 1.4996578332272632, + "learning_rate": 2.3263399095861785e-06, + "loss": 0.6176, + "step": 7686 + }, + { + "epoch": 0.7851889683350357, + "grad_norm": 1.5191855583785092, + "learning_rate": 2.3242189958188264e-06, + "loss": 0.576, + "step": 7687 + }, + { + "epoch": 0.785291113381001, + "grad_norm": 1.5242311616271274, + "learning_rate": 2.3220989221706626e-06, + "loss": 0.5946, + "step": 7688 + }, + { + "epoch": 0.7853932584269663, + "grad_norm": 1.6345507436252389, + "learning_rate": 2.3199796888737338e-06, + "loss": 0.7623, + "step": 7689 + }, + { + "epoch": 0.7854954034729316, + "grad_norm": 1.466363102995361, + "learning_rate": 2.317861296159991e-06, + "loss": 0.6504, + "step": 7690 + }, + { + "epoch": 0.7855975485188968, + "grad_norm": 1.3461696448152913, + "learning_rate": 2.3157437442612927e-06, + "loss": 0.6434, + "step": 7691 + }, + { + "epoch": 0.7856996935648621, + "grad_norm": 1.4707618637234265, + "learning_rate": 2.3136270334094035e-06, + "loss": 0.7655, + "step": 7692 + }, + { + "epoch": 0.7858018386108274, + "grad_norm": 1.5984912603227295, + "learning_rate": 2.311511163836001e-06, + "loss": 0.7024, + "step": 7693 + }, + { + "epoch": 0.7859039836567926, + "grad_norm": 1.4768427932076362, + "learning_rate": 2.3093961357726723e-06, + "loss": 0.656, + "step": 7694 + }, + { + "epoch": 0.7860061287027579, + "grad_norm": 1.5417951071721385, + "learning_rate": 2.307281949450905e-06, + "loss": 0.7571, + "step": 7695 + }, + { + "epoch": 0.7861082737487232, + "grad_norm": 1.416543992798943, + "learning_rate": 2.3051686051020983e-06, + "loss": 0.7372, + "step": 7696 + }, + { + "epoch": 0.7862104187946884, + "grad_norm": 1.5643679749839585, + "learning_rate": 2.303056102957557e-06, + "loss": 0.6975, + "step": 7697 + }, + { + "epoch": 0.7863125638406537, + "grad_norm": 1.462697740373663, + "learning_rate": 2.3009444432485007e-06, + "loss": 0.7783, + "step": 7698 + }, + { + "epoch": 0.786414708886619, + "grad_norm": 1.5875406803774808, + "learning_rate": 2.2988336262060485e-06, + "loss": 0.7481, + "step": 7699 + }, + { + "epoch": 0.7865168539325843, + "grad_norm": 1.555921731479916, + "learning_rate": 2.2967236520612322e-06, + "loss": 0.7263, + "step": 7700 + }, + { + "epoch": 0.7866189989785496, + "grad_norm": 1.4674329237488177, + "learning_rate": 2.2946145210449864e-06, + "loss": 0.5993, + "step": 7701 + }, + { + "epoch": 0.7867211440245148, + "grad_norm": 1.4915759610494845, + "learning_rate": 2.292506233388162e-06, + "loss": 0.7766, + "step": 7702 + }, + { + "epoch": 0.78682328907048, + "grad_norm": 1.415489653003323, + "learning_rate": 2.2903987893215086e-06, + "loss": 0.6813, + "step": 7703 + }, + { + "epoch": 0.7869254341164453, + "grad_norm": 1.6817373728547307, + "learning_rate": 2.2882921890756906e-06, + "loss": 0.7087, + "step": 7704 + }, + { + "epoch": 0.7870275791624106, + "grad_norm": 1.4575124396260595, + "learning_rate": 2.2861864328812744e-06, + "loss": 0.6379, + "step": 7705 + }, + { + "epoch": 0.7871297242083759, + "grad_norm": 1.5891209368904666, + "learning_rate": 2.2840815209687374e-06, + "loss": 0.7641, + "step": 7706 + }, + { + "epoch": 0.7872318692543412, + "grad_norm": 1.38507553831139, + "learning_rate": 2.28197745356846e-06, + "loss": 0.6488, + "step": 7707 + }, + { + "epoch": 0.7873340143003065, + "grad_norm": 1.6155673762899823, + "learning_rate": 2.2798742309107403e-06, + "loss": 0.641, + "step": 7708 + }, + { + "epoch": 0.7874361593462718, + "grad_norm": 1.4748210167259121, + "learning_rate": 2.2777718532257697e-06, + "loss": 0.7034, + "step": 7709 + }, + { + "epoch": 0.7875383043922369, + "grad_norm": 1.3663874776661888, + "learning_rate": 2.2756703207436627e-06, + "loss": 0.6579, + "step": 7710 + }, + { + "epoch": 0.7876404494382022, + "grad_norm": 1.54911097365704, + "learning_rate": 2.27356963369443e-06, + "loss": 0.6903, + "step": 7711 + }, + { + "epoch": 0.7877425944841675, + "grad_norm": 1.601233941436273, + "learning_rate": 2.2714697923079887e-06, + "loss": 0.6352, + "step": 7712 + }, + { + "epoch": 0.7878447395301328, + "grad_norm": 1.4373921554482778, + "learning_rate": 2.2693707968141763e-06, + "loss": 0.6822, + "step": 7713 + }, + { + "epoch": 0.7879468845760981, + "grad_norm": 1.4424820628253012, + "learning_rate": 2.267272647442724e-06, + "loss": 0.7219, + "step": 7714 + }, + { + "epoch": 0.7880490296220634, + "grad_norm": 1.533168956869607, + "learning_rate": 2.265175344423276e-06, + "loss": 0.7286, + "step": 7715 + }, + { + "epoch": 0.7881511746680286, + "grad_norm": 1.41928161159938, + "learning_rate": 2.2630788879853815e-06, + "loss": 0.6958, + "step": 7716 + }, + { + "epoch": 0.7882533197139938, + "grad_norm": 1.5728201144796565, + "learning_rate": 2.2609832783585018e-06, + "loss": 0.6634, + "step": 7717 + }, + { + "epoch": 0.7883554647599591, + "grad_norm": 1.6185838848821745, + "learning_rate": 2.2588885157720053e-06, + "loss": 0.6313, + "step": 7718 + }, + { + "epoch": 0.7884576098059244, + "grad_norm": 1.445847426059235, + "learning_rate": 2.2567946004551612e-06, + "loss": 0.7044, + "step": 7719 + }, + { + "epoch": 0.7885597548518897, + "grad_norm": 1.5788010115964295, + "learning_rate": 2.254701532637151e-06, + "loss": 0.7247, + "step": 7720 + }, + { + "epoch": 0.788661899897855, + "grad_norm": 1.5785515159173298, + "learning_rate": 2.2526093125470627e-06, + "loss": 0.6895, + "step": 7721 + }, + { + "epoch": 0.7887640449438202, + "grad_norm": 1.5414168102281451, + "learning_rate": 2.2505179404138876e-06, + "loss": 0.7424, + "step": 7722 + }, + { + "epoch": 0.7888661899897855, + "grad_norm": 1.4583423290212223, + "learning_rate": 2.2484274164665333e-06, + "loss": 0.6, + "step": 7723 + }, + { + "epoch": 0.7889683350357508, + "grad_norm": 1.502839995420565, + "learning_rate": 2.246337740933806e-06, + "loss": 0.717, + "step": 7724 + }, + { + "epoch": 0.789070480081716, + "grad_norm": 1.494109657510985, + "learning_rate": 2.244248914044421e-06, + "loss": 0.6619, + "step": 7725 + }, + { + "epoch": 0.7891726251276813, + "grad_norm": 1.4023564978251515, + "learning_rate": 2.2421609360270047e-06, + "loss": 0.6693, + "step": 7726 + }, + { + "epoch": 0.7892747701736466, + "grad_norm": 1.3483217304961572, + "learning_rate": 2.2400738071100845e-06, + "loss": 0.5749, + "step": 7727 + }, + { + "epoch": 0.7893769152196118, + "grad_norm": 1.5477708024937964, + "learning_rate": 2.237987527522102e-06, + "loss": 0.786, + "step": 7728 + }, + { + "epoch": 0.7894790602655771, + "grad_norm": 1.4838746471082362, + "learning_rate": 2.2359020974913993e-06, + "loss": 0.6476, + "step": 7729 + }, + { + "epoch": 0.7895812053115424, + "grad_norm": 1.6475066396148115, + "learning_rate": 2.2338175172462283e-06, + "loss": 0.7322, + "step": 7730 + }, + { + "epoch": 0.7896833503575077, + "grad_norm": 1.4992696030548056, + "learning_rate": 2.2317337870147447e-06, + "loss": 0.7326, + "step": 7731 + }, + { + "epoch": 0.789785495403473, + "grad_norm": 1.5950771049805939, + "learning_rate": 2.2296509070250204e-06, + "loss": 0.72, + "step": 7732 + }, + { + "epoch": 0.7898876404494382, + "grad_norm": 1.5274755871763535, + "learning_rate": 2.2275688775050207e-06, + "loss": 0.8128, + "step": 7733 + }, + { + "epoch": 0.7899897854954034, + "grad_norm": 2.3044514493104735, + "learning_rate": 2.2254876986826325e-06, + "loss": 0.6297, + "step": 7734 + }, + { + "epoch": 0.7900919305413687, + "grad_norm": 1.458553851396111, + "learning_rate": 2.2234073707856396e-06, + "loss": 0.693, + "step": 7735 + }, + { + "epoch": 0.790194075587334, + "grad_norm": 1.4160446000964098, + "learning_rate": 2.2213278940417324e-06, + "loss": 0.6243, + "step": 7736 + }, + { + "epoch": 0.7902962206332993, + "grad_norm": 1.5466232503962603, + "learning_rate": 2.2192492686785118e-06, + "loss": 0.7368, + "step": 7737 + }, + { + "epoch": 0.7903983656792646, + "grad_norm": 1.6364154664122268, + "learning_rate": 2.217171494923488e-06, + "loss": 0.7579, + "step": 7738 + }, + { + "epoch": 0.7905005107252299, + "grad_norm": 1.429565326953612, + "learning_rate": 2.215094573004072e-06, + "loss": 0.7286, + "step": 7739 + }, + { + "epoch": 0.7906026557711952, + "grad_norm": 1.5316915083205112, + "learning_rate": 2.2130185031475846e-06, + "loss": 0.7297, + "step": 7740 + }, + { + "epoch": 0.7907048008171603, + "grad_norm": 1.4328271417827034, + "learning_rate": 2.2109432855812506e-06, + "loss": 0.6855, + "step": 7741 + }, + { + "epoch": 0.7908069458631256, + "grad_norm": 1.5359671614090045, + "learning_rate": 2.2088689205322065e-06, + "loss": 0.6877, + "step": 7742 + }, + { + "epoch": 0.7909090909090909, + "grad_norm": 1.3718872049803184, + "learning_rate": 2.2067954082274957e-06, + "loss": 0.6588, + "step": 7743 + }, + { + "epoch": 0.7910112359550562, + "grad_norm": 1.254225192578895, + "learning_rate": 2.2047227488940612e-06, + "loss": 0.6267, + "step": 7744 + }, + { + "epoch": 0.7911133810010215, + "grad_norm": 1.4011968923465954, + "learning_rate": 2.2026509427587605e-06, + "loss": 0.6158, + "step": 7745 + }, + { + "epoch": 0.7912155260469868, + "grad_norm": 1.6006798977682952, + "learning_rate": 2.200579990048347e-06, + "loss": 0.7223, + "step": 7746 + }, + { + "epoch": 0.791317671092952, + "grad_norm": 1.5850799455521105, + "learning_rate": 2.1985098909894966e-06, + "loss": 0.543, + "step": 7747 + }, + { + "epoch": 0.7914198161389172, + "grad_norm": 1.5306118364914878, + "learning_rate": 2.196440645808778e-06, + "loss": 0.7339, + "step": 7748 + }, + { + "epoch": 0.7915219611848825, + "grad_norm": 1.4610803300551756, + "learning_rate": 2.194372254732671e-06, + "loss": 0.7584, + "step": 7749 + }, + { + "epoch": 0.7916241062308478, + "grad_norm": 1.4929412606662422, + "learning_rate": 2.1923047179875657e-06, + "loss": 0.6773, + "step": 7750 + }, + { + "epoch": 0.7917262512768131, + "grad_norm": 1.4306215611670807, + "learning_rate": 2.190238035799751e-06, + "loss": 0.671, + "step": 7751 + }, + { + "epoch": 0.7918283963227783, + "grad_norm": 1.5143401090958584, + "learning_rate": 2.1881722083954315e-06, + "loss": 0.7536, + "step": 7752 + }, + { + "epoch": 0.7919305413687436, + "grad_norm": 1.3999228204357042, + "learning_rate": 2.186107236000712e-06, + "loss": 0.7278, + "step": 7753 + }, + { + "epoch": 0.7920326864147089, + "grad_norm": 1.7467814304123583, + "learning_rate": 2.1840431188416023e-06, + "loss": 0.6505, + "step": 7754 + }, + { + "epoch": 0.7921348314606742, + "grad_norm": 1.4953040765986458, + "learning_rate": 2.181979857144024e-06, + "loss": 0.6531, + "step": 7755 + }, + { + "epoch": 0.7922369765066394, + "grad_norm": 1.7642953030621489, + "learning_rate": 2.1799174511337986e-06, + "loss": 0.7327, + "step": 7756 + }, + { + "epoch": 0.7923391215526047, + "grad_norm": 1.542536512103279, + "learning_rate": 2.177855901036661e-06, + "loss": 0.6371, + "step": 7757 + }, + { + "epoch": 0.79244126659857, + "grad_norm": 1.4940264897239741, + "learning_rate": 2.1757952070782507e-06, + "loss": 0.6538, + "step": 7758 + }, + { + "epoch": 0.7925434116445352, + "grad_norm": 1.5707196521044482, + "learning_rate": 2.17373536948411e-06, + "loss": 0.7362, + "step": 7759 + }, + { + "epoch": 0.7926455566905005, + "grad_norm": 1.616765095131558, + "learning_rate": 2.17167638847969e-06, + "loss": 0.7563, + "step": 7760 + }, + { + "epoch": 0.7927477017364658, + "grad_norm": 1.4381821085408717, + "learning_rate": 2.169618264290344e-06, + "loss": 0.6801, + "step": 7761 + }, + { + "epoch": 0.7928498467824311, + "grad_norm": 1.4715761686181472, + "learning_rate": 2.1675609971413402e-06, + "loss": 0.6975, + "step": 7762 + }, + { + "epoch": 0.7929519918283964, + "grad_norm": 1.5236353731762902, + "learning_rate": 2.1655045872578475e-06, + "loss": 0.7424, + "step": 7763 + }, + { + "epoch": 0.7930541368743615, + "grad_norm": 1.5206858183983176, + "learning_rate": 2.1634490348649372e-06, + "loss": 0.7655, + "step": 7764 + }, + { + "epoch": 0.7931562819203268, + "grad_norm": 1.4346918407738651, + "learning_rate": 2.1613943401875924e-06, + "loss": 0.7352, + "step": 7765 + }, + { + "epoch": 0.7932584269662921, + "grad_norm": 1.4833452816491084, + "learning_rate": 2.1593405034506998e-06, + "loss": 0.6923, + "step": 7766 + }, + { + "epoch": 0.7933605720122574, + "grad_norm": 1.420371761572305, + "learning_rate": 2.157287524879058e-06, + "loss": 0.5678, + "step": 7767 + }, + { + "epoch": 0.7934627170582227, + "grad_norm": 1.5075347096621476, + "learning_rate": 2.1552354046973646e-06, + "loss": 0.6416, + "step": 7768 + }, + { + "epoch": 0.793564862104188, + "grad_norm": 1.5575647830266741, + "learning_rate": 2.1531841431302234e-06, + "loss": 0.7177, + "step": 7769 + }, + { + "epoch": 0.7936670071501533, + "grad_norm": 1.5433175771119019, + "learning_rate": 2.151133740402148e-06, + "loss": 0.7175, + "step": 7770 + }, + { + "epoch": 0.7937691521961185, + "grad_norm": 1.7005725423204792, + "learning_rate": 2.1490841967375532e-06, + "loss": 0.607, + "step": 7771 + }, + { + "epoch": 0.7938712972420837, + "grad_norm": 1.5178772110503094, + "learning_rate": 2.147035512360768e-06, + "loss": 0.7173, + "step": 7772 + }, + { + "epoch": 0.793973442288049, + "grad_norm": 1.4680008683515775, + "learning_rate": 2.1449876874960163e-06, + "loss": 0.7326, + "step": 7773 + }, + { + "epoch": 0.7940755873340143, + "grad_norm": 1.495083306898762, + "learning_rate": 2.1429407223674403e-06, + "loss": 0.5941, + "step": 7774 + }, + { + "epoch": 0.7941777323799796, + "grad_norm": 1.4657531382249784, + "learning_rate": 2.1408946171990785e-06, + "loss": 0.6601, + "step": 7775 + }, + { + "epoch": 0.7942798774259449, + "grad_norm": 1.433108704608721, + "learning_rate": 2.1388493722148763e-06, + "loss": 0.629, + "step": 7776 + }, + { + "epoch": 0.7943820224719101, + "grad_norm": 1.435341016400755, + "learning_rate": 2.136804987638691e-06, + "loss": 0.6752, + "step": 7777 + }, + { + "epoch": 0.7944841675178754, + "grad_norm": 1.4858876831898125, + "learning_rate": 2.1347614636942815e-06, + "loss": 0.6933, + "step": 7778 + }, + { + "epoch": 0.7945863125638406, + "grad_norm": 1.5384439538433965, + "learning_rate": 2.13271880060531e-06, + "loss": 0.6353, + "step": 7779 + }, + { + "epoch": 0.7946884576098059, + "grad_norm": 1.4775748546862888, + "learning_rate": 2.130676998595347e-06, + "loss": 0.6364, + "step": 7780 + }, + { + "epoch": 0.7947906026557712, + "grad_norm": 1.4768966528031424, + "learning_rate": 2.1286360578878693e-06, + "loss": 0.7569, + "step": 7781 + }, + { + "epoch": 0.7948927477017365, + "grad_norm": 1.5743931378204703, + "learning_rate": 2.126595978706265e-06, + "loss": 0.6567, + "step": 7782 + }, + { + "epoch": 0.7949948927477017, + "grad_norm": 1.473910644840203, + "learning_rate": 2.1245567612738162e-06, + "loss": 0.6736, + "step": 7783 + }, + { + "epoch": 0.795097037793667, + "grad_norm": 1.804966063752203, + "learning_rate": 2.1225184058137193e-06, + "loss": 0.7526, + "step": 7784 + }, + { + "epoch": 0.7951991828396323, + "grad_norm": 1.4447521422290606, + "learning_rate": 2.120480912549069e-06, + "loss": 0.7293, + "step": 7785 + }, + { + "epoch": 0.7953013278855976, + "grad_norm": 1.4685435216149456, + "learning_rate": 2.118444281702876e-06, + "loss": 0.8328, + "step": 7786 + }, + { + "epoch": 0.7954034729315628, + "grad_norm": 1.674530418337665, + "learning_rate": 2.1164085134980495e-06, + "loss": 0.7127, + "step": 7787 + }, + { + "epoch": 0.7955056179775281, + "grad_norm": 1.2504057617445306, + "learning_rate": 2.114373608157404e-06, + "loss": 0.5616, + "step": 7788 + }, + { + "epoch": 0.7956077630234933, + "grad_norm": 1.5912585076784915, + "learning_rate": 2.1123395659036596e-06, + "loss": 0.7491, + "step": 7789 + }, + { + "epoch": 0.7957099080694586, + "grad_norm": 1.4060460282399452, + "learning_rate": 2.1103063869594486e-06, + "loss": 0.667, + "step": 7790 + }, + { + "epoch": 0.7958120531154239, + "grad_norm": 1.4262370083762665, + "learning_rate": 2.108274071547297e-06, + "loss": 0.6282, + "step": 7791 + }, + { + "epoch": 0.7959141981613892, + "grad_norm": 1.4336262958184738, + "learning_rate": 2.1062426198896514e-06, + "loss": 0.7297, + "step": 7792 + }, + { + "epoch": 0.7960163432073545, + "grad_norm": 1.6402344100111523, + "learning_rate": 2.10421203220885e-06, + "loss": 0.7123, + "step": 7793 + }, + { + "epoch": 0.7961184882533198, + "grad_norm": 1.6616797050760206, + "learning_rate": 2.1021823087271432e-06, + "loss": 0.7789, + "step": 7794 + }, + { + "epoch": 0.7962206332992849, + "grad_norm": 1.4345402458261807, + "learning_rate": 2.100153449666682e-06, + "loss": 0.6388, + "step": 7795 + }, + { + "epoch": 0.7963227783452502, + "grad_norm": 1.468227603833657, + "learning_rate": 2.0981254552495334e-06, + "loss": 0.6402, + "step": 7796 + }, + { + "epoch": 0.7964249233912155, + "grad_norm": 1.5579346912602685, + "learning_rate": 2.0960983256976565e-06, + "loss": 0.737, + "step": 7797 + }, + { + "epoch": 0.7965270684371808, + "grad_norm": 3.482558951292091, + "learning_rate": 2.0940720612329258e-06, + "loss": 0.7463, + "step": 7798 + }, + { + "epoch": 0.7966292134831461, + "grad_norm": 1.4661638893927695, + "learning_rate": 2.0920466620771174e-06, + "loss": 0.611, + "step": 7799 + }, + { + "epoch": 0.7967313585291114, + "grad_norm": 1.5498746242493828, + "learning_rate": 2.0900221284519074e-06, + "loss": 0.7438, + "step": 7800 + }, + { + "epoch": 0.7968335035750767, + "grad_norm": 1.4277591910811147, + "learning_rate": 2.0879984605788882e-06, + "loss": 0.5957, + "step": 7801 + }, + { + "epoch": 0.7969356486210418, + "grad_norm": 1.390686777835806, + "learning_rate": 2.085975658679551e-06, + "loss": 0.6177, + "step": 7802 + }, + { + "epoch": 0.7970377936670071, + "grad_norm": 1.5217939386383543, + "learning_rate": 2.0839537229752893e-06, + "loss": 0.6949, + "step": 7803 + }, + { + "epoch": 0.7971399387129724, + "grad_norm": 1.5078474293340662, + "learning_rate": 2.081932653687405e-06, + "loss": 0.7715, + "step": 7804 + }, + { + "epoch": 0.7972420837589377, + "grad_norm": 1.5915791364209657, + "learning_rate": 2.079912451037107e-06, + "loss": 0.6397, + "step": 7805 + }, + { + "epoch": 0.797344228804903, + "grad_norm": 1.5338049938419043, + "learning_rate": 2.077893115245512e-06, + "loss": 0.7746, + "step": 7806 + }, + { + "epoch": 0.7974463738508683, + "grad_norm": 1.4118878727361974, + "learning_rate": 2.0758746465336333e-06, + "loss": 0.712, + "step": 7807 + }, + { + "epoch": 0.7975485188968335, + "grad_norm": 1.598702725140203, + "learning_rate": 2.073857045122395e-06, + "loss": 0.7753, + "step": 7808 + }, + { + "epoch": 0.7976506639427988, + "grad_norm": 1.5440358595469432, + "learning_rate": 2.0718403112326224e-06, + "loss": 0.6232, + "step": 7809 + }, + { + "epoch": 0.797752808988764, + "grad_norm": 1.4412255520040955, + "learning_rate": 2.069824445085048e-06, + "loss": 0.6509, + "step": 7810 + }, + { + "epoch": 0.7978549540347293, + "grad_norm": 1.5233929953072618, + "learning_rate": 2.0678094469003152e-06, + "loss": 0.7029, + "step": 7811 + }, + { + "epoch": 0.7979570990806946, + "grad_norm": 1.5344864200997315, + "learning_rate": 2.065795316898962e-06, + "loss": 0.71, + "step": 7812 + }, + { + "epoch": 0.7980592441266599, + "grad_norm": 1.4710654671240766, + "learning_rate": 2.0637820553014385e-06, + "loss": 0.7155, + "step": 7813 + }, + { + "epoch": 0.7981613891726251, + "grad_norm": 1.4652225766492732, + "learning_rate": 2.0617696623280937e-06, + "loss": 0.7055, + "step": 7814 + }, + { + "epoch": 0.7982635342185904, + "grad_norm": 1.489376405220086, + "learning_rate": 2.059758138199187e-06, + "loss": 0.6514, + "step": 7815 + }, + { + "epoch": 0.7983656792645557, + "grad_norm": 1.519029822602198, + "learning_rate": 2.0577474831348864e-06, + "loss": 0.7517, + "step": 7816 + }, + { + "epoch": 0.798467824310521, + "grad_norm": 1.6589740526821566, + "learning_rate": 2.0557376973552544e-06, + "loss": 0.6488, + "step": 7817 + }, + { + "epoch": 0.7985699693564862, + "grad_norm": 1.4441806248944669, + "learning_rate": 2.053728781080264e-06, + "loss": 0.7254, + "step": 7818 + }, + { + "epoch": 0.7986721144024514, + "grad_norm": 1.4006448883095295, + "learning_rate": 2.0517207345297897e-06, + "loss": 0.6432, + "step": 7819 + }, + { + "epoch": 0.7987742594484167, + "grad_norm": 1.7425232741713046, + "learning_rate": 2.0497135579236195e-06, + "loss": 0.7133, + "step": 7820 + }, + { + "epoch": 0.798876404494382, + "grad_norm": 1.5223818585166318, + "learning_rate": 2.0477072514814354e-06, + "loss": 0.691, + "step": 7821 + }, + { + "epoch": 0.7989785495403473, + "grad_norm": 1.566802337374586, + "learning_rate": 2.045701815422829e-06, + "loss": 0.724, + "step": 7822 + }, + { + "epoch": 0.7990806945863126, + "grad_norm": 1.429739979681164, + "learning_rate": 2.043697249967301e-06, + "loss": 0.636, + "step": 7823 + }, + { + "epoch": 0.7991828396322779, + "grad_norm": 1.5745168034946238, + "learning_rate": 2.041693555334249e-06, + "loss": 0.6891, + "step": 7824 + }, + { + "epoch": 0.7992849846782432, + "grad_norm": 1.3969928437003056, + "learning_rate": 2.039690731742976e-06, + "loss": 0.6785, + "step": 7825 + }, + { + "epoch": 0.7993871297242083, + "grad_norm": 1.395234069145252, + "learning_rate": 2.0376887794126986e-06, + "loss": 0.6319, + "step": 7826 + }, + { + "epoch": 0.7994892747701736, + "grad_norm": 1.4144523298206955, + "learning_rate": 2.0356876985625285e-06, + "loss": 0.5293, + "step": 7827 + }, + { + "epoch": 0.7995914198161389, + "grad_norm": 1.5673781311751296, + "learning_rate": 2.0336874894114856e-06, + "loss": 0.6844, + "step": 7828 + }, + { + "epoch": 0.7996935648621042, + "grad_norm": 1.4040287387446788, + "learning_rate": 2.0316881521784916e-06, + "loss": 0.7407, + "step": 7829 + }, + { + "epoch": 0.7997957099080695, + "grad_norm": 1.468170969782854, + "learning_rate": 2.0296896870823767e-06, + "loss": 0.6657, + "step": 7830 + }, + { + "epoch": 0.7998978549540348, + "grad_norm": 1.4338864919497423, + "learning_rate": 2.0276920943418777e-06, + "loss": 0.7278, + "step": 7831 + }, + { + "epoch": 0.8, + "grad_norm": 1.4287010507776465, + "learning_rate": 2.02569537417563e-06, + "loss": 0.7813, + "step": 7832 + }, + { + "epoch": 0.8001021450459652, + "grad_norm": 1.479377825599003, + "learning_rate": 2.0236995268021753e-06, + "loss": 0.6252, + "step": 7833 + }, + { + "epoch": 0.8002042900919305, + "grad_norm": 1.3965300952698099, + "learning_rate": 2.021704552439959e-06, + "loss": 0.6533, + "step": 7834 + }, + { + "epoch": 0.8003064351378958, + "grad_norm": 1.5008063754573753, + "learning_rate": 2.0197104513073364e-06, + "loss": 0.8283, + "step": 7835 + }, + { + "epoch": 0.8004085801838611, + "grad_norm": 1.5756820422433289, + "learning_rate": 2.017717223622561e-06, + "loss": 0.7619, + "step": 7836 + }, + { + "epoch": 0.8005107252298264, + "grad_norm": 1.3299366697271333, + "learning_rate": 2.0157248696037913e-06, + "loss": 0.6385, + "step": 7837 + }, + { + "epoch": 0.8006128702757916, + "grad_norm": 1.4951029011046026, + "learning_rate": 2.0137333894690913e-06, + "loss": 0.6253, + "step": 7838 + }, + { + "epoch": 0.8007150153217569, + "grad_norm": 1.407405901496055, + "learning_rate": 2.011742783436432e-06, + "loss": 0.8059, + "step": 7839 + }, + { + "epoch": 0.8008171603677222, + "grad_norm": 1.5685550241561312, + "learning_rate": 2.0097530517236887e-06, + "loss": 0.7203, + "step": 7840 + }, + { + "epoch": 0.8009193054136874, + "grad_norm": 1.5488988160344208, + "learning_rate": 2.007764194548636e-06, + "loss": 0.5405, + "step": 7841 + }, + { + "epoch": 0.8010214504596527, + "grad_norm": 1.6215162095236308, + "learning_rate": 2.0057762121289557e-06, + "loss": 0.6855, + "step": 7842 + }, + { + "epoch": 0.801123595505618, + "grad_norm": 1.476000901025003, + "learning_rate": 2.0037891046822343e-06, + "loss": 0.741, + "step": 7843 + }, + { + "epoch": 0.8012257405515832, + "grad_norm": 1.3818520490469952, + "learning_rate": 2.0018028724259588e-06, + "loss": 0.6146, + "step": 7844 + }, + { + "epoch": 0.8013278855975485, + "grad_norm": 1.5172993527121303, + "learning_rate": 1.999817515577529e-06, + "loss": 0.6863, + "step": 7845 + }, + { + "epoch": 0.8014300306435138, + "grad_norm": 1.4830161140553875, + "learning_rate": 1.9978330343542384e-06, + "loss": 0.6809, + "step": 7846 + }, + { + "epoch": 0.8015321756894791, + "grad_norm": 1.3914812293102228, + "learning_rate": 1.9958494289732957e-06, + "loss": 0.7178, + "step": 7847 + }, + { + "epoch": 0.8016343207354444, + "grad_norm": 1.3899691250074713, + "learning_rate": 1.993866699651803e-06, + "loss": 0.6144, + "step": 7848 + }, + { + "epoch": 0.8017364657814096, + "grad_norm": 1.507668945810921, + "learning_rate": 1.991884846606771e-06, + "loss": 0.788, + "step": 7849 + }, + { + "epoch": 0.8018386108273748, + "grad_norm": 1.5818428599766068, + "learning_rate": 1.9899038700551178e-06, + "loss": 0.7728, + "step": 7850 + }, + { + "epoch": 0.8019407558733401, + "grad_norm": 1.623485864049958, + "learning_rate": 1.987923770213662e-06, + "loss": 0.7417, + "step": 7851 + }, + { + "epoch": 0.8020429009193054, + "grad_norm": 1.4559210447437414, + "learning_rate": 1.9859445472991257e-06, + "loss": 0.6484, + "step": 7852 + }, + { + "epoch": 0.8021450459652707, + "grad_norm": 1.3207792022236151, + "learning_rate": 1.983966201528135e-06, + "loss": 0.5143, + "step": 7853 + }, + { + "epoch": 0.802247191011236, + "grad_norm": 1.5882542768427277, + "learning_rate": 1.9819887331172204e-06, + "loss": 0.6981, + "step": 7854 + }, + { + "epoch": 0.8023493360572013, + "grad_norm": 1.4830620488288764, + "learning_rate": 1.9800121422828233e-06, + "loss": 0.6929, + "step": 7855 + }, + { + "epoch": 0.8024514811031664, + "grad_norm": 1.412441450711497, + "learning_rate": 1.978036429241279e-06, + "loss": 0.6024, + "step": 7856 + }, + { + "epoch": 0.8025536261491317, + "grad_norm": 1.4299424048926375, + "learning_rate": 1.9760615942088303e-06, + "loss": 0.6576, + "step": 7857 + }, + { + "epoch": 0.802655771195097, + "grad_norm": 1.2771392906672054, + "learning_rate": 1.9740876374016218e-06, + "loss": 0.6277, + "step": 7858 + }, + { + "epoch": 0.8027579162410623, + "grad_norm": 1.5305675571129098, + "learning_rate": 1.972114559035708e-06, + "loss": 0.6761, + "step": 7859 + }, + { + "epoch": 0.8028600612870276, + "grad_norm": 1.5722089861990158, + "learning_rate": 1.970142359327044e-06, + "loss": 0.8413, + "step": 7860 + }, + { + "epoch": 0.8029622063329929, + "grad_norm": 1.4898777718418432, + "learning_rate": 1.968171038491485e-06, + "loss": 0.6995, + "step": 7861 + }, + { + "epoch": 0.8030643513789582, + "grad_norm": 1.658860581799003, + "learning_rate": 1.966200596744794e-06, + "loss": 0.7807, + "step": 7862 + }, + { + "epoch": 0.8031664964249234, + "grad_norm": 1.5518478295504126, + "learning_rate": 1.9642310343026405e-06, + "loss": 0.6348, + "step": 7863 + }, + { + "epoch": 0.8032686414708886, + "grad_norm": 1.535489079308936, + "learning_rate": 1.9622623513805894e-06, + "loss": 0.6936, + "step": 7864 + }, + { + "epoch": 0.8033707865168539, + "grad_norm": 1.3781665758697053, + "learning_rate": 1.9602945481941194e-06, + "loss": 0.6291, + "step": 7865 + }, + { + "epoch": 0.8034729315628192, + "grad_norm": 1.4477381102309401, + "learning_rate": 1.958327624958606e-06, + "loss": 0.7067, + "step": 7866 + }, + { + "epoch": 0.8035750766087845, + "grad_norm": 1.3350438482889797, + "learning_rate": 1.956361581889329e-06, + "loss": 0.654, + "step": 7867 + }, + { + "epoch": 0.8036772216547498, + "grad_norm": 1.4554463691795991, + "learning_rate": 1.9543964192014707e-06, + "loss": 0.6267, + "step": 7868 + }, + { + "epoch": 0.803779366700715, + "grad_norm": 1.3856215847927158, + "learning_rate": 1.952432137110125e-06, + "loss": 0.622, + "step": 7869 + }, + { + "epoch": 0.8038815117466803, + "grad_norm": 1.5511593394798093, + "learning_rate": 1.95046873583028e-06, + "loss": 0.748, + "step": 7870 + }, + { + "epoch": 0.8039836567926456, + "grad_norm": 1.4244741164085335, + "learning_rate": 1.9485062155768344e-06, + "loss": 0.6314, + "step": 7871 + }, + { + "epoch": 0.8040858018386108, + "grad_norm": 1.406015533786862, + "learning_rate": 1.946544576564585e-06, + "loss": 0.5779, + "step": 7872 + }, + { + "epoch": 0.8041879468845761, + "grad_norm": 1.3767782878573327, + "learning_rate": 1.9445838190082334e-06, + "loss": 0.6728, + "step": 7873 + }, + { + "epoch": 0.8042900919305414, + "grad_norm": 1.4741064952387142, + "learning_rate": 1.942623943122388e-06, + "loss": 0.673, + "step": 7874 + }, + { + "epoch": 0.8043922369765066, + "grad_norm": 1.314622080331586, + "learning_rate": 1.940664949121559e-06, + "loss": 0.6857, + "step": 7875 + }, + { + "epoch": 0.8044943820224719, + "grad_norm": 1.7046901476973353, + "learning_rate": 1.938706837220159e-06, + "loss": 0.6651, + "step": 7876 + }, + { + "epoch": 0.8045965270684372, + "grad_norm": 1.4262506013747607, + "learning_rate": 1.9367496076325033e-06, + "loss": 0.6365, + "step": 7877 + }, + { + "epoch": 0.8046986721144025, + "grad_norm": 1.4257702533989989, + "learning_rate": 1.934793260572809e-06, + "loss": 0.5918, + "step": 7878 + }, + { + "epoch": 0.8048008171603678, + "grad_norm": 1.7041732961066027, + "learning_rate": 1.932837796255205e-06, + "loss": 0.6626, + "step": 7879 + }, + { + "epoch": 0.804902962206333, + "grad_norm": 1.383811858906262, + "learning_rate": 1.9308832148937175e-06, + "loss": 0.6692, + "step": 7880 + }, + { + "epoch": 0.8050051072522982, + "grad_norm": 1.369167082064326, + "learning_rate": 1.928929516702276e-06, + "loss": 0.6775, + "step": 7881 + }, + { + "epoch": 0.8051072522982635, + "grad_norm": 1.6522479135479788, + "learning_rate": 1.926976701894713e-06, + "loss": 0.6464, + "step": 7882 + }, + { + "epoch": 0.8052093973442288, + "grad_norm": 1.533123067540322, + "learning_rate": 1.9250247706847635e-06, + "loss": 0.675, + "step": 7883 + }, + { + "epoch": 0.8053115423901941, + "grad_norm": 1.5250416963307527, + "learning_rate": 1.9230737232860718e-06, + "loss": 0.7414, + "step": 7884 + }, + { + "epoch": 0.8054136874361594, + "grad_norm": 1.5228670946620295, + "learning_rate": 1.92112355991218e-06, + "loss": 0.7282, + "step": 7885 + }, + { + "epoch": 0.8055158324821247, + "grad_norm": 1.5578649434320615, + "learning_rate": 1.9191742807765323e-06, + "loss": 0.7192, + "step": 7886 + }, + { + "epoch": 0.8056179775280898, + "grad_norm": 1.5396840309612243, + "learning_rate": 1.917225886092483e-06, + "loss": 0.745, + "step": 7887 + }, + { + "epoch": 0.8057201225740551, + "grad_norm": 1.475875451024162, + "learning_rate": 1.9152783760732785e-06, + "loss": 0.6995, + "step": 7888 + }, + { + "epoch": 0.8058222676200204, + "grad_norm": 1.5239632145627844, + "learning_rate": 1.9133317509320837e-06, + "loss": 0.7923, + "step": 7889 + }, + { + "epoch": 0.8059244126659857, + "grad_norm": 1.765683970603823, + "learning_rate": 1.9113860108819513e-06, + "loss": 0.6844, + "step": 7890 + }, + { + "epoch": 0.806026557711951, + "grad_norm": 1.4589596888832825, + "learning_rate": 1.9094411561358485e-06, + "loss": 0.7314, + "step": 7891 + }, + { + "epoch": 0.8061287027579163, + "grad_norm": 1.5959397508765965, + "learning_rate": 1.9074971869066337e-06, + "loss": 0.6925, + "step": 7892 + }, + { + "epoch": 0.8062308478038815, + "grad_norm": 1.3503961303506125, + "learning_rate": 1.9055541034070835e-06, + "loss": 0.6406, + "step": 7893 + }, + { + "epoch": 0.8063329928498468, + "grad_norm": 1.44041817746977, + "learning_rate": 1.9036119058498637e-06, + "loss": 0.6599, + "step": 7894 + }, + { + "epoch": 0.806435137895812, + "grad_norm": 1.4683504265177987, + "learning_rate": 1.9016705944475544e-06, + "loss": 0.6885, + "step": 7895 + }, + { + "epoch": 0.8065372829417773, + "grad_norm": 1.4871009334409362, + "learning_rate": 1.8997301694126303e-06, + "loss": 0.6982, + "step": 7896 + }, + { + "epoch": 0.8066394279877426, + "grad_norm": 1.3450239755714446, + "learning_rate": 1.8977906309574733e-06, + "loss": 0.6026, + "step": 7897 + }, + { + "epoch": 0.8067415730337079, + "grad_norm": 1.4546195268425013, + "learning_rate": 1.8958519792943631e-06, + "loss": 0.634, + "step": 7898 + }, + { + "epoch": 0.8068437180796731, + "grad_norm": 1.5034998113985705, + "learning_rate": 1.8939142146354927e-06, + "loss": 0.6906, + "step": 7899 + }, + { + "epoch": 0.8069458631256384, + "grad_norm": 1.4837782354910563, + "learning_rate": 1.8919773371929485e-06, + "loss": 0.6728, + "step": 7900 + }, + { + "epoch": 0.8070480081716037, + "grad_norm": 1.348778695904617, + "learning_rate": 1.8900413471787238e-06, + "loss": 0.7286, + "step": 7901 + }, + { + "epoch": 0.807150153217569, + "grad_norm": 1.4728710949084605, + "learning_rate": 1.8881062448047105e-06, + "loss": 0.6788, + "step": 7902 + }, + { + "epoch": 0.8072522982635342, + "grad_norm": 1.672686682683323, + "learning_rate": 1.8861720302827091e-06, + "loss": 0.641, + "step": 7903 + }, + { + "epoch": 0.8073544433094995, + "grad_norm": 1.5156966434767025, + "learning_rate": 1.8842387038244248e-06, + "loss": 0.7404, + "step": 7904 + }, + { + "epoch": 0.8074565883554647, + "grad_norm": 1.4814875906895417, + "learning_rate": 1.882306265641457e-06, + "loss": 0.6202, + "step": 7905 + }, + { + "epoch": 0.80755873340143, + "grad_norm": 1.545100705588449, + "learning_rate": 1.8803747159453134e-06, + "loss": 0.72, + "step": 7906 + }, + { + "epoch": 0.8076608784473953, + "grad_norm": 1.553555430444274, + "learning_rate": 1.8784440549474004e-06, + "loss": 0.6529, + "step": 7907 + }, + { + "epoch": 0.8077630234933606, + "grad_norm": 1.4648502374514578, + "learning_rate": 1.8765142828590355e-06, + "loss": 0.6535, + "step": 7908 + }, + { + "epoch": 0.8078651685393259, + "grad_norm": 1.5225389019818745, + "learning_rate": 1.874585399891431e-06, + "loss": 0.6364, + "step": 7909 + }, + { + "epoch": 0.8079673135852911, + "grad_norm": 1.3958729195569635, + "learning_rate": 1.8726574062557012e-06, + "loss": 0.6797, + "step": 7910 + }, + { + "epoch": 0.8080694586312563, + "grad_norm": 1.2868002028588237, + "learning_rate": 1.870730302162872e-06, + "loss": 0.6603, + "step": 7911 + }, + { + "epoch": 0.8081716036772216, + "grad_norm": 1.5641896140628415, + "learning_rate": 1.86880408782386e-06, + "loss": 0.7456, + "step": 7912 + }, + { + "epoch": 0.8082737487231869, + "grad_norm": 1.5537834301057654, + "learning_rate": 1.8668787634494977e-06, + "loss": 0.6884, + "step": 7913 + }, + { + "epoch": 0.8083758937691522, + "grad_norm": 1.8912484030472259, + "learning_rate": 1.8649543292505091e-06, + "loss": 0.7589, + "step": 7914 + }, + { + "epoch": 0.8084780388151175, + "grad_norm": 1.4358158549150455, + "learning_rate": 1.863030785437525e-06, + "loss": 0.7381, + "step": 7915 + }, + { + "epoch": 0.8085801838610828, + "grad_norm": 1.5004073353923482, + "learning_rate": 1.861108132221079e-06, + "loss": 0.6894, + "step": 7916 + }, + { + "epoch": 0.808682328907048, + "grad_norm": 1.5386572871918545, + "learning_rate": 1.859186369811603e-06, + "loss": 0.6716, + "step": 7917 + }, + { + "epoch": 0.8087844739530132, + "grad_norm": 1.45145770741655, + "learning_rate": 1.8572654984194395e-06, + "loss": 0.6968, + "step": 7918 + }, + { + "epoch": 0.8088866189989785, + "grad_norm": 1.5539913209580558, + "learning_rate": 1.8553455182548296e-06, + "loss": 0.6104, + "step": 7919 + }, + { + "epoch": 0.8089887640449438, + "grad_norm": 1.3410486680365905, + "learning_rate": 1.8534264295279147e-06, + "loss": 0.6389, + "step": 7920 + }, + { + "epoch": 0.8090909090909091, + "grad_norm": 1.2980081262901544, + "learning_rate": 1.851508232448741e-06, + "loss": 0.6786, + "step": 7921 + }, + { + "epoch": 0.8091930541368744, + "grad_norm": 1.5485028065506046, + "learning_rate": 1.849590927227254e-06, + "loss": 0.6797, + "step": 7922 + }, + { + "epoch": 0.8092951991828397, + "grad_norm": 1.4579903362829445, + "learning_rate": 1.8476745140733077e-06, + "loss": 0.6679, + "step": 7923 + }, + { + "epoch": 0.8093973442288049, + "grad_norm": 1.431220913720757, + "learning_rate": 1.8457589931966524e-06, + "loss": 0.6918, + "step": 7924 + }, + { + "epoch": 0.8094994892747702, + "grad_norm": 1.4342173299359275, + "learning_rate": 1.8438443648069438e-06, + "loss": 0.6399, + "step": 7925 + }, + { + "epoch": 0.8096016343207354, + "grad_norm": 1.5700348943244125, + "learning_rate": 1.8419306291137374e-06, + "loss": 0.7745, + "step": 7926 + }, + { + "epoch": 0.8097037793667007, + "grad_norm": 1.733029895278668, + "learning_rate": 1.8400177863264934e-06, + "loss": 0.7366, + "step": 7927 + }, + { + "epoch": 0.809805924412666, + "grad_norm": 1.648336832311013, + "learning_rate": 1.8381058366545778e-06, + "loss": 0.715, + "step": 7928 + }, + { + "epoch": 0.8099080694586313, + "grad_norm": 1.4661661728975928, + "learning_rate": 1.836194780307251e-06, + "loss": 0.7344, + "step": 7929 + }, + { + "epoch": 0.8100102145045965, + "grad_norm": 1.7310958176408033, + "learning_rate": 1.8342846174936812e-06, + "loss": 0.7719, + "step": 7930 + }, + { + "epoch": 0.8101123595505618, + "grad_norm": 1.5382464051656972, + "learning_rate": 1.8323753484229345e-06, + "loss": 0.7742, + "step": 7931 + }, + { + "epoch": 0.8102145045965271, + "grad_norm": 1.5591911258345108, + "learning_rate": 1.8304669733039815e-06, + "loss": 0.6613, + "step": 7932 + }, + { + "epoch": 0.8103166496424924, + "grad_norm": 1.4359448935836214, + "learning_rate": 1.8285594923456985e-06, + "loss": 0.6323, + "step": 7933 + }, + { + "epoch": 0.8104187946884576, + "grad_norm": 1.5158351227814322, + "learning_rate": 1.826652905756855e-06, + "loss": 0.6783, + "step": 7934 + }, + { + "epoch": 0.8105209397344229, + "grad_norm": 1.426404921507915, + "learning_rate": 1.8247472137461354e-06, + "loss": 0.675, + "step": 7935 + }, + { + "epoch": 0.8106230847803881, + "grad_norm": 1.462823116399566, + "learning_rate": 1.8228424165221148e-06, + "loss": 0.7208, + "step": 7936 + }, + { + "epoch": 0.8107252298263534, + "grad_norm": 1.6508443109566289, + "learning_rate": 1.8209385142932722e-06, + "loss": 0.7414, + "step": 7937 + }, + { + "epoch": 0.8108273748723187, + "grad_norm": 1.470469224493644, + "learning_rate": 1.8190355072679955e-06, + "loss": 0.618, + "step": 7938 + }, + { + "epoch": 0.810929519918284, + "grad_norm": 1.5006714561173804, + "learning_rate": 1.817133395654569e-06, + "loss": 0.7605, + "step": 7939 + }, + { + "epoch": 0.8110316649642493, + "grad_norm": 1.4202852696306167, + "learning_rate": 1.8152321796611795e-06, + "loss": 0.7237, + "step": 7940 + }, + { + "epoch": 0.8111338100102145, + "grad_norm": 1.4820735550254496, + "learning_rate": 1.8133318594959127e-06, + "loss": 0.6766, + "step": 7941 + }, + { + "epoch": 0.8112359550561797, + "grad_norm": 1.443814795037324, + "learning_rate": 1.8114324353667633e-06, + "loss": 0.6379, + "step": 7942 + }, + { + "epoch": 0.811338100102145, + "grad_norm": 1.5418020654124236, + "learning_rate": 1.8095339074816264e-06, + "loss": 0.6774, + "step": 7943 + }, + { + "epoch": 0.8114402451481103, + "grad_norm": 1.5022210483368588, + "learning_rate": 1.8076362760482956e-06, + "loss": 0.6925, + "step": 7944 + }, + { + "epoch": 0.8115423901940756, + "grad_norm": 1.34420492719122, + "learning_rate": 1.8057395412744672e-06, + "loss": 0.7079, + "step": 7945 + }, + { + "epoch": 0.8116445352400409, + "grad_norm": 1.484745545525041, + "learning_rate": 1.8038437033677381e-06, + "loss": 0.6916, + "step": 7946 + }, + { + "epoch": 0.8117466802860062, + "grad_norm": 1.524257887649823, + "learning_rate": 1.8019487625356125e-06, + "loss": 0.7285, + "step": 7947 + }, + { + "epoch": 0.8118488253319714, + "grad_norm": 1.4591004516769996, + "learning_rate": 1.8000547189854921e-06, + "loss": 0.6556, + "step": 7948 + }, + { + "epoch": 0.8119509703779366, + "grad_norm": 1.4466771995277452, + "learning_rate": 1.7981615729246804e-06, + "loss": 0.7212, + "step": 7949 + }, + { + "epoch": 0.8120531154239019, + "grad_norm": 1.5152753876360099, + "learning_rate": 1.7962693245603813e-06, + "loss": 0.6432, + "step": 7950 + }, + { + "epoch": 0.8121552604698672, + "grad_norm": 1.4223410322362229, + "learning_rate": 1.7943779740997081e-06, + "loss": 0.6375, + "step": 7951 + }, + { + "epoch": 0.8122574055158325, + "grad_norm": 1.4052379961453423, + "learning_rate": 1.7924875217496628e-06, + "loss": 0.6403, + "step": 7952 + }, + { + "epoch": 0.8123595505617978, + "grad_norm": 1.4867767262620033, + "learning_rate": 1.7905979677171648e-06, + "loss": 0.65, + "step": 7953 + }, + { + "epoch": 0.812461695607763, + "grad_norm": 1.4383118836889621, + "learning_rate": 1.7887093122090238e-06, + "loss": 0.6201, + "step": 7954 + }, + { + "epoch": 0.8125638406537283, + "grad_norm": 1.3333697799039885, + "learning_rate": 1.7868215554319524e-06, + "loss": 0.678, + "step": 7955 + }, + { + "epoch": 0.8126659856996936, + "grad_norm": 1.4250342459219705, + "learning_rate": 1.7849346975925663e-06, + "loss": 0.649, + "step": 7956 + }, + { + "epoch": 0.8127681307456588, + "grad_norm": 1.6062665012308408, + "learning_rate": 1.7830487388973873e-06, + "loss": 0.7567, + "step": 7957 + }, + { + "epoch": 0.8128702757916241, + "grad_norm": 1.4070349300697151, + "learning_rate": 1.781163679552831e-06, + "loss": 0.6264, + "step": 7958 + }, + { + "epoch": 0.8129724208375894, + "grad_norm": 1.5890624660871187, + "learning_rate": 1.7792795197652212e-06, + "loss": 0.753, + "step": 7959 + }, + { + "epoch": 0.8130745658835546, + "grad_norm": 1.4959304316405335, + "learning_rate": 1.77739625974078e-06, + "loss": 0.7473, + "step": 7960 + }, + { + "epoch": 0.8131767109295199, + "grad_norm": 1.426543241922162, + "learning_rate": 1.775513899685628e-06, + "loss": 0.6915, + "step": 7961 + }, + { + "epoch": 0.8132788559754852, + "grad_norm": 1.5997129597596234, + "learning_rate": 1.7736324398057959e-06, + "loss": 0.7321, + "step": 7962 + }, + { + "epoch": 0.8133810010214505, + "grad_norm": 1.368155297263089, + "learning_rate": 1.7717518803072087e-06, + "loss": 0.6691, + "step": 7963 + }, + { + "epoch": 0.8134831460674158, + "grad_norm": 1.4458812247504478, + "learning_rate": 1.7698722213956943e-06, + "loss": 0.6933, + "step": 7964 + }, + { + "epoch": 0.813585291113381, + "grad_norm": 1.4594710639090436, + "learning_rate": 1.7679934632769812e-06, + "loss": 0.7478, + "step": 7965 + }, + { + "epoch": 0.8136874361593462, + "grad_norm": 1.5303612311546528, + "learning_rate": 1.7661156061566985e-06, + "loss": 0.7417, + "step": 7966 + }, + { + "epoch": 0.8137895812053115, + "grad_norm": 1.4609730584893192, + "learning_rate": 1.7642386502403875e-06, + "loss": 0.7137, + "step": 7967 + }, + { + "epoch": 0.8138917262512768, + "grad_norm": 1.6602929442118024, + "learning_rate": 1.7623625957334767e-06, + "loss": 0.7384, + "step": 7968 + }, + { + "epoch": 0.8139938712972421, + "grad_norm": 1.5775511671755067, + "learning_rate": 1.7604874428413022e-06, + "loss": 0.7189, + "step": 7969 + }, + { + "epoch": 0.8140960163432074, + "grad_norm": 1.6028287070045681, + "learning_rate": 1.7586131917690996e-06, + "loss": 0.7424, + "step": 7970 + }, + { + "epoch": 0.8141981613891727, + "grad_norm": 1.4257714808375646, + "learning_rate": 1.7567398427220062e-06, + "loss": 0.7238, + "step": 7971 + }, + { + "epoch": 0.8143003064351378, + "grad_norm": 1.428431088335843, + "learning_rate": 1.7548673959050654e-06, + "loss": 0.6986, + "step": 7972 + }, + { + "epoch": 0.8144024514811031, + "grad_norm": 1.412206966343264, + "learning_rate": 1.7529958515232149e-06, + "loss": 0.6455, + "step": 7973 + }, + { + "epoch": 0.8145045965270684, + "grad_norm": 1.3634755109936474, + "learning_rate": 1.7511252097812948e-06, + "loss": 0.6548, + "step": 7974 + }, + { + "epoch": 0.8146067415730337, + "grad_norm": 1.6398673407439053, + "learning_rate": 1.7492554708840514e-06, + "loss": 0.7374, + "step": 7975 + }, + { + "epoch": 0.814708886618999, + "grad_norm": 1.4343336537508875, + "learning_rate": 1.7473866350361256e-06, + "loss": 0.6379, + "step": 7976 + }, + { + "epoch": 0.8148110316649643, + "grad_norm": 1.3744558549715546, + "learning_rate": 1.7455187024420662e-06, + "loss": 0.673, + "step": 7977 + }, + { + "epoch": 0.8149131767109296, + "grad_norm": 1.5028474784407904, + "learning_rate": 1.743651673306318e-06, + "loss": 0.7562, + "step": 7978 + }, + { + "epoch": 0.8150153217568948, + "grad_norm": 1.4273108881268894, + "learning_rate": 1.741785547833229e-06, + "loss": 0.7721, + "step": 7979 + }, + { + "epoch": 0.81511746680286, + "grad_norm": 1.4828473332250276, + "learning_rate": 1.7399203262270447e-06, + "loss": 0.6029, + "step": 7980 + }, + { + "epoch": 0.8152196118488253, + "grad_norm": 1.586794582438247, + "learning_rate": 1.7380560086919196e-06, + "loss": 0.7166, + "step": 7981 + }, + { + "epoch": 0.8153217568947906, + "grad_norm": 1.3648965712591843, + "learning_rate": 1.7361925954319003e-06, + "loss": 0.618, + "step": 7982 + }, + { + "epoch": 0.8154239019407559, + "grad_norm": 1.5343647647186425, + "learning_rate": 1.7343300866509426e-06, + "loss": 0.6, + "step": 7983 + }, + { + "epoch": 0.8155260469867212, + "grad_norm": 1.4454439852683993, + "learning_rate": 1.7324684825528982e-06, + "loss": 0.6846, + "step": 7984 + }, + { + "epoch": 0.8156281920326864, + "grad_norm": 1.589671921118025, + "learning_rate": 1.730607783341518e-06, + "loss": 0.7539, + "step": 7985 + }, + { + "epoch": 0.8157303370786517, + "grad_norm": 1.3656294592986191, + "learning_rate": 1.7287479892204572e-06, + "loss": 0.5753, + "step": 7986 + }, + { + "epoch": 0.815832482124617, + "grad_norm": 1.4227661619110714, + "learning_rate": 1.7268891003932753e-06, + "loss": 0.6567, + "step": 7987 + }, + { + "epoch": 0.8159346271705822, + "grad_norm": 1.8317641131320619, + "learning_rate": 1.725031117063427e-06, + "loss": 0.8709, + "step": 7988 + }, + { + "epoch": 0.8160367722165475, + "grad_norm": 1.5219043725969479, + "learning_rate": 1.723174039434269e-06, + "loss": 0.8022, + "step": 7989 + }, + { + "epoch": 0.8161389172625128, + "grad_norm": 1.5051377183444266, + "learning_rate": 1.721317867709057e-06, + "loss": 0.6678, + "step": 7990 + }, + { + "epoch": 0.816241062308478, + "grad_norm": 1.591724202947102, + "learning_rate": 1.7194626020909532e-06, + "loss": 0.694, + "step": 7991 + }, + { + "epoch": 0.8163432073544433, + "grad_norm": 1.369392228141755, + "learning_rate": 1.7176082427830198e-06, + "loss": 0.7399, + "step": 7992 + }, + { + "epoch": 0.8164453524004086, + "grad_norm": 1.4050256967612185, + "learning_rate": 1.7157547899882155e-06, + "loss": 0.586, + "step": 7993 + }, + { + "epoch": 0.8165474974463739, + "grad_norm": 1.5161216006757114, + "learning_rate": 1.713902243909402e-06, + "loss": 0.6881, + "step": 7994 + }, + { + "epoch": 0.8166496424923391, + "grad_norm": 1.568385117457169, + "learning_rate": 1.712050604749339e-06, + "loss": 0.69, + "step": 7995 + }, + { + "epoch": 0.8167517875383044, + "grad_norm": 1.583993008667526, + "learning_rate": 1.7101998727106938e-06, + "loss": 0.6624, + "step": 7996 + }, + { + "epoch": 0.8168539325842696, + "grad_norm": 1.4016847403491144, + "learning_rate": 1.7083500479960292e-06, + "loss": 0.658, + "step": 7997 + }, + { + "epoch": 0.8169560776302349, + "grad_norm": 1.5733702102729963, + "learning_rate": 1.7065011308078062e-06, + "loss": 0.6432, + "step": 7998 + }, + { + "epoch": 0.8170582226762002, + "grad_norm": 1.5326362048040314, + "learning_rate": 1.7046531213483953e-06, + "loss": 0.7044, + "step": 7999 + }, + { + "epoch": 0.8171603677221655, + "grad_norm": 1.3846577361600956, + "learning_rate": 1.7028060198200568e-06, + "loss": 0.6446, + "step": 8000 + }, + { + "epoch": 0.8172625127681308, + "grad_norm": 1.5068825977677331, + "learning_rate": 1.7009598264249626e-06, + "loss": 0.7935, + "step": 8001 + }, + { + "epoch": 0.8173646578140961, + "grad_norm": 1.3413378608236013, + "learning_rate": 1.6991145413651778e-06, + "loss": 0.6713, + "step": 8002 + }, + { + "epoch": 0.8174668028600612, + "grad_norm": 1.4468841209413241, + "learning_rate": 1.6972701648426693e-06, + "loss": 0.6289, + "step": 8003 + }, + { + "epoch": 0.8175689479060265, + "grad_norm": 1.4014668987421202, + "learning_rate": 1.695426697059306e-06, + "loss": 0.7557, + "step": 8004 + }, + { + "epoch": 0.8176710929519918, + "grad_norm": 1.5265277883019224, + "learning_rate": 1.6935841382168527e-06, + "loss": 0.7633, + "step": 8005 + }, + { + "epoch": 0.8177732379979571, + "grad_norm": 1.4407500763661103, + "learning_rate": 1.6917424885169832e-06, + "loss": 0.6888, + "step": 8006 + }, + { + "epoch": 0.8178753830439224, + "grad_norm": 1.3939769220695781, + "learning_rate": 1.6899017481612678e-06, + "loss": 0.7354, + "step": 8007 + }, + { + "epoch": 0.8179775280898877, + "grad_norm": 1.6343383738154178, + "learning_rate": 1.6880619173511748e-06, + "loss": 0.8354, + "step": 8008 + }, + { + "epoch": 0.818079673135853, + "grad_norm": 1.5620359390454892, + "learning_rate": 1.6862229962880762e-06, + "loss": 0.6092, + "step": 8009 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 1.587466748414701, + "learning_rate": 1.6843849851732385e-06, + "loss": 0.7723, + "step": 8010 + }, + { + "epoch": 0.8182839632277834, + "grad_norm": 1.381452032875976, + "learning_rate": 1.682547884207839e-06, + "loss": 0.6425, + "step": 8011 + }, + { + "epoch": 0.8183861082737487, + "grad_norm": 1.700186596589639, + "learning_rate": 1.6807116935929491e-06, + "loss": 0.7651, + "step": 8012 + }, + { + "epoch": 0.818488253319714, + "grad_norm": 1.5748054651105816, + "learning_rate": 1.6788764135295377e-06, + "loss": 0.6691, + "step": 8013 + }, + { + "epoch": 0.8185903983656793, + "grad_norm": 1.5176831660640564, + "learning_rate": 1.6770420442184765e-06, + "loss": 0.6788, + "step": 8014 + }, + { + "epoch": 0.8186925434116445, + "grad_norm": 1.54067703369349, + "learning_rate": 1.6752085858605416e-06, + "loss": 0.6585, + "step": 8015 + }, + { + "epoch": 0.8187946884576098, + "grad_norm": 1.3861119891961795, + "learning_rate": 1.673376038656408e-06, + "loss": 0.589, + "step": 8016 + }, + { + "epoch": 0.8188968335035751, + "grad_norm": 1.2826141347021762, + "learning_rate": 1.6715444028066463e-06, + "loss": 0.6013, + "step": 8017 + }, + { + "epoch": 0.8189989785495404, + "grad_norm": 1.6498472966622095, + "learning_rate": 1.6697136785117307e-06, + "loss": 0.7429, + "step": 8018 + }, + { + "epoch": 0.8191011235955056, + "grad_norm": 1.4555512125186842, + "learning_rate": 1.6678838659720353e-06, + "loss": 0.6784, + "step": 8019 + }, + { + "epoch": 0.8192032686414709, + "grad_norm": 1.550441510101218, + "learning_rate": 1.666054965387831e-06, + "loss": 0.7041, + "step": 8020 + }, + { + "epoch": 0.8193054136874361, + "grad_norm": 1.4590390443304897, + "learning_rate": 1.6642269769592978e-06, + "loss": 0.6652, + "step": 8021 + }, + { + "epoch": 0.8194075587334014, + "grad_norm": 1.805244553495291, + "learning_rate": 1.6623999008865055e-06, + "loss": 0.7808, + "step": 8022 + }, + { + "epoch": 0.8195097037793667, + "grad_norm": 1.4407135472508246, + "learning_rate": 1.6605737373694332e-06, + "loss": 0.7324, + "step": 8023 + }, + { + "epoch": 0.819611848825332, + "grad_norm": 1.6032452874921865, + "learning_rate": 1.6587484866079528e-06, + "loss": 0.7143, + "step": 8024 + }, + { + "epoch": 0.8197139938712973, + "grad_norm": 1.4601865129573774, + "learning_rate": 1.6569241488018373e-06, + "loss": 0.6749, + "step": 8025 + }, + { + "epoch": 0.8198161389172625, + "grad_norm": 1.4190417545020593, + "learning_rate": 1.6551007241507667e-06, + "loss": 0.7113, + "step": 8026 + }, + { + "epoch": 0.8199182839632277, + "grad_norm": 1.5552960256500907, + "learning_rate": 1.6532782128543135e-06, + "loss": 0.5919, + "step": 8027 + }, + { + "epoch": 0.820020429009193, + "grad_norm": 1.4225992003830799, + "learning_rate": 1.6514566151119526e-06, + "loss": 0.7428, + "step": 8028 + }, + { + "epoch": 0.8201225740551583, + "grad_norm": 1.4629221048171699, + "learning_rate": 1.649635931123056e-06, + "loss": 0.7327, + "step": 8029 + }, + { + "epoch": 0.8202247191011236, + "grad_norm": 1.6162658350818524, + "learning_rate": 1.6478161610869026e-06, + "loss": 0.7273, + "step": 8030 + }, + { + "epoch": 0.8203268641470889, + "grad_norm": 1.6195539289532868, + "learning_rate": 1.645997305202668e-06, + "loss": 0.7651, + "step": 8031 + }, + { + "epoch": 0.8204290091930542, + "grad_norm": 1.5338828765134138, + "learning_rate": 1.6441793636694259e-06, + "loss": 0.605, + "step": 8032 + }, + { + "epoch": 0.8205311542390195, + "grad_norm": 1.4530702857021864, + "learning_rate": 1.6423623366861519e-06, + "loss": 0.6014, + "step": 8033 + }, + { + "epoch": 0.8206332992849846, + "grad_norm": 1.677579327574725, + "learning_rate": 1.6405462244517167e-06, + "loss": 0.6106, + "step": 8034 + }, + { + "epoch": 0.8207354443309499, + "grad_norm": 1.5250263496829697, + "learning_rate": 1.6387310271649005e-06, + "loss": 0.6463, + "step": 8035 + }, + { + "epoch": 0.8208375893769152, + "grad_norm": 1.6666564207397836, + "learning_rate": 1.6369167450243761e-06, + "loss": 0.8187, + "step": 8036 + }, + { + "epoch": 0.8209397344228805, + "grad_norm": 1.533884955634993, + "learning_rate": 1.6351033782287163e-06, + "loss": 0.71, + "step": 8037 + }, + { + "epoch": 0.8210418794688458, + "grad_norm": 1.536911361800758, + "learning_rate": 1.6332909269763952e-06, + "loss": 0.7066, + "step": 8038 + }, + { + "epoch": 0.821144024514811, + "grad_norm": 1.3162474454341764, + "learning_rate": 1.631479391465789e-06, + "loss": 0.65, + "step": 8039 + }, + { + "epoch": 0.8212461695607763, + "grad_norm": 1.4383768944053172, + "learning_rate": 1.6296687718951677e-06, + "loss": 0.7751, + "step": 8040 + }, + { + "epoch": 0.8213483146067416, + "grad_norm": 1.4374625784179178, + "learning_rate": 1.6278590684627094e-06, + "loss": 0.5932, + "step": 8041 + }, + { + "epoch": 0.8214504596527068, + "grad_norm": 1.4662672539692108, + "learning_rate": 1.626050281366486e-06, + "loss": 0.6711, + "step": 8042 + }, + { + "epoch": 0.8215526046986721, + "grad_norm": 1.4336486510731772, + "learning_rate": 1.6242424108044697e-06, + "loss": 0.6622, + "step": 8043 + }, + { + "epoch": 0.8216547497446374, + "grad_norm": 1.723983258685338, + "learning_rate": 1.6224354569745294e-06, + "loss": 0.752, + "step": 8044 + }, + { + "epoch": 0.8217568947906027, + "grad_norm": 1.4421408122236568, + "learning_rate": 1.6206294200744444e-06, + "loss": 0.7447, + "step": 8045 + }, + { + "epoch": 0.8218590398365679, + "grad_norm": 1.3984127387809175, + "learning_rate": 1.61882430030188e-06, + "loss": 0.5685, + "step": 8046 + }, + { + "epoch": 0.8219611848825332, + "grad_norm": 1.475654081888823, + "learning_rate": 1.6170200978544137e-06, + "loss": 0.6479, + "step": 8047 + }, + { + "epoch": 0.8220633299284985, + "grad_norm": 1.4680537357339023, + "learning_rate": 1.6152168129295143e-06, + "loss": 0.8026, + "step": 8048 + }, + { + "epoch": 0.8221654749744637, + "grad_norm": 1.518214472174791, + "learning_rate": 1.6134144457245493e-06, + "loss": 0.6477, + "step": 8049 + }, + { + "epoch": 0.822267620020429, + "grad_norm": 1.4166217488112398, + "learning_rate": 1.6116129964367932e-06, + "loss": 0.6504, + "step": 8050 + }, + { + "epoch": 0.8223697650663943, + "grad_norm": 1.5368412228323969, + "learning_rate": 1.609812465263415e-06, + "loss": 0.6949, + "step": 8051 + }, + { + "epoch": 0.8224719101123595, + "grad_norm": 1.4154965142983382, + "learning_rate": 1.6080128524014825e-06, + "loss": 0.62, + "step": 8052 + }, + { + "epoch": 0.8225740551583248, + "grad_norm": 1.4214435690364278, + "learning_rate": 1.6062141580479663e-06, + "loss": 0.7587, + "step": 8053 + }, + { + "epoch": 0.8226762002042901, + "grad_norm": 1.443646930356721, + "learning_rate": 1.6044163823997284e-06, + "loss": 0.6913, + "step": 8054 + }, + { + "epoch": 0.8227783452502554, + "grad_norm": 1.5119383988814488, + "learning_rate": 1.602619525653546e-06, + "loss": 0.7308, + "step": 8055 + }, + { + "epoch": 0.8228804902962207, + "grad_norm": 1.400605649521305, + "learning_rate": 1.600823588006082e-06, + "loss": 0.7269, + "step": 8056 + }, + { + "epoch": 0.8229826353421859, + "grad_norm": 1.3815686815134187, + "learning_rate": 1.5990285696539031e-06, + "loss": 0.6954, + "step": 8057 + }, + { + "epoch": 0.8230847803881511, + "grad_norm": 1.5197407934105915, + "learning_rate": 1.597234470793475e-06, + "loss": 0.9029, + "step": 8058 + }, + { + "epoch": 0.8231869254341164, + "grad_norm": 1.417217389050144, + "learning_rate": 1.5954412916211604e-06, + "loss": 0.6137, + "step": 8059 + }, + { + "epoch": 0.8232890704800817, + "grad_norm": 1.5327541937041673, + "learning_rate": 1.5936490323332288e-06, + "loss": 0.7317, + "step": 8060 + }, + { + "epoch": 0.823391215526047, + "grad_norm": 1.56224927334137, + "learning_rate": 1.5918576931258423e-06, + "loss": 0.8155, + "step": 8061 + }, + { + "epoch": 0.8234933605720123, + "grad_norm": 1.3413555421539292, + "learning_rate": 1.5900672741950606e-06, + "loss": 0.6917, + "step": 8062 + }, + { + "epoch": 0.8235955056179776, + "grad_norm": 1.6104158371848574, + "learning_rate": 1.5882777757368529e-06, + "loss": 0.711, + "step": 8063 + }, + { + "epoch": 0.8236976506639428, + "grad_norm": 1.5099048543031726, + "learning_rate": 1.5864891979470731e-06, + "loss": 0.6882, + "step": 8064 + }, + { + "epoch": 0.823799795709908, + "grad_norm": 1.4412289091663713, + "learning_rate": 1.5847015410214905e-06, + "loss": 0.5785, + "step": 8065 + }, + { + "epoch": 0.8239019407558733, + "grad_norm": 1.4527119126630987, + "learning_rate": 1.5829148051557598e-06, + "loss": 0.6907, + "step": 8066 + }, + { + "epoch": 0.8240040858018386, + "grad_norm": 1.4480130380562979, + "learning_rate": 1.581128990545443e-06, + "loss": 0.6955, + "step": 8067 + }, + { + "epoch": 0.8241062308478039, + "grad_norm": 1.5497488509131556, + "learning_rate": 1.5793440973859942e-06, + "loss": 0.7859, + "step": 8068 + }, + { + "epoch": 0.8242083758937692, + "grad_norm": 1.523893576356234, + "learning_rate": 1.5775601258727768e-06, + "loss": 0.7549, + "step": 8069 + }, + { + "epoch": 0.8243105209397344, + "grad_norm": 1.5303109961997998, + "learning_rate": 1.5757770762010438e-06, + "loss": 0.6069, + "step": 8070 + }, + { + "epoch": 0.8244126659856997, + "grad_norm": 1.5061910204237077, + "learning_rate": 1.5739949485659533e-06, + "loss": 0.6478, + "step": 8071 + }, + { + "epoch": 0.824514811031665, + "grad_norm": 1.4162350538099635, + "learning_rate": 1.5722137431625606e-06, + "loss": 0.6661, + "step": 8072 + }, + { + "epoch": 0.8246169560776302, + "grad_norm": 1.505348775066783, + "learning_rate": 1.5704334601858196e-06, + "loss": 0.6522, + "step": 8073 + }, + { + "epoch": 0.8247191011235955, + "grad_norm": 1.6089387026742075, + "learning_rate": 1.5686540998305789e-06, + "loss": 0.7021, + "step": 8074 + }, + { + "epoch": 0.8248212461695608, + "grad_norm": 1.5175669889076127, + "learning_rate": 1.566875662291597e-06, + "loss": 0.6856, + "step": 8075 + }, + { + "epoch": 0.824923391215526, + "grad_norm": 1.5352235054873198, + "learning_rate": 1.5650981477635219e-06, + "loss": 0.6367, + "step": 8076 + }, + { + "epoch": 0.8250255362614913, + "grad_norm": 1.37583573139196, + "learning_rate": 1.5633215564409054e-06, + "loss": 0.571, + "step": 8077 + }, + { + "epoch": 0.8251276813074566, + "grad_norm": 1.6331157663418598, + "learning_rate": 1.5615458885181923e-06, + "loss": 0.6756, + "step": 8078 + }, + { + "epoch": 0.8252298263534219, + "grad_norm": 1.5080650415302426, + "learning_rate": 1.5597711441897335e-06, + "loss": 0.6381, + "step": 8079 + }, + { + "epoch": 0.8253319713993871, + "grad_norm": 1.6040951449330934, + "learning_rate": 1.5579973236497781e-06, + "loss": 0.7291, + "step": 8080 + }, + { + "epoch": 0.8254341164453524, + "grad_norm": 1.4701275500989344, + "learning_rate": 1.5562244270924708e-06, + "loss": 0.6621, + "step": 8081 + }, + { + "epoch": 0.8255362614913176, + "grad_norm": 1.5604656056658814, + "learning_rate": 1.5544524547118545e-06, + "loss": 0.649, + "step": 8082 + }, + { + "epoch": 0.8256384065372829, + "grad_norm": 1.4907088569773226, + "learning_rate": 1.5526814067018715e-06, + "loss": 0.6688, + "step": 8083 + }, + { + "epoch": 0.8257405515832482, + "grad_norm": 1.501757899836109, + "learning_rate": 1.550911283256369e-06, + "loss": 0.7252, + "step": 8084 + }, + { + "epoch": 0.8258426966292135, + "grad_norm": 1.6728978496297755, + "learning_rate": 1.549142084569084e-06, + "loss": 0.7446, + "step": 8085 + }, + { + "epoch": 0.8259448416751788, + "grad_norm": 1.4452981367333813, + "learning_rate": 1.5473738108336567e-06, + "loss": 0.7121, + "step": 8086 + }, + { + "epoch": 0.8260469867211441, + "grad_norm": 1.429457661254563, + "learning_rate": 1.5456064622436296e-06, + "loss": 0.6063, + "step": 8087 + }, + { + "epoch": 0.8261491317671092, + "grad_norm": 1.4512304880192102, + "learning_rate": 1.5438400389924335e-06, + "loss": 0.6447, + "step": 8088 + }, + { + "epoch": 0.8262512768130745, + "grad_norm": 1.558338100389552, + "learning_rate": 1.5420745412734128e-06, + "loss": 0.7563, + "step": 8089 + }, + { + "epoch": 0.8263534218590398, + "grad_norm": 1.5160083871726717, + "learning_rate": 1.5403099692797963e-06, + "loss": 0.7114, + "step": 8090 + }, + { + "epoch": 0.8264555669050051, + "grad_norm": 1.492123892181121, + "learning_rate": 1.5385463232047204e-06, + "loss": 0.7375, + "step": 8091 + }, + { + "epoch": 0.8265577119509704, + "grad_norm": 1.6557691409532018, + "learning_rate": 1.5367836032412153e-06, + "loss": 0.6145, + "step": 8092 + }, + { + "epoch": 0.8266598569969357, + "grad_norm": 1.5878017494240333, + "learning_rate": 1.5350218095822112e-06, + "loss": 0.695, + "step": 8093 + }, + { + "epoch": 0.826762002042901, + "grad_norm": 1.387193819094124, + "learning_rate": 1.5332609424205391e-06, + "loss": 0.5701, + "step": 8094 + }, + { + "epoch": 0.8268641470888662, + "grad_norm": 1.6277375090634056, + "learning_rate": 1.5315010019489295e-06, + "loss": 0.6686, + "step": 8095 + }, + { + "epoch": 0.8269662921348314, + "grad_norm": 1.6622526157173243, + "learning_rate": 1.5297419883600062e-06, + "loss": 0.7579, + "step": 8096 + }, + { + "epoch": 0.8270684371807967, + "grad_norm": 1.6550526240861725, + "learning_rate": 1.527983901846295e-06, + "loss": 0.8124, + "step": 8097 + }, + { + "epoch": 0.827170582226762, + "grad_norm": 1.4494175785068921, + "learning_rate": 1.5262267426002164e-06, + "loss": 0.6293, + "step": 8098 + }, + { + "epoch": 0.8272727272727273, + "grad_norm": 1.6822110823372496, + "learning_rate": 1.524470510814099e-06, + "loss": 0.6645, + "step": 8099 + }, + { + "epoch": 0.8273748723186926, + "grad_norm": 1.4719520368114836, + "learning_rate": 1.5227152066801598e-06, + "loss": 0.6681, + "step": 8100 + }, + { + "epoch": 0.8274770173646578, + "grad_norm": 1.5635445254915965, + "learning_rate": 1.5209608303905177e-06, + "loss": 0.6939, + "step": 8101 + }, + { + "epoch": 0.8275791624106231, + "grad_norm": 1.4874623860123868, + "learning_rate": 1.519207382137189e-06, + "loss": 0.6963, + "step": 8102 + }, + { + "epoch": 0.8276813074565884, + "grad_norm": 1.495710731359258, + "learning_rate": 1.517454862112091e-06, + "loss": 0.7601, + "step": 8103 + }, + { + "epoch": 0.8277834525025536, + "grad_norm": 1.5570642158106203, + "learning_rate": 1.5157032705070417e-06, + "loss": 0.6905, + "step": 8104 + }, + { + "epoch": 0.8278855975485189, + "grad_norm": 1.4503127087048633, + "learning_rate": 1.5139526075137513e-06, + "loss": 0.6903, + "step": 8105 + }, + { + "epoch": 0.8279877425944842, + "grad_norm": 1.4071783301332017, + "learning_rate": 1.5122028733238303e-06, + "loss": 0.6499, + "step": 8106 + }, + { + "epoch": 0.8280898876404494, + "grad_norm": 1.4821741294444453, + "learning_rate": 1.5104540681287882e-06, + "loss": 0.6229, + "step": 8107 + }, + { + "epoch": 0.8281920326864147, + "grad_norm": 1.6628379335468808, + "learning_rate": 1.5087061921200308e-06, + "loss": 0.7799, + "step": 8108 + }, + { + "epoch": 0.82829417773238, + "grad_norm": 1.423496250272961, + "learning_rate": 1.5069592454888704e-06, + "loss": 0.6158, + "step": 8109 + }, + { + "epoch": 0.8283963227783453, + "grad_norm": 1.4417200216897927, + "learning_rate": 1.505213228426504e-06, + "loss": 0.7513, + "step": 8110 + }, + { + "epoch": 0.8284984678243105, + "grad_norm": 1.3463908866510086, + "learning_rate": 1.5034681411240414e-06, + "loss": 0.7002, + "step": 8111 + }, + { + "epoch": 0.8286006128702758, + "grad_norm": 1.5401210939308432, + "learning_rate": 1.5017239837724795e-06, + "loss": 0.644, + "step": 8112 + }, + { + "epoch": 0.828702757916241, + "grad_norm": 1.6268378794525058, + "learning_rate": 1.4999807565627167e-06, + "loss": 0.7819, + "step": 8113 + }, + { + "epoch": 0.8288049029622063, + "grad_norm": 1.5591847129074905, + "learning_rate": 1.4982384596855537e-06, + "loss": 0.6801, + "step": 8114 + }, + { + "epoch": 0.8289070480081716, + "grad_norm": 1.6447593048461076, + "learning_rate": 1.4964970933316836e-06, + "loss": 0.6697, + "step": 8115 + }, + { + "epoch": 0.8290091930541369, + "grad_norm": 1.5248259117396603, + "learning_rate": 1.4947566576917016e-06, + "loss": 0.6449, + "step": 8116 + }, + { + "epoch": 0.8291113381001022, + "grad_norm": 1.560802603325432, + "learning_rate": 1.4930171529560955e-06, + "loss": 0.6817, + "step": 8117 + }, + { + "epoch": 0.8292134831460675, + "grad_norm": 1.3704862435692438, + "learning_rate": 1.4912785793152584e-06, + "loss": 0.7968, + "step": 8118 + }, + { + "epoch": 0.8293156281920326, + "grad_norm": 1.5810451100859217, + "learning_rate": 1.4895409369594805e-06, + "loss": 0.6009, + "step": 8119 + }, + { + "epoch": 0.8294177732379979, + "grad_norm": 1.5572763229827489, + "learning_rate": 1.487804226078946e-06, + "loss": 0.7799, + "step": 8120 + }, + { + "epoch": 0.8295199182839632, + "grad_norm": 1.5738311685225543, + "learning_rate": 1.4860684468637376e-06, + "loss": 0.7179, + "step": 8121 + }, + { + "epoch": 0.8296220633299285, + "grad_norm": 1.542633263928474, + "learning_rate": 1.4843335995038365e-06, + "loss": 0.6726, + "step": 8122 + }, + { + "epoch": 0.8297242083758938, + "grad_norm": 1.5234966994007288, + "learning_rate": 1.4825996841891265e-06, + "loss": 0.6665, + "step": 8123 + }, + { + "epoch": 0.8298263534218591, + "grad_norm": 1.4979572308512723, + "learning_rate": 1.4808667011093847e-06, + "loss": 0.6412, + "step": 8124 + }, + { + "epoch": 0.8299284984678243, + "grad_norm": 1.5224455304666085, + "learning_rate": 1.479134650454287e-06, + "loss": 0.6978, + "step": 8125 + }, + { + "epoch": 0.8300306435137896, + "grad_norm": 1.2996654420346645, + "learning_rate": 1.4774035324134039e-06, + "loss": 0.6551, + "step": 8126 + }, + { + "epoch": 0.8301327885597548, + "grad_norm": 1.7430308606116491, + "learning_rate": 1.475673347176213e-06, + "loss": 0.7088, + "step": 8127 + }, + { + "epoch": 0.8302349336057201, + "grad_norm": 1.4430777783549522, + "learning_rate": 1.4739440949320793e-06, + "loss": 0.6459, + "step": 8128 + }, + { + "epoch": 0.8303370786516854, + "grad_norm": 1.5432165007359933, + "learning_rate": 1.472215775870275e-06, + "loss": 0.769, + "step": 8129 + }, + { + "epoch": 0.8304392236976507, + "grad_norm": 1.6864037795707907, + "learning_rate": 1.4704883901799638e-06, + "loss": 0.7402, + "step": 8130 + }, + { + "epoch": 0.830541368743616, + "grad_norm": 1.3665145567453079, + "learning_rate": 1.4687619380502094e-06, + "loss": 0.5465, + "step": 8131 + }, + { + "epoch": 0.8306435137895812, + "grad_norm": 1.5685708375511025, + "learning_rate": 1.4670364196699704e-06, + "loss": 0.6216, + "step": 8132 + }, + { + "epoch": 0.8307456588355465, + "grad_norm": 1.5225160586101547, + "learning_rate": 1.4653118352281104e-06, + "loss": 0.6763, + "step": 8133 + }, + { + "epoch": 0.8308478038815117, + "grad_norm": 1.5847370921095767, + "learning_rate": 1.4635881849133827e-06, + "loss": 0.842, + "step": 8134 + }, + { + "epoch": 0.830949948927477, + "grad_norm": 1.4305351955090095, + "learning_rate": 1.461865468914444e-06, + "loss": 0.5666, + "step": 8135 + }, + { + "epoch": 0.8310520939734423, + "grad_norm": 1.5064405743143834, + "learning_rate": 1.460143687419847e-06, + "loss": 0.7135, + "step": 8136 + }, + { + "epoch": 0.8311542390194075, + "grad_norm": 1.4600573918276587, + "learning_rate": 1.458422840618039e-06, + "loss": 0.5929, + "step": 8137 + }, + { + "epoch": 0.8312563840653728, + "grad_norm": 1.5316332444644378, + "learning_rate": 1.4567029286973711e-06, + "loss": 0.6954, + "step": 8138 + }, + { + "epoch": 0.8313585291113381, + "grad_norm": 1.6322975956372432, + "learning_rate": 1.4549839518460885e-06, + "loss": 0.6733, + "step": 8139 + }, + { + "epoch": 0.8314606741573034, + "grad_norm": 1.7032293639981881, + "learning_rate": 1.4532659102523317e-06, + "loss": 0.7648, + "step": 8140 + }, + { + "epoch": 0.8315628192032687, + "grad_norm": 1.3869356671562079, + "learning_rate": 1.4515488041041414e-06, + "loss": 0.6266, + "step": 8141 + }, + { + "epoch": 0.8316649642492339, + "grad_norm": 1.5881164408369794, + "learning_rate": 1.4498326335894574e-06, + "loss": 0.7423, + "step": 8142 + }, + { + "epoch": 0.8317671092951991, + "grad_norm": 1.478319655150249, + "learning_rate": 1.4481173988961183e-06, + "loss": 0.6096, + "step": 8143 + }, + { + "epoch": 0.8318692543411644, + "grad_norm": 1.4361053561800963, + "learning_rate": 1.446403100211855e-06, + "loss": 0.6312, + "step": 8144 + }, + { + "epoch": 0.8319713993871297, + "grad_norm": 1.5280127812931918, + "learning_rate": 1.4446897377242986e-06, + "loss": 0.6373, + "step": 8145 + }, + { + "epoch": 0.832073544433095, + "grad_norm": 1.4673911052403497, + "learning_rate": 1.4429773116209778e-06, + "loss": 0.6123, + "step": 8146 + }, + { + "epoch": 0.8321756894790603, + "grad_norm": 1.4650189906991498, + "learning_rate": 1.441265822089316e-06, + "loss": 0.5905, + "step": 8147 + }, + { + "epoch": 0.8322778345250256, + "grad_norm": 1.4337618372504215, + "learning_rate": 1.4395552693166425e-06, + "loss": 0.6642, + "step": 8148 + }, + { + "epoch": 0.8323799795709909, + "grad_norm": 1.5808941433839487, + "learning_rate": 1.4378456534901751e-06, + "loss": 0.7413, + "step": 8149 + }, + { + "epoch": 0.832482124616956, + "grad_norm": 1.4358912773241022, + "learning_rate": 1.436136974797031e-06, + "loss": 0.608, + "step": 8150 + }, + { + "epoch": 0.8325842696629213, + "grad_norm": 1.4151565268533477, + "learning_rate": 1.4344292334242306e-06, + "loss": 0.6391, + "step": 8151 + }, + { + "epoch": 0.8326864147088866, + "grad_norm": 1.6498398781341925, + "learning_rate": 1.4327224295586818e-06, + "loss": 0.6575, + "step": 8152 + }, + { + "epoch": 0.8327885597548519, + "grad_norm": 1.391153572791816, + "learning_rate": 1.4310165633872008e-06, + "loss": 0.6756, + "step": 8153 + }, + { + "epoch": 0.8328907048008172, + "grad_norm": 1.3981727528788448, + "learning_rate": 1.4293116350964931e-06, + "loss": 0.6239, + "step": 8154 + }, + { + "epoch": 0.8329928498467825, + "grad_norm": 1.3891598561867289, + "learning_rate": 1.4276076448731646e-06, + "loss": 0.6216, + "step": 8155 + }, + { + "epoch": 0.8330949948927477, + "grad_norm": 1.4565438494515128, + "learning_rate": 1.4259045929037152e-06, + "loss": 0.7296, + "step": 8156 + }, + { + "epoch": 0.833197139938713, + "grad_norm": 1.564040574334429, + "learning_rate": 1.4242024793745491e-06, + "loss": 0.6471, + "step": 8157 + }, + { + "epoch": 0.8332992849846782, + "grad_norm": 1.5621789293755448, + "learning_rate": 1.4225013044719615e-06, + "loss": 0.8013, + "step": 8158 + }, + { + "epoch": 0.8334014300306435, + "grad_norm": 1.6261991901627544, + "learning_rate": 1.4208010683821494e-06, + "loss": 0.7156, + "step": 8159 + }, + { + "epoch": 0.8335035750766088, + "grad_norm": 1.581343917332976, + "learning_rate": 1.4191017712912036e-06, + "loss": 0.6173, + "step": 8160 + }, + { + "epoch": 0.8336057201225741, + "grad_norm": 1.4694883603552782, + "learning_rate": 1.4174034133851122e-06, + "loss": 0.6275, + "step": 8161 + }, + { + "epoch": 0.8337078651685393, + "grad_norm": 1.4265547794511055, + "learning_rate": 1.4157059948497608e-06, + "loss": 0.7316, + "step": 8162 + }, + { + "epoch": 0.8338100102145046, + "grad_norm": 1.482973350672315, + "learning_rate": 1.4140095158709367e-06, + "loss": 0.6361, + "step": 8163 + }, + { + "epoch": 0.8339121552604699, + "grad_norm": 1.6518780881315658, + "learning_rate": 1.4123139766343185e-06, + "loss": 0.6771, + "step": 8164 + }, + { + "epoch": 0.8340143003064351, + "grad_norm": 1.55166635753713, + "learning_rate": 1.4106193773254828e-06, + "loss": 0.7084, + "step": 8165 + }, + { + "epoch": 0.8341164453524004, + "grad_norm": 1.4673043891868442, + "learning_rate": 1.4089257181299042e-06, + "loss": 0.62, + "step": 8166 + }, + { + "epoch": 0.8342185903983657, + "grad_norm": 1.5887228739942647, + "learning_rate": 1.4072329992329559e-06, + "loss": 0.79, + "step": 8167 + }, + { + "epoch": 0.8343207354443309, + "grad_norm": 1.5288142582639677, + "learning_rate": 1.4055412208199105e-06, + "loss": 0.648, + "step": 8168 + }, + { + "epoch": 0.8344228804902962, + "grad_norm": 1.584683619803911, + "learning_rate": 1.403850383075931e-06, + "loss": 0.7647, + "step": 8169 + }, + { + "epoch": 0.8345250255362615, + "grad_norm": 1.4209703937936162, + "learning_rate": 1.4021604861860806e-06, + "loss": 0.7439, + "step": 8170 + }, + { + "epoch": 0.8346271705822268, + "grad_norm": 1.5295796253474756, + "learning_rate": 1.4004715303353177e-06, + "loss": 0.7212, + "step": 8171 + }, + { + "epoch": 0.8347293156281921, + "grad_norm": 1.5514537122346788, + "learning_rate": 1.398783515708504e-06, + "loss": 0.6875, + "step": 8172 + }, + { + "epoch": 0.8348314606741573, + "grad_norm": 1.4069697206909955, + "learning_rate": 1.3970964424903922e-06, + "loss": 0.6807, + "step": 8173 + }, + { + "epoch": 0.8349336057201225, + "grad_norm": 1.4758736999932414, + "learning_rate": 1.395410310865629e-06, + "loss": 0.6668, + "step": 8174 + }, + { + "epoch": 0.8350357507660878, + "grad_norm": 1.5208154030527188, + "learning_rate": 1.3937251210187707e-06, + "loss": 0.7003, + "step": 8175 + }, + { + "epoch": 0.8351378958120531, + "grad_norm": 1.6157769322914122, + "learning_rate": 1.392040873134255e-06, + "loss": 0.7098, + "step": 8176 + }, + { + "epoch": 0.8352400408580184, + "grad_norm": 1.5350439844888428, + "learning_rate": 1.3903575673964298e-06, + "loss": 0.7603, + "step": 8177 + }, + { + "epoch": 0.8353421859039837, + "grad_norm": 1.404109471351054, + "learning_rate": 1.3886752039895313e-06, + "loss": 0.6764, + "step": 8178 + }, + { + "epoch": 0.835444330949949, + "grad_norm": 1.5309618104151284, + "learning_rate": 1.3869937830976953e-06, + "loss": 0.6784, + "step": 8179 + }, + { + "epoch": 0.8355464759959143, + "grad_norm": 1.6452933771467642, + "learning_rate": 1.385313304904955e-06, + "loss": 0.7598, + "step": 8180 + }, + { + "epoch": 0.8356486210418794, + "grad_norm": 1.4722873622448365, + "learning_rate": 1.3836337695952363e-06, + "loss": 0.7737, + "step": 8181 + }, + { + "epoch": 0.8357507660878447, + "grad_norm": 1.531119532443974, + "learning_rate": 1.3819551773523687e-06, + "loss": 0.7183, + "step": 8182 + }, + { + "epoch": 0.83585291113381, + "grad_norm": 1.2898799151019351, + "learning_rate": 1.3802775283600777e-06, + "loss": 0.5687, + "step": 8183 + }, + { + "epoch": 0.8359550561797753, + "grad_norm": 1.5324765304861911, + "learning_rate": 1.3786008228019787e-06, + "loss": 0.6958, + "step": 8184 + }, + { + "epoch": 0.8360572012257406, + "grad_norm": 1.4132021785931987, + "learning_rate": 1.3769250608615915e-06, + "loss": 0.6944, + "step": 8185 + }, + { + "epoch": 0.8361593462717059, + "grad_norm": 1.580361450078761, + "learning_rate": 1.3752502427223246e-06, + "loss": 0.5853, + "step": 8186 + }, + { + "epoch": 0.8362614913176711, + "grad_norm": 1.4022249026009843, + "learning_rate": 1.373576368567493e-06, + "loss": 0.6908, + "step": 8187 + }, + { + "epoch": 0.8363636363636363, + "grad_norm": 1.5632692823789802, + "learning_rate": 1.371903438580302e-06, + "loss": 0.6976, + "step": 8188 + }, + { + "epoch": 0.8364657814096016, + "grad_norm": 1.4338842959147313, + "learning_rate": 1.3702314529438532e-06, + "loss": 0.703, + "step": 8189 + }, + { + "epoch": 0.8365679264555669, + "grad_norm": 1.530211078486839, + "learning_rate": 1.3685604118411456e-06, + "loss": 0.6265, + "step": 8190 + }, + { + "epoch": 0.8366700715015322, + "grad_norm": 1.519003409824781, + "learning_rate": 1.3668903154550772e-06, + "loss": 0.6647, + "step": 8191 + }, + { + "epoch": 0.8367722165474974, + "grad_norm": 1.387155137396174, + "learning_rate": 1.3652211639684442e-06, + "loss": 0.7177, + "step": 8192 + }, + { + "epoch": 0.8368743615934627, + "grad_norm": 1.5302453217138512, + "learning_rate": 1.3635529575639339e-06, + "loss": 0.6194, + "step": 8193 + }, + { + "epoch": 0.836976506639428, + "grad_norm": 1.6940031791141499, + "learning_rate": 1.3618856964241312e-06, + "loss": 0.7269, + "step": 8194 + }, + { + "epoch": 0.8370786516853933, + "grad_norm": 1.7065555506550198, + "learning_rate": 1.360219380731519e-06, + "loss": 0.7965, + "step": 8195 + }, + { + "epoch": 0.8371807967313585, + "grad_norm": 1.5474877061726986, + "learning_rate": 1.358554010668478e-06, + "loss": 0.6894, + "step": 8196 + }, + { + "epoch": 0.8372829417773238, + "grad_norm": 1.5514085998854454, + "learning_rate": 1.3568895864172849e-06, + "loss": 0.7555, + "step": 8197 + }, + { + "epoch": 0.837385086823289, + "grad_norm": 1.5319729118185041, + "learning_rate": 1.3552261081601091e-06, + "loss": 0.7842, + "step": 8198 + }, + { + "epoch": 0.8374872318692543, + "grad_norm": 1.4912627377211982, + "learning_rate": 1.3535635760790223e-06, + "loss": 0.6281, + "step": 8199 + }, + { + "epoch": 0.8375893769152196, + "grad_norm": 1.4679135886781982, + "learning_rate": 1.3519019903559882e-06, + "loss": 0.6356, + "step": 8200 + }, + { + "epoch": 0.8376915219611849, + "grad_norm": 1.521538659267852, + "learning_rate": 1.3502413511728673e-06, + "loss": 0.647, + "step": 8201 + }, + { + "epoch": 0.8377936670071502, + "grad_norm": 1.4508042696362593, + "learning_rate": 1.3485816587114199e-06, + "loss": 0.74, + "step": 8202 + }, + { + "epoch": 0.8378958120531155, + "grad_norm": 1.6743637746816158, + "learning_rate": 1.3469229131533002e-06, + "loss": 0.7477, + "step": 8203 + }, + { + "epoch": 0.8379979570990806, + "grad_norm": 1.4225148950206428, + "learning_rate": 1.3452651146800588e-06, + "loss": 0.6269, + "step": 8204 + }, + { + "epoch": 0.8381001021450459, + "grad_norm": 1.6255162083045618, + "learning_rate": 1.3436082634731396e-06, + "loss": 0.7296, + "step": 8205 + }, + { + "epoch": 0.8382022471910112, + "grad_norm": 1.7366213878172718, + "learning_rate": 1.3419523597138884e-06, + "loss": 0.7473, + "step": 8206 + }, + { + "epoch": 0.8383043922369765, + "grad_norm": 1.6207014259653467, + "learning_rate": 1.340297403583548e-06, + "loss": 0.6347, + "step": 8207 + }, + { + "epoch": 0.8384065372829418, + "grad_norm": 1.441445096213815, + "learning_rate": 1.3386433952632517e-06, + "loss": 0.6865, + "step": 8208 + }, + { + "epoch": 0.8385086823289071, + "grad_norm": 1.57702135907775, + "learning_rate": 1.3369903349340308e-06, + "loss": 0.7227, + "step": 8209 + }, + { + "epoch": 0.8386108273748724, + "grad_norm": 1.5627318645714228, + "learning_rate": 1.3353382227768142e-06, + "loss": 0.6104, + "step": 8210 + }, + { + "epoch": 0.8387129724208376, + "grad_norm": 1.467750960370785, + "learning_rate": 1.3336870589724282e-06, + "loss": 0.7352, + "step": 8211 + }, + { + "epoch": 0.8388151174668028, + "grad_norm": 1.5306048359005233, + "learning_rate": 1.332036843701593e-06, + "loss": 0.6549, + "step": 8212 + }, + { + "epoch": 0.8389172625127681, + "grad_norm": 1.5167162399969156, + "learning_rate": 1.3303875771449247e-06, + "loss": 0.5144, + "step": 8213 + }, + { + "epoch": 0.8390194075587334, + "grad_norm": 1.5798163480212422, + "learning_rate": 1.3287392594829385e-06, + "loss": 0.694, + "step": 8214 + }, + { + "epoch": 0.8391215526046987, + "grad_norm": 1.460791953716969, + "learning_rate": 1.3270918908960406e-06, + "loss": 0.7846, + "step": 8215 + }, + { + "epoch": 0.839223697650664, + "grad_norm": 1.290478280059854, + "learning_rate": 1.325445471564538e-06, + "loss": 0.5491, + "step": 8216 + }, + { + "epoch": 0.8393258426966292, + "grad_norm": 1.567184774325552, + "learning_rate": 1.3238000016686347e-06, + "loss": 0.6461, + "step": 8217 + }, + { + "epoch": 0.8394279877425945, + "grad_norm": 1.589923232042974, + "learning_rate": 1.3221554813884275e-06, + "loss": 0.7656, + "step": 8218 + }, + { + "epoch": 0.8395301327885597, + "grad_norm": 1.6273073151214594, + "learning_rate": 1.32051191090391e-06, + "loss": 0.663, + "step": 8219 + }, + { + "epoch": 0.839632277834525, + "grad_norm": 1.6166074245705306, + "learning_rate": 1.3188692903949685e-06, + "loss": 0.6276, + "step": 8220 + }, + { + "epoch": 0.8397344228804903, + "grad_norm": 1.541100519766431, + "learning_rate": 1.3172276200413948e-06, + "loss": 0.773, + "step": 8221 + }, + { + "epoch": 0.8398365679264556, + "grad_norm": 1.5134877497328956, + "learning_rate": 1.3155869000228672e-06, + "loss": 0.7796, + "step": 8222 + }, + { + "epoch": 0.8399387129724208, + "grad_norm": 1.5082955275520782, + "learning_rate": 1.3139471305189622e-06, + "loss": 0.6456, + "step": 8223 + }, + { + "epoch": 0.8400408580183861, + "grad_norm": 1.5108386164303054, + "learning_rate": 1.3123083117091573e-06, + "loss": 0.6655, + "step": 8224 + }, + { + "epoch": 0.8401430030643514, + "grad_norm": 1.5049696983412515, + "learning_rate": 1.3106704437728191e-06, + "loss": 0.6539, + "step": 8225 + }, + { + "epoch": 0.8402451481103167, + "grad_norm": 1.647998124348615, + "learning_rate": 1.3090335268892175e-06, + "loss": 0.7149, + "step": 8226 + }, + { + "epoch": 0.8403472931562819, + "grad_norm": 1.4826765492861074, + "learning_rate": 1.3073975612375111e-06, + "loss": 0.6772, + "step": 8227 + }, + { + "epoch": 0.8404494382022472, + "grad_norm": 1.4323011394663685, + "learning_rate": 1.3057625469967572e-06, + "loss": 0.5417, + "step": 8228 + }, + { + "epoch": 0.8405515832482124, + "grad_norm": 1.6227434337334146, + "learning_rate": 1.3041284843459078e-06, + "loss": 0.6587, + "step": 8229 + }, + { + "epoch": 0.8406537282941777, + "grad_norm": 1.5499946650575884, + "learning_rate": 1.3024953734638169e-06, + "loss": 0.6153, + "step": 8230 + }, + { + "epoch": 0.840755873340143, + "grad_norm": 1.4370497964387507, + "learning_rate": 1.3008632145292244e-06, + "loss": 0.675, + "step": 8231 + }, + { + "epoch": 0.8408580183861083, + "grad_norm": 1.48623802203511, + "learning_rate": 1.2992320077207744e-06, + "loss": 0.6455, + "step": 8232 + }, + { + "epoch": 0.8409601634320736, + "grad_norm": 1.4482110630727887, + "learning_rate": 1.2976017532170037e-06, + "loss": 0.7146, + "step": 8233 + }, + { + "epoch": 0.8410623084780389, + "grad_norm": 1.4427367974474214, + "learning_rate": 1.2959724511963434e-06, + "loss": 0.6968, + "step": 8234 + }, + { + "epoch": 0.841164453524004, + "grad_norm": 1.6512229444236333, + "learning_rate": 1.2943441018371195e-06, + "loss": 0.7645, + "step": 8235 + }, + { + "epoch": 0.8412665985699693, + "grad_norm": 1.5576136113974224, + "learning_rate": 1.2927167053175603e-06, + "loss": 0.7044, + "step": 8236 + }, + { + "epoch": 0.8413687436159346, + "grad_norm": 1.4597564096245685, + "learning_rate": 1.291090261815784e-06, + "loss": 0.5747, + "step": 8237 + }, + { + "epoch": 0.8414708886618999, + "grad_norm": 1.593830716972166, + "learning_rate": 1.289464771509804e-06, + "loss": 0.6773, + "step": 8238 + }, + { + "epoch": 0.8415730337078652, + "grad_norm": 1.5360531089034826, + "learning_rate": 1.287840234577531e-06, + "loss": 0.7113, + "step": 8239 + }, + { + "epoch": 0.8416751787538305, + "grad_norm": 1.4305447624179328, + "learning_rate": 1.2862166511967734e-06, + "loss": 0.6585, + "step": 8240 + }, + { + "epoch": 0.8417773237997958, + "grad_norm": 1.4609142021553054, + "learning_rate": 1.2845940215452346e-06, + "loss": 0.7431, + "step": 8241 + }, + { + "epoch": 0.8418794688457609, + "grad_norm": 1.3977124218935364, + "learning_rate": 1.2829723458005118e-06, + "loss": 0.6428, + "step": 8242 + }, + { + "epoch": 0.8419816138917262, + "grad_norm": 1.653584238364525, + "learning_rate": 1.281351624140097e-06, + "loss": 0.7709, + "step": 8243 + }, + { + "epoch": 0.8420837589376915, + "grad_norm": 1.4678037674884628, + "learning_rate": 1.2797318567413787e-06, + "loss": 0.6809, + "step": 8244 + }, + { + "epoch": 0.8421859039836568, + "grad_norm": 1.4854574280238055, + "learning_rate": 1.2781130437816436e-06, + "loss": 0.6845, + "step": 8245 + }, + { + "epoch": 0.8422880490296221, + "grad_norm": 1.3998907811007884, + "learning_rate": 1.2764951854380714e-06, + "loss": 0.6528, + "step": 8246 + }, + { + "epoch": 0.8423901940755874, + "grad_norm": 1.5150038870704903, + "learning_rate": 1.2748782818877358e-06, + "loss": 0.6328, + "step": 8247 + }, + { + "epoch": 0.8424923391215526, + "grad_norm": 1.402649116622681, + "learning_rate": 1.273262333307611e-06, + "loss": 0.5897, + "step": 8248 + }, + { + "epoch": 0.8425944841675179, + "grad_norm": 1.2636561222401679, + "learning_rate": 1.27164733987456e-06, + "loss": 0.6364, + "step": 8249 + }, + { + "epoch": 0.8426966292134831, + "grad_norm": 1.648985617449909, + "learning_rate": 1.2700333017653488e-06, + "loss": 0.7184, + "step": 8250 + }, + { + "epoch": 0.8427987742594484, + "grad_norm": 1.263915978062596, + "learning_rate": 1.268420219156633e-06, + "loss": 0.5707, + "step": 8251 + }, + { + "epoch": 0.8429009193054137, + "grad_norm": 1.7063304367168142, + "learning_rate": 1.2668080922249658e-06, + "loss": 0.7565, + "step": 8252 + }, + { + "epoch": 0.843003064351379, + "grad_norm": 1.4154782817642595, + "learning_rate": 1.2651969211467952e-06, + "loss": 0.7702, + "step": 8253 + }, + { + "epoch": 0.8431052093973442, + "grad_norm": 1.5648961577124925, + "learning_rate": 1.263586706098462e-06, + "loss": 0.6425, + "step": 8254 + }, + { + "epoch": 0.8432073544433095, + "grad_norm": 1.48461816336958, + "learning_rate": 1.2619774472562097e-06, + "loss": 0.6729, + "step": 8255 + }, + { + "epoch": 0.8433094994892748, + "grad_norm": 1.5535651729256321, + "learning_rate": 1.260369144796172e-06, + "loss": 0.7646, + "step": 8256 + }, + { + "epoch": 0.8434116445352401, + "grad_norm": 1.3404449509821161, + "learning_rate": 1.258761798894379e-06, + "loss": 0.6533, + "step": 8257 + }, + { + "epoch": 0.8435137895812053, + "grad_norm": 1.6411279324971095, + "learning_rate": 1.2571554097267546e-06, + "loss": 0.7353, + "step": 8258 + }, + { + "epoch": 0.8436159346271705, + "grad_norm": 1.5559675332493963, + "learning_rate": 1.255549977469116e-06, + "loss": 0.7454, + "step": 8259 + }, + { + "epoch": 0.8437180796731358, + "grad_norm": 1.2796119926933718, + "learning_rate": 1.2539455022971858e-06, + "loss": 0.62, + "step": 8260 + }, + { + "epoch": 0.8438202247191011, + "grad_norm": 1.4353597823676087, + "learning_rate": 1.2523419843865692e-06, + "loss": 0.6335, + "step": 8261 + }, + { + "epoch": 0.8439223697650664, + "grad_norm": 1.6129497741580998, + "learning_rate": 1.2507394239127757e-06, + "loss": 0.7342, + "step": 8262 + }, + { + "epoch": 0.8440245148110317, + "grad_norm": 1.4253424198325702, + "learning_rate": 1.2491378210512018e-06, + "loss": 0.7932, + "step": 8263 + }, + { + "epoch": 0.844126659856997, + "grad_norm": 1.4982284523372822, + "learning_rate": 1.2475371759771482e-06, + "loss": 0.6584, + "step": 8264 + }, + { + "epoch": 0.8442288049029623, + "grad_norm": 1.5270501704278463, + "learning_rate": 1.245937488865807e-06, + "loss": 0.634, + "step": 8265 + }, + { + "epoch": 0.8443309499489274, + "grad_norm": 1.3908848541085905, + "learning_rate": 1.244338759892263e-06, + "loss": 0.5551, + "step": 8266 + }, + { + "epoch": 0.8444330949948927, + "grad_norm": 1.4993400275080155, + "learning_rate": 1.242740989231499e-06, + "loss": 0.6868, + "step": 8267 + }, + { + "epoch": 0.844535240040858, + "grad_norm": 1.5550121340985836, + "learning_rate": 1.241144177058392e-06, + "loss": 0.7091, + "step": 8268 + }, + { + "epoch": 0.8446373850868233, + "grad_norm": 1.5192424878636976, + "learning_rate": 1.2395483235477112e-06, + "loss": 0.6799, + "step": 8269 + }, + { + "epoch": 0.8447395301327886, + "grad_norm": 1.5909685747378701, + "learning_rate": 1.237953428874129e-06, + "loss": 0.7335, + "step": 8270 + }, + { + "epoch": 0.8448416751787539, + "grad_norm": 1.593661788493861, + "learning_rate": 1.2363594932122026e-06, + "loss": 0.6705, + "step": 8271 + }, + { + "epoch": 0.8449438202247191, + "grad_norm": 1.4357876486447376, + "learning_rate": 1.2347665167363942e-06, + "loss": 0.6598, + "step": 8272 + }, + { + "epoch": 0.8450459652706843, + "grad_norm": 1.4742916599107387, + "learning_rate": 1.2331744996210537e-06, + "loss": 0.726, + "step": 8273 + }, + { + "epoch": 0.8451481103166496, + "grad_norm": 1.462129482419131, + "learning_rate": 1.231583442040425e-06, + "loss": 0.5845, + "step": 8274 + }, + { + "epoch": 0.8452502553626149, + "grad_norm": 1.5690825916418867, + "learning_rate": 1.2299933441686562e-06, + "loss": 0.6925, + "step": 8275 + }, + { + "epoch": 0.8453524004085802, + "grad_norm": 1.4361393036662764, + "learning_rate": 1.228404206179783e-06, + "loss": 0.7037, + "step": 8276 + }, + { + "epoch": 0.8454545454545455, + "grad_norm": 1.555600272565985, + "learning_rate": 1.226816028247736e-06, + "loss": 0.6705, + "step": 8277 + }, + { + "epoch": 0.8455566905005107, + "grad_norm": 1.3872687460043418, + "learning_rate": 1.2252288105463405e-06, + "loss": 0.7308, + "step": 8278 + }, + { + "epoch": 0.845658835546476, + "grad_norm": 1.642763269340115, + "learning_rate": 1.2236425532493213e-06, + "loss": 0.6662, + "step": 8279 + }, + { + "epoch": 0.8457609805924413, + "grad_norm": 1.5024050731333365, + "learning_rate": 1.222057256530297e-06, + "loss": 0.6739, + "step": 8280 + }, + { + "epoch": 0.8458631256384065, + "grad_norm": 1.4969847513172527, + "learning_rate": 1.2204729205627774e-06, + "loss": 0.7276, + "step": 8281 + }, + { + "epoch": 0.8459652706843718, + "grad_norm": 1.4580292038690332, + "learning_rate": 1.2188895455201688e-06, + "loss": 0.6367, + "step": 8282 + }, + { + "epoch": 0.8460674157303371, + "grad_norm": 1.6702878164215913, + "learning_rate": 1.2173071315757701e-06, + "loss": 0.6753, + "step": 8283 + }, + { + "epoch": 0.8461695607763023, + "grad_norm": 1.3638612131921646, + "learning_rate": 1.2157256789027828e-06, + "loss": 0.6379, + "step": 8284 + }, + { + "epoch": 0.8462717058222676, + "grad_norm": 1.3661290707863352, + "learning_rate": 1.214145187674296e-06, + "loss": 0.5924, + "step": 8285 + }, + { + "epoch": 0.8463738508682329, + "grad_norm": 1.4734222442963858, + "learning_rate": 1.2125656580632939e-06, + "loss": 0.6418, + "step": 8286 + }, + { + "epoch": 0.8464759959141982, + "grad_norm": 1.4844363405598546, + "learning_rate": 1.2109870902426558e-06, + "loss": 0.7947, + "step": 8287 + }, + { + "epoch": 0.8465781409601635, + "grad_norm": 1.4409950843028163, + "learning_rate": 1.2094094843851612e-06, + "loss": 0.6364, + "step": 8288 + }, + { + "epoch": 0.8466802860061287, + "grad_norm": 1.5554225116992695, + "learning_rate": 1.2078328406634765e-06, + "loss": 0.5967, + "step": 8289 + }, + { + "epoch": 0.8467824310520939, + "grad_norm": 1.519089030800187, + "learning_rate": 1.2062571592501692e-06, + "loss": 0.7747, + "step": 8290 + }, + { + "epoch": 0.8468845760980592, + "grad_norm": 1.4302614247190841, + "learning_rate": 1.2046824403176983e-06, + "loss": 0.6194, + "step": 8291 + }, + { + "epoch": 0.8469867211440245, + "grad_norm": 1.668285188563771, + "learning_rate": 1.2031086840384154e-06, + "loss": 0.7204, + "step": 8292 + }, + { + "epoch": 0.8470888661899898, + "grad_norm": 1.4063476492285198, + "learning_rate": 1.2015358905845699e-06, + "loss": 0.6491, + "step": 8293 + }, + { + "epoch": 0.8471910112359551, + "grad_norm": 1.3783506569007256, + "learning_rate": 1.1999640601283069e-06, + "loss": 0.6473, + "step": 8294 + }, + { + "epoch": 0.8472931562819204, + "grad_norm": 1.6438735016415327, + "learning_rate": 1.1983931928416614e-06, + "loss": 0.6941, + "step": 8295 + }, + { + "epoch": 0.8473953013278857, + "grad_norm": 1.513510309176791, + "learning_rate": 1.1968232888965692e-06, + "loss": 0.623, + "step": 8296 + }, + { + "epoch": 0.8474974463738508, + "grad_norm": 1.4043294562513438, + "learning_rate": 1.195254348464856e-06, + "loss": 0.693, + "step": 8297 + }, + { + "epoch": 0.8475995914198161, + "grad_norm": 1.382566793179503, + "learning_rate": 1.19368637171824e-06, + "loss": 0.7131, + "step": 8298 + }, + { + "epoch": 0.8477017364657814, + "grad_norm": 1.5928810802805253, + "learning_rate": 1.192119358828343e-06, + "loss": 0.8231, + "step": 8299 + }, + { + "epoch": 0.8478038815117467, + "grad_norm": 1.422601486539526, + "learning_rate": 1.1905533099666732e-06, + "loss": 0.6791, + "step": 8300 + }, + { + "epoch": 0.847906026557712, + "grad_norm": 1.5243141136028528, + "learning_rate": 1.1889882253046347e-06, + "loss": 0.5951, + "step": 8301 + }, + { + "epoch": 0.8480081716036773, + "grad_norm": 1.5587905641221254, + "learning_rate": 1.1874241050135283e-06, + "loss": 0.6197, + "step": 8302 + }, + { + "epoch": 0.8481103166496425, + "grad_norm": 1.6424808663493455, + "learning_rate": 1.1858609492645435e-06, + "loss": 0.7111, + "step": 8303 + }, + { + "epoch": 0.8482124616956077, + "grad_norm": 1.6258600924568685, + "learning_rate": 1.1842987582287734e-06, + "loss": 0.6623, + "step": 8304 + }, + { + "epoch": 0.848314606741573, + "grad_norm": 1.526471651358364, + "learning_rate": 1.1827375320772027e-06, + "loss": 0.6383, + "step": 8305 + }, + { + "epoch": 0.8484167517875383, + "grad_norm": 1.38963719292274, + "learning_rate": 1.1811772709807057e-06, + "loss": 0.6806, + "step": 8306 + }, + { + "epoch": 0.8485188968335036, + "grad_norm": 1.575124060407301, + "learning_rate": 1.1796179751100533e-06, + "loss": 0.6778, + "step": 8307 + }, + { + "epoch": 0.8486210418794689, + "grad_norm": 1.4866530262146251, + "learning_rate": 1.1780596446359105e-06, + "loss": 0.5911, + "step": 8308 + }, + { + "epoch": 0.8487231869254341, + "grad_norm": 1.5062414086633105, + "learning_rate": 1.1765022797288418e-06, + "loss": 0.6412, + "step": 8309 + }, + { + "epoch": 0.8488253319713994, + "grad_norm": 1.5753247094481755, + "learning_rate": 1.1749458805592983e-06, + "loss": 0.6912, + "step": 8310 + }, + { + "epoch": 0.8489274770173647, + "grad_norm": 1.431872908227566, + "learning_rate": 1.1733904472976277e-06, + "loss": 0.661, + "step": 8311 + }, + { + "epoch": 0.8490296220633299, + "grad_norm": 1.3938312565265256, + "learning_rate": 1.1718359801140788e-06, + "loss": 0.6762, + "step": 8312 + }, + { + "epoch": 0.8491317671092952, + "grad_norm": 1.6075948936753437, + "learning_rate": 1.1702824791787825e-06, + "loss": 0.7822, + "step": 8313 + }, + { + "epoch": 0.8492339121552605, + "grad_norm": 1.5666264133552175, + "learning_rate": 1.1687299446617762e-06, + "loss": 0.7319, + "step": 8314 + }, + { + "epoch": 0.8493360572012257, + "grad_norm": 1.4335010181056, + "learning_rate": 1.1671783767329824e-06, + "loss": 0.7015, + "step": 8315 + }, + { + "epoch": 0.849438202247191, + "grad_norm": 1.4845499697012328, + "learning_rate": 1.1656277755622225e-06, + "loss": 0.7232, + "step": 8316 + }, + { + "epoch": 0.8495403472931563, + "grad_norm": 1.3611465700380552, + "learning_rate": 1.1640781413192082e-06, + "loss": 0.7713, + "step": 8317 + }, + { + "epoch": 0.8496424923391216, + "grad_norm": 1.5612776185721151, + "learning_rate": 1.1625294741735527e-06, + "loss": 0.576, + "step": 8318 + }, + { + "epoch": 0.8497446373850869, + "grad_norm": 1.4335053682421963, + "learning_rate": 1.1609817742947538e-06, + "loss": 0.6209, + "step": 8319 + }, + { + "epoch": 0.849846782431052, + "grad_norm": 1.5300787797336082, + "learning_rate": 1.1594350418522115e-06, + "loss": 0.7007, + "step": 8320 + }, + { + "epoch": 0.8499489274770173, + "grad_norm": 1.4669915467675874, + "learning_rate": 1.1578892770152162e-06, + "loss": 0.6246, + "step": 8321 + }, + { + "epoch": 0.8500510725229826, + "grad_norm": 1.6326682373033836, + "learning_rate": 1.1563444799529522e-06, + "loss": 0.7536, + "step": 8322 + }, + { + "epoch": 0.8501532175689479, + "grad_norm": 1.496430257119027, + "learning_rate": 1.1548006508344966e-06, + "loss": 0.6003, + "step": 8323 + }, + { + "epoch": 0.8502553626149132, + "grad_norm": 1.4847971629580081, + "learning_rate": 1.1532577898288267e-06, + "loss": 0.7235, + "step": 8324 + }, + { + "epoch": 0.8503575076608785, + "grad_norm": 1.49297185425958, + "learning_rate": 1.151715897104807e-06, + "loss": 0.6805, + "step": 8325 + }, + { + "epoch": 0.8504596527068438, + "grad_norm": 1.4577493639847692, + "learning_rate": 1.1501749728311994e-06, + "loss": 0.6483, + "step": 8326 + }, + { + "epoch": 0.8505617977528089, + "grad_norm": 1.5627491730819745, + "learning_rate": 1.148635017176657e-06, + "loss": 0.704, + "step": 8327 + }, + { + "epoch": 0.8506639427987742, + "grad_norm": 1.4827267529092356, + "learning_rate": 1.14709603030973e-06, + "loss": 0.6675, + "step": 8328 + }, + { + "epoch": 0.8507660878447395, + "grad_norm": 1.49504066312726, + "learning_rate": 1.1455580123988653e-06, + "loss": 0.6509, + "step": 8329 + }, + { + "epoch": 0.8508682328907048, + "grad_norm": 1.4408648997080775, + "learning_rate": 1.1440209636123956e-06, + "loss": 0.7024, + "step": 8330 + }, + { + "epoch": 0.8509703779366701, + "grad_norm": 1.608215344639957, + "learning_rate": 1.1424848841185542e-06, + "loss": 0.6993, + "step": 8331 + }, + { + "epoch": 0.8510725229826354, + "grad_norm": 1.5792241908133076, + "learning_rate": 1.1409497740854625e-06, + "loss": 0.7387, + "step": 8332 + }, + { + "epoch": 0.8511746680286006, + "grad_norm": 1.6784978736053213, + "learning_rate": 1.1394156336811436e-06, + "loss": 0.7714, + "step": 8333 + }, + { + "epoch": 0.8512768130745659, + "grad_norm": 1.4756358463644228, + "learning_rate": 1.1378824630735087e-06, + "loss": 0.665, + "step": 8334 + }, + { + "epoch": 0.8513789581205311, + "grad_norm": 1.455234974518297, + "learning_rate": 1.1363502624303614e-06, + "loss": 0.8013, + "step": 8335 + }, + { + "epoch": 0.8514811031664964, + "grad_norm": 1.5578478407176866, + "learning_rate": 1.1348190319194064e-06, + "loss": 0.7288, + "step": 8336 + }, + { + "epoch": 0.8515832482124617, + "grad_norm": 1.5238399426281883, + "learning_rate": 1.1332887717082342e-06, + "loss": 0.6823, + "step": 8337 + }, + { + "epoch": 0.851685393258427, + "grad_norm": 1.5100274966026, + "learning_rate": 1.1317594819643362e-06, + "loss": 0.6719, + "step": 8338 + }, + { + "epoch": 0.8517875383043922, + "grad_norm": 1.9348803766528373, + "learning_rate": 1.1302311628550933e-06, + "loss": 0.7397, + "step": 8339 + }, + { + "epoch": 0.8518896833503575, + "grad_norm": 1.5639655456892616, + "learning_rate": 1.1287038145477791e-06, + "loss": 0.6396, + "step": 8340 + }, + { + "epoch": 0.8519918283963228, + "grad_norm": 1.4829619692123155, + "learning_rate": 1.1271774372095646e-06, + "loss": 0.7001, + "step": 8341 + }, + { + "epoch": 0.8520939734422881, + "grad_norm": 1.5711843217568109, + "learning_rate": 1.1256520310075103e-06, + "loss": 0.6871, + "step": 8342 + }, + { + "epoch": 0.8521961184882533, + "grad_norm": 1.5996771928356603, + "learning_rate": 1.1241275961085751e-06, + "loss": 0.6852, + "step": 8343 + }, + { + "epoch": 0.8522982635342186, + "grad_norm": 1.3985494490443957, + "learning_rate": 1.12260413267961e-06, + "loss": 0.7239, + "step": 8344 + }, + { + "epoch": 0.8524004085801838, + "grad_norm": 1.3838674990944548, + "learning_rate": 1.1210816408873592e-06, + "loss": 0.6503, + "step": 8345 + }, + { + "epoch": 0.8525025536261491, + "grad_norm": 1.4871811748704775, + "learning_rate": 1.1195601208984587e-06, + "loss": 0.6034, + "step": 8346 + }, + { + "epoch": 0.8526046986721144, + "grad_norm": 1.6507273123002324, + "learning_rate": 1.118039572879439e-06, + "loss": 0.6567, + "step": 8347 + }, + { + "epoch": 0.8527068437180797, + "grad_norm": 1.631555295937028, + "learning_rate": 1.1165199969967277e-06, + "loss": 0.8299, + "step": 8348 + }, + { + "epoch": 0.852808988764045, + "grad_norm": 1.4300233191496894, + "learning_rate": 1.1150013934166426e-06, + "loss": 0.7062, + "step": 8349 + }, + { + "epoch": 0.8529111338100103, + "grad_norm": 1.60307805053133, + "learning_rate": 1.1134837623053962e-06, + "loss": 0.8125, + "step": 8350 + }, + { + "epoch": 0.8530132788559754, + "grad_norm": 1.4187994086111455, + "learning_rate": 1.1119671038290901e-06, + "loss": 0.6542, + "step": 8351 + }, + { + "epoch": 0.8531154239019407, + "grad_norm": 1.7344322105544165, + "learning_rate": 1.1104514181537273e-06, + "loss": 0.6951, + "step": 8352 + }, + { + "epoch": 0.853217568947906, + "grad_norm": 1.6365063070819643, + "learning_rate": 1.1089367054452028e-06, + "loss": 0.7281, + "step": 8353 + }, + { + "epoch": 0.8533197139938713, + "grad_norm": 1.5333027224389877, + "learning_rate": 1.1074229658693003e-06, + "loss": 0.7699, + "step": 8354 + }, + { + "epoch": 0.8534218590398366, + "grad_norm": 1.6694082709363514, + "learning_rate": 1.1059101995916988e-06, + "loss": 0.6883, + "step": 8355 + }, + { + "epoch": 0.8535240040858019, + "grad_norm": 1.5817948433424238, + "learning_rate": 1.1043984067779723e-06, + "loss": 0.7377, + "step": 8356 + }, + { + "epoch": 0.8536261491317672, + "grad_norm": 1.5384613166492191, + "learning_rate": 1.1028875875935863e-06, + "loss": 0.6396, + "step": 8357 + }, + { + "epoch": 0.8537282941777323, + "grad_norm": 1.5048784987433386, + "learning_rate": 1.101377742203903e-06, + "loss": 0.6462, + "step": 8358 + }, + { + "epoch": 0.8538304392236976, + "grad_norm": 1.4491982865127386, + "learning_rate": 1.0998688707741733e-06, + "loss": 0.665, + "step": 8359 + }, + { + "epoch": 0.8539325842696629, + "grad_norm": 1.5306767462310378, + "learning_rate": 1.0983609734695488e-06, + "loss": 0.6339, + "step": 8360 + }, + { + "epoch": 0.8540347293156282, + "grad_norm": 1.4556056141121028, + "learning_rate": 1.0968540504550661e-06, + "loss": 0.659, + "step": 8361 + }, + { + "epoch": 0.8541368743615935, + "grad_norm": 1.560099801058066, + "learning_rate": 1.095348101895658e-06, + "loss": 0.6243, + "step": 8362 + }, + { + "epoch": 0.8542390194075588, + "grad_norm": 1.6843941265264888, + "learning_rate": 1.0938431279561556e-06, + "loss": 0.7801, + "step": 8363 + }, + { + "epoch": 0.854341164453524, + "grad_norm": 1.5789251376130133, + "learning_rate": 1.0923391288012764e-06, + "loss": 0.7804, + "step": 8364 + }, + { + "epoch": 0.8544433094994893, + "grad_norm": 1.5706988507324289, + "learning_rate": 1.0908361045956352e-06, + "loss": 0.6853, + "step": 8365 + }, + { + "epoch": 0.8545454545454545, + "grad_norm": 1.6974830420637157, + "learning_rate": 1.0893340555037356e-06, + "loss": 0.7981, + "step": 8366 + }, + { + "epoch": 0.8546475995914198, + "grad_norm": 1.4685955596527507, + "learning_rate": 1.0878329816899813e-06, + "loss": 0.7786, + "step": 8367 + }, + { + "epoch": 0.8547497446373851, + "grad_norm": 1.511055685775897, + "learning_rate": 1.086332883318667e-06, + "loss": 0.7245, + "step": 8368 + }, + { + "epoch": 0.8548518896833504, + "grad_norm": 1.4492622970916131, + "learning_rate": 1.0848337605539782e-06, + "loss": 0.6086, + "step": 8369 + }, + { + "epoch": 0.8549540347293156, + "grad_norm": 1.5179494988495408, + "learning_rate": 1.0833356135599938e-06, + "loss": 0.6718, + "step": 8370 + }, + { + "epoch": 0.8550561797752809, + "grad_norm": 1.4497075829838668, + "learning_rate": 1.0818384425006844e-06, + "loss": 0.7019, + "step": 8371 + }, + { + "epoch": 0.8551583248212462, + "grad_norm": 1.5260078580520513, + "learning_rate": 1.0803422475399228e-06, + "loss": 0.7017, + "step": 8372 + }, + { + "epoch": 0.8552604698672115, + "grad_norm": 1.4489602047079986, + "learning_rate": 1.0788470288414642e-06, + "loss": 0.5729, + "step": 8373 + }, + { + "epoch": 0.8553626149131767, + "grad_norm": 1.6914722426647502, + "learning_rate": 1.0773527865689625e-06, + "loss": 0.7541, + "step": 8374 + }, + { + "epoch": 0.855464759959142, + "grad_norm": 1.6822433057494055, + "learning_rate": 1.07585952088596e-06, + "loss": 0.7944, + "step": 8375 + }, + { + "epoch": 0.8555669050051072, + "grad_norm": 1.348097100251337, + "learning_rate": 1.0743672319559017e-06, + "loss": 0.6721, + "step": 8376 + }, + { + "epoch": 0.8556690500510725, + "grad_norm": 1.4963404354326346, + "learning_rate": 1.0728759199421146e-06, + "loss": 0.6947, + "step": 8377 + }, + { + "epoch": 0.8557711950970378, + "grad_norm": 1.6349750018882885, + "learning_rate": 1.071385585007828e-06, + "loss": 0.7319, + "step": 8378 + }, + { + "epoch": 0.8558733401430031, + "grad_norm": 1.285242558988449, + "learning_rate": 1.0698962273161573e-06, + "loss": 0.6166, + "step": 8379 + }, + { + "epoch": 0.8559754851889684, + "grad_norm": 1.307375444013231, + "learning_rate": 1.068407847030114e-06, + "loss": 0.5787, + "step": 8380 + }, + { + "epoch": 0.8560776302349336, + "grad_norm": 1.4056924453508923, + "learning_rate": 1.0669204443126002e-06, + "loss": 0.6134, + "step": 8381 + }, + { + "epoch": 0.8561797752808988, + "grad_norm": 1.5096506350933856, + "learning_rate": 1.065434019326418e-06, + "loss": 0.68, + "step": 8382 + }, + { + "epoch": 0.8562819203268641, + "grad_norm": 1.5705467604452903, + "learning_rate": 1.0639485722342524e-06, + "loss": 0.708, + "step": 8383 + }, + { + "epoch": 0.8563840653728294, + "grad_norm": 1.6086545446162752, + "learning_rate": 1.0624641031986903e-06, + "loss": 0.7612, + "step": 8384 + }, + { + "epoch": 0.8564862104187947, + "grad_norm": 1.4667618464165484, + "learning_rate": 1.0609806123822076e-06, + "loss": 0.6591, + "step": 8385 + }, + { + "epoch": 0.85658835546476, + "grad_norm": 1.3851260883922176, + "learning_rate": 1.0594980999471694e-06, + "loss": 0.6937, + "step": 8386 + }, + { + "epoch": 0.8566905005107253, + "grad_norm": 1.5715958739378904, + "learning_rate": 1.0580165660558439e-06, + "loss": 0.6998, + "step": 8387 + }, + { + "epoch": 0.8567926455566905, + "grad_norm": 1.60621104241265, + "learning_rate": 1.0565360108703816e-06, + "loss": 0.7289, + "step": 8388 + }, + { + "epoch": 0.8568947906026557, + "grad_norm": 1.5755417222954873, + "learning_rate": 1.0550564345528302e-06, + "loss": 0.6729, + "step": 8389 + }, + { + "epoch": 0.856996935648621, + "grad_norm": 1.4740455403740886, + "learning_rate": 1.0535778372651318e-06, + "loss": 0.6897, + "step": 8390 + }, + { + "epoch": 0.8570990806945863, + "grad_norm": 1.5783974294431247, + "learning_rate": 1.0521002191691153e-06, + "loss": 0.7223, + "step": 8391 + }, + { + "epoch": 0.8572012257405516, + "grad_norm": 1.4022919468166204, + "learning_rate": 1.0506235804265153e-06, + "loss": 0.6716, + "step": 8392 + }, + { + "epoch": 0.8573033707865169, + "grad_norm": 1.5579921920885453, + "learning_rate": 1.0491479211989464e-06, + "loss": 0.6409, + "step": 8393 + }, + { + "epoch": 0.8574055158324821, + "grad_norm": 1.52293031855573, + "learning_rate": 1.0476732416479207e-06, + "loss": 0.6377, + "step": 8394 + }, + { + "epoch": 0.8575076608784474, + "grad_norm": 1.4832990699534652, + "learning_rate": 1.0461995419348425e-06, + "loss": 0.6686, + "step": 8395 + }, + { + "epoch": 0.8576098059244127, + "grad_norm": 1.550337822007131, + "learning_rate": 1.0447268222210072e-06, + "loss": 0.6739, + "step": 8396 + }, + { + "epoch": 0.8577119509703779, + "grad_norm": 1.6782195940649836, + "learning_rate": 1.043255082667608e-06, + "loss": 0.6262, + "step": 8397 + }, + { + "epoch": 0.8578140960163432, + "grad_norm": 1.5755145608127419, + "learning_rate": 1.0417843234357283e-06, + "loss": 0.6656, + "step": 8398 + }, + { + "epoch": 0.8579162410623085, + "grad_norm": 1.5443480028971983, + "learning_rate": 1.0403145446863394e-06, + "loss": 0.7346, + "step": 8399 + }, + { + "epoch": 0.8580183861082737, + "grad_norm": 1.5297780278216855, + "learning_rate": 1.0388457465803148e-06, + "loss": 0.6226, + "step": 8400 + }, + { + "epoch": 0.858120531154239, + "grad_norm": 1.5091107474738008, + "learning_rate": 1.0373779292784103e-06, + "loss": 0.6427, + "step": 8401 + }, + { + "epoch": 0.8582226762002043, + "grad_norm": 1.4666599466263683, + "learning_rate": 1.0359110929412841e-06, + "loss": 0.5913, + "step": 8402 + }, + { + "epoch": 0.8583248212461696, + "grad_norm": 1.399957793209479, + "learning_rate": 1.0344452377294812e-06, + "loss": 0.6925, + "step": 8403 + }, + { + "epoch": 0.8584269662921349, + "grad_norm": 1.5421864070525229, + "learning_rate": 1.0329803638034386e-06, + "loss": 0.6885, + "step": 8404 + }, + { + "epoch": 0.8585291113381001, + "grad_norm": 1.6069704570268586, + "learning_rate": 1.031516471323487e-06, + "loss": 0.8394, + "step": 8405 + }, + { + "epoch": 0.8586312563840653, + "grad_norm": 1.5521176037173332, + "learning_rate": 1.030053560449854e-06, + "loss": 0.6796, + "step": 8406 + }, + { + "epoch": 0.8587334014300306, + "grad_norm": 1.4065750604015383, + "learning_rate": 1.0285916313426513e-06, + "loss": 0.5919, + "step": 8407 + }, + { + "epoch": 0.8588355464759959, + "grad_norm": 1.534673093252651, + "learning_rate": 1.0271306841618945e-06, + "loss": 0.624, + "step": 8408 + }, + { + "epoch": 0.8589376915219612, + "grad_norm": 1.5298296488323104, + "learning_rate": 1.0256707190674797e-06, + "loss": 0.7186, + "step": 8409 + }, + { + "epoch": 0.8590398365679265, + "grad_norm": 1.6006599863585649, + "learning_rate": 1.024211736219204e-06, + "loss": 0.6672, + "step": 8410 + }, + { + "epoch": 0.8591419816138918, + "grad_norm": 1.6403547054101784, + "learning_rate": 1.0227537357767504e-06, + "loss": 0.7179, + "step": 8411 + }, + { + "epoch": 0.8592441266598569, + "grad_norm": 1.462667240267668, + "learning_rate": 1.0212967178997024e-06, + "loss": 0.7737, + "step": 8412 + }, + { + "epoch": 0.8593462717058222, + "grad_norm": 1.5496130041531306, + "learning_rate": 1.0198406827475304e-06, + "loss": 0.6385, + "step": 8413 + }, + { + "epoch": 0.8594484167517875, + "grad_norm": 1.4386234041606987, + "learning_rate": 1.0183856304795969e-06, + "loss": 0.7122, + "step": 8414 + }, + { + "epoch": 0.8595505617977528, + "grad_norm": 1.4291252637253038, + "learning_rate": 1.0169315612551566e-06, + "loss": 0.7004, + "step": 8415 + }, + { + "epoch": 0.8596527068437181, + "grad_norm": 1.409563058017546, + "learning_rate": 1.0154784752333625e-06, + "loss": 0.695, + "step": 8416 + }, + { + "epoch": 0.8597548518896834, + "grad_norm": 1.5266882454159405, + "learning_rate": 1.0140263725732546e-06, + "loss": 0.7491, + "step": 8417 + }, + { + "epoch": 0.8598569969356487, + "grad_norm": 1.5284731824300424, + "learning_rate": 1.0125752534337664e-06, + "loss": 0.6461, + "step": 8418 + }, + { + "epoch": 0.8599591419816139, + "grad_norm": 1.2770840589351984, + "learning_rate": 1.0111251179737225e-06, + "loss": 0.6503, + "step": 8419 + }, + { + "epoch": 0.8600612870275791, + "grad_norm": 1.5599679485567866, + "learning_rate": 1.0096759663518407e-06, + "loss": 0.6957, + "step": 8420 + }, + { + "epoch": 0.8601634320735444, + "grad_norm": 1.5697499707897458, + "learning_rate": 1.0082277987267341e-06, + "loss": 0.6834, + "step": 8421 + }, + { + "epoch": 0.8602655771195097, + "grad_norm": 1.5076440741097168, + "learning_rate": 1.0067806152569048e-06, + "loss": 0.726, + "step": 8422 + }, + { + "epoch": 0.860367722165475, + "grad_norm": 1.4948877328610561, + "learning_rate": 1.0053344161007461e-06, + "loss": 0.6929, + "step": 8423 + }, + { + "epoch": 0.8604698672114403, + "grad_norm": 1.5395570487238657, + "learning_rate": 1.0038892014165491e-06, + "loss": 0.6547, + "step": 8424 + }, + { + "epoch": 0.8605720122574055, + "grad_norm": 1.3567920733803391, + "learning_rate": 1.0024449713624885e-06, + "loss": 0.5855, + "step": 8425 + }, + { + "epoch": 0.8606741573033708, + "grad_norm": 1.5776620557456118, + "learning_rate": 1.0010017260966409e-06, + "loss": 0.829, + "step": 8426 + }, + { + "epoch": 0.8607763023493361, + "grad_norm": 1.5773438314140968, + "learning_rate": 9.99559465776968e-07, + "loss": 0.6737, + "step": 8427 + }, + { + "epoch": 0.8608784473953013, + "grad_norm": 1.5651514097935753, + "learning_rate": 9.98118190561328e-07, + "loss": 0.7276, + "step": 8428 + }, + { + "epoch": 0.8609805924412666, + "grad_norm": 1.6425492111090085, + "learning_rate": 9.966779006074666e-07, + "loss": 0.7719, + "step": 8429 + }, + { + "epoch": 0.8610827374872319, + "grad_norm": 1.3180986704982216, + "learning_rate": 9.952385960730249e-07, + "loss": 0.6816, + "step": 8430 + }, + { + "epoch": 0.8611848825331971, + "grad_norm": 1.3720702559157816, + "learning_rate": 9.938002771155363e-07, + "loss": 0.6412, + "step": 8431 + }, + { + "epoch": 0.8612870275791624, + "grad_norm": 1.523881839338781, + "learning_rate": 9.923629438924275e-07, + "loss": 0.65, + "step": 8432 + }, + { + "epoch": 0.8613891726251277, + "grad_norm": 1.5530211463408303, + "learning_rate": 9.90926596561015e-07, + "loss": 0.7387, + "step": 8433 + }, + { + "epoch": 0.861491317671093, + "grad_norm": 1.504503627322489, + "learning_rate": 9.894912352785068e-07, + "loss": 0.7267, + "step": 8434 + }, + { + "epoch": 0.8615934627170582, + "grad_norm": 1.4995518305753064, + "learning_rate": 9.880568602020024e-07, + "loss": 0.635, + "step": 8435 + }, + { + "epoch": 0.8616956077630235, + "grad_norm": 1.52295734799373, + "learning_rate": 9.866234714884993e-07, + "loss": 0.7249, + "step": 8436 + }, + { + "epoch": 0.8617977528089887, + "grad_norm": 1.4918923691055421, + "learning_rate": 9.851910692948808e-07, + "loss": 0.6448, + "step": 8437 + }, + { + "epoch": 0.861899897854954, + "grad_norm": 1.4268301794085236, + "learning_rate": 9.837596537779236e-07, + "loss": 0.6643, + "step": 8438 + }, + { + "epoch": 0.8620020429009193, + "grad_norm": 1.5968089462852884, + "learning_rate": 9.82329225094296e-07, + "loss": 0.7822, + "step": 8439 + }, + { + "epoch": 0.8621041879468846, + "grad_norm": 1.4662695372847874, + "learning_rate": 9.808997834005608e-07, + "loss": 0.5761, + "step": 8440 + }, + { + "epoch": 0.8622063329928499, + "grad_norm": 1.462011915016091, + "learning_rate": 9.794713288531732e-07, + "loss": 0.6583, + "step": 8441 + }, + { + "epoch": 0.8623084780388152, + "grad_norm": 1.3541053527917732, + "learning_rate": 9.780438616084765e-07, + "loss": 0.6328, + "step": 8442 + }, + { + "epoch": 0.8624106230847803, + "grad_norm": 1.4395147613221053, + "learning_rate": 9.766173818227086e-07, + "loss": 0.6242, + "step": 8443 + }, + { + "epoch": 0.8625127681307456, + "grad_norm": 1.4203046476061356, + "learning_rate": 9.751918896519974e-07, + "loss": 0.6014, + "step": 8444 + }, + { + "epoch": 0.8626149131767109, + "grad_norm": 1.4173082398051704, + "learning_rate": 9.737673852523632e-07, + "loss": 0.6746, + "step": 8445 + }, + { + "epoch": 0.8627170582226762, + "grad_norm": 1.4861742514945837, + "learning_rate": 9.723438687797227e-07, + "loss": 0.72, + "step": 8446 + }, + { + "epoch": 0.8628192032686415, + "grad_norm": 1.541930701300977, + "learning_rate": 9.709213403898753e-07, + "loss": 0.6622, + "step": 8447 + }, + { + "epoch": 0.8629213483146068, + "grad_norm": 1.6095812558113067, + "learning_rate": 9.694998002385235e-07, + "loss": 0.6841, + "step": 8448 + }, + { + "epoch": 0.863023493360572, + "grad_norm": 1.4476702122682548, + "learning_rate": 9.68079248481253e-07, + "loss": 0.764, + "step": 8449 + }, + { + "epoch": 0.8631256384065373, + "grad_norm": 1.579290552032581, + "learning_rate": 9.66659685273542e-07, + "loss": 0.7284, + "step": 8450 + }, + { + "epoch": 0.8632277834525025, + "grad_norm": 1.4032523648897783, + "learning_rate": 9.65241110770766e-07, + "loss": 0.818, + "step": 8451 + }, + { + "epoch": 0.8633299284984678, + "grad_norm": 1.5115535217447795, + "learning_rate": 9.638235251281892e-07, + "loss": 0.6009, + "step": 8452 + }, + { + "epoch": 0.8634320735444331, + "grad_norm": 1.7035822410655495, + "learning_rate": 9.624069285009641e-07, + "loss": 0.6748, + "step": 8453 + }, + { + "epoch": 0.8635342185903984, + "grad_norm": 1.5572569503985585, + "learning_rate": 9.60991321044139e-07, + "loss": 0.6837, + "step": 8454 + }, + { + "epoch": 0.8636363636363636, + "grad_norm": 1.498325662260311, + "learning_rate": 9.595767029126525e-07, + "loss": 0.6289, + "step": 8455 + }, + { + "epoch": 0.8637385086823289, + "grad_norm": 1.5135315534106113, + "learning_rate": 9.581630742613402e-07, + "loss": 0.7445, + "step": 8456 + }, + { + "epoch": 0.8638406537282942, + "grad_norm": 1.480781254611083, + "learning_rate": 9.567504352449198e-07, + "loss": 0.7096, + "step": 8457 + }, + { + "epoch": 0.8639427987742595, + "grad_norm": 1.5788290032879526, + "learning_rate": 9.553387860180074e-07, + "loss": 0.6986, + "step": 8458 + }, + { + "epoch": 0.8640449438202247, + "grad_norm": 1.333410470461088, + "learning_rate": 9.539281267351063e-07, + "loss": 0.6328, + "step": 8459 + }, + { + "epoch": 0.86414708886619, + "grad_norm": 1.4300906666381414, + "learning_rate": 9.525184575506186e-07, + "loss": 0.6508, + "step": 8460 + }, + { + "epoch": 0.8642492339121552, + "grad_norm": 1.53269598830645, + "learning_rate": 9.511097786188317e-07, + "loss": 0.6998, + "step": 8461 + }, + { + "epoch": 0.8643513789581205, + "grad_norm": 1.5242243094165946, + "learning_rate": 9.497020900939247e-07, + "loss": 0.754, + "step": 8462 + }, + { + "epoch": 0.8644535240040858, + "grad_norm": 1.351601934160555, + "learning_rate": 9.482953921299698e-07, + "loss": 0.639, + "step": 8463 + }, + { + "epoch": 0.8645556690500511, + "grad_norm": 1.4912752568954282, + "learning_rate": 9.468896848809351e-07, + "loss": 0.7049, + "step": 8464 + }, + { + "epoch": 0.8646578140960164, + "grad_norm": 1.5719527752866476, + "learning_rate": 9.454849685006706e-07, + "loss": 0.7269, + "step": 8465 + }, + { + "epoch": 0.8647599591419816, + "grad_norm": 1.5343672445535446, + "learning_rate": 9.44081243142928e-07, + "loss": 0.7199, + "step": 8466 + }, + { + "epoch": 0.8648621041879468, + "grad_norm": 1.35141569322651, + "learning_rate": 9.426785089613443e-07, + "loss": 0.6627, + "step": 8467 + }, + { + "epoch": 0.8649642492339121, + "grad_norm": 1.6700441846213627, + "learning_rate": 9.412767661094502e-07, + "loss": 0.6861, + "step": 8468 + }, + { + "epoch": 0.8650663942798774, + "grad_norm": 1.4135988569194884, + "learning_rate": 9.398760147406638e-07, + "loss": 0.6371, + "step": 8469 + }, + { + "epoch": 0.8651685393258427, + "grad_norm": 1.4854479191466268, + "learning_rate": 9.384762550083037e-07, + "loss": 0.6193, + "step": 8470 + }, + { + "epoch": 0.865270684371808, + "grad_norm": 1.5333026786863024, + "learning_rate": 9.370774870655708e-07, + "loss": 0.649, + "step": 8471 + }, + { + "epoch": 0.8653728294177733, + "grad_norm": 1.4164293776943533, + "learning_rate": 9.356797110655624e-07, + "loss": 0.725, + "step": 8472 + }, + { + "epoch": 0.8654749744637386, + "grad_norm": 1.5361906535669616, + "learning_rate": 9.342829271612675e-07, + "loss": 0.6868, + "step": 8473 + }, + { + "epoch": 0.8655771195097037, + "grad_norm": 1.492824730050042, + "learning_rate": 9.328871355055613e-07, + "loss": 0.6268, + "step": 8474 + }, + { + "epoch": 0.865679264555669, + "grad_norm": 1.492059422984626, + "learning_rate": 9.314923362512174e-07, + "loss": 0.6288, + "step": 8475 + }, + { + "epoch": 0.8657814096016343, + "grad_norm": 1.5530588581568225, + "learning_rate": 9.300985295508968e-07, + "loss": 0.6529, + "step": 8476 + }, + { + "epoch": 0.8658835546475996, + "grad_norm": 1.470948097266118, + "learning_rate": 9.287057155571522e-07, + "loss": 0.7235, + "step": 8477 + }, + { + "epoch": 0.8659856996935649, + "grad_norm": 1.4544606632321349, + "learning_rate": 9.27313894422428e-07, + "loss": 0.5794, + "step": 8478 + }, + { + "epoch": 0.8660878447395302, + "grad_norm": 1.4097443726882306, + "learning_rate": 9.259230662990559e-07, + "loss": 0.6301, + "step": 8479 + }, + { + "epoch": 0.8661899897854954, + "grad_norm": 1.4896353570747518, + "learning_rate": 9.245332313392697e-07, + "loss": 0.6299, + "step": 8480 + }, + { + "epoch": 0.8662921348314607, + "grad_norm": 1.3633757944623672, + "learning_rate": 9.231443896951852e-07, + "loss": 0.6133, + "step": 8481 + }, + { + "epoch": 0.8663942798774259, + "grad_norm": 1.4948359827885889, + "learning_rate": 9.217565415188124e-07, + "loss": 0.7572, + "step": 8482 + }, + { + "epoch": 0.8664964249233912, + "grad_norm": 1.439120692668659, + "learning_rate": 9.203696869620504e-07, + "loss": 0.5754, + "step": 8483 + }, + { + "epoch": 0.8665985699693565, + "grad_norm": 1.5752788035590148, + "learning_rate": 9.189838261766915e-07, + "loss": 0.6252, + "step": 8484 + }, + { + "epoch": 0.8667007150153218, + "grad_norm": 1.3854623916692594, + "learning_rate": 9.175989593144208e-07, + "loss": 0.5902, + "step": 8485 + }, + { + "epoch": 0.866802860061287, + "grad_norm": 1.3718944613475241, + "learning_rate": 9.162150865268127e-07, + "loss": 0.5524, + "step": 8486 + }, + { + "epoch": 0.8669050051072523, + "grad_norm": 1.3424377460264791, + "learning_rate": 9.148322079653305e-07, + "loss": 0.7074, + "step": 8487 + }, + { + "epoch": 0.8670071501532176, + "grad_norm": 2.0298161820636755, + "learning_rate": 9.134503237813341e-07, + "loss": 0.7532, + "step": 8488 + }, + { + "epoch": 0.8671092951991829, + "grad_norm": 1.535983650914348, + "learning_rate": 9.120694341260694e-07, + "loss": 0.7474, + "step": 8489 + }, + { + "epoch": 0.8672114402451481, + "grad_norm": 1.5463542255447147, + "learning_rate": 9.106895391506776e-07, + "loss": 0.7745, + "step": 8490 + }, + { + "epoch": 0.8673135852911134, + "grad_norm": 1.5308511467459294, + "learning_rate": 9.09310639006189e-07, + "loss": 0.7632, + "step": 8491 + }, + { + "epoch": 0.8674157303370786, + "grad_norm": 1.3248789986041865, + "learning_rate": 9.07932733843524e-07, + "loss": 0.5745, + "step": 8492 + }, + { + "epoch": 0.8675178753830439, + "grad_norm": 1.579869419678343, + "learning_rate": 9.065558238134931e-07, + "loss": 0.6576, + "step": 8493 + }, + { + "epoch": 0.8676200204290092, + "grad_norm": 1.4664582813563989, + "learning_rate": 9.051799090668045e-07, + "loss": 0.7024, + "step": 8494 + }, + { + "epoch": 0.8677221654749745, + "grad_norm": 1.6180061212139403, + "learning_rate": 9.038049897540491e-07, + "loss": 0.6749, + "step": 8495 + }, + { + "epoch": 0.8678243105209398, + "grad_norm": 1.5216693703687854, + "learning_rate": 9.024310660257163e-07, + "loss": 0.7122, + "step": 8496 + }, + { + "epoch": 0.867926455566905, + "grad_norm": 1.5076061159070737, + "learning_rate": 9.010581380321814e-07, + "loss": 0.6326, + "step": 8497 + }, + { + "epoch": 0.8680286006128702, + "grad_norm": 1.4446104240930326, + "learning_rate": 8.996862059237122e-07, + "loss": 0.586, + "step": 8498 + }, + { + "epoch": 0.8681307456588355, + "grad_norm": 1.3800535367407576, + "learning_rate": 8.983152698504649e-07, + "loss": 0.6844, + "step": 8499 + }, + { + "epoch": 0.8682328907048008, + "grad_norm": 1.4492833756849497, + "learning_rate": 8.969453299624942e-07, + "loss": 0.6985, + "step": 8500 + }, + { + "epoch": 0.8683350357507661, + "grad_norm": 1.413258900593109, + "learning_rate": 8.955763864097377e-07, + "loss": 0.6097, + "step": 8501 + }, + { + "epoch": 0.8684371807967314, + "grad_norm": 1.436552167088429, + "learning_rate": 8.94208439342028e-07, + "loss": 0.7653, + "step": 8502 + }, + { + "epoch": 0.8685393258426967, + "grad_norm": 1.6099135064221297, + "learning_rate": 8.928414889090864e-07, + "loss": 0.6972, + "step": 8503 + }, + { + "epoch": 0.868641470888662, + "grad_norm": 1.4850636089653453, + "learning_rate": 8.914755352605276e-07, + "loss": 0.6342, + "step": 8504 + }, + { + "epoch": 0.8687436159346271, + "grad_norm": 1.5850498689616555, + "learning_rate": 8.901105785458586e-07, + "loss": 0.723, + "step": 8505 + }, + { + "epoch": 0.8688457609805924, + "grad_norm": 1.570473682958406, + "learning_rate": 8.887466189144711e-07, + "loss": 0.7417, + "step": 8506 + }, + { + "epoch": 0.8689479060265577, + "grad_norm": 1.5041022695298432, + "learning_rate": 8.873836565156546e-07, + "loss": 0.7084, + "step": 8507 + }, + { + "epoch": 0.869050051072523, + "grad_norm": 1.5227889398483903, + "learning_rate": 8.860216914985808e-07, + "loss": 0.6533, + "step": 8508 + }, + { + "epoch": 0.8691521961184883, + "grad_norm": 1.550466174210768, + "learning_rate": 8.846607240123239e-07, + "loss": 0.6859, + "step": 8509 + }, + { + "epoch": 0.8692543411644535, + "grad_norm": 1.6365362235019254, + "learning_rate": 8.833007542058403e-07, + "loss": 0.6777, + "step": 8510 + }, + { + "epoch": 0.8693564862104188, + "grad_norm": 1.6537623078095758, + "learning_rate": 8.819417822279775e-07, + "loss": 0.6165, + "step": 8511 + }, + { + "epoch": 0.8694586312563841, + "grad_norm": 1.4080152255879594, + "learning_rate": 8.805838082274798e-07, + "loss": 0.7291, + "step": 8512 + }, + { + "epoch": 0.8695607763023493, + "grad_norm": 1.4767296260763223, + "learning_rate": 8.792268323529729e-07, + "loss": 0.7772, + "step": 8513 + }, + { + "epoch": 0.8696629213483146, + "grad_norm": 1.4522399282233833, + "learning_rate": 8.778708547529846e-07, + "loss": 0.6439, + "step": 8514 + }, + { + "epoch": 0.8697650663942799, + "grad_norm": 1.4477429832951676, + "learning_rate": 8.765158755759251e-07, + "loss": 0.7148, + "step": 8515 + }, + { + "epoch": 0.8698672114402451, + "grad_norm": 1.4903990855022875, + "learning_rate": 8.75161894970098e-07, + "loss": 0.6084, + "step": 8516 + }, + { + "epoch": 0.8699693564862104, + "grad_norm": 1.6231633371002947, + "learning_rate": 8.738089130836958e-07, + "loss": 0.6265, + "step": 8517 + }, + { + "epoch": 0.8700715015321757, + "grad_norm": 1.5547413094045741, + "learning_rate": 8.724569300648034e-07, + "loss": 0.6714, + "step": 8518 + }, + { + "epoch": 0.870173646578141, + "grad_norm": 1.5296719447081175, + "learning_rate": 8.711059460613968e-07, + "loss": 0.745, + "step": 8519 + }, + { + "epoch": 0.8702757916241062, + "grad_norm": 1.4171461707966937, + "learning_rate": 8.697559612213447e-07, + "loss": 0.6468, + "step": 8520 + }, + { + "epoch": 0.8703779366700715, + "grad_norm": 1.49882351915601, + "learning_rate": 8.684069756924007e-07, + "loss": 0.6771, + "step": 8521 + }, + { + "epoch": 0.8704800817160367, + "grad_norm": 1.4817187953441053, + "learning_rate": 8.670589896222125e-07, + "loss": 0.8399, + "step": 8522 + }, + { + "epoch": 0.870582226762002, + "grad_norm": 1.5056321366140883, + "learning_rate": 8.657120031583177e-07, + "loss": 0.6628, + "step": 8523 + }, + { + "epoch": 0.8706843718079673, + "grad_norm": 1.4200805201599531, + "learning_rate": 8.643660164481482e-07, + "loss": 0.6307, + "step": 8524 + }, + { + "epoch": 0.8707865168539326, + "grad_norm": 1.469030613918922, + "learning_rate": 8.630210296390195e-07, + "loss": 0.6536, + "step": 8525 + }, + { + "epoch": 0.8708886618998979, + "grad_norm": 1.6352237736306625, + "learning_rate": 8.616770428781418e-07, + "loss": 0.7216, + "step": 8526 + }, + { + "epoch": 0.8709908069458632, + "grad_norm": 1.517830019913351, + "learning_rate": 8.60334056312615e-07, + "loss": 0.6887, + "step": 8527 + }, + { + "epoch": 0.8710929519918283, + "grad_norm": 1.4386319399131136, + "learning_rate": 8.589920700894306e-07, + "loss": 0.6466, + "step": 8528 + }, + { + "epoch": 0.8711950970377936, + "grad_norm": 1.4655113351727105, + "learning_rate": 8.57651084355472e-07, + "loss": 0.6312, + "step": 8529 + }, + { + "epoch": 0.8712972420837589, + "grad_norm": 1.3701969653052128, + "learning_rate": 8.563110992575086e-07, + "loss": 0.6472, + "step": 8530 + }, + { + "epoch": 0.8713993871297242, + "grad_norm": 1.5973111259943744, + "learning_rate": 8.54972114942203e-07, + "loss": 0.7185, + "step": 8531 + }, + { + "epoch": 0.8715015321756895, + "grad_norm": 1.3370096445072075, + "learning_rate": 8.53634131556108e-07, + "loss": 0.6134, + "step": 8532 + }, + { + "epoch": 0.8716036772216548, + "grad_norm": 1.5934781974486247, + "learning_rate": 8.522971492456644e-07, + "loss": 0.6986, + "step": 8533 + }, + { + "epoch": 0.8717058222676201, + "grad_norm": 1.499213510961216, + "learning_rate": 8.509611681572105e-07, + "loss": 0.6527, + "step": 8534 + }, + { + "epoch": 0.8718079673135853, + "grad_norm": 1.4781737241868766, + "learning_rate": 8.49626188436965e-07, + "loss": 0.6654, + "step": 8535 + }, + { + "epoch": 0.8719101123595505, + "grad_norm": 1.5502328614563192, + "learning_rate": 8.482922102310476e-07, + "loss": 0.7051, + "step": 8536 + }, + { + "epoch": 0.8720122574055158, + "grad_norm": 1.4146162700283884, + "learning_rate": 8.469592336854604e-07, + "loss": 0.5991, + "step": 8537 + }, + { + "epoch": 0.8721144024514811, + "grad_norm": 1.5085383524854312, + "learning_rate": 8.456272589460967e-07, + "loss": 0.6433, + "step": 8538 + }, + { + "epoch": 0.8722165474974464, + "grad_norm": 1.401819210848315, + "learning_rate": 8.442962861587456e-07, + "loss": 0.6463, + "step": 8539 + }, + { + "epoch": 0.8723186925434117, + "grad_norm": 1.3243917281224575, + "learning_rate": 8.429663154690826e-07, + "loss": 0.6301, + "step": 8540 + }, + { + "epoch": 0.8724208375893769, + "grad_norm": 1.393181121312515, + "learning_rate": 8.416373470226713e-07, + "loss": 0.6428, + "step": 8541 + }, + { + "epoch": 0.8725229826353422, + "grad_norm": 1.645955037967443, + "learning_rate": 8.403093809649676e-07, + "loss": 0.6834, + "step": 8542 + }, + { + "epoch": 0.8726251276813075, + "grad_norm": 1.5255078880438202, + "learning_rate": 8.389824174413208e-07, + "loss": 0.755, + "step": 8543 + }, + { + "epoch": 0.8727272727272727, + "grad_norm": 1.5112261408721048, + "learning_rate": 8.37656456596968e-07, + "loss": 0.7299, + "step": 8544 + }, + { + "epoch": 0.872829417773238, + "grad_norm": 1.5631188394198194, + "learning_rate": 8.363314985770366e-07, + "loss": 0.6294, + "step": 8545 + }, + { + "epoch": 0.8729315628192033, + "grad_norm": 1.455378411794105, + "learning_rate": 8.350075435265426e-07, + "loss": 0.6452, + "step": 8546 + }, + { + "epoch": 0.8730337078651685, + "grad_norm": 1.6293134673459684, + "learning_rate": 8.336845915903935e-07, + "loss": 0.7396, + "step": 8547 + }, + { + "epoch": 0.8731358529111338, + "grad_norm": 1.4200009434812784, + "learning_rate": 8.323626429133891e-07, + "loss": 0.7063, + "step": 8548 + }, + { + "epoch": 0.8732379979570991, + "grad_norm": 1.3219686683466194, + "learning_rate": 8.310416976402158e-07, + "loss": 0.6687, + "step": 8549 + }, + { + "epoch": 0.8733401430030644, + "grad_norm": 1.2910262500146137, + "learning_rate": 8.297217559154535e-07, + "loss": 0.6098, + "step": 8550 + }, + { + "epoch": 0.8734422880490296, + "grad_norm": 1.4434635361449777, + "learning_rate": 8.28402817883568e-07, + "loss": 0.6031, + "step": 8551 + }, + { + "epoch": 0.8735444330949949, + "grad_norm": 1.5066255518082399, + "learning_rate": 8.270848836889211e-07, + "loss": 0.7031, + "step": 8552 + }, + { + "epoch": 0.8736465781409601, + "grad_norm": 1.558414976811602, + "learning_rate": 8.25767953475759e-07, + "loss": 0.7637, + "step": 8553 + }, + { + "epoch": 0.8737487231869254, + "grad_norm": 1.546544348284617, + "learning_rate": 8.244520273882229e-07, + "loss": 0.8463, + "step": 8554 + }, + { + "epoch": 0.8738508682328907, + "grad_norm": 1.4544597602875897, + "learning_rate": 8.231371055703408e-07, + "loss": 0.6661, + "step": 8555 + }, + { + "epoch": 0.873953013278856, + "grad_norm": 1.4887006339989877, + "learning_rate": 8.21823188166031e-07, + "loss": 0.5966, + "step": 8556 + }, + { + "epoch": 0.8740551583248213, + "grad_norm": 1.563724029688523, + "learning_rate": 8.205102753191019e-07, + "loss": 0.7758, + "step": 8557 + }, + { + "epoch": 0.8741573033707866, + "grad_norm": 1.4562864404556435, + "learning_rate": 8.191983671732551e-07, + "loss": 0.6223, + "step": 8558 + }, + { + "epoch": 0.8742594484167517, + "grad_norm": 1.591158316125687, + "learning_rate": 8.178874638720768e-07, + "loss": 0.5759, + "step": 8559 + }, + { + "epoch": 0.874361593462717, + "grad_norm": 1.5358970391281241, + "learning_rate": 8.165775655590502e-07, + "loss": 0.7006, + "step": 8560 + }, + { + "epoch": 0.8744637385086823, + "grad_norm": 1.704286548479193, + "learning_rate": 8.152686723775427e-07, + "loss": 0.727, + "step": 8561 + }, + { + "epoch": 0.8745658835546476, + "grad_norm": 1.5723117544428094, + "learning_rate": 8.139607844708109e-07, + "loss": 0.6993, + "step": 8562 + }, + { + "epoch": 0.8746680286006129, + "grad_norm": 1.5594317069123798, + "learning_rate": 8.126539019820079e-07, + "loss": 0.7268, + "step": 8563 + }, + { + "epoch": 0.8747701736465782, + "grad_norm": 1.5080306381971469, + "learning_rate": 8.113480250541705e-07, + "loss": 0.5758, + "step": 8564 + }, + { + "epoch": 0.8748723186925434, + "grad_norm": 1.4846989613531036, + "learning_rate": 8.100431538302289e-07, + "loss": 0.6905, + "step": 8565 + }, + { + "epoch": 0.8749744637385087, + "grad_norm": 1.5576973511517211, + "learning_rate": 8.087392884530009e-07, + "loss": 0.7594, + "step": 8566 + }, + { + "epoch": 0.8750766087844739, + "grad_norm": 1.344881798560366, + "learning_rate": 8.074364290651949e-07, + "loss": 0.6065, + "step": 8567 + }, + { + "epoch": 0.8751787538304392, + "grad_norm": 1.595445155285488, + "learning_rate": 8.06134575809413e-07, + "loss": 0.6508, + "step": 8568 + }, + { + "epoch": 0.8752808988764045, + "grad_norm": 1.4548985294557544, + "learning_rate": 8.048337288281416e-07, + "loss": 0.6876, + "step": 8569 + }, + { + "epoch": 0.8753830439223698, + "grad_norm": 1.513626647396427, + "learning_rate": 8.035338882637589e-07, + "loss": 0.662, + "step": 8570 + }, + { + "epoch": 0.875485188968335, + "grad_norm": 1.539678238607038, + "learning_rate": 8.022350542585344e-07, + "loss": 0.8012, + "step": 8571 + }, + { + "epoch": 0.8755873340143003, + "grad_norm": 1.3978700844617418, + "learning_rate": 8.009372269546245e-07, + "loss": 0.685, + "step": 8572 + }, + { + "epoch": 0.8756894790602656, + "grad_norm": 1.378556083824567, + "learning_rate": 7.996404064940788e-07, + "loss": 0.7131, + "step": 8573 + }, + { + "epoch": 0.8757916241062308, + "grad_norm": 1.4237547319750954, + "learning_rate": 7.98344593018836e-07, + "loss": 0.6774, + "step": 8574 + }, + { + "epoch": 0.8758937691521961, + "grad_norm": 1.5627295229232614, + "learning_rate": 7.970497866707205e-07, + "loss": 0.7602, + "step": 8575 + }, + { + "epoch": 0.8759959141981614, + "grad_norm": 1.5554458475825865, + "learning_rate": 7.957559875914533e-07, + "loss": 0.6669, + "step": 8576 + }, + { + "epoch": 0.8760980592441266, + "grad_norm": 1.5046849168837526, + "learning_rate": 7.944631959226379e-07, + "loss": 0.6598, + "step": 8577 + }, + { + "epoch": 0.8762002042900919, + "grad_norm": 1.4453780392770423, + "learning_rate": 7.931714118057754e-07, + "loss": 0.6527, + "step": 8578 + }, + { + "epoch": 0.8763023493360572, + "grad_norm": 1.4376876804527154, + "learning_rate": 7.918806353822506e-07, + "loss": 0.6477, + "step": 8579 + }, + { + "epoch": 0.8764044943820225, + "grad_norm": 1.5410413321115763, + "learning_rate": 7.905908667933393e-07, + "loss": 0.6957, + "step": 8580 + }, + { + "epoch": 0.8765066394279878, + "grad_norm": 1.5379146364678327, + "learning_rate": 7.893021061802053e-07, + "loss": 0.6992, + "step": 8581 + }, + { + "epoch": 0.876608784473953, + "grad_norm": 1.3803955126138514, + "learning_rate": 7.880143536839091e-07, + "loss": 0.648, + "step": 8582 + }, + { + "epoch": 0.8767109295199182, + "grad_norm": 1.703007625140265, + "learning_rate": 7.867276094453913e-07, + "loss": 0.7048, + "step": 8583 + }, + { + "epoch": 0.8768130745658835, + "grad_norm": 1.50489645940922, + "learning_rate": 7.854418736054914e-07, + "loss": 0.7859, + "step": 8584 + }, + { + "epoch": 0.8769152196118488, + "grad_norm": 1.7200827094772944, + "learning_rate": 7.841571463049314e-07, + "loss": 0.6499, + "step": 8585 + }, + { + "epoch": 0.8770173646578141, + "grad_norm": 1.4787639787350402, + "learning_rate": 7.828734276843264e-07, + "loss": 0.6687, + "step": 8586 + }, + { + "epoch": 0.8771195097037794, + "grad_norm": 1.5250972562043013, + "learning_rate": 7.815907178841775e-07, + "loss": 0.6621, + "step": 8587 + }, + { + "epoch": 0.8772216547497447, + "grad_norm": 1.5510542928341962, + "learning_rate": 7.803090170448823e-07, + "loss": 0.6696, + "step": 8588 + }, + { + "epoch": 0.87732379979571, + "grad_norm": 1.3750955065125678, + "learning_rate": 7.790283253067221e-07, + "loss": 0.61, + "step": 8589 + }, + { + "epoch": 0.8774259448416751, + "grad_norm": 1.3658964969384344, + "learning_rate": 7.777486428098691e-07, + "loss": 0.6461, + "step": 8590 + }, + { + "epoch": 0.8775280898876404, + "grad_norm": 1.5079996126879893, + "learning_rate": 7.764699696943845e-07, + "loss": 0.5781, + "step": 8591 + }, + { + "epoch": 0.8776302349336057, + "grad_norm": 1.4553655232353502, + "learning_rate": 7.751923061002198e-07, + "loss": 0.6794, + "step": 8592 + }, + { + "epoch": 0.877732379979571, + "grad_norm": 1.4261302535883218, + "learning_rate": 7.739156521672186e-07, + "loss": 0.7295, + "step": 8593 + }, + { + "epoch": 0.8778345250255363, + "grad_norm": 1.319671868067928, + "learning_rate": 7.726400080351115e-07, + "loss": 0.6581, + "step": 8594 + }, + { + "epoch": 0.8779366700715016, + "grad_norm": 1.5631979798417883, + "learning_rate": 7.713653738435156e-07, + "loss": 0.7805, + "step": 8595 + }, + { + "epoch": 0.8780388151174668, + "grad_norm": 1.4995817288307924, + "learning_rate": 7.700917497319416e-07, + "loss": 0.7012, + "step": 8596 + }, + { + "epoch": 0.8781409601634321, + "grad_norm": 1.5570208610314946, + "learning_rate": 7.688191358397901e-07, + "loss": 0.6141, + "step": 8597 + }, + { + "epoch": 0.8782431052093973, + "grad_norm": 1.5561113273219938, + "learning_rate": 7.675475323063475e-07, + "loss": 0.7406, + "step": 8598 + }, + { + "epoch": 0.8783452502553626, + "grad_norm": 1.5180081044218017, + "learning_rate": 7.662769392707914e-07, + "loss": 0.6762, + "step": 8599 + }, + { + "epoch": 0.8784473953013279, + "grad_norm": 1.4476990391794289, + "learning_rate": 7.650073568721916e-07, + "loss": 0.7489, + "step": 8600 + }, + { + "epoch": 0.8785495403472932, + "grad_norm": 1.5529012561065447, + "learning_rate": 7.637387852495026e-07, + "loss": 0.6991, + "step": 8601 + }, + { + "epoch": 0.8786516853932584, + "grad_norm": 1.4925115067151076, + "learning_rate": 7.624712245415711e-07, + "loss": 0.5875, + "step": 8602 + }, + { + "epoch": 0.8787538304392237, + "grad_norm": 1.4516140630070322, + "learning_rate": 7.612046748871327e-07, + "loss": 0.6711, + "step": 8603 + }, + { + "epoch": 0.878855975485189, + "grad_norm": 1.5456186065851505, + "learning_rate": 7.599391364248121e-07, + "loss": 0.6028, + "step": 8604 + }, + { + "epoch": 0.8789581205311542, + "grad_norm": 1.4831391151566042, + "learning_rate": 7.58674609293123e-07, + "loss": 0.6434, + "step": 8605 + }, + { + "epoch": 0.8790602655771195, + "grad_norm": 1.435413704644167, + "learning_rate": 7.574110936304657e-07, + "loss": 0.7102, + "step": 8606 + }, + { + "epoch": 0.8791624106230848, + "grad_norm": 1.5716776250982578, + "learning_rate": 7.561485895751386e-07, + "loss": 0.6534, + "step": 8607 + }, + { + "epoch": 0.87926455566905, + "grad_norm": 1.555282539114663, + "learning_rate": 7.548870972653177e-07, + "loss": 0.7367, + "step": 8608 + }, + { + "epoch": 0.8793667007150153, + "grad_norm": 1.5078565978433505, + "learning_rate": 7.536266168390804e-07, + "loss": 0.7184, + "step": 8609 + }, + { + "epoch": 0.8794688457609806, + "grad_norm": 1.6246840028017153, + "learning_rate": 7.523671484343831e-07, + "loss": 0.6848, + "step": 8610 + }, + { + "epoch": 0.8795709908069459, + "grad_norm": 1.440878072381795, + "learning_rate": 7.511086921890742e-07, + "loss": 0.7016, + "step": 8611 + }, + { + "epoch": 0.8796731358529112, + "grad_norm": 1.419549109145025, + "learning_rate": 7.498512482408959e-07, + "loss": 0.613, + "step": 8612 + }, + { + "epoch": 0.8797752808988764, + "grad_norm": 1.6237718899505955, + "learning_rate": 7.485948167274759e-07, + "loss": 0.5679, + "step": 8613 + }, + { + "epoch": 0.8798774259448416, + "grad_norm": 1.6046564414394051, + "learning_rate": 7.473393977863297e-07, + "loss": 0.6502, + "step": 8614 + }, + { + "epoch": 0.8799795709908069, + "grad_norm": 1.4121589985117955, + "learning_rate": 7.460849915548618e-07, + "loss": 0.602, + "step": 8615 + }, + { + "epoch": 0.8800817160367722, + "grad_norm": 1.5770159944563202, + "learning_rate": 7.448315981703714e-07, + "loss": 0.637, + "step": 8616 + }, + { + "epoch": 0.8801838610827375, + "grad_norm": 1.4234742562001277, + "learning_rate": 7.435792177700441e-07, + "loss": 0.7638, + "step": 8617 + }, + { + "epoch": 0.8802860061287028, + "grad_norm": 1.6016935628822655, + "learning_rate": 7.423278504909515e-07, + "loss": 0.6768, + "step": 8618 + }, + { + "epoch": 0.8803881511746681, + "grad_norm": 1.4335603327837425, + "learning_rate": 7.410774964700573e-07, + "loss": 0.6658, + "step": 8619 + }, + { + "epoch": 0.8804902962206334, + "grad_norm": 1.4807743815972503, + "learning_rate": 7.398281558442111e-07, + "loss": 0.717, + "step": 8620 + }, + { + "epoch": 0.8805924412665985, + "grad_norm": 1.2937354161016952, + "learning_rate": 7.385798287501578e-07, + "loss": 0.689, + "step": 8621 + }, + { + "epoch": 0.8806945863125638, + "grad_norm": 1.5090528781857355, + "learning_rate": 7.373325153245259e-07, + "loss": 0.7532, + "step": 8622 + }, + { + "epoch": 0.8807967313585291, + "grad_norm": 1.480857079914353, + "learning_rate": 7.36086215703834e-07, + "loss": 0.6672, + "step": 8623 + }, + { + "epoch": 0.8808988764044944, + "grad_norm": 1.5581295927233445, + "learning_rate": 7.348409300244896e-07, + "loss": 0.7659, + "step": 8624 + }, + { + "epoch": 0.8810010214504597, + "grad_norm": 1.469544334535651, + "learning_rate": 7.335966584227939e-07, + "loss": 0.7415, + "step": 8625 + }, + { + "epoch": 0.881103166496425, + "grad_norm": 1.35035026634553, + "learning_rate": 7.323534010349287e-07, + "loss": 0.6791, + "step": 8626 + }, + { + "epoch": 0.8812053115423902, + "grad_norm": 1.461905426783304, + "learning_rate": 7.311111579969732e-07, + "loss": 0.6837, + "step": 8627 + }, + { + "epoch": 0.8813074565883555, + "grad_norm": 1.5759890752225938, + "learning_rate": 7.298699294448896e-07, + "loss": 0.71, + "step": 8628 + }, + { + "epoch": 0.8814096016343207, + "grad_norm": 1.3705500129798684, + "learning_rate": 7.286297155145317e-07, + "loss": 0.6622, + "step": 8629 + }, + { + "epoch": 0.881511746680286, + "grad_norm": 1.2688729796951292, + "learning_rate": 7.273905163416394e-07, + "loss": 0.5325, + "step": 8630 + }, + { + "epoch": 0.8816138917262513, + "grad_norm": 1.588866617864123, + "learning_rate": 7.26152332061848e-07, + "loss": 0.6501, + "step": 8631 + }, + { + "epoch": 0.8817160367722165, + "grad_norm": 1.5735612369424323, + "learning_rate": 7.249151628106744e-07, + "loss": 0.734, + "step": 8632 + }, + { + "epoch": 0.8818181818181818, + "grad_norm": 1.6342804297516687, + "learning_rate": 7.236790087235302e-07, + "loss": 0.6721, + "step": 8633 + }, + { + "epoch": 0.8819203268641471, + "grad_norm": 1.4727447012501826, + "learning_rate": 7.224438699357117e-07, + "loss": 0.5519, + "step": 8634 + }, + { + "epoch": 0.8820224719101124, + "grad_norm": 1.4364877225452468, + "learning_rate": 7.212097465824031e-07, + "loss": 0.6183, + "step": 8635 + }, + { + "epoch": 0.8821246169560776, + "grad_norm": 1.499651177544667, + "learning_rate": 7.199766387986851e-07, + "loss": 0.6802, + "step": 8636 + }, + { + "epoch": 0.8822267620020429, + "grad_norm": 1.45956453680312, + "learning_rate": 7.187445467195198e-07, + "loss": 0.6052, + "step": 8637 + }, + { + "epoch": 0.8823289070480081, + "grad_norm": 1.5373904737649855, + "learning_rate": 7.175134704797593e-07, + "loss": 0.686, + "step": 8638 + }, + { + "epoch": 0.8824310520939734, + "grad_norm": 1.4597139797263583, + "learning_rate": 7.16283410214148e-07, + "loss": 0.7785, + "step": 8639 + }, + { + "epoch": 0.8825331971399387, + "grad_norm": 1.7062323283156, + "learning_rate": 7.150543660573128e-07, + "loss": 0.7869, + "step": 8640 + }, + { + "epoch": 0.882635342185904, + "grad_norm": 1.6382279328375466, + "learning_rate": 7.138263381437772e-07, + "loss": 0.7401, + "step": 8641 + }, + { + "epoch": 0.8827374872318693, + "grad_norm": 1.4180042610010084, + "learning_rate": 7.12599326607949e-07, + "loss": 0.7308, + "step": 8642 + }, + { + "epoch": 0.8828396322778346, + "grad_norm": 1.4980664226934188, + "learning_rate": 7.113733315841254e-07, + "loss": 0.6746, + "step": 8643 + }, + { + "epoch": 0.8829417773237997, + "grad_norm": 1.4209841713634492, + "learning_rate": 7.101483532064923e-07, + "loss": 0.6074, + "step": 8644 + }, + { + "epoch": 0.883043922369765, + "grad_norm": 1.5638245721451667, + "learning_rate": 7.089243916091215e-07, + "loss": 0.7248, + "step": 8645 + }, + { + "epoch": 0.8831460674157303, + "grad_norm": 1.5769991322710115, + "learning_rate": 7.077014469259813e-07, + "loss": 0.7183, + "step": 8646 + }, + { + "epoch": 0.8832482124616956, + "grad_norm": 1.3453241286517867, + "learning_rate": 7.064795192909213e-07, + "loss": 0.6837, + "step": 8647 + }, + { + "epoch": 0.8833503575076609, + "grad_norm": 1.6658428242549277, + "learning_rate": 7.0525860883768e-07, + "loss": 0.7996, + "step": 8648 + }, + { + "epoch": 0.8834525025536262, + "grad_norm": 1.521745775608374, + "learning_rate": 7.040387156998918e-07, + "loss": 0.6886, + "step": 8649 + }, + { + "epoch": 0.8835546475995915, + "grad_norm": 1.4888262831018038, + "learning_rate": 7.028198400110697e-07, + "loss": 0.6798, + "step": 8650 + }, + { + "epoch": 0.8836567926455567, + "grad_norm": 1.584722758899668, + "learning_rate": 7.016019819046239e-07, + "loss": 0.658, + "step": 8651 + }, + { + "epoch": 0.8837589376915219, + "grad_norm": 1.5803465791097386, + "learning_rate": 7.003851415138497e-07, + "loss": 0.6851, + "step": 8652 + }, + { + "epoch": 0.8838610827374872, + "grad_norm": 1.3770448516571323, + "learning_rate": 6.991693189719295e-07, + "loss": 0.5981, + "step": 8653 + }, + { + "epoch": 0.8839632277834525, + "grad_norm": 1.436312580607404, + "learning_rate": 6.979545144119349e-07, + "loss": 0.6978, + "step": 8654 + }, + { + "epoch": 0.8840653728294178, + "grad_norm": 1.4846882638915846, + "learning_rate": 6.967407279668304e-07, + "loss": 0.5626, + "step": 8655 + }, + { + "epoch": 0.8841675178753831, + "grad_norm": 1.4510727865474848, + "learning_rate": 6.955279597694631e-07, + "loss": 0.5806, + "step": 8656 + }, + { + "epoch": 0.8842696629213483, + "grad_norm": 1.4093221789973185, + "learning_rate": 6.943162099525724e-07, + "loss": 0.7264, + "step": 8657 + }, + { + "epoch": 0.8843718079673136, + "grad_norm": 1.5174851374251133, + "learning_rate": 6.931054786487857e-07, + "loss": 0.6673, + "step": 8658 + }, + { + "epoch": 0.8844739530132788, + "grad_norm": 1.458727415236385, + "learning_rate": 6.918957659906167e-07, + "loss": 0.6337, + "step": 8659 + }, + { + "epoch": 0.8845760980592441, + "grad_norm": 1.6251352448654643, + "learning_rate": 6.906870721104686e-07, + "loss": 0.6788, + "step": 8660 + }, + { + "epoch": 0.8846782431052094, + "grad_norm": 1.5164862326234092, + "learning_rate": 6.894793971406366e-07, + "loss": 0.6953, + "step": 8661 + }, + { + "epoch": 0.8847803881511747, + "grad_norm": 1.4912789400617983, + "learning_rate": 6.882727412132995e-07, + "loss": 0.7717, + "step": 8662 + }, + { + "epoch": 0.8848825331971399, + "grad_norm": 1.506825891482042, + "learning_rate": 6.87067104460527e-07, + "loss": 0.6252, + "step": 8663 + }, + { + "epoch": 0.8849846782431052, + "grad_norm": 1.4723967965827243, + "learning_rate": 6.85862487014276e-07, + "loss": 0.6878, + "step": 8664 + }, + { + "epoch": 0.8850868232890705, + "grad_norm": 1.487931872838047, + "learning_rate": 6.846588890063932e-07, + "loss": 0.71, + "step": 8665 + }, + { + "epoch": 0.8851889683350358, + "grad_norm": 1.4847231257089475, + "learning_rate": 6.834563105686154e-07, + "loss": 0.6695, + "step": 8666 + }, + { + "epoch": 0.885291113381001, + "grad_norm": 1.5790069673911127, + "learning_rate": 6.822547518325629e-07, + "loss": 0.692, + "step": 8667 + }, + { + "epoch": 0.8853932584269663, + "grad_norm": 1.5943548472849036, + "learning_rate": 6.810542129297482e-07, + "loss": 0.7065, + "step": 8668 + }, + { + "epoch": 0.8854954034729315, + "grad_norm": 1.588287073739239, + "learning_rate": 6.798546939915696e-07, + "loss": 0.7357, + "step": 8669 + }, + { + "epoch": 0.8855975485188968, + "grad_norm": 1.4473952425390095, + "learning_rate": 6.786561951493176e-07, + "loss": 0.6641, + "step": 8670 + }, + { + "epoch": 0.8856996935648621, + "grad_norm": 1.5246084510771867, + "learning_rate": 6.774587165341673e-07, + "loss": 0.6996, + "step": 8671 + }, + { + "epoch": 0.8858018386108274, + "grad_norm": 1.4868082906999878, + "learning_rate": 6.762622582771817e-07, + "loss": 0.6862, + "step": 8672 + }, + { + "epoch": 0.8859039836567927, + "grad_norm": 1.429528597475294, + "learning_rate": 6.750668205093181e-07, + "loss": 0.6598, + "step": 8673 + }, + { + "epoch": 0.886006128702758, + "grad_norm": 1.4463597751889898, + "learning_rate": 6.73872403361413e-07, + "loss": 0.6459, + "step": 8674 + }, + { + "epoch": 0.8861082737487231, + "grad_norm": 1.5230479077773138, + "learning_rate": 6.72679006964202e-07, + "loss": 0.76, + "step": 8675 + }, + { + "epoch": 0.8862104187946884, + "grad_norm": 1.6639118685268157, + "learning_rate": 6.714866314482982e-07, + "loss": 0.7702, + "step": 8676 + }, + { + "epoch": 0.8863125638406537, + "grad_norm": 1.5108821019019738, + "learning_rate": 6.702952769442106e-07, + "loss": 0.6618, + "step": 8677 + }, + { + "epoch": 0.886414708886619, + "grad_norm": 1.8006103149687902, + "learning_rate": 6.691049435823327e-07, + "loss": 0.7642, + "step": 8678 + }, + { + "epoch": 0.8865168539325843, + "grad_norm": 1.6806126075107652, + "learning_rate": 6.679156314929458e-07, + "loss": 0.7118, + "step": 8679 + }, + { + "epoch": 0.8866189989785496, + "grad_norm": 1.479644258481133, + "learning_rate": 6.667273408062214e-07, + "loss": 0.7622, + "step": 8680 + }, + { + "epoch": 0.8867211440245149, + "grad_norm": 1.439301912073313, + "learning_rate": 6.65540071652222e-07, + "loss": 0.6384, + "step": 8681 + }, + { + "epoch": 0.8868232890704801, + "grad_norm": 1.63038246133096, + "learning_rate": 6.643538241608927e-07, + "loss": 0.7015, + "step": 8682 + }, + { + "epoch": 0.8869254341164453, + "grad_norm": 1.4942964900160889, + "learning_rate": 6.631685984620684e-07, + "loss": 0.7854, + "step": 8683 + }, + { + "epoch": 0.8870275791624106, + "grad_norm": 1.456855076226014, + "learning_rate": 6.61984394685472e-07, + "loss": 0.6834, + "step": 8684 + }, + { + "epoch": 0.8871297242083759, + "grad_norm": 1.412265149192123, + "learning_rate": 6.608012129607189e-07, + "loss": 0.6113, + "step": 8685 + }, + { + "epoch": 0.8872318692543412, + "grad_norm": 1.4055654947328389, + "learning_rate": 6.596190534173063e-07, + "loss": 0.5073, + "step": 8686 + }, + { + "epoch": 0.8873340143003065, + "grad_norm": 1.6181563723524108, + "learning_rate": 6.584379161846222e-07, + "loss": 0.6768, + "step": 8687 + }, + { + "epoch": 0.8874361593462717, + "grad_norm": 1.5563665141725986, + "learning_rate": 6.572578013919429e-07, + "loss": 0.6727, + "step": 8688 + }, + { + "epoch": 0.887538304392237, + "grad_norm": 1.414526633878522, + "learning_rate": 6.56078709168434e-07, + "loss": 0.7156, + "step": 8689 + }, + { + "epoch": 0.8876404494382022, + "grad_norm": 1.6200848607184277, + "learning_rate": 6.549006396431478e-07, + "loss": 0.662, + "step": 8690 + }, + { + "epoch": 0.8877425944841675, + "grad_norm": 1.4931208456925913, + "learning_rate": 6.537235929450247e-07, + "loss": 0.7092, + "step": 8691 + }, + { + "epoch": 0.8878447395301328, + "grad_norm": 1.5654866097105613, + "learning_rate": 6.525475692028926e-07, + "loss": 0.7607, + "step": 8692 + }, + { + "epoch": 0.887946884576098, + "grad_norm": 1.4103081637117254, + "learning_rate": 6.513725685454686e-07, + "loss": 0.6721, + "step": 8693 + }, + { + "epoch": 0.8880490296220633, + "grad_norm": 1.5687115692647875, + "learning_rate": 6.501985911013564e-07, + "loss": 0.6849, + "step": 8694 + }, + { + "epoch": 0.8881511746680286, + "grad_norm": 1.493855756922009, + "learning_rate": 6.490256369990489e-07, + "loss": 0.7048, + "step": 8695 + }, + { + "epoch": 0.8882533197139939, + "grad_norm": 1.6214278216873494, + "learning_rate": 6.478537063669266e-07, + "loss": 0.701, + "step": 8696 + }, + { + "epoch": 0.8883554647599592, + "grad_norm": 1.3964087724680025, + "learning_rate": 6.466827993332591e-07, + "loss": 0.677, + "step": 8697 + }, + { + "epoch": 0.8884576098059244, + "grad_norm": 1.7057732681849032, + "learning_rate": 6.45512916026203e-07, + "loss": 0.7572, + "step": 8698 + }, + { + "epoch": 0.8885597548518896, + "grad_norm": 1.3175212125384081, + "learning_rate": 6.443440565738002e-07, + "loss": 0.6532, + "step": 8699 + }, + { + "epoch": 0.8886618998978549, + "grad_norm": 1.4204988931620914, + "learning_rate": 6.431762211039861e-07, + "loss": 0.6882, + "step": 8700 + }, + { + "epoch": 0.8887640449438202, + "grad_norm": 1.4205872154479953, + "learning_rate": 6.420094097445806e-07, + "loss": 0.603, + "step": 8701 + }, + { + "epoch": 0.8888661899897855, + "grad_norm": 1.410815896572816, + "learning_rate": 6.408436226232906e-07, + "loss": 0.6039, + "step": 8702 + }, + { + "epoch": 0.8889683350357508, + "grad_norm": 1.4526254084414725, + "learning_rate": 6.396788598677117e-07, + "loss": 0.6674, + "step": 8703 + }, + { + "epoch": 0.8890704800817161, + "grad_norm": 1.5670316413323149, + "learning_rate": 6.385151216053287e-07, + "loss": 0.7221, + "step": 8704 + }, + { + "epoch": 0.8891726251276814, + "grad_norm": 1.4049155533490467, + "learning_rate": 6.373524079635152e-07, + "loss": 0.6239, + "step": 8705 + }, + { + "epoch": 0.8892747701736465, + "grad_norm": 1.4981341705373787, + "learning_rate": 6.361907190695304e-07, + "loss": 0.6514, + "step": 8706 + }, + { + "epoch": 0.8893769152196118, + "grad_norm": 1.5586782853849446, + "learning_rate": 6.350300550505217e-07, + "loss": 0.7045, + "step": 8707 + }, + { + "epoch": 0.8894790602655771, + "grad_norm": 1.582349511447863, + "learning_rate": 6.338704160335219e-07, + "loss": 0.6999, + "step": 8708 + }, + { + "epoch": 0.8895812053115424, + "grad_norm": 1.6103633406372455, + "learning_rate": 6.32711802145457e-07, + "loss": 0.7195, + "step": 8709 + }, + { + "epoch": 0.8896833503575077, + "grad_norm": 1.3783794694241736, + "learning_rate": 6.31554213513138e-07, + "loss": 0.6439, + "step": 8710 + }, + { + "epoch": 0.889785495403473, + "grad_norm": 1.4483351183961986, + "learning_rate": 6.303976502632636e-07, + "loss": 0.677, + "step": 8711 + }, + { + "epoch": 0.8898876404494382, + "grad_norm": 1.4527275625705662, + "learning_rate": 6.29242112522418e-07, + "loss": 0.5745, + "step": 8712 + }, + { + "epoch": 0.8899897854954034, + "grad_norm": 1.3471301953537171, + "learning_rate": 6.2808760041708e-07, + "loss": 0.6692, + "step": 8713 + }, + { + "epoch": 0.8900919305413687, + "grad_norm": 1.6188563636696014, + "learning_rate": 6.269341140736063e-07, + "loss": 0.7548, + "step": 8714 + }, + { + "epoch": 0.890194075587334, + "grad_norm": 1.4118017899349982, + "learning_rate": 6.257816536182515e-07, + "loss": 0.618, + "step": 8715 + }, + { + "epoch": 0.8902962206332993, + "grad_norm": 1.54730451626402, + "learning_rate": 6.246302191771514e-07, + "loss": 0.6351, + "step": 8716 + }, + { + "epoch": 0.8903983656792646, + "grad_norm": 1.5254066788204255, + "learning_rate": 6.234798108763307e-07, + "loss": 0.6171, + "step": 8717 + }, + { + "epoch": 0.8905005107252298, + "grad_norm": 1.4138694857381509, + "learning_rate": 6.22330428841702e-07, + "loss": 0.6666, + "step": 8718 + }, + { + "epoch": 0.8906026557711951, + "grad_norm": 1.7104141662884873, + "learning_rate": 6.21182073199067e-07, + "loss": 0.6307, + "step": 8719 + }, + { + "epoch": 0.8907048008171604, + "grad_norm": 1.5563856663735036, + "learning_rate": 6.200347440741128e-07, + "loss": 0.6301, + "step": 8720 + }, + { + "epoch": 0.8908069458631256, + "grad_norm": 1.5502106425169953, + "learning_rate": 6.188884415924179e-07, + "loss": 0.7038, + "step": 8721 + }, + { + "epoch": 0.8909090909090909, + "grad_norm": 1.4890063308524728, + "learning_rate": 6.177431658794431e-07, + "loss": 0.5337, + "step": 8722 + }, + { + "epoch": 0.8910112359550562, + "grad_norm": 1.3575587133797773, + "learning_rate": 6.165989170605391e-07, + "loss": 0.6168, + "step": 8723 + }, + { + "epoch": 0.8911133810010214, + "grad_norm": 1.457395567773895, + "learning_rate": 6.154556952609481e-07, + "loss": 0.7976, + "step": 8724 + }, + { + "epoch": 0.8912155260469867, + "grad_norm": 1.5692735339494268, + "learning_rate": 6.143135006057943e-07, + "loss": 0.7453, + "step": 8725 + }, + { + "epoch": 0.891317671092952, + "grad_norm": 1.6201777451445352, + "learning_rate": 6.131723332200923e-07, + "loss": 0.6862, + "step": 8726 + }, + { + "epoch": 0.8914198161389173, + "grad_norm": 1.5329035520926653, + "learning_rate": 6.120321932287421e-07, + "loss": 0.6912, + "step": 8727 + }, + { + "epoch": 0.8915219611848826, + "grad_norm": 1.730828106130853, + "learning_rate": 6.108930807565316e-07, + "loss": 0.7921, + "step": 8728 + }, + { + "epoch": 0.8916241062308478, + "grad_norm": 1.774290956196776, + "learning_rate": 6.097549959281424e-07, + "loss": 0.7159, + "step": 8729 + }, + { + "epoch": 0.891726251276813, + "grad_norm": 1.5384140050101336, + "learning_rate": 6.086179388681357e-07, + "loss": 0.6935, + "step": 8730 + }, + { + "epoch": 0.8918283963227783, + "grad_norm": 1.735607376501701, + "learning_rate": 6.074819097009632e-07, + "loss": 0.8618, + "step": 8731 + }, + { + "epoch": 0.8919305413687436, + "grad_norm": 1.4797005832341164, + "learning_rate": 6.063469085509632e-07, + "loss": 0.6054, + "step": 8732 + }, + { + "epoch": 0.8920326864147089, + "grad_norm": 1.4603111138388694, + "learning_rate": 6.052129355423631e-07, + "loss": 0.5767, + "step": 8733 + }, + { + "epoch": 0.8921348314606742, + "grad_norm": 1.4501860582164268, + "learning_rate": 6.040799907992778e-07, + "loss": 0.6662, + "step": 8734 + }, + { + "epoch": 0.8922369765066395, + "grad_norm": 1.414462586374828, + "learning_rate": 6.029480744457072e-07, + "loss": 0.5826, + "step": 8735 + }, + { + "epoch": 0.8923391215526048, + "grad_norm": 1.5616616987694585, + "learning_rate": 6.018171866055411e-07, + "loss": 0.7253, + "step": 8736 + }, + { + "epoch": 0.8924412665985699, + "grad_norm": 1.6074297043281531, + "learning_rate": 6.006873274025571e-07, + "loss": 0.6803, + "step": 8737 + }, + { + "epoch": 0.8925434116445352, + "grad_norm": 1.3547492001570465, + "learning_rate": 5.995584969604151e-07, + "loss": 0.6023, + "step": 8738 + }, + { + "epoch": 0.8926455566905005, + "grad_norm": 1.492475947995031, + "learning_rate": 5.98430695402672e-07, + "loss": 0.6885, + "step": 8739 + }, + { + "epoch": 0.8927477017364658, + "grad_norm": 1.4902546087565836, + "learning_rate": 5.973039228527633e-07, + "loss": 0.7466, + "step": 8740 + }, + { + "epoch": 0.8928498467824311, + "grad_norm": 1.370045388179979, + "learning_rate": 5.961781794340149e-07, + "loss": 0.7086, + "step": 8741 + }, + { + "epoch": 0.8929519918283964, + "grad_norm": 1.4779713181599967, + "learning_rate": 5.950534652696382e-07, + "loss": 0.7071, + "step": 8742 + }, + { + "epoch": 0.8930541368743616, + "grad_norm": 1.421556033918497, + "learning_rate": 5.939297804827382e-07, + "loss": 0.7084, + "step": 8743 + }, + { + "epoch": 0.8931562819203268, + "grad_norm": 1.4809840657192253, + "learning_rate": 5.928071251962996e-07, + "loss": 0.6921, + "step": 8744 + }, + { + "epoch": 0.8932584269662921, + "grad_norm": 1.5064407244508502, + "learning_rate": 5.916854995331999e-07, + "loss": 0.6395, + "step": 8745 + }, + { + "epoch": 0.8933605720122574, + "grad_norm": 1.5078182312516784, + "learning_rate": 5.90564903616202e-07, + "loss": 0.6768, + "step": 8746 + }, + { + "epoch": 0.8934627170582227, + "grad_norm": 1.5119788148300612, + "learning_rate": 5.894453375679532e-07, + "loss": 0.6475, + "step": 8747 + }, + { + "epoch": 0.893564862104188, + "grad_norm": 1.690172145745588, + "learning_rate": 5.883268015109911e-07, + "loss": 0.7778, + "step": 8748 + }, + { + "epoch": 0.8936670071501532, + "grad_norm": 1.4843278237519608, + "learning_rate": 5.872092955677433e-07, + "loss": 0.7007, + "step": 8749 + }, + { + "epoch": 0.8937691521961185, + "grad_norm": 1.6404035426750663, + "learning_rate": 5.860928198605198e-07, + "loss": 0.666, + "step": 8750 + }, + { + "epoch": 0.8938712972420838, + "grad_norm": 1.508786425044505, + "learning_rate": 5.849773745115183e-07, + "loss": 0.6746, + "step": 8751 + }, + { + "epoch": 0.893973442288049, + "grad_norm": 1.390417382923603, + "learning_rate": 5.838629596428247e-07, + "loss": 0.7283, + "step": 8752 + }, + { + "epoch": 0.8940755873340143, + "grad_norm": 1.4988054935561899, + "learning_rate": 5.827495753764146e-07, + "loss": 0.6404, + "step": 8753 + }, + { + "epoch": 0.8941777323799796, + "grad_norm": 1.6360246787634953, + "learning_rate": 5.816372218341482e-07, + "loss": 0.7224, + "step": 8754 + }, + { + "epoch": 0.8942798774259448, + "grad_norm": 1.5684150933680907, + "learning_rate": 5.805258991377737e-07, + "loss": 0.6918, + "step": 8755 + }, + { + "epoch": 0.8943820224719101, + "grad_norm": 1.5422055534878594, + "learning_rate": 5.794156074089253e-07, + "loss": 0.6114, + "step": 8756 + }, + { + "epoch": 0.8944841675178754, + "grad_norm": 1.4407465232181196, + "learning_rate": 5.783063467691241e-07, + "loss": 0.6602, + "step": 8757 + }, + { + "epoch": 0.8945863125638407, + "grad_norm": 1.543354244918916, + "learning_rate": 5.771981173397811e-07, + "loss": 0.6327, + "step": 8758 + }, + { + "epoch": 0.894688457609806, + "grad_norm": 1.3958533856507473, + "learning_rate": 5.760909192421916e-07, + "loss": 0.7049, + "step": 8759 + }, + { + "epoch": 0.8947906026557712, + "grad_norm": 1.5120161491899482, + "learning_rate": 5.749847525975393e-07, + "loss": 0.6633, + "step": 8760 + }, + { + "epoch": 0.8948927477017364, + "grad_norm": 1.5223496484705068, + "learning_rate": 5.738796175268957e-07, + "loss": 0.6321, + "step": 8761 + }, + { + "epoch": 0.8949948927477017, + "grad_norm": 1.500562677731446, + "learning_rate": 5.72775514151217e-07, + "loss": 0.7773, + "step": 8762 + }, + { + "epoch": 0.895097037793667, + "grad_norm": 1.5271248767253236, + "learning_rate": 5.716724425913511e-07, + "loss": 0.6594, + "step": 8763 + }, + { + "epoch": 0.8951991828396323, + "grad_norm": 1.6346004479995313, + "learning_rate": 5.70570402968027e-07, + "loss": 0.6791, + "step": 8764 + }, + { + "epoch": 0.8953013278855976, + "grad_norm": 1.4670581736969028, + "learning_rate": 5.694693954018649e-07, + "loss": 0.6715, + "step": 8765 + }, + { + "epoch": 0.8954034729315629, + "grad_norm": 1.5029790183836607, + "learning_rate": 5.683694200133705e-07, + "loss": 0.6901, + "step": 8766 + }, + { + "epoch": 0.895505617977528, + "grad_norm": 1.49086114473795, + "learning_rate": 5.672704769229342e-07, + "loss": 0.7195, + "step": 8767 + }, + { + "epoch": 0.8956077630234933, + "grad_norm": 1.5644298866233797, + "learning_rate": 5.661725662508399e-07, + "loss": 0.648, + "step": 8768 + }, + { + "epoch": 0.8957099080694586, + "grad_norm": 1.306758995326433, + "learning_rate": 5.650756881172536e-07, + "loss": 0.6355, + "step": 8769 + }, + { + "epoch": 0.8958120531154239, + "grad_norm": 1.3986012492971005, + "learning_rate": 5.639798426422293e-07, + "loss": 0.608, + "step": 8770 + }, + { + "epoch": 0.8959141981613892, + "grad_norm": 1.4969125098704537, + "learning_rate": 5.628850299457078e-07, + "loss": 0.7368, + "step": 8771 + }, + { + "epoch": 0.8960163432073545, + "grad_norm": 1.6760131782442813, + "learning_rate": 5.617912501475153e-07, + "loss": 0.743, + "step": 8772 + }, + { + "epoch": 0.8961184882533197, + "grad_norm": 1.644853612849001, + "learning_rate": 5.606985033673706e-07, + "loss": 0.7298, + "step": 8773 + }, + { + "epoch": 0.896220633299285, + "grad_norm": 1.5134450622041864, + "learning_rate": 5.596067897248724e-07, + "loss": 0.6919, + "step": 8774 + }, + { + "epoch": 0.8963227783452502, + "grad_norm": 1.5393771271788381, + "learning_rate": 5.585161093395108e-07, + "loss": 0.727, + "step": 8775 + }, + { + "epoch": 0.8964249233912155, + "grad_norm": 1.5742064328712582, + "learning_rate": 5.574264623306591e-07, + "loss": 0.6037, + "step": 8776 + }, + { + "epoch": 0.8965270684371808, + "grad_norm": 1.4336160282242218, + "learning_rate": 5.563378488175819e-07, + "loss": 0.6172, + "step": 8777 + }, + { + "epoch": 0.8966292134831461, + "grad_norm": 1.4574072788285337, + "learning_rate": 5.552502689194306e-07, + "loss": 0.7123, + "step": 8778 + }, + { + "epoch": 0.8967313585291113, + "grad_norm": 1.394934402350585, + "learning_rate": 5.541637227552388e-07, + "loss": 0.6557, + "step": 8779 + }, + { + "epoch": 0.8968335035750766, + "grad_norm": 1.5835279095223485, + "learning_rate": 5.530782104439303e-07, + "loss": 0.72, + "step": 8780 + }, + { + "epoch": 0.8969356486210419, + "grad_norm": 1.423143014684887, + "learning_rate": 5.519937321043156e-07, + "loss": 0.6148, + "step": 8781 + }, + { + "epoch": 0.8970377936670072, + "grad_norm": 1.5779165503335235, + "learning_rate": 5.509102878550887e-07, + "loss": 0.7151, + "step": 8782 + }, + { + "epoch": 0.8971399387129724, + "grad_norm": 1.4210091568252716, + "learning_rate": 5.49827877814838e-07, + "loss": 0.6478, + "step": 8783 + }, + { + "epoch": 0.8972420837589377, + "grad_norm": 1.5651173570784946, + "learning_rate": 5.487465021020299e-07, + "loss": 0.7763, + "step": 8784 + }, + { + "epoch": 0.8973442288049029, + "grad_norm": 1.6675001045835647, + "learning_rate": 5.476661608350253e-07, + "loss": 0.7791, + "step": 8785 + }, + { + "epoch": 0.8974463738508682, + "grad_norm": 1.5041781455862193, + "learning_rate": 5.465868541320662e-07, + "loss": 0.6549, + "step": 8786 + }, + { + "epoch": 0.8975485188968335, + "grad_norm": 1.453548114823558, + "learning_rate": 5.455085821112827e-07, + "loss": 0.6595, + "step": 8787 + }, + { + "epoch": 0.8976506639427988, + "grad_norm": 1.4291824752251765, + "learning_rate": 5.444313448906935e-07, + "loss": 0.6616, + "step": 8788 + }, + { + "epoch": 0.8977528089887641, + "grad_norm": 1.598893225593028, + "learning_rate": 5.433551425882034e-07, + "loss": 0.6932, + "step": 8789 + }, + { + "epoch": 0.8978549540347294, + "grad_norm": 1.575748405002786, + "learning_rate": 5.422799753216024e-07, + "loss": 0.7382, + "step": 8790 + }, + { + "epoch": 0.8979570990806945, + "grad_norm": 1.5479408463337505, + "learning_rate": 5.412058432085676e-07, + "loss": 0.7069, + "step": 8791 + }, + { + "epoch": 0.8980592441266598, + "grad_norm": 1.574614019741522, + "learning_rate": 5.40132746366664e-07, + "loss": 0.6086, + "step": 8792 + }, + { + "epoch": 0.8981613891726251, + "grad_norm": 1.5355385238222923, + "learning_rate": 5.390606849133451e-07, + "loss": 0.6341, + "step": 8793 + }, + { + "epoch": 0.8982635342185904, + "grad_norm": 1.487011354504039, + "learning_rate": 5.379896589659461e-07, + "loss": 0.7245, + "step": 8794 + }, + { + "epoch": 0.8983656792645557, + "grad_norm": 1.5588263710421255, + "learning_rate": 5.369196686416933e-07, + "loss": 0.6725, + "step": 8795 + }, + { + "epoch": 0.898467824310521, + "grad_norm": 1.420305935395289, + "learning_rate": 5.35850714057694e-07, + "loss": 0.6474, + "step": 8796 + }, + { + "epoch": 0.8985699693564863, + "grad_norm": 1.6177450656437609, + "learning_rate": 5.347827953309504e-07, + "loss": 0.6756, + "step": 8797 + }, + { + "epoch": 0.8986721144024514, + "grad_norm": 1.4348818450335272, + "learning_rate": 5.337159125783453e-07, + "loss": 0.6149, + "step": 8798 + }, + { + "epoch": 0.8987742594484167, + "grad_norm": 1.4864854539613406, + "learning_rate": 5.326500659166501e-07, + "loss": 0.699, + "step": 8799 + }, + { + "epoch": 0.898876404494382, + "grad_norm": 1.409088934312831, + "learning_rate": 5.3158525546252e-07, + "loss": 0.6517, + "step": 8800 + }, + { + "epoch": 0.8989785495403473, + "grad_norm": 1.5342041067996426, + "learning_rate": 5.305214813325022e-07, + "loss": 0.6875, + "step": 8801 + }, + { + "epoch": 0.8990806945863126, + "grad_norm": 1.539628398080335, + "learning_rate": 5.294587436430254e-07, + "loss": 0.6893, + "step": 8802 + }, + { + "epoch": 0.8991828396322779, + "grad_norm": 1.475777966386795, + "learning_rate": 5.28397042510409e-07, + "loss": 0.7241, + "step": 8803 + }, + { + "epoch": 0.8992849846782431, + "grad_norm": 1.5220809546181582, + "learning_rate": 5.273363780508556e-07, + "loss": 0.7061, + "step": 8804 + }, + { + "epoch": 0.8993871297242084, + "grad_norm": 1.4163136934225535, + "learning_rate": 5.262767503804567e-07, + "loss": 0.6963, + "step": 8805 + }, + { + "epoch": 0.8994892747701736, + "grad_norm": 1.5347825227662513, + "learning_rate": 5.252181596151861e-07, + "loss": 0.6965, + "step": 8806 + }, + { + "epoch": 0.8995914198161389, + "grad_norm": 1.4310649855530533, + "learning_rate": 5.241606058709103e-07, + "loss": 0.6632, + "step": 8807 + }, + { + "epoch": 0.8996935648621042, + "grad_norm": 1.5356958321531848, + "learning_rate": 5.231040892633776e-07, + "loss": 0.6314, + "step": 8808 + }, + { + "epoch": 0.8997957099080695, + "grad_norm": 1.5196074454375148, + "learning_rate": 5.220486099082267e-07, + "loss": 0.6707, + "step": 8809 + }, + { + "epoch": 0.8998978549540347, + "grad_norm": 1.5241583031604422, + "learning_rate": 5.209941679209785e-07, + "loss": 0.6205, + "step": 8810 + }, + { + "epoch": 0.9, + "grad_norm": 1.4436174097359282, + "learning_rate": 5.199407634170417e-07, + "loss": 0.5825, + "step": 8811 + }, + { + "epoch": 0.9001021450459653, + "grad_norm": 1.4146828192017114, + "learning_rate": 5.188883965117153e-07, + "loss": 0.6646, + "step": 8812 + }, + { + "epoch": 0.9002042900919306, + "grad_norm": 1.6336488084321334, + "learning_rate": 5.178370673201783e-07, + "loss": 0.6799, + "step": 8813 + }, + { + "epoch": 0.9003064351378958, + "grad_norm": 1.6624283393709594, + "learning_rate": 5.167867759575007e-07, + "loss": 0.8012, + "step": 8814 + }, + { + "epoch": 0.900408580183861, + "grad_norm": 1.6241143770122437, + "learning_rate": 5.157375225386385e-07, + "loss": 0.8004, + "step": 8815 + }, + { + "epoch": 0.9005107252298263, + "grad_norm": 1.4745925098009403, + "learning_rate": 5.146893071784286e-07, + "loss": 0.6357, + "step": 8816 + }, + { + "epoch": 0.9006128702757916, + "grad_norm": 1.4582728572214099, + "learning_rate": 5.13642129991605e-07, + "loss": 0.7024, + "step": 8817 + }, + { + "epoch": 0.9007150153217569, + "grad_norm": 1.2825594919898262, + "learning_rate": 5.125959910927792e-07, + "loss": 0.6054, + "step": 8818 + }, + { + "epoch": 0.9008171603677222, + "grad_norm": 1.4710055364365755, + "learning_rate": 5.115508905964516e-07, + "loss": 0.6937, + "step": 8819 + }, + { + "epoch": 0.9009193054136875, + "grad_norm": 1.6439372858716164, + "learning_rate": 5.1050682861701e-07, + "loss": 0.7467, + "step": 8820 + }, + { + "epoch": 0.9010214504596528, + "grad_norm": 1.4396129176353436, + "learning_rate": 5.094638052687251e-07, + "loss": 0.7089, + "step": 8821 + }, + { + "epoch": 0.9011235955056179, + "grad_norm": 1.5651785705913253, + "learning_rate": 5.084218206657609e-07, + "loss": 0.7155, + "step": 8822 + }, + { + "epoch": 0.9012257405515832, + "grad_norm": 1.4532324331369857, + "learning_rate": 5.073808749221598e-07, + "loss": 0.6197, + "step": 8823 + }, + { + "epoch": 0.9013278855975485, + "grad_norm": 1.5145427598096115, + "learning_rate": 5.063409681518528e-07, + "loss": 0.6089, + "step": 8824 + }, + { + "epoch": 0.9014300306435138, + "grad_norm": 1.6188235718444264, + "learning_rate": 5.053021004686632e-07, + "loss": 0.6844, + "step": 8825 + }, + { + "epoch": 0.9015321756894791, + "grad_norm": 1.3689676461966298, + "learning_rate": 5.042642719862912e-07, + "loss": 0.613, + "step": 8826 + }, + { + "epoch": 0.9016343207354444, + "grad_norm": 1.5328725465042532, + "learning_rate": 5.032274828183314e-07, + "loss": 0.6993, + "step": 8827 + }, + { + "epoch": 0.9017364657814096, + "grad_norm": 1.660657033781915, + "learning_rate": 5.021917330782589e-07, + "loss": 0.6724, + "step": 8828 + }, + { + "epoch": 0.9018386108273748, + "grad_norm": 1.5128467033815287, + "learning_rate": 5.011570228794372e-07, + "loss": 0.6643, + "step": 8829 + }, + { + "epoch": 0.9019407558733401, + "grad_norm": 1.4433606531391006, + "learning_rate": 5.001233523351157e-07, + "loss": 0.7066, + "step": 8830 + }, + { + "epoch": 0.9020429009193054, + "grad_norm": 1.486976924216084, + "learning_rate": 4.990907215584317e-07, + "loss": 0.7014, + "step": 8831 + }, + { + "epoch": 0.9021450459652707, + "grad_norm": 1.6314475262840287, + "learning_rate": 4.98059130662406e-07, + "loss": 0.7836, + "step": 8832 + }, + { + "epoch": 0.902247191011236, + "grad_norm": 1.5625608206048425, + "learning_rate": 4.970285797599483e-07, + "loss": 0.673, + "step": 8833 + }, + { + "epoch": 0.9023493360572012, + "grad_norm": 1.6122864410066604, + "learning_rate": 4.959990689638528e-07, + "loss": 0.7773, + "step": 8834 + }, + { + "epoch": 0.9024514811031665, + "grad_norm": 1.6472006477752754, + "learning_rate": 4.949705983867992e-07, + "loss": 0.6599, + "step": 8835 + }, + { + "epoch": 0.9025536261491318, + "grad_norm": 1.566099470906536, + "learning_rate": 4.939431681413531e-07, + "loss": 0.7413, + "step": 8836 + }, + { + "epoch": 0.902655771195097, + "grad_norm": 1.681186175650415, + "learning_rate": 4.929167783399713e-07, + "loss": 0.7308, + "step": 8837 + }, + { + "epoch": 0.9027579162410623, + "grad_norm": 1.5094732122319936, + "learning_rate": 4.918914290949894e-07, + "loss": 0.6701, + "step": 8838 + }, + { + "epoch": 0.9028600612870276, + "grad_norm": 1.7087569463222336, + "learning_rate": 4.908671205186343e-07, + "loss": 0.7326, + "step": 8839 + }, + { + "epoch": 0.9029622063329928, + "grad_norm": 1.6200327547552695, + "learning_rate": 4.898438527230143e-07, + "loss": 0.702, + "step": 8840 + }, + { + "epoch": 0.9030643513789581, + "grad_norm": 1.6373827743873604, + "learning_rate": 4.888216258201295e-07, + "loss": 0.777, + "step": 8841 + }, + { + "epoch": 0.9031664964249234, + "grad_norm": 1.4875238017807653, + "learning_rate": 4.878004399218639e-07, + "loss": 0.5915, + "step": 8842 + }, + { + "epoch": 0.9032686414708887, + "grad_norm": 1.5220620056936245, + "learning_rate": 4.867802951399869e-07, + "loss": 0.6729, + "step": 8843 + }, + { + "epoch": 0.903370786516854, + "grad_norm": 1.5419295099625132, + "learning_rate": 4.857611915861516e-07, + "loss": 0.6535, + "step": 8844 + }, + { + "epoch": 0.9034729315628192, + "grad_norm": 1.6060366159741524, + "learning_rate": 4.847431293718996e-07, + "loss": 0.6881, + "step": 8845 + }, + { + "epoch": 0.9035750766087844, + "grad_norm": 1.4587362804717088, + "learning_rate": 4.83726108608662e-07, + "loss": 0.6459, + "step": 8846 + }, + { + "epoch": 0.9036772216547497, + "grad_norm": 1.4689542771656021, + "learning_rate": 4.827101294077496e-07, + "loss": 0.6796, + "step": 8847 + }, + { + "epoch": 0.903779366700715, + "grad_norm": 1.5097404121516667, + "learning_rate": 4.816951918803603e-07, + "loss": 0.8471, + "step": 8848 + }, + { + "epoch": 0.9038815117466803, + "grad_norm": 1.5583876563658305, + "learning_rate": 4.80681296137584e-07, + "loss": 0.7164, + "step": 8849 + }, + { + "epoch": 0.9039836567926456, + "grad_norm": 1.6252539420577212, + "learning_rate": 4.796684422903897e-07, + "loss": 0.7481, + "step": 8850 + }, + { + "epoch": 0.9040858018386109, + "grad_norm": 1.6158086217363263, + "learning_rate": 4.786566304496353e-07, + "loss": 0.641, + "step": 8851 + }, + { + "epoch": 0.904187946884576, + "grad_norm": 1.5673687557610714, + "learning_rate": 4.776458607260648e-07, + "loss": 0.713, + "step": 8852 + }, + { + "epoch": 0.9042900919305413, + "grad_norm": 1.5371481756123881, + "learning_rate": 4.766361332303071e-07, + "loss": 0.6791, + "step": 8853 + }, + { + "epoch": 0.9043922369765066, + "grad_norm": 1.6834504456794845, + "learning_rate": 4.756274480728773e-07, + "loss": 0.747, + "step": 8854 + }, + { + "epoch": 0.9044943820224719, + "grad_norm": 1.5297964187782123, + "learning_rate": 4.7461980536417486e-07, + "loss": 0.5999, + "step": 8855 + }, + { + "epoch": 0.9045965270684372, + "grad_norm": 1.4783836823864092, + "learning_rate": 4.736132052144904e-07, + "loss": 0.6728, + "step": 8856 + }, + { + "epoch": 0.9046986721144025, + "grad_norm": 1.5597510906512044, + "learning_rate": 4.726076477339958e-07, + "loss": 0.7888, + "step": 8857 + }, + { + "epoch": 0.9048008171603678, + "grad_norm": 1.449844809243343, + "learning_rate": 4.7160313303274973e-07, + "loss": 0.7078, + "step": 8858 + }, + { + "epoch": 0.904902962206333, + "grad_norm": 1.32386251011179, + "learning_rate": 4.7059966122069754e-07, + "loss": 0.5522, + "step": 8859 + }, + { + "epoch": 0.9050051072522982, + "grad_norm": 1.522791631146952, + "learning_rate": 4.69597232407667e-07, + "loss": 0.6658, + "step": 8860 + }, + { + "epoch": 0.9051072522982635, + "grad_norm": 1.521605977924698, + "learning_rate": 4.68595846703378e-07, + "loss": 0.6937, + "step": 8861 + }, + { + "epoch": 0.9052093973442288, + "grad_norm": 1.3172199429916436, + "learning_rate": 4.6759550421743296e-07, + "loss": 0.6262, + "step": 8862 + }, + { + "epoch": 0.9053115423901941, + "grad_norm": 1.538402520690178, + "learning_rate": 4.6659620505931757e-07, + "loss": 0.655, + "step": 8863 + }, + { + "epoch": 0.9054136874361594, + "grad_norm": 1.497635360228963, + "learning_rate": 4.655979493384055e-07, + "loss": 0.6821, + "step": 8864 + }, + { + "epoch": 0.9055158324821246, + "grad_norm": 1.4427324945180435, + "learning_rate": 4.6460073716395825e-07, + "loss": 0.6222, + "step": 8865 + }, + { + "epoch": 0.9056179775280899, + "grad_norm": 1.4990579454148953, + "learning_rate": 4.6360456864512295e-07, + "loss": 0.7028, + "step": 8866 + }, + { + "epoch": 0.9057201225740552, + "grad_norm": 2.4911457101981767, + "learning_rate": 4.6260944389092786e-07, + "loss": 0.7062, + "step": 8867 + }, + { + "epoch": 0.9058222676200204, + "grad_norm": 1.5194161247013336, + "learning_rate": 4.6161536301029154e-07, + "loss": 0.5935, + "step": 8868 + }, + { + "epoch": 0.9059244126659857, + "grad_norm": 1.6078486022444718, + "learning_rate": 4.6062232611201574e-07, + "loss": 0.7296, + "step": 8869 + }, + { + "epoch": 0.906026557711951, + "grad_norm": 1.7627956417542066, + "learning_rate": 4.596303333047891e-07, + "loss": 0.6883, + "step": 8870 + }, + { + "epoch": 0.9061287027579162, + "grad_norm": 1.4282856351735906, + "learning_rate": 4.5863938469718707e-07, + "loss": 0.5914, + "step": 8871 + }, + { + "epoch": 0.9062308478038815, + "grad_norm": 1.4479624608326627, + "learning_rate": 4.576494803976683e-07, + "loss": 0.5816, + "step": 8872 + }, + { + "epoch": 0.9063329928498468, + "grad_norm": 1.5661694944863442, + "learning_rate": 4.566606205145796e-07, + "loss": 0.7052, + "step": 8873 + }, + { + "epoch": 0.9064351378958121, + "grad_norm": 1.6261112888759972, + "learning_rate": 4.5567280515615207e-07, + "loss": 0.657, + "step": 8874 + }, + { + "epoch": 0.9065372829417774, + "grad_norm": 1.4886309421632047, + "learning_rate": 4.5468603443050155e-07, + "loss": 0.6774, + "step": 8875 + }, + { + "epoch": 0.9066394279877426, + "grad_norm": 1.6816665938043953, + "learning_rate": 4.537003084456326e-07, + "loss": 0.7622, + "step": 8876 + }, + { + "epoch": 0.9067415730337078, + "grad_norm": 1.4337879321128262, + "learning_rate": 4.527156273094324e-07, + "loss": 0.8093, + "step": 8877 + }, + { + "epoch": 0.9068437180796731, + "grad_norm": 1.5024111251009986, + "learning_rate": 4.517319911296747e-07, + "loss": 0.6909, + "step": 8878 + }, + { + "epoch": 0.9069458631256384, + "grad_norm": 1.6235565533581433, + "learning_rate": 4.507494000140189e-07, + "loss": 0.7169, + "step": 8879 + }, + { + "epoch": 0.9070480081716037, + "grad_norm": 1.4667139193675733, + "learning_rate": 4.497678540700112e-07, + "loss": 0.6314, + "step": 8880 + }, + { + "epoch": 0.907150153217569, + "grad_norm": 1.4872571843058904, + "learning_rate": 4.487873534050824e-07, + "loss": 0.6926, + "step": 8881 + }, + { + "epoch": 0.9072522982635343, + "grad_norm": 1.5771218572274353, + "learning_rate": 4.478078981265499e-07, + "loss": 0.7935, + "step": 8882 + }, + { + "epoch": 0.9073544433094994, + "grad_norm": 1.6204191962407675, + "learning_rate": 4.4682948834161356e-07, + "loss": 0.7742, + "step": 8883 + }, + { + "epoch": 0.9074565883554647, + "grad_norm": 1.5220100839105495, + "learning_rate": 4.45852124157361e-07, + "loss": 0.7307, + "step": 8884 + }, + { + "epoch": 0.90755873340143, + "grad_norm": 1.449600603937364, + "learning_rate": 4.448758056807667e-07, + "loss": 0.6134, + "step": 8885 + }, + { + "epoch": 0.9076608784473953, + "grad_norm": 1.3948453707094013, + "learning_rate": 4.4390053301868965e-07, + "loss": 0.7771, + "step": 8886 + }, + { + "epoch": 0.9077630234933606, + "grad_norm": 1.5395122998005635, + "learning_rate": 4.4292630627787324e-07, + "loss": 0.6437, + "step": 8887 + }, + { + "epoch": 0.9078651685393259, + "grad_norm": 1.5717364706375552, + "learning_rate": 4.4195312556494563e-07, + "loss": 0.7226, + "step": 8888 + }, + { + "epoch": 0.9079673135852911, + "grad_norm": 1.4697836624598823, + "learning_rate": 4.409809909864249e-07, + "loss": 0.6474, + "step": 8889 + }, + { + "epoch": 0.9080694586312564, + "grad_norm": 1.5280786654544793, + "learning_rate": 4.4000990264871034e-07, + "loss": 0.7204, + "step": 8890 + }, + { + "epoch": 0.9081716036772216, + "grad_norm": 1.547719631065917, + "learning_rate": 4.3903986065808924e-07, + "loss": 0.7004, + "step": 8891 + }, + { + "epoch": 0.9082737487231869, + "grad_norm": 1.2808077147523595, + "learning_rate": 4.380708651207322e-07, + "loss": 0.5752, + "step": 8892 + }, + { + "epoch": 0.9083758937691522, + "grad_norm": 1.5475178003205454, + "learning_rate": 4.371029161426965e-07, + "loss": 0.6314, + "step": 8893 + }, + { + "epoch": 0.9084780388151175, + "grad_norm": 1.6588241776290253, + "learning_rate": 4.3613601382992423e-07, + "loss": 0.7101, + "step": 8894 + }, + { + "epoch": 0.9085801838610827, + "grad_norm": 1.5043304201134553, + "learning_rate": 4.3517015828824615e-07, + "loss": 0.7909, + "step": 8895 + }, + { + "epoch": 0.908682328907048, + "grad_norm": 1.5912515577254782, + "learning_rate": 4.3420534962337224e-07, + "loss": 0.6342, + "step": 8896 + }, + { + "epoch": 0.9087844739530133, + "grad_norm": 1.589553048190197, + "learning_rate": 4.3324158794090354e-07, + "loss": 0.8424, + "step": 8897 + }, + { + "epoch": 0.9088866189989786, + "grad_norm": 1.3684204397981246, + "learning_rate": 4.3227887334632455e-07, + "loss": 0.6637, + "step": 8898 + }, + { + "epoch": 0.9089887640449438, + "grad_norm": 1.487395639379782, + "learning_rate": 4.3131720594500327e-07, + "loss": 0.7116, + "step": 8899 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 1.4186199981235292, + "learning_rate": 4.3035658584219766e-07, + "loss": 0.6089, + "step": 8900 + }, + { + "epoch": 0.9091930541368743, + "grad_norm": 1.4210128453738904, + "learning_rate": 4.2939701314304585e-07, + "loss": 0.5864, + "step": 8901 + }, + { + "epoch": 0.9092951991828396, + "grad_norm": 1.4347755942967642, + "learning_rate": 4.284384879525749e-07, + "loss": 0.6895, + "step": 8902 + }, + { + "epoch": 0.9093973442288049, + "grad_norm": 1.6397514582558435, + "learning_rate": 4.2748101037569323e-07, + "loss": 0.8036, + "step": 8903 + }, + { + "epoch": 0.9094994892747702, + "grad_norm": 1.4487434065996465, + "learning_rate": 4.2652458051720025e-07, + "loss": 0.701, + "step": 8904 + }, + { + "epoch": 0.9096016343207355, + "grad_norm": 1.562841860425281, + "learning_rate": 4.255691984817789e-07, + "loss": 0.6536, + "step": 8905 + }, + { + "epoch": 0.9097037793667007, + "grad_norm": 1.4333114283968933, + "learning_rate": 4.2461486437399337e-07, + "loss": 0.6667, + "step": 8906 + }, + { + "epoch": 0.909805924412666, + "grad_norm": 1.5156292093867363, + "learning_rate": 4.236615782982989e-07, + "loss": 0.7312, + "step": 8907 + }, + { + "epoch": 0.9099080694586312, + "grad_norm": 1.4952419336088052, + "learning_rate": 4.22709340359031e-07, + "loss": 0.6769, + "step": 8908 + }, + { + "epoch": 0.9100102145045965, + "grad_norm": 1.476352048654321, + "learning_rate": 4.217581506604118e-07, + "loss": 0.689, + "step": 8909 + }, + { + "epoch": 0.9101123595505618, + "grad_norm": 1.665372064813896, + "learning_rate": 4.208080093065536e-07, + "loss": 0.6626, + "step": 8910 + }, + { + "epoch": 0.9102145045965271, + "grad_norm": 1.4178443355769028, + "learning_rate": 4.198589164014477e-07, + "loss": 0.6425, + "step": 8911 + }, + { + "epoch": 0.9103166496424924, + "grad_norm": 1.3778041250777988, + "learning_rate": 4.1891087204897097e-07, + "loss": 0.546, + "step": 8912 + }, + { + "epoch": 0.9104187946884577, + "grad_norm": 1.3684193349668414, + "learning_rate": 4.179638763528904e-07, + "loss": 0.6439, + "step": 8913 + }, + { + "epoch": 0.9105209397344228, + "grad_norm": 1.7000013242510281, + "learning_rate": 4.1701792941685415e-07, + "loss": 0.7153, + "step": 8914 + }, + { + "epoch": 0.9106230847803881, + "grad_norm": 1.5623888865660085, + "learning_rate": 4.160730313443984e-07, + "loss": 0.7497, + "step": 8915 + }, + { + "epoch": 0.9107252298263534, + "grad_norm": 1.5197447254062728, + "learning_rate": 4.151291822389403e-07, + "loss": 0.6749, + "step": 8916 + }, + { + "epoch": 0.9108273748723187, + "grad_norm": 1.4751899391816226, + "learning_rate": 4.1418638220378616e-07, + "loss": 0.7406, + "step": 8917 + }, + { + "epoch": 0.910929519918284, + "grad_norm": 1.5789250643287935, + "learning_rate": 4.132446313421246e-07, + "loss": 0.7679, + "step": 8918 + }, + { + "epoch": 0.9110316649642493, + "grad_norm": 1.5286779169960332, + "learning_rate": 4.12303929757033e-07, + "loss": 0.7821, + "step": 8919 + }, + { + "epoch": 0.9111338100102145, + "grad_norm": 1.6311795683051307, + "learning_rate": 4.1136427755147033e-07, + "loss": 0.74, + "step": 8920 + }, + { + "epoch": 0.9112359550561798, + "grad_norm": 1.567505604719893, + "learning_rate": 4.104256748282831e-07, + "loss": 0.7326, + "step": 8921 + }, + { + "epoch": 0.911338100102145, + "grad_norm": 1.609353847079402, + "learning_rate": 4.0948812169020137e-07, + "loss": 0.7554, + "step": 8922 + }, + { + "epoch": 0.9114402451481103, + "grad_norm": 1.6608640415024978, + "learning_rate": 4.085516182398408e-07, + "loss": 0.7194, + "step": 8923 + }, + { + "epoch": 0.9115423901940756, + "grad_norm": 1.4365131400989493, + "learning_rate": 4.076161645797017e-07, + "loss": 0.6184, + "step": 8924 + }, + { + "epoch": 0.9116445352400409, + "grad_norm": 1.4151770622874968, + "learning_rate": 4.0668176081217205e-07, + "loss": 0.6957, + "step": 8925 + }, + { + "epoch": 0.9117466802860061, + "grad_norm": 1.57935756562091, + "learning_rate": 4.057484070395212e-07, + "loss": 0.7328, + "step": 8926 + }, + { + "epoch": 0.9118488253319714, + "grad_norm": 1.4330170151818198, + "learning_rate": 4.048161033639064e-07, + "loss": 0.699, + "step": 8927 + }, + { + "epoch": 0.9119509703779367, + "grad_norm": 1.5575680494422013, + "learning_rate": 4.0388484988736707e-07, + "loss": 0.6907, + "step": 8928 + }, + { + "epoch": 0.912053115423902, + "grad_norm": 1.4193865648928499, + "learning_rate": 4.029546467118306e-07, + "loss": 0.6865, + "step": 8929 + }, + { + "epoch": 0.9121552604698672, + "grad_norm": 1.6414761832542606, + "learning_rate": 4.0202549393910995e-07, + "loss": 0.7248, + "step": 8930 + }, + { + "epoch": 0.9122574055158325, + "grad_norm": 1.4368967877193175, + "learning_rate": 4.0109739167090046e-07, + "loss": 0.5447, + "step": 8931 + }, + { + "epoch": 0.9123595505617977, + "grad_norm": 1.4622601607345622, + "learning_rate": 4.001703400087831e-07, + "loss": 0.5971, + "step": 8932 + }, + { + "epoch": 0.912461695607763, + "grad_norm": 1.4639563141980465, + "learning_rate": 3.992443390542222e-07, + "loss": 0.6798, + "step": 8933 + }, + { + "epoch": 0.9125638406537283, + "grad_norm": 1.573735662441195, + "learning_rate": 3.983193889085735e-07, + "loss": 0.6937, + "step": 8934 + }, + { + "epoch": 0.9126659856996936, + "grad_norm": 1.558385481466482, + "learning_rate": 3.9739548967307027e-07, + "loss": 0.6434, + "step": 8935 + }, + { + "epoch": 0.9127681307456589, + "grad_norm": 1.506114295379127, + "learning_rate": 3.96472641448834e-07, + "loss": 0.6526, + "step": 8936 + }, + { + "epoch": 0.912870275791624, + "grad_norm": 1.4721811399874458, + "learning_rate": 3.9555084433687274e-07, + "loss": 0.6869, + "step": 8937 + }, + { + "epoch": 0.9129724208375893, + "grad_norm": 1.4750075423228408, + "learning_rate": 3.9463009843807577e-07, + "loss": 0.5846, + "step": 8938 + }, + { + "epoch": 0.9130745658835546, + "grad_norm": 1.3862736327119547, + "learning_rate": 3.9371040385322246e-07, + "loss": 0.6674, + "step": 8939 + }, + { + "epoch": 0.9131767109295199, + "grad_norm": 1.5618504086079787, + "learning_rate": 3.927917606829712e-07, + "loss": 0.83, + "step": 8940 + }, + { + "epoch": 0.9132788559754852, + "grad_norm": 1.7585550876551548, + "learning_rate": 3.9187416902786824e-07, + "loss": 0.6304, + "step": 8941 + }, + { + "epoch": 0.9133810010214505, + "grad_norm": 1.4592329154070418, + "learning_rate": 3.9095762898834544e-07, + "loss": 0.6356, + "step": 8942 + }, + { + "epoch": 0.9134831460674158, + "grad_norm": 1.487034609094372, + "learning_rate": 3.90042140664717e-07, + "loss": 0.6722, + "step": 8943 + }, + { + "epoch": 0.913585291113381, + "grad_norm": 1.4583344845508903, + "learning_rate": 3.8912770415718615e-07, + "loss": 0.6023, + "step": 8944 + }, + { + "epoch": 0.9136874361593462, + "grad_norm": 1.5079050218450298, + "learning_rate": 3.8821431956583834e-07, + "loss": 0.6638, + "step": 8945 + }, + { + "epoch": 0.9137895812053115, + "grad_norm": 1.6505793938913262, + "learning_rate": 3.873019869906425e-07, + "loss": 0.7783, + "step": 8946 + }, + { + "epoch": 0.9138917262512768, + "grad_norm": 1.5682612576443133, + "learning_rate": 3.863907065314554e-07, + "loss": 0.6539, + "step": 8947 + }, + { + "epoch": 0.9139938712972421, + "grad_norm": 1.5684555290459337, + "learning_rate": 3.8548047828801505e-07, + "loss": 0.7136, + "step": 8948 + }, + { + "epoch": 0.9140960163432074, + "grad_norm": 1.387603166140642, + "learning_rate": 3.8457130235994953e-07, + "loss": 0.6154, + "step": 8949 + }, + { + "epoch": 0.9141981613891726, + "grad_norm": 1.529300529123299, + "learning_rate": 3.8366317884676705e-07, + "loss": 0.6348, + "step": 8950 + }, + { + "epoch": 0.9143003064351379, + "grad_norm": 1.4628415845835792, + "learning_rate": 3.8275610784786254e-07, + "loss": 0.6571, + "step": 8951 + }, + { + "epoch": 0.9144024514811032, + "grad_norm": 1.360386918936227, + "learning_rate": 3.8185008946251546e-07, + "loss": 0.6809, + "step": 8952 + }, + { + "epoch": 0.9145045965270684, + "grad_norm": 1.4956389469836187, + "learning_rate": 3.809451237898887e-07, + "loss": 0.6223, + "step": 8953 + }, + { + "epoch": 0.9146067415730337, + "grad_norm": 1.4499482335013432, + "learning_rate": 3.800412109290352e-07, + "loss": 0.7809, + "step": 8954 + }, + { + "epoch": 0.914708886618999, + "grad_norm": 1.489386402906914, + "learning_rate": 3.7913835097888595e-07, + "loss": 0.6731, + "step": 8955 + }, + { + "epoch": 0.9148110316649642, + "grad_norm": 1.3721374810099194, + "learning_rate": 3.7823654403826073e-07, + "loss": 0.8223, + "step": 8956 + }, + { + "epoch": 0.9149131767109295, + "grad_norm": 1.494396635422137, + "learning_rate": 3.7733579020586166e-07, + "loss": 0.6489, + "step": 8957 + }, + { + "epoch": 0.9150153217568948, + "grad_norm": 1.3087647018785062, + "learning_rate": 3.7643608958027543e-07, + "loss": 0.6809, + "step": 8958 + }, + { + "epoch": 0.9151174668028601, + "grad_norm": 1.5842353638303468, + "learning_rate": 3.755374422599789e-07, + "loss": 0.6988, + "step": 8959 + }, + { + "epoch": 0.9152196118488254, + "grad_norm": 1.3212088564942823, + "learning_rate": 3.7463984834332665e-07, + "loss": 0.6711, + "step": 8960 + }, + { + "epoch": 0.9153217568947906, + "grad_norm": 1.4140368632953197, + "learning_rate": 3.737433079285624e-07, + "loss": 0.6392, + "step": 8961 + }, + { + "epoch": 0.9154239019407558, + "grad_norm": 1.4435915079052284, + "learning_rate": 3.72847821113812e-07, + "loss": 0.6723, + "step": 8962 + }, + { + "epoch": 0.9155260469867211, + "grad_norm": 1.459297830367921, + "learning_rate": 3.7195338799708716e-07, + "loss": 0.7009, + "step": 8963 + }, + { + "epoch": 0.9156281920326864, + "grad_norm": 1.403540378978877, + "learning_rate": 3.71060008676285e-07, + "loss": 0.699, + "step": 8964 + }, + { + "epoch": 0.9157303370786517, + "grad_norm": 1.3929585991068962, + "learning_rate": 3.701676832491863e-07, + "loss": 0.6038, + "step": 8965 + }, + { + "epoch": 0.915832482124617, + "grad_norm": 1.7431304800071328, + "learning_rate": 3.692764118134573e-07, + "loss": 0.6006, + "step": 8966 + }, + { + "epoch": 0.9159346271705823, + "grad_norm": 1.4805836386151792, + "learning_rate": 3.6838619446664447e-07, + "loss": 0.6454, + "step": 8967 + }, + { + "epoch": 0.9160367722165474, + "grad_norm": 1.5950779454773634, + "learning_rate": 3.6749703130618653e-07, + "loss": 0.6659, + "step": 8968 + }, + { + "epoch": 0.9161389172625127, + "grad_norm": 1.4926889591244192, + "learning_rate": 3.6660892242940227e-07, + "loss": 0.6786, + "step": 8969 + }, + { + "epoch": 0.916241062308478, + "grad_norm": 1.395532426091007, + "learning_rate": 3.657218679334962e-07, + "loss": 0.7226, + "step": 8970 + }, + { + "epoch": 0.9163432073544433, + "grad_norm": 1.6064225298030648, + "learning_rate": 3.6483586791555613e-07, + "loss": 0.7747, + "step": 8971 + }, + { + "epoch": 0.9164453524004086, + "grad_norm": 1.470024895145512, + "learning_rate": 3.6395092247255347e-07, + "loss": 0.6803, + "step": 8972 + }, + { + "epoch": 0.9165474974463739, + "grad_norm": 1.4802091917970328, + "learning_rate": 3.630670317013507e-07, + "loss": 0.7218, + "step": 8973 + }, + { + "epoch": 0.9166496424923392, + "grad_norm": 1.569370055803383, + "learning_rate": 3.6218419569868603e-07, + "loss": 0.75, + "step": 8974 + }, + { + "epoch": 0.9167517875383044, + "grad_norm": 1.3999677276900633, + "learning_rate": 3.6130241456118877e-07, + "loss": 0.6385, + "step": 8975 + }, + { + "epoch": 0.9168539325842696, + "grad_norm": 1.587790441220409, + "learning_rate": 3.604216883853684e-07, + "loss": 0.6115, + "step": 8976 + }, + { + "epoch": 0.9169560776302349, + "grad_norm": 1.5352739715010324, + "learning_rate": 3.595420172676234e-07, + "loss": 0.6429, + "step": 8977 + }, + { + "epoch": 0.9170582226762002, + "grad_norm": 1.529110107926348, + "learning_rate": 3.5866340130423117e-07, + "loss": 0.6986, + "step": 8978 + }, + { + "epoch": 0.9171603677221655, + "grad_norm": 1.5132314831678404, + "learning_rate": 3.577858405913615e-07, + "loss": 0.7166, + "step": 8979 + }, + { + "epoch": 0.9172625127681308, + "grad_norm": 1.51041731313612, + "learning_rate": 3.569093352250597e-07, + "loss": 0.7019, + "step": 8980 + }, + { + "epoch": 0.917364657814096, + "grad_norm": 1.6401537984693644, + "learning_rate": 3.5603388530126236e-07, + "loss": 0.6936, + "step": 8981 + }, + { + "epoch": 0.9174668028600613, + "grad_norm": 1.446058674937161, + "learning_rate": 3.5515949091578514e-07, + "loss": 0.691, + "step": 8982 + }, + { + "epoch": 0.9175689479060266, + "grad_norm": 1.4541144754992703, + "learning_rate": 3.542861521643348e-07, + "loss": 0.5959, + "step": 8983 + }, + { + "epoch": 0.9176710929519918, + "grad_norm": 1.4078097272429528, + "learning_rate": 3.53413869142496e-07, + "loss": 0.5902, + "step": 8984 + }, + { + "epoch": 0.9177732379979571, + "grad_norm": 1.5058883115713082, + "learning_rate": 3.525426419457423e-07, + "loss": 0.7123, + "step": 8985 + }, + { + "epoch": 0.9178753830439224, + "grad_norm": 1.6624353704939734, + "learning_rate": 3.5167247066942966e-07, + "loss": 0.7015, + "step": 8986 + }, + { + "epoch": 0.9179775280898876, + "grad_norm": 1.3749528303274887, + "learning_rate": 3.508033554087975e-07, + "loss": 0.5711, + "step": 8987 + }, + { + "epoch": 0.9180796731358529, + "grad_norm": 1.5481589834530205, + "learning_rate": 3.4993529625897413e-07, + "loss": 0.6992, + "step": 8988 + }, + { + "epoch": 0.9181818181818182, + "grad_norm": 1.4951327018881502, + "learning_rate": 3.4906829331496697e-07, + "loss": 0.6781, + "step": 8989 + }, + { + "epoch": 0.9182839632277835, + "grad_norm": 1.5599976498262236, + "learning_rate": 3.4820234667166996e-07, + "loss": 0.704, + "step": 8990 + }, + { + "epoch": 0.9183861082737487, + "grad_norm": 1.4075024476783786, + "learning_rate": 3.4733745642386076e-07, + "loss": 0.6781, + "step": 8991 + }, + { + "epoch": 0.918488253319714, + "grad_norm": 1.5104951243649418, + "learning_rate": 3.4647362266620355e-07, + "loss": 0.6887, + "step": 8992 + }, + { + "epoch": 0.9185903983656792, + "grad_norm": 1.4790634653600672, + "learning_rate": 3.4561084549324717e-07, + "loss": 0.6612, + "step": 8993 + }, + { + "epoch": 0.9186925434116445, + "grad_norm": 1.6897891201292183, + "learning_rate": 3.447491249994206e-07, + "loss": 0.8149, + "step": 8994 + }, + { + "epoch": 0.9187946884576098, + "grad_norm": 1.873893086432714, + "learning_rate": 3.438884612790405e-07, + "loss": 0.7027, + "step": 8995 + }, + { + "epoch": 0.9188968335035751, + "grad_norm": 1.5777267798783916, + "learning_rate": 3.430288544263072e-07, + "loss": 0.7475, + "step": 8996 + }, + { + "epoch": 0.9189989785495404, + "grad_norm": 1.5427509641976325, + "learning_rate": 3.421703045353031e-07, + "loss": 0.6166, + "step": 8997 + }, + { + "epoch": 0.9191011235955057, + "grad_norm": 1.4699686590321384, + "learning_rate": 3.4131281170000085e-07, + "loss": 0.6753, + "step": 8998 + }, + { + "epoch": 0.9192032686414708, + "grad_norm": 1.503154415395642, + "learning_rate": 3.4045637601425096e-07, + "loss": 0.5698, + "step": 8999 + }, + { + "epoch": 0.9193054136874361, + "grad_norm": 1.5537988184157074, + "learning_rate": 3.396009975717929e-07, + "loss": 0.7485, + "step": 9000 + }, + { + "epoch": 0.9194075587334014, + "grad_norm": 1.331848729677276, + "learning_rate": 3.3874667646624505e-07, + "loss": 0.6175, + "step": 9001 + }, + { + "epoch": 0.9195097037793667, + "grad_norm": 1.5417778672892601, + "learning_rate": 3.3789341279111486e-07, + "loss": 0.7749, + "step": 9002 + }, + { + "epoch": 0.919611848825332, + "grad_norm": 1.5842283699843849, + "learning_rate": 3.3704120663979547e-07, + "loss": 0.6419, + "step": 9003 + }, + { + "epoch": 0.9197139938712973, + "grad_norm": 1.4693781773151633, + "learning_rate": 3.361900581055577e-07, + "loss": 0.6462, + "step": 9004 + }, + { + "epoch": 0.9198161389172625, + "grad_norm": 1.415116754658429, + "learning_rate": 3.353399672815627e-07, + "loss": 0.6259, + "step": 9005 + }, + { + "epoch": 0.9199182839632278, + "grad_norm": 1.3865094119635362, + "learning_rate": 3.3449093426085155e-07, + "loss": 0.7106, + "step": 9006 + }, + { + "epoch": 0.920020429009193, + "grad_norm": 1.5547271607281539, + "learning_rate": 3.3364295913635214e-07, + "loss": 0.6798, + "step": 9007 + }, + { + "epoch": 0.9201225740551583, + "grad_norm": 1.5176673064971642, + "learning_rate": 3.32796042000878e-07, + "loss": 0.691, + "step": 9008 + }, + { + "epoch": 0.9202247191011236, + "grad_norm": 1.5373958318402452, + "learning_rate": 3.3195018294712056e-07, + "loss": 0.7349, + "step": 9009 + }, + { + "epoch": 0.9203268641470889, + "grad_norm": 1.5739228959164528, + "learning_rate": 3.311053820676635e-07, + "loss": 0.669, + "step": 9010 + }, + { + "epoch": 0.9204290091930541, + "grad_norm": 1.3062468111676893, + "learning_rate": 3.3026163945496846e-07, + "loss": 0.5868, + "step": 9011 + }, + { + "epoch": 0.9205311542390194, + "grad_norm": 1.3196193344612024, + "learning_rate": 3.294189552013838e-07, + "loss": 0.6519, + "step": 9012 + }, + { + "epoch": 0.9206332992849847, + "grad_norm": 1.5739461201771177, + "learning_rate": 3.2857732939914346e-07, + "loss": 0.6389, + "step": 9013 + }, + { + "epoch": 0.92073544433095, + "grad_norm": 1.47890485584071, + "learning_rate": 3.2773676214036374e-07, + "loss": 0.6463, + "step": 9014 + }, + { + "epoch": 0.9208375893769152, + "grad_norm": 1.3578734540994402, + "learning_rate": 3.268972535170434e-07, + "loss": 0.6199, + "step": 9015 + }, + { + "epoch": 0.9209397344228805, + "grad_norm": 1.4556942347216502, + "learning_rate": 3.260588036210677e-07, + "loss": 0.5595, + "step": 9016 + }, + { + "epoch": 0.9210418794688457, + "grad_norm": 1.5053998505890394, + "learning_rate": 3.252214125442066e-07, + "loss": 0.7182, + "step": 9017 + }, + { + "epoch": 0.921144024514811, + "grad_norm": 1.6029171949584784, + "learning_rate": 3.2438508037811344e-07, + "loss": 0.6599, + "step": 9018 + }, + { + "epoch": 0.9212461695607763, + "grad_norm": 1.4636311768504586, + "learning_rate": 3.2354980721432395e-07, + "loss": 0.7372, + "step": 9019 + }, + { + "epoch": 0.9213483146067416, + "grad_norm": 1.4634205016863506, + "learning_rate": 3.227155931442605e-07, + "loss": 0.7861, + "step": 9020 + }, + { + "epoch": 0.9214504596527069, + "grad_norm": 1.6722636293229056, + "learning_rate": 3.2188243825922673e-07, + "loss": 0.6376, + "step": 9021 + }, + { + "epoch": 0.9215526046986721, + "grad_norm": 1.4227670606688423, + "learning_rate": 3.21050342650413e-07, + "loss": 0.553, + "step": 9022 + }, + { + "epoch": 0.9216547497446373, + "grad_norm": 1.4291560141603417, + "learning_rate": 3.202193064088932e-07, + "loss": 0.6951, + "step": 9023 + }, + { + "epoch": 0.9217568947906026, + "grad_norm": 1.6180456420779927, + "learning_rate": 3.193893296256245e-07, + "loss": 0.6593, + "step": 9024 + }, + { + "epoch": 0.9218590398365679, + "grad_norm": 1.6444997157311811, + "learning_rate": 3.185604123914465e-07, + "loss": 0.8316, + "step": 9025 + }, + { + "epoch": 0.9219611848825332, + "grad_norm": 1.2987822154289879, + "learning_rate": 3.1773255479708667e-07, + "loss": 0.6064, + "step": 9026 + }, + { + "epoch": 0.9220633299284985, + "grad_norm": 1.5653991315885252, + "learning_rate": 3.169057569331557e-07, + "loss": 0.735, + "step": 9027 + }, + { + "epoch": 0.9221654749744638, + "grad_norm": 1.411708294119295, + "learning_rate": 3.1608001889014583e-07, + "loss": 0.6723, + "step": 9028 + }, + { + "epoch": 0.9222676200204291, + "grad_norm": 1.4586030537323549, + "learning_rate": 3.152553407584336e-07, + "loss": 0.6181, + "step": 9029 + }, + { + "epoch": 0.9223697650663942, + "grad_norm": 1.4305684572405277, + "learning_rate": 3.1443172262828223e-07, + "loss": 0.6079, + "step": 9030 + }, + { + "epoch": 0.9224719101123595, + "grad_norm": 1.6809329578860155, + "learning_rate": 3.1360916458983536e-07, + "loss": 0.7612, + "step": 9031 + }, + { + "epoch": 0.9225740551583248, + "grad_norm": 1.4867660470944883, + "learning_rate": 3.127876667331242e-07, + "loss": 0.6439, + "step": 9032 + }, + { + "epoch": 0.9226762002042901, + "grad_norm": 1.4168839301688856, + "learning_rate": 3.119672291480613e-07, + "loss": 0.6244, + "step": 9033 + }, + { + "epoch": 0.9227783452502554, + "grad_norm": 1.477687097653395, + "learning_rate": 3.1114785192444484e-07, + "loss": 0.6339, + "step": 9034 + }, + { + "epoch": 0.9228804902962207, + "grad_norm": 1.5067663275072332, + "learning_rate": 3.103295351519564e-07, + "loss": 0.6743, + "step": 9035 + }, + { + "epoch": 0.9229826353421859, + "grad_norm": 1.5274048832959097, + "learning_rate": 3.0951227892015876e-07, + "loss": 0.6449, + "step": 9036 + }, + { + "epoch": 0.9230847803881512, + "grad_norm": 1.4777313174468392, + "learning_rate": 3.0869608331850486e-07, + "loss": 0.6378, + "step": 9037 + }, + { + "epoch": 0.9231869254341164, + "grad_norm": 1.438864450074164, + "learning_rate": 3.0788094843632655e-07, + "loss": 0.5977, + "step": 9038 + }, + { + "epoch": 0.9232890704800817, + "grad_norm": 1.455246240118996, + "learning_rate": 3.070668743628391e-07, + "loss": 0.6984, + "step": 9039 + }, + { + "epoch": 0.923391215526047, + "grad_norm": 1.5559062050081613, + "learning_rate": 3.0625386118714463e-07, + "loss": 0.7428, + "step": 9040 + }, + { + "epoch": 0.9234933605720123, + "grad_norm": 1.506460833201786, + "learning_rate": 3.0544190899822743e-07, + "loss": 0.6558, + "step": 9041 + }, + { + "epoch": 0.9235955056179775, + "grad_norm": 1.3640118646974595, + "learning_rate": 3.0463101788495766e-07, + "loss": 0.6507, + "step": 9042 + }, + { + "epoch": 0.9236976506639428, + "grad_norm": 1.3886200154493422, + "learning_rate": 3.038211879360875e-07, + "loss": 0.6353, + "step": 9043 + }, + { + "epoch": 0.9237997957099081, + "grad_norm": 1.4802388563961917, + "learning_rate": 3.0301241924025174e-07, + "loss": 0.7455, + "step": 9044 + }, + { + "epoch": 0.9239019407558733, + "grad_norm": 1.5268293932488866, + "learning_rate": 3.0220471188597167e-07, + "loss": 0.6755, + "step": 9045 + }, + { + "epoch": 0.9240040858018386, + "grad_norm": 1.5967341125122239, + "learning_rate": 3.0139806596165335e-07, + "loss": 0.6352, + "step": 9046 + }, + { + "epoch": 0.9241062308478039, + "grad_norm": 1.5912764583827683, + "learning_rate": 3.0059248155558164e-07, + "loss": 0.7739, + "step": 9047 + }, + { + "epoch": 0.9242083758937691, + "grad_norm": 1.283414034363248, + "learning_rate": 2.997879587559294e-07, + "loss": 0.5687, + "step": 9048 + }, + { + "epoch": 0.9243105209397344, + "grad_norm": 1.605010899614883, + "learning_rate": 2.9898449765075165e-07, + "loss": 0.6611, + "step": 9049 + }, + { + "epoch": 0.9244126659856997, + "grad_norm": 1.4240546772911074, + "learning_rate": 2.981820983279893e-07, + "loss": 0.6561, + "step": 9050 + }, + { + "epoch": 0.924514811031665, + "grad_norm": 1.474986078995608, + "learning_rate": 2.9738076087546306e-07, + "loss": 0.6954, + "step": 9051 + }, + { + "epoch": 0.9246169560776303, + "grad_norm": 1.698047494105655, + "learning_rate": 2.965804853808818e-07, + "loss": 0.7551, + "step": 9052 + }, + { + "epoch": 0.9247191011235955, + "grad_norm": 1.4638386748394576, + "learning_rate": 2.957812719318365e-07, + "loss": 0.6655, + "step": 9053 + }, + { + "epoch": 0.9248212461695607, + "grad_norm": 1.4059410997079163, + "learning_rate": 2.9498312061580047e-07, + "loss": 0.6918, + "step": 9054 + }, + { + "epoch": 0.924923391215526, + "grad_norm": 1.660114259671006, + "learning_rate": 2.941860315201317e-07, + "loss": 0.7143, + "step": 9055 + }, + { + "epoch": 0.9250255362614913, + "grad_norm": 1.4605432428247527, + "learning_rate": 2.933900047320726e-07, + "loss": 0.6553, + "step": 9056 + }, + { + "epoch": 0.9251276813074566, + "grad_norm": 1.554556514259989, + "learning_rate": 2.9259504033874787e-07, + "loss": 0.699, + "step": 9057 + }, + { + "epoch": 0.9252298263534219, + "grad_norm": 1.4752230167128741, + "learning_rate": 2.9180113842716904e-07, + "loss": 0.6703, + "step": 9058 + }, + { + "epoch": 0.9253319713993872, + "grad_norm": 1.6679577352754222, + "learning_rate": 2.9100829908422777e-07, + "loss": 0.8089, + "step": 9059 + }, + { + "epoch": 0.9254341164453525, + "grad_norm": 1.4411931186836937, + "learning_rate": 2.9021652239670015e-07, + "loss": 0.7455, + "step": 9060 + }, + { + "epoch": 0.9255362614913176, + "grad_norm": 1.50904518415986, + "learning_rate": 2.8942580845124913e-07, + "loss": 0.6452, + "step": 9061 + }, + { + "epoch": 0.9256384065372829, + "grad_norm": 1.392430782657852, + "learning_rate": 2.886361573344165e-07, + "loss": 0.6879, + "step": 9062 + }, + { + "epoch": 0.9257405515832482, + "grad_norm": 1.4677125556861943, + "learning_rate": 2.878475691326299e-07, + "loss": 0.6369, + "step": 9063 + }, + { + "epoch": 0.9258426966292135, + "grad_norm": 1.392628003431798, + "learning_rate": 2.8706004393220357e-07, + "loss": 0.6439, + "step": 9064 + }, + { + "epoch": 0.9259448416751788, + "grad_norm": 1.4078379416764641, + "learning_rate": 2.862735818193285e-07, + "loss": 0.7396, + "step": 9065 + }, + { + "epoch": 0.926046986721144, + "grad_norm": 1.6687131859910673, + "learning_rate": 2.8548818288008594e-07, + "loss": 0.725, + "step": 9066 + }, + { + "epoch": 0.9261491317671093, + "grad_norm": 1.376749934669538, + "learning_rate": 2.847038472004393e-07, + "loss": 0.6054, + "step": 9067 + }, + { + "epoch": 0.9262512768130746, + "grad_norm": 1.4272631722917932, + "learning_rate": 2.8392057486623326e-07, + "loss": 0.6908, + "step": 9068 + }, + { + "epoch": 0.9263534218590398, + "grad_norm": 1.5680103160275558, + "learning_rate": 2.831383659631981e-07, + "loss": 0.6256, + "step": 9069 + }, + { + "epoch": 0.9264555669050051, + "grad_norm": 1.4734374085034243, + "learning_rate": 2.8235722057694534e-07, + "loss": 0.6608, + "step": 9070 + }, + { + "epoch": 0.9265577119509704, + "grad_norm": 1.3950797085421625, + "learning_rate": 2.815771387929744e-07, + "loss": 0.6494, + "step": 9071 + }, + { + "epoch": 0.9266598569969356, + "grad_norm": 1.5619883049854058, + "learning_rate": 2.807981206966648e-07, + "loss": 0.6533, + "step": 9072 + }, + { + "epoch": 0.9267620020429009, + "grad_norm": 1.4889887493473386, + "learning_rate": 2.800201663732782e-07, + "loss": 0.718, + "step": 9073 + }, + { + "epoch": 0.9268641470888662, + "grad_norm": 1.4574939610320594, + "learning_rate": 2.792432759079666e-07, + "loss": 0.5701, + "step": 9074 + }, + { + "epoch": 0.9269662921348315, + "grad_norm": 1.5346680627067606, + "learning_rate": 2.7846744938575753e-07, + "loss": 0.6977, + "step": 9075 + }, + { + "epoch": 0.9270684371807967, + "grad_norm": 1.5077624502454983, + "learning_rate": 2.776926868915675e-07, + "loss": 0.6703, + "step": 9076 + }, + { + "epoch": 0.927170582226762, + "grad_norm": 1.3994226998726766, + "learning_rate": 2.7691898851019526e-07, + "loss": 0.579, + "step": 9077 + }, + { + "epoch": 0.9272727272727272, + "grad_norm": 1.4955842533883532, + "learning_rate": 2.76146354326321e-07, + "loss": 0.606, + "step": 9078 + }, + { + "epoch": 0.9273748723186925, + "grad_norm": 1.5446520803579855, + "learning_rate": 2.7537478442450914e-07, + "loss": 0.8061, + "step": 9079 + }, + { + "epoch": 0.9274770173646578, + "grad_norm": 1.4336480407366239, + "learning_rate": 2.7460427888921116e-07, + "loss": 0.7232, + "step": 9080 + }, + { + "epoch": 0.9275791624106231, + "grad_norm": 1.4667826062828704, + "learning_rate": 2.738348378047584e-07, + "loss": 0.717, + "step": 9081 + }, + { + "epoch": 0.9276813074565884, + "grad_norm": 1.5105158315964704, + "learning_rate": 2.7306646125536685e-07, + "loss": 0.7791, + "step": 9082 + }, + { + "epoch": 0.9277834525025537, + "grad_norm": 1.3897424326179395, + "learning_rate": 2.7229914932513477e-07, + "loss": 0.6402, + "step": 9083 + }, + { + "epoch": 0.9278855975485188, + "grad_norm": 1.4825048051652712, + "learning_rate": 2.715329020980473e-07, + "loss": 0.5874, + "step": 9084 + }, + { + "epoch": 0.9279877425944841, + "grad_norm": 1.3676974167783438, + "learning_rate": 2.707677196579672e-07, + "loss": 0.6263, + "step": 9085 + }, + { + "epoch": 0.9280898876404494, + "grad_norm": 1.4622831549654354, + "learning_rate": 2.700036020886465e-07, + "loss": 0.6529, + "step": 9086 + }, + { + "epoch": 0.9281920326864147, + "grad_norm": 1.3961191062572362, + "learning_rate": 2.6924054947371935e-07, + "loss": 0.5848, + "step": 9087 + }, + { + "epoch": 0.92829417773238, + "grad_norm": 1.5778766938303936, + "learning_rate": 2.6847856189670005e-07, + "loss": 0.6885, + "step": 9088 + }, + { + "epoch": 0.9283963227783453, + "grad_norm": 1.5088768814175193, + "learning_rate": 2.677176394409886e-07, + "loss": 0.7792, + "step": 9089 + }, + { + "epoch": 0.9284984678243106, + "grad_norm": 1.5375145524313174, + "learning_rate": 2.6695778218986837e-07, + "loss": 0.6525, + "step": 9090 + }, + { + "epoch": 0.9286006128702758, + "grad_norm": 1.490609658669047, + "learning_rate": 2.661989902265094e-07, + "loss": 0.8083, + "step": 9091 + }, + { + "epoch": 0.928702757916241, + "grad_norm": 1.5375816327136986, + "learning_rate": 2.6544126363395985e-07, + "loss": 0.7022, + "step": 9092 + }, + { + "epoch": 0.9288049029622063, + "grad_norm": 1.5356260020596588, + "learning_rate": 2.6468460249515326e-07, + "loss": 0.6506, + "step": 9093 + }, + { + "epoch": 0.9289070480081716, + "grad_norm": 1.5062816751353731, + "learning_rate": 2.6392900689290566e-07, + "loss": 0.67, + "step": 9094 + }, + { + "epoch": 0.9290091930541369, + "grad_norm": 1.3357023287484329, + "learning_rate": 2.6317447690991983e-07, + "loss": 0.6828, + "step": 9095 + }, + { + "epoch": 0.9291113381001022, + "grad_norm": 1.6362464523164053, + "learning_rate": 2.624210126287774e-07, + "loss": 0.7265, + "step": 9096 + }, + { + "epoch": 0.9292134831460674, + "grad_norm": 1.4658188218373034, + "learning_rate": 2.6166861413194575e-07, + "loss": 0.5894, + "step": 9097 + }, + { + "epoch": 0.9293156281920327, + "grad_norm": 1.3602847075444386, + "learning_rate": 2.60917281501778e-07, + "loss": 0.5892, + "step": 9098 + }, + { + "epoch": 0.9294177732379979, + "grad_norm": 1.4237917067622923, + "learning_rate": 2.6016701482050377e-07, + "loss": 0.6038, + "step": 9099 + }, + { + "epoch": 0.9295199182839632, + "grad_norm": 1.4231037093136365, + "learning_rate": 2.5941781417024413e-07, + "loss": 0.6602, + "step": 9100 + }, + { + "epoch": 0.9296220633299285, + "grad_norm": 1.421210852654467, + "learning_rate": 2.5866967963299797e-07, + "loss": 0.4705, + "step": 9101 + }, + { + "epoch": 0.9297242083758938, + "grad_norm": 1.3909707842132233, + "learning_rate": 2.579226112906486e-07, + "loss": 0.7635, + "step": 9102 + }, + { + "epoch": 0.929826353421859, + "grad_norm": 1.4334227508034074, + "learning_rate": 2.571766092249639e-07, + "loss": 0.6757, + "step": 9103 + }, + { + "epoch": 0.9299284984678243, + "grad_norm": 1.5094007575381898, + "learning_rate": 2.5643167351759315e-07, + "loss": 0.61, + "step": 9104 + }, + { + "epoch": 0.9300306435137896, + "grad_norm": 1.6028771423213608, + "learning_rate": 2.5568780425007103e-07, + "loss": 0.6586, + "step": 9105 + }, + { + "epoch": 0.9301327885597549, + "grad_norm": 1.456463712651084, + "learning_rate": 2.5494500150381463e-07, + "loss": 0.7057, + "step": 9106 + }, + { + "epoch": 0.9302349336057201, + "grad_norm": 1.546546619626533, + "learning_rate": 2.542032653601245e-07, + "loss": 0.6972, + "step": 9107 + }, + { + "epoch": 0.9303370786516854, + "grad_norm": 1.5600063354197171, + "learning_rate": 2.534625959001835e-07, + "loss": 0.7408, + "step": 9108 + }, + { + "epoch": 0.9304392236976506, + "grad_norm": 1.720129882822969, + "learning_rate": 2.527229932050579e-07, + "loss": 0.6684, + "step": 9109 + }, + { + "epoch": 0.9305413687436159, + "grad_norm": 1.553326871768726, + "learning_rate": 2.5198445735569844e-07, + "loss": 0.6111, + "step": 9110 + }, + { + "epoch": 0.9306435137895812, + "grad_norm": 1.3900504869759245, + "learning_rate": 2.5124698843293824e-07, + "loss": 0.7346, + "step": 9111 + }, + { + "epoch": 0.9307456588355465, + "grad_norm": 1.4525328659889858, + "learning_rate": 2.505105865174939e-07, + "loss": 0.7573, + "step": 9112 + }, + { + "epoch": 0.9308478038815118, + "grad_norm": 1.564158941724999, + "learning_rate": 2.4977525168996433e-07, + "loss": 0.765, + "step": 9113 + }, + { + "epoch": 0.9309499489274771, + "grad_norm": 1.5094116640678852, + "learning_rate": 2.490409840308328e-07, + "loss": 0.6589, + "step": 9114 + }, + { + "epoch": 0.9310520939734422, + "grad_norm": 1.3633111354005205, + "learning_rate": 2.483077836204661e-07, + "loss": 0.5708, + "step": 9115 + }, + { + "epoch": 0.9311542390194075, + "grad_norm": 1.7184992629284188, + "learning_rate": 2.475756505391125e-07, + "loss": 0.6931, + "step": 9116 + }, + { + "epoch": 0.9312563840653728, + "grad_norm": 1.4929170250902024, + "learning_rate": 2.468445848669054e-07, + "loss": 0.586, + "step": 9117 + }, + { + "epoch": 0.9313585291113381, + "grad_norm": 1.5889289366213262, + "learning_rate": 2.461145866838599e-07, + "loss": 0.7276, + "step": 9118 + }, + { + "epoch": 0.9314606741573034, + "grad_norm": 1.3105678542476284, + "learning_rate": 2.453856560698731e-07, + "loss": 0.6261, + "step": 9119 + }, + { + "epoch": 0.9315628192032687, + "grad_norm": 1.5753071289609075, + "learning_rate": 2.4465779310473e-07, + "loss": 0.6837, + "step": 9120 + }, + { + "epoch": 0.931664964249234, + "grad_norm": 1.4367084370419283, + "learning_rate": 2.439309978680926e-07, + "loss": 0.6192, + "step": 9121 + }, + { + "epoch": 0.9317671092951992, + "grad_norm": 1.4839780640277067, + "learning_rate": 2.432052704395127e-07, + "loss": 0.6672, + "step": 9122 + }, + { + "epoch": 0.9318692543411644, + "grad_norm": 1.6337771619732901, + "learning_rate": 2.42480610898419e-07, + "loss": 0.7254, + "step": 9123 + }, + { + "epoch": 0.9319713993871297, + "grad_norm": 1.4842092966190576, + "learning_rate": 2.417570193241259e-07, + "loss": 0.6788, + "step": 9124 + }, + { + "epoch": 0.932073544433095, + "grad_norm": 1.5510836969257646, + "learning_rate": 2.4103449579583217e-07, + "loss": 0.6493, + "step": 9125 + }, + { + "epoch": 0.9321756894790603, + "grad_norm": 1.4659656858443209, + "learning_rate": 2.4031304039261907e-07, + "loss": 0.6895, + "step": 9126 + }, + { + "epoch": 0.9322778345250256, + "grad_norm": 1.524726494334057, + "learning_rate": 2.395926531934478e-07, + "loss": 0.8226, + "step": 9127 + }, + { + "epoch": 0.9323799795709908, + "grad_norm": 1.6145203447749423, + "learning_rate": 2.3887333427716654e-07, + "loss": 0.6599, + "step": 9128 + }, + { + "epoch": 0.9324821246169561, + "grad_norm": 1.5572245511718075, + "learning_rate": 2.381550837225055e-07, + "loss": 0.6889, + "step": 9129 + }, + { + "epoch": 0.9325842696629213, + "grad_norm": 1.5983137366946285, + "learning_rate": 2.3743790160807746e-07, + "loss": 0.6297, + "step": 9130 + }, + { + "epoch": 0.9326864147088866, + "grad_norm": 1.5457715938545957, + "learning_rate": 2.3672178801237954e-07, + "loss": 0.7167, + "step": 9131 + }, + { + "epoch": 0.9327885597548519, + "grad_norm": 1.5201206155782399, + "learning_rate": 2.3600674301379022e-07, + "loss": 0.7646, + "step": 9132 + }, + { + "epoch": 0.9328907048008171, + "grad_norm": 1.4822701675640622, + "learning_rate": 2.3529276669056911e-07, + "loss": 0.6677, + "step": 9133 + }, + { + "epoch": 0.9329928498467824, + "grad_norm": 1.7607538306372794, + "learning_rate": 2.345798591208648e-07, + "loss": 0.6689, + "step": 9134 + }, + { + "epoch": 0.9330949948927477, + "grad_norm": 1.3750748004042912, + "learning_rate": 2.3386802038270484e-07, + "loss": 0.5917, + "step": 9135 + }, + { + "epoch": 0.933197139938713, + "grad_norm": 1.4940383946217, + "learning_rate": 2.3315725055399918e-07, + "loss": 0.6666, + "step": 9136 + }, + { + "epoch": 0.9332992849846783, + "grad_norm": 1.5364699432899165, + "learning_rate": 2.3244754971254113e-07, + "loss": 0.7197, + "step": 9137 + }, + { + "epoch": 0.9334014300306435, + "grad_norm": 1.7078066608739093, + "learning_rate": 2.3173891793601077e-07, + "loss": 0.8291, + "step": 9138 + }, + { + "epoch": 0.9335035750766087, + "grad_norm": 1.6523859493235273, + "learning_rate": 2.3103135530196607e-07, + "loss": 0.7272, + "step": 9139 + }, + { + "epoch": 0.933605720122574, + "grad_norm": 1.5748282987865894, + "learning_rate": 2.3032486188785286e-07, + "loss": 0.689, + "step": 9140 + }, + { + "epoch": 0.9337078651685393, + "grad_norm": 1.386508424126899, + "learning_rate": 2.2961943777099484e-07, + "loss": 0.6643, + "step": 9141 + }, + { + "epoch": 0.9338100102145046, + "grad_norm": 1.316166920623976, + "learning_rate": 2.2891508302860133e-07, + "loss": 0.5659, + "step": 9142 + }, + { + "epoch": 0.9339121552604699, + "grad_norm": 1.515443110306196, + "learning_rate": 2.2821179773776404e-07, + "loss": 0.666, + "step": 9143 + }, + { + "epoch": 0.9340143003064352, + "grad_norm": 1.4330229975422812, + "learning_rate": 2.2750958197546026e-07, + "loss": 0.7577, + "step": 9144 + }, + { + "epoch": 0.9341164453524005, + "grad_norm": 1.4537149398720624, + "learning_rate": 2.2680843581854627e-07, + "loss": 0.68, + "step": 9145 + }, + { + "epoch": 0.9342185903983656, + "grad_norm": 1.5610385085610248, + "learning_rate": 2.2610835934376296e-07, + "loss": 0.7819, + "step": 9146 + }, + { + "epoch": 0.9343207354443309, + "grad_norm": 1.4532287269657076, + "learning_rate": 2.2540935262773568e-07, + "loss": 0.8448, + "step": 9147 + }, + { + "epoch": 0.9344228804902962, + "grad_norm": 1.413809825646478, + "learning_rate": 2.2471141574696765e-07, + "loss": 0.6938, + "step": 9148 + }, + { + "epoch": 0.9345250255362615, + "grad_norm": 1.578987626850694, + "learning_rate": 2.2401454877785223e-07, + "loss": 0.7897, + "step": 9149 + }, + { + "epoch": 0.9346271705822268, + "grad_norm": 1.3978398053753782, + "learning_rate": 2.2331875179666174e-07, + "loss": 0.6519, + "step": 9150 + }, + { + "epoch": 0.9347293156281921, + "grad_norm": 1.3893884288178022, + "learning_rate": 2.2262402487954859e-07, + "loss": 0.6452, + "step": 9151 + }, + { + "epoch": 0.9348314606741573, + "grad_norm": 1.4805054804097963, + "learning_rate": 2.2193036810255418e-07, + "loss": 0.643, + "step": 9152 + }, + { + "epoch": 0.9349336057201226, + "grad_norm": 1.609762265211534, + "learning_rate": 2.2123778154159558e-07, + "loss": 0.6297, + "step": 9153 + }, + { + "epoch": 0.9350357507660878, + "grad_norm": 1.474466727440794, + "learning_rate": 2.2054626527248214e-07, + "loss": 0.6667, + "step": 9154 + }, + { + "epoch": 0.9351378958120531, + "grad_norm": 1.5944723513603682, + "learning_rate": 2.1985581937089883e-07, + "loss": 0.6466, + "step": 9155 + }, + { + "epoch": 0.9352400408580184, + "grad_norm": 1.4484222395771356, + "learning_rate": 2.1916644391241416e-07, + "loss": 0.6751, + "step": 9156 + }, + { + "epoch": 0.9353421859039837, + "grad_norm": 1.5329706700149432, + "learning_rate": 2.1847813897248104e-07, + "loss": 0.691, + "step": 9157 + }, + { + "epoch": 0.9354443309499489, + "grad_norm": 1.4262688920839575, + "learning_rate": 2.1779090462643483e-07, + "loss": 0.6776, + "step": 9158 + }, + { + "epoch": 0.9355464759959142, + "grad_norm": 1.850008836317782, + "learning_rate": 2.171047409494953e-07, + "loss": 0.7105, + "step": 9159 + }, + { + "epoch": 0.9356486210418795, + "grad_norm": 1.539621491780249, + "learning_rate": 2.164196480167624e-07, + "loss": 0.6479, + "step": 9160 + }, + { + "epoch": 0.9357507660878447, + "grad_norm": 1.696711499564608, + "learning_rate": 2.157356259032195e-07, + "loss": 0.7222, + "step": 9161 + }, + { + "epoch": 0.93585291113381, + "grad_norm": 1.5699839132225384, + "learning_rate": 2.1505267468373447e-07, + "loss": 0.7121, + "step": 9162 + }, + { + "epoch": 0.9359550561797753, + "grad_norm": 1.4204585945720691, + "learning_rate": 2.143707944330542e-07, + "loss": 0.5993, + "step": 9163 + }, + { + "epoch": 0.9360572012257405, + "grad_norm": 1.5097873272466122, + "learning_rate": 2.136899852258145e-07, + "loss": 0.621, + "step": 9164 + }, + { + "epoch": 0.9361593462717058, + "grad_norm": 1.472967099270891, + "learning_rate": 2.1301024713652919e-07, + "loss": 0.7536, + "step": 9165 + }, + { + "epoch": 0.9362614913176711, + "grad_norm": 1.4288746778226458, + "learning_rate": 2.123315802395942e-07, + "loss": 0.6153, + "step": 9166 + }, + { + "epoch": 0.9363636363636364, + "grad_norm": 1.8105445982856454, + "learning_rate": 2.1165398460929133e-07, + "loss": 0.608, + "step": 9167 + }, + { + "epoch": 0.9364657814096017, + "grad_norm": 1.5726058721852456, + "learning_rate": 2.1097746031978561e-07, + "loss": 0.731, + "step": 9168 + }, + { + "epoch": 0.9365679264555669, + "grad_norm": 1.4543231329066133, + "learning_rate": 2.1030200744511896e-07, + "loss": 0.6115, + "step": 9169 + }, + { + "epoch": 0.9366700715015321, + "grad_norm": 1.546267395817543, + "learning_rate": 2.096276260592245e-07, + "loss": 0.7379, + "step": 9170 + }, + { + "epoch": 0.9367722165474974, + "grad_norm": 1.6224828727306777, + "learning_rate": 2.089543162359109e-07, + "loss": 0.6338, + "step": 9171 + }, + { + "epoch": 0.9368743615934627, + "grad_norm": 1.521620274363574, + "learning_rate": 2.0828207804887367e-07, + "loss": 0.7493, + "step": 9172 + }, + { + "epoch": 0.936976506639428, + "grad_norm": 1.7000889555108607, + "learning_rate": 2.0761091157168844e-07, + "loss": 0.7303, + "step": 9173 + }, + { + "epoch": 0.9370786516853933, + "grad_norm": 1.5214493794892727, + "learning_rate": 2.0694081687781642e-07, + "loss": 0.6618, + "step": 9174 + }, + { + "epoch": 0.9371807967313586, + "grad_norm": 1.467202499243737, + "learning_rate": 2.0627179404060004e-07, + "loss": 0.6026, + "step": 9175 + }, + { + "epoch": 0.9372829417773239, + "grad_norm": 1.3654687437728887, + "learning_rate": 2.0560384313326187e-07, + "loss": 0.5816, + "step": 9176 + }, + { + "epoch": 0.937385086823289, + "grad_norm": 1.4692067681497907, + "learning_rate": 2.049369642289112e-07, + "loss": 0.6527, + "step": 9177 + }, + { + "epoch": 0.9374872318692543, + "grad_norm": 1.654689827699916, + "learning_rate": 2.0427115740053737e-07, + "loss": 0.6393, + "step": 9178 + }, + { + "epoch": 0.9375893769152196, + "grad_norm": 1.4348862470190473, + "learning_rate": 2.036064227210155e-07, + "loss": 0.633, + "step": 9179 + }, + { + "epoch": 0.9376915219611849, + "grad_norm": 1.4484687154579214, + "learning_rate": 2.0294276026310066e-07, + "loss": 0.5937, + "step": 9180 + }, + { + "epoch": 0.9377936670071502, + "grad_norm": 1.4794775571299401, + "learning_rate": 2.0228017009942924e-07, + "loss": 0.7494, + "step": 9181 + }, + { + "epoch": 0.9378958120531155, + "grad_norm": 1.4270010521828056, + "learning_rate": 2.016186523025232e-07, + "loss": 0.6545, + "step": 9182 + }, + { + "epoch": 0.9379979570990807, + "grad_norm": 1.3911152080698819, + "learning_rate": 2.0095820694478685e-07, + "loss": 0.642, + "step": 9183 + }, + { + "epoch": 0.9381001021450459, + "grad_norm": 1.4067780513913422, + "learning_rate": 2.002988340985057e-07, + "loss": 0.6205, + "step": 9184 + }, + { + "epoch": 0.9382022471910112, + "grad_norm": 1.4203092045042007, + "learning_rate": 1.996405338358476e-07, + "loss": 0.727, + "step": 9185 + }, + { + "epoch": 0.9383043922369765, + "grad_norm": 1.4313456146321495, + "learning_rate": 1.9898330622886597e-07, + "loss": 0.657, + "step": 9186 + }, + { + "epoch": 0.9384065372829418, + "grad_norm": 1.5764090005522304, + "learning_rate": 1.9832715134949333e-07, + "loss": 0.7166, + "step": 9187 + }, + { + "epoch": 0.938508682328907, + "grad_norm": 1.546219421158126, + "learning_rate": 1.9767206926954662e-07, + "loss": 0.7675, + "step": 9188 + }, + { + "epoch": 0.9386108273748723, + "grad_norm": 1.5311481796682007, + "learning_rate": 1.9701806006072522e-07, + "loss": 0.6447, + "step": 9189 + }, + { + "epoch": 0.9387129724208376, + "grad_norm": 1.5861926173047243, + "learning_rate": 1.963651237946107e-07, + "loss": 0.7269, + "step": 9190 + }, + { + "epoch": 0.9388151174668029, + "grad_norm": 1.4093799510424223, + "learning_rate": 1.9571326054266814e-07, + "loss": 0.5907, + "step": 9191 + }, + { + "epoch": 0.9389172625127681, + "grad_norm": 1.3379601530316063, + "learning_rate": 1.950624703762416e-07, + "loss": 0.6797, + "step": 9192 + }, + { + "epoch": 0.9390194075587334, + "grad_norm": 1.5222326324431352, + "learning_rate": 1.9441275336656294e-07, + "loss": 0.5817, + "step": 9193 + }, + { + "epoch": 0.9391215526046987, + "grad_norm": 1.509801818368835, + "learning_rate": 1.9376410958474534e-07, + "loss": 0.7005, + "step": 9194 + }, + { + "epoch": 0.9392236976506639, + "grad_norm": 1.5110512363911448, + "learning_rate": 1.9311653910178085e-07, + "loss": 0.6657, + "step": 9195 + }, + { + "epoch": 0.9393258426966292, + "grad_norm": 1.649271908127654, + "learning_rate": 1.9247004198854724e-07, + "loss": 0.7076, + "step": 9196 + }, + { + "epoch": 0.9394279877425945, + "grad_norm": 1.4048627828985776, + "learning_rate": 1.9182461831580234e-07, + "loss": 0.6968, + "step": 9197 + }, + { + "epoch": 0.9395301327885598, + "grad_norm": 1.5583964896608415, + "learning_rate": 1.9118026815419188e-07, + "loss": 0.689, + "step": 9198 + }, + { + "epoch": 0.9396322778345251, + "grad_norm": 1.417301813944971, + "learning_rate": 1.9053699157423834e-07, + "loss": 0.6415, + "step": 9199 + }, + { + "epoch": 0.9397344228804902, + "grad_norm": 1.51873963909895, + "learning_rate": 1.898947886463487e-07, + "loss": 0.7756, + "step": 9200 + }, + { + "epoch": 0.9398365679264555, + "grad_norm": 1.589273558421054, + "learning_rate": 1.892536594408112e-07, + "loss": 0.654, + "step": 9201 + }, + { + "epoch": 0.9399387129724208, + "grad_norm": 1.4869529434282553, + "learning_rate": 1.8861360402779972e-07, + "loss": 0.8251, + "step": 9202 + }, + { + "epoch": 0.9400408580183861, + "grad_norm": 1.437537377279064, + "learning_rate": 1.879746224773682e-07, + "loss": 0.6675, + "step": 9203 + }, + { + "epoch": 0.9401430030643514, + "grad_norm": 1.5107359945466787, + "learning_rate": 1.8733671485945516e-07, + "loss": 0.7291, + "step": 9204 + }, + { + "epoch": 0.9402451481103167, + "grad_norm": 1.4939796767948221, + "learning_rate": 1.8669988124387695e-07, + "loss": 0.7402, + "step": 9205 + }, + { + "epoch": 0.940347293156282, + "grad_norm": 1.5448036014027644, + "learning_rate": 1.8606412170033783e-07, + "loss": 0.6909, + "step": 9206 + }, + { + "epoch": 0.9404494382022472, + "grad_norm": 1.6652358930281643, + "learning_rate": 1.8542943629841993e-07, + "loss": 0.7863, + "step": 9207 + }, + { + "epoch": 0.9405515832482124, + "grad_norm": 1.67465360743183, + "learning_rate": 1.847958251075921e-07, + "loss": 0.6942, + "step": 9208 + }, + { + "epoch": 0.9406537282941777, + "grad_norm": 1.3793255517744074, + "learning_rate": 1.841632881972022e-07, + "loss": 0.6659, + "step": 9209 + }, + { + "epoch": 0.940755873340143, + "grad_norm": 1.5210946436378445, + "learning_rate": 1.8353182563648264e-07, + "loss": 0.7153, + "step": 9210 + }, + { + "epoch": 0.9408580183861083, + "grad_norm": 1.4801130401798237, + "learning_rate": 1.8290143749454813e-07, + "loss": 0.6562, + "step": 9211 + }, + { + "epoch": 0.9409601634320736, + "grad_norm": 1.487967576262061, + "learning_rate": 1.822721238403924e-07, + "loss": 0.616, + "step": 9212 + }, + { + "epoch": 0.9410623084780388, + "grad_norm": 1.565554998305901, + "learning_rate": 1.8164388474289584e-07, + "loss": 0.7236, + "step": 9213 + }, + { + "epoch": 0.9411644535240041, + "grad_norm": 1.5585291193941702, + "learning_rate": 1.8101672027082018e-07, + "loss": 0.6923, + "step": 9214 + }, + { + "epoch": 0.9412665985699693, + "grad_norm": 1.4756991123897936, + "learning_rate": 1.8039063049280824e-07, + "loss": 0.6622, + "step": 9215 + }, + { + "epoch": 0.9413687436159346, + "grad_norm": 1.6939074763150774, + "learning_rate": 1.7976561547738526e-07, + "loss": 0.721, + "step": 9216 + }, + { + "epoch": 0.9414708886618999, + "grad_norm": 1.5277423455869896, + "learning_rate": 1.791416752929609e-07, + "loss": 0.6835, + "step": 9217 + }, + { + "epoch": 0.9415730337078652, + "grad_norm": 1.498827750617463, + "learning_rate": 1.7851881000782612e-07, + "loss": 0.6832, + "step": 9218 + }, + { + "epoch": 0.9416751787538304, + "grad_norm": 1.4716570593858127, + "learning_rate": 1.7789701969015195e-07, + "loss": 0.6967, + "step": 9219 + }, + { + "epoch": 0.9417773237997957, + "grad_norm": 1.511468326703934, + "learning_rate": 1.7727630440799615e-07, + "loss": 0.693, + "step": 9220 + }, + { + "epoch": 0.941879468845761, + "grad_norm": 1.3614925169175607, + "learning_rate": 1.766566642292933e-07, + "loss": 0.6452, + "step": 9221 + }, + { + "epoch": 0.9419816138917263, + "grad_norm": 1.530722700688656, + "learning_rate": 1.7603809922186688e-07, + "loss": 0.7506, + "step": 9222 + }, + { + "epoch": 0.9420837589376915, + "grad_norm": 1.6692969357417948, + "learning_rate": 1.754206094534161e-07, + "loss": 0.8624, + "step": 9223 + }, + { + "epoch": 0.9421859039836568, + "grad_norm": 1.503319904228773, + "learning_rate": 1.7480419499152912e-07, + "loss": 0.6789, + "step": 9224 + }, + { + "epoch": 0.942288049029622, + "grad_norm": 1.57641052138822, + "learning_rate": 1.7418885590366864e-07, + "loss": 0.7784, + "step": 9225 + }, + { + "epoch": 0.9423901940755873, + "grad_norm": 1.5827297238864475, + "learning_rate": 1.735745922571874e-07, + "loss": 0.7343, + "step": 9226 + }, + { + "epoch": 0.9424923391215526, + "grad_norm": 1.4827221172831844, + "learning_rate": 1.7296140411931507e-07, + "loss": 0.6493, + "step": 9227 + }, + { + "epoch": 0.9425944841675179, + "grad_norm": 1.4520911571448938, + "learning_rate": 1.7234929155716673e-07, + "loss": 0.6955, + "step": 9228 + }, + { + "epoch": 0.9426966292134832, + "grad_norm": 1.7208341226702184, + "learning_rate": 1.7173825463773774e-07, + "loss": 0.765, + "step": 9229 + }, + { + "epoch": 0.9427987742594485, + "grad_norm": 1.4892207631146965, + "learning_rate": 1.7112829342790683e-07, + "loss": 0.7009, + "step": 9230 + }, + { + "epoch": 0.9429009193054136, + "grad_norm": 1.332369164366498, + "learning_rate": 1.705194079944339e-07, + "loss": 0.5831, + "step": 9231 + }, + { + "epoch": 0.9430030643513789, + "grad_norm": 1.4322658593830402, + "learning_rate": 1.6991159840396342e-07, + "loss": 0.6594, + "step": 9232 + }, + { + "epoch": 0.9431052093973442, + "grad_norm": 1.4983619132286408, + "learning_rate": 1.6930486472301888e-07, + "loss": 0.6423, + "step": 9233 + }, + { + "epoch": 0.9432073544433095, + "grad_norm": 1.5645139771311332, + "learning_rate": 1.6869920701800824e-07, + "loss": 0.7148, + "step": 9234 + }, + { + "epoch": 0.9433094994892748, + "grad_norm": 1.4439804397427218, + "learning_rate": 1.680946253552218e-07, + "loss": 0.7543, + "step": 9235 + }, + { + "epoch": 0.9434116445352401, + "grad_norm": 1.5507773309709343, + "learning_rate": 1.6749111980082998e-07, + "loss": 0.6626, + "step": 9236 + }, + { + "epoch": 0.9435137895812054, + "grad_norm": 1.5404616595603968, + "learning_rate": 1.6688869042088773e-07, + "loss": 0.7204, + "step": 9237 + }, + { + "epoch": 0.9436159346271705, + "grad_norm": 1.4132372839595242, + "learning_rate": 1.662873372813323e-07, + "loss": 0.654, + "step": 9238 + }, + { + "epoch": 0.9437180796731358, + "grad_norm": 1.6012398614855041, + "learning_rate": 1.65687060447981e-07, + "loss": 0.753, + "step": 9239 + }, + { + "epoch": 0.9438202247191011, + "grad_norm": 1.4663519001830385, + "learning_rate": 1.6508785998653466e-07, + "loss": 0.7042, + "step": 9240 + }, + { + "epoch": 0.9439223697650664, + "grad_norm": 1.3698891442492873, + "learning_rate": 1.6448973596257412e-07, + "loss": 0.6125, + "step": 9241 + }, + { + "epoch": 0.9440245148110317, + "grad_norm": 1.661359149604778, + "learning_rate": 1.6389268844156924e-07, + "loss": 0.7368, + "step": 9242 + }, + { + "epoch": 0.944126659856997, + "grad_norm": 1.5785198621726113, + "learning_rate": 1.6329671748886333e-07, + "loss": 0.7571, + "step": 9243 + }, + { + "epoch": 0.9442288049029622, + "grad_norm": 1.3595098553347953, + "learning_rate": 1.6270182316968753e-07, + "loss": 0.5922, + "step": 9244 + }, + { + "epoch": 0.9443309499489275, + "grad_norm": 1.3770029103886756, + "learning_rate": 1.6210800554915306e-07, + "loss": 0.6736, + "step": 9245 + }, + { + "epoch": 0.9444330949948927, + "grad_norm": 1.538482605999784, + "learning_rate": 1.6151526469225243e-07, + "loss": 0.7113, + "step": 9246 + }, + { + "epoch": 0.944535240040858, + "grad_norm": 1.440233041973123, + "learning_rate": 1.609236006638637e-07, + "loss": 0.5824, + "step": 9247 + }, + { + "epoch": 0.9446373850868233, + "grad_norm": 1.5608158008113064, + "learning_rate": 1.6033301352874396e-07, + "loss": 0.7081, + "step": 9248 + }, + { + "epoch": 0.9447395301327886, + "grad_norm": 1.6535352740355067, + "learning_rate": 1.5974350335153154e-07, + "loss": 0.7602, + "step": 9249 + }, + { + "epoch": 0.9448416751787538, + "grad_norm": 1.534514069717892, + "learning_rate": 1.5915507019675147e-07, + "loss": 0.6574, + "step": 9250 + }, + { + "epoch": 0.9449438202247191, + "grad_norm": 1.6210579115975412, + "learning_rate": 1.5856771412880556e-07, + "loss": 0.771, + "step": 9251 + }, + { + "epoch": 0.9450459652706844, + "grad_norm": 1.3875732925312887, + "learning_rate": 1.579814352119835e-07, + "loss": 0.5545, + "step": 9252 + }, + { + "epoch": 0.9451481103166497, + "grad_norm": 1.5308048078834509, + "learning_rate": 1.5739623351045174e-07, + "loss": 0.6268, + "step": 9253 + }, + { + "epoch": 0.9452502553626149, + "grad_norm": 1.5468445179103205, + "learning_rate": 1.5681210908826127e-07, + "loss": 0.7112, + "step": 9254 + }, + { + "epoch": 0.9453524004085802, + "grad_norm": 1.4732141894788657, + "learning_rate": 1.5622906200934428e-07, + "loss": 0.7093, + "step": 9255 + }, + { + "epoch": 0.9454545454545454, + "grad_norm": 1.618852444463458, + "learning_rate": 1.5564709233751752e-07, + "loss": 0.6604, + "step": 9256 + }, + { + "epoch": 0.9455566905005107, + "grad_norm": 1.349056677169564, + "learning_rate": 1.5506620013647554e-07, + "loss": 0.6521, + "step": 9257 + }, + { + "epoch": 0.945658835546476, + "grad_norm": 1.455499690417291, + "learning_rate": 1.5448638546979866e-07, + "loss": 0.6139, + "step": 9258 + }, + { + "epoch": 0.9457609805924413, + "grad_norm": 1.4138197586894834, + "learning_rate": 1.539076484009494e-07, + "loss": 0.6648, + "step": 9259 + }, + { + "epoch": 0.9458631256384066, + "grad_norm": 1.5234178311215272, + "learning_rate": 1.5332998899326823e-07, + "loss": 0.7855, + "step": 9260 + }, + { + "epoch": 0.9459652706843719, + "grad_norm": 1.5324659588522216, + "learning_rate": 1.5275340730998122e-07, + "loss": 0.7188, + "step": 9261 + }, + { + "epoch": 0.946067415730337, + "grad_norm": 1.5263772527270794, + "learning_rate": 1.521779034141968e-07, + "loss": 0.7074, + "step": 9262 + }, + { + "epoch": 0.9461695607763023, + "grad_norm": 1.5186616472286383, + "learning_rate": 1.5160347736890235e-07, + "loss": 0.6143, + "step": 9263 + }, + { + "epoch": 0.9462717058222676, + "grad_norm": 1.5763677979185726, + "learning_rate": 1.5103012923697201e-07, + "loss": 0.6978, + "step": 9264 + }, + { + "epoch": 0.9463738508682329, + "grad_norm": 1.420818036830884, + "learning_rate": 1.5045785908115563e-07, + "loss": 0.5854, + "step": 9265 + }, + { + "epoch": 0.9464759959141982, + "grad_norm": 1.722681001924757, + "learning_rate": 1.498866669640897e-07, + "loss": 0.7203, + "step": 9266 + }, + { + "epoch": 0.9465781409601635, + "grad_norm": 1.5844118930790396, + "learning_rate": 1.4931655294829317e-07, + "loss": 0.6346, + "step": 9267 + }, + { + "epoch": 0.9466802860061287, + "grad_norm": 1.5041325508194328, + "learning_rate": 1.4874751709616386e-07, + "loss": 0.6227, + "step": 9268 + }, + { + "epoch": 0.9467824310520939, + "grad_norm": 1.516447761938868, + "learning_rate": 1.4817955946998418e-07, + "loss": 0.7494, + "step": 9269 + }, + { + "epoch": 0.9468845760980592, + "grad_norm": 1.6041140912539658, + "learning_rate": 1.4761268013191555e-07, + "loss": 0.6639, + "step": 9270 + }, + { + "epoch": 0.9469867211440245, + "grad_norm": 1.4189435857675738, + "learning_rate": 1.4704687914400605e-07, + "loss": 0.7255, + "step": 9271 + }, + { + "epoch": 0.9470888661899898, + "grad_norm": 1.493706875093232, + "learning_rate": 1.4648215656818066e-07, + "loss": 0.6629, + "step": 9272 + }, + { + "epoch": 0.9471910112359551, + "grad_norm": 1.391188872442206, + "learning_rate": 1.4591851246624878e-07, + "loss": 0.6944, + "step": 9273 + }, + { + "epoch": 0.9472931562819203, + "grad_norm": 1.4587415904822167, + "learning_rate": 1.453559468999033e-07, + "loss": 0.7308, + "step": 9274 + }, + { + "epoch": 0.9473953013278856, + "grad_norm": 1.5452435889215106, + "learning_rate": 1.4479445993071606e-07, + "loss": 0.708, + "step": 9275 + }, + { + "epoch": 0.9474974463738509, + "grad_norm": 1.4266428324147273, + "learning_rate": 1.4423405162014238e-07, + "loss": 0.606, + "step": 9276 + }, + { + "epoch": 0.9475995914198161, + "grad_norm": 1.5547600755760407, + "learning_rate": 1.4367472202951983e-07, + "loss": 0.7454, + "step": 9277 + }, + { + "epoch": 0.9477017364657814, + "grad_norm": 1.5009573897420434, + "learning_rate": 1.4311647122006722e-07, + "loss": 0.634, + "step": 9278 + }, + { + "epoch": 0.9478038815117467, + "grad_norm": 1.5893430883430075, + "learning_rate": 1.425592992528846e-07, + "loss": 0.6661, + "step": 9279 + }, + { + "epoch": 0.947906026557712, + "grad_norm": 1.4437110861867624, + "learning_rate": 1.4200320618895424e-07, + "loss": 0.6271, + "step": 9280 + }, + { + "epoch": 0.9480081716036772, + "grad_norm": 1.5054424654550063, + "learning_rate": 1.4144819208914306e-07, + "loss": 0.7966, + "step": 9281 + }, + { + "epoch": 0.9481103166496425, + "grad_norm": 1.4468942107725755, + "learning_rate": 1.408942570141969e-07, + "loss": 0.7719, + "step": 9282 + }, + { + "epoch": 0.9482124616956078, + "grad_norm": 1.538404982684656, + "learning_rate": 1.4034140102474392e-07, + "loss": 0.6634, + "step": 9283 + }, + { + "epoch": 0.9483146067415731, + "grad_norm": 1.6256133473944119, + "learning_rate": 1.3978962418129572e-07, + "loss": 0.6593, + "step": 9284 + }, + { + "epoch": 0.9484167517875383, + "grad_norm": 1.5959561520228596, + "learning_rate": 1.3923892654424177e-07, + "loss": 0.6736, + "step": 9285 + }, + { + "epoch": 0.9485188968335035, + "grad_norm": 1.6079001448065717, + "learning_rate": 1.386893081738594e-07, + "loss": 0.6901, + "step": 9286 + }, + { + "epoch": 0.9486210418794688, + "grad_norm": 1.6480143755403525, + "learning_rate": 1.3814076913030382e-07, + "loss": 0.7224, + "step": 9287 + }, + { + "epoch": 0.9487231869254341, + "grad_norm": 1.6143665708272015, + "learning_rate": 1.375933094736126e-07, + "loss": 0.6146, + "step": 9288 + }, + { + "epoch": 0.9488253319713994, + "grad_norm": 1.4757356779022706, + "learning_rate": 1.3704692926370444e-07, + "loss": 0.689, + "step": 9289 + }, + { + "epoch": 0.9489274770173647, + "grad_norm": 1.4433858343052577, + "learning_rate": 1.3650162856038153e-07, + "loss": 0.5997, + "step": 9290 + }, + { + "epoch": 0.94902962206333, + "grad_norm": 1.5715214428091502, + "learning_rate": 1.359574074233294e-07, + "loss": 0.7138, + "step": 9291 + }, + { + "epoch": 0.9491317671092953, + "grad_norm": 1.6137140443060531, + "learning_rate": 1.3541426591211272e-07, + "loss": 0.6572, + "step": 9292 + }, + { + "epoch": 0.9492339121552604, + "grad_norm": 1.546687893978786, + "learning_rate": 1.3487220408617718e-07, + "loss": 0.6914, + "step": 9293 + }, + { + "epoch": 0.9493360572012257, + "grad_norm": 1.518421044470318, + "learning_rate": 1.3433122200485315e-07, + "loss": 0.7158, + "step": 9294 + }, + { + "epoch": 0.949438202247191, + "grad_norm": 1.5543457163777104, + "learning_rate": 1.3379131972734884e-07, + "loss": 0.7023, + "step": 9295 + }, + { + "epoch": 0.9495403472931563, + "grad_norm": 1.5975011108937367, + "learning_rate": 1.3325249731276134e-07, + "loss": 0.679, + "step": 9296 + }, + { + "epoch": 0.9496424923391216, + "grad_norm": 1.6024613259367313, + "learning_rate": 1.3271475482006134e-07, + "loss": 0.6689, + "step": 9297 + }, + { + "epoch": 0.9497446373850869, + "grad_norm": 1.4728127270988278, + "learning_rate": 1.321780923081073e-07, + "loss": 0.6868, + "step": 9298 + }, + { + "epoch": 0.9498467824310521, + "grad_norm": 1.62832985516429, + "learning_rate": 1.3164250983563665e-07, + "loss": 0.7491, + "step": 9299 + }, + { + "epoch": 0.9499489274770173, + "grad_norm": 1.5068663065776413, + "learning_rate": 1.3110800746126805e-07, + "loss": 0.7455, + "step": 9300 + }, + { + "epoch": 0.9500510725229826, + "grad_norm": 1.512544023129672, + "learning_rate": 1.3057458524350476e-07, + "loss": 0.5987, + "step": 9301 + }, + { + "epoch": 0.9501532175689479, + "grad_norm": 1.3087888161853105, + "learning_rate": 1.3004224324073e-07, + "loss": 0.6404, + "step": 9302 + }, + { + "epoch": 0.9502553626149132, + "grad_norm": 1.4617228192995908, + "learning_rate": 1.295109815112072e-07, + "loss": 0.6679, + "step": 9303 + }, + { + "epoch": 0.9503575076608785, + "grad_norm": 2.766873574774778, + "learning_rate": 1.2898080011308543e-07, + "loss": 0.669, + "step": 9304 + }, + { + "epoch": 0.9504596527068437, + "grad_norm": 1.491754081125532, + "learning_rate": 1.284516991043927e-07, + "loss": 0.6872, + "step": 9305 + }, + { + "epoch": 0.950561797752809, + "grad_norm": 1.4022827435182528, + "learning_rate": 1.2792367854303933e-07, + "loss": 0.6924, + "step": 9306 + }, + { + "epoch": 0.9506639427987743, + "grad_norm": 1.513732646340184, + "learning_rate": 1.2739673848681688e-07, + "loss": 0.65, + "step": 9307 + }, + { + "epoch": 0.9507660878447395, + "grad_norm": 1.5332202763447564, + "learning_rate": 1.2687087899340144e-07, + "loss": 0.6633, + "step": 9308 + }, + { + "epoch": 0.9508682328907048, + "grad_norm": 1.4633387505028197, + "learning_rate": 1.2634610012034586e-07, + "loss": 0.6771, + "step": 9309 + }, + { + "epoch": 0.95097037793667, + "grad_norm": 1.5941804218443754, + "learning_rate": 1.2582240192508865e-07, + "loss": 0.7348, + "step": 9310 + }, + { + "epoch": 0.9510725229826353, + "grad_norm": 1.5389927026890677, + "learning_rate": 1.2529978446495063e-07, + "loss": 0.5975, + "step": 9311 + }, + { + "epoch": 0.9511746680286006, + "grad_norm": 1.4698878710359864, + "learning_rate": 1.2477824779712932e-07, + "loss": 0.6688, + "step": 9312 + }, + { + "epoch": 0.9512768130745659, + "grad_norm": 1.5144857667993492, + "learning_rate": 1.2425779197871024e-07, + "loss": 0.6762, + "step": 9313 + }, + { + "epoch": 0.9513789581205312, + "grad_norm": 1.4004750619021893, + "learning_rate": 1.2373841706665556e-07, + "loss": 0.6904, + "step": 9314 + }, + { + "epoch": 0.9514811031664965, + "grad_norm": 1.4403383846228797, + "learning_rate": 1.2322012311781205e-07, + "loss": 0.687, + "step": 9315 + }, + { + "epoch": 0.9515832482124617, + "grad_norm": 1.6750578519595565, + "learning_rate": 1.2270291018890767e-07, + "loss": 0.731, + "step": 9316 + }, + { + "epoch": 0.9516853932584269, + "grad_norm": 1.5818994239982531, + "learning_rate": 1.2218677833655157e-07, + "loss": 0.6856, + "step": 9317 + }, + { + "epoch": 0.9517875383043922, + "grad_norm": 1.268700335400305, + "learning_rate": 1.2167172761723412e-07, + "loss": 0.6599, + "step": 9318 + }, + { + "epoch": 0.9518896833503575, + "grad_norm": 1.3657047608950803, + "learning_rate": 1.2115775808732799e-07, + "loss": 0.657, + "step": 9319 + }, + { + "epoch": 0.9519918283963228, + "grad_norm": 1.556957583595499, + "learning_rate": 1.206448698030882e-07, + "loss": 0.5482, + "step": 9320 + }, + { + "epoch": 0.9520939734422881, + "grad_norm": 1.492298082963576, + "learning_rate": 1.2013306282064985e-07, + "loss": 0.6639, + "step": 9321 + }, + { + "epoch": 0.9521961184882534, + "grad_norm": 1.4752585547574129, + "learning_rate": 1.1962233719603144e-07, + "loss": 0.6281, + "step": 9322 + }, + { + "epoch": 0.9522982635342185, + "grad_norm": 1.5151434569571445, + "learning_rate": 1.1911269298513162e-07, + "loss": 0.6453, + "step": 9323 + }, + { + "epoch": 0.9524004085801838, + "grad_norm": 1.5449130116422798, + "learning_rate": 1.186041302437313e-07, + "loss": 0.5954, + "step": 9324 + }, + { + "epoch": 0.9525025536261491, + "grad_norm": 1.453585818555242, + "learning_rate": 1.1809664902749262e-07, + "loss": 0.7877, + "step": 9325 + }, + { + "epoch": 0.9526046986721144, + "grad_norm": 1.5939350188551138, + "learning_rate": 1.1759024939196117e-07, + "loss": 0.6176, + "step": 9326 + }, + { + "epoch": 0.9527068437180797, + "grad_norm": 1.4507339328571252, + "learning_rate": 1.1708493139256149e-07, + "loss": 0.5803, + "step": 9327 + }, + { + "epoch": 0.952808988764045, + "grad_norm": 1.301136977892548, + "learning_rate": 1.1658069508460157e-07, + "loss": 0.555, + "step": 9328 + }, + { + "epoch": 0.9529111338100102, + "grad_norm": 1.3785820368436714, + "learning_rate": 1.1607754052326836e-07, + "loss": 0.5953, + "step": 9329 + }, + { + "epoch": 0.9530132788559755, + "grad_norm": 1.6277974290056627, + "learning_rate": 1.155754677636367e-07, + "loss": 0.7546, + "step": 9330 + }, + { + "epoch": 0.9531154239019407, + "grad_norm": 1.3665797519326228, + "learning_rate": 1.1507447686065487e-07, + "loss": 0.5786, + "step": 9331 + }, + { + "epoch": 0.953217568947906, + "grad_norm": 1.4373501304917398, + "learning_rate": 1.1457456786915788e-07, + "loss": 0.6574, + "step": 9332 + }, + { + "epoch": 0.9533197139938713, + "grad_norm": 1.520732629464046, + "learning_rate": 1.1407574084386197e-07, + "loss": 0.6363, + "step": 9333 + }, + { + "epoch": 0.9534218590398366, + "grad_norm": 1.4503950169192572, + "learning_rate": 1.1357799583936236e-07, + "loss": 0.6362, + "step": 9334 + }, + { + "epoch": 0.9535240040858018, + "grad_norm": 1.6272060895125915, + "learning_rate": 1.130813329101399e-07, + "loss": 0.6273, + "step": 9335 + }, + { + "epoch": 0.9536261491317671, + "grad_norm": 1.5154591964098143, + "learning_rate": 1.1258575211055223e-07, + "loss": 0.6523, + "step": 9336 + }, + { + "epoch": 0.9537282941777324, + "grad_norm": 1.603956192689196, + "learning_rate": 1.1209125349484263e-07, + "loss": 0.7404, + "step": 9337 + }, + { + "epoch": 0.9538304392236977, + "grad_norm": 1.5311618597428467, + "learning_rate": 1.1159783711713335e-07, + "loss": 0.6245, + "step": 9338 + }, + { + "epoch": 0.9539325842696629, + "grad_norm": 1.593823869286098, + "learning_rate": 1.1110550303143008e-07, + "loss": 0.75, + "step": 9339 + }, + { + "epoch": 0.9540347293156282, + "grad_norm": 1.3862848543822053, + "learning_rate": 1.1061425129161752e-07, + "loss": 0.578, + "step": 9340 + }, + { + "epoch": 0.9541368743615934, + "grad_norm": 1.4522786173209419, + "learning_rate": 1.1012408195146596e-07, + "loss": 0.6946, + "step": 9341 + }, + { + "epoch": 0.9542390194075587, + "grad_norm": 1.4420361895243479, + "learning_rate": 1.096349950646225e-07, + "loss": 0.6743, + "step": 9342 + }, + { + "epoch": 0.954341164453524, + "grad_norm": 1.596877206868667, + "learning_rate": 1.0914699068461876e-07, + "loss": 0.6523, + "step": 9343 + }, + { + "epoch": 0.9544433094994893, + "grad_norm": 1.3998708225106409, + "learning_rate": 1.0866006886486757e-07, + "loss": 0.6013, + "step": 9344 + }, + { + "epoch": 0.9545454545454546, + "grad_norm": 1.50468343526662, + "learning_rate": 1.0817422965866187e-07, + "loss": 0.7695, + "step": 9345 + }, + { + "epoch": 0.9546475995914199, + "grad_norm": 1.3692514906557283, + "learning_rate": 1.0768947311917799e-07, + "loss": 0.7239, + "step": 9346 + }, + { + "epoch": 0.954749744637385, + "grad_norm": 1.6372260736670652, + "learning_rate": 1.0720579929947239e-07, + "loss": 0.7884, + "step": 9347 + }, + { + "epoch": 0.9548518896833503, + "grad_norm": 1.353239302600394, + "learning_rate": 1.0672320825248383e-07, + "loss": 0.7376, + "step": 9348 + }, + { + "epoch": 0.9549540347293156, + "grad_norm": 1.3684272137669975, + "learning_rate": 1.0624170003103119e-07, + "loss": 0.6095, + "step": 9349 + }, + { + "epoch": 0.9550561797752809, + "grad_norm": 1.5666563533156415, + "learning_rate": 1.0576127468781782e-07, + "loss": 0.7431, + "step": 9350 + }, + { + "epoch": 0.9551583248212462, + "grad_norm": 1.5055420510615423, + "learning_rate": 1.0528193227542505e-07, + "loss": 0.6715, + "step": 9351 + }, + { + "epoch": 0.9552604698672115, + "grad_norm": 1.4846171725221482, + "learning_rate": 1.0480367284631865e-07, + "loss": 0.6499, + "step": 9352 + }, + { + "epoch": 0.9553626149131768, + "grad_norm": 1.4326165507769513, + "learning_rate": 1.0432649645284121e-07, + "loss": 0.6252, + "step": 9353 + }, + { + "epoch": 0.9554647599591419, + "grad_norm": 1.4765100035115832, + "learning_rate": 1.0385040314722317e-07, + "loss": 0.6469, + "step": 9354 + }, + { + "epoch": 0.9555669050051072, + "grad_norm": 1.6171419773985063, + "learning_rate": 1.0337539298157284e-07, + "loss": 0.6463, + "step": 9355 + }, + { + "epoch": 0.9556690500510725, + "grad_norm": 1.2663803841964771, + "learning_rate": 1.0290146600787865e-07, + "loss": 0.6441, + "step": 9356 + }, + { + "epoch": 0.9557711950970378, + "grad_norm": 1.3253496491213006, + "learning_rate": 1.0242862227801464e-07, + "loss": 0.6188, + "step": 9357 + }, + { + "epoch": 0.9558733401430031, + "grad_norm": 1.5306638179849967, + "learning_rate": 1.0195686184373165e-07, + "loss": 0.7339, + "step": 9358 + }, + { + "epoch": 0.9559754851889684, + "grad_norm": 1.5703878828020956, + "learning_rate": 1.0148618475666505e-07, + "loss": 0.7261, + "step": 9359 + }, + { + "epoch": 0.9560776302349336, + "grad_norm": 1.457665550747588, + "learning_rate": 1.0101659106833139e-07, + "loss": 0.632, + "step": 9360 + }, + { + "epoch": 0.9561797752808989, + "grad_norm": 1.350885491983239, + "learning_rate": 1.0054808083012624e-07, + "loss": 0.6986, + "step": 9361 + }, + { + "epoch": 0.9562819203268641, + "grad_norm": 1.4812234474606873, + "learning_rate": 1.0008065409333079e-07, + "loss": 0.669, + "step": 9362 + }, + { + "epoch": 0.9563840653728294, + "grad_norm": 1.6030211409494997, + "learning_rate": 9.961431090910301e-08, + "loss": 0.6623, + "step": 9363 + }, + { + "epoch": 0.9564862104187947, + "grad_norm": 1.5431489267831724, + "learning_rate": 9.914905132848651e-08, + "loss": 0.6128, + "step": 9364 + }, + { + "epoch": 0.95658835546476, + "grad_norm": 1.5522403341024757, + "learning_rate": 9.868487540240169e-08, + "loss": 0.6819, + "step": 9365 + }, + { + "epoch": 0.9566905005107252, + "grad_norm": 1.3750890110132559, + "learning_rate": 9.822178318165565e-08, + "loss": 0.6588, + "step": 9366 + }, + { + "epoch": 0.9567926455566905, + "grad_norm": 1.4668446674202464, + "learning_rate": 9.775977471693232e-08, + "loss": 0.6428, + "step": 9367 + }, + { + "epoch": 0.9568947906026558, + "grad_norm": 1.5977532017307068, + "learning_rate": 9.729885005879902e-08, + "loss": 0.7452, + "step": 9368 + }, + { + "epoch": 0.9569969356486211, + "grad_norm": 1.4503373504467043, + "learning_rate": 9.683900925770429e-08, + "loss": 0.6075, + "step": 9369 + }, + { + "epoch": 0.9570990806945863, + "grad_norm": 1.5850948464398236, + "learning_rate": 9.638025236397897e-08, + "loss": 0.6982, + "step": 9370 + }, + { + "epoch": 0.9572012257405516, + "grad_norm": 1.715027277017169, + "learning_rate": 9.59225794278329e-08, + "loss": 0.7414, + "step": 9371 + }, + { + "epoch": 0.9573033707865168, + "grad_norm": 1.4886562540181885, + "learning_rate": 9.546599049936045e-08, + "loss": 0.6563, + "step": 9372 + }, + { + "epoch": 0.9574055158324821, + "grad_norm": 1.5013492329991132, + "learning_rate": 9.501048562853277e-08, + "loss": 0.646, + "step": 9373 + }, + { + "epoch": 0.9575076608784474, + "grad_norm": 1.459513253038374, + "learning_rate": 9.455606486520886e-08, + "loss": 0.6805, + "step": 9374 + }, + { + "epoch": 0.9576098059244127, + "grad_norm": 1.4566920212137846, + "learning_rate": 9.410272825912225e-08, + "loss": 0.709, + "step": 9375 + }, + { + "epoch": 0.957711950970378, + "grad_norm": 1.5489703885160664, + "learning_rate": 9.365047585989218e-08, + "loss": 0.71, + "step": 9376 + }, + { + "epoch": 0.9578140960163432, + "grad_norm": 1.568723791793979, + "learning_rate": 9.31993077170179e-08, + "loss": 0.5782, + "step": 9377 + }, + { + "epoch": 0.9579162410623084, + "grad_norm": 1.6086113870166991, + "learning_rate": 9.274922387987995e-08, + "loss": 0.7567, + "step": 9378 + }, + { + "epoch": 0.9580183861082737, + "grad_norm": 1.4776654009659025, + "learning_rate": 9.230022439774e-08, + "loss": 0.7302, + "step": 9379 + }, + { + "epoch": 0.958120531154239, + "grad_norm": 1.3246881217050994, + "learning_rate": 9.185230931974209e-08, + "loss": 0.6887, + "step": 9380 + }, + { + "epoch": 0.9582226762002043, + "grad_norm": 1.408572675121359, + "learning_rate": 9.140547869491146e-08, + "loss": 0.6767, + "step": 9381 + }, + { + "epoch": 0.9583248212461696, + "grad_norm": 1.4158015990770736, + "learning_rate": 9.095973257215118e-08, + "loss": 0.6254, + "step": 9382 + }, + { + "epoch": 0.9584269662921349, + "grad_norm": 1.5412811612403026, + "learning_rate": 9.051507100025114e-08, + "loss": 0.7331, + "step": 9383 + }, + { + "epoch": 0.9585291113381001, + "grad_norm": 1.438132437890826, + "learning_rate": 9.007149402787908e-08, + "loss": 0.5668, + "step": 9384 + }, + { + "epoch": 0.9586312563840653, + "grad_norm": 1.5154947069615163, + "learning_rate": 8.962900170358391e-08, + "loss": 0.7401, + "step": 9385 + }, + { + "epoch": 0.9587334014300306, + "grad_norm": 1.5694325811432663, + "learning_rate": 8.918759407579803e-08, + "loss": 0.7009, + "step": 9386 + }, + { + "epoch": 0.9588355464759959, + "grad_norm": 1.5618867561996117, + "learning_rate": 8.874727119283278e-08, + "loss": 0.7412, + "step": 9387 + }, + { + "epoch": 0.9589376915219612, + "grad_norm": 1.5995842436155452, + "learning_rate": 8.830803310288183e-08, + "loss": 0.7114, + "step": 9388 + }, + { + "epoch": 0.9590398365679265, + "grad_norm": 1.6121765845625322, + "learning_rate": 8.786987985402118e-08, + "loss": 0.759, + "step": 9389 + }, + { + "epoch": 0.9591419816138917, + "grad_norm": 1.5512657837643504, + "learning_rate": 8.743281149420691e-08, + "loss": 0.7675, + "step": 9390 + }, + { + "epoch": 0.959244126659857, + "grad_norm": 1.4321518660314434, + "learning_rate": 8.699682807127518e-08, + "loss": 0.7207, + "step": 9391 + }, + { + "epoch": 0.9593462717058223, + "grad_norm": 1.4878019006342704, + "learning_rate": 8.656192963294452e-08, + "loss": 0.5416, + "step": 9392 + }, + { + "epoch": 0.9594484167517875, + "grad_norm": 1.4556023525616115, + "learning_rate": 8.612811622681572e-08, + "loss": 0.6952, + "step": 9393 + }, + { + "epoch": 0.9595505617977528, + "grad_norm": 1.6324140306475445, + "learning_rate": 8.569538790036969e-08, + "loss": 0.7132, + "step": 9394 + }, + { + "epoch": 0.9596527068437181, + "grad_norm": 1.412991930622943, + "learning_rate": 8.526374470096966e-08, + "loss": 0.6558, + "step": 9395 + }, + { + "epoch": 0.9597548518896833, + "grad_norm": 1.5396416009971017, + "learning_rate": 8.483318667585782e-08, + "loss": 0.6499, + "step": 9396 + }, + { + "epoch": 0.9598569969356486, + "grad_norm": 1.4551567615479741, + "learning_rate": 8.440371387215985e-08, + "loss": 0.7292, + "step": 9397 + }, + { + "epoch": 0.9599591419816139, + "grad_norm": 1.5899256961579473, + "learning_rate": 8.397532633688254e-08, + "loss": 0.6429, + "step": 9398 + }, + { + "epoch": 0.9600612870275792, + "grad_norm": 1.5968969253704848, + "learning_rate": 8.354802411691176e-08, + "loss": 0.7615, + "step": 9399 + }, + { + "epoch": 0.9601634320735445, + "grad_norm": 1.4311261822672456, + "learning_rate": 8.312180725901676e-08, + "loss": 0.6478, + "step": 9400 + }, + { + "epoch": 0.9602655771195097, + "grad_norm": 1.5778032467725869, + "learning_rate": 8.269667580984687e-08, + "loss": 0.5741, + "step": 9401 + }, + { + "epoch": 0.960367722165475, + "grad_norm": 1.5117445016673505, + "learning_rate": 8.227262981593265e-08, + "loss": 0.757, + "step": 9402 + }, + { + "epoch": 0.9604698672114402, + "grad_norm": 1.650827259679051, + "learning_rate": 8.184966932368698e-08, + "loss": 0.6937, + "step": 9403 + }, + { + "epoch": 0.9605720122574055, + "grad_norm": 1.4979105231437397, + "learning_rate": 8.142779437940285e-08, + "loss": 0.6729, + "step": 9404 + }, + { + "epoch": 0.9606741573033708, + "grad_norm": 1.6875897906033999, + "learning_rate": 8.100700502925551e-08, + "loss": 0.697, + "step": 9405 + }, + { + "epoch": 0.9607763023493361, + "grad_norm": 1.5737806282144426, + "learning_rate": 8.058730131930037e-08, + "loss": 0.7054, + "step": 9406 + }, + { + "epoch": 0.9608784473953014, + "grad_norm": 1.7461437791338588, + "learning_rate": 8.01686832954729e-08, + "loss": 0.7345, + "step": 9407 + }, + { + "epoch": 0.9609805924412665, + "grad_norm": 1.4058027721577275, + "learning_rate": 7.975115100359199e-08, + "loss": 0.7928, + "step": 9408 + }, + { + "epoch": 0.9610827374872318, + "grad_norm": 1.4343287615359337, + "learning_rate": 7.933470448935776e-08, + "loss": 0.6059, + "step": 9409 + }, + { + "epoch": 0.9611848825331971, + "grad_norm": 1.6369556621740613, + "learning_rate": 7.89193437983482e-08, + "loss": 0.7246, + "step": 9410 + }, + { + "epoch": 0.9612870275791624, + "grad_norm": 1.6181778251997279, + "learning_rate": 7.850506897602805e-08, + "loss": 0.6423, + "step": 9411 + }, + { + "epoch": 0.9613891726251277, + "grad_norm": 1.4868455943295904, + "learning_rate": 7.80918800677366e-08, + "loss": 0.7151, + "step": 9412 + }, + { + "epoch": 0.961491317671093, + "grad_norm": 1.4084743714659225, + "learning_rate": 7.767977711869989e-08, + "loss": 0.6322, + "step": 9413 + }, + { + "epoch": 0.9615934627170583, + "grad_norm": 1.4631699770694613, + "learning_rate": 7.726876017402296e-08, + "loss": 0.7022, + "step": 9414 + }, + { + "epoch": 0.9616956077630235, + "grad_norm": 1.3218703429764336, + "learning_rate": 7.685882927869093e-08, + "loss": 0.7034, + "step": 9415 + }, + { + "epoch": 0.9617977528089887, + "grad_norm": 1.580264219215216, + "learning_rate": 7.644998447757013e-08, + "loss": 0.6453, + "step": 9416 + }, + { + "epoch": 0.961899897854954, + "grad_norm": 1.5839179877348373, + "learning_rate": 7.604222581541143e-08, + "loss": 0.7191, + "step": 9417 + }, + { + "epoch": 0.9620020429009193, + "grad_norm": 1.5600931509003377, + "learning_rate": 7.563555333684136e-08, + "loss": 0.7548, + "step": 9418 + }, + { + "epoch": 0.9621041879468846, + "grad_norm": 1.7209103932740446, + "learning_rate": 7.52299670863732e-08, + "loss": 0.6365, + "step": 9419 + }, + { + "epoch": 0.9622063329928499, + "grad_norm": 1.3747184700942165, + "learning_rate": 7.4825467108397e-08, + "loss": 0.6227, + "step": 9420 + }, + { + "epoch": 0.9623084780388151, + "grad_norm": 1.5857416349249942, + "learning_rate": 7.442205344718511e-08, + "loss": 0.7638, + "step": 9421 + }, + { + "epoch": 0.9624106230847804, + "grad_norm": 1.6210903850081748, + "learning_rate": 7.401972614689335e-08, + "loss": 0.6529, + "step": 9422 + }, + { + "epoch": 0.9625127681307457, + "grad_norm": 1.375792963072189, + "learning_rate": 7.361848525155536e-08, + "loss": 0.696, + "step": 9423 + }, + { + "epoch": 0.9626149131767109, + "grad_norm": 1.581692218164805, + "learning_rate": 7.321833080508711e-08, + "loss": 0.758, + "step": 9424 + }, + { + "epoch": 0.9627170582226762, + "grad_norm": 1.5117543085238077, + "learning_rate": 7.281926285128582e-08, + "loss": 0.7818, + "step": 9425 + }, + { + "epoch": 0.9628192032686415, + "grad_norm": 1.6581299585325628, + "learning_rate": 7.242128143382986e-08, + "loss": 0.7561, + "step": 9426 + }, + { + "epoch": 0.9629213483146067, + "grad_norm": 1.5356630420732904, + "learning_rate": 7.202438659627886e-08, + "loss": 0.7218, + "step": 9427 + }, + { + "epoch": 0.963023493360572, + "grad_norm": 1.4384114802202057, + "learning_rate": 7.16285783820736e-08, + "loss": 0.5566, + "step": 9428 + }, + { + "epoch": 0.9631256384065373, + "grad_norm": 1.5833799840745688, + "learning_rate": 7.123385683453498e-08, + "loss": 0.6545, + "step": 9429 + }, + { + "epoch": 0.9632277834525026, + "grad_norm": 1.5651942844053226, + "learning_rate": 7.084022199686513e-08, + "loss": 0.6841, + "step": 9430 + }, + { + "epoch": 0.9633299284984678, + "grad_norm": 1.5170535949495294, + "learning_rate": 7.044767391214735e-08, + "loss": 0.7431, + "step": 9431 + }, + { + "epoch": 0.963432073544433, + "grad_norm": 1.5535938019735855, + "learning_rate": 7.005621262334838e-08, + "loss": 0.6592, + "step": 9432 + }, + { + "epoch": 0.9635342185903983, + "grad_norm": 1.3597329078172913, + "learning_rate": 6.966583817331173e-08, + "loss": 0.5713, + "step": 9433 + }, + { + "epoch": 0.9636363636363636, + "grad_norm": 1.4469071383086436, + "learning_rate": 6.927655060476435e-08, + "loss": 0.6549, + "step": 9434 + }, + { + "epoch": 0.9637385086823289, + "grad_norm": 1.5355677754442751, + "learning_rate": 6.888834996031546e-08, + "loss": 0.7258, + "step": 9435 + }, + { + "epoch": 0.9638406537282942, + "grad_norm": 1.6418660352232157, + "learning_rate": 6.85012362824522e-08, + "loss": 0.7007, + "step": 9436 + }, + { + "epoch": 0.9639427987742595, + "grad_norm": 1.373550107994046, + "learning_rate": 6.811520961354623e-08, + "loss": 0.6179, + "step": 9437 + }, + { + "epoch": 0.9640449438202248, + "grad_norm": 1.459613965201276, + "learning_rate": 6.773026999584709e-08, + "loss": 0.6966, + "step": 9438 + }, + { + "epoch": 0.9641470888661899, + "grad_norm": 1.5327654069557626, + "learning_rate": 6.734641747148663e-08, + "loss": 0.7015, + "step": 9439 + }, + { + "epoch": 0.9642492339121552, + "grad_norm": 1.3806572828581034, + "learning_rate": 6.696365208247901e-08, + "loss": 0.5251, + "step": 9440 + }, + { + "epoch": 0.9643513789581205, + "grad_norm": 1.4115136410232567, + "learning_rate": 6.65819738707163e-08, + "loss": 0.5973, + "step": 9441 + }, + { + "epoch": 0.9644535240040858, + "grad_norm": 1.5939776343000227, + "learning_rate": 6.620138287797396e-08, + "loss": 0.7133, + "step": 9442 + }, + { + "epoch": 0.9645556690500511, + "grad_norm": 1.4992191517543352, + "learning_rate": 6.582187914590865e-08, + "loss": 0.6344, + "step": 9443 + }, + { + "epoch": 0.9646578140960164, + "grad_norm": 1.6731812465418523, + "learning_rate": 6.544346271605827e-08, + "loss": 0.6989, + "step": 9444 + }, + { + "epoch": 0.9647599591419816, + "grad_norm": 1.5289622286134268, + "learning_rate": 6.50661336298386e-08, + "loss": 0.6932, + "step": 9445 + }, + { + "epoch": 0.9648621041879469, + "grad_norm": 1.5716535302409274, + "learning_rate": 6.46898919285488e-08, + "loss": 0.6442, + "step": 9446 + }, + { + "epoch": 0.9649642492339121, + "grad_norm": 1.5283770706606583, + "learning_rate": 6.431473765336927e-08, + "loss": 0.746, + "step": 9447 + }, + { + "epoch": 0.9650663942798774, + "grad_norm": 1.518666071142373, + "learning_rate": 6.39406708453616e-08, + "loss": 0.7097, + "step": 9448 + }, + { + "epoch": 0.9651685393258427, + "grad_norm": 1.5901600603052117, + "learning_rate": 6.35676915454675e-08, + "loss": 0.7361, + "step": 9449 + }, + { + "epoch": 0.965270684371808, + "grad_norm": 1.5543987184215513, + "learning_rate": 6.319579979450763e-08, + "loss": 0.6601, + "step": 9450 + }, + { + "epoch": 0.9653728294177732, + "grad_norm": 1.4963180787331731, + "learning_rate": 6.282499563318834e-08, + "loss": 0.63, + "step": 9451 + }, + { + "epoch": 0.9654749744637385, + "grad_norm": 1.4893446949582094, + "learning_rate": 6.245527910209381e-08, + "loss": 0.5909, + "step": 9452 + }, + { + "epoch": 0.9655771195097038, + "grad_norm": 1.2956980760342613, + "learning_rate": 6.208665024168948e-08, + "loss": 0.6457, + "step": 9453 + }, + { + "epoch": 0.9656792645556691, + "grad_norm": 1.3979771673912176, + "learning_rate": 6.171910909232193e-08, + "loss": 0.6424, + "step": 9454 + }, + { + "epoch": 0.9657814096016343, + "grad_norm": 1.4049358845470743, + "learning_rate": 6.1352655694219e-08, + "loss": 0.5582, + "step": 9455 + }, + { + "epoch": 0.9658835546475996, + "grad_norm": 1.593607483537053, + "learning_rate": 6.09872900874886e-08, + "loss": 0.7171, + "step": 9456 + }, + { + "epoch": 0.9659856996935648, + "grad_norm": 1.5275074764633596, + "learning_rate": 6.062301231212209e-08, + "loss": 0.737, + "step": 9457 + }, + { + "epoch": 0.9660878447395301, + "grad_norm": 1.3703185143529415, + "learning_rate": 6.025982240798644e-08, + "loss": 0.6366, + "step": 9458 + }, + { + "epoch": 0.9661899897854954, + "grad_norm": 1.4913914237554737, + "learning_rate": 5.989772041483654e-08, + "loss": 0.7163, + "step": 9459 + }, + { + "epoch": 0.9662921348314607, + "grad_norm": 1.3886987997205227, + "learning_rate": 5.953670637230291e-08, + "loss": 0.6704, + "step": 9460 + }, + { + "epoch": 0.966394279877426, + "grad_norm": 1.3719624460438704, + "learning_rate": 5.9176780319898374e-08, + "loss": 0.555, + "step": 9461 + }, + { + "epoch": 0.9664964249233912, + "grad_norm": 1.479999817934045, + "learning_rate": 5.88179422970192e-08, + "loss": 0.6494, + "step": 9462 + }, + { + "epoch": 0.9665985699693564, + "grad_norm": 1.3924134684179825, + "learning_rate": 5.8460192342938425e-08, + "loss": 0.7124, + "step": 9463 + }, + { + "epoch": 0.9667007150153217, + "grad_norm": 1.5376962057786092, + "learning_rate": 5.81035304968125e-08, + "loss": 0.6669, + "step": 9464 + }, + { + "epoch": 0.966802860061287, + "grad_norm": 1.495720478037599, + "learning_rate": 5.774795679767797e-08, + "loss": 0.6706, + "step": 9465 + }, + { + "epoch": 0.9669050051072523, + "grad_norm": 1.4199342944038444, + "learning_rate": 5.739347128445372e-08, + "loss": 0.6361, + "step": 9466 + }, + { + "epoch": 0.9670071501532176, + "grad_norm": 1.3577487469193374, + "learning_rate": 5.704007399593758e-08, + "loss": 0.6541, + "step": 9467 + }, + { + "epoch": 0.9671092951991829, + "grad_norm": 1.528714545671224, + "learning_rate": 5.668776497080974e-08, + "loss": 0.7498, + "step": 9468 + }, + { + "epoch": 0.9672114402451482, + "grad_norm": 1.3503032769149808, + "learning_rate": 5.633654424763046e-08, + "loss": 0.6852, + "step": 9469 + }, + { + "epoch": 0.9673135852911133, + "grad_norm": 1.610890383080021, + "learning_rate": 5.5986411864840106e-08, + "loss": 0.7749, + "step": 9470 + }, + { + "epoch": 0.9674157303370786, + "grad_norm": 1.4660905683235932, + "learning_rate": 5.5637367860762456e-08, + "loss": 0.6813, + "step": 9471 + }, + { + "epoch": 0.9675178753830439, + "grad_norm": 1.570312981471915, + "learning_rate": 5.5289412273599184e-08, + "loss": 0.7291, + "step": 9472 + }, + { + "epoch": 0.9676200204290092, + "grad_norm": 1.5775111672502657, + "learning_rate": 5.494254514143427e-08, + "loss": 0.6656, + "step": 9473 + }, + { + "epoch": 0.9677221654749745, + "grad_norm": 1.5348538479929605, + "learning_rate": 5.4596766502234e-08, + "loss": 0.6433, + "step": 9474 + }, + { + "epoch": 0.9678243105209398, + "grad_norm": 1.5800024963794863, + "learning_rate": 5.425207639384256e-08, + "loss": 0.6607, + "step": 9475 + }, + { + "epoch": 0.967926455566905, + "grad_norm": 1.5246106341589787, + "learning_rate": 5.390847485398754e-08, + "loss": 0.6499, + "step": 9476 + }, + { + "epoch": 0.9680286006128703, + "grad_norm": 1.5107145837211426, + "learning_rate": 5.3565961920275524e-08, + "loss": 0.5696, + "step": 9477 + }, + { + "epoch": 0.9681307456588355, + "grad_norm": 1.354845949199645, + "learning_rate": 5.3224537630196526e-08, + "loss": 0.6399, + "step": 9478 + }, + { + "epoch": 0.9682328907048008, + "grad_norm": 1.5038465589660885, + "learning_rate": 5.288420202111732e-08, + "loss": 0.6815, + "step": 9479 + }, + { + "epoch": 0.9683350357507661, + "grad_norm": 1.562126247919074, + "learning_rate": 5.254495513028812e-08, + "loss": 0.6831, + "step": 9480 + }, + { + "epoch": 0.9684371807967314, + "grad_norm": 1.5940707311951237, + "learning_rate": 5.2206796994841434e-08, + "loss": 0.6663, + "step": 9481 + }, + { + "epoch": 0.9685393258426966, + "grad_norm": 1.6050796668608505, + "learning_rate": 5.186972765178766e-08, + "loss": 0.7101, + "step": 9482 + }, + { + "epoch": 0.9686414708886619, + "grad_norm": 1.519690025804949, + "learning_rate": 5.1533747138019505e-08, + "loss": 0.6581, + "step": 9483 + }, + { + "epoch": 0.9687436159346272, + "grad_norm": 1.358258785941762, + "learning_rate": 5.1198855490310895e-08, + "loss": 0.6192, + "step": 9484 + }, + { + "epoch": 0.9688457609805925, + "grad_norm": 1.5812310068873439, + "learning_rate": 5.086505274531362e-08, + "loss": 0.7611, + "step": 9485 + }, + { + "epoch": 0.9689479060265577, + "grad_norm": 1.344306262110424, + "learning_rate": 5.0532338939566215e-08, + "loss": 0.6947, + "step": 9486 + }, + { + "epoch": 0.969050051072523, + "grad_norm": 1.423471154175696, + "learning_rate": 5.0200714109481797e-08, + "loss": 0.6058, + "step": 9487 + }, + { + "epoch": 0.9691521961184882, + "grad_norm": 1.457089415560458, + "learning_rate": 4.9870178291356874e-08, + "loss": 0.6875, + "step": 9488 + }, + { + "epoch": 0.9692543411644535, + "grad_norm": 1.2908270849236776, + "learning_rate": 4.954073152137029e-08, + "loss": 0.524, + "step": 9489 + }, + { + "epoch": 0.9693564862104188, + "grad_norm": 1.5405178132739534, + "learning_rate": 4.9212373835579865e-08, + "loss": 0.6823, + "step": 9490 + }, + { + "epoch": 0.9694586312563841, + "grad_norm": 1.3594330511638362, + "learning_rate": 4.888510526992241e-08, + "loss": 0.6308, + "step": 9491 + }, + { + "epoch": 0.9695607763023494, + "grad_norm": 1.44628820048114, + "learning_rate": 4.8558925860221486e-08, + "loss": 0.6934, + "step": 9492 + }, + { + "epoch": 0.9696629213483146, + "grad_norm": 1.68256728969364, + "learning_rate": 4.8233835642174096e-08, + "loss": 0.6404, + "step": 9493 + }, + { + "epoch": 0.9697650663942798, + "grad_norm": 1.4982268824456026, + "learning_rate": 4.7909834651364006e-08, + "loss": 0.6301, + "step": 9494 + }, + { + "epoch": 0.9698672114402451, + "grad_norm": 1.4958152577436736, + "learning_rate": 4.7586922923251734e-08, + "loss": 0.6751, + "step": 9495 + }, + { + "epoch": 0.9699693564862104, + "grad_norm": 1.3914487449837927, + "learning_rate": 4.7265100493181227e-08, + "loss": 0.5993, + "step": 9496 + }, + { + "epoch": 0.9700715015321757, + "grad_norm": 1.5445884551693123, + "learning_rate": 4.6944367396376533e-08, + "loss": 0.685, + "step": 9497 + }, + { + "epoch": 0.970173646578141, + "grad_norm": 1.6482077271455042, + "learning_rate": 4.662472366793957e-08, + "loss": 0.7283, + "step": 9498 + }, + { + "epoch": 0.9702757916241063, + "grad_norm": 1.4736148144054146, + "learning_rate": 4.6306169342859034e-08, + "loss": 0.701, + "step": 9499 + }, + { + "epoch": 0.9703779366700716, + "grad_norm": 1.397677181778376, + "learning_rate": 4.598870445599812e-08, + "loss": 0.5897, + "step": 9500 + }, + { + "epoch": 0.9704800817160367, + "grad_norm": 1.6702363181601776, + "learning_rate": 4.5672329042104614e-08, + "loss": 0.6774, + "step": 9501 + }, + { + "epoch": 0.970582226762002, + "grad_norm": 1.5402377376108611, + "learning_rate": 4.535704313580636e-08, + "loss": 0.7501, + "step": 9502 + }, + { + "epoch": 0.9706843718079673, + "grad_norm": 1.613641986255052, + "learning_rate": 4.5042846771611306e-08, + "loss": 0.665, + "step": 9503 + }, + { + "epoch": 0.9707865168539326, + "grad_norm": 1.6001981384326, + "learning_rate": 4.4729739983907506e-08, + "loss": 0.638, + "step": 9504 + }, + { + "epoch": 0.9708886618998979, + "grad_norm": 1.5324990804193306, + "learning_rate": 4.4417722806966434e-08, + "loss": 0.7435, + "step": 9505 + }, + { + "epoch": 0.9709908069458631, + "grad_norm": 1.4388552817664748, + "learning_rate": 4.410679527493744e-08, + "loss": 0.6339, + "step": 9506 + }, + { + "epoch": 0.9710929519918284, + "grad_norm": 1.6178740944704002, + "learning_rate": 4.3796957421852194e-08, + "loss": 0.6464, + "step": 9507 + }, + { + "epoch": 0.9711950970377937, + "grad_norm": 1.3683178759279988, + "learning_rate": 4.348820928162356e-08, + "loss": 0.6608, + "step": 9508 + }, + { + "epoch": 0.9712972420837589, + "grad_norm": 1.563470099820946, + "learning_rate": 4.3180550888041184e-08, + "loss": 0.6824, + "step": 9509 + }, + { + "epoch": 0.9713993871297242, + "grad_norm": 1.5718656058408509, + "learning_rate": 4.287398227478146e-08, + "loss": 0.6418, + "step": 9510 + }, + { + "epoch": 0.9715015321756895, + "grad_norm": 1.34762984021176, + "learning_rate": 4.256850347539754e-08, + "loss": 0.6517, + "step": 9511 + }, + { + "epoch": 0.9716036772216547, + "grad_norm": 1.6545414638671812, + "learning_rate": 4.22641145233238e-08, + "loss": 0.7761, + "step": 9512 + }, + { + "epoch": 0.97170582226762, + "grad_norm": 1.4495666051927938, + "learning_rate": 4.1960815451876915e-08, + "loss": 0.7226, + "step": 9513 + }, + { + "epoch": 0.9718079673135853, + "grad_norm": 1.548819577746712, + "learning_rate": 4.165860629425256e-08, + "loss": 0.6956, + "step": 9514 + }, + { + "epoch": 0.9719101123595506, + "grad_norm": 1.6365714756909788, + "learning_rate": 4.135748708352649e-08, + "loss": 0.6986, + "step": 9515 + }, + { + "epoch": 0.9720122574055158, + "grad_norm": 1.516481069875011, + "learning_rate": 4.105745785265791e-08, + "loss": 0.6947, + "step": 9516 + }, + { + "epoch": 0.9721144024514811, + "grad_norm": 1.303587542824108, + "learning_rate": 4.07585186344861e-08, + "loss": 0.5145, + "step": 9517 + }, + { + "epoch": 0.9722165474974463, + "grad_norm": 1.4942874676515192, + "learning_rate": 4.046066946172822e-08, + "loss": 0.6938, + "step": 9518 + }, + { + "epoch": 0.9723186925434116, + "grad_norm": 1.511778461744693, + "learning_rate": 4.016391036698375e-08, + "loss": 0.6875, + "step": 9519 + }, + { + "epoch": 0.9724208375893769, + "grad_norm": 1.5980375305737715, + "learning_rate": 3.9868241382735593e-08, + "loss": 0.7166, + "step": 9520 + }, + { + "epoch": 0.9725229826353422, + "grad_norm": 1.6164000074195612, + "learning_rate": 3.957366254134343e-08, + "loss": 0.6107, + "step": 9521 + }, + { + "epoch": 0.9726251276813075, + "grad_norm": 1.5137971229109781, + "learning_rate": 3.928017387504812e-08, + "loss": 0.7072, + "step": 9522 + }, + { + "epoch": 0.9727272727272728, + "grad_norm": 1.5750236411039198, + "learning_rate": 3.8987775415973985e-08, + "loss": 0.7215, + "step": 9523 + }, + { + "epoch": 0.972829417773238, + "grad_norm": 1.6236838475805129, + "learning_rate": 3.869646719612208e-08, + "loss": 0.6757, + "step": 9524 + }, + { + "epoch": 0.9729315628192032, + "grad_norm": 1.481965595978336, + "learning_rate": 3.8406249247379125e-08, + "loss": 0.6168, + "step": 9525 + }, + { + "epoch": 0.9730337078651685, + "grad_norm": 1.6073848592798912, + "learning_rate": 3.81171216015086e-08, + "loss": 0.779, + "step": 9526 + }, + { + "epoch": 0.9731358529111338, + "grad_norm": 1.4613419756581134, + "learning_rate": 3.782908429015408e-08, + "loss": 0.75, + "step": 9527 + }, + { + "epoch": 0.9732379979570991, + "grad_norm": 1.5015264697572668, + "learning_rate": 3.754213734484369e-08, + "loss": 0.6461, + "step": 9528 + }, + { + "epoch": 0.9733401430030644, + "grad_norm": 1.4596954847177306, + "learning_rate": 3.725628079698229e-08, + "loss": 0.5707, + "step": 9529 + }, + { + "epoch": 0.9734422880490297, + "grad_norm": 1.568039595285662, + "learning_rate": 3.697151467785709e-08, + "loss": 0.6853, + "step": 9530 + }, + { + "epoch": 0.9735444330949949, + "grad_norm": 1.6446362610640897, + "learning_rate": 3.668783901863759e-08, + "loss": 0.6792, + "step": 9531 + }, + { + "epoch": 0.9736465781409601, + "grad_norm": 1.4393981056455176, + "learning_rate": 3.640525385037119e-08, + "loss": 0.7216, + "step": 9532 + }, + { + "epoch": 0.9737487231869254, + "grad_norm": 1.639716264514587, + "learning_rate": 3.612375920398758e-08, + "loss": 0.7035, + "step": 9533 + }, + { + "epoch": 0.9738508682328907, + "grad_norm": 1.4739016254029207, + "learning_rate": 3.584335511029435e-08, + "loss": 0.536, + "step": 9534 + }, + { + "epoch": 0.973953013278856, + "grad_norm": 1.5055429639859754, + "learning_rate": 3.556404159998472e-08, + "loss": 0.6992, + "step": 9535 + }, + { + "epoch": 0.9740551583248213, + "grad_norm": 1.6432180643801644, + "learning_rate": 3.5285818703628685e-08, + "loss": 0.6703, + "step": 9536 + }, + { + "epoch": 0.9741573033707865, + "grad_norm": 1.5587980090175149, + "learning_rate": 3.500868645167743e-08, + "loss": 0.6455, + "step": 9537 + }, + { + "epoch": 0.9742594484167518, + "grad_norm": 1.600001722365177, + "learning_rate": 3.473264487446337e-08, + "loss": 0.6777, + "step": 9538 + }, + { + "epoch": 0.9743615934627171, + "grad_norm": 1.6793618859511104, + "learning_rate": 3.4457694002200113e-08, + "loss": 0.6717, + "step": 9539 + }, + { + "epoch": 0.9744637385086823, + "grad_norm": 1.5993122432909426, + "learning_rate": 3.4183833864981364e-08, + "loss": 0.6538, + "step": 9540 + }, + { + "epoch": 0.9745658835546476, + "grad_norm": 1.5786056126647705, + "learning_rate": 3.391106449277981e-08, + "loss": 0.7136, + "step": 9541 + }, + { + "epoch": 0.9746680286006129, + "grad_norm": 1.4091406895569083, + "learning_rate": 3.363938591545157e-08, + "loss": 0.6148, + "step": 9542 + }, + { + "epoch": 0.9747701736465781, + "grad_norm": 1.601398670530783, + "learning_rate": 3.336879816273175e-08, + "loss": 0.6846, + "step": 9543 + }, + { + "epoch": 0.9748723186925434, + "grad_norm": 1.5286110008342757, + "learning_rate": 3.309930126423555e-08, + "loss": 0.6699, + "step": 9544 + }, + { + "epoch": 0.9749744637385087, + "grad_norm": 1.7036285960927844, + "learning_rate": 3.283089524946159e-08, + "loss": 0.6894, + "step": 9545 + }, + { + "epoch": 0.975076608784474, + "grad_norm": 1.3957397900301574, + "learning_rate": 3.256358014778416e-08, + "loss": 0.6442, + "step": 9546 + }, + { + "epoch": 0.9751787538304392, + "grad_norm": 1.4083808968544371, + "learning_rate": 3.2297355988463175e-08, + "loss": 0.6952, + "step": 9547 + }, + { + "epoch": 0.9752808988764045, + "grad_norm": 1.5298540895386814, + "learning_rate": 3.203222280063756e-08, + "loss": 0.7081, + "step": 9548 + }, + { + "epoch": 0.9753830439223697, + "grad_norm": 1.3842890520127713, + "learning_rate": 3.17681806133241e-08, + "loss": 0.6561, + "step": 9549 + }, + { + "epoch": 0.975485188968335, + "grad_norm": 1.4401027857329183, + "learning_rate": 3.150522945542411e-08, + "loss": 0.6705, + "step": 9550 + }, + { + "epoch": 0.9755873340143003, + "grad_norm": 1.495655116384697, + "learning_rate": 3.1243369355717924e-08, + "loss": 0.6697, + "step": 9551 + }, + { + "epoch": 0.9756894790602656, + "grad_norm": 1.577291416180316, + "learning_rate": 3.098260034286482e-08, + "loss": 0.7457, + "step": 9552 + }, + { + "epoch": 0.9757916241062309, + "grad_norm": 1.528082975056903, + "learning_rate": 3.0722922445406424e-08, + "loss": 0.7735, + "step": 9553 + }, + { + "epoch": 0.9758937691521962, + "grad_norm": 1.4543632115296194, + "learning_rate": 3.0464335691765546e-08, + "loss": 0.7646, + "step": 9554 + }, + { + "epoch": 0.9759959141981613, + "grad_norm": 1.3632637853783933, + "learning_rate": 3.0206840110243994e-08, + "loss": 0.6018, + "step": 9555 + }, + { + "epoch": 0.9760980592441266, + "grad_norm": 1.5229510073776469, + "learning_rate": 2.9950435729025895e-08, + "loss": 0.7266, + "step": 9556 + }, + { + "epoch": 0.9762002042900919, + "grad_norm": 1.9557100214958654, + "learning_rate": 2.969512257617324e-08, + "loss": 0.7012, + "step": 9557 + }, + { + "epoch": 0.9763023493360572, + "grad_norm": 1.4273053973910352, + "learning_rate": 2.9440900679631457e-08, + "loss": 0.652, + "step": 9558 + }, + { + "epoch": 0.9764044943820225, + "grad_norm": 1.518432184930031, + "learning_rate": 2.918777006722495e-08, + "loss": 0.6849, + "step": 9559 + }, + { + "epoch": 0.9765066394279878, + "grad_norm": 1.4173453629070492, + "learning_rate": 2.8935730766659343e-08, + "loss": 0.6504, + "step": 9560 + }, + { + "epoch": 0.976608784473953, + "grad_norm": 1.426963462560284, + "learning_rate": 2.868478280552034e-08, + "loss": 0.7194, + "step": 9561 + }, + { + "epoch": 0.9767109295199183, + "grad_norm": 1.4719385756639292, + "learning_rate": 2.843492621127264e-08, + "loss": 0.6424, + "step": 9562 + }, + { + "epoch": 0.9768130745658835, + "grad_norm": 1.6671012205609284, + "learning_rate": 2.818616101126548e-08, + "loss": 0.6343, + "step": 9563 + }, + { + "epoch": 0.9769152196118488, + "grad_norm": 1.5201975232090417, + "learning_rate": 2.7938487232725963e-08, + "loss": 0.6276, + "step": 9564 + }, + { + "epoch": 0.9770173646578141, + "grad_norm": 1.6435017878103428, + "learning_rate": 2.7691904902761303e-08, + "loss": 0.6551, + "step": 9565 + }, + { + "epoch": 0.9771195097037794, + "grad_norm": 1.4201351051267848, + "learning_rate": 2.7446414048361015e-08, + "loss": 0.6415, + "step": 9566 + }, + { + "epoch": 0.9772216547497447, + "grad_norm": 1.4925709761146007, + "learning_rate": 2.720201469639361e-08, + "loss": 0.7688, + "step": 9567 + }, + { + "epoch": 0.9773237997957099, + "grad_norm": 1.344894689484712, + "learning_rate": 2.6958706873608797e-08, + "loss": 0.553, + "step": 9568 + }, + { + "epoch": 0.9774259448416752, + "grad_norm": 1.5109545782915559, + "learning_rate": 2.6716490606637502e-08, + "loss": 0.6168, + "step": 9569 + }, + { + "epoch": 0.9775280898876404, + "grad_norm": 1.4912209162707604, + "learning_rate": 2.6475365921989627e-08, + "loss": 0.7729, + "step": 9570 + }, + { + "epoch": 0.9776302349336057, + "grad_norm": 1.4054086389856157, + "learning_rate": 2.623533284605628e-08, + "loss": 0.6214, + "step": 9571 + }, + { + "epoch": 0.977732379979571, + "grad_norm": 1.4878537346610572, + "learning_rate": 2.5996391405109788e-08, + "loss": 0.6096, + "step": 9572 + }, + { + "epoch": 0.9778345250255362, + "grad_norm": 1.5573787517484798, + "learning_rate": 2.5758541625302557e-08, + "loss": 0.6912, + "step": 9573 + }, + { + "epoch": 0.9779366700715015, + "grad_norm": 1.5263052039024527, + "learning_rate": 2.55217835326671e-08, + "loss": 0.7665, + "step": 9574 + }, + { + "epoch": 0.9780388151174668, + "grad_norm": 1.329772607509056, + "learning_rate": 2.528611715311713e-08, + "loss": 0.6562, + "step": 9575 + }, + { + "epoch": 0.9781409601634321, + "grad_norm": 1.5646938821135543, + "learning_rate": 2.505154251244535e-08, + "loss": 0.7519, + "step": 9576 + }, + { + "epoch": 0.9782431052093974, + "grad_norm": 1.508611110923295, + "learning_rate": 2.4818059636327883e-08, + "loss": 0.6628, + "step": 9577 + }, + { + "epoch": 0.9783452502553626, + "grad_norm": 1.58646737498991, + "learning_rate": 2.458566855031652e-08, + "loss": 0.7416, + "step": 9578 + }, + { + "epoch": 0.9784473953013278, + "grad_norm": 1.5252148721230188, + "learning_rate": 2.435436927985091e-08, + "loss": 0.6956, + "step": 9579 + }, + { + "epoch": 0.9785495403472931, + "grad_norm": 1.4694159907337836, + "learning_rate": 2.4124161850243023e-08, + "loss": 0.6843, + "step": 9580 + }, + { + "epoch": 0.9786516853932584, + "grad_norm": 1.5835426346202524, + "learning_rate": 2.3895046286692702e-08, + "loss": 0.663, + "step": 9581 + }, + { + "epoch": 0.9787538304392237, + "grad_norm": 1.4425845089103262, + "learning_rate": 2.3667022614273226e-08, + "loss": 0.6395, + "step": 9582 + }, + { + "epoch": 0.978855975485189, + "grad_norm": 1.2993039957199546, + "learning_rate": 2.344009085794463e-08, + "loss": 0.6411, + "step": 9583 + }, + { + "epoch": 0.9789581205311543, + "grad_norm": 1.3944930652913556, + "learning_rate": 2.321425104254371e-08, + "loss": 0.6684, + "step": 9584 + }, + { + "epoch": 0.9790602655771196, + "grad_norm": 1.3852702486705006, + "learning_rate": 2.2989503192788477e-08, + "loss": 0.5782, + "step": 9585 + }, + { + "epoch": 0.9791624106230847, + "grad_norm": 1.4672234833582334, + "learning_rate": 2.2765847333278134e-08, + "loss": 0.6398, + "step": 9586 + }, + { + "epoch": 0.97926455566905, + "grad_norm": 1.29680080741187, + "learning_rate": 2.2543283488491997e-08, + "loss": 0.6014, + "step": 9587 + }, + { + "epoch": 0.9793667007150153, + "grad_norm": 1.5616374147123422, + "learning_rate": 2.2321811682789463e-08, + "loss": 0.7107, + "step": 9588 + }, + { + "epoch": 0.9794688457609806, + "grad_norm": 1.518530184861213, + "learning_rate": 2.2101431940411145e-08, + "loss": 0.6046, + "step": 9589 + }, + { + "epoch": 0.9795709908069459, + "grad_norm": 1.5752752798408691, + "learning_rate": 2.1882144285477748e-08, + "loss": 0.6313, + "step": 9590 + }, + { + "epoch": 0.9796731358529112, + "grad_norm": 1.6287133253558443, + "learning_rate": 2.1663948741991182e-08, + "loss": 0.8074, + "step": 9591 + }, + { + "epoch": 0.9797752808988764, + "grad_norm": 1.51243944349324, + "learning_rate": 2.1446845333831236e-08, + "loss": 0.7394, + "step": 9592 + }, + { + "epoch": 0.9798774259448417, + "grad_norm": 1.6201216683377506, + "learning_rate": 2.123083408476112e-08, + "loss": 0.6371, + "step": 9593 + }, + { + "epoch": 0.9799795709908069, + "grad_norm": 1.5693196093017254, + "learning_rate": 2.101591501842304e-08, + "loss": 0.616, + "step": 9594 + }, + { + "epoch": 0.9800817160367722, + "grad_norm": 1.5863920479457674, + "learning_rate": 2.0802088158341505e-08, + "loss": 0.6765, + "step": 9595 + }, + { + "epoch": 0.9801838610827375, + "grad_norm": 1.495760355170821, + "learning_rate": 2.05893535279178e-08, + "loss": 0.7358, + "step": 9596 + }, + { + "epoch": 0.9802860061287028, + "grad_norm": 1.4806875076317487, + "learning_rate": 2.037771115043774e-08, + "loss": 0.6015, + "step": 9597 + }, + { + "epoch": 0.980388151174668, + "grad_norm": 1.4893376918397716, + "learning_rate": 2.016716104906391e-08, + "loss": 0.7127, + "step": 9598 + }, + { + "epoch": 0.9804902962206333, + "grad_norm": 1.719515823626762, + "learning_rate": 1.995770324684232e-08, + "loss": 0.6996, + "step": 9599 + }, + { + "epoch": 0.9805924412665986, + "grad_norm": 1.389484967218732, + "learning_rate": 1.9749337766697962e-08, + "loss": 0.6825, + "step": 9600 + }, + { + "epoch": 0.9806945863125638, + "grad_norm": 1.404600958605843, + "learning_rate": 1.954206463143704e-08, + "loss": 0.5668, + "step": 9601 + }, + { + "epoch": 0.9807967313585291, + "grad_norm": 1.6048339290695426, + "learning_rate": 1.933588386374585e-08, + "loss": 0.6326, + "step": 9602 + }, + { + "epoch": 0.9808988764044944, + "grad_norm": 1.7030109064952923, + "learning_rate": 1.913079548618968e-08, + "loss": 0.6714, + "step": 9603 + }, + { + "epoch": 0.9810010214504596, + "grad_norm": 1.515482101094523, + "learning_rate": 1.8926799521216123e-08, + "loss": 0.7125, + "step": 9604 + }, + { + "epoch": 0.9811031664964249, + "grad_norm": 1.538760306235086, + "learning_rate": 1.8723895991153984e-08, + "loss": 0.6417, + "step": 9605 + }, + { + "epoch": 0.9812053115423902, + "grad_norm": 1.3813989569681897, + "learning_rate": 1.8522084918208837e-08, + "loss": 0.5928, + "step": 9606 + }, + { + "epoch": 0.9813074565883555, + "grad_norm": 1.6335133251818597, + "learning_rate": 1.8321366324471902e-08, + "loss": 0.6499, + "step": 9607 + }, + { + "epoch": 0.9814096016343208, + "grad_norm": 1.5145008896531882, + "learning_rate": 1.8121740231908934e-08, + "loss": 0.6612, + "step": 9608 + }, + { + "epoch": 0.981511746680286, + "grad_norm": 1.590724230407954, + "learning_rate": 1.792320666237135e-08, + "loss": 0.8148, + "step": 9609 + }, + { + "epoch": 0.9816138917262512, + "grad_norm": 1.3663934855250255, + "learning_rate": 1.772576563758843e-08, + "loss": 0.653, + "step": 9610 + }, + { + "epoch": 0.9817160367722165, + "grad_norm": 1.5434207296504072, + "learning_rate": 1.7529417179169562e-08, + "loss": 0.7085, + "step": 9611 + }, + { + "epoch": 0.9818181818181818, + "grad_norm": 1.6024033310838954, + "learning_rate": 1.7334161308604215e-08, + "loss": 0.695, + "step": 9612 + }, + { + "epoch": 0.9819203268641471, + "grad_norm": 1.4052211264769003, + "learning_rate": 1.7139998047266403e-08, + "loss": 0.6054, + "step": 9613 + }, + { + "epoch": 0.9820224719101124, + "grad_norm": 1.4506094367674345, + "learning_rate": 1.6946927416404692e-08, + "loss": 0.641, + "step": 9614 + }, + { + "epoch": 0.9821246169560777, + "grad_norm": 1.433893746359972, + "learning_rate": 1.675494943715217e-08, + "loss": 0.5743, + "step": 9615 + }, + { + "epoch": 0.982226762002043, + "grad_norm": 1.539141008389926, + "learning_rate": 1.656406413051981e-08, + "loss": 0.7094, + "step": 9616 + }, + { + "epoch": 0.9823289070480081, + "grad_norm": 1.538186413240415, + "learning_rate": 1.6374271517400896e-08, + "loss": 0.6573, + "step": 9617 + }, + { + "epoch": 0.9824310520939734, + "grad_norm": 1.439709734841971, + "learning_rate": 1.618557161856771e-08, + "loss": 0.6072, + "step": 9618 + }, + { + "epoch": 0.9825331971399387, + "grad_norm": 1.7538173775842172, + "learning_rate": 1.599796445467483e-08, + "loss": 0.6658, + "step": 9619 + }, + { + "epoch": 0.982635342185904, + "grad_norm": 1.5249896488665171, + "learning_rate": 1.5811450046255837e-08, + "loss": 0.6955, + "step": 9620 + }, + { + "epoch": 0.9827374872318693, + "grad_norm": 1.7833892058365306, + "learning_rate": 1.5626028413723293e-08, + "loss": 0.7294, + "step": 9621 + }, + { + "epoch": 0.9828396322778346, + "grad_norm": 1.4623360092080147, + "learning_rate": 1.544169957737207e-08, + "loss": 0.7016, + "step": 9622 + }, + { + "epoch": 0.9829417773237998, + "grad_norm": 1.7034121791708219, + "learning_rate": 1.5258463557379366e-08, + "loss": 0.6878, + "step": 9623 + }, + { + "epoch": 0.9830439223697651, + "grad_norm": 1.5068524145876203, + "learning_rate": 1.5076320373796915e-08, + "loss": 0.6909, + "step": 9624 + }, + { + "epoch": 0.9831460674157303, + "grad_norm": 1.5741935951466728, + "learning_rate": 1.4895270046564325e-08, + "loss": 0.6425, + "step": 9625 + }, + { + "epoch": 0.9832482124616956, + "grad_norm": 1.3829933190833323, + "learning_rate": 1.4715312595493525e-08, + "loss": 0.6514, + "step": 9626 + }, + { + "epoch": 0.9833503575076609, + "grad_norm": 1.6045151679087422, + "learning_rate": 1.4536448040284312e-08, + "loss": 0.7654, + "step": 9627 + }, + { + "epoch": 0.9834525025536262, + "grad_norm": 1.5521353335178172, + "learning_rate": 1.435867640051214e-08, + "loss": 0.7333, + "step": 9628 + }, + { + "epoch": 0.9835546475995914, + "grad_norm": 1.493969088695074, + "learning_rate": 1.4181997695634774e-08, + "loss": 0.755, + "step": 9629 + }, + { + "epoch": 0.9836567926455567, + "grad_norm": 1.5295369466667021, + "learning_rate": 1.4006411944988974e-08, + "loss": 0.7005, + "step": 9630 + }, + { + "epoch": 0.983758937691522, + "grad_norm": 1.4928266772062624, + "learning_rate": 1.3831919167792696e-08, + "loss": 0.6877, + "step": 9631 + }, + { + "epoch": 0.9838610827374872, + "grad_norm": 1.511075559829526, + "learning_rate": 1.3658519383145107e-08, + "loss": 0.6153, + "step": 9632 + }, + { + "epoch": 0.9839632277834525, + "grad_norm": 1.524093725343304, + "learning_rate": 1.3486212610025473e-08, + "loss": 0.6323, + "step": 9633 + }, + { + "epoch": 0.9840653728294178, + "grad_norm": 1.2840936666822316, + "learning_rate": 1.331499886729093e-08, + "loss": 0.5463, + "step": 9634 + }, + { + "epoch": 0.984167517875383, + "grad_norm": 1.386038076316431, + "learning_rate": 1.3144878173682042e-08, + "loss": 0.5476, + "step": 9635 + }, + { + "epoch": 0.9842696629213483, + "grad_norm": 1.4410513904890978, + "learning_rate": 1.2975850547819469e-08, + "loss": 0.718, + "step": 9636 + }, + { + "epoch": 0.9843718079673136, + "grad_norm": 1.4002649937019942, + "learning_rate": 1.2807916008201748e-08, + "loss": 0.7207, + "step": 9637 + }, + { + "epoch": 0.9844739530132789, + "grad_norm": 1.508212872791826, + "learning_rate": 1.2641074573209732e-08, + "loss": 0.6219, + "step": 9638 + }, + { + "epoch": 0.9845760980592442, + "grad_norm": 1.3945012588945809, + "learning_rate": 1.247532626110548e-08, + "loss": 0.5929, + "step": 9639 + }, + { + "epoch": 0.9846782431052093, + "grad_norm": 1.5964522655457851, + "learning_rate": 1.2310671090028925e-08, + "loss": 0.6758, + "step": 9640 + }, + { + "epoch": 0.9847803881511746, + "grad_norm": 1.4155939957396753, + "learning_rate": 1.2147109078003427e-08, + "loss": 0.6987, + "step": 9641 + }, + { + "epoch": 0.9848825331971399, + "grad_norm": 1.3846225169398596, + "learning_rate": 1.1984640242928003e-08, + "loss": 0.6224, + "step": 9642 + }, + { + "epoch": 0.9849846782431052, + "grad_norm": 1.5419409063243765, + "learning_rate": 1.1823264602588425e-08, + "loss": 0.6709, + "step": 9643 + }, + { + "epoch": 0.9850868232890705, + "grad_norm": 1.2232222990208017, + "learning_rate": 1.166298217464501e-08, + "loss": 0.5902, + "step": 9644 + }, + { + "epoch": 0.9851889683350358, + "grad_norm": 1.4142089390947932, + "learning_rate": 1.1503792976641503e-08, + "loss": 0.5448, + "step": 9645 + }, + { + "epoch": 0.9852911133810011, + "grad_norm": 1.452238411588004, + "learning_rate": 1.1345697026001745e-08, + "loss": 0.7287, + "step": 9646 + }, + { + "epoch": 0.9853932584269663, + "grad_norm": 1.3323098026295412, + "learning_rate": 1.118869434002856e-08, + "loss": 0.6466, + "step": 9647 + }, + { + "epoch": 0.9854954034729315, + "grad_norm": 1.5670632088011618, + "learning_rate": 1.1032784935907093e-08, + "loss": 0.7177, + "step": 9648 + }, + { + "epoch": 0.9855975485188968, + "grad_norm": 1.3957745522448013, + "learning_rate": 1.0877968830700358e-08, + "loss": 0.6075, + "step": 9649 + }, + { + "epoch": 0.9856996935648621, + "grad_norm": 1.5766434019194155, + "learning_rate": 1.0724246041353692e-08, + "loss": 0.7226, + "step": 9650 + }, + { + "epoch": 0.9858018386108274, + "grad_norm": 1.4354718841725742, + "learning_rate": 1.0571616584691414e-08, + "loss": 0.6535, + "step": 9651 + }, + { + "epoch": 0.9859039836567927, + "grad_norm": 1.4940211959896346, + "learning_rate": 1.0420080477421269e-08, + "loss": 0.7099, + "step": 9652 + }, + { + "epoch": 0.986006128702758, + "grad_norm": 1.6028894237796012, + "learning_rate": 1.0269637736126659e-08, + "loss": 0.6312, + "step": 9653 + }, + { + "epoch": 0.9861082737487232, + "grad_norm": 1.4450661106538019, + "learning_rate": 1.0120288377274412e-08, + "loss": 0.7073, + "step": 9654 + }, + { + "epoch": 0.9862104187946884, + "grad_norm": 1.496838114799703, + "learning_rate": 9.972032417210342e-09, + "loss": 0.7378, + "step": 9655 + }, + { + "epoch": 0.9863125638406537, + "grad_norm": 1.5035543882968891, + "learning_rate": 9.824869872162578e-09, + "loss": 0.702, + "step": 9656 + }, + { + "epoch": 0.986414708886619, + "grad_norm": 1.4777435299532664, + "learning_rate": 9.678800758237128e-09, + "loss": 0.6985, + "step": 9657 + }, + { + "epoch": 0.9865168539325843, + "grad_norm": 1.4408904193117742, + "learning_rate": 9.5338250914212e-09, + "loss": 0.6752, + "step": 9658 + }, + { + "epoch": 0.9866189989785495, + "grad_norm": 1.515999849005689, + "learning_rate": 9.389942887582105e-09, + "loss": 0.6406, + "step": 9659 + }, + { + "epoch": 0.9867211440245148, + "grad_norm": 1.5386956906805456, + "learning_rate": 9.247154162469464e-09, + "loss": 0.689, + "step": 9660 + }, + { + "epoch": 0.9868232890704801, + "grad_norm": 1.4331736300054096, + "learning_rate": 9.105458931710776e-09, + "loss": 0.6807, + "step": 9661 + }, + { + "epoch": 0.9869254341164454, + "grad_norm": 1.3808399906777045, + "learning_rate": 8.964857210814748e-09, + "loss": 0.6656, + "step": 9662 + }, + { + "epoch": 0.9870275791624106, + "grad_norm": 1.4924563118890635, + "learning_rate": 8.825349015169071e-09, + "loss": 0.7078, + "step": 9663 + }, + { + "epoch": 0.9871297242083759, + "grad_norm": 1.6857126354382221, + "learning_rate": 8.686934360044863e-09, + "loss": 0.7466, + "step": 9664 + }, + { + "epoch": 0.9872318692543411, + "grad_norm": 1.4322930877257218, + "learning_rate": 8.549613260591117e-09, + "loss": 0.7173, + "step": 9665 + }, + { + "epoch": 0.9873340143003064, + "grad_norm": 1.458555071455001, + "learning_rate": 8.41338573183692e-09, + "loss": 0.6366, + "step": 9666 + }, + { + "epoch": 0.9874361593462717, + "grad_norm": 1.4427834316735153, + "learning_rate": 8.27825178869257e-09, + "loss": 0.6441, + "step": 9667 + }, + { + "epoch": 0.987538304392237, + "grad_norm": 1.5531175409143838, + "learning_rate": 8.144211445949568e-09, + "loss": 0.6719, + "step": 9668 + }, + { + "epoch": 0.9876404494382023, + "grad_norm": 1.4759623987391284, + "learning_rate": 8.0112647182784e-09, + "loss": 0.5701, + "step": 9669 + }, + { + "epoch": 0.9877425944841676, + "grad_norm": 1.5225063886883665, + "learning_rate": 7.87941162023076e-09, + "loss": 0.7285, + "step": 9670 + }, + { + "epoch": 0.9878447395301327, + "grad_norm": 1.8225258087700142, + "learning_rate": 7.748652166236215e-09, + "loss": 0.7687, + "step": 9671 + }, + { + "epoch": 0.987946884576098, + "grad_norm": 1.6645775452305411, + "learning_rate": 7.61898637060665e-09, + "loss": 0.6574, + "step": 9672 + }, + { + "epoch": 0.9880490296220633, + "grad_norm": 1.4877050788480064, + "learning_rate": 7.49041424753627e-09, + "loss": 0.6308, + "step": 9673 + }, + { + "epoch": 0.9881511746680286, + "grad_norm": 1.4417494569901594, + "learning_rate": 7.3629358110960395e-09, + "loss": 0.6104, + "step": 9674 + }, + { + "epoch": 0.9882533197139939, + "grad_norm": 1.397734290251388, + "learning_rate": 7.23655107523813e-09, + "loss": 0.681, + "step": 9675 + }, + { + "epoch": 0.9883554647599592, + "grad_norm": 1.4225654302225892, + "learning_rate": 7.111260053795921e-09, + "loss": 0.6585, + "step": 9676 + }, + { + "epoch": 0.9884576098059245, + "grad_norm": 1.5560650172386679, + "learning_rate": 6.987062760482888e-09, + "loss": 0.7472, + "step": 9677 + }, + { + "epoch": 0.9885597548518897, + "grad_norm": 1.433487842387203, + "learning_rate": 6.863959208891491e-09, + "loss": 0.6792, + "step": 9678 + }, + { + "epoch": 0.9886618998978549, + "grad_norm": 1.4770699024837601, + "learning_rate": 6.741949412496507e-09, + "loss": 0.6715, + "step": 9679 + }, + { + "epoch": 0.9887640449438202, + "grad_norm": 1.5884365156230937, + "learning_rate": 6.6210333846516986e-09, + "loss": 0.7384, + "step": 9680 + }, + { + "epoch": 0.9888661899897855, + "grad_norm": 1.4769738194409154, + "learning_rate": 6.501211138590924e-09, + "loss": 0.6984, + "step": 9681 + }, + { + "epoch": 0.9889683350357508, + "grad_norm": 1.6284689701538175, + "learning_rate": 6.382482687429247e-09, + "loss": 0.7621, + "step": 9682 + }, + { + "epoch": 0.989070480081716, + "grad_norm": 1.5473833828638142, + "learning_rate": 6.264848044161831e-09, + "loss": 0.6189, + "step": 9683 + }, + { + "epoch": 0.9891726251276813, + "grad_norm": 1.6014523065484592, + "learning_rate": 6.148307221663929e-09, + "loss": 0.7433, + "step": 9684 + }, + { + "epoch": 0.9892747701736466, + "grad_norm": 1.4128570645182545, + "learning_rate": 6.032860232690896e-09, + "loss": 0.7049, + "step": 9685 + }, + { + "epoch": 0.9893769152196118, + "grad_norm": 1.6397437051187032, + "learning_rate": 5.918507089877068e-09, + "loss": 0.684, + "step": 9686 + }, + { + "epoch": 0.9894790602655771, + "grad_norm": 1.5233757928283498, + "learning_rate": 5.805247805740211e-09, + "loss": 0.6894, + "step": 9687 + }, + { + "epoch": 0.9895812053115424, + "grad_norm": 1.5277121616229623, + "learning_rate": 5.693082392675964e-09, + "loss": 0.6869, + "step": 9688 + }, + { + "epoch": 0.9896833503575077, + "grad_norm": 1.5914027769467198, + "learning_rate": 5.582010862961173e-09, + "loss": 0.6807, + "step": 9689 + }, + { + "epoch": 0.9897854954034729, + "grad_norm": 1.5737661534500027, + "learning_rate": 5.472033228752782e-09, + "loss": 0.7387, + "step": 9690 + }, + { + "epoch": 0.9898876404494382, + "grad_norm": 1.5365617308726318, + "learning_rate": 5.363149502086717e-09, + "loss": 0.6527, + "step": 9691 + }, + { + "epoch": 0.9899897854954035, + "grad_norm": 1.5828146635050324, + "learning_rate": 5.255359694882334e-09, + "loss": 0.6918, + "step": 9692 + }, + { + "epoch": 0.9900919305413688, + "grad_norm": 1.5098778239805088, + "learning_rate": 5.148663818935751e-09, + "loss": 0.7175, + "step": 9693 + }, + { + "epoch": 0.990194075587334, + "grad_norm": 1.5757670001733775, + "learning_rate": 5.043061885925404e-09, + "loss": 0.6487, + "step": 9694 + }, + { + "epoch": 0.9902962206332993, + "grad_norm": 1.543442969364944, + "learning_rate": 4.9385539074098265e-09, + "loss": 0.6719, + "step": 9695 + }, + { + "epoch": 0.9903983656792645, + "grad_norm": 1.5458135656241583, + "learning_rate": 4.835139894826535e-09, + "loss": 0.6054, + "step": 9696 + }, + { + "epoch": 0.9905005107252298, + "grad_norm": 1.4603862657651896, + "learning_rate": 4.732819859495363e-09, + "loss": 0.6772, + "step": 9697 + }, + { + "epoch": 0.9906026557711951, + "grad_norm": 1.4815428091732459, + "learning_rate": 4.631593812614021e-09, + "loss": 0.6821, + "step": 9698 + }, + { + "epoch": 0.9907048008171604, + "grad_norm": 1.5146836796916139, + "learning_rate": 4.531461765263645e-09, + "loss": 0.6797, + "step": 9699 + }, + { + "epoch": 0.9908069458631257, + "grad_norm": 1.4444515065787251, + "learning_rate": 4.432423728402135e-09, + "loss": 0.7226, + "step": 9700 + }, + { + "epoch": 0.990909090909091, + "grad_norm": 1.3998400462277139, + "learning_rate": 4.33447971286971e-09, + "loss": 0.6329, + "step": 9701 + }, + { + "epoch": 0.9910112359550561, + "grad_norm": 1.5773829833884847, + "learning_rate": 4.237629729387793e-09, + "loss": 0.6806, + "step": 9702 + }, + { + "epoch": 0.9911133810010214, + "grad_norm": 1.5238353944196297, + "learning_rate": 4.141873788553463e-09, + "loss": 0.643, + "step": 9703 + }, + { + "epoch": 0.9912155260469867, + "grad_norm": 1.5986699686960189, + "learning_rate": 4.047211900850556e-09, + "loss": 0.7402, + "step": 9704 + }, + { + "epoch": 0.991317671092952, + "grad_norm": 1.4298476636782917, + "learning_rate": 3.953644076638563e-09, + "loss": 0.6601, + "step": 9705 + }, + { + "epoch": 0.9914198161389173, + "grad_norm": 1.4038314891185384, + "learning_rate": 3.861170326157071e-09, + "loss": 0.6782, + "step": 9706 + }, + { + "epoch": 0.9915219611848826, + "grad_norm": 1.590110716614218, + "learning_rate": 3.769790659530204e-09, + "loss": 0.6652, + "step": 9707 + }, + { + "epoch": 0.9916241062308478, + "grad_norm": 1.6411131544942885, + "learning_rate": 3.67950508675774e-09, + "loss": 0.6967, + "step": 9708 + }, + { + "epoch": 0.991726251276813, + "grad_norm": 1.4016242073432523, + "learning_rate": 3.5903136177217744e-09, + "loss": 0.7005, + "step": 9709 + }, + { + "epoch": 0.9918283963227783, + "grad_norm": 1.4000700295605912, + "learning_rate": 3.502216262184499e-09, + "loss": 0.6595, + "step": 9710 + }, + { + "epoch": 0.9919305413687436, + "grad_norm": 1.6845058129393669, + "learning_rate": 3.4152130297882003e-09, + "loss": 0.7433, + "step": 9711 + }, + { + "epoch": 0.9920326864147089, + "grad_norm": 1.5610450899616208, + "learning_rate": 3.329303930055261e-09, + "loss": 0.7315, + "step": 9712 + }, + { + "epoch": 0.9921348314606742, + "grad_norm": 1.629786125282944, + "learning_rate": 3.2444889723892702e-09, + "loss": 0.6837, + "step": 9713 + }, + { + "epoch": 0.9922369765066394, + "grad_norm": 1.5353016555086143, + "learning_rate": 3.160768166072803e-09, + "loss": 0.6821, + "step": 9714 + }, + { + "epoch": 0.9923391215526047, + "grad_norm": 1.6737665477959751, + "learning_rate": 3.0781415202685293e-09, + "loss": 0.7553, + "step": 9715 + }, + { + "epoch": 0.99244126659857, + "grad_norm": 1.512155379472392, + "learning_rate": 2.9966090440203264e-09, + "loss": 0.6331, + "step": 9716 + }, + { + "epoch": 0.9925434116445352, + "grad_norm": 1.4524805653500787, + "learning_rate": 2.916170746252167e-09, + "loss": 0.7318, + "step": 9717 + }, + { + "epoch": 0.9926455566905005, + "grad_norm": 1.4286117923143586, + "learning_rate": 2.8368266357681195e-09, + "loss": 0.6967, + "step": 9718 + }, + { + "epoch": 0.9927477017364658, + "grad_norm": 1.4520311352785522, + "learning_rate": 2.7585767212534587e-09, + "loss": 0.5924, + "step": 9719 + }, + { + "epoch": 0.992849846782431, + "grad_norm": 1.4907033731896187, + "learning_rate": 2.6814210112702245e-09, + "loss": 0.6227, + "step": 9720 + }, + { + "epoch": 0.9929519918283963, + "grad_norm": 1.440712116757037, + "learning_rate": 2.6053595142649936e-09, + "loss": 0.7134, + "step": 9721 + }, + { + "epoch": 0.9930541368743616, + "grad_norm": 1.5190057869631894, + "learning_rate": 2.5303922385622183e-09, + "loss": 0.5884, + "step": 9722 + }, + { + "epoch": 0.9931562819203269, + "grad_norm": 1.3575247305573648, + "learning_rate": 2.4565191923675568e-09, + "loss": 0.5936, + "step": 9723 + }, + { + "epoch": 0.9932584269662922, + "grad_norm": 1.4595834482568206, + "learning_rate": 2.3837403837656536e-09, + "loss": 0.6344, + "step": 9724 + }, + { + "epoch": 0.9933605720122574, + "grad_norm": 1.5763056876861297, + "learning_rate": 2.312055820723469e-09, + "loss": 0.8151, + "step": 9725 + }, + { + "epoch": 0.9934627170582226, + "grad_norm": 1.577924075792768, + "learning_rate": 2.2414655110858385e-09, + "loss": 0.7893, + "step": 9726 + }, + { + "epoch": 0.9935648621041879, + "grad_norm": 1.563689832023081, + "learning_rate": 2.1719694625788046e-09, + "loss": 0.6476, + "step": 9727 + }, + { + "epoch": 0.9936670071501532, + "grad_norm": 1.427880016951051, + "learning_rate": 2.103567682808505e-09, + "loss": 0.7267, + "step": 9728 + }, + { + "epoch": 0.9937691521961185, + "grad_norm": 1.4099044144074537, + "learning_rate": 2.036260179263394e-09, + "loss": 0.5795, + "step": 9729 + }, + { + "epoch": 0.9938712972420838, + "grad_norm": 1.5308154578199733, + "learning_rate": 1.970046959308691e-09, + "loss": 0.6904, + "step": 9730 + }, + { + "epoch": 0.9939734422880491, + "grad_norm": 1.5152319127819918, + "learning_rate": 1.9049280301919324e-09, + "loss": 0.6576, + "step": 9731 + }, + { + "epoch": 0.9940755873340144, + "grad_norm": 1.6258738679917872, + "learning_rate": 1.8409033990407498e-09, + "loss": 0.7749, + "step": 9732 + }, + { + "epoch": 0.9941777323799795, + "grad_norm": 1.5610616565163176, + "learning_rate": 1.7779730728617606e-09, + "loss": 0.6791, + "step": 9733 + }, + { + "epoch": 0.9942798774259448, + "grad_norm": 1.544344074422544, + "learning_rate": 1.7161370585427883e-09, + "loss": 0.7594, + "step": 9734 + }, + { + "epoch": 0.9943820224719101, + "grad_norm": 1.4595376190228746, + "learning_rate": 1.655395362852863e-09, + "loss": 0.6629, + "step": 9735 + }, + { + "epoch": 0.9944841675178754, + "grad_norm": 1.4811938931076363, + "learning_rate": 1.59574799244e-09, + "loss": 0.7351, + "step": 9736 + }, + { + "epoch": 0.9945863125638407, + "grad_norm": 1.5457695137180496, + "learning_rate": 1.5371949538323105e-09, + "loss": 0.5872, + "step": 9737 + }, + { + "epoch": 0.994688457609806, + "grad_norm": 1.489779053503393, + "learning_rate": 1.4797362534380022e-09, + "loss": 0.6846, + "step": 9738 + }, + { + "epoch": 0.9947906026557712, + "grad_norm": 1.6585808377352385, + "learning_rate": 1.4233718975464882e-09, + "loss": 0.6865, + "step": 9739 + }, + { + "epoch": 0.9948927477017364, + "grad_norm": 1.519439407015822, + "learning_rate": 1.3681018923272782e-09, + "loss": 0.6736, + "step": 9740 + }, + { + "epoch": 0.9949948927477017, + "grad_norm": 1.5999052402906233, + "learning_rate": 1.3139262438288669e-09, + "loss": 0.6347, + "step": 9741 + }, + { + "epoch": 0.995097037793667, + "grad_norm": 1.3866816015103618, + "learning_rate": 1.260844957982066e-09, + "loss": 0.6333, + "step": 9742 + }, + { + "epoch": 0.9951991828396323, + "grad_norm": 1.6134795770108707, + "learning_rate": 1.2088580405944518e-09, + "loss": 0.673, + "step": 9743 + }, + { + "epoch": 0.9953013278855976, + "grad_norm": 1.5431427787906, + "learning_rate": 1.1579654973581378e-09, + "loss": 0.7002, + "step": 9744 + }, + { + "epoch": 0.9954034729315628, + "grad_norm": 1.4031345199660101, + "learning_rate": 1.1081673338431132e-09, + "loss": 0.5915, + "step": 9745 + }, + { + "epoch": 0.9955056179775281, + "grad_norm": 1.4622381357635421, + "learning_rate": 1.0594635554983523e-09, + "loss": 0.6514, + "step": 9746 + }, + { + "epoch": 0.9956077630234934, + "grad_norm": 1.5920166256335029, + "learning_rate": 1.0118541676551464e-09, + "loss": 0.7037, + "step": 9747 + }, + { + "epoch": 0.9957099080694586, + "grad_norm": 1.4236857423848537, + "learning_rate": 9.653391755259922e-10, + "loss": 0.628, + "step": 9748 + }, + { + "epoch": 0.9958120531154239, + "grad_norm": 1.6001965627746497, + "learning_rate": 9.199185841990421e-10, + "loss": 0.7382, + "step": 9749 + }, + { + "epoch": 0.9959141981613892, + "grad_norm": 1.5231220871086684, + "learning_rate": 8.755923986480952e-10, + "loss": 0.7116, + "step": 9750 + }, + { + "epoch": 0.9960163432073544, + "grad_norm": 1.4768030596704838, + "learning_rate": 8.32360623724826e-10, + "loss": 0.6933, + "step": 9751 + }, + { + "epoch": 0.9961184882533197, + "grad_norm": 1.4130833974663959, + "learning_rate": 7.902232641587848e-10, + "loss": 0.6501, + "step": 9752 + }, + { + "epoch": 0.996220633299285, + "grad_norm": 1.6667250575018075, + "learning_rate": 7.49180324562948e-10, + "loss": 0.7086, + "step": 9753 + }, + { + "epoch": 0.9963227783452503, + "grad_norm": 1.4475514906626687, + "learning_rate": 7.092318094303885e-10, + "loss": 0.6168, + "step": 9754 + }, + { + "epoch": 0.9964249233912156, + "grad_norm": 1.3490862199882951, + "learning_rate": 6.70377723132054e-10, + "loss": 0.5767, + "step": 9755 + }, + { + "epoch": 0.9965270684371808, + "grad_norm": 1.4844768059310436, + "learning_rate": 6.326180699212092e-10, + "loss": 0.6089, + "step": 9756 + }, + { + "epoch": 0.996629213483146, + "grad_norm": 1.4230602273197972, + "learning_rate": 5.959528539312143e-10, + "loss": 0.702, + "step": 9757 + }, + { + "epoch": 0.9967313585291113, + "grad_norm": 1.5348659223309693, + "learning_rate": 5.603820791755255e-10, + "loss": 0.792, + "step": 9758 + }, + { + "epoch": 0.9968335035750766, + "grad_norm": 1.3595078039927926, + "learning_rate": 5.259057495454745e-10, + "loss": 0.5807, + "step": 9759 + }, + { + "epoch": 0.9969356486210419, + "grad_norm": 1.4200480888989915, + "learning_rate": 4.925238688147094e-10, + "loss": 0.7224, + "step": 9760 + }, + { + "epoch": 0.9970377936670072, + "grad_norm": 1.6254310117968056, + "learning_rate": 4.602364406391946e-10, + "loss": 0.6745, + "step": 9761 + }, + { + "epoch": 0.9971399387129725, + "grad_norm": 1.6436805560680758, + "learning_rate": 4.2904346855054956e-10, + "loss": 0.7283, + "step": 9762 + }, + { + "epoch": 0.9972420837589376, + "grad_norm": 1.405297572331721, + "learning_rate": 3.989449559638203e-10, + "loss": 0.7248, + "step": 9763 + }, + { + "epoch": 0.9973442288049029, + "grad_norm": 1.5040018132234394, + "learning_rate": 3.699409061730386e-10, + "loss": 0.7067, + "step": 9764 + }, + { + "epoch": 0.9974463738508682, + "grad_norm": 1.3729144049432551, + "learning_rate": 3.4203132235344216e-10, + "loss": 0.678, + "step": 9765 + }, + { + "epoch": 0.9975485188968335, + "grad_norm": 1.5627317186419327, + "learning_rate": 3.152162075581444e-10, + "loss": 0.5959, + "step": 9766 + }, + { + "epoch": 0.9976506639427988, + "grad_norm": 1.4802854560337235, + "learning_rate": 2.8949556472368525e-10, + "loss": 0.7342, + "step": 9767 + }, + { + "epoch": 0.9977528089887641, + "grad_norm": 1.463311580761161, + "learning_rate": 2.6486939666447995e-10, + "loss": 0.6555, + "step": 9768 + }, + { + "epoch": 0.9978549540347293, + "grad_norm": 1.4481895723207845, + "learning_rate": 2.4133770607615017e-10, + "loss": 0.675, + "step": 9769 + }, + { + "epoch": 0.9979570990806946, + "grad_norm": 1.4608510300673103, + "learning_rate": 2.1890049553330296e-10, + "loss": 0.6292, + "step": 9770 + }, + { + "epoch": 0.9980592441266598, + "grad_norm": 1.5097394213347115, + "learning_rate": 1.9755776749286192e-10, + "loss": 0.7008, + "step": 9771 + }, + { + "epoch": 0.9981613891726251, + "grad_norm": 1.4701023532438253, + "learning_rate": 1.7730952429073635e-10, + "loss": 0.6532, + "step": 9772 + }, + { + "epoch": 0.9982635342185904, + "grad_norm": 1.4968961650950559, + "learning_rate": 1.5815576814293132e-10, + "loss": 0.6848, + "step": 9773 + }, + { + "epoch": 0.9983656792645557, + "grad_norm": 1.4542113865838224, + "learning_rate": 1.4009650114554796e-10, + "loss": 0.6673, + "step": 9774 + }, + { + "epoch": 0.998467824310521, + "grad_norm": 1.6873313989121506, + "learning_rate": 1.2313172527478322e-10, + "loss": 0.7112, + "step": 9775 + }, + { + "epoch": 0.9985699693564862, + "grad_norm": 1.3646728786806015, + "learning_rate": 1.0726144238804027e-10, + "loss": 0.6289, + "step": 9776 + }, + { + "epoch": 0.9986721144024515, + "grad_norm": 1.5085952661021869, + "learning_rate": 9.248565422281808e-11, + "loss": 0.6718, + "step": 9777 + }, + { + "epoch": 0.9987742594484168, + "grad_norm": 1.5267520961320835, + "learning_rate": 7.88043623956014e-11, + "loss": 0.6658, + "step": 9778 + }, + { + "epoch": 0.998876404494382, + "grad_norm": 1.5776261395842917, + "learning_rate": 6.621756840408111e-11, + "loss": 0.6793, + "step": 9779 + }, + { + "epoch": 0.9989785495403473, + "grad_norm": 1.4143593982621947, + "learning_rate": 5.472527362604396e-11, + "loss": 0.6749, + "step": 9780 + }, + { + "epoch": 0.9990806945863125, + "grad_norm": 1.6317060793543992, + "learning_rate": 4.4327479319372647e-11, + "loss": 0.7876, + "step": 9781 + }, + { + "epoch": 0.9991828396322778, + "grad_norm": 1.4425819779577098, + "learning_rate": 3.502418662093554e-11, + "loss": 0.7695, + "step": 9782 + }, + { + "epoch": 0.9992849846782431, + "grad_norm": 1.4936037265260123, + "learning_rate": 2.6815396549917383e-11, + "loss": 0.6939, + "step": 9783 + }, + { + "epoch": 0.9993871297242084, + "grad_norm": 1.5272704943858435, + "learning_rate": 1.9701110005598822e-11, + "loss": 0.638, + "step": 9784 + }, + { + "epoch": 0.9994892747701737, + "grad_norm": 1.4363305195173535, + "learning_rate": 1.3681327765135976e-11, + "loss": 0.6003, + "step": 9785 + }, + { + "epoch": 0.999591419816139, + "grad_norm": 1.528606615469795, + "learning_rate": 8.756050489111546e-12, + "loss": 0.6423, + "step": 9786 + }, + { + "epoch": 0.9996935648621041, + "grad_norm": 1.5275074852477024, + "learning_rate": 4.925278714873472e-12, + "loss": 0.7543, + "step": 9787 + }, + { + "epoch": 0.9997957099080694, + "grad_norm": 1.3559350401499568, + "learning_rate": 2.1890128620860597e-12, + "loss": 0.705, + "step": 9788 + }, + { + "epoch": 0.9998978549540347, + "grad_norm": 1.6260612351761496, + "learning_rate": 5.472532305095257e-13, + "loss": 0.6681, + "step": 9789 + }, + { + "epoch": 1.0, + "grad_norm": 1.502086650767526, + "learning_rate": 0.0, + "loss": 0.6916, + "step": 9790 + }, + { + "epoch": 1.0, + "step": 9790, + "total_flos": 1611723555823616.0, + "train_loss": 0.7202048060142714, + "train_runtime": 204003.8028, + "train_samples_per_second": 6.142, + "train_steps_per_second": 0.048 + } + ], + "logging_steps": 1.0, + "max_steps": 9790, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1611723555823616.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}