diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,31172 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999155096178218, + "eval_steps": 500, + "global_step": 4438, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00022530768580843213, + "grad_norm": 11.936173879164299, + "learning_rate": 2.2522522522522524e-08, + "loss": 0.9961, + "step": 1 + }, + { + "epoch": 0.00045061537161686426, + "grad_norm": 11.109717764505199, + "learning_rate": 4.504504504504505e-08, + "loss": 0.9773, + "step": 2 + }, + { + "epoch": 0.0006759230574252964, + "grad_norm": 11.48763977437855, + "learning_rate": 6.756756756756757e-08, + "loss": 0.9666, + "step": 3 + }, + { + "epoch": 0.0009012307432337285, + "grad_norm": 12.921066319372066, + "learning_rate": 9.00900900900901e-08, + "loss": 1.0508, + "step": 4 + }, + { + "epoch": 0.0011265384290421608, + "grad_norm": 14.398642965351947, + "learning_rate": 1.1261261261261262e-07, + "loss": 1.0749, + "step": 5 + }, + { + "epoch": 0.0013518461148505929, + "grad_norm": 13.68891847742913, + "learning_rate": 1.3513513513513515e-07, + "loss": 1.0152, + "step": 6 + }, + { + "epoch": 0.001577153800659025, + "grad_norm": 11.354396751595853, + "learning_rate": 1.5765765765765766e-07, + "loss": 0.9314, + "step": 7 + }, + { + "epoch": 0.001802461486467457, + "grad_norm": 13.329093094043536, + "learning_rate": 1.801801801801802e-07, + "loss": 1.0694, + "step": 8 + }, + { + "epoch": 0.0020277691722758893, + "grad_norm": 12.982013179241148, + "learning_rate": 2.0270270270270273e-07, + "loss": 1.0219, + "step": 9 + }, + { + "epoch": 0.0022530768580843216, + "grad_norm": 12.097199257094674, + "learning_rate": 2.2522522522522524e-07, + "loss": 1.006, + "step": 10 + }, + { + "epoch": 0.0024783845438927535, + "grad_norm": 13.56967783294782, + "learning_rate": 2.477477477477478e-07, + "loss": 1.1272, + "step": 11 + }, + { + "epoch": 0.0027036922297011858, + "grad_norm": 11.70696672374608, + "learning_rate": 2.702702702702703e-07, + "loss": 0.9947, + "step": 12 + }, + { + "epoch": 0.0029289999155096176, + "grad_norm": 13.133925840485313, + "learning_rate": 2.927927927927928e-07, + "loss": 1.0601, + "step": 13 + }, + { + "epoch": 0.00315430760131805, + "grad_norm": 14.354181941939034, + "learning_rate": 3.153153153153153e-07, + "loss": 1.1442, + "step": 14 + }, + { + "epoch": 0.003379615287126482, + "grad_norm": 11.37630808176472, + "learning_rate": 3.378378378378379e-07, + "loss": 0.9655, + "step": 15 + }, + { + "epoch": 0.003604922972934914, + "grad_norm": 12.655498444889833, + "learning_rate": 3.603603603603604e-07, + "loss": 1.0888, + "step": 16 + }, + { + "epoch": 0.0038302306587433463, + "grad_norm": 13.682301381890513, + "learning_rate": 3.828828828828829e-07, + "loss": 1.0917, + "step": 17 + }, + { + "epoch": 0.004055538344551779, + "grad_norm": 13.293859452525615, + "learning_rate": 4.0540540540540546e-07, + "loss": 1.1195, + "step": 18 + }, + { + "epoch": 0.0042808460303602105, + "grad_norm": 11.560850043327703, + "learning_rate": 4.27927927927928e-07, + "loss": 1.0578, + "step": 19 + }, + { + "epoch": 0.004506153716168643, + "grad_norm": 10.425079898884702, + "learning_rate": 4.504504504504505e-07, + "loss": 0.9451, + "step": 20 + }, + { + "epoch": 0.004731461401977075, + "grad_norm": 10.486269444929182, + "learning_rate": 4.7297297297297305e-07, + "loss": 0.9598, + "step": 21 + }, + { + "epoch": 0.004956769087785507, + "grad_norm": 12.15628496098293, + "learning_rate": 4.954954954954956e-07, + "loss": 0.9971, + "step": 22 + }, + { + "epoch": 0.00518207677359394, + "grad_norm": 9.204174105290507, + "learning_rate": 5.180180180180181e-07, + "loss": 0.8835, + "step": 23 + }, + { + "epoch": 0.0054073844594023715, + "grad_norm": 11.066848193136316, + "learning_rate": 5.405405405405406e-07, + "loss": 0.923, + "step": 24 + }, + { + "epoch": 0.005632692145210803, + "grad_norm": 9.36860704890201, + "learning_rate": 5.630630630630631e-07, + "loss": 0.8674, + "step": 25 + }, + { + "epoch": 0.005857999831019235, + "grad_norm": 9.522248170195617, + "learning_rate": 5.855855855855856e-07, + "loss": 0.897, + "step": 26 + }, + { + "epoch": 0.006083307516827668, + "grad_norm": 11.736110183897287, + "learning_rate": 6.081081081081082e-07, + "loss": 1.0201, + "step": 27 + }, + { + "epoch": 0.0063086152026361, + "grad_norm": 7.317254552101622, + "learning_rate": 6.306306306306306e-07, + "loss": 0.7827, + "step": 28 + }, + { + "epoch": 0.006533922888444532, + "grad_norm": 7.614150097450258, + "learning_rate": 6.531531531531532e-07, + "loss": 0.805, + "step": 29 + }, + { + "epoch": 0.006759230574252964, + "grad_norm": 6.154341588900648, + "learning_rate": 6.756756756756758e-07, + "loss": 0.7434, + "step": 30 + }, + { + "epoch": 0.006984538260061396, + "grad_norm": 6.970999560013524, + "learning_rate": 6.981981981981982e-07, + "loss": 0.784, + "step": 31 + }, + { + "epoch": 0.007209845945869828, + "grad_norm": 7.073202092815235, + "learning_rate": 7.207207207207208e-07, + "loss": 0.7986, + "step": 32 + }, + { + "epoch": 0.007435153631678261, + "grad_norm": 6.525114927434426, + "learning_rate": 7.432432432432434e-07, + "loss": 0.7886, + "step": 33 + }, + { + "epoch": 0.007660461317486693, + "grad_norm": 5.687910470756995, + "learning_rate": 7.657657657657658e-07, + "loss": 0.682, + "step": 34 + }, + { + "epoch": 0.007885769003295125, + "grad_norm": 5.634933474200907, + "learning_rate": 7.882882882882883e-07, + "loss": 0.7294, + "step": 35 + }, + { + "epoch": 0.008111076689103557, + "grad_norm": 5.655833927387277, + "learning_rate": 8.108108108108109e-07, + "loss": 0.712, + "step": 36 + }, + { + "epoch": 0.008336384374911989, + "grad_norm": 4.708138709017598, + "learning_rate": 8.333333333333333e-07, + "loss": 0.6915, + "step": 37 + }, + { + "epoch": 0.008561692060720421, + "grad_norm": 4.05288774197748, + "learning_rate": 8.55855855855856e-07, + "loss": 0.6623, + "step": 38 + }, + { + "epoch": 0.008786999746528853, + "grad_norm": 1.9317291217592878, + "learning_rate": 8.783783783783785e-07, + "loss": 0.5708, + "step": 39 + }, + { + "epoch": 0.009012307432337286, + "grad_norm": 1.9094234958133491, + "learning_rate": 9.00900900900901e-07, + "loss": 0.5735, + "step": 40 + }, + { + "epoch": 0.009237615118145718, + "grad_norm": 1.9750004562335615, + "learning_rate": 9.234234234234235e-07, + "loss": 0.6037, + "step": 41 + }, + { + "epoch": 0.00946292280395415, + "grad_norm": 1.7896735937996355, + "learning_rate": 9.459459459459461e-07, + "loss": 0.5713, + "step": 42 + }, + { + "epoch": 0.009688230489762582, + "grad_norm": 1.7918525252070632, + "learning_rate": 9.684684684684686e-07, + "loss": 0.5382, + "step": 43 + }, + { + "epoch": 0.009913538175571014, + "grad_norm": 1.6885654083308477, + "learning_rate": 9.909909909909911e-07, + "loss": 0.5277, + "step": 44 + }, + { + "epoch": 0.010138845861379446, + "grad_norm": 1.6601818064455918, + "learning_rate": 1.0135135135135136e-06, + "loss": 0.5121, + "step": 45 + }, + { + "epoch": 0.01036415354718788, + "grad_norm": 1.5559165811046038, + "learning_rate": 1.0360360360360361e-06, + "loss": 0.5216, + "step": 46 + }, + { + "epoch": 0.010589461232996311, + "grad_norm": 1.4185291563062736, + "learning_rate": 1.0585585585585587e-06, + "loss": 0.4737, + "step": 47 + }, + { + "epoch": 0.010814768918804743, + "grad_norm": 1.481143710121075, + "learning_rate": 1.0810810810810812e-06, + "loss": 0.5297, + "step": 48 + }, + { + "epoch": 0.011040076604613175, + "grad_norm": 1.311870893356728, + "learning_rate": 1.1036036036036037e-06, + "loss": 0.5064, + "step": 49 + }, + { + "epoch": 0.011265384290421607, + "grad_norm": 1.1678899477800346, + "learning_rate": 1.1261261261261262e-06, + "loss": 0.4745, + "step": 50 + }, + { + "epoch": 0.011490691976230039, + "grad_norm": 1.1252132156784977, + "learning_rate": 1.148648648648649e-06, + "loss": 0.4614, + "step": 51 + }, + { + "epoch": 0.01171599966203847, + "grad_norm": 1.1288733340157195, + "learning_rate": 1.1711711711711712e-06, + "loss": 0.4774, + "step": 52 + }, + { + "epoch": 0.011941307347846904, + "grad_norm": 1.1130362968149783, + "learning_rate": 1.1936936936936937e-06, + "loss": 0.4761, + "step": 53 + }, + { + "epoch": 0.012166615033655336, + "grad_norm": 1.1120583963624748, + "learning_rate": 1.2162162162162164e-06, + "loss": 0.4498, + "step": 54 + }, + { + "epoch": 0.012391922719463768, + "grad_norm": 1.1292463332174272, + "learning_rate": 1.2387387387387387e-06, + "loss": 0.4798, + "step": 55 + }, + { + "epoch": 0.0126172304052722, + "grad_norm": 1.0157431262393812, + "learning_rate": 1.2612612612612613e-06, + "loss": 0.4424, + "step": 56 + }, + { + "epoch": 0.012842538091080631, + "grad_norm": 0.9570599129653108, + "learning_rate": 1.2837837837837838e-06, + "loss": 0.4278, + "step": 57 + }, + { + "epoch": 0.013067845776889063, + "grad_norm": 0.9153558372380529, + "learning_rate": 1.3063063063063065e-06, + "loss": 0.3809, + "step": 58 + }, + { + "epoch": 0.013293153462697497, + "grad_norm": 0.9177872785198733, + "learning_rate": 1.328828828828829e-06, + "loss": 0.4381, + "step": 59 + }, + { + "epoch": 0.013518461148505929, + "grad_norm": 0.9062402042670152, + "learning_rate": 1.3513513513513515e-06, + "loss": 0.4249, + "step": 60 + }, + { + "epoch": 0.01374376883431436, + "grad_norm": 0.8302727940931209, + "learning_rate": 1.373873873873874e-06, + "loss": 0.4317, + "step": 61 + }, + { + "epoch": 0.013969076520122792, + "grad_norm": 0.7985161152291435, + "learning_rate": 1.3963963963963963e-06, + "loss": 0.4119, + "step": 62 + }, + { + "epoch": 0.014194384205931224, + "grad_norm": 0.6788235342412323, + "learning_rate": 1.418918918918919e-06, + "loss": 0.3513, + "step": 63 + }, + { + "epoch": 0.014419691891739656, + "grad_norm": 0.8572236513498243, + "learning_rate": 1.4414414414414416e-06, + "loss": 0.4588, + "step": 64 + }, + { + "epoch": 0.01464499957754809, + "grad_norm": 0.7302922959729372, + "learning_rate": 1.463963963963964e-06, + "loss": 0.3911, + "step": 65 + }, + { + "epoch": 0.014870307263356522, + "grad_norm": 0.7824591387989931, + "learning_rate": 1.4864864864864868e-06, + "loss": 0.4037, + "step": 66 + }, + { + "epoch": 0.015095614949164953, + "grad_norm": 0.7810713906428906, + "learning_rate": 1.5090090090090093e-06, + "loss": 0.4135, + "step": 67 + }, + { + "epoch": 0.015320922634973385, + "grad_norm": 0.707342989030676, + "learning_rate": 1.5315315315315316e-06, + "loss": 0.3545, + "step": 68 + }, + { + "epoch": 0.015546230320781817, + "grad_norm": 0.7141705912615383, + "learning_rate": 1.5540540540540541e-06, + "loss": 0.385, + "step": 69 + }, + { + "epoch": 0.01577153800659025, + "grad_norm": 0.7120123045743703, + "learning_rate": 1.5765765765765766e-06, + "loss": 0.3873, + "step": 70 + }, + { + "epoch": 0.015996845692398683, + "grad_norm": 0.6978830675241943, + "learning_rate": 1.5990990990990993e-06, + "loss": 0.4184, + "step": 71 + }, + { + "epoch": 0.016222153378207115, + "grad_norm": 0.7306319139671842, + "learning_rate": 1.6216216216216219e-06, + "loss": 0.388, + "step": 72 + }, + { + "epoch": 0.016447461064015546, + "grad_norm": 0.6268268474684758, + "learning_rate": 1.6441441441441444e-06, + "loss": 0.3404, + "step": 73 + }, + { + "epoch": 0.016672768749823978, + "grad_norm": 0.6487573951668212, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.3624, + "step": 74 + }, + { + "epoch": 0.01689807643563241, + "grad_norm": 0.6484208913961328, + "learning_rate": 1.6891891891891894e-06, + "loss": 0.3556, + "step": 75 + }, + { + "epoch": 0.017123384121440842, + "grad_norm": 0.6840718234856263, + "learning_rate": 1.711711711711712e-06, + "loss": 0.3467, + "step": 76 + }, + { + "epoch": 0.017348691807249274, + "grad_norm": 0.659807580581444, + "learning_rate": 1.7342342342342344e-06, + "loss": 0.342, + "step": 77 + }, + { + "epoch": 0.017573999493057706, + "grad_norm": 0.6512037549209214, + "learning_rate": 1.756756756756757e-06, + "loss": 0.3659, + "step": 78 + }, + { + "epoch": 0.017799307178866138, + "grad_norm": 0.6874746835034562, + "learning_rate": 1.7792792792792792e-06, + "loss": 0.329, + "step": 79 + }, + { + "epoch": 0.018024614864674573, + "grad_norm": 0.6652204095219298, + "learning_rate": 1.801801801801802e-06, + "loss": 0.3651, + "step": 80 + }, + { + "epoch": 0.018249922550483005, + "grad_norm": 0.6009544505838627, + "learning_rate": 1.8243243243243245e-06, + "loss": 0.3134, + "step": 81 + }, + { + "epoch": 0.018475230236291437, + "grad_norm": 0.6373729682974445, + "learning_rate": 1.846846846846847e-06, + "loss": 0.3559, + "step": 82 + }, + { + "epoch": 0.01870053792209987, + "grad_norm": 0.6739503059036726, + "learning_rate": 1.8693693693693697e-06, + "loss": 0.3555, + "step": 83 + }, + { + "epoch": 0.0189258456079083, + "grad_norm": 0.6242123260713159, + "learning_rate": 1.8918918918918922e-06, + "loss": 0.3548, + "step": 84 + }, + { + "epoch": 0.019151153293716732, + "grad_norm": 0.5792884692336053, + "learning_rate": 1.9144144144144145e-06, + "loss": 0.2975, + "step": 85 + }, + { + "epoch": 0.019376460979525164, + "grad_norm": 0.5687785943256655, + "learning_rate": 1.9369369369369372e-06, + "loss": 0.297, + "step": 86 + }, + { + "epoch": 0.019601768665333596, + "grad_norm": 0.6246514600261033, + "learning_rate": 1.9594594594594595e-06, + "loss": 0.333, + "step": 87 + }, + { + "epoch": 0.019827076351142028, + "grad_norm": 0.6512842111920555, + "learning_rate": 1.9819819819819822e-06, + "loss": 0.3285, + "step": 88 + }, + { + "epoch": 0.02005238403695046, + "grad_norm": 0.5765256910098763, + "learning_rate": 2.0045045045045045e-06, + "loss": 0.3211, + "step": 89 + }, + { + "epoch": 0.02027769172275889, + "grad_norm": 0.63442242133845, + "learning_rate": 2.0270270270270273e-06, + "loss": 0.3247, + "step": 90 + }, + { + "epoch": 0.020502999408567323, + "grad_norm": 0.6553802581121276, + "learning_rate": 2.0495495495495496e-06, + "loss": 0.3565, + "step": 91 + }, + { + "epoch": 0.02072830709437576, + "grad_norm": 0.659534380013316, + "learning_rate": 2.0720720720720723e-06, + "loss": 0.3338, + "step": 92 + }, + { + "epoch": 0.02095361478018419, + "grad_norm": 0.5985623899439386, + "learning_rate": 2.0945945945945946e-06, + "loss": 0.3348, + "step": 93 + }, + { + "epoch": 0.021178922465992622, + "grad_norm": 0.6480322141678077, + "learning_rate": 2.1171171171171173e-06, + "loss": 0.3413, + "step": 94 + }, + { + "epoch": 0.021404230151801054, + "grad_norm": 0.5868758681500902, + "learning_rate": 2.13963963963964e-06, + "loss": 0.3093, + "step": 95 + }, + { + "epoch": 0.021629537837609486, + "grad_norm": 0.6189631518226665, + "learning_rate": 2.1621621621621623e-06, + "loss": 0.3523, + "step": 96 + }, + { + "epoch": 0.021854845523417918, + "grad_norm": 0.5987025667709506, + "learning_rate": 2.1846846846846846e-06, + "loss": 0.3261, + "step": 97 + }, + { + "epoch": 0.02208015320922635, + "grad_norm": 0.6448724071398437, + "learning_rate": 2.2072072072072073e-06, + "loss": 0.3524, + "step": 98 + }, + { + "epoch": 0.02230546089503478, + "grad_norm": 0.6297715682432312, + "learning_rate": 2.22972972972973e-06, + "loss": 0.3413, + "step": 99 + }, + { + "epoch": 0.022530768580843213, + "grad_norm": 0.6021319322145385, + "learning_rate": 2.2522522522522524e-06, + "loss": 0.2919, + "step": 100 + }, + { + "epoch": 0.022756076266651645, + "grad_norm": 0.6060171552253918, + "learning_rate": 2.274774774774775e-06, + "loss": 0.2985, + "step": 101 + }, + { + "epoch": 0.022981383952460077, + "grad_norm": 0.5860236698371681, + "learning_rate": 2.297297297297298e-06, + "loss": 0.3112, + "step": 102 + }, + { + "epoch": 0.02320669163826851, + "grad_norm": 0.5975203183353793, + "learning_rate": 2.31981981981982e-06, + "loss": 0.3041, + "step": 103 + }, + { + "epoch": 0.02343199932407694, + "grad_norm": 0.5871301449536451, + "learning_rate": 2.3423423423423424e-06, + "loss": 0.3032, + "step": 104 + }, + { + "epoch": 0.023657307009885376, + "grad_norm": 0.6316618555182978, + "learning_rate": 2.364864864864865e-06, + "loss": 0.3341, + "step": 105 + }, + { + "epoch": 0.023882614695693808, + "grad_norm": 0.7023154810702413, + "learning_rate": 2.3873873873873874e-06, + "loss": 0.3161, + "step": 106 + }, + { + "epoch": 0.02410792238150224, + "grad_norm": 0.8198651637851923, + "learning_rate": 2.40990990990991e-06, + "loss": 0.3012, + "step": 107 + }, + { + "epoch": 0.024333230067310672, + "grad_norm": 0.5808646835139879, + "learning_rate": 2.432432432432433e-06, + "loss": 0.3111, + "step": 108 + }, + { + "epoch": 0.024558537753119104, + "grad_norm": 0.5740220072963436, + "learning_rate": 2.454954954954955e-06, + "loss": 0.3277, + "step": 109 + }, + { + "epoch": 0.024783845438927535, + "grad_norm": 0.5655280662205966, + "learning_rate": 2.4774774774774775e-06, + "loss": 0.2846, + "step": 110 + }, + { + "epoch": 0.025009153124735967, + "grad_norm": 0.5980545328698392, + "learning_rate": 2.5e-06, + "loss": 0.3178, + "step": 111 + }, + { + "epoch": 0.0252344608105444, + "grad_norm": 0.5681018063872556, + "learning_rate": 2.5225225225225225e-06, + "loss": 0.3374, + "step": 112 + }, + { + "epoch": 0.02545976849635283, + "grad_norm": 0.5213536695778083, + "learning_rate": 2.5450450450450452e-06, + "loss": 0.2762, + "step": 113 + }, + { + "epoch": 0.025685076182161263, + "grad_norm": 0.5894435916703348, + "learning_rate": 2.5675675675675675e-06, + "loss": 0.3014, + "step": 114 + }, + { + "epoch": 0.025910383867969695, + "grad_norm": 0.6134410310091715, + "learning_rate": 2.5900900900900907e-06, + "loss": 0.3266, + "step": 115 + }, + { + "epoch": 0.026135691553778127, + "grad_norm": 0.5705140807622049, + "learning_rate": 2.612612612612613e-06, + "loss": 0.304, + "step": 116 + }, + { + "epoch": 0.026360999239586562, + "grad_norm": 0.6011386424919888, + "learning_rate": 2.6351351351351353e-06, + "loss": 0.3139, + "step": 117 + }, + { + "epoch": 0.026586306925394994, + "grad_norm": 0.5546073985402236, + "learning_rate": 2.657657657657658e-06, + "loss": 0.3063, + "step": 118 + }, + { + "epoch": 0.026811614611203426, + "grad_norm": 0.5549497978809252, + "learning_rate": 2.6801801801801803e-06, + "loss": 0.293, + "step": 119 + }, + { + "epoch": 0.027036922297011858, + "grad_norm": 0.5837056671669407, + "learning_rate": 2.702702702702703e-06, + "loss": 0.3102, + "step": 120 + }, + { + "epoch": 0.02726222998282029, + "grad_norm": 0.5744799226261981, + "learning_rate": 2.7252252252252253e-06, + "loss": 0.2986, + "step": 121 + }, + { + "epoch": 0.02748753766862872, + "grad_norm": 0.5439720751333971, + "learning_rate": 2.747747747747748e-06, + "loss": 0.3033, + "step": 122 + }, + { + "epoch": 0.027712845354437153, + "grad_norm": 0.5474247941587266, + "learning_rate": 2.7702702702702703e-06, + "loss": 0.2858, + "step": 123 + }, + { + "epoch": 0.027938153040245585, + "grad_norm": 0.6283331854834596, + "learning_rate": 2.7927927927927926e-06, + "loss": 0.3189, + "step": 124 + }, + { + "epoch": 0.028163460726054017, + "grad_norm": 0.5603994323773408, + "learning_rate": 2.8153153153153158e-06, + "loss": 0.2887, + "step": 125 + }, + { + "epoch": 0.02838876841186245, + "grad_norm": 0.5644680365666778, + "learning_rate": 2.837837837837838e-06, + "loss": 0.3153, + "step": 126 + }, + { + "epoch": 0.02861407609767088, + "grad_norm": 0.5759974006009255, + "learning_rate": 2.860360360360361e-06, + "loss": 0.3141, + "step": 127 + }, + { + "epoch": 0.028839383783479312, + "grad_norm": 0.5789611651696865, + "learning_rate": 2.882882882882883e-06, + "loss": 0.2898, + "step": 128 + }, + { + "epoch": 0.029064691469287748, + "grad_norm": 0.6009295302156721, + "learning_rate": 2.9054054054054054e-06, + "loss": 0.3116, + "step": 129 + }, + { + "epoch": 0.02928999915509618, + "grad_norm": 0.5747588495071535, + "learning_rate": 2.927927927927928e-06, + "loss": 0.3174, + "step": 130 + }, + { + "epoch": 0.02951530684090461, + "grad_norm": 0.6201794666398632, + "learning_rate": 2.9504504504504504e-06, + "loss": 0.2812, + "step": 131 + }, + { + "epoch": 0.029740614526713043, + "grad_norm": 0.5720112683629468, + "learning_rate": 2.9729729729729736e-06, + "loss": 0.3085, + "step": 132 + }, + { + "epoch": 0.029965922212521475, + "grad_norm": 0.5910233820294922, + "learning_rate": 2.995495495495496e-06, + "loss": 0.3063, + "step": 133 + }, + { + "epoch": 0.030191229898329907, + "grad_norm": 0.589120520987268, + "learning_rate": 3.0180180180180186e-06, + "loss": 0.2866, + "step": 134 + }, + { + "epoch": 0.03041653758413834, + "grad_norm": 0.5515445749935719, + "learning_rate": 3.040540540540541e-06, + "loss": 0.3004, + "step": 135 + }, + { + "epoch": 0.03064184526994677, + "grad_norm": 0.5754269057699644, + "learning_rate": 3.063063063063063e-06, + "loss": 0.3014, + "step": 136 + }, + { + "epoch": 0.030867152955755203, + "grad_norm": 0.5638039350243716, + "learning_rate": 3.085585585585586e-06, + "loss": 0.2916, + "step": 137 + }, + { + "epoch": 0.031092460641563634, + "grad_norm": 0.5999708231413027, + "learning_rate": 3.1081081081081082e-06, + "loss": 0.296, + "step": 138 + }, + { + "epoch": 0.03131776832737207, + "grad_norm": 0.579400745613337, + "learning_rate": 3.130630630630631e-06, + "loss": 0.2896, + "step": 139 + }, + { + "epoch": 0.0315430760131805, + "grad_norm": 0.564389230700374, + "learning_rate": 3.1531531531531532e-06, + "loss": 0.279, + "step": 140 + }, + { + "epoch": 0.03176838369898893, + "grad_norm": 0.556887123989689, + "learning_rate": 3.1756756756756755e-06, + "loss": 0.2864, + "step": 141 + }, + { + "epoch": 0.031993691384797365, + "grad_norm": 0.5798090380099931, + "learning_rate": 3.1981981981981987e-06, + "loss": 0.2976, + "step": 142 + }, + { + "epoch": 0.0322189990706058, + "grad_norm": 0.60161592787678, + "learning_rate": 3.220720720720721e-06, + "loss": 0.3104, + "step": 143 + }, + { + "epoch": 0.03244430675641423, + "grad_norm": 0.5913558650343905, + "learning_rate": 3.2432432432432437e-06, + "loss": 0.2952, + "step": 144 + }, + { + "epoch": 0.03266961444222266, + "grad_norm": 0.6258099277660234, + "learning_rate": 3.265765765765766e-06, + "loss": 0.3065, + "step": 145 + }, + { + "epoch": 0.03289492212803109, + "grad_norm": 0.5439327977956886, + "learning_rate": 3.2882882882882887e-06, + "loss": 0.2748, + "step": 146 + }, + { + "epoch": 0.033120229813839525, + "grad_norm": 0.5400895120263501, + "learning_rate": 3.310810810810811e-06, + "loss": 0.2701, + "step": 147 + }, + { + "epoch": 0.033345537499647956, + "grad_norm": 0.567447838869882, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.2727, + "step": 148 + }, + { + "epoch": 0.03357084518545639, + "grad_norm": 0.5992503091639352, + "learning_rate": 3.3558558558558565e-06, + "loss": 0.2707, + "step": 149 + }, + { + "epoch": 0.03379615287126482, + "grad_norm": 0.5879995880852936, + "learning_rate": 3.3783783783783788e-06, + "loss": 0.2912, + "step": 150 + }, + { + "epoch": 0.03402146055707325, + "grad_norm": 0.5446531705812507, + "learning_rate": 3.4009009009009015e-06, + "loss": 0.2871, + "step": 151 + }, + { + "epoch": 0.034246768242881684, + "grad_norm": 0.5765949589037657, + "learning_rate": 3.423423423423424e-06, + "loss": 0.2519, + "step": 152 + }, + { + "epoch": 0.034472075928690116, + "grad_norm": 0.6044291836429194, + "learning_rate": 3.445945945945946e-06, + "loss": 0.2699, + "step": 153 + }, + { + "epoch": 0.03469738361449855, + "grad_norm": 0.5449462276919425, + "learning_rate": 3.468468468468469e-06, + "loss": 0.2789, + "step": 154 + }, + { + "epoch": 0.03492269130030698, + "grad_norm": 0.5466393002721017, + "learning_rate": 3.490990990990991e-06, + "loss": 0.2811, + "step": 155 + }, + { + "epoch": 0.03514799898611541, + "grad_norm": 0.6341319797629209, + "learning_rate": 3.513513513513514e-06, + "loss": 0.308, + "step": 156 + }, + { + "epoch": 0.03537330667192384, + "grad_norm": 0.581316008997504, + "learning_rate": 3.536036036036036e-06, + "loss": 0.2687, + "step": 157 + }, + { + "epoch": 0.035598614357732275, + "grad_norm": 0.617573521990852, + "learning_rate": 3.5585585585585584e-06, + "loss": 0.2599, + "step": 158 + }, + { + "epoch": 0.03582392204354071, + "grad_norm": 0.5221543477733095, + "learning_rate": 3.5810810810810816e-06, + "loss": 0.2596, + "step": 159 + }, + { + "epoch": 0.036049229729349146, + "grad_norm": 0.6603266674791795, + "learning_rate": 3.603603603603604e-06, + "loss": 0.3211, + "step": 160 + }, + { + "epoch": 0.03627453741515758, + "grad_norm": 0.5955458903480422, + "learning_rate": 3.6261261261261266e-06, + "loss": 0.2923, + "step": 161 + }, + { + "epoch": 0.03649984510096601, + "grad_norm": 0.6055380301297932, + "learning_rate": 3.648648648648649e-06, + "loss": 0.2906, + "step": 162 + }, + { + "epoch": 0.03672515278677444, + "grad_norm": 0.531818869967569, + "learning_rate": 3.6711711711711716e-06, + "loss": 0.2629, + "step": 163 + }, + { + "epoch": 0.03695046047258287, + "grad_norm": 0.5753444861615965, + "learning_rate": 3.693693693693694e-06, + "loss": 0.2845, + "step": 164 + }, + { + "epoch": 0.037175768158391305, + "grad_norm": 0.5564830966738844, + "learning_rate": 3.7162162162162162e-06, + "loss": 0.2603, + "step": 165 + }, + { + "epoch": 0.03740107584419974, + "grad_norm": 0.5501050244400439, + "learning_rate": 3.7387387387387394e-06, + "loss": 0.2734, + "step": 166 + }, + { + "epoch": 0.03762638353000817, + "grad_norm": 0.5660023140704108, + "learning_rate": 3.7612612612612612e-06, + "loss": 0.2894, + "step": 167 + }, + { + "epoch": 0.0378516912158166, + "grad_norm": 0.6219021743315375, + "learning_rate": 3.7837837837837844e-06, + "loss": 0.2883, + "step": 168 + }, + { + "epoch": 0.03807699890162503, + "grad_norm": 0.5270550226341922, + "learning_rate": 3.8063063063063067e-06, + "loss": 0.2693, + "step": 169 + }, + { + "epoch": 0.038302306587433464, + "grad_norm": 0.5331833000506953, + "learning_rate": 3.828828828828829e-06, + "loss": 0.2777, + "step": 170 + }, + { + "epoch": 0.038527614273241896, + "grad_norm": 0.565068820038, + "learning_rate": 3.851351351351352e-06, + "loss": 0.276, + "step": 171 + }, + { + "epoch": 0.03875292195905033, + "grad_norm": 0.5780634546593982, + "learning_rate": 3.8738738738738744e-06, + "loss": 0.2747, + "step": 172 + }, + { + "epoch": 0.03897822964485876, + "grad_norm": 0.5307406453698116, + "learning_rate": 3.896396396396397e-06, + "loss": 0.2608, + "step": 173 + }, + { + "epoch": 0.03920353733066719, + "grad_norm": 0.5896546885597862, + "learning_rate": 3.918918918918919e-06, + "loss": 0.2918, + "step": 174 + }, + { + "epoch": 0.039428845016475624, + "grad_norm": 0.6554028736684729, + "learning_rate": 3.941441441441442e-06, + "loss": 0.2732, + "step": 175 + }, + { + "epoch": 0.039654152702284055, + "grad_norm": 0.5879690965569474, + "learning_rate": 3.9639639639639645e-06, + "loss": 0.2746, + "step": 176 + }, + { + "epoch": 0.03987946038809249, + "grad_norm": 0.5179332240216997, + "learning_rate": 3.986486486486487e-06, + "loss": 0.2458, + "step": 177 + }, + { + "epoch": 0.04010476807390092, + "grad_norm": 0.5367943905708724, + "learning_rate": 4.009009009009009e-06, + "loss": 0.2827, + "step": 178 + }, + { + "epoch": 0.04033007575970935, + "grad_norm": 0.5320197577269457, + "learning_rate": 4.031531531531531e-06, + "loss": 0.2663, + "step": 179 + }, + { + "epoch": 0.04055538344551778, + "grad_norm": 0.5641722252535161, + "learning_rate": 4.0540540540540545e-06, + "loss": 0.2895, + "step": 180 + }, + { + "epoch": 0.040780691131326215, + "grad_norm": 0.5945830312609298, + "learning_rate": 4.076576576576577e-06, + "loss": 0.3002, + "step": 181 + }, + { + "epoch": 0.041005998817134647, + "grad_norm": 0.5149067756946532, + "learning_rate": 4.099099099099099e-06, + "loss": 0.2427, + "step": 182 + }, + { + "epoch": 0.04123130650294308, + "grad_norm": 0.5051078922831654, + "learning_rate": 4.121621621621622e-06, + "loss": 0.2388, + "step": 183 + }, + { + "epoch": 0.04145661418875152, + "grad_norm": 0.5630326104128994, + "learning_rate": 4.1441441441441446e-06, + "loss": 0.2452, + "step": 184 + }, + { + "epoch": 0.04168192187455995, + "grad_norm": 0.5723286428947142, + "learning_rate": 4.166666666666667e-06, + "loss": 0.3019, + "step": 185 + }, + { + "epoch": 0.04190722956036838, + "grad_norm": 0.5190626209667113, + "learning_rate": 4.189189189189189e-06, + "loss": 0.26, + "step": 186 + }, + { + "epoch": 0.04213253724617681, + "grad_norm": 0.5415127019626786, + "learning_rate": 4.2117117117117115e-06, + "loss": 0.2771, + "step": 187 + }, + { + "epoch": 0.042357844931985245, + "grad_norm": 0.5148943999308498, + "learning_rate": 4.234234234234235e-06, + "loss": 0.2421, + "step": 188 + }, + { + "epoch": 0.042583152617793676, + "grad_norm": 0.5219749562808006, + "learning_rate": 4.256756756756757e-06, + "loss": 0.2363, + "step": 189 + }, + { + "epoch": 0.04280846030360211, + "grad_norm": 0.5182891351137497, + "learning_rate": 4.27927927927928e-06, + "loss": 0.2665, + "step": 190 + }, + { + "epoch": 0.04303376798941054, + "grad_norm": 0.5335524605354496, + "learning_rate": 4.301801801801802e-06, + "loss": 0.2709, + "step": 191 + }, + { + "epoch": 0.04325907567521897, + "grad_norm": 0.5358656165662956, + "learning_rate": 4.324324324324325e-06, + "loss": 0.2752, + "step": 192 + }, + { + "epoch": 0.043484383361027404, + "grad_norm": 0.5423026702181873, + "learning_rate": 4.346846846846847e-06, + "loss": 0.2633, + "step": 193 + }, + { + "epoch": 0.043709691046835836, + "grad_norm": 0.5356664659801363, + "learning_rate": 4.369369369369369e-06, + "loss": 0.2743, + "step": 194 + }, + { + "epoch": 0.04393499873264427, + "grad_norm": 0.5271023564093342, + "learning_rate": 4.391891891891892e-06, + "loss": 0.2569, + "step": 195 + }, + { + "epoch": 0.0441603064184527, + "grad_norm": 0.5354325541802134, + "learning_rate": 4.414414414414415e-06, + "loss": 0.2679, + "step": 196 + }, + { + "epoch": 0.04438561410426113, + "grad_norm": 0.5812639664433978, + "learning_rate": 4.436936936936938e-06, + "loss": 0.2699, + "step": 197 + }, + { + "epoch": 0.04461092179006956, + "grad_norm": 0.5023108000816281, + "learning_rate": 4.45945945945946e-06, + "loss": 0.2513, + "step": 198 + }, + { + "epoch": 0.044836229475877995, + "grad_norm": 0.5622972302988116, + "learning_rate": 4.4819819819819824e-06, + "loss": 0.276, + "step": 199 + }, + { + "epoch": 0.04506153716168643, + "grad_norm": 0.5331924613395196, + "learning_rate": 4.504504504504505e-06, + "loss": 0.2628, + "step": 200 + }, + { + "epoch": 0.04528684484749486, + "grad_norm": 0.5379790043889859, + "learning_rate": 4.527027027027027e-06, + "loss": 0.26, + "step": 201 + }, + { + "epoch": 0.04551215253330329, + "grad_norm": 0.5287338263732682, + "learning_rate": 4.54954954954955e-06, + "loss": 0.255, + "step": 202 + }, + { + "epoch": 0.04573746021911172, + "grad_norm": 0.5650946176681797, + "learning_rate": 4.5720720720720725e-06, + "loss": 0.273, + "step": 203 + }, + { + "epoch": 0.045962767904920154, + "grad_norm": 0.5399848479417585, + "learning_rate": 4.594594594594596e-06, + "loss": 0.2591, + "step": 204 + }, + { + "epoch": 0.046188075590728586, + "grad_norm": 0.5503252284611668, + "learning_rate": 4.617117117117118e-06, + "loss": 0.273, + "step": 205 + }, + { + "epoch": 0.04641338327653702, + "grad_norm": 0.5356708409247416, + "learning_rate": 4.63963963963964e-06, + "loss": 0.2907, + "step": 206 + }, + { + "epoch": 0.04663869096234545, + "grad_norm": 0.5680782788614975, + "learning_rate": 4.6621621621621625e-06, + "loss": 0.2779, + "step": 207 + }, + { + "epoch": 0.04686399864815388, + "grad_norm": 0.538559452588655, + "learning_rate": 4.684684684684685e-06, + "loss": 0.2707, + "step": 208 + }, + { + "epoch": 0.04708930633396232, + "grad_norm": 0.5595464071717937, + "learning_rate": 4.707207207207208e-06, + "loss": 0.2657, + "step": 209 + }, + { + "epoch": 0.04731461401977075, + "grad_norm": 0.5490893054315078, + "learning_rate": 4.72972972972973e-06, + "loss": 0.2738, + "step": 210 + }, + { + "epoch": 0.047539921705579184, + "grad_norm": 0.503924740818271, + "learning_rate": 4.7522522522522526e-06, + "loss": 0.242, + "step": 211 + }, + { + "epoch": 0.047765229391387616, + "grad_norm": 0.6050424912600394, + "learning_rate": 4.774774774774775e-06, + "loss": 0.2751, + "step": 212 + }, + { + "epoch": 0.04799053707719605, + "grad_norm": 0.5521529568344522, + "learning_rate": 4.797297297297297e-06, + "loss": 0.2399, + "step": 213 + }, + { + "epoch": 0.04821584476300448, + "grad_norm": 0.5362581252132722, + "learning_rate": 4.81981981981982e-06, + "loss": 0.2496, + "step": 214 + }, + { + "epoch": 0.04844115244881291, + "grad_norm": 0.5359936711692728, + "learning_rate": 4.842342342342343e-06, + "loss": 0.2551, + "step": 215 + }, + { + "epoch": 0.048666460134621344, + "grad_norm": 0.5234364169307213, + "learning_rate": 4.864864864864866e-06, + "loss": 0.2564, + "step": 216 + }, + { + "epoch": 0.048891767820429775, + "grad_norm": 0.5347237036233039, + "learning_rate": 4.887387387387388e-06, + "loss": 0.251, + "step": 217 + }, + { + "epoch": 0.04911707550623821, + "grad_norm": 0.5858618760678491, + "learning_rate": 4.90990990990991e-06, + "loss": 0.2695, + "step": 218 + }, + { + "epoch": 0.04934238319204664, + "grad_norm": 0.5555974042872671, + "learning_rate": 4.932432432432433e-06, + "loss": 0.2621, + "step": 219 + }, + { + "epoch": 0.04956769087785507, + "grad_norm": 0.5709454699777917, + "learning_rate": 4.954954954954955e-06, + "loss": 0.2896, + "step": 220 + }, + { + "epoch": 0.0497929985636635, + "grad_norm": 0.5554781124539431, + "learning_rate": 4.977477477477478e-06, + "loss": 0.259, + "step": 221 + }, + { + "epoch": 0.050018306249471935, + "grad_norm": 0.547525110724423, + "learning_rate": 5e-06, + "loss": 0.2588, + "step": 222 + }, + { + "epoch": 0.050243613935280367, + "grad_norm": 0.5203084790922287, + "learning_rate": 4.999999305921593e-06, + "loss": 0.259, + "step": 223 + }, + { + "epoch": 0.0504689216210888, + "grad_norm": 0.5216446252432033, + "learning_rate": 4.999997223686756e-06, + "loss": 0.2399, + "step": 224 + }, + { + "epoch": 0.05069422930689723, + "grad_norm": 0.5699954948887261, + "learning_rate": 4.9999937532966454e-06, + "loss": 0.278, + "step": 225 + }, + { + "epoch": 0.05091953699270566, + "grad_norm": 0.5005257255031206, + "learning_rate": 4.999988894753189e-06, + "loss": 0.2336, + "step": 226 + }, + { + "epoch": 0.051144844678514094, + "grad_norm": 0.5802518334154529, + "learning_rate": 4.999982648059082e-06, + "loss": 0.2937, + "step": 227 + }, + { + "epoch": 0.051370152364322526, + "grad_norm": 0.5933766139368982, + "learning_rate": 4.999975013217796e-06, + "loss": 0.275, + "step": 228 + }, + { + "epoch": 0.05159546005013096, + "grad_norm": 0.5493350162784889, + "learning_rate": 4.99996599023357e-06, + "loss": 0.2719, + "step": 229 + }, + { + "epoch": 0.05182076773593939, + "grad_norm": 0.5539631206233041, + "learning_rate": 4.999955579111413e-06, + "loss": 0.2674, + "step": 230 + }, + { + "epoch": 0.05204607542174782, + "grad_norm": 0.5012253111610666, + "learning_rate": 4.999943779857106e-06, + "loss": 0.2294, + "step": 231 + }, + { + "epoch": 0.05227138310755625, + "grad_norm": 0.5635675497060345, + "learning_rate": 4.9999305924772e-06, + "loss": 0.2691, + "step": 232 + }, + { + "epoch": 0.05249669079336469, + "grad_norm": 0.5445020626349343, + "learning_rate": 4.999916016979019e-06, + "loss": 0.2543, + "step": 233 + }, + { + "epoch": 0.052721998479173124, + "grad_norm": 0.576507641787432, + "learning_rate": 4.999900053370657e-06, + "loss": 0.2776, + "step": 234 + }, + { + "epoch": 0.052947306164981556, + "grad_norm": 0.5358673412608711, + "learning_rate": 4.999882701660975e-06, + "loss": 0.2576, + "step": 235 + }, + { + "epoch": 0.05317261385078999, + "grad_norm": 0.49720157054682046, + "learning_rate": 4.99986396185961e-06, + "loss": 0.241, + "step": 236 + }, + { + "epoch": 0.05339792153659842, + "grad_norm": 0.5623899434090289, + "learning_rate": 4.999843833976967e-06, + "loss": 0.2651, + "step": 237 + }, + { + "epoch": 0.05362322922240685, + "grad_norm": 0.5119448981504607, + "learning_rate": 4.999822318024222e-06, + "loss": 0.2308, + "step": 238 + }, + { + "epoch": 0.05384853690821528, + "grad_norm": 0.5721640546803508, + "learning_rate": 4.999799414013322e-06, + "loss": 0.2775, + "step": 239 + }, + { + "epoch": 0.054073844594023715, + "grad_norm": 0.509335528757952, + "learning_rate": 4.9997751219569844e-06, + "loss": 0.2308, + "step": 240 + }, + { + "epoch": 0.05429915227983215, + "grad_norm": 0.5253291750664415, + "learning_rate": 4.999749441868699e-06, + "loss": 0.2311, + "step": 241 + }, + { + "epoch": 0.05452445996564058, + "grad_norm": 0.5018040304043867, + "learning_rate": 4.999722373762725e-06, + "loss": 0.2316, + "step": 242 + }, + { + "epoch": 0.05474976765144901, + "grad_norm": 0.5493140345979329, + "learning_rate": 4.9996939176540895e-06, + "loss": 0.2529, + "step": 243 + }, + { + "epoch": 0.05497507533725744, + "grad_norm": 0.5353508145989391, + "learning_rate": 4.999664073558596e-06, + "loss": 0.264, + "step": 244 + }, + { + "epoch": 0.055200383023065874, + "grad_norm": 0.572868958302028, + "learning_rate": 4.999632841492815e-06, + "loss": 0.2611, + "step": 245 + }, + { + "epoch": 0.055425690708874306, + "grad_norm": 0.5138295992044387, + "learning_rate": 4.999600221474089e-06, + "loss": 0.2351, + "step": 246 + }, + { + "epoch": 0.05565099839468274, + "grad_norm": 0.49837206603818257, + "learning_rate": 4.999566213520529e-06, + "loss": 0.2353, + "step": 247 + }, + { + "epoch": 0.05587630608049117, + "grad_norm": 0.5187607812721068, + "learning_rate": 4.99953081765102e-06, + "loss": 0.2476, + "step": 248 + }, + { + "epoch": 0.0561016137662996, + "grad_norm": 0.4847894055885108, + "learning_rate": 4.999494033885215e-06, + "loss": 0.2276, + "step": 249 + }, + { + "epoch": 0.056326921452108034, + "grad_norm": 0.5574706172522889, + "learning_rate": 4.999455862243539e-06, + "loss": 0.2731, + "step": 250 + }, + { + "epoch": 0.056552229137916465, + "grad_norm": 0.5424717676120533, + "learning_rate": 4.999416302747189e-06, + "loss": 0.252, + "step": 251 + }, + { + "epoch": 0.0567775368237249, + "grad_norm": 0.5373693422068457, + "learning_rate": 4.999375355418128e-06, + "loss": 0.2676, + "step": 252 + }, + { + "epoch": 0.05700284450953333, + "grad_norm": 0.5828979442727652, + "learning_rate": 4.999333020279094e-06, + "loss": 0.2342, + "step": 253 + }, + { + "epoch": 0.05722815219534176, + "grad_norm": 0.5344116062958477, + "learning_rate": 4.999289297353593e-06, + "loss": 0.2466, + "step": 254 + }, + { + "epoch": 0.05745345988115019, + "grad_norm": 0.511266662238741, + "learning_rate": 4.9992441866659054e-06, + "loss": 0.2496, + "step": 255 + }, + { + "epoch": 0.057678767566958625, + "grad_norm": 0.5212650478432154, + "learning_rate": 4.999197688241076e-06, + "loss": 0.2195, + "step": 256 + }, + { + "epoch": 0.05790407525276706, + "grad_norm": 0.5390312671700465, + "learning_rate": 4.999149802104926e-06, + "loss": 0.2261, + "step": 257 + }, + { + "epoch": 0.058129382938575495, + "grad_norm": 0.513006294727029, + "learning_rate": 4.999100528284045e-06, + "loss": 0.2485, + "step": 258 + }, + { + "epoch": 0.05835469062438393, + "grad_norm": 0.533856909477219, + "learning_rate": 4.999049866805793e-06, + "loss": 0.2531, + "step": 259 + }, + { + "epoch": 0.05857999831019236, + "grad_norm": 0.5199018736646431, + "learning_rate": 4.998997817698298e-06, + "loss": 0.2255, + "step": 260 + }, + { + "epoch": 0.05880530599600079, + "grad_norm": 0.5698525959140979, + "learning_rate": 4.998944380990462e-06, + "loss": 0.2822, + "step": 261 + }, + { + "epoch": 0.05903061368180922, + "grad_norm": 0.520255828671511, + "learning_rate": 4.998889556711958e-06, + "loss": 0.2625, + "step": 262 + }, + { + "epoch": 0.059255921367617655, + "grad_norm": 0.5378431835682165, + "learning_rate": 4.998833344893226e-06, + "loss": 0.2369, + "step": 263 + }, + { + "epoch": 0.059481229053426087, + "grad_norm": 0.5612975965032834, + "learning_rate": 4.998775745565479e-06, + "loss": 0.2674, + "step": 264 + }, + { + "epoch": 0.05970653673923452, + "grad_norm": 0.532982113460971, + "learning_rate": 4.998716758760701e-06, + "loss": 0.2632, + "step": 265 + }, + { + "epoch": 0.05993184442504295, + "grad_norm": 0.5391602852834664, + "learning_rate": 4.998656384511643e-06, + "loss": 0.2482, + "step": 266 + }, + { + "epoch": 0.06015715211085138, + "grad_norm": 0.49561824121587345, + "learning_rate": 4.998594622851829e-06, + "loss": 0.1964, + "step": 267 + }, + { + "epoch": 0.060382459796659814, + "grad_norm": 0.5769530727799053, + "learning_rate": 4.9985314738155545e-06, + "loss": 0.2655, + "step": 268 + }, + { + "epoch": 0.060607767482468246, + "grad_norm": 0.5147339012649532, + "learning_rate": 4.9984669374378825e-06, + "loss": 0.2319, + "step": 269 + }, + { + "epoch": 0.06083307516827668, + "grad_norm": 0.601957903530436, + "learning_rate": 4.9984010137546475e-06, + "loss": 0.298, + "step": 270 + }, + { + "epoch": 0.06105838285408511, + "grad_norm": 0.5296976017163629, + "learning_rate": 4.998333702802457e-06, + "loss": 0.2477, + "step": 271 + }, + { + "epoch": 0.06128369053989354, + "grad_norm": 1.1006997984360207, + "learning_rate": 4.998265004618682e-06, + "loss": 0.2623, + "step": 272 + }, + { + "epoch": 0.06150899822570197, + "grad_norm": 0.5280928480958849, + "learning_rate": 4.998194919241471e-06, + "loss": 0.2465, + "step": 273 + }, + { + "epoch": 0.061734305911510405, + "grad_norm": 0.5742459841837003, + "learning_rate": 4.998123446709739e-06, + "loss": 0.2632, + "step": 274 + }, + { + "epoch": 0.06195961359731884, + "grad_norm": 0.5693001397168037, + "learning_rate": 4.998050587063173e-06, + "loss": 0.2327, + "step": 275 + }, + { + "epoch": 0.06218492128312727, + "grad_norm": 0.5355472925042629, + "learning_rate": 4.997976340342226e-06, + "loss": 0.2452, + "step": 276 + }, + { + "epoch": 0.0624102289689357, + "grad_norm": 0.6032151517142877, + "learning_rate": 4.997900706588129e-06, + "loss": 0.2592, + "step": 277 + }, + { + "epoch": 0.06263553665474414, + "grad_norm": 0.5812745923843011, + "learning_rate": 4.997823685842875e-06, + "loss": 0.2602, + "step": 278 + }, + { + "epoch": 0.06286084434055257, + "grad_norm": 0.4842358936566825, + "learning_rate": 4.997745278149233e-06, + "loss": 0.2118, + "step": 279 + }, + { + "epoch": 0.063086152026361, + "grad_norm": 0.515905911136647, + "learning_rate": 4.997665483550739e-06, + "loss": 0.2329, + "step": 280 + }, + { + "epoch": 0.06331145971216944, + "grad_norm": 0.557843250774739, + "learning_rate": 4.997584302091699e-06, + "loss": 0.2668, + "step": 281 + }, + { + "epoch": 0.06353676739797787, + "grad_norm": 0.5441825034870524, + "learning_rate": 4.997501733817191e-06, + "loss": 0.2506, + "step": 282 + }, + { + "epoch": 0.0637620750837863, + "grad_norm": 0.47943857767450315, + "learning_rate": 4.997417778773064e-06, + "loss": 0.2119, + "step": 283 + }, + { + "epoch": 0.06398738276959473, + "grad_norm": 0.5947835556877582, + "learning_rate": 4.997332437005932e-06, + "loss": 0.2875, + "step": 284 + }, + { + "epoch": 0.06421269045540316, + "grad_norm": 0.5088488486871213, + "learning_rate": 4.9972457085631825e-06, + "loss": 0.2389, + "step": 285 + }, + { + "epoch": 0.0644379981412116, + "grad_norm": 0.5355780039184576, + "learning_rate": 4.997157593492974e-06, + "loss": 0.22, + "step": 286 + }, + { + "epoch": 0.06466330582702003, + "grad_norm": 0.5213202699700756, + "learning_rate": 4.997068091844233e-06, + "loss": 0.2585, + "step": 287 + }, + { + "epoch": 0.06488861351282846, + "grad_norm": 0.5171261205707682, + "learning_rate": 4.996977203666657e-06, + "loss": 0.2404, + "step": 288 + }, + { + "epoch": 0.06511392119863689, + "grad_norm": 0.507993103133664, + "learning_rate": 4.99688492901071e-06, + "loss": 0.2413, + "step": 289 + }, + { + "epoch": 0.06533922888444532, + "grad_norm": 0.49908079193151955, + "learning_rate": 4.996791267927632e-06, + "loss": 0.2356, + "step": 290 + }, + { + "epoch": 0.06556453657025375, + "grad_norm": 0.5403422364149439, + "learning_rate": 4.996696220469429e-06, + "loss": 0.2404, + "step": 291 + }, + { + "epoch": 0.06578984425606219, + "grad_norm": 0.5541185239546801, + "learning_rate": 4.996599786688876e-06, + "loss": 0.2533, + "step": 292 + }, + { + "epoch": 0.06601515194187062, + "grad_norm": 0.5362361192584062, + "learning_rate": 4.996501966639519e-06, + "loss": 0.2595, + "step": 293 + }, + { + "epoch": 0.06624045962767905, + "grad_norm": 0.6044224612476164, + "learning_rate": 4.996402760375676e-06, + "loss": 0.2913, + "step": 294 + }, + { + "epoch": 0.06646576731348748, + "grad_norm": 0.5315279127985204, + "learning_rate": 4.99630216795243e-06, + "loss": 0.2543, + "step": 295 + }, + { + "epoch": 0.06669107499929591, + "grad_norm": 0.5145747206533833, + "learning_rate": 4.996200189425638e-06, + "loss": 0.2483, + "step": 296 + }, + { + "epoch": 0.06691638268510434, + "grad_norm": 0.5164626055382406, + "learning_rate": 4.996096824851923e-06, + "loss": 0.2558, + "step": 297 + }, + { + "epoch": 0.06714169037091278, + "grad_norm": 0.5139087841971721, + "learning_rate": 4.9959920742886815e-06, + "loss": 0.2386, + "step": 298 + }, + { + "epoch": 0.06736699805672121, + "grad_norm": 0.5494626246964903, + "learning_rate": 4.9958859377940765e-06, + "loss": 0.2299, + "step": 299 + }, + { + "epoch": 0.06759230574252964, + "grad_norm": 0.5354867324483468, + "learning_rate": 4.995778415427042e-06, + "loss": 0.2483, + "step": 300 + }, + { + "epoch": 0.06781761342833807, + "grad_norm": 0.5609491996559047, + "learning_rate": 4.99566950724728e-06, + "loss": 0.2628, + "step": 301 + }, + { + "epoch": 0.0680429211141465, + "grad_norm": 0.5698811107677528, + "learning_rate": 4.995559213315267e-06, + "loss": 0.2743, + "step": 302 + }, + { + "epoch": 0.06826822879995494, + "grad_norm": 0.5029308998025265, + "learning_rate": 4.995447533692239e-06, + "loss": 0.2379, + "step": 303 + }, + { + "epoch": 0.06849353648576337, + "grad_norm": 0.4812997556500436, + "learning_rate": 4.995334468440213e-06, + "loss": 0.2286, + "step": 304 + }, + { + "epoch": 0.0687188441715718, + "grad_norm": 0.49571192011888043, + "learning_rate": 4.995220017621967e-06, + "loss": 0.2227, + "step": 305 + }, + { + "epoch": 0.06894415185738023, + "grad_norm": 0.5168602914024734, + "learning_rate": 4.995104181301052e-06, + "loss": 0.2352, + "step": 306 + }, + { + "epoch": 0.06916945954318866, + "grad_norm": 0.5722317502191536, + "learning_rate": 4.994986959541788e-06, + "loss": 0.2507, + "step": 307 + }, + { + "epoch": 0.0693947672289971, + "grad_norm": 0.5485677223430081, + "learning_rate": 4.994868352409263e-06, + "loss": 0.2349, + "step": 308 + }, + { + "epoch": 0.06962007491480553, + "grad_norm": 0.6547548271666068, + "learning_rate": 4.994748359969336e-06, + "loss": 0.2452, + "step": 309 + }, + { + "epoch": 0.06984538260061396, + "grad_norm": 0.5178515499604229, + "learning_rate": 4.9946269822886335e-06, + "loss": 0.2353, + "step": 310 + }, + { + "epoch": 0.07007069028642239, + "grad_norm": 0.5505729007995378, + "learning_rate": 4.994504219434553e-06, + "loss": 0.2592, + "step": 311 + }, + { + "epoch": 0.07029599797223082, + "grad_norm": 0.5445308429804935, + "learning_rate": 4.9943800714752586e-06, + "loss": 0.2679, + "step": 312 + }, + { + "epoch": 0.07052130565803925, + "grad_norm": 0.49115222469824144, + "learning_rate": 4.994254538479687e-06, + "loss": 0.2287, + "step": 313 + }, + { + "epoch": 0.07074661334384769, + "grad_norm": 0.5788297907213041, + "learning_rate": 4.9941276205175405e-06, + "loss": 0.2478, + "step": 314 + }, + { + "epoch": 0.07097192102965612, + "grad_norm": 0.5166349394709129, + "learning_rate": 4.993999317659293e-06, + "loss": 0.2407, + "step": 315 + }, + { + "epoch": 0.07119722871546455, + "grad_norm": 0.5161451673553353, + "learning_rate": 4.9938696299761856e-06, + "loss": 0.2193, + "step": 316 + }, + { + "epoch": 0.07142253640127298, + "grad_norm": 0.5968230065919653, + "learning_rate": 4.9937385575402284e-06, + "loss": 0.2816, + "step": 317 + }, + { + "epoch": 0.07164784408708141, + "grad_norm": 0.5135868081799784, + "learning_rate": 4.993606100424202e-06, + "loss": 0.2288, + "step": 318 + }, + { + "epoch": 0.07187315177288986, + "grad_norm": 0.5474164925008911, + "learning_rate": 4.9934722587016555e-06, + "loss": 0.2529, + "step": 319 + }, + { + "epoch": 0.07209845945869829, + "grad_norm": 0.5420239841509832, + "learning_rate": 4.9933370324469045e-06, + "loss": 0.2508, + "step": 320 + }, + { + "epoch": 0.07232376714450672, + "grad_norm": 0.5538664634048086, + "learning_rate": 4.993200421735037e-06, + "loss": 0.2478, + "step": 321 + }, + { + "epoch": 0.07254907483031516, + "grad_norm": 0.5090258836334843, + "learning_rate": 4.993062426641906e-06, + "loss": 0.2438, + "step": 322 + }, + { + "epoch": 0.07277438251612359, + "grad_norm": 0.5439963078693223, + "learning_rate": 4.992923047244136e-06, + "loss": 0.2595, + "step": 323 + }, + { + "epoch": 0.07299969020193202, + "grad_norm": 0.543642701605141, + "learning_rate": 4.9927822836191185e-06, + "loss": 0.2671, + "step": 324 + }, + { + "epoch": 0.07322499788774045, + "grad_norm": 0.5297213308215476, + "learning_rate": 4.992640135845016e-06, + "loss": 0.2456, + "step": 325 + }, + { + "epoch": 0.07345030557354888, + "grad_norm": 0.5067944408503602, + "learning_rate": 4.992496604000756e-06, + "loss": 0.2348, + "step": 326 + }, + { + "epoch": 0.07367561325935731, + "grad_norm": 0.5082087946989515, + "learning_rate": 4.992351688166038e-06, + "loss": 0.235, + "step": 327 + }, + { + "epoch": 0.07390092094516575, + "grad_norm": 0.5587053292023891, + "learning_rate": 4.992205388421326e-06, + "loss": 0.2532, + "step": 328 + }, + { + "epoch": 0.07412622863097418, + "grad_norm": 0.5086849805443516, + "learning_rate": 4.992057704847858e-06, + "loss": 0.2427, + "step": 329 + }, + { + "epoch": 0.07435153631678261, + "grad_norm": 0.5181149546680572, + "learning_rate": 4.991908637527634e-06, + "loss": 0.2441, + "step": 330 + }, + { + "epoch": 0.07457684400259104, + "grad_norm": 0.4986491473005848, + "learning_rate": 4.9917581865434275e-06, + "loss": 0.2255, + "step": 331 + }, + { + "epoch": 0.07480215168839947, + "grad_norm": 0.5374009057676805, + "learning_rate": 4.9916063519787775e-06, + "loss": 0.2437, + "step": 332 + }, + { + "epoch": 0.0750274593742079, + "grad_norm": 0.5378994240322091, + "learning_rate": 4.991453133917993e-06, + "loss": 0.2381, + "step": 333 + }, + { + "epoch": 0.07525276706001634, + "grad_norm": 0.5113411744810432, + "learning_rate": 4.991298532446149e-06, + "loss": 0.2451, + "step": 334 + }, + { + "epoch": 0.07547807474582477, + "grad_norm": 0.4954947096424282, + "learning_rate": 4.991142547649091e-06, + "loss": 0.2205, + "step": 335 + }, + { + "epoch": 0.0757033824316332, + "grad_norm": 0.5306624625163016, + "learning_rate": 4.990985179613431e-06, + "loss": 0.2321, + "step": 336 + }, + { + "epoch": 0.07592869011744163, + "grad_norm": 0.5892241074938874, + "learning_rate": 4.990826428426549e-06, + "loss": 0.2512, + "step": 337 + }, + { + "epoch": 0.07615399780325006, + "grad_norm": 0.5487910904949626, + "learning_rate": 4.990666294176596e-06, + "loss": 0.2362, + "step": 338 + }, + { + "epoch": 0.0763793054890585, + "grad_norm": 0.5330163013540344, + "learning_rate": 4.9905047769524855e-06, + "loss": 0.2608, + "step": 339 + }, + { + "epoch": 0.07660461317486693, + "grad_norm": 0.5241644038095767, + "learning_rate": 4.990341876843904e-06, + "loss": 0.2303, + "step": 340 + }, + { + "epoch": 0.07682992086067536, + "grad_norm": 0.5448518478582587, + "learning_rate": 4.990177593941303e-06, + "loss": 0.2339, + "step": 341 + }, + { + "epoch": 0.07705522854648379, + "grad_norm": 0.5081990034886635, + "learning_rate": 4.9900119283359025e-06, + "loss": 0.2288, + "step": 342 + }, + { + "epoch": 0.07728053623229222, + "grad_norm": 0.49935623169210674, + "learning_rate": 4.989844880119692e-06, + "loss": 0.2083, + "step": 343 + }, + { + "epoch": 0.07750584391810066, + "grad_norm": 0.5630445371139264, + "learning_rate": 4.989676449385426e-06, + "loss": 0.2327, + "step": 344 + }, + { + "epoch": 0.07773115160390909, + "grad_norm": 0.5227251379989739, + "learning_rate": 4.989506636226626e-06, + "loss": 0.2323, + "step": 345 + }, + { + "epoch": 0.07795645928971752, + "grad_norm": 0.4961193769148893, + "learning_rate": 4.989335440737587e-06, + "loss": 0.2174, + "step": 346 + }, + { + "epoch": 0.07818176697552595, + "grad_norm": 0.5309519810578879, + "learning_rate": 4.989162863013364e-06, + "loss": 0.2139, + "step": 347 + }, + { + "epoch": 0.07840707466133438, + "grad_norm": 0.9635950757819977, + "learning_rate": 4.988988903149784e-06, + "loss": 0.3062, + "step": 348 + }, + { + "epoch": 0.07863238234714282, + "grad_norm": 0.5297919912126666, + "learning_rate": 4.9888135612434415e-06, + "loss": 0.2354, + "step": 349 + }, + { + "epoch": 0.07885769003295125, + "grad_norm": 0.4983934545325781, + "learning_rate": 4.988636837391696e-06, + "loss": 0.2222, + "step": 350 + }, + { + "epoch": 0.07908299771875968, + "grad_norm": 0.5670806029235518, + "learning_rate": 4.9884587316926765e-06, + "loss": 0.2481, + "step": 351 + }, + { + "epoch": 0.07930830540456811, + "grad_norm": 0.5799817613473424, + "learning_rate": 4.988279244245278e-06, + "loss": 0.2375, + "step": 352 + }, + { + "epoch": 0.07953361309037654, + "grad_norm": 0.5474360262997107, + "learning_rate": 4.988098375149163e-06, + "loss": 0.2342, + "step": 353 + }, + { + "epoch": 0.07975892077618497, + "grad_norm": 0.506711748607797, + "learning_rate": 4.987916124504761e-06, + "loss": 0.2296, + "step": 354 + }, + { + "epoch": 0.0799842284619934, + "grad_norm": 0.5793251179209808, + "learning_rate": 4.987732492413271e-06, + "loss": 0.2188, + "step": 355 + }, + { + "epoch": 0.08020953614780184, + "grad_norm": 0.5050257821348573, + "learning_rate": 4.987547478976655e-06, + "loss": 0.2282, + "step": 356 + }, + { + "epoch": 0.08043484383361027, + "grad_norm": 0.5002712035100754, + "learning_rate": 4.987361084297645e-06, + "loss": 0.2253, + "step": 357 + }, + { + "epoch": 0.0806601515194187, + "grad_norm": 0.5132547604305784, + "learning_rate": 4.987173308479738e-06, + "loss": 0.2253, + "step": 358 + }, + { + "epoch": 0.08088545920522713, + "grad_norm": 0.544107852830466, + "learning_rate": 4.9869841516272004e-06, + "loss": 0.2279, + "step": 359 + }, + { + "epoch": 0.08111076689103557, + "grad_norm": 0.498883580538128, + "learning_rate": 4.9867936138450635e-06, + "loss": 0.2233, + "step": 360 + }, + { + "epoch": 0.081336074576844, + "grad_norm": 0.5534946415726206, + "learning_rate": 4.986601695239125e-06, + "loss": 0.2483, + "step": 361 + }, + { + "epoch": 0.08156138226265243, + "grad_norm": 0.5214569493391799, + "learning_rate": 4.98640839591595e-06, + "loss": 0.2331, + "step": 362 + }, + { + "epoch": 0.08178668994846086, + "grad_norm": 0.5523513196979261, + "learning_rate": 4.986213715982873e-06, + "loss": 0.2362, + "step": 363 + }, + { + "epoch": 0.08201199763426929, + "grad_norm": 0.6206515558099188, + "learning_rate": 4.986017655547989e-06, + "loss": 0.2403, + "step": 364 + }, + { + "epoch": 0.08223730532007772, + "grad_norm": 0.5452295216679693, + "learning_rate": 4.985820214720165e-06, + "loss": 0.229, + "step": 365 + }, + { + "epoch": 0.08246261300588616, + "grad_norm": 0.5402735092300626, + "learning_rate": 4.985621393609032e-06, + "loss": 0.2547, + "step": 366 + }, + { + "epoch": 0.08268792069169459, + "grad_norm": 0.476065793944971, + "learning_rate": 4.98542119232499e-06, + "loss": 0.2103, + "step": 367 + }, + { + "epoch": 0.08291322837750303, + "grad_norm": 0.49875233446817213, + "learning_rate": 4.9852196109792e-06, + "loss": 0.2113, + "step": 368 + }, + { + "epoch": 0.08313853606331147, + "grad_norm": 0.566881708300442, + "learning_rate": 4.985016649683594e-06, + "loss": 0.244, + "step": 369 + }, + { + "epoch": 0.0833638437491199, + "grad_norm": 0.5054958687729084, + "learning_rate": 4.984812308550869e-06, + "loss": 0.2492, + "step": 370 + }, + { + "epoch": 0.08358915143492833, + "grad_norm": 0.4812044370610267, + "learning_rate": 4.984606587694488e-06, + "loss": 0.221, + "step": 371 + }, + { + "epoch": 0.08381445912073676, + "grad_norm": 0.6135791875609653, + "learning_rate": 4.98439948722868e-06, + "loss": 0.2429, + "step": 372 + }, + { + "epoch": 0.0840397668065452, + "grad_norm": 0.4948654390611102, + "learning_rate": 4.9841910072684406e-06, + "loss": 0.2469, + "step": 373 + }, + { + "epoch": 0.08426507449235363, + "grad_norm": 0.5390131799760087, + "learning_rate": 4.98398114792953e-06, + "loss": 0.2335, + "step": 374 + }, + { + "epoch": 0.08449038217816206, + "grad_norm": 0.583261071954013, + "learning_rate": 4.9837699093284765e-06, + "loss": 0.2322, + "step": 375 + }, + { + "epoch": 0.08471568986397049, + "grad_norm": 0.5320379581667112, + "learning_rate": 4.983557291582572e-06, + "loss": 0.2395, + "step": 376 + }, + { + "epoch": 0.08494099754977892, + "grad_norm": 0.5073932754465009, + "learning_rate": 4.983343294809875e-06, + "loss": 0.2189, + "step": 377 + }, + { + "epoch": 0.08516630523558735, + "grad_norm": 0.5148060147019823, + "learning_rate": 4.9831279191292114e-06, + "loss": 0.2356, + "step": 378 + }, + { + "epoch": 0.08539161292139578, + "grad_norm": 0.5111114514950656, + "learning_rate": 4.98291116466017e-06, + "loss": 0.2355, + "step": 379 + }, + { + "epoch": 0.08561692060720422, + "grad_norm": 0.48099837068101386, + "learning_rate": 4.982693031523107e-06, + "loss": 0.1958, + "step": 380 + }, + { + "epoch": 0.08584222829301265, + "grad_norm": 0.5316267743233823, + "learning_rate": 4.982473519839144e-06, + "loss": 0.2209, + "step": 381 + }, + { + "epoch": 0.08606753597882108, + "grad_norm": 0.5297600064502063, + "learning_rate": 4.982252629730167e-06, + "loss": 0.2291, + "step": 382 + }, + { + "epoch": 0.08629284366462951, + "grad_norm": 0.5527569259794954, + "learning_rate": 4.982030361318827e-06, + "loss": 0.2307, + "step": 383 + }, + { + "epoch": 0.08651815135043794, + "grad_norm": 0.49587010959827277, + "learning_rate": 4.981806714728543e-06, + "loss": 0.2145, + "step": 384 + }, + { + "epoch": 0.08674345903624638, + "grad_norm": 0.5037256164499812, + "learning_rate": 4.981581690083498e-06, + "loss": 0.2005, + "step": 385 + }, + { + "epoch": 0.08696876672205481, + "grad_norm": 0.4995542944743229, + "learning_rate": 4.981355287508638e-06, + "loss": 0.2023, + "step": 386 + }, + { + "epoch": 0.08719407440786324, + "grad_norm": 0.49398253598583813, + "learning_rate": 4.981127507129677e-06, + "loss": 0.2235, + "step": 387 + }, + { + "epoch": 0.08741938209367167, + "grad_norm": 0.5734112149538373, + "learning_rate": 4.980898349073094e-06, + "loss": 0.2342, + "step": 388 + }, + { + "epoch": 0.0876446897794801, + "grad_norm": 0.5214322796133589, + "learning_rate": 4.9806678134661295e-06, + "loss": 0.2231, + "step": 389 + }, + { + "epoch": 0.08786999746528854, + "grad_norm": 0.5484550915426594, + "learning_rate": 4.980435900436793e-06, + "loss": 0.2207, + "step": 390 + }, + { + "epoch": 0.08809530515109697, + "grad_norm": 0.5014628185290569, + "learning_rate": 4.980202610113857e-06, + "loss": 0.2204, + "step": 391 + }, + { + "epoch": 0.0883206128369054, + "grad_norm": 0.5251987157864981, + "learning_rate": 4.9799679426268575e-06, + "loss": 0.2525, + "step": 392 + }, + { + "epoch": 0.08854592052271383, + "grad_norm": 0.5143863754732736, + "learning_rate": 4.9797318981061e-06, + "loss": 0.2388, + "step": 393 + }, + { + "epoch": 0.08877122820852226, + "grad_norm": 0.5260299246085233, + "learning_rate": 4.979494476682647e-06, + "loss": 0.2426, + "step": 394 + }, + { + "epoch": 0.0889965358943307, + "grad_norm": 0.48550633855724745, + "learning_rate": 4.979255678488332e-06, + "loss": 0.2155, + "step": 395 + }, + { + "epoch": 0.08922184358013913, + "grad_norm": 0.5194610591194821, + "learning_rate": 4.979015503655751e-06, + "loss": 0.2351, + "step": 396 + }, + { + "epoch": 0.08944715126594756, + "grad_norm": 0.5070984401123457, + "learning_rate": 4.978773952318263e-06, + "loss": 0.2322, + "step": 397 + }, + { + "epoch": 0.08967245895175599, + "grad_norm": 0.5092030320619917, + "learning_rate": 4.978531024609994e-06, + "loss": 0.241, + "step": 398 + }, + { + "epoch": 0.08989776663756442, + "grad_norm": 0.5047399699717205, + "learning_rate": 4.978286720665832e-06, + "loss": 0.2375, + "step": 399 + }, + { + "epoch": 0.09012307432337285, + "grad_norm": 0.49513117961229813, + "learning_rate": 4.978041040621428e-06, + "loss": 0.2155, + "step": 400 + }, + { + "epoch": 0.09034838200918129, + "grad_norm": 0.5324274969850328, + "learning_rate": 4.977793984613202e-06, + "loss": 0.2293, + "step": 401 + }, + { + "epoch": 0.09057368969498972, + "grad_norm": 0.5358967052765463, + "learning_rate": 4.977545552778333e-06, + "loss": 0.227, + "step": 402 + }, + { + "epoch": 0.09079899738079815, + "grad_norm": 0.4927720394246359, + "learning_rate": 4.977295745254766e-06, + "loss": 0.2194, + "step": 403 + }, + { + "epoch": 0.09102430506660658, + "grad_norm": 0.4842737360300386, + "learning_rate": 4.977044562181212e-06, + "loss": 0.2176, + "step": 404 + }, + { + "epoch": 0.09124961275241501, + "grad_norm": 0.49995778400684115, + "learning_rate": 4.9767920036971406e-06, + "loss": 0.2228, + "step": 405 + }, + { + "epoch": 0.09147492043822344, + "grad_norm": 0.4926753761421992, + "learning_rate": 4.9765380699427905e-06, + "loss": 0.2089, + "step": 406 + }, + { + "epoch": 0.09170022812403188, + "grad_norm": 0.6021881537351668, + "learning_rate": 4.97628276105916e-06, + "loss": 0.2505, + "step": 407 + }, + { + "epoch": 0.09192553580984031, + "grad_norm": 0.5639995287297204, + "learning_rate": 4.976026077188013e-06, + "loss": 0.2342, + "step": 408 + }, + { + "epoch": 0.09215084349564874, + "grad_norm": 0.5367590005490487, + "learning_rate": 4.975768018471877e-06, + "loss": 0.2159, + "step": 409 + }, + { + "epoch": 0.09237615118145717, + "grad_norm": 0.4710815460528756, + "learning_rate": 4.9755085850540426e-06, + "loss": 0.2245, + "step": 410 + }, + { + "epoch": 0.0926014588672656, + "grad_norm": 0.5217372035845981, + "learning_rate": 4.9752477770785625e-06, + "loss": 0.2343, + "step": 411 + }, + { + "epoch": 0.09282676655307404, + "grad_norm": 0.5031243975807814, + "learning_rate": 4.974985594690255e-06, + "loss": 0.2321, + "step": 412 + }, + { + "epoch": 0.09305207423888247, + "grad_norm": 0.4514788516637943, + "learning_rate": 4.9747220380346975e-06, + "loss": 0.2007, + "step": 413 + }, + { + "epoch": 0.0932773819246909, + "grad_norm": 0.5295799170285104, + "learning_rate": 4.9744571072582365e-06, + "loss": 0.245, + "step": 414 + }, + { + "epoch": 0.09350268961049933, + "grad_norm": 0.4931503903694202, + "learning_rate": 4.974190802507977e-06, + "loss": 0.2183, + "step": 415 + }, + { + "epoch": 0.09372799729630776, + "grad_norm": 0.515837150681483, + "learning_rate": 4.973923123931786e-06, + "loss": 0.2205, + "step": 416 + }, + { + "epoch": 0.09395330498211621, + "grad_norm": 0.5117249890633937, + "learning_rate": 4.973654071678299e-06, + "loss": 0.2171, + "step": 417 + }, + { + "epoch": 0.09417861266792464, + "grad_norm": 0.5827630877715476, + "learning_rate": 4.973383645896908e-06, + "loss": 0.23, + "step": 418 + }, + { + "epoch": 0.09440392035373307, + "grad_norm": 0.4820937642698469, + "learning_rate": 4.973111846737772e-06, + "loss": 0.2111, + "step": 419 + }, + { + "epoch": 0.0946292280395415, + "grad_norm": 0.5111326848098502, + "learning_rate": 4.97283867435181e-06, + "loss": 0.2196, + "step": 420 + }, + { + "epoch": 0.09485453572534994, + "grad_norm": 0.49075544949749467, + "learning_rate": 4.972564128890704e-06, + "loss": 0.2407, + "step": 421 + }, + { + "epoch": 0.09507984341115837, + "grad_norm": 0.4788769418855161, + "learning_rate": 4.972288210506902e-06, + "loss": 0.2211, + "step": 422 + }, + { + "epoch": 0.0953051510969668, + "grad_norm": 0.49706576632158217, + "learning_rate": 4.972010919353606e-06, + "loss": 0.2163, + "step": 423 + }, + { + "epoch": 0.09553045878277523, + "grad_norm": 0.5082765894211952, + "learning_rate": 4.971732255584789e-06, + "loss": 0.23, + "step": 424 + }, + { + "epoch": 0.09575576646858366, + "grad_norm": 0.525449720053065, + "learning_rate": 4.971452219355182e-06, + "loss": 0.2427, + "step": 425 + }, + { + "epoch": 0.0959810741543921, + "grad_norm": 0.5078779695635218, + "learning_rate": 4.971170810820279e-06, + "loss": 0.2315, + "step": 426 + }, + { + "epoch": 0.09620638184020053, + "grad_norm": 0.4893344798924764, + "learning_rate": 4.970888030136335e-06, + "loss": 0.2178, + "step": 427 + }, + { + "epoch": 0.09643168952600896, + "grad_norm": 0.5281274710278412, + "learning_rate": 4.970603877460367e-06, + "loss": 0.2459, + "step": 428 + }, + { + "epoch": 0.09665699721181739, + "grad_norm": 0.5358534122086207, + "learning_rate": 4.970318352950155e-06, + "loss": 0.2451, + "step": 429 + }, + { + "epoch": 0.09688230489762582, + "grad_norm": 0.5119365743171465, + "learning_rate": 4.970031456764242e-06, + "loss": 0.2398, + "step": 430 + }, + { + "epoch": 0.09710761258343426, + "grad_norm": 0.4831996723152414, + "learning_rate": 4.9697431890619265e-06, + "loss": 0.2162, + "step": 431 + }, + { + "epoch": 0.09733292026924269, + "grad_norm": 0.5284018105155921, + "learning_rate": 4.969453550003277e-06, + "loss": 0.226, + "step": 432 + }, + { + "epoch": 0.09755822795505112, + "grad_norm": 0.5201156313325921, + "learning_rate": 4.969162539749117e-06, + "loss": 0.2246, + "step": 433 + }, + { + "epoch": 0.09778353564085955, + "grad_norm": 0.49223922935814357, + "learning_rate": 4.9688701584610345e-06, + "loss": 0.2088, + "step": 434 + }, + { + "epoch": 0.09800884332666798, + "grad_norm": 0.46543427683770366, + "learning_rate": 4.968576406301377e-06, + "loss": 0.215, + "step": 435 + }, + { + "epoch": 0.09823415101247641, + "grad_norm": 0.4950767754154196, + "learning_rate": 4.968281283433256e-06, + "loss": 0.2213, + "step": 436 + }, + { + "epoch": 0.09845945869828485, + "grad_norm": 0.4904354526355348, + "learning_rate": 4.96798479002054e-06, + "loss": 0.2098, + "step": 437 + }, + { + "epoch": 0.09868476638409328, + "grad_norm": 0.5198393181232206, + "learning_rate": 4.967686926227862e-06, + "loss": 0.2313, + "step": 438 + }, + { + "epoch": 0.09891007406990171, + "grad_norm": 0.5210881992179851, + "learning_rate": 4.967387692220615e-06, + "loss": 0.2364, + "step": 439 + }, + { + "epoch": 0.09913538175571014, + "grad_norm": 0.5279937904475803, + "learning_rate": 4.967087088164951e-06, + "loss": 0.2172, + "step": 440 + }, + { + "epoch": 0.09936068944151857, + "grad_norm": 0.46927680481270295, + "learning_rate": 4.966785114227785e-06, + "loss": 0.2164, + "step": 441 + }, + { + "epoch": 0.099585997127327, + "grad_norm": 0.5436814169365575, + "learning_rate": 4.966481770576793e-06, + "loss": 0.231, + "step": 442 + }, + { + "epoch": 0.09981130481313544, + "grad_norm": 0.49606398855812583, + "learning_rate": 4.966177057380409e-06, + "loss": 0.2126, + "step": 443 + }, + { + "epoch": 0.10003661249894387, + "grad_norm": 0.5181928569264367, + "learning_rate": 4.965870974807829e-06, + "loss": 0.2196, + "step": 444 + }, + { + "epoch": 0.1002619201847523, + "grad_norm": 0.5394600651671896, + "learning_rate": 4.96556352302901e-06, + "loss": 0.2385, + "step": 445 + }, + { + "epoch": 0.10048722787056073, + "grad_norm": 0.5655328432369621, + "learning_rate": 4.965254702214668e-06, + "loss": 0.2198, + "step": 446 + }, + { + "epoch": 0.10071253555636916, + "grad_norm": 0.5009056388184171, + "learning_rate": 4.96494451253628e-06, + "loss": 0.2143, + "step": 447 + }, + { + "epoch": 0.1009378432421776, + "grad_norm": 0.5247959148001261, + "learning_rate": 4.964632954166081e-06, + "loss": 0.2507, + "step": 448 + }, + { + "epoch": 0.10116315092798603, + "grad_norm": 0.4677901786401866, + "learning_rate": 4.964320027277071e-06, + "loss": 0.1901, + "step": 449 + }, + { + "epoch": 0.10138845861379446, + "grad_norm": 0.5460125641377022, + "learning_rate": 4.964005732043003e-06, + "loss": 0.2368, + "step": 450 + }, + { + "epoch": 0.10161376629960289, + "grad_norm": 0.5084799111689264, + "learning_rate": 4.963690068638397e-06, + "loss": 0.2283, + "step": 451 + }, + { + "epoch": 0.10183907398541132, + "grad_norm": 0.5099752952507947, + "learning_rate": 4.963373037238527e-06, + "loss": 0.2272, + "step": 452 + }, + { + "epoch": 0.10206438167121976, + "grad_norm": 0.5102485750814726, + "learning_rate": 4.963054638019429e-06, + "loss": 0.2254, + "step": 453 + }, + { + "epoch": 0.10228968935702819, + "grad_norm": 0.5082839891294482, + "learning_rate": 4.9627348711578996e-06, + "loss": 0.22, + "step": 454 + }, + { + "epoch": 0.10251499704283662, + "grad_norm": 0.4978067731345427, + "learning_rate": 4.962413736831491e-06, + "loss": 0.2301, + "step": 455 + }, + { + "epoch": 0.10274030472864505, + "grad_norm": 0.556751830407001, + "learning_rate": 4.962091235218518e-06, + "loss": 0.2371, + "step": 456 + }, + { + "epoch": 0.10296561241445348, + "grad_norm": 0.544335856879091, + "learning_rate": 4.961767366498055e-06, + "loss": 0.2291, + "step": 457 + }, + { + "epoch": 0.10319092010026192, + "grad_norm": 0.4825687927700831, + "learning_rate": 4.961442130849933e-06, + "loss": 0.2195, + "step": 458 + }, + { + "epoch": 0.10341622778607035, + "grad_norm": 0.534832085741591, + "learning_rate": 4.961115528454743e-06, + "loss": 0.2496, + "step": 459 + }, + { + "epoch": 0.10364153547187878, + "grad_norm": 0.48776000152179, + "learning_rate": 4.960787559493836e-06, + "loss": 0.2117, + "step": 460 + }, + { + "epoch": 0.10386684315768721, + "grad_norm": 0.5201482825488519, + "learning_rate": 4.96045822414932e-06, + "loss": 0.2283, + "step": 461 + }, + { + "epoch": 0.10409215084349564, + "grad_norm": 0.5235344352676657, + "learning_rate": 4.960127522604065e-06, + "loss": 0.2198, + "step": 462 + }, + { + "epoch": 0.10431745852930407, + "grad_norm": 0.5531366617700393, + "learning_rate": 4.959795455041694e-06, + "loss": 0.2198, + "step": 463 + }, + { + "epoch": 0.1045427662151125, + "grad_norm": 0.5536243341931267, + "learning_rate": 4.959462021646593e-06, + "loss": 0.24, + "step": 464 + }, + { + "epoch": 0.10476807390092094, + "grad_norm": 0.4754913677181068, + "learning_rate": 4.959127222603905e-06, + "loss": 0.1975, + "step": 465 + }, + { + "epoch": 0.10499338158672938, + "grad_norm": 0.522374645050011, + "learning_rate": 4.958791058099533e-06, + "loss": 0.2244, + "step": 466 + }, + { + "epoch": 0.10521868927253782, + "grad_norm": 0.4792084707936445, + "learning_rate": 4.958453528320135e-06, + "loss": 0.2033, + "step": 467 + }, + { + "epoch": 0.10544399695834625, + "grad_norm": 0.516610671424305, + "learning_rate": 4.95811463345313e-06, + "loss": 0.2255, + "step": 468 + }, + { + "epoch": 0.10566930464415468, + "grad_norm": 0.5363479563265644, + "learning_rate": 4.957774373686692e-06, + "loss": 0.2463, + "step": 469 + }, + { + "epoch": 0.10589461232996311, + "grad_norm": 0.5133297822300238, + "learning_rate": 4.957432749209755e-06, + "loss": 0.2339, + "step": 470 + }, + { + "epoch": 0.10611992001577154, + "grad_norm": 0.5320272386882607, + "learning_rate": 4.95708976021201e-06, + "loss": 0.2253, + "step": 471 + }, + { + "epoch": 0.10634522770157998, + "grad_norm": 0.5199773917149282, + "learning_rate": 4.956745406883909e-06, + "loss": 0.2271, + "step": 472 + }, + { + "epoch": 0.10657053538738841, + "grad_norm": 0.5120357202307477, + "learning_rate": 4.956399689416654e-06, + "loss": 0.2124, + "step": 473 + }, + { + "epoch": 0.10679584307319684, + "grad_norm": 0.5294804590828698, + "learning_rate": 4.956052608002212e-06, + "loss": 0.2381, + "step": 474 + }, + { + "epoch": 0.10702115075900527, + "grad_norm": 0.5258961780276532, + "learning_rate": 4.9557041628333046e-06, + "loss": 0.2304, + "step": 475 + }, + { + "epoch": 0.1072464584448137, + "grad_norm": 0.48542519662207223, + "learning_rate": 4.9553543541034086e-06, + "loss": 0.1965, + "step": 476 + }, + { + "epoch": 0.10747176613062213, + "grad_norm": 0.4894293903644207, + "learning_rate": 4.955003182006761e-06, + "loss": 0.2142, + "step": 477 + }, + { + "epoch": 0.10769707381643057, + "grad_norm": 0.5609171481609372, + "learning_rate": 4.954650646738354e-06, + "loss": 0.2277, + "step": 478 + }, + { + "epoch": 0.107922381502239, + "grad_norm": 0.507216106075902, + "learning_rate": 4.954296748493938e-06, + "loss": 0.2156, + "step": 479 + }, + { + "epoch": 0.10814768918804743, + "grad_norm": 0.49817418101277045, + "learning_rate": 4.953941487470017e-06, + "loss": 0.2156, + "step": 480 + }, + { + "epoch": 0.10837299687385586, + "grad_norm": 0.5061697163401724, + "learning_rate": 4.9535848638638586e-06, + "loss": 0.2259, + "step": 481 + }, + { + "epoch": 0.1085983045596643, + "grad_norm": 0.5381816271686993, + "learning_rate": 4.953226877873479e-06, + "loss": 0.2555, + "step": 482 + }, + { + "epoch": 0.10882361224547273, + "grad_norm": 0.5179781718200609, + "learning_rate": 4.952867529697656e-06, + "loss": 0.2211, + "step": 483 + }, + { + "epoch": 0.10904891993128116, + "grad_norm": 0.5275157446369125, + "learning_rate": 4.952506819535922e-06, + "loss": 0.2311, + "step": 484 + }, + { + "epoch": 0.10927422761708959, + "grad_norm": 0.5174387763563961, + "learning_rate": 4.952144747588566e-06, + "loss": 0.236, + "step": 485 + }, + { + "epoch": 0.10949953530289802, + "grad_norm": 0.5342583071967775, + "learning_rate": 4.951781314056633e-06, + "loss": 0.2216, + "step": 486 + }, + { + "epoch": 0.10972484298870645, + "grad_norm": 0.5088991054614, + "learning_rate": 4.951416519141923e-06, + "loss": 0.2282, + "step": 487 + }, + { + "epoch": 0.10995015067451488, + "grad_norm": 0.4957880984614225, + "learning_rate": 4.951050363046995e-06, + "loss": 0.2004, + "step": 488 + }, + { + "epoch": 0.11017545836032332, + "grad_norm": 0.5954851719628047, + "learning_rate": 4.95068284597516e-06, + "loss": 0.2391, + "step": 489 + }, + { + "epoch": 0.11040076604613175, + "grad_norm": 0.542789623177455, + "learning_rate": 4.950313968130488e-06, + "loss": 0.2446, + "step": 490 + }, + { + "epoch": 0.11062607373194018, + "grad_norm": 0.5143227911053264, + "learning_rate": 4.949943729717802e-06, + "loss": 0.2272, + "step": 491 + }, + { + "epoch": 0.11085138141774861, + "grad_norm": 0.4915783281621786, + "learning_rate": 4.949572130942683e-06, + "loss": 0.2187, + "step": 492 + }, + { + "epoch": 0.11107668910355704, + "grad_norm": 0.48971397846998715, + "learning_rate": 4.949199172011464e-06, + "loss": 0.2164, + "step": 493 + }, + { + "epoch": 0.11130199678936548, + "grad_norm": 0.5325989869618188, + "learning_rate": 4.948824853131237e-06, + "loss": 0.2332, + "step": 494 + }, + { + "epoch": 0.11152730447517391, + "grad_norm": 0.4870499900700446, + "learning_rate": 4.948449174509846e-06, + "loss": 0.2291, + "step": 495 + }, + { + "epoch": 0.11175261216098234, + "grad_norm": 0.4990937183219913, + "learning_rate": 4.948072136355892e-06, + "loss": 0.218, + "step": 496 + }, + { + "epoch": 0.11197791984679077, + "grad_norm": 0.5082233941899192, + "learning_rate": 4.94769373887873e-06, + "loss": 0.2248, + "step": 497 + }, + { + "epoch": 0.1122032275325992, + "grad_norm": 0.5235779538403509, + "learning_rate": 4.94731398228847e-06, + "loss": 0.2538, + "step": 498 + }, + { + "epoch": 0.11242853521840764, + "grad_norm": 0.533179350591644, + "learning_rate": 4.946932866795977e-06, + "loss": 0.2232, + "step": 499 + }, + { + "epoch": 0.11265384290421607, + "grad_norm": 0.5725883892669145, + "learning_rate": 4.94655039261287e-06, + "loss": 0.2317, + "step": 500 + }, + { + "epoch": 0.11265384290421607, + "eval_loss": 0.2230980396270752, + "eval_runtime": 56.9405, + "eval_samples_per_second": 50.404, + "eval_steps_per_second": 6.305, + "step": 500 + }, + { + "epoch": 0.1128791505900245, + "grad_norm": 0.4698722230427548, + "learning_rate": 4.946166559951523e-06, + "loss": 0.2049, + "step": 501 + }, + { + "epoch": 0.11310445827583293, + "grad_norm": 0.5397038003467908, + "learning_rate": 4.9457813690250635e-06, + "loss": 0.2296, + "step": 502 + }, + { + "epoch": 0.11332976596164136, + "grad_norm": 0.5221480768168976, + "learning_rate": 4.945394820047373e-06, + "loss": 0.2207, + "step": 503 + }, + { + "epoch": 0.1135550736474498, + "grad_norm": 0.48520982166447596, + "learning_rate": 4.94500691323309e-06, + "loss": 0.2138, + "step": 504 + }, + { + "epoch": 0.11378038133325823, + "grad_norm": 0.5746966801004397, + "learning_rate": 4.944617648797602e-06, + "loss": 0.2518, + "step": 505 + }, + { + "epoch": 0.11400568901906666, + "grad_norm": 0.521211629150247, + "learning_rate": 4.9442270269570545e-06, + "loss": 0.2068, + "step": 506 + }, + { + "epoch": 0.11423099670487509, + "grad_norm": 0.47772853554650735, + "learning_rate": 4.943835047928346e-06, + "loss": 0.1953, + "step": 507 + }, + { + "epoch": 0.11445630439068352, + "grad_norm": 0.5544391514701812, + "learning_rate": 4.943441711929126e-06, + "loss": 0.2359, + "step": 508 + }, + { + "epoch": 0.11468161207649195, + "grad_norm": 0.5407544487298549, + "learning_rate": 4.9430470191778e-06, + "loss": 0.2269, + "step": 509 + }, + { + "epoch": 0.11490691976230039, + "grad_norm": 0.5244153211782464, + "learning_rate": 4.942650969893527e-06, + "loss": 0.2183, + "step": 510 + }, + { + "epoch": 0.11513222744810882, + "grad_norm": 0.5185883649204517, + "learning_rate": 4.942253564296217e-06, + "loss": 0.2385, + "step": 511 + }, + { + "epoch": 0.11535753513391725, + "grad_norm": 0.5270740200557205, + "learning_rate": 4.941854802606537e-06, + "loss": 0.2191, + "step": 512 + }, + { + "epoch": 0.11558284281972568, + "grad_norm": 0.5198711172641344, + "learning_rate": 4.9414546850459014e-06, + "loss": 0.1988, + "step": 513 + }, + { + "epoch": 0.11580815050553411, + "grad_norm": 0.45775293911805753, + "learning_rate": 4.941053211836482e-06, + "loss": 0.1945, + "step": 514 + }, + { + "epoch": 0.11603345819134256, + "grad_norm": 0.537592497802279, + "learning_rate": 4.940650383201202e-06, + "loss": 0.2201, + "step": 515 + }, + { + "epoch": 0.11625876587715099, + "grad_norm": 0.5111545898331085, + "learning_rate": 4.940246199363737e-06, + "loss": 0.2158, + "step": 516 + }, + { + "epoch": 0.11648407356295942, + "grad_norm": 0.48521781953079823, + "learning_rate": 4.939840660548515e-06, + "loss": 0.2247, + "step": 517 + }, + { + "epoch": 0.11670938124876785, + "grad_norm": 0.48611016286528647, + "learning_rate": 4.939433766980717e-06, + "loss": 0.1998, + "step": 518 + }, + { + "epoch": 0.11693468893457629, + "grad_norm": 0.5498962603721859, + "learning_rate": 4.939025518886276e-06, + "loss": 0.2346, + "step": 519 + }, + { + "epoch": 0.11715999662038472, + "grad_norm": 0.464208027616344, + "learning_rate": 4.9386159164918764e-06, + "loss": 0.216, + "step": 520 + }, + { + "epoch": 0.11738530430619315, + "grad_norm": 0.5220733803410864, + "learning_rate": 4.938204960024955e-06, + "loss": 0.2444, + "step": 521 + }, + { + "epoch": 0.11761061199200158, + "grad_norm": 0.5037206512884734, + "learning_rate": 4.937792649713701e-06, + "loss": 0.2207, + "step": 522 + }, + { + "epoch": 0.11783591967781001, + "grad_norm": 0.4641506492723032, + "learning_rate": 4.937378985787055e-06, + "loss": 0.2025, + "step": 523 + }, + { + "epoch": 0.11806122736361845, + "grad_norm": 0.4926177131624458, + "learning_rate": 4.9369639684747095e-06, + "loss": 0.2068, + "step": 524 + }, + { + "epoch": 0.11828653504942688, + "grad_norm": 0.5135612305585935, + "learning_rate": 4.936547598007107e-06, + "loss": 0.2246, + "step": 525 + }, + { + "epoch": 0.11851184273523531, + "grad_norm": 0.4900183024362626, + "learning_rate": 4.936129874615443e-06, + "loss": 0.2002, + "step": 526 + }, + { + "epoch": 0.11873715042104374, + "grad_norm": 0.49508420952954096, + "learning_rate": 4.935710798531664e-06, + "loss": 0.2281, + "step": 527 + }, + { + "epoch": 0.11896245810685217, + "grad_norm": 0.4834300036086303, + "learning_rate": 4.935290369988468e-06, + "loss": 0.2403, + "step": 528 + }, + { + "epoch": 0.1191877657926606, + "grad_norm": 0.5353723395435673, + "learning_rate": 4.934868589219302e-06, + "loss": 0.2302, + "step": 529 + }, + { + "epoch": 0.11941307347846904, + "grad_norm": 0.4860356717420388, + "learning_rate": 4.934445456458366e-06, + "loss": 0.2138, + "step": 530 + }, + { + "epoch": 0.11963838116427747, + "grad_norm": 0.5010723396295983, + "learning_rate": 4.934020971940609e-06, + "loss": 0.234, + "step": 531 + }, + { + "epoch": 0.1198636888500859, + "grad_norm": 0.4940777209766888, + "learning_rate": 4.933595135901733e-06, + "loss": 0.2117, + "step": 532 + }, + { + "epoch": 0.12008899653589433, + "grad_norm": 0.47595074974568136, + "learning_rate": 4.933167948578187e-06, + "loss": 0.2073, + "step": 533 + }, + { + "epoch": 0.12031430422170276, + "grad_norm": 0.5264491500029588, + "learning_rate": 4.932739410207172e-06, + "loss": 0.2395, + "step": 534 + }, + { + "epoch": 0.1205396119075112, + "grad_norm": 0.5132994804908365, + "learning_rate": 4.932309521026643e-06, + "loss": 0.2215, + "step": 535 + }, + { + "epoch": 0.12076491959331963, + "grad_norm": 0.49825588383402764, + "learning_rate": 4.931878281275296e-06, + "loss": 0.2164, + "step": 536 + }, + { + "epoch": 0.12099022727912806, + "grad_norm": 0.46875882919019307, + "learning_rate": 4.931445691192587e-06, + "loss": 0.2075, + "step": 537 + }, + { + "epoch": 0.12121553496493649, + "grad_norm": 0.4968035502191898, + "learning_rate": 4.931011751018715e-06, + "loss": 0.2234, + "step": 538 + }, + { + "epoch": 0.12144084265074492, + "grad_norm": 0.45371511823110433, + "learning_rate": 4.930576460994631e-06, + "loss": 0.199, + "step": 539 + }, + { + "epoch": 0.12166615033655336, + "grad_norm": 0.5403412963835224, + "learning_rate": 4.930139821362036e-06, + "loss": 0.2254, + "step": 540 + }, + { + "epoch": 0.12189145802236179, + "grad_norm": 0.4982859130144187, + "learning_rate": 4.929701832363379e-06, + "loss": 0.2188, + "step": 541 + }, + { + "epoch": 0.12211676570817022, + "grad_norm": 0.4496322669343131, + "learning_rate": 4.929262494241859e-06, + "loss": 0.1972, + "step": 542 + }, + { + "epoch": 0.12234207339397865, + "grad_norm": 0.5121308736461709, + "learning_rate": 4.928821807241425e-06, + "loss": 0.2112, + "step": 543 + }, + { + "epoch": 0.12256738107978708, + "grad_norm": 0.46707399467282235, + "learning_rate": 4.928379771606773e-06, + "loss": 0.2104, + "step": 544 + }, + { + "epoch": 0.12279268876559551, + "grad_norm": 0.4981270782643168, + "learning_rate": 4.927936387583348e-06, + "loss": 0.2369, + "step": 545 + }, + { + "epoch": 0.12301799645140395, + "grad_norm": 0.47198934236316537, + "learning_rate": 4.927491655417347e-06, + "loss": 0.2053, + "step": 546 + }, + { + "epoch": 0.12324330413721238, + "grad_norm": 0.5117927892317086, + "learning_rate": 4.927045575355712e-06, + "loss": 0.2432, + "step": 547 + }, + { + "epoch": 0.12346861182302081, + "grad_norm": 0.5051292761662544, + "learning_rate": 4.926598147646134e-06, + "loss": 0.2177, + "step": 548 + }, + { + "epoch": 0.12369391950882924, + "grad_norm": 0.46156354865639077, + "learning_rate": 4.9261493725370546e-06, + "loss": 0.1941, + "step": 549 + }, + { + "epoch": 0.12391922719463767, + "grad_norm": 0.5009780688337698, + "learning_rate": 4.9256992502776605e-06, + "loss": 0.2149, + "step": 550 + }, + { + "epoch": 0.1241445348804461, + "grad_norm": 0.4743998865409742, + "learning_rate": 4.925247781117888e-06, + "loss": 0.1954, + "step": 551 + }, + { + "epoch": 0.12436984256625454, + "grad_norm": 0.49396133628898836, + "learning_rate": 4.924794965308421e-06, + "loss": 0.2036, + "step": 552 + }, + { + "epoch": 0.12459515025206297, + "grad_norm": 0.5315568565652948, + "learning_rate": 4.924340803100692e-06, + "loss": 0.2297, + "step": 553 + }, + { + "epoch": 0.1248204579378714, + "grad_norm": 0.48953283371440887, + "learning_rate": 4.9238852947468796e-06, + "loss": 0.2172, + "step": 554 + }, + { + "epoch": 0.12504576562367983, + "grad_norm": 0.5336701860746771, + "learning_rate": 4.923428440499912e-06, + "loss": 0.2201, + "step": 555 + }, + { + "epoch": 0.12527107330948828, + "grad_norm": 0.5106857011491374, + "learning_rate": 4.922970240613461e-06, + "loss": 0.2188, + "step": 556 + }, + { + "epoch": 0.1254963809952967, + "grad_norm": 0.47508155488586634, + "learning_rate": 4.92251069534195e-06, + "loss": 0.2255, + "step": 557 + }, + { + "epoch": 0.12572168868110514, + "grad_norm": 0.490437067253941, + "learning_rate": 4.922049804940546e-06, + "loss": 0.2244, + "step": 558 + }, + { + "epoch": 0.12594699636691356, + "grad_norm": 0.5265093444022817, + "learning_rate": 4.9215875696651645e-06, + "loss": 0.2193, + "step": 559 + }, + { + "epoch": 0.126172304052722, + "grad_norm": 0.5143291674465049, + "learning_rate": 4.9211239897724685e-06, + "loss": 0.2305, + "step": 560 + }, + { + "epoch": 0.12639761173853042, + "grad_norm": 0.5200882346849997, + "learning_rate": 4.920659065519866e-06, + "loss": 0.2236, + "step": 561 + }, + { + "epoch": 0.12662291942433887, + "grad_norm": 0.468517902366109, + "learning_rate": 4.920192797165511e-06, + "loss": 0.2068, + "step": 562 + }, + { + "epoch": 0.1268482271101473, + "grad_norm": 0.5371091361672856, + "learning_rate": 4.919725184968307e-06, + "loss": 0.2259, + "step": 563 + }, + { + "epoch": 0.12707353479595573, + "grad_norm": 0.4846987418038485, + "learning_rate": 4.9192562291879e-06, + "loss": 0.2091, + "step": 564 + }, + { + "epoch": 0.12729884248176415, + "grad_norm": 0.49297540741137763, + "learning_rate": 4.9187859300846845e-06, + "loss": 0.2154, + "step": 565 + }, + { + "epoch": 0.1275241501675726, + "grad_norm": 0.4845062288901298, + "learning_rate": 4.9183142879198e-06, + "loss": 0.2293, + "step": 566 + }, + { + "epoch": 0.12774945785338102, + "grad_norm": 0.48589192727765523, + "learning_rate": 4.917841302955132e-06, + "loss": 0.2222, + "step": 567 + }, + { + "epoch": 0.12797476553918946, + "grad_norm": 0.49095764636753225, + "learning_rate": 4.917366975453311e-06, + "loss": 0.2151, + "step": 568 + }, + { + "epoch": 0.12820007322499788, + "grad_norm": 0.4917791447388033, + "learning_rate": 4.916891305677712e-06, + "loss": 0.21, + "step": 569 + }, + { + "epoch": 0.12842538091080632, + "grad_norm": 0.5212780134967567, + "learning_rate": 4.9164142938924595e-06, + "loss": 0.223, + "step": 570 + }, + { + "epoch": 0.12865068859661474, + "grad_norm": 0.5042044559239311, + "learning_rate": 4.9159359403624185e-06, + "loss": 0.2412, + "step": 571 + }, + { + "epoch": 0.1288759962824232, + "grad_norm": 0.4767150043917884, + "learning_rate": 4.915456245353202e-06, + "loss": 0.203, + "step": 572 + }, + { + "epoch": 0.1291013039682316, + "grad_norm": 0.47954942660260497, + "learning_rate": 4.914975209131165e-06, + "loss": 0.2229, + "step": 573 + }, + { + "epoch": 0.12932661165404005, + "grad_norm": 0.5009092410144068, + "learning_rate": 4.914492831963411e-06, + "loss": 0.2237, + "step": 574 + }, + { + "epoch": 0.12955191933984847, + "grad_norm": 0.4988875465505569, + "learning_rate": 4.9140091141177856e-06, + "loss": 0.2257, + "step": 575 + }, + { + "epoch": 0.12977722702565692, + "grad_norm": 0.5488783460126944, + "learning_rate": 4.9135240558628786e-06, + "loss": 0.242, + "step": 576 + }, + { + "epoch": 0.13000253471146533, + "grad_norm": 0.5091760384334044, + "learning_rate": 4.913037657468025e-06, + "loss": 0.2177, + "step": 577 + }, + { + "epoch": 0.13022784239727378, + "grad_norm": 0.48452437682730054, + "learning_rate": 4.9125499192033035e-06, + "loss": 0.2088, + "step": 578 + }, + { + "epoch": 0.1304531500830822, + "grad_norm": 0.5190036686853693, + "learning_rate": 4.912060841339536e-06, + "loss": 0.1942, + "step": 579 + }, + { + "epoch": 0.13067845776889064, + "grad_norm": 0.48345575026430726, + "learning_rate": 4.911570424148293e-06, + "loss": 0.2106, + "step": 580 + }, + { + "epoch": 0.13090376545469906, + "grad_norm": 0.5064704988590379, + "learning_rate": 4.911078667901881e-06, + "loss": 0.2329, + "step": 581 + }, + { + "epoch": 0.1311290731405075, + "grad_norm": 0.4775689444711927, + "learning_rate": 4.910585572873355e-06, + "loss": 0.2083, + "step": 582 + }, + { + "epoch": 0.13135438082631593, + "grad_norm": 0.517827260112663, + "learning_rate": 4.9100911393365134e-06, + "loss": 0.1843, + "step": 583 + }, + { + "epoch": 0.13157968851212437, + "grad_norm": 0.5406658137598743, + "learning_rate": 4.9095953675658945e-06, + "loss": 0.2307, + "step": 584 + }, + { + "epoch": 0.1318049961979328, + "grad_norm": 0.5014379648784281, + "learning_rate": 4.909098257836784e-06, + "loss": 0.1894, + "step": 585 + }, + { + "epoch": 0.13203030388374123, + "grad_norm": 0.5254637015707742, + "learning_rate": 4.908599810425208e-06, + "loss": 0.2246, + "step": 586 + }, + { + "epoch": 0.13225561156954965, + "grad_norm": 0.480677320409641, + "learning_rate": 4.908100025607935e-06, + "loss": 0.2081, + "step": 587 + }, + { + "epoch": 0.1324809192553581, + "grad_norm": 0.4816991618761013, + "learning_rate": 4.907598903662477e-06, + "loss": 0.2031, + "step": 588 + }, + { + "epoch": 0.13270622694116654, + "grad_norm": 0.525344268758633, + "learning_rate": 4.90709644486709e-06, + "loss": 0.2304, + "step": 589 + }, + { + "epoch": 0.13293153462697496, + "grad_norm": 0.5222782378351684, + "learning_rate": 4.906592649500767e-06, + "loss": 0.2144, + "step": 590 + }, + { + "epoch": 0.1331568423127834, + "grad_norm": 0.5013735378636515, + "learning_rate": 4.906087517843251e-06, + "loss": 0.2139, + "step": 591 + }, + { + "epoch": 0.13338214999859183, + "grad_norm": 0.47770889012385287, + "learning_rate": 4.9055810501750205e-06, + "loss": 0.2037, + "step": 592 + }, + { + "epoch": 0.13360745768440027, + "grad_norm": 0.48019086039186315, + "learning_rate": 4.905073246777298e-06, + "loss": 0.21, + "step": 593 + }, + { + "epoch": 0.1338327653702087, + "grad_norm": 0.5005698008240757, + "learning_rate": 4.904564107932048e-06, + "loss": 0.199, + "step": 594 + }, + { + "epoch": 0.13405807305601714, + "grad_norm": 0.4742105583103391, + "learning_rate": 4.904053633921977e-06, + "loss": 0.2146, + "step": 595 + }, + { + "epoch": 0.13428338074182555, + "grad_norm": 0.517632786803299, + "learning_rate": 4.9035418250305314e-06, + "loss": 0.2292, + "step": 596 + }, + { + "epoch": 0.134508688427634, + "grad_norm": 0.48009565501960405, + "learning_rate": 4.9030286815419e-06, + "loss": 0.1949, + "step": 597 + }, + { + "epoch": 0.13473399611344242, + "grad_norm": 0.4596160202944978, + "learning_rate": 4.902514203741013e-06, + "loss": 0.1935, + "step": 598 + }, + { + "epoch": 0.13495930379925086, + "grad_norm": 0.4658721082705013, + "learning_rate": 4.901998391913539e-06, + "loss": 0.2056, + "step": 599 + }, + { + "epoch": 0.13518461148505928, + "grad_norm": 0.45903799196737716, + "learning_rate": 4.9014812463458905e-06, + "loss": 0.2011, + "step": 600 + }, + { + "epoch": 0.13540991917086773, + "grad_norm": 0.5051774039084214, + "learning_rate": 4.9009627673252195e-06, + "loss": 0.2192, + "step": 601 + }, + { + "epoch": 0.13563522685667614, + "grad_norm": 0.527411683127971, + "learning_rate": 4.9004429551394155e-06, + "loss": 0.2237, + "step": 602 + }, + { + "epoch": 0.1358605345424846, + "grad_norm": 0.5014991689713594, + "learning_rate": 4.899921810077114e-06, + "loss": 0.2179, + "step": 603 + }, + { + "epoch": 0.136085842228293, + "grad_norm": 0.48221075981109124, + "learning_rate": 4.899399332427685e-06, + "loss": 0.2028, + "step": 604 + }, + { + "epoch": 0.13631114991410145, + "grad_norm": 0.49268110304543344, + "learning_rate": 4.898875522481242e-06, + "loss": 0.2159, + "step": 605 + }, + { + "epoch": 0.13653645759990987, + "grad_norm": 0.4942604004900246, + "learning_rate": 4.898350380528638e-06, + "loss": 0.2196, + "step": 606 + }, + { + "epoch": 0.13676176528571832, + "grad_norm": 0.5028877141966045, + "learning_rate": 4.897823906861463e-06, + "loss": 0.2253, + "step": 607 + }, + { + "epoch": 0.13698707297152674, + "grad_norm": 0.47097802636251307, + "learning_rate": 4.89729610177205e-06, + "loss": 0.2088, + "step": 608 + }, + { + "epoch": 0.13721238065733518, + "grad_norm": 0.49313670408257826, + "learning_rate": 4.896766965553467e-06, + "loss": 0.2347, + "step": 609 + }, + { + "epoch": 0.1374376883431436, + "grad_norm": 0.4871461530557875, + "learning_rate": 4.896236498499526e-06, + "loss": 0.2287, + "step": 610 + }, + { + "epoch": 0.13766299602895204, + "grad_norm": 0.46723852936407173, + "learning_rate": 4.8957047009047744e-06, + "loss": 0.2006, + "step": 611 + }, + { + "epoch": 0.13788830371476046, + "grad_norm": 0.4665199751934401, + "learning_rate": 4.8951715730645e-06, + "loss": 0.2149, + "step": 612 + }, + { + "epoch": 0.1381136114005689, + "grad_norm": 0.48305482619921186, + "learning_rate": 4.894637115274728e-06, + "loss": 0.2183, + "step": 613 + }, + { + "epoch": 0.13833891908637733, + "grad_norm": 0.5312489063314536, + "learning_rate": 4.894101327832225e-06, + "loss": 0.2482, + "step": 614 + }, + { + "epoch": 0.13856422677218577, + "grad_norm": 0.4856020344791176, + "learning_rate": 4.893564211034492e-06, + "loss": 0.2065, + "step": 615 + }, + { + "epoch": 0.1387895344579942, + "grad_norm": 0.4967354978655531, + "learning_rate": 4.89302576517977e-06, + "loss": 0.2199, + "step": 616 + }, + { + "epoch": 0.13901484214380264, + "grad_norm": 0.46566265370705107, + "learning_rate": 4.892485990567037e-06, + "loss": 0.2086, + "step": 617 + }, + { + "epoch": 0.13924014982961105, + "grad_norm": 0.4883020636515578, + "learning_rate": 4.891944887496013e-06, + "loss": 0.2219, + "step": 618 + }, + { + "epoch": 0.1394654575154195, + "grad_norm": 0.5347123554712424, + "learning_rate": 4.891402456267149e-06, + "loss": 0.2257, + "step": 619 + }, + { + "epoch": 0.13969076520122792, + "grad_norm": 0.5063686339666736, + "learning_rate": 4.890858697181638e-06, + "loss": 0.2167, + "step": 620 + }, + { + "epoch": 0.13991607288703636, + "grad_norm": 0.5461851595545708, + "learning_rate": 4.89031361054141e-06, + "loss": 0.2256, + "step": 621 + }, + { + "epoch": 0.14014138057284478, + "grad_norm": 0.5338128979292234, + "learning_rate": 4.8897671966491315e-06, + "loss": 0.2131, + "step": 622 + }, + { + "epoch": 0.14036668825865323, + "grad_norm": 0.5023418751489027, + "learning_rate": 4.889219455808204e-06, + "loss": 0.2135, + "step": 623 + }, + { + "epoch": 0.14059199594446165, + "grad_norm": 0.537914503502134, + "learning_rate": 4.888670388322768e-06, + "loss": 0.2312, + "step": 624 + }, + { + "epoch": 0.1408173036302701, + "grad_norm": 0.48795765527514035, + "learning_rate": 4.888119994497701e-06, + "loss": 0.2025, + "step": 625 + }, + { + "epoch": 0.1410426113160785, + "grad_norm": 0.5539262837760985, + "learning_rate": 4.887568274638616e-06, + "loss": 0.2395, + "step": 626 + }, + { + "epoch": 0.14126791900188695, + "grad_norm": 0.5404624831372493, + "learning_rate": 4.887015229051861e-06, + "loss": 0.217, + "step": 627 + }, + { + "epoch": 0.14149322668769537, + "grad_norm": 0.5041160845005246, + "learning_rate": 4.886460858044524e-06, + "loss": 0.2088, + "step": 628 + }, + { + "epoch": 0.14171853437350382, + "grad_norm": 0.47805588690287043, + "learning_rate": 4.885905161924426e-06, + "loss": 0.2108, + "step": 629 + }, + { + "epoch": 0.14194384205931224, + "grad_norm": 0.48883375640799653, + "learning_rate": 4.8853481410001225e-06, + "loss": 0.2045, + "step": 630 + }, + { + "epoch": 0.14216914974512068, + "grad_norm": 0.5184054859688491, + "learning_rate": 4.8847897955809085e-06, + "loss": 0.2235, + "step": 631 + }, + { + "epoch": 0.1423944574309291, + "grad_norm": 0.49947977523672554, + "learning_rate": 4.884230125976812e-06, + "loss": 0.1942, + "step": 632 + }, + { + "epoch": 0.14261976511673755, + "grad_norm": 0.510742114810367, + "learning_rate": 4.8836691324985955e-06, + "loss": 0.2206, + "step": 633 + }, + { + "epoch": 0.14284507280254596, + "grad_norm": 0.5332394483396484, + "learning_rate": 4.883106815457758e-06, + "loss": 0.2224, + "step": 634 + }, + { + "epoch": 0.1430703804883544, + "grad_norm": 0.46610893805714276, + "learning_rate": 4.882543175166535e-06, + "loss": 0.1959, + "step": 635 + }, + { + "epoch": 0.14329568817416283, + "grad_norm": 0.5339487807664778, + "learning_rate": 4.881978211937895e-06, + "loss": 0.23, + "step": 636 + }, + { + "epoch": 0.14352099585997127, + "grad_norm": 0.5068351593574385, + "learning_rate": 4.8814119260855374e-06, + "loss": 0.219, + "step": 637 + }, + { + "epoch": 0.14374630354577972, + "grad_norm": 0.4791069348195748, + "learning_rate": 4.8808443179239025e-06, + "loss": 0.2067, + "step": 638 + }, + { + "epoch": 0.14397161123158814, + "grad_norm": 0.457462839970975, + "learning_rate": 4.880275387768162e-06, + "loss": 0.1948, + "step": 639 + }, + { + "epoch": 0.14419691891739658, + "grad_norm": 0.5450517082010574, + "learning_rate": 4.87970513593422e-06, + "loss": 0.2071, + "step": 640 + }, + { + "epoch": 0.144422226603205, + "grad_norm": 0.5422338957547485, + "learning_rate": 4.879133562738719e-06, + "loss": 0.2464, + "step": 641 + }, + { + "epoch": 0.14464753428901345, + "grad_norm": 0.5071084359388005, + "learning_rate": 4.878560668499029e-06, + "loss": 0.2279, + "step": 642 + }, + { + "epoch": 0.14487284197482186, + "grad_norm": 0.4619397880876343, + "learning_rate": 4.8779864535332585e-06, + "loss": 0.2004, + "step": 643 + }, + { + "epoch": 0.1450981496606303, + "grad_norm": 0.4979038290587033, + "learning_rate": 4.877410918160247e-06, + "loss": 0.2046, + "step": 644 + }, + { + "epoch": 0.14532345734643873, + "grad_norm": 0.5319335016776964, + "learning_rate": 4.876834062699569e-06, + "loss": 0.2362, + "step": 645 + }, + { + "epoch": 0.14554876503224717, + "grad_norm": 0.4832269096267646, + "learning_rate": 4.87625588747153e-06, + "loss": 0.2173, + "step": 646 + }, + { + "epoch": 0.1457740727180556, + "grad_norm": 0.4984344262125044, + "learning_rate": 4.875676392797169e-06, + "loss": 0.2109, + "step": 647 + }, + { + "epoch": 0.14599938040386404, + "grad_norm": 0.5563434490911646, + "learning_rate": 4.875095578998258e-06, + "loss": 0.2234, + "step": 648 + }, + { + "epoch": 0.14622468808967246, + "grad_norm": 0.4852569925709238, + "learning_rate": 4.874513446397301e-06, + "loss": 0.1997, + "step": 649 + }, + { + "epoch": 0.1464499957754809, + "grad_norm": 0.48144931872854707, + "learning_rate": 4.873929995317535e-06, + "loss": 0.2106, + "step": 650 + }, + { + "epoch": 0.14667530346128932, + "grad_norm": 0.5072590003715031, + "learning_rate": 4.873345226082929e-06, + "loss": 0.2188, + "step": 651 + }, + { + "epoch": 0.14690061114709776, + "grad_norm": 0.4966540485881892, + "learning_rate": 4.872759139018183e-06, + "loss": 0.2111, + "step": 652 + }, + { + "epoch": 0.14712591883290618, + "grad_norm": 0.5271163207218221, + "learning_rate": 4.872171734448728e-06, + "loss": 0.2318, + "step": 653 + }, + { + "epoch": 0.14735122651871463, + "grad_norm": 0.5337810154931217, + "learning_rate": 4.87158301270073e-06, + "loss": 0.2022, + "step": 654 + }, + { + "epoch": 0.14757653420452305, + "grad_norm": 0.4517221586825233, + "learning_rate": 4.870992974101084e-06, + "loss": 0.1939, + "step": 655 + }, + { + "epoch": 0.1478018418903315, + "grad_norm": 0.5280705676185263, + "learning_rate": 4.870401618977415e-06, + "loss": 0.2335, + "step": 656 + }, + { + "epoch": 0.1480271495761399, + "grad_norm": 0.4982889360888342, + "learning_rate": 4.869808947658082e-06, + "loss": 0.2344, + "step": 657 + }, + { + "epoch": 0.14825245726194836, + "grad_norm": 0.5201163717623067, + "learning_rate": 4.869214960472172e-06, + "loss": 0.223, + "step": 658 + }, + { + "epoch": 0.14847776494775677, + "grad_norm": 0.49536958068342385, + "learning_rate": 4.868619657749505e-06, + "loss": 0.2219, + "step": 659 + }, + { + "epoch": 0.14870307263356522, + "grad_norm": 0.5262562890345055, + "learning_rate": 4.868023039820629e-06, + "loss": 0.2283, + "step": 660 + }, + { + "epoch": 0.14892838031937364, + "grad_norm": 0.45138943610472454, + "learning_rate": 4.867425107016826e-06, + "loss": 0.1986, + "step": 661 + }, + { + "epoch": 0.14915368800518208, + "grad_norm": 0.4816189071034616, + "learning_rate": 4.8668258596701035e-06, + "loss": 0.1908, + "step": 662 + }, + { + "epoch": 0.1493789956909905, + "grad_norm": 0.4843524878942397, + "learning_rate": 4.866225298113203e-06, + "loss": 0.2013, + "step": 663 + }, + { + "epoch": 0.14960430337679895, + "grad_norm": 0.5486470299188861, + "learning_rate": 4.865623422679593e-06, + "loss": 0.2325, + "step": 664 + }, + { + "epoch": 0.14982961106260737, + "grad_norm": 0.5176940887442645, + "learning_rate": 4.865020233703472e-06, + "loss": 0.2232, + "step": 665 + }, + { + "epoch": 0.1500549187484158, + "grad_norm": 0.5091612097821714, + "learning_rate": 4.864415731519769e-06, + "loss": 0.2102, + "step": 666 + }, + { + "epoch": 0.15028022643422423, + "grad_norm": 0.504972394000636, + "learning_rate": 4.863809916464142e-06, + "loss": 0.219, + "step": 667 + }, + { + "epoch": 0.15050553412003267, + "grad_norm": 0.5020711759554679, + "learning_rate": 4.8632027888729765e-06, + "loss": 0.2144, + "step": 668 + }, + { + "epoch": 0.1507308418058411, + "grad_norm": 0.4806893163663347, + "learning_rate": 4.862594349083389e-06, + "loss": 0.1895, + "step": 669 + }, + { + "epoch": 0.15095614949164954, + "grad_norm": 0.4962466701514298, + "learning_rate": 4.861984597433223e-06, + "loss": 0.2159, + "step": 670 + }, + { + "epoch": 0.15118145717745796, + "grad_norm": 0.4896818762695566, + "learning_rate": 4.861373534261049e-06, + "loss": 0.1955, + "step": 671 + }, + { + "epoch": 0.1514067648632664, + "grad_norm": 0.5137506496338324, + "learning_rate": 4.860761159906171e-06, + "loss": 0.2262, + "step": 672 + }, + { + "epoch": 0.15163207254907482, + "grad_norm": 0.4472904498934483, + "learning_rate": 4.8601474747086145e-06, + "loss": 0.1919, + "step": 673 + }, + { + "epoch": 0.15185738023488327, + "grad_norm": 0.506372262580618, + "learning_rate": 4.859532479009138e-06, + "loss": 0.2186, + "step": 674 + }, + { + "epoch": 0.15208268792069168, + "grad_norm": 0.6157081715613052, + "learning_rate": 4.8589161731492255e-06, + "loss": 0.2057, + "step": 675 + }, + { + "epoch": 0.15230799560650013, + "grad_norm": 0.5103362732878185, + "learning_rate": 4.858298557471089e-06, + "loss": 0.2192, + "step": 676 + }, + { + "epoch": 0.15253330329230855, + "grad_norm": 0.49455722278468994, + "learning_rate": 4.857679632317664e-06, + "loss": 0.228, + "step": 677 + }, + { + "epoch": 0.152758610978117, + "grad_norm": 0.5070030229009717, + "learning_rate": 4.857059398032622e-06, + "loss": 0.2074, + "step": 678 + }, + { + "epoch": 0.1529839186639254, + "grad_norm": 0.46878036214174107, + "learning_rate": 4.856437854960352e-06, + "loss": 0.2052, + "step": 679 + }, + { + "epoch": 0.15320922634973386, + "grad_norm": 0.49226434301901395, + "learning_rate": 4.855815003445975e-06, + "loss": 0.2226, + "step": 680 + }, + { + "epoch": 0.15343453403554227, + "grad_norm": 0.48702391586924726, + "learning_rate": 4.855190843835338e-06, + "loss": 0.2131, + "step": 681 + }, + { + "epoch": 0.15365984172135072, + "grad_norm": 0.5061582304296567, + "learning_rate": 4.8545653764750125e-06, + "loss": 0.2292, + "step": 682 + }, + { + "epoch": 0.15388514940715914, + "grad_norm": 0.4541571317048153, + "learning_rate": 4.853938601712297e-06, + "loss": 0.2084, + "step": 683 + }, + { + "epoch": 0.15411045709296758, + "grad_norm": 0.5060840092522514, + "learning_rate": 4.853310519895217e-06, + "loss": 0.2061, + "step": 684 + }, + { + "epoch": 0.154335764778776, + "grad_norm": 0.49219169689950343, + "learning_rate": 4.852681131372522e-06, + "loss": 0.225, + "step": 685 + }, + { + "epoch": 0.15456107246458445, + "grad_norm": 0.5135374750800394, + "learning_rate": 4.85205043649369e-06, + "loss": 0.2178, + "step": 686 + }, + { + "epoch": 0.1547863801503929, + "grad_norm": 0.4705933351891192, + "learning_rate": 4.851418435608919e-06, + "loss": 0.2074, + "step": 687 + }, + { + "epoch": 0.1550116878362013, + "grad_norm": 0.49450453458984256, + "learning_rate": 4.850785129069139e-06, + "loss": 0.2067, + "step": 688 + }, + { + "epoch": 0.15523699552200976, + "grad_norm": 0.4652776306565102, + "learning_rate": 4.850150517225999e-06, + "loss": 0.2023, + "step": 689 + }, + { + "epoch": 0.15546230320781818, + "grad_norm": 0.4822444368988619, + "learning_rate": 4.849514600431877e-06, + "loss": 0.2037, + "step": 690 + }, + { + "epoch": 0.15568761089362662, + "grad_norm": 0.5056319193122155, + "learning_rate": 4.848877379039874e-06, + "loss": 0.2235, + "step": 691 + }, + { + "epoch": 0.15591291857943504, + "grad_norm": 0.4874049237646761, + "learning_rate": 4.848238853403813e-06, + "loss": 0.2082, + "step": 692 + }, + { + "epoch": 0.15613822626524348, + "grad_norm": 0.5408155214553032, + "learning_rate": 4.847599023878245e-06, + "loss": 0.2205, + "step": 693 + }, + { + "epoch": 0.1563635339510519, + "grad_norm": 0.4962062180128646, + "learning_rate": 4.846957890818444e-06, + "loss": 0.219, + "step": 694 + }, + { + "epoch": 0.15658884163686035, + "grad_norm": 0.457960800868629, + "learning_rate": 4.846315454580406e-06, + "loss": 0.198, + "step": 695 + }, + { + "epoch": 0.15681414932266877, + "grad_norm": 0.48199028963379814, + "learning_rate": 4.845671715520853e-06, + "loss": 0.2026, + "step": 696 + }, + { + "epoch": 0.1570394570084772, + "grad_norm": 0.46279616242607235, + "learning_rate": 4.845026673997229e-06, + "loss": 0.1898, + "step": 697 + }, + { + "epoch": 0.15726476469428563, + "grad_norm": 0.485564952237087, + "learning_rate": 4.844380330367701e-06, + "loss": 0.2084, + "step": 698 + }, + { + "epoch": 0.15749007238009408, + "grad_norm": 0.47919503929382407, + "learning_rate": 4.843732684991161e-06, + "loss": 0.2134, + "step": 699 + }, + { + "epoch": 0.1577153800659025, + "grad_norm": 0.5507967373531661, + "learning_rate": 4.84308373822722e-06, + "loss": 0.2368, + "step": 700 + }, + { + "epoch": 0.15794068775171094, + "grad_norm": 0.4748805534625741, + "learning_rate": 4.842433490436217e-06, + "loss": 0.2072, + "step": 701 + }, + { + "epoch": 0.15816599543751936, + "grad_norm": 0.505175057363065, + "learning_rate": 4.841781941979207e-06, + "loss": 0.2194, + "step": 702 + }, + { + "epoch": 0.1583913031233278, + "grad_norm": 0.47570732738418525, + "learning_rate": 4.8411290932179734e-06, + "loss": 0.2062, + "step": 703 + }, + { + "epoch": 0.15861661080913622, + "grad_norm": 0.4760889805757383, + "learning_rate": 4.840474944515017e-06, + "loss": 0.2175, + "step": 704 + }, + { + "epoch": 0.15884191849494467, + "grad_norm": 0.4996849687805509, + "learning_rate": 4.839819496233562e-06, + "loss": 0.2164, + "step": 705 + }, + { + "epoch": 0.15906722618075309, + "grad_norm": 0.48751640993420303, + "learning_rate": 4.839162748737556e-06, + "loss": 0.2022, + "step": 706 + }, + { + "epoch": 0.15929253386656153, + "grad_norm": 0.5047505029740413, + "learning_rate": 4.838504702391665e-06, + "loss": 0.2176, + "step": 707 + }, + { + "epoch": 0.15951784155236995, + "grad_norm": 0.4934854389416716, + "learning_rate": 4.8378453575612785e-06, + "loss": 0.2287, + "step": 708 + }, + { + "epoch": 0.1597431492381784, + "grad_norm": 0.46732468687602846, + "learning_rate": 4.837184714612506e-06, + "loss": 0.1946, + "step": 709 + }, + { + "epoch": 0.1599684569239868, + "grad_norm": 0.4645307989408414, + "learning_rate": 4.836522773912178e-06, + "loss": 0.2055, + "step": 710 + }, + { + "epoch": 0.16019376460979526, + "grad_norm": 0.5025033641155221, + "learning_rate": 4.835859535827844e-06, + "loss": 0.2099, + "step": 711 + }, + { + "epoch": 0.16041907229560368, + "grad_norm": 0.5348614295989801, + "learning_rate": 4.835195000727778e-06, + "loss": 0.2227, + "step": 712 + }, + { + "epoch": 0.16064437998141212, + "grad_norm": 0.5012837638155585, + "learning_rate": 4.834529168980969e-06, + "loss": 0.2187, + "step": 713 + }, + { + "epoch": 0.16086968766722054, + "grad_norm": 0.4842613158213902, + "learning_rate": 4.83386204095713e-06, + "loss": 0.2153, + "step": 714 + }, + { + "epoch": 0.16109499535302899, + "grad_norm": 0.49070022600833896, + "learning_rate": 4.833193617026692e-06, + "loss": 0.2104, + "step": 715 + }, + { + "epoch": 0.1613203030388374, + "grad_norm": 0.4681314758386819, + "learning_rate": 4.832523897560806e-06, + "loss": 0.1985, + "step": 716 + }, + { + "epoch": 0.16154561072464585, + "grad_norm": 0.4883181546275142, + "learning_rate": 4.831852882931342e-06, + "loss": 0.2027, + "step": 717 + }, + { + "epoch": 0.16177091841045427, + "grad_norm": 0.5078892280826068, + "learning_rate": 4.83118057351089e-06, + "loss": 0.2187, + "step": 718 + }, + { + "epoch": 0.1619962260962627, + "grad_norm": 0.4659629231650999, + "learning_rate": 4.830506969672758e-06, + "loss": 0.2044, + "step": 719 + }, + { + "epoch": 0.16222153378207113, + "grad_norm": 0.4983678424832625, + "learning_rate": 4.829832071790972e-06, + "loss": 0.2204, + "step": 720 + }, + { + "epoch": 0.16244684146787958, + "grad_norm": 0.510587752735488, + "learning_rate": 4.829155880240279e-06, + "loss": 0.2126, + "step": 721 + }, + { + "epoch": 0.162672149153688, + "grad_norm": 0.47502937360691894, + "learning_rate": 4.828478395396143e-06, + "loss": 0.203, + "step": 722 + }, + { + "epoch": 0.16289745683949644, + "grad_norm": 0.48095305502021934, + "learning_rate": 4.8277996176347465e-06, + "loss": 0.1944, + "step": 723 + }, + { + "epoch": 0.16312276452530486, + "grad_norm": 0.5011496962504307, + "learning_rate": 4.827119547332988e-06, + "loss": 0.1976, + "step": 724 + }, + { + "epoch": 0.1633480722111133, + "grad_norm": 0.4870881491532629, + "learning_rate": 4.826438184868486e-06, + "loss": 0.2068, + "step": 725 + }, + { + "epoch": 0.16357337989692172, + "grad_norm": 0.4802671703139868, + "learning_rate": 4.825755530619576e-06, + "loss": 0.2001, + "step": 726 + }, + { + "epoch": 0.16379868758273017, + "grad_norm": 0.505504955125028, + "learning_rate": 4.825071584965308e-06, + "loss": 0.2013, + "step": 727 + }, + { + "epoch": 0.16402399526853859, + "grad_norm": 0.5120257000541731, + "learning_rate": 4.824386348285456e-06, + "loss": 0.2234, + "step": 728 + }, + { + "epoch": 0.16424930295434703, + "grad_norm": 0.49486406271365074, + "learning_rate": 4.823699820960502e-06, + "loss": 0.2156, + "step": 729 + }, + { + "epoch": 0.16447461064015545, + "grad_norm": 0.49603637373143805, + "learning_rate": 4.8230120033716525e-06, + "loss": 0.209, + "step": 730 + }, + { + "epoch": 0.1646999183259639, + "grad_norm": 0.4897408502719706, + "learning_rate": 4.822322895900825e-06, + "loss": 0.2001, + "step": 731 + }, + { + "epoch": 0.1649252260117723, + "grad_norm": 0.4839050055640949, + "learning_rate": 4.821632498930656e-06, + "loss": 0.1967, + "step": 732 + }, + { + "epoch": 0.16515053369758076, + "grad_norm": 0.49343908730827324, + "learning_rate": 4.820940812844496e-06, + "loss": 0.197, + "step": 733 + }, + { + "epoch": 0.16537584138338918, + "grad_norm": 0.4880269330508922, + "learning_rate": 4.820247838026414e-06, + "loss": 0.205, + "step": 734 + }, + { + "epoch": 0.16560114906919762, + "grad_norm": 0.44528666537541817, + "learning_rate": 4.819553574861192e-06, + "loss": 0.19, + "step": 735 + }, + { + "epoch": 0.16582645675500607, + "grad_norm": 0.5024733094727657, + "learning_rate": 4.81885802373433e-06, + "loss": 0.1959, + "step": 736 + }, + { + "epoch": 0.1660517644408145, + "grad_norm": 0.4995022959909293, + "learning_rate": 4.818161185032039e-06, + "loss": 0.226, + "step": 737 + }, + { + "epoch": 0.16627707212662293, + "grad_norm": 0.5124846611104443, + "learning_rate": 4.8174630591412495e-06, + "loss": 0.2278, + "step": 738 + }, + { + "epoch": 0.16650237981243135, + "grad_norm": 0.479097121759596, + "learning_rate": 4.816763646449605e-06, + "loss": 0.2174, + "step": 739 + }, + { + "epoch": 0.1667276874982398, + "grad_norm": 0.49905849734476754, + "learning_rate": 4.816062947345462e-06, + "loss": 0.2061, + "step": 740 + }, + { + "epoch": 0.16695299518404821, + "grad_norm": 0.5776653039328091, + "learning_rate": 4.815360962217894e-06, + "loss": 0.2477, + "step": 741 + }, + { + "epoch": 0.16717830286985666, + "grad_norm": 0.4492369799443974, + "learning_rate": 4.814657691456685e-06, + "loss": 0.2035, + "step": 742 + }, + { + "epoch": 0.16740361055566508, + "grad_norm": 0.5113299677466411, + "learning_rate": 4.813953135452338e-06, + "loss": 0.216, + "step": 743 + }, + { + "epoch": 0.16762891824147352, + "grad_norm": 0.48903920254544786, + "learning_rate": 4.813247294596065e-06, + "loss": 0.2115, + "step": 744 + }, + { + "epoch": 0.16785422592728194, + "grad_norm": 0.4907396350181237, + "learning_rate": 4.812540169279793e-06, + "loss": 0.2054, + "step": 745 + }, + { + "epoch": 0.1680795336130904, + "grad_norm": 0.4973341977301926, + "learning_rate": 4.8118317598961625e-06, + "loss": 0.2092, + "step": 746 + }, + { + "epoch": 0.1683048412988988, + "grad_norm": 0.5234295945986369, + "learning_rate": 4.811122066838527e-06, + "loss": 0.201, + "step": 747 + }, + { + "epoch": 0.16853014898470725, + "grad_norm": 0.5106855409283393, + "learning_rate": 4.810411090500952e-06, + "loss": 0.2035, + "step": 748 + }, + { + "epoch": 0.16875545667051567, + "grad_norm": 0.4941944616680497, + "learning_rate": 4.809698831278217e-06, + "loss": 0.1904, + "step": 749 + }, + { + "epoch": 0.16898076435632411, + "grad_norm": 0.526233500381215, + "learning_rate": 4.808985289565813e-06, + "loss": 0.248, + "step": 750 + }, + { + "epoch": 0.16920607204213253, + "grad_norm": 0.555851515057548, + "learning_rate": 4.808270465759943e-06, + "loss": 0.2197, + "step": 751 + }, + { + "epoch": 0.16943137972794098, + "grad_norm": 0.5319174152865482, + "learning_rate": 4.807554360257522e-06, + "loss": 0.2214, + "step": 752 + }, + { + "epoch": 0.1696566874137494, + "grad_norm": 0.5200007767252409, + "learning_rate": 4.806836973456175e-06, + "loss": 0.2063, + "step": 753 + }, + { + "epoch": 0.16988199509955784, + "grad_norm": 0.5010693543012754, + "learning_rate": 4.8061183057542424e-06, + "loss": 0.1932, + "step": 754 + }, + { + "epoch": 0.17010730278536626, + "grad_norm": 0.483753158950766, + "learning_rate": 4.8053983575507735e-06, + "loss": 0.199, + "step": 755 + }, + { + "epoch": 0.1703326104711747, + "grad_norm": 0.5201699552301197, + "learning_rate": 4.804677129245527e-06, + "loss": 0.2182, + "step": 756 + }, + { + "epoch": 0.17055791815698312, + "grad_norm": 0.4443984008980045, + "learning_rate": 4.8039546212389765e-06, + "loss": 0.1863, + "step": 757 + }, + { + "epoch": 0.17078322584279157, + "grad_norm": 0.4837697089799788, + "learning_rate": 4.803230833932302e-06, + "loss": 0.2153, + "step": 758 + }, + { + "epoch": 0.1710085335286, + "grad_norm": 0.45416749946288687, + "learning_rate": 4.802505767727395e-06, + "loss": 0.1996, + "step": 759 + }, + { + "epoch": 0.17123384121440843, + "grad_norm": 0.5059509662538503, + "learning_rate": 4.80177942302686e-06, + "loss": 0.224, + "step": 760 + }, + { + "epoch": 0.17145914890021685, + "grad_norm": 0.47433654222833943, + "learning_rate": 4.8010518002340065e-06, + "loss": 0.1976, + "step": 761 + }, + { + "epoch": 0.1716844565860253, + "grad_norm": 0.4486508915715903, + "learning_rate": 4.800322899752859e-06, + "loss": 0.1919, + "step": 762 + }, + { + "epoch": 0.17190976427183371, + "grad_norm": 0.5277705228383666, + "learning_rate": 4.799592721988147e-06, + "loss": 0.2254, + "step": 763 + }, + { + "epoch": 0.17213507195764216, + "grad_norm": 0.46936342933913916, + "learning_rate": 4.798861267345312e-06, + "loss": 0.1874, + "step": 764 + }, + { + "epoch": 0.17236037964345058, + "grad_norm": 0.47208441835137804, + "learning_rate": 4.798128536230502e-06, + "loss": 0.2198, + "step": 765 + }, + { + "epoch": 0.17258568732925902, + "grad_norm": 0.5079351578489507, + "learning_rate": 4.797394529050577e-06, + "loss": 0.2136, + "step": 766 + }, + { + "epoch": 0.17281099501506744, + "grad_norm": 0.4749930509449937, + "learning_rate": 4.796659246213103e-06, + "loss": 0.2022, + "step": 767 + }, + { + "epoch": 0.1730363027008759, + "grad_norm": 0.48256447098600935, + "learning_rate": 4.795922688126355e-06, + "loss": 0.1886, + "step": 768 + }, + { + "epoch": 0.1732616103866843, + "grad_norm": 0.5026443329361266, + "learning_rate": 4.795184855199316e-06, + "loss": 0.2238, + "step": 769 + }, + { + "epoch": 0.17348691807249275, + "grad_norm": 0.5104560997846961, + "learning_rate": 4.794445747841679e-06, + "loss": 0.2283, + "step": 770 + }, + { + "epoch": 0.17371222575830117, + "grad_norm": 0.5044270674296859, + "learning_rate": 4.79370536646384e-06, + "loss": 0.2189, + "step": 771 + }, + { + "epoch": 0.17393753344410962, + "grad_norm": 0.5140756252400323, + "learning_rate": 4.792963711476908e-06, + "loss": 0.205, + "step": 772 + }, + { + "epoch": 0.17416284112991803, + "grad_norm": 0.44698255181524993, + "learning_rate": 4.792220783292694e-06, + "loss": 0.1879, + "step": 773 + }, + { + "epoch": 0.17438814881572648, + "grad_norm": 0.4877214209383508, + "learning_rate": 4.791476582323719e-06, + "loss": 0.2078, + "step": 774 + }, + { + "epoch": 0.1746134565015349, + "grad_norm": 0.48158986340543913, + "learning_rate": 4.790731108983211e-06, + "loss": 0.2106, + "step": 775 + }, + { + "epoch": 0.17483876418734334, + "grad_norm": 0.4824643746877253, + "learning_rate": 4.7899843636851014e-06, + "loss": 0.1993, + "step": 776 + }, + { + "epoch": 0.17506407187315176, + "grad_norm": 0.5028710774155346, + "learning_rate": 4.789236346844034e-06, + "loss": 0.2103, + "step": 777 + }, + { + "epoch": 0.1752893795589602, + "grad_norm": 0.48846527928319905, + "learning_rate": 4.78848705887535e-06, + "loss": 0.2138, + "step": 778 + }, + { + "epoch": 0.17551468724476862, + "grad_norm": 0.5481737612065419, + "learning_rate": 4.7877365001951045e-06, + "loss": 0.2127, + "step": 779 + }, + { + "epoch": 0.17573999493057707, + "grad_norm": 0.5057300958645277, + "learning_rate": 4.786984671220053e-06, + "loss": 0.1929, + "step": 780 + }, + { + "epoch": 0.1759653026163855, + "grad_norm": 0.5223737070900981, + "learning_rate": 4.786231572367659e-06, + "loss": 0.223, + "step": 781 + }, + { + "epoch": 0.17619061030219393, + "grad_norm": 0.47579601143913175, + "learning_rate": 4.785477204056089e-06, + "loss": 0.1999, + "step": 782 + }, + { + "epoch": 0.17641591798800235, + "grad_norm": 0.5114716682723512, + "learning_rate": 4.784721566704217e-06, + "loss": 0.2126, + "step": 783 + }, + { + "epoch": 0.1766412256738108, + "grad_norm": 0.5016228140613117, + "learning_rate": 4.78396466073162e-06, + "loss": 0.2067, + "step": 784 + }, + { + "epoch": 0.17686653335961924, + "grad_norm": 0.5258855690597636, + "learning_rate": 4.7832064865585795e-06, + "loss": 0.2041, + "step": 785 + }, + { + "epoch": 0.17709184104542766, + "grad_norm": 0.4928419952160604, + "learning_rate": 4.78244704460608e-06, + "loss": 0.2034, + "step": 786 + }, + { + "epoch": 0.1773171487312361, + "grad_norm": 0.5315359278084137, + "learning_rate": 4.781686335295813e-06, + "loss": 0.2095, + "step": 787 + }, + { + "epoch": 0.17754245641704453, + "grad_norm": 0.5425535063529452, + "learning_rate": 4.7809243590501725e-06, + "loss": 0.2059, + "step": 788 + }, + { + "epoch": 0.17776776410285297, + "grad_norm": 0.4925028756436677, + "learning_rate": 4.780161116292254e-06, + "loss": 0.2101, + "step": 789 + }, + { + "epoch": 0.1779930717886614, + "grad_norm": 0.4819602024457228, + "learning_rate": 4.779396607445858e-06, + "loss": 0.2109, + "step": 790 + }, + { + "epoch": 0.17821837947446983, + "grad_norm": 0.5084789479262742, + "learning_rate": 4.778630832935489e-06, + "loss": 0.2068, + "step": 791 + }, + { + "epoch": 0.17844368716027825, + "grad_norm": 0.49164909455844125, + "learning_rate": 4.777863793186351e-06, + "loss": 0.1965, + "step": 792 + }, + { + "epoch": 0.1786689948460867, + "grad_norm": 0.45404718306951947, + "learning_rate": 4.777095488624355e-06, + "loss": 0.1892, + "step": 793 + }, + { + "epoch": 0.17889430253189512, + "grad_norm": 0.5250790406013919, + "learning_rate": 4.776325919676109e-06, + "loss": 0.223, + "step": 794 + }, + { + "epoch": 0.17911961021770356, + "grad_norm": 0.4751129458903047, + "learning_rate": 4.775555086768929e-06, + "loss": 0.1894, + "step": 795 + }, + { + "epoch": 0.17934491790351198, + "grad_norm": 0.47785323999242096, + "learning_rate": 4.774782990330828e-06, + "loss": 0.1979, + "step": 796 + }, + { + "epoch": 0.17957022558932043, + "grad_norm": 0.5516777340082402, + "learning_rate": 4.774009630790522e-06, + "loss": 0.2201, + "step": 797 + }, + { + "epoch": 0.17979553327512884, + "grad_norm": 0.47988603490744913, + "learning_rate": 4.77323500857743e-06, + "loss": 0.1803, + "step": 798 + }, + { + "epoch": 0.1800208409609373, + "grad_norm": 0.4896587265535693, + "learning_rate": 4.77245912412167e-06, + "loss": 0.2061, + "step": 799 + }, + { + "epoch": 0.1802461486467457, + "grad_norm": 0.4768354652873873, + "learning_rate": 4.771681977854062e-06, + "loss": 0.1977, + "step": 800 + }, + { + "epoch": 0.18047145633255415, + "grad_norm": 0.5068517760497676, + "learning_rate": 4.7709035702061275e-06, + "loss": 0.2085, + "step": 801 + }, + { + "epoch": 0.18069676401836257, + "grad_norm": 0.4837732376602242, + "learning_rate": 4.770123901610085e-06, + "loss": 0.2149, + "step": 802 + }, + { + "epoch": 0.18092207170417102, + "grad_norm": 0.5028989424795088, + "learning_rate": 4.7693429724988565e-06, + "loss": 0.2128, + "step": 803 + }, + { + "epoch": 0.18114737938997943, + "grad_norm": 0.47412046057681884, + "learning_rate": 4.768560783306064e-06, + "loss": 0.2094, + "step": 804 + }, + { + "epoch": 0.18137268707578788, + "grad_norm": 0.46633100002211975, + "learning_rate": 4.767777334466025e-06, + "loss": 0.1924, + "step": 805 + }, + { + "epoch": 0.1815979947615963, + "grad_norm": 0.4767387772306873, + "learning_rate": 4.7669926264137625e-06, + "loss": 0.2031, + "step": 806 + }, + { + "epoch": 0.18182330244740474, + "grad_norm": 0.45107711302832615, + "learning_rate": 4.766206659584994e-06, + "loss": 0.1861, + "step": 807 + }, + { + "epoch": 0.18204861013321316, + "grad_norm": 0.4850288582778965, + "learning_rate": 4.765419434416138e-06, + "loss": 0.2066, + "step": 808 + }, + { + "epoch": 0.1822739178190216, + "grad_norm": 0.5183967156862951, + "learning_rate": 4.7646309513443115e-06, + "loss": 0.2258, + "step": 809 + }, + { + "epoch": 0.18249922550483003, + "grad_norm": 0.4732335317619318, + "learning_rate": 4.763841210807329e-06, + "loss": 0.2042, + "step": 810 + }, + { + "epoch": 0.18272453319063847, + "grad_norm": 0.5035561890035928, + "learning_rate": 4.763050213243705e-06, + "loss": 0.2127, + "step": 811 + }, + { + "epoch": 0.1829498408764469, + "grad_norm": 0.5268159851154696, + "learning_rate": 4.762257959092651e-06, + "loss": 0.224, + "step": 812 + }, + { + "epoch": 0.18317514856225534, + "grad_norm": 0.5252501786106092, + "learning_rate": 4.7614644487940755e-06, + "loss": 0.2137, + "step": 813 + }, + { + "epoch": 0.18340045624806375, + "grad_norm": 0.48099245665705387, + "learning_rate": 4.760669682788584e-06, + "loss": 0.2154, + "step": 814 + }, + { + "epoch": 0.1836257639338722, + "grad_norm": 0.5195932961525055, + "learning_rate": 4.759873661517484e-06, + "loss": 0.2162, + "step": 815 + }, + { + "epoch": 0.18385107161968062, + "grad_norm": 0.491871174052472, + "learning_rate": 4.759076385422773e-06, + "loss": 0.185, + "step": 816 + }, + { + "epoch": 0.18407637930548906, + "grad_norm": 0.509836207753738, + "learning_rate": 4.75827785494715e-06, + "loss": 0.2051, + "step": 817 + }, + { + "epoch": 0.18430168699129748, + "grad_norm": 0.4392612233029081, + "learning_rate": 4.7574780705340094e-06, + "loss": 0.1871, + "step": 818 + }, + { + "epoch": 0.18452699467710593, + "grad_norm": 0.48366284865291176, + "learning_rate": 4.756677032627442e-06, + "loss": 0.211, + "step": 819 + }, + { + "epoch": 0.18475230236291434, + "grad_norm": 0.5179292833390267, + "learning_rate": 4.755874741672233e-06, + "loss": 0.1983, + "step": 820 + }, + { + "epoch": 0.1849776100487228, + "grad_norm": 0.5287770477559092, + "learning_rate": 4.755071198113865e-06, + "loss": 0.1977, + "step": 821 + }, + { + "epoch": 0.1852029177345312, + "grad_norm": 0.4994813270845832, + "learning_rate": 4.754266402398517e-06, + "loss": 0.2049, + "step": 822 + }, + { + "epoch": 0.18542822542033965, + "grad_norm": 0.5018580233691172, + "learning_rate": 4.753460354973061e-06, + "loss": 0.2128, + "step": 823 + }, + { + "epoch": 0.18565353310614807, + "grad_norm": 0.4922830030204302, + "learning_rate": 4.752653056285066e-06, + "loss": 0.1933, + "step": 824 + }, + { + "epoch": 0.18587884079195652, + "grad_norm": 0.4715524098560804, + "learning_rate": 4.751844506782793e-06, + "loss": 0.2087, + "step": 825 + }, + { + "epoch": 0.18610414847776494, + "grad_norm": 0.4522675254925592, + "learning_rate": 4.7510347069152015e-06, + "loss": 0.1848, + "step": 826 + }, + { + "epoch": 0.18632945616357338, + "grad_norm": 0.48698757731561165, + "learning_rate": 4.750223657131942e-06, + "loss": 0.2058, + "step": 827 + }, + { + "epoch": 0.1865547638493818, + "grad_norm": 0.517348152295888, + "learning_rate": 4.74941135788336e-06, + "loss": 0.2017, + "step": 828 + }, + { + "epoch": 0.18678007153519025, + "grad_norm": 0.46765817389544195, + "learning_rate": 4.748597809620496e-06, + "loss": 0.1783, + "step": 829 + }, + { + "epoch": 0.18700537922099866, + "grad_norm": 0.4869830196332321, + "learning_rate": 4.747783012795083e-06, + "loss": 0.2186, + "step": 830 + }, + { + "epoch": 0.1872306869068071, + "grad_norm": 0.5186877521225656, + "learning_rate": 4.746966967859547e-06, + "loss": 0.2244, + "step": 831 + }, + { + "epoch": 0.18745599459261553, + "grad_norm": 0.49419873589873664, + "learning_rate": 4.746149675267005e-06, + "loss": 0.2026, + "step": 832 + }, + { + "epoch": 0.18768130227842397, + "grad_norm": 0.463256941223187, + "learning_rate": 4.745331135471274e-06, + "loss": 0.1821, + "step": 833 + }, + { + "epoch": 0.18790660996423242, + "grad_norm": 0.47169350721016196, + "learning_rate": 4.744511348926855e-06, + "loss": 0.204, + "step": 834 + }, + { + "epoch": 0.18813191765004084, + "grad_norm": 0.5089584854309798, + "learning_rate": 4.743690316088945e-06, + "loss": 0.2019, + "step": 835 + }, + { + "epoch": 0.18835722533584928, + "grad_norm": 0.5362100998570167, + "learning_rate": 4.742868037413435e-06, + "loss": 0.2238, + "step": 836 + }, + { + "epoch": 0.1885825330216577, + "grad_norm": 0.49376879981663013, + "learning_rate": 4.742044513356904e-06, + "loss": 0.2022, + "step": 837 + }, + { + "epoch": 0.18880784070746615, + "grad_norm": 0.4753613006458755, + "learning_rate": 4.741219744376624e-06, + "loss": 0.1857, + "step": 838 + }, + { + "epoch": 0.18903314839327456, + "grad_norm": 0.5128856428577322, + "learning_rate": 4.74039373093056e-06, + "loss": 0.2236, + "step": 839 + }, + { + "epoch": 0.189258456079083, + "grad_norm": 0.47043356290506755, + "learning_rate": 4.739566473477365e-06, + "loss": 0.1877, + "step": 840 + }, + { + "epoch": 0.18948376376489143, + "grad_norm": 0.46941463398179445, + "learning_rate": 4.738737972476385e-06, + "loss": 0.1882, + "step": 841 + }, + { + "epoch": 0.18970907145069987, + "grad_norm": 0.5214765210899315, + "learning_rate": 4.737908228387656e-06, + "loss": 0.2142, + "step": 842 + }, + { + "epoch": 0.1899343791365083, + "grad_norm": 0.5090865193255023, + "learning_rate": 4.737077241671904e-06, + "loss": 0.2184, + "step": 843 + }, + { + "epoch": 0.19015968682231674, + "grad_norm": 0.47499345696520845, + "learning_rate": 4.736245012790543e-06, + "loss": 0.1979, + "step": 844 + }, + { + "epoch": 0.19038499450812515, + "grad_norm": 0.5085348156878557, + "learning_rate": 4.735411542205681e-06, + "loss": 0.2186, + "step": 845 + }, + { + "epoch": 0.1906103021939336, + "grad_norm": 0.5114810350064947, + "learning_rate": 4.734576830380113e-06, + "loss": 0.1988, + "step": 846 + }, + { + "epoch": 0.19083560987974202, + "grad_norm": 0.5046741855344354, + "learning_rate": 4.733740877777322e-06, + "loss": 0.1889, + "step": 847 + }, + { + "epoch": 0.19106091756555046, + "grad_norm": 0.5123210879900315, + "learning_rate": 4.732903684861482e-06, + "loss": 0.217, + "step": 848 + }, + { + "epoch": 0.19128622525135888, + "grad_norm": 0.5001336462444113, + "learning_rate": 4.732065252097455e-06, + "loss": 0.2081, + "step": 849 + }, + { + "epoch": 0.19151153293716733, + "grad_norm": 0.5010914354651583, + "learning_rate": 4.731225579950791e-06, + "loss": 0.1969, + "step": 850 + }, + { + "epoch": 0.19173684062297575, + "grad_norm": 0.5315585209426944, + "learning_rate": 4.730384668887731e-06, + "loss": 0.2057, + "step": 851 + }, + { + "epoch": 0.1919621483087842, + "grad_norm": 0.47737497037739746, + "learning_rate": 4.7295425193751974e-06, + "loss": 0.1931, + "step": 852 + }, + { + "epoch": 0.1921874559945926, + "grad_norm": 0.5107650973444327, + "learning_rate": 4.728699131880808e-06, + "loss": 0.2216, + "step": 853 + }, + { + "epoch": 0.19241276368040106, + "grad_norm": 0.4927404381863255, + "learning_rate": 4.727854506872863e-06, + "loss": 0.1811, + "step": 854 + }, + { + "epoch": 0.19263807136620947, + "grad_norm": 0.5440839555234477, + "learning_rate": 4.727008644820351e-06, + "loss": 0.2144, + "step": 855 + }, + { + "epoch": 0.19286337905201792, + "grad_norm": 0.473118996117686, + "learning_rate": 4.726161546192949e-06, + "loss": 0.1905, + "step": 856 + }, + { + "epoch": 0.19308868673782634, + "grad_norm": 0.4893909321136876, + "learning_rate": 4.725313211461018e-06, + "loss": 0.2197, + "step": 857 + }, + { + "epoch": 0.19331399442363478, + "grad_norm": 0.5494385064867621, + "learning_rate": 4.724463641095606e-06, + "loss": 0.201, + "step": 858 + }, + { + "epoch": 0.1935393021094432, + "grad_norm": 0.5865614997435555, + "learning_rate": 4.72361283556845e-06, + "loss": 0.2016, + "step": 859 + }, + { + "epoch": 0.19376460979525165, + "grad_norm": 0.4817635376357147, + "learning_rate": 4.7227607953519686e-06, + "loss": 0.1974, + "step": 860 + }, + { + "epoch": 0.19398991748106006, + "grad_norm": 0.49922389827875635, + "learning_rate": 4.7219075209192686e-06, + "loss": 0.2158, + "step": 861 + }, + { + "epoch": 0.1942152251668685, + "grad_norm": 0.508901551076916, + "learning_rate": 4.721053012744142e-06, + "loss": 0.1942, + "step": 862 + }, + { + "epoch": 0.19444053285267693, + "grad_norm": 0.48630878190190474, + "learning_rate": 4.720197271301064e-06, + "loss": 0.1964, + "step": 863 + }, + { + "epoch": 0.19466584053848537, + "grad_norm": 0.52100758845284, + "learning_rate": 4.719340297065198e-06, + "loss": 0.1978, + "step": 864 + }, + { + "epoch": 0.1948911482242938, + "grad_norm": 0.5037370713625282, + "learning_rate": 4.718482090512389e-06, + "loss": 0.2097, + "step": 865 + }, + { + "epoch": 0.19511645591010224, + "grad_norm": 0.4445806183832522, + "learning_rate": 4.717622652119166e-06, + "loss": 0.179, + "step": 866 + }, + { + "epoch": 0.19534176359591066, + "grad_norm": 0.5080486724061966, + "learning_rate": 4.716761982362744e-06, + "loss": 0.2104, + "step": 867 + }, + { + "epoch": 0.1955670712817191, + "grad_norm": 0.49940784161664664, + "learning_rate": 4.715900081721021e-06, + "loss": 0.2143, + "step": 868 + }, + { + "epoch": 0.19579237896752752, + "grad_norm": 0.47038955836580026, + "learning_rate": 4.715036950672578e-06, + "loss": 0.1986, + "step": 869 + }, + { + "epoch": 0.19601768665333597, + "grad_norm": 0.5024540621434118, + "learning_rate": 4.71417258969668e-06, + "loss": 0.207, + "step": 870 + }, + { + "epoch": 0.19624299433914438, + "grad_norm": 0.5005538713420657, + "learning_rate": 4.713306999273273e-06, + "loss": 0.2014, + "step": 871 + }, + { + "epoch": 0.19646830202495283, + "grad_norm": 0.4337940525693888, + "learning_rate": 4.712440179882989e-06, + "loss": 0.1762, + "step": 872 + }, + { + "epoch": 0.19669360971076125, + "grad_norm": 0.5022289469192481, + "learning_rate": 4.711572132007139e-06, + "loss": 0.208, + "step": 873 + }, + { + "epoch": 0.1969189173965697, + "grad_norm": 0.49721562741477776, + "learning_rate": 4.710702856127718e-06, + "loss": 0.1902, + "step": 874 + }, + { + "epoch": 0.1971442250823781, + "grad_norm": 0.46782011695349285, + "learning_rate": 4.709832352727404e-06, + "loss": 0.1832, + "step": 875 + }, + { + "epoch": 0.19736953276818656, + "grad_norm": 0.5233171695881712, + "learning_rate": 4.708960622289552e-06, + "loss": 0.2297, + "step": 876 + }, + { + "epoch": 0.19759484045399497, + "grad_norm": 0.4891626271790364, + "learning_rate": 4.708087665298204e-06, + "loss": 0.2142, + "step": 877 + }, + { + "epoch": 0.19782014813980342, + "grad_norm": 0.49036548809469066, + "learning_rate": 4.70721348223808e-06, + "loss": 0.2044, + "step": 878 + }, + { + "epoch": 0.19804545582561184, + "grad_norm": 0.49459219834398377, + "learning_rate": 4.706338073594581e-06, + "loss": 0.1904, + "step": 879 + }, + { + "epoch": 0.19827076351142028, + "grad_norm": 0.5251422157074895, + "learning_rate": 4.705461439853789e-06, + "loss": 0.2099, + "step": 880 + }, + { + "epoch": 0.1984960711972287, + "grad_norm": 0.5218502451324368, + "learning_rate": 4.704583581502465e-06, + "loss": 0.2016, + "step": 881 + }, + { + "epoch": 0.19872137888303715, + "grad_norm": 0.5375383373374043, + "learning_rate": 4.703704499028052e-06, + "loss": 0.2279, + "step": 882 + }, + { + "epoch": 0.1989466865688456, + "grad_norm": 0.5013860134456253, + "learning_rate": 4.702824192918672e-06, + "loss": 0.196, + "step": 883 + }, + { + "epoch": 0.199171994254654, + "grad_norm": 0.49400976639317334, + "learning_rate": 4.701942663663126e-06, + "loss": 0.1966, + "step": 884 + }, + { + "epoch": 0.19939730194046246, + "grad_norm": 0.4782477218853474, + "learning_rate": 4.7010599117508936e-06, + "loss": 0.1975, + "step": 885 + }, + { + "epoch": 0.19962260962627087, + "grad_norm": 0.49125209285588883, + "learning_rate": 4.700175937672134e-06, + "loss": 0.2145, + "step": 886 + }, + { + "epoch": 0.19984791731207932, + "grad_norm": 0.4947849790526274, + "learning_rate": 4.699290741917686e-06, + "loss": 0.1918, + "step": 887 + }, + { + "epoch": 0.20007322499788774, + "grad_norm": 0.4871051730476656, + "learning_rate": 4.698404324979066e-06, + "loss": 0.2004, + "step": 888 + }, + { + "epoch": 0.20029853268369618, + "grad_norm": 0.4850852542725395, + "learning_rate": 4.697516687348466e-06, + "loss": 0.2078, + "step": 889 + }, + { + "epoch": 0.2005238403695046, + "grad_norm": 0.4843999369317545, + "learning_rate": 4.696627829518761e-06, + "loss": 0.2152, + "step": 890 + }, + { + "epoch": 0.20074914805531305, + "grad_norm": 0.47134945943413825, + "learning_rate": 4.695737751983499e-06, + "loss": 0.2079, + "step": 891 + }, + { + "epoch": 0.20097445574112147, + "grad_norm": 0.4905317614259472, + "learning_rate": 4.6948464552369075e-06, + "loss": 0.1991, + "step": 892 + }, + { + "epoch": 0.2011997634269299, + "grad_norm": 0.4589845824499802, + "learning_rate": 4.69395393977389e-06, + "loss": 0.1911, + "step": 893 + }, + { + "epoch": 0.20142507111273833, + "grad_norm": 0.5483608683591121, + "learning_rate": 4.693060206090028e-06, + "loss": 0.2005, + "step": 894 + }, + { + "epoch": 0.20165037879854678, + "grad_norm": 0.48146645970205143, + "learning_rate": 4.692165254681576e-06, + "loss": 0.2062, + "step": 895 + }, + { + "epoch": 0.2018756864843552, + "grad_norm": 0.4802272404776062, + "learning_rate": 4.69126908604547e-06, + "loss": 0.2015, + "step": 896 + }, + { + "epoch": 0.20210099417016364, + "grad_norm": 0.4864389896967028, + "learning_rate": 4.690371700679317e-06, + "loss": 0.2159, + "step": 897 + }, + { + "epoch": 0.20232630185597206, + "grad_norm": 0.4833496775805583, + "learning_rate": 4.689473099081403e-06, + "loss": 0.2147, + "step": 898 + }, + { + "epoch": 0.2025516095417805, + "grad_norm": 0.4735139372849752, + "learning_rate": 4.688573281750688e-06, + "loss": 0.1909, + "step": 899 + }, + { + "epoch": 0.20277691722758892, + "grad_norm": 0.4759882208841414, + "learning_rate": 4.687672249186805e-06, + "loss": 0.1924, + "step": 900 + }, + { + "epoch": 0.20300222491339737, + "grad_norm": 0.4540908297529742, + "learning_rate": 4.686770001890067e-06, + "loss": 0.1821, + "step": 901 + }, + { + "epoch": 0.20322753259920578, + "grad_norm": 0.5424952194958993, + "learning_rate": 4.685866540361456e-06, + "loss": 0.2193, + "step": 902 + }, + { + "epoch": 0.20345284028501423, + "grad_norm": 0.494573221016372, + "learning_rate": 4.684961865102631e-06, + "loss": 0.1784, + "step": 903 + }, + { + "epoch": 0.20367814797082265, + "grad_norm": 0.481241020297038, + "learning_rate": 4.684055976615924e-06, + "loss": 0.1889, + "step": 904 + }, + { + "epoch": 0.2039034556566311, + "grad_norm": 0.4888537311435723, + "learning_rate": 4.683148875404343e-06, + "loss": 0.2118, + "step": 905 + }, + { + "epoch": 0.2041287633424395, + "grad_norm": 0.530285062832325, + "learning_rate": 4.682240561971565e-06, + "loss": 0.2026, + "step": 906 + }, + { + "epoch": 0.20435407102824796, + "grad_norm": 0.5175840786735468, + "learning_rate": 4.681331036821945e-06, + "loss": 0.2111, + "step": 907 + }, + { + "epoch": 0.20457937871405638, + "grad_norm": 0.5378288296166718, + "learning_rate": 4.680420300460505e-06, + "loss": 0.2207, + "step": 908 + }, + { + "epoch": 0.20480468639986482, + "grad_norm": 0.4474303570821406, + "learning_rate": 4.679508353392946e-06, + "loss": 0.1858, + "step": 909 + }, + { + "epoch": 0.20502999408567324, + "grad_norm": 0.4501349486773342, + "learning_rate": 4.678595196125638e-06, + "loss": 0.1837, + "step": 910 + }, + { + "epoch": 0.20525530177148169, + "grad_norm": 0.46741357149834606, + "learning_rate": 4.677680829165623e-06, + "loss": 0.1948, + "step": 911 + }, + { + "epoch": 0.2054806094572901, + "grad_norm": 0.505427555015149, + "learning_rate": 4.676765253020613e-06, + "loss": 0.1932, + "step": 912 + }, + { + "epoch": 0.20570591714309855, + "grad_norm": 0.5261795901887475, + "learning_rate": 4.675848468198995e-06, + "loss": 0.2205, + "step": 913 + }, + { + "epoch": 0.20593122482890697, + "grad_norm": 0.4785390252820153, + "learning_rate": 4.674930475209827e-06, + "loss": 0.192, + "step": 914 + }, + { + "epoch": 0.2061565325147154, + "grad_norm": 0.4702296119402517, + "learning_rate": 4.674011274562833e-06, + "loss": 0.1808, + "step": 915 + }, + { + "epoch": 0.20638184020052383, + "grad_norm": 0.4657319507835202, + "learning_rate": 4.673090866768412e-06, + "loss": 0.1989, + "step": 916 + }, + { + "epoch": 0.20660714788633228, + "grad_norm": 0.46078255730309053, + "learning_rate": 4.672169252337633e-06, + "loss": 0.1841, + "step": 917 + }, + { + "epoch": 0.2068324555721407, + "grad_norm": 0.48096334840945715, + "learning_rate": 4.671246431782234e-06, + "loss": 0.2007, + "step": 918 + }, + { + "epoch": 0.20705776325794914, + "grad_norm": 0.4654079412952585, + "learning_rate": 4.670322405614621e-06, + "loss": 0.1895, + "step": 919 + }, + { + "epoch": 0.20728307094375756, + "grad_norm": 0.47745372396317154, + "learning_rate": 4.669397174347874e-06, + "loss": 0.2216, + "step": 920 + }, + { + "epoch": 0.207508378629566, + "grad_norm": 0.49261334298866033, + "learning_rate": 4.668470738495738e-06, + "loss": 0.2069, + "step": 921 + }, + { + "epoch": 0.20773368631537442, + "grad_norm": 0.4694970662148565, + "learning_rate": 4.667543098572627e-06, + "loss": 0.2098, + "step": 922 + }, + { + "epoch": 0.20795899400118287, + "grad_norm": 0.4714139379876115, + "learning_rate": 4.6666142550936286e-06, + "loss": 0.1817, + "step": 923 + }, + { + "epoch": 0.20818430168699129, + "grad_norm": 0.4795558762287292, + "learning_rate": 4.665684208574492e-06, + "loss": 0.2102, + "step": 924 + }, + { + "epoch": 0.20840960937279973, + "grad_norm": 0.47967212975979767, + "learning_rate": 4.664752959531638e-06, + "loss": 0.1939, + "step": 925 + }, + { + "epoch": 0.20863491705860815, + "grad_norm": 0.46689729262959806, + "learning_rate": 4.6638205084821544e-06, + "loss": 0.1849, + "step": 926 + }, + { + "epoch": 0.2088602247444166, + "grad_norm": 0.4901758419757583, + "learning_rate": 4.6628868559437964e-06, + "loss": 0.2, + "step": 927 + }, + { + "epoch": 0.209085532430225, + "grad_norm": 0.4769235819335332, + "learning_rate": 4.661952002434988e-06, + "loss": 0.2222, + "step": 928 + }, + { + "epoch": 0.20931084011603346, + "grad_norm": 0.47665334104968426, + "learning_rate": 4.661015948474815e-06, + "loss": 0.2048, + "step": 929 + }, + { + "epoch": 0.20953614780184188, + "grad_norm": 0.48263348426606917, + "learning_rate": 4.660078694583037e-06, + "loss": 0.2054, + "step": 930 + }, + { + "epoch": 0.20976145548765032, + "grad_norm": 0.4787579733966912, + "learning_rate": 4.659140241280075e-06, + "loss": 0.2, + "step": 931 + }, + { + "epoch": 0.20998676317345877, + "grad_norm": 0.47971682458220355, + "learning_rate": 4.658200589087016e-06, + "loss": 0.2018, + "step": 932 + }, + { + "epoch": 0.21021207085926719, + "grad_norm": 0.5023172879889126, + "learning_rate": 4.657259738525615e-06, + "loss": 0.2199, + "step": 933 + }, + { + "epoch": 0.21043737854507563, + "grad_norm": 0.4537975058101896, + "learning_rate": 4.656317690118291e-06, + "loss": 0.1882, + "step": 934 + }, + { + "epoch": 0.21066268623088405, + "grad_norm": 0.45675681525958844, + "learning_rate": 4.655374444388127e-06, + "loss": 0.2015, + "step": 935 + }, + { + "epoch": 0.2108879939166925, + "grad_norm": 0.5090466662406647, + "learning_rate": 4.654430001858874e-06, + "loss": 0.2143, + "step": 936 + }, + { + "epoch": 0.2111133016025009, + "grad_norm": 0.4809352459973046, + "learning_rate": 4.653484363054947e-06, + "loss": 0.2062, + "step": 937 + }, + { + "epoch": 0.21133860928830936, + "grad_norm": 0.5107396687444519, + "learning_rate": 4.6525375285014195e-06, + "loss": 0.216, + "step": 938 + }, + { + "epoch": 0.21156391697411778, + "grad_norm": 0.4629800541902415, + "learning_rate": 4.651589498724037e-06, + "loss": 0.1974, + "step": 939 + }, + { + "epoch": 0.21178922465992622, + "grad_norm": 0.5263963932749216, + "learning_rate": 4.650640274249205e-06, + "loss": 0.2231, + "step": 940 + }, + { + "epoch": 0.21201453234573464, + "grad_norm": 0.46810181991320743, + "learning_rate": 4.649689855603992e-06, + "loss": 0.2015, + "step": 941 + }, + { + "epoch": 0.2122398400315431, + "grad_norm": 0.48016964284677827, + "learning_rate": 4.648738243316128e-06, + "loss": 0.2066, + "step": 942 + }, + { + "epoch": 0.2124651477173515, + "grad_norm": 0.4777563828083404, + "learning_rate": 4.647785437914011e-06, + "loss": 0.2031, + "step": 943 + }, + { + "epoch": 0.21269045540315995, + "grad_norm": 0.5169923253068535, + "learning_rate": 4.646831439926696e-06, + "loss": 0.2025, + "step": 944 + }, + { + "epoch": 0.21291576308896837, + "grad_norm": 0.49956268988880237, + "learning_rate": 4.645876249883903e-06, + "loss": 0.2033, + "step": 945 + }, + { + "epoch": 0.21314107077477681, + "grad_norm": 0.42879509011316225, + "learning_rate": 4.644919868316014e-06, + "loss": 0.1703, + "step": 946 + }, + { + "epoch": 0.21336637846058523, + "grad_norm": 0.5157983960464682, + "learning_rate": 4.643962295754073e-06, + "loss": 0.2325, + "step": 947 + }, + { + "epoch": 0.21359168614639368, + "grad_norm": 0.45373820129184583, + "learning_rate": 4.643003532729783e-06, + "loss": 0.1925, + "step": 948 + }, + { + "epoch": 0.2138169938322021, + "grad_norm": 0.46768291883639157, + "learning_rate": 4.642043579775509e-06, + "loss": 0.1995, + "step": 949 + }, + { + "epoch": 0.21404230151801054, + "grad_norm": 0.4795324072096692, + "learning_rate": 4.641082437424277e-06, + "loss": 0.2039, + "step": 950 + }, + { + "epoch": 0.21426760920381896, + "grad_norm": 0.462899598246732, + "learning_rate": 4.640120106209776e-06, + "loss": 0.196, + "step": 951 + }, + { + "epoch": 0.2144929168896274, + "grad_norm": 0.5119023712466997, + "learning_rate": 4.639156586666349e-06, + "loss": 0.2122, + "step": 952 + }, + { + "epoch": 0.21471822457543582, + "grad_norm": 0.5434175288463815, + "learning_rate": 4.638191879329005e-06, + "loss": 0.2226, + "step": 953 + }, + { + "epoch": 0.21494353226124427, + "grad_norm": 0.4631901819128636, + "learning_rate": 4.63722598473341e-06, + "loss": 0.1876, + "step": 954 + }, + { + "epoch": 0.2151688399470527, + "grad_norm": 0.4983287992002548, + "learning_rate": 4.636258903415888e-06, + "loss": 0.1986, + "step": 955 + }, + { + "epoch": 0.21539414763286113, + "grad_norm": 0.48619747849195893, + "learning_rate": 4.635290635913425e-06, + "loss": 0.2009, + "step": 956 + }, + { + "epoch": 0.21561945531866955, + "grad_norm": 0.4786474276651927, + "learning_rate": 4.63432118276366e-06, + "loss": 0.2206, + "step": 957 + }, + { + "epoch": 0.215844763004478, + "grad_norm": 0.5181321336167938, + "learning_rate": 4.633350544504899e-06, + "loss": 0.2015, + "step": 958 + }, + { + "epoch": 0.21607007069028641, + "grad_norm": 0.4499194505004076, + "learning_rate": 4.632378721676098e-06, + "loss": 0.1977, + "step": 959 + }, + { + "epoch": 0.21629537837609486, + "grad_norm": 0.4659723722720275, + "learning_rate": 4.6314057148168765e-06, + "loss": 0.1942, + "step": 960 + }, + { + "epoch": 0.21652068606190328, + "grad_norm": 0.47146995989968604, + "learning_rate": 4.6304315244675065e-06, + "loss": 0.1991, + "step": 961 + }, + { + "epoch": 0.21674599374771172, + "grad_norm": 0.472387918086441, + "learning_rate": 4.629456151168921e-06, + "loss": 0.2073, + "step": 962 + }, + { + "epoch": 0.21697130143352014, + "grad_norm": 0.4744768420284569, + "learning_rate": 4.628479595462708e-06, + "loss": 0.201, + "step": 963 + }, + { + "epoch": 0.2171966091193286, + "grad_norm": 0.4790940048946635, + "learning_rate": 4.627501857891113e-06, + "loss": 0.2016, + "step": 964 + }, + { + "epoch": 0.217421916805137, + "grad_norm": 0.5433094472540639, + "learning_rate": 4.626522938997037e-06, + "loss": 0.2217, + "step": 965 + }, + { + "epoch": 0.21764722449094545, + "grad_norm": 0.5057802899697299, + "learning_rate": 4.625542839324036e-06, + "loss": 0.2174, + "step": 966 + }, + { + "epoch": 0.21787253217675387, + "grad_norm": 0.4564508405727325, + "learning_rate": 4.624561559416324e-06, + "loss": 0.1963, + "step": 967 + }, + { + "epoch": 0.21809783986256231, + "grad_norm": 0.4850122497006368, + "learning_rate": 4.623579099818769e-06, + "loss": 0.2071, + "step": 968 + }, + { + "epoch": 0.21832314754837073, + "grad_norm": 0.47069406003295644, + "learning_rate": 4.6225954610768945e-06, + "loss": 0.1826, + "step": 969 + }, + { + "epoch": 0.21854845523417918, + "grad_norm": 0.5139328996398782, + "learning_rate": 4.621610643736878e-06, + "loss": 0.2008, + "step": 970 + }, + { + "epoch": 0.2187737629199876, + "grad_norm": 0.49853075614802633, + "learning_rate": 4.620624648345552e-06, + "loss": 0.189, + "step": 971 + }, + { + "epoch": 0.21899907060579604, + "grad_norm": 0.4978922725219317, + "learning_rate": 4.6196374754504024e-06, + "loss": 0.2064, + "step": 972 + }, + { + "epoch": 0.21922437829160446, + "grad_norm": 0.5492903937176812, + "learning_rate": 4.61864912559957e-06, + "loss": 0.2181, + "step": 973 + }, + { + "epoch": 0.2194496859774129, + "grad_norm": 0.4676468598153343, + "learning_rate": 4.617659599341849e-06, + "loss": 0.1858, + "step": 974 + }, + { + "epoch": 0.21967499366322132, + "grad_norm": 0.5121273605329828, + "learning_rate": 4.616668897226686e-06, + "loss": 0.1981, + "step": 975 + }, + { + "epoch": 0.21990030134902977, + "grad_norm": 0.5133092104606375, + "learning_rate": 4.615677019804182e-06, + "loss": 0.2, + "step": 976 + }, + { + "epoch": 0.2201256090348382, + "grad_norm": 0.47052555784308775, + "learning_rate": 4.6146839676250875e-06, + "loss": 0.1856, + "step": 977 + }, + { + "epoch": 0.22035091672064663, + "grad_norm": 0.5007787592833198, + "learning_rate": 4.6136897412408084e-06, + "loss": 0.2133, + "step": 978 + }, + { + "epoch": 0.22057622440645505, + "grad_norm": 0.4476718197999082, + "learning_rate": 4.612694341203403e-06, + "loss": 0.1981, + "step": 979 + }, + { + "epoch": 0.2208015320922635, + "grad_norm": 0.505928985012462, + "learning_rate": 4.611697768065577e-06, + "loss": 0.2293, + "step": 980 + }, + { + "epoch": 0.22102683977807194, + "grad_norm": 0.45916432371599647, + "learning_rate": 4.610700022380692e-06, + "loss": 0.1986, + "step": 981 + }, + { + "epoch": 0.22125214746388036, + "grad_norm": 0.44801896454482915, + "learning_rate": 4.609701104702759e-06, + "loss": 0.1767, + "step": 982 + }, + { + "epoch": 0.2214774551496888, + "grad_norm": 0.44352066728889705, + "learning_rate": 4.6087010155864394e-06, + "loss": 0.1749, + "step": 983 + }, + { + "epoch": 0.22170276283549722, + "grad_norm": 0.5003803254595942, + "learning_rate": 4.607699755587046e-06, + "loss": 0.2185, + "step": 984 + }, + { + "epoch": 0.22192807052130567, + "grad_norm": 0.508871618965233, + "learning_rate": 4.60669732526054e-06, + "loss": 0.2267, + "step": 985 + }, + { + "epoch": 0.2221533782071141, + "grad_norm": 0.5103168838866659, + "learning_rate": 4.605693725163536e-06, + "loss": 0.211, + "step": 986 + }, + { + "epoch": 0.22237868589292253, + "grad_norm": 0.5265531733209746, + "learning_rate": 4.6046889558532925e-06, + "loss": 0.2024, + "step": 987 + }, + { + "epoch": 0.22260399357873095, + "grad_norm": 0.467641364150172, + "learning_rate": 4.603683017887722e-06, + "loss": 0.1877, + "step": 988 + }, + { + "epoch": 0.2228293012645394, + "grad_norm": 0.4586339747851823, + "learning_rate": 4.602675911825386e-06, + "loss": 0.1968, + "step": 989 + }, + { + "epoch": 0.22305460895034782, + "grad_norm": 0.4469623909492166, + "learning_rate": 4.6016676382254895e-06, + "loss": 0.1861, + "step": 990 + }, + { + "epoch": 0.22327991663615626, + "grad_norm": 0.44631832200784094, + "learning_rate": 4.600658197647892e-06, + "loss": 0.1829, + "step": 991 + }, + { + "epoch": 0.22350522432196468, + "grad_norm": 0.5005155798196936, + "learning_rate": 4.5996475906530955e-06, + "loss": 0.2042, + "step": 992 + }, + { + "epoch": 0.22373053200777313, + "grad_norm": 0.48715270623744505, + "learning_rate": 4.598635817802256e-06, + "loss": 0.1994, + "step": 993 + }, + { + "epoch": 0.22395583969358154, + "grad_norm": 0.5048595676388182, + "learning_rate": 4.597622879657171e-06, + "loss": 0.2015, + "step": 994 + }, + { + "epoch": 0.22418114737939, + "grad_norm": 0.46241002830597766, + "learning_rate": 4.596608776780287e-06, + "loss": 0.1881, + "step": 995 + }, + { + "epoch": 0.2244064550651984, + "grad_norm": 0.5040186572752211, + "learning_rate": 4.595593509734699e-06, + "loss": 0.1965, + "step": 996 + }, + { + "epoch": 0.22463176275100685, + "grad_norm": 0.4683991435681605, + "learning_rate": 4.594577079084146e-06, + "loss": 0.1905, + "step": 997 + }, + { + "epoch": 0.22485707043681527, + "grad_norm": 0.5489715975104815, + "learning_rate": 4.593559485393015e-06, + "loss": 0.1985, + "step": 998 + }, + { + "epoch": 0.22508237812262372, + "grad_norm": 0.4937562704030336, + "learning_rate": 4.592540729226336e-06, + "loss": 0.2024, + "step": 999 + }, + { + "epoch": 0.22530768580843213, + "grad_norm": 0.48927660007700485, + "learning_rate": 4.591520811149787e-06, + "loss": 0.2003, + "step": 1000 + }, + { + "epoch": 0.22530768580843213, + "eval_loss": 0.20059539377689362, + "eval_runtime": 56.9577, + "eval_samples_per_second": 50.388, + "eval_steps_per_second": 6.303, + "step": 1000 + }, + { + "epoch": 0.22553299349424058, + "grad_norm": 0.4873377661508005, + "learning_rate": 4.590499731729692e-06, + "loss": 0.1891, + "step": 1001 + }, + { + "epoch": 0.225758301180049, + "grad_norm": 0.4606505014040577, + "learning_rate": 4.589477491533016e-06, + "loss": 0.1754, + "step": 1002 + }, + { + "epoch": 0.22598360886585744, + "grad_norm": 0.4632691183389092, + "learning_rate": 4.588454091127373e-06, + "loss": 0.2012, + "step": 1003 + }, + { + "epoch": 0.22620891655166586, + "grad_norm": 0.513187475547357, + "learning_rate": 4.587429531081019e-06, + "loss": 0.2174, + "step": 1004 + }, + { + "epoch": 0.2264342242374743, + "grad_norm": 0.4632332589291033, + "learning_rate": 4.586403811962852e-06, + "loss": 0.1752, + "step": 1005 + }, + { + "epoch": 0.22665953192328273, + "grad_norm": 0.47166845046530775, + "learning_rate": 4.585376934342418e-06, + "loss": 0.1876, + "step": 1006 + }, + { + "epoch": 0.22688483960909117, + "grad_norm": 0.45534470177889064, + "learning_rate": 4.584348898789901e-06, + "loss": 0.1864, + "step": 1007 + }, + { + "epoch": 0.2271101472948996, + "grad_norm": 0.47142471258560964, + "learning_rate": 4.583319705876133e-06, + "loss": 0.1997, + "step": 1008 + }, + { + "epoch": 0.22733545498070803, + "grad_norm": 0.47524284739162703, + "learning_rate": 4.5822893561725864e-06, + "loss": 0.1897, + "step": 1009 + }, + { + "epoch": 0.22756076266651645, + "grad_norm": 0.48893857418977654, + "learning_rate": 4.581257850251376e-06, + "loss": 0.2148, + "step": 1010 + }, + { + "epoch": 0.2277860703523249, + "grad_norm": 0.5084929680834531, + "learning_rate": 4.580225188685257e-06, + "loss": 0.1949, + "step": 1011 + }, + { + "epoch": 0.22801137803813332, + "grad_norm": 0.4302654708694908, + "learning_rate": 4.579191372047631e-06, + "loss": 0.1816, + "step": 1012 + }, + { + "epoch": 0.22823668572394176, + "grad_norm": 0.5653477551038639, + "learning_rate": 4.578156400912535e-06, + "loss": 0.2325, + "step": 1013 + }, + { + "epoch": 0.22846199340975018, + "grad_norm": 0.4939375469052575, + "learning_rate": 4.577120275854649e-06, + "loss": 0.1947, + "step": 1014 + }, + { + "epoch": 0.22868730109555863, + "grad_norm": 0.45880693003154194, + "learning_rate": 4.576082997449298e-06, + "loss": 0.2009, + "step": 1015 + }, + { + "epoch": 0.22891260878136704, + "grad_norm": 0.481144247528256, + "learning_rate": 4.5750445662724426e-06, + "loss": 0.2044, + "step": 1016 + }, + { + "epoch": 0.2291379164671755, + "grad_norm": 0.43781558585457897, + "learning_rate": 4.574004982900684e-06, + "loss": 0.1929, + "step": 1017 + }, + { + "epoch": 0.2293632241529839, + "grad_norm": 0.47685847780930846, + "learning_rate": 4.572964247911265e-06, + "loss": 0.2025, + "step": 1018 + }, + { + "epoch": 0.22958853183879235, + "grad_norm": 0.48768308372414104, + "learning_rate": 4.5719223618820666e-06, + "loss": 0.2181, + "step": 1019 + }, + { + "epoch": 0.22981383952460077, + "grad_norm": 0.4388679579903982, + "learning_rate": 4.5708793253916104e-06, + "loss": 0.1879, + "step": 1020 + }, + { + "epoch": 0.23003914721040922, + "grad_norm": 0.4624116781087665, + "learning_rate": 4.569835139019054e-06, + "loss": 0.1899, + "step": 1021 + }, + { + "epoch": 0.23026445489621764, + "grad_norm": 0.4715220388699325, + "learning_rate": 4.568789803344196e-06, + "loss": 0.2098, + "step": 1022 + }, + { + "epoch": 0.23048976258202608, + "grad_norm": 0.4807951206189574, + "learning_rate": 4.567743318947472e-06, + "loss": 0.1907, + "step": 1023 + }, + { + "epoch": 0.2307150702678345, + "grad_norm": 0.5293917950146294, + "learning_rate": 4.566695686409957e-06, + "loss": 0.2277, + "step": 1024 + }, + { + "epoch": 0.23094037795364294, + "grad_norm": 0.45548382716015795, + "learning_rate": 4.56564690631336e-06, + "loss": 0.1961, + "step": 1025 + }, + { + "epoch": 0.23116568563945136, + "grad_norm": 0.4470221608628322, + "learning_rate": 4.564596979240031e-06, + "loss": 0.1808, + "step": 1026 + }, + { + "epoch": 0.2313909933252598, + "grad_norm": 0.45453412170665775, + "learning_rate": 4.563545905772956e-06, + "loss": 0.1887, + "step": 1027 + }, + { + "epoch": 0.23161630101106823, + "grad_norm": 0.45823231004159254, + "learning_rate": 4.562493686495756e-06, + "loss": 0.1873, + "step": 1028 + }, + { + "epoch": 0.23184160869687667, + "grad_norm": 0.4726721668649344, + "learning_rate": 4.56144032199269e-06, + "loss": 0.177, + "step": 1029 + }, + { + "epoch": 0.23206691638268512, + "grad_norm": 0.46570651926236073, + "learning_rate": 4.56038581284865e-06, + "loss": 0.1909, + "step": 1030 + }, + { + "epoch": 0.23229222406849354, + "grad_norm": 0.48102186512126616, + "learning_rate": 4.559330159649166e-06, + "loss": 0.1949, + "step": 1031 + }, + { + "epoch": 0.23251753175430198, + "grad_norm": 0.45537531678769255, + "learning_rate": 4.558273362980406e-06, + "loss": 0.1867, + "step": 1032 + }, + { + "epoch": 0.2327428394401104, + "grad_norm": 0.4479286120571412, + "learning_rate": 4.557215423429167e-06, + "loss": 0.193, + "step": 1033 + }, + { + "epoch": 0.23296814712591885, + "grad_norm": 0.466876833020868, + "learning_rate": 4.556156341582884e-06, + "loss": 0.2019, + "step": 1034 + }, + { + "epoch": 0.23319345481172726, + "grad_norm": 0.4744309046925059, + "learning_rate": 4.555096118029625e-06, + "loss": 0.1839, + "step": 1035 + }, + { + "epoch": 0.2334187624975357, + "grad_norm": 0.4633486908021026, + "learning_rate": 4.5540347533580935e-06, + "loss": 0.2008, + "step": 1036 + }, + { + "epoch": 0.23364407018334413, + "grad_norm": 0.49738677349987176, + "learning_rate": 4.5529722481576265e-06, + "loss": 0.1866, + "step": 1037 + }, + { + "epoch": 0.23386937786915257, + "grad_norm": 0.4554322332010211, + "learning_rate": 4.551908603018191e-06, + "loss": 0.1906, + "step": 1038 + }, + { + "epoch": 0.234094685554961, + "grad_norm": 0.47960865183869833, + "learning_rate": 4.550843818530392e-06, + "loss": 0.1985, + "step": 1039 + }, + { + "epoch": 0.23431999324076944, + "grad_norm": 0.4762222550115432, + "learning_rate": 4.549777895285464e-06, + "loss": 0.1889, + "step": 1040 + }, + { + "epoch": 0.23454530092657785, + "grad_norm": 0.4744698309952979, + "learning_rate": 4.548710833875273e-06, + "loss": 0.2032, + "step": 1041 + }, + { + "epoch": 0.2347706086123863, + "grad_norm": 0.4712325404065455, + "learning_rate": 4.547642634892321e-06, + "loss": 0.1857, + "step": 1042 + }, + { + "epoch": 0.23499591629819472, + "grad_norm": 0.48600480555475006, + "learning_rate": 4.5465732989297365e-06, + "loss": 0.1913, + "step": 1043 + }, + { + "epoch": 0.23522122398400316, + "grad_norm": 0.49136149767291737, + "learning_rate": 4.545502826581284e-06, + "loss": 0.1994, + "step": 1044 + }, + { + "epoch": 0.23544653166981158, + "grad_norm": 0.5043897243534683, + "learning_rate": 4.5444312184413554e-06, + "loss": 0.2108, + "step": 1045 + }, + { + "epoch": 0.23567183935562003, + "grad_norm": 0.5002253645900545, + "learning_rate": 4.543358475104975e-06, + "loss": 0.195, + "step": 1046 + }, + { + "epoch": 0.23589714704142845, + "grad_norm": 0.4564537413667741, + "learning_rate": 4.5422845971677985e-06, + "loss": 0.1837, + "step": 1047 + }, + { + "epoch": 0.2361224547272369, + "grad_norm": 0.46359899734593507, + "learning_rate": 4.541209585226109e-06, + "loss": 0.1806, + "step": 1048 + }, + { + "epoch": 0.2363477624130453, + "grad_norm": 0.4792586591587733, + "learning_rate": 4.5401334398768195e-06, + "loss": 0.1835, + "step": 1049 + }, + { + "epoch": 0.23657307009885375, + "grad_norm": 0.4560369435657108, + "learning_rate": 4.539056161717477e-06, + "loss": 0.1927, + "step": 1050 + }, + { + "epoch": 0.23679837778466217, + "grad_norm": 0.4762807669924131, + "learning_rate": 4.53797775134625e-06, + "loss": 0.1769, + "step": 1051 + }, + { + "epoch": 0.23702368547047062, + "grad_norm": 0.5194464584071266, + "learning_rate": 4.536898209361942e-06, + "loss": 0.2004, + "step": 1052 + }, + { + "epoch": 0.23724899315627904, + "grad_norm": 0.4651761481355681, + "learning_rate": 4.535817536363981e-06, + "loss": 0.1878, + "step": 1053 + }, + { + "epoch": 0.23747430084208748, + "grad_norm": 0.43233745793103384, + "learning_rate": 4.5347357329524254e-06, + "loss": 0.1687, + "step": 1054 + }, + { + "epoch": 0.2376996085278959, + "grad_norm": 0.4640848720869069, + "learning_rate": 4.53365279972796e-06, + "loss": 0.1969, + "step": 1055 + }, + { + "epoch": 0.23792491621370435, + "grad_norm": 0.4899414176964153, + "learning_rate": 4.532568737291898e-06, + "loss": 0.1874, + "step": 1056 + }, + { + "epoch": 0.23815022389951276, + "grad_norm": 0.49741122976240815, + "learning_rate": 4.531483546246177e-06, + "loss": 0.216, + "step": 1057 + }, + { + "epoch": 0.2383755315853212, + "grad_norm": 0.49746366263939634, + "learning_rate": 4.530397227193365e-06, + "loss": 0.1979, + "step": 1058 + }, + { + "epoch": 0.23860083927112963, + "grad_norm": 0.5231454499803017, + "learning_rate": 4.529309780736654e-06, + "loss": 0.2052, + "step": 1059 + }, + { + "epoch": 0.23882614695693807, + "grad_norm": 0.4866844045085217, + "learning_rate": 4.528221207479862e-06, + "loss": 0.2079, + "step": 1060 + }, + { + "epoch": 0.2390514546427465, + "grad_norm": 0.4456526605155982, + "learning_rate": 4.527131508027433e-06, + "loss": 0.1761, + "step": 1061 + }, + { + "epoch": 0.23927676232855494, + "grad_norm": 0.4984776619790202, + "learning_rate": 4.5260406829844364e-06, + "loss": 0.2039, + "step": 1062 + }, + { + "epoch": 0.23950207001436336, + "grad_norm": 0.535437021649417, + "learning_rate": 4.524948732956568e-06, + "loss": 0.2112, + "step": 1063 + }, + { + "epoch": 0.2397273777001718, + "grad_norm": 0.5177815576236905, + "learning_rate": 4.523855658550146e-06, + "loss": 0.2101, + "step": 1064 + }, + { + "epoch": 0.23995268538598022, + "grad_norm": 0.4788368870200376, + "learning_rate": 4.522761460372114e-06, + "loss": 0.1962, + "step": 1065 + }, + { + "epoch": 0.24017799307178866, + "grad_norm": 0.4775366648975166, + "learning_rate": 4.521666139030039e-06, + "loss": 0.192, + "step": 1066 + }, + { + "epoch": 0.24040330075759708, + "grad_norm": 0.441107754155326, + "learning_rate": 4.520569695132113e-06, + "loss": 0.1653, + "step": 1067 + }, + { + "epoch": 0.24062860844340553, + "grad_norm": 0.5254232095442831, + "learning_rate": 4.51947212928715e-06, + "loss": 0.2033, + "step": 1068 + }, + { + "epoch": 0.24085391612921395, + "grad_norm": 0.508997878208784, + "learning_rate": 4.518373442104587e-06, + "loss": 0.2024, + "step": 1069 + }, + { + "epoch": 0.2410792238150224, + "grad_norm": 0.4492236889957584, + "learning_rate": 4.5172736341944845e-06, + "loss": 0.1882, + "step": 1070 + }, + { + "epoch": 0.2413045315008308, + "grad_norm": 0.5066247284668499, + "learning_rate": 4.516172706167525e-06, + "loss": 0.1978, + "step": 1071 + }, + { + "epoch": 0.24152983918663926, + "grad_norm": 0.5126644879628546, + "learning_rate": 4.515070658635013e-06, + "loss": 0.1972, + "step": 1072 + }, + { + "epoch": 0.24175514687244767, + "grad_norm": 0.46159621555751024, + "learning_rate": 4.513967492208874e-06, + "loss": 0.1928, + "step": 1073 + }, + { + "epoch": 0.24198045455825612, + "grad_norm": 0.4784940645175002, + "learning_rate": 4.512863207501654e-06, + "loss": 0.2113, + "step": 1074 + }, + { + "epoch": 0.24220576224406454, + "grad_norm": 0.4453831889175742, + "learning_rate": 4.511757805126523e-06, + "loss": 0.1935, + "step": 1075 + }, + { + "epoch": 0.24243106992987298, + "grad_norm": 0.48517492293516534, + "learning_rate": 4.510651285697269e-06, + "loss": 0.1938, + "step": 1076 + }, + { + "epoch": 0.2426563776156814, + "grad_norm": 0.4481978785346557, + "learning_rate": 4.509543649828302e-06, + "loss": 0.1903, + "step": 1077 + }, + { + "epoch": 0.24288168530148985, + "grad_norm": 0.5027043174343503, + "learning_rate": 4.5084348981346495e-06, + "loss": 0.1933, + "step": 1078 + }, + { + "epoch": 0.2431069929872983, + "grad_norm": 0.47637048506771124, + "learning_rate": 4.507325031231959e-06, + "loss": 0.2053, + "step": 1079 + }, + { + "epoch": 0.2433323006731067, + "grad_norm": 0.4890701425489635, + "learning_rate": 4.506214049736502e-06, + "loss": 0.1969, + "step": 1080 + }, + { + "epoch": 0.24355760835891516, + "grad_norm": 0.506377164401778, + "learning_rate": 4.505101954265161e-06, + "loss": 0.2171, + "step": 1081 + }, + { + "epoch": 0.24378291604472357, + "grad_norm": 0.4627708019093523, + "learning_rate": 4.503988745435443e-06, + "loss": 0.1772, + "step": 1082 + }, + { + "epoch": 0.24400822373053202, + "grad_norm": 0.47649545946469807, + "learning_rate": 4.502874423865473e-06, + "loss": 0.2066, + "step": 1083 + }, + { + "epoch": 0.24423353141634044, + "grad_norm": 0.47958577009161757, + "learning_rate": 4.5017589901739885e-06, + "loss": 0.1888, + "step": 1084 + }, + { + "epoch": 0.24445883910214888, + "grad_norm": 0.500201620097912, + "learning_rate": 4.500642444980352e-06, + "loss": 0.1991, + "step": 1085 + }, + { + "epoch": 0.2446841467879573, + "grad_norm": 0.4495147940914091, + "learning_rate": 4.499524788904537e-06, + "loss": 0.1835, + "step": 1086 + }, + { + "epoch": 0.24490945447376575, + "grad_norm": 0.45253992410680455, + "learning_rate": 4.498406022567137e-06, + "loss": 0.1988, + "step": 1087 + }, + { + "epoch": 0.24513476215957417, + "grad_norm": 0.4760869981017264, + "learning_rate": 4.497286146589361e-06, + "loss": 0.1886, + "step": 1088 + }, + { + "epoch": 0.2453600698453826, + "grad_norm": 0.49963960872230523, + "learning_rate": 4.4961651615930344e-06, + "loss": 0.1931, + "step": 1089 + }, + { + "epoch": 0.24558537753119103, + "grad_norm": 0.4811860204716021, + "learning_rate": 4.4950430682005995e-06, + "loss": 0.1867, + "step": 1090 + }, + { + "epoch": 0.24581068521699947, + "grad_norm": 0.4377848760675584, + "learning_rate": 4.493919867035112e-06, + "loss": 0.1672, + "step": 1091 + }, + { + "epoch": 0.2460359929028079, + "grad_norm": 0.4633349829575837, + "learning_rate": 4.492795558720242e-06, + "loss": 0.1821, + "step": 1092 + }, + { + "epoch": 0.24626130058861634, + "grad_norm": 0.4645800278533146, + "learning_rate": 4.491670143880279e-06, + "loss": 0.1769, + "step": 1093 + }, + { + "epoch": 0.24648660827442476, + "grad_norm": 0.48085428387503587, + "learning_rate": 4.490543623140123e-06, + "loss": 0.2058, + "step": 1094 + }, + { + "epoch": 0.2467119159602332, + "grad_norm": 0.4917919712093067, + "learning_rate": 4.489415997125288e-06, + "loss": 0.2048, + "step": 1095 + }, + { + "epoch": 0.24693722364604162, + "grad_norm": 0.5023157743811051, + "learning_rate": 4.488287266461904e-06, + "loss": 0.2043, + "step": 1096 + }, + { + "epoch": 0.24716253133185007, + "grad_norm": 0.49560323444291804, + "learning_rate": 4.487157431776712e-06, + "loss": 0.2054, + "step": 1097 + }, + { + "epoch": 0.24738783901765848, + "grad_norm": 0.487845417095443, + "learning_rate": 4.486026493697067e-06, + "loss": 0.203, + "step": 1098 + }, + { + "epoch": 0.24761314670346693, + "grad_norm": 0.48502025922129177, + "learning_rate": 4.484894452850937e-06, + "loss": 0.1906, + "step": 1099 + }, + { + "epoch": 0.24783845438927535, + "grad_norm": 0.5088914027395394, + "learning_rate": 4.483761309866902e-06, + "loss": 0.2183, + "step": 1100 + }, + { + "epoch": 0.2480637620750838, + "grad_norm": 0.4868785633843427, + "learning_rate": 4.482627065374155e-06, + "loss": 0.2083, + "step": 1101 + }, + { + "epoch": 0.2482890697608922, + "grad_norm": 0.45743263810214724, + "learning_rate": 4.481491720002499e-06, + "loss": 0.1809, + "step": 1102 + }, + { + "epoch": 0.24851437744670066, + "grad_norm": 0.47238917005822284, + "learning_rate": 4.4803552743823495e-06, + "loss": 0.1945, + "step": 1103 + }, + { + "epoch": 0.24873968513250908, + "grad_norm": 0.5066193956915539, + "learning_rate": 4.479217729144731e-06, + "loss": 0.2077, + "step": 1104 + }, + { + "epoch": 0.24896499281831752, + "grad_norm": 0.4556738911707223, + "learning_rate": 4.478079084921282e-06, + "loss": 0.1743, + "step": 1105 + }, + { + "epoch": 0.24919030050412594, + "grad_norm": 0.4399147974180491, + "learning_rate": 4.476939342344246e-06, + "loss": 0.182, + "step": 1106 + }, + { + "epoch": 0.24941560818993438, + "grad_norm": 0.46527835945459334, + "learning_rate": 4.475798502046484e-06, + "loss": 0.1912, + "step": 1107 + }, + { + "epoch": 0.2496409158757428, + "grad_norm": 0.476108972486012, + "learning_rate": 4.474656564661458e-06, + "loss": 0.1828, + "step": 1108 + }, + { + "epoch": 0.24986622356155125, + "grad_norm": 0.5060864667543569, + "learning_rate": 4.473513530823246e-06, + "loss": 0.2106, + "step": 1109 + }, + { + "epoch": 0.25009153124735967, + "grad_norm": 0.4726786094988315, + "learning_rate": 4.472369401166531e-06, + "loss": 0.193, + "step": 1110 + }, + { + "epoch": 0.2503168389331681, + "grad_norm": 0.4710952325113045, + "learning_rate": 4.471224176326605e-06, + "loss": 0.1915, + "step": 1111 + }, + { + "epoch": 0.25054214661897656, + "grad_norm": 0.4595573681435705, + "learning_rate": 4.47007785693937e-06, + "loss": 0.1975, + "step": 1112 + }, + { + "epoch": 0.250767454304785, + "grad_norm": 0.4996964555198063, + "learning_rate": 4.468930443641333e-06, + "loss": 0.1968, + "step": 1113 + }, + { + "epoch": 0.2509927619905934, + "grad_norm": 0.4823879287675799, + "learning_rate": 4.467781937069611e-06, + "loss": 0.1995, + "step": 1114 + }, + { + "epoch": 0.2512180696764018, + "grad_norm": 0.48581218465693743, + "learning_rate": 4.466632337861926e-06, + "loss": 0.1896, + "step": 1115 + }, + { + "epoch": 0.2514433773622103, + "grad_norm": 0.5350351363814719, + "learning_rate": 4.465481646656608e-06, + "loss": 0.2127, + "step": 1116 + }, + { + "epoch": 0.2516686850480187, + "grad_norm": 0.4558891598101108, + "learning_rate": 4.464329864092593e-06, + "loss": 0.1887, + "step": 1117 + }, + { + "epoch": 0.2518939927338271, + "grad_norm": 0.5395488979865397, + "learning_rate": 4.463176990809423e-06, + "loss": 0.2102, + "step": 1118 + }, + { + "epoch": 0.25211930041963554, + "grad_norm": 0.49956933577227663, + "learning_rate": 4.462023027447246e-06, + "loss": 0.1933, + "step": 1119 + }, + { + "epoch": 0.252344608105444, + "grad_norm": 0.4868986727107991, + "learning_rate": 4.460867974646814e-06, + "loss": 0.1832, + "step": 1120 + }, + { + "epoch": 0.25256991579125243, + "grad_norm": 0.5395872322940283, + "learning_rate": 4.459711833049485e-06, + "loss": 0.226, + "step": 1121 + }, + { + "epoch": 0.25279522347706085, + "grad_norm": 0.49262650223055376, + "learning_rate": 4.45855460329722e-06, + "loss": 0.1963, + "step": 1122 + }, + { + "epoch": 0.25302053116286927, + "grad_norm": 0.5244097601372129, + "learning_rate": 4.457396286032589e-06, + "loss": 0.2041, + "step": 1123 + }, + { + "epoch": 0.25324583884867774, + "grad_norm": 0.4921328598982295, + "learning_rate": 4.45623688189876e-06, + "loss": 0.1929, + "step": 1124 + }, + { + "epoch": 0.25347114653448616, + "grad_norm": 0.47559029560310095, + "learning_rate": 4.455076391539507e-06, + "loss": 0.2044, + "step": 1125 + }, + { + "epoch": 0.2536964542202946, + "grad_norm": 0.5322582503549469, + "learning_rate": 4.453914815599206e-06, + "loss": 0.1885, + "step": 1126 + }, + { + "epoch": 0.25392176190610305, + "grad_norm": 0.5060051761318933, + "learning_rate": 4.45275215472284e-06, + "loss": 0.1996, + "step": 1127 + }, + { + "epoch": 0.25414706959191147, + "grad_norm": 0.47211025118929506, + "learning_rate": 4.451588409555988e-06, + "loss": 0.1873, + "step": 1128 + }, + { + "epoch": 0.2543723772777199, + "grad_norm": 0.4794945802050988, + "learning_rate": 4.450423580744837e-06, + "loss": 0.1852, + "step": 1129 + }, + { + "epoch": 0.2545976849635283, + "grad_norm": 0.4810270163954766, + "learning_rate": 4.4492576689361705e-06, + "loss": 0.1796, + "step": 1130 + }, + { + "epoch": 0.2548229926493368, + "grad_norm": 0.523962565518052, + "learning_rate": 4.448090674777377e-06, + "loss": 0.2048, + "step": 1131 + }, + { + "epoch": 0.2550483003351452, + "grad_norm": 0.503163952245819, + "learning_rate": 4.446922598916445e-06, + "loss": 0.1903, + "step": 1132 + }, + { + "epoch": 0.2552736080209536, + "grad_norm": 0.4596563056401201, + "learning_rate": 4.4457534420019644e-06, + "loss": 0.1826, + "step": 1133 + }, + { + "epoch": 0.25549891570676203, + "grad_norm": 0.5077969690788608, + "learning_rate": 4.444583204683123e-06, + "loss": 0.2079, + "step": 1134 + }, + { + "epoch": 0.2557242233925705, + "grad_norm": 0.5311620122502937, + "learning_rate": 4.44341188760971e-06, + "loss": 0.1816, + "step": 1135 + }, + { + "epoch": 0.2559495310783789, + "grad_norm": 0.4951419590088636, + "learning_rate": 4.4422394914321145e-06, + "loss": 0.1991, + "step": 1136 + }, + { + "epoch": 0.25617483876418734, + "grad_norm": 0.48504167969942574, + "learning_rate": 4.4410660168013255e-06, + "loss": 0.1954, + "step": 1137 + }, + { + "epoch": 0.25640014644999576, + "grad_norm": 0.4411764665447903, + "learning_rate": 4.439891464368927e-06, + "loss": 0.1836, + "step": 1138 + }, + { + "epoch": 0.25662545413580423, + "grad_norm": 0.45808743075133523, + "learning_rate": 4.438715834787107e-06, + "loss": 0.1898, + "step": 1139 + }, + { + "epoch": 0.25685076182161265, + "grad_norm": 0.4927034933649536, + "learning_rate": 4.437539128708647e-06, + "loss": 0.2005, + "step": 1140 + }, + { + "epoch": 0.25707606950742107, + "grad_norm": 0.4537693359810231, + "learning_rate": 4.436361346786929e-06, + "loss": 0.1838, + "step": 1141 + }, + { + "epoch": 0.2573013771932295, + "grad_norm": 0.4448000478086488, + "learning_rate": 4.435182489675931e-06, + "loss": 0.1755, + "step": 1142 + }, + { + "epoch": 0.25752668487903796, + "grad_norm": 0.45660985235301516, + "learning_rate": 4.4340025580302285e-06, + "loss": 0.1979, + "step": 1143 + }, + { + "epoch": 0.2577519925648464, + "grad_norm": 0.44463615357841385, + "learning_rate": 4.432821552504994e-06, + "loss": 0.1804, + "step": 1144 + }, + { + "epoch": 0.2579773002506548, + "grad_norm": 0.4634710398637101, + "learning_rate": 4.431639473755994e-06, + "loss": 0.1921, + "step": 1145 + }, + { + "epoch": 0.2582026079364632, + "grad_norm": 0.4651826299365729, + "learning_rate": 4.430456322439596e-06, + "loss": 0.1915, + "step": 1146 + }, + { + "epoch": 0.2584279156222717, + "grad_norm": 0.4803236391024267, + "learning_rate": 4.429272099212757e-06, + "loss": 0.1995, + "step": 1147 + }, + { + "epoch": 0.2586532233080801, + "grad_norm": 0.4688113074179525, + "learning_rate": 4.4280868047330325e-06, + "loss": 0.1831, + "step": 1148 + }, + { + "epoch": 0.2588785309938885, + "grad_norm": 0.4829604494340961, + "learning_rate": 4.4269004396585735e-06, + "loss": 0.2057, + "step": 1149 + }, + { + "epoch": 0.25910383867969694, + "grad_norm": 0.47586285548000795, + "learning_rate": 4.425713004648123e-06, + "loss": 0.1811, + "step": 1150 + }, + { + "epoch": 0.2593291463655054, + "grad_norm": 0.45575298673644216, + "learning_rate": 4.424524500361021e-06, + "loss": 0.1725, + "step": 1151 + }, + { + "epoch": 0.25955445405131383, + "grad_norm": 0.45776941360214873, + "learning_rate": 4.423334927457198e-06, + "loss": 0.18, + "step": 1152 + }, + { + "epoch": 0.25977976173712225, + "grad_norm": 0.484449227504515, + "learning_rate": 4.42214428659718e-06, + "loss": 0.1984, + "step": 1153 + }, + { + "epoch": 0.26000506942293067, + "grad_norm": 0.46206370597857715, + "learning_rate": 4.420952578442086e-06, + "loss": 0.1869, + "step": 1154 + }, + { + "epoch": 0.26023037710873914, + "grad_norm": 0.5029292548579077, + "learning_rate": 4.419759803653627e-06, + "loss": 0.1995, + "step": 1155 + }, + { + "epoch": 0.26045568479454756, + "grad_norm": 0.474723966790589, + "learning_rate": 4.4185659628941054e-06, + "loss": 0.1918, + "step": 1156 + }, + { + "epoch": 0.260680992480356, + "grad_norm": 0.4272695879524812, + "learning_rate": 4.417371056826417e-06, + "loss": 0.1668, + "step": 1157 + }, + { + "epoch": 0.2609063001661644, + "grad_norm": 0.5108015167387012, + "learning_rate": 4.416175086114049e-06, + "loss": 0.2149, + "step": 1158 + }, + { + "epoch": 0.26113160785197287, + "grad_norm": 0.46402701501840427, + "learning_rate": 4.414978051421081e-06, + "loss": 0.1936, + "step": 1159 + }, + { + "epoch": 0.2613569155377813, + "grad_norm": 0.5072286102154426, + "learning_rate": 4.4137799534121785e-06, + "loss": 0.2046, + "step": 1160 + }, + { + "epoch": 0.2615822232235897, + "grad_norm": 0.4644019340199612, + "learning_rate": 4.412580792752601e-06, + "loss": 0.1869, + "step": 1161 + }, + { + "epoch": 0.2618075309093981, + "grad_norm": 0.4638494998655463, + "learning_rate": 4.4113805701082e-06, + "loss": 0.1902, + "step": 1162 + }, + { + "epoch": 0.2620328385952066, + "grad_norm": 0.4758488654332712, + "learning_rate": 4.410179286145414e-06, + "loss": 0.1838, + "step": 1163 + }, + { + "epoch": 0.262258146281015, + "grad_norm": 0.4624328526889039, + "learning_rate": 4.408976941531269e-06, + "loss": 0.1973, + "step": 1164 + }, + { + "epoch": 0.26248345396682343, + "grad_norm": 0.5150028126100794, + "learning_rate": 4.407773536933384e-06, + "loss": 0.2123, + "step": 1165 + }, + { + "epoch": 0.26270876165263185, + "grad_norm": 0.4728867742679087, + "learning_rate": 4.406569073019965e-06, + "loss": 0.2021, + "step": 1166 + }, + { + "epoch": 0.2629340693384403, + "grad_norm": 0.4669527334919927, + "learning_rate": 4.4053635504598045e-06, + "loss": 0.1938, + "step": 1167 + }, + { + "epoch": 0.26315937702424874, + "grad_norm": 0.5066256560431938, + "learning_rate": 4.404156969922284e-06, + "loss": 0.1922, + "step": 1168 + }, + { + "epoch": 0.26338468471005716, + "grad_norm": 0.490829068074961, + "learning_rate": 4.402949332077375e-06, + "loss": 0.1927, + "step": 1169 + }, + { + "epoch": 0.2636099923958656, + "grad_norm": 0.5329406431047818, + "learning_rate": 4.401740637595633e-06, + "loss": 0.1859, + "step": 1170 + }, + { + "epoch": 0.26383530008167405, + "grad_norm": 0.47704312619213884, + "learning_rate": 4.400530887148199e-06, + "loss": 0.1863, + "step": 1171 + }, + { + "epoch": 0.26406060776748247, + "grad_norm": 0.4881059649121643, + "learning_rate": 4.3993200814068035e-06, + "loss": 0.1827, + "step": 1172 + }, + { + "epoch": 0.2642859154532909, + "grad_norm": 0.4757858587936569, + "learning_rate": 4.398108221043764e-06, + "loss": 0.2013, + "step": 1173 + }, + { + "epoch": 0.2645112231390993, + "grad_norm": 0.4455261953316949, + "learning_rate": 4.396895306731978e-06, + "loss": 0.1949, + "step": 1174 + }, + { + "epoch": 0.2647365308249078, + "grad_norm": 0.4986842575553214, + "learning_rate": 4.395681339144933e-06, + "loss": 0.2142, + "step": 1175 + }, + { + "epoch": 0.2649618385107162, + "grad_norm": 0.4293456932737468, + "learning_rate": 4.394466318956701e-06, + "loss": 0.1807, + "step": 1176 + }, + { + "epoch": 0.2651871461965246, + "grad_norm": 0.5156518217107511, + "learning_rate": 4.393250246841935e-06, + "loss": 0.1922, + "step": 1177 + }, + { + "epoch": 0.2654124538823331, + "grad_norm": 0.4548917169654763, + "learning_rate": 4.392033123475876e-06, + "loss": 0.1851, + "step": 1178 + }, + { + "epoch": 0.2656377615681415, + "grad_norm": 0.4652108020207147, + "learning_rate": 4.390814949534348e-06, + "loss": 0.1877, + "step": 1179 + }, + { + "epoch": 0.2658630692539499, + "grad_norm": 0.4684923553863551, + "learning_rate": 4.389595725693756e-06, + "loss": 0.1913, + "step": 1180 + }, + { + "epoch": 0.26608837693975834, + "grad_norm": 0.43319165273435173, + "learning_rate": 4.388375452631091e-06, + "loss": 0.1781, + "step": 1181 + }, + { + "epoch": 0.2663136846255668, + "grad_norm": 0.42899299541921065, + "learning_rate": 4.387154131023924e-06, + "loss": 0.1709, + "step": 1182 + }, + { + "epoch": 0.26653899231137523, + "grad_norm": 0.511450414544226, + "learning_rate": 4.385931761550411e-06, + "loss": 0.1972, + "step": 1183 + }, + { + "epoch": 0.26676429999718365, + "grad_norm": 0.4927985067686955, + "learning_rate": 4.384708344889285e-06, + "loss": 0.1914, + "step": 1184 + }, + { + "epoch": 0.26698960768299207, + "grad_norm": 0.5011738435420665, + "learning_rate": 4.383483881719867e-06, + "loss": 0.2064, + "step": 1185 + }, + { + "epoch": 0.26721491536880054, + "grad_norm": 0.47735227377270883, + "learning_rate": 4.382258372722054e-06, + "loss": 0.1948, + "step": 1186 + }, + { + "epoch": 0.26744022305460896, + "grad_norm": 0.4534789718841763, + "learning_rate": 4.381031818576326e-06, + "loss": 0.1838, + "step": 1187 + }, + { + "epoch": 0.2676655307404174, + "grad_norm": 0.4605506619916513, + "learning_rate": 4.379804219963742e-06, + "loss": 0.1918, + "step": 1188 + }, + { + "epoch": 0.2678908384262258, + "grad_norm": 0.49182598420277085, + "learning_rate": 4.378575577565945e-06, + "loss": 0.1943, + "step": 1189 + }, + { + "epoch": 0.26811614611203427, + "grad_norm": 0.4512665308420394, + "learning_rate": 4.377345892065149e-06, + "loss": 0.1763, + "step": 1190 + }, + { + "epoch": 0.2683414537978427, + "grad_norm": 0.4746898402983907, + "learning_rate": 4.376115164144157e-06, + "loss": 0.1967, + "step": 1191 + }, + { + "epoch": 0.2685667614836511, + "grad_norm": 0.4715996896397217, + "learning_rate": 4.374883394486343e-06, + "loss": 0.1937, + "step": 1192 + }, + { + "epoch": 0.2687920691694595, + "grad_norm": 0.501938384629581, + "learning_rate": 4.373650583775666e-06, + "loss": 0.1832, + "step": 1193 + }, + { + "epoch": 0.269017376855268, + "grad_norm": 0.5391709457959741, + "learning_rate": 4.3724167326966575e-06, + "loss": 0.2139, + "step": 1194 + }, + { + "epoch": 0.2692426845410764, + "grad_norm": 0.43957827447224707, + "learning_rate": 4.37118184193443e-06, + "loss": 0.1768, + "step": 1195 + }, + { + "epoch": 0.26946799222688483, + "grad_norm": 0.5066080802727835, + "learning_rate": 4.3699459121746726e-06, + "loss": 0.2075, + "step": 1196 + }, + { + "epoch": 0.26969329991269325, + "grad_norm": 0.46289907699342325, + "learning_rate": 4.368708944103649e-06, + "loss": 0.1773, + "step": 1197 + }, + { + "epoch": 0.2699186075985017, + "grad_norm": 0.45430542062991053, + "learning_rate": 4.367470938408204e-06, + "loss": 0.1833, + "step": 1198 + }, + { + "epoch": 0.27014391528431014, + "grad_norm": 0.4908638946049585, + "learning_rate": 4.366231895775755e-06, + "loss": 0.1925, + "step": 1199 + }, + { + "epoch": 0.27036922297011856, + "grad_norm": 0.4181234102832068, + "learning_rate": 4.364991816894296e-06, + "loss": 0.1659, + "step": 1200 + }, + { + "epoch": 0.270594530655927, + "grad_norm": 0.5116900793733266, + "learning_rate": 4.3637507024523975e-06, + "loss": 0.2235, + "step": 1201 + }, + { + "epoch": 0.27081983834173545, + "grad_norm": 0.49708577691712097, + "learning_rate": 4.362508553139203e-06, + "loss": 0.2061, + "step": 1202 + }, + { + "epoch": 0.27104514602754387, + "grad_norm": 0.449569545276644, + "learning_rate": 4.361265369644432e-06, + "loss": 0.1875, + "step": 1203 + }, + { + "epoch": 0.2712704537133523, + "grad_norm": 0.48359949812906805, + "learning_rate": 4.360021152658378e-06, + "loss": 0.198, + "step": 1204 + }, + { + "epoch": 0.2714957613991607, + "grad_norm": 0.4337351982623813, + "learning_rate": 4.3587759028719075e-06, + "loss": 0.178, + "step": 1205 + }, + { + "epoch": 0.2717210690849692, + "grad_norm": 0.46284021114582424, + "learning_rate": 4.357529620976463e-06, + "loss": 0.1875, + "step": 1206 + }, + { + "epoch": 0.2719463767707776, + "grad_norm": 0.46958422811400885, + "learning_rate": 4.356282307664057e-06, + "loss": 0.1952, + "step": 1207 + }, + { + "epoch": 0.272171684456586, + "grad_norm": 0.4748937607071291, + "learning_rate": 4.355033963627277e-06, + "loss": 0.1857, + "step": 1208 + }, + { + "epoch": 0.27239699214239443, + "grad_norm": 0.41660420959548705, + "learning_rate": 4.353784589559282e-06, + "loss": 0.171, + "step": 1209 + }, + { + "epoch": 0.2726222998282029, + "grad_norm": 0.4663453624745409, + "learning_rate": 4.352534186153802e-06, + "loss": 0.1924, + "step": 1210 + }, + { + "epoch": 0.2728476075140113, + "grad_norm": 0.480322758889155, + "learning_rate": 4.35128275410514e-06, + "loss": 0.1828, + "step": 1211 + }, + { + "epoch": 0.27307291519981974, + "grad_norm": 0.54294070081179, + "learning_rate": 4.3500302941081685e-06, + "loss": 0.2067, + "step": 1212 + }, + { + "epoch": 0.27329822288562816, + "grad_norm": 0.48261246040587036, + "learning_rate": 4.348776806858334e-06, + "loss": 0.1935, + "step": 1213 + }, + { + "epoch": 0.27352353057143663, + "grad_norm": 0.46417758435881934, + "learning_rate": 4.3475222930516484e-06, + "loss": 0.1949, + "step": 1214 + }, + { + "epoch": 0.27374883825724505, + "grad_norm": 0.46880074255715254, + "learning_rate": 4.346266753384699e-06, + "loss": 0.1869, + "step": 1215 + }, + { + "epoch": 0.27397414594305347, + "grad_norm": 0.4820661862466546, + "learning_rate": 4.345010188554638e-06, + "loss": 0.1933, + "step": 1216 + }, + { + "epoch": 0.2741994536288619, + "grad_norm": 0.4800481857053596, + "learning_rate": 4.343752599259192e-06, + "loss": 0.2007, + "step": 1217 + }, + { + "epoch": 0.27442476131467036, + "grad_norm": 0.4645150567042009, + "learning_rate": 4.34249398619665e-06, + "loss": 0.1963, + "step": 1218 + }, + { + "epoch": 0.2746500690004788, + "grad_norm": 0.4781186911560839, + "learning_rate": 4.341234350065876e-06, + "loss": 0.2049, + "step": 1219 + }, + { + "epoch": 0.2748753766862872, + "grad_norm": 0.45394797526114344, + "learning_rate": 4.339973691566297e-06, + "loss": 0.1811, + "step": 1220 + }, + { + "epoch": 0.2751006843720956, + "grad_norm": 0.47370498380290443, + "learning_rate": 4.33871201139791e-06, + "loss": 0.1819, + "step": 1221 + }, + { + "epoch": 0.2753259920579041, + "grad_norm": 0.46358345279091534, + "learning_rate": 4.337449310261279e-06, + "loss": 0.1864, + "step": 1222 + }, + { + "epoch": 0.2755512997437125, + "grad_norm": 0.5011472595239045, + "learning_rate": 4.336185588857535e-06, + "loss": 0.1966, + "step": 1223 + }, + { + "epoch": 0.2757766074295209, + "grad_norm": 0.5108577320955907, + "learning_rate": 4.334920847888376e-06, + "loss": 0.1962, + "step": 1224 + }, + { + "epoch": 0.2760019151153294, + "grad_norm": 0.4808873958889132, + "learning_rate": 4.333655088056065e-06, + "loss": 0.2042, + "step": 1225 + }, + { + "epoch": 0.2762272228011378, + "grad_norm": 0.6330213125461257, + "learning_rate": 4.332388310063431e-06, + "loss": 0.209, + "step": 1226 + }, + { + "epoch": 0.27645253048694624, + "grad_norm": 0.4961328421118048, + "learning_rate": 4.331120514613869e-06, + "loss": 0.1942, + "step": 1227 + }, + { + "epoch": 0.27667783817275465, + "grad_norm": 0.5102680428992475, + "learning_rate": 4.329851702411339e-06, + "loss": 0.1965, + "step": 1228 + }, + { + "epoch": 0.2769031458585631, + "grad_norm": 0.4584692588311406, + "learning_rate": 4.328581874160363e-06, + "loss": 0.1767, + "step": 1229 + }, + { + "epoch": 0.27712845354437154, + "grad_norm": 0.5071807827391811, + "learning_rate": 4.327311030566033e-06, + "loss": 0.1756, + "step": 1230 + }, + { + "epoch": 0.27735376123017996, + "grad_norm": 0.49994699305632073, + "learning_rate": 4.326039172333997e-06, + "loss": 0.1994, + "step": 1231 + }, + { + "epoch": 0.2775790689159884, + "grad_norm": 0.4888174690134026, + "learning_rate": 4.324766300170473e-06, + "loss": 0.1964, + "step": 1232 + }, + { + "epoch": 0.27780437660179685, + "grad_norm": 0.5098262364640908, + "learning_rate": 4.323492414782239e-06, + "loss": 0.195, + "step": 1233 + }, + { + "epoch": 0.27802968428760527, + "grad_norm": 0.46800522247221565, + "learning_rate": 4.322217516876635e-06, + "loss": 0.174, + "step": 1234 + }, + { + "epoch": 0.2782549919734137, + "grad_norm": 0.5166728824188981, + "learning_rate": 4.320941607161567e-06, + "loss": 0.2076, + "step": 1235 + }, + { + "epoch": 0.2784802996592221, + "grad_norm": 0.4789548166071354, + "learning_rate": 4.3196646863454975e-06, + "loss": 0.1874, + "step": 1236 + }, + { + "epoch": 0.2787056073450306, + "grad_norm": 0.4784522504460812, + "learning_rate": 4.3183867551374535e-06, + "loss": 0.1907, + "step": 1237 + }, + { + "epoch": 0.278930915030839, + "grad_norm": 0.49825493477824906, + "learning_rate": 4.317107814247022e-06, + "loss": 0.2104, + "step": 1238 + }, + { + "epoch": 0.2791562227166474, + "grad_norm": 0.48871961748808174, + "learning_rate": 4.3158278643843544e-06, + "loss": 0.2001, + "step": 1239 + }, + { + "epoch": 0.27938153040245584, + "grad_norm": 0.4923653447956059, + "learning_rate": 4.314546906260156e-06, + "loss": 0.1983, + "step": 1240 + }, + { + "epoch": 0.2796068380882643, + "grad_norm": 0.5060233189230049, + "learning_rate": 4.313264940585695e-06, + "loss": 0.1869, + "step": 1241 + }, + { + "epoch": 0.2798321457740727, + "grad_norm": 0.4588082075320794, + "learning_rate": 4.3119819680728e-06, + "loss": 0.1763, + "step": 1242 + }, + { + "epoch": 0.28005745345988114, + "grad_norm": 0.4953206561980601, + "learning_rate": 4.310697989433858e-06, + "loss": 0.2012, + "step": 1243 + }, + { + "epoch": 0.28028276114568956, + "grad_norm": 0.47029074557556283, + "learning_rate": 4.3094130053818164e-06, + "loss": 0.1815, + "step": 1244 + }, + { + "epoch": 0.28050806883149804, + "grad_norm": 0.49335061533474056, + "learning_rate": 4.308127016630176e-06, + "loss": 0.1967, + "step": 1245 + }, + { + "epoch": 0.28073337651730645, + "grad_norm": 0.467417303117729, + "learning_rate": 4.306840023892998e-06, + "loss": 0.1907, + "step": 1246 + }, + { + "epoch": 0.2809586842031149, + "grad_norm": 0.47414334568475025, + "learning_rate": 4.305552027884904e-06, + "loss": 0.1844, + "step": 1247 + }, + { + "epoch": 0.2811839918889233, + "grad_norm": 0.48011230961511236, + "learning_rate": 4.304263029321069e-06, + "loss": 0.1987, + "step": 1248 + }, + { + "epoch": 0.28140929957473176, + "grad_norm": 0.4679786377923089, + "learning_rate": 4.302973028917226e-06, + "loss": 0.1987, + "step": 1249 + }, + { + "epoch": 0.2816346072605402, + "grad_norm": 0.49679492887099075, + "learning_rate": 4.301682027389663e-06, + "loss": 0.198, + "step": 1250 + }, + { + "epoch": 0.2818599149463486, + "grad_norm": 0.45308531412013714, + "learning_rate": 4.300390025455227e-06, + "loss": 0.1876, + "step": 1251 + }, + { + "epoch": 0.282085222632157, + "grad_norm": 0.4760936851495241, + "learning_rate": 4.299097023831318e-06, + "loss": 0.1892, + "step": 1252 + }, + { + "epoch": 0.2823105303179655, + "grad_norm": 0.4494748053423337, + "learning_rate": 4.2978030232358904e-06, + "loss": 0.1901, + "step": 1253 + }, + { + "epoch": 0.2825358380037739, + "grad_norm": 0.44339068737217213, + "learning_rate": 4.2965080243874555e-06, + "loss": 0.1785, + "step": 1254 + }, + { + "epoch": 0.2827611456895823, + "grad_norm": 0.47922957221615897, + "learning_rate": 4.295212028005078e-06, + "loss": 0.1944, + "step": 1255 + }, + { + "epoch": 0.28298645337539075, + "grad_norm": 0.4661078016438798, + "learning_rate": 4.293915034808376e-06, + "loss": 0.1759, + "step": 1256 + }, + { + "epoch": 0.2832117610611992, + "grad_norm": 0.45842940270946797, + "learning_rate": 4.292617045517521e-06, + "loss": 0.1815, + "step": 1257 + }, + { + "epoch": 0.28343706874700764, + "grad_norm": 0.4415150438292391, + "learning_rate": 4.29131806085324e-06, + "loss": 0.1664, + "step": 1258 + }, + { + "epoch": 0.28366237643281605, + "grad_norm": 0.4550722532627682, + "learning_rate": 4.290018081536807e-06, + "loss": 0.1818, + "step": 1259 + }, + { + "epoch": 0.2838876841186245, + "grad_norm": 0.49330724161672795, + "learning_rate": 4.288717108290056e-06, + "loss": 0.1887, + "step": 1260 + }, + { + "epoch": 0.28411299180443295, + "grad_norm": 0.4766103515247083, + "learning_rate": 4.287415141835368e-06, + "loss": 0.2063, + "step": 1261 + }, + { + "epoch": 0.28433829949024136, + "grad_norm": 0.47599156194780146, + "learning_rate": 4.2861121828956745e-06, + "loss": 0.1897, + "step": 1262 + }, + { + "epoch": 0.2845636071760498, + "grad_norm": 0.485960224556403, + "learning_rate": 4.284808232194462e-06, + "loss": 0.1972, + "step": 1263 + }, + { + "epoch": 0.2847889148618582, + "grad_norm": 0.48326766053122255, + "learning_rate": 4.283503290455765e-06, + "loss": 0.1923, + "step": 1264 + }, + { + "epoch": 0.2850142225476667, + "grad_norm": 0.4857522447813632, + "learning_rate": 4.28219735840417e-06, + "loss": 0.1951, + "step": 1265 + }, + { + "epoch": 0.2852395302334751, + "grad_norm": 0.49448566407970473, + "learning_rate": 4.28089043676481e-06, + "loss": 0.1945, + "step": 1266 + }, + { + "epoch": 0.2854648379192835, + "grad_norm": 0.514781069172799, + "learning_rate": 4.279582526263371e-06, + "loss": 0.205, + "step": 1267 + }, + { + "epoch": 0.2856901456050919, + "grad_norm": 0.5215710152327001, + "learning_rate": 4.27827362762609e-06, + "loss": 0.2112, + "step": 1268 + }, + { + "epoch": 0.2859154532909004, + "grad_norm": 0.5092596752766128, + "learning_rate": 4.276963741579745e-06, + "loss": 0.1996, + "step": 1269 + }, + { + "epoch": 0.2861407609767088, + "grad_norm": 0.45023899303457543, + "learning_rate": 4.275652868851669e-06, + "loss": 0.1785, + "step": 1270 + }, + { + "epoch": 0.28636606866251724, + "grad_norm": 0.5072980531060344, + "learning_rate": 4.2743410101697405e-06, + "loss": 0.2017, + "step": 1271 + }, + { + "epoch": 0.28659137634832565, + "grad_norm": 0.4669719309794723, + "learning_rate": 4.2730281662623866e-06, + "loss": 0.1868, + "step": 1272 + }, + { + "epoch": 0.28681668403413413, + "grad_norm": 0.47185384450230006, + "learning_rate": 4.271714337858579e-06, + "loss": 0.1816, + "step": 1273 + }, + { + "epoch": 0.28704199171994255, + "grad_norm": 0.45011698453984966, + "learning_rate": 4.270399525687839e-06, + "loss": 0.1839, + "step": 1274 + }, + { + "epoch": 0.28726729940575096, + "grad_norm": 0.49680547729144037, + "learning_rate": 4.269083730480232e-06, + "loss": 0.2075, + "step": 1275 + }, + { + "epoch": 0.28749260709155944, + "grad_norm": 0.4692881143781052, + "learning_rate": 4.267766952966369e-06, + "loss": 0.1891, + "step": 1276 + }, + { + "epoch": 0.28771791477736786, + "grad_norm": 0.4686155233437825, + "learning_rate": 4.26644919387741e-06, + "loss": 0.1871, + "step": 1277 + }, + { + "epoch": 0.2879432224631763, + "grad_norm": 0.44109069070554513, + "learning_rate": 4.265130453945056e-06, + "loss": 0.1751, + "step": 1278 + }, + { + "epoch": 0.2881685301489847, + "grad_norm": 0.5014677411231687, + "learning_rate": 4.263810733901554e-06, + "loss": 0.2088, + "step": 1279 + }, + { + "epoch": 0.28839383783479317, + "grad_norm": 0.4775469054372758, + "learning_rate": 4.262490034479697e-06, + "loss": 0.1854, + "step": 1280 + }, + { + "epoch": 0.2886191455206016, + "grad_norm": 0.46027755373514934, + "learning_rate": 4.261168356412818e-06, + "loss": 0.1837, + "step": 1281 + }, + { + "epoch": 0.28884445320641, + "grad_norm": 0.4357746488120603, + "learning_rate": 4.259845700434797e-06, + "loss": 0.169, + "step": 1282 + }, + { + "epoch": 0.2890697608922184, + "grad_norm": 0.47508058583917057, + "learning_rate": 4.258522067280055e-06, + "loss": 0.1891, + "step": 1283 + }, + { + "epoch": 0.2892950685780269, + "grad_norm": 0.4910892208629531, + "learning_rate": 4.257197457683556e-06, + "loss": 0.1827, + "step": 1284 + }, + { + "epoch": 0.2895203762638353, + "grad_norm": 0.4722022469529989, + "learning_rate": 4.2558718723808055e-06, + "loss": 0.1979, + "step": 1285 + }, + { + "epoch": 0.28974568394964373, + "grad_norm": 0.48147289570558727, + "learning_rate": 4.254545312107854e-06, + "loss": 0.1979, + "step": 1286 + }, + { + "epoch": 0.28997099163545215, + "grad_norm": 0.5599010516181736, + "learning_rate": 4.253217777601289e-06, + "loss": 0.2042, + "step": 1287 + }, + { + "epoch": 0.2901962993212606, + "grad_norm": 0.5114733154047054, + "learning_rate": 4.251889269598241e-06, + "loss": 0.2098, + "step": 1288 + }, + { + "epoch": 0.29042160700706904, + "grad_norm": 0.4603029533334363, + "learning_rate": 4.250559788836382e-06, + "loss": 0.1804, + "step": 1289 + }, + { + "epoch": 0.29064691469287746, + "grad_norm": 0.5343576486121003, + "learning_rate": 4.249229336053924e-06, + "loss": 0.1952, + "step": 1290 + }, + { + "epoch": 0.2908722223786859, + "grad_norm": 0.470124867284512, + "learning_rate": 4.247897911989615e-06, + "loss": 0.1825, + "step": 1291 + }, + { + "epoch": 0.29109753006449435, + "grad_norm": 0.49441047089211804, + "learning_rate": 4.2465655173827465e-06, + "loss": 0.188, + "step": 1292 + }, + { + "epoch": 0.29132283775030277, + "grad_norm": 0.4444504905889329, + "learning_rate": 4.245232152973148e-06, + "loss": 0.1739, + "step": 1293 + }, + { + "epoch": 0.2915481454361112, + "grad_norm": 0.453334392499537, + "learning_rate": 4.243897819501187e-06, + "loss": 0.1725, + "step": 1294 + }, + { + "epoch": 0.2917734531219196, + "grad_norm": 0.47636947270017965, + "learning_rate": 4.242562517707768e-06, + "loss": 0.1768, + "step": 1295 + }, + { + "epoch": 0.2919987608077281, + "grad_norm": 0.4830079375939094, + "learning_rate": 4.241226248334335e-06, + "loss": 0.203, + "step": 1296 + }, + { + "epoch": 0.2922240684935365, + "grad_norm": 0.42400466785383345, + "learning_rate": 4.23988901212287e-06, + "loss": 0.1647, + "step": 1297 + }, + { + "epoch": 0.2924493761793449, + "grad_norm": 0.47069916445380955, + "learning_rate": 4.238550809815889e-06, + "loss": 0.1781, + "step": 1298 + }, + { + "epoch": 0.29267468386515333, + "grad_norm": 0.4303088797245281, + "learning_rate": 4.237211642156446e-06, + "loss": 0.1705, + "step": 1299 + }, + { + "epoch": 0.2928999915509618, + "grad_norm": 0.4365133831551404, + "learning_rate": 4.23587150988813e-06, + "loss": 0.1793, + "step": 1300 + }, + { + "epoch": 0.2931252992367702, + "grad_norm": 0.5125537327563031, + "learning_rate": 4.234530413755069e-06, + "loss": 0.2076, + "step": 1301 + }, + { + "epoch": 0.29335060692257864, + "grad_norm": 0.46235555246286686, + "learning_rate": 4.233188354501921e-06, + "loss": 0.1888, + "step": 1302 + }, + { + "epoch": 0.29357591460838706, + "grad_norm": 0.49195600825087277, + "learning_rate": 4.231845332873883e-06, + "loss": 0.191, + "step": 1303 + }, + { + "epoch": 0.29380122229419553, + "grad_norm": 0.48151491736608987, + "learning_rate": 4.230501349616683e-06, + "loss": 0.1859, + "step": 1304 + }, + { + "epoch": 0.29402652998000395, + "grad_norm": 0.5036204987582802, + "learning_rate": 4.2291564054765876e-06, + "loss": 0.1878, + "step": 1305 + }, + { + "epoch": 0.29425183766581237, + "grad_norm": 0.4692967043154867, + "learning_rate": 4.227810501200393e-06, + "loss": 0.1927, + "step": 1306 + }, + { + "epoch": 0.2944771453516208, + "grad_norm": 0.5185658295986955, + "learning_rate": 4.226463637535429e-06, + "loss": 0.2144, + "step": 1307 + }, + { + "epoch": 0.29470245303742926, + "grad_norm": 0.5045251300691717, + "learning_rate": 4.225115815229559e-06, + "loss": 0.1978, + "step": 1308 + }, + { + "epoch": 0.2949277607232377, + "grad_norm": 0.5383017651663566, + "learning_rate": 4.22376703503118e-06, + "loss": 0.1862, + "step": 1309 + }, + { + "epoch": 0.2951530684090461, + "grad_norm": 0.4310543486540994, + "learning_rate": 4.222417297689217e-06, + "loss": 0.1717, + "step": 1310 + }, + { + "epoch": 0.2953783760948545, + "grad_norm": 0.49711024740087667, + "learning_rate": 4.22106660395313e-06, + "loss": 0.2048, + "step": 1311 + }, + { + "epoch": 0.295603683780663, + "grad_norm": 0.4878815830750108, + "learning_rate": 4.219714954572909e-06, + "loss": 0.1853, + "step": 1312 + }, + { + "epoch": 0.2958289914664714, + "grad_norm": 0.4879553541558466, + "learning_rate": 4.218362350299075e-06, + "loss": 0.1955, + "step": 1313 + }, + { + "epoch": 0.2960542991522798, + "grad_norm": 0.5154078926644345, + "learning_rate": 4.217008791882678e-06, + "loss": 0.2053, + "step": 1314 + }, + { + "epoch": 0.29627960683808824, + "grad_norm": 0.49111916225881597, + "learning_rate": 4.215654280075297e-06, + "loss": 0.1943, + "step": 1315 + }, + { + "epoch": 0.2965049145238967, + "grad_norm": 0.4673462345883577, + "learning_rate": 4.214298815629046e-06, + "loss": 0.1951, + "step": 1316 + }, + { + "epoch": 0.29673022220970513, + "grad_norm": 0.4493073559361342, + "learning_rate": 4.212942399296559e-06, + "loss": 0.1718, + "step": 1317 + }, + { + "epoch": 0.29695552989551355, + "grad_norm": 0.5053730562712272, + "learning_rate": 4.211585031831007e-06, + "loss": 0.2081, + "step": 1318 + }, + { + "epoch": 0.29718083758132197, + "grad_norm": 0.4839028554404252, + "learning_rate": 4.210226713986085e-06, + "loss": 0.1847, + "step": 1319 + }, + { + "epoch": 0.29740614526713044, + "grad_norm": 0.4578952563083796, + "learning_rate": 4.208867446516015e-06, + "loss": 0.1866, + "step": 1320 + }, + { + "epoch": 0.29763145295293886, + "grad_norm": 0.43660857707331907, + "learning_rate": 4.2075072301755486e-06, + "loss": 0.1703, + "step": 1321 + }, + { + "epoch": 0.2978567606387473, + "grad_norm": 0.45922320983926745, + "learning_rate": 4.206146065719963e-06, + "loss": 0.1816, + "step": 1322 + }, + { + "epoch": 0.29808206832455575, + "grad_norm": 0.47436748632930276, + "learning_rate": 4.204783953905062e-06, + "loss": 0.203, + "step": 1323 + }, + { + "epoch": 0.29830737601036417, + "grad_norm": 0.4641836493556383, + "learning_rate": 4.203420895487175e-06, + "loss": 0.1877, + "step": 1324 + }, + { + "epoch": 0.2985326836961726, + "grad_norm": 0.4456227661319575, + "learning_rate": 4.202056891223159e-06, + "loss": 0.1841, + "step": 1325 + }, + { + "epoch": 0.298757991381981, + "grad_norm": 0.45769778038465464, + "learning_rate": 4.200691941870392e-06, + "loss": 0.1828, + "step": 1326 + }, + { + "epoch": 0.2989832990677895, + "grad_norm": 0.4435650638003759, + "learning_rate": 4.199326048186783e-06, + "loss": 0.1736, + "step": 1327 + }, + { + "epoch": 0.2992086067535979, + "grad_norm": 0.49527524555127955, + "learning_rate": 4.197959210930759e-06, + "loss": 0.2002, + "step": 1328 + }, + { + "epoch": 0.2994339144394063, + "grad_norm": 0.4455759355497459, + "learning_rate": 4.196591430861275e-06, + "loss": 0.1866, + "step": 1329 + }, + { + "epoch": 0.29965922212521473, + "grad_norm": 0.46761479436691084, + "learning_rate": 4.195222708737809e-06, + "loss": 0.1868, + "step": 1330 + }, + { + "epoch": 0.2998845298110232, + "grad_norm": 0.4891395255443091, + "learning_rate": 4.193853045320359e-06, + "loss": 0.2189, + "step": 1331 + }, + { + "epoch": 0.3001098374968316, + "grad_norm": 0.44825266332667657, + "learning_rate": 4.192482441369451e-06, + "loss": 0.1747, + "step": 1332 + }, + { + "epoch": 0.30033514518264004, + "grad_norm": 0.4719197133024163, + "learning_rate": 4.191110897646129e-06, + "loss": 0.1766, + "step": 1333 + }, + { + "epoch": 0.30056045286844846, + "grad_norm": 0.49484821590115535, + "learning_rate": 4.189738414911959e-06, + "loss": 0.1936, + "step": 1334 + }, + { + "epoch": 0.30078576055425693, + "grad_norm": 0.4656229680414219, + "learning_rate": 4.188364993929032e-06, + "loss": 0.2013, + "step": 1335 + }, + { + "epoch": 0.30101106824006535, + "grad_norm": 0.4588317151707372, + "learning_rate": 4.186990635459954e-06, + "loss": 0.1888, + "step": 1336 + }, + { + "epoch": 0.30123637592587377, + "grad_norm": 0.5049287048083992, + "learning_rate": 4.185615340267858e-06, + "loss": 0.177, + "step": 1337 + }, + { + "epoch": 0.3014616836116822, + "grad_norm": 0.5034510917467762, + "learning_rate": 4.184239109116393e-06, + "loss": 0.2021, + "step": 1338 + }, + { + "epoch": 0.30168699129749066, + "grad_norm": 0.4374643518236677, + "learning_rate": 4.182861942769729e-06, + "loss": 0.1808, + "step": 1339 + }, + { + "epoch": 0.3019122989832991, + "grad_norm": 0.4605663820806801, + "learning_rate": 4.181483841992556e-06, + "loss": 0.1789, + "step": 1340 + }, + { + "epoch": 0.3021376066691075, + "grad_norm": 0.7858695008366614, + "learning_rate": 4.18010480755008e-06, + "loss": 0.1854, + "step": 1341 + }, + { + "epoch": 0.3023629143549159, + "grad_norm": 0.48101123492736725, + "learning_rate": 4.178724840208029e-06, + "loss": 0.185, + "step": 1342 + }, + { + "epoch": 0.3025882220407244, + "grad_norm": 0.474215425040721, + "learning_rate": 4.1773439407326474e-06, + "loss": 0.1803, + "step": 1343 + }, + { + "epoch": 0.3028135297265328, + "grad_norm": 0.43887353516182803, + "learning_rate": 4.175962109890697e-06, + "loss": 0.176, + "step": 1344 + }, + { + "epoch": 0.3030388374123412, + "grad_norm": 0.4963032667829092, + "learning_rate": 4.174579348449456e-06, + "loss": 0.197, + "step": 1345 + }, + { + "epoch": 0.30326414509814964, + "grad_norm": 0.46376717590715494, + "learning_rate": 4.1731956571767215e-06, + "loss": 0.1809, + "step": 1346 + }, + { + "epoch": 0.3034894527839581, + "grad_norm": 0.49794665734183463, + "learning_rate": 4.171811036840805e-06, + "loss": 0.1769, + "step": 1347 + }, + { + "epoch": 0.30371476046976653, + "grad_norm": 0.4697355398480229, + "learning_rate": 4.170425488210534e-06, + "loss": 0.1883, + "step": 1348 + }, + { + "epoch": 0.30394006815557495, + "grad_norm": 0.4992607581189678, + "learning_rate": 4.169039012055255e-06, + "loss": 0.186, + "step": 1349 + }, + { + "epoch": 0.30416537584138337, + "grad_norm": 0.4886782383379749, + "learning_rate": 4.167651609144822e-06, + "loss": 0.184, + "step": 1350 + }, + { + "epoch": 0.30439068352719184, + "grad_norm": 0.48470108133640855, + "learning_rate": 4.166263280249613e-06, + "loss": 0.186, + "step": 1351 + }, + { + "epoch": 0.30461599121300026, + "grad_norm": 0.44143385906177274, + "learning_rate": 4.164874026140511e-06, + "loss": 0.181, + "step": 1352 + }, + { + "epoch": 0.3048412988988087, + "grad_norm": 0.44627212053965276, + "learning_rate": 4.163483847588919e-06, + "loss": 0.1751, + "step": 1353 + }, + { + "epoch": 0.3050666065846171, + "grad_norm": 0.47357238577660615, + "learning_rate": 4.1620927453667515e-06, + "loss": 0.1876, + "step": 1354 + }, + { + "epoch": 0.30529191427042557, + "grad_norm": 0.4560640538631906, + "learning_rate": 4.160700720246435e-06, + "loss": 0.1778, + "step": 1355 + }, + { + "epoch": 0.305517221956234, + "grad_norm": 0.44275946070101524, + "learning_rate": 4.159307773000909e-06, + "loss": 0.1684, + "step": 1356 + }, + { + "epoch": 0.3057425296420424, + "grad_norm": 0.4811817671856529, + "learning_rate": 4.1579139044036265e-06, + "loss": 0.192, + "step": 1357 + }, + { + "epoch": 0.3059678373278508, + "grad_norm": 0.4482602112221148, + "learning_rate": 4.15651911522855e-06, + "loss": 0.1777, + "step": 1358 + }, + { + "epoch": 0.3061931450136593, + "grad_norm": 0.5249572703936674, + "learning_rate": 4.155123406250153e-06, + "loss": 0.2118, + "step": 1359 + }, + { + "epoch": 0.3064184526994677, + "grad_norm": 0.4460409269403352, + "learning_rate": 4.153726778243422e-06, + "loss": 0.1787, + "step": 1360 + }, + { + "epoch": 0.30664376038527613, + "grad_norm": 0.48376223281047515, + "learning_rate": 4.152329231983852e-06, + "loss": 0.2051, + "step": 1361 + }, + { + "epoch": 0.30686906807108455, + "grad_norm": 0.4901084446477793, + "learning_rate": 4.150930768247449e-06, + "loss": 0.1983, + "step": 1362 + }, + { + "epoch": 0.307094375756893, + "grad_norm": 0.4859145577316666, + "learning_rate": 4.149531387810727e-06, + "loss": 0.1937, + "step": 1363 + }, + { + "epoch": 0.30731968344270144, + "grad_norm": 0.478657607779264, + "learning_rate": 4.148131091450709e-06, + "loss": 0.1755, + "step": 1364 + }, + { + "epoch": 0.30754499112850986, + "grad_norm": 0.4641945404394464, + "learning_rate": 4.14672987994493e-06, + "loss": 0.1652, + "step": 1365 + }, + { + "epoch": 0.3077702988143183, + "grad_norm": 0.47428133836806635, + "learning_rate": 4.145327754071427e-06, + "loss": 0.2019, + "step": 1366 + }, + { + "epoch": 0.30799560650012675, + "grad_norm": 0.4724473903163873, + "learning_rate": 4.1439247146087515e-06, + "loss": 0.1992, + "step": 1367 + }, + { + "epoch": 0.30822091418593517, + "grad_norm": 0.43646343985134695, + "learning_rate": 4.142520762335957e-06, + "loss": 0.1655, + "step": 1368 + }, + { + "epoch": 0.3084462218717436, + "grad_norm": 0.45190266891087105, + "learning_rate": 4.141115898032607e-06, + "loss": 0.1854, + "step": 1369 + }, + { + "epoch": 0.308671529557552, + "grad_norm": 0.47504362483876283, + "learning_rate": 4.13971012247877e-06, + "loss": 0.1728, + "step": 1370 + }, + { + "epoch": 0.3088968372433605, + "grad_norm": 0.49461070222771025, + "learning_rate": 4.138303436455019e-06, + "loss": 0.1985, + "step": 1371 + }, + { + "epoch": 0.3091221449291689, + "grad_norm": 0.45592575903631966, + "learning_rate": 4.136895840742437e-06, + "loss": 0.1889, + "step": 1372 + }, + { + "epoch": 0.3093474526149773, + "grad_norm": 0.47402208971020104, + "learning_rate": 4.1354873361226074e-06, + "loss": 0.1726, + "step": 1373 + }, + { + "epoch": 0.3095727603007858, + "grad_norm": 0.459850710025197, + "learning_rate": 4.134077923377622e-06, + "loss": 0.187, + "step": 1374 + }, + { + "epoch": 0.3097980679865942, + "grad_norm": 0.46118841220771517, + "learning_rate": 4.132667603290075e-06, + "loss": 0.194, + "step": 1375 + }, + { + "epoch": 0.3100233756724026, + "grad_norm": 0.5020566670909031, + "learning_rate": 4.131256376643062e-06, + "loss": 0.2171, + "step": 1376 + }, + { + "epoch": 0.31024868335821104, + "grad_norm": 0.4855032548435597, + "learning_rate": 4.129844244220188e-06, + "loss": 0.1782, + "step": 1377 + }, + { + "epoch": 0.3104739910440195, + "grad_norm": 0.4797040310295829, + "learning_rate": 4.128431206805556e-06, + "loss": 0.1787, + "step": 1378 + }, + { + "epoch": 0.31069929872982793, + "grad_norm": 0.47530623311353404, + "learning_rate": 4.127017265183772e-06, + "loss": 0.181, + "step": 1379 + }, + { + "epoch": 0.31092460641563635, + "grad_norm": 0.4699824994880066, + "learning_rate": 4.125602420139947e-06, + "loss": 0.1942, + "step": 1380 + }, + { + "epoch": 0.31114991410144477, + "grad_norm": 0.41923111522443063, + "learning_rate": 4.124186672459691e-06, + "loss": 0.166, + "step": 1381 + }, + { + "epoch": 0.31137522178725324, + "grad_norm": 0.4790985757988742, + "learning_rate": 4.122770022929114e-06, + "loss": 0.194, + "step": 1382 + }, + { + "epoch": 0.31160052947306166, + "grad_norm": 0.5048644232600185, + "learning_rate": 4.121352472334832e-06, + "loss": 0.1834, + "step": 1383 + }, + { + "epoch": 0.3118258371588701, + "grad_norm": 0.48680818157177147, + "learning_rate": 4.119934021463956e-06, + "loss": 0.1947, + "step": 1384 + }, + { + "epoch": 0.3120511448446785, + "grad_norm": 0.4758203544787323, + "learning_rate": 4.1185146711040995e-06, + "loss": 0.1932, + "step": 1385 + }, + { + "epoch": 0.31227645253048697, + "grad_norm": 0.5325957748784159, + "learning_rate": 4.117094422043374e-06, + "loss": 0.1983, + "step": 1386 + }, + { + "epoch": 0.3125017602162954, + "grad_norm": 0.5029297426533659, + "learning_rate": 4.115673275070392e-06, + "loss": 0.188, + "step": 1387 + }, + { + "epoch": 0.3127270679021038, + "grad_norm": 0.47415477325987965, + "learning_rate": 4.114251230974263e-06, + "loss": 0.1796, + "step": 1388 + }, + { + "epoch": 0.3129523755879122, + "grad_norm": 0.5028461801210712, + "learning_rate": 4.1128282905445945e-06, + "loss": 0.1996, + "step": 1389 + }, + { + "epoch": 0.3131776832737207, + "grad_norm": 0.529363476176261, + "learning_rate": 4.1114044545714935e-06, + "loss": 0.2049, + "step": 1390 + }, + { + "epoch": 0.3134029909595291, + "grad_norm": 0.472609567168099, + "learning_rate": 4.1099797238455615e-06, + "loss": 0.1993, + "step": 1391 + }, + { + "epoch": 0.31362829864533753, + "grad_norm": 0.4620133930855811, + "learning_rate": 4.1085540991579e-06, + "loss": 0.1808, + "step": 1392 + }, + { + "epoch": 0.31385360633114595, + "grad_norm": 0.49445347887026003, + "learning_rate": 4.107127581300105e-06, + "loss": 0.2059, + "step": 1393 + }, + { + "epoch": 0.3140789140169544, + "grad_norm": 0.47736601362539105, + "learning_rate": 4.105700171064267e-06, + "loss": 0.1904, + "step": 1394 + }, + { + "epoch": 0.31430422170276284, + "grad_norm": 0.46736326857267707, + "learning_rate": 4.104271869242975e-06, + "loss": 0.1827, + "step": 1395 + }, + { + "epoch": 0.31452952938857126, + "grad_norm": 0.535139142397991, + "learning_rate": 4.102842676629313e-06, + "loss": 0.1949, + "step": 1396 + }, + { + "epoch": 0.3147548370743797, + "grad_norm": 0.4861401696372517, + "learning_rate": 4.101412594016855e-06, + "loss": 0.1963, + "step": 1397 + }, + { + "epoch": 0.31498014476018815, + "grad_norm": 0.46033147239068883, + "learning_rate": 4.0999816221996755e-06, + "loss": 0.1743, + "step": 1398 + }, + { + "epoch": 0.31520545244599657, + "grad_norm": 0.48048061271812775, + "learning_rate": 4.098549761972339e-06, + "loss": 0.1891, + "step": 1399 + }, + { + "epoch": 0.315430760131805, + "grad_norm": 0.5053303850779427, + "learning_rate": 4.097117014129903e-06, + "loss": 0.1918, + "step": 1400 + }, + { + "epoch": 0.3156560678176134, + "grad_norm": 0.49613675914207667, + "learning_rate": 4.095683379467922e-06, + "loss": 0.1959, + "step": 1401 + }, + { + "epoch": 0.3158813755034219, + "grad_norm": 0.4863386018085936, + "learning_rate": 4.094248858782436e-06, + "loss": 0.1991, + "step": 1402 + }, + { + "epoch": 0.3161066831892303, + "grad_norm": 0.4783513971449028, + "learning_rate": 4.092813452869983e-06, + "loss": 0.1997, + "step": 1403 + }, + { + "epoch": 0.3163319908750387, + "grad_norm": 0.5021687419423587, + "learning_rate": 4.091377162527592e-06, + "loss": 0.1934, + "step": 1404 + }, + { + "epoch": 0.31655729856084713, + "grad_norm": 0.44779584584671767, + "learning_rate": 4.089939988552778e-06, + "loss": 0.1919, + "step": 1405 + }, + { + "epoch": 0.3167826062466556, + "grad_norm": 0.47721237898132285, + "learning_rate": 4.088501931743551e-06, + "loss": 0.2113, + "step": 1406 + }, + { + "epoch": 0.317007913932464, + "grad_norm": 0.4388024480687185, + "learning_rate": 4.087062992898413e-06, + "loss": 0.1751, + "step": 1407 + }, + { + "epoch": 0.31723322161827244, + "grad_norm": 0.4812623514236023, + "learning_rate": 4.08562317281635e-06, + "loss": 0.1934, + "step": 1408 + }, + { + "epoch": 0.31745852930408086, + "grad_norm": 0.4811168701866593, + "learning_rate": 4.084182472296842e-06, + "loss": 0.1901, + "step": 1409 + }, + { + "epoch": 0.31768383698988933, + "grad_norm": 0.46299024011221557, + "learning_rate": 4.082740892139856e-06, + "loss": 0.1918, + "step": 1410 + }, + { + "epoch": 0.31790914467569775, + "grad_norm": 0.46431431426484443, + "learning_rate": 4.081298433145847e-06, + "loss": 0.1785, + "step": 1411 + }, + { + "epoch": 0.31813445236150617, + "grad_norm": 0.4761673174138738, + "learning_rate": 4.07985509611576e-06, + "loss": 0.1938, + "step": 1412 + }, + { + "epoch": 0.3183597600473146, + "grad_norm": 0.47136115774649245, + "learning_rate": 4.078410881851026e-06, + "loss": 0.1911, + "step": 1413 + }, + { + "epoch": 0.31858506773312306, + "grad_norm": 0.47875295025179365, + "learning_rate": 4.076965791153562e-06, + "loss": 0.1941, + "step": 1414 + }, + { + "epoch": 0.3188103754189315, + "grad_norm": 0.4899721058436374, + "learning_rate": 4.075519824825775e-06, + "loss": 0.2065, + "step": 1415 + }, + { + "epoch": 0.3190356831047399, + "grad_norm": 0.4594859900194388, + "learning_rate": 4.074072983670555e-06, + "loss": 0.1907, + "step": 1416 + }, + { + "epoch": 0.3192609907905483, + "grad_norm": 0.47935713293206506, + "learning_rate": 4.072625268491279e-06, + "loss": 0.1853, + "step": 1417 + }, + { + "epoch": 0.3194862984763568, + "grad_norm": 0.49943973505753847, + "learning_rate": 4.071176680091809e-06, + "loss": 0.2119, + "step": 1418 + }, + { + "epoch": 0.3197116061621652, + "grad_norm": 0.474470260007517, + "learning_rate": 4.069727219276493e-06, + "loss": 0.1703, + "step": 1419 + }, + { + "epoch": 0.3199369138479736, + "grad_norm": 0.46974275408314936, + "learning_rate": 4.068276886850162e-06, + "loss": 0.1668, + "step": 1420 + }, + { + "epoch": 0.3201622215337821, + "grad_norm": 0.428467625182276, + "learning_rate": 4.066825683618132e-06, + "loss": 0.1692, + "step": 1421 + }, + { + "epoch": 0.3203875292195905, + "grad_norm": 0.5231506062106315, + "learning_rate": 4.065373610386201e-06, + "loss": 0.2163, + "step": 1422 + }, + { + "epoch": 0.32061283690539893, + "grad_norm": 0.45256985898373553, + "learning_rate": 4.063920667960652e-06, + "loss": 0.1884, + "step": 1423 + }, + { + "epoch": 0.32083814459120735, + "grad_norm": 0.44722673202253765, + "learning_rate": 4.06246685714825e-06, + "loss": 0.1835, + "step": 1424 + }, + { + "epoch": 0.3210634522770158, + "grad_norm": 0.4669011483588059, + "learning_rate": 4.061012178756242e-06, + "loss": 0.1915, + "step": 1425 + }, + { + "epoch": 0.32128875996282424, + "grad_norm": 0.46711885521138474, + "learning_rate": 4.059556633592356e-06, + "loss": 0.179, + "step": 1426 + }, + { + "epoch": 0.32151406764863266, + "grad_norm": 0.4767663007250986, + "learning_rate": 4.058100222464802e-06, + "loss": 0.1942, + "step": 1427 + }, + { + "epoch": 0.3217393753344411, + "grad_norm": 0.4794219470239876, + "learning_rate": 4.056642946182271e-06, + "loss": 0.1791, + "step": 1428 + }, + { + "epoch": 0.32196468302024955, + "grad_norm": 0.44724646637518295, + "learning_rate": 4.0551848055539345e-06, + "loss": 0.1839, + "step": 1429 + }, + { + "epoch": 0.32218999070605797, + "grad_norm": 0.48557730437163366, + "learning_rate": 4.0537258013894434e-06, + "loss": 0.1959, + "step": 1430 + }, + { + "epoch": 0.3224152983918664, + "grad_norm": 0.48220333097287077, + "learning_rate": 4.052265934498929e-06, + "loss": 0.1989, + "step": 1431 + }, + { + "epoch": 0.3226406060776748, + "grad_norm": 0.4696663509618159, + "learning_rate": 4.0508052056929995e-06, + "loss": 0.1749, + "step": 1432 + }, + { + "epoch": 0.3228659137634833, + "grad_norm": 0.4769281167542128, + "learning_rate": 4.049343615782744e-06, + "loss": 0.1829, + "step": 1433 + }, + { + "epoch": 0.3230912214492917, + "grad_norm": 0.4637689288072718, + "learning_rate": 4.047881165579729e-06, + "loss": 0.1929, + "step": 1434 + }, + { + "epoch": 0.3233165291351001, + "grad_norm": 0.4746389489305311, + "learning_rate": 4.046417855895999e-06, + "loss": 0.1866, + "step": 1435 + }, + { + "epoch": 0.32354183682090853, + "grad_norm": 0.4805399687886267, + "learning_rate": 4.044953687544074e-06, + "loss": 0.1879, + "step": 1436 + }, + { + "epoch": 0.323767144506717, + "grad_norm": 0.4329397526489964, + "learning_rate": 4.043488661336953e-06, + "loss": 0.1705, + "step": 1437 + }, + { + "epoch": 0.3239924521925254, + "grad_norm": 0.4678813731064558, + "learning_rate": 4.042022778088111e-06, + "loss": 0.1779, + "step": 1438 + }, + { + "epoch": 0.32421775987833384, + "grad_norm": 0.4808521136031041, + "learning_rate": 4.0405560386114975e-06, + "loss": 0.1991, + "step": 1439 + }, + { + "epoch": 0.32444306756414226, + "grad_norm": 0.45591993339117204, + "learning_rate": 4.039088443721538e-06, + "loss": 0.1653, + "step": 1440 + }, + { + "epoch": 0.32466837524995074, + "grad_norm": 0.6328858176349258, + "learning_rate": 4.0376199942331335e-06, + "loss": 0.1885, + "step": 1441 + }, + { + "epoch": 0.32489368293575915, + "grad_norm": 0.4693374739747998, + "learning_rate": 4.03615069096166e-06, + "loss": 0.1896, + "step": 1442 + }, + { + "epoch": 0.32511899062156757, + "grad_norm": 0.48708363517247544, + "learning_rate": 4.034680534722966e-06, + "loss": 0.1923, + "step": 1443 + }, + { + "epoch": 0.325344298307376, + "grad_norm": 0.47563479472336, + "learning_rate": 4.033209526333375e-06, + "loss": 0.1852, + "step": 1444 + }, + { + "epoch": 0.32556960599318446, + "grad_norm": 0.4937222507991361, + "learning_rate": 4.0317376666096815e-06, + "loss": 0.2042, + "step": 1445 + }, + { + "epoch": 0.3257949136789929, + "grad_norm": 0.5216988625410693, + "learning_rate": 4.030264956369158e-06, + "loss": 0.1902, + "step": 1446 + }, + { + "epoch": 0.3260202213648013, + "grad_norm": 0.5043705892505386, + "learning_rate": 4.028791396429541e-06, + "loss": 0.1909, + "step": 1447 + }, + { + "epoch": 0.3262455290506097, + "grad_norm": 0.4694685153588241, + "learning_rate": 4.0273169876090475e-06, + "loss": 0.1939, + "step": 1448 + }, + { + "epoch": 0.3264708367364182, + "grad_norm": 0.4396381690669189, + "learning_rate": 4.02584173072636e-06, + "loss": 0.1715, + "step": 1449 + }, + { + "epoch": 0.3266961444222266, + "grad_norm": 0.43727184614459064, + "learning_rate": 4.024365626600632e-06, + "loss": 0.1673, + "step": 1450 + }, + { + "epoch": 0.326921452108035, + "grad_norm": 0.5014941672059154, + "learning_rate": 4.022888676051492e-06, + "loss": 0.1809, + "step": 1451 + }, + { + "epoch": 0.32714675979384344, + "grad_norm": 0.4574194871944918, + "learning_rate": 4.021410879899035e-06, + "loss": 0.1816, + "step": 1452 + }, + { + "epoch": 0.3273720674796519, + "grad_norm": 0.4804416537274082, + "learning_rate": 4.019932238963824e-06, + "loss": 0.1802, + "step": 1453 + }, + { + "epoch": 0.32759737516546034, + "grad_norm": 0.4745570503202099, + "learning_rate": 4.018452754066895e-06, + "loss": 0.1917, + "step": 1454 + }, + { + "epoch": 0.32782268285126875, + "grad_norm": 0.44958791779542656, + "learning_rate": 4.016972426029751e-06, + "loss": 0.1821, + "step": 1455 + }, + { + "epoch": 0.32804799053707717, + "grad_norm": 0.4610521201830226, + "learning_rate": 4.015491255674362e-06, + "loss": 0.1695, + "step": 1456 + }, + { + "epoch": 0.32827329822288565, + "grad_norm": 0.4835313818228284, + "learning_rate": 4.014009243823167e-06, + "loss": 0.1986, + "step": 1457 + }, + { + "epoch": 0.32849860590869406, + "grad_norm": 0.5056923204224979, + "learning_rate": 4.012526391299073e-06, + "loss": 0.182, + "step": 1458 + }, + { + "epoch": 0.3287239135945025, + "grad_norm": 0.4900583329553471, + "learning_rate": 4.01104269892545e-06, + "loss": 0.1906, + "step": 1459 + }, + { + "epoch": 0.3289492212803109, + "grad_norm": 0.4916000610247718, + "learning_rate": 4.0095581675261405e-06, + "loss": 0.1831, + "step": 1460 + }, + { + "epoch": 0.3291745289661194, + "grad_norm": 0.5119650609462135, + "learning_rate": 4.008072797925447e-06, + "loss": 0.188, + "step": 1461 + }, + { + "epoch": 0.3293998366519278, + "grad_norm": 0.4699109358825036, + "learning_rate": 4.006586590948141e-06, + "loss": 0.1763, + "step": 1462 + }, + { + "epoch": 0.3296251443377362, + "grad_norm": 0.6046228199077757, + "learning_rate": 4.005099547419458e-06, + "loss": 0.1994, + "step": 1463 + }, + { + "epoch": 0.3298504520235446, + "grad_norm": 0.4824539152015273, + "learning_rate": 4.003611668165097e-06, + "loss": 0.1793, + "step": 1464 + }, + { + "epoch": 0.3300757597093531, + "grad_norm": 0.4706188913394758, + "learning_rate": 4.0021229540112226e-06, + "loss": 0.1935, + "step": 1465 + }, + { + "epoch": 0.3303010673951615, + "grad_norm": 0.5322522318183285, + "learning_rate": 4.000633405784461e-06, + "loss": 0.1928, + "step": 1466 + }, + { + "epoch": 0.33052637508096994, + "grad_norm": 0.4723656739440685, + "learning_rate": 3.999143024311904e-06, + "loss": 0.1912, + "step": 1467 + }, + { + "epoch": 0.33075168276677835, + "grad_norm": 0.4896996091683646, + "learning_rate": 3.997651810421106e-06, + "loss": 0.196, + "step": 1468 + }, + { + "epoch": 0.33097699045258683, + "grad_norm": 0.4742887053571119, + "learning_rate": 3.99615976494008e-06, + "loss": 0.1874, + "step": 1469 + }, + { + "epoch": 0.33120229813839525, + "grad_norm": 0.4938629569782315, + "learning_rate": 3.994666888697304e-06, + "loss": 0.2066, + "step": 1470 + }, + { + "epoch": 0.33142760582420366, + "grad_norm": 0.5256193757123643, + "learning_rate": 3.993173182521718e-06, + "loss": 0.1804, + "step": 1471 + }, + { + "epoch": 0.33165291351001214, + "grad_norm": 0.5031179164506312, + "learning_rate": 3.991678647242719e-06, + "loss": 0.201, + "step": 1472 + }, + { + "epoch": 0.33187822119582056, + "grad_norm": 0.4872275458399549, + "learning_rate": 3.990183283690169e-06, + "loss": 0.181, + "step": 1473 + }, + { + "epoch": 0.332103528881629, + "grad_norm": 0.47484678510315825, + "learning_rate": 3.988687092694386e-06, + "loss": 0.1839, + "step": 1474 + }, + { + "epoch": 0.3323288365674374, + "grad_norm": 0.4761838528991371, + "learning_rate": 3.98719007508615e-06, + "loss": 0.1901, + "step": 1475 + }, + { + "epoch": 0.33255414425324586, + "grad_norm": 0.48557548791813976, + "learning_rate": 3.985692231696699e-06, + "loss": 0.193, + "step": 1476 + }, + { + "epoch": 0.3327794519390543, + "grad_norm": 0.5140660483592174, + "learning_rate": 3.98419356335773e-06, + "loss": 0.1935, + "step": 1477 + }, + { + "epoch": 0.3330047596248627, + "grad_norm": 0.4616425228732021, + "learning_rate": 3.982694070901396e-06, + "loss": 0.1901, + "step": 1478 + }, + { + "epoch": 0.3332300673106711, + "grad_norm": 0.4682986285768006, + "learning_rate": 3.981193755160311e-06, + "loss": 0.184, + "step": 1479 + }, + { + "epoch": 0.3334553749964796, + "grad_norm": 0.5222640302881676, + "learning_rate": 3.979692616967543e-06, + "loss": 0.2143, + "step": 1480 + }, + { + "epoch": 0.333680682682288, + "grad_norm": 0.47119325625073183, + "learning_rate": 3.9781906571566195e-06, + "loss": 0.1884, + "step": 1481 + }, + { + "epoch": 0.33390599036809643, + "grad_norm": 0.5257284496891892, + "learning_rate": 3.976687876561523e-06, + "loss": 0.2093, + "step": 1482 + }, + { + "epoch": 0.33413129805390485, + "grad_norm": 0.4847675935357707, + "learning_rate": 3.975184276016689e-06, + "loss": 0.1946, + "step": 1483 + }, + { + "epoch": 0.3343566057397133, + "grad_norm": 0.4903878905158691, + "learning_rate": 3.973679856357014e-06, + "loss": 0.1774, + "step": 1484 + }, + { + "epoch": 0.33458191342552174, + "grad_norm": 0.46121846780994913, + "learning_rate": 3.972174618417843e-06, + "loss": 0.1784, + "step": 1485 + }, + { + "epoch": 0.33480722111133016, + "grad_norm": 0.5208829745121606, + "learning_rate": 3.970668563034982e-06, + "loss": 0.1936, + "step": 1486 + }, + { + "epoch": 0.3350325287971386, + "grad_norm": 0.47702696675642997, + "learning_rate": 3.9691616910446845e-06, + "loss": 0.1889, + "step": 1487 + }, + { + "epoch": 0.33525783648294705, + "grad_norm": 0.508247989988977, + "learning_rate": 3.967654003283662e-06, + "loss": 0.1992, + "step": 1488 + }, + { + "epoch": 0.33548314416875546, + "grad_norm": 0.4897255803148681, + "learning_rate": 3.966145500589076e-06, + "loss": 0.1962, + "step": 1489 + }, + { + "epoch": 0.3357084518545639, + "grad_norm": 0.511341673254947, + "learning_rate": 3.9646361837985435e-06, + "loss": 0.2043, + "step": 1490 + }, + { + "epoch": 0.3359337595403723, + "grad_norm": 0.421563160264042, + "learning_rate": 3.9631260537501304e-06, + "loss": 0.1678, + "step": 1491 + }, + { + "epoch": 0.3361590672261808, + "grad_norm": 0.47248311811473054, + "learning_rate": 3.961615111282357e-06, + "loss": 0.2027, + "step": 1492 + }, + { + "epoch": 0.3363843749119892, + "grad_norm": 0.47740860136767643, + "learning_rate": 3.960103357234192e-06, + "loss": 0.1874, + "step": 1493 + }, + { + "epoch": 0.3366096825977976, + "grad_norm": 0.46379530514013034, + "learning_rate": 3.958590792445057e-06, + "loss": 0.2057, + "step": 1494 + }, + { + "epoch": 0.33683499028360603, + "grad_norm": 0.5040079027478207, + "learning_rate": 3.957077417754822e-06, + "loss": 0.1851, + "step": 1495 + }, + { + "epoch": 0.3370602979694145, + "grad_norm": 0.48374228050685775, + "learning_rate": 3.9555632340038075e-06, + "loss": 0.2025, + "step": 1496 + }, + { + "epoch": 0.3372856056552229, + "grad_norm": 0.5027044076035537, + "learning_rate": 3.9540482420327845e-06, + "loss": 0.1922, + "step": 1497 + }, + { + "epoch": 0.33751091334103134, + "grad_norm": 0.46977957080060573, + "learning_rate": 3.9525324426829716e-06, + "loss": 0.1857, + "step": 1498 + }, + { + "epoch": 0.33773622102683976, + "grad_norm": 0.4431535447314052, + "learning_rate": 3.951015836796034e-06, + "loss": 0.1708, + "step": 1499 + }, + { + "epoch": 0.33796152871264823, + "grad_norm": 0.41604022909371596, + "learning_rate": 3.949498425214088e-06, + "loss": 0.166, + "step": 1500 + }, + { + "epoch": 0.33796152871264823, + "eval_loss": 0.18758253753185272, + "eval_runtime": 57.0664, + "eval_samples_per_second": 50.292, + "eval_steps_per_second": 6.291, + "step": 1500 + }, + { + "epoch": 0.33818683639845665, + "grad_norm": 0.48261068557650305, + "learning_rate": 3.947980208779693e-06, + "loss": 0.1889, + "step": 1501 + }, + { + "epoch": 0.33841214408426507, + "grad_norm": 0.4802496809455009, + "learning_rate": 3.946461188335863e-06, + "loss": 0.2106, + "step": 1502 + }, + { + "epoch": 0.3386374517700735, + "grad_norm": 0.5026764767902618, + "learning_rate": 3.944941364726049e-06, + "loss": 0.2189, + "step": 1503 + }, + { + "epoch": 0.33886275945588196, + "grad_norm": 0.4988387475849198, + "learning_rate": 3.943420738794153e-06, + "loss": 0.1783, + "step": 1504 + }, + { + "epoch": 0.3390880671416904, + "grad_norm": 0.49541433529994094, + "learning_rate": 3.941899311384525e-06, + "loss": 0.1953, + "step": 1505 + }, + { + "epoch": 0.3393133748274988, + "grad_norm": 0.49290797711453693, + "learning_rate": 3.9403770833419535e-06, + "loss": 0.1872, + "step": 1506 + }, + { + "epoch": 0.3395386825133072, + "grad_norm": 0.4448029081678158, + "learning_rate": 3.938854055511676e-06, + "loss": 0.1753, + "step": 1507 + }, + { + "epoch": 0.3397639901991157, + "grad_norm": 0.4948704166369294, + "learning_rate": 3.937330228739374e-06, + "loss": 0.1933, + "step": 1508 + }, + { + "epoch": 0.3399892978849241, + "grad_norm": 0.4654841534625173, + "learning_rate": 3.9358056038711714e-06, + "loss": 0.189, + "step": 1509 + }, + { + "epoch": 0.3402146055707325, + "grad_norm": 0.46876317124135675, + "learning_rate": 3.934280181753634e-06, + "loss": 0.1851, + "step": 1510 + }, + { + "epoch": 0.34043991325654094, + "grad_norm": 0.4588410737959432, + "learning_rate": 3.932753963233773e-06, + "loss": 0.1661, + "step": 1511 + }, + { + "epoch": 0.3406652209423494, + "grad_norm": 0.45453902105647565, + "learning_rate": 3.931226949159041e-06, + "loss": 0.1865, + "step": 1512 + }, + { + "epoch": 0.34089052862815783, + "grad_norm": 0.49726362602150964, + "learning_rate": 3.9296991403773325e-06, + "loss": 0.1853, + "step": 1513 + }, + { + "epoch": 0.34111583631396625, + "grad_norm": 0.45236058708084204, + "learning_rate": 3.9281705377369814e-06, + "loss": 0.1657, + "step": 1514 + }, + { + "epoch": 0.34134114399977467, + "grad_norm": 0.45942547036063025, + "learning_rate": 3.9266411420867635e-06, + "loss": 0.1664, + "step": 1515 + }, + { + "epoch": 0.34156645168558314, + "grad_norm": 0.48572021263925935, + "learning_rate": 3.925110954275897e-06, + "loss": 0.1917, + "step": 1516 + }, + { + "epoch": 0.34179175937139156, + "grad_norm": 0.45057282883529537, + "learning_rate": 3.923579975154037e-06, + "loss": 0.1737, + "step": 1517 + }, + { + "epoch": 0.3420170670572, + "grad_norm": 0.4676622415061151, + "learning_rate": 3.922048205571279e-06, + "loss": 0.1884, + "step": 1518 + }, + { + "epoch": 0.34224237474300845, + "grad_norm": 0.4824920697046802, + "learning_rate": 3.920515646378159e-06, + "loss": 0.185, + "step": 1519 + }, + { + "epoch": 0.34246768242881687, + "grad_norm": 0.4828420925891174, + "learning_rate": 3.918982298425647e-06, + "loss": 0.2054, + "step": 1520 + }, + { + "epoch": 0.3426929901146253, + "grad_norm": 0.44789058404295734, + "learning_rate": 3.917448162565157e-06, + "loss": 0.1636, + "step": 1521 + }, + { + "epoch": 0.3429182978004337, + "grad_norm": 0.4822156988520378, + "learning_rate": 3.915913239648535e-06, + "loss": 0.2028, + "step": 1522 + }, + { + "epoch": 0.3431436054862422, + "grad_norm": 0.48013577131461527, + "learning_rate": 3.91437753052807e-06, + "loss": 0.1929, + "step": 1523 + }, + { + "epoch": 0.3433689131720506, + "grad_norm": 0.4658198902353949, + "learning_rate": 3.91284103605648e-06, + "loss": 0.1762, + "step": 1524 + }, + { + "epoch": 0.343594220857859, + "grad_norm": 0.4677867451429515, + "learning_rate": 3.911303757086925e-06, + "loss": 0.1813, + "step": 1525 + }, + { + "epoch": 0.34381952854366743, + "grad_norm": 0.44834043424165354, + "learning_rate": 3.909765694473e-06, + "loss": 0.1791, + "step": 1526 + }, + { + "epoch": 0.3440448362294759, + "grad_norm": 0.5152512243110383, + "learning_rate": 3.908226849068731e-06, + "loss": 0.1905, + "step": 1527 + }, + { + "epoch": 0.3442701439152843, + "grad_norm": 0.4908580210309517, + "learning_rate": 3.906687221728583e-06, + "loss": 0.191, + "step": 1528 + }, + { + "epoch": 0.34449545160109274, + "grad_norm": 0.45956468686017304, + "learning_rate": 3.905146813307455e-06, + "loss": 0.1798, + "step": 1529 + }, + { + "epoch": 0.34472075928690116, + "grad_norm": 0.4656968092531852, + "learning_rate": 3.903605624660676e-06, + "loss": 0.1824, + "step": 1530 + }, + { + "epoch": 0.34494606697270963, + "grad_norm": 0.5053896495734799, + "learning_rate": 3.902063656644012e-06, + "loss": 0.1845, + "step": 1531 + }, + { + "epoch": 0.34517137465851805, + "grad_norm": 0.5083992854509763, + "learning_rate": 3.900520910113659e-06, + "loss": 0.2227, + "step": 1532 + }, + { + "epoch": 0.34539668234432647, + "grad_norm": 0.4558619650296016, + "learning_rate": 3.898977385926249e-06, + "loss": 0.189, + "step": 1533 + }, + { + "epoch": 0.3456219900301349, + "grad_norm": 0.48121157704316825, + "learning_rate": 3.897433084938841e-06, + "loss": 0.1714, + "step": 1534 + }, + { + "epoch": 0.34584729771594336, + "grad_norm": 0.4670252803466497, + "learning_rate": 3.895888008008929e-06, + "loss": 0.1901, + "step": 1535 + }, + { + "epoch": 0.3460726054017518, + "grad_norm": 0.5069706946576791, + "learning_rate": 3.894342155994437e-06, + "loss": 0.2058, + "step": 1536 + }, + { + "epoch": 0.3462979130875602, + "grad_norm": 0.48906829842369554, + "learning_rate": 3.892795529753718e-06, + "loss": 0.1922, + "step": 1537 + }, + { + "epoch": 0.3465232207733686, + "grad_norm": 0.46664448558023774, + "learning_rate": 3.891248130145556e-06, + "loss": 0.1816, + "step": 1538 + }, + { + "epoch": 0.3467485284591771, + "grad_norm": 0.45412246252652255, + "learning_rate": 3.889699958029166e-06, + "loss": 0.1767, + "step": 1539 + }, + { + "epoch": 0.3469738361449855, + "grad_norm": 0.45562663883139215, + "learning_rate": 3.888151014264189e-06, + "loss": 0.1816, + "step": 1540 + }, + { + "epoch": 0.3471991438307939, + "grad_norm": 0.47219473905402726, + "learning_rate": 3.886601299710694e-06, + "loss": 0.1734, + "step": 1541 + }, + { + "epoch": 0.34742445151660234, + "grad_norm": 0.48618340372178315, + "learning_rate": 3.885050815229182e-06, + "loss": 0.1772, + "step": 1542 + }, + { + "epoch": 0.3476497592024108, + "grad_norm": 0.5160538726246495, + "learning_rate": 3.88349956168058e-06, + "loss": 0.1929, + "step": 1543 + }, + { + "epoch": 0.34787506688821923, + "grad_norm": 0.5043324721236656, + "learning_rate": 3.881947539926239e-06, + "loss": 0.1921, + "step": 1544 + }, + { + "epoch": 0.34810037457402765, + "grad_norm": 0.45271350841667135, + "learning_rate": 3.880394750827939e-06, + "loss": 0.1776, + "step": 1545 + }, + { + "epoch": 0.34832568225983607, + "grad_norm": 0.49653808628181134, + "learning_rate": 3.878841195247888e-06, + "loss": 0.2066, + "step": 1546 + }, + { + "epoch": 0.34855098994564454, + "grad_norm": 0.4927313041803519, + "learning_rate": 3.877286874048716e-06, + "loss": 0.1975, + "step": 1547 + }, + { + "epoch": 0.34877629763145296, + "grad_norm": 0.5262656344751725, + "learning_rate": 3.875731788093478e-06, + "loss": 0.2086, + "step": 1548 + }, + { + "epoch": 0.3490016053172614, + "grad_norm": 0.48140944912709827, + "learning_rate": 3.874175938245659e-06, + "loss": 0.1826, + "step": 1549 + }, + { + "epoch": 0.3492269130030698, + "grad_norm": 0.4757763135888991, + "learning_rate": 3.872619325369162e-06, + "loss": 0.1848, + "step": 1550 + }, + { + "epoch": 0.34945222068887827, + "grad_norm": 0.48496238265854513, + "learning_rate": 3.871061950328317e-06, + "loss": 0.1847, + "step": 1551 + }, + { + "epoch": 0.3496775283746867, + "grad_norm": 0.5056755326187495, + "learning_rate": 3.869503813987876e-06, + "loss": 0.1856, + "step": 1552 + }, + { + "epoch": 0.3499028360604951, + "grad_norm": 0.48678902933610946, + "learning_rate": 3.867944917213014e-06, + "loss": 0.1878, + "step": 1553 + }, + { + "epoch": 0.3501281437463035, + "grad_norm": 0.43744579825023017, + "learning_rate": 3.866385260869327e-06, + "loss": 0.1596, + "step": 1554 + }, + { + "epoch": 0.350353451432112, + "grad_norm": 0.4997480373473333, + "learning_rate": 3.864824845822837e-06, + "loss": 0.1935, + "step": 1555 + }, + { + "epoch": 0.3505787591179204, + "grad_norm": 0.5123319505290229, + "learning_rate": 3.8632636729399815e-06, + "loss": 0.1984, + "step": 1556 + }, + { + "epoch": 0.35080406680372883, + "grad_norm": 0.4581467298171831, + "learning_rate": 3.861701743087622e-06, + "loss": 0.1839, + "step": 1557 + }, + { + "epoch": 0.35102937448953725, + "grad_norm": 0.5098730132201903, + "learning_rate": 3.860139057133042e-06, + "loss": 0.2003, + "step": 1558 + }, + { + "epoch": 0.3512546821753457, + "grad_norm": 0.46636431088528574, + "learning_rate": 3.858575615943941e-06, + "loss": 0.1942, + "step": 1559 + }, + { + "epoch": 0.35147998986115414, + "grad_norm": 0.49117369035601177, + "learning_rate": 3.85701142038844e-06, + "loss": 0.208, + "step": 1560 + }, + { + "epoch": 0.35170529754696256, + "grad_norm": 0.4335719276138497, + "learning_rate": 3.855446471335078e-06, + "loss": 0.173, + "step": 1561 + }, + { + "epoch": 0.351930605232771, + "grad_norm": 0.4863357538470509, + "learning_rate": 3.853880769652815e-06, + "loss": 0.1965, + "step": 1562 + }, + { + "epoch": 0.35215591291857945, + "grad_norm": 0.4776050654686914, + "learning_rate": 3.852314316211023e-06, + "loss": 0.1873, + "step": 1563 + }, + { + "epoch": 0.35238122060438787, + "grad_norm": 0.466506831387238, + "learning_rate": 3.850747111879499e-06, + "loss": 0.1994, + "step": 1564 + }, + { + "epoch": 0.3526065282901963, + "grad_norm": 0.4550631950756372, + "learning_rate": 3.84917915752845e-06, + "loss": 0.1784, + "step": 1565 + }, + { + "epoch": 0.3528318359760047, + "grad_norm": 0.47166767952305483, + "learning_rate": 3.8476104540285054e-06, + "loss": 0.1894, + "step": 1566 + }, + { + "epoch": 0.3530571436618132, + "grad_norm": 0.49225839221439005, + "learning_rate": 3.846041002250705e-06, + "loss": 0.2011, + "step": 1567 + }, + { + "epoch": 0.3532824513476216, + "grad_norm": 0.45068526544325144, + "learning_rate": 3.84447080306651e-06, + "loss": 0.187, + "step": 1568 + }, + { + "epoch": 0.35350775903343, + "grad_norm": 0.47988061196534343, + "learning_rate": 3.842899857347792e-06, + "loss": 0.1977, + "step": 1569 + }, + { + "epoch": 0.3537330667192385, + "grad_norm": 0.4690103613869256, + "learning_rate": 3.841328165966837e-06, + "loss": 0.176, + "step": 1570 + }, + { + "epoch": 0.3539583744050469, + "grad_norm": 0.47953899085085977, + "learning_rate": 3.839755729796349e-06, + "loss": 0.1922, + "step": 1571 + }, + { + "epoch": 0.3541836820908553, + "grad_norm": 0.43883493425119646, + "learning_rate": 3.838182549709442e-06, + "loss": 0.1736, + "step": 1572 + }, + { + "epoch": 0.35440898977666374, + "grad_norm": 0.4790127526608185, + "learning_rate": 3.8366086265796445e-06, + "loss": 0.1989, + "step": 1573 + }, + { + "epoch": 0.3546342974624722, + "grad_norm": 0.4588213089648236, + "learning_rate": 3.835033961280898e-06, + "loss": 0.1793, + "step": 1574 + }, + { + "epoch": 0.35485960514828063, + "grad_norm": 0.47435118939753623, + "learning_rate": 3.8334585546875544e-06, + "loss": 0.1801, + "step": 1575 + }, + { + "epoch": 0.35508491283408905, + "grad_norm": 0.42425570346286406, + "learning_rate": 3.831882407674379e-06, + "loss": 0.1626, + "step": 1576 + }, + { + "epoch": 0.35531022051989747, + "grad_norm": 0.4351455263493925, + "learning_rate": 3.830305521116546e-06, + "loss": 0.1648, + "step": 1577 + }, + { + "epoch": 0.35553552820570594, + "grad_norm": 0.521876203445667, + "learning_rate": 3.828727895889644e-06, + "loss": 0.2133, + "step": 1578 + }, + { + "epoch": 0.35576083589151436, + "grad_norm": 0.45892558089561325, + "learning_rate": 3.827149532869668e-06, + "loss": 0.1856, + "step": 1579 + }, + { + "epoch": 0.3559861435773228, + "grad_norm": 0.5041254836452159, + "learning_rate": 3.825570432933026e-06, + "loss": 0.1947, + "step": 1580 + }, + { + "epoch": 0.3562114512631312, + "grad_norm": 0.5252996204183609, + "learning_rate": 3.823990596956531e-06, + "loss": 0.1885, + "step": 1581 + }, + { + "epoch": 0.35643675894893967, + "grad_norm": 0.4866761958255894, + "learning_rate": 3.8224100258174066e-06, + "loss": 0.1794, + "step": 1582 + }, + { + "epoch": 0.3566620666347481, + "grad_norm": 0.4644843707042834, + "learning_rate": 3.820828720393287e-06, + "loss": 0.1925, + "step": 1583 + }, + { + "epoch": 0.3568873743205565, + "grad_norm": 0.46827226295205227, + "learning_rate": 3.819246681562212e-06, + "loss": 0.179, + "step": 1584 + }, + { + "epoch": 0.3571126820063649, + "grad_norm": 0.45521893075894787, + "learning_rate": 3.817663910202628e-06, + "loss": 0.1762, + "step": 1585 + }, + { + "epoch": 0.3573379896921734, + "grad_norm": 0.443692398459433, + "learning_rate": 3.81608040719339e-06, + "loss": 0.1777, + "step": 1586 + }, + { + "epoch": 0.3575632973779818, + "grad_norm": 0.45864138161985846, + "learning_rate": 3.8144961734137566e-06, + "loss": 0.1803, + "step": 1587 + }, + { + "epoch": 0.35778860506379023, + "grad_norm": 0.4553944684694794, + "learning_rate": 3.812911209743395e-06, + "loss": 0.1656, + "step": 1588 + }, + { + "epoch": 0.35801391274959865, + "grad_norm": 0.43184560859176946, + "learning_rate": 3.8113255170623763e-06, + "loss": 0.1624, + "step": 1589 + }, + { + "epoch": 0.3582392204354071, + "grad_norm": 0.475124957752014, + "learning_rate": 3.809739096251176e-06, + "loss": 0.1932, + "step": 1590 + }, + { + "epoch": 0.35846452812121554, + "grad_norm": 0.47685574064434305, + "learning_rate": 3.8081519481906747e-06, + "loss": 0.19, + "step": 1591 + }, + { + "epoch": 0.35868983580702396, + "grad_norm": 0.4558998206205226, + "learning_rate": 3.8065640737621566e-06, + "loss": 0.1748, + "step": 1592 + }, + { + "epoch": 0.3589151434928324, + "grad_norm": 0.46331830160203613, + "learning_rate": 3.804975473847309e-06, + "loss": 0.1855, + "step": 1593 + }, + { + "epoch": 0.35914045117864085, + "grad_norm": 0.47980477359318663, + "learning_rate": 3.803386149328223e-06, + "loss": 0.1951, + "step": 1594 + }, + { + "epoch": 0.35936575886444927, + "grad_norm": 0.516104351593465, + "learning_rate": 3.8017961010873904e-06, + "loss": 0.1828, + "step": 1595 + }, + { + "epoch": 0.3595910665502577, + "grad_norm": 0.4942693760239517, + "learning_rate": 3.8002053300077056e-06, + "loss": 0.2069, + "step": 1596 + }, + { + "epoch": 0.3598163742360661, + "grad_norm": 0.49409709688404807, + "learning_rate": 3.7986138369724664e-06, + "loss": 0.1867, + "step": 1597 + }, + { + "epoch": 0.3600416819218746, + "grad_norm": 0.4571875458976171, + "learning_rate": 3.7970216228653667e-06, + "loss": 0.1787, + "step": 1598 + }, + { + "epoch": 0.360266989607683, + "grad_norm": 0.45660199624827735, + "learning_rate": 3.795428688570505e-06, + "loss": 0.1786, + "step": 1599 + }, + { + "epoch": 0.3604922972934914, + "grad_norm": 0.4999998218354506, + "learning_rate": 3.7938350349723784e-06, + "loss": 0.1935, + "step": 1600 + }, + { + "epoch": 0.36071760497929983, + "grad_norm": 0.46658026691023297, + "learning_rate": 3.792240662955884e-06, + "loss": 0.1898, + "step": 1601 + }, + { + "epoch": 0.3609429126651083, + "grad_norm": 0.43581290306209425, + "learning_rate": 3.7906455734063156e-06, + "loss": 0.181, + "step": 1602 + }, + { + "epoch": 0.3611682203509167, + "grad_norm": 0.5050223552914678, + "learning_rate": 3.7890497672093686e-06, + "loss": 0.1966, + "step": 1603 + }, + { + "epoch": 0.36139352803672514, + "grad_norm": 0.4924846296518356, + "learning_rate": 3.7874532452511324e-06, + "loss": 0.1799, + "step": 1604 + }, + { + "epoch": 0.36161883572253356, + "grad_norm": 0.45286409663603916, + "learning_rate": 3.785856008418099e-06, + "loss": 0.1853, + "step": 1605 + }, + { + "epoch": 0.36184414340834203, + "grad_norm": 0.4871489888094629, + "learning_rate": 3.7842580575971533e-06, + "loss": 0.1903, + "step": 1606 + }, + { + "epoch": 0.36206945109415045, + "grad_norm": 0.5252179645558547, + "learning_rate": 3.782659393675577e-06, + "loss": 0.1851, + "step": 1607 + }, + { + "epoch": 0.36229475877995887, + "grad_norm": 0.49274160550714746, + "learning_rate": 3.7810600175410493e-06, + "loss": 0.1722, + "step": 1608 + }, + { + "epoch": 0.3625200664657673, + "grad_norm": 0.4528658811184739, + "learning_rate": 3.7794599300816435e-06, + "loss": 0.1843, + "step": 1609 + }, + { + "epoch": 0.36274537415157576, + "grad_norm": 0.4759421311162183, + "learning_rate": 3.77785913218583e-06, + "loss": 0.1974, + "step": 1610 + }, + { + "epoch": 0.3629706818373842, + "grad_norm": 0.4678080791184679, + "learning_rate": 3.7762576247424707e-06, + "loss": 0.1815, + "step": 1611 + }, + { + "epoch": 0.3631959895231926, + "grad_norm": 0.4770143193395648, + "learning_rate": 3.7746554086408245e-06, + "loss": 0.1677, + "step": 1612 + }, + { + "epoch": 0.363421297209001, + "grad_norm": 0.481980661517477, + "learning_rate": 3.7730524847705407e-06, + "loss": 0.1863, + "step": 1613 + }, + { + "epoch": 0.3636466048948095, + "grad_norm": 0.46068223705870226, + "learning_rate": 3.7714488540216637e-06, + "loss": 0.1841, + "step": 1614 + }, + { + "epoch": 0.3638719125806179, + "grad_norm": 0.49893903490686975, + "learning_rate": 3.7698445172846305e-06, + "loss": 0.1705, + "step": 1615 + }, + { + "epoch": 0.3640972202664263, + "grad_norm": 0.558895862813068, + "learning_rate": 3.7682394754502687e-06, + "loss": 0.1878, + "step": 1616 + }, + { + "epoch": 0.3643225279522348, + "grad_norm": 0.5075242452230608, + "learning_rate": 3.7666337294097987e-06, + "loss": 0.1891, + "step": 1617 + }, + { + "epoch": 0.3645478356380432, + "grad_norm": 0.43706331040591445, + "learning_rate": 3.7650272800548316e-06, + "loss": 0.1686, + "step": 1618 + }, + { + "epoch": 0.36477314332385163, + "grad_norm": 0.4436755660497053, + "learning_rate": 3.7634201282773673e-06, + "loss": 0.1678, + "step": 1619 + }, + { + "epoch": 0.36499845100966005, + "grad_norm": 0.4449246229322858, + "learning_rate": 3.7618122749697993e-06, + "loss": 0.1699, + "step": 1620 + }, + { + "epoch": 0.3652237586954685, + "grad_norm": 0.4452378198791521, + "learning_rate": 3.7602037210249077e-06, + "loss": 0.1843, + "step": 1621 + }, + { + "epoch": 0.36544906638127694, + "grad_norm": 0.46593836599208055, + "learning_rate": 3.7585944673358632e-06, + "loss": 0.1771, + "step": 1622 + }, + { + "epoch": 0.36567437406708536, + "grad_norm": 0.5004623768912928, + "learning_rate": 3.756984514796224e-06, + "loss": 0.1867, + "step": 1623 + }, + { + "epoch": 0.3658996817528938, + "grad_norm": 0.4961515678854415, + "learning_rate": 3.7553738642999354e-06, + "loss": 0.1869, + "step": 1624 + }, + { + "epoch": 0.36612498943870225, + "grad_norm": 0.4510784909607947, + "learning_rate": 3.753762516741333e-06, + "loss": 0.1829, + "step": 1625 + }, + { + "epoch": 0.36635029712451067, + "grad_norm": 0.45815425710480856, + "learning_rate": 3.7521504730151382e-06, + "loss": 0.1907, + "step": 1626 + }, + { + "epoch": 0.3665756048103191, + "grad_norm": 0.46365843318785316, + "learning_rate": 3.7505377340164585e-06, + "loss": 0.188, + "step": 1627 + }, + { + "epoch": 0.3668009124961275, + "grad_norm": 0.46388107068838463, + "learning_rate": 3.748924300640787e-06, + "loss": 0.1992, + "step": 1628 + }, + { + "epoch": 0.367026220181936, + "grad_norm": 0.46128511524648713, + "learning_rate": 3.747310173784004e-06, + "loss": 0.1743, + "step": 1629 + }, + { + "epoch": 0.3672515278677444, + "grad_norm": 0.4904799400980858, + "learning_rate": 3.745695354342374e-06, + "loss": 0.1917, + "step": 1630 + }, + { + "epoch": 0.3674768355535528, + "grad_norm": 0.48844459648380667, + "learning_rate": 3.7440798432125452e-06, + "loss": 0.1703, + "step": 1631 + }, + { + "epoch": 0.36770214323936123, + "grad_norm": 0.4307880252647068, + "learning_rate": 3.742463641291552e-06, + "loss": 0.1671, + "step": 1632 + }, + { + "epoch": 0.3679274509251697, + "grad_norm": 0.429523148641964, + "learning_rate": 3.7408467494768104e-06, + "loss": 0.1613, + "step": 1633 + }, + { + "epoch": 0.3681527586109781, + "grad_norm": 0.47642562532530275, + "learning_rate": 3.73922916866612e-06, + "loss": 0.1929, + "step": 1634 + }, + { + "epoch": 0.36837806629678654, + "grad_norm": 0.47220880397748816, + "learning_rate": 3.7376108997576628e-06, + "loss": 0.1783, + "step": 1635 + }, + { + "epoch": 0.36860337398259496, + "grad_norm": 0.4749155014642529, + "learning_rate": 3.7359919436500038e-06, + "loss": 0.1906, + "step": 1636 + }, + { + "epoch": 0.36882868166840344, + "grad_norm": 0.48956492054110756, + "learning_rate": 3.7343723012420884e-06, + "loss": 0.1975, + "step": 1637 + }, + { + "epoch": 0.36905398935421185, + "grad_norm": 0.4108944270554346, + "learning_rate": 3.7327519734332453e-06, + "loss": 0.164, + "step": 1638 + }, + { + "epoch": 0.36927929704002027, + "grad_norm": 0.4591241945802992, + "learning_rate": 3.73113096112318e-06, + "loss": 0.1859, + "step": 1639 + }, + { + "epoch": 0.3695046047258287, + "grad_norm": 0.4940190864445239, + "learning_rate": 3.7295092652119815e-06, + "loss": 0.1787, + "step": 1640 + }, + { + "epoch": 0.36972991241163716, + "grad_norm": 0.4507719669883632, + "learning_rate": 3.7278868866001165e-06, + "loss": 0.1805, + "step": 1641 + }, + { + "epoch": 0.3699552200974456, + "grad_norm": 0.47849220503426865, + "learning_rate": 3.726263826188432e-06, + "loss": 0.1862, + "step": 1642 + }, + { + "epoch": 0.370180527783254, + "grad_norm": 0.44970360516599167, + "learning_rate": 3.724640084878153e-06, + "loss": 0.1799, + "step": 1643 + }, + { + "epoch": 0.3704058354690624, + "grad_norm": 0.46332194821329004, + "learning_rate": 3.7230156635708815e-06, + "loss": 0.1882, + "step": 1644 + }, + { + "epoch": 0.3706311431548709, + "grad_norm": 0.45154499198234405, + "learning_rate": 3.7213905631685988e-06, + "loss": 0.1775, + "step": 1645 + }, + { + "epoch": 0.3708564508406793, + "grad_norm": 0.47333392379021144, + "learning_rate": 3.7197647845736616e-06, + "loss": 0.1695, + "step": 1646 + }, + { + "epoch": 0.3710817585264877, + "grad_norm": 0.44145595481569166, + "learning_rate": 3.7181383286888056e-06, + "loss": 0.1656, + "step": 1647 + }, + { + "epoch": 0.37130706621229614, + "grad_norm": 0.45149961404861594, + "learning_rate": 3.7165111964171407e-06, + "loss": 0.1749, + "step": 1648 + }, + { + "epoch": 0.3715323738981046, + "grad_norm": 0.4776000175083643, + "learning_rate": 3.714883388662153e-06, + "loss": 0.1942, + "step": 1649 + }, + { + "epoch": 0.37175768158391304, + "grad_norm": 0.41891726582884437, + "learning_rate": 3.7132549063277033e-06, + "loss": 0.1738, + "step": 1650 + }, + { + "epoch": 0.37198298926972145, + "grad_norm": 0.48892046818469326, + "learning_rate": 3.711625750318026e-06, + "loss": 0.2024, + "step": 1651 + }, + { + "epoch": 0.37220829695552987, + "grad_norm": 0.45520836754907884, + "learning_rate": 3.7099959215377325e-06, + "loss": 0.1832, + "step": 1652 + }, + { + "epoch": 0.37243360464133834, + "grad_norm": 0.4682114487108024, + "learning_rate": 3.7083654208918044e-06, + "loss": 0.1807, + "step": 1653 + }, + { + "epoch": 0.37265891232714676, + "grad_norm": 0.4260957687005181, + "learning_rate": 3.7067342492855997e-06, + "loss": 0.1702, + "step": 1654 + }, + { + "epoch": 0.3728842200129552, + "grad_norm": 0.4787032147953297, + "learning_rate": 3.7051024076248455e-06, + "loss": 0.1848, + "step": 1655 + }, + { + "epoch": 0.3731095276987636, + "grad_norm": 0.47123221158960943, + "learning_rate": 3.7034698968156434e-06, + "loss": 0.175, + "step": 1656 + }, + { + "epoch": 0.3733348353845721, + "grad_norm": 0.5158808366707351, + "learning_rate": 3.7018367177644654e-06, + "loss": 0.2275, + "step": 1657 + }, + { + "epoch": 0.3735601430703805, + "grad_norm": 0.4833785800380268, + "learning_rate": 3.700202871378156e-06, + "loss": 0.1881, + "step": 1658 + }, + { + "epoch": 0.3737854507561889, + "grad_norm": 0.46059013723235614, + "learning_rate": 3.698568358563928e-06, + "loss": 0.1828, + "step": 1659 + }, + { + "epoch": 0.3740107584419973, + "grad_norm": 0.4358149797381415, + "learning_rate": 3.696933180229366e-06, + "loss": 0.1794, + "step": 1660 + }, + { + "epoch": 0.3742360661278058, + "grad_norm": 0.5094653542558862, + "learning_rate": 3.6952973372824236e-06, + "loss": 0.1984, + "step": 1661 + }, + { + "epoch": 0.3744613738136142, + "grad_norm": 0.4529112813486184, + "learning_rate": 3.6936608306314227e-06, + "loss": 0.1696, + "step": 1662 + }, + { + "epoch": 0.37468668149942264, + "grad_norm": 0.4374825819304441, + "learning_rate": 3.6920236611850557e-06, + "loss": 0.1589, + "step": 1663 + }, + { + "epoch": 0.37491198918523105, + "grad_norm": 0.4519325183402992, + "learning_rate": 3.690385829852381e-06, + "loss": 0.1841, + "step": 1664 + }, + { + "epoch": 0.3751372968710395, + "grad_norm": 0.4706429863788977, + "learning_rate": 3.6887473375428257e-06, + "loss": 0.1899, + "step": 1665 + }, + { + "epoch": 0.37536260455684795, + "grad_norm": 0.5150309169615933, + "learning_rate": 3.6871081851661825e-06, + "loss": 0.1931, + "step": 1666 + }, + { + "epoch": 0.37558791224265636, + "grad_norm": 0.43623116765370185, + "learning_rate": 3.685468373632613e-06, + "loss": 0.1803, + "step": 1667 + }, + { + "epoch": 0.37581321992846484, + "grad_norm": 0.4619009947326216, + "learning_rate": 3.6838279038526427e-06, + "loss": 0.1804, + "step": 1668 + }, + { + "epoch": 0.37603852761427325, + "grad_norm": 0.4828646878127938, + "learning_rate": 3.6821867767371634e-06, + "loss": 0.1903, + "step": 1669 + }, + { + "epoch": 0.3762638353000817, + "grad_norm": 0.45108879277512637, + "learning_rate": 3.6805449931974313e-06, + "loss": 0.1827, + "step": 1670 + }, + { + "epoch": 0.3764891429858901, + "grad_norm": 0.4964621446580229, + "learning_rate": 3.6789025541450686e-06, + "loss": 0.1857, + "step": 1671 + }, + { + "epoch": 0.37671445067169856, + "grad_norm": 0.4545173035377614, + "learning_rate": 3.67725946049206e-06, + "loss": 0.1882, + "step": 1672 + }, + { + "epoch": 0.376939758357507, + "grad_norm": 0.48636497680368257, + "learning_rate": 3.675615713150754e-06, + "loss": 0.2026, + "step": 1673 + }, + { + "epoch": 0.3771650660433154, + "grad_norm": 0.5083676792504715, + "learning_rate": 3.6739713130338617e-06, + "loss": 0.201, + "step": 1674 + }, + { + "epoch": 0.3773903737291238, + "grad_norm": 0.4776593991572466, + "learning_rate": 3.6723262610544586e-06, + "loss": 0.1975, + "step": 1675 + }, + { + "epoch": 0.3776156814149323, + "grad_norm": 0.4409366335246958, + "learning_rate": 3.6706805581259807e-06, + "loss": 0.1774, + "step": 1676 + }, + { + "epoch": 0.3778409891007407, + "grad_norm": 0.47996857699837164, + "learning_rate": 3.669034205162224e-06, + "loss": 0.181, + "step": 1677 + }, + { + "epoch": 0.3780662967865491, + "grad_norm": 0.4528780941370058, + "learning_rate": 3.6673872030773473e-06, + "loss": 0.1772, + "step": 1678 + }, + { + "epoch": 0.37829160447235755, + "grad_norm": 0.44338051593284933, + "learning_rate": 3.66573955278587e-06, + "loss": 0.1767, + "step": 1679 + }, + { + "epoch": 0.378516912158166, + "grad_norm": 0.46444737099980365, + "learning_rate": 3.664091255202672e-06, + "loss": 0.1798, + "step": 1680 + }, + { + "epoch": 0.37874221984397444, + "grad_norm": 0.48866742661704116, + "learning_rate": 3.662442311242989e-06, + "loss": 0.1867, + "step": 1681 + }, + { + "epoch": 0.37896752752978285, + "grad_norm": 0.5267588128947006, + "learning_rate": 3.66079272182242e-06, + "loss": 0.2048, + "step": 1682 + }, + { + "epoch": 0.3791928352155913, + "grad_norm": 0.4574170162725737, + "learning_rate": 3.6591424878569203e-06, + "loss": 0.1835, + "step": 1683 + }, + { + "epoch": 0.37941814290139975, + "grad_norm": 0.5065653540542215, + "learning_rate": 3.657491610262802e-06, + "loss": 0.1962, + "step": 1684 + }, + { + "epoch": 0.37964345058720816, + "grad_norm": 0.4373791181044454, + "learning_rate": 3.655840089956738e-06, + "loss": 0.1772, + "step": 1685 + }, + { + "epoch": 0.3798687582730166, + "grad_norm": 0.43962764885212496, + "learning_rate": 3.654187927855754e-06, + "loss": 0.1725, + "step": 1686 + }, + { + "epoch": 0.380094065958825, + "grad_norm": 0.4368218243898304, + "learning_rate": 3.6525351248772357e-06, + "loss": 0.1589, + "step": 1687 + }, + { + "epoch": 0.3803193736446335, + "grad_norm": 0.47956633616486494, + "learning_rate": 3.6508816819389216e-06, + "loss": 0.1857, + "step": 1688 + }, + { + "epoch": 0.3805446813304419, + "grad_norm": 0.43135975591144976, + "learning_rate": 3.6492275999589065e-06, + "loss": 0.1674, + "step": 1689 + }, + { + "epoch": 0.3807699890162503, + "grad_norm": 0.4935310600423059, + "learning_rate": 3.6475728798556426e-06, + "loss": 0.1862, + "step": 1690 + }, + { + "epoch": 0.38099529670205873, + "grad_norm": 0.5794052112710533, + "learning_rate": 3.645917522547933e-06, + "loss": 0.1935, + "step": 1691 + }, + { + "epoch": 0.3812206043878672, + "grad_norm": 0.488396059153258, + "learning_rate": 3.6442615289549354e-06, + "loss": 0.1882, + "step": 1692 + }, + { + "epoch": 0.3814459120736756, + "grad_norm": 0.4730782805862536, + "learning_rate": 3.6426048999961626e-06, + "loss": 0.1917, + "step": 1693 + }, + { + "epoch": 0.38167121975948404, + "grad_norm": 0.4503801479600874, + "learning_rate": 3.6409476365914786e-06, + "loss": 0.1799, + "step": 1694 + }, + { + "epoch": 0.38189652744529246, + "grad_norm": 0.4849548311067158, + "learning_rate": 3.6392897396610992e-06, + "loss": 0.196, + "step": 1695 + }, + { + "epoch": 0.38212183513110093, + "grad_norm": 0.4620064896567314, + "learning_rate": 3.6376312101255934e-06, + "loss": 0.1753, + "step": 1696 + }, + { + "epoch": 0.38234714281690935, + "grad_norm": 0.4996224103167475, + "learning_rate": 3.6359720489058804e-06, + "loss": 0.1787, + "step": 1697 + }, + { + "epoch": 0.38257245050271776, + "grad_norm": 0.4784723045194729, + "learning_rate": 3.6343122569232313e-06, + "loss": 0.1985, + "step": 1698 + }, + { + "epoch": 0.3827977581885262, + "grad_norm": 0.4276451648599319, + "learning_rate": 3.6326518350992657e-06, + "loss": 0.1646, + "step": 1699 + }, + { + "epoch": 0.38302306587433466, + "grad_norm": 0.457874466930729, + "learning_rate": 3.6309907843559542e-06, + "loss": 0.1793, + "step": 1700 + }, + { + "epoch": 0.3832483735601431, + "grad_norm": 0.4423491487196018, + "learning_rate": 3.6293291056156178e-06, + "loss": 0.1743, + "step": 1701 + }, + { + "epoch": 0.3834736812459515, + "grad_norm": 0.4797426694264622, + "learning_rate": 3.6276667998009242e-06, + "loss": 0.1818, + "step": 1702 + }, + { + "epoch": 0.3836989889317599, + "grad_norm": 0.479986261939842, + "learning_rate": 3.626003867834888e-06, + "loss": 0.1896, + "step": 1703 + }, + { + "epoch": 0.3839242966175684, + "grad_norm": 0.5159126597736717, + "learning_rate": 3.624340310640875e-06, + "loss": 0.198, + "step": 1704 + }, + { + "epoch": 0.3841496043033768, + "grad_norm": 0.5171034293880262, + "learning_rate": 3.6226761291425956e-06, + "loss": 0.1854, + "step": 1705 + }, + { + "epoch": 0.3843749119891852, + "grad_norm": 0.4345996229057939, + "learning_rate": 3.621011324264109e-06, + "loss": 0.183, + "step": 1706 + }, + { + "epoch": 0.38460021967499364, + "grad_norm": 0.4551189788076313, + "learning_rate": 3.6193458969298184e-06, + "loss": 0.1922, + "step": 1707 + }, + { + "epoch": 0.3848255273608021, + "grad_norm": 0.4805691605600424, + "learning_rate": 3.617679848064474e-06, + "loss": 0.1789, + "step": 1708 + }, + { + "epoch": 0.38505083504661053, + "grad_norm": 0.47021911708860864, + "learning_rate": 3.6160131785931695e-06, + "loss": 0.1851, + "step": 1709 + }, + { + "epoch": 0.38527614273241895, + "grad_norm": 0.42695261557101194, + "learning_rate": 3.6143458894413463e-06, + "loss": 0.1781, + "step": 1710 + }, + { + "epoch": 0.38550145041822736, + "grad_norm": 0.44614512105839105, + "learning_rate": 3.6126779815347863e-06, + "loss": 0.1752, + "step": 1711 + }, + { + "epoch": 0.38572675810403584, + "grad_norm": 0.4312499119447821, + "learning_rate": 3.611009455799617e-06, + "loss": 0.1666, + "step": 1712 + }, + { + "epoch": 0.38595206578984426, + "grad_norm": 0.42615709569116095, + "learning_rate": 3.609340313162309e-06, + "loss": 0.1643, + "step": 1713 + }, + { + "epoch": 0.3861773734756527, + "grad_norm": 0.5319097029021179, + "learning_rate": 3.6076705545496743e-06, + "loss": 0.1889, + "step": 1714 + }, + { + "epoch": 0.38640268116146115, + "grad_norm": 0.4189232925318139, + "learning_rate": 3.606000180888868e-06, + "loss": 0.1613, + "step": 1715 + }, + { + "epoch": 0.38662798884726957, + "grad_norm": 0.43215427318119615, + "learning_rate": 3.604329193107386e-06, + "loss": 0.1715, + "step": 1716 + }, + { + "epoch": 0.386853296533078, + "grad_norm": 0.4982241540713365, + "learning_rate": 3.6026575921330665e-06, + "loss": 0.1928, + "step": 1717 + }, + { + "epoch": 0.3870786042188864, + "grad_norm": 0.4884512321380232, + "learning_rate": 3.600985378894086e-06, + "loss": 0.1865, + "step": 1718 + }, + { + "epoch": 0.3873039119046949, + "grad_norm": 0.4627996817762917, + "learning_rate": 3.5993125543189634e-06, + "loss": 0.1846, + "step": 1719 + }, + { + "epoch": 0.3875292195905033, + "grad_norm": 0.437155607244429, + "learning_rate": 3.5976391193365544e-06, + "loss": 0.1737, + "step": 1720 + }, + { + "epoch": 0.3877545272763117, + "grad_norm": 0.48040785830439203, + "learning_rate": 3.5959650748760562e-06, + "loss": 0.1901, + "step": 1721 + }, + { + "epoch": 0.38797983496212013, + "grad_norm": 0.44289880990954594, + "learning_rate": 3.5942904218670025e-06, + "loss": 0.1811, + "step": 1722 + }, + { + "epoch": 0.3882051426479286, + "grad_norm": 0.46264309399790926, + "learning_rate": 3.592615161239267e-06, + "loss": 0.177, + "step": 1723 + }, + { + "epoch": 0.388430450333737, + "grad_norm": 0.45910416645019597, + "learning_rate": 3.590939293923058e-06, + "loss": 0.1894, + "step": 1724 + }, + { + "epoch": 0.38865575801954544, + "grad_norm": 0.4691325680673703, + "learning_rate": 3.5892628208489226e-06, + "loss": 0.177, + "step": 1725 + }, + { + "epoch": 0.38888106570535386, + "grad_norm": 0.4507877266927297, + "learning_rate": 3.5875857429477447e-06, + "loss": 0.1797, + "step": 1726 + }, + { + "epoch": 0.38910637339116233, + "grad_norm": 0.45418110404478707, + "learning_rate": 3.585908061150741e-06, + "loss": 0.1685, + "step": 1727 + }, + { + "epoch": 0.38933168107697075, + "grad_norm": 0.5176850879565124, + "learning_rate": 3.584229776389468e-06, + "loss": 0.1923, + "step": 1728 + }, + { + "epoch": 0.38955698876277917, + "grad_norm": 0.48505562511673667, + "learning_rate": 3.5825508895958143e-06, + "loss": 0.1943, + "step": 1729 + }, + { + "epoch": 0.3897822964485876, + "grad_norm": 0.46168874572450475, + "learning_rate": 3.580871401702002e-06, + "loss": 0.1906, + "step": 1730 + }, + { + "epoch": 0.39000760413439606, + "grad_norm": 0.449316342769276, + "learning_rate": 3.5791913136405883e-06, + "loss": 0.1743, + "step": 1731 + }, + { + "epoch": 0.3902329118202045, + "grad_norm": 0.45068330640945636, + "learning_rate": 3.5775106263444644e-06, + "loss": 0.1764, + "step": 1732 + }, + { + "epoch": 0.3904582195060129, + "grad_norm": 0.45574151612583014, + "learning_rate": 3.5758293407468525e-06, + "loss": 0.1681, + "step": 1733 + }, + { + "epoch": 0.3906835271918213, + "grad_norm": 0.5193773756271899, + "learning_rate": 3.5741474577813086e-06, + "loss": 0.1936, + "step": 1734 + }, + { + "epoch": 0.3909088348776298, + "grad_norm": 0.49486219105180307, + "learning_rate": 3.572464978381719e-06, + "loss": 0.1918, + "step": 1735 + }, + { + "epoch": 0.3911341425634382, + "grad_norm": 0.48462704307035126, + "learning_rate": 3.570781903482302e-06, + "loss": 0.1923, + "step": 1736 + }, + { + "epoch": 0.3913594502492466, + "grad_norm": 0.4626161664323633, + "learning_rate": 3.569098234017606e-06, + "loss": 0.1739, + "step": 1737 + }, + { + "epoch": 0.39158475793505504, + "grad_norm": 0.5097099709789032, + "learning_rate": 3.5674139709225104e-06, + "loss": 0.1899, + "step": 1738 + }, + { + "epoch": 0.3918100656208635, + "grad_norm": 0.4915100613290528, + "learning_rate": 3.565729115132224e-06, + "loss": 0.189, + "step": 1739 + }, + { + "epoch": 0.39203537330667193, + "grad_norm": 0.4615655412540974, + "learning_rate": 3.5640436675822833e-06, + "loss": 0.1761, + "step": 1740 + }, + { + "epoch": 0.39226068099248035, + "grad_norm": 0.4762740200693682, + "learning_rate": 3.5623576292085555e-06, + "loss": 0.186, + "step": 1741 + }, + { + "epoch": 0.39248598867828877, + "grad_norm": 0.46195996115651383, + "learning_rate": 3.5606710009472335e-06, + "loss": 0.1733, + "step": 1742 + }, + { + "epoch": 0.39271129636409724, + "grad_norm": 0.4936557648536029, + "learning_rate": 3.558983783734841e-06, + "loss": 0.1745, + "step": 1743 + }, + { + "epoch": 0.39293660404990566, + "grad_norm": 0.5080632703201438, + "learning_rate": 3.5572959785082264e-06, + "loss": 0.1814, + "step": 1744 + }, + { + "epoch": 0.3931619117357141, + "grad_norm": 0.4800254957284832, + "learning_rate": 3.5556075862045636e-06, + "loss": 0.1781, + "step": 1745 + }, + { + "epoch": 0.3933872194215225, + "grad_norm": 0.45038276821874035, + "learning_rate": 3.5539186077613562e-06, + "loss": 0.1686, + "step": 1746 + }, + { + "epoch": 0.39361252710733097, + "grad_norm": 0.4727107181929818, + "learning_rate": 3.552229044116428e-06, + "loss": 0.1906, + "step": 1747 + }, + { + "epoch": 0.3938378347931394, + "grad_norm": 0.45654289993725344, + "learning_rate": 3.5505388962079337e-06, + "loss": 0.1752, + "step": 1748 + }, + { + "epoch": 0.3940631424789478, + "grad_norm": 0.47010530892609387, + "learning_rate": 3.548848164974347e-06, + "loss": 0.1733, + "step": 1749 + }, + { + "epoch": 0.3942884501647562, + "grad_norm": 0.4936203713231898, + "learning_rate": 3.54715685135447e-06, + "loss": 0.2124, + "step": 1750 + }, + { + "epoch": 0.3945137578505647, + "grad_norm": 0.44567052629143, + "learning_rate": 3.545464956287425e-06, + "loss": 0.1728, + "step": 1751 + }, + { + "epoch": 0.3947390655363731, + "grad_norm": 0.442079058115891, + "learning_rate": 3.5437724807126583e-06, + "loss": 0.1723, + "step": 1752 + }, + { + "epoch": 0.39496437322218153, + "grad_norm": 0.44808990322995895, + "learning_rate": 3.542079425569938e-06, + "loss": 0.1804, + "step": 1753 + }, + { + "epoch": 0.39518968090798995, + "grad_norm": 0.44842219182277787, + "learning_rate": 3.5403857917993554e-06, + "loss": 0.1809, + "step": 1754 + }, + { + "epoch": 0.3954149885937984, + "grad_norm": 0.5125824971257542, + "learning_rate": 3.5386915803413234e-06, + "loss": 0.1935, + "step": 1755 + }, + { + "epoch": 0.39564029627960684, + "grad_norm": 0.48385631901973886, + "learning_rate": 3.5369967921365718e-06, + "loss": 0.1776, + "step": 1756 + }, + { + "epoch": 0.39586560396541526, + "grad_norm": 0.45991587267364153, + "learning_rate": 3.5353014281261545e-06, + "loss": 0.1741, + "step": 1757 + }, + { + "epoch": 0.3960909116512237, + "grad_norm": 0.5004421927098073, + "learning_rate": 3.5336054892514437e-06, + "loss": 0.1837, + "step": 1758 + }, + { + "epoch": 0.39631621933703215, + "grad_norm": 0.4739107943083216, + "learning_rate": 3.531908976454132e-06, + "loss": 0.1855, + "step": 1759 + }, + { + "epoch": 0.39654152702284057, + "grad_norm": 0.5170688905760068, + "learning_rate": 3.530211890676229e-06, + "loss": 0.1581, + "step": 1760 + }, + { + "epoch": 0.396766834708649, + "grad_norm": 0.47033415893542185, + "learning_rate": 3.528514232860063e-06, + "loss": 0.1887, + "step": 1761 + }, + { + "epoch": 0.3969921423944574, + "grad_norm": 0.47997000238448695, + "learning_rate": 3.52681600394828e-06, + "loss": 0.1903, + "step": 1762 + }, + { + "epoch": 0.3972174500802659, + "grad_norm": 0.45293907619917106, + "learning_rate": 3.525117204883844e-06, + "loss": 0.1748, + "step": 1763 + }, + { + "epoch": 0.3974427577660743, + "grad_norm": 0.5312313062281866, + "learning_rate": 3.5234178366100343e-06, + "loss": 0.1873, + "step": 1764 + }, + { + "epoch": 0.3976680654518827, + "grad_norm": 0.4666566416242577, + "learning_rate": 3.5217179000704467e-06, + "loss": 0.173, + "step": 1765 + }, + { + "epoch": 0.3978933731376912, + "grad_norm": 0.4795379722938278, + "learning_rate": 3.520017396208993e-06, + "loss": 0.1803, + "step": 1766 + }, + { + "epoch": 0.3981186808234996, + "grad_norm": 0.4715385721381925, + "learning_rate": 3.518316325969899e-06, + "loss": 0.1911, + "step": 1767 + }, + { + "epoch": 0.398343988509308, + "grad_norm": 0.5047233837366726, + "learning_rate": 3.5166146902977055e-06, + "loss": 0.1994, + "step": 1768 + }, + { + "epoch": 0.39856929619511644, + "grad_norm": 0.4928157708439929, + "learning_rate": 3.514912490137268e-06, + "loss": 0.1796, + "step": 1769 + }, + { + "epoch": 0.3987946038809249, + "grad_norm": 0.5160605616588393, + "learning_rate": 3.5132097264337546e-06, + "loss": 0.1908, + "step": 1770 + }, + { + "epoch": 0.39901991156673333, + "grad_norm": 0.461502157301641, + "learning_rate": 3.5115064001326467e-06, + "loss": 0.1842, + "step": 1771 + }, + { + "epoch": 0.39924521925254175, + "grad_norm": 0.4758384646970424, + "learning_rate": 3.5098025121797375e-06, + "loss": 0.1807, + "step": 1772 + }, + { + "epoch": 0.39947052693835017, + "grad_norm": 0.4467448757048903, + "learning_rate": 3.508098063521134e-06, + "loss": 0.1835, + "step": 1773 + }, + { + "epoch": 0.39969583462415864, + "grad_norm": 0.4932059630689324, + "learning_rate": 3.5063930551032494e-06, + "loss": 0.1859, + "step": 1774 + }, + { + "epoch": 0.39992114230996706, + "grad_norm": 0.46990964137050534, + "learning_rate": 3.504687487872815e-06, + "loss": 0.1743, + "step": 1775 + }, + { + "epoch": 0.4001464499957755, + "grad_norm": 0.5263456065587255, + "learning_rate": 3.5029813627768665e-06, + "loss": 0.2028, + "step": 1776 + }, + { + "epoch": 0.4003717576815839, + "grad_norm": 0.4594464299475304, + "learning_rate": 3.501274680762753e-06, + "loss": 0.1686, + "step": 1777 + }, + { + "epoch": 0.40059706536739237, + "grad_norm": 0.4519488401355369, + "learning_rate": 3.499567442778131e-06, + "loss": 0.1797, + "step": 1778 + }, + { + "epoch": 0.4008223730532008, + "grad_norm": 0.49797718256587875, + "learning_rate": 3.497859649770965e-06, + "loss": 0.1905, + "step": 1779 + }, + { + "epoch": 0.4010476807390092, + "grad_norm": 1.1010435863911125, + "learning_rate": 3.49615130268953e-06, + "loss": 0.1747, + "step": 1780 + }, + { + "epoch": 0.4012729884248176, + "grad_norm": 0.4777672782591009, + "learning_rate": 3.494442402482407e-06, + "loss": 0.1897, + "step": 1781 + }, + { + "epoch": 0.4014982961106261, + "grad_norm": 0.4331619119317055, + "learning_rate": 3.4927329500984857e-06, + "loss": 0.1723, + "step": 1782 + }, + { + "epoch": 0.4017236037964345, + "grad_norm": 0.5069377338073053, + "learning_rate": 3.4910229464869594e-06, + "loss": 0.1932, + "step": 1783 + }, + { + "epoch": 0.40194891148224293, + "grad_norm": 0.4432660250189479, + "learning_rate": 3.489312392597331e-06, + "loss": 0.179, + "step": 1784 + }, + { + "epoch": 0.40217421916805135, + "grad_norm": 0.5138013973552193, + "learning_rate": 3.4876012893794053e-06, + "loss": 0.2077, + "step": 1785 + }, + { + "epoch": 0.4023995268538598, + "grad_norm": 0.4516540040109276, + "learning_rate": 3.4858896377832966e-06, + "loss": 0.1564, + "step": 1786 + }, + { + "epoch": 0.40262483453966824, + "grad_norm": 0.49225242326044605, + "learning_rate": 3.4841774387594202e-06, + "loss": 0.2001, + "step": 1787 + }, + { + "epoch": 0.40285014222547666, + "grad_norm": 0.4385703152066493, + "learning_rate": 3.482464693258496e-06, + "loss": 0.1682, + "step": 1788 + }, + { + "epoch": 0.4030754499112851, + "grad_norm": 0.4821003550005217, + "learning_rate": 3.4807514022315473e-06, + "loss": 0.1801, + "step": 1789 + }, + { + "epoch": 0.40330075759709355, + "grad_norm": 0.44991621730993836, + "learning_rate": 3.4790375666299026e-06, + "loss": 0.1711, + "step": 1790 + }, + { + "epoch": 0.40352606528290197, + "grad_norm": 0.4655382860947815, + "learning_rate": 3.4773231874051893e-06, + "loss": 0.1902, + "step": 1791 + }, + { + "epoch": 0.4037513729687104, + "grad_norm": 0.46813425787649005, + "learning_rate": 3.4756082655093387e-06, + "loss": 0.1849, + "step": 1792 + }, + { + "epoch": 0.4039766806545188, + "grad_norm": 0.4617698815983265, + "learning_rate": 3.4738928018945828e-06, + "loss": 0.1754, + "step": 1793 + }, + { + "epoch": 0.4042019883403273, + "grad_norm": 0.4449334800217574, + "learning_rate": 3.4721767975134557e-06, + "loss": 0.1718, + "step": 1794 + }, + { + "epoch": 0.4044272960261357, + "grad_norm": 0.4482495297215526, + "learning_rate": 3.470460253318789e-06, + "loss": 0.1808, + "step": 1795 + }, + { + "epoch": 0.4046526037119441, + "grad_norm": 0.47738372044213667, + "learning_rate": 3.4687431702637165e-06, + "loss": 0.1871, + "step": 1796 + }, + { + "epoch": 0.40487791139775253, + "grad_norm": 0.5002635665361638, + "learning_rate": 3.4670255493016715e-06, + "loss": 0.1967, + "step": 1797 + }, + { + "epoch": 0.405103219083561, + "grad_norm": 0.43613677561297415, + "learning_rate": 3.465307391386383e-06, + "loss": 0.1568, + "step": 1798 + }, + { + "epoch": 0.4053285267693694, + "grad_norm": 0.47724515292779823, + "learning_rate": 3.4635886974718814e-06, + "loss": 0.1897, + "step": 1799 + }, + { + "epoch": 0.40555383445517784, + "grad_norm": 0.44533175466657154, + "learning_rate": 3.4618694685124927e-06, + "loss": 0.1723, + "step": 1800 + }, + { + "epoch": 0.40577914214098626, + "grad_norm": 0.47893363637759345, + "learning_rate": 3.4601497054628407e-06, + "loss": 0.1946, + "step": 1801 + }, + { + "epoch": 0.40600444982679473, + "grad_norm": 0.47607716250094795, + "learning_rate": 3.458429409277846e-06, + "loss": 0.1813, + "step": 1802 + }, + { + "epoch": 0.40622975751260315, + "grad_norm": 0.4533437781704823, + "learning_rate": 3.4567085809127247e-06, + "loss": 0.1785, + "step": 1803 + }, + { + "epoch": 0.40645506519841157, + "grad_norm": 0.47250365903990904, + "learning_rate": 3.454987221322989e-06, + "loss": 0.1896, + "step": 1804 + }, + { + "epoch": 0.40668037288422, + "grad_norm": 0.4784406821542634, + "learning_rate": 3.4532653314644453e-06, + "loss": 0.1905, + "step": 1805 + }, + { + "epoch": 0.40690568057002846, + "grad_norm": 0.44029147176840283, + "learning_rate": 3.4515429122931955e-06, + "loss": 0.1713, + "step": 1806 + }, + { + "epoch": 0.4071309882558369, + "grad_norm": 0.4404482784311822, + "learning_rate": 3.4498199647656335e-06, + "loss": 0.1676, + "step": 1807 + }, + { + "epoch": 0.4073562959416453, + "grad_norm": 0.46201379255110625, + "learning_rate": 3.4480964898384495e-06, + "loss": 0.1766, + "step": 1808 + }, + { + "epoch": 0.4075816036274537, + "grad_norm": 0.4929345791980121, + "learning_rate": 3.4463724884686234e-06, + "loss": 0.1989, + "step": 1809 + }, + { + "epoch": 0.4078069113132622, + "grad_norm": 0.44427484365517794, + "learning_rate": 3.44464796161343e-06, + "loss": 0.1722, + "step": 1810 + }, + { + "epoch": 0.4080322189990706, + "grad_norm": 0.4851393458136704, + "learning_rate": 3.4429229102304336e-06, + "loss": 0.1974, + "step": 1811 + }, + { + "epoch": 0.408257526684879, + "grad_norm": 0.4940102051780381, + "learning_rate": 3.4411973352774917e-06, + "loss": 0.1898, + "step": 1812 + }, + { + "epoch": 0.4084828343706875, + "grad_norm": 0.43337033578454826, + "learning_rate": 3.4394712377127524e-06, + "loss": 0.1798, + "step": 1813 + }, + { + "epoch": 0.4087081420564959, + "grad_norm": 0.46457828175376203, + "learning_rate": 3.437744618494653e-06, + "loss": 0.1826, + "step": 1814 + }, + { + "epoch": 0.40893344974230433, + "grad_norm": 0.4658044274149683, + "learning_rate": 3.4360174785819196e-06, + "loss": 0.1721, + "step": 1815 + }, + { + "epoch": 0.40915875742811275, + "grad_norm": 0.46371938881929187, + "learning_rate": 3.4342898189335692e-06, + "loss": 0.1783, + "step": 1816 + }, + { + "epoch": 0.4093840651139212, + "grad_norm": 0.42314240549223686, + "learning_rate": 3.432561640508908e-06, + "loss": 0.1625, + "step": 1817 + }, + { + "epoch": 0.40960937279972964, + "grad_norm": 0.4595803423587687, + "learning_rate": 3.4308329442675276e-06, + "loss": 0.1712, + "step": 1818 + }, + { + "epoch": 0.40983468048553806, + "grad_norm": 0.407295397279368, + "learning_rate": 3.4291037311693088e-06, + "loss": 0.156, + "step": 1819 + }, + { + "epoch": 0.4100599881713465, + "grad_norm": 0.4540024922591821, + "learning_rate": 3.42737400217442e-06, + "loss": 0.1813, + "step": 1820 + }, + { + "epoch": 0.41028529585715495, + "grad_norm": 0.44136741108813465, + "learning_rate": 3.4256437582433144e-06, + "loss": 0.1779, + "step": 1821 + }, + { + "epoch": 0.41051060354296337, + "grad_norm": 0.45029904329698645, + "learning_rate": 3.423913000336732e-06, + "loss": 0.1813, + "step": 1822 + }, + { + "epoch": 0.4107359112287718, + "grad_norm": 0.42857123226543636, + "learning_rate": 3.422181729415699e-06, + "loss": 0.1602, + "step": 1823 + }, + { + "epoch": 0.4109612189145802, + "grad_norm": 0.4760695080027104, + "learning_rate": 3.4204499464415253e-06, + "loss": 0.1845, + "step": 1824 + }, + { + "epoch": 0.4111865266003887, + "grad_norm": 0.4188961679773126, + "learning_rate": 3.418717652375805e-06, + "loss": 0.1565, + "step": 1825 + }, + { + "epoch": 0.4114118342861971, + "grad_norm": 0.4576745254531164, + "learning_rate": 3.4169848481804165e-06, + "loss": 0.1874, + "step": 1826 + }, + { + "epoch": 0.4116371419720055, + "grad_norm": 0.445517805246202, + "learning_rate": 3.415251534817521e-06, + "loss": 0.1742, + "step": 1827 + }, + { + "epoch": 0.41186244965781393, + "grad_norm": 0.48390667796691295, + "learning_rate": 3.4135177132495632e-06, + "loss": 0.1835, + "step": 1828 + }, + { + "epoch": 0.4120877573436224, + "grad_norm": 0.495938929248875, + "learning_rate": 3.4117833844392704e-06, + "loss": 0.1938, + "step": 1829 + }, + { + "epoch": 0.4123130650294308, + "grad_norm": 0.4576571278023855, + "learning_rate": 3.41004854934965e-06, + "loss": 0.1748, + "step": 1830 + }, + { + "epoch": 0.41253837271523924, + "grad_norm": 0.45975740340892945, + "learning_rate": 3.4083132089439912e-06, + "loss": 0.1924, + "step": 1831 + }, + { + "epoch": 0.41276368040104766, + "grad_norm": 0.460007599399347, + "learning_rate": 3.406577364185864e-06, + "loss": 0.1852, + "step": 1832 + }, + { + "epoch": 0.41298898808685613, + "grad_norm": 0.4860135022327305, + "learning_rate": 3.404841016039118e-06, + "loss": 0.1944, + "step": 1833 + }, + { + "epoch": 0.41321429577266455, + "grad_norm": 0.4940524481840105, + "learning_rate": 3.403104165467883e-06, + "loss": 0.1962, + "step": 1834 + }, + { + "epoch": 0.41343960345847297, + "grad_norm": 0.46017432964086324, + "learning_rate": 3.4013668134365675e-06, + "loss": 0.1708, + "step": 1835 + }, + { + "epoch": 0.4136649111442814, + "grad_norm": 0.4851577136583611, + "learning_rate": 3.399628960909857e-06, + "loss": 0.1825, + "step": 1836 + }, + { + "epoch": 0.41389021883008986, + "grad_norm": 0.474603088858992, + "learning_rate": 3.397890608852718e-06, + "loss": 0.1884, + "step": 1837 + }, + { + "epoch": 0.4141155265158983, + "grad_norm": 0.4467874158129823, + "learning_rate": 3.3961517582303916e-06, + "loss": 0.1792, + "step": 1838 + }, + { + "epoch": 0.4143408342017067, + "grad_norm": 0.4816070010880334, + "learning_rate": 3.394412410008397e-06, + "loss": 0.1866, + "step": 1839 + }, + { + "epoch": 0.4145661418875151, + "grad_norm": 0.4737267326237205, + "learning_rate": 3.39267256515253e-06, + "loss": 0.1847, + "step": 1840 + }, + { + "epoch": 0.4147914495733236, + "grad_norm": 0.43763287158954123, + "learning_rate": 3.3909322246288606e-06, + "loss": 0.1506, + "step": 1841 + }, + { + "epoch": 0.415016757259132, + "grad_norm": 0.4504182643927509, + "learning_rate": 3.3891913894037354e-06, + "loss": 0.1872, + "step": 1842 + }, + { + "epoch": 0.4152420649449404, + "grad_norm": 0.48082376041128666, + "learning_rate": 3.3874500604437752e-06, + "loss": 0.1776, + "step": 1843 + }, + { + "epoch": 0.41546737263074884, + "grad_norm": 0.4626636382448201, + "learning_rate": 3.385708238715876e-06, + "loss": 0.1801, + "step": 1844 + }, + { + "epoch": 0.4156926803165573, + "grad_norm": 0.44116414561370704, + "learning_rate": 3.3839659251872054e-06, + "loss": 0.1668, + "step": 1845 + }, + { + "epoch": 0.41591798800236573, + "grad_norm": 0.45393816315075275, + "learning_rate": 3.3822231208252053e-06, + "loss": 0.1744, + "step": 1846 + }, + { + "epoch": 0.41614329568817415, + "grad_norm": 0.4869906362883922, + "learning_rate": 3.38047982659759e-06, + "loss": 0.1977, + "step": 1847 + }, + { + "epoch": 0.41636860337398257, + "grad_norm": 0.47707525102918136, + "learning_rate": 3.3787360434723466e-06, + "loss": 0.1866, + "step": 1848 + }, + { + "epoch": 0.41659391105979104, + "grad_norm": 0.5010971798238762, + "learning_rate": 3.3769917724177315e-06, + "loss": 0.1937, + "step": 1849 + }, + { + "epoch": 0.41681921874559946, + "grad_norm": 0.4522776413417502, + "learning_rate": 3.3752470144022745e-06, + "loss": 0.1773, + "step": 1850 + }, + { + "epoch": 0.4170445264314079, + "grad_norm": 0.4789831792383779, + "learning_rate": 3.3735017703947748e-06, + "loss": 0.195, + "step": 1851 + }, + { + "epoch": 0.4172698341172163, + "grad_norm": 0.4554662735888735, + "learning_rate": 3.371756041364301e-06, + "loss": 0.174, + "step": 1852 + }, + { + "epoch": 0.41749514180302477, + "grad_norm": 0.4597410514735308, + "learning_rate": 3.370009828280191e-06, + "loss": 0.1702, + "step": 1853 + }, + { + "epoch": 0.4177204494888332, + "grad_norm": 0.4676505229144349, + "learning_rate": 3.3682631321120507e-06, + "loss": 0.1933, + "step": 1854 + }, + { + "epoch": 0.4179457571746416, + "grad_norm": 0.4959557268224456, + "learning_rate": 3.366515953829758e-06, + "loss": 0.2151, + "step": 1855 + }, + { + "epoch": 0.41817106486045, + "grad_norm": 0.4868647994731083, + "learning_rate": 3.364768294403455e-06, + "loss": 0.1903, + "step": 1856 + }, + { + "epoch": 0.4183963725462585, + "grad_norm": 0.46354105181526595, + "learning_rate": 3.3630201548035512e-06, + "loss": 0.1876, + "step": 1857 + }, + { + "epoch": 0.4186216802320669, + "grad_norm": 0.5400242386491048, + "learning_rate": 3.361271536000723e-06, + "loss": 0.1897, + "step": 1858 + }, + { + "epoch": 0.41884698791787534, + "grad_norm": 0.4674379464695428, + "learning_rate": 3.359522438965915e-06, + "loss": 0.1848, + "step": 1859 + }, + { + "epoch": 0.41907229560368375, + "grad_norm": 0.45497390315226255, + "learning_rate": 3.3577728646703335e-06, + "loss": 0.1653, + "step": 1860 + }, + { + "epoch": 0.4192976032894922, + "grad_norm": 0.4876915477912796, + "learning_rate": 3.3560228140854534e-06, + "loss": 0.185, + "step": 1861 + }, + { + "epoch": 0.41952291097530064, + "grad_norm": 0.46865961848797727, + "learning_rate": 3.354272288183012e-06, + "loss": 0.1839, + "step": 1862 + }, + { + "epoch": 0.41974821866110906, + "grad_norm": 0.48706750823282474, + "learning_rate": 3.352521287935011e-06, + "loss": 0.1629, + "step": 1863 + }, + { + "epoch": 0.41997352634691754, + "grad_norm": 0.4636544732808607, + "learning_rate": 3.3507698143137157e-06, + "loss": 0.1722, + "step": 1864 + }, + { + "epoch": 0.42019883403272595, + "grad_norm": 0.4263617210818012, + "learning_rate": 3.3490178682916534e-06, + "loss": 0.1565, + "step": 1865 + }, + { + "epoch": 0.42042414171853437, + "grad_norm": 0.4337634547218228, + "learning_rate": 3.3472654508416157e-06, + "loss": 0.1816, + "step": 1866 + }, + { + "epoch": 0.4206494494043428, + "grad_norm": 0.4834387527015694, + "learning_rate": 3.3455125629366546e-06, + "loss": 0.1797, + "step": 1867 + }, + { + "epoch": 0.42087475709015126, + "grad_norm": 0.45189191805795714, + "learning_rate": 3.3437592055500825e-06, + "loss": 0.1837, + "step": 1868 + }, + { + "epoch": 0.4211000647759597, + "grad_norm": 0.47943136238688605, + "learning_rate": 3.342005379655474e-06, + "loss": 0.1776, + "step": 1869 + }, + { + "epoch": 0.4213253724617681, + "grad_norm": 0.47521820765926304, + "learning_rate": 3.340251086226663e-06, + "loss": 0.1772, + "step": 1870 + }, + { + "epoch": 0.4215506801475765, + "grad_norm": 0.46803833955158985, + "learning_rate": 3.3384963262377434e-06, + "loss": 0.1839, + "step": 1871 + }, + { + "epoch": 0.421775987833385, + "grad_norm": 0.48054560809998237, + "learning_rate": 3.3367411006630677e-06, + "loss": 0.1879, + "step": 1872 + }, + { + "epoch": 0.4220012955191934, + "grad_norm": 0.5026466545633729, + "learning_rate": 3.3349854104772476e-06, + "loss": 0.1943, + "step": 1873 + }, + { + "epoch": 0.4222266032050018, + "grad_norm": 0.4339212772543207, + "learning_rate": 3.333229256655153e-06, + "loss": 0.1657, + "step": 1874 + }, + { + "epoch": 0.42245191089081024, + "grad_norm": 0.46471631615927744, + "learning_rate": 3.3314726401719088e-06, + "loss": 0.1726, + "step": 1875 + }, + { + "epoch": 0.4226772185766187, + "grad_norm": 0.4467346294732029, + "learning_rate": 3.3297155620029e-06, + "loss": 0.1682, + "step": 1876 + }, + { + "epoch": 0.42290252626242714, + "grad_norm": 0.5215087099623767, + "learning_rate": 3.3279580231237664e-06, + "loss": 0.1749, + "step": 1877 + }, + { + "epoch": 0.42312783394823555, + "grad_norm": 0.49879193692840385, + "learning_rate": 3.326200024510405e-06, + "loss": 0.1734, + "step": 1878 + }, + { + "epoch": 0.423353141634044, + "grad_norm": 0.4497194733748111, + "learning_rate": 3.324441567138965e-06, + "loss": 0.1802, + "step": 1879 + }, + { + "epoch": 0.42357844931985245, + "grad_norm": 0.43573937645055766, + "learning_rate": 3.3226826519858526e-06, + "loss": 0.1663, + "step": 1880 + }, + { + "epoch": 0.42380375700566086, + "grad_norm": 0.46733973441345816, + "learning_rate": 3.320923280027728e-06, + "loss": 0.1805, + "step": 1881 + }, + { + "epoch": 0.4240290646914693, + "grad_norm": 0.4687777797298531, + "learning_rate": 3.3191634522415064e-06, + "loss": 0.1633, + "step": 1882 + }, + { + "epoch": 0.4242543723772777, + "grad_norm": 0.4517047159489899, + "learning_rate": 3.317403169604352e-06, + "loss": 0.1788, + "step": 1883 + }, + { + "epoch": 0.4244796800630862, + "grad_norm": 0.41223841849685455, + "learning_rate": 3.315642433093686e-06, + "loss": 0.1577, + "step": 1884 + }, + { + "epoch": 0.4247049877488946, + "grad_norm": 0.43624828229160506, + "learning_rate": 3.313881243687179e-06, + "loss": 0.1745, + "step": 1885 + }, + { + "epoch": 0.424930295434703, + "grad_norm": 0.47305528462567104, + "learning_rate": 3.3121196023627543e-06, + "loss": 0.1932, + "step": 1886 + }, + { + "epoch": 0.4251556031205114, + "grad_norm": 0.46077550439216297, + "learning_rate": 3.3103575100985852e-06, + "loss": 0.1755, + "step": 1887 + }, + { + "epoch": 0.4253809108063199, + "grad_norm": 0.43410918819058486, + "learning_rate": 3.3085949678730953e-06, + "loss": 0.1547, + "step": 1888 + }, + { + "epoch": 0.4256062184921283, + "grad_norm": 0.45100278951175565, + "learning_rate": 3.3068319766649605e-06, + "loss": 0.1594, + "step": 1889 + }, + { + "epoch": 0.42583152617793674, + "grad_norm": 0.474047407061855, + "learning_rate": 3.305068537453102e-06, + "loss": 0.1866, + "step": 1890 + }, + { + "epoch": 0.42605683386374515, + "grad_norm": 0.4505462134351408, + "learning_rate": 3.303304651216693e-06, + "loss": 0.172, + "step": 1891 + }, + { + "epoch": 0.42628214154955363, + "grad_norm": 0.4741991261024551, + "learning_rate": 3.3015403189351536e-06, + "loss": 0.1905, + "step": 1892 + }, + { + "epoch": 0.42650744923536205, + "grad_norm": 0.45226088553567023, + "learning_rate": 3.2997755415881516e-06, + "loss": 0.1688, + "step": 1893 + }, + { + "epoch": 0.42673275692117046, + "grad_norm": 0.490451794531464, + "learning_rate": 3.2980103201556023e-06, + "loss": 0.1852, + "step": 1894 + }, + { + "epoch": 0.4269580646069789, + "grad_norm": 0.4791149934491423, + "learning_rate": 3.2962446556176676e-06, + "loss": 0.1894, + "step": 1895 + }, + { + "epoch": 0.42718337229278736, + "grad_norm": 0.4783115380617043, + "learning_rate": 3.2944785489547544e-06, + "loss": 0.1726, + "step": 1896 + }, + { + "epoch": 0.4274086799785958, + "grad_norm": 0.4448038280249334, + "learning_rate": 3.2927120011475168e-06, + "loss": 0.1767, + "step": 1897 + }, + { + "epoch": 0.4276339876644042, + "grad_norm": 0.46940097462866526, + "learning_rate": 3.290945013176852e-06, + "loss": 0.1816, + "step": 1898 + }, + { + "epoch": 0.4278592953502126, + "grad_norm": 0.42454240362971507, + "learning_rate": 3.2891775860239033e-06, + "loss": 0.1623, + "step": 1899 + }, + { + "epoch": 0.4280846030360211, + "grad_norm": 0.4989505080480273, + "learning_rate": 3.2874097206700566e-06, + "loss": 0.1974, + "step": 1900 + }, + { + "epoch": 0.4283099107218295, + "grad_norm": 0.4892314076492719, + "learning_rate": 3.285641418096942e-06, + "loss": 0.1956, + "step": 1901 + }, + { + "epoch": 0.4285352184076379, + "grad_norm": 0.4550020891020824, + "learning_rate": 3.2838726792864315e-06, + "loss": 0.1726, + "step": 1902 + }, + { + "epoch": 0.42876052609344634, + "grad_norm": 0.45129015690765056, + "learning_rate": 3.2821035052206413e-06, + "loss": 0.1806, + "step": 1903 + }, + { + "epoch": 0.4289858337792548, + "grad_norm": 0.4460573927020872, + "learning_rate": 3.2803338968819264e-06, + "loss": 0.1685, + "step": 1904 + }, + { + "epoch": 0.42921114146506323, + "grad_norm": 0.49346248106990415, + "learning_rate": 3.278563855252885e-06, + "loss": 0.1877, + "step": 1905 + }, + { + "epoch": 0.42943644915087165, + "grad_norm": 0.4379202919875662, + "learning_rate": 3.2767933813163542e-06, + "loss": 0.1619, + "step": 1906 + }, + { + "epoch": 0.42966175683668006, + "grad_norm": 0.435340796582951, + "learning_rate": 3.2750224760554135e-06, + "loss": 0.1776, + "step": 1907 + }, + { + "epoch": 0.42988706452248854, + "grad_norm": 0.4662588960082282, + "learning_rate": 3.2732511404533797e-06, + "loss": 0.1763, + "step": 1908 + }, + { + "epoch": 0.43011237220829696, + "grad_norm": 0.5042080234916956, + "learning_rate": 3.2714793754938102e-06, + "loss": 0.1955, + "step": 1909 + }, + { + "epoch": 0.4303376798941054, + "grad_norm": 0.4724957514713676, + "learning_rate": 3.2697071821604986e-06, + "loss": 0.1906, + "step": 1910 + }, + { + "epoch": 0.43056298757991385, + "grad_norm": 0.4875054834104178, + "learning_rate": 3.2679345614374802e-06, + "loss": 0.1934, + "step": 1911 + }, + { + "epoch": 0.43078829526572227, + "grad_norm": 0.4484132832693631, + "learning_rate": 3.266161514309023e-06, + "loss": 0.1768, + "step": 1912 + }, + { + "epoch": 0.4310136029515307, + "grad_norm": 0.48139768592305, + "learning_rate": 3.264388041759635e-06, + "loss": 0.1888, + "step": 1913 + }, + { + "epoch": 0.4312389106373391, + "grad_norm": 0.4620765036595367, + "learning_rate": 3.262614144774059e-06, + "loss": 0.1897, + "step": 1914 + }, + { + "epoch": 0.4314642183231476, + "grad_norm": 0.4373550750157816, + "learning_rate": 3.260839824337274e-06, + "loss": 0.1568, + "step": 1915 + }, + { + "epoch": 0.431689526008956, + "grad_norm": 0.4668638173723568, + "learning_rate": 3.259065081434495e-06, + "loss": 0.1681, + "step": 1916 + }, + { + "epoch": 0.4319148336947644, + "grad_norm": 0.4813808830289284, + "learning_rate": 3.2572899170511683e-06, + "loss": 0.184, + "step": 1917 + }, + { + "epoch": 0.43214014138057283, + "grad_norm": 0.4982601206967862, + "learning_rate": 3.255514332172979e-06, + "loss": 0.1895, + "step": 1918 + }, + { + "epoch": 0.4323654490663813, + "grad_norm": 0.44820539920343766, + "learning_rate": 3.2537383277858413e-06, + "loss": 0.1664, + "step": 1919 + }, + { + "epoch": 0.4325907567521897, + "grad_norm": 0.4785606317406842, + "learning_rate": 3.2519619048759056e-06, + "loss": 0.172, + "step": 1920 + }, + { + "epoch": 0.43281606443799814, + "grad_norm": 0.4781792913368394, + "learning_rate": 3.250185064429552e-06, + "loss": 0.189, + "step": 1921 + }, + { + "epoch": 0.43304137212380656, + "grad_norm": 0.4672471795981698, + "learning_rate": 3.248407807433396e-06, + "loss": 0.1892, + "step": 1922 + }, + { + "epoch": 0.43326667980961503, + "grad_norm": 0.49126960349413656, + "learning_rate": 3.246630134874279e-06, + "loss": 0.1786, + "step": 1923 + }, + { + "epoch": 0.43349198749542345, + "grad_norm": 0.4828165704960671, + "learning_rate": 3.2448520477392788e-06, + "loss": 0.1965, + "step": 1924 + }, + { + "epoch": 0.43371729518123187, + "grad_norm": 0.4532078207693903, + "learning_rate": 3.2430735470157e-06, + "loss": 0.1727, + "step": 1925 + }, + { + "epoch": 0.4339426028670403, + "grad_norm": 0.48683543433707893, + "learning_rate": 3.2412946336910778e-06, + "loss": 0.1974, + "step": 1926 + }, + { + "epoch": 0.43416791055284876, + "grad_norm": 0.4839888232752711, + "learning_rate": 3.2395153087531767e-06, + "loss": 0.2057, + "step": 1927 + }, + { + "epoch": 0.4343932182386572, + "grad_norm": 0.47885189453023336, + "learning_rate": 3.237735573189989e-06, + "loss": 0.1885, + "step": 1928 + }, + { + "epoch": 0.4346185259244656, + "grad_norm": 0.41107545137798746, + "learning_rate": 3.2359554279897353e-06, + "loss": 0.1557, + "step": 1929 + }, + { + "epoch": 0.434843833610274, + "grad_norm": 0.48394813377811147, + "learning_rate": 3.234174874140866e-06, + "loss": 0.1813, + "step": 1930 + }, + { + "epoch": 0.4350691412960825, + "grad_norm": 0.4554396549603301, + "learning_rate": 3.232393912632054e-06, + "loss": 0.1698, + "step": 1931 + }, + { + "epoch": 0.4352944489818909, + "grad_norm": 0.46105423896888936, + "learning_rate": 3.230612544452202e-06, + "loss": 0.1823, + "step": 1932 + }, + { + "epoch": 0.4355197566676993, + "grad_norm": 0.45647901218370346, + "learning_rate": 3.228830770590436e-06, + "loss": 0.1725, + "step": 1933 + }, + { + "epoch": 0.43574506435350774, + "grad_norm": 0.46074326507878804, + "learning_rate": 3.2270485920361093e-06, + "loss": 0.1752, + "step": 1934 + }, + { + "epoch": 0.4359703720393162, + "grad_norm": 0.47191570778688724, + "learning_rate": 3.2252660097788003e-06, + "loss": 0.1917, + "step": 1935 + }, + { + "epoch": 0.43619567972512463, + "grad_norm": 0.4800079066452369, + "learning_rate": 3.2234830248083095e-06, + "loss": 0.1774, + "step": 1936 + }, + { + "epoch": 0.43642098741093305, + "grad_norm": 0.439931881162112, + "learning_rate": 3.2216996381146613e-06, + "loss": 0.1687, + "step": 1937 + }, + { + "epoch": 0.43664629509674147, + "grad_norm": 0.4140627966262883, + "learning_rate": 3.219915850688106e-06, + "loss": 0.1516, + "step": 1938 + }, + { + "epoch": 0.43687160278254994, + "grad_norm": 0.5625837186267276, + "learning_rate": 3.2181316635191125e-06, + "loss": 0.1751, + "step": 1939 + }, + { + "epoch": 0.43709691046835836, + "grad_norm": 0.4898808241462336, + "learning_rate": 3.2163470775983733e-06, + "loss": 0.1668, + "step": 1940 + }, + { + "epoch": 0.4373222181541668, + "grad_norm": 0.4670591076304303, + "learning_rate": 3.2145620939168036e-06, + "loss": 0.1837, + "step": 1941 + }, + { + "epoch": 0.4375475258399752, + "grad_norm": 0.45126123741430546, + "learning_rate": 3.2127767134655374e-06, + "loss": 0.18, + "step": 1942 + }, + { + "epoch": 0.43777283352578367, + "grad_norm": 0.47241349089531304, + "learning_rate": 3.210990937235931e-06, + "loss": 0.1665, + "step": 1943 + }, + { + "epoch": 0.4379981412115921, + "grad_norm": 0.46437755723792035, + "learning_rate": 3.209204766219558e-06, + "loss": 0.1899, + "step": 1944 + }, + { + "epoch": 0.4382234488974005, + "grad_norm": 0.44545288592439225, + "learning_rate": 3.207418201408213e-06, + "loss": 0.1666, + "step": 1945 + }, + { + "epoch": 0.4384487565832089, + "grad_norm": 0.45637762564604695, + "learning_rate": 3.205631243793909e-06, + "loss": 0.1715, + "step": 1946 + }, + { + "epoch": 0.4386740642690174, + "grad_norm": 0.4816136687309396, + "learning_rate": 3.2038438943688777e-06, + "loss": 0.183, + "step": 1947 + }, + { + "epoch": 0.4388993719548258, + "grad_norm": 0.4766323054491745, + "learning_rate": 3.202056154125567e-06, + "loss": 0.1814, + "step": 1948 + }, + { + "epoch": 0.43912467964063423, + "grad_norm": 0.4313888561912964, + "learning_rate": 3.2002680240566412e-06, + "loss": 0.1685, + "step": 1949 + }, + { + "epoch": 0.43934998732644265, + "grad_norm": 0.45951720466930723, + "learning_rate": 3.198479505154984e-06, + "loss": 0.1864, + "step": 1950 + }, + { + "epoch": 0.4395752950122511, + "grad_norm": 0.4701803598742908, + "learning_rate": 3.1966905984136932e-06, + "loss": 0.184, + "step": 1951 + }, + { + "epoch": 0.43980060269805954, + "grad_norm": 0.5009703394326628, + "learning_rate": 3.1949013048260813e-06, + "loss": 0.1939, + "step": 1952 + }, + { + "epoch": 0.44002591038386796, + "grad_norm": 0.46197548368518165, + "learning_rate": 3.1931116253856762e-06, + "loss": 0.1777, + "step": 1953 + }, + { + "epoch": 0.4402512180696764, + "grad_norm": 0.470849326635837, + "learning_rate": 3.1913215610862208e-06, + "loss": 0.1895, + "step": 1954 + }, + { + "epoch": 0.44047652575548485, + "grad_norm": 0.4579316958072282, + "learning_rate": 3.189531112921671e-06, + "loss": 0.1766, + "step": 1955 + }, + { + "epoch": 0.44070183344129327, + "grad_norm": 0.5084703409072364, + "learning_rate": 3.1877402818861954e-06, + "loss": 0.1509, + "step": 1956 + }, + { + "epoch": 0.4409271411271017, + "grad_norm": 0.47104056884408985, + "learning_rate": 3.185949068974177e-06, + "loss": 0.1775, + "step": 1957 + }, + { + "epoch": 0.4411524488129101, + "grad_norm": 0.4427657864484733, + "learning_rate": 3.184157475180208e-06, + "loss": 0.1713, + "step": 1958 + }, + { + "epoch": 0.4413777564987186, + "grad_norm": 0.5019576086082366, + "learning_rate": 3.1823655014990937e-06, + "loss": 0.1805, + "step": 1959 + }, + { + "epoch": 0.441603064184527, + "grad_norm": 0.47412604448722473, + "learning_rate": 3.1805731489258516e-06, + "loss": 0.1703, + "step": 1960 + }, + { + "epoch": 0.4418283718703354, + "grad_norm": 0.4918090693178185, + "learning_rate": 3.1787804184557074e-06, + "loss": 0.1644, + "step": 1961 + }, + { + "epoch": 0.4420536795561439, + "grad_norm": 0.49026825479506464, + "learning_rate": 3.1769873110840977e-06, + "loss": 0.1741, + "step": 1962 + }, + { + "epoch": 0.4422789872419523, + "grad_norm": 0.46708437359273897, + "learning_rate": 3.1751938278066687e-06, + "loss": 0.1747, + "step": 1963 + }, + { + "epoch": 0.4425042949277607, + "grad_norm": 0.4736507627745363, + "learning_rate": 3.1733999696192736e-06, + "loss": 0.177, + "step": 1964 + }, + { + "epoch": 0.44272960261356914, + "grad_norm": 0.502477558680681, + "learning_rate": 3.171605737517976e-06, + "loss": 0.1855, + "step": 1965 + }, + { + "epoch": 0.4429549102993776, + "grad_norm": 0.44394451449107425, + "learning_rate": 3.1698111324990454e-06, + "loss": 0.1607, + "step": 1966 + }, + { + "epoch": 0.44318021798518603, + "grad_norm": 0.47194405043984106, + "learning_rate": 3.16801615555896e-06, + "loss": 0.1898, + "step": 1967 + }, + { + "epoch": 0.44340552567099445, + "grad_norm": 0.4600312870115847, + "learning_rate": 3.1662208076944027e-06, + "loss": 0.1737, + "step": 1968 + }, + { + "epoch": 0.44363083335680287, + "grad_norm": 0.4526291208341626, + "learning_rate": 3.1644250899022637e-06, + "loss": 0.17, + "step": 1969 + }, + { + "epoch": 0.44385614104261134, + "grad_norm": 0.47693269802347554, + "learning_rate": 3.162629003179638e-06, + "loss": 0.172, + "step": 1970 + }, + { + "epoch": 0.44408144872841976, + "grad_norm": 0.45975500464709795, + "learning_rate": 3.1608325485238257e-06, + "loss": 0.1711, + "step": 1971 + }, + { + "epoch": 0.4443067564142282, + "grad_norm": 0.47462735838990233, + "learning_rate": 3.1590357269323312e-06, + "loss": 0.184, + "step": 1972 + }, + { + "epoch": 0.4445320641000366, + "grad_norm": 0.47891808376449757, + "learning_rate": 3.157238539402862e-06, + "loss": 0.1936, + "step": 1973 + }, + { + "epoch": 0.44475737178584507, + "grad_norm": 0.4641511494770072, + "learning_rate": 3.15544098693333e-06, + "loss": 0.1931, + "step": 1974 + }, + { + "epoch": 0.4449826794716535, + "grad_norm": 0.44981510818581427, + "learning_rate": 3.15364307052185e-06, + "loss": 0.1707, + "step": 1975 + }, + { + "epoch": 0.4452079871574619, + "grad_norm": 0.4746555461430859, + "learning_rate": 3.151844791166735e-06, + "loss": 0.1844, + "step": 1976 + }, + { + "epoch": 0.4454332948432703, + "grad_norm": 0.5165576622199085, + "learning_rate": 3.1500461498665053e-06, + "loss": 0.204, + "step": 1977 + }, + { + "epoch": 0.4456586025290788, + "grad_norm": 0.5170000095508068, + "learning_rate": 3.1482471476198784e-06, + "loss": 0.18, + "step": 1978 + }, + { + "epoch": 0.4458839102148872, + "grad_norm": 0.45199780803627987, + "learning_rate": 3.1464477854257726e-06, + "loss": 0.1744, + "step": 1979 + }, + { + "epoch": 0.44610921790069563, + "grad_norm": 0.4741841997215869, + "learning_rate": 3.1446480642833077e-06, + "loss": 0.1812, + "step": 1980 + }, + { + "epoch": 0.44633452558650405, + "grad_norm": 0.4655473432435609, + "learning_rate": 3.1428479851918014e-06, + "loss": 0.1745, + "step": 1981 + }, + { + "epoch": 0.4465598332723125, + "grad_norm": 0.47252666010453426, + "learning_rate": 3.14104754915077e-06, + "loss": 0.1745, + "step": 1982 + }, + { + "epoch": 0.44678514095812094, + "grad_norm": 0.4604232958374821, + "learning_rate": 3.1392467571599288e-06, + "loss": 0.184, + "step": 1983 + }, + { + "epoch": 0.44701044864392936, + "grad_norm": 0.47253935330758334, + "learning_rate": 3.137445610219192e-06, + "loss": 0.1703, + "step": 1984 + }, + { + "epoch": 0.4472357563297378, + "grad_norm": 0.4715090746548975, + "learning_rate": 3.1356441093286673e-06, + "loss": 0.1887, + "step": 1985 + }, + { + "epoch": 0.44746106401554625, + "grad_norm": 0.5021487735690854, + "learning_rate": 3.133842255488661e-06, + "loss": 0.1951, + "step": 1986 + }, + { + "epoch": 0.44768637170135467, + "grad_norm": 0.5136048382980162, + "learning_rate": 3.132040049699676e-06, + "loss": 0.1995, + "step": 1987 + }, + { + "epoch": 0.4479116793871631, + "grad_norm": 0.4966960822551801, + "learning_rate": 3.130237492962411e-06, + "loss": 0.1865, + "step": 1988 + }, + { + "epoch": 0.4481369870729715, + "grad_norm": 0.48774386438505996, + "learning_rate": 3.1284345862777572e-06, + "loss": 0.1787, + "step": 1989 + }, + { + "epoch": 0.44836229475878, + "grad_norm": 0.4665760727438642, + "learning_rate": 3.1266313306468018e-06, + "loss": 0.1702, + "step": 1990 + }, + { + "epoch": 0.4485876024445884, + "grad_norm": 0.46142139372684615, + "learning_rate": 3.1248277270708255e-06, + "loss": 0.1766, + "step": 1991 + }, + { + "epoch": 0.4488129101303968, + "grad_norm": 0.47692330513643316, + "learning_rate": 3.1230237765513023e-06, + "loss": 0.1965, + "step": 1992 + }, + { + "epoch": 0.44903821781620523, + "grad_norm": 0.428337663244195, + "learning_rate": 3.121219480089899e-06, + "loss": 0.1571, + "step": 1993 + }, + { + "epoch": 0.4492635255020137, + "grad_norm": 0.4368775151168116, + "learning_rate": 3.119414838688473e-06, + "loss": 0.1699, + "step": 1994 + }, + { + "epoch": 0.4494888331878221, + "grad_norm": 0.4366158853356244, + "learning_rate": 3.1176098533490755e-06, + "loss": 0.1529, + "step": 1995 + }, + { + "epoch": 0.44971414087363054, + "grad_norm": 0.5323832268156325, + "learning_rate": 3.1158045250739473e-06, + "loss": 0.1986, + "step": 1996 + }, + { + "epoch": 0.44993944855943896, + "grad_norm": 0.5194435348775026, + "learning_rate": 3.11399885486552e-06, + "loss": 0.1727, + "step": 1997 + }, + { + "epoch": 0.45016475624524743, + "grad_norm": 0.4554273959188556, + "learning_rate": 3.1121928437264138e-06, + "loss": 0.1766, + "step": 1998 + }, + { + "epoch": 0.45039006393105585, + "grad_norm": 0.48444799356478224, + "learning_rate": 3.1103864926594406e-06, + "loss": 0.1711, + "step": 1999 + }, + { + "epoch": 0.45061537161686427, + "grad_norm": 0.48873321649126766, + "learning_rate": 3.1085798026676e-06, + "loss": 0.1755, + "step": 2000 + }, + { + "epoch": 0.45061537161686427, + "eval_loss": 0.178781658411026, + "eval_runtime": 56.6429, + "eval_samples_per_second": 50.668, + "eval_steps_per_second": 6.338, + "step": 2000 + }, + { + "epoch": 0.4508406793026727, + "grad_norm": 0.4448525968602451, + "learning_rate": 3.1067727747540797e-06, + "loss": 0.157, + "step": 2001 + }, + { + "epoch": 0.45106598698848116, + "grad_norm": 0.44743250572108806, + "learning_rate": 3.1049654099222542e-06, + "loss": 0.1765, + "step": 2002 + }, + { + "epoch": 0.4512912946742896, + "grad_norm": 0.45327965737027653, + "learning_rate": 3.1031577091756852e-06, + "loss": 0.1718, + "step": 2003 + }, + { + "epoch": 0.451516602360098, + "grad_norm": 0.4369824821924044, + "learning_rate": 3.1013496735181232e-06, + "loss": 0.1643, + "step": 2004 + }, + { + "epoch": 0.4517419100459064, + "grad_norm": 0.4372424513731796, + "learning_rate": 3.0995413039535017e-06, + "loss": 0.1692, + "step": 2005 + }, + { + "epoch": 0.4519672177317149, + "grad_norm": 0.4697546518319072, + "learning_rate": 3.0977326014859415e-06, + "loss": 0.1932, + "step": 2006 + }, + { + "epoch": 0.4521925254175233, + "grad_norm": 0.4508326355784404, + "learning_rate": 3.095923567119748e-06, + "loss": 0.1737, + "step": 2007 + }, + { + "epoch": 0.4524178331033317, + "grad_norm": 0.46009434666145305, + "learning_rate": 3.09411420185941e-06, + "loss": 0.1775, + "step": 2008 + }, + { + "epoch": 0.4526431407891402, + "grad_norm": 0.46816359977275396, + "learning_rate": 3.0923045067096e-06, + "loss": 0.1737, + "step": 2009 + }, + { + "epoch": 0.4528684484749486, + "grad_norm": 0.4764227464628939, + "learning_rate": 3.090494482675176e-06, + "loss": 0.1969, + "step": 2010 + }, + { + "epoch": 0.45309375616075703, + "grad_norm": 0.48077236393924566, + "learning_rate": 3.088684130761175e-06, + "loss": 0.1888, + "step": 2011 + }, + { + "epoch": 0.45331906384656545, + "grad_norm": 0.44092660916655535, + "learning_rate": 3.0868734519728194e-06, + "loss": 0.1574, + "step": 2012 + }, + { + "epoch": 0.4535443715323739, + "grad_norm": 0.4937421932347271, + "learning_rate": 3.085062447315511e-06, + "loss": 0.1883, + "step": 2013 + }, + { + "epoch": 0.45376967921818234, + "grad_norm": 0.4433675146765382, + "learning_rate": 3.0832511177948326e-06, + "loss": 0.1695, + "step": 2014 + }, + { + "epoch": 0.45399498690399076, + "grad_norm": 0.4867475294069182, + "learning_rate": 3.081439464416549e-06, + "loss": 0.1957, + "step": 2015 + }, + { + "epoch": 0.4542202945897992, + "grad_norm": 0.4534126602554105, + "learning_rate": 3.0796274881866034e-06, + "loss": 0.1857, + "step": 2016 + }, + { + "epoch": 0.45444560227560765, + "grad_norm": 0.4685644262511982, + "learning_rate": 3.0778151901111187e-06, + "loss": 0.1825, + "step": 2017 + }, + { + "epoch": 0.45467090996141607, + "grad_norm": 0.42974154034528567, + "learning_rate": 3.0760025711963964e-06, + "loss": 0.1587, + "step": 2018 + }, + { + "epoch": 0.4548962176472245, + "grad_norm": 0.47023889389076906, + "learning_rate": 3.0741896324489163e-06, + "loss": 0.1899, + "step": 2019 + }, + { + "epoch": 0.4551215253330329, + "grad_norm": 0.43533463063829375, + "learning_rate": 3.0723763748753354e-06, + "loss": 0.1665, + "step": 2020 + }, + { + "epoch": 0.4553468330188414, + "grad_norm": 0.4706578778157172, + "learning_rate": 3.0705627994824887e-06, + "loss": 0.1817, + "step": 2021 + }, + { + "epoch": 0.4555721407046498, + "grad_norm": 0.4411217086293554, + "learning_rate": 3.0687489072773864e-06, + "loss": 0.1732, + "step": 2022 + }, + { + "epoch": 0.4557974483904582, + "grad_norm": 0.44302094267700187, + "learning_rate": 3.0669346992672156e-06, + "loss": 0.169, + "step": 2023 + }, + { + "epoch": 0.45602275607626663, + "grad_norm": 0.48886005785285397, + "learning_rate": 3.0651201764593375e-06, + "loss": 0.1838, + "step": 2024 + }, + { + "epoch": 0.4562480637620751, + "grad_norm": 0.4575709754373421, + "learning_rate": 3.06330533986129e-06, + "loss": 0.1829, + "step": 2025 + }, + { + "epoch": 0.4564733714478835, + "grad_norm": 0.47995861564966674, + "learning_rate": 3.0614901904807836e-06, + "loss": 0.1726, + "step": 2026 + }, + { + "epoch": 0.45669867913369194, + "grad_norm": 0.43946056756939506, + "learning_rate": 3.0596747293257047e-06, + "loss": 0.1759, + "step": 2027 + }, + { + "epoch": 0.45692398681950036, + "grad_norm": 0.46879761242178286, + "learning_rate": 3.0578589574041097e-06, + "loss": 0.1651, + "step": 2028 + }, + { + "epoch": 0.45714929450530883, + "grad_norm": 0.47140699775048767, + "learning_rate": 3.056042875724228e-06, + "loss": 0.1866, + "step": 2029 + }, + { + "epoch": 0.45737460219111725, + "grad_norm": 0.43916421202150646, + "learning_rate": 3.0542264852944635e-06, + "loss": 0.1666, + "step": 2030 + }, + { + "epoch": 0.45759990987692567, + "grad_norm": 0.4769634677880467, + "learning_rate": 3.052409787123391e-06, + "loss": 0.1881, + "step": 2031 + }, + { + "epoch": 0.4578252175627341, + "grad_norm": 0.4253220044866635, + "learning_rate": 3.0505927822197533e-06, + "loss": 0.1666, + "step": 2032 + }, + { + "epoch": 0.45805052524854256, + "grad_norm": 0.45395239330052456, + "learning_rate": 3.0487754715924674e-06, + "loss": 0.159, + "step": 2033 + }, + { + "epoch": 0.458275832934351, + "grad_norm": 0.46551168581240177, + "learning_rate": 3.0469578562506165e-06, + "loss": 0.1745, + "step": 2034 + }, + { + "epoch": 0.4585011406201594, + "grad_norm": 0.444461912662105, + "learning_rate": 3.045139937203455e-06, + "loss": 0.1741, + "step": 2035 + }, + { + "epoch": 0.4587264483059678, + "grad_norm": 0.45764932720928, + "learning_rate": 3.0433217154604067e-06, + "loss": 0.191, + "step": 2036 + }, + { + "epoch": 0.4589517559917763, + "grad_norm": 0.46363763943724695, + "learning_rate": 3.0415031920310613e-06, + "loss": 0.184, + "step": 2037 + }, + { + "epoch": 0.4591770636775847, + "grad_norm": 0.437038243115666, + "learning_rate": 3.0396843679251777e-06, + "loss": 0.1708, + "step": 2038 + }, + { + "epoch": 0.4594023713633931, + "grad_norm": 0.44364950357701566, + "learning_rate": 3.03786524415268e-06, + "loss": 0.1703, + "step": 2039 + }, + { + "epoch": 0.45962767904920154, + "grad_norm": 0.4617444282614592, + "learning_rate": 3.0360458217236604e-06, + "loss": 0.1798, + "step": 2040 + }, + { + "epoch": 0.45985298673501, + "grad_norm": 0.4456061222843684, + "learning_rate": 3.034226101648377e-06, + "loss": 0.173, + "step": 2041 + }, + { + "epoch": 0.46007829442081843, + "grad_norm": 0.4391942250910541, + "learning_rate": 3.0324060849372526e-06, + "loss": 0.1649, + "step": 2042 + }, + { + "epoch": 0.46030360210662685, + "grad_norm": 0.4355281778187853, + "learning_rate": 3.0305857726008736e-06, + "loss": 0.1693, + "step": 2043 + }, + { + "epoch": 0.46052890979243527, + "grad_norm": 0.4770822816285058, + "learning_rate": 3.028765165649992e-06, + "loss": 0.1694, + "step": 2044 + }, + { + "epoch": 0.46075421747824374, + "grad_norm": 0.46181305627723274, + "learning_rate": 3.026944265095524e-06, + "loss": 0.1628, + "step": 2045 + }, + { + "epoch": 0.46097952516405216, + "grad_norm": 0.4518621765771259, + "learning_rate": 3.0251230719485465e-06, + "loss": 0.1756, + "step": 2046 + }, + { + "epoch": 0.4612048328498606, + "grad_norm": 0.45572128895799496, + "learning_rate": 3.0233015872203004e-06, + "loss": 0.1764, + "step": 2047 + }, + { + "epoch": 0.461430140535669, + "grad_norm": 0.4369141547342832, + "learning_rate": 3.0214798119221884e-06, + "loss": 0.1688, + "step": 2048 + }, + { + "epoch": 0.46165544822147747, + "grad_norm": 0.44272966742372416, + "learning_rate": 3.0196577470657744e-06, + "loss": 0.1712, + "step": 2049 + }, + { + "epoch": 0.4618807559072859, + "grad_norm": 0.442968902773088, + "learning_rate": 3.0178353936627835e-06, + "loss": 0.1728, + "step": 2050 + }, + { + "epoch": 0.4621060635930943, + "grad_norm": 0.44742699858152973, + "learning_rate": 3.0160127527250993e-06, + "loss": 0.1749, + "step": 2051 + }, + { + "epoch": 0.4623313712789027, + "grad_norm": 0.4583756265965511, + "learning_rate": 3.0141898252647682e-06, + "loss": 0.1719, + "step": 2052 + }, + { + "epoch": 0.4625566789647112, + "grad_norm": 0.4413461793868719, + "learning_rate": 3.012366612293993e-06, + "loss": 0.1797, + "step": 2053 + }, + { + "epoch": 0.4627819866505196, + "grad_norm": 0.5104883095673568, + "learning_rate": 3.0105431148251364e-06, + "loss": 0.1954, + "step": 2054 + }, + { + "epoch": 0.46300729433632803, + "grad_norm": 0.43690079720202335, + "learning_rate": 3.0087193338707175e-06, + "loss": 0.166, + "step": 2055 + }, + { + "epoch": 0.46323260202213645, + "grad_norm": 0.4645040215339915, + "learning_rate": 3.0068952704434145e-06, + "loss": 0.1904, + "step": 2056 + }, + { + "epoch": 0.4634579097079449, + "grad_norm": 0.43125963174507254, + "learning_rate": 3.0050709255560616e-06, + "loss": 0.17, + "step": 2057 + }, + { + "epoch": 0.46368321739375334, + "grad_norm": 0.47208528600460287, + "learning_rate": 3.0032463002216504e-06, + "loss": 0.1822, + "step": 2058 + }, + { + "epoch": 0.46390852507956176, + "grad_norm": 0.43401185733665737, + "learning_rate": 3.0014213954533265e-06, + "loss": 0.1687, + "step": 2059 + }, + { + "epoch": 0.46413383276537024, + "grad_norm": 0.4419179647443992, + "learning_rate": 2.999596212264392e-06, + "loss": 0.1744, + "step": 2060 + }, + { + "epoch": 0.46435914045117865, + "grad_norm": 0.43313156088893195, + "learning_rate": 2.997770751668302e-06, + "loss": 0.1664, + "step": 2061 + }, + { + "epoch": 0.46458444813698707, + "grad_norm": 0.48643460141736217, + "learning_rate": 2.9959450146786674e-06, + "loss": 0.1883, + "step": 2062 + }, + { + "epoch": 0.4648097558227955, + "grad_norm": 0.4976956590174994, + "learning_rate": 2.994119002309253e-06, + "loss": 0.1903, + "step": 2063 + }, + { + "epoch": 0.46503506350860396, + "grad_norm": 0.4708658758383177, + "learning_rate": 2.9922927155739737e-06, + "loss": 0.1792, + "step": 2064 + }, + { + "epoch": 0.4652603711944124, + "grad_norm": 0.4364123236102563, + "learning_rate": 2.9904661554868997e-06, + "loss": 0.1567, + "step": 2065 + }, + { + "epoch": 0.4654856788802208, + "grad_norm": 0.4369996422790048, + "learning_rate": 2.9886393230622507e-06, + "loss": 0.1623, + "step": 2066 + }, + { + "epoch": 0.4657109865660292, + "grad_norm": 0.4623678741261697, + "learning_rate": 2.986812219314399e-06, + "loss": 0.1783, + "step": 2067 + }, + { + "epoch": 0.4659362942518377, + "grad_norm": 0.4742905798531366, + "learning_rate": 2.984984845257868e-06, + "loss": 0.1853, + "step": 2068 + }, + { + "epoch": 0.4661616019376461, + "grad_norm": 0.4782705221115839, + "learning_rate": 2.983157201907329e-06, + "loss": 0.1835, + "step": 2069 + }, + { + "epoch": 0.4663869096234545, + "grad_norm": 0.44365559788577486, + "learning_rate": 2.981329290277605e-06, + "loss": 0.167, + "step": 2070 + }, + { + "epoch": 0.46661221730926294, + "grad_norm": 0.43801534264914416, + "learning_rate": 2.9795011113836686e-06, + "loss": 0.1711, + "step": 2071 + }, + { + "epoch": 0.4668375249950714, + "grad_norm": 0.500754531876191, + "learning_rate": 2.977672666240636e-06, + "loss": 0.1878, + "step": 2072 + }, + { + "epoch": 0.46706283268087984, + "grad_norm": 0.43215332151087815, + "learning_rate": 2.9758439558637774e-06, + "loss": 0.172, + "step": 2073 + }, + { + "epoch": 0.46728814036668825, + "grad_norm": 0.5056442893102445, + "learning_rate": 2.974014981268507e-06, + "loss": 0.1756, + "step": 2074 + }, + { + "epoch": 0.46751344805249667, + "grad_norm": 0.4745399978223141, + "learning_rate": 2.972185743470386e-06, + "loss": 0.1856, + "step": 2075 + }, + { + "epoch": 0.46773875573830515, + "grad_norm": 0.46805694357432803, + "learning_rate": 2.9703562434851218e-06, + "loss": 0.1692, + "step": 2076 + }, + { + "epoch": 0.46796406342411356, + "grad_norm": 0.4225075187011975, + "learning_rate": 2.9685264823285676e-06, + "loss": 0.1591, + "step": 2077 + }, + { + "epoch": 0.468189371109922, + "grad_norm": 0.4468491930897374, + "learning_rate": 2.966696461016721e-06, + "loss": 0.1711, + "step": 2078 + }, + { + "epoch": 0.4684146787957304, + "grad_norm": 0.435596662348042, + "learning_rate": 2.964866180565725e-06, + "loss": 0.1605, + "step": 2079 + }, + { + "epoch": 0.4686399864815389, + "grad_norm": 0.4490815593734429, + "learning_rate": 2.9630356419918682e-06, + "loss": 0.1798, + "step": 2080 + }, + { + "epoch": 0.4688652941673473, + "grad_norm": 0.4555482058572205, + "learning_rate": 2.9612048463115774e-06, + "loss": 0.1666, + "step": 2081 + }, + { + "epoch": 0.4690906018531557, + "grad_norm": 0.4732132971396988, + "learning_rate": 2.9593737945414264e-06, + "loss": 0.1896, + "step": 2082 + }, + { + "epoch": 0.4693159095389641, + "grad_norm": 0.4702761809496426, + "learning_rate": 2.9575424876981298e-06, + "loss": 0.177, + "step": 2083 + }, + { + "epoch": 0.4695412172247726, + "grad_norm": 0.45027046821483724, + "learning_rate": 2.9557109267985445e-06, + "loss": 0.1609, + "step": 2084 + }, + { + "epoch": 0.469766524910581, + "grad_norm": 0.44140027773481344, + "learning_rate": 2.953879112859668e-06, + "loss": 0.1672, + "step": 2085 + }, + { + "epoch": 0.46999183259638944, + "grad_norm": 0.4828515699733266, + "learning_rate": 2.952047046898637e-06, + "loss": 0.1944, + "step": 2086 + }, + { + "epoch": 0.47021714028219785, + "grad_norm": 0.48667888528654585, + "learning_rate": 2.9502147299327316e-06, + "loss": 0.1873, + "step": 2087 + }, + { + "epoch": 0.4704424479680063, + "grad_norm": 0.4592669167670882, + "learning_rate": 2.9483821629793673e-06, + "loss": 0.1796, + "step": 2088 + }, + { + "epoch": 0.47066775565381475, + "grad_norm": 0.47441249564050847, + "learning_rate": 2.946549347056101e-06, + "loss": 0.185, + "step": 2089 + }, + { + "epoch": 0.47089306333962316, + "grad_norm": 15.320891306843055, + "learning_rate": 2.9447162831806275e-06, + "loss": 0.1797, + "step": 2090 + }, + { + "epoch": 0.4711183710254316, + "grad_norm": 0.4839520376771902, + "learning_rate": 2.942882972370778e-06, + "loss": 0.1543, + "step": 2091 + }, + { + "epoch": 0.47134367871124006, + "grad_norm": 0.5036961013768916, + "learning_rate": 2.941049415644522e-06, + "loss": 0.1682, + "step": 2092 + }, + { + "epoch": 0.4715689863970485, + "grad_norm": 0.4641557330707968, + "learning_rate": 2.9392156140199644e-06, + "loss": 0.1781, + "step": 2093 + }, + { + "epoch": 0.4717942940828569, + "grad_norm": 0.4912307259701395, + "learning_rate": 2.9373815685153485e-06, + "loss": 0.1862, + "step": 2094 + }, + { + "epoch": 0.4720196017686653, + "grad_norm": 0.47481157214164066, + "learning_rate": 2.93554728014905e-06, + "loss": 0.1794, + "step": 2095 + }, + { + "epoch": 0.4722449094544738, + "grad_norm": 0.46497108103831036, + "learning_rate": 2.933712749939582e-06, + "loss": 0.1718, + "step": 2096 + }, + { + "epoch": 0.4724702171402822, + "grad_norm": 0.42570180661525925, + "learning_rate": 2.9318779789055894e-06, + "loss": 0.1593, + "step": 2097 + }, + { + "epoch": 0.4726955248260906, + "grad_norm": 0.46742299557015055, + "learning_rate": 2.9300429680658538e-06, + "loss": 0.1649, + "step": 2098 + }, + { + "epoch": 0.47292083251189904, + "grad_norm": 0.460885260741835, + "learning_rate": 2.928207718439287e-06, + "loss": 0.1726, + "step": 2099 + }, + { + "epoch": 0.4731461401977075, + "grad_norm": 0.4752721076108127, + "learning_rate": 2.9263722310449353e-06, + "loss": 0.175, + "step": 2100 + }, + { + "epoch": 0.47337144788351593, + "grad_norm": 0.44439595937412796, + "learning_rate": 2.924536506901976e-06, + "loss": 0.1712, + "step": 2101 + }, + { + "epoch": 0.47359675556932435, + "grad_norm": 0.48817557892149893, + "learning_rate": 2.9227005470297194e-06, + "loss": 0.1874, + "step": 2102 + }, + { + "epoch": 0.47382206325513276, + "grad_norm": 0.4608016400267469, + "learning_rate": 2.9208643524476037e-06, + "loss": 0.1714, + "step": 2103 + }, + { + "epoch": 0.47404737094094124, + "grad_norm": 0.4598647877611021, + "learning_rate": 2.919027924175201e-06, + "loss": 0.1689, + "step": 2104 + }, + { + "epoch": 0.47427267862674966, + "grad_norm": 0.4716802653092358, + "learning_rate": 2.9171912632322102e-06, + "loss": 0.188, + "step": 2105 + }, + { + "epoch": 0.4744979863125581, + "grad_norm": 0.4823223686204493, + "learning_rate": 2.915354370638462e-06, + "loss": 0.1793, + "step": 2106 + }, + { + "epoch": 0.47472329399836655, + "grad_norm": 0.4818981907639348, + "learning_rate": 2.913517247413914e-06, + "loss": 0.1778, + "step": 2107 + }, + { + "epoch": 0.47494860168417496, + "grad_norm": 0.4583874342976942, + "learning_rate": 2.9116798945786515e-06, + "loss": 0.1753, + "step": 2108 + }, + { + "epoch": 0.4751739093699834, + "grad_norm": 0.4455153011005638, + "learning_rate": 2.909842313152888e-06, + "loss": 0.1728, + "step": 2109 + }, + { + "epoch": 0.4753992170557918, + "grad_norm": 0.4834636522137698, + "learning_rate": 2.9080045041569647e-06, + "loss": 0.1906, + "step": 2110 + }, + { + "epoch": 0.4756245247416003, + "grad_norm": 0.46741386464656404, + "learning_rate": 2.9061664686113487e-06, + "loss": 0.1756, + "step": 2111 + }, + { + "epoch": 0.4758498324274087, + "grad_norm": 0.4662330437203892, + "learning_rate": 2.904328207536632e-06, + "loss": 0.1707, + "step": 2112 + }, + { + "epoch": 0.4760751401132171, + "grad_norm": 0.484445103641389, + "learning_rate": 2.9024897219535326e-06, + "loss": 0.1899, + "step": 2113 + }, + { + "epoch": 0.47630044779902553, + "grad_norm": 0.4760594624019392, + "learning_rate": 2.900651012882893e-06, + "loss": 0.175, + "step": 2114 + }, + { + "epoch": 0.476525755484834, + "grad_norm": 0.5016265130675016, + "learning_rate": 2.8988120813456794e-06, + "loss": 0.1804, + "step": 2115 + }, + { + "epoch": 0.4767510631706424, + "grad_norm": 0.46085698214700854, + "learning_rate": 2.896972928362983e-06, + "loss": 0.1811, + "step": 2116 + }, + { + "epoch": 0.47697637085645084, + "grad_norm": 0.4731784025728347, + "learning_rate": 2.8951335549560156e-06, + "loss": 0.1911, + "step": 2117 + }, + { + "epoch": 0.47720167854225926, + "grad_norm": 0.4606793403476545, + "learning_rate": 2.893293962146114e-06, + "loss": 0.1716, + "step": 2118 + }, + { + "epoch": 0.47742698622806773, + "grad_norm": 0.4607601497381891, + "learning_rate": 2.8914541509547345e-06, + "loss": 0.1705, + "step": 2119 + }, + { + "epoch": 0.47765229391387615, + "grad_norm": 0.4431092357438973, + "learning_rate": 2.8896141224034554e-06, + "loss": 0.1619, + "step": 2120 + }, + { + "epoch": 0.47787760159968456, + "grad_norm": 0.46971266351591834, + "learning_rate": 2.8877738775139767e-06, + "loss": 0.172, + "step": 2121 + }, + { + "epoch": 0.478102909285493, + "grad_norm": 0.4330701696318025, + "learning_rate": 2.885933417308118e-06, + "loss": 0.1573, + "step": 2122 + }, + { + "epoch": 0.47832821697130146, + "grad_norm": 0.46716715522207447, + "learning_rate": 2.8840927428078185e-06, + "loss": 0.1688, + "step": 2123 + }, + { + "epoch": 0.4785535246571099, + "grad_norm": 0.4949357110216969, + "learning_rate": 2.8822518550351356e-06, + "loss": 0.2038, + "step": 2124 + }, + { + "epoch": 0.4787788323429183, + "grad_norm": 0.4561315237677896, + "learning_rate": 2.8804107550122453e-06, + "loss": 0.1789, + "step": 2125 + }, + { + "epoch": 0.4790041400287267, + "grad_norm": 0.46006362827647646, + "learning_rate": 2.878569443761442e-06, + "loss": 0.1778, + "step": 2126 + }, + { + "epoch": 0.4792294477145352, + "grad_norm": 0.46562848894486086, + "learning_rate": 2.8767279223051375e-06, + "loss": 0.1721, + "step": 2127 + }, + { + "epoch": 0.4794547554003436, + "grad_norm": 0.45350653446964795, + "learning_rate": 2.87488619166586e-06, + "loss": 0.172, + "step": 2128 + }, + { + "epoch": 0.479680063086152, + "grad_norm": 0.4438714170588374, + "learning_rate": 2.8730442528662537e-06, + "loss": 0.1783, + "step": 2129 + }, + { + "epoch": 0.47990537077196044, + "grad_norm": 0.4742323658119051, + "learning_rate": 2.8712021069290786e-06, + "loss": 0.1737, + "step": 2130 + }, + { + "epoch": 0.4801306784577689, + "grad_norm": 0.4584352187918664, + "learning_rate": 2.869359754877209e-06, + "loss": 0.182, + "step": 2131 + }, + { + "epoch": 0.48035598614357733, + "grad_norm": 0.4193203919612601, + "learning_rate": 2.8675171977336357e-06, + "loss": 0.1631, + "step": 2132 + }, + { + "epoch": 0.48058129382938575, + "grad_norm": 0.43994507148087314, + "learning_rate": 2.8656744365214622e-06, + "loss": 0.1718, + "step": 2133 + }, + { + "epoch": 0.48080660151519417, + "grad_norm": 0.4826917968264974, + "learning_rate": 2.863831472263904e-06, + "loss": 0.1767, + "step": 2134 + }, + { + "epoch": 0.48103190920100264, + "grad_norm": 0.4457798815207207, + "learning_rate": 2.8619883059842897e-06, + "loss": 0.1618, + "step": 2135 + }, + { + "epoch": 0.48125721688681106, + "grad_norm": 0.4624187926944956, + "learning_rate": 2.8601449387060622e-06, + "loss": 0.1813, + "step": 2136 + }, + { + "epoch": 0.4814825245726195, + "grad_norm": 0.44470888753773025, + "learning_rate": 2.858301371452774e-06, + "loss": 0.16, + "step": 2137 + }, + { + "epoch": 0.4817078322584279, + "grad_norm": 0.48186806387676867, + "learning_rate": 2.8564576052480895e-06, + "loss": 0.1917, + "step": 2138 + }, + { + "epoch": 0.48193313994423637, + "grad_norm": 0.48463537150814423, + "learning_rate": 2.8546136411157843e-06, + "loss": 0.1759, + "step": 2139 + }, + { + "epoch": 0.4821584476300448, + "grad_norm": 0.5238103411433412, + "learning_rate": 2.8527694800797417e-06, + "loss": 0.1832, + "step": 2140 + }, + { + "epoch": 0.4823837553158532, + "grad_norm": 0.4689620150227957, + "learning_rate": 2.850925123163956e-06, + "loss": 0.1907, + "step": 2141 + }, + { + "epoch": 0.4826090630016616, + "grad_norm": 0.41128876488634936, + "learning_rate": 2.8490805713925298e-06, + "loss": 0.1577, + "step": 2142 + }, + { + "epoch": 0.4828343706874701, + "grad_norm": 0.46724382784265567, + "learning_rate": 2.847235825789673e-06, + "loss": 0.1784, + "step": 2143 + }, + { + "epoch": 0.4830596783732785, + "grad_norm": 0.4666280203660776, + "learning_rate": 2.845390887379706e-06, + "loss": 0.1879, + "step": 2144 + }, + { + "epoch": 0.48328498605908693, + "grad_norm": 0.48164500272938054, + "learning_rate": 2.8435457571870527e-06, + "loss": 0.1904, + "step": 2145 + }, + { + "epoch": 0.48351029374489535, + "grad_norm": 0.44008544738881744, + "learning_rate": 2.8417004362362465e-06, + "loss": 0.1486, + "step": 2146 + }, + { + "epoch": 0.4837356014307038, + "grad_norm": 0.4635257499230133, + "learning_rate": 2.8398549255519237e-06, + "loss": 0.1804, + "step": 2147 + }, + { + "epoch": 0.48396090911651224, + "grad_norm": 0.465990676225474, + "learning_rate": 2.838009226158829e-06, + "loss": 0.1825, + "step": 2148 + }, + { + "epoch": 0.48418621680232066, + "grad_norm": 0.5075547299710821, + "learning_rate": 2.83616333908181e-06, + "loss": 0.1937, + "step": 2149 + }, + { + "epoch": 0.4844115244881291, + "grad_norm": 0.46883827851334353, + "learning_rate": 2.8343172653458194e-06, + "loss": 0.1818, + "step": 2150 + }, + { + "epoch": 0.48463683217393755, + "grad_norm": 0.4901765119511683, + "learning_rate": 2.8324710059759126e-06, + "loss": 0.1788, + "step": 2151 + }, + { + "epoch": 0.48486213985974597, + "grad_norm": 0.43409955881170137, + "learning_rate": 2.8306245619972476e-06, + "loss": 0.17, + "step": 2152 + }, + { + "epoch": 0.4850874475455544, + "grad_norm": 0.4690992247008866, + "learning_rate": 2.828777934435088e-06, + "loss": 0.1887, + "step": 2153 + }, + { + "epoch": 0.4853127552313628, + "grad_norm": 0.4839284154669798, + "learning_rate": 2.826931124314796e-06, + "loss": 0.1932, + "step": 2154 + }, + { + "epoch": 0.4855380629171713, + "grad_norm": 0.4894590882956956, + "learning_rate": 2.8250841326618367e-06, + "loss": 0.1728, + "step": 2155 + }, + { + "epoch": 0.4857633706029797, + "grad_norm": 0.4869585017472735, + "learning_rate": 2.8232369605017757e-06, + "loss": 0.1747, + "step": 2156 + }, + { + "epoch": 0.4859886782887881, + "grad_norm": 0.4498537151395631, + "learning_rate": 2.8213896088602786e-06, + "loss": 0.1685, + "step": 2157 + }, + { + "epoch": 0.4862139859745966, + "grad_norm": 0.5127666742169606, + "learning_rate": 2.8195420787631113e-06, + "loss": 0.1945, + "step": 2158 + }, + { + "epoch": 0.486439293660405, + "grad_norm": 0.4763157623373245, + "learning_rate": 2.8176943712361394e-06, + "loss": 0.1863, + "step": 2159 + }, + { + "epoch": 0.4866646013462134, + "grad_norm": 0.4599414317655403, + "learning_rate": 2.8158464873053236e-06, + "loss": 0.1652, + "step": 2160 + }, + { + "epoch": 0.48688990903202184, + "grad_norm": 0.48640133019890974, + "learning_rate": 2.8139984279967265e-06, + "loss": 0.175, + "step": 2161 + }, + { + "epoch": 0.4871152167178303, + "grad_norm": 0.44592196742387086, + "learning_rate": 2.8121501943365066e-06, + "loss": 0.155, + "step": 2162 + }, + { + "epoch": 0.48734052440363873, + "grad_norm": 0.4643658006781154, + "learning_rate": 2.810301787350918e-06, + "loss": 0.1764, + "step": 2163 + }, + { + "epoch": 0.48756583208944715, + "grad_norm": 0.45078546818239396, + "learning_rate": 2.808453208066314e-06, + "loss": 0.1644, + "step": 2164 + }, + { + "epoch": 0.48779113977525557, + "grad_norm": 0.4354681652629705, + "learning_rate": 2.8066044575091404e-06, + "loss": 0.1686, + "step": 2165 + }, + { + "epoch": 0.48801644746106404, + "grad_norm": 0.5062771902888445, + "learning_rate": 2.8047555367059404e-06, + "loss": 0.1961, + "step": 2166 + }, + { + "epoch": 0.48824175514687246, + "grad_norm": 0.4758616065778028, + "learning_rate": 2.80290644668335e-06, + "loss": 0.1589, + "step": 2167 + }, + { + "epoch": 0.4884670628326809, + "grad_norm": 0.47102109682311355, + "learning_rate": 2.8010571884681004e-06, + "loss": 0.171, + "step": 2168 + }, + { + "epoch": 0.4886923705184893, + "grad_norm": 0.43971879713767237, + "learning_rate": 2.799207763087015e-06, + "loss": 0.1645, + "step": 2169 + }, + { + "epoch": 0.48891767820429777, + "grad_norm": 0.45381369829580853, + "learning_rate": 2.7973581715670124e-06, + "loss": 0.1606, + "step": 2170 + }, + { + "epoch": 0.4891429858901062, + "grad_norm": 0.44900134476797426, + "learning_rate": 2.7955084149351002e-06, + "loss": 0.1722, + "step": 2171 + }, + { + "epoch": 0.4893682935759146, + "grad_norm": 0.4945919529575803, + "learning_rate": 2.7936584942183804e-06, + "loss": 0.1808, + "step": 2172 + }, + { + "epoch": 0.489593601261723, + "grad_norm": 0.4570748764476789, + "learning_rate": 2.7918084104440446e-06, + "loss": 0.1702, + "step": 2173 + }, + { + "epoch": 0.4898189089475315, + "grad_norm": 0.4633639784411994, + "learning_rate": 2.7899581646393746e-06, + "loss": 0.1808, + "step": 2174 + }, + { + "epoch": 0.4900442166333399, + "grad_norm": 0.5220983153220906, + "learning_rate": 2.7881077578317445e-06, + "loss": 0.1726, + "step": 2175 + }, + { + "epoch": 0.49026952431914833, + "grad_norm": 0.4576825082430093, + "learning_rate": 2.7862571910486148e-06, + "loss": 0.1743, + "step": 2176 + }, + { + "epoch": 0.49049483200495675, + "grad_norm": 0.5328807938743929, + "learning_rate": 2.784406465317538e-06, + "loss": 0.1843, + "step": 2177 + }, + { + "epoch": 0.4907201396907652, + "grad_norm": 0.5056501607330407, + "learning_rate": 2.7825555816661503e-06, + "loss": 0.2003, + "step": 2178 + }, + { + "epoch": 0.49094544737657364, + "grad_norm": 0.42773094241205145, + "learning_rate": 2.7807045411221813e-06, + "loss": 0.1673, + "step": 2179 + }, + { + "epoch": 0.49117075506238206, + "grad_norm": 0.4664291811126412, + "learning_rate": 2.778853344713443e-06, + "loss": 0.1708, + "step": 2180 + }, + { + "epoch": 0.4913960627481905, + "grad_norm": 0.4453287448208835, + "learning_rate": 2.777001993467837e-06, + "loss": 0.1685, + "step": 2181 + }, + { + "epoch": 0.49162137043399895, + "grad_norm": 0.46389082750942406, + "learning_rate": 2.7751504884133484e-06, + "loss": 0.1752, + "step": 2182 + }, + { + "epoch": 0.49184667811980737, + "grad_norm": 0.5163857955249544, + "learning_rate": 2.7732988305780496e-06, + "loss": 0.1797, + "step": 2183 + }, + { + "epoch": 0.4920719858056158, + "grad_norm": 0.4215705234503739, + "learning_rate": 2.7714470209900974e-06, + "loss": 0.1704, + "step": 2184 + }, + { + "epoch": 0.4922972934914242, + "grad_norm": 0.4629717768154613, + "learning_rate": 2.769595060677732e-06, + "loss": 0.1721, + "step": 2185 + }, + { + "epoch": 0.4925226011772327, + "grad_norm": 0.4761814651960009, + "learning_rate": 2.7677429506692788e-06, + "loss": 0.1727, + "step": 2186 + }, + { + "epoch": 0.4927479088630411, + "grad_norm": 0.4541677333323878, + "learning_rate": 2.7658906919931443e-06, + "loss": 0.1705, + "step": 2187 + }, + { + "epoch": 0.4929732165488495, + "grad_norm": 0.4975462659410821, + "learning_rate": 2.76403828567782e-06, + "loss": 0.1743, + "step": 2188 + }, + { + "epoch": 0.49319852423465793, + "grad_norm": 0.48590535832293386, + "learning_rate": 2.7621857327518763e-06, + "loss": 0.1876, + "step": 2189 + }, + { + "epoch": 0.4934238319204664, + "grad_norm": 0.5054622342442072, + "learning_rate": 2.7603330342439686e-06, + "loss": 0.186, + "step": 2190 + }, + { + "epoch": 0.4936491396062748, + "grad_norm": 0.44760042352064583, + "learning_rate": 2.7584801911828314e-06, + "loss": 0.1782, + "step": 2191 + }, + { + "epoch": 0.49387444729208324, + "grad_norm": 0.46151940170135775, + "learning_rate": 2.7566272045972777e-06, + "loss": 0.1758, + "step": 2192 + }, + { + "epoch": 0.49409975497789166, + "grad_norm": 0.4214858342853846, + "learning_rate": 2.7547740755162034e-06, + "loss": 0.1514, + "step": 2193 + }, + { + "epoch": 0.49432506266370013, + "grad_norm": 0.45815140291027234, + "learning_rate": 2.752920804968581e-06, + "loss": 0.1656, + "step": 2194 + }, + { + "epoch": 0.49455037034950855, + "grad_norm": 0.4800547426007386, + "learning_rate": 2.7510673939834633e-06, + "loss": 0.1793, + "step": 2195 + }, + { + "epoch": 0.49477567803531697, + "grad_norm": 0.4509899242835938, + "learning_rate": 2.7492138435899794e-06, + "loss": 0.1718, + "step": 2196 + }, + { + "epoch": 0.4950009857211254, + "grad_norm": 0.506693118405821, + "learning_rate": 2.747360154817338e-06, + "loss": 0.1742, + "step": 2197 + }, + { + "epoch": 0.49522629340693386, + "grad_norm": 0.42819006713750624, + "learning_rate": 2.745506328694822e-06, + "loss": 0.1709, + "step": 2198 + }, + { + "epoch": 0.4954516010927423, + "grad_norm": 0.45516129244026926, + "learning_rate": 2.743652366251793e-06, + "loss": 0.1715, + "step": 2199 + }, + { + "epoch": 0.4956769087785507, + "grad_norm": 0.4550791539079108, + "learning_rate": 2.741798268517687e-06, + "loss": 0.1604, + "step": 2200 + }, + { + "epoch": 0.4959022164643591, + "grad_norm": 0.490548149967066, + "learning_rate": 2.7399440365220153e-06, + "loss": 0.1769, + "step": 2201 + }, + { + "epoch": 0.4961275241501676, + "grad_norm": 0.45834229392810766, + "learning_rate": 2.738089671294364e-06, + "loss": 0.1809, + "step": 2202 + }, + { + "epoch": 0.496352831835976, + "grad_norm": 0.4781352095916142, + "learning_rate": 2.7362351738643926e-06, + "loss": 0.1955, + "step": 2203 + }, + { + "epoch": 0.4965781395217844, + "grad_norm": 0.46326414441011077, + "learning_rate": 2.734380545261835e-06, + "loss": 0.1616, + "step": 2204 + }, + { + "epoch": 0.4968034472075929, + "grad_norm": 0.44904163521168167, + "learning_rate": 2.7325257865164955e-06, + "loss": 0.167, + "step": 2205 + }, + { + "epoch": 0.4970287548934013, + "grad_norm": 0.4899948596040398, + "learning_rate": 2.730670898658255e-06, + "loss": 0.2064, + "step": 2206 + }, + { + "epoch": 0.49725406257920973, + "grad_norm": 0.4405856252870001, + "learning_rate": 2.7288158827170623e-06, + "loss": 0.1579, + "step": 2207 + }, + { + "epoch": 0.49747937026501815, + "grad_norm": 0.4675595283678431, + "learning_rate": 2.726960739722939e-06, + "loss": 0.179, + "step": 2208 + }, + { + "epoch": 0.4977046779508266, + "grad_norm": 0.5102277250588123, + "learning_rate": 2.725105470705977e-06, + "loss": 0.1796, + "step": 2209 + }, + { + "epoch": 0.49792998563663504, + "grad_norm": 0.5566527000707113, + "learning_rate": 2.7232500766963373e-06, + "loss": 0.1927, + "step": 2210 + }, + { + "epoch": 0.49815529332244346, + "grad_norm": 0.443934480142217, + "learning_rate": 2.7213945587242507e-06, + "loss": 0.1743, + "step": 2211 + }, + { + "epoch": 0.4983806010082519, + "grad_norm": 0.4767187900499543, + "learning_rate": 2.7195389178200194e-06, + "loss": 0.1779, + "step": 2212 + }, + { + "epoch": 0.49860590869406035, + "grad_norm": 0.4974698365195795, + "learning_rate": 2.7176831550140093e-06, + "loss": 0.1867, + "step": 2213 + }, + { + "epoch": 0.49883121637986877, + "grad_norm": 0.4601405704666756, + "learning_rate": 2.7158272713366573e-06, + "loss": 0.1512, + "step": 2214 + }, + { + "epoch": 0.4990565240656772, + "grad_norm": 0.6010196931151838, + "learning_rate": 2.713971267818466e-06, + "loss": 0.1828, + "step": 2215 + }, + { + "epoch": 0.4992818317514856, + "grad_norm": 0.4391701018618837, + "learning_rate": 2.7121151454900048e-06, + "loss": 0.1679, + "step": 2216 + }, + { + "epoch": 0.4995071394372941, + "grad_norm": 0.4459776925561746, + "learning_rate": 2.7102589053819107e-06, + "loss": 0.1641, + "step": 2217 + }, + { + "epoch": 0.4997324471231025, + "grad_norm": 0.47679944533789637, + "learning_rate": 2.7084025485248827e-06, + "loss": 0.1724, + "step": 2218 + }, + { + "epoch": 0.4999577548089109, + "grad_norm": 0.4394096352496152, + "learning_rate": 2.706546075949688e-06, + "loss": 0.1665, + "step": 2219 + }, + { + "epoch": 0.5001830624947193, + "grad_norm": 0.4588632027513291, + "learning_rate": 2.7046894886871564e-06, + "loss": 0.1751, + "step": 2220 + }, + { + "epoch": 0.5004083701805278, + "grad_norm": 0.457859365699703, + "learning_rate": 2.7028327877681808e-06, + "loss": 0.1719, + "step": 2221 + }, + { + "epoch": 0.5006336778663362, + "grad_norm": 0.48384844304189706, + "learning_rate": 2.700975974223719e-06, + "loss": 0.1783, + "step": 2222 + }, + { + "epoch": 0.5008589855521447, + "grad_norm": 0.43944087047409747, + "learning_rate": 2.6991190490847898e-06, + "loss": 0.1688, + "step": 2223 + }, + { + "epoch": 0.5010842932379531, + "grad_norm": 0.43871579053810494, + "learning_rate": 2.6972620133824745e-06, + "loss": 0.1727, + "step": 2224 + }, + { + "epoch": 0.5013096009237615, + "grad_norm": 0.44863239006819716, + "learning_rate": 2.695404868147916e-06, + "loss": 0.1759, + "step": 2225 + }, + { + "epoch": 0.50153490860957, + "grad_norm": 0.46159614459358683, + "learning_rate": 2.6935476144123173e-06, + "loss": 0.1701, + "step": 2226 + }, + { + "epoch": 0.5017602162953784, + "grad_norm": 0.46086115595022936, + "learning_rate": 2.691690253206943e-06, + "loss": 0.1784, + "step": 2227 + }, + { + "epoch": 0.5019855239811868, + "grad_norm": 0.4647242619735199, + "learning_rate": 2.689832785563116e-06, + "loss": 0.1627, + "step": 2228 + }, + { + "epoch": 0.5022108316669952, + "grad_norm": 0.4446567813660805, + "learning_rate": 2.6879752125122193e-06, + "loss": 0.1606, + "step": 2229 + }, + { + "epoch": 0.5024361393528036, + "grad_norm": 0.4774446205251166, + "learning_rate": 2.6861175350856937e-06, + "loss": 0.1809, + "step": 2230 + }, + { + "epoch": 0.5026614470386122, + "grad_norm": 0.5054510609596393, + "learning_rate": 2.684259754315038e-06, + "loss": 0.2051, + "step": 2231 + }, + { + "epoch": 0.5028867547244206, + "grad_norm": 0.46403071608589413, + "learning_rate": 2.6824018712318084e-06, + "loss": 0.1731, + "step": 2232 + }, + { + "epoch": 0.503112062410229, + "grad_norm": 0.4771049420220912, + "learning_rate": 2.6805438868676186e-06, + "loss": 0.183, + "step": 2233 + }, + { + "epoch": 0.5033373700960374, + "grad_norm": 0.45596508826869125, + "learning_rate": 2.6786858022541385e-06, + "loss": 0.1711, + "step": 2234 + }, + { + "epoch": 0.5035626777818458, + "grad_norm": 0.4900413762704875, + "learning_rate": 2.676827618423093e-06, + "loss": 0.1929, + "step": 2235 + }, + { + "epoch": 0.5037879854676542, + "grad_norm": 0.457336500030823, + "learning_rate": 2.674969336406262e-06, + "loss": 0.1697, + "step": 2236 + }, + { + "epoch": 0.5040132931534627, + "grad_norm": 0.4337050929098509, + "learning_rate": 2.6731109572354795e-06, + "loss": 0.176, + "step": 2237 + }, + { + "epoch": 0.5042386008392711, + "grad_norm": 0.5115882463521207, + "learning_rate": 2.6712524819426355e-06, + "loss": 0.1971, + "step": 2238 + }, + { + "epoch": 0.5044639085250796, + "grad_norm": 0.461064300937914, + "learning_rate": 2.6693939115596718e-06, + "loss": 0.1666, + "step": 2239 + }, + { + "epoch": 0.504689216210888, + "grad_norm": 0.43818192825602736, + "learning_rate": 2.6675352471185824e-06, + "loss": 0.1584, + "step": 2240 + }, + { + "epoch": 0.5049145238966964, + "grad_norm": 0.4380198371442143, + "learning_rate": 2.6656764896514152e-06, + "loss": 0.1694, + "step": 2241 + }, + { + "epoch": 0.5051398315825049, + "grad_norm": 0.4621481856275132, + "learning_rate": 2.6638176401902693e-06, + "loss": 0.18, + "step": 2242 + }, + { + "epoch": 0.5053651392683133, + "grad_norm": 0.44685219071746596, + "learning_rate": 2.6619586997672923e-06, + "loss": 0.1611, + "step": 2243 + }, + { + "epoch": 0.5055904469541217, + "grad_norm": 0.4885190666528013, + "learning_rate": 2.6600996694146876e-06, + "loss": 0.1944, + "step": 2244 + }, + { + "epoch": 0.5058157546399301, + "grad_norm": 0.4340057682034641, + "learning_rate": 2.658240550164704e-06, + "loss": 0.1672, + "step": 2245 + }, + { + "epoch": 0.5060410623257385, + "grad_norm": 0.4536693404947069, + "learning_rate": 2.656381343049641e-06, + "loss": 0.1702, + "step": 2246 + }, + { + "epoch": 0.5062663700115471, + "grad_norm": 0.46004175801449804, + "learning_rate": 2.654522049101847e-06, + "loss": 0.1607, + "step": 2247 + }, + { + "epoch": 0.5064916776973555, + "grad_norm": 0.45970475813756534, + "learning_rate": 2.652662669353719e-06, + "loss": 0.1846, + "step": 2248 + }, + { + "epoch": 0.5067169853831639, + "grad_norm": 0.44438837304347995, + "learning_rate": 2.6508032048377006e-06, + "loss": 0.1508, + "step": 2249 + }, + { + "epoch": 0.5069422930689723, + "grad_norm": 0.45833739533956785, + "learning_rate": 2.648943656586284e-06, + "loss": 0.1632, + "step": 2250 + }, + { + "epoch": 0.5071676007547807, + "grad_norm": 0.4996621906625241, + "learning_rate": 2.6470840256320064e-06, + "loss": 0.1714, + "step": 2251 + }, + { + "epoch": 0.5073929084405892, + "grad_norm": 0.40786308614112693, + "learning_rate": 2.6452243130074523e-06, + "loss": 0.1488, + "step": 2252 + }, + { + "epoch": 0.5076182161263976, + "grad_norm": 0.45137255294206774, + "learning_rate": 2.6433645197452493e-06, + "loss": 0.1656, + "step": 2253 + }, + { + "epoch": 0.5078435238122061, + "grad_norm": 0.45885836860544116, + "learning_rate": 2.6415046468780726e-06, + "loss": 0.1648, + "step": 2254 + }, + { + "epoch": 0.5080688314980145, + "grad_norm": 0.4414180400429613, + "learning_rate": 2.63964469543864e-06, + "loss": 0.1575, + "step": 2255 + }, + { + "epoch": 0.5082941391838229, + "grad_norm": 0.4904286663666652, + "learning_rate": 2.637784666459714e-06, + "loss": 0.1692, + "step": 2256 + }, + { + "epoch": 0.5085194468696314, + "grad_norm": 0.5091607966528039, + "learning_rate": 2.635924560974098e-06, + "loss": 0.1856, + "step": 2257 + }, + { + "epoch": 0.5087447545554398, + "grad_norm": 0.46663888724653707, + "learning_rate": 2.6340643800146387e-06, + "loss": 0.1756, + "step": 2258 + }, + { + "epoch": 0.5089700622412482, + "grad_norm": 0.455474336932662, + "learning_rate": 2.6322041246142273e-06, + "loss": 0.1742, + "step": 2259 + }, + { + "epoch": 0.5091953699270566, + "grad_norm": 0.49383959811523165, + "learning_rate": 2.6303437958057932e-06, + "loss": 0.1782, + "step": 2260 + }, + { + "epoch": 0.509420677612865, + "grad_norm": 0.46258319820722743, + "learning_rate": 2.6284833946223075e-06, + "loss": 0.1726, + "step": 2261 + }, + { + "epoch": 0.5096459852986736, + "grad_norm": 0.4893995740396655, + "learning_rate": 2.626622922096782e-06, + "loss": 0.1858, + "step": 2262 + }, + { + "epoch": 0.509871292984482, + "grad_norm": 0.4659116261221518, + "learning_rate": 2.624762379262268e-06, + "loss": 0.1716, + "step": 2263 + }, + { + "epoch": 0.5100966006702904, + "grad_norm": 0.44416912248391677, + "learning_rate": 2.622901767151855e-06, + "loss": 0.1722, + "step": 2264 + }, + { + "epoch": 0.5103219083560988, + "grad_norm": 0.4341927663792587, + "learning_rate": 2.6210410867986713e-06, + "loss": 0.1647, + "step": 2265 + }, + { + "epoch": 0.5105472160419072, + "grad_norm": 0.45890223021588084, + "learning_rate": 2.619180339235884e-06, + "loss": 0.1611, + "step": 2266 + }, + { + "epoch": 0.5107725237277156, + "grad_norm": 0.45902979482582273, + "learning_rate": 2.6173195254966966e-06, + "loss": 0.1834, + "step": 2267 + }, + { + "epoch": 0.5109978314135241, + "grad_norm": 0.4397890015688735, + "learning_rate": 2.6154586466143495e-06, + "loss": 0.1602, + "step": 2268 + }, + { + "epoch": 0.5112231390993325, + "grad_norm": 0.4884430597896144, + "learning_rate": 2.6135977036221195e-06, + "loss": 0.1613, + "step": 2269 + }, + { + "epoch": 0.511448446785141, + "grad_norm": 0.4209950271265405, + "learning_rate": 2.6117366975533187e-06, + "loss": 0.169, + "step": 2270 + }, + { + "epoch": 0.5116737544709494, + "grad_norm": 0.4647222269344841, + "learning_rate": 2.609875629441295e-06, + "loss": 0.1857, + "step": 2271 + }, + { + "epoch": 0.5118990621567578, + "grad_norm": 0.45607741575031296, + "learning_rate": 2.60801450031943e-06, + "loss": 0.1668, + "step": 2272 + }, + { + "epoch": 0.5121243698425663, + "grad_norm": 0.45983806257223814, + "learning_rate": 2.6061533112211394e-06, + "loss": 0.1733, + "step": 2273 + }, + { + "epoch": 0.5123496775283747, + "grad_norm": 0.48262491009852454, + "learning_rate": 2.604292063179871e-06, + "loss": 0.1787, + "step": 2274 + }, + { + "epoch": 0.5125749852141831, + "grad_norm": 0.4449747546098695, + "learning_rate": 2.602430757229108e-06, + "loss": 0.1782, + "step": 2275 + }, + { + "epoch": 0.5128002928999915, + "grad_norm": 0.4684503267638533, + "learning_rate": 2.600569394402363e-06, + "loss": 0.1749, + "step": 2276 + }, + { + "epoch": 0.5130256005857999, + "grad_norm": 0.4857155410592575, + "learning_rate": 2.5987079757331824e-06, + "loss": 0.1904, + "step": 2277 + }, + { + "epoch": 0.5132509082716085, + "grad_norm": 0.45111466822788104, + "learning_rate": 2.596846502255142e-06, + "loss": 0.1687, + "step": 2278 + }, + { + "epoch": 0.5134762159574169, + "grad_norm": 0.4340113675284237, + "learning_rate": 2.5949849750018486e-06, + "loss": 0.1672, + "step": 2279 + }, + { + "epoch": 0.5137015236432253, + "grad_norm": 0.4532397788897212, + "learning_rate": 2.5931233950069385e-06, + "loss": 0.166, + "step": 2280 + }, + { + "epoch": 0.5139268313290337, + "grad_norm": 0.4746769020022811, + "learning_rate": 2.591261763304079e-06, + "loss": 0.1749, + "step": 2281 + }, + { + "epoch": 0.5141521390148421, + "grad_norm": 0.47031098588280024, + "learning_rate": 2.589400080926964e-06, + "loss": 0.1684, + "step": 2282 + }, + { + "epoch": 0.5143774467006506, + "grad_norm": 0.47962104617045637, + "learning_rate": 2.5875383489093165e-06, + "loss": 0.1844, + "step": 2283 + }, + { + "epoch": 0.514602754386459, + "grad_norm": 0.4378612509433773, + "learning_rate": 2.585676568284886e-06, + "loss": 0.1718, + "step": 2284 + }, + { + "epoch": 0.5148280620722674, + "grad_norm": 0.4643746007209152, + "learning_rate": 2.583814740087451e-06, + "loss": 0.1726, + "step": 2285 + }, + { + "epoch": 0.5150533697580759, + "grad_norm": 0.5173185786551981, + "learning_rate": 2.581952865350815e-06, + "loss": 0.1868, + "step": 2286 + }, + { + "epoch": 0.5152786774438843, + "grad_norm": 0.45979386333342936, + "learning_rate": 2.5800909451088075e-06, + "loss": 0.1759, + "step": 2287 + }, + { + "epoch": 0.5155039851296928, + "grad_norm": 0.4657971167420107, + "learning_rate": 2.578228980395283e-06, + "loss": 0.1704, + "step": 2288 + }, + { + "epoch": 0.5157292928155012, + "grad_norm": 0.48597341503078534, + "learning_rate": 2.5763669722441226e-06, + "loss": 0.1819, + "step": 2289 + }, + { + "epoch": 0.5159546005013096, + "grad_norm": 0.4245129618582946, + "learning_rate": 2.5745049216892286e-06, + "loss": 0.1562, + "step": 2290 + }, + { + "epoch": 0.516179908187118, + "grad_norm": 0.5155032811816632, + "learning_rate": 2.5726428297645285e-06, + "loss": 0.1727, + "step": 2291 + }, + { + "epoch": 0.5164052158729264, + "grad_norm": 0.4843500508316912, + "learning_rate": 2.570780697503973e-06, + "loss": 0.1729, + "step": 2292 + }, + { + "epoch": 0.5166305235587348, + "grad_norm": 0.5132300511517351, + "learning_rate": 2.5689185259415346e-06, + "loss": 0.1903, + "step": 2293 + }, + { + "epoch": 0.5168558312445434, + "grad_norm": 0.4562650448324461, + "learning_rate": 2.5670563161112073e-06, + "loss": 0.1678, + "step": 2294 + }, + { + "epoch": 0.5170811389303518, + "grad_norm": 0.4706113326531671, + "learning_rate": 2.5651940690470074e-06, + "loss": 0.1795, + "step": 2295 + }, + { + "epoch": 0.5173064466161602, + "grad_norm": 0.5152700625334967, + "learning_rate": 2.56333178578297e-06, + "loss": 0.1856, + "step": 2296 + }, + { + "epoch": 0.5175317543019686, + "grad_norm": 0.4697927995005228, + "learning_rate": 2.5614694673531533e-06, + "loss": 0.1691, + "step": 2297 + }, + { + "epoch": 0.517757061987777, + "grad_norm": 0.43529699586831366, + "learning_rate": 2.5596071147916325e-06, + "loss": 0.1573, + "step": 2298 + }, + { + "epoch": 0.5179823696735855, + "grad_norm": 0.5013567605044023, + "learning_rate": 2.557744729132503e-06, + "loss": 0.1853, + "step": 2299 + }, + { + "epoch": 0.5182076773593939, + "grad_norm": 0.47297929007166867, + "learning_rate": 2.555882311409878e-06, + "loss": 0.183, + "step": 2300 + }, + { + "epoch": 0.5184329850452023, + "grad_norm": 0.4582972451073558, + "learning_rate": 2.554019862657888e-06, + "loss": 0.1741, + "step": 2301 + }, + { + "epoch": 0.5186582927310108, + "grad_norm": 0.46653281435476707, + "learning_rate": 2.5521573839106815e-06, + "loss": 0.1712, + "step": 2302 + }, + { + "epoch": 0.5188836004168192, + "grad_norm": 0.47828112042737436, + "learning_rate": 2.5502948762024244e-06, + "loss": 0.1686, + "step": 2303 + }, + { + "epoch": 0.5191089081026277, + "grad_norm": 0.48874471839104267, + "learning_rate": 2.5484323405672965e-06, + "loss": 0.1876, + "step": 2304 + }, + { + "epoch": 0.5193342157884361, + "grad_norm": 0.49841948700295774, + "learning_rate": 2.546569778039496e-06, + "loss": 0.1733, + "step": 2305 + }, + { + "epoch": 0.5195595234742445, + "grad_norm": 0.47247538874134204, + "learning_rate": 2.544707189653233e-06, + "loss": 0.1767, + "step": 2306 + }, + { + "epoch": 0.5197848311600529, + "grad_norm": 0.46076560732597105, + "learning_rate": 2.542844576442734e-06, + "loss": 0.1748, + "step": 2307 + }, + { + "epoch": 0.5200101388458613, + "grad_norm": 0.4591748195831405, + "learning_rate": 2.5409819394422386e-06, + "loss": 0.1661, + "step": 2308 + }, + { + "epoch": 0.5202354465316699, + "grad_norm": 0.47052853474510675, + "learning_rate": 2.539119279686001e-06, + "loss": 0.1562, + "step": 2309 + }, + { + "epoch": 0.5204607542174783, + "grad_norm": 0.45289567235953443, + "learning_rate": 2.5372565982082843e-06, + "loss": 0.1784, + "step": 2310 + }, + { + "epoch": 0.5206860619032867, + "grad_norm": 0.4950180307603958, + "learning_rate": 2.535393896043368e-06, + "loss": 0.1942, + "step": 2311 + }, + { + "epoch": 0.5209113695890951, + "grad_norm": 0.4795840768950289, + "learning_rate": 2.5335311742255392e-06, + "loss": 0.1883, + "step": 2312 + }, + { + "epoch": 0.5211366772749035, + "grad_norm": 0.4352019160622489, + "learning_rate": 2.5316684337891005e-06, + "loss": 0.1706, + "step": 2313 + }, + { + "epoch": 0.521361984960712, + "grad_norm": 0.5639635654459939, + "learning_rate": 2.5298056757683604e-06, + "loss": 0.1989, + "step": 2314 + }, + { + "epoch": 0.5215872926465204, + "grad_norm": 0.45786370383450886, + "learning_rate": 2.52794290119764e-06, + "loss": 0.1744, + "step": 2315 + }, + { + "epoch": 0.5218126003323288, + "grad_norm": 0.5041847150551114, + "learning_rate": 2.5260801111112677e-06, + "loss": 0.1973, + "step": 2316 + }, + { + "epoch": 0.5220379080181373, + "grad_norm": 0.49412549185338067, + "learning_rate": 2.5242173065435815e-06, + "loss": 0.1664, + "step": 2317 + }, + { + "epoch": 0.5222632157039457, + "grad_norm": 0.4188983532239631, + "learning_rate": 2.5223544885289287e-06, + "loss": 0.1525, + "step": 2318 + }, + { + "epoch": 0.5224885233897542, + "grad_norm": 0.45166378640865584, + "learning_rate": 2.5204916581016608e-06, + "loss": 0.167, + "step": 2319 + }, + { + "epoch": 0.5227138310755626, + "grad_norm": 0.4530323028317828, + "learning_rate": 2.518628816296139e-06, + "loss": 0.177, + "step": 2320 + }, + { + "epoch": 0.522939138761371, + "grad_norm": 0.443190117434506, + "learning_rate": 2.5167659641467302e-06, + "loss": 0.167, + "step": 2321 + }, + { + "epoch": 0.5231644464471794, + "grad_norm": 0.4898811281446569, + "learning_rate": 2.5149031026878063e-06, + "loss": 0.1731, + "step": 2322 + }, + { + "epoch": 0.5233897541329878, + "grad_norm": 0.3951877557379856, + "learning_rate": 2.5130402329537444e-06, + "loss": 0.1423, + "step": 2323 + }, + { + "epoch": 0.5236150618187962, + "grad_norm": 0.4831147175419459, + "learning_rate": 2.5111773559789277e-06, + "loss": 0.1897, + "step": 2324 + }, + { + "epoch": 0.5238403695046048, + "grad_norm": 0.44114811151283884, + "learning_rate": 2.509314472797742e-06, + "loss": 0.1671, + "step": 2325 + }, + { + "epoch": 0.5240656771904132, + "grad_norm": 0.4414312903866855, + "learning_rate": 2.5074515844445774e-06, + "loss": 0.1682, + "step": 2326 + }, + { + "epoch": 0.5242909848762216, + "grad_norm": 0.48786764374464403, + "learning_rate": 2.5055886919538247e-06, + "loss": 0.2103, + "step": 2327 + }, + { + "epoch": 0.52451629256203, + "grad_norm": 0.4680949513473278, + "learning_rate": 2.50372579635988e-06, + "loss": 0.1702, + "step": 2328 + }, + { + "epoch": 0.5247416002478384, + "grad_norm": 0.42334216437072, + "learning_rate": 2.5018628986971395e-06, + "loss": 0.1528, + "step": 2329 + }, + { + "epoch": 0.5249669079336469, + "grad_norm": 0.4478148766002246, + "learning_rate": 2.5e-06, + "loss": 0.1645, + "step": 2330 + }, + { + "epoch": 0.5251922156194553, + "grad_norm": 0.48487042486815846, + "learning_rate": 2.4981371013028618e-06, + "loss": 0.1913, + "step": 2331 + }, + { + "epoch": 0.5254175233052637, + "grad_norm": 0.44602779542566684, + "learning_rate": 2.4962742036401213e-06, + "loss": 0.1674, + "step": 2332 + }, + { + "epoch": 0.5256428309910722, + "grad_norm": 0.45365066825246536, + "learning_rate": 2.494411308046176e-06, + "loss": 0.166, + "step": 2333 + }, + { + "epoch": 0.5258681386768806, + "grad_norm": 0.46425080154301407, + "learning_rate": 2.4925484155554235e-06, + "loss": 0.1799, + "step": 2334 + }, + { + "epoch": 0.5260934463626891, + "grad_norm": 0.4977441090036592, + "learning_rate": 2.490685527202258e-06, + "loss": 0.201, + "step": 2335 + }, + { + "epoch": 0.5263187540484975, + "grad_norm": 0.4617983809629186, + "learning_rate": 2.4888226440210723e-06, + "loss": 0.1724, + "step": 2336 + }, + { + "epoch": 0.5265440617343059, + "grad_norm": 0.453220198686478, + "learning_rate": 2.4869597670462555e-06, + "loss": 0.1744, + "step": 2337 + }, + { + "epoch": 0.5267693694201143, + "grad_norm": 0.4745171480574094, + "learning_rate": 2.4850968973121945e-06, + "loss": 0.1855, + "step": 2338 + }, + { + "epoch": 0.5269946771059227, + "grad_norm": 0.47671991331349084, + "learning_rate": 2.483234035853271e-06, + "loss": 0.1753, + "step": 2339 + }, + { + "epoch": 0.5272199847917312, + "grad_norm": 0.4590992016969129, + "learning_rate": 2.481371183703862e-06, + "loss": 0.1607, + "step": 2340 + }, + { + "epoch": 0.5274452924775397, + "grad_norm": 0.4594476394853163, + "learning_rate": 2.4795083418983405e-06, + "loss": 0.1796, + "step": 2341 + }, + { + "epoch": 0.5276706001633481, + "grad_norm": 0.46721143404773546, + "learning_rate": 2.477645511471073e-06, + "loss": 0.1797, + "step": 2342 + }, + { + "epoch": 0.5278959078491565, + "grad_norm": 0.42392295644545946, + "learning_rate": 2.475782693456419e-06, + "loss": 0.153, + "step": 2343 + }, + { + "epoch": 0.5281212155349649, + "grad_norm": 0.45089144766560346, + "learning_rate": 2.473919888888733e-06, + "loss": 0.1533, + "step": 2344 + }, + { + "epoch": 0.5283465232207734, + "grad_norm": 0.4814768396165282, + "learning_rate": 2.472057098802361e-06, + "loss": 0.1767, + "step": 2345 + }, + { + "epoch": 0.5285718309065818, + "grad_norm": 0.4611016371667415, + "learning_rate": 2.4701943242316405e-06, + "loss": 0.1632, + "step": 2346 + }, + { + "epoch": 0.5287971385923902, + "grad_norm": 0.5201895988292503, + "learning_rate": 2.4683315662109003e-06, + "loss": 0.1983, + "step": 2347 + }, + { + "epoch": 0.5290224462781986, + "grad_norm": 0.3949726130072961, + "learning_rate": 2.466468825774461e-06, + "loss": 0.15, + "step": 2348 + }, + { + "epoch": 0.5292477539640071, + "grad_norm": 0.4416129481143616, + "learning_rate": 2.464606103956633e-06, + "loss": 0.1648, + "step": 2349 + }, + { + "epoch": 0.5294730616498156, + "grad_norm": 0.4589830798831805, + "learning_rate": 2.462743401791716e-06, + "loss": 0.1871, + "step": 2350 + }, + { + "epoch": 0.529698369335624, + "grad_norm": 0.4962273108419955, + "learning_rate": 2.460880720314e-06, + "loss": 0.1777, + "step": 2351 + }, + { + "epoch": 0.5299236770214324, + "grad_norm": 0.4758377394826265, + "learning_rate": 2.4590180605577614e-06, + "loss": 0.1759, + "step": 2352 + }, + { + "epoch": 0.5301489847072408, + "grad_norm": 0.4629910329882233, + "learning_rate": 2.4571554235572665e-06, + "loss": 0.1711, + "step": 2353 + }, + { + "epoch": 0.5303742923930492, + "grad_norm": 0.4417972807412128, + "learning_rate": 2.4552928103467677e-06, + "loss": 0.1736, + "step": 2354 + }, + { + "epoch": 0.5305996000788576, + "grad_norm": 0.4685826192356472, + "learning_rate": 2.4534302219605044e-06, + "loss": 0.1706, + "step": 2355 + }, + { + "epoch": 0.5308249077646662, + "grad_norm": 0.4341014874906131, + "learning_rate": 2.4515676594327035e-06, + "loss": 0.1662, + "step": 2356 + }, + { + "epoch": 0.5310502154504746, + "grad_norm": 0.4737221621115245, + "learning_rate": 2.4497051237975773e-06, + "loss": 0.1732, + "step": 2357 + }, + { + "epoch": 0.531275523136283, + "grad_norm": 0.4970480417924003, + "learning_rate": 2.4478426160893197e-06, + "loss": 0.1858, + "step": 2358 + }, + { + "epoch": 0.5315008308220914, + "grad_norm": 0.4614065853203992, + "learning_rate": 2.4459801373421134e-06, + "loss": 0.169, + "step": 2359 + }, + { + "epoch": 0.5317261385078998, + "grad_norm": 0.453311605188678, + "learning_rate": 2.4441176885901234e-06, + "loss": 0.1702, + "step": 2360 + }, + { + "epoch": 0.5319514461937083, + "grad_norm": 0.4845793459138786, + "learning_rate": 2.4422552708674977e-06, + "loss": 0.1673, + "step": 2361 + }, + { + "epoch": 0.5321767538795167, + "grad_norm": 0.46977146251230795, + "learning_rate": 2.440392885208368e-06, + "loss": 0.1642, + "step": 2362 + }, + { + "epoch": 0.5324020615653251, + "grad_norm": 0.48802448384280056, + "learning_rate": 2.4385305326468475e-06, + "loss": 0.17, + "step": 2363 + }, + { + "epoch": 0.5326273692511336, + "grad_norm": 0.45181082267524225, + "learning_rate": 2.436668214217031e-06, + "loss": 0.1597, + "step": 2364 + }, + { + "epoch": 0.532852676936942, + "grad_norm": 0.46116330293942, + "learning_rate": 2.4348059309529935e-06, + "loss": 0.1751, + "step": 2365 + }, + { + "epoch": 0.5330779846227505, + "grad_norm": 0.4429328330919978, + "learning_rate": 2.4329436838887936e-06, + "loss": 0.162, + "step": 2366 + }, + { + "epoch": 0.5333032923085589, + "grad_norm": 0.4790109520966867, + "learning_rate": 2.4310814740584663e-06, + "loss": 0.1787, + "step": 2367 + }, + { + "epoch": 0.5335285999943673, + "grad_norm": 0.4501594851512234, + "learning_rate": 2.4292193024960275e-06, + "loss": 0.1663, + "step": 2368 + }, + { + "epoch": 0.5337539076801757, + "grad_norm": 0.4331762471374425, + "learning_rate": 2.427357170235472e-06, + "loss": 0.1489, + "step": 2369 + }, + { + "epoch": 0.5339792153659841, + "grad_norm": 0.4507478665724284, + "learning_rate": 2.425495078310772e-06, + "loss": 0.1643, + "step": 2370 + }, + { + "epoch": 0.5342045230517926, + "grad_norm": 0.46171575554427935, + "learning_rate": 2.4236330277558774e-06, + "loss": 0.1865, + "step": 2371 + }, + { + "epoch": 0.5344298307376011, + "grad_norm": 0.43877684906434145, + "learning_rate": 2.4217710196047166e-06, + "loss": 0.1776, + "step": 2372 + }, + { + "epoch": 0.5346551384234095, + "grad_norm": 0.4461235179872465, + "learning_rate": 2.419909054891193e-06, + "loss": 0.1692, + "step": 2373 + }, + { + "epoch": 0.5348804461092179, + "grad_norm": 0.49968847828811536, + "learning_rate": 2.4180471346491864e-06, + "loss": 0.1676, + "step": 2374 + }, + { + "epoch": 0.5351057537950263, + "grad_norm": 0.43712704864054586, + "learning_rate": 2.4161852599125504e-06, + "loss": 0.1713, + "step": 2375 + }, + { + "epoch": 0.5353310614808348, + "grad_norm": 0.4102969624927177, + "learning_rate": 2.414323431715115e-06, + "loss": 0.1513, + "step": 2376 + }, + { + "epoch": 0.5355563691666432, + "grad_norm": 0.45493332376556395, + "learning_rate": 2.412461651090685e-06, + "loss": 0.1625, + "step": 2377 + }, + { + "epoch": 0.5357816768524516, + "grad_norm": 0.4380991457294616, + "learning_rate": 2.410599919073037e-06, + "loss": 0.1684, + "step": 2378 + }, + { + "epoch": 0.53600698453826, + "grad_norm": 0.44848519212258064, + "learning_rate": 2.408738236695922e-06, + "loss": 0.1767, + "step": 2379 + }, + { + "epoch": 0.5362322922240685, + "grad_norm": 0.4736768374539415, + "learning_rate": 2.4068766049930623e-06, + "loss": 0.1771, + "step": 2380 + }, + { + "epoch": 0.536457599909877, + "grad_norm": 0.47830649364623495, + "learning_rate": 2.4050150249981522e-06, + "loss": 0.1845, + "step": 2381 + }, + { + "epoch": 0.5366829075956854, + "grad_norm": 0.4875633576998744, + "learning_rate": 2.403153497744859e-06, + "loss": 0.1841, + "step": 2382 + }, + { + "epoch": 0.5369082152814938, + "grad_norm": 0.4260527425637645, + "learning_rate": 2.4012920242668184e-06, + "loss": 0.1683, + "step": 2383 + }, + { + "epoch": 0.5371335229673022, + "grad_norm": 0.4915340964347304, + "learning_rate": 2.3994306055976374e-06, + "loss": 0.1861, + "step": 2384 + }, + { + "epoch": 0.5373588306531106, + "grad_norm": 0.46620137012980417, + "learning_rate": 2.397569242770893e-06, + "loss": 0.1828, + "step": 2385 + }, + { + "epoch": 0.537584138338919, + "grad_norm": 0.4539779748890486, + "learning_rate": 2.3957079368201293e-06, + "loss": 0.1671, + "step": 2386 + }, + { + "epoch": 0.5378094460247275, + "grad_norm": 0.4375980276966808, + "learning_rate": 2.393846688778861e-06, + "loss": 0.1571, + "step": 2387 + }, + { + "epoch": 0.538034753710536, + "grad_norm": 0.46622363457100846, + "learning_rate": 2.39198549968057e-06, + "loss": 0.1914, + "step": 2388 + }, + { + "epoch": 0.5382600613963444, + "grad_norm": 0.4360994766668747, + "learning_rate": 2.390124370558705e-06, + "loss": 0.1665, + "step": 2389 + }, + { + "epoch": 0.5384853690821528, + "grad_norm": 0.4884165143494655, + "learning_rate": 2.3882633024466813e-06, + "loss": 0.1909, + "step": 2390 + }, + { + "epoch": 0.5387106767679612, + "grad_norm": 0.4362139267516624, + "learning_rate": 2.386402296377881e-06, + "loss": 0.1696, + "step": 2391 + }, + { + "epoch": 0.5389359844537697, + "grad_norm": 0.44759511462581986, + "learning_rate": 2.3845413533856517e-06, + "loss": 0.1581, + "step": 2392 + }, + { + "epoch": 0.5391612921395781, + "grad_norm": 0.4746282020181777, + "learning_rate": 2.3826804745033046e-06, + "loss": 0.1827, + "step": 2393 + }, + { + "epoch": 0.5393865998253865, + "grad_norm": 0.4837445016210145, + "learning_rate": 2.3808196607641176e-06, + "loss": 0.1755, + "step": 2394 + }, + { + "epoch": 0.5396119075111949, + "grad_norm": 0.4852592712330177, + "learning_rate": 2.3789589132013304e-06, + "loss": 0.1798, + "step": 2395 + }, + { + "epoch": 0.5398372151970035, + "grad_norm": 0.4679601245423019, + "learning_rate": 2.3770982328481464e-06, + "loss": 0.1868, + "step": 2396 + }, + { + "epoch": 0.5400625228828119, + "grad_norm": 0.45832315042158955, + "learning_rate": 2.3752376207377333e-06, + "loss": 0.1832, + "step": 2397 + }, + { + "epoch": 0.5402878305686203, + "grad_norm": 0.46274541668134184, + "learning_rate": 2.3733770779032185e-06, + "loss": 0.1795, + "step": 2398 + }, + { + "epoch": 0.5405131382544287, + "grad_norm": 0.5164319607147908, + "learning_rate": 2.371516605377693e-06, + "loss": 0.1887, + "step": 2399 + }, + { + "epoch": 0.5407384459402371, + "grad_norm": 0.4559215820648335, + "learning_rate": 2.3696562041942076e-06, + "loss": 0.1814, + "step": 2400 + }, + { + "epoch": 0.5409637536260455, + "grad_norm": 0.4605507668522533, + "learning_rate": 2.367795875385773e-06, + "loss": 0.1749, + "step": 2401 + }, + { + "epoch": 0.541189061311854, + "grad_norm": 0.4362738614697804, + "learning_rate": 2.3659356199853617e-06, + "loss": 0.1548, + "step": 2402 + }, + { + "epoch": 0.5414143689976625, + "grad_norm": 0.4615797929914004, + "learning_rate": 2.3640754390259026e-06, + "loss": 0.1766, + "step": 2403 + }, + { + "epoch": 0.5416396766834709, + "grad_norm": 0.43723201497167924, + "learning_rate": 2.362215333540287e-06, + "loss": 0.1757, + "step": 2404 + }, + { + "epoch": 0.5418649843692793, + "grad_norm": 0.47049701319695275, + "learning_rate": 2.36035530456136e-06, + "loss": 0.1724, + "step": 2405 + }, + { + "epoch": 0.5420902920550877, + "grad_norm": 0.45635760501607836, + "learning_rate": 2.3584953531219278e-06, + "loss": 0.1699, + "step": 2406 + }, + { + "epoch": 0.5423155997408962, + "grad_norm": 0.49081073832313143, + "learning_rate": 2.356635480254751e-06, + "loss": 0.1859, + "step": 2407 + }, + { + "epoch": 0.5425409074267046, + "grad_norm": 0.5826758237219053, + "learning_rate": 2.3547756869925485e-06, + "loss": 0.1672, + "step": 2408 + }, + { + "epoch": 0.542766215112513, + "grad_norm": 0.4986683231676404, + "learning_rate": 2.3529159743679936e-06, + "loss": 0.1813, + "step": 2409 + }, + { + "epoch": 0.5429915227983214, + "grad_norm": 0.45338638663528713, + "learning_rate": 2.3510563434137175e-06, + "loss": 0.1599, + "step": 2410 + }, + { + "epoch": 0.5432168304841299, + "grad_norm": 0.4757239619418053, + "learning_rate": 2.3491967951623006e-06, + "loss": 0.1595, + "step": 2411 + }, + { + "epoch": 0.5434421381699384, + "grad_norm": 0.44551404086377977, + "learning_rate": 2.347337330646282e-06, + "loss": 0.1719, + "step": 2412 + }, + { + "epoch": 0.5436674458557468, + "grad_norm": 0.4303504925658134, + "learning_rate": 2.3454779508981536e-06, + "loss": 0.1631, + "step": 2413 + }, + { + "epoch": 0.5438927535415552, + "grad_norm": 0.4734270395361259, + "learning_rate": 2.3436186569503598e-06, + "loss": 0.1728, + "step": 2414 + }, + { + "epoch": 0.5441180612273636, + "grad_norm": 0.4757430807735444, + "learning_rate": 2.341759449835297e-06, + "loss": 0.1756, + "step": 2415 + }, + { + "epoch": 0.544343368913172, + "grad_norm": 0.4518243291159189, + "learning_rate": 2.339900330585313e-06, + "loss": 0.1768, + "step": 2416 + }, + { + "epoch": 0.5445686765989805, + "grad_norm": 0.41466139694517934, + "learning_rate": 2.338041300232708e-06, + "loss": 0.1558, + "step": 2417 + }, + { + "epoch": 0.5447939842847889, + "grad_norm": 0.4387645357978409, + "learning_rate": 2.3361823598097316e-06, + "loss": 0.1668, + "step": 2418 + }, + { + "epoch": 0.5450192919705974, + "grad_norm": 0.4522237774944955, + "learning_rate": 2.334323510348585e-06, + "loss": 0.1766, + "step": 2419 + }, + { + "epoch": 0.5452445996564058, + "grad_norm": 0.46765468379384356, + "learning_rate": 2.332464752881418e-06, + "loss": 0.181, + "step": 2420 + }, + { + "epoch": 0.5454699073422142, + "grad_norm": 0.4720796353585363, + "learning_rate": 2.330606088440329e-06, + "loss": 0.169, + "step": 2421 + }, + { + "epoch": 0.5456952150280227, + "grad_norm": 0.4731526500922286, + "learning_rate": 2.3287475180573653e-06, + "loss": 0.1773, + "step": 2422 + }, + { + "epoch": 0.5459205227138311, + "grad_norm": 0.4255890485186262, + "learning_rate": 2.3268890427645213e-06, + "loss": 0.1594, + "step": 2423 + }, + { + "epoch": 0.5461458303996395, + "grad_norm": 0.431182882225161, + "learning_rate": 2.3250306635937385e-06, + "loss": 0.1577, + "step": 2424 + }, + { + "epoch": 0.5463711380854479, + "grad_norm": 0.44826404732225417, + "learning_rate": 2.323172381576907e-06, + "loss": 0.1701, + "step": 2425 + }, + { + "epoch": 0.5465964457712563, + "grad_norm": 0.4564394170747981, + "learning_rate": 2.3213141977458615e-06, + "loss": 0.1599, + "step": 2426 + }, + { + "epoch": 0.5468217534570649, + "grad_norm": 0.468807637578165, + "learning_rate": 2.3194561131323823e-06, + "loss": 0.1764, + "step": 2427 + }, + { + "epoch": 0.5470470611428733, + "grad_norm": 0.4648051007002754, + "learning_rate": 2.3175981287681924e-06, + "loss": 0.1814, + "step": 2428 + }, + { + "epoch": 0.5472723688286817, + "grad_norm": 0.45855019957030524, + "learning_rate": 2.3157402456849632e-06, + "loss": 0.1738, + "step": 2429 + }, + { + "epoch": 0.5474976765144901, + "grad_norm": 0.4731766590937873, + "learning_rate": 2.3138824649143076e-06, + "loss": 0.173, + "step": 2430 + }, + { + "epoch": 0.5477229842002985, + "grad_norm": 0.4311090571500072, + "learning_rate": 2.312024787487782e-06, + "loss": 0.1705, + "step": 2431 + }, + { + "epoch": 0.5479482918861069, + "grad_norm": 0.4497419291390194, + "learning_rate": 2.310167214436885e-06, + "loss": 0.1701, + "step": 2432 + }, + { + "epoch": 0.5481735995719154, + "grad_norm": 0.46248747292271064, + "learning_rate": 2.3083097467930583e-06, + "loss": 0.1744, + "step": 2433 + }, + { + "epoch": 0.5483989072577238, + "grad_norm": 0.4748658453585637, + "learning_rate": 2.306452385587683e-06, + "loss": 0.1715, + "step": 2434 + }, + { + "epoch": 0.5486242149435323, + "grad_norm": 0.47334739791324204, + "learning_rate": 2.304595131852085e-06, + "loss": 0.1771, + "step": 2435 + }, + { + "epoch": 0.5488495226293407, + "grad_norm": 0.4689141322890139, + "learning_rate": 2.3027379866175263e-06, + "loss": 0.1841, + "step": 2436 + }, + { + "epoch": 0.5490748303151491, + "grad_norm": 0.4420449587364503, + "learning_rate": 2.300880950915211e-06, + "loss": 0.1662, + "step": 2437 + }, + { + "epoch": 0.5493001380009576, + "grad_norm": 0.4566295511857543, + "learning_rate": 2.2990240257762817e-06, + "loss": 0.1735, + "step": 2438 + }, + { + "epoch": 0.549525445686766, + "grad_norm": 0.4172639116036727, + "learning_rate": 2.2971672122318196e-06, + "loss": 0.1444, + "step": 2439 + }, + { + "epoch": 0.5497507533725744, + "grad_norm": 0.4306080268803443, + "learning_rate": 2.295310511312844e-06, + "loss": 0.1644, + "step": 2440 + }, + { + "epoch": 0.5499760610583828, + "grad_norm": 0.47070895636710386, + "learning_rate": 2.293453924050312e-06, + "loss": 0.176, + "step": 2441 + }, + { + "epoch": 0.5502013687441912, + "grad_norm": 0.47113733890438, + "learning_rate": 2.2915974514751173e-06, + "loss": 0.1795, + "step": 2442 + }, + { + "epoch": 0.5504266764299998, + "grad_norm": 0.4546442646778374, + "learning_rate": 2.2897410946180897e-06, + "loss": 0.1778, + "step": 2443 + }, + { + "epoch": 0.5506519841158082, + "grad_norm": 0.4561106118360563, + "learning_rate": 2.287884854509995e-06, + "loss": 0.1683, + "step": 2444 + }, + { + "epoch": 0.5508772918016166, + "grad_norm": 0.46333631358844674, + "learning_rate": 2.286028732181535e-06, + "loss": 0.183, + "step": 2445 + }, + { + "epoch": 0.551102599487425, + "grad_norm": 0.46293513857738805, + "learning_rate": 2.2841727286633444e-06, + "loss": 0.1671, + "step": 2446 + }, + { + "epoch": 0.5513279071732334, + "grad_norm": 0.45762304657294156, + "learning_rate": 2.282316844985992e-06, + "loss": 0.1639, + "step": 2447 + }, + { + "epoch": 0.5515532148590419, + "grad_norm": 0.4665745390050163, + "learning_rate": 2.280461082179982e-06, + "loss": 0.1736, + "step": 2448 + }, + { + "epoch": 0.5517785225448503, + "grad_norm": 0.463709317351158, + "learning_rate": 2.27860544127575e-06, + "loss": 0.1809, + "step": 2449 + }, + { + "epoch": 0.5520038302306588, + "grad_norm": 0.4312370826783428, + "learning_rate": 2.2767499233036635e-06, + "loss": 0.1635, + "step": 2450 + }, + { + "epoch": 0.5522291379164672, + "grad_norm": 0.4640744696036262, + "learning_rate": 2.2748945292940237e-06, + "loss": 0.169, + "step": 2451 + }, + { + "epoch": 0.5524544456022756, + "grad_norm": 0.46002128988026153, + "learning_rate": 2.2730392602770617e-06, + "loss": 0.1682, + "step": 2452 + }, + { + "epoch": 0.552679753288084, + "grad_norm": 0.45550002299181624, + "learning_rate": 2.271184117282938e-06, + "loss": 0.1641, + "step": 2453 + }, + { + "epoch": 0.5529050609738925, + "grad_norm": 0.4752640990279706, + "learning_rate": 2.269329101341745e-06, + "loss": 0.1836, + "step": 2454 + }, + { + "epoch": 0.5531303686597009, + "grad_norm": 0.43963830803243287, + "learning_rate": 2.267474213483505e-06, + "loss": 0.1575, + "step": 2455 + }, + { + "epoch": 0.5533556763455093, + "grad_norm": 0.4512720818736478, + "learning_rate": 2.265619454738166e-06, + "loss": 0.1654, + "step": 2456 + }, + { + "epoch": 0.5535809840313177, + "grad_norm": 0.4697756311290459, + "learning_rate": 2.2637648261356078e-06, + "loss": 0.1789, + "step": 2457 + }, + { + "epoch": 0.5538062917171263, + "grad_norm": 0.4444517481258749, + "learning_rate": 2.2619103287056366e-06, + "loss": 0.1738, + "step": 2458 + }, + { + "epoch": 0.5540315994029347, + "grad_norm": 0.4563357938417464, + "learning_rate": 2.260055963477985e-06, + "loss": 0.1785, + "step": 2459 + }, + { + "epoch": 0.5542569070887431, + "grad_norm": 0.4360454486370432, + "learning_rate": 2.2582017314823135e-06, + "loss": 0.1713, + "step": 2460 + }, + { + "epoch": 0.5544822147745515, + "grad_norm": 0.44430341626688435, + "learning_rate": 2.2563476337482073e-06, + "loss": 0.1701, + "step": 2461 + }, + { + "epoch": 0.5547075224603599, + "grad_norm": 0.44900084672365126, + "learning_rate": 2.254493671305179e-06, + "loss": 0.1753, + "step": 2462 + }, + { + "epoch": 0.5549328301461683, + "grad_norm": 0.4782378043685235, + "learning_rate": 2.2526398451826638e-06, + "loss": 0.1786, + "step": 2463 + }, + { + "epoch": 0.5551581378319768, + "grad_norm": 0.44203709204437974, + "learning_rate": 2.250786156410022e-06, + "loss": 0.1737, + "step": 2464 + }, + { + "epoch": 0.5553834455177852, + "grad_norm": 0.4745143323385081, + "learning_rate": 2.2489326060165384e-06, + "loss": 0.1843, + "step": 2465 + }, + { + "epoch": 0.5556087532035937, + "grad_norm": 0.4568732003691151, + "learning_rate": 2.24707919503142e-06, + "loss": 0.1778, + "step": 2466 + }, + { + "epoch": 0.5558340608894021, + "grad_norm": 0.4147805595386346, + "learning_rate": 2.2452259244837974e-06, + "loss": 0.1549, + "step": 2467 + }, + { + "epoch": 0.5560593685752105, + "grad_norm": 0.40380560634718426, + "learning_rate": 2.2433727954027227e-06, + "loss": 0.1538, + "step": 2468 + }, + { + "epoch": 0.556284676261019, + "grad_norm": 0.43610935357428066, + "learning_rate": 2.24151980881717e-06, + "loss": 0.1697, + "step": 2469 + }, + { + "epoch": 0.5565099839468274, + "grad_norm": 0.4443229473210749, + "learning_rate": 2.239666965756032e-06, + "loss": 0.1645, + "step": 2470 + }, + { + "epoch": 0.5567352916326358, + "grad_norm": 0.4640652160098938, + "learning_rate": 2.237814267248124e-06, + "loss": 0.162, + "step": 2471 + }, + { + "epoch": 0.5569605993184442, + "grad_norm": 0.44437454640880975, + "learning_rate": 2.2359617143221805e-06, + "loss": 0.1749, + "step": 2472 + }, + { + "epoch": 0.5571859070042526, + "grad_norm": 0.43679439222011246, + "learning_rate": 2.234109308006856e-06, + "loss": 0.1659, + "step": 2473 + }, + { + "epoch": 0.5574112146900612, + "grad_norm": 0.45320011521194586, + "learning_rate": 2.232257049330722e-06, + "loss": 0.1553, + "step": 2474 + }, + { + "epoch": 0.5576365223758696, + "grad_norm": 0.4700068579798193, + "learning_rate": 2.2304049393222686e-06, + "loss": 0.1687, + "step": 2475 + }, + { + "epoch": 0.557861830061678, + "grad_norm": 0.4261935242445229, + "learning_rate": 2.2285529790099034e-06, + "loss": 0.1649, + "step": 2476 + }, + { + "epoch": 0.5580871377474864, + "grad_norm": 0.447684152357022, + "learning_rate": 2.2267011694219513e-06, + "loss": 0.1762, + "step": 2477 + }, + { + "epoch": 0.5583124454332948, + "grad_norm": 0.4695330107774392, + "learning_rate": 2.224849511586652e-06, + "loss": 0.1641, + "step": 2478 + }, + { + "epoch": 0.5585377531191033, + "grad_norm": 0.4480115275610278, + "learning_rate": 2.2229980065321636e-06, + "loss": 0.158, + "step": 2479 + }, + { + "epoch": 0.5587630608049117, + "grad_norm": 0.4538980098797459, + "learning_rate": 2.221146655286558e-06, + "loss": 0.1819, + "step": 2480 + }, + { + "epoch": 0.5589883684907201, + "grad_norm": 0.4504385810025041, + "learning_rate": 2.2192954588778195e-06, + "loss": 0.1769, + "step": 2481 + }, + { + "epoch": 0.5592136761765286, + "grad_norm": 0.45069779564859974, + "learning_rate": 2.21744441833385e-06, + "loss": 0.1716, + "step": 2482 + }, + { + "epoch": 0.559438983862337, + "grad_norm": 0.47508975240605905, + "learning_rate": 2.2155935346824634e-06, + "loss": 0.1817, + "step": 2483 + }, + { + "epoch": 0.5596642915481455, + "grad_norm": 0.4408804558222812, + "learning_rate": 2.2137428089513857e-06, + "loss": 0.1637, + "step": 2484 + }, + { + "epoch": 0.5598895992339539, + "grad_norm": 0.48465020187282243, + "learning_rate": 2.2118922421682563e-06, + "loss": 0.1797, + "step": 2485 + }, + { + "epoch": 0.5601149069197623, + "grad_norm": 0.44675786540006823, + "learning_rate": 2.2100418353606262e-06, + "loss": 0.1757, + "step": 2486 + }, + { + "epoch": 0.5603402146055707, + "grad_norm": 0.4453127467549628, + "learning_rate": 2.208191589555956e-06, + "loss": 0.1631, + "step": 2487 + }, + { + "epoch": 0.5605655222913791, + "grad_norm": 0.4763720892275746, + "learning_rate": 2.20634150578162e-06, + "loss": 0.1762, + "step": 2488 + }, + { + "epoch": 0.5607908299771875, + "grad_norm": 0.49414160526844575, + "learning_rate": 2.2044915850649e-06, + "loss": 0.1738, + "step": 2489 + }, + { + "epoch": 0.5610161376629961, + "grad_norm": 0.44794664243701754, + "learning_rate": 2.202641828432988e-06, + "loss": 0.1697, + "step": 2490 + }, + { + "epoch": 0.5612414453488045, + "grad_norm": 0.4683258352468174, + "learning_rate": 2.2007922369129854e-06, + "loss": 0.1851, + "step": 2491 + }, + { + "epoch": 0.5614667530346129, + "grad_norm": 0.44788537650068433, + "learning_rate": 2.1989428115319005e-06, + "loss": 0.1759, + "step": 2492 + }, + { + "epoch": 0.5616920607204213, + "grad_norm": 0.4617551931254518, + "learning_rate": 2.1970935533166505e-06, + "loss": 0.178, + "step": 2493 + }, + { + "epoch": 0.5619173684062297, + "grad_norm": 0.469335238391166, + "learning_rate": 2.19524446329406e-06, + "loss": 0.1637, + "step": 2494 + }, + { + "epoch": 0.5621426760920382, + "grad_norm": 0.4631549351647922, + "learning_rate": 2.1933955424908596e-06, + "loss": 0.1919, + "step": 2495 + }, + { + "epoch": 0.5623679837778466, + "grad_norm": 0.4162353760787374, + "learning_rate": 2.1915467919336862e-06, + "loss": 0.1509, + "step": 2496 + }, + { + "epoch": 0.562593291463655, + "grad_norm": 0.43383152962537525, + "learning_rate": 2.1896982126490825e-06, + "loss": 0.1707, + "step": 2497 + }, + { + "epoch": 0.5628185991494635, + "grad_norm": 0.4565119141114124, + "learning_rate": 2.1878498056634946e-06, + "loss": 0.18, + "step": 2498 + }, + { + "epoch": 0.563043906835272, + "grad_norm": 0.4754399262901138, + "learning_rate": 2.1860015720032747e-06, + "loss": 0.1763, + "step": 2499 + }, + { + "epoch": 0.5632692145210804, + "grad_norm": 0.4475146222347243, + "learning_rate": 2.1841535126946777e-06, + "loss": 0.1612, + "step": 2500 + }, + { + "epoch": 0.5632692145210804, + "eval_loss": 0.17249523103237152, + "eval_runtime": 56.8991, + "eval_samples_per_second": 50.44, + "eval_steps_per_second": 6.309, + "step": 2500 + }, + { + "epoch": 0.5634945222068888, + "grad_norm": 0.47569122833885047, + "learning_rate": 2.1823056287638623e-06, + "loss": 0.1873, + "step": 2501 + }, + { + "epoch": 0.5637198298926972, + "grad_norm": 0.46601066546351255, + "learning_rate": 2.180457921236889e-06, + "loss": 0.1675, + "step": 2502 + }, + { + "epoch": 0.5639451375785056, + "grad_norm": 0.46400456158561365, + "learning_rate": 2.1786103911397218e-06, + "loss": 0.174, + "step": 2503 + }, + { + "epoch": 0.564170445264314, + "grad_norm": 0.44351318332889395, + "learning_rate": 2.176763039498225e-06, + "loss": 0.1655, + "step": 2504 + }, + { + "epoch": 0.5643957529501226, + "grad_norm": 0.43553983461819756, + "learning_rate": 2.174915867338164e-06, + "loss": 0.1617, + "step": 2505 + }, + { + "epoch": 0.564621060635931, + "grad_norm": 0.4645859779969508, + "learning_rate": 2.1730688756852046e-06, + "loss": 0.1694, + "step": 2506 + }, + { + "epoch": 0.5648463683217394, + "grad_norm": 0.4778946751719882, + "learning_rate": 2.171222065564913e-06, + "loss": 0.1902, + "step": 2507 + }, + { + "epoch": 0.5650716760075478, + "grad_norm": 0.4517399390705507, + "learning_rate": 2.1693754380027533e-06, + "loss": 0.1768, + "step": 2508 + }, + { + "epoch": 0.5652969836933562, + "grad_norm": 0.4319938524269905, + "learning_rate": 2.1675289940240883e-06, + "loss": 0.1574, + "step": 2509 + }, + { + "epoch": 0.5655222913791647, + "grad_norm": 0.39065317934793764, + "learning_rate": 2.165682734654181e-06, + "loss": 0.146, + "step": 2510 + }, + { + "epoch": 0.5657475990649731, + "grad_norm": 0.4688171230813956, + "learning_rate": 2.16383666091819e-06, + "loss": 0.1706, + "step": 2511 + }, + { + "epoch": 0.5659729067507815, + "grad_norm": 0.4675966992365206, + "learning_rate": 2.161990773841171e-06, + "loss": 0.1733, + "step": 2512 + }, + { + "epoch": 0.56619821443659, + "grad_norm": 0.46566946607071596, + "learning_rate": 2.1601450744480763e-06, + "loss": 0.1759, + "step": 2513 + }, + { + "epoch": 0.5664235221223984, + "grad_norm": 0.5018084046643053, + "learning_rate": 2.1582995637637543e-06, + "loss": 0.1822, + "step": 2514 + }, + { + "epoch": 0.5666488298082069, + "grad_norm": 0.4453972565228735, + "learning_rate": 2.156454242812948e-06, + "loss": 0.1592, + "step": 2515 + }, + { + "epoch": 0.5668741374940153, + "grad_norm": 0.47162885826884643, + "learning_rate": 2.1546091126202955e-06, + "loss": 0.1855, + "step": 2516 + }, + { + "epoch": 0.5670994451798237, + "grad_norm": 0.43391872438066137, + "learning_rate": 2.1527641742103282e-06, + "loss": 0.1553, + "step": 2517 + }, + { + "epoch": 0.5673247528656321, + "grad_norm": 0.49024259519148045, + "learning_rate": 2.150919428607472e-06, + "loss": 0.1726, + "step": 2518 + }, + { + "epoch": 0.5675500605514405, + "grad_norm": 0.45502623106366774, + "learning_rate": 2.149074876836045e-06, + "loss": 0.1753, + "step": 2519 + }, + { + "epoch": 0.567775368237249, + "grad_norm": 0.4734314147689715, + "learning_rate": 2.147230519920259e-06, + "loss": 0.1912, + "step": 2520 + }, + { + "epoch": 0.5680006759230575, + "grad_norm": 0.4583822099373201, + "learning_rate": 2.1453863588842165e-06, + "loss": 0.1664, + "step": 2521 + }, + { + "epoch": 0.5682259836088659, + "grad_norm": 0.4540996526317524, + "learning_rate": 2.143542394751911e-06, + "loss": 0.1713, + "step": 2522 + }, + { + "epoch": 0.5684512912946743, + "grad_norm": 0.4760924494356611, + "learning_rate": 2.1416986285472268e-06, + "loss": 0.176, + "step": 2523 + }, + { + "epoch": 0.5686765989804827, + "grad_norm": 0.4698356524459828, + "learning_rate": 2.139855061293939e-06, + "loss": 0.1763, + "step": 2524 + }, + { + "epoch": 0.5689019066662911, + "grad_norm": 0.44190717756597203, + "learning_rate": 2.1380116940157107e-06, + "loss": 0.1737, + "step": 2525 + }, + { + "epoch": 0.5691272143520996, + "grad_norm": 0.4926884469777348, + "learning_rate": 2.1361685277360973e-06, + "loss": 0.1806, + "step": 2526 + }, + { + "epoch": 0.569352522037908, + "grad_norm": 0.45676163496506195, + "learning_rate": 2.1343255634785386e-06, + "loss": 0.1712, + "step": 2527 + }, + { + "epoch": 0.5695778297237164, + "grad_norm": 0.409760603773833, + "learning_rate": 2.132482802266364e-06, + "loss": 0.1385, + "step": 2528 + }, + { + "epoch": 0.5698031374095249, + "grad_norm": 0.44615936271050516, + "learning_rate": 2.1306402451227907e-06, + "loss": 0.172, + "step": 2529 + }, + { + "epoch": 0.5700284450953333, + "grad_norm": 0.4446192137679883, + "learning_rate": 2.128797893070922e-06, + "loss": 0.1708, + "step": 2530 + }, + { + "epoch": 0.5702537527811418, + "grad_norm": 0.4706091087079518, + "learning_rate": 2.1269557471337467e-06, + "loss": 0.1704, + "step": 2531 + }, + { + "epoch": 0.5704790604669502, + "grad_norm": 0.4738641507920694, + "learning_rate": 2.1251138083341404e-06, + "loss": 0.1725, + "step": 2532 + }, + { + "epoch": 0.5707043681527586, + "grad_norm": 0.4730597019121828, + "learning_rate": 2.123272077694864e-06, + "loss": 0.1639, + "step": 2533 + }, + { + "epoch": 0.570929675838567, + "grad_norm": 0.4524394608377241, + "learning_rate": 2.1214305562385592e-06, + "loss": 0.1737, + "step": 2534 + }, + { + "epoch": 0.5711549835243754, + "grad_norm": 0.44554759400518856, + "learning_rate": 2.1195892449877556e-06, + "loss": 0.1694, + "step": 2535 + }, + { + "epoch": 0.5713802912101839, + "grad_norm": 0.4794252039418334, + "learning_rate": 2.117748144964865e-06, + "loss": 0.1809, + "step": 2536 + }, + { + "epoch": 0.5716055988959924, + "grad_norm": 0.45755612616054175, + "learning_rate": 2.115907257192182e-06, + "loss": 0.1697, + "step": 2537 + }, + { + "epoch": 0.5718309065818008, + "grad_norm": 0.4298878873902357, + "learning_rate": 2.1140665826918823e-06, + "loss": 0.151, + "step": 2538 + }, + { + "epoch": 0.5720562142676092, + "grad_norm": 0.544846788525287, + "learning_rate": 2.1122261224860237e-06, + "loss": 0.1881, + "step": 2539 + }, + { + "epoch": 0.5722815219534176, + "grad_norm": 0.45518534394481713, + "learning_rate": 2.1103858775965455e-06, + "loss": 0.1794, + "step": 2540 + }, + { + "epoch": 0.5725068296392261, + "grad_norm": 0.4590831133752798, + "learning_rate": 2.1085458490452663e-06, + "loss": 0.1758, + "step": 2541 + }, + { + "epoch": 0.5727321373250345, + "grad_norm": 0.4713475548181337, + "learning_rate": 2.106706037853887e-06, + "loss": 0.1639, + "step": 2542 + }, + { + "epoch": 0.5729574450108429, + "grad_norm": 0.4553584605455894, + "learning_rate": 2.1048664450439853e-06, + "loss": 0.162, + "step": 2543 + }, + { + "epoch": 0.5731827526966513, + "grad_norm": 0.4482727885737725, + "learning_rate": 2.103027071637018e-06, + "loss": 0.1649, + "step": 2544 + }, + { + "epoch": 0.5734080603824598, + "grad_norm": 0.44153676762477473, + "learning_rate": 2.101187918654321e-06, + "loss": 0.1573, + "step": 2545 + }, + { + "epoch": 0.5736333680682683, + "grad_norm": 0.4924742786959815, + "learning_rate": 2.099348987117108e-06, + "loss": 0.1844, + "step": 2546 + }, + { + "epoch": 0.5738586757540767, + "grad_norm": 0.46741438290322646, + "learning_rate": 2.0975102780464674e-06, + "loss": 0.1781, + "step": 2547 + }, + { + "epoch": 0.5740839834398851, + "grad_norm": 0.4977662058692273, + "learning_rate": 2.095671792463368e-06, + "loss": 0.1772, + "step": 2548 + }, + { + "epoch": 0.5743092911256935, + "grad_norm": 0.4689586729735219, + "learning_rate": 2.0938335313886513e-06, + "loss": 0.1743, + "step": 2549 + }, + { + "epoch": 0.5745345988115019, + "grad_norm": 0.4151397211129145, + "learning_rate": 2.0919954958430357e-06, + "loss": 0.1531, + "step": 2550 + }, + { + "epoch": 0.5747599064973103, + "grad_norm": 0.48449661990664983, + "learning_rate": 2.0901576868471125e-06, + "loss": 0.1816, + "step": 2551 + }, + { + "epoch": 0.5749852141831189, + "grad_norm": 0.41615432088837917, + "learning_rate": 2.0883201054213493e-06, + "loss": 0.1634, + "step": 2552 + }, + { + "epoch": 0.5752105218689273, + "grad_norm": 0.4245139328580003, + "learning_rate": 2.086482752586087e-06, + "loss": 0.1678, + "step": 2553 + }, + { + "epoch": 0.5754358295547357, + "grad_norm": 0.4440158325525177, + "learning_rate": 2.0846456293615384e-06, + "loss": 0.1621, + "step": 2554 + }, + { + "epoch": 0.5756611372405441, + "grad_norm": 0.4560782868664015, + "learning_rate": 2.0828087367677906e-06, + "loss": 0.175, + "step": 2555 + }, + { + "epoch": 0.5758864449263525, + "grad_norm": 0.4480116276072101, + "learning_rate": 2.0809720758247997e-06, + "loss": 0.1802, + "step": 2556 + }, + { + "epoch": 0.576111752612161, + "grad_norm": 0.5460996335559186, + "learning_rate": 2.0791356475523967e-06, + "loss": 0.1892, + "step": 2557 + }, + { + "epoch": 0.5763370602979694, + "grad_norm": 0.46935486617467453, + "learning_rate": 2.077299452970282e-06, + "loss": 0.1826, + "step": 2558 + }, + { + "epoch": 0.5765623679837778, + "grad_norm": 0.46663850838902876, + "learning_rate": 2.0754634930980245e-06, + "loss": 0.1735, + "step": 2559 + }, + { + "epoch": 0.5767876756695863, + "grad_norm": 0.4652334978548059, + "learning_rate": 2.0736277689550655e-06, + "loss": 0.1736, + "step": 2560 + }, + { + "epoch": 0.5770129833553947, + "grad_norm": 0.4481552043857453, + "learning_rate": 2.0717922815607134e-06, + "loss": 0.1755, + "step": 2561 + }, + { + "epoch": 0.5772382910412032, + "grad_norm": 0.45895527247744683, + "learning_rate": 2.069957031934147e-06, + "loss": 0.1684, + "step": 2562 + }, + { + "epoch": 0.5774635987270116, + "grad_norm": 0.4910364987422849, + "learning_rate": 2.0681220210944106e-06, + "loss": 0.1834, + "step": 2563 + }, + { + "epoch": 0.57768890641282, + "grad_norm": 0.4893436935185404, + "learning_rate": 2.066287250060418e-06, + "loss": 0.1839, + "step": 2564 + }, + { + "epoch": 0.5779142140986284, + "grad_norm": 0.4708826062537264, + "learning_rate": 2.06445271985095e-06, + "loss": 0.1803, + "step": 2565 + }, + { + "epoch": 0.5781395217844368, + "grad_norm": 0.4368039556577337, + "learning_rate": 2.062618431484652e-06, + "loss": 0.1646, + "step": 2566 + }, + { + "epoch": 0.5783648294702453, + "grad_norm": 0.47832114481273286, + "learning_rate": 2.060784385980036e-06, + "loss": 0.1697, + "step": 2567 + }, + { + "epoch": 0.5785901371560538, + "grad_norm": 0.4806285725250625, + "learning_rate": 2.05895058435548e-06, + "loss": 0.1555, + "step": 2568 + }, + { + "epoch": 0.5788154448418622, + "grad_norm": 0.4664225895764955, + "learning_rate": 2.0571170276292233e-06, + "loss": 0.1599, + "step": 2569 + }, + { + "epoch": 0.5790407525276706, + "grad_norm": 0.4957335098591225, + "learning_rate": 2.0552837168193738e-06, + "loss": 0.1714, + "step": 2570 + }, + { + "epoch": 0.579266060213479, + "grad_norm": 0.43042123667416254, + "learning_rate": 2.0534506529439e-06, + "loss": 0.1607, + "step": 2571 + }, + { + "epoch": 0.5794913678992875, + "grad_norm": 0.4763220368799048, + "learning_rate": 2.051617837020633e-06, + "loss": 0.1802, + "step": 2572 + }, + { + "epoch": 0.5797166755850959, + "grad_norm": 0.44989470522014086, + "learning_rate": 2.0497852700672692e-06, + "loss": 0.1591, + "step": 2573 + }, + { + "epoch": 0.5799419832709043, + "grad_norm": 0.5048424829988221, + "learning_rate": 2.047952953101363e-06, + "loss": 0.1818, + "step": 2574 + }, + { + "epoch": 0.5801672909567127, + "grad_norm": 0.4538869193710852, + "learning_rate": 2.0461208871403333e-06, + "loss": 0.1625, + "step": 2575 + }, + { + "epoch": 0.5803925986425212, + "grad_norm": 0.4628310873921318, + "learning_rate": 2.0442890732014563e-06, + "loss": 0.1659, + "step": 2576 + }, + { + "epoch": 0.5806179063283297, + "grad_norm": 0.42700669281461157, + "learning_rate": 2.042457512301871e-06, + "loss": 0.1552, + "step": 2577 + }, + { + "epoch": 0.5808432140141381, + "grad_norm": 0.46373476117441936, + "learning_rate": 2.040626205458574e-06, + "loss": 0.1748, + "step": 2578 + }, + { + "epoch": 0.5810685216999465, + "grad_norm": 0.4804395603630478, + "learning_rate": 2.038795153688423e-06, + "loss": 0.179, + "step": 2579 + }, + { + "epoch": 0.5812938293857549, + "grad_norm": 0.465793547974621, + "learning_rate": 2.0369643580081326e-06, + "loss": 0.1806, + "step": 2580 + }, + { + "epoch": 0.5815191370715633, + "grad_norm": 0.45722792520173156, + "learning_rate": 2.0351338194342744e-06, + "loss": 0.1664, + "step": 2581 + }, + { + "epoch": 0.5817444447573717, + "grad_norm": 0.429351609639069, + "learning_rate": 2.0333035389832795e-06, + "loss": 0.1572, + "step": 2582 + }, + { + "epoch": 0.5819697524431802, + "grad_norm": 0.49765439275709844, + "learning_rate": 2.0314735176714336e-06, + "loss": 0.1877, + "step": 2583 + }, + { + "epoch": 0.5821950601289887, + "grad_norm": 0.5042727467300419, + "learning_rate": 2.0296437565148786e-06, + "loss": 0.1895, + "step": 2584 + }, + { + "epoch": 0.5824203678147971, + "grad_norm": 0.46869866539513694, + "learning_rate": 2.0278142565296153e-06, + "loss": 0.1647, + "step": 2585 + }, + { + "epoch": 0.5826456755006055, + "grad_norm": 0.467213208048088, + "learning_rate": 2.025985018731494e-06, + "loss": 0.1833, + "step": 2586 + }, + { + "epoch": 0.582870983186414, + "grad_norm": 0.4434573751333856, + "learning_rate": 2.0241560441362235e-06, + "loss": 0.1702, + "step": 2587 + }, + { + "epoch": 0.5830962908722224, + "grad_norm": 0.4663438942791567, + "learning_rate": 2.0223273337593647e-06, + "loss": 0.1838, + "step": 2588 + }, + { + "epoch": 0.5833215985580308, + "grad_norm": 0.4837746412963172, + "learning_rate": 2.020498888616333e-06, + "loss": 0.1839, + "step": 2589 + }, + { + "epoch": 0.5835469062438392, + "grad_norm": 0.4777457108245768, + "learning_rate": 2.0186707097223952e-06, + "loss": 0.1732, + "step": 2590 + }, + { + "epoch": 0.5837722139296476, + "grad_norm": 0.4558213080480749, + "learning_rate": 2.016842798092672e-06, + "loss": 0.1674, + "step": 2591 + }, + { + "epoch": 0.5839975216154561, + "grad_norm": 0.454144388691166, + "learning_rate": 2.0150151547421333e-06, + "loss": 0.1686, + "step": 2592 + }, + { + "epoch": 0.5842228293012646, + "grad_norm": 0.4569169772082708, + "learning_rate": 2.013187780685602e-06, + "loss": 0.1679, + "step": 2593 + }, + { + "epoch": 0.584448136987073, + "grad_norm": 0.40864010171219806, + "learning_rate": 2.0113606769377497e-06, + "loss": 0.1661, + "step": 2594 + }, + { + "epoch": 0.5846734446728814, + "grad_norm": 0.4639706421897903, + "learning_rate": 2.009533844513101e-06, + "loss": 0.1782, + "step": 2595 + }, + { + "epoch": 0.5848987523586898, + "grad_norm": 0.4394357847039995, + "learning_rate": 2.0077072844260267e-06, + "loss": 0.1589, + "step": 2596 + }, + { + "epoch": 0.5851240600444982, + "grad_norm": 0.4805132736970225, + "learning_rate": 2.0058809976907475e-06, + "loss": 0.1683, + "step": 2597 + }, + { + "epoch": 0.5853493677303067, + "grad_norm": 0.4759425390968267, + "learning_rate": 2.0040549853213326e-06, + "loss": 0.1766, + "step": 2598 + }, + { + "epoch": 0.5855746754161152, + "grad_norm": 0.443384404671143, + "learning_rate": 2.0022292483316984e-06, + "loss": 0.1686, + "step": 2599 + }, + { + "epoch": 0.5857999831019236, + "grad_norm": 0.4765453397039307, + "learning_rate": 2.0004037877356085e-06, + "loss": 0.1708, + "step": 2600 + }, + { + "epoch": 0.586025290787732, + "grad_norm": 0.4838943365691617, + "learning_rate": 1.998578604546674e-06, + "loss": 0.169, + "step": 2601 + }, + { + "epoch": 0.5862505984735404, + "grad_norm": 0.47314622061784894, + "learning_rate": 1.9967536997783495e-06, + "loss": 0.1798, + "step": 2602 + }, + { + "epoch": 0.5864759061593489, + "grad_norm": 0.4608347594390064, + "learning_rate": 1.9949290744439392e-06, + "loss": 0.1651, + "step": 2603 + }, + { + "epoch": 0.5867012138451573, + "grad_norm": 0.4722394564236731, + "learning_rate": 1.9931047295565863e-06, + "loss": 0.1729, + "step": 2604 + }, + { + "epoch": 0.5869265215309657, + "grad_norm": 0.4436319176869715, + "learning_rate": 1.9912806661292838e-06, + "loss": 0.1615, + "step": 2605 + }, + { + "epoch": 0.5871518292167741, + "grad_norm": 0.4577236447607943, + "learning_rate": 1.989456885174865e-06, + "loss": 0.1745, + "step": 2606 + }, + { + "epoch": 0.5873771369025826, + "grad_norm": 0.4443261491640263, + "learning_rate": 1.987633387706008e-06, + "loss": 0.1502, + "step": 2607 + }, + { + "epoch": 0.5876024445883911, + "grad_norm": 0.43624057624151924, + "learning_rate": 1.9858101747352326e-06, + "loss": 0.1613, + "step": 2608 + }, + { + "epoch": 0.5878277522741995, + "grad_norm": 0.4628822161523582, + "learning_rate": 1.9839872472749016e-06, + "loss": 0.1707, + "step": 2609 + }, + { + "epoch": 0.5880530599600079, + "grad_norm": 0.4323354951405151, + "learning_rate": 1.9821646063372174e-06, + "loss": 0.1523, + "step": 2610 + }, + { + "epoch": 0.5882783676458163, + "grad_norm": 0.46163630354932705, + "learning_rate": 1.9803422529342264e-06, + "loss": 0.1589, + "step": 2611 + }, + { + "epoch": 0.5885036753316247, + "grad_norm": 0.4521507297186822, + "learning_rate": 1.978520188077813e-06, + "loss": 0.1585, + "step": 2612 + }, + { + "epoch": 0.5887289830174331, + "grad_norm": 0.4885857745673763, + "learning_rate": 1.976698412779701e-06, + "loss": 0.1872, + "step": 2613 + }, + { + "epoch": 0.5889542907032416, + "grad_norm": 0.4437227647491848, + "learning_rate": 1.9748769280514544e-06, + "loss": 0.1693, + "step": 2614 + }, + { + "epoch": 0.5891795983890501, + "grad_norm": 0.45151088423918256, + "learning_rate": 1.973055734904477e-06, + "loss": 0.1708, + "step": 2615 + }, + { + "epoch": 0.5894049060748585, + "grad_norm": 0.473545068163997, + "learning_rate": 1.971234834350008e-06, + "loss": 0.1673, + "step": 2616 + }, + { + "epoch": 0.5896302137606669, + "grad_norm": 0.4617209032513543, + "learning_rate": 1.9694142273991264e-06, + "loss": 0.1709, + "step": 2617 + }, + { + "epoch": 0.5898555214464754, + "grad_norm": 0.4664517069394605, + "learning_rate": 1.967593915062748e-06, + "loss": 0.1741, + "step": 2618 + }, + { + "epoch": 0.5900808291322838, + "grad_norm": 0.45611166449780943, + "learning_rate": 1.9657738983516227e-06, + "loss": 0.1749, + "step": 2619 + }, + { + "epoch": 0.5903061368180922, + "grad_norm": 0.47410663307317547, + "learning_rate": 1.96395417827634e-06, + "loss": 0.1899, + "step": 2620 + }, + { + "epoch": 0.5905314445039006, + "grad_norm": 0.43002055884073553, + "learning_rate": 1.9621347558473216e-06, + "loss": 0.1463, + "step": 2621 + }, + { + "epoch": 0.590756752189709, + "grad_norm": 0.4668488908347395, + "learning_rate": 1.960315632074824e-06, + "loss": 0.1704, + "step": 2622 + }, + { + "epoch": 0.5909820598755176, + "grad_norm": 0.45917674856534796, + "learning_rate": 1.95849680796894e-06, + "loss": 0.1616, + "step": 2623 + }, + { + "epoch": 0.591207367561326, + "grad_norm": 0.4669536646246189, + "learning_rate": 1.9566782845395945e-06, + "loss": 0.1807, + "step": 2624 + }, + { + "epoch": 0.5914326752471344, + "grad_norm": 0.458790265032477, + "learning_rate": 1.9548600627965454e-06, + "loss": 0.1858, + "step": 2625 + }, + { + "epoch": 0.5916579829329428, + "grad_norm": 0.4626784881770388, + "learning_rate": 1.9530421437493843e-06, + "loss": 0.171, + "step": 2626 + }, + { + "epoch": 0.5918832906187512, + "grad_norm": 0.46410485499187215, + "learning_rate": 1.951224528407534e-06, + "loss": 0.1626, + "step": 2627 + }, + { + "epoch": 0.5921085983045596, + "grad_norm": 0.4657228301667429, + "learning_rate": 1.949407217780247e-06, + "loss": 0.1721, + "step": 2628 + }, + { + "epoch": 0.5923339059903681, + "grad_norm": 0.4632093421936262, + "learning_rate": 1.94759021287661e-06, + "loss": 0.1872, + "step": 2629 + }, + { + "epoch": 0.5925592136761765, + "grad_norm": 0.4819328436546404, + "learning_rate": 1.945773514705537e-06, + "loss": 0.1698, + "step": 2630 + }, + { + "epoch": 0.592784521361985, + "grad_norm": 0.442519004929695, + "learning_rate": 1.943957124275773e-06, + "loss": 0.1716, + "step": 2631 + }, + { + "epoch": 0.5930098290477934, + "grad_norm": 0.4075895077799261, + "learning_rate": 1.9421410425958915e-06, + "loss": 0.142, + "step": 2632 + }, + { + "epoch": 0.5932351367336018, + "grad_norm": 0.4780523119898107, + "learning_rate": 1.9403252706742957e-06, + "loss": 0.1802, + "step": 2633 + }, + { + "epoch": 0.5934604444194103, + "grad_norm": 0.4294812045261628, + "learning_rate": 1.938509809519216e-06, + "loss": 0.1647, + "step": 2634 + }, + { + "epoch": 0.5936857521052187, + "grad_norm": 0.456698096525506, + "learning_rate": 1.9366946601387103e-06, + "loss": 0.1679, + "step": 2635 + }, + { + "epoch": 0.5939110597910271, + "grad_norm": 0.4925193351335292, + "learning_rate": 1.934879823540663e-06, + "loss": 0.1942, + "step": 2636 + }, + { + "epoch": 0.5941363674768355, + "grad_norm": 0.5115316970744371, + "learning_rate": 1.9330653007327852e-06, + "loss": 0.1815, + "step": 2637 + }, + { + "epoch": 0.5943616751626439, + "grad_norm": 0.4739523991693173, + "learning_rate": 1.931251092722615e-06, + "loss": 0.1768, + "step": 2638 + }, + { + "epoch": 0.5945869828484525, + "grad_norm": 0.4683400797819289, + "learning_rate": 1.9294372005175125e-06, + "loss": 0.1718, + "step": 2639 + }, + { + "epoch": 0.5948122905342609, + "grad_norm": 0.5050530490471303, + "learning_rate": 1.9276236251246655e-06, + "loss": 0.1719, + "step": 2640 + }, + { + "epoch": 0.5950375982200693, + "grad_norm": 0.44967216446937647, + "learning_rate": 1.9258103675510846e-06, + "loss": 0.1683, + "step": 2641 + }, + { + "epoch": 0.5952629059058777, + "grad_norm": 0.5083600866578817, + "learning_rate": 1.9239974288036044e-06, + "loss": 0.1863, + "step": 2642 + }, + { + "epoch": 0.5954882135916861, + "grad_norm": 0.40343559345153, + "learning_rate": 1.9221848098888817e-06, + "loss": 0.1427, + "step": 2643 + }, + { + "epoch": 0.5957135212774946, + "grad_norm": 0.4357435151973131, + "learning_rate": 1.920372511813397e-06, + "loss": 0.1588, + "step": 2644 + }, + { + "epoch": 0.595938828963303, + "grad_norm": 0.450744818422895, + "learning_rate": 1.9185605355834518e-06, + "loss": 0.1665, + "step": 2645 + }, + { + "epoch": 0.5961641366491115, + "grad_norm": 0.4617050329576974, + "learning_rate": 1.916748882205168e-06, + "loss": 0.1586, + "step": 2646 + }, + { + "epoch": 0.5963894443349199, + "grad_norm": 0.44108717620657356, + "learning_rate": 1.91493755268449e-06, + "loss": 0.1559, + "step": 2647 + }, + { + "epoch": 0.5966147520207283, + "grad_norm": 0.4389522540307208, + "learning_rate": 1.913126548027181e-06, + "loss": 0.1671, + "step": 2648 + }, + { + "epoch": 0.5968400597065368, + "grad_norm": 0.45648725194338563, + "learning_rate": 1.9113158692388253e-06, + "loss": 0.1647, + "step": 2649 + }, + { + "epoch": 0.5970653673923452, + "grad_norm": 0.43789095933025013, + "learning_rate": 1.909505517324825e-06, + "loss": 0.1738, + "step": 2650 + }, + { + "epoch": 0.5972906750781536, + "grad_norm": 0.43659026026395514, + "learning_rate": 1.9076954932904e-06, + "loss": 0.1511, + "step": 2651 + }, + { + "epoch": 0.597515982763962, + "grad_norm": 0.4603261181340911, + "learning_rate": 1.905885798140591e-06, + "loss": 0.16, + "step": 2652 + }, + { + "epoch": 0.5977412904497704, + "grad_norm": 0.44421726551799345, + "learning_rate": 1.9040764328802523e-06, + "loss": 0.1592, + "step": 2653 + }, + { + "epoch": 0.597966598135579, + "grad_norm": 0.4272362790479577, + "learning_rate": 1.9022673985140585e-06, + "loss": 0.1568, + "step": 2654 + }, + { + "epoch": 0.5981919058213874, + "grad_norm": 0.460692265391587, + "learning_rate": 1.9004586960464993e-06, + "loss": 0.1769, + "step": 2655 + }, + { + "epoch": 0.5984172135071958, + "grad_norm": 0.44611832642724253, + "learning_rate": 1.8986503264818785e-06, + "loss": 0.1648, + "step": 2656 + }, + { + "epoch": 0.5986425211930042, + "grad_norm": 0.4360370396627426, + "learning_rate": 1.8968422908243156e-06, + "loss": 0.1701, + "step": 2657 + }, + { + "epoch": 0.5988678288788126, + "grad_norm": 0.4417496216468464, + "learning_rate": 1.895034590077747e-06, + "loss": 0.1639, + "step": 2658 + }, + { + "epoch": 0.599093136564621, + "grad_norm": 0.49293395397489076, + "learning_rate": 1.8932272252459213e-06, + "loss": 0.1851, + "step": 2659 + }, + { + "epoch": 0.5993184442504295, + "grad_norm": 0.48447236824054063, + "learning_rate": 1.8914201973324004e-06, + "loss": 0.1827, + "step": 2660 + }, + { + "epoch": 0.5995437519362379, + "grad_norm": 0.4247074914348031, + "learning_rate": 1.88961350734056e-06, + "loss": 0.1479, + "step": 2661 + }, + { + "epoch": 0.5997690596220464, + "grad_norm": 0.4628220318807962, + "learning_rate": 1.8878071562735873e-06, + "loss": 0.1821, + "step": 2662 + }, + { + "epoch": 0.5999943673078548, + "grad_norm": 0.4392365885191483, + "learning_rate": 1.8860011451344811e-06, + "loss": 0.1709, + "step": 2663 + }, + { + "epoch": 0.6002196749936632, + "grad_norm": 0.46804510788299897, + "learning_rate": 1.8841954749260535e-06, + "loss": 0.1755, + "step": 2664 + }, + { + "epoch": 0.6004449826794717, + "grad_norm": 0.4496005818387996, + "learning_rate": 1.8823901466509253e-06, + "loss": 0.1787, + "step": 2665 + }, + { + "epoch": 0.6006702903652801, + "grad_norm": 0.43391223477535656, + "learning_rate": 1.8805851613115278e-06, + "loss": 0.1707, + "step": 2666 + }, + { + "epoch": 0.6008955980510885, + "grad_norm": 0.44327803302504704, + "learning_rate": 1.878780519910102e-06, + "loss": 0.1532, + "step": 2667 + }, + { + "epoch": 0.6011209057368969, + "grad_norm": 0.46165307268777256, + "learning_rate": 1.8769762234486982e-06, + "loss": 0.1582, + "step": 2668 + }, + { + "epoch": 0.6013462134227053, + "grad_norm": 0.4434123601580583, + "learning_rate": 1.8751722729291747e-06, + "loss": 0.1707, + "step": 2669 + }, + { + "epoch": 0.6015715211085139, + "grad_norm": 0.4265862959398504, + "learning_rate": 1.8733686693531986e-06, + "loss": 0.1495, + "step": 2670 + }, + { + "epoch": 0.6017968287943223, + "grad_norm": 0.44676115551761725, + "learning_rate": 1.8715654137222434e-06, + "loss": 0.1593, + "step": 2671 + }, + { + "epoch": 0.6020221364801307, + "grad_norm": 0.460032716851711, + "learning_rate": 1.8697625070375893e-06, + "loss": 0.1717, + "step": 2672 + }, + { + "epoch": 0.6022474441659391, + "grad_norm": 0.44371758907880954, + "learning_rate": 1.8679599503003246e-06, + "loss": 0.1571, + "step": 2673 + }, + { + "epoch": 0.6024727518517475, + "grad_norm": 0.4276673722998321, + "learning_rate": 1.8661577445113399e-06, + "loss": 0.1639, + "step": 2674 + }, + { + "epoch": 0.602698059537556, + "grad_norm": 0.4589523006947759, + "learning_rate": 1.8643558906713344e-06, + "loss": 0.1671, + "step": 2675 + }, + { + "epoch": 0.6029233672233644, + "grad_norm": 0.4444427590263503, + "learning_rate": 1.8625543897808094e-06, + "loss": 0.1539, + "step": 2676 + }, + { + "epoch": 0.6031486749091728, + "grad_norm": 0.4339288748594548, + "learning_rate": 1.8607532428400714e-06, + "loss": 0.1626, + "step": 2677 + }, + { + "epoch": 0.6033739825949813, + "grad_norm": 0.44899760075051875, + "learning_rate": 1.8589524508492308e-06, + "loss": 0.1848, + "step": 2678 + }, + { + "epoch": 0.6035992902807897, + "grad_norm": 0.4420035051479376, + "learning_rate": 1.8571520148081992e-06, + "loss": 0.1666, + "step": 2679 + }, + { + "epoch": 0.6038245979665982, + "grad_norm": 0.4563960451283387, + "learning_rate": 1.8553519357166927e-06, + "loss": 0.1694, + "step": 2680 + }, + { + "epoch": 0.6040499056524066, + "grad_norm": 0.44262143236842727, + "learning_rate": 1.853552214574228e-06, + "loss": 0.1725, + "step": 2681 + }, + { + "epoch": 0.604275213338215, + "grad_norm": 0.41580648235137924, + "learning_rate": 1.8517528523801226e-06, + "loss": 0.1397, + "step": 2682 + }, + { + "epoch": 0.6045005210240234, + "grad_norm": 0.4601558740905194, + "learning_rate": 1.8499538501334955e-06, + "loss": 0.1642, + "step": 2683 + }, + { + "epoch": 0.6047258287098318, + "grad_norm": 0.4617766473771436, + "learning_rate": 1.8481552088332656e-06, + "loss": 0.1689, + "step": 2684 + }, + { + "epoch": 0.6049511363956402, + "grad_norm": 0.46934808711625325, + "learning_rate": 1.8463569294781509e-06, + "loss": 0.1801, + "step": 2685 + }, + { + "epoch": 0.6051764440814488, + "grad_norm": 0.47611799666870297, + "learning_rate": 1.84455901306667e-06, + "loss": 0.1684, + "step": 2686 + }, + { + "epoch": 0.6054017517672572, + "grad_norm": 0.4486970622779695, + "learning_rate": 1.842761460597138e-06, + "loss": 0.1651, + "step": 2687 + }, + { + "epoch": 0.6056270594530656, + "grad_norm": 0.4680524508389978, + "learning_rate": 1.8409642730676692e-06, + "loss": 0.1699, + "step": 2688 + }, + { + "epoch": 0.605852367138874, + "grad_norm": 0.44496477674543194, + "learning_rate": 1.8391674514761745e-06, + "loss": 0.1641, + "step": 2689 + }, + { + "epoch": 0.6060776748246824, + "grad_norm": 0.47685772323293657, + "learning_rate": 1.8373709968203624e-06, + "loss": 0.1785, + "step": 2690 + }, + { + "epoch": 0.6063029825104909, + "grad_norm": 0.41840355207723284, + "learning_rate": 1.8355749100977371e-06, + "loss": 0.1533, + "step": 2691 + }, + { + "epoch": 0.6065282901962993, + "grad_norm": 0.43215745356234536, + "learning_rate": 1.8337791923055983e-06, + "loss": 0.164, + "step": 2692 + }, + { + "epoch": 0.6067535978821077, + "grad_norm": 0.4146463028731051, + "learning_rate": 1.8319838444410412e-06, + "loss": 0.1461, + "step": 2693 + }, + { + "epoch": 0.6069789055679162, + "grad_norm": 0.43506368284267727, + "learning_rate": 1.8301888675009554e-06, + "loss": 0.1606, + "step": 2694 + }, + { + "epoch": 0.6072042132537246, + "grad_norm": 0.5016520189748299, + "learning_rate": 1.8283942624820247e-06, + "loss": 0.1869, + "step": 2695 + }, + { + "epoch": 0.6074295209395331, + "grad_norm": 0.4481516494179516, + "learning_rate": 1.8266000303807272e-06, + "loss": 0.1639, + "step": 2696 + }, + { + "epoch": 0.6076548286253415, + "grad_norm": 0.5259764085733836, + "learning_rate": 1.8248061721933325e-06, + "loss": 0.158, + "step": 2697 + }, + { + "epoch": 0.6078801363111499, + "grad_norm": 0.462300079252785, + "learning_rate": 1.8230126889159027e-06, + "loss": 0.1725, + "step": 2698 + }, + { + "epoch": 0.6081054439969583, + "grad_norm": 0.5028819405097953, + "learning_rate": 1.8212195815442934e-06, + "loss": 0.1665, + "step": 2699 + }, + { + "epoch": 0.6083307516827667, + "grad_norm": 0.4406804149141906, + "learning_rate": 1.8194268510741493e-06, + "loss": 0.1558, + "step": 2700 + }, + { + "epoch": 0.6085560593685753, + "grad_norm": 0.4784096207470811, + "learning_rate": 1.8176344985009064e-06, + "loss": 0.17, + "step": 2701 + }, + { + "epoch": 0.6087813670543837, + "grad_norm": 0.4813685411663574, + "learning_rate": 1.8158425248197931e-06, + "loss": 0.1802, + "step": 2702 + }, + { + "epoch": 0.6090066747401921, + "grad_norm": 0.46355889349886975, + "learning_rate": 1.8140509310258238e-06, + "loss": 0.1607, + "step": 2703 + }, + { + "epoch": 0.6092319824260005, + "grad_norm": 0.4773601700278416, + "learning_rate": 1.812259718113805e-06, + "loss": 0.1762, + "step": 2704 + }, + { + "epoch": 0.6094572901118089, + "grad_norm": 0.42936186737291065, + "learning_rate": 1.8104688870783296e-06, + "loss": 0.154, + "step": 2705 + }, + { + "epoch": 0.6096825977976174, + "grad_norm": 0.485920986765524, + "learning_rate": 1.8086784389137796e-06, + "loss": 0.194, + "step": 2706 + }, + { + "epoch": 0.6099079054834258, + "grad_norm": 0.4694977633675229, + "learning_rate": 1.806888374614324e-06, + "loss": 0.1702, + "step": 2707 + }, + { + "epoch": 0.6101332131692342, + "grad_norm": 0.4627159550005137, + "learning_rate": 1.8050986951739201e-06, + "loss": 0.1772, + "step": 2708 + }, + { + "epoch": 0.6103585208550427, + "grad_norm": 0.4442418880540444, + "learning_rate": 1.8033094015863082e-06, + "loss": 0.1638, + "step": 2709 + }, + { + "epoch": 0.6105838285408511, + "grad_norm": 0.4607735222604407, + "learning_rate": 1.8015204948450166e-06, + "loss": 0.1611, + "step": 2710 + }, + { + "epoch": 0.6108091362266596, + "grad_norm": 0.43525664328040414, + "learning_rate": 1.7997319759433596e-06, + "loss": 0.1637, + "step": 2711 + }, + { + "epoch": 0.611034443912468, + "grad_norm": 0.4669531429872553, + "learning_rate": 1.7979438458744343e-06, + "loss": 0.1751, + "step": 2712 + }, + { + "epoch": 0.6112597515982764, + "grad_norm": 0.4383106377730204, + "learning_rate": 1.7961561056311234e-06, + "loss": 0.1727, + "step": 2713 + }, + { + "epoch": 0.6114850592840848, + "grad_norm": 0.46898871723224606, + "learning_rate": 1.7943687562060919e-06, + "loss": 0.176, + "step": 2714 + }, + { + "epoch": 0.6117103669698932, + "grad_norm": 0.46145283457842057, + "learning_rate": 1.792581798591788e-06, + "loss": 0.1761, + "step": 2715 + }, + { + "epoch": 0.6119356746557016, + "grad_norm": 0.45015364303106153, + "learning_rate": 1.7907952337804429e-06, + "loss": 0.1652, + "step": 2716 + }, + { + "epoch": 0.6121609823415102, + "grad_norm": 0.4946084809734864, + "learning_rate": 1.7890090627640699e-06, + "loss": 0.1851, + "step": 2717 + }, + { + "epoch": 0.6123862900273186, + "grad_norm": 0.4726652855648285, + "learning_rate": 1.787223286534463e-06, + "loss": 0.1812, + "step": 2718 + }, + { + "epoch": 0.612611597713127, + "grad_norm": 0.47750517796510844, + "learning_rate": 1.785437906083197e-06, + "loss": 0.1753, + "step": 2719 + }, + { + "epoch": 0.6128369053989354, + "grad_norm": 0.4693671565765113, + "learning_rate": 1.783652922401627e-06, + "loss": 0.1764, + "step": 2720 + }, + { + "epoch": 0.6130622130847438, + "grad_norm": 0.4342260684869311, + "learning_rate": 1.7818683364808883e-06, + "loss": 0.158, + "step": 2721 + }, + { + "epoch": 0.6132875207705523, + "grad_norm": 0.4683016141114234, + "learning_rate": 1.7800841493118942e-06, + "loss": 0.1627, + "step": 2722 + }, + { + "epoch": 0.6135128284563607, + "grad_norm": 0.4696902925277998, + "learning_rate": 1.7783003618853384e-06, + "loss": 0.1704, + "step": 2723 + }, + { + "epoch": 0.6137381361421691, + "grad_norm": 0.4473367617436182, + "learning_rate": 1.776516975191691e-06, + "loss": 0.1643, + "step": 2724 + }, + { + "epoch": 0.6139634438279776, + "grad_norm": 0.44338339975653285, + "learning_rate": 1.7747339902212e-06, + "loss": 0.1597, + "step": 2725 + }, + { + "epoch": 0.614188751513786, + "grad_norm": 0.4331622845604683, + "learning_rate": 1.7729514079638915e-06, + "loss": 0.1542, + "step": 2726 + }, + { + "epoch": 0.6144140591995945, + "grad_norm": 0.47511608416592777, + "learning_rate": 1.7711692294095654e-06, + "loss": 0.1767, + "step": 2727 + }, + { + "epoch": 0.6146393668854029, + "grad_norm": 0.5016835171872812, + "learning_rate": 1.7693874555477996e-06, + "loss": 0.187, + "step": 2728 + }, + { + "epoch": 0.6148646745712113, + "grad_norm": 0.4655966009781916, + "learning_rate": 1.7676060873679473e-06, + "loss": 0.1729, + "step": 2729 + }, + { + "epoch": 0.6150899822570197, + "grad_norm": 0.4323029765439154, + "learning_rate": 1.7658251258591352e-06, + "loss": 0.1725, + "step": 2730 + }, + { + "epoch": 0.6153152899428281, + "grad_norm": 0.47186222491406027, + "learning_rate": 1.764044572010265e-06, + "loss": 0.1724, + "step": 2731 + }, + { + "epoch": 0.6155405976286366, + "grad_norm": 0.4676381059869174, + "learning_rate": 1.7622644268100116e-06, + "loss": 0.1884, + "step": 2732 + }, + { + "epoch": 0.6157659053144451, + "grad_norm": 0.48778473489006996, + "learning_rate": 1.7604846912468243e-06, + "loss": 0.1895, + "step": 2733 + }, + { + "epoch": 0.6159912130002535, + "grad_norm": 0.44861843491569864, + "learning_rate": 1.7587053663089233e-06, + "loss": 0.1644, + "step": 2734 + }, + { + "epoch": 0.6162165206860619, + "grad_norm": 0.4481769032079395, + "learning_rate": 1.7569264529843009e-06, + "loss": 0.1733, + "step": 2735 + }, + { + "epoch": 0.6164418283718703, + "grad_norm": 0.4649639997782487, + "learning_rate": 1.755147952260722e-06, + "loss": 0.1674, + "step": 2736 + }, + { + "epoch": 0.6166671360576788, + "grad_norm": 0.45740651193204557, + "learning_rate": 1.753369865125722e-06, + "loss": 0.1777, + "step": 2737 + }, + { + "epoch": 0.6168924437434872, + "grad_norm": 0.4575048029706965, + "learning_rate": 1.7515921925666053e-06, + "loss": 0.1808, + "step": 2738 + }, + { + "epoch": 0.6171177514292956, + "grad_norm": 0.4562328532116244, + "learning_rate": 1.749814935570448e-06, + "loss": 0.1773, + "step": 2739 + }, + { + "epoch": 0.617343059115104, + "grad_norm": 0.45299457443149754, + "learning_rate": 1.748038095124095e-06, + "loss": 0.1617, + "step": 2740 + }, + { + "epoch": 0.6175683668009125, + "grad_norm": 0.43620500196946815, + "learning_rate": 1.746261672214159e-06, + "loss": 0.1656, + "step": 2741 + }, + { + "epoch": 0.617793674486721, + "grad_norm": 0.4586503037799697, + "learning_rate": 1.7444856678270218e-06, + "loss": 0.1776, + "step": 2742 + }, + { + "epoch": 0.6180189821725294, + "grad_norm": 0.45463381866393865, + "learning_rate": 1.7427100829488325e-06, + "loss": 0.1677, + "step": 2743 + }, + { + "epoch": 0.6182442898583378, + "grad_norm": 0.45818753093702824, + "learning_rate": 1.7409349185655067e-06, + "loss": 0.1698, + "step": 2744 + }, + { + "epoch": 0.6184695975441462, + "grad_norm": 0.45133684687497166, + "learning_rate": 1.739160175662727e-06, + "loss": 0.1612, + "step": 2745 + }, + { + "epoch": 0.6186949052299546, + "grad_norm": 0.43897224195181617, + "learning_rate": 1.7373858552259421e-06, + "loss": 0.1766, + "step": 2746 + }, + { + "epoch": 0.618920212915763, + "grad_norm": 0.46242630539655954, + "learning_rate": 1.7356119582403663e-06, + "loss": 0.1705, + "step": 2747 + }, + { + "epoch": 0.6191455206015716, + "grad_norm": 0.4846029968450731, + "learning_rate": 1.733838485690978e-06, + "loss": 0.1679, + "step": 2748 + }, + { + "epoch": 0.61937082828738, + "grad_norm": 0.4361865304178895, + "learning_rate": 1.7320654385625208e-06, + "loss": 0.1537, + "step": 2749 + }, + { + "epoch": 0.6195961359731884, + "grad_norm": 0.42709962747922303, + "learning_rate": 1.7302928178395018e-06, + "loss": 0.1544, + "step": 2750 + }, + { + "epoch": 0.6198214436589968, + "grad_norm": 0.4729204284747366, + "learning_rate": 1.7285206245061908e-06, + "loss": 0.1676, + "step": 2751 + }, + { + "epoch": 0.6200467513448052, + "grad_norm": 0.42864499215327756, + "learning_rate": 1.726748859546621e-06, + "loss": 0.1544, + "step": 2752 + }, + { + "epoch": 0.6202720590306137, + "grad_norm": 0.4068238843828877, + "learning_rate": 1.7249775239445875e-06, + "loss": 0.1433, + "step": 2753 + }, + { + "epoch": 0.6204973667164221, + "grad_norm": 0.4023943484895699, + "learning_rate": 1.723206618683646e-06, + "loss": 0.1481, + "step": 2754 + }, + { + "epoch": 0.6207226744022305, + "grad_norm": 0.4577958355505525, + "learning_rate": 1.7214361447471156e-06, + "loss": 0.167, + "step": 2755 + }, + { + "epoch": 0.620947982088039, + "grad_norm": 0.47031401392457456, + "learning_rate": 1.7196661031180738e-06, + "loss": 0.182, + "step": 2756 + }, + { + "epoch": 0.6211732897738474, + "grad_norm": 0.45680605415127173, + "learning_rate": 1.7178964947793591e-06, + "loss": 0.1603, + "step": 2757 + }, + { + "epoch": 0.6213985974596559, + "grad_norm": 0.46597085030930985, + "learning_rate": 1.716127320713568e-06, + "loss": 0.1623, + "step": 2758 + }, + { + "epoch": 0.6216239051454643, + "grad_norm": 0.4584851355273736, + "learning_rate": 1.7143585819030583e-06, + "loss": 0.1627, + "step": 2759 + }, + { + "epoch": 0.6218492128312727, + "grad_norm": 0.42436266607365575, + "learning_rate": 1.7125902793299434e-06, + "loss": 0.1549, + "step": 2760 + }, + { + "epoch": 0.6220745205170811, + "grad_norm": 0.4012785148722167, + "learning_rate": 1.7108224139760982e-06, + "loss": 0.1394, + "step": 2761 + }, + { + "epoch": 0.6222998282028895, + "grad_norm": 0.4150226870345625, + "learning_rate": 1.7090549868231492e-06, + "loss": 0.1495, + "step": 2762 + }, + { + "epoch": 0.622525135888698, + "grad_norm": 0.4576970243010181, + "learning_rate": 1.707287998852485e-06, + "loss": 0.1787, + "step": 2763 + }, + { + "epoch": 0.6227504435745065, + "grad_norm": 0.41065424207162743, + "learning_rate": 1.7055214510452462e-06, + "loss": 0.1525, + "step": 2764 + }, + { + "epoch": 0.6229757512603149, + "grad_norm": 0.44516251935291046, + "learning_rate": 1.7037553443823332e-06, + "loss": 0.1647, + "step": 2765 + }, + { + "epoch": 0.6232010589461233, + "grad_norm": 0.4159304360771923, + "learning_rate": 1.7019896798443984e-06, + "loss": 0.1499, + "step": 2766 + }, + { + "epoch": 0.6234263666319317, + "grad_norm": 0.4429911263771683, + "learning_rate": 1.700224458411849e-06, + "loss": 0.1768, + "step": 2767 + }, + { + "epoch": 0.6236516743177402, + "grad_norm": 0.46157323624776825, + "learning_rate": 1.6984596810648475e-06, + "loss": 0.1779, + "step": 2768 + }, + { + "epoch": 0.6238769820035486, + "grad_norm": 0.45417738591830176, + "learning_rate": 1.6966953487833078e-06, + "loss": 0.1808, + "step": 2769 + }, + { + "epoch": 0.624102289689357, + "grad_norm": 0.4642291225427212, + "learning_rate": 1.6949314625468985e-06, + "loss": 0.1693, + "step": 2770 + }, + { + "epoch": 0.6243275973751654, + "grad_norm": 0.4719676253980263, + "learning_rate": 1.6931680233350404e-06, + "loss": 0.1682, + "step": 2771 + }, + { + "epoch": 0.6245529050609739, + "grad_norm": 0.4504049123077282, + "learning_rate": 1.6914050321269049e-06, + "loss": 0.1655, + "step": 2772 + }, + { + "epoch": 0.6247782127467824, + "grad_norm": 0.4807454498560626, + "learning_rate": 1.6896424899014158e-06, + "loss": 0.1605, + "step": 2773 + }, + { + "epoch": 0.6250035204325908, + "grad_norm": 0.47006657270283986, + "learning_rate": 1.6878803976372465e-06, + "loss": 0.1898, + "step": 2774 + }, + { + "epoch": 0.6252288281183992, + "grad_norm": 0.4688144690975506, + "learning_rate": 1.6861187563128217e-06, + "loss": 0.1703, + "step": 2775 + }, + { + "epoch": 0.6254541358042076, + "grad_norm": 0.44103662425353535, + "learning_rate": 1.6843575669063142e-06, + "loss": 0.1587, + "step": 2776 + }, + { + "epoch": 0.625679443490016, + "grad_norm": 0.4360417612529722, + "learning_rate": 1.682596830395648e-06, + "loss": 0.1626, + "step": 2777 + }, + { + "epoch": 0.6259047511758244, + "grad_norm": 0.44549947778845, + "learning_rate": 1.6808365477584953e-06, + "loss": 0.1766, + "step": 2778 + }, + { + "epoch": 0.6261300588616329, + "grad_norm": 0.48787357429516487, + "learning_rate": 1.6790767199722724e-06, + "loss": 0.1849, + "step": 2779 + }, + { + "epoch": 0.6263553665474414, + "grad_norm": 0.4543864413631497, + "learning_rate": 1.6773173480141487e-06, + "loss": 0.1701, + "step": 2780 + }, + { + "epoch": 0.6265806742332498, + "grad_norm": 0.4449753953769943, + "learning_rate": 1.6755584328610364e-06, + "loss": 0.1651, + "step": 2781 + }, + { + "epoch": 0.6268059819190582, + "grad_norm": 0.4675412985856472, + "learning_rate": 1.6737999754895965e-06, + "loss": 0.1665, + "step": 2782 + }, + { + "epoch": 0.6270312896048666, + "grad_norm": 0.4549272911689031, + "learning_rate": 1.6720419768762343e-06, + "loss": 0.178, + "step": 2783 + }, + { + "epoch": 0.6272565972906751, + "grad_norm": 0.43556484772071463, + "learning_rate": 1.6702844379971012e-06, + "loss": 0.1612, + "step": 2784 + }, + { + "epoch": 0.6274819049764835, + "grad_norm": 0.475001758657002, + "learning_rate": 1.668527359828092e-06, + "loss": 0.1665, + "step": 2785 + }, + { + "epoch": 0.6277072126622919, + "grad_norm": 0.457051651483831, + "learning_rate": 1.6667707433448482e-06, + "loss": 0.1733, + "step": 2786 + }, + { + "epoch": 0.6279325203481003, + "grad_norm": 0.45592061950787227, + "learning_rate": 1.6650145895227532e-06, + "loss": 0.1762, + "step": 2787 + }, + { + "epoch": 0.6281578280339088, + "grad_norm": 0.46208104124077887, + "learning_rate": 1.663258899336933e-06, + "loss": 0.1818, + "step": 2788 + }, + { + "epoch": 0.6283831357197173, + "grad_norm": 0.4621720885029457, + "learning_rate": 1.6615036737622574e-06, + "loss": 0.1596, + "step": 2789 + }, + { + "epoch": 0.6286084434055257, + "grad_norm": 0.5010290185461516, + "learning_rate": 1.6597489137733377e-06, + "loss": 0.186, + "step": 2790 + }, + { + "epoch": 0.6288337510913341, + "grad_norm": 0.4501992218373823, + "learning_rate": 1.6579946203445269e-06, + "loss": 0.1671, + "step": 2791 + }, + { + "epoch": 0.6290590587771425, + "grad_norm": 0.4469808105209307, + "learning_rate": 1.6562407944499175e-06, + "loss": 0.1601, + "step": 2792 + }, + { + "epoch": 0.6292843664629509, + "grad_norm": 0.440599469647921, + "learning_rate": 1.6544874370633456e-06, + "loss": 0.1629, + "step": 2793 + }, + { + "epoch": 0.6295096741487594, + "grad_norm": 0.4505384804227862, + "learning_rate": 1.652734549158384e-06, + "loss": 0.1714, + "step": 2794 + }, + { + "epoch": 0.6297349818345679, + "grad_norm": 0.44655327680939544, + "learning_rate": 1.6509821317083466e-06, + "loss": 0.1679, + "step": 2795 + }, + { + "epoch": 0.6299602895203763, + "grad_norm": 0.4200971213482022, + "learning_rate": 1.6492301856862855e-06, + "loss": 0.149, + "step": 2796 + }, + { + "epoch": 0.6301855972061847, + "grad_norm": 0.42324419421457493, + "learning_rate": 1.6474787120649903e-06, + "loss": 0.1669, + "step": 2797 + }, + { + "epoch": 0.6304109048919931, + "grad_norm": 0.4515067176790109, + "learning_rate": 1.6457277118169893e-06, + "loss": 0.169, + "step": 2798 + }, + { + "epoch": 0.6306362125778016, + "grad_norm": 0.46183372027809466, + "learning_rate": 1.6439771859145476e-06, + "loss": 0.1665, + "step": 2799 + }, + { + "epoch": 0.63086152026361, + "grad_norm": 0.44642522002565216, + "learning_rate": 1.6422271353296675e-06, + "loss": 0.1634, + "step": 2800 + }, + { + "epoch": 0.6310868279494184, + "grad_norm": 0.47715206520813574, + "learning_rate": 1.640477561034086e-06, + "loss": 0.1707, + "step": 2801 + }, + { + "epoch": 0.6313121356352268, + "grad_norm": 0.44004629277125445, + "learning_rate": 1.6387284639992773e-06, + "loss": 0.1537, + "step": 2802 + }, + { + "epoch": 0.6315374433210353, + "grad_norm": 0.45114917334218985, + "learning_rate": 1.6369798451964496e-06, + "loss": 0.1666, + "step": 2803 + }, + { + "epoch": 0.6317627510068438, + "grad_norm": 0.49548093993061443, + "learning_rate": 1.6352317055965458e-06, + "loss": 0.1804, + "step": 2804 + }, + { + "epoch": 0.6319880586926522, + "grad_norm": 0.45268940243981387, + "learning_rate": 1.6334840461702422e-06, + "loss": 0.1732, + "step": 2805 + }, + { + "epoch": 0.6322133663784606, + "grad_norm": 0.4547950005326099, + "learning_rate": 1.6317368678879497e-06, + "loss": 0.1545, + "step": 2806 + }, + { + "epoch": 0.632438674064269, + "grad_norm": 0.4585047343852091, + "learning_rate": 1.6299901717198102e-06, + "loss": 0.1578, + "step": 2807 + }, + { + "epoch": 0.6326639817500774, + "grad_norm": 0.44477091812663877, + "learning_rate": 1.6282439586356999e-06, + "loss": 0.1715, + "step": 2808 + }, + { + "epoch": 0.6328892894358858, + "grad_norm": 0.46616934511982, + "learning_rate": 1.6264982296052256e-06, + "loss": 0.166, + "step": 2809 + }, + { + "epoch": 0.6331145971216943, + "grad_norm": 0.4301733970205271, + "learning_rate": 1.6247529855977256e-06, + "loss": 0.1569, + "step": 2810 + }, + { + "epoch": 0.6333399048075028, + "grad_norm": 0.477973525850378, + "learning_rate": 1.6230082275822687e-06, + "loss": 0.1731, + "step": 2811 + }, + { + "epoch": 0.6335652124933112, + "grad_norm": 0.4209243745012404, + "learning_rate": 1.6212639565276538e-06, + "loss": 0.1531, + "step": 2812 + }, + { + "epoch": 0.6337905201791196, + "grad_norm": 0.4286022567893712, + "learning_rate": 1.6195201734024096e-06, + "loss": 0.153, + "step": 2813 + }, + { + "epoch": 0.634015827864928, + "grad_norm": 0.4677500283512557, + "learning_rate": 1.6177768791747957e-06, + "loss": 0.1697, + "step": 2814 + }, + { + "epoch": 0.6342411355507365, + "grad_norm": 0.45910634409346524, + "learning_rate": 1.6160340748127959e-06, + "loss": 0.1733, + "step": 2815 + }, + { + "epoch": 0.6344664432365449, + "grad_norm": 0.41259435917408654, + "learning_rate": 1.6142917612841252e-06, + "loss": 0.146, + "step": 2816 + }, + { + "epoch": 0.6346917509223533, + "grad_norm": 0.497089622522006, + "learning_rate": 1.612549939556225e-06, + "loss": 0.1638, + "step": 2817 + }, + { + "epoch": 0.6349170586081617, + "grad_norm": 0.4412652435507732, + "learning_rate": 1.610808610596265e-06, + "loss": 0.1577, + "step": 2818 + }, + { + "epoch": 0.6351423662939703, + "grad_norm": 0.47666619891428685, + "learning_rate": 1.6090677753711403e-06, + "loss": 0.1601, + "step": 2819 + }, + { + "epoch": 0.6353676739797787, + "grad_norm": 0.4527689785966767, + "learning_rate": 1.607327434847471e-06, + "loss": 0.1714, + "step": 2820 + }, + { + "epoch": 0.6355929816655871, + "grad_norm": 0.4765050245286566, + "learning_rate": 1.6055875899916034e-06, + "loss": 0.1782, + "step": 2821 + }, + { + "epoch": 0.6358182893513955, + "grad_norm": 0.4513463262030934, + "learning_rate": 1.6038482417696095e-06, + "loss": 0.1703, + "step": 2822 + }, + { + "epoch": 0.6360435970372039, + "grad_norm": 0.44429724171296026, + "learning_rate": 1.6021093911472825e-06, + "loss": 0.1715, + "step": 2823 + }, + { + "epoch": 0.6362689047230123, + "grad_norm": 0.42738194551712844, + "learning_rate": 1.6003710390901434e-06, + "loss": 0.1714, + "step": 2824 + }, + { + "epoch": 0.6364942124088208, + "grad_norm": 0.491886842959726, + "learning_rate": 1.5986331865634335e-06, + "loss": 0.1884, + "step": 2825 + }, + { + "epoch": 0.6367195200946292, + "grad_norm": 0.46790258595763395, + "learning_rate": 1.5968958345321178e-06, + "loss": 0.1743, + "step": 2826 + }, + { + "epoch": 0.6369448277804377, + "grad_norm": 0.46153808401384383, + "learning_rate": 1.5951589839608828e-06, + "loss": 0.1688, + "step": 2827 + }, + { + "epoch": 0.6371701354662461, + "grad_norm": 0.4398233878007649, + "learning_rate": 1.5934226358141368e-06, + "loss": 0.1589, + "step": 2828 + }, + { + "epoch": 0.6373954431520545, + "grad_norm": 0.45762844930314084, + "learning_rate": 1.5916867910560092e-06, + "loss": 0.1682, + "step": 2829 + }, + { + "epoch": 0.637620750837863, + "grad_norm": 0.47981261009495174, + "learning_rate": 1.5899514506503499e-06, + "loss": 0.1847, + "step": 2830 + }, + { + "epoch": 0.6378460585236714, + "grad_norm": 0.4790161675565743, + "learning_rate": 1.5882166155607306e-06, + "loss": 0.1826, + "step": 2831 + }, + { + "epoch": 0.6380713662094798, + "grad_norm": 0.46786904020678355, + "learning_rate": 1.5864822867504376e-06, + "loss": 0.167, + "step": 2832 + }, + { + "epoch": 0.6382966738952882, + "grad_norm": 0.44034849751182215, + "learning_rate": 1.58474846518248e-06, + "loss": 0.1689, + "step": 2833 + }, + { + "epoch": 0.6385219815810966, + "grad_norm": 0.4239343867778454, + "learning_rate": 1.5830151518195846e-06, + "loss": 0.1576, + "step": 2834 + }, + { + "epoch": 0.6387472892669052, + "grad_norm": 0.43798369067994597, + "learning_rate": 1.5812823476241962e-06, + "loss": 0.1628, + "step": 2835 + }, + { + "epoch": 0.6389725969527136, + "grad_norm": 0.4443353499057384, + "learning_rate": 1.5795500535584758e-06, + "loss": 0.169, + "step": 2836 + }, + { + "epoch": 0.639197904638522, + "grad_norm": 0.47320634150786534, + "learning_rate": 1.5778182705843017e-06, + "loss": 0.1803, + "step": 2837 + }, + { + "epoch": 0.6394232123243304, + "grad_norm": 0.45886524009325275, + "learning_rate": 1.5760869996632685e-06, + "loss": 0.1736, + "step": 2838 + }, + { + "epoch": 0.6396485200101388, + "grad_norm": 0.4175163957220707, + "learning_rate": 1.574356241756686e-06, + "loss": 0.158, + "step": 2839 + }, + { + "epoch": 0.6398738276959473, + "grad_norm": 0.4621036809079441, + "learning_rate": 1.572625997825581e-06, + "loss": 0.1707, + "step": 2840 + }, + { + "epoch": 0.6400991353817557, + "grad_norm": 0.4639447373374142, + "learning_rate": 1.5708962688306916e-06, + "loss": 0.1804, + "step": 2841 + }, + { + "epoch": 0.6403244430675642, + "grad_norm": 0.48794276565434075, + "learning_rate": 1.5691670557324734e-06, + "loss": 0.1736, + "step": 2842 + }, + { + "epoch": 0.6405497507533726, + "grad_norm": 0.5380283298140494, + "learning_rate": 1.5674383594910931e-06, + "loss": 0.1763, + "step": 2843 + }, + { + "epoch": 0.640775058439181, + "grad_norm": 0.45580879593361395, + "learning_rate": 1.5657101810664314e-06, + "loss": 0.1726, + "step": 2844 + }, + { + "epoch": 0.6410003661249895, + "grad_norm": 0.4172144794890706, + "learning_rate": 1.5639825214180808e-06, + "loss": 0.1528, + "step": 2845 + }, + { + "epoch": 0.6412256738107979, + "grad_norm": 0.4539983166811848, + "learning_rate": 1.5622553815053476e-06, + "loss": 0.1715, + "step": 2846 + }, + { + "epoch": 0.6414509814966063, + "grad_norm": 0.42530241291480886, + "learning_rate": 1.5605287622872478e-06, + "loss": 0.1676, + "step": 2847 + }, + { + "epoch": 0.6416762891824147, + "grad_norm": 0.44322249488673826, + "learning_rate": 1.558802664722508e-06, + "loss": 0.156, + "step": 2848 + }, + { + "epoch": 0.6419015968682231, + "grad_norm": 0.43260831519975973, + "learning_rate": 1.5570770897695672e-06, + "loss": 0.1678, + "step": 2849 + }, + { + "epoch": 0.6421269045540317, + "grad_norm": 0.4702669830784481, + "learning_rate": 1.555352038386571e-06, + "loss": 0.171, + "step": 2850 + }, + { + "epoch": 0.6423522122398401, + "grad_norm": 0.4573267236419879, + "learning_rate": 1.5536275115313776e-06, + "loss": 0.1598, + "step": 2851 + }, + { + "epoch": 0.6425775199256485, + "grad_norm": 0.47221271486643357, + "learning_rate": 1.5519035101615518e-06, + "loss": 0.1723, + "step": 2852 + }, + { + "epoch": 0.6428028276114569, + "grad_norm": 0.46404568613966796, + "learning_rate": 1.5501800352343673e-06, + "loss": 0.1769, + "step": 2853 + }, + { + "epoch": 0.6430281352972653, + "grad_norm": 0.4739038910060207, + "learning_rate": 1.5484570877068055e-06, + "loss": 0.1749, + "step": 2854 + }, + { + "epoch": 0.6432534429830737, + "grad_norm": 0.4892521651566746, + "learning_rate": 1.5467346685355553e-06, + "loss": 0.1864, + "step": 2855 + }, + { + "epoch": 0.6434787506688822, + "grad_norm": 0.4691418579436085, + "learning_rate": 1.5450127786770116e-06, + "loss": 0.1862, + "step": 2856 + }, + { + "epoch": 0.6437040583546906, + "grad_norm": 0.4687994822931264, + "learning_rate": 1.5432914190872757e-06, + "loss": 0.1732, + "step": 2857 + }, + { + "epoch": 0.6439293660404991, + "grad_norm": 0.44273031133724056, + "learning_rate": 1.5415705907221545e-06, + "loss": 0.1523, + "step": 2858 + }, + { + "epoch": 0.6441546737263075, + "grad_norm": 0.45237596743535335, + "learning_rate": 1.53985029453716e-06, + "loss": 0.1634, + "step": 2859 + }, + { + "epoch": 0.6443799814121159, + "grad_norm": 0.5631080344505704, + "learning_rate": 1.5381305314875084e-06, + "loss": 0.1764, + "step": 2860 + }, + { + "epoch": 0.6446052890979244, + "grad_norm": 0.4343122808886178, + "learning_rate": 1.536411302528119e-06, + "loss": 0.1697, + "step": 2861 + }, + { + "epoch": 0.6448305967837328, + "grad_norm": 0.44690357773811284, + "learning_rate": 1.5346926086136171e-06, + "loss": 0.1685, + "step": 2862 + }, + { + "epoch": 0.6450559044695412, + "grad_norm": 0.47416546130848125, + "learning_rate": 1.5329744506983292e-06, + "loss": 0.1725, + "step": 2863 + }, + { + "epoch": 0.6452812121553496, + "grad_norm": 0.4356634645549042, + "learning_rate": 1.5312568297362834e-06, + "loss": 0.1575, + "step": 2864 + }, + { + "epoch": 0.645506519841158, + "grad_norm": 0.46466290089526685, + "learning_rate": 1.5295397466812115e-06, + "loss": 0.1702, + "step": 2865 + }, + { + "epoch": 0.6457318275269666, + "grad_norm": 0.4588207898132333, + "learning_rate": 1.5278232024865458e-06, + "loss": 0.1618, + "step": 2866 + }, + { + "epoch": 0.645957135212775, + "grad_norm": 0.4383705151209667, + "learning_rate": 1.5261071981054183e-06, + "loss": 0.1676, + "step": 2867 + }, + { + "epoch": 0.6461824428985834, + "grad_norm": 0.4505042772307737, + "learning_rate": 1.5243917344906625e-06, + "loss": 0.1615, + "step": 2868 + }, + { + "epoch": 0.6464077505843918, + "grad_norm": 0.41266542503484493, + "learning_rate": 1.5226768125948122e-06, + "loss": 0.1504, + "step": 2869 + }, + { + "epoch": 0.6466330582702002, + "grad_norm": 0.5070919933824632, + "learning_rate": 1.5209624333700985e-06, + "loss": 0.1501, + "step": 2870 + }, + { + "epoch": 0.6468583659560087, + "grad_norm": 0.42941636749695944, + "learning_rate": 1.5192485977684528e-06, + "loss": 0.1693, + "step": 2871 + }, + { + "epoch": 0.6470836736418171, + "grad_norm": 0.4724685573375727, + "learning_rate": 1.517535306741505e-06, + "loss": 0.1792, + "step": 2872 + }, + { + "epoch": 0.6473089813276255, + "grad_norm": 0.45372801009792174, + "learning_rate": 1.5158225612405808e-06, + "loss": 0.1586, + "step": 2873 + }, + { + "epoch": 0.647534289013434, + "grad_norm": 0.43406651616924996, + "learning_rate": 1.5141103622167042e-06, + "loss": 0.1564, + "step": 2874 + }, + { + "epoch": 0.6477595966992424, + "grad_norm": 0.434603289210475, + "learning_rate": 1.512398710620595e-06, + "loss": 0.1623, + "step": 2875 + }, + { + "epoch": 0.6479849043850509, + "grad_norm": 0.41620204096302416, + "learning_rate": 1.51068760740267e-06, + "loss": 0.1505, + "step": 2876 + }, + { + "epoch": 0.6482102120708593, + "grad_norm": 0.4304518336393149, + "learning_rate": 1.508977053513041e-06, + "loss": 0.1562, + "step": 2877 + }, + { + "epoch": 0.6484355197566677, + "grad_norm": 0.45491686063137143, + "learning_rate": 1.5072670499015151e-06, + "loss": 0.1706, + "step": 2878 + }, + { + "epoch": 0.6486608274424761, + "grad_norm": 0.4769764296464412, + "learning_rate": 1.5055575975175929e-06, + "loss": 0.1826, + "step": 2879 + }, + { + "epoch": 0.6488861351282845, + "grad_norm": 0.5013659109720912, + "learning_rate": 1.5038486973104704e-06, + "loss": 0.1931, + "step": 2880 + }, + { + "epoch": 0.6491114428140929, + "grad_norm": 0.44365522235384486, + "learning_rate": 1.5021403502290354e-06, + "loss": 0.1556, + "step": 2881 + }, + { + "epoch": 0.6493367504999015, + "grad_norm": 0.4423560812017721, + "learning_rate": 1.5004325572218698e-06, + "loss": 0.1622, + "step": 2882 + }, + { + "epoch": 0.6495620581857099, + "grad_norm": 0.46688913487218303, + "learning_rate": 1.4987253192372471e-06, + "loss": 0.1773, + "step": 2883 + }, + { + "epoch": 0.6497873658715183, + "grad_norm": 0.4587191094644422, + "learning_rate": 1.4970186372231347e-06, + "loss": 0.1737, + "step": 2884 + }, + { + "epoch": 0.6500126735573267, + "grad_norm": 0.48286669948693073, + "learning_rate": 1.4953125121271866e-06, + "loss": 0.1802, + "step": 2885 + }, + { + "epoch": 0.6502379812431351, + "grad_norm": 0.43040127296031444, + "learning_rate": 1.493606944896751e-06, + "loss": 0.1591, + "step": 2886 + }, + { + "epoch": 0.6504632889289436, + "grad_norm": 0.43969252599536995, + "learning_rate": 1.4919019364788678e-06, + "loss": 0.154, + "step": 2887 + }, + { + "epoch": 0.650688596614752, + "grad_norm": 0.4518734297454194, + "learning_rate": 1.490197487820263e-06, + "loss": 0.1735, + "step": 2888 + }, + { + "epoch": 0.6509139043005604, + "grad_norm": 0.45541581464642633, + "learning_rate": 1.4884935998673539e-06, + "loss": 0.1684, + "step": 2889 + }, + { + "epoch": 0.6511392119863689, + "grad_norm": 0.46614535886156255, + "learning_rate": 1.486790273566246e-06, + "loss": 0.1765, + "step": 2890 + }, + { + "epoch": 0.6513645196721773, + "grad_norm": 0.4379972883852115, + "learning_rate": 1.4850875098627326e-06, + "loss": 0.1688, + "step": 2891 + }, + { + "epoch": 0.6515898273579858, + "grad_norm": 0.46430985662758334, + "learning_rate": 1.483385309702295e-06, + "loss": 0.1765, + "step": 2892 + }, + { + "epoch": 0.6518151350437942, + "grad_norm": 0.42541312627135036, + "learning_rate": 1.4816836740301019e-06, + "loss": 0.1691, + "step": 2893 + }, + { + "epoch": 0.6520404427296026, + "grad_norm": 0.5223950571350945, + "learning_rate": 1.4799826037910082e-06, + "loss": 0.1923, + "step": 2894 + }, + { + "epoch": 0.652265750415411, + "grad_norm": 0.4234236693272207, + "learning_rate": 1.478282099929554e-06, + "loss": 0.1526, + "step": 2895 + }, + { + "epoch": 0.6524910581012194, + "grad_norm": 0.47284149584985924, + "learning_rate": 1.4765821633899663e-06, + "loss": 0.1728, + "step": 2896 + }, + { + "epoch": 0.652716365787028, + "grad_norm": 0.45217392509227117, + "learning_rate": 1.4748827951161566e-06, + "loss": 0.1633, + "step": 2897 + }, + { + "epoch": 0.6529416734728364, + "grad_norm": 0.4625031023159013, + "learning_rate": 1.4731839960517202e-06, + "loss": 0.17, + "step": 2898 + }, + { + "epoch": 0.6531669811586448, + "grad_norm": 0.470085548485971, + "learning_rate": 1.4714857671399374e-06, + "loss": 0.1923, + "step": 2899 + }, + { + "epoch": 0.6533922888444532, + "grad_norm": 0.4315420280155979, + "learning_rate": 1.4697881093237714e-06, + "loss": 0.1606, + "step": 2900 + }, + { + "epoch": 0.6536175965302616, + "grad_norm": 0.4407900719732138, + "learning_rate": 1.4680910235458692e-06, + "loss": 0.1635, + "step": 2901 + }, + { + "epoch": 0.65384290421607, + "grad_norm": 0.432307134894963, + "learning_rate": 1.4663945107485567e-06, + "loss": 0.1618, + "step": 2902 + }, + { + "epoch": 0.6540682119018785, + "grad_norm": 0.46691882474022245, + "learning_rate": 1.4646985718738466e-06, + "loss": 0.1791, + "step": 2903 + }, + { + "epoch": 0.6542935195876869, + "grad_norm": 0.44674464065975367, + "learning_rate": 1.4630032078634293e-06, + "loss": 0.165, + "step": 2904 + }, + { + "epoch": 0.6545188272734954, + "grad_norm": 0.4504972198573789, + "learning_rate": 1.461308419658678e-06, + "loss": 0.1622, + "step": 2905 + }, + { + "epoch": 0.6547441349593038, + "grad_norm": 0.4419051695013604, + "learning_rate": 1.4596142082006448e-06, + "loss": 0.1599, + "step": 2906 + }, + { + "epoch": 0.6549694426451123, + "grad_norm": 0.6374216842821923, + "learning_rate": 1.457920574430062e-06, + "loss": 0.195, + "step": 2907 + }, + { + "epoch": 0.6551947503309207, + "grad_norm": 0.4602701257624001, + "learning_rate": 1.456227519287343e-06, + "loss": 0.1757, + "step": 2908 + }, + { + "epoch": 0.6554200580167291, + "grad_norm": 0.4236936381738281, + "learning_rate": 1.4545350437125755e-06, + "loss": 0.1536, + "step": 2909 + }, + { + "epoch": 0.6556453657025375, + "grad_norm": 0.4860840988089347, + "learning_rate": 1.4528431486455311e-06, + "loss": 0.1789, + "step": 2910 + }, + { + "epoch": 0.6558706733883459, + "grad_norm": 0.4748100004700641, + "learning_rate": 1.451151835025653e-06, + "loss": 0.1736, + "step": 2911 + }, + { + "epoch": 0.6560959810741543, + "grad_norm": 0.4278338333190261, + "learning_rate": 1.4494611037920667e-06, + "loss": 0.1596, + "step": 2912 + }, + { + "epoch": 0.6563212887599629, + "grad_norm": 0.4688794341326289, + "learning_rate": 1.4477709558835724e-06, + "loss": 0.1688, + "step": 2913 + }, + { + "epoch": 0.6565465964457713, + "grad_norm": 0.4687981706465521, + "learning_rate": 1.4460813922386446e-06, + "loss": 0.1719, + "step": 2914 + }, + { + "epoch": 0.6567719041315797, + "grad_norm": 0.451945574512909, + "learning_rate": 1.4443924137954368e-06, + "loss": 0.1583, + "step": 2915 + }, + { + "epoch": 0.6569972118173881, + "grad_norm": 0.46728797859555166, + "learning_rate": 1.4427040214917742e-06, + "loss": 0.1696, + "step": 2916 + }, + { + "epoch": 0.6572225195031965, + "grad_norm": 0.4416027816502068, + "learning_rate": 1.4410162162651586e-06, + "loss": 0.1654, + "step": 2917 + }, + { + "epoch": 0.657447827189005, + "grad_norm": 0.45533735714447743, + "learning_rate": 1.4393289990527665e-06, + "loss": 0.156, + "step": 2918 + }, + { + "epoch": 0.6576731348748134, + "grad_norm": 0.4548346740596767, + "learning_rate": 1.4376423707914462e-06, + "loss": 0.167, + "step": 2919 + }, + { + "epoch": 0.6578984425606218, + "grad_norm": 0.4497338316343115, + "learning_rate": 1.4359563324177176e-06, + "loss": 0.1619, + "step": 2920 + }, + { + "epoch": 0.6581237502464303, + "grad_norm": 0.43963782167747745, + "learning_rate": 1.4342708848677774e-06, + "loss": 0.1625, + "step": 2921 + }, + { + "epoch": 0.6583490579322387, + "grad_norm": 0.45457523119099624, + "learning_rate": 1.43258602907749e-06, + "loss": 0.1625, + "step": 2922 + }, + { + "epoch": 0.6585743656180472, + "grad_norm": 0.4421685617761203, + "learning_rate": 1.430901765982395e-06, + "loss": 0.1754, + "step": 2923 + }, + { + "epoch": 0.6587996733038556, + "grad_norm": 0.4725721554417859, + "learning_rate": 1.429218096517699e-06, + "loss": 0.17, + "step": 2924 + }, + { + "epoch": 0.659024980989664, + "grad_norm": 0.38904007890397446, + "learning_rate": 1.4275350216182824e-06, + "loss": 0.1335, + "step": 2925 + }, + { + "epoch": 0.6592502886754724, + "grad_norm": 0.46883506152070087, + "learning_rate": 1.425852542218692e-06, + "loss": 0.178, + "step": 2926 + }, + { + "epoch": 0.6594755963612808, + "grad_norm": 0.43709294780290175, + "learning_rate": 1.4241706592531473e-06, + "loss": 0.1683, + "step": 2927 + }, + { + "epoch": 0.6597009040470893, + "grad_norm": 0.4575875819059064, + "learning_rate": 1.4224893736555364e-06, + "loss": 0.1747, + "step": 2928 + }, + { + "epoch": 0.6599262117328978, + "grad_norm": 0.4735597862999961, + "learning_rate": 1.420808686359412e-06, + "loss": 0.1757, + "step": 2929 + }, + { + "epoch": 0.6601515194187062, + "grad_norm": 0.44115967072161766, + "learning_rate": 1.4191285982979992e-06, + "loss": 0.1571, + "step": 2930 + }, + { + "epoch": 0.6603768271045146, + "grad_norm": 0.4548047368069635, + "learning_rate": 1.4174491104041866e-06, + "loss": 0.1742, + "step": 2931 + }, + { + "epoch": 0.660602134790323, + "grad_norm": 0.46255625192049116, + "learning_rate": 1.4157702236105326e-06, + "loss": 0.1826, + "step": 2932 + }, + { + "epoch": 0.6608274424761315, + "grad_norm": 0.43991964668446815, + "learning_rate": 1.414091938849259e-06, + "loss": 0.167, + "step": 2933 + }, + { + "epoch": 0.6610527501619399, + "grad_norm": 0.44916389098206744, + "learning_rate": 1.412414257052256e-06, + "loss": 0.1608, + "step": 2934 + }, + { + "epoch": 0.6612780578477483, + "grad_norm": 0.4593157531026723, + "learning_rate": 1.410737179151078e-06, + "loss": 0.1788, + "step": 2935 + }, + { + "epoch": 0.6615033655335567, + "grad_norm": 0.42024042452677723, + "learning_rate": 1.4090607060769423e-06, + "loss": 0.1518, + "step": 2936 + }, + { + "epoch": 0.6617286732193652, + "grad_norm": 0.4353260118557558, + "learning_rate": 1.407384838760734e-06, + "loss": 0.1742, + "step": 2937 + }, + { + "epoch": 0.6619539809051737, + "grad_norm": 0.43709724419014145, + "learning_rate": 1.4057095781329983e-06, + "loss": 0.1558, + "step": 2938 + }, + { + "epoch": 0.6621792885909821, + "grad_norm": 0.4359816116367562, + "learning_rate": 1.4040349251239444e-06, + "loss": 0.1505, + "step": 2939 + }, + { + "epoch": 0.6624045962767905, + "grad_norm": 0.45492755269965934, + "learning_rate": 1.402360880663447e-06, + "loss": 0.1674, + "step": 2940 + }, + { + "epoch": 0.6626299039625989, + "grad_norm": 0.46259457810278043, + "learning_rate": 1.4006874456810377e-06, + "loss": 0.171, + "step": 2941 + }, + { + "epoch": 0.6628552116484073, + "grad_norm": 0.4707948377795537, + "learning_rate": 1.3990146211059141e-06, + "loss": 0.1757, + "step": 2942 + }, + { + "epoch": 0.6630805193342157, + "grad_norm": 0.44744154364405875, + "learning_rate": 1.3973424078669346e-06, + "loss": 0.1696, + "step": 2943 + }, + { + "epoch": 0.6633058270200243, + "grad_norm": 0.47271195764724167, + "learning_rate": 1.3956708068926141e-06, + "loss": 0.1848, + "step": 2944 + }, + { + "epoch": 0.6635311347058327, + "grad_norm": 0.485270852547156, + "learning_rate": 1.393999819111133e-06, + "loss": 0.1702, + "step": 2945 + }, + { + "epoch": 0.6637564423916411, + "grad_norm": 0.4477821506498824, + "learning_rate": 1.3923294454503263e-06, + "loss": 0.1615, + "step": 2946 + }, + { + "epoch": 0.6639817500774495, + "grad_norm": 0.4285968442862378, + "learning_rate": 1.3906596868376923e-06, + "loss": 0.167, + "step": 2947 + }, + { + "epoch": 0.664207057763258, + "grad_norm": 0.4149948672404091, + "learning_rate": 1.3889905442003836e-06, + "loss": 0.1439, + "step": 2948 + }, + { + "epoch": 0.6644323654490664, + "grad_norm": 0.4596912990411504, + "learning_rate": 1.3873220184652143e-06, + "loss": 0.1812, + "step": 2949 + }, + { + "epoch": 0.6646576731348748, + "grad_norm": 0.45633216317699576, + "learning_rate": 1.3856541105586545e-06, + "loss": 0.173, + "step": 2950 + }, + { + "epoch": 0.6648829808206832, + "grad_norm": 0.46474398545897017, + "learning_rate": 1.3839868214068303e-06, + "loss": 0.1716, + "step": 2951 + }, + { + "epoch": 0.6651082885064917, + "grad_norm": 0.4503765149225735, + "learning_rate": 1.382320151935527e-06, + "loss": 0.1575, + "step": 2952 + }, + { + "epoch": 0.6653335961923001, + "grad_norm": 0.4397149456891927, + "learning_rate": 1.380654103070182e-06, + "loss": 0.1486, + "step": 2953 + }, + { + "epoch": 0.6655589038781086, + "grad_norm": 0.45104520656947134, + "learning_rate": 1.3789886757358916e-06, + "loss": 0.1703, + "step": 2954 + }, + { + "epoch": 0.665784211563917, + "grad_norm": 0.4586134850153647, + "learning_rate": 1.3773238708574054e-06, + "loss": 0.1765, + "step": 2955 + }, + { + "epoch": 0.6660095192497254, + "grad_norm": 0.45725415356444155, + "learning_rate": 1.375659689359126e-06, + "loss": 0.1697, + "step": 2956 + }, + { + "epoch": 0.6662348269355338, + "grad_norm": 0.429125649497335, + "learning_rate": 1.3739961321651139e-06, + "loss": 0.1567, + "step": 2957 + }, + { + "epoch": 0.6664601346213422, + "grad_norm": 0.45116645226330354, + "learning_rate": 1.3723332001990774e-06, + "loss": 0.1513, + "step": 2958 + }, + { + "epoch": 0.6666854423071507, + "grad_norm": 0.4154321054190309, + "learning_rate": 1.3706708943843822e-06, + "loss": 0.1421, + "step": 2959 + }, + { + "epoch": 0.6669107499929592, + "grad_norm": 0.438312443147914, + "learning_rate": 1.369009215644046e-06, + "loss": 0.1572, + "step": 2960 + }, + { + "epoch": 0.6671360576787676, + "grad_norm": 0.4154703470725542, + "learning_rate": 1.3673481649007347e-06, + "loss": 0.1565, + "step": 2961 + }, + { + "epoch": 0.667361365364576, + "grad_norm": 0.4296943780170812, + "learning_rate": 1.36568774307677e-06, + "loss": 0.15, + "step": 2962 + }, + { + "epoch": 0.6675866730503844, + "grad_norm": 0.4424275093755365, + "learning_rate": 1.36402795109412e-06, + "loss": 0.1618, + "step": 2963 + }, + { + "epoch": 0.6678119807361929, + "grad_norm": 0.41014988907135724, + "learning_rate": 1.362368789874407e-06, + "loss": 0.15, + "step": 2964 + }, + { + "epoch": 0.6680372884220013, + "grad_norm": 0.4476535507341546, + "learning_rate": 1.3607102603389016e-06, + "loss": 0.1597, + "step": 2965 + }, + { + "epoch": 0.6682625961078097, + "grad_norm": 0.4450823545201284, + "learning_rate": 1.3590523634085218e-06, + "loss": 0.1628, + "step": 2966 + }, + { + "epoch": 0.6684879037936181, + "grad_norm": 0.4407489458183779, + "learning_rate": 1.3573951000038376e-06, + "loss": 0.1553, + "step": 2967 + }, + { + "epoch": 0.6687132114794266, + "grad_norm": 0.44580210756366934, + "learning_rate": 1.3557384710450644e-06, + "loss": 0.183, + "step": 2968 + }, + { + "epoch": 0.6689385191652351, + "grad_norm": 0.4338130619861322, + "learning_rate": 1.3540824774520678e-06, + "loss": 0.1654, + "step": 2969 + }, + { + "epoch": 0.6691638268510435, + "grad_norm": 0.4334281259138233, + "learning_rate": 1.3524271201443578e-06, + "loss": 0.155, + "step": 2970 + }, + { + "epoch": 0.6693891345368519, + "grad_norm": 0.5108298456194338, + "learning_rate": 1.3507724000410933e-06, + "loss": 0.18, + "step": 2971 + }, + { + "epoch": 0.6696144422226603, + "grad_norm": 0.42147941514910625, + "learning_rate": 1.3491183180610807e-06, + "loss": 0.1598, + "step": 2972 + }, + { + "epoch": 0.6698397499084687, + "grad_norm": 0.47121748478749065, + "learning_rate": 1.347464875122766e-06, + "loss": 0.1518, + "step": 2973 + }, + { + "epoch": 0.6700650575942771, + "grad_norm": 0.41631404459931765, + "learning_rate": 1.3458120721442464e-06, + "loss": 0.1478, + "step": 2974 + }, + { + "epoch": 0.6702903652800856, + "grad_norm": 0.4258305162245609, + "learning_rate": 1.3441599100432635e-06, + "loss": 0.1537, + "step": 2975 + }, + { + "epoch": 0.6705156729658941, + "grad_norm": 0.4138417683739623, + "learning_rate": 1.3425083897371983e-06, + "loss": 0.1545, + "step": 2976 + }, + { + "epoch": 0.6707409806517025, + "grad_norm": 0.45756294566839184, + "learning_rate": 1.3408575121430812e-06, + "loss": 0.1612, + "step": 2977 + }, + { + "epoch": 0.6709662883375109, + "grad_norm": 0.4415297191242686, + "learning_rate": 1.3392072781775806e-06, + "loss": 0.1636, + "step": 2978 + }, + { + "epoch": 0.6711915960233193, + "grad_norm": 0.42697865830810067, + "learning_rate": 1.337557688757012e-06, + "loss": 0.1506, + "step": 2979 + }, + { + "epoch": 0.6714169037091278, + "grad_norm": 0.4748002717401216, + "learning_rate": 1.335908744797329e-06, + "loss": 0.1673, + "step": 2980 + }, + { + "epoch": 0.6716422113949362, + "grad_norm": 0.48047460757174437, + "learning_rate": 1.3342604472141296e-06, + "loss": 0.1776, + "step": 2981 + }, + { + "epoch": 0.6718675190807446, + "grad_norm": 0.4660511955808628, + "learning_rate": 1.3326127969226535e-06, + "loss": 0.1748, + "step": 2982 + }, + { + "epoch": 0.672092826766553, + "grad_norm": 0.43771581214022187, + "learning_rate": 1.3309657948377768e-06, + "loss": 0.1632, + "step": 2983 + }, + { + "epoch": 0.6723181344523615, + "grad_norm": 0.450687040230089, + "learning_rate": 1.3293194418740207e-06, + "loss": 0.1683, + "step": 2984 + }, + { + "epoch": 0.67254344213817, + "grad_norm": 0.4712507612968999, + "learning_rate": 1.3276737389455416e-06, + "loss": 0.1632, + "step": 2985 + }, + { + "epoch": 0.6727687498239784, + "grad_norm": 0.46168700839539434, + "learning_rate": 1.3260286869661378e-06, + "loss": 0.1672, + "step": 2986 + }, + { + "epoch": 0.6729940575097868, + "grad_norm": 0.6411696002061731, + "learning_rate": 1.3243842868492468e-06, + "loss": 0.1569, + "step": 2987 + }, + { + "epoch": 0.6732193651955952, + "grad_norm": 0.4600753922023911, + "learning_rate": 1.32274053950794e-06, + "loss": 0.1659, + "step": 2988 + }, + { + "epoch": 0.6734446728814036, + "grad_norm": 0.4557159831762484, + "learning_rate": 1.3210974458549318e-06, + "loss": 0.1727, + "step": 2989 + }, + { + "epoch": 0.6736699805672121, + "grad_norm": 0.4446239041707151, + "learning_rate": 1.3194550068025697e-06, + "loss": 0.1704, + "step": 2990 + }, + { + "epoch": 0.6738952882530206, + "grad_norm": 0.45169648834664844, + "learning_rate": 1.3178132232628374e-06, + "loss": 0.1675, + "step": 2991 + }, + { + "epoch": 0.674120595938829, + "grad_norm": 0.4454699122296075, + "learning_rate": 1.3161720961473583e-06, + "loss": 0.1676, + "step": 2992 + }, + { + "epoch": 0.6743459036246374, + "grad_norm": 0.47263790743223116, + "learning_rate": 1.3145316263673874e-06, + "loss": 0.1573, + "step": 2993 + }, + { + "epoch": 0.6745712113104458, + "grad_norm": 0.47314942642149976, + "learning_rate": 1.3128918148338183e-06, + "loss": 0.174, + "step": 2994 + }, + { + "epoch": 0.6747965189962543, + "grad_norm": 0.4640772290846561, + "learning_rate": 1.3112526624571753e-06, + "loss": 0.1762, + "step": 2995 + }, + { + "epoch": 0.6750218266820627, + "grad_norm": 0.46287035380499897, + "learning_rate": 1.3096141701476189e-06, + "loss": 0.1754, + "step": 2996 + }, + { + "epoch": 0.6752471343678711, + "grad_norm": 0.48254038351808376, + "learning_rate": 1.307976338814945e-06, + "loss": 0.175, + "step": 2997 + }, + { + "epoch": 0.6754724420536795, + "grad_norm": 0.4656671782106589, + "learning_rate": 1.3063391693685773e-06, + "loss": 0.1909, + "step": 2998 + }, + { + "epoch": 0.675697749739488, + "grad_norm": 0.44012303346331505, + "learning_rate": 1.3047026627175774e-06, + "loss": 0.1554, + "step": 2999 + }, + { + "epoch": 0.6759230574252965, + "grad_norm": 0.44908437013177216, + "learning_rate": 1.3030668197706347e-06, + "loss": 0.1607, + "step": 3000 + }, + { + "epoch": 0.6759230574252965, + "eval_loss": 0.16799412667751312, + "eval_runtime": 57.0458, + "eval_samples_per_second": 50.31, + "eval_steps_per_second": 6.293, + "step": 3000 + }, + { + "epoch": 0.6761483651111049, + "grad_norm": 0.4045207060586454, + "learning_rate": 1.3014316414360732e-06, + "loss": 0.1489, + "step": 3001 + }, + { + "epoch": 0.6763736727969133, + "grad_norm": 0.40214515031781006, + "learning_rate": 1.2997971286218448e-06, + "loss": 0.1465, + "step": 3002 + }, + { + "epoch": 0.6765989804827217, + "grad_norm": 0.43553698518365597, + "learning_rate": 1.2981632822355344e-06, + "loss": 0.1599, + "step": 3003 + }, + { + "epoch": 0.6768242881685301, + "grad_norm": 0.4733697672715525, + "learning_rate": 1.2965301031843574e-06, + "loss": 0.1699, + "step": 3004 + }, + { + "epoch": 0.6770495958543385, + "grad_norm": 0.4311278603284419, + "learning_rate": 1.294897592375155e-06, + "loss": 0.1498, + "step": 3005 + }, + { + "epoch": 0.677274903540147, + "grad_norm": 0.44463561437394716, + "learning_rate": 1.2932657507144014e-06, + "loss": 0.1662, + "step": 3006 + }, + { + "epoch": 0.6775002112259555, + "grad_norm": 0.44397961828400145, + "learning_rate": 1.2916345791081964e-06, + "loss": 0.1677, + "step": 3007 + }, + { + "epoch": 0.6777255189117639, + "grad_norm": 0.4436614598401153, + "learning_rate": 1.2900040784622686e-06, + "loss": 0.1594, + "step": 3008 + }, + { + "epoch": 0.6779508265975723, + "grad_norm": 0.44533916943535345, + "learning_rate": 1.2883742496819751e-06, + "loss": 0.1551, + "step": 3009 + }, + { + "epoch": 0.6781761342833807, + "grad_norm": 0.45376528949016137, + "learning_rate": 1.286745093672298e-06, + "loss": 0.1645, + "step": 3010 + }, + { + "epoch": 0.6784014419691892, + "grad_norm": 0.484755955658753, + "learning_rate": 1.2851166113378471e-06, + "loss": 0.1758, + "step": 3011 + }, + { + "epoch": 0.6786267496549976, + "grad_norm": 0.43057162693290235, + "learning_rate": 1.2834888035828597e-06, + "loss": 0.1579, + "step": 3012 + }, + { + "epoch": 0.678852057340806, + "grad_norm": 0.4440887399731602, + "learning_rate": 1.2818616713111945e-06, + "loss": 0.1724, + "step": 3013 + }, + { + "epoch": 0.6790773650266144, + "grad_norm": 0.45746611446786256, + "learning_rate": 1.2802352154263392e-06, + "loss": 0.1622, + "step": 3014 + }, + { + "epoch": 0.679302672712423, + "grad_norm": 0.43445431896986997, + "learning_rate": 1.2786094368314023e-06, + "loss": 0.1541, + "step": 3015 + }, + { + "epoch": 0.6795279803982314, + "grad_norm": 0.4587523732104358, + "learning_rate": 1.2769843364291202e-06, + "loss": 0.1736, + "step": 3016 + }, + { + "epoch": 0.6797532880840398, + "grad_norm": 0.4449372738968104, + "learning_rate": 1.2753599151218483e-06, + "loss": 0.1597, + "step": 3017 + }, + { + "epoch": 0.6799785957698482, + "grad_norm": 0.4377493639433729, + "learning_rate": 1.2737361738115681e-06, + "loss": 0.1461, + "step": 3018 + }, + { + "epoch": 0.6802039034556566, + "grad_norm": 0.46037661011260417, + "learning_rate": 1.2721131133998837e-06, + "loss": 0.1659, + "step": 3019 + }, + { + "epoch": 0.680429211141465, + "grad_norm": 0.46705358174265027, + "learning_rate": 1.2704907347880185e-06, + "loss": 0.1723, + "step": 3020 + }, + { + "epoch": 0.6806545188272735, + "grad_norm": 0.4604557207122367, + "learning_rate": 1.2688690388768205e-06, + "loss": 0.1767, + "step": 3021 + }, + { + "epoch": 0.6808798265130819, + "grad_norm": 0.4908203532007491, + "learning_rate": 1.2672480265667553e-06, + "loss": 0.1747, + "step": 3022 + }, + { + "epoch": 0.6811051341988904, + "grad_norm": 0.48024484956693997, + "learning_rate": 1.2656276987579118e-06, + "loss": 0.1768, + "step": 3023 + }, + { + "epoch": 0.6813304418846988, + "grad_norm": 0.47992747730797575, + "learning_rate": 1.2640080563499977e-06, + "loss": 0.1636, + "step": 3024 + }, + { + "epoch": 0.6815557495705072, + "grad_norm": 0.49616792180455216, + "learning_rate": 1.2623891002423383e-06, + "loss": 0.1687, + "step": 3025 + }, + { + "epoch": 0.6817810572563157, + "grad_norm": 0.4958452554843842, + "learning_rate": 1.2607708313338818e-06, + "loss": 0.1852, + "step": 3026 + }, + { + "epoch": 0.6820063649421241, + "grad_norm": 0.5867256506611247, + "learning_rate": 1.2591532505231906e-06, + "loss": 0.1804, + "step": 3027 + }, + { + "epoch": 0.6822316726279325, + "grad_norm": 0.47153342753382094, + "learning_rate": 1.2575363587084486e-06, + "loss": 0.1643, + "step": 3028 + }, + { + "epoch": 0.6824569803137409, + "grad_norm": 0.4248422833444112, + "learning_rate": 1.2559201567874554e-06, + "loss": 0.1621, + "step": 3029 + }, + { + "epoch": 0.6826822879995493, + "grad_norm": 0.48128772104019113, + "learning_rate": 1.2543046456576267e-06, + "loss": 0.1707, + "step": 3030 + }, + { + "epoch": 0.6829075956853579, + "grad_norm": 0.46131266986504005, + "learning_rate": 1.252689826215997e-06, + "loss": 0.1533, + "step": 3031 + }, + { + "epoch": 0.6831329033711663, + "grad_norm": 0.4606110301438835, + "learning_rate": 1.2510756993592138e-06, + "loss": 0.1598, + "step": 3032 + }, + { + "epoch": 0.6833582110569747, + "grad_norm": 0.42485485895311764, + "learning_rate": 1.2494622659835421e-06, + "loss": 0.1542, + "step": 3033 + }, + { + "epoch": 0.6835835187427831, + "grad_norm": 0.4255751499014428, + "learning_rate": 1.2478495269848626e-06, + "loss": 0.1622, + "step": 3034 + }, + { + "epoch": 0.6838088264285915, + "grad_norm": 0.4703466968405558, + "learning_rate": 1.246237483258667e-06, + "loss": 0.1669, + "step": 3035 + }, + { + "epoch": 0.6840341341144, + "grad_norm": 0.43788275434395674, + "learning_rate": 1.2446261357000655e-06, + "loss": 0.1461, + "step": 3036 + }, + { + "epoch": 0.6842594418002084, + "grad_norm": 0.41645411341294414, + "learning_rate": 1.243015485203777e-06, + "loss": 0.1532, + "step": 3037 + }, + { + "epoch": 0.6844847494860169, + "grad_norm": 0.4505932738914964, + "learning_rate": 1.2414055326641378e-06, + "loss": 0.1711, + "step": 3038 + }, + { + "epoch": 0.6847100571718253, + "grad_norm": 0.41893408546984606, + "learning_rate": 1.2397962789750923e-06, + "loss": 0.1542, + "step": 3039 + }, + { + "epoch": 0.6849353648576337, + "grad_norm": 0.4711724425645053, + "learning_rate": 1.2381877250302002e-06, + "loss": 0.1806, + "step": 3040 + }, + { + "epoch": 0.6851606725434422, + "grad_norm": 0.4495094197541863, + "learning_rate": 1.236579871722633e-06, + "loss": 0.1705, + "step": 3041 + }, + { + "epoch": 0.6853859802292506, + "grad_norm": 0.46861579849997625, + "learning_rate": 1.2349727199451696e-06, + "loss": 0.1804, + "step": 3042 + }, + { + "epoch": 0.685611287915059, + "grad_norm": 0.43204331280300484, + "learning_rate": 1.233366270590202e-06, + "loss": 0.1589, + "step": 3043 + }, + { + "epoch": 0.6858365956008674, + "grad_norm": 0.4705481793698645, + "learning_rate": 1.2317605245497324e-06, + "loss": 0.1619, + "step": 3044 + }, + { + "epoch": 0.6860619032866758, + "grad_norm": 0.4479845479410491, + "learning_rate": 1.2301554827153703e-06, + "loss": 0.1649, + "step": 3045 + }, + { + "epoch": 0.6862872109724844, + "grad_norm": 0.487934086624641, + "learning_rate": 1.2285511459783373e-06, + "loss": 0.1836, + "step": 3046 + }, + { + "epoch": 0.6865125186582928, + "grad_norm": 0.4096337075536667, + "learning_rate": 1.2269475152294601e-06, + "loss": 0.1499, + "step": 3047 + }, + { + "epoch": 0.6867378263441012, + "grad_norm": 0.4610643639275637, + "learning_rate": 1.225344591359177e-06, + "loss": 0.1644, + "step": 3048 + }, + { + "epoch": 0.6869631340299096, + "grad_norm": 0.47672152053337113, + "learning_rate": 1.2237423752575297e-06, + "loss": 0.1798, + "step": 3049 + }, + { + "epoch": 0.687188441715718, + "grad_norm": 0.44402906792318925, + "learning_rate": 1.2221408678141702e-06, + "loss": 0.1669, + "step": 3050 + }, + { + "epoch": 0.6874137494015264, + "grad_norm": 0.4548955980190698, + "learning_rate": 1.220540069918357e-06, + "loss": 0.1671, + "step": 3051 + }, + { + "epoch": 0.6876390570873349, + "grad_norm": 0.4224584486327237, + "learning_rate": 1.2189399824589513e-06, + "loss": 0.1526, + "step": 3052 + }, + { + "epoch": 0.6878643647731433, + "grad_norm": 0.48770313531656717, + "learning_rate": 1.217340606324424e-06, + "loss": 0.1848, + "step": 3053 + }, + { + "epoch": 0.6880896724589518, + "grad_norm": 0.4664340848379229, + "learning_rate": 1.2157419424028473e-06, + "loss": 0.1795, + "step": 3054 + }, + { + "epoch": 0.6883149801447602, + "grad_norm": 0.4444874834175369, + "learning_rate": 1.2141439915819008e-06, + "loss": 0.1678, + "step": 3055 + }, + { + "epoch": 0.6885402878305686, + "grad_norm": 0.47140326467430854, + "learning_rate": 1.2125467547488676e-06, + "loss": 0.187, + "step": 3056 + }, + { + "epoch": 0.6887655955163771, + "grad_norm": 0.46720163436252704, + "learning_rate": 1.210950232790632e-06, + "loss": 0.1764, + "step": 3057 + }, + { + "epoch": 0.6889909032021855, + "grad_norm": 0.4430209008331684, + "learning_rate": 1.2093544265936848e-06, + "loss": 0.165, + "step": 3058 + }, + { + "epoch": 0.6892162108879939, + "grad_norm": 0.44993053977578057, + "learning_rate": 1.2077593370441165e-06, + "loss": 0.1607, + "step": 3059 + }, + { + "epoch": 0.6894415185738023, + "grad_norm": 0.4438040608723411, + "learning_rate": 1.206164965027622e-06, + "loss": 0.1669, + "step": 3060 + }, + { + "epoch": 0.6896668262596107, + "grad_norm": 0.43615520114250794, + "learning_rate": 1.204571311429496e-06, + "loss": 0.1625, + "step": 3061 + }, + { + "epoch": 0.6898921339454193, + "grad_norm": 0.44234003498809504, + "learning_rate": 1.2029783771346344e-06, + "loss": 0.1712, + "step": 3062 + }, + { + "epoch": 0.6901174416312277, + "grad_norm": 0.4754168174583188, + "learning_rate": 1.2013861630275353e-06, + "loss": 0.1841, + "step": 3063 + }, + { + "epoch": 0.6903427493170361, + "grad_norm": 0.44423113576368556, + "learning_rate": 1.1997946699922946e-06, + "loss": 0.1611, + "step": 3064 + }, + { + "epoch": 0.6905680570028445, + "grad_norm": 0.4764929526855533, + "learning_rate": 1.1982038989126096e-06, + "loss": 0.1719, + "step": 3065 + }, + { + "epoch": 0.6907933646886529, + "grad_norm": 0.42564941605177264, + "learning_rate": 1.1966138506717776e-06, + "loss": 0.1655, + "step": 3066 + }, + { + "epoch": 0.6910186723744614, + "grad_norm": 0.45087613732261783, + "learning_rate": 1.195024526152691e-06, + "loss": 0.17, + "step": 3067 + }, + { + "epoch": 0.6912439800602698, + "grad_norm": 0.4712892917553011, + "learning_rate": 1.1934359262378443e-06, + "loss": 0.1763, + "step": 3068 + }, + { + "epoch": 0.6914692877460782, + "grad_norm": 0.45412685355624216, + "learning_rate": 1.1918480518093259e-06, + "loss": 0.1554, + "step": 3069 + }, + { + "epoch": 0.6916945954318867, + "grad_norm": 0.390124474083273, + "learning_rate": 1.190260903748825e-06, + "loss": 0.1399, + "step": 3070 + }, + { + "epoch": 0.6919199031176951, + "grad_norm": 0.43849874145879203, + "learning_rate": 1.1886744829376243e-06, + "loss": 0.1602, + "step": 3071 + }, + { + "epoch": 0.6921452108035036, + "grad_norm": 0.4620672680186048, + "learning_rate": 1.187088790256605e-06, + "loss": 0.1715, + "step": 3072 + }, + { + "epoch": 0.692370518489312, + "grad_norm": 0.44935644278980397, + "learning_rate": 1.185503826586244e-06, + "loss": 0.1805, + "step": 3073 + }, + { + "epoch": 0.6925958261751204, + "grad_norm": 0.4538354616966058, + "learning_rate": 1.1839195928066101e-06, + "loss": 0.1662, + "step": 3074 + }, + { + "epoch": 0.6928211338609288, + "grad_norm": 0.47524190400682453, + "learning_rate": 1.1823360897973723e-06, + "loss": 0.1866, + "step": 3075 + }, + { + "epoch": 0.6930464415467372, + "grad_norm": 0.43089998086134584, + "learning_rate": 1.1807533184377882e-06, + "loss": 0.1573, + "step": 3076 + }, + { + "epoch": 0.6932717492325456, + "grad_norm": 0.41576167564181804, + "learning_rate": 1.1791712796067134e-06, + "loss": 0.1508, + "step": 3077 + }, + { + "epoch": 0.6934970569183542, + "grad_norm": 0.48274571793644777, + "learning_rate": 1.1775899741825947e-06, + "loss": 0.1673, + "step": 3078 + }, + { + "epoch": 0.6937223646041626, + "grad_norm": 0.4893947854806362, + "learning_rate": 1.176009403043471e-06, + "loss": 0.1736, + "step": 3079 + }, + { + "epoch": 0.693947672289971, + "grad_norm": 0.4730824978174537, + "learning_rate": 1.1744295670669752e-06, + "loss": 0.1711, + "step": 3080 + }, + { + "epoch": 0.6941729799757794, + "grad_norm": 0.5060347302380224, + "learning_rate": 1.1728504671303326e-06, + "loss": 0.1751, + "step": 3081 + }, + { + "epoch": 0.6943982876615878, + "grad_norm": 0.4505293484677534, + "learning_rate": 1.171272104110356e-06, + "loss": 0.1656, + "step": 3082 + }, + { + "epoch": 0.6946235953473963, + "grad_norm": 0.4452757511404009, + "learning_rate": 1.1696944788834546e-06, + "loss": 0.1649, + "step": 3083 + }, + { + "epoch": 0.6948489030332047, + "grad_norm": 0.45827773081525214, + "learning_rate": 1.168117592325622e-06, + "loss": 0.1701, + "step": 3084 + }, + { + "epoch": 0.6950742107190131, + "grad_norm": 0.4377819171257355, + "learning_rate": 1.1665414453124468e-06, + "loss": 0.1679, + "step": 3085 + }, + { + "epoch": 0.6952995184048216, + "grad_norm": 0.4484631371844825, + "learning_rate": 1.1649660387191027e-06, + "loss": 0.1628, + "step": 3086 + }, + { + "epoch": 0.69552482609063, + "grad_norm": 0.45570552491806887, + "learning_rate": 1.1633913734203552e-06, + "loss": 0.1686, + "step": 3087 + }, + { + "epoch": 0.6957501337764385, + "grad_norm": 0.41071622844886346, + "learning_rate": 1.1618174502905586e-06, + "loss": 0.1588, + "step": 3088 + }, + { + "epoch": 0.6959754414622469, + "grad_norm": 0.47370173539290794, + "learning_rate": 1.1602442702036513e-06, + "loss": 0.1755, + "step": 3089 + }, + { + "epoch": 0.6962007491480553, + "grad_norm": 0.4731619063827171, + "learning_rate": 1.1586718340331634e-06, + "loss": 0.1699, + "step": 3090 + }, + { + "epoch": 0.6964260568338637, + "grad_norm": 0.46775883606009677, + "learning_rate": 1.1571001426522088e-06, + "loss": 0.1595, + "step": 3091 + }, + { + "epoch": 0.6966513645196721, + "grad_norm": 0.44700715854549383, + "learning_rate": 1.1555291969334907e-06, + "loss": 0.1693, + "step": 3092 + }, + { + "epoch": 0.6968766722054807, + "grad_norm": 0.42152805897781503, + "learning_rate": 1.1539589977492946e-06, + "loss": 0.1413, + "step": 3093 + }, + { + "epoch": 0.6971019798912891, + "grad_norm": 0.479734690330806, + "learning_rate": 1.1523895459714948e-06, + "loss": 0.1737, + "step": 3094 + }, + { + "epoch": 0.6973272875770975, + "grad_norm": 0.47660433689818005, + "learning_rate": 1.1508208424715511e-06, + "loss": 0.1723, + "step": 3095 + }, + { + "epoch": 0.6975525952629059, + "grad_norm": 0.4485912917118982, + "learning_rate": 1.1492528881205027e-06, + "loss": 0.1665, + "step": 3096 + }, + { + "epoch": 0.6977779029487143, + "grad_norm": 0.4270492145051454, + "learning_rate": 1.1476856837889774e-06, + "loss": 0.1693, + "step": 3097 + }, + { + "epoch": 0.6980032106345228, + "grad_norm": 0.4615238583151758, + "learning_rate": 1.146119230347187e-06, + "loss": 0.1726, + "step": 3098 + }, + { + "epoch": 0.6982285183203312, + "grad_norm": 0.43677171404599924, + "learning_rate": 1.1445535286649223e-06, + "loss": 0.1577, + "step": 3099 + }, + { + "epoch": 0.6984538260061396, + "grad_norm": 0.4406892246023865, + "learning_rate": 1.142988579611561e-06, + "loss": 0.1611, + "step": 3100 + }, + { + "epoch": 0.6986791336919481, + "grad_norm": 0.4277526947497852, + "learning_rate": 1.1414243840560595e-06, + "loss": 0.1641, + "step": 3101 + }, + { + "epoch": 0.6989044413777565, + "grad_norm": 0.45111454713057597, + "learning_rate": 1.1398609428669582e-06, + "loss": 0.1739, + "step": 3102 + }, + { + "epoch": 0.699129749063565, + "grad_norm": 0.444731358337096, + "learning_rate": 1.1382982569123781e-06, + "loss": 0.1469, + "step": 3103 + }, + { + "epoch": 0.6993550567493734, + "grad_norm": 0.45192785317576195, + "learning_rate": 1.136736327060019e-06, + "loss": 0.1573, + "step": 3104 + }, + { + "epoch": 0.6995803644351818, + "grad_norm": 0.49434267027969314, + "learning_rate": 1.1351751541771644e-06, + "loss": 0.1762, + "step": 3105 + }, + { + "epoch": 0.6998056721209902, + "grad_norm": 0.4524347245803618, + "learning_rate": 1.133614739130673e-06, + "loss": 0.1645, + "step": 3106 + }, + { + "epoch": 0.7000309798067986, + "grad_norm": 0.47144711671074446, + "learning_rate": 1.1320550827869875e-06, + "loss": 0.1845, + "step": 3107 + }, + { + "epoch": 0.700256287492607, + "grad_norm": 0.4707941552899743, + "learning_rate": 1.1304961860121246e-06, + "loss": 0.1896, + "step": 3108 + }, + { + "epoch": 0.7004815951784156, + "grad_norm": 0.4725917840867182, + "learning_rate": 1.128938049671683e-06, + "loss": 0.1868, + "step": 3109 + }, + { + "epoch": 0.700706902864224, + "grad_norm": 0.44558472463370097, + "learning_rate": 1.127380674630838e-06, + "loss": 0.1725, + "step": 3110 + }, + { + "epoch": 0.7009322105500324, + "grad_norm": 0.4468597410624056, + "learning_rate": 1.1258240617543407e-06, + "loss": 0.1671, + "step": 3111 + }, + { + "epoch": 0.7011575182358408, + "grad_norm": 0.4338321327882662, + "learning_rate": 1.1242682119065217e-06, + "loss": 0.1576, + "step": 3112 + }, + { + "epoch": 0.7013828259216492, + "grad_norm": 0.4781751422153249, + "learning_rate": 1.1227131259512857e-06, + "loss": 0.1813, + "step": 3113 + }, + { + "epoch": 0.7016081336074577, + "grad_norm": 0.4792975036852251, + "learning_rate": 1.121158804752113e-06, + "loss": 0.1781, + "step": 3114 + }, + { + "epoch": 0.7018334412932661, + "grad_norm": 0.4730756090642387, + "learning_rate": 1.119605249172062e-06, + "loss": 0.172, + "step": 3115 + }, + { + "epoch": 0.7020587489790745, + "grad_norm": 0.46507303652363785, + "learning_rate": 1.1180524600737624e-06, + "loss": 0.1663, + "step": 3116 + }, + { + "epoch": 0.702284056664883, + "grad_norm": 0.46062877683070563, + "learning_rate": 1.1165004383194218e-06, + "loss": 0.1743, + "step": 3117 + }, + { + "epoch": 0.7025093643506914, + "grad_norm": 0.46750166201748955, + "learning_rate": 1.1149491847708186e-06, + "loss": 0.164, + "step": 3118 + }, + { + "epoch": 0.7027346720364999, + "grad_norm": 0.4375762817297168, + "learning_rate": 1.1133987002893062e-06, + "loss": 0.1566, + "step": 3119 + }, + { + "epoch": 0.7029599797223083, + "grad_norm": 0.4406399091260563, + "learning_rate": 1.1118489857358129e-06, + "loss": 0.1628, + "step": 3120 + }, + { + "epoch": 0.7031852874081167, + "grad_norm": 0.4564990636133812, + "learning_rate": 1.1103000419708347e-06, + "loss": 0.1589, + "step": 3121 + }, + { + "epoch": 0.7034105950939251, + "grad_norm": 0.45532469623701693, + "learning_rate": 1.1087518698544444e-06, + "loss": 0.1641, + "step": 3122 + }, + { + "epoch": 0.7036359027797335, + "grad_norm": 0.44734410650026724, + "learning_rate": 1.1072044702462825e-06, + "loss": 0.179, + "step": 3123 + }, + { + "epoch": 0.703861210465542, + "grad_norm": 0.46493205582831404, + "learning_rate": 1.1056578440055631e-06, + "loss": 0.1744, + "step": 3124 + }, + { + "epoch": 0.7040865181513505, + "grad_norm": 0.4651953881447342, + "learning_rate": 1.1041119919910715e-06, + "loss": 0.1684, + "step": 3125 + }, + { + "epoch": 0.7043118258371589, + "grad_norm": 0.4507073066491429, + "learning_rate": 1.1025669150611594e-06, + "loss": 0.1717, + "step": 3126 + }, + { + "epoch": 0.7045371335229673, + "grad_norm": 0.46275713975440214, + "learning_rate": 1.101022614073752e-06, + "loss": 0.1639, + "step": 3127 + }, + { + "epoch": 0.7047624412087757, + "grad_norm": 0.4489673694014636, + "learning_rate": 1.0994790898863409e-06, + "loss": 0.1715, + "step": 3128 + }, + { + "epoch": 0.7049877488945842, + "grad_norm": 0.4587380987145532, + "learning_rate": 1.0979363433559892e-06, + "loss": 0.1589, + "step": 3129 + }, + { + "epoch": 0.7052130565803926, + "grad_norm": 0.4378067774255354, + "learning_rate": 1.0963943753393252e-06, + "loss": 0.1578, + "step": 3130 + }, + { + "epoch": 0.705438364266201, + "grad_norm": 0.43780712530149507, + "learning_rate": 1.094853186692546e-06, + "loss": 0.1532, + "step": 3131 + }, + { + "epoch": 0.7056636719520094, + "grad_norm": 0.4146130880415866, + "learning_rate": 1.0933127782714175e-06, + "loss": 0.1451, + "step": 3132 + }, + { + "epoch": 0.7058889796378179, + "grad_norm": 0.45362190818779047, + "learning_rate": 1.0917731509312696e-06, + "loss": 0.1636, + "step": 3133 + }, + { + "epoch": 0.7061142873236264, + "grad_norm": 0.4769075523148852, + "learning_rate": 1.0902343055270006e-06, + "loss": 0.1738, + "step": 3134 + }, + { + "epoch": 0.7063395950094348, + "grad_norm": 0.4277087614229393, + "learning_rate": 1.0886962429130754e-06, + "loss": 0.1569, + "step": 3135 + }, + { + "epoch": 0.7065649026952432, + "grad_norm": 0.4539210764809445, + "learning_rate": 1.0871589639435204e-06, + "loss": 0.1705, + "step": 3136 + }, + { + "epoch": 0.7067902103810516, + "grad_norm": 0.4605133448025996, + "learning_rate": 1.0856224694719313e-06, + "loss": 0.1682, + "step": 3137 + }, + { + "epoch": 0.70701551806686, + "grad_norm": 0.501251946330184, + "learning_rate": 1.0840867603514648e-06, + "loss": 0.1887, + "step": 3138 + }, + { + "epoch": 0.7072408257526684, + "grad_norm": 0.46898492867310515, + "learning_rate": 1.0825518374348442e-06, + "loss": 0.1853, + "step": 3139 + }, + { + "epoch": 0.707466133438477, + "grad_norm": 0.5130467467527561, + "learning_rate": 1.0810177015743536e-06, + "loss": 0.1612, + "step": 3140 + }, + { + "epoch": 0.7076914411242854, + "grad_norm": 0.48124537721033855, + "learning_rate": 1.079484353621842e-06, + "loss": 0.1662, + "step": 3141 + }, + { + "epoch": 0.7079167488100938, + "grad_norm": 0.46856761779782596, + "learning_rate": 1.0779517944287216e-06, + "loss": 0.1638, + "step": 3142 + }, + { + "epoch": 0.7081420564959022, + "grad_norm": 0.4179454718542911, + "learning_rate": 1.0764200248459633e-06, + "loss": 0.1437, + "step": 3143 + }, + { + "epoch": 0.7083673641817106, + "grad_norm": 0.4306162576332252, + "learning_rate": 1.0748890457241037e-06, + "loss": 0.1648, + "step": 3144 + }, + { + "epoch": 0.7085926718675191, + "grad_norm": 0.46919444260066906, + "learning_rate": 1.0733588579132365e-06, + "loss": 0.1713, + "step": 3145 + }, + { + "epoch": 0.7088179795533275, + "grad_norm": 0.43781908164035904, + "learning_rate": 1.0718294622630188e-06, + "loss": 0.1685, + "step": 3146 + }, + { + "epoch": 0.7090432872391359, + "grad_norm": 0.432576882987967, + "learning_rate": 1.0703008596226692e-06, + "loss": 0.1513, + "step": 3147 + }, + { + "epoch": 0.7092685949249444, + "grad_norm": 0.4481424064759406, + "learning_rate": 1.0687730508409594e-06, + "loss": 0.1797, + "step": 3148 + }, + { + "epoch": 0.7094939026107528, + "grad_norm": 0.44120600266491966, + "learning_rate": 1.0672460367662271e-06, + "loss": 0.1629, + "step": 3149 + }, + { + "epoch": 0.7097192102965613, + "grad_norm": 0.4322094215649472, + "learning_rate": 1.065719818246367e-06, + "loss": 0.1638, + "step": 3150 + }, + { + "epoch": 0.7099445179823697, + "grad_norm": 0.4956307645473907, + "learning_rate": 1.0641943961288298e-06, + "loss": 0.1742, + "step": 3151 + }, + { + "epoch": 0.7101698256681781, + "grad_norm": 0.4601322528572896, + "learning_rate": 1.062669771260627e-06, + "loss": 0.1767, + "step": 3152 + }, + { + "epoch": 0.7103951333539865, + "grad_norm": 0.45404772920785397, + "learning_rate": 1.0611459444883243e-06, + "loss": 0.1655, + "step": 3153 + }, + { + "epoch": 0.7106204410397949, + "grad_norm": 0.43564411050695134, + "learning_rate": 1.0596229166580477e-06, + "loss": 0.1659, + "step": 3154 + }, + { + "epoch": 0.7108457487256034, + "grad_norm": 0.44477324477423064, + "learning_rate": 1.0581006886154758e-06, + "loss": 0.1594, + "step": 3155 + }, + { + "epoch": 0.7110710564114119, + "grad_norm": 0.4505919285897323, + "learning_rate": 1.0565792612058462e-06, + "loss": 0.1669, + "step": 3156 + }, + { + "epoch": 0.7112963640972203, + "grad_norm": 0.44171325571528386, + "learning_rate": 1.0550586352739519e-06, + "loss": 0.1637, + "step": 3157 + }, + { + "epoch": 0.7115216717830287, + "grad_norm": 0.4149164370243133, + "learning_rate": 1.0535388116641376e-06, + "loss": 0.1548, + "step": 3158 + }, + { + "epoch": 0.7117469794688371, + "grad_norm": 0.4421894767987525, + "learning_rate": 1.0520197912203067e-06, + "loss": 0.1397, + "step": 3159 + }, + { + "epoch": 0.7119722871546456, + "grad_norm": 0.47087828162625306, + "learning_rate": 1.050501574785913e-06, + "loss": 0.179, + "step": 3160 + }, + { + "epoch": 0.712197594840454, + "grad_norm": 0.4696239257383317, + "learning_rate": 1.048984163203967e-06, + "loss": 0.1709, + "step": 3161 + }, + { + "epoch": 0.7124229025262624, + "grad_norm": 0.4859177360863819, + "learning_rate": 1.0474675573170293e-06, + "loss": 0.1663, + "step": 3162 + }, + { + "epoch": 0.7126482102120708, + "grad_norm": 0.44631395931804135, + "learning_rate": 1.045951757967215e-06, + "loss": 0.1719, + "step": 3163 + }, + { + "epoch": 0.7128735178978793, + "grad_norm": 0.47280730847158214, + "learning_rate": 1.0444367659961927e-06, + "loss": 0.1759, + "step": 3164 + }, + { + "epoch": 0.7130988255836878, + "grad_norm": 0.4354369191167459, + "learning_rate": 1.0429225822451792e-06, + "loss": 0.1638, + "step": 3165 + }, + { + "epoch": 0.7133241332694962, + "grad_norm": 0.486153799724593, + "learning_rate": 1.041409207554944e-06, + "loss": 0.1672, + "step": 3166 + }, + { + "epoch": 0.7135494409553046, + "grad_norm": 0.46433213124143724, + "learning_rate": 1.0398966427658091e-06, + "loss": 0.1699, + "step": 3167 + }, + { + "epoch": 0.713774748641113, + "grad_norm": 0.4509630259537625, + "learning_rate": 1.0383848887176437e-06, + "loss": 0.1711, + "step": 3168 + }, + { + "epoch": 0.7140000563269214, + "grad_norm": 0.4550356490613283, + "learning_rate": 1.0368739462498704e-06, + "loss": 0.1587, + "step": 3169 + }, + { + "epoch": 0.7142253640127298, + "grad_norm": 0.4636825328746946, + "learning_rate": 1.035363816201457e-06, + "loss": 0.1589, + "step": 3170 + }, + { + "epoch": 0.7144506716985383, + "grad_norm": 0.4654077571334755, + "learning_rate": 1.033854499410924e-06, + "loss": 0.1615, + "step": 3171 + }, + { + "epoch": 0.7146759793843468, + "grad_norm": 0.4235256859511598, + "learning_rate": 1.032345996716339e-06, + "loss": 0.1504, + "step": 3172 + }, + { + "epoch": 0.7149012870701552, + "grad_norm": 0.42387771215288067, + "learning_rate": 1.030838308955316e-06, + "loss": 0.15, + "step": 3173 + }, + { + "epoch": 0.7151265947559636, + "grad_norm": 0.48619516945464647, + "learning_rate": 1.0293314369650193e-06, + "loss": 0.1851, + "step": 3174 + }, + { + "epoch": 0.715351902441772, + "grad_norm": 0.3995951015220582, + "learning_rate": 1.027825381582157e-06, + "loss": 0.1357, + "step": 3175 + }, + { + "epoch": 0.7155772101275805, + "grad_norm": 0.4541216241290027, + "learning_rate": 1.0263201436429873e-06, + "loss": 0.1707, + "step": 3176 + }, + { + "epoch": 0.7158025178133889, + "grad_norm": 0.42958213575349363, + "learning_rate": 1.0248157239833111e-06, + "loss": 0.1526, + "step": 3177 + }, + { + "epoch": 0.7160278254991973, + "grad_norm": 0.4597097086240284, + "learning_rate": 1.0233121234384777e-06, + "loss": 0.1682, + "step": 3178 + }, + { + "epoch": 0.7162531331850057, + "grad_norm": 0.44494734257605095, + "learning_rate": 1.0218093428433807e-06, + "loss": 0.1682, + "step": 3179 + }, + { + "epoch": 0.7164784408708142, + "grad_norm": 0.46555939206427527, + "learning_rate": 1.0203073830324566e-06, + "loss": 0.175, + "step": 3180 + }, + { + "epoch": 0.7167037485566227, + "grad_norm": 0.45240458388750293, + "learning_rate": 1.0188062448396897e-06, + "loss": 0.165, + "step": 3181 + }, + { + "epoch": 0.7169290562424311, + "grad_norm": 0.46618165335136097, + "learning_rate": 1.0173059290986048e-06, + "loss": 0.156, + "step": 3182 + }, + { + "epoch": 0.7171543639282395, + "grad_norm": 0.4810502783521092, + "learning_rate": 1.015806436642271e-06, + "loss": 0.1741, + "step": 3183 + }, + { + "epoch": 0.7173796716140479, + "grad_norm": 0.4563263532363381, + "learning_rate": 1.0143077683033017e-06, + "loss": 0.166, + "step": 3184 + }, + { + "epoch": 0.7176049792998563, + "grad_norm": 0.45446676343758696, + "learning_rate": 1.0128099249138502e-06, + "loss": 0.1794, + "step": 3185 + }, + { + "epoch": 0.7178302869856648, + "grad_norm": 0.4583128626448037, + "learning_rate": 1.0113129073056149e-06, + "loss": 0.1799, + "step": 3186 + }, + { + "epoch": 0.7180555946714733, + "grad_norm": 0.456650576737451, + "learning_rate": 1.0098167163098319e-06, + "loss": 0.1738, + "step": 3187 + }, + { + "epoch": 0.7182809023572817, + "grad_norm": 0.47145916613199307, + "learning_rate": 1.008321352757281e-06, + "loss": 0.1728, + "step": 3188 + }, + { + "epoch": 0.7185062100430901, + "grad_norm": 0.4696972682485299, + "learning_rate": 1.0068268174782833e-06, + "loss": 0.1854, + "step": 3189 + }, + { + "epoch": 0.7187315177288985, + "grad_norm": 0.4650197558673947, + "learning_rate": 1.0053331113026962e-06, + "loss": 0.1674, + "step": 3190 + }, + { + "epoch": 0.718956825414707, + "grad_norm": 0.42180727860963796, + "learning_rate": 1.0038402350599214e-06, + "loss": 0.1593, + "step": 3191 + }, + { + "epoch": 0.7191821331005154, + "grad_norm": 0.501673986366076, + "learning_rate": 1.002348189578895e-06, + "loss": 0.1774, + "step": 3192 + }, + { + "epoch": 0.7194074407863238, + "grad_norm": 0.45715617850116697, + "learning_rate": 1.0008569756880956e-06, + "loss": 0.1588, + "step": 3193 + }, + { + "epoch": 0.7196327484721322, + "grad_norm": 0.4290907736663843, + "learning_rate": 9.993665942155395e-07, + "loss": 0.1583, + "step": 3194 + }, + { + "epoch": 0.7198580561579407, + "grad_norm": 0.4572787908084985, + "learning_rate": 9.978770459887778e-07, + "loss": 0.1645, + "step": 3195 + }, + { + "epoch": 0.7200833638437492, + "grad_norm": 0.491605490158168, + "learning_rate": 9.963883318349039e-07, + "loss": 0.1822, + "step": 3196 + }, + { + "epoch": 0.7203086715295576, + "grad_norm": 0.4297109396113438, + "learning_rate": 9.949004525805423e-07, + "loss": 0.1548, + "step": 3197 + }, + { + "epoch": 0.720533979215366, + "grad_norm": 0.3987139721734384, + "learning_rate": 9.934134090518593e-07, + "loss": 0.1361, + "step": 3198 + }, + { + "epoch": 0.7207592869011744, + "grad_norm": 0.4767023196058793, + "learning_rate": 9.919272020745529e-07, + "loss": 0.1675, + "step": 3199 + }, + { + "epoch": 0.7209845945869828, + "grad_norm": 0.43314857595280776, + "learning_rate": 9.904418324738605e-07, + "loss": 0.1602, + "step": 3200 + }, + { + "epoch": 0.7212099022727912, + "grad_norm": 0.44436148650394325, + "learning_rate": 9.889573010745507e-07, + "loss": 0.1706, + "step": 3201 + }, + { + "epoch": 0.7214352099585997, + "grad_norm": 0.4552714617082466, + "learning_rate": 9.874736087009285e-07, + "loss": 0.1629, + "step": 3202 + }, + { + "epoch": 0.7216605176444082, + "grad_norm": 0.4444100774567107, + "learning_rate": 9.859907561768335e-07, + "loss": 0.1664, + "step": 3203 + }, + { + "epoch": 0.7218858253302166, + "grad_norm": 0.4331176248377337, + "learning_rate": 9.84508744325639e-07, + "loss": 0.1529, + "step": 3204 + }, + { + "epoch": 0.722111133016025, + "grad_norm": 0.4067322739150992, + "learning_rate": 9.830275739702497e-07, + "loss": 0.1514, + "step": 3205 + }, + { + "epoch": 0.7223364407018334, + "grad_norm": 0.47464048336058434, + "learning_rate": 9.815472459331061e-07, + "loss": 0.1635, + "step": 3206 + }, + { + "epoch": 0.7225617483876419, + "grad_norm": 0.43758552273107326, + "learning_rate": 9.800677610361768e-07, + "loss": 0.1633, + "step": 3207 + }, + { + "epoch": 0.7227870560734503, + "grad_norm": 0.43275007013964417, + "learning_rate": 9.785891201009667e-07, + "loss": 0.1509, + "step": 3208 + }, + { + "epoch": 0.7230123637592587, + "grad_norm": 0.4753747099403738, + "learning_rate": 9.771113239485084e-07, + "loss": 0.1792, + "step": 3209 + }, + { + "epoch": 0.7232376714450671, + "grad_norm": 0.47024372725967206, + "learning_rate": 9.756343733993679e-07, + "loss": 0.1795, + "step": 3210 + }, + { + "epoch": 0.7234629791308756, + "grad_norm": 0.4536093362424985, + "learning_rate": 9.741582692736412e-07, + "loss": 0.1592, + "step": 3211 + }, + { + "epoch": 0.7236882868166841, + "grad_norm": 0.500526834715798, + "learning_rate": 9.726830123909527e-07, + "loss": 0.1719, + "step": 3212 + }, + { + "epoch": 0.7239135945024925, + "grad_norm": 0.45412321537404954, + "learning_rate": 9.71208603570459e-07, + "loss": 0.1658, + "step": 3213 + }, + { + "epoch": 0.7241389021883009, + "grad_norm": 0.44749016579960055, + "learning_rate": 9.697350436308428e-07, + "loss": 0.1665, + "step": 3214 + }, + { + "epoch": 0.7243642098741093, + "grad_norm": 0.43671994832700867, + "learning_rate": 9.68262333390318e-07, + "loss": 0.1505, + "step": 3215 + }, + { + "epoch": 0.7245895175599177, + "grad_norm": 0.4932293635625036, + "learning_rate": 9.667904736666258e-07, + "loss": 0.1843, + "step": 3216 + }, + { + "epoch": 0.7248148252457262, + "grad_norm": 0.45671807670225806, + "learning_rate": 9.653194652770343e-07, + "loss": 0.1603, + "step": 3217 + }, + { + "epoch": 0.7250401329315346, + "grad_norm": 0.43293902547079444, + "learning_rate": 9.638493090383408e-07, + "loss": 0.1546, + "step": 3218 + }, + { + "epoch": 0.7252654406173431, + "grad_norm": 0.45625612241527813, + "learning_rate": 9.623800057668675e-07, + "loss": 0.1665, + "step": 3219 + }, + { + "epoch": 0.7254907483031515, + "grad_norm": 0.45398733742240105, + "learning_rate": 9.60911556278463e-07, + "loss": 0.1645, + "step": 3220 + }, + { + "epoch": 0.7257160559889599, + "grad_norm": 0.46868149071989584, + "learning_rate": 9.594439613885044e-07, + "loss": 0.1628, + "step": 3221 + }, + { + "epoch": 0.7259413636747684, + "grad_norm": 0.4384631244294843, + "learning_rate": 9.579772219118899e-07, + "loss": 0.1526, + "step": 3222 + }, + { + "epoch": 0.7261666713605768, + "grad_norm": 0.4538230473118285, + "learning_rate": 9.565113386630482e-07, + "loss": 0.1616, + "step": 3223 + }, + { + "epoch": 0.7263919790463852, + "grad_norm": 0.4789101208246133, + "learning_rate": 9.550463124559267e-07, + "loss": 0.1689, + "step": 3224 + }, + { + "epoch": 0.7266172867321936, + "grad_norm": 0.47898674419840476, + "learning_rate": 9.535821441040017e-07, + "loss": 0.1791, + "step": 3225 + }, + { + "epoch": 0.726842594418002, + "grad_norm": 0.4772847438724727, + "learning_rate": 9.521188344202717e-07, + "loss": 0.1768, + "step": 3226 + }, + { + "epoch": 0.7270679021038106, + "grad_norm": 0.44362217745345384, + "learning_rate": 9.506563842172565e-07, + "loss": 0.1605, + "step": 3227 + }, + { + "epoch": 0.727293209789619, + "grad_norm": 0.46015435457720194, + "learning_rate": 9.491947943070015e-07, + "loss": 0.1657, + "step": 3228 + }, + { + "epoch": 0.7275185174754274, + "grad_norm": 0.4411231053798028, + "learning_rate": 9.477340655010717e-07, + "loss": 0.1625, + "step": 3229 + }, + { + "epoch": 0.7277438251612358, + "grad_norm": 0.5010532165461705, + "learning_rate": 9.462741986105573e-07, + "loss": 0.1787, + "step": 3230 + }, + { + "epoch": 0.7279691328470442, + "grad_norm": 0.48752685286780356, + "learning_rate": 9.448151944460657e-07, + "loss": 0.1669, + "step": 3231 + }, + { + "epoch": 0.7281944405328526, + "grad_norm": 0.41300065094803884, + "learning_rate": 9.433570538177289e-07, + "loss": 0.1507, + "step": 3232 + }, + { + "epoch": 0.7284197482186611, + "grad_norm": 0.46549256661712346, + "learning_rate": 9.418997775351985e-07, + "loss": 0.1813, + "step": 3233 + }, + { + "epoch": 0.7286450559044696, + "grad_norm": 0.48708012862292693, + "learning_rate": 9.404433664076442e-07, + "loss": 0.1677, + "step": 3234 + }, + { + "epoch": 0.728870363590278, + "grad_norm": 0.47386241704350274, + "learning_rate": 9.389878212437586e-07, + "loss": 0.152, + "step": 3235 + }, + { + "epoch": 0.7290956712760864, + "grad_norm": 0.48482776424099006, + "learning_rate": 9.375331428517506e-07, + "loss": 0.1647, + "step": 3236 + }, + { + "epoch": 0.7293209789618949, + "grad_norm": 0.4849064907730006, + "learning_rate": 9.360793320393483e-07, + "loss": 0.1715, + "step": 3237 + }, + { + "epoch": 0.7295462866477033, + "grad_norm": 0.4603867848789201, + "learning_rate": 9.346263896138e-07, + "loss": 0.174, + "step": 3238 + }, + { + "epoch": 0.7297715943335117, + "grad_norm": 0.4765852349576271, + "learning_rate": 9.33174316381869e-07, + "loss": 0.1784, + "step": 3239 + }, + { + "epoch": 0.7299969020193201, + "grad_norm": 0.45876250686499204, + "learning_rate": 9.317231131498383e-07, + "loss": 0.1608, + "step": 3240 + }, + { + "epoch": 0.7302222097051285, + "grad_norm": 0.4665679325838626, + "learning_rate": 9.302727807235079e-07, + "loss": 0.1845, + "step": 3241 + }, + { + "epoch": 0.730447517390937, + "grad_norm": 0.45668494612252986, + "learning_rate": 9.288233199081914e-07, + "loss": 0.1828, + "step": 3242 + }, + { + "epoch": 0.7306728250767455, + "grad_norm": 0.4870783104956319, + "learning_rate": 9.273747315087223e-07, + "loss": 0.1831, + "step": 3243 + }, + { + "epoch": 0.7308981327625539, + "grad_norm": 0.45609927079483825, + "learning_rate": 9.259270163294457e-07, + "loss": 0.179, + "step": 3244 + }, + { + "epoch": 0.7311234404483623, + "grad_norm": 0.4182234541652945, + "learning_rate": 9.244801751742258e-07, + "loss": 0.1521, + "step": 3245 + }, + { + "epoch": 0.7313487481341707, + "grad_norm": 0.47185640909917215, + "learning_rate": 9.230342088464381e-07, + "loss": 0.1692, + "step": 3246 + }, + { + "epoch": 0.7315740558199791, + "grad_norm": 0.484269734341363, + "learning_rate": 9.215891181489742e-07, + "loss": 0.1822, + "step": 3247 + }, + { + "epoch": 0.7317993635057876, + "grad_norm": 0.4364978205690923, + "learning_rate": 9.201449038842403e-07, + "loss": 0.1708, + "step": 3248 + }, + { + "epoch": 0.732024671191596, + "grad_norm": 0.4295064628584712, + "learning_rate": 9.187015668541526e-07, + "loss": 0.1706, + "step": 3249 + }, + { + "epoch": 0.7322499788774045, + "grad_norm": 0.4510897341392053, + "learning_rate": 9.172591078601448e-07, + "loss": 0.1758, + "step": 3250 + }, + { + "epoch": 0.7324752865632129, + "grad_norm": 0.46392844709469566, + "learning_rate": 9.158175277031584e-07, + "loss": 0.1752, + "step": 3251 + }, + { + "epoch": 0.7327005942490213, + "grad_norm": 0.45270062163216557, + "learning_rate": 9.143768271836506e-07, + "loss": 0.1748, + "step": 3252 + }, + { + "epoch": 0.7329259019348298, + "grad_norm": 0.44312613780170557, + "learning_rate": 9.129370071015886e-07, + "loss": 0.1595, + "step": 3253 + }, + { + "epoch": 0.7331512096206382, + "grad_norm": 0.41083984837081294, + "learning_rate": 9.114980682564492e-07, + "loss": 0.1428, + "step": 3254 + }, + { + "epoch": 0.7333765173064466, + "grad_norm": 0.46955978439808177, + "learning_rate": 9.100600114472238e-07, + "loss": 0.1743, + "step": 3255 + }, + { + "epoch": 0.733601824992255, + "grad_norm": 0.4804170379213079, + "learning_rate": 9.086228374724096e-07, + "loss": 0.1752, + "step": 3256 + }, + { + "epoch": 0.7338271326780634, + "grad_norm": 0.43827033805694543, + "learning_rate": 9.071865471300168e-07, + "loss": 0.1743, + "step": 3257 + }, + { + "epoch": 0.734052440363872, + "grad_norm": 0.4475936035355731, + "learning_rate": 9.057511412175646e-07, + "loss": 0.1634, + "step": 3258 + }, + { + "epoch": 0.7342777480496804, + "grad_norm": 0.4303249897725664, + "learning_rate": 9.043166205320789e-07, + "loss": 0.1552, + "step": 3259 + }, + { + "epoch": 0.7345030557354888, + "grad_norm": 0.44919641986869174, + "learning_rate": 9.028829858700974e-07, + "loss": 0.163, + "step": 3260 + }, + { + "epoch": 0.7347283634212972, + "grad_norm": 0.4536351853683652, + "learning_rate": 9.014502380276619e-07, + "loss": 0.1608, + "step": 3261 + }, + { + "epoch": 0.7349536711071056, + "grad_norm": 0.42989362873910825, + "learning_rate": 9.000183778003246e-07, + "loss": 0.1589, + "step": 3262 + }, + { + "epoch": 0.735178978792914, + "grad_norm": 0.46680041264289657, + "learning_rate": 8.985874059831456e-07, + "loss": 0.1682, + "step": 3263 + }, + { + "epoch": 0.7354042864787225, + "grad_norm": 0.4565199562942905, + "learning_rate": 8.971573233706881e-07, + "loss": 0.1726, + "step": 3264 + }, + { + "epoch": 0.7356295941645309, + "grad_norm": 0.44498243371037294, + "learning_rate": 8.957281307570254e-07, + "loss": 0.1633, + "step": 3265 + }, + { + "epoch": 0.7358549018503394, + "grad_norm": 0.45496449863839766, + "learning_rate": 8.942998289357333e-07, + "loss": 0.1591, + "step": 3266 + }, + { + "epoch": 0.7360802095361478, + "grad_norm": 0.4065154604913283, + "learning_rate": 8.928724186998961e-07, + "loss": 0.1508, + "step": 3267 + }, + { + "epoch": 0.7363055172219563, + "grad_norm": 0.4881853500877691, + "learning_rate": 8.914459008421e-07, + "loss": 0.1855, + "step": 3268 + }, + { + "epoch": 0.7365308249077647, + "grad_norm": 0.4585416071009008, + "learning_rate": 8.900202761544377e-07, + "loss": 0.1702, + "step": 3269 + }, + { + "epoch": 0.7367561325935731, + "grad_norm": 0.4519487301690307, + "learning_rate": 8.885955454285078e-07, + "loss": 0.1642, + "step": 3270 + }, + { + "epoch": 0.7369814402793815, + "grad_norm": 0.4475006728345198, + "learning_rate": 8.871717094554058e-07, + "loss": 0.1674, + "step": 3271 + }, + { + "epoch": 0.7372067479651899, + "grad_norm": 0.4198176664818188, + "learning_rate": 8.857487690257374e-07, + "loss": 0.1463, + "step": 3272 + }, + { + "epoch": 0.7374320556509983, + "grad_norm": 0.47141798428734527, + "learning_rate": 8.843267249296086e-07, + "loss": 0.1698, + "step": 3273 + }, + { + "epoch": 0.7376573633368069, + "grad_norm": 0.4749064815120668, + "learning_rate": 8.829055779566262e-07, + "loss": 0.1776, + "step": 3274 + }, + { + "epoch": 0.7378826710226153, + "grad_norm": 0.4260090267562525, + "learning_rate": 8.814853288959016e-07, + "loss": 0.1649, + "step": 3275 + }, + { + "epoch": 0.7381079787084237, + "grad_norm": 0.43292580510590073, + "learning_rate": 8.800659785360444e-07, + "loss": 0.1694, + "step": 3276 + }, + { + "epoch": 0.7383332863942321, + "grad_norm": 0.447440882656025, + "learning_rate": 8.786475276651688e-07, + "loss": 0.1586, + "step": 3277 + }, + { + "epoch": 0.7385585940800405, + "grad_norm": 0.4422069604185099, + "learning_rate": 8.772299770708859e-07, + "loss": 0.1437, + "step": 3278 + }, + { + "epoch": 0.738783901765849, + "grad_norm": 0.45031111384028055, + "learning_rate": 8.758133275403097e-07, + "loss": 0.1728, + "step": 3279 + }, + { + "epoch": 0.7390092094516574, + "grad_norm": 0.43814458010729157, + "learning_rate": 8.743975798600535e-07, + "loss": 0.1757, + "step": 3280 + }, + { + "epoch": 0.7392345171374658, + "grad_norm": 0.46831393190451565, + "learning_rate": 8.729827348162278e-07, + "loss": 0.1772, + "step": 3281 + }, + { + "epoch": 0.7394598248232743, + "grad_norm": 0.44962948201430775, + "learning_rate": 8.71568793194445e-07, + "loss": 0.1688, + "step": 3282 + }, + { + "epoch": 0.7396851325090827, + "grad_norm": 0.42446638922539254, + "learning_rate": 8.701557557798121e-07, + "loss": 0.1499, + "step": 3283 + }, + { + "epoch": 0.7399104401948912, + "grad_norm": 0.4793613966136212, + "learning_rate": 8.687436233569375e-07, + "loss": 0.1877, + "step": 3284 + }, + { + "epoch": 0.7401357478806996, + "grad_norm": 0.43102571842210313, + "learning_rate": 8.673323967099259e-07, + "loss": 0.1601, + "step": 3285 + }, + { + "epoch": 0.740361055566508, + "grad_norm": 0.47531567600137753, + "learning_rate": 8.659220766223778e-07, + "loss": 0.1728, + "step": 3286 + }, + { + "epoch": 0.7405863632523164, + "grad_norm": 0.4593349630016216, + "learning_rate": 8.645126638773926e-07, + "loss": 0.1811, + "step": 3287 + }, + { + "epoch": 0.7408116709381248, + "grad_norm": 0.43694017290443865, + "learning_rate": 8.631041592575643e-07, + "loss": 0.1564, + "step": 3288 + }, + { + "epoch": 0.7410369786239334, + "grad_norm": 0.4490940913307379, + "learning_rate": 8.616965635449814e-07, + "loss": 0.1793, + "step": 3289 + }, + { + "epoch": 0.7412622863097418, + "grad_norm": 0.43418112023902317, + "learning_rate": 8.602898775212317e-07, + "loss": 0.1534, + "step": 3290 + }, + { + "epoch": 0.7414875939955502, + "grad_norm": 0.4869778827045245, + "learning_rate": 8.588841019673938e-07, + "loss": 0.1717, + "step": 3291 + }, + { + "epoch": 0.7417129016813586, + "grad_norm": 0.43626749580846413, + "learning_rate": 8.57479237664044e-07, + "loss": 0.1612, + "step": 3292 + }, + { + "epoch": 0.741938209367167, + "grad_norm": 0.4624574313604066, + "learning_rate": 8.560752853912494e-07, + "loss": 0.164, + "step": 3293 + }, + { + "epoch": 0.7421635170529755, + "grad_norm": 0.4648530708481148, + "learning_rate": 8.546722459285727e-07, + "loss": 0.1721, + "step": 3294 + }, + { + "epoch": 0.7423888247387839, + "grad_norm": 0.4769062743766037, + "learning_rate": 8.532701200550714e-07, + "loss": 0.1732, + "step": 3295 + }, + { + "epoch": 0.7426141324245923, + "grad_norm": 0.44602823998061436, + "learning_rate": 8.518689085492909e-07, + "loss": 0.1654, + "step": 3296 + }, + { + "epoch": 0.7428394401104008, + "grad_norm": 0.43318025266571686, + "learning_rate": 8.504686121892741e-07, + "loss": 0.1743, + "step": 3297 + }, + { + "epoch": 0.7430647477962092, + "grad_norm": 0.4517623471927079, + "learning_rate": 8.490692317525514e-07, + "loss": 0.1631, + "step": 3298 + }, + { + "epoch": 0.7432900554820177, + "grad_norm": 0.44768604722124056, + "learning_rate": 8.476707680161486e-07, + "loss": 0.1655, + "step": 3299 + }, + { + "epoch": 0.7435153631678261, + "grad_norm": 0.4251779488195503, + "learning_rate": 8.462732217565783e-07, + "loss": 0.1578, + "step": 3300 + }, + { + "epoch": 0.7437406708536345, + "grad_norm": 0.45018155512884506, + "learning_rate": 8.448765937498471e-07, + "loss": 0.1721, + "step": 3301 + }, + { + "epoch": 0.7439659785394429, + "grad_norm": 0.47226598318932717, + "learning_rate": 8.434808847714512e-07, + "loss": 0.166, + "step": 3302 + }, + { + "epoch": 0.7441912862252513, + "grad_norm": 0.4939844470868201, + "learning_rate": 8.420860955963739e-07, + "loss": 0.1711, + "step": 3303 + }, + { + "epoch": 0.7444165939110597, + "grad_norm": 0.4729424553050723, + "learning_rate": 8.406922269990917e-07, + "loss": 0.1839, + "step": 3304 + }, + { + "epoch": 0.7446419015968683, + "grad_norm": 0.4515243703838667, + "learning_rate": 8.392992797535666e-07, + "loss": 0.1721, + "step": 3305 + }, + { + "epoch": 0.7448672092826767, + "grad_norm": 0.46565436918451203, + "learning_rate": 8.379072546332498e-07, + "loss": 0.1534, + "step": 3306 + }, + { + "epoch": 0.7450925169684851, + "grad_norm": 0.44870233055282444, + "learning_rate": 8.365161524110823e-07, + "loss": 0.1638, + "step": 3307 + }, + { + "epoch": 0.7453178246542935, + "grad_norm": 0.43923121511525803, + "learning_rate": 8.351259738594902e-07, + "loss": 0.1685, + "step": 3308 + }, + { + "epoch": 0.7455431323401019, + "grad_norm": 0.44657960627790266, + "learning_rate": 8.337367197503881e-07, + "loss": 0.1607, + "step": 3309 + }, + { + "epoch": 0.7457684400259104, + "grad_norm": 0.4717523831531228, + "learning_rate": 8.323483908551783e-07, + "loss": 0.1691, + "step": 3310 + }, + { + "epoch": 0.7459937477117188, + "grad_norm": 0.44615620541114376, + "learning_rate": 8.309609879447458e-07, + "loss": 0.1665, + "step": 3311 + }, + { + "epoch": 0.7462190553975272, + "grad_norm": 0.42230148328704603, + "learning_rate": 8.29574511789466e-07, + "loss": 0.1577, + "step": 3312 + }, + { + "epoch": 0.7464443630833357, + "grad_norm": 0.4577603134494316, + "learning_rate": 8.281889631591955e-07, + "loss": 0.1759, + "step": 3313 + }, + { + "epoch": 0.7466696707691441, + "grad_norm": 0.4425973026992569, + "learning_rate": 8.268043428232798e-07, + "loss": 0.1832, + "step": 3314 + }, + { + "epoch": 0.7468949784549526, + "grad_norm": 0.43235493042537204, + "learning_rate": 8.254206515505444e-07, + "loss": 0.1673, + "step": 3315 + }, + { + "epoch": 0.747120286140761, + "grad_norm": 0.4343196300635854, + "learning_rate": 8.240378901093035e-07, + "loss": 0.1508, + "step": 3316 + }, + { + "epoch": 0.7473455938265694, + "grad_norm": 0.4382195678752772, + "learning_rate": 8.22656059267353e-07, + "loss": 0.156, + "step": 3317 + }, + { + "epoch": 0.7475709015123778, + "grad_norm": 0.451813648822175, + "learning_rate": 8.212751597919708e-07, + "loss": 0.1655, + "step": 3318 + }, + { + "epoch": 0.7477962091981862, + "grad_norm": 0.44063982607969504, + "learning_rate": 8.198951924499202e-07, + "loss": 0.1673, + "step": 3319 + }, + { + "epoch": 0.7480215168839947, + "grad_norm": 0.43105682861923805, + "learning_rate": 8.185161580074444e-07, + "loss": 0.1593, + "step": 3320 + }, + { + "epoch": 0.7482468245698032, + "grad_norm": 0.4399731063904187, + "learning_rate": 8.171380572302712e-07, + "loss": 0.1589, + "step": 3321 + }, + { + "epoch": 0.7484721322556116, + "grad_norm": 0.45745149097874355, + "learning_rate": 8.157608908836071e-07, + "loss": 0.1744, + "step": 3322 + }, + { + "epoch": 0.74869743994142, + "grad_norm": 0.4286006780945182, + "learning_rate": 8.143846597321425e-07, + "loss": 0.1543, + "step": 3323 + }, + { + "epoch": 0.7489227476272284, + "grad_norm": 0.4814203668155584, + "learning_rate": 8.130093645400469e-07, + "loss": 0.1914, + "step": 3324 + }, + { + "epoch": 0.7491480553130369, + "grad_norm": 0.4606302966264785, + "learning_rate": 8.116350060709696e-07, + "loss": 0.1668, + "step": 3325 + }, + { + "epoch": 0.7493733629988453, + "grad_norm": 0.41157086911239055, + "learning_rate": 8.102615850880413e-07, + "loss": 0.1445, + "step": 3326 + }, + { + "epoch": 0.7495986706846537, + "grad_norm": 0.4266286615928688, + "learning_rate": 8.088891023538722e-07, + "loss": 0.1462, + "step": 3327 + }, + { + "epoch": 0.7498239783704621, + "grad_norm": 0.48536869365251284, + "learning_rate": 8.075175586305492e-07, + "loss": 0.1901, + "step": 3328 + }, + { + "epoch": 0.7500492860562706, + "grad_norm": 0.46988361520887173, + "learning_rate": 8.061469546796413e-07, + "loss": 0.1843, + "step": 3329 + }, + { + "epoch": 0.750274593742079, + "grad_norm": 0.45831848626964056, + "learning_rate": 8.047772912621921e-07, + "loss": 0.1625, + "step": 3330 + }, + { + "epoch": 0.7504999014278875, + "grad_norm": 0.4587920648570918, + "learning_rate": 8.034085691387253e-07, + "loss": 0.1846, + "step": 3331 + }, + { + "epoch": 0.7507252091136959, + "grad_norm": 0.40956765878757406, + "learning_rate": 8.020407890692419e-07, + "loss": 0.1488, + "step": 3332 + }, + { + "epoch": 0.7509505167995043, + "grad_norm": 0.4353758370206993, + "learning_rate": 8.006739518132179e-07, + "loss": 0.1545, + "step": 3333 + }, + { + "epoch": 0.7511758244853127, + "grad_norm": 0.4641289816451457, + "learning_rate": 7.993080581296087e-07, + "loss": 0.1635, + "step": 3334 + }, + { + "epoch": 0.7514011321711211, + "grad_norm": 0.45907451191374415, + "learning_rate": 7.979431087768424e-07, + "loss": 0.1562, + "step": 3335 + }, + { + "epoch": 0.7516264398569297, + "grad_norm": 0.4457915942038961, + "learning_rate": 7.96579104512826e-07, + "loss": 0.1621, + "step": 3336 + }, + { + "epoch": 0.7518517475427381, + "grad_norm": 0.43971107382154495, + "learning_rate": 7.95216046094939e-07, + "loss": 0.1592, + "step": 3337 + }, + { + "epoch": 0.7520770552285465, + "grad_norm": 0.4462239492354098, + "learning_rate": 7.938539342800373e-07, + "loss": 0.1632, + "step": 3338 + }, + { + "epoch": 0.7523023629143549, + "grad_norm": 0.44266563372147927, + "learning_rate": 7.92492769824452e-07, + "loss": 0.1543, + "step": 3339 + }, + { + "epoch": 0.7525276706001633, + "grad_norm": 0.4538005113036504, + "learning_rate": 7.911325534839851e-07, + "loss": 0.1601, + "step": 3340 + }, + { + "epoch": 0.7527529782859718, + "grad_norm": 0.43826491001549833, + "learning_rate": 7.897732860139157e-07, + "loss": 0.1558, + "step": 3341 + }, + { + "epoch": 0.7529782859717802, + "grad_norm": 0.4391438690707823, + "learning_rate": 7.884149681689937e-07, + "loss": 0.1518, + "step": 3342 + }, + { + "epoch": 0.7532035936575886, + "grad_norm": 0.44789690655792086, + "learning_rate": 7.870576007034414e-07, + "loss": 0.1598, + "step": 3343 + }, + { + "epoch": 0.7534289013433971, + "grad_norm": 0.45809496549962353, + "learning_rate": 7.857011843709559e-07, + "loss": 0.1681, + "step": 3344 + }, + { + "epoch": 0.7536542090292055, + "grad_norm": 0.47917916131401783, + "learning_rate": 7.843457199247034e-07, + "loss": 0.1792, + "step": 3345 + }, + { + "epoch": 0.753879516715014, + "grad_norm": 0.4575299892920548, + "learning_rate": 7.829912081173238e-07, + "loss": 0.162, + "step": 3346 + }, + { + "epoch": 0.7541048244008224, + "grad_norm": 0.433469207187306, + "learning_rate": 7.816376497009262e-07, + "loss": 0.1518, + "step": 3347 + }, + { + "epoch": 0.7543301320866308, + "grad_norm": 0.4529236494125318, + "learning_rate": 7.802850454270913e-07, + "loss": 0.1644, + "step": 3348 + }, + { + "epoch": 0.7545554397724392, + "grad_norm": 0.4425954708456862, + "learning_rate": 7.789333960468707e-07, + "loss": 0.1647, + "step": 3349 + }, + { + "epoch": 0.7547807474582476, + "grad_norm": 0.44602716256213215, + "learning_rate": 7.775827023107835e-07, + "loss": 0.169, + "step": 3350 + }, + { + "epoch": 0.755006055144056, + "grad_norm": 0.42697143797391696, + "learning_rate": 7.762329649688214e-07, + "loss": 0.1491, + "step": 3351 + }, + { + "epoch": 0.7552313628298646, + "grad_norm": 0.45036119941096736, + "learning_rate": 7.74884184770441e-07, + "loss": 0.1471, + "step": 3352 + }, + { + "epoch": 0.755456670515673, + "grad_norm": 0.44629647098950553, + "learning_rate": 7.735363624645712e-07, + "loss": 0.1672, + "step": 3353 + }, + { + "epoch": 0.7556819782014814, + "grad_norm": 0.41994337007086885, + "learning_rate": 7.721894987996076e-07, + "loss": 0.1575, + "step": 3354 + }, + { + "epoch": 0.7559072858872898, + "grad_norm": 0.47123478737917396, + "learning_rate": 7.708435945234124e-07, + "loss": 0.1682, + "step": 3355 + }, + { + "epoch": 0.7561325935730983, + "grad_norm": 0.4628498847767629, + "learning_rate": 7.694986503833171e-07, + "loss": 0.1729, + "step": 3356 + }, + { + "epoch": 0.7563579012589067, + "grad_norm": 0.5033313678408554, + "learning_rate": 7.681546671261181e-07, + "loss": 0.1893, + "step": 3357 + }, + { + "epoch": 0.7565832089447151, + "grad_norm": 0.47003588803055874, + "learning_rate": 7.668116454980804e-07, + "loss": 0.1639, + "step": 3358 + }, + { + "epoch": 0.7568085166305235, + "grad_norm": 0.4808729985229414, + "learning_rate": 7.654695862449327e-07, + "loss": 0.181, + "step": 3359 + }, + { + "epoch": 0.757033824316332, + "grad_norm": 0.4020553807594009, + "learning_rate": 7.641284901118703e-07, + "loss": 0.1429, + "step": 3360 + }, + { + "epoch": 0.7572591320021405, + "grad_norm": 0.4509209724967874, + "learning_rate": 7.627883578435555e-07, + "loss": 0.1709, + "step": 3361 + }, + { + "epoch": 0.7574844396879489, + "grad_norm": 0.439098278870818, + "learning_rate": 7.614491901841118e-07, + "loss": 0.1596, + "step": 3362 + }, + { + "epoch": 0.7577097473737573, + "grad_norm": 0.42676596141806666, + "learning_rate": 7.601109878771301e-07, + "loss": 0.1642, + "step": 3363 + }, + { + "epoch": 0.7579350550595657, + "grad_norm": 0.47749906430832806, + "learning_rate": 7.587737516656651e-07, + "loss": 0.1779, + "step": 3364 + }, + { + "epoch": 0.7581603627453741, + "grad_norm": 0.4495128898475257, + "learning_rate": 7.574374822922323e-07, + "loss": 0.1683, + "step": 3365 + }, + { + "epoch": 0.7583856704311825, + "grad_norm": 0.44148999465870903, + "learning_rate": 7.561021804988141e-07, + "loss": 0.1537, + "step": 3366 + }, + { + "epoch": 0.758610978116991, + "grad_norm": 0.452625726904043, + "learning_rate": 7.547678470268526e-07, + "loss": 0.1714, + "step": 3367 + }, + { + "epoch": 0.7588362858027995, + "grad_norm": 0.43101014296169676, + "learning_rate": 7.534344826172546e-07, + "loss": 0.1531, + "step": 3368 + }, + { + "epoch": 0.7590615934886079, + "grad_norm": 0.45293414403113275, + "learning_rate": 7.52102088010386e-07, + "loss": 0.1656, + "step": 3369 + }, + { + "epoch": 0.7592869011744163, + "grad_norm": 0.3856272321682941, + "learning_rate": 7.507706639460768e-07, + "loss": 0.1313, + "step": 3370 + }, + { + "epoch": 0.7595122088602247, + "grad_norm": 0.46391107818672217, + "learning_rate": 7.494402111636179e-07, + "loss": 0.1837, + "step": 3371 + }, + { + "epoch": 0.7597375165460332, + "grad_norm": 0.45052973556284776, + "learning_rate": 7.481107304017588e-07, + "loss": 0.1592, + "step": 3372 + }, + { + "epoch": 0.7599628242318416, + "grad_norm": 0.4451243974342673, + "learning_rate": 7.467822223987117e-07, + "loss": 0.1691, + "step": 3373 + }, + { + "epoch": 0.76018813191765, + "grad_norm": 0.4396472260650178, + "learning_rate": 7.454546878921465e-07, + "loss": 0.153, + "step": 3374 + }, + { + "epoch": 0.7604134396034584, + "grad_norm": 0.424450144634597, + "learning_rate": 7.441281276191939e-07, + "loss": 0.1458, + "step": 3375 + }, + { + "epoch": 0.760638747289267, + "grad_norm": 0.42101330410436405, + "learning_rate": 7.428025423164456e-07, + "loss": 0.1461, + "step": 3376 + }, + { + "epoch": 0.7608640549750754, + "grad_norm": 0.474498643632856, + "learning_rate": 7.414779327199461e-07, + "loss": 0.1782, + "step": 3377 + }, + { + "epoch": 0.7610893626608838, + "grad_norm": 0.4720959209909501, + "learning_rate": 7.401542995652033e-07, + "loss": 0.1703, + "step": 3378 + }, + { + "epoch": 0.7613146703466922, + "grad_norm": 0.4421899736684158, + "learning_rate": 7.388316435871825e-07, + "loss": 0.1604, + "step": 3379 + }, + { + "epoch": 0.7615399780325006, + "grad_norm": 0.4526668971264353, + "learning_rate": 7.375099655203033e-07, + "loss": 0.1553, + "step": 3380 + }, + { + "epoch": 0.761765285718309, + "grad_norm": 0.41956586774976173, + "learning_rate": 7.361892660984459e-07, + "loss": 0.1512, + "step": 3381 + }, + { + "epoch": 0.7619905934041175, + "grad_norm": 0.4168485014196386, + "learning_rate": 7.348695460549443e-07, + "loss": 0.1507, + "step": 3382 + }, + { + "epoch": 0.762215901089926, + "grad_norm": 0.46799513195528875, + "learning_rate": 7.335508061225907e-07, + "loss": 0.1734, + "step": 3383 + }, + { + "epoch": 0.7624412087757344, + "grad_norm": 0.4339593289248791, + "learning_rate": 7.322330470336314e-07, + "loss": 0.1694, + "step": 3384 + }, + { + "epoch": 0.7626665164615428, + "grad_norm": 0.4320748720289276, + "learning_rate": 7.309162695197692e-07, + "loss": 0.159, + "step": 3385 + }, + { + "epoch": 0.7628918241473512, + "grad_norm": 0.46326255234280456, + "learning_rate": 7.296004743121627e-07, + "loss": 0.1639, + "step": 3386 + }, + { + "epoch": 0.7631171318331597, + "grad_norm": 0.42691607139967114, + "learning_rate": 7.28285662141422e-07, + "loss": 0.1559, + "step": 3387 + }, + { + "epoch": 0.7633424395189681, + "grad_norm": 0.4780215824367993, + "learning_rate": 7.26971833737615e-07, + "loss": 0.1796, + "step": 3388 + }, + { + "epoch": 0.7635677472047765, + "grad_norm": 0.4529881530123205, + "learning_rate": 7.256589898302599e-07, + "loss": 0.1625, + "step": 3389 + }, + { + "epoch": 0.7637930548905849, + "grad_norm": 0.4313763379270615, + "learning_rate": 7.243471311483322e-07, + "loss": 0.154, + "step": 3390 + }, + { + "epoch": 0.7640183625763934, + "grad_norm": 0.430705475939594, + "learning_rate": 7.230362584202557e-07, + "loss": 0.1538, + "step": 3391 + }, + { + "epoch": 0.7642436702622019, + "grad_norm": 0.45647116517229996, + "learning_rate": 7.217263723739107e-07, + "loss": 0.1662, + "step": 3392 + }, + { + "epoch": 0.7644689779480103, + "grad_norm": 0.4141404952461146, + "learning_rate": 7.204174737366293e-07, + "loss": 0.153, + "step": 3393 + }, + { + "epoch": 0.7646942856338187, + "grad_norm": 0.44268182043390103, + "learning_rate": 7.191095632351908e-07, + "loss": 0.1636, + "step": 3394 + }, + { + "epoch": 0.7649195933196271, + "grad_norm": 0.4991653539305816, + "learning_rate": 7.178026415958311e-07, + "loss": 0.1734, + "step": 3395 + }, + { + "epoch": 0.7651449010054355, + "grad_norm": 0.48502941135515515, + "learning_rate": 7.164967095442357e-07, + "loss": 0.1941, + "step": 3396 + }, + { + "epoch": 0.765370208691244, + "grad_norm": 0.4548171603813897, + "learning_rate": 7.151917678055384e-07, + "loss": 0.1673, + "step": 3397 + }, + { + "epoch": 0.7655955163770524, + "grad_norm": 0.44242800388480935, + "learning_rate": 7.138878171043262e-07, + "loss": 0.1585, + "step": 3398 + }, + { + "epoch": 0.7658208240628609, + "grad_norm": 0.4702369813805908, + "learning_rate": 7.125848581646327e-07, + "loss": 0.1815, + "step": 3399 + }, + { + "epoch": 0.7660461317486693, + "grad_norm": 0.4459817800406519, + "learning_rate": 7.112828917099438e-07, + "loss": 0.1559, + "step": 3400 + }, + { + "epoch": 0.7662714394344777, + "grad_norm": 0.47155776314347125, + "learning_rate": 7.099819184631929e-07, + "loss": 0.1604, + "step": 3401 + }, + { + "epoch": 0.7664967471202861, + "grad_norm": 0.4442841116345462, + "learning_rate": 7.086819391467612e-07, + "loss": 0.1631, + "step": 3402 + }, + { + "epoch": 0.7667220548060946, + "grad_norm": 0.46623369423369443, + "learning_rate": 7.073829544824795e-07, + "loss": 0.1649, + "step": 3403 + }, + { + "epoch": 0.766947362491903, + "grad_norm": 0.43684824173002684, + "learning_rate": 7.060849651916244e-07, + "loss": 0.151, + "step": 3404 + }, + { + "epoch": 0.7671726701777114, + "grad_norm": 0.47000480422007973, + "learning_rate": 7.047879719949227e-07, + "loss": 0.1649, + "step": 3405 + }, + { + "epoch": 0.7673979778635198, + "grad_norm": 0.4288148937103212, + "learning_rate": 7.034919756125447e-07, + "loss": 0.1589, + "step": 3406 + }, + { + "epoch": 0.7676232855493283, + "grad_norm": 0.4586511385125181, + "learning_rate": 7.021969767641096e-07, + "loss": 0.1631, + "step": 3407 + }, + { + "epoch": 0.7678485932351368, + "grad_norm": 0.4840936475690084, + "learning_rate": 7.009029761686825e-07, + "loss": 0.1719, + "step": 3408 + }, + { + "epoch": 0.7680739009209452, + "grad_norm": 0.44945392700596776, + "learning_rate": 6.996099745447726e-07, + "loss": 0.1612, + "step": 3409 + }, + { + "epoch": 0.7682992086067536, + "grad_norm": 0.4823180145479983, + "learning_rate": 6.98317972610337e-07, + "loss": 0.1784, + "step": 3410 + }, + { + "epoch": 0.768524516292562, + "grad_norm": 0.4249117873479152, + "learning_rate": 6.970269710827754e-07, + "loss": 0.1398, + "step": 3411 + }, + { + "epoch": 0.7687498239783704, + "grad_norm": 0.4434079074922953, + "learning_rate": 6.957369706789319e-07, + "loss": 0.162, + "step": 3412 + }, + { + "epoch": 0.7689751316641789, + "grad_norm": 0.442407632608181, + "learning_rate": 6.944479721150971e-07, + "loss": 0.1568, + "step": 3413 + }, + { + "epoch": 0.7692004393499873, + "grad_norm": 0.4516848356456769, + "learning_rate": 6.931599761070027e-07, + "loss": 0.1748, + "step": 3414 + }, + { + "epoch": 0.7694257470357958, + "grad_norm": 0.45488938021503034, + "learning_rate": 6.91872983369826e-07, + "loss": 0.1745, + "step": 3415 + }, + { + "epoch": 0.7696510547216042, + "grad_norm": 0.44374361837708615, + "learning_rate": 6.905869946181848e-07, + "loss": 0.1706, + "step": 3416 + }, + { + "epoch": 0.7698763624074126, + "grad_norm": 0.43978996812767956, + "learning_rate": 6.893020105661416e-07, + "loss": 0.1609, + "step": 3417 + }, + { + "epoch": 0.7701016700932211, + "grad_norm": 0.4320759855757813, + "learning_rate": 6.880180319272006e-07, + "loss": 0.1588, + "step": 3418 + }, + { + "epoch": 0.7703269777790295, + "grad_norm": 0.45339348060028495, + "learning_rate": 6.867350594143058e-07, + "loss": 0.181, + "step": 3419 + }, + { + "epoch": 0.7705522854648379, + "grad_norm": 0.4587373088442844, + "learning_rate": 6.854530937398459e-07, + "loss": 0.1705, + "step": 3420 + }, + { + "epoch": 0.7707775931506463, + "grad_norm": 0.4473262497166802, + "learning_rate": 6.841721356156466e-07, + "loss": 0.1672, + "step": 3421 + }, + { + "epoch": 0.7710029008364547, + "grad_norm": 0.42553780878275643, + "learning_rate": 6.828921857529774e-07, + "loss": 0.1533, + "step": 3422 + }, + { + "epoch": 0.7712282085222633, + "grad_norm": 0.43608156938789194, + "learning_rate": 6.816132448625474e-07, + "loss": 0.1603, + "step": 3423 + }, + { + "epoch": 0.7714535162080717, + "grad_norm": 0.4507764698408686, + "learning_rate": 6.803353136545033e-07, + "loss": 0.1618, + "step": 3424 + }, + { + "epoch": 0.7716788238938801, + "grad_norm": 0.4399377395630173, + "learning_rate": 6.790583928384339e-07, + "loss": 0.169, + "step": 3425 + }, + { + "epoch": 0.7719041315796885, + "grad_norm": 0.473535880778693, + "learning_rate": 6.777824831233645e-07, + "loss": 0.1783, + "step": 3426 + }, + { + "epoch": 0.7721294392654969, + "grad_norm": 0.4532450669932579, + "learning_rate": 6.765075852177619e-07, + "loss": 0.1604, + "step": 3427 + }, + { + "epoch": 0.7723547469513053, + "grad_norm": 0.44768423798974444, + "learning_rate": 6.752336998295281e-07, + "loss": 0.1706, + "step": 3428 + }, + { + "epoch": 0.7725800546371138, + "grad_norm": 0.44244391534745875, + "learning_rate": 6.739608276660037e-07, + "loss": 0.1666, + "step": 3429 + }, + { + "epoch": 0.7728053623229223, + "grad_norm": 0.4523487054419559, + "learning_rate": 6.726889694339689e-07, + "loss": 0.1669, + "step": 3430 + }, + { + "epoch": 0.7730306700087307, + "grad_norm": 0.5085217798841944, + "learning_rate": 6.714181258396371e-07, + "loss": 0.1986, + "step": 3431 + }, + { + "epoch": 0.7732559776945391, + "grad_norm": 0.47229045875265024, + "learning_rate": 6.701482975886617e-07, + "loss": 0.1845, + "step": 3432 + }, + { + "epoch": 0.7734812853803475, + "grad_norm": 0.484909005547143, + "learning_rate": 6.688794853861316e-07, + "loss": 0.1771, + "step": 3433 + }, + { + "epoch": 0.773706593066156, + "grad_norm": 0.44693596519695084, + "learning_rate": 6.676116899365692e-07, + "loss": 0.1672, + "step": 3434 + }, + { + "epoch": 0.7739319007519644, + "grad_norm": 0.44034600380700484, + "learning_rate": 6.663449119439358e-07, + "loss": 0.1526, + "step": 3435 + }, + { + "epoch": 0.7741572084377728, + "grad_norm": 0.4356962276449663, + "learning_rate": 6.650791521116243e-07, + "loss": 0.1499, + "step": 3436 + }, + { + "epoch": 0.7743825161235812, + "grad_norm": 0.4426936323969574, + "learning_rate": 6.638144111424655e-07, + "loss": 0.1673, + "step": 3437 + }, + { + "epoch": 0.7746078238093898, + "grad_norm": 0.4512032297177184, + "learning_rate": 6.625506897387215e-07, + "loss": 0.1501, + "step": 3438 + }, + { + "epoch": 0.7748331314951982, + "grad_norm": 0.4658577544355102, + "learning_rate": 6.612879886020907e-07, + "loss": 0.1731, + "step": 3439 + }, + { + "epoch": 0.7750584391810066, + "grad_norm": 0.4506544833705105, + "learning_rate": 6.600263084337041e-07, + "loss": 0.1598, + "step": 3440 + }, + { + "epoch": 0.775283746866815, + "grad_norm": 0.45645554030714147, + "learning_rate": 6.587656499341247e-07, + "loss": 0.17, + "step": 3441 + }, + { + "epoch": 0.7755090545526234, + "grad_norm": 0.46075726763874536, + "learning_rate": 6.575060138033504e-07, + "loss": 0.1569, + "step": 3442 + }, + { + "epoch": 0.7757343622384318, + "grad_norm": 0.4462873371182239, + "learning_rate": 6.562474007408087e-07, + "loss": 0.1592, + "step": 3443 + }, + { + "epoch": 0.7759596699242403, + "grad_norm": 0.4228985500382316, + "learning_rate": 6.549898114453615e-07, + "loss": 0.1548, + "step": 3444 + }, + { + "epoch": 0.7761849776100487, + "grad_norm": 0.44206000369615256, + "learning_rate": 6.537332466153018e-07, + "loss": 0.1654, + "step": 3445 + }, + { + "epoch": 0.7764102852958572, + "grad_norm": 0.4391785110422063, + "learning_rate": 6.524777069483526e-07, + "loss": 0.166, + "step": 3446 + }, + { + "epoch": 0.7766355929816656, + "grad_norm": 0.45789639572674373, + "learning_rate": 6.512231931416674e-07, + "loss": 0.1841, + "step": 3447 + }, + { + "epoch": 0.776860900667474, + "grad_norm": 0.46517868683505453, + "learning_rate": 6.499697058918326e-07, + "loss": 0.1609, + "step": 3448 + }, + { + "epoch": 0.7770862083532825, + "grad_norm": 0.4443333854575121, + "learning_rate": 6.487172458948612e-07, + "loss": 0.1678, + "step": 3449 + }, + { + "epoch": 0.7773115160390909, + "grad_norm": 0.4635329723739391, + "learning_rate": 6.474658138461992e-07, + "loss": 0.1687, + "step": 3450 + }, + { + "epoch": 0.7775368237248993, + "grad_norm": 0.45563305194616577, + "learning_rate": 6.462154104407187e-07, + "loss": 0.1639, + "step": 3451 + }, + { + "epoch": 0.7777621314107077, + "grad_norm": 0.4280091304525378, + "learning_rate": 6.449660363727236e-07, + "loss": 0.1504, + "step": 3452 + }, + { + "epoch": 0.7779874390965161, + "grad_norm": 0.4356084723621205, + "learning_rate": 6.437176923359434e-07, + "loss": 0.1545, + "step": 3453 + }, + { + "epoch": 0.7782127467823247, + "grad_norm": 0.480001231565102, + "learning_rate": 6.424703790235374e-07, + "loss": 0.165, + "step": 3454 + }, + { + "epoch": 0.7784380544681331, + "grad_norm": 0.444313095357135, + "learning_rate": 6.41224097128093e-07, + "loss": 0.1633, + "step": 3455 + }, + { + "epoch": 0.7786633621539415, + "grad_norm": 0.49343742962456955, + "learning_rate": 6.399788473416229e-07, + "loss": 0.1882, + "step": 3456 + }, + { + "epoch": 0.7788886698397499, + "grad_norm": 0.4940556907472232, + "learning_rate": 6.387346303555691e-07, + "loss": 0.1683, + "step": 3457 + }, + { + "epoch": 0.7791139775255583, + "grad_norm": 0.4801521313149023, + "learning_rate": 6.374914468607976e-07, + "loss": 0.1717, + "step": 3458 + }, + { + "epoch": 0.7793392852113668, + "grad_norm": 0.44736873763651075, + "learning_rate": 6.362492975476033e-07, + "loss": 0.1765, + "step": 3459 + }, + { + "epoch": 0.7795645928971752, + "grad_norm": 0.46436139464236653, + "learning_rate": 6.35008183105704e-07, + "loss": 0.1763, + "step": 3460 + }, + { + "epoch": 0.7797899005829836, + "grad_norm": 0.4350505157674905, + "learning_rate": 6.337681042242447e-07, + "loss": 0.1543, + "step": 3461 + }, + { + "epoch": 0.7800152082687921, + "grad_norm": 0.4371641550620368, + "learning_rate": 6.325290615917961e-07, + "loss": 0.1608, + "step": 3462 + }, + { + "epoch": 0.7802405159546005, + "grad_norm": 0.4711435681266765, + "learning_rate": 6.312910558963505e-07, + "loss": 0.1761, + "step": 3463 + }, + { + "epoch": 0.780465823640409, + "grad_norm": 0.4510911367382343, + "learning_rate": 6.300540878253286e-07, + "loss": 0.1752, + "step": 3464 + }, + { + "epoch": 0.7806911313262174, + "grad_norm": 0.43591800900366834, + "learning_rate": 6.288181580655709e-07, + "loss": 0.1579, + "step": 3465 + }, + { + "epoch": 0.7809164390120258, + "grad_norm": 0.45904439381403594, + "learning_rate": 6.27583267303343e-07, + "loss": 0.1648, + "step": 3466 + }, + { + "epoch": 0.7811417466978342, + "grad_norm": 0.4563015918485367, + "learning_rate": 6.263494162243352e-07, + "loss": 0.1737, + "step": 3467 + }, + { + "epoch": 0.7813670543836426, + "grad_norm": 0.4740883114313893, + "learning_rate": 6.251166055136573e-07, + "loss": 0.1799, + "step": 3468 + }, + { + "epoch": 0.781592362069451, + "grad_norm": 0.4202067355571497, + "learning_rate": 6.238848358558439e-07, + "loss": 0.1536, + "step": 3469 + }, + { + "epoch": 0.7818176697552596, + "grad_norm": 0.44442022603070214, + "learning_rate": 6.226541079348517e-07, + "loss": 0.1672, + "step": 3470 + }, + { + "epoch": 0.782042977441068, + "grad_norm": 0.4577283398279841, + "learning_rate": 6.214244224340563e-07, + "loss": 0.1562, + "step": 3471 + }, + { + "epoch": 0.7822682851268764, + "grad_norm": 0.46400945555797785, + "learning_rate": 6.201957800362579e-07, + "loss": 0.1612, + "step": 3472 + }, + { + "epoch": 0.7824935928126848, + "grad_norm": 0.4530020161512245, + "learning_rate": 6.189681814236742e-07, + "loss": 0.1652, + "step": 3473 + }, + { + "epoch": 0.7827189004984932, + "grad_norm": 0.4789063403825609, + "learning_rate": 6.177416272779468e-07, + "loss": 0.1772, + "step": 3474 + }, + { + "epoch": 0.7829442081843017, + "grad_norm": 0.42922091353529745, + "learning_rate": 6.165161182801336e-07, + "loss": 0.1549, + "step": 3475 + }, + { + "epoch": 0.7831695158701101, + "grad_norm": 0.4730192044163614, + "learning_rate": 6.152916551107149e-07, + "loss": 0.1689, + "step": 3476 + }, + { + "epoch": 0.7833948235559185, + "grad_norm": 0.45248555109948896, + "learning_rate": 6.140682384495902e-07, + "loss": 0.1641, + "step": 3477 + }, + { + "epoch": 0.783620131241727, + "grad_norm": 0.46311255556141256, + "learning_rate": 6.12845868976076e-07, + "loss": 0.174, + "step": 3478 + }, + { + "epoch": 0.7838454389275354, + "grad_norm": 0.4205167754323703, + "learning_rate": 6.116245473689094e-07, + "loss": 0.157, + "step": 3479 + }, + { + "epoch": 0.7840707466133439, + "grad_norm": 0.4584501509671639, + "learning_rate": 6.104042743062439e-07, + "loss": 0.175, + "step": 3480 + }, + { + "epoch": 0.7842960542991523, + "grad_norm": 0.46521740166015524, + "learning_rate": 6.091850504656527e-07, + "loss": 0.166, + "step": 3481 + }, + { + "epoch": 0.7845213619849607, + "grad_norm": 0.50521755059122, + "learning_rate": 6.079668765241248e-07, + "loss": 0.1461, + "step": 3482 + }, + { + "epoch": 0.7847466696707691, + "grad_norm": 0.4878924144190757, + "learning_rate": 6.06749753158066e-07, + "loss": 0.1779, + "step": 3483 + }, + { + "epoch": 0.7849719773565775, + "grad_norm": 0.44983158905344167, + "learning_rate": 6.05533681043301e-07, + "loss": 0.1727, + "step": 3484 + }, + { + "epoch": 0.7851972850423861, + "grad_norm": 0.4487704368478451, + "learning_rate": 6.04318660855068e-07, + "loss": 0.1529, + "step": 3485 + }, + { + "epoch": 0.7854225927281945, + "grad_norm": 0.4572179530385381, + "learning_rate": 6.031046932680229e-07, + "loss": 0.1655, + "step": 3486 + }, + { + "epoch": 0.7856479004140029, + "grad_norm": 0.48630217569436524, + "learning_rate": 6.018917789562372e-07, + "loss": 0.1782, + "step": 3487 + }, + { + "epoch": 0.7858732080998113, + "grad_norm": 0.4660748476486515, + "learning_rate": 6.006799185931964e-07, + "loss": 0.1656, + "step": 3488 + }, + { + "epoch": 0.7860985157856197, + "grad_norm": 0.4752841392546566, + "learning_rate": 5.994691128518019e-07, + "loss": 0.1664, + "step": 3489 + }, + { + "epoch": 0.7863238234714282, + "grad_norm": 0.49698557010012784, + "learning_rate": 5.982593624043682e-07, + "loss": 0.1746, + "step": 3490 + }, + { + "epoch": 0.7865491311572366, + "grad_norm": 0.45007588954346855, + "learning_rate": 5.970506679226249e-07, + "loss": 0.1677, + "step": 3491 + }, + { + "epoch": 0.786774438843045, + "grad_norm": 0.47053959731364914, + "learning_rate": 5.958430300777157e-07, + "loss": 0.1678, + "step": 3492 + }, + { + "epoch": 0.7869997465288535, + "grad_norm": 0.43222138736389226, + "learning_rate": 5.94636449540196e-07, + "loss": 0.1646, + "step": 3493 + }, + { + "epoch": 0.7872250542146619, + "grad_norm": 0.4433485957921805, + "learning_rate": 5.934309269800359e-07, + "loss": 0.1716, + "step": 3494 + }, + { + "epoch": 0.7874503619004704, + "grad_norm": 0.4386131982263449, + "learning_rate": 5.922264630666161e-07, + "loss": 0.1582, + "step": 3495 + }, + { + "epoch": 0.7876756695862788, + "grad_norm": 0.43966420744945794, + "learning_rate": 5.910230584687316e-07, + "loss": 0.1659, + "step": 3496 + }, + { + "epoch": 0.7879009772720872, + "grad_norm": 0.451478217033209, + "learning_rate": 5.898207138545867e-07, + "loss": 0.1658, + "step": 3497 + }, + { + "epoch": 0.7881262849578956, + "grad_norm": 0.4674455130665875, + "learning_rate": 5.886194298917994e-07, + "loss": 0.1621, + "step": 3498 + }, + { + "epoch": 0.788351592643704, + "grad_norm": 0.44672464067661444, + "learning_rate": 5.874192072473995e-07, + "loss": 0.1722, + "step": 3499 + }, + { + "epoch": 0.7885769003295124, + "grad_norm": 0.44133265328105187, + "learning_rate": 5.862200465878228e-07, + "loss": 0.1655, + "step": 3500 + }, + { + "epoch": 0.7885769003295124, + "eval_loss": 0.16525281965732574, + "eval_runtime": 57.0178, + "eval_samples_per_second": 50.335, + "eval_steps_per_second": 6.296, + "step": 3500 + }, + { + "epoch": 0.788802208015321, + "grad_norm": 0.447694054382524, + "learning_rate": 5.850219485789199e-07, + "loss": 0.1739, + "step": 3501 + }, + { + "epoch": 0.7890275157011294, + "grad_norm": 0.44495853704619837, + "learning_rate": 5.838249138859509e-07, + "loss": 0.1688, + "step": 3502 + }, + { + "epoch": 0.7892528233869378, + "grad_norm": 0.46242254593518434, + "learning_rate": 5.826289431735832e-07, + "loss": 0.175, + "step": 3503 + }, + { + "epoch": 0.7894781310727462, + "grad_norm": 0.4589387834123284, + "learning_rate": 5.814340371058957e-07, + "loss": 0.1567, + "step": 3504 + }, + { + "epoch": 0.7897034387585546, + "grad_norm": 0.4644099584463894, + "learning_rate": 5.802401963463741e-07, + "loss": 0.1574, + "step": 3505 + }, + { + "epoch": 0.7899287464443631, + "grad_norm": 0.4776091224547653, + "learning_rate": 5.79047421557915e-07, + "loss": 0.1828, + "step": 3506 + }, + { + "epoch": 0.7901540541301715, + "grad_norm": 0.4597830365599348, + "learning_rate": 5.778557134028207e-07, + "loss": 0.1699, + "step": 3507 + }, + { + "epoch": 0.7903793618159799, + "grad_norm": 0.4266669078273944, + "learning_rate": 5.766650725428027e-07, + "loss": 0.1644, + "step": 3508 + }, + { + "epoch": 0.7906046695017884, + "grad_norm": 0.4142625068618784, + "learning_rate": 5.754754996389799e-07, + "loss": 0.1427, + "step": 3509 + }, + { + "epoch": 0.7908299771875968, + "grad_norm": 0.48550598576656795, + "learning_rate": 5.742869953518773e-07, + "loss": 0.186, + "step": 3510 + }, + { + "epoch": 0.7910552848734053, + "grad_norm": 0.4393223136267881, + "learning_rate": 5.730995603414274e-07, + "loss": 0.1651, + "step": 3511 + }, + { + "epoch": 0.7912805925592137, + "grad_norm": 0.4551477533302511, + "learning_rate": 5.719131952669679e-07, + "loss": 0.1777, + "step": 3512 + }, + { + "epoch": 0.7915059002450221, + "grad_norm": 0.4875271494827229, + "learning_rate": 5.707279007872435e-07, + "loss": 0.1669, + "step": 3513 + }, + { + "epoch": 0.7917312079308305, + "grad_norm": 0.4467682807092302, + "learning_rate": 5.695436775604049e-07, + "loss": 0.167, + "step": 3514 + }, + { + "epoch": 0.7919565156166389, + "grad_norm": 0.443479403484354, + "learning_rate": 5.683605262440056e-07, + "loss": 0.1684, + "step": 3515 + }, + { + "epoch": 0.7921818233024474, + "grad_norm": 0.41353146405669616, + "learning_rate": 5.671784474950068e-07, + "loss": 0.1518, + "step": 3516 + }, + { + "epoch": 0.7924071309882559, + "grad_norm": 0.4440311844659535, + "learning_rate": 5.659974419697723e-07, + "loss": 0.1692, + "step": 3517 + }, + { + "epoch": 0.7926324386740643, + "grad_norm": 0.43680539296399284, + "learning_rate": 5.648175103240694e-07, + "loss": 0.1637, + "step": 3518 + }, + { + "epoch": 0.7928577463598727, + "grad_norm": 0.46436309603686127, + "learning_rate": 5.636386532130717e-07, + "loss": 0.1774, + "step": 3519 + }, + { + "epoch": 0.7930830540456811, + "grad_norm": 0.4698061822970405, + "learning_rate": 5.624608712913531e-07, + "loss": 0.1623, + "step": 3520 + }, + { + "epoch": 0.7933083617314896, + "grad_norm": 0.47881350753490637, + "learning_rate": 5.612841652128939e-07, + "loss": 0.1805, + "step": 3521 + }, + { + "epoch": 0.793533669417298, + "grad_norm": 0.42722516952006817, + "learning_rate": 5.601085356310734e-07, + "loss": 0.1475, + "step": 3522 + }, + { + "epoch": 0.7937589771031064, + "grad_norm": 0.44396663337430564, + "learning_rate": 5.589339831986754e-07, + "loss": 0.1782, + "step": 3523 + }, + { + "epoch": 0.7939842847889148, + "grad_norm": 0.4811573776409562, + "learning_rate": 5.577605085678858e-07, + "loss": 0.1759, + "step": 3524 + }, + { + "epoch": 0.7942095924747233, + "grad_norm": 0.4674939702729354, + "learning_rate": 5.565881123902903e-07, + "loss": 0.1563, + "step": 3525 + }, + { + "epoch": 0.7944349001605318, + "grad_norm": 0.44258273857971364, + "learning_rate": 5.554167953168779e-07, + "loss": 0.162, + "step": 3526 + }, + { + "epoch": 0.7946602078463402, + "grad_norm": 0.4754183788667168, + "learning_rate": 5.542465579980361e-07, + "loss": 0.1729, + "step": 3527 + }, + { + "epoch": 0.7948855155321486, + "grad_norm": 0.45988817906970114, + "learning_rate": 5.530774010835552e-07, + "loss": 0.1725, + "step": 3528 + }, + { + "epoch": 0.795110823217957, + "grad_norm": 0.42055382108984185, + "learning_rate": 5.519093252226232e-07, + "loss": 0.1515, + "step": 3529 + }, + { + "epoch": 0.7953361309037654, + "grad_norm": 0.44906840173568024, + "learning_rate": 5.507423310638299e-07, + "loss": 0.166, + "step": 3530 + }, + { + "epoch": 0.7955614385895738, + "grad_norm": 0.4592011927665144, + "learning_rate": 5.495764192551642e-07, + "loss": 0.1645, + "step": 3531 + }, + { + "epoch": 0.7957867462753824, + "grad_norm": 0.45303143337523033, + "learning_rate": 5.48411590444012e-07, + "loss": 0.1744, + "step": 3532 + }, + { + "epoch": 0.7960120539611908, + "grad_norm": 0.45707866993162155, + "learning_rate": 5.47247845277161e-07, + "loss": 0.1784, + "step": 3533 + }, + { + "epoch": 0.7962373616469992, + "grad_norm": 0.4395294435873491, + "learning_rate": 5.460851844007945e-07, + "loss": 0.1759, + "step": 3534 + }, + { + "epoch": 0.7964626693328076, + "grad_norm": 0.4004141795434895, + "learning_rate": 5.449236084604942e-07, + "loss": 0.1315, + "step": 3535 + }, + { + "epoch": 0.796687977018616, + "grad_norm": 0.4565651119909482, + "learning_rate": 5.437631181012415e-07, + "loss": 0.1671, + "step": 3536 + }, + { + "epoch": 0.7969132847044245, + "grad_norm": 0.45340675264382463, + "learning_rate": 5.426037139674117e-07, + "loss": 0.1546, + "step": 3537 + }, + { + "epoch": 0.7971385923902329, + "grad_norm": 0.46860565528864334, + "learning_rate": 5.414453967027797e-07, + "loss": 0.1746, + "step": 3538 + }, + { + "epoch": 0.7973639000760413, + "grad_norm": 0.4768445920475158, + "learning_rate": 5.402881669505164e-07, + "loss": 0.1827, + "step": 3539 + }, + { + "epoch": 0.7975892077618498, + "grad_norm": 0.4496685989317005, + "learning_rate": 5.391320253531868e-07, + "loss": 0.1612, + "step": 3540 + }, + { + "epoch": 0.7978145154476582, + "grad_norm": 0.46237901292154426, + "learning_rate": 5.37976972552755e-07, + "loss": 0.1718, + "step": 3541 + }, + { + "epoch": 0.7980398231334667, + "grad_norm": 0.4309874048269584, + "learning_rate": 5.368230091905774e-07, + "loss": 0.1601, + "step": 3542 + }, + { + "epoch": 0.7982651308192751, + "grad_norm": 0.44396702245670083, + "learning_rate": 5.356701359074076e-07, + "loss": 0.1614, + "step": 3543 + }, + { + "epoch": 0.7984904385050835, + "grad_norm": 0.44576656717226, + "learning_rate": 5.345183533433926e-07, + "loss": 0.162, + "step": 3544 + }, + { + "epoch": 0.7987157461908919, + "grad_norm": 0.4437774514766101, + "learning_rate": 5.333676621380746e-07, + "loss": 0.1517, + "step": 3545 + }, + { + "epoch": 0.7989410538767003, + "grad_norm": 0.6054027748075538, + "learning_rate": 5.322180629303902e-07, + "loss": 0.1515, + "step": 3546 + }, + { + "epoch": 0.7991663615625088, + "grad_norm": 0.4216291595441519, + "learning_rate": 5.310695563586676e-07, + "loss": 0.153, + "step": 3547 + }, + { + "epoch": 0.7993916692483173, + "grad_norm": 0.46846840438467846, + "learning_rate": 5.299221430606313e-07, + "loss": 0.1712, + "step": 3548 + }, + { + "epoch": 0.7996169769341257, + "grad_norm": 0.46237370825992474, + "learning_rate": 5.287758236733956e-07, + "loss": 0.1632, + "step": 3549 + }, + { + "epoch": 0.7998422846199341, + "grad_norm": 0.4102005246516085, + "learning_rate": 5.276305988334701e-07, + "loss": 0.1449, + "step": 3550 + }, + { + "epoch": 0.8000675923057425, + "grad_norm": 0.4467298151974584, + "learning_rate": 5.264864691767551e-07, + "loss": 0.1561, + "step": 3551 + }, + { + "epoch": 0.800292899991551, + "grad_norm": 0.48108678764911383, + "learning_rate": 5.253434353385422e-07, + "loss": 0.1724, + "step": 3552 + }, + { + "epoch": 0.8005182076773594, + "grad_norm": 0.4538836760702192, + "learning_rate": 5.242014979535173e-07, + "loss": 0.1619, + "step": 3553 + }, + { + "epoch": 0.8007435153631678, + "grad_norm": 0.42904226450671384, + "learning_rate": 5.23060657655754e-07, + "loss": 0.1456, + "step": 3554 + }, + { + "epoch": 0.8009688230489762, + "grad_norm": 0.4261983342973197, + "learning_rate": 5.219209150787189e-07, + "loss": 0.1557, + "step": 3555 + }, + { + "epoch": 0.8011941307347847, + "grad_norm": 0.43129374630293527, + "learning_rate": 5.207822708552695e-07, + "loss": 0.158, + "step": 3556 + }, + { + "epoch": 0.8014194384205932, + "grad_norm": 0.43056827338592485, + "learning_rate": 5.196447256176509e-07, + "loss": 0.1545, + "step": 3557 + }, + { + "epoch": 0.8016447461064016, + "grad_norm": 0.4200607220958471, + "learning_rate": 5.185082799975013e-07, + "loss": 0.1563, + "step": 3558 + }, + { + "epoch": 0.80187005379221, + "grad_norm": 0.4680398736801693, + "learning_rate": 5.173729346258452e-07, + "loss": 0.1691, + "step": 3559 + }, + { + "epoch": 0.8020953614780184, + "grad_norm": 0.4459072773615186, + "learning_rate": 5.162386901330977e-07, + "loss": 0.1666, + "step": 3560 + }, + { + "epoch": 0.8023206691638268, + "grad_norm": 0.4345233407604955, + "learning_rate": 5.151055471490638e-07, + "loss": 0.1659, + "step": 3561 + }, + { + "epoch": 0.8025459768496352, + "grad_norm": 0.41233799619004785, + "learning_rate": 5.139735063029338e-07, + "loss": 0.1471, + "step": 3562 + }, + { + "epoch": 0.8027712845354437, + "grad_norm": 0.4408692449471384, + "learning_rate": 5.128425682232893e-07, + "loss": 0.16, + "step": 3563 + }, + { + "epoch": 0.8029965922212522, + "grad_norm": 0.44317555637180256, + "learning_rate": 5.117127335380967e-07, + "loss": 0.163, + "step": 3564 + }, + { + "epoch": 0.8032218999070606, + "grad_norm": 0.4355353184142801, + "learning_rate": 5.105840028747125e-07, + "loss": 0.1545, + "step": 3565 + }, + { + "epoch": 0.803447207592869, + "grad_norm": 0.4060373221716526, + "learning_rate": 5.094563768598773e-07, + "loss": 0.1421, + "step": 3566 + }, + { + "epoch": 0.8036725152786774, + "grad_norm": 0.4404421914611089, + "learning_rate": 5.083298561197205e-07, + "loss": 0.1551, + "step": 3567 + }, + { + "epoch": 0.8038978229644859, + "grad_norm": 0.42113294164684933, + "learning_rate": 5.07204441279758e-07, + "loss": 0.1481, + "step": 3568 + }, + { + "epoch": 0.8041231306502943, + "grad_norm": 0.4436654953326095, + "learning_rate": 5.060801329648896e-07, + "loss": 0.1535, + "step": 3569 + }, + { + "epoch": 0.8043484383361027, + "grad_norm": 0.437605877342742, + "learning_rate": 5.049569317994013e-07, + "loss": 0.1576, + "step": 3570 + }, + { + "epoch": 0.8045737460219111, + "grad_norm": 0.438630829710333, + "learning_rate": 5.038348384069663e-07, + "loss": 0.1721, + "step": 3571 + }, + { + "epoch": 0.8047990537077196, + "grad_norm": 0.4225382998116328, + "learning_rate": 5.027138534106399e-07, + "loss": 0.148, + "step": 3572 + }, + { + "epoch": 0.8050243613935281, + "grad_norm": 0.44328942138378036, + "learning_rate": 5.015939774328643e-07, + "loss": 0.1561, + "step": 3573 + }, + { + "epoch": 0.8052496690793365, + "grad_norm": 0.4311513913467441, + "learning_rate": 5.004752110954642e-07, + "loss": 0.1604, + "step": 3574 + }, + { + "epoch": 0.8054749767651449, + "grad_norm": 0.4431108668956084, + "learning_rate": 4.993575550196495e-07, + "loss": 0.1578, + "step": 3575 + }, + { + "epoch": 0.8057002844509533, + "grad_norm": 0.48055570285573584, + "learning_rate": 4.982410098260118e-07, + "loss": 0.1903, + "step": 3576 + }, + { + "epoch": 0.8059255921367617, + "grad_norm": 0.4654199375373882, + "learning_rate": 4.971255761345278e-07, + "loss": 0.1706, + "step": 3577 + }, + { + "epoch": 0.8061508998225702, + "grad_norm": 0.4671146994545411, + "learning_rate": 4.96011254564557e-07, + "loss": 0.1793, + "step": 3578 + }, + { + "epoch": 0.8063762075083787, + "grad_norm": 0.46768140627854843, + "learning_rate": 4.948980457348393e-07, + "loss": 0.171, + "step": 3579 + }, + { + "epoch": 0.8066015151941871, + "grad_norm": 0.458909520415724, + "learning_rate": 4.937859502634992e-07, + "loss": 0.1718, + "step": 3580 + }, + { + "epoch": 0.8068268228799955, + "grad_norm": 0.46912679071628177, + "learning_rate": 4.926749687680407e-07, + "loss": 0.1704, + "step": 3581 + }, + { + "epoch": 0.8070521305658039, + "grad_norm": 0.4669977551990922, + "learning_rate": 4.915651018653511e-07, + "loss": 0.173, + "step": 3582 + }, + { + "epoch": 0.8072774382516124, + "grad_norm": 0.467597955044253, + "learning_rate": 4.904563501716986e-07, + "loss": 0.1642, + "step": 3583 + }, + { + "epoch": 0.8075027459374208, + "grad_norm": 0.467262334714364, + "learning_rate": 4.893487143027307e-07, + "loss": 0.1831, + "step": 3584 + }, + { + "epoch": 0.8077280536232292, + "grad_norm": 0.4319817697566143, + "learning_rate": 4.88242194873477e-07, + "loss": 0.1568, + "step": 3585 + }, + { + "epoch": 0.8079533613090376, + "grad_norm": 0.4465987901143813, + "learning_rate": 4.871367924983458e-07, + "loss": 0.1439, + "step": 3586 + }, + { + "epoch": 0.8081786689948461, + "grad_norm": 0.41865855427218024, + "learning_rate": 4.860325077911271e-07, + "loss": 0.1473, + "step": 3587 + }, + { + "epoch": 0.8084039766806546, + "grad_norm": 0.47086109537514526, + "learning_rate": 4.84929341364988e-07, + "loss": 0.1626, + "step": 3588 + }, + { + "epoch": 0.808629284366463, + "grad_norm": 0.4506795856540844, + "learning_rate": 4.838272938324753e-07, + "loss": 0.1635, + "step": 3589 + }, + { + "epoch": 0.8088545920522714, + "grad_norm": 0.4313184641081095, + "learning_rate": 4.827263658055161e-07, + "loss": 0.1522, + "step": 3590 + }, + { + "epoch": 0.8090798997380798, + "grad_norm": 0.5095889633350696, + "learning_rate": 4.816265578954135e-07, + "loss": 0.1493, + "step": 3591 + }, + { + "epoch": 0.8093052074238882, + "grad_norm": 0.4534474287161811, + "learning_rate": 4.805278707128505e-07, + "loss": 0.1674, + "step": 3592 + }, + { + "epoch": 0.8095305151096966, + "grad_norm": 0.4474006463213298, + "learning_rate": 4.794303048678878e-07, + "loss": 0.1544, + "step": 3593 + }, + { + "epoch": 0.8097558227955051, + "grad_norm": 0.44702237022141944, + "learning_rate": 4.783338609699614e-07, + "loss": 0.1595, + "step": 3594 + }, + { + "epoch": 0.8099811304813136, + "grad_norm": 0.42807641282177544, + "learning_rate": 4.772385396278872e-07, + "loss": 0.1617, + "step": 3595 + }, + { + "epoch": 0.810206438167122, + "grad_norm": 0.45169597349059154, + "learning_rate": 4.7614434144985486e-07, + "loss": 0.1695, + "step": 3596 + }, + { + "epoch": 0.8104317458529304, + "grad_norm": 0.44366510633741574, + "learning_rate": 4.750512670434332e-07, + "loss": 0.1635, + "step": 3597 + }, + { + "epoch": 0.8106570535387388, + "grad_norm": 0.43710549419110745, + "learning_rate": 4.73959317015564e-07, + "loss": 0.1639, + "step": 3598 + }, + { + "epoch": 0.8108823612245473, + "grad_norm": 0.4489278873613295, + "learning_rate": 4.728684919725679e-07, + "loss": 0.1595, + "step": 3599 + }, + { + "epoch": 0.8111076689103557, + "grad_norm": 0.44944811075554414, + "learning_rate": 4.7177879252013945e-07, + "loss": 0.1652, + "step": 3600 + }, + { + "epoch": 0.8113329765961641, + "grad_norm": 0.41501489778325856, + "learning_rate": 4.70690219263347e-07, + "loss": 0.159, + "step": 3601 + }, + { + "epoch": 0.8115582842819725, + "grad_norm": 0.4567027891249196, + "learning_rate": 4.6960277280663574e-07, + "loss": 0.1756, + "step": 3602 + }, + { + "epoch": 0.811783591967781, + "grad_norm": 0.4195073452401527, + "learning_rate": 4.685164537538234e-07, + "loss": 0.1516, + "step": 3603 + }, + { + "epoch": 0.8120088996535895, + "grad_norm": 0.46637213927528715, + "learning_rate": 4.674312627081032e-07, + "loss": 0.1657, + "step": 3604 + }, + { + "epoch": 0.8122342073393979, + "grad_norm": 0.4681068664006425, + "learning_rate": 4.6634720027204093e-07, + "loss": 0.1688, + "step": 3605 + }, + { + "epoch": 0.8124595150252063, + "grad_norm": 0.4523754525984609, + "learning_rate": 4.6526426704757545e-07, + "loss": 0.1573, + "step": 3606 + }, + { + "epoch": 0.8126848227110147, + "grad_norm": 0.4329261125125419, + "learning_rate": 4.641824636360195e-07, + "loss": 0.1503, + "step": 3607 + }, + { + "epoch": 0.8129101303968231, + "grad_norm": 0.44733519538844485, + "learning_rate": 4.6310179063805916e-07, + "loss": 0.1606, + "step": 3608 + }, + { + "epoch": 0.8131354380826316, + "grad_norm": 0.42123735539874857, + "learning_rate": 4.620222486537507e-07, + "loss": 0.1488, + "step": 3609 + }, + { + "epoch": 0.81336074576844, + "grad_norm": 0.5737504540004175, + "learning_rate": 4.609438382825246e-07, + "loss": 0.1649, + "step": 3610 + }, + { + "epoch": 0.8135860534542485, + "grad_norm": 0.44789544425999944, + "learning_rate": 4.598665601231805e-07, + "loss": 0.176, + "step": 3611 + }, + { + "epoch": 0.8138113611400569, + "grad_norm": 0.45151193279436114, + "learning_rate": 4.587904147738925e-07, + "loss": 0.1731, + "step": 3612 + }, + { + "epoch": 0.8140366688258653, + "grad_norm": 0.45530104851191877, + "learning_rate": 4.577154028322023e-07, + "loss": 0.1694, + "step": 3613 + }, + { + "epoch": 0.8142619765116738, + "grad_norm": 0.44730224527483897, + "learning_rate": 4.566415248950251e-07, + "loss": 0.1547, + "step": 3614 + }, + { + "epoch": 0.8144872841974822, + "grad_norm": 0.43428263235202136, + "learning_rate": 4.555687815586454e-07, + "loss": 0.1596, + "step": 3615 + }, + { + "epoch": 0.8147125918832906, + "grad_norm": 0.421481732363363, + "learning_rate": 4.5449717341871646e-07, + "loss": 0.1597, + "step": 3616 + }, + { + "epoch": 0.814937899569099, + "grad_norm": 0.45944196930771464, + "learning_rate": 4.534267010702639e-07, + "loss": 0.1611, + "step": 3617 + }, + { + "epoch": 0.8151632072549074, + "grad_norm": 0.4307659700567999, + "learning_rate": 4.5235736510767957e-07, + "loss": 0.1666, + "step": 3618 + }, + { + "epoch": 0.815388514940716, + "grad_norm": 0.44378540515914777, + "learning_rate": 4.5128916612472735e-07, + "loss": 0.1828, + "step": 3619 + }, + { + "epoch": 0.8156138226265244, + "grad_norm": 0.4336660605172105, + "learning_rate": 4.5022210471453664e-07, + "loss": 0.1506, + "step": 3620 + }, + { + "epoch": 0.8158391303123328, + "grad_norm": 0.5013223697434481, + "learning_rate": 4.49156181469608e-07, + "loss": 0.152, + "step": 3621 + }, + { + "epoch": 0.8160644379981412, + "grad_norm": 0.44194526590862854, + "learning_rate": 4.480913969818099e-07, + "loss": 0.1678, + "step": 3622 + }, + { + "epoch": 0.8162897456839496, + "grad_norm": 0.46866440926930236, + "learning_rate": 4.470277518423749e-07, + "loss": 0.1603, + "step": 3623 + }, + { + "epoch": 0.816515053369758, + "grad_norm": 0.41533192517416473, + "learning_rate": 4.4596524664190674e-07, + "loss": 0.1557, + "step": 3624 + }, + { + "epoch": 0.8167403610555665, + "grad_norm": 0.41911837265705654, + "learning_rate": 4.449038819703758e-07, + "loss": 0.1497, + "step": 3625 + }, + { + "epoch": 0.816965668741375, + "grad_norm": 0.4600722080465385, + "learning_rate": 4.4384365841711684e-07, + "loss": 0.168, + "step": 3626 + }, + { + "epoch": 0.8171909764271834, + "grad_norm": 0.4483419242288435, + "learning_rate": 4.427845765708341e-07, + "loss": 0.1685, + "step": 3627 + }, + { + "epoch": 0.8174162841129918, + "grad_norm": 0.48460005643240656, + "learning_rate": 4.417266370195944e-07, + "loss": 0.1867, + "step": 3628 + }, + { + "epoch": 0.8176415917988002, + "grad_norm": 0.4473624903035198, + "learning_rate": 4.406698403508333e-07, + "loss": 0.1669, + "step": 3629 + }, + { + "epoch": 0.8178668994846087, + "grad_norm": 0.44672497190191635, + "learning_rate": 4.3961418715135097e-07, + "loss": 0.1544, + "step": 3630 + }, + { + "epoch": 0.8180922071704171, + "grad_norm": 0.44437526227272495, + "learning_rate": 4.385596780073112e-07, + "loss": 0.1605, + "step": 3631 + }, + { + "epoch": 0.8183175148562255, + "grad_norm": 0.4287678777500016, + "learning_rate": 4.3750631350424456e-07, + "loss": 0.1511, + "step": 3632 + }, + { + "epoch": 0.8185428225420339, + "grad_norm": 0.41387024760976776, + "learning_rate": 4.36454094227044e-07, + "loss": 0.154, + "step": 3633 + }, + { + "epoch": 0.8187681302278424, + "grad_norm": 0.4167728639400835, + "learning_rate": 4.354030207599691e-07, + "loss": 0.1527, + "step": 3634 + }, + { + "epoch": 0.8189934379136509, + "grad_norm": 0.45047741767624955, + "learning_rate": 4.3435309368664024e-07, + "loss": 0.1689, + "step": 3635 + }, + { + "epoch": 0.8192187455994593, + "grad_norm": 0.4691107904455213, + "learning_rate": 4.333043135900436e-07, + "loss": 0.1636, + "step": 3636 + }, + { + "epoch": 0.8194440532852677, + "grad_norm": 0.45110689352785416, + "learning_rate": 4.3225668105252834e-07, + "loss": 0.1585, + "step": 3637 + }, + { + "epoch": 0.8196693609710761, + "grad_norm": 0.4552829802937, + "learning_rate": 4.312101966558044e-07, + "loss": 0.1564, + "step": 3638 + }, + { + "epoch": 0.8198946686568845, + "grad_norm": 0.4640325795388481, + "learning_rate": 4.3016486098094667e-07, + "loss": 0.1659, + "step": 3639 + }, + { + "epoch": 0.820119976342693, + "grad_norm": 0.42355754164728154, + "learning_rate": 4.2912067460839066e-07, + "loss": 0.1539, + "step": 3640 + }, + { + "epoch": 0.8203452840285014, + "grad_norm": 0.47655337531439673, + "learning_rate": 4.280776381179336e-07, + "loss": 0.1778, + "step": 3641 + }, + { + "epoch": 0.8205705917143099, + "grad_norm": 0.45918656009297704, + "learning_rate": 4.2703575208873585e-07, + "loss": 0.1573, + "step": 3642 + }, + { + "epoch": 0.8207958994001183, + "grad_norm": 0.434332470042617, + "learning_rate": 4.259950170993166e-07, + "loss": 0.1657, + "step": 3643 + }, + { + "epoch": 0.8210212070859267, + "grad_norm": 0.4557457632634909, + "learning_rate": 4.2495543372755854e-07, + "loss": 0.1635, + "step": 3644 + }, + { + "epoch": 0.8212465147717352, + "grad_norm": 0.42076755278103484, + "learning_rate": 4.239170025507025e-07, + "loss": 0.1502, + "step": 3645 + }, + { + "epoch": 0.8214718224575436, + "grad_norm": 0.43634817451180896, + "learning_rate": 4.2287972414535084e-07, + "loss": 0.1629, + "step": 3646 + }, + { + "epoch": 0.821697130143352, + "grad_norm": 0.42810190725692854, + "learning_rate": 4.218435990874664e-07, + "loss": 0.154, + "step": 3647 + }, + { + "epoch": 0.8219224378291604, + "grad_norm": 0.4525438123842048, + "learning_rate": 4.208086279523699e-07, + "loss": 0.1631, + "step": 3648 + }, + { + "epoch": 0.8221477455149688, + "grad_norm": 0.4547008059705276, + "learning_rate": 4.197748113147429e-07, + "loss": 0.1721, + "step": 3649 + }, + { + "epoch": 0.8223730532007774, + "grad_norm": 0.45906716285677857, + "learning_rate": 4.1874214974862436e-07, + "loss": 0.1772, + "step": 3650 + }, + { + "epoch": 0.8225983608865858, + "grad_norm": 0.4966440897831315, + "learning_rate": 4.177106438274131e-07, + "loss": 0.1706, + "step": 3651 + }, + { + "epoch": 0.8228236685723942, + "grad_norm": 0.4434576610862618, + "learning_rate": 4.1668029412386677e-07, + "loss": 0.1581, + "step": 3652 + }, + { + "epoch": 0.8230489762582026, + "grad_norm": 0.4758618723518978, + "learning_rate": 4.1565110121009886e-07, + "loss": 0.1777, + "step": 3653 + }, + { + "epoch": 0.823274283944011, + "grad_norm": 0.4684921798046664, + "learning_rate": 4.146230656575831e-07, + "loss": 0.1864, + "step": 3654 + }, + { + "epoch": 0.8234995916298194, + "grad_norm": 0.4838928333303539, + "learning_rate": 4.1359618803714805e-07, + "loss": 0.1868, + "step": 3655 + }, + { + "epoch": 0.8237248993156279, + "grad_norm": 0.5058524525065489, + "learning_rate": 4.125704689189819e-07, + "loss": 0.1963, + "step": 3656 + }, + { + "epoch": 0.8239502070014363, + "grad_norm": 0.44082957917525795, + "learning_rate": 4.115459088726273e-07, + "loss": 0.1737, + "step": 3657 + }, + { + "epoch": 0.8241755146872448, + "grad_norm": 0.45863218045566473, + "learning_rate": 4.105225084669839e-07, + "loss": 0.1812, + "step": 3658 + }, + { + "epoch": 0.8244008223730532, + "grad_norm": 0.5070691383906857, + "learning_rate": 4.095002682703092e-07, + "loss": 0.1701, + "step": 3659 + }, + { + "epoch": 0.8246261300588617, + "grad_norm": 0.43809349919475893, + "learning_rate": 4.084791888502135e-07, + "loss": 0.1612, + "step": 3660 + }, + { + "epoch": 0.8248514377446701, + "grad_norm": 0.46050895485539195, + "learning_rate": 4.0745927077366493e-07, + "loss": 0.1594, + "step": 3661 + }, + { + "epoch": 0.8250767454304785, + "grad_norm": 0.43779103390515955, + "learning_rate": 4.0644051460698634e-07, + "loss": 0.1631, + "step": 3662 + }, + { + "epoch": 0.8253020531162869, + "grad_norm": 0.4234434403148721, + "learning_rate": 4.0542292091585447e-07, + "loss": 0.1517, + "step": 3663 + }, + { + "epoch": 0.8255273608020953, + "grad_norm": 0.43072361706630974, + "learning_rate": 4.0440649026530166e-07, + "loss": 0.1606, + "step": 3664 + }, + { + "epoch": 0.8257526684879037, + "grad_norm": 0.4815496168658912, + "learning_rate": 4.033912232197132e-07, + "loss": 0.1702, + "step": 3665 + }, + { + "epoch": 0.8259779761737123, + "grad_norm": 0.46436973542032134, + "learning_rate": 4.0237712034283004e-07, + "loss": 0.1512, + "step": 3666 + }, + { + "epoch": 0.8262032838595207, + "grad_norm": 0.43854227732480505, + "learning_rate": 4.0136418219774457e-07, + "loss": 0.1594, + "step": 3667 + }, + { + "epoch": 0.8264285915453291, + "grad_norm": 0.4378099555293077, + "learning_rate": 4.003524093469041e-07, + "loss": 0.1514, + "step": 3668 + }, + { + "epoch": 0.8266538992311375, + "grad_norm": 0.45526711653742324, + "learning_rate": 3.993418023521092e-07, + "loss": 0.1725, + "step": 3669 + }, + { + "epoch": 0.8268792069169459, + "grad_norm": 0.4222178051305471, + "learning_rate": 3.983323617745111e-07, + "loss": 0.1654, + "step": 3670 + }, + { + "epoch": 0.8271045146027544, + "grad_norm": 0.452569399360098, + "learning_rate": 3.9732408817461544e-07, + "loss": 0.1539, + "step": 3671 + }, + { + "epoch": 0.8273298222885628, + "grad_norm": 0.43688823323365417, + "learning_rate": 3.963169821122778e-07, + "loss": 0.1441, + "step": 3672 + }, + { + "epoch": 0.8275551299743712, + "grad_norm": 0.42820695758109645, + "learning_rate": 3.953110441467073e-07, + "loss": 0.1465, + "step": 3673 + }, + { + "epoch": 0.8277804376601797, + "grad_norm": 0.4740832924029802, + "learning_rate": 3.943062748364651e-07, + "loss": 0.172, + "step": 3674 + }, + { + "epoch": 0.8280057453459881, + "grad_norm": 0.44200816116439373, + "learning_rate": 3.9330267473945973e-07, + "loss": 0.155, + "step": 3675 + }, + { + "epoch": 0.8282310530317966, + "grad_norm": 0.4368552181200171, + "learning_rate": 3.9230024441295394e-07, + "loss": 0.1682, + "step": 3676 + }, + { + "epoch": 0.828456360717605, + "grad_norm": 0.4569260301010769, + "learning_rate": 3.9129898441356064e-07, + "loss": 0.1606, + "step": 3677 + }, + { + "epoch": 0.8286816684034134, + "grad_norm": 0.4434205468136306, + "learning_rate": 3.9029889529724113e-07, + "loss": 0.1506, + "step": 3678 + }, + { + "epoch": 0.8289069760892218, + "grad_norm": 0.4440642098880015, + "learning_rate": 3.892999776193085e-07, + "loss": 0.1647, + "step": 3679 + }, + { + "epoch": 0.8291322837750302, + "grad_norm": 0.4274920365085371, + "learning_rate": 3.8830223193442345e-07, + "loss": 0.1423, + "step": 3680 + }, + { + "epoch": 0.8293575914608388, + "grad_norm": 0.5139455886589481, + "learning_rate": 3.8730565879659845e-07, + "loss": 0.1809, + "step": 3681 + }, + { + "epoch": 0.8295828991466472, + "grad_norm": 0.44080429378378516, + "learning_rate": 3.863102587591919e-07, + "loss": 0.1593, + "step": 3682 + }, + { + "epoch": 0.8298082068324556, + "grad_norm": 0.49444159914010355, + "learning_rate": 3.853160323749128e-07, + "loss": 0.2009, + "step": 3683 + }, + { + "epoch": 0.830033514518264, + "grad_norm": 0.4891008840660727, + "learning_rate": 3.84322980195819e-07, + "loss": 0.1724, + "step": 3684 + }, + { + "epoch": 0.8302588222040724, + "grad_norm": 0.47295933044160815, + "learning_rate": 3.833311027733139e-07, + "loss": 0.184, + "step": 3685 + }, + { + "epoch": 0.8304841298898809, + "grad_norm": 0.47022851704366125, + "learning_rate": 3.823404006581513e-07, + "loss": 0.164, + "step": 3686 + }, + { + "epoch": 0.8307094375756893, + "grad_norm": 0.4380630094177868, + "learning_rate": 3.8135087440043017e-07, + "loss": 0.1619, + "step": 3687 + }, + { + "epoch": 0.8309347452614977, + "grad_norm": 0.4638092957578978, + "learning_rate": 3.8036252454959844e-07, + "loss": 0.1825, + "step": 3688 + }, + { + "epoch": 0.8311600529473062, + "grad_norm": 0.47315782573662674, + "learning_rate": 3.7937535165444875e-07, + "loss": 0.1743, + "step": 3689 + }, + { + "epoch": 0.8313853606331146, + "grad_norm": 0.4053253154687385, + "learning_rate": 3.7838935626312246e-07, + "loss": 0.1386, + "step": 3690 + }, + { + "epoch": 0.831610668318923, + "grad_norm": 0.45571181144508316, + "learning_rate": 3.7740453892310596e-07, + "loss": 0.1787, + "step": 3691 + }, + { + "epoch": 0.8318359760047315, + "grad_norm": 0.47022873292887507, + "learning_rate": 3.764209001812316e-07, + "loss": 0.1759, + "step": 3692 + }, + { + "epoch": 0.8320612836905399, + "grad_norm": 0.47226426974925256, + "learning_rate": 3.754384405836767e-07, + "loss": 0.169, + "step": 3693 + }, + { + "epoch": 0.8322865913763483, + "grad_norm": 0.4291900935138613, + "learning_rate": 3.7445716067596506e-07, + "loss": 0.1554, + "step": 3694 + }, + { + "epoch": 0.8325118990621567, + "grad_norm": 0.4571423031604165, + "learning_rate": 3.734770610029642e-07, + "loss": 0.17, + "step": 3695 + }, + { + "epoch": 0.8327372067479651, + "grad_norm": 0.4601601817817856, + "learning_rate": 3.72498142108888e-07, + "loss": 0.1661, + "step": 3696 + }, + { + "epoch": 0.8329625144337737, + "grad_norm": 0.4478116242398437, + "learning_rate": 3.7152040453729223e-07, + "loss": 0.1717, + "step": 3697 + }, + { + "epoch": 0.8331878221195821, + "grad_norm": 0.41487524123829644, + "learning_rate": 3.705438488310792e-07, + "loss": 0.1523, + "step": 3698 + }, + { + "epoch": 0.8334131298053905, + "grad_norm": 0.4055639832642743, + "learning_rate": 3.695684755324938e-07, + "loss": 0.1523, + "step": 3699 + }, + { + "epoch": 0.8336384374911989, + "grad_norm": 0.47016938326370955, + "learning_rate": 3.6859428518312394e-07, + "loss": 0.1767, + "step": 3700 + }, + { + "epoch": 0.8338637451770073, + "grad_norm": 0.4577507191309991, + "learning_rate": 3.6762127832390194e-07, + "loss": 0.1756, + "step": 3701 + }, + { + "epoch": 0.8340890528628158, + "grad_norm": 0.43660672389583594, + "learning_rate": 3.666494554951014e-07, + "loss": 0.152, + "step": 3702 + }, + { + "epoch": 0.8343143605486242, + "grad_norm": 0.47547250772194105, + "learning_rate": 3.656788172363401e-07, + "loss": 0.1453, + "step": 3703 + }, + { + "epoch": 0.8345396682344326, + "grad_norm": 0.46622923550773077, + "learning_rate": 3.6470936408657647e-07, + "loss": 0.1651, + "step": 3704 + }, + { + "epoch": 0.8347649759202411, + "grad_norm": 0.4581097090501249, + "learning_rate": 3.6374109658411207e-07, + "loss": 0.1679, + "step": 3705 + }, + { + "epoch": 0.8349902836060495, + "grad_norm": 0.44823263508587813, + "learning_rate": 3.6277401526659067e-07, + "loss": 0.1575, + "step": 3706 + }, + { + "epoch": 0.835215591291858, + "grad_norm": 0.46093409818813486, + "learning_rate": 3.6180812067099477e-07, + "loss": 0.1701, + "step": 3707 + }, + { + "epoch": 0.8354408989776664, + "grad_norm": 0.4516611374825156, + "learning_rate": 3.6084341333365135e-07, + "loss": 0.157, + "step": 3708 + }, + { + "epoch": 0.8356662066634748, + "grad_norm": 0.4462794938544173, + "learning_rate": 3.5987989379022536e-07, + "loss": 0.1641, + "step": 3709 + }, + { + "epoch": 0.8358915143492832, + "grad_norm": 0.41063464334942085, + "learning_rate": 3.58917562575723e-07, + "loss": 0.1404, + "step": 3710 + }, + { + "epoch": 0.8361168220350916, + "grad_norm": 0.45124185171166303, + "learning_rate": 3.57956420224492e-07, + "loss": 0.1589, + "step": 3711 + }, + { + "epoch": 0.8363421297209, + "grad_norm": 0.4385217183515485, + "learning_rate": 3.569964672702178e-07, + "loss": 0.1574, + "step": 3712 + }, + { + "epoch": 0.8365674374067086, + "grad_norm": 0.454174002644688, + "learning_rate": 3.5603770424592785e-07, + "loss": 0.151, + "step": 3713 + }, + { + "epoch": 0.836792745092517, + "grad_norm": 0.4502525032993486, + "learning_rate": 3.550801316839858e-07, + "loss": 0.1567, + "step": 3714 + }, + { + "epoch": 0.8370180527783254, + "grad_norm": 0.46041333991302785, + "learning_rate": 3.5412375011609714e-07, + "loss": 0.1832, + "step": 3715 + }, + { + "epoch": 0.8372433604641338, + "grad_norm": 0.4431380076563992, + "learning_rate": 3.531685600733051e-07, + "loss": 0.1545, + "step": 3716 + }, + { + "epoch": 0.8374686681499423, + "grad_norm": 0.4489074531569165, + "learning_rate": 3.5221456208598987e-07, + "loss": 0.1581, + "step": 3717 + }, + { + "epoch": 0.8376939758357507, + "grad_norm": 0.44342695332891413, + "learning_rate": 3.5126175668387275e-07, + "loss": 0.1633, + "step": 3718 + }, + { + "epoch": 0.8379192835215591, + "grad_norm": 0.4435067686185539, + "learning_rate": 3.503101443960094e-07, + "loss": 0.1659, + "step": 3719 + }, + { + "epoch": 0.8381445912073675, + "grad_norm": 0.46441733270934726, + "learning_rate": 3.4935972575079524e-07, + "loss": 0.1705, + "step": 3720 + }, + { + "epoch": 0.838369898893176, + "grad_norm": 0.4256756105154653, + "learning_rate": 3.484105012759631e-07, + "loss": 0.1508, + "step": 3721 + }, + { + "epoch": 0.8385952065789845, + "grad_norm": 0.46656829157935364, + "learning_rate": 3.474624714985805e-07, + "loss": 0.172, + "step": 3722 + }, + { + "epoch": 0.8388205142647929, + "grad_norm": 0.4764265342185396, + "learning_rate": 3.465156369450545e-07, + "loss": 0.1712, + "step": 3723 + }, + { + "epoch": 0.8390458219506013, + "grad_norm": 0.4307424997524114, + "learning_rate": 3.455699981411259e-07, + "loss": 0.1595, + "step": 3724 + }, + { + "epoch": 0.8392711296364097, + "grad_norm": 0.45242934399491797, + "learning_rate": 3.446255556118736e-07, + "loss": 0.1664, + "step": 3725 + }, + { + "epoch": 0.8394964373222181, + "grad_norm": 0.439084233684571, + "learning_rate": 3.436823098817102e-07, + "loss": 0.1568, + "step": 3726 + }, + { + "epoch": 0.8397217450080265, + "grad_norm": 0.4142475372019939, + "learning_rate": 3.427402614743863e-07, + "loss": 0.1423, + "step": 3727 + }, + { + "epoch": 0.8399470526938351, + "grad_norm": 0.4489991674150461, + "learning_rate": 3.417994109129852e-07, + "loss": 0.1632, + "step": 3728 + }, + { + "epoch": 0.8401723603796435, + "grad_norm": 0.43222220416253515, + "learning_rate": 3.408597587199261e-07, + "loss": 0.1547, + "step": 3729 + }, + { + "epoch": 0.8403976680654519, + "grad_norm": 0.4525132838990558, + "learning_rate": 3.3992130541696336e-07, + "loss": 0.1546, + "step": 3730 + }, + { + "epoch": 0.8406229757512603, + "grad_norm": 0.41287517626818676, + "learning_rate": 3.389840515251855e-07, + "loss": 0.1518, + "step": 3731 + }, + { + "epoch": 0.8408482834370687, + "grad_norm": 0.4530333469339068, + "learning_rate": 3.3804799756501335e-07, + "loss": 0.1561, + "step": 3732 + }, + { + "epoch": 0.8410735911228772, + "grad_norm": 0.4329258800206697, + "learning_rate": 3.371131440562042e-07, + "loss": 0.153, + "step": 3733 + }, + { + "epoch": 0.8412988988086856, + "grad_norm": 0.46767520891934067, + "learning_rate": 3.3617949151784623e-07, + "loss": 0.1664, + "step": 3734 + }, + { + "epoch": 0.841524206494494, + "grad_norm": 0.43974968924332375, + "learning_rate": 3.3524704046836305e-07, + "loss": 0.1652, + "step": 3735 + }, + { + "epoch": 0.8417495141803025, + "grad_norm": 0.4539276734624922, + "learning_rate": 3.343157914255085e-07, + "loss": 0.1517, + "step": 3736 + }, + { + "epoch": 0.841974821866111, + "grad_norm": 0.41170130238342745, + "learning_rate": 3.3338574490637154e-07, + "loss": 0.1531, + "step": 3737 + }, + { + "epoch": 0.8422001295519194, + "grad_norm": 0.43284218213879155, + "learning_rate": 3.3245690142737236e-07, + "loss": 0.1471, + "step": 3738 + }, + { + "epoch": 0.8424254372377278, + "grad_norm": 0.4616576107187646, + "learning_rate": 3.3152926150426256e-07, + "loss": 0.1632, + "step": 3739 + }, + { + "epoch": 0.8426507449235362, + "grad_norm": 0.4321179958688705, + "learning_rate": 3.306028256521265e-07, + "loss": 0.1578, + "step": 3740 + }, + { + "epoch": 0.8428760526093446, + "grad_norm": 0.4766566433480305, + "learning_rate": 3.296775943853789e-07, + "loss": 0.1767, + "step": 3741 + }, + { + "epoch": 0.843101360295153, + "grad_norm": 0.650664008577462, + "learning_rate": 3.287535682177667e-07, + "loss": 0.1737, + "step": 3742 + }, + { + "epoch": 0.8433266679809615, + "grad_norm": 0.8515655942203271, + "learning_rate": 3.278307476623674e-07, + "loss": 0.1601, + "step": 3743 + }, + { + "epoch": 0.84355197566677, + "grad_norm": 0.4701736707283644, + "learning_rate": 3.2690913323158795e-07, + "loss": 0.1715, + "step": 3744 + }, + { + "epoch": 0.8437772833525784, + "grad_norm": 0.47536104885784997, + "learning_rate": 3.259887254371677e-07, + "loss": 0.1737, + "step": 3745 + }, + { + "epoch": 0.8440025910383868, + "grad_norm": 0.4670088171765688, + "learning_rate": 3.2506952479017417e-07, + "loss": 0.1736, + "step": 3746 + }, + { + "epoch": 0.8442278987241952, + "grad_norm": 0.45109383912733286, + "learning_rate": 3.241515318010044e-07, + "loss": 0.1729, + "step": 3747 + }, + { + "epoch": 0.8444532064100037, + "grad_norm": 0.44511692597316493, + "learning_rate": 3.2323474697938727e-07, + "loss": 0.1759, + "step": 3748 + }, + { + "epoch": 0.8446785140958121, + "grad_norm": 0.49279297736167993, + "learning_rate": 3.223191708343776e-07, + "loss": 0.1796, + "step": 3749 + }, + { + "epoch": 0.8449038217816205, + "grad_norm": 0.4557299881766109, + "learning_rate": 3.214048038743622e-07, + "loss": 0.156, + "step": 3750 + }, + { + "epoch": 0.8451291294674289, + "grad_norm": 0.41751439552095965, + "learning_rate": 3.204916466070537e-07, + "loss": 0.1582, + "step": 3751 + }, + { + "epoch": 0.8453544371532374, + "grad_norm": 0.4527606673843056, + "learning_rate": 3.1957969953949506e-07, + "loss": 0.1648, + "step": 3752 + }, + { + "epoch": 0.8455797448390459, + "grad_norm": 0.45164995459375606, + "learning_rate": 3.186689631780565e-07, + "loss": 0.1603, + "step": 3753 + }, + { + "epoch": 0.8458050525248543, + "grad_norm": 0.47865666258053585, + "learning_rate": 3.1775943802843546e-07, + "loss": 0.1885, + "step": 3754 + }, + { + "epoch": 0.8460303602106627, + "grad_norm": 0.4443128259026948, + "learning_rate": 3.168511245956582e-07, + "loss": 0.1645, + "step": 3755 + }, + { + "epoch": 0.8462556678964711, + "grad_norm": 0.4728477022320633, + "learning_rate": 3.1594402338407633e-07, + "loss": 0.1709, + "step": 3756 + }, + { + "epoch": 0.8464809755822795, + "grad_norm": 0.43299047205316765, + "learning_rate": 3.1503813489736995e-07, + "loss": 0.1528, + "step": 3757 + }, + { + "epoch": 0.846706283268088, + "grad_norm": 0.41800855515216756, + "learning_rate": 3.141334596385448e-07, + "loss": 0.1571, + "step": 3758 + }, + { + "epoch": 0.8469315909538964, + "grad_norm": 0.48351100839888106, + "learning_rate": 3.132299981099335e-07, + "loss": 0.1783, + "step": 3759 + }, + { + "epoch": 0.8471568986397049, + "grad_norm": 0.4725072866936661, + "learning_rate": 3.12327750813195e-07, + "loss": 0.178, + "step": 3760 + }, + { + "epoch": 0.8473822063255133, + "grad_norm": 0.43270837489226754, + "learning_rate": 3.1142671824931275e-07, + "loss": 0.1522, + "step": 3761 + }, + { + "epoch": 0.8476075140113217, + "grad_norm": 0.44786175729480954, + "learning_rate": 3.105269009185974e-07, + "loss": 0.166, + "step": 3762 + }, + { + "epoch": 0.8478328216971301, + "grad_norm": 0.43880847250459676, + "learning_rate": 3.096282993206837e-07, + "loss": 0.1618, + "step": 3763 + }, + { + "epoch": 0.8480581293829386, + "grad_norm": 0.4327080597344897, + "learning_rate": 3.087309139545311e-07, + "loss": 0.1573, + "step": 3764 + }, + { + "epoch": 0.848283437068747, + "grad_norm": 0.4308074274299866, + "learning_rate": 3.0783474531842497e-07, + "loss": 0.1555, + "step": 3765 + }, + { + "epoch": 0.8485087447545554, + "grad_norm": 0.43079682992705726, + "learning_rate": 3.0693979390997333e-07, + "loss": 0.1501, + "step": 3766 + }, + { + "epoch": 0.8487340524403638, + "grad_norm": 0.45510628854536794, + "learning_rate": 3.0604606022611033e-07, + "loss": 0.1706, + "step": 3767 + }, + { + "epoch": 0.8489593601261723, + "grad_norm": 0.44853033904095685, + "learning_rate": 3.0515354476309293e-07, + "loss": 0.1649, + "step": 3768 + }, + { + "epoch": 0.8491846678119808, + "grad_norm": 0.433659686989014, + "learning_rate": 3.042622480165011e-07, + "loss": 0.1481, + "step": 3769 + }, + { + "epoch": 0.8494099754977892, + "grad_norm": 0.47270378093739307, + "learning_rate": 3.033721704812395e-07, + "loss": 0.1796, + "step": 3770 + }, + { + "epoch": 0.8496352831835976, + "grad_norm": 0.4513080701887885, + "learning_rate": 3.024833126515339e-07, + "loss": 0.1591, + "step": 3771 + }, + { + "epoch": 0.849860590869406, + "grad_norm": 0.4778572682795044, + "learning_rate": 3.0159567502093535e-07, + "loss": 0.1608, + "step": 3772 + }, + { + "epoch": 0.8500858985552144, + "grad_norm": 0.44508644752395715, + "learning_rate": 3.0070925808231456e-07, + "loss": 0.155, + "step": 3773 + }, + { + "epoch": 0.8503112062410229, + "grad_norm": 0.4113843763930944, + "learning_rate": 2.9982406232786614e-07, + "loss": 0.1476, + "step": 3774 + }, + { + "epoch": 0.8505365139268314, + "grad_norm": 0.41444901838610815, + "learning_rate": 2.9894008824910726e-07, + "loss": 0.1391, + "step": 3775 + }, + { + "epoch": 0.8507618216126398, + "grad_norm": 0.4559312143559945, + "learning_rate": 2.9805733633687467e-07, + "loss": 0.1665, + "step": 3776 + }, + { + "epoch": 0.8509871292984482, + "grad_norm": 0.46225080436085186, + "learning_rate": 2.9717580708132856e-07, + "loss": 0.1626, + "step": 3777 + }, + { + "epoch": 0.8512124369842566, + "grad_norm": 0.46737597074061826, + "learning_rate": 2.9629550097194787e-07, + "loss": 0.1591, + "step": 3778 + }, + { + "epoch": 0.851437744670065, + "grad_norm": 0.42987608350371975, + "learning_rate": 2.9541641849753557e-07, + "loss": 0.157, + "step": 3779 + }, + { + "epoch": 0.8516630523558735, + "grad_norm": 0.4416140841690853, + "learning_rate": 2.9453856014621224e-07, + "loss": 0.15, + "step": 3780 + }, + { + "epoch": 0.8518883600416819, + "grad_norm": 0.4545466781066878, + "learning_rate": 2.936619264054194e-07, + "loss": 0.154, + "step": 3781 + }, + { + "epoch": 0.8521136677274903, + "grad_norm": 0.4387776601398664, + "learning_rate": 2.9278651776192073e-07, + "loss": 0.155, + "step": 3782 + }, + { + "epoch": 0.8523389754132988, + "grad_norm": 0.4652458210065339, + "learning_rate": 2.919123347017963e-07, + "loss": 0.1687, + "step": 3783 + }, + { + "epoch": 0.8525642830991073, + "grad_norm": 0.43634008346359393, + "learning_rate": 2.910393777104481e-07, + "loss": 0.1525, + "step": 3784 + }, + { + "epoch": 0.8527895907849157, + "grad_norm": 0.4242350489334935, + "learning_rate": 2.901676472725973e-07, + "loss": 0.1525, + "step": 3785 + }, + { + "epoch": 0.8530148984707241, + "grad_norm": 0.45229154449959247, + "learning_rate": 2.892971438722822e-07, + "loss": 0.1691, + "step": 3786 + }, + { + "epoch": 0.8532402061565325, + "grad_norm": 0.4506123272525204, + "learning_rate": 2.8842786799286204e-07, + "loss": 0.1647, + "step": 3787 + }, + { + "epoch": 0.8534655138423409, + "grad_norm": 0.4379653085601568, + "learning_rate": 2.8755982011701183e-07, + "loss": 0.161, + "step": 3788 + }, + { + "epoch": 0.8536908215281493, + "grad_norm": 0.42607718490885427, + "learning_rate": 2.866930007267274e-07, + "loss": 0.1507, + "step": 3789 + }, + { + "epoch": 0.8539161292139578, + "grad_norm": 0.44606175682273036, + "learning_rate": 2.8582741030332095e-07, + "loss": 0.1607, + "step": 3790 + }, + { + "epoch": 0.8541414368997663, + "grad_norm": 0.4156134237866846, + "learning_rate": 2.8496304932742247e-07, + "loss": 0.152, + "step": 3791 + }, + { + "epoch": 0.8543667445855747, + "grad_norm": 0.45545474700590477, + "learning_rate": 2.840999182789797e-07, + "loss": 0.1586, + "step": 3792 + }, + { + "epoch": 0.8545920522713831, + "grad_norm": 0.4226153140577736, + "learning_rate": 2.8323801763725623e-07, + "loss": 0.1509, + "step": 3793 + }, + { + "epoch": 0.8548173599571915, + "grad_norm": 0.48362427520619916, + "learning_rate": 2.823773478808348e-07, + "loss": 0.1768, + "step": 3794 + }, + { + "epoch": 0.855042667643, + "grad_norm": 0.4146941876092988, + "learning_rate": 2.8151790948761165e-07, + "loss": 0.1527, + "step": 3795 + }, + { + "epoch": 0.8552679753288084, + "grad_norm": 0.4518963375291997, + "learning_rate": 2.806597029348018e-07, + "loss": 0.1703, + "step": 3796 + }, + { + "epoch": 0.8554932830146168, + "grad_norm": 0.44442842379013187, + "learning_rate": 2.7980272869893633e-07, + "loss": 0.1586, + "step": 3797 + }, + { + "epoch": 0.8557185907004252, + "grad_norm": 0.4665550730778522, + "learning_rate": 2.7894698725585866e-07, + "loss": 0.1619, + "step": 3798 + }, + { + "epoch": 0.8559438983862337, + "grad_norm": 0.46741163311501377, + "learning_rate": 2.7809247908073184e-07, + "loss": 0.1857, + "step": 3799 + }, + { + "epoch": 0.8561692060720422, + "grad_norm": 0.41565267238626635, + "learning_rate": 2.772392046480324e-07, + "loss": 0.1537, + "step": 3800 + }, + { + "epoch": 0.8563945137578506, + "grad_norm": 0.4408951754788661, + "learning_rate": 2.763871644315508e-07, + "loss": 0.1597, + "step": 3801 + }, + { + "epoch": 0.856619821443659, + "grad_norm": 0.4541212955646452, + "learning_rate": 2.755363589043944e-07, + "loss": 0.1595, + "step": 3802 + }, + { + "epoch": 0.8568451291294674, + "grad_norm": 0.469744634052532, + "learning_rate": 2.746867885389828e-07, + "loss": 0.1657, + "step": 3803 + }, + { + "epoch": 0.8570704368152758, + "grad_norm": 0.41875642649774314, + "learning_rate": 2.738384538070518e-07, + "loss": 0.1508, + "step": 3804 + }, + { + "epoch": 0.8572957445010843, + "grad_norm": 0.46337651841611527, + "learning_rate": 2.7299135517964897e-07, + "loss": 0.1732, + "step": 3805 + }, + { + "epoch": 0.8575210521868927, + "grad_norm": 0.42334417919685435, + "learning_rate": 2.7214549312713723e-07, + "loss": 0.1544, + "step": 3806 + }, + { + "epoch": 0.8577463598727012, + "grad_norm": 0.4376300006439983, + "learning_rate": 2.713008681191923e-07, + "loss": 0.1672, + "step": 3807 + }, + { + "epoch": 0.8579716675585096, + "grad_norm": 0.4113515886651012, + "learning_rate": 2.7045748062480254e-07, + "loss": 0.1499, + "step": 3808 + }, + { + "epoch": 0.858196975244318, + "grad_norm": 0.4251847781025433, + "learning_rate": 2.696153311122704e-07, + "loss": 0.1514, + "step": 3809 + }, + { + "epoch": 0.8584222829301265, + "grad_norm": 0.45207255308785904, + "learning_rate": 2.6877442004920873e-07, + "loss": 0.1653, + "step": 3810 + }, + { + "epoch": 0.8586475906159349, + "grad_norm": 0.47450706057625247, + "learning_rate": 2.6793474790254516e-07, + "loss": 0.1656, + "step": 3811 + }, + { + "epoch": 0.8588728983017433, + "grad_norm": 0.4548389185897817, + "learning_rate": 2.6709631513851834e-07, + "loss": 0.1636, + "step": 3812 + }, + { + "epoch": 0.8590982059875517, + "grad_norm": 0.43672418857370177, + "learning_rate": 2.6625912222267844e-07, + "loss": 0.1665, + "step": 3813 + }, + { + "epoch": 0.8593235136733601, + "grad_norm": 0.439917137583086, + "learning_rate": 2.654231696198878e-07, + "loss": 0.1585, + "step": 3814 + }, + { + "epoch": 0.8595488213591687, + "grad_norm": 0.4330880010194554, + "learning_rate": 2.645884577943192e-07, + "loss": 0.145, + "step": 3815 + }, + { + "epoch": 0.8597741290449771, + "grad_norm": 0.46372160022896086, + "learning_rate": 2.6375498720945717e-07, + "loss": 0.167, + "step": 3816 + }, + { + "epoch": 0.8599994367307855, + "grad_norm": 0.45319889064371016, + "learning_rate": 2.629227583280972e-07, + "loss": 0.1636, + "step": 3817 + }, + { + "epoch": 0.8602247444165939, + "grad_norm": 0.48684228342483954, + "learning_rate": 2.620917716123444e-07, + "loss": 0.1877, + "step": 3818 + }, + { + "epoch": 0.8604500521024023, + "grad_norm": 0.4765780795480495, + "learning_rate": 2.6126202752361554e-07, + "loss": 0.1673, + "step": 3819 + }, + { + "epoch": 0.8606753597882107, + "grad_norm": 0.4499872117877301, + "learning_rate": 2.604335265226354e-07, + "loss": 0.1557, + "step": 3820 + }, + { + "epoch": 0.8609006674740192, + "grad_norm": 0.44944931571898095, + "learning_rate": 2.5960626906944066e-07, + "loss": 0.1622, + "step": 3821 + }, + { + "epoch": 0.8611259751598277, + "grad_norm": 0.48671239001942607, + "learning_rate": 2.587802556233765e-07, + "loss": 0.1712, + "step": 3822 + }, + { + "epoch": 0.8613512828456361, + "grad_norm": 0.441257007010554, + "learning_rate": 2.5795548664309695e-07, + "loss": 0.1589, + "step": 3823 + }, + { + "epoch": 0.8615765905314445, + "grad_norm": 0.44410751441465046, + "learning_rate": 2.571319625865662e-07, + "loss": 0.1586, + "step": 3824 + }, + { + "epoch": 0.861801898217253, + "grad_norm": 0.438050405761917, + "learning_rate": 2.5630968391105515e-07, + "loss": 0.17, + "step": 3825 + }, + { + "epoch": 0.8620272059030614, + "grad_norm": 0.4414978074408367, + "learning_rate": 2.5548865107314606e-07, + "loss": 0.1523, + "step": 3826 + }, + { + "epoch": 0.8622525135888698, + "grad_norm": 0.44526391800192294, + "learning_rate": 2.546688645287268e-07, + "loss": 0.1574, + "step": 3827 + }, + { + "epoch": 0.8624778212746782, + "grad_norm": 0.4271814076979188, + "learning_rate": 2.5385032473299433e-07, + "loss": 0.1589, + "step": 3828 + }, + { + "epoch": 0.8627031289604866, + "grad_norm": 0.4378356254333961, + "learning_rate": 2.5303303214045423e-07, + "loss": 0.1591, + "step": 3829 + }, + { + "epoch": 0.8629284366462951, + "grad_norm": 0.4525059650092737, + "learning_rate": 2.522169872049174e-07, + "loss": 0.1781, + "step": 3830 + }, + { + "epoch": 0.8631537443321036, + "grad_norm": 0.43856280343594983, + "learning_rate": 2.5140219037950416e-07, + "loss": 0.1595, + "step": 3831 + }, + { + "epoch": 0.863379052017912, + "grad_norm": 0.46815088325236964, + "learning_rate": 2.5058864211664064e-07, + "loss": 0.1741, + "step": 3832 + }, + { + "epoch": 0.8636043597037204, + "grad_norm": 0.49963053906888, + "learning_rate": 2.4977634286805887e-07, + "loss": 0.1869, + "step": 3833 + }, + { + "epoch": 0.8638296673895288, + "grad_norm": 0.4316835553376431, + "learning_rate": 2.4896529308479966e-07, + "loss": 0.1545, + "step": 3834 + }, + { + "epoch": 0.8640549750753372, + "grad_norm": 0.42908511684269135, + "learning_rate": 2.4815549321720755e-07, + "loss": 0.1497, + "step": 3835 + }, + { + "epoch": 0.8642802827611457, + "grad_norm": 0.4241907221783983, + "learning_rate": 2.4734694371493507e-07, + "loss": 0.1504, + "step": 3836 + }, + { + "epoch": 0.8645055904469541, + "grad_norm": 0.42175879326928944, + "learning_rate": 2.4653964502693974e-07, + "loss": 0.1561, + "step": 3837 + }, + { + "epoch": 0.8647308981327626, + "grad_norm": 0.4429256751771366, + "learning_rate": 2.4573359760148354e-07, + "loss": 0.1603, + "step": 3838 + }, + { + "epoch": 0.864956205818571, + "grad_norm": 0.42293311738401956, + "learning_rate": 2.449288018861354e-07, + "loss": 0.164, + "step": 3839 + }, + { + "epoch": 0.8651815135043794, + "grad_norm": 0.4353821319328284, + "learning_rate": 2.441252583277678e-07, + "loss": 0.1576, + "step": 3840 + }, + { + "epoch": 0.8654068211901879, + "grad_norm": 0.43777055974443607, + "learning_rate": 2.433229673725593e-07, + "loss": 0.1504, + "step": 3841 + }, + { + "epoch": 0.8656321288759963, + "grad_norm": 0.4485141988089283, + "learning_rate": 2.425219294659908e-07, + "loss": 0.1716, + "step": 3842 + }, + { + "epoch": 0.8658574365618047, + "grad_norm": 0.47247406940127123, + "learning_rate": 2.4172214505285006e-07, + "loss": 0.1715, + "step": 3843 + }, + { + "epoch": 0.8660827442476131, + "grad_norm": 0.4344944632444246, + "learning_rate": 2.409236145772276e-07, + "loss": 0.1588, + "step": 3844 + }, + { + "epoch": 0.8663080519334215, + "grad_norm": 0.4363103997950014, + "learning_rate": 2.401263384825164e-07, + "loss": 0.1603, + "step": 3845 + }, + { + "epoch": 0.8665333596192301, + "grad_norm": 0.4323145195248832, + "learning_rate": 2.393303172114159e-07, + "loss": 0.1509, + "step": 3846 + }, + { + "epoch": 0.8667586673050385, + "grad_norm": 0.4337917936679102, + "learning_rate": 2.3853555120592506e-07, + "loss": 0.1481, + "step": 3847 + }, + { + "epoch": 0.8669839749908469, + "grad_norm": 0.4331822378106634, + "learning_rate": 2.377420409073497e-07, + "loss": 0.1534, + "step": 3848 + }, + { + "epoch": 0.8672092826766553, + "grad_norm": 0.43554798372472237, + "learning_rate": 2.3694978675629476e-07, + "loss": 0.1685, + "step": 3849 + }, + { + "epoch": 0.8674345903624637, + "grad_norm": 0.44951635486499003, + "learning_rate": 2.3615878919267116e-07, + "loss": 0.1799, + "step": 3850 + }, + { + "epoch": 0.8676598980482721, + "grad_norm": 0.4782836009692019, + "learning_rate": 2.3536904865568949e-07, + "loss": 0.1734, + "step": 3851 + }, + { + "epoch": 0.8678852057340806, + "grad_norm": 0.44717674040078415, + "learning_rate": 2.345805655838626e-07, + "loss": 0.1642, + "step": 3852 + }, + { + "epoch": 0.868110513419889, + "grad_norm": 0.4944820170968731, + "learning_rate": 2.337933404150064e-07, + "loss": 0.1747, + "step": 3853 + }, + { + "epoch": 0.8683358211056975, + "grad_norm": 0.4446058407762169, + "learning_rate": 2.3300737358623843e-07, + "loss": 0.1666, + "step": 3854 + }, + { + "epoch": 0.8685611287915059, + "grad_norm": 0.4520045515861524, + "learning_rate": 2.3222266553397542e-07, + "loss": 0.1711, + "step": 3855 + }, + { + "epoch": 0.8687864364773143, + "grad_norm": 0.466855332594211, + "learning_rate": 2.314392166939375e-07, + "loss": 0.1694, + "step": 3856 + }, + { + "epoch": 0.8690117441631228, + "grad_norm": 0.41656443320779735, + "learning_rate": 2.3065702750114383e-07, + "loss": 0.146, + "step": 3857 + }, + { + "epoch": 0.8692370518489312, + "grad_norm": 0.4381498657077599, + "learning_rate": 2.2987609838991536e-07, + "loss": 0.1621, + "step": 3858 + }, + { + "epoch": 0.8694623595347396, + "grad_norm": 0.4676158069562402, + "learning_rate": 2.2909642979387331e-07, + "loss": 0.1699, + "step": 3859 + }, + { + "epoch": 0.869687667220548, + "grad_norm": 0.4635437613891812, + "learning_rate": 2.2831802214593774e-07, + "loss": 0.1721, + "step": 3860 + }, + { + "epoch": 0.8699129749063564, + "grad_norm": 0.4327565925553693, + "learning_rate": 2.2754087587833014e-07, + "loss": 0.151, + "step": 3861 + }, + { + "epoch": 0.870138282592165, + "grad_norm": 0.48742952822239083, + "learning_rate": 2.2676499142257002e-07, + "loss": 0.1832, + "step": 3862 + }, + { + "epoch": 0.8703635902779734, + "grad_norm": 0.4399879967144114, + "learning_rate": 2.2599036920947836e-07, + "loss": 0.1624, + "step": 3863 + }, + { + "epoch": 0.8705888979637818, + "grad_norm": 0.4329857914979825, + "learning_rate": 2.2521700966917276e-07, + "loss": 0.1569, + "step": 3864 + }, + { + "epoch": 0.8708142056495902, + "grad_norm": 0.4269624885392436, + "learning_rate": 2.2444491323107138e-07, + "loss": 0.1561, + "step": 3865 + }, + { + "epoch": 0.8710395133353986, + "grad_norm": 0.451818487697091, + "learning_rate": 2.23674080323891e-07, + "loss": 0.1675, + "step": 3866 + }, + { + "epoch": 0.8712648210212071, + "grad_norm": 0.45625430875206713, + "learning_rate": 2.229045113756456e-07, + "loss": 0.1768, + "step": 3867 + }, + { + "epoch": 0.8714901287070155, + "grad_norm": 0.48191600027890824, + "learning_rate": 2.221362068136493e-07, + "loss": 0.173, + "step": 3868 + }, + { + "epoch": 0.8717154363928239, + "grad_norm": 0.4586870274095331, + "learning_rate": 2.2136916706451212e-07, + "loss": 0.1586, + "step": 3869 + }, + { + "epoch": 0.8719407440786324, + "grad_norm": 0.4495650394686771, + "learning_rate": 2.2060339255414232e-07, + "loss": 0.1638, + "step": 3870 + }, + { + "epoch": 0.8721660517644408, + "grad_norm": 0.4452343684455734, + "learning_rate": 2.198388837077467e-07, + "loss": 0.1607, + "step": 3871 + }, + { + "epoch": 0.8723913594502493, + "grad_norm": 0.4379884318082166, + "learning_rate": 2.190756409498282e-07, + "loss": 0.1639, + "step": 3872 + }, + { + "epoch": 0.8726166671360577, + "grad_norm": 0.4851259584146969, + "learning_rate": 2.1831366470418725e-07, + "loss": 0.1875, + "step": 3873 + }, + { + "epoch": 0.8728419748218661, + "grad_norm": 0.45277874346044966, + "learning_rate": 2.175529553939204e-07, + "loss": 0.1593, + "step": 3874 + }, + { + "epoch": 0.8730672825076745, + "grad_norm": 0.42765833263206704, + "learning_rate": 2.1679351344142146e-07, + "loss": 0.1551, + "step": 3875 + }, + { + "epoch": 0.8732925901934829, + "grad_norm": 0.44198734114554694, + "learning_rate": 2.1603533926838088e-07, + "loss": 0.1719, + "step": 3876 + }, + { + "epoch": 0.8735178978792915, + "grad_norm": 0.44288321113395057, + "learning_rate": 2.1527843329578328e-07, + "loss": 0.162, + "step": 3877 + }, + { + "epoch": 0.8737432055650999, + "grad_norm": 0.4755990929012781, + "learning_rate": 2.1452279594391167e-07, + "loss": 0.1785, + "step": 3878 + }, + { + "epoch": 0.8739685132509083, + "grad_norm": 0.42115083201637526, + "learning_rate": 2.1376842763234178e-07, + "loss": 0.1477, + "step": 3879 + }, + { + "epoch": 0.8741938209367167, + "grad_norm": 0.42144616006407, + "learning_rate": 2.1301532877994747e-07, + "loss": 0.1451, + "step": 3880 + }, + { + "epoch": 0.8744191286225251, + "grad_norm": 0.45724821723617687, + "learning_rate": 2.1226349980489614e-07, + "loss": 0.1795, + "step": 3881 + }, + { + "epoch": 0.8746444363083336, + "grad_norm": 0.44062311131762866, + "learning_rate": 2.1151294112464997e-07, + "loss": 0.1636, + "step": 3882 + }, + { + "epoch": 0.874869743994142, + "grad_norm": 0.4885236046757516, + "learning_rate": 2.1076365315596704e-07, + "loss": 0.1834, + "step": 3883 + }, + { + "epoch": 0.8750950516799504, + "grad_norm": 0.4642617358050222, + "learning_rate": 2.1001563631489807e-07, + "loss": 0.1628, + "step": 3884 + }, + { + "epoch": 0.8753203593657589, + "grad_norm": 0.45049186515702966, + "learning_rate": 2.0926889101679004e-07, + "loss": 0.1609, + "step": 3885 + }, + { + "epoch": 0.8755456670515673, + "grad_norm": 0.44658631424911166, + "learning_rate": 2.0852341767628182e-07, + "loss": 0.1726, + "step": 3886 + }, + { + "epoch": 0.8757709747373758, + "grad_norm": 0.4693250985801623, + "learning_rate": 2.07779216707307e-07, + "loss": 0.1792, + "step": 3887 + }, + { + "epoch": 0.8759962824231842, + "grad_norm": 0.4520979064657107, + "learning_rate": 2.0703628852309336e-07, + "loss": 0.1618, + "step": 3888 + }, + { + "epoch": 0.8762215901089926, + "grad_norm": 0.4210956983106802, + "learning_rate": 2.0629463353616013e-07, + "loss": 0.1609, + "step": 3889 + }, + { + "epoch": 0.876446897794801, + "grad_norm": 0.49923293483431874, + "learning_rate": 2.0555425215832176e-07, + "loss": 0.1569, + "step": 3890 + }, + { + "epoch": 0.8766722054806094, + "grad_norm": 0.45672939225021897, + "learning_rate": 2.048151448006841e-07, + "loss": 0.1793, + "step": 3891 + }, + { + "epoch": 0.8768975131664178, + "grad_norm": 0.4361479691592971, + "learning_rate": 2.0407731187364556e-07, + "loss": 0.1637, + "step": 3892 + }, + { + "epoch": 0.8771228208522264, + "grad_norm": 0.45947790569692776, + "learning_rate": 2.0334075378689781e-07, + "loss": 0.18, + "step": 3893 + }, + { + "epoch": 0.8773481285380348, + "grad_norm": 0.43702093717358403, + "learning_rate": 2.026054709494235e-07, + "loss": 0.1673, + "step": 3894 + }, + { + "epoch": 0.8775734362238432, + "grad_norm": 0.4266254212060104, + "learning_rate": 2.0187146376949852e-07, + "loss": 0.1519, + "step": 3895 + }, + { + "epoch": 0.8777987439096516, + "grad_norm": 0.47997273599386037, + "learning_rate": 2.0113873265468875e-07, + "loss": 0.1737, + "step": 3896 + }, + { + "epoch": 0.87802405159546, + "grad_norm": 0.4584781235840831, + "learning_rate": 2.0040727801185323e-07, + "loss": 0.1644, + "step": 3897 + }, + { + "epoch": 0.8782493592812685, + "grad_norm": 0.42055662677918965, + "learning_rate": 1.996771002471415e-07, + "loss": 0.15, + "step": 3898 + }, + { + "epoch": 0.8784746669670769, + "grad_norm": 0.4336517684316923, + "learning_rate": 1.9894819976599338e-07, + "loss": 0.1598, + "step": 3899 + }, + { + "epoch": 0.8786999746528853, + "grad_norm": 0.4199442466684665, + "learning_rate": 1.9822057697314102e-07, + "loss": 0.1555, + "step": 3900 + }, + { + "epoch": 0.8789252823386938, + "grad_norm": 0.44284189963556275, + "learning_rate": 1.9749423227260533e-07, + "loss": 0.1497, + "step": 3901 + }, + { + "epoch": 0.8791505900245022, + "grad_norm": 0.4368997794102865, + "learning_rate": 1.9676916606769874e-07, + "loss": 0.1652, + "step": 3902 + }, + { + "epoch": 0.8793758977103107, + "grad_norm": 0.4299620110919292, + "learning_rate": 1.9604537876102448e-07, + "loss": 0.1586, + "step": 3903 + }, + { + "epoch": 0.8796012053961191, + "grad_norm": 0.42598006551108003, + "learning_rate": 1.9532287075447325e-07, + "loss": 0.1563, + "step": 3904 + }, + { + "epoch": 0.8798265130819275, + "grad_norm": 0.456264261330521, + "learning_rate": 1.9460164244922698e-07, + "loss": 0.1607, + "step": 3905 + }, + { + "epoch": 0.8800518207677359, + "grad_norm": 0.45608484227009527, + "learning_rate": 1.9388169424575802e-07, + "loss": 0.1597, + "step": 3906 + }, + { + "epoch": 0.8802771284535443, + "grad_norm": 0.44234838105517343, + "learning_rate": 1.9316302654382528e-07, + "loss": 0.1546, + "step": 3907 + }, + { + "epoch": 0.8805024361393528, + "grad_norm": 0.44860209209839386, + "learning_rate": 1.9244563974247953e-07, + "loss": 0.1661, + "step": 3908 + }, + { + "epoch": 0.8807277438251613, + "grad_norm": 0.4588476285276534, + "learning_rate": 1.917295342400577e-07, + "loss": 0.1719, + "step": 3909 + }, + { + "epoch": 0.8809530515109697, + "grad_norm": 0.41726310850815207, + "learning_rate": 1.910147104341875e-07, + "loss": 0.15, + "step": 3910 + }, + { + "epoch": 0.8811783591967781, + "grad_norm": 0.4508361447920559, + "learning_rate": 1.9030116872178317e-07, + "loss": 0.1774, + "step": 3911 + }, + { + "epoch": 0.8814036668825865, + "grad_norm": 0.46320765936651387, + "learning_rate": 1.8958890949904802e-07, + "loss": 0.1797, + "step": 3912 + }, + { + "epoch": 0.881628974568395, + "grad_norm": 0.44474896231935784, + "learning_rate": 1.8887793316147373e-07, + "loss": 0.1653, + "step": 3913 + }, + { + "epoch": 0.8818542822542034, + "grad_norm": 0.4664350846493278, + "learning_rate": 1.881682401038379e-07, + "loss": 0.1571, + "step": 3914 + }, + { + "epoch": 0.8820795899400118, + "grad_norm": 0.41130903332171925, + "learning_rate": 1.8745983072020774e-07, + "loss": 0.15, + "step": 3915 + }, + { + "epoch": 0.8823048976258202, + "grad_norm": 0.47604540469134987, + "learning_rate": 1.8675270540393532e-07, + "loss": 0.166, + "step": 3916 + }, + { + "epoch": 0.8825302053116287, + "grad_norm": 0.45951964595782535, + "learning_rate": 1.8604686454766208e-07, + "loss": 0.1732, + "step": 3917 + }, + { + "epoch": 0.8827555129974372, + "grad_norm": 0.44953194571830196, + "learning_rate": 1.8534230854331454e-07, + "loss": 0.1681, + "step": 3918 + }, + { + "epoch": 0.8829808206832456, + "grad_norm": 0.4564109630485002, + "learning_rate": 1.8463903778210612e-07, + "loss": 0.1764, + "step": 3919 + }, + { + "epoch": 0.883206128369054, + "grad_norm": 0.4361897089574454, + "learning_rate": 1.8393705265453838e-07, + "loss": 0.1649, + "step": 3920 + }, + { + "epoch": 0.8834314360548624, + "grad_norm": 0.4499533260925918, + "learning_rate": 1.832363535503956e-07, + "loss": 0.1691, + "step": 3921 + }, + { + "epoch": 0.8836567437406708, + "grad_norm": 0.4308753161875606, + "learning_rate": 1.8253694085875047e-07, + "loss": 0.1472, + "step": 3922 + }, + { + "epoch": 0.8838820514264792, + "grad_norm": 0.4559721044480246, + "learning_rate": 1.8183881496796146e-07, + "loss": 0.1754, + "step": 3923 + }, + { + "epoch": 0.8841073591122878, + "grad_norm": 0.460635613545786, + "learning_rate": 1.8114197626567105e-07, + "loss": 0.18, + "step": 3924 + }, + { + "epoch": 0.8843326667980962, + "grad_norm": 0.46325341077251553, + "learning_rate": 1.8044642513880827e-07, + "loss": 0.1765, + "step": 3925 + }, + { + "epoch": 0.8845579744839046, + "grad_norm": 0.4617288110464993, + "learning_rate": 1.7975216197358648e-07, + "loss": 0.1643, + "step": 3926 + }, + { + "epoch": 0.884783282169713, + "grad_norm": 0.4439565217991711, + "learning_rate": 1.790591871555039e-07, + "loss": 0.1625, + "step": 3927 + }, + { + "epoch": 0.8850085898555214, + "grad_norm": 0.40371533836410345, + "learning_rate": 1.7836750106934475e-07, + "loss": 0.1496, + "step": 3928 + }, + { + "epoch": 0.8852338975413299, + "grad_norm": 0.44639730240313197, + "learning_rate": 1.776771040991751e-07, + "loss": 0.1618, + "step": 3929 + }, + { + "epoch": 0.8854592052271383, + "grad_norm": 0.46724100546770003, + "learning_rate": 1.7698799662834776e-07, + "loss": 0.1706, + "step": 3930 + }, + { + "epoch": 0.8856845129129467, + "grad_norm": 0.4360807856267971, + "learning_rate": 1.7630017903949775e-07, + "loss": 0.1604, + "step": 3931 + }, + { + "epoch": 0.8859098205987552, + "grad_norm": 0.47280295964478986, + "learning_rate": 1.7561365171454488e-07, + "loss": 0.1746, + "step": 3932 + }, + { + "epoch": 0.8861351282845636, + "grad_norm": 0.44140927064770286, + "learning_rate": 1.7492841503469165e-07, + "loss": 0.1575, + "step": 3933 + }, + { + "epoch": 0.8863604359703721, + "grad_norm": 0.43009307819901993, + "learning_rate": 1.7424446938042517e-07, + "loss": 0.1634, + "step": 3934 + }, + { + "epoch": 0.8865857436561805, + "grad_norm": 0.45954351184773873, + "learning_rate": 1.7356181513151464e-07, + "loss": 0.157, + "step": 3935 + }, + { + "epoch": 0.8868110513419889, + "grad_norm": 0.4134526736022512, + "learning_rate": 1.7288045266701247e-07, + "loss": 0.1436, + "step": 3936 + }, + { + "epoch": 0.8870363590277973, + "grad_norm": 0.4387411197513202, + "learning_rate": 1.7220038236525406e-07, + "loss": 0.1664, + "step": 3937 + }, + { + "epoch": 0.8872616667136057, + "grad_norm": 0.4215894006238945, + "learning_rate": 1.7152160460385703e-07, + "loss": 0.1544, + "step": 3938 + }, + { + "epoch": 0.8874869743994142, + "grad_norm": 0.46158213947249405, + "learning_rate": 1.7084411975972076e-07, + "loss": 0.1713, + "step": 3939 + }, + { + "epoch": 0.8877122820852227, + "grad_norm": 0.4630506230019692, + "learning_rate": 1.701679282090285e-07, + "loss": 0.1724, + "step": 3940 + }, + { + "epoch": 0.8879375897710311, + "grad_norm": 0.44845468522949833, + "learning_rate": 1.6949303032724297e-07, + "loss": 0.16, + "step": 3941 + }, + { + "epoch": 0.8881628974568395, + "grad_norm": 0.45472804150515667, + "learning_rate": 1.6881942648911077e-07, + "loss": 0.1559, + "step": 3942 + }, + { + "epoch": 0.8883882051426479, + "grad_norm": 0.4590125877737792, + "learning_rate": 1.6814711706865827e-07, + "loss": 0.1681, + "step": 3943 + }, + { + "epoch": 0.8886135128284564, + "grad_norm": 0.4411134236375282, + "learning_rate": 1.6747610243919437e-07, + "loss": 0.1596, + "step": 3944 + }, + { + "epoch": 0.8888388205142648, + "grad_norm": 0.45341729169844436, + "learning_rate": 1.6680638297330854e-07, + "loss": 0.1662, + "step": 3945 + }, + { + "epoch": 0.8890641282000732, + "grad_norm": 0.4750519276878271, + "learning_rate": 1.661379590428705e-07, + "loss": 0.1783, + "step": 3946 + }, + { + "epoch": 0.8892894358858816, + "grad_norm": 0.45338447209795535, + "learning_rate": 1.6547083101903173e-07, + "loss": 0.1652, + "step": 3947 + }, + { + "epoch": 0.8895147435716901, + "grad_norm": 0.4794439119896138, + "learning_rate": 1.6480499927222283e-07, + "loss": 0.178, + "step": 3948 + }, + { + "epoch": 0.8897400512574986, + "grad_norm": 0.4209327461598414, + "learning_rate": 1.641404641721561e-07, + "loss": 0.1488, + "step": 3949 + }, + { + "epoch": 0.889965358943307, + "grad_norm": 0.4465517385552874, + "learning_rate": 1.6347722608782284e-07, + "loss": 0.1703, + "step": 3950 + }, + { + "epoch": 0.8901906666291154, + "grad_norm": 0.420311737887731, + "learning_rate": 1.6281528538749425e-07, + "loss": 0.1536, + "step": 3951 + }, + { + "epoch": 0.8904159743149238, + "grad_norm": 0.4375541603718475, + "learning_rate": 1.6215464243872186e-07, + "loss": 0.1576, + "step": 3952 + }, + { + "epoch": 0.8906412820007322, + "grad_norm": 0.47976237711886216, + "learning_rate": 1.6149529760833504e-07, + "loss": 0.1732, + "step": 3953 + }, + { + "epoch": 0.8908665896865406, + "grad_norm": 0.3983779330892615, + "learning_rate": 1.608372512624448e-07, + "loss": 0.1432, + "step": 3954 + }, + { + "epoch": 0.8910918973723491, + "grad_norm": 0.4457110297169645, + "learning_rate": 1.6018050376643863e-07, + "loss": 0.1552, + "step": 3955 + }, + { + "epoch": 0.8913172050581576, + "grad_norm": 0.44530920286040226, + "learning_rate": 1.595250554849842e-07, + "loss": 0.1639, + "step": 3956 + }, + { + "epoch": 0.891542512743966, + "grad_norm": 0.4453177209739375, + "learning_rate": 1.5887090678202793e-07, + "loss": 0.1627, + "step": 3957 + }, + { + "epoch": 0.8917678204297744, + "grad_norm": 0.441765383020165, + "learning_rate": 1.5821805802079343e-07, + "loss": 0.1628, + "step": 3958 + }, + { + "epoch": 0.8919931281155828, + "grad_norm": 0.44257849791139087, + "learning_rate": 1.5756650956378377e-07, + "loss": 0.1707, + "step": 3959 + }, + { + "epoch": 0.8922184358013913, + "grad_norm": 0.4294573443584964, + "learning_rate": 1.5691626177277986e-07, + "loss": 0.1598, + "step": 3960 + }, + { + "epoch": 0.8924437434871997, + "grad_norm": 0.4336279934980086, + "learning_rate": 1.5626731500883951e-07, + "loss": 0.1612, + "step": 3961 + }, + { + "epoch": 0.8926690511730081, + "grad_norm": 0.47470511044670816, + "learning_rate": 1.5561966963229925e-07, + "loss": 0.1641, + "step": 3962 + }, + { + "epoch": 0.8928943588588165, + "grad_norm": 0.4568257942129516, + "learning_rate": 1.5497332600277137e-07, + "loss": 0.1638, + "step": 3963 + }, + { + "epoch": 0.893119666544625, + "grad_norm": 0.43741766315050706, + "learning_rate": 1.5432828447914743e-07, + "loss": 0.1692, + "step": 3964 + }, + { + "epoch": 0.8933449742304335, + "grad_norm": 0.4241996272329371, + "learning_rate": 1.5368454541959453e-07, + "loss": 0.1504, + "step": 3965 + }, + { + "epoch": 0.8935702819162419, + "grad_norm": 0.4819281946689036, + "learning_rate": 1.5304210918155677e-07, + "loss": 0.179, + "step": 3966 + }, + { + "epoch": 0.8937955896020503, + "grad_norm": 0.4301197811756299, + "learning_rate": 1.524009761217557e-07, + "loss": 0.1578, + "step": 3967 + }, + { + "epoch": 0.8940208972878587, + "grad_norm": 0.42725048602214344, + "learning_rate": 1.5176114659618796e-07, + "loss": 0.1588, + "step": 3968 + }, + { + "epoch": 0.8942462049736671, + "grad_norm": 0.43973456645763237, + "learning_rate": 1.5112262096012743e-07, + "loss": 0.1543, + "step": 3969 + }, + { + "epoch": 0.8944715126594756, + "grad_norm": 0.4347875070135398, + "learning_rate": 1.5048539956812324e-07, + "loss": 0.1635, + "step": 3970 + }, + { + "epoch": 0.8946968203452841, + "grad_norm": 0.48076984368406084, + "learning_rate": 1.4984948277400074e-07, + "loss": 0.1791, + "step": 3971 + }, + { + "epoch": 0.8949221280310925, + "grad_norm": 0.4283254441008252, + "learning_rate": 1.4921487093086134e-07, + "loss": 0.1699, + "step": 3972 + }, + { + "epoch": 0.8951474357169009, + "grad_norm": 0.42877276093374306, + "learning_rate": 1.4858156439108097e-07, + "loss": 0.1422, + "step": 3973 + }, + { + "epoch": 0.8953727434027093, + "grad_norm": 0.42431319448042076, + "learning_rate": 1.4794956350631106e-07, + "loss": 0.1658, + "step": 3974 + }, + { + "epoch": 0.8955980510885178, + "grad_norm": 0.4624182484344259, + "learning_rate": 1.473188686274782e-07, + "loss": 0.1637, + "step": 3975 + }, + { + "epoch": 0.8958233587743262, + "grad_norm": 0.4671839148848571, + "learning_rate": 1.4668948010478358e-07, + "loss": 0.1709, + "step": 3976 + }, + { + "epoch": 0.8960486664601346, + "grad_norm": 0.45885186405205436, + "learning_rate": 1.4606139828770378e-07, + "loss": 0.1748, + "step": 3977 + }, + { + "epoch": 0.896273974145943, + "grad_norm": 0.4385898898358086, + "learning_rate": 1.4543462352498844e-07, + "loss": 0.1578, + "step": 3978 + }, + { + "epoch": 0.8964992818317515, + "grad_norm": 0.41628849360498893, + "learning_rate": 1.448091561646628e-07, + "loss": 0.1627, + "step": 3979 + }, + { + "epoch": 0.89672458951756, + "grad_norm": 0.4422635667375076, + "learning_rate": 1.4418499655402512e-07, + "loss": 0.1561, + "step": 3980 + }, + { + "epoch": 0.8969498972033684, + "grad_norm": 0.4448828924549423, + "learning_rate": 1.435621450396485e-07, + "loss": 0.1627, + "step": 3981 + }, + { + "epoch": 0.8971752048891768, + "grad_norm": 0.4416954130780351, + "learning_rate": 1.4294060196737874e-07, + "loss": 0.155, + "step": 3982 + }, + { + "epoch": 0.8974005125749852, + "grad_norm": 0.4673780692466863, + "learning_rate": 1.4232036768233565e-07, + "loss": 0.1711, + "step": 3983 + }, + { + "epoch": 0.8976258202607936, + "grad_norm": 0.4329903819599156, + "learning_rate": 1.417014425289126e-07, + "loss": 0.1564, + "step": 3984 + }, + { + "epoch": 0.897851127946602, + "grad_norm": 0.4608735969788913, + "learning_rate": 1.4108382685077498e-07, + "loss": 0.1582, + "step": 3985 + }, + { + "epoch": 0.8980764356324105, + "grad_norm": 0.46239912947558653, + "learning_rate": 1.4046752099086236e-07, + "loss": 0.1703, + "step": 3986 + }, + { + "epoch": 0.898301743318219, + "grad_norm": 0.465490846974827, + "learning_rate": 1.398525252913857e-07, + "loss": 0.1727, + "step": 3987 + }, + { + "epoch": 0.8985270510040274, + "grad_norm": 0.4266814336013784, + "learning_rate": 1.3923884009382994e-07, + "loss": 0.1599, + "step": 3988 + }, + { + "epoch": 0.8987523586898358, + "grad_norm": 0.44808687723891033, + "learning_rate": 1.3862646573895134e-07, + "loss": 0.1681, + "step": 3989 + }, + { + "epoch": 0.8989776663756442, + "grad_norm": 0.43352134533818076, + "learning_rate": 1.380154025667782e-07, + "loss": 0.1601, + "step": 3990 + }, + { + "epoch": 0.8992029740614527, + "grad_norm": 0.45005256398911253, + "learning_rate": 1.374056509166119e-07, + "loss": 0.1672, + "step": 3991 + }, + { + "epoch": 0.8994282817472611, + "grad_norm": 0.4143410376925842, + "learning_rate": 1.367972111270241e-07, + "loss": 0.1496, + "step": 3992 + }, + { + "epoch": 0.8996535894330695, + "grad_norm": 0.42153627508726516, + "learning_rate": 1.3619008353585873e-07, + "loss": 0.161, + "step": 3993 + }, + { + "epoch": 0.8998788971188779, + "grad_norm": 0.4714636488961409, + "learning_rate": 1.3558426848023165e-07, + "loss": 0.1669, + "step": 3994 + }, + { + "epoch": 0.9001042048046864, + "grad_norm": 0.4232270815358733, + "learning_rate": 1.3497976629652882e-07, + "loss": 0.1531, + "step": 3995 + }, + { + "epoch": 0.9003295124904949, + "grad_norm": 0.4459821539358213, + "learning_rate": 1.3437657732040783e-07, + "loss": 0.1591, + "step": 3996 + }, + { + "epoch": 0.9005548201763033, + "grad_norm": 0.45269705914031105, + "learning_rate": 1.337747018867977e-07, + "loss": 0.1669, + "step": 3997 + }, + { + "epoch": 0.9007801278621117, + "grad_norm": 0.4531291932672073, + "learning_rate": 1.3317414032989668e-07, + "loss": 0.1652, + "step": 3998 + }, + { + "epoch": 0.9010054355479201, + "grad_norm": 0.44132911705786904, + "learning_rate": 1.3257489298317466e-07, + "loss": 0.158, + "step": 3999 + }, + { + "epoch": 0.9012307432337285, + "grad_norm": 0.4152174532542618, + "learning_rate": 1.3197696017937106e-07, + "loss": 0.1486, + "step": 4000 + }, + { + "epoch": 0.9012307432337285, + "eval_loss": 0.1641092747449875, + "eval_runtime": 57.1541, + "eval_samples_per_second": 50.215, + "eval_steps_per_second": 6.281, + "step": 4000 + }, + { + "epoch": 0.901456050919537, + "grad_norm": 0.4390831240427002, + "learning_rate": 1.3138034225049583e-07, + "loss": 0.1598, + "step": 4001 + }, + { + "epoch": 0.9016813586053454, + "grad_norm": 0.4551929603207974, + "learning_rate": 1.3078503952782845e-07, + "loss": 0.1674, + "step": 4002 + }, + { + "epoch": 0.9019066662911539, + "grad_norm": 0.4615680710621891, + "learning_rate": 1.3019105234191865e-07, + "loss": 0.173, + "step": 4003 + }, + { + "epoch": 0.9021319739769623, + "grad_norm": 0.4658375816098007, + "learning_rate": 1.2959838102258537e-07, + "loss": 0.1646, + "step": 4004 + }, + { + "epoch": 0.9023572816627707, + "grad_norm": 0.43150664563670893, + "learning_rate": 1.2900702589891652e-07, + "loss": 0.1567, + "step": 4005 + }, + { + "epoch": 0.9025825893485792, + "grad_norm": 0.4528535722707677, + "learning_rate": 1.2841698729927022e-07, + "loss": 0.1741, + "step": 4006 + }, + { + "epoch": 0.9028078970343876, + "grad_norm": 0.44494803835170044, + "learning_rate": 1.278282655512722e-07, + "loss": 0.169, + "step": 4007 + }, + { + "epoch": 0.903033204720196, + "grad_norm": 0.41500574404115415, + "learning_rate": 1.272408609818182e-07, + "loss": 0.1539, + "step": 4008 + }, + { + "epoch": 0.9032585124060044, + "grad_norm": 0.4091609731220559, + "learning_rate": 1.2665477391707203e-07, + "loss": 0.1411, + "step": 4009 + }, + { + "epoch": 0.9034838200918128, + "grad_norm": 0.4293767486048136, + "learning_rate": 1.2607000468246533e-07, + "loss": 0.1541, + "step": 4010 + }, + { + "epoch": 0.9037091277776214, + "grad_norm": 0.4383956064376272, + "learning_rate": 1.2548655360269974e-07, + "loss": 0.1645, + "step": 4011 + }, + { + "epoch": 0.9039344354634298, + "grad_norm": 0.4553050653734639, + "learning_rate": 1.2490442100174278e-07, + "loss": 0.1642, + "step": 4012 + }, + { + "epoch": 0.9041597431492382, + "grad_norm": 0.4442857114306503, + "learning_rate": 1.243236072028317e-07, + "loss": 0.1618, + "step": 4013 + }, + { + "epoch": 0.9043850508350466, + "grad_norm": 0.42097656519520466, + "learning_rate": 1.237441125284708e-07, + "loss": 0.1535, + "step": 4014 + }, + { + "epoch": 0.904610358520855, + "grad_norm": 0.4429325126025378, + "learning_rate": 1.2316593730043154e-07, + "loss": 0.1673, + "step": 4015 + }, + { + "epoch": 0.9048356662066634, + "grad_norm": 0.42661217028134385, + "learning_rate": 1.2258908183975322e-07, + "loss": 0.1493, + "step": 4016 + }, + { + "epoch": 0.9050609738924719, + "grad_norm": 0.4200944866412309, + "learning_rate": 1.2201354646674212e-07, + "loss": 0.1513, + "step": 4017 + }, + { + "epoch": 0.9052862815782804, + "grad_norm": 0.46997945676482433, + "learning_rate": 1.2143933150097154e-07, + "loss": 0.1663, + "step": 4018 + }, + { + "epoch": 0.9055115892640888, + "grad_norm": 0.4378096742992082, + "learning_rate": 1.2086643726128194e-07, + "loss": 0.1546, + "step": 4019 + }, + { + "epoch": 0.9057368969498972, + "grad_norm": 0.44925781218817923, + "learning_rate": 1.2029486406577972e-07, + "loss": 0.1581, + "step": 4020 + }, + { + "epoch": 0.9059622046357056, + "grad_norm": 0.44536767135639843, + "learning_rate": 1.1972461223183878e-07, + "loss": 0.1611, + "step": 4021 + }, + { + "epoch": 0.9061875123215141, + "grad_norm": 0.4634406209210946, + "learning_rate": 1.191556820760978e-07, + "loss": 0.1746, + "step": 4022 + }, + { + "epoch": 0.9064128200073225, + "grad_norm": 0.4478808730548626, + "learning_rate": 1.1858807391446319e-07, + "loss": 0.162, + "step": 4023 + }, + { + "epoch": 0.9066381276931309, + "grad_norm": 0.4646411307345472, + "learning_rate": 1.1802178806210624e-07, + "loss": 0.1699, + "step": 4024 + }, + { + "epoch": 0.9068634353789393, + "grad_norm": 0.4494953891763993, + "learning_rate": 1.1745682483346454e-07, + "loss": 0.1697, + "step": 4025 + }, + { + "epoch": 0.9070887430647478, + "grad_norm": 0.4160041663486042, + "learning_rate": 1.1689318454224191e-07, + "loss": 0.1479, + "step": 4026 + }, + { + "epoch": 0.9073140507505563, + "grad_norm": 0.4232320965546384, + "learning_rate": 1.1633086750140521e-07, + "loss": 0.142, + "step": 4027 + }, + { + "epoch": 0.9075393584363647, + "grad_norm": 0.45225565323399175, + "learning_rate": 1.1576987402318884e-07, + "loss": 0.1632, + "step": 4028 + }, + { + "epoch": 0.9077646661221731, + "grad_norm": 0.43827622258033266, + "learning_rate": 1.1521020441909226e-07, + "loss": 0.1583, + "step": 4029 + }, + { + "epoch": 0.9079899738079815, + "grad_norm": 0.460393439053109, + "learning_rate": 1.1465185899987797e-07, + "loss": 0.1702, + "step": 4030 + }, + { + "epoch": 0.9082152814937899, + "grad_norm": 0.4564588598094725, + "learning_rate": 1.140948380755752e-07, + "loss": 0.17, + "step": 4031 + }, + { + "epoch": 0.9084405891795984, + "grad_norm": 0.4546478498809176, + "learning_rate": 1.1353914195547655e-07, + "loss": 0.1656, + "step": 4032 + }, + { + "epoch": 0.9086658968654068, + "grad_norm": 0.41434026326700973, + "learning_rate": 1.1298477094813965e-07, + "loss": 0.1452, + "step": 4033 + }, + { + "epoch": 0.9088912045512153, + "grad_norm": 0.4659642681740653, + "learning_rate": 1.1243172536138547e-07, + "loss": 0.177, + "step": 4034 + }, + { + "epoch": 0.9091165122370237, + "grad_norm": 0.4494399847181398, + "learning_rate": 1.1188000550230005e-07, + "loss": 0.1607, + "step": 4035 + }, + { + "epoch": 0.9093418199228321, + "grad_norm": 0.4686002734934923, + "learning_rate": 1.1132961167723305e-07, + "loss": 0.1783, + "step": 4036 + }, + { + "epoch": 0.9095671276086406, + "grad_norm": 0.46053893206776747, + "learning_rate": 1.1078054419179724e-07, + "loss": 0.1696, + "step": 4037 + }, + { + "epoch": 0.909792435294449, + "grad_norm": 0.4624828290496912, + "learning_rate": 1.1023280335086956e-07, + "loss": 0.1659, + "step": 4038 + }, + { + "epoch": 0.9100177429802574, + "grad_norm": 0.46394668712722625, + "learning_rate": 1.0968638945858978e-07, + "loss": 0.1646, + "step": 4039 + }, + { + "epoch": 0.9102430506660658, + "grad_norm": 0.4628292317786987, + "learning_rate": 1.091413028183616e-07, + "loss": 0.1791, + "step": 4040 + }, + { + "epoch": 0.9104683583518742, + "grad_norm": 0.4400414224551469, + "learning_rate": 1.0859754373285125e-07, + "loss": 0.1718, + "step": 4041 + }, + { + "epoch": 0.9106936660376828, + "grad_norm": 0.4276572666580527, + "learning_rate": 1.0805511250398748e-07, + "loss": 0.1552, + "step": 4042 + }, + { + "epoch": 0.9109189737234912, + "grad_norm": 0.42453681976767615, + "learning_rate": 1.0751400943296269e-07, + "loss": 0.1534, + "step": 4043 + }, + { + "epoch": 0.9111442814092996, + "grad_norm": 0.4516334991313648, + "learning_rate": 1.06974234820231e-07, + "loss": 0.1518, + "step": 4044 + }, + { + "epoch": 0.911369589095108, + "grad_norm": 0.4170033302016585, + "learning_rate": 1.0643578896550877e-07, + "loss": 0.1589, + "step": 4045 + }, + { + "epoch": 0.9115948967809164, + "grad_norm": 0.461560070653569, + "learning_rate": 1.0589867216777544e-07, + "loss": 0.1705, + "step": 4046 + }, + { + "epoch": 0.9118202044667248, + "grad_norm": 0.4473968403488794, + "learning_rate": 1.0536288472527162e-07, + "loss": 0.1595, + "step": 4047 + }, + { + "epoch": 0.9120455121525333, + "grad_norm": 0.44697666549313836, + "learning_rate": 1.0482842693550044e-07, + "loss": 0.1516, + "step": 4048 + }, + { + "epoch": 0.9122708198383417, + "grad_norm": 0.42387073075499676, + "learning_rate": 1.042952990952259e-07, + "loss": 0.1565, + "step": 4049 + }, + { + "epoch": 0.9124961275241502, + "grad_norm": 0.4805349857821692, + "learning_rate": 1.0376350150047427e-07, + "loss": 0.1679, + "step": 4050 + }, + { + "epoch": 0.9127214352099586, + "grad_norm": 0.4189678898115683, + "learning_rate": 1.032330344465332e-07, + "loss": 0.1507, + "step": 4051 + }, + { + "epoch": 0.912946742895767, + "grad_norm": 0.4595453963379181, + "learning_rate": 1.0270389822795073e-07, + "loss": 0.1579, + "step": 4052 + }, + { + "epoch": 0.9131720505815755, + "grad_norm": 0.42052429010820647, + "learning_rate": 1.0217609313853738e-07, + "loss": 0.1567, + "step": 4053 + }, + { + "epoch": 0.9133973582673839, + "grad_norm": 0.4481906414377505, + "learning_rate": 1.0164961947136232e-07, + "loss": 0.1606, + "step": 4054 + }, + { + "epoch": 0.9136226659531923, + "grad_norm": 0.46076198282351655, + "learning_rate": 1.0112447751875809e-07, + "loss": 0.1711, + "step": 4055 + }, + { + "epoch": 0.9138479736390007, + "grad_norm": 0.45507406664498157, + "learning_rate": 1.0060066757231535e-07, + "loss": 0.1514, + "step": 4056 + }, + { + "epoch": 0.9140732813248091, + "grad_norm": 0.4305256138965196, + "learning_rate": 1.0007818992288671e-07, + "loss": 0.1459, + "step": 4057 + }, + { + "epoch": 0.9142985890106177, + "grad_norm": 0.4259152269153248, + "learning_rate": 9.955704486058482e-08, + "loss": 0.1543, + "step": 4058 + }, + { + "epoch": 0.9145238966964261, + "grad_norm": 0.4429284553343684, + "learning_rate": 9.903723267478154e-08, + "loss": 0.1492, + "step": 4059 + }, + { + "epoch": 0.9147492043822345, + "grad_norm": 0.4470400807470827, + "learning_rate": 9.85187536541099e-08, + "loss": 0.1528, + "step": 4060 + }, + { + "epoch": 0.9149745120680429, + "grad_norm": 0.4243384980173835, + "learning_rate": 9.800160808646154e-08, + "loss": 0.1484, + "step": 4061 + }, + { + "epoch": 0.9151998197538513, + "grad_norm": 0.43323021678161894, + "learning_rate": 9.748579625898758e-08, + "loss": 0.1583, + "step": 4062 + }, + { + "epoch": 0.9154251274396598, + "grad_norm": 0.46647106026959867, + "learning_rate": 9.697131845810032e-08, + "loss": 0.1692, + "step": 4063 + }, + { + "epoch": 0.9156504351254682, + "grad_norm": 0.45024196295577756, + "learning_rate": 9.645817496946902e-08, + "loss": 0.1604, + "step": 4064 + }, + { + "epoch": 0.9158757428112767, + "grad_norm": 0.47217901993001604, + "learning_rate": 9.594636607802355e-08, + "loss": 0.1757, + "step": 4065 + }, + { + "epoch": 0.9161010504970851, + "grad_norm": 0.45536743281664355, + "learning_rate": 9.54358920679524e-08, + "loss": 0.1717, + "step": 4066 + }, + { + "epoch": 0.9163263581828935, + "grad_norm": 0.44621423323158804, + "learning_rate": 9.492675322270273e-08, + "loss": 0.1586, + "step": 4067 + }, + { + "epoch": 0.916551665868702, + "grad_norm": 0.42612973215674155, + "learning_rate": 9.441894982498035e-08, + "loss": 0.144, + "step": 4068 + }, + { + "epoch": 0.9167769735545104, + "grad_norm": 0.4894229485704963, + "learning_rate": 9.391248215674942e-08, + "loss": 0.1832, + "step": 4069 + }, + { + "epoch": 0.9170022812403188, + "grad_norm": 0.4562211469487757, + "learning_rate": 9.340735049923277e-08, + "loss": 0.1631, + "step": 4070 + }, + { + "epoch": 0.9172275889261272, + "grad_norm": 0.40542397504716665, + "learning_rate": 9.290355513291105e-08, + "loss": 0.1397, + "step": 4071 + }, + { + "epoch": 0.9174528966119356, + "grad_norm": 0.4136552690525865, + "learning_rate": 9.24010963375227e-08, + "loss": 0.1562, + "step": 4072 + }, + { + "epoch": 0.9176782042977442, + "grad_norm": 0.4686243145081077, + "learning_rate": 9.189997439206538e-08, + "loss": 0.1869, + "step": 4073 + }, + { + "epoch": 0.9179035119835526, + "grad_norm": 0.42018753527012986, + "learning_rate": 9.140018957479236e-08, + "loss": 0.1424, + "step": 4074 + }, + { + "epoch": 0.918128819669361, + "grad_norm": 0.43715252053448034, + "learning_rate": 9.090174216321607e-08, + "loss": 0.1586, + "step": 4075 + }, + { + "epoch": 0.9183541273551694, + "grad_norm": 0.4801359843051152, + "learning_rate": 9.040463243410541e-08, + "loss": 0.1876, + "step": 4076 + }, + { + "epoch": 0.9185794350409778, + "grad_norm": 0.43350132626475907, + "learning_rate": 8.990886066348764e-08, + "loss": 0.151, + "step": 4077 + }, + { + "epoch": 0.9188047427267862, + "grad_norm": 0.46950108357843434, + "learning_rate": 8.941442712664561e-08, + "loss": 0.1663, + "step": 4078 + }, + { + "epoch": 0.9190300504125947, + "grad_norm": 0.47315703999830916, + "learning_rate": 8.892133209811971e-08, + "loss": 0.1704, + "step": 4079 + }, + { + "epoch": 0.9192553580984031, + "grad_norm": 0.4323217019972271, + "learning_rate": 8.842957585170814e-08, + "loss": 0.1637, + "step": 4080 + }, + { + "epoch": 0.9194806657842116, + "grad_norm": 0.4306241422286178, + "learning_rate": 8.79391586604636e-08, + "loss": 0.1532, + "step": 4081 + }, + { + "epoch": 0.91970597347002, + "grad_norm": 0.5155779152775195, + "learning_rate": 8.745008079669742e-08, + "loss": 0.1692, + "step": 4082 + }, + { + "epoch": 0.9199312811558285, + "grad_norm": 0.44759029635907943, + "learning_rate": 8.696234253197599e-08, + "loss": 0.1777, + "step": 4083 + }, + { + "epoch": 0.9201565888416369, + "grad_norm": 0.4732234798727992, + "learning_rate": 8.647594413712212e-08, + "loss": 0.1653, + "step": 4084 + }, + { + "epoch": 0.9203818965274453, + "grad_norm": 0.43626619654908017, + "learning_rate": 8.599088588221504e-08, + "loss": 0.1528, + "step": 4085 + }, + { + "epoch": 0.9206072042132537, + "grad_norm": 0.45510190506163617, + "learning_rate": 8.550716803658904e-08, + "loss": 0.1682, + "step": 4086 + }, + { + "epoch": 0.9208325118990621, + "grad_norm": 0.46345553795971556, + "learning_rate": 8.502479086883481e-08, + "loss": 0.1811, + "step": 4087 + }, + { + "epoch": 0.9210578195848705, + "grad_norm": 0.45385691385842275, + "learning_rate": 8.454375464679865e-08, + "loss": 0.1753, + "step": 4088 + }, + { + "epoch": 0.9212831272706791, + "grad_norm": 0.47200165085193946, + "learning_rate": 8.406405963758162e-08, + "loss": 0.1803, + "step": 4089 + }, + { + "epoch": 0.9215084349564875, + "grad_norm": 0.44286574119433586, + "learning_rate": 8.358570610754097e-08, + "loss": 0.1668, + "step": 4090 + }, + { + "epoch": 0.9217337426422959, + "grad_norm": 0.4588154827200427, + "learning_rate": 8.310869432228808e-08, + "loss": 0.1756, + "step": 4091 + }, + { + "epoch": 0.9219590503281043, + "grad_norm": 0.42401047270531106, + "learning_rate": 8.263302454669025e-08, + "loss": 0.1543, + "step": 4092 + }, + { + "epoch": 0.9221843580139127, + "grad_norm": 0.45141692413228246, + "learning_rate": 8.215869704486873e-08, + "loss": 0.1753, + "step": 4093 + }, + { + "epoch": 0.9224096656997212, + "grad_norm": 0.42959942879138735, + "learning_rate": 8.168571208020032e-08, + "loss": 0.1618, + "step": 4094 + }, + { + "epoch": 0.9226349733855296, + "grad_norm": 0.4368310025822997, + "learning_rate": 8.121406991531577e-08, + "loss": 0.161, + "step": 4095 + }, + { + "epoch": 0.922860281071338, + "grad_norm": 0.4253456979542523, + "learning_rate": 8.074377081210033e-08, + "loss": 0.1502, + "step": 4096 + }, + { + "epoch": 0.9230855887571465, + "grad_norm": 0.4473027627020876, + "learning_rate": 8.027481503169371e-08, + "loss": 0.1642, + "step": 4097 + }, + { + "epoch": 0.9233108964429549, + "grad_norm": 0.41877617337081813, + "learning_rate": 7.980720283448957e-08, + "loss": 0.151, + "step": 4098 + }, + { + "epoch": 0.9235362041287634, + "grad_norm": 0.466712516476667, + "learning_rate": 7.934093448013492e-08, + "loss": 0.1781, + "step": 4099 + }, + { + "epoch": 0.9237615118145718, + "grad_norm": 0.5195968715240509, + "learning_rate": 7.887601022753238e-08, + "loss": 0.1563, + "step": 4100 + }, + { + "epoch": 0.9239868195003802, + "grad_norm": 0.426931606378247, + "learning_rate": 7.841243033483575e-08, + "loss": 0.1449, + "step": 4101 + }, + { + "epoch": 0.9242121271861886, + "grad_norm": 0.42445115860695914, + "learning_rate": 7.795019505945495e-08, + "loss": 0.1607, + "step": 4102 + }, + { + "epoch": 0.924437434871997, + "grad_norm": 0.4593296063569233, + "learning_rate": 7.748930465805105e-08, + "loss": 0.1675, + "step": 4103 + }, + { + "epoch": 0.9246627425578055, + "grad_norm": 0.5069157405385022, + "learning_rate": 7.702975938653934e-08, + "loss": 0.177, + "step": 4104 + }, + { + "epoch": 0.924888050243614, + "grad_norm": 0.45249905957815867, + "learning_rate": 7.657155950008904e-08, + "loss": 0.1628, + "step": 4105 + }, + { + "epoch": 0.9251133579294224, + "grad_norm": 0.4470408034544917, + "learning_rate": 7.611470525312054e-08, + "loss": 0.1586, + "step": 4106 + }, + { + "epoch": 0.9253386656152308, + "grad_norm": 0.44824218275477895, + "learning_rate": 7.565919689930839e-08, + "loss": 0.1679, + "step": 4107 + }, + { + "epoch": 0.9255639733010392, + "grad_norm": 0.4311253596324764, + "learning_rate": 7.520503469157947e-08, + "loss": 0.1537, + "step": 4108 + }, + { + "epoch": 0.9257892809868477, + "grad_norm": 0.4338287417067563, + "learning_rate": 7.47522188821126e-08, + "loss": 0.1635, + "step": 4109 + }, + { + "epoch": 0.9260145886726561, + "grad_norm": 0.40745849283977253, + "learning_rate": 7.430074972234053e-08, + "loss": 0.1488, + "step": 4110 + }, + { + "epoch": 0.9262398963584645, + "grad_norm": 0.43887503555701995, + "learning_rate": 7.385062746294608e-08, + "loss": 0.1516, + "step": 4111 + }, + { + "epoch": 0.9264652040442729, + "grad_norm": 0.4018627338608952, + "learning_rate": 7.340185235386627e-08, + "loss": 0.1411, + "step": 4112 + }, + { + "epoch": 0.9266905117300814, + "grad_norm": 0.44779873005909376, + "learning_rate": 7.29544246442887e-08, + "loss": 0.1652, + "step": 4113 + }, + { + "epoch": 0.9269158194158899, + "grad_norm": 0.44713179213657817, + "learning_rate": 7.250834458265355e-08, + "loss": 0.1537, + "step": 4114 + }, + { + "epoch": 0.9271411271016983, + "grad_norm": 0.4595762620734535, + "learning_rate": 7.206361241665266e-08, + "loss": 0.1586, + "step": 4115 + }, + { + "epoch": 0.9273664347875067, + "grad_norm": 0.43891180073271946, + "learning_rate": 7.162022839322824e-08, + "loss": 0.1563, + "step": 4116 + }, + { + "epoch": 0.9275917424733151, + "grad_norm": 0.4251442569115848, + "learning_rate": 7.117819275857613e-08, + "loss": 0.1538, + "step": 4117 + }, + { + "epoch": 0.9278170501591235, + "grad_norm": 0.47173707838677387, + "learning_rate": 7.073750575814136e-08, + "loss": 0.1716, + "step": 4118 + }, + { + "epoch": 0.9280423578449319, + "grad_norm": 0.43865943815067737, + "learning_rate": 7.029816763662129e-08, + "loss": 0.1596, + "step": 4119 + }, + { + "epoch": 0.9282676655307405, + "grad_norm": 0.42953320810707557, + "learning_rate": 6.986017863796435e-08, + "loss": 0.1495, + "step": 4120 + }, + { + "epoch": 0.9284929732165489, + "grad_norm": 0.450151011441947, + "learning_rate": 6.94235390053688e-08, + "loss": 0.1614, + "step": 4121 + }, + { + "epoch": 0.9287182809023573, + "grad_norm": 0.4488870867696547, + "learning_rate": 6.898824898128515e-08, + "loss": 0.1563, + "step": 4122 + }, + { + "epoch": 0.9289435885881657, + "grad_norm": 0.44611372762540913, + "learning_rate": 6.85543088074131e-08, + "loss": 0.1546, + "step": 4123 + }, + { + "epoch": 0.9291688962739741, + "grad_norm": 0.4519170245736647, + "learning_rate": 6.81217187247038e-08, + "loss": 0.164, + "step": 4124 + }, + { + "epoch": 0.9293942039597826, + "grad_norm": 0.477418536042043, + "learning_rate": 6.769047897335818e-08, + "loss": 0.1739, + "step": 4125 + }, + { + "epoch": 0.929619511645591, + "grad_norm": 0.43284690176866625, + "learning_rate": 6.726058979282774e-08, + "loss": 0.1473, + "step": 4126 + }, + { + "epoch": 0.9298448193313994, + "grad_norm": 0.43567280322868546, + "learning_rate": 6.683205142181404e-08, + "loss": 0.1524, + "step": 4127 + }, + { + "epoch": 0.9300701270172079, + "grad_norm": 0.44821993608506294, + "learning_rate": 6.640486409826785e-08, + "loss": 0.1631, + "step": 4128 + }, + { + "epoch": 0.9302954347030163, + "grad_norm": 0.43448726853801395, + "learning_rate": 6.597902805939138e-08, + "loss": 0.1561, + "step": 4129 + }, + { + "epoch": 0.9305207423888248, + "grad_norm": 0.451294823079318, + "learning_rate": 6.555454354163437e-08, + "loss": 0.1682, + "step": 4130 + }, + { + "epoch": 0.9307460500746332, + "grad_norm": 0.4127094736662087, + "learning_rate": 6.513141078069828e-08, + "loss": 0.1478, + "step": 4131 + }, + { + "epoch": 0.9309713577604416, + "grad_norm": 0.4580223232558796, + "learning_rate": 6.470963001153268e-08, + "loss": 0.174, + "step": 4132 + }, + { + "epoch": 0.93119666544625, + "grad_norm": 0.4517773719327146, + "learning_rate": 6.428920146833606e-08, + "loss": 0.1647, + "step": 4133 + }, + { + "epoch": 0.9314219731320584, + "grad_norm": 0.45523878422093644, + "learning_rate": 6.387012538455723e-08, + "loss": 0.1735, + "step": 4134 + }, + { + "epoch": 0.9316472808178669, + "grad_norm": 0.4406438489107548, + "learning_rate": 6.345240199289365e-08, + "loss": 0.1665, + "step": 4135 + }, + { + "epoch": 0.9318725885036754, + "grad_norm": 0.43195386406422925, + "learning_rate": 6.303603152529119e-08, + "loss": 0.1604, + "step": 4136 + }, + { + "epoch": 0.9320978961894838, + "grad_norm": 0.40824194921116946, + "learning_rate": 6.262101421294547e-08, + "loss": 0.1458, + "step": 4137 + }, + { + "epoch": 0.9323232038752922, + "grad_norm": 0.43004156169031205, + "learning_rate": 6.220735028629937e-08, + "loss": 0.148, + "step": 4138 + }, + { + "epoch": 0.9325485115611006, + "grad_norm": 0.46098831833775916, + "learning_rate": 6.179503997504554e-08, + "loss": 0.1592, + "step": 4139 + }, + { + "epoch": 0.932773819246909, + "grad_norm": 0.4474886257346589, + "learning_rate": 6.13840835081242e-08, + "loss": 0.1522, + "step": 4140 + }, + { + "epoch": 0.9329991269327175, + "grad_norm": 0.4621572883555727, + "learning_rate": 6.097448111372446e-08, + "loss": 0.178, + "step": 4141 + }, + { + "epoch": 0.9332244346185259, + "grad_norm": 0.45919417075603514, + "learning_rate": 6.056623301928327e-08, + "loss": 0.1744, + "step": 4142 + }, + { + "epoch": 0.9334497423043343, + "grad_norm": 0.4450156197293602, + "learning_rate": 6.015933945148517e-08, + "loss": 0.1472, + "step": 4143 + }, + { + "epoch": 0.9336750499901428, + "grad_norm": 0.44044171741468113, + "learning_rate": 5.975380063626356e-08, + "loss": 0.1587, + "step": 4144 + }, + { + "epoch": 0.9339003576759513, + "grad_norm": 0.46579665915691665, + "learning_rate": 5.934961679879836e-08, + "loss": 0.1584, + "step": 4145 + }, + { + "epoch": 0.9341256653617597, + "grad_norm": 0.4131121823345429, + "learning_rate": 5.894678816351862e-08, + "loss": 0.1552, + "step": 4146 + }, + { + "epoch": 0.9343509730475681, + "grad_norm": 0.4481976506747898, + "learning_rate": 5.854531495409932e-08, + "loss": 0.1661, + "step": 4147 + }, + { + "epoch": 0.9345762807333765, + "grad_norm": 0.4373053456506344, + "learning_rate": 5.8145197393463806e-08, + "loss": 0.1583, + "step": 4148 + }, + { + "epoch": 0.9348015884191849, + "grad_norm": 0.435042736487097, + "learning_rate": 5.774643570378296e-08, + "loss": 0.1487, + "step": 4149 + }, + { + "epoch": 0.9350268961049933, + "grad_norm": 0.4280559738533269, + "learning_rate": 5.73490301064733e-08, + "loss": 0.1545, + "step": 4150 + }, + { + "epoch": 0.9352522037908018, + "grad_norm": 0.43738350565545886, + "learning_rate": 5.695298082219997e-08, + "loss": 0.1749, + "step": 4151 + }, + { + "epoch": 0.9354775114766103, + "grad_norm": 0.42621478825743375, + "learning_rate": 5.6558288070874544e-08, + "loss": 0.1557, + "step": 4152 + }, + { + "epoch": 0.9357028191624187, + "grad_norm": 0.47572378767286966, + "learning_rate": 5.616495207165451e-08, + "loss": 0.1817, + "step": 4153 + }, + { + "epoch": 0.9359281268482271, + "grad_norm": 0.4497113659953152, + "learning_rate": 5.577297304294543e-08, + "loss": 0.1622, + "step": 4154 + }, + { + "epoch": 0.9361534345340355, + "grad_norm": 0.442728534073399, + "learning_rate": 5.538235120239821e-08, + "loss": 0.1557, + "step": 4155 + }, + { + "epoch": 0.936378742219844, + "grad_norm": 0.4217312031364034, + "learning_rate": 5.4993086766910733e-08, + "loss": 0.1555, + "step": 4156 + }, + { + "epoch": 0.9366040499056524, + "grad_norm": 0.44976795988005147, + "learning_rate": 5.460517995262704e-08, + "loss": 0.1653, + "step": 4157 + }, + { + "epoch": 0.9368293575914608, + "grad_norm": 0.4333953507267874, + "learning_rate": 5.421863097493707e-08, + "loss": 0.1627, + "step": 4158 + }, + { + "epoch": 0.9370546652772692, + "grad_norm": 0.45405743234083135, + "learning_rate": 5.383344004847774e-08, + "loss": 0.1707, + "step": 4159 + }, + { + "epoch": 0.9372799729630777, + "grad_norm": 0.43466121162033305, + "learning_rate": 5.344960738713018e-08, + "loss": 0.1537, + "step": 4160 + }, + { + "epoch": 0.9375052806488862, + "grad_norm": 0.47169293096334286, + "learning_rate": 5.3067133204023344e-08, + "loss": 0.1787, + "step": 4161 + }, + { + "epoch": 0.9377305883346946, + "grad_norm": 0.4231644979732609, + "learning_rate": 5.268601771153042e-08, + "loss": 0.1522, + "step": 4162 + }, + { + "epoch": 0.937955896020503, + "grad_norm": 0.4863900112115708, + "learning_rate": 5.230626112127046e-08, + "loss": 0.1547, + "step": 4163 + }, + { + "epoch": 0.9381812037063114, + "grad_norm": 0.44783194283540945, + "learning_rate": 5.192786364410868e-08, + "loss": 0.1591, + "step": 4164 + }, + { + "epoch": 0.9384065113921198, + "grad_norm": 0.4092019878061088, + "learning_rate": 5.15508254901545e-08, + "loss": 0.1456, + "step": 4165 + }, + { + "epoch": 0.9386318190779283, + "grad_norm": 0.4445823051916237, + "learning_rate": 5.117514686876379e-08, + "loss": 0.1575, + "step": 4166 + }, + { + "epoch": 0.9388571267637368, + "grad_norm": 0.4336429106735467, + "learning_rate": 5.080082798853664e-08, + "loss": 0.1581, + "step": 4167 + }, + { + "epoch": 0.9390824344495452, + "grad_norm": 0.44094930633089635, + "learning_rate": 5.0427869057317894e-08, + "loss": 0.1623, + "step": 4168 + }, + { + "epoch": 0.9393077421353536, + "grad_norm": 0.4551560341788107, + "learning_rate": 5.0056270282198286e-08, + "loss": 0.1603, + "step": 4169 + }, + { + "epoch": 0.939533049821162, + "grad_norm": 0.44921347078405843, + "learning_rate": 4.9686031869512486e-08, + "loss": 0.1588, + "step": 4170 + }, + { + "epoch": 0.9397583575069705, + "grad_norm": 0.4479139293266144, + "learning_rate": 4.93171540248405e-08, + "loss": 0.1532, + "step": 4171 + }, + { + "epoch": 0.9399836651927789, + "grad_norm": 0.4935446374802541, + "learning_rate": 4.89496369530057e-08, + "loss": 0.1789, + "step": 4172 + }, + { + "epoch": 0.9402089728785873, + "grad_norm": 0.45600249943482984, + "learning_rate": 4.858348085807735e-08, + "loss": 0.1641, + "step": 4173 + }, + { + "epoch": 0.9404342805643957, + "grad_norm": 0.47747442524131983, + "learning_rate": 4.8218685943368094e-08, + "loss": 0.1722, + "step": 4174 + }, + { + "epoch": 0.9406595882502042, + "grad_norm": 0.4706286518285177, + "learning_rate": 4.7855252411434516e-08, + "loss": 0.1735, + "step": 4175 + }, + { + "epoch": 0.9408848959360127, + "grad_norm": 0.4377685761606597, + "learning_rate": 4.7493180464078246e-08, + "loss": 0.157, + "step": 4176 + }, + { + "epoch": 0.9411102036218211, + "grad_norm": 0.4420334717835541, + "learning_rate": 4.713247030234402e-08, + "loss": 0.1649, + "step": 4177 + }, + { + "epoch": 0.9413355113076295, + "grad_norm": 0.4459972830528949, + "learning_rate": 4.677312212652108e-08, + "loss": 0.1555, + "step": 4178 + }, + { + "epoch": 0.9415608189934379, + "grad_norm": 0.4554547347603984, + "learning_rate": 4.641513613614174e-08, + "loss": 0.1633, + "step": 4179 + }, + { + "epoch": 0.9417861266792463, + "grad_norm": 0.4495415014053769, + "learning_rate": 4.605851252998256e-08, + "loss": 0.1644, + "step": 4180 + }, + { + "epoch": 0.9420114343650547, + "grad_norm": 0.44564535692954393, + "learning_rate": 4.570325150606292e-08, + "loss": 0.156, + "step": 4181 + }, + { + "epoch": 0.9422367420508632, + "grad_norm": 0.43067121422417426, + "learning_rate": 4.5349353261646414e-08, + "loss": 0.156, + "step": 4182 + }, + { + "epoch": 0.9424620497366717, + "grad_norm": 0.41080438222357724, + "learning_rate": 4.4996817993239464e-08, + "loss": 0.1389, + "step": 4183 + }, + { + "epoch": 0.9426873574224801, + "grad_norm": 0.4483835006499614, + "learning_rate": 4.464564589659187e-08, + "loss": 0.1744, + "step": 4184 + }, + { + "epoch": 0.9429126651082885, + "grad_norm": 0.455552776939051, + "learning_rate": 4.4295837166696e-08, + "loss": 0.1585, + "step": 4185 + }, + { + "epoch": 0.943137972794097, + "grad_norm": 0.4559687841396016, + "learning_rate": 4.3947391997787857e-08, + "loss": 0.1722, + "step": 4186 + }, + { + "epoch": 0.9433632804799054, + "grad_norm": 0.4429344786544215, + "learning_rate": 4.360031058334602e-08, + "loss": 0.1765, + "step": 4187 + }, + { + "epoch": 0.9435885881657138, + "grad_norm": 0.44926582345942434, + "learning_rate": 4.325459311609187e-08, + "loss": 0.166, + "step": 4188 + }, + { + "epoch": 0.9438138958515222, + "grad_norm": 0.46874486792229064, + "learning_rate": 4.291023978798964e-08, + "loss": 0.1722, + "step": 4189 + }, + { + "epoch": 0.9440392035373306, + "grad_norm": 0.45640688154844694, + "learning_rate": 4.256725079024554e-08, + "loss": 0.1765, + "step": 4190 + }, + { + "epoch": 0.9442645112231391, + "grad_norm": 0.4707168833624861, + "learning_rate": 4.22256263133089e-08, + "loss": 0.1658, + "step": 4191 + }, + { + "epoch": 0.9444898189089476, + "grad_norm": 0.43440466316904386, + "learning_rate": 4.1885366546870754e-08, + "loss": 0.1647, + "step": 4192 + }, + { + "epoch": 0.944715126594756, + "grad_norm": 0.4631312322463988, + "learning_rate": 4.1546471679864975e-08, + "loss": 0.172, + "step": 4193 + }, + { + "epoch": 0.9449404342805644, + "grad_norm": 0.42681331812593015, + "learning_rate": 4.120894190046687e-08, + "loss": 0.1495, + "step": 4194 + }, + { + "epoch": 0.9451657419663728, + "grad_norm": 0.44100926354325404, + "learning_rate": 4.087277739609458e-08, + "loss": 0.155, + "step": 4195 + }, + { + "epoch": 0.9453910496521812, + "grad_norm": 0.45270580834446134, + "learning_rate": 4.053797835340739e-08, + "loss": 0.1633, + "step": 4196 + }, + { + "epoch": 0.9456163573379897, + "grad_norm": 0.4085837207073482, + "learning_rate": 4.020454495830689e-08, + "loss": 0.1428, + "step": 4197 + }, + { + "epoch": 0.9458416650237981, + "grad_norm": 0.4564405380348284, + "learning_rate": 3.987247739593636e-08, + "loss": 0.1681, + "step": 4198 + }, + { + "epoch": 0.9460669727096066, + "grad_norm": 0.4388805626940992, + "learning_rate": 3.9541775850679975e-08, + "loss": 0.1517, + "step": 4199 + }, + { + "epoch": 0.946292280395415, + "grad_norm": 0.4355142348053683, + "learning_rate": 3.9212440506164465e-08, + "loss": 0.1574, + "step": 4200 + }, + { + "epoch": 0.9465175880812234, + "grad_norm": 0.43903799042597585, + "learning_rate": 3.888447154525771e-08, + "loss": 0.1641, + "step": 4201 + }, + { + "epoch": 0.9467428957670319, + "grad_norm": 0.4393698824388695, + "learning_rate": 3.855786915006793e-08, + "loss": 0.1553, + "step": 4202 + }, + { + "epoch": 0.9469682034528403, + "grad_norm": 0.4432931845219573, + "learning_rate": 3.8232633501945896e-08, + "loss": 0.1432, + "step": 4203 + }, + { + "epoch": 0.9471935111386487, + "grad_norm": 0.4542846020186478, + "learning_rate": 3.790876478148242e-08, + "loss": 0.1672, + "step": 4204 + }, + { + "epoch": 0.9474188188244571, + "grad_norm": 0.4970208237029793, + "learning_rate": 3.758626316850977e-08, + "loss": 0.1754, + "step": 4205 + }, + { + "epoch": 0.9476441265102655, + "grad_norm": 0.47004472029860433, + "learning_rate": 3.726512884210165e-08, + "loss": 0.1679, + "step": 4206 + }, + { + "epoch": 0.9478694341960741, + "grad_norm": 0.4432995936852746, + "learning_rate": 3.694536198057097e-08, + "loss": 0.1511, + "step": 4207 + }, + { + "epoch": 0.9480947418818825, + "grad_norm": 0.458158961270028, + "learning_rate": 3.6626962761473205e-08, + "loss": 0.1662, + "step": 4208 + }, + { + "epoch": 0.9483200495676909, + "grad_norm": 0.43942168205999665, + "learning_rate": 3.630993136160332e-08, + "loss": 0.1483, + "step": 4209 + }, + { + "epoch": 0.9485453572534993, + "grad_norm": 0.45977353490683776, + "learning_rate": 3.599426795699662e-08, + "loss": 0.171, + "step": 4210 + }, + { + "epoch": 0.9487706649393077, + "grad_norm": 0.4406077688519674, + "learning_rate": 3.567997272293011e-08, + "loss": 0.1688, + "step": 4211 + }, + { + "epoch": 0.9489959726251161, + "grad_norm": 0.4327038993854469, + "learning_rate": 3.53670458339192e-08, + "loss": 0.1602, + "step": 4212 + }, + { + "epoch": 0.9492212803109246, + "grad_norm": 0.47338531000094225, + "learning_rate": 3.505548746372128e-08, + "loss": 0.1552, + "step": 4213 + }, + { + "epoch": 0.9494465879967331, + "grad_norm": 0.4435708796418568, + "learning_rate": 3.474529778533298e-08, + "loss": 0.1583, + "step": 4214 + }, + { + "epoch": 0.9496718956825415, + "grad_norm": 0.4379081175725884, + "learning_rate": 3.443647697099067e-08, + "loss": 0.1566, + "step": 4215 + }, + { + "epoch": 0.9498972033683499, + "grad_norm": 0.43032495116362124, + "learning_rate": 3.412902519217137e-08, + "loss": 0.1532, + "step": 4216 + }, + { + "epoch": 0.9501225110541583, + "grad_norm": 0.4455473474790535, + "learning_rate": 3.382294261959157e-08, + "loss": 0.1609, + "step": 4217 + }, + { + "epoch": 0.9503478187399668, + "grad_norm": 0.45514795872690167, + "learning_rate": 3.351822942320754e-08, + "loss": 0.1655, + "step": 4218 + }, + { + "epoch": 0.9505731264257752, + "grad_norm": 0.4936700153529475, + "learning_rate": 3.3214885772215046e-08, + "loss": 0.1768, + "step": 4219 + }, + { + "epoch": 0.9507984341115836, + "grad_norm": 0.42858205386729264, + "learning_rate": 3.2912911835049634e-08, + "loss": 0.1502, + "step": 4220 + }, + { + "epoch": 0.951023741797392, + "grad_norm": 0.4528643637199378, + "learning_rate": 3.261230777938607e-08, + "loss": 0.1748, + "step": 4221 + }, + { + "epoch": 0.9512490494832005, + "grad_norm": 0.46507200306757096, + "learning_rate": 3.231307377213833e-08, + "loss": 0.1755, + "step": 4222 + }, + { + "epoch": 0.951474357169009, + "grad_norm": 0.4662901160756157, + "learning_rate": 3.201520997946045e-08, + "loss": 0.1774, + "step": 4223 + }, + { + "epoch": 0.9516996648548174, + "grad_norm": 0.41862583282724913, + "learning_rate": 3.171871656674458e-08, + "loss": 0.1472, + "step": 4224 + }, + { + "epoch": 0.9519249725406258, + "grad_norm": 0.43096675177257887, + "learning_rate": 3.142359369862291e-08, + "loss": 0.1568, + "step": 4225 + }, + { + "epoch": 0.9521502802264342, + "grad_norm": 0.4655006603197659, + "learning_rate": 3.112984153896603e-08, + "loss": 0.1863, + "step": 4226 + }, + { + "epoch": 0.9523755879122426, + "grad_norm": 0.4680964289093296, + "learning_rate": 3.0837460250883186e-08, + "loss": 0.1674, + "step": 4227 + }, + { + "epoch": 0.9526008955980511, + "grad_norm": 0.45727446619617684, + "learning_rate": 3.0546449996723404e-08, + "loss": 0.1651, + "step": 4228 + }, + { + "epoch": 0.9528262032838595, + "grad_norm": 0.42139244968950884, + "learning_rate": 3.0256810938073534e-08, + "loss": 0.1506, + "step": 4229 + }, + { + "epoch": 0.953051510969668, + "grad_norm": 0.478306551116162, + "learning_rate": 2.996854323575937e-08, + "loss": 0.1779, + "step": 4230 + }, + { + "epoch": 0.9532768186554764, + "grad_norm": 0.46687878789085546, + "learning_rate": 2.968164704984483e-08, + "loss": 0.1618, + "step": 4231 + }, + { + "epoch": 0.9535021263412848, + "grad_norm": 0.46360474464913387, + "learning_rate": 2.939612253963331e-08, + "loss": 0.171, + "step": 4232 + }, + { + "epoch": 0.9537274340270933, + "grad_norm": 0.45770098430951445, + "learning_rate": 2.911196986366577e-08, + "loss": 0.1784, + "step": 4233 + }, + { + "epoch": 0.9539527417129017, + "grad_norm": 0.47280461829857645, + "learning_rate": 2.8829189179721552e-08, + "loss": 0.1713, + "step": 4234 + }, + { + "epoch": 0.9541780493987101, + "grad_norm": 0.47064507294618185, + "learning_rate": 2.8547780644818113e-08, + "loss": 0.1626, + "step": 4235 + }, + { + "epoch": 0.9544033570845185, + "grad_norm": 0.43018096434346265, + "learning_rate": 2.8267744415211296e-08, + "loss": 0.1486, + "step": 4236 + }, + { + "epoch": 0.9546286647703269, + "grad_norm": 0.4992967718165909, + "learning_rate": 2.7989080646394217e-08, + "loss": 0.1918, + "step": 4237 + }, + { + "epoch": 0.9548539724561355, + "grad_norm": 0.4300182244384638, + "learning_rate": 2.7711789493099495e-08, + "loss": 0.1483, + "step": 4238 + }, + { + "epoch": 0.9550792801419439, + "grad_norm": 0.423935734509779, + "learning_rate": 2.743587110929563e-08, + "loss": 0.1458, + "step": 4239 + }, + { + "epoch": 0.9553045878277523, + "grad_norm": 0.4344323773637306, + "learning_rate": 2.716132564819035e-08, + "loss": 0.1654, + "step": 4240 + }, + { + "epoch": 0.9555298955135607, + "grad_norm": 0.45111761661380184, + "learning_rate": 2.688815326222838e-08, + "loss": 0.1642, + "step": 4241 + }, + { + "epoch": 0.9557552031993691, + "grad_norm": 0.468915362716942, + "learning_rate": 2.661635410309199e-08, + "loss": 0.1693, + "step": 4242 + }, + { + "epoch": 0.9559805108851775, + "grad_norm": 0.4629891229876016, + "learning_rate": 2.6345928321701575e-08, + "loss": 0.1636, + "step": 4243 + }, + { + "epoch": 0.956205818570986, + "grad_norm": 0.4181347442157994, + "learning_rate": 2.6076876068213965e-08, + "loss": 0.1494, + "step": 4244 + }, + { + "epoch": 0.9564311262567944, + "grad_norm": 0.4548634822441496, + "learning_rate": 2.5809197492024372e-08, + "loss": 0.1773, + "step": 4245 + }, + { + "epoch": 0.9566564339426029, + "grad_norm": 0.4521359841254643, + "learning_rate": 2.554289274176419e-08, + "loss": 0.1636, + "step": 4246 + }, + { + "epoch": 0.9568817416284113, + "grad_norm": 0.46222868669279676, + "learning_rate": 2.5277961965302633e-08, + "loss": 0.168, + "step": 4247 + }, + { + "epoch": 0.9571070493142197, + "grad_norm": 0.4340277626588638, + "learning_rate": 2.5014405309746193e-08, + "loss": 0.1489, + "step": 4248 + }, + { + "epoch": 0.9573323570000282, + "grad_norm": 0.4484015941917351, + "learning_rate": 2.4752222921437807e-08, + "loss": 0.1585, + "step": 4249 + }, + { + "epoch": 0.9575576646858366, + "grad_norm": 0.4953506006181928, + "learning_rate": 2.449141494595797e-08, + "loss": 0.1753, + "step": 4250 + }, + { + "epoch": 0.957782972371645, + "grad_norm": 0.4380201358404699, + "learning_rate": 2.423198152812306e-08, + "loss": 0.1554, + "step": 4251 + }, + { + "epoch": 0.9580082800574534, + "grad_norm": 0.46472642527826996, + "learning_rate": 2.3973922811987295e-08, + "loss": 0.172, + "step": 4252 + }, + { + "epoch": 0.9582335877432618, + "grad_norm": 0.4665063942953351, + "learning_rate": 2.3717238940840493e-08, + "loss": 0.1564, + "step": 4253 + }, + { + "epoch": 0.9584588954290704, + "grad_norm": 0.438364655609606, + "learning_rate": 2.3461930057210037e-08, + "loss": 0.1561, + "step": 4254 + }, + { + "epoch": 0.9586842031148788, + "grad_norm": 0.40906168344979743, + "learning_rate": 2.320799630285947e-08, + "loss": 0.1477, + "step": 4255 + }, + { + "epoch": 0.9589095108006872, + "grad_norm": 0.43986881087290897, + "learning_rate": 2.2955437818788508e-08, + "loss": 0.1768, + "step": 4256 + }, + { + "epoch": 0.9591348184864956, + "grad_norm": 0.43284607975018463, + "learning_rate": 2.2704254745233577e-08, + "loss": 0.1548, + "step": 4257 + }, + { + "epoch": 0.959360126172304, + "grad_norm": 0.4415604684373658, + "learning_rate": 2.2454447221667563e-08, + "loss": 0.1625, + "step": 4258 + }, + { + "epoch": 0.9595854338581125, + "grad_norm": 0.4468803569884082, + "learning_rate": 2.2206015386798673e-08, + "loss": 0.1577, + "step": 4259 + }, + { + "epoch": 0.9598107415439209, + "grad_norm": 0.4290050376194191, + "learning_rate": 2.1958959378572398e-08, + "loss": 0.1486, + "step": 4260 + }, + { + "epoch": 0.9600360492297294, + "grad_norm": 0.4476931878299971, + "learning_rate": 2.1713279334169278e-08, + "loss": 0.1699, + "step": 4261 + }, + { + "epoch": 0.9602613569155378, + "grad_norm": 0.41024365780706123, + "learning_rate": 2.1468975390006587e-08, + "loss": 0.1468, + "step": 4262 + }, + { + "epoch": 0.9604866646013462, + "grad_norm": 0.4361987725562448, + "learning_rate": 2.1226047681737193e-08, + "loss": 0.149, + "step": 4263 + }, + { + "epoch": 0.9607119722871547, + "grad_norm": 0.45800293256825053, + "learning_rate": 2.0984496344249596e-08, + "loss": 0.1631, + "step": 4264 + }, + { + "epoch": 0.9609372799729631, + "grad_norm": 0.42662583436573603, + "learning_rate": 2.074432151166844e-08, + "loss": 0.1509, + "step": 4265 + }, + { + "epoch": 0.9611625876587715, + "grad_norm": 0.4958880556080488, + "learning_rate": 2.0505523317353727e-08, + "loss": 0.1757, + "step": 4266 + }, + { + "epoch": 0.9613878953445799, + "grad_norm": 0.43817582097436736, + "learning_rate": 2.0268101893901327e-08, + "loss": 0.1574, + "step": 4267 + }, + { + "epoch": 0.9616132030303883, + "grad_norm": 0.45311856274966783, + "learning_rate": 2.0032057373142453e-08, + "loss": 0.1699, + "step": 4268 + }, + { + "epoch": 0.9618385107161969, + "grad_norm": 0.43246736612667963, + "learning_rate": 1.9797389886143658e-08, + "loss": 0.1672, + "step": 4269 + }, + { + "epoch": 0.9620638184020053, + "grad_norm": 0.4232786155483982, + "learning_rate": 1.956409956320737e-08, + "loss": 0.1571, + "step": 4270 + }, + { + "epoch": 0.9622891260878137, + "grad_norm": 0.4524982807163172, + "learning_rate": 1.933218653387081e-08, + "loss": 0.1732, + "step": 4271 + }, + { + "epoch": 0.9625144337736221, + "grad_norm": 0.45017940062159684, + "learning_rate": 1.91016509269068e-08, + "loss": 0.1749, + "step": 4272 + }, + { + "epoch": 0.9627397414594305, + "grad_norm": 0.48359593711579807, + "learning_rate": 1.8872492870322945e-08, + "loss": 0.1738, + "step": 4273 + }, + { + "epoch": 0.962965049145239, + "grad_norm": 0.4336070785812144, + "learning_rate": 1.864471249136218e-08, + "loss": 0.1573, + "step": 4274 + }, + { + "epoch": 0.9631903568310474, + "grad_norm": 0.4651580000799828, + "learning_rate": 1.8418309916502787e-08, + "loss": 0.1666, + "step": 4275 + }, + { + "epoch": 0.9634156645168558, + "grad_norm": 0.44207816711390224, + "learning_rate": 1.819328527145725e-08, + "loss": 0.1473, + "step": 4276 + }, + { + "epoch": 0.9636409722026643, + "grad_norm": 0.42855102649879856, + "learning_rate": 1.7969638681173684e-08, + "loss": 0.1571, + "step": 4277 + }, + { + "epoch": 0.9638662798884727, + "grad_norm": 0.45264300220523773, + "learning_rate": 1.774737026983414e-08, + "loss": 0.1604, + "step": 4278 + }, + { + "epoch": 0.9640915875742812, + "grad_norm": 0.4640385173077475, + "learning_rate": 1.752648016085684e-08, + "loss": 0.1797, + "step": 4279 + }, + { + "epoch": 0.9643168952600896, + "grad_norm": 0.4385447637828304, + "learning_rate": 1.7306968476893393e-08, + "loss": 0.1637, + "step": 4280 + }, + { + "epoch": 0.964542202945898, + "grad_norm": 0.47694797536180056, + "learning_rate": 1.708883533983019e-08, + "loss": 0.1694, + "step": 4281 + }, + { + "epoch": 0.9647675106317064, + "grad_norm": 0.47024305408861466, + "learning_rate": 1.6872080870788955e-08, + "loss": 0.169, + "step": 4282 + }, + { + "epoch": 0.9649928183175148, + "grad_norm": 0.40449035538396055, + "learning_rate": 1.6656705190125078e-08, + "loss": 0.1438, + "step": 4283 + }, + { + "epoch": 0.9652181260033232, + "grad_norm": 0.43761716497214087, + "learning_rate": 1.6442708417428732e-08, + "loss": 0.1581, + "step": 4284 + }, + { + "epoch": 0.9654434336891318, + "grad_norm": 0.4287795234204464, + "learning_rate": 1.6230090671524312e-08, + "loss": 0.1555, + "step": 4285 + }, + { + "epoch": 0.9656687413749402, + "grad_norm": 0.4663216015091154, + "learning_rate": 1.6018852070470437e-08, + "loss": 0.156, + "step": 4286 + }, + { + "epoch": 0.9658940490607486, + "grad_norm": 0.460973957650909, + "learning_rate": 1.5808992731560225e-08, + "loss": 0.1689, + "step": 4287 + }, + { + "epoch": 0.966119356746557, + "grad_norm": 0.43606545686098996, + "learning_rate": 1.5600512771320462e-08, + "loss": 0.1527, + "step": 4288 + }, + { + "epoch": 0.9663446644323654, + "grad_norm": 0.4567228057315352, + "learning_rate": 1.5393412305512446e-08, + "loss": 0.1564, + "step": 4289 + }, + { + "epoch": 0.9665699721181739, + "grad_norm": 0.4642434211696, + "learning_rate": 1.518769144913168e-08, + "loss": 0.1613, + "step": 4290 + }, + { + "epoch": 0.9667952798039823, + "grad_norm": 0.43253345766342255, + "learning_rate": 1.4983350316406797e-08, + "loss": 0.1546, + "step": 4291 + }, + { + "epoch": 0.9670205874897907, + "grad_norm": 0.42280661992383445, + "learning_rate": 1.4780389020800923e-08, + "loss": 0.1498, + "step": 4292 + }, + { + "epoch": 0.9672458951755992, + "grad_norm": 0.4285046468676128, + "learning_rate": 1.4578807675011131e-08, + "loss": 0.1542, + "step": 4293 + }, + { + "epoch": 0.9674712028614076, + "grad_norm": 0.4650870968276868, + "learning_rate": 1.4378606390967609e-08, + "loss": 0.1668, + "step": 4294 + }, + { + "epoch": 0.9676965105472161, + "grad_norm": 0.44978442891278053, + "learning_rate": 1.4179785279835045e-08, + "loss": 0.1633, + "step": 4295 + }, + { + "epoch": 0.9679218182330245, + "grad_norm": 0.4171039867741593, + "learning_rate": 1.3982344452011242e-08, + "loss": 0.1524, + "step": 4296 + }, + { + "epoch": 0.9681471259188329, + "grad_norm": 0.4371637319321181, + "learning_rate": 1.3786284017127949e-08, + "loss": 0.1641, + "step": 4297 + }, + { + "epoch": 0.9683724336046413, + "grad_norm": 0.44298287179481954, + "learning_rate": 1.3591604084049747e-08, + "loss": 0.1618, + "step": 4298 + }, + { + "epoch": 0.9685977412904497, + "grad_norm": 0.4349868674582635, + "learning_rate": 1.3398304760875725e-08, + "loss": 0.1619, + "step": 4299 + }, + { + "epoch": 0.9688230489762581, + "grad_norm": 0.4605640859694425, + "learning_rate": 1.3206386154937245e-08, + "loss": 0.1715, + "step": 4300 + }, + { + "epoch": 0.9690483566620667, + "grad_norm": 0.46155520998702493, + "learning_rate": 1.30158483727999e-08, + "loss": 0.1724, + "step": 4301 + }, + { + "epoch": 0.9692736643478751, + "grad_norm": 0.44590022140343494, + "learning_rate": 1.2826691520262114e-08, + "loss": 0.1578, + "step": 4302 + }, + { + "epoch": 0.9694989720336835, + "grad_norm": 0.4200638925507643, + "learning_rate": 1.2638915702355702e-08, + "loss": 0.1442, + "step": 4303 + }, + { + "epoch": 0.9697242797194919, + "grad_norm": 0.4396843485692041, + "learning_rate": 1.2452521023345598e-08, + "loss": 0.153, + "step": 4304 + }, + { + "epoch": 0.9699495874053004, + "grad_norm": 0.44535623288638543, + "learning_rate": 1.2267507586729566e-08, + "loss": 0.1664, + "step": 4305 + }, + { + "epoch": 0.9701748950911088, + "grad_norm": 0.4110508859643994, + "learning_rate": 1.2083875495238761e-08, + "loss": 0.144, + "step": 4306 + }, + { + "epoch": 0.9704002027769172, + "grad_norm": 0.449642062406077, + "learning_rate": 1.1901624850837734e-08, + "loss": 0.1652, + "step": 4307 + }, + { + "epoch": 0.9706255104627256, + "grad_norm": 0.4693949667565058, + "learning_rate": 1.1720755754722757e-08, + "loss": 0.1727, + "step": 4308 + }, + { + "epoch": 0.9708508181485341, + "grad_norm": 0.4694820033769964, + "learning_rate": 1.1541268307324049e-08, + "loss": 0.1663, + "step": 4309 + }, + { + "epoch": 0.9710761258343426, + "grad_norm": 0.4563107806179698, + "learning_rate": 1.1363162608304112e-08, + "loss": 0.1696, + "step": 4310 + }, + { + "epoch": 0.971301433520151, + "grad_norm": 0.45027318880025363, + "learning_rate": 1.1186438756558838e-08, + "loss": 0.165, + "step": 4311 + }, + { + "epoch": 0.9715267412059594, + "grad_norm": 0.43884783774208264, + "learning_rate": 1.1011096850215842e-08, + "loss": 0.1704, + "step": 4312 + }, + { + "epoch": 0.9717520488917678, + "grad_norm": 0.4544012881951256, + "learning_rate": 1.083713698663641e-08, + "loss": 0.1615, + "step": 4313 + }, + { + "epoch": 0.9719773565775762, + "grad_norm": 0.4229028484997577, + "learning_rate": 1.0664559262413831e-08, + "loss": 0.1489, + "step": 4314 + }, + { + "epoch": 0.9722026642633846, + "grad_norm": 0.4602416093671792, + "learning_rate": 1.0493363773373677e-08, + "loss": 0.1751, + "step": 4315 + }, + { + "epoch": 0.9724279719491932, + "grad_norm": 0.4877002141305655, + "learning_rate": 1.0323550614574907e-08, + "loss": 0.1713, + "step": 4316 + }, + { + "epoch": 0.9726532796350016, + "grad_norm": 0.45932723589050195, + "learning_rate": 1.0155119880308483e-08, + "loss": 0.1557, + "step": 4317 + }, + { + "epoch": 0.97287858732081, + "grad_norm": 0.44479939828182713, + "learning_rate": 9.988071664097376e-09, + "loss": 0.1614, + "step": 4318 + }, + { + "epoch": 0.9731038950066184, + "grad_norm": 0.4476667621507759, + "learning_rate": 9.822406058697665e-09, + "loss": 0.1693, + "step": 4319 + }, + { + "epoch": 0.9733292026924268, + "grad_norm": 0.4621655986150245, + "learning_rate": 9.658123156096599e-09, + "loss": 0.1713, + "step": 4320 + }, + { + "epoch": 0.9735545103782353, + "grad_norm": 0.4379573378983632, + "learning_rate": 9.4952230475151e-09, + "loss": 0.1634, + "step": 4321 + }, + { + "epoch": 0.9737798180640437, + "grad_norm": 0.4190432399621594, + "learning_rate": 9.333705823404981e-09, + "loss": 0.1576, + "step": 4322 + }, + { + "epoch": 0.9740051257498521, + "grad_norm": 0.43626335604017696, + "learning_rate": 9.17357157345089e-09, + "loss": 0.1591, + "step": 4323 + }, + { + "epoch": 0.9742304334356606, + "grad_norm": 0.4657102007985677, + "learning_rate": 9.014820386569756e-09, + "loss": 0.1759, + "step": 4324 + }, + { + "epoch": 0.974455741121469, + "grad_norm": 0.45090687670995294, + "learning_rate": 8.85745235090968e-09, + "loss": 0.1586, + "step": 4325 + }, + { + "epoch": 0.9746810488072775, + "grad_norm": 0.45577566539977776, + "learning_rate": 8.701467553851317e-09, + "loss": 0.1684, + "step": 4326 + }, + { + "epoch": 0.9749063564930859, + "grad_norm": 0.44254090747984154, + "learning_rate": 8.54686608200761e-09, + "loss": 0.1652, + "step": 4327 + }, + { + "epoch": 0.9751316641788943, + "grad_norm": 0.4306059371724885, + "learning_rate": 8.393648021222666e-09, + "loss": 0.1469, + "step": 4328 + }, + { + "epoch": 0.9753569718647027, + "grad_norm": 0.4459995305235896, + "learning_rate": 8.241813456573156e-09, + "loss": 0.147, + "step": 4329 + }, + { + "epoch": 0.9755822795505111, + "grad_norm": 0.4423774718840724, + "learning_rate": 8.09136247236636e-09, + "loss": 0.1608, + "step": 4330 + }, + { + "epoch": 0.9758075872363196, + "grad_norm": 0.46203153659465085, + "learning_rate": 7.942295152142954e-09, + "loss": 0.1801, + "step": 4331 + }, + { + "epoch": 0.9760328949221281, + "grad_norm": 0.4805507511315122, + "learning_rate": 7.79461157867395e-09, + "loss": 0.1894, + "step": 4332 + }, + { + "epoch": 0.9762582026079365, + "grad_norm": 0.44103387529789523, + "learning_rate": 7.64831183396264e-09, + "loss": 0.1506, + "step": 4333 + }, + { + "epoch": 0.9764835102937449, + "grad_norm": 0.4311443984397687, + "learning_rate": 7.503395999244045e-09, + "loss": 0.1469, + "step": 4334 + }, + { + "epoch": 0.9767088179795533, + "grad_norm": 0.4211834999522721, + "learning_rate": 7.359864154984353e-09, + "loss": 0.153, + "step": 4335 + }, + { + "epoch": 0.9769341256653618, + "grad_norm": 0.4420275533015841, + "learning_rate": 7.217716380881479e-09, + "loss": 0.1693, + "step": 4336 + }, + { + "epoch": 0.9771594333511702, + "grad_norm": 0.4795316009592446, + "learning_rate": 7.076952755864508e-09, + "loss": 0.1693, + "step": 4337 + }, + { + "epoch": 0.9773847410369786, + "grad_norm": 0.45810794078547346, + "learning_rate": 6.937573358094529e-09, + "loss": 0.1567, + "step": 4338 + }, + { + "epoch": 0.977610048722787, + "grad_norm": 0.4359816938068075, + "learning_rate": 6.799578264963802e-09, + "loss": 0.1539, + "step": 4339 + }, + { + "epoch": 0.9778353564085955, + "grad_norm": 0.44307548882953984, + "learning_rate": 6.662967553095756e-09, + "loss": 0.1645, + "step": 4340 + }, + { + "epoch": 0.978060664094404, + "grad_norm": 0.4193895939577532, + "learning_rate": 6.527741298345269e-09, + "loss": 0.1522, + "step": 4341 + }, + { + "epoch": 0.9782859717802124, + "grad_norm": 0.4458540078924866, + "learning_rate": 6.3938995757981125e-09, + "loss": 0.1576, + "step": 4342 + }, + { + "epoch": 0.9785112794660208, + "grad_norm": 0.4370848874264759, + "learning_rate": 6.2614424597720605e-09, + "loss": 0.1527, + "step": 4343 + }, + { + "epoch": 0.9787365871518292, + "grad_norm": 0.4152276921109082, + "learning_rate": 6.1303700238152245e-09, + "loss": 0.1381, + "step": 4344 + }, + { + "epoch": 0.9789618948376376, + "grad_norm": 0.46252129789672053, + "learning_rate": 6.00068234070772e-09, + "loss": 0.1635, + "step": 4345 + }, + { + "epoch": 0.979187202523446, + "grad_norm": 0.4767149010467665, + "learning_rate": 5.8723794824597226e-09, + "loss": 0.1765, + "step": 4346 + }, + { + "epoch": 0.9794125102092545, + "grad_norm": 0.44958083881601446, + "learning_rate": 5.745461520313411e-09, + "loss": 0.1698, + "step": 4347 + }, + { + "epoch": 0.979637817895063, + "grad_norm": 0.4278482422580719, + "learning_rate": 5.6199285247415805e-09, + "loss": 0.1522, + "step": 4348 + }, + { + "epoch": 0.9798631255808714, + "grad_norm": 0.4295137947205883, + "learning_rate": 5.495780565447917e-09, + "loss": 0.1515, + "step": 4349 + }, + { + "epoch": 0.9800884332666798, + "grad_norm": 0.4546895034494392, + "learning_rate": 5.373017711367001e-09, + "loss": 0.1707, + "step": 4350 + }, + { + "epoch": 0.9803137409524882, + "grad_norm": 0.4864077658410422, + "learning_rate": 5.2516400306648615e-09, + "loss": 0.1701, + "step": 4351 + }, + { + "epoch": 0.9805390486382967, + "grad_norm": 0.41244209476278, + "learning_rate": 5.131647590737587e-09, + "loss": 0.144, + "step": 4352 + }, + { + "epoch": 0.9807643563241051, + "grad_norm": 0.4870322409096027, + "learning_rate": 5.0130404582127144e-09, + "loss": 0.1607, + "step": 4353 + }, + { + "epoch": 0.9809896640099135, + "grad_norm": 0.4309002134010249, + "learning_rate": 4.895818698948396e-09, + "loss": 0.1598, + "step": 4354 + }, + { + "epoch": 0.9812149716957219, + "grad_norm": 0.46020958226819203, + "learning_rate": 4.779982378033676e-09, + "loss": 0.1648, + "step": 4355 + }, + { + "epoch": 0.9814402793815304, + "grad_norm": 0.4487682500664778, + "learning_rate": 4.6655315597876615e-09, + "loss": 0.1679, + "step": 4356 + }, + { + "epoch": 0.9816655870673389, + "grad_norm": 0.4646769843605275, + "learning_rate": 4.552466307760905e-09, + "loss": 0.1519, + "step": 4357 + }, + { + "epoch": 0.9818908947531473, + "grad_norm": 0.4254814403952613, + "learning_rate": 4.440786684734577e-09, + "loss": 0.1563, + "step": 4358 + }, + { + "epoch": 0.9821162024389557, + "grad_norm": 0.46904488863949045, + "learning_rate": 4.330492752719628e-09, + "loss": 0.1709, + "step": 4359 + }, + { + "epoch": 0.9823415101247641, + "grad_norm": 0.4132654547785364, + "learning_rate": 4.221584572958737e-09, + "loss": 0.1559, + "step": 4360 + }, + { + "epoch": 0.9825668178105725, + "grad_norm": 0.4252100086714906, + "learning_rate": 4.114062205924085e-09, + "loss": 0.1475, + "step": 4361 + }, + { + "epoch": 0.982792125496381, + "grad_norm": 0.45349883842364785, + "learning_rate": 4.0079257113190275e-09, + "loss": 0.174, + "step": 4362 + }, + { + "epoch": 0.9830174331821895, + "grad_norm": 0.4172256065148545, + "learning_rate": 3.903175148077531e-09, + "loss": 0.148, + "step": 4363 + }, + { + "epoch": 0.9832427408679979, + "grad_norm": 0.4114268220537219, + "learning_rate": 3.799810574363072e-09, + "loss": 0.1482, + "step": 4364 + }, + { + "epoch": 0.9834680485538063, + "grad_norm": 0.4683959210216498, + "learning_rate": 3.697832047570571e-09, + "loss": 0.176, + "step": 4365 + }, + { + "epoch": 0.9836933562396147, + "grad_norm": 0.46435466336211473, + "learning_rate": 3.597239624325011e-09, + "loss": 0.1655, + "step": 4366 + }, + { + "epoch": 0.9839186639254232, + "grad_norm": 0.4627576202371994, + "learning_rate": 3.4980333604811567e-09, + "loss": 0.1714, + "step": 4367 + }, + { + "epoch": 0.9841439716112316, + "grad_norm": 0.4188439066099548, + "learning_rate": 3.4002133111246673e-09, + "loss": 0.1604, + "step": 4368 + }, + { + "epoch": 0.98436927929704, + "grad_norm": 0.49298061679775806, + "learning_rate": 3.303779530571538e-09, + "loss": 0.1847, + "step": 4369 + }, + { + "epoch": 0.9845945869828484, + "grad_norm": 0.4334874893225166, + "learning_rate": 3.208732072368104e-09, + "loss": 0.1521, + "step": 4370 + }, + { + "epoch": 0.9848198946686569, + "grad_norm": 0.45068722485114915, + "learning_rate": 3.1150709892899256e-09, + "loss": 0.1669, + "step": 4371 + }, + { + "epoch": 0.9850452023544654, + "grad_norm": 0.43304988165779407, + "learning_rate": 3.022796333344291e-09, + "loss": 0.1618, + "step": 4372 + }, + { + "epoch": 0.9852705100402738, + "grad_norm": 0.4376490797183304, + "learning_rate": 2.9319081557674377e-09, + "loss": 0.1652, + "step": 4373 + }, + { + "epoch": 0.9854958177260822, + "grad_norm": 0.47152229805747514, + "learning_rate": 2.8424065070262186e-09, + "loss": 0.1739, + "step": 4374 + }, + { + "epoch": 0.9857211254118906, + "grad_norm": 0.4535488407915571, + "learning_rate": 2.754291436817824e-09, + "loss": 0.1736, + "step": 4375 + }, + { + "epoch": 0.985946433097699, + "grad_norm": 0.4460571069050128, + "learning_rate": 2.6675629940689508e-09, + "loss": 0.1624, + "step": 4376 + }, + { + "epoch": 0.9861717407835074, + "grad_norm": 0.4317219630755231, + "learning_rate": 2.582221226936632e-09, + "loss": 0.1515, + "step": 4377 + }, + { + "epoch": 0.9863970484693159, + "grad_norm": 0.4392667210739418, + "learning_rate": 2.4982661828085175e-09, + "loss": 0.1584, + "step": 4378 + }, + { + "epoch": 0.9866223561551244, + "grad_norm": 0.4321702473982098, + "learning_rate": 2.415697908300929e-09, + "loss": 0.1527, + "step": 4379 + }, + { + "epoch": 0.9868476638409328, + "grad_norm": 0.44501284253721735, + "learning_rate": 2.3345164492616367e-09, + "loss": 0.1587, + "step": 4380 + }, + { + "epoch": 0.9870729715267412, + "grad_norm": 0.4214987211220386, + "learning_rate": 2.2547218507673606e-09, + "loss": 0.1492, + "step": 4381 + }, + { + "epoch": 0.9872982792125496, + "grad_norm": 0.4655262549628087, + "learning_rate": 2.1763141571248813e-09, + "loss": 0.1807, + "step": 4382 + }, + { + "epoch": 0.9875235868983581, + "grad_norm": 0.4681525858824225, + "learning_rate": 2.0992934118715948e-09, + "loss": 0.1626, + "step": 4383 + }, + { + "epoch": 0.9877488945841665, + "grad_norm": 0.45657489941037066, + "learning_rate": 2.0236596577738466e-09, + "loss": 0.1661, + "step": 4384 + }, + { + "epoch": 0.9879742022699749, + "grad_norm": 0.4577592270377889, + "learning_rate": 1.9494129368280432e-09, + "loss": 0.1388, + "step": 4385 + }, + { + "epoch": 0.9881995099557833, + "grad_norm": 0.4465127562411181, + "learning_rate": 1.876553290261207e-09, + "loss": 0.1587, + "step": 4386 + }, + { + "epoch": 0.9884248176415918, + "grad_norm": 0.425297674249025, + "learning_rate": 1.8050807585293095e-09, + "loss": 0.1493, + "step": 4387 + }, + { + "epoch": 0.9886501253274003, + "grad_norm": 0.47310454127744905, + "learning_rate": 1.7349953813183828e-09, + "loss": 0.1716, + "step": 4388 + }, + { + "epoch": 0.9888754330132087, + "grad_norm": 0.41763410440984927, + "learning_rate": 1.6662971975439645e-09, + "loss": 0.1451, + "step": 4389 + }, + { + "epoch": 0.9891007406990171, + "grad_norm": 0.45616367676430675, + "learning_rate": 1.5989862453522075e-09, + "loss": 0.1606, + "step": 4390 + }, + { + "epoch": 0.9893260483848255, + "grad_norm": 0.4470873635214427, + "learning_rate": 1.5330625621176598e-09, + "loss": 0.1586, + "step": 4391 + }, + { + "epoch": 0.9895513560706339, + "grad_norm": 0.4352080111829506, + "learning_rate": 1.468526184445762e-09, + "loss": 0.1481, + "step": 4392 + }, + { + "epoch": 0.9897766637564424, + "grad_norm": 0.49279384466775344, + "learning_rate": 1.4053771481711832e-09, + "loss": 0.1681, + "step": 4393 + }, + { + "epoch": 0.9900019714422508, + "grad_norm": 0.45749190663686623, + "learning_rate": 1.343615488357819e-09, + "loss": 0.1675, + "step": 4394 + }, + { + "epoch": 0.9902272791280593, + "grad_norm": 0.44724277781518934, + "learning_rate": 1.2832412393001814e-09, + "loss": 0.1531, + "step": 4395 + }, + { + "epoch": 0.9904525868138677, + "grad_norm": 0.4319791172555131, + "learning_rate": 1.2242544345211772e-09, + "loss": 0.1459, + "step": 4396 + }, + { + "epoch": 0.9906778944996761, + "grad_norm": 0.43553393458839024, + "learning_rate": 1.1666551067746058e-09, + "loss": 0.1721, + "step": 4397 + }, + { + "epoch": 0.9909032021854846, + "grad_norm": 0.4456828155057735, + "learning_rate": 1.1104432880429394e-09, + "loss": 0.1597, + "step": 4398 + }, + { + "epoch": 0.991128509871293, + "grad_norm": 0.4252379938079175, + "learning_rate": 1.0556190095384333e-09, + "loss": 0.1509, + "step": 4399 + }, + { + "epoch": 0.9913538175571014, + "grad_norm": 0.442878470986228, + "learning_rate": 1.0021823017028475e-09, + "loss": 0.1664, + "step": 4400 + }, + { + "epoch": 0.9915791252429098, + "grad_norm": 0.4605754352858284, + "learning_rate": 9.501331942080029e-10, + "loss": 0.1747, + "step": 4401 + }, + { + "epoch": 0.9918044329287182, + "grad_norm": 0.47354737069865976, + "learning_rate": 8.994717159546695e-10, + "loss": 0.174, + "step": 4402 + }, + { + "epoch": 0.9920297406145268, + "grad_norm": 0.4334550482680622, + "learning_rate": 8.501978950734014e-10, + "loss": 0.153, + "step": 4403 + }, + { + "epoch": 0.9922550483003352, + "grad_norm": 0.4396739580691475, + "learning_rate": 8.023117589237017e-10, + "loss": 0.153, + "step": 4404 + }, + { + "epoch": 0.9924803559861436, + "grad_norm": 0.4446143120091629, + "learning_rate": 7.558133340954121e-10, + "loss": 0.1692, + "step": 4405 + }, + { + "epoch": 0.992705663671952, + "grad_norm": 0.4719115585011776, + "learning_rate": 7.10702646406769e-10, + "loss": 0.1588, + "step": 4406 + }, + { + "epoch": 0.9929309713577604, + "grad_norm": 0.46483342750106316, + "learning_rate": 6.669797209069018e-10, + "loss": 0.1724, + "step": 4407 + }, + { + "epoch": 0.9931562790435688, + "grad_norm": 0.4201226822210143, + "learning_rate": 6.246445818727798e-10, + "loss": 0.1427, + "step": 4408 + }, + { + "epoch": 0.9933815867293773, + "grad_norm": 0.43913801419846854, + "learning_rate": 5.836972528119878e-10, + "loss": 0.1657, + "step": 4409 + }, + { + "epoch": 0.9936068944151858, + "grad_norm": 0.422273721226064, + "learning_rate": 5.44137756460783e-10, + "loss": 0.1467, + "step": 4410 + }, + { + "epoch": 0.9938322021009942, + "grad_norm": 0.4490237985417092, + "learning_rate": 5.059661147852057e-10, + "loss": 0.1648, + "step": 4411 + }, + { + "epoch": 0.9940575097868026, + "grad_norm": 0.4414617733179605, + "learning_rate": 4.691823489805236e-10, + "loss": 0.1623, + "step": 4412 + }, + { + "epoch": 0.994282817472611, + "grad_norm": 0.4228372676499378, + "learning_rate": 4.3378647947150965e-10, + "loss": 0.1594, + "step": 4413 + }, + { + "epoch": 0.9945081251584195, + "grad_norm": 0.4694257518360266, + "learning_rate": 3.9977852591188694e-10, + "loss": 0.1635, + "step": 4414 + }, + { + "epoch": 0.9947334328442279, + "grad_norm": 0.45933357375357814, + "learning_rate": 3.671585071854389e-10, + "loss": 0.1616, + "step": 4415 + }, + { + "epoch": 0.9949587405300363, + "grad_norm": 0.45938278901884055, + "learning_rate": 3.3592644140434393e-10, + "loss": 0.1645, + "step": 4416 + }, + { + "epoch": 0.9951840482158447, + "grad_norm": 0.45523923701699676, + "learning_rate": 3.0608234591084083e-10, + "loss": 0.1591, + "step": 4417 + }, + { + "epoch": 0.9954093559016532, + "grad_norm": 0.44141160905830173, + "learning_rate": 2.776262372761185e-10, + "loss": 0.1747, + "step": 4418 + }, + { + "epoch": 0.9956346635874617, + "grad_norm": 0.4390084522021925, + "learning_rate": 2.505581313011485e-10, + "loss": 0.1572, + "step": 4419 + }, + { + "epoch": 0.9958599712732701, + "grad_norm": 0.4581274844463194, + "learning_rate": 2.2487804301557503e-10, + "loss": 0.1685, + "step": 4420 + }, + { + "epoch": 0.9960852789590785, + "grad_norm": 0.4409320221013579, + "learning_rate": 2.0058598667854755e-10, + "loss": 0.1654, + "step": 4421 + }, + { + "epoch": 0.9963105866448869, + "grad_norm": 0.4529537270286282, + "learning_rate": 1.776819757787207e-10, + "loss": 0.1537, + "step": 4422 + }, + { + "epoch": 0.9965358943306953, + "grad_norm": 0.44705243153634683, + "learning_rate": 1.561660230336992e-10, + "loss": 0.1661, + "step": 4423 + }, + { + "epoch": 0.9967612020165038, + "grad_norm": 0.45180352443351807, + "learning_rate": 1.3603814039031547e-10, + "loss": 0.1638, + "step": 4424 + }, + { + "epoch": 0.9969865097023122, + "grad_norm": 0.41452875359688274, + "learning_rate": 1.1729833902518473e-10, + "loss": 0.1404, + "step": 4425 + }, + { + "epoch": 0.9972118173881207, + "grad_norm": 0.4399182903429231, + "learning_rate": 9.994662934387223e-11, + "loss": 0.1664, + "step": 4426 + }, + { + "epoch": 0.9974371250739291, + "grad_norm": 0.45073319788347704, + "learning_rate": 8.398302098061583e-11, + "loss": 0.1607, + "step": 4427 + }, + { + "epoch": 0.9976624327597375, + "grad_norm": 0.45068516321373914, + "learning_rate": 6.94075227999913e-11, + "loss": 0.1647, + "step": 4428 + }, + { + "epoch": 0.997887740445546, + "grad_norm": 0.43426538257606523, + "learning_rate": 5.62201428946918e-11, + "loss": 0.1566, + "step": 4429 + }, + { + "epoch": 0.9981130481313544, + "grad_norm": 0.4223608572586315, + "learning_rate": 4.44208885877484e-11, + "loss": 0.1468, + "step": 4430 + }, + { + "epoch": 0.9983383558171628, + "grad_norm": 0.43546003528356464, + "learning_rate": 3.400976643030962e-11, + "loss": 0.1594, + "step": 4431 + }, + { + "epoch": 0.9985636635029712, + "grad_norm": 0.463967525752199, + "learning_rate": 2.498678220386186e-11, + "loss": 0.1691, + "step": 4432 + }, + { + "epoch": 0.9987889711887796, + "grad_norm": 0.4664266129400738, + "learning_rate": 1.735194091800896e-11, + "loss": 0.1742, + "step": 4433 + }, + { + "epoch": 0.9990142788745882, + "grad_norm": 0.43993359276887173, + "learning_rate": 1.1105246812137538e-11, + "loss": 0.1567, + "step": 4434 + }, + { + "epoch": 0.9992395865603966, + "grad_norm": 0.44124974764658603, + "learning_rate": 6.246703355139438e-12, + "loss": 0.166, + "step": 4435 + }, + { + "epoch": 0.999464894246205, + "grad_norm": 0.44073587346288357, + "learning_rate": 2.7763132445790543e-12, + "loss": 0.1544, + "step": 4436 + }, + { + "epoch": 0.9996902019320134, + "grad_norm": 0.4457806367777774, + "learning_rate": 6.940784075259999e-13, + "loss": 0.1609, + "step": 4437 + }, + { + "epoch": 0.9999155096178218, + "grad_norm": 0.41883610535748583, + "learning_rate": 0.0, + "loss": 0.1437, + "step": 4438 + }, + { + "epoch": 0.9999155096178218, + "step": 4438, + "total_flos": 995060424065024.0, + "train_loss": 0.1932588364020722, + "train_runtime": 22166.8057, + "train_samples_per_second": 12.814, + "train_steps_per_second": 0.2 + } + ], + "logging_steps": 1, + "max_steps": 4438, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 995060424065024.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}