diff --git "a/checkpoints/checkpoint-4689/trainer_state.json" "b/checkpoints/checkpoint-4689/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoints/checkpoint-4689/trainer_state.json" @@ -0,0 +1,32856 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.998480850723595, + "eval_steps": 500, + "global_step": 4689, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006396418005916686, + "grad_norm": 115.30694580078125, + "learning_rate": 5e-06, + "loss": 6.4114, + "step": 1 + }, + { + "epoch": 0.0012792836011833373, + "grad_norm": 109.575439453125, + "learning_rate": 1e-05, + "loss": 5.8471, + "step": 2 + }, + { + "epoch": 0.001918925401775006, + "grad_norm": 64.81423950195312, + "learning_rate": 1.5e-05, + "loss": 4.3482, + "step": 3 + }, + { + "epoch": 0.0025585672023666745, + "grad_norm": 37.4569206237793, + "learning_rate": 2e-05, + "loss": 3.6161, + "step": 4 + }, + { + "epoch": 0.0031982090029583432, + "grad_norm": 29.528892517089844, + "learning_rate": 2.5e-05, + "loss": 2.6967, + "step": 5 + }, + { + "epoch": 0.003837850803550012, + "grad_norm": 25.729040145874023, + "learning_rate": 3e-05, + "loss": 2.6412, + "step": 6 + }, + { + "epoch": 0.004477492604141681, + "grad_norm": 21.95215606689453, + "learning_rate": 3.5e-05, + "loss": 2.2728, + "step": 7 + }, + { + "epoch": 0.005117134404733349, + "grad_norm": 17.429595947265625, + "learning_rate": 4e-05, + "loss": 1.8068, + "step": 8 + }, + { + "epoch": 0.005756776205325018, + "grad_norm": 13.558813095092773, + "learning_rate": 4.5e-05, + "loss": 1.7164, + "step": 9 + }, + { + "epoch": 0.0063964180059166865, + "grad_norm": 8.956698417663574, + "learning_rate": 5e-05, + "loss": 1.3478, + "step": 10 + }, + { + "epoch": 0.007036059806508356, + "grad_norm": 11.52576732635498, + "learning_rate": 4.99893139559735e-05, + "loss": 1.3521, + "step": 11 + }, + { + "epoch": 0.007675701607100024, + "grad_norm": 7.028160572052002, + "learning_rate": 4.9978627911947e-05, + "loss": 1.3142, + "step": 12 + }, + { + "epoch": 0.008315343407691692, + "grad_norm": 5.472947120666504, + "learning_rate": 4.99679418679205e-05, + "loss": 1.2399, + "step": 13 + }, + { + "epoch": 0.008954985208283361, + "grad_norm": 5.520193099975586, + "learning_rate": 4.995725582389399e-05, + "loss": 1.3414, + "step": 14 + }, + { + "epoch": 0.00959462700887503, + "grad_norm": 5.163415908813477, + "learning_rate": 4.99465697798675e-05, + "loss": 1.2096, + "step": 15 + }, + { + "epoch": 0.010234268809466698, + "grad_norm": 4.520339488983154, + "learning_rate": 4.993588373584099e-05, + "loss": 1.0653, + "step": 16 + }, + { + "epoch": 0.010873910610058367, + "grad_norm": 4.235777378082275, + "learning_rate": 4.992519769181449e-05, + "loss": 1.0698, + "step": 17 + }, + { + "epoch": 0.011513552410650036, + "grad_norm": 4.275694370269775, + "learning_rate": 4.991451164778799e-05, + "loss": 1.013, + "step": 18 + }, + { + "epoch": 0.012153194211241704, + "grad_norm": 4.225165843963623, + "learning_rate": 4.990382560376149e-05, + "loss": 1.0674, + "step": 19 + }, + { + "epoch": 0.012792836011833373, + "grad_norm": 4.273852348327637, + "learning_rate": 4.989313955973499e-05, + "loss": 0.9873, + "step": 20 + }, + { + "epoch": 0.013432477812425042, + "grad_norm": 4.532805442810059, + "learning_rate": 4.988245351570849e-05, + "loss": 1.0973, + "step": 21 + }, + { + "epoch": 0.014072119613016711, + "grad_norm": 4.303666114807129, + "learning_rate": 
4.987176747168199e-05, + "loss": 1.1173, + "step": 22 + }, + { + "epoch": 0.014711761413608379, + "grad_norm": 4.099974632263184, + "learning_rate": 4.986108142765548e-05, + "loss": 1.0408, + "step": 23 + }, + { + "epoch": 0.015351403214200048, + "grad_norm": 3.9215502738952637, + "learning_rate": 4.985039538362899e-05, + "loss": 0.9867, + "step": 24 + }, + { + "epoch": 0.015991045014791715, + "grad_norm": 3.848891019821167, + "learning_rate": 4.983970933960248e-05, + "loss": 1.0161, + "step": 25 + }, + { + "epoch": 0.016630686815383385, + "grad_norm": 4.045166969299316, + "learning_rate": 4.9829023295575985e-05, + "loss": 1.1399, + "step": 26 + }, + { + "epoch": 0.017270328615975054, + "grad_norm": 3.542635917663574, + "learning_rate": 4.981833725154948e-05, + "loss": 0.8839, + "step": 27 + }, + { + "epoch": 0.017909970416566723, + "grad_norm": 3.7698657512664795, + "learning_rate": 4.980765120752298e-05, + "loss": 0.9537, + "step": 28 + }, + { + "epoch": 0.018549612217158392, + "grad_norm": 4.563840389251709, + "learning_rate": 4.979696516349648e-05, + "loss": 1.116, + "step": 29 + }, + { + "epoch": 0.01918925401775006, + "grad_norm": 3.640054225921631, + "learning_rate": 4.9786279119469976e-05, + "loss": 0.9602, + "step": 30 + }, + { + "epoch": 0.01982889581834173, + "grad_norm": 3.8122613430023193, + "learning_rate": 4.977559307544347e-05, + "loss": 0.9392, + "step": 31 + }, + { + "epoch": 0.020468537618933396, + "grad_norm": 3.983224391937256, + "learning_rate": 4.9764907031416975e-05, + "loss": 1.0715, + "step": 32 + }, + { + "epoch": 0.021108179419525065, + "grad_norm": 3.8251636028289795, + "learning_rate": 4.975422098739047e-05, + "loss": 1.0091, + "step": 33 + }, + { + "epoch": 0.021747821220116734, + "grad_norm": 3.240780830383301, + "learning_rate": 4.974353494336397e-05, + "loss": 0.8361, + "step": 34 + }, + { + "epoch": 0.022387463020708404, + "grad_norm": 4.041069984436035, + "learning_rate": 4.973284889933747e-05, + "loss": 1.0791, + "step": 35 + }, + { + "epoch": 0.023027104821300073, + "grad_norm": 3.640153408050537, + "learning_rate": 4.9722162855310965e-05, + "loss": 0.9687, + "step": 36 + }, + { + "epoch": 0.023666746621891742, + "grad_norm": 3.730983018875122, + "learning_rate": 4.9711476811284465e-05, + "loss": 0.9859, + "step": 37 + }, + { + "epoch": 0.024306388422483408, + "grad_norm": 3.9427647590637207, + "learning_rate": 4.9700790767257964e-05, + "loss": 1.0425, + "step": 38 + }, + { + "epoch": 0.024946030223075077, + "grad_norm": 4.326479911804199, + "learning_rate": 4.9690104723231464e-05, + "loss": 1.0436, + "step": 39 + }, + { + "epoch": 0.025585672023666746, + "grad_norm": 3.6924359798431396, + "learning_rate": 4.9679418679204956e-05, + "loss": 0.9543, + "step": 40 + }, + { + "epoch": 0.026225313824258415, + "grad_norm": 3.9280359745025635, + "learning_rate": 4.966873263517846e-05, + "loss": 1.0221, + "step": 41 + }, + { + "epoch": 0.026864955624850084, + "grad_norm": 3.4215242862701416, + "learning_rate": 4.9658046591151955e-05, + "loss": 0.8821, + "step": 42 + }, + { + "epoch": 0.027504597425441753, + "grad_norm": 3.314079523086548, + "learning_rate": 4.9647360547125454e-05, + "loss": 0.9022, + "step": 43 + }, + { + "epoch": 0.028144239226033423, + "grad_norm": 3.8960483074188232, + "learning_rate": 4.9636674503098954e-05, + "loss": 1.0886, + "step": 44 + }, + { + "epoch": 0.02878388102662509, + "grad_norm": 3.6893134117126465, + "learning_rate": 4.962598845907245e-05, + "loss": 1.0148, + "step": 45 + }, + { + "epoch": 0.029423522827216757, + "grad_norm": 
3.6006226539611816, + "learning_rate": 4.961530241504595e-05, + "loss": 0.9509, + "step": 46 + }, + { + "epoch": 0.030063164627808427, + "grad_norm": 3.676569938659668, + "learning_rate": 4.960461637101945e-05, + "loss": 0.9786, + "step": 47 + }, + { + "epoch": 0.030702806428400096, + "grad_norm": 3.2735321521759033, + "learning_rate": 4.959393032699295e-05, + "loss": 0.9687, + "step": 48 + }, + { + "epoch": 0.031342448228991765, + "grad_norm": 3.3601109981536865, + "learning_rate": 4.9583244282966444e-05, + "loss": 0.9033, + "step": 49 + }, + { + "epoch": 0.03198209002958343, + "grad_norm": 3.4429924488067627, + "learning_rate": 4.957255823893995e-05, + "loss": 0.9358, + "step": 50 + }, + { + "epoch": 0.0326217318301751, + "grad_norm": 3.9405481815338135, + "learning_rate": 4.956187219491344e-05, + "loss": 0.8988, + "step": 51 + }, + { + "epoch": 0.03326137363076677, + "grad_norm": 3.5133135318756104, + "learning_rate": 4.955118615088694e-05, + "loss": 0.9658, + "step": 52 + }, + { + "epoch": 0.03390101543135844, + "grad_norm": 3.6065900325775146, + "learning_rate": 4.954050010686044e-05, + "loss": 0.93, + "step": 53 + }, + { + "epoch": 0.03454065723195011, + "grad_norm": 3.22421932220459, + "learning_rate": 4.952981406283394e-05, + "loss": 0.8686, + "step": 54 + }, + { + "epoch": 0.03518029903254178, + "grad_norm": 3.5975608825683594, + "learning_rate": 4.951912801880744e-05, + "loss": 0.9983, + "step": 55 + }, + { + "epoch": 0.035819940833133446, + "grad_norm": 3.5553078651428223, + "learning_rate": 4.950844197478094e-05, + "loss": 0.9925, + "step": 56 + }, + { + "epoch": 0.03645958263372511, + "grad_norm": 3.6313040256500244, + "learning_rate": 4.949775593075444e-05, + "loss": 0.9489, + "step": 57 + }, + { + "epoch": 0.037099224434316784, + "grad_norm": 3.73193621635437, + "learning_rate": 4.948706988672794e-05, + "loss": 0.8969, + "step": 58 + }, + { + "epoch": 0.03773886623490845, + "grad_norm": 3.727299213409424, + "learning_rate": 4.947638384270144e-05, + "loss": 0.9545, + "step": 59 + }, + { + "epoch": 0.03837850803550012, + "grad_norm": 3.72043776512146, + "learning_rate": 4.946569779867493e-05, + "loss": 1.0084, + "step": 60 + }, + { + "epoch": 0.03901814983609179, + "grad_norm": 3.5858588218688965, + "learning_rate": 4.9455011754648436e-05, + "loss": 1.0442, + "step": 61 + }, + { + "epoch": 0.03965779163668346, + "grad_norm": 3.367790699005127, + "learning_rate": 4.944432571062193e-05, + "loss": 0.9894, + "step": 62 + }, + { + "epoch": 0.040297433437275126, + "grad_norm": 3.3638293743133545, + "learning_rate": 4.943363966659543e-05, + "loss": 1.055, + "step": 63 + }, + { + "epoch": 0.04093707523786679, + "grad_norm": 3.0173652172088623, + "learning_rate": 4.942295362256893e-05, + "loss": 0.9459, + "step": 64 + }, + { + "epoch": 0.041576717038458465, + "grad_norm": 3.372331380844116, + "learning_rate": 4.941226757854243e-05, + "loss": 0.9825, + "step": 65 + }, + { + "epoch": 0.04221635883905013, + "grad_norm": 3.7092132568359375, + "learning_rate": 4.9401581534515926e-05, + "loss": 1.0245, + "step": 66 + }, + { + "epoch": 0.0428560006396418, + "grad_norm": 3.6751811504364014, + "learning_rate": 4.9390895490489425e-05, + "loss": 0.957, + "step": 67 + }, + { + "epoch": 0.04349564244023347, + "grad_norm": 3.696737051010132, + "learning_rate": 4.9380209446462925e-05, + "loss": 0.9626, + "step": 68 + }, + { + "epoch": 0.044135284240825134, + "grad_norm": 3.4946482181549072, + "learning_rate": 4.936952340243642e-05, + "loss": 0.9912, + "step": 69 + }, + { + "epoch": 
0.04477492604141681, + "grad_norm": 3.5095081329345703, + "learning_rate": 4.9358837358409923e-05, + "loss": 0.9987, + "step": 70 + }, + { + "epoch": 0.04541456784200847, + "grad_norm": 3.264599084854126, + "learning_rate": 4.9348151314383416e-05, + "loss": 0.9057, + "step": 71 + }, + { + "epoch": 0.046054209642600145, + "grad_norm": 3.213542938232422, + "learning_rate": 4.9337465270356915e-05, + "loss": 0.8873, + "step": 72 + }, + { + "epoch": 0.04669385144319181, + "grad_norm": 3.387552499771118, + "learning_rate": 4.9326779226330415e-05, + "loss": 0.9401, + "step": 73 + }, + { + "epoch": 0.047333493243783484, + "grad_norm": 3.8830254077911377, + "learning_rate": 4.9316093182303914e-05, + "loss": 1.1228, + "step": 74 + }, + { + "epoch": 0.04797313504437515, + "grad_norm": 3.1371631622314453, + "learning_rate": 4.930540713827741e-05, + "loss": 0.9813, + "step": 75 + }, + { + "epoch": 0.048612776844966815, + "grad_norm": 3.096027374267578, + "learning_rate": 4.929472109425091e-05, + "loss": 0.8039, + "step": 76 + }, + { + "epoch": 0.04925241864555849, + "grad_norm": 2.909639358520508, + "learning_rate": 4.9284035050224405e-05, + "loss": 0.8033, + "step": 77 + }, + { + "epoch": 0.049892060446150154, + "grad_norm": 3.4502925872802734, + "learning_rate": 4.9273349006197905e-05, + "loss": 0.9759, + "step": 78 + }, + { + "epoch": 0.050531702246741826, + "grad_norm": 2.8735854625701904, + "learning_rate": 4.9262662962171404e-05, + "loss": 0.893, + "step": 79 + }, + { + "epoch": 0.05117134404733349, + "grad_norm": 3.286386489868164, + "learning_rate": 4.9251976918144904e-05, + "loss": 0.8929, + "step": 80 + }, + { + "epoch": 0.051810985847925164, + "grad_norm": 3.041092872619629, + "learning_rate": 4.924129087411841e-05, + "loss": 0.8326, + "step": 81 + }, + { + "epoch": 0.05245062764851683, + "grad_norm": 3.4309558868408203, + "learning_rate": 4.92306048300919e-05, + "loss": 0.9463, + "step": 82 + }, + { + "epoch": 0.053090269449108496, + "grad_norm": 4.044023036956787, + "learning_rate": 4.92199187860654e-05, + "loss": 1.1622, + "step": 83 + }, + { + "epoch": 0.05372991124970017, + "grad_norm": 3.4805939197540283, + "learning_rate": 4.92092327420389e-05, + "loss": 0.9696, + "step": 84 + }, + { + "epoch": 0.054369553050291834, + "grad_norm": 3.2364139556884766, + "learning_rate": 4.91985466980124e-05, + "loss": 0.9124, + "step": 85 + }, + { + "epoch": 0.05500919485088351, + "grad_norm": 3.1523385047912598, + "learning_rate": 4.918786065398589e-05, + "loss": 0.8971, + "step": 86 + }, + { + "epoch": 0.05564883665147517, + "grad_norm": 3.268002510070801, + "learning_rate": 4.91771746099594e-05, + "loss": 0.9238, + "step": 87 + }, + { + "epoch": 0.056288478452066845, + "grad_norm": 2.9969351291656494, + "learning_rate": 4.916648856593289e-05, + "loss": 0.8588, + "step": 88 + }, + { + "epoch": 0.05692812025265851, + "grad_norm": 3.401979923248291, + "learning_rate": 4.915580252190639e-05, + "loss": 0.9757, + "step": 89 + }, + { + "epoch": 0.05756776205325018, + "grad_norm": 3.6688380241394043, + "learning_rate": 4.914511647787989e-05, + "loss": 0.9622, + "step": 90 + }, + { + "epoch": 0.05820740385384185, + "grad_norm": 2.9018354415893555, + "learning_rate": 4.913443043385339e-05, + "loss": 0.8216, + "step": 91 + }, + { + "epoch": 0.058847045654433515, + "grad_norm": 3.3676681518554688, + "learning_rate": 4.912374438982689e-05, + "loss": 0.883, + "step": 92 + }, + { + "epoch": 0.05948668745502519, + "grad_norm": 3.1586813926696777, + "learning_rate": 4.911305834580039e-05, + "loss": 0.9087, + "step": 
93 + }, + { + "epoch": 0.06012632925561685, + "grad_norm": 3.241666793823242, + "learning_rate": 4.910237230177389e-05, + "loss": 0.8813, + "step": 94 + }, + { + "epoch": 0.060765971056208526, + "grad_norm": 2.8825364112854004, + "learning_rate": 4.909168625774738e-05, + "loss": 0.809, + "step": 95 + }, + { + "epoch": 0.06140561285680019, + "grad_norm": 3.058403491973877, + "learning_rate": 4.9081000213720887e-05, + "loss": 0.8959, + "step": 96 + }, + { + "epoch": 0.06204525465739186, + "grad_norm": 3.4625747203826904, + "learning_rate": 4.907031416969438e-05, + "loss": 1.0556, + "step": 97 + }, + { + "epoch": 0.06268489645798353, + "grad_norm": 2.9675588607788086, + "learning_rate": 4.905962812566788e-05, + "loss": 0.8407, + "step": 98 + }, + { + "epoch": 0.0633245382585752, + "grad_norm": 3.3189573287963867, + "learning_rate": 4.904894208164138e-05, + "loss": 0.9563, + "step": 99 + }, + { + "epoch": 0.06396418005916686, + "grad_norm": 3.119657039642334, + "learning_rate": 4.903825603761488e-05, + "loss": 0.9348, + "step": 100 + }, + { + "epoch": 0.06460382185975853, + "grad_norm": 3.3534693717956543, + "learning_rate": 4.9027569993588377e-05, + "loss": 0.9842, + "step": 101 + }, + { + "epoch": 0.0652434636603502, + "grad_norm": 3.1526596546173096, + "learning_rate": 4.9016883949561876e-05, + "loss": 0.9038, + "step": 102 + }, + { + "epoch": 0.06588310546094188, + "grad_norm": 3.2262892723083496, + "learning_rate": 4.9006197905535375e-05, + "loss": 0.907, + "step": 103 + }, + { + "epoch": 0.06652274726153354, + "grad_norm": 3.2489101886749268, + "learning_rate": 4.899551186150887e-05, + "loss": 0.9427, + "step": 104 + }, + { + "epoch": 0.06716238906212521, + "grad_norm": 3.2974557876586914, + "learning_rate": 4.8984825817482374e-05, + "loss": 1.0098, + "step": 105 + }, + { + "epoch": 0.06780203086271688, + "grad_norm": 3.0968425273895264, + "learning_rate": 4.8974139773455867e-05, + "loss": 0.8744, + "step": 106 + }, + { + "epoch": 0.06844167266330854, + "grad_norm": 3.087939739227295, + "learning_rate": 4.8963453729429366e-05, + "loss": 0.9359, + "step": 107 + }, + { + "epoch": 0.06908131446390021, + "grad_norm": 3.2230167388916016, + "learning_rate": 4.8952767685402865e-05, + "loss": 0.9531, + "step": 108 + }, + { + "epoch": 0.06972095626449189, + "grad_norm": 3.247502088546753, + "learning_rate": 4.8942081641376365e-05, + "loss": 0.8503, + "step": 109 + }, + { + "epoch": 0.07036059806508356, + "grad_norm": 3.499711275100708, + "learning_rate": 4.8931395597349864e-05, + "loss": 0.9251, + "step": 110 + }, + { + "epoch": 0.07100023986567522, + "grad_norm": 3.1631062030792236, + "learning_rate": 4.8920709553323363e-05, + "loss": 0.9323, + "step": 111 + }, + { + "epoch": 0.07163988166626689, + "grad_norm": 3.4049339294433594, + "learning_rate": 4.891002350929686e-05, + "loss": 0.932, + "step": 112 + }, + { + "epoch": 0.07227952346685856, + "grad_norm": 3.1144092082977295, + "learning_rate": 4.889933746527036e-05, + "loss": 0.9352, + "step": 113 + }, + { + "epoch": 0.07291916526745022, + "grad_norm": 3.3719403743743896, + "learning_rate": 4.888865142124386e-05, + "loss": 0.9683, + "step": 114 + }, + { + "epoch": 0.0735588070680419, + "grad_norm": 3.163418769836426, + "learning_rate": 4.8877965377217354e-05, + "loss": 0.9329, + "step": 115 + }, + { + "epoch": 0.07419844886863357, + "grad_norm": 3.4448797702789307, + "learning_rate": 4.886727933319086e-05, + "loss": 0.9916, + "step": 116 + }, + { + "epoch": 0.07483809066922524, + "grad_norm": 3.127434253692627, + "learning_rate": 
4.885659328916435e-05, + "loss": 0.8456, + "step": 117 + }, + { + "epoch": 0.0754777324698169, + "grad_norm": 2.9916157722473145, + "learning_rate": 4.884590724513785e-05, + "loss": 0.9171, + "step": 118 + }, + { + "epoch": 0.07611737427040857, + "grad_norm": 3.6702394485473633, + "learning_rate": 4.883522120111135e-05, + "loss": 1.003, + "step": 119 + }, + { + "epoch": 0.07675701607100024, + "grad_norm": 3.3792991638183594, + "learning_rate": 4.882453515708485e-05, + "loss": 0.9366, + "step": 120 + }, + { + "epoch": 0.0773966578715919, + "grad_norm": 3.2419188022613525, + "learning_rate": 4.8813849113058343e-05, + "loss": 0.9857, + "step": 121 + }, + { + "epoch": 0.07803629967218358, + "grad_norm": 3.113842010498047, + "learning_rate": 4.880316306903185e-05, + "loss": 0.9532, + "step": 122 + }, + { + "epoch": 0.07867594147277525, + "grad_norm": 3.0012636184692383, + "learning_rate": 4.879247702500534e-05, + "loss": 0.9319, + "step": 123 + }, + { + "epoch": 0.07931558327336692, + "grad_norm": 3.0980920791625977, + "learning_rate": 4.878179098097884e-05, + "loss": 0.8973, + "step": 124 + }, + { + "epoch": 0.07995522507395858, + "grad_norm": 2.9623830318450928, + "learning_rate": 4.877110493695234e-05, + "loss": 0.8228, + "step": 125 + }, + { + "epoch": 0.08059486687455025, + "grad_norm": 2.924691915512085, + "learning_rate": 4.876041889292584e-05, + "loss": 0.8359, + "step": 126 + }, + { + "epoch": 0.08123450867514193, + "grad_norm": 3.207602024078369, + "learning_rate": 4.874973284889934e-05, + "loss": 0.9506, + "step": 127 + }, + { + "epoch": 0.08187415047573358, + "grad_norm": 3.3331074714660645, + "learning_rate": 4.873904680487284e-05, + "loss": 0.9785, + "step": 128 + }, + { + "epoch": 0.08251379227632526, + "grad_norm": 3.4897632598876953, + "learning_rate": 4.872836076084634e-05, + "loss": 0.9967, + "step": 129 + }, + { + "epoch": 0.08315343407691693, + "grad_norm": 3.305739402770996, + "learning_rate": 4.871767471681983e-05, + "loss": 0.9802, + "step": 130 + }, + { + "epoch": 0.0837930758775086, + "grad_norm": 3.3973934650421143, + "learning_rate": 4.870698867279334e-05, + "loss": 0.9837, + "step": 131 + }, + { + "epoch": 0.08443271767810026, + "grad_norm": 2.781184196472168, + "learning_rate": 4.869630262876683e-05, + "loss": 0.8409, + "step": 132 + }, + { + "epoch": 0.08507235947869193, + "grad_norm": 3.37329363822937, + "learning_rate": 4.868561658474033e-05, + "loss": 0.9242, + "step": 133 + }, + { + "epoch": 0.0857120012792836, + "grad_norm": 3.1330392360687256, + "learning_rate": 4.867493054071383e-05, + "loss": 0.9759, + "step": 134 + }, + { + "epoch": 0.08635164307987526, + "grad_norm": 2.9041836261749268, + "learning_rate": 4.866424449668733e-05, + "loss": 0.8898, + "step": 135 + }, + { + "epoch": 0.08699128488046694, + "grad_norm": 3.207829236984253, + "learning_rate": 4.865355845266083e-05, + "loss": 0.9794, + "step": 136 + }, + { + "epoch": 0.08763092668105861, + "grad_norm": 2.3934268951416016, + "learning_rate": 4.8642872408634326e-05, + "loss": 0.7443, + "step": 137 + }, + { + "epoch": 0.08827056848165027, + "grad_norm": 3.6512794494628906, + "learning_rate": 4.8632186364607826e-05, + "loss": 1.0446, + "step": 138 + }, + { + "epoch": 0.08891021028224194, + "grad_norm": 2.666923999786377, + "learning_rate": 4.8621500320581325e-05, + "loss": 0.7934, + "step": 139 + }, + { + "epoch": 0.08954985208283361, + "grad_norm": 3.061638832092285, + "learning_rate": 4.8610814276554825e-05, + "loss": 0.934, + "step": 140 + }, + { + "epoch": 0.09018949388342529, + "grad_norm": 
3.193471908569336, + "learning_rate": 4.860012823252832e-05, + "loss": 0.8959, + "step": 141 + }, + { + "epoch": 0.09082913568401695, + "grad_norm": 2.9615864753723145, + "learning_rate": 4.858944218850182e-05, + "loss": 0.8799, + "step": 142 + }, + { + "epoch": 0.09146877748460862, + "grad_norm": 3.2776529788970947, + "learning_rate": 4.8578756144475316e-05, + "loss": 0.9657, + "step": 143 + }, + { + "epoch": 0.09210841928520029, + "grad_norm": 2.8650059700012207, + "learning_rate": 4.8568070100448815e-05, + "loss": 0.8783, + "step": 144 + }, + { + "epoch": 0.09274806108579195, + "grad_norm": 3.1735057830810547, + "learning_rate": 4.8557384056422315e-05, + "loss": 0.893, + "step": 145 + }, + { + "epoch": 0.09338770288638362, + "grad_norm": 3.1530520915985107, + "learning_rate": 4.8546698012395814e-05, + "loss": 0.9654, + "step": 146 + }, + { + "epoch": 0.0940273446869753, + "grad_norm": 2.96382737159729, + "learning_rate": 4.853601196836931e-05, + "loss": 0.911, + "step": 147 + }, + { + "epoch": 0.09466698648756697, + "grad_norm": 2.9157848358154297, + "learning_rate": 4.852532592434281e-05, + "loss": 0.9214, + "step": 148 + }, + { + "epoch": 0.09530662828815863, + "grad_norm": 3.0061349868774414, + "learning_rate": 4.851463988031631e-05, + "loss": 0.8943, + "step": 149 + }, + { + "epoch": 0.0959462700887503, + "grad_norm": 2.904080629348755, + "learning_rate": 4.8503953836289805e-05, + "loss": 0.8833, + "step": 150 + }, + { + "epoch": 0.09658591188934197, + "grad_norm": 2.603848457336426, + "learning_rate": 4.849326779226331e-05, + "loss": 0.7851, + "step": 151 + }, + { + "epoch": 0.09722555368993363, + "grad_norm": 2.883955240249634, + "learning_rate": 4.84825817482368e-05, + "loss": 0.8886, + "step": 152 + }, + { + "epoch": 0.0978651954905253, + "grad_norm": 3.298178195953369, + "learning_rate": 4.84718957042103e-05, + "loss": 0.9739, + "step": 153 + }, + { + "epoch": 0.09850483729111698, + "grad_norm": 3.0739529132843018, + "learning_rate": 4.84612096601838e-05, + "loss": 0.9313, + "step": 154 + }, + { + "epoch": 0.09914447909170865, + "grad_norm": 2.864802598953247, + "learning_rate": 4.84505236161573e-05, + "loss": 0.8005, + "step": 155 + }, + { + "epoch": 0.09978412089230031, + "grad_norm": 3.0745465755462646, + "learning_rate": 4.8439837572130794e-05, + "loss": 0.9582, + "step": 156 + }, + { + "epoch": 0.10042376269289198, + "grad_norm": 2.968920946121216, + "learning_rate": 4.84291515281043e-05, + "loss": 0.9369, + "step": 157 + }, + { + "epoch": 0.10106340449348365, + "grad_norm": 3.003913164138794, + "learning_rate": 4.84184654840778e-05, + "loss": 0.8767, + "step": 158 + }, + { + "epoch": 0.10170304629407531, + "grad_norm": 3.1298813819885254, + "learning_rate": 4.840777944005129e-05, + "loss": 0.9916, + "step": 159 + }, + { + "epoch": 0.10234268809466698, + "grad_norm": 3.1826727390289307, + "learning_rate": 4.83970933960248e-05, + "loss": 0.8268, + "step": 160 + }, + { + "epoch": 0.10298232989525866, + "grad_norm": 2.9182395935058594, + "learning_rate": 4.838640735199829e-05, + "loss": 0.8493, + "step": 161 + }, + { + "epoch": 0.10362197169585033, + "grad_norm": 3.172877073287964, + "learning_rate": 4.837572130797179e-05, + "loss": 0.9514, + "step": 162 + }, + { + "epoch": 0.10426161349644199, + "grad_norm": 3.046769618988037, + "learning_rate": 4.836503526394529e-05, + "loss": 0.9571, + "step": 163 + }, + { + "epoch": 0.10490125529703366, + "grad_norm": 3.3287999629974365, + "learning_rate": 4.835434921991879e-05, + "loss": 0.809, + "step": 164 + }, + { + "epoch": 
0.10554089709762533, + "grad_norm": 2.8156962394714355, + "learning_rate": 4.834366317589228e-05, + "loss": 0.9603, + "step": 165 + }, + { + "epoch": 0.10618053889821699, + "grad_norm": 2.867738723754883, + "learning_rate": 4.833297713186579e-05, + "loss": 0.8771, + "step": 166 + }, + { + "epoch": 0.10682018069880866, + "grad_norm": 3.0130057334899902, + "learning_rate": 4.832229108783928e-05, + "loss": 0.9795, + "step": 167 + }, + { + "epoch": 0.10745982249940034, + "grad_norm": 3.038479804992676, + "learning_rate": 4.8311605043812786e-05, + "loss": 1.0077, + "step": 168 + }, + { + "epoch": 0.10809946429999201, + "grad_norm": 2.715528726577759, + "learning_rate": 4.830091899978628e-05, + "loss": 0.8767, + "step": 169 + }, + { + "epoch": 0.10873910610058367, + "grad_norm": 2.719451427459717, + "learning_rate": 4.829023295575978e-05, + "loss": 0.8893, + "step": 170 + }, + { + "epoch": 0.10937874790117534, + "grad_norm": 3.0025105476379395, + "learning_rate": 4.827954691173328e-05, + "loss": 0.8376, + "step": 171 + }, + { + "epoch": 0.11001838970176701, + "grad_norm": 2.7460434436798096, + "learning_rate": 4.826886086770678e-05, + "loss": 0.8464, + "step": 172 + }, + { + "epoch": 0.11065803150235867, + "grad_norm": 2.9328606128692627, + "learning_rate": 4.8258174823680276e-05, + "loss": 0.8323, + "step": 173 + }, + { + "epoch": 0.11129767330295035, + "grad_norm": 2.961212158203125, + "learning_rate": 4.8247488779653776e-05, + "loss": 0.9018, + "step": 174 + }, + { + "epoch": 0.11193731510354202, + "grad_norm": 2.966740846633911, + "learning_rate": 4.8236802735627275e-05, + "loss": 0.8428, + "step": 175 + }, + { + "epoch": 0.11257695690413369, + "grad_norm": 3.171058177947998, + "learning_rate": 4.822611669160077e-05, + "loss": 0.9487, + "step": 176 + }, + { + "epoch": 0.11321659870472535, + "grad_norm": 2.882718563079834, + "learning_rate": 4.8215430647574274e-05, + "loss": 0.9581, + "step": 177 + }, + { + "epoch": 0.11385624050531702, + "grad_norm": 2.833556890487671, + "learning_rate": 4.8204744603547766e-05, + "loss": 0.8506, + "step": 178 + }, + { + "epoch": 0.1144958823059087, + "grad_norm": 2.9089512825012207, + "learning_rate": 4.8194058559521266e-05, + "loss": 0.9213, + "step": 179 + }, + { + "epoch": 0.11513552410650035, + "grad_norm": 3.0946218967437744, + "learning_rate": 4.8183372515494765e-05, + "loss": 0.8109, + "step": 180 + }, + { + "epoch": 0.11577516590709203, + "grad_norm": 2.8648128509521484, + "learning_rate": 4.8172686471468265e-05, + "loss": 0.8754, + "step": 181 + }, + { + "epoch": 0.1164148077076837, + "grad_norm": 2.9012584686279297, + "learning_rate": 4.8162000427441764e-05, + "loss": 0.8946, + "step": 182 + }, + { + "epoch": 0.11705444950827537, + "grad_norm": 3.00886869430542, + "learning_rate": 4.815131438341526e-05, + "loss": 1.0621, + "step": 183 + }, + { + "epoch": 0.11769409130886703, + "grad_norm": 2.6910293102264404, + "learning_rate": 4.814062833938876e-05, + "loss": 0.8765, + "step": 184 + }, + { + "epoch": 0.1183337331094587, + "grad_norm": 2.775951862335205, + "learning_rate": 4.8129942295362255e-05, + "loss": 0.8297, + "step": 185 + }, + { + "epoch": 0.11897337491005038, + "grad_norm": 2.6360793113708496, + "learning_rate": 4.811925625133576e-05, + "loss": 0.8422, + "step": 186 + }, + { + "epoch": 0.11961301671064203, + "grad_norm": 2.991544485092163, + "learning_rate": 4.8108570207309254e-05, + "loss": 0.9109, + "step": 187 + }, + { + "epoch": 0.1202526585112337, + "grad_norm": 3.1472723484039307, + "learning_rate": 4.809788416328275e-05, + "loss": 
1.0306, + "step": 188 + }, + { + "epoch": 0.12089230031182538, + "grad_norm": 2.992020845413208, + "learning_rate": 4.808719811925625e-05, + "loss": 0.9407, + "step": 189 + }, + { + "epoch": 0.12153194211241705, + "grad_norm": 2.8963804244995117, + "learning_rate": 4.807651207522975e-05, + "loss": 0.8547, + "step": 190 + }, + { + "epoch": 0.12217158391300871, + "grad_norm": 3.120312452316284, + "learning_rate": 4.806582603120325e-05, + "loss": 1.0028, + "step": 191 + }, + { + "epoch": 0.12281122571360038, + "grad_norm": 3.0206565856933594, + "learning_rate": 4.805513998717675e-05, + "loss": 0.878, + "step": 192 + }, + { + "epoch": 0.12345086751419206, + "grad_norm": 2.732942819595337, + "learning_rate": 4.804445394315025e-05, + "loss": 0.8709, + "step": 193 + }, + { + "epoch": 0.12409050931478371, + "grad_norm": 2.938957691192627, + "learning_rate": 4.803376789912375e-05, + "loss": 0.965, + "step": 194 + }, + { + "epoch": 0.12473015111537539, + "grad_norm": 3.0302512645721436, + "learning_rate": 4.802308185509725e-05, + "loss": 0.936, + "step": 195 + }, + { + "epoch": 0.12536979291596706, + "grad_norm": 2.846325159072876, + "learning_rate": 4.801239581107074e-05, + "loss": 0.7807, + "step": 196 + }, + { + "epoch": 0.12600943471655873, + "grad_norm": 3.189849376678467, + "learning_rate": 4.800170976704425e-05, + "loss": 0.9737, + "step": 197 + }, + { + "epoch": 0.1266490765171504, + "grad_norm": 2.9430770874023438, + "learning_rate": 4.799102372301774e-05, + "loss": 0.8094, + "step": 198 + }, + { + "epoch": 0.12728871831774208, + "grad_norm": 2.7001142501831055, + "learning_rate": 4.798033767899124e-05, + "loss": 0.8156, + "step": 199 + }, + { + "epoch": 0.12792836011833372, + "grad_norm": 2.87261962890625, + "learning_rate": 4.796965163496474e-05, + "loss": 0.8413, + "step": 200 + }, + { + "epoch": 0.1285680019189254, + "grad_norm": 2.9979734420776367, + "learning_rate": 4.795896559093824e-05, + "loss": 0.8899, + "step": 201 + }, + { + "epoch": 0.12920764371951707, + "grad_norm": 3.017146110534668, + "learning_rate": 4.794827954691173e-05, + "loss": 0.9235, + "step": 202 + }, + { + "epoch": 0.12984728552010874, + "grad_norm": 2.893805742263794, + "learning_rate": 4.793759350288524e-05, + "loss": 0.8124, + "step": 203 + }, + { + "epoch": 0.1304869273207004, + "grad_norm": 3.0101611614227295, + "learning_rate": 4.7926907458858736e-05, + "loss": 0.8679, + "step": 204 + }, + { + "epoch": 0.13112656912129209, + "grad_norm": 2.552396535873413, + "learning_rate": 4.791622141483223e-05, + "loss": 0.7325, + "step": 205 + }, + { + "epoch": 0.13176621092188376, + "grad_norm": 3.1043059825897217, + "learning_rate": 4.7905535370805735e-05, + "loss": 0.8962, + "step": 206 + }, + { + "epoch": 0.1324058527224754, + "grad_norm": 3.2721590995788574, + "learning_rate": 4.789484932677923e-05, + "loss": 0.9915, + "step": 207 + }, + { + "epoch": 0.13304549452306708, + "grad_norm": 3.175801992416382, + "learning_rate": 4.788416328275273e-05, + "loss": 0.9229, + "step": 208 + }, + { + "epoch": 0.13368513632365875, + "grad_norm": 2.9646801948547363, + "learning_rate": 4.7873477238726226e-05, + "loss": 0.7331, + "step": 209 + }, + { + "epoch": 0.13432477812425042, + "grad_norm": 3.0931332111358643, + "learning_rate": 4.7862791194699726e-05, + "loss": 0.8776, + "step": 210 + }, + { + "epoch": 0.1349644199248421, + "grad_norm": 3.100283622741699, + "learning_rate": 4.785210515067322e-05, + "loss": 0.9053, + "step": 211 + }, + { + "epoch": 0.13560406172543377, + "grad_norm": 3.1369335651397705, + "learning_rate": 
4.7841419106646724e-05, + "loss": 0.9115, + "step": 212 + }, + { + "epoch": 0.13624370352602544, + "grad_norm": 2.7910146713256836, + "learning_rate": 4.783073306262022e-05, + "loss": 0.8834, + "step": 213 + }, + { + "epoch": 0.13688334532661708, + "grad_norm": 2.5720582008361816, + "learning_rate": 4.7820047018593716e-05, + "loss": 0.7837, + "step": 214 + }, + { + "epoch": 0.13752298712720876, + "grad_norm": 2.730604410171509, + "learning_rate": 4.7809360974567216e-05, + "loss": 0.9334, + "step": 215 + }, + { + "epoch": 0.13816262892780043, + "grad_norm": 2.7905635833740234, + "learning_rate": 4.7798674930540715e-05, + "loss": 0.7764, + "step": 216 + }, + { + "epoch": 0.1388022707283921, + "grad_norm": 2.859790325164795, + "learning_rate": 4.7787988886514214e-05, + "loss": 0.9586, + "step": 217 + }, + { + "epoch": 0.13944191252898377, + "grad_norm": 2.570192575454712, + "learning_rate": 4.7777302842487714e-05, + "loss": 0.7886, + "step": 218 + }, + { + "epoch": 0.14008155432957545, + "grad_norm": 2.871694326400757, + "learning_rate": 4.776661679846121e-05, + "loss": 0.9145, + "step": 219 + }, + { + "epoch": 0.14072119613016712, + "grad_norm": 2.7373812198638916, + "learning_rate": 4.7755930754434706e-05, + "loss": 0.8752, + "step": 220 + }, + { + "epoch": 0.14136083793075876, + "grad_norm": 3.17299747467041, + "learning_rate": 4.774524471040821e-05, + "loss": 0.9535, + "step": 221 + }, + { + "epoch": 0.14200047973135044, + "grad_norm": 3.0749080181121826, + "learning_rate": 4.7734558666381704e-05, + "loss": 0.8973, + "step": 222 + }, + { + "epoch": 0.1426401215319421, + "grad_norm": 3.2594897747039795, + "learning_rate": 4.772387262235521e-05, + "loss": 0.887, + "step": 223 + }, + { + "epoch": 0.14327976333253378, + "grad_norm": 3.124821424484253, + "learning_rate": 4.77131865783287e-05, + "loss": 0.8867, + "step": 224 + }, + { + "epoch": 0.14391940513312546, + "grad_norm": 2.854959726333618, + "learning_rate": 4.77025005343022e-05, + "loss": 0.8612, + "step": 225 + }, + { + "epoch": 0.14455904693371713, + "grad_norm": 2.895678758621216, + "learning_rate": 4.76918144902757e-05, + "loss": 0.82, + "step": 226 + }, + { + "epoch": 0.1451986887343088, + "grad_norm": 2.8061985969543457, + "learning_rate": 4.76811284462492e-05, + "loss": 0.8399, + "step": 227 + }, + { + "epoch": 0.14583833053490045, + "grad_norm": 3.1230058670043945, + "learning_rate": 4.76704424022227e-05, + "loss": 0.8948, + "step": 228 + }, + { + "epoch": 0.14647797233549212, + "grad_norm": 3.0216288566589355, + "learning_rate": 4.76597563581962e-05, + "loss": 0.9485, + "step": 229 + }, + { + "epoch": 0.1471176141360838, + "grad_norm": 2.8085646629333496, + "learning_rate": 4.76490703141697e-05, + "loss": 0.8112, + "step": 230 + }, + { + "epoch": 0.14775725593667546, + "grad_norm": 3.0322203636169434, + "learning_rate": 4.763838427014319e-05, + "loss": 0.9738, + "step": 231 + }, + { + "epoch": 0.14839689773726714, + "grad_norm": 2.917573928833008, + "learning_rate": 4.76276982261167e-05, + "loss": 0.9267, + "step": 232 + }, + { + "epoch": 0.1490365395378588, + "grad_norm": 3.0448379516601562, + "learning_rate": 4.761701218209019e-05, + "loss": 0.8845, + "step": 233 + }, + { + "epoch": 0.14967618133845048, + "grad_norm": 2.8761885166168213, + "learning_rate": 4.760632613806369e-05, + "loss": 0.8912, + "step": 234 + }, + { + "epoch": 0.15031582313904213, + "grad_norm": 2.9035568237304688, + "learning_rate": 4.759564009403719e-05, + "loss": 1.0309, + "step": 235 + }, + { + "epoch": 0.1509554649396338, + "grad_norm": 
2.924506187438965, + "learning_rate": 4.758495405001069e-05, + "loss": 0.8586, + "step": 236 + }, + { + "epoch": 0.15159510674022547, + "grad_norm": 2.5669422149658203, + "learning_rate": 4.757426800598419e-05, + "loss": 0.7553, + "step": 237 + }, + { + "epoch": 0.15223474854081714, + "grad_norm": 2.7570383548736572, + "learning_rate": 4.756358196195769e-05, + "loss": 0.8244, + "step": 238 + }, + { + "epoch": 0.15287439034140882, + "grad_norm": 2.893439531326294, + "learning_rate": 4.755289591793119e-05, + "loss": 0.807, + "step": 239 + }, + { + "epoch": 0.1535140321420005, + "grad_norm": 2.4479477405548096, + "learning_rate": 4.754220987390468e-05, + "loss": 0.727, + "step": 240 + }, + { + "epoch": 0.15415367394259216, + "grad_norm": 3.0554516315460205, + "learning_rate": 4.7531523829878186e-05, + "loss": 0.9386, + "step": 241 + }, + { + "epoch": 0.1547933157431838, + "grad_norm": 2.8616034984588623, + "learning_rate": 4.752083778585168e-05, + "loss": 0.8254, + "step": 242 + }, + { + "epoch": 0.15543295754377548, + "grad_norm": 3.0332019329071045, + "learning_rate": 4.751015174182518e-05, + "loss": 0.841, + "step": 243 + }, + { + "epoch": 0.15607259934436715, + "grad_norm": 3.25240421295166, + "learning_rate": 4.749946569779868e-05, + "loss": 0.9434, + "step": 244 + }, + { + "epoch": 0.15671224114495882, + "grad_norm": 2.9794118404388428, + "learning_rate": 4.7488779653772176e-05, + "loss": 0.7924, + "step": 245 + }, + { + "epoch": 0.1573518829455505, + "grad_norm": 3.0838968753814697, + "learning_rate": 4.747809360974567e-05, + "loss": 0.9092, + "step": 246 + }, + { + "epoch": 0.15799152474614217, + "grad_norm": 3.1214678287506104, + "learning_rate": 4.7467407565719175e-05, + "loss": 0.8556, + "step": 247 + }, + { + "epoch": 0.15863116654673384, + "grad_norm": 3.2549057006835938, + "learning_rate": 4.745672152169267e-05, + "loss": 0.8633, + "step": 248 + }, + { + "epoch": 0.1592708083473255, + "grad_norm": 2.9489269256591797, + "learning_rate": 4.744603547766617e-05, + "loss": 0.8536, + "step": 249 + }, + { + "epoch": 0.15991045014791716, + "grad_norm": 2.9182708263397217, + "learning_rate": 4.743534943363967e-05, + "loss": 0.8327, + "step": 250 + }, + { + "epoch": 0.16055009194850883, + "grad_norm": 2.681732654571533, + "learning_rate": 4.7424663389613166e-05, + "loss": 0.8328, + "step": 251 + }, + { + "epoch": 0.1611897337491005, + "grad_norm": 2.963179349899292, + "learning_rate": 4.741397734558667e-05, + "loss": 0.915, + "step": 252 + }, + { + "epoch": 0.16182937554969218, + "grad_norm": 2.7239153385162354, + "learning_rate": 4.7403291301560164e-05, + "loss": 0.802, + "step": 253 + }, + { + "epoch": 0.16246901735028385, + "grad_norm": 2.6116347312927246, + "learning_rate": 4.7392605257533664e-05, + "loss": 0.8365, + "step": 254 + }, + { + "epoch": 0.16310865915087552, + "grad_norm": 2.9473235607147217, + "learning_rate": 4.738191921350716e-05, + "loss": 0.8955, + "step": 255 + }, + { + "epoch": 0.16374830095146717, + "grad_norm": 2.610490083694458, + "learning_rate": 4.737123316948066e-05, + "loss": 0.8179, + "step": 256 + }, + { + "epoch": 0.16438794275205884, + "grad_norm": 2.599161148071289, + "learning_rate": 4.7360547125454155e-05, + "loss": 0.8483, + "step": 257 + }, + { + "epoch": 0.1650275845526505, + "grad_norm": 2.55717396736145, + "learning_rate": 4.734986108142766e-05, + "loss": 0.847, + "step": 258 + }, + { + "epoch": 0.1656672263532422, + "grad_norm": 2.6633405685424805, + "learning_rate": 4.7339175037401154e-05, + "loss": 0.8278, + "step": 259 + }, + { + "epoch": 
0.16630686815383386, + "grad_norm": 2.5367047786712646, + "learning_rate": 4.732848899337465e-05, + "loss": 0.8217, + "step": 260 + }, + { + "epoch": 0.16694650995442553, + "grad_norm": 2.723659038543701, + "learning_rate": 4.731780294934815e-05, + "loss": 0.87, + "step": 261 + }, + { + "epoch": 0.1675861517550172, + "grad_norm": 3.103633403778076, + "learning_rate": 4.730711690532165e-05, + "loss": 1.0308, + "step": 262 + }, + { + "epoch": 0.16822579355560885, + "grad_norm": 2.508518695831299, + "learning_rate": 4.729643086129515e-05, + "loss": 0.8472, + "step": 263 + }, + { + "epoch": 0.16886543535620052, + "grad_norm": 2.7026095390319824, + "learning_rate": 4.728574481726865e-05, + "loss": 0.8161, + "step": 264 + }, + { + "epoch": 0.1695050771567922, + "grad_norm": 2.884580373764038, + "learning_rate": 4.727505877324215e-05, + "loss": 0.8383, + "step": 265 + }, + { + "epoch": 0.17014471895738387, + "grad_norm": 3.5235071182250977, + "learning_rate": 4.726437272921564e-05, + "loss": 0.9077, + "step": 266 + }, + { + "epoch": 0.17078436075797554, + "grad_norm": 2.661553144454956, + "learning_rate": 4.725368668518915e-05, + "loss": 0.7792, + "step": 267 + }, + { + "epoch": 0.1714240025585672, + "grad_norm": 2.683823347091675, + "learning_rate": 4.724300064116264e-05, + "loss": 0.771, + "step": 268 + }, + { + "epoch": 0.17206364435915888, + "grad_norm": 2.753601312637329, + "learning_rate": 4.723231459713614e-05, + "loss": 0.7982, + "step": 269 + }, + { + "epoch": 0.17270328615975053, + "grad_norm": 3.1290347576141357, + "learning_rate": 4.722162855310964e-05, + "loss": 0.8601, + "step": 270 + }, + { + "epoch": 0.1733429279603422, + "grad_norm": 2.8989670276641846, + "learning_rate": 4.721094250908314e-05, + "loss": 0.8535, + "step": 271 + }, + { + "epoch": 0.17398256976093388, + "grad_norm": 2.718829393386841, + "learning_rate": 4.720025646505664e-05, + "loss": 0.8542, + "step": 272 + }, + { + "epoch": 0.17462221156152555, + "grad_norm": 2.7551136016845703, + "learning_rate": 4.718957042103014e-05, + "loss": 0.7925, + "step": 273 + }, + { + "epoch": 0.17526185336211722, + "grad_norm": 2.7832024097442627, + "learning_rate": 4.717888437700364e-05, + "loss": 0.7897, + "step": 274 + }, + { + "epoch": 0.1759014951627089, + "grad_norm": 3.1213130950927734, + "learning_rate": 4.716819833297713e-05, + "loss": 0.8246, + "step": 275 + }, + { + "epoch": 0.17654113696330054, + "grad_norm": 3.139709949493408, + "learning_rate": 4.7157512288950636e-05, + "loss": 0.8428, + "step": 276 + }, + { + "epoch": 0.1771807787638922, + "grad_norm": 3.0290613174438477, + "learning_rate": 4.714682624492413e-05, + "loss": 0.9477, + "step": 277 + }, + { + "epoch": 0.17782042056448388, + "grad_norm": 2.9898009300231934, + "learning_rate": 4.7136140200897635e-05, + "loss": 0.8169, + "step": 278 + }, + { + "epoch": 0.17846006236507556, + "grad_norm": 2.819793462753296, + "learning_rate": 4.712545415687113e-05, + "loss": 0.932, + "step": 279 + }, + { + "epoch": 0.17909970416566723, + "grad_norm": 2.7697415351867676, + "learning_rate": 4.711476811284463e-05, + "loss": 0.8292, + "step": 280 + }, + { + "epoch": 0.1797393459662589, + "grad_norm": 2.452153205871582, + "learning_rate": 4.7104082068818126e-05, + "loss": 0.7456, + "step": 281 + }, + { + "epoch": 0.18037898776685057, + "grad_norm": 2.829197406768799, + "learning_rate": 4.7093396024791626e-05, + "loss": 0.9367, + "step": 282 + }, + { + "epoch": 0.18101862956744222, + "grad_norm": 2.7254505157470703, + "learning_rate": 4.7082709980765125e-05, + "loss": 0.9341, + 
"step": 283 + }, + { + "epoch": 0.1816582713680339, + "grad_norm": 2.5844085216522217, + "learning_rate": 4.7072023936738624e-05, + "loss": 0.7766, + "step": 284 + }, + { + "epoch": 0.18229791316862556, + "grad_norm": 2.2345259189605713, + "learning_rate": 4.7061337892712124e-05, + "loss": 0.6529, + "step": 285 + }, + { + "epoch": 0.18293755496921724, + "grad_norm": 2.6041507720947266, + "learning_rate": 4.7050651848685616e-05, + "loss": 0.8856, + "step": 286 + }, + { + "epoch": 0.1835771967698089, + "grad_norm": 2.6847994327545166, + "learning_rate": 4.703996580465912e-05, + "loss": 0.8321, + "step": 287 + }, + { + "epoch": 0.18421683857040058, + "grad_norm": 3.025704860687256, + "learning_rate": 4.7029279760632615e-05, + "loss": 0.783, + "step": 288 + }, + { + "epoch": 0.18485648037099225, + "grad_norm": 2.742975950241089, + "learning_rate": 4.7018593716606114e-05, + "loss": 0.9032, + "step": 289 + }, + { + "epoch": 0.1854961221715839, + "grad_norm": 2.452404737472534, + "learning_rate": 4.7007907672579614e-05, + "loss": 0.8441, + "step": 290 + }, + { + "epoch": 0.18613576397217557, + "grad_norm": 2.590944528579712, + "learning_rate": 4.699722162855311e-05, + "loss": 0.8323, + "step": 291 + }, + { + "epoch": 0.18677540577276724, + "grad_norm": 2.843992233276367, + "learning_rate": 4.6986535584526606e-05, + "loss": 0.8646, + "step": 292 + }, + { + "epoch": 0.18741504757335892, + "grad_norm": 2.8486318588256836, + "learning_rate": 4.697584954050011e-05, + "loss": 0.8576, + "step": 293 + }, + { + "epoch": 0.1880546893739506, + "grad_norm": 2.80191969871521, + "learning_rate": 4.6965163496473604e-05, + "loss": 0.8727, + "step": 294 + }, + { + "epoch": 0.18869433117454226, + "grad_norm": 3.0136806964874268, + "learning_rate": 4.6954477452447104e-05, + "loss": 0.8943, + "step": 295 + }, + { + "epoch": 0.18933397297513394, + "grad_norm": 2.673966407775879, + "learning_rate": 4.694379140842061e-05, + "loss": 0.805, + "step": 296 + }, + { + "epoch": 0.18997361477572558, + "grad_norm": 2.6993069648742676, + "learning_rate": 4.69331053643941e-05, + "loss": 0.8435, + "step": 297 + }, + { + "epoch": 0.19061325657631725, + "grad_norm": 2.735931634902954, + "learning_rate": 4.69224193203676e-05, + "loss": 0.894, + "step": 298 + }, + { + "epoch": 0.19125289837690893, + "grad_norm": 2.6338067054748535, + "learning_rate": 4.69117332763411e-05, + "loss": 0.8343, + "step": 299 + }, + { + "epoch": 0.1918925401775006, + "grad_norm": 2.6077582836151123, + "learning_rate": 4.69010472323146e-05, + "loss": 0.8534, + "step": 300 + }, + { + "epoch": 0.19253218197809227, + "grad_norm": 2.636435031890869, + "learning_rate": 4.689036118828809e-05, + "loss": 0.897, + "step": 301 + }, + { + "epoch": 0.19317182377868394, + "grad_norm": 2.7182765007019043, + "learning_rate": 4.68796751442616e-05, + "loss": 0.8465, + "step": 302 + }, + { + "epoch": 0.19381146557927562, + "grad_norm": 2.917398452758789, + "learning_rate": 4.686898910023509e-05, + "loss": 0.8881, + "step": 303 + }, + { + "epoch": 0.19445110737986726, + "grad_norm": 2.8616085052490234, + "learning_rate": 4.685830305620859e-05, + "loss": 0.8546, + "step": 304 + }, + { + "epoch": 0.19509074918045893, + "grad_norm": 2.760434150695801, + "learning_rate": 4.684761701218209e-05, + "loss": 0.9518, + "step": 305 + }, + { + "epoch": 0.1957303909810506, + "grad_norm": 2.4656758308410645, + "learning_rate": 4.683693096815559e-05, + "loss": 0.7429, + "step": 306 + }, + { + "epoch": 0.19637003278164228, + "grad_norm": 2.6935067176818848, + "learning_rate": 
4.682624492412909e-05, + "loss": 0.8073, + "step": 307 + }, + { + "epoch": 0.19700967458223395, + "grad_norm": 2.8546817302703857, + "learning_rate": 4.681555888010259e-05, + "loss": 0.9638, + "step": 308 + }, + { + "epoch": 0.19764931638282562, + "grad_norm": 2.565056562423706, + "learning_rate": 4.680487283607609e-05, + "loss": 0.8272, + "step": 309 + }, + { + "epoch": 0.1982889581834173, + "grad_norm": 2.670307159423828, + "learning_rate": 4.679418679204959e-05, + "loss": 0.8595, + "step": 310 + }, + { + "epoch": 0.19892859998400894, + "grad_norm": 2.8646392822265625, + "learning_rate": 4.678350074802309e-05, + "loss": 0.924, + "step": 311 + }, + { + "epoch": 0.19956824178460061, + "grad_norm": 2.329155683517456, + "learning_rate": 4.677281470399658e-05, + "loss": 0.7529, + "step": 312 + }, + { + "epoch": 0.2002078835851923, + "grad_norm": 2.937229633331299, + "learning_rate": 4.6762128659970085e-05, + "loss": 0.8528, + "step": 313 + }, + { + "epoch": 0.20084752538578396, + "grad_norm": 3.061223030090332, + "learning_rate": 4.675144261594358e-05, + "loss": 0.9067, + "step": 314 + }, + { + "epoch": 0.20148716718637563, + "grad_norm": 2.890824794769287, + "learning_rate": 4.674075657191708e-05, + "loss": 0.9379, + "step": 315 + }, + { + "epoch": 0.2021268089869673, + "grad_norm": 2.474018096923828, + "learning_rate": 4.673007052789058e-05, + "loss": 0.7366, + "step": 316 + }, + { + "epoch": 0.20276645078755898, + "grad_norm": 2.6298065185546875, + "learning_rate": 4.6719384483864076e-05, + "loss": 0.7878, + "step": 317 + }, + { + "epoch": 0.20340609258815062, + "grad_norm": 2.8355021476745605, + "learning_rate": 4.6708698439837575e-05, + "loss": 0.7836, + "step": 318 + }, + { + "epoch": 0.2040457343887423, + "grad_norm": 2.437777280807495, + "learning_rate": 4.6698012395811075e-05, + "loss": 0.6924, + "step": 319 + }, + { + "epoch": 0.20468537618933397, + "grad_norm": 3.3331644535064697, + "learning_rate": 4.6687326351784574e-05, + "loss": 1.021, + "step": 320 + }, + { + "epoch": 0.20532501798992564, + "grad_norm": 2.9480056762695312, + "learning_rate": 4.667664030775807e-05, + "loss": 0.845, + "step": 321 + }, + { + "epoch": 0.2059646597905173, + "grad_norm": 2.711811065673828, + "learning_rate": 4.666595426373157e-05, + "loss": 0.7977, + "step": 322 + }, + { + "epoch": 0.20660430159110899, + "grad_norm": 2.9610745906829834, + "learning_rate": 4.6655268219705066e-05, + "loss": 0.8494, + "step": 323 + }, + { + "epoch": 0.20724394339170066, + "grad_norm": 2.647684097290039, + "learning_rate": 4.6644582175678565e-05, + "loss": 0.8241, + "step": 324 + }, + { + "epoch": 0.2078835851922923, + "grad_norm": 2.822852611541748, + "learning_rate": 4.6633896131652064e-05, + "loss": 0.8214, + "step": 325 + }, + { + "epoch": 0.20852322699288398, + "grad_norm": 2.310638427734375, + "learning_rate": 4.6623210087625564e-05, + "loss": 0.7653, + "step": 326 + }, + { + "epoch": 0.20916286879347565, + "grad_norm": 2.8434946537017822, + "learning_rate": 4.6612524043599056e-05, + "loss": 0.9074, + "step": 327 + }, + { + "epoch": 0.20980251059406732, + "grad_norm": 2.6273133754730225, + "learning_rate": 4.660183799957256e-05, + "loss": 0.8539, + "step": 328 + }, + { + "epoch": 0.210442152394659, + "grad_norm": 2.555723190307617, + "learning_rate": 4.659115195554606e-05, + "loss": 0.7931, + "step": 329 + }, + { + "epoch": 0.21108179419525067, + "grad_norm": 2.832242965698242, + "learning_rate": 4.6580465911519554e-05, + "loss": 0.9129, + "step": 330 + }, + { + "epoch": 0.21172143599584234, + "grad_norm": 
2.4606082439422607, + "learning_rate": 4.656977986749306e-05, + "loss": 0.719, + "step": 331 + }, + { + "epoch": 0.21236107779643398, + "grad_norm": 2.9017698764801025, + "learning_rate": 4.655909382346655e-05, + "loss": 0.8765, + "step": 332 + }, + { + "epoch": 0.21300071959702566, + "grad_norm": 2.6410350799560547, + "learning_rate": 4.654840777944005e-05, + "loss": 0.7867, + "step": 333 + }, + { + "epoch": 0.21364036139761733, + "grad_norm": 2.5876753330230713, + "learning_rate": 4.653772173541355e-05, + "loss": 0.7032, + "step": 334 + }, + { + "epoch": 0.214280003198209, + "grad_norm": 2.436483860015869, + "learning_rate": 4.652703569138705e-05, + "loss": 0.784, + "step": 335 + }, + { + "epoch": 0.21491964499880067, + "grad_norm": 3.1525931358337402, + "learning_rate": 4.651634964736055e-05, + "loss": 0.912, + "step": 336 + }, + { + "epoch": 0.21555928679939235, + "grad_norm": 3.039365530014038, + "learning_rate": 4.650566360333405e-05, + "loss": 0.9294, + "step": 337 + }, + { + "epoch": 0.21619892859998402, + "grad_norm": 3.1444387435913086, + "learning_rate": 4.649497755930754e-05, + "loss": 0.8717, + "step": 338 + }, + { + "epoch": 0.21683857040057566, + "grad_norm": 2.7917325496673584, + "learning_rate": 4.648429151528105e-05, + "loss": 0.9408, + "step": 339 + }, + { + "epoch": 0.21747821220116734, + "grad_norm": 2.665494203567505, + "learning_rate": 4.647360547125454e-05, + "loss": 0.7739, + "step": 340 + }, + { + "epoch": 0.218117854001759, + "grad_norm": 3.186284065246582, + "learning_rate": 4.646291942722804e-05, + "loss": 0.8443, + "step": 341 + }, + { + "epoch": 0.21875749580235068, + "grad_norm": 2.6965200901031494, + "learning_rate": 4.645223338320155e-05, + "loss": 0.8259, + "step": 342 + }, + { + "epoch": 0.21939713760294235, + "grad_norm": 2.7016642093658447, + "learning_rate": 4.644154733917504e-05, + "loss": 0.831, + "step": 343 + }, + { + "epoch": 0.22003677940353403, + "grad_norm": 2.3008852005004883, + "learning_rate": 4.643086129514854e-05, + "loss": 0.6377, + "step": 344 + }, + { + "epoch": 0.2206764212041257, + "grad_norm": 2.7286880016326904, + "learning_rate": 4.642017525112204e-05, + "loss": 0.8643, + "step": 345 + }, + { + "epoch": 0.22131606300471734, + "grad_norm": 3.0022411346435547, + "learning_rate": 4.640948920709554e-05, + "loss": 0.941, + "step": 346 + }, + { + "epoch": 0.22195570480530902, + "grad_norm": 3.082437515258789, + "learning_rate": 4.639880316306903e-05, + "loss": 0.9995, + "step": 347 + }, + { + "epoch": 0.2225953466059007, + "grad_norm": 2.608405113220215, + "learning_rate": 4.6388117119042536e-05, + "loss": 0.8163, + "step": 348 + }, + { + "epoch": 0.22323498840649236, + "grad_norm": 2.5102648735046387, + "learning_rate": 4.637743107501603e-05, + "loss": 0.7536, + "step": 349 + }, + { + "epoch": 0.22387463020708404, + "grad_norm": 2.536804437637329, + "learning_rate": 4.636674503098953e-05, + "loss": 0.7645, + "step": 350 + }, + { + "epoch": 0.2245142720076757, + "grad_norm": 2.5854899883270264, + "learning_rate": 4.635605898696303e-05, + "loss": 0.8361, + "step": 351 + }, + { + "epoch": 0.22515391380826738, + "grad_norm": 2.534740924835205, + "learning_rate": 4.634537294293653e-05, + "loss": 0.8351, + "step": 352 + }, + { + "epoch": 0.22579355560885903, + "grad_norm": 2.5224814414978027, + "learning_rate": 4.6334686898910026e-05, + "loss": 0.7384, + "step": 353 + }, + { + "epoch": 0.2264331974094507, + "grad_norm": 2.448772668838501, + "learning_rate": 4.6324000854883525e-05, + "loss": 0.811, + "step": 354 + }, + { + "epoch": 
0.22707283921004237, + "grad_norm": 2.641885757446289, + "learning_rate": 4.6313314810857025e-05, + "loss": 0.7952, + "step": 355 + }, + { + "epoch": 0.22771248101063404, + "grad_norm": 2.855888605117798, + "learning_rate": 4.630262876683052e-05, + "loss": 0.8237, + "step": 356 + }, + { + "epoch": 0.22835212281122572, + "grad_norm": 2.7013163566589355, + "learning_rate": 4.6291942722804023e-05, + "loss": 0.8137, + "step": 357 + }, + { + "epoch": 0.2289917646118174, + "grad_norm": 2.7953360080718994, + "learning_rate": 4.6281256678777516e-05, + "loss": 0.8377, + "step": 358 + }, + { + "epoch": 0.22963140641240906, + "grad_norm": 2.8884384632110596, + "learning_rate": 4.6270570634751015e-05, + "loss": 0.8629, + "step": 359 + }, + { + "epoch": 0.2302710482130007, + "grad_norm": 2.9556541442871094, + "learning_rate": 4.6259884590724515e-05, + "loss": 0.9102, + "step": 360 + }, + { + "epoch": 0.23091069001359238, + "grad_norm": 2.8044021129608154, + "learning_rate": 4.6249198546698014e-05, + "loss": 0.8388, + "step": 361 + }, + { + "epoch": 0.23155033181418405, + "grad_norm": 2.432013511657715, + "learning_rate": 4.6238512502671514e-05, + "loss": 0.7672, + "step": 362 + }, + { + "epoch": 0.23218997361477572, + "grad_norm": 2.922989845275879, + "learning_rate": 4.622782645864501e-05, + "loss": 0.9464, + "step": 363 + }, + { + "epoch": 0.2328296154153674, + "grad_norm": 2.6194958686828613, + "learning_rate": 4.621714041461851e-05, + "loss": 0.8294, + "step": 364 + }, + { + "epoch": 0.23346925721595907, + "grad_norm": 2.463254451751709, + "learning_rate": 4.620645437059201e-05, + "loss": 0.7614, + "step": 365 + }, + { + "epoch": 0.23410889901655074, + "grad_norm": 2.737752914428711, + "learning_rate": 4.619576832656551e-05, + "loss": 0.9387, + "step": 366 + }, + { + "epoch": 0.2347485408171424, + "grad_norm": 2.485262870788574, + "learning_rate": 4.6185082282539004e-05, + "loss": 0.7469, + "step": 367 + }, + { + "epoch": 0.23538818261773406, + "grad_norm": 2.812809944152832, + "learning_rate": 4.617439623851251e-05, + "loss": 0.9078, + "step": 368 + }, + { + "epoch": 0.23602782441832573, + "grad_norm": 2.802497625350952, + "learning_rate": 4.6163710194486e-05, + "loss": 0.9106, + "step": 369 + }, + { + "epoch": 0.2366674662189174, + "grad_norm": 2.443861484527588, + "learning_rate": 4.61530241504595e-05, + "loss": 0.799, + "step": 370 + }, + { + "epoch": 0.23730710801950908, + "grad_norm": 2.6237952709198, + "learning_rate": 4.6142338106433e-05, + "loss": 0.8169, + "step": 371 + }, + { + "epoch": 0.23794674982010075, + "grad_norm": 2.2601051330566406, + "learning_rate": 4.61316520624065e-05, + "loss": 0.7139, + "step": 372 + }, + { + "epoch": 0.23858639162069242, + "grad_norm": 2.7410929203033447, + "learning_rate": 4.612096601837999e-05, + "loss": 0.9006, + "step": 373 + }, + { + "epoch": 0.23922603342128407, + "grad_norm": 2.777866840362549, + "learning_rate": 4.61102799743535e-05, + "loss": 0.8401, + "step": 374 + }, + { + "epoch": 0.23986567522187574, + "grad_norm": 3.059474229812622, + "learning_rate": 4.6099593930327e-05, + "loss": 0.8802, + "step": 375 + }, + { + "epoch": 0.2405053170224674, + "grad_norm": 2.2261245250701904, + "learning_rate": 4.608890788630049e-05, + "loss": 0.7075, + "step": 376 + }, + { + "epoch": 0.24114495882305909, + "grad_norm": 2.6875627040863037, + "learning_rate": 4.6078221842274e-05, + "loss": 0.8127, + "step": 377 + }, + { + "epoch": 0.24178460062365076, + "grad_norm": 3.2723631858825684, + "learning_rate": 4.606753579824749e-05, + "loss": 1.087, + "step": 378 
+ }, + { + "epoch": 0.24242424242424243, + "grad_norm": 2.5249931812286377, + "learning_rate": 4.605684975422099e-05, + "loss": 0.7964, + "step": 379 + }, + { + "epoch": 0.2430638842248341, + "grad_norm": 2.862531900405884, + "learning_rate": 4.604616371019449e-05, + "loss": 0.9279, + "step": 380 + }, + { + "epoch": 0.24370352602542575, + "grad_norm": 2.866821527481079, + "learning_rate": 4.603547766616799e-05, + "loss": 0.8377, + "step": 381 + }, + { + "epoch": 0.24434316782601742, + "grad_norm": 2.5239150524139404, + "learning_rate": 4.602479162214148e-05, + "loss": 0.8168, + "step": 382 + }, + { + "epoch": 0.2449828096266091, + "grad_norm": 2.564471960067749, + "learning_rate": 4.6014105578114987e-05, + "loss": 0.835, + "step": 383 + }, + { + "epoch": 0.24562245142720077, + "grad_norm": 2.5914828777313232, + "learning_rate": 4.600341953408848e-05, + "loss": 0.8291, + "step": 384 + }, + { + "epoch": 0.24626209322779244, + "grad_norm": 2.713080644607544, + "learning_rate": 4.599273349006198e-05, + "loss": 0.8057, + "step": 385 + }, + { + "epoch": 0.2469017350283841, + "grad_norm": 2.5737624168395996, + "learning_rate": 4.598204744603548e-05, + "loss": 0.8268, + "step": 386 + }, + { + "epoch": 0.24754137682897578, + "grad_norm": 2.7671921253204346, + "learning_rate": 4.597136140200898e-05, + "loss": 0.9292, + "step": 387 + }, + { + "epoch": 0.24818101862956743, + "grad_norm": 2.4558112621307373, + "learning_rate": 4.5960675357982477e-05, + "loss": 0.7355, + "step": 388 + }, + { + "epoch": 0.2488206604301591, + "grad_norm": 2.8342125415802, + "learning_rate": 4.5949989313955976e-05, + "loss": 0.9629, + "step": 389 + }, + { + "epoch": 0.24946030223075077, + "grad_norm": 2.3475425243377686, + "learning_rate": 4.5939303269929475e-05, + "loss": 0.7883, + "step": 390 + }, + { + "epoch": 0.25009994403134245, + "grad_norm": 2.593355655670166, + "learning_rate": 4.5928617225902975e-05, + "loss": 0.7518, + "step": 391 + }, + { + "epoch": 0.2507395858319341, + "grad_norm": 2.8946495056152344, + "learning_rate": 4.5917931181876474e-05, + "loss": 0.9147, + "step": 392 + }, + { + "epoch": 0.2513792276325258, + "grad_norm": 2.570875406265259, + "learning_rate": 4.590724513784997e-05, + "loss": 0.7989, + "step": 393 + }, + { + "epoch": 0.25201886943311747, + "grad_norm": 2.8372998237609863, + "learning_rate": 4.589655909382347e-05, + "loss": 0.7927, + "step": 394 + }, + { + "epoch": 0.25265851123370914, + "grad_norm": 2.7271525859832764, + "learning_rate": 4.5885873049796965e-05, + "loss": 0.872, + "step": 395 + }, + { + "epoch": 0.2532981530343008, + "grad_norm": 2.768200397491455, + "learning_rate": 4.5875187005770465e-05, + "loss": 0.7903, + "step": 396 + }, + { + "epoch": 0.2539377948348925, + "grad_norm": 2.891479253768921, + "learning_rate": 4.5864500961743964e-05, + "loss": 0.8306, + "step": 397 + }, + { + "epoch": 0.25457743663548416, + "grad_norm": 2.669311761856079, + "learning_rate": 4.5853814917717463e-05, + "loss": 0.8316, + "step": 398 + }, + { + "epoch": 0.2552170784360758, + "grad_norm": 2.4247336387634277, + "learning_rate": 4.584312887369096e-05, + "loss": 0.7606, + "step": 399 + }, + { + "epoch": 0.25585672023666745, + "grad_norm": 2.8361196517944336, + "learning_rate": 4.583244282966446e-05, + "loss": 0.9152, + "step": 400 + }, + { + "epoch": 0.2564963620372591, + "grad_norm": 2.529350519180298, + "learning_rate": 4.582175678563796e-05, + "loss": 0.7546, + "step": 401 + }, + { + "epoch": 0.2571360038378508, + "grad_norm": 2.8802082538604736, + "learning_rate": 4.5811070741611454e-05, + 
"loss": 0.9035, + "step": 402 + }, + { + "epoch": 0.25777564563844246, + "grad_norm": 2.907050132751465, + "learning_rate": 4.580038469758496e-05, + "loss": 0.9008, + "step": 403 + }, + { + "epoch": 0.25841528743903414, + "grad_norm": 2.8581337928771973, + "learning_rate": 4.578969865355845e-05, + "loss": 0.8454, + "step": 404 + }, + { + "epoch": 0.2590549292396258, + "grad_norm": 2.6416478157043457, + "learning_rate": 4.577901260953195e-05, + "loss": 0.8683, + "step": 405 + }, + { + "epoch": 0.2596945710402175, + "grad_norm": 2.726173162460327, + "learning_rate": 4.576832656550545e-05, + "loss": 0.7448, + "step": 406 + }, + { + "epoch": 0.26033421284080915, + "grad_norm": 2.6585590839385986, + "learning_rate": 4.575764052147895e-05, + "loss": 0.7702, + "step": 407 + }, + { + "epoch": 0.2609738546414008, + "grad_norm": 2.6481173038482666, + "learning_rate": 4.574695447745245e-05, + "loss": 0.8915, + "step": 408 + }, + { + "epoch": 0.2616134964419925, + "grad_norm": 2.8057031631469727, + "learning_rate": 4.573626843342595e-05, + "loss": 0.7919, + "step": 409 + }, + { + "epoch": 0.26225313824258417, + "grad_norm": 2.424503803253174, + "learning_rate": 4.572558238939945e-05, + "loss": 0.7658, + "step": 410 + }, + { + "epoch": 0.26289278004317584, + "grad_norm": 2.7346384525299072, + "learning_rate": 4.571489634537294e-05, + "loss": 0.9483, + "step": 411 + }, + { + "epoch": 0.2635324218437675, + "grad_norm": 2.6265981197357178, + "learning_rate": 4.570421030134645e-05, + "loss": 0.8086, + "step": 412 + }, + { + "epoch": 0.26417206364435913, + "grad_norm": 2.5911974906921387, + "learning_rate": 4.569352425731994e-05, + "loss": 0.8003, + "step": 413 + }, + { + "epoch": 0.2648117054449508, + "grad_norm": 2.497795343399048, + "learning_rate": 4.568283821329344e-05, + "loss": 0.8405, + "step": 414 + }, + { + "epoch": 0.2654513472455425, + "grad_norm": 2.5634653568267822, + "learning_rate": 4.567215216926694e-05, + "loss": 0.7734, + "step": 415 + }, + { + "epoch": 0.26609098904613415, + "grad_norm": 2.757187843322754, + "learning_rate": 4.566146612524044e-05, + "loss": 0.9181, + "step": 416 + }, + { + "epoch": 0.2667306308467258, + "grad_norm": 2.64090633392334, + "learning_rate": 4.565078008121393e-05, + "loss": 0.8195, + "step": 417 + }, + { + "epoch": 0.2673702726473175, + "grad_norm": 2.511357069015503, + "learning_rate": 4.564009403718744e-05, + "loss": 0.7861, + "step": 418 + }, + { + "epoch": 0.26800991444790917, + "grad_norm": 2.4804275035858154, + "learning_rate": 4.562940799316093e-05, + "loss": 0.789, + "step": 419 + }, + { + "epoch": 0.26864955624850084, + "grad_norm": 2.790694236755371, + "learning_rate": 4.5618721949134436e-05, + "loss": 0.6871, + "step": 420 + }, + { + "epoch": 0.2692891980490925, + "grad_norm": 2.5106687545776367, + "learning_rate": 4.5608035905107935e-05, + "loss": 0.758, + "step": 421 + }, + { + "epoch": 0.2699288398496842, + "grad_norm": 2.534241199493408, + "learning_rate": 4.559734986108143e-05, + "loss": 0.8383, + "step": 422 + }, + { + "epoch": 0.27056848165027586, + "grad_norm": 2.6573526859283447, + "learning_rate": 4.5586663817054934e-05, + "loss": 0.7304, + "step": 423 + }, + { + "epoch": 0.27120812345086753, + "grad_norm": 2.482849359512329, + "learning_rate": 4.5575977773028427e-05, + "loss": 0.75, + "step": 424 + }, + { + "epoch": 0.2718477652514592, + "grad_norm": 2.5034193992614746, + "learning_rate": 4.5565291729001926e-05, + "loss": 0.7602, + "step": 425 + }, + { + "epoch": 0.2724874070520509, + "grad_norm": 2.8357973098754883, + "learning_rate": 
4.5554605684975425e-05, + "loss": 0.8598, + "step": 426 + }, + { + "epoch": 0.2731270488526425, + "grad_norm": 2.5167903900146484, + "learning_rate": 4.5543919640948925e-05, + "loss": 0.8883, + "step": 427 + }, + { + "epoch": 0.27376669065323417, + "grad_norm": 2.647885322570801, + "learning_rate": 4.553323359692242e-05, + "loss": 0.8711, + "step": 428 + }, + { + "epoch": 0.27440633245382584, + "grad_norm": 2.8058571815490723, + "learning_rate": 4.552254755289592e-05, + "loss": 0.9287, + "step": 429 + }, + { + "epoch": 0.2750459742544175, + "grad_norm": 2.494300603866577, + "learning_rate": 4.5511861508869416e-05, + "loss": 0.7829, + "step": 430 + }, + { + "epoch": 0.2756856160550092, + "grad_norm": 2.3351948261260986, + "learning_rate": 4.5501175464842915e-05, + "loss": 0.7023, + "step": 431 + }, + { + "epoch": 0.27632525785560086, + "grad_norm": 2.4500770568847656, + "learning_rate": 4.5490489420816415e-05, + "loss": 0.7681, + "step": 432 + }, + { + "epoch": 0.27696489965619253, + "grad_norm": 2.6197237968444824, + "learning_rate": 4.5479803376789914e-05, + "loss": 0.862, + "step": 433 + }, + { + "epoch": 0.2776045414567842, + "grad_norm": 2.4630415439605713, + "learning_rate": 4.546911733276341e-05, + "loss": 0.714, + "step": 434 + }, + { + "epoch": 0.2782441832573759, + "grad_norm": 2.408474922180176, + "learning_rate": 4.545843128873691e-05, + "loss": 0.7727, + "step": 435 + }, + { + "epoch": 0.27888382505796755, + "grad_norm": 2.6363437175750732, + "learning_rate": 4.544774524471041e-05, + "loss": 0.7386, + "step": 436 + }, + { + "epoch": 0.2795234668585592, + "grad_norm": 2.4919941425323486, + "learning_rate": 4.5437059200683905e-05, + "loss": 0.7964, + "step": 437 + }, + { + "epoch": 0.2801631086591509, + "grad_norm": 2.2515480518341064, + "learning_rate": 4.542637315665741e-05, + "loss": 0.7267, + "step": 438 + }, + { + "epoch": 0.28080275045974257, + "grad_norm": 2.4243528842926025, + "learning_rate": 4.5415687112630903e-05, + "loss": 0.7142, + "step": 439 + }, + { + "epoch": 0.28144239226033424, + "grad_norm": 2.6440441608428955, + "learning_rate": 4.54050010686044e-05, + "loss": 0.7376, + "step": 440 + }, + { + "epoch": 0.28208203406092586, + "grad_norm": 2.3073890209198, + "learning_rate": 4.53943150245779e-05, + "loss": 0.7706, + "step": 441 + }, + { + "epoch": 0.28272167586151753, + "grad_norm": 2.70876407623291, + "learning_rate": 4.53836289805514e-05, + "loss": 0.8541, + "step": 442 + }, + { + "epoch": 0.2833613176621092, + "grad_norm": 2.3511955738067627, + "learning_rate": 4.53729429365249e-05, + "loss": 0.7319, + "step": 443 + }, + { + "epoch": 0.2840009594627009, + "grad_norm": 2.6385610103607178, + "learning_rate": 4.53622568924984e-05, + "loss": 0.7293, + "step": 444 + }, + { + "epoch": 0.28464060126329255, + "grad_norm": 2.6569063663482666, + "learning_rate": 4.53515708484719e-05, + "loss": 0.8382, + "step": 445 + }, + { + "epoch": 0.2852802430638842, + "grad_norm": 2.4706740379333496, + "learning_rate": 4.534088480444539e-05, + "loss": 0.726, + "step": 446 + }, + { + "epoch": 0.2859198848644759, + "grad_norm": 2.933239459991455, + "learning_rate": 4.53301987604189e-05, + "loss": 0.9803, + "step": 447 + }, + { + "epoch": 0.28655952666506757, + "grad_norm": 2.9820454120635986, + "learning_rate": 4.531951271639239e-05, + "loss": 1.0244, + "step": 448 + }, + { + "epoch": 0.28719916846565924, + "grad_norm": 2.8230414390563965, + "learning_rate": 4.53088266723659e-05, + "loss": 0.837, + "step": 449 + }, + { + "epoch": 0.2878388102662509, + "grad_norm": 
2.7445719242095947, + "learning_rate": 4.529814062833939e-05, + "loss": 0.9338, + "step": 450 + }, + { + "epoch": 0.2884784520668426, + "grad_norm": 2.9792659282684326, + "learning_rate": 4.528745458431289e-05, + "loss": 0.9416, + "step": 451 + }, + { + "epoch": 0.28911809386743426, + "grad_norm": 2.9139254093170166, + "learning_rate": 4.527676854028639e-05, + "loss": 0.8946, + "step": 452 + }, + { + "epoch": 0.28975773566802593, + "grad_norm": 2.4100377559661865, + "learning_rate": 4.526608249625989e-05, + "loss": 0.7475, + "step": 453 + }, + { + "epoch": 0.2903973774686176, + "grad_norm": 2.572016954421997, + "learning_rate": 4.525539645223339e-05, + "loss": 0.7986, + "step": 454 + }, + { + "epoch": 0.2910370192692092, + "grad_norm": 2.9937071800231934, + "learning_rate": 4.5244710408206886e-05, + "loss": 0.9822, + "step": 455 + }, + { + "epoch": 0.2916766610698009, + "grad_norm": 2.890455722808838, + "learning_rate": 4.5234024364180386e-05, + "loss": 0.9258, + "step": 456 + }, + { + "epoch": 0.29231630287039256, + "grad_norm": 2.4596166610717773, + "learning_rate": 4.522333832015388e-05, + "loss": 0.75, + "step": 457 + }, + { + "epoch": 0.29295594467098424, + "grad_norm": 2.647298812866211, + "learning_rate": 4.5212652276127385e-05, + "loss": 0.8474, + "step": 458 + }, + { + "epoch": 0.2935955864715759, + "grad_norm": 2.4020044803619385, + "learning_rate": 4.520196623210088e-05, + "loss": 0.7695, + "step": 459 + }, + { + "epoch": 0.2942352282721676, + "grad_norm": 2.337449312210083, + "learning_rate": 4.5191280188074376e-05, + "loss": 0.8351, + "step": 460 + }, + { + "epoch": 0.29487487007275925, + "grad_norm": 2.482130527496338, + "learning_rate": 4.5180594144047876e-05, + "loss": 0.859, + "step": 461 + }, + { + "epoch": 0.2955145118733509, + "grad_norm": 2.4470834732055664, + "learning_rate": 4.5169908100021375e-05, + "loss": 0.7388, + "step": 462 + }, + { + "epoch": 0.2961541536739426, + "grad_norm": 2.231738805770874, + "learning_rate": 4.515922205599487e-05, + "loss": 0.6551, + "step": 463 + }, + { + "epoch": 0.29679379547453427, + "grad_norm": 2.5150177478790283, + "learning_rate": 4.5148536011968374e-05, + "loss": 0.8901, + "step": 464 + }, + { + "epoch": 0.29743343727512594, + "grad_norm": 2.5069150924682617, + "learning_rate": 4.5137849967941866e-05, + "loss": 0.7837, + "step": 465 + }, + { + "epoch": 0.2980730790757176, + "grad_norm": 2.964963674545288, + "learning_rate": 4.5127163923915366e-05, + "loss": 1.016, + "step": 466 + }, + { + "epoch": 0.2987127208763093, + "grad_norm": 3.0640437602996826, + "learning_rate": 4.511647787988887e-05, + "loss": 0.96, + "step": 467 + }, + { + "epoch": 0.29935236267690096, + "grad_norm": 2.9575908184051514, + "learning_rate": 4.5105791835862365e-05, + "loss": 0.8377, + "step": 468 + }, + { + "epoch": 0.2999920044774926, + "grad_norm": 2.550337076187134, + "learning_rate": 4.5095105791835864e-05, + "loss": 0.8829, + "step": 469 + }, + { + "epoch": 0.30063164627808425, + "grad_norm": 2.4377126693725586, + "learning_rate": 4.508441974780936e-05, + "loss": 0.7808, + "step": 470 + }, + { + "epoch": 0.3012712880786759, + "grad_norm": 3.052569627761841, + "learning_rate": 4.507373370378286e-05, + "loss": 0.9143, + "step": 471 + }, + { + "epoch": 0.3019109298792676, + "grad_norm": 2.787297487258911, + "learning_rate": 4.5063047659756355e-05, + "loss": 0.8985, + "step": 472 + }, + { + "epoch": 0.30255057167985927, + "grad_norm": 2.5775625705718994, + "learning_rate": 4.505236161572986e-05, + "loss": 0.8606, + "step": 473 + }, + { + "epoch": 
0.30319021348045094, + "grad_norm": 2.481354236602783, + "learning_rate": 4.5041675571703354e-05, + "loss": 0.8308, + "step": 474 + }, + { + "epoch": 0.3038298552810426, + "grad_norm": 2.566333532333374, + "learning_rate": 4.503098952767685e-05, + "loss": 0.7915, + "step": 475 + }, + { + "epoch": 0.3044694970816343, + "grad_norm": 2.772073984146118, + "learning_rate": 4.502030348365035e-05, + "loss": 0.9494, + "step": 476 + }, + { + "epoch": 0.30510913888222596, + "grad_norm": 2.6241090297698975, + "learning_rate": 4.500961743962385e-05, + "loss": 0.8687, + "step": 477 + }, + { + "epoch": 0.30574878068281763, + "grad_norm": 2.497037649154663, + "learning_rate": 4.499893139559735e-05, + "loss": 0.8067, + "step": 478 + }, + { + "epoch": 0.3063884224834093, + "grad_norm": 2.2416837215423584, + "learning_rate": 4.498824535157085e-05, + "loss": 0.7778, + "step": 479 + }, + { + "epoch": 0.307028064284001, + "grad_norm": 2.4029901027679443, + "learning_rate": 4.497755930754435e-05, + "loss": 0.7819, + "step": 480 + }, + { + "epoch": 0.30766770608459265, + "grad_norm": 2.4983389377593994, + "learning_rate": 4.496687326351785e-05, + "loss": 0.8115, + "step": 481 + }, + { + "epoch": 0.3083073478851843, + "grad_norm": 2.6600162982940674, + "learning_rate": 4.495618721949135e-05, + "loss": 0.8323, + "step": 482 + }, + { + "epoch": 0.30894698968577594, + "grad_norm": 2.304379940032959, + "learning_rate": 4.494550117546484e-05, + "loss": 0.7997, + "step": 483 + }, + { + "epoch": 0.3095866314863676, + "grad_norm": 2.4892849922180176, + "learning_rate": 4.493481513143835e-05, + "loss": 0.6917, + "step": 484 + }, + { + "epoch": 0.3102262732869593, + "grad_norm": 2.624267578125, + "learning_rate": 4.492412908741184e-05, + "loss": 0.8025, + "step": 485 + }, + { + "epoch": 0.31086591508755096, + "grad_norm": 2.465531349182129, + "learning_rate": 4.491344304338534e-05, + "loss": 0.809, + "step": 486 + }, + { + "epoch": 0.31150555688814263, + "grad_norm": 2.4929726123809814, + "learning_rate": 4.490275699935884e-05, + "loss": 0.7541, + "step": 487 + }, + { + "epoch": 0.3121451986887343, + "grad_norm": 2.432112216949463, + "learning_rate": 4.489207095533234e-05, + "loss": 0.7769, + "step": 488 + }, + { + "epoch": 0.312784840489326, + "grad_norm": 2.6986589431762695, + "learning_rate": 4.488138491130584e-05, + "loss": 0.8269, + "step": 489 + }, + { + "epoch": 0.31342448228991765, + "grad_norm": 2.663753032684326, + "learning_rate": 4.487069886727934e-05, + "loss": 0.6985, + "step": 490 + }, + { + "epoch": 0.3140641240905093, + "grad_norm": 2.7952847480773926, + "learning_rate": 4.4860012823252836e-05, + "loss": 0.8281, + "step": 491 + }, + { + "epoch": 0.314703765891101, + "grad_norm": 2.578540086746216, + "learning_rate": 4.484932677922633e-05, + "loss": 0.8079, + "step": 492 + }, + { + "epoch": 0.31534340769169267, + "grad_norm": 2.872422456741333, + "learning_rate": 4.4838640735199835e-05, + "loss": 0.8898, + "step": 493 + }, + { + "epoch": 0.31598304949228434, + "grad_norm": 2.4779903888702393, + "learning_rate": 4.482795469117333e-05, + "loss": 0.8286, + "step": 494 + }, + { + "epoch": 0.316622691292876, + "grad_norm": 2.6066203117370605, + "learning_rate": 4.481726864714683e-05, + "loss": 0.7893, + "step": 495 + }, + { + "epoch": 0.3172623330934677, + "grad_norm": 2.8848042488098145, + "learning_rate": 4.4806582603120326e-05, + "loss": 0.8778, + "step": 496 + }, + { + "epoch": 0.3179019748940593, + "grad_norm": 2.7134692668914795, + "learning_rate": 4.4795896559093826e-05, + "loss": 0.8456, + "step": 497 + 
}, + { + "epoch": 0.318541616694651, + "grad_norm": 2.522134780883789, + "learning_rate": 4.4785210515067325e-05, + "loss": 0.7932, + "step": 498 + }, + { + "epoch": 0.31918125849524265, + "grad_norm": 2.80462908744812, + "learning_rate": 4.4774524471040824e-05, + "loss": 0.904, + "step": 499 + }, + { + "epoch": 0.3198209002958343, + "grad_norm": 2.497408151626587, + "learning_rate": 4.4763838427014324e-05, + "loss": 0.7034, + "step": 500 + }, + { + "epoch": 0.320460542096426, + "grad_norm": 2.8543717861175537, + "learning_rate": 4.4753152382987816e-05, + "loss": 0.952, + "step": 501 + }, + { + "epoch": 0.32110018389701767, + "grad_norm": 2.526951789855957, + "learning_rate": 4.474246633896132e-05, + "loss": 0.707, + "step": 502 + }, + { + "epoch": 0.32173982569760934, + "grad_norm": 2.8484466075897217, + "learning_rate": 4.4731780294934815e-05, + "loss": 0.7867, + "step": 503 + }, + { + "epoch": 0.322379467498201, + "grad_norm": 2.264899253845215, + "learning_rate": 4.472109425090832e-05, + "loss": 0.7351, + "step": 504 + }, + { + "epoch": 0.3230191092987927, + "grad_norm": 2.4186136722564697, + "learning_rate": 4.4710408206881814e-05, + "loss": 0.8134, + "step": 505 + }, + { + "epoch": 0.32365875109938436, + "grad_norm": 2.5458221435546875, + "learning_rate": 4.469972216285531e-05, + "loss": 0.8059, + "step": 506 + }, + { + "epoch": 0.32429839289997603, + "grad_norm": 2.675720691680908, + "learning_rate": 4.468903611882881e-05, + "loss": 0.9216, + "step": 507 + }, + { + "epoch": 0.3249380347005677, + "grad_norm": 2.3849453926086426, + "learning_rate": 4.467835007480231e-05, + "loss": 0.7255, + "step": 508 + }, + { + "epoch": 0.3255776765011594, + "grad_norm": 2.522523880004883, + "learning_rate": 4.4667664030775805e-05, + "loss": 0.764, + "step": 509 + }, + { + "epoch": 0.32621731830175105, + "grad_norm": 2.300499677658081, + "learning_rate": 4.465697798674931e-05, + "loss": 0.6444, + "step": 510 + }, + { + "epoch": 0.32685696010234266, + "grad_norm": 2.8246982097625732, + "learning_rate": 4.46462919427228e-05, + "loss": 0.8461, + "step": 511 + }, + { + "epoch": 0.32749660190293434, + "grad_norm": 2.3655967712402344, + "learning_rate": 4.46356058986963e-05, + "loss": 0.7246, + "step": 512 + }, + { + "epoch": 0.328136243703526, + "grad_norm": 2.973665952682495, + "learning_rate": 4.462491985466981e-05, + "loss": 0.8732, + "step": 513 + }, + { + "epoch": 0.3287758855041177, + "grad_norm": 2.558032751083374, + "learning_rate": 4.46142338106433e-05, + "loss": 0.7513, + "step": 514 + }, + { + "epoch": 0.32941552730470935, + "grad_norm": 2.8842430114746094, + "learning_rate": 4.46035477666168e-05, + "loss": 0.7467, + "step": 515 + }, + { + "epoch": 0.330055169105301, + "grad_norm": 2.5235350131988525, + "learning_rate": 4.45928617225903e-05, + "loss": 0.7728, + "step": 516 + }, + { + "epoch": 0.3306948109058927, + "grad_norm": 2.788074493408203, + "learning_rate": 4.45821756785638e-05, + "loss": 0.8571, + "step": 517 + }, + { + "epoch": 0.3313344527064844, + "grad_norm": 2.650552749633789, + "learning_rate": 4.457148963453729e-05, + "loss": 0.8053, + "step": 518 + }, + { + "epoch": 0.33197409450707605, + "grad_norm": 2.4916484355926514, + "learning_rate": 4.45608035905108e-05, + "loss": 0.7712, + "step": 519 + }, + { + "epoch": 0.3326137363076677, + "grad_norm": 2.484419822692871, + "learning_rate": 4.455011754648429e-05, + "loss": 0.797, + "step": 520 + }, + { + "epoch": 0.3332533781082594, + "grad_norm": 2.747328996658325, + "learning_rate": 4.453943150245779e-05, + "loss": 0.8449, + "step": 
521 + }, + { + "epoch": 0.33389301990885106, + "grad_norm": 2.469801187515259, + "learning_rate": 4.452874545843129e-05, + "loss": 0.8487, + "step": 522 + }, + { + "epoch": 0.33453266170944274, + "grad_norm": 2.670841932296753, + "learning_rate": 4.451805941440479e-05, + "loss": 0.8178, + "step": 523 + }, + { + "epoch": 0.3351723035100344, + "grad_norm": 2.2457292079925537, + "learning_rate": 4.450737337037829e-05, + "loss": 0.7296, + "step": 524 + }, + { + "epoch": 0.335811945310626, + "grad_norm": 2.5042967796325684, + "learning_rate": 4.449668732635179e-05, + "loss": 0.8448, + "step": 525 + }, + { + "epoch": 0.3364515871112177, + "grad_norm": 2.4827215671539307, + "learning_rate": 4.448600128232529e-05, + "loss": 0.8862, + "step": 526 + }, + { + "epoch": 0.33709122891180937, + "grad_norm": 2.4088759422302246, + "learning_rate": 4.447531523829878e-05, + "loss": 0.7664, + "step": 527 + }, + { + "epoch": 0.33773087071240104, + "grad_norm": 2.49212646484375, + "learning_rate": 4.4464629194272286e-05, + "loss": 0.8548, + "step": 528 + }, + { + "epoch": 0.3383705125129927, + "grad_norm": 2.1127588748931885, + "learning_rate": 4.445394315024578e-05, + "loss": 0.7091, + "step": 529 + }, + { + "epoch": 0.3390101543135844, + "grad_norm": 2.767030715942383, + "learning_rate": 4.444325710621928e-05, + "loss": 0.9049, + "step": 530 + }, + { + "epoch": 0.33964979611417606, + "grad_norm": 2.315673828125, + "learning_rate": 4.443257106219278e-05, + "loss": 0.7995, + "step": 531 + }, + { + "epoch": 0.34028943791476773, + "grad_norm": 2.6289587020874023, + "learning_rate": 4.4421885018166276e-05, + "loss": 0.8373, + "step": 532 + }, + { + "epoch": 0.3409290797153594, + "grad_norm": 2.2535407543182373, + "learning_rate": 4.4411198974139776e-05, + "loss": 0.7047, + "step": 533 + }, + { + "epoch": 0.3415687215159511, + "grad_norm": 2.6374125480651855, + "learning_rate": 4.4400512930113275e-05, + "loss": 0.7812, + "step": 534 + }, + { + "epoch": 0.34220836331654275, + "grad_norm": 2.6607789993286133, + "learning_rate": 4.4389826886086774e-05, + "loss": 0.761, + "step": 535 + }, + { + "epoch": 0.3428480051171344, + "grad_norm": 2.724668264389038, + "learning_rate": 4.4379140842060274e-05, + "loss": 0.7987, + "step": 536 + }, + { + "epoch": 0.3434876469177261, + "grad_norm": 2.52717924118042, + "learning_rate": 4.436845479803377e-05, + "loss": 0.7947, + "step": 537 + }, + { + "epoch": 0.34412728871831777, + "grad_norm": 2.9766016006469727, + "learning_rate": 4.4357768754007266e-05, + "loss": 0.8849, + "step": 538 + }, + { + "epoch": 0.3447669305189094, + "grad_norm": 2.571988582611084, + "learning_rate": 4.434708270998077e-05, + "loss": 0.7992, + "step": 539 + }, + { + "epoch": 0.34540657231950106, + "grad_norm": 2.6817004680633545, + "learning_rate": 4.4336396665954264e-05, + "loss": 0.7466, + "step": 540 + }, + { + "epoch": 0.34604621412009273, + "grad_norm": 2.3611767292022705, + "learning_rate": 4.4325710621927764e-05, + "loss": 0.7875, + "step": 541 + }, + { + "epoch": 0.3466858559206844, + "grad_norm": 2.5309157371520996, + "learning_rate": 4.431502457790126e-05, + "loss": 0.8233, + "step": 542 + }, + { + "epoch": 0.3473254977212761, + "grad_norm": 2.8166468143463135, + "learning_rate": 4.430433853387476e-05, + "loss": 0.8753, + "step": 543 + }, + { + "epoch": 0.34796513952186775, + "grad_norm": 2.665196180343628, + "learning_rate": 4.429365248984826e-05, + "loss": 0.8156, + "step": 544 + }, + { + "epoch": 0.3486047813224594, + "grad_norm": 2.1333603858947754, + "learning_rate": 4.428296644582176e-05, + 
"loss": 0.695, + "step": 545 + }, + { + "epoch": 0.3492444231230511, + "grad_norm": 2.1260366439819336, + "learning_rate": 4.427228040179526e-05, + "loss": 0.7013, + "step": 546 + }, + { + "epoch": 0.34988406492364277, + "grad_norm": 2.301292657852173, + "learning_rate": 4.426159435776875e-05, + "loss": 0.7797, + "step": 547 + }, + { + "epoch": 0.35052370672423444, + "grad_norm": 2.5352783203125, + "learning_rate": 4.425090831374226e-05, + "loss": 0.8562, + "step": 548 + }, + { + "epoch": 0.3511633485248261, + "grad_norm": 2.4450230598449707, + "learning_rate": 4.424022226971575e-05, + "loss": 0.8164, + "step": 549 + }, + { + "epoch": 0.3518029903254178, + "grad_norm": 2.4418418407440186, + "learning_rate": 4.422953622568925e-05, + "loss": 0.6877, + "step": 550 + }, + { + "epoch": 0.35244263212600946, + "grad_norm": 2.504390001296997, + "learning_rate": 4.421885018166275e-05, + "loss": 0.7671, + "step": 551 + }, + { + "epoch": 0.3530822739266011, + "grad_norm": 2.4340758323669434, + "learning_rate": 4.420816413763625e-05, + "loss": 0.8134, + "step": 552 + }, + { + "epoch": 0.35372191572719275, + "grad_norm": 2.650157928466797, + "learning_rate": 4.419747809360974e-05, + "loss": 0.9067, + "step": 553 + }, + { + "epoch": 0.3543615575277844, + "grad_norm": 2.4354360103607178, + "learning_rate": 4.418679204958325e-05, + "loss": 0.8253, + "step": 554 + }, + { + "epoch": 0.3550011993283761, + "grad_norm": 2.449326992034912, + "learning_rate": 4.417610600555674e-05, + "loss": 0.7755, + "step": 555 + }, + { + "epoch": 0.35564084112896777, + "grad_norm": 2.734926700592041, + "learning_rate": 4.416541996153024e-05, + "loss": 0.838, + "step": 556 + }, + { + "epoch": 0.35628048292955944, + "grad_norm": 2.658278703689575, + "learning_rate": 4.415473391750374e-05, + "loss": 0.8729, + "step": 557 + }, + { + "epoch": 0.3569201247301511, + "grad_norm": 2.790825366973877, + "learning_rate": 4.414404787347724e-05, + "loss": 0.8542, + "step": 558 + }, + { + "epoch": 0.3575597665307428, + "grad_norm": 2.4853923320770264, + "learning_rate": 4.413336182945074e-05, + "loss": 0.7759, + "step": 559 + }, + { + "epoch": 0.35819940833133446, + "grad_norm": 2.8261659145355225, + "learning_rate": 4.412267578542424e-05, + "loss": 0.9195, + "step": 560 + }, + { + "epoch": 0.35883905013192613, + "grad_norm": 2.0461819171905518, + "learning_rate": 4.411198974139774e-05, + "loss": 0.6423, + "step": 561 + }, + { + "epoch": 0.3594786919325178, + "grad_norm": 2.6489789485931396, + "learning_rate": 4.410130369737124e-05, + "loss": 0.8877, + "step": 562 + }, + { + "epoch": 0.3601183337331095, + "grad_norm": 2.5838587284088135, + "learning_rate": 4.4090617653344736e-05, + "loss": 0.7386, + "step": 563 + }, + { + "epoch": 0.36075797553370115, + "grad_norm": 2.292025566101074, + "learning_rate": 4.407993160931823e-05, + "loss": 0.767, + "step": 564 + }, + { + "epoch": 0.3613976173342928, + "grad_norm": 2.5135250091552734, + "learning_rate": 4.4069245565291735e-05, + "loss": 0.7544, + "step": 565 + }, + { + "epoch": 0.36203725913488444, + "grad_norm": 2.694716691970825, + "learning_rate": 4.405855952126523e-05, + "loss": 0.9038, + "step": 566 + }, + { + "epoch": 0.3626769009354761, + "grad_norm": 2.45570969581604, + "learning_rate": 4.404787347723873e-05, + "loss": 0.7544, + "step": 567 + }, + { + "epoch": 0.3633165427360678, + "grad_norm": 2.5258474349975586, + "learning_rate": 4.4037187433212226e-05, + "loss": 0.8299, + "step": 568 + }, + { + "epoch": 0.36395618453665946, + "grad_norm": 2.649857759475708, + "learning_rate": 
4.4026501389185726e-05, + "loss": 0.7911, + "step": 569 + }, + { + "epoch": 0.3645958263372511, + "grad_norm": 2.8530423641204834, + "learning_rate": 4.4015815345159225e-05, + "loss": 0.8102, + "step": 570 + }, + { + "epoch": 0.3652354681378428, + "grad_norm": 2.6826438903808594, + "learning_rate": 4.4005129301132724e-05, + "loss": 0.8136, + "step": 571 + }, + { + "epoch": 0.3658751099384345, + "grad_norm": 2.5145070552825928, + "learning_rate": 4.3994443257106224e-05, + "loss": 0.7439, + "step": 572 + }, + { + "epoch": 0.36651475173902615, + "grad_norm": 2.6360299587249756, + "learning_rate": 4.3983757213079716e-05, + "loss": 0.6884, + "step": 573 + }, + { + "epoch": 0.3671543935396178, + "grad_norm": 2.543938636779785, + "learning_rate": 4.397307116905322e-05, + "loss": 0.7776, + "step": 574 + }, + { + "epoch": 0.3677940353402095, + "grad_norm": 2.270566940307617, + "learning_rate": 4.3962385125026715e-05, + "loss": 0.6941, + "step": 575 + }, + { + "epoch": 0.36843367714080116, + "grad_norm": 2.6573848724365234, + "learning_rate": 4.3951699081000214e-05, + "loss": 0.8608, + "step": 576 + }, + { + "epoch": 0.36907331894139284, + "grad_norm": 2.537231683731079, + "learning_rate": 4.3941013036973714e-05, + "loss": 0.8478, + "step": 577 + }, + { + "epoch": 0.3697129607419845, + "grad_norm": 2.367879867553711, + "learning_rate": 4.393032699294721e-05, + "loss": 0.7484, + "step": 578 + }, + { + "epoch": 0.3703526025425762, + "grad_norm": 3.0089163780212402, + "learning_rate": 4.391964094892071e-05, + "loss": 0.9008, + "step": 579 + }, + { + "epoch": 0.3709922443431678, + "grad_norm": 2.5588526725769043, + "learning_rate": 4.390895490489421e-05, + "loss": 0.836, + "step": 580 + }, + { + "epoch": 0.37163188614375947, + "grad_norm": 2.2014873027801514, + "learning_rate": 4.389826886086771e-05, + "loss": 0.7329, + "step": 581 + }, + { + "epoch": 0.37227152794435114, + "grad_norm": 2.430637836456299, + "learning_rate": 4.3887582816841204e-05, + "loss": 0.7465, + "step": 582 + }, + { + "epoch": 0.3729111697449428, + "grad_norm": 2.7012386322021484, + "learning_rate": 4.387689677281471e-05, + "loss": 0.835, + "step": 583 + }, + { + "epoch": 0.3735508115455345, + "grad_norm": 2.56974720954895, + "learning_rate": 4.38662107287882e-05, + "loss": 0.8503, + "step": 584 + }, + { + "epoch": 0.37419045334612616, + "grad_norm": 2.5912535190582275, + "learning_rate": 4.38555246847617e-05, + "loss": 0.8007, + "step": 585 + }, + { + "epoch": 0.37483009514671783, + "grad_norm": 2.491722345352173, + "learning_rate": 4.38448386407352e-05, + "loss": 0.8769, + "step": 586 + }, + { + "epoch": 0.3754697369473095, + "grad_norm": 2.7597715854644775, + "learning_rate": 4.38341525967087e-05, + "loss": 0.8429, + "step": 587 + }, + { + "epoch": 0.3761093787479012, + "grad_norm": 2.3039841651916504, + "learning_rate": 4.38234665526822e-05, + "loss": 0.7368, + "step": 588 + }, + { + "epoch": 0.37674902054849285, + "grad_norm": 2.0839362144470215, + "learning_rate": 4.38127805086557e-05, + "loss": 0.6954, + "step": 589 + }, + { + "epoch": 0.3773886623490845, + "grad_norm": 1.9851723909378052, + "learning_rate": 4.38020944646292e-05, + "loss": 0.6272, + "step": 590 + }, + { + "epoch": 0.3780283041496762, + "grad_norm": 2.592808723449707, + "learning_rate": 4.37914084206027e-05, + "loss": 0.8367, + "step": 591 + }, + { + "epoch": 0.37866794595026787, + "grad_norm": 2.5516512393951416, + "learning_rate": 4.37807223765762e-05, + "loss": 0.8028, + "step": 592 + }, + { + "epoch": 0.37930758775085954, + "grad_norm": 2.127110004425049, 
+ "learning_rate": 4.377003633254969e-05, + "loss": 0.6786, + "step": 593 + }, + { + "epoch": 0.37994722955145116, + "grad_norm": 2.6587941646575928, + "learning_rate": 4.3759350288523196e-05, + "loss": 0.8044, + "step": 594 + }, + { + "epoch": 0.38058687135204283, + "grad_norm": 2.694045305252075, + "learning_rate": 4.374866424449669e-05, + "loss": 0.8164, + "step": 595 + }, + { + "epoch": 0.3812265131526345, + "grad_norm": 2.5876169204711914, + "learning_rate": 4.373797820047019e-05, + "loss": 0.7456, + "step": 596 + }, + { + "epoch": 0.3818661549532262, + "grad_norm": 2.4754750728607178, + "learning_rate": 4.372729215644369e-05, + "loss": 0.7261, + "step": 597 + }, + { + "epoch": 0.38250579675381785, + "grad_norm": 2.9321911334991455, + "learning_rate": 4.371660611241719e-05, + "loss": 0.9218, + "step": 598 + }, + { + "epoch": 0.3831454385544095, + "grad_norm": 2.8330905437469482, + "learning_rate": 4.370592006839068e-05, + "loss": 0.9054, + "step": 599 + }, + { + "epoch": 0.3837850803550012, + "grad_norm": 2.649747371673584, + "learning_rate": 4.3695234024364185e-05, + "loss": 0.7467, + "step": 600 + }, + { + "epoch": 0.38442472215559287, + "grad_norm": 2.5304954051971436, + "learning_rate": 4.368454798033768e-05, + "loss": 0.8089, + "step": 601 + }, + { + "epoch": 0.38506436395618454, + "grad_norm": 2.575817108154297, + "learning_rate": 4.367386193631118e-05, + "loss": 0.7682, + "step": 602 + }, + { + "epoch": 0.3857040057567762, + "grad_norm": 3.011615753173828, + "learning_rate": 4.366317589228468e-05, + "loss": 0.8105, + "step": 603 + }, + { + "epoch": 0.3863436475573679, + "grad_norm": 2.668299674987793, + "learning_rate": 4.3652489848258176e-05, + "loss": 0.8447, + "step": 604 + }, + { + "epoch": 0.38698328935795956, + "grad_norm": 2.440359592437744, + "learning_rate": 4.3641803804231676e-05, + "loss": 0.7538, + "step": 605 + }, + { + "epoch": 0.38762293115855123, + "grad_norm": 2.5039072036743164, + "learning_rate": 4.3631117760205175e-05, + "loss": 0.7949, + "step": 606 + }, + { + "epoch": 0.3882625729591429, + "grad_norm": 2.30733585357666, + "learning_rate": 4.3620431716178674e-05, + "loss": 0.7126, + "step": 607 + }, + { + "epoch": 0.3889022147597345, + "grad_norm": 2.566895008087158, + "learning_rate": 4.360974567215217e-05, + "loss": 0.8112, + "step": 608 + }, + { + "epoch": 0.3895418565603262, + "grad_norm": 2.3991849422454834, + "learning_rate": 4.359905962812567e-05, + "loss": 0.7751, + "step": 609 + }, + { + "epoch": 0.39018149836091787, + "grad_norm": 2.2058398723602295, + "learning_rate": 4.3588373584099166e-05, + "loss": 0.6873, + "step": 610 + }, + { + "epoch": 0.39082114016150954, + "grad_norm": 2.379000425338745, + "learning_rate": 4.3577687540072665e-05, + "loss": 0.7854, + "step": 611 + }, + { + "epoch": 0.3914607819621012, + "grad_norm": 2.1002843379974365, + "learning_rate": 4.3567001496046164e-05, + "loss": 0.6344, + "step": 612 + }, + { + "epoch": 0.3921004237626929, + "grad_norm": 2.43048357963562, + "learning_rate": 4.3556315452019664e-05, + "loss": 0.8256, + "step": 613 + }, + { + "epoch": 0.39274006556328456, + "grad_norm": 2.9100863933563232, + "learning_rate": 4.354562940799316e-05, + "loss": 0.844, + "step": 614 + }, + { + "epoch": 0.39337970736387623, + "grad_norm": 2.8449697494506836, + "learning_rate": 4.353494336396666e-05, + "loss": 0.9032, + "step": 615 + }, + { + "epoch": 0.3940193491644679, + "grad_norm": 2.6021840572357178, + "learning_rate": 4.352425731994016e-05, + "loss": 0.8271, + "step": 616 + }, + { + "epoch": 0.3946589909650596, + 
"grad_norm": 2.6643600463867188, + "learning_rate": 4.351357127591366e-05, + "loss": 0.8255, + "step": 617 + }, + { + "epoch": 0.39529863276565125, + "grad_norm": 2.3905458450317383, + "learning_rate": 4.350288523188716e-05, + "loss": 0.7539, + "step": 618 + }, + { + "epoch": 0.3959382745662429, + "grad_norm": 2.3749024868011475, + "learning_rate": 4.349219918786065e-05, + "loss": 0.8191, + "step": 619 + }, + { + "epoch": 0.3965779163668346, + "grad_norm": 2.5093674659729004, + "learning_rate": 4.348151314383416e-05, + "loss": 0.7669, + "step": 620 + }, + { + "epoch": 0.39721755816742627, + "grad_norm": 2.4019482135772705, + "learning_rate": 4.347082709980765e-05, + "loss": 0.7744, + "step": 621 + }, + { + "epoch": 0.3978571999680179, + "grad_norm": 2.5259127616882324, + "learning_rate": 4.346014105578115e-05, + "loss": 0.8141, + "step": 622 + }, + { + "epoch": 0.39849684176860956, + "grad_norm": 2.394785165786743, + "learning_rate": 4.344945501175465e-05, + "loss": 0.8367, + "step": 623 + }, + { + "epoch": 0.39913648356920123, + "grad_norm": 2.8428847789764404, + "learning_rate": 4.343876896772815e-05, + "loss": 0.9098, + "step": 624 + }, + { + "epoch": 0.3997761253697929, + "grad_norm": 2.2933173179626465, + "learning_rate": 4.342808292370165e-05, + "loss": 0.8307, + "step": 625 + }, + { + "epoch": 0.4004157671703846, + "grad_norm": 2.549668073654175, + "learning_rate": 4.341739687967515e-05, + "loss": 0.799, + "step": 626 + }, + { + "epoch": 0.40105540897097625, + "grad_norm": 2.5822505950927734, + "learning_rate": 4.340671083564865e-05, + "loss": 0.9867, + "step": 627 + }, + { + "epoch": 0.4016950507715679, + "grad_norm": 2.5839316844940186, + "learning_rate": 4.339602479162214e-05, + "loss": 0.7672, + "step": 628 + }, + { + "epoch": 0.4023346925721596, + "grad_norm": 2.860914468765259, + "learning_rate": 4.338533874759565e-05, + "loss": 0.9099, + "step": 629 + }, + { + "epoch": 0.40297433437275126, + "grad_norm": 2.6417458057403564, + "learning_rate": 4.337465270356914e-05, + "loss": 0.8955, + "step": 630 + }, + { + "epoch": 0.40361397617334294, + "grad_norm": 2.3838489055633545, + "learning_rate": 4.336396665954264e-05, + "loss": 0.7301, + "step": 631 + }, + { + "epoch": 0.4042536179739346, + "grad_norm": 2.394200325012207, + "learning_rate": 4.335328061551614e-05, + "loss": 0.7594, + "step": 632 + }, + { + "epoch": 0.4048932597745263, + "grad_norm": 2.947063684463501, + "learning_rate": 4.334259457148964e-05, + "loss": 0.9043, + "step": 633 + }, + { + "epoch": 0.40553290157511795, + "grad_norm": 2.514294147491455, + "learning_rate": 4.333190852746313e-05, + "loss": 0.8195, + "step": 634 + }, + { + "epoch": 0.4061725433757096, + "grad_norm": 2.889430284500122, + "learning_rate": 4.3321222483436636e-05, + "loss": 0.821, + "step": 635 + }, + { + "epoch": 0.40681218517630124, + "grad_norm": 2.493847370147705, + "learning_rate": 4.3310536439410135e-05, + "loss": 0.8217, + "step": 636 + }, + { + "epoch": 0.4074518269768929, + "grad_norm": 2.4664089679718018, + "learning_rate": 4.329985039538363e-05, + "loss": 0.7581, + "step": 637 + }, + { + "epoch": 0.4080914687774846, + "grad_norm": 2.623796224594116, + "learning_rate": 4.3289164351357134e-05, + "loss": 0.9139, + "step": 638 + }, + { + "epoch": 0.40873111057807626, + "grad_norm": 2.465691328048706, + "learning_rate": 4.327847830733063e-05, + "loss": 0.8096, + "step": 639 + }, + { + "epoch": 0.40937075237866793, + "grad_norm": 2.3869857788085938, + "learning_rate": 4.3267792263304126e-05, + "loss": 0.7679, + "step": 640 + }, + { + 
"epoch": 0.4100103941792596, + "grad_norm": 2.521512985229492, + "learning_rate": 4.3257106219277625e-05, + "loss": 0.7575, + "step": 641 + }, + { + "epoch": 0.4106500359798513, + "grad_norm": 2.2285306453704834, + "learning_rate": 4.3246420175251125e-05, + "loss": 0.6798, + "step": 642 + }, + { + "epoch": 0.41128967778044295, + "grad_norm": 2.48858904838562, + "learning_rate": 4.323573413122462e-05, + "loss": 0.7829, + "step": 643 + }, + { + "epoch": 0.4119293195810346, + "grad_norm": 2.2363147735595703, + "learning_rate": 4.3225048087198124e-05, + "loss": 0.8079, + "step": 644 + }, + { + "epoch": 0.4125689613816263, + "grad_norm": 2.4952003955841064, + "learning_rate": 4.3214362043171616e-05, + "loss": 0.8183, + "step": 645 + }, + { + "epoch": 0.41320860318221797, + "grad_norm": 2.6612093448638916, + "learning_rate": 4.320367599914512e-05, + "loss": 0.9394, + "step": 646 + }, + { + "epoch": 0.41384824498280964, + "grad_norm": 2.4674930572509766, + "learning_rate": 4.3192989955118615e-05, + "loss": 0.7378, + "step": 647 + }, + { + "epoch": 0.4144878867834013, + "grad_norm": 2.1883046627044678, + "learning_rate": 4.3182303911092114e-05, + "loss": 0.6946, + "step": 648 + }, + { + "epoch": 0.415127528583993, + "grad_norm": 2.3450024127960205, + "learning_rate": 4.3171617867065614e-05, + "loss": 0.6772, + "step": 649 + }, + { + "epoch": 0.4157671703845846, + "grad_norm": 2.1584982872009277, + "learning_rate": 4.316093182303911e-05, + "loss": 0.6317, + "step": 650 + }, + { + "epoch": 0.4164068121851763, + "grad_norm": 2.2315256595611572, + "learning_rate": 4.315024577901261e-05, + "loss": 0.6715, + "step": 651 + }, + { + "epoch": 0.41704645398576795, + "grad_norm": 2.278326988220215, + "learning_rate": 4.313955973498611e-05, + "loss": 0.6759, + "step": 652 + }, + { + "epoch": 0.4176860957863596, + "grad_norm": 2.892369031906128, + "learning_rate": 4.312887369095961e-05, + "loss": 0.8931, + "step": 653 + }, + { + "epoch": 0.4183257375869513, + "grad_norm": 2.7841877937316895, + "learning_rate": 4.3118187646933104e-05, + "loss": 0.7409, + "step": 654 + }, + { + "epoch": 0.41896537938754297, + "grad_norm": 2.8826382160186768, + "learning_rate": 4.310750160290661e-05, + "loss": 0.8583, + "step": 655 + }, + { + "epoch": 0.41960502118813464, + "grad_norm": 2.4668517112731934, + "learning_rate": 4.30968155588801e-05, + "loss": 0.7036, + "step": 656 + }, + { + "epoch": 0.4202446629887263, + "grad_norm": 2.5809171199798584, + "learning_rate": 4.30861295148536e-05, + "loss": 0.8181, + "step": 657 + }, + { + "epoch": 0.420884304789318, + "grad_norm": 2.7291839122772217, + "learning_rate": 4.30754434708271e-05, + "loss": 0.8102, + "step": 658 + }, + { + "epoch": 0.42152394658990966, + "grad_norm": 2.2775955200195312, + "learning_rate": 4.30647574268006e-05, + "loss": 0.6718, + "step": 659 + }, + { + "epoch": 0.42216358839050133, + "grad_norm": 2.565253734588623, + "learning_rate": 4.30540713827741e-05, + "loss": 0.8058, + "step": 660 + }, + { + "epoch": 0.422803230191093, + "grad_norm": 2.368832588195801, + "learning_rate": 4.30433853387476e-05, + "loss": 0.7757, + "step": 661 + }, + { + "epoch": 0.4234428719916847, + "grad_norm": 2.6588382720947266, + "learning_rate": 4.30326992947211e-05, + "loss": 0.842, + "step": 662 + }, + { + "epoch": 0.42408251379227635, + "grad_norm": 2.6485090255737305, + "learning_rate": 4.302201325069459e-05, + "loss": 0.8455, + "step": 663 + }, + { + "epoch": 0.42472215559286797, + "grad_norm": 2.6250391006469727, + "learning_rate": 4.30113272066681e-05, + "loss": 0.866, + 
"step": 664 + }, + { + "epoch": 0.42536179739345964, + "grad_norm": 2.5179715156555176, + "learning_rate": 4.300064116264159e-05, + "loss": 0.795, + "step": 665 + }, + { + "epoch": 0.4260014391940513, + "grad_norm": 2.139284372329712, + "learning_rate": 4.298995511861509e-05, + "loss": 0.6789, + "step": 666 + }, + { + "epoch": 0.426641080994643, + "grad_norm": 2.2436509132385254, + "learning_rate": 4.297926907458859e-05, + "loss": 0.7285, + "step": 667 + }, + { + "epoch": 0.42728072279523466, + "grad_norm": 2.4669735431671143, + "learning_rate": 4.296858303056209e-05, + "loss": 0.7738, + "step": 668 + }, + { + "epoch": 0.42792036459582633, + "grad_norm": 2.4628779888153076, + "learning_rate": 4.295789698653559e-05, + "loss": 0.8483, + "step": 669 + }, + { + "epoch": 0.428560006396418, + "grad_norm": 2.321608543395996, + "learning_rate": 4.2947210942509087e-05, + "loss": 0.756, + "step": 670 + }, + { + "epoch": 0.4291996481970097, + "grad_norm": 2.641051769256592, + "learning_rate": 4.2936524898482586e-05, + "loss": 0.7812, + "step": 671 + }, + { + "epoch": 0.42983928999760135, + "grad_norm": 2.4621758460998535, + "learning_rate": 4.292583885445608e-05, + "loss": 0.8161, + "step": 672 + }, + { + "epoch": 0.430478931798193, + "grad_norm": 2.2134177684783936, + "learning_rate": 4.2915152810429585e-05, + "loss": 0.6612, + "step": 673 + }, + { + "epoch": 0.4311185735987847, + "grad_norm": 2.3274316787719727, + "learning_rate": 4.290446676640308e-05, + "loss": 0.7569, + "step": 674 + }, + { + "epoch": 0.43175821539937637, + "grad_norm": 2.4170377254486084, + "learning_rate": 4.2893780722376583e-05, + "loss": 0.891, + "step": 675 + }, + { + "epoch": 0.43239785719996804, + "grad_norm": 2.400930643081665, + "learning_rate": 4.2883094678350076e-05, + "loss": 0.7742, + "step": 676 + }, + { + "epoch": 0.4330374990005597, + "grad_norm": 2.357038974761963, + "learning_rate": 4.2872408634323575e-05, + "loss": 0.7248, + "step": 677 + }, + { + "epoch": 0.43367714080115133, + "grad_norm": 2.403651714324951, + "learning_rate": 4.2861722590297075e-05, + "loss": 0.8527, + "step": 678 + }, + { + "epoch": 0.434316782601743, + "grad_norm": 2.695835590362549, + "learning_rate": 4.2851036546270574e-05, + "loss": 0.7556, + "step": 679 + }, + { + "epoch": 0.4349564244023347, + "grad_norm": 2.6627137660980225, + "learning_rate": 4.284035050224407e-05, + "loss": 0.8369, + "step": 680 + }, + { + "epoch": 0.43559606620292635, + "grad_norm": 2.368640661239624, + "learning_rate": 4.282966445821757e-05, + "loss": 0.6754, + "step": 681 + }, + { + "epoch": 0.436235708003518, + "grad_norm": 2.3832530975341797, + "learning_rate": 4.281897841419107e-05, + "loss": 0.7884, + "step": 682 + }, + { + "epoch": 0.4368753498041097, + "grad_norm": 2.2132256031036377, + "learning_rate": 4.2808292370164565e-05, + "loss": 0.7255, + "step": 683 + }, + { + "epoch": 0.43751499160470136, + "grad_norm": 2.579366683959961, + "learning_rate": 4.279760632613807e-05, + "loss": 0.841, + "step": 684 + }, + { + "epoch": 0.43815463340529304, + "grad_norm": 2.5500547885894775, + "learning_rate": 4.2786920282111563e-05, + "loss": 0.8392, + "step": 685 + }, + { + "epoch": 0.4387942752058847, + "grad_norm": 2.4645004272460938, + "learning_rate": 4.277623423808506e-05, + "loss": 0.684, + "step": 686 + }, + { + "epoch": 0.4394339170064764, + "grad_norm": 2.4809513092041016, + "learning_rate": 4.276554819405856e-05, + "loss": 0.684, + "step": 687 + }, + { + "epoch": 0.44007355880706805, + "grad_norm": 2.3200199604034424, + "learning_rate": 
4.275486215003206e-05, + "loss": 0.7441, + "step": 688 + }, + { + "epoch": 0.4407132006076597, + "grad_norm": 2.687345027923584, + "learning_rate": 4.2744176106005554e-05, + "loss": 0.8809, + "step": 689 + }, + { + "epoch": 0.4413528424082514, + "grad_norm": 2.7118911743164062, + "learning_rate": 4.273349006197906e-05, + "loss": 0.786, + "step": 690 + }, + { + "epoch": 0.4419924842088431, + "grad_norm": 2.361036539077759, + "learning_rate": 4.272280401795255e-05, + "loss": 0.7542, + "step": 691 + }, + { + "epoch": 0.4426321260094347, + "grad_norm": 2.331491231918335, + "learning_rate": 4.271211797392605e-05, + "loss": 0.7253, + "step": 692 + }, + { + "epoch": 0.44327176781002636, + "grad_norm": 2.825608015060425, + "learning_rate": 4.270143192989955e-05, + "loss": 0.815, + "step": 693 + }, + { + "epoch": 0.44391140961061804, + "grad_norm": 2.79476261138916, + "learning_rate": 4.269074588587305e-05, + "loss": 0.7544, + "step": 694 + }, + { + "epoch": 0.4445510514112097, + "grad_norm": 2.341146945953369, + "learning_rate": 4.268005984184655e-05, + "loss": 0.7679, + "step": 695 + }, + { + "epoch": 0.4451906932118014, + "grad_norm": 2.362579584121704, + "learning_rate": 4.266937379782005e-05, + "loss": 0.8176, + "step": 696 + }, + { + "epoch": 0.44583033501239305, + "grad_norm": 2.415343999862671, + "learning_rate": 4.265868775379355e-05, + "loss": 0.772, + "step": 697 + }, + { + "epoch": 0.4464699768129847, + "grad_norm": 2.3519582748413086, + "learning_rate": 4.264800170976704e-05, + "loss": 0.7768, + "step": 698 + }, + { + "epoch": 0.4471096186135764, + "grad_norm": 2.468914270401001, + "learning_rate": 4.263731566574055e-05, + "loss": 0.8066, + "step": 699 + }, + { + "epoch": 0.44774926041416807, + "grad_norm": 2.216883420944214, + "learning_rate": 4.262662962171404e-05, + "loss": 0.741, + "step": 700 + }, + { + "epoch": 0.44838890221475974, + "grad_norm": 2.224919319152832, + "learning_rate": 4.2615943577687546e-05, + "loss": 0.7428, + "step": 701 + }, + { + "epoch": 0.4490285440153514, + "grad_norm": 2.4262194633483887, + "learning_rate": 4.260525753366104e-05, + "loss": 0.7415, + "step": 702 + }, + { + "epoch": 0.4496681858159431, + "grad_norm": 2.3034839630126953, + "learning_rate": 4.259457148963454e-05, + "loss": 0.7152, + "step": 703 + }, + { + "epoch": 0.45030782761653476, + "grad_norm": 2.5050270557403564, + "learning_rate": 4.258388544560804e-05, + "loss": 0.8726, + "step": 704 + }, + { + "epoch": 0.45094746941712643, + "grad_norm": 2.637089252471924, + "learning_rate": 4.257319940158154e-05, + "loss": 0.7758, + "step": 705 + }, + { + "epoch": 0.45158711121771805, + "grad_norm": 2.314626455307007, + "learning_rate": 4.2562513357555037e-05, + "loss": 0.8125, + "step": 706 + }, + { + "epoch": 0.4522267530183097, + "grad_norm": 2.5563905239105225, + "learning_rate": 4.2551827313528536e-05, + "loss": 0.8101, + "step": 707 + }, + { + "epoch": 0.4528663948189014, + "grad_norm": 2.3011550903320312, + "learning_rate": 4.2541141269502035e-05, + "loss": 0.7487, + "step": 708 + }, + { + "epoch": 0.45350603661949307, + "grad_norm": 2.784237861633301, + "learning_rate": 4.253045522547553e-05, + "loss": 0.8605, + "step": 709 + }, + { + "epoch": 0.45414567842008474, + "grad_norm": 2.527440309524536, + "learning_rate": 4.2519769181449034e-05, + "loss": 0.7039, + "step": 710 + }, + { + "epoch": 0.4547853202206764, + "grad_norm": 2.402970314025879, + "learning_rate": 4.2509083137422527e-05, + "loss": 0.802, + "step": 711 + }, + { + "epoch": 0.4554249620212681, + "grad_norm": 2.44575572013855, + 
"learning_rate": 4.2498397093396026e-05, + "loss": 0.8497, + "step": 712 + }, + { + "epoch": 0.45606460382185976, + "grad_norm": 2.604088068008423, + "learning_rate": 4.2487711049369525e-05, + "loss": 0.8716, + "step": 713 + }, + { + "epoch": 0.45670424562245143, + "grad_norm": 2.4849510192871094, + "learning_rate": 4.2477025005343025e-05, + "loss": 0.8625, + "step": 714 + }, + { + "epoch": 0.4573438874230431, + "grad_norm": 2.149947166442871, + "learning_rate": 4.2466338961316524e-05, + "loss": 0.6496, + "step": 715 + }, + { + "epoch": 0.4579835292236348, + "grad_norm": 2.463686227798462, + "learning_rate": 4.245565291729002e-05, + "loss": 0.8031, + "step": 716 + }, + { + "epoch": 0.45862317102422645, + "grad_norm": 2.275045156478882, + "learning_rate": 4.244496687326352e-05, + "loss": 0.7826, + "step": 717 + }, + { + "epoch": 0.4592628128248181, + "grad_norm": 2.113935947418213, + "learning_rate": 4.2434280829237015e-05, + "loss": 0.6638, + "step": 718 + }, + { + "epoch": 0.4599024546254098, + "grad_norm": 2.5344903469085693, + "learning_rate": 4.242359478521052e-05, + "loss": 0.8024, + "step": 719 + }, + { + "epoch": 0.4605420964260014, + "grad_norm": 2.180123805999756, + "learning_rate": 4.2412908741184014e-05, + "loss": 0.6937, + "step": 720 + }, + { + "epoch": 0.4611817382265931, + "grad_norm": 2.5835466384887695, + "learning_rate": 4.2402222697157513e-05, + "loss": 0.8583, + "step": 721 + }, + { + "epoch": 0.46182138002718476, + "grad_norm": 2.340942144393921, + "learning_rate": 4.239153665313101e-05, + "loss": 0.7583, + "step": 722 + }, + { + "epoch": 0.46246102182777643, + "grad_norm": 2.4271631240844727, + "learning_rate": 4.238085060910451e-05, + "loss": 0.7599, + "step": 723 + }, + { + "epoch": 0.4631006636283681, + "grad_norm": 2.501920223236084, + "learning_rate": 4.2370164565078005e-05, + "loss": 0.8234, + "step": 724 + }, + { + "epoch": 0.4637403054289598, + "grad_norm": 2.327273368835449, + "learning_rate": 4.235947852105151e-05, + "loss": 0.7222, + "step": 725 + }, + { + "epoch": 0.46437994722955145, + "grad_norm": 2.3421308994293213, + "learning_rate": 4.2348792477025003e-05, + "loss": 0.7665, + "step": 726 + }, + { + "epoch": 0.4650195890301431, + "grad_norm": 2.3444700241088867, + "learning_rate": 4.23381064329985e-05, + "loss": 0.7521, + "step": 727 + }, + { + "epoch": 0.4656592308307348, + "grad_norm": 2.665940284729004, + "learning_rate": 4.232742038897201e-05, + "loss": 0.9051, + "step": 728 + }, + { + "epoch": 0.46629887263132647, + "grad_norm": 2.394313097000122, + "learning_rate": 4.23167343449455e-05, + "loss": 0.7246, + "step": 729 + }, + { + "epoch": 0.46693851443191814, + "grad_norm": 2.325242280960083, + "learning_rate": 4.230604830091901e-05, + "loss": 0.7279, + "step": 730 + }, + { + "epoch": 0.4675781562325098, + "grad_norm": 2.416950225830078, + "learning_rate": 4.22953622568925e-05, + "loss": 0.7872, + "step": 731 + }, + { + "epoch": 0.4682177980331015, + "grad_norm": 2.328209400177002, + "learning_rate": 4.2284676212866e-05, + "loss": 0.7682, + "step": 732 + }, + { + "epoch": 0.46885743983369316, + "grad_norm": 2.7438161373138428, + "learning_rate": 4.22739901688395e-05, + "loss": 0.8456, + "step": 733 + }, + { + "epoch": 0.4694970816342848, + "grad_norm": 2.4973058700561523, + "learning_rate": 4.2263304124813e-05, + "loss": 0.7679, + "step": 734 + }, + { + "epoch": 0.47013672343487645, + "grad_norm": 2.284014940261841, + "learning_rate": 4.225261808078649e-05, + "loss": 0.6976, + "step": 735 + }, + { + "epoch": 0.4707763652354681, + "grad_norm": 
2.4707276821136475, + "learning_rate": 4.224193203676e-05, + "loss": 0.8378, + "step": 736 + }, + { + "epoch": 0.4714160070360598, + "grad_norm": 2.4387924671173096, + "learning_rate": 4.223124599273349e-05, + "loss": 0.8649, + "step": 737 + }, + { + "epoch": 0.47205564883665146, + "grad_norm": 2.16929030418396, + "learning_rate": 4.222055994870699e-05, + "loss": 0.7006, + "step": 738 + }, + { + "epoch": 0.47269529063724314, + "grad_norm": 2.5939579010009766, + "learning_rate": 4.220987390468049e-05, + "loss": 0.8128, + "step": 739 + }, + { + "epoch": 0.4733349324378348, + "grad_norm": 2.0849668979644775, + "learning_rate": 4.219918786065399e-05, + "loss": 0.5965, + "step": 740 + }, + { + "epoch": 0.4739745742384265, + "grad_norm": 2.613360643386841, + "learning_rate": 4.218850181662749e-05, + "loss": 0.8357, + "step": 741 + }, + { + "epoch": 0.47461421603901816, + "grad_norm": 2.1418871879577637, + "learning_rate": 4.2177815772600986e-05, + "loss": 0.6704, + "step": 742 + }, + { + "epoch": 0.47525385783960983, + "grad_norm": 2.3548357486724854, + "learning_rate": 4.2167129728574486e-05, + "loss": 0.787, + "step": 743 + }, + { + "epoch": 0.4758934996402015, + "grad_norm": 2.6763994693756104, + "learning_rate": 4.215644368454798e-05, + "loss": 0.8784, + "step": 744 + }, + { + "epoch": 0.4765331414407932, + "grad_norm": 2.7560031414031982, + "learning_rate": 4.2145757640521485e-05, + "loss": 0.8917, + "step": 745 + }, + { + "epoch": 0.47717278324138485, + "grad_norm": 2.253741979598999, + "learning_rate": 4.213507159649498e-05, + "loss": 0.8011, + "step": 746 + }, + { + "epoch": 0.4778124250419765, + "grad_norm": 2.534224271774292, + "learning_rate": 4.2124385552468476e-05, + "loss": 0.796, + "step": 747 + }, + { + "epoch": 0.47845206684256814, + "grad_norm": 2.5827114582061768, + "learning_rate": 4.2113699508441976e-05, + "loss": 0.8557, + "step": 748 + }, + { + "epoch": 0.4790917086431598, + "grad_norm": 2.3602304458618164, + "learning_rate": 4.2103013464415475e-05, + "loss": 0.8351, + "step": 749 + }, + { + "epoch": 0.4797313504437515, + "grad_norm": 2.4108176231384277, + "learning_rate": 4.2092327420388975e-05, + "loss": 0.729, + "step": 750 + }, + { + "epoch": 0.48037099224434315, + "grad_norm": 2.093555450439453, + "learning_rate": 4.2081641376362474e-05, + "loss": 0.6602, + "step": 751 + }, + { + "epoch": 0.4810106340449348, + "grad_norm": 2.5413529872894287, + "learning_rate": 4.207095533233597e-05, + "loss": 0.8545, + "step": 752 + }, + { + "epoch": 0.4816502758455265, + "grad_norm": 2.3444695472717285, + "learning_rate": 4.2060269288309466e-05, + "loss": 0.6782, + "step": 753 + }, + { + "epoch": 0.48228991764611817, + "grad_norm": 2.5395138263702393, + "learning_rate": 4.204958324428297e-05, + "loss": 0.8499, + "step": 754 + }, + { + "epoch": 0.48292955944670984, + "grad_norm": 2.4707767963409424, + "learning_rate": 4.2038897200256465e-05, + "loss": 0.721, + "step": 755 + }, + { + "epoch": 0.4835692012473015, + "grad_norm": 2.2747201919555664, + "learning_rate": 4.2028211156229964e-05, + "loss": 0.7129, + "step": 756 + }, + { + "epoch": 0.4842088430478932, + "grad_norm": 2.6218323707580566, + "learning_rate": 4.201752511220346e-05, + "loss": 0.7978, + "step": 757 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 2.5273592472076416, + "learning_rate": 4.200683906817696e-05, + "loss": 0.745, + "step": 758 + }, + { + "epoch": 0.48548812664907653, + "grad_norm": 2.452383041381836, + "learning_rate": 4.199615302415046e-05, + "loss": 0.8259, + "step": 759 + }, + { + "epoch": 
0.4861277684496682, + "grad_norm": 2.210493803024292, + "learning_rate": 4.198546698012396e-05, + "loss": 0.7466, + "step": 760 + }, + { + "epoch": 0.4867674102502599, + "grad_norm": 2.46187686920166, + "learning_rate": 4.197478093609746e-05, + "loss": 0.7492, + "step": 761 + }, + { + "epoch": 0.4874070520508515, + "grad_norm": 2.3991000652313232, + "learning_rate": 4.196409489207096e-05, + "loss": 0.7425, + "step": 762 + }, + { + "epoch": 0.48804669385144317, + "grad_norm": 2.3745670318603516, + "learning_rate": 4.195340884804446e-05, + "loss": 0.8105, + "step": 763 + }, + { + "epoch": 0.48868633565203484, + "grad_norm": 2.2570762634277344, + "learning_rate": 4.194272280401795e-05, + "loss": 0.7296, + "step": 764 + }, + { + "epoch": 0.4893259774526265, + "grad_norm": 2.485466480255127, + "learning_rate": 4.193203675999146e-05, + "loss": 0.8015, + "step": 765 + }, + { + "epoch": 0.4899656192532182, + "grad_norm": 2.5229861736297607, + "learning_rate": 4.192135071596495e-05, + "loss": 0.8643, + "step": 766 + }, + { + "epoch": 0.49060526105380986, + "grad_norm": 2.723752975463867, + "learning_rate": 4.191066467193845e-05, + "loss": 0.9526, + "step": 767 + }, + { + "epoch": 0.49124490285440153, + "grad_norm": 2.643864154815674, + "learning_rate": 4.189997862791195e-05, + "loss": 0.8067, + "step": 768 + }, + { + "epoch": 0.4918845446549932, + "grad_norm": 2.9591357707977295, + "learning_rate": 4.188929258388545e-05, + "loss": 0.9238, + "step": 769 + }, + { + "epoch": 0.4925241864555849, + "grad_norm": 2.3780622482299805, + "learning_rate": 4.187860653985894e-05, + "loss": 0.7758, + "step": 770 + }, + { + "epoch": 0.49316382825617655, + "grad_norm": 2.1047439575195312, + "learning_rate": 4.186792049583245e-05, + "loss": 0.7042, + "step": 771 + }, + { + "epoch": 0.4938034700567682, + "grad_norm": 2.330016851425171, + "learning_rate": 4.185723445180594e-05, + "loss": 0.7378, + "step": 772 + }, + { + "epoch": 0.4944431118573599, + "grad_norm": 2.4735641479492188, + "learning_rate": 4.184654840777944e-05, + "loss": 0.8509, + "step": 773 + }, + { + "epoch": 0.49508275365795157, + "grad_norm": 2.3342504501342773, + "learning_rate": 4.1835862363752946e-05, + "loss": 0.7791, + "step": 774 + }, + { + "epoch": 0.49572239545854324, + "grad_norm": 2.426173686981201, + "learning_rate": 4.182517631972644e-05, + "loss": 0.7744, + "step": 775 + }, + { + "epoch": 0.49636203725913486, + "grad_norm": 2.3046963214874268, + "learning_rate": 4.181449027569994e-05, + "loss": 0.7835, + "step": 776 + }, + { + "epoch": 0.49700167905972653, + "grad_norm": 2.3580424785614014, + "learning_rate": 4.180380423167344e-05, + "loss": 0.7936, + "step": 777 + }, + { + "epoch": 0.4976413208603182, + "grad_norm": 2.607560873031616, + "learning_rate": 4.1793118187646936e-05, + "loss": 0.8767, + "step": 778 + }, + { + "epoch": 0.4982809626609099, + "grad_norm": 2.2512693405151367, + "learning_rate": 4.178243214362043e-05, + "loss": 0.6273, + "step": 779 + }, + { + "epoch": 0.49892060446150155, + "grad_norm": 2.3312618732452393, + "learning_rate": 4.1771746099593935e-05, + "loss": 0.8266, + "step": 780 + }, + { + "epoch": 0.4995602462620932, + "grad_norm": 2.2288873195648193, + "learning_rate": 4.176106005556743e-05, + "loss": 0.7246, + "step": 781 + }, + { + "epoch": 0.5001998880626849, + "grad_norm": 2.559891939163208, + "learning_rate": 4.175037401154093e-05, + "loss": 0.8888, + "step": 782 + }, + { + "epoch": 0.5008395298632765, + "grad_norm": 2.577299118041992, + "learning_rate": 4.1739687967514426e-05, + "loss": 0.833, + "step": 
783 + }, + { + "epoch": 0.5014791716638682, + "grad_norm": 2.1614084243774414, + "learning_rate": 4.1729001923487926e-05, + "loss": 0.6953, + "step": 784 + }, + { + "epoch": 0.5021188134644599, + "grad_norm": 2.394045114517212, + "learning_rate": 4.1718315879461425e-05, + "loss": 0.7687, + "step": 785 + }, + { + "epoch": 0.5027584552650516, + "grad_norm": 2.584552049636841, + "learning_rate": 4.1707629835434924e-05, + "loss": 0.8308, + "step": 786 + }, + { + "epoch": 0.5033980970656432, + "grad_norm": 2.7457361221313477, + "learning_rate": 4.1696943791408424e-05, + "loss": 0.9089, + "step": 787 + }, + { + "epoch": 0.5040377388662349, + "grad_norm": 2.0862643718719482, + "learning_rate": 4.168625774738192e-05, + "loss": 0.7044, + "step": 788 + }, + { + "epoch": 0.5046773806668265, + "grad_norm": 2.2941691875457764, + "learning_rate": 4.167557170335542e-05, + "loss": 0.736, + "step": 789 + }, + { + "epoch": 0.5053170224674183, + "grad_norm": 2.494553565979004, + "learning_rate": 4.1664885659328915e-05, + "loss": 0.8029, + "step": 790 + }, + { + "epoch": 0.5059566642680099, + "grad_norm": 2.7531251907348633, + "learning_rate": 4.165419961530242e-05, + "loss": 0.7177, + "step": 791 + }, + { + "epoch": 0.5065963060686016, + "grad_norm": 2.6599185466766357, + "learning_rate": 4.1643513571275914e-05, + "loss": 0.7248, + "step": 792 + }, + { + "epoch": 0.5072359478691932, + "grad_norm": 2.495180130004883, + "learning_rate": 4.163282752724941e-05, + "loss": 0.7985, + "step": 793 + }, + { + "epoch": 0.507875589669785, + "grad_norm": 2.3413760662078857, + "learning_rate": 4.162214148322291e-05, + "loss": 0.6592, + "step": 794 + }, + { + "epoch": 0.5085152314703766, + "grad_norm": 2.274296760559082, + "learning_rate": 4.161145543919641e-05, + "loss": 0.6942, + "step": 795 + }, + { + "epoch": 0.5091548732709683, + "grad_norm": 2.517688035964966, + "learning_rate": 4.160076939516991e-05, + "loss": 0.7514, + "step": 796 + }, + { + "epoch": 0.5097945150715599, + "grad_norm": 2.3168115615844727, + "learning_rate": 4.159008335114341e-05, + "loss": 0.7295, + "step": 797 + }, + { + "epoch": 0.5104341568721515, + "grad_norm": 2.2231526374816895, + "learning_rate": 4.157939730711691e-05, + "loss": 0.6427, + "step": 798 + }, + { + "epoch": 0.5110737986727433, + "grad_norm": 2.4800660610198975, + "learning_rate": 4.15687112630904e-05, + "loss": 0.7576, + "step": 799 + }, + { + "epoch": 0.5117134404733349, + "grad_norm": 2.3524868488311768, + "learning_rate": 4.155802521906391e-05, + "loss": 0.7715, + "step": 800 + }, + { + "epoch": 0.5123530822739266, + "grad_norm": 2.6067728996276855, + "learning_rate": 4.15473391750374e-05, + "loss": 0.8254, + "step": 801 + }, + { + "epoch": 0.5129927240745182, + "grad_norm": 2.444256067276001, + "learning_rate": 4.15366531310109e-05, + "loss": 0.7808, + "step": 802 + }, + { + "epoch": 0.51363236587511, + "grad_norm": 2.729552745819092, + "learning_rate": 4.15259670869844e-05, + "loss": 0.8508, + "step": 803 + }, + { + "epoch": 0.5142720076757016, + "grad_norm": 2.523963451385498, + "learning_rate": 4.15152810429579e-05, + "loss": 0.8453, + "step": 804 + }, + { + "epoch": 0.5149116494762933, + "grad_norm": 2.5863211154937744, + "learning_rate": 4.150459499893139e-05, + "loss": 0.8191, + "step": 805 + }, + { + "epoch": 0.5155512912768849, + "grad_norm": 2.0983970165252686, + "learning_rate": 4.14939089549049e-05, + "loss": 0.7109, + "step": 806 + }, + { + "epoch": 0.5161909330774767, + "grad_norm": 2.326378345489502, + "learning_rate": 4.14832229108784e-05, + "loss": 0.7702, + 
"step": 807 + }, + { + "epoch": 0.5168305748780683, + "grad_norm": 2.384821891784668, + "learning_rate": 4.147253686685189e-05, + "loss": 0.775, + "step": 808 + }, + { + "epoch": 0.51747021667866, + "grad_norm": 2.3771309852600098, + "learning_rate": 4.1461850822825396e-05, + "loss": 0.6902, + "step": 809 + }, + { + "epoch": 0.5181098584792516, + "grad_norm": 2.3961310386657715, + "learning_rate": 4.145116477879889e-05, + "loss": 0.6649, + "step": 810 + }, + { + "epoch": 0.5187495002798432, + "grad_norm": 2.2681758403778076, + "learning_rate": 4.144047873477239e-05, + "loss": 0.7042, + "step": 811 + }, + { + "epoch": 0.519389142080435, + "grad_norm": 2.1639442443847656, + "learning_rate": 4.142979269074589e-05, + "loss": 0.6832, + "step": 812 + }, + { + "epoch": 0.5200287838810266, + "grad_norm": 2.4273269176483154, + "learning_rate": 4.141910664671939e-05, + "loss": 0.787, + "step": 813 + }, + { + "epoch": 0.5206684256816183, + "grad_norm": 2.7979812622070312, + "learning_rate": 4.1408420602692886e-05, + "loss": 0.9597, + "step": 814 + }, + { + "epoch": 0.5213080674822099, + "grad_norm": 2.4153497219085693, + "learning_rate": 4.1397734558666386e-05, + "loss": 0.8146, + "step": 815 + }, + { + "epoch": 0.5219477092828017, + "grad_norm": 2.2426140308380127, + "learning_rate": 4.138704851463988e-05, + "loss": 0.7293, + "step": 816 + }, + { + "epoch": 0.5225873510833933, + "grad_norm": 2.6007134914398193, + "learning_rate": 4.1376362470613384e-05, + "loss": 0.8676, + "step": 817 + }, + { + "epoch": 0.523226992883985, + "grad_norm": 2.4803342819213867, + "learning_rate": 4.136567642658688e-05, + "loss": 0.7309, + "step": 818 + }, + { + "epoch": 0.5238666346845766, + "grad_norm": 2.0530762672424316, + "learning_rate": 4.1354990382560376e-05, + "loss": 0.6582, + "step": 819 + }, + { + "epoch": 0.5245062764851683, + "grad_norm": 2.6229400634765625, + "learning_rate": 4.1344304338533876e-05, + "loss": 0.9341, + "step": 820 + }, + { + "epoch": 0.52514591828576, + "grad_norm": 2.2598581314086914, + "learning_rate": 4.1333618294507375e-05, + "loss": 0.7169, + "step": 821 + }, + { + "epoch": 0.5257855600863517, + "grad_norm": 2.365225076675415, + "learning_rate": 4.1322932250480874e-05, + "loss": 0.834, + "step": 822 + }, + { + "epoch": 0.5264252018869433, + "grad_norm": 2.0500223636627197, + "learning_rate": 4.1312246206454374e-05, + "loss": 0.6791, + "step": 823 + }, + { + "epoch": 0.527064843687535, + "grad_norm": 2.4313559532165527, + "learning_rate": 4.130156016242787e-05, + "loss": 0.7526, + "step": 824 + }, + { + "epoch": 0.5277044854881267, + "grad_norm": 2.3353288173675537, + "learning_rate": 4.1290874118401366e-05, + "loss": 0.7322, + "step": 825 + }, + { + "epoch": 0.5283441272887183, + "grad_norm": 2.41013503074646, + "learning_rate": 4.128018807437487e-05, + "loss": 0.797, + "step": 826 + }, + { + "epoch": 0.52898376908931, + "grad_norm": 2.5067005157470703, + "learning_rate": 4.1269502030348364e-05, + "loss": 0.7427, + "step": 827 + }, + { + "epoch": 0.5296234108899016, + "grad_norm": 2.6221978664398193, + "learning_rate": 4.1258815986321864e-05, + "loss": 0.8642, + "step": 828 + }, + { + "epoch": 0.5302630526904933, + "grad_norm": 2.5964040756225586, + "learning_rate": 4.124812994229536e-05, + "loss": 0.8008, + "step": 829 + }, + { + "epoch": 0.530902694491085, + "grad_norm": 2.1217386722564697, + "learning_rate": 4.123744389826886e-05, + "loss": 0.6759, + "step": 830 + }, + { + "epoch": 0.5315423362916767, + "grad_norm": 2.4662091732025146, + "learning_rate": 4.122675785424236e-05, + 
"loss": 0.8088, + "step": 831 + }, + { + "epoch": 0.5321819780922683, + "grad_norm": 2.3242239952087402, + "learning_rate": 4.121607181021586e-05, + "loss": 0.8301, + "step": 832 + }, + { + "epoch": 0.53282161989286, + "grad_norm": 2.286344528198242, + "learning_rate": 4.120538576618936e-05, + "loss": 0.7303, + "step": 833 + }, + { + "epoch": 0.5334612616934516, + "grad_norm": 2.096362829208374, + "learning_rate": 4.119469972216285e-05, + "loss": 0.686, + "step": 834 + }, + { + "epoch": 0.5341009034940434, + "grad_norm": 2.350806951522827, + "learning_rate": 4.118401367813636e-05, + "loss": 0.7674, + "step": 835 + }, + { + "epoch": 0.534740545294635, + "grad_norm": 2.3228611946105957, + "learning_rate": 4.117332763410985e-05, + "loss": 0.7578, + "step": 836 + }, + { + "epoch": 0.5353801870952267, + "grad_norm": 2.219318389892578, + "learning_rate": 4.116264159008335e-05, + "loss": 0.7058, + "step": 837 + }, + { + "epoch": 0.5360198288958183, + "grad_norm": 2.051234722137451, + "learning_rate": 4.115195554605685e-05, + "loss": 0.6298, + "step": 838 + }, + { + "epoch": 0.53665947069641, + "grad_norm": 2.0875167846679688, + "learning_rate": 4.114126950203035e-05, + "loss": 0.6579, + "step": 839 + }, + { + "epoch": 0.5372991124970017, + "grad_norm": 2.814260959625244, + "learning_rate": 4.113058345800385e-05, + "loss": 0.9552, + "step": 840 + }, + { + "epoch": 0.5379387542975933, + "grad_norm": 2.2160706520080566, + "learning_rate": 4.111989741397735e-05, + "loss": 0.6829, + "step": 841 + }, + { + "epoch": 0.538578396098185, + "grad_norm": 2.3203299045562744, + "learning_rate": 4.110921136995085e-05, + "loss": 0.7216, + "step": 842 + }, + { + "epoch": 0.5392180378987766, + "grad_norm": 2.5681164264678955, + "learning_rate": 4.109852532592435e-05, + "loss": 0.8983, + "step": 843 + }, + { + "epoch": 0.5398576796993684, + "grad_norm": 2.9693334102630615, + "learning_rate": 4.108783928189785e-05, + "loss": 0.9235, + "step": 844 + }, + { + "epoch": 0.54049732149996, + "grad_norm": 2.320012331008911, + "learning_rate": 4.107715323787134e-05, + "loss": 0.7369, + "step": 845 + }, + { + "epoch": 0.5411369633005517, + "grad_norm": 2.2545132637023926, + "learning_rate": 4.1066467193844846e-05, + "loss": 0.7332, + "step": 846 + }, + { + "epoch": 0.5417766051011433, + "grad_norm": 2.706484317779541, + "learning_rate": 4.105578114981834e-05, + "loss": 0.7563, + "step": 847 + }, + { + "epoch": 0.5424162469017351, + "grad_norm": 2.2728209495544434, + "learning_rate": 4.104509510579184e-05, + "loss": 0.7921, + "step": 848 + }, + { + "epoch": 0.5430558887023267, + "grad_norm": 2.2434425354003906, + "learning_rate": 4.103440906176534e-05, + "loss": 0.7594, + "step": 849 + }, + { + "epoch": 0.5436955305029184, + "grad_norm": 2.4385409355163574, + "learning_rate": 4.1023723017738836e-05, + "loss": 0.7746, + "step": 850 + }, + { + "epoch": 0.54433517230351, + "grad_norm": 2.4708523750305176, + "learning_rate": 4.101303697371233e-05, + "loss": 0.771, + "step": 851 + }, + { + "epoch": 0.5449748141041018, + "grad_norm": 2.6323025226593018, + "learning_rate": 4.1002350929685835e-05, + "loss": 0.8149, + "step": 852 + }, + { + "epoch": 0.5456144559046934, + "grad_norm": 2.2841379642486572, + "learning_rate": 4.0991664885659334e-05, + "loss": 0.7075, + "step": 853 + }, + { + "epoch": 0.546254097705285, + "grad_norm": 2.273214101791382, + "learning_rate": 4.098097884163283e-05, + "loss": 0.7082, + "step": 854 + }, + { + "epoch": 0.5468937395058767, + "grad_norm": 2.3645529747009277, + "learning_rate": 
4.097029279760633e-05, + "loss": 0.7747, + "step": 855 + }, + { + "epoch": 0.5475333813064683, + "grad_norm": 2.2649810314178467, + "learning_rate": 4.0959606753579826e-05, + "loss": 0.8063, + "step": 856 + }, + { + "epoch": 0.5481730231070601, + "grad_norm": 2.247239112854004, + "learning_rate": 4.0948920709553325e-05, + "loss": 0.6845, + "step": 857 + }, + { + "epoch": 0.5488126649076517, + "grad_norm": 2.3738226890563965, + "learning_rate": 4.0938234665526824e-05, + "loss": 0.8023, + "step": 858 + }, + { + "epoch": 0.5494523067082434, + "grad_norm": 2.4073843955993652, + "learning_rate": 4.0927548621500324e-05, + "loss": 0.7924, + "step": 859 + }, + { + "epoch": 0.550091948508835, + "grad_norm": 2.217334270477295, + "learning_rate": 4.0916862577473816e-05, + "loss": 0.7423, + "step": 860 + }, + { + "epoch": 0.5507315903094268, + "grad_norm": 2.7021453380584717, + "learning_rate": 4.090617653344732e-05, + "loss": 0.8054, + "step": 861 + }, + { + "epoch": 0.5513712321100184, + "grad_norm": 2.407155752182007, + "learning_rate": 4.0895490489420815e-05, + "loss": 0.7896, + "step": 862 + }, + { + "epoch": 0.5520108739106101, + "grad_norm": 2.5113189220428467, + "learning_rate": 4.0884804445394314e-05, + "loss": 0.7933, + "step": 863 + }, + { + "epoch": 0.5526505157112017, + "grad_norm": 2.4268243312835693, + "learning_rate": 4.0874118401367814e-05, + "loss": 0.8049, + "step": 864 + }, + { + "epoch": 0.5532901575117934, + "grad_norm": 2.3329579830169678, + "learning_rate": 4.086343235734131e-05, + "loss": 0.7039, + "step": 865 + }, + { + "epoch": 0.5539297993123851, + "grad_norm": 2.568941831588745, + "learning_rate": 4.085274631331481e-05, + "loss": 0.8756, + "step": 866 + }, + { + "epoch": 0.5545694411129767, + "grad_norm": 2.2570316791534424, + "learning_rate": 4.084206026928831e-05, + "loss": 0.7051, + "step": 867 + }, + { + "epoch": 0.5552090829135684, + "grad_norm": 2.3210926055908203, + "learning_rate": 4.083137422526181e-05, + "loss": 0.6965, + "step": 868 + }, + { + "epoch": 0.55584872471416, + "grad_norm": 2.353562116622925, + "learning_rate": 4.0820688181235304e-05, + "loss": 0.7586, + "step": 869 + }, + { + "epoch": 0.5564883665147518, + "grad_norm": 1.8056342601776123, + "learning_rate": 4.081000213720881e-05, + "loss": 0.6007, + "step": 870 + }, + { + "epoch": 0.5571280083153434, + "grad_norm": 2.4414591789245605, + "learning_rate": 4.07993160931823e-05, + "loss": 0.7651, + "step": 871 + }, + { + "epoch": 0.5577676501159351, + "grad_norm": 2.388673782348633, + "learning_rate": 4.078863004915581e-05, + "loss": 0.8275, + "step": 872 + }, + { + "epoch": 0.5584072919165267, + "grad_norm": 2.5591752529144287, + "learning_rate": 4.07779440051293e-05, + "loss": 0.8951, + "step": 873 + }, + { + "epoch": 0.5590469337171184, + "grad_norm": 2.5922563076019287, + "learning_rate": 4.07672579611028e-05, + "loss": 0.8393, + "step": 874 + }, + { + "epoch": 0.5596865755177101, + "grad_norm": 2.4653162956237793, + "learning_rate": 4.07565719170763e-05, + "loss": 0.7547, + "step": 875 + }, + { + "epoch": 0.5603262173183018, + "grad_norm": 2.1400020122528076, + "learning_rate": 4.07458858730498e-05, + "loss": 0.6497, + "step": 876 + }, + { + "epoch": 0.5609658591188934, + "grad_norm": 2.6126105785369873, + "learning_rate": 4.07351998290233e-05, + "loss": 0.8739, + "step": 877 + }, + { + "epoch": 0.5616055009194851, + "grad_norm": 2.373138904571533, + "learning_rate": 4.07245137849968e-05, + "loss": 0.8179, + "step": 878 + }, + { + "epoch": 0.5622451427200768, + "grad_norm": 2.454766273498535, + 
"learning_rate": 4.07138277409703e-05, + "loss": 0.8469, + "step": 879 + }, + { + "epoch": 0.5628847845206685, + "grad_norm": 2.2886855602264404, + "learning_rate": 4.070314169694379e-05, + "loss": 0.7826, + "step": 880 + }, + { + "epoch": 0.5635244263212601, + "grad_norm": 2.350181818008423, + "learning_rate": 4.0692455652917296e-05, + "loss": 0.8548, + "step": 881 + }, + { + "epoch": 0.5641640681218517, + "grad_norm": 2.328355073928833, + "learning_rate": 4.068176960889079e-05, + "loss": 0.7654, + "step": 882 + }, + { + "epoch": 0.5648037099224434, + "grad_norm": 2.591658592224121, + "learning_rate": 4.067108356486429e-05, + "loss": 0.8854, + "step": 883 + }, + { + "epoch": 0.5654433517230351, + "grad_norm": 2.2084527015686035, + "learning_rate": 4.066039752083779e-05, + "loss": 0.6809, + "step": 884 + }, + { + "epoch": 0.5660829935236268, + "grad_norm": 2.2749183177948, + "learning_rate": 4.064971147681129e-05, + "loss": 0.7978, + "step": 885 + }, + { + "epoch": 0.5667226353242184, + "grad_norm": 2.679457902908325, + "learning_rate": 4.0639025432784786e-05, + "loss": 0.9257, + "step": 886 + }, + { + "epoch": 0.5673622771248101, + "grad_norm": 2.0867230892181396, + "learning_rate": 4.0628339388758286e-05, + "loss": 0.7228, + "step": 887 + }, + { + "epoch": 0.5680019189254017, + "grad_norm": 2.5408248901367188, + "learning_rate": 4.0617653344731785e-05, + "loss": 0.8027, + "step": 888 + }, + { + "epoch": 0.5686415607259935, + "grad_norm": 2.151369571685791, + "learning_rate": 4.060696730070528e-05, + "loss": 0.7427, + "step": 889 + }, + { + "epoch": 0.5692812025265851, + "grad_norm": 2.505272150039673, + "learning_rate": 4.0596281256678784e-05, + "loss": 0.6939, + "step": 890 + }, + { + "epoch": 0.5699208443271768, + "grad_norm": 2.562251567840576, + "learning_rate": 4.0585595212652276e-05, + "loss": 0.766, + "step": 891 + }, + { + "epoch": 0.5705604861277684, + "grad_norm": 2.436009645462036, + "learning_rate": 4.0574909168625776e-05, + "loss": 0.7833, + "step": 892 + }, + { + "epoch": 0.5712001279283602, + "grad_norm": 2.7431294918060303, + "learning_rate": 4.0564223124599275e-05, + "loss": 0.7327, + "step": 893 + }, + { + "epoch": 0.5718397697289518, + "grad_norm": 2.437372922897339, + "learning_rate": 4.0553537080572774e-05, + "loss": 0.7623, + "step": 894 + }, + { + "epoch": 0.5724794115295434, + "grad_norm": 2.344759225845337, + "learning_rate": 4.054285103654627e-05, + "loss": 0.7997, + "step": 895 + }, + { + "epoch": 0.5731190533301351, + "grad_norm": 2.7062759399414062, + "learning_rate": 4.053216499251977e-05, + "loss": 0.848, + "step": 896 + }, + { + "epoch": 0.5737586951307267, + "grad_norm": 2.716770887374878, + "learning_rate": 4.0521478948493266e-05, + "loss": 0.8653, + "step": 897 + }, + { + "epoch": 0.5743983369313185, + "grad_norm": 2.593524694442749, + "learning_rate": 4.051079290446677e-05, + "loss": 0.8759, + "step": 898 + }, + { + "epoch": 0.5750379787319101, + "grad_norm": 2.676051616668701, + "learning_rate": 4.050010686044027e-05, + "loss": 0.8395, + "step": 899 + }, + { + "epoch": 0.5756776205325018, + "grad_norm": 2.438495635986328, + "learning_rate": 4.0489420816413764e-05, + "loss": 0.811, + "step": 900 + }, + { + "epoch": 0.5763172623330934, + "grad_norm": 2.6083335876464844, + "learning_rate": 4.047873477238727e-05, + "loss": 0.8095, + "step": 901 + }, + { + "epoch": 0.5769569041336852, + "grad_norm": 2.230010509490967, + "learning_rate": 4.046804872836076e-05, + "loss": 0.7149, + "step": 902 + }, + { + "epoch": 0.5775965459342768, + "grad_norm": 
2.290432929992676, + "learning_rate": 4.045736268433426e-05, + "loss": 0.7345, + "step": 903 + }, + { + "epoch": 0.5782361877348685, + "grad_norm": 2.2592225074768066, + "learning_rate": 4.044667664030776e-05, + "loss": 0.6788, + "step": 904 + }, + { + "epoch": 0.5788758295354601, + "grad_norm": 2.3218696117401123, + "learning_rate": 4.043599059628126e-05, + "loss": 0.7009, + "step": 905 + }, + { + "epoch": 0.5795154713360519, + "grad_norm": 1.7487784624099731, + "learning_rate": 4.042530455225475e-05, + "loss": 0.6401, + "step": 906 + }, + { + "epoch": 0.5801551131366435, + "grad_norm": 2.319746255874634, + "learning_rate": 4.041461850822826e-05, + "loss": 0.7132, + "step": 907 + }, + { + "epoch": 0.5807947549372352, + "grad_norm": 2.304169178009033, + "learning_rate": 4.040393246420175e-05, + "loss": 0.7472, + "step": 908 + }, + { + "epoch": 0.5814343967378268, + "grad_norm": 2.371488332748413, + "learning_rate": 4.039324642017525e-05, + "loss": 0.7464, + "step": 909 + }, + { + "epoch": 0.5820740385384184, + "grad_norm": 2.3133323192596436, + "learning_rate": 4.038256037614875e-05, + "loss": 0.6964, + "step": 910 + }, + { + "epoch": 0.5827136803390102, + "grad_norm": 2.2671494483947754, + "learning_rate": 4.037187433212225e-05, + "loss": 0.7227, + "step": 911 + }, + { + "epoch": 0.5833533221396018, + "grad_norm": 2.6780896186828613, + "learning_rate": 4.036118828809575e-05, + "loss": 0.7842, + "step": 912 + }, + { + "epoch": 0.5839929639401935, + "grad_norm": 2.6332082748413086, + "learning_rate": 4.035050224406925e-05, + "loss": 0.8686, + "step": 913 + }, + { + "epoch": 0.5846326057407851, + "grad_norm": 2.6990888118743896, + "learning_rate": 4.033981620004275e-05, + "loss": 0.8382, + "step": 914 + }, + { + "epoch": 0.5852722475413769, + "grad_norm": 2.5040202140808105, + "learning_rate": 4.032913015601624e-05, + "loss": 0.7601, + "step": 915 + }, + { + "epoch": 0.5859118893419685, + "grad_norm": 2.675201654434204, + "learning_rate": 4.031844411198975e-05, + "loss": 0.8651, + "step": 916 + }, + { + "epoch": 0.5865515311425602, + "grad_norm": 2.1978743076324463, + "learning_rate": 4.030775806796324e-05, + "loss": 0.6559, + "step": 917 + }, + { + "epoch": 0.5871911729431518, + "grad_norm": 2.3096039295196533, + "learning_rate": 4.029707202393674e-05, + "loss": 0.7137, + "step": 918 + }, + { + "epoch": 0.5878308147437435, + "grad_norm": 2.233640670776367, + "learning_rate": 4.028638597991024e-05, + "loss": 0.7059, + "step": 919 + }, + { + "epoch": 0.5884704565443352, + "grad_norm": 2.2236380577087402, + "learning_rate": 4.027569993588374e-05, + "loss": 0.6919, + "step": 920 + }, + { + "epoch": 0.5891100983449269, + "grad_norm": 2.382692337036133, + "learning_rate": 4.026501389185724e-05, + "loss": 0.7835, + "step": 921 + }, + { + "epoch": 0.5897497401455185, + "grad_norm": 2.4091837406158447, + "learning_rate": 4.0254327847830736e-05, + "loss": 0.7366, + "step": 922 + }, + { + "epoch": 0.5903893819461101, + "grad_norm": 2.0652902126312256, + "learning_rate": 4.0243641803804235e-05, + "loss": 0.6362, + "step": 923 + }, + { + "epoch": 0.5910290237467019, + "grad_norm": 2.447798728942871, + "learning_rate": 4.023295575977773e-05, + "loss": 0.7498, + "step": 924 + }, + { + "epoch": 0.5916686655472935, + "grad_norm": 2.377028226852417, + "learning_rate": 4.0222269715751234e-05, + "loss": 0.7927, + "step": 925 + }, + { + "epoch": 0.5923083073478852, + "grad_norm": 1.9503337144851685, + "learning_rate": 4.021158367172473e-05, + "loss": 0.6526, + "step": 926 + }, + { + "epoch": 0.5929479491484768, 
+ "grad_norm": 2.4022982120513916, + "learning_rate": 4.020089762769823e-05, + "loss": 0.8673, + "step": 927 + }, + { + "epoch": 0.5935875909490685, + "grad_norm": 2.441439628601074, + "learning_rate": 4.0190211583671725e-05, + "loss": 0.8123, + "step": 928 + }, + { + "epoch": 0.5942272327496602, + "grad_norm": 2.3275411128997803, + "learning_rate": 4.0179525539645225e-05, + "loss": 0.7722, + "step": 929 + }, + { + "epoch": 0.5948668745502519, + "grad_norm": 2.7536025047302246, + "learning_rate": 4.0168839495618724e-05, + "loss": 0.8429, + "step": 930 + }, + { + "epoch": 0.5955065163508435, + "grad_norm": 2.31563138961792, + "learning_rate": 4.0158153451592224e-05, + "loss": 0.7979, + "step": 931 + }, + { + "epoch": 0.5961461581514352, + "grad_norm": 2.359586715698242, + "learning_rate": 4.014746740756572e-05, + "loss": 0.7203, + "step": 932 + }, + { + "epoch": 0.5967857999520269, + "grad_norm": 2.407341241836548, + "learning_rate": 4.013678136353922e-05, + "loss": 0.804, + "step": 933 + }, + { + "epoch": 0.5974254417526186, + "grad_norm": 1.792413592338562, + "learning_rate": 4.012609531951272e-05, + "loss": 0.5876, + "step": 934 + }, + { + "epoch": 0.5980650835532102, + "grad_norm": 2.349578857421875, + "learning_rate": 4.0115409275486214e-05, + "loss": 0.8117, + "step": 935 + }, + { + "epoch": 0.5987047253538019, + "grad_norm": 2.211759090423584, + "learning_rate": 4.010472323145972e-05, + "loss": 0.7203, + "step": 936 + }, + { + "epoch": 0.5993443671543935, + "grad_norm": 2.309387445449829, + "learning_rate": 4.009403718743321e-05, + "loss": 0.756, + "step": 937 + }, + { + "epoch": 0.5999840089549852, + "grad_norm": 2.1751136779785156, + "learning_rate": 4.008335114340671e-05, + "loss": 0.6437, + "step": 938 + }, + { + "epoch": 0.6006236507555769, + "grad_norm": 2.28783917427063, + "learning_rate": 4.007266509938021e-05, + "loss": 0.6768, + "step": 939 + }, + { + "epoch": 0.6012632925561685, + "grad_norm": 2.1996519565582275, + "learning_rate": 4.006197905535371e-05, + "loss": 0.6821, + "step": 940 + }, + { + "epoch": 0.6019029343567602, + "grad_norm": 2.0751750469207764, + "learning_rate": 4.0051293011327204e-05, + "loss": 0.6784, + "step": 941 + }, + { + "epoch": 0.6025425761573519, + "grad_norm": 2.0684049129486084, + "learning_rate": 4.004060696730071e-05, + "loss": 0.6572, + "step": 942 + }, + { + "epoch": 0.6031822179579436, + "grad_norm": 2.560596227645874, + "learning_rate": 4.00299209232742e-05, + "loss": 0.7658, + "step": 943 + }, + { + "epoch": 0.6038218597585352, + "grad_norm": 2.3507702350616455, + "learning_rate": 4.00192348792477e-05, + "loss": 0.8368, + "step": 944 + }, + { + "epoch": 0.6044615015591269, + "grad_norm": 2.741117000579834, + "learning_rate": 4.000854883522121e-05, + "loss": 0.6982, + "step": 945 + }, + { + "epoch": 0.6051011433597185, + "grad_norm": 2.4322903156280518, + "learning_rate": 3.99978627911947e-05, + "loss": 0.7642, + "step": 946 + }, + { + "epoch": 0.6057407851603103, + "grad_norm": 2.6147563457489014, + "learning_rate": 3.99871767471682e-05, + "loss": 0.6749, + "step": 947 + }, + { + "epoch": 0.6063804269609019, + "grad_norm": 2.37351655960083, + "learning_rate": 3.99764907031417e-05, + "loss": 0.7726, + "step": 948 + }, + { + "epoch": 0.6070200687614936, + "grad_norm": 2.1228601932525635, + "learning_rate": 3.99658046591152e-05, + "loss": 0.6969, + "step": 949 + }, + { + "epoch": 0.6076597105620852, + "grad_norm": 2.356920003890991, + "learning_rate": 3.995511861508869e-05, + "loss": 0.8022, + "step": 950 + }, + { + "epoch": 
0.6082993523626768, + "grad_norm": 2.4392523765563965, + "learning_rate": 3.99444325710622e-05, + "loss": 0.804, + "step": 951 + }, + { + "epoch": 0.6089389941632686, + "grad_norm": 2.4725987911224365, + "learning_rate": 3.993374652703569e-05, + "loss": 0.7574, + "step": 952 + }, + { + "epoch": 0.6095786359638602, + "grad_norm": 2.684683084487915, + "learning_rate": 3.992306048300919e-05, + "loss": 0.7596, + "step": 953 + }, + { + "epoch": 0.6102182777644519, + "grad_norm": 2.5766384601593018, + "learning_rate": 3.991237443898269e-05, + "loss": 0.8009, + "step": 954 + }, + { + "epoch": 0.6108579195650435, + "grad_norm": 2.322838544845581, + "learning_rate": 3.990168839495619e-05, + "loss": 0.722, + "step": 955 + }, + { + "epoch": 0.6114975613656353, + "grad_norm": 2.2413270473480225, + "learning_rate": 3.989100235092969e-05, + "loss": 0.7197, + "step": 956 + }, + { + "epoch": 0.6121372031662269, + "grad_norm": 2.4533560276031494, + "learning_rate": 3.988031630690319e-05, + "loss": 0.8411, + "step": 957 + }, + { + "epoch": 0.6127768449668186, + "grad_norm": 1.9486346244812012, + "learning_rate": 3.9869630262876686e-05, + "loss": 0.613, + "step": 958 + }, + { + "epoch": 0.6134164867674102, + "grad_norm": 2.55692458152771, + "learning_rate": 3.9858944218850185e-05, + "loss": 0.7848, + "step": 959 + }, + { + "epoch": 0.614056128568002, + "grad_norm": 2.433396816253662, + "learning_rate": 3.9848258174823685e-05, + "loss": 0.7827, + "step": 960 + }, + { + "epoch": 0.6146957703685936, + "grad_norm": 2.615607738494873, + "learning_rate": 3.983757213079718e-05, + "loss": 0.7819, + "step": 961 + }, + { + "epoch": 0.6153354121691853, + "grad_norm": 2.2539865970611572, + "learning_rate": 3.9826886086770683e-05, + "loss": 0.6861, + "step": 962 + }, + { + "epoch": 0.6159750539697769, + "grad_norm": 2.2904000282287598, + "learning_rate": 3.9816200042744176e-05, + "loss": 0.7327, + "step": 963 + }, + { + "epoch": 0.6166146957703686, + "grad_norm": 2.159097194671631, + "learning_rate": 3.9805513998717675e-05, + "loss": 0.6517, + "step": 964 + }, + { + "epoch": 0.6172543375709603, + "grad_norm": 2.511418342590332, + "learning_rate": 3.9794827954691175e-05, + "loss": 0.8489, + "step": 965 + }, + { + "epoch": 0.6178939793715519, + "grad_norm": 2.5162479877471924, + "learning_rate": 3.9784141910664674e-05, + "loss": 0.8049, + "step": 966 + }, + { + "epoch": 0.6185336211721436, + "grad_norm": 2.2590129375457764, + "learning_rate": 3.9773455866638173e-05, + "loss": 0.703, + "step": 967 + }, + { + "epoch": 0.6191732629727352, + "grad_norm": 2.4057836532592773, + "learning_rate": 3.976276982261167e-05, + "loss": 0.794, + "step": 968 + }, + { + "epoch": 0.619812904773327, + "grad_norm": 2.1555848121643066, + "learning_rate": 3.975208377858517e-05, + "loss": 0.6879, + "step": 969 + }, + { + "epoch": 0.6204525465739186, + "grad_norm": 3.082021713256836, + "learning_rate": 3.9741397734558665e-05, + "loss": 0.8459, + "step": 970 + }, + { + "epoch": 0.6210921883745103, + "grad_norm": 2.719717025756836, + "learning_rate": 3.973071169053217e-05, + "loss": 0.7577, + "step": 971 + }, + { + "epoch": 0.6217318301751019, + "grad_norm": 2.287407398223877, + "learning_rate": 3.9720025646505664e-05, + "loss": 0.7307, + "step": 972 + }, + { + "epoch": 0.6223714719756936, + "grad_norm": 1.997767448425293, + "learning_rate": 3.970933960247916e-05, + "loss": 0.6546, + "step": 973 + }, + { + "epoch": 0.6230111137762853, + "grad_norm": 2.116257905960083, + "learning_rate": 3.969865355845266e-05, + "loss": 0.7164, + "step": 974 + }, + { 
+ "epoch": 0.623650755576877, + "grad_norm": 2.9210846424102783, + "learning_rate": 3.968796751442616e-05, + "loss": 0.8839, + "step": 975 + }, + { + "epoch": 0.6242903973774686, + "grad_norm": 2.0119614601135254, + "learning_rate": 3.967728147039966e-05, + "loss": 0.6569, + "step": 976 + }, + { + "epoch": 0.6249300391780603, + "grad_norm": 2.344700813293457, + "learning_rate": 3.966659542637316e-05, + "loss": 0.7608, + "step": 977 + }, + { + "epoch": 0.625569680978652, + "grad_norm": 2.141934394836426, + "learning_rate": 3.965590938234666e-05, + "loss": 0.6702, + "step": 978 + }, + { + "epoch": 0.6262093227792436, + "grad_norm": 2.2295076847076416, + "learning_rate": 3.964522333832015e-05, + "loss": 0.7682, + "step": 979 + }, + { + "epoch": 0.6268489645798353, + "grad_norm": 2.2250473499298096, + "learning_rate": 3.963453729429366e-05, + "loss": 0.7693, + "step": 980 + }, + { + "epoch": 0.6274886063804269, + "grad_norm": 2.484199047088623, + "learning_rate": 3.962385125026715e-05, + "loss": 0.7112, + "step": 981 + }, + { + "epoch": 0.6281282481810186, + "grad_norm": 2.427258014678955, + "learning_rate": 3.961316520624065e-05, + "loss": 0.7067, + "step": 982 + }, + { + "epoch": 0.6287678899816103, + "grad_norm": 2.7425003051757812, + "learning_rate": 3.960247916221415e-05, + "loss": 0.8382, + "step": 983 + }, + { + "epoch": 0.629407531782202, + "grad_norm": 2.24448299407959, + "learning_rate": 3.959179311818765e-05, + "loss": 0.7345, + "step": 984 + }, + { + "epoch": 0.6300471735827936, + "grad_norm": 2.4819138050079346, + "learning_rate": 3.958110707416115e-05, + "loss": 0.7582, + "step": 985 + }, + { + "epoch": 0.6306868153833853, + "grad_norm": 2.452561855316162, + "learning_rate": 3.957042103013465e-05, + "loss": 0.7174, + "step": 986 + }, + { + "epoch": 0.631326457183977, + "grad_norm": 2.4645004272460938, + "learning_rate": 3.955973498610814e-05, + "loss": 0.7948, + "step": 987 + }, + { + "epoch": 0.6319660989845687, + "grad_norm": 2.4163429737091064, + "learning_rate": 3.9549048942081647e-05, + "loss": 0.8258, + "step": 988 + }, + { + "epoch": 0.6326057407851603, + "grad_norm": 2.1647913455963135, + "learning_rate": 3.953836289805514e-05, + "loss": 0.6836, + "step": 989 + }, + { + "epoch": 0.633245382585752, + "grad_norm": 2.318499803543091, + "learning_rate": 3.952767685402864e-05, + "loss": 0.7331, + "step": 990 + }, + { + "epoch": 0.6338850243863436, + "grad_norm": 2.34328031539917, + "learning_rate": 3.9516990810002145e-05, + "loss": 0.7087, + "step": 991 + }, + { + "epoch": 0.6345246661869354, + "grad_norm": 2.3378255367279053, + "learning_rate": 3.950630476597564e-05, + "loss": 0.7267, + "step": 992 + }, + { + "epoch": 0.635164307987527, + "grad_norm": 2.475339889526367, + "learning_rate": 3.9495618721949137e-05, + "loss": 0.7421, + "step": 993 + }, + { + "epoch": 0.6358039497881186, + "grad_norm": 2.289686918258667, + "learning_rate": 3.9484932677922636e-05, + "loss": 0.8081, + "step": 994 + }, + { + "epoch": 0.6364435915887103, + "grad_norm": 2.2486777305603027, + "learning_rate": 3.9474246633896135e-05, + "loss": 0.7988, + "step": 995 + }, + { + "epoch": 0.637083233389302, + "grad_norm": 2.104947805404663, + "learning_rate": 3.946356058986963e-05, + "loss": 0.719, + "step": 996 + }, + { + "epoch": 0.6377228751898937, + "grad_norm": 2.2510128021240234, + "learning_rate": 3.9452874545843134e-05, + "loss": 0.7524, + "step": 997 + }, + { + "epoch": 0.6383625169904853, + "grad_norm": 2.2770988941192627, + "learning_rate": 3.9442188501816627e-05, + "loss": 0.7286, + "step": 998 + 
}, + { + "epoch": 0.639002158791077, + "grad_norm": 2.0234215259552, + "learning_rate": 3.9431502457790126e-05, + "loss": 0.646, + "step": 999 + }, + { + "epoch": 0.6396418005916686, + "grad_norm": 2.0593552589416504, + "learning_rate": 3.9420816413763625e-05, + "loss": 0.6898, + "step": 1000 + }, + { + "epoch": 0.6402814423922604, + "grad_norm": 2.0624172687530518, + "learning_rate": 3.9410130369737125e-05, + "loss": 0.5996, + "step": 1001 + }, + { + "epoch": 0.640921084192852, + "grad_norm": 2.4475672245025635, + "learning_rate": 3.9399444325710624e-05, + "loss": 0.7919, + "step": 1002 + }, + { + "epoch": 0.6415607259934437, + "grad_norm": 2.5255558490753174, + "learning_rate": 3.9388758281684123e-05, + "loss": 0.8562, + "step": 1003 + }, + { + "epoch": 0.6422003677940353, + "grad_norm": 2.2773284912109375, + "learning_rate": 3.937807223765762e-05, + "loss": 0.7647, + "step": 1004 + }, + { + "epoch": 0.6428400095946271, + "grad_norm": 2.457221269607544, + "learning_rate": 3.9367386193631115e-05, + "loss": 0.7382, + "step": 1005 + }, + { + "epoch": 0.6434796513952187, + "grad_norm": 2.2238290309906006, + "learning_rate": 3.935670014960462e-05, + "loss": 0.6545, + "step": 1006 + }, + { + "epoch": 0.6441192931958103, + "grad_norm": 2.4462924003601074, + "learning_rate": 3.9346014105578114e-05, + "loss": 0.8379, + "step": 1007 + }, + { + "epoch": 0.644758934996402, + "grad_norm": 2.400723695755005, + "learning_rate": 3.9335328061551613e-05, + "loss": 0.748, + "step": 1008 + }, + { + "epoch": 0.6453985767969936, + "grad_norm": 2.4213650226593018, + "learning_rate": 3.932464201752511e-05, + "loss": 0.7917, + "step": 1009 + }, + { + "epoch": 0.6460382185975854, + "grad_norm": 2.2447752952575684, + "learning_rate": 3.931395597349861e-05, + "loss": 0.6654, + "step": 1010 + }, + { + "epoch": 0.646677860398177, + "grad_norm": 2.2563722133636475, + "learning_rate": 3.930326992947211e-05, + "loss": 0.7058, + "step": 1011 + }, + { + "epoch": 0.6473175021987687, + "grad_norm": 2.510535955429077, + "learning_rate": 3.929258388544561e-05, + "loss": 0.9369, + "step": 1012 + }, + { + "epoch": 0.6479571439993603, + "grad_norm": 2.573002338409424, + "learning_rate": 3.928189784141911e-05, + "loss": 0.8924, + "step": 1013 + }, + { + "epoch": 0.6485967857999521, + "grad_norm": 2.2554931640625, + "learning_rate": 3.927121179739261e-05, + "loss": 0.6641, + "step": 1014 + }, + { + "epoch": 0.6492364276005437, + "grad_norm": 2.2975242137908936, + "learning_rate": 3.926052575336611e-05, + "loss": 0.7967, + "step": 1015 + }, + { + "epoch": 0.6498760694011354, + "grad_norm": 2.3706226348876953, + "learning_rate": 3.92498397093396e-05, + "loss": 0.7115, + "step": 1016 + }, + { + "epoch": 0.650515711201727, + "grad_norm": 1.9498761892318726, + "learning_rate": 3.923915366531311e-05, + "loss": 0.6575, + "step": 1017 + }, + { + "epoch": 0.6511553530023187, + "grad_norm": 2.1991360187530518, + "learning_rate": 3.92284676212866e-05, + "loss": 0.7199, + "step": 1018 + }, + { + "epoch": 0.6517949948029104, + "grad_norm": 2.3318216800689697, + "learning_rate": 3.92177815772601e-05, + "loss": 0.7546, + "step": 1019 + }, + { + "epoch": 0.6524346366035021, + "grad_norm": 2.1762449741363525, + "learning_rate": 3.92070955332336e-05, + "loss": 0.7239, + "step": 1020 + }, + { + "epoch": 0.6530742784040937, + "grad_norm": 2.4617691040039062, + "learning_rate": 3.91964094892071e-05, + "loss": 0.8038, + "step": 1021 + }, + { + "epoch": 0.6537139202046853, + "grad_norm": 2.2549099922180176, + "learning_rate": 3.91857234451806e-05, + 
"loss": 0.7112, + "step": 1022 + }, + { + "epoch": 0.6543535620052771, + "grad_norm": 2.4799864292144775, + "learning_rate": 3.91750374011541e-05, + "loss": 0.7606, + "step": 1023 + }, + { + "epoch": 0.6549932038058687, + "grad_norm": 2.43270206451416, + "learning_rate": 3.9164351357127596e-05, + "loss": 0.7288, + "step": 1024 + }, + { + "epoch": 0.6556328456064604, + "grad_norm": 2.5053627490997314, + "learning_rate": 3.915366531310109e-05, + "loss": 0.7419, + "step": 1025 + }, + { + "epoch": 0.656272487407052, + "grad_norm": 2.3089120388031006, + "learning_rate": 3.9142979269074595e-05, + "loss": 0.7503, + "step": 1026 + }, + { + "epoch": 0.6569121292076437, + "grad_norm": 2.567331552505493, + "learning_rate": 3.913229322504809e-05, + "loss": 0.7968, + "step": 1027 + }, + { + "epoch": 0.6575517710082354, + "grad_norm": 2.1532084941864014, + "learning_rate": 3.912160718102159e-05, + "loss": 0.6958, + "step": 1028 + }, + { + "epoch": 0.6581914128088271, + "grad_norm": 2.290431499481201, + "learning_rate": 3.9110921136995086e-05, + "loss": 0.7493, + "step": 1029 + }, + { + "epoch": 0.6588310546094187, + "grad_norm": 2.2117385864257812, + "learning_rate": 3.9100235092968586e-05, + "loss": 0.6606, + "step": 1030 + }, + { + "epoch": 0.6594706964100104, + "grad_norm": 2.5073018074035645, + "learning_rate": 3.908954904894208e-05, + "loss": 0.8315, + "step": 1031 + }, + { + "epoch": 0.660110338210602, + "grad_norm": 2.0758044719696045, + "learning_rate": 3.9078863004915585e-05, + "loss": 0.6932, + "step": 1032 + }, + { + "epoch": 0.6607499800111938, + "grad_norm": 2.434968948364258, + "learning_rate": 3.906817696088908e-05, + "loss": 0.8139, + "step": 1033 + }, + { + "epoch": 0.6613896218117854, + "grad_norm": 2.257331371307373, + "learning_rate": 3.9057490916862577e-05, + "loss": 0.741, + "step": 1034 + }, + { + "epoch": 0.662029263612377, + "grad_norm": 2.3283777236938477, + "learning_rate": 3.9046804872836076e-05, + "loss": 0.7625, + "step": 1035 + }, + { + "epoch": 0.6626689054129687, + "grad_norm": 2.2232840061187744, + "learning_rate": 3.9036118828809575e-05, + "loss": 0.7106, + "step": 1036 + }, + { + "epoch": 0.6633085472135604, + "grad_norm": 1.8899500370025635, + "learning_rate": 3.9025432784783075e-05, + "loss": 0.6227, + "step": 1037 + }, + { + "epoch": 0.6639481890141521, + "grad_norm": 2.3554022312164307, + "learning_rate": 3.9014746740756574e-05, + "loss": 0.8066, + "step": 1038 + }, + { + "epoch": 0.6645878308147437, + "grad_norm": 2.1485507488250732, + "learning_rate": 3.900406069673007e-05, + "loss": 0.7169, + "step": 1039 + }, + { + "epoch": 0.6652274726153354, + "grad_norm": 2.33443284034729, + "learning_rate": 3.899337465270357e-05, + "loss": 0.903, + "step": 1040 + }, + { + "epoch": 0.665867114415927, + "grad_norm": 2.198026180267334, + "learning_rate": 3.898268860867707e-05, + "loss": 0.8025, + "step": 1041 + }, + { + "epoch": 0.6665067562165188, + "grad_norm": 2.3127546310424805, + "learning_rate": 3.8972002564650565e-05, + "loss": 0.8436, + "step": 1042 + }, + { + "epoch": 0.6671463980171104, + "grad_norm": 2.124969244003296, + "learning_rate": 3.896131652062407e-05, + "loss": 0.6896, + "step": 1043 + }, + { + "epoch": 0.6677860398177021, + "grad_norm": 2.501819133758545, + "learning_rate": 3.895063047659756e-05, + "loss": 0.8593, + "step": 1044 + }, + { + "epoch": 0.6684256816182937, + "grad_norm": 2.178406238555908, + "learning_rate": 3.893994443257106e-05, + "loss": 0.7511, + "step": 1045 + }, + { + "epoch": 0.6690653234188855, + "grad_norm": 2.3316667079925537, + 
"learning_rate": 3.892925838854456e-05, + "loss": 0.7708, + "step": 1046 + }, + { + "epoch": 0.6697049652194771, + "grad_norm": 2.149301528930664, + "learning_rate": 3.891857234451806e-05, + "loss": 0.7585, + "step": 1047 + }, + { + "epoch": 0.6703446070200688, + "grad_norm": 2.2174315452575684, + "learning_rate": 3.890788630049156e-05, + "loss": 0.7536, + "step": 1048 + }, + { + "epoch": 0.6709842488206604, + "grad_norm": 2.4261577129364014, + "learning_rate": 3.889720025646506e-05, + "loss": 0.8253, + "step": 1049 + }, + { + "epoch": 0.671623890621252, + "grad_norm": 2.417027711868286, + "learning_rate": 3.888651421243856e-05, + "loss": 0.8187, + "step": 1050 + }, + { + "epoch": 0.6722635324218438, + "grad_norm": 2.6399002075195312, + "learning_rate": 3.887582816841205e-05, + "loss": 0.8556, + "step": 1051 + }, + { + "epoch": 0.6729031742224354, + "grad_norm": 2.147928476333618, + "learning_rate": 3.886514212438556e-05, + "loss": 0.7248, + "step": 1052 + }, + { + "epoch": 0.6735428160230271, + "grad_norm": 2.2155518531799316, + "learning_rate": 3.885445608035905e-05, + "loss": 0.7822, + "step": 1053 + }, + { + "epoch": 0.6741824578236187, + "grad_norm": 2.535156726837158, + "learning_rate": 3.884377003633255e-05, + "loss": 0.7741, + "step": 1054 + }, + { + "epoch": 0.6748220996242105, + "grad_norm": 2.3333871364593506, + "learning_rate": 3.883308399230605e-05, + "loss": 0.6934, + "step": 1055 + }, + { + "epoch": 0.6754617414248021, + "grad_norm": 2.1857028007507324, + "learning_rate": 3.882239794827955e-05, + "loss": 0.6746, + "step": 1056 + }, + { + "epoch": 0.6761013832253938, + "grad_norm": 2.3149261474609375, + "learning_rate": 3.881171190425305e-05, + "loss": 0.7206, + "step": 1057 + }, + { + "epoch": 0.6767410250259854, + "grad_norm": 2.512051820755005, + "learning_rate": 3.880102586022655e-05, + "loss": 0.7582, + "step": 1058 + }, + { + "epoch": 0.6773806668265772, + "grad_norm": 2.1747212409973145, + "learning_rate": 3.879033981620005e-05, + "loss": 0.6835, + "step": 1059 + }, + { + "epoch": 0.6780203086271688, + "grad_norm": 2.1035051345825195, + "learning_rate": 3.877965377217354e-05, + "loss": 0.6676, + "step": 1060 + }, + { + "epoch": 0.6786599504277605, + "grad_norm": 2.3119547367095947, + "learning_rate": 3.8768967728147046e-05, + "loss": 0.7497, + "step": 1061 + }, + { + "epoch": 0.6792995922283521, + "grad_norm": 2.011989116668701, + "learning_rate": 3.875828168412054e-05, + "loss": 0.6539, + "step": 1062 + }, + { + "epoch": 0.6799392340289437, + "grad_norm": 2.3933212757110596, + "learning_rate": 3.874759564009404e-05, + "loss": 0.7156, + "step": 1063 + }, + { + "epoch": 0.6805788758295355, + "grad_norm": 2.542637586593628, + "learning_rate": 3.873690959606754e-05, + "loss": 0.7975, + "step": 1064 + }, + { + "epoch": 0.6812185176301271, + "grad_norm": 2.5723378658294678, + "learning_rate": 3.8726223552041036e-05, + "loss": 0.8028, + "step": 1065 + }, + { + "epoch": 0.6818581594307188, + "grad_norm": 2.502739429473877, + "learning_rate": 3.871553750801453e-05, + "loss": 0.7489, + "step": 1066 + }, + { + "epoch": 0.6824978012313104, + "grad_norm": 2.404087781906128, + "learning_rate": 3.8704851463988035e-05, + "loss": 0.8516, + "step": 1067 + }, + { + "epoch": 0.6831374430319022, + "grad_norm": 2.5609021186828613, + "learning_rate": 3.869416541996153e-05, + "loss": 0.7856, + "step": 1068 + }, + { + "epoch": 0.6837770848324938, + "grad_norm": 2.3363285064697266, + "learning_rate": 3.8683479375935034e-05, + "loss": 0.7413, + "step": 1069 + }, + { + "epoch": 
0.6844167266330855, + "grad_norm": 1.9547827243804932, + "learning_rate": 3.867279333190853e-05, + "loss": 0.6662, + "step": 1070 + }, + { + "epoch": 0.6850563684336771, + "grad_norm": 2.200296640396118, + "learning_rate": 3.8662107287882026e-05, + "loss": 0.7274, + "step": 1071 + }, + { + "epoch": 0.6856960102342688, + "grad_norm": 2.2395646572113037, + "learning_rate": 3.865142124385553e-05, + "loss": 0.771, + "step": 1072 + }, + { + "epoch": 0.6863356520348605, + "grad_norm": 2.557474136352539, + "learning_rate": 3.8640735199829025e-05, + "loss": 0.9435, + "step": 1073 + }, + { + "epoch": 0.6869752938354522, + "grad_norm": 2.2866761684417725, + "learning_rate": 3.8630049155802524e-05, + "loss": 0.8536, + "step": 1074 + }, + { + "epoch": 0.6876149356360438, + "grad_norm": 2.2908213138580322, + "learning_rate": 3.861936311177602e-05, + "loss": 0.7573, + "step": 1075 + }, + { + "epoch": 0.6882545774366355, + "grad_norm": 2.2182838916778564, + "learning_rate": 3.860867706774952e-05, + "loss": 0.6515, + "step": 1076 + }, + { + "epoch": 0.6888942192372272, + "grad_norm": 2.248753309249878, + "learning_rate": 3.8597991023723015e-05, + "loss": 0.7935, + "step": 1077 + }, + { + "epoch": 0.6895338610378188, + "grad_norm": 2.6073074340820312, + "learning_rate": 3.858730497969652e-05, + "loss": 0.9092, + "step": 1078 + }, + { + "epoch": 0.6901735028384105, + "grad_norm": 2.2033393383026123, + "learning_rate": 3.8576618935670014e-05, + "loss": 0.685, + "step": 1079 + }, + { + "epoch": 0.6908131446390021, + "grad_norm": 2.3583688735961914, + "learning_rate": 3.856593289164351e-05, + "loss": 0.7056, + "step": 1080 + }, + { + "epoch": 0.6914527864395938, + "grad_norm": 2.2697761058807373, + "learning_rate": 3.855524684761701e-05, + "loss": 0.7562, + "step": 1081 + }, + { + "epoch": 0.6920924282401855, + "grad_norm": 2.031430244445801, + "learning_rate": 3.854456080359051e-05, + "loss": 0.6171, + "step": 1082 + }, + { + "epoch": 0.6927320700407772, + "grad_norm": 2.197084903717041, + "learning_rate": 3.853387475956401e-05, + "loss": 0.7065, + "step": 1083 + }, + { + "epoch": 0.6933717118413688, + "grad_norm": 2.3392534255981445, + "learning_rate": 3.852318871553751e-05, + "loss": 0.772, + "step": 1084 + }, + { + "epoch": 0.6940113536419605, + "grad_norm": 2.515352487564087, + "learning_rate": 3.851250267151101e-05, + "loss": 0.8118, + "step": 1085 + }, + { + "epoch": 0.6946509954425522, + "grad_norm": 2.1398885250091553, + "learning_rate": 3.85018166274845e-05, + "loss": 0.6057, + "step": 1086 + }, + { + "epoch": 0.6952906372431439, + "grad_norm": 2.4276764392852783, + "learning_rate": 3.849113058345801e-05, + "loss": 0.7836, + "step": 1087 + }, + { + "epoch": 0.6959302790437355, + "grad_norm": 2.5848066806793213, + "learning_rate": 3.84804445394315e-05, + "loss": 0.8149, + "step": 1088 + }, + { + "epoch": 0.6965699208443272, + "grad_norm": 2.8381292819976807, + "learning_rate": 3.8469758495405e-05, + "loss": 0.865, + "step": 1089 + }, + { + "epoch": 0.6972095626449188, + "grad_norm": 2.4076383113861084, + "learning_rate": 3.84590724513785e-05, + "loss": 0.7594, + "step": 1090 + }, + { + "epoch": 0.6978492044455105, + "grad_norm": 1.9375122785568237, + "learning_rate": 3.8448386407352e-05, + "loss": 0.6165, + "step": 1091 + }, + { + "epoch": 0.6984888462461022, + "grad_norm": 2.3143324851989746, + "learning_rate": 3.84377003633255e-05, + "loss": 0.8394, + "step": 1092 + }, + { + "epoch": 0.6991284880466938, + "grad_norm": 2.289259433746338, + "learning_rate": 3.8427014319299e-05, + "loss": 0.781, + 
"step": 1093 + }, + { + "epoch": 0.6997681298472855, + "grad_norm": 2.3176772594451904, + "learning_rate": 3.84163282752725e-05, + "loss": 0.7327, + "step": 1094 + }, + { + "epoch": 0.7004077716478772, + "grad_norm": 2.1678993701934814, + "learning_rate": 3.840564223124599e-05, + "loss": 0.7075, + "step": 1095 + }, + { + "epoch": 0.7010474134484689, + "grad_norm": 2.2163405418395996, + "learning_rate": 3.8394956187219496e-05, + "loss": 0.7503, + "step": 1096 + }, + { + "epoch": 0.7016870552490605, + "grad_norm": 2.2522881031036377, + "learning_rate": 3.838427014319299e-05, + "loss": 0.6734, + "step": 1097 + }, + { + "epoch": 0.7023266970496522, + "grad_norm": 1.8087902069091797, + "learning_rate": 3.8373584099166495e-05, + "loss": 0.5597, + "step": 1098 + }, + { + "epoch": 0.7029663388502438, + "grad_norm": 2.146606683731079, + "learning_rate": 3.836289805513999e-05, + "loss": 0.7376, + "step": 1099 + }, + { + "epoch": 0.7036059806508356, + "grad_norm": 2.0617082118988037, + "learning_rate": 3.835221201111349e-05, + "loss": 0.729, + "step": 1100 + }, + { + "epoch": 0.7042456224514272, + "grad_norm": 2.1947948932647705, + "learning_rate": 3.8341525967086986e-05, + "loss": 0.7183, + "step": 1101 + }, + { + "epoch": 0.7048852642520189, + "grad_norm": 2.5354106426239014, + "learning_rate": 3.8330839923060486e-05, + "loss": 0.8183, + "step": 1102 + }, + { + "epoch": 0.7055249060526105, + "grad_norm": 2.1710827350616455, + "learning_rate": 3.8320153879033985e-05, + "loss": 0.7325, + "step": 1103 + }, + { + "epoch": 0.7061645478532022, + "grad_norm": 2.609731674194336, + "learning_rate": 3.8309467835007484e-05, + "loss": 0.6996, + "step": 1104 + }, + { + "epoch": 0.7068041896537939, + "grad_norm": 2.4650039672851562, + "learning_rate": 3.8298781790980984e-05, + "loss": 0.7949, + "step": 1105 + }, + { + "epoch": 0.7074438314543855, + "grad_norm": 2.3932008743286133, + "learning_rate": 3.8288095746954476e-05, + "loss": 0.793, + "step": 1106 + }, + { + "epoch": 0.7080834732549772, + "grad_norm": 2.358497142791748, + "learning_rate": 3.827740970292798e-05, + "loss": 0.6543, + "step": 1107 + }, + { + "epoch": 0.7087231150555688, + "grad_norm": 2.4218297004699707, + "learning_rate": 3.8266723658901475e-05, + "loss": 0.8511, + "step": 1108 + }, + { + "epoch": 0.7093627568561606, + "grad_norm": 2.6183223724365234, + "learning_rate": 3.8256037614874974e-05, + "loss": 0.8149, + "step": 1109 + }, + { + "epoch": 0.7100023986567522, + "grad_norm": 2.5073516368865967, + "learning_rate": 3.8245351570848474e-05, + "loss": 0.7997, + "step": 1110 + }, + { + "epoch": 0.7106420404573439, + "grad_norm": 2.2073638439178467, + "learning_rate": 3.823466552682197e-05, + "loss": 0.6717, + "step": 1111 + }, + { + "epoch": 0.7112816822579355, + "grad_norm": 2.2807748317718506, + "learning_rate": 3.8223979482795466e-05, + "loss": 0.779, + "step": 1112 + }, + { + "epoch": 0.7119213240585273, + "grad_norm": 2.1985785961151123, + "learning_rate": 3.821329343876897e-05, + "loss": 0.7542, + "step": 1113 + }, + { + "epoch": 0.7125609658591189, + "grad_norm": 2.1787567138671875, + "learning_rate": 3.8202607394742464e-05, + "loss": 0.7691, + "step": 1114 + }, + { + "epoch": 0.7132006076597106, + "grad_norm": 2.225193977355957, + "learning_rate": 3.8191921350715964e-05, + "loss": 0.7979, + "step": 1115 + }, + { + "epoch": 0.7138402494603022, + "grad_norm": 2.2395787239074707, + "learning_rate": 3.818123530668947e-05, + "loss": 0.8434, + "step": 1116 + }, + { + "epoch": 0.714479891260894, + "grad_norm": 1.956053376197815, + 
"learning_rate": 3.817054926266296e-05, + "loss": 0.722, + "step": 1117 + }, + { + "epoch": 0.7151195330614856, + "grad_norm": 2.5603792667388916, + "learning_rate": 3.815986321863646e-05, + "loss": 0.8775, + "step": 1118 + }, + { + "epoch": 0.7157591748620772, + "grad_norm": 2.296046495437622, + "learning_rate": 3.814917717460996e-05, + "loss": 0.7028, + "step": 1119 + }, + { + "epoch": 0.7163988166626689, + "grad_norm": 1.8569482564926147, + "learning_rate": 3.813849113058346e-05, + "loss": 0.6368, + "step": 1120 + }, + { + "epoch": 0.7170384584632605, + "grad_norm": 2.0514161586761475, + "learning_rate": 3.812780508655695e-05, + "loss": 0.7731, + "step": 1121 + }, + { + "epoch": 0.7176781002638523, + "grad_norm": 2.2029731273651123, + "learning_rate": 3.811711904253046e-05, + "loss": 0.7006, + "step": 1122 + }, + { + "epoch": 0.7183177420644439, + "grad_norm": 2.734997510910034, + "learning_rate": 3.810643299850395e-05, + "loss": 0.8155, + "step": 1123 + }, + { + "epoch": 0.7189573838650356, + "grad_norm": 2.1038763523101807, + "learning_rate": 3.809574695447746e-05, + "loss": 0.728, + "step": 1124 + }, + { + "epoch": 0.7195970256656272, + "grad_norm": 2.012888193130493, + "learning_rate": 3.808506091045095e-05, + "loss": 0.7125, + "step": 1125 + }, + { + "epoch": 0.720236667466219, + "grad_norm": 2.211409568786621, + "learning_rate": 3.807437486642445e-05, + "loss": 0.7129, + "step": 1126 + }, + { + "epoch": 0.7208763092668106, + "grad_norm": 2.124588966369629, + "learning_rate": 3.806368882239795e-05, + "loss": 0.7434, + "step": 1127 + }, + { + "epoch": 0.7215159510674023, + "grad_norm": 2.153137445449829, + "learning_rate": 3.805300277837145e-05, + "loss": 0.6642, + "step": 1128 + }, + { + "epoch": 0.7221555928679939, + "grad_norm": 1.9890538454055786, + "learning_rate": 3.804231673434495e-05, + "loss": 0.6062, + "step": 1129 + }, + { + "epoch": 0.7227952346685856, + "grad_norm": 2.062160015106201, + "learning_rate": 3.803163069031845e-05, + "loss": 0.7052, + "step": 1130 + }, + { + "epoch": 0.7234348764691773, + "grad_norm": 2.4277961254119873, + "learning_rate": 3.802094464629195e-05, + "loss": 0.8142, + "step": 1131 + }, + { + "epoch": 0.7240745182697689, + "grad_norm": 2.8217086791992188, + "learning_rate": 3.801025860226544e-05, + "loss": 0.8606, + "step": 1132 + }, + { + "epoch": 0.7247141600703606, + "grad_norm": 2.7864484786987305, + "learning_rate": 3.7999572558238946e-05, + "loss": 0.7296, + "step": 1133 + }, + { + "epoch": 0.7253538018709522, + "grad_norm": 2.5194432735443115, + "learning_rate": 3.798888651421244e-05, + "loss": 0.8165, + "step": 1134 + }, + { + "epoch": 0.725993443671544, + "grad_norm": 2.039736747741699, + "learning_rate": 3.797820047018594e-05, + "loss": 0.6588, + "step": 1135 + }, + { + "epoch": 0.7266330854721356, + "grad_norm": 2.533721923828125, + "learning_rate": 3.796751442615944e-05, + "loss": 0.8458, + "step": 1136 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 2.209456205368042, + "learning_rate": 3.7956828382132936e-05, + "loss": 0.7995, + "step": 1137 + }, + { + "epoch": 0.7279123690733189, + "grad_norm": 2.631016969680786, + "learning_rate": 3.7946142338106436e-05, + "loss": 0.7869, + "step": 1138 + }, + { + "epoch": 0.7285520108739106, + "grad_norm": 2.063816547393799, + "learning_rate": 3.7935456294079935e-05, + "loss": 0.6902, + "step": 1139 + }, + { + "epoch": 0.7291916526745023, + "grad_norm": 2.4370014667510986, + "learning_rate": 3.7924770250053434e-05, + "loss": 0.81, + "step": 1140 + }, + { + "epoch": 0.729831294475094, + 
"grad_norm": 2.1266560554504395, + "learning_rate": 3.791408420602693e-05, + "loss": 0.6829, + "step": 1141 + }, + { + "epoch": 0.7304709362756856, + "grad_norm": 2.5482261180877686, + "learning_rate": 3.790339816200043e-05, + "loss": 0.8279, + "step": 1142 + }, + { + "epoch": 0.7311105780762773, + "grad_norm": 2.3252758979797363, + "learning_rate": 3.7892712117973926e-05, + "loss": 0.7678, + "step": 1143 + }, + { + "epoch": 0.731750219876869, + "grad_norm": 2.2964940071105957, + "learning_rate": 3.7882026073947425e-05, + "loss": 0.7522, + "step": 1144 + }, + { + "epoch": 0.7323898616774607, + "grad_norm": 2.393054485321045, + "learning_rate": 3.7871340029920924e-05, + "loss": 0.7094, + "step": 1145 + }, + { + "epoch": 0.7330295034780523, + "grad_norm": 2.358607530593872, + "learning_rate": 3.7860653985894424e-05, + "loss": 0.7883, + "step": 1146 + }, + { + "epoch": 0.7336691452786439, + "grad_norm": 1.9570059776306152, + "learning_rate": 3.784996794186792e-05, + "loss": 0.6241, + "step": 1147 + }, + { + "epoch": 0.7343087870792356, + "grad_norm": 2.6454644203186035, + "learning_rate": 3.783928189784142e-05, + "loss": 0.9089, + "step": 1148 + }, + { + "epoch": 0.7349484288798273, + "grad_norm": 2.110250234603882, + "learning_rate": 3.782859585381492e-05, + "loss": 0.6571, + "step": 1149 + }, + { + "epoch": 0.735588070680419, + "grad_norm": 2.5248188972473145, + "learning_rate": 3.7817909809788414e-05, + "loss": 0.7574, + "step": 1150 + }, + { + "epoch": 0.7362277124810106, + "grad_norm": 2.235224485397339, + "learning_rate": 3.780722376576192e-05, + "loss": 0.6953, + "step": 1151 + }, + { + "epoch": 0.7368673542816023, + "grad_norm": 2.39093017578125, + "learning_rate": 3.779653772173541e-05, + "loss": 0.7167, + "step": 1152 + }, + { + "epoch": 0.7375069960821939, + "grad_norm": 2.3500821590423584, + "learning_rate": 3.778585167770892e-05, + "loss": 0.731, + "step": 1153 + }, + { + "epoch": 0.7381466378827857, + "grad_norm": 2.0226786136627197, + "learning_rate": 3.777516563368241e-05, + "loss": 0.7074, + "step": 1154 + }, + { + "epoch": 0.7387862796833773, + "grad_norm": 2.3884618282318115, + "learning_rate": 3.776447958965591e-05, + "loss": 0.7841, + "step": 1155 + }, + { + "epoch": 0.739425921483969, + "grad_norm": 2.2239785194396973, + "learning_rate": 3.775379354562941e-05, + "loss": 0.7623, + "step": 1156 + }, + { + "epoch": 0.7400655632845606, + "grad_norm": 2.219381809234619, + "learning_rate": 3.774310750160291e-05, + "loss": 0.7284, + "step": 1157 + }, + { + "epoch": 0.7407052050851524, + "grad_norm": 1.8879082202911377, + "learning_rate": 3.77324214575764e-05, + "loss": 0.5831, + "step": 1158 + }, + { + "epoch": 0.741344846885744, + "grad_norm": 2.2116897106170654, + "learning_rate": 3.772173541354991e-05, + "loss": 0.7395, + "step": 1159 + }, + { + "epoch": 0.7419844886863356, + "grad_norm": 2.4407951831817627, + "learning_rate": 3.77110493695234e-05, + "loss": 0.7016, + "step": 1160 + }, + { + "epoch": 0.7426241304869273, + "grad_norm": 2.165018320083618, + "learning_rate": 3.77003633254969e-05, + "loss": 0.7261, + "step": 1161 + }, + { + "epoch": 0.7432637722875189, + "grad_norm": 2.498748540878296, + "learning_rate": 3.768967728147041e-05, + "loss": 0.8053, + "step": 1162 + }, + { + "epoch": 0.7439034140881107, + "grad_norm": 2.3558831214904785, + "learning_rate": 3.76789912374439e-05, + "loss": 0.7776, + "step": 1163 + }, + { + "epoch": 0.7445430558887023, + "grad_norm": 2.302196741104126, + "learning_rate": 3.76683051934174e-05, + "loss": 0.7353, + "step": 1164 + }, + { + 
"epoch": 0.745182697689294, + "grad_norm": 2.3921613693237305, + "learning_rate": 3.76576191493909e-05, + "loss": 0.7795, + "step": 1165 + }, + { + "epoch": 0.7458223394898856, + "grad_norm": 2.159858465194702, + "learning_rate": 3.76469331053644e-05, + "loss": 0.6372, + "step": 1166 + }, + { + "epoch": 0.7464619812904774, + "grad_norm": 2.30668568611145, + "learning_rate": 3.763624706133789e-05, + "loss": 0.7958, + "step": 1167 + }, + { + "epoch": 0.747101623091069, + "grad_norm": 2.2490170001983643, + "learning_rate": 3.7625561017311396e-05, + "loss": 0.6638, + "step": 1168 + }, + { + "epoch": 0.7477412648916607, + "grad_norm": 2.138835906982422, + "learning_rate": 3.761487497328489e-05, + "loss": 0.6424, + "step": 1169 + }, + { + "epoch": 0.7483809066922523, + "grad_norm": 2.253960609436035, + "learning_rate": 3.760418892925839e-05, + "loss": 0.6889, + "step": 1170 + }, + { + "epoch": 0.749020548492844, + "grad_norm": 2.385439395904541, + "learning_rate": 3.759350288523189e-05, + "loss": 0.7765, + "step": 1171 + }, + { + "epoch": 0.7496601902934357, + "grad_norm": 2.426828145980835, + "learning_rate": 3.758281684120539e-05, + "loss": 0.8011, + "step": 1172 + }, + { + "epoch": 0.7502998320940274, + "grad_norm": 1.8592880964279175, + "learning_rate": 3.7572130797178886e-05, + "loss": 0.5694, + "step": 1173 + }, + { + "epoch": 0.750939473894619, + "grad_norm": 2.2702317237854004, + "learning_rate": 3.7561444753152386e-05, + "loss": 0.7349, + "step": 1174 + }, + { + "epoch": 0.7515791156952106, + "grad_norm": 2.5193734169006348, + "learning_rate": 3.7550758709125885e-05, + "loss": 0.8441, + "step": 1175 + }, + { + "epoch": 0.7522187574958024, + "grad_norm": 2.177778959274292, + "learning_rate": 3.754007266509938e-05, + "loss": 0.7284, + "step": 1176 + }, + { + "epoch": 0.752858399296394, + "grad_norm": 2.343198299407959, + "learning_rate": 3.7529386621072884e-05, + "loss": 0.7082, + "step": 1177 + }, + { + "epoch": 0.7534980410969857, + "grad_norm": 2.4855828285217285, + "learning_rate": 3.7518700577046376e-05, + "loss": 0.8086, + "step": 1178 + }, + { + "epoch": 0.7541376828975773, + "grad_norm": 2.3655359745025635, + "learning_rate": 3.7508014533019876e-05, + "loss": 0.7972, + "step": 1179 + }, + { + "epoch": 0.754777324698169, + "grad_norm": 2.330594301223755, + "learning_rate": 3.7497328488993375e-05, + "loss": 0.7256, + "step": 1180 + }, + { + "epoch": 0.7554169664987607, + "grad_norm": 2.319443702697754, + "learning_rate": 3.7486642444966874e-05, + "loss": 0.7633, + "step": 1181 + }, + { + "epoch": 0.7560566082993524, + "grad_norm": 2.285426139831543, + "learning_rate": 3.7475956400940374e-05, + "loss": 0.7064, + "step": 1182 + }, + { + "epoch": 0.756696250099944, + "grad_norm": 2.323478937149048, + "learning_rate": 3.746527035691387e-05, + "loss": 0.741, + "step": 1183 + }, + { + "epoch": 0.7573358919005357, + "grad_norm": 2.4266202449798584, + "learning_rate": 3.745458431288737e-05, + "loss": 0.6884, + "step": 1184 + }, + { + "epoch": 0.7579755337011274, + "grad_norm": 1.986000418663025, + "learning_rate": 3.744389826886087e-05, + "loss": 0.6064, + "step": 1185 + }, + { + "epoch": 0.7586151755017191, + "grad_norm": 2.5264949798583984, + "learning_rate": 3.743321222483437e-05, + "loss": 0.6683, + "step": 1186 + }, + { + "epoch": 0.7592548173023107, + "grad_norm": 1.829809546470642, + "learning_rate": 3.7422526180807864e-05, + "loss": 0.5837, + "step": 1187 + }, + { + "epoch": 0.7598944591029023, + "grad_norm": 2.492159366607666, + "learning_rate": 3.741184013678137e-05, + "loss": 
0.7506, + "step": 1188 + }, + { + "epoch": 0.760534100903494, + "grad_norm": 2.6433253288269043, + "learning_rate": 3.740115409275486e-05, + "loss": 0.8966, + "step": 1189 + }, + { + "epoch": 0.7611737427040857, + "grad_norm": 2.0945053100585938, + "learning_rate": 3.739046804872836e-05, + "loss": 0.6797, + "step": 1190 + }, + { + "epoch": 0.7618133845046774, + "grad_norm": 2.496220111846924, + "learning_rate": 3.737978200470186e-05, + "loss": 0.8196, + "step": 1191 + }, + { + "epoch": 0.762453026305269, + "grad_norm": 2.0942041873931885, + "learning_rate": 3.736909596067536e-05, + "loss": 0.6706, + "step": 1192 + }, + { + "epoch": 0.7630926681058607, + "grad_norm": 2.4206995964050293, + "learning_rate": 3.735840991664886e-05, + "loss": 0.7298, + "step": 1193 + }, + { + "epoch": 0.7637323099064524, + "grad_norm": 2.0311660766601562, + "learning_rate": 3.734772387262236e-05, + "loss": 0.6598, + "step": 1194 + }, + { + "epoch": 0.7643719517070441, + "grad_norm": 2.4630625247955322, + "learning_rate": 3.733703782859586e-05, + "loss": 0.8063, + "step": 1195 + }, + { + "epoch": 0.7650115935076357, + "grad_norm": 2.1187448501586914, + "learning_rate": 3.732635178456935e-05, + "loss": 0.6538, + "step": 1196 + }, + { + "epoch": 0.7656512353082274, + "grad_norm": 2.124770402908325, + "learning_rate": 3.731566574054286e-05, + "loss": 0.7195, + "step": 1197 + }, + { + "epoch": 0.766290877108819, + "grad_norm": 2.3621788024902344, + "learning_rate": 3.730497969651635e-05, + "loss": 0.7508, + "step": 1198 + }, + { + "epoch": 0.7669305189094108, + "grad_norm": 2.4492368698120117, + "learning_rate": 3.729429365248985e-05, + "loss": 0.7801, + "step": 1199 + }, + { + "epoch": 0.7675701607100024, + "grad_norm": 2.283175230026245, + "learning_rate": 3.728360760846335e-05, + "loss": 0.7324, + "step": 1200 + }, + { + "epoch": 0.7682098025105941, + "grad_norm": 2.2819981575012207, + "learning_rate": 3.727292156443685e-05, + "loss": 0.7157, + "step": 1201 + }, + { + "epoch": 0.7688494443111857, + "grad_norm": 2.2561097145080566, + "learning_rate": 3.726223552041034e-05, + "loss": 0.7515, + "step": 1202 + }, + { + "epoch": 0.7694890861117774, + "grad_norm": 2.3356516361236572, + "learning_rate": 3.725154947638385e-05, + "loss": 0.732, + "step": 1203 + }, + { + "epoch": 0.7701287279123691, + "grad_norm": 2.5061001777648926, + "learning_rate": 3.724086343235734e-05, + "loss": 0.7872, + "step": 1204 + }, + { + "epoch": 0.7707683697129607, + "grad_norm": 2.3949875831604004, + "learning_rate": 3.723017738833084e-05, + "loss": 0.8577, + "step": 1205 + }, + { + "epoch": 0.7714080115135524, + "grad_norm": 2.086625337600708, + "learning_rate": 3.721949134430434e-05, + "loss": 0.7291, + "step": 1206 + }, + { + "epoch": 0.772047653314144, + "grad_norm": 2.0133514404296875, + "learning_rate": 3.720880530027784e-05, + "loss": 0.6942, + "step": 1207 + }, + { + "epoch": 0.7726872951147358, + "grad_norm": 2.4376726150512695, + "learning_rate": 3.7198119256251344e-05, + "loss": 0.7066, + "step": 1208 + }, + { + "epoch": 0.7733269369153274, + "grad_norm": 2.303718090057373, + "learning_rate": 3.7187433212224836e-05, + "loss": 0.7749, + "step": 1209 + }, + { + "epoch": 0.7739665787159191, + "grad_norm": 2.269663095474243, + "learning_rate": 3.7176747168198335e-05, + "loss": 0.7027, + "step": 1210 + }, + { + "epoch": 0.7746062205165107, + "grad_norm": 2.428384304046631, + "learning_rate": 3.7166061124171835e-05, + "loss": 0.7895, + "step": 1211 + }, + { + "epoch": 0.7752458623171025, + "grad_norm": 2.3879854679107666, + 
"learning_rate": 3.7155375080145334e-05, + "loss": 0.6408, + "step": 1212 + }, + { + "epoch": 0.7758855041176941, + "grad_norm": 2.0581510066986084, + "learning_rate": 3.714468903611883e-05, + "loss": 0.7013, + "step": 1213 + }, + { + "epoch": 0.7765251459182858, + "grad_norm": 2.1578493118286133, + "learning_rate": 3.713400299209233e-05, + "loss": 0.7207, + "step": 1214 + }, + { + "epoch": 0.7771647877188774, + "grad_norm": 2.231703996658325, + "learning_rate": 3.7123316948065826e-05, + "loss": 0.7003, + "step": 1215 + }, + { + "epoch": 0.777804429519469, + "grad_norm": 2.595261335372925, + "learning_rate": 3.7112630904039325e-05, + "loss": 0.8654, + "step": 1216 + }, + { + "epoch": 0.7784440713200608, + "grad_norm": 2.0633652210235596, + "learning_rate": 3.7101944860012824e-05, + "loss": 0.6487, + "step": 1217 + }, + { + "epoch": 0.7790837131206524, + "grad_norm": 2.335822343826294, + "learning_rate": 3.7091258815986324e-05, + "loss": 0.7327, + "step": 1218 + }, + { + "epoch": 0.7797233549212441, + "grad_norm": 2.217700958251953, + "learning_rate": 3.708057277195982e-05, + "loss": 0.7911, + "step": 1219 + }, + { + "epoch": 0.7803629967218357, + "grad_norm": 2.5208353996276855, + "learning_rate": 3.706988672793332e-05, + "loss": 0.7615, + "step": 1220 + }, + { + "epoch": 0.7810026385224275, + "grad_norm": 2.1740236282348633, + "learning_rate": 3.705920068390682e-05, + "loss": 0.7098, + "step": 1221 + }, + { + "epoch": 0.7816422803230191, + "grad_norm": 2.3347630500793457, + "learning_rate": 3.7048514639880314e-05, + "loss": 0.7488, + "step": 1222 + }, + { + "epoch": 0.7822819221236108, + "grad_norm": 2.2441177368164062, + "learning_rate": 3.703782859585382e-05, + "loss": 0.72, + "step": 1223 + }, + { + "epoch": 0.7829215639242024, + "grad_norm": 2.384330987930298, + "learning_rate": 3.702714255182731e-05, + "loss": 0.6368, + "step": 1224 + }, + { + "epoch": 0.7835612057247942, + "grad_norm": 2.217966079711914, + "learning_rate": 3.701645650780081e-05, + "loss": 0.7255, + "step": 1225 + }, + { + "epoch": 0.7842008475253858, + "grad_norm": 2.063870668411255, + "learning_rate": 3.700577046377431e-05, + "loss": 0.6603, + "step": 1226 + }, + { + "epoch": 0.7848404893259775, + "grad_norm": 2.428542375564575, + "learning_rate": 3.699508441974781e-05, + "loss": 0.7724, + "step": 1227 + }, + { + "epoch": 0.7854801311265691, + "grad_norm": 2.1868059635162354, + "learning_rate": 3.698439837572131e-05, + "loss": 0.7082, + "step": 1228 + }, + { + "epoch": 0.7861197729271608, + "grad_norm": 2.1543080806732178, + "learning_rate": 3.697371233169481e-05, + "loss": 0.6656, + "step": 1229 + }, + { + "epoch": 0.7867594147277525, + "grad_norm": 2.152196168899536, + "learning_rate": 3.696302628766831e-05, + "loss": 0.6334, + "step": 1230 + }, + { + "epoch": 0.7873990565283441, + "grad_norm": 2.2862462997436523, + "learning_rate": 3.69523402436418e-05, + "loss": 0.7609, + "step": 1231 + }, + { + "epoch": 0.7880386983289358, + "grad_norm": 2.5716629028320312, + "learning_rate": 3.694165419961531e-05, + "loss": 0.7653, + "step": 1232 + }, + { + "epoch": 0.7886783401295274, + "grad_norm": 1.9692749977111816, + "learning_rate": 3.69309681555888e-05, + "loss": 0.6466, + "step": 1233 + }, + { + "epoch": 0.7893179819301192, + "grad_norm": 2.7022125720977783, + "learning_rate": 3.69202821115623e-05, + "loss": 0.849, + "step": 1234 + }, + { + "epoch": 0.7899576237307108, + "grad_norm": 1.8877711296081543, + "learning_rate": 3.69095960675358e-05, + "loss": 0.5916, + "step": 1235 + }, + { + "epoch": 0.7905972655313025, + 
"grad_norm": 2.6520678997039795, + "learning_rate": 3.68989100235093e-05, + "loss": 0.7768, + "step": 1236 + }, + { + "epoch": 0.7912369073318941, + "grad_norm": 2.200363874435425, + "learning_rate": 3.68882239794828e-05, + "loss": 0.6769, + "step": 1237 + }, + { + "epoch": 0.7918765491324858, + "grad_norm": 2.4259159564971924, + "learning_rate": 3.68775379354563e-05, + "loss": 0.7247, + "step": 1238 + }, + { + "epoch": 0.7925161909330775, + "grad_norm": 2.5592780113220215, + "learning_rate": 3.68668518914298e-05, + "loss": 0.8512, + "step": 1239 + }, + { + "epoch": 0.7931558327336692, + "grad_norm": 2.426992654800415, + "learning_rate": 3.6856165847403296e-05, + "loss": 0.7994, + "step": 1240 + }, + { + "epoch": 0.7937954745342608, + "grad_norm": 2.3934638500213623, + "learning_rate": 3.6845479803376795e-05, + "loss": 0.7471, + "step": 1241 + }, + { + "epoch": 0.7944351163348525, + "grad_norm": 2.1817221641540527, + "learning_rate": 3.683479375935029e-05, + "loss": 0.654, + "step": 1242 + }, + { + "epoch": 0.7950747581354441, + "grad_norm": 2.0756351947784424, + "learning_rate": 3.6824107715323794e-05, + "loss": 0.6676, + "step": 1243 + }, + { + "epoch": 0.7957143999360358, + "grad_norm": 2.337855339050293, + "learning_rate": 3.681342167129729e-05, + "loss": 0.7586, + "step": 1244 + }, + { + "epoch": 0.7963540417366275, + "grad_norm": 2.191373109817505, + "learning_rate": 3.6802735627270786e-05, + "loss": 0.7261, + "step": 1245 + }, + { + "epoch": 0.7969936835372191, + "grad_norm": 2.321526050567627, + "learning_rate": 3.6792049583244285e-05, + "loss": 0.7456, + "step": 1246 + }, + { + "epoch": 0.7976333253378108, + "grad_norm": 2.1950039863586426, + "learning_rate": 3.6781363539217785e-05, + "loss": 0.7507, + "step": 1247 + }, + { + "epoch": 0.7982729671384025, + "grad_norm": 2.4047510623931885, + "learning_rate": 3.677067749519128e-05, + "loss": 0.7925, + "step": 1248 + }, + { + "epoch": 0.7989126089389942, + "grad_norm": 2.0003113746643066, + "learning_rate": 3.6759991451164783e-05, + "loss": 0.6481, + "step": 1249 + }, + { + "epoch": 0.7995522507395858, + "grad_norm": 2.548973560333252, + "learning_rate": 3.6749305407138276e-05, + "loss": 0.7367, + "step": 1250 + }, + { + "epoch": 0.8001918925401775, + "grad_norm": 2.0148658752441406, + "learning_rate": 3.6738619363111775e-05, + "loss": 0.6819, + "step": 1251 + }, + { + "epoch": 0.8008315343407691, + "grad_norm": 2.331186294555664, + "learning_rate": 3.6727933319085275e-05, + "loss": 0.7564, + "step": 1252 + }, + { + "epoch": 0.8014711761413609, + "grad_norm": 2.488466739654541, + "learning_rate": 3.6717247275058774e-05, + "loss": 0.7621, + "step": 1253 + }, + { + "epoch": 0.8021108179419525, + "grad_norm": 2.188941478729248, + "learning_rate": 3.6706561231032274e-05, + "loss": 0.6951, + "step": 1254 + }, + { + "epoch": 0.8027504597425442, + "grad_norm": 2.0087170600891113, + "learning_rate": 3.669587518700577e-05, + "loss": 0.6634, + "step": 1255 + }, + { + "epoch": 0.8033901015431358, + "grad_norm": 2.3581948280334473, + "learning_rate": 3.668518914297927e-05, + "loss": 0.7753, + "step": 1256 + }, + { + "epoch": 0.8040297433437276, + "grad_norm": 1.9907422065734863, + "learning_rate": 3.6674503098952765e-05, + "loss": 0.7065, + "step": 1257 + }, + { + "epoch": 0.8046693851443192, + "grad_norm": 2.2611794471740723, + "learning_rate": 3.666381705492627e-05, + "loss": 0.7019, + "step": 1258 + }, + { + "epoch": 0.8053090269449108, + "grad_norm": 1.917324423789978, + "learning_rate": 3.6653131010899764e-05, + "loss": 0.6498, + "step": 
1259 + }, + { + "epoch": 0.8059486687455025, + "grad_norm": 2.1342484951019287, + "learning_rate": 3.664244496687326e-05, + "loss": 0.6829, + "step": 1260 + }, + { + "epoch": 0.8065883105460941, + "grad_norm": 2.04067063331604, + "learning_rate": 3.663175892284676e-05, + "loss": 0.6444, + "step": 1261 + }, + { + "epoch": 0.8072279523466859, + "grad_norm": 2.402163028717041, + "learning_rate": 3.662107287882026e-05, + "loss": 0.7433, + "step": 1262 + }, + { + "epoch": 0.8078675941472775, + "grad_norm": 2.3471882343292236, + "learning_rate": 3.661038683479376e-05, + "loss": 0.6615, + "step": 1263 + }, + { + "epoch": 0.8085072359478692, + "grad_norm": 2.4029300212860107, + "learning_rate": 3.659970079076726e-05, + "loss": 0.78, + "step": 1264 + }, + { + "epoch": 0.8091468777484608, + "grad_norm": 2.2626712322235107, + "learning_rate": 3.658901474674076e-05, + "loss": 0.7374, + "step": 1265 + }, + { + "epoch": 0.8097865195490526, + "grad_norm": 2.345632314682007, + "learning_rate": 3.657832870271426e-05, + "loss": 0.7719, + "step": 1266 + }, + { + "epoch": 0.8104261613496442, + "grad_norm": 2.2906014919281006, + "learning_rate": 3.656764265868776e-05, + "loss": 0.6792, + "step": 1267 + }, + { + "epoch": 0.8110658031502359, + "grad_norm": 2.0952188968658447, + "learning_rate": 3.655695661466125e-05, + "loss": 0.6668, + "step": 1268 + }, + { + "epoch": 0.8117054449508275, + "grad_norm": 2.4131739139556885, + "learning_rate": 3.654627057063476e-05, + "loss": 0.7827, + "step": 1269 + }, + { + "epoch": 0.8123450867514193, + "grad_norm": 2.5857861042022705, + "learning_rate": 3.653558452660825e-05, + "loss": 0.7691, + "step": 1270 + }, + { + "epoch": 0.8129847285520109, + "grad_norm": 2.2531378269195557, + "learning_rate": 3.652489848258175e-05, + "loss": 0.7299, + "step": 1271 + }, + { + "epoch": 0.8136243703526025, + "grad_norm": 2.096651315689087, + "learning_rate": 3.651421243855525e-05, + "loss": 0.7459, + "step": 1272 + }, + { + "epoch": 0.8142640121531942, + "grad_norm": 2.3306047916412354, + "learning_rate": 3.650352639452875e-05, + "loss": 0.7505, + "step": 1273 + }, + { + "epoch": 0.8149036539537858, + "grad_norm": 2.36478590965271, + "learning_rate": 3.649284035050225e-05, + "loss": 0.7543, + "step": 1274 + }, + { + "epoch": 0.8155432957543776, + "grad_norm": 2.353835344314575, + "learning_rate": 3.6482154306475747e-05, + "loss": 0.7407, + "step": 1275 + }, + { + "epoch": 0.8161829375549692, + "grad_norm": 2.0713343620300293, + "learning_rate": 3.6471468262449246e-05, + "loss": 0.6208, + "step": 1276 + }, + { + "epoch": 0.8168225793555609, + "grad_norm": 2.1474461555480957, + "learning_rate": 3.646078221842274e-05, + "loss": 0.647, + "step": 1277 + }, + { + "epoch": 0.8174622211561525, + "grad_norm": 2.1109511852264404, + "learning_rate": 3.6450096174396245e-05, + "loss": 0.7284, + "step": 1278 + }, + { + "epoch": 0.8181018629567443, + "grad_norm": 2.1241261959075928, + "learning_rate": 3.643941013036974e-05, + "loss": 0.7017, + "step": 1279 + }, + { + "epoch": 0.8187415047573359, + "grad_norm": 2.4038076400756836, + "learning_rate": 3.6428724086343237e-05, + "loss": 0.7721, + "step": 1280 + }, + { + "epoch": 0.8193811465579276, + "grad_norm": 2.181736469268799, + "learning_rate": 3.6418038042316736e-05, + "loss": 0.7521, + "step": 1281 + }, + { + "epoch": 0.8200207883585192, + "grad_norm": 2.0946333408355713, + "learning_rate": 3.6407351998290235e-05, + "loss": 0.7338, + "step": 1282 + }, + { + "epoch": 0.8206604301591109, + "grad_norm": 2.207449197769165, + "learning_rate": 
3.639666595426373e-05, + "loss": 0.7318, + "step": 1283 + }, + { + "epoch": 0.8213000719597026, + "grad_norm": 2.225128650665283, + "learning_rate": 3.6385979910237234e-05, + "loss": 0.7681, + "step": 1284 + }, + { + "epoch": 0.8219397137602943, + "grad_norm": 2.0560288429260254, + "learning_rate": 3.6375293866210733e-05, + "loss": 0.6559, + "step": 1285 + }, + { + "epoch": 0.8225793555608859, + "grad_norm": 2.371389389038086, + "learning_rate": 3.6364607822184226e-05, + "loss": 0.8418, + "step": 1286 + }, + { + "epoch": 0.8232189973614775, + "grad_norm": 2.5029571056365967, + "learning_rate": 3.635392177815773e-05, + "loss": 0.7838, + "step": 1287 + }, + { + "epoch": 0.8238586391620693, + "grad_norm": 2.071765422821045, + "learning_rate": 3.6343235734131225e-05, + "loss": 0.666, + "step": 1288 + }, + { + "epoch": 0.8244982809626609, + "grad_norm": 2.1000685691833496, + "learning_rate": 3.6332549690104724e-05, + "loss": 0.7008, + "step": 1289 + }, + { + "epoch": 0.8251379227632526, + "grad_norm": 2.053774356842041, + "learning_rate": 3.6321863646078223e-05, + "loss": 0.7242, + "step": 1290 + }, + { + "epoch": 0.8257775645638442, + "grad_norm": 2.024888038635254, + "learning_rate": 3.631117760205172e-05, + "loss": 0.6743, + "step": 1291 + }, + { + "epoch": 0.8264172063644359, + "grad_norm": 2.2509822845458984, + "learning_rate": 3.6300491558025215e-05, + "loss": 0.768, + "step": 1292 + }, + { + "epoch": 0.8270568481650276, + "grad_norm": 2.769113302230835, + "learning_rate": 3.628980551399872e-05, + "loss": 0.8041, + "step": 1293 + }, + { + "epoch": 0.8276964899656193, + "grad_norm": 2.3014938831329346, + "learning_rate": 3.6279119469972214e-05, + "loss": 0.6788, + "step": 1294 + }, + { + "epoch": 0.8283361317662109, + "grad_norm": 2.126314163208008, + "learning_rate": 3.626843342594572e-05, + "loss": 0.6831, + "step": 1295 + }, + { + "epoch": 0.8289757735668026, + "grad_norm": 2.3572607040405273, + "learning_rate": 3.625774738191921e-05, + "loss": 0.7451, + "step": 1296 + }, + { + "epoch": 0.8296154153673942, + "grad_norm": 2.5624842643737793, + "learning_rate": 3.624706133789271e-05, + "loss": 0.8626, + "step": 1297 + }, + { + "epoch": 0.830255057167986, + "grad_norm": 2.42989444732666, + "learning_rate": 3.623637529386621e-05, + "loss": 0.8091, + "step": 1298 + }, + { + "epoch": 0.8308946989685776, + "grad_norm": 2.4899890422821045, + "learning_rate": 3.622568924983971e-05, + "loss": 0.7937, + "step": 1299 + }, + { + "epoch": 0.8315343407691692, + "grad_norm": 1.9585925340652466, + "learning_rate": 3.621500320581321e-05, + "loss": 0.6753, + "step": 1300 + }, + { + "epoch": 0.8321739825697609, + "grad_norm": 2.3551599979400635, + "learning_rate": 3.620431716178671e-05, + "loss": 0.7959, + "step": 1301 + }, + { + "epoch": 0.8328136243703526, + "grad_norm": 2.1406025886535645, + "learning_rate": 3.619363111776021e-05, + "loss": 0.7247, + "step": 1302 + }, + { + "epoch": 0.8334532661709443, + "grad_norm": 2.042062759399414, + "learning_rate": 3.61829450737337e-05, + "loss": 0.6943, + "step": 1303 + }, + { + "epoch": 0.8340929079715359, + "grad_norm": 2.0793874263763428, + "learning_rate": 3.617225902970721e-05, + "loss": 0.6605, + "step": 1304 + }, + { + "epoch": 0.8347325497721276, + "grad_norm": 2.354625940322876, + "learning_rate": 3.61615729856807e-05, + "loss": 0.7457, + "step": 1305 + }, + { + "epoch": 0.8353721915727192, + "grad_norm": 2.216197967529297, + "learning_rate": 3.61508869416542e-05, + "loss": 0.7381, + "step": 1306 + }, + { + "epoch": 0.836011833373311, + "grad_norm": 
2.133580446243286, + "learning_rate": 3.61402008976277e-05, + "loss": 0.6919, + "step": 1307 + }, + { + "epoch": 0.8366514751739026, + "grad_norm": 2.187563180923462, + "learning_rate": 3.61295148536012e-05, + "loss": 0.7091, + "step": 1308 + }, + { + "epoch": 0.8372911169744943, + "grad_norm": 2.0050559043884277, + "learning_rate": 3.61188288095747e-05, + "loss": 0.6969, + "step": 1309 + }, + { + "epoch": 0.8379307587750859, + "grad_norm": 1.9472792148590088, + "learning_rate": 3.61081427655482e-05, + "loss": 0.6985, + "step": 1310 + }, + { + "epoch": 0.8385704005756777, + "grad_norm": 1.801424503326416, + "learning_rate": 3.6097456721521696e-05, + "loss": 0.5894, + "step": 1311 + }, + { + "epoch": 0.8392100423762693, + "grad_norm": 2.658172607421875, + "learning_rate": 3.608677067749519e-05, + "loss": 0.8383, + "step": 1312 + }, + { + "epoch": 0.839849684176861, + "grad_norm": 2.0976152420043945, + "learning_rate": 3.6076084633468695e-05, + "loss": 0.6931, + "step": 1313 + }, + { + "epoch": 0.8404893259774526, + "grad_norm": 2.410163164138794, + "learning_rate": 3.606539858944219e-05, + "loss": 0.6566, + "step": 1314 + }, + { + "epoch": 0.8411289677780442, + "grad_norm": 2.395289182662964, + "learning_rate": 3.605471254541569e-05, + "loss": 0.8125, + "step": 1315 + }, + { + "epoch": 0.841768609578636, + "grad_norm": 2.275268793106079, + "learning_rate": 3.6044026501389187e-05, + "loss": 0.6973, + "step": 1316 + }, + { + "epoch": 0.8424082513792276, + "grad_norm": 2.461775541305542, + "learning_rate": 3.6033340457362686e-05, + "loss": 0.7726, + "step": 1317 + }, + { + "epoch": 0.8430478931798193, + "grad_norm": 2.811631202697754, + "learning_rate": 3.6022654413336185e-05, + "loss": 0.8024, + "step": 1318 + }, + { + "epoch": 0.8436875349804109, + "grad_norm": 2.365971565246582, + "learning_rate": 3.6011968369309685e-05, + "loss": 0.731, + "step": 1319 + }, + { + "epoch": 0.8443271767810027, + "grad_norm": 2.075807571411133, + "learning_rate": 3.6001282325283184e-05, + "loss": 0.6405, + "step": 1320 + }, + { + "epoch": 0.8449668185815943, + "grad_norm": 2.1783554553985596, + "learning_rate": 3.599059628125668e-05, + "loss": 0.7214, + "step": 1321 + }, + { + "epoch": 0.845606460382186, + "grad_norm": 1.9325662851333618, + "learning_rate": 3.597991023723018e-05, + "loss": 0.5924, + "step": 1322 + }, + { + "epoch": 0.8462461021827776, + "grad_norm": 2.0185577869415283, + "learning_rate": 3.5969224193203675e-05, + "loss": 0.6725, + "step": 1323 + }, + { + "epoch": 0.8468857439833694, + "grad_norm": 2.0572102069854736, + "learning_rate": 3.595853814917718e-05, + "loss": 0.6862, + "step": 1324 + }, + { + "epoch": 0.847525385783961, + "grad_norm": 2.135138750076294, + "learning_rate": 3.5947852105150674e-05, + "loss": 0.6981, + "step": 1325 + }, + { + "epoch": 0.8481650275845527, + "grad_norm": 2.1671605110168457, + "learning_rate": 3.593716606112417e-05, + "loss": 0.7776, + "step": 1326 + }, + { + "epoch": 0.8488046693851443, + "grad_norm": 2.223288059234619, + "learning_rate": 3.592648001709767e-05, + "loss": 0.7327, + "step": 1327 + }, + { + "epoch": 0.8494443111857359, + "grad_norm": 2.356409788131714, + "learning_rate": 3.591579397307117e-05, + "loss": 0.7353, + "step": 1328 + }, + { + "epoch": 0.8500839529863277, + "grad_norm": 2.5746231079101562, + "learning_rate": 3.5905107929044665e-05, + "loss": 0.8148, + "step": 1329 + }, + { + "epoch": 0.8507235947869193, + "grad_norm": 2.176095485687256, + "learning_rate": 3.589442188501817e-05, + "loss": 0.7427, + "step": 1330 + }, + { + "epoch": 
0.851363236587511, + "grad_norm": 2.352288246154785, + "learning_rate": 3.588373584099167e-05, + "loss": 0.7535, + "step": 1331 + }, + { + "epoch": 0.8520028783881026, + "grad_norm": 2.154296875, + "learning_rate": 3.587304979696516e-05, + "loss": 0.6683, + "step": 1332 + }, + { + "epoch": 0.8526425201886944, + "grad_norm": 2.0666000843048096, + "learning_rate": 3.586236375293867e-05, + "loss": 0.6675, + "step": 1333 + }, + { + "epoch": 0.853282161989286, + "grad_norm": 2.3061776161193848, + "learning_rate": 3.585167770891216e-05, + "loss": 0.7446, + "step": 1334 + }, + { + "epoch": 0.8539218037898777, + "grad_norm": 2.3385169506073, + "learning_rate": 3.584099166488566e-05, + "loss": 0.7534, + "step": 1335 + }, + { + "epoch": 0.8545614455904693, + "grad_norm": 2.1738851070404053, + "learning_rate": 3.583030562085916e-05, + "loss": 0.7418, + "step": 1336 + }, + { + "epoch": 0.855201087391061, + "grad_norm": 2.168400287628174, + "learning_rate": 3.581961957683266e-05, + "loss": 0.5973, + "step": 1337 + }, + { + "epoch": 0.8558407291916527, + "grad_norm": 2.1141319274902344, + "learning_rate": 3.580893353280615e-05, + "loss": 0.6944, + "step": 1338 + }, + { + "epoch": 0.8564803709922444, + "grad_norm": 2.0818159580230713, + "learning_rate": 3.579824748877966e-05, + "loss": 0.7369, + "step": 1339 + }, + { + "epoch": 0.857120012792836, + "grad_norm": 2.6422712802886963, + "learning_rate": 3.578756144475315e-05, + "loss": 0.8136, + "step": 1340 + }, + { + "epoch": 0.8577596545934277, + "grad_norm": 2.1801748275756836, + "learning_rate": 3.577687540072665e-05, + "loss": 0.7068, + "step": 1341 + }, + { + "epoch": 0.8583992963940194, + "grad_norm": 2.3806281089782715, + "learning_rate": 3.576618935670015e-05, + "loss": 0.8342, + "step": 1342 + }, + { + "epoch": 0.859038938194611, + "grad_norm": 2.6606218814849854, + "learning_rate": 3.575550331267365e-05, + "loss": 0.9223, + "step": 1343 + }, + { + "epoch": 0.8596785799952027, + "grad_norm": 2.5878489017486572, + "learning_rate": 3.574481726864715e-05, + "loss": 0.81, + "step": 1344 + }, + { + "epoch": 0.8603182217957943, + "grad_norm": 2.1978557109832764, + "learning_rate": 3.573413122462065e-05, + "loss": 0.7554, + "step": 1345 + }, + { + "epoch": 0.860957863596386, + "grad_norm": 2.2777504920959473, + "learning_rate": 3.572344518059415e-05, + "loss": 0.7967, + "step": 1346 + }, + { + "epoch": 0.8615975053969777, + "grad_norm": 2.2588183879852295, + "learning_rate": 3.571275913656764e-05, + "loss": 0.7188, + "step": 1347 + }, + { + "epoch": 0.8622371471975694, + "grad_norm": 2.308361768722534, + "learning_rate": 3.5702073092541146e-05, + "loss": 0.8122, + "step": 1348 + }, + { + "epoch": 0.862876788998161, + "grad_norm": 2.2274303436279297, + "learning_rate": 3.569138704851464e-05, + "loss": 0.6891, + "step": 1349 + }, + { + "epoch": 0.8635164307987527, + "grad_norm": 1.8308266401290894, + "learning_rate": 3.5680701004488144e-05, + "loss": 0.5985, + "step": 1350 + }, + { + "epoch": 0.8641560725993443, + "grad_norm": 2.005376100540161, + "learning_rate": 3.567001496046164e-05, + "loss": 0.7149, + "step": 1351 + }, + { + "epoch": 0.8647957143999361, + "grad_norm": 2.1678292751312256, + "learning_rate": 3.5659328916435136e-05, + "loss": 0.7674, + "step": 1352 + }, + { + "epoch": 0.8654353562005277, + "grad_norm": 2.299746513366699, + "learning_rate": 3.5648642872408636e-05, + "loss": 0.7136, + "step": 1353 + }, + { + "epoch": 0.8660749980011194, + "grad_norm": 2.1002919673919678, + "learning_rate": 3.5637956828382135e-05, + "loss": 0.6933, + 
"step": 1354 + }, + { + "epoch": 0.866714639801711, + "grad_norm": 2.0909488201141357, + "learning_rate": 3.5627270784355635e-05, + "loss": 0.7582, + "step": 1355 + }, + { + "epoch": 0.8673542816023027, + "grad_norm": 2.0758700370788574, + "learning_rate": 3.5616584740329134e-05, + "loss": 0.7156, + "step": 1356 + }, + { + "epoch": 0.8679939234028944, + "grad_norm": 2.148381233215332, + "learning_rate": 3.560589869630263e-05, + "loss": 0.7108, + "step": 1357 + }, + { + "epoch": 0.868633565203486, + "grad_norm": 2.36637806892395, + "learning_rate": 3.5595212652276126e-05, + "loss": 0.7925, + "step": 1358 + }, + { + "epoch": 0.8692732070040777, + "grad_norm": 2.166687488555908, + "learning_rate": 3.558452660824963e-05, + "loss": 0.6827, + "step": 1359 + }, + { + "epoch": 0.8699128488046693, + "grad_norm": 2.006511688232422, + "learning_rate": 3.5573840564223125e-05, + "loss": 0.6206, + "step": 1360 + }, + { + "epoch": 0.8705524906052611, + "grad_norm": 2.1921260356903076, + "learning_rate": 3.5563154520196624e-05, + "loss": 0.7716, + "step": 1361 + }, + { + "epoch": 0.8711921324058527, + "grad_norm": 2.094515323638916, + "learning_rate": 3.555246847617012e-05, + "loss": 0.6804, + "step": 1362 + }, + { + "epoch": 0.8718317742064444, + "grad_norm": 2.1194989681243896, + "learning_rate": 3.554178243214362e-05, + "loss": 0.719, + "step": 1363 + }, + { + "epoch": 0.872471416007036, + "grad_norm": 2.3637473583221436, + "learning_rate": 3.553109638811712e-05, + "loss": 0.7319, + "step": 1364 + }, + { + "epoch": 0.8731110578076278, + "grad_norm": 2.376315116882324, + "learning_rate": 3.552041034409062e-05, + "loss": 0.8048, + "step": 1365 + }, + { + "epoch": 0.8737506996082194, + "grad_norm": 2.3273696899414062, + "learning_rate": 3.550972430006412e-05, + "loss": 0.7703, + "step": 1366 + }, + { + "epoch": 0.8743903414088111, + "grad_norm": 2.234257936477661, + "learning_rate": 3.549903825603761e-05, + "loss": 0.6514, + "step": 1367 + }, + { + "epoch": 0.8750299832094027, + "grad_norm": 2.3246567249298096, + "learning_rate": 3.548835221201112e-05, + "loss": 0.7174, + "step": 1368 + }, + { + "epoch": 0.8756696250099945, + "grad_norm": 2.4329986572265625, + "learning_rate": 3.547766616798461e-05, + "loss": 0.6824, + "step": 1369 + }, + { + "epoch": 0.8763092668105861, + "grad_norm": 2.325556516647339, + "learning_rate": 3.546698012395811e-05, + "loss": 0.733, + "step": 1370 + }, + { + "epoch": 0.8769489086111777, + "grad_norm": 2.095371723175049, + "learning_rate": 3.545629407993161e-05, + "loss": 0.7055, + "step": 1371 + }, + { + "epoch": 0.8775885504117694, + "grad_norm": 2.398172616958618, + "learning_rate": 3.544560803590511e-05, + "loss": 0.8113, + "step": 1372 + }, + { + "epoch": 0.878228192212361, + "grad_norm": 2.422332286834717, + "learning_rate": 3.54349219918786e-05, + "loss": 0.774, + "step": 1373 + }, + { + "epoch": 0.8788678340129528, + "grad_norm": 2.316647529602051, + "learning_rate": 3.542423594785211e-05, + "loss": 0.7921, + "step": 1374 + }, + { + "epoch": 0.8795074758135444, + "grad_norm": 2.2691285610198975, + "learning_rate": 3.54135499038256e-05, + "loss": 0.8109, + "step": 1375 + }, + { + "epoch": 0.8801471176141361, + "grad_norm": 2.6274847984313965, + "learning_rate": 3.54028638597991e-05, + "loss": 0.8763, + "step": 1376 + }, + { + "epoch": 0.8807867594147277, + "grad_norm": 2.0136759281158447, + "learning_rate": 3.539217781577261e-05, + "loss": 0.6571, + "step": 1377 + }, + { + "epoch": 0.8814264012153195, + "grad_norm": 2.4123120307922363, + "learning_rate": 
3.53814917717461e-05, + "loss": 0.7667, + "step": 1378 + }, + { + "epoch": 0.8820660430159111, + "grad_norm": 2.4716055393218994, + "learning_rate": 3.5370805727719606e-05, + "loss": 0.7813, + "step": 1379 + }, + { + "epoch": 0.8827056848165028, + "grad_norm": 2.250933885574341, + "learning_rate": 3.53601196836931e-05, + "loss": 0.7448, + "step": 1380 + }, + { + "epoch": 0.8833453266170944, + "grad_norm": 2.155230760574341, + "learning_rate": 3.53494336396666e-05, + "loss": 0.6957, + "step": 1381 + }, + { + "epoch": 0.8839849684176861, + "grad_norm": 2.1631875038146973, + "learning_rate": 3.53387475956401e-05, + "loss": 0.7243, + "step": 1382 + }, + { + "epoch": 0.8846246102182778, + "grad_norm": 2.1185851097106934, + "learning_rate": 3.5328061551613596e-05, + "loss": 0.751, + "step": 1383 + }, + { + "epoch": 0.8852642520188694, + "grad_norm": 2.457045316696167, + "learning_rate": 3.531737550758709e-05, + "loss": 0.8195, + "step": 1384 + }, + { + "epoch": 0.8859038938194611, + "grad_norm": 2.2588415145874023, + "learning_rate": 3.5306689463560595e-05, + "loss": 0.7922, + "step": 1385 + }, + { + "epoch": 0.8865435356200527, + "grad_norm": 2.1469454765319824, + "learning_rate": 3.529600341953409e-05, + "loss": 0.7189, + "step": 1386 + }, + { + "epoch": 0.8871831774206445, + "grad_norm": 2.1095893383026123, + "learning_rate": 3.528531737550759e-05, + "loss": 0.6831, + "step": 1387 + }, + { + "epoch": 0.8878228192212361, + "grad_norm": 1.88996422290802, + "learning_rate": 3.5274631331481086e-05, + "loss": 0.5968, + "step": 1388 + }, + { + "epoch": 0.8884624610218278, + "grad_norm": 2.091191053390503, + "learning_rate": 3.5263945287454586e-05, + "loss": 0.6709, + "step": 1389 + }, + { + "epoch": 0.8891021028224194, + "grad_norm": 2.206300735473633, + "learning_rate": 3.5253259243428085e-05, + "loss": 0.6793, + "step": 1390 + }, + { + "epoch": 0.8897417446230111, + "grad_norm": 2.602550506591797, + "learning_rate": 3.5242573199401584e-05, + "loss": 0.8169, + "step": 1391 + }, + { + "epoch": 0.8903813864236028, + "grad_norm": 2.4747698307037354, + "learning_rate": 3.5231887155375084e-05, + "loss": 0.6866, + "step": 1392 + }, + { + "epoch": 0.8910210282241945, + "grad_norm": 2.1457183361053467, + "learning_rate": 3.5221201111348576e-05, + "loss": 0.6873, + "step": 1393 + }, + { + "epoch": 0.8916606700247861, + "grad_norm": 2.2383368015289307, + "learning_rate": 3.521051506732208e-05, + "loss": 0.6547, + "step": 1394 + }, + { + "epoch": 0.8923003118253778, + "grad_norm": 2.245839834213257, + "learning_rate": 3.5199829023295575e-05, + "loss": 0.6049, + "step": 1395 + }, + { + "epoch": 0.8929399536259695, + "grad_norm": 2.3442234992980957, + "learning_rate": 3.5189142979269074e-05, + "loss": 0.751, + "step": 1396 + }, + { + "epoch": 0.8935795954265612, + "grad_norm": 2.34509015083313, + "learning_rate": 3.5178456935242574e-05, + "loss": 0.7231, + "step": 1397 + }, + { + "epoch": 0.8942192372271528, + "grad_norm": 2.611557722091675, + "learning_rate": 3.516777089121607e-05, + "loss": 0.7842, + "step": 1398 + }, + { + "epoch": 0.8948588790277444, + "grad_norm": 2.5007386207580566, + "learning_rate": 3.515708484718957e-05, + "loss": 0.755, + "step": 1399 + }, + { + "epoch": 0.8954985208283361, + "grad_norm": 2.210387706756592, + "learning_rate": 3.514639880316307e-05, + "loss": 0.743, + "step": 1400 + }, + { + "epoch": 0.8961381626289278, + "grad_norm": 2.619443655014038, + "learning_rate": 3.513571275913657e-05, + "loss": 0.7962, + "step": 1401 + }, + { + "epoch": 0.8967778044295195, + "grad_norm": 
2.3215034008026123, + "learning_rate": 3.5125026715110064e-05, + "loss": 0.7684, + "step": 1402 + }, + { + "epoch": 0.8974174462301111, + "grad_norm": 2.1196274757385254, + "learning_rate": 3.511434067108357e-05, + "loss": 0.6856, + "step": 1403 + }, + { + "epoch": 0.8980570880307028, + "grad_norm": 1.861092448234558, + "learning_rate": 3.510365462705706e-05, + "loss": 0.6071, + "step": 1404 + }, + { + "epoch": 0.8986967298312944, + "grad_norm": 2.4183058738708496, + "learning_rate": 3.509296858303056e-05, + "loss": 0.8031, + "step": 1405 + }, + { + "epoch": 0.8993363716318862, + "grad_norm": 2.504423141479492, + "learning_rate": 3.508228253900406e-05, + "loss": 0.8647, + "step": 1406 + }, + { + "epoch": 0.8999760134324778, + "grad_norm": 2.1402695178985596, + "learning_rate": 3.507159649497756e-05, + "loss": 0.7569, + "step": 1407 + }, + { + "epoch": 0.9006156552330695, + "grad_norm": 2.400038719177246, + "learning_rate": 3.506091045095106e-05, + "loss": 0.805, + "step": 1408 + }, + { + "epoch": 0.9012552970336611, + "grad_norm": 1.9367976188659668, + "learning_rate": 3.505022440692456e-05, + "loss": 0.6685, + "step": 1409 + }, + { + "epoch": 0.9018949388342529, + "grad_norm": 1.9515469074249268, + "learning_rate": 3.503953836289806e-05, + "loss": 0.6853, + "step": 1410 + }, + { + "epoch": 0.9025345806348445, + "grad_norm": 2.433276891708374, + "learning_rate": 3.502885231887156e-05, + "loss": 0.7776, + "step": 1411 + }, + { + "epoch": 0.9031742224354361, + "grad_norm": 1.9331090450286865, + "learning_rate": 3.501816627484506e-05, + "loss": 0.5848, + "step": 1412 + }, + { + "epoch": 0.9038138642360278, + "grad_norm": 2.093459367752075, + "learning_rate": 3.500748023081855e-05, + "loss": 0.6599, + "step": 1413 + }, + { + "epoch": 0.9044535060366194, + "grad_norm": 2.2528626918792725, + "learning_rate": 3.4996794186792056e-05, + "loss": 0.6265, + "step": 1414 + }, + { + "epoch": 0.9050931478372112, + "grad_norm": 2.2050235271453857, + "learning_rate": 3.498610814276555e-05, + "loss": 0.7569, + "step": 1415 + }, + { + "epoch": 0.9057327896378028, + "grad_norm": 2.386476993560791, + "learning_rate": 3.497542209873905e-05, + "loss": 0.7386, + "step": 1416 + }, + { + "epoch": 0.9063724314383945, + "grad_norm": 2.1573355197906494, + "learning_rate": 3.496473605471255e-05, + "loss": 0.7586, + "step": 1417 + }, + { + "epoch": 0.9070120732389861, + "grad_norm": 2.611687183380127, + "learning_rate": 3.495405001068605e-05, + "loss": 0.9539, + "step": 1418 + }, + { + "epoch": 0.9076517150395779, + "grad_norm": 1.9923269748687744, + "learning_rate": 3.494336396665954e-05, + "loss": 0.6406, + "step": 1419 + }, + { + "epoch": 0.9082913568401695, + "grad_norm": 2.258986234664917, + "learning_rate": 3.4932677922633046e-05, + "loss": 0.7374, + "step": 1420 + }, + { + "epoch": 0.9089309986407612, + "grad_norm": 2.371077299118042, + "learning_rate": 3.492199187860654e-05, + "loss": 0.7627, + "step": 1421 + }, + { + "epoch": 0.9095706404413528, + "grad_norm": 2.3209691047668457, + "learning_rate": 3.491130583458004e-05, + "loss": 0.7666, + "step": 1422 + }, + { + "epoch": 0.9102102822419446, + "grad_norm": 2.3601765632629395, + "learning_rate": 3.4900619790553544e-05, + "loss": 0.9136, + "step": 1423 + }, + { + "epoch": 0.9108499240425362, + "grad_norm": 2.2075514793395996, + "learning_rate": 3.4889933746527036e-05, + "loss": 0.7335, + "step": 1424 + }, + { + "epoch": 0.9114895658431279, + "grad_norm": 2.5802149772644043, + "learning_rate": 3.4879247702500536e-05, + "loss": 0.8581, + "step": 1425 + }, + { + 
"epoch": 0.9121292076437195, + "grad_norm": 2.288961887359619, + "learning_rate": 3.4868561658474035e-05, + "loss": 0.7055, + "step": 1426 + }, + { + "epoch": 0.9127688494443111, + "grad_norm": 2.0689785480499268, + "learning_rate": 3.4857875614447534e-05, + "loss": 0.6377, + "step": 1427 + }, + { + "epoch": 0.9134084912449029, + "grad_norm": 2.295675039291382, + "learning_rate": 3.484718957042103e-05, + "loss": 0.7938, + "step": 1428 + }, + { + "epoch": 0.9140481330454945, + "grad_norm": 1.9098725318908691, + "learning_rate": 3.483650352639453e-05, + "loss": 0.5767, + "step": 1429 + }, + { + "epoch": 0.9146877748460862, + "grad_norm": 2.506603956222534, + "learning_rate": 3.4825817482368026e-05, + "loss": 0.8863, + "step": 1430 + }, + { + "epoch": 0.9153274166466778, + "grad_norm": 2.042794704437256, + "learning_rate": 3.4815131438341525e-05, + "loss": 0.7206, + "step": 1431 + }, + { + "epoch": 0.9159670584472696, + "grad_norm": 1.9123684167861938, + "learning_rate": 3.4804445394315024e-05, + "loss": 0.6235, + "step": 1432 + }, + { + "epoch": 0.9166067002478612, + "grad_norm": 1.985778570175171, + "learning_rate": 3.4793759350288524e-05, + "loss": 0.6174, + "step": 1433 + }, + { + "epoch": 0.9172463420484529, + "grad_norm": 1.978415846824646, + "learning_rate": 3.478307330626202e-05, + "loss": 0.6026, + "step": 1434 + }, + { + "epoch": 0.9178859838490445, + "grad_norm": 2.103729486465454, + "learning_rate": 3.477238726223552e-05, + "loss": 0.6872, + "step": 1435 + }, + { + "epoch": 0.9185256256496362, + "grad_norm": 2.2244601249694824, + "learning_rate": 3.476170121820902e-05, + "loss": 0.7687, + "step": 1436 + }, + { + "epoch": 0.9191652674502279, + "grad_norm": 2.4220504760742188, + "learning_rate": 3.475101517418252e-05, + "loss": 0.7686, + "step": 1437 + }, + { + "epoch": 0.9198049092508196, + "grad_norm": 2.257708787918091, + "learning_rate": 3.474032913015602e-05, + "loss": 0.6922, + "step": 1438 + }, + { + "epoch": 0.9204445510514112, + "grad_norm": 2.208747625350952, + "learning_rate": 3.472964308612951e-05, + "loss": 0.7214, + "step": 1439 + }, + { + "epoch": 0.9210841928520028, + "grad_norm": 2.384122371673584, + "learning_rate": 3.471895704210302e-05, + "loss": 0.7772, + "step": 1440 + }, + { + "epoch": 0.9217238346525946, + "grad_norm": 2.1721436977386475, + "learning_rate": 3.470827099807651e-05, + "loss": 0.6018, + "step": 1441 + }, + { + "epoch": 0.9223634764531862, + "grad_norm": 2.0613491535186768, + "learning_rate": 3.469758495405001e-05, + "loss": 0.773, + "step": 1442 + }, + { + "epoch": 0.9230031182537779, + "grad_norm": 1.8981919288635254, + "learning_rate": 3.468689891002351e-05, + "loss": 0.592, + "step": 1443 + }, + { + "epoch": 0.9236427600543695, + "grad_norm": 2.4995343685150146, + "learning_rate": 3.467621286599701e-05, + "loss": 0.8375, + "step": 1444 + }, + { + "epoch": 0.9242824018549612, + "grad_norm": 2.4255168437957764, + "learning_rate": 3.466552682197051e-05, + "loss": 0.8258, + "step": 1445 + }, + { + "epoch": 0.9249220436555529, + "grad_norm": 2.2315685749053955, + "learning_rate": 3.465484077794401e-05, + "loss": 0.6425, + "step": 1446 + }, + { + "epoch": 0.9255616854561446, + "grad_norm": 2.0109851360321045, + "learning_rate": 3.464415473391751e-05, + "loss": 0.696, + "step": 1447 + }, + { + "epoch": 0.9262013272567362, + "grad_norm": 2.160836935043335, + "learning_rate": 3.4633468689891e-05, + "loss": 0.7189, + "step": 1448 + }, + { + "epoch": 0.9268409690573279, + "grad_norm": 2.1626944541931152, + "learning_rate": 3.462278264586451e-05, + 
"loss": 0.7197, + "step": 1449 + }, + { + "epoch": 0.9274806108579196, + "grad_norm": 2.059540033340454, + "learning_rate": 3.4612096601838e-05, + "loss": 0.774, + "step": 1450 + }, + { + "epoch": 0.9281202526585113, + "grad_norm": 1.9875566959381104, + "learning_rate": 3.46014105578115e-05, + "loss": 0.7031, + "step": 1451 + }, + { + "epoch": 0.9287598944591029, + "grad_norm": 2.532210111618042, + "learning_rate": 3.4590724513785e-05, + "loss": 0.693, + "step": 1452 + }, + { + "epoch": 0.9293995362596946, + "grad_norm": 2.38069224357605, + "learning_rate": 3.45800384697585e-05, + "loss": 0.8662, + "step": 1453 + }, + { + "epoch": 0.9300391780602862, + "grad_norm": 2.1424291133880615, + "learning_rate": 3.456935242573199e-05, + "loss": 0.6892, + "step": 1454 + }, + { + "epoch": 0.9306788198608779, + "grad_norm": 2.37778902053833, + "learning_rate": 3.4558666381705496e-05, + "loss": 0.7303, + "step": 1455 + }, + { + "epoch": 0.9313184616614696, + "grad_norm": 2.07857608795166, + "learning_rate": 3.4547980337678996e-05, + "loss": 0.6485, + "step": 1456 + }, + { + "epoch": 0.9319581034620612, + "grad_norm": 1.9782344102859497, + "learning_rate": 3.453729429365249e-05, + "loss": 0.664, + "step": 1457 + }, + { + "epoch": 0.9325977452626529, + "grad_norm": 2.370725631713867, + "learning_rate": 3.4526608249625994e-05, + "loss": 0.8319, + "step": 1458 + }, + { + "epoch": 0.9332373870632446, + "grad_norm": 2.090789318084717, + "learning_rate": 3.451592220559949e-05, + "loss": 0.7378, + "step": 1459 + }, + { + "epoch": 0.9338770288638363, + "grad_norm": 2.2956361770629883, + "learning_rate": 3.4505236161572986e-05, + "loss": 0.6983, + "step": 1460 + }, + { + "epoch": 0.9345166706644279, + "grad_norm": 2.0403873920440674, + "learning_rate": 3.4494550117546486e-05, + "loss": 0.6757, + "step": 1461 + }, + { + "epoch": 0.9351563124650196, + "grad_norm": 2.0827760696411133, + "learning_rate": 3.4483864073519985e-05, + "loss": 0.6727, + "step": 1462 + }, + { + "epoch": 0.9357959542656112, + "grad_norm": 2.2579751014709473, + "learning_rate": 3.4473178029493484e-05, + "loss": 0.767, + "step": 1463 + }, + { + "epoch": 0.936435596066203, + "grad_norm": 2.264143943786621, + "learning_rate": 3.4462491985466984e-05, + "loss": 0.6654, + "step": 1464 + }, + { + "epoch": 0.9370752378667946, + "grad_norm": 2.1262547969818115, + "learning_rate": 3.4451805941440476e-05, + "loss": 0.6198, + "step": 1465 + }, + { + "epoch": 0.9377148796673863, + "grad_norm": 2.262063503265381, + "learning_rate": 3.444111989741398e-05, + "loss": 0.7275, + "step": 1466 + }, + { + "epoch": 0.9383545214679779, + "grad_norm": 2.2953622341156006, + "learning_rate": 3.4430433853387475e-05, + "loss": 0.7939, + "step": 1467 + }, + { + "epoch": 0.9389941632685695, + "grad_norm": 1.8402847051620483, + "learning_rate": 3.4419747809360974e-05, + "loss": 0.6079, + "step": 1468 + }, + { + "epoch": 0.9396338050691613, + "grad_norm": 2.457789659500122, + "learning_rate": 3.440906176533448e-05, + "loss": 0.756, + "step": 1469 + }, + { + "epoch": 0.9402734468697529, + "grad_norm": 2.532585620880127, + "learning_rate": 3.439837572130797e-05, + "loss": 0.7221, + "step": 1470 + }, + { + "epoch": 0.9409130886703446, + "grad_norm": 2.0720150470733643, + "learning_rate": 3.438768967728147e-05, + "loss": 0.6831, + "step": 1471 + }, + { + "epoch": 0.9415527304709362, + "grad_norm": 2.1068005561828613, + "learning_rate": 3.437700363325497e-05, + "loss": 0.6341, + "step": 1472 + }, + { + "epoch": 0.942192372271528, + "grad_norm": 2.184021234512329, + 
"learning_rate": 3.436631758922847e-05, + "loss": 0.7116, + "step": 1473 + }, + { + "epoch": 0.9428320140721196, + "grad_norm": 2.212113380432129, + "learning_rate": 3.4355631545201964e-05, + "loss": 0.7128, + "step": 1474 + }, + { + "epoch": 0.9434716558727113, + "grad_norm": 2.516669511795044, + "learning_rate": 3.434494550117547e-05, + "loss": 0.7688, + "step": 1475 + }, + { + "epoch": 0.9441112976733029, + "grad_norm": 2.029060125350952, + "learning_rate": 3.433425945714896e-05, + "loss": 0.5678, + "step": 1476 + }, + { + "epoch": 0.9447509394738947, + "grad_norm": 2.0237064361572266, + "learning_rate": 3.432357341312246e-05, + "loss": 0.6731, + "step": 1477 + }, + { + "epoch": 0.9453905812744863, + "grad_norm": 2.291206121444702, + "learning_rate": 3.431288736909596e-05, + "loss": 0.6812, + "step": 1478 + }, + { + "epoch": 0.946030223075078, + "grad_norm": 2.2760465145111084, + "learning_rate": 3.430220132506946e-05, + "loss": 0.739, + "step": 1479 + }, + { + "epoch": 0.9466698648756696, + "grad_norm": 2.578695774078369, + "learning_rate": 3.429151528104296e-05, + "loss": 0.7474, + "step": 1480 + }, + { + "epoch": 0.9473095066762613, + "grad_norm": 2.1519055366516113, + "learning_rate": 3.428082923701646e-05, + "loss": 0.6507, + "step": 1481 + }, + { + "epoch": 0.947949148476853, + "grad_norm": 2.2226076126098633, + "learning_rate": 3.427014319298996e-05, + "loss": 0.7141, + "step": 1482 + }, + { + "epoch": 0.9485887902774446, + "grad_norm": 2.4331486225128174, + "learning_rate": 3.425945714896345e-05, + "loss": 0.8209, + "step": 1483 + }, + { + "epoch": 0.9492284320780363, + "grad_norm": 2.1664061546325684, + "learning_rate": 3.424877110493696e-05, + "loss": 0.7168, + "step": 1484 + }, + { + "epoch": 0.9498680738786279, + "grad_norm": 1.9615627527236938, + "learning_rate": 3.423808506091045e-05, + "loss": 0.6672, + "step": 1485 + }, + { + "epoch": 0.9505077156792197, + "grad_norm": 2.095708131790161, + "learning_rate": 3.422739901688395e-05, + "loss": 0.7392, + "step": 1486 + }, + { + "epoch": 0.9511473574798113, + "grad_norm": 1.9985183477401733, + "learning_rate": 3.421671297285745e-05, + "loss": 0.6717, + "step": 1487 + }, + { + "epoch": 0.951786999280403, + "grad_norm": 1.9108582735061646, + "learning_rate": 3.420602692883095e-05, + "loss": 0.585, + "step": 1488 + }, + { + "epoch": 0.9524266410809946, + "grad_norm": 1.8438045978546143, + "learning_rate": 3.419534088480445e-05, + "loss": 0.6819, + "step": 1489 + }, + { + "epoch": 0.9530662828815863, + "grad_norm": 2.203334331512451, + "learning_rate": 3.418465484077795e-05, + "loss": 0.7779, + "step": 1490 + }, + { + "epoch": 0.953705924682178, + "grad_norm": 1.9355978965759277, + "learning_rate": 3.4173968796751446e-05, + "loss": 0.684, + "step": 1491 + }, + { + "epoch": 0.9543455664827697, + "grad_norm": 1.9291232824325562, + "learning_rate": 3.4163282752724945e-05, + "loss": 0.6304, + "step": 1492 + }, + { + "epoch": 0.9549852082833613, + "grad_norm": 2.56270170211792, + "learning_rate": 3.4152596708698445e-05, + "loss": 0.8354, + "step": 1493 + }, + { + "epoch": 0.955624850083953, + "grad_norm": 2.3271262645721436, + "learning_rate": 3.414191066467194e-05, + "loss": 0.6864, + "step": 1494 + }, + { + "epoch": 0.9562644918845447, + "grad_norm": 2.4378902912139893, + "learning_rate": 3.4131224620645444e-05, + "loss": 0.8168, + "step": 1495 + }, + { + "epoch": 0.9569041336851363, + "grad_norm": 2.4102671146392822, + "learning_rate": 3.4120538576618936e-05, + "loss": 0.7631, + "step": 1496 + }, + { + "epoch": 0.957543775485728, + 
"grad_norm": 2.0062508583068848, + "learning_rate": 3.4109852532592436e-05, + "loss": 0.6655, + "step": 1497 + }, + { + "epoch": 0.9581834172863196, + "grad_norm": 2.330242156982422, + "learning_rate": 3.4099166488565935e-05, + "loss": 0.7531, + "step": 1498 + }, + { + "epoch": 0.9588230590869113, + "grad_norm": 1.935307502746582, + "learning_rate": 3.4088480444539434e-05, + "loss": 0.6319, + "step": 1499 + }, + { + "epoch": 0.959462700887503, + "grad_norm": 2.0610244274139404, + "learning_rate": 3.407779440051293e-05, + "loss": 0.6892, + "step": 1500 + }, + { + "epoch": 0.9601023426880947, + "grad_norm": 1.7754333019256592, + "learning_rate": 3.406710835648643e-05, + "loss": 0.5632, + "step": 1501 + }, + { + "epoch": 0.9607419844886863, + "grad_norm": 2.2314975261688232, + "learning_rate": 3.405642231245993e-05, + "loss": 0.7993, + "step": 1502 + }, + { + "epoch": 0.961381626289278, + "grad_norm": 2.5227909088134766, + "learning_rate": 3.4045736268433425e-05, + "loss": 0.7917, + "step": 1503 + }, + { + "epoch": 0.9620212680898697, + "grad_norm": 2.2489991188049316, + "learning_rate": 3.403505022440693e-05, + "loss": 0.7007, + "step": 1504 + }, + { + "epoch": 0.9626609098904614, + "grad_norm": 2.4014620780944824, + "learning_rate": 3.4024364180380424e-05, + "loss": 0.7098, + "step": 1505 + }, + { + "epoch": 0.963300551691053, + "grad_norm": 2.0850815773010254, + "learning_rate": 3.401367813635392e-05, + "loss": 0.6114, + "step": 1506 + }, + { + "epoch": 0.9639401934916447, + "grad_norm": 2.3688106536865234, + "learning_rate": 3.400299209232742e-05, + "loss": 0.7705, + "step": 1507 + }, + { + "epoch": 0.9645798352922363, + "grad_norm": 2.0088326930999756, + "learning_rate": 3.399230604830092e-05, + "loss": 0.6809, + "step": 1508 + }, + { + "epoch": 0.9652194770928281, + "grad_norm": 2.115326404571533, + "learning_rate": 3.3981620004274414e-05, + "loss": 0.7547, + "step": 1509 + }, + { + "epoch": 0.9658591188934197, + "grad_norm": 2.5346519947052, + "learning_rate": 3.397093396024792e-05, + "loss": 0.7843, + "step": 1510 + }, + { + "epoch": 0.9664987606940113, + "grad_norm": 2.709911346435547, + "learning_rate": 3.396024791622141e-05, + "loss": 0.8325, + "step": 1511 + }, + { + "epoch": 0.967138402494603, + "grad_norm": 2.412773609161377, + "learning_rate": 3.394956187219491e-05, + "loss": 0.7499, + "step": 1512 + }, + { + "epoch": 0.9677780442951947, + "grad_norm": 2.4920361042022705, + "learning_rate": 3.393887582816841e-05, + "loss": 0.8206, + "step": 1513 + }, + { + "epoch": 0.9684176860957864, + "grad_norm": 1.972916841506958, + "learning_rate": 3.392818978414191e-05, + "loss": 0.6819, + "step": 1514 + }, + { + "epoch": 0.969057327896378, + "grad_norm": 2.2179787158966064, + "learning_rate": 3.391750374011541e-05, + "loss": 0.7832, + "step": 1515 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 2.270247220993042, + "learning_rate": 3.390681769608891e-05, + "loss": 0.7791, + "step": 1516 + }, + { + "epoch": 0.9703366114975613, + "grad_norm": 1.8921737670898438, + "learning_rate": 3.389613165206241e-05, + "loss": 0.65, + "step": 1517 + }, + { + "epoch": 0.9709762532981531, + "grad_norm": 2.5758469104766846, + "learning_rate": 3.388544560803591e-05, + "loss": 0.9067, + "step": 1518 + }, + { + "epoch": 0.9716158950987447, + "grad_norm": 2.161949634552002, + "learning_rate": 3.387475956400941e-05, + "loss": 0.7332, + "step": 1519 + }, + { + "epoch": 0.9722555368993364, + "grad_norm": 2.2156999111175537, + "learning_rate": 3.38640735199829e-05, + "loss": 0.7205, + "step": 1520 + }, + { 
+ "epoch": 0.972895178699928, + "grad_norm": 1.9285770654678345, + "learning_rate": 3.385338747595641e-05, + "loss": 0.624, + "step": 1521 + }, + { + "epoch": 0.9735348205005198, + "grad_norm": 2.0817832946777344, + "learning_rate": 3.38427014319299e-05, + "loss": 0.6611, + "step": 1522 + }, + { + "epoch": 0.9741744623011114, + "grad_norm": 2.0828697681427, + "learning_rate": 3.38320153879034e-05, + "loss": 0.7319, + "step": 1523 + }, + { + "epoch": 0.974814104101703, + "grad_norm": 2.047278881072998, + "learning_rate": 3.38213293438769e-05, + "loss": 0.6958, + "step": 1524 + }, + { + "epoch": 0.9754537459022947, + "grad_norm": 2.0445549488067627, + "learning_rate": 3.38106432998504e-05, + "loss": 0.658, + "step": 1525 + }, + { + "epoch": 0.9760933877028863, + "grad_norm": 2.2505154609680176, + "learning_rate": 3.37999572558239e-05, + "loss": 0.7333, + "step": 1526 + }, + { + "epoch": 0.9767330295034781, + "grad_norm": 2.253756523132324, + "learning_rate": 3.3789271211797396e-05, + "loss": 0.6782, + "step": 1527 + }, + { + "epoch": 0.9773726713040697, + "grad_norm": 2.053600788116455, + "learning_rate": 3.3778585167770895e-05, + "loss": 0.6976, + "step": 1528 + }, + { + "epoch": 0.9780123131046614, + "grad_norm": 2.214647054672241, + "learning_rate": 3.376789912374439e-05, + "loss": 0.7142, + "step": 1529 + }, + { + "epoch": 0.978651954905253, + "grad_norm": 2.3531408309936523, + "learning_rate": 3.3757213079717894e-05, + "loss": 0.7426, + "step": 1530 + }, + { + "epoch": 0.9792915967058448, + "grad_norm": 2.190523147583008, + "learning_rate": 3.374652703569139e-05, + "loss": 0.7056, + "step": 1531 + }, + { + "epoch": 0.9799312385064364, + "grad_norm": 2.0233254432678223, + "learning_rate": 3.3735840991664886e-05, + "loss": 0.6839, + "step": 1532 + }, + { + "epoch": 0.9805708803070281, + "grad_norm": 2.0214545726776123, + "learning_rate": 3.3725154947638385e-05, + "loss": 0.6686, + "step": 1533 + }, + { + "epoch": 0.9812105221076197, + "grad_norm": 2.2215795516967773, + "learning_rate": 3.3714468903611885e-05, + "loss": 0.71, + "step": 1534 + }, + { + "epoch": 0.9818501639082114, + "grad_norm": 2.437448263168335, + "learning_rate": 3.3703782859585384e-05, + "loss": 0.7416, + "step": 1535 + }, + { + "epoch": 0.9824898057088031, + "grad_norm": 2.5444507598876953, + "learning_rate": 3.3693096815558884e-05, + "loss": 0.8026, + "step": 1536 + }, + { + "epoch": 0.9831294475093948, + "grad_norm": 2.484264612197876, + "learning_rate": 3.368241077153238e-05, + "loss": 0.6371, + "step": 1537 + }, + { + "epoch": 0.9837690893099864, + "grad_norm": 2.301780939102173, + "learning_rate": 3.3671724727505875e-05, + "loss": 0.821, + "step": 1538 + }, + { + "epoch": 0.984408731110578, + "grad_norm": 2.186701774597168, + "learning_rate": 3.366103868347938e-05, + "loss": 0.7156, + "step": 1539 + }, + { + "epoch": 0.9850483729111698, + "grad_norm": 2.459995985031128, + "learning_rate": 3.3650352639452874e-05, + "loss": 0.8182, + "step": 1540 + }, + { + "epoch": 0.9856880147117614, + "grad_norm": 1.895358920097351, + "learning_rate": 3.3639666595426374e-05, + "loss": 0.6198, + "step": 1541 + }, + { + "epoch": 0.9863276565123531, + "grad_norm": 2.1757967472076416, + "learning_rate": 3.362898055139987e-05, + "loss": 0.6945, + "step": 1542 + }, + { + "epoch": 0.9869672983129447, + "grad_norm": 2.508127212524414, + "learning_rate": 3.361829450737337e-05, + "loss": 0.8474, + "step": 1543 + }, + { + "epoch": 0.9876069401135364, + "grad_norm": 1.9849086999893188, + "learning_rate": 3.3607608463346865e-05, + "loss": 
0.6226, + "step": 1544 + }, + { + "epoch": 0.9882465819141281, + "grad_norm": 2.3702750205993652, + "learning_rate": 3.359692241932037e-05, + "loss": 0.7921, + "step": 1545 + }, + { + "epoch": 0.9888862237147198, + "grad_norm": 1.851857304573059, + "learning_rate": 3.3586236375293864e-05, + "loss": 0.6299, + "step": 1546 + }, + { + "epoch": 0.9895258655153114, + "grad_norm": 2.113546133041382, + "learning_rate": 3.357555033126737e-05, + "loss": 0.691, + "step": 1547 + }, + { + "epoch": 0.9901655073159031, + "grad_norm": 2.486071825027466, + "learning_rate": 3.356486428724087e-05, + "loss": 0.7461, + "step": 1548 + }, + { + "epoch": 0.9908051491164948, + "grad_norm": 1.8651572465896606, + "learning_rate": 3.355417824321436e-05, + "loss": 0.5847, + "step": 1549 + }, + { + "epoch": 0.9914447909170865, + "grad_norm": 2.235811710357666, + "learning_rate": 3.354349219918787e-05, + "loss": 0.7416, + "step": 1550 + }, + { + "epoch": 0.9920844327176781, + "grad_norm": 2.370832681655884, + "learning_rate": 3.353280615516136e-05, + "loss": 0.7106, + "step": 1551 + }, + { + "epoch": 0.9927240745182697, + "grad_norm": 1.9883408546447754, + "learning_rate": 3.352212011113486e-05, + "loss": 0.6786, + "step": 1552 + }, + { + "epoch": 0.9933637163188614, + "grad_norm": 2.5140347480773926, + "learning_rate": 3.351143406710836e-05, + "loss": 0.7636, + "step": 1553 + }, + { + "epoch": 0.9940033581194531, + "grad_norm": 1.814552664756775, + "learning_rate": 3.350074802308186e-05, + "loss": 0.5652, + "step": 1554 + }, + { + "epoch": 0.9946429999200448, + "grad_norm": 2.2113420963287354, + "learning_rate": 3.349006197905535e-05, + "loss": 0.7076, + "step": 1555 + }, + { + "epoch": 0.9952826417206364, + "grad_norm": 2.382270574569702, + "learning_rate": 3.347937593502886e-05, + "loss": 0.7714, + "step": 1556 + }, + { + "epoch": 0.9959222835212281, + "grad_norm": 2.217782497406006, + "learning_rate": 3.346868989100235e-05, + "loss": 0.6295, + "step": 1557 + }, + { + "epoch": 0.9965619253218198, + "grad_norm": 2.3199214935302734, + "learning_rate": 3.345800384697585e-05, + "loss": 0.6673, + "step": 1558 + }, + { + "epoch": 0.9972015671224115, + "grad_norm": 2.612952470779419, + "learning_rate": 3.344731780294935e-05, + "loss": 0.8183, + "step": 1559 + }, + { + "epoch": 0.9978412089230031, + "grad_norm": 2.0006492137908936, + "learning_rate": 3.343663175892285e-05, + "loss": 0.6892, + "step": 1560 + }, + { + "epoch": 0.9984808507235948, + "grad_norm": 2.211374282836914, + "learning_rate": 3.342594571489635e-05, + "loss": 0.7048, + "step": 1561 + }, + { + "epoch": 0.9991204925241864, + "grad_norm": 2.4126431941986084, + "learning_rate": 3.3415259670869847e-05, + "loss": 0.8143, + "step": 1562 + }, + { + "epoch": 0.9997601343247782, + "grad_norm": 2.181459665298462, + "learning_rate": 3.3404573626843346e-05, + "loss": 0.71, + "step": 1563 + }, + { + "epoch": 1.0, + "grad_norm": 3.5695137977600098, + "learning_rate": 3.339388758281684e-05, + "loss": 0.6863, + "step": 1564 + }, + { + "epoch": 1.0006396418005916, + "grad_norm": 1.6426266431808472, + "learning_rate": 3.3383201538790345e-05, + "loss": 0.5212, + "step": 1565 + }, + { + "epoch": 1.0012792836011832, + "grad_norm": 1.604065179824829, + "learning_rate": 3.337251549476384e-05, + "loss": 0.4942, + "step": 1566 + }, + { + "epoch": 1.001918925401775, + "grad_norm": 1.596436619758606, + "learning_rate": 3.336182945073734e-05, + "loss": 0.4653, + "step": 1567 + }, + { + "epoch": 1.0025585672023667, + "grad_norm": 2.0100901126861572, + "learning_rate": 
3.3351143406710836e-05, + "loss": 0.4729, + "step": 1568 + }, + { + "epoch": 1.0031982090029583, + "grad_norm": 1.6763489246368408, + "learning_rate": 3.3340457362684335e-05, + "loss": 0.5096, + "step": 1569 + }, + { + "epoch": 1.00383785080355, + "grad_norm": 1.9359748363494873, + "learning_rate": 3.3329771318657835e-05, + "loss": 0.6162, + "step": 1570 + }, + { + "epoch": 1.0044774926041418, + "grad_norm": 2.0272462368011475, + "learning_rate": 3.3319085274631334e-05, + "loss": 0.5823, + "step": 1571 + }, + { + "epoch": 1.0051171344047334, + "grad_norm": 1.9165846109390259, + "learning_rate": 3.3308399230604833e-05, + "loss": 0.4592, + "step": 1572 + }, + { + "epoch": 1.005756776205325, + "grad_norm": 1.9332995414733887, + "learning_rate": 3.3297713186578326e-05, + "loss": 0.5524, + "step": 1573 + }, + { + "epoch": 1.0063964180059166, + "grad_norm": 1.8141871690750122, + "learning_rate": 3.328702714255183e-05, + "loss": 0.4715, + "step": 1574 + }, + { + "epoch": 1.0070360598065085, + "grad_norm": 2.228400230407715, + "learning_rate": 3.3276341098525325e-05, + "loss": 0.6197, + "step": 1575 + }, + { + "epoch": 1.0076757016071, + "grad_norm": 2.3265390396118164, + "learning_rate": 3.326565505449883e-05, + "loss": 0.5837, + "step": 1576 + }, + { + "epoch": 1.0083153434076917, + "grad_norm": 2.107447624206543, + "learning_rate": 3.3254969010472323e-05, + "loss": 0.547, + "step": 1577 + }, + { + "epoch": 1.0089549852082833, + "grad_norm": 2.1493191719055176, + "learning_rate": 3.324428296644582e-05, + "loss": 0.5573, + "step": 1578 + }, + { + "epoch": 1.009594627008875, + "grad_norm": 2.201444149017334, + "learning_rate": 3.323359692241932e-05, + "loss": 0.5181, + "step": 1579 + }, + { + "epoch": 1.0102342688094668, + "grad_norm": 2.342329263687134, + "learning_rate": 3.322291087839282e-05, + "loss": 0.5142, + "step": 1580 + }, + { + "epoch": 1.0108739106100584, + "grad_norm": 2.622166156768799, + "learning_rate": 3.321222483436632e-05, + "loss": 0.5671, + "step": 1581 + }, + { + "epoch": 1.01151355241065, + "grad_norm": 2.068490982055664, + "learning_rate": 3.320153879033982e-05, + "loss": 0.458, + "step": 1582 + }, + { + "epoch": 1.0121531942112416, + "grad_norm": 2.3126823902130127, + "learning_rate": 3.319085274631332e-05, + "loss": 0.4838, + "step": 1583 + }, + { + "epoch": 1.0127928360118335, + "grad_norm": 2.4509313106536865, + "learning_rate": 3.318016670228681e-05, + "loss": 0.5925, + "step": 1584 + }, + { + "epoch": 1.013432477812425, + "grad_norm": 2.2765822410583496, + "learning_rate": 3.316948065826032e-05, + "loss": 0.454, + "step": 1585 + }, + { + "epoch": 1.0140721196130167, + "grad_norm": 2.35550856590271, + "learning_rate": 3.315879461423381e-05, + "loss": 0.5222, + "step": 1586 + }, + { + "epoch": 1.0147117614136083, + "grad_norm": 2.231818914413452, + "learning_rate": 3.314810857020731e-05, + "loss": 0.5094, + "step": 1587 + }, + { + "epoch": 1.0153514032142001, + "grad_norm": 2.6193008422851562, + "learning_rate": 3.313742252618081e-05, + "loss": 0.635, + "step": 1588 + }, + { + "epoch": 1.0159910450147918, + "grad_norm": 2.495405435562134, + "learning_rate": 3.312673648215431e-05, + "loss": 0.5587, + "step": 1589 + }, + { + "epoch": 1.0166306868153834, + "grad_norm": 2.183990001678467, + "learning_rate": 3.31160504381278e-05, + "loss": 0.5388, + "step": 1590 + }, + { + "epoch": 1.017270328615975, + "grad_norm": 2.247999668121338, + "learning_rate": 3.310536439410131e-05, + "loss": 0.5116, + "step": 1591 + }, + { + "epoch": 1.0179099704165666, + "grad_norm": 
2.329559087753296, + "learning_rate": 3.30946783500748e-05, + "loss": 0.5222, + "step": 1592 + }, + { + "epoch": 1.0185496122171585, + "grad_norm": 2.255418062210083, + "learning_rate": 3.30839923060483e-05, + "loss": 0.5116, + "step": 1593 + }, + { + "epoch": 1.01918925401775, + "grad_norm": 2.4894025325775146, + "learning_rate": 3.3073306262021806e-05, + "loss": 0.5985, + "step": 1594 + }, + { + "epoch": 1.0198288958183417, + "grad_norm": 2.6374289989471436, + "learning_rate": 3.30626202179953e-05, + "loss": 0.5363, + "step": 1595 + }, + { + "epoch": 1.0204685376189333, + "grad_norm": 2.0918147563934326, + "learning_rate": 3.30519341739688e-05, + "loss": 0.4625, + "step": 1596 + }, + { + "epoch": 1.0211081794195251, + "grad_norm": 2.248098373413086, + "learning_rate": 3.30412481299423e-05, + "loss": 0.4838, + "step": 1597 + }, + { + "epoch": 1.0217478212201168, + "grad_norm": 1.9481654167175293, + "learning_rate": 3.3030562085915797e-05, + "loss": 0.4333, + "step": 1598 + }, + { + "epoch": 1.0223874630207084, + "grad_norm": 2.1654884815216064, + "learning_rate": 3.301987604188929e-05, + "loss": 0.481, + "step": 1599 + }, + { + "epoch": 1.0230271048213, + "grad_norm": 2.157902717590332, + "learning_rate": 3.3009189997862795e-05, + "loss": 0.4918, + "step": 1600 + }, + { + "epoch": 1.0236667466218918, + "grad_norm": 1.9855064153671265, + "learning_rate": 3.299850395383629e-05, + "loss": 0.4431, + "step": 1601 + }, + { + "epoch": 1.0243063884224834, + "grad_norm": 2.373035430908203, + "learning_rate": 3.298781790980979e-05, + "loss": 0.4841, + "step": 1602 + }, + { + "epoch": 1.024946030223075, + "grad_norm": 2.2268319129943848, + "learning_rate": 3.2977131865783287e-05, + "loss": 0.4685, + "step": 1603 + }, + { + "epoch": 1.0255856720236667, + "grad_norm": 2.439455986022949, + "learning_rate": 3.2966445821756786e-05, + "loss": 0.5072, + "step": 1604 + }, + { + "epoch": 1.0262253138242585, + "grad_norm": 2.6799020767211914, + "learning_rate": 3.2955759777730285e-05, + "loss": 0.5656, + "step": 1605 + }, + { + "epoch": 1.0268649556248501, + "grad_norm": 2.32594895362854, + "learning_rate": 3.2945073733703785e-05, + "loss": 0.4995, + "step": 1606 + }, + { + "epoch": 1.0275045974254418, + "grad_norm": 2.4114809036254883, + "learning_rate": 3.2934387689677284e-05, + "loss": 0.5164, + "step": 1607 + }, + { + "epoch": 1.0281442392260334, + "grad_norm": 2.1412720680236816, + "learning_rate": 3.292370164565078e-05, + "loss": 0.4681, + "step": 1608 + }, + { + "epoch": 1.028783881026625, + "grad_norm": 2.4099087715148926, + "learning_rate": 3.291301560162428e-05, + "loss": 0.5592, + "step": 1609 + }, + { + "epoch": 1.0294235228272168, + "grad_norm": 2.376927614212036, + "learning_rate": 3.2902329557597775e-05, + "loss": 0.4882, + "step": 1610 + }, + { + "epoch": 1.0300631646278084, + "grad_norm": 2.545624256134033, + "learning_rate": 3.289164351357128e-05, + "loss": 0.5526, + "step": 1611 + }, + { + "epoch": 1.0307028064284, + "grad_norm": 2.2848265171051025, + "learning_rate": 3.2880957469544774e-05, + "loss": 0.5354, + "step": 1612 + }, + { + "epoch": 1.0313424482289917, + "grad_norm": 2.2950668334960938, + "learning_rate": 3.2870271425518273e-05, + "loss": 0.498, + "step": 1613 + }, + { + "epoch": 1.0319820900295835, + "grad_norm": 2.34885835647583, + "learning_rate": 3.285958538149177e-05, + "loss": 0.4906, + "step": 1614 + }, + { + "epoch": 1.0326217318301751, + "grad_norm": 2.462394952774048, + "learning_rate": 3.284889933746527e-05, + "loss": 0.5359, + "step": 1615 + }, + { + "epoch": 
1.0332613736307668, + "grad_norm": 2.243147373199463, + "learning_rate": 3.283821329343877e-05, + "loss": 0.4938, + "step": 1616 + }, + { + "epoch": 1.0339010154313584, + "grad_norm": 2.5488319396972656, + "learning_rate": 3.282752724941227e-05, + "loss": 0.569, + "step": 1617 + }, + { + "epoch": 1.0345406572319502, + "grad_norm": 2.2287886142730713, + "learning_rate": 3.281684120538577e-05, + "loss": 0.5268, + "step": 1618 + }, + { + "epoch": 1.0351802990325418, + "grad_norm": 2.110866069793701, + "learning_rate": 3.280615516135926e-05, + "loss": 0.484, + "step": 1619 + }, + { + "epoch": 1.0358199408331334, + "grad_norm": 2.268686532974243, + "learning_rate": 3.279546911733277e-05, + "loss": 0.5439, + "step": 1620 + }, + { + "epoch": 1.036459582633725, + "grad_norm": 2.119105100631714, + "learning_rate": 3.278478307330626e-05, + "loss": 0.4944, + "step": 1621 + }, + { + "epoch": 1.0370992244343167, + "grad_norm": 1.9542510509490967, + "learning_rate": 3.277409702927976e-05, + "loss": 0.441, + "step": 1622 + }, + { + "epoch": 1.0377388662349085, + "grad_norm": 2.3195297718048096, + "learning_rate": 3.276341098525326e-05, + "loss": 0.5325, + "step": 1623 + }, + { + "epoch": 1.0383785080355001, + "grad_norm": 2.036125421524048, + "learning_rate": 3.275272494122676e-05, + "loss": 0.4529, + "step": 1624 + }, + { + "epoch": 1.0390181498360918, + "grad_norm": 2.167020559310913, + "learning_rate": 3.274203889720026e-05, + "loss": 0.4869, + "step": 1625 + }, + { + "epoch": 1.0396577916366834, + "grad_norm": 2.308004140853882, + "learning_rate": 3.273135285317376e-05, + "loss": 0.5272, + "step": 1626 + }, + { + "epoch": 1.0402974334372752, + "grad_norm": 2.354403257369995, + "learning_rate": 3.272066680914726e-05, + "loss": 0.5588, + "step": 1627 + }, + { + "epoch": 1.0409370752378668, + "grad_norm": 2.3287439346313477, + "learning_rate": 3.270998076512075e-05, + "loss": 0.4983, + "step": 1628 + }, + { + "epoch": 1.0415767170384584, + "grad_norm": 2.4980506896972656, + "learning_rate": 3.2699294721094256e-05, + "loss": 0.6183, + "step": 1629 + }, + { + "epoch": 1.04221635883905, + "grad_norm": 2.5118374824523926, + "learning_rate": 3.268860867706775e-05, + "loss": 0.6046, + "step": 1630 + }, + { + "epoch": 1.042856000639642, + "grad_norm": 2.1099486351013184, + "learning_rate": 3.2677922633041255e-05, + "loss": 0.4761, + "step": 1631 + }, + { + "epoch": 1.0434956424402335, + "grad_norm": 2.3047168254852295, + "learning_rate": 3.266723658901475e-05, + "loss": 0.5008, + "step": 1632 + }, + { + "epoch": 1.0441352842408251, + "grad_norm": 2.1408493518829346, + "learning_rate": 3.265655054498825e-05, + "loss": 0.4468, + "step": 1633 + }, + { + "epoch": 1.0447749260414168, + "grad_norm": 2.2916908264160156, + "learning_rate": 3.2645864500961746e-05, + "loss": 0.4752, + "step": 1634 + }, + { + "epoch": 1.0454145678420084, + "grad_norm": 2.1892850399017334, + "learning_rate": 3.2635178456935246e-05, + "loss": 0.4713, + "step": 1635 + }, + { + "epoch": 1.0460542096426002, + "grad_norm": 2.178524971008301, + "learning_rate": 3.262449241290874e-05, + "loss": 0.467, + "step": 1636 + }, + { + "epoch": 1.0466938514431918, + "grad_norm": 2.6131863594055176, + "learning_rate": 3.2613806368882245e-05, + "loss": 0.5901, + "step": 1637 + }, + { + "epoch": 1.0473334932437834, + "grad_norm": 2.690558910369873, + "learning_rate": 3.260312032485574e-05, + "loss": 0.5401, + "step": 1638 + }, + { + "epoch": 1.047973135044375, + "grad_norm": 2.3080806732177734, + "learning_rate": 3.2592434280829236e-05, + "loss": 0.468, + 
"step": 1639 + }, + { + "epoch": 1.048612776844967, + "grad_norm": 2.7645251750946045, + "learning_rate": 3.258174823680274e-05, + "loss": 0.5848, + "step": 1640 + }, + { + "epoch": 1.0492524186455585, + "grad_norm": 2.3509116172790527, + "learning_rate": 3.2571062192776235e-05, + "loss": 0.4677, + "step": 1641 + }, + { + "epoch": 1.0498920604461501, + "grad_norm": 2.726024627685547, + "learning_rate": 3.2560376148749735e-05, + "loss": 0.5132, + "step": 1642 + }, + { + "epoch": 1.0505317022467417, + "grad_norm": 2.6302695274353027, + "learning_rate": 3.2549690104723234e-05, + "loss": 0.5078, + "step": 1643 + }, + { + "epoch": 1.0511713440473336, + "grad_norm": 2.5397214889526367, + "learning_rate": 3.253900406069673e-05, + "loss": 0.5132, + "step": 1644 + }, + { + "epoch": 1.0518109858479252, + "grad_norm": 2.930471658706665, + "learning_rate": 3.2528318016670226e-05, + "loss": 0.6909, + "step": 1645 + }, + { + "epoch": 1.0524506276485168, + "grad_norm": 2.448770046234131, + "learning_rate": 3.251763197264373e-05, + "loss": 0.5303, + "step": 1646 + }, + { + "epoch": 1.0530902694491084, + "grad_norm": 2.4772374629974365, + "learning_rate": 3.2506945928617225e-05, + "loss": 0.5678, + "step": 1647 + }, + { + "epoch": 1.0537299112497003, + "grad_norm": 2.4332692623138428, + "learning_rate": 3.2496259884590724e-05, + "loss": 0.5135, + "step": 1648 + }, + { + "epoch": 1.054369553050292, + "grad_norm": 2.41996431350708, + "learning_rate": 3.248557384056422e-05, + "loss": 0.5068, + "step": 1649 + }, + { + "epoch": 1.0550091948508835, + "grad_norm": 2.6703288555145264, + "learning_rate": 3.247488779653772e-05, + "loss": 0.6021, + "step": 1650 + }, + { + "epoch": 1.0556488366514751, + "grad_norm": 2.1734867095947266, + "learning_rate": 3.246420175251122e-05, + "loss": 0.4897, + "step": 1651 + }, + { + "epoch": 1.0562884784520667, + "grad_norm": 2.1070706844329834, + "learning_rate": 3.245351570848472e-05, + "loss": 0.4673, + "step": 1652 + }, + { + "epoch": 1.0569281202526586, + "grad_norm": 2.2720072269439697, + "learning_rate": 3.244282966445822e-05, + "loss": 0.5331, + "step": 1653 + }, + { + "epoch": 1.0575677620532502, + "grad_norm": 2.6807985305786133, + "learning_rate": 3.243214362043171e-05, + "loss": 0.563, + "step": 1654 + }, + { + "epoch": 1.0582074038538418, + "grad_norm": 2.374823808670044, + "learning_rate": 3.242145757640522e-05, + "loss": 0.5634, + "step": 1655 + }, + { + "epoch": 1.0588470456544334, + "grad_norm": 2.0399231910705566, + "learning_rate": 3.241077153237871e-05, + "loss": 0.4685, + "step": 1656 + }, + { + "epoch": 1.0594866874550253, + "grad_norm": 2.1718246936798096, + "learning_rate": 3.240008548835221e-05, + "loss": 0.4879, + "step": 1657 + }, + { + "epoch": 1.060126329255617, + "grad_norm": 2.1819610595703125, + "learning_rate": 3.238939944432571e-05, + "loss": 0.5479, + "step": 1658 + }, + { + "epoch": 1.0607659710562085, + "grad_norm": 2.3052191734313965, + "learning_rate": 3.237871340029921e-05, + "loss": 0.5069, + "step": 1659 + }, + { + "epoch": 1.0614056128568001, + "grad_norm": 2.1954407691955566, + "learning_rate": 3.236802735627271e-05, + "loss": 0.504, + "step": 1660 + }, + { + "epoch": 1.062045254657392, + "grad_norm": 2.302370309829712, + "learning_rate": 3.235734131224621e-05, + "loss": 0.5981, + "step": 1661 + }, + { + "epoch": 1.0626848964579836, + "grad_norm": 2.2455694675445557, + "learning_rate": 3.234665526821971e-05, + "loss": 0.5061, + "step": 1662 + }, + { + "epoch": 1.0633245382585752, + "grad_norm": 2.276129722595215, + "learning_rate": 
3.233596922419321e-05, + "loss": 0.5088, + "step": 1663 + }, + { + "epoch": 1.0639641800591668, + "grad_norm": 2.2016842365264893, + "learning_rate": 3.232528318016671e-05, + "loss": 0.4994, + "step": 1664 + }, + { + "epoch": 1.0646038218597584, + "grad_norm": 2.456360340118408, + "learning_rate": 3.23145971361402e-05, + "loss": 0.5224, + "step": 1665 + }, + { + "epoch": 1.0652434636603503, + "grad_norm": 2.2001917362213135, + "learning_rate": 3.2303911092113706e-05, + "loss": 0.4921, + "step": 1666 + }, + { + "epoch": 1.065883105460942, + "grad_norm": 2.542221784591675, + "learning_rate": 3.22932250480872e-05, + "loss": 0.5592, + "step": 1667 + }, + { + "epoch": 1.0665227472615335, + "grad_norm": 2.315314769744873, + "learning_rate": 3.22825390040607e-05, + "loss": 0.4884, + "step": 1668 + }, + { + "epoch": 1.0671623890621251, + "grad_norm": 2.0502121448516846, + "learning_rate": 3.22718529600342e-05, + "loss": 0.4319, + "step": 1669 + }, + { + "epoch": 1.067802030862717, + "grad_norm": 2.166858434677124, + "learning_rate": 3.2261166916007696e-05, + "loss": 0.4877, + "step": 1670 + }, + { + "epoch": 1.0684416726633086, + "grad_norm": 2.3658769130706787, + "learning_rate": 3.2250480871981196e-05, + "loss": 0.5817, + "step": 1671 + }, + { + "epoch": 1.0690813144639002, + "grad_norm": 2.1855645179748535, + "learning_rate": 3.2239794827954695e-05, + "loss": 0.5, + "step": 1672 + }, + { + "epoch": 1.0697209562644918, + "grad_norm": 2.242687463760376, + "learning_rate": 3.2229108783928194e-05, + "loss": 0.5415, + "step": 1673 + }, + { + "epoch": 1.0703605980650837, + "grad_norm": 2.314005136489868, + "learning_rate": 3.221842273990169e-05, + "loss": 0.5546, + "step": 1674 + }, + { + "epoch": 1.0710002398656753, + "grad_norm": 2.3146510124206543, + "learning_rate": 3.220773669587519e-05, + "loss": 0.5637, + "step": 1675 + }, + { + "epoch": 1.071639881666267, + "grad_norm": 2.439074993133545, + "learning_rate": 3.2197050651848686e-05, + "loss": 0.5315, + "step": 1676 + }, + { + "epoch": 1.0722795234668585, + "grad_norm": 2.0321543216705322, + "learning_rate": 3.2186364607822185e-05, + "loss": 0.5095, + "step": 1677 + }, + { + "epoch": 1.0729191652674501, + "grad_norm": 2.128948450088501, + "learning_rate": 3.2175678563795684e-05, + "loss": 0.4817, + "step": 1678 + }, + { + "epoch": 1.073558807068042, + "grad_norm": 1.978027105331421, + "learning_rate": 3.2164992519769184e-05, + "loss": 0.4654, + "step": 1679 + }, + { + "epoch": 1.0741984488686336, + "grad_norm": 2.456768751144409, + "learning_rate": 3.2154306475742676e-05, + "loss": 0.5976, + "step": 1680 + }, + { + "epoch": 1.0748380906692252, + "grad_norm": 2.5293171405792236, + "learning_rate": 3.214362043171618e-05, + "loss": 0.6063, + "step": 1681 + }, + { + "epoch": 1.0754777324698168, + "grad_norm": 2.1288278102874756, + "learning_rate": 3.2132934387689675e-05, + "loss": 0.4797, + "step": 1682 + }, + { + "epoch": 1.0761173742704087, + "grad_norm": 2.437877655029297, + "learning_rate": 3.2122248343663175e-05, + "loss": 0.5627, + "step": 1683 + }, + { + "epoch": 1.0767570160710003, + "grad_norm": 2.3062119483947754, + "learning_rate": 3.2111562299636674e-05, + "loss": 0.5625, + "step": 1684 + }, + { + "epoch": 1.077396657871592, + "grad_norm": 2.208761215209961, + "learning_rate": 3.210087625561017e-05, + "loss": 0.5284, + "step": 1685 + }, + { + "epoch": 1.0780362996721835, + "grad_norm": 2.0689878463745117, + "learning_rate": 3.209019021158367e-05, + "loss": 0.4581, + "step": 1686 + }, + { + "epoch": 1.0786759414727753, + "grad_norm": 
2.234243631362915, + "learning_rate": 3.207950416755717e-05, + "loss": 0.5071, + "step": 1687 + }, + { + "epoch": 1.079315583273367, + "grad_norm": 2.042379379272461, + "learning_rate": 3.206881812353067e-05, + "loss": 0.4514, + "step": 1688 + }, + { + "epoch": 1.0799552250739586, + "grad_norm": 2.1682631969451904, + "learning_rate": 3.205813207950417e-05, + "loss": 0.4962, + "step": 1689 + }, + { + "epoch": 1.0805948668745502, + "grad_norm": 2.0820109844207764, + "learning_rate": 3.204744603547767e-05, + "loss": 0.4754, + "step": 1690 + }, + { + "epoch": 1.0812345086751418, + "grad_norm": 2.0925025939941406, + "learning_rate": 3.203675999145116e-05, + "loss": 0.4463, + "step": 1691 + }, + { + "epoch": 1.0818741504757337, + "grad_norm": 2.506754159927368, + "learning_rate": 3.202607394742467e-05, + "loss": 0.5349, + "step": 1692 + }, + { + "epoch": 1.0825137922763253, + "grad_norm": 2.241452693939209, + "learning_rate": 3.201538790339816e-05, + "loss": 0.506, + "step": 1693 + }, + { + "epoch": 1.0831534340769169, + "grad_norm": 2.0173118114471436, + "learning_rate": 3.200470185937166e-05, + "loss": 0.4268, + "step": 1694 + }, + { + "epoch": 1.0837930758775085, + "grad_norm": 2.372399091720581, + "learning_rate": 3.199401581534516e-05, + "loss": 0.5435, + "step": 1695 + }, + { + "epoch": 1.0844327176781003, + "grad_norm": 2.4139435291290283, + "learning_rate": 3.198332977131866e-05, + "loss": 0.487, + "step": 1696 + }, + { + "epoch": 1.085072359478692, + "grad_norm": 2.4056973457336426, + "learning_rate": 3.197264372729216e-05, + "loss": 0.5084, + "step": 1697 + }, + { + "epoch": 1.0857120012792836, + "grad_norm": 2.368945598602295, + "learning_rate": 3.196195768326566e-05, + "loss": 0.5362, + "step": 1698 + }, + { + "epoch": 1.0863516430798752, + "grad_norm": 2.039180040359497, + "learning_rate": 3.195127163923916e-05, + "loss": 0.4644, + "step": 1699 + }, + { + "epoch": 1.086991284880467, + "grad_norm": 2.312699556350708, + "learning_rate": 3.194058559521265e-05, + "loss": 0.529, + "step": 1700 + }, + { + "epoch": 1.0876309266810587, + "grad_norm": 2.5925090312957764, + "learning_rate": 3.1929899551186156e-05, + "loss": 0.5571, + "step": 1701 + }, + { + "epoch": 1.0882705684816503, + "grad_norm": 2.3800933361053467, + "learning_rate": 3.191921350715965e-05, + "loss": 0.4864, + "step": 1702 + }, + { + "epoch": 1.0889102102822419, + "grad_norm": 2.4689550399780273, + "learning_rate": 3.190852746313315e-05, + "loss": 0.5101, + "step": 1703 + }, + { + "epoch": 1.0895498520828335, + "grad_norm": 2.5429491996765137, + "learning_rate": 3.189784141910665e-05, + "loss": 0.5454, + "step": 1704 + }, + { + "epoch": 1.0901894938834253, + "grad_norm": 2.4416191577911377, + "learning_rate": 3.188715537508015e-05, + "loss": 0.5248, + "step": 1705 + }, + { + "epoch": 1.090829135684017, + "grad_norm": 2.0285744667053223, + "learning_rate": 3.1876469331053646e-05, + "loss": 0.4487, + "step": 1706 + }, + { + "epoch": 1.0914687774846086, + "grad_norm": 2.597395658493042, + "learning_rate": 3.1865783287027146e-05, + "loss": 0.5802, + "step": 1707 + }, + { + "epoch": 1.0921084192852002, + "grad_norm": 2.3871214389801025, + "learning_rate": 3.1855097243000645e-05, + "loss": 0.5307, + "step": 1708 + }, + { + "epoch": 1.092748061085792, + "grad_norm": 2.5163402557373047, + "learning_rate": 3.184441119897414e-05, + "loss": 0.5412, + "step": 1709 + }, + { + "epoch": 1.0933877028863837, + "grad_norm": 2.5021064281463623, + "learning_rate": 3.1833725154947644e-05, + "loss": 0.5411, + "step": 1710 + }, + { + "epoch": 
1.0940273446869753, + "grad_norm": 2.2159810066223145, + "learning_rate": 3.1823039110921136e-05, + "loss": 0.4951, + "step": 1711 + }, + { + "epoch": 1.0946669864875669, + "grad_norm": 2.221435546875, + "learning_rate": 3.1812353066894636e-05, + "loss": 0.5281, + "step": 1712 + }, + { + "epoch": 1.0953066282881587, + "grad_norm": 2.063404083251953, + "learning_rate": 3.1801667022868135e-05, + "loss": 0.4629, + "step": 1713 + }, + { + "epoch": 1.0959462700887503, + "grad_norm": 2.227576971054077, + "learning_rate": 3.1790980978841634e-05, + "loss": 0.5423, + "step": 1714 + }, + { + "epoch": 1.096585911889342, + "grad_norm": 2.4510653018951416, + "learning_rate": 3.178029493481513e-05, + "loss": 0.5714, + "step": 1715 + }, + { + "epoch": 1.0972255536899336, + "grad_norm": 2.192448616027832, + "learning_rate": 3.176960889078863e-05, + "loss": 0.5217, + "step": 1716 + }, + { + "epoch": 1.0978651954905252, + "grad_norm": 2.5083277225494385, + "learning_rate": 3.175892284676213e-05, + "loss": 0.471, + "step": 1717 + }, + { + "epoch": 1.098504837291117, + "grad_norm": 2.673832893371582, + "learning_rate": 3.174823680273563e-05, + "loss": 0.5701, + "step": 1718 + }, + { + "epoch": 1.0991444790917086, + "grad_norm": 2.177384614944458, + "learning_rate": 3.173755075870913e-05, + "loss": 0.4577, + "step": 1719 + }, + { + "epoch": 1.0997841208923003, + "grad_norm": 1.9219969511032104, + "learning_rate": 3.1726864714682624e-05, + "loss": 0.4965, + "step": 1720 + }, + { + "epoch": 1.1004237626928919, + "grad_norm": 2.3052754402160645, + "learning_rate": 3.171617867065613e-05, + "loss": 0.5333, + "step": 1721 + }, + { + "epoch": 1.1010634044934837, + "grad_norm": 2.6739320755004883, + "learning_rate": 3.170549262662962e-05, + "loss": 0.6156, + "step": 1722 + }, + { + "epoch": 1.1017030462940753, + "grad_norm": 2.549940824508667, + "learning_rate": 3.169480658260312e-05, + "loss": 0.5416, + "step": 1723 + }, + { + "epoch": 1.102342688094667, + "grad_norm": 2.229623794555664, + "learning_rate": 3.168412053857662e-05, + "loss": 0.5133, + "step": 1724 + }, + { + "epoch": 1.1029823298952586, + "grad_norm": 2.307368755340576, + "learning_rate": 3.167343449455012e-05, + "loss": 0.5406, + "step": 1725 + }, + { + "epoch": 1.1036219716958504, + "grad_norm": 2.2113728523254395, + "learning_rate": 3.166274845052361e-05, + "loss": 0.4913, + "step": 1726 + }, + { + "epoch": 1.104261613496442, + "grad_norm": 2.130842685699463, + "learning_rate": 3.165206240649712e-05, + "loss": 0.4465, + "step": 1727 + }, + { + "epoch": 1.1049012552970336, + "grad_norm": 2.5895724296569824, + "learning_rate": 3.164137636247061e-05, + "loss": 0.5207, + "step": 1728 + }, + { + "epoch": 1.1055408970976253, + "grad_norm": 2.168288469314575, + "learning_rate": 3.163069031844411e-05, + "loss": 0.4188, + "step": 1729 + }, + { + "epoch": 1.1061805388982169, + "grad_norm": 2.2262396812438965, + "learning_rate": 3.162000427441761e-05, + "loss": 0.4625, + "step": 1730 + }, + { + "epoch": 1.1068201806988087, + "grad_norm": 1.9149888753890991, + "learning_rate": 3.160931823039111e-05, + "loss": 0.4358, + "step": 1731 + }, + { + "epoch": 1.1074598224994003, + "grad_norm": 2.387164354324341, + "learning_rate": 3.159863218636461e-05, + "loss": 0.5349, + "step": 1732 + }, + { + "epoch": 1.108099464299992, + "grad_norm": 2.6190738677978516, + "learning_rate": 3.158794614233811e-05, + "loss": 0.6151, + "step": 1733 + }, + { + "epoch": 1.1087391061005836, + "grad_norm": 2.2280044555664062, + "learning_rate": 3.157726009831161e-05, + "loss": 0.4655, + 
"step": 1734 + }, + { + "epoch": 1.1093787479011754, + "grad_norm": 2.3109242916107178, + "learning_rate": 3.15665740542851e-05, + "loss": 0.4759, + "step": 1735 + }, + { + "epoch": 1.110018389701767, + "grad_norm": 2.2519354820251465, + "learning_rate": 3.155588801025861e-05, + "loss": 0.4684, + "step": 1736 + }, + { + "epoch": 1.1106580315023586, + "grad_norm": 2.682269811630249, + "learning_rate": 3.15452019662321e-05, + "loss": 0.5475, + "step": 1737 + }, + { + "epoch": 1.1112976733029503, + "grad_norm": 2.540397882461548, + "learning_rate": 3.15345159222056e-05, + "loss": 0.5639, + "step": 1738 + }, + { + "epoch": 1.111937315103542, + "grad_norm": 2.049651861190796, + "learning_rate": 3.15238298781791e-05, + "loss": 0.4704, + "step": 1739 + }, + { + "epoch": 1.1125769569041337, + "grad_norm": 2.3338117599487305, + "learning_rate": 3.15131438341526e-05, + "loss": 0.5141, + "step": 1740 + }, + { + "epoch": 1.1132165987047253, + "grad_norm": 2.401749849319458, + "learning_rate": 3.15024577901261e-05, + "loss": 0.4323, + "step": 1741 + }, + { + "epoch": 1.113856240505317, + "grad_norm": 2.462519645690918, + "learning_rate": 3.1491771746099596e-05, + "loss": 0.5605, + "step": 1742 + }, + { + "epoch": 1.1144958823059088, + "grad_norm": 2.33021879196167, + "learning_rate": 3.1481085702073096e-05, + "loss": 0.4488, + "step": 1743 + }, + { + "epoch": 1.1151355241065004, + "grad_norm": 2.8066039085388184, + "learning_rate": 3.1470399658046595e-05, + "loss": 0.6131, + "step": 1744 + }, + { + "epoch": 1.115775165907092, + "grad_norm": 2.5402162075042725, + "learning_rate": 3.1459713614020094e-05, + "loss": 0.5407, + "step": 1745 + }, + { + "epoch": 1.1164148077076836, + "grad_norm": 1.9940400123596191, + "learning_rate": 3.144902756999359e-05, + "loss": 0.4291, + "step": 1746 + }, + { + "epoch": 1.1170544495082755, + "grad_norm": 2.5242414474487305, + "learning_rate": 3.143834152596709e-05, + "loss": 0.4987, + "step": 1747 + }, + { + "epoch": 1.117694091308867, + "grad_norm": 2.269988775253296, + "learning_rate": 3.1427655481940586e-05, + "loss": 0.5295, + "step": 1748 + }, + { + "epoch": 1.1183337331094587, + "grad_norm": 1.8427963256835938, + "learning_rate": 3.1416969437914085e-05, + "loss": 0.4444, + "step": 1749 + }, + { + "epoch": 1.1189733749100503, + "grad_norm": 2.09195613861084, + "learning_rate": 3.1406283393887584e-05, + "loss": 0.4826, + "step": 1750 + }, + { + "epoch": 1.119613016710642, + "grad_norm": 2.6894187927246094, + "learning_rate": 3.1395597349861084e-05, + "loss": 0.6025, + "step": 1751 + }, + { + "epoch": 1.1202526585112338, + "grad_norm": 2.4898767471313477, + "learning_rate": 3.138491130583458e-05, + "loss": 0.563, + "step": 1752 + }, + { + "epoch": 1.1208923003118254, + "grad_norm": 2.3334531784057617, + "learning_rate": 3.137422526180808e-05, + "loss": 0.5537, + "step": 1753 + }, + { + "epoch": 1.121531942112417, + "grad_norm": 2.3252806663513184, + "learning_rate": 3.136353921778158e-05, + "loss": 0.4919, + "step": 1754 + }, + { + "epoch": 1.1221715839130086, + "grad_norm": 2.38567852973938, + "learning_rate": 3.1352853173755074e-05, + "loss": 0.5485, + "step": 1755 + }, + { + "epoch": 1.1228112257136005, + "grad_norm": 2.1960718631744385, + "learning_rate": 3.134216712972858e-05, + "loss": 0.512, + "step": 1756 + }, + { + "epoch": 1.123450867514192, + "grad_norm": 2.3051846027374268, + "learning_rate": 3.133148108570207e-05, + "loss": 0.5179, + "step": 1757 + }, + { + "epoch": 1.1240905093147837, + "grad_norm": 2.105882406234741, + "learning_rate": 
3.132079504167557e-05, + "loss": 0.4633, + "step": 1758 + }, + { + "epoch": 1.1247301511153753, + "grad_norm": 2.0388476848602295, + "learning_rate": 3.131010899764907e-05, + "loss": 0.4703, + "step": 1759 + }, + { + "epoch": 1.1253697929159672, + "grad_norm": 2.288141965866089, + "learning_rate": 3.129942295362257e-05, + "loss": 0.5103, + "step": 1760 + }, + { + "epoch": 1.1260094347165588, + "grad_norm": 2.0263402462005615, + "learning_rate": 3.1288736909596064e-05, + "loss": 0.4017, + "step": 1761 + }, + { + "epoch": 1.1266490765171504, + "grad_norm": 2.3946094512939453, + "learning_rate": 3.127805086556957e-05, + "loss": 0.5202, + "step": 1762 + }, + { + "epoch": 1.127288718317742, + "grad_norm": 2.4939112663269043, + "learning_rate": 3.126736482154307e-05, + "loss": 0.5099, + "step": 1763 + }, + { + "epoch": 1.1279283601183336, + "grad_norm": 2.4583992958068848, + "learning_rate": 3.125667877751656e-05, + "loss": 0.5652, + "step": 1764 + }, + { + "epoch": 1.1285680019189255, + "grad_norm": 2.297574520111084, + "learning_rate": 3.124599273349007e-05, + "loss": 0.5321, + "step": 1765 + }, + { + "epoch": 1.129207643719517, + "grad_norm": 2.2621891498565674, + "learning_rate": 3.123530668946356e-05, + "loss": 0.5335, + "step": 1766 + }, + { + "epoch": 1.1298472855201087, + "grad_norm": 2.1876683235168457, + "learning_rate": 3.122462064543706e-05, + "loss": 0.4652, + "step": 1767 + }, + { + "epoch": 1.1304869273207003, + "grad_norm": 2.2956509590148926, + "learning_rate": 3.121393460141056e-05, + "loss": 0.5176, + "step": 1768 + }, + { + "epoch": 1.1311265691212922, + "grad_norm": 2.2569634914398193, + "learning_rate": 3.120324855738406e-05, + "loss": 0.5787, + "step": 1769 + }, + { + "epoch": 1.1317662109218838, + "grad_norm": 2.247763156890869, + "learning_rate": 3.119256251335755e-05, + "loss": 0.5321, + "step": 1770 + }, + { + "epoch": 1.1324058527224754, + "grad_norm": 2.1102590560913086, + "learning_rate": 3.118187646933106e-05, + "loss": 0.4761, + "step": 1771 + }, + { + "epoch": 1.133045494523067, + "grad_norm": 2.3571970462799072, + "learning_rate": 3.117119042530455e-05, + "loss": 0.5254, + "step": 1772 + }, + { + "epoch": 1.1336851363236589, + "grad_norm": 2.1841487884521484, + "learning_rate": 3.1160504381278056e-05, + "loss": 0.495, + "step": 1773 + }, + { + "epoch": 1.1343247781242505, + "grad_norm": 2.3618931770324707, + "learning_rate": 3.114981833725155e-05, + "loss": 0.5374, + "step": 1774 + }, + { + "epoch": 1.134964419924842, + "grad_norm": 2.4433460235595703, + "learning_rate": 3.113913229322505e-05, + "loss": 0.5619, + "step": 1775 + }, + { + "epoch": 1.1356040617254337, + "grad_norm": 2.4330520629882812, + "learning_rate": 3.112844624919855e-05, + "loss": 0.5556, + "step": 1776 + }, + { + "epoch": 1.1362437035260253, + "grad_norm": 2.3832008838653564, + "learning_rate": 3.111776020517205e-05, + "loss": 0.5469, + "step": 1777 + }, + { + "epoch": 1.1368833453266172, + "grad_norm": 2.4882397651672363, + "learning_rate": 3.1107074161145546e-05, + "loss": 0.5791, + "step": 1778 + }, + { + "epoch": 1.1375229871272088, + "grad_norm": 2.6583292484283447, + "learning_rate": 3.1096388117119046e-05, + "loss": 0.6074, + "step": 1779 + }, + { + "epoch": 1.1381626289278004, + "grad_norm": 2.230259656906128, + "learning_rate": 3.1085702073092545e-05, + "loss": 0.4952, + "step": 1780 + }, + { + "epoch": 1.138802270728392, + "grad_norm": 2.4918019771575928, + "learning_rate": 3.107501602906604e-05, + "loss": 0.5324, + "step": 1781 + }, + { + "epoch": 1.1394419125289839, + 
"grad_norm": 2.433915853500366, + "learning_rate": 3.1064329985039544e-05, + "loss": 0.5071, + "step": 1782 + }, + { + "epoch": 1.1400815543295755, + "grad_norm": 2.3743603229522705, + "learning_rate": 3.1053643941013036e-05, + "loss": 0.5497, + "step": 1783 + }, + { + "epoch": 1.140721196130167, + "grad_norm": 2.286311388015747, + "learning_rate": 3.1042957896986536e-05, + "loss": 0.4842, + "step": 1784 + }, + { + "epoch": 1.1413608379307587, + "grad_norm": 2.4942941665649414, + "learning_rate": 3.1032271852960035e-05, + "loss": 0.5559, + "step": 1785 + }, + { + "epoch": 1.1420004797313505, + "grad_norm": 2.2576510906219482, + "learning_rate": 3.1021585808933534e-05, + "loss": 0.5333, + "step": 1786 + }, + { + "epoch": 1.1426401215319422, + "grad_norm": 2.5893917083740234, + "learning_rate": 3.1010899764907034e-05, + "loss": 0.601, + "step": 1787 + }, + { + "epoch": 1.1432797633325338, + "grad_norm": 2.2144362926483154, + "learning_rate": 3.100021372088053e-05, + "loss": 0.4977, + "step": 1788 + }, + { + "epoch": 1.1439194051331254, + "grad_norm": 2.635012626647949, + "learning_rate": 3.098952767685403e-05, + "loss": 0.6411, + "step": 1789 + }, + { + "epoch": 1.144559046933717, + "grad_norm": 2.1270768642425537, + "learning_rate": 3.0978841632827525e-05, + "loss": 0.4926, + "step": 1790 + }, + { + "epoch": 1.1451986887343089, + "grad_norm": 2.225350856781006, + "learning_rate": 3.096815558880103e-05, + "loss": 0.4963, + "step": 1791 + }, + { + "epoch": 1.1458383305349005, + "grad_norm": 2.297593355178833, + "learning_rate": 3.0957469544774524e-05, + "loss": 0.5099, + "step": 1792 + }, + { + "epoch": 1.146477972335492, + "grad_norm": 2.215409517288208, + "learning_rate": 3.094678350074802e-05, + "loss": 0.4963, + "step": 1793 + }, + { + "epoch": 1.1471176141360837, + "grad_norm": 2.4630534648895264, + "learning_rate": 3.093609745672152e-05, + "loss": 0.5437, + "step": 1794 + }, + { + "epoch": 1.1477572559366755, + "grad_norm": 2.34175443649292, + "learning_rate": 3.092541141269502e-05, + "loss": 0.5717, + "step": 1795 + }, + { + "epoch": 1.1483968977372672, + "grad_norm": 2.273455858230591, + "learning_rate": 3.091472536866852e-05, + "loss": 0.512, + "step": 1796 + }, + { + "epoch": 1.1490365395378588, + "grad_norm": 2.186774492263794, + "learning_rate": 3.090403932464202e-05, + "loss": 0.5155, + "step": 1797 + }, + { + "epoch": 1.1496761813384504, + "grad_norm": 2.2865583896636963, + "learning_rate": 3.089335328061552e-05, + "loss": 0.5246, + "step": 1798 + }, + { + "epoch": 1.1503158231390422, + "grad_norm": 2.0999932289123535, + "learning_rate": 3.088266723658901e-05, + "loss": 0.4981, + "step": 1799 + }, + { + "epoch": 1.1509554649396339, + "grad_norm": 2.296555757522583, + "learning_rate": 3.087198119256252e-05, + "loss": 0.5723, + "step": 1800 + }, + { + "epoch": 1.1515951067402255, + "grad_norm": 2.5469870567321777, + "learning_rate": 3.086129514853601e-05, + "loss": 0.6252, + "step": 1801 + }, + { + "epoch": 1.152234748540817, + "grad_norm": 2.245955228805542, + "learning_rate": 3.085060910450952e-05, + "loss": 0.543, + "step": 1802 + }, + { + "epoch": 1.1528743903414087, + "grad_norm": 2.162105083465576, + "learning_rate": 3.083992306048301e-05, + "loss": 0.481, + "step": 1803 + }, + { + "epoch": 1.1535140321420005, + "grad_norm": 2.15701961517334, + "learning_rate": 3.082923701645651e-05, + "loss": 0.5094, + "step": 1804 + }, + { + "epoch": 1.1541536739425922, + "grad_norm": 2.099560260772705, + "learning_rate": 3.081855097243001e-05, + "loss": 0.5572, + "step": 1805 + }, + { + 
"epoch": 1.1547933157431838, + "grad_norm": 1.998724341392517, + "learning_rate": 3.080786492840351e-05, + "loss": 0.4999, + "step": 1806 + }, + { + "epoch": 1.1554329575437754, + "grad_norm": 2.2188196182250977, + "learning_rate": 3.0797178884377e-05, + "loss": 0.4778, + "step": 1807 + }, + { + "epoch": 1.1560725993443672, + "grad_norm": 2.0485172271728516, + "learning_rate": 3.078649284035051e-05, + "loss": 0.4841, + "step": 1808 + }, + { + "epoch": 1.1567122411449589, + "grad_norm": 2.4112133979797363, + "learning_rate": 3.0775806796324006e-05, + "loss": 0.5285, + "step": 1809 + }, + { + "epoch": 1.1573518829455505, + "grad_norm": 2.2543160915374756, + "learning_rate": 3.07651207522975e-05, + "loss": 0.494, + "step": 1810 + }, + { + "epoch": 1.157991524746142, + "grad_norm": 2.1535301208496094, + "learning_rate": 3.0754434708271005e-05, + "loss": 0.5282, + "step": 1811 + }, + { + "epoch": 1.158631166546734, + "grad_norm": 2.1569716930389404, + "learning_rate": 3.07437486642445e-05, + "loss": 0.5009, + "step": 1812 + }, + { + "epoch": 1.1592708083473255, + "grad_norm": 2.5529816150665283, + "learning_rate": 3.0733062620218e-05, + "loss": 0.5303, + "step": 1813 + }, + { + "epoch": 1.1599104501479172, + "grad_norm": 2.2742972373962402, + "learning_rate": 3.0722376576191496e-05, + "loss": 0.5143, + "step": 1814 + }, + { + "epoch": 1.1605500919485088, + "grad_norm": 2.1131622791290283, + "learning_rate": 3.0711690532164995e-05, + "loss": 0.439, + "step": 1815 + }, + { + "epoch": 1.1611897337491004, + "grad_norm": 2.4731392860412598, + "learning_rate": 3.070100448813849e-05, + "loss": 0.4983, + "step": 1816 + }, + { + "epoch": 1.1618293755496922, + "grad_norm": 2.3981497287750244, + "learning_rate": 3.0690318444111994e-05, + "loss": 0.4753, + "step": 1817 + }, + { + "epoch": 1.1624690173502839, + "grad_norm": 2.3521344661712646, + "learning_rate": 3.067963240008549e-05, + "loss": 0.5067, + "step": 1818 + }, + { + "epoch": 1.1631086591508755, + "grad_norm": 2.4216907024383545, + "learning_rate": 3.0668946356058986e-05, + "loss": 0.4858, + "step": 1819 + }, + { + "epoch": 1.163748300951467, + "grad_norm": 2.7841217517852783, + "learning_rate": 3.0658260312032485e-05, + "loss": 0.5465, + "step": 1820 + }, + { + "epoch": 1.164387942752059, + "grad_norm": 2.0943174362182617, + "learning_rate": 3.0647574268005985e-05, + "loss": 0.4304, + "step": 1821 + }, + { + "epoch": 1.1650275845526505, + "grad_norm": 2.673572063446045, + "learning_rate": 3.0636888223979484e-05, + "loss": 0.5114, + "step": 1822 + }, + { + "epoch": 1.1656672263532422, + "grad_norm": 2.243863105773926, + "learning_rate": 3.0626202179952984e-05, + "loss": 0.4814, + "step": 1823 + }, + { + "epoch": 1.1663068681538338, + "grad_norm": 2.5512664318084717, + "learning_rate": 3.061551613592648e-05, + "loss": 0.5605, + "step": 1824 + }, + { + "epoch": 1.1669465099544256, + "grad_norm": 2.4531712532043457, + "learning_rate": 3.0604830091899975e-05, + "loss": 0.5505, + "step": 1825 + }, + { + "epoch": 1.1675861517550172, + "grad_norm": 2.149388313293457, + "learning_rate": 3.059414404787348e-05, + "loss": 0.5078, + "step": 1826 + }, + { + "epoch": 1.1682257935556088, + "grad_norm": 2.3568930625915527, + "learning_rate": 3.0583458003846974e-05, + "loss": 0.5571, + "step": 1827 + }, + { + "epoch": 1.1688654353562005, + "grad_norm": 2.529860258102417, + "learning_rate": 3.0572771959820474e-05, + "loss": 0.5956, + "step": 1828 + }, + { + "epoch": 1.169505077156792, + "grad_norm": 2.334620714187622, + "learning_rate": 3.056208591579397e-05, + 
"loss": 0.5125, + "step": 1829 + }, + { + "epoch": 1.170144718957384, + "grad_norm": 2.1386635303497314, + "learning_rate": 3.055139987176747e-05, + "loss": 0.4714, + "step": 1830 + }, + { + "epoch": 1.1707843607579755, + "grad_norm": 2.1763722896575928, + "learning_rate": 3.054071382774097e-05, + "loss": 0.4904, + "step": 1831 + }, + { + "epoch": 1.1714240025585672, + "grad_norm": 2.3565046787261963, + "learning_rate": 3.053002778371447e-05, + "loss": 0.5369, + "step": 1832 + }, + { + "epoch": 1.172063644359159, + "grad_norm": 2.5247156620025635, + "learning_rate": 3.051934173968797e-05, + "loss": 0.6099, + "step": 1833 + }, + { + "epoch": 1.1727032861597506, + "grad_norm": 2.020989179611206, + "learning_rate": 3.050865569566147e-05, + "loss": 0.5122, + "step": 1834 + }, + { + "epoch": 1.1733429279603422, + "grad_norm": 2.1518349647521973, + "learning_rate": 3.0497969651634966e-05, + "loss": 0.5161, + "step": 1835 + }, + { + "epoch": 1.1739825697609338, + "grad_norm": 2.229703903198242, + "learning_rate": 3.0487283607608462e-05, + "loss": 0.5164, + "step": 1836 + }, + { + "epoch": 1.1746222115615255, + "grad_norm": 2.6590027809143066, + "learning_rate": 3.0476597563581964e-05, + "loss": 0.5673, + "step": 1837 + }, + { + "epoch": 1.1752618533621173, + "grad_norm": 2.182640314102173, + "learning_rate": 3.0465911519555464e-05, + "loss": 0.4899, + "step": 1838 + }, + { + "epoch": 1.175901495162709, + "grad_norm": 2.5984950065612793, + "learning_rate": 3.045522547552896e-05, + "loss": 0.5967, + "step": 1839 + }, + { + "epoch": 1.1765411369633005, + "grad_norm": 2.2198374271392822, + "learning_rate": 3.0444539431502463e-05, + "loss": 0.4849, + "step": 1840 + }, + { + "epoch": 1.1771807787638922, + "grad_norm": 2.1248881816864014, + "learning_rate": 3.043385338747596e-05, + "loss": 0.4681, + "step": 1841 + }, + { + "epoch": 1.1778204205644838, + "grad_norm": 2.1325602531433105, + "learning_rate": 3.0423167343449454e-05, + "loss": 0.4678, + "step": 1842 + }, + { + "epoch": 1.1784600623650756, + "grad_norm": 2.462951898574829, + "learning_rate": 3.0412481299422957e-05, + "loss": 0.5682, + "step": 1843 + }, + { + "epoch": 1.1790997041656672, + "grad_norm": 2.3943634033203125, + "learning_rate": 3.0401795255396453e-05, + "loss": 0.5249, + "step": 1844 + }, + { + "epoch": 1.1797393459662588, + "grad_norm": 2.098130226135254, + "learning_rate": 3.039110921136995e-05, + "loss": 0.4924, + "step": 1845 + }, + { + "epoch": 1.1803789877668507, + "grad_norm": 2.0938916206359863, + "learning_rate": 3.0380423167343452e-05, + "loss": 0.4612, + "step": 1846 + }, + { + "epoch": 1.1810186295674423, + "grad_norm": 2.520514726638794, + "learning_rate": 3.0369737123316948e-05, + "loss": 0.5647, + "step": 1847 + }, + { + "epoch": 1.181658271368034, + "grad_norm": 2.529978036880493, + "learning_rate": 3.0359051079290447e-05, + "loss": 0.575, + "step": 1848 + }, + { + "epoch": 1.1822979131686255, + "grad_norm": 2.61244535446167, + "learning_rate": 3.0348365035263947e-05, + "loss": 0.5425, + "step": 1849 + }, + { + "epoch": 1.1829375549692172, + "grad_norm": 2.6496524810791016, + "learning_rate": 3.0337678991237446e-05, + "loss": 0.5568, + "step": 1850 + }, + { + "epoch": 1.183577196769809, + "grad_norm": 2.020493745803833, + "learning_rate": 3.0326992947210942e-05, + "loss": 0.4521, + "step": 1851 + }, + { + "epoch": 1.1842168385704006, + "grad_norm": 2.384413242340088, + "learning_rate": 3.0316306903184445e-05, + "loss": 0.5559, + "step": 1852 + }, + { + "epoch": 1.1848564803709922, + "grad_norm": 2.1782498359680176, 
+ "learning_rate": 3.030562085915794e-05, + "loss": 0.46, + "step": 1853 + }, + { + "epoch": 1.1854961221715838, + "grad_norm": 2.1637609004974365, + "learning_rate": 3.0294934815131437e-05, + "loss": 0.4933, + "step": 1854 + }, + { + "epoch": 1.1861357639721755, + "grad_norm": 2.533543348312378, + "learning_rate": 3.028424877110494e-05, + "loss": 0.5828, + "step": 1855 + }, + { + "epoch": 1.1867754057727673, + "grad_norm": 2.0163798332214355, + "learning_rate": 3.0273562727078435e-05, + "loss": 0.4684, + "step": 1856 + }, + { + "epoch": 1.187415047573359, + "grad_norm": 2.392141580581665, + "learning_rate": 3.0262876683051938e-05, + "loss": 0.4986, + "step": 1857 + }, + { + "epoch": 1.1880546893739505, + "grad_norm": 2.5294408798217773, + "learning_rate": 3.0252190639025434e-05, + "loss": 0.5441, + "step": 1858 + }, + { + "epoch": 1.1886943311745424, + "grad_norm": 2.6419923305511475, + "learning_rate": 3.024150459499893e-05, + "loss": 0.5689, + "step": 1859 + }, + { + "epoch": 1.189333972975134, + "grad_norm": 2.1667978763580322, + "learning_rate": 3.0230818550972433e-05, + "loss": 0.4693, + "step": 1860 + }, + { + "epoch": 1.1899736147757256, + "grad_norm": 1.8922816514968872, + "learning_rate": 3.0220132506945932e-05, + "loss": 0.3944, + "step": 1861 + }, + { + "epoch": 1.1906132565763172, + "grad_norm": 2.4143950939178467, + "learning_rate": 3.0209446462919428e-05, + "loss": 0.4918, + "step": 1862 + }, + { + "epoch": 1.1912528983769088, + "grad_norm": 2.2610855102539062, + "learning_rate": 3.019876041889293e-05, + "loss": 0.4902, + "step": 1863 + }, + { + "epoch": 1.1918925401775007, + "grad_norm": 2.3496017456054688, + "learning_rate": 3.0188074374866427e-05, + "loss": 0.5427, + "step": 1864 + }, + { + "epoch": 1.1925321819780923, + "grad_norm": 2.298569679260254, + "learning_rate": 3.0177388330839923e-05, + "loss": 0.528, + "step": 1865 + }, + { + "epoch": 1.193171823778684, + "grad_norm": 2.2193431854248047, + "learning_rate": 3.0166702286813426e-05, + "loss": 0.4479, + "step": 1866 + }, + { + "epoch": 1.1938114655792755, + "grad_norm": 2.7167887687683105, + "learning_rate": 3.015601624278692e-05, + "loss": 0.6171, + "step": 1867 + }, + { + "epoch": 1.1944511073798671, + "grad_norm": 2.5493767261505127, + "learning_rate": 3.0145330198760418e-05, + "loss": 0.5822, + "step": 1868 + }, + { + "epoch": 1.195090749180459, + "grad_norm": 2.3377022743225098, + "learning_rate": 3.013464415473392e-05, + "loss": 0.5856, + "step": 1869 + }, + { + "epoch": 1.1957303909810506, + "grad_norm": 2.2534713745117188, + "learning_rate": 3.0123958110707416e-05, + "loss": 0.5162, + "step": 1870 + }, + { + "epoch": 1.1963700327816422, + "grad_norm": 2.7288925647735596, + "learning_rate": 3.0113272066680916e-05, + "loss": 0.58, + "step": 1871 + }, + { + "epoch": 1.197009674582234, + "grad_norm": 2.2652347087860107, + "learning_rate": 3.0102586022654415e-05, + "loss": 0.5065, + "step": 1872 + }, + { + "epoch": 1.1976493163828257, + "grad_norm": 2.3203070163726807, + "learning_rate": 3.0091899978627914e-05, + "loss": 0.5162, + "step": 1873 + }, + { + "epoch": 1.1982889581834173, + "grad_norm": 2.561286449432373, + "learning_rate": 3.008121393460141e-05, + "loss": 0.5865, + "step": 1874 + }, + { + "epoch": 1.198928599984009, + "grad_norm": 2.284147024154663, + "learning_rate": 3.0070527890574913e-05, + "loss": 0.497, + "step": 1875 + }, + { + "epoch": 1.1995682417846005, + "grad_norm": 2.2626378536224365, + "learning_rate": 3.005984184654841e-05, + "loss": 0.5086, + "step": 1876 + }, + { + "epoch": 
1.2002078835851924, + "grad_norm": 2.298335552215576, + "learning_rate": 3.0049155802521905e-05, + "loss": 0.5078, + "step": 1877 + }, + { + "epoch": 1.200847525385784, + "grad_norm": 2.6296801567077637, + "learning_rate": 3.0038469758495408e-05, + "loss": 0.6498, + "step": 1878 + }, + { + "epoch": 1.2014871671863756, + "grad_norm": 2.3466734886169434, + "learning_rate": 3.0027783714468904e-05, + "loss": 0.5188, + "step": 1879 + }, + { + "epoch": 1.2021268089869672, + "grad_norm": 2.5008387565612793, + "learning_rate": 3.00170976704424e-05, + "loss": 0.551, + "step": 1880 + }, + { + "epoch": 1.202766450787559, + "grad_norm": 2.2959585189819336, + "learning_rate": 3.0006411626415902e-05, + "loss": 0.4949, + "step": 1881 + }, + { + "epoch": 1.2034060925881507, + "grad_norm": 2.023841619491577, + "learning_rate": 2.99957255823894e-05, + "loss": 0.4609, + "step": 1882 + }, + { + "epoch": 1.2040457343887423, + "grad_norm": 2.2053587436676025, + "learning_rate": 2.9985039538362898e-05, + "loss": 0.5189, + "step": 1883 + }, + { + "epoch": 1.204685376189334, + "grad_norm": 2.360405683517456, + "learning_rate": 2.99743534943364e-05, + "loss": 0.5432, + "step": 1884 + }, + { + "epoch": 1.2053250179899258, + "grad_norm": 2.5589892864227295, + "learning_rate": 2.9963667450309897e-05, + "loss": 0.5874, + "step": 1885 + }, + { + "epoch": 1.2059646597905174, + "grad_norm": 2.4862349033355713, + "learning_rate": 2.99529814062834e-05, + "loss": 0.5617, + "step": 1886 + }, + { + "epoch": 1.206604301591109, + "grad_norm": 2.2344589233398438, + "learning_rate": 2.9942295362256895e-05, + "loss": 0.5109, + "step": 1887 + }, + { + "epoch": 1.2072439433917006, + "grad_norm": 2.1723625659942627, + "learning_rate": 2.993160931823039e-05, + "loss": 0.4738, + "step": 1888 + }, + { + "epoch": 1.2078835851922922, + "grad_norm": 2.378105640411377, + "learning_rate": 2.9920923274203894e-05, + "loss": 0.5761, + "step": 1889 + }, + { + "epoch": 1.208523226992884, + "grad_norm": 2.038362741470337, + "learning_rate": 2.991023723017739e-05, + "loss": 0.46, + "step": 1890 + }, + { + "epoch": 1.2091628687934757, + "grad_norm": 2.382270574569702, + "learning_rate": 2.9899551186150886e-05, + "loss": 0.5294, + "step": 1891 + }, + { + "epoch": 1.2098025105940673, + "grad_norm": 2.626929759979248, + "learning_rate": 2.988886514212439e-05, + "loss": 0.6001, + "step": 1892 + }, + { + "epoch": 1.210442152394659, + "grad_norm": 2.4395911693573, + "learning_rate": 2.9878179098097885e-05, + "loss": 0.5452, + "step": 1893 + }, + { + "epoch": 1.2110817941952507, + "grad_norm": 2.2741150856018066, + "learning_rate": 2.9867493054071384e-05, + "loss": 0.5078, + "step": 1894 + }, + { + "epoch": 1.2117214359958424, + "grad_norm": 2.165040969848633, + "learning_rate": 2.9856807010044883e-05, + "loss": 0.501, + "step": 1895 + }, + { + "epoch": 1.212361077796434, + "grad_norm": 2.318448305130005, + "learning_rate": 2.9846120966018383e-05, + "loss": 0.5125, + "step": 1896 + }, + { + "epoch": 1.2130007195970256, + "grad_norm": 2.4866325855255127, + "learning_rate": 2.983543492199188e-05, + "loss": 0.4923, + "step": 1897 + }, + { + "epoch": 1.2136403613976174, + "grad_norm": 2.2698774337768555, + "learning_rate": 2.982474887796538e-05, + "loss": 0.5061, + "step": 1898 + }, + { + "epoch": 1.214280003198209, + "grad_norm": 2.0199472904205322, + "learning_rate": 2.9814062833938877e-05, + "loss": 0.455, + "step": 1899 + }, + { + "epoch": 1.2149196449988007, + "grad_norm": 2.336916446685791, + "learning_rate": 2.9803376789912373e-05, + "loss": 0.5062, + 
"step": 1900 + }, + { + "epoch": 1.2155592867993923, + "grad_norm": 1.9909005165100098, + "learning_rate": 2.9792690745885876e-05, + "loss": 0.4283, + "step": 1901 + }, + { + "epoch": 1.216198928599984, + "grad_norm": 2.293056011199951, + "learning_rate": 2.9782004701859372e-05, + "loss": 0.5115, + "step": 1902 + }, + { + "epoch": 1.2168385704005757, + "grad_norm": 2.2186899185180664, + "learning_rate": 2.9771318657832868e-05, + "loss": 0.4795, + "step": 1903 + }, + { + "epoch": 1.2174782122011674, + "grad_norm": 2.1828689575195312, + "learning_rate": 2.976063261380637e-05, + "loss": 0.4769, + "step": 1904 + }, + { + "epoch": 1.218117854001759, + "grad_norm": 2.4945461750030518, + "learning_rate": 2.9749946569779867e-05, + "loss": 0.5201, + "step": 1905 + }, + { + "epoch": 1.2187574958023506, + "grad_norm": 2.6482813358306885, + "learning_rate": 2.9739260525753366e-05, + "loss": 0.5395, + "step": 1906 + }, + { + "epoch": 1.2193971376029424, + "grad_norm": 2.400118827819824, + "learning_rate": 2.972857448172687e-05, + "loss": 0.4887, + "step": 1907 + }, + { + "epoch": 1.220036779403534, + "grad_norm": 2.3764233589172363, + "learning_rate": 2.9717888437700365e-05, + "loss": 0.4791, + "step": 1908 + }, + { + "epoch": 1.2206764212041257, + "grad_norm": 2.2382543087005615, + "learning_rate": 2.970720239367386e-05, + "loss": 0.4716, + "step": 1909 + }, + { + "epoch": 1.2213160630047173, + "grad_norm": 2.2786457538604736, + "learning_rate": 2.9696516349647364e-05, + "loss": 0.4923, + "step": 1910 + }, + { + "epoch": 1.2219557048053091, + "grad_norm": 2.7733383178710938, + "learning_rate": 2.968583030562086e-05, + "loss": 0.5627, + "step": 1911 + }, + { + "epoch": 1.2225953466059007, + "grad_norm": 2.51462459564209, + "learning_rate": 2.9675144261594356e-05, + "loss": 0.503, + "step": 1912 + }, + { + "epoch": 1.2232349884064924, + "grad_norm": 2.627152681350708, + "learning_rate": 2.966445821756786e-05, + "loss": 0.4894, + "step": 1913 + }, + { + "epoch": 1.223874630207084, + "grad_norm": 2.597862482070923, + "learning_rate": 2.9653772173541354e-05, + "loss": 0.5304, + "step": 1914 + }, + { + "epoch": 1.2245142720076756, + "grad_norm": 2.7536518573760986, + "learning_rate": 2.9643086129514857e-05, + "loss": 0.5714, + "step": 1915 + }, + { + "epoch": 1.2251539138082674, + "grad_norm": 2.5098819732666016, + "learning_rate": 2.9632400085488353e-05, + "loss": 0.5616, + "step": 1916 + }, + { + "epoch": 1.225793555608859, + "grad_norm": 2.7121522426605225, + "learning_rate": 2.9621714041461852e-05, + "loss": 0.6312, + "step": 1917 + }, + { + "epoch": 1.2264331974094507, + "grad_norm": 2.089224100112915, + "learning_rate": 2.9611027997435352e-05, + "loss": 0.478, + "step": 1918 + }, + { + "epoch": 1.2270728392100423, + "grad_norm": 2.100416898727417, + "learning_rate": 2.960034195340885e-05, + "loss": 0.457, + "step": 1919 + }, + { + "epoch": 1.2277124810106341, + "grad_norm": 2.5298514366149902, + "learning_rate": 2.9589655909382347e-05, + "loss": 0.5628, + "step": 1920 + }, + { + "epoch": 1.2283521228112257, + "grad_norm": 2.3800530433654785, + "learning_rate": 2.957896986535585e-05, + "loss": 0.4894, + "step": 1921 + }, + { + "epoch": 1.2289917646118174, + "grad_norm": 2.2956044673919678, + "learning_rate": 2.9568283821329346e-05, + "loss": 0.5338, + "step": 1922 + }, + { + "epoch": 1.229631406412409, + "grad_norm": 2.2654309272766113, + "learning_rate": 2.9557597777302842e-05, + "loss": 0.5196, + "step": 1923 + }, + { + "epoch": 1.2302710482130008, + "grad_norm": 2.2639081478118896, + 
"learning_rate": 2.9546911733276345e-05, + "loss": 0.5301, + "step": 1924 + }, + { + "epoch": 1.2309106900135924, + "grad_norm": 2.1569902896881104, + "learning_rate": 2.953622568924984e-05, + "loss": 0.5003, + "step": 1925 + }, + { + "epoch": 1.231550331814184, + "grad_norm": 2.600153923034668, + "learning_rate": 2.9525539645223337e-05, + "loss": 0.5973, + "step": 1926 + }, + { + "epoch": 1.2321899736147757, + "grad_norm": 2.052147626876831, + "learning_rate": 2.951485360119684e-05, + "loss": 0.4899, + "step": 1927 + }, + { + "epoch": 1.2328296154153673, + "grad_norm": 2.007266044616699, + "learning_rate": 2.9504167557170335e-05, + "loss": 0.537, + "step": 1928 + }, + { + "epoch": 1.2334692572159591, + "grad_norm": 2.7690622806549072, + "learning_rate": 2.9493481513143835e-05, + "loss": 0.615, + "step": 1929 + }, + { + "epoch": 1.2341088990165507, + "grad_norm": 2.003932476043701, + "learning_rate": 2.9482795469117337e-05, + "loss": 0.478, + "step": 1930 + }, + { + "epoch": 1.2347485408171424, + "grad_norm": 1.9905788898468018, + "learning_rate": 2.9472109425090833e-05, + "loss": 0.456, + "step": 1931 + }, + { + "epoch": 1.235388182617734, + "grad_norm": 2.386472702026367, + "learning_rate": 2.946142338106433e-05, + "loss": 0.5624, + "step": 1932 + }, + { + "epoch": 1.2360278244183258, + "grad_norm": 2.2222349643707275, + "learning_rate": 2.9450737337037832e-05, + "loss": 0.5586, + "step": 1933 + }, + { + "epoch": 1.2366674662189174, + "grad_norm": 2.3216359615325928, + "learning_rate": 2.9440051293011328e-05, + "loss": 0.5685, + "step": 1934 + }, + { + "epoch": 1.237307108019509, + "grad_norm": 2.619050979614258, + "learning_rate": 2.9429365248984824e-05, + "loss": 0.6147, + "step": 1935 + }, + { + "epoch": 1.2379467498201007, + "grad_norm": 2.078671932220459, + "learning_rate": 2.9418679204958327e-05, + "loss": 0.4646, + "step": 1936 + }, + { + "epoch": 1.2385863916206925, + "grad_norm": 2.0084242820739746, + "learning_rate": 2.9407993160931823e-05, + "loss": 0.4625, + "step": 1937 + }, + { + "epoch": 1.2392260334212841, + "grad_norm": 2.3187005519866943, + "learning_rate": 2.9397307116905322e-05, + "loss": 0.546, + "step": 1938 + }, + { + "epoch": 1.2398656752218757, + "grad_norm": 2.308591604232788, + "learning_rate": 2.938662107287882e-05, + "loss": 0.5192, + "step": 1939 + }, + { + "epoch": 1.2405053170224674, + "grad_norm": 2.412846088409424, + "learning_rate": 2.937593502885232e-05, + "loss": 0.5236, + "step": 1940 + }, + { + "epoch": 1.241144958823059, + "grad_norm": 2.4006268978118896, + "learning_rate": 2.936524898482582e-05, + "loss": 0.5036, + "step": 1941 + }, + { + "epoch": 1.2417846006236508, + "grad_norm": 2.17791485786438, + "learning_rate": 2.935456294079932e-05, + "loss": 0.5213, + "step": 1942 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 2.010676860809326, + "learning_rate": 2.9343876896772816e-05, + "loss": 0.4613, + "step": 1943 + }, + { + "epoch": 1.243063884224834, + "grad_norm": 2.296203851699829, + "learning_rate": 2.9333190852746318e-05, + "loss": 0.5059, + "step": 1944 + }, + { + "epoch": 1.2437035260254257, + "grad_norm": 2.5326197147369385, + "learning_rate": 2.9322504808719814e-05, + "loss": 0.5295, + "step": 1945 + }, + { + "epoch": 1.2443431678260175, + "grad_norm": 2.0551929473876953, + "learning_rate": 2.931181876469331e-05, + "loss": 0.4428, + "step": 1946 + }, + { + "epoch": 1.2449828096266091, + "grad_norm": 2.3067052364349365, + "learning_rate": 2.9301132720666813e-05, + "loss": 0.5254, + "step": 1947 + }, + { + "epoch": 
1.2456224514272007, + "grad_norm": 2.5025124549865723, + "learning_rate": 2.929044667664031e-05, + "loss": 0.5666, + "step": 1948 + }, + { + "epoch": 1.2462620932277924, + "grad_norm": 2.6275241374969482, + "learning_rate": 2.9279760632613805e-05, + "loss": 0.5826, + "step": 1949 + }, + { + "epoch": 1.2469017350283842, + "grad_norm": 2.2546980381011963, + "learning_rate": 2.9269074588587308e-05, + "loss": 0.5274, + "step": 1950 + }, + { + "epoch": 1.2475413768289758, + "grad_norm": 3.0122833251953125, + "learning_rate": 2.9258388544560804e-05, + "loss": 0.6895, + "step": 1951 + }, + { + "epoch": 1.2481810186295674, + "grad_norm": 2.42378306388855, + "learning_rate": 2.9247702500534303e-05, + "loss": 0.5366, + "step": 1952 + }, + { + "epoch": 1.248820660430159, + "grad_norm": 2.654308795928955, + "learning_rate": 2.9237016456507806e-05, + "loss": 0.591, + "step": 1953 + }, + { + "epoch": 1.2494603022307507, + "grad_norm": 2.3251614570617676, + "learning_rate": 2.9226330412481302e-05, + "loss": 0.5264, + "step": 1954 + }, + { + "epoch": 1.2500999440313425, + "grad_norm": 2.356510877609253, + "learning_rate": 2.9215644368454798e-05, + "loss": 0.4748, + "step": 1955 + }, + { + "epoch": 1.2507395858319341, + "grad_norm": 2.6153526306152344, + "learning_rate": 2.92049583244283e-05, + "loss": 0.5786, + "step": 1956 + }, + { + "epoch": 1.2513792276325257, + "grad_norm": 2.101062774658203, + "learning_rate": 2.9194272280401796e-05, + "loss": 0.4809, + "step": 1957 + }, + { + "epoch": 1.2520188694331176, + "grad_norm": 2.471029281616211, + "learning_rate": 2.9183586236375292e-05, + "loss": 0.6066, + "step": 1958 + }, + { + "epoch": 1.2526585112337092, + "grad_norm": 2.107891082763672, + "learning_rate": 2.9172900192348795e-05, + "loss": 0.476, + "step": 1959 + }, + { + "epoch": 1.2532981530343008, + "grad_norm": 2.149592399597168, + "learning_rate": 2.916221414832229e-05, + "loss": 0.5439, + "step": 1960 + }, + { + "epoch": 1.2539377948348924, + "grad_norm": 2.310904026031494, + "learning_rate": 2.915152810429579e-05, + "loss": 0.5272, + "step": 1961 + }, + { + "epoch": 1.254577436635484, + "grad_norm": 1.9122196435928345, + "learning_rate": 2.914084206026929e-05, + "loss": 0.4634, + "step": 1962 + }, + { + "epoch": 1.2552170784360759, + "grad_norm": 2.219306230545044, + "learning_rate": 2.913015601624279e-05, + "loss": 0.5078, + "step": 1963 + }, + { + "epoch": 1.2558567202366675, + "grad_norm": 2.1937732696533203, + "learning_rate": 2.9119469972216285e-05, + "loss": 0.5078, + "step": 1964 + }, + { + "epoch": 1.2564963620372591, + "grad_norm": 2.5236258506774902, + "learning_rate": 2.9108783928189788e-05, + "loss": 0.5697, + "step": 1965 + }, + { + "epoch": 1.2571360038378507, + "grad_norm": 2.051093339920044, + "learning_rate": 2.9098097884163284e-05, + "loss": 0.4919, + "step": 1966 + }, + { + "epoch": 1.2577756456384424, + "grad_norm": 2.2349143028259277, + "learning_rate": 2.908741184013678e-05, + "loss": 0.5124, + "step": 1967 + }, + { + "epoch": 1.2584152874390342, + "grad_norm": 2.3856165409088135, + "learning_rate": 2.9076725796110283e-05, + "loss": 0.5226, + "step": 1968 + }, + { + "epoch": 1.2590549292396258, + "grad_norm": 2.5890698432922363, + "learning_rate": 2.906603975208378e-05, + "loss": 0.552, + "step": 1969 + }, + { + "epoch": 1.2596945710402174, + "grad_norm": 2.295560121536255, + "learning_rate": 2.905535370805728e-05, + "loss": 0.498, + "step": 1970 + }, + { + "epoch": 1.2603342128408093, + "grad_norm": 2.6017038822174072, + "learning_rate": 2.9044667664030777e-05, + "loss": 
0.5966, + "step": 1971 + }, + { + "epoch": 1.2609738546414009, + "grad_norm": 2.2091705799102783, + "learning_rate": 2.9033981620004273e-05, + "loss": 0.4673, + "step": 1972 + }, + { + "epoch": 1.2616134964419925, + "grad_norm": 2.3184378147125244, + "learning_rate": 2.9023295575977776e-05, + "loss": 0.4996, + "step": 1973 + }, + { + "epoch": 1.2622531382425841, + "grad_norm": 2.452347993850708, + "learning_rate": 2.9012609531951272e-05, + "loss": 0.5428, + "step": 1974 + }, + { + "epoch": 1.2628927800431757, + "grad_norm": 2.132094621658325, + "learning_rate": 2.900192348792477e-05, + "loss": 0.4738, + "step": 1975 + }, + { + "epoch": 1.2635324218437676, + "grad_norm": 2.8011162281036377, + "learning_rate": 2.8991237443898274e-05, + "loss": 0.5909, + "step": 1976 + }, + { + "epoch": 1.2641720636443592, + "grad_norm": 2.4866397380828857, + "learning_rate": 2.898055139987177e-05, + "loss": 0.5347, + "step": 1977 + }, + { + "epoch": 1.2648117054449508, + "grad_norm": 2.5434610843658447, + "learning_rate": 2.8969865355845266e-05, + "loss": 0.5399, + "step": 1978 + }, + { + "epoch": 1.2654513472455424, + "grad_norm": 2.4744534492492676, + "learning_rate": 2.895917931181877e-05, + "loss": 0.5092, + "step": 1979 + }, + { + "epoch": 1.266090989046134, + "grad_norm": 2.4128072261810303, + "learning_rate": 2.8948493267792265e-05, + "loss": 0.5439, + "step": 1980 + }, + { + "epoch": 1.2667306308467259, + "grad_norm": 2.2428719997406006, + "learning_rate": 2.893780722376576e-05, + "loss": 0.5078, + "step": 1981 + }, + { + "epoch": 1.2673702726473175, + "grad_norm": 2.4592859745025635, + "learning_rate": 2.8927121179739264e-05, + "loss": 0.545, + "step": 1982 + }, + { + "epoch": 1.2680099144479091, + "grad_norm": 2.5110604763031006, + "learning_rate": 2.891643513571276e-05, + "loss": 0.5183, + "step": 1983 + }, + { + "epoch": 1.268649556248501, + "grad_norm": 2.4705893993377686, + "learning_rate": 2.890574909168626e-05, + "loss": 0.576, + "step": 1984 + }, + { + "epoch": 1.2692891980490926, + "grad_norm": 2.5122761726379395, + "learning_rate": 2.8895063047659758e-05, + "loss": 0.5709, + "step": 1985 + }, + { + "epoch": 1.2699288398496842, + "grad_norm": 2.5504796504974365, + "learning_rate": 2.8884377003633258e-05, + "loss": 0.4922, + "step": 1986 + }, + { + "epoch": 1.2705684816502758, + "grad_norm": 2.3992555141448975, + "learning_rate": 2.8873690959606754e-05, + "loss": 0.5057, + "step": 1987 + }, + { + "epoch": 1.2712081234508674, + "grad_norm": 2.371886968612671, + "learning_rate": 2.8863004915580256e-05, + "loss": 0.5184, + "step": 1988 + }, + { + "epoch": 1.2718477652514593, + "grad_norm": 2.611604690551758, + "learning_rate": 2.8852318871553752e-05, + "loss": 0.5043, + "step": 1989 + }, + { + "epoch": 1.2724874070520509, + "grad_norm": 2.582867383956909, + "learning_rate": 2.8841632827527248e-05, + "loss": 0.5469, + "step": 1990 + }, + { + "epoch": 1.2731270488526425, + "grad_norm": 2.025519371032715, + "learning_rate": 2.883094678350075e-05, + "loss": 0.4452, + "step": 1991 + }, + { + "epoch": 1.2737666906532341, + "grad_norm": 2.218400239944458, + "learning_rate": 2.8820260739474247e-05, + "loss": 0.5025, + "step": 1992 + }, + { + "epoch": 1.2744063324538257, + "grad_norm": 2.2676448822021484, + "learning_rate": 2.8809574695447743e-05, + "loss": 0.504, + "step": 1993 + }, + { + "epoch": 1.2750459742544176, + "grad_norm": 2.5380170345306396, + "learning_rate": 2.8798888651421246e-05, + "loss": 0.5355, + "step": 1994 + }, + { + "epoch": 1.2756856160550092, + "grad_norm": 2.441412925720215, + 
"learning_rate": 2.878820260739474e-05, + "loss": 0.5809, + "step": 1995 + }, + { + "epoch": 1.2763252578556008, + "grad_norm": 2.257662296295166, + "learning_rate": 2.877751656336824e-05, + "loss": 0.5474, + "step": 1996 + }, + { + "epoch": 1.2769648996561926, + "grad_norm": 2.474874258041382, + "learning_rate": 2.876683051934174e-05, + "loss": 0.5936, + "step": 1997 + }, + { + "epoch": 1.2776045414567843, + "grad_norm": 2.524951696395874, + "learning_rate": 2.875614447531524e-05, + "loss": 0.56, + "step": 1998 + }, + { + "epoch": 1.2782441832573759, + "grad_norm": 2.394655227661133, + "learning_rate": 2.8745458431288743e-05, + "loss": 0.5084, + "step": 1999 + }, + { + "epoch": 1.2788838250579675, + "grad_norm": 2.157370090484619, + "learning_rate": 2.873477238726224e-05, + "loss": 0.4924, + "step": 2000 + }, + { + "epoch": 1.279523466858559, + "grad_norm": 2.449563503265381, + "learning_rate": 2.8724086343235734e-05, + "loss": 0.5842, + "step": 2001 + }, + { + "epoch": 1.280163108659151, + "grad_norm": 2.3932130336761475, + "learning_rate": 2.8713400299209237e-05, + "loss": 0.5419, + "step": 2002 + }, + { + "epoch": 1.2808027504597426, + "grad_norm": 2.4203569889068604, + "learning_rate": 2.8702714255182733e-05, + "loss": 0.5544, + "step": 2003 + }, + { + "epoch": 1.2814423922603342, + "grad_norm": 2.7092268466949463, + "learning_rate": 2.869202821115623e-05, + "loss": 0.6667, + "step": 2004 + }, + { + "epoch": 1.2820820340609258, + "grad_norm": 2.446917772293091, + "learning_rate": 2.8681342167129732e-05, + "loss": 0.5398, + "step": 2005 + }, + { + "epoch": 1.2827216758615174, + "grad_norm": 2.144728183746338, + "learning_rate": 2.8670656123103228e-05, + "loss": 0.5031, + "step": 2006 + }, + { + "epoch": 1.2833613176621093, + "grad_norm": 1.9469785690307617, + "learning_rate": 2.8659970079076727e-05, + "loss": 0.4171, + "step": 2007 + }, + { + "epoch": 1.2840009594627009, + "grad_norm": 2.4777307510375977, + "learning_rate": 2.8649284035050227e-05, + "loss": 0.5256, + "step": 2008 + }, + { + "epoch": 1.2846406012632925, + "grad_norm": 2.231151819229126, + "learning_rate": 2.8638597991023726e-05, + "loss": 0.5158, + "step": 2009 + }, + { + "epoch": 1.2852802430638843, + "grad_norm": 2.5464847087860107, + "learning_rate": 2.8627911946997222e-05, + "loss": 0.5611, + "step": 2010 + }, + { + "epoch": 1.285919884864476, + "grad_norm": 2.2185094356536865, + "learning_rate": 2.8617225902970725e-05, + "loss": 0.4874, + "step": 2011 + }, + { + "epoch": 1.2865595266650676, + "grad_norm": 2.411947011947632, + "learning_rate": 2.860653985894422e-05, + "loss": 0.5173, + "step": 2012 + }, + { + "epoch": 1.2871991684656592, + "grad_norm": 2.1727848052978516, + "learning_rate": 2.8595853814917717e-05, + "loss": 0.4461, + "step": 2013 + }, + { + "epoch": 1.2878388102662508, + "grad_norm": 2.1403584480285645, + "learning_rate": 2.858516777089122e-05, + "loss": 0.4893, + "step": 2014 + }, + { + "epoch": 1.2884784520668426, + "grad_norm": 2.215038299560547, + "learning_rate": 2.8574481726864715e-05, + "loss": 0.509, + "step": 2015 + }, + { + "epoch": 1.2891180938674343, + "grad_norm": 2.4422502517700195, + "learning_rate": 2.856379568283821e-05, + "loss": 0.5538, + "step": 2016 + }, + { + "epoch": 1.2897577356680259, + "grad_norm": 2.3239777088165283, + "learning_rate": 2.8553109638811714e-05, + "loss": 0.5624, + "step": 2017 + }, + { + "epoch": 1.2903973774686177, + "grad_norm": 2.610250949859619, + "learning_rate": 2.854242359478521e-05, + "loss": 0.6156, + "step": 2018 + }, + { + "epoch": 
1.291037019269209, + "grad_norm": 2.389676332473755, + "learning_rate": 2.853173755075871e-05, + "loss": 0.5765, + "step": 2019 + }, + { + "epoch": 1.291676661069801, + "grad_norm": 2.3729984760284424, + "learning_rate": 2.852105150673221e-05, + "loss": 0.5785, + "step": 2020 + }, + { + "epoch": 1.2923163028703926, + "grad_norm": 2.128331422805786, + "learning_rate": 2.8510365462705708e-05, + "loss": 0.4843, + "step": 2021 + }, + { + "epoch": 1.2929559446709842, + "grad_norm": 1.9310767650604248, + "learning_rate": 2.8499679418679204e-05, + "loss": 0.4125, + "step": 2022 + }, + { + "epoch": 1.293595586471576, + "grad_norm": 2.1316521167755127, + "learning_rate": 2.8488993374652707e-05, + "loss": 0.4842, + "step": 2023 + }, + { + "epoch": 1.2942352282721676, + "grad_norm": 2.2978675365448, + "learning_rate": 2.8478307330626203e-05, + "loss": 0.5318, + "step": 2024 + }, + { + "epoch": 1.2948748700727593, + "grad_norm": 2.1795082092285156, + "learning_rate": 2.84676212865997e-05, + "loss": 0.508, + "step": 2025 + }, + { + "epoch": 1.2955145118733509, + "grad_norm": 2.3023741245269775, + "learning_rate": 2.84569352425732e-05, + "loss": 0.4939, + "step": 2026 + }, + { + "epoch": 1.2961541536739425, + "grad_norm": 2.1928141117095947, + "learning_rate": 2.8446249198546698e-05, + "loss": 0.5242, + "step": 2027 + }, + { + "epoch": 1.2967937954745343, + "grad_norm": 2.3251402378082275, + "learning_rate": 2.84355631545202e-05, + "loss": 0.5041, + "step": 2028 + }, + { + "epoch": 1.297433437275126, + "grad_norm": 2.124814033508301, + "learning_rate": 2.8424877110493696e-05, + "loss": 0.4453, + "step": 2029 + }, + { + "epoch": 1.2980730790757176, + "grad_norm": 2.234825849533081, + "learning_rate": 2.8414191066467196e-05, + "loss": 0.4578, + "step": 2030 + }, + { + "epoch": 1.2987127208763094, + "grad_norm": 2.198017120361328, + "learning_rate": 2.8403505022440695e-05, + "loss": 0.4864, + "step": 2031 + }, + { + "epoch": 1.299352362676901, + "grad_norm": 2.599411725997925, + "learning_rate": 2.8392818978414194e-05, + "loss": 0.586, + "step": 2032 + }, + { + "epoch": 1.2999920044774926, + "grad_norm": 3.019258737564087, + "learning_rate": 2.838213293438769e-05, + "loss": 0.6559, + "step": 2033 + }, + { + "epoch": 1.3006316462780843, + "grad_norm": 2.443129539489746, + "learning_rate": 2.8371446890361193e-05, + "loss": 0.5496, + "step": 2034 + }, + { + "epoch": 1.3012712880786759, + "grad_norm": 2.0341763496398926, + "learning_rate": 2.836076084633469e-05, + "loss": 0.4104, + "step": 2035 + }, + { + "epoch": 1.3019109298792677, + "grad_norm": 2.5824708938598633, + "learning_rate": 2.8350074802308185e-05, + "loss": 0.5244, + "step": 2036 + }, + { + "epoch": 1.3025505716798593, + "grad_norm": 2.288459300994873, + "learning_rate": 2.8339388758281688e-05, + "loss": 0.5062, + "step": 2037 + }, + { + "epoch": 1.303190213480451, + "grad_norm": 2.3950438499450684, + "learning_rate": 2.8328702714255184e-05, + "loss": 0.5136, + "step": 2038 + }, + { + "epoch": 1.3038298552810426, + "grad_norm": 2.1054158210754395, + "learning_rate": 2.831801667022868e-05, + "loss": 0.4868, + "step": 2039 + }, + { + "epoch": 1.3044694970816342, + "grad_norm": 2.3945541381835938, + "learning_rate": 2.8307330626202182e-05, + "loss": 0.4904, + "step": 2040 + }, + { + "epoch": 1.305109138882226, + "grad_norm": 2.5768918991088867, + "learning_rate": 2.829664458217568e-05, + "loss": 0.5297, + "step": 2041 + }, + { + "epoch": 1.3057487806828176, + "grad_norm": 1.9596999883651733, + "learning_rate": 2.8285958538149178e-05, + "loss": 
0.4286, + "step": 2042 + }, + { + "epoch": 1.3063884224834093, + "grad_norm": 2.276869535446167, + "learning_rate": 2.8275272494122677e-05, + "loss": 0.508, + "step": 2043 + }, + { + "epoch": 1.307028064284001, + "grad_norm": 2.4440102577209473, + "learning_rate": 2.8264586450096177e-05, + "loss": 0.5578, + "step": 2044 + }, + { + "epoch": 1.3076677060845927, + "grad_norm": 2.5011610984802246, + "learning_rate": 2.8253900406069672e-05, + "loss": 0.5728, + "step": 2045 + }, + { + "epoch": 1.3083073478851843, + "grad_norm": 2.142320394515991, + "learning_rate": 2.8243214362043175e-05, + "loss": 0.4671, + "step": 2046 + }, + { + "epoch": 1.308946989685776, + "grad_norm": 2.733914613723755, + "learning_rate": 2.823252831801667e-05, + "loss": 0.6407, + "step": 2047 + }, + { + "epoch": 1.3095866314863676, + "grad_norm": 2.557730197906494, + "learning_rate": 2.8221842273990167e-05, + "loss": 0.5838, + "step": 2048 + }, + { + "epoch": 1.3102262732869594, + "grad_norm": 2.604875326156616, + "learning_rate": 2.821115622996367e-05, + "loss": 0.6167, + "step": 2049 + }, + { + "epoch": 1.310865915087551, + "grad_norm": 2.5942413806915283, + "learning_rate": 2.8200470185937166e-05, + "loss": 0.5882, + "step": 2050 + }, + { + "epoch": 1.3115055568881426, + "grad_norm": 2.3755242824554443, + "learning_rate": 2.8189784141910662e-05, + "loss": 0.4943, + "step": 2051 + }, + { + "epoch": 1.3121451986887342, + "grad_norm": 2.50661301612854, + "learning_rate": 2.8179098097884165e-05, + "loss": 0.5179, + "step": 2052 + }, + { + "epoch": 1.3127848404893259, + "grad_norm": 2.281099319458008, + "learning_rate": 2.8168412053857664e-05, + "loss": 0.4805, + "step": 2053 + }, + { + "epoch": 1.3134244822899177, + "grad_norm": 2.772557258605957, + "learning_rate": 2.8157726009831163e-05, + "loss": 0.6278, + "step": 2054 + }, + { + "epoch": 1.3140641240905093, + "grad_norm": 2.296475410461426, + "learning_rate": 2.8147039965804663e-05, + "loss": 0.5352, + "step": 2055 + }, + { + "epoch": 1.314703765891101, + "grad_norm": 2.5317020416259766, + "learning_rate": 2.813635392177816e-05, + "loss": 0.5365, + "step": 2056 + }, + { + "epoch": 1.3153434076916928, + "grad_norm": 2.149646759033203, + "learning_rate": 2.812566787775166e-05, + "loss": 0.4913, + "step": 2057 + }, + { + "epoch": 1.3159830494922844, + "grad_norm": 2.709839105606079, + "learning_rate": 2.8114981833725157e-05, + "loss": 0.6162, + "step": 2058 + }, + { + "epoch": 1.316622691292876, + "grad_norm": 2.190150499343872, + "learning_rate": 2.8104295789698653e-05, + "loss": 0.4981, + "step": 2059 + }, + { + "epoch": 1.3172623330934676, + "grad_norm": 2.7094614505767822, + "learning_rate": 2.8093609745672156e-05, + "loss": 0.5431, + "step": 2060 + }, + { + "epoch": 1.3179019748940592, + "grad_norm": 2.36130690574646, + "learning_rate": 2.8082923701645652e-05, + "loss": 0.522, + "step": 2061 + }, + { + "epoch": 1.318541616694651, + "grad_norm": 2.252898693084717, + "learning_rate": 2.8072237657619148e-05, + "loss": 0.5399, + "step": 2062 + }, + { + "epoch": 1.3191812584952427, + "grad_norm": 1.9746092557907104, + "learning_rate": 2.806155161359265e-05, + "loss": 0.4657, + "step": 2063 + }, + { + "epoch": 1.3198209002958343, + "grad_norm": 2.377559185028076, + "learning_rate": 2.8050865569566147e-05, + "loss": 0.5234, + "step": 2064 + }, + { + "epoch": 1.320460542096426, + "grad_norm": 2.5517830848693848, + "learning_rate": 2.8040179525539646e-05, + "loss": 0.5731, + "step": 2065 + }, + { + "epoch": 1.3211001838970176, + "grad_norm": 2.527186155319214, + 
"learning_rate": 2.8029493481513146e-05, + "loss": 0.5295, + "step": 2066 + }, + { + "epoch": 1.3217398256976094, + "grad_norm": 2.2997121810913086, + "learning_rate": 2.8018807437486645e-05, + "loss": 0.4839, + "step": 2067 + }, + { + "epoch": 1.322379467498201, + "grad_norm": 2.2630481719970703, + "learning_rate": 2.800812139346014e-05, + "loss": 0.4631, + "step": 2068 + }, + { + "epoch": 1.3230191092987926, + "grad_norm": 2.2099435329437256, + "learning_rate": 2.7997435349433644e-05, + "loss": 0.4858, + "step": 2069 + }, + { + "epoch": 1.3236587510993845, + "grad_norm": 2.2741899490356445, + "learning_rate": 2.798674930540714e-05, + "loss": 0.5324, + "step": 2070 + }, + { + "epoch": 1.324298392899976, + "grad_norm": 2.35532808303833, + "learning_rate": 2.7976063261380636e-05, + "loss": 0.5085, + "step": 2071 + }, + { + "epoch": 1.3249380347005677, + "grad_norm": 2.399014711380005, + "learning_rate": 2.796537721735414e-05, + "loss": 0.5461, + "step": 2072 + }, + { + "epoch": 1.3255776765011593, + "grad_norm": 2.3861753940582275, + "learning_rate": 2.7954691173327634e-05, + "loss": 0.4752, + "step": 2073 + }, + { + "epoch": 1.326217318301751, + "grad_norm": 2.0552330017089844, + "learning_rate": 2.794400512930113e-05, + "loss": 0.4591, + "step": 2074 + }, + { + "epoch": 1.3268569601023428, + "grad_norm": 2.3268253803253174, + "learning_rate": 2.7933319085274633e-05, + "loss": 0.5582, + "step": 2075 + }, + { + "epoch": 1.3274966019029344, + "grad_norm": 2.1823971271514893, + "learning_rate": 2.7922633041248132e-05, + "loss": 0.4822, + "step": 2076 + }, + { + "epoch": 1.328136243703526, + "grad_norm": 2.5841736793518066, + "learning_rate": 2.791194699722163e-05, + "loss": 0.6848, + "step": 2077 + }, + { + "epoch": 1.3287758855041176, + "grad_norm": 2.1023504734039307, + "learning_rate": 2.790126095319513e-05, + "loss": 0.4682, + "step": 2078 + }, + { + "epoch": 1.3294155273047092, + "grad_norm": 1.984799861907959, + "learning_rate": 2.7890574909168627e-05, + "loss": 0.4474, + "step": 2079 + }, + { + "epoch": 1.330055169105301, + "grad_norm": 2.2900149822235107, + "learning_rate": 2.7879888865142123e-05, + "loss": 0.4714, + "step": 2080 + }, + { + "epoch": 1.3306948109058927, + "grad_norm": 2.4979443550109863, + "learning_rate": 2.7869202821115626e-05, + "loss": 0.5637, + "step": 2081 + }, + { + "epoch": 1.3313344527064843, + "grad_norm": 2.433444023132324, + "learning_rate": 2.7858516777089122e-05, + "loss": 0.5719, + "step": 2082 + }, + { + "epoch": 1.3319740945070762, + "grad_norm": 2.504610300064087, + "learning_rate": 2.7847830733062625e-05, + "loss": 0.5263, + "step": 2083 + }, + { + "epoch": 1.3326137363076678, + "grad_norm": 2.452442169189453, + "learning_rate": 2.783714468903612e-05, + "loss": 0.5364, + "step": 2084 + }, + { + "epoch": 1.3332533781082594, + "grad_norm": 3.985201120376587, + "learning_rate": 2.7826458645009616e-05, + "loss": 0.6818, + "step": 2085 + }, + { + "epoch": 1.333893019908851, + "grad_norm": 2.133826971054077, + "learning_rate": 2.781577260098312e-05, + "loss": 0.4908, + "step": 2086 + }, + { + "epoch": 1.3345326617094426, + "grad_norm": 2.267771005630493, + "learning_rate": 2.7805086556956615e-05, + "loss": 0.551, + "step": 2087 + }, + { + "epoch": 1.3351723035100345, + "grad_norm": 2.3498358726501465, + "learning_rate": 2.7794400512930115e-05, + "loss": 0.5345, + "step": 2088 + }, + { + "epoch": 1.335811945310626, + "grad_norm": 2.802976369857788, + "learning_rate": 2.7783714468903614e-05, + "loss": 0.5857, + "step": 2089 + }, + { + "epoch": 
1.3364515871112177, + "grad_norm": 2.341111660003662, + "learning_rate": 2.7773028424877113e-05, + "loss": 0.4999, + "step": 2090 + }, + { + "epoch": 1.3370912289118093, + "grad_norm": 2.1392176151275635, + "learning_rate": 2.776234238085061e-05, + "loss": 0.4702, + "step": 2091 + }, + { + "epoch": 1.337730870712401, + "grad_norm": 2.261763572692871, + "learning_rate": 2.7751656336824112e-05, + "loss": 0.4695, + "step": 2092 + }, + { + "epoch": 1.3383705125129928, + "grad_norm": 2.3268396854400635, + "learning_rate": 2.7740970292797608e-05, + "loss": 0.5237, + "step": 2093 + }, + { + "epoch": 1.3390101543135844, + "grad_norm": 2.6014983654022217, + "learning_rate": 2.7730284248771104e-05, + "loss": 0.5824, + "step": 2094 + }, + { + "epoch": 1.339649796114176, + "grad_norm": 2.150987148284912, + "learning_rate": 2.7719598204744607e-05, + "loss": 0.4741, + "step": 2095 + }, + { + "epoch": 1.3402894379147678, + "grad_norm": 2.799398422241211, + "learning_rate": 2.7708912160718103e-05, + "loss": 0.5981, + "step": 2096 + }, + { + "epoch": 1.3409290797153595, + "grad_norm": 2.435011148452759, + "learning_rate": 2.76982261166916e-05, + "loss": 0.5574, + "step": 2097 + }, + { + "epoch": 1.341568721515951, + "grad_norm": 2.4173593521118164, + "learning_rate": 2.76875400726651e-05, + "loss": 0.5573, + "step": 2098 + }, + { + "epoch": 1.3422083633165427, + "grad_norm": 2.50612473487854, + "learning_rate": 2.76768540286386e-05, + "loss": 0.6099, + "step": 2099 + }, + { + "epoch": 1.3428480051171343, + "grad_norm": 2.367081642150879, + "learning_rate": 2.7666167984612097e-05, + "loss": 0.5173, + "step": 2100 + }, + { + "epoch": 1.3434876469177262, + "grad_norm": 2.1075406074523926, + "learning_rate": 2.76554819405856e-05, + "loss": 0.4961, + "step": 2101 + }, + { + "epoch": 1.3441272887183178, + "grad_norm": 2.4579215049743652, + "learning_rate": 2.7644795896559095e-05, + "loss": 0.5693, + "step": 2102 + }, + { + "epoch": 1.3447669305189094, + "grad_norm": 2.2384231090545654, + "learning_rate": 2.763410985253259e-05, + "loss": 0.5457, + "step": 2103 + }, + { + "epoch": 1.345406572319501, + "grad_norm": 2.145146131515503, + "learning_rate": 2.7623423808506094e-05, + "loss": 0.496, + "step": 2104 + }, + { + "epoch": 1.3460462141200926, + "grad_norm": 2.1262755393981934, + "learning_rate": 2.761273776447959e-05, + "loss": 0.5263, + "step": 2105 + }, + { + "epoch": 1.3466858559206845, + "grad_norm": 2.1277053356170654, + "learning_rate": 2.7602051720453086e-05, + "loss": 0.5083, + "step": 2106 + }, + { + "epoch": 1.347325497721276, + "grad_norm": 2.3164451122283936, + "learning_rate": 2.759136567642659e-05, + "loss": 0.556, + "step": 2107 + }, + { + "epoch": 1.3479651395218677, + "grad_norm": 2.571687936782837, + "learning_rate": 2.7580679632400085e-05, + "loss": 0.582, + "step": 2108 + }, + { + "epoch": 1.3486047813224595, + "grad_norm": 2.159640312194824, + "learning_rate": 2.7569993588373584e-05, + "loss": 0.5143, + "step": 2109 + }, + { + "epoch": 1.3492444231230512, + "grad_norm": 2.2526590824127197, + "learning_rate": 2.7559307544347084e-05, + "loss": 0.4905, + "step": 2110 + }, + { + "epoch": 1.3498840649236428, + "grad_norm": 2.506826639175415, + "learning_rate": 2.7548621500320583e-05, + "loss": 0.5409, + "step": 2111 + }, + { + "epoch": 1.3505237067242344, + "grad_norm": 2.7504212856292725, + "learning_rate": 2.7537935456294082e-05, + "loss": 0.5882, + "step": 2112 + }, + { + "epoch": 1.351163348524826, + "grad_norm": 2.5309371948242188, + "learning_rate": 2.752724941226758e-05, + "loss": 0.5614, 
+ "step": 2113 + }, + { + "epoch": 1.3518029903254178, + "grad_norm": 2.4666011333465576, + "learning_rate": 2.7516563368241078e-05, + "loss": 0.588, + "step": 2114 + }, + { + "epoch": 1.3524426321260095, + "grad_norm": 2.1948835849761963, + "learning_rate": 2.750587732421458e-05, + "loss": 0.4583, + "step": 2115 + }, + { + "epoch": 1.353082273926601, + "grad_norm": 2.1075820922851562, + "learning_rate": 2.7495191280188076e-05, + "loss": 0.4812, + "step": 2116 + }, + { + "epoch": 1.3537219157271927, + "grad_norm": 2.339005947113037, + "learning_rate": 2.7484505236161572e-05, + "loss": 0.5513, + "step": 2117 + }, + { + "epoch": 1.3543615575277843, + "grad_norm": 2.250328302383423, + "learning_rate": 2.7473819192135075e-05, + "loss": 0.4958, + "step": 2118 + }, + { + "epoch": 1.3550011993283761, + "grad_norm": 2.2444028854370117, + "learning_rate": 2.746313314810857e-05, + "loss": 0.4974, + "step": 2119 + }, + { + "epoch": 1.3556408411289678, + "grad_norm": 2.374600410461426, + "learning_rate": 2.7452447104082067e-05, + "loss": 0.531, + "step": 2120 + }, + { + "epoch": 1.3562804829295594, + "grad_norm": 2.4628140926361084, + "learning_rate": 2.744176106005557e-05, + "loss": 0.5539, + "step": 2121 + }, + { + "epoch": 1.3569201247301512, + "grad_norm": 2.035003900527954, + "learning_rate": 2.743107501602907e-05, + "loss": 0.4566, + "step": 2122 + }, + { + "epoch": 1.3575597665307428, + "grad_norm": 2.145343780517578, + "learning_rate": 2.7420388972002565e-05, + "loss": 0.512, + "step": 2123 + }, + { + "epoch": 1.3581994083313345, + "grad_norm": 2.484570264816284, + "learning_rate": 2.7409702927976068e-05, + "loss": 0.5532, + "step": 2124 + }, + { + "epoch": 1.358839050131926, + "grad_norm": 2.4494516849517822, + "learning_rate": 2.7399016883949564e-05, + "loss": 0.5327, + "step": 2125 + }, + { + "epoch": 1.3594786919325177, + "grad_norm": 2.1916258335113525, + "learning_rate": 2.738833083992306e-05, + "loss": 0.4664, + "step": 2126 + }, + { + "epoch": 1.3601183337331095, + "grad_norm": 2.4104466438293457, + "learning_rate": 2.7377644795896563e-05, + "loss": 0.5273, + "step": 2127 + }, + { + "epoch": 1.3607579755337011, + "grad_norm": 2.0217537879943848, + "learning_rate": 2.736695875187006e-05, + "loss": 0.4607, + "step": 2128 + }, + { + "epoch": 1.3613976173342928, + "grad_norm": 2.6108813285827637, + "learning_rate": 2.7356272707843555e-05, + "loss": 0.5809, + "step": 2129 + }, + { + "epoch": 1.3620372591348844, + "grad_norm": 2.5624256134033203, + "learning_rate": 2.7345586663817057e-05, + "loss": 0.5379, + "step": 2130 + }, + { + "epoch": 1.362676900935476, + "grad_norm": 2.43609356880188, + "learning_rate": 2.7334900619790553e-05, + "loss": 0.5331, + "step": 2131 + }, + { + "epoch": 1.3633165427360678, + "grad_norm": 2.9627552032470703, + "learning_rate": 2.7324214575764053e-05, + "loss": 0.598, + "step": 2132 + }, + { + "epoch": 1.3639561845366595, + "grad_norm": 2.130108118057251, + "learning_rate": 2.7313528531737552e-05, + "loss": 0.4547, + "step": 2133 + }, + { + "epoch": 1.364595826337251, + "grad_norm": 2.660867691040039, + "learning_rate": 2.730284248771105e-05, + "loss": 0.5587, + "step": 2134 + }, + { + "epoch": 1.365235468137843, + "grad_norm": 2.347076892852783, + "learning_rate": 2.7292156443684547e-05, + "loss": 0.5002, + "step": 2135 + }, + { + "epoch": 1.3658751099384345, + "grad_norm": 2.2299551963806152, + "learning_rate": 2.728147039965805e-05, + "loss": 0.5184, + "step": 2136 + }, + { + "epoch": 1.3665147517390261, + "grad_norm": 2.504868507385254, + "learning_rate": 
2.7270784355631546e-05, + "loss": 0.5281, + "step": 2137 + }, + { + "epoch": 1.3671543935396178, + "grad_norm": 2.4945545196533203, + "learning_rate": 2.7260098311605042e-05, + "loss": 0.5573, + "step": 2138 + }, + { + "epoch": 1.3677940353402094, + "grad_norm": 2.51690411567688, + "learning_rate": 2.7249412267578545e-05, + "loss": 0.515, + "step": 2139 + }, + { + "epoch": 1.3684336771408012, + "grad_norm": 2.4938085079193115, + "learning_rate": 2.723872622355204e-05, + "loss": 0.6039, + "step": 2140 + }, + { + "epoch": 1.3690733189413928, + "grad_norm": 2.6224842071533203, + "learning_rate": 2.7228040179525543e-05, + "loss": 0.5899, + "step": 2141 + }, + { + "epoch": 1.3697129607419845, + "grad_norm": 2.0296831130981445, + "learning_rate": 2.721735413549904e-05, + "loss": 0.4765, + "step": 2142 + }, + { + "epoch": 1.3703526025425763, + "grad_norm": 2.07785964012146, + "learning_rate": 2.7206668091472535e-05, + "loss": 0.4803, + "step": 2143 + }, + { + "epoch": 1.3709922443431677, + "grad_norm": 2.2916152477264404, + "learning_rate": 2.7195982047446038e-05, + "loss": 0.5085, + "step": 2144 + }, + { + "epoch": 1.3716318861437595, + "grad_norm": 2.2170231342315674, + "learning_rate": 2.7185296003419538e-05, + "loss": 0.4794, + "step": 2145 + }, + { + "epoch": 1.3722715279443511, + "grad_norm": 2.2349820137023926, + "learning_rate": 2.7174609959393034e-05, + "loss": 0.4767, + "step": 2146 + }, + { + "epoch": 1.3729111697449428, + "grad_norm": 2.1949474811553955, + "learning_rate": 2.7163923915366536e-05, + "loss": 0.4729, + "step": 2147 + }, + { + "epoch": 1.3735508115455346, + "grad_norm": 2.1735739707946777, + "learning_rate": 2.7153237871340032e-05, + "loss": 0.5466, + "step": 2148 + }, + { + "epoch": 1.3741904533461262, + "grad_norm": 2.28950834274292, + "learning_rate": 2.7142551827313528e-05, + "loss": 0.5615, + "step": 2149 + }, + { + "epoch": 1.3748300951467178, + "grad_norm": 2.4909775257110596, + "learning_rate": 2.713186578328703e-05, + "loss": 0.5696, + "step": 2150 + }, + { + "epoch": 1.3754697369473095, + "grad_norm": 2.452730655670166, + "learning_rate": 2.7121179739260527e-05, + "loss": 0.5121, + "step": 2151 + }, + { + "epoch": 1.376109378747901, + "grad_norm": 2.5877861976623535, + "learning_rate": 2.7110493695234023e-05, + "loss": 0.5762, + "step": 2152 + }, + { + "epoch": 1.376749020548493, + "grad_norm": 2.182504415512085, + "learning_rate": 2.7099807651207526e-05, + "loss": 0.4633, + "step": 2153 + }, + { + "epoch": 1.3773886623490845, + "grad_norm": 2.3482775688171387, + "learning_rate": 2.708912160718102e-05, + "loss": 0.5491, + "step": 2154 + }, + { + "epoch": 1.3780283041496761, + "grad_norm": 2.494331121444702, + "learning_rate": 2.707843556315452e-05, + "loss": 0.5298, + "step": 2155 + }, + { + "epoch": 1.378667945950268, + "grad_norm": 2.030103921890259, + "learning_rate": 2.706774951912802e-05, + "loss": 0.4647, + "step": 2156 + }, + { + "epoch": 1.3793075877508596, + "grad_norm": 2.099980354309082, + "learning_rate": 2.705706347510152e-05, + "loss": 0.4702, + "step": 2157 + }, + { + "epoch": 1.3799472295514512, + "grad_norm": 2.0252654552459717, + "learning_rate": 2.7046377431075016e-05, + "loss": 0.4824, + "step": 2158 + }, + { + "epoch": 1.3805868713520428, + "grad_norm": 2.409728765487671, + "learning_rate": 2.703569138704852e-05, + "loss": 0.561, + "step": 2159 + }, + { + "epoch": 1.3812265131526344, + "grad_norm": 2.1638243198394775, + "learning_rate": 2.7025005343022014e-05, + "loss": 0.4875, + "step": 2160 + }, + { + "epoch": 1.3818661549532263, + 
"grad_norm": 2.7950870990753174, + "learning_rate": 2.701431929899551e-05, + "loss": 0.6157, + "step": 2161 + }, + { + "epoch": 1.382505796753818, + "grad_norm": 2.2581465244293213, + "learning_rate": 2.7003633254969013e-05, + "loss": 0.5136, + "step": 2162 + }, + { + "epoch": 1.3831454385544095, + "grad_norm": 2.0895376205444336, + "learning_rate": 2.699294721094251e-05, + "loss": 0.4632, + "step": 2163 + }, + { + "epoch": 1.3837850803550011, + "grad_norm": 2.4460158348083496, + "learning_rate": 2.6982261166916005e-05, + "loss": 0.5093, + "step": 2164 + }, + { + "epoch": 1.3844247221555928, + "grad_norm": 2.2036702632904053, + "learning_rate": 2.6971575122889508e-05, + "loss": 0.4756, + "step": 2165 + }, + { + "epoch": 1.3850643639561846, + "grad_norm": 2.2187347412109375, + "learning_rate": 2.6960889078863004e-05, + "loss": 0.4978, + "step": 2166 + }, + { + "epoch": 1.3857040057567762, + "grad_norm": 2.660968065261841, + "learning_rate": 2.6950203034836507e-05, + "loss": 0.6019, + "step": 2167 + }, + { + "epoch": 1.3863436475573678, + "grad_norm": 2.3281824588775635, + "learning_rate": 2.6939516990810006e-05, + "loss": 0.4823, + "step": 2168 + }, + { + "epoch": 1.3869832893579597, + "grad_norm": 2.594407796859741, + "learning_rate": 2.6928830946783502e-05, + "loss": 0.5807, + "step": 2169 + }, + { + "epoch": 1.3876229311585513, + "grad_norm": 2.920761823654175, + "learning_rate": 2.6918144902757005e-05, + "loss": 0.6076, + "step": 2170 + }, + { + "epoch": 1.388262572959143, + "grad_norm": 2.4506754875183105, + "learning_rate": 2.69074588587305e-05, + "loss": 0.5178, + "step": 2171 + }, + { + "epoch": 1.3889022147597345, + "grad_norm": 2.345679998397827, + "learning_rate": 2.6896772814703997e-05, + "loss": 0.5214, + "step": 2172 + }, + { + "epoch": 1.3895418565603261, + "grad_norm": 2.139457941055298, + "learning_rate": 2.68860867706775e-05, + "loss": 0.4267, + "step": 2173 + }, + { + "epoch": 1.390181498360918, + "grad_norm": 2.454946279525757, + "learning_rate": 2.6875400726650995e-05, + "loss": 0.5434, + "step": 2174 + }, + { + "epoch": 1.3908211401615096, + "grad_norm": 2.123443365097046, + "learning_rate": 2.686471468262449e-05, + "loss": 0.4884, + "step": 2175 + }, + { + "epoch": 1.3914607819621012, + "grad_norm": 2.107789993286133, + "learning_rate": 2.6854028638597994e-05, + "loss": 0.4471, + "step": 2176 + }, + { + "epoch": 1.3921004237626928, + "grad_norm": 2.5171778202056885, + "learning_rate": 2.684334259457149e-05, + "loss": 0.5957, + "step": 2177 + }, + { + "epoch": 1.3927400655632844, + "grad_norm": 2.0856642723083496, + "learning_rate": 2.683265655054499e-05, + "loss": 0.4599, + "step": 2178 + }, + { + "epoch": 1.3933797073638763, + "grad_norm": 2.1569645404815674, + "learning_rate": 2.682197050651849e-05, + "loss": 0.4768, + "step": 2179 + }, + { + "epoch": 1.394019349164468, + "grad_norm": 2.365912675857544, + "learning_rate": 2.6811284462491988e-05, + "loss": 0.5399, + "step": 2180 + }, + { + "epoch": 1.3946589909650595, + "grad_norm": 2.346978187561035, + "learning_rate": 2.6800598418465484e-05, + "loss": 0.539, + "step": 2181 + }, + { + "epoch": 1.3952986327656514, + "grad_norm": 2.289849281311035, + "learning_rate": 2.6789912374438987e-05, + "loss": 0.5232, + "step": 2182 + }, + { + "epoch": 1.395938274566243, + "grad_norm": 1.9996365308761597, + "learning_rate": 2.6779226330412483e-05, + "loss": 0.4733, + "step": 2183 + }, + { + "epoch": 1.3965779163668346, + "grad_norm": 2.292644500732422, + "learning_rate": 2.676854028638598e-05, + "loss": 0.498, + "step": 2184 + 
}, + { + "epoch": 1.3972175581674262, + "grad_norm": 2.5906386375427246, + "learning_rate": 2.675785424235948e-05, + "loss": 0.5188, + "step": 2185 + }, + { + "epoch": 1.3978571999680178, + "grad_norm": 1.995419979095459, + "learning_rate": 2.6747168198332977e-05, + "loss": 0.438, + "step": 2186 + }, + { + "epoch": 1.3984968417686097, + "grad_norm": 2.6388261318206787, + "learning_rate": 2.6736482154306473e-05, + "loss": 0.5653, + "step": 2187 + }, + { + "epoch": 1.3991364835692013, + "grad_norm": 2.474968433380127, + "learning_rate": 2.6725796110279976e-05, + "loss": 0.5431, + "step": 2188 + }, + { + "epoch": 1.399776125369793, + "grad_norm": 2.0440359115600586, + "learning_rate": 2.6715110066253472e-05, + "loss": 0.4536, + "step": 2189 + }, + { + "epoch": 1.4004157671703845, + "grad_norm": 2.609251022338867, + "learning_rate": 2.670442402222697e-05, + "loss": 0.5955, + "step": 2190 + }, + { + "epoch": 1.4010554089709761, + "grad_norm": 2.4600322246551514, + "learning_rate": 2.6693737978200474e-05, + "loss": 0.5551, + "step": 2191 + }, + { + "epoch": 1.401695050771568, + "grad_norm": 1.9794594049453735, + "learning_rate": 2.668305193417397e-05, + "loss": 0.4636, + "step": 2192 + }, + { + "epoch": 1.4023346925721596, + "grad_norm": 2.4079878330230713, + "learning_rate": 2.6672365890147466e-05, + "loss": 0.4984, + "step": 2193 + }, + { + "epoch": 1.4029743343727512, + "grad_norm": 2.2058138847351074, + "learning_rate": 2.666167984612097e-05, + "loss": 0.4616, + "step": 2194 + }, + { + "epoch": 1.403613976173343, + "grad_norm": 2.2484633922576904, + "learning_rate": 2.6650993802094465e-05, + "loss": 0.4869, + "step": 2195 + }, + { + "epoch": 1.4042536179739347, + "grad_norm": 2.3188061714172363, + "learning_rate": 2.6640307758067968e-05, + "loss": 0.4867, + "step": 2196 + }, + { + "epoch": 1.4048932597745263, + "grad_norm": 2.2876102924346924, + "learning_rate": 2.6629621714041464e-05, + "loss": 0.5466, + "step": 2197 + }, + { + "epoch": 1.405532901575118, + "grad_norm": 2.480281114578247, + "learning_rate": 2.661893567001496e-05, + "loss": 0.5372, + "step": 2198 + }, + { + "epoch": 1.4061725433757095, + "grad_norm": 2.280669927597046, + "learning_rate": 2.6608249625988462e-05, + "loss": 0.4989, + "step": 2199 + }, + { + "epoch": 1.4068121851763014, + "grad_norm": 2.2180023193359375, + "learning_rate": 2.659756358196196e-05, + "loss": 0.4653, + "step": 2200 + }, + { + "epoch": 1.407451826976893, + "grad_norm": 2.4890365600585938, + "learning_rate": 2.6586877537935458e-05, + "loss": 0.5191, + "step": 2201 + }, + { + "epoch": 1.4080914687774846, + "grad_norm": 2.0783698558807373, + "learning_rate": 2.6576191493908957e-05, + "loss": 0.4173, + "step": 2202 + }, + { + "epoch": 1.4087311105780762, + "grad_norm": 2.147512197494507, + "learning_rate": 2.6565505449882456e-05, + "loss": 0.4562, + "step": 2203 + }, + { + "epoch": 1.4093707523786678, + "grad_norm": 2.341219425201416, + "learning_rate": 2.6554819405855952e-05, + "loss": 0.5178, + "step": 2204 + }, + { + "epoch": 1.4100103941792597, + "grad_norm": 2.4911327362060547, + "learning_rate": 2.6544133361829455e-05, + "loss": 0.5357, + "step": 2205 + }, + { + "epoch": 1.4106500359798513, + "grad_norm": 2.4268505573272705, + "learning_rate": 2.653344731780295e-05, + "loss": 0.5103, + "step": 2206 + }, + { + "epoch": 1.411289677780443, + "grad_norm": 2.058560848236084, + "learning_rate": 2.6522761273776447e-05, + "loss": 0.4807, + "step": 2207 + }, + { + "epoch": 1.4119293195810347, + "grad_norm": 2.4109654426574707, + "learning_rate": 
2.651207522974995e-05, + "loss": 0.5472, + "step": 2208 + }, + { + "epoch": 1.4125689613816264, + "grad_norm": 2.400390386581421, + "learning_rate": 2.6501389185723446e-05, + "loss": 0.5038, + "step": 2209 + }, + { + "epoch": 1.413208603182218, + "grad_norm": 2.4504222869873047, + "learning_rate": 2.6490703141696942e-05, + "loss": 0.5602, + "step": 2210 + }, + { + "epoch": 1.4138482449828096, + "grad_norm": 2.4026997089385986, + "learning_rate": 2.6480017097670445e-05, + "loss": 0.5372, + "step": 2211 + }, + { + "epoch": 1.4144878867834012, + "grad_norm": 2.321833848953247, + "learning_rate": 2.646933105364394e-05, + "loss": 0.52, + "step": 2212 + }, + { + "epoch": 1.415127528583993, + "grad_norm": 2.1148695945739746, + "learning_rate": 2.645864500961744e-05, + "loss": 0.4957, + "step": 2213 + }, + { + "epoch": 1.4157671703845847, + "grad_norm": 2.3939766883850098, + "learning_rate": 2.6447958965590943e-05, + "loss": 0.5347, + "step": 2214 + }, + { + "epoch": 1.4164068121851763, + "grad_norm": 2.4524459838867188, + "learning_rate": 2.643727292156444e-05, + "loss": 0.5177, + "step": 2215 + }, + { + "epoch": 1.417046453985768, + "grad_norm": 2.472960948944092, + "learning_rate": 2.6426586877537935e-05, + "loss": 0.5091, + "step": 2216 + }, + { + "epoch": 1.4176860957863595, + "grad_norm": 2.2612154483795166, + "learning_rate": 2.6415900833511437e-05, + "loss": 0.4907, + "step": 2217 + }, + { + "epoch": 1.4183257375869514, + "grad_norm": 2.2258989810943604, + "learning_rate": 2.6405214789484933e-05, + "loss": 0.4859, + "step": 2218 + }, + { + "epoch": 1.418965379387543, + "grad_norm": 2.5067334175109863, + "learning_rate": 2.639452874545843e-05, + "loss": 0.5395, + "step": 2219 + }, + { + "epoch": 1.4196050211881346, + "grad_norm": 2.2099735736846924, + "learning_rate": 2.6383842701431932e-05, + "loss": 0.4753, + "step": 2220 + }, + { + "epoch": 1.4202446629887264, + "grad_norm": 2.1398520469665527, + "learning_rate": 2.6373156657405428e-05, + "loss": 0.4466, + "step": 2221 + }, + { + "epoch": 1.420884304789318, + "grad_norm": 2.32903790473938, + "learning_rate": 2.6362470613378924e-05, + "loss": 0.5017, + "step": 2222 + }, + { + "epoch": 1.4215239465899097, + "grad_norm": 2.1925742626190186, + "learning_rate": 2.6351784569352427e-05, + "loss": 0.5192, + "step": 2223 + }, + { + "epoch": 1.4221635883905013, + "grad_norm": 1.9855402708053589, + "learning_rate": 2.6341098525325926e-05, + "loss": 0.4411, + "step": 2224 + }, + { + "epoch": 1.422803230191093, + "grad_norm": 1.9421648979187012, + "learning_rate": 2.6330412481299426e-05, + "loss": 0.4103, + "step": 2225 + }, + { + "epoch": 1.4234428719916847, + "grad_norm": 1.8968526124954224, + "learning_rate": 2.6319726437272925e-05, + "loss": 0.4386, + "step": 2226 + }, + { + "epoch": 1.4240825137922763, + "grad_norm": 2.6044423580169678, + "learning_rate": 2.630904039324642e-05, + "loss": 0.5969, + "step": 2227 + }, + { + "epoch": 1.424722155592868, + "grad_norm": 2.27577543258667, + "learning_rate": 2.6298354349219924e-05, + "loss": 0.4865, + "step": 2228 + }, + { + "epoch": 1.4253617973934596, + "grad_norm": 2.3630144596099854, + "learning_rate": 2.628766830519342e-05, + "loss": 0.5102, + "step": 2229 + }, + { + "epoch": 1.4260014391940512, + "grad_norm": 2.4413297176361084, + "learning_rate": 2.6276982261166916e-05, + "loss": 0.5066, + "step": 2230 + }, + { + "epoch": 1.426641080994643, + "grad_norm": 2.246939182281494, + "learning_rate": 2.6266296217140418e-05, + "loss": 0.4822, + "step": 2231 + }, + { + "epoch": 1.4272807227952347, + 
"grad_norm": 2.679086685180664, + "learning_rate": 2.6255610173113914e-05, + "loss": 0.615, + "step": 2232 + }, + { + "epoch": 1.4279203645958263, + "grad_norm": 2.3805325031280518, + "learning_rate": 2.624492412908741e-05, + "loss": 0.5101, + "step": 2233 + }, + { + "epoch": 1.4285600063964181, + "grad_norm": 2.2592151165008545, + "learning_rate": 2.6234238085060913e-05, + "loss": 0.5103, + "step": 2234 + }, + { + "epoch": 1.4291996481970097, + "grad_norm": 2.27573823928833, + "learning_rate": 2.622355204103441e-05, + "loss": 0.5052, + "step": 2235 + }, + { + "epoch": 1.4298392899976013, + "grad_norm": 2.236330986022949, + "learning_rate": 2.621286599700791e-05, + "loss": 0.4856, + "step": 2236 + }, + { + "epoch": 1.430478931798193, + "grad_norm": 2.019353151321411, + "learning_rate": 2.6202179952981408e-05, + "loss": 0.4785, + "step": 2237 + }, + { + "epoch": 1.4311185735987846, + "grad_norm": 2.6522879600524902, + "learning_rate": 2.6191493908954907e-05, + "loss": 0.6043, + "step": 2238 + }, + { + "epoch": 1.4317582153993764, + "grad_norm": 2.162611246109009, + "learning_rate": 2.6180807864928403e-05, + "loss": 0.4513, + "step": 2239 + }, + { + "epoch": 1.432397857199968, + "grad_norm": 2.396352529525757, + "learning_rate": 2.6170121820901906e-05, + "loss": 0.504, + "step": 2240 + }, + { + "epoch": 1.4330374990005597, + "grad_norm": 2.499436378479004, + "learning_rate": 2.6159435776875402e-05, + "loss": 0.5159, + "step": 2241 + }, + { + "epoch": 1.4336771408011513, + "grad_norm": 2.3926210403442383, + "learning_rate": 2.6148749732848898e-05, + "loss": 0.5528, + "step": 2242 + }, + { + "epoch": 1.434316782601743, + "grad_norm": 2.022723913192749, + "learning_rate": 2.61380636888224e-05, + "loss": 0.461, + "step": 2243 + }, + { + "epoch": 1.4349564244023347, + "grad_norm": 2.447625160217285, + "learning_rate": 2.6127377644795896e-05, + "loss": 0.6057, + "step": 2244 + }, + { + "epoch": 1.4355960662029263, + "grad_norm": 2.4095206260681152, + "learning_rate": 2.6116691600769392e-05, + "loss": 0.5665, + "step": 2245 + }, + { + "epoch": 1.436235708003518, + "grad_norm": 2.1642065048217773, + "learning_rate": 2.6106005556742895e-05, + "loss": 0.4461, + "step": 2246 + }, + { + "epoch": 1.4368753498041098, + "grad_norm": 2.603368043899536, + "learning_rate": 2.6095319512716395e-05, + "loss": 0.5777, + "step": 2247 + }, + { + "epoch": 1.4375149916047014, + "grad_norm": 2.2961888313293457, + "learning_rate": 2.608463346868989e-05, + "loss": 0.5096, + "step": 2248 + }, + { + "epoch": 1.438154633405293, + "grad_norm": 2.879202127456665, + "learning_rate": 2.6073947424663393e-05, + "loss": 0.6394, + "step": 2249 + }, + { + "epoch": 1.4387942752058847, + "grad_norm": 2.4922022819519043, + "learning_rate": 2.606326138063689e-05, + "loss": 0.5425, + "step": 2250 + }, + { + "epoch": 1.4394339170064763, + "grad_norm": 2.512561798095703, + "learning_rate": 2.6052575336610392e-05, + "loss": 0.5895, + "step": 2251 + }, + { + "epoch": 1.440073558807068, + "grad_norm": 2.3080883026123047, + "learning_rate": 2.6041889292583888e-05, + "loss": 0.4951, + "step": 2252 + }, + { + "epoch": 1.4407132006076597, + "grad_norm": 2.3077144622802734, + "learning_rate": 2.6031203248557384e-05, + "loss": 0.5104, + "step": 2253 + }, + { + "epoch": 1.4413528424082513, + "grad_norm": 2.0974791049957275, + "learning_rate": 2.6020517204530887e-05, + "loss": 0.4529, + "step": 2254 + }, + { + "epoch": 1.4419924842088432, + "grad_norm": 2.2111384868621826, + "learning_rate": 2.6009831160504383e-05, + "loss": 0.5113, + "step": 2255 + 
}, + { + "epoch": 1.4426321260094346, + "grad_norm": 2.3056976795196533, + "learning_rate": 2.599914511647788e-05, + "loss": 0.5128, + "step": 2256 + }, + { + "epoch": 1.4432717678100264, + "grad_norm": 2.189061164855957, + "learning_rate": 2.598845907245138e-05, + "loss": 0.45, + "step": 2257 + }, + { + "epoch": 1.443911409610618, + "grad_norm": 2.0347671508789062, + "learning_rate": 2.5977773028424877e-05, + "loss": 0.4605, + "step": 2258 + }, + { + "epoch": 1.4445510514112097, + "grad_norm": 2.1968576908111572, + "learning_rate": 2.5967086984398377e-05, + "loss": 0.4658, + "step": 2259 + }, + { + "epoch": 1.4451906932118015, + "grad_norm": 2.511888027191162, + "learning_rate": 2.5956400940371876e-05, + "loss": 0.5669, + "step": 2260 + }, + { + "epoch": 1.445830335012393, + "grad_norm": 2.248568296432495, + "learning_rate": 2.5945714896345375e-05, + "loss": 0.4958, + "step": 2261 + }, + { + "epoch": 1.4464699768129847, + "grad_norm": 2.163270950317383, + "learning_rate": 2.593502885231887e-05, + "loss": 0.4576, + "step": 2262 + }, + { + "epoch": 1.4471096186135763, + "grad_norm": 2.094024896621704, + "learning_rate": 2.5924342808292374e-05, + "loss": 0.4509, + "step": 2263 + }, + { + "epoch": 1.447749260414168, + "grad_norm": 2.8308749198913574, + "learning_rate": 2.591365676426587e-05, + "loss": 0.5922, + "step": 2264 + }, + { + "epoch": 1.4483889022147598, + "grad_norm": 2.6014363765716553, + "learning_rate": 2.5902970720239366e-05, + "loss": 0.5073, + "step": 2265 + }, + { + "epoch": 1.4490285440153514, + "grad_norm": 2.258899688720703, + "learning_rate": 2.589228467621287e-05, + "loss": 0.5131, + "step": 2266 + }, + { + "epoch": 1.449668185815943, + "grad_norm": 2.1131486892700195, + "learning_rate": 2.5881598632186365e-05, + "loss": 0.4759, + "step": 2267 + }, + { + "epoch": 1.4503078276165349, + "grad_norm": 2.4243154525756836, + "learning_rate": 2.587091258815986e-05, + "loss": 0.541, + "step": 2268 + }, + { + "epoch": 1.4509474694171265, + "grad_norm": 2.215507984161377, + "learning_rate": 2.5860226544133364e-05, + "loss": 0.4902, + "step": 2269 + }, + { + "epoch": 1.451587111217718, + "grad_norm": 2.332162380218506, + "learning_rate": 2.5849540500106863e-05, + "loss": 0.507, + "step": 2270 + }, + { + "epoch": 1.4522267530183097, + "grad_norm": 2.3548402786254883, + "learning_rate": 2.583885445608036e-05, + "loss": 0.4851, + "step": 2271 + }, + { + "epoch": 1.4528663948189013, + "grad_norm": 2.058729887008667, + "learning_rate": 2.582816841205386e-05, + "loss": 0.4478, + "step": 2272 + }, + { + "epoch": 1.4535060366194932, + "grad_norm": 2.3999671936035156, + "learning_rate": 2.5817482368027358e-05, + "loss": 0.5128, + "step": 2273 + }, + { + "epoch": 1.4541456784200848, + "grad_norm": 2.3504152297973633, + "learning_rate": 2.5806796324000854e-05, + "loss": 0.5526, + "step": 2274 + }, + { + "epoch": 1.4547853202206764, + "grad_norm": 2.4086127281188965, + "learning_rate": 2.5796110279974356e-05, + "loss": 0.4998, + "step": 2275 + }, + { + "epoch": 1.455424962021268, + "grad_norm": 2.562897205352783, + "learning_rate": 2.5785424235947852e-05, + "loss": 0.546, + "step": 2276 + }, + { + "epoch": 1.4560646038218596, + "grad_norm": 2.3442258834838867, + "learning_rate": 2.5774738191921348e-05, + "loss": 0.4747, + "step": 2277 + }, + { + "epoch": 1.4567042456224515, + "grad_norm": 2.460014581680298, + "learning_rate": 2.576405214789485e-05, + "loss": 0.5114, + "step": 2278 + }, + { + "epoch": 1.457343887423043, + "grad_norm": 2.154784679412842, + "learning_rate": 
2.5753366103868347e-05, + "loss": 0.4355, + "step": 2279 + }, + { + "epoch": 1.4579835292236347, + "grad_norm": 1.9900468587875366, + "learning_rate": 2.574268005984185e-05, + "loss": 0.4314, + "step": 2280 + }, + { + "epoch": 1.4586231710242266, + "grad_norm": 2.4974210262298584, + "learning_rate": 2.5731994015815346e-05, + "loss": 0.5147, + "step": 2281 + }, + { + "epoch": 1.4592628128248182, + "grad_norm": 2.3759498596191406, + "learning_rate": 2.5721307971788845e-05, + "loss": 0.4576, + "step": 2282 + }, + { + "epoch": 1.4599024546254098, + "grad_norm": 2.3022379875183105, + "learning_rate": 2.5710621927762344e-05, + "loss": 0.4939, + "step": 2283 + }, + { + "epoch": 1.4605420964260014, + "grad_norm": 2.2871007919311523, + "learning_rate": 2.5699935883735844e-05, + "loss": 0.4867, + "step": 2284 + }, + { + "epoch": 1.461181738226593, + "grad_norm": 2.8387856483459473, + "learning_rate": 2.568924983970934e-05, + "loss": 0.6164, + "step": 2285 + }, + { + "epoch": 1.4618213800271849, + "grad_norm": 2.685382127761841, + "learning_rate": 2.5678563795682843e-05, + "loss": 0.5572, + "step": 2286 + }, + { + "epoch": 1.4624610218277765, + "grad_norm": 2.6641671657562256, + "learning_rate": 2.566787775165634e-05, + "loss": 0.5765, + "step": 2287 + }, + { + "epoch": 1.463100663628368, + "grad_norm": 2.3019654750823975, + "learning_rate": 2.5657191707629834e-05, + "loss": 0.49, + "step": 2288 + }, + { + "epoch": 1.4637403054289597, + "grad_norm": 2.3412725925445557, + "learning_rate": 2.5646505663603337e-05, + "loss": 0.4738, + "step": 2289 + }, + { + "epoch": 1.4643799472295513, + "grad_norm": 2.0882978439331055, + "learning_rate": 2.5635819619576833e-05, + "loss": 0.4569, + "step": 2290 + }, + { + "epoch": 1.4650195890301432, + "grad_norm": 2.3611629009246826, + "learning_rate": 2.562513357555033e-05, + "loss": 0.5354, + "step": 2291 + }, + { + "epoch": 1.4656592308307348, + "grad_norm": 2.6620607376098633, + "learning_rate": 2.5614447531523832e-05, + "loss": 0.5473, + "step": 2292 + }, + { + "epoch": 1.4662988726313264, + "grad_norm": 2.0741662979125977, + "learning_rate": 2.560376148749733e-05, + "loss": 0.4435, + "step": 2293 + }, + { + "epoch": 1.4669385144319183, + "grad_norm": 2.37638258934021, + "learning_rate": 2.5593075443470827e-05, + "loss": 0.5466, + "step": 2294 + }, + { + "epoch": 1.4675781562325099, + "grad_norm": 2.0607833862304688, + "learning_rate": 2.558238939944433e-05, + "loss": 0.4425, + "step": 2295 + }, + { + "epoch": 1.4682177980331015, + "grad_norm": 1.9675531387329102, + "learning_rate": 2.5571703355417826e-05, + "loss": 0.4611, + "step": 2296 + }, + { + "epoch": 1.468857439833693, + "grad_norm": 2.290937662124634, + "learning_rate": 2.5561017311391322e-05, + "loss": 0.5219, + "step": 2297 + }, + { + "epoch": 1.4694970816342847, + "grad_norm": 2.2166943550109863, + "learning_rate": 2.5550331267364825e-05, + "loss": 0.5278, + "step": 2298 + }, + { + "epoch": 1.4701367234348766, + "grad_norm": 2.4589569568634033, + "learning_rate": 2.553964522333832e-05, + "loss": 0.5253, + "step": 2299 + }, + { + "epoch": 1.4707763652354682, + "grad_norm": 2.170689105987549, + "learning_rate": 2.5528959179311817e-05, + "loss": 0.4899, + "step": 2300 + }, + { + "epoch": 1.4714160070360598, + "grad_norm": 2.7934813499450684, + "learning_rate": 2.551827313528532e-05, + "loss": 0.6217, + "step": 2301 + }, + { + "epoch": 1.4720556488366514, + "grad_norm": 2.2422893047332764, + "learning_rate": 2.5507587091258815e-05, + "loss": 0.5055, + "step": 2302 + }, + { + "epoch": 1.472695290637243, + 
"grad_norm": 2.328583240509033, + "learning_rate": 2.5496901047232315e-05, + "loss": 0.5058, + "step": 2303 + }, + { + "epoch": 1.4733349324378349, + "grad_norm": 2.06060528755188, + "learning_rate": 2.5486215003205814e-05, + "loss": 0.4975, + "step": 2304 + }, + { + "epoch": 1.4739745742384265, + "grad_norm": 1.9319361448287964, + "learning_rate": 2.5475528959179313e-05, + "loss": 0.4185, + "step": 2305 + }, + { + "epoch": 1.474614216039018, + "grad_norm": 1.9154685735702515, + "learning_rate": 2.546484291515281e-05, + "loss": 0.4282, + "step": 2306 + }, + { + "epoch": 1.47525385783961, + "grad_norm": 2.055645227432251, + "learning_rate": 2.5454156871126312e-05, + "loss": 0.4691, + "step": 2307 + }, + { + "epoch": 1.4758934996402016, + "grad_norm": 2.6475746631622314, + "learning_rate": 2.5443470827099808e-05, + "loss": 0.5837, + "step": 2308 + }, + { + "epoch": 1.4765331414407932, + "grad_norm": 2.161628484725952, + "learning_rate": 2.543278478307331e-05, + "loss": 0.4263, + "step": 2309 + }, + { + "epoch": 1.4771727832413848, + "grad_norm": 2.270693302154541, + "learning_rate": 2.5422098739046807e-05, + "loss": 0.5034, + "step": 2310 + }, + { + "epoch": 1.4778124250419764, + "grad_norm": 2.5863564014434814, + "learning_rate": 2.5411412695020303e-05, + "loss": 0.5412, + "step": 2311 + }, + { + "epoch": 1.4784520668425682, + "grad_norm": 2.4512579441070557, + "learning_rate": 2.5400726650993806e-05, + "loss": 0.5, + "step": 2312 + }, + { + "epoch": 1.4790917086431599, + "grad_norm": 2.654994249343872, + "learning_rate": 2.53900406069673e-05, + "loss": 0.563, + "step": 2313 + }, + { + "epoch": 1.4797313504437515, + "grad_norm": 2.3832905292510986, + "learning_rate": 2.5379354562940798e-05, + "loss": 0.4877, + "step": 2314 + }, + { + "epoch": 1.480370992244343, + "grad_norm": 2.3197555541992188, + "learning_rate": 2.53686685189143e-05, + "loss": 0.4872, + "step": 2315 + }, + { + "epoch": 1.4810106340449347, + "grad_norm": 2.1848902702331543, + "learning_rate": 2.53579824748878e-05, + "loss": 0.4878, + "step": 2316 + }, + { + "epoch": 1.4816502758455266, + "grad_norm": 2.4237613677978516, + "learning_rate": 2.5347296430861296e-05, + "loss": 0.5306, + "step": 2317 + }, + { + "epoch": 1.4822899176461182, + "grad_norm": 2.2572357654571533, + "learning_rate": 2.53366103868348e-05, + "loss": 0.468, + "step": 2318 + }, + { + "epoch": 1.4829295594467098, + "grad_norm": 2.74531888961792, + "learning_rate": 2.5325924342808294e-05, + "loss": 0.6163, + "step": 2319 + }, + { + "epoch": 1.4835692012473016, + "grad_norm": 2.414482355117798, + "learning_rate": 2.531523829878179e-05, + "loss": 0.4906, + "step": 2320 + }, + { + "epoch": 1.4842088430478932, + "grad_norm": 2.572911500930786, + "learning_rate": 2.5304552254755293e-05, + "loss": 0.5517, + "step": 2321 + }, + { + "epoch": 1.4848484848484849, + "grad_norm": 2.223451614379883, + "learning_rate": 2.529386621072879e-05, + "loss": 0.483, + "step": 2322 + }, + { + "epoch": 1.4854881266490765, + "grad_norm": 2.360363245010376, + "learning_rate": 2.5283180166702285e-05, + "loss": 0.5478, + "step": 2323 + }, + { + "epoch": 1.486127768449668, + "grad_norm": 2.163233995437622, + "learning_rate": 2.5272494122675788e-05, + "loss": 0.4739, + "step": 2324 + }, + { + "epoch": 1.48676741025026, + "grad_norm": 2.0591564178466797, + "learning_rate": 2.5261808078649284e-05, + "loss": 0.4739, + "step": 2325 + }, + { + "epoch": 1.4874070520508516, + "grad_norm": 2.2115094661712646, + "learning_rate": 2.5251122034622783e-05, + "loss": 0.4749, + "step": 2326 + }, + { + 
"epoch": 1.4880466938514432, + "grad_norm": 2.305901288986206, + "learning_rate": 2.5240435990596282e-05, + "loss": 0.5349, + "step": 2327 + }, + { + "epoch": 1.4886863356520348, + "grad_norm": 2.492730140686035, + "learning_rate": 2.5229749946569782e-05, + "loss": 0.5528, + "step": 2328 + }, + { + "epoch": 1.4893259774526264, + "grad_norm": 2.506945848464966, + "learning_rate": 2.5219063902543278e-05, + "loss": 0.594, + "step": 2329 + }, + { + "epoch": 1.4899656192532182, + "grad_norm": 2.1096086502075195, + "learning_rate": 2.520837785851678e-05, + "loss": 0.4492, + "step": 2330 + }, + { + "epoch": 1.4906052610538099, + "grad_norm": 2.1972506046295166, + "learning_rate": 2.5197691814490277e-05, + "loss": 0.5063, + "step": 2331 + }, + { + "epoch": 1.4912449028544015, + "grad_norm": 2.3153460025787354, + "learning_rate": 2.5187005770463773e-05, + "loss": 0.5166, + "step": 2332 + }, + { + "epoch": 1.4918845446549933, + "grad_norm": 2.286513566970825, + "learning_rate": 2.5176319726437275e-05, + "loss": 0.5647, + "step": 2333 + }, + { + "epoch": 1.492524186455585, + "grad_norm": 2.254351854324341, + "learning_rate": 2.516563368241077e-05, + "loss": 0.5644, + "step": 2334 + }, + { + "epoch": 1.4931638282561766, + "grad_norm": 2.4628243446350098, + "learning_rate": 2.5154947638384267e-05, + "loss": 0.566, + "step": 2335 + }, + { + "epoch": 1.4938034700567682, + "grad_norm": 2.4899988174438477, + "learning_rate": 2.514426159435777e-05, + "loss": 0.5532, + "step": 2336 + }, + { + "epoch": 1.4944431118573598, + "grad_norm": 2.2579925060272217, + "learning_rate": 2.5133575550331266e-05, + "loss": 0.4957, + "step": 2337 + }, + { + "epoch": 1.4950827536579516, + "grad_norm": 2.414663314819336, + "learning_rate": 2.512288950630477e-05, + "loss": 0.5331, + "step": 2338 + }, + { + "epoch": 1.4957223954585432, + "grad_norm": 2.4178664684295654, + "learning_rate": 2.5112203462278268e-05, + "loss": 0.5849, + "step": 2339 + }, + { + "epoch": 1.4963620372591349, + "grad_norm": 2.6198863983154297, + "learning_rate": 2.5101517418251764e-05, + "loss": 0.5785, + "step": 2340 + }, + { + "epoch": 1.4970016790597265, + "grad_norm": 2.1563780307769775, + "learning_rate": 2.5090831374225267e-05, + "loss": 0.4707, + "step": 2341 + }, + { + "epoch": 1.497641320860318, + "grad_norm": 2.6375300884246826, + "learning_rate": 2.5080145330198763e-05, + "loss": 0.6339, + "step": 2342 + }, + { + "epoch": 1.49828096266091, + "grad_norm": 2.2239177227020264, + "learning_rate": 2.506945928617226e-05, + "loss": 0.5054, + "step": 2343 + }, + { + "epoch": 1.4989206044615015, + "grad_norm": 2.573611259460449, + "learning_rate": 2.505877324214576e-05, + "loss": 0.5459, + "step": 2344 + }, + { + "epoch": 1.4995602462620932, + "grad_norm": 2.5579721927642822, + "learning_rate": 2.5048087198119257e-05, + "loss": 0.6102, + "step": 2345 + }, + { + "epoch": 1.500199888062685, + "grad_norm": 2.1255881786346436, + "learning_rate": 2.5037401154092753e-05, + "loss": 0.4868, + "step": 2346 + }, + { + "epoch": 1.5008395298632764, + "grad_norm": 1.931162714958191, + "learning_rate": 2.5026715110066256e-05, + "loss": 0.4776, + "step": 2347 + }, + { + "epoch": 1.5014791716638682, + "grad_norm": 2.4322550296783447, + "learning_rate": 2.5016029066039752e-05, + "loss": 0.5074, + "step": 2348 + }, + { + "epoch": 1.5021188134644599, + "grad_norm": 2.068201780319214, + "learning_rate": 2.500534302201325e-05, + "loss": 0.4391, + "step": 2349 + }, + { + "epoch": 1.5027584552650515, + "grad_norm": 2.310361862182617, + "learning_rate": 2.499465697798675e-05, 
+ "loss": 0.5295, + "step": 2350 + }, + { + "epoch": 1.5033980970656433, + "grad_norm": 2.4590678215026855, + "learning_rate": 2.498397093396025e-05, + "loss": 0.5207, + "step": 2351 + }, + { + "epoch": 1.504037738866235, + "grad_norm": 1.9980255365371704, + "learning_rate": 2.497328488993375e-05, + "loss": 0.4232, + "step": 2352 + }, + { + "epoch": 1.5046773806668265, + "grad_norm": 2.018057107925415, + "learning_rate": 2.4962598845907246e-05, + "loss": 0.4972, + "step": 2353 + }, + { + "epoch": 1.5053170224674184, + "grad_norm": 2.4456212520599365, + "learning_rate": 2.4951912801880745e-05, + "loss": 0.5755, + "step": 2354 + }, + { + "epoch": 1.5059566642680098, + "grad_norm": 2.4807541370391846, + "learning_rate": 2.4941226757854244e-05, + "loss": 0.5315, + "step": 2355 + }, + { + "epoch": 1.5065963060686016, + "grad_norm": 2.744309425354004, + "learning_rate": 2.493054071382774e-05, + "loss": 0.586, + "step": 2356 + }, + { + "epoch": 1.5072359478691932, + "grad_norm": 2.254199981689453, + "learning_rate": 2.491985466980124e-05, + "loss": 0.4765, + "step": 2357 + }, + { + "epoch": 1.5078755896697849, + "grad_norm": 2.388324022293091, + "learning_rate": 2.490916862577474e-05, + "loss": 0.522, + "step": 2358 + }, + { + "epoch": 1.5085152314703767, + "grad_norm": 2.601102113723755, + "learning_rate": 2.489848258174824e-05, + "loss": 0.5713, + "step": 2359 + }, + { + "epoch": 1.5091548732709683, + "grad_norm": 2.91523814201355, + "learning_rate": 2.4887796537721734e-05, + "loss": 0.6079, + "step": 2360 + }, + { + "epoch": 1.50979451507156, + "grad_norm": 2.6223254203796387, + "learning_rate": 2.4877110493695234e-05, + "loss": 0.5552, + "step": 2361 + }, + { + "epoch": 1.5104341568721515, + "grad_norm": 2.5035362243652344, + "learning_rate": 2.4866424449668736e-05, + "loss": 0.5441, + "step": 2362 + }, + { + "epoch": 1.5110737986727432, + "grad_norm": 2.1271495819091797, + "learning_rate": 2.4855738405642232e-05, + "loss": 0.4628, + "step": 2363 + }, + { + "epoch": 1.511713440473335, + "grad_norm": 2.455970287322998, + "learning_rate": 2.4845052361615732e-05, + "loss": 0.5418, + "step": 2364 + }, + { + "epoch": 1.5123530822739266, + "grad_norm": 2.3929998874664307, + "learning_rate": 2.483436631758923e-05, + "loss": 0.5146, + "step": 2365 + }, + { + "epoch": 1.5129927240745182, + "grad_norm": 2.2422893047332764, + "learning_rate": 2.4823680273562727e-05, + "loss": 0.4333, + "step": 2366 + }, + { + "epoch": 1.51363236587511, + "grad_norm": 2.44172739982605, + "learning_rate": 2.4812994229536226e-05, + "loss": 0.5739, + "step": 2367 + }, + { + "epoch": 1.5142720076757015, + "grad_norm": 2.661695718765259, + "learning_rate": 2.4802308185509726e-05, + "loss": 0.5704, + "step": 2368 + }, + { + "epoch": 1.5149116494762933, + "grad_norm": 2.466061592102051, + "learning_rate": 2.4791622141483222e-05, + "loss": 0.5118, + "step": 2369 + }, + { + "epoch": 1.515551291276885, + "grad_norm": 2.4462573528289795, + "learning_rate": 2.478093609745672e-05, + "loss": 0.521, + "step": 2370 + }, + { + "epoch": 1.5161909330774765, + "grad_norm": 2.2767608165740967, + "learning_rate": 2.477025005343022e-05, + "loss": 0.4889, + "step": 2371 + }, + { + "epoch": 1.5168305748780684, + "grad_norm": 2.4078407287597656, + "learning_rate": 2.475956400940372e-05, + "loss": 0.5219, + "step": 2372 + }, + { + "epoch": 1.51747021667866, + "grad_norm": 2.4542019367218018, + "learning_rate": 2.474887796537722e-05, + "loss": 0.549, + "step": 2373 + }, + { + "epoch": 1.5181098584792516, + "grad_norm": 1.9831424951553345, + 
"learning_rate": 2.473819192135072e-05, + "loss": 0.4752, + "step": 2374 + }, + { + "epoch": 1.5187495002798432, + "grad_norm": 2.366851568222046, + "learning_rate": 2.4727505877324218e-05, + "loss": 0.528, + "step": 2375 + }, + { + "epoch": 1.5193891420804349, + "grad_norm": 2.438781261444092, + "learning_rate": 2.4716819833297714e-05, + "loss": 0.5119, + "step": 2376 + }, + { + "epoch": 1.5200287838810267, + "grad_norm": 2.2130415439605713, + "learning_rate": 2.4706133789271213e-05, + "loss": 0.4586, + "step": 2377 + }, + { + "epoch": 1.5206684256816183, + "grad_norm": 2.1566548347473145, + "learning_rate": 2.4695447745244713e-05, + "loss": 0.4803, + "step": 2378 + }, + { + "epoch": 1.52130806748221, + "grad_norm": 2.7691001892089844, + "learning_rate": 2.468476170121821e-05, + "loss": 0.5838, + "step": 2379 + }, + { + "epoch": 1.5219477092828018, + "grad_norm": 2.4336585998535156, + "learning_rate": 2.4674075657191708e-05, + "loss": 0.5288, + "step": 2380 + }, + { + "epoch": 1.5225873510833932, + "grad_norm": 2.7915258407592773, + "learning_rate": 2.4663389613165207e-05, + "loss": 0.6181, + "step": 2381 + }, + { + "epoch": 1.523226992883985, + "grad_norm": 2.3237874507904053, + "learning_rate": 2.4652703569138703e-05, + "loss": 0.5097, + "step": 2382 + }, + { + "epoch": 1.5238666346845766, + "grad_norm": 2.3393502235412598, + "learning_rate": 2.4642017525112203e-05, + "loss": 0.4951, + "step": 2383 + }, + { + "epoch": 1.5245062764851682, + "grad_norm": 1.9856390953063965, + "learning_rate": 2.4631331481085702e-05, + "loss": 0.4398, + "step": 2384 + }, + { + "epoch": 1.52514591828576, + "grad_norm": 2.613715410232544, + "learning_rate": 2.4620645437059205e-05, + "loss": 0.5613, + "step": 2385 + }, + { + "epoch": 1.5257855600863517, + "grad_norm": 2.336085796356201, + "learning_rate": 2.46099593930327e-05, + "loss": 0.5135, + "step": 2386 + }, + { + "epoch": 1.5264252018869433, + "grad_norm": 2.4587197303771973, + "learning_rate": 2.45992733490062e-05, + "loss": 0.534, + "step": 2387 + }, + { + "epoch": 1.5270648436875351, + "grad_norm": 2.4791629314422607, + "learning_rate": 2.45885873049797e-05, + "loss": 0.5948, + "step": 2388 + }, + { + "epoch": 1.5277044854881265, + "grad_norm": 2.2141366004943848, + "learning_rate": 2.4577901260953196e-05, + "loss": 0.4922, + "step": 2389 + }, + { + "epoch": 1.5283441272887184, + "grad_norm": 2.0014305114746094, + "learning_rate": 2.4567215216926695e-05, + "loss": 0.4436, + "step": 2390 + }, + { + "epoch": 1.52898376908931, + "grad_norm": 2.351747751235962, + "learning_rate": 2.4556529172900194e-05, + "loss": 0.5386, + "step": 2391 + }, + { + "epoch": 1.5296234108899016, + "grad_norm": 2.6862196922302246, + "learning_rate": 2.454584312887369e-05, + "loss": 0.5773, + "step": 2392 + }, + { + "epoch": 1.5302630526904935, + "grad_norm": 2.5580666065216064, + "learning_rate": 2.453515708484719e-05, + "loss": 0.5424, + "step": 2393 + }, + { + "epoch": 1.5309026944910848, + "grad_norm": 2.2380003929138184, + "learning_rate": 2.452447104082069e-05, + "loss": 0.47, + "step": 2394 + }, + { + "epoch": 1.5315423362916767, + "grad_norm": 2.3826396465301514, + "learning_rate": 2.4513784996794188e-05, + "loss": 0.464, + "step": 2395 + }, + { + "epoch": 1.5321819780922683, + "grad_norm": 2.5672996044158936, + "learning_rate": 2.4503098952767688e-05, + "loss": 0.6625, + "step": 2396 + }, + { + "epoch": 1.53282161989286, + "grad_norm": 2.3602516651153564, + "learning_rate": 2.4492412908741187e-05, + "loss": 0.5199, + "step": 2397 + }, + { + "epoch": 
1.5334612616934518, + "grad_norm": 2.279737949371338, + "learning_rate": 2.4481726864714683e-05, + "loss": 0.526, + "step": 2398 + }, + { + "epoch": 1.5341009034940434, + "grad_norm": 2.4020626544952393, + "learning_rate": 2.4471040820688182e-05, + "loss": 0.5661, + "step": 2399 + }, + { + "epoch": 1.534740545294635, + "grad_norm": 2.4102251529693604, + "learning_rate": 2.4460354776661682e-05, + "loss": 0.4876, + "step": 2400 + }, + { + "epoch": 1.5353801870952268, + "grad_norm": 2.311112403869629, + "learning_rate": 2.444966873263518e-05, + "loss": 0.5119, + "step": 2401 + }, + { + "epoch": 1.5360198288958182, + "grad_norm": 2.345923662185669, + "learning_rate": 2.4438982688608677e-05, + "loss": 0.5181, + "step": 2402 + }, + { + "epoch": 1.53665947069641, + "grad_norm": 2.7882421016693115, + "learning_rate": 2.4428296644582176e-05, + "loss": 0.5897, + "step": 2403 + }, + { + "epoch": 1.5372991124970017, + "grad_norm": 2.60638427734375, + "learning_rate": 2.4417610600555676e-05, + "loss": 0.535, + "step": 2404 + }, + { + "epoch": 1.5379387542975933, + "grad_norm": 2.110541582107544, + "learning_rate": 2.4406924556529172e-05, + "loss": 0.4471, + "step": 2405 + }, + { + "epoch": 1.5385783960981851, + "grad_norm": 2.3438408374786377, + "learning_rate": 2.439623851250267e-05, + "loss": 0.5193, + "step": 2406 + }, + { + "epoch": 1.5392180378987765, + "grad_norm": 2.492509126663208, + "learning_rate": 2.438555246847617e-05, + "loss": 0.5633, + "step": 2407 + }, + { + "epoch": 1.5398576796993684, + "grad_norm": 2.2724616527557373, + "learning_rate": 2.437486642444967e-05, + "loss": 0.486, + "step": 2408 + }, + { + "epoch": 1.54049732149996, + "grad_norm": 2.726543426513672, + "learning_rate": 2.436418038042317e-05, + "loss": 0.6241, + "step": 2409 + }, + { + "epoch": 1.5411369633005516, + "grad_norm": 2.4403836727142334, + "learning_rate": 2.435349433639667e-05, + "loss": 0.5459, + "step": 2410 + }, + { + "epoch": 1.5417766051011434, + "grad_norm": 2.583202362060547, + "learning_rate": 2.4342808292370165e-05, + "loss": 0.5753, + "step": 2411 + }, + { + "epoch": 1.542416246901735, + "grad_norm": 2.208462715148926, + "learning_rate": 2.4332122248343664e-05, + "loss": 0.4685, + "step": 2412 + }, + { + "epoch": 1.5430558887023267, + "grad_norm": 2.230743169784546, + "learning_rate": 2.4321436204317163e-05, + "loss": 0.5163, + "step": 2413 + }, + { + "epoch": 1.5436955305029185, + "grad_norm": 2.3793694972991943, + "learning_rate": 2.4310750160290663e-05, + "loss": 0.4592, + "step": 2414 + }, + { + "epoch": 1.54433517230351, + "grad_norm": 2.219308376312256, + "learning_rate": 2.430006411626416e-05, + "loss": 0.5265, + "step": 2415 + }, + { + "epoch": 1.5449748141041018, + "grad_norm": 2.5354859828948975, + "learning_rate": 2.4289378072237658e-05, + "loss": 0.4937, + "step": 2416 + }, + { + "epoch": 1.5456144559046934, + "grad_norm": 2.6889488697052, + "learning_rate": 2.4278692028211157e-05, + "loss": 0.5672, + "step": 2417 + }, + { + "epoch": 1.546254097705285, + "grad_norm": 2.38702130317688, + "learning_rate": 2.4268005984184657e-05, + "loss": 0.5382, + "step": 2418 + }, + { + "epoch": 1.5468937395058768, + "grad_norm": 2.2605645656585693, + "learning_rate": 2.4257319940158156e-05, + "loss": 0.5044, + "step": 2419 + }, + { + "epoch": 1.5475333813064682, + "grad_norm": 2.3581440448760986, + "learning_rate": 2.4246633896131655e-05, + "loss": 0.5478, + "step": 2420 + }, + { + "epoch": 1.54817302310706, + "grad_norm": 2.385565996170044, + "learning_rate": 2.423594785210515e-05, + "loss": 0.5248, + 
"step": 2421 + }, + { + "epoch": 1.5488126649076517, + "grad_norm": 2.7944037914276123, + "learning_rate": 2.422526180807865e-05, + "loss": 0.5119, + "step": 2422 + }, + { + "epoch": 1.5494523067082433, + "grad_norm": 2.3456242084503174, + "learning_rate": 2.421457576405215e-05, + "loss": 0.5525, + "step": 2423 + }, + { + "epoch": 1.5500919485088351, + "grad_norm": 2.555832624435425, + "learning_rate": 2.4203889720025646e-05, + "loss": 0.5438, + "step": 2424 + }, + { + "epoch": 1.5507315903094268, + "grad_norm": 2.172679901123047, + "learning_rate": 2.4193203675999145e-05, + "loss": 0.5005, + "step": 2425 + }, + { + "epoch": 1.5513712321100184, + "grad_norm": 2.465785264968872, + "learning_rate": 2.4182517631972645e-05, + "loss": 0.5269, + "step": 2426 + }, + { + "epoch": 1.5520108739106102, + "grad_norm": 2.101972818374634, + "learning_rate": 2.417183158794614e-05, + "loss": 0.4562, + "step": 2427 + }, + { + "epoch": 1.5526505157112016, + "grad_norm": 2.3771657943725586, + "learning_rate": 2.416114554391964e-05, + "loss": 0.5279, + "step": 2428 + }, + { + "epoch": 1.5532901575117934, + "grad_norm": 2.1550076007843018, + "learning_rate": 2.415045949989314e-05, + "loss": 0.4694, + "step": 2429 + }, + { + "epoch": 1.553929799312385, + "grad_norm": 2.417736053466797, + "learning_rate": 2.413977345586664e-05, + "loss": 0.5019, + "step": 2430 + }, + { + "epoch": 1.5545694411129767, + "grad_norm": 2.389709234237671, + "learning_rate": 2.4129087411840138e-05, + "loss": 0.5168, + "step": 2431 + }, + { + "epoch": 1.5552090829135685, + "grad_norm": 2.6814632415771484, + "learning_rate": 2.4118401367813638e-05, + "loss": 0.6498, + "step": 2432 + }, + { + "epoch": 1.55584872471416, + "grad_norm": 2.4444580078125, + "learning_rate": 2.4107715323787137e-05, + "loss": 0.5677, + "step": 2433 + }, + { + "epoch": 1.5564883665147518, + "grad_norm": 2.2038381099700928, + "learning_rate": 2.4097029279760633e-05, + "loss": 0.5135, + "step": 2434 + }, + { + "epoch": 1.5571280083153434, + "grad_norm": 2.348320245742798, + "learning_rate": 2.4086343235734132e-05, + "loss": 0.5614, + "step": 2435 + }, + { + "epoch": 1.557767650115935, + "grad_norm": 2.051631212234497, + "learning_rate": 2.407565719170763e-05, + "loss": 0.4845, + "step": 2436 + }, + { + "epoch": 1.5584072919165268, + "grad_norm": 2.6718955039978027, + "learning_rate": 2.4064971147681128e-05, + "loss": 0.6259, + "step": 2437 + }, + { + "epoch": 1.5590469337171184, + "grad_norm": 2.91152286529541, + "learning_rate": 2.4054285103654627e-05, + "loss": 0.6573, + "step": 2438 + }, + { + "epoch": 1.55968657551771, + "grad_norm": 2.3460750579833984, + "learning_rate": 2.4043599059628126e-05, + "loss": 0.5212, + "step": 2439 + }, + { + "epoch": 1.560326217318302, + "grad_norm": 2.128023862838745, + "learning_rate": 2.4032913015601626e-05, + "loss": 0.4552, + "step": 2440 + }, + { + "epoch": 1.5609658591188933, + "grad_norm": 2.336611747741699, + "learning_rate": 2.4022226971575125e-05, + "loss": 0.5525, + "step": 2441 + }, + { + "epoch": 1.5616055009194851, + "grad_norm": 2.3026328086853027, + "learning_rate": 2.4011540927548624e-05, + "loss": 0.5435, + "step": 2442 + }, + { + "epoch": 1.5622451427200768, + "grad_norm": 2.2032392024993896, + "learning_rate": 2.4000854883522124e-05, + "loss": 0.4847, + "step": 2443 + }, + { + "epoch": 1.5628847845206684, + "grad_norm": 2.166548490524292, + "learning_rate": 2.399016883949562e-05, + "loss": 0.4893, + "step": 2444 + }, + { + "epoch": 1.5635244263212602, + "grad_norm": 2.324126720428467, + "learning_rate": 
2.397948279546912e-05, + "loss": 0.4876, + "step": 2445 + }, + { + "epoch": 1.5641640681218516, + "grad_norm": 2.2490391731262207, + "learning_rate": 2.396879675144262e-05, + "loss": 0.5261, + "step": 2446 + }, + { + "epoch": 1.5648037099224434, + "grad_norm": 2.0372722148895264, + "learning_rate": 2.3958110707416114e-05, + "loss": 0.4597, + "step": 2447 + }, + { + "epoch": 1.565443351723035, + "grad_norm": 2.200812578201294, + "learning_rate": 2.3947424663389614e-05, + "loss": 0.4848, + "step": 2448 + }, + { + "epoch": 1.5660829935236267, + "grad_norm": 2.1502532958984375, + "learning_rate": 2.3936738619363113e-05, + "loss": 0.4964, + "step": 2449 + }, + { + "epoch": 1.5667226353242185, + "grad_norm": 2.05069899559021, + "learning_rate": 2.392605257533661e-05, + "loss": 0.4847, + "step": 2450 + }, + { + "epoch": 1.5673622771248101, + "grad_norm": 2.511824131011963, + "learning_rate": 2.391536653131011e-05, + "loss": 0.5475, + "step": 2451 + }, + { + "epoch": 1.5680019189254017, + "grad_norm": 2.2022783756256104, + "learning_rate": 2.3904680487283608e-05, + "loss": 0.5098, + "step": 2452 + }, + { + "epoch": 1.5686415607259936, + "grad_norm": 2.268644332885742, + "learning_rate": 2.3893994443257107e-05, + "loss": 0.4785, + "step": 2453 + }, + { + "epoch": 1.569281202526585, + "grad_norm": 2.7114834785461426, + "learning_rate": 2.3883308399230607e-05, + "loss": 0.5204, + "step": 2454 + }, + { + "epoch": 1.5699208443271768, + "grad_norm": 2.5339622497558594, + "learning_rate": 2.3872622355204106e-05, + "loss": 0.5187, + "step": 2455 + }, + { + "epoch": 1.5705604861277684, + "grad_norm": 2.329665422439575, + "learning_rate": 2.3861936311177605e-05, + "loss": 0.4988, + "step": 2456 + }, + { + "epoch": 1.57120012792836, + "grad_norm": 2.543318033218384, + "learning_rate": 2.38512502671511e-05, + "loss": 0.524, + "step": 2457 + }, + { + "epoch": 1.571839769728952, + "grad_norm": 2.1351418495178223, + "learning_rate": 2.38405642231246e-05, + "loss": 0.4507, + "step": 2458 + }, + { + "epoch": 1.5724794115295433, + "grad_norm": 2.34116792678833, + "learning_rate": 2.38298781790981e-05, + "loss": 0.4895, + "step": 2459 + }, + { + "epoch": 1.5731190533301351, + "grad_norm": 2.2495195865631104, + "learning_rate": 2.3819192135071596e-05, + "loss": 0.4784, + "step": 2460 + }, + { + "epoch": 1.5737586951307267, + "grad_norm": 2.2945520877838135, + "learning_rate": 2.3808506091045095e-05, + "loss": 0.4666, + "step": 2461 + }, + { + "epoch": 1.5743983369313184, + "grad_norm": 2.7183754444122314, + "learning_rate": 2.3797820047018595e-05, + "loss": 0.6223, + "step": 2462 + }, + { + "epoch": 1.5750379787319102, + "grad_norm": 2.485534191131592, + "learning_rate": 2.3787134002992094e-05, + "loss": 0.5399, + "step": 2463 + }, + { + "epoch": 1.5756776205325018, + "grad_norm": 2.1604371070861816, + "learning_rate": 2.3776447958965593e-05, + "loss": 0.4515, + "step": 2464 + }, + { + "epoch": 1.5763172623330934, + "grad_norm": 2.5859181880950928, + "learning_rate": 2.3765761914939093e-05, + "loss": 0.5557, + "step": 2465 + }, + { + "epoch": 1.5769569041336853, + "grad_norm": 2.3516976833343506, + "learning_rate": 2.375507587091259e-05, + "loss": 0.5024, + "step": 2466 + }, + { + "epoch": 1.5775965459342767, + "grad_norm": 2.5704262256622314, + "learning_rate": 2.3744389826886088e-05, + "loss": 0.5738, + "step": 2467 + }, + { + "epoch": 1.5782361877348685, + "grad_norm": 2.746694564819336, + "learning_rate": 2.3733703782859587e-05, + "loss": 0.569, + "step": 2468 + }, + { + "epoch": 1.5788758295354601, + 
"grad_norm": 2.1925268173217773, + "learning_rate": 2.3723017738833083e-05, + "loss": 0.5233, + "step": 2469 + }, + { + "epoch": 1.5795154713360517, + "grad_norm": 2.1631176471710205, + "learning_rate": 2.3712331694806583e-05, + "loss": 0.4705, + "step": 2470 + }, + { + "epoch": 1.5801551131366436, + "grad_norm": 2.339540958404541, + "learning_rate": 2.3701645650780082e-05, + "loss": 0.4981, + "step": 2471 + }, + { + "epoch": 1.5807947549372352, + "grad_norm": 2.373715400695801, + "learning_rate": 2.369095960675358e-05, + "loss": 0.4896, + "step": 2472 + }, + { + "epoch": 1.5814343967378268, + "grad_norm": 2.7771546840667725, + "learning_rate": 2.3680273562727078e-05, + "loss": 0.5849, + "step": 2473 + }, + { + "epoch": 1.5820740385384184, + "grad_norm": 2.1099908351898193, + "learning_rate": 2.3669587518700577e-05, + "loss": 0.5005, + "step": 2474 + }, + { + "epoch": 1.58271368033901, + "grad_norm": 2.2248928546905518, + "learning_rate": 2.3658901474674076e-05, + "loss": 0.4991, + "step": 2475 + }, + { + "epoch": 1.583353322139602, + "grad_norm": 1.9041866064071655, + "learning_rate": 2.3648215430647576e-05, + "loss": 0.4407, + "step": 2476 + }, + { + "epoch": 1.5839929639401935, + "grad_norm": 2.2991416454315186, + "learning_rate": 2.3637529386621075e-05, + "loss": 0.5053, + "step": 2477 + }, + { + "epoch": 1.5846326057407851, + "grad_norm": 2.454655170440674, + "learning_rate": 2.3626843342594574e-05, + "loss": 0.5965, + "step": 2478 + }, + { + "epoch": 1.585272247541377, + "grad_norm": 2.389505386352539, + "learning_rate": 2.361615729856807e-05, + "loss": 0.4951, + "step": 2479 + }, + { + "epoch": 1.5859118893419684, + "grad_norm": 2.505373239517212, + "learning_rate": 2.360547125454157e-05, + "loss": 0.5485, + "step": 2480 + }, + { + "epoch": 1.5865515311425602, + "grad_norm": 2.362473726272583, + "learning_rate": 2.359478521051507e-05, + "loss": 0.5577, + "step": 2481 + }, + { + "epoch": 1.5871911729431518, + "grad_norm": 2.4360294342041016, + "learning_rate": 2.3584099166488565e-05, + "loss": 0.5513, + "step": 2482 + }, + { + "epoch": 1.5878308147437434, + "grad_norm": 2.365406036376953, + "learning_rate": 2.3573413122462064e-05, + "loss": 0.5373, + "step": 2483 + }, + { + "epoch": 1.5884704565443353, + "grad_norm": 2.66648530960083, + "learning_rate": 2.3562727078435564e-05, + "loss": 0.5579, + "step": 2484 + }, + { + "epoch": 1.589110098344927, + "grad_norm": 2.3460564613342285, + "learning_rate": 2.3552041034409063e-05, + "loss": 0.5011, + "step": 2485 + }, + { + "epoch": 1.5897497401455185, + "grad_norm": 2.4095308780670166, + "learning_rate": 2.3541354990382562e-05, + "loss": 0.4704, + "step": 2486 + }, + { + "epoch": 1.5903893819461101, + "grad_norm": 2.5295562744140625, + "learning_rate": 2.3530668946356062e-05, + "loss": 0.5096, + "step": 2487 + }, + { + "epoch": 1.5910290237467017, + "grad_norm": 2.5730931758880615, + "learning_rate": 2.351998290232956e-05, + "loss": 0.6036, + "step": 2488 + }, + { + "epoch": 1.5916686655472936, + "grad_norm": 2.3112635612487793, + "learning_rate": 2.3509296858303057e-05, + "loss": 0.5241, + "step": 2489 + }, + { + "epoch": 1.5923083073478852, + "grad_norm": 2.616288661956787, + "learning_rate": 2.3498610814276557e-05, + "loss": 0.576, + "step": 2490 + }, + { + "epoch": 1.5929479491484768, + "grad_norm": 2.232785701751709, + "learning_rate": 2.3487924770250056e-05, + "loss": 0.4771, + "step": 2491 + }, + { + "epoch": 1.5935875909490687, + "grad_norm": 2.0340754985809326, + "learning_rate": 2.3477238726223552e-05, + "loss": 0.4216, + "step": 
2492 + }, + { + "epoch": 1.59422723274966, + "grad_norm": 2.366130828857422, + "learning_rate": 2.346655268219705e-05, + "loss": 0.5477, + "step": 2493 + }, + { + "epoch": 1.594866874550252, + "grad_norm": 2.0449624061584473, + "learning_rate": 2.345586663817055e-05, + "loss": 0.4029, + "step": 2494 + }, + { + "epoch": 1.5955065163508435, + "grad_norm": 2.3952925205230713, + "learning_rate": 2.3445180594144047e-05, + "loss": 0.529, + "step": 2495 + }, + { + "epoch": 1.5961461581514351, + "grad_norm": 2.255967140197754, + "learning_rate": 2.3434494550117546e-05, + "loss": 0.4837, + "step": 2496 + }, + { + "epoch": 1.596785799952027, + "grad_norm": 2.468334913253784, + "learning_rate": 2.3423808506091045e-05, + "loss": 0.5097, + "step": 2497 + }, + { + "epoch": 1.5974254417526186, + "grad_norm": 2.680081605911255, + "learning_rate": 2.3413122462064545e-05, + "loss": 0.5233, + "step": 2498 + }, + { + "epoch": 1.5980650835532102, + "grad_norm": 2.3744771480560303, + "learning_rate": 2.3402436418038044e-05, + "loss": 0.4636, + "step": 2499 + }, + { + "epoch": 1.598704725353802, + "grad_norm": 2.4233438968658447, + "learning_rate": 2.3391750374011543e-05, + "loss": 0.5244, + "step": 2500 + }, + { + "epoch": 1.5993443671543934, + "grad_norm": 2.5019216537475586, + "learning_rate": 2.3381064329985043e-05, + "loss": 0.5743, + "step": 2501 + }, + { + "epoch": 1.5999840089549853, + "grad_norm": 2.3580856323242188, + "learning_rate": 2.337037828595854e-05, + "loss": 0.4786, + "step": 2502 + }, + { + "epoch": 1.6006236507555769, + "grad_norm": 2.263636350631714, + "learning_rate": 2.3359692241932038e-05, + "loss": 0.483, + "step": 2503 + }, + { + "epoch": 1.6012632925561685, + "grad_norm": 2.3565735816955566, + "learning_rate": 2.3349006197905537e-05, + "loss": 0.5169, + "step": 2504 + }, + { + "epoch": 1.6019029343567603, + "grad_norm": 2.528686761856079, + "learning_rate": 2.3338320153879033e-05, + "loss": 0.5606, + "step": 2505 + }, + { + "epoch": 1.6025425761573517, + "grad_norm": 2.5530025959014893, + "learning_rate": 2.3327634109852533e-05, + "loss": 0.5467, + "step": 2506 + }, + { + "epoch": 1.6031822179579436, + "grad_norm": 2.213629722595215, + "learning_rate": 2.3316948065826032e-05, + "loss": 0.4782, + "step": 2507 + }, + { + "epoch": 1.6038218597585352, + "grad_norm": 2.1079697608947754, + "learning_rate": 2.3306262021799528e-05, + "loss": 0.4336, + "step": 2508 + }, + { + "epoch": 1.6044615015591268, + "grad_norm": 2.2900006771087646, + "learning_rate": 2.329557597777303e-05, + "loss": 0.5071, + "step": 2509 + }, + { + "epoch": 1.6051011433597187, + "grad_norm": 2.401519298553467, + "learning_rate": 2.328488993374653e-05, + "loss": 0.5066, + "step": 2510 + }, + { + "epoch": 1.6057407851603103, + "grad_norm": 2.4686508178710938, + "learning_rate": 2.3274203889720026e-05, + "loss": 0.5248, + "step": 2511 + }, + { + "epoch": 1.6063804269609019, + "grad_norm": 2.6671738624572754, + "learning_rate": 2.3263517845693526e-05, + "loss": 0.6211, + "step": 2512 + }, + { + "epoch": 1.6070200687614937, + "grad_norm": 2.3263304233551025, + "learning_rate": 2.3252831801667025e-05, + "loss": 0.5074, + "step": 2513 + }, + { + "epoch": 1.6076597105620851, + "grad_norm": 2.212210178375244, + "learning_rate": 2.3242145757640524e-05, + "loss": 0.4756, + "step": 2514 + }, + { + "epoch": 1.608299352362677, + "grad_norm": 2.4901797771453857, + "learning_rate": 2.323145971361402e-05, + "loss": 0.5385, + "step": 2515 + }, + { + "epoch": 1.6089389941632686, + "grad_norm": 2.246525287628174, + "learning_rate": 
2.322077366958752e-05, + "loss": 0.5141, + "step": 2516 + }, + { + "epoch": 1.6095786359638602, + "grad_norm": 2.166330575942993, + "learning_rate": 2.321008762556102e-05, + "loss": 0.5059, + "step": 2517 + }, + { + "epoch": 1.610218277764452, + "grad_norm": 2.4365246295928955, + "learning_rate": 2.3199401581534515e-05, + "loss": 0.5342, + "step": 2518 + }, + { + "epoch": 1.6108579195650434, + "grad_norm": 2.8213133811950684, + "learning_rate": 2.3188715537508014e-05, + "loss": 0.5542, + "step": 2519 + }, + { + "epoch": 1.6114975613656353, + "grad_norm": 2.5161750316619873, + "learning_rate": 2.3178029493481514e-05, + "loss": 0.5332, + "step": 2520 + }, + { + "epoch": 1.6121372031662269, + "grad_norm": 2.147672176361084, + "learning_rate": 2.3167343449455013e-05, + "loss": 0.4664, + "step": 2521 + }, + { + "epoch": 1.6127768449668185, + "grad_norm": 2.2341268062591553, + "learning_rate": 2.3156657405428512e-05, + "loss": 0.4846, + "step": 2522 + }, + { + "epoch": 1.6134164867674103, + "grad_norm": 2.222297430038452, + "learning_rate": 2.3145971361402012e-05, + "loss": 0.4714, + "step": 2523 + }, + { + "epoch": 1.614056128568002, + "grad_norm": 2.679208517074585, + "learning_rate": 2.3135285317375508e-05, + "loss": 0.5589, + "step": 2524 + }, + { + "epoch": 1.6146957703685936, + "grad_norm": 2.6643195152282715, + "learning_rate": 2.3124599273349007e-05, + "loss": 0.6067, + "step": 2525 + }, + { + "epoch": 1.6153354121691854, + "grad_norm": 2.6646013259887695, + "learning_rate": 2.3113913229322506e-05, + "loss": 0.5803, + "step": 2526 + }, + { + "epoch": 1.6159750539697768, + "grad_norm": 1.8550915718078613, + "learning_rate": 2.3103227185296006e-05, + "loss": 0.4113, + "step": 2527 + }, + { + "epoch": 1.6166146957703686, + "grad_norm": 2.5979645252227783, + "learning_rate": 2.3092541141269502e-05, + "loss": 0.5713, + "step": 2528 + }, + { + "epoch": 1.6172543375709603, + "grad_norm": 2.0909793376922607, + "learning_rate": 2.3081855097243e-05, + "loss": 0.4413, + "step": 2529 + }, + { + "epoch": 1.6178939793715519, + "grad_norm": 2.5317177772521973, + "learning_rate": 2.30711690532165e-05, + "loss": 0.6021, + "step": 2530 + }, + { + "epoch": 1.6185336211721437, + "grad_norm": 2.742863416671753, + "learning_rate": 2.3060483009189996e-05, + "loss": 0.6303, + "step": 2531 + }, + { + "epoch": 1.6191732629727351, + "grad_norm": 2.336408853530884, + "learning_rate": 2.30497969651635e-05, + "loss": 0.5161, + "step": 2532 + }, + { + "epoch": 1.619812904773327, + "grad_norm": 2.3802170753479004, + "learning_rate": 2.3039110921137e-05, + "loss": 0.5682, + "step": 2533 + }, + { + "epoch": 1.6204525465739186, + "grad_norm": 2.4165353775024414, + "learning_rate": 2.3028424877110495e-05, + "loss": 0.4955, + "step": 2534 + }, + { + "epoch": 1.6210921883745102, + "grad_norm": 2.000215768814087, + "learning_rate": 2.3017738833083994e-05, + "loss": 0.4386, + "step": 2535 + }, + { + "epoch": 1.621731830175102, + "grad_norm": 2.3797097206115723, + "learning_rate": 2.3007052789057493e-05, + "loss": 0.5436, + "step": 2536 + }, + { + "epoch": 1.6223714719756936, + "grad_norm": 2.0173850059509277, + "learning_rate": 2.299636674503099e-05, + "loss": 0.4529, + "step": 2537 + }, + { + "epoch": 1.6230111137762853, + "grad_norm": 2.5582783222198486, + "learning_rate": 2.298568070100449e-05, + "loss": 0.5541, + "step": 2538 + }, + { + "epoch": 1.623650755576877, + "grad_norm": 2.180779457092285, + "learning_rate": 2.2974994656977988e-05, + "loss": 0.4889, + "step": 2539 + }, + { + "epoch": 1.6242903973774685, + 
"grad_norm": 2.3111088275909424, + "learning_rate": 2.2964308612951487e-05, + "loss": 0.5305, + "step": 2540 + }, + { + "epoch": 1.6249300391780603, + "grad_norm": 2.559722661972046, + "learning_rate": 2.2953622568924983e-05, + "loss": 0.5528, + "step": 2541 + }, + { + "epoch": 1.625569680978652, + "grad_norm": 2.272745370864868, + "learning_rate": 2.2942936524898483e-05, + "loss": 0.5074, + "step": 2542 + }, + { + "epoch": 1.6262093227792436, + "grad_norm": 2.2575020790100098, + "learning_rate": 2.2932250480871982e-05, + "loss": 0.5299, + "step": 2543 + }, + { + "epoch": 1.6268489645798354, + "grad_norm": 2.011913537979126, + "learning_rate": 2.292156443684548e-05, + "loss": 0.4872, + "step": 2544 + }, + { + "epoch": 1.6274886063804268, + "grad_norm": 2.4279534816741943, + "learning_rate": 2.291087839281898e-05, + "loss": 0.5225, + "step": 2545 + }, + { + "epoch": 1.6281282481810186, + "grad_norm": 2.4316117763519287, + "learning_rate": 2.290019234879248e-05, + "loss": 0.5218, + "step": 2546 + }, + { + "epoch": 1.6287678899816103, + "grad_norm": 2.136850357055664, + "learning_rate": 2.2889506304765976e-05, + "loss": 0.464, + "step": 2547 + }, + { + "epoch": 1.6294075317822019, + "grad_norm": 2.725249767303467, + "learning_rate": 2.2878820260739475e-05, + "loss": 0.5861, + "step": 2548 + }, + { + "epoch": 1.6300471735827937, + "grad_norm": 2.4032304286956787, + "learning_rate": 2.2868134216712975e-05, + "loss": 0.5284, + "step": 2549 + }, + { + "epoch": 1.6306868153833853, + "grad_norm": 2.399120807647705, + "learning_rate": 2.285744817268647e-05, + "loss": 0.5152, + "step": 2550 + }, + { + "epoch": 1.631326457183977, + "grad_norm": 2.2249791622161865, + "learning_rate": 2.284676212865997e-05, + "loss": 0.4273, + "step": 2551 + }, + { + "epoch": 1.6319660989845688, + "grad_norm": 2.3641135692596436, + "learning_rate": 2.283607608463347e-05, + "loss": 0.5114, + "step": 2552 + }, + { + "epoch": 1.6326057407851602, + "grad_norm": 2.41913104057312, + "learning_rate": 2.2825390040606965e-05, + "loss": 0.5252, + "step": 2553 + }, + { + "epoch": 1.633245382585752, + "grad_norm": 2.295541763305664, + "learning_rate": 2.2814703996580465e-05, + "loss": 0.4923, + "step": 2554 + }, + { + "epoch": 1.6338850243863436, + "grad_norm": 2.612786293029785, + "learning_rate": 2.2804017952553968e-05, + "loss": 0.546, + "step": 2555 + }, + { + "epoch": 1.6345246661869353, + "grad_norm": 2.3593502044677734, + "learning_rate": 2.2793331908527467e-05, + "loss": 0.5378, + "step": 2556 + }, + { + "epoch": 1.635164307987527, + "grad_norm": 2.059650182723999, + "learning_rate": 2.2782645864500963e-05, + "loss": 0.4711, + "step": 2557 + }, + { + "epoch": 1.6358039497881185, + "grad_norm": 2.145604133605957, + "learning_rate": 2.2771959820474462e-05, + "loss": 0.4969, + "step": 2558 + }, + { + "epoch": 1.6364435915887103, + "grad_norm": 2.426478147506714, + "learning_rate": 2.276127377644796e-05, + "loss": 0.5175, + "step": 2559 + }, + { + "epoch": 1.637083233389302, + "grad_norm": 2.0644280910491943, + "learning_rate": 2.2750587732421458e-05, + "loss": 0.4738, + "step": 2560 + }, + { + "epoch": 1.6377228751898936, + "grad_norm": 2.0993101596832275, + "learning_rate": 2.2739901688394957e-05, + "loss": 0.4775, + "step": 2561 + }, + { + "epoch": 1.6383625169904854, + "grad_norm": 2.2274608612060547, + "learning_rate": 2.2729215644368456e-05, + "loss": 0.513, + "step": 2562 + }, + { + "epoch": 1.639002158791077, + "grad_norm": 2.3448235988616943, + "learning_rate": 2.2718529600341952e-05, + "loss": 0.5602, + "step": 2563 + 
}, + { + "epoch": 1.6396418005916686, + "grad_norm": 2.2966411113739014, + "learning_rate": 2.2707843556315452e-05, + "loss": 0.4986, + "step": 2564 + }, + { + "epoch": 1.6402814423922605, + "grad_norm": 2.4774394035339355, + "learning_rate": 2.269715751228895e-05, + "loss": 0.5447, + "step": 2565 + }, + { + "epoch": 1.6409210841928519, + "grad_norm": 2.184979200363159, + "learning_rate": 2.268647146826245e-05, + "loss": 0.4976, + "step": 2566 + }, + { + "epoch": 1.6415607259934437, + "grad_norm": 2.3675427436828613, + "learning_rate": 2.267578542423595e-05, + "loss": 0.4856, + "step": 2567 + }, + { + "epoch": 1.6422003677940353, + "grad_norm": 1.8464293479919434, + "learning_rate": 2.266509938020945e-05, + "loss": 0.4123, + "step": 2568 + }, + { + "epoch": 1.642840009594627, + "grad_norm": 2.4367034435272217, + "learning_rate": 2.265441333618295e-05, + "loss": 0.5033, + "step": 2569 + }, + { + "epoch": 1.6434796513952188, + "grad_norm": 2.310029983520508, + "learning_rate": 2.2643727292156444e-05, + "loss": 0.4984, + "step": 2570 + }, + { + "epoch": 1.6441192931958102, + "grad_norm": 2.379513740539551, + "learning_rate": 2.2633041248129944e-05, + "loss": 0.5556, + "step": 2571 + }, + { + "epoch": 1.644758934996402, + "grad_norm": 2.559941291809082, + "learning_rate": 2.2622355204103443e-05, + "loss": 0.5274, + "step": 2572 + }, + { + "epoch": 1.6453985767969936, + "grad_norm": 2.173079252243042, + "learning_rate": 2.261166916007694e-05, + "loss": 0.4719, + "step": 2573 + }, + { + "epoch": 1.6460382185975853, + "grad_norm": 2.2452356815338135, + "learning_rate": 2.260098311605044e-05, + "loss": 0.5161, + "step": 2574 + }, + { + "epoch": 1.646677860398177, + "grad_norm": 2.241701602935791, + "learning_rate": 2.2590297072023938e-05, + "loss": 0.5116, + "step": 2575 + }, + { + "epoch": 1.6473175021987687, + "grad_norm": 2.3983466625213623, + "learning_rate": 2.2579611027997434e-05, + "loss": 0.5111, + "step": 2576 + }, + { + "epoch": 1.6479571439993603, + "grad_norm": 1.8889271020889282, + "learning_rate": 2.2568924983970933e-05, + "loss": 0.4058, + "step": 2577 + }, + { + "epoch": 1.6485967857999522, + "grad_norm": 2.5848183631896973, + "learning_rate": 2.2558238939944436e-05, + "loss": 0.5651, + "step": 2578 + }, + { + "epoch": 1.6492364276005436, + "grad_norm": 2.315823554992676, + "learning_rate": 2.2547552895917932e-05, + "loss": 0.5282, + "step": 2579 + }, + { + "epoch": 1.6498760694011354, + "grad_norm": 2.0703957080841064, + "learning_rate": 2.253686685189143e-05, + "loss": 0.4409, + "step": 2580 + }, + { + "epoch": 1.650515711201727, + "grad_norm": 2.217512607574463, + "learning_rate": 2.252618080786493e-05, + "loss": 0.4882, + "step": 2581 + }, + { + "epoch": 1.6511553530023186, + "grad_norm": 2.308084487915039, + "learning_rate": 2.2515494763838427e-05, + "loss": 0.483, + "step": 2582 + }, + { + "epoch": 1.6517949948029105, + "grad_norm": 2.424596071243286, + "learning_rate": 2.2504808719811926e-05, + "loss": 0.5044, + "step": 2583 + }, + { + "epoch": 1.652434636603502, + "grad_norm": 2.650787830352783, + "learning_rate": 2.2494122675785425e-05, + "loss": 0.5302, + "step": 2584 + }, + { + "epoch": 1.6530742784040937, + "grad_norm": 2.6999995708465576, + "learning_rate": 2.2483436631758925e-05, + "loss": 0.5953, + "step": 2585 + }, + { + "epoch": 1.6537139202046853, + "grad_norm": 2.433746337890625, + "learning_rate": 2.247275058773242e-05, + "loss": 0.5218, + "step": 2586 + }, + { + "epoch": 1.654353562005277, + "grad_norm": 2.474039077758789, + "learning_rate": 
2.246206454370592e-05, + "loss": 0.5407, + "step": 2587 + }, + { + "epoch": 1.6549932038058688, + "grad_norm": 2.5290632247924805, + "learning_rate": 2.245137849967942e-05, + "loss": 0.5568, + "step": 2588 + }, + { + "epoch": 1.6556328456064604, + "grad_norm": 2.3100783824920654, + "learning_rate": 2.244069245565292e-05, + "loss": 0.4886, + "step": 2589 + }, + { + "epoch": 1.656272487407052, + "grad_norm": 2.191134452819824, + "learning_rate": 2.2430006411626418e-05, + "loss": 0.4679, + "step": 2590 + }, + { + "epoch": 1.6569121292076439, + "grad_norm": 2.317922592163086, + "learning_rate": 2.2419320367599918e-05, + "loss": 0.5093, + "step": 2591 + }, + { + "epoch": 1.6575517710082353, + "grad_norm": 2.3259685039520264, + "learning_rate": 2.2408634323573414e-05, + "loss": 0.4974, + "step": 2592 + }, + { + "epoch": 1.658191412808827, + "grad_norm": 2.282165765762329, + "learning_rate": 2.2397948279546913e-05, + "loss": 0.5191, + "step": 2593 + }, + { + "epoch": 1.6588310546094187, + "grad_norm": 2.230090379714966, + "learning_rate": 2.2387262235520412e-05, + "loss": 0.4937, + "step": 2594 + }, + { + "epoch": 1.6594706964100103, + "grad_norm": 2.2409491539001465, + "learning_rate": 2.2376576191493908e-05, + "loss": 0.5004, + "step": 2595 + }, + { + "epoch": 1.6601103382106022, + "grad_norm": 2.1581904888153076, + "learning_rate": 2.2365890147467408e-05, + "loss": 0.4849, + "step": 2596 + }, + { + "epoch": 1.6607499800111938, + "grad_norm": 2.3513121604919434, + "learning_rate": 2.2355204103440907e-05, + "loss": 0.5291, + "step": 2597 + }, + { + "epoch": 1.6613896218117854, + "grad_norm": 2.3284194469451904, + "learning_rate": 2.2344518059414406e-05, + "loss": 0.506, + "step": 2598 + }, + { + "epoch": 1.662029263612377, + "grad_norm": 2.3850560188293457, + "learning_rate": 2.2333832015387902e-05, + "loss": 0.5509, + "step": 2599 + }, + { + "epoch": 1.6626689054129686, + "grad_norm": 2.166862726211548, + "learning_rate": 2.23231459713614e-05, + "loss": 0.481, + "step": 2600 + }, + { + "epoch": 1.6633085472135605, + "grad_norm": 2.1800949573516846, + "learning_rate": 2.2312459927334904e-05, + "loss": 0.4643, + "step": 2601 + }, + { + "epoch": 1.663948189014152, + "grad_norm": 1.9909067153930664, + "learning_rate": 2.23017738833084e-05, + "loss": 0.4281, + "step": 2602 + }, + { + "epoch": 1.6645878308147437, + "grad_norm": 2.312584400177002, + "learning_rate": 2.22910878392819e-05, + "loss": 0.5142, + "step": 2603 + }, + { + "epoch": 1.6652274726153355, + "grad_norm": 2.3507843017578125, + "learning_rate": 2.22804017952554e-05, + "loss": 0.5246, + "step": 2604 + }, + { + "epoch": 1.665867114415927, + "grad_norm": 2.3416669368743896, + "learning_rate": 2.2269715751228895e-05, + "loss": 0.5173, + "step": 2605 + }, + { + "epoch": 1.6665067562165188, + "grad_norm": 2.5131280422210693, + "learning_rate": 2.2259029707202394e-05, + "loss": 0.61, + "step": 2606 + }, + { + "epoch": 1.6671463980171104, + "grad_norm": 2.2798163890838623, + "learning_rate": 2.2248343663175894e-05, + "loss": 0.5147, + "step": 2607 + }, + { + "epoch": 1.667786039817702, + "grad_norm": 2.3884902000427246, + "learning_rate": 2.223765761914939e-05, + "loss": 0.5237, + "step": 2608 + }, + { + "epoch": 1.6684256816182939, + "grad_norm": 2.1361193656921387, + "learning_rate": 2.222697157512289e-05, + "loss": 0.5225, + "step": 2609 + }, + { + "epoch": 1.6690653234188855, + "grad_norm": 2.3209760189056396, + "learning_rate": 2.221628553109639e-05, + "loss": 0.5232, + "step": 2610 + }, + { + "epoch": 1.669704965219477, + "grad_norm": 
2.5959839820861816, + "learning_rate": 2.2205599487069888e-05, + "loss": 0.5796, + "step": 2611 + }, + { + "epoch": 1.670344607020069, + "grad_norm": 2.0414974689483643, + "learning_rate": 2.2194913443043387e-05, + "loss": 0.4931, + "step": 2612 + }, + { + "epoch": 1.6709842488206603, + "grad_norm": 2.2084851264953613, + "learning_rate": 2.2184227399016887e-05, + "loss": 0.503, + "step": 2613 + }, + { + "epoch": 1.6716238906212522, + "grad_norm": 2.4800655841827393, + "learning_rate": 2.2173541354990386e-05, + "loss": 0.5165, + "step": 2614 + }, + { + "epoch": 1.6722635324218438, + "grad_norm": 2.23939847946167, + "learning_rate": 2.2162855310963882e-05, + "loss": 0.5291, + "step": 2615 + }, + { + "epoch": 1.6729031742224354, + "grad_norm": 2.307274103164673, + "learning_rate": 2.215216926693738e-05, + "loss": 0.5308, + "step": 2616 + }, + { + "epoch": 1.6735428160230272, + "grad_norm": 2.3671650886535645, + "learning_rate": 2.214148322291088e-05, + "loss": 0.5045, + "step": 2617 + }, + { + "epoch": 1.6741824578236186, + "grad_norm": 1.9408921003341675, + "learning_rate": 2.2130797178884377e-05, + "loss": 0.433, + "step": 2618 + }, + { + "epoch": 1.6748220996242105, + "grad_norm": 2.1856043338775635, + "learning_rate": 2.2120111134857876e-05, + "loss": 0.5002, + "step": 2619 + }, + { + "epoch": 1.675461741424802, + "grad_norm": 2.113482713699341, + "learning_rate": 2.2109425090831375e-05, + "loss": 0.5113, + "step": 2620 + }, + { + "epoch": 1.6761013832253937, + "grad_norm": 2.0282042026519775, + "learning_rate": 2.209873904680487e-05, + "loss": 0.4542, + "step": 2621 + }, + { + "epoch": 1.6767410250259855, + "grad_norm": 2.4339025020599365, + "learning_rate": 2.208805300277837e-05, + "loss": 0.6055, + "step": 2622 + }, + { + "epoch": 1.6773806668265772, + "grad_norm": 2.0931270122528076, + "learning_rate": 2.207736695875187e-05, + "loss": 0.4387, + "step": 2623 + }, + { + "epoch": 1.6780203086271688, + "grad_norm": 2.082490921020508, + "learning_rate": 2.206668091472537e-05, + "loss": 0.4739, + "step": 2624 + }, + { + "epoch": 1.6786599504277606, + "grad_norm": 2.464775562286377, + "learning_rate": 2.205599487069887e-05, + "loss": 0.5754, + "step": 2625 + }, + { + "epoch": 1.679299592228352, + "grad_norm": 2.403154134750366, + "learning_rate": 2.2045308826672368e-05, + "loss": 0.4517, + "step": 2626 + }, + { + "epoch": 1.6799392340289439, + "grad_norm": 2.4314749240875244, + "learning_rate": 2.2034622782645867e-05, + "loss": 0.5536, + "step": 2627 + }, + { + "epoch": 1.6805788758295355, + "grad_norm": 2.4847919940948486, + "learning_rate": 2.2023936738619363e-05, + "loss": 0.5423, + "step": 2628 + }, + { + "epoch": 1.681218517630127, + "grad_norm": 2.0763776302337646, + "learning_rate": 2.2013250694592863e-05, + "loss": 0.4765, + "step": 2629 + }, + { + "epoch": 1.681858159430719, + "grad_norm": 2.3855700492858887, + "learning_rate": 2.2002564650566362e-05, + "loss": 0.5392, + "step": 2630 + }, + { + "epoch": 1.6824978012313103, + "grad_norm": 2.0628979206085205, + "learning_rate": 2.1991878606539858e-05, + "loss": 0.4635, + "step": 2631 + }, + { + "epoch": 1.6831374430319022, + "grad_norm": 1.7442479133605957, + "learning_rate": 2.1981192562513357e-05, + "loss": 0.3898, + "step": 2632 + }, + { + "epoch": 1.6837770848324938, + "grad_norm": 2.3893167972564697, + "learning_rate": 2.1970506518486857e-05, + "loss": 0.4624, + "step": 2633 + }, + { + "epoch": 1.6844167266330854, + "grad_norm": 2.4031567573547363, + "learning_rate": 2.1959820474460356e-05, + "loss": 0.5156, + "step": 2634 + }, + 
{ + "epoch": 1.6850563684336772, + "grad_norm": 2.734078884124756, + "learning_rate": 2.1949134430433856e-05, + "loss": 0.582, + "step": 2635 + }, + { + "epoch": 1.6856960102342688, + "grad_norm": 2.3971729278564453, + "learning_rate": 2.1938448386407355e-05, + "loss": 0.5392, + "step": 2636 + }, + { + "epoch": 1.6863356520348605, + "grad_norm": 2.26690936088562, + "learning_rate": 2.192776234238085e-05, + "loss": 0.5037, + "step": 2637 + }, + { + "epoch": 1.6869752938354523, + "grad_norm": 2.397167444229126, + "learning_rate": 2.191707629835435e-05, + "loss": 0.527, + "step": 2638 + }, + { + "epoch": 1.6876149356360437, + "grad_norm": 2.739813804626465, + "learning_rate": 2.190639025432785e-05, + "loss": 0.585, + "step": 2639 + }, + { + "epoch": 1.6882545774366355, + "grad_norm": 2.084416389465332, + "learning_rate": 2.189570421030135e-05, + "loss": 0.4488, + "step": 2640 + }, + { + "epoch": 1.6888942192372272, + "grad_norm": 2.519582986831665, + "learning_rate": 2.1885018166274845e-05, + "loss": 0.5626, + "step": 2641 + }, + { + "epoch": 1.6895338610378188, + "grad_norm": 2.360438108444214, + "learning_rate": 2.1874332122248344e-05, + "loss": 0.5438, + "step": 2642 + }, + { + "epoch": 1.6901735028384106, + "grad_norm": 2.7401225566864014, + "learning_rate": 2.1863646078221844e-05, + "loss": 0.5803, + "step": 2643 + }, + { + "epoch": 1.690813144639002, + "grad_norm": 2.3514299392700195, + "learning_rate": 2.185296003419534e-05, + "loss": 0.4884, + "step": 2644 + }, + { + "epoch": 1.6914527864395938, + "grad_norm": 2.3277571201324463, + "learning_rate": 2.184227399016884e-05, + "loss": 0.5094, + "step": 2645 + }, + { + "epoch": 1.6920924282401855, + "grad_norm": 2.3622419834136963, + "learning_rate": 2.183158794614234e-05, + "loss": 0.5348, + "step": 2646 + }, + { + "epoch": 1.692732070040777, + "grad_norm": 2.3978912830352783, + "learning_rate": 2.1820901902115838e-05, + "loss": 0.5384, + "step": 2647 + }, + { + "epoch": 1.693371711841369, + "grad_norm": 2.294590950012207, + "learning_rate": 2.1810215858089337e-05, + "loss": 0.5264, + "step": 2648 + }, + { + "epoch": 1.6940113536419605, + "grad_norm": 2.33931303024292, + "learning_rate": 2.1799529814062836e-05, + "loss": 0.5296, + "step": 2649 + }, + { + "epoch": 1.6946509954425522, + "grad_norm": 2.219364881515503, + "learning_rate": 2.1788843770036332e-05, + "loss": 0.5125, + "step": 2650 + }, + { + "epoch": 1.695290637243144, + "grad_norm": 2.0647964477539062, + "learning_rate": 2.1778157726009832e-05, + "loss": 0.427, + "step": 2651 + }, + { + "epoch": 1.6959302790437354, + "grad_norm": 2.1907999515533447, + "learning_rate": 2.176747168198333e-05, + "loss": 0.4691, + "step": 2652 + }, + { + "epoch": 1.6965699208443272, + "grad_norm": 2.2468862533569336, + "learning_rate": 2.175678563795683e-05, + "loss": 0.4703, + "step": 2653 + }, + { + "epoch": 1.6972095626449188, + "grad_norm": 2.134443759918213, + "learning_rate": 2.1746099593930327e-05, + "loss": 0.4764, + "step": 2654 + }, + { + "epoch": 1.6978492044455105, + "grad_norm": 2.3065402507781982, + "learning_rate": 2.1735413549903826e-05, + "loss": 0.4695, + "step": 2655 + }, + { + "epoch": 1.6984888462461023, + "grad_norm": 2.2199900150299072, + "learning_rate": 2.1724727505877325e-05, + "loss": 0.4493, + "step": 2656 + }, + { + "epoch": 1.6991284880466937, + "grad_norm": 2.345433235168457, + "learning_rate": 2.1714041461850825e-05, + "loss": 0.5201, + "step": 2657 + }, + { + "epoch": 1.6997681298472855, + "grad_norm": 2.6745223999023438, + "learning_rate": 2.1703355417824324e-05, + 
"loss": 0.5153, + "step": 2658 + }, + { + "epoch": 1.7004077716478772, + "grad_norm": 2.2623071670532227, + "learning_rate": 2.1692669373797823e-05, + "loss": 0.5347, + "step": 2659 + }, + { + "epoch": 1.7010474134484688, + "grad_norm": 2.07926082611084, + "learning_rate": 2.168198332977132e-05, + "loss": 0.437, + "step": 2660 + }, + { + "epoch": 1.7016870552490606, + "grad_norm": 1.9367485046386719, + "learning_rate": 2.167129728574482e-05, + "loss": 0.4556, + "step": 2661 + }, + { + "epoch": 1.7023266970496522, + "grad_norm": 2.360860586166382, + "learning_rate": 2.1660611241718318e-05, + "loss": 0.555, + "step": 2662 + }, + { + "epoch": 1.7029663388502438, + "grad_norm": 2.382340908050537, + "learning_rate": 2.1649925197691814e-05, + "loss": 0.4895, + "step": 2663 + }, + { + "epoch": 1.7036059806508357, + "grad_norm": 2.5638585090637207, + "learning_rate": 2.1639239153665313e-05, + "loss": 0.563, + "step": 2664 + }, + { + "epoch": 1.704245622451427, + "grad_norm": 2.3636674880981445, + "learning_rate": 2.1628553109638813e-05, + "loss": 0.5476, + "step": 2665 + }, + { + "epoch": 1.704885264252019, + "grad_norm": 2.633293628692627, + "learning_rate": 2.161786706561231e-05, + "loss": 0.6173, + "step": 2666 + }, + { + "epoch": 1.7055249060526105, + "grad_norm": 2.334656000137329, + "learning_rate": 2.1607181021585808e-05, + "loss": 0.5153, + "step": 2667 + }, + { + "epoch": 1.7061645478532022, + "grad_norm": 2.401369571685791, + "learning_rate": 2.1596494977559307e-05, + "loss": 0.5303, + "step": 2668 + }, + { + "epoch": 1.706804189653794, + "grad_norm": 2.0251777172088623, + "learning_rate": 2.1585808933532807e-05, + "loss": 0.4164, + "step": 2669 + }, + { + "epoch": 1.7074438314543854, + "grad_norm": 2.3030428886413574, + "learning_rate": 2.1575122889506306e-05, + "loss": 0.4798, + "step": 2670 + }, + { + "epoch": 1.7080834732549772, + "grad_norm": 2.1975905895233154, + "learning_rate": 2.1564436845479806e-05, + "loss": 0.4875, + "step": 2671 + }, + { + "epoch": 1.7087231150555688, + "grad_norm": 2.6817076206207275, + "learning_rate": 2.1553750801453305e-05, + "loss": 0.4836, + "step": 2672 + }, + { + "epoch": 1.7093627568561605, + "grad_norm": 2.809296131134033, + "learning_rate": 2.15430647574268e-05, + "loss": 0.5827, + "step": 2673 + }, + { + "epoch": 1.7100023986567523, + "grad_norm": 2.5263898372650146, + "learning_rate": 2.15323787134003e-05, + "loss": 0.5547, + "step": 2674 + }, + { + "epoch": 1.710642040457344, + "grad_norm": 1.942217230796814, + "learning_rate": 2.15216926693738e-05, + "loss": 0.3903, + "step": 2675 + }, + { + "epoch": 1.7112816822579355, + "grad_norm": 2.0551159381866455, + "learning_rate": 2.1511006625347296e-05, + "loss": 0.4718, + "step": 2676 + }, + { + "epoch": 1.7119213240585274, + "grad_norm": 2.2917423248291016, + "learning_rate": 2.1500320581320795e-05, + "loss": 0.5013, + "step": 2677 + }, + { + "epoch": 1.7125609658591188, + "grad_norm": 2.160872220993042, + "learning_rate": 2.1489634537294294e-05, + "loss": 0.5068, + "step": 2678 + }, + { + "epoch": 1.7132006076597106, + "grad_norm": 2.2712011337280273, + "learning_rate": 2.1478948493267794e-05, + "loss": 0.5293, + "step": 2679 + }, + { + "epoch": 1.7138402494603022, + "grad_norm": 2.1471195220947266, + "learning_rate": 2.1468262449241293e-05, + "loss": 0.5069, + "step": 2680 + }, + { + "epoch": 1.7144798912608938, + "grad_norm": 2.3133046627044678, + "learning_rate": 2.1457576405214792e-05, + "loss": 0.5018, + "step": 2681 + }, + { + "epoch": 1.7151195330614857, + "grad_norm": 2.4129765033721924, + 
"learning_rate": 2.1446890361188292e-05, + "loss": 0.5156, + "step": 2682 + }, + { + "epoch": 1.715759174862077, + "grad_norm": 2.685666084289551, + "learning_rate": 2.1436204317161788e-05, + "loss": 0.5877, + "step": 2683 + }, + { + "epoch": 1.716398816662669, + "grad_norm": 2.536916971206665, + "learning_rate": 2.1425518273135287e-05, + "loss": 0.5649, + "step": 2684 + }, + { + "epoch": 1.7170384584632605, + "grad_norm": 2.58223819732666, + "learning_rate": 2.1414832229108786e-05, + "loss": 0.5849, + "step": 2685 + }, + { + "epoch": 1.7176781002638521, + "grad_norm": 2.4905548095703125, + "learning_rate": 2.1404146185082282e-05, + "loss": 0.5486, + "step": 2686 + }, + { + "epoch": 1.718317742064444, + "grad_norm": 2.3483517169952393, + "learning_rate": 2.1393460141055782e-05, + "loss": 0.5212, + "step": 2687 + }, + { + "epoch": 1.7189573838650356, + "grad_norm": 2.4959635734558105, + "learning_rate": 2.138277409702928e-05, + "loss": 0.5784, + "step": 2688 + }, + { + "epoch": 1.7195970256656272, + "grad_norm": 2.4645330905914307, + "learning_rate": 2.1372088053002777e-05, + "loss": 0.5065, + "step": 2689 + }, + { + "epoch": 1.720236667466219, + "grad_norm": 2.55702543258667, + "learning_rate": 2.1361402008976276e-05, + "loss": 0.5177, + "step": 2690 + }, + { + "epoch": 1.7208763092668105, + "grad_norm": 2.447925329208374, + "learning_rate": 2.1350715964949776e-05, + "loss": 0.5278, + "step": 2691 + }, + { + "epoch": 1.7215159510674023, + "grad_norm": 2.4395155906677246, + "learning_rate": 2.1340029920923275e-05, + "loss": 0.5456, + "step": 2692 + }, + { + "epoch": 1.722155592867994, + "grad_norm": 2.313857316970825, + "learning_rate": 2.1329343876896775e-05, + "loss": 0.5294, + "step": 2693 + }, + { + "epoch": 1.7227952346685855, + "grad_norm": 2.389639377593994, + "learning_rate": 2.1318657832870274e-05, + "loss": 0.5167, + "step": 2694 + }, + { + "epoch": 1.7234348764691774, + "grad_norm": 2.291781425476074, + "learning_rate": 2.1307971788843773e-05, + "loss": 0.5264, + "step": 2695 + }, + { + "epoch": 1.7240745182697688, + "grad_norm": 2.516266345977783, + "learning_rate": 2.129728574481727e-05, + "loss": 0.5836, + "step": 2696 + }, + { + "epoch": 1.7247141600703606, + "grad_norm": 2.0757901668548584, + "learning_rate": 2.128659970079077e-05, + "loss": 0.4871, + "step": 2697 + }, + { + "epoch": 1.7253538018709522, + "grad_norm": 2.7365918159484863, + "learning_rate": 2.1275913656764268e-05, + "loss": 0.5389, + "step": 2698 + }, + { + "epoch": 1.7259934436715438, + "grad_norm": 2.2034716606140137, + "learning_rate": 2.1265227612737764e-05, + "loss": 0.5078, + "step": 2699 + }, + { + "epoch": 1.7266330854721357, + "grad_norm": 1.9844567775726318, + "learning_rate": 2.1254541568711263e-05, + "loss": 0.4717, + "step": 2700 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 2.068657875061035, + "learning_rate": 2.1243855524684763e-05, + "loss": 0.4478, + "step": 2701 + }, + { + "epoch": 1.727912369073319, + "grad_norm": 2.292952537536621, + "learning_rate": 2.1233169480658262e-05, + "loss": 0.5104, + "step": 2702 + }, + { + "epoch": 1.7285520108739107, + "grad_norm": 2.1269376277923584, + "learning_rate": 2.122248343663176e-05, + "loss": 0.4561, + "step": 2703 + }, + { + "epoch": 1.7291916526745021, + "grad_norm": 2.290296792984009, + "learning_rate": 2.121179739260526e-05, + "loss": 0.5308, + "step": 2704 + }, + { + "epoch": 1.729831294475094, + "grad_norm": 2.1942138671875, + "learning_rate": 2.1201111348578757e-05, + "loss": 0.4779, + "step": 2705 + }, + { + "epoch": 
1.7304709362756856, + "grad_norm": 2.4728095531463623, + "learning_rate": 2.1190425304552256e-05, + "loss": 0.5806, + "step": 2706 + }, + { + "epoch": 1.7311105780762772, + "grad_norm": 2.411494493484497, + "learning_rate": 2.1179739260525755e-05, + "loss": 0.5074, + "step": 2707 + }, + { + "epoch": 1.731750219876869, + "grad_norm": 2.0176281929016113, + "learning_rate": 2.116905321649925e-05, + "loss": 0.4025, + "step": 2708 + }, + { + "epoch": 1.7323898616774607, + "grad_norm": 2.5532777309417725, + "learning_rate": 2.115836717247275e-05, + "loss": 0.4898, + "step": 2709 + }, + { + "epoch": 1.7330295034780523, + "grad_norm": 2.271014928817749, + "learning_rate": 2.114768112844625e-05, + "loss": 0.4581, + "step": 2710 + }, + { + "epoch": 1.733669145278644, + "grad_norm": 2.446220636367798, + "learning_rate": 2.113699508441975e-05, + "loss": 0.4832, + "step": 2711 + }, + { + "epoch": 1.7343087870792355, + "grad_norm": 2.006302833557129, + "learning_rate": 2.1126309040393245e-05, + "loss": 0.4575, + "step": 2712 + }, + { + "epoch": 1.7349484288798274, + "grad_norm": 2.2831568717956543, + "learning_rate": 2.1115622996366745e-05, + "loss": 0.4792, + "step": 2713 + }, + { + "epoch": 1.735588070680419, + "grad_norm": 2.3639678955078125, + "learning_rate": 2.1104936952340244e-05, + "loss": 0.487, + "step": 2714 + }, + { + "epoch": 1.7362277124810106, + "grad_norm": 2.579700231552124, + "learning_rate": 2.1094250908313744e-05, + "loss": 0.5431, + "step": 2715 + }, + { + "epoch": 1.7368673542816024, + "grad_norm": 2.3579726219177246, + "learning_rate": 2.1083564864287243e-05, + "loss": 0.5355, + "step": 2716 + }, + { + "epoch": 1.7375069960821938, + "grad_norm": 2.5764262676239014, + "learning_rate": 2.1072878820260742e-05, + "loss": 0.5081, + "step": 2717 + }, + { + "epoch": 1.7381466378827857, + "grad_norm": 2.2579660415649414, + "learning_rate": 2.1062192776234238e-05, + "loss": 0.4355, + "step": 2718 + }, + { + "epoch": 1.7387862796833773, + "grad_norm": 2.340811252593994, + "learning_rate": 2.1051506732207738e-05, + "loss": 0.4853, + "step": 2719 + }, + { + "epoch": 1.739425921483969, + "grad_norm": 2.223498821258545, + "learning_rate": 2.1040820688181237e-05, + "loss": 0.4966, + "step": 2720 + }, + { + "epoch": 1.7400655632845607, + "grad_norm": 2.233729124069214, + "learning_rate": 2.1030134644154733e-05, + "loss": 0.4446, + "step": 2721 + }, + { + "epoch": 1.7407052050851524, + "grad_norm": 2.414647102355957, + "learning_rate": 2.1019448600128232e-05, + "loss": 0.4673, + "step": 2722 + }, + { + "epoch": 1.741344846885744, + "grad_norm": 2.403012752532959, + "learning_rate": 2.100876255610173e-05, + "loss": 0.4835, + "step": 2723 + }, + { + "epoch": 1.7419844886863356, + "grad_norm": 2.7517170906066895, + "learning_rate": 2.099807651207523e-05, + "loss": 0.5839, + "step": 2724 + }, + { + "epoch": 1.7426241304869272, + "grad_norm": 2.5492401123046875, + "learning_rate": 2.098739046804873e-05, + "loss": 0.5046, + "step": 2725 + }, + { + "epoch": 1.743263772287519, + "grad_norm": 2.58609676361084, + "learning_rate": 2.097670442402223e-05, + "loss": 0.5653, + "step": 2726 + }, + { + "epoch": 1.7439034140881107, + "grad_norm": 2.214102268218994, + "learning_rate": 2.096601837999573e-05, + "loss": 0.5014, + "step": 2727 + }, + { + "epoch": 1.7445430558887023, + "grad_norm": 2.146833896636963, + "learning_rate": 2.0955332335969225e-05, + "loss": 0.4724, + "step": 2728 + }, + { + "epoch": 1.7451826976892941, + "grad_norm": 2.0483169555664062, + "learning_rate": 2.0944646291942724e-05, + "loss": 
0.4511, + "step": 2729 + }, + { + "epoch": 1.7458223394898855, + "grad_norm": 2.2580761909484863, + "learning_rate": 2.0933960247916224e-05, + "loss": 0.475, + "step": 2730 + }, + { + "epoch": 1.7464619812904774, + "grad_norm": 2.654022455215454, + "learning_rate": 2.092327420388972e-05, + "loss": 0.5649, + "step": 2731 + }, + { + "epoch": 1.747101623091069, + "grad_norm": 2.297478675842285, + "learning_rate": 2.091258815986322e-05, + "loss": 0.4369, + "step": 2732 + }, + { + "epoch": 1.7477412648916606, + "grad_norm": 2.2317726612091064, + "learning_rate": 2.090190211583672e-05, + "loss": 0.4485, + "step": 2733 + }, + { + "epoch": 1.7483809066922524, + "grad_norm": 2.0698795318603516, + "learning_rate": 2.0891216071810214e-05, + "loss": 0.4651, + "step": 2734 + }, + { + "epoch": 1.749020548492844, + "grad_norm": 2.26355242729187, + "learning_rate": 2.0880530027783714e-05, + "loss": 0.503, + "step": 2735 + }, + { + "epoch": 1.7496601902934357, + "grad_norm": 2.4342384338378906, + "learning_rate": 2.0869843983757213e-05, + "loss": 0.4994, + "step": 2736 + }, + { + "epoch": 1.7502998320940275, + "grad_norm": 2.657132148742676, + "learning_rate": 2.0859157939730713e-05, + "loss": 0.549, + "step": 2737 + }, + { + "epoch": 1.750939473894619, + "grad_norm": 2.6180858612060547, + "learning_rate": 2.0848471895704212e-05, + "loss": 0.5755, + "step": 2738 + }, + { + "epoch": 1.7515791156952107, + "grad_norm": 2.4720981121063232, + "learning_rate": 2.083778585167771e-05, + "loss": 0.5363, + "step": 2739 + }, + { + "epoch": 1.7522187574958024, + "grad_norm": 2.6236979961395264, + "learning_rate": 2.082709980765121e-05, + "loss": 0.5526, + "step": 2740 + }, + { + "epoch": 1.752858399296394, + "grad_norm": 2.3086047172546387, + "learning_rate": 2.0816413763624707e-05, + "loss": 0.5098, + "step": 2741 + }, + { + "epoch": 1.7534980410969858, + "grad_norm": 2.465480327606201, + "learning_rate": 2.0805727719598206e-05, + "loss": 0.5077, + "step": 2742 + }, + { + "epoch": 1.7541376828975772, + "grad_norm": 2.3380141258239746, + "learning_rate": 2.0795041675571705e-05, + "loss": 0.544, + "step": 2743 + }, + { + "epoch": 1.754777324698169, + "grad_norm": 2.066955327987671, + "learning_rate": 2.07843556315452e-05, + "loss": 0.4561, + "step": 2744 + }, + { + "epoch": 1.7554169664987607, + "grad_norm": 2.0033040046691895, + "learning_rate": 2.07736695875187e-05, + "loss": 0.4036, + "step": 2745 + }, + { + "epoch": 1.7560566082993523, + "grad_norm": 2.2523396015167236, + "learning_rate": 2.07629835434922e-05, + "loss": 0.5443, + "step": 2746 + }, + { + "epoch": 1.7566962500999441, + "grad_norm": 2.8302853107452393, + "learning_rate": 2.0752297499465696e-05, + "loss": 0.6189, + "step": 2747 + }, + { + "epoch": 1.7573358919005357, + "grad_norm": 2.0967698097229004, + "learning_rate": 2.07416114554392e-05, + "loss": 0.4722, + "step": 2748 + }, + { + "epoch": 1.7579755337011274, + "grad_norm": 2.1303067207336426, + "learning_rate": 2.0730925411412698e-05, + "loss": 0.4575, + "step": 2749 + }, + { + "epoch": 1.7586151755017192, + "grad_norm": 2.3548195362091064, + "learning_rate": 2.0720239367386194e-05, + "loss": 0.523, + "step": 2750 + }, + { + "epoch": 1.7592548173023106, + "grad_norm": 2.5518958568573, + "learning_rate": 2.0709553323359693e-05, + "loss": 0.5023, + "step": 2751 + }, + { + "epoch": 1.7598944591029024, + "grad_norm": 2.3645482063293457, + "learning_rate": 2.0698867279333193e-05, + "loss": 0.5029, + "step": 2752 + }, + { + "epoch": 1.760534100903494, + "grad_norm": 2.020030975341797, + "learning_rate": 
2.0688181235306692e-05, + "loss": 0.4404, + "step": 2753 + }, + { + "epoch": 1.7611737427040857, + "grad_norm": 2.188493251800537, + "learning_rate": 2.0677495191280188e-05, + "loss": 0.5262, + "step": 2754 + }, + { + "epoch": 1.7618133845046775, + "grad_norm": 2.2824320793151855, + "learning_rate": 2.0666809147253688e-05, + "loss": 0.491, + "step": 2755 + }, + { + "epoch": 1.762453026305269, + "grad_norm": 2.4223709106445312, + "learning_rate": 2.0656123103227187e-05, + "loss": 0.499, + "step": 2756 + }, + { + "epoch": 1.7630926681058607, + "grad_norm": 2.6509532928466797, + "learning_rate": 2.0645437059200683e-05, + "loss": 0.6081, + "step": 2757 + }, + { + "epoch": 1.7637323099064524, + "grad_norm": 2.5494544506073, + "learning_rate": 2.0634751015174182e-05, + "loss": 0.5476, + "step": 2758 + }, + { + "epoch": 1.764371951707044, + "grad_norm": 2.4070253372192383, + "learning_rate": 2.062406497114768e-05, + "loss": 0.4977, + "step": 2759 + }, + { + "epoch": 1.7650115935076358, + "grad_norm": 2.3602166175842285, + "learning_rate": 2.061337892712118e-05, + "loss": 0.5285, + "step": 2760 + }, + { + "epoch": 1.7656512353082274, + "grad_norm": 2.388374090194702, + "learning_rate": 2.060269288309468e-05, + "loss": 0.5323, + "step": 2761 + }, + { + "epoch": 1.766290877108819, + "grad_norm": 2.2147531509399414, + "learning_rate": 2.059200683906818e-05, + "loss": 0.4189, + "step": 2762 + }, + { + "epoch": 1.7669305189094109, + "grad_norm": 2.5422163009643555, + "learning_rate": 2.0581320795041676e-05, + "loss": 0.5374, + "step": 2763 + }, + { + "epoch": 1.7675701607100023, + "grad_norm": 2.307199716567993, + "learning_rate": 2.0570634751015175e-05, + "loss": 0.5011, + "step": 2764 + }, + { + "epoch": 1.7682098025105941, + "grad_norm": 2.382908821105957, + "learning_rate": 2.0559948706988674e-05, + "loss": 0.5394, + "step": 2765 + }, + { + "epoch": 1.7688494443111857, + "grad_norm": 2.2391819953918457, + "learning_rate": 2.0549262662962174e-05, + "loss": 0.4584, + "step": 2766 + }, + { + "epoch": 1.7694890861117774, + "grad_norm": 2.414435386657715, + "learning_rate": 2.053857661893567e-05, + "loss": 0.5168, + "step": 2767 + }, + { + "epoch": 1.7701287279123692, + "grad_norm": 2.6296498775482178, + "learning_rate": 2.052789057490917e-05, + "loss": 0.5576, + "step": 2768 + }, + { + "epoch": 1.7707683697129606, + "grad_norm": 2.425842523574829, + "learning_rate": 2.051720453088267e-05, + "loss": 0.5158, + "step": 2769 + }, + { + "epoch": 1.7714080115135524, + "grad_norm": 2.3860042095184326, + "learning_rate": 2.0506518486856164e-05, + "loss": 0.5222, + "step": 2770 + }, + { + "epoch": 1.772047653314144, + "grad_norm": 2.504329204559326, + "learning_rate": 2.0495832442829667e-05, + "loss": 0.5406, + "step": 2771 + }, + { + "epoch": 1.7726872951147357, + "grad_norm": 2.3346235752105713, + "learning_rate": 2.0485146398803167e-05, + "loss": 0.5279, + "step": 2772 + }, + { + "epoch": 1.7733269369153275, + "grad_norm": 2.253188371658325, + "learning_rate": 2.0474460354776662e-05, + "loss": 0.4823, + "step": 2773 + }, + { + "epoch": 1.7739665787159191, + "grad_norm": 2.167649745941162, + "learning_rate": 2.0463774310750162e-05, + "loss": 0.4633, + "step": 2774 + }, + { + "epoch": 1.7746062205165107, + "grad_norm": 2.0699386596679688, + "learning_rate": 2.045308826672366e-05, + "loss": 0.4717, + "step": 2775 + }, + { + "epoch": 1.7752458623171026, + "grad_norm": 2.171119213104248, + "learning_rate": 2.0442402222697157e-05, + "loss": 0.4561, + "step": 2776 + }, + { + "epoch": 1.775885504117694, + 
"grad_norm": 2.9215235710144043, + "learning_rate": 2.0431716178670657e-05, + "loss": 0.5582, + "step": 2777 + }, + { + "epoch": 1.7765251459182858, + "grad_norm": 2.1220853328704834, + "learning_rate": 2.0421030134644156e-05, + "loss": 0.4898, + "step": 2778 + }, + { + "epoch": 1.7771647877188774, + "grad_norm": 2.2881810665130615, + "learning_rate": 2.0410344090617652e-05, + "loss": 0.5048, + "step": 2779 + }, + { + "epoch": 1.777804429519469, + "grad_norm": 2.4088737964630127, + "learning_rate": 2.039965804659115e-05, + "loss": 0.5668, + "step": 2780 + }, + { + "epoch": 1.7784440713200609, + "grad_norm": 2.85402250289917, + "learning_rate": 2.038897200256465e-05, + "loss": 0.6494, + "step": 2781 + }, + { + "epoch": 1.7790837131206523, + "grad_norm": 2.4570300579071045, + "learning_rate": 2.037828595853815e-05, + "loss": 0.5642, + "step": 2782 + }, + { + "epoch": 1.7797233549212441, + "grad_norm": 2.45947527885437, + "learning_rate": 2.036759991451165e-05, + "loss": 0.5535, + "step": 2783 + }, + { + "epoch": 1.7803629967218357, + "grad_norm": 2.2584729194641113, + "learning_rate": 2.035691387048515e-05, + "loss": 0.5012, + "step": 2784 + }, + { + "epoch": 1.7810026385224274, + "grad_norm": 2.148829698562622, + "learning_rate": 2.0346227826458648e-05, + "loss": 0.4715, + "step": 2785 + }, + { + "epoch": 1.7816422803230192, + "grad_norm": 2.112736701965332, + "learning_rate": 2.0335541782432144e-05, + "loss": 0.4649, + "step": 2786 + }, + { + "epoch": 1.7822819221236108, + "grad_norm": 2.688622236251831, + "learning_rate": 2.0324855738405643e-05, + "loss": 0.5545, + "step": 2787 + }, + { + "epoch": 1.7829215639242024, + "grad_norm": 2.1854984760284424, + "learning_rate": 2.0314169694379143e-05, + "loss": 0.4671, + "step": 2788 + }, + { + "epoch": 1.7835612057247943, + "grad_norm": 2.436605453491211, + "learning_rate": 2.030348365035264e-05, + "loss": 0.5054, + "step": 2789 + }, + { + "epoch": 1.7842008475253857, + "grad_norm": 2.344019889831543, + "learning_rate": 2.0292797606326138e-05, + "loss": 0.5215, + "step": 2790 + }, + { + "epoch": 1.7848404893259775, + "grad_norm": 2.161363124847412, + "learning_rate": 2.0282111562299637e-05, + "loss": 0.4662, + "step": 2791 + }, + { + "epoch": 1.7854801311265691, + "grad_norm": 2.3229241371154785, + "learning_rate": 2.0271425518273133e-05, + "loss": 0.5488, + "step": 2792 + }, + { + "epoch": 1.7861197729271607, + "grad_norm": 2.2552490234375, + "learning_rate": 2.0260739474246633e-05, + "loss": 0.5126, + "step": 2793 + }, + { + "epoch": 1.7867594147277526, + "grad_norm": 2.301288604736328, + "learning_rate": 2.0250053430220136e-05, + "loss": 0.5465, + "step": 2794 + }, + { + "epoch": 1.787399056528344, + "grad_norm": 2.272421360015869, + "learning_rate": 2.0239367386193635e-05, + "loss": 0.4651, + "step": 2795 + }, + { + "epoch": 1.7880386983289358, + "grad_norm": 2.031331777572632, + "learning_rate": 2.022868134216713e-05, + "loss": 0.4683, + "step": 2796 + }, + { + "epoch": 1.7886783401295274, + "grad_norm": 2.4139904975891113, + "learning_rate": 2.021799529814063e-05, + "loss": 0.4937, + "step": 2797 + }, + { + "epoch": 1.789317981930119, + "grad_norm": 2.238555669784546, + "learning_rate": 2.020730925411413e-05, + "loss": 0.4962, + "step": 2798 + }, + { + "epoch": 1.7899576237307109, + "grad_norm": 2.5187525749206543, + "learning_rate": 2.0196623210087626e-05, + "loss": 0.593, + "step": 2799 + }, + { + "epoch": 1.7905972655313025, + "grad_norm": 2.1062705516815186, + "learning_rate": 2.0185937166061125e-05, + "loss": 0.4517, + "step": 2800 + 
}, + { + "epoch": 1.7912369073318941, + "grad_norm": 2.270381450653076, + "learning_rate": 2.0175251122034624e-05, + "loss": 0.483, + "step": 2801 + }, + { + "epoch": 1.791876549132486, + "grad_norm": 2.4763448238372803, + "learning_rate": 2.016456507800812e-05, + "loss": 0.5357, + "step": 2802 + }, + { + "epoch": 1.7925161909330773, + "grad_norm": 2.541749954223633, + "learning_rate": 2.015387903398162e-05, + "loss": 0.5305, + "step": 2803 + }, + { + "epoch": 1.7931558327336692, + "grad_norm": 2.6028640270233154, + "learning_rate": 2.014319298995512e-05, + "loss": 0.561, + "step": 2804 + }, + { + "epoch": 1.7937954745342608, + "grad_norm": 2.383382558822632, + "learning_rate": 2.013250694592862e-05, + "loss": 0.5404, + "step": 2805 + }, + { + "epoch": 1.7944351163348524, + "grad_norm": 2.184458017349243, + "learning_rate": 2.0121820901902118e-05, + "loss": 0.4944, + "step": 2806 + }, + { + "epoch": 1.7950747581354443, + "grad_norm": 2.631732225418091, + "learning_rate": 2.0111134857875617e-05, + "loss": 0.5581, + "step": 2807 + }, + { + "epoch": 1.7957143999360357, + "grad_norm": 2.4651522636413574, + "learning_rate": 2.0100448813849116e-05, + "loss": 0.5583, + "step": 2808 + }, + { + "epoch": 1.7963540417366275, + "grad_norm": 2.227365255355835, + "learning_rate": 2.0089762769822612e-05, + "loss": 0.4797, + "step": 2809 + }, + { + "epoch": 1.796993683537219, + "grad_norm": 2.237375020980835, + "learning_rate": 2.0079076725796112e-05, + "loss": 0.4999, + "step": 2810 + }, + { + "epoch": 1.7976333253378107, + "grad_norm": 2.3141372203826904, + "learning_rate": 2.006839068176961e-05, + "loss": 0.5572, + "step": 2811 + }, + { + "epoch": 1.7982729671384026, + "grad_norm": 2.1367897987365723, + "learning_rate": 2.0057704637743107e-05, + "loss": 0.4739, + "step": 2812 + }, + { + "epoch": 1.7989126089389942, + "grad_norm": 2.7138278484344482, + "learning_rate": 2.0047018593716606e-05, + "loss": 0.5358, + "step": 2813 + }, + { + "epoch": 1.7995522507395858, + "grad_norm": 2.4364922046661377, + "learning_rate": 2.0036332549690106e-05, + "loss": 0.5178, + "step": 2814 + }, + { + "epoch": 1.8001918925401776, + "grad_norm": 2.3911404609680176, + "learning_rate": 2.0025646505663602e-05, + "loss": 0.5185, + "step": 2815 + }, + { + "epoch": 1.800831534340769, + "grad_norm": 2.4266908168792725, + "learning_rate": 2.00149604616371e-05, + "loss": 0.5385, + "step": 2816 + }, + { + "epoch": 1.8014711761413609, + "grad_norm": 2.301403760910034, + "learning_rate": 2.0004274417610604e-05, + "loss": 0.5091, + "step": 2817 + }, + { + "epoch": 1.8021108179419525, + "grad_norm": 2.1748011112213135, + "learning_rate": 1.99935883735841e-05, + "loss": 0.5035, + "step": 2818 + }, + { + "epoch": 1.802750459742544, + "grad_norm": 2.486557960510254, + "learning_rate": 1.99829023295576e-05, + "loss": 0.5657, + "step": 2819 + }, + { + "epoch": 1.803390101543136, + "grad_norm": 2.9996259212493896, + "learning_rate": 1.99722162855311e-05, + "loss": 0.6418, + "step": 2820 + }, + { + "epoch": 1.8040297433437276, + "grad_norm": 2.488780975341797, + "learning_rate": 1.9961530241504595e-05, + "loss": 0.5772, + "step": 2821 + }, + { + "epoch": 1.8046693851443192, + "grad_norm": 2.3787550926208496, + "learning_rate": 1.9950844197478094e-05, + "loss": 0.5399, + "step": 2822 + }, + { + "epoch": 1.8053090269449108, + "grad_norm": 2.30735445022583, + "learning_rate": 1.9940158153451593e-05, + "loss": 0.5211, + "step": 2823 + }, + { + "epoch": 1.8059486687455024, + "grad_norm": 2.4023263454437256, + "learning_rate": 
1.9929472109425093e-05, + "loss": 0.5413, + "step": 2824 + }, + { + "epoch": 1.8065883105460943, + "grad_norm": 2.2471938133239746, + "learning_rate": 1.991878606539859e-05, + "loss": 0.4598, + "step": 2825 + }, + { + "epoch": 1.8072279523466859, + "grad_norm": 2.5233891010284424, + "learning_rate": 1.9908100021372088e-05, + "loss": 0.5087, + "step": 2826 + }, + { + "epoch": 1.8078675941472775, + "grad_norm": 2.597228765487671, + "learning_rate": 1.9897413977345587e-05, + "loss": 0.5766, + "step": 2827 + }, + { + "epoch": 1.8085072359478693, + "grad_norm": 2.249992609024048, + "learning_rate": 1.9886727933319087e-05, + "loss": 0.5059, + "step": 2828 + }, + { + "epoch": 1.8091468777484607, + "grad_norm": 2.3526294231414795, + "learning_rate": 1.9876041889292586e-05, + "loss": 0.538, + "step": 2829 + }, + { + "epoch": 1.8097865195490526, + "grad_norm": 2.0748045444488525, + "learning_rate": 1.9865355845266085e-05, + "loss": 0.4645, + "step": 2830 + }, + { + "epoch": 1.8104261613496442, + "grad_norm": 2.502467393875122, + "learning_rate": 1.985466980123958e-05, + "loss": 0.5954, + "step": 2831 + }, + { + "epoch": 1.8110658031502358, + "grad_norm": 2.1872975826263428, + "learning_rate": 1.984398375721308e-05, + "loss": 0.5333, + "step": 2832 + }, + { + "epoch": 1.8117054449508276, + "grad_norm": 2.1395134925842285, + "learning_rate": 1.983329771318658e-05, + "loss": 0.4916, + "step": 2833 + }, + { + "epoch": 1.8123450867514193, + "grad_norm": 1.8827162981033325, + "learning_rate": 1.9822611669160076e-05, + "loss": 0.4423, + "step": 2834 + }, + { + "epoch": 1.8129847285520109, + "grad_norm": 2.33115816116333, + "learning_rate": 1.9811925625133575e-05, + "loss": 0.5272, + "step": 2835 + }, + { + "epoch": 1.8136243703526025, + "grad_norm": 2.1068661212921143, + "learning_rate": 1.9801239581107075e-05, + "loss": 0.4565, + "step": 2836 + }, + { + "epoch": 1.814264012153194, + "grad_norm": 2.497335910797119, + "learning_rate": 1.9790553537080574e-05, + "loss": 0.5472, + "step": 2837 + }, + { + "epoch": 1.814903653953786, + "grad_norm": 2.2983293533325195, + "learning_rate": 1.977986749305407e-05, + "loss": 0.5695, + "step": 2838 + }, + { + "epoch": 1.8155432957543776, + "grad_norm": 2.3338510990142822, + "learning_rate": 1.976918144902757e-05, + "loss": 0.5426, + "step": 2839 + }, + { + "epoch": 1.8161829375549692, + "grad_norm": 2.2502028942108154, + "learning_rate": 1.9758495405001072e-05, + "loss": 0.4919, + "step": 2840 + }, + { + "epoch": 1.816822579355561, + "grad_norm": 2.1771280765533447, + "learning_rate": 1.9747809360974568e-05, + "loss": 0.4862, + "step": 2841 + }, + { + "epoch": 1.8174622211561524, + "grad_norm": 1.9078037738800049, + "learning_rate": 1.9737123316948068e-05, + "loss": 0.4006, + "step": 2842 + }, + { + "epoch": 1.8181018629567443, + "grad_norm": 2.28057861328125, + "learning_rate": 1.9726437272921567e-05, + "loss": 0.5188, + "step": 2843 + }, + { + "epoch": 1.8187415047573359, + "grad_norm": 2.471477508544922, + "learning_rate": 1.9715751228895063e-05, + "loss": 0.531, + "step": 2844 + }, + { + "epoch": 1.8193811465579275, + "grad_norm": 2.534019947052002, + "learning_rate": 1.9705065184868562e-05, + "loss": 0.5377, + "step": 2845 + }, + { + "epoch": 1.8200207883585193, + "grad_norm": 2.325458288192749, + "learning_rate": 1.9694379140842062e-05, + "loss": 0.5048, + "step": 2846 + }, + { + "epoch": 1.820660430159111, + "grad_norm": 2.1005265712738037, + "learning_rate": 1.9683693096815558e-05, + "loss": 0.5358, + "step": 2847 + }, + { + "epoch": 1.8213000719597026, + 
"grad_norm": 2.2133257389068604, + "learning_rate": 1.9673007052789057e-05, + "loss": 0.4617, + "step": 2848 + }, + { + "epoch": 1.8219397137602944, + "grad_norm": 2.370957851409912, + "learning_rate": 1.9662321008762556e-05, + "loss": 0.5456, + "step": 2849 + }, + { + "epoch": 1.8225793555608858, + "grad_norm": 2.4923248291015625, + "learning_rate": 1.9651634964736056e-05, + "loss": 0.5363, + "step": 2850 + }, + { + "epoch": 1.8232189973614776, + "grad_norm": 2.3630828857421875, + "learning_rate": 1.9640948920709555e-05, + "loss": 0.5031, + "step": 2851 + }, + { + "epoch": 1.8238586391620693, + "grad_norm": 2.4305341243743896, + "learning_rate": 1.9630262876683054e-05, + "loss": 0.5095, + "step": 2852 + }, + { + "epoch": 1.8244982809626609, + "grad_norm": 2.325582981109619, + "learning_rate": 1.9619576832656554e-05, + "loss": 0.5035, + "step": 2853 + }, + { + "epoch": 1.8251379227632527, + "grad_norm": 2.299703359603882, + "learning_rate": 1.960889078863005e-05, + "loss": 0.4883, + "step": 2854 + }, + { + "epoch": 1.825777564563844, + "grad_norm": 2.228072166442871, + "learning_rate": 1.959820474460355e-05, + "loss": 0.4957, + "step": 2855 + }, + { + "epoch": 1.826417206364436, + "grad_norm": 2.7845544815063477, + "learning_rate": 1.958751870057705e-05, + "loss": 0.5759, + "step": 2856 + }, + { + "epoch": 1.8270568481650276, + "grad_norm": 2.498368501663208, + "learning_rate": 1.9576832656550545e-05, + "loss": 0.5698, + "step": 2857 + }, + { + "epoch": 1.8276964899656192, + "grad_norm": 2.3309435844421387, + "learning_rate": 1.9566146612524044e-05, + "loss": 0.5041, + "step": 2858 + }, + { + "epoch": 1.828336131766211, + "grad_norm": 2.438725471496582, + "learning_rate": 1.9555460568497543e-05, + "loss": 0.4812, + "step": 2859 + }, + { + "epoch": 1.8289757735668026, + "grad_norm": 1.9939138889312744, + "learning_rate": 1.954477452447104e-05, + "loss": 0.4068, + "step": 2860 + }, + { + "epoch": 1.8296154153673942, + "grad_norm": 2.1259148120880127, + "learning_rate": 1.953408848044454e-05, + "loss": 0.4568, + "step": 2861 + }, + { + "epoch": 1.830255057167986, + "grad_norm": 2.136159896850586, + "learning_rate": 1.9523402436418038e-05, + "loss": 0.4804, + "step": 2862 + }, + { + "epoch": 1.8308946989685775, + "grad_norm": 2.6476261615753174, + "learning_rate": 1.9512716392391537e-05, + "loss": 0.547, + "step": 2863 + }, + { + "epoch": 1.8315343407691693, + "grad_norm": 2.4074833393096924, + "learning_rate": 1.9502030348365037e-05, + "loss": 0.4942, + "step": 2864 + }, + { + "epoch": 1.832173982569761, + "grad_norm": 2.452183485031128, + "learning_rate": 1.9491344304338536e-05, + "loss": 0.5082, + "step": 2865 + }, + { + "epoch": 1.8328136243703526, + "grad_norm": 2.49965500831604, + "learning_rate": 1.9480658260312035e-05, + "loss": 0.5594, + "step": 2866 + }, + { + "epoch": 1.8334532661709444, + "grad_norm": 2.890207529067993, + "learning_rate": 1.946997221628553e-05, + "loss": 0.6236, + "step": 2867 + }, + { + "epoch": 1.8340929079715358, + "grad_norm": 2.6452999114990234, + "learning_rate": 1.945928617225903e-05, + "loss": 0.5172, + "step": 2868 + }, + { + "epoch": 1.8347325497721276, + "grad_norm": 2.1806747913360596, + "learning_rate": 1.944860012823253e-05, + "loss": 0.5026, + "step": 2869 + }, + { + "epoch": 1.8353721915727192, + "grad_norm": 2.406280994415283, + "learning_rate": 1.9437914084206026e-05, + "loss": 0.5113, + "step": 2870 + }, + { + "epoch": 1.8360118333733109, + "grad_norm": 2.241525650024414, + "learning_rate": 1.9427228040179525e-05, + "loss": 0.4731, + "step": 2871 
+ }, + { + "epoch": 1.8366514751739027, + "grad_norm": 2.9569907188415527, + "learning_rate": 1.9416541996153025e-05, + "loss": 0.6266, + "step": 2872 + }, + { + "epoch": 1.8372911169744943, + "grad_norm": 2.120638370513916, + "learning_rate": 1.9405855952126524e-05, + "loss": 0.4776, + "step": 2873 + }, + { + "epoch": 1.837930758775086, + "grad_norm": 2.5313446521759033, + "learning_rate": 1.9395169908100024e-05, + "loss": 0.5621, + "step": 2874 + }, + { + "epoch": 1.8385704005756778, + "grad_norm": 2.3138656616210938, + "learning_rate": 1.9384483864073523e-05, + "loss": 0.4946, + "step": 2875 + }, + { + "epoch": 1.8392100423762692, + "grad_norm": 2.272691011428833, + "learning_rate": 1.937379782004702e-05, + "loss": 0.504, + "step": 2876 + }, + { + "epoch": 1.839849684176861, + "grad_norm": 2.0644941329956055, + "learning_rate": 1.9363111776020518e-05, + "loss": 0.4001, + "step": 2877 + }, + { + "epoch": 1.8404893259774526, + "grad_norm": 2.3746023178100586, + "learning_rate": 1.9352425731994018e-05, + "loss": 0.482, + "step": 2878 + }, + { + "epoch": 1.8411289677780442, + "grad_norm": 2.5728492736816406, + "learning_rate": 1.9341739687967517e-05, + "loss": 0.5322, + "step": 2879 + }, + { + "epoch": 1.841768609578636, + "grad_norm": 2.387606620788574, + "learning_rate": 1.9331053643941013e-05, + "loss": 0.5333, + "step": 2880 + }, + { + "epoch": 1.8424082513792275, + "grad_norm": 2.3684723377227783, + "learning_rate": 1.9320367599914512e-05, + "loss": 0.5318, + "step": 2881 + }, + { + "epoch": 1.8430478931798193, + "grad_norm": 2.52715802192688, + "learning_rate": 1.930968155588801e-05, + "loss": 0.6003, + "step": 2882 + }, + { + "epoch": 1.843687534980411, + "grad_norm": 2.1936612129211426, + "learning_rate": 1.9298995511861508e-05, + "loss": 0.502, + "step": 2883 + }, + { + "epoch": 1.8443271767810026, + "grad_norm": 2.4667599201202393, + "learning_rate": 1.9288309467835007e-05, + "loss": 0.5527, + "step": 2884 + }, + { + "epoch": 1.8449668185815944, + "grad_norm": 2.4780595302581787, + "learning_rate": 1.9277623423808506e-05, + "loss": 0.5833, + "step": 2885 + }, + { + "epoch": 1.845606460382186, + "grad_norm": 2.8425557613372803, + "learning_rate": 1.9266937379782006e-05, + "loss": 0.6324, + "step": 2886 + }, + { + "epoch": 1.8462461021827776, + "grad_norm": 2.422700881958008, + "learning_rate": 1.9256251335755505e-05, + "loss": 0.517, + "step": 2887 + }, + { + "epoch": 1.8468857439833695, + "grad_norm": 2.296245813369751, + "learning_rate": 1.9245565291729004e-05, + "loss": 0.5092, + "step": 2888 + }, + { + "epoch": 1.8475253857839609, + "grad_norm": 2.169027328491211, + "learning_rate": 1.92348792477025e-05, + "loss": 0.5144, + "step": 2889 + }, + { + "epoch": 1.8481650275845527, + "grad_norm": 2.3023126125335693, + "learning_rate": 1.9224193203676e-05, + "loss": 0.5187, + "step": 2890 + }, + { + "epoch": 1.8488046693851443, + "grad_norm": 2.524099349975586, + "learning_rate": 1.92135071596495e-05, + "loss": 0.5299, + "step": 2891 + }, + { + "epoch": 1.849444311185736, + "grad_norm": 2.147122383117676, + "learning_rate": 1.9202821115622995e-05, + "loss": 0.4887, + "step": 2892 + }, + { + "epoch": 1.8500839529863278, + "grad_norm": 2.1998541355133057, + "learning_rate": 1.9192135071596494e-05, + "loss": 0.499, + "step": 2893 + }, + { + "epoch": 1.8507235947869192, + "grad_norm": 2.1991591453552246, + "learning_rate": 1.9181449027569994e-05, + "loss": 0.5174, + "step": 2894 + }, + { + "epoch": 1.851363236587511, + "grad_norm": 2.147448778152466, + "learning_rate": 
1.9170762983543493e-05, + "loss": 0.4673, + "step": 2895 + }, + { + "epoch": 1.8520028783881026, + "grad_norm": 2.2695560455322266, + "learning_rate": 1.9160076939516993e-05, + "loss": 0.5346, + "step": 2896 + }, + { + "epoch": 1.8526425201886942, + "grad_norm": 2.296457290649414, + "learning_rate": 1.9149390895490492e-05, + "loss": 0.5016, + "step": 2897 + }, + { + "epoch": 1.853282161989286, + "grad_norm": 2.324726104736328, + "learning_rate": 1.913870485146399e-05, + "loss": 0.521, + "step": 2898 + }, + { + "epoch": 1.8539218037898777, + "grad_norm": 2.3072283267974854, + "learning_rate": 1.9128018807437487e-05, + "loss": 0.5735, + "step": 2899 + }, + { + "epoch": 1.8545614455904693, + "grad_norm": 2.0744760036468506, + "learning_rate": 1.9117332763410987e-05, + "loss": 0.4706, + "step": 2900 + }, + { + "epoch": 1.8552010873910612, + "grad_norm": 2.046598434448242, + "learning_rate": 1.9106646719384486e-05, + "loss": 0.4366, + "step": 2901 + }, + { + "epoch": 1.8558407291916525, + "grad_norm": 2.2961080074310303, + "learning_rate": 1.9095960675357982e-05, + "loss": 0.4851, + "step": 2902 + }, + { + "epoch": 1.8564803709922444, + "grad_norm": 3.0587995052337646, + "learning_rate": 1.908527463133148e-05, + "loss": 0.6864, + "step": 2903 + }, + { + "epoch": 1.857120012792836, + "grad_norm": 2.288538694381714, + "learning_rate": 1.907458858730498e-05, + "loss": 0.4571, + "step": 2904 + }, + { + "epoch": 1.8577596545934276, + "grad_norm": 2.589794874191284, + "learning_rate": 1.9063902543278477e-05, + "loss": 0.5282, + "step": 2905 + }, + { + "epoch": 1.8583992963940195, + "grad_norm": 2.2429935932159424, + "learning_rate": 1.9053216499251976e-05, + "loss": 0.4864, + "step": 2906 + }, + { + "epoch": 1.8590389381946109, + "grad_norm": 2.186279058456421, + "learning_rate": 1.9042530455225475e-05, + "loss": 0.4685, + "step": 2907 + }, + { + "epoch": 1.8596785799952027, + "grad_norm": 2.2375638484954834, + "learning_rate": 1.9031844411198975e-05, + "loss": 0.527, + "step": 2908 + }, + { + "epoch": 1.8603182217957943, + "grad_norm": 2.140592098236084, + "learning_rate": 1.9021158367172474e-05, + "loss": 0.4668, + "step": 2909 + }, + { + "epoch": 1.860957863596386, + "grad_norm": 2.629301071166992, + "learning_rate": 1.9010472323145973e-05, + "loss": 0.5853, + "step": 2910 + }, + { + "epoch": 1.8615975053969778, + "grad_norm": 2.436850070953369, + "learning_rate": 1.8999786279119473e-05, + "loss": 0.4926, + "step": 2911 + }, + { + "epoch": 1.8622371471975694, + "grad_norm": 2.32812237739563, + "learning_rate": 1.898910023509297e-05, + "loss": 0.4996, + "step": 2912 + }, + { + "epoch": 1.862876788998161, + "grad_norm": 2.576978921890259, + "learning_rate": 1.8978414191066468e-05, + "loss": 0.5053, + "step": 2913 + }, + { + "epoch": 1.8635164307987528, + "grad_norm": 2.375523567199707, + "learning_rate": 1.8967728147039967e-05, + "loss": 0.5484, + "step": 2914 + }, + { + "epoch": 1.8641560725993442, + "grad_norm": 2.790060520172119, + "learning_rate": 1.8957042103013463e-05, + "loss": 0.6171, + "step": 2915 + }, + { + "epoch": 1.864795714399936, + "grad_norm": 2.5728256702423096, + "learning_rate": 1.8946356058986963e-05, + "loss": 0.5514, + "step": 2916 + }, + { + "epoch": 1.8654353562005277, + "grad_norm": 2.630537986755371, + "learning_rate": 1.8935670014960462e-05, + "loss": 0.5538, + "step": 2917 + }, + { + "epoch": 1.8660749980011193, + "grad_norm": 2.4287662506103516, + "learning_rate": 1.892498397093396e-05, + "loss": 0.4809, + "step": 2918 + }, + { + "epoch": 1.8667146398017112, + 
"grad_norm": 2.2396273612976074, + "learning_rate": 1.891429792690746e-05, + "loss": 0.4775, + "step": 2919 + }, + { + "epoch": 1.8673542816023025, + "grad_norm": 2.4905190467834473, + "learning_rate": 1.890361188288096e-05, + "loss": 0.5294, + "step": 2920 + }, + { + "epoch": 1.8679939234028944, + "grad_norm": 2.278452157974243, + "learning_rate": 1.889292583885446e-05, + "loss": 0.5034, + "step": 2921 + }, + { + "epoch": 1.868633565203486, + "grad_norm": 2.7109076976776123, + "learning_rate": 1.8882239794827956e-05, + "loss": 0.5633, + "step": 2922 + }, + { + "epoch": 1.8692732070040776, + "grad_norm": 2.3873202800750732, + "learning_rate": 1.8871553750801455e-05, + "loss": 0.5639, + "step": 2923 + }, + { + "epoch": 1.8699128488046695, + "grad_norm": 2.337348222732544, + "learning_rate": 1.8860867706774954e-05, + "loss": 0.5228, + "step": 2924 + }, + { + "epoch": 1.870552490605261, + "grad_norm": 2.482011556625366, + "learning_rate": 1.885018166274845e-05, + "loss": 0.5571, + "step": 2925 + }, + { + "epoch": 1.8711921324058527, + "grad_norm": 2.2913031578063965, + "learning_rate": 1.883949561872195e-05, + "loss": 0.5206, + "step": 2926 + }, + { + "epoch": 1.8718317742064445, + "grad_norm": 2.7040319442749023, + "learning_rate": 1.882880957469545e-05, + "loss": 0.6375, + "step": 2927 + }, + { + "epoch": 1.872471416007036, + "grad_norm": 2.2623605728149414, + "learning_rate": 1.8818123530668945e-05, + "loss": 0.4861, + "step": 2928 + }, + { + "epoch": 1.8731110578076278, + "grad_norm": 2.1366677284240723, + "learning_rate": 1.8807437486642444e-05, + "loss": 0.4707, + "step": 2929 + }, + { + "epoch": 1.8737506996082194, + "grad_norm": 2.506088972091675, + "learning_rate": 1.8796751442615944e-05, + "loss": 0.5611, + "step": 2930 + }, + { + "epoch": 1.874390341408811, + "grad_norm": 2.580218553543091, + "learning_rate": 1.8786065398589443e-05, + "loss": 0.4617, + "step": 2931 + }, + { + "epoch": 1.8750299832094028, + "grad_norm": 2.405839443206787, + "learning_rate": 1.8775379354562942e-05, + "loss": 0.5625, + "step": 2932 + }, + { + "epoch": 1.8756696250099945, + "grad_norm": 2.0239696502685547, + "learning_rate": 1.8764693310536442e-05, + "loss": 0.4668, + "step": 2933 + }, + { + "epoch": 1.876309266810586, + "grad_norm": 2.750504732131958, + "learning_rate": 1.8754007266509938e-05, + "loss": 0.5686, + "step": 2934 + }, + { + "epoch": 1.8769489086111777, + "grad_norm": 2.5134050846099854, + "learning_rate": 1.8743321222483437e-05, + "loss": 0.489, + "step": 2935 + }, + { + "epoch": 1.8775885504117693, + "grad_norm": 2.1368775367736816, + "learning_rate": 1.8732635178456937e-05, + "loss": 0.478, + "step": 2936 + }, + { + "epoch": 1.8782281922123611, + "grad_norm": 2.401207208633423, + "learning_rate": 1.8721949134430436e-05, + "loss": 0.514, + "step": 2937 + }, + { + "epoch": 1.8788678340129528, + "grad_norm": 2.598897695541382, + "learning_rate": 1.8711263090403932e-05, + "loss": 0.6011, + "step": 2938 + }, + { + "epoch": 1.8795074758135444, + "grad_norm": 2.474565267562866, + "learning_rate": 1.870057704637743e-05, + "loss": 0.5442, + "step": 2939 + }, + { + "epoch": 1.8801471176141362, + "grad_norm": 2.3614892959594727, + "learning_rate": 1.868989100235093e-05, + "loss": 0.5204, + "step": 2940 + }, + { + "epoch": 1.8807867594147276, + "grad_norm": 2.716115951538086, + "learning_rate": 1.867920495832443e-05, + "loss": 0.5508, + "step": 2941 + }, + { + "epoch": 1.8814264012153195, + "grad_norm": 2.105520486831665, + "learning_rate": 1.866851891429793e-05, + "loss": 0.4866, + "step": 2942 + 
}, + { + "epoch": 1.882066043015911, + "grad_norm": 2.2073347568511963, + "learning_rate": 1.865783287027143e-05, + "loss": 0.5264, + "step": 2943 + }, + { + "epoch": 1.8827056848165027, + "grad_norm": 2.342784881591797, + "learning_rate": 1.8647146826244925e-05, + "loss": 0.5095, + "step": 2944 + }, + { + "epoch": 1.8833453266170945, + "grad_norm": 2.5862765312194824, + "learning_rate": 1.8636460782218424e-05, + "loss": 0.5496, + "step": 2945 + }, + { + "epoch": 1.8839849684176861, + "grad_norm": 2.324101686477661, + "learning_rate": 1.8625774738191923e-05, + "loss": 0.5267, + "step": 2946 + }, + { + "epoch": 1.8846246102182778, + "grad_norm": 2.2458910942077637, + "learning_rate": 1.861508869416542e-05, + "loss": 0.498, + "step": 2947 + }, + { + "epoch": 1.8852642520188694, + "grad_norm": 1.9799495935440063, + "learning_rate": 1.860440265013892e-05, + "loss": 0.4484, + "step": 2948 + }, + { + "epoch": 1.885903893819461, + "grad_norm": 2.2013511657714844, + "learning_rate": 1.8593716606112418e-05, + "loss": 0.4924, + "step": 2949 + }, + { + "epoch": 1.8865435356200528, + "grad_norm": 2.6202101707458496, + "learning_rate": 1.8583030562085917e-05, + "loss": 0.6297, + "step": 2950 + }, + { + "epoch": 1.8871831774206445, + "grad_norm": 2.379664659500122, + "learning_rate": 1.8572344518059413e-05, + "loss": 0.5283, + "step": 2951 + }, + { + "epoch": 1.887822819221236, + "grad_norm": 2.2940313816070557, + "learning_rate": 1.8561658474032913e-05, + "loss": 0.4914, + "step": 2952 + }, + { + "epoch": 1.888462461021828, + "grad_norm": 2.6438848972320557, + "learning_rate": 1.8550972430006412e-05, + "loss": 0.5502, + "step": 2953 + }, + { + "epoch": 1.8891021028224193, + "grad_norm": 2.051809549331665, + "learning_rate": 1.854028638597991e-05, + "loss": 0.4924, + "step": 2954 + }, + { + "epoch": 1.8897417446230111, + "grad_norm": 2.2035608291625977, + "learning_rate": 1.852960034195341e-05, + "loss": 0.5055, + "step": 2955 + }, + { + "epoch": 1.8903813864236028, + "grad_norm": 2.3531949520111084, + "learning_rate": 1.851891429792691e-05, + "loss": 0.5324, + "step": 2956 + }, + { + "epoch": 1.8910210282241944, + "grad_norm": 2.36861515045166, + "learning_rate": 1.8508228253900406e-05, + "loss": 0.548, + "step": 2957 + }, + { + "epoch": 1.8916606700247862, + "grad_norm": 2.393678903579712, + "learning_rate": 1.8497542209873906e-05, + "loss": 0.4805, + "step": 2958 + }, + { + "epoch": 1.8923003118253778, + "grad_norm": 2.4288806915283203, + "learning_rate": 1.8486856165847405e-05, + "loss": 0.5568, + "step": 2959 + }, + { + "epoch": 1.8929399536259695, + "grad_norm": 2.0793449878692627, + "learning_rate": 1.84761701218209e-05, + "loss": 0.4437, + "step": 2960 + }, + { + "epoch": 1.8935795954265613, + "grad_norm": 2.4150569438934326, + "learning_rate": 1.84654840777944e-05, + "loss": 0.4948, + "step": 2961 + }, + { + "epoch": 1.8942192372271527, + "grad_norm": 2.2338144779205322, + "learning_rate": 1.84547980337679e-05, + "loss": 0.4872, + "step": 2962 + }, + { + "epoch": 1.8948588790277445, + "grad_norm": 2.272186756134033, + "learning_rate": 1.84441119897414e-05, + "loss": 0.5006, + "step": 2963 + }, + { + "epoch": 1.8954985208283361, + "grad_norm": 2.1584689617156982, + "learning_rate": 1.84334259457149e-05, + "loss": 0.4713, + "step": 2964 + }, + { + "epoch": 1.8961381626289278, + "grad_norm": 2.4406025409698486, + "learning_rate": 1.8422739901688398e-05, + "loss": 0.5635, + "step": 2965 + }, + { + "epoch": 1.8967778044295196, + "grad_norm": 2.464693546295166, + "learning_rate": 
1.8412053857661897e-05, + "loss": 0.5373, + "step": 2966 + }, + { + "epoch": 1.897417446230111, + "grad_norm": 2.3238608837127686, + "learning_rate": 1.8401367813635393e-05, + "loss": 0.4669, + "step": 2967 + }, + { + "epoch": 1.8980570880307028, + "grad_norm": 2.6213347911834717, + "learning_rate": 1.8390681769608892e-05, + "loss": 0.5633, + "step": 2968 + }, + { + "epoch": 1.8986967298312944, + "grad_norm": 2.555800437927246, + "learning_rate": 1.8379995725582392e-05, + "loss": 0.5642, + "step": 2969 + }, + { + "epoch": 1.899336371631886, + "grad_norm": 2.4231338500976562, + "learning_rate": 1.8369309681555888e-05, + "loss": 0.4436, + "step": 2970 + }, + { + "epoch": 1.899976013432478, + "grad_norm": 2.431324005126953, + "learning_rate": 1.8358623637529387e-05, + "loss": 0.519, + "step": 2971 + }, + { + "epoch": 1.9006156552330695, + "grad_norm": 2.5057456493377686, + "learning_rate": 1.8347937593502886e-05, + "loss": 0.5272, + "step": 2972 + }, + { + "epoch": 1.9012552970336611, + "grad_norm": 2.324465751647949, + "learning_rate": 1.8337251549476382e-05, + "loss": 0.5113, + "step": 2973 + }, + { + "epoch": 1.901894938834253, + "grad_norm": 2.5816757678985596, + "learning_rate": 1.8326565505449882e-05, + "loss": 0.576, + "step": 2974 + }, + { + "epoch": 1.9025345806348444, + "grad_norm": 2.0733630657196045, + "learning_rate": 1.831587946142338e-05, + "loss": 0.4283, + "step": 2975 + }, + { + "epoch": 1.9031742224354362, + "grad_norm": 2.216127872467041, + "learning_rate": 1.830519341739688e-05, + "loss": 0.4933, + "step": 2976 + }, + { + "epoch": 1.9038138642360278, + "grad_norm": 2.7155752182006836, + "learning_rate": 1.829450737337038e-05, + "loss": 0.5388, + "step": 2977 + }, + { + "epoch": 1.9044535060366194, + "grad_norm": 2.3714749813079834, + "learning_rate": 1.828382132934388e-05, + "loss": 0.5123, + "step": 2978 + }, + { + "epoch": 1.9050931478372113, + "grad_norm": 2.344045877456665, + "learning_rate": 1.827313528531738e-05, + "loss": 0.477, + "step": 2979 + }, + { + "epoch": 1.9057327896378027, + "grad_norm": 2.5849430561065674, + "learning_rate": 1.8262449241290875e-05, + "loss": 0.524, + "step": 2980 + }, + { + "epoch": 1.9063724314383945, + "grad_norm": 2.4501564502716064, + "learning_rate": 1.8251763197264374e-05, + "loss": 0.555, + "step": 2981 + }, + { + "epoch": 1.9070120732389861, + "grad_norm": 1.8876103162765503, + "learning_rate": 1.8241077153237873e-05, + "loss": 0.4177, + "step": 2982 + }, + { + "epoch": 1.9076517150395778, + "grad_norm": 2.443512201309204, + "learning_rate": 1.823039110921137e-05, + "loss": 0.5273, + "step": 2983 + }, + { + "epoch": 1.9082913568401696, + "grad_norm": 2.362765312194824, + "learning_rate": 1.821970506518487e-05, + "loss": 0.5382, + "step": 2984 + }, + { + "epoch": 1.9089309986407612, + "grad_norm": 2.348677158355713, + "learning_rate": 1.8209019021158368e-05, + "loss": 0.5361, + "step": 2985 + }, + { + "epoch": 1.9095706404413528, + "grad_norm": 2.2172768115997314, + "learning_rate": 1.8198332977131864e-05, + "loss": 0.4562, + "step": 2986 + }, + { + "epoch": 1.9102102822419447, + "grad_norm": 2.3510067462921143, + "learning_rate": 1.8187646933105367e-05, + "loss": 0.5394, + "step": 2987 + }, + { + "epoch": 1.910849924042536, + "grad_norm": 2.348907709121704, + "learning_rate": 1.8176960889078866e-05, + "loss": 0.5132, + "step": 2988 + }, + { + "epoch": 1.911489565843128, + "grad_norm": 2.321542739868164, + "learning_rate": 1.8166274845052362e-05, + "loss": 0.5038, + "step": 2989 + }, + { + "epoch": 1.9121292076437195, + 
"grad_norm": 2.314908027648926, + "learning_rate": 1.815558880102586e-05, + "loss": 0.4966, + "step": 2990 + }, + { + "epoch": 1.9127688494443111, + "grad_norm": 2.22035813331604, + "learning_rate": 1.814490275699936e-05, + "loss": 0.4783, + "step": 2991 + }, + { + "epoch": 1.913408491244903, + "grad_norm": 2.2341954708099365, + "learning_rate": 1.813421671297286e-05, + "loss": 0.5306, + "step": 2992 + }, + { + "epoch": 1.9140481330454944, + "grad_norm": 2.263852119445801, + "learning_rate": 1.8123530668946356e-05, + "loss": 0.5019, + "step": 2993 + }, + { + "epoch": 1.9146877748460862, + "grad_norm": 2.124631404876709, + "learning_rate": 1.8112844624919855e-05, + "loss": 0.4434, + "step": 2994 + }, + { + "epoch": 1.9153274166466778, + "grad_norm": 2.5393638610839844, + "learning_rate": 1.8102158580893355e-05, + "loss": 0.5231, + "step": 2995 + }, + { + "epoch": 1.9159670584472694, + "grad_norm": 2.1628592014312744, + "learning_rate": 1.809147253686685e-05, + "loss": 0.4982, + "step": 2996 + }, + { + "epoch": 1.9166067002478613, + "grad_norm": 2.546919584274292, + "learning_rate": 1.808078649284035e-05, + "loss": 0.59, + "step": 2997 + }, + { + "epoch": 1.917246342048453, + "grad_norm": 2.281437873840332, + "learning_rate": 1.807010044881385e-05, + "loss": 0.5009, + "step": 2998 + }, + { + "epoch": 1.9178859838490445, + "grad_norm": 2.231894016265869, + "learning_rate": 1.805941440478735e-05, + "loss": 0.5017, + "step": 2999 + }, + { + "epoch": 1.9185256256496364, + "grad_norm": 2.126573085784912, + "learning_rate": 1.8048728360760848e-05, + "loss": 0.4442, + "step": 3000 + }, + { + "epoch": 1.9191652674502278, + "grad_norm": 2.1402084827423096, + "learning_rate": 1.8038042316734348e-05, + "loss": 0.4721, + "step": 3001 + }, + { + "epoch": 1.9198049092508196, + "grad_norm": 2.5384037494659424, + "learning_rate": 1.8027356272707844e-05, + "loss": 0.5233, + "step": 3002 + }, + { + "epoch": 1.9204445510514112, + "grad_norm": 2.306366443634033, + "learning_rate": 1.8016670228681343e-05, + "loss": 0.503, + "step": 3003 + }, + { + "epoch": 1.9210841928520028, + "grad_norm": 2.258857250213623, + "learning_rate": 1.8005984184654842e-05, + "loss": 0.5245, + "step": 3004 + }, + { + "epoch": 1.9217238346525947, + "grad_norm": 2.570263385772705, + "learning_rate": 1.799529814062834e-05, + "loss": 0.5526, + "step": 3005 + }, + { + "epoch": 1.922363476453186, + "grad_norm": 2.4310100078582764, + "learning_rate": 1.7984612096601838e-05, + "loss": 0.4631, + "step": 3006 + }, + { + "epoch": 1.923003118253778, + "grad_norm": 2.628833293914795, + "learning_rate": 1.7973926052575337e-05, + "loss": 0.5495, + "step": 3007 + }, + { + "epoch": 1.9236427600543695, + "grad_norm": 2.5494179725646973, + "learning_rate": 1.7963240008548836e-05, + "loss": 0.5466, + "step": 3008 + }, + { + "epoch": 1.9242824018549611, + "grad_norm": 2.444270610809326, + "learning_rate": 1.7952553964522332e-05, + "loss": 0.5158, + "step": 3009 + }, + { + "epoch": 1.924922043655553, + "grad_norm": 1.8992811441421509, + "learning_rate": 1.7941867920495835e-05, + "loss": 0.4101, + "step": 3010 + }, + { + "epoch": 1.9255616854561446, + "grad_norm": 2.1182570457458496, + "learning_rate": 1.7931181876469334e-05, + "loss": 0.4505, + "step": 3011 + }, + { + "epoch": 1.9262013272567362, + "grad_norm": 2.212345600128174, + "learning_rate": 1.792049583244283e-05, + "loss": 0.538, + "step": 3012 + }, + { + "epoch": 1.926840969057328, + "grad_norm": 2.1981735229492188, + "learning_rate": 1.790980978841633e-05, + "loss": 0.477, + "step": 3013 + }, + { 
+ "epoch": 1.9274806108579194, + "grad_norm": 2.655529260635376, + "learning_rate": 1.789912374438983e-05, + "loss": 0.5713, + "step": 3014 + }, + { + "epoch": 1.9281202526585113, + "grad_norm": 2.3781497478485107, + "learning_rate": 1.7888437700363325e-05, + "loss": 0.4905, + "step": 3015 + }, + { + "epoch": 1.928759894459103, + "grad_norm": 2.2360687255859375, + "learning_rate": 1.7877751656336824e-05, + "loss": 0.4368, + "step": 3016 + }, + { + "epoch": 1.9293995362596945, + "grad_norm": 2.6301515102386475, + "learning_rate": 1.7867065612310324e-05, + "loss": 0.5156, + "step": 3017 + }, + { + "epoch": 1.9300391780602864, + "grad_norm": 2.265232563018799, + "learning_rate": 1.785637956828382e-05, + "loss": 0.5159, + "step": 3018 + }, + { + "epoch": 1.9306788198608777, + "grad_norm": 2.2301290035247803, + "learning_rate": 1.784569352425732e-05, + "loss": 0.4527, + "step": 3019 + }, + { + "epoch": 1.9313184616614696, + "grad_norm": 2.714449405670166, + "learning_rate": 1.783500748023082e-05, + "loss": 0.5691, + "step": 3020 + }, + { + "epoch": 1.9319581034620612, + "grad_norm": 2.274967908859253, + "learning_rate": 1.7824321436204318e-05, + "loss": 0.4735, + "step": 3021 + }, + { + "epoch": 1.9325977452626528, + "grad_norm": 2.453223943710327, + "learning_rate": 1.7813635392177817e-05, + "loss": 0.5531, + "step": 3022 + }, + { + "epoch": 1.9332373870632447, + "grad_norm": 2.511776924133301, + "learning_rate": 1.7802949348151317e-05, + "loss": 0.5687, + "step": 3023 + }, + { + "epoch": 1.9338770288638363, + "grad_norm": 2.385563611984253, + "learning_rate": 1.7792263304124816e-05, + "loss": 0.5505, + "step": 3024 + }, + { + "epoch": 1.934516670664428, + "grad_norm": 2.4991912841796875, + "learning_rate": 1.7781577260098312e-05, + "loss": 0.5437, + "step": 3025 + }, + { + "epoch": 1.9351563124650197, + "grad_norm": 2.4782683849334717, + "learning_rate": 1.777089121607181e-05, + "loss": 0.5373, + "step": 3026 + }, + { + "epoch": 1.9357959542656111, + "grad_norm": 2.317281723022461, + "learning_rate": 1.776020517204531e-05, + "loss": 0.4439, + "step": 3027 + }, + { + "epoch": 1.936435596066203, + "grad_norm": 2.05065655708313, + "learning_rate": 1.7749519128018807e-05, + "loss": 0.427, + "step": 3028 + }, + { + "epoch": 1.9370752378667946, + "grad_norm": 2.3775722980499268, + "learning_rate": 1.7738833083992306e-05, + "loss": 0.4866, + "step": 3029 + }, + { + "epoch": 1.9377148796673862, + "grad_norm": 2.2226779460906982, + "learning_rate": 1.7728147039965805e-05, + "loss": 0.4843, + "step": 3030 + }, + { + "epoch": 1.938354521467978, + "grad_norm": 2.216209650039673, + "learning_rate": 1.77174609959393e-05, + "loss": 0.4761, + "step": 3031 + }, + { + "epoch": 1.9389941632685694, + "grad_norm": 2.2940866947174072, + "learning_rate": 1.77067749519128e-05, + "loss": 0.4643, + "step": 3032 + }, + { + "epoch": 1.9396338050691613, + "grad_norm": 2.479513168334961, + "learning_rate": 1.7696088907886303e-05, + "loss": 0.531, + "step": 3033 + }, + { + "epoch": 1.940273446869753, + "grad_norm": 2.4281535148620605, + "learning_rate": 1.7685402863859803e-05, + "loss": 0.5095, + "step": 3034 + }, + { + "epoch": 1.9409130886703445, + "grad_norm": 2.246185779571533, + "learning_rate": 1.76747168198333e-05, + "loss": 0.479, + "step": 3035 + }, + { + "epoch": 1.9415527304709363, + "grad_norm": 2.2305679321289062, + "learning_rate": 1.7664030775806798e-05, + "loss": 0.5134, + "step": 3036 + }, + { + "epoch": 1.942192372271528, + "grad_norm": 2.1311020851135254, + "learning_rate": 1.7653344731780298e-05, + 
"loss": 0.4557, + "step": 3037 + }, + { + "epoch": 1.9428320140721196, + "grad_norm": 2.451103448867798, + "learning_rate": 1.7642658687753794e-05, + "loss": 0.506, + "step": 3038 + }, + { + "epoch": 1.9434716558727114, + "grad_norm": 2.1855053901672363, + "learning_rate": 1.7631972643727293e-05, + "loss": 0.4452, + "step": 3039 + }, + { + "epoch": 1.9441112976733028, + "grad_norm": 2.606523036956787, + "learning_rate": 1.7621286599700792e-05, + "loss": 0.544, + "step": 3040 + }, + { + "epoch": 1.9447509394738947, + "grad_norm": 2.280884265899658, + "learning_rate": 1.7610600555674288e-05, + "loss": 0.4756, + "step": 3041 + }, + { + "epoch": 1.9453905812744863, + "grad_norm": 2.6737313270568848, + "learning_rate": 1.7599914511647788e-05, + "loss": 0.5752, + "step": 3042 + }, + { + "epoch": 1.946030223075078, + "grad_norm": 2.865755319595337, + "learning_rate": 1.7589228467621287e-05, + "loss": 0.5725, + "step": 3043 + }, + { + "epoch": 1.9466698648756697, + "grad_norm": 2.591252326965332, + "learning_rate": 1.7578542423594786e-05, + "loss": 0.5937, + "step": 3044 + }, + { + "epoch": 1.9473095066762613, + "grad_norm": 2.2542731761932373, + "learning_rate": 1.7567856379568286e-05, + "loss": 0.4685, + "step": 3045 + }, + { + "epoch": 1.947949148476853, + "grad_norm": 2.729081630706787, + "learning_rate": 1.7557170335541785e-05, + "loss": 0.5462, + "step": 3046 + }, + { + "epoch": 1.9485887902774446, + "grad_norm": 2.0769948959350586, + "learning_rate": 1.754648429151528e-05, + "loss": 0.4565, + "step": 3047 + }, + { + "epoch": 1.9492284320780362, + "grad_norm": 2.435673236846924, + "learning_rate": 1.753579824748878e-05, + "loss": 0.5068, + "step": 3048 + }, + { + "epoch": 1.949868073878628, + "grad_norm": 2.4823808670043945, + "learning_rate": 1.752511220346228e-05, + "loss": 0.5019, + "step": 3049 + }, + { + "epoch": 1.9505077156792197, + "grad_norm": 2.8090693950653076, + "learning_rate": 1.751442615943578e-05, + "loss": 0.5619, + "step": 3050 + }, + { + "epoch": 1.9511473574798113, + "grad_norm": 2.2274117469787598, + "learning_rate": 1.7503740115409275e-05, + "loss": 0.4628, + "step": 3051 + }, + { + "epoch": 1.9517869992804031, + "grad_norm": 2.081827402114868, + "learning_rate": 1.7493054071382774e-05, + "loss": 0.4409, + "step": 3052 + }, + { + "epoch": 1.9524266410809945, + "grad_norm": 2.340615749359131, + "learning_rate": 1.7482368027356274e-05, + "loss": 0.4745, + "step": 3053 + }, + { + "epoch": 1.9530662828815863, + "grad_norm": 2.2291791439056396, + "learning_rate": 1.747168198332977e-05, + "loss": 0.4801, + "step": 3054 + }, + { + "epoch": 1.953705924682178, + "grad_norm": 2.5104756355285645, + "learning_rate": 1.746099593930327e-05, + "loss": 0.5641, + "step": 3055 + }, + { + "epoch": 1.9543455664827696, + "grad_norm": 2.5856759548187256, + "learning_rate": 1.7450309895276772e-05, + "loss": 0.5749, + "step": 3056 + }, + { + "epoch": 1.9549852082833614, + "grad_norm": 2.6332132816314697, + "learning_rate": 1.7439623851250268e-05, + "loss": 0.5672, + "step": 3057 + }, + { + "epoch": 1.955624850083953, + "grad_norm": 2.5158519744873047, + "learning_rate": 1.7428937807223767e-05, + "loss": 0.5407, + "step": 3058 + }, + { + "epoch": 1.9562644918845447, + "grad_norm": 2.2477149963378906, + "learning_rate": 1.7418251763197267e-05, + "loss": 0.4761, + "step": 3059 + }, + { + "epoch": 1.9569041336851363, + "grad_norm": 2.567655563354492, + "learning_rate": 1.7407565719170763e-05, + "loss": 0.5278, + "step": 3060 + }, + { + "epoch": 1.9575437754857279, + "grad_norm": 
2.1150951385498047, + "learning_rate": 1.7396879675144262e-05, + "loss": 0.4789, + "step": 3061 + }, + { + "epoch": 1.9581834172863197, + "grad_norm": 2.217879056930542, + "learning_rate": 1.738619363111776e-05, + "loss": 0.451, + "step": 3062 + }, + { + "epoch": 1.9588230590869113, + "grad_norm": 2.183075189590454, + "learning_rate": 1.737550758709126e-05, + "loss": 0.4829, + "step": 3063 + }, + { + "epoch": 1.959462700887503, + "grad_norm": 2.404634714126587, + "learning_rate": 1.7364821543064757e-05, + "loss": 0.4893, + "step": 3064 + }, + { + "epoch": 1.9601023426880948, + "grad_norm": 2.8002612590789795, + "learning_rate": 1.7354135499038256e-05, + "loss": 0.5211, + "step": 3065 + }, + { + "epoch": 1.9607419844886862, + "grad_norm": 2.499987840652466, + "learning_rate": 1.7343449455011755e-05, + "loss": 0.5745, + "step": 3066 + }, + { + "epoch": 1.961381626289278, + "grad_norm": 2.4582366943359375, + "learning_rate": 1.7332763410985255e-05, + "loss": 0.4864, + "step": 3067 + }, + { + "epoch": 1.9620212680898697, + "grad_norm": 2.733107805252075, + "learning_rate": 1.7322077366958754e-05, + "loss": 0.5544, + "step": 3068 + }, + { + "epoch": 1.9626609098904613, + "grad_norm": 2.6358394622802734, + "learning_rate": 1.7311391322932253e-05, + "loss": 0.5606, + "step": 3069 + }, + { + "epoch": 1.963300551691053, + "grad_norm": 1.966907262802124, + "learning_rate": 1.730070527890575e-05, + "loss": 0.4333, + "step": 3070 + }, + { + "epoch": 1.9639401934916447, + "grad_norm": 2.583651542663574, + "learning_rate": 1.729001923487925e-05, + "loss": 0.5282, + "step": 3071 + }, + { + "epoch": 1.9645798352922363, + "grad_norm": 2.4442920684814453, + "learning_rate": 1.7279333190852748e-05, + "loss": 0.5422, + "step": 3072 + }, + { + "epoch": 1.9652194770928282, + "grad_norm": 2.4457221031188965, + "learning_rate": 1.7268647146826244e-05, + "loss": 0.4959, + "step": 3073 + }, + { + "epoch": 1.9658591188934196, + "grad_norm": 2.5622451305389404, + "learning_rate": 1.7257961102799743e-05, + "loss": 0.5233, + "step": 3074 + }, + { + "epoch": 1.9664987606940114, + "grad_norm": 2.5282044410705566, + "learning_rate": 1.7247275058773243e-05, + "loss": 0.5662, + "step": 3075 + }, + { + "epoch": 1.967138402494603, + "grad_norm": 2.5381572246551514, + "learning_rate": 1.7236589014746742e-05, + "loss": 0.5321, + "step": 3076 + }, + { + "epoch": 1.9677780442951947, + "grad_norm": 2.80287504196167, + "learning_rate": 1.7225902970720238e-05, + "loss": 0.5776, + "step": 3077 + }, + { + "epoch": 1.9684176860957865, + "grad_norm": 2.027730941772461, + "learning_rate": 1.7215216926693737e-05, + "loss": 0.4621, + "step": 3078 + }, + { + "epoch": 1.9690573278963779, + "grad_norm": 2.3324129581451416, + "learning_rate": 1.720453088266724e-05, + "loss": 0.5041, + "step": 3079 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 2.5484001636505127, + "learning_rate": 1.7193844838640736e-05, + "loss": 0.5477, + "step": 3080 + }, + { + "epoch": 1.9703366114975613, + "grad_norm": 2.2478559017181396, + "learning_rate": 1.7183158794614236e-05, + "loss": 0.4545, + "step": 3081 + }, + { + "epoch": 1.970976253298153, + "grad_norm": 2.5246973037719727, + "learning_rate": 1.7172472750587735e-05, + "loss": 0.5894, + "step": 3082 + }, + { + "epoch": 1.9716158950987448, + "grad_norm": 2.2863972187042236, + "learning_rate": 1.716178670656123e-05, + "loss": 0.5205, + "step": 3083 + }, + { + "epoch": 1.9722555368993364, + "grad_norm": 2.898996353149414, + "learning_rate": 1.715110066253473e-05, + "loss": 0.563, + "step": 3084 + }, + { + 
"epoch": 1.972895178699928, + "grad_norm": 2.2894740104675293, + "learning_rate": 1.714041461850823e-05, + "loss": 0.4931, + "step": 3085 + }, + { + "epoch": 1.9735348205005199, + "grad_norm": 2.2987220287323, + "learning_rate": 1.7129728574481726e-05, + "loss": 0.461, + "step": 3086 + }, + { + "epoch": 1.9741744623011113, + "grad_norm": 2.349090576171875, + "learning_rate": 1.7119042530455225e-05, + "loss": 0.5559, + "step": 3087 + }, + { + "epoch": 1.974814104101703, + "grad_norm": 2.413607358932495, + "learning_rate": 1.7108356486428724e-05, + "loss": 0.5607, + "step": 3088 + }, + { + "epoch": 1.9754537459022947, + "grad_norm": 2.560260534286499, + "learning_rate": 1.7097670442402224e-05, + "loss": 0.5358, + "step": 3089 + }, + { + "epoch": 1.9760933877028863, + "grad_norm": 2.351215124130249, + "learning_rate": 1.7086984398375723e-05, + "loss": 0.5353, + "step": 3090 + }, + { + "epoch": 1.9767330295034782, + "grad_norm": 2.2487165927886963, + "learning_rate": 1.7076298354349222e-05, + "loss": 0.5013, + "step": 3091 + }, + { + "epoch": 1.9773726713040696, + "grad_norm": 2.1342883110046387, + "learning_rate": 1.7065612310322722e-05, + "loss": 0.4601, + "step": 3092 + }, + { + "epoch": 1.9780123131046614, + "grad_norm": 1.9691661596298218, + "learning_rate": 1.7054926266296218e-05, + "loss": 0.4331, + "step": 3093 + }, + { + "epoch": 1.978651954905253, + "grad_norm": 2.109966278076172, + "learning_rate": 1.7044240222269717e-05, + "loss": 0.4633, + "step": 3094 + }, + { + "epoch": 1.9792915967058446, + "grad_norm": 2.1279499530792236, + "learning_rate": 1.7033554178243216e-05, + "loss": 0.5282, + "step": 3095 + }, + { + "epoch": 1.9799312385064365, + "grad_norm": 2.070390224456787, + "learning_rate": 1.7022868134216712e-05, + "loss": 0.4413, + "step": 3096 + }, + { + "epoch": 1.980570880307028, + "grad_norm": 2.4855387210845947, + "learning_rate": 1.7012182090190212e-05, + "loss": 0.5064, + "step": 3097 + }, + { + "epoch": 1.9812105221076197, + "grad_norm": 2.0367043018341064, + "learning_rate": 1.700149604616371e-05, + "loss": 0.4512, + "step": 3098 + }, + { + "epoch": 1.9818501639082116, + "grad_norm": 2.409388542175293, + "learning_rate": 1.6990810002137207e-05, + "loss": 0.4886, + "step": 3099 + }, + { + "epoch": 1.982489805708803, + "grad_norm": 2.350985527038574, + "learning_rate": 1.6980123958110707e-05, + "loss": 0.5295, + "step": 3100 + }, + { + "epoch": 1.9831294475093948, + "grad_norm": 2.4386706352233887, + "learning_rate": 1.6969437914084206e-05, + "loss": 0.4977, + "step": 3101 + }, + { + "epoch": 1.9837690893099864, + "grad_norm": 2.4338319301605225, + "learning_rate": 1.6958751870057705e-05, + "loss": 0.5428, + "step": 3102 + }, + { + "epoch": 1.984408731110578, + "grad_norm": 1.9656285047531128, + "learning_rate": 1.6948065826031205e-05, + "loss": 0.448, + "step": 3103 + }, + { + "epoch": 1.9850483729111699, + "grad_norm": 2.2694687843322754, + "learning_rate": 1.6937379782004704e-05, + "loss": 0.5236, + "step": 3104 + }, + { + "epoch": 1.9856880147117613, + "grad_norm": 2.2327353954315186, + "learning_rate": 1.6926693737978203e-05, + "loss": 0.4987, + "step": 3105 + }, + { + "epoch": 1.986327656512353, + "grad_norm": 2.2643015384674072, + "learning_rate": 1.69160076939517e-05, + "loss": 0.531, + "step": 3106 + }, + { + "epoch": 1.9869672983129447, + "grad_norm": 2.438274383544922, + "learning_rate": 1.69053216499252e-05, + "loss": 0.532, + "step": 3107 + }, + { + "epoch": 1.9876069401135363, + "grad_norm": 2.36454701423645, + "learning_rate": 1.6894635605898698e-05, + 
"loss": 0.4898, + "step": 3108 + }, + { + "epoch": 1.9882465819141282, + "grad_norm": 2.1013450622558594, + "learning_rate": 1.6883949561872194e-05, + "loss": 0.399, + "step": 3109 + }, + { + "epoch": 1.9888862237147198, + "grad_norm": 2.293032646179199, + "learning_rate": 1.6873263517845693e-05, + "loss": 0.5122, + "step": 3110 + }, + { + "epoch": 1.9895258655153114, + "grad_norm": 2.1009013652801514, + "learning_rate": 1.6862577473819193e-05, + "loss": 0.457, + "step": 3111 + }, + { + "epoch": 1.9901655073159032, + "grad_norm": 2.314861536026001, + "learning_rate": 1.6851891429792692e-05, + "loss": 0.4906, + "step": 3112 + }, + { + "epoch": 1.9908051491164946, + "grad_norm": 2.2989134788513184, + "learning_rate": 1.684120538576619e-05, + "loss": 0.432, + "step": 3113 + }, + { + "epoch": 1.9914447909170865, + "grad_norm": 2.49627685546875, + "learning_rate": 1.683051934173969e-05, + "loss": 0.469, + "step": 3114 + }, + { + "epoch": 1.992084432717678, + "grad_norm": 2.0957040786743164, + "learning_rate": 1.6819833297713187e-05, + "loss": 0.4659, + "step": 3115 + }, + { + "epoch": 1.9927240745182697, + "grad_norm": 2.485936403274536, + "learning_rate": 1.6809147253686686e-05, + "loss": 0.5456, + "step": 3116 + }, + { + "epoch": 1.9933637163188616, + "grad_norm": 2.732510805130005, + "learning_rate": 1.6798461209660185e-05, + "loss": 0.5705, + "step": 3117 + }, + { + "epoch": 1.994003358119453, + "grad_norm": 2.1635005474090576, + "learning_rate": 1.6787775165633685e-05, + "loss": 0.4087, + "step": 3118 + }, + { + "epoch": 1.9946429999200448, + "grad_norm": 2.3673348426818848, + "learning_rate": 1.677708912160718e-05, + "loss": 0.5093, + "step": 3119 + }, + { + "epoch": 1.9952826417206364, + "grad_norm": 2.510932207107544, + "learning_rate": 1.676640307758068e-05, + "loss": 0.5167, + "step": 3120 + }, + { + "epoch": 1.995922283521228, + "grad_norm": 2.699578285217285, + "learning_rate": 1.675571703355418e-05, + "loss": 0.5348, + "step": 3121 + }, + { + "epoch": 1.9965619253218199, + "grad_norm": 2.4369664192199707, + "learning_rate": 1.6745030989527676e-05, + "loss": 0.4954, + "step": 3122 + }, + { + "epoch": 1.9972015671224115, + "grad_norm": 2.1577939987182617, + "learning_rate": 1.6734344945501175e-05, + "loss": 0.459, + "step": 3123 + }, + { + "epoch": 1.997841208923003, + "grad_norm": 2.1966986656188965, + "learning_rate": 1.6723658901474674e-05, + "loss": 0.4481, + "step": 3124 + }, + { + "epoch": 1.998480850723595, + "grad_norm": 2.145503520965576, + "learning_rate": 1.6712972857448174e-05, + "loss": 0.4754, + "step": 3125 + }, + { + "epoch": 1.9991204925241863, + "grad_norm": 2.522552967071533, + "learning_rate": 1.6702286813421673e-05, + "loss": 0.4993, + "step": 3126 + }, + { + "epoch": 1.9997601343247782, + "grad_norm": 2.595780611038208, + "learning_rate": 1.6691600769395172e-05, + "loss": 0.5107, + "step": 3127 + }, + { + "epoch": 2.0, + "grad_norm": 3.449460029602051, + "learning_rate": 1.668091472536867e-05, + "loss": 0.416, + "step": 3128 + }, + { + "epoch": 2.000639641800592, + "grad_norm": 1.6779128313064575, + "learning_rate": 1.6670228681342168e-05, + "loss": 0.3353, + "step": 3129 + }, + { + "epoch": 2.0012792836011832, + "grad_norm": 1.5910295248031616, + "learning_rate": 1.6659542637315667e-05, + "loss": 0.3762, + "step": 3130 + }, + { + "epoch": 2.001918925401775, + "grad_norm": 1.697697401046753, + "learning_rate": 1.6648856593289163e-05, + "loss": 0.3306, + "step": 3131 + }, + { + "epoch": 2.0025585672023665, + "grad_norm": 1.518368124961853, + "learning_rate": 
1.6638170549262662e-05, + "loss": 0.327, + "step": 3132 + }, + { + "epoch": 2.0031982090029583, + "grad_norm": 1.7516897916793823, + "learning_rate": 1.6627484505236162e-05, + "loss": 0.3522, + "step": 3133 + }, + { + "epoch": 2.00383785080355, + "grad_norm": 1.559342861175537, + "learning_rate": 1.661679846120966e-05, + "loss": 0.3021, + "step": 3134 + }, + { + "epoch": 2.0044774926041415, + "grad_norm": 1.6947582960128784, + "learning_rate": 1.660611241718316e-05, + "loss": 0.3277, + "step": 3135 + }, + { + "epoch": 2.0051171344047334, + "grad_norm": 1.9805963039398193, + "learning_rate": 1.659542637315666e-05, + "loss": 0.3193, + "step": 3136 + }, + { + "epoch": 2.005756776205325, + "grad_norm": 1.708906888961792, + "learning_rate": 1.658474032913016e-05, + "loss": 0.3352, + "step": 3137 + }, + { + "epoch": 2.0063964180059166, + "grad_norm": 2.005021572113037, + "learning_rate": 1.6574054285103655e-05, + "loss": 0.3301, + "step": 3138 + }, + { + "epoch": 2.0070360598065085, + "grad_norm": 1.9154053926467896, + "learning_rate": 1.6563368241077155e-05, + "loss": 0.3003, + "step": 3139 + }, + { + "epoch": 2.0076757016071, + "grad_norm": 2.0008208751678467, + "learning_rate": 1.6552682197050654e-05, + "loss": 0.2872, + "step": 3140 + }, + { + "epoch": 2.0083153434076917, + "grad_norm": 2.1309220790863037, + "learning_rate": 1.654199615302415e-05, + "loss": 0.3055, + "step": 3141 + }, + { + "epoch": 2.0089549852082835, + "grad_norm": 2.131603956222534, + "learning_rate": 1.653131010899765e-05, + "loss": 0.2969, + "step": 3142 + }, + { + "epoch": 2.009594627008875, + "grad_norm": 2.8020849227905273, + "learning_rate": 1.652062406497115e-05, + "loss": 0.3187, + "step": 3143 + }, + { + "epoch": 2.0102342688094668, + "grad_norm": 2.71633243560791, + "learning_rate": 1.6509938020944645e-05, + "loss": 0.3, + "step": 3144 + }, + { + "epoch": 2.010873910610058, + "grad_norm": 2.8960108757019043, + "learning_rate": 1.6499251976918144e-05, + "loss": 0.3143, + "step": 3145 + }, + { + "epoch": 2.01151355241065, + "grad_norm": 2.872610569000244, + "learning_rate": 1.6488565932891643e-05, + "loss": 0.3024, + "step": 3146 + }, + { + "epoch": 2.012153194211242, + "grad_norm": 2.6689562797546387, + "learning_rate": 1.6477879888865143e-05, + "loss": 0.2687, + "step": 3147 + }, + { + "epoch": 2.0127928360118332, + "grad_norm": 3.2189183235168457, + "learning_rate": 1.6467193844838642e-05, + "loss": 0.3055, + "step": 3148 + }, + { + "epoch": 2.013432477812425, + "grad_norm": 3.3254477977752686, + "learning_rate": 1.645650780081214e-05, + "loss": 0.3233, + "step": 3149 + }, + { + "epoch": 2.014072119613017, + "grad_norm": 3.1897618770599365, + "learning_rate": 1.644582175678564e-05, + "loss": 0.2796, + "step": 3150 + }, + { + "epoch": 2.0147117614136083, + "grad_norm": 3.6358282566070557, + "learning_rate": 1.6435135712759137e-05, + "loss": 0.3246, + "step": 3151 + }, + { + "epoch": 2.0153514032142, + "grad_norm": 3.135077953338623, + "learning_rate": 1.6424449668732636e-05, + "loss": 0.2811, + "step": 3152 + }, + { + "epoch": 2.0159910450147915, + "grad_norm": 3.6325466632843018, + "learning_rate": 1.6413763624706135e-05, + "loss": 0.2951, + "step": 3153 + }, + { + "epoch": 2.0166306868153834, + "grad_norm": 3.7397799491882324, + "learning_rate": 1.640307758067963e-05, + "loss": 0.3308, + "step": 3154 + }, + { + "epoch": 2.017270328615975, + "grad_norm": 3.0837008953094482, + "learning_rate": 1.639239153665313e-05, + "loss": 0.2761, + "step": 3155 + }, + { + "epoch": 2.0179099704165666, + "grad_norm": 
2.698352098464966, + "learning_rate": 1.638170549262663e-05, + "loss": 0.2523, + "step": 3156 + }, + { + "epoch": 2.0185496122171585, + "grad_norm": 3.8014771938323975, + "learning_rate": 1.637101944860013e-05, + "loss": 0.3365, + "step": 3157 + }, + { + "epoch": 2.01918925401775, + "grad_norm": 2.8840444087982178, + "learning_rate": 1.636033340457363e-05, + "loss": 0.2794, + "step": 3158 + }, + { + "epoch": 2.0198288958183417, + "grad_norm": 3.2458548545837402, + "learning_rate": 1.6349647360547128e-05, + "loss": 0.3257, + "step": 3159 + }, + { + "epoch": 2.0204685376189335, + "grad_norm": 2.931417226791382, + "learning_rate": 1.6338961316520628e-05, + "loss": 0.3198, + "step": 3160 + }, + { + "epoch": 2.021108179419525, + "grad_norm": 2.8906733989715576, + "learning_rate": 1.6328275272494124e-05, + "loss": 0.3006, + "step": 3161 + }, + { + "epoch": 2.0217478212201168, + "grad_norm": 3.2544641494750977, + "learning_rate": 1.6317589228467623e-05, + "loss": 0.3437, + "step": 3162 + }, + { + "epoch": 2.0223874630207086, + "grad_norm": 2.798978567123413, + "learning_rate": 1.6306903184441122e-05, + "loss": 0.2978, + "step": 3163 + }, + { + "epoch": 2.0230271048213, + "grad_norm": 3.4615495204925537, + "learning_rate": 1.6296217140414618e-05, + "loss": 0.3628, + "step": 3164 + }, + { + "epoch": 2.023666746621892, + "grad_norm": 2.4950344562530518, + "learning_rate": 1.6285531096388118e-05, + "loss": 0.284, + "step": 3165 + }, + { + "epoch": 2.0243063884224832, + "grad_norm": 2.7329065799713135, + "learning_rate": 1.6274845052361617e-05, + "loss": 0.3045, + "step": 3166 + }, + { + "epoch": 2.024946030223075, + "grad_norm": 2.344383478164673, + "learning_rate": 1.6264159008335113e-05, + "loss": 0.2859, + "step": 3167 + }, + { + "epoch": 2.025585672023667, + "grad_norm": 2.4195828437805176, + "learning_rate": 1.6253472964308612e-05, + "loss": 0.3003, + "step": 3168 + }, + { + "epoch": 2.0262253138242583, + "grad_norm": 2.586026668548584, + "learning_rate": 1.624278692028211e-05, + "loss": 0.3086, + "step": 3169 + }, + { + "epoch": 2.02686495562485, + "grad_norm": 2.6065242290496826, + "learning_rate": 1.623210087625561e-05, + "loss": 0.3079, + "step": 3170 + }, + { + "epoch": 2.0275045974254415, + "grad_norm": 2.847074508666992, + "learning_rate": 1.622141483222911e-05, + "loss": 0.3241, + "step": 3171 + }, + { + "epoch": 2.0281442392260334, + "grad_norm": 2.3567025661468506, + "learning_rate": 1.621072878820261e-05, + "loss": 0.2637, + "step": 3172 + }, + { + "epoch": 2.028783881026625, + "grad_norm": 2.294857978820801, + "learning_rate": 1.6200042744176106e-05, + "loss": 0.2702, + "step": 3173 + }, + { + "epoch": 2.0294235228272166, + "grad_norm": 2.289886951446533, + "learning_rate": 1.6189356700149605e-05, + "loss": 0.3044, + "step": 3174 + }, + { + "epoch": 2.0300631646278084, + "grad_norm": 2.121716022491455, + "learning_rate": 1.6178670656123104e-05, + "loss": 0.2667, + "step": 3175 + }, + { + "epoch": 2.0307028064284003, + "grad_norm": 2.6956703662872314, + "learning_rate": 1.6167984612096604e-05, + "loss": 0.2687, + "step": 3176 + }, + { + "epoch": 2.0313424482289917, + "grad_norm": 2.691291570663452, + "learning_rate": 1.61572985680701e-05, + "loss": 0.2943, + "step": 3177 + }, + { + "epoch": 2.0319820900295835, + "grad_norm": 3.67265248298645, + "learning_rate": 1.61466125240436e-05, + "loss": 0.3831, + "step": 3178 + }, + { + "epoch": 2.032621731830175, + "grad_norm": 1.999361515045166, + "learning_rate": 1.61359264800171e-05, + "loss": 0.2548, + "step": 3179 + }, + { + "epoch": 
2.0332613736307668, + "grad_norm": 2.9225454330444336, + "learning_rate": 1.6125240435990598e-05, + "loss": 0.3297, + "step": 3180 + }, + { + "epoch": 2.0339010154313586, + "grad_norm": 2.4496564865112305, + "learning_rate": 1.6114554391964097e-05, + "loss": 0.2729, + "step": 3181 + }, + { + "epoch": 2.03454065723195, + "grad_norm": 3.0396206378936768, + "learning_rate": 1.6103868347937597e-05, + "loss": 0.3091, + "step": 3182 + }, + { + "epoch": 2.035180299032542, + "grad_norm": 2.9822778701782227, + "learning_rate": 1.6093182303911093e-05, + "loss": 0.3054, + "step": 3183 + }, + { + "epoch": 2.0358199408331332, + "grad_norm": 2.0436558723449707, + "learning_rate": 1.6082496259884592e-05, + "loss": 0.247, + "step": 3184 + }, + { + "epoch": 2.036459582633725, + "grad_norm": 2.5129218101501465, + "learning_rate": 1.607181021585809e-05, + "loss": 0.2828, + "step": 3185 + }, + { + "epoch": 2.037099224434317, + "grad_norm": 2.7617108821868896, + "learning_rate": 1.6061124171831587e-05, + "loss": 0.2711, + "step": 3186 + }, + { + "epoch": 2.0377388662349083, + "grad_norm": 3.0609190464019775, + "learning_rate": 1.6050438127805087e-05, + "loss": 0.31, + "step": 3187 + }, + { + "epoch": 2.0383785080355, + "grad_norm": 2.60442852973938, + "learning_rate": 1.6039752083778586e-05, + "loss": 0.277, + "step": 3188 + }, + { + "epoch": 2.039018149836092, + "grad_norm": 2.3617653846740723, + "learning_rate": 1.6029066039752085e-05, + "loss": 0.2499, + "step": 3189 + }, + { + "epoch": 2.0396577916366834, + "grad_norm": 2.6222386360168457, + "learning_rate": 1.601837999572558e-05, + "loss": 0.3078, + "step": 3190 + }, + { + "epoch": 2.040297433437275, + "grad_norm": 2.4635252952575684, + "learning_rate": 1.600769395169908e-05, + "loss": 0.2807, + "step": 3191 + }, + { + "epoch": 2.0409370752378666, + "grad_norm": 2.7139599323272705, + "learning_rate": 1.599700790767258e-05, + "loss": 0.3031, + "step": 3192 + }, + { + "epoch": 2.0415767170384584, + "grad_norm": 2.8060879707336426, + "learning_rate": 1.598632186364608e-05, + "loss": 0.3309, + "step": 3193 + }, + { + "epoch": 2.0422163588390503, + "grad_norm": 2.574420213699341, + "learning_rate": 1.597563581961958e-05, + "loss": 0.2647, + "step": 3194 + }, + { + "epoch": 2.0428560006396417, + "grad_norm": 3.502035140991211, + "learning_rate": 1.5964949775593078e-05, + "loss": 0.3351, + "step": 3195 + }, + { + "epoch": 2.0434956424402335, + "grad_norm": 2.8940072059631348, + "learning_rate": 1.5954263731566574e-05, + "loss": 0.3199, + "step": 3196 + }, + { + "epoch": 2.0441352842408254, + "grad_norm": 2.750485420227051, + "learning_rate": 1.5943577687540073e-05, + "loss": 0.2863, + "step": 3197 + }, + { + "epoch": 2.0447749260414168, + "grad_norm": 2.437612295150757, + "learning_rate": 1.5932891643513573e-05, + "loss": 0.2654, + "step": 3198 + }, + { + "epoch": 2.0454145678420086, + "grad_norm": 3.353499174118042, + "learning_rate": 1.592220559948707e-05, + "loss": 0.3058, + "step": 3199 + }, + { + "epoch": 2.0460542096426, + "grad_norm": 2.707676649093628, + "learning_rate": 1.5911519555460568e-05, + "loss": 0.2841, + "step": 3200 + }, + { + "epoch": 2.046693851443192, + "grad_norm": 2.9310965538024902, + "learning_rate": 1.5900833511434068e-05, + "loss": 0.3341, + "step": 3201 + }, + { + "epoch": 2.0473334932437837, + "grad_norm": 2.8691303730010986, + "learning_rate": 1.5890147467407564e-05, + "loss": 0.2995, + "step": 3202 + }, + { + "epoch": 2.047973135044375, + "grad_norm": 2.653183698654175, + "learning_rate": 1.5879461423381066e-05, + "loss": 0.2781, 
+ "step": 3203 + }, + { + "epoch": 2.048612776844967, + "grad_norm": 2.5871102809906006, + "learning_rate": 1.5868775379354566e-05, + "loss": 0.2884, + "step": 3204 + }, + { + "epoch": 2.0492524186455583, + "grad_norm": 3.0146594047546387, + "learning_rate": 1.5858089335328065e-05, + "loss": 0.3131, + "step": 3205 + }, + { + "epoch": 2.04989206044615, + "grad_norm": 2.956112861633301, + "learning_rate": 1.584740329130156e-05, + "loss": 0.307, + "step": 3206 + }, + { + "epoch": 2.050531702246742, + "grad_norm": 2.8111886978149414, + "learning_rate": 1.583671724727506e-05, + "loss": 0.2979, + "step": 3207 + }, + { + "epoch": 2.0511713440473334, + "grad_norm": 2.4186933040618896, + "learning_rate": 1.582603120324856e-05, + "loss": 0.2598, + "step": 3208 + }, + { + "epoch": 2.051810985847925, + "grad_norm": 2.7398626804351807, + "learning_rate": 1.5815345159222056e-05, + "loss": 0.3058, + "step": 3209 + }, + { + "epoch": 2.052450627648517, + "grad_norm": 2.7144486904144287, + "learning_rate": 1.5804659115195555e-05, + "loss": 0.2994, + "step": 3210 + }, + { + "epoch": 2.0530902694491084, + "grad_norm": 2.864875555038452, + "learning_rate": 1.5793973071169054e-05, + "loss": 0.3262, + "step": 3211 + }, + { + "epoch": 2.0537299112497003, + "grad_norm": 2.4187493324279785, + "learning_rate": 1.578328702714255e-05, + "loss": 0.3022, + "step": 3212 + }, + { + "epoch": 2.0543695530502917, + "grad_norm": 2.491227388381958, + "learning_rate": 1.577260098311605e-05, + "loss": 0.263, + "step": 3213 + }, + { + "epoch": 2.0550091948508835, + "grad_norm": 2.6596786975860596, + "learning_rate": 1.576191493908955e-05, + "loss": 0.3023, + "step": 3214 + }, + { + "epoch": 2.0556488366514754, + "grad_norm": 3.1954710483551025, + "learning_rate": 1.575122889506305e-05, + "loss": 0.3333, + "step": 3215 + }, + { + "epoch": 2.0562884784520667, + "grad_norm": 3.25301194190979, + "learning_rate": 1.5740542851036548e-05, + "loss": 0.3294, + "step": 3216 + }, + { + "epoch": 2.0569281202526586, + "grad_norm": 2.6421799659729004, + "learning_rate": 1.5729856807010047e-05, + "loss": 0.2713, + "step": 3217 + }, + { + "epoch": 2.05756776205325, + "grad_norm": 3.407914638519287, + "learning_rate": 1.5719170762983547e-05, + "loss": 0.341, + "step": 3218 + }, + { + "epoch": 2.058207403853842, + "grad_norm": 2.7729814052581787, + "learning_rate": 1.5708484718957042e-05, + "loss": 0.2861, + "step": 3219 + }, + { + "epoch": 2.0588470456544337, + "grad_norm": 2.9381232261657715, + "learning_rate": 1.5697798674930542e-05, + "loss": 0.3389, + "step": 3220 + }, + { + "epoch": 2.059486687455025, + "grad_norm": 2.676520586013794, + "learning_rate": 1.568711263090404e-05, + "loss": 0.295, + "step": 3221 + }, + { + "epoch": 2.060126329255617, + "grad_norm": 2.7993195056915283, + "learning_rate": 1.5676426586877537e-05, + "loss": 0.311, + "step": 3222 + }, + { + "epoch": 2.0607659710562087, + "grad_norm": 2.848257303237915, + "learning_rate": 1.5665740542851037e-05, + "loss": 0.2875, + "step": 3223 + }, + { + "epoch": 2.0614056128568, + "grad_norm": 2.6933562755584717, + "learning_rate": 1.5655054498824536e-05, + "loss": 0.2928, + "step": 3224 + }, + { + "epoch": 2.062045254657392, + "grad_norm": 2.7110655307769775, + "learning_rate": 1.5644368454798032e-05, + "loss": 0.2609, + "step": 3225 + }, + { + "epoch": 2.0626848964579834, + "grad_norm": 3.1989595890045166, + "learning_rate": 1.5633682410771535e-05, + "loss": 0.3319, + "step": 3226 + }, + { + "epoch": 2.063324538258575, + "grad_norm": 2.603562116622925, + "learning_rate": 
1.5622996366745034e-05, + "loss": 0.2546, + "step": 3227 + }, + { + "epoch": 2.063964180059167, + "grad_norm": 3.1629462242126465, + "learning_rate": 1.561231032271853e-05, + "loss": 0.3066, + "step": 3228 + }, + { + "epoch": 2.0646038218597584, + "grad_norm": 2.6760365962982178, + "learning_rate": 1.560162427869203e-05, + "loss": 0.2827, + "step": 3229 + }, + { + "epoch": 2.0652434636603503, + "grad_norm": 2.3832077980041504, + "learning_rate": 1.559093823466553e-05, + "loss": 0.2777, + "step": 3230 + }, + { + "epoch": 2.0658831054609417, + "grad_norm": 3.096395492553711, + "learning_rate": 1.5580252190639028e-05, + "loss": 0.3242, + "step": 3231 + }, + { + "epoch": 2.0665227472615335, + "grad_norm": 3.1770401000976562, + "learning_rate": 1.5569566146612524e-05, + "loss": 0.2627, + "step": 3232 + }, + { + "epoch": 2.0671623890621253, + "grad_norm": 2.8355541229248047, + "learning_rate": 1.5558880102586023e-05, + "loss": 0.2977, + "step": 3233 + }, + { + "epoch": 2.0678020308627167, + "grad_norm": 2.670912504196167, + "learning_rate": 1.5548194058559523e-05, + "loss": 0.3027, + "step": 3234 + }, + { + "epoch": 2.0684416726633086, + "grad_norm": 2.2594873905181885, + "learning_rate": 1.553750801453302e-05, + "loss": 0.2624, + "step": 3235 + }, + { + "epoch": 2.0690813144639004, + "grad_norm": 3.151336431503296, + "learning_rate": 1.5526821970506518e-05, + "loss": 0.301, + "step": 3236 + }, + { + "epoch": 2.069720956264492, + "grad_norm": 3.330171585083008, + "learning_rate": 1.5516135926480017e-05, + "loss": 0.3277, + "step": 3237 + }, + { + "epoch": 2.0703605980650837, + "grad_norm": 2.6647558212280273, + "learning_rate": 1.5505449882453517e-05, + "loss": 0.2838, + "step": 3238 + }, + { + "epoch": 2.071000239865675, + "grad_norm": 2.5245156288146973, + "learning_rate": 1.5494763838427016e-05, + "loss": 0.2961, + "step": 3239 + }, + { + "epoch": 2.071639881666267, + "grad_norm": 2.5992631912231445, + "learning_rate": 1.5484077794400516e-05, + "loss": 0.2693, + "step": 3240 + }, + { + "epoch": 2.0722795234668587, + "grad_norm": 2.877854108810425, + "learning_rate": 1.547339175037401e-05, + "loss": 0.2962, + "step": 3241 + }, + { + "epoch": 2.07291916526745, + "grad_norm": 2.711557388305664, + "learning_rate": 1.546270570634751e-05, + "loss": 0.288, + "step": 3242 + }, + { + "epoch": 2.073558807068042, + "grad_norm": 2.771714687347412, + "learning_rate": 1.545201966232101e-05, + "loss": 0.2765, + "step": 3243 + }, + { + "epoch": 2.0741984488686334, + "grad_norm": 2.429236888885498, + "learning_rate": 1.5441333618294506e-05, + "loss": 0.2563, + "step": 3244 + }, + { + "epoch": 2.074838090669225, + "grad_norm": 2.883639097213745, + "learning_rate": 1.5430647574268006e-05, + "loss": 0.3183, + "step": 3245 + }, + { + "epoch": 2.075477732469817, + "grad_norm": 3.3720297813415527, + "learning_rate": 1.5419961530241505e-05, + "loss": 0.2925, + "step": 3246 + }, + { + "epoch": 2.0761173742704084, + "grad_norm": 2.7470321655273438, + "learning_rate": 1.5409275486215004e-05, + "loss": 0.2961, + "step": 3247 + }, + { + "epoch": 2.0767570160710003, + "grad_norm": 2.5316994190216064, + "learning_rate": 1.53985894421885e-05, + "loss": 0.2613, + "step": 3248 + }, + { + "epoch": 2.077396657871592, + "grad_norm": 2.7023704051971436, + "learning_rate": 1.5387903398162003e-05, + "loss": 0.2608, + "step": 3249 + }, + { + "epoch": 2.0780362996721835, + "grad_norm": 2.663525104522705, + "learning_rate": 1.5377217354135502e-05, + "loss": 0.2833, + "step": 3250 + }, + { + "epoch": 2.0786759414727753, + "grad_norm": 
2.6081383228302, + "learning_rate": 1.5366531310109e-05, + "loss": 0.2604, + "step": 3251 + }, + { + "epoch": 2.0793155832733667, + "grad_norm": 2.700937032699585, + "learning_rate": 1.5355845266082498e-05, + "loss": 0.2704, + "step": 3252 + }, + { + "epoch": 2.0799552250739586, + "grad_norm": 3.3110196590423584, + "learning_rate": 1.5345159222055997e-05, + "loss": 0.2994, + "step": 3253 + }, + { + "epoch": 2.0805948668745504, + "grad_norm": 2.718968152999878, + "learning_rate": 1.5334473178029493e-05, + "loss": 0.2641, + "step": 3254 + }, + { + "epoch": 2.081234508675142, + "grad_norm": 3.172600030899048, + "learning_rate": 1.5323787134002992e-05, + "loss": 0.3076, + "step": 3255 + }, + { + "epoch": 2.0818741504757337, + "grad_norm": 2.9849905967712402, + "learning_rate": 1.5313101089976492e-05, + "loss": 0.3096, + "step": 3256 + }, + { + "epoch": 2.082513792276325, + "grad_norm": 3.1546828746795654, + "learning_rate": 1.5302415045949988e-05, + "loss": 0.3006, + "step": 3257 + }, + { + "epoch": 2.083153434076917, + "grad_norm": 2.601928949356079, + "learning_rate": 1.5291729001923487e-05, + "loss": 0.2734, + "step": 3258 + }, + { + "epoch": 2.0837930758775087, + "grad_norm": 2.700784921646118, + "learning_rate": 1.5281042957896986e-05, + "loss": 0.2794, + "step": 3259 + }, + { + "epoch": 2.0844327176781, + "grad_norm": 3.1294023990631104, + "learning_rate": 1.5270356913870486e-05, + "loss": 0.3098, + "step": 3260 + }, + { + "epoch": 2.085072359478692, + "grad_norm": 2.5297465324401855, + "learning_rate": 1.5259670869843985e-05, + "loss": 0.279, + "step": 3261 + }, + { + "epoch": 2.085712001279284, + "grad_norm": 2.892437219619751, + "learning_rate": 1.5248984825817483e-05, + "loss": 0.2939, + "step": 3262 + }, + { + "epoch": 2.086351643079875, + "grad_norm": 2.8874454498291016, + "learning_rate": 1.5238298781790982e-05, + "loss": 0.296, + "step": 3263 + }, + { + "epoch": 2.086991284880467, + "grad_norm": 2.9296672344207764, + "learning_rate": 1.522761273776448e-05, + "loss": 0.3122, + "step": 3264 + }, + { + "epoch": 2.0876309266810584, + "grad_norm": 2.9001364707946777, + "learning_rate": 1.521692669373798e-05, + "loss": 0.2765, + "step": 3265 + }, + { + "epoch": 2.0882705684816503, + "grad_norm": 2.669949531555176, + "learning_rate": 1.5206240649711479e-05, + "loss": 0.3069, + "step": 3266 + }, + { + "epoch": 2.088910210282242, + "grad_norm": 3.0146543979644775, + "learning_rate": 1.5195554605684975e-05, + "loss": 0.2975, + "step": 3267 + }, + { + "epoch": 2.0895498520828335, + "grad_norm": 2.7401692867279053, + "learning_rate": 1.5184868561658474e-05, + "loss": 0.2788, + "step": 3268 + }, + { + "epoch": 2.0901894938834253, + "grad_norm": 2.876371383666992, + "learning_rate": 1.5174182517631973e-05, + "loss": 0.2945, + "step": 3269 + }, + { + "epoch": 2.0908291356840167, + "grad_norm": 2.3925490379333496, + "learning_rate": 1.5163496473605471e-05, + "loss": 0.2676, + "step": 3270 + }, + { + "epoch": 2.0914687774846086, + "grad_norm": 2.8799896240234375, + "learning_rate": 1.515281042957897e-05, + "loss": 0.2998, + "step": 3271 + }, + { + "epoch": 2.0921084192852004, + "grad_norm": 2.9084677696228027, + "learning_rate": 1.514212438555247e-05, + "loss": 0.3046, + "step": 3272 + }, + { + "epoch": 2.092748061085792, + "grad_norm": 2.9953341484069824, + "learning_rate": 1.5131438341525969e-05, + "loss": 0.2805, + "step": 3273 + }, + { + "epoch": 2.0933877028863837, + "grad_norm": 3.266188144683838, + "learning_rate": 1.5120752297499465e-05, + "loss": 0.2944, + "step": 3274 + }, + { + 
"epoch": 2.0940273446869755, + "grad_norm": 2.7826449871063232, + "learning_rate": 1.5110066253472966e-05, + "loss": 0.2753, + "step": 3275 + }, + { + "epoch": 2.094666986487567, + "grad_norm": 2.854180335998535, + "learning_rate": 1.5099380209446465e-05, + "loss": 0.3095, + "step": 3276 + }, + { + "epoch": 2.0953066282881587, + "grad_norm": 2.8137595653533936, + "learning_rate": 1.5088694165419961e-05, + "loss": 0.2961, + "step": 3277 + }, + { + "epoch": 2.09594627008875, + "grad_norm": 2.908651351928711, + "learning_rate": 1.507800812139346e-05, + "loss": 0.2537, + "step": 3278 + }, + { + "epoch": 2.096585911889342, + "grad_norm": 3.0123391151428223, + "learning_rate": 1.506732207736696e-05, + "loss": 0.3113, + "step": 3279 + }, + { + "epoch": 2.097225553689934, + "grad_norm": 2.7820565700531006, + "learning_rate": 1.5056636033340458e-05, + "loss": 0.2918, + "step": 3280 + }, + { + "epoch": 2.097865195490525, + "grad_norm": 2.8627681732177734, + "learning_rate": 1.5045949989313957e-05, + "loss": 0.3028, + "step": 3281 + }, + { + "epoch": 2.098504837291117, + "grad_norm": 2.9475295543670654, + "learning_rate": 1.5035263945287457e-05, + "loss": 0.2941, + "step": 3282 + }, + { + "epoch": 2.0991444790917084, + "grad_norm": 3.054466724395752, + "learning_rate": 1.5024577901260953e-05, + "loss": 0.308, + "step": 3283 + }, + { + "epoch": 2.0997841208923003, + "grad_norm": 2.772033452987671, + "learning_rate": 1.5013891857234452e-05, + "loss": 0.2655, + "step": 3284 + }, + { + "epoch": 2.100423762692892, + "grad_norm": 3.1366946697235107, + "learning_rate": 1.5003205813207951e-05, + "loss": 0.295, + "step": 3285 + }, + { + "epoch": 2.1010634044934835, + "grad_norm": 2.5524094104766846, + "learning_rate": 1.4992519769181449e-05, + "loss": 0.263, + "step": 3286 + }, + { + "epoch": 2.1017030462940753, + "grad_norm": 2.7674403190612793, + "learning_rate": 1.4981833725154948e-05, + "loss": 0.2847, + "step": 3287 + }, + { + "epoch": 2.102342688094667, + "grad_norm": 2.9924843311309814, + "learning_rate": 1.4971147681128448e-05, + "loss": 0.302, + "step": 3288 + }, + { + "epoch": 2.1029823298952586, + "grad_norm": 2.7296512126922607, + "learning_rate": 1.4960461637101947e-05, + "loss": 0.279, + "step": 3289 + }, + { + "epoch": 2.1036219716958504, + "grad_norm": 2.4712929725646973, + "learning_rate": 1.4949775593075443e-05, + "loss": 0.2431, + "step": 3290 + }, + { + "epoch": 2.104261613496442, + "grad_norm": 3.1025404930114746, + "learning_rate": 1.4939089549048942e-05, + "loss": 0.3214, + "step": 3291 + }, + { + "epoch": 2.1049012552970336, + "grad_norm": 3.0940849781036377, + "learning_rate": 1.4928403505022442e-05, + "loss": 0.3067, + "step": 3292 + }, + { + "epoch": 2.1055408970976255, + "grad_norm": 2.9295239448547363, + "learning_rate": 1.491771746099594e-05, + "loss": 0.2979, + "step": 3293 + }, + { + "epoch": 2.106180538898217, + "grad_norm": 2.9662773609161377, + "learning_rate": 1.4907031416969439e-05, + "loss": 0.3117, + "step": 3294 + }, + { + "epoch": 2.1068201806988087, + "grad_norm": 2.6901750564575195, + "learning_rate": 1.4896345372942938e-05, + "loss": 0.284, + "step": 3295 + }, + { + "epoch": 2.1074598224994006, + "grad_norm": 2.865873098373413, + "learning_rate": 1.4885659328916434e-05, + "loss": 0.3094, + "step": 3296 + }, + { + "epoch": 2.108099464299992, + "grad_norm": 3.3490869998931885, + "learning_rate": 1.4874973284889933e-05, + "loss": 0.3159, + "step": 3297 + }, + { + "epoch": 2.108739106100584, + "grad_norm": 2.8624188899993896, + "learning_rate": 1.4864287240863434e-05, + 
"loss": 0.3002, + "step": 3298 + }, + { + "epoch": 2.109378747901175, + "grad_norm": 2.627501964569092, + "learning_rate": 1.485360119683693e-05, + "loss": 0.2713, + "step": 3299 + }, + { + "epoch": 2.110018389701767, + "grad_norm": 2.5993735790252686, + "learning_rate": 1.484291515281043e-05, + "loss": 0.3047, + "step": 3300 + }, + { + "epoch": 2.110658031502359, + "grad_norm": 2.5539181232452393, + "learning_rate": 1.483222910878393e-05, + "loss": 0.2842, + "step": 3301 + }, + { + "epoch": 2.1112976733029503, + "grad_norm": 2.971036434173584, + "learning_rate": 1.4821543064757429e-05, + "loss": 0.3129, + "step": 3302 + }, + { + "epoch": 2.111937315103542, + "grad_norm": 2.9908058643341064, + "learning_rate": 1.4810857020730926e-05, + "loss": 0.2893, + "step": 3303 + }, + { + "epoch": 2.1125769569041335, + "grad_norm": 2.337106227874756, + "learning_rate": 1.4800170976704426e-05, + "loss": 0.2813, + "step": 3304 + }, + { + "epoch": 2.1132165987047253, + "grad_norm": 2.473165273666382, + "learning_rate": 1.4789484932677925e-05, + "loss": 0.2861, + "step": 3305 + }, + { + "epoch": 2.113856240505317, + "grad_norm": 2.905780792236328, + "learning_rate": 1.4778798888651421e-05, + "loss": 0.2922, + "step": 3306 + }, + { + "epoch": 2.1144958823059086, + "grad_norm": 2.6335463523864746, + "learning_rate": 1.476811284462492e-05, + "loss": 0.2775, + "step": 3307 + }, + { + "epoch": 2.1151355241065004, + "grad_norm": 2.7311782836914062, + "learning_rate": 1.475742680059842e-05, + "loss": 0.2832, + "step": 3308 + }, + { + "epoch": 2.115775165907092, + "grad_norm": 3.064622640609741, + "learning_rate": 1.4746740756571917e-05, + "loss": 0.3248, + "step": 3309 + }, + { + "epoch": 2.1164148077076836, + "grad_norm": 2.511990785598755, + "learning_rate": 1.4736054712545417e-05, + "loss": 0.2734, + "step": 3310 + }, + { + "epoch": 2.1170544495082755, + "grad_norm": 2.5764358043670654, + "learning_rate": 1.4725368668518916e-05, + "loss": 0.2979, + "step": 3311 + }, + { + "epoch": 2.117694091308867, + "grad_norm": 2.4979262351989746, + "learning_rate": 1.4714682624492412e-05, + "loss": 0.2788, + "step": 3312 + }, + { + "epoch": 2.1183337331094587, + "grad_norm": 2.798145294189453, + "learning_rate": 1.4703996580465911e-05, + "loss": 0.3077, + "step": 3313 + }, + { + "epoch": 2.1189733749100506, + "grad_norm": 2.6941370964050293, + "learning_rate": 1.469331053643941e-05, + "loss": 0.2939, + "step": 3314 + }, + { + "epoch": 2.119613016710642, + "grad_norm": 2.527987241744995, + "learning_rate": 1.468262449241291e-05, + "loss": 0.2551, + "step": 3315 + }, + { + "epoch": 2.120252658511234, + "grad_norm": 3.3765578269958496, + "learning_rate": 1.4671938448386408e-05, + "loss": 0.3339, + "step": 3316 + }, + { + "epoch": 2.120892300311825, + "grad_norm": 2.503570795059204, + "learning_rate": 1.4661252404359907e-05, + "loss": 0.2646, + "step": 3317 + }, + { + "epoch": 2.121531942112417, + "grad_norm": 2.9610562324523926, + "learning_rate": 1.4650566360333406e-05, + "loss": 0.3163, + "step": 3318 + }, + { + "epoch": 2.122171583913009, + "grad_norm": 2.8845345973968506, + "learning_rate": 1.4639880316306902e-05, + "loss": 0.3079, + "step": 3319 + }, + { + "epoch": 2.1228112257136003, + "grad_norm": 2.881183385848999, + "learning_rate": 1.4629194272280402e-05, + "loss": 0.2834, + "step": 3320 + }, + { + "epoch": 2.123450867514192, + "grad_norm": 2.780571222305298, + "learning_rate": 1.4618508228253903e-05, + "loss": 0.3087, + "step": 3321 + }, + { + "epoch": 2.124090509314784, + "grad_norm": 2.384935140609741, + 
"learning_rate": 1.4607822184227399e-05, + "loss": 0.2594, + "step": 3322 + }, + { + "epoch": 2.1247301511153753, + "grad_norm": 3.1249091625213623, + "learning_rate": 1.4597136140200898e-05, + "loss": 0.3069, + "step": 3323 + }, + { + "epoch": 2.125369792915967, + "grad_norm": 2.998181104660034, + "learning_rate": 1.4586450096174398e-05, + "loss": 0.3112, + "step": 3324 + }, + { + "epoch": 2.1260094347165586, + "grad_norm": 2.590634822845459, + "learning_rate": 1.4575764052147895e-05, + "loss": 0.2646, + "step": 3325 + }, + { + "epoch": 2.1266490765171504, + "grad_norm": 3.157170295715332, + "learning_rate": 1.4565078008121395e-05, + "loss": 0.3284, + "step": 3326 + }, + { + "epoch": 2.1272887183177422, + "grad_norm": 2.5375778675079346, + "learning_rate": 1.4554391964094894e-05, + "loss": 0.265, + "step": 3327 + }, + { + "epoch": 2.1279283601183336, + "grad_norm": 3.733571767807007, + "learning_rate": 1.454370592006839e-05, + "loss": 0.324, + "step": 3328 + }, + { + "epoch": 2.1285680019189255, + "grad_norm": 2.771312952041626, + "learning_rate": 1.453301987604189e-05, + "loss": 0.2974, + "step": 3329 + }, + { + "epoch": 2.129207643719517, + "grad_norm": 2.676919460296631, + "learning_rate": 1.4522333832015389e-05, + "loss": 0.278, + "step": 3330 + }, + { + "epoch": 2.1298472855201087, + "grad_norm": 2.596200942993164, + "learning_rate": 1.4511647787988888e-05, + "loss": 0.294, + "step": 3331 + }, + { + "epoch": 2.1304869273207006, + "grad_norm": 3.036965847015381, + "learning_rate": 1.4500961743962386e-05, + "loss": 0.2903, + "step": 3332 + }, + { + "epoch": 2.131126569121292, + "grad_norm": 3.010483503341675, + "learning_rate": 1.4490275699935885e-05, + "loss": 0.2975, + "step": 3333 + }, + { + "epoch": 2.131766210921884, + "grad_norm": 3.0769550800323486, + "learning_rate": 1.4479589655909384e-05, + "loss": 0.2878, + "step": 3334 + }, + { + "epoch": 2.132405852722475, + "grad_norm": 2.7077112197875977, + "learning_rate": 1.446890361188288e-05, + "loss": 0.2944, + "step": 3335 + }, + { + "epoch": 2.133045494523067, + "grad_norm": 3.1698193550109863, + "learning_rate": 1.445821756785638e-05, + "loss": 0.2907, + "step": 3336 + }, + { + "epoch": 2.133685136323659, + "grad_norm": 2.898766279220581, + "learning_rate": 1.4447531523829879e-05, + "loss": 0.2838, + "step": 3337 + }, + { + "epoch": 2.1343247781242503, + "grad_norm": 2.564615249633789, + "learning_rate": 1.4436845479803377e-05, + "loss": 0.2644, + "step": 3338 + }, + { + "epoch": 2.134964419924842, + "grad_norm": 2.5013158321380615, + "learning_rate": 1.4426159435776876e-05, + "loss": 0.2676, + "step": 3339 + }, + { + "epoch": 2.135604061725434, + "grad_norm": 3.6368777751922607, + "learning_rate": 1.4415473391750375e-05, + "loss": 0.3047, + "step": 3340 + }, + { + "epoch": 2.1362437035260253, + "grad_norm": 3.0526676177978516, + "learning_rate": 1.4404787347723871e-05, + "loss": 0.318, + "step": 3341 + }, + { + "epoch": 2.136883345326617, + "grad_norm": 3.0170764923095703, + "learning_rate": 1.439410130369737e-05, + "loss": 0.2822, + "step": 3342 + }, + { + "epoch": 2.1375229871272086, + "grad_norm": 2.3196122646331787, + "learning_rate": 1.438341525967087e-05, + "loss": 0.2389, + "step": 3343 + }, + { + "epoch": 2.1381626289278004, + "grad_norm": 2.5502870082855225, + "learning_rate": 1.4372729215644371e-05, + "loss": 0.2909, + "step": 3344 + }, + { + "epoch": 2.1388022707283922, + "grad_norm": 2.881182909011841, + "learning_rate": 1.4362043171617867e-05, + "loss": 0.314, + "step": 3345 + }, + { + "epoch": 2.1394419125289836, + 
"grad_norm": 2.882300615310669, + "learning_rate": 1.4351357127591367e-05, + "loss": 0.272, + "step": 3346 + }, + { + "epoch": 2.1400815543295755, + "grad_norm": 3.044785976409912, + "learning_rate": 1.4340671083564866e-05, + "loss": 0.2888, + "step": 3347 + }, + { + "epoch": 2.1407211961301673, + "grad_norm": 3.0978946685791016, + "learning_rate": 1.4329985039538364e-05, + "loss": 0.3126, + "step": 3348 + }, + { + "epoch": 2.1413608379307587, + "grad_norm": 3.6918516159057617, + "learning_rate": 1.4319298995511863e-05, + "loss": 0.3276, + "step": 3349 + }, + { + "epoch": 2.1420004797313505, + "grad_norm": 2.791208505630493, + "learning_rate": 1.4308612951485362e-05, + "loss": 0.2766, + "step": 3350 + }, + { + "epoch": 2.142640121531942, + "grad_norm": 2.750880002975464, + "learning_rate": 1.4297926907458858e-05, + "loss": 0.3169, + "step": 3351 + }, + { + "epoch": 2.143279763332534, + "grad_norm": 2.405256509780884, + "learning_rate": 1.4287240863432358e-05, + "loss": 0.2649, + "step": 3352 + }, + { + "epoch": 2.1439194051331256, + "grad_norm": 2.3942768573760986, + "learning_rate": 1.4276554819405857e-05, + "loss": 0.2422, + "step": 3353 + }, + { + "epoch": 2.144559046933717, + "grad_norm": 3.2327497005462646, + "learning_rate": 1.4265868775379355e-05, + "loss": 0.3109, + "step": 3354 + }, + { + "epoch": 2.145198688734309, + "grad_norm": 3.0916550159454346, + "learning_rate": 1.4255182731352854e-05, + "loss": 0.3093, + "step": 3355 + }, + { + "epoch": 2.1458383305349003, + "grad_norm": 2.913485050201416, + "learning_rate": 1.4244496687326353e-05, + "loss": 0.2844, + "step": 3356 + }, + { + "epoch": 2.146477972335492, + "grad_norm": 3.2898309230804443, + "learning_rate": 1.423381064329985e-05, + "loss": 0.3317, + "step": 3357 + }, + { + "epoch": 2.147117614136084, + "grad_norm": 2.7780044078826904, + "learning_rate": 1.4223124599273349e-05, + "loss": 0.3027, + "step": 3358 + }, + { + "epoch": 2.1477572559366753, + "grad_norm": 2.799602746963501, + "learning_rate": 1.4212438555246848e-05, + "loss": 0.277, + "step": 3359 + }, + { + "epoch": 2.148396897737267, + "grad_norm": 2.9556028842926025, + "learning_rate": 1.4201752511220347e-05, + "loss": 0.2848, + "step": 3360 + }, + { + "epoch": 2.149036539537859, + "grad_norm": 2.9669811725616455, + "learning_rate": 1.4191066467193845e-05, + "loss": 0.3034, + "step": 3361 + }, + { + "epoch": 2.1496761813384504, + "grad_norm": 2.8910956382751465, + "learning_rate": 1.4180380423167345e-05, + "loss": 0.272, + "step": 3362 + }, + { + "epoch": 2.1503158231390422, + "grad_norm": 3.1302740573883057, + "learning_rate": 1.4169694379140844e-05, + "loss": 0.3078, + "step": 3363 + }, + { + "epoch": 2.1509554649396336, + "grad_norm": 3.0801546573638916, + "learning_rate": 1.415900833511434e-05, + "loss": 0.3043, + "step": 3364 + }, + { + "epoch": 2.1515951067402255, + "grad_norm": 2.647568941116333, + "learning_rate": 1.414832229108784e-05, + "loss": 0.2723, + "step": 3365 + }, + { + "epoch": 2.1522347485408173, + "grad_norm": 2.8725883960723877, + "learning_rate": 1.4137636247061339e-05, + "loss": 0.2625, + "step": 3366 + }, + { + "epoch": 2.1528743903414087, + "grad_norm": 2.886364698410034, + "learning_rate": 1.4126950203034836e-05, + "loss": 0.2982, + "step": 3367 + }, + { + "epoch": 2.1535140321420005, + "grad_norm": 2.9265527725219727, + "learning_rate": 1.4116264159008336e-05, + "loss": 0.2883, + "step": 3368 + }, + { + "epoch": 2.1541536739425924, + "grad_norm": 2.4564144611358643, + "learning_rate": 1.4105578114981835e-05, + "loss": 0.2732, + "step": 
3369 + }, + { + "epoch": 2.154793315743184, + "grad_norm": 2.6180050373077393, + "learning_rate": 1.4094892070955331e-05, + "loss": 0.2887, + "step": 3370 + }, + { + "epoch": 2.1554329575437756, + "grad_norm": 2.618345260620117, + "learning_rate": 1.4084206026928832e-05, + "loss": 0.2792, + "step": 3371 + }, + { + "epoch": 2.156072599344367, + "grad_norm": 2.967028856277466, + "learning_rate": 1.4073519982902331e-05, + "loss": 0.2945, + "step": 3372 + }, + { + "epoch": 2.156712241144959, + "grad_norm": 3.5304341316223145, + "learning_rate": 1.406283393887583e-05, + "loss": 0.3093, + "step": 3373 + }, + { + "epoch": 2.1573518829455507, + "grad_norm": 2.704852819442749, + "learning_rate": 1.4052147894849327e-05, + "loss": 0.2748, + "step": 3374 + }, + { + "epoch": 2.157991524746142, + "grad_norm": 2.722247362136841, + "learning_rate": 1.4041461850822826e-05, + "loss": 0.2743, + "step": 3375 + }, + { + "epoch": 2.158631166546734, + "grad_norm": 3.7692978382110596, + "learning_rate": 1.4030775806796325e-05, + "loss": 0.3565, + "step": 3376 + }, + { + "epoch": 2.1592708083473253, + "grad_norm": 2.812537670135498, + "learning_rate": 1.4020089762769823e-05, + "loss": 0.2525, + "step": 3377 + }, + { + "epoch": 2.159910450147917, + "grad_norm": 2.6271371841430664, + "learning_rate": 1.4009403718743322e-05, + "loss": 0.2953, + "step": 3378 + }, + { + "epoch": 2.160550091948509, + "grad_norm": 3.192307710647583, + "learning_rate": 1.3998717674716822e-05, + "loss": 0.3093, + "step": 3379 + }, + { + "epoch": 2.1611897337491004, + "grad_norm": 3.243069648742676, + "learning_rate": 1.3988031630690318e-05, + "loss": 0.2989, + "step": 3380 + }, + { + "epoch": 2.1618293755496922, + "grad_norm": 3.1782214641571045, + "learning_rate": 1.3977345586663817e-05, + "loss": 0.3023, + "step": 3381 + }, + { + "epoch": 2.1624690173502836, + "grad_norm": 2.870701551437378, + "learning_rate": 1.3966659542637317e-05, + "loss": 0.2988, + "step": 3382 + }, + { + "epoch": 2.1631086591508755, + "grad_norm": 2.7732129096984863, + "learning_rate": 1.3955973498610814e-05, + "loss": 0.2828, + "step": 3383 + }, + { + "epoch": 2.1637483009514673, + "grad_norm": 2.599611282348633, + "learning_rate": 1.3945287454584314e-05, + "loss": 0.2854, + "step": 3384 + }, + { + "epoch": 2.1643879427520587, + "grad_norm": 3.072861909866333, + "learning_rate": 1.3934601410557813e-05, + "loss": 0.3059, + "step": 3385 + }, + { + "epoch": 2.1650275845526505, + "grad_norm": 2.7385823726654053, + "learning_rate": 1.3923915366531312e-05, + "loss": 0.3106, + "step": 3386 + }, + { + "epoch": 2.1656672263532424, + "grad_norm": 2.8344128131866455, + "learning_rate": 1.3913229322504808e-05, + "loss": 0.293, + "step": 3387 + }, + { + "epoch": 2.1663068681538338, + "grad_norm": 2.4170472621917725, + "learning_rate": 1.3902543278478308e-05, + "loss": 0.2688, + "step": 3388 + }, + { + "epoch": 2.1669465099544256, + "grad_norm": 2.664489984512329, + "learning_rate": 1.3891857234451807e-05, + "loss": 0.2807, + "step": 3389 + }, + { + "epoch": 2.167586151755017, + "grad_norm": 2.6602396965026855, + "learning_rate": 1.3881171190425305e-05, + "loss": 0.2936, + "step": 3390 + }, + { + "epoch": 2.168225793555609, + "grad_norm": 2.9426655769348145, + "learning_rate": 1.3870485146398804e-05, + "loss": 0.3038, + "step": 3391 + }, + { + "epoch": 2.1688654353562007, + "grad_norm": 2.8860256671905518, + "learning_rate": 1.3859799102372303e-05, + "loss": 0.3081, + "step": 3392 + }, + { + "epoch": 2.169505077156792, + "grad_norm": 2.702208995819092, + "learning_rate": 
1.38491130583458e-05, + "loss": 0.2803, + "step": 3393 + }, + { + "epoch": 2.170144718957384, + "grad_norm": 3.1027512550354004, + "learning_rate": 1.38384270143193e-05, + "loss": 0.3046, + "step": 3394 + }, + { + "epoch": 2.1707843607579758, + "grad_norm": 3.0601143836975098, + "learning_rate": 1.38277409702928e-05, + "loss": 0.2921, + "step": 3395 + }, + { + "epoch": 2.171424002558567, + "grad_norm": 3.29142165184021, + "learning_rate": 1.3817054926266296e-05, + "loss": 0.3275, + "step": 3396 + }, + { + "epoch": 2.172063644359159, + "grad_norm": 2.895599603652954, + "learning_rate": 1.3806368882239795e-05, + "loss": 0.3053, + "step": 3397 + }, + { + "epoch": 2.1727032861597504, + "grad_norm": 2.7596819400787354, + "learning_rate": 1.3795682838213294e-05, + "loss": 0.2662, + "step": 3398 + }, + { + "epoch": 2.1733429279603422, + "grad_norm": 3.2723777294158936, + "learning_rate": 1.3784996794186792e-05, + "loss": 0.3229, + "step": 3399 + }, + { + "epoch": 2.173982569760934, + "grad_norm": 3.1947109699249268, + "learning_rate": 1.3774310750160291e-05, + "loss": 0.312, + "step": 3400 + }, + { + "epoch": 2.1746222115615255, + "grad_norm": 3.437711238861084, + "learning_rate": 1.376362470613379e-05, + "loss": 0.3301, + "step": 3401 + }, + { + "epoch": 2.1752618533621173, + "grad_norm": 2.752490997314453, + "learning_rate": 1.375293866210729e-05, + "loss": 0.2879, + "step": 3402 + }, + { + "epoch": 2.1759014951627087, + "grad_norm": 2.922455310821533, + "learning_rate": 1.3742252618080786e-05, + "loss": 0.2952, + "step": 3403 + }, + { + "epoch": 2.1765411369633005, + "grad_norm": 2.505164384841919, + "learning_rate": 1.3731566574054286e-05, + "loss": 0.2655, + "step": 3404 + }, + { + "epoch": 2.1771807787638924, + "grad_norm": 3.0617635250091553, + "learning_rate": 1.3720880530027785e-05, + "loss": 0.3192, + "step": 3405 + }, + { + "epoch": 2.1778204205644838, + "grad_norm": 2.7670798301696777, + "learning_rate": 1.3710194486001283e-05, + "loss": 0.2838, + "step": 3406 + }, + { + "epoch": 2.1784600623650756, + "grad_norm": 2.8397114276885986, + "learning_rate": 1.3699508441974782e-05, + "loss": 0.2785, + "step": 3407 + }, + { + "epoch": 2.179099704165667, + "grad_norm": 3.077963352203369, + "learning_rate": 1.3688822397948281e-05, + "loss": 0.2929, + "step": 3408 + }, + { + "epoch": 2.179739345966259, + "grad_norm": 3.0644330978393555, + "learning_rate": 1.3678136353921777e-05, + "loss": 0.2865, + "step": 3409 + }, + { + "epoch": 2.1803789877668507, + "grad_norm": 2.621067523956299, + "learning_rate": 1.3667450309895277e-05, + "loss": 0.2673, + "step": 3410 + }, + { + "epoch": 2.181018629567442, + "grad_norm": 3.0704424381256104, + "learning_rate": 1.3656764265868776e-05, + "loss": 0.3176, + "step": 3411 + }, + { + "epoch": 2.181658271368034, + "grad_norm": 3.1157379150390625, + "learning_rate": 1.3646078221842274e-05, + "loss": 0.3075, + "step": 3412 + }, + { + "epoch": 2.1822979131686258, + "grad_norm": 3.213707208633423, + "learning_rate": 1.3635392177815773e-05, + "loss": 0.2793, + "step": 3413 + }, + { + "epoch": 2.182937554969217, + "grad_norm": 2.6101722717285156, + "learning_rate": 1.3624706133789272e-05, + "loss": 0.2636, + "step": 3414 + }, + { + "epoch": 2.183577196769809, + "grad_norm": 2.9204750061035156, + "learning_rate": 1.3614020089762772e-05, + "loss": 0.292, + "step": 3415 + }, + { + "epoch": 2.1842168385704004, + "grad_norm": 3.1458258628845215, + "learning_rate": 1.3603334045736268e-05, + "loss": 0.267, + "step": 3416 + }, + { + "epoch": 2.1848564803709922, + "grad_norm": 
2.9545929431915283, + "learning_rate": 1.3592648001709769e-05, + "loss": 0.3117, + "step": 3417 + }, + { + "epoch": 2.185496122171584, + "grad_norm": 2.680281639099121, + "learning_rate": 1.3581961957683268e-05, + "loss": 0.2697, + "step": 3418 + }, + { + "epoch": 2.1861357639721755, + "grad_norm": 2.8024208545684814, + "learning_rate": 1.3571275913656764e-05, + "loss": 0.2978, + "step": 3419 + }, + { + "epoch": 2.1867754057727673, + "grad_norm": 3.025855541229248, + "learning_rate": 1.3560589869630263e-05, + "loss": 0.2808, + "step": 3420 + }, + { + "epoch": 2.187415047573359, + "grad_norm": 3.2208971977233887, + "learning_rate": 1.3549903825603763e-05, + "loss": 0.3339, + "step": 3421 + }, + { + "epoch": 2.1880546893739505, + "grad_norm": 2.4828367233276367, + "learning_rate": 1.353921778157726e-05, + "loss": 0.2814, + "step": 3422 + }, + { + "epoch": 2.1886943311745424, + "grad_norm": 2.6162936687469482, + "learning_rate": 1.352853173755076e-05, + "loss": 0.2625, + "step": 3423 + }, + { + "epoch": 2.1893339729751338, + "grad_norm": 2.6846094131469727, + "learning_rate": 1.351784569352426e-05, + "loss": 0.29, + "step": 3424 + }, + { + "epoch": 2.1899736147757256, + "grad_norm": 2.86295485496521, + "learning_rate": 1.3507159649497755e-05, + "loss": 0.3253, + "step": 3425 + }, + { + "epoch": 2.1906132565763174, + "grad_norm": 2.699244976043701, + "learning_rate": 1.3496473605471255e-05, + "loss": 0.2953, + "step": 3426 + }, + { + "epoch": 2.191252898376909, + "grad_norm": 2.9537737369537354, + "learning_rate": 1.3485787561444754e-05, + "loss": 0.3111, + "step": 3427 + }, + { + "epoch": 2.1918925401775007, + "grad_norm": 2.543488025665283, + "learning_rate": 1.3475101517418253e-05, + "loss": 0.3009, + "step": 3428 + }, + { + "epoch": 2.192532181978092, + "grad_norm": 2.7410712242126465, + "learning_rate": 1.3464415473391751e-05, + "loss": 0.2775, + "step": 3429 + }, + { + "epoch": 2.193171823778684, + "grad_norm": 2.979616641998291, + "learning_rate": 1.345372942936525e-05, + "loss": 0.3075, + "step": 3430 + }, + { + "epoch": 2.1938114655792758, + "grad_norm": 2.9277937412261963, + "learning_rate": 1.344304338533875e-05, + "loss": 0.2885, + "step": 3431 + }, + { + "epoch": 2.194451107379867, + "grad_norm": 2.8581950664520264, + "learning_rate": 1.3432357341312246e-05, + "loss": 0.2782, + "step": 3432 + }, + { + "epoch": 2.195090749180459, + "grad_norm": 2.4586994647979736, + "learning_rate": 1.3421671297285745e-05, + "loss": 0.2519, + "step": 3433 + }, + { + "epoch": 2.1957303909810504, + "grad_norm": 3.1406280994415283, + "learning_rate": 1.3410985253259244e-05, + "loss": 0.3196, + "step": 3434 + }, + { + "epoch": 2.1963700327816422, + "grad_norm": 2.5794754028320312, + "learning_rate": 1.3400299209232742e-05, + "loss": 0.3065, + "step": 3435 + }, + { + "epoch": 2.197009674582234, + "grad_norm": 2.432429313659668, + "learning_rate": 1.3389613165206241e-05, + "loss": 0.2801, + "step": 3436 + }, + { + "epoch": 2.1976493163828255, + "grad_norm": 2.5403757095336914, + "learning_rate": 1.337892712117974e-05, + "loss": 0.28, + "step": 3437 + }, + { + "epoch": 2.1982889581834173, + "grad_norm": 2.7475318908691406, + "learning_rate": 1.3368241077153237e-05, + "loss": 0.2747, + "step": 3438 + }, + { + "epoch": 2.198928599984009, + "grad_norm": 2.8518126010894775, + "learning_rate": 1.3357555033126736e-05, + "loss": 0.3006, + "step": 3439 + }, + { + "epoch": 2.1995682417846005, + "grad_norm": 2.685335159301758, + "learning_rate": 1.3346868989100237e-05, + "loss": 0.2915, + "step": 3440 + }, + { + 
"epoch": 2.2002078835851924, + "grad_norm": 3.181511163711548, + "learning_rate": 1.3336182945073733e-05, + "loss": 0.3398, + "step": 3441 + }, + { + "epoch": 2.2008475253857838, + "grad_norm": 2.777080535888672, + "learning_rate": 1.3325496901047232e-05, + "loss": 0.3063, + "step": 3442 + }, + { + "epoch": 2.2014871671863756, + "grad_norm": 2.3552916049957275, + "learning_rate": 1.3314810857020732e-05, + "loss": 0.2554, + "step": 3443 + }, + { + "epoch": 2.2021268089869674, + "grad_norm": 3.068506956100464, + "learning_rate": 1.3304124812994231e-05, + "loss": 0.2936, + "step": 3444 + }, + { + "epoch": 2.202766450787559, + "grad_norm": 3.520688056945801, + "learning_rate": 1.3293438768967729e-05, + "loss": 0.3561, + "step": 3445 + }, + { + "epoch": 2.2034060925881507, + "grad_norm": 2.4779372215270996, + "learning_rate": 1.3282752724941228e-05, + "loss": 0.2822, + "step": 3446 + }, + { + "epoch": 2.2040457343887425, + "grad_norm": 2.91658616065979, + "learning_rate": 1.3272066680914728e-05, + "loss": 0.2783, + "step": 3447 + }, + { + "epoch": 2.204685376189334, + "grad_norm": 2.7341294288635254, + "learning_rate": 1.3261380636888224e-05, + "loss": 0.2648, + "step": 3448 + }, + { + "epoch": 2.2053250179899258, + "grad_norm": 3.193854331970215, + "learning_rate": 1.3250694592861723e-05, + "loss": 0.2955, + "step": 3449 + }, + { + "epoch": 2.205964659790517, + "grad_norm": 2.6825780868530273, + "learning_rate": 1.3240008548835222e-05, + "loss": 0.2744, + "step": 3450 + }, + { + "epoch": 2.206604301591109, + "grad_norm": 2.523312568664551, + "learning_rate": 1.322932250480872e-05, + "loss": 0.2778, + "step": 3451 + }, + { + "epoch": 2.207243943391701, + "grad_norm": 2.689265251159668, + "learning_rate": 1.321863646078222e-05, + "loss": 0.2669, + "step": 3452 + }, + { + "epoch": 2.207883585192292, + "grad_norm": 2.6073591709136963, + "learning_rate": 1.3207950416755719e-05, + "loss": 0.2803, + "step": 3453 + }, + { + "epoch": 2.208523226992884, + "grad_norm": 2.992279052734375, + "learning_rate": 1.3197264372729215e-05, + "loss": 0.2748, + "step": 3454 + }, + { + "epoch": 2.2091628687934755, + "grad_norm": 3.035036325454712, + "learning_rate": 1.3186578328702714e-05, + "loss": 0.3243, + "step": 3455 + }, + { + "epoch": 2.2098025105940673, + "grad_norm": 2.693161725997925, + "learning_rate": 1.3175892284676213e-05, + "loss": 0.2809, + "step": 3456 + }, + { + "epoch": 2.210442152394659, + "grad_norm": 3.079709053039551, + "learning_rate": 1.3165206240649713e-05, + "loss": 0.3027, + "step": 3457 + }, + { + "epoch": 2.2110817941952505, + "grad_norm": 3.111102819442749, + "learning_rate": 1.315452019662321e-05, + "loss": 0.3032, + "step": 3458 + }, + { + "epoch": 2.2117214359958424, + "grad_norm": 2.6593542098999023, + "learning_rate": 1.314383415259671e-05, + "loss": 0.2664, + "step": 3459 + }, + { + "epoch": 2.2123610777964338, + "grad_norm": 3.213458299636841, + "learning_rate": 1.3133148108570209e-05, + "loss": 0.3159, + "step": 3460 + }, + { + "epoch": 2.2130007195970256, + "grad_norm": 3.1140432357788086, + "learning_rate": 1.3122462064543705e-05, + "loss": 0.3099, + "step": 3461 + }, + { + "epoch": 2.2136403613976174, + "grad_norm": 2.8638763427734375, + "learning_rate": 1.3111776020517204e-05, + "loss": 0.2673, + "step": 3462 + }, + { + "epoch": 2.214280003198209, + "grad_norm": 2.9843013286590576, + "learning_rate": 1.3101089976490704e-05, + "loss": 0.2828, + "step": 3463 + }, + { + "epoch": 2.2149196449988007, + "grad_norm": 2.785283088684082, + "learning_rate": 1.3090403932464202e-05, + 
"loss": 0.2827, + "step": 3464 + }, + { + "epoch": 2.2155592867993925, + "grad_norm": 2.647834062576294, + "learning_rate": 1.3079717888437701e-05, + "loss": 0.2496, + "step": 3465 + }, + { + "epoch": 2.216198928599984, + "grad_norm": 2.7125985622406006, + "learning_rate": 1.30690318444112e-05, + "loss": 0.2965, + "step": 3466 + }, + { + "epoch": 2.2168385704005757, + "grad_norm": 3.4798755645751953, + "learning_rate": 1.3058345800384696e-05, + "loss": 0.292, + "step": 3467 + }, + { + "epoch": 2.217478212201167, + "grad_norm": 2.8353967666625977, + "learning_rate": 1.3047659756358197e-05, + "loss": 0.284, + "step": 3468 + }, + { + "epoch": 2.218117854001759, + "grad_norm": 2.951139450073242, + "learning_rate": 1.3036973712331697e-05, + "loss": 0.2733, + "step": 3469 + }, + { + "epoch": 2.218757495802351, + "grad_norm": 3.917747974395752, + "learning_rate": 1.3026287668305196e-05, + "loss": 0.3437, + "step": 3470 + }, + { + "epoch": 2.219397137602942, + "grad_norm": 3.203145742416382, + "learning_rate": 1.3015601624278692e-05, + "loss": 0.3045, + "step": 3471 + }, + { + "epoch": 2.220036779403534, + "grad_norm": 3.0355777740478516, + "learning_rate": 1.3004915580252191e-05, + "loss": 0.2826, + "step": 3472 + }, + { + "epoch": 2.220676421204126, + "grad_norm": 2.922559976577759, + "learning_rate": 1.299422953622569e-05, + "loss": 0.2841, + "step": 3473 + }, + { + "epoch": 2.2213160630047173, + "grad_norm": 3.310992956161499, + "learning_rate": 1.2983543492199188e-05, + "loss": 0.2893, + "step": 3474 + }, + { + "epoch": 2.221955704805309, + "grad_norm": 2.6760876178741455, + "learning_rate": 1.2972857448172688e-05, + "loss": 0.2732, + "step": 3475 + }, + { + "epoch": 2.2225953466059005, + "grad_norm": 2.867703676223755, + "learning_rate": 1.2962171404146187e-05, + "loss": 0.299, + "step": 3476 + }, + { + "epoch": 2.2232349884064924, + "grad_norm": 2.7198235988616943, + "learning_rate": 1.2951485360119683e-05, + "loss": 0.2922, + "step": 3477 + }, + { + "epoch": 2.223874630207084, + "grad_norm": 3.2691850662231445, + "learning_rate": 1.2940799316093182e-05, + "loss": 0.3039, + "step": 3478 + }, + { + "epoch": 2.2245142720076756, + "grad_norm": 2.9139389991760254, + "learning_rate": 1.2930113272066682e-05, + "loss": 0.3006, + "step": 3479 + }, + { + "epoch": 2.2251539138082674, + "grad_norm": 3.0620779991149902, + "learning_rate": 1.291942722804018e-05, + "loss": 0.3219, + "step": 3480 + }, + { + "epoch": 2.225793555608859, + "grad_norm": 3.0976994037628174, + "learning_rate": 1.2908741184013679e-05, + "loss": 0.3073, + "step": 3481 + }, + { + "epoch": 2.2264331974094507, + "grad_norm": 2.9936912059783936, + "learning_rate": 1.2898055139987178e-05, + "loss": 0.3193, + "step": 3482 + }, + { + "epoch": 2.2270728392100425, + "grad_norm": 2.6903512477874756, + "learning_rate": 1.2887369095960674e-05, + "loss": 0.2939, + "step": 3483 + }, + { + "epoch": 2.227712481010634, + "grad_norm": 3.3316521644592285, + "learning_rate": 1.2876683051934174e-05, + "loss": 0.2894, + "step": 3484 + }, + { + "epoch": 2.2283521228112257, + "grad_norm": 2.9921982288360596, + "learning_rate": 1.2865997007907673e-05, + "loss": 0.3064, + "step": 3485 + }, + { + "epoch": 2.2289917646118176, + "grad_norm": 2.711712598800659, + "learning_rate": 1.2855310963881172e-05, + "loss": 0.2814, + "step": 3486 + }, + { + "epoch": 2.229631406412409, + "grad_norm": 2.6264872550964355, + "learning_rate": 1.284462491985467e-05, + "loss": 0.2915, + "step": 3487 + }, + { + "epoch": 2.230271048213001, + "grad_norm": 2.4429709911346436, + 
"learning_rate": 1.283393887582817e-05, + "loss": 0.268, + "step": 3488 + }, + { + "epoch": 2.230910690013592, + "grad_norm": 2.457186698913574, + "learning_rate": 1.2823252831801669e-05, + "loss": 0.2916, + "step": 3489 + }, + { + "epoch": 2.231550331814184, + "grad_norm": 3.016545534133911, + "learning_rate": 1.2812566787775165e-05, + "loss": 0.2995, + "step": 3490 + }, + { + "epoch": 2.232189973614776, + "grad_norm": 2.7489140033721924, + "learning_rate": 1.2801880743748666e-05, + "loss": 0.2965, + "step": 3491 + }, + { + "epoch": 2.2328296154153673, + "grad_norm": 2.608473777770996, + "learning_rate": 1.2791194699722165e-05, + "loss": 0.2758, + "step": 3492 + }, + { + "epoch": 2.233469257215959, + "grad_norm": 3.392732858657837, + "learning_rate": 1.2780508655695661e-05, + "loss": 0.3289, + "step": 3493 + }, + { + "epoch": 2.234108899016551, + "grad_norm": 3.0134336948394775, + "learning_rate": 1.276982261166916e-05, + "loss": 0.3151, + "step": 3494 + }, + { + "epoch": 2.2347485408171424, + "grad_norm": 2.6988487243652344, + "learning_rate": 1.275913656764266e-05, + "loss": 0.2755, + "step": 3495 + }, + { + "epoch": 2.235388182617734, + "grad_norm": 3.099886655807495, + "learning_rate": 1.2748450523616157e-05, + "loss": 0.3069, + "step": 3496 + }, + { + "epoch": 2.2360278244183256, + "grad_norm": 2.533621072769165, + "learning_rate": 1.2737764479589657e-05, + "loss": 0.2794, + "step": 3497 + }, + { + "epoch": 2.2366674662189174, + "grad_norm": 2.615140676498413, + "learning_rate": 1.2727078435563156e-05, + "loss": 0.2849, + "step": 3498 + }, + { + "epoch": 2.2373071080195093, + "grad_norm": 2.7561891078948975, + "learning_rate": 1.2716392391536655e-05, + "loss": 0.2835, + "step": 3499 + }, + { + "epoch": 2.2379467498201007, + "grad_norm": 2.9579265117645264, + "learning_rate": 1.2705706347510151e-05, + "loss": 0.2871, + "step": 3500 + }, + { + "epoch": 2.2385863916206925, + "grad_norm": 2.5754003524780273, + "learning_rate": 1.269502030348365e-05, + "loss": 0.2673, + "step": 3501 + }, + { + "epoch": 2.239226033421284, + "grad_norm": 3.4438931941986084, + "learning_rate": 1.268433425945715e-05, + "loss": 0.3123, + "step": 3502 + }, + { + "epoch": 2.2398656752218757, + "grad_norm": 2.4750845432281494, + "learning_rate": 1.2673648215430648e-05, + "loss": 0.2662, + "step": 3503 + }, + { + "epoch": 2.2405053170224676, + "grad_norm": 2.9451913833618164, + "learning_rate": 1.2662962171404147e-05, + "loss": 0.2835, + "step": 3504 + }, + { + "epoch": 2.241144958823059, + "grad_norm": 2.945578098297119, + "learning_rate": 1.2652276127377647e-05, + "loss": 0.2939, + "step": 3505 + }, + { + "epoch": 2.241784600623651, + "grad_norm": 2.783414125442505, + "learning_rate": 1.2641590083351143e-05, + "loss": 0.2902, + "step": 3506 + }, + { + "epoch": 2.242424242424242, + "grad_norm": 2.7330236434936523, + "learning_rate": 1.2630904039324642e-05, + "loss": 0.284, + "step": 3507 + }, + { + "epoch": 2.243063884224834, + "grad_norm": 3.278259515762329, + "learning_rate": 1.2620217995298141e-05, + "loss": 0.3287, + "step": 3508 + }, + { + "epoch": 2.243703526025426, + "grad_norm": 2.8566737174987793, + "learning_rate": 1.2609531951271639e-05, + "loss": 0.2773, + "step": 3509 + }, + { + "epoch": 2.2443431678260173, + "grad_norm": 3.131185531616211, + "learning_rate": 1.2598845907245138e-05, + "loss": 0.2857, + "step": 3510 + }, + { + "epoch": 2.244982809626609, + "grad_norm": 2.4174184799194336, + "learning_rate": 1.2588159863218638e-05, + "loss": 0.269, + "step": 3511 + }, + { + "epoch": 2.245622451427201, 
+ "grad_norm": 2.4447731971740723, + "learning_rate": 1.2577473819192134e-05, + "loss": 0.2692, + "step": 3512 + }, + { + "epoch": 2.2462620932277924, + "grad_norm": 3.203326940536499, + "learning_rate": 1.2566787775165633e-05, + "loss": 0.3228, + "step": 3513 + }, + { + "epoch": 2.246901735028384, + "grad_norm": 2.9603841304779053, + "learning_rate": 1.2556101731139134e-05, + "loss": 0.3081, + "step": 3514 + }, + { + "epoch": 2.2475413768289756, + "grad_norm": 2.5194952487945557, + "learning_rate": 1.2545415687112633e-05, + "loss": 0.2684, + "step": 3515 + }, + { + "epoch": 2.2481810186295674, + "grad_norm": 2.6615219116210938, + "learning_rate": 1.253472964308613e-05, + "loss": 0.2706, + "step": 3516 + }, + { + "epoch": 2.2488206604301593, + "grad_norm": 2.532273054122925, + "learning_rate": 1.2524043599059629e-05, + "loss": 0.2542, + "step": 3517 + }, + { + "epoch": 2.2494603022307507, + "grad_norm": 2.634317636489868, + "learning_rate": 1.2513357555033128e-05, + "loss": 0.2808, + "step": 3518 + }, + { + "epoch": 2.2500999440313425, + "grad_norm": 2.510207414627075, + "learning_rate": 1.2502671511006626e-05, + "loss": 0.2723, + "step": 3519 + }, + { + "epoch": 2.2507395858319343, + "grad_norm": 2.733276605606079, + "learning_rate": 1.2491985466980125e-05, + "loss": 0.2714, + "step": 3520 + }, + { + "epoch": 2.2513792276325257, + "grad_norm": 2.5398309230804443, + "learning_rate": 1.2481299422953623e-05, + "loss": 0.2778, + "step": 3521 + }, + { + "epoch": 2.2520188694331176, + "grad_norm": 2.616506576538086, + "learning_rate": 1.2470613378927122e-05, + "loss": 0.2673, + "step": 3522 + }, + { + "epoch": 2.252658511233709, + "grad_norm": 2.689331293106079, + "learning_rate": 1.245992733490062e-05, + "loss": 0.2695, + "step": 3523 + }, + { + "epoch": 2.253298153034301, + "grad_norm": 3.1806883811950684, + "learning_rate": 1.244924129087412e-05, + "loss": 0.2884, + "step": 3524 + }, + { + "epoch": 2.2539377948348926, + "grad_norm": 3.4878897666931152, + "learning_rate": 1.2438555246847617e-05, + "loss": 0.3052, + "step": 3525 + }, + { + "epoch": 2.254577436635484, + "grad_norm": 2.564763069152832, + "learning_rate": 1.2427869202821116e-05, + "loss": 0.2591, + "step": 3526 + }, + { + "epoch": 2.255217078436076, + "grad_norm": 3.0779974460601807, + "learning_rate": 1.2417183158794616e-05, + "loss": 0.2996, + "step": 3527 + }, + { + "epoch": 2.2558567202366673, + "grad_norm": 3.2080490589141846, + "learning_rate": 1.2406497114768113e-05, + "loss": 0.3045, + "step": 3528 + }, + { + "epoch": 2.256496362037259, + "grad_norm": 2.921027421951294, + "learning_rate": 1.2395811070741611e-05, + "loss": 0.2745, + "step": 3529 + }, + { + "epoch": 2.257136003837851, + "grad_norm": 2.5587399005889893, + "learning_rate": 1.238512502671511e-05, + "loss": 0.2563, + "step": 3530 + }, + { + "epoch": 2.2577756456384424, + "grad_norm": 2.963634967803955, + "learning_rate": 1.237443898268861e-05, + "loss": 0.2807, + "step": 3531 + }, + { + "epoch": 2.258415287439034, + "grad_norm": 2.8558361530303955, + "learning_rate": 1.2363752938662109e-05, + "loss": 0.2667, + "step": 3532 + }, + { + "epoch": 2.2590549292396256, + "grad_norm": 3.426204204559326, + "learning_rate": 1.2353066894635607e-05, + "loss": 0.3137, + "step": 3533 + }, + { + "epoch": 2.2596945710402174, + "grad_norm": 3.069042444229126, + "learning_rate": 1.2342380850609104e-05, + "loss": 0.2926, + "step": 3534 + }, + { + "epoch": 2.2603342128408093, + "grad_norm": 3.452301263809204, + "learning_rate": 1.2331694806582604e-05, + "loss": 0.3159, + "step": 
3535 + }, + { + "epoch": 2.2609738546414007, + "grad_norm": 3.6270313262939453, + "learning_rate": 1.2321008762556101e-05, + "loss": 0.3129, + "step": 3536 + }, + { + "epoch": 2.2616134964419925, + "grad_norm": 3.6193034648895264, + "learning_rate": 1.2310322718529602e-05, + "loss": 0.3215, + "step": 3537 + }, + { + "epoch": 2.2622531382425843, + "grad_norm": 2.4998440742492676, + "learning_rate": 1.22996366745031e-05, + "loss": 0.2587, + "step": 3538 + }, + { + "epoch": 2.2628927800431757, + "grad_norm": 3.1039974689483643, + "learning_rate": 1.2288950630476598e-05, + "loss": 0.3018, + "step": 3539 + }, + { + "epoch": 2.2635324218437676, + "grad_norm": 2.9664714336395264, + "learning_rate": 1.2278264586450097e-05, + "loss": 0.306, + "step": 3540 + }, + { + "epoch": 2.264172063644359, + "grad_norm": 3.3815581798553467, + "learning_rate": 1.2267578542423595e-05, + "loss": 0.3236, + "step": 3541 + }, + { + "epoch": 2.264811705444951, + "grad_norm": 2.8701741695404053, + "learning_rate": 1.2256892498397094e-05, + "loss": 0.3001, + "step": 3542 + }, + { + "epoch": 2.2654513472455426, + "grad_norm": 2.553720712661743, + "learning_rate": 1.2246206454370594e-05, + "loss": 0.2727, + "step": 3543 + }, + { + "epoch": 2.266090989046134, + "grad_norm": 3.5508580207824707, + "learning_rate": 1.2235520410344091e-05, + "loss": 0.3707, + "step": 3544 + }, + { + "epoch": 2.266730630846726, + "grad_norm": 2.6788525581359863, + "learning_rate": 1.222483436631759e-05, + "loss": 0.2791, + "step": 3545 + }, + { + "epoch": 2.2673702726473177, + "grad_norm": 3.6063945293426514, + "learning_rate": 1.2214148322291088e-05, + "loss": 0.2972, + "step": 3546 + }, + { + "epoch": 2.268009914447909, + "grad_norm": 2.747265338897705, + "learning_rate": 1.2203462278264586e-05, + "loss": 0.2586, + "step": 3547 + }, + { + "epoch": 2.268649556248501, + "grad_norm": 2.4529383182525635, + "learning_rate": 1.2192776234238085e-05, + "loss": 0.2541, + "step": 3548 + }, + { + "epoch": 2.2692891980490923, + "grad_norm": 3.0293221473693848, + "learning_rate": 1.2182090190211585e-05, + "loss": 0.3092, + "step": 3549 + }, + { + "epoch": 2.269928839849684, + "grad_norm": 2.801374673843384, + "learning_rate": 1.2171404146185082e-05, + "loss": 0.2795, + "step": 3550 + }, + { + "epoch": 2.270568481650276, + "grad_norm": 3.0589401721954346, + "learning_rate": 1.2160718102158582e-05, + "loss": 0.3201, + "step": 3551 + }, + { + "epoch": 2.2712081234508674, + "grad_norm": 2.9472100734710693, + "learning_rate": 1.215003205813208e-05, + "loss": 0.3126, + "step": 3552 + }, + { + "epoch": 2.2718477652514593, + "grad_norm": 2.8621063232421875, + "learning_rate": 1.2139346014105579e-05, + "loss": 0.3024, + "step": 3553 + }, + { + "epoch": 2.2724874070520507, + "grad_norm": 2.146125316619873, + "learning_rate": 1.2128659970079078e-05, + "loss": 0.2638, + "step": 3554 + }, + { + "epoch": 2.2731270488526425, + "grad_norm": 3.0356833934783936, + "learning_rate": 1.2117973926052576e-05, + "loss": 0.294, + "step": 3555 + }, + { + "epoch": 2.2737666906532343, + "grad_norm": 2.876336097717285, + "learning_rate": 1.2107287882026075e-05, + "loss": 0.299, + "step": 3556 + }, + { + "epoch": 2.2744063324538257, + "grad_norm": 2.3675389289855957, + "learning_rate": 1.2096601837999573e-05, + "loss": 0.2381, + "step": 3557 + }, + { + "epoch": 2.2750459742544176, + "grad_norm": 2.785595417022705, + "learning_rate": 1.208591579397307e-05, + "loss": 0.2713, + "step": 3558 + }, + { + "epoch": 2.275685616055009, + "grad_norm": 3.246232509613037, + "learning_rate": 
1.207522974994657e-05, + "loss": 0.2857, + "step": 3559 + }, + { + "epoch": 2.276325257855601, + "grad_norm": 3.060328245162964, + "learning_rate": 1.2064543705920069e-05, + "loss": 0.3072, + "step": 3560 + }, + { + "epoch": 2.2769648996561926, + "grad_norm": 3.2622430324554443, + "learning_rate": 1.2053857661893568e-05, + "loss": 0.3001, + "step": 3561 + }, + { + "epoch": 2.277604541456784, + "grad_norm": 3.003283977508545, + "learning_rate": 1.2043171617867066e-05, + "loss": 0.2907, + "step": 3562 + }, + { + "epoch": 2.278244183257376, + "grad_norm": 2.7570948600769043, + "learning_rate": 1.2032485573840564e-05, + "loss": 0.2956, + "step": 3563 + }, + { + "epoch": 2.2788838250579677, + "grad_norm": 2.963020086288452, + "learning_rate": 1.2021799529814063e-05, + "loss": 0.2987, + "step": 3564 + }, + { + "epoch": 2.279523466858559, + "grad_norm": 3.440944194793701, + "learning_rate": 1.2011113485787563e-05, + "loss": 0.3164, + "step": 3565 + }, + { + "epoch": 2.280163108659151, + "grad_norm": 2.6674869060516357, + "learning_rate": 1.2000427441761062e-05, + "loss": 0.2763, + "step": 3566 + }, + { + "epoch": 2.280802750459743, + "grad_norm": 2.35956072807312, + "learning_rate": 1.198974139773456e-05, + "loss": 0.246, + "step": 3567 + }, + { + "epoch": 2.281442392260334, + "grad_norm": 2.8054215908050537, + "learning_rate": 1.1979055353708057e-05, + "loss": 0.2925, + "step": 3568 + }, + { + "epoch": 2.282082034060926, + "grad_norm": 3.342362403869629, + "learning_rate": 1.1968369309681557e-05, + "loss": 0.2962, + "step": 3569 + }, + { + "epoch": 2.2827216758615174, + "grad_norm": 2.6458470821380615, + "learning_rate": 1.1957683265655054e-05, + "loss": 0.2823, + "step": 3570 + }, + { + "epoch": 2.2833613176621093, + "grad_norm": 2.852384090423584, + "learning_rate": 1.1946997221628554e-05, + "loss": 0.2847, + "step": 3571 + }, + { + "epoch": 2.284000959462701, + "grad_norm": 3.3162930011749268, + "learning_rate": 1.1936311177602053e-05, + "loss": 0.3047, + "step": 3572 + }, + { + "epoch": 2.2846406012632925, + "grad_norm": 2.9299237728118896, + "learning_rate": 1.192562513357555e-05, + "loss": 0.3103, + "step": 3573 + }, + { + "epoch": 2.2852802430638843, + "grad_norm": 2.87814998626709, + "learning_rate": 1.191493908954905e-05, + "loss": 0.2912, + "step": 3574 + }, + { + "epoch": 2.2859198848644757, + "grad_norm": 3.104611873626709, + "learning_rate": 1.1904253045522548e-05, + "loss": 0.3076, + "step": 3575 + }, + { + "epoch": 2.2865595266650676, + "grad_norm": 2.6216070652008057, + "learning_rate": 1.1893567001496047e-05, + "loss": 0.2493, + "step": 3576 + }, + { + "epoch": 2.2871991684656594, + "grad_norm": 2.7422173023223877, + "learning_rate": 1.1882880957469546e-05, + "loss": 0.2779, + "step": 3577 + }, + { + "epoch": 2.287838810266251, + "grad_norm": 3.13505220413208, + "learning_rate": 1.1872194913443044e-05, + "loss": 0.2831, + "step": 3578 + }, + { + "epoch": 2.2884784520668426, + "grad_norm": 3.029228925704956, + "learning_rate": 1.1861508869416542e-05, + "loss": 0.2738, + "step": 3579 + }, + { + "epoch": 2.289118093867434, + "grad_norm": 3.0017192363739014, + "learning_rate": 1.1850822825390041e-05, + "loss": 0.299, + "step": 3580 + }, + { + "epoch": 2.289757735668026, + "grad_norm": 3.0335376262664795, + "learning_rate": 1.1840136781363539e-05, + "loss": 0.2707, + "step": 3581 + }, + { + "epoch": 2.2903973774686177, + "grad_norm": 3.517801284790039, + "learning_rate": 1.1829450737337038e-05, + "loss": 0.3206, + "step": 3582 + }, + { + "epoch": 2.291037019269209, + "grad_norm": 
3.0517585277557373, + "learning_rate": 1.1818764693310537e-05, + "loss": 0.2749, + "step": 3583 + }, + { + "epoch": 2.291676661069801, + "grad_norm": 2.737506866455078, + "learning_rate": 1.1808078649284035e-05, + "loss": 0.2896, + "step": 3584 + }, + { + "epoch": 2.2923163028703923, + "grad_norm": 3.2319812774658203, + "learning_rate": 1.1797392605257535e-05, + "loss": 0.2848, + "step": 3585 + }, + { + "epoch": 2.292955944670984, + "grad_norm": 2.5811519622802734, + "learning_rate": 1.1786706561231032e-05, + "loss": 0.2723, + "step": 3586 + }, + { + "epoch": 2.293595586471576, + "grad_norm": 3.0970749855041504, + "learning_rate": 1.1776020517204532e-05, + "loss": 0.2942, + "step": 3587 + }, + { + "epoch": 2.2942352282721674, + "grad_norm": 2.8350048065185547, + "learning_rate": 1.1765334473178031e-05, + "loss": 0.2902, + "step": 3588 + }, + { + "epoch": 2.2948748700727593, + "grad_norm": 3.1132586002349854, + "learning_rate": 1.1754648429151529e-05, + "loss": 0.3253, + "step": 3589 + }, + { + "epoch": 2.295514511873351, + "grad_norm": 2.8081939220428467, + "learning_rate": 1.1743962385125028e-05, + "loss": 0.3361, + "step": 3590 + }, + { + "epoch": 2.2961541536739425, + "grad_norm": 2.7446351051330566, + "learning_rate": 1.1733276341098526e-05, + "loss": 0.3125, + "step": 3591 + }, + { + "epoch": 2.2967937954745343, + "grad_norm": 2.469299554824829, + "learning_rate": 1.1722590297072023e-05, + "loss": 0.2807, + "step": 3592 + }, + { + "epoch": 2.297433437275126, + "grad_norm": 2.5957143306732178, + "learning_rate": 1.1711904253045523e-05, + "loss": 0.269, + "step": 3593 + }, + { + "epoch": 2.2980730790757176, + "grad_norm": 3.183300733566284, + "learning_rate": 1.1701218209019022e-05, + "loss": 0.3129, + "step": 3594 + }, + { + "epoch": 2.2987127208763094, + "grad_norm": 3.1013343334198, + "learning_rate": 1.1690532164992521e-05, + "loss": 0.3364, + "step": 3595 + }, + { + "epoch": 2.299352362676901, + "grad_norm": 2.8254292011260986, + "learning_rate": 1.1679846120966019e-05, + "loss": 0.2968, + "step": 3596 + }, + { + "epoch": 2.2999920044774926, + "grad_norm": 2.916802167892456, + "learning_rate": 1.1669160076939517e-05, + "loss": 0.3055, + "step": 3597 + }, + { + "epoch": 2.3006316462780845, + "grad_norm": 2.860269784927368, + "learning_rate": 1.1658474032913016e-05, + "loss": 0.2841, + "step": 3598 + }, + { + "epoch": 2.301271288078676, + "grad_norm": 3.069105625152588, + "learning_rate": 1.1647787988886515e-05, + "loss": 0.3359, + "step": 3599 + }, + { + "epoch": 2.3019109298792677, + "grad_norm": 2.949805498123169, + "learning_rate": 1.1637101944860013e-05, + "loss": 0.3142, + "step": 3600 + }, + { + "epoch": 2.302550571679859, + "grad_norm": 2.347810983657837, + "learning_rate": 1.1626415900833512e-05, + "loss": 0.2713, + "step": 3601 + }, + { + "epoch": 2.303190213480451, + "grad_norm": 2.819849967956543, + "learning_rate": 1.161572985680701e-05, + "loss": 0.2732, + "step": 3602 + }, + { + "epoch": 2.303829855281043, + "grad_norm": 3.033372640609741, + "learning_rate": 1.160504381278051e-05, + "loss": 0.3099, + "step": 3603 + }, + { + "epoch": 2.304469497081634, + "grad_norm": 3.0310096740722656, + "learning_rate": 1.1594357768754007e-05, + "loss": 0.3128, + "step": 3604 + }, + { + "epoch": 2.305109138882226, + "grad_norm": 2.5342955589294434, + "learning_rate": 1.1583671724727507e-05, + "loss": 0.2754, + "step": 3605 + }, + { + "epoch": 2.3057487806828174, + "grad_norm": 2.884960651397705, + "learning_rate": 1.1572985680701006e-05, + "loss": 0.287, + "step": 3606 + }, + { + 
"epoch": 2.3063884224834093, + "grad_norm": 2.89473032951355, + "learning_rate": 1.1562299636674504e-05, + "loss": 0.3224, + "step": 3607 + }, + { + "epoch": 2.307028064284001, + "grad_norm": 2.676666498184204, + "learning_rate": 1.1551613592648003e-05, + "loss": 0.2937, + "step": 3608 + }, + { + "epoch": 2.3076677060845925, + "grad_norm": 2.7825429439544678, + "learning_rate": 1.15409275486215e-05, + "loss": 0.2927, + "step": 3609 + }, + { + "epoch": 2.3083073478851843, + "grad_norm": 3.067427396774292, + "learning_rate": 1.1530241504594998e-05, + "loss": 0.3044, + "step": 3610 + }, + { + "epoch": 2.3089469896857757, + "grad_norm": 2.424522876739502, + "learning_rate": 1.15195554605685e-05, + "loss": 0.2595, + "step": 3611 + }, + { + "epoch": 2.3095866314863676, + "grad_norm": 2.830282688140869, + "learning_rate": 1.1508869416541997e-05, + "loss": 0.293, + "step": 3612 + }, + { + "epoch": 2.3102262732869594, + "grad_norm": 2.446810007095337, + "learning_rate": 1.1498183372515495e-05, + "loss": 0.2833, + "step": 3613 + }, + { + "epoch": 2.310865915087551, + "grad_norm": 2.5802533626556396, + "learning_rate": 1.1487497328488994e-05, + "loss": 0.2757, + "step": 3614 + }, + { + "epoch": 2.3115055568881426, + "grad_norm": 3.2124335765838623, + "learning_rate": 1.1476811284462492e-05, + "loss": 0.2916, + "step": 3615 + }, + { + "epoch": 2.3121451986887345, + "grad_norm": 3.295328140258789, + "learning_rate": 1.1466125240435991e-05, + "loss": 0.2986, + "step": 3616 + }, + { + "epoch": 2.312784840489326, + "grad_norm": 3.008997678756714, + "learning_rate": 1.145543919640949e-05, + "loss": 0.3017, + "step": 3617 + }, + { + "epoch": 2.3134244822899177, + "grad_norm": 2.7125957012176514, + "learning_rate": 1.1444753152382988e-05, + "loss": 0.2679, + "step": 3618 + }, + { + "epoch": 2.3140641240905095, + "grad_norm": 2.620945692062378, + "learning_rate": 1.1434067108356487e-05, + "loss": 0.2816, + "step": 3619 + }, + { + "epoch": 2.314703765891101, + "grad_norm": 2.4135873317718506, + "learning_rate": 1.1423381064329985e-05, + "loss": 0.2611, + "step": 3620 + }, + { + "epoch": 2.315343407691693, + "grad_norm": 2.9060771465301514, + "learning_rate": 1.1412695020303483e-05, + "loss": 0.3122, + "step": 3621 + }, + { + "epoch": 2.315983049492284, + "grad_norm": 2.7879769802093506, + "learning_rate": 1.1402008976276984e-05, + "loss": 0.2905, + "step": 3622 + }, + { + "epoch": 2.316622691292876, + "grad_norm": 3.1545228958129883, + "learning_rate": 1.1391322932250481e-05, + "loss": 0.2966, + "step": 3623 + }, + { + "epoch": 2.317262333093468, + "grad_norm": 3.8493826389312744, + "learning_rate": 1.138063688822398e-05, + "loss": 0.347, + "step": 3624 + }, + { + "epoch": 2.3179019748940592, + "grad_norm": 3.12675142288208, + "learning_rate": 1.1369950844197479e-05, + "loss": 0.3052, + "step": 3625 + }, + { + "epoch": 2.318541616694651, + "grad_norm": 3.5017809867858887, + "learning_rate": 1.1359264800170976e-05, + "loss": 0.3243, + "step": 3626 + }, + { + "epoch": 2.3191812584952425, + "grad_norm": 3.026428699493408, + "learning_rate": 1.1348578756144476e-05, + "loss": 0.2856, + "step": 3627 + }, + { + "epoch": 2.3198209002958343, + "grad_norm": 2.612839698791504, + "learning_rate": 1.1337892712117975e-05, + "loss": 0.2697, + "step": 3628 + }, + { + "epoch": 2.320460542096426, + "grad_norm": 3.0483486652374268, + "learning_rate": 1.1327206668091474e-05, + "loss": 0.3003, + "step": 3629 + }, + { + "epoch": 2.3211001838970176, + "grad_norm": 2.959425449371338, + "learning_rate": 1.1316520624064972e-05, + 
"loss": 0.2974, + "step": 3630 + }, + { + "epoch": 2.3217398256976094, + "grad_norm": 3.1794562339782715, + "learning_rate": 1.130583458003847e-05, + "loss": 0.3078, + "step": 3631 + }, + { + "epoch": 2.322379467498201, + "grad_norm": 2.986335277557373, + "learning_rate": 1.1295148536011969e-05, + "loss": 0.3065, + "step": 3632 + }, + { + "epoch": 2.3230191092987926, + "grad_norm": 2.8978092670440674, + "learning_rate": 1.1284462491985467e-05, + "loss": 0.3007, + "step": 3633 + }, + { + "epoch": 2.3236587510993845, + "grad_norm": 3.015186071395874, + "learning_rate": 1.1273776447958966e-05, + "loss": 0.2925, + "step": 3634 + }, + { + "epoch": 2.324298392899976, + "grad_norm": 2.970170259475708, + "learning_rate": 1.1263090403932465e-05, + "loss": 0.2929, + "step": 3635 + }, + { + "epoch": 2.3249380347005677, + "grad_norm": 2.594212055206299, + "learning_rate": 1.1252404359905963e-05, + "loss": 0.2692, + "step": 3636 + }, + { + "epoch": 2.3255776765011595, + "grad_norm": 2.7781248092651367, + "learning_rate": 1.1241718315879462e-05, + "loss": 0.2988, + "step": 3637 + }, + { + "epoch": 2.326217318301751, + "grad_norm": 3.0339715480804443, + "learning_rate": 1.123103227185296e-05, + "loss": 0.3237, + "step": 3638 + }, + { + "epoch": 2.3268569601023428, + "grad_norm": 3.0787346363067627, + "learning_rate": 1.122034622782646e-05, + "loss": 0.2851, + "step": 3639 + }, + { + "epoch": 2.327496601902934, + "grad_norm": 3.1304264068603516, + "learning_rate": 1.1209660183799959e-05, + "loss": 0.2996, + "step": 3640 + }, + { + "epoch": 2.328136243703526, + "grad_norm": 2.516671895980835, + "learning_rate": 1.1198974139773456e-05, + "loss": 0.2806, + "step": 3641 + }, + { + "epoch": 2.328775885504118, + "grad_norm": 2.9395248889923096, + "learning_rate": 1.1188288095746954e-05, + "loss": 0.2813, + "step": 3642 + }, + { + "epoch": 2.3294155273047092, + "grad_norm": 2.470202922821045, + "learning_rate": 1.1177602051720453e-05, + "loss": 0.2676, + "step": 3643 + }, + { + "epoch": 2.330055169105301, + "grad_norm": 2.4521126747131348, + "learning_rate": 1.1166916007693951e-05, + "loss": 0.2614, + "step": 3644 + }, + { + "epoch": 2.330694810905893, + "grad_norm": 2.8323287963867188, + "learning_rate": 1.1156229963667452e-05, + "loss": 0.2964, + "step": 3645 + }, + { + "epoch": 2.3313344527064843, + "grad_norm": 2.8041605949401855, + "learning_rate": 1.114554391964095e-05, + "loss": 0.3195, + "step": 3646 + }, + { + "epoch": 2.331974094507076, + "grad_norm": 2.9341700077056885, + "learning_rate": 1.1134857875614448e-05, + "loss": 0.2682, + "step": 3647 + }, + { + "epoch": 2.3326137363076676, + "grad_norm": 2.4643263816833496, + "learning_rate": 1.1124171831587947e-05, + "loss": 0.2755, + "step": 3648 + }, + { + "epoch": 2.3332533781082594, + "grad_norm": 2.8114049434661865, + "learning_rate": 1.1113485787561445e-05, + "loss": 0.3238, + "step": 3649 + }, + { + "epoch": 2.3338930199088512, + "grad_norm": 2.4782729148864746, + "learning_rate": 1.1102799743534944e-05, + "loss": 0.2692, + "step": 3650 + }, + { + "epoch": 2.3345326617094426, + "grad_norm": 2.503711700439453, + "learning_rate": 1.1092113699508443e-05, + "loss": 0.2578, + "step": 3651 + }, + { + "epoch": 2.3351723035100345, + "grad_norm": 2.252203941345215, + "learning_rate": 1.1081427655481941e-05, + "loss": 0.2558, + "step": 3652 + }, + { + "epoch": 2.335811945310626, + "grad_norm": 2.3472766876220703, + "learning_rate": 1.107074161145544e-05, + "loss": 0.2442, + "step": 3653 + }, + { + "epoch": 2.3364515871112177, + "grad_norm": 
2.6745517253875732, + "learning_rate": 1.1060055567428938e-05, + "loss": 0.2721, + "step": 3654 + }, + { + "epoch": 2.3370912289118095, + "grad_norm": 2.808493137359619, + "learning_rate": 1.1049369523402436e-05, + "loss": 0.2842, + "step": 3655 + }, + { + "epoch": 2.337730870712401, + "grad_norm": 3.1037187576293945, + "learning_rate": 1.1038683479375935e-05, + "loss": 0.2903, + "step": 3656 + }, + { + "epoch": 2.3383705125129928, + "grad_norm": 3.305421829223633, + "learning_rate": 1.1027997435349434e-05, + "loss": 0.3088, + "step": 3657 + }, + { + "epoch": 2.339010154313584, + "grad_norm": 2.80588960647583, + "learning_rate": 1.1017311391322934e-05, + "loss": 0.2875, + "step": 3658 + }, + { + "epoch": 2.339649796114176, + "grad_norm": 2.851426601409912, + "learning_rate": 1.1006625347296431e-05, + "loss": 0.2864, + "step": 3659 + }, + { + "epoch": 2.340289437914768, + "grad_norm": 3.5976970195770264, + "learning_rate": 1.0995939303269929e-05, + "loss": 0.2983, + "step": 3660 + }, + { + "epoch": 2.3409290797153592, + "grad_norm": 2.727888584136963, + "learning_rate": 1.0985253259243428e-05, + "loss": 0.2842, + "step": 3661 + }, + { + "epoch": 2.341568721515951, + "grad_norm": 3.325143575668335, + "learning_rate": 1.0974567215216928e-05, + "loss": 0.2972, + "step": 3662 + }, + { + "epoch": 2.342208363316543, + "grad_norm": 3.104851245880127, + "learning_rate": 1.0963881171190425e-05, + "loss": 0.3287, + "step": 3663 + }, + { + "epoch": 2.3428480051171343, + "grad_norm": 3.4042844772338867, + "learning_rate": 1.0953195127163925e-05, + "loss": 0.2965, + "step": 3664 + }, + { + "epoch": 2.343487646917726, + "grad_norm": 2.57411527633667, + "learning_rate": 1.0942509083137422e-05, + "loss": 0.259, + "step": 3665 + }, + { + "epoch": 2.344127288718318, + "grad_norm": 3.6349639892578125, + "learning_rate": 1.0931823039110922e-05, + "loss": 0.3281, + "step": 3666 + }, + { + "epoch": 2.3447669305189094, + "grad_norm": 2.897465705871582, + "learning_rate": 1.092113699508442e-05, + "loss": 0.2843, + "step": 3667 + }, + { + "epoch": 2.3454065723195012, + "grad_norm": 2.945551872253418, + "learning_rate": 1.0910450951057919e-05, + "loss": 0.2843, + "step": 3668 + }, + { + "epoch": 2.3460462141200926, + "grad_norm": 2.63677716255188, + "learning_rate": 1.0899764907031418e-05, + "loss": 0.2751, + "step": 3669 + }, + { + "epoch": 2.3466858559206845, + "grad_norm": 2.6026413440704346, + "learning_rate": 1.0889078863004916e-05, + "loss": 0.2596, + "step": 3670 + }, + { + "epoch": 2.3473254977212763, + "grad_norm": 2.672790288925171, + "learning_rate": 1.0878392818978415e-05, + "loss": 0.2818, + "step": 3671 + }, + { + "epoch": 2.3479651395218677, + "grad_norm": 3.1021134853363037, + "learning_rate": 1.0867706774951913e-05, + "loss": 0.3195, + "step": 3672 + }, + { + "epoch": 2.3486047813224595, + "grad_norm": 2.8973262310028076, + "learning_rate": 1.0857020730925412e-05, + "loss": 0.2964, + "step": 3673 + }, + { + "epoch": 2.349244423123051, + "grad_norm": 2.77655029296875, + "learning_rate": 1.0846334686898912e-05, + "loss": 0.3002, + "step": 3674 + }, + { + "epoch": 2.3498840649236428, + "grad_norm": 3.0009922981262207, + "learning_rate": 1.083564864287241e-05, + "loss": 0.2659, + "step": 3675 + }, + { + "epoch": 2.3505237067242346, + "grad_norm": 2.9666941165924072, + "learning_rate": 1.0824962598845907e-05, + "loss": 0.2883, + "step": 3676 + }, + { + "epoch": 2.351163348524826, + "grad_norm": 2.9443798065185547, + "learning_rate": 1.0814276554819406e-05, + "loss": 0.2621, + "step": 3677 + }, + { + 
"epoch": 2.351802990325418, + "grad_norm": 3.0699689388275146, + "learning_rate": 1.0803590510792904e-05, + "loss": 0.3216, + "step": 3678 + }, + { + "epoch": 2.3524426321260092, + "grad_norm": 2.7166452407836914, + "learning_rate": 1.0792904466766403e-05, + "loss": 0.3031, + "step": 3679 + }, + { + "epoch": 2.353082273926601, + "grad_norm": 2.7714357376098633, + "learning_rate": 1.0782218422739903e-05, + "loss": 0.2893, + "step": 3680 + }, + { + "epoch": 2.353721915727193, + "grad_norm": 2.809751272201538, + "learning_rate": 1.07715323787134e-05, + "loss": 0.2869, + "step": 3681 + }, + { + "epoch": 2.3543615575277843, + "grad_norm": 2.7445809841156006, + "learning_rate": 1.07608463346869e-05, + "loss": 0.2663, + "step": 3682 + }, + { + "epoch": 2.355001199328376, + "grad_norm": 2.7037882804870605, + "learning_rate": 1.0750160290660397e-05, + "loss": 0.2791, + "step": 3683 + }, + { + "epoch": 2.3556408411289675, + "grad_norm": 2.7959351539611816, + "learning_rate": 1.0739474246633897e-05, + "loss": 0.2668, + "step": 3684 + }, + { + "epoch": 2.3562804829295594, + "grad_norm": 2.936558723449707, + "learning_rate": 1.0728788202607396e-05, + "loss": 0.2747, + "step": 3685 + }, + { + "epoch": 2.3569201247301512, + "grad_norm": 2.8345210552215576, + "learning_rate": 1.0718102158580894e-05, + "loss": 0.2765, + "step": 3686 + }, + { + "epoch": 2.3575597665307426, + "grad_norm": 2.9661409854888916, + "learning_rate": 1.0707416114554393e-05, + "loss": 0.2932, + "step": 3687 + }, + { + "epoch": 2.3581994083313345, + "grad_norm": 2.79435133934021, + "learning_rate": 1.0696730070527891e-05, + "loss": 0.2833, + "step": 3688 + }, + { + "epoch": 2.3588390501319263, + "grad_norm": 3.4028706550598145, + "learning_rate": 1.0686044026501389e-05, + "loss": 0.3369, + "step": 3689 + }, + { + "epoch": 2.3594786919325177, + "grad_norm": 2.9375522136688232, + "learning_rate": 1.0675357982474888e-05, + "loss": 0.2764, + "step": 3690 + }, + { + "epoch": 2.3601183337331095, + "grad_norm": 2.9318432807922363, + "learning_rate": 1.0664671938448387e-05, + "loss": 0.2938, + "step": 3691 + }, + { + "epoch": 2.3607579755337014, + "grad_norm": 3.0722882747650146, + "learning_rate": 1.0653985894421887e-05, + "loss": 0.3198, + "step": 3692 + }, + { + "epoch": 2.3613976173342928, + "grad_norm": 3.190333366394043, + "learning_rate": 1.0643299850395384e-05, + "loss": 0.314, + "step": 3693 + }, + { + "epoch": 2.3620372591348846, + "grad_norm": 2.869821548461914, + "learning_rate": 1.0632613806368882e-05, + "loss": 0.297, + "step": 3694 + }, + { + "epoch": 2.362676900935476, + "grad_norm": 2.383671760559082, + "learning_rate": 1.0621927762342381e-05, + "loss": 0.2533, + "step": 3695 + }, + { + "epoch": 2.363316542736068, + "grad_norm": 2.9709298610687256, + "learning_rate": 1.061124171831588e-05, + "loss": 0.2927, + "step": 3696 + }, + { + "epoch": 2.3639561845366597, + "grad_norm": 2.5270161628723145, + "learning_rate": 1.0600555674289378e-05, + "loss": 0.2609, + "step": 3697 + }, + { + "epoch": 2.364595826337251, + "grad_norm": 2.8073136806488037, + "learning_rate": 1.0589869630262878e-05, + "loss": 0.3017, + "step": 3698 + }, + { + "epoch": 2.365235468137843, + "grad_norm": 2.8503897190093994, + "learning_rate": 1.0579183586236375e-05, + "loss": 0.2802, + "step": 3699 + }, + { + "epoch": 2.3658751099384343, + "grad_norm": 3.4927291870117188, + "learning_rate": 1.0568497542209875e-05, + "loss": 0.3041, + "step": 3700 + }, + { + "epoch": 2.366514751739026, + "grad_norm": 2.66800856590271, + "learning_rate": 1.0557811498183372e-05, 
+ "loss": 0.2682, + "step": 3701 + }, + { + "epoch": 2.367154393539618, + "grad_norm": 2.816920280456543, + "learning_rate": 1.0547125454156872e-05, + "loss": 0.2908, + "step": 3702 + }, + { + "epoch": 2.3677940353402094, + "grad_norm": 2.8394041061401367, + "learning_rate": 1.0536439410130371e-05, + "loss": 0.2957, + "step": 3703 + }, + { + "epoch": 2.368433677140801, + "grad_norm": 2.9479761123657227, + "learning_rate": 1.0525753366103869e-05, + "loss": 0.2872, + "step": 3704 + }, + { + "epoch": 2.3690733189413926, + "grad_norm": 2.8398666381835938, + "learning_rate": 1.0515067322077366e-05, + "loss": 0.2759, + "step": 3705 + }, + { + "epoch": 2.3697129607419845, + "grad_norm": 2.759230375289917, + "learning_rate": 1.0504381278050866e-05, + "loss": 0.2443, + "step": 3706 + }, + { + "epoch": 2.3703526025425763, + "grad_norm": 3.539905071258545, + "learning_rate": 1.0493695234024365e-05, + "loss": 0.3492, + "step": 3707 + }, + { + "epoch": 2.3709922443431677, + "grad_norm": 2.892258882522583, + "learning_rate": 1.0483009189997865e-05, + "loss": 0.2783, + "step": 3708 + }, + { + "epoch": 2.3716318861437595, + "grad_norm": 3.180210828781128, + "learning_rate": 1.0472323145971362e-05, + "loss": 0.3087, + "step": 3709 + }, + { + "epoch": 2.372271527944351, + "grad_norm": 2.7415599822998047, + "learning_rate": 1.046163710194486e-05, + "loss": 0.2646, + "step": 3710 + }, + { + "epoch": 2.3729111697449428, + "grad_norm": 2.877748489379883, + "learning_rate": 1.045095105791836e-05, + "loss": 0.2888, + "step": 3711 + }, + { + "epoch": 2.3735508115455346, + "grad_norm": 3.225186586380005, + "learning_rate": 1.0440265013891857e-05, + "loss": 0.2963, + "step": 3712 + }, + { + "epoch": 2.374190453346126, + "grad_norm": 3.4280874729156494, + "learning_rate": 1.0429578969865356e-05, + "loss": 0.3265, + "step": 3713 + }, + { + "epoch": 2.374830095146718, + "grad_norm": 3.0741846561431885, + "learning_rate": 1.0418892925838856e-05, + "loss": 0.3147, + "step": 3714 + }, + { + "epoch": 2.3754697369473097, + "grad_norm": 2.8286306858062744, + "learning_rate": 1.0408206881812353e-05, + "loss": 0.2968, + "step": 3715 + }, + { + "epoch": 2.376109378747901, + "grad_norm": 3.084357500076294, + "learning_rate": 1.0397520837785853e-05, + "loss": 0.3014, + "step": 3716 + }, + { + "epoch": 2.376749020548493, + "grad_norm": 2.366436719894409, + "learning_rate": 1.038683479375935e-05, + "loss": 0.2577, + "step": 3717 + }, + { + "epoch": 2.3773886623490847, + "grad_norm": 3.192760705947876, + "learning_rate": 1.0376148749732848e-05, + "loss": 0.3105, + "step": 3718 + }, + { + "epoch": 2.378028304149676, + "grad_norm": 2.9491522312164307, + "learning_rate": 1.0365462705706349e-05, + "loss": 0.2733, + "step": 3719 + }, + { + "epoch": 2.378667945950268, + "grad_norm": 2.8468129634857178, + "learning_rate": 1.0354776661679847e-05, + "loss": 0.2958, + "step": 3720 + }, + { + "epoch": 2.3793075877508594, + "grad_norm": 3.020120620727539, + "learning_rate": 1.0344090617653346e-05, + "loss": 0.287, + "step": 3721 + }, + { + "epoch": 2.379947229551451, + "grad_norm": 2.782242774963379, + "learning_rate": 1.0333404573626844e-05, + "loss": 0.2834, + "step": 3722 + }, + { + "epoch": 2.380586871352043, + "grad_norm": 2.4562535285949707, + "learning_rate": 1.0322718529600341e-05, + "loss": 0.265, + "step": 3723 + }, + { + "epoch": 2.3812265131526344, + "grad_norm": 3.276191473007202, + "learning_rate": 1.031203248557384e-05, + "loss": 0.2997, + "step": 3724 + }, + { + "epoch": 2.3818661549532263, + "grad_norm": 3.6544899940490723, + 
"learning_rate": 1.030134644154734e-05, + "loss": 0.2853, + "step": 3725 + }, + { + "epoch": 2.3825057967538177, + "grad_norm": 3.0346362590789795, + "learning_rate": 1.0290660397520838e-05, + "loss": 0.3135, + "step": 3726 + }, + { + "epoch": 2.3831454385544095, + "grad_norm": 2.6481335163116455, + "learning_rate": 1.0279974353494337e-05, + "loss": 0.2849, + "step": 3727 + }, + { + "epoch": 2.3837850803550014, + "grad_norm": 3.0483789443969727, + "learning_rate": 1.0269288309467835e-05, + "loss": 0.3003, + "step": 3728 + }, + { + "epoch": 2.3844247221555928, + "grad_norm": 2.8507449626922607, + "learning_rate": 1.0258602265441334e-05, + "loss": 0.2897, + "step": 3729 + }, + { + "epoch": 2.3850643639561846, + "grad_norm": 3.2301740646362305, + "learning_rate": 1.0247916221414834e-05, + "loss": 0.3326, + "step": 3730 + }, + { + "epoch": 2.385704005756776, + "grad_norm": 2.715714931488037, + "learning_rate": 1.0237230177388331e-05, + "loss": 0.2821, + "step": 3731 + }, + { + "epoch": 2.386343647557368, + "grad_norm": 3.216115951538086, + "learning_rate": 1.022654413336183e-05, + "loss": 0.2876, + "step": 3732 + }, + { + "epoch": 2.3869832893579597, + "grad_norm": 2.814248561859131, + "learning_rate": 1.0215858089335328e-05, + "loss": 0.2834, + "step": 3733 + }, + { + "epoch": 2.387622931158551, + "grad_norm": 3.2641689777374268, + "learning_rate": 1.0205172045308826e-05, + "loss": 0.3174, + "step": 3734 + }, + { + "epoch": 2.388262572959143, + "grad_norm": 3.176546573638916, + "learning_rate": 1.0194486001282325e-05, + "loss": 0.3059, + "step": 3735 + }, + { + "epoch": 2.3889022147597343, + "grad_norm": 4.12556791305542, + "learning_rate": 1.0183799957255825e-05, + "loss": 0.2998, + "step": 3736 + }, + { + "epoch": 2.389541856560326, + "grad_norm": 3.233046770095825, + "learning_rate": 1.0173113913229324e-05, + "loss": 0.3255, + "step": 3737 + }, + { + "epoch": 2.390181498360918, + "grad_norm": 3.198136329650879, + "learning_rate": 1.0162427869202822e-05, + "loss": 0.3273, + "step": 3738 + }, + { + "epoch": 2.3908211401615094, + "grad_norm": 3.109236478805542, + "learning_rate": 1.015174182517632e-05, + "loss": 0.3073, + "step": 3739 + }, + { + "epoch": 2.391460781962101, + "grad_norm": 3.0272021293640137, + "learning_rate": 1.0141055781149819e-05, + "loss": 0.2934, + "step": 3740 + }, + { + "epoch": 2.392100423762693, + "grad_norm": 2.720465660095215, + "learning_rate": 1.0130369737123316e-05, + "loss": 0.2869, + "step": 3741 + }, + { + "epoch": 2.3927400655632844, + "grad_norm": 2.529658794403076, + "learning_rate": 1.0119683693096817e-05, + "loss": 0.2758, + "step": 3742 + }, + { + "epoch": 2.3933797073638763, + "grad_norm": 3.0153563022613525, + "learning_rate": 1.0108997649070315e-05, + "loss": 0.282, + "step": 3743 + }, + { + "epoch": 2.394019349164468, + "grad_norm": 3.0945286750793457, + "learning_rate": 1.0098311605043813e-05, + "loss": 0.287, + "step": 3744 + }, + { + "epoch": 2.3946589909650595, + "grad_norm": 2.4137110710144043, + "learning_rate": 1.0087625561017312e-05, + "loss": 0.2544, + "step": 3745 + }, + { + "epoch": 2.3952986327656514, + "grad_norm": 2.4856338500976562, + "learning_rate": 1.007693951699081e-05, + "loss": 0.2748, + "step": 3746 + }, + { + "epoch": 2.3959382745662428, + "grad_norm": 3.0739831924438477, + "learning_rate": 1.006625347296431e-05, + "loss": 0.2976, + "step": 3747 + }, + { + "epoch": 2.3965779163668346, + "grad_norm": 3.1753807067871094, + "learning_rate": 1.0055567428937809e-05, + "loss": 0.3064, + "step": 3748 + }, + { + "epoch": 
2.3972175581674264, + "grad_norm": 3.212027072906494, + "learning_rate": 1.0044881384911306e-05, + "loss": 0.313, + "step": 3749 + }, + { + "epoch": 2.397857199968018, + "grad_norm": 2.9604339599609375, + "learning_rate": 1.0034195340884806e-05, + "loss": 0.282, + "step": 3750 + }, + { + "epoch": 2.3984968417686097, + "grad_norm": 3.2050273418426514, + "learning_rate": 1.0023509296858303e-05, + "loss": 0.2993, + "step": 3751 + }, + { + "epoch": 2.399136483569201, + "grad_norm": 3.095064878463745, + "learning_rate": 1.0012823252831801e-05, + "loss": 0.306, + "step": 3752 + }, + { + "epoch": 2.399776125369793, + "grad_norm": 2.8404853343963623, + "learning_rate": 1.0002137208805302e-05, + "loss": 0.2786, + "step": 3753 + }, + { + "epoch": 2.4004157671703847, + "grad_norm": 3.09138560295105, + "learning_rate": 9.9914511647788e-06, + "loss": 0.3047, + "step": 3754 + }, + { + "epoch": 2.401055408970976, + "grad_norm": 2.408557653427124, + "learning_rate": 9.980765120752297e-06, + "loss": 0.2469, + "step": 3755 + }, + { + "epoch": 2.401695050771568, + "grad_norm": 2.66947865486145, + "learning_rate": 9.970079076725797e-06, + "loss": 0.2922, + "step": 3756 + }, + { + "epoch": 2.4023346925721594, + "grad_norm": 3.7358579635620117, + "learning_rate": 9.959393032699294e-06, + "loss": 0.327, + "step": 3757 + }, + { + "epoch": 2.402974334372751, + "grad_norm": 3.0420963764190674, + "learning_rate": 9.948706988672794e-06, + "loss": 0.3022, + "step": 3758 + }, + { + "epoch": 2.403613976173343, + "grad_norm": 2.7667346000671387, + "learning_rate": 9.938020944646293e-06, + "loss": 0.2858, + "step": 3759 + }, + { + "epoch": 2.4042536179739344, + "grad_norm": 2.8193728923797607, + "learning_rate": 9.92733490061979e-06, + "loss": 0.2927, + "step": 3760 + }, + { + "epoch": 2.4048932597745263, + "grad_norm": 3.4549965858459473, + "learning_rate": 9.91664885659329e-06, + "loss": 0.33, + "step": 3761 + }, + { + "epoch": 2.405532901575118, + "grad_norm": 3.107081174850464, + "learning_rate": 9.905962812566788e-06, + "loss": 0.3167, + "step": 3762 + }, + { + "epoch": 2.4061725433757095, + "grad_norm": 3.1439502239227295, + "learning_rate": 9.895276768540287e-06, + "loss": 0.3096, + "step": 3763 + }, + { + "epoch": 2.4068121851763014, + "grad_norm": 2.7166993618011475, + "learning_rate": 9.884590724513785e-06, + "loss": 0.2972, + "step": 3764 + }, + { + "epoch": 2.4074518269768928, + "grad_norm": 2.7572853565216064, + "learning_rate": 9.873904680487284e-06, + "loss": 0.2771, + "step": 3765 + }, + { + "epoch": 2.4080914687774846, + "grad_norm": 2.433554172515869, + "learning_rate": 9.863218636460784e-06, + "loss": 0.2796, + "step": 3766 + }, + { + "epoch": 2.4087311105780764, + "grad_norm": 3.07726788520813, + "learning_rate": 9.852532592434281e-06, + "loss": 0.3298, + "step": 3767 + }, + { + "epoch": 2.409370752378668, + "grad_norm": 2.5659122467041016, + "learning_rate": 9.841846548407779e-06, + "loss": 0.2553, + "step": 3768 + }, + { + "epoch": 2.4100103941792597, + "grad_norm": 2.7963790893554688, + "learning_rate": 9.831160504381278e-06, + "loss": 0.271, + "step": 3769 + }, + { + "epoch": 2.4106500359798515, + "grad_norm": 3.315323829650879, + "learning_rate": 9.820474460354778e-06, + "loss": 0.3064, + "step": 3770 + }, + { + "epoch": 2.411289677780443, + "grad_norm": 3.1749017238616943, + "learning_rate": 9.809788416328277e-06, + "loss": 0.2982, + "step": 3771 + }, + { + "epoch": 2.4119293195810347, + "grad_norm": 3.3845622539520264, + "learning_rate": 9.799102372301775e-06, + "loss": 0.3226, + "step": 3772 + 
}, + { + "epoch": 2.412568961381626, + "grad_norm": 4.004214286804199, + "learning_rate": 9.788416328275272e-06, + "loss": 0.4031, + "step": 3773 + }, + { + "epoch": 2.413208603182218, + "grad_norm": 3.194394111633301, + "learning_rate": 9.777730284248772e-06, + "loss": 0.2979, + "step": 3774 + }, + { + "epoch": 2.41384824498281, + "grad_norm": 2.644986629486084, + "learning_rate": 9.76704424022227e-06, + "loss": 0.2844, + "step": 3775 + }, + { + "epoch": 2.414487886783401, + "grad_norm": 2.4975733757019043, + "learning_rate": 9.756358196195769e-06, + "loss": 0.2555, + "step": 3776 + }, + { + "epoch": 2.415127528583993, + "grad_norm": 2.4662675857543945, + "learning_rate": 9.745672152169268e-06, + "loss": 0.2635, + "step": 3777 + }, + { + "epoch": 2.4157671703845844, + "grad_norm": 2.750217914581299, + "learning_rate": 9.734986108142766e-06, + "loss": 0.2826, + "step": 3778 + }, + { + "epoch": 2.4164068121851763, + "grad_norm": 2.7455925941467285, + "learning_rate": 9.724300064116265e-06, + "loss": 0.3025, + "step": 3779 + }, + { + "epoch": 2.417046453985768, + "grad_norm": 3.1072282791137695, + "learning_rate": 9.713614020089763e-06, + "loss": 0.3015, + "step": 3780 + }, + { + "epoch": 2.4176860957863595, + "grad_norm": 3.4795072078704834, + "learning_rate": 9.702927976063262e-06, + "loss": 0.3053, + "step": 3781 + }, + { + "epoch": 2.4183257375869514, + "grad_norm": 3.366302490234375, + "learning_rate": 9.692241932036761e-06, + "loss": 0.3326, + "step": 3782 + }, + { + "epoch": 2.4189653793875427, + "grad_norm": 3.1606807708740234, + "learning_rate": 9.681555888010259e-06, + "loss": 0.305, + "step": 3783 + }, + { + "epoch": 2.4196050211881346, + "grad_norm": 3.016425371170044, + "learning_rate": 9.670869843983758e-06, + "loss": 0.312, + "step": 3784 + }, + { + "epoch": 2.4202446629887264, + "grad_norm": 2.9911797046661377, + "learning_rate": 9.660183799957256e-06, + "loss": 0.2563, + "step": 3785 + }, + { + "epoch": 2.420884304789318, + "grad_norm": 3.333845615386963, + "learning_rate": 9.649497755930754e-06, + "loss": 0.2976, + "step": 3786 + }, + { + "epoch": 2.4215239465899097, + "grad_norm": 2.6827268600463867, + "learning_rate": 9.638811711904253e-06, + "loss": 0.2878, + "step": 3787 + }, + { + "epoch": 2.4221635883905015, + "grad_norm": 2.8698997497558594, + "learning_rate": 9.628125667877753e-06, + "loss": 0.2922, + "step": 3788 + }, + { + "epoch": 2.422803230191093, + "grad_norm": 3.5182323455810547, + "learning_rate": 9.61743962385125e-06, + "loss": 0.3209, + "step": 3789 + }, + { + "epoch": 2.4234428719916847, + "grad_norm": 3.059401273727417, + "learning_rate": 9.60675357982475e-06, + "loss": 0.3213, + "step": 3790 + }, + { + "epoch": 2.4240825137922766, + "grad_norm": 2.9321200847625732, + "learning_rate": 9.596067535798247e-06, + "loss": 0.2907, + "step": 3791 + }, + { + "epoch": 2.424722155592868, + "grad_norm": 3.1854536533355713, + "learning_rate": 9.585381491771747e-06, + "loss": 0.3111, + "step": 3792 + }, + { + "epoch": 2.42536179739346, + "grad_norm": 3.072195529937744, + "learning_rate": 9.574695447745246e-06, + "loss": 0.3044, + "step": 3793 + }, + { + "epoch": 2.426001439194051, + "grad_norm": 2.6200971603393555, + "learning_rate": 9.564009403718744e-06, + "loss": 0.2712, + "step": 3794 + }, + { + "epoch": 2.426641080994643, + "grad_norm": 2.600599527359009, + "learning_rate": 9.553323359692243e-06, + "loss": 0.2755, + "step": 3795 + }, + { + "epoch": 2.427280722795235, + "grad_norm": 2.7904534339904785, + "learning_rate": 9.54263731566574e-06, + "loss": 0.294, + 
"step": 3796 + }, + { + "epoch": 2.4279203645958263, + "grad_norm": 3.008108377456665, + "learning_rate": 9.531951271639238e-06, + "loss": 0.2795, + "step": 3797 + }, + { + "epoch": 2.428560006396418, + "grad_norm": 3.046015501022339, + "learning_rate": 9.521265227612738e-06, + "loss": 0.2723, + "step": 3798 + }, + { + "epoch": 2.4291996481970095, + "grad_norm": 2.8530921936035156, + "learning_rate": 9.510579183586237e-06, + "loss": 0.2778, + "step": 3799 + }, + { + "epoch": 2.4298392899976013, + "grad_norm": 2.9153363704681396, + "learning_rate": 9.499893139559736e-06, + "loss": 0.2916, + "step": 3800 + }, + { + "epoch": 2.430478931798193, + "grad_norm": 2.841264486312866, + "learning_rate": 9.489207095533234e-06, + "loss": 0.291, + "step": 3801 + }, + { + "epoch": 2.4311185735987846, + "grad_norm": 2.686324119567871, + "learning_rate": 9.478521051506732e-06, + "loss": 0.2732, + "step": 3802 + }, + { + "epoch": 2.4317582153993764, + "grad_norm": 2.989762544631958, + "learning_rate": 9.467835007480231e-06, + "loss": 0.2931, + "step": 3803 + }, + { + "epoch": 2.432397857199968, + "grad_norm": 4.386168003082275, + "learning_rate": 9.45714896345373e-06, + "loss": 0.3725, + "step": 3804 + }, + { + "epoch": 2.4330374990005597, + "grad_norm": 3.045278549194336, + "learning_rate": 9.44646291942723e-06, + "loss": 0.2673, + "step": 3805 + }, + { + "epoch": 2.4336771408011515, + "grad_norm": 2.8226819038391113, + "learning_rate": 9.435776875400727e-06, + "loss": 0.2795, + "step": 3806 + }, + { + "epoch": 2.434316782601743, + "grad_norm": 2.549082040786743, + "learning_rate": 9.425090831374225e-06, + "loss": 0.2406, + "step": 3807 + }, + { + "epoch": 2.4349564244023347, + "grad_norm": 2.201199769973755, + "learning_rate": 9.414404787347725e-06, + "loss": 0.2643, + "step": 3808 + }, + { + "epoch": 2.435596066202926, + "grad_norm": 2.9440882205963135, + "learning_rate": 9.403718743321222e-06, + "loss": 0.288, + "step": 3809 + }, + { + "epoch": 2.436235708003518, + "grad_norm": 2.640242338180542, + "learning_rate": 9.393032699294722e-06, + "loss": 0.2742, + "step": 3810 + }, + { + "epoch": 2.43687534980411, + "grad_norm": 2.9424214363098145, + "learning_rate": 9.382346655268221e-06, + "loss": 0.2899, + "step": 3811 + }, + { + "epoch": 2.437514991604701, + "grad_norm": 2.7310006618499756, + "learning_rate": 9.371660611241719e-06, + "loss": 0.2965, + "step": 3812 + }, + { + "epoch": 2.438154633405293, + "grad_norm": 2.5351955890655518, + "learning_rate": 9.360974567215218e-06, + "loss": 0.2855, + "step": 3813 + }, + { + "epoch": 2.438794275205885, + "grad_norm": 2.6945552825927734, + "learning_rate": 9.350288523188716e-06, + "loss": 0.2923, + "step": 3814 + }, + { + "epoch": 2.4394339170064763, + "grad_norm": 2.9604012966156006, + "learning_rate": 9.339602479162215e-06, + "loss": 0.2966, + "step": 3815 + }, + { + "epoch": 2.440073558807068, + "grad_norm": 2.9590959548950195, + "learning_rate": 9.328916435135714e-06, + "loss": 0.2887, + "step": 3816 + }, + { + "epoch": 2.44071320060766, + "grad_norm": 2.7463204860687256, + "learning_rate": 9.318230391109212e-06, + "loss": 0.2952, + "step": 3817 + }, + { + "epoch": 2.4413528424082513, + "grad_norm": 3.2178103923797607, + "learning_rate": 9.30754434708271e-06, + "loss": 0.2919, + "step": 3818 + }, + { + "epoch": 2.441992484208843, + "grad_norm": 2.7675511837005615, + "learning_rate": 9.296858303056209e-06, + "loss": 0.3024, + "step": 3819 + }, + { + "epoch": 2.4426321260094346, + "grad_norm": 3.3939640522003174, + "learning_rate": 9.286172259029707e-06, + 
"loss": 0.3119, + "step": 3820 + }, + { + "epoch": 2.4432717678100264, + "grad_norm": 3.474416971206665, + "learning_rate": 9.275486215003206e-06, + "loss": 0.3304, + "step": 3821 + }, + { + "epoch": 2.4439114096106183, + "grad_norm": 3.2494993209838867, + "learning_rate": 9.264800170976705e-06, + "loss": 0.3305, + "step": 3822 + }, + { + "epoch": 2.4445510514112097, + "grad_norm": 3.611361503601074, + "learning_rate": 9.254114126950203e-06, + "loss": 0.3414, + "step": 3823 + }, + { + "epoch": 2.4451906932118015, + "grad_norm": 2.5496439933776855, + "learning_rate": 9.243428082923702e-06, + "loss": 0.2901, + "step": 3824 + }, + { + "epoch": 2.445830335012393, + "grad_norm": 2.8153884410858154, + "learning_rate": 9.2327420388972e-06, + "loss": 0.2907, + "step": 3825 + }, + { + "epoch": 2.4464699768129847, + "grad_norm": 2.9571709632873535, + "learning_rate": 9.2220559948707e-06, + "loss": 0.2957, + "step": 3826 + }, + { + "epoch": 2.4471096186135766, + "grad_norm": 2.7724413871765137, + "learning_rate": 9.211369950844199e-06, + "loss": 0.277, + "step": 3827 + }, + { + "epoch": 2.447749260414168, + "grad_norm": 3.408076524734497, + "learning_rate": 9.200683906817697e-06, + "loss": 0.3127, + "step": 3828 + }, + { + "epoch": 2.44838890221476, + "grad_norm": 2.7393248081207275, + "learning_rate": 9.189997862791196e-06, + "loss": 0.2981, + "step": 3829 + }, + { + "epoch": 2.449028544015351, + "grad_norm": 3.1942086219787598, + "learning_rate": 9.179311818764694e-06, + "loss": 0.3203, + "step": 3830 + }, + { + "epoch": 2.449668185815943, + "grad_norm": 3.0015649795532227, + "learning_rate": 9.168625774738191e-06, + "loss": 0.3244, + "step": 3831 + }, + { + "epoch": 2.450307827616535, + "grad_norm": 3.103132486343384, + "learning_rate": 9.15793973071169e-06, + "loss": 0.3077, + "step": 3832 + }, + { + "epoch": 2.4509474694171263, + "grad_norm": 2.9493842124938965, + "learning_rate": 9.14725368668519e-06, + "loss": 0.3186, + "step": 3833 + }, + { + "epoch": 2.451587111217718, + "grad_norm": 2.5046045780181885, + "learning_rate": 9.13656764265869e-06, + "loss": 0.2629, + "step": 3834 + }, + { + "epoch": 2.4522267530183095, + "grad_norm": 2.747251510620117, + "learning_rate": 9.125881598632187e-06, + "loss": 0.2943, + "step": 3835 + }, + { + "epoch": 2.4528663948189013, + "grad_norm": 2.305379867553711, + "learning_rate": 9.115195554605685e-06, + "loss": 0.2509, + "step": 3836 + }, + { + "epoch": 2.453506036619493, + "grad_norm": 3.2020890712738037, + "learning_rate": 9.104509510579184e-06, + "loss": 0.3169, + "step": 3837 + }, + { + "epoch": 2.4541456784200846, + "grad_norm": 2.862238645553589, + "learning_rate": 9.093823466552683e-06, + "loss": 0.2717, + "step": 3838 + }, + { + "epoch": 2.4547853202206764, + "grad_norm": 3.574880838394165, + "learning_rate": 9.083137422526181e-06, + "loss": 0.3171, + "step": 3839 + }, + { + "epoch": 2.4554249620212683, + "grad_norm": 3.0155539512634277, + "learning_rate": 9.07245137849968e-06, + "loss": 0.3112, + "step": 3840 + }, + { + "epoch": 2.4560646038218596, + "grad_norm": 2.5281171798706055, + "learning_rate": 9.061765334473178e-06, + "loss": 0.2634, + "step": 3841 + }, + { + "epoch": 2.4567042456224515, + "grad_norm": 2.6424858570098877, + "learning_rate": 9.051079290446677e-06, + "loss": 0.2873, + "step": 3842 + }, + { + "epoch": 2.4573438874230433, + "grad_norm": 2.6515753269195557, + "learning_rate": 9.040393246420175e-06, + "loss": 0.2972, + "step": 3843 + }, + { + "epoch": 2.4579835292236347, + "grad_norm": 2.188934803009033, + "learning_rate": 
9.029707202393674e-06, + "loss": 0.2395, + "step": 3844 + }, + { + "epoch": 2.4586231710242266, + "grad_norm": 3.127741813659668, + "learning_rate": 9.019021158367174e-06, + "loss": 0.3043, + "step": 3845 + }, + { + "epoch": 2.459262812824818, + "grad_norm": 2.7275326251983643, + "learning_rate": 9.008335114340671e-06, + "loss": 0.2901, + "step": 3846 + }, + { + "epoch": 2.45990245462541, + "grad_norm": 3.017045021057129, + "learning_rate": 8.99764907031417e-06, + "loss": 0.296, + "step": 3847 + }, + { + "epoch": 2.4605420964260016, + "grad_norm": 3.1838059425354004, + "learning_rate": 8.986963026287669e-06, + "loss": 0.3134, + "step": 3848 + }, + { + "epoch": 2.461181738226593, + "grad_norm": 3.933943271636963, + "learning_rate": 8.976276982261166e-06, + "loss": 0.3207, + "step": 3849 + }, + { + "epoch": 2.461821380027185, + "grad_norm": 3.2233517169952393, + "learning_rate": 8.965590938234667e-06, + "loss": 0.294, + "step": 3850 + }, + { + "epoch": 2.4624610218277763, + "grad_norm": 3.0609567165374756, + "learning_rate": 8.954904894208165e-06, + "loss": 0.3083, + "step": 3851 + }, + { + "epoch": 2.463100663628368, + "grad_norm": 3.034419298171997, + "learning_rate": 8.944218850181663e-06, + "loss": 0.2866, + "step": 3852 + }, + { + "epoch": 2.46374030542896, + "grad_norm": 2.9410407543182373, + "learning_rate": 8.933532806155162e-06, + "loss": 0.2609, + "step": 3853 + }, + { + "epoch": 2.4643799472295513, + "grad_norm": 2.7242746353149414, + "learning_rate": 8.92284676212866e-06, + "loss": 0.2627, + "step": 3854 + }, + { + "epoch": 2.465019589030143, + "grad_norm": 3.1908862590789795, + "learning_rate": 8.912160718102159e-06, + "loss": 0.2887, + "step": 3855 + }, + { + "epoch": 2.4656592308307346, + "grad_norm": 3.2623515129089355, + "learning_rate": 8.901474674075658e-06, + "loss": 0.3116, + "step": 3856 + }, + { + "epoch": 2.4662988726313264, + "grad_norm": 2.709096670150757, + "learning_rate": 8.890788630049156e-06, + "loss": 0.2738, + "step": 3857 + }, + { + "epoch": 2.4669385144319183, + "grad_norm": 2.7787728309631348, + "learning_rate": 8.880102586022655e-06, + "loss": 0.2679, + "step": 3858 + }, + { + "epoch": 2.4675781562325096, + "grad_norm": 3.2225756645202637, + "learning_rate": 8.869416541996153e-06, + "loss": 0.2948, + "step": 3859 + }, + { + "epoch": 2.4682177980331015, + "grad_norm": 3.0603277683258057, + "learning_rate": 8.85873049796965e-06, + "loss": 0.3131, + "step": 3860 + }, + { + "epoch": 2.4688574398336933, + "grad_norm": 3.017467498779297, + "learning_rate": 8.848044453943152e-06, + "loss": 0.2859, + "step": 3861 + }, + { + "epoch": 2.4694970816342847, + "grad_norm": 3.512159585952759, + "learning_rate": 8.83735840991665e-06, + "loss": 0.3014, + "step": 3862 + }, + { + "epoch": 2.4701367234348766, + "grad_norm": 3.199126720428467, + "learning_rate": 8.826672365890149e-06, + "loss": 0.2867, + "step": 3863 + }, + { + "epoch": 2.470776365235468, + "grad_norm": 2.4926111698150635, + "learning_rate": 8.815986321863646e-06, + "loss": 0.27, + "step": 3864 + }, + { + "epoch": 2.47141600703606, + "grad_norm": 3.3452162742614746, + "learning_rate": 8.805300277837144e-06, + "loss": 0.3301, + "step": 3865 + }, + { + "epoch": 2.4720556488366516, + "grad_norm": 2.844399929046631, + "learning_rate": 8.794614233810643e-06, + "loss": 0.2904, + "step": 3866 + }, + { + "epoch": 2.472695290637243, + "grad_norm": 2.7875826358795166, + "learning_rate": 8.783928189784143e-06, + "loss": 0.278, + "step": 3867 + }, + { + "epoch": 2.473334932437835, + "grad_norm": 3.422585964202881, + 
"learning_rate": 8.77324214575764e-06, + "loss": 0.3017, + "step": 3868 + }, + { + "epoch": 2.4739745742384267, + "grad_norm": 2.911555290222168, + "learning_rate": 8.76255610173114e-06, + "loss": 0.2923, + "step": 3869 + }, + { + "epoch": 2.474614216039018, + "grad_norm": 3.060211658477783, + "learning_rate": 8.751870057704638e-06, + "loss": 0.2735, + "step": 3870 + }, + { + "epoch": 2.47525385783961, + "grad_norm": 3.1995270252227783, + "learning_rate": 8.741184013678137e-06, + "loss": 0.3038, + "step": 3871 + }, + { + "epoch": 2.4758934996402013, + "grad_norm": 2.997093915939331, + "learning_rate": 8.730497969651635e-06, + "loss": 0.2703, + "step": 3872 + }, + { + "epoch": 2.476533141440793, + "grad_norm": 3.3046493530273438, + "learning_rate": 8.719811925625134e-06, + "loss": 0.3212, + "step": 3873 + }, + { + "epoch": 2.477172783241385, + "grad_norm": 2.493839740753174, + "learning_rate": 8.709125881598633e-06, + "loss": 0.268, + "step": 3874 + }, + { + "epoch": 2.4778124250419764, + "grad_norm": 3.9661970138549805, + "learning_rate": 8.698439837572131e-06, + "loss": 0.3268, + "step": 3875 + }, + { + "epoch": 2.4784520668425682, + "grad_norm": 3.3390679359436035, + "learning_rate": 8.68775379354563e-06, + "loss": 0.2832, + "step": 3876 + }, + { + "epoch": 2.4790917086431596, + "grad_norm": 3.179314374923706, + "learning_rate": 8.677067749519128e-06, + "loss": 0.3027, + "step": 3877 + }, + { + "epoch": 2.4797313504437515, + "grad_norm": 2.945326089859009, + "learning_rate": 8.666381705492627e-06, + "loss": 0.3058, + "step": 3878 + }, + { + "epoch": 2.4803709922443433, + "grad_norm": 2.443571090698242, + "learning_rate": 8.655695661466127e-06, + "loss": 0.288, + "step": 3879 + }, + { + "epoch": 2.4810106340449347, + "grad_norm": 3.9710521697998047, + "learning_rate": 8.645009617439624e-06, + "loss": 0.3439, + "step": 3880 + }, + { + "epoch": 2.4816502758455266, + "grad_norm": 3.116760015487671, + "learning_rate": 8.634323573413122e-06, + "loss": 0.2744, + "step": 3881 + }, + { + "epoch": 2.482289917646118, + "grad_norm": 3.2043404579162598, + "learning_rate": 8.623637529386621e-06, + "loss": 0.3218, + "step": 3882 + }, + { + "epoch": 2.48292955944671, + "grad_norm": 2.778614044189453, + "learning_rate": 8.612951485360119e-06, + "loss": 0.3039, + "step": 3883 + }, + { + "epoch": 2.4835692012473016, + "grad_norm": 2.793134927749634, + "learning_rate": 8.60226544133362e-06, + "loss": 0.2741, + "step": 3884 + }, + { + "epoch": 2.484208843047893, + "grad_norm": 2.952721118927002, + "learning_rate": 8.591579397307118e-06, + "loss": 0.2887, + "step": 3885 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 2.861395835876465, + "learning_rate": 8.580893353280615e-06, + "loss": 0.2936, + "step": 3886 + }, + { + "epoch": 2.4854881266490767, + "grad_norm": 3.0399374961853027, + "learning_rate": 8.570207309254115e-06, + "loss": 0.2947, + "step": 3887 + }, + { + "epoch": 2.486127768449668, + "grad_norm": 3.764336109161377, + "learning_rate": 8.559521265227612e-06, + "loss": 0.3227, + "step": 3888 + }, + { + "epoch": 2.48676741025026, + "grad_norm": 2.946716785430908, + "learning_rate": 8.548835221201112e-06, + "loss": 0.2827, + "step": 3889 + }, + { + "epoch": 2.4874070520508513, + "grad_norm": 2.3790247440338135, + "learning_rate": 8.538149177174611e-06, + "loss": 0.268, + "step": 3890 + }, + { + "epoch": 2.488046693851443, + "grad_norm": 3.0598161220550537, + "learning_rate": 8.527463133148109e-06, + "loss": 0.3098, + "step": 3891 + }, + { + "epoch": 2.488686335652035, + "grad_norm": 
2.887455701828003, + "learning_rate": 8.516777089121608e-06, + "loss": 0.3085, + "step": 3892 + }, + { + "epoch": 2.4893259774526264, + "grad_norm": 2.9641199111938477, + "learning_rate": 8.506091045095106e-06, + "loss": 0.3061, + "step": 3893 + }, + { + "epoch": 2.4899656192532182, + "grad_norm": 2.924095869064331, + "learning_rate": 8.495405001068604e-06, + "loss": 0.2748, + "step": 3894 + }, + { + "epoch": 2.49060526105381, + "grad_norm": 2.6567180156707764, + "learning_rate": 8.484718957042103e-06, + "loss": 0.2891, + "step": 3895 + }, + { + "epoch": 2.4912449028544015, + "grad_norm": 3.831763505935669, + "learning_rate": 8.474032913015602e-06, + "loss": 0.3186, + "step": 3896 + }, + { + "epoch": 2.4918845446549933, + "grad_norm": 3.099663257598877, + "learning_rate": 8.463346868989102e-06, + "loss": 0.2924, + "step": 3897 + }, + { + "epoch": 2.4925241864555847, + "grad_norm": 2.84446382522583, + "learning_rate": 8.4526608249626e-06, + "loss": 0.3218, + "step": 3898 + }, + { + "epoch": 2.4931638282561766, + "grad_norm": 3.2212624549865723, + "learning_rate": 8.441974780936097e-06, + "loss": 0.334, + "step": 3899 + }, + { + "epoch": 2.4938034700567684, + "grad_norm": 2.6497578620910645, + "learning_rate": 8.431288736909596e-06, + "loss": 0.2723, + "step": 3900 + }, + { + "epoch": 2.49444311185736, + "grad_norm": 3.1708333492279053, + "learning_rate": 8.420602692883096e-06, + "loss": 0.3016, + "step": 3901 + }, + { + "epoch": 2.4950827536579516, + "grad_norm": 2.6580488681793213, + "learning_rate": 8.409916648856593e-06, + "loss": 0.2629, + "step": 3902 + }, + { + "epoch": 2.495722395458543, + "grad_norm": 3.1520209312438965, + "learning_rate": 8.399230604830093e-06, + "loss": 0.3161, + "step": 3903 + }, + { + "epoch": 2.496362037259135, + "grad_norm": 3.023639440536499, + "learning_rate": 8.38854456080359e-06, + "loss": 0.3246, + "step": 3904 + }, + { + "epoch": 2.4970016790597267, + "grad_norm": 2.6570332050323486, + "learning_rate": 8.37785851677709e-06, + "loss": 0.2733, + "step": 3905 + }, + { + "epoch": 2.497641320860318, + "grad_norm": 2.9623055458068848, + "learning_rate": 8.367172472750587e-06, + "loss": 0.3021, + "step": 3906 + }, + { + "epoch": 2.49828096266091, + "grad_norm": 3.133467435836792, + "learning_rate": 8.356486428724087e-06, + "loss": 0.3116, + "step": 3907 + }, + { + "epoch": 2.4989206044615013, + "grad_norm": 2.862210988998413, + "learning_rate": 8.345800384697586e-06, + "loss": 0.2773, + "step": 3908 + }, + { + "epoch": 2.499560246262093, + "grad_norm": 2.55251407623291, + "learning_rate": 8.335114340671084e-06, + "loss": 0.2911, + "step": 3909 + }, + { + "epoch": 2.500199888062685, + "grad_norm": 2.627422571182251, + "learning_rate": 8.324428296644582e-06, + "loss": 0.2591, + "step": 3910 + }, + { + "epoch": 2.5008395298632764, + "grad_norm": 2.694646120071411, + "learning_rate": 8.313742252618081e-06, + "loss": 0.2883, + "step": 3911 + }, + { + "epoch": 2.5014791716638682, + "grad_norm": 2.600728988647461, + "learning_rate": 8.30305620859158e-06, + "loss": 0.2816, + "step": 3912 + }, + { + "epoch": 2.5021188134644596, + "grad_norm": 2.8494765758514404, + "learning_rate": 8.29237016456508e-06, + "loss": 0.2964, + "step": 3913 + }, + { + "epoch": 2.5027584552650515, + "grad_norm": 2.6035451889038086, + "learning_rate": 8.281684120538577e-06, + "loss": 0.2791, + "step": 3914 + }, + { + "epoch": 2.5033980970656433, + "grad_norm": 2.9588165283203125, + "learning_rate": 8.270998076512075e-06, + "loss": 0.2856, + "step": 3915 + }, + { + "epoch": 2.504037738866235, + 
"grad_norm": 3.0478639602661133, + "learning_rate": 8.260312032485574e-06, + "loss": 0.2933, + "step": 3916 + }, + { + "epoch": 2.5046773806668265, + "grad_norm": 2.9690191745758057, + "learning_rate": 8.249625988459072e-06, + "loss": 0.3171, + "step": 3917 + }, + { + "epoch": 2.5053170224674184, + "grad_norm": 2.783714771270752, + "learning_rate": 8.238939944432571e-06, + "loss": 0.2907, + "step": 3918 + }, + { + "epoch": 2.50595666426801, + "grad_norm": 2.7072505950927734, + "learning_rate": 8.22825390040607e-06, + "loss": 0.2691, + "step": 3919 + }, + { + "epoch": 2.5065963060686016, + "grad_norm": 3.027251958847046, + "learning_rate": 8.217567856379568e-06, + "loss": 0.3043, + "step": 3920 + }, + { + "epoch": 2.5072359478691935, + "grad_norm": 2.955143928527832, + "learning_rate": 8.206881812353068e-06, + "loss": 0.2907, + "step": 3921 + }, + { + "epoch": 2.507875589669785, + "grad_norm": 3.3920645713806152, + "learning_rate": 8.196195768326565e-06, + "loss": 0.3007, + "step": 3922 + }, + { + "epoch": 2.5085152314703767, + "grad_norm": 3.103003978729248, + "learning_rate": 8.185509724300065e-06, + "loss": 0.2806, + "step": 3923 + }, + { + "epoch": 2.509154873270968, + "grad_norm": 2.414053440093994, + "learning_rate": 8.174823680273564e-06, + "loss": 0.2626, + "step": 3924 + }, + { + "epoch": 2.50979451507156, + "grad_norm": 2.831071138381958, + "learning_rate": 8.164137636247062e-06, + "loss": 0.2681, + "step": 3925 + }, + { + "epoch": 2.5104341568721518, + "grad_norm": 3.113445520401001, + "learning_rate": 8.153451592220561e-06, + "loss": 0.3075, + "step": 3926 + }, + { + "epoch": 2.511073798672743, + "grad_norm": 2.6940696239471436, + "learning_rate": 8.142765548194059e-06, + "loss": 0.2855, + "step": 3927 + }, + { + "epoch": 2.511713440473335, + "grad_norm": 3.4920425415039062, + "learning_rate": 8.132079504167556e-06, + "loss": 0.3122, + "step": 3928 + }, + { + "epoch": 2.5123530822739264, + "grad_norm": 2.999638557434082, + "learning_rate": 8.121393460141056e-06, + "loss": 0.3005, + "step": 3929 + }, + { + "epoch": 2.5129927240745182, + "grad_norm": 4.281359672546387, + "learning_rate": 8.110707416114555e-06, + "loss": 0.2857, + "step": 3930 + }, + { + "epoch": 2.51363236587511, + "grad_norm": 2.734402656555176, + "learning_rate": 8.100021372088053e-06, + "loss": 0.2738, + "step": 3931 + }, + { + "epoch": 2.5142720076757015, + "grad_norm": 3.25498366355896, + "learning_rate": 8.089335328061552e-06, + "loss": 0.2969, + "step": 3932 + }, + { + "epoch": 2.5149116494762933, + "grad_norm": 2.784135103225708, + "learning_rate": 8.07864928403505e-06, + "loss": 0.2899, + "step": 3933 + }, + { + "epoch": 2.5155512912768847, + "grad_norm": 3.077578067779541, + "learning_rate": 8.06796324000855e-06, + "loss": 0.3244, + "step": 3934 + }, + { + "epoch": 2.5161909330774765, + "grad_norm": 2.8698017597198486, + "learning_rate": 8.057277195982049e-06, + "loss": 0.2667, + "step": 3935 + }, + { + "epoch": 2.5168305748780684, + "grad_norm": 3.7617104053497314, + "learning_rate": 8.046591151955546e-06, + "loss": 0.3495, + "step": 3936 + }, + { + "epoch": 2.51747021667866, + "grad_norm": 2.79166841506958, + "learning_rate": 8.035905107929046e-06, + "loss": 0.2674, + "step": 3937 + }, + { + "epoch": 2.5181098584792516, + "grad_norm": 2.9812519550323486, + "learning_rate": 8.025219063902543e-06, + "loss": 0.2957, + "step": 3938 + }, + { + "epoch": 2.518749500279843, + "grad_norm": 2.6640431880950928, + "learning_rate": 8.014533019876043e-06, + "loss": 0.2538, + "step": 3939 + }, + { + "epoch": 
2.519389142080435, + "grad_norm": 3.042208671569824, + "learning_rate": 8.00384697584954e-06, + "loss": 0.3077, + "step": 3940 + }, + { + "epoch": 2.5200287838810267, + "grad_norm": 2.7115213871002197, + "learning_rate": 7.99316093182304e-06, + "loss": 0.2562, + "step": 3941 + }, + { + "epoch": 2.5206684256816185, + "grad_norm": 2.6850244998931885, + "learning_rate": 7.982474887796539e-06, + "loss": 0.2683, + "step": 3942 + }, + { + "epoch": 2.52130806748221, + "grad_norm": 2.752390146255493, + "learning_rate": 7.971788843770037e-06, + "loss": 0.2732, + "step": 3943 + }, + { + "epoch": 2.5219477092828018, + "grad_norm": 3.4455316066741943, + "learning_rate": 7.961102799743534e-06, + "loss": 0.3295, + "step": 3944 + }, + { + "epoch": 2.522587351083393, + "grad_norm": 2.862778663635254, + "learning_rate": 7.950416755717034e-06, + "loss": 0.2688, + "step": 3945 + }, + { + "epoch": 2.523226992883985, + "grad_norm": 3.092811107635498, + "learning_rate": 7.939730711690533e-06, + "loss": 0.3134, + "step": 3946 + }, + { + "epoch": 2.523866634684577, + "grad_norm": 3.047186851501465, + "learning_rate": 7.929044667664032e-06, + "loss": 0.3042, + "step": 3947 + }, + { + "epoch": 2.5245062764851682, + "grad_norm": 3.0295207500457764, + "learning_rate": 7.91835862363753e-06, + "loss": 0.3017, + "step": 3948 + }, + { + "epoch": 2.52514591828576, + "grad_norm": 3.1070966720581055, + "learning_rate": 7.907672579611028e-06, + "loss": 0.2831, + "step": 3949 + }, + { + "epoch": 2.5257855600863515, + "grad_norm": 2.9946415424346924, + "learning_rate": 7.896986535584527e-06, + "loss": 0.287, + "step": 3950 + }, + { + "epoch": 2.5264252018869433, + "grad_norm": 2.954479694366455, + "learning_rate": 7.886300491558025e-06, + "loss": 0.2925, + "step": 3951 + }, + { + "epoch": 2.527064843687535, + "grad_norm": 2.645458459854126, + "learning_rate": 7.875614447531524e-06, + "loss": 0.2816, + "step": 3952 + }, + { + "epoch": 2.5277044854881265, + "grad_norm": 2.9948172569274902, + "learning_rate": 7.864928403505024e-06, + "loss": 0.2759, + "step": 3953 + }, + { + "epoch": 2.5283441272887184, + "grad_norm": 3.8406200408935547, + "learning_rate": 7.854242359478521e-06, + "loss": 0.3134, + "step": 3954 + }, + { + "epoch": 2.5289837690893098, + "grad_norm": 3.5990071296691895, + "learning_rate": 7.84355631545202e-06, + "loss": 0.3293, + "step": 3955 + }, + { + "epoch": 2.5296234108899016, + "grad_norm": 2.8128585815429688, + "learning_rate": 7.832870271425518e-06, + "loss": 0.2828, + "step": 3956 + }, + { + "epoch": 2.5302630526904935, + "grad_norm": 2.500379800796509, + "learning_rate": 7.822184227399016e-06, + "loss": 0.276, + "step": 3957 + }, + { + "epoch": 2.530902694491085, + "grad_norm": 3.474045991897583, + "learning_rate": 7.811498183372517e-06, + "loss": 0.319, + "step": 3958 + }, + { + "epoch": 2.5315423362916767, + "grad_norm": 3.13374924659729, + "learning_rate": 7.800812139346015e-06, + "loss": 0.3268, + "step": 3959 + }, + { + "epoch": 2.532181978092268, + "grad_norm": 3.1712162494659424, + "learning_rate": 7.790126095319514e-06, + "loss": 0.3041, + "step": 3960 + }, + { + "epoch": 2.53282161989286, + "grad_norm": 3.28678297996521, + "learning_rate": 7.779440051293012e-06, + "loss": 0.3128, + "step": 3961 + }, + { + "epoch": 2.5334612616934518, + "grad_norm": 2.8915796279907227, + "learning_rate": 7.76875400726651e-06, + "loss": 0.2954, + "step": 3962 + }, + { + "epoch": 2.5341009034940436, + "grad_norm": 2.7176880836486816, + "learning_rate": 7.758067963240009e-06, + "loss": 0.287, + "step": 3963 + }, + { 
+ "epoch": 2.534740545294635, + "grad_norm": 3.1310877799987793, + "learning_rate": 7.747381919213508e-06, + "loss": 0.2817, + "step": 3964 + }, + { + "epoch": 2.535380187095227, + "grad_norm": 2.8056352138519287, + "learning_rate": 7.736695875187006e-06, + "loss": 0.2795, + "step": 3965 + }, + { + "epoch": 2.5360198288958182, + "grad_norm": 3.34025502204895, + "learning_rate": 7.726009831160505e-06, + "loss": 0.3309, + "step": 3966 + }, + { + "epoch": 2.53665947069641, + "grad_norm": 2.857496500015259, + "learning_rate": 7.715323787134003e-06, + "loss": 0.279, + "step": 3967 + }, + { + "epoch": 2.537299112497002, + "grad_norm": 2.914743185043335, + "learning_rate": 7.704637743107502e-06, + "loss": 0.2792, + "step": 3968 + }, + { + "epoch": 2.5379387542975933, + "grad_norm": 3.172365665435791, + "learning_rate": 7.693951699081002e-06, + "loss": 0.2887, + "step": 3969 + }, + { + "epoch": 2.538578396098185, + "grad_norm": 2.498138427734375, + "learning_rate": 7.6832656550545e-06, + "loss": 0.278, + "step": 3970 + }, + { + "epoch": 2.5392180378987765, + "grad_norm": 2.6336593627929688, + "learning_rate": 7.672579611027999e-06, + "loss": 0.2843, + "step": 3971 + }, + { + "epoch": 2.5398576796993684, + "grad_norm": 2.6753458976745605, + "learning_rate": 7.661893567001496e-06, + "loss": 0.2703, + "step": 3972 + }, + { + "epoch": 2.54049732149996, + "grad_norm": 2.757615327835083, + "learning_rate": 7.651207522974994e-06, + "loss": 0.2837, + "step": 3973 + }, + { + "epoch": 2.5411369633005516, + "grad_norm": 3.0092389583587646, + "learning_rate": 7.640521478948493e-06, + "loss": 0.2946, + "step": 3974 + }, + { + "epoch": 2.5417766051011434, + "grad_norm": 3.061602830886841, + "learning_rate": 7.629835434921993e-06, + "loss": 0.2963, + "step": 3975 + }, + { + "epoch": 2.542416246901735, + "grad_norm": 3.014116048812866, + "learning_rate": 7.619149390895491e-06, + "loss": 0.3126, + "step": 3976 + }, + { + "epoch": 2.5430558887023267, + "grad_norm": 2.7754018306732178, + "learning_rate": 7.60846334686899e-06, + "loss": 0.3018, + "step": 3977 + }, + { + "epoch": 2.5436955305029185, + "grad_norm": 2.6593613624572754, + "learning_rate": 7.597777302842487e-06, + "loss": 0.2675, + "step": 3978 + }, + { + "epoch": 2.54433517230351, + "grad_norm": 2.8170251846313477, + "learning_rate": 7.587091258815987e-06, + "loss": 0.2756, + "step": 3979 + }, + { + "epoch": 2.5449748141041018, + "grad_norm": 2.7848563194274902, + "learning_rate": 7.576405214789485e-06, + "loss": 0.3, + "step": 3980 + }, + { + "epoch": 2.545614455904693, + "grad_norm": 2.87302303314209, + "learning_rate": 7.5657191707629845e-06, + "loss": 0.2922, + "step": 3981 + }, + { + "epoch": 2.546254097705285, + "grad_norm": 3.23467755317688, + "learning_rate": 7.555033126736483e-06, + "loss": 0.317, + "step": 3982 + }, + { + "epoch": 2.546893739505877, + "grad_norm": 2.7781662940979004, + "learning_rate": 7.544347082709981e-06, + "loss": 0.2736, + "step": 3983 + }, + { + "epoch": 2.5475333813064682, + "grad_norm": 3.339779853820801, + "learning_rate": 7.53366103868348e-06, + "loss": 0.328, + "step": 3984 + }, + { + "epoch": 2.54817302310706, + "grad_norm": 2.7671165466308594, + "learning_rate": 7.522974994656979e-06, + "loss": 0.2959, + "step": 3985 + }, + { + "epoch": 2.5488126649076515, + "grad_norm": 2.848484754562378, + "learning_rate": 7.512288950630476e-06, + "loss": 0.2797, + "step": 3986 + }, + { + "epoch": 2.5494523067082433, + "grad_norm": 3.0425779819488525, + "learning_rate": 7.501602906603976e-06, + "loss": 0.3006, + "step": 3987 + }, 
+ { + "epoch": 2.550091948508835, + "grad_norm": 3.1285667419433594, + "learning_rate": 7.490916862577474e-06, + "loss": 0.311, + "step": 3988 + }, + { + "epoch": 2.550731590309427, + "grad_norm": 2.9664113521575928, + "learning_rate": 7.4802308185509735e-06, + "loss": 0.2844, + "step": 3989 + }, + { + "epoch": 2.5513712321100184, + "grad_norm": 2.473754405975342, + "learning_rate": 7.469544774524471e-06, + "loss": 0.2855, + "step": 3990 + }, + { + "epoch": 2.55201087391061, + "grad_norm": 2.819472074508667, + "learning_rate": 7.45885873049797e-06, + "loss": 0.2886, + "step": 3991 + }, + { + "epoch": 2.5526505157112016, + "grad_norm": 2.516455888748169, + "learning_rate": 7.448172686471469e-06, + "loss": 0.2477, + "step": 3992 + }, + { + "epoch": 2.5532901575117934, + "grad_norm": 3.2210848331451416, + "learning_rate": 7.437486642444967e-06, + "loss": 0.2987, + "step": 3993 + }, + { + "epoch": 2.5539297993123853, + "grad_norm": 2.665578842163086, + "learning_rate": 7.426800598418465e-06, + "loss": 0.2768, + "step": 3994 + }, + { + "epoch": 2.5545694411129767, + "grad_norm": 2.6386947631835938, + "learning_rate": 7.416114554391965e-06, + "loss": 0.2756, + "step": 3995 + }, + { + "epoch": 2.5552090829135685, + "grad_norm": 2.5386404991149902, + "learning_rate": 7.405428510365463e-06, + "loss": 0.2931, + "step": 3996 + }, + { + "epoch": 2.55584872471416, + "grad_norm": 2.7998013496398926, + "learning_rate": 7.3947424663389625e-06, + "loss": 0.299, + "step": 3997 + }, + { + "epoch": 2.5564883665147518, + "grad_norm": 3.289069652557373, + "learning_rate": 7.38405642231246e-06, + "loss": 0.2997, + "step": 3998 + }, + { + "epoch": 2.5571280083153436, + "grad_norm": 2.8917007446289062, + "learning_rate": 7.373370378285959e-06, + "loss": 0.2718, + "step": 3999 + }, + { + "epoch": 2.557767650115935, + "grad_norm": 3.172271251678467, + "learning_rate": 7.362684334259458e-06, + "loss": 0.3084, + "step": 4000 + }, + { + "epoch": 2.558407291916527, + "grad_norm": 3.154325246810913, + "learning_rate": 7.351998290232956e-06, + "loss": 0.309, + "step": 4001 + }, + { + "epoch": 2.559046933717118, + "grad_norm": 3.2781569957733154, + "learning_rate": 7.341312246206455e-06, + "loss": 0.3389, + "step": 4002 + }, + { + "epoch": 2.55968657551771, + "grad_norm": 2.648505449295044, + "learning_rate": 7.3306262021799536e-06, + "loss": 0.2812, + "step": 4003 + }, + { + "epoch": 2.560326217318302, + "grad_norm": 2.9028067588806152, + "learning_rate": 7.319940158153451e-06, + "loss": 0.2761, + "step": 4004 + }, + { + "epoch": 2.5609658591188933, + "grad_norm": 2.9735097885131836, + "learning_rate": 7.3092541141269514e-06, + "loss": 0.2909, + "step": 4005 + }, + { + "epoch": 2.561605500919485, + "grad_norm": 3.0845820903778076, + "learning_rate": 7.298568070100449e-06, + "loss": 0.3103, + "step": 4006 + }, + { + "epoch": 2.5622451427200765, + "grad_norm": 3.088407516479492, + "learning_rate": 7.287882026073948e-06, + "loss": 0.317, + "step": 4007 + }, + { + "epoch": 2.5628847845206684, + "grad_norm": 2.7409865856170654, + "learning_rate": 7.277195982047447e-06, + "loss": 0.2958, + "step": 4008 + }, + { + "epoch": 2.56352442632126, + "grad_norm": 2.571981906890869, + "learning_rate": 7.266509938020945e-06, + "loss": 0.2662, + "step": 4009 + }, + { + "epoch": 2.5641640681218516, + "grad_norm": 2.704202651977539, + "learning_rate": 7.255823893994444e-06, + "loss": 0.2677, + "step": 4010 + }, + { + "epoch": 2.5648037099224434, + "grad_norm": 3.417504072189331, + "learning_rate": 7.2451378499679425e-06, + "loss": 0.3036, + 
"step": 4011 + }, + { + "epoch": 2.565443351723035, + "grad_norm": 3.1518378257751465, + "learning_rate": 7.23445180594144e-06, + "loss": 0.3001, + "step": 4012 + }, + { + "epoch": 2.5660829935236267, + "grad_norm": 2.7487339973449707, + "learning_rate": 7.2237657619149396e-06, + "loss": 0.3017, + "step": 4013 + }, + { + "epoch": 2.5667226353242185, + "grad_norm": 2.619185209274292, + "learning_rate": 7.213079717888438e-06, + "loss": 0.2794, + "step": 4014 + }, + { + "epoch": 2.5673622771248104, + "grad_norm": 2.819486141204834, + "learning_rate": 7.202393673861936e-06, + "loss": 0.2912, + "step": 4015 + }, + { + "epoch": 2.5680019189254017, + "grad_norm": 2.6165525913238525, + "learning_rate": 7.191707629835435e-06, + "loss": 0.2865, + "step": 4016 + }, + { + "epoch": 2.5686415607259936, + "grad_norm": 2.7600765228271484, + "learning_rate": 7.181021585808934e-06, + "loss": 0.2846, + "step": 4017 + }, + { + "epoch": 2.569281202526585, + "grad_norm": 2.990995168685913, + "learning_rate": 7.170335541782433e-06, + "loss": 0.307, + "step": 4018 + }, + { + "epoch": 2.569920844327177, + "grad_norm": 3.0928447246551514, + "learning_rate": 7.1596494977559315e-06, + "loss": 0.2963, + "step": 4019 + }, + { + "epoch": 2.5705604861277687, + "grad_norm": 2.7217769622802734, + "learning_rate": 7.148963453729429e-06, + "loss": 0.2705, + "step": 4020 + }, + { + "epoch": 2.57120012792836, + "grad_norm": 2.7806236743927, + "learning_rate": 7.1382774097029285e-06, + "loss": 0.2911, + "step": 4021 + }, + { + "epoch": 2.571839769728952, + "grad_norm": 2.488596200942993, + "learning_rate": 7.127591365676427e-06, + "loss": 0.2555, + "step": 4022 + }, + { + "epoch": 2.5724794115295433, + "grad_norm": 3.095839023590088, + "learning_rate": 7.116905321649925e-06, + "loss": 0.2877, + "step": 4023 + }, + { + "epoch": 2.573119053330135, + "grad_norm": 3.1489639282226562, + "learning_rate": 7.106219277623424e-06, + "loss": 0.3054, + "step": 4024 + }, + { + "epoch": 2.573758695130727, + "grad_norm": 2.6779065132141113, + "learning_rate": 7.095533233596923e-06, + "loss": 0.2885, + "step": 4025 + }, + { + "epoch": 2.5743983369313184, + "grad_norm": 3.1690926551818848, + "learning_rate": 7.084847189570422e-06, + "loss": 0.3234, + "step": 4026 + }, + { + "epoch": 2.57503797873191, + "grad_norm": 3.006356954574585, + "learning_rate": 7.07416114554392e-06, + "loss": 0.282, + "step": 4027 + }, + { + "epoch": 2.5756776205325016, + "grad_norm": 3.131333589553833, + "learning_rate": 7.063475101517418e-06, + "loss": 0.2896, + "step": 4028 + }, + { + "epoch": 2.5763172623330934, + "grad_norm": 2.638676881790161, + "learning_rate": 7.0527890574909175e-06, + "loss": 0.2626, + "step": 4029 + }, + { + "epoch": 2.5769569041336853, + "grad_norm": 2.6633353233337402, + "learning_rate": 7.042103013464416e-06, + "loss": 0.2764, + "step": 4030 + }, + { + "epoch": 2.5775965459342767, + "grad_norm": 2.9712777137756348, + "learning_rate": 7.031416969437915e-06, + "loss": 0.2985, + "step": 4031 + }, + { + "epoch": 2.5782361877348685, + "grad_norm": 2.6915102005004883, + "learning_rate": 7.020730925411413e-06, + "loss": 0.2768, + "step": 4032 + }, + { + "epoch": 2.57887582953546, + "grad_norm": 2.3778769969940186, + "learning_rate": 7.0100448813849115e-06, + "loss": 0.2317, + "step": 4033 + }, + { + "epoch": 2.5795154713360517, + "grad_norm": 2.6297054290771484, + "learning_rate": 6.999358837358411e-06, + "loss": 0.2694, + "step": 4034 + }, + { + "epoch": 2.5801551131366436, + "grad_norm": 3.0256714820861816, + "learning_rate": 
6.9886727933319086e-06, + "loss": 0.2985, + "step": 4035 + }, + { + "epoch": 2.5807947549372354, + "grad_norm": 2.7137451171875, + "learning_rate": 6.977986749305407e-06, + "loss": 0.2594, + "step": 4036 + }, + { + "epoch": 2.581434396737827, + "grad_norm": 3.3964784145355225, + "learning_rate": 6.9673007052789065e-06, + "loss": 0.3148, + "step": 4037 + }, + { + "epoch": 2.582074038538418, + "grad_norm": 2.715463876724243, + "learning_rate": 6.956614661252404e-06, + "loss": 0.2957, + "step": 4038 + }, + { + "epoch": 2.58271368033901, + "grad_norm": 3.4258534908294678, + "learning_rate": 6.9459286172259035e-06, + "loss": 0.3052, + "step": 4039 + }, + { + "epoch": 2.583353322139602, + "grad_norm": 2.7731049060821533, + "learning_rate": 6.935242573199402e-06, + "loss": 0.2786, + "step": 4040 + }, + { + "epoch": 2.5839929639401937, + "grad_norm": 3.120088815689087, + "learning_rate": 6.9245565291729e-06, + "loss": 0.3164, + "step": 4041 + }, + { + "epoch": 2.584632605740785, + "grad_norm": 3.1760048866271973, + "learning_rate": 6.9138704851464e-06, + "loss": 0.3056, + "step": 4042 + }, + { + "epoch": 2.585272247541377, + "grad_norm": 2.9035680294036865, + "learning_rate": 6.9031844411198975e-06, + "loss": 0.2629, + "step": 4043 + }, + { + "epoch": 2.5859118893419684, + "grad_norm": 3.0123603343963623, + "learning_rate": 6.892498397093396e-06, + "loss": 0.2954, + "step": 4044 + }, + { + "epoch": 2.58655153114256, + "grad_norm": 2.7939774990081787, + "learning_rate": 6.881812353066895e-06, + "loss": 0.2842, + "step": 4045 + }, + { + "epoch": 2.587191172943152, + "grad_norm": 3.3975231647491455, + "learning_rate": 6.871126309040393e-06, + "loss": 0.3054, + "step": 4046 + }, + { + "epoch": 2.5878308147437434, + "grad_norm": 2.6104512214660645, + "learning_rate": 6.8604402650138924e-06, + "loss": 0.2876, + "step": 4047 + }, + { + "epoch": 2.5884704565443353, + "grad_norm": 2.675142765045166, + "learning_rate": 6.849754220987391e-06, + "loss": 0.2713, + "step": 4048 + }, + { + "epoch": 2.5891100983449267, + "grad_norm": 2.454266309738159, + "learning_rate": 6.839068176960889e-06, + "loss": 0.2504, + "step": 4049 + }, + { + "epoch": 2.5897497401455185, + "grad_norm": 3.079059600830078, + "learning_rate": 6.828382132934388e-06, + "loss": 0.2757, + "step": 4050 + }, + { + "epoch": 2.5903893819461103, + "grad_norm": 2.8647725582122803, + "learning_rate": 6.8176960889078865e-06, + "loss": 0.289, + "step": 4051 + }, + { + "epoch": 2.5910290237467017, + "grad_norm": 2.5410890579223633, + "learning_rate": 6.807010044881386e-06, + "loss": 0.2809, + "step": 4052 + }, + { + "epoch": 2.5916686655472936, + "grad_norm": 3.003767251968384, + "learning_rate": 6.796324000854884e-06, + "loss": 0.2836, + "step": 4053 + }, + { + "epoch": 2.592308307347885, + "grad_norm": 3.547102928161621, + "learning_rate": 6.785637956828382e-06, + "loss": 0.3019, + "step": 4054 + }, + { + "epoch": 2.592947949148477, + "grad_norm": 2.8899435997009277, + "learning_rate": 6.774951912801881e-06, + "loss": 0.2901, + "step": 4055 + }, + { + "epoch": 2.5935875909490687, + "grad_norm": 2.9045917987823486, + "learning_rate": 6.76426586877538e-06, + "loss": 0.3119, + "step": 4056 + }, + { + "epoch": 2.59422723274966, + "grad_norm": 2.90828013420105, + "learning_rate": 6.753579824748878e-06, + "loss": 0.2734, + "step": 4057 + }, + { + "epoch": 2.594866874550252, + "grad_norm": 2.9471561908721924, + "learning_rate": 6.742893780722377e-06, + "loss": 0.2726, + "step": 4058 + }, + { + "epoch": 2.5955065163508433, + "grad_norm": 3.0729422569274902, 
+ "learning_rate": 6.7322077366958755e-06, + "loss": 0.2866, + "step": 4059 + }, + { + "epoch": 2.596146158151435, + "grad_norm": 3.5099430084228516, + "learning_rate": 6.721521692669375e-06, + "loss": 0.3069, + "step": 4060 + }, + { + "epoch": 2.596785799952027, + "grad_norm": 2.2890100479125977, + "learning_rate": 6.7108356486428725e-06, + "loss": 0.2404, + "step": 4061 + }, + { + "epoch": 2.597425441752619, + "grad_norm": 3.3020029067993164, + "learning_rate": 6.700149604616371e-06, + "loss": 0.3025, + "step": 4062 + }, + { + "epoch": 2.59806508355321, + "grad_norm": 3.097515344619751, + "learning_rate": 6.68946356058987e-06, + "loss": 0.2925, + "step": 4063 + }, + { + "epoch": 2.598704725353802, + "grad_norm": 2.8480024337768555, + "learning_rate": 6.678777516563368e-06, + "loss": 0.2968, + "step": 4064 + }, + { + "epoch": 2.5993443671543934, + "grad_norm": 3.331103563308716, + "learning_rate": 6.6680914725368666e-06, + "loss": 0.2908, + "step": 4065 + }, + { + "epoch": 2.5999840089549853, + "grad_norm": 2.9043595790863037, + "learning_rate": 6.657405428510366e-06, + "loss": 0.277, + "step": 4066 + }, + { + "epoch": 2.600623650755577, + "grad_norm": 2.9335644245147705, + "learning_rate": 6.6467193844838644e-06, + "loss": 0.2937, + "step": 4067 + }, + { + "epoch": 2.6012632925561685, + "grad_norm": 2.8348264694213867, + "learning_rate": 6.636033340457364e-06, + "loss": 0.2869, + "step": 4068 + }, + { + "epoch": 2.6019029343567603, + "grad_norm": 3.2087388038635254, + "learning_rate": 6.6253472964308615e-06, + "loss": 0.2994, + "step": 4069 + }, + { + "epoch": 2.6025425761573517, + "grad_norm": 3.7750985622406006, + "learning_rate": 6.61466125240436e-06, + "loss": 0.3279, + "step": 4070 + }, + { + "epoch": 2.6031822179579436, + "grad_norm": 2.9611592292785645, + "learning_rate": 6.603975208377859e-06, + "loss": 0.2988, + "step": 4071 + }, + { + "epoch": 2.6038218597585354, + "grad_norm": 3.236565351486206, + "learning_rate": 6.593289164351357e-06, + "loss": 0.2778, + "step": 4072 + }, + { + "epoch": 2.604461501559127, + "grad_norm": 3.189505100250244, + "learning_rate": 6.582603120324856e-06, + "loss": 0.2825, + "step": 4073 + }, + { + "epoch": 2.6051011433597187, + "grad_norm": 2.6999635696411133, + "learning_rate": 6.571917076298355e-06, + "loss": 0.2905, + "step": 4074 + }, + { + "epoch": 2.60574078516031, + "grad_norm": 3.3205056190490723, + "learning_rate": 6.5612310322718526e-06, + "loss": 0.2985, + "step": 4075 + }, + { + "epoch": 2.606380426960902, + "grad_norm": 2.713329553604126, + "learning_rate": 6.550544988245352e-06, + "loss": 0.2801, + "step": 4076 + }, + { + "epoch": 2.6070200687614937, + "grad_norm": 2.5777816772460938, + "learning_rate": 6.5398589442188504e-06, + "loss": 0.2761, + "step": 4077 + }, + { + "epoch": 2.607659710562085, + "grad_norm": 3.083207130432129, + "learning_rate": 6.529172900192348e-06, + "loss": 0.3011, + "step": 4078 + }, + { + "epoch": 2.608299352362677, + "grad_norm": 2.7377207279205322, + "learning_rate": 6.518486856165848e-06, + "loss": 0.2865, + "step": 4079 + }, + { + "epoch": 2.6089389941632684, + "grad_norm": 2.998675584793091, + "learning_rate": 6.507800812139346e-06, + "loss": 0.2871, + "step": 4080 + }, + { + "epoch": 2.60957863596386, + "grad_norm": 2.904697895050049, + "learning_rate": 6.497114768112845e-06, + "loss": 0.287, + "step": 4081 + }, + { + "epoch": 2.610218277764452, + "grad_norm": 3.5210788249969482, + "learning_rate": 6.486428724086344e-06, + "loss": 0.301, + "step": 4082 + }, + { + "epoch": 2.6108579195650434, + 
"grad_norm": 2.9328877925872803, + "learning_rate": 6.4757426800598415e-06, + "loss": 0.3037, + "step": 4083 + }, + { + "epoch": 2.6114975613656353, + "grad_norm": 2.5591320991516113, + "learning_rate": 6.465056636033341e-06, + "loss": 0.2738, + "step": 4084 + }, + { + "epoch": 2.6121372031662267, + "grad_norm": 3.4498910903930664, + "learning_rate": 6.454370592006839e-06, + "loss": 0.3155, + "step": 4085 + }, + { + "epoch": 2.6127768449668185, + "grad_norm": 3.096059560775757, + "learning_rate": 6.443684547980337e-06, + "loss": 0.3301, + "step": 4086 + }, + { + "epoch": 2.6134164867674103, + "grad_norm": 2.978118419647217, + "learning_rate": 6.4329985039538364e-06, + "loss": 0.3004, + "step": 4087 + }, + { + "epoch": 2.614056128568002, + "grad_norm": 2.6096348762512207, + "learning_rate": 6.422312459927335e-06, + "loss": 0.2883, + "step": 4088 + }, + { + "epoch": 2.6146957703685936, + "grad_norm": 2.766460418701172, + "learning_rate": 6.411626415900834e-06, + "loss": 0.2613, + "step": 4089 + }, + { + "epoch": 2.6153354121691854, + "grad_norm": 3.018059492111206, + "learning_rate": 6.400940371874333e-06, + "loss": 0.322, + "step": 4090 + }, + { + "epoch": 2.615975053969777, + "grad_norm": 3.1155269145965576, + "learning_rate": 6.3902543278478305e-06, + "loss": 0.3166, + "step": 4091 + }, + { + "epoch": 2.6166146957703686, + "grad_norm": 2.544541597366333, + "learning_rate": 6.37956828382133e-06, + "loss": 0.2626, + "step": 4092 + }, + { + "epoch": 2.6172543375709605, + "grad_norm": 3.250281572341919, + "learning_rate": 6.368882239794828e-06, + "loss": 0.2917, + "step": 4093 + }, + { + "epoch": 2.617893979371552, + "grad_norm": 2.824223041534424, + "learning_rate": 6.358196195768328e-06, + "loss": 0.2765, + "step": 4094 + }, + { + "epoch": 2.6185336211721437, + "grad_norm": 2.7045154571533203, + "learning_rate": 6.347510151741825e-06, + "loss": 0.2823, + "step": 4095 + }, + { + "epoch": 2.619173262972735, + "grad_norm": 3.0567705631256104, + "learning_rate": 6.336824107715324e-06, + "loss": 0.2925, + "step": 4096 + }, + { + "epoch": 2.619812904773327, + "grad_norm": 3.1804537773132324, + "learning_rate": 6.326138063688823e-06, + "loss": 0.2905, + "step": 4097 + }, + { + "epoch": 2.620452546573919, + "grad_norm": 3.218569040298462, + "learning_rate": 6.315452019662321e-06, + "loss": 0.2998, + "step": 4098 + }, + { + "epoch": 2.62109218837451, + "grad_norm": 2.8695592880249023, + "learning_rate": 6.3047659756358195e-06, + "loss": 0.2942, + "step": 4099 + }, + { + "epoch": 2.621731830175102, + "grad_norm": 2.8517274856567383, + "learning_rate": 6.294079931609319e-06, + "loss": 0.2795, + "step": 4100 + }, + { + "epoch": 2.6223714719756934, + "grad_norm": 3.258681297302246, + "learning_rate": 6.2833938875828165e-06, + "loss": 0.2932, + "step": 4101 + }, + { + "epoch": 2.6230111137762853, + "grad_norm": 2.917255401611328, + "learning_rate": 6.272707843556317e-06, + "loss": 0.2898, + "step": 4102 + }, + { + "epoch": 2.623650755576877, + "grad_norm": 2.938506603240967, + "learning_rate": 6.262021799529814e-06, + "loss": 0.3056, + "step": 4103 + }, + { + "epoch": 2.6242903973774685, + "grad_norm": 2.802382707595825, + "learning_rate": 6.251335755503313e-06, + "loss": 0.2829, + "step": 4104 + }, + { + "epoch": 2.6249300391780603, + "grad_norm": 2.872706890106201, + "learning_rate": 6.240649711476811e-06, + "loss": 0.2924, + "step": 4105 + }, + { + "epoch": 2.6255696809786517, + "grad_norm": 3.044586658477783, + "learning_rate": 6.22996366745031e-06, + "loss": 0.308, + "step": 4106 + }, + { + "epoch": 
2.6262093227792436, + "grad_norm": 2.8546431064605713, + "learning_rate": 6.2192776234238084e-06, + "loss": 0.2886, + "step": 4107 + }, + { + "epoch": 2.6268489645798354, + "grad_norm": 2.623234272003174, + "learning_rate": 6.208591579397308e-06, + "loss": 0.2724, + "step": 4108 + }, + { + "epoch": 2.627488606380427, + "grad_norm": 3.020480155944824, + "learning_rate": 6.1979055353708055e-06, + "loss": 0.3103, + "step": 4109 + }, + { + "epoch": 2.6281282481810186, + "grad_norm": 3.087705612182617, + "learning_rate": 6.187219491344305e-06, + "loss": 0.308, + "step": 4110 + }, + { + "epoch": 2.62876788998161, + "grad_norm": 2.831956386566162, + "learning_rate": 6.176533447317803e-06, + "loss": 0.2956, + "step": 4111 + }, + { + "epoch": 2.629407531782202, + "grad_norm": 2.6048920154571533, + "learning_rate": 6.165847403291302e-06, + "loss": 0.2971, + "step": 4112 + }, + { + "epoch": 2.6300471735827937, + "grad_norm": 2.5622875690460205, + "learning_rate": 6.155161359264801e-06, + "loss": 0.2731, + "step": 4113 + }, + { + "epoch": 2.6306868153833856, + "grad_norm": 2.952578544616699, + "learning_rate": 6.144475315238299e-06, + "loss": 0.3012, + "step": 4114 + }, + { + "epoch": 2.631326457183977, + "grad_norm": 2.9624619483947754, + "learning_rate": 6.133789271211797e-06, + "loss": 0.2823, + "step": 4115 + }, + { + "epoch": 2.631966098984569, + "grad_norm": 2.854189395904541, + "learning_rate": 6.123103227185297e-06, + "loss": 0.2993, + "step": 4116 + }, + { + "epoch": 2.63260574078516, + "grad_norm": 3.04116153717041, + "learning_rate": 6.112417183158795e-06, + "loss": 0.3294, + "step": 4117 + }, + { + "epoch": 2.633245382585752, + "grad_norm": 2.9307477474212646, + "learning_rate": 6.101731139132293e-06, + "loss": 0.291, + "step": 4118 + }, + { + "epoch": 2.633885024386344, + "grad_norm": 2.656273365020752, + "learning_rate": 6.091045095105792e-06, + "loss": 0.2822, + "step": 4119 + }, + { + "epoch": 2.6345246661869353, + "grad_norm": 3.1066153049468994, + "learning_rate": 6.080359051079291e-06, + "loss": 0.3067, + "step": 4120 + }, + { + "epoch": 2.635164307987527, + "grad_norm": 2.728170394897461, + "learning_rate": 6.069673007052789e-06, + "loss": 0.2658, + "step": 4121 + }, + { + "epoch": 2.6358039497881185, + "grad_norm": 2.8378944396972656, + "learning_rate": 6.058986963026288e-06, + "loss": 0.2826, + "step": 4122 + }, + { + "epoch": 2.6364435915887103, + "grad_norm": 3.1763226985931396, + "learning_rate": 6.048300918999786e-06, + "loss": 0.2976, + "step": 4123 + }, + { + "epoch": 2.637083233389302, + "grad_norm": 3.4446120262145996, + "learning_rate": 6.037614874973285e-06, + "loss": 0.3283, + "step": 4124 + }, + { + "epoch": 2.6377228751898936, + "grad_norm": 3.0102880001068115, + "learning_rate": 6.026928830946784e-06, + "loss": 0.3093, + "step": 4125 + }, + { + "epoch": 2.6383625169904854, + "grad_norm": 2.9607622623443604, + "learning_rate": 6.016242786920282e-06, + "loss": 0.3053, + "step": 4126 + }, + { + "epoch": 2.639002158791077, + "grad_norm": 2.8787426948547363, + "learning_rate": 6.005556742893781e-06, + "loss": 0.2801, + "step": 4127 + }, + { + "epoch": 2.6396418005916686, + "grad_norm": 2.6342411041259766, + "learning_rate": 5.99487069886728e-06, + "loss": 0.2734, + "step": 4128 + }, + { + "epoch": 2.6402814423922605, + "grad_norm": 3.4327261447906494, + "learning_rate": 5.984184654840778e-06, + "loss": 0.3107, + "step": 4129 + }, + { + "epoch": 2.640921084192852, + "grad_norm": 2.9412829875946045, + "learning_rate": 5.973498610814277e-06, + "loss": 0.2986, + "step": 4130 
+ }, + { + "epoch": 2.6415607259934437, + "grad_norm": 2.458150863647461, + "learning_rate": 5.962812566787775e-06, + "loss": 0.2578, + "step": 4131 + }, + { + "epoch": 2.642200367794035, + "grad_norm": 3.3725287914276123, + "learning_rate": 5.952126522761274e-06, + "loss": 0.3139, + "step": 4132 + }, + { + "epoch": 2.642840009594627, + "grad_norm": 3.250953435897827, + "learning_rate": 5.941440478734773e-06, + "loss": 0.3064, + "step": 4133 + }, + { + "epoch": 2.643479651395219, + "grad_norm": 2.531280517578125, + "learning_rate": 5.930754434708271e-06, + "loss": 0.2549, + "step": 4134 + }, + { + "epoch": 2.64411929319581, + "grad_norm": 2.4597527980804443, + "learning_rate": 5.920068390681769e-06, + "loss": 0.2585, + "step": 4135 + }, + { + "epoch": 2.644758934996402, + "grad_norm": 2.8698153495788574, + "learning_rate": 5.909382346655269e-06, + "loss": 0.3029, + "step": 4136 + }, + { + "epoch": 2.6453985767969934, + "grad_norm": 3.027813673019409, + "learning_rate": 5.898696302628767e-06, + "loss": 0.3142, + "step": 4137 + }, + { + "epoch": 2.6460382185975853, + "grad_norm": 2.9331483840942383, + "learning_rate": 5.888010258602266e-06, + "loss": 0.2862, + "step": 4138 + }, + { + "epoch": 2.646677860398177, + "grad_norm": 3.0726869106292725, + "learning_rate": 5.877324214575764e-06, + "loss": 0.2923, + "step": 4139 + }, + { + "epoch": 2.647317502198769, + "grad_norm": 2.5462653636932373, + "learning_rate": 5.866638170549263e-06, + "loss": 0.2852, + "step": 4140 + }, + { + "epoch": 2.6479571439993603, + "grad_norm": 2.565436840057373, + "learning_rate": 5.855952126522761e-06, + "loss": 0.2619, + "step": 4141 + }, + { + "epoch": 2.648596785799952, + "grad_norm": 3.403623580932617, + "learning_rate": 5.845266082496261e-06, + "loss": 0.3462, + "step": 4142 + }, + { + "epoch": 2.6492364276005436, + "grad_norm": 2.9035401344299316, + "learning_rate": 5.834580038469758e-06, + "loss": 0.3012, + "step": 4143 + }, + { + "epoch": 2.6498760694011354, + "grad_norm": 2.489408254623413, + "learning_rate": 5.823893994443258e-06, + "loss": 0.2695, + "step": 4144 + }, + { + "epoch": 2.6505157112017272, + "grad_norm": 3.242091655731201, + "learning_rate": 5.813207950416756e-06, + "loss": 0.3183, + "step": 4145 + }, + { + "epoch": 2.6511553530023186, + "grad_norm": 2.4779622554779053, + "learning_rate": 5.802521906390255e-06, + "loss": 0.2602, + "step": 4146 + }, + { + "epoch": 2.6517949948029105, + "grad_norm": 3.315955400466919, + "learning_rate": 5.791835862363753e-06, + "loss": 0.2795, + "step": 4147 + }, + { + "epoch": 2.652434636603502, + "grad_norm": 2.996861696243286, + "learning_rate": 5.781149818337252e-06, + "loss": 0.2877, + "step": 4148 + }, + { + "epoch": 2.6530742784040937, + "grad_norm": 2.8245441913604736, + "learning_rate": 5.77046377431075e-06, + "loss": 0.289, + "step": 4149 + }, + { + "epoch": 2.6537139202046856, + "grad_norm": 2.9259579181671143, + "learning_rate": 5.75977773028425e-06, + "loss": 0.3059, + "step": 4150 + }, + { + "epoch": 2.654353562005277, + "grad_norm": 2.6691040992736816, + "learning_rate": 5.749091686257747e-06, + "loss": 0.2711, + "step": 4151 + }, + { + "epoch": 2.654993203805869, + "grad_norm": 3.4666895866394043, + "learning_rate": 5.738405642231246e-06, + "loss": 0.2953, + "step": 4152 + }, + { + "epoch": 2.65563284560646, + "grad_norm": 2.7008304595947266, + "learning_rate": 5.727719598204745e-06, + "loss": 0.2763, + "step": 4153 + }, + { + "epoch": 2.656272487407052, + "grad_norm": 3.0840418338775635, + "learning_rate": 5.717033554178244e-06, + "loss": 
0.2928, + "step": 4154 + }, + { + "epoch": 2.656912129207644, + "grad_norm": 3.2053513526916504, + "learning_rate": 5.706347510151741e-06, + "loss": 0.3142, + "step": 4155 + }, + { + "epoch": 2.6575517710082353, + "grad_norm": 3.2546536922454834, + "learning_rate": 5.695661466125241e-06, + "loss": 0.3102, + "step": 4156 + }, + { + "epoch": 2.658191412808827, + "grad_norm": 3.1947944164276123, + "learning_rate": 5.684975422098739e-06, + "loss": 0.2905, + "step": 4157 + }, + { + "epoch": 2.6588310546094185, + "grad_norm": 3.201735734939575, + "learning_rate": 5.674289378072238e-06, + "loss": 0.3058, + "step": 4158 + }, + { + "epoch": 2.6594706964100103, + "grad_norm": 2.9859352111816406, + "learning_rate": 5.663603334045737e-06, + "loss": 0.2993, + "step": 4159 + }, + { + "epoch": 2.660110338210602, + "grad_norm": 3.122661590576172, + "learning_rate": 5.652917290019235e-06, + "loss": 0.2867, + "step": 4160 + }, + { + "epoch": 2.660749980011194, + "grad_norm": 2.6536948680877686, + "learning_rate": 5.642231245992733e-06, + "loss": 0.2663, + "step": 4161 + }, + { + "epoch": 2.6613896218117854, + "grad_norm": 3.0495078563690186, + "learning_rate": 5.631545201966233e-06, + "loss": 0.2985, + "step": 4162 + }, + { + "epoch": 2.662029263612377, + "grad_norm": 3.108170509338379, + "learning_rate": 5.620859157939731e-06, + "loss": 0.3308, + "step": 4163 + }, + { + "epoch": 2.6626689054129686, + "grad_norm": 3.2326366901397705, + "learning_rate": 5.61017311391323e-06, + "loss": 0.2939, + "step": 4164 + }, + { + "epoch": 2.6633085472135605, + "grad_norm": 2.168445348739624, + "learning_rate": 5.599487069886728e-06, + "loss": 0.2424, + "step": 4165 + }, + { + "epoch": 2.6639481890141523, + "grad_norm": 2.6619482040405273, + "learning_rate": 5.588801025860227e-06, + "loss": 0.2654, + "step": 4166 + }, + { + "epoch": 2.6645878308147437, + "grad_norm": 2.5055179595947266, + "learning_rate": 5.578114981833726e-06, + "loss": 0.2801, + "step": 4167 + }, + { + "epoch": 2.6652274726153355, + "grad_norm": 3.1837360858917236, + "learning_rate": 5.567428937807224e-06, + "loss": 0.3115, + "step": 4168 + }, + { + "epoch": 2.665867114415927, + "grad_norm": 2.5889251232147217, + "learning_rate": 5.556742893780722e-06, + "loss": 0.2524, + "step": 4169 + }, + { + "epoch": 2.666506756216519, + "grad_norm": 2.5486278533935547, + "learning_rate": 5.546056849754222e-06, + "loss": 0.2824, + "step": 4170 + }, + { + "epoch": 2.6671463980171106, + "grad_norm": 3.0237362384796143, + "learning_rate": 5.53537080572772e-06, + "loss": 0.2861, + "step": 4171 + }, + { + "epoch": 2.667786039817702, + "grad_norm": 3.123845338821411, + "learning_rate": 5.524684761701218e-06, + "loss": 0.3142, + "step": 4172 + }, + { + "epoch": 2.668425681618294, + "grad_norm": 2.7165184020996094, + "learning_rate": 5.513998717674717e-06, + "loss": 0.3037, + "step": 4173 + }, + { + "epoch": 2.6690653234188852, + "grad_norm": 3.402109146118164, + "learning_rate": 5.503312673648216e-06, + "loss": 0.2981, + "step": 4174 + }, + { + "epoch": 2.669704965219477, + "grad_norm": 3.0483319759368896, + "learning_rate": 5.492626629621714e-06, + "loss": 0.3018, + "step": 4175 + }, + { + "epoch": 2.670344607020069, + "grad_norm": 2.8692407608032227, + "learning_rate": 5.481940585595213e-06, + "loss": 0.2783, + "step": 4176 + }, + { + "epoch": 2.6709842488206603, + "grad_norm": 2.7753992080688477, + "learning_rate": 5.471254541568711e-06, + "loss": 0.2686, + "step": 4177 + }, + { + "epoch": 2.671623890621252, + "grad_norm": 3.161330461502075, + "learning_rate": 
5.46056849754221e-06, + "loss": 0.2931, + "step": 4178 + }, + { + "epoch": 2.6722635324218436, + "grad_norm": 3.287898540496826, + "learning_rate": 5.449882453515709e-06, + "loss": 0.2971, + "step": 4179 + }, + { + "epoch": 2.6729031742224354, + "grad_norm": 3.2551393508911133, + "learning_rate": 5.439196409489208e-06, + "loss": 0.3176, + "step": 4180 + }, + { + "epoch": 2.6735428160230272, + "grad_norm": 2.699275016784668, + "learning_rate": 5.428510365462706e-06, + "loss": 0.2939, + "step": 4181 + }, + { + "epoch": 2.6741824578236186, + "grad_norm": 2.2508926391601562, + "learning_rate": 5.417824321436205e-06, + "loss": 0.2436, + "step": 4182 + }, + { + "epoch": 2.6748220996242105, + "grad_norm": 2.7938640117645264, + "learning_rate": 5.407138277409703e-06, + "loss": 0.2834, + "step": 4183 + }, + { + "epoch": 2.675461741424802, + "grad_norm": 2.591665744781494, + "learning_rate": 5.396452233383202e-06, + "loss": 0.2577, + "step": 4184 + }, + { + "epoch": 2.6761013832253937, + "grad_norm": 2.881643295288086, + "learning_rate": 5.3857661893567e-06, + "loss": 0.2878, + "step": 4185 + }, + { + "epoch": 2.6767410250259855, + "grad_norm": 3.2424135208129883, + "learning_rate": 5.375080145330199e-06, + "loss": 0.2822, + "step": 4186 + }, + { + "epoch": 2.6773806668265774, + "grad_norm": 2.951124906539917, + "learning_rate": 5.364394101303698e-06, + "loss": 0.2882, + "step": 4187 + }, + { + "epoch": 2.6780203086271688, + "grad_norm": 2.629875659942627, + "learning_rate": 5.353708057277197e-06, + "loss": 0.2837, + "step": 4188 + }, + { + "epoch": 2.6786599504277606, + "grad_norm": 2.9838707447052, + "learning_rate": 5.343022013250694e-06, + "loss": 0.2696, + "step": 4189 + }, + { + "epoch": 2.679299592228352, + "grad_norm": 3.3642942905426025, + "learning_rate": 5.332335969224194e-06, + "loss": 0.2945, + "step": 4190 + }, + { + "epoch": 2.679939234028944, + "grad_norm": 2.6537294387817383, + "learning_rate": 5.321649925197692e-06, + "loss": 0.2659, + "step": 4191 + }, + { + "epoch": 2.6805788758295357, + "grad_norm": 2.961134672164917, + "learning_rate": 5.310963881171191e-06, + "loss": 0.3021, + "step": 4192 + }, + { + "epoch": 2.681218517630127, + "grad_norm": 2.9832959175109863, + "learning_rate": 5.300277837144689e-06, + "loss": 0.2759, + "step": 4193 + }, + { + "epoch": 2.681858159430719, + "grad_norm": 3.2399749755859375, + "learning_rate": 5.289591793118188e-06, + "loss": 0.2901, + "step": 4194 + }, + { + "epoch": 2.6824978012313103, + "grad_norm": 3.1127965450286865, + "learning_rate": 5.278905749091686e-06, + "loss": 0.2898, + "step": 4195 + }, + { + "epoch": 2.683137443031902, + "grad_norm": 2.775202989578247, + "learning_rate": 5.2682197050651856e-06, + "loss": 0.2701, + "step": 4196 + }, + { + "epoch": 2.683777084832494, + "grad_norm": 3.1817328929901123, + "learning_rate": 5.257533661038683e-06, + "loss": 0.2887, + "step": 4197 + }, + { + "epoch": 2.6844167266330854, + "grad_norm": 2.8848459720611572, + "learning_rate": 5.246847617012183e-06, + "loss": 0.2833, + "step": 4198 + }, + { + "epoch": 2.6850563684336772, + "grad_norm": 3.141246795654297, + "learning_rate": 5.236161572985681e-06, + "loss": 0.3281, + "step": 4199 + }, + { + "epoch": 2.6856960102342686, + "grad_norm": 2.347219467163086, + "learning_rate": 5.22547552895918e-06, + "loss": 0.2425, + "step": 4200 + }, + { + "epoch": 2.6863356520348605, + "grad_norm": 2.8728418350219727, + "learning_rate": 5.214789484932678e-06, + "loss": 0.2825, + "step": 4201 + }, + { + "epoch": 2.6869752938354523, + "grad_norm": 
2.6260263919830322, + "learning_rate": 5.204103440906177e-06, + "loss": 0.2744, + "step": 4202 + }, + { + "epoch": 2.6876149356360437, + "grad_norm": 2.8408377170562744, + "learning_rate": 5.193417396879675e-06, + "loss": 0.2717, + "step": 4203 + }, + { + "epoch": 2.6882545774366355, + "grad_norm": 2.8726351261138916, + "learning_rate": 5.1827313528531745e-06, + "loss": 0.2622, + "step": 4204 + }, + { + "epoch": 2.688894219237227, + "grad_norm": 3.2461130619049072, + "learning_rate": 5.172045308826673e-06, + "loss": 0.3063, + "step": 4205 + }, + { + "epoch": 2.6895338610378188, + "grad_norm": 3.5422470569610596, + "learning_rate": 5.161359264800171e-06, + "loss": 0.3048, + "step": 4206 + }, + { + "epoch": 2.6901735028384106, + "grad_norm": 2.274430513381958, + "learning_rate": 5.15067322077367e-06, + "loss": 0.2461, + "step": 4207 + }, + { + "epoch": 2.690813144639002, + "grad_norm": 3.0163540840148926, + "learning_rate": 5.139987176747169e-06, + "loss": 0.2998, + "step": 4208 + }, + { + "epoch": 2.691452786439594, + "grad_norm": 3.1758873462677, + "learning_rate": 5.129301132720667e-06, + "loss": 0.2838, + "step": 4209 + }, + { + "epoch": 2.6920924282401852, + "grad_norm": 2.705134391784668, + "learning_rate": 5.118615088694166e-06, + "loss": 0.2716, + "step": 4210 + }, + { + "epoch": 2.692732070040777, + "grad_norm": 3.293877124786377, + "learning_rate": 5.107929044667664e-06, + "loss": 0.3109, + "step": 4211 + }, + { + "epoch": 2.693371711841369, + "grad_norm": 2.952885150909424, + "learning_rate": 5.097243000641163e-06, + "loss": 0.2894, + "step": 4212 + }, + { + "epoch": 2.6940113536419608, + "grad_norm": 2.751307725906372, + "learning_rate": 5.086556956614662e-06, + "loss": 0.2919, + "step": 4213 + }, + { + "epoch": 2.694650995442552, + "grad_norm": 2.981849193572998, + "learning_rate": 5.07587091258816e-06, + "loss": 0.2938, + "step": 4214 + }, + { + "epoch": 2.695290637243144, + "grad_norm": 3.51705265045166, + "learning_rate": 5.065184868561658e-06, + "loss": 0.3294, + "step": 4215 + }, + { + "epoch": 2.6959302790437354, + "grad_norm": 3.0744051933288574, + "learning_rate": 5.0544988245351576e-06, + "loss": 0.2863, + "step": 4216 + }, + { + "epoch": 2.6965699208443272, + "grad_norm": 2.6534125804901123, + "learning_rate": 5.043812780508656e-06, + "loss": 0.2721, + "step": 4217 + }, + { + "epoch": 2.697209562644919, + "grad_norm": 3.1245999336242676, + "learning_rate": 5.033126736482155e-06, + "loss": 0.2685, + "step": 4218 + }, + { + "epoch": 2.6978492044455105, + "grad_norm": 3.1842539310455322, + "learning_rate": 5.022440692455653e-06, + "loss": 0.2827, + "step": 4219 + }, + { + "epoch": 2.6984888462461023, + "grad_norm": 2.8639585971832275, + "learning_rate": 5.011754648429152e-06, + "loss": 0.2741, + "step": 4220 + }, + { + "epoch": 2.6991284880466937, + "grad_norm": 3.223266839981079, + "learning_rate": 5.001068604402651e-06, + "loss": 0.2813, + "step": 4221 + }, + { + "epoch": 2.6997681298472855, + "grad_norm": 2.59563946723938, + "learning_rate": 4.990382560376149e-06, + "loss": 0.2703, + "step": 4222 + }, + { + "epoch": 2.7004077716478774, + "grad_norm": 3.1214075088500977, + "learning_rate": 4.979696516349647e-06, + "loss": 0.3095, + "step": 4223 + }, + { + "epoch": 2.7010474134484688, + "grad_norm": 3.323255777359009, + "learning_rate": 4.9690104723231465e-06, + "loss": 0.2836, + "step": 4224 + }, + { + "epoch": 2.7016870552490606, + "grad_norm": 2.9470183849334717, + "learning_rate": 4.958324428296645e-06, + "loss": 0.2959, + "step": 4225 + }, + { + "epoch": 
2.702326697049652, + "grad_norm": 2.841754198074341, + "learning_rate": 4.9476383842701436e-06, + "loss": 0.2668, + "step": 4226 + }, + { + "epoch": 2.702966338850244, + "grad_norm": 2.7331454753875732, + "learning_rate": 4.936952340243642e-06, + "loss": 0.2629, + "step": 4227 + }, + { + "epoch": 2.7036059806508357, + "grad_norm": 3.3476362228393555, + "learning_rate": 4.926266296217141e-06, + "loss": 0.2822, + "step": 4228 + }, + { + "epoch": 2.704245622451427, + "grad_norm": 3.634173631668091, + "learning_rate": 4.915580252190639e-06, + "loss": 0.3238, + "step": 4229 + }, + { + "epoch": 2.704885264252019, + "grad_norm": 2.7092347145080566, + "learning_rate": 4.9048942081641385e-06, + "loss": 0.2771, + "step": 4230 + }, + { + "epoch": 2.7055249060526103, + "grad_norm": 3.4174089431762695, + "learning_rate": 4.894208164137636e-06, + "loss": 0.3338, + "step": 4231 + }, + { + "epoch": 2.706164547853202, + "grad_norm": 3.521181583404541, + "learning_rate": 4.883522120111135e-06, + "loss": 0.313, + "step": 4232 + }, + { + "epoch": 2.706804189653794, + "grad_norm": 3.2000532150268555, + "learning_rate": 4.872836076084634e-06, + "loss": 0.2663, + "step": 4233 + }, + { + "epoch": 2.7074438314543854, + "grad_norm": 3.327611207962036, + "learning_rate": 4.8621500320581325e-06, + "loss": 0.2922, + "step": 4234 + }, + { + "epoch": 2.7080834732549772, + "grad_norm": 2.673038959503174, + "learning_rate": 4.851463988031631e-06, + "loss": 0.2725, + "step": 4235 + }, + { + "epoch": 2.7087231150555686, + "grad_norm": 2.920344829559326, + "learning_rate": 4.8407779440051296e-06, + "loss": 0.2803, + "step": 4236 + }, + { + "epoch": 2.7093627568561605, + "grad_norm": 2.897177219390869, + "learning_rate": 4.830091899978628e-06, + "loss": 0.3026, + "step": 4237 + }, + { + "epoch": 2.7100023986567523, + "grad_norm": 3.0198631286621094, + "learning_rate": 4.819405855952127e-06, + "loss": 0.2877, + "step": 4238 + }, + { + "epoch": 2.710642040457344, + "grad_norm": 2.6498470306396484, + "learning_rate": 4.808719811925625e-06, + "loss": 0.2596, + "step": 4239 + }, + { + "epoch": 2.7112816822579355, + "grad_norm": 3.356194257736206, + "learning_rate": 4.798033767899124e-06, + "loss": 0.3074, + "step": 4240 + }, + { + "epoch": 2.7119213240585274, + "grad_norm": 3.342958450317383, + "learning_rate": 4.787347723872623e-06, + "loss": 0.3235, + "step": 4241 + }, + { + "epoch": 2.7125609658591188, + "grad_norm": 2.8352184295654297, + "learning_rate": 4.7766616798461215e-06, + "loss": 0.2694, + "step": 4242 + }, + { + "epoch": 2.7132006076597106, + "grad_norm": 2.9823131561279297, + "learning_rate": 4.765975635819619e-06, + "loss": 0.2917, + "step": 4243 + }, + { + "epoch": 2.7138402494603024, + "grad_norm": 3.069110870361328, + "learning_rate": 4.7552895917931185e-06, + "loss": 0.3048, + "step": 4244 + }, + { + "epoch": 2.714479891260894, + "grad_norm": 2.6191227436065674, + "learning_rate": 4.744603547766617e-06, + "loss": 0.2805, + "step": 4245 + }, + { + "epoch": 2.7151195330614857, + "grad_norm": 2.5602385997772217, + "learning_rate": 4.7339175037401155e-06, + "loss": 0.2715, + "step": 4246 + }, + { + "epoch": 2.715759174862077, + "grad_norm": 3.137227773666382, + "learning_rate": 4.723231459713615e-06, + "loss": 0.3087, + "step": 4247 + }, + { + "epoch": 2.716398816662669, + "grad_norm": 2.524340867996216, + "learning_rate": 4.712545415687113e-06, + "loss": 0.2628, + "step": 4248 + }, + { + "epoch": 2.7170384584632608, + "grad_norm": 3.0208139419555664, + "learning_rate": 4.701859371660611e-06, + "loss": 0.2891, + 
"step": 4249 + }, + { + "epoch": 2.717678100263852, + "grad_norm": 2.7642719745635986, + "learning_rate": 4.6911733276341105e-06, + "loss": 0.2808, + "step": 4250 + }, + { + "epoch": 2.718317742064444, + "grad_norm": 2.7058794498443604, + "learning_rate": 4.680487283607609e-06, + "loss": 0.2731, + "step": 4251 + }, + { + "epoch": 2.7189573838650354, + "grad_norm": 2.7685511112213135, + "learning_rate": 4.6698012395811075e-06, + "loss": 0.2963, + "step": 4252 + }, + { + "epoch": 2.719597025665627, + "grad_norm": 3.138655662536621, + "learning_rate": 4.659115195554606e-06, + "loss": 0.2915, + "step": 4253 + }, + { + "epoch": 2.720236667466219, + "grad_norm": 2.9547863006591797, + "learning_rate": 4.6484291515281045e-06, + "loss": 0.2812, + "step": 4254 + }, + { + "epoch": 2.7208763092668105, + "grad_norm": 2.6979265213012695, + "learning_rate": 4.637743107501603e-06, + "loss": 0.2777, + "step": 4255 + }, + { + "epoch": 2.7215159510674023, + "grad_norm": 2.6382367610931396, + "learning_rate": 4.6270570634751015e-06, + "loss": 0.2528, + "step": 4256 + }, + { + "epoch": 2.7221555928679937, + "grad_norm": 2.7664802074432373, + "learning_rate": 4.6163710194486e-06, + "loss": 0.2712, + "step": 4257 + }, + { + "epoch": 2.7227952346685855, + "grad_norm": 3.006429672241211, + "learning_rate": 4.605684975422099e-06, + "loss": 0.2812, + "step": 4258 + }, + { + "epoch": 2.7234348764691774, + "grad_norm": 2.851895809173584, + "learning_rate": 4.594998931395598e-06, + "loss": 0.2895, + "step": 4259 + }, + { + "epoch": 2.7240745182697688, + "grad_norm": 2.8017094135284424, + "learning_rate": 4.584312887369096e-06, + "loss": 0.2591, + "step": 4260 + }, + { + "epoch": 2.7247141600703606, + "grad_norm": 3.3680760860443115, + "learning_rate": 4.573626843342595e-06, + "loss": 0.288, + "step": 4261 + }, + { + "epoch": 2.725353801870952, + "grad_norm": 2.263298988342285, + "learning_rate": 4.5629407993160935e-06, + "loss": 0.2514, + "step": 4262 + }, + { + "epoch": 2.725993443671544, + "grad_norm": 2.707742691040039, + "learning_rate": 4.552254755289592e-06, + "loss": 0.2798, + "step": 4263 + }, + { + "epoch": 2.7266330854721357, + "grad_norm": 2.7986104488372803, + "learning_rate": 4.5415687112630905e-06, + "loss": 0.3024, + "step": 4264 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 2.538247585296631, + "learning_rate": 4.530882667236589e-06, + "loss": 0.2689, + "step": 4265 + }, + { + "epoch": 2.727912369073319, + "grad_norm": 2.6567046642303467, + "learning_rate": 4.5201966232100875e-06, + "loss": 0.2737, + "step": 4266 + }, + { + "epoch": 2.7285520108739107, + "grad_norm": 2.814138174057007, + "learning_rate": 4.509510579183587e-06, + "loss": 0.2854, + "step": 4267 + }, + { + "epoch": 2.729191652674502, + "grad_norm": 3.2301576137542725, + "learning_rate": 4.498824535157085e-06, + "loss": 0.3337, + "step": 4268 + }, + { + "epoch": 2.729831294475094, + "grad_norm": 3.3410801887512207, + "learning_rate": 4.488138491130583e-06, + "loss": 0.3102, + "step": 4269 + }, + { + "epoch": 2.730470936275686, + "grad_norm": 2.9629862308502197, + "learning_rate": 4.4774524471040824e-06, + "loss": 0.3011, + "step": 4270 + }, + { + "epoch": 2.731110578076277, + "grad_norm": 2.753415822982788, + "learning_rate": 4.466766403077581e-06, + "loss": 0.3009, + "step": 4271 + }, + { + "epoch": 2.731750219876869, + "grad_norm": 2.6558845043182373, + "learning_rate": 4.4560803590510795e-06, + "loss": 0.2821, + "step": 4272 + }, + { + "epoch": 2.7323898616774605, + "grad_norm": 2.7958014011383057, + "learning_rate": 
4.445394315024578e-06, + "loss": 0.2767, + "step": 4273 + }, + { + "epoch": 2.7330295034780523, + "grad_norm": 2.5357003211975098, + "learning_rate": 4.4347082709980765e-06, + "loss": 0.2557, + "step": 4274 + }, + { + "epoch": 2.733669145278644, + "grad_norm": 2.817138671875, + "learning_rate": 4.424022226971576e-06, + "loss": 0.2833, + "step": 4275 + }, + { + "epoch": 2.7343087870792355, + "grad_norm": 2.883695363998413, + "learning_rate": 4.413336182945074e-06, + "loss": 0.2795, + "step": 4276 + }, + { + "epoch": 2.7349484288798274, + "grad_norm": 3.019054651260376, + "learning_rate": 4.402650138918572e-06, + "loss": 0.2828, + "step": 4277 + }, + { + "epoch": 2.7355880706804188, + "grad_norm": 2.4605958461761475, + "learning_rate": 4.391964094892071e-06, + "loss": 0.2547, + "step": 4278 + }, + { + "epoch": 2.7362277124810106, + "grad_norm": 2.8073604106903076, + "learning_rate": 4.38127805086557e-06, + "loss": 0.2618, + "step": 4279 + }, + { + "epoch": 2.7368673542816024, + "grad_norm": 3.1890933513641357, + "learning_rate": 4.3705920068390684e-06, + "loss": 0.3149, + "step": 4280 + }, + { + "epoch": 2.737506996082194, + "grad_norm": 2.758699655532837, + "learning_rate": 4.359905962812567e-06, + "loss": 0.2642, + "step": 4281 + }, + { + "epoch": 2.7381466378827857, + "grad_norm": 3.0185227394104004, + "learning_rate": 4.3492199187860655e-06, + "loss": 0.302, + "step": 4282 + }, + { + "epoch": 2.738786279683377, + "grad_norm": 2.9913456439971924, + "learning_rate": 4.338533874759564e-06, + "loss": 0.2947, + "step": 4283 + }, + { + "epoch": 2.739425921483969, + "grad_norm": 3.417546272277832, + "learning_rate": 4.327847830733063e-06, + "loss": 0.3232, + "step": 4284 + }, + { + "epoch": 2.7400655632845607, + "grad_norm": 3.3433175086975098, + "learning_rate": 4.317161786706561e-06, + "loss": 0.3161, + "step": 4285 + }, + { + "epoch": 2.7407052050851526, + "grad_norm": 3.0465805530548096, + "learning_rate": 4.3064757426800595e-06, + "loss": 0.2781, + "step": 4286 + }, + { + "epoch": 2.741344846885744, + "grad_norm": 3.096892833709717, + "learning_rate": 4.295789698653559e-06, + "loss": 0.2965, + "step": 4287 + }, + { + "epoch": 2.7419844886863354, + "grad_norm": 3.2183213233947754, + "learning_rate": 4.285103654627057e-06, + "loss": 0.2968, + "step": 4288 + }, + { + "epoch": 2.742624130486927, + "grad_norm": 3.0616166591644287, + "learning_rate": 4.274417610600556e-06, + "loss": 0.2947, + "step": 4289 + }, + { + "epoch": 2.743263772287519, + "grad_norm": 3.0991125106811523, + "learning_rate": 4.2637315665740544e-06, + "loss": 0.289, + "step": 4290 + }, + { + "epoch": 2.743903414088111, + "grad_norm": 2.7623724937438965, + "learning_rate": 4.253045522547553e-06, + "loss": 0.2643, + "step": 4291 + }, + { + "epoch": 2.7445430558887023, + "grad_norm": 2.611386299133301, + "learning_rate": 4.2423594785210515e-06, + "loss": 0.2585, + "step": 4292 + }, + { + "epoch": 2.745182697689294, + "grad_norm": 3.471163511276245, + "learning_rate": 4.231673434494551e-06, + "loss": 0.3117, + "step": 4293 + }, + { + "epoch": 2.7458223394898855, + "grad_norm": 3.1270809173583984, + "learning_rate": 4.2209873904680485e-06, + "loss": 0.2695, + "step": 4294 + }, + { + "epoch": 2.7464619812904774, + "grad_norm": 3.1040518283843994, + "learning_rate": 4.210301346441548e-06, + "loss": 0.2915, + "step": 4295 + }, + { + "epoch": 2.747101623091069, + "grad_norm": 3.2834489345550537, + "learning_rate": 4.199615302415046e-06, + "loss": 0.3236, + "step": 4296 + }, + { + "epoch": 2.7477412648916606, + "grad_norm": 
2.616297483444214, + "learning_rate": 4.188929258388545e-06, + "loss": 0.2593, + "step": 4297 + }, + { + "epoch": 2.7483809066922524, + "grad_norm": 3.05403208732605, + "learning_rate": 4.178243214362043e-06, + "loss": 0.2999, + "step": 4298 + }, + { + "epoch": 2.749020548492844, + "grad_norm": 2.8387467861175537, + "learning_rate": 4.167557170335542e-06, + "loss": 0.2715, + "step": 4299 + }, + { + "epoch": 2.7496601902934357, + "grad_norm": 3.0170047283172607, + "learning_rate": 4.1568711263090404e-06, + "loss": 0.3102, + "step": 4300 + }, + { + "epoch": 2.7502998320940275, + "grad_norm": 3.2853238582611084, + "learning_rate": 4.14618508228254e-06, + "loss": 0.2833, + "step": 4301 + }, + { + "epoch": 2.750939473894619, + "grad_norm": 2.645761489868164, + "learning_rate": 4.1354990382560375e-06, + "loss": 0.2832, + "step": 4302 + }, + { + "epoch": 2.7515791156952107, + "grad_norm": 2.951225757598877, + "learning_rate": 4.124812994229536e-06, + "loss": 0.2751, + "step": 4303 + }, + { + "epoch": 2.752218757495802, + "grad_norm": 3.365863561630249, + "learning_rate": 4.114126950203035e-06, + "loss": 0.2885, + "step": 4304 + }, + { + "epoch": 2.752858399296394, + "grad_norm": 2.9383087158203125, + "learning_rate": 4.103440906176534e-06, + "loss": 0.2691, + "step": 4305 + }, + { + "epoch": 2.753498041096986, + "grad_norm": 2.956564426422119, + "learning_rate": 4.092754862150032e-06, + "loss": 0.2851, + "step": 4306 + }, + { + "epoch": 2.754137682897577, + "grad_norm": 2.7642822265625, + "learning_rate": 4.082068818123531e-06, + "loss": 0.2559, + "step": 4307 + }, + { + "epoch": 2.754777324698169, + "grad_norm": 2.8226330280303955, + "learning_rate": 4.071382774097029e-06, + "loss": 0.2947, + "step": 4308 + }, + { + "epoch": 2.7554169664987604, + "grad_norm": 2.5690696239471436, + "learning_rate": 4.060696730070528e-06, + "loss": 0.278, + "step": 4309 + }, + { + "epoch": 2.7560566082993523, + "grad_norm": 2.728264331817627, + "learning_rate": 4.0500106860440264e-06, + "loss": 0.2755, + "step": 4310 + }, + { + "epoch": 2.756696250099944, + "grad_norm": 3.820427417755127, + "learning_rate": 4.039324642017525e-06, + "loss": 0.328, + "step": 4311 + }, + { + "epoch": 2.757335891900536, + "grad_norm": 3.085919141769409, + "learning_rate": 4.028638597991024e-06, + "loss": 0.3017, + "step": 4312 + }, + { + "epoch": 2.7579755337011274, + "grad_norm": 2.4799747467041016, + "learning_rate": 4.017952553964523e-06, + "loss": 0.2752, + "step": 4313 + }, + { + "epoch": 2.758615175501719, + "grad_norm": 2.7526347637176514, + "learning_rate": 4.007266509938021e-06, + "loss": 0.2852, + "step": 4314 + }, + { + "epoch": 2.7592548173023106, + "grad_norm": 3.2440788745880127, + "learning_rate": 3.99658046591152e-06, + "loss": 0.2774, + "step": 4315 + }, + { + "epoch": 2.7598944591029024, + "grad_norm": 3.0366628170013428, + "learning_rate": 3.985894421885018e-06, + "loss": 0.2959, + "step": 4316 + }, + { + "epoch": 2.7605341009034943, + "grad_norm": 3.4696261882781982, + "learning_rate": 3.975208377858517e-06, + "loss": 0.3348, + "step": 4317 + }, + { + "epoch": 2.7611737427040857, + "grad_norm": 2.8783366680145264, + "learning_rate": 3.964522333832016e-06, + "loss": 0.2849, + "step": 4318 + }, + { + "epoch": 2.7618133845046775, + "grad_norm": 3.1641640663146973, + "learning_rate": 3.953836289805514e-06, + "loss": 0.3092, + "step": 4319 + }, + { + "epoch": 2.762453026305269, + "grad_norm": 2.837355375289917, + "learning_rate": 3.9431502457790124e-06, + "loss": 0.2783, + "step": 4320 + }, + { + "epoch": 
2.7630926681058607, + "grad_norm": 3.71797251701355, + "learning_rate": 3.932464201752512e-06, + "loss": 0.2831, + "step": 4321 + }, + { + "epoch": 2.7637323099064526, + "grad_norm": 2.7682456970214844, + "learning_rate": 3.92177815772601e-06, + "loss": 0.2827, + "step": 4322 + }, + { + "epoch": 2.764371951707044, + "grad_norm": 2.8644909858703613, + "learning_rate": 3.911092113699508e-06, + "loss": 0.2739, + "step": 4323 + }, + { + "epoch": 2.765011593507636, + "grad_norm": 2.6375932693481445, + "learning_rate": 3.900406069673007e-06, + "loss": 0.2696, + "step": 4324 + }, + { + "epoch": 2.765651235308227, + "grad_norm": 2.5685291290283203, + "learning_rate": 3.889720025646506e-06, + "loss": 0.2862, + "step": 4325 + }, + { + "epoch": 2.766290877108819, + "grad_norm": 2.8904924392700195, + "learning_rate": 3.879033981620004e-06, + "loss": 0.2928, + "step": 4326 + }, + { + "epoch": 2.766930518909411, + "grad_norm": 2.633225679397583, + "learning_rate": 3.868347937593503e-06, + "loss": 0.2808, + "step": 4327 + }, + { + "epoch": 2.7675701607100023, + "grad_norm": 2.960737466812134, + "learning_rate": 3.857661893567001e-06, + "loss": 0.3142, + "step": 4328 + }, + { + "epoch": 2.768209802510594, + "grad_norm": 2.710519552230835, + "learning_rate": 3.846975849540501e-06, + "loss": 0.2606, + "step": 4329 + }, + { + "epoch": 2.7688494443111855, + "grad_norm": 3.3091506958007812, + "learning_rate": 3.836289805513999e-06, + "loss": 0.3251, + "step": 4330 + }, + { + "epoch": 2.7694890861117774, + "grad_norm": 3.3194949626922607, + "learning_rate": 3.825603761487497e-06, + "loss": 0.2808, + "step": 4331 + }, + { + "epoch": 2.770128727912369, + "grad_norm": 3.1072676181793213, + "learning_rate": 3.814917717460996e-06, + "loss": 0.2722, + "step": 4332 + }, + { + "epoch": 2.7707683697129606, + "grad_norm": 2.84649920463562, + "learning_rate": 3.804231673434495e-06, + "loss": 0.2841, + "step": 4333 + }, + { + "epoch": 2.7714080115135524, + "grad_norm": 2.699164867401123, + "learning_rate": 3.7935456294079933e-06, + "loss": 0.2688, + "step": 4334 + }, + { + "epoch": 2.772047653314144, + "grad_norm": 2.55708384513855, + "learning_rate": 3.7828595853814923e-06, + "loss": 0.2679, + "step": 4335 + }, + { + "epoch": 2.7726872951147357, + "grad_norm": 3.57153058052063, + "learning_rate": 3.7721735413549904e-06, + "loss": 0.3103, + "step": 4336 + }, + { + "epoch": 2.7733269369153275, + "grad_norm": 2.890146493911743, + "learning_rate": 3.7614874973284893e-06, + "loss": 0.2904, + "step": 4337 + }, + { + "epoch": 2.7739665787159193, + "grad_norm": 2.9208571910858154, + "learning_rate": 3.750801453301988e-06, + "loss": 0.2812, + "step": 4338 + }, + { + "epoch": 2.7746062205165107, + "grad_norm": 2.7111175060272217, + "learning_rate": 3.7401154092754868e-06, + "loss": 0.2941, + "step": 4339 + }, + { + "epoch": 2.7752458623171026, + "grad_norm": 3.1619319915771484, + "learning_rate": 3.729429365248985e-06, + "loss": 0.2947, + "step": 4340 + }, + { + "epoch": 2.775885504117694, + "grad_norm": 2.8458549976348877, + "learning_rate": 3.7187433212224834e-06, + "loss": 0.2706, + "step": 4341 + }, + { + "epoch": 2.776525145918286, + "grad_norm": 2.991636037826538, + "learning_rate": 3.7080572771959823e-06, + "loss": 0.3078, + "step": 4342 + }, + { + "epoch": 2.7771647877188776, + "grad_norm": 2.2828590869903564, + "learning_rate": 3.6973712331694812e-06, + "loss": 0.2424, + "step": 4343 + }, + { + "epoch": 2.777804429519469, + "grad_norm": 2.745360851287842, + "learning_rate": 3.6866851891429793e-06, + "loss": 0.2866, + 
"step": 4344 + }, + { + "epoch": 2.778444071320061, + "grad_norm": 2.9150893688201904, + "learning_rate": 3.675999145116478e-06, + "loss": 0.2782, + "step": 4345 + }, + { + "epoch": 2.7790837131206523, + "grad_norm": 2.833770990371704, + "learning_rate": 3.6653131010899768e-06, + "loss": 0.2686, + "step": 4346 + }, + { + "epoch": 2.779723354921244, + "grad_norm": 2.651599407196045, + "learning_rate": 3.6546270570634757e-06, + "loss": 0.2562, + "step": 4347 + }, + { + "epoch": 2.780362996721836, + "grad_norm": 2.827512502670288, + "learning_rate": 3.643941013036974e-06, + "loss": 0.2857, + "step": 4348 + }, + { + "epoch": 2.7810026385224274, + "grad_norm": 3.3250107765197754, + "learning_rate": 3.6332549690104723e-06, + "loss": 0.2835, + "step": 4349 + }, + { + "epoch": 2.781642280323019, + "grad_norm": 3.0457489490509033, + "learning_rate": 3.6225689249839713e-06, + "loss": 0.3035, + "step": 4350 + }, + { + "epoch": 2.7822819221236106, + "grad_norm": 3.0998637676239014, + "learning_rate": 3.6118828809574698e-06, + "loss": 0.3205, + "step": 4351 + }, + { + "epoch": 2.7829215639242024, + "grad_norm": 3.358431100845337, + "learning_rate": 3.601196836930968e-06, + "loss": 0.3037, + "step": 4352 + }, + { + "epoch": 2.7835612057247943, + "grad_norm": 2.650522470474243, + "learning_rate": 3.590510792904467e-06, + "loss": 0.2775, + "step": 4353 + }, + { + "epoch": 2.7842008475253857, + "grad_norm": 4.187964916229248, + "learning_rate": 3.5798247488779657e-06, + "loss": 0.3216, + "step": 4354 + }, + { + "epoch": 2.7848404893259775, + "grad_norm": 3.1567907333374023, + "learning_rate": 3.5691387048514643e-06, + "loss": 0.2811, + "step": 4355 + }, + { + "epoch": 2.785480131126569, + "grad_norm": 3.484435796737671, + "learning_rate": 3.5584526608249624e-06, + "loss": 0.2766, + "step": 4356 + }, + { + "epoch": 2.7861197729271607, + "grad_norm": 3.3332550525665283, + "learning_rate": 3.5477666167984613e-06, + "loss": 0.3362, + "step": 4357 + }, + { + "epoch": 2.7867594147277526, + "grad_norm": 2.9056642055511475, + "learning_rate": 3.53708057277196e-06, + "loss": 0.2717, + "step": 4358 + }, + { + "epoch": 2.787399056528344, + "grad_norm": 3.070425510406494, + "learning_rate": 3.5263945287454587e-06, + "loss": 0.302, + "step": 4359 + }, + { + "epoch": 2.788038698328936, + "grad_norm": 3.1967034339904785, + "learning_rate": 3.5157084847189577e-06, + "loss": 0.2879, + "step": 4360 + }, + { + "epoch": 2.788678340129527, + "grad_norm": 3.0473732948303223, + "learning_rate": 3.5050224406924558e-06, + "loss": 0.3098, + "step": 4361 + }, + { + "epoch": 2.789317981930119, + "grad_norm": 2.8728370666503906, + "learning_rate": 3.4943363966659543e-06, + "loss": 0.2813, + "step": 4362 + }, + { + "epoch": 2.789957623730711, + "grad_norm": 3.050708055496216, + "learning_rate": 3.4836503526394532e-06, + "loss": 0.2851, + "step": 4363 + }, + { + "epoch": 2.7905972655313027, + "grad_norm": 3.4656426906585693, + "learning_rate": 3.4729643086129517e-06, + "loss": 0.2976, + "step": 4364 + }, + { + "epoch": 2.791236907331894, + "grad_norm": 2.9025158882141113, + "learning_rate": 3.46227826458645e-06, + "loss": 0.3071, + "step": 4365 + }, + { + "epoch": 2.791876549132486, + "grad_norm": 3.0199522972106934, + "learning_rate": 3.4515922205599488e-06, + "loss": 0.3034, + "step": 4366 + }, + { + "epoch": 2.7925161909330773, + "grad_norm": 2.7764556407928467, + "learning_rate": 3.4409061765334477e-06, + "loss": 0.2871, + "step": 4367 + }, + { + "epoch": 2.793155832733669, + "grad_norm": 2.6626172065734863, + "learning_rate": 
3.4302201325069462e-06, + "loss": 0.2799, + "step": 4368 + }, + { + "epoch": 2.793795474534261, + "grad_norm": 2.648641347885132, + "learning_rate": 3.4195340884804443e-06, + "loss": 0.2761, + "step": 4369 + }, + { + "epoch": 2.7944351163348524, + "grad_norm": 2.799649715423584, + "learning_rate": 3.4088480444539433e-06, + "loss": 0.2965, + "step": 4370 + }, + { + "epoch": 2.7950747581354443, + "grad_norm": 3.063737392425537, + "learning_rate": 3.398162000427442e-06, + "loss": 0.3055, + "step": 4371 + }, + { + "epoch": 2.7957143999360357, + "grad_norm": 3.050278663635254, + "learning_rate": 3.3874759564009407e-06, + "loss": 0.2963, + "step": 4372 + }, + { + "epoch": 2.7963540417366275, + "grad_norm": 3.035537004470825, + "learning_rate": 3.376789912374439e-06, + "loss": 0.2972, + "step": 4373 + }, + { + "epoch": 2.7969936835372193, + "grad_norm": 2.829928398132324, + "learning_rate": 3.3661038683479377e-06, + "loss": 0.2796, + "step": 4374 + }, + { + "epoch": 2.7976333253378107, + "grad_norm": 3.306140899658203, + "learning_rate": 3.3554178243214363e-06, + "loss": 0.3257, + "step": 4375 + }, + { + "epoch": 2.7982729671384026, + "grad_norm": 3.1015918254852295, + "learning_rate": 3.344731780294935e-06, + "loss": 0.2802, + "step": 4376 + }, + { + "epoch": 2.798912608938994, + "grad_norm": 2.5656542778015137, + "learning_rate": 3.3340457362684333e-06, + "loss": 0.2749, + "step": 4377 + }, + { + "epoch": 2.799552250739586, + "grad_norm": 3.343717098236084, + "learning_rate": 3.3233596922419322e-06, + "loss": 0.3091, + "step": 4378 + }, + { + "epoch": 2.8001918925401776, + "grad_norm": 3.0400617122650146, + "learning_rate": 3.3126736482154307e-06, + "loss": 0.3018, + "step": 4379 + }, + { + "epoch": 2.800831534340769, + "grad_norm": 3.060997247695923, + "learning_rate": 3.3019876041889297e-06, + "loss": 0.2844, + "step": 4380 + }, + { + "epoch": 2.801471176141361, + "grad_norm": 2.625800848007202, + "learning_rate": 3.291301560162428e-06, + "loss": 0.2574, + "step": 4381 + }, + { + "epoch": 2.8021108179419523, + "grad_norm": 2.535931348800659, + "learning_rate": 3.2806155161359263e-06, + "loss": 0.249, + "step": 4382 + }, + { + "epoch": 2.802750459742544, + "grad_norm": 2.6953437328338623, + "learning_rate": 3.2699294721094252e-06, + "loss": 0.2939, + "step": 4383 + }, + { + "epoch": 2.803390101543136, + "grad_norm": 2.51912522315979, + "learning_rate": 3.259243428082924e-06, + "loss": 0.2707, + "step": 4384 + }, + { + "epoch": 2.804029743343728, + "grad_norm": 3.424211263656616, + "learning_rate": 3.2485573840564227e-06, + "loss": 0.3125, + "step": 4385 + }, + { + "epoch": 2.804669385144319, + "grad_norm": 2.679070472717285, + "learning_rate": 3.2378713400299208e-06, + "loss": 0.2781, + "step": 4386 + }, + { + "epoch": 2.8053090269449106, + "grad_norm": 2.960639238357544, + "learning_rate": 3.2271852960034197e-06, + "loss": 0.2779, + "step": 4387 + }, + { + "epoch": 2.8059486687455024, + "grad_norm": 2.8190581798553467, + "learning_rate": 3.2164992519769182e-06, + "loss": 0.2968, + "step": 4388 + }, + { + "epoch": 2.8065883105460943, + "grad_norm": 2.830000400543213, + "learning_rate": 3.205813207950417e-06, + "loss": 0.2794, + "step": 4389 + }, + { + "epoch": 2.807227952346686, + "grad_norm": 3.1681251525878906, + "learning_rate": 3.1951271639239152e-06, + "loss": 0.2954, + "step": 4390 + }, + { + "epoch": 2.8078675941472775, + "grad_norm": 3.336529493331909, + "learning_rate": 3.184441119897414e-06, + "loss": 0.3086, + "step": 4391 + }, + { + "epoch": 2.8085072359478693, + "grad_norm": 
2.824638605117798, + "learning_rate": 3.1737550758709127e-06, + "loss": 0.3031, + "step": 4392 + }, + { + "epoch": 2.8091468777484607, + "grad_norm": 2.7733805179595947, + "learning_rate": 3.1630690318444116e-06, + "loss": 0.2981, + "step": 4393 + }, + { + "epoch": 2.8097865195490526, + "grad_norm": 2.6935651302337646, + "learning_rate": 3.1523829878179097e-06, + "loss": 0.2525, + "step": 4394 + }, + { + "epoch": 2.8104261613496444, + "grad_norm": 2.861525058746338, + "learning_rate": 3.1416969437914082e-06, + "loss": 0.2942, + "step": 4395 + }, + { + "epoch": 2.811065803150236, + "grad_norm": 2.9324049949645996, + "learning_rate": 3.131010899764907e-06, + "loss": 0.275, + "step": 4396 + }, + { + "epoch": 2.8117054449508276, + "grad_norm": 3.4254677295684814, + "learning_rate": 3.1203248557384057e-06, + "loss": 0.3047, + "step": 4397 + }, + { + "epoch": 2.812345086751419, + "grad_norm": 2.902474880218506, + "learning_rate": 3.1096388117119042e-06, + "loss": 0.2991, + "step": 4398 + }, + { + "epoch": 2.812984728552011, + "grad_norm": 3.1528453826904297, + "learning_rate": 3.0989527676854027e-06, + "loss": 0.2864, + "step": 4399 + }, + { + "epoch": 2.8136243703526027, + "grad_norm": 2.527876138687134, + "learning_rate": 3.0882667236589017e-06, + "loss": 0.2738, + "step": 4400 + }, + { + "epoch": 2.814264012153194, + "grad_norm": 2.8778088092803955, + "learning_rate": 3.0775806796324006e-06, + "loss": 0.2776, + "step": 4401 + }, + { + "epoch": 2.814903653953786, + "grad_norm": 2.662203311920166, + "learning_rate": 3.0668946356058987e-06, + "loss": 0.2826, + "step": 4402 + }, + { + "epoch": 2.8155432957543773, + "grad_norm": 2.9346282482147217, + "learning_rate": 3.0562085915793976e-06, + "loss": 0.2772, + "step": 4403 + }, + { + "epoch": 2.816182937554969, + "grad_norm": 3.0452077388763428, + "learning_rate": 3.045522547552896e-06, + "loss": 0.3035, + "step": 4404 + }, + { + "epoch": 2.816822579355561, + "grad_norm": 2.934515953063965, + "learning_rate": 3.0348365035263947e-06, + "loss": 0.2905, + "step": 4405 + }, + { + "epoch": 2.8174622211561524, + "grad_norm": 3.2118186950683594, + "learning_rate": 3.024150459499893e-06, + "loss": 0.3183, + "step": 4406 + }, + { + "epoch": 2.8181018629567443, + "grad_norm": 2.4275498390197754, + "learning_rate": 3.013464415473392e-06, + "loss": 0.2464, + "step": 4407 + }, + { + "epoch": 2.8187415047573356, + "grad_norm": 2.818660020828247, + "learning_rate": 3.0027783714468906e-06, + "loss": 0.259, + "step": 4408 + }, + { + "epoch": 2.8193811465579275, + "grad_norm": 2.9263291358947754, + "learning_rate": 2.992092327420389e-06, + "loss": 0.2972, + "step": 4409 + }, + { + "epoch": 2.8200207883585193, + "grad_norm": 2.783811569213867, + "learning_rate": 2.9814062833938877e-06, + "loss": 0.2654, + "step": 4410 + }, + { + "epoch": 2.820660430159111, + "grad_norm": 2.725036144256592, + "learning_rate": 2.9707202393673866e-06, + "loss": 0.2806, + "step": 4411 + }, + { + "epoch": 2.8213000719597026, + "grad_norm": 3.0917141437530518, + "learning_rate": 2.9600341953408847e-06, + "loss": 0.3035, + "step": 4412 + }, + { + "epoch": 2.8219397137602944, + "grad_norm": 3.1890900135040283, + "learning_rate": 2.9493481513143836e-06, + "loss": 0.3042, + "step": 4413 + }, + { + "epoch": 2.822579355560886, + "grad_norm": 2.684833526611328, + "learning_rate": 2.938662107287882e-06, + "loss": 0.2932, + "step": 4414 + }, + { + "epoch": 2.8232189973614776, + "grad_norm": 3.2052884101867676, + "learning_rate": 2.9279760632613807e-06, + "loss": 0.3093, + "step": 4415 + }, + { + 
"epoch": 2.8238586391620695, + "grad_norm": 2.8202271461486816, + "learning_rate": 2.917290019234879e-06, + "loss": 0.2974, + "step": 4416 + }, + { + "epoch": 2.824498280962661, + "grad_norm": 3.0298025608062744, + "learning_rate": 2.906603975208378e-06, + "loss": 0.2873, + "step": 4417 + }, + { + "epoch": 2.8251379227632527, + "grad_norm": 3.641127109527588, + "learning_rate": 2.8959179311818766e-06, + "loss": 0.2691, + "step": 4418 + }, + { + "epoch": 2.825777564563844, + "grad_norm": 2.972038507461548, + "learning_rate": 2.885231887155375e-06, + "loss": 0.2884, + "step": 4419 + }, + { + "epoch": 2.826417206364436, + "grad_norm": 2.6751515865325928, + "learning_rate": 2.8745458431288737e-06, + "loss": 0.2743, + "step": 4420 + }, + { + "epoch": 2.827056848165028, + "grad_norm": 3.1087398529052734, + "learning_rate": 2.8638597991023726e-06, + "loss": 0.3031, + "step": 4421 + }, + { + "epoch": 2.827696489965619, + "grad_norm": 2.8744654655456543, + "learning_rate": 2.8531737550758707e-06, + "loss": 0.2735, + "step": 4422 + }, + { + "epoch": 2.828336131766211, + "grad_norm": 2.2815442085266113, + "learning_rate": 2.8424877110493696e-06, + "loss": 0.2462, + "step": 4423 + }, + { + "epoch": 2.8289757735668024, + "grad_norm": 2.687335252761841, + "learning_rate": 2.8318016670228686e-06, + "loss": 0.2748, + "step": 4424 + }, + { + "epoch": 2.8296154153673942, + "grad_norm": 2.5950405597686768, + "learning_rate": 2.8211156229963667e-06, + "loss": 0.2687, + "step": 4425 + }, + { + "epoch": 2.830255057167986, + "grad_norm": 2.342142105102539, + "learning_rate": 2.8104295789698656e-06, + "loss": 0.2572, + "step": 4426 + }, + { + "epoch": 2.8308946989685775, + "grad_norm": 2.861114740371704, + "learning_rate": 2.799743534943364e-06, + "loss": 0.2811, + "step": 4427 + }, + { + "epoch": 2.8315343407691693, + "grad_norm": 2.8731443881988525, + "learning_rate": 2.789057490916863e-06, + "loss": 0.283, + "step": 4428 + }, + { + "epoch": 2.8321739825697607, + "grad_norm": 3.097060441970825, + "learning_rate": 2.778371446890361e-06, + "loss": 0.3096, + "step": 4429 + }, + { + "epoch": 2.8328136243703526, + "grad_norm": 4.110934257507324, + "learning_rate": 2.76768540286386e-06, + "loss": 0.3314, + "step": 4430 + }, + { + "epoch": 2.8334532661709444, + "grad_norm": 2.848787546157837, + "learning_rate": 2.7569993588373586e-06, + "loss": 0.2873, + "step": 4431 + }, + { + "epoch": 2.834092907971536, + "grad_norm": 2.7672512531280518, + "learning_rate": 2.746313314810857e-06, + "loss": 0.2787, + "step": 4432 + }, + { + "epoch": 2.8347325497721276, + "grad_norm": 2.677736282348633, + "learning_rate": 2.7356272707843556e-06, + "loss": 0.2816, + "step": 4433 + }, + { + "epoch": 2.835372191572719, + "grad_norm": 3.465515613555908, + "learning_rate": 2.7249412267578546e-06, + "loss": 0.2938, + "step": 4434 + }, + { + "epoch": 2.836011833373311, + "grad_norm": 2.5248990058898926, + "learning_rate": 2.714255182731353e-06, + "loss": 0.2536, + "step": 4435 + }, + { + "epoch": 2.8366514751739027, + "grad_norm": 2.5295493602752686, + "learning_rate": 2.7035691387048516e-06, + "loss": 0.2795, + "step": 4436 + }, + { + "epoch": 2.8372911169744945, + "grad_norm": 2.877410888671875, + "learning_rate": 2.69288309467835e-06, + "loss": 0.2868, + "step": 4437 + }, + { + "epoch": 2.837930758775086, + "grad_norm": 3.1989567279815674, + "learning_rate": 2.682197050651849e-06, + "loss": 0.2704, + "step": 4438 + }, + { + "epoch": 2.8385704005756778, + "grad_norm": 2.678420305252075, + "learning_rate": 2.671511006625347e-06, + "loss": 
0.2627, + "step": 4439 + }, + { + "epoch": 2.839210042376269, + "grad_norm": 2.7687106132507324, + "learning_rate": 2.660824962598846e-06, + "loss": 0.2703, + "step": 4440 + }, + { + "epoch": 2.839849684176861, + "grad_norm": 2.8759844303131104, + "learning_rate": 2.6501389185723446e-06, + "loss": 0.2849, + "step": 4441 + }, + { + "epoch": 2.840489325977453, + "grad_norm": 2.9912939071655273, + "learning_rate": 2.639452874545843e-06, + "loss": 0.2894, + "step": 4442 + }, + { + "epoch": 2.8411289677780442, + "grad_norm": 3.125586748123169, + "learning_rate": 2.6287668305193416e-06, + "loss": 0.2847, + "step": 4443 + }, + { + "epoch": 2.841768609578636, + "grad_norm": 3.2092106342315674, + "learning_rate": 2.6180807864928406e-06, + "loss": 0.3125, + "step": 4444 + }, + { + "epoch": 2.8424082513792275, + "grad_norm": 2.9209115505218506, + "learning_rate": 2.607394742466339e-06, + "loss": 0.3, + "step": 4445 + }, + { + "epoch": 2.8430478931798193, + "grad_norm": 3.406400442123413, + "learning_rate": 2.5967086984398376e-06, + "loss": 0.3029, + "step": 4446 + }, + { + "epoch": 2.843687534980411, + "grad_norm": 2.841463804244995, + "learning_rate": 2.5860226544133365e-06, + "loss": 0.2708, + "step": 4447 + }, + { + "epoch": 2.8443271767810026, + "grad_norm": 2.6019279956817627, + "learning_rate": 2.575336610386835e-06, + "loss": 0.2326, + "step": 4448 + }, + { + "epoch": 2.8449668185815944, + "grad_norm": 3.1764168739318848, + "learning_rate": 2.5646505663603336e-06, + "loss": 0.2887, + "step": 4449 + }, + { + "epoch": 2.845606460382186, + "grad_norm": 3.4707717895507812, + "learning_rate": 2.553964522333832e-06, + "loss": 0.3073, + "step": 4450 + }, + { + "epoch": 2.8462461021827776, + "grad_norm": 3.472691297531128, + "learning_rate": 2.543278478307331e-06, + "loss": 0.2948, + "step": 4451 + }, + { + "epoch": 2.8468857439833695, + "grad_norm": 3.0213184356689453, + "learning_rate": 2.532592434280829e-06, + "loss": 0.3179, + "step": 4452 + }, + { + "epoch": 2.847525385783961, + "grad_norm": 2.8417153358459473, + "learning_rate": 2.521906390254328e-06, + "loss": 0.289, + "step": 4453 + }, + { + "epoch": 2.8481650275845527, + "grad_norm": 3.0796499252319336, + "learning_rate": 2.5112203462278266e-06, + "loss": 0.2905, + "step": 4454 + }, + { + "epoch": 2.848804669385144, + "grad_norm": 3.237510919570923, + "learning_rate": 2.5005343022013255e-06, + "loss": 0.2824, + "step": 4455 + }, + { + "epoch": 2.849444311185736, + "grad_norm": 2.8167152404785156, + "learning_rate": 2.4898482581748236e-06, + "loss": 0.267, + "step": 4456 + }, + { + "epoch": 2.8500839529863278, + "grad_norm": 3.1566784381866455, + "learning_rate": 2.4791622141483225e-06, + "loss": 0.2993, + "step": 4457 + }, + { + "epoch": 2.850723594786919, + "grad_norm": 3.1024036407470703, + "learning_rate": 2.468476170121821e-06, + "loss": 0.2976, + "step": 4458 + }, + { + "epoch": 2.851363236587511, + "grad_norm": 3.156390428543091, + "learning_rate": 2.4577901260953196e-06, + "loss": 0.3021, + "step": 4459 + }, + { + "epoch": 2.8520028783881024, + "grad_norm": 3.0627458095550537, + "learning_rate": 2.447104082068818e-06, + "loss": 0.3186, + "step": 4460 + }, + { + "epoch": 2.8526425201886942, + "grad_norm": 2.8380753993988037, + "learning_rate": 2.436418038042317e-06, + "loss": 0.2671, + "step": 4461 + }, + { + "epoch": 2.853282161989286, + "grad_norm": 3.080984354019165, + "learning_rate": 2.4257319940158155e-06, + "loss": 0.2833, + "step": 4462 + }, + { + "epoch": 2.853921803789878, + "grad_norm": 3.3176052570343018, + "learning_rate": 
2.415045949989314e-06, + "loss": 0.3048, + "step": 4463 + }, + { + "epoch": 2.8545614455904693, + "grad_norm": 2.612539052963257, + "learning_rate": 2.4043599059628125e-06, + "loss": 0.2625, + "step": 4464 + }, + { + "epoch": 2.855201087391061, + "grad_norm": 3.0982754230499268, + "learning_rate": 2.3936738619363115e-06, + "loss": 0.2867, + "step": 4465 + }, + { + "epoch": 2.8558407291916525, + "grad_norm": 3.2011759281158447, + "learning_rate": 2.3829878179098096e-06, + "loss": 0.3032, + "step": 4466 + }, + { + "epoch": 2.8564803709922444, + "grad_norm": 2.8386285305023193, + "learning_rate": 2.3723017738833085e-06, + "loss": 0.2833, + "step": 4467 + }, + { + "epoch": 2.8571200127928362, + "grad_norm": 3.224090814590454, + "learning_rate": 2.3616157298568075e-06, + "loss": 0.3061, + "step": 4468 + }, + { + "epoch": 2.8577596545934276, + "grad_norm": 2.700939893722534, + "learning_rate": 2.3509296858303055e-06, + "loss": 0.2576, + "step": 4469 + }, + { + "epoch": 2.8583992963940195, + "grad_norm": 3.0068764686584473, + "learning_rate": 2.3402436418038045e-06, + "loss": 0.3104, + "step": 4470 + }, + { + "epoch": 2.859038938194611, + "grad_norm": 3.267613410949707, + "learning_rate": 2.329557597777303e-06, + "loss": 0.2954, + "step": 4471 + }, + { + "epoch": 2.8596785799952027, + "grad_norm": 2.724755048751831, + "learning_rate": 2.3188715537508015e-06, + "loss": 0.2525, + "step": 4472 + }, + { + "epoch": 2.8603182217957945, + "grad_norm": 2.891340732574463, + "learning_rate": 2.3081855097243e-06, + "loss": 0.2697, + "step": 4473 + }, + { + "epoch": 2.860957863596386, + "grad_norm": 2.728368043899536, + "learning_rate": 2.297499465697799e-06, + "loss": 0.2832, + "step": 4474 + }, + { + "epoch": 2.8615975053969778, + "grad_norm": 2.3273205757141113, + "learning_rate": 2.2868134216712975e-06, + "loss": 0.2386, + "step": 4475 + }, + { + "epoch": 2.862237147197569, + "grad_norm": 2.6215415000915527, + "learning_rate": 2.276127377644796e-06, + "loss": 0.2768, + "step": 4476 + }, + { + "epoch": 2.862876788998161, + "grad_norm": 2.777740478515625, + "learning_rate": 2.2654413336182945e-06, + "loss": 0.2642, + "step": 4477 + }, + { + "epoch": 2.863516430798753, + "grad_norm": 3.0811619758605957, + "learning_rate": 2.2547552895917935e-06, + "loss": 0.2731, + "step": 4478 + }, + { + "epoch": 2.8641560725993442, + "grad_norm": 3.0682265758514404, + "learning_rate": 2.2440692455652915e-06, + "loss": 0.2958, + "step": 4479 + }, + { + "epoch": 2.864795714399936, + "grad_norm": 2.815882921218872, + "learning_rate": 2.2333832015387905e-06, + "loss": 0.2821, + "step": 4480 + }, + { + "epoch": 2.8654353562005275, + "grad_norm": 3.0600154399871826, + "learning_rate": 2.222697157512289e-06, + "loss": 0.3083, + "step": 4481 + }, + { + "epoch": 2.8660749980011193, + "grad_norm": 2.6533071994781494, + "learning_rate": 2.212011113485788e-06, + "loss": 0.2635, + "step": 4482 + }, + { + "epoch": 2.866714639801711, + "grad_norm": 2.6439638137817383, + "learning_rate": 2.201325069459286e-06, + "loss": 0.2573, + "step": 4483 + }, + { + "epoch": 2.8673542816023025, + "grad_norm": 3.31026291847229, + "learning_rate": 2.190639025432785e-06, + "loss": 0.3141, + "step": 4484 + }, + { + "epoch": 2.8679939234028944, + "grad_norm": 3.262610912322998, + "learning_rate": 2.1799529814062835e-06, + "loss": 0.3036, + "step": 4485 + }, + { + "epoch": 2.868633565203486, + "grad_norm": 2.77650785446167, + "learning_rate": 2.169266937379782e-06, + "loss": 0.273, + "step": 4486 + }, + { + "epoch": 2.8692732070040776, + "grad_norm": 
3.118419647216797, + "learning_rate": 2.1585808933532805e-06, + "loss": 0.279, + "step": 4487 + }, + { + "epoch": 2.8699128488046695, + "grad_norm": 3.176316022872925, + "learning_rate": 2.1478948493267794e-06, + "loss": 0.2782, + "step": 4488 + }, + { + "epoch": 2.8705524906052613, + "grad_norm": 3.211397886276245, + "learning_rate": 2.137208805300278e-06, + "loss": 0.2885, + "step": 4489 + }, + { + "epoch": 2.8711921324058527, + "grad_norm": 3.126208543777466, + "learning_rate": 2.1265227612737765e-06, + "loss": 0.2837, + "step": 4490 + }, + { + "epoch": 2.8718317742064445, + "grad_norm": 2.1356916427612305, + "learning_rate": 2.1158367172472754e-06, + "loss": 0.2516, + "step": 4491 + }, + { + "epoch": 2.872471416007036, + "grad_norm": 3.3706891536712646, + "learning_rate": 2.105150673220774e-06, + "loss": 0.3187, + "step": 4492 + }, + { + "epoch": 2.8731110578076278, + "grad_norm": 3.4606504440307617, + "learning_rate": 2.0944646291942724e-06, + "loss": 0.2996, + "step": 4493 + }, + { + "epoch": 2.8737506996082196, + "grad_norm": 3.250021457672119, + "learning_rate": 2.083778585167771e-06, + "loss": 0.3108, + "step": 4494 + }, + { + "epoch": 2.874390341408811, + "grad_norm": 2.7848429679870605, + "learning_rate": 2.07309254114127e-06, + "loss": 0.2632, + "step": 4495 + }, + { + "epoch": 2.875029983209403, + "grad_norm": 2.243865966796875, + "learning_rate": 2.062406497114768e-06, + "loss": 0.2432, + "step": 4496 + }, + { + "epoch": 2.8756696250099942, + "grad_norm": 2.5174126625061035, + "learning_rate": 2.051720453088267e-06, + "loss": 0.2667, + "step": 4497 + }, + { + "epoch": 2.876309266810586, + "grad_norm": 3.000579833984375, + "learning_rate": 2.0410344090617654e-06, + "loss": 0.2652, + "step": 4498 + }, + { + "epoch": 2.876948908611178, + "grad_norm": 3.139763116836548, + "learning_rate": 2.030348365035264e-06, + "loss": 0.2981, + "step": 4499 + }, + { + "epoch": 2.8775885504117693, + "grad_norm": 2.74404239654541, + "learning_rate": 2.0196623210087625e-06, + "loss": 0.2581, + "step": 4500 + }, + { + "epoch": 2.878228192212361, + "grad_norm": 2.860971212387085, + "learning_rate": 2.0089762769822614e-06, + "loss": 0.2817, + "step": 4501 + }, + { + "epoch": 2.8788678340129525, + "grad_norm": 3.165374994277954, + "learning_rate": 1.99829023295576e-06, + "loss": 0.2826, + "step": 4502 + }, + { + "epoch": 2.8795074758135444, + "grad_norm": 2.5561602115631104, + "learning_rate": 1.9876041889292584e-06, + "loss": 0.2511, + "step": 4503 + }, + { + "epoch": 2.880147117614136, + "grad_norm": 2.6605477333068848, + "learning_rate": 1.976918144902757e-06, + "loss": 0.2697, + "step": 4504 + }, + { + "epoch": 2.8807867594147276, + "grad_norm": 3.8273935317993164, + "learning_rate": 1.966232100876256e-06, + "loss": 0.3125, + "step": 4505 + }, + { + "epoch": 2.8814264012153195, + "grad_norm": 3.6742067337036133, + "learning_rate": 1.955546056849754e-06, + "loss": 0.3285, + "step": 4506 + }, + { + "epoch": 2.882066043015911, + "grad_norm": 3.0872433185577393, + "learning_rate": 1.944860012823253e-06, + "loss": 0.2994, + "step": 4507 + }, + { + "epoch": 2.8827056848165027, + "grad_norm": 2.846733808517456, + "learning_rate": 1.9341739687967514e-06, + "loss": 0.2831, + "step": 4508 + }, + { + "epoch": 2.8833453266170945, + "grad_norm": 2.8222081661224365, + "learning_rate": 1.9234879247702504e-06, + "loss": 0.2686, + "step": 4509 + }, + { + "epoch": 2.8839849684176864, + "grad_norm": 2.885746717453003, + "learning_rate": 1.9128018807437485e-06, + "loss": 0.2595, + "step": 4510 + }, + { + "epoch": 
2.8846246102182778, + "grad_norm": 2.5349819660186768, + "learning_rate": 1.9021158367172474e-06, + "loss": 0.2918, + "step": 4511 + }, + { + "epoch": 2.885264252018869, + "grad_norm": 3.315539598464966, + "learning_rate": 1.8914297926907461e-06, + "loss": 0.3017, + "step": 4512 + }, + { + "epoch": 2.885903893819461, + "grad_norm": 3.2616286277770996, + "learning_rate": 1.8807437486642446e-06, + "loss": 0.2977, + "step": 4513 + }, + { + "epoch": 2.886543535620053, + "grad_norm": 3.0158700942993164, + "learning_rate": 1.8700577046377434e-06, + "loss": 0.2667, + "step": 4514 + }, + { + "epoch": 2.8871831774206447, + "grad_norm": 2.6524617671966553, + "learning_rate": 1.8593716606112417e-06, + "loss": 0.2821, + "step": 4515 + }, + { + "epoch": 2.887822819221236, + "grad_norm": 2.8941731452941895, + "learning_rate": 1.8486856165847406e-06, + "loss": 0.2974, + "step": 4516 + }, + { + "epoch": 2.888462461021828, + "grad_norm": 2.765868902206421, + "learning_rate": 1.837999572558239e-06, + "loss": 0.2752, + "step": 4517 + }, + { + "epoch": 2.8891021028224193, + "grad_norm": 2.797605514526367, + "learning_rate": 1.8273135285317379e-06, + "loss": 0.3112, + "step": 4518 + }, + { + "epoch": 2.889741744623011, + "grad_norm": 2.6544525623321533, + "learning_rate": 1.8166274845052362e-06, + "loss": 0.2703, + "step": 4519 + }, + { + "epoch": 2.890381386423603, + "grad_norm": 3.714121103286743, + "learning_rate": 1.8059414404787349e-06, + "loss": 0.3115, + "step": 4520 + }, + { + "epoch": 2.8910210282241944, + "grad_norm": 2.5904996395111084, + "learning_rate": 1.7952553964522334e-06, + "loss": 0.2619, + "step": 4521 + }, + { + "epoch": 2.891660670024786, + "grad_norm": 3.224226713180542, + "learning_rate": 1.7845693524257321e-06, + "loss": 0.303, + "step": 4522 + }, + { + "epoch": 2.8923003118253776, + "grad_norm": 3.2155778408050537, + "learning_rate": 1.7738833083992306e-06, + "loss": 0.2972, + "step": 4523 + }, + { + "epoch": 2.8929399536259695, + "grad_norm": 3.257056474685669, + "learning_rate": 1.7631972643727294e-06, + "loss": 0.3011, + "step": 4524 + }, + { + "epoch": 2.8935795954265613, + "grad_norm": 3.115983486175537, + "learning_rate": 1.7525112203462279e-06, + "loss": 0.2917, + "step": 4525 + }, + { + "epoch": 2.8942192372271527, + "grad_norm": 3.4388442039489746, + "learning_rate": 1.7418251763197266e-06, + "loss": 0.3171, + "step": 4526 + }, + { + "epoch": 2.8948588790277445, + "grad_norm": 2.5056612491607666, + "learning_rate": 1.731139132293225e-06, + "loss": 0.2642, + "step": 4527 + }, + { + "epoch": 2.895498520828336, + "grad_norm": 3.1889901161193848, + "learning_rate": 1.7204530882667239e-06, + "loss": 0.3055, + "step": 4528 + }, + { + "epoch": 2.8961381626289278, + "grad_norm": 3.3815791606903076, + "learning_rate": 1.7097670442402222e-06, + "loss": 0.2895, + "step": 4529 + }, + { + "epoch": 2.8967778044295196, + "grad_norm": 3.011918783187866, + "learning_rate": 1.699081000213721e-06, + "loss": 0.3007, + "step": 4530 + }, + { + "epoch": 2.897417446230111, + "grad_norm": 2.953803777694702, + "learning_rate": 1.6883949561872194e-06, + "loss": 0.2794, + "step": 4531 + }, + { + "epoch": 2.898057088030703, + "grad_norm": 3.4943125247955322, + "learning_rate": 1.6777089121607181e-06, + "loss": 0.3091, + "step": 4532 + }, + { + "epoch": 2.8986967298312942, + "grad_norm": 2.9962806701660156, + "learning_rate": 1.6670228681342166e-06, + "loss": 0.2724, + "step": 4533 + }, + { + "epoch": 2.899336371631886, + "grad_norm": 2.714372158050537, + "learning_rate": 1.6563368241077154e-06, + "loss": 
0.2733, + "step": 4534 + }, + { + "epoch": 2.899976013432478, + "grad_norm": 2.6885035037994385, + "learning_rate": 1.645650780081214e-06, + "loss": 0.2838, + "step": 4535 + }, + { + "epoch": 2.9006156552330697, + "grad_norm": 2.593261957168579, + "learning_rate": 1.6349647360547126e-06, + "loss": 0.2649, + "step": 4536 + }, + { + "epoch": 2.901255297033661, + "grad_norm": 2.710954189300537, + "learning_rate": 1.6242786920282113e-06, + "loss": 0.2735, + "step": 4537 + }, + { + "epoch": 2.901894938834253, + "grad_norm": 2.6329758167266846, + "learning_rate": 1.6135926480017099e-06, + "loss": 0.2472, + "step": 4538 + }, + { + "epoch": 2.9025345806348444, + "grad_norm": 2.478379964828491, + "learning_rate": 1.6029066039752086e-06, + "loss": 0.244, + "step": 4539 + }, + { + "epoch": 2.903174222435436, + "grad_norm": 2.824335813522339, + "learning_rate": 1.592220559948707e-06, + "loss": 0.2779, + "step": 4540 + }, + { + "epoch": 2.903813864236028, + "grad_norm": 3.711716651916504, + "learning_rate": 1.5815345159222058e-06, + "loss": 0.305, + "step": 4541 + }, + { + "epoch": 2.9044535060366194, + "grad_norm": 3.162688970565796, + "learning_rate": 1.5708484718957041e-06, + "loss": 0.2961, + "step": 4542 + }, + { + "epoch": 2.9050931478372113, + "grad_norm": 2.8410279750823975, + "learning_rate": 1.5601624278692028e-06, + "loss": 0.282, + "step": 4543 + }, + { + "epoch": 2.9057327896378027, + "grad_norm": 2.512073278427124, + "learning_rate": 1.5494763838427014e-06, + "loss": 0.2541, + "step": 4544 + }, + { + "epoch": 2.9063724314383945, + "grad_norm": 3.05137038230896, + "learning_rate": 1.5387903398162003e-06, + "loss": 0.2748, + "step": 4545 + }, + { + "epoch": 2.9070120732389864, + "grad_norm": 3.158571481704712, + "learning_rate": 1.5281042957896988e-06, + "loss": 0.2934, + "step": 4546 + }, + { + "epoch": 2.9076517150395778, + "grad_norm": 3.0834968090057373, + "learning_rate": 1.5174182517631973e-06, + "loss": 0.2638, + "step": 4547 + }, + { + "epoch": 2.9082913568401696, + "grad_norm": 2.686398983001709, + "learning_rate": 1.506732207736696e-06, + "loss": 0.2628, + "step": 4548 + }, + { + "epoch": 2.908930998640761, + "grad_norm": 2.960505485534668, + "learning_rate": 1.4960461637101946e-06, + "loss": 0.2825, + "step": 4549 + }, + { + "epoch": 2.909570640441353, + "grad_norm": 2.898348569869995, + "learning_rate": 1.4853601196836933e-06, + "loss": 0.2795, + "step": 4550 + }, + { + "epoch": 2.9102102822419447, + "grad_norm": 3.2978429794311523, + "learning_rate": 1.4746740756571918e-06, + "loss": 0.3135, + "step": 4551 + }, + { + "epoch": 2.910849924042536, + "grad_norm": 2.648906946182251, + "learning_rate": 1.4639880316306903e-06, + "loss": 0.25, + "step": 4552 + }, + { + "epoch": 2.911489565843128, + "grad_norm": 2.886461019515991, + "learning_rate": 1.453301987604189e-06, + "loss": 0.2965, + "step": 4553 + }, + { + "epoch": 2.9121292076437193, + "grad_norm": 2.8545072078704834, + "learning_rate": 1.4426159435776876e-06, + "loss": 0.2652, + "step": 4554 + }, + { + "epoch": 2.912768849444311, + "grad_norm": 3.019754648208618, + "learning_rate": 1.4319298995511863e-06, + "loss": 0.295, + "step": 4555 + }, + { + "epoch": 2.913408491244903, + "grad_norm": 3.1926445960998535, + "learning_rate": 1.4212438555246848e-06, + "loss": 0.2914, + "step": 4556 + }, + { + "epoch": 2.9140481330454944, + "grad_norm": 3.1505846977233887, + "learning_rate": 1.4105578114981833e-06, + "loss": 0.2963, + "step": 4557 + }, + { + "epoch": 2.914687774846086, + "grad_norm": 3.0904791355133057, + "learning_rate": 
1.399871767471682e-06, + "loss": 0.2917, + "step": 4558 + }, + { + "epoch": 2.9153274166466776, + "grad_norm": 2.906182050704956, + "learning_rate": 1.3891857234451806e-06, + "loss": 0.2804, + "step": 4559 + }, + { + "epoch": 2.9159670584472694, + "grad_norm": 3.5451202392578125, + "learning_rate": 1.3784996794186793e-06, + "loss": 0.3088, + "step": 4560 + }, + { + "epoch": 2.9166067002478613, + "grad_norm": 3.023319959640503, + "learning_rate": 1.3678136353921778e-06, + "loss": 0.2729, + "step": 4561 + }, + { + "epoch": 2.917246342048453, + "grad_norm": 2.8218863010406494, + "learning_rate": 1.3571275913656765e-06, + "loss": 0.2796, + "step": 4562 + }, + { + "epoch": 2.9178859838490445, + "grad_norm": 2.7403781414031982, + "learning_rate": 1.346441547339175e-06, + "loss": 0.2801, + "step": 4563 + }, + { + "epoch": 2.9185256256496364, + "grad_norm": 2.7108497619628906, + "learning_rate": 1.3357555033126736e-06, + "loss": 0.2733, + "step": 4564 + }, + { + "epoch": 2.9191652674502278, + "grad_norm": 3.103050708770752, + "learning_rate": 1.3250694592861723e-06, + "loss": 0.284, + "step": 4565 + }, + { + "epoch": 2.9198049092508196, + "grad_norm": 2.8473711013793945, + "learning_rate": 1.3143834152596708e-06, + "loss": 0.2772, + "step": 4566 + }, + { + "epoch": 2.9204445510514114, + "grad_norm": 3.30690860748291, + "learning_rate": 1.3036973712331695e-06, + "loss": 0.2965, + "step": 4567 + }, + { + "epoch": 2.921084192852003, + "grad_norm": 2.8652989864349365, + "learning_rate": 1.2930113272066683e-06, + "loss": 0.2906, + "step": 4568 + }, + { + "epoch": 2.9217238346525947, + "grad_norm": 2.619109869003296, + "learning_rate": 1.2823252831801668e-06, + "loss": 0.2501, + "step": 4569 + }, + { + "epoch": 2.922363476453186, + "grad_norm": 2.9598729610443115, + "learning_rate": 1.2716392391536655e-06, + "loss": 0.3186, + "step": 4570 + }, + { + "epoch": 2.923003118253778, + "grad_norm": 2.9078493118286133, + "learning_rate": 1.260953195127164e-06, + "loss": 0.2738, + "step": 4571 + }, + { + "epoch": 2.9236427600543697, + "grad_norm": 2.7794830799102783, + "learning_rate": 1.2502671511006627e-06, + "loss": 0.2777, + "step": 4572 + }, + { + "epoch": 2.924282401854961, + "grad_norm": 2.8971762657165527, + "learning_rate": 1.2395811070741613e-06, + "loss": 0.2937, + "step": 4573 + }, + { + "epoch": 2.924922043655553, + "grad_norm": 2.7297163009643555, + "learning_rate": 1.2288950630476598e-06, + "loss": 0.2604, + "step": 4574 + }, + { + "epoch": 2.9255616854561444, + "grad_norm": 2.7503600120544434, + "learning_rate": 1.2182090190211585e-06, + "loss": 0.2768, + "step": 4575 + }, + { + "epoch": 2.926201327256736, + "grad_norm": 2.616666078567505, + "learning_rate": 1.207522974994657e-06, + "loss": 0.2734, + "step": 4576 + }, + { + "epoch": 2.926840969057328, + "grad_norm": 3.06693434715271, + "learning_rate": 1.1968369309681557e-06, + "loss": 0.2854, + "step": 4577 + }, + { + "epoch": 2.9274806108579194, + "grad_norm": 3.1562938690185547, + "learning_rate": 1.1861508869416543e-06, + "loss": 0.275, + "step": 4578 + }, + { + "epoch": 2.9281202526585113, + "grad_norm": 2.811310052871704, + "learning_rate": 1.1754648429151528e-06, + "loss": 0.2588, + "step": 4579 + }, + { + "epoch": 2.9287598944591027, + "grad_norm": 3.0411245822906494, + "learning_rate": 1.1647787988886515e-06, + "loss": 0.2912, + "step": 4580 + }, + { + "epoch": 2.9293995362596945, + "grad_norm": 2.5922415256500244, + "learning_rate": 1.15409275486215e-06, + "loss": 0.2643, + "step": 4581 + }, + { + "epoch": 2.9300391780602864, + 
"grad_norm": 2.923546075820923, + "learning_rate": 1.1434067108356487e-06, + "loss": 0.2655, + "step": 4582 + }, + { + "epoch": 2.9306788198608777, + "grad_norm": 3.06540584564209, + "learning_rate": 1.1327206668091473e-06, + "loss": 0.2842, + "step": 4583 + }, + { + "epoch": 2.9313184616614696, + "grad_norm": 2.955547571182251, + "learning_rate": 1.1220346227826458e-06, + "loss": 0.3139, + "step": 4584 + }, + { + "epoch": 2.931958103462061, + "grad_norm": 3.281717300415039, + "learning_rate": 1.1113485787561445e-06, + "loss": 0.316, + "step": 4585 + }, + { + "epoch": 2.932597745262653, + "grad_norm": 2.9767303466796875, + "learning_rate": 1.100662534729643e-06, + "loss": 0.2929, + "step": 4586 + }, + { + "epoch": 2.9332373870632447, + "grad_norm": 3.0934088230133057, + "learning_rate": 1.0899764907031417e-06, + "loss": 0.2836, + "step": 4587 + }, + { + "epoch": 2.9338770288638365, + "grad_norm": 2.8757448196411133, + "learning_rate": 1.0792904466766403e-06, + "loss": 0.2863, + "step": 4588 + }, + { + "epoch": 2.934516670664428, + "grad_norm": 2.8582520484924316, + "learning_rate": 1.068604402650139e-06, + "loss": 0.2793, + "step": 4589 + }, + { + "epoch": 2.9351563124650197, + "grad_norm": 2.8924968242645264, + "learning_rate": 1.0579183586236377e-06, + "loss": 0.2853, + "step": 4590 + }, + { + "epoch": 2.935795954265611, + "grad_norm": 2.6857917308807373, + "learning_rate": 1.0472323145971362e-06, + "loss": 0.2709, + "step": 4591 + }, + { + "epoch": 2.936435596066203, + "grad_norm": 2.851781129837036, + "learning_rate": 1.036546270570635e-06, + "loss": 0.2837, + "step": 4592 + }, + { + "epoch": 2.937075237866795, + "grad_norm": 2.95515775680542, + "learning_rate": 1.0258602265441335e-06, + "loss": 0.3108, + "step": 4593 + }, + { + "epoch": 2.937714879667386, + "grad_norm": 3.1146178245544434, + "learning_rate": 1.015174182517632e-06, + "loss": 0.2955, + "step": 4594 + }, + { + "epoch": 2.938354521467978, + "grad_norm": 2.9489970207214355, + "learning_rate": 1.0044881384911307e-06, + "loss": 0.3026, + "step": 4595 + }, + { + "epoch": 2.9389941632685694, + "grad_norm": 3.0633018016815186, + "learning_rate": 9.938020944646292e-07, + "loss": 0.2744, + "step": 4596 + }, + { + "epoch": 2.9396338050691613, + "grad_norm": 3.242702007293701, + "learning_rate": 9.83116050438128e-07, + "loss": 0.3041, + "step": 4597 + }, + { + "epoch": 2.940273446869753, + "grad_norm": 3.3694849014282227, + "learning_rate": 9.724300064116265e-07, + "loss": 0.3213, + "step": 4598 + }, + { + "epoch": 2.9409130886703445, + "grad_norm": 3.079346179962158, + "learning_rate": 9.617439623851252e-07, + "loss": 0.3116, + "step": 4599 + }, + { + "epoch": 2.9415527304709363, + "grad_norm": 2.477478504180908, + "learning_rate": 9.510579183586237e-07, + "loss": 0.2413, + "step": 4600 + }, + { + "epoch": 2.9421923722715277, + "grad_norm": 3.136911153793335, + "learning_rate": 9.403718743321223e-07, + "loss": 0.2701, + "step": 4601 + }, + { + "epoch": 2.9428320140721196, + "grad_norm": 3.002061605453491, + "learning_rate": 9.296858303056208e-07, + "loss": 0.2876, + "step": 4602 + }, + { + "epoch": 2.9434716558727114, + "grad_norm": 2.8170931339263916, + "learning_rate": 9.189997862791195e-07, + "loss": 0.2912, + "step": 4603 + }, + { + "epoch": 2.944111297673303, + "grad_norm": 2.8673248291015625, + "learning_rate": 9.083137422526181e-07, + "loss": 0.28, + "step": 4604 + }, + { + "epoch": 2.9447509394738947, + "grad_norm": 3.366997241973877, + "learning_rate": 8.976276982261167e-07, + "loss": 0.2881, + "step": 4605 + }, + { + 
"epoch": 2.945390581274486, + "grad_norm": 2.804033041000366, + "learning_rate": 8.869416541996153e-07, + "loss": 0.2745, + "step": 4606 + }, + { + "epoch": 2.946030223075078, + "grad_norm": 3.1108407974243164, + "learning_rate": 8.762556101731139e-07, + "loss": 0.286, + "step": 4607 + }, + { + "epoch": 2.9466698648756697, + "grad_norm": 3.3729159832000732, + "learning_rate": 8.655695661466125e-07, + "loss": 0.2911, + "step": 4608 + }, + { + "epoch": 2.9473095066762616, + "grad_norm": 3.1210105419158936, + "learning_rate": 8.548835221201111e-07, + "loss": 0.2777, + "step": 4609 + }, + { + "epoch": 2.947949148476853, + "grad_norm": 3.2672693729400635, + "learning_rate": 8.441974780936097e-07, + "loss": 0.2989, + "step": 4610 + }, + { + "epoch": 2.9485887902774444, + "grad_norm": 3.032050132751465, + "learning_rate": 8.335114340671083e-07, + "loss": 0.262, + "step": 4611 + }, + { + "epoch": 2.949228432078036, + "grad_norm": 3.284799814224243, + "learning_rate": 8.22825390040607e-07, + "loss": 0.2883, + "step": 4612 + }, + { + "epoch": 2.949868073878628, + "grad_norm": 2.4401063919067383, + "learning_rate": 8.121393460141057e-07, + "loss": 0.2458, + "step": 4613 + }, + { + "epoch": 2.95050771567922, + "grad_norm": 3.0984020233154297, + "learning_rate": 8.014533019876043e-07, + "loss": 0.274, + "step": 4614 + }, + { + "epoch": 2.9511473574798113, + "grad_norm": 2.945162534713745, + "learning_rate": 7.907672579611029e-07, + "loss": 0.2926, + "step": 4615 + }, + { + "epoch": 2.951786999280403, + "grad_norm": 2.8837413787841797, + "learning_rate": 7.800812139346014e-07, + "loss": 0.2818, + "step": 4616 + }, + { + "epoch": 2.9524266410809945, + "grad_norm": 2.7469866275787354, + "learning_rate": 7.693951699081002e-07, + "loss": 0.2657, + "step": 4617 + }, + { + "epoch": 2.9530662828815863, + "grad_norm": 2.9806694984436035, + "learning_rate": 7.587091258815987e-07, + "loss": 0.2623, + "step": 4618 + }, + { + "epoch": 2.953705924682178, + "grad_norm": 3.107755422592163, + "learning_rate": 7.480230818550973e-07, + "loss": 0.2943, + "step": 4619 + }, + { + "epoch": 2.9543455664827696, + "grad_norm": 3.3334574699401855, + "learning_rate": 7.373370378285959e-07, + "loss": 0.3062, + "step": 4620 + }, + { + "epoch": 2.9549852082833614, + "grad_norm": 2.7644970417022705, + "learning_rate": 7.266509938020945e-07, + "loss": 0.2529, + "step": 4621 + }, + { + "epoch": 2.955624850083953, + "grad_norm": 3.0519533157348633, + "learning_rate": 7.159649497755931e-07, + "loss": 0.2793, + "step": 4622 + }, + { + "epoch": 2.9562644918845447, + "grad_norm": 2.708420991897583, + "learning_rate": 7.052789057490917e-07, + "loss": 0.2743, + "step": 4623 + }, + { + "epoch": 2.9569041336851365, + "grad_norm": 2.8920557498931885, + "learning_rate": 6.945928617225903e-07, + "loss": 0.2816, + "step": 4624 + }, + { + "epoch": 2.957543775485728, + "grad_norm": 3.275573253631592, + "learning_rate": 6.839068176960889e-07, + "loss": 0.3154, + "step": 4625 + }, + { + "epoch": 2.9581834172863197, + "grad_norm": 2.6203067302703857, + "learning_rate": 6.732207736695875e-07, + "loss": 0.2602, + "step": 4626 + }, + { + "epoch": 2.958823059086911, + "grad_norm": 2.8467485904693604, + "learning_rate": 6.625347296430861e-07, + "loss": 0.2629, + "step": 4627 + }, + { + "epoch": 2.959462700887503, + "grad_norm": 2.786341905593872, + "learning_rate": 6.518486856165848e-07, + "loss": 0.2406, + "step": 4628 + }, + { + "epoch": 2.960102342688095, + "grad_norm": 3.226491928100586, + "learning_rate": 6.411626415900834e-07, + "loss": 0.2766, + 
"step": 4629 + }, + { + "epoch": 2.960741984488686, + "grad_norm": 3.217517137527466, + "learning_rate": 6.30476597563582e-07, + "loss": 0.3048, + "step": 4630 + }, + { + "epoch": 2.961381626289278, + "grad_norm": 2.7901575565338135, + "learning_rate": 6.197905535370806e-07, + "loss": 0.2738, + "step": 4631 + }, + { + "epoch": 2.9620212680898694, + "grad_norm": 2.7078211307525635, + "learning_rate": 6.091045095105793e-07, + "loss": 0.2655, + "step": 4632 + }, + { + "epoch": 2.9626609098904613, + "grad_norm": 3.1929845809936523, + "learning_rate": 5.984184654840779e-07, + "loss": 0.2868, + "step": 4633 + }, + { + "epoch": 2.963300551691053, + "grad_norm": 2.801015615463257, + "learning_rate": 5.877324214575764e-07, + "loss": 0.2869, + "step": 4634 + }, + { + "epoch": 2.963940193491645, + "grad_norm": 3.106300115585327, + "learning_rate": 5.77046377431075e-07, + "loss": 0.2797, + "step": 4635 + }, + { + "epoch": 2.9645798352922363, + "grad_norm": 2.3474364280700684, + "learning_rate": 5.663603334045736e-07, + "loss": 0.2464, + "step": 4636 + }, + { + "epoch": 2.965219477092828, + "grad_norm": 3.233705759048462, + "learning_rate": 5.556742893780722e-07, + "loss": 0.2834, + "step": 4637 + }, + { + "epoch": 2.9658591188934196, + "grad_norm": 2.533975601196289, + "learning_rate": 5.449882453515709e-07, + "loss": 0.2609, + "step": 4638 + }, + { + "epoch": 2.9664987606940114, + "grad_norm": 3.2399234771728516, + "learning_rate": 5.343022013250695e-07, + "loss": 0.311, + "step": 4639 + }, + { + "epoch": 2.9671384024946033, + "grad_norm": 3.1690189838409424, + "learning_rate": 5.236161572985681e-07, + "loss": 0.2663, + "step": 4640 + }, + { + "epoch": 2.9677780442951947, + "grad_norm": 2.751340389251709, + "learning_rate": 5.129301132720667e-07, + "loss": 0.2567, + "step": 4641 + }, + { + "epoch": 2.9684176860957865, + "grad_norm": 3.2145743370056152, + "learning_rate": 5.022440692455654e-07, + "loss": 0.2953, + "step": 4642 + }, + { + "epoch": 2.969057327896378, + "grad_norm": 3.530580759048462, + "learning_rate": 4.91558025219064e-07, + "loss": 0.3116, + "step": 4643 + }, + { + "epoch": 2.9696969696969697, + "grad_norm": 2.7425405979156494, + "learning_rate": 4.808719811925626e-07, + "loss": 0.2679, + "step": 4644 + }, + { + "epoch": 2.9703366114975616, + "grad_norm": 2.740150213241577, + "learning_rate": 4.7018593716606116e-07, + "loss": 0.2739, + "step": 4645 + }, + { + "epoch": 2.970976253298153, + "grad_norm": 3.510528087615967, + "learning_rate": 4.5949989313955973e-07, + "loss": 0.3106, + "step": 4646 + }, + { + "epoch": 2.971615895098745, + "grad_norm": 3.5076143741607666, + "learning_rate": 4.4881384911305835e-07, + "loss": 0.2982, + "step": 4647 + }, + { + "epoch": 2.972255536899336, + "grad_norm": 2.959351062774658, + "learning_rate": 4.3812780508655697e-07, + "loss": 0.2874, + "step": 4648 + }, + { + "epoch": 2.972895178699928, + "grad_norm": 2.9966440200805664, + "learning_rate": 4.2744176106005554e-07, + "loss": 0.2705, + "step": 4649 + }, + { + "epoch": 2.97353482050052, + "grad_norm": 2.4923782348632812, + "learning_rate": 4.1675571703355416e-07, + "loss": 0.2624, + "step": 4650 + }, + { + "epoch": 2.9741744623011113, + "grad_norm": 3.163743019104004, + "learning_rate": 4.0606967300705283e-07, + "loss": 0.2943, + "step": 4651 + }, + { + "epoch": 2.974814104101703, + "grad_norm": 3.299694061279297, + "learning_rate": 3.9538362898055145e-07, + "loss": 0.3077, + "step": 4652 + }, + { + "epoch": 2.9754537459022945, + "grad_norm": 3.307133674621582, + "learning_rate": 
3.846975849540501e-07, + "loss": 0.307, + "step": 4653 + }, + { + "epoch": 2.9760933877028863, + "grad_norm": 3.399186611175537, + "learning_rate": 3.7401154092754864e-07, + "loss": 0.3026, + "step": 4654 + }, + { + "epoch": 2.976733029503478, + "grad_norm": 3.3977606296539307, + "learning_rate": 3.6332549690104726e-07, + "loss": 0.3022, + "step": 4655 + }, + { + "epoch": 2.9773726713040696, + "grad_norm": 3.213376998901367, + "learning_rate": 3.5263945287454583e-07, + "loss": 0.2829, + "step": 4656 + }, + { + "epoch": 2.9780123131046614, + "grad_norm": 3.924910545349121, + "learning_rate": 3.4195340884804445e-07, + "loss": 0.2802, + "step": 4657 + }, + { + "epoch": 2.978651954905253, + "grad_norm": 2.69236421585083, + "learning_rate": 3.3126736482154307e-07, + "loss": 0.245, + "step": 4658 + }, + { + "epoch": 2.9792915967058446, + "grad_norm": 2.9249889850616455, + "learning_rate": 3.205813207950417e-07, + "loss": 0.2761, + "step": 4659 + }, + { + "epoch": 2.9799312385064365, + "grad_norm": 3.207481861114502, + "learning_rate": 3.098952767685403e-07, + "loss": 0.292, + "step": 4660 + }, + { + "epoch": 2.9805708803070283, + "grad_norm": 3.408125400543213, + "learning_rate": 2.9920923274203894e-07, + "loss": 0.3128, + "step": 4661 + }, + { + "epoch": 2.9812105221076197, + "grad_norm": 3.132514238357544, + "learning_rate": 2.885231887155375e-07, + "loss": 0.2999, + "step": 4662 + }, + { + "epoch": 2.9818501639082116, + "grad_norm": 3.2416889667510986, + "learning_rate": 2.778371446890361e-07, + "loss": 0.2716, + "step": 4663 + }, + { + "epoch": 2.982489805708803, + "grad_norm": 3.1789774894714355, + "learning_rate": 2.6715110066253475e-07, + "loss": 0.2896, + "step": 4664 + }, + { + "epoch": 2.983129447509395, + "grad_norm": 3.2657458782196045, + "learning_rate": 2.5646505663603337e-07, + "loss": 0.3093, + "step": 4665 + }, + { + "epoch": 2.9837690893099866, + "grad_norm": 2.9810163974761963, + "learning_rate": 2.45779012609532e-07, + "loss": 0.2547, + "step": 4666 + }, + { + "epoch": 2.984408731110578, + "grad_norm": 2.948413133621216, + "learning_rate": 2.3509296858303058e-07, + "loss": 0.2837, + "step": 4667 + }, + { + "epoch": 2.98504837291117, + "grad_norm": 2.8497731685638428, + "learning_rate": 2.2440692455652918e-07, + "loss": 0.2936, + "step": 4668 + }, + { + "epoch": 2.9856880147117613, + "grad_norm": 2.8002233505249023, + "learning_rate": 2.1372088053002777e-07, + "loss": 0.278, + "step": 4669 + }, + { + "epoch": 2.986327656512353, + "grad_norm": 3.0508875846862793, + "learning_rate": 2.0303483650352642e-07, + "loss": 0.2853, + "step": 4670 + }, + { + "epoch": 2.986967298312945, + "grad_norm": 2.9186458587646484, + "learning_rate": 1.9234879247702504e-07, + "loss": 0.2628, + "step": 4671 + }, + { + "epoch": 2.9876069401135363, + "grad_norm": 3.23622989654541, + "learning_rate": 1.8166274845052363e-07, + "loss": 0.2986, + "step": 4672 + }, + { + "epoch": 2.988246581914128, + "grad_norm": 2.586787462234497, + "learning_rate": 1.7097670442402223e-07, + "loss": 0.2546, + "step": 4673 + }, + { + "epoch": 2.9888862237147196, + "grad_norm": 2.9180619716644287, + "learning_rate": 1.6029066039752085e-07, + "loss": 0.2896, + "step": 4674 + }, + { + "epoch": 2.9895258655153114, + "grad_norm": 3.361057758331299, + "learning_rate": 1.4960461637101947e-07, + "loss": 0.2963, + "step": 4675 + }, + { + "epoch": 2.9901655073159032, + "grad_norm": 2.8037657737731934, + "learning_rate": 1.3891857234451806e-07, + "loss": 0.2759, + "step": 4676 + }, + { + "epoch": 2.9908051491164946, + "grad_norm": 
3.336202383041382, + "learning_rate": 1.2823252831801668e-07, + "loss": 0.3032, + "step": 4677 + }, + { + "epoch": 2.9914447909170865, + "grad_norm": 3.1167173385620117, + "learning_rate": 1.1754648429151529e-07, + "loss": 0.2851, + "step": 4678 + }, + { + "epoch": 2.992084432717678, + "grad_norm": 3.106213331222534, + "learning_rate": 1.0686044026501388e-07, + "loss": 0.281, + "step": 4679 + }, + { + "epoch": 2.9927240745182697, + "grad_norm": 3.0309648513793945, + "learning_rate": 9.617439623851252e-08, + "loss": 0.2963, + "step": 4680 + }, + { + "epoch": 2.9933637163188616, + "grad_norm": 3.160137414932251, + "learning_rate": 8.548835221201111e-08, + "loss": 0.3064, + "step": 4681 + }, + { + "epoch": 2.994003358119453, + "grad_norm": 2.576639175415039, + "learning_rate": 7.480230818550973e-08, + "loss": 0.2664, + "step": 4682 + }, + { + "epoch": 2.994642999920045, + "grad_norm": 2.7932701110839844, + "learning_rate": 6.411626415900834e-08, + "loss": 0.2658, + "step": 4683 + }, + { + "epoch": 2.995282641720636, + "grad_norm": 3.1135575771331787, + "learning_rate": 5.343022013250694e-08, + "loss": 0.2885, + "step": 4684 + }, + { + "epoch": 2.995922283521228, + "grad_norm": 2.361483097076416, + "learning_rate": 4.2744176106005557e-08, + "loss": 0.2544, + "step": 4685 + }, + { + "epoch": 2.99656192532182, + "grad_norm": 3.1596367359161377, + "learning_rate": 3.205813207950417e-08, + "loss": 0.2872, + "step": 4686 + }, + { + "epoch": 2.9972015671224117, + "grad_norm": 3.0970194339752197, + "learning_rate": 2.1372088053002778e-08, + "loss": 0.2975, + "step": 4687 + }, + { + "epoch": 2.997841208923003, + "grad_norm": 2.7904469966888428, + "learning_rate": 1.0686044026501389e-08, + "loss": 0.279, + "step": 4688 + }, + { + "epoch": 2.998480850723595, + "grad_norm": 3.129556655883789, + "learning_rate": 0.0, + "loss": 0.3075, + "step": 4689 + } + ], + "logging_steps": 1, + "max_steps": 4689, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0989668175037286e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}