{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9991116375481197, "eval_steps": 500, "global_step": 5064, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005922416345869114, "grad_norm": 18.717661840154292, "learning_rate": 1.3157894736842107e-07, "loss": 1.1434, "step": 1 }, { "epoch": 0.0011844832691738228, "grad_norm": 15.975211698366682, "learning_rate": 2.6315789473684213e-07, "loss": 1.1722, "step": 2 }, { "epoch": 0.0017767249037607344, "grad_norm": 13.436630281888167, "learning_rate": 3.9473684210526315e-07, "loss": 1.2216, "step": 3 }, { "epoch": 0.0023689665383476457, "grad_norm": 14.141974273595771, "learning_rate": 5.263157894736843e-07, "loss": 1.2084, "step": 4 }, { "epoch": 0.002961208172934557, "grad_norm": 12.817546549612011, "learning_rate": 6.578947368421053e-07, "loss": 1.1613, "step": 5 }, { "epoch": 0.0035534498075214687, "grad_norm": 15.101738254046671, "learning_rate": 7.894736842105263e-07, "loss": 1.2291, "step": 6 }, { "epoch": 0.00414569144210838, "grad_norm": 21.872300636479174, "learning_rate": 9.210526315789474e-07, "loss": 1.2358, "step": 7 }, { "epoch": 0.004737933076695291, "grad_norm": 18.219829859695743, "learning_rate": 1.0526315789473685e-06, "loss": 1.2111, "step": 8 }, { "epoch": 0.005330174711282203, "grad_norm": 11.651800626605043, "learning_rate": 1.1842105263157894e-06, "loss": 1.1699, "step": 9 }, { "epoch": 0.005922416345869114, "grad_norm": 14.428397731094817, "learning_rate": 1.3157894736842106e-06, "loss": 1.2005, "step": 10 }, { "epoch": 0.006514657980456026, "grad_norm": 11.505996928681398, "learning_rate": 1.4473684210526317e-06, "loss": 1.1428, "step": 11 }, { "epoch": 0.0071068996150429374, "grad_norm": 9.258825562856831, "learning_rate": 1.5789473684210526e-06, "loss": 1.1801, "step": 12 }, { "epoch": 0.007699141249629849, "grad_norm": 11.003890134146193, "learning_rate": 1.710526315789474e-06, "loss": 1.1603, "step": 13 }, { "epoch": 0.00829138288421676, "grad_norm": 8.53365801142655, "learning_rate": 1.8421052631578948e-06, "loss": 1.1299, "step": 14 }, { "epoch": 0.008883624518803672, "grad_norm": 9.898879357555042, "learning_rate": 1.973684210526316e-06, "loss": 1.1691, "step": 15 }, { "epoch": 0.009475866153390583, "grad_norm": 10.549302142463828, "learning_rate": 2.105263157894737e-06, "loss": 1.1276, "step": 16 }, { "epoch": 0.010068107787977496, "grad_norm": 13.735637524962215, "learning_rate": 2.236842105263158e-06, "loss": 1.0958, "step": 17 }, { "epoch": 0.010660349422564407, "grad_norm": 9.075762665514619, "learning_rate": 2.368421052631579e-06, "loss": 1.1352, "step": 18 }, { "epoch": 0.011252591057151318, "grad_norm": 8.125019295709604, "learning_rate": 2.5e-06, "loss": 1.0965, "step": 19 }, { "epoch": 0.011844832691738229, "grad_norm": 6.28753990985354, "learning_rate": 2.631578947368421e-06, "loss": 1.0706, "step": 20 }, { "epoch": 0.01243707432632514, "grad_norm": 9.479465602728467, "learning_rate": 2.7631578947368424e-06, "loss": 1.1351, "step": 21 }, { "epoch": 0.013029315960912053, "grad_norm": 10.298865353341801, "learning_rate": 2.8947368421052634e-06, "loss": 1.1403, "step": 22 }, { "epoch": 0.013621557595498964, "grad_norm": 37.987752673055155, "learning_rate": 3.0263157894736843e-06, "loss": 1.0901, "step": 23 }, { "epoch": 0.014213799230085875, "grad_norm": 9.457834823064632, "learning_rate": 3.157894736842105e-06, "loss": 1.1241, "step": 24 }, { "epoch": 0.014806040864672786, "grad_norm": 6.66281436022819, "learning_rate": 3.289473684210527e-06, "loss": 1.0351, "step": 25 }, { "epoch": 0.015398282499259699, "grad_norm": 6.610062092413164, "learning_rate": 3.421052631578948e-06, "loss": 1.0715, "step": 26 }, { "epoch": 0.01599052413384661, "grad_norm": 6.998988378675071, "learning_rate": 3.5526315789473687e-06, "loss": 1.0664, "step": 27 }, { "epoch": 0.01658276576843352, "grad_norm": 6.559109520573562, "learning_rate": 3.6842105263157896e-06, "loss": 1.0371, "step": 28 }, { "epoch": 0.017175007403020432, "grad_norm": 8.1722379764705, "learning_rate": 3.815789473684211e-06, "loss": 1.0594, "step": 29 }, { "epoch": 0.017767249037607343, "grad_norm": 9.992648704415512, "learning_rate": 3.947368421052632e-06, "loss": 1.0802, "step": 30 }, { "epoch": 0.018359490672194254, "grad_norm": 6.773662148667347, "learning_rate": 4.078947368421053e-06, "loss": 0.9767, "step": 31 }, { "epoch": 0.018951732306781165, "grad_norm": 5.160580354167821, "learning_rate": 4.210526315789474e-06, "loss": 1.0125, "step": 32 }, { "epoch": 0.019543973941368076, "grad_norm": 7.601356319162516, "learning_rate": 4.342105263157895e-06, "loss": 1.0109, "step": 33 }, { "epoch": 0.02013621557595499, "grad_norm": 6.835933036484314, "learning_rate": 4.473684210526316e-06, "loss": 1.0355, "step": 34 }, { "epoch": 0.020728457210541902, "grad_norm": 14.435112479372032, "learning_rate": 4.605263157894737e-06, "loss": 1.0154, "step": 35 }, { "epoch": 0.021320698845128813, "grad_norm": 7.829860179549529, "learning_rate": 4.736842105263158e-06, "loss": 1.04, "step": 36 }, { "epoch": 0.021912940479715724, "grad_norm": 7.169740079807402, "learning_rate": 4.8684210526315795e-06, "loss": 0.9694, "step": 37 }, { "epoch": 0.022505182114302635, "grad_norm": 8.83069550799535, "learning_rate": 5e-06, "loss": 0.9532, "step": 38 }, { "epoch": 0.023097423748889546, "grad_norm": 9.131263259747842, "learning_rate": 5.131578947368422e-06, "loss": 0.9806, "step": 39 }, { "epoch": 0.023689665383476458, "grad_norm": 10.809549787669706, "learning_rate": 5.263157894736842e-06, "loss": 1.0129, "step": 40 }, { "epoch": 0.02428190701806337, "grad_norm": 8.804492257877177, "learning_rate": 5.394736842105264e-06, "loss": 1.0173, "step": 41 }, { "epoch": 0.02487414865265028, "grad_norm": 20.71245515215828, "learning_rate": 5.526315789473685e-06, "loss": 0.9932, "step": 42 }, { "epoch": 0.025466390287237194, "grad_norm": 9.655846535170545, "learning_rate": 5.657894736842106e-06, "loss": 0.9879, "step": 43 }, { "epoch": 0.026058631921824105, "grad_norm": 6.537124531460736, "learning_rate": 5.789473684210527e-06, "loss": 1.0188, "step": 44 }, { "epoch": 0.026650873556411016, "grad_norm": 13.091149382863613, "learning_rate": 5.921052631578948e-06, "loss": 0.9964, "step": 45 }, { "epoch": 0.027243115190997928, "grad_norm": 8.006244083133073, "learning_rate": 6.0526315789473685e-06, "loss": 0.9726, "step": 46 }, { "epoch": 0.02783535682558484, "grad_norm": 7.668734764475963, "learning_rate": 6.18421052631579e-06, "loss": 1.0153, "step": 47 }, { "epoch": 0.02842759846017175, "grad_norm": 8.137447421123358, "learning_rate": 6.31578947368421e-06, "loss": 1.0091, "step": 48 }, { "epoch": 0.02901984009475866, "grad_norm": 7.606771295715497, "learning_rate": 6.447368421052632e-06, "loss": 0.9991, "step": 49 }, { "epoch": 0.029612081729345572, "grad_norm": 7.345858891479215, "learning_rate": 6.578947368421054e-06, "loss": 0.9509, "step": 50 }, { "epoch": 0.030204323363932483, "grad_norm": 7.643466506554061, "learning_rate": 6.710526315789474e-06, "loss": 0.9572, "step": 51 }, { "epoch": 0.030796564998519398, "grad_norm": 13.10135554836337, "learning_rate": 6.842105263157896e-06, "loss": 0.9797, "step": 52 }, { "epoch": 0.03138880663310631, "grad_norm": 63.002320155093024, "learning_rate": 6.973684210526316e-06, "loss": 0.9633, "step": 53 }, { "epoch": 0.03198104826769322, "grad_norm": 6.071766418833052, "learning_rate": 7.1052631578947375e-06, "loss": 0.9146, "step": 54 }, { "epoch": 0.03257328990228013, "grad_norm": 15.636734262160939, "learning_rate": 7.236842105263158e-06, "loss": 1.0414, "step": 55 }, { "epoch": 0.03316553153686704, "grad_norm": 22.37892911472697, "learning_rate": 7.368421052631579e-06, "loss": 0.9515, "step": 56 }, { "epoch": 0.03375777317145395, "grad_norm": 11.914023755244859, "learning_rate": 7.500000000000001e-06, "loss": 0.9538, "step": 57 }, { "epoch": 0.034350014806040864, "grad_norm": 8.586686698837346, "learning_rate": 7.631578947368423e-06, "loss": 0.9835, "step": 58 }, { "epoch": 0.034942256440627775, "grad_norm": 27.707875718601368, "learning_rate": 7.763157894736843e-06, "loss": 1.0073, "step": 59 }, { "epoch": 0.035534498075214686, "grad_norm": 24.521633695176586, "learning_rate": 7.894736842105265e-06, "loss": 0.961, "step": 60 }, { "epoch": 0.0361267397098016, "grad_norm": 100.54943552048488, "learning_rate": 8.026315789473685e-06, "loss": 1.0181, "step": 61 }, { "epoch": 0.03671898134438851, "grad_norm": 9.987380890435652, "learning_rate": 8.157894736842106e-06, "loss": 0.9711, "step": 62 }, { "epoch": 0.03731122297897542, "grad_norm": 9.222884295954952, "learning_rate": 8.289473684210526e-06, "loss": 0.9489, "step": 63 }, { "epoch": 0.03790346461356233, "grad_norm": 19.82352747253693, "learning_rate": 8.421052631578948e-06, "loss": 0.9661, "step": 64 }, { "epoch": 0.03849570624814924, "grad_norm": 8.36286785237061, "learning_rate": 8.552631578947368e-06, "loss": 0.9554, "step": 65 }, { "epoch": 0.03908794788273615, "grad_norm": 14.58438374105295, "learning_rate": 8.68421052631579e-06, "loss": 0.9321, "step": 66 }, { "epoch": 0.03968018951732307, "grad_norm": 12.929431130450986, "learning_rate": 8.81578947368421e-06, "loss": 0.9036, "step": 67 }, { "epoch": 0.04027243115190998, "grad_norm": 16.089145436097088, "learning_rate": 8.947368421052632e-06, "loss": 0.9221, "step": 68 }, { "epoch": 0.04086467278649689, "grad_norm": 9.617533163896221, "learning_rate": 9.078947368421054e-06, "loss": 0.9167, "step": 69 }, { "epoch": 0.041456914421083804, "grad_norm": 7.694991032785801, "learning_rate": 9.210526315789474e-06, "loss": 0.9336, "step": 70 }, { "epoch": 0.042049156055670715, "grad_norm": 14.08369539185404, "learning_rate": 9.342105263157895e-06, "loss": 0.9538, "step": 71 }, { "epoch": 0.042641397690257626, "grad_norm": 7.803725255120761, "learning_rate": 9.473684210526315e-06, "loss": 0.9534, "step": 72 }, { "epoch": 0.04323363932484454, "grad_norm": 10.048093652946386, "learning_rate": 9.605263157894737e-06, "loss": 1.0111, "step": 73 }, { "epoch": 0.04382588095943145, "grad_norm": 8.06770139647209, "learning_rate": 9.736842105263159e-06, "loss": 1.0268, "step": 74 }, { "epoch": 0.04441812259401836, "grad_norm": 26.383451642079425, "learning_rate": 9.868421052631579e-06, "loss": 0.9725, "step": 75 }, { "epoch": 0.04501036422860527, "grad_norm": 10.64378030761482, "learning_rate": 1e-05, "loss": 0.9351, "step": 76 }, { "epoch": 0.04560260586319218, "grad_norm": 17.542208054609976, "learning_rate": 1.0131578947368421e-05, "loss": 0.9737, "step": 77 }, { "epoch": 0.04619484749777909, "grad_norm": 4.904257105145783, "learning_rate": 1.0263157894736844e-05, "loss": 0.9626, "step": 78 }, { "epoch": 0.046787089132366004, "grad_norm": 11.772948651444537, "learning_rate": 1.0394736842105264e-05, "loss": 0.9473, "step": 79 }, { "epoch": 0.047379330766952915, "grad_norm": 5.711100010911588, "learning_rate": 1.0526315789473684e-05, "loss": 0.9313, "step": 80 }, { "epoch": 0.047971572401539826, "grad_norm": 17.763080812645768, "learning_rate": 1.0657894736842108e-05, "loss": 0.9619, "step": 81 }, { "epoch": 0.04856381403612674, "grad_norm": 48.7479319965714, "learning_rate": 1.0789473684210528e-05, "loss": 0.9389, "step": 82 }, { "epoch": 0.04915605567071365, "grad_norm": 12.652307160279797, "learning_rate": 1.0921052631578948e-05, "loss": 0.9623, "step": 83 }, { "epoch": 0.04974829730530056, "grad_norm": 5.463588308594221, "learning_rate": 1.105263157894737e-05, "loss": 0.9464, "step": 84 }, { "epoch": 0.05034053893988748, "grad_norm": 24.19528693762655, "learning_rate": 1.1184210526315792e-05, "loss": 0.8953, "step": 85 }, { "epoch": 0.05093278057447439, "grad_norm": 6.29193429817291, "learning_rate": 1.1315789473684212e-05, "loss": 0.9005, "step": 86 }, { "epoch": 0.0515250222090613, "grad_norm": 13.36573014121912, "learning_rate": 1.1447368421052632e-05, "loss": 0.9075, "step": 87 }, { "epoch": 0.05211726384364821, "grad_norm": 10.377086501174952, "learning_rate": 1.1578947368421053e-05, "loss": 0.9572, "step": 88 }, { "epoch": 0.05270950547823512, "grad_norm": 8.69662787671184, "learning_rate": 1.1710526315789475e-05, "loss": 0.934, "step": 89 }, { "epoch": 0.05330174711282203, "grad_norm": 12.914962830795819, "learning_rate": 1.1842105263157895e-05, "loss": 0.9381, "step": 90 }, { "epoch": 0.053893988747408944, "grad_norm": 5.960162225544222, "learning_rate": 1.1973684210526317e-05, "loss": 0.8615, "step": 91 }, { "epoch": 0.054486230381995855, "grad_norm": 14.528434970541078, "learning_rate": 1.2105263157894737e-05, "loss": 0.9648, "step": 92 }, { "epoch": 0.055078472016582766, "grad_norm": 10.384675365673601, "learning_rate": 1.2236842105263159e-05, "loss": 0.912, "step": 93 }, { "epoch": 0.05567071365116968, "grad_norm": 7.33465251957887, "learning_rate": 1.236842105263158e-05, "loss": 0.9293, "step": 94 }, { "epoch": 0.05626295528575659, "grad_norm": 4.727007339545679, "learning_rate": 1.25e-05, "loss": 0.9342, "step": 95 }, { "epoch": 0.0568551969203435, "grad_norm": 5.324071485836242, "learning_rate": 1.263157894736842e-05, "loss": 0.9379, "step": 96 }, { "epoch": 0.05744743855493041, "grad_norm": 7.490618326236765, "learning_rate": 1.2763157894736844e-05, "loss": 0.8976, "step": 97 }, { "epoch": 0.05803968018951732, "grad_norm": 10.570705779798235, "learning_rate": 1.2894736842105264e-05, "loss": 0.9339, "step": 98 }, { "epoch": 0.05863192182410423, "grad_norm": 6.638002466312831, "learning_rate": 1.3026315789473684e-05, "loss": 0.9004, "step": 99 }, { "epoch": 0.059224163458691144, "grad_norm": 6.082259049790152, "learning_rate": 1.3157894736842108e-05, "loss": 0.9741, "step": 100 }, { "epoch": 0.059816405093278055, "grad_norm": 27.384068364009153, "learning_rate": 1.3289473684210528e-05, "loss": 0.9887, "step": 101 }, { "epoch": 0.060408646727864966, "grad_norm": 7.414470248625567, "learning_rate": 1.3421052631578948e-05, "loss": 0.9548, "step": 102 }, { "epoch": 0.06100088836245188, "grad_norm": 12.181747128869292, "learning_rate": 1.3552631578947371e-05, "loss": 0.9342, "step": 103 }, { "epoch": 0.061593129997038795, "grad_norm": 18.814533980462837, "learning_rate": 1.3684210526315791e-05, "loss": 0.8821, "step": 104 }, { "epoch": 0.062185371631625706, "grad_norm": 11.37830438605211, "learning_rate": 1.3815789473684211e-05, "loss": 0.9453, "step": 105 }, { "epoch": 0.06277761326621262, "grad_norm": 7.700584289573716, "learning_rate": 1.3947368421052631e-05, "loss": 0.9137, "step": 106 }, { "epoch": 0.06336985490079952, "grad_norm": 24.239676785791914, "learning_rate": 1.4078947368421055e-05, "loss": 0.9604, "step": 107 }, { "epoch": 0.06396209653538644, "grad_norm": 15.296887074877906, "learning_rate": 1.4210526315789475e-05, "loss": 0.9241, "step": 108 }, { "epoch": 0.06455433816997334, "grad_norm": 5.911563958223732, "learning_rate": 1.4342105263157895e-05, "loss": 0.8995, "step": 109 }, { "epoch": 0.06514657980456026, "grad_norm": 6.472036308433493, "learning_rate": 1.4473684210526317e-05, "loss": 0.9123, "step": 110 }, { "epoch": 0.06573882143914717, "grad_norm": 8.838225442941134, "learning_rate": 1.4605263157894739e-05, "loss": 0.866, "step": 111 }, { "epoch": 0.06633106307373408, "grad_norm": 6.179553262811797, "learning_rate": 1.4736842105263159e-05, "loss": 0.9363, "step": 112 }, { "epoch": 0.06692330470832099, "grad_norm": 8.05988074037456, "learning_rate": 1.486842105263158e-05, "loss": 0.8737, "step": 113 }, { "epoch": 0.0675155463429079, "grad_norm": 8.800587055109485, "learning_rate": 1.5000000000000002e-05, "loss": 0.9535, "step": 114 }, { "epoch": 0.06810778797749482, "grad_norm": 14.297791370718492, "learning_rate": 1.5131578947368422e-05, "loss": 0.9063, "step": 115 }, { "epoch": 0.06870002961208173, "grad_norm": 12.532931475700789, "learning_rate": 1.5263157894736846e-05, "loss": 0.9646, "step": 116 }, { "epoch": 0.06929227124666865, "grad_norm": 10.961935762322105, "learning_rate": 1.5394736842105264e-05, "loss": 0.9074, "step": 117 }, { "epoch": 0.06988451288125555, "grad_norm": 16.197406867612408, "learning_rate": 1.5526315789473686e-05, "loss": 0.9213, "step": 118 }, { "epoch": 0.07047675451584247, "grad_norm": 4.618325199175887, "learning_rate": 1.5657894736842107e-05, "loss": 0.9723, "step": 119 }, { "epoch": 0.07106899615042937, "grad_norm": 12.33412742358314, "learning_rate": 1.578947368421053e-05, "loss": 0.9582, "step": 120 }, { "epoch": 0.07166123778501629, "grad_norm": 8.051122431463254, "learning_rate": 1.5921052631578948e-05, "loss": 0.9051, "step": 121 }, { "epoch": 0.0722534794196032, "grad_norm": 247.48175705261465, "learning_rate": 1.605263157894737e-05, "loss": 0.9365, "step": 122 }, { "epoch": 0.07284572105419011, "grad_norm": 4.117202388731062, "learning_rate": 1.618421052631579e-05, "loss": 0.919, "step": 123 }, { "epoch": 0.07343796268877702, "grad_norm": 4.4180789641519755, "learning_rate": 1.6315789473684213e-05, "loss": 0.9452, "step": 124 }, { "epoch": 0.07403020432336394, "grad_norm": 10.408859163405296, "learning_rate": 1.644736842105263e-05, "loss": 0.9171, "step": 125 }, { "epoch": 0.07462244595795084, "grad_norm": 5.812635898712064, "learning_rate": 1.6578947368421053e-05, "loss": 0.9144, "step": 126 }, { "epoch": 0.07521468759253776, "grad_norm": 16.681339227878798, "learning_rate": 1.6710526315789475e-05, "loss": 0.9183, "step": 127 }, { "epoch": 0.07580692922712466, "grad_norm": 5.4477969668787445, "learning_rate": 1.6842105263157896e-05, "loss": 0.9358, "step": 128 }, { "epoch": 0.07639917086171158, "grad_norm": 6.20964288619415, "learning_rate": 1.6973684210526318e-05, "loss": 0.9687, "step": 129 }, { "epoch": 0.07699141249629848, "grad_norm": 16.245213640484007, "learning_rate": 1.7105263157894737e-05, "loss": 0.9135, "step": 130 }, { "epoch": 0.0775836541308854, "grad_norm": 11.924195905537667, "learning_rate": 1.723684210526316e-05, "loss": 0.9306, "step": 131 }, { "epoch": 0.0781758957654723, "grad_norm": 234.49611616589422, "learning_rate": 1.736842105263158e-05, "loss": 0.95, "step": 132 }, { "epoch": 0.07876813740005922, "grad_norm": 5.235845225399804, "learning_rate": 1.7500000000000002e-05, "loss": 0.9042, "step": 133 }, { "epoch": 0.07936037903464614, "grad_norm": 7.032132390324782, "learning_rate": 1.763157894736842e-05, "loss": 0.958, "step": 134 }, { "epoch": 0.07995262066923305, "grad_norm": 9.064372001479308, "learning_rate": 1.7763157894736845e-05, "loss": 0.8924, "step": 135 }, { "epoch": 0.08054486230381996, "grad_norm": 6.104808834461892, "learning_rate": 1.7894736842105264e-05, "loss": 0.9176, "step": 136 }, { "epoch": 0.08113710393840687, "grad_norm": 8.886467300119726, "learning_rate": 1.8026315789473685e-05, "loss": 0.8794, "step": 137 }, { "epoch": 0.08172934557299379, "grad_norm": 7.344989153314532, "learning_rate": 1.8157894736842107e-05, "loss": 0.9546, "step": 138 }, { "epoch": 0.08232158720758069, "grad_norm": 3.97659060586214, "learning_rate": 1.828947368421053e-05, "loss": 0.9321, "step": 139 }, { "epoch": 0.08291382884216761, "grad_norm": 11.103481278827003, "learning_rate": 1.8421052631578947e-05, "loss": 0.9627, "step": 140 }, { "epoch": 0.08350607047675451, "grad_norm": 4.6592325848802565, "learning_rate": 1.8552631578947373e-05, "loss": 0.9245, "step": 141 }, { "epoch": 0.08409831211134143, "grad_norm": 4.7813770731736245, "learning_rate": 1.868421052631579e-05, "loss": 0.9103, "step": 142 }, { "epoch": 0.08469055374592833, "grad_norm": 3.923276266087163, "learning_rate": 1.8815789473684213e-05, "loss": 0.9362, "step": 143 }, { "epoch": 0.08528279538051525, "grad_norm": 5.389802023966097, "learning_rate": 1.894736842105263e-05, "loss": 0.8964, "step": 144 }, { "epoch": 0.08587503701510216, "grad_norm": 3.416809050980661, "learning_rate": 1.9078947368421056e-05, "loss": 0.9409, "step": 145 }, { "epoch": 0.08646727864968907, "grad_norm": 7.72766710731213, "learning_rate": 1.9210526315789474e-05, "loss": 0.9519, "step": 146 }, { "epoch": 0.08705952028427598, "grad_norm": 4.149089365478041, "learning_rate": 1.9342105263157896e-05, "loss": 0.8684, "step": 147 }, { "epoch": 0.0876517619188629, "grad_norm": 3.6666693738461795, "learning_rate": 1.9473684210526318e-05, "loss": 0.9186, "step": 148 }, { "epoch": 0.0882440035534498, "grad_norm": 4.7322097295129195, "learning_rate": 1.960526315789474e-05, "loss": 0.942, "step": 149 }, { "epoch": 0.08883624518803672, "grad_norm": 4.525436493311952, "learning_rate": 1.9736842105263158e-05, "loss": 0.9201, "step": 150 }, { "epoch": 0.08942848682262364, "grad_norm": 4.785414884022584, "learning_rate": 1.986842105263158e-05, "loss": 0.9087, "step": 151 }, { "epoch": 0.09002072845721054, "grad_norm": 6.846600500963897, "learning_rate": 2e-05, "loss": 0.9397, "step": 152 }, { "epoch": 0.09061297009179746, "grad_norm": 10.702916565838894, "learning_rate": 1.9999997954718838e-05, "loss": 0.9056, "step": 153 }, { "epoch": 0.09120521172638436, "grad_norm": 5.25139552872273, "learning_rate": 1.9999991818876183e-05, "loss": 0.9303, "step": 154 }, { "epoch": 0.09179745336097128, "grad_norm": 3.289991163103779, "learning_rate": 1.999998159247455e-05, "loss": 0.8763, "step": 155 }, { "epoch": 0.09238969499555819, "grad_norm": 4.456341363473166, "learning_rate": 1.9999967275518118e-05, "loss": 0.8785, "step": 156 }, { "epoch": 0.0929819366301451, "grad_norm": 3.149442922056867, "learning_rate": 1.9999948868012743e-05, "loss": 0.9021, "step": 157 }, { "epoch": 0.09357417826473201, "grad_norm": 2.4588136993301704, "learning_rate": 1.999992636996596e-05, "loss": 0.8785, "step": 158 }, { "epoch": 0.09416641989931893, "grad_norm": 4.5661756454164255, "learning_rate": 1.9999899781386968e-05, "loss": 0.8916, "step": 159 }, { "epoch": 0.09475866153390583, "grad_norm": 2.30357391498451, "learning_rate": 1.9999869102286638e-05, "loss": 0.9262, "step": 160 }, { "epoch": 0.09535090316849275, "grad_norm": 2.6065419794469467, "learning_rate": 1.9999834332677534e-05, "loss": 0.9292, "step": 161 }, { "epoch": 0.09594314480307965, "grad_norm": 2.8765449701366164, "learning_rate": 1.9999795472573865e-05, "loss": 0.9565, "step": 162 }, { "epoch": 0.09653538643766657, "grad_norm": 2.049749661177928, "learning_rate": 1.9999752521991535e-05, "loss": 0.934, "step": 163 }, { "epoch": 0.09712762807225347, "grad_norm": 2.143243330636517, "learning_rate": 1.9999705480948107e-05, "loss": 0.9237, "step": 164 }, { "epoch": 0.09771986970684039, "grad_norm": 4.902266517773235, "learning_rate": 1.9999654349462828e-05, "loss": 0.9111, "step": 165 }, { "epoch": 0.0983121113414273, "grad_norm": 3.7313193755804397, "learning_rate": 1.9999599127556614e-05, "loss": 0.8766, "step": 166 }, { "epoch": 0.09890435297601421, "grad_norm": 2.121600886528297, "learning_rate": 1.9999539815252048e-05, "loss": 0.8929, "step": 167 }, { "epoch": 0.09949659461060112, "grad_norm": 3.3403412194087374, "learning_rate": 1.99994764125734e-05, "loss": 0.9495, "step": 168 }, { "epoch": 0.10008883624518804, "grad_norm": 2.468640773279232, "learning_rate": 1.99994089195466e-05, "loss": 0.926, "step": 169 }, { "epoch": 0.10068107787977496, "grad_norm": 6.834707950691288, "learning_rate": 1.9999337336199257e-05, "loss": 0.8343, "step": 170 }, { "epoch": 0.10127331951436186, "grad_norm": 8.374475093505808, "learning_rate": 1.9999261662560657e-05, "loss": 0.8807, "step": 171 }, { "epoch": 0.10186556114894878, "grad_norm": 3.7723195507143905, "learning_rate": 1.999918189866175e-05, "loss": 0.9351, "step": 172 }, { "epoch": 0.10245780278353568, "grad_norm": 5.716189938070129, "learning_rate": 1.9999098044535164e-05, "loss": 0.9152, "step": 173 }, { "epoch": 0.1030500444181226, "grad_norm": 9.612635045762634, "learning_rate": 1.9999010100215202e-05, "loss": 0.8843, "step": 174 }, { "epoch": 0.1036422860527095, "grad_norm": 4.703911264627151, "learning_rate": 1.999891806573784e-05, "loss": 0.9066, "step": 175 }, { "epoch": 0.10423452768729642, "grad_norm": 4.8130705468771335, "learning_rate": 1.9998821941140716e-05, "loss": 0.9073, "step": 176 }, { "epoch": 0.10482676932188333, "grad_norm": 4.204520297054476, "learning_rate": 1.999872172646316e-05, "loss": 0.8713, "step": 177 }, { "epoch": 0.10541901095647024, "grad_norm": 9.243056732431207, "learning_rate": 1.9998617421746166e-05, "loss": 0.899, "step": 178 }, { "epoch": 0.10601125259105715, "grad_norm": 7.819090907308756, "learning_rate": 1.9998509027032392e-05, "loss": 0.9286, "step": 179 }, { "epoch": 0.10660349422564407, "grad_norm": 3.3915991356633426, "learning_rate": 1.9998396542366188e-05, "loss": 0.9553, "step": 180 }, { "epoch": 0.10719573586023097, "grad_norm": 6.875890351613609, "learning_rate": 1.9998279967793558e-05, "loss": 0.9107, "step": 181 }, { "epoch": 0.10778797749481789, "grad_norm": 18.641411497663324, "learning_rate": 1.9998159303362193e-05, "loss": 0.9007, "step": 182 }, { "epoch": 0.10838021912940479, "grad_norm": 2.8778263671228794, "learning_rate": 1.9998034549121445e-05, "loss": 0.9288, "step": 183 }, { "epoch": 0.10897246076399171, "grad_norm": 7.460744753488814, "learning_rate": 1.9997905705122352e-05, "loss": 0.9221, "step": 184 }, { "epoch": 0.10956470239857861, "grad_norm": 38.271685018077065, "learning_rate": 1.9997772771417615e-05, "loss": 0.8964, "step": 185 }, { "epoch": 0.11015694403316553, "grad_norm": 5.046727664405057, "learning_rate": 1.9997635748061615e-05, "loss": 0.9444, "step": 186 }, { "epoch": 0.11074918566775244, "grad_norm": 8.624881305642129, "learning_rate": 1.99974946351104e-05, "loss": 0.9131, "step": 187 }, { "epoch": 0.11134142730233935, "grad_norm": 8.837284064529285, "learning_rate": 1.999734943262169e-05, "loss": 0.906, "step": 188 }, { "epoch": 0.11193366893692627, "grad_norm": 10.041268687119777, "learning_rate": 1.999720014065489e-05, "loss": 0.901, "step": 189 }, { "epoch": 0.11252591057151318, "grad_norm": 2.6502443902067765, "learning_rate": 1.9997046759271055e-05, "loss": 0.9126, "step": 190 }, { "epoch": 0.1131181522061001, "grad_norm": 5.889543292461257, "learning_rate": 1.999688928853294e-05, "loss": 0.9007, "step": 191 }, { "epoch": 0.113710393840687, "grad_norm": 3.5995531544601578, "learning_rate": 1.999672772850495e-05, "loss": 0.8766, "step": 192 }, { "epoch": 0.11430263547527392, "grad_norm": 1.796533079320344, "learning_rate": 1.9996562079253177e-05, "loss": 0.9259, "step": 193 }, { "epoch": 0.11489487710986082, "grad_norm": 3.417277613043803, "learning_rate": 1.999639234084538e-05, "loss": 0.8797, "step": 194 }, { "epoch": 0.11548711874444774, "grad_norm": 4.546208064925828, "learning_rate": 1.999621851335099e-05, "loss": 0.9833, "step": 195 }, { "epoch": 0.11607936037903464, "grad_norm": 1.8321385514505888, "learning_rate": 1.9996040596841118e-05, "loss": 0.8972, "step": 196 }, { "epoch": 0.11667160201362156, "grad_norm": 4.278863948371289, "learning_rate": 1.9995858591388532e-05, "loss": 0.9177, "step": 197 }, { "epoch": 0.11726384364820847, "grad_norm": 2.5684660110656488, "learning_rate": 1.999567249706769e-05, "loss": 0.9269, "step": 198 }, { "epoch": 0.11785608528279538, "grad_norm": 2.6222054577372784, "learning_rate": 1.9995482313954713e-05, "loss": 0.9453, "step": 199 }, { "epoch": 0.11844832691738229, "grad_norm": 2.4017263015983557, "learning_rate": 1.9995288042127396e-05, "loss": 0.9334, "step": 200 }, { "epoch": 0.1190405685519692, "grad_norm": 3.4487920247999244, "learning_rate": 1.999508968166521e-05, "loss": 0.8751, "step": 201 }, { "epoch": 0.11963281018655611, "grad_norm": 4.923653355915247, "learning_rate": 1.999488723264929e-05, "loss": 0.9157, "step": 202 }, { "epoch": 0.12022505182114303, "grad_norm": 2.210102296979183, "learning_rate": 1.9994680695162453e-05, "loss": 0.9256, "step": 203 }, { "epoch": 0.12081729345572993, "grad_norm": 3.8316888952990786, "learning_rate": 1.999447006928918e-05, "loss": 0.8524, "step": 204 }, { "epoch": 0.12140953509031685, "grad_norm": 1.703827800390735, "learning_rate": 1.999425535511564e-05, "loss": 0.8918, "step": 205 }, { "epoch": 0.12200177672490375, "grad_norm": 3.5473701576710623, "learning_rate": 1.999403655272965e-05, "loss": 0.8804, "step": 206 }, { "epoch": 0.12259401835949067, "grad_norm": 2.2496153840955415, "learning_rate": 1.999381366222072e-05, "loss": 0.9551, "step": 207 }, { "epoch": 0.12318625999407759, "grad_norm": 2.126988606794187, "learning_rate": 1.999358668368002e-05, "loss": 0.8835, "step": 208 }, { "epoch": 0.1237785016286645, "grad_norm": 2.660694810075823, "learning_rate": 1.9993355617200404e-05, "loss": 0.8951, "step": 209 }, { "epoch": 0.12437074326325141, "grad_norm": 4.602299174530589, "learning_rate": 1.9993120462876385e-05, "loss": 0.9475, "step": 210 }, { "epoch": 0.12496298489783832, "grad_norm": 3.931184146620322, "learning_rate": 1.9992881220804157e-05, "loss": 0.8922, "step": 211 }, { "epoch": 0.12555522653242523, "grad_norm": 2.6749433222616292, "learning_rate": 1.9992637891081585e-05, "loss": 0.9381, "step": 212 }, { "epoch": 0.12614746816701214, "grad_norm": 3.4189957335214105, "learning_rate": 1.9992390473808195e-05, "loss": 0.8404, "step": 213 }, { "epoch": 0.12673970980159904, "grad_norm": 2.1866401943466, "learning_rate": 1.999213896908521e-05, "loss": 0.8585, "step": 214 }, { "epoch": 0.12733195143618598, "grad_norm": 1.6522576171627141, "learning_rate": 1.9991883377015497e-05, "loss": 0.8548, "step": 215 }, { "epoch": 0.12792419307077288, "grad_norm": 2.2509610072884074, "learning_rate": 1.9991623697703613e-05, "loss": 0.9282, "step": 216 }, { "epoch": 0.12851643470535978, "grad_norm": 4.028256239000191, "learning_rate": 1.9991359931255782e-05, "loss": 0.9042, "step": 217 }, { "epoch": 0.1291086763399467, "grad_norm": 1.8685142977539413, "learning_rate": 1.9991092077779895e-05, "loss": 0.9028, "step": 218 }, { "epoch": 0.12970091797453362, "grad_norm": 2.87573942157228, "learning_rate": 1.9990820137385525e-05, "loss": 0.8854, "step": 219 }, { "epoch": 0.13029315960912052, "grad_norm": 2.4941325943289323, "learning_rate": 1.9990544110183907e-05, "loss": 0.9238, "step": 220 }, { "epoch": 0.13088540124370743, "grad_norm": 3.4805013559739235, "learning_rate": 1.999026399628795e-05, "loss": 0.8405, "step": 221 }, { "epoch": 0.13147764287829433, "grad_norm": 1.8670578876428667, "learning_rate": 1.998997979581224e-05, "loss": 0.8898, "step": 222 }, { "epoch": 0.13206988451288126, "grad_norm": 2.2661788538769043, "learning_rate": 1.9989691508873032e-05, "loss": 0.9229, "step": 223 }, { "epoch": 0.13266212614746817, "grad_norm": 2.499121551088041, "learning_rate": 1.9989399135588246e-05, "loss": 0.9035, "step": 224 }, { "epoch": 0.13325436778205507, "grad_norm": 3.4358794596174334, "learning_rate": 1.9989102676077484e-05, "loss": 0.9143, "step": 225 }, { "epoch": 0.13384660941664198, "grad_norm": 3.4214432168583353, "learning_rate": 1.9988802130462017e-05, "loss": 0.9384, "step": 226 }, { "epoch": 0.1344388510512289, "grad_norm": 2.544662580599115, "learning_rate": 1.9988497498864776e-05, "loss": 0.8979, "step": 227 }, { "epoch": 0.1350310926858158, "grad_norm": 2.087938927884032, "learning_rate": 1.9988188781410377e-05, "loss": 0.8918, "step": 228 }, { "epoch": 0.13562333432040272, "grad_norm": 3.1166292965067925, "learning_rate": 1.9987875978225107e-05, "loss": 0.9169, "step": 229 }, { "epoch": 0.13621557595498965, "grad_norm": 2.9358196092245383, "learning_rate": 1.9987559089436917e-05, "loss": 0.9254, "step": 230 }, { "epoch": 0.13680781758957655, "grad_norm": 4.473321811652426, "learning_rate": 1.9987238115175428e-05, "loss": 0.9227, "step": 231 }, { "epoch": 0.13740005922416346, "grad_norm": 3.988884973768667, "learning_rate": 1.998691305557194e-05, "loss": 0.8807, "step": 232 }, { "epoch": 0.13799230085875036, "grad_norm": 3.5180020915988632, "learning_rate": 1.9986583910759427e-05, "loss": 0.9089, "step": 233 }, { "epoch": 0.1385845424933373, "grad_norm": 4.311583066844405, "learning_rate": 1.9986250680872515e-05, "loss": 0.8793, "step": 234 }, { "epoch": 0.1391767841279242, "grad_norm": 4.094646789970844, "learning_rate": 1.9985913366047524e-05, "loss": 0.8717, "step": 235 }, { "epoch": 0.1397690257625111, "grad_norm": 5.37580338032932, "learning_rate": 1.998557196642243e-05, "loss": 0.9662, "step": 236 }, { "epoch": 0.140361267397098, "grad_norm": 5.374652282579761, "learning_rate": 1.9985226482136887e-05, "loss": 0.9267, "step": 237 }, { "epoch": 0.14095350903168494, "grad_norm": 8.012244025293068, "learning_rate": 1.9984876913332215e-05, "loss": 0.9403, "step": 238 }, { "epoch": 0.14154575066627184, "grad_norm": 4.000814675128233, "learning_rate": 1.998452326015141e-05, "loss": 0.9192, "step": 239 }, { "epoch": 0.14213799230085875, "grad_norm": 2.743822493310051, "learning_rate": 1.9984165522739135e-05, "loss": 0.876, "step": 240 }, { "epoch": 0.14273023393544565, "grad_norm": 3.584538417944988, "learning_rate": 1.9983803701241723e-05, "loss": 0.9261, "step": 241 }, { "epoch": 0.14332247557003258, "grad_norm": 7.9272167764769845, "learning_rate": 1.998343779580718e-05, "loss": 0.8728, "step": 242 }, { "epoch": 0.14391471720461949, "grad_norm": 2.481390145790585, "learning_rate": 1.9983067806585184e-05, "loss": 0.8885, "step": 243 }, { "epoch": 0.1445069588392064, "grad_norm": 6.220923433805533, "learning_rate": 1.998269373372708e-05, "loss": 0.8931, "step": 244 }, { "epoch": 0.1450992004737933, "grad_norm": 4.045398395801386, "learning_rate": 1.9982315577385885e-05, "loss": 0.8867, "step": 245 }, { "epoch": 0.14569144210838023, "grad_norm": 5.776802690732297, "learning_rate": 1.9981933337716288e-05, "loss": 0.8379, "step": 246 }, { "epoch": 0.14628368374296713, "grad_norm": 14.36864250446338, "learning_rate": 1.998154701487464e-05, "loss": 0.9106, "step": 247 }, { "epoch": 0.14687592537755403, "grad_norm": 3.967462127445808, "learning_rate": 1.9981156609018977e-05, "loss": 0.9349, "step": 248 }, { "epoch": 0.14746816701214097, "grad_norm": 3.265424009929576, "learning_rate": 1.998076212030899e-05, "loss": 0.8946, "step": 249 }, { "epoch": 0.14806040864672787, "grad_norm": 5.252463182631547, "learning_rate": 1.9980363548906056e-05, "loss": 0.9346, "step": 250 }, { "epoch": 0.14865265028131477, "grad_norm": 3.913170557844579, "learning_rate": 1.9979960894973202e-05, "loss": 0.9643, "step": 251 }, { "epoch": 0.14924489191590168, "grad_norm": 10.316912742558536, "learning_rate": 1.9979554158675145e-05, "loss": 0.8396, "step": 252 }, { "epoch": 0.1498371335504886, "grad_norm": 3.808990535792799, "learning_rate": 1.9979143340178258e-05, "loss": 0.9148, "step": 253 }, { "epoch": 0.15042937518507551, "grad_norm": 12.348960229120108, "learning_rate": 1.997872843965059e-05, "loss": 0.8809, "step": 254 }, { "epoch": 0.15102161681966242, "grad_norm": 7.004456279640057, "learning_rate": 1.997830945726186e-05, "loss": 0.8835, "step": 255 }, { "epoch": 0.15161385845424932, "grad_norm": 7.66590001684898, "learning_rate": 1.9977886393183454e-05, "loss": 0.8604, "step": 256 }, { "epoch": 0.15220610008883625, "grad_norm": 3.272971996729421, "learning_rate": 1.997745924758843e-05, "loss": 0.9124, "step": 257 }, { "epoch": 0.15279834172342316, "grad_norm": 3.163402751353713, "learning_rate": 1.9977028020651516e-05, "loss": 0.9089, "step": 258 }, { "epoch": 0.15339058335801006, "grad_norm": 9.234252191086176, "learning_rate": 1.9976592712549102e-05, "loss": 0.9354, "step": 259 }, { "epoch": 0.15398282499259697, "grad_norm": 3.471952759539684, "learning_rate": 1.9976153323459262e-05, "loss": 0.9084, "step": 260 }, { "epoch": 0.1545750666271839, "grad_norm": 19.265326355686057, "learning_rate": 1.9975709853561725e-05, "loss": 0.8811, "step": 261 }, { "epoch": 0.1551673082617708, "grad_norm": 25.404153390256994, "learning_rate": 1.9975262303037896e-05, "loss": 0.9463, "step": 262 }, { "epoch": 0.1557595498963577, "grad_norm": 1.790796295391984, "learning_rate": 1.997481067207085e-05, "loss": 0.9076, "step": 263 }, { "epoch": 0.1563517915309446, "grad_norm": 1.5429882740755128, "learning_rate": 1.9974354960845326e-05, "loss": 0.8787, "step": 264 }, { "epoch": 0.15694403316553154, "grad_norm": 1.8706636553302713, "learning_rate": 1.997389516954774e-05, "loss": 0.9265, "step": 265 }, { "epoch": 0.15753627480011845, "grad_norm": 2.730000619216694, "learning_rate": 1.997343129836617e-05, "loss": 0.9074, "step": 266 }, { "epoch": 0.15812851643470535, "grad_norm": 6.628653976491072, "learning_rate": 1.9972963347490366e-05, "loss": 0.8885, "step": 267 }, { "epoch": 0.15872075806929228, "grad_norm": 1.9178468971516744, "learning_rate": 1.9972491317111745e-05, "loss": 0.8867, "step": 268 }, { "epoch": 0.1593129997038792, "grad_norm": 3.3558537963254267, "learning_rate": 1.9972015207423396e-05, "loss": 0.844, "step": 269 }, { "epoch": 0.1599052413384661, "grad_norm": 4.932473991808671, "learning_rate": 1.997153501862007e-05, "loss": 0.9451, "step": 270 }, { "epoch": 0.160497482973053, "grad_norm": 1.7319648821727869, "learning_rate": 1.99710507508982e-05, "loss": 0.8596, "step": 271 }, { "epoch": 0.16108972460763993, "grad_norm": 3.1569769697720558, "learning_rate": 1.9970562404455872e-05, "loss": 0.89, "step": 272 }, { "epoch": 0.16168196624222683, "grad_norm": 3.6061121751456753, "learning_rate": 1.9970069979492846e-05, "loss": 0.8865, "step": 273 }, { "epoch": 0.16227420787681374, "grad_norm": 19.08626640686028, "learning_rate": 1.9969573476210558e-05, "loss": 0.9106, "step": 274 }, { "epoch": 0.16286644951140064, "grad_norm": 1.5734768192780313, "learning_rate": 1.99690728948121e-05, "loss": 0.9331, "step": 275 }, { "epoch": 0.16345869114598757, "grad_norm": 1.8545986500276248, "learning_rate": 1.996856823550224e-05, "loss": 0.8472, "step": 276 }, { "epoch": 0.16405093278057448, "grad_norm": 2.3229100358592563, "learning_rate": 1.9968059498487415e-05, "loss": 0.8914, "step": 277 }, { "epoch": 0.16464317441516138, "grad_norm": 1.736892427592765, "learning_rate": 1.996754668397572e-05, "loss": 0.9359, "step": 278 }, { "epoch": 0.16523541604974828, "grad_norm": 3.6486468336727444, "learning_rate": 1.9967029792176932e-05, "loss": 0.8875, "step": 279 }, { "epoch": 0.16582765768433522, "grad_norm": 1.7408683315370481, "learning_rate": 1.9966508823302484e-05, "loss": 0.877, "step": 280 }, { "epoch": 0.16641989931892212, "grad_norm": 1.8687430980001674, "learning_rate": 1.9965983777565483e-05, "loss": 0.8905, "step": 281 }, { "epoch": 0.16701214095350903, "grad_norm": 3.904642504972808, "learning_rate": 1.9965454655180704e-05, "loss": 0.852, "step": 282 }, { "epoch": 0.16760438258809596, "grad_norm": 1.9130632597634436, "learning_rate": 1.9964921456364584e-05, "loss": 0.9199, "step": 283 }, { "epoch": 0.16819662422268286, "grad_norm": 2.675290531374258, "learning_rate": 1.9964384181335237e-05, "loss": 0.9072, "step": 284 }, { "epoch": 0.16878886585726977, "grad_norm": 1.5847471748990105, "learning_rate": 1.9963842830312434e-05, "loss": 0.9185, "step": 285 }, { "epoch": 0.16938110749185667, "grad_norm": 10.321315145183203, "learning_rate": 1.996329740351762e-05, "loss": 0.8816, "step": 286 }, { "epoch": 0.1699733491264436, "grad_norm": 10.276955241455104, "learning_rate": 1.9962747901173904e-05, "loss": 0.9097, "step": 287 }, { "epoch": 0.1705655907610305, "grad_norm": 2.3615243550709155, "learning_rate": 1.9962194323506064e-05, "loss": 0.928, "step": 288 }, { "epoch": 0.1711578323956174, "grad_norm": 1.1933250899884889, "learning_rate": 1.9961636670740546e-05, "loss": 0.9304, "step": 289 }, { "epoch": 0.1717500740302043, "grad_norm": 3.5837635071883396, "learning_rate": 1.9961074943105457e-05, "loss": 0.9264, "step": 290 }, { "epoch": 0.17234231566479125, "grad_norm": 1.4231322142951308, "learning_rate": 1.996050914083058e-05, "loss": 0.915, "step": 291 }, { "epoch": 0.17293455729937815, "grad_norm": 2.3576679543712724, "learning_rate": 1.9959939264147355e-05, "loss": 0.9305, "step": 292 }, { "epoch": 0.17352679893396505, "grad_norm": 2.292913955194318, "learning_rate": 1.99593653132889e-05, "loss": 0.9098, "step": 293 }, { "epoch": 0.17411904056855196, "grad_norm": 1.9398072904004944, "learning_rate": 1.9958787288489983e-05, "loss": 0.9107, "step": 294 }, { "epoch": 0.1747112822031389, "grad_norm": 11.08115964739409, "learning_rate": 1.9958205189987066e-05, "loss": 0.8475, "step": 295 }, { "epoch": 0.1753035238377258, "grad_norm": 2.6304317819640852, "learning_rate": 1.9957619018018243e-05, "loss": 0.8905, "step": 296 }, { "epoch": 0.1758957654723127, "grad_norm": 3.3173060476207477, "learning_rate": 1.99570287728233e-05, "loss": 0.8585, "step": 297 }, { "epoch": 0.1764880071068996, "grad_norm": 1.7335753531753704, "learning_rate": 1.9956434454643675e-05, "loss": 0.8835, "step": 298 }, { "epoch": 0.17708024874148653, "grad_norm": 1.5781518168435433, "learning_rate": 1.995583606372248e-05, "loss": 0.8636, "step": 299 }, { "epoch": 0.17767249037607344, "grad_norm": 3.186713245410372, "learning_rate": 1.9955233600304496e-05, "loss": 0.8857, "step": 300 }, { "epoch": 0.17826473201066034, "grad_norm": 2.4742503690715254, "learning_rate": 1.9954627064636157e-05, "loss": 0.9306, "step": 301 }, { "epoch": 0.17885697364524727, "grad_norm": 2.1188838286209513, "learning_rate": 1.995401645696557e-05, "loss": 0.8874, "step": 302 }, { "epoch": 0.17944921527983418, "grad_norm": 1.979483491889314, "learning_rate": 1.9953401777542517e-05, "loss": 0.9408, "step": 303 }, { "epoch": 0.18004145691442108, "grad_norm": 1.285049652875338, "learning_rate": 1.9952783026618424e-05, "loss": 0.8872, "step": 304 }, { "epoch": 0.180633698549008, "grad_norm": 1.9180889335059634, "learning_rate": 1.9952160204446404e-05, "loss": 0.8832, "step": 305 }, { "epoch": 0.18122594018359492, "grad_norm": 2.556643811478596, "learning_rate": 1.995153331128122e-05, "loss": 0.8946, "step": 306 }, { "epoch": 0.18181818181818182, "grad_norm": 2.4449018469180723, "learning_rate": 1.9950902347379316e-05, "loss": 0.9229, "step": 307 }, { "epoch": 0.18241042345276873, "grad_norm": 2.436648980404225, "learning_rate": 1.9950267312998783e-05, "loss": 0.9155, "step": 308 }, { "epoch": 0.18300266508735563, "grad_norm": 2.1396571184843793, "learning_rate": 1.9949628208399388e-05, "loss": 0.9097, "step": 309 }, { "epoch": 0.18359490672194256, "grad_norm": 1.8397633591444138, "learning_rate": 1.994898503384256e-05, "loss": 0.9461, "step": 310 }, { "epoch": 0.18418714835652947, "grad_norm": 5.288654703997796, "learning_rate": 1.9948337789591396e-05, "loss": 0.9367, "step": 311 }, { "epoch": 0.18477938999111637, "grad_norm": 1.8505284885364202, "learning_rate": 1.9947686475910656e-05, "loss": 0.8994, "step": 312 }, { "epoch": 0.18537163162570328, "grad_norm": 1.655781456697678, "learning_rate": 1.9947031093066758e-05, "loss": 0.8772, "step": 313 }, { "epoch": 0.1859638732602902, "grad_norm": 2.0037040325972835, "learning_rate": 1.9946371641327794e-05, "loss": 0.9276, "step": 314 }, { "epoch": 0.1865561148948771, "grad_norm": 4.111557030910506, "learning_rate": 1.994570812096352e-05, "loss": 0.9274, "step": 315 }, { "epoch": 0.18714835652946402, "grad_norm": 1.6261247430270642, "learning_rate": 1.9945040532245352e-05, "loss": 0.9014, "step": 316 }, { "epoch": 0.18774059816405092, "grad_norm": 1.6162415339185936, "learning_rate": 1.9944368875446363e-05, "loss": 0.9511, "step": 317 }, { "epoch": 0.18833283979863785, "grad_norm": 1.5148471582795817, "learning_rate": 1.9943693150841312e-05, "loss": 0.8605, "step": 318 }, { "epoch": 0.18892508143322476, "grad_norm": 1.540594634489607, "learning_rate": 1.99430133587066e-05, "loss": 0.8838, "step": 319 }, { "epoch": 0.18951732306781166, "grad_norm": 2.049032648950319, "learning_rate": 1.9942329499320298e-05, "loss": 0.9253, "step": 320 }, { "epoch": 0.1901095647023986, "grad_norm": 1.977962711697837, "learning_rate": 1.994164157296215e-05, "loss": 0.921, "step": 321 }, { "epoch": 0.1907018063369855, "grad_norm": 5.4655707773851265, "learning_rate": 1.994094957991355e-05, "loss": 0.8246, "step": 322 }, { "epoch": 0.1912940479715724, "grad_norm": 1.264635984959813, "learning_rate": 1.994025352045757e-05, "loss": 0.8664, "step": 323 }, { "epoch": 0.1918862896061593, "grad_norm": 2.1323498253950275, "learning_rate": 1.9939553394878926e-05, "loss": 0.9058, "step": 324 }, { "epoch": 0.19247853124074624, "grad_norm": 1.9659140774542014, "learning_rate": 1.9938849203464023e-05, "loss": 0.8865, "step": 325 }, { "epoch": 0.19307077287533314, "grad_norm": 1.9108336737757217, "learning_rate": 1.99381409465009e-05, "loss": 0.9114, "step": 326 }, { "epoch": 0.19366301450992005, "grad_norm": 1.9649788148181904, "learning_rate": 1.9937428624279284e-05, "loss": 0.8786, "step": 327 }, { "epoch": 0.19425525614450695, "grad_norm": 1.999999763126164, "learning_rate": 1.9936712237090554e-05, "loss": 0.8931, "step": 328 }, { "epoch": 0.19484749777909388, "grad_norm": 2.6307075206759976, "learning_rate": 1.993599178522775e-05, "loss": 0.8924, "step": 329 }, { "epoch": 0.19543973941368079, "grad_norm": 1.6672297479747191, "learning_rate": 1.9935267268985577e-05, "loss": 0.8818, "step": 330 }, { "epoch": 0.1960319810482677, "grad_norm": 1.9888771189937435, "learning_rate": 1.9934538688660403e-05, "loss": 0.9191, "step": 331 }, { "epoch": 0.1966242226828546, "grad_norm": 2.656316010840173, "learning_rate": 1.9933806044550262e-05, "loss": 0.8806, "step": 332 }, { "epoch": 0.19721646431744153, "grad_norm": 4.286512255009421, "learning_rate": 1.9933069336954842e-05, "loss": 0.8969, "step": 333 }, { "epoch": 0.19780870595202843, "grad_norm": 3.055765430366512, "learning_rate": 1.99323285661755e-05, "loss": 0.9115, "step": 334 }, { "epoch": 0.19840094758661533, "grad_norm": 1.7964416584438196, "learning_rate": 1.9931583732515252e-05, "loss": 0.8734, "step": 335 }, { "epoch": 0.19899318922120224, "grad_norm": 1.8519391859954, "learning_rate": 1.993083483627878e-05, "loss": 0.9265, "step": 336 }, { "epoch": 0.19958543085578917, "grad_norm": 1.9971914645128772, "learning_rate": 1.993008187777242e-05, "loss": 0.8784, "step": 337 }, { "epoch": 0.20017767249037607, "grad_norm": 2.4232990241295522, "learning_rate": 1.9929324857304175e-05, "loss": 0.8837, "step": 338 }, { "epoch": 0.20076991412496298, "grad_norm": 1.4178062062472774, "learning_rate": 1.9928563775183713e-05, "loss": 0.8635, "step": 339 }, { "epoch": 0.2013621557595499, "grad_norm": 4.756076969204727, "learning_rate": 1.9927798631722353e-05, "loss": 0.9008, "step": 340 }, { "epoch": 0.20195439739413681, "grad_norm": 5.072255729219359, "learning_rate": 1.992702942723309e-05, "loss": 0.8908, "step": 341 }, { "epoch": 0.20254663902872372, "grad_norm": 3.724264153241877, "learning_rate": 1.9926256162030564e-05, "loss": 0.893, "step": 342 }, { "epoch": 0.20313888066331062, "grad_norm": 2.284558332966103, "learning_rate": 1.992547883643109e-05, "loss": 0.8823, "step": 343 }, { "epoch": 0.20373112229789755, "grad_norm": 7.51141744653435, "learning_rate": 1.9924697450752636e-05, "loss": 0.8618, "step": 344 }, { "epoch": 0.20432336393248446, "grad_norm": 2.7293707676417487, "learning_rate": 1.9923912005314827e-05, "loss": 0.8983, "step": 345 }, { "epoch": 0.20491560556707136, "grad_norm": 3.2471120301938776, "learning_rate": 1.9923122500438964e-05, "loss": 0.9105, "step": 346 }, { "epoch": 0.20550784720165827, "grad_norm": 6.07549147250831, "learning_rate": 1.9922328936447992e-05, "loss": 0.8483, "step": 347 }, { "epoch": 0.2061000888362452, "grad_norm": 2.791357973212501, "learning_rate": 1.9921531313666526e-05, "loss": 0.8484, "step": 348 }, { "epoch": 0.2066923304708321, "grad_norm": 2.076877360733566, "learning_rate": 1.992072963242084e-05, "loss": 0.9362, "step": 349 }, { "epoch": 0.207284572105419, "grad_norm": 2.7096944494505024, "learning_rate": 1.9919923893038863e-05, "loss": 0.8252, "step": 350 }, { "epoch": 0.2078768137400059, "grad_norm": 2.355203227286866, "learning_rate": 1.991911409585019e-05, "loss": 0.92, "step": 351 }, { "epoch": 0.20846905537459284, "grad_norm": 3.436005203104444, "learning_rate": 1.991830024118607e-05, "loss": 0.8602, "step": 352 }, { "epoch": 0.20906129700917975, "grad_norm": 2.3276079656629856, "learning_rate": 1.991748232937942e-05, "loss": 0.8665, "step": 353 }, { "epoch": 0.20965353864376665, "grad_norm": 3.406633969476107, "learning_rate": 1.991666036076481e-05, "loss": 0.8774, "step": 354 }, { "epoch": 0.21024578027835356, "grad_norm": 3.2519752361578043, "learning_rate": 1.9915834335678476e-05, "loss": 0.938, "step": 355 }, { "epoch": 0.2108380219129405, "grad_norm": 1.5571411069585908, "learning_rate": 1.99150042544583e-05, "loss": 0.8665, "step": 356 }, { "epoch": 0.2114302635475274, "grad_norm": 1.797561656093368, "learning_rate": 1.9914170117443837e-05, "loss": 0.8467, "step": 357 }, { "epoch": 0.2120225051821143, "grad_norm": 2.173677269835551, "learning_rate": 1.9913331924976295e-05, "loss": 0.8809, "step": 358 }, { "epoch": 0.21261474681670123, "grad_norm": 1.4857064069360264, "learning_rate": 1.9912489677398543e-05, "loss": 0.8524, "step": 359 }, { "epoch": 0.21320698845128813, "grad_norm": 1.5451577112951609, "learning_rate": 1.991164337505511e-05, "loss": 0.8625, "step": 360 }, { "epoch": 0.21379923008587504, "grad_norm": 4.152036575903902, "learning_rate": 1.9910793018292168e-05, "loss": 0.891, "step": 361 }, { "epoch": 0.21439147172046194, "grad_norm": 1.7176492006424662, "learning_rate": 1.990993860745758e-05, "loss": 0.8393, "step": 362 }, { "epoch": 0.21498371335504887, "grad_norm": 16.044944435431123, "learning_rate": 1.9909080142900835e-05, "loss": 0.8994, "step": 363 }, { "epoch": 0.21557595498963578, "grad_norm": 3.070726746879335, "learning_rate": 1.99082176249731e-05, "loss": 0.9275, "step": 364 }, { "epoch": 0.21616819662422268, "grad_norm": 11.949735878042022, "learning_rate": 1.9907351054027188e-05, "loss": 0.8794, "step": 365 }, { "epoch": 0.21676043825880958, "grad_norm": 4.743677400565406, "learning_rate": 1.9906480430417575e-05, "loss": 0.8665, "step": 366 }, { "epoch": 0.21735267989339652, "grad_norm": 6.192596337832938, "learning_rate": 1.99056057545004e-05, "loss": 0.8838, "step": 367 }, { "epoch": 0.21794492152798342, "grad_norm": 2.1298876915873426, "learning_rate": 1.9904727026633453e-05, "loss": 0.8655, "step": 368 }, { "epoch": 0.21853716316257032, "grad_norm": 2.872112183326634, "learning_rate": 1.9903844247176182e-05, "loss": 0.8857, "step": 369 }, { "epoch": 0.21912940479715723, "grad_norm": 2.116013503348104, "learning_rate": 1.9902957416489693e-05, "loss": 0.8675, "step": 370 }, { "epoch": 0.21972164643174416, "grad_norm": 1.9650140489362669, "learning_rate": 1.990206653493675e-05, "loss": 0.8901, "step": 371 }, { "epoch": 0.22031388806633107, "grad_norm": 2.1533395141766847, "learning_rate": 1.9901171602881778e-05, "loss": 0.8268, "step": 372 }, { "epoch": 0.22090612970091797, "grad_norm": 1.8679750441425857, "learning_rate": 1.990027262069085e-05, "loss": 0.8956, "step": 373 }, { "epoch": 0.22149837133550487, "grad_norm": 1.876927910343411, "learning_rate": 1.9899369588731697e-05, "loss": 0.915, "step": 374 }, { "epoch": 0.2220906129700918, "grad_norm": 1.966063586945644, "learning_rate": 1.9898462507373713e-05, "loss": 0.85, "step": 375 }, { "epoch": 0.2226828546046787, "grad_norm": 5.175814315545093, "learning_rate": 1.9897551376987948e-05, "loss": 0.8877, "step": 376 }, { "epoch": 0.2232750962392656, "grad_norm": 3.3647238312551293, "learning_rate": 1.9896636197947104e-05, "loss": 0.9013, "step": 377 }, { "epoch": 0.22386733787385255, "grad_norm": 3.0764967820975113, "learning_rate": 1.9895716970625544e-05, "loss": 0.9351, "step": 378 }, { "epoch": 0.22445957950843945, "grad_norm": 2.601494620918347, "learning_rate": 1.9894793695399276e-05, "loss": 0.8853, "step": 379 }, { "epoch": 0.22505182114302635, "grad_norm": 2.4828022845602735, "learning_rate": 1.9893866372645975e-05, "loss": 0.8749, "step": 380 }, { "epoch": 0.22564406277761326, "grad_norm": 2.0904211550387846, "learning_rate": 1.989293500274497e-05, "loss": 0.8613, "step": 381 }, { "epoch": 0.2262363044122002, "grad_norm": 2.594547419908662, "learning_rate": 1.989199958607724e-05, "loss": 0.8853, "step": 382 }, { "epoch": 0.2268285460467871, "grad_norm": 3.3824116389291046, "learning_rate": 1.9891060123025427e-05, "loss": 0.8458, "step": 383 }, { "epoch": 0.227420787681374, "grad_norm": 2.388543281852487, "learning_rate": 1.9890116613973822e-05, "loss": 0.8779, "step": 384 }, { "epoch": 0.2280130293159609, "grad_norm": 2.3270199405136425, "learning_rate": 1.9889169059308374e-05, "loss": 0.9211, "step": 385 }, { "epoch": 0.22860527095054783, "grad_norm": 3.4392018992585625, "learning_rate": 1.9888217459416685e-05, "loss": 0.8896, "step": 386 }, { "epoch": 0.22919751258513474, "grad_norm": 2.9845110896198865, "learning_rate": 1.9887261814688017e-05, "loss": 0.8656, "step": 387 }, { "epoch": 0.22978975421972164, "grad_norm": 1.7722551264363466, "learning_rate": 1.9886302125513276e-05, "loss": 0.9233, "step": 388 }, { "epoch": 0.23038199585430855, "grad_norm": 2.3468863928571864, "learning_rate": 1.9885338392285032e-05, "loss": 0.9028, "step": 389 }, { "epoch": 0.23097423748889548, "grad_norm": 2.2088822196102136, "learning_rate": 1.9884370615397507e-05, "loss": 0.876, "step": 390 }, { "epoch": 0.23156647912348238, "grad_norm": 2.954411705632498, "learning_rate": 1.9883398795246577e-05, "loss": 0.9139, "step": 391 }, { "epoch": 0.2321587207580693, "grad_norm": 2.2449696353833963, "learning_rate": 1.9882422932229765e-05, "loss": 0.8882, "step": 392 }, { "epoch": 0.2327509623926562, "grad_norm": 1.911476833050929, "learning_rate": 1.988144302674626e-05, "loss": 0.8947, "step": 393 }, { "epoch": 0.23334320402724312, "grad_norm": 2.0157438479335625, "learning_rate": 1.9880459079196898e-05, "loss": 0.8994, "step": 394 }, { "epoch": 0.23393544566183003, "grad_norm": 2.2549219273884655, "learning_rate": 1.9879471089984168e-05, "loss": 0.8613, "step": 395 }, { "epoch": 0.23452768729641693, "grad_norm": 2.369416456819026, "learning_rate": 1.9878479059512212e-05, "loss": 0.8987, "step": 396 }, { "epoch": 0.23511992893100386, "grad_norm": 2.065708029987891, "learning_rate": 1.9877482988186825e-05, "loss": 0.8903, "step": 397 }, { "epoch": 0.23571217056559077, "grad_norm": 1.7370389947259615, "learning_rate": 1.987648287641546e-05, "loss": 0.9043, "step": 398 }, { "epoch": 0.23630441220017767, "grad_norm": 3.4678559229085546, "learning_rate": 1.987547872460722e-05, "loss": 0.867, "step": 399 }, { "epoch": 0.23689665383476458, "grad_norm": 2.1054707516143694, "learning_rate": 1.987447053317285e-05, "loss": 0.863, "step": 400 }, { "epoch": 0.2374888954693515, "grad_norm": 2.820738998283495, "learning_rate": 1.9873458302524767e-05, "loss": 0.9357, "step": 401 }, { "epoch": 0.2380811371039384, "grad_norm": 4.024264238834477, "learning_rate": 1.9872442033077027e-05, "loss": 0.8398, "step": 402 }, { "epoch": 0.23867337873852532, "grad_norm": 2.7872478316820755, "learning_rate": 1.9871421725245342e-05, "loss": 0.8621, "step": 403 }, { "epoch": 0.23926562037311222, "grad_norm": 2.641334532671031, "learning_rate": 1.9870397379447074e-05, "loss": 0.9111, "step": 404 }, { "epoch": 0.23985786200769915, "grad_norm": 11.371926019038037, "learning_rate": 1.9869368996101238e-05, "loss": 0.863, "step": 405 }, { "epoch": 0.24045010364228606, "grad_norm": 2.7635300627133925, "learning_rate": 1.98683365756285e-05, "loss": 0.866, "step": 406 }, { "epoch": 0.24104234527687296, "grad_norm": 2.7794994837778084, "learning_rate": 1.986730011845118e-05, "loss": 0.8632, "step": 407 }, { "epoch": 0.24163458691145986, "grad_norm": 3.08645053943537, "learning_rate": 1.9866259624993246e-05, "loss": 0.9136, "step": 408 }, { "epoch": 0.2422268285460468, "grad_norm": 2.6869600181928255, "learning_rate": 1.9865215095680322e-05, "loss": 0.8791, "step": 409 }, { "epoch": 0.2428190701806337, "grad_norm": 2.772950886041293, "learning_rate": 1.986416653093967e-05, "loss": 0.9567, "step": 410 }, { "epoch": 0.2434113118152206, "grad_norm": 4.210601184028655, "learning_rate": 1.986311393120022e-05, "loss": 0.8685, "step": 411 }, { "epoch": 0.2440035534498075, "grad_norm": 9.174595783532942, "learning_rate": 1.9862057296892546e-05, "loss": 0.888, "step": 412 }, { "epoch": 0.24459579508439444, "grad_norm": 4.247915627932793, "learning_rate": 1.9860996628448866e-05, "loss": 0.8967, "step": 413 }, { "epoch": 0.24518803671898134, "grad_norm": 2.835218121637848, "learning_rate": 1.985993192630305e-05, "loss": 0.8579, "step": 414 }, { "epoch": 0.24578027835356825, "grad_norm": 2.9565335495075638, "learning_rate": 1.985886319089063e-05, "loss": 0.9049, "step": 415 }, { "epoch": 0.24637251998815518, "grad_norm": 3.778725444535079, "learning_rate": 1.9857790422648774e-05, "loss": 0.8764, "step": 416 }, { "epoch": 0.24696476162274208, "grad_norm": 3.1572313052180685, "learning_rate": 1.9856713622016305e-05, "loss": 0.8345, "step": 417 }, { "epoch": 0.247557003257329, "grad_norm": 2.24801735688883, "learning_rate": 1.9855632789433695e-05, "loss": 0.8645, "step": 418 }, { "epoch": 0.2481492448919159, "grad_norm": 23.485871617876615, "learning_rate": 1.985454792534306e-05, "loss": 0.8602, "step": 419 }, { "epoch": 0.24874148652650283, "grad_norm": 6.005824269857163, "learning_rate": 1.9853459030188183e-05, "loss": 0.8688, "step": 420 }, { "epoch": 0.24933372816108973, "grad_norm": 2.1616583205115796, "learning_rate": 1.985236610441447e-05, "loss": 0.907, "step": 421 }, { "epoch": 0.24992596979567663, "grad_norm": 2.7443618096985993, "learning_rate": 1.9851269148468998e-05, "loss": 0.8735, "step": 422 }, { "epoch": 0.25051821143026354, "grad_norm": 1.9467808502303237, "learning_rate": 1.9850168162800482e-05, "loss": 0.88, "step": 423 }, { "epoch": 0.25111045306485047, "grad_norm": 2.293568091632234, "learning_rate": 1.9849063147859282e-05, "loss": 0.8251, "step": 424 }, { "epoch": 0.25170269469943735, "grad_norm": 6.45680121195122, "learning_rate": 1.9847954104097416e-05, "loss": 0.8671, "step": 425 }, { "epoch": 0.2522949363340243, "grad_norm": 2.6574165173210362, "learning_rate": 1.9846841031968545e-05, "loss": 0.8692, "step": 426 }, { "epoch": 0.2528871779686112, "grad_norm": 3.3705354945908645, "learning_rate": 1.9845723931927975e-05, "loss": 0.8723, "step": 427 }, { "epoch": 0.2534794196031981, "grad_norm": 2.039139055549686, "learning_rate": 1.9844602804432667e-05, "loss": 0.8749, "step": 428 }, { "epoch": 0.254071661237785, "grad_norm": 2.3388722575578482, "learning_rate": 1.9843477649941223e-05, "loss": 0.8881, "step": 429 }, { "epoch": 0.25466390287237195, "grad_norm": 2.3766640178995257, "learning_rate": 1.9842348468913895e-05, "loss": 0.8715, "step": 430 }, { "epoch": 0.2552561445069588, "grad_norm": 2.114898403529378, "learning_rate": 1.9841215261812578e-05, "loss": 0.8633, "step": 431 }, { "epoch": 0.25584838614154576, "grad_norm": 3.219697021874039, "learning_rate": 1.9840078029100826e-05, "loss": 0.8636, "step": 432 }, { "epoch": 0.25644062777613263, "grad_norm": 2.73094050276702, "learning_rate": 1.9838936771243823e-05, "loss": 0.8747, "step": 433 }, { "epoch": 0.25703286941071957, "grad_norm": 3.2677036829236252, "learning_rate": 1.983779148870841e-05, "loss": 0.8409, "step": 434 }, { "epoch": 0.2576251110453065, "grad_norm": 3.691432854617829, "learning_rate": 1.9836642181963074e-05, "loss": 0.9213, "step": 435 }, { "epoch": 0.2582173526798934, "grad_norm": 1.913831109953419, "learning_rate": 1.9835488851477943e-05, "loss": 0.878, "step": 436 }, { "epoch": 0.2588095943144803, "grad_norm": 1.362390320852231, "learning_rate": 1.9834331497724795e-05, "loss": 0.8654, "step": 437 }, { "epoch": 0.25940183594906724, "grad_norm": 4.080901803953802, "learning_rate": 1.983317012117705e-05, "loss": 0.8479, "step": 438 }, { "epoch": 0.2599940775836541, "grad_norm": 2.235992282079437, "learning_rate": 1.983200472230979e-05, "loss": 0.8802, "step": 439 }, { "epoch": 0.26058631921824105, "grad_norm": 2.171461179516376, "learning_rate": 1.983083530159971e-05, "loss": 0.9045, "step": 440 }, { "epoch": 0.261178560852828, "grad_norm": 2.7476770749878145, "learning_rate": 1.9829661859525176e-05, "loss": 0.9307, "step": 441 }, { "epoch": 0.26177080248741486, "grad_norm": 4.118541834493762, "learning_rate": 1.9828484396566197e-05, "loss": 0.9237, "step": 442 }, { "epoch": 0.2623630441220018, "grad_norm": 1.827944414998417, "learning_rate": 1.982730291320442e-05, "loss": 0.8942, "step": 443 }, { "epoch": 0.26295528575658866, "grad_norm": 1.8056252338200751, "learning_rate": 1.982611740992313e-05, "loss": 0.8737, "step": 444 }, { "epoch": 0.2635475273911756, "grad_norm": 2.1706398680744328, "learning_rate": 1.982492788720727e-05, "loss": 0.8847, "step": 445 }, { "epoch": 0.2641397690257625, "grad_norm": 2.0469770330929933, "learning_rate": 1.9823734345543422e-05, "loss": 0.8588, "step": 446 }, { "epoch": 0.2647320106603494, "grad_norm": 1.86522114555931, "learning_rate": 1.9822536785419815e-05, "loss": 0.8934, "step": 447 }, { "epoch": 0.26532425229493634, "grad_norm": 1.5844917473664697, "learning_rate": 1.982133520732631e-05, "loss": 0.8978, "step": 448 }, { "epoch": 0.26591649392952327, "grad_norm": 2.8109169891194394, "learning_rate": 1.9820129611754428e-05, "loss": 0.889, "step": 449 }, { "epoch": 0.26650873556411014, "grad_norm": 2.176184810578095, "learning_rate": 1.981891999919732e-05, "loss": 0.8648, "step": 450 }, { "epoch": 0.2671009771986971, "grad_norm": 2.343556853907917, "learning_rate": 1.981770637014979e-05, "loss": 0.8753, "step": 451 }, { "epoch": 0.26769321883328395, "grad_norm": 2.961712486256478, "learning_rate": 1.981648872510828e-05, "loss": 0.8267, "step": 452 }, { "epoch": 0.2682854604678709, "grad_norm": 1.4529926233743575, "learning_rate": 1.981526706457087e-05, "loss": 0.8387, "step": 453 }, { "epoch": 0.2688777021024578, "grad_norm": 1.9505755733893613, "learning_rate": 1.9814041389037292e-05, "loss": 0.8853, "step": 454 }, { "epoch": 0.2694699437370447, "grad_norm": 2.200797769974991, "learning_rate": 1.981281169900892e-05, "loss": 0.9057, "step": 455 }, { "epoch": 0.2700621853716316, "grad_norm": 2.1870312653366137, "learning_rate": 1.9811577994988755e-05, "loss": 0.872, "step": 456 }, { "epoch": 0.27065442700621856, "grad_norm": 1.2993955402450537, "learning_rate": 1.9810340277481463e-05, "loss": 0.8613, "step": 457 }, { "epoch": 0.27124666864080543, "grad_norm": 2.425202483291421, "learning_rate": 1.9809098546993333e-05, "loss": 0.9174, "step": 458 }, { "epoch": 0.27183891027539236, "grad_norm": 1.83353264512604, "learning_rate": 1.9807852804032306e-05, "loss": 0.8911, "step": 459 }, { "epoch": 0.2724311519099793, "grad_norm": 1.8314326605476354, "learning_rate": 1.980660304910796e-05, "loss": 0.9341, "step": 460 }, { "epoch": 0.2730233935445662, "grad_norm": 1.6973761426278493, "learning_rate": 1.9805349282731513e-05, "loss": 0.9077, "step": 461 }, { "epoch": 0.2736156351791531, "grad_norm": 1.695076266085342, "learning_rate": 1.9804091505415833e-05, "loss": 0.8425, "step": 462 }, { "epoch": 0.27420787681374, "grad_norm": 1.6422303345636073, "learning_rate": 1.9802829717675413e-05, "loss": 0.8885, "step": 463 }, { "epoch": 0.2748001184483269, "grad_norm": 2.8627350079091456, "learning_rate": 1.98015639200264e-05, "loss": 0.8507, "step": 464 }, { "epoch": 0.27539236008291385, "grad_norm": 1.3298065268134922, "learning_rate": 1.980029411298657e-05, "loss": 0.8834, "step": 465 }, { "epoch": 0.2759846017175007, "grad_norm": 1.3923498378130503, "learning_rate": 1.979902029707536e-05, "loss": 0.8122, "step": 466 }, { "epoch": 0.27657684335208765, "grad_norm": 2.558317417775508, "learning_rate": 1.9797742472813815e-05, "loss": 0.9, "step": 467 }, { "epoch": 0.2771690849866746, "grad_norm": 2.184421600404926, "learning_rate": 1.9796460640724646e-05, "loss": 0.8803, "step": 468 }, { "epoch": 0.27776132662126146, "grad_norm": 1.5965091803938036, "learning_rate": 1.9795174801332195e-05, "loss": 0.8966, "step": 469 }, { "epoch": 0.2783535682558484, "grad_norm": 1.6260839104501157, "learning_rate": 1.9793884955162442e-05, "loss": 0.867, "step": 470 }, { "epoch": 0.27894580989043527, "grad_norm": 1.7420396047568658, "learning_rate": 1.9792591102743006e-05, "loss": 0.8878, "step": 471 }, { "epoch": 0.2795380515250222, "grad_norm": 1.306769898643035, "learning_rate": 1.979129324460314e-05, "loss": 0.8346, "step": 472 }, { "epoch": 0.28013029315960913, "grad_norm": 2.108370805335531, "learning_rate": 1.978999138127375e-05, "loss": 0.8706, "step": 473 }, { "epoch": 0.280722534794196, "grad_norm": 4.654961995545459, "learning_rate": 1.9788685513287368e-05, "loss": 0.8544, "step": 474 }, { "epoch": 0.28131477642878294, "grad_norm": 5.160042089413392, "learning_rate": 1.9787375641178162e-05, "loss": 0.8469, "step": 475 }, { "epoch": 0.2819070180633699, "grad_norm": 2.0855599172877337, "learning_rate": 1.9786061765481954e-05, "loss": 0.9022, "step": 476 }, { "epoch": 0.28249925969795675, "grad_norm": 1.621103726662229, "learning_rate": 1.9784743886736185e-05, "loss": 0.8852, "step": 477 }, { "epoch": 0.2830915013325437, "grad_norm": 1.959405094759062, "learning_rate": 1.9783422005479942e-05, "loss": 0.8392, "step": 478 }, { "epoch": 0.2836837429671306, "grad_norm": 2.3454164384529523, "learning_rate": 1.978209612225395e-05, "loss": 0.8649, "step": 479 }, { "epoch": 0.2842759846017175, "grad_norm": 2.3325577065069933, "learning_rate": 1.9780766237600574e-05, "loss": 0.8854, "step": 480 }, { "epoch": 0.2848682262363044, "grad_norm": 2.937131824507951, "learning_rate": 1.9779432352063806e-05, "loss": 0.9174, "step": 481 }, { "epoch": 0.2854604678708913, "grad_norm": 2.1825918189828744, "learning_rate": 1.977809446618928e-05, "loss": 0.8332, "step": 482 }, { "epoch": 0.28605270950547823, "grad_norm": 2.6555766398144494, "learning_rate": 1.9776752580524268e-05, "loss": 0.8701, "step": 483 }, { "epoch": 0.28664495114006516, "grad_norm": 2.1721573688476927, "learning_rate": 1.9775406695617677e-05, "loss": 0.9111, "step": 484 }, { "epoch": 0.28723719277465204, "grad_norm": 2.605674968341765, "learning_rate": 1.977405681202005e-05, "loss": 0.9035, "step": 485 }, { "epoch": 0.28782943440923897, "grad_norm": 1.8150442904058317, "learning_rate": 1.977270293028357e-05, "loss": 0.8809, "step": 486 }, { "epoch": 0.2884216760438259, "grad_norm": 2.126220077493787, "learning_rate": 1.977134505096204e-05, "loss": 0.8752, "step": 487 }, { "epoch": 0.2890139176784128, "grad_norm": 3.4772636452956593, "learning_rate": 1.9769983174610918e-05, "loss": 0.8583, "step": 488 }, { "epoch": 0.2896061593129997, "grad_norm": 2.44750945614034, "learning_rate": 1.9768617301787284e-05, "loss": 0.9164, "step": 489 }, { "epoch": 0.2901984009475866, "grad_norm": 2.0538731894956586, "learning_rate": 1.9767247433049858e-05, "loss": 0.8891, "step": 490 }, { "epoch": 0.2907906425821735, "grad_norm": 2.315988950710287, "learning_rate": 1.9765873568958996e-05, "loss": 0.868, "step": 491 }, { "epoch": 0.29138288421676045, "grad_norm": 2.6759070473500586, "learning_rate": 1.9764495710076678e-05, "loss": 0.8976, "step": 492 }, { "epoch": 0.29197512585134733, "grad_norm": 2.8515729411143624, "learning_rate": 1.9763113856966532e-05, "loss": 0.8445, "step": 493 }, { "epoch": 0.29256736748593426, "grad_norm": 5.706902531350699, "learning_rate": 1.9761728010193812e-05, "loss": 0.8302, "step": 494 }, { "epoch": 0.2931596091205212, "grad_norm": 2.3398897793110813, "learning_rate": 1.9760338170325405e-05, "loss": 0.8355, "step": 495 }, { "epoch": 0.29375185075510807, "grad_norm": 3.290716851896505, "learning_rate": 1.975894433792984e-05, "loss": 0.9306, "step": 496 }, { "epoch": 0.294344092389695, "grad_norm": 2.140513045045442, "learning_rate": 1.975754651357727e-05, "loss": 0.8518, "step": 497 }, { "epoch": 0.29493633402428193, "grad_norm": 2.1673411170485193, "learning_rate": 1.9756144697839477e-05, "loss": 0.8967, "step": 498 }, { "epoch": 0.2955285756588688, "grad_norm": 3.750627708947347, "learning_rate": 1.975473889128989e-05, "loss": 0.8229, "step": 499 }, { "epoch": 0.29612081729345574, "grad_norm": 2.3819533335097303, "learning_rate": 1.9753329094503563e-05, "loss": 0.8417, "step": 500 }, { "epoch": 0.2967130589280426, "grad_norm": 2.1932086970836924, "learning_rate": 1.9751915308057176e-05, "loss": 0.8552, "step": 501 }, { "epoch": 0.29730530056262955, "grad_norm": 3.717240525518049, "learning_rate": 1.9750497532529053e-05, "loss": 0.8384, "step": 502 }, { "epoch": 0.2978975421972165, "grad_norm": 3.828126525639594, "learning_rate": 1.9749075768499148e-05, "loss": 0.8433, "step": 503 }, { "epoch": 0.29848978383180336, "grad_norm": 2.6956236004584277, "learning_rate": 1.974765001654903e-05, "loss": 0.8441, "step": 504 }, { "epoch": 0.2990820254663903, "grad_norm": 3.676685156782072, "learning_rate": 1.974622027726192e-05, "loss": 0.8943, "step": 505 }, { "epoch": 0.2996742671009772, "grad_norm": 2.590931432467633, "learning_rate": 1.9744786551222658e-05, "loss": 0.8639, "step": 506 }, { "epoch": 0.3002665087355641, "grad_norm": 3.9335673574606935, "learning_rate": 1.9743348839017728e-05, "loss": 0.8656, "step": 507 }, { "epoch": 0.30085875037015103, "grad_norm": 2.2776114718789904, "learning_rate": 1.974190714123522e-05, "loss": 0.8451, "step": 508 }, { "epoch": 0.3014509920047379, "grad_norm": 4.638920612322161, "learning_rate": 1.974046145846488e-05, "loss": 0.8153, "step": 509 }, { "epoch": 0.30204323363932484, "grad_norm": 6.980939867299674, "learning_rate": 1.9739011791298073e-05, "loss": 0.8504, "step": 510 }, { "epoch": 0.30263547527391177, "grad_norm": 4.408663359040869, "learning_rate": 1.973755814032779e-05, "loss": 0.8419, "step": 511 }, { "epoch": 0.30322771690849865, "grad_norm": 3.0168512612679113, "learning_rate": 1.9736100506148657e-05, "loss": 0.8906, "step": 512 }, { "epoch": 0.3038199585430856, "grad_norm": 3.541981364911518, "learning_rate": 1.973463888935693e-05, "loss": 0.8498, "step": 513 }, { "epoch": 0.3044122001776725, "grad_norm": 2.628539633705203, "learning_rate": 1.9733173290550494e-05, "loss": 0.8722, "step": 514 }, { "epoch": 0.3050044418122594, "grad_norm": 8.965610022562394, "learning_rate": 1.973170371032886e-05, "loss": 0.888, "step": 515 }, { "epoch": 0.3055966834468463, "grad_norm": 3.339091960381498, "learning_rate": 1.9730230149293167e-05, "loss": 0.8096, "step": 516 }, { "epoch": 0.30618892508143325, "grad_norm": 2.91895416655151, "learning_rate": 1.9728752608046184e-05, "loss": 0.8708, "step": 517 }, { "epoch": 0.3067811667160201, "grad_norm": 2.8857357994218216, "learning_rate": 1.9727271087192312e-05, "loss": 0.8614, "step": 518 }, { "epoch": 0.30737340835060706, "grad_norm": 3.8897082719903815, "learning_rate": 1.9725785587337574e-05, "loss": 0.8364, "step": 519 }, { "epoch": 0.30796564998519393, "grad_norm": 3.0809821310404657, "learning_rate": 1.9724296109089623e-05, "loss": 0.8634, "step": 520 }, { "epoch": 0.30855789161978087, "grad_norm": 4.31589497235874, "learning_rate": 1.972280265305774e-05, "loss": 0.8938, "step": 521 }, { "epoch": 0.3091501332543678, "grad_norm": 3.4659891853357174, "learning_rate": 1.9721305219852833e-05, "loss": 0.8612, "step": 522 }, { "epoch": 0.3097423748889547, "grad_norm": 3.1684707972088106, "learning_rate": 1.9719803810087436e-05, "loss": 0.8838, "step": 523 }, { "epoch": 0.3103346165235416, "grad_norm": 5.701615286531985, "learning_rate": 1.971829842437571e-05, "loss": 0.8652, "step": 524 }, { "epoch": 0.31092685815812854, "grad_norm": 3.4465521972170987, "learning_rate": 1.971678906333344e-05, "loss": 0.8845, "step": 525 }, { "epoch": 0.3115190997927154, "grad_norm": 1.6217244261703954, "learning_rate": 1.971527572757804e-05, "loss": 0.8227, "step": 526 }, { "epoch": 0.31211134142730235, "grad_norm": 2.3507099292364897, "learning_rate": 1.9713758417728555e-05, "loss": 0.863, "step": 527 }, { "epoch": 0.3127035830618892, "grad_norm": 2.281502522601501, "learning_rate": 1.971223713440564e-05, "loss": 0.8567, "step": 528 }, { "epoch": 0.31329582469647616, "grad_norm": 3.1146189997548976, "learning_rate": 1.97107118782316e-05, "loss": 0.8631, "step": 529 }, { "epoch": 0.3138880663310631, "grad_norm": 1.9410780864006059, "learning_rate": 1.970918264983034e-05, "loss": 0.8465, "step": 530 }, { "epoch": 0.31448030796564996, "grad_norm": 3.224337429475279, "learning_rate": 1.97076494498274e-05, "loss": 0.846, "step": 531 }, { "epoch": 0.3150725496002369, "grad_norm": 6.086192190115328, "learning_rate": 1.970611227884995e-05, "loss": 0.8496, "step": 532 }, { "epoch": 0.3156647912348238, "grad_norm": 2.644702902924804, "learning_rate": 1.9704571137526775e-05, "loss": 0.7961, "step": 533 }, { "epoch": 0.3162570328694107, "grad_norm": 5.673305681652432, "learning_rate": 1.9703026026488288e-05, "loss": 0.8546, "step": 534 }, { "epoch": 0.31684927450399764, "grad_norm": 2.5921802921198767, "learning_rate": 1.9701476946366533e-05, "loss": 0.8838, "step": 535 }, { "epoch": 0.31744151613858457, "grad_norm": 2.008362256981687, "learning_rate": 1.9699923897795165e-05, "loss": 0.9067, "step": 536 }, { "epoch": 0.31803375777317144, "grad_norm": 3.335473379056157, "learning_rate": 1.969836688140947e-05, "loss": 0.8315, "step": 537 }, { "epoch": 0.3186259994077584, "grad_norm": 1.6713798386358392, "learning_rate": 1.9696805897846353e-05, "loss": 0.8996, "step": 538 }, { "epoch": 0.31921824104234525, "grad_norm": 1.6247895343380683, "learning_rate": 1.9695240947744345e-05, "loss": 0.9091, "step": 539 }, { "epoch": 0.3198104826769322, "grad_norm": 2.0514048927331685, "learning_rate": 1.9693672031743604e-05, "loss": 0.8807, "step": 540 }, { "epoch": 0.3204027243115191, "grad_norm": 3.2346881141878923, "learning_rate": 1.9692099150485897e-05, "loss": 0.8744, "step": 541 }, { "epoch": 0.320994965946106, "grad_norm": 1.993896753955855, "learning_rate": 1.9690522304614624e-05, "loss": 0.8935, "step": 542 }, { "epoch": 0.3215872075806929, "grad_norm": 1.6718977284924557, "learning_rate": 1.9688941494774807e-05, "loss": 0.871, "step": 543 }, { "epoch": 0.32217944921527986, "grad_norm": 1.635671027032527, "learning_rate": 1.9687356721613084e-05, "loss": 0.8915, "step": 544 }, { "epoch": 0.32277169084986673, "grad_norm": 1.873327797284262, "learning_rate": 1.968576798577771e-05, "loss": 0.8994, "step": 545 }, { "epoch": 0.32336393248445366, "grad_norm": 1.2751536161018124, "learning_rate": 1.9684175287918576e-05, "loss": 0.8471, "step": 546 }, { "epoch": 0.32395617411904054, "grad_norm": 2.7974266868919426, "learning_rate": 1.9682578628687183e-05, "loss": 0.8911, "step": 547 }, { "epoch": 0.3245484157536275, "grad_norm": 1.6428003910383486, "learning_rate": 1.968097800873665e-05, "loss": 0.877, "step": 548 }, { "epoch": 0.3251406573882144, "grad_norm": 5.184321377796686, "learning_rate": 1.9679373428721728e-05, "loss": 0.8149, "step": 549 }, { "epoch": 0.3257328990228013, "grad_norm": 3.034456419245674, "learning_rate": 1.9677764889298775e-05, "loss": 0.9059, "step": 550 }, { "epoch": 0.3263251406573882, "grad_norm": 1.824534137714233, "learning_rate": 1.9676152391125773e-05, "loss": 0.8729, "step": 551 }, { "epoch": 0.32691738229197514, "grad_norm": 2.2247918235652526, "learning_rate": 1.9674535934862327e-05, "loss": 0.899, "step": 552 }, { "epoch": 0.327509623926562, "grad_norm": 2.0674047220831655, "learning_rate": 1.967291552116966e-05, "loss": 0.8418, "step": 553 }, { "epoch": 0.32810186556114895, "grad_norm": 1.8961950219733992, "learning_rate": 1.967129115071061e-05, "loss": 0.8938, "step": 554 }, { "epoch": 0.3286941071957359, "grad_norm": 1.5716138089131724, "learning_rate": 1.9669662824149632e-05, "loss": 0.8675, "step": 555 }, { "epoch": 0.32928634883032276, "grad_norm": 2.505642914698843, "learning_rate": 1.966803054215281e-05, "loss": 0.8575, "step": 556 }, { "epoch": 0.3298785904649097, "grad_norm": 3.1345316813906745, "learning_rate": 1.966639430538784e-05, "loss": 0.8786, "step": 557 }, { "epoch": 0.33047083209949657, "grad_norm": 2.56591314502249, "learning_rate": 1.966475411452403e-05, "loss": 0.8637, "step": 558 }, { "epoch": 0.3310630737340835, "grad_norm": 2.7664296698481032, "learning_rate": 1.966310997023231e-05, "loss": 0.9181, "step": 559 }, { "epoch": 0.33165531536867043, "grad_norm": 1.5231814543200226, "learning_rate": 1.966146187318523e-05, "loss": 0.8748, "step": 560 }, { "epoch": 0.3322475570032573, "grad_norm": 4.815465527837311, "learning_rate": 1.9659809824056954e-05, "loss": 0.8408, "step": 561 }, { "epoch": 0.33283979863784424, "grad_norm": 1.6014447979733908, "learning_rate": 1.9658153823523262e-05, "loss": 0.8672, "step": 562 }, { "epoch": 0.3334320402724312, "grad_norm": 2.2744955163709006, "learning_rate": 1.9656493872261554e-05, "loss": 0.9016, "step": 563 }, { "epoch": 0.33402428190701805, "grad_norm": 1.781058566595245, "learning_rate": 1.9654829970950838e-05, "loss": 0.8764, "step": 564 }, { "epoch": 0.334616523541605, "grad_norm": 1.7582008304848717, "learning_rate": 1.9653162120271746e-05, "loss": 0.877, "step": 565 }, { "epoch": 0.3352087651761919, "grad_norm": 1.9319354513096203, "learning_rate": 1.965149032090653e-05, "loss": 0.8599, "step": 566 }, { "epoch": 0.3358010068107788, "grad_norm": 1.6123157537046895, "learning_rate": 1.9649814573539037e-05, "loss": 0.8374, "step": 567 }, { "epoch": 0.3363932484453657, "grad_norm": 2.605187020230856, "learning_rate": 1.9648134878854747e-05, "loss": 0.8699, "step": 568 }, { "epoch": 0.3369854900799526, "grad_norm": 2.4840368350540016, "learning_rate": 1.9646451237540756e-05, "loss": 0.8727, "step": 569 }, { "epoch": 0.33757773171453953, "grad_norm": 3.6970475679434465, "learning_rate": 1.9644763650285758e-05, "loss": 0.8354, "step": 570 }, { "epoch": 0.33816997334912646, "grad_norm": 1.804745820267029, "learning_rate": 1.964307211778008e-05, "loss": 0.8231, "step": 571 }, { "epoch": 0.33876221498371334, "grad_norm": 1.7814414820272138, "learning_rate": 1.9641376640715646e-05, "loss": 0.8829, "step": 572 }, { "epoch": 0.33935445661830027, "grad_norm": 2.246104894860363, "learning_rate": 1.963967721978601e-05, "loss": 0.8279, "step": 573 }, { "epoch": 0.3399466982528872, "grad_norm": 1.5276776993277508, "learning_rate": 1.963797385568632e-05, "loss": 0.8421, "step": 574 }, { "epoch": 0.3405389398874741, "grad_norm": 3.984819253438696, "learning_rate": 1.9636266549113358e-05, "loss": 0.8715, "step": 575 }, { "epoch": 0.341131181522061, "grad_norm": 1.6707727313470635, "learning_rate": 1.96345553007655e-05, "loss": 0.868, "step": 576 }, { "epoch": 0.3417234231566479, "grad_norm": 1.8434045564594856, "learning_rate": 1.9632840111342747e-05, "loss": 0.8946, "step": 577 }, { "epoch": 0.3423156647912348, "grad_norm": 2.5166196457952386, "learning_rate": 1.9631120981546713e-05, "loss": 0.8169, "step": 578 }, { "epoch": 0.34290790642582175, "grad_norm": 1.7256723725893435, "learning_rate": 1.962939791208061e-05, "loss": 0.8983, "step": 579 }, { "epoch": 0.3435001480604086, "grad_norm": 1.537497453982475, "learning_rate": 1.9627670903649273e-05, "loss": 0.8535, "step": 580 }, { "epoch": 0.34409238969499556, "grad_norm": 2.0982710201843346, "learning_rate": 1.9625939956959146e-05, "loss": 0.8352, "step": 581 }, { "epoch": 0.3446846313295825, "grad_norm": 1.6615651140303191, "learning_rate": 1.9624205072718285e-05, "loss": 0.8424, "step": 582 }, { "epoch": 0.34527687296416937, "grad_norm": 2.2732967556610144, "learning_rate": 1.9622466251636352e-05, "loss": 0.8548, "step": 583 }, { "epoch": 0.3458691145987563, "grad_norm": 1.9163123664346449, "learning_rate": 1.9620723494424627e-05, "loss": 0.8903, "step": 584 }, { "epoch": 0.34646135623334323, "grad_norm": 1.8941032464371603, "learning_rate": 1.961897680179599e-05, "loss": 0.856, "step": 585 }, { "epoch": 0.3470535978679301, "grad_norm": 1.851962175877676, "learning_rate": 1.9617226174464945e-05, "loss": 0.8798, "step": 586 }, { "epoch": 0.34764583950251704, "grad_norm": 1.4058738019152999, "learning_rate": 1.961547161314759e-05, "loss": 0.862, "step": 587 }, { "epoch": 0.3482380811371039, "grad_norm": 1.9819045004335991, "learning_rate": 1.9613713118561638e-05, "loss": 0.8779, "step": 588 }, { "epoch": 0.34883032277169085, "grad_norm": 1.6060471271059031, "learning_rate": 1.961195069142642e-05, "loss": 0.843, "step": 589 }, { "epoch": 0.3494225644062778, "grad_norm": 1.5640055985461452, "learning_rate": 1.961018433246286e-05, "loss": 0.8379, "step": 590 }, { "epoch": 0.35001480604086466, "grad_norm": 3.0209204466667905, "learning_rate": 1.9608414042393503e-05, "loss": 0.825, "step": 591 }, { "epoch": 0.3506070476754516, "grad_norm": 1.6509809673195002, "learning_rate": 1.9606639821942496e-05, "loss": 0.858, "step": 592 }, { "epoch": 0.3511992893100385, "grad_norm": 1.8338700183391534, "learning_rate": 1.9604861671835593e-05, "loss": 0.9081, "step": 593 }, { "epoch": 0.3517915309446254, "grad_norm": 1.6832166078423174, "learning_rate": 1.9603079592800157e-05, "loss": 0.8525, "step": 594 }, { "epoch": 0.35238377257921233, "grad_norm": 3.6998428943163715, "learning_rate": 1.960129358556516e-05, "loss": 0.9052, "step": 595 }, { "epoch": 0.3529760142137992, "grad_norm": 1.8174759599966348, "learning_rate": 1.9599503650861183e-05, "loss": 0.8153, "step": 596 }, { "epoch": 0.35356825584838614, "grad_norm": 3.5284496969394277, "learning_rate": 1.9597709789420404e-05, "loss": 0.8488, "step": 597 }, { "epoch": 0.35416049748297307, "grad_norm": 8.653779192107086, "learning_rate": 1.959591200197662e-05, "loss": 0.8625, "step": 598 }, { "epoch": 0.35475273911755995, "grad_norm": 1.8759995754403318, "learning_rate": 1.9594110289265218e-05, "loss": 0.8618, "step": 599 }, { "epoch": 0.3553449807521469, "grad_norm": 1.9485225987035864, "learning_rate": 1.9592304652023208e-05, "loss": 0.8226, "step": 600 }, { "epoch": 0.3559372223867338, "grad_norm": 3.686184834700953, "learning_rate": 1.959049509098919e-05, "loss": 0.8781, "step": 601 }, { "epoch": 0.3565294640213207, "grad_norm": 2.8230171313225783, "learning_rate": 1.9588681606903385e-05, "loss": 0.8445, "step": 602 }, { "epoch": 0.3571217056559076, "grad_norm": 2.7255745530386193, "learning_rate": 1.95868642005076e-05, "loss": 0.8965, "step": 603 }, { "epoch": 0.35771394729049455, "grad_norm": 3.1116205729926385, "learning_rate": 1.9585042872545266e-05, "loss": 0.8434, "step": 604 }, { "epoch": 0.3583061889250814, "grad_norm": 3.8126207167526154, "learning_rate": 1.9583217623761404e-05, "loss": 0.8651, "step": 605 }, { "epoch": 0.35889843055966836, "grad_norm": 1.658269096943048, "learning_rate": 1.958138845490264e-05, "loss": 0.8434, "step": 606 }, { "epoch": 0.35949067219425523, "grad_norm": 2.040025083348231, "learning_rate": 1.9579555366717214e-05, "loss": 0.8795, "step": 607 }, { "epoch": 0.36008291382884217, "grad_norm": 4.804650864705654, "learning_rate": 1.9577718359954955e-05, "loss": 0.8943, "step": 608 }, { "epoch": 0.3606751554634291, "grad_norm": 1.7709023659733834, "learning_rate": 1.957587743536731e-05, "loss": 0.8878, "step": 609 }, { "epoch": 0.361267397098016, "grad_norm": 2.25150632588813, "learning_rate": 1.9574032593707314e-05, "loss": 0.8498, "step": 610 }, { "epoch": 0.3618596387326029, "grad_norm": 7.9605607465838695, "learning_rate": 1.9572183835729613e-05, "loss": 0.9201, "step": 611 }, { "epoch": 0.36245188036718984, "grad_norm": 2.5393697091836627, "learning_rate": 1.957033116219045e-05, "loss": 0.8733, "step": 612 }, { "epoch": 0.3630441220017767, "grad_norm": 1.6878302312671156, "learning_rate": 1.956847457384768e-05, "loss": 0.8936, "step": 613 }, { "epoch": 0.36363636363636365, "grad_norm": 1.868290304790888, "learning_rate": 1.956661407146075e-05, "loss": 0.8469, "step": 614 }, { "epoch": 0.3642286052709505, "grad_norm": 1.7524149989576479, "learning_rate": 1.9564749655790706e-05, "loss": 0.8834, "step": 615 }, { "epoch": 0.36482084690553745, "grad_norm": 3.268017562205005, "learning_rate": 1.9562881327600197e-05, "loss": 0.8549, "step": 616 }, { "epoch": 0.3654130885401244, "grad_norm": 1.976076053518677, "learning_rate": 1.9561009087653483e-05, "loss": 0.8568, "step": 617 }, { "epoch": 0.36600533017471126, "grad_norm": 2.637521424974901, "learning_rate": 1.955913293671641e-05, "loss": 0.8918, "step": 618 }, { "epoch": 0.3665975718092982, "grad_norm": 3.314444680794765, "learning_rate": 1.9557252875556428e-05, "loss": 0.8414, "step": 619 }, { "epoch": 0.3671898134438851, "grad_norm": 2.3746791597113766, "learning_rate": 1.9555368904942593e-05, "loss": 0.8457, "step": 620 }, { "epoch": 0.367782055078472, "grad_norm": 2.6011871499667523, "learning_rate": 1.9553481025645545e-05, "loss": 0.8873, "step": 621 }, { "epoch": 0.36837429671305894, "grad_norm": 6.819452995677599, "learning_rate": 1.9551589238437546e-05, "loss": 0.8907, "step": 622 }, { "epoch": 0.36896653834764587, "grad_norm": 2.048515035382417, "learning_rate": 1.954969354409243e-05, "loss": 0.8392, "step": 623 }, { "epoch": 0.36955877998223274, "grad_norm": 2.3431036995490855, "learning_rate": 1.954779394338566e-05, "loss": 0.8916, "step": 624 }, { "epoch": 0.3701510216168197, "grad_norm": 3.0084271040530073, "learning_rate": 1.954589043709426e-05, "loss": 0.904, "step": 625 }, { "epoch": 0.37074326325140655, "grad_norm": 1.5517306350499644, "learning_rate": 1.954398302599688e-05, "loss": 0.8716, "step": 626 }, { "epoch": 0.3713355048859935, "grad_norm": 4.467132411617738, "learning_rate": 1.954207171087376e-05, "loss": 0.8593, "step": 627 }, { "epoch": 0.3719277465205804, "grad_norm": 2.292921810220871, "learning_rate": 1.9540156492506734e-05, "loss": 0.8766, "step": 628 }, { "epoch": 0.3725199881551673, "grad_norm": 1.9559474188452068, "learning_rate": 1.9538237371679233e-05, "loss": 0.8705, "step": 629 }, { "epoch": 0.3731122297897542, "grad_norm": 1.6930579045607708, "learning_rate": 1.9536314349176288e-05, "loss": 0.8455, "step": 630 }, { "epoch": 0.37370447142434116, "grad_norm": 5.371918547587357, "learning_rate": 1.9534387425784518e-05, "loss": 0.8998, "step": 631 }, { "epoch": 0.37429671305892803, "grad_norm": 1.7900417336112107, "learning_rate": 1.9532456602292148e-05, "loss": 0.8481, "step": 632 }, { "epoch": 0.37488895469351496, "grad_norm": 2.2958736067272416, "learning_rate": 1.9530521879488993e-05, "loss": 0.9105, "step": 633 }, { "epoch": 0.37548119632810184, "grad_norm": 2.638086764469702, "learning_rate": 1.952858325816646e-05, "loss": 0.8381, "step": 634 }, { "epoch": 0.3760734379626888, "grad_norm": 2.439133220470199, "learning_rate": 1.9526640739117555e-05, "loss": 0.8751, "step": 635 }, { "epoch": 0.3766656795972757, "grad_norm": 1.8675503716373216, "learning_rate": 1.9524694323136883e-05, "loss": 0.883, "step": 636 }, { "epoch": 0.3772579212318626, "grad_norm": 4.742878698850084, "learning_rate": 1.952274401102063e-05, "loss": 0.888, "step": 637 }, { "epoch": 0.3778501628664495, "grad_norm": 2.420961821182289, "learning_rate": 1.952078980356659e-05, "loss": 0.8812, "step": 638 }, { "epoch": 0.37844240450103644, "grad_norm": 2.237864761118068, "learning_rate": 1.9518831701574136e-05, "loss": 0.8906, "step": 639 }, { "epoch": 0.3790346461356233, "grad_norm": 3.5814294479841915, "learning_rate": 1.951686970584425e-05, "loss": 0.8762, "step": 640 }, { "epoch": 0.37962688777021025, "grad_norm": 3.322882097775164, "learning_rate": 1.951490381717949e-05, "loss": 0.8855, "step": 641 }, { "epoch": 0.3802191294047972, "grad_norm": 2.2781530112916104, "learning_rate": 1.9512934036384026e-05, "loss": 0.8861, "step": 642 }, { "epoch": 0.38081137103938406, "grad_norm": 3.1293049656485166, "learning_rate": 1.95109603642636e-05, "loss": 0.8684, "step": 643 }, { "epoch": 0.381403612673971, "grad_norm": 2.2366175666301626, "learning_rate": 1.9508982801625557e-05, "loss": 0.8209, "step": 644 }, { "epoch": 0.38199585430855787, "grad_norm": 2.0870198727029776, "learning_rate": 1.9507001349278834e-05, "loss": 0.8726, "step": 645 }, { "epoch": 0.3825880959431448, "grad_norm": 5.789578141166538, "learning_rate": 1.9505016008033953e-05, "loss": 0.8065, "step": 646 }, { "epoch": 0.38318033757773173, "grad_norm": 3.0810399663606183, "learning_rate": 1.9503026778703034e-05, "loss": 0.8686, "step": 647 }, { "epoch": 0.3837725792123186, "grad_norm": 3.6634728348627528, "learning_rate": 1.950103366209978e-05, "loss": 0.8719, "step": 648 }, { "epoch": 0.38436482084690554, "grad_norm": 2.8736691268816874, "learning_rate": 1.949903665903949e-05, "loss": 0.8767, "step": 649 }, { "epoch": 0.3849570624814925, "grad_norm": 5.882320046408276, "learning_rate": 1.949703577033905e-05, "loss": 0.8671, "step": 650 }, { "epoch": 0.38554930411607935, "grad_norm": 2.9528476437318925, "learning_rate": 1.9495030996816932e-05, "loss": 0.9013, "step": 651 }, { "epoch": 0.3861415457506663, "grad_norm": 2.6363739674663247, "learning_rate": 1.9493022339293207e-05, "loss": 0.8833, "step": 652 }, { "epoch": 0.38673378738525316, "grad_norm": 2.17785246783246, "learning_rate": 1.949100979858953e-05, "loss": 0.8866, "step": 653 }, { "epoch": 0.3873260290198401, "grad_norm": 5.128470877775802, "learning_rate": 1.9488993375529137e-05, "loss": 0.9099, "step": 654 }, { "epoch": 0.387918270654427, "grad_norm": 2.804392512181631, "learning_rate": 1.9486973070936862e-05, "loss": 0.8498, "step": 655 }, { "epoch": 0.3885105122890139, "grad_norm": 2.9434325093740905, "learning_rate": 1.9484948885639122e-05, "loss": 0.8449, "step": 656 }, { "epoch": 0.38910275392360083, "grad_norm": 4.154023552286942, "learning_rate": 1.9482920820463923e-05, "loss": 0.8563, "step": 657 }, { "epoch": 0.38969499555818776, "grad_norm": 3.148511348448834, "learning_rate": 1.948088887624086e-05, "loss": 0.8056, "step": 658 }, { "epoch": 0.39028723719277464, "grad_norm": 3.926327591148058, "learning_rate": 1.947885305380111e-05, "loss": 0.8888, "step": 659 }, { "epoch": 0.39087947882736157, "grad_norm": 5.980099519975163, "learning_rate": 1.9476813353977442e-05, "loss": 0.8454, "step": 660 }, { "epoch": 0.3914717204619485, "grad_norm": 3.3367209292691, "learning_rate": 1.9474769777604198e-05, "loss": 0.8548, "step": 661 }, { "epoch": 0.3920639620965354, "grad_norm": 2.968203146893538, "learning_rate": 1.947272232551733e-05, "loss": 0.84, "step": 662 }, { "epoch": 0.3926562037311223, "grad_norm": 39.05789203370318, "learning_rate": 1.9470670998554352e-05, "loss": 0.8914, "step": 663 }, { "epoch": 0.3932484453657092, "grad_norm": 3.4148463229844506, "learning_rate": 1.9468615797554374e-05, "loss": 0.8605, "step": 664 }, { "epoch": 0.3938406870002961, "grad_norm": 6.8180037854052316, "learning_rate": 1.9466556723358084e-05, "loss": 0.9065, "step": 665 }, { "epoch": 0.39443292863488305, "grad_norm": 2.1152979135979484, "learning_rate": 1.946449377680777e-05, "loss": 0.8731, "step": 666 }, { "epoch": 0.3950251702694699, "grad_norm": 2.4870435911253033, "learning_rate": 1.9462426958747285e-05, "loss": 0.8652, "step": 667 }, { "epoch": 0.39561741190405686, "grad_norm": 2.6453498019333277, "learning_rate": 1.9460356270022073e-05, "loss": 0.898, "step": 668 }, { "epoch": 0.3962096535386438, "grad_norm": 3.0508069490190066, "learning_rate": 1.945828171147917e-05, "loss": 0.9055, "step": 669 }, { "epoch": 0.39680189517323067, "grad_norm": 1.6241048949900765, "learning_rate": 1.945620328396718e-05, "loss": 0.9111, "step": 670 }, { "epoch": 0.3973941368078176, "grad_norm": 1.2381656243162942, "learning_rate": 1.9454120988336297e-05, "loss": 0.9074, "step": 671 }, { "epoch": 0.3979863784424045, "grad_norm": 1.8466695435602942, "learning_rate": 1.9452034825438302e-05, "loss": 0.9597, "step": 672 }, { "epoch": 0.3985786200769914, "grad_norm": 1.2562124363063687, "learning_rate": 1.9449944796126547e-05, "loss": 0.9371, "step": 673 }, { "epoch": 0.39917086171157834, "grad_norm": 1.7231260645325064, "learning_rate": 1.9447850901255975e-05, "loss": 0.8556, "step": 674 }, { "epoch": 0.3997631033461652, "grad_norm": 1.6192363719486302, "learning_rate": 1.9445753141683107e-05, "loss": 0.8885, "step": 675 }, { "epoch": 0.40035534498075215, "grad_norm": 1.675728205658867, "learning_rate": 1.9443651518266044e-05, "loss": 0.8692, "step": 676 }, { "epoch": 0.4009475866153391, "grad_norm": 2.04303793153201, "learning_rate": 1.9441546031864467e-05, "loss": 0.8552, "step": 677 }, { "epoch": 0.40153982824992596, "grad_norm": 1.9133034698626181, "learning_rate": 1.943943668333964e-05, "loss": 0.882, "step": 678 }, { "epoch": 0.4021320698845129, "grad_norm": 2.1370541384543342, "learning_rate": 1.9437323473554404e-05, "loss": 0.8968, "step": 679 }, { "epoch": 0.4027243115190998, "grad_norm": 1.5028317313676052, "learning_rate": 1.943520640337318e-05, "loss": 0.8582, "step": 680 }, { "epoch": 0.4033165531536867, "grad_norm": 3.9594823865775868, "learning_rate": 1.943308547366197e-05, "loss": 0.8111, "step": 681 }, { "epoch": 0.40390879478827363, "grad_norm": 3.417942000806396, "learning_rate": 1.9430960685288355e-05, "loss": 0.8385, "step": 682 }, { "epoch": 0.4045010364228605, "grad_norm": 1.6652473512379486, "learning_rate": 1.9428832039121487e-05, "loss": 0.8686, "step": 683 }, { "epoch": 0.40509327805744744, "grad_norm": 3.8546279443523663, "learning_rate": 1.942669953603211e-05, "loss": 0.8697, "step": 684 }, { "epoch": 0.40568551969203437, "grad_norm": 1.9412501113108496, "learning_rate": 1.9424563176892534e-05, "loss": 0.8235, "step": 685 }, { "epoch": 0.40627776132662125, "grad_norm": 1.8569982182714377, "learning_rate": 1.9422422962576646e-05, "loss": 0.9344, "step": 686 }, { "epoch": 0.4068700029612082, "grad_norm": 2.1750669740257824, "learning_rate": 1.9420278893959922e-05, "loss": 0.868, "step": 687 }, { "epoch": 0.4074622445957951, "grad_norm": 2.4649163202529607, "learning_rate": 1.94181309719194e-05, "loss": 0.8585, "step": 688 }, { "epoch": 0.408054486230382, "grad_norm": 2.5516191944482443, "learning_rate": 1.9415979197333704e-05, "loss": 0.866, "step": 689 }, { "epoch": 0.4086467278649689, "grad_norm": 2.140588212831023, "learning_rate": 1.941382357108303e-05, "loss": 0.8736, "step": 690 }, { "epoch": 0.4092389694995558, "grad_norm": 2.6369171422119417, "learning_rate": 1.941166409404915e-05, "loss": 0.8362, "step": 691 }, { "epoch": 0.4098312111341427, "grad_norm": 3.0607023188332074, "learning_rate": 1.9409500767115414e-05, "loss": 0.8399, "step": 692 }, { "epoch": 0.41042345276872966, "grad_norm": 3.0627901913227924, "learning_rate": 1.940733359116674e-05, "loss": 0.879, "step": 693 }, { "epoch": 0.41101569440331653, "grad_norm": 3.2361644739692292, "learning_rate": 1.9405162567089627e-05, "loss": 0.8678, "step": 694 }, { "epoch": 0.41160793603790347, "grad_norm": 18.463939120851368, "learning_rate": 1.940298769577215e-05, "loss": 0.9256, "step": 695 }, { "epoch": 0.4122001776724904, "grad_norm": 10.368145595990683, "learning_rate": 1.9400808978103948e-05, "loss": 0.8188, "step": 696 }, { "epoch": 0.4127924193070773, "grad_norm": 2.4187494879479745, "learning_rate": 1.939862641497624e-05, "loss": 0.8745, "step": 697 }, { "epoch": 0.4133846609416642, "grad_norm": 1.806561112554444, "learning_rate": 1.939644000728182e-05, "loss": 0.8622, "step": 698 }, { "epoch": 0.41397690257625114, "grad_norm": 2.376096051071182, "learning_rate": 1.9394249755915047e-05, "loss": 0.8613, "step": 699 }, { "epoch": 0.414569144210838, "grad_norm": 2.0606522501522724, "learning_rate": 1.939205566177186e-05, "loss": 0.8889, "step": 700 }, { "epoch": 0.41516138584542495, "grad_norm": 1.6424519963083875, "learning_rate": 1.9389857725749767e-05, "loss": 0.924, "step": 701 }, { "epoch": 0.4157536274800118, "grad_norm": 2.5660132236892608, "learning_rate": 1.938765594874785e-05, "loss": 0.8472, "step": 702 }, { "epoch": 0.41634586911459875, "grad_norm": 2.8325159916694598, "learning_rate": 1.9385450331666754e-05, "loss": 0.8887, "step": 703 }, { "epoch": 0.4169381107491857, "grad_norm": 1.7008840901930986, "learning_rate": 1.93832408754087e-05, "loss": 0.8748, "step": 704 }, { "epoch": 0.41753035238377256, "grad_norm": 1.9216283469656232, "learning_rate": 1.9381027580877486e-05, "loss": 0.8415, "step": 705 }, { "epoch": 0.4181225940183595, "grad_norm": 2.026996390561362, "learning_rate": 1.937881044897847e-05, "loss": 0.8416, "step": 706 }, { "epoch": 0.4187148356529464, "grad_norm": 1.423893980632455, "learning_rate": 1.9376589480618583e-05, "loss": 0.8505, "step": 707 }, { "epoch": 0.4193070772875333, "grad_norm": 1.8333091580377612, "learning_rate": 1.937436467670633e-05, "loss": 0.8715, "step": 708 }, { "epoch": 0.41989931892212023, "grad_norm": 2.2298036589587658, "learning_rate": 1.937213603815178e-05, "loss": 0.8767, "step": 709 }, { "epoch": 0.4204915605567071, "grad_norm": 3.440326466071599, "learning_rate": 1.9369903565866565e-05, "loss": 0.8322, "step": 710 }, { "epoch": 0.42108380219129404, "grad_norm": 1.6529181371554102, "learning_rate": 1.9367667260763898e-05, "loss": 0.899, "step": 711 }, { "epoch": 0.421676043825881, "grad_norm": 1.811748367956541, "learning_rate": 1.936542712375855e-05, "loss": 0.805, "step": 712 }, { "epoch": 0.42226828546046785, "grad_norm": 1.4423225369050077, "learning_rate": 1.9363183155766867e-05, "loss": 0.9193, "step": 713 }, { "epoch": 0.4228605270950548, "grad_norm": 3.711540769404225, "learning_rate": 1.9360935357706756e-05, "loss": 0.8143, "step": 714 }, { "epoch": 0.4234527687296417, "grad_norm": 1.6013652544896453, "learning_rate": 1.9358683730497695e-05, "loss": 0.8883, "step": 715 }, { "epoch": 0.4240450103642286, "grad_norm": 4.765502585221681, "learning_rate": 1.9356428275060722e-05, "loss": 0.845, "step": 716 }, { "epoch": 0.4246372519988155, "grad_norm": 3.1849530858655983, "learning_rate": 1.9354168992318448e-05, "loss": 0.9136, "step": 717 }, { "epoch": 0.42522949363340246, "grad_norm": 1.62933697491851, "learning_rate": 1.9351905883195044e-05, "loss": 0.8887, "step": 718 }, { "epoch": 0.42582173526798933, "grad_norm": 1.8945696646562518, "learning_rate": 1.9349638948616253e-05, "loss": 0.8268, "step": 719 }, { "epoch": 0.42641397690257626, "grad_norm": 2.4172838989795125, "learning_rate": 1.934736818950937e-05, "loss": 0.8683, "step": 720 }, { "epoch": 0.42700621853716314, "grad_norm": 1.884065405882965, "learning_rate": 1.9345093606803276e-05, "loss": 0.8832, "step": 721 }, { "epoch": 0.42759846017175007, "grad_norm": 1.77943938389625, "learning_rate": 1.9342815201428394e-05, "loss": 0.8842, "step": 722 }, { "epoch": 0.428190701806337, "grad_norm": 1.4357452116165423, "learning_rate": 1.9340532974316727e-05, "loss": 0.8274, "step": 723 }, { "epoch": 0.4287829434409239, "grad_norm": 1.4379221296115967, "learning_rate": 1.9338246926401828e-05, "loss": 0.8388, "step": 724 }, { "epoch": 0.4293751850755108, "grad_norm": 2.07752636716361, "learning_rate": 1.933595705861882e-05, "loss": 0.833, "step": 725 }, { "epoch": 0.42996742671009774, "grad_norm": 2.0105999576116322, "learning_rate": 1.9333663371904388e-05, "loss": 0.8473, "step": 726 }, { "epoch": 0.4305596683446846, "grad_norm": 3.956301063546753, "learning_rate": 1.933136586719678e-05, "loss": 0.8496, "step": 727 }, { "epoch": 0.43115190997927155, "grad_norm": 1.3549992353009284, "learning_rate": 1.9329064545435803e-05, "loss": 0.8747, "step": 728 }, { "epoch": 0.43174415161385843, "grad_norm": 2.227577277579618, "learning_rate": 1.932675940756283e-05, "loss": 0.8362, "step": 729 }, { "epoch": 0.43233639324844536, "grad_norm": 1.5147909801235255, "learning_rate": 1.932445045452079e-05, "loss": 0.8688, "step": 730 }, { "epoch": 0.4329286348830323, "grad_norm": 1.920697239844712, "learning_rate": 1.9322137687254175e-05, "loss": 0.8608, "step": 731 }, { "epoch": 0.43352087651761917, "grad_norm": 1.9550148078238179, "learning_rate": 1.931982110670904e-05, "loss": 0.8438, "step": 732 }, { "epoch": 0.4341131181522061, "grad_norm": 1.9357461615720977, "learning_rate": 1.9317500713832987e-05, "loss": 0.8289, "step": 733 }, { "epoch": 0.43470535978679303, "grad_norm": 2.436470304212335, "learning_rate": 1.9315176509575196e-05, "loss": 0.8505, "step": 734 }, { "epoch": 0.4352976014213799, "grad_norm": 2.0394551856453633, "learning_rate": 1.93128484948864e-05, "loss": 0.8595, "step": 735 }, { "epoch": 0.43588984305596684, "grad_norm": 16.717645135664075, "learning_rate": 1.9310516670718877e-05, "loss": 0.8358, "step": 736 }, { "epoch": 0.4364820846905538, "grad_norm": 1.9423408377644282, "learning_rate": 1.930818103802648e-05, "loss": 0.861, "step": 737 }, { "epoch": 0.43707432632514065, "grad_norm": 4.812386875770368, "learning_rate": 1.9305841597764615e-05, "loss": 0.8801, "step": 738 }, { "epoch": 0.4376665679597276, "grad_norm": 1.4672305025896835, "learning_rate": 1.9303498350890246e-05, "loss": 0.8782, "step": 739 }, { "epoch": 0.43825880959431446, "grad_norm": 2.2508593137348676, "learning_rate": 1.9301151298361887e-05, "loss": 0.877, "step": 740 }, { "epoch": 0.4388510512289014, "grad_norm": 1.7135079210199697, "learning_rate": 1.9298800441139623e-05, "loss": 0.8972, "step": 741 }, { "epoch": 0.4394432928634883, "grad_norm": 1.4490997536994275, "learning_rate": 1.9296445780185077e-05, "loss": 0.8224, "step": 742 }, { "epoch": 0.4400355344980752, "grad_norm": 1.770609855482205, "learning_rate": 1.9294087316461446e-05, "loss": 0.8803, "step": 743 }, { "epoch": 0.44062777613266213, "grad_norm": 4.232242865631687, "learning_rate": 1.929172505093347e-05, "loss": 0.8834, "step": 744 }, { "epoch": 0.44122001776724906, "grad_norm": 2.0717939954326625, "learning_rate": 1.9289358984567446e-05, "loss": 0.8896, "step": 745 }, { "epoch": 0.44181225940183594, "grad_norm": 4.304229428729412, "learning_rate": 1.928698911833123e-05, "loss": 0.8621, "step": 746 }, { "epoch": 0.44240450103642287, "grad_norm": 2.200899929703116, "learning_rate": 1.928461545319424e-05, "loss": 0.8648, "step": 747 }, { "epoch": 0.44299674267100975, "grad_norm": 1.6834184926932727, "learning_rate": 1.9282237990127425e-05, "loss": 0.8714, "step": 748 }, { "epoch": 0.4435889843055967, "grad_norm": 2.104833385776733, "learning_rate": 1.927985673010331e-05, "loss": 0.8172, "step": 749 }, { "epoch": 0.4441812259401836, "grad_norm": 3.6197381435137324, "learning_rate": 1.9277471674095952e-05, "loss": 0.8392, "step": 750 }, { "epoch": 0.4447734675747705, "grad_norm": 2.11958620090702, "learning_rate": 1.9275082823080992e-05, "loss": 0.8735, "step": 751 }, { "epoch": 0.4453657092093574, "grad_norm": 3.3384124014666874, "learning_rate": 1.927269017803559e-05, "loss": 0.9202, "step": 752 }, { "epoch": 0.44595795084394435, "grad_norm": 2.705834935266405, "learning_rate": 1.9270293739938477e-05, "loss": 0.8619, "step": 753 }, { "epoch": 0.4465501924785312, "grad_norm": 2.7094042650834527, "learning_rate": 1.9267893509769927e-05, "loss": 0.8918, "step": 754 }, { "epoch": 0.44714243411311816, "grad_norm": 1.7854900824015136, "learning_rate": 1.926548948851178e-05, "loss": 0.8564, "step": 755 }, { "epoch": 0.4477346757477051, "grad_norm": 1.6005502953829145, "learning_rate": 1.92630816771474e-05, "loss": 0.8412, "step": 756 }, { "epoch": 0.44832691738229197, "grad_norm": 7.492700831605615, "learning_rate": 1.9260670076661735e-05, "loss": 0.8718, "step": 757 }, { "epoch": 0.4489191590168789, "grad_norm": 5.427630233101737, "learning_rate": 1.925825468804125e-05, "loss": 0.8778, "step": 758 }, { "epoch": 0.4495114006514658, "grad_norm": 1.8888166698278774, "learning_rate": 1.9255835512273982e-05, "loss": 0.8596, "step": 759 }, { "epoch": 0.4501036422860527, "grad_norm": 2.1812538978009055, "learning_rate": 1.9253412550349507e-05, "loss": 0.8691, "step": 760 }, { "epoch": 0.45069588392063964, "grad_norm": 2.924997761771959, "learning_rate": 1.9250985803258957e-05, "loss": 0.8579, "step": 761 }, { "epoch": 0.4512881255552265, "grad_norm": 2.1898518842107424, "learning_rate": 1.9248555271995006e-05, "loss": 0.9125, "step": 762 }, { "epoch": 0.45188036718981345, "grad_norm": 2.1171881776125234, "learning_rate": 1.924612095755188e-05, "loss": 0.818, "step": 763 }, { "epoch": 0.4524726088244004, "grad_norm": 1.860211579546916, "learning_rate": 1.924368286092534e-05, "loss": 0.8827, "step": 764 }, { "epoch": 0.45306485045898726, "grad_norm": 1.6407946441138859, "learning_rate": 1.9241240983112718e-05, "loss": 0.8979, "step": 765 }, { "epoch": 0.4536570920935742, "grad_norm": 1.8118381222826243, "learning_rate": 1.9238795325112867e-05, "loss": 0.8969, "step": 766 }, { "epoch": 0.45424933372816106, "grad_norm": 1.5722837226227442, "learning_rate": 1.9236345887926215e-05, "loss": 0.8211, "step": 767 }, { "epoch": 0.454841575362748, "grad_norm": 1.690859845921261, "learning_rate": 1.92338926725547e-05, "loss": 0.9023, "step": 768 }, { "epoch": 0.45543381699733493, "grad_norm": 1.3414797354584787, "learning_rate": 1.9231435680001844e-05, "loss": 0.8433, "step": 769 }, { "epoch": 0.4560260586319218, "grad_norm": 2.9314823394828884, "learning_rate": 1.9228974911272682e-05, "loss": 0.8694, "step": 770 }, { "epoch": 0.45661830026650874, "grad_norm": 1.6345887387544766, "learning_rate": 1.9226510367373812e-05, "loss": 0.9751, "step": 771 }, { "epoch": 0.45721054190109567, "grad_norm": 1.4992911219830736, "learning_rate": 1.922404204931337e-05, "loss": 0.8129, "step": 772 }, { "epoch": 0.45780278353568254, "grad_norm": 1.4628930115387142, "learning_rate": 1.9221569958101038e-05, "loss": 0.8945, "step": 773 }, { "epoch": 0.4583950251702695, "grad_norm": 2.723334540080781, "learning_rate": 1.9219094094748035e-05, "loss": 0.8383, "step": 774 }, { "epoch": 0.4589872668048564, "grad_norm": 1.9492331431341308, "learning_rate": 1.9216614460267132e-05, "loss": 0.8664, "step": 775 }, { "epoch": 0.4595795084394433, "grad_norm": 1.8901483017176506, "learning_rate": 1.9214131055672648e-05, "loss": 0.9114, "step": 776 }, { "epoch": 0.4601717500740302, "grad_norm": 1.5947480549984965, "learning_rate": 1.921164388198042e-05, "loss": 0.8619, "step": 777 }, { "epoch": 0.4607639917086171, "grad_norm": 2.944268928829697, "learning_rate": 1.9209152940207846e-05, "loss": 0.8841, "step": 778 }, { "epoch": 0.461356233343204, "grad_norm": 1.9959401062119286, "learning_rate": 1.920665823137387e-05, "loss": 0.8942, "step": 779 }, { "epoch": 0.46194847497779096, "grad_norm": 2.055054417491741, "learning_rate": 1.920415975649896e-05, "loss": 0.8985, "step": 780 }, { "epoch": 0.46254071661237783, "grad_norm": 1.43509961761953, "learning_rate": 1.9201657516605136e-05, "loss": 0.8409, "step": 781 }, { "epoch": 0.46313295824696477, "grad_norm": 2.0964064977562464, "learning_rate": 1.9199151512715948e-05, "loss": 0.8603, "step": 782 }, { "epoch": 0.4637251998815517, "grad_norm": 2.0471728931299933, "learning_rate": 1.9196641745856502e-05, "loss": 0.8195, "step": 783 }, { "epoch": 0.4643174415161386, "grad_norm": 1.9479876262651854, "learning_rate": 1.919412821705343e-05, "loss": 0.892, "step": 784 }, { "epoch": 0.4649096831507255, "grad_norm": 1.8429301406660341, "learning_rate": 1.9191610927334905e-05, "loss": 0.8517, "step": 785 }, { "epoch": 0.4655019247853124, "grad_norm": 1.6626463990171203, "learning_rate": 1.918908987773064e-05, "loss": 0.8422, "step": 786 }, { "epoch": 0.4660941664198993, "grad_norm": 2.484399506113593, "learning_rate": 1.918656506927189e-05, "loss": 0.8349, "step": 787 }, { "epoch": 0.46668640805448625, "grad_norm": 2.2394704693255325, "learning_rate": 1.918403650299144e-05, "loss": 0.8584, "step": 788 }, { "epoch": 0.4672786496890731, "grad_norm": 2.7696631299861494, "learning_rate": 1.9181504179923622e-05, "loss": 0.8702, "step": 789 }, { "epoch": 0.46787089132366005, "grad_norm": 2.3702451262189603, "learning_rate": 1.9178968101104285e-05, "loss": 0.8812, "step": 790 }, { "epoch": 0.468463132958247, "grad_norm": 3.347228840796043, "learning_rate": 1.9176428267570842e-05, "loss": 0.8925, "step": 791 }, { "epoch": 0.46905537459283386, "grad_norm": 1.8592907890766055, "learning_rate": 1.917388468036222e-05, "loss": 0.859, "step": 792 }, { "epoch": 0.4696476162274208, "grad_norm": 1.8720866828747977, "learning_rate": 1.917133734051889e-05, "loss": 0.8683, "step": 793 }, { "epoch": 0.4702398578620077, "grad_norm": 2.3125042184278684, "learning_rate": 1.9168786249082862e-05, "loss": 0.8256, "step": 794 }, { "epoch": 0.4708320994965946, "grad_norm": 1.719445243382972, "learning_rate": 1.916623140709767e-05, "loss": 0.9095, "step": 795 }, { "epoch": 0.47142434113118153, "grad_norm": 3.0132158639000695, "learning_rate": 1.9163672815608392e-05, "loss": 0.8053, "step": 796 }, { "epoch": 0.4720165827657684, "grad_norm": 1.933417130240812, "learning_rate": 1.9161110475661634e-05, "loss": 0.8653, "step": 797 }, { "epoch": 0.47260882440035534, "grad_norm": 2.001369020588165, "learning_rate": 1.9158544388305534e-05, "loss": 0.8821, "step": 798 }, { "epoch": 0.4732010660349423, "grad_norm": 2.236004511554376, "learning_rate": 1.9155974554589774e-05, "loss": 0.8531, "step": 799 }, { "epoch": 0.47379330766952915, "grad_norm": 3.7355229765335785, "learning_rate": 1.915340097556555e-05, "loss": 0.8913, "step": 800 }, { "epoch": 0.4743855493041161, "grad_norm": 6.246619019749031, "learning_rate": 1.915082365228561e-05, "loss": 0.9032, "step": 801 }, { "epoch": 0.474977790938703, "grad_norm": 1.8815129005316729, "learning_rate": 1.914824258580422e-05, "loss": 0.8619, "step": 802 }, { "epoch": 0.4755700325732899, "grad_norm": 1.662450993675011, "learning_rate": 1.9145657777177186e-05, "loss": 0.8465, "step": 803 }, { "epoch": 0.4761622742078768, "grad_norm": 1.8295549474984136, "learning_rate": 1.914306922746183e-05, "loss": 0.8569, "step": 804 }, { "epoch": 0.4767545158424637, "grad_norm": 10.419046505396, "learning_rate": 1.914047693771702e-05, "loss": 0.8476, "step": 805 }, { "epoch": 0.47734675747705063, "grad_norm": 4.545366161612398, "learning_rate": 1.9137880909003155e-05, "loss": 0.8815, "step": 806 }, { "epoch": 0.47793899911163756, "grad_norm": 2.3239626037301266, "learning_rate": 1.9135281142382147e-05, "loss": 0.812, "step": 807 }, { "epoch": 0.47853124074622444, "grad_norm": 3.2052660355799256, "learning_rate": 1.913267763891745e-05, "loss": 0.8444, "step": 808 }, { "epoch": 0.47912348238081137, "grad_norm": 2.661447353848109, "learning_rate": 1.913007039967404e-05, "loss": 0.8676, "step": 809 }, { "epoch": 0.4797157240153983, "grad_norm": 3.1975284655785043, "learning_rate": 1.912745942571843e-05, "loss": 0.8973, "step": 810 }, { "epoch": 0.4803079656499852, "grad_norm": 8.817742135689066, "learning_rate": 1.9124844718118657e-05, "loss": 0.8338, "step": 811 }, { "epoch": 0.4809002072845721, "grad_norm": 6.647384676362516, "learning_rate": 1.9122226277944276e-05, "loss": 0.8034, "step": 812 }, { "epoch": 0.48149244891915904, "grad_norm": 3.0068701129220665, "learning_rate": 1.911960410626638e-05, "loss": 0.8558, "step": 813 }, { "epoch": 0.4820846905537459, "grad_norm": 2.4470838917289055, "learning_rate": 1.9116978204157583e-05, "loss": 0.8781, "step": 814 }, { "epoch": 0.48267693218833285, "grad_norm": 3.568749125351664, "learning_rate": 1.911434857269203e-05, "loss": 0.8366, "step": 815 }, { "epoch": 0.48326917382291973, "grad_norm": 2.788154497136242, "learning_rate": 1.9111715212945384e-05, "loss": 0.8179, "step": 816 }, { "epoch": 0.48386141545750666, "grad_norm": 2.561601471580433, "learning_rate": 1.9109078125994843e-05, "loss": 0.8388, "step": 817 }, { "epoch": 0.4844536570920936, "grad_norm": 13.67055884933429, "learning_rate": 1.9106437312919116e-05, "loss": 0.851, "step": 818 }, { "epoch": 0.48504589872668047, "grad_norm": 3.884383319733934, "learning_rate": 1.910379277479845e-05, "loss": 0.8999, "step": 819 }, { "epoch": 0.4856381403612674, "grad_norm": 4.2385883179797625, "learning_rate": 1.910114451271461e-05, "loss": 0.8981, "step": 820 }, { "epoch": 0.48623038199585433, "grad_norm": 2.8064585598908356, "learning_rate": 1.909849252775088e-05, "loss": 0.8633, "step": 821 }, { "epoch": 0.4868226236304412, "grad_norm": 3.4012309556283227, "learning_rate": 1.9095836820992074e-05, "loss": 0.8425, "step": 822 }, { "epoch": 0.48741486526502814, "grad_norm": 1.8123186335536858, "learning_rate": 1.9093177393524524e-05, "loss": 0.8615, "step": 823 }, { "epoch": 0.488007106899615, "grad_norm": 3.468928077402017, "learning_rate": 1.9090514246436085e-05, "loss": 0.8995, "step": 824 }, { "epoch": 0.48859934853420195, "grad_norm": 1.8021817875541082, "learning_rate": 1.908784738081614e-05, "loss": 0.8284, "step": 825 }, { "epoch": 0.4891915901687889, "grad_norm": 2.4825340026394374, "learning_rate": 1.9085176797755575e-05, "loss": 0.8585, "step": 826 }, { "epoch": 0.48978383180337576, "grad_norm": 2.47602922744696, "learning_rate": 1.908250249834682e-05, "loss": 0.8805, "step": 827 }, { "epoch": 0.4903760734379627, "grad_norm": 1.9033817119185694, "learning_rate": 1.9079824483683808e-05, "loss": 0.9176, "step": 828 }, { "epoch": 0.4909683150725496, "grad_norm": 1.5335390856862279, "learning_rate": 1.9077142754862e-05, "loss": 0.8394, "step": 829 }, { "epoch": 0.4915605567071365, "grad_norm": 1.8269219296715522, "learning_rate": 1.9074457312978373e-05, "loss": 0.8565, "step": 830 }, { "epoch": 0.49215279834172343, "grad_norm": 1.8745343150998126, "learning_rate": 1.907176815913142e-05, "loss": 0.8292, "step": 831 }, { "epoch": 0.49274503997631036, "grad_norm": 3.137235022218658, "learning_rate": 1.9069075294421163e-05, "loss": 0.8564, "step": 832 }, { "epoch": 0.49333728161089724, "grad_norm": 4.136469645798408, "learning_rate": 1.906637871994913e-05, "loss": 0.8611, "step": 833 }, { "epoch": 0.49392952324548417, "grad_norm": 2.105315946203615, "learning_rate": 1.9063678436818372e-05, "loss": 0.8306, "step": 834 }, { "epoch": 0.49452176488007105, "grad_norm": 1.6169111636378513, "learning_rate": 1.906097444613346e-05, "loss": 0.8065, "step": 835 }, { "epoch": 0.495114006514658, "grad_norm": 2.1563275408886495, "learning_rate": 1.905826674900048e-05, "loss": 0.8578, "step": 836 }, { "epoch": 0.4957062481492449, "grad_norm": 1.6035367865321255, "learning_rate": 1.9055555346527024e-05, "loss": 0.8172, "step": 837 }, { "epoch": 0.4962984897838318, "grad_norm": 2.7262844772305868, "learning_rate": 1.9052840239822218e-05, "loss": 0.8535, "step": 838 }, { "epoch": 0.4968907314184187, "grad_norm": 5.363131725816022, "learning_rate": 1.9050121429996682e-05, "loss": 0.8659, "step": 839 }, { "epoch": 0.49748297305300565, "grad_norm": 8.968200408901346, "learning_rate": 1.904739891816257e-05, "loss": 0.8318, "step": 840 }, { "epoch": 0.4980752146875925, "grad_norm": 1.7248247185116594, "learning_rate": 1.904467270543354e-05, "loss": 0.8957, "step": 841 }, { "epoch": 0.49866745632217946, "grad_norm": 1.9562698832994654, "learning_rate": 1.9041942792924767e-05, "loss": 0.8345, "step": 842 }, { "epoch": 0.49925969795676634, "grad_norm": 3.9947226743168156, "learning_rate": 1.9039209181752942e-05, "loss": 0.8652, "step": 843 }, { "epoch": 0.49985193959135327, "grad_norm": 3.1571777018867677, "learning_rate": 1.903647187303626e-05, "loss": 0.8515, "step": 844 }, { "epoch": 0.5004441812259401, "grad_norm": 2.1169391797594668, "learning_rate": 1.9033730867894436e-05, "loss": 0.8119, "step": 845 }, { "epoch": 0.5010364228605271, "grad_norm": 2.7287026350418806, "learning_rate": 1.9030986167448696e-05, "loss": 0.9056, "step": 846 }, { "epoch": 0.501628664495114, "grad_norm": 4.03173653799457, "learning_rate": 1.9028237772821778e-05, "loss": 0.835, "step": 847 }, { "epoch": 0.5022209061297009, "grad_norm": 3.2102449210627246, "learning_rate": 1.902548568513793e-05, "loss": 0.869, "step": 848 }, { "epoch": 0.5028131477642879, "grad_norm": 2.243822048790965, "learning_rate": 1.9022729905522906e-05, "loss": 0.8895, "step": 849 }, { "epoch": 0.5034053893988747, "grad_norm": 2.4533940450724314, "learning_rate": 1.9019970435103978e-05, "loss": 0.8636, "step": 850 }, { "epoch": 0.5039976310334616, "grad_norm": 2.848746378952194, "learning_rate": 1.9017207275009925e-05, "loss": 0.892, "step": 851 }, { "epoch": 0.5045898726680486, "grad_norm": 2.3326834597031105, "learning_rate": 1.9014440426371034e-05, "loss": 0.7968, "step": 852 }, { "epoch": 0.5051821143026355, "grad_norm": 2.745519544350419, "learning_rate": 1.9011669890319104e-05, "loss": 0.8378, "step": 853 }, { "epoch": 0.5057743559372224, "grad_norm": 2.3271640191288183, "learning_rate": 1.9008895667987434e-05, "loss": 0.9074, "step": 854 }, { "epoch": 0.5063665975718094, "grad_norm": 2.06754274248316, "learning_rate": 1.9006117760510846e-05, "loss": 0.8939, "step": 855 }, { "epoch": 0.5069588392063962, "grad_norm": 4.335317472534966, "learning_rate": 1.9003336169025655e-05, "loss": 0.8571, "step": 856 }, { "epoch": 0.5075510808409831, "grad_norm": 2.494123585371266, "learning_rate": 1.9000550894669686e-05, "loss": 0.8623, "step": 857 }, { "epoch": 0.50814332247557, "grad_norm": 2.8756951470707577, "learning_rate": 1.8997761938582277e-05, "loss": 0.8292, "step": 858 }, { "epoch": 0.508735564110157, "grad_norm": 2.761198259777182, "learning_rate": 1.8994969301904266e-05, "loss": 0.868, "step": 859 }, { "epoch": 0.5093278057447439, "grad_norm": 6.139448459726971, "learning_rate": 1.8992172985778002e-05, "loss": 0.8189, "step": 860 }, { "epoch": 0.5099200473793307, "grad_norm": 2.3982620378137, "learning_rate": 1.898937299134733e-05, "loss": 0.8413, "step": 861 }, { "epoch": 0.5105122890139177, "grad_norm": 2.4360272672092758, "learning_rate": 1.8986569319757605e-05, "loss": 0.8573, "step": 862 }, { "epoch": 0.5111045306485046, "grad_norm": 2.978466515723497, "learning_rate": 1.898376197215569e-05, "loss": 0.8423, "step": 863 }, { "epoch": 0.5116967722830915, "grad_norm": 3.3896178499592464, "learning_rate": 1.8980950949689952e-05, "loss": 0.8559, "step": 864 }, { "epoch": 0.5122890139176784, "grad_norm": 3.3035441398858993, "learning_rate": 1.8978136253510248e-05, "loss": 0.8613, "step": 865 }, { "epoch": 0.5128812555522653, "grad_norm": 2.8070122645483035, "learning_rate": 1.897531788476795e-05, "loss": 0.8941, "step": 866 }, { "epoch": 0.5134734971868522, "grad_norm": 3.414743000811395, "learning_rate": 1.8972495844615933e-05, "loss": 0.8576, "step": 867 }, { "epoch": 0.5140657388214391, "grad_norm": 4.129209851440052, "learning_rate": 1.896967013420857e-05, "loss": 0.8392, "step": 868 }, { "epoch": 0.5146579804560261, "grad_norm": 2.7614250927203696, "learning_rate": 1.896684075470173e-05, "loss": 0.8044, "step": 869 }, { "epoch": 0.515250222090613, "grad_norm": 5.7190450126340275, "learning_rate": 1.896400770725279e-05, "loss": 0.8782, "step": 870 }, { "epoch": 0.5158424637251999, "grad_norm": 3.264689428254472, "learning_rate": 1.896117099302063e-05, "loss": 0.8781, "step": 871 }, { "epoch": 0.5164347053597867, "grad_norm": 2.3052957981729123, "learning_rate": 1.8958330613165622e-05, "loss": 0.7944, "step": 872 }, { "epoch": 0.5170269469943737, "grad_norm": 3.6741300739615745, "learning_rate": 1.895548656884964e-05, "loss": 0.846, "step": 873 }, { "epoch": 0.5176191886289606, "grad_norm": 2.633170158045045, "learning_rate": 1.8952638861236066e-05, "loss": 0.837, "step": 874 }, { "epoch": 0.5182114302635475, "grad_norm": 3.933048892917864, "learning_rate": 1.894978749148976e-05, "loss": 0.8227, "step": 875 }, { "epoch": 0.5188036718981345, "grad_norm": 3.415883244982847, "learning_rate": 1.8946932460777105e-05, "loss": 0.8591, "step": 876 }, { "epoch": 0.5193959135327213, "grad_norm": 2.1556681217465905, "learning_rate": 1.8944073770265958e-05, "loss": 0.8476, "step": 877 }, { "epoch": 0.5199881551673082, "grad_norm": 3.3058563739280307, "learning_rate": 1.894121142112569e-05, "loss": 0.8595, "step": 878 }, { "epoch": 0.5205803968018952, "grad_norm": 1.6701206007058242, "learning_rate": 1.8938345414527165e-05, "loss": 0.8418, "step": 879 }, { "epoch": 0.5211726384364821, "grad_norm": 2.8666163842304786, "learning_rate": 1.8935475751642736e-05, "loss": 0.8302, "step": 880 }, { "epoch": 0.521764880071069, "grad_norm": 2.448842057434439, "learning_rate": 1.893260243364626e-05, "loss": 0.8475, "step": 881 }, { "epoch": 0.522357121705656, "grad_norm": 2.8805816085789355, "learning_rate": 1.8929725461713083e-05, "loss": 0.8595, "step": 882 }, { "epoch": 0.5229493633402428, "grad_norm": 2.190759226441507, "learning_rate": 1.892684483702005e-05, "loss": 0.8447, "step": 883 }, { "epoch": 0.5235416049748297, "grad_norm": 2.7950857557738464, "learning_rate": 1.8923960560745495e-05, "loss": 0.8345, "step": 884 }, { "epoch": 0.5241338466094166, "grad_norm": 2.2234121325584213, "learning_rate": 1.8921072634069255e-05, "loss": 0.8267, "step": 885 }, { "epoch": 0.5247260882440036, "grad_norm": 1.457185905309736, "learning_rate": 1.891818105817265e-05, "loss": 0.8214, "step": 886 }, { "epoch": 0.5253183298785905, "grad_norm": 3.4034755252325515, "learning_rate": 1.8915285834238498e-05, "loss": 0.8626, "step": 887 }, { "epoch": 0.5259105715131773, "grad_norm": 3.715757488653005, "learning_rate": 1.891238696345111e-05, "loss": 0.8602, "step": 888 }, { "epoch": 0.5265028131477643, "grad_norm": 1.8540026574196378, "learning_rate": 1.890948444699629e-05, "loss": 0.8251, "step": 889 }, { "epoch": 0.5270950547823512, "grad_norm": 2.1775940247813343, "learning_rate": 1.8906578286061325e-05, "loss": 0.8593, "step": 890 }, { "epoch": 0.5276872964169381, "grad_norm": 2.6307649588163047, "learning_rate": 1.8903668481834996e-05, "loss": 0.8612, "step": 891 }, { "epoch": 0.528279538051525, "grad_norm": 3.989337506531906, "learning_rate": 1.890075503550758e-05, "loss": 0.8391, "step": 892 }, { "epoch": 0.528871779686112, "grad_norm": 8.95506433868868, "learning_rate": 1.889783794827085e-05, "loss": 0.8594, "step": 893 }, { "epoch": 0.5294640213206988, "grad_norm": 2.6218182036375026, "learning_rate": 1.8894917221318038e-05, "loss": 0.8828, "step": 894 }, { "epoch": 0.5300562629552857, "grad_norm": 1.8329700035472636, "learning_rate": 1.8891992855843902e-05, "loss": 0.8307, "step": 895 }, { "epoch": 0.5306485045898727, "grad_norm": 2.011555405209645, "learning_rate": 1.888906485304467e-05, "loss": 0.8204, "step": 896 }, { "epoch": 0.5312407462244596, "grad_norm": 2.8138211953978933, "learning_rate": 1.8886133214118053e-05, "loss": 0.8544, "step": 897 }, { "epoch": 0.5318329878590465, "grad_norm": 2.7779050915507666, "learning_rate": 1.888319794026326e-05, "loss": 0.8776, "step": 898 }, { "epoch": 0.5324252294936334, "grad_norm": 2.867628838160467, "learning_rate": 1.8880259032680985e-05, "loss": 0.8714, "step": 899 }, { "epoch": 0.5330174711282203, "grad_norm": 2.645795842168525, "learning_rate": 1.88773164925734e-05, "loss": 0.8673, "step": 900 }, { "epoch": 0.5336097127628072, "grad_norm": 1.988822701440226, "learning_rate": 1.887437032114418e-05, "loss": 0.8058, "step": 901 }, { "epoch": 0.5342019543973942, "grad_norm": 1.6378154166647196, "learning_rate": 1.887142051959847e-05, "loss": 0.8464, "step": 902 }, { "epoch": 0.5347941960319811, "grad_norm": 2.2058743781314036, "learning_rate": 1.8868467089142893e-05, "loss": 0.8751, "step": 903 }, { "epoch": 0.5353864376665679, "grad_norm": 2.5282653299151527, "learning_rate": 1.8865510030985588e-05, "loss": 0.8621, "step": 904 }, { "epoch": 0.5359786793011548, "grad_norm": 27.31034341693317, "learning_rate": 1.8862549346336144e-05, "loss": 0.8408, "step": 905 }, { "epoch": 0.5365709209357418, "grad_norm": 2.8532802195059985, "learning_rate": 1.8859585036405653e-05, "loss": 0.8242, "step": 906 }, { "epoch": 0.5371631625703287, "grad_norm": 2.0534557906487887, "learning_rate": 1.8856617102406685e-05, "loss": 0.8428, "step": 907 }, { "epoch": 0.5377554042049156, "grad_norm": 1.5482048628272183, "learning_rate": 1.885364554555329e-05, "loss": 0.8626, "step": 908 }, { "epoch": 0.5383476458395026, "grad_norm": 2.1575688293112507, "learning_rate": 1.8850670367061003e-05, "loss": 0.8558, "step": 909 }, { "epoch": 0.5389398874740894, "grad_norm": 2.102806685634922, "learning_rate": 1.884769156814684e-05, "loss": 0.8687, "step": 910 }, { "epoch": 0.5395321291086763, "grad_norm": 1.4463393941880638, "learning_rate": 1.884470915002929e-05, "loss": 0.8818, "step": 911 }, { "epoch": 0.5401243707432632, "grad_norm": 1.4609406531497033, "learning_rate": 1.884172311392834e-05, "loss": 0.8548, "step": 912 }, { "epoch": 0.5407166123778502, "grad_norm": 1.8329647817569432, "learning_rate": 1.883873346106544e-05, "loss": 0.858, "step": 913 }, { "epoch": 0.5413088540124371, "grad_norm": 3.9152641529549856, "learning_rate": 1.883574019266353e-05, "loss": 0.8783, "step": 914 }, { "epoch": 0.5419010956470239, "grad_norm": 1.4892888981468162, "learning_rate": 1.8832743309947026e-05, "loss": 0.8504, "step": 915 }, { "epoch": 0.5424933372816109, "grad_norm": 2.1974930679295963, "learning_rate": 1.8829742814141813e-05, "loss": 0.7888, "step": 916 }, { "epoch": 0.5430855789161978, "grad_norm": 2.960377295236711, "learning_rate": 1.8826738706475275e-05, "loss": 0.9234, "step": 917 }, { "epoch": 0.5436778205507847, "grad_norm": 2.413195078205313, "learning_rate": 1.882373098817625e-05, "loss": 0.8354, "step": 918 }, { "epoch": 0.5442700621853717, "grad_norm": 2.29920506736285, "learning_rate": 1.882071966047507e-05, "loss": 0.8359, "step": 919 }, { "epoch": 0.5448623038199586, "grad_norm": 2.2127645514863916, "learning_rate": 1.8817704724603536e-05, "loss": 0.8713, "step": 920 }, { "epoch": 0.5454545454545454, "grad_norm": 2.5056883347384074, "learning_rate": 1.8814686181794927e-05, "loss": 0.8836, "step": 921 }, { "epoch": 0.5460467870891323, "grad_norm": 2.319729335193821, "learning_rate": 1.8811664033283993e-05, "loss": 0.8758, "step": 922 }, { "epoch": 0.5466390287237193, "grad_norm": 2.4646633874772603, "learning_rate": 1.880863828030697e-05, "loss": 0.8338, "step": 923 }, { "epoch": 0.5472312703583062, "grad_norm": 3.5967734066777624, "learning_rate": 1.880560892410155e-05, "loss": 0.8142, "step": 924 }, { "epoch": 0.5478235119928931, "grad_norm": 1.931083687931415, "learning_rate": 1.8802575965906923e-05, "loss": 0.8624, "step": 925 }, { "epoch": 0.54841575362748, "grad_norm": 1.3658707686323024, "learning_rate": 1.879953940696373e-05, "loss": 0.82, "step": 926 }, { "epoch": 0.5490079952620669, "grad_norm": 1.5399776635493545, "learning_rate": 1.87964992485141e-05, "loss": 0.8752, "step": 927 }, { "epoch": 0.5496002368966538, "grad_norm": 1.2888985969973952, "learning_rate": 1.8793455491801623e-05, "loss": 0.8258, "step": 928 }, { "epoch": 0.5501924785312408, "grad_norm": 1.2235740279377274, "learning_rate": 1.8790408138071372e-05, "loss": 0.7807, "step": 929 }, { "epoch": 0.5507847201658277, "grad_norm": 1.3948987520636114, "learning_rate": 1.878735718856988e-05, "loss": 0.85, "step": 930 }, { "epoch": 0.5513769618004146, "grad_norm": 1.1530555013033301, "learning_rate": 1.8784302644545165e-05, "loss": 0.8434, "step": 931 }, { "epoch": 0.5519692034350014, "grad_norm": 1.1001632567365147, "learning_rate": 1.8781244507246706e-05, "loss": 0.8357, "step": 932 }, { "epoch": 0.5525614450695884, "grad_norm": 1.1272615396567933, "learning_rate": 1.8778182777925447e-05, "loss": 0.8673, "step": 933 }, { "epoch": 0.5531536867041753, "grad_norm": 1.5817714863387373, "learning_rate": 1.877511745783381e-05, "loss": 0.8628, "step": 934 }, { "epoch": 0.5537459283387622, "grad_norm": 1.5329765777416222, "learning_rate": 1.8772048548225684e-05, "loss": 0.8412, "step": 935 }, { "epoch": 0.5543381699733492, "grad_norm": 1.15668478337925, "learning_rate": 1.8768976050356428e-05, "loss": 0.8673, "step": 936 }, { "epoch": 0.554930411607936, "grad_norm": 1.4034733165443383, "learning_rate": 1.8765899965482858e-05, "loss": 0.8063, "step": 937 }, { "epoch": 0.5555226532425229, "grad_norm": 1.278219411060756, "learning_rate": 1.876282029486328e-05, "loss": 0.8432, "step": 938 }, { "epoch": 0.5561148948771099, "grad_norm": 1.434098032436352, "learning_rate": 1.8759737039757437e-05, "loss": 0.8616, "step": 939 }, { "epoch": 0.5567071365116968, "grad_norm": 1.3706128629162078, "learning_rate": 1.8756650201426565e-05, "loss": 0.8397, "step": 940 }, { "epoch": 0.5572993781462837, "grad_norm": 1.4524914917856544, "learning_rate": 1.875355978113335e-05, "loss": 0.8423, "step": 941 }, { "epoch": 0.5578916197808705, "grad_norm": 1.6212056588454582, "learning_rate": 1.8750465780141946e-05, "loss": 0.8259, "step": 942 }, { "epoch": 0.5584838614154575, "grad_norm": 1.5764324461444237, "learning_rate": 1.8747368199717977e-05, "loss": 0.8723, "step": 943 }, { "epoch": 0.5590761030500444, "grad_norm": 1.5399903048769072, "learning_rate": 1.8744267041128528e-05, "loss": 0.8563, "step": 944 }, { "epoch": 0.5596683446846313, "grad_norm": 1.4178813586778414, "learning_rate": 1.874116230564214e-05, "loss": 0.8551, "step": 945 }, { "epoch": 0.5602605863192183, "grad_norm": 1.2554205520730095, "learning_rate": 1.8738053994528835e-05, "loss": 0.84, "step": 946 }, { "epoch": 0.5608528279538052, "grad_norm": 1.436381834061602, "learning_rate": 1.873494210906008e-05, "loss": 0.8633, "step": 947 }, { "epoch": 0.561445069588392, "grad_norm": 1.572609783293438, "learning_rate": 1.8731826650508812e-05, "loss": 0.8575, "step": 948 }, { "epoch": 0.562037311222979, "grad_norm": 1.9911513522446955, "learning_rate": 1.872870762014943e-05, "loss": 0.8699, "step": 949 }, { "epoch": 0.5626295528575659, "grad_norm": 1.7221797946035173, "learning_rate": 1.8725585019257794e-05, "loss": 0.8288, "step": 950 }, { "epoch": 0.5632217944921528, "grad_norm": 1.6316667513165497, "learning_rate": 1.8722458849111218e-05, "loss": 0.8476, "step": 951 }, { "epoch": 0.5638140361267397, "grad_norm": 1.8817617861124867, "learning_rate": 1.8719329110988487e-05, "loss": 0.8812, "step": 952 }, { "epoch": 0.5644062777613266, "grad_norm": 1.225383750075909, "learning_rate": 1.871619580616984e-05, "loss": 0.8646, "step": 953 }, { "epoch": 0.5649985193959135, "grad_norm": 4.355740946012875, "learning_rate": 1.871305893593697e-05, "loss": 0.8008, "step": 954 }, { "epoch": 0.5655907610305004, "grad_norm": 2.1927471549874564, "learning_rate": 1.8709918501573038e-05, "loss": 0.8762, "step": 955 }, { "epoch": 0.5661830026650874, "grad_norm": 3.055980708697755, "learning_rate": 1.8706774504362655e-05, "loss": 0.8584, "step": 956 }, { "epoch": 0.5667752442996743, "grad_norm": 1.5384769512510463, "learning_rate": 1.8703626945591895e-05, "loss": 0.874, "step": 957 }, { "epoch": 0.5673674859342612, "grad_norm": 1.593507898135259, "learning_rate": 1.8700475826548285e-05, "loss": 0.8539, "step": 958 }, { "epoch": 0.567959727568848, "grad_norm": 2.4658738412503185, "learning_rate": 1.8697321148520812e-05, "loss": 0.8464, "step": 959 }, { "epoch": 0.568551969203435, "grad_norm": 2.3647390172875213, "learning_rate": 1.8694162912799917e-05, "loss": 0.8487, "step": 960 }, { "epoch": 0.5691442108380219, "grad_norm": 1.8822227155497309, "learning_rate": 1.869100112067749e-05, "loss": 0.924, "step": 961 }, { "epoch": 0.5697364524726088, "grad_norm": 3.1476849757648258, "learning_rate": 1.868783577344689e-05, "loss": 0.8641, "step": 962 }, { "epoch": 0.5703286941071958, "grad_norm": 2.391640628039692, "learning_rate": 1.8684666872402913e-05, "loss": 0.8189, "step": 963 }, { "epoch": 0.5709209357417826, "grad_norm": 1.9209124622082125, "learning_rate": 1.8681494418841825e-05, "loss": 0.9141, "step": 964 }, { "epoch": 0.5715131773763695, "grad_norm": 1.883444300120634, "learning_rate": 1.8678318414061336e-05, "loss": 0.8244, "step": 965 }, { "epoch": 0.5721054190109565, "grad_norm": 1.8683731975284492, "learning_rate": 1.867513885936061e-05, "loss": 0.8039, "step": 966 }, { "epoch": 0.5726976606455434, "grad_norm": 2.186169687223295, "learning_rate": 1.8671955756040264e-05, "loss": 0.8481, "step": 967 }, { "epoch": 0.5732899022801303, "grad_norm": 2.4126100375538746, "learning_rate": 1.8668769105402366e-05, "loss": 0.8283, "step": 968 }, { "epoch": 0.5738821439147173, "grad_norm": 5.881790876957725, "learning_rate": 1.8665578908750437e-05, "loss": 0.8004, "step": 969 }, { "epoch": 0.5744743855493041, "grad_norm": 2.1957974263362963, "learning_rate": 1.8662385167389443e-05, "loss": 0.8125, "step": 970 }, { "epoch": 0.575066627183891, "grad_norm": 2.122100726977036, "learning_rate": 1.8659187882625807e-05, "loss": 0.8323, "step": 971 }, { "epoch": 0.5756588688184779, "grad_norm": 2.0089182722346512, "learning_rate": 1.8655987055767396e-05, "loss": 0.8107, "step": 972 }, { "epoch": 0.5762511104530649, "grad_norm": 3.4788064271268677, "learning_rate": 1.8652782688123535e-05, "loss": 0.8357, "step": 973 }, { "epoch": 0.5768433520876518, "grad_norm": 1.9971681598642872, "learning_rate": 1.864957478100498e-05, "loss": 0.8487, "step": 974 }, { "epoch": 0.5774355937222386, "grad_norm": 2.7963971479792096, "learning_rate": 1.8646363335723952e-05, "loss": 0.9236, "step": 975 }, { "epoch": 0.5780278353568256, "grad_norm": 2.6501420919148706, "learning_rate": 1.864314835359411e-05, "loss": 0.8643, "step": 976 }, { "epoch": 0.5786200769914125, "grad_norm": 2.5379734296943273, "learning_rate": 1.863992983593056e-05, "loss": 0.8564, "step": 977 }, { "epoch": 0.5792123186259994, "grad_norm": 3.4686271540396936, "learning_rate": 1.8636707784049867e-05, "loss": 0.8665, "step": 978 }, { "epoch": 0.5798045602605864, "grad_norm": 1.9639248298320788, "learning_rate": 1.8633482199270025e-05, "loss": 0.8785, "step": 979 }, { "epoch": 0.5803968018951732, "grad_norm": 3.009335654283905, "learning_rate": 1.8630253082910473e-05, "loss": 0.8629, "step": 980 }, { "epoch": 0.5809890435297601, "grad_norm": 2.5913527017658535, "learning_rate": 1.862702043629211e-05, "loss": 0.8483, "step": 981 }, { "epoch": 0.581581285164347, "grad_norm": 2.3301173927122636, "learning_rate": 1.8623784260737272e-05, "loss": 0.8872, "step": 982 }, { "epoch": 0.582173526798934, "grad_norm": 2.1728733813785954, "learning_rate": 1.862054455756973e-05, "loss": 0.8851, "step": 983 }, { "epoch": 0.5827657684335209, "grad_norm": 2.6022654651479153, "learning_rate": 1.8617301328114704e-05, "loss": 0.853, "step": 984 }, { "epoch": 0.5833580100681078, "grad_norm": 5.499406695169115, "learning_rate": 1.8614054573698867e-05, "loss": 0.8533, "step": 985 }, { "epoch": 0.5839502517026947, "grad_norm": 7.358085625855621, "learning_rate": 1.861080429565031e-05, "loss": 0.8609, "step": 986 }, { "epoch": 0.5845424933372816, "grad_norm": 2.5901934912509406, "learning_rate": 1.8607550495298594e-05, "loss": 0.8123, "step": 987 }, { "epoch": 0.5851347349718685, "grad_norm": 3.577130582550033, "learning_rate": 1.8604293173974694e-05, "loss": 0.8424, "step": 988 }, { "epoch": 0.5857269766064555, "grad_norm": 6.266496305800451, "learning_rate": 1.8601032333011045e-05, "loss": 0.8278, "step": 989 }, { "epoch": 0.5863192182410424, "grad_norm": 3.688636492554437, "learning_rate": 1.8597767973741514e-05, "loss": 0.8391, "step": 990 }, { "epoch": 0.5869114598756292, "grad_norm": 3.4278947346345556, "learning_rate": 1.8594500097501403e-05, "loss": 0.8925, "step": 991 }, { "epoch": 0.5875037015102161, "grad_norm": 2.798245409340499, "learning_rate": 1.8591228705627464e-05, "loss": 0.8683, "step": 992 }, { "epoch": 0.5880959431448031, "grad_norm": 2.6101217239588674, "learning_rate": 1.858795379945787e-05, "loss": 0.7957, "step": 993 }, { "epoch": 0.58868818477939, "grad_norm": 1.9767054625050304, "learning_rate": 1.8584675380332254e-05, "loss": 0.8868, "step": 994 }, { "epoch": 0.5892804264139769, "grad_norm": 3.561966477734429, "learning_rate": 1.8581393449591667e-05, "loss": 0.8462, "step": 995 }, { "epoch": 0.5898726680485639, "grad_norm": 1.6872502667794813, "learning_rate": 1.8578108008578603e-05, "loss": 0.8334, "step": 996 }, { "epoch": 0.5904649096831507, "grad_norm": 2.4287274023717615, "learning_rate": 1.8574819058636993e-05, "loss": 0.8921, "step": 997 }, { "epoch": 0.5910571513177376, "grad_norm": 1.7111015618500356, "learning_rate": 1.8571526601112202e-05, "loss": 0.8314, "step": 998 }, { "epoch": 0.5916493929523245, "grad_norm": 2.5502109328589233, "learning_rate": 1.856823063735103e-05, "loss": 0.8725, "step": 999 }, { "epoch": 0.5922416345869115, "grad_norm": 2.040139228442696, "learning_rate": 1.8564931168701713e-05, "loss": 0.8347, "step": 1000 }, { "epoch": 0.5928338762214984, "grad_norm": 2.5055813610082303, "learning_rate": 1.8561628196513923e-05, "loss": 0.8585, "step": 1001 }, { "epoch": 0.5934261178560852, "grad_norm": 2.1524220648045427, "learning_rate": 1.855832172213875e-05, "loss": 0.8474, "step": 1002 }, { "epoch": 0.5940183594906722, "grad_norm": 1.619731922084106, "learning_rate": 1.8555011746928738e-05, "loss": 0.8179, "step": 1003 }, { "epoch": 0.5946106011252591, "grad_norm": 2.8662321460118805, "learning_rate": 1.855169827223785e-05, "loss": 0.8484, "step": 1004 }, { "epoch": 0.595202842759846, "grad_norm": 2.449255266934718, "learning_rate": 1.8548381299421486e-05, "loss": 0.8856, "step": 1005 }, { "epoch": 0.595795084394433, "grad_norm": 1.8813446878490585, "learning_rate": 1.854506082983647e-05, "loss": 0.8173, "step": 1006 }, { "epoch": 0.5963873260290199, "grad_norm": 2.5469931424576586, "learning_rate": 1.8541736864841064e-05, "loss": 0.8532, "step": 1007 }, { "epoch": 0.5969795676636067, "grad_norm": 2.641308461614614, "learning_rate": 1.8538409405794952e-05, "loss": 0.8217, "step": 1008 }, { "epoch": 0.5975718092981936, "grad_norm": 1.4950837736543847, "learning_rate": 1.8535078454059256e-05, "loss": 0.8381, "step": 1009 }, { "epoch": 0.5981640509327806, "grad_norm": 2.338715560487363, "learning_rate": 1.8531744010996525e-05, "loss": 0.8573, "step": 1010 }, { "epoch": 0.5987562925673675, "grad_norm": 1.6059242683751407, "learning_rate": 1.8528406077970725e-05, "loss": 0.8527, "step": 1011 }, { "epoch": 0.5993485342019544, "grad_norm": 12.361811261173756, "learning_rate": 1.8525064656347265e-05, "loss": 0.8298, "step": 1012 }, { "epoch": 0.5999407758365413, "grad_norm": 1.1511502155917168, "learning_rate": 1.8521719747492974e-05, "loss": 0.8212, "step": 1013 }, { "epoch": 0.6005330174711282, "grad_norm": 1.6632441653678172, "learning_rate": 1.8518371352776107e-05, "loss": 0.8415, "step": 1014 }, { "epoch": 0.6011252591057151, "grad_norm": 4.116651414311895, "learning_rate": 1.8515019473566346e-05, "loss": 0.8252, "step": 1015 }, { "epoch": 0.6017175007403021, "grad_norm": 1.8732938641491954, "learning_rate": 1.85116641112348e-05, "loss": 0.8283, "step": 1016 }, { "epoch": 0.602309742374889, "grad_norm": 1.8390615077922212, "learning_rate": 1.8508305267153992e-05, "loss": 0.8132, "step": 1017 }, { "epoch": 0.6029019840094758, "grad_norm": 2.633892399081572, "learning_rate": 1.850494294269789e-05, "loss": 0.8758, "step": 1018 }, { "epoch": 0.6034942256440627, "grad_norm": 2.3622253842023335, "learning_rate": 1.8501577139241865e-05, "loss": 0.829, "step": 1019 }, { "epoch": 0.6040864672786497, "grad_norm": 2.037756142332905, "learning_rate": 1.8498207858162724e-05, "loss": 0.867, "step": 1020 }, { "epoch": 0.6046787089132366, "grad_norm": 4.528232825533869, "learning_rate": 1.8494835100838693e-05, "loss": 0.8648, "step": 1021 }, { "epoch": 0.6052709505478235, "grad_norm": 2.8907054221222, "learning_rate": 1.8491458868649417e-05, "loss": 0.8574, "step": 1022 }, { "epoch": 0.6058631921824105, "grad_norm": 2.182754237899523, "learning_rate": 1.8488079162975965e-05, "loss": 0.8716, "step": 1023 }, { "epoch": 0.6064554338169973, "grad_norm": 3.6488297829005316, "learning_rate": 1.8484695985200832e-05, "loss": 0.8888, "step": 1024 }, { "epoch": 0.6070476754515842, "grad_norm": 2.1360401343086375, "learning_rate": 1.848130933670792e-05, "loss": 0.853, "step": 1025 }, { "epoch": 0.6076399170861712, "grad_norm": 2.7634831832013074, "learning_rate": 1.847791921888256e-05, "loss": 0.8444, "step": 1026 }, { "epoch": 0.6082321587207581, "grad_norm": 2.548946934028294, "learning_rate": 1.847452563311151e-05, "loss": 0.8552, "step": 1027 }, { "epoch": 0.608824400355345, "grad_norm": 2.00472769722451, "learning_rate": 1.8471128580782923e-05, "loss": 0.7907, "step": 1028 }, { "epoch": 0.6094166419899318, "grad_norm": 7.112058400031312, "learning_rate": 1.8467728063286398e-05, "loss": 0.811, "step": 1029 }, { "epoch": 0.6100088836245188, "grad_norm": 5.904134501716097, "learning_rate": 1.8464324082012926e-05, "loss": 0.8236, "step": 1030 }, { "epoch": 0.6106011252591057, "grad_norm": 2.677789919762913, "learning_rate": 1.8460916638354934e-05, "loss": 0.8855, "step": 1031 }, { "epoch": 0.6111933668936926, "grad_norm": 2.979499541128458, "learning_rate": 1.845750573370626e-05, "loss": 0.81, "step": 1032 }, { "epoch": 0.6117856085282796, "grad_norm": 4.395198347599419, "learning_rate": 1.845409136946215e-05, "loss": 0.8649, "step": 1033 }, { "epoch": 0.6123778501628665, "grad_norm": 1.9418555795640626, "learning_rate": 1.8450673547019273e-05, "loss": 0.8106, "step": 1034 }, { "epoch": 0.6129700917974533, "grad_norm": 2.50689095850713, "learning_rate": 1.844725226777571e-05, "loss": 0.8387, "step": 1035 }, { "epoch": 0.6135623334320403, "grad_norm": 2.812993440172385, "learning_rate": 1.844382753313096e-05, "loss": 0.8102, "step": 1036 }, { "epoch": 0.6141545750666272, "grad_norm": 2.6604257345575206, "learning_rate": 1.844039934448593e-05, "loss": 0.8277, "step": 1037 }, { "epoch": 0.6147468167012141, "grad_norm": 2.561145533436182, "learning_rate": 1.8436967703242938e-05, "loss": 0.8557, "step": 1038 }, { "epoch": 0.615339058335801, "grad_norm": 3.2075888751958, "learning_rate": 1.8433532610805724e-05, "loss": 0.8399, "step": 1039 }, { "epoch": 0.6159312999703879, "grad_norm": 7.7990669355472475, "learning_rate": 1.843009406857943e-05, "loss": 0.8473, "step": 1040 }, { "epoch": 0.6165235416049748, "grad_norm": 3.132725554622429, "learning_rate": 1.8426652077970616e-05, "loss": 0.8058, "step": 1041 }, { "epoch": 0.6171157832395617, "grad_norm": 2.1854341940737187, "learning_rate": 1.842320664038725e-05, "loss": 0.8126, "step": 1042 }, { "epoch": 0.6177080248741487, "grad_norm": 2.9704427255656825, "learning_rate": 1.841975775723871e-05, "loss": 0.8319, "step": 1043 }, { "epoch": 0.6183002665087356, "grad_norm": 2.2725465194986367, "learning_rate": 1.8416305429935776e-05, "loss": 0.8729, "step": 1044 }, { "epoch": 0.6188925081433225, "grad_norm": 2.5307221134117674, "learning_rate": 1.8412849659890652e-05, "loss": 0.8412, "step": 1045 }, { "epoch": 0.6194847497779093, "grad_norm": 1.810981613810124, "learning_rate": 1.840939044851694e-05, "loss": 0.8673, "step": 1046 }, { "epoch": 0.6200769914124963, "grad_norm": 2.4443325681183907, "learning_rate": 1.840592779722965e-05, "loss": 0.7945, "step": 1047 }, { "epoch": 0.6206692330470832, "grad_norm": 2.8376072263141485, "learning_rate": 1.8402461707445206e-05, "loss": 0.8427, "step": 1048 }, { "epoch": 0.6212614746816701, "grad_norm": 3.6210685733471397, "learning_rate": 1.8398992180581427e-05, "loss": 0.826, "step": 1049 }, { "epoch": 0.6218537163162571, "grad_norm": 1.561394208799311, "learning_rate": 1.839551921805755e-05, "loss": 0.8572, "step": 1050 }, { "epoch": 0.6224459579508439, "grad_norm": 3.103802020893097, "learning_rate": 1.839204282129421e-05, "loss": 0.8646, "step": 1051 }, { "epoch": 0.6230381995854308, "grad_norm": 4.425061232798762, "learning_rate": 1.8388562991713447e-05, "loss": 0.8143, "step": 1052 }, { "epoch": 0.6236304412200178, "grad_norm": 2.191428810992078, "learning_rate": 1.838507973073871e-05, "loss": 0.8502, "step": 1053 }, { "epoch": 0.6242226828546047, "grad_norm": 4.95011030076338, "learning_rate": 1.8381593039794846e-05, "loss": 0.864, "step": 1054 }, { "epoch": 0.6248149244891916, "grad_norm": 2.7797880624241516, "learning_rate": 1.837810292030811e-05, "loss": 0.859, "step": 1055 }, { "epoch": 0.6254071661237784, "grad_norm": 4.869981957929177, "learning_rate": 1.8374609373706156e-05, "loss": 0.8444, "step": 1056 }, { "epoch": 0.6259994077583654, "grad_norm": 2.5746569296165522, "learning_rate": 1.8371112401418042e-05, "loss": 0.8364, "step": 1057 }, { "epoch": 0.6265916493929523, "grad_norm": 2.0625606665572653, "learning_rate": 1.8367612004874224e-05, "loss": 0.8906, "step": 1058 }, { "epoch": 0.6271838910275392, "grad_norm": 2.676271947186104, "learning_rate": 1.8364108185506563e-05, "loss": 0.8459, "step": 1059 }, { "epoch": 0.6277761326621262, "grad_norm": 1.5883481724433661, "learning_rate": 1.8360600944748316e-05, "loss": 0.7969, "step": 1060 }, { "epoch": 0.6283683742967131, "grad_norm": 1.695724970760026, "learning_rate": 1.8357090284034145e-05, "loss": 0.8488, "step": 1061 }, { "epoch": 0.6289606159312999, "grad_norm": 5.99104068302577, "learning_rate": 1.8353576204800106e-05, "loss": 0.8584, "step": 1062 }, { "epoch": 0.6295528575658869, "grad_norm": 1.4158669500557108, "learning_rate": 1.8350058708483654e-05, "loss": 0.8109, "step": 1063 }, { "epoch": 0.6301450992004738, "grad_norm": 4.619972925464011, "learning_rate": 1.8346537796523643e-05, "loss": 0.8387, "step": 1064 }, { "epoch": 0.6307373408350607, "grad_norm": 1.840623636179062, "learning_rate": 1.834301347036033e-05, "loss": 0.7773, "step": 1065 }, { "epoch": 0.6313295824696477, "grad_norm": 3.0573832313039073, "learning_rate": 1.833948573143535e-05, "loss": 0.8626, "step": 1066 }, { "epoch": 0.6319218241042345, "grad_norm": 2.518622510039811, "learning_rate": 1.8335954581191758e-05, "loss": 0.8783, "step": 1067 }, { "epoch": 0.6325140657388214, "grad_norm": 9.493928447056057, "learning_rate": 1.8332420021073992e-05, "loss": 0.8544, "step": 1068 }, { "epoch": 0.6331063073734083, "grad_norm": 1.720905288086446, "learning_rate": 1.8328882052527878e-05, "loss": 0.8282, "step": 1069 }, { "epoch": 0.6336985490079953, "grad_norm": 1.8020645303459293, "learning_rate": 1.8325340677000646e-05, "loss": 0.8923, "step": 1070 }, { "epoch": 0.6342907906425822, "grad_norm": 2.8749501095196517, "learning_rate": 1.8321795895940925e-05, "loss": 0.8544, "step": 1071 }, { "epoch": 0.6348830322771691, "grad_norm": 2.008670621988076, "learning_rate": 1.8318247710798728e-05, "loss": 0.8462, "step": 1072 }, { "epoch": 0.635475273911756, "grad_norm": 1.8563596582266815, "learning_rate": 1.8314696123025456e-05, "loss": 0.8324, "step": 1073 }, { "epoch": 0.6360675155463429, "grad_norm": 2.228469119107762, "learning_rate": 1.831114113407391e-05, "loss": 0.8325, "step": 1074 }, { "epoch": 0.6366597571809298, "grad_norm": 2.5028199204197596, "learning_rate": 1.8307582745398282e-05, "loss": 0.7938, "step": 1075 }, { "epoch": 0.6372519988155168, "grad_norm": 2.563042207855269, "learning_rate": 1.8304020958454156e-05, "loss": 0.8539, "step": 1076 }, { "epoch": 0.6378442404501037, "grad_norm": 2.748425207975687, "learning_rate": 1.83004557746985e-05, "loss": 0.9026, "step": 1077 }, { "epoch": 0.6384364820846905, "grad_norm": 2.198078366256531, "learning_rate": 1.8296887195589678e-05, "loss": 0.836, "step": 1078 }, { "epoch": 0.6390287237192774, "grad_norm": 2.978264957639652, "learning_rate": 1.829331522258743e-05, "loss": 0.8374, "step": 1079 }, { "epoch": 0.6396209653538644, "grad_norm": 3.0902393566945694, "learning_rate": 1.8289739857152903e-05, "loss": 0.8522, "step": 1080 }, { "epoch": 0.6402132069884513, "grad_norm": 2.5510293746898793, "learning_rate": 1.828616110074862e-05, "loss": 0.8959, "step": 1081 }, { "epoch": 0.6408054486230382, "grad_norm": 2.4920369060953, "learning_rate": 1.8282578954838493e-05, "loss": 0.8635, "step": 1082 }, { "epoch": 0.6413976902576252, "grad_norm": 3.8584939608718174, "learning_rate": 1.8278993420887822e-05, "loss": 0.897, "step": 1083 }, { "epoch": 0.641989931892212, "grad_norm": 1.6943535494324076, "learning_rate": 1.8275404500363293e-05, "loss": 0.8555, "step": 1084 }, { "epoch": 0.6425821735267989, "grad_norm": 3.625435056497657, "learning_rate": 1.8271812194732972e-05, "loss": 0.8555, "step": 1085 }, { "epoch": 0.6431744151613858, "grad_norm": 9.548040429248383, "learning_rate": 1.8268216505466318e-05, "loss": 0.9243, "step": 1086 }, { "epoch": 0.6437666567959728, "grad_norm": 6.204666207240246, "learning_rate": 1.8264617434034168e-05, "loss": 0.8679, "step": 1087 }, { "epoch": 0.6443588984305597, "grad_norm": 2.3034334204515767, "learning_rate": 1.826101498190875e-05, "loss": 0.8646, "step": 1088 }, { "epoch": 0.6449511400651465, "grad_norm": 4.514126342622965, "learning_rate": 1.825740915056366e-05, "loss": 0.8552, "step": 1089 }, { "epoch": 0.6455433816997335, "grad_norm": 2.4208151622387604, "learning_rate": 1.8253799941473894e-05, "loss": 0.8319, "step": 1090 }, { "epoch": 0.6461356233343204, "grad_norm": 3.3019190869678736, "learning_rate": 1.8250187356115817e-05, "loss": 0.8964, "step": 1091 }, { "epoch": 0.6467278649689073, "grad_norm": 6.537722710331606, "learning_rate": 1.824657139596718e-05, "loss": 0.8184, "step": 1092 }, { "epoch": 0.6473201066034943, "grad_norm": 5.245807404499727, "learning_rate": 1.8242952062507115e-05, "loss": 0.8338, "step": 1093 }, { "epoch": 0.6479123482380811, "grad_norm": 3.8290099537018167, "learning_rate": 1.8239329357216135e-05, "loss": 0.7772, "step": 1094 }, { "epoch": 0.648504589872668, "grad_norm": 4.36239109924843, "learning_rate": 1.8235703281576127e-05, "loss": 0.8034, "step": 1095 }, { "epoch": 0.649096831507255, "grad_norm": 3.266500591644286, "learning_rate": 1.823207383707036e-05, "loss": 0.8272, "step": 1096 }, { "epoch": 0.6496890731418419, "grad_norm": 1.9798959268550758, "learning_rate": 1.822844102518348e-05, "loss": 0.843, "step": 1097 }, { "epoch": 0.6502813147764288, "grad_norm": 2.4272575236741343, "learning_rate": 1.8224804847401518e-05, "loss": 0.8178, "step": 1098 }, { "epoch": 0.6508735564110157, "grad_norm": 2.643309325700988, "learning_rate": 1.822116530521187e-05, "loss": 0.8607, "step": 1099 }, { "epoch": 0.6514657980456026, "grad_norm": 2.7517207819440466, "learning_rate": 1.821752240010331e-05, "loss": 0.9101, "step": 1100 }, { "epoch": 0.6520580396801895, "grad_norm": 2.409184915147776, "learning_rate": 1.8213876133565996e-05, "loss": 0.894, "step": 1101 }, { "epoch": 0.6526502813147764, "grad_norm": 1.8447833000009395, "learning_rate": 1.8210226507091454e-05, "loss": 0.8607, "step": 1102 }, { "epoch": 0.6532425229493634, "grad_norm": 2.9713191095410174, "learning_rate": 1.820657352217259e-05, "loss": 0.834, "step": 1103 }, { "epoch": 0.6538347645839503, "grad_norm": 2.5876456677398956, "learning_rate": 1.8202917180303673e-05, "loss": 0.866, "step": 1104 }, { "epoch": 0.6544270062185371, "grad_norm": 2.3509085863918284, "learning_rate": 1.8199257482980358e-05, "loss": 0.8582, "step": 1105 }, { "epoch": 0.655019247853124, "grad_norm": 1.5013962529594334, "learning_rate": 1.819559443169967e-05, "loss": 0.8376, "step": 1106 }, { "epoch": 0.655611489487711, "grad_norm": 2.604248538807811, "learning_rate": 1.8191928027959996e-05, "loss": 0.8598, "step": 1107 }, { "epoch": 0.6562037311222979, "grad_norm": 1.8138214242680486, "learning_rate": 1.8188258273261104e-05, "loss": 0.8527, "step": 1108 }, { "epoch": 0.6567959727568848, "grad_norm": 2.716863323227044, "learning_rate": 1.818458516910413e-05, "loss": 0.8676, "step": 1109 }, { "epoch": 0.6573882143914718, "grad_norm": 1.9705395914912947, "learning_rate": 1.818090871699158e-05, "loss": 0.8832, "step": 1110 }, { "epoch": 0.6579804560260586, "grad_norm": 4.001918700211936, "learning_rate": 1.817722891842733e-05, "loss": 0.858, "step": 1111 }, { "epoch": 0.6585726976606455, "grad_norm": 1.7504239586578514, "learning_rate": 1.8173545774916628e-05, "loss": 0.8335, "step": 1112 }, { "epoch": 0.6591649392952325, "grad_norm": 2.103180849415543, "learning_rate": 1.816985928796608e-05, "loss": 0.8058, "step": 1113 }, { "epoch": 0.6597571809298194, "grad_norm": 2.042550989077482, "learning_rate": 1.8166169459083673e-05, "loss": 0.8622, "step": 1114 }, { "epoch": 0.6603494225644063, "grad_norm": 2.895471180154479, "learning_rate": 1.8162476289778745e-05, "loss": 0.8349, "step": 1115 }, { "epoch": 0.6609416641989931, "grad_norm": 2.1623953611204163, "learning_rate": 1.8158779781562022e-05, "loss": 0.8047, "step": 1116 }, { "epoch": 0.6615339058335801, "grad_norm": 4.82625679912337, "learning_rate": 1.8155079935945577e-05, "loss": 0.8058, "step": 1117 }, { "epoch": 0.662126147468167, "grad_norm": 2.1209114049054163, "learning_rate": 1.8151376754442856e-05, "loss": 0.8345, "step": 1118 }, { "epoch": 0.6627183891027539, "grad_norm": 3.202247450296245, "learning_rate": 1.8147670238568666e-05, "loss": 0.8264, "step": 1119 }, { "epoch": 0.6633106307373409, "grad_norm": 4.683917620146975, "learning_rate": 1.8143960389839184e-05, "loss": 0.8997, "step": 1120 }, { "epoch": 0.6639028723719278, "grad_norm": 1.5146727248882648, "learning_rate": 1.8140247209771946e-05, "loss": 0.8525, "step": 1121 }, { "epoch": 0.6644951140065146, "grad_norm": 2.4296936822760746, "learning_rate": 1.8136530699885852e-05, "loss": 0.8435, "step": 1122 }, { "epoch": 0.6650873556411016, "grad_norm": 3.833524070202994, "learning_rate": 1.813281086170116e-05, "loss": 0.9018, "step": 1123 }, { "epoch": 0.6656795972756885, "grad_norm": 2.0040513437046648, "learning_rate": 1.8129087696739497e-05, "loss": 0.8821, "step": 1124 }, { "epoch": 0.6662718389102754, "grad_norm": 1.4186690415848557, "learning_rate": 1.8125361206523845e-05, "loss": 0.8447, "step": 1125 }, { "epoch": 0.6668640805448623, "grad_norm": 1.4088568380399054, "learning_rate": 1.8121631392578545e-05, "loss": 0.8548, "step": 1126 }, { "epoch": 0.6674563221794492, "grad_norm": 1.8279521308810265, "learning_rate": 1.811789825642931e-05, "loss": 0.8499, "step": 1127 }, { "epoch": 0.6680485638140361, "grad_norm": 1.7623710462745776, "learning_rate": 1.8114161799603195e-05, "loss": 0.8746, "step": 1128 }, { "epoch": 0.668640805448623, "grad_norm": 1.6841484340107102, "learning_rate": 1.8110422023628623e-05, "loss": 0.8287, "step": 1129 }, { "epoch": 0.66923304708321, "grad_norm": 1.5460477858044241, "learning_rate": 1.810667893003537e-05, "loss": 0.8403, "step": 1130 }, { "epoch": 0.6698252887177969, "grad_norm": 2.904407513681924, "learning_rate": 1.8102932520354572e-05, "loss": 0.8535, "step": 1131 }, { "epoch": 0.6704175303523838, "grad_norm": 2.3502598935192682, "learning_rate": 1.8099182796118727e-05, "loss": 0.9069, "step": 1132 }, { "epoch": 0.6710097719869706, "grad_norm": 2.267045869299457, "learning_rate": 1.8095429758861682e-05, "loss": 0.8095, "step": 1133 }, { "epoch": 0.6716020136215576, "grad_norm": 1.5438780262047374, "learning_rate": 1.8091673410118633e-05, "loss": 0.8264, "step": 1134 }, { "epoch": 0.6721942552561445, "grad_norm": 2.956645508277099, "learning_rate": 1.8087913751426142e-05, "loss": 0.8709, "step": 1135 }, { "epoch": 0.6727864968907314, "grad_norm": 2.2889022888507284, "learning_rate": 1.8084150784322123e-05, "loss": 0.836, "step": 1136 }, { "epoch": 0.6733787385253184, "grad_norm": 1.4850766527643833, "learning_rate": 1.8080384510345838e-05, "loss": 0.8603, "step": 1137 }, { "epoch": 0.6739709801599052, "grad_norm": 2.1026164044539315, "learning_rate": 1.8076614931037908e-05, "loss": 0.8917, "step": 1138 }, { "epoch": 0.6745632217944921, "grad_norm": 1.620826120946746, "learning_rate": 1.80728420479403e-05, "loss": 0.8327, "step": 1139 }, { "epoch": 0.6751554634290791, "grad_norm": 1.5100307488301057, "learning_rate": 1.8069065862596338e-05, "loss": 0.872, "step": 1140 }, { "epoch": 0.675747705063666, "grad_norm": 1.3534401313900613, "learning_rate": 1.8065286376550692e-05, "loss": 0.8426, "step": 1141 }, { "epoch": 0.6763399466982529, "grad_norm": 12.092384661716544, "learning_rate": 1.8061503591349386e-05, "loss": 0.8416, "step": 1142 }, { "epoch": 0.6769321883328397, "grad_norm": 1.208766260196838, "learning_rate": 1.8057717508539786e-05, "loss": 0.8475, "step": 1143 }, { "epoch": 0.6775244299674267, "grad_norm": 1.2451969427779477, "learning_rate": 1.8053928129670624e-05, "loss": 0.8306, "step": 1144 }, { "epoch": 0.6781166716020136, "grad_norm": 1.799182555450414, "learning_rate": 1.805013545629196e-05, "loss": 0.8296, "step": 1145 }, { "epoch": 0.6787089132366005, "grad_norm": 1.3830942844154197, "learning_rate": 1.8046339489955214e-05, "loss": 0.8246, "step": 1146 }, { "epoch": 0.6793011548711875, "grad_norm": 1.6078573859008807, "learning_rate": 1.804254023221315e-05, "loss": 0.8071, "step": 1147 }, { "epoch": 0.6798933965057744, "grad_norm": 1.335961933557725, "learning_rate": 1.8038737684619874e-05, "loss": 0.8593, "step": 1148 }, { "epoch": 0.6804856381403612, "grad_norm": 1.546298180666286, "learning_rate": 1.8034931848730846e-05, "loss": 0.8249, "step": 1149 }, { "epoch": 0.6810778797749482, "grad_norm": 1.635069545524243, "learning_rate": 1.8031122726102868e-05, "loss": 0.877, "step": 1150 }, { "epoch": 0.6816701214095351, "grad_norm": 1.2691447098723208, "learning_rate": 1.802731031829408e-05, "loss": 0.8395, "step": 1151 }, { "epoch": 0.682262363044122, "grad_norm": 1.869539168329112, "learning_rate": 1.8023494626863976e-05, "loss": 0.813, "step": 1152 }, { "epoch": 0.682854604678709, "grad_norm": 4.282750886399974, "learning_rate": 1.8019675653373387e-05, "loss": 0.8337, "step": 1153 }, { "epoch": 0.6834468463132958, "grad_norm": 1.5109022276624124, "learning_rate": 1.8015853399384488e-05, "loss": 0.8654, "step": 1154 }, { "epoch": 0.6840390879478827, "grad_norm": 3.215342397054156, "learning_rate": 1.8012027866460797e-05, "loss": 0.9001, "step": 1155 }, { "epoch": 0.6846313295824696, "grad_norm": 1.311513238293177, "learning_rate": 1.8008199056167167e-05, "loss": 0.8889, "step": 1156 }, { "epoch": 0.6852235712170566, "grad_norm": 1.4742093712676303, "learning_rate": 1.80043669700698e-05, "loss": 0.8175, "step": 1157 }, { "epoch": 0.6858158128516435, "grad_norm": 1.2459495798186484, "learning_rate": 1.8000531609736236e-05, "loss": 0.8096, "step": 1158 }, { "epoch": 0.6864080544862304, "grad_norm": 1.6603825601369882, "learning_rate": 1.799669297673535e-05, "loss": 0.8859, "step": 1159 }, { "epoch": 0.6870002961208173, "grad_norm": 1.8179070682239424, "learning_rate": 1.7992851072637366e-05, "loss": 0.8541, "step": 1160 }, { "epoch": 0.6875925377554042, "grad_norm": 1.2406374896101122, "learning_rate": 1.7989005899013828e-05, "loss": 0.8413, "step": 1161 }, { "epoch": 0.6881847793899911, "grad_norm": 1.5721991668978201, "learning_rate": 1.798515745743764e-05, "loss": 0.8636, "step": 1162 }, { "epoch": 0.688777021024578, "grad_norm": 1.8095072497988238, "learning_rate": 1.798130574948302e-05, "loss": 0.8892, "step": 1163 }, { "epoch": 0.689369262659165, "grad_norm": 1.8048696574204277, "learning_rate": 1.797745077672554e-05, "loss": 0.8207, "step": 1164 }, { "epoch": 0.6899615042937518, "grad_norm": 1.9168478766964387, "learning_rate": 1.7973592540742095e-05, "loss": 0.8593, "step": 1165 }, { "epoch": 0.6905537459283387, "grad_norm": 1.7351792742264414, "learning_rate": 1.7969731043110928e-05, "loss": 0.845, "step": 1166 }, { "epoch": 0.6911459875629257, "grad_norm": 1.9689492610583388, "learning_rate": 1.79658662854116e-05, "loss": 0.8399, "step": 1167 }, { "epoch": 0.6917382291975126, "grad_norm": 2.378890204311825, "learning_rate": 1.7961998269225024e-05, "loss": 0.836, "step": 1168 }, { "epoch": 0.6923304708320995, "grad_norm": 1.7234996114622456, "learning_rate": 1.7958126996133427e-05, "loss": 0.8726, "step": 1169 }, { "epoch": 0.6929227124666865, "grad_norm": 12.35334136288384, "learning_rate": 1.7954252467720386e-05, "loss": 0.8219, "step": 1170 }, { "epoch": 0.6935149541012733, "grad_norm": 1.5256431304922908, "learning_rate": 1.7950374685570794e-05, "loss": 0.8227, "step": 1171 }, { "epoch": 0.6941071957358602, "grad_norm": 2.490658777946983, "learning_rate": 1.7946493651270883e-05, "loss": 0.8785, "step": 1172 }, { "epoch": 0.6946994373704471, "grad_norm": 2.143120458120011, "learning_rate": 1.794260936640822e-05, "loss": 0.7914, "step": 1173 }, { "epoch": 0.6952916790050341, "grad_norm": 1.961814139735007, "learning_rate": 1.7938721832571688e-05, "loss": 0.84, "step": 1174 }, { "epoch": 0.695883920639621, "grad_norm": 1.8808018807507807, "learning_rate": 1.7934831051351513e-05, "loss": 0.8379, "step": 1175 }, { "epoch": 0.6964761622742078, "grad_norm": 3.8564691984179915, "learning_rate": 1.793093702433924e-05, "loss": 0.8931, "step": 1176 }, { "epoch": 0.6970684039087948, "grad_norm": 2.991169714160442, "learning_rate": 1.792703975312774e-05, "loss": 0.8591, "step": 1177 }, { "epoch": 0.6976606455433817, "grad_norm": 2.0564470226192775, "learning_rate": 1.792313923931123e-05, "loss": 0.8484, "step": 1178 }, { "epoch": 0.6982528871779686, "grad_norm": 2.7676305573686486, "learning_rate": 1.791923548448523e-05, "loss": 0.9096, "step": 1179 }, { "epoch": 0.6988451288125556, "grad_norm": 2.313908537479878, "learning_rate": 1.7915328490246594e-05, "loss": 0.8442, "step": 1180 }, { "epoch": 0.6994373704471424, "grad_norm": 2.6234620243010967, "learning_rate": 1.7911418258193503e-05, "loss": 0.8982, "step": 1181 }, { "epoch": 0.7000296120817293, "grad_norm": 1.8042865323547594, "learning_rate": 1.7907504789925473e-05, "loss": 0.8913, "step": 1182 }, { "epoch": 0.7006218537163162, "grad_norm": 2.7891631926045948, "learning_rate": 1.7903588087043314e-05, "loss": 0.833, "step": 1183 }, { "epoch": 0.7012140953509032, "grad_norm": 1.8602348121875936, "learning_rate": 1.789966815114919e-05, "loss": 0.8298, "step": 1184 }, { "epoch": 0.7018063369854901, "grad_norm": 2.8372707284352074, "learning_rate": 1.7895744983846575e-05, "loss": 0.8192, "step": 1185 }, { "epoch": 0.702398578620077, "grad_norm": 1.8134109667963114, "learning_rate": 1.789181858674026e-05, "loss": 0.8336, "step": 1186 }, { "epoch": 0.7029908202546639, "grad_norm": 3.3699318930231392, "learning_rate": 1.7887888961436367e-05, "loss": 0.837, "step": 1187 }, { "epoch": 0.7035830618892508, "grad_norm": 3.5279273185363933, "learning_rate": 1.788395610954233e-05, "loss": 0.8584, "step": 1188 }, { "epoch": 0.7041753035238377, "grad_norm": 10.545341229489324, "learning_rate": 1.7880020032666906e-05, "loss": 0.8366, "step": 1189 }, { "epoch": 0.7047675451584247, "grad_norm": 1.8317791616806671, "learning_rate": 1.7876080732420176e-05, "loss": 0.8156, "step": 1190 }, { "epoch": 0.7053597867930116, "grad_norm": 2.8166796146116635, "learning_rate": 1.7872138210413533e-05, "loss": 0.8591, "step": 1191 }, { "epoch": 0.7059520284275984, "grad_norm": 2.3506890987289837, "learning_rate": 1.7868192468259686e-05, "loss": 0.8696, "step": 1192 }, { "epoch": 0.7065442700621853, "grad_norm": 1.9915077835053718, "learning_rate": 1.7864243507572678e-05, "loss": 0.8078, "step": 1193 }, { "epoch": 0.7071365116967723, "grad_norm": 2.4789943921714728, "learning_rate": 1.7860291329967842e-05, "loss": 0.8549, "step": 1194 }, { "epoch": 0.7077287533313592, "grad_norm": 3.5203512739721114, "learning_rate": 1.7856335937061843e-05, "loss": 0.8259, "step": 1195 }, { "epoch": 0.7083209949659461, "grad_norm": 2.7626514330217207, "learning_rate": 1.7852377330472668e-05, "loss": 0.8024, "step": 1196 }, { "epoch": 0.7089132366005331, "grad_norm": 2.9434175694356286, "learning_rate": 1.7848415511819602e-05, "loss": 0.8561, "step": 1197 }, { "epoch": 0.7095054782351199, "grad_norm": 2.938160794113487, "learning_rate": 1.7844450482723258e-05, "loss": 0.8348, "step": 1198 }, { "epoch": 0.7100977198697068, "grad_norm": 7.531717951600264, "learning_rate": 1.7840482244805546e-05, "loss": 0.8257, "step": 1199 }, { "epoch": 0.7106899615042938, "grad_norm": 2.5108516098902345, "learning_rate": 1.783651079968971e-05, "loss": 0.8359, "step": 1200 }, { "epoch": 0.7112822031388807, "grad_norm": 6.9259734842190515, "learning_rate": 1.7832536149000283e-05, "loss": 0.8493, "step": 1201 }, { "epoch": 0.7118744447734676, "grad_norm": 3.604438919377138, "learning_rate": 1.782855829436313e-05, "loss": 0.9005, "step": 1202 }, { "epoch": 0.7124666864080544, "grad_norm": 6.432790286739704, "learning_rate": 1.782457723740541e-05, "loss": 0.8502, "step": 1203 }, { "epoch": 0.7130589280426414, "grad_norm": 2.309337652505556, "learning_rate": 1.7820592979755605e-05, "loss": 0.8378, "step": 1204 }, { "epoch": 0.7136511696772283, "grad_norm": 4.86010637400942, "learning_rate": 1.78166055230435e-05, "loss": 0.8571, "step": 1205 }, { "epoch": 0.7142434113118152, "grad_norm": 2.4643862263555003, "learning_rate": 1.7812614868900185e-05, "loss": 0.8026, "step": 1206 }, { "epoch": 0.7148356529464022, "grad_norm": 4.488518458955996, "learning_rate": 1.7808621018958063e-05, "loss": 0.8474, "step": 1207 }, { "epoch": 0.7154278945809891, "grad_norm": 2.142014115136399, "learning_rate": 1.7804623974850844e-05, "loss": 0.8327, "step": 1208 }, { "epoch": 0.7160201362155759, "grad_norm": 3.059145127668178, "learning_rate": 1.7800623738213544e-05, "loss": 0.8184, "step": 1209 }, { "epoch": 0.7166123778501629, "grad_norm": 3.1694873538825927, "learning_rate": 1.779662031068249e-05, "loss": 0.8609, "step": 1210 }, { "epoch": 0.7172046194847498, "grad_norm": 1.8086628702754877, "learning_rate": 1.7792613693895298e-05, "loss": 0.8932, "step": 1211 }, { "epoch": 0.7177968611193367, "grad_norm": 3.4357386272756254, "learning_rate": 1.7788603889490907e-05, "loss": 0.825, "step": 1212 }, { "epoch": 0.7183891027539236, "grad_norm": 5.825010332449306, "learning_rate": 1.7784590899109554e-05, "loss": 0.8121, "step": 1213 }, { "epoch": 0.7189813443885105, "grad_norm": 4.868145531572263, "learning_rate": 1.778057472439277e-05, "loss": 0.8699, "step": 1214 }, { "epoch": 0.7195735860230974, "grad_norm": 2.4243315664795104, "learning_rate": 1.7776555366983403e-05, "loss": 0.8094, "step": 1215 }, { "epoch": 0.7201658276576843, "grad_norm": 2.2560480044043554, "learning_rate": 1.7772532828525593e-05, "loss": 0.8866, "step": 1216 }, { "epoch": 0.7207580692922713, "grad_norm": 2.7080185493295432, "learning_rate": 1.7768507110664787e-05, "loss": 0.8779, "step": 1217 }, { "epoch": 0.7213503109268582, "grad_norm": 2.1434635867346246, "learning_rate": 1.7764478215047725e-05, "loss": 0.8646, "step": 1218 }, { "epoch": 0.721942552561445, "grad_norm": 3.651352165498139, "learning_rate": 1.776044614332246e-05, "loss": 0.8161, "step": 1219 }, { "epoch": 0.722534794196032, "grad_norm": 3.1506750177909093, "learning_rate": 1.7756410897138326e-05, "loss": 0.8791, "step": 1220 }, { "epoch": 0.7231270358306189, "grad_norm": 3.4301185321356424, "learning_rate": 1.7752372478145975e-05, "loss": 0.8843, "step": 1221 }, { "epoch": 0.7237192774652058, "grad_norm": 2.985509123858241, "learning_rate": 1.7748330887997344e-05, "loss": 0.8521, "step": 1222 }, { "epoch": 0.7243115190997927, "grad_norm": 2.1395675990186964, "learning_rate": 1.774428612834567e-05, "loss": 0.8504, "step": 1223 }, { "epoch": 0.7249037607343797, "grad_norm": 1.8208520277656095, "learning_rate": 1.7740238200845485e-05, "loss": 0.8359, "step": 1224 }, { "epoch": 0.7254960023689665, "grad_norm": 2.5196284085738583, "learning_rate": 1.773618710715262e-05, "loss": 0.8758, "step": 1225 }, { "epoch": 0.7260882440035534, "grad_norm": 1.9151106256797734, "learning_rate": 1.7732132848924206e-05, "loss": 0.8402, "step": 1226 }, { "epoch": 0.7266804856381404, "grad_norm": 2.8093175656971927, "learning_rate": 1.7728075427818658e-05, "loss": 0.8215, "step": 1227 }, { "epoch": 0.7272727272727273, "grad_norm": 3.506653025025842, "learning_rate": 1.7724014845495684e-05, "loss": 0.8823, "step": 1228 }, { "epoch": 0.7278649689073142, "grad_norm": 3.033233001673288, "learning_rate": 1.77199511036163e-05, "loss": 0.8542, "step": 1229 }, { "epoch": 0.728457210541901, "grad_norm": 1.474917032531982, "learning_rate": 1.77158842038428e-05, "loss": 0.8106, "step": 1230 }, { "epoch": 0.729049452176488, "grad_norm": 5.616912789611092, "learning_rate": 1.7711814147838776e-05, "loss": 0.8468, "step": 1231 }, { "epoch": 0.7296416938110749, "grad_norm": 2.2432187424799217, "learning_rate": 1.7707740937269108e-05, "loss": 0.7949, "step": 1232 }, { "epoch": 0.7302339354456618, "grad_norm": 3.5975642635267424, "learning_rate": 1.770366457379997e-05, "loss": 0.8752, "step": 1233 }, { "epoch": 0.7308261770802488, "grad_norm": 2.0011627880439096, "learning_rate": 1.769958505909882e-05, "loss": 0.8102, "step": 1234 }, { "epoch": 0.7314184187148357, "grad_norm": 2.8280112394549604, "learning_rate": 1.7695502394834416e-05, "loss": 0.8692, "step": 1235 }, { "epoch": 0.7320106603494225, "grad_norm": 5.669137853206307, "learning_rate": 1.7691416582676792e-05, "loss": 0.8747, "step": 1236 }, { "epoch": 0.7326029019840095, "grad_norm": 2.1063941357764935, "learning_rate": 1.768732762429728e-05, "loss": 0.7983, "step": 1237 }, { "epoch": 0.7331951436185964, "grad_norm": 2.4339492898585253, "learning_rate": 1.7683235521368484e-05, "loss": 0.8579, "step": 1238 }, { "epoch": 0.7337873852531833, "grad_norm": 3.432527823235171, "learning_rate": 1.7679140275564315e-05, "loss": 0.814, "step": 1239 }, { "epoch": 0.7343796268877703, "grad_norm": 2.4768439625101286, "learning_rate": 1.7675041888559952e-05, "loss": 0.8373, "step": 1240 }, { "epoch": 0.7349718685223571, "grad_norm": 3.360300015627485, "learning_rate": 1.767094036203187e-05, "loss": 0.8396, "step": 1241 }, { "epoch": 0.735564110156944, "grad_norm": 3.705948689891925, "learning_rate": 1.7666835697657824e-05, "loss": 0.8773, "step": 1242 }, { "epoch": 0.7361563517915309, "grad_norm": 2.29301370449494, "learning_rate": 1.7662727897116843e-05, "loss": 0.8446, "step": 1243 }, { "epoch": 0.7367485934261179, "grad_norm": 5.029628549833236, "learning_rate": 1.7658616962089262e-05, "loss": 0.7834, "step": 1244 }, { "epoch": 0.7373408350607048, "grad_norm": 3.1057311683988056, "learning_rate": 1.765450289425668e-05, "loss": 0.832, "step": 1245 }, { "epoch": 0.7379330766952917, "grad_norm": 3.097561035378791, "learning_rate": 1.765038569530198e-05, "loss": 0.8543, "step": 1246 }, { "epoch": 0.7385253183298786, "grad_norm": 3.034406664943384, "learning_rate": 1.7646265366909324e-05, "loss": 0.8386, "step": 1247 }, { "epoch": 0.7391175599644655, "grad_norm": 2.6367359298667985, "learning_rate": 1.7642141910764164e-05, "loss": 0.8551, "step": 1248 }, { "epoch": 0.7397098015990524, "grad_norm": 2.3375256950022014, "learning_rate": 1.763801532855323e-05, "loss": 0.8636, "step": 1249 }, { "epoch": 0.7403020432336394, "grad_norm": 2.518186118551617, "learning_rate": 1.7633885621964516e-05, "loss": 0.8382, "step": 1250 }, { "epoch": 0.7408942848682263, "grad_norm": 2.8355859671570003, "learning_rate": 1.762975279268731e-05, "loss": 0.8467, "step": 1251 }, { "epoch": 0.7414865265028131, "grad_norm": 3.038664124903949, "learning_rate": 1.7625616842412166e-05, "loss": 0.8035, "step": 1252 }, { "epoch": 0.7420787681374, "grad_norm": 3.4648343011990033, "learning_rate": 1.7621477772830927e-05, "loss": 0.8167, "step": 1253 }, { "epoch": 0.742671009771987, "grad_norm": 2.5985099036690764, "learning_rate": 1.76173355856367e-05, "loss": 0.852, "step": 1254 }, { "epoch": 0.7432632514065739, "grad_norm": 2.043804821692882, "learning_rate": 1.761319028252388e-05, "loss": 0.8029, "step": 1255 }, { "epoch": 0.7438554930411608, "grad_norm": 3.499822034990886, "learning_rate": 1.7609041865188122e-05, "loss": 0.8653, "step": 1256 }, { "epoch": 0.7444477346757477, "grad_norm": 2.351751327143647, "learning_rate": 1.7604890335326362e-05, "loss": 0.8544, "step": 1257 }, { "epoch": 0.7450399763103346, "grad_norm": 1.989760304041313, "learning_rate": 1.7600735694636814e-05, "loss": 0.809, "step": 1258 }, { "epoch": 0.7456322179449215, "grad_norm": 3.2403502877273676, "learning_rate": 1.7596577944818954e-05, "loss": 0.8615, "step": 1259 }, { "epoch": 0.7462244595795084, "grad_norm": 1.5631166920770456, "learning_rate": 1.759241708757354e-05, "loss": 0.8369, "step": 1260 }, { "epoch": 0.7468167012140954, "grad_norm": 5.042322573986611, "learning_rate": 1.7588253124602596e-05, "loss": 0.7944, "step": 1261 }, { "epoch": 0.7474089428486823, "grad_norm": 4.732786628166864, "learning_rate": 1.7584086057609413e-05, "loss": 0.8202, "step": 1262 }, { "epoch": 0.7480011844832691, "grad_norm": 2.275400890981689, "learning_rate": 1.757991588829856e-05, "loss": 0.8174, "step": 1263 }, { "epoch": 0.7485934261178561, "grad_norm": 2.669086920815046, "learning_rate": 1.757574261837587e-05, "loss": 0.867, "step": 1264 }, { "epoch": 0.749185667752443, "grad_norm": 2.6052774063562865, "learning_rate": 1.7571566249548446e-05, "loss": 0.8538, "step": 1265 }, { "epoch": 0.7497779093870299, "grad_norm": 2.405180495723794, "learning_rate": 1.7567386783524655e-05, "loss": 0.8009, "step": 1266 }, { "epoch": 0.7503701510216169, "grad_norm": 3.3676019565827886, "learning_rate": 1.756320422201413e-05, "loss": 0.8042, "step": 1267 }, { "epoch": 0.7509623926562037, "grad_norm": 2.966741525614473, "learning_rate": 1.7559018566727788e-05, "loss": 0.8026, "step": 1268 }, { "epoch": 0.7515546342907906, "grad_norm": 3.2543032515193993, "learning_rate": 1.755482981937778e-05, "loss": 0.8627, "step": 1269 }, { "epoch": 0.7521468759253775, "grad_norm": 2.0946507733084165, "learning_rate": 1.755063798167755e-05, "loss": 0.8687, "step": 1270 }, { "epoch": 0.7527391175599645, "grad_norm": 1.3831223141092206, "learning_rate": 1.754644305534179e-05, "loss": 0.8339, "step": 1271 }, { "epoch": 0.7533313591945514, "grad_norm": 2.0665293112108465, "learning_rate": 1.754224504208647e-05, "loss": 0.7964, "step": 1272 }, { "epoch": 0.7539236008291383, "grad_norm": 13.182638808399124, "learning_rate": 1.7538043943628803e-05, "loss": 0.851, "step": 1273 }, { "epoch": 0.7545158424637252, "grad_norm": 1.7213978689349443, "learning_rate": 1.7533839761687278e-05, "loss": 0.848, "step": 1274 }, { "epoch": 0.7551080840983121, "grad_norm": 4.2008316501367435, "learning_rate": 1.7529632497981644e-05, "loss": 0.7775, "step": 1275 }, { "epoch": 0.755700325732899, "grad_norm": 1.7693985409902604, "learning_rate": 1.7525422154232906e-05, "loss": 0.8109, "step": 1276 }, { "epoch": 0.756292567367486, "grad_norm": 2.2297535857353155, "learning_rate": 1.752120873216333e-05, "loss": 0.9018, "step": 1277 }, { "epoch": 0.7568848090020729, "grad_norm": 2.7453149628984215, "learning_rate": 1.7516992233496443e-05, "loss": 0.8978, "step": 1278 }, { "epoch": 0.7574770506366597, "grad_norm": 1.6904372827667524, "learning_rate": 1.7512772659957037e-05, "loss": 0.875, "step": 1279 }, { "epoch": 0.7580692922712466, "grad_norm": 2.943908693131239, "learning_rate": 1.7508550013271146e-05, "loss": 0.8365, "step": 1280 }, { "epoch": 0.7586615339058336, "grad_norm": 2.5437925239618684, "learning_rate": 1.7504324295166073e-05, "loss": 0.8528, "step": 1281 }, { "epoch": 0.7592537755404205, "grad_norm": 1.6253829546363479, "learning_rate": 1.7500095507370376e-05, "loss": 0.8533, "step": 1282 }, { "epoch": 0.7598460171750074, "grad_norm": 2.534930764133692, "learning_rate": 1.7495863651613866e-05, "loss": 0.8562, "step": 1283 }, { "epoch": 0.7604382588095944, "grad_norm": 1.6060087765012423, "learning_rate": 1.749162872962761e-05, "loss": 0.8215, "step": 1284 }, { "epoch": 0.7610305004441812, "grad_norm": 1.7368003536795598, "learning_rate": 1.7487390743143927e-05, "loss": 0.8376, "step": 1285 }, { "epoch": 0.7616227420787681, "grad_norm": 3.382650140379708, "learning_rate": 1.7483149693896396e-05, "loss": 0.8282, "step": 1286 }, { "epoch": 0.762214983713355, "grad_norm": 3.2394560487893074, "learning_rate": 1.747890558361984e-05, "loss": 0.8334, "step": 1287 }, { "epoch": 0.762807225347942, "grad_norm": 2.1963133524390295, "learning_rate": 1.7474658414050344e-05, "loss": 0.8512, "step": 1288 }, { "epoch": 0.7633994669825289, "grad_norm": 2.7546045395326986, "learning_rate": 1.7470408186925233e-05, "loss": 0.846, "step": 1289 }, { "epoch": 0.7639917086171157, "grad_norm": 2.4453730617259004, "learning_rate": 1.7466154903983092e-05, "loss": 0.8403, "step": 1290 }, { "epoch": 0.7645839502517027, "grad_norm": 3.391610937118782, "learning_rate": 1.7461898566963754e-05, "loss": 0.844, "step": 1291 }, { "epoch": 0.7651761918862896, "grad_norm": 2.9168088797780345, "learning_rate": 1.74576391776083e-05, "loss": 0.8823, "step": 1292 }, { "epoch": 0.7657684335208765, "grad_norm": 2.199990393304276, "learning_rate": 1.745337673765906e-05, "loss": 0.8407, "step": 1293 }, { "epoch": 0.7663606751554635, "grad_norm": 1.9287995776670166, "learning_rate": 1.744911124885961e-05, "loss": 0.8179, "step": 1294 }, { "epoch": 0.7669529167900503, "grad_norm": 2.2102693326836964, "learning_rate": 1.7444842712954778e-05, "loss": 0.7751, "step": 1295 }, { "epoch": 0.7675451584246372, "grad_norm": 2.388498039600017, "learning_rate": 1.7440571131690626e-05, "loss": 0.8686, "step": 1296 }, { "epoch": 0.7681374000592242, "grad_norm": 2.6036812285499322, "learning_rate": 1.7436296506814483e-05, "loss": 0.8603, "step": 1297 }, { "epoch": 0.7687296416938111, "grad_norm": 2.9525736440709425, "learning_rate": 1.7432018840074905e-05, "loss": 0.8569, "step": 1298 }, { "epoch": 0.769321883328398, "grad_norm": 2.301471909050177, "learning_rate": 1.7427738133221694e-05, "loss": 0.8967, "step": 1299 }, { "epoch": 0.769914124962985, "grad_norm": 2.399067499647489, "learning_rate": 1.742345438800591e-05, "loss": 0.886, "step": 1300 }, { "epoch": 0.7705063665975718, "grad_norm": 1.6895013555018883, "learning_rate": 1.7419167606179837e-05, "loss": 0.8087, "step": 1301 }, { "epoch": 0.7710986082321587, "grad_norm": 3.155550562789463, "learning_rate": 1.7414877789497017e-05, "loss": 0.8124, "step": 1302 }, { "epoch": 0.7716908498667456, "grad_norm": 4.009429752899135, "learning_rate": 1.741058493971222e-05, "loss": 0.835, "step": 1303 }, { "epoch": 0.7722830915013326, "grad_norm": 2.7343050132154483, "learning_rate": 1.7406289058581466e-05, "loss": 0.8687, "step": 1304 }, { "epoch": 0.7728753331359195, "grad_norm": 2.511148964673554, "learning_rate": 1.7401990147862008e-05, "loss": 0.8697, "step": 1305 }, { "epoch": 0.7734675747705063, "grad_norm": 1.6005516598974556, "learning_rate": 1.739768820931235e-05, "loss": 0.8558, "step": 1306 }, { "epoch": 0.7740598164050932, "grad_norm": 5.650942014927322, "learning_rate": 1.7393383244692218e-05, "loss": 0.8184, "step": 1307 }, { "epoch": 0.7746520580396802, "grad_norm": 3.5550381737938634, "learning_rate": 1.7389075255762592e-05, "loss": 0.8144, "step": 1308 }, { "epoch": 0.7752442996742671, "grad_norm": 2.764917013421342, "learning_rate": 1.738476424428568e-05, "loss": 0.8719, "step": 1309 }, { "epoch": 0.775836541308854, "grad_norm": 3.587759059350319, "learning_rate": 1.7380450212024924e-05, "loss": 0.861, "step": 1310 }, { "epoch": 0.776428782943441, "grad_norm": 2.253441060117673, "learning_rate": 1.737613316074501e-05, "loss": 0.8321, "step": 1311 }, { "epoch": 0.7770210245780278, "grad_norm": 1.4280085430434677, "learning_rate": 1.737181309221185e-05, "loss": 0.8575, "step": 1312 }, { "epoch": 0.7776132662126147, "grad_norm": 2.035397944647251, "learning_rate": 1.73674900081926e-05, "loss": 0.8998, "step": 1313 }, { "epoch": 0.7782055078472017, "grad_norm": 1.519998519535366, "learning_rate": 1.7363163910455646e-05, "loss": 0.7885, "step": 1314 }, { "epoch": 0.7787977494817886, "grad_norm": 3.650699183495285, "learning_rate": 1.73588348007706e-05, "loss": 0.8609, "step": 1315 }, { "epoch": 0.7793899911163755, "grad_norm": 5.6510846760274775, "learning_rate": 1.735450268090831e-05, "loss": 0.8084, "step": 1316 }, { "epoch": 0.7799822327509623, "grad_norm": 2.09775440341691, "learning_rate": 1.735016755264086e-05, "loss": 0.8738, "step": 1317 }, { "epoch": 0.7805744743855493, "grad_norm": 1.931648307541478, "learning_rate": 1.7345829417741564e-05, "loss": 0.815, "step": 1318 }, { "epoch": 0.7811667160201362, "grad_norm": 3.1720176685267183, "learning_rate": 1.734148827798496e-05, "loss": 0.8635, "step": 1319 }, { "epoch": 0.7817589576547231, "grad_norm": 3.202514276793904, "learning_rate": 1.7337144135146818e-05, "loss": 0.8523, "step": 1320 }, { "epoch": 0.7823511992893101, "grad_norm": 2.3467580376278767, "learning_rate": 1.7332796991004137e-05, "loss": 0.8929, "step": 1321 }, { "epoch": 0.782943440923897, "grad_norm": 1.7193074019861174, "learning_rate": 1.7328446847335142e-05, "loss": 0.8243, "step": 1322 }, { "epoch": 0.7835356825584838, "grad_norm": 1.93789230603662, "learning_rate": 1.7324093705919288e-05, "loss": 0.8041, "step": 1323 }, { "epoch": 0.7841279241930708, "grad_norm": 2.813798090440131, "learning_rate": 1.731973756853726e-05, "loss": 0.8083, "step": 1324 }, { "epoch": 0.7847201658276577, "grad_norm": 5.447428960254784, "learning_rate": 1.7315378436970952e-05, "loss": 0.8272, "step": 1325 }, { "epoch": 0.7853124074622446, "grad_norm": 1.5252289352217188, "learning_rate": 1.73110163130035e-05, "loss": 0.9191, "step": 1326 }, { "epoch": 0.7859046490968316, "grad_norm": 4.098459400505394, "learning_rate": 1.730665119841926e-05, "loss": 0.8249, "step": 1327 }, { "epoch": 0.7864968907314184, "grad_norm": 1.3507732532666377, "learning_rate": 1.7302283095003807e-05, "loss": 0.8448, "step": 1328 }, { "epoch": 0.7870891323660053, "grad_norm": 1.8647038075327456, "learning_rate": 1.729791200454394e-05, "loss": 0.8271, "step": 1329 }, { "epoch": 0.7876813740005922, "grad_norm": 6.007731309930798, "learning_rate": 1.729353792882768e-05, "loss": 0.8679, "step": 1330 }, { "epoch": 0.7882736156351792, "grad_norm": 1.730951437129075, "learning_rate": 1.7289160869644273e-05, "loss": 0.8049, "step": 1331 }, { "epoch": 0.7888658572697661, "grad_norm": 2.5126410391090825, "learning_rate": 1.728478082878418e-05, "loss": 0.854, "step": 1332 }, { "epoch": 0.7894580989043529, "grad_norm": 17.97525643692378, "learning_rate": 1.7280397808039087e-05, "loss": 0.8243, "step": 1333 }, { "epoch": 0.7900503405389399, "grad_norm": 3.5691587966526512, "learning_rate": 1.7276011809201896e-05, "loss": 0.8118, "step": 1334 }, { "epoch": 0.7906425821735268, "grad_norm": 1.8559040171020436, "learning_rate": 1.7271622834066722e-05, "loss": 0.8886, "step": 1335 }, { "epoch": 0.7912348238081137, "grad_norm": 1.6994200843830343, "learning_rate": 1.7267230884428905e-05, "loss": 0.8215, "step": 1336 }, { "epoch": 0.7918270654427007, "grad_norm": 2.6015274500336343, "learning_rate": 1.7262835962085e-05, "loss": 0.7956, "step": 1337 }, { "epoch": 0.7924193070772876, "grad_norm": 1.8824140694819236, "learning_rate": 1.725843806883278e-05, "loss": 0.8642, "step": 1338 }, { "epoch": 0.7930115487118744, "grad_norm": 2.14510961510416, "learning_rate": 1.7254037206471226e-05, "loss": 0.7958, "step": 1339 }, { "epoch": 0.7936037903464613, "grad_norm": 1.978972467508621, "learning_rate": 1.7249633376800542e-05, "loss": 0.7886, "step": 1340 }, { "epoch": 0.7941960319810483, "grad_norm": 1.797453382477477, "learning_rate": 1.724522658162214e-05, "loss": 0.8117, "step": 1341 }, { "epoch": 0.7947882736156352, "grad_norm": 1.9495032921229507, "learning_rate": 1.7240816822738646e-05, "loss": 0.7986, "step": 1342 }, { "epoch": 0.7953805152502221, "grad_norm": 2.420632731466788, "learning_rate": 1.72364041019539e-05, "loss": 0.8428, "step": 1343 }, { "epoch": 0.795972756884809, "grad_norm": 1.430427148223359, "learning_rate": 1.7231988421072957e-05, "loss": 0.8502, "step": 1344 }, { "epoch": 0.7965649985193959, "grad_norm": 2.0260574149321755, "learning_rate": 1.7227569781902073e-05, "loss": 0.819, "step": 1345 }, { "epoch": 0.7971572401539828, "grad_norm": 1.8930044657278047, "learning_rate": 1.722314818624872e-05, "loss": 0.8378, "step": 1346 }, { "epoch": 0.7977494817885697, "grad_norm": 1.368043792921863, "learning_rate": 1.7218723635921587e-05, "loss": 0.8363, "step": 1347 }, { "epoch": 0.7983417234231567, "grad_norm": 1.6583322562133502, "learning_rate": 1.7214296132730555e-05, "loss": 0.8587, "step": 1348 }, { "epoch": 0.7989339650577436, "grad_norm": 2.6952971468399842, "learning_rate": 1.7209865678486727e-05, "loss": 0.8388, "step": 1349 }, { "epoch": 0.7995262066923304, "grad_norm": 1.603441854661535, "learning_rate": 1.7205432275002403e-05, "loss": 0.8303, "step": 1350 }, { "epoch": 0.8001184483269174, "grad_norm": 1.3660414196014246, "learning_rate": 1.7200995924091102e-05, "loss": 0.8137, "step": 1351 }, { "epoch": 0.8007106899615043, "grad_norm": 1.3428573586009116, "learning_rate": 1.719655662756753e-05, "loss": 0.8324, "step": 1352 }, { "epoch": 0.8013029315960912, "grad_norm": 3.4551970616956185, "learning_rate": 1.719211438724762e-05, "loss": 0.8495, "step": 1353 }, { "epoch": 0.8018951732306782, "grad_norm": 3.699225184360079, "learning_rate": 1.7187669204948495e-05, "loss": 0.8617, "step": 1354 }, { "epoch": 0.802487414865265, "grad_norm": 1.6762625181042976, "learning_rate": 1.718322108248848e-05, "loss": 0.8473, "step": 1355 }, { "epoch": 0.8030796564998519, "grad_norm": 1.3717529222149383, "learning_rate": 1.7178770021687113e-05, "loss": 0.834, "step": 1356 }, { "epoch": 0.8036718981344388, "grad_norm": 2.315814133619738, "learning_rate": 1.7174316024365123e-05, "loss": 0.8763, "step": 1357 }, { "epoch": 0.8042641397690258, "grad_norm": 4.886422808842332, "learning_rate": 1.7169859092344448e-05, "loss": 0.8444, "step": 1358 }, { "epoch": 0.8048563814036127, "grad_norm": 1.7525565651645565, "learning_rate": 1.7165399227448222e-05, "loss": 0.8134, "step": 1359 }, { "epoch": 0.8054486230381996, "grad_norm": 2.806549546737031, "learning_rate": 1.7160936431500785e-05, "loss": 0.9172, "step": 1360 }, { "epoch": 0.8060408646727865, "grad_norm": 1.5747965592763717, "learning_rate": 1.7156470706327665e-05, "loss": 0.8433, "step": 1361 }, { "epoch": 0.8066331063073734, "grad_norm": 2.128730321422943, "learning_rate": 1.7152002053755604e-05, "loss": 0.8361, "step": 1362 }, { "epoch": 0.8072253479419603, "grad_norm": 2.327065092345198, "learning_rate": 1.7147530475612524e-05, "loss": 0.8628, "step": 1363 }, { "epoch": 0.8078175895765473, "grad_norm": 2.380546899275701, "learning_rate": 1.714305597372755e-05, "loss": 0.7691, "step": 1364 }, { "epoch": 0.8084098312111342, "grad_norm": 2.5637775941669063, "learning_rate": 1.7138578549931013e-05, "loss": 0.7957, "step": 1365 }, { "epoch": 0.809002072845721, "grad_norm": 2.4241927150678175, "learning_rate": 1.713409820605443e-05, "loss": 0.8272, "step": 1366 }, { "epoch": 0.8095943144803079, "grad_norm": 1.8511672871361389, "learning_rate": 1.7129614943930505e-05, "loss": 0.8062, "step": 1367 }, { "epoch": 0.8101865561148949, "grad_norm": 5.52637450036081, "learning_rate": 1.7125128765393157e-05, "loss": 0.8065, "step": 1368 }, { "epoch": 0.8107787977494818, "grad_norm": 1.5937260830934799, "learning_rate": 1.7120639672277474e-05, "loss": 0.8085, "step": 1369 }, { "epoch": 0.8113710393840687, "grad_norm": 1.4639078667885639, "learning_rate": 1.7116147666419755e-05, "loss": 0.8631, "step": 1370 }, { "epoch": 0.8119632810186556, "grad_norm": 2.4168601194929518, "learning_rate": 1.7111652749657473e-05, "loss": 0.816, "step": 1371 }, { "epoch": 0.8125555226532425, "grad_norm": 7.289050587362146, "learning_rate": 1.7107154923829317e-05, "loss": 0.8525, "step": 1372 }, { "epoch": 0.8131477642878294, "grad_norm": 2.360028584777416, "learning_rate": 1.710265419077514e-05, "loss": 0.8313, "step": 1373 }, { "epoch": 0.8137400059224164, "grad_norm": 2.0945779721069457, "learning_rate": 1.7098150552335997e-05, "loss": 0.8624, "step": 1374 }, { "epoch": 0.8143322475570033, "grad_norm": 3.0412982148200283, "learning_rate": 1.709364401035413e-05, "loss": 0.8574, "step": 1375 }, { "epoch": 0.8149244891915902, "grad_norm": 1.5284655725565341, "learning_rate": 1.708913456667297e-05, "loss": 0.8844, "step": 1376 }, { "epoch": 0.815516730826177, "grad_norm": 3.2308336643923368, "learning_rate": 1.7084622223137128e-05, "loss": 0.8091, "step": 1377 }, { "epoch": 0.816108972460764, "grad_norm": 1.830650660023485, "learning_rate": 1.7080106981592407e-05, "loss": 0.8669, "step": 1378 }, { "epoch": 0.8167012140953509, "grad_norm": 2.814611843926401, "learning_rate": 1.70755888438858e-05, "loss": 0.8626, "step": 1379 }, { "epoch": 0.8172934557299378, "grad_norm": 2.183664883059378, "learning_rate": 1.7071067811865477e-05, "loss": 0.8077, "step": 1380 }, { "epoch": 0.8178856973645248, "grad_norm": 3.4013315528606594, "learning_rate": 1.7066543887380793e-05, "loss": 0.8089, "step": 1381 }, { "epoch": 0.8184779389991116, "grad_norm": 2.299860689598534, "learning_rate": 1.7062017072282285e-05, "loss": 0.8607, "step": 1382 }, { "epoch": 0.8190701806336985, "grad_norm": 2.268163956608553, "learning_rate": 1.7057487368421685e-05, "loss": 0.8744, "step": 1383 }, { "epoch": 0.8196624222682855, "grad_norm": 3.0103061900949166, "learning_rate": 1.705295477765188e-05, "loss": 0.7831, "step": 1384 }, { "epoch": 0.8202546639028724, "grad_norm": 1.9846928716505232, "learning_rate": 1.7048419301826973e-05, "loss": 0.8019, "step": 1385 }, { "epoch": 0.8208469055374593, "grad_norm": 2.1436359754808434, "learning_rate": 1.7043880942802212e-05, "loss": 0.7726, "step": 1386 }, { "epoch": 0.8214391471720462, "grad_norm": 15.99919570552301, "learning_rate": 1.7039339702434057e-05, "loss": 0.8581, "step": 1387 }, { "epoch": 0.8220313888066331, "grad_norm": 1.9788780361165421, "learning_rate": 1.7034795582580118e-05, "loss": 0.8926, "step": 1388 }, { "epoch": 0.82262363044122, "grad_norm": 1.9979870871622687, "learning_rate": 1.70302485850992e-05, "loss": 0.8445, "step": 1389 }, { "epoch": 0.8232158720758069, "grad_norm": 3.3526689648805106, "learning_rate": 1.7025698711851283e-05, "loss": 0.8779, "step": 1390 }, { "epoch": 0.8238081137103939, "grad_norm": 3.2663903535745553, "learning_rate": 1.7021145964697518e-05, "loss": 0.8397, "step": 1391 }, { "epoch": 0.8244003553449808, "grad_norm": 2.4623605879732278, "learning_rate": 1.701659034550023e-05, "loss": 0.8613, "step": 1392 }, { "epoch": 0.8249925969795676, "grad_norm": 4.055785244924466, "learning_rate": 1.7012031856122936e-05, "loss": 0.8329, "step": 1393 }, { "epoch": 0.8255848386141545, "grad_norm": 2.546760379245732, "learning_rate": 1.70074704984303e-05, "loss": 0.8129, "step": 1394 }, { "epoch": 0.8261770802487415, "grad_norm": 2.004660636750862, "learning_rate": 1.7002906274288187e-05, "loss": 0.8649, "step": 1395 }, { "epoch": 0.8267693218833284, "grad_norm": 3.63927143622832, "learning_rate": 1.6998339185563614e-05, "loss": 0.8651, "step": 1396 }, { "epoch": 0.8273615635179153, "grad_norm": 2.6389702902480354, "learning_rate": 1.6993769234124777e-05, "loss": 0.8868, "step": 1397 }, { "epoch": 0.8279538051525023, "grad_norm": 3.2789514048802326, "learning_rate": 1.6989196421841045e-05, "loss": 0.8329, "step": 1398 }, { "epoch": 0.8285460467870891, "grad_norm": 2.9698551323065248, "learning_rate": 1.6984620750582957e-05, "loss": 0.8506, "step": 1399 }, { "epoch": 0.829138288421676, "grad_norm": 4.493208712376449, "learning_rate": 1.6980042222222216e-05, "loss": 0.8512, "step": 1400 }, { "epoch": 0.829730530056263, "grad_norm": 3.2446475490162587, "learning_rate": 1.69754608386317e-05, "loss": 0.8283, "step": 1401 }, { "epoch": 0.8303227716908499, "grad_norm": 3.8139641340470707, "learning_rate": 1.697087660168545e-05, "loss": 0.8369, "step": 1402 }, { "epoch": 0.8309150133254368, "grad_norm": 2.537763008254762, "learning_rate": 1.6966289513258683e-05, "loss": 0.8677, "step": 1403 }, { "epoch": 0.8315072549600236, "grad_norm": 2.97503941123126, "learning_rate": 1.6961699575227767e-05, "loss": 0.8321, "step": 1404 }, { "epoch": 0.8320994965946106, "grad_norm": 5.054631443547838, "learning_rate": 1.6957106789470252e-05, "loss": 0.8819, "step": 1405 }, { "epoch": 0.8326917382291975, "grad_norm": 3.2685313483268086, "learning_rate": 1.695251115786484e-05, "loss": 0.8571, "step": 1406 }, { "epoch": 0.8332839798637844, "grad_norm": 2.6791989072294466, "learning_rate": 1.6947912682291412e-05, "loss": 0.82, "step": 1407 }, { "epoch": 0.8338762214983714, "grad_norm": 6.388461734354429, "learning_rate": 1.694331136463099e-05, "loss": 0.8654, "step": 1408 }, { "epoch": 0.8344684631329582, "grad_norm": 5.378746896229759, "learning_rate": 1.6938707206765784e-05, "loss": 0.8756, "step": 1409 }, { "epoch": 0.8350607047675451, "grad_norm": 2.382113562740471, "learning_rate": 1.6934100210579144e-05, "loss": 0.8521, "step": 1410 }, { "epoch": 0.8356529464021321, "grad_norm": 2.3110802302573936, "learning_rate": 1.69294903779556e-05, "loss": 0.8355, "step": 1411 }, { "epoch": 0.836245188036719, "grad_norm": 2.217801291758983, "learning_rate": 1.6924877710780818e-05, "loss": 0.8798, "step": 1412 }, { "epoch": 0.8368374296713059, "grad_norm": 2.0560538956481036, "learning_rate": 1.6920262210941657e-05, "loss": 0.8128, "step": 1413 }, { "epoch": 0.8374296713058929, "grad_norm": 6.864186095594025, "learning_rate": 1.69156438803261e-05, "loss": 0.8533, "step": 1414 }, { "epoch": 0.8380219129404797, "grad_norm": 2.136582758478113, "learning_rate": 1.6911022720823315e-05, "loss": 0.8236, "step": 1415 }, { "epoch": 0.8386141545750666, "grad_norm": 2.499403080840201, "learning_rate": 1.690639873432361e-05, "loss": 0.8663, "step": 1416 }, { "epoch": 0.8392063962096535, "grad_norm": 6.035882530883013, "learning_rate": 1.6901771922718453e-05, "loss": 0.8783, "step": 1417 }, { "epoch": 0.8397986378442405, "grad_norm": 4.243343370936844, "learning_rate": 1.6897142287900477e-05, "loss": 0.8794, "step": 1418 }, { "epoch": 0.8403908794788274, "grad_norm": 6.601930358453118, "learning_rate": 1.6892509831763467e-05, "loss": 0.8406, "step": 1419 }, { "epoch": 0.8409831211134142, "grad_norm": 2.546863435034202, "learning_rate": 1.6887874556202342e-05, "loss": 0.8262, "step": 1420 }, { "epoch": 0.8415753627480012, "grad_norm": 1.626447021048131, "learning_rate": 1.6883236463113207e-05, "loss": 0.8047, "step": 1421 }, { "epoch": 0.8421676043825881, "grad_norm": 16.71050322079958, "learning_rate": 1.687859555439329e-05, "loss": 0.8308, "step": 1422 }, { "epoch": 0.842759846017175, "grad_norm": 2.0493773260165353, "learning_rate": 1.6873951831940993e-05, "loss": 0.8338, "step": 1423 }, { "epoch": 0.843352087651762, "grad_norm": 2.4162309237205988, "learning_rate": 1.686930529765585e-05, "loss": 0.8567, "step": 1424 }, { "epoch": 0.8439443292863489, "grad_norm": 2.4878273609774753, "learning_rate": 1.6864655953438563e-05, "loss": 0.8029, "step": 1425 }, { "epoch": 0.8445365709209357, "grad_norm": 3.2572445643608003, "learning_rate": 1.6860003801190975e-05, "loss": 0.8457, "step": 1426 }, { "epoch": 0.8451288125555226, "grad_norm": 5.916107483028511, "learning_rate": 1.6855348842816074e-05, "loss": 0.8797, "step": 1427 }, { "epoch": 0.8457210541901096, "grad_norm": 2.2527335241769166, "learning_rate": 1.6850691080218e-05, "loss": 0.8791, "step": 1428 }, { "epoch": 0.8463132958246965, "grad_norm": 5.083451401380974, "learning_rate": 1.6846030515302044e-05, "loss": 0.8085, "step": 1429 }, { "epoch": 0.8469055374592834, "grad_norm": 3.1992848879644495, "learning_rate": 1.6841367149974638e-05, "loss": 0.8713, "step": 1430 }, { "epoch": 0.8474977790938703, "grad_norm": 2.1404065000354775, "learning_rate": 1.6836700986143354e-05, "loss": 0.8767, "step": 1431 }, { "epoch": 0.8480900207284572, "grad_norm": 3.2502047486489327, "learning_rate": 1.683203202571692e-05, "loss": 0.825, "step": 1432 }, { "epoch": 0.8486822623630441, "grad_norm": 4.062783383833649, "learning_rate": 1.682736027060521e-05, "loss": 0.7975, "step": 1433 }, { "epoch": 0.849274503997631, "grad_norm": 3.1346761905120077, "learning_rate": 1.6822685722719224e-05, "loss": 0.8598, "step": 1434 }, { "epoch": 0.849866745632218, "grad_norm": 4.335065021022113, "learning_rate": 1.681800838397112e-05, "loss": 0.7865, "step": 1435 }, { "epoch": 0.8504589872668049, "grad_norm": 2.017703357099882, "learning_rate": 1.681332825627419e-05, "loss": 0.8488, "step": 1436 }, { "epoch": 0.8510512289013917, "grad_norm": 4.753567787732221, "learning_rate": 1.680864534154287e-05, "loss": 0.8702, "step": 1437 }, { "epoch": 0.8516434705359787, "grad_norm": 2.438375125937311, "learning_rate": 1.680395964169274e-05, "loss": 0.8571, "step": 1438 }, { "epoch": 0.8522357121705656, "grad_norm": 2.9251937289705916, "learning_rate": 1.6799271158640517e-05, "loss": 0.822, "step": 1439 }, { "epoch": 0.8528279538051525, "grad_norm": 2.2312157866256412, "learning_rate": 1.6794579894304043e-05, "loss": 0.8243, "step": 1440 }, { "epoch": 0.8534201954397395, "grad_norm": 7.074372608081394, "learning_rate": 1.678988585060231e-05, "loss": 0.8624, "step": 1441 }, { "epoch": 0.8540124370743263, "grad_norm": 2.7556705357562357, "learning_rate": 1.6785189029455455e-05, "loss": 0.8614, "step": 1442 }, { "epoch": 0.8546046787089132, "grad_norm": 1.9746989976797515, "learning_rate": 1.6780489432784738e-05, "loss": 0.8229, "step": 1443 }, { "epoch": 0.8551969203435001, "grad_norm": 1.9413368148573928, "learning_rate": 1.6775787062512557e-05, "loss": 0.8694, "step": 1444 }, { "epoch": 0.8557891619780871, "grad_norm": 2.2060934638723206, "learning_rate": 1.6771081920562445e-05, "loss": 0.8598, "step": 1445 }, { "epoch": 0.856381403612674, "grad_norm": 3.5024309364669444, "learning_rate": 1.676637400885907e-05, "loss": 0.8485, "step": 1446 }, { "epoch": 0.8569736452472608, "grad_norm": 4.018519154408224, "learning_rate": 1.676166332932824e-05, "loss": 0.8383, "step": 1447 }, { "epoch": 0.8575658868818478, "grad_norm": 1.7615281242175937, "learning_rate": 1.6756949883896874e-05, "loss": 0.8035, "step": 1448 }, { "epoch": 0.8581581285164347, "grad_norm": 2.1966167616847403, "learning_rate": 1.675223367449305e-05, "loss": 0.831, "step": 1449 }, { "epoch": 0.8587503701510216, "grad_norm": 2.6550583205151685, "learning_rate": 1.6747514703045952e-05, "loss": 0.8216, "step": 1450 }, { "epoch": 0.8593426117856086, "grad_norm": 2.478511653719764, "learning_rate": 1.6742792971485912e-05, "loss": 0.8828, "step": 1451 }, { "epoch": 0.8599348534201955, "grad_norm": 5.962171814245029, "learning_rate": 1.673806848174438e-05, "loss": 0.8454, "step": 1452 }, { "epoch": 0.8605270950547823, "grad_norm": 2.083540364486756, "learning_rate": 1.6733341235753938e-05, "loss": 0.8842, "step": 1453 }, { "epoch": 0.8611193366893692, "grad_norm": 2.063306619099545, "learning_rate": 1.67286112354483e-05, "loss": 0.8155, "step": 1454 }, { "epoch": 0.8617115783239562, "grad_norm": 3.43994740810729, "learning_rate": 1.6723878482762296e-05, "loss": 0.8795, "step": 1455 }, { "epoch": 0.8623038199585431, "grad_norm": 3.1123310742355312, "learning_rate": 1.671914297963189e-05, "loss": 0.8512, "step": 1456 }, { "epoch": 0.86289606159313, "grad_norm": 3.1237187679340455, "learning_rate": 1.671440472799417e-05, "loss": 0.8313, "step": 1457 }, { "epoch": 0.8634883032277169, "grad_norm": 2.450839707155073, "learning_rate": 1.670966372978735e-05, "loss": 0.8421, "step": 1458 }, { "epoch": 0.8640805448623038, "grad_norm": 10.847110912043622, "learning_rate": 1.6704919986950757e-05, "loss": 0.9009, "step": 1459 }, { "epoch": 0.8646727864968907, "grad_norm": 3.949302192305963, "learning_rate": 1.670017350142486e-05, "loss": 0.8754, "step": 1460 }, { "epoch": 0.8652650281314777, "grad_norm": 1.8975667299053336, "learning_rate": 1.6695424275151228e-05, "loss": 0.819, "step": 1461 }, { "epoch": 0.8658572697660646, "grad_norm": 3.3380137589905337, "learning_rate": 1.669067231007256e-05, "loss": 0.85, "step": 1462 }, { "epoch": 0.8664495114006515, "grad_norm": 2.2925444313034133, "learning_rate": 1.668591760813269e-05, "loss": 0.8283, "step": 1463 }, { "epoch": 0.8670417530352383, "grad_norm": 1.7424162303718678, "learning_rate": 1.668116017127655e-05, "loss": 0.8352, "step": 1464 }, { "epoch": 0.8676339946698253, "grad_norm": 2.3289766280717425, "learning_rate": 1.66764000014502e-05, "loss": 0.8662, "step": 1465 }, { "epoch": 0.8682262363044122, "grad_norm": 2.501537435831264, "learning_rate": 1.667163710060082e-05, "loss": 0.8126, "step": 1466 }, { "epoch": 0.8688184779389991, "grad_norm": 2.9512357732950405, "learning_rate": 1.6666871470676692e-05, "loss": 0.8318, "step": 1467 }, { "epoch": 0.8694107195735861, "grad_norm": 1.7783090866183635, "learning_rate": 1.6662103113627246e-05, "loss": 0.8499, "step": 1468 }, { "epoch": 0.8700029612081729, "grad_norm": 2.5732417344764875, "learning_rate": 1.6657332031402992e-05, "loss": 0.8511, "step": 1469 }, { "epoch": 0.8705952028427598, "grad_norm": 2.270239747001915, "learning_rate": 1.6652558225955582e-05, "loss": 0.8056, "step": 1470 }, { "epoch": 0.8711874444773468, "grad_norm": 2.0703557879671104, "learning_rate": 1.6647781699237765e-05, "loss": 0.8341, "step": 1471 }, { "epoch": 0.8717796861119337, "grad_norm": 3.563658846641916, "learning_rate": 1.6643002453203405e-05, "loss": 0.8619, "step": 1472 }, { "epoch": 0.8723719277465206, "grad_norm": 2.8049555007770453, "learning_rate": 1.6638220489807495e-05, "loss": 0.8093, "step": 1473 }, { "epoch": 0.8729641693811075, "grad_norm": 5.679515019474668, "learning_rate": 1.6633435811006117e-05, "loss": 0.8665, "step": 1474 }, { "epoch": 0.8735564110156944, "grad_norm": 5.038477719863848, "learning_rate": 1.6628648418756474e-05, "loss": 0.8382, "step": 1475 }, { "epoch": 0.8741486526502813, "grad_norm": 8.615254589030032, "learning_rate": 1.662385831501688e-05, "loss": 0.7959, "step": 1476 }, { "epoch": 0.8747408942848682, "grad_norm": 8.49226125726425, "learning_rate": 1.6619065501746762e-05, "loss": 0.8569, "step": 1477 }, { "epoch": 0.8753331359194552, "grad_norm": 2.216292664316629, "learning_rate": 1.661426998090664e-05, "loss": 0.8056, "step": 1478 }, { "epoch": 0.8759253775540421, "grad_norm": 3.1706560681895177, "learning_rate": 1.6609471754458163e-05, "loss": 0.8421, "step": 1479 }, { "epoch": 0.8765176191886289, "grad_norm": 2.596125359352298, "learning_rate": 1.6604670824364067e-05, "loss": 0.8084, "step": 1480 }, { "epoch": 0.8771098608232158, "grad_norm": 3.599816985420332, "learning_rate": 1.6599867192588207e-05, "loss": 0.8211, "step": 1481 }, { "epoch": 0.8777021024578028, "grad_norm": 5.1170671887832, "learning_rate": 1.6595060861095534e-05, "loss": 0.8538, "step": 1482 }, { "epoch": 0.8782943440923897, "grad_norm": 3.911881349947541, "learning_rate": 1.6590251831852113e-05, "loss": 0.8509, "step": 1483 }, { "epoch": 0.8788865857269766, "grad_norm": 3.6771517985702125, "learning_rate": 1.6585440106825107e-05, "loss": 0.7833, "step": 1484 }, { "epoch": 0.8794788273615635, "grad_norm": 2.5698713996771105, "learning_rate": 1.6580625687982776e-05, "loss": 0.814, "step": 1485 }, { "epoch": 0.8800710689961504, "grad_norm": 10.101577187451042, "learning_rate": 1.6575808577294492e-05, "loss": 0.819, "step": 1486 }, { "epoch": 0.8806633106307373, "grad_norm": 2.2073990472582476, "learning_rate": 1.657098877673073e-05, "loss": 0.885, "step": 1487 }, { "epoch": 0.8812555522653243, "grad_norm": 4.739398445699331, "learning_rate": 1.6566166288263046e-05, "loss": 0.8629, "step": 1488 }, { "epoch": 0.8818477938999112, "grad_norm": 3.1974524924806107, "learning_rate": 1.656134111386412e-05, "loss": 0.8511, "step": 1489 }, { "epoch": 0.8824400355344981, "grad_norm": 2.5663036406803608, "learning_rate": 1.6556513255507714e-05, "loss": 0.816, "step": 1490 }, { "epoch": 0.883032277169085, "grad_norm": 1.9680923900191523, "learning_rate": 1.65516827151687e-05, "loss": 0.8843, "step": 1491 }, { "epoch": 0.8836245188036719, "grad_norm": 2.414157535179823, "learning_rate": 1.6546849494823037e-05, "loss": 0.8753, "step": 1492 }, { "epoch": 0.8842167604382588, "grad_norm": 2.4575703751091065, "learning_rate": 1.654201359644778e-05, "loss": 0.8654, "step": 1493 }, { "epoch": 0.8848090020728457, "grad_norm": 2.992592460545843, "learning_rate": 1.653717502202109e-05, "loss": 0.8567, "step": 1494 }, { "epoch": 0.8854012437074327, "grad_norm": 2.2703367582766525, "learning_rate": 1.653233377352221e-05, "loss": 0.8335, "step": 1495 }, { "epoch": 0.8859934853420195, "grad_norm": 20.27278103196212, "learning_rate": 1.652748985293149e-05, "loss": 0.883, "step": 1496 }, { "epoch": 0.8865857269766064, "grad_norm": 1.8516469130504767, "learning_rate": 1.652264326223036e-05, "loss": 0.7949, "step": 1497 }, { "epoch": 0.8871779686111934, "grad_norm": 4.524574044226397, "learning_rate": 1.6517794003401345e-05, "loss": 0.8369, "step": 1498 }, { "epoch": 0.8877702102457803, "grad_norm": 2.3066187010469865, "learning_rate": 1.6512942078428072e-05, "loss": 0.8298, "step": 1499 }, { "epoch": 0.8883624518803672, "grad_norm": 3.9255893035367437, "learning_rate": 1.650808748929525e-05, "loss": 0.8436, "step": 1500 }, { "epoch": 0.8889546935149542, "grad_norm": 2.475550455756812, "learning_rate": 1.6503230237988676e-05, "loss": 0.7807, "step": 1501 }, { "epoch": 0.889546935149541, "grad_norm": 1.9064164754303783, "learning_rate": 1.6498370326495242e-05, "loss": 0.8097, "step": 1502 }, { "epoch": 0.8901391767841279, "grad_norm": 2.9818131068437967, "learning_rate": 1.649350775680292e-05, "loss": 0.8664, "step": 1503 }, { "epoch": 0.8907314184187148, "grad_norm": 1.928535242218003, "learning_rate": 1.648864253090078e-05, "loss": 0.8444, "step": 1504 }, { "epoch": 0.8913236600533018, "grad_norm": 2.4186278041533753, "learning_rate": 1.6483774650778973e-05, "loss": 0.8225, "step": 1505 }, { "epoch": 0.8919159016878887, "grad_norm": 6.329817463922851, "learning_rate": 1.6478904118428735e-05, "loss": 0.8687, "step": 1506 }, { "epoch": 0.8925081433224755, "grad_norm": 2.406656080352376, "learning_rate": 1.647403093584238e-05, "loss": 0.8689, "step": 1507 }, { "epoch": 0.8931003849570625, "grad_norm": 1.698392307205128, "learning_rate": 1.6469155105013324e-05, "loss": 0.8136, "step": 1508 }, { "epoch": 0.8936926265916494, "grad_norm": 1.7222065175556518, "learning_rate": 1.646427662793605e-05, "loss": 0.8621, "step": 1509 }, { "epoch": 0.8942848682262363, "grad_norm": 1.835429252270241, "learning_rate": 1.6459395506606133e-05, "loss": 0.8463, "step": 1510 }, { "epoch": 0.8948771098608232, "grad_norm": 2.02354089579679, "learning_rate": 1.6454511743020222e-05, "loss": 0.822, "step": 1511 }, { "epoch": 0.8954693514954102, "grad_norm": 2.178325990673068, "learning_rate": 1.6449625339176056e-05, "loss": 0.9125, "step": 1512 }, { "epoch": 0.896061593129997, "grad_norm": 1.6566812304799738, "learning_rate": 1.6444736297072446e-05, "loss": 0.8098, "step": 1513 }, { "epoch": 0.8966538347645839, "grad_norm": 2.7595341729881344, "learning_rate": 1.6439844618709285e-05, "loss": 0.8959, "step": 1514 }, { "epoch": 0.8972460763991709, "grad_norm": 1.992051362483181, "learning_rate": 1.6434950306087544e-05, "loss": 0.8598, "step": 1515 }, { "epoch": 0.8978383180337578, "grad_norm": 1.4714721824680785, "learning_rate": 1.6430053361209274e-05, "loss": 0.8236, "step": 1516 }, { "epoch": 0.8984305596683447, "grad_norm": 1.6591161099805962, "learning_rate": 1.6425153786077598e-05, "loss": 0.8749, "step": 1517 }, { "epoch": 0.8990228013029316, "grad_norm": 1.2831071701329722, "learning_rate": 1.642025158269672e-05, "loss": 0.8128, "step": 1518 }, { "epoch": 0.8996150429375185, "grad_norm": 1.4872368851201596, "learning_rate": 1.641534675307192e-05, "loss": 0.7701, "step": 1519 }, { "epoch": 0.9002072845721054, "grad_norm": 1.756308879818358, "learning_rate": 1.641043929920954e-05, "loss": 0.8523, "step": 1520 }, { "epoch": 0.9007995262066923, "grad_norm": 2.161708847108366, "learning_rate": 1.6405529223117013e-05, "loss": 0.8145, "step": 1521 }, { "epoch": 0.9013917678412793, "grad_norm": 1.585827703784451, "learning_rate": 1.6400616526802835e-05, "loss": 0.8352, "step": 1522 }, { "epoch": 0.9019840094758661, "grad_norm": 1.823999072218306, "learning_rate": 1.6395701212276573e-05, "loss": 0.8103, "step": 1523 }, { "epoch": 0.902576251110453, "grad_norm": 1.7892988299933283, "learning_rate": 1.6390783281548865e-05, "loss": 0.8468, "step": 1524 }, { "epoch": 0.90316849274504, "grad_norm": 1.828224641878733, "learning_rate": 1.638586273663143e-05, "loss": 0.8189, "step": 1525 }, { "epoch": 0.9037607343796269, "grad_norm": 1.5516682705105311, "learning_rate": 1.6380939579537033e-05, "loss": 0.8347, "step": 1526 }, { "epoch": 0.9043529760142138, "grad_norm": 3.834168570081106, "learning_rate": 1.6376013812279534e-05, "loss": 0.8573, "step": 1527 }, { "epoch": 0.9049452176488008, "grad_norm": 2.1526452193356547, "learning_rate": 1.6371085436873847e-05, "loss": 0.8469, "step": 1528 }, { "epoch": 0.9055374592833876, "grad_norm": 1.525680512787262, "learning_rate": 1.636615445533595e-05, "loss": 0.8192, "step": 1529 }, { "epoch": 0.9061297009179745, "grad_norm": 2.288883878412789, "learning_rate": 1.6361220869682896e-05, "loss": 0.8153, "step": 1530 }, { "epoch": 0.9067219425525614, "grad_norm": 4.621322518818992, "learning_rate": 1.63562846819328e-05, "loss": 0.8144, "step": 1531 }, { "epoch": 0.9073141841871484, "grad_norm": 1.6212241602499209, "learning_rate": 1.635134589410483e-05, "loss": 0.8537, "step": 1532 }, { "epoch": 0.9079064258217353, "grad_norm": 1.750883625265076, "learning_rate": 1.6346404508219244e-05, "loss": 0.8252, "step": 1533 }, { "epoch": 0.9084986674563221, "grad_norm": 1.8359912026959841, "learning_rate": 1.6341460526297335e-05, "loss": 0.8425, "step": 1534 }, { "epoch": 0.9090909090909091, "grad_norm": 1.4533966344737113, "learning_rate": 1.6336513950361474e-05, "loss": 0.8562, "step": 1535 }, { "epoch": 0.909683150725496, "grad_norm": 2.2110168582468552, "learning_rate": 1.6331564782435087e-05, "loss": 0.7873, "step": 1536 }, { "epoch": 0.9102753923600829, "grad_norm": 2.4584163208420913, "learning_rate": 1.6326613024542667e-05, "loss": 0.8297, "step": 1537 }, { "epoch": 0.9108676339946699, "grad_norm": 2.304101358316969, "learning_rate": 1.6321658678709752e-05, "loss": 0.8304, "step": 1538 }, { "epoch": 0.9114598756292568, "grad_norm": 1.5235885803236862, "learning_rate": 1.6316701746962956e-05, "loss": 0.8285, "step": 1539 }, { "epoch": 0.9120521172638436, "grad_norm": 2.0501459681351517, "learning_rate": 1.6311742231329936e-05, "loss": 0.8338, "step": 1540 }, { "epoch": 0.9126443588984305, "grad_norm": 1.7456834254936182, "learning_rate": 1.630678013383942e-05, "loss": 0.8232, "step": 1541 }, { "epoch": 0.9132366005330175, "grad_norm": 2.017939224936834, "learning_rate": 1.6301815456521185e-05, "loss": 0.8093, "step": 1542 }, { "epoch": 0.9138288421676044, "grad_norm": 1.7066296321756638, "learning_rate": 1.629684820140606e-05, "loss": 0.8545, "step": 1543 }, { "epoch": 0.9144210838021913, "grad_norm": 5.4294245310007, "learning_rate": 1.6291878370525925e-05, "loss": 0.828, "step": 1544 }, { "epoch": 0.9150133254367782, "grad_norm": 2.6780329423128446, "learning_rate": 1.6286905965913732e-05, "loss": 0.8682, "step": 1545 }, { "epoch": 0.9156055670713651, "grad_norm": 2.5467678711002364, "learning_rate": 1.6281930989603466e-05, "loss": 0.7832, "step": 1546 }, { "epoch": 0.916197808705952, "grad_norm": 2.230883357213969, "learning_rate": 1.627695344363018e-05, "loss": 0.8445, "step": 1547 }, { "epoch": 0.916790050340539, "grad_norm": 4.2585630086605315, "learning_rate": 1.627197333002996e-05, "loss": 0.7631, "step": 1548 }, { "epoch": 0.9173822919751259, "grad_norm": 2.018426386994712, "learning_rate": 1.6266990650839965e-05, "loss": 0.8065, "step": 1549 }, { "epoch": 0.9179745336097128, "grad_norm": 5.5918944687522405, "learning_rate": 1.6262005408098378e-05, "loss": 0.8405, "step": 1550 }, { "epoch": 0.9185667752442996, "grad_norm": 3.0051047010536234, "learning_rate": 1.6257017603844452e-05, "loss": 0.8329, "step": 1551 }, { "epoch": 0.9191590168788866, "grad_norm": 1.6536747992743932, "learning_rate": 1.6252027240118472e-05, "loss": 0.8, "step": 1552 }, { "epoch": 0.9197512585134735, "grad_norm": 2.3069408419926978, "learning_rate": 1.6247034318961788e-05, "loss": 0.7958, "step": 1553 }, { "epoch": 0.9203435001480604, "grad_norm": 2.5997053054107244, "learning_rate": 1.624203884241678e-05, "loss": 0.8594, "step": 1554 }, { "epoch": 0.9209357417826474, "grad_norm": 2.63780427340796, "learning_rate": 1.6237040812526875e-05, "loss": 0.84, "step": 1555 }, { "epoch": 0.9215279834172342, "grad_norm": 2.4212885909450206, "learning_rate": 1.6232040231336556e-05, "loss": 0.8491, "step": 1556 }, { "epoch": 0.9221202250518211, "grad_norm": 2.2044096639579815, "learning_rate": 1.6227037100891335e-05, "loss": 0.8598, "step": 1557 }, { "epoch": 0.922712466686408, "grad_norm": 1.6328122663952063, "learning_rate": 1.6222031423237776e-05, "loss": 0.8353, "step": 1558 }, { "epoch": 0.923304708320995, "grad_norm": 2.3168890361561263, "learning_rate": 1.6217023200423483e-05, "loss": 0.8548, "step": 1559 }, { "epoch": 0.9238969499555819, "grad_norm": 1.7424095619475186, "learning_rate": 1.6212012434497103e-05, "loss": 0.7988, "step": 1560 }, { "epoch": 0.9244891915901687, "grad_norm": 1.6126116008758955, "learning_rate": 1.6206999127508318e-05, "loss": 0.85, "step": 1561 }, { "epoch": 0.9250814332247557, "grad_norm": 1.4774563212178027, "learning_rate": 1.620198328150785e-05, "loss": 0.7911, "step": 1562 }, { "epoch": 0.9256736748593426, "grad_norm": 1.3520460937622927, "learning_rate": 1.6196964898547474e-05, "loss": 0.8204, "step": 1563 }, { "epoch": 0.9262659164939295, "grad_norm": 1.5690304650328533, "learning_rate": 1.6191943980679975e-05, "loss": 0.7729, "step": 1564 }, { "epoch": 0.9268581581285165, "grad_norm": 1.7240591652512482, "learning_rate": 1.61869205299592e-05, "loss": 0.8504, "step": 1565 }, { "epoch": 0.9274503997631034, "grad_norm": 1.629961765323752, "learning_rate": 1.6181894548440022e-05, "loss": 0.8158, "step": 1566 }, { "epoch": 0.9280426413976902, "grad_norm": 1.664765209050695, "learning_rate": 1.6176866038178348e-05, "loss": 0.8291, "step": 1567 }, { "epoch": 0.9286348830322771, "grad_norm": 1.3306049204265953, "learning_rate": 1.617183500123112e-05, "loss": 0.7634, "step": 1568 }, { "epoch": 0.9292271246668641, "grad_norm": 1.7500132992266746, "learning_rate": 1.6166801439656322e-05, "loss": 0.8743, "step": 1569 }, { "epoch": 0.929819366301451, "grad_norm": 1.5049905349441384, "learning_rate": 1.6161765355512958e-05, "loss": 0.8252, "step": 1570 }, { "epoch": 0.9304116079360379, "grad_norm": 1.4564671298739658, "learning_rate": 1.615672675086107e-05, "loss": 0.8371, "step": 1571 }, { "epoch": 0.9310038495706248, "grad_norm": 1.3839434144832101, "learning_rate": 1.615168562776173e-05, "loss": 0.8293, "step": 1572 }, { "epoch": 0.9315960912052117, "grad_norm": 1.4540537733669165, "learning_rate": 1.6146641988277044e-05, "loss": 0.8373, "step": 1573 }, { "epoch": 0.9321883328397986, "grad_norm": 5.541810457275405, "learning_rate": 1.6141595834470142e-05, "loss": 0.8454, "step": 1574 }, { "epoch": 0.9327805744743856, "grad_norm": 1.5031210400984882, "learning_rate": 1.6136547168405185e-05, "loss": 0.8377, "step": 1575 }, { "epoch": 0.9333728161089725, "grad_norm": 2.397459980734325, "learning_rate": 1.6131495992147363e-05, "loss": 0.8208, "step": 1576 }, { "epoch": 0.9339650577435594, "grad_norm": 4.008252947997453, "learning_rate": 1.6126442307762886e-05, "loss": 0.8473, "step": 1577 }, { "epoch": 0.9345572993781462, "grad_norm": 2.148720748325416, "learning_rate": 1.6121386117319e-05, "loss": 0.8426, "step": 1578 }, { "epoch": 0.9351495410127332, "grad_norm": 4.17679478258533, "learning_rate": 1.611632742288397e-05, "loss": 0.7861, "step": 1579 }, { "epoch": 0.9357417826473201, "grad_norm": 1.458220294326226, "learning_rate": 1.6111266226527086e-05, "loss": 0.797, "step": 1580 }, { "epoch": 0.936334024281907, "grad_norm": 2.424871404734393, "learning_rate": 1.6106202530318662e-05, "loss": 0.7973, "step": 1581 }, { "epoch": 0.936926265916494, "grad_norm": 1.2771446682809802, "learning_rate": 1.6101136336330037e-05, "loss": 0.8365, "step": 1582 }, { "epoch": 0.9375185075510808, "grad_norm": 1.1198521871087879, "learning_rate": 1.6096067646633568e-05, "loss": 0.8309, "step": 1583 }, { "epoch": 0.9381107491856677, "grad_norm": 1.4872636584914338, "learning_rate": 1.609099646330263e-05, "loss": 0.8144, "step": 1584 }, { "epoch": 0.9387029908202547, "grad_norm": 1.2530678270199203, "learning_rate": 1.6085922788411625e-05, "loss": 0.8321, "step": 1585 }, { "epoch": 0.9392952324548416, "grad_norm": 3.1068270835989167, "learning_rate": 1.6080846624035972e-05, "loss": 0.8287, "step": 1586 }, { "epoch": 0.9398874740894285, "grad_norm": 1.4244796316413282, "learning_rate": 1.6075767972252107e-05, "loss": 0.827, "step": 1587 }, { "epoch": 0.9404797157240155, "grad_norm": 1.8678497643123235, "learning_rate": 1.6070686835137484e-05, "loss": 0.8164, "step": 1588 }, { "epoch": 0.9410719573586023, "grad_norm": 2.0794440551572744, "learning_rate": 1.6065603214770576e-05, "loss": 0.8368, "step": 1589 }, { "epoch": 0.9416641989931892, "grad_norm": 1.4310283817792016, "learning_rate": 1.6060517113230866e-05, "loss": 0.8414, "step": 1590 }, { "epoch": 0.9422564406277761, "grad_norm": 1.1400352112299277, "learning_rate": 1.605542853259886e-05, "loss": 0.8632, "step": 1591 }, { "epoch": 0.9428486822623631, "grad_norm": 2.2260369027177402, "learning_rate": 1.605033747495607e-05, "loss": 0.8193, "step": 1592 }, { "epoch": 0.94344092389695, "grad_norm": 1.8613131439464332, "learning_rate": 1.6045243942385026e-05, "loss": 0.8796, "step": 1593 }, { "epoch": 0.9440331655315368, "grad_norm": 1.6025848171890213, "learning_rate": 1.6040147936969263e-05, "loss": 0.8037, "step": 1594 }, { "epoch": 0.9446254071661238, "grad_norm": 1.9429865966947775, "learning_rate": 1.6035049460793346e-05, "loss": 0.8611, "step": 1595 }, { "epoch": 0.9452176488007107, "grad_norm": 1.4125880162885809, "learning_rate": 1.602994851594283e-05, "loss": 0.8218, "step": 1596 }, { "epoch": 0.9458098904352976, "grad_norm": 1.4496484290895701, "learning_rate": 1.6024845104504295e-05, "loss": 0.831, "step": 1597 }, { "epoch": 0.9464021320698845, "grad_norm": 1.0965352833215942, "learning_rate": 1.6019739228565314e-05, "loss": 0.7957, "step": 1598 }, { "epoch": 0.9469943737044714, "grad_norm": 2.5680253220532525, "learning_rate": 1.6014630890214483e-05, "loss": 0.8338, "step": 1599 }, { "epoch": 0.9475866153390583, "grad_norm": 1.4680992447699333, "learning_rate": 1.6009520091541403e-05, "loss": 0.8043, "step": 1600 }, { "epoch": 0.9481788569736452, "grad_norm": 1.808937748229738, "learning_rate": 1.600440683463667e-05, "loss": 0.8217, "step": 1601 }, { "epoch": 0.9487710986082322, "grad_norm": 1.0990585324473146, "learning_rate": 1.5999291121591894e-05, "loss": 0.8004, "step": 1602 }, { "epoch": 0.9493633402428191, "grad_norm": 1.3712584097732767, "learning_rate": 1.59941729544997e-05, "loss": 0.8557, "step": 1603 }, { "epoch": 0.949955581877406, "grad_norm": 1.8683892068199166, "learning_rate": 1.5989052335453695e-05, "loss": 0.8331, "step": 1604 }, { "epoch": 0.9505478235119929, "grad_norm": 1.6272782502704557, "learning_rate": 1.598392926654851e-05, "loss": 0.8554, "step": 1605 }, { "epoch": 0.9511400651465798, "grad_norm": 6.533052707890747, "learning_rate": 1.5978803749879754e-05, "loss": 0.8254, "step": 1606 }, { "epoch": 0.9517323067811667, "grad_norm": 1.5123671105986136, "learning_rate": 1.5973675787544062e-05, "loss": 0.8491, "step": 1607 }, { "epoch": 0.9523245484157536, "grad_norm": 2.956300129012484, "learning_rate": 1.596854538163906e-05, "loss": 0.8472, "step": 1608 }, { "epoch": 0.9529167900503406, "grad_norm": 1.1789264958256336, "learning_rate": 1.5963412534263368e-05, "loss": 0.8399, "step": 1609 }, { "epoch": 0.9535090316849274, "grad_norm": 2.116364944004953, "learning_rate": 1.595827724751661e-05, "loss": 0.8087, "step": 1610 }, { "epoch": 0.9541012733195143, "grad_norm": 1.0294899321599131, "learning_rate": 1.5953139523499407e-05, "loss": 0.8103, "step": 1611 }, { "epoch": 0.9546935149541013, "grad_norm": 1.2277020777634202, "learning_rate": 1.5947999364313378e-05, "loss": 0.8414, "step": 1612 }, { "epoch": 0.9552857565886882, "grad_norm": 2.0861925635013807, "learning_rate": 1.594285677206114e-05, "loss": 0.8432, "step": 1613 }, { "epoch": 0.9558779982232751, "grad_norm": 1.6204140200740225, "learning_rate": 1.5937711748846292e-05, "loss": 0.8855, "step": 1614 }, { "epoch": 0.9564702398578621, "grad_norm": 1.5551835590922503, "learning_rate": 1.5932564296773452e-05, "loss": 0.8211, "step": 1615 }, { "epoch": 0.9570624814924489, "grad_norm": 1.338127305643877, "learning_rate": 1.5927414417948205e-05, "loss": 0.8845, "step": 1616 }, { "epoch": 0.9576547231270358, "grad_norm": 1.2387865170884367, "learning_rate": 1.592226211447715e-05, "loss": 0.7897, "step": 1617 }, { "epoch": 0.9582469647616227, "grad_norm": 1.226435037795598, "learning_rate": 1.5917107388467866e-05, "loss": 0.842, "step": 1618 }, { "epoch": 0.9588392063962097, "grad_norm": 1.2457195951382203, "learning_rate": 1.5911950242028924e-05, "loss": 0.804, "step": 1619 }, { "epoch": 0.9594314480307966, "grad_norm": 1.6797866491993712, "learning_rate": 1.5906790677269887e-05, "loss": 0.8548, "step": 1620 }, { "epoch": 0.9600236896653834, "grad_norm": 1.202377183866836, "learning_rate": 1.590162869630131e-05, "loss": 0.8212, "step": 1621 }, { "epoch": 0.9606159312999704, "grad_norm": 1.6157690809746326, "learning_rate": 1.589646430123473e-05, "loss": 0.8579, "step": 1622 }, { "epoch": 0.9612081729345573, "grad_norm": 1.0841272568029858, "learning_rate": 1.5891297494182677e-05, "loss": 0.8165, "step": 1623 }, { "epoch": 0.9618004145691442, "grad_norm": 1.0296129071614393, "learning_rate": 1.5886128277258665e-05, "loss": 0.8144, "step": 1624 }, { "epoch": 0.9623926562037312, "grad_norm": 8.702037235312739, "learning_rate": 1.5880956652577194e-05, "loss": 0.8223, "step": 1625 }, { "epoch": 0.9629848978383181, "grad_norm": 1.1958806153516213, "learning_rate": 1.587578262225375e-05, "loss": 0.8464, "step": 1626 }, { "epoch": 0.9635771394729049, "grad_norm": 3.794741977806126, "learning_rate": 1.5870606188404803e-05, "loss": 0.7858, "step": 1627 }, { "epoch": 0.9641693811074918, "grad_norm": 2.0476194711292743, "learning_rate": 1.5865427353147805e-05, "loss": 0.8624, "step": 1628 }, { "epoch": 0.9647616227420788, "grad_norm": 1.1041034856721486, "learning_rate": 1.586024611860119e-05, "loss": 0.827, "step": 1629 }, { "epoch": 0.9653538643766657, "grad_norm": 1.2670169849566177, "learning_rate": 1.5855062486884377e-05, "loss": 0.8591, "step": 1630 }, { "epoch": 0.9659461060112526, "grad_norm": 1.4173702686257748, "learning_rate": 1.5849876460117756e-05, "loss": 0.8619, "step": 1631 }, { "epoch": 0.9665383476458395, "grad_norm": 3.026025776762052, "learning_rate": 1.5844688040422714e-05, "loss": 0.8098, "step": 1632 }, { "epoch": 0.9671305892804264, "grad_norm": 1.4053556345848581, "learning_rate": 1.5839497229921596e-05, "loss": 0.807, "step": 1633 }, { "epoch": 0.9677228309150133, "grad_norm": 1.7031188178538146, "learning_rate": 1.5834304030737744e-05, "loss": 0.7775, "step": 1634 }, { "epoch": 0.9683150725496003, "grad_norm": 2.9127557245302187, "learning_rate": 1.5829108444995463e-05, "loss": 0.7833, "step": 1635 }, { "epoch": 0.9689073141841872, "grad_norm": 1.180567784206561, "learning_rate": 1.582391047482004e-05, "loss": 0.8379, "step": 1636 }, { "epoch": 0.969499555818774, "grad_norm": 1.6229999823880097, "learning_rate": 1.581871012233774e-05, "loss": 0.8217, "step": 1637 }, { "epoch": 0.9700917974533609, "grad_norm": 1.208083563753721, "learning_rate": 1.5813507389675796e-05, "loss": 0.8457, "step": 1638 }, { "epoch": 0.9706840390879479, "grad_norm": 2.1554849675219407, "learning_rate": 1.5808302278962425e-05, "loss": 0.8622, "step": 1639 }, { "epoch": 0.9712762807225348, "grad_norm": 2.64337198259556, "learning_rate": 1.58030947923268e-05, "loss": 0.7997, "step": 1640 }, { "epoch": 0.9718685223571217, "grad_norm": 1.5576104708781473, "learning_rate": 1.5797884931899085e-05, "loss": 0.8174, "step": 1641 }, { "epoch": 0.9724607639917087, "grad_norm": 2.038533162623759, "learning_rate": 1.57926726998104e-05, "loss": 0.7796, "step": 1642 }, { "epoch": 0.9730530056262955, "grad_norm": 2.9263975793725403, "learning_rate": 1.5787458098192846e-05, "loss": 0.8364, "step": 1643 }, { "epoch": 0.9736452472608824, "grad_norm": 1.4490134642361368, "learning_rate": 1.5782241129179482e-05, "loss": 0.8124, "step": 1644 }, { "epoch": 0.9742374888954694, "grad_norm": 1.533779399794962, "learning_rate": 1.5777021794904347e-05, "loss": 0.8359, "step": 1645 }, { "epoch": 0.9748297305300563, "grad_norm": 1.3549406554518657, "learning_rate": 1.5771800097502437e-05, "loss": 0.7979, "step": 1646 }, { "epoch": 0.9754219721646432, "grad_norm": 1.5156946411040901, "learning_rate": 1.5766576039109727e-05, "loss": 0.8221, "step": 1647 }, { "epoch": 0.97601421379923, "grad_norm": 1.0074291473613406, "learning_rate": 1.5761349621863145e-05, "loss": 0.7519, "step": 1648 }, { "epoch": 0.976606455433817, "grad_norm": 1.749198076091115, "learning_rate": 1.575612084790059e-05, "loss": 0.8644, "step": 1649 }, { "epoch": 0.9771986970684039, "grad_norm": 1.576797332615058, "learning_rate": 1.5750889719360927e-05, "loss": 0.8243, "step": 1650 }, { "epoch": 0.9777909387029908, "grad_norm": 1.1947271437764446, "learning_rate": 1.5745656238383983e-05, "loss": 0.8444, "step": 1651 }, { "epoch": 0.9783831803375778, "grad_norm": 1.4690410500886613, "learning_rate": 1.574042040711054e-05, "loss": 0.8234, "step": 1652 }, { "epoch": 0.9789754219721647, "grad_norm": 1.3832210724891452, "learning_rate": 1.5735182227682353e-05, "loss": 0.8242, "step": 1653 }, { "epoch": 0.9795676636067515, "grad_norm": 1.0700735949850522, "learning_rate": 1.572994170224213e-05, "loss": 0.738, "step": 1654 }, { "epoch": 0.9801599052413384, "grad_norm": 1.048484464353663, "learning_rate": 1.572469883293354e-05, "loss": 0.8888, "step": 1655 }, { "epoch": 0.9807521468759254, "grad_norm": 1.2979362621765205, "learning_rate": 1.571945362190121e-05, "loss": 0.8947, "step": 1656 }, { "epoch": 0.9813443885105123, "grad_norm": 1.5547020809846688, "learning_rate": 1.571420607129073e-05, "loss": 0.8581, "step": 1657 }, { "epoch": 0.9819366301450992, "grad_norm": 2.0365430398639117, "learning_rate": 1.5708956183248644e-05, "loss": 0.856, "step": 1658 }, { "epoch": 0.9825288717796861, "grad_norm": 1.3434352080990184, "learning_rate": 1.5703703959922444e-05, "loss": 0.84, "step": 1659 }, { "epoch": 0.983121113414273, "grad_norm": 1.5105070318212588, "learning_rate": 1.5698449403460593e-05, "loss": 0.8324, "step": 1660 }, { "epoch": 0.9837133550488599, "grad_norm": 1.2274682794404093, "learning_rate": 1.5693192516012497e-05, "loss": 0.7942, "step": 1661 }, { "epoch": 0.9843055966834469, "grad_norm": 1.5114527110798501, "learning_rate": 1.5687933299728517e-05, "loss": 0.9006, "step": 1662 }, { "epoch": 0.9848978383180338, "grad_norm": 1.118678025977753, "learning_rate": 1.568267175675997e-05, "loss": 0.8235, "step": 1663 }, { "epoch": 0.9854900799526207, "grad_norm": 0.9278952601780849, "learning_rate": 1.567740788925912e-05, "loss": 0.8125, "step": 1664 }, { "epoch": 0.9860823215872075, "grad_norm": 1.2284653816674624, "learning_rate": 1.5672141699379195e-05, "loss": 0.7883, "step": 1665 }, { "epoch": 0.9866745632217945, "grad_norm": 1.5199993795896831, "learning_rate": 1.5666873189274344e-05, "loss": 0.8336, "step": 1666 }, { "epoch": 0.9872668048563814, "grad_norm": 1.2261415724075908, "learning_rate": 1.56616023610997e-05, "loss": 0.8881, "step": 1667 }, { "epoch": 0.9878590464909683, "grad_norm": 1.0819795359579585, "learning_rate": 1.5656329217011322e-05, "loss": 0.782, "step": 1668 }, { "epoch": 0.9884512881255553, "grad_norm": 1.3268966177305566, "learning_rate": 1.565105375916623e-05, "loss": 0.8229, "step": 1669 }, { "epoch": 0.9890435297601421, "grad_norm": 1.2761359970445783, "learning_rate": 1.5645775989722366e-05, "loss": 0.8224, "step": 1670 }, { "epoch": 0.989635771394729, "grad_norm": 1.194769993158141, "learning_rate": 1.5640495910838652e-05, "loss": 0.8327, "step": 1671 }, { "epoch": 0.990228013029316, "grad_norm": 1.2775750866147921, "learning_rate": 1.563521352467493e-05, "loss": 0.8399, "step": 1672 }, { "epoch": 0.9908202546639029, "grad_norm": 1.0439693594751094, "learning_rate": 1.562992883339199e-05, "loss": 0.838, "step": 1673 }, { "epoch": 0.9914124962984898, "grad_norm": 1.4853858307458851, "learning_rate": 1.562464183915157e-05, "loss": 0.8309, "step": 1674 }, { "epoch": 0.9920047379330766, "grad_norm": 1.2338933063602229, "learning_rate": 1.5619352544116354e-05, "loss": 0.7848, "step": 1675 }, { "epoch": 0.9925969795676636, "grad_norm": 1.1283006710007517, "learning_rate": 1.5614060950449948e-05, "loss": 0.8699, "step": 1676 }, { "epoch": 0.9931892212022505, "grad_norm": 1.0265880293302552, "learning_rate": 1.5608767060316927e-05, "loss": 0.7997, "step": 1677 }, { "epoch": 0.9937814628368374, "grad_norm": 8.09364358410445, "learning_rate": 1.560347087588278e-05, "loss": 0.8407, "step": 1678 }, { "epoch": 0.9943737044714244, "grad_norm": 1.1900974681831218, "learning_rate": 1.5598172399313946e-05, "loss": 0.8168, "step": 1679 }, { "epoch": 0.9949659461060113, "grad_norm": 1.4239135447597135, "learning_rate": 1.5592871632777798e-05, "loss": 0.8455, "step": 1680 }, { "epoch": 0.9955581877405981, "grad_norm": 1.0018040213544295, "learning_rate": 1.5587568578442654e-05, "loss": 0.8209, "step": 1681 }, { "epoch": 0.996150429375185, "grad_norm": 0.9312885357262206, "learning_rate": 1.5582263238477753e-05, "loss": 0.8201, "step": 1682 }, { "epoch": 0.996742671009772, "grad_norm": 4.822722321751858, "learning_rate": 1.5576955615053283e-05, "loss": 0.826, "step": 1683 }, { "epoch": 0.9973349126443589, "grad_norm": 1.3705883900182685, "learning_rate": 1.557164571034036e-05, "loss": 0.8634, "step": 1684 }, { "epoch": 0.9979271542789458, "grad_norm": 1.2331718885455818, "learning_rate": 1.5566333526511032e-05, "loss": 0.836, "step": 1685 }, { "epoch": 0.9985193959135327, "grad_norm": 1.7535753725578784, "learning_rate": 1.5561019065738282e-05, "loss": 0.7917, "step": 1686 }, { "epoch": 0.9991116375481196, "grad_norm": 1.141962242390919, "learning_rate": 1.5555702330196024e-05, "loss": 0.8084, "step": 1687 }, { "epoch": 0.9997038791827065, "grad_norm": 0.9912371715304618, "learning_rate": 1.55503833220591e-05, "loss": 0.8038, "step": 1688 }, { "epoch": 1.0002961208172934, "grad_norm": 1.1500062970774998, "learning_rate": 1.5545062043503284e-05, "loss": 0.7195, "step": 1689 }, { "epoch": 1.0008883624518803, "grad_norm": 1.162441466138965, "learning_rate": 1.5539738496705277e-05, "loss": 0.7466, "step": 1690 }, { "epoch": 1.0014806040864672, "grad_norm": 2.452972866337579, "learning_rate": 1.553441268384271e-05, "loss": 0.7629, "step": 1691 }, { "epoch": 1.0020728457210542, "grad_norm": 1.1676699232346432, "learning_rate": 1.5529084607094144e-05, "loss": 0.7211, "step": 1692 }, { "epoch": 1.002665087355641, "grad_norm": 1.1249391475584591, "learning_rate": 1.5523754268639053e-05, "loss": 0.6833, "step": 1693 }, { "epoch": 1.003257328990228, "grad_norm": 0.9431636171309491, "learning_rate": 1.5518421670657856e-05, "loss": 0.6873, "step": 1694 }, { "epoch": 1.003849570624815, "grad_norm": 1.2138310832226853, "learning_rate": 1.5513086815331876e-05, "loss": 0.7482, "step": 1695 }, { "epoch": 1.0044418122594019, "grad_norm": 1.3175202159803, "learning_rate": 1.550774970484337e-05, "loss": 0.7627, "step": 1696 }, { "epoch": 1.0050340538939888, "grad_norm": 1.2998459350824845, "learning_rate": 1.5502410341375525e-05, "loss": 0.7568, "step": 1697 }, { "epoch": 1.0056262955285757, "grad_norm": 1.3077350264029703, "learning_rate": 1.5497068727112435e-05, "loss": 0.7064, "step": 1698 }, { "epoch": 1.0062185371631627, "grad_norm": 1.295716875999854, "learning_rate": 1.5491724864239116e-05, "loss": 0.7511, "step": 1699 }, { "epoch": 1.0068107787977494, "grad_norm": 1.0973356675970865, "learning_rate": 1.5486378754941514e-05, "loss": 0.7318, "step": 1700 }, { "epoch": 1.0074030204323363, "grad_norm": 1.064599007825042, "learning_rate": 1.5481030401406486e-05, "loss": 0.67, "step": 1701 }, { "epoch": 1.0079952620669232, "grad_norm": 1.7373649804796532, "learning_rate": 1.5475679805821814e-05, "loss": 0.7182, "step": 1702 }, { "epoch": 1.0085875037015102, "grad_norm": 1.1434410016960337, "learning_rate": 1.547032697037619e-05, "loss": 0.7471, "step": 1703 }, { "epoch": 1.0091797453360971, "grad_norm": 1.027785100761493, "learning_rate": 1.546497189725922e-05, "loss": 0.7033, "step": 1704 }, { "epoch": 1.009771986970684, "grad_norm": 1.4750262197277808, "learning_rate": 1.5459614588661435e-05, "loss": 0.6917, "step": 1705 }, { "epoch": 1.010364228605271, "grad_norm": 1.5320626210105739, "learning_rate": 1.5454255046774273e-05, "loss": 0.7396, "step": 1706 }, { "epoch": 1.010956470239858, "grad_norm": 1.2399324918591237, "learning_rate": 1.5448893273790093e-05, "loss": 0.7728, "step": 1707 }, { "epoch": 1.0115487118744448, "grad_norm": 1.2465148539400586, "learning_rate": 1.5443529271902155e-05, "loss": 0.7421, "step": 1708 }, { "epoch": 1.0121409535090318, "grad_norm": 1.0519427663128549, "learning_rate": 1.543816304330464e-05, "loss": 0.744, "step": 1709 }, { "epoch": 1.0127331951436187, "grad_norm": 1.1516952366005058, "learning_rate": 1.543279459019264e-05, "loss": 0.7283, "step": 1710 }, { "epoch": 1.0133254367782054, "grad_norm": 1.1650321254394644, "learning_rate": 1.542742391476215e-05, "loss": 0.7158, "step": 1711 }, { "epoch": 1.0139176784127923, "grad_norm": 1.0202804013315474, "learning_rate": 1.5422051019210082e-05, "loss": 0.7283, "step": 1712 }, { "epoch": 1.0145099200473793, "grad_norm": 1.923691433363424, "learning_rate": 1.5416675905734247e-05, "loss": 0.7629, "step": 1713 }, { "epoch": 1.0151021616819662, "grad_norm": 1.5605996887729816, "learning_rate": 1.5411298576533376e-05, "loss": 0.7222, "step": 1714 }, { "epoch": 1.0156944033165531, "grad_norm": 1.181981096535394, "learning_rate": 1.540591903380709e-05, "loss": 0.7437, "step": 1715 }, { "epoch": 1.01628664495114, "grad_norm": 1.2141712016450033, "learning_rate": 1.5400537279755935e-05, "loss": 0.7403, "step": 1716 }, { "epoch": 1.016878886585727, "grad_norm": 1.4181664222010109, "learning_rate": 1.539515331658134e-05, "loss": 0.7366, "step": 1717 }, { "epoch": 1.017471128220314, "grad_norm": 1.5554806786394424, "learning_rate": 1.538976714648566e-05, "loss": 0.7507, "step": 1718 }, { "epoch": 1.0180633698549009, "grad_norm": 1.2281864029513376, "learning_rate": 1.5384378771672132e-05, "loss": 0.7295, "step": 1719 }, { "epoch": 1.0186556114894878, "grad_norm": 1.8180155521778687, "learning_rate": 1.5378988194344913e-05, "loss": 0.6907, "step": 1720 }, { "epoch": 1.0192478531240745, "grad_norm": 1.3836146904218154, "learning_rate": 1.537359541670904e-05, "loss": 0.7357, "step": 1721 }, { "epoch": 1.0198400947586614, "grad_norm": 1.2844050388131933, "learning_rate": 1.5368200440970478e-05, "loss": 0.7001, "step": 1722 }, { "epoch": 1.0204323363932484, "grad_norm": 1.0399087505464837, "learning_rate": 1.5362803269336063e-05, "loss": 0.707, "step": 1723 }, { "epoch": 1.0210245780278353, "grad_norm": 1.2549348887269711, "learning_rate": 1.5357403904013546e-05, "loss": 0.7467, "step": 1724 }, { "epoch": 1.0216168196624222, "grad_norm": 1.1036457909371953, "learning_rate": 1.535200234721157e-05, "loss": 0.7386, "step": 1725 }, { "epoch": 1.0222090612970092, "grad_norm": 1.5796396509435675, "learning_rate": 1.5346598601139677e-05, "loss": 0.698, "step": 1726 }, { "epoch": 1.022801302931596, "grad_norm": 1.172115897988437, "learning_rate": 1.5341192668008305e-05, "loss": 0.7232, "step": 1727 }, { "epoch": 1.023393544566183, "grad_norm": 1.2380517696446258, "learning_rate": 1.533578455002878e-05, "loss": 0.7418, "step": 1728 }, { "epoch": 1.02398578620077, "grad_norm": 1.0709459361086207, "learning_rate": 1.5330374249413327e-05, "loss": 0.7005, "step": 1729 }, { "epoch": 1.024578027835357, "grad_norm": 1.7356966783518266, "learning_rate": 1.5324961768375065e-05, "loss": 0.7308, "step": 1730 }, { "epoch": 1.0251702694699438, "grad_norm": 1.9959691714404597, "learning_rate": 1.5319547109128e-05, "loss": 0.6944, "step": 1731 }, { "epoch": 1.0257625111045305, "grad_norm": 1.1724773521675533, "learning_rate": 1.531413027388704e-05, "loss": 0.7128, "step": 1732 }, { "epoch": 1.0263547527391175, "grad_norm": 1.3876472736084153, "learning_rate": 1.5308711264867966e-05, "loss": 0.7233, "step": 1733 }, { "epoch": 1.0269469943737044, "grad_norm": 0.9552747056186334, "learning_rate": 1.5303290084287465e-05, "loss": 0.7608, "step": 1734 }, { "epoch": 1.0275392360082913, "grad_norm": 1.4703684447974952, "learning_rate": 1.52978667343631e-05, "loss": 0.7286, "step": 1735 }, { "epoch": 1.0281314776428783, "grad_norm": 1.295631529067337, "learning_rate": 1.5292441217313324e-05, "loss": 0.7348, "step": 1736 }, { "epoch": 1.0287237192774652, "grad_norm": 1.7649342180752203, "learning_rate": 1.5287013535357488e-05, "loss": 0.7281, "step": 1737 }, { "epoch": 1.0293159609120521, "grad_norm": 1.0923294902447087, "learning_rate": 1.5281583690715805e-05, "loss": 0.6939, "step": 1738 }, { "epoch": 1.029908202546639, "grad_norm": 1.499563132831407, "learning_rate": 1.52761516856094e-05, "loss": 0.6817, "step": 1739 }, { "epoch": 1.030500444181226, "grad_norm": 1.0171580664278979, "learning_rate": 1.5270717522260264e-05, "loss": 0.7234, "step": 1740 }, { "epoch": 1.031092685815813, "grad_norm": 1.0354234657563626, "learning_rate": 1.526528120289127e-05, "loss": 0.7194, "step": 1741 }, { "epoch": 1.0316849274503999, "grad_norm": 1.2340045067110246, "learning_rate": 1.5259842729726186e-05, "loss": 0.7582, "step": 1742 }, { "epoch": 1.0322771690849866, "grad_norm": 1.1262612916228454, "learning_rate": 1.5254402104989652e-05, "loss": 0.7168, "step": 1743 }, { "epoch": 1.0328694107195735, "grad_norm": 1.2118168057016097, "learning_rate": 1.5248959330907186e-05, "loss": 0.743, "step": 1744 }, { "epoch": 1.0334616523541604, "grad_norm": 0.988865475204544, "learning_rate": 1.5243514409705187e-05, "loss": 0.7318, "step": 1745 }, { "epoch": 1.0340538939887474, "grad_norm": 1.1771979349040396, "learning_rate": 1.5238067343610943e-05, "loss": 0.7278, "step": 1746 }, { "epoch": 1.0346461356233343, "grad_norm": 1.3840039027011963, "learning_rate": 1.5232618134852598e-05, "loss": 0.7461, "step": 1747 }, { "epoch": 1.0352383772579212, "grad_norm": 6.602016761729066, "learning_rate": 1.52271667856592e-05, "loss": 0.7317, "step": 1748 }, { "epoch": 1.0358306188925082, "grad_norm": 1.0065104505442493, "learning_rate": 1.522171329826064e-05, "loss": 0.7306, "step": 1749 }, { "epoch": 1.036422860527095, "grad_norm": 1.0752528953543894, "learning_rate": 1.5216257674887718e-05, "loss": 0.683, "step": 1750 }, { "epoch": 1.037015102161682, "grad_norm": 1.0970642144292997, "learning_rate": 1.5210799917772076e-05, "loss": 0.7274, "step": 1751 }, { "epoch": 1.037607343796269, "grad_norm": 2.4805051543151233, "learning_rate": 1.5205340029146256e-05, "loss": 0.7554, "step": 1752 }, { "epoch": 1.0381995854308559, "grad_norm": 1.7822116435903128, "learning_rate": 1.5199878011243647e-05, "loss": 0.7089, "step": 1753 }, { "epoch": 1.0387918270654426, "grad_norm": 0.8928687106650833, "learning_rate": 1.5194413866298536e-05, "loss": 0.7456, "step": 1754 }, { "epoch": 1.0393840687000295, "grad_norm": 1.19053453703619, "learning_rate": 1.5188947596546053e-05, "loss": 0.7291, "step": 1755 }, { "epoch": 1.0399763103346165, "grad_norm": 1.4988527163029588, "learning_rate": 1.5183479204222216e-05, "loss": 0.7221, "step": 1756 }, { "epoch": 1.0405685519692034, "grad_norm": 1.7373245033776372, "learning_rate": 1.5178008691563902e-05, "loss": 0.7289, "step": 1757 }, { "epoch": 1.0411607936037903, "grad_norm": 1.838119156554147, "learning_rate": 1.5172536060808857e-05, "loss": 0.726, "step": 1758 }, { "epoch": 1.0417530352383773, "grad_norm": 1.3624164343579135, "learning_rate": 1.5167061314195702e-05, "loss": 0.7325, "step": 1759 }, { "epoch": 1.0423452768729642, "grad_norm": 1.6612931900677586, "learning_rate": 1.5161584453963908e-05, "loss": 0.7035, "step": 1760 }, { "epoch": 1.0429375185075511, "grad_norm": 1.4616110065160015, "learning_rate": 1.5156105482353827e-05, "loss": 0.7442, "step": 1761 }, { "epoch": 1.043529760142138, "grad_norm": 2.0905521686727737, "learning_rate": 1.5150624401606658e-05, "loss": 0.7269, "step": 1762 }, { "epoch": 1.044122001776725, "grad_norm": 1.7611278415862754, "learning_rate": 1.5145141213964479e-05, "loss": 0.696, "step": 1763 }, { "epoch": 1.044714243411312, "grad_norm": 2.1466849185074683, "learning_rate": 1.5139655921670213e-05, "loss": 0.6893, "step": 1764 }, { "epoch": 1.0453064850458986, "grad_norm": 3.86853781945047, "learning_rate": 1.5134168526967661e-05, "loss": 0.7508, "step": 1765 }, { "epoch": 1.0458987266804856, "grad_norm": 3.5972940752882137, "learning_rate": 1.5128679032101472e-05, "loss": 0.7366, "step": 1766 }, { "epoch": 1.0464909683150725, "grad_norm": 3.6857544393967347, "learning_rate": 1.5123187439317159e-05, "loss": 0.7766, "step": 1767 }, { "epoch": 1.0470832099496594, "grad_norm": 1.6678885841692537, "learning_rate": 1.5117693750861096e-05, "loss": 0.7361, "step": 1768 }, { "epoch": 1.0476754515842464, "grad_norm": 1.672518635910601, "learning_rate": 1.5112197968980503e-05, "loss": 0.6881, "step": 1769 }, { "epoch": 1.0482676932188333, "grad_norm": 1.5591607166759955, "learning_rate": 1.5106700095923471e-05, "loss": 0.693, "step": 1770 }, { "epoch": 1.0488599348534202, "grad_norm": 2.257758886616912, "learning_rate": 1.5101200133938933e-05, "loss": 0.6811, "step": 1771 }, { "epoch": 1.0494521764880071, "grad_norm": 2.0363297195079912, "learning_rate": 1.5095698085276692e-05, "loss": 0.7407, "step": 1772 }, { "epoch": 1.050044418122594, "grad_norm": 2.830719217097853, "learning_rate": 1.5090193952187382e-05, "loss": 0.7523, "step": 1773 }, { "epoch": 1.050636659757181, "grad_norm": 1.671928706296085, "learning_rate": 1.5084687736922514e-05, "loss": 0.7253, "step": 1774 }, { "epoch": 1.051228901391768, "grad_norm": 2.790885707597884, "learning_rate": 1.5079179441734435e-05, "loss": 0.7218, "step": 1775 }, { "epoch": 1.0518211430263547, "grad_norm": 1.7819284353063456, "learning_rate": 1.5073669068876348e-05, "loss": 0.73, "step": 1776 }, { "epoch": 1.0524133846609416, "grad_norm": 2.7217998288857794, "learning_rate": 1.5068156620602303e-05, "loss": 0.7359, "step": 1777 }, { "epoch": 1.0530056262955285, "grad_norm": 5.101430064202783, "learning_rate": 1.5062642099167208e-05, "loss": 0.7175, "step": 1778 }, { "epoch": 1.0535978679301155, "grad_norm": 2.4798155357352902, "learning_rate": 1.5057125506826806e-05, "loss": 0.7212, "step": 1779 }, { "epoch": 1.0541901095647024, "grad_norm": 1.4306493055601683, "learning_rate": 1.5051606845837699e-05, "loss": 0.7508, "step": 1780 }, { "epoch": 1.0547823511992893, "grad_norm": 1.9617350912249327, "learning_rate": 1.5046086118457325e-05, "loss": 0.7302, "step": 1781 }, { "epoch": 1.0553745928338762, "grad_norm": 3.2294694912122766, "learning_rate": 1.5040563326943974e-05, "loss": 0.7164, "step": 1782 }, { "epoch": 1.0559668344684632, "grad_norm": 2.159249108038316, "learning_rate": 1.5035038473556776e-05, "loss": 0.7142, "step": 1783 }, { "epoch": 1.05655907610305, "grad_norm": 1.810542509614688, "learning_rate": 1.5029511560555707e-05, "loss": 0.7212, "step": 1784 }, { "epoch": 1.057151317737637, "grad_norm": 4.37550667109893, "learning_rate": 1.5023982590201586e-05, "loss": 0.6615, "step": 1785 }, { "epoch": 1.057743559372224, "grad_norm": 1.4873887560427135, "learning_rate": 1.5018451564756078e-05, "loss": 0.6939, "step": 1786 }, { "epoch": 1.0583358010068107, "grad_norm": 3.2445272993387335, "learning_rate": 1.5012918486481677e-05, "loss": 0.7296, "step": 1787 }, { "epoch": 1.0589280426413976, "grad_norm": 1.6958901351830118, "learning_rate": 1.5007383357641723e-05, "loss": 0.7386, "step": 1788 }, { "epoch": 1.0595202842759845, "grad_norm": 1.6693502713038142, "learning_rate": 1.5001846180500399e-05, "loss": 0.7317, "step": 1789 }, { "epoch": 1.0601125259105715, "grad_norm": 1.4628738488051636, "learning_rate": 1.499630695732272e-05, "loss": 0.7039, "step": 1790 }, { "epoch": 1.0607047675451584, "grad_norm": 1.4961438926110393, "learning_rate": 1.4990765690374537e-05, "loss": 0.7023, "step": 1791 }, { "epoch": 1.0612970091797453, "grad_norm": 2.1544637844797125, "learning_rate": 1.4985222381922543e-05, "loss": 0.7214, "step": 1792 }, { "epoch": 1.0618892508143323, "grad_norm": 8.28217196999516, "learning_rate": 1.4979677034234265e-05, "loss": 0.717, "step": 1793 }, { "epoch": 1.0624814924489192, "grad_norm": 2.501132894894816, "learning_rate": 1.4974129649578058e-05, "loss": 0.7356, "step": 1794 }, { "epoch": 1.0630737340835061, "grad_norm": 1.16138368637112, "learning_rate": 1.4968580230223112e-05, "loss": 0.7435, "step": 1795 }, { "epoch": 1.063665975718093, "grad_norm": 1.5162128926090968, "learning_rate": 1.496302877843946e-05, "loss": 0.7392, "step": 1796 }, { "epoch": 1.06425821735268, "grad_norm": 1.2373419739712945, "learning_rate": 1.4957475296497953e-05, "loss": 0.7, "step": 1797 }, { "epoch": 1.0648504589872667, "grad_norm": 1.9258973289098624, "learning_rate": 1.4951919786670274e-05, "loss": 0.7231, "step": 1798 }, { "epoch": 1.0654427006218536, "grad_norm": 1.2248509462971935, "learning_rate": 1.4946362251228943e-05, "loss": 0.7276, "step": 1799 }, { "epoch": 1.0660349422564406, "grad_norm": 1.5808585539599749, "learning_rate": 1.4940802692447306e-05, "loss": 0.7257, "step": 1800 }, { "epoch": 1.0666271838910275, "grad_norm": 1.3982040276801868, "learning_rate": 1.493524111259953e-05, "loss": 0.7563, "step": 1801 }, { "epoch": 1.0672194255256144, "grad_norm": 3.7095487179566713, "learning_rate": 1.4929677513960621e-05, "loss": 0.7217, "step": 1802 }, { "epoch": 1.0678116671602014, "grad_norm": 2.0014255531666936, "learning_rate": 1.4924111898806395e-05, "loss": 0.7425, "step": 1803 }, { "epoch": 1.0684039087947883, "grad_norm": 1.4939878195406038, "learning_rate": 1.4918544269413511e-05, "loss": 0.727, "step": 1804 }, { "epoch": 1.0689961504293752, "grad_norm": 1.74927476414085, "learning_rate": 1.4912974628059433e-05, "loss": 0.7637, "step": 1805 }, { "epoch": 1.0695883920639622, "grad_norm": 1.658292058667489, "learning_rate": 1.4907402977022465e-05, "loss": 0.7623, "step": 1806 }, { "epoch": 1.070180633698549, "grad_norm": 1.4315694213602883, "learning_rate": 1.4901829318581722e-05, "loss": 0.7211, "step": 1807 }, { "epoch": 1.070772875333136, "grad_norm": 1.704429216725604, "learning_rate": 1.4896253655017146e-05, "loss": 0.7316, "step": 1808 }, { "epoch": 1.0713651169677227, "grad_norm": 1.3312378206473277, "learning_rate": 1.4890675988609493e-05, "loss": 0.7199, "step": 1809 }, { "epoch": 1.0719573586023097, "grad_norm": 3.97929527512917, "learning_rate": 1.4885096321640346e-05, "loss": 0.725, "step": 1810 }, { "epoch": 1.0725496002368966, "grad_norm": 1.50003812253246, "learning_rate": 1.48795146563921e-05, "loss": 0.7149, "step": 1811 }, { "epoch": 1.0731418418714835, "grad_norm": 1.67906861392692, "learning_rate": 1.4873930995147971e-05, "loss": 0.7448, "step": 1812 }, { "epoch": 1.0737340835060705, "grad_norm": 1.6617287890649148, "learning_rate": 1.4868345340191992e-05, "loss": 0.7207, "step": 1813 }, { "epoch": 1.0743263251406574, "grad_norm": 1.3825842739615184, "learning_rate": 1.4862757693809009e-05, "loss": 0.7454, "step": 1814 }, { "epoch": 1.0749185667752443, "grad_norm": 1.4144163977489808, "learning_rate": 1.485716805828468e-05, "loss": 0.7158, "step": 1815 }, { "epoch": 1.0755108084098313, "grad_norm": 1.8101168890390542, "learning_rate": 1.4851576435905489e-05, "loss": 0.7072, "step": 1816 }, { "epoch": 1.0761030500444182, "grad_norm": 1.872316239101291, "learning_rate": 1.4845982828958713e-05, "loss": 0.7332, "step": 1817 }, { "epoch": 1.0766952916790051, "grad_norm": 1.3296117156310565, "learning_rate": 1.484038723973246e-05, "loss": 0.7099, "step": 1818 }, { "epoch": 1.077287533313592, "grad_norm": 1.4154353624795581, "learning_rate": 1.4834789670515637e-05, "loss": 0.7188, "step": 1819 }, { "epoch": 1.0778797749481788, "grad_norm": 1.7457686235458405, "learning_rate": 1.4829190123597965e-05, "loss": 0.7769, "step": 1820 }, { "epoch": 1.0784720165827657, "grad_norm": 1.7634196004341314, "learning_rate": 1.4823588601269973e-05, "loss": 0.7494, "step": 1821 }, { "epoch": 1.0790642582173526, "grad_norm": 1.2440518586809848, "learning_rate": 1.4817985105823003e-05, "loss": 0.6979, "step": 1822 }, { "epoch": 1.0796564998519396, "grad_norm": 2.2385183360083953, "learning_rate": 1.4812379639549194e-05, "loss": 0.7202, "step": 1823 }, { "epoch": 1.0802487414865265, "grad_norm": 1.2215652923766105, "learning_rate": 1.4806772204741503e-05, "loss": 0.7091, "step": 1824 }, { "epoch": 1.0808409831211134, "grad_norm": 1.4449642609762843, "learning_rate": 1.4801162803693676e-05, "loss": 0.7442, "step": 1825 }, { "epoch": 1.0814332247557004, "grad_norm": 1.105763165926035, "learning_rate": 1.4795551438700283e-05, "loss": 0.7515, "step": 1826 }, { "epoch": 1.0820254663902873, "grad_norm": 1.6473343255705488, "learning_rate": 1.4789938112056683e-05, "loss": 0.7527, "step": 1827 }, { "epoch": 1.0826177080248742, "grad_norm": 1.618781048885639, "learning_rate": 1.4784322826059048e-05, "loss": 0.7249, "step": 1828 }, { "epoch": 1.0832099496594612, "grad_norm": 1.652228341540137, "learning_rate": 1.4778705583004338e-05, "loss": 0.7447, "step": 1829 }, { "epoch": 1.083802191294048, "grad_norm": 2.4386960209217343, "learning_rate": 1.4773086385190328e-05, "loss": 0.7448, "step": 1830 }, { "epoch": 1.0843944329286348, "grad_norm": 1.259552923914992, "learning_rate": 1.4767465234915577e-05, "loss": 0.7799, "step": 1831 }, { "epoch": 1.0849866745632217, "grad_norm": 3.1470024337770446, "learning_rate": 1.4761842134479463e-05, "loss": 0.7429, "step": 1832 }, { "epoch": 1.0855789161978087, "grad_norm": 1.2003867622349509, "learning_rate": 1.4756217086182142e-05, "loss": 0.7326, "step": 1833 }, { "epoch": 1.0861711578323956, "grad_norm": 2.4507353417893314, "learning_rate": 1.4750590092324579e-05, "loss": 0.7183, "step": 1834 }, { "epoch": 1.0867633994669825, "grad_norm": 1.1444638541124244, "learning_rate": 1.474496115520853e-05, "loss": 0.7436, "step": 1835 }, { "epoch": 1.0873556411015695, "grad_norm": 1.9216426351330924, "learning_rate": 1.4739330277136546e-05, "loss": 0.7302, "step": 1836 }, { "epoch": 1.0879478827361564, "grad_norm": 1.7272854817225367, "learning_rate": 1.4733697460411973e-05, "loss": 0.7041, "step": 1837 }, { "epoch": 1.0885401243707433, "grad_norm": 1.1808892975599021, "learning_rate": 1.4728062707338949e-05, "loss": 0.7214, "step": 1838 }, { "epoch": 1.0891323660053303, "grad_norm": 1.2060838830500324, "learning_rate": 1.4722426020222406e-05, "loss": 0.7387, "step": 1839 }, { "epoch": 1.0897246076399172, "grad_norm": 1.2045178599251487, "learning_rate": 1.4716787401368067e-05, "loss": 0.708, "step": 1840 }, { "epoch": 1.090316849274504, "grad_norm": 1.4796202285269815, "learning_rate": 1.4711146853082443e-05, "loss": 0.7453, "step": 1841 }, { "epoch": 1.0909090909090908, "grad_norm": 1.2637405534511554, "learning_rate": 1.4705504377672834e-05, "loss": 0.7001, "step": 1842 }, { "epoch": 1.0915013325436778, "grad_norm": 0.9993594395369135, "learning_rate": 1.4699859977447335e-05, "loss": 0.7461, "step": 1843 }, { "epoch": 1.0920935741782647, "grad_norm": 1.3269419928105537, "learning_rate": 1.4694213654714816e-05, "loss": 0.7391, "step": 1844 }, { "epoch": 1.0926858158128516, "grad_norm": 1.4669810778171086, "learning_rate": 1.4688565411784943e-05, "loss": 0.7218, "step": 1845 }, { "epoch": 1.0932780574474386, "grad_norm": 3.385399768271858, "learning_rate": 1.4682915250968169e-05, "loss": 0.7459, "step": 1846 }, { "epoch": 1.0938702990820255, "grad_norm": 1.6211356022303685, "learning_rate": 1.4677263174575723e-05, "loss": 0.7276, "step": 1847 }, { "epoch": 1.0944625407166124, "grad_norm": 1.3469501603849017, "learning_rate": 1.4671609184919622e-05, "loss": 0.7409, "step": 1848 }, { "epoch": 1.0950547823511994, "grad_norm": 1.3639447724050635, "learning_rate": 1.4665953284312668e-05, "loss": 0.7392, "step": 1849 }, { "epoch": 1.0956470239857863, "grad_norm": 1.2226531191169623, "learning_rate": 1.4660295475068443e-05, "loss": 0.7384, "step": 1850 }, { "epoch": 1.0962392656203732, "grad_norm": 1.5816714452127887, "learning_rate": 1.4654635759501306e-05, "loss": 0.7145, "step": 1851 }, { "epoch": 1.09683150725496, "grad_norm": 1.3398064935081868, "learning_rate": 1.4648974139926403e-05, "loss": 0.7341, "step": 1852 }, { "epoch": 1.0974237488895469, "grad_norm": 1.3669350828600808, "learning_rate": 1.4643310618659646e-05, "loss": 0.7229, "step": 1853 }, { "epoch": 1.0980159905241338, "grad_norm": 1.7617904906850115, "learning_rate": 1.4637645198017745e-05, "loss": 0.7067, "step": 1854 }, { "epoch": 1.0986082321587207, "grad_norm": 3.706221805008793, "learning_rate": 1.463197788031817e-05, "loss": 0.7075, "step": 1855 }, { "epoch": 1.0992004737933077, "grad_norm": 1.6435750167047545, "learning_rate": 1.4626308667879175e-05, "loss": 0.758, "step": 1856 }, { "epoch": 1.0997927154278946, "grad_norm": 1.7817244713218505, "learning_rate": 1.462063756301978e-05, "loss": 0.7666, "step": 1857 }, { "epoch": 1.1003849570624815, "grad_norm": 3.183409165734301, "learning_rate": 1.4614964568059795e-05, "loss": 0.7431, "step": 1858 }, { "epoch": 1.1009771986970684, "grad_norm": 1.5588746699566092, "learning_rate": 1.4609289685319788e-05, "loss": 0.6989, "step": 1859 }, { "epoch": 1.1015694403316554, "grad_norm": 1.4715880184354002, "learning_rate": 1.4603612917121107e-05, "loss": 0.7476, "step": 1860 }, { "epoch": 1.1021616819662423, "grad_norm": 1.5483999283147984, "learning_rate": 1.4597934265785868e-05, "loss": 0.7095, "step": 1861 }, { "epoch": 1.102753923600829, "grad_norm": 1.8136131665101558, "learning_rate": 1.4592253733636961e-05, "loss": 0.7218, "step": 1862 }, { "epoch": 1.103346165235416, "grad_norm": 1.898459387929092, "learning_rate": 1.4586571322998041e-05, "loss": 0.6928, "step": 1863 }, { "epoch": 1.1039384068700029, "grad_norm": 1.1332820859860129, "learning_rate": 1.4580887036193539e-05, "loss": 0.7379, "step": 1864 }, { "epoch": 1.1045306485045898, "grad_norm": 1.1910399671887877, "learning_rate": 1.4575200875548639e-05, "loss": 0.7093, "step": 1865 }, { "epoch": 1.1051228901391768, "grad_norm": 1.7135675328136981, "learning_rate": 1.4569512843389306e-05, "loss": 0.7452, "step": 1866 }, { "epoch": 1.1057151317737637, "grad_norm": 1.3131715172492713, "learning_rate": 1.4563822942042264e-05, "loss": 0.7512, "step": 1867 }, { "epoch": 1.1063073734083506, "grad_norm": 1.301975931331709, "learning_rate": 1.4558131173835002e-05, "loss": 0.724, "step": 1868 }, { "epoch": 1.1068996150429375, "grad_norm": 1.9451185830687892, "learning_rate": 1.4552437541095774e-05, "loss": 0.7658, "step": 1869 }, { "epoch": 1.1074918566775245, "grad_norm": 2.5902094254473274, "learning_rate": 1.4546742046153596e-05, "loss": 0.7389, "step": 1870 }, { "epoch": 1.1080840983121114, "grad_norm": 1.8005401821027935, "learning_rate": 1.4541044691338246e-05, "loss": 0.7113, "step": 1871 }, { "epoch": 1.1086763399466983, "grad_norm": 1.3297390320707363, "learning_rate": 1.453534547898026e-05, "loss": 0.7192, "step": 1872 }, { "epoch": 1.109268581581285, "grad_norm": 1.616999402923775, "learning_rate": 1.452964441141094e-05, "loss": 0.703, "step": 1873 }, { "epoch": 1.109860823215872, "grad_norm": 3.5826588522082607, "learning_rate": 1.4523941490962342e-05, "loss": 0.7405, "step": 1874 }, { "epoch": 1.110453064850459, "grad_norm": 1.5079201259207546, "learning_rate": 1.451823671996728e-05, "loss": 0.739, "step": 1875 }, { "epoch": 1.1110453064850458, "grad_norm": 2.719011118153245, "learning_rate": 1.451253010075933e-05, "loss": 0.7482, "step": 1876 }, { "epoch": 1.1116375481196328, "grad_norm": 2.508076865789526, "learning_rate": 1.450682163567281e-05, "loss": 0.7225, "step": 1877 }, { "epoch": 1.1122297897542197, "grad_norm": 1.6745532655484892, "learning_rate": 1.4501111327042817e-05, "loss": 0.7662, "step": 1878 }, { "epoch": 1.1128220313888066, "grad_norm": 1.617999952278557, "learning_rate": 1.4495399177205177e-05, "loss": 0.7055, "step": 1879 }, { "epoch": 1.1134142730233936, "grad_norm": 1.783576483289883, "learning_rate": 1.4489685188496488e-05, "loss": 0.7175, "step": 1880 }, { "epoch": 1.1140065146579805, "grad_norm": 1.8113648186864728, "learning_rate": 1.4483969363254085e-05, "loss": 0.7221, "step": 1881 }, { "epoch": 1.1145987562925674, "grad_norm": 4.466398490357165, "learning_rate": 1.447825170381607e-05, "loss": 0.731, "step": 1882 }, { "epoch": 1.1151909979271544, "grad_norm": 2.3544006277235776, "learning_rate": 1.4472532212521281e-05, "loss": 0.7599, "step": 1883 }, { "epoch": 1.115783239561741, "grad_norm": 1.5430309293543931, "learning_rate": 1.446681089170932e-05, "loss": 0.7418, "step": 1884 }, { "epoch": 1.116375481196328, "grad_norm": 2.547169019555421, "learning_rate": 1.4461087743720519e-05, "loss": 0.7436, "step": 1885 }, { "epoch": 1.116967722830915, "grad_norm": 2.4575093456645174, "learning_rate": 1.4455362770895976e-05, "loss": 0.7316, "step": 1886 }, { "epoch": 1.1175599644655019, "grad_norm": 1.5479754084135895, "learning_rate": 1.444963597557752e-05, "loss": 0.6865, "step": 1887 }, { "epoch": 1.1181522061000888, "grad_norm": 1.5331344869939385, "learning_rate": 1.4443907360107734e-05, "loss": 0.7007, "step": 1888 }, { "epoch": 1.1187444477346757, "grad_norm": 2.1333020071731896, "learning_rate": 1.4438176926829944e-05, "loss": 0.727, "step": 1889 }, { "epoch": 1.1193366893692627, "grad_norm": 2.758567222418793, "learning_rate": 1.4432444678088222e-05, "loss": 0.7018, "step": 1890 }, { "epoch": 1.1199289310038496, "grad_norm": 1.7694989327792088, "learning_rate": 1.4426710616227377e-05, "loss": 0.7148, "step": 1891 }, { "epoch": 1.1205211726384365, "grad_norm": 1.674707947535516, "learning_rate": 1.4420974743592964e-05, "loss": 0.7497, "step": 1892 }, { "epoch": 1.1211134142730235, "grad_norm": 1.7836099010979485, "learning_rate": 1.4415237062531277e-05, "loss": 0.7138, "step": 1893 }, { "epoch": 1.1217056559076104, "grad_norm": 1.2259620137684928, "learning_rate": 1.4409497575389352e-05, "loss": 0.6871, "step": 1894 }, { "epoch": 1.122297897542197, "grad_norm": 3.7532717279318066, "learning_rate": 1.4403756284514962e-05, "loss": 0.7662, "step": 1895 }, { "epoch": 1.122890139176784, "grad_norm": 1.6907860579891794, "learning_rate": 1.4398013192256615e-05, "loss": 0.7459, "step": 1896 }, { "epoch": 1.123482380811371, "grad_norm": 2.1021110669948166, "learning_rate": 1.439226830096356e-05, "loss": 0.7327, "step": 1897 }, { "epoch": 1.124074622445958, "grad_norm": 3.754781447508906, "learning_rate": 1.438652161298578e-05, "loss": 0.7541, "step": 1898 }, { "epoch": 1.1246668640805448, "grad_norm": 1.6070012623831906, "learning_rate": 1.4380773130673994e-05, "loss": 0.7569, "step": 1899 }, { "epoch": 1.1252591057151318, "grad_norm": 7.979250377670888, "learning_rate": 1.4375022856379657e-05, "loss": 0.766, "step": 1900 }, { "epoch": 1.1258513473497187, "grad_norm": 1.3313809617715828, "learning_rate": 1.436927079245495e-05, "loss": 0.7326, "step": 1901 }, { "epoch": 1.1264435889843056, "grad_norm": 1.9341986555985717, "learning_rate": 1.4363516941252795e-05, "loss": 0.7408, "step": 1902 }, { "epoch": 1.1270358306188926, "grad_norm": 1.6237139068552036, "learning_rate": 1.4357761305126836e-05, "loss": 0.7064, "step": 1903 }, { "epoch": 1.1276280722534795, "grad_norm": 1.2456602536397703, "learning_rate": 1.4352003886431459e-05, "loss": 0.7131, "step": 1904 }, { "epoch": 1.1282203138880664, "grad_norm": 1.8462015297534495, "learning_rate": 1.4346244687521761e-05, "loss": 0.719, "step": 1905 }, { "epoch": 1.1288125555226531, "grad_norm": 2.308176089812671, "learning_rate": 1.434048371075359e-05, "loss": 0.6911, "step": 1906 }, { "epoch": 1.12940479715724, "grad_norm": 13.436596947136401, "learning_rate": 1.43347209584835e-05, "loss": 0.7109, "step": 1907 }, { "epoch": 1.129997038791827, "grad_norm": 2.4217564676011896, "learning_rate": 1.4328956433068789e-05, "loss": 0.7412, "step": 1908 }, { "epoch": 1.130589280426414, "grad_norm": 1.917753337825582, "learning_rate": 1.4323190136867464e-05, "loss": 0.753, "step": 1909 }, { "epoch": 1.1311815220610009, "grad_norm": 1.411783160540838, "learning_rate": 1.4317422072238271e-05, "loss": 0.7423, "step": 1910 }, { "epoch": 1.1317737636955878, "grad_norm": 1.6181997909355976, "learning_rate": 1.4311652241540668e-05, "loss": 0.7672, "step": 1911 }, { "epoch": 1.1323660053301747, "grad_norm": 1.8668388500696302, "learning_rate": 1.4305880647134847e-05, "loss": 0.7447, "step": 1912 }, { "epoch": 1.1329582469647617, "grad_norm": 1.6503141304547309, "learning_rate": 1.4300107291381704e-05, "loss": 0.7519, "step": 1913 }, { "epoch": 1.1335504885993486, "grad_norm": 1.8711353472468055, "learning_rate": 1.4294332176642875e-05, "loss": 0.7117, "step": 1914 }, { "epoch": 1.1341427302339355, "grad_norm": 1.5597160551862157, "learning_rate": 1.4288555305280702e-05, "loss": 0.7413, "step": 1915 }, { "epoch": 1.1347349718685225, "grad_norm": 1.9438549888259047, "learning_rate": 1.4282776679658255e-05, "loss": 0.7131, "step": 1916 }, { "epoch": 1.1353272135031092, "grad_norm": 3.576441284877924, "learning_rate": 1.4276996302139312e-05, "loss": 0.7173, "step": 1917 }, { "epoch": 1.135919455137696, "grad_norm": 4.139553745005125, "learning_rate": 1.4271214175088374e-05, "loss": 0.7347, "step": 1918 }, { "epoch": 1.136511696772283, "grad_norm": 2.094913208691617, "learning_rate": 1.4265430300870656e-05, "loss": 0.7003, "step": 1919 }, { "epoch": 1.13710393840687, "grad_norm": 1.961827791646241, "learning_rate": 1.425964468185209e-05, "loss": 0.7393, "step": 1920 }, { "epoch": 1.137696180041457, "grad_norm": 1.2595516677616476, "learning_rate": 1.4253857320399316e-05, "loss": 0.7199, "step": 1921 }, { "epoch": 1.1382884216760438, "grad_norm": 1.76264901153827, "learning_rate": 1.4248068218879691e-05, "loss": 0.7213, "step": 1922 }, { "epoch": 1.1388806633106308, "grad_norm": 1.8110656077220395, "learning_rate": 1.4242277379661286e-05, "loss": 0.7134, "step": 1923 }, { "epoch": 1.1394729049452177, "grad_norm": 1.3189613973752465, "learning_rate": 1.4236484805112878e-05, "loss": 0.7287, "step": 1924 }, { "epoch": 1.1400651465798046, "grad_norm": 2.039277540030174, "learning_rate": 1.4230690497603955e-05, "loss": 0.716, "step": 1925 }, { "epoch": 1.1406573882143916, "grad_norm": 2.20069189081546, "learning_rate": 1.4224894459504717e-05, "loss": 0.7297, "step": 1926 }, { "epoch": 1.1412496298489785, "grad_norm": 1.342282720151037, "learning_rate": 1.4219096693186065e-05, "loss": 0.7599, "step": 1927 }, { "epoch": 1.1418418714835652, "grad_norm": 2.128575546767518, "learning_rate": 1.4213297201019618e-05, "loss": 0.727, "step": 1928 }, { "epoch": 1.1424341131181521, "grad_norm": 5.679213605059689, "learning_rate": 1.4207495985377687e-05, "loss": 0.7075, "step": 1929 }, { "epoch": 1.143026354752739, "grad_norm": 1.4477866874730978, "learning_rate": 1.4201693048633302e-05, "loss": 0.7705, "step": 1930 }, { "epoch": 1.143618596387326, "grad_norm": 1.3689716380240244, "learning_rate": 1.4195888393160183e-05, "loss": 0.7145, "step": 1931 }, { "epoch": 1.144210838021913, "grad_norm": 2.1302801407154153, "learning_rate": 1.419008202133277e-05, "loss": 0.7193, "step": 1932 }, { "epoch": 1.1448030796564999, "grad_norm": 1.300453821118966, "learning_rate": 1.4184273935526184e-05, "loss": 0.7635, "step": 1933 }, { "epoch": 1.1453953212910868, "grad_norm": 1.9849938187340435, "learning_rate": 1.4178464138116272e-05, "loss": 0.7525, "step": 1934 }, { "epoch": 1.1459875629256737, "grad_norm": 1.7532705143201168, "learning_rate": 1.4172652631479558e-05, "loss": 0.7159, "step": 1935 }, { "epoch": 1.1465798045602607, "grad_norm": 1.993678172399548, "learning_rate": 1.4166839417993281e-05, "loss": 0.7287, "step": 1936 }, { "epoch": 1.1471720461948476, "grad_norm": 1.6612921095486382, "learning_rate": 1.4161024500035364e-05, "loss": 0.7116, "step": 1937 }, { "epoch": 1.1477642878294345, "grad_norm": 2.704022908452418, "learning_rate": 1.4155207879984447e-05, "loss": 0.7005, "step": 1938 }, { "epoch": 1.1483565294640212, "grad_norm": 1.5244879998598664, "learning_rate": 1.4149389560219846e-05, "loss": 0.7588, "step": 1939 }, { "epoch": 1.1489487710986082, "grad_norm": 3.6604754255800307, "learning_rate": 1.414356954312158e-05, "loss": 0.749, "step": 1940 }, { "epoch": 1.149541012733195, "grad_norm": 1.9096638139679427, "learning_rate": 1.4137747831070371e-05, "loss": 0.7127, "step": 1941 }, { "epoch": 1.150133254367782, "grad_norm": 1.5220531333094949, "learning_rate": 1.4131924426447621e-05, "loss": 0.7561, "step": 1942 }, { "epoch": 1.150725496002369, "grad_norm": 2.3373596861886456, "learning_rate": 1.412609933163543e-05, "loss": 0.7053, "step": 1943 }, { "epoch": 1.1513177376369559, "grad_norm": 1.4526913515699773, "learning_rate": 1.4120272549016591e-05, "loss": 0.7197, "step": 1944 }, { "epoch": 1.1519099792715428, "grad_norm": 1.7959045608328434, "learning_rate": 1.4114444080974585e-05, "loss": 0.7105, "step": 1945 }, { "epoch": 1.1525022209061297, "grad_norm": 2.2273046483347834, "learning_rate": 1.4108613929893586e-05, "loss": 0.7099, "step": 1946 }, { "epoch": 1.1530944625407167, "grad_norm": 1.851272958075549, "learning_rate": 1.410278209815845e-05, "loss": 0.6989, "step": 1947 }, { "epoch": 1.1536867041753036, "grad_norm": 1.4743710555973606, "learning_rate": 1.4096948588154723e-05, "loss": 0.7193, "step": 1948 }, { "epoch": 1.1542789458098905, "grad_norm": 1.8040748031208247, "learning_rate": 1.4091113402268644e-05, "loss": 0.7769, "step": 1949 }, { "epoch": 1.1548711874444773, "grad_norm": 4.56346912920917, "learning_rate": 1.4085276542887128e-05, "loss": 0.7509, "step": 1950 }, { "epoch": 1.1554634290790642, "grad_norm": 1.83204117485103, "learning_rate": 1.4079438012397777e-05, "loss": 0.7835, "step": 1951 }, { "epoch": 1.1560556707136511, "grad_norm": 1.7404819759752297, "learning_rate": 1.4073597813188884e-05, "loss": 0.7317, "step": 1952 }, { "epoch": 1.156647912348238, "grad_norm": 2.5984937357601803, "learning_rate": 1.4067755947649416e-05, "loss": 0.735, "step": 1953 }, { "epoch": 1.157240153982825, "grad_norm": 1.6054589075238175, "learning_rate": 1.4061912418169024e-05, "loss": 0.7258, "step": 1954 }, { "epoch": 1.157832395617412, "grad_norm": 1.9010263960704716, "learning_rate": 1.405606722713804e-05, "loss": 0.7318, "step": 1955 }, { "epoch": 1.1584246372519988, "grad_norm": 1.859336257713254, "learning_rate": 1.405022037694748e-05, "loss": 0.721, "step": 1956 }, { "epoch": 1.1590168788865858, "grad_norm": 1.815312595979808, "learning_rate": 1.4044371869989024e-05, "loss": 0.7329, "step": 1957 }, { "epoch": 1.1596091205211727, "grad_norm": 2.126985969178438, "learning_rate": 1.4038521708655054e-05, "loss": 0.7508, "step": 1958 }, { "epoch": 1.1602013621557596, "grad_norm": 1.5244816275828186, "learning_rate": 1.4032669895338602e-05, "loss": 0.7254, "step": 1959 }, { "epoch": 1.1607936037903466, "grad_norm": 2.1945138390122834, "learning_rate": 1.40268164324334e-05, "loss": 0.7529, "step": 1960 }, { "epoch": 1.1613858454249333, "grad_norm": 2.5793485490868235, "learning_rate": 1.4020961322333833e-05, "loss": 0.7489, "step": 1961 }, { "epoch": 1.1619780870595202, "grad_norm": 1.6233997764262025, "learning_rate": 1.4015104567434981e-05, "loss": 0.7458, "step": 1962 }, { "epoch": 1.1625703286941071, "grad_norm": 2.1892983877595875, "learning_rate": 1.4009246170132575e-05, "loss": 0.7234, "step": 1963 }, { "epoch": 1.163162570328694, "grad_norm": 2.2188497921341654, "learning_rate": 1.400338613282304e-05, "loss": 0.7235, "step": 1964 }, { "epoch": 1.163754811963281, "grad_norm": 2.6410164085265233, "learning_rate": 1.3997524457903455e-05, "loss": 0.7395, "step": 1965 }, { "epoch": 1.164347053597868, "grad_norm": 2.8137504546585395, "learning_rate": 1.3991661147771574e-05, "loss": 0.725, "step": 1966 }, { "epoch": 1.1649392952324549, "grad_norm": 4.3683730385339965, "learning_rate": 1.398579620482582e-05, "loss": 0.7569, "step": 1967 }, { "epoch": 1.1655315368670418, "grad_norm": 3.000428917939229, "learning_rate": 1.3979929631465286e-05, "loss": 0.703, "step": 1968 }, { "epoch": 1.1661237785016287, "grad_norm": 1.8828332802791297, "learning_rate": 1.3974061430089731e-05, "loss": 0.7377, "step": 1969 }, { "epoch": 1.1667160201362157, "grad_norm": 1.6402504327655474, "learning_rate": 1.396819160309958e-05, "loss": 0.7423, "step": 1970 }, { "epoch": 1.1673082617708026, "grad_norm": 6.3377427586159625, "learning_rate": 1.396232015289592e-05, "loss": 0.7424, "step": 1971 }, { "epoch": 1.1679005034053893, "grad_norm": 2.2909334335027407, "learning_rate": 1.3956447081880506e-05, "loss": 0.7274, "step": 1972 }, { "epoch": 1.1684927450399762, "grad_norm": 1.7468340066260577, "learning_rate": 1.3950572392455755e-05, "loss": 0.7087, "step": 1973 }, { "epoch": 1.1690849866745632, "grad_norm": 3.9317619791484644, "learning_rate": 1.394469608702474e-05, "loss": 0.7178, "step": 1974 }, { "epoch": 1.16967722830915, "grad_norm": 1.8428851547057905, "learning_rate": 1.3938818167991208e-05, "loss": 0.7067, "step": 1975 }, { "epoch": 1.170269469943737, "grad_norm": 3.147154685015463, "learning_rate": 1.3932938637759555e-05, "loss": 0.7499, "step": 1976 }, { "epoch": 1.170861711578324, "grad_norm": 5.221103236332467, "learning_rate": 1.3927057498734837e-05, "loss": 0.7523, "step": 1977 }, { "epoch": 1.171453953212911, "grad_norm": 2.367076528325581, "learning_rate": 1.3921174753322775e-05, "loss": 0.739, "step": 1978 }, { "epoch": 1.1720461948474978, "grad_norm": 2.216413750813569, "learning_rate": 1.3915290403929738e-05, "loss": 0.732, "step": 1979 }, { "epoch": 1.1726384364820848, "grad_norm": 2.7444238440546584, "learning_rate": 1.390940445296276e-05, "loss": 0.7295, "step": 1980 }, { "epoch": 1.1732306781166717, "grad_norm": 3.5442659708991076, "learning_rate": 1.3903516902829525e-05, "loss": 0.7312, "step": 1981 }, { "epoch": 1.1738229197512586, "grad_norm": 2.4148636767736478, "learning_rate": 1.3897627755938372e-05, "loss": 0.7149, "step": 1982 }, { "epoch": 1.1744151613858453, "grad_norm": 1.686105294543067, "learning_rate": 1.389173701469829e-05, "loss": 0.7449, "step": 1983 }, { "epoch": 1.1750074030204323, "grad_norm": 2.18226411659928, "learning_rate": 1.388584468151893e-05, "loss": 0.7074, "step": 1984 }, { "epoch": 1.1755996446550192, "grad_norm": 1.6462875388280893, "learning_rate": 1.3879950758810577e-05, "loss": 0.7186, "step": 1985 }, { "epoch": 1.1761918862896061, "grad_norm": 1.9514852968058827, "learning_rate": 1.3874055248984191e-05, "loss": 0.7708, "step": 1986 }, { "epoch": 1.176784127924193, "grad_norm": 2.5287146246879058, "learning_rate": 1.3868158154451354e-05, "loss": 0.7552, "step": 1987 }, { "epoch": 1.17737636955878, "grad_norm": 5.354876267561049, "learning_rate": 1.3862259477624317e-05, "loss": 0.7096, "step": 1988 }, { "epoch": 1.177968611193367, "grad_norm": 2.080724856383468, "learning_rate": 1.3856359220915967e-05, "loss": 0.7306, "step": 1989 }, { "epoch": 1.1785608528279539, "grad_norm": 1.9442702346562672, "learning_rate": 1.3850457386739846e-05, "loss": 0.7006, "step": 1990 }, { "epoch": 1.1791530944625408, "grad_norm": 2.217412566401062, "learning_rate": 1.3844553977510127e-05, "loss": 0.7313, "step": 1991 }, { "epoch": 1.1797453360971275, "grad_norm": 2.3005541144417094, "learning_rate": 1.3838648995641645e-05, "loss": 0.7372, "step": 1992 }, { "epoch": 1.1803375777317147, "grad_norm": 1.5633590022176485, "learning_rate": 1.3832742443549865e-05, "loss": 0.7273, "step": 1993 }, { "epoch": 1.1809298193663014, "grad_norm": 2.177628183648929, "learning_rate": 1.3826834323650899e-05, "loss": 0.7433, "step": 1994 }, { "epoch": 1.1815220610008883, "grad_norm": 2.1872512165965596, "learning_rate": 1.3820924638361501e-05, "loss": 0.7884, "step": 1995 }, { "epoch": 1.1821143026354752, "grad_norm": 2.7491390741203183, "learning_rate": 1.3815013390099068e-05, "loss": 0.77, "step": 1996 }, { "epoch": 1.1827065442700622, "grad_norm": 2.2399320892471595, "learning_rate": 1.380910058128163e-05, "loss": 0.7449, "step": 1997 }, { "epoch": 1.183298785904649, "grad_norm": 1.8717444666929153, "learning_rate": 1.3803186214327852e-05, "loss": 0.7234, "step": 1998 }, { "epoch": 1.183891027539236, "grad_norm": 2.3196057634912934, "learning_rate": 1.3797270291657056e-05, "loss": 0.7311, "step": 1999 }, { "epoch": 1.184483269173823, "grad_norm": 1.5216917870082234, "learning_rate": 1.3791352815689174e-05, "loss": 0.7444, "step": 2000 }, { "epoch": 1.18507551080841, "grad_norm": 4.333885469046637, "learning_rate": 1.37854337888448e-05, "loss": 0.7638, "step": 2001 }, { "epoch": 1.1856677524429968, "grad_norm": 2.019243585845638, "learning_rate": 1.3779513213545132e-05, "loss": 0.7491, "step": 2002 }, { "epoch": 1.1862599940775835, "grad_norm": 2.577233313193749, "learning_rate": 1.3773591092212035e-05, "loss": 0.7296, "step": 2003 }, { "epoch": 1.1868522357121707, "grad_norm": 1.9298566377245834, "learning_rate": 1.3767667427267976e-05, "loss": 0.7447, "step": 2004 }, { "epoch": 1.1874444773467574, "grad_norm": 2.427978631614327, "learning_rate": 1.3761742221136078e-05, "loss": 0.7494, "step": 2005 }, { "epoch": 1.1880367189813443, "grad_norm": 5.679942493223697, "learning_rate": 1.3755815476240076e-05, "loss": 0.7124, "step": 2006 }, { "epoch": 1.1886289606159313, "grad_norm": 2.087484384962119, "learning_rate": 1.3749887195004345e-05, "loss": 0.7371, "step": 2007 }, { "epoch": 1.1892212022505182, "grad_norm": 2.329160941125, "learning_rate": 1.3743957379853885e-05, "loss": 0.7031, "step": 2008 }, { "epoch": 1.1898134438851051, "grad_norm": 2.8641168542308226, "learning_rate": 1.3738026033214323e-05, "loss": 0.7713, "step": 2009 }, { "epoch": 1.190405685519692, "grad_norm": 2.7900911750772703, "learning_rate": 1.3732093157511914e-05, "loss": 0.753, "step": 2010 }, { "epoch": 1.190997927154279, "grad_norm": 1.8472081752565688, "learning_rate": 1.372615875517354e-05, "loss": 0.7305, "step": 2011 }, { "epoch": 1.191590168788866, "grad_norm": 1.9016156353449258, "learning_rate": 1.3720222828626699e-05, "loss": 0.7529, "step": 2012 }, { "epoch": 1.1921824104234529, "grad_norm": 1.9348033443818209, "learning_rate": 1.3714285380299525e-05, "loss": 0.7213, "step": 2013 }, { "epoch": 1.1927746520580396, "grad_norm": 1.371298962671068, "learning_rate": 1.3708346412620768e-05, "loss": 0.7334, "step": 2014 }, { "epoch": 1.1933668936926265, "grad_norm": 1.4004771260654063, "learning_rate": 1.3702405928019795e-05, "loss": 0.715, "step": 2015 }, { "epoch": 1.1939591353272134, "grad_norm": 1.6163665195760184, "learning_rate": 1.3696463928926602e-05, "loss": 0.727, "step": 2016 }, { "epoch": 1.1945513769618004, "grad_norm": 1.8911450014106126, "learning_rate": 1.36905204177718e-05, "loss": 0.7406, "step": 2017 }, { "epoch": 1.1951436185963873, "grad_norm": 2.6466552053840413, "learning_rate": 1.3684575396986622e-05, "loss": 0.7064, "step": 2018 }, { "epoch": 1.1957358602309742, "grad_norm": 1.7980339830263137, "learning_rate": 1.3678628869002908e-05, "loss": 0.6718, "step": 2019 }, { "epoch": 1.1963281018655612, "grad_norm": 2.18736908567566, "learning_rate": 1.3672680836253129e-05, "loss": 0.7137, "step": 2020 }, { "epoch": 1.196920343500148, "grad_norm": 6.371103342127131, "learning_rate": 1.3666731301170364e-05, "loss": 0.7216, "step": 2021 }, { "epoch": 1.197512585134735, "grad_norm": 1.7740748237595747, "learning_rate": 1.3660780266188306e-05, "loss": 0.7277, "step": 2022 }, { "epoch": 1.198104826769322, "grad_norm": 1.437742129128349, "learning_rate": 1.3654827733741263e-05, "loss": 0.7204, "step": 2023 }, { "epoch": 1.1986970684039089, "grad_norm": 2.9002638880137597, "learning_rate": 1.3648873706264159e-05, "loss": 0.6936, "step": 2024 }, { "epoch": 1.1992893100384956, "grad_norm": 1.6559528300674338, "learning_rate": 1.3642918186192521e-05, "loss": 0.7051, "step": 2025 }, { "epoch": 1.1998815516730825, "grad_norm": 2.2195218190520465, "learning_rate": 1.363696117596249e-05, "loss": 0.7208, "step": 2026 }, { "epoch": 1.2004737933076695, "grad_norm": 1.8165964870897855, "learning_rate": 1.363100267801083e-05, "loss": 0.7576, "step": 2027 }, { "epoch": 1.2010660349422564, "grad_norm": 2.3943878965553655, "learning_rate": 1.3625042694774886e-05, "loss": 0.7357, "step": 2028 }, { "epoch": 1.2016582765768433, "grad_norm": 1.7673693572256555, "learning_rate": 1.3619081228692639e-05, "loss": 0.7445, "step": 2029 }, { "epoch": 1.2022505182114303, "grad_norm": 2.3818913908134585, "learning_rate": 1.3613118282202653e-05, "loss": 0.773, "step": 2030 }, { "epoch": 1.2028427598460172, "grad_norm": 1.3134819673961666, "learning_rate": 1.360715385774412e-05, "loss": 0.7121, "step": 2031 }, { "epoch": 1.2034350014806041, "grad_norm": 1.4209296259531388, "learning_rate": 1.3601187957756814e-05, "loss": 0.7476, "step": 2032 }, { "epoch": 1.204027243115191, "grad_norm": 1.5555865610475328, "learning_rate": 1.3595220584681132e-05, "loss": 0.7267, "step": 2033 }, { "epoch": 1.204619484749778, "grad_norm": 1.7478667815196771, "learning_rate": 1.358925174095806e-05, "loss": 0.736, "step": 2034 }, { "epoch": 1.205211726384365, "grad_norm": 1.5866637854107661, "learning_rate": 1.3583281429029197e-05, "loss": 0.734, "step": 2035 }, { "epoch": 1.2058039680189516, "grad_norm": 1.7725107135676927, "learning_rate": 1.3577309651336728e-05, "loss": 0.7305, "step": 2036 }, { "epoch": 1.2063962096535386, "grad_norm": 1.623392832320982, "learning_rate": 1.3571336410323448e-05, "loss": 0.7349, "step": 2037 }, { "epoch": 1.2069884512881255, "grad_norm": 1.6230978180494768, "learning_rate": 1.3565361708432754e-05, "loss": 0.7369, "step": 2038 }, { "epoch": 1.2075806929227124, "grad_norm": 1.3893660474733283, "learning_rate": 1.3559385548108628e-05, "loss": 0.777, "step": 2039 }, { "epoch": 1.2081729345572993, "grad_norm": 1.8002057697729996, "learning_rate": 1.3553407931795662e-05, "loss": 0.7434, "step": 2040 }, { "epoch": 1.2087651761918863, "grad_norm": 2.208673349342069, "learning_rate": 1.3547428861939031e-05, "loss": 0.7277, "step": 2041 }, { "epoch": 1.2093574178264732, "grad_norm": 2.130797939268362, "learning_rate": 1.3541448340984516e-05, "loss": 0.7517, "step": 2042 }, { "epoch": 1.2099496594610601, "grad_norm": 1.6356617427047075, "learning_rate": 1.3535466371378483e-05, "loss": 0.7003, "step": 2043 }, { "epoch": 1.210541901095647, "grad_norm": 1.41442682311907, "learning_rate": 1.3529482955567896e-05, "loss": 0.7375, "step": 2044 }, { "epoch": 1.211134142730234, "grad_norm": 1.31478939013615, "learning_rate": 1.352349809600031e-05, "loss": 0.7268, "step": 2045 }, { "epoch": 1.211726384364821, "grad_norm": 2.630006601001849, "learning_rate": 1.3517511795123864e-05, "loss": 0.7314, "step": 2046 }, { "epoch": 1.2123186259994077, "grad_norm": 2.771345319898994, "learning_rate": 1.3511524055387293e-05, "loss": 0.738, "step": 2047 }, { "epoch": 1.2129108676339946, "grad_norm": 1.4849713723280222, "learning_rate": 1.3505534879239923e-05, "loss": 0.7009, "step": 2048 }, { "epoch": 1.2135031092685815, "grad_norm": 1.6458441198708356, "learning_rate": 1.3499544269131662e-05, "loss": 0.7104, "step": 2049 }, { "epoch": 1.2140953509031684, "grad_norm": 1.7131561551779049, "learning_rate": 1.3493552227513007e-05, "loss": 0.7311, "step": 2050 }, { "epoch": 1.2146875925377554, "grad_norm": 1.3615653142107633, "learning_rate": 1.3487558756835037e-05, "loss": 0.7221, "step": 2051 }, { "epoch": 1.2152798341723423, "grad_norm": 1.2965127926167885, "learning_rate": 1.348156385954942e-05, "loss": 0.7176, "step": 2052 }, { "epoch": 1.2158720758069292, "grad_norm": 1.2844748989522476, "learning_rate": 1.347556753810841e-05, "loss": 0.7511, "step": 2053 }, { "epoch": 1.2164643174415162, "grad_norm": 1.530448235101446, "learning_rate": 1.3469569794964832e-05, "loss": 0.7388, "step": 2054 }, { "epoch": 1.217056559076103, "grad_norm": 1.958572967417298, "learning_rate": 1.346357063257211e-05, "loss": 0.706, "step": 2055 }, { "epoch": 1.21764880071069, "grad_norm": 2.5624134832322514, "learning_rate": 1.3457570053384225e-05, "loss": 0.7516, "step": 2056 }, { "epoch": 1.218241042345277, "grad_norm": 1.4244459644738547, "learning_rate": 1.3451568059855769e-05, "loss": 0.7369, "step": 2057 }, { "epoch": 1.2188332839798637, "grad_norm": 1.3769283070535534, "learning_rate": 1.3445564654441879e-05, "loss": 0.7497, "step": 2058 }, { "epoch": 1.2194255256144506, "grad_norm": 2.074590247262327, "learning_rate": 1.3439559839598296e-05, "loss": 0.7637, "step": 2059 }, { "epoch": 1.2200177672490375, "grad_norm": 2.0719993078058923, "learning_rate": 1.3433553617781318e-05, "loss": 0.7669, "step": 2060 }, { "epoch": 1.2206100088836245, "grad_norm": 2.3062477000075248, "learning_rate": 1.3427545991447838e-05, "loss": 0.7561, "step": 2061 }, { "epoch": 1.2212022505182114, "grad_norm": 1.45455392841872, "learning_rate": 1.3421536963055304e-05, "loss": 0.7921, "step": 2062 }, { "epoch": 1.2217944921527983, "grad_norm": 1.8743454895734402, "learning_rate": 1.341552653506175e-05, "loss": 0.7586, "step": 2063 }, { "epoch": 1.2223867337873853, "grad_norm": 1.6979098890708813, "learning_rate": 1.3409514709925777e-05, "loss": 0.741, "step": 2064 }, { "epoch": 1.2229789754219722, "grad_norm": 3.0439938556284045, "learning_rate": 1.3403501490106562e-05, "loss": 0.753, "step": 2065 }, { "epoch": 1.2235712170565591, "grad_norm": 1.4161385510159186, "learning_rate": 1.3397486878063852e-05, "loss": 0.7204, "step": 2066 }, { "epoch": 1.224163458691146, "grad_norm": 1.4943178672108177, "learning_rate": 1.3391470876257957e-05, "loss": 0.7726, "step": 2067 }, { "epoch": 1.224755700325733, "grad_norm": 9.496258794249528, "learning_rate": 1.3385453487149765e-05, "loss": 0.7337, "step": 2068 }, { "epoch": 1.2253479419603197, "grad_norm": 2.31673978824299, "learning_rate": 1.3379434713200719e-05, "loss": 0.7277, "step": 2069 }, { "epoch": 1.2259401835949066, "grad_norm": 4.688994620362354, "learning_rate": 1.3373414556872844e-05, "loss": 0.7385, "step": 2070 }, { "epoch": 1.2265324252294936, "grad_norm": 2.7363030213596686, "learning_rate": 1.3367393020628716e-05, "loss": 0.7473, "step": 2071 }, { "epoch": 1.2271246668640805, "grad_norm": 2.074298161390372, "learning_rate": 1.3361370106931486e-05, "loss": 0.7227, "step": 2072 }, { "epoch": 1.2277169084986674, "grad_norm": 1.790196274702212, "learning_rate": 1.3355345818244864e-05, "loss": 0.7003, "step": 2073 }, { "epoch": 1.2283091501332544, "grad_norm": 2.2647881453530636, "learning_rate": 1.3349320157033121e-05, "loss": 0.6884, "step": 2074 }, { "epoch": 1.2289013917678413, "grad_norm": 2.009616238480604, "learning_rate": 1.3343293125761095e-05, "loss": 0.739, "step": 2075 }, { "epoch": 1.2294936334024282, "grad_norm": 1.5327846716374411, "learning_rate": 1.3337264726894175e-05, "loss": 0.765, "step": 2076 }, { "epoch": 1.2300858750370152, "grad_norm": 1.5570663444419837, "learning_rate": 1.333123496289832e-05, "loss": 0.8065, "step": 2077 }, { "epoch": 1.230678116671602, "grad_norm": 9.590583102694426, "learning_rate": 1.3325203836240039e-05, "loss": 0.7578, "step": 2078 }, { "epoch": 1.231270358306189, "grad_norm": 1.945591127870425, "learning_rate": 1.3319171349386407e-05, "loss": 0.7356, "step": 2079 }, { "epoch": 1.2318625999407757, "grad_norm": 1.9607332954570817, "learning_rate": 1.3313137504805042e-05, "loss": 0.72, "step": 2080 }, { "epoch": 1.2324548415753627, "grad_norm": 2.1351386740486387, "learning_rate": 1.3307102304964137e-05, "loss": 0.7345, "step": 2081 }, { "epoch": 1.2330470832099496, "grad_norm": 2.0536248837102336, "learning_rate": 1.3301065752332415e-05, "loss": 0.7216, "step": 2082 }, { "epoch": 1.2336393248445365, "grad_norm": 2.055956373479122, "learning_rate": 1.329502784937918e-05, "loss": 0.734, "step": 2083 }, { "epoch": 1.2342315664791235, "grad_norm": 1.335091728441026, "learning_rate": 1.328898859857426e-05, "loss": 0.6955, "step": 2084 }, { "epoch": 1.2348238081137104, "grad_norm": 3.0796986220464118, "learning_rate": 1.328294800238806e-05, "loss": 0.6923, "step": 2085 }, { "epoch": 1.2354160497482973, "grad_norm": 1.4601736651102073, "learning_rate": 1.3276906063291511e-05, "loss": 0.7418, "step": 2086 }, { "epoch": 1.2360082913828843, "grad_norm": 1.9846393822643258, "learning_rate": 1.327086278375612e-05, "loss": 0.7092, "step": 2087 }, { "epoch": 1.2366005330174712, "grad_norm": 1.5795884904364481, "learning_rate": 1.3264818166253917e-05, "loss": 0.7119, "step": 2088 }, { "epoch": 1.2371927746520581, "grad_norm": 3.1635752576713663, "learning_rate": 1.3258772213257493e-05, "loss": 0.724, "step": 2089 }, { "epoch": 1.237785016286645, "grad_norm": 2.0561446054723573, "learning_rate": 1.3252724927239986e-05, "loss": 0.7008, "step": 2090 }, { "epoch": 1.2383772579212318, "grad_norm": 1.977573696753446, "learning_rate": 1.3246676310675076e-05, "loss": 0.6925, "step": 2091 }, { "epoch": 1.2389694995558187, "grad_norm": 1.459694866102705, "learning_rate": 1.3240626366036982e-05, "loss": 0.7235, "step": 2092 }, { "epoch": 1.2395617411904056, "grad_norm": 1.7170760478119698, "learning_rate": 1.3234575095800477e-05, "loss": 0.672, "step": 2093 }, { "epoch": 1.2401539828249926, "grad_norm": 2.4802114399082673, "learning_rate": 1.3228522502440868e-05, "loss": 0.7352, "step": 2094 }, { "epoch": 1.2407462244595795, "grad_norm": 2.0363567883714713, "learning_rate": 1.3222468588434007e-05, "loss": 0.7406, "step": 2095 }, { "epoch": 1.2413384660941664, "grad_norm": 1.6732345457040516, "learning_rate": 1.3216413356256286e-05, "loss": 0.7671, "step": 2096 }, { "epoch": 1.2419307077287534, "grad_norm": 2.0841489770262744, "learning_rate": 1.3210356808384634e-05, "loss": 0.7337, "step": 2097 }, { "epoch": 1.2425229493633403, "grad_norm": 2.033101647120512, "learning_rate": 1.3204298947296521e-05, "loss": 0.7288, "step": 2098 }, { "epoch": 1.2431151909979272, "grad_norm": 7.475913929909522, "learning_rate": 1.3198239775469953e-05, "loss": 0.7209, "step": 2099 }, { "epoch": 1.2437074326325142, "grad_norm": 2.093757868519759, "learning_rate": 1.319217929538347e-05, "loss": 0.7292, "step": 2100 }, { "epoch": 1.244299674267101, "grad_norm": 1.9048288578338362, "learning_rate": 1.318611750951615e-05, "loss": 0.7345, "step": 2101 }, { "epoch": 1.2448919159016878, "grad_norm": 2.054573126945815, "learning_rate": 1.3180054420347603e-05, "loss": 0.7389, "step": 2102 }, { "epoch": 1.2454841575362747, "grad_norm": 1.8305207739909872, "learning_rate": 1.317399003035798e-05, "loss": 0.7297, "step": 2103 }, { "epoch": 1.2460763991708617, "grad_norm": 1.9690959794278322, "learning_rate": 1.3167924342027947e-05, "loss": 0.7013, "step": 2104 }, { "epoch": 1.2466686408054486, "grad_norm": 2.0772164992233715, "learning_rate": 1.316185735783872e-05, "loss": 0.7077, "step": 2105 }, { "epoch": 1.2472608824400355, "grad_norm": 2.034723235828488, "learning_rate": 1.315578908027203e-05, "loss": 0.7378, "step": 2106 }, { "epoch": 1.2478531240746225, "grad_norm": 1.5869955061233847, "learning_rate": 1.3149719511810152e-05, "loss": 0.7371, "step": 2107 }, { "epoch": 1.2484453657092094, "grad_norm": 3.66716265133577, "learning_rate": 1.3143648654935875e-05, "loss": 0.7766, "step": 2108 }, { "epoch": 1.2490376073437963, "grad_norm": 2.1574547752973707, "learning_rate": 1.3137576512132524e-05, "loss": 0.7387, "step": 2109 }, { "epoch": 1.2496298489783833, "grad_norm": 2.0549597904621426, "learning_rate": 1.313150308588394e-05, "loss": 0.7119, "step": 2110 }, { "epoch": 1.25022209061297, "grad_norm": 1.766968422618495, "learning_rate": 1.3125428378674507e-05, "loss": 0.7619, "step": 2111 }, { "epoch": 1.2508143322475571, "grad_norm": 1.6504903814709435, "learning_rate": 1.311935239298911e-05, "loss": 0.7339, "step": 2112 }, { "epoch": 1.2514065738821438, "grad_norm": 2.3755573371765846, "learning_rate": 1.3113275131313179e-05, "loss": 0.7706, "step": 2113 }, { "epoch": 1.2519988155167308, "grad_norm": 6.419415940537187, "learning_rate": 1.310719659613265e-05, "loss": 0.7298, "step": 2114 }, { "epoch": 1.2525910571513177, "grad_norm": 4.777728911657151, "learning_rate": 1.3101116789933988e-05, "loss": 0.7351, "step": 2115 }, { "epoch": 1.2531832987859046, "grad_norm": 1.9830147062841872, "learning_rate": 1.3095035715204171e-05, "loss": 0.7352, "step": 2116 }, { "epoch": 1.2537755404204916, "grad_norm": 3.8969606332634212, "learning_rate": 1.3088953374430709e-05, "loss": 0.7565, "step": 2117 }, { "epoch": 1.2543677820550785, "grad_norm": 3.895878218256781, "learning_rate": 1.3082869770101613e-05, "loss": 0.737, "step": 2118 }, { "epoch": 1.2549600236896654, "grad_norm": 3.0246769337442725, "learning_rate": 1.3076784904705426e-05, "loss": 0.7413, "step": 2119 }, { "epoch": 1.2555522653242523, "grad_norm": 6.0066054184240105, "learning_rate": 1.3070698780731194e-05, "loss": 0.7304, "step": 2120 }, { "epoch": 1.2561445069588393, "grad_norm": 3.00887439367028, "learning_rate": 1.306461140066849e-05, "loss": 0.7248, "step": 2121 }, { "epoch": 1.256736748593426, "grad_norm": 2.3778171464319318, "learning_rate": 1.305852276700739e-05, "loss": 0.7199, "step": 2122 }, { "epoch": 1.2573289902280131, "grad_norm": 3.933106640277816, "learning_rate": 1.3052432882238487e-05, "loss": 0.7539, "step": 2123 }, { "epoch": 1.2579212318625999, "grad_norm": 12.537942854482242, "learning_rate": 1.304634174885289e-05, "loss": 0.7503, "step": 2124 }, { "epoch": 1.2585134734971868, "grad_norm": 3.6650070710135174, "learning_rate": 1.3040249369342215e-05, "loss": 0.7451, "step": 2125 }, { "epoch": 1.2591057151317737, "grad_norm": 3.320172986670575, "learning_rate": 1.3034155746198588e-05, "loss": 0.692, "step": 2126 }, { "epoch": 1.2596979567663606, "grad_norm": 2.414366162856636, "learning_rate": 1.3028060881914639e-05, "loss": 0.7276, "step": 2127 }, { "epoch": 1.2602901984009476, "grad_norm": 1.845695162557533, "learning_rate": 1.3021964778983513e-05, "loss": 0.7083, "step": 2128 }, { "epoch": 1.2608824400355345, "grad_norm": 3.8714818295986957, "learning_rate": 1.3015867439898862e-05, "loss": 0.7238, "step": 2129 }, { "epoch": 1.2614746816701214, "grad_norm": 3.0251438217693205, "learning_rate": 1.3009768867154834e-05, "loss": 0.7342, "step": 2130 }, { "epoch": 1.2620669233047084, "grad_norm": 3.7078771253414717, "learning_rate": 1.3003669063246096e-05, "loss": 0.7253, "step": 2131 }, { "epoch": 1.2626591649392953, "grad_norm": 1.6876561495268882, "learning_rate": 1.2997568030667802e-05, "loss": 0.7286, "step": 2132 }, { "epoch": 1.263251406573882, "grad_norm": 2.3907642362721835, "learning_rate": 1.2991465771915626e-05, "loss": 0.7579, "step": 2133 }, { "epoch": 1.2638436482084692, "grad_norm": 2.5589770747316867, "learning_rate": 1.2985362289485728e-05, "loss": 0.7411, "step": 2134 }, { "epoch": 1.2644358898430559, "grad_norm": 1.670540837036476, "learning_rate": 1.2979257585874782e-05, "loss": 0.7457, "step": 2135 }, { "epoch": 1.2650281314776428, "grad_norm": 2.1619277987058525, "learning_rate": 1.2973151663579948e-05, "loss": 0.7158, "step": 2136 }, { "epoch": 1.2656203731122297, "grad_norm": 4.927979400324849, "learning_rate": 1.2967044525098897e-05, "loss": 0.7082, "step": 2137 }, { "epoch": 1.2662126147468167, "grad_norm": 1.7102954049147032, "learning_rate": 1.296093617292979e-05, "loss": 0.731, "step": 2138 }, { "epoch": 1.2668048563814036, "grad_norm": 4.087043152351807, "learning_rate": 1.295482660957129e-05, "loss": 0.7433, "step": 2139 }, { "epoch": 1.2673970980159905, "grad_norm": 7.571384572693805, "learning_rate": 1.2948715837522542e-05, "loss": 0.7354, "step": 2140 }, { "epoch": 1.2679893396505775, "grad_norm": 2.3629194134848928, "learning_rate": 1.2942603859283207e-05, "loss": 0.7327, "step": 2141 }, { "epoch": 1.2685815812851644, "grad_norm": 2.8525406546001415, "learning_rate": 1.2936490677353422e-05, "loss": 0.7268, "step": 2142 }, { "epoch": 1.2691738229197513, "grad_norm": 14.104534903708966, "learning_rate": 1.2930376294233821e-05, "loss": 0.723, "step": 2143 }, { "epoch": 1.269766064554338, "grad_norm": 2.488060816626756, "learning_rate": 1.2924260712425536e-05, "loss": 0.7055, "step": 2144 }, { "epoch": 1.2703583061889252, "grad_norm": 2.281444459835155, "learning_rate": 1.2918143934430178e-05, "loss": 0.7358, "step": 2145 }, { "epoch": 1.270950547823512, "grad_norm": 2.863071025708633, "learning_rate": 1.2912025962749856e-05, "loss": 0.7511, "step": 2146 }, { "epoch": 1.2715427894580988, "grad_norm": 3.4445132717535145, "learning_rate": 1.2905906799887164e-05, "loss": 0.7211, "step": 2147 }, { "epoch": 1.2721350310926858, "grad_norm": 2.992321062019045, "learning_rate": 1.2899786448345186e-05, "loss": 0.7323, "step": 2148 }, { "epoch": 1.2727272727272727, "grad_norm": 2.7754937679502767, "learning_rate": 1.2893664910627486e-05, "loss": 0.6995, "step": 2149 }, { "epoch": 1.2733195143618596, "grad_norm": 4.043526562299961, "learning_rate": 1.288754218923812e-05, "loss": 0.7038, "step": 2150 }, { "epoch": 1.2739117559964466, "grad_norm": 2.601133025890244, "learning_rate": 1.2881418286681622e-05, "loss": 0.7592, "step": 2151 }, { "epoch": 1.2745039976310335, "grad_norm": 3.228479699181939, "learning_rate": 1.2875293205463018e-05, "loss": 0.7457, "step": 2152 }, { "epoch": 1.2750962392656204, "grad_norm": 5.282129958437338, "learning_rate": 1.2869166948087804e-05, "loss": 0.7341, "step": 2153 }, { "epoch": 1.2756884809002074, "grad_norm": 12.516944032609397, "learning_rate": 1.2863039517061968e-05, "loss": 0.7495, "step": 2154 }, { "epoch": 1.276280722534794, "grad_norm": 4.462130212698797, "learning_rate": 1.2856910914891973e-05, "loss": 0.7348, "step": 2155 }, { "epoch": 1.2768729641693812, "grad_norm": 4.499459533831295, "learning_rate": 1.2850781144084763e-05, "loss": 0.7377, "step": 2156 }, { "epoch": 1.277465205803968, "grad_norm": 4.366118773756862, "learning_rate": 1.284465020714776e-05, "loss": 0.7213, "step": 2157 }, { "epoch": 1.2780574474385549, "grad_norm": 5.796358264091171, "learning_rate": 1.2838518106588856e-05, "loss": 0.7296, "step": 2158 }, { "epoch": 1.2786496890731418, "grad_norm": 3.051441847538537, "learning_rate": 1.2832384844916433e-05, "loss": 0.7283, "step": 2159 }, { "epoch": 1.2792419307077287, "grad_norm": 2.3238935491908865, "learning_rate": 1.2826250424639329e-05, "loss": 0.7258, "step": 2160 }, { "epoch": 1.2798341723423157, "grad_norm": 10.583266939609347, "learning_rate": 1.282011484826688e-05, "loss": 0.6906, "step": 2161 }, { "epoch": 1.2804264139769026, "grad_norm": 1.9613859782854648, "learning_rate": 1.2813978118308872e-05, "loss": 0.7381, "step": 2162 }, { "epoch": 1.2810186556114895, "grad_norm": 1.5522438888542716, "learning_rate": 1.2807840237275578e-05, "loss": 0.7759, "step": 2163 }, { "epoch": 1.2816108972460765, "grad_norm": 2.0620688237603866, "learning_rate": 1.2801701207677731e-05, "loss": 0.7275, "step": 2164 }, { "epoch": 1.2822031388806634, "grad_norm": 4.033929396881479, "learning_rate": 1.2795561032026546e-05, "loss": 0.7161, "step": 2165 }, { "epoch": 1.28279538051525, "grad_norm": 2.2174066380945128, "learning_rate": 1.2789419712833698e-05, "loss": 0.7347, "step": 2166 }, { "epoch": 1.2833876221498373, "grad_norm": 2.989193169042572, "learning_rate": 1.2783277252611326e-05, "loss": 0.744, "step": 2167 }, { "epoch": 1.283979863784424, "grad_norm": 3.7142286905163817, "learning_rate": 1.277713365387205e-05, "loss": 0.711, "step": 2168 }, { "epoch": 1.284572105419011, "grad_norm": 5.139644730862864, "learning_rate": 1.2770988919128943e-05, "loss": 0.7386, "step": 2169 }, { "epoch": 1.2851643470535978, "grad_norm": 6.589804111626187, "learning_rate": 1.2764843050895548e-05, "loss": 0.7376, "step": 2170 }, { "epoch": 1.2857565886881848, "grad_norm": 7.048157778907931, "learning_rate": 1.275869605168587e-05, "loss": 0.726, "step": 2171 }, { "epoch": 1.2863488303227717, "grad_norm": 4.519497731902214, "learning_rate": 1.2752547924014378e-05, "loss": 0.7176, "step": 2172 }, { "epoch": 1.2869410719573586, "grad_norm": 4.560564674061755, "learning_rate": 1.2746398670396003e-05, "loss": 0.7548, "step": 2173 }, { "epoch": 1.2875333135919456, "grad_norm": 2.476033120505578, "learning_rate": 1.2740248293346134e-05, "loss": 0.6937, "step": 2174 }, { "epoch": 1.2881255552265325, "grad_norm": 1.711083220222386, "learning_rate": 1.2734096795380619e-05, "loss": 0.7451, "step": 2175 }, { "epoch": 1.2887177968611194, "grad_norm": 2.308003756437765, "learning_rate": 1.2727944179015773e-05, "loss": 0.7071, "step": 2176 }, { "epoch": 1.2893100384957061, "grad_norm": 1.9018295125948241, "learning_rate": 1.2721790446768355e-05, "loss": 0.7064, "step": 2177 }, { "epoch": 1.2899022801302933, "grad_norm": 3.3632041500633663, "learning_rate": 1.271563560115559e-05, "loss": 0.756, "step": 2178 }, { "epoch": 1.29049452176488, "grad_norm": 9.199019371454177, "learning_rate": 1.2709479644695157e-05, "loss": 0.7183, "step": 2179 }, { "epoch": 1.291086763399467, "grad_norm": 3.5389831036172223, "learning_rate": 1.2703322579905191e-05, "loss": 0.717, "step": 2180 }, { "epoch": 1.2916790050340539, "grad_norm": 7.54002229069042, "learning_rate": 1.2697164409304273e-05, "loss": 0.7707, "step": 2181 }, { "epoch": 1.2922712466686408, "grad_norm": 1.7866494430336994, "learning_rate": 1.269100513541144e-05, "loss": 0.7525, "step": 2182 }, { "epoch": 1.2928634883032277, "grad_norm": 1.4683566571838764, "learning_rate": 1.2684844760746188e-05, "loss": 0.741, "step": 2183 }, { "epoch": 1.2934557299378147, "grad_norm": 1.8765066274808877, "learning_rate": 1.2678683287828451e-05, "loss": 0.7657, "step": 2184 }, { "epoch": 1.2940479715724016, "grad_norm": 3.013366849613377, "learning_rate": 1.267252071917862e-05, "loss": 0.7016, "step": 2185 }, { "epoch": 1.2946402132069885, "grad_norm": 1.7177077320985006, "learning_rate": 1.266635705731753e-05, "loss": 0.692, "step": 2186 }, { "epoch": 1.2952324548415755, "grad_norm": 3.9718676101648813, "learning_rate": 1.266019230476647e-05, "loss": 0.6845, "step": 2187 }, { "epoch": 1.2958246964761622, "grad_norm": 7.981490810275518, "learning_rate": 1.2654026464047165e-05, "loss": 0.6967, "step": 2188 }, { "epoch": 1.2964169381107493, "grad_norm": 1.6653196455250276, "learning_rate": 1.2647859537681794e-05, "loss": 0.7264, "step": 2189 }, { "epoch": 1.297009179745336, "grad_norm": 3.229577009236302, "learning_rate": 1.2641691528192976e-05, "loss": 0.7175, "step": 2190 }, { "epoch": 1.297601421379923, "grad_norm": 1.8793833759264493, "learning_rate": 1.2635522438103775e-05, "loss": 0.7383, "step": 2191 }, { "epoch": 1.29819366301451, "grad_norm": 6.074008068005565, "learning_rate": 1.262935226993769e-05, "loss": 0.716, "step": 2192 }, { "epoch": 1.2987859046490968, "grad_norm": 3.365277587578852, "learning_rate": 1.2623181026218676e-05, "loss": 0.7079, "step": 2193 }, { "epoch": 1.2993781462836838, "grad_norm": 3.8730032760836712, "learning_rate": 1.261700870947111e-05, "loss": 0.7094, "step": 2194 }, { "epoch": 1.2999703879182707, "grad_norm": 2.4954320743950205, "learning_rate": 1.2610835322219822e-05, "loss": 0.7249, "step": 2195 }, { "epoch": 1.3005626295528576, "grad_norm": 2.328817696663847, "learning_rate": 1.2604660866990072e-05, "loss": 0.7027, "step": 2196 }, { "epoch": 1.3011548711874446, "grad_norm": 1.7371517673037984, "learning_rate": 1.259848534630756e-05, "loss": 0.7134, "step": 2197 }, { "epoch": 1.3017471128220315, "grad_norm": 2.4321062518581096, "learning_rate": 1.2592308762698422e-05, "loss": 0.7529, "step": 2198 }, { "epoch": 1.3023393544566182, "grad_norm": 2.551030855888438, "learning_rate": 1.2586131118689229e-05, "loss": 0.7395, "step": 2199 }, { "epoch": 1.3029315960912053, "grad_norm": 2.5192997738692253, "learning_rate": 1.257995241680698e-05, "loss": 0.767, "step": 2200 }, { "epoch": 1.303523837725792, "grad_norm": 1.871844395912189, "learning_rate": 1.2573772659579118e-05, "loss": 0.7456, "step": 2201 }, { "epoch": 1.304116079360379, "grad_norm": 2.1558955925829903, "learning_rate": 1.2567591849533507e-05, "loss": 0.7714, "step": 2202 }, { "epoch": 1.304708320994966, "grad_norm": 1.5443144366947106, "learning_rate": 1.2561409989198445e-05, "loss": 0.7153, "step": 2203 }, { "epoch": 1.3053005626295529, "grad_norm": 2.893987768096241, "learning_rate": 1.2555227081102663e-05, "loss": 0.7501, "step": 2204 }, { "epoch": 1.3058928042641398, "grad_norm": 2.9864438149165293, "learning_rate": 1.2549043127775317e-05, "loss": 0.7392, "step": 2205 }, { "epoch": 1.3064850458987267, "grad_norm": 2.2495043135725044, "learning_rate": 1.2542858131745997e-05, "loss": 0.7344, "step": 2206 }, { "epoch": 1.3070772875333136, "grad_norm": 1.8073617958458001, "learning_rate": 1.2536672095544705e-05, "loss": 0.739, "step": 2207 }, { "epoch": 1.3076695291679006, "grad_norm": 5.482475430635642, "learning_rate": 1.253048502170188e-05, "loss": 0.7081, "step": 2208 }, { "epoch": 1.3082617708024875, "grad_norm": 1.6369830148297229, "learning_rate": 1.2524296912748391e-05, "loss": 0.743, "step": 2209 }, { "epoch": 1.3088540124370742, "grad_norm": 2.3037478748956457, "learning_rate": 1.2518107771215511e-05, "loss": 0.7013, "step": 2210 }, { "epoch": 1.3094462540716614, "grad_norm": 2.2512579980860727, "learning_rate": 1.2511917599634957e-05, "loss": 0.7202, "step": 2211 }, { "epoch": 1.310038495706248, "grad_norm": 3.811836598474259, "learning_rate": 1.2505726400538849e-05, "loss": 0.735, "step": 2212 }, { "epoch": 1.310630737340835, "grad_norm": 3.888905349606496, "learning_rate": 1.2499534176459742e-05, "loss": 0.7649, "step": 2213 }, { "epoch": 1.311222978975422, "grad_norm": 2.6409628828714062, "learning_rate": 1.24933409299306e-05, "loss": 0.7387, "step": 2214 }, { "epoch": 1.3118152206100089, "grad_norm": 1.1361250179762734, "learning_rate": 1.2487146663484811e-05, "loss": 0.7357, "step": 2215 }, { "epoch": 1.3124074622445958, "grad_norm": 3.0048958781591812, "learning_rate": 1.2480951379656175e-05, "loss": 0.7016, "step": 2216 }, { "epoch": 1.3129997038791827, "grad_norm": 3.618139046477133, "learning_rate": 1.247475508097892e-05, "loss": 0.7279, "step": 2217 }, { "epoch": 1.3135919455137697, "grad_norm": 5.253731958808833, "learning_rate": 1.2468557769987667e-05, "loss": 0.749, "step": 2218 }, { "epoch": 1.3141841871483566, "grad_norm": 1.6888707222430752, "learning_rate": 1.2462359449217478e-05, "loss": 0.7063, "step": 2219 }, { "epoch": 1.3147764287829435, "grad_norm": 1.8167018428081732, "learning_rate": 1.2456160121203808e-05, "loss": 0.6972, "step": 2220 }, { "epoch": 1.3153686704175303, "grad_norm": 3.091651135895501, "learning_rate": 1.244995978848253e-05, "loss": 0.7003, "step": 2221 }, { "epoch": 1.3159609120521172, "grad_norm": 1.5525088892116699, "learning_rate": 1.2443758453589934e-05, "loss": 0.722, "step": 2222 }, { "epoch": 1.3165531536867041, "grad_norm": 2.7538503380997894, "learning_rate": 1.2437556119062707e-05, "loss": 0.7218, "step": 2223 }, { "epoch": 1.317145395321291, "grad_norm": 2.232065009961151, "learning_rate": 1.243135278743796e-05, "loss": 0.7338, "step": 2224 }, { "epoch": 1.317737636955878, "grad_norm": 1.4109391466878773, "learning_rate": 1.24251484612532e-05, "loss": 0.7339, "step": 2225 }, { "epoch": 1.318329878590465, "grad_norm": 5.387146562198588, "learning_rate": 1.2418943143046346e-05, "loss": 0.7163, "step": 2226 }, { "epoch": 1.3189221202250518, "grad_norm": 7.336494868885874, "learning_rate": 1.2412736835355725e-05, "loss": 0.7206, "step": 2227 }, { "epoch": 1.3195143618596388, "grad_norm": 5.47208025176641, "learning_rate": 1.2406529540720063e-05, "loss": 0.7154, "step": 2228 }, { "epoch": 1.3201066034942257, "grad_norm": 2.6695992583732946, "learning_rate": 1.2400321261678492e-05, "loss": 0.7236, "step": 2229 }, { "epoch": 1.3206988451288126, "grad_norm": 2.195132600070081, "learning_rate": 1.239411200077055e-05, "loss": 0.7574, "step": 2230 }, { "epoch": 1.3212910867633996, "grad_norm": 11.165845514321358, "learning_rate": 1.238790176053617e-05, "loss": 0.7546, "step": 2231 }, { "epoch": 1.3218833283979863, "grad_norm": 2.9312352374249864, "learning_rate": 1.2381690543515692e-05, "loss": 0.7301, "step": 2232 }, { "epoch": 1.3224755700325732, "grad_norm": 1.6545166318992972, "learning_rate": 1.2375478352249854e-05, "loss": 0.7211, "step": 2233 }, { "epoch": 1.3230678116671601, "grad_norm": 2.1861482000851518, "learning_rate": 1.236926518927979e-05, "loss": 0.6936, "step": 2234 }, { "epoch": 1.323660053301747, "grad_norm": 1.8651361150526538, "learning_rate": 1.2363051057147036e-05, "loss": 0.7116, "step": 2235 }, { "epoch": 1.324252294936334, "grad_norm": 2.1494204464514923, "learning_rate": 1.2356835958393513e-05, "loss": 0.746, "step": 2236 }, { "epoch": 1.324844536570921, "grad_norm": 1.5645161950911375, "learning_rate": 1.2350619895561557e-05, "loss": 0.7028, "step": 2237 }, { "epoch": 1.3254367782055079, "grad_norm": 1.7327580503628912, "learning_rate": 1.2344402871193876e-05, "loss": 0.7459, "step": 2238 }, { "epoch": 1.3260290198400948, "grad_norm": 2.9366360287029085, "learning_rate": 1.2338184887833595e-05, "loss": 0.7929, "step": 2239 }, { "epoch": 1.3266212614746817, "grad_norm": 2.234776359408874, "learning_rate": 1.2331965948024209e-05, "loss": 0.741, "step": 2240 }, { "epoch": 1.3272135031092684, "grad_norm": 16.615421551346895, "learning_rate": 1.232574605430962e-05, "loss": 0.7348, "step": 2241 }, { "epoch": 1.3278057447438556, "grad_norm": 1.836805999142044, "learning_rate": 1.2319525209234109e-05, "loss": 0.7431, "step": 2242 }, { "epoch": 1.3283979863784423, "grad_norm": 3.6059103156218217, "learning_rate": 1.2313303415342358e-05, "loss": 0.702, "step": 2243 }, { "epoch": 1.3289902280130292, "grad_norm": 2.494436178191213, "learning_rate": 1.230708067517942e-05, "loss": 0.7407, "step": 2244 }, { "epoch": 1.3295824696476162, "grad_norm": 3.946731442600297, "learning_rate": 1.230085699129076e-05, "loss": 0.6811, "step": 2245 }, { "epoch": 1.330174711282203, "grad_norm": 4.20508246282007, "learning_rate": 1.2294632366222201e-05, "loss": 0.7007, "step": 2246 }, { "epoch": 1.33076695291679, "grad_norm": 6.980679537862269, "learning_rate": 1.2288406802519974e-05, "loss": 0.6982, "step": 2247 }, { "epoch": 1.331359194551377, "grad_norm": 2.0063581908355195, "learning_rate": 1.2282180302730683e-05, "loss": 0.7171, "step": 2248 }, { "epoch": 1.331951436185964, "grad_norm": 2.812205651172888, "learning_rate": 1.2275952869401311e-05, "loss": 0.7062, "step": 2249 }, { "epoch": 1.3325436778205508, "grad_norm": 4.260513166725431, "learning_rate": 1.2269724505079234e-05, "loss": 0.7677, "step": 2250 }, { "epoch": 1.3331359194551378, "grad_norm": 2.395786745837272, "learning_rate": 1.2263495212312202e-05, "loss": 0.7321, "step": 2251 }, { "epoch": 1.3337281610897245, "grad_norm": 3.4523356684471933, "learning_rate": 1.2257264993648345e-05, "loss": 0.7213, "step": 2252 }, { "epoch": 1.3343204027243116, "grad_norm": 1.6680835833538643, "learning_rate": 1.2251033851636174e-05, "loss": 0.7316, "step": 2253 }, { "epoch": 1.3349126443588983, "grad_norm": 4.066851683497322, "learning_rate": 1.2244801788824577e-05, "loss": 0.7494, "step": 2254 }, { "epoch": 1.3355048859934853, "grad_norm": 2.2024623170572575, "learning_rate": 1.2238568807762813e-05, "loss": 0.7078, "step": 2255 }, { "epoch": 1.3360971276280722, "grad_norm": 4.096509077552267, "learning_rate": 1.223233491100053e-05, "loss": 0.6927, "step": 2256 }, { "epoch": 1.3366893692626591, "grad_norm": 1.7054951183155915, "learning_rate": 1.2226100101087737e-05, "loss": 0.744, "step": 2257 }, { "epoch": 1.337281610897246, "grad_norm": 2.256752220268974, "learning_rate": 1.2219864380574822e-05, "loss": 0.7134, "step": 2258 }, { "epoch": 1.337873852531833, "grad_norm": 2.6221532584973217, "learning_rate": 1.2213627752012547e-05, "loss": 0.7147, "step": 2259 }, { "epoch": 1.33846609416642, "grad_norm": 3.3038467185814953, "learning_rate": 1.2207390217952044e-05, "loss": 0.7271, "step": 2260 }, { "epoch": 1.3390583358010069, "grad_norm": 1.9956526626820097, "learning_rate": 1.2201151780944813e-05, "loss": 0.7499, "step": 2261 }, { "epoch": 1.3396505774355938, "grad_norm": 2.58029133331588, "learning_rate": 1.2194912443542728e-05, "loss": 0.7362, "step": 2262 }, { "epoch": 1.3402428190701805, "grad_norm": 1.3093228436379185, "learning_rate": 1.2188672208298028e-05, "loss": 0.7464, "step": 2263 }, { "epoch": 1.3408350607047677, "grad_norm": 2.2716810173689943, "learning_rate": 1.2182431077763317e-05, "loss": 0.7151, "step": 2264 }, { "epoch": 1.3414273023393544, "grad_norm": 3.7636447060204112, "learning_rate": 1.2176189054491576e-05, "loss": 0.7064, "step": 2265 }, { "epoch": 1.3420195439739413, "grad_norm": 1.6344445874345244, "learning_rate": 1.2169946141036133e-05, "loss": 0.7248, "step": 2266 }, { "epoch": 1.3426117856085282, "grad_norm": 1.660994198119721, "learning_rate": 1.2163702339950702e-05, "loss": 0.7344, "step": 2267 }, { "epoch": 1.3432040272431152, "grad_norm": 2.3481480865911912, "learning_rate": 1.2157457653789337e-05, "loss": 0.7358, "step": 2268 }, { "epoch": 1.343796268877702, "grad_norm": 2.085597969173187, "learning_rate": 1.2151212085106478e-05, "loss": 0.7227, "step": 2269 }, { "epoch": 1.344388510512289, "grad_norm": 1.6707958861915149, "learning_rate": 1.2144965636456903e-05, "loss": 0.7298, "step": 2270 }, { "epoch": 1.344980752146876, "grad_norm": 2.7788857483382206, "learning_rate": 1.213871831039577e-05, "loss": 0.6869, "step": 2271 }, { "epoch": 1.345572993781463, "grad_norm": 2.382424211273845, "learning_rate": 1.2132470109478577e-05, "loss": 0.6929, "step": 2272 }, { "epoch": 1.3461652354160498, "grad_norm": 3.2307082039534047, "learning_rate": 1.21262210362612e-05, "loss": 0.716, "step": 2273 }, { "epoch": 1.3467574770506365, "grad_norm": 2.304898917214821, "learning_rate": 1.2119971093299852e-05, "loss": 0.7246, "step": 2274 }, { "epoch": 1.3473497186852237, "grad_norm": 4.297876675540321, "learning_rate": 1.2113720283151115e-05, "loss": 0.7579, "step": 2275 }, { "epoch": 1.3479419603198104, "grad_norm": 2.19405196855775, "learning_rate": 1.2107468608371924e-05, "loss": 0.7604, "step": 2276 }, { "epoch": 1.3485342019543973, "grad_norm": 2.236962795916548, "learning_rate": 1.2101216071519561e-05, "loss": 0.7458, "step": 2277 }, { "epoch": 1.3491264435889843, "grad_norm": 3.9633695711684975, "learning_rate": 1.209496267515167e-05, "loss": 0.7194, "step": 2278 }, { "epoch": 1.3497186852235712, "grad_norm": 2.4068817921340333, "learning_rate": 1.2088708421826238e-05, "loss": 0.7136, "step": 2279 }, { "epoch": 1.3503109268581581, "grad_norm": 1.5247334800083856, "learning_rate": 1.2082453314101607e-05, "loss": 0.6947, "step": 2280 }, { "epoch": 1.350903168492745, "grad_norm": 1.673058636797337, "learning_rate": 1.2076197354536472e-05, "loss": 0.7307, "step": 2281 }, { "epoch": 1.351495410127332, "grad_norm": 2.7089039748271495, "learning_rate": 1.2069940545689867e-05, "loss": 0.7495, "step": 2282 }, { "epoch": 1.352087651761919, "grad_norm": 1.4724209313434784, "learning_rate": 1.2063682890121178e-05, "loss": 0.7076, "step": 2283 }, { "epoch": 1.3526798933965059, "grad_norm": 1.4347383550436956, "learning_rate": 1.2057424390390141e-05, "loss": 0.7122, "step": 2284 }, { "epoch": 1.3532721350310926, "grad_norm": 1.6233887203509207, "learning_rate": 1.2051165049056836e-05, "loss": 0.7273, "step": 2285 }, { "epoch": 1.3538643766656797, "grad_norm": 2.2293249045490504, "learning_rate": 1.2044904868681684e-05, "loss": 0.7501, "step": 2286 }, { "epoch": 1.3544566183002664, "grad_norm": 2.3431634966622115, "learning_rate": 1.2038643851825449e-05, "loss": 0.7621, "step": 2287 }, { "epoch": 1.3550488599348534, "grad_norm": 2.132037815874192, "learning_rate": 1.203238200104924e-05, "loss": 0.7268, "step": 2288 }, { "epoch": 1.3556411015694403, "grad_norm": 2.230932140756579, "learning_rate": 1.2026119318914507e-05, "loss": 0.735, "step": 2289 }, { "epoch": 1.3562333432040272, "grad_norm": 2.313542593223687, "learning_rate": 1.2019855807983036e-05, "loss": 0.7422, "step": 2290 }, { "epoch": 1.3568255848386142, "grad_norm": 1.9178912189104353, "learning_rate": 1.201359147081696e-05, "loss": 0.73, "step": 2291 }, { "epoch": 1.357417826473201, "grad_norm": 1.5513535945386705, "learning_rate": 1.200732630997874e-05, "loss": 0.7273, "step": 2292 }, { "epoch": 1.358010068107788, "grad_norm": 2.3144421995406512, "learning_rate": 1.2001060328031185e-05, "loss": 0.7184, "step": 2293 }, { "epoch": 1.358602309742375, "grad_norm": 1.8962622817640602, "learning_rate": 1.1994793527537427e-05, "loss": 0.7409, "step": 2294 }, { "epoch": 1.3591945513769619, "grad_norm": 1.9723792893075207, "learning_rate": 1.198852591106095e-05, "loss": 0.7409, "step": 2295 }, { "epoch": 1.3597867930115486, "grad_norm": 1.6685845910473676, "learning_rate": 1.1982257481165547e-05, "loss": 0.6975, "step": 2296 }, { "epoch": 1.3603790346461357, "grad_norm": 1.8774532039410945, "learning_rate": 1.1975988240415373e-05, "loss": 0.7433, "step": 2297 }, { "epoch": 1.3609712762807225, "grad_norm": 4.187478803679397, "learning_rate": 1.1969718191374888e-05, "loss": 0.7295, "step": 2298 }, { "epoch": 1.3615635179153094, "grad_norm": 2.2037953249975315, "learning_rate": 1.1963447336608906e-05, "loss": 0.7087, "step": 2299 }, { "epoch": 1.3621557595498963, "grad_norm": 3.372329419689456, "learning_rate": 1.1957175678682548e-05, "loss": 0.737, "step": 2300 }, { "epoch": 1.3627480011844832, "grad_norm": 1.7609785926431862, "learning_rate": 1.1950903220161286e-05, "loss": 0.7049, "step": 2301 }, { "epoch": 1.3633402428190702, "grad_norm": 10.575058480914846, "learning_rate": 1.1944629963610897e-05, "loss": 0.732, "step": 2302 }, { "epoch": 1.3639324844536571, "grad_norm": 1.7154442069759137, "learning_rate": 1.1938355911597503e-05, "loss": 0.7155, "step": 2303 }, { "epoch": 1.364524726088244, "grad_norm": 2.2620417290126946, "learning_rate": 1.1932081066687544e-05, "loss": 0.7689, "step": 2304 }, { "epoch": 1.365116967722831, "grad_norm": 5.085468174057115, "learning_rate": 1.1925805431447779e-05, "loss": 0.7161, "step": 2305 }, { "epoch": 1.365709209357418, "grad_norm": 2.4608886800316285, "learning_rate": 1.1919529008445302e-05, "loss": 0.7487, "step": 2306 }, { "epoch": 1.3663014509920046, "grad_norm": 2.196016033444546, "learning_rate": 1.1913251800247515e-05, "loss": 0.7473, "step": 2307 }, { "epoch": 1.3668936926265918, "grad_norm": 2.3351921735739136, "learning_rate": 1.1906973809422163e-05, "loss": 0.7114, "step": 2308 }, { "epoch": 1.3674859342611785, "grad_norm": 3.712094735632108, "learning_rate": 1.1900695038537283e-05, "loss": 0.7126, "step": 2309 }, { "epoch": 1.3680781758957654, "grad_norm": 3.0445067803167434, "learning_rate": 1.1894415490161253e-05, "loss": 0.7616, "step": 2310 }, { "epoch": 1.3686704175303523, "grad_norm": 5.092443503760028, "learning_rate": 1.1888135166862756e-05, "loss": 0.7598, "step": 2311 }, { "epoch": 1.3692626591649393, "grad_norm": 1.7515743364833825, "learning_rate": 1.1881854071210805e-05, "loss": 0.7519, "step": 2312 }, { "epoch": 1.3698549007995262, "grad_norm": 1.4595659443345792, "learning_rate": 1.1875572205774712e-05, "loss": 0.6944, "step": 2313 }, { "epoch": 1.3704471424341131, "grad_norm": 2.0276065761611566, "learning_rate": 1.1869289573124125e-05, "loss": 0.7117, "step": 2314 }, { "epoch": 1.3710393840687, "grad_norm": 1.6385422571466461, "learning_rate": 1.1863006175828984e-05, "loss": 0.7348, "step": 2315 }, { "epoch": 1.371631625703287, "grad_norm": 1.2986389722977396, "learning_rate": 1.1856722016459554e-05, "loss": 0.7225, "step": 2316 }, { "epoch": 1.372223867337874, "grad_norm": 2.5907128138808284, "learning_rate": 1.1850437097586412e-05, "loss": 0.7053, "step": 2317 }, { "epoch": 1.3728161089724606, "grad_norm": 1.72659341320089, "learning_rate": 1.1844151421780442e-05, "loss": 0.7747, "step": 2318 }, { "epoch": 1.3734083506070478, "grad_norm": 2.3936636379326326, "learning_rate": 1.1837864991612839e-05, "loss": 0.746, "step": 2319 }, { "epoch": 1.3740005922416345, "grad_norm": 3.268948302001786, "learning_rate": 1.1831577809655105e-05, "loss": 0.7287, "step": 2320 }, { "epoch": 1.3745928338762214, "grad_norm": 1.8358669402385104, "learning_rate": 1.1825289878479054e-05, "loss": 0.7323, "step": 2321 }, { "epoch": 1.3751850755108084, "grad_norm": 6.659136666043565, "learning_rate": 1.18190012006568e-05, "loss": 0.781, "step": 2322 }, { "epoch": 1.3757773171453953, "grad_norm": 1.8103702241279684, "learning_rate": 1.1812711778760768e-05, "loss": 0.7562, "step": 2323 }, { "epoch": 1.3763695587799822, "grad_norm": 3.4205148227314486, "learning_rate": 1.1806421615363685e-05, "loss": 0.7057, "step": 2324 }, { "epoch": 1.3769618004145692, "grad_norm": 5.3299975994261874, "learning_rate": 1.1800130713038582e-05, "loss": 0.7554, "step": 2325 }, { "epoch": 1.377554042049156, "grad_norm": 1.5067357582677707, "learning_rate": 1.179383907435879e-05, "loss": 0.728, "step": 2326 }, { "epoch": 1.378146283683743, "grad_norm": 1.5937327086086364, "learning_rate": 1.1787546701897947e-05, "loss": 0.6923, "step": 2327 }, { "epoch": 1.37873852531833, "grad_norm": 2.9084759352815137, "learning_rate": 1.1781253598229982e-05, "loss": 0.6867, "step": 2328 }, { "epoch": 1.3793307669529167, "grad_norm": 1.8065003892691773, "learning_rate": 1.177495976592913e-05, "loss": 0.7264, "step": 2329 }, { "epoch": 1.3799230085875038, "grad_norm": 1.374778675210592, "learning_rate": 1.1768665207569922e-05, "loss": 0.763, "step": 2330 }, { "epoch": 1.3805152502220905, "grad_norm": 5.135815860167982, "learning_rate": 1.176236992572719e-05, "loss": 0.7287, "step": 2331 }, { "epoch": 1.3811074918566775, "grad_norm": 1.9451779116322265, "learning_rate": 1.1756073922976056e-05, "loss": 0.7603, "step": 2332 }, { "epoch": 1.3816997334912644, "grad_norm": 4.102609143725733, "learning_rate": 1.1749777201891937e-05, "loss": 0.6891, "step": 2333 }, { "epoch": 1.3822919751258513, "grad_norm": 2.703163169605834, "learning_rate": 1.1743479765050549e-05, "loss": 0.7238, "step": 2334 }, { "epoch": 1.3828842167604383, "grad_norm": 1.4161656206435274, "learning_rate": 1.1737181615027894e-05, "loss": 0.7072, "step": 2335 }, { "epoch": 1.3834764583950252, "grad_norm": 2.111623863485621, "learning_rate": 1.1730882754400274e-05, "loss": 0.6903, "step": 2336 }, { "epoch": 1.3840687000296121, "grad_norm": 1.9893098347865092, "learning_rate": 1.172458318574427e-05, "loss": 0.7353, "step": 2337 }, { "epoch": 1.384660941664199, "grad_norm": 3.119953828491785, "learning_rate": 1.1718282911636774e-05, "loss": 0.7126, "step": 2338 }, { "epoch": 1.385253183298786, "grad_norm": 2.0807212215419937, "learning_rate": 1.1711981934654937e-05, "loss": 0.7486, "step": 2339 }, { "epoch": 1.3858454249333727, "grad_norm": 2.1409029627127025, "learning_rate": 1.1705680257376224e-05, "loss": 0.736, "step": 2340 }, { "epoch": 1.3864376665679599, "grad_norm": 1.7678639437104227, "learning_rate": 1.1699377882378367e-05, "loss": 0.6811, "step": 2341 }, { "epoch": 1.3870299082025466, "grad_norm": 1.1969173608840782, "learning_rate": 1.1693074812239397e-05, "loss": 0.7072, "step": 2342 }, { "epoch": 1.3876221498371335, "grad_norm": 1.7602686350385015, "learning_rate": 1.1686771049537621e-05, "loss": 0.6984, "step": 2343 }, { "epoch": 1.3882143914717204, "grad_norm": 1.4360466219925514, "learning_rate": 1.1680466596851635e-05, "loss": 0.7065, "step": 2344 }, { "epoch": 1.3888066331063074, "grad_norm": 4.263733785806742, "learning_rate": 1.1674161456760314e-05, "loss": 0.7292, "step": 2345 }, { "epoch": 1.3893988747408943, "grad_norm": 2.471335440136607, "learning_rate": 1.1667855631842815e-05, "loss": 0.7671, "step": 2346 }, { "epoch": 1.3899911163754812, "grad_norm": 4.172593773758679, "learning_rate": 1.1661549124678573e-05, "loss": 0.751, "step": 2347 }, { "epoch": 1.3905833580100682, "grad_norm": 2.5493693541099667, "learning_rate": 1.1655241937847305e-05, "loss": 0.7763, "step": 2348 }, { "epoch": 1.391175599644655, "grad_norm": 12.734269954324368, "learning_rate": 1.1648934073929008e-05, "loss": 0.7433, "step": 2349 }, { "epoch": 1.391767841279242, "grad_norm": 2.0172453728072015, "learning_rate": 1.164262553550395e-05, "loss": 0.7186, "step": 2350 }, { "epoch": 1.3923600829138287, "grad_norm": 1.6347819754535515, "learning_rate": 1.1636316325152678e-05, "loss": 0.7313, "step": 2351 }, { "epoch": 1.3929523245484159, "grad_norm": 1.6197215465398926, "learning_rate": 1.1630006445456015e-05, "loss": 0.7162, "step": 2352 }, { "epoch": 1.3935445661830026, "grad_norm": 1.3419132909074647, "learning_rate": 1.1623695898995057e-05, "loss": 0.6812, "step": 2353 }, { "epoch": 1.3941368078175895, "grad_norm": 1.999373727061114, "learning_rate": 1.161738468835117e-05, "loss": 0.6904, "step": 2354 }, { "epoch": 1.3947290494521765, "grad_norm": 2.0145229640481106, "learning_rate": 1.1611072816105995e-05, "loss": 0.7258, "step": 2355 }, { "epoch": 1.3953212910867634, "grad_norm": 1.6691310278707019, "learning_rate": 1.1604760284841446e-05, "loss": 0.7245, "step": 2356 }, { "epoch": 1.3959135327213503, "grad_norm": 4.128799120066488, "learning_rate": 1.1598447097139698e-05, "loss": 0.7285, "step": 2357 }, { "epoch": 1.3965057743559373, "grad_norm": 2.0773835779838663, "learning_rate": 1.1592133255583204e-05, "loss": 0.7516, "step": 2358 }, { "epoch": 1.3970980159905242, "grad_norm": 1.3942800893003757, "learning_rate": 1.1585818762754678e-05, "loss": 0.731, "step": 2359 }, { "epoch": 1.3976902576251111, "grad_norm": 1.8775327674483524, "learning_rate": 1.1579503621237102e-05, "loss": 0.7454, "step": 2360 }, { "epoch": 1.398282499259698, "grad_norm": 3.027896060244714, "learning_rate": 1.1573187833613723e-05, "loss": 0.7275, "step": 2361 }, { "epoch": 1.3988747408942848, "grad_norm": 1.4716030596428373, "learning_rate": 1.156687140246806e-05, "loss": 0.7367, "step": 2362 }, { "epoch": 1.399466982528872, "grad_norm": 1.7820811493950572, "learning_rate": 1.1560554330383881e-05, "loss": 0.7146, "step": 2363 }, { "epoch": 1.4000592241634586, "grad_norm": 2.7037191130979323, "learning_rate": 1.1554236619945229e-05, "loss": 0.7242, "step": 2364 }, { "epoch": 1.4006514657980456, "grad_norm": 1.7688945832299963, "learning_rate": 1.1547918273736397e-05, "loss": 0.7575, "step": 2365 }, { "epoch": 1.4012437074326325, "grad_norm": 1.7948443959599745, "learning_rate": 1.1541599294341952e-05, "loss": 0.7385, "step": 2366 }, { "epoch": 1.4018359490672194, "grad_norm": 2.5494232660163414, "learning_rate": 1.1535279684346702e-05, "loss": 0.7321, "step": 2367 }, { "epoch": 1.4024281907018064, "grad_norm": 1.131792892745796, "learning_rate": 1.1528959446335735e-05, "loss": 0.7251, "step": 2368 }, { "epoch": 1.4030204323363933, "grad_norm": 1.7242196863607848, "learning_rate": 1.1522638582894372e-05, "loss": 0.7357, "step": 2369 }, { "epoch": 1.4036126739709802, "grad_norm": 1.7494101461566682, "learning_rate": 1.1516317096608207e-05, "loss": 0.7277, "step": 2370 }, { "epoch": 1.4042049156055671, "grad_norm": 2.45681555550249, "learning_rate": 1.1509994990063085e-05, "loss": 0.6967, "step": 2371 }, { "epoch": 1.404797157240154, "grad_norm": 2.9094052043800223, "learning_rate": 1.1503672265845098e-05, "loss": 0.7258, "step": 2372 }, { "epoch": 1.4053893988747408, "grad_norm": 2.4462949935085616, "learning_rate": 1.1497348926540602e-05, "loss": 0.7472, "step": 2373 }, { "epoch": 1.4059816405093277, "grad_norm": 1.5009612756951904, "learning_rate": 1.1491024974736191e-05, "loss": 0.6824, "step": 2374 }, { "epoch": 1.4065738821439147, "grad_norm": 3.095173697672372, "learning_rate": 1.1484700413018724e-05, "loss": 0.7173, "step": 2375 }, { "epoch": 1.4071661237785016, "grad_norm": 2.4512057822789606, "learning_rate": 1.1478375243975298e-05, "loss": 0.7297, "step": 2376 }, { "epoch": 1.4077583654130885, "grad_norm": 1.9421393264777882, "learning_rate": 1.1472049470193263e-05, "loss": 0.7504, "step": 2377 }, { "epoch": 1.4083506070476755, "grad_norm": 2.4342391043313047, "learning_rate": 1.1465723094260219e-05, "loss": 0.7148, "step": 2378 }, { "epoch": 1.4089428486822624, "grad_norm": 1.2678552133839875, "learning_rate": 1.1459396118764007e-05, "loss": 0.6937, "step": 2379 }, { "epoch": 1.4095350903168493, "grad_norm": 1.6681069042211958, "learning_rate": 1.1453068546292718e-05, "loss": 0.7413, "step": 2380 }, { "epoch": 1.4101273319514362, "grad_norm": 2.0274279135928706, "learning_rate": 1.1446740379434681e-05, "loss": 0.7516, "step": 2381 }, { "epoch": 1.4107195735860232, "grad_norm": 1.4001394906264233, "learning_rate": 1.1440411620778478e-05, "loss": 0.7393, "step": 2382 }, { "epoch": 1.41131181522061, "grad_norm": 3.0338701312366445, "learning_rate": 1.1434082272912923e-05, "loss": 0.7396, "step": 2383 }, { "epoch": 1.4119040568551968, "grad_norm": 1.786481375473841, "learning_rate": 1.1427752338427075e-05, "loss": 0.7079, "step": 2384 }, { "epoch": 1.4124962984897838, "grad_norm": 1.1712156976339714, "learning_rate": 1.1421421819910235e-05, "loss": 0.7258, "step": 2385 }, { "epoch": 1.4130885401243707, "grad_norm": 2.996800157879306, "learning_rate": 1.141509071995194e-05, "loss": 0.7282, "step": 2386 }, { "epoch": 1.4136807817589576, "grad_norm": 1.3892970824112951, "learning_rate": 1.1408759041141963e-05, "loss": 0.7463, "step": 2387 }, { "epoch": 1.4142730233935445, "grad_norm": 1.6106966314025877, "learning_rate": 1.1402426786070326e-05, "loss": 0.7159, "step": 2388 }, { "epoch": 1.4148652650281315, "grad_norm": 2.391973281082433, "learning_rate": 1.1396093957327266e-05, "loss": 0.6767, "step": 2389 }, { "epoch": 1.4154575066627184, "grad_norm": 1.8064771701538858, "learning_rate": 1.1389760557503275e-05, "loss": 0.7618, "step": 2390 }, { "epoch": 1.4160497482973053, "grad_norm": 1.9436742635325912, "learning_rate": 1.1383426589189062e-05, "loss": 0.7507, "step": 2391 }, { "epoch": 1.4166419899318923, "grad_norm": 1.4722158504330858, "learning_rate": 1.1377092054975586e-05, "loss": 0.7029, "step": 2392 }, { "epoch": 1.417234231566479, "grad_norm": 1.2331100621292894, "learning_rate": 1.1370756957454015e-05, "loss": 0.6877, "step": 2393 }, { "epoch": 1.4178264732010661, "grad_norm": 1.8467647418967628, "learning_rate": 1.1364421299215773e-05, "loss": 0.734, "step": 2394 }, { "epoch": 1.4184187148356528, "grad_norm": 2.0082819089866293, "learning_rate": 1.135808508285249e-05, "loss": 0.719, "step": 2395 }, { "epoch": 1.4190109564702398, "grad_norm": 1.461710287319451, "learning_rate": 1.135174831095604e-05, "loss": 0.7138, "step": 2396 }, { "epoch": 1.4196031981048267, "grad_norm": 1.519799568496123, "learning_rate": 1.134541098611852e-05, "loss": 0.7084, "step": 2397 }, { "epoch": 1.4201954397394136, "grad_norm": 2.315400648418732, "learning_rate": 1.133907311093225e-05, "loss": 0.7295, "step": 2398 }, { "epoch": 1.4207876813740006, "grad_norm": 1.8933519527297487, "learning_rate": 1.133273468798978e-05, "loss": 0.7078, "step": 2399 }, { "epoch": 1.4213799230085875, "grad_norm": 1.91221104798189, "learning_rate": 1.1326395719883876e-05, "loss": 0.6819, "step": 2400 }, { "epoch": 1.4219721646431744, "grad_norm": 1.979878107597815, "learning_rate": 1.1320056209207538e-05, "loss": 0.71, "step": 2401 }, { "epoch": 1.4225644062777614, "grad_norm": 2.977882494720392, "learning_rate": 1.1313716158553978e-05, "loss": 0.7408, "step": 2402 }, { "epoch": 1.4231566479123483, "grad_norm": 1.6024476014527083, "learning_rate": 1.1307375570516637e-05, "loss": 0.7167, "step": 2403 }, { "epoch": 1.423748889546935, "grad_norm": 1.714605953833717, "learning_rate": 1.130103444768917e-05, "loss": 0.7197, "step": 2404 }, { "epoch": 1.4243411311815222, "grad_norm": 2.5811940978711068, "learning_rate": 1.1294692792665452e-05, "loss": 0.6986, "step": 2405 }, { "epoch": 1.4249333728161089, "grad_norm": 5.955582148467331, "learning_rate": 1.1288350608039577e-05, "loss": 0.7285, "step": 2406 }, { "epoch": 1.4255256144506958, "grad_norm": 1.580417338453988, "learning_rate": 1.1282007896405858e-05, "loss": 0.7571, "step": 2407 }, { "epoch": 1.4261178560852827, "grad_norm": 1.7271693274308317, "learning_rate": 1.1275664660358818e-05, "loss": 0.7756, "step": 2408 }, { "epoch": 1.4267100977198697, "grad_norm": 1.5963788064488735, "learning_rate": 1.1269320902493199e-05, "loss": 0.7245, "step": 2409 }, { "epoch": 1.4273023393544566, "grad_norm": 2.2232952704603828, "learning_rate": 1.1262976625403954e-05, "loss": 0.7156, "step": 2410 }, { "epoch": 1.4278945809890435, "grad_norm": 4.047457885454327, "learning_rate": 1.1256631831686245e-05, "loss": 0.7397, "step": 2411 }, { "epoch": 1.4284868226236305, "grad_norm": 2.079661525227277, "learning_rate": 1.1250286523935456e-05, "loss": 0.6906, "step": 2412 }, { "epoch": 1.4290790642582174, "grad_norm": 2.3170134187542666, "learning_rate": 1.1243940704747169e-05, "loss": 0.7696, "step": 2413 }, { "epoch": 1.4296713058928043, "grad_norm": 4.447126564702576, "learning_rate": 1.1237594376717188e-05, "loss": 0.7215, "step": 2414 }, { "epoch": 1.430263547527391, "grad_norm": 1.487487996677641, "learning_rate": 1.1231247542441507e-05, "loss": 0.7387, "step": 2415 }, { "epoch": 1.4308557891619782, "grad_norm": 2.7450360360522925, "learning_rate": 1.122490020451635e-05, "loss": 0.7184, "step": 2416 }, { "epoch": 1.431448030796565, "grad_norm": 4.220383960594748, "learning_rate": 1.1218552365538129e-05, "loss": 0.7528, "step": 2417 }, { "epoch": 1.4320402724311518, "grad_norm": 8.716959325200545, "learning_rate": 1.1212204028103469e-05, "loss": 0.7771, "step": 2418 }, { "epoch": 1.4326325140657388, "grad_norm": 1.6083934310732948, "learning_rate": 1.1205855194809191e-05, "loss": 0.7349, "step": 2419 }, { "epoch": 1.4332247557003257, "grad_norm": 2.098687005967919, "learning_rate": 1.1199505868252336e-05, "loss": 0.7355, "step": 2420 }, { "epoch": 1.4338169973349126, "grad_norm": 1.6917431368313898, "learning_rate": 1.1193156051030128e-05, "loss": 0.6713, "step": 2421 }, { "epoch": 1.4344092389694996, "grad_norm": 1.9121827759052907, "learning_rate": 1.1186805745739999e-05, "loss": 0.7065, "step": 2422 }, { "epoch": 1.4350014806040865, "grad_norm": 1.7303129056373097, "learning_rate": 1.1180454954979583e-05, "loss": 0.6621, "step": 2423 }, { "epoch": 1.4355937222386734, "grad_norm": 1.4653359584564212, "learning_rate": 1.1174103681346711e-05, "loss": 0.7288, "step": 2424 }, { "epoch": 1.4361859638732604, "grad_norm": 1.6242879043550942, "learning_rate": 1.1167751927439407e-05, "loss": 0.7364, "step": 2425 }, { "epoch": 1.436778205507847, "grad_norm": 1.879062950337473, "learning_rate": 1.1161399695855903e-05, "loss": 0.7296, "step": 2426 }, { "epoch": 1.4373704471424342, "grad_norm": 2.2495686189430617, "learning_rate": 1.1155046989194613e-05, "loss": 0.775, "step": 2427 }, { "epoch": 1.437962688777021, "grad_norm": 1.667428421117216, "learning_rate": 1.1148693810054152e-05, "loss": 0.7023, "step": 2428 }, { "epoch": 1.4385549304116079, "grad_norm": 1.3747252199835087, "learning_rate": 1.1142340161033331e-05, "loss": 0.7057, "step": 2429 }, { "epoch": 1.4391471720461948, "grad_norm": 2.463314444705448, "learning_rate": 1.1135986044731144e-05, "loss": 0.6567, "step": 2430 }, { "epoch": 1.4397394136807817, "grad_norm": 1.5810081020557705, "learning_rate": 1.1129631463746789e-05, "loss": 0.6708, "step": 2431 }, { "epoch": 1.4403316553153687, "grad_norm": 1.8170531880544196, "learning_rate": 1.112327642067964e-05, "loss": 0.7159, "step": 2432 }, { "epoch": 1.4409238969499556, "grad_norm": 5.805209899239579, "learning_rate": 1.1116920918129271e-05, "loss": 0.7091, "step": 2433 }, { "epoch": 1.4415161385845425, "grad_norm": 1.9375366699904837, "learning_rate": 1.111056495869544e-05, "loss": 0.7578, "step": 2434 }, { "epoch": 1.4421083802191295, "grad_norm": 2.8102088648986525, "learning_rate": 1.110420854497809e-05, "loss": 0.7223, "step": 2435 }, { "epoch": 1.4427006218537164, "grad_norm": 2.1131020428074456, "learning_rate": 1.1097851679577351e-05, "loss": 0.7278, "step": 2436 }, { "epoch": 1.443292863488303, "grad_norm": 2.3630646661630004, "learning_rate": 1.1091494365093542e-05, "loss": 0.751, "step": 2437 }, { "epoch": 1.4438851051228903, "grad_norm": 3.8773111167009624, "learning_rate": 1.1085136604127161e-05, "loss": 0.7495, "step": 2438 }, { "epoch": 1.444477346757477, "grad_norm": 2.7572277967036456, "learning_rate": 1.1078778399278885e-05, "loss": 0.749, "step": 2439 }, { "epoch": 1.445069588392064, "grad_norm": 1.469912383127116, "learning_rate": 1.1072419753149585e-05, "loss": 0.7198, "step": 2440 }, { "epoch": 1.4456618300266508, "grad_norm": 2.777349956968284, "learning_rate": 1.1066060668340298e-05, "loss": 0.6924, "step": 2441 }, { "epoch": 1.4462540716612378, "grad_norm": 1.4767013976366694, "learning_rate": 1.105970114745225e-05, "loss": 0.7943, "step": 2442 }, { "epoch": 1.4468463132958247, "grad_norm": 3.0907295250096785, "learning_rate": 1.1053341193086844e-05, "loss": 0.6945, "step": 2443 }, { "epoch": 1.4474385549304116, "grad_norm": 1.473657931484691, "learning_rate": 1.104698080784566e-05, "loss": 0.7246, "step": 2444 }, { "epoch": 1.4480307965649986, "grad_norm": 1.5243657541187539, "learning_rate": 1.1040619994330446e-05, "loss": 0.7208, "step": 2445 }, { "epoch": 1.4486230381995855, "grad_norm": 1.4396515407272992, "learning_rate": 1.1034258755143141e-05, "loss": 0.7082, "step": 2446 }, { "epoch": 1.4492152798341724, "grad_norm": 4.67485917339473, "learning_rate": 1.1027897092885846e-05, "loss": 0.7418, "step": 2447 }, { "epoch": 1.4498075214687591, "grad_norm": 1.8934539767167, "learning_rate": 1.1021535010160838e-05, "loss": 0.7048, "step": 2448 }, { "epoch": 1.4503997631033463, "grad_norm": 1.7934976398624312, "learning_rate": 1.1015172509570567e-05, "loss": 0.7438, "step": 2449 }, { "epoch": 1.450992004737933, "grad_norm": 3.1616972894112574, "learning_rate": 1.1008809593717653e-05, "loss": 0.7354, "step": 2450 }, { "epoch": 1.45158424637252, "grad_norm": 2.5005464211548216, "learning_rate": 1.1002446265204887e-05, "loss": 0.7263, "step": 2451 }, { "epoch": 1.4521764880071069, "grad_norm": 3.4868223181586875, "learning_rate": 1.0996082526635227e-05, "loss": 0.7188, "step": 2452 }, { "epoch": 1.4527687296416938, "grad_norm": 1.6735246025817165, "learning_rate": 1.0989718380611805e-05, "loss": 0.6973, "step": 2453 }, { "epoch": 1.4533609712762807, "grad_norm": 2.147911052347242, "learning_rate": 1.0983353829737909e-05, "loss": 0.6972, "step": 2454 }, { "epoch": 1.4539532129108677, "grad_norm": 1.1261985891425348, "learning_rate": 1.0976988876616998e-05, "loss": 0.7311, "step": 2455 }, { "epoch": 1.4545454545454546, "grad_norm": 1.3716117307279665, "learning_rate": 1.0970623523852699e-05, "loss": 0.7525, "step": 2456 }, { "epoch": 1.4551376961800415, "grad_norm": 2.1195272638552165, "learning_rate": 1.0964257774048799e-05, "loss": 0.7471, "step": 2457 }, { "epoch": 1.4557299378146284, "grad_norm": 1.9959343877258, "learning_rate": 1.0957891629809248e-05, "loss": 0.761, "step": 2458 }, { "epoch": 1.4563221794492152, "grad_norm": 2.2291293933510095, "learning_rate": 1.0951525093738151e-05, "loss": 0.7153, "step": 2459 }, { "epoch": 1.4569144210838023, "grad_norm": 1.5622339941082886, "learning_rate": 1.0945158168439786e-05, "loss": 0.7644, "step": 2460 }, { "epoch": 1.457506662718389, "grad_norm": 2.3239762154569554, "learning_rate": 1.0938790856518582e-05, "loss": 0.7644, "step": 2461 }, { "epoch": 1.458098904352976, "grad_norm": 2.650172654861937, "learning_rate": 1.0932423160579126e-05, "loss": 0.7249, "step": 2462 }, { "epoch": 1.4586911459875629, "grad_norm": 1.717625601814134, "learning_rate": 1.0926055083226166e-05, "loss": 0.7325, "step": 2463 }, { "epoch": 1.4592833876221498, "grad_norm": 3.9491611186011384, "learning_rate": 1.0919686627064603e-05, "loss": 0.7214, "step": 2464 }, { "epoch": 1.4598756292567368, "grad_norm": 1.2001329473694484, "learning_rate": 1.091331779469949e-05, "loss": 0.7511, "step": 2465 }, { "epoch": 1.4604678708913237, "grad_norm": 1.742997197381938, "learning_rate": 1.0906948588736044e-05, "loss": 0.7016, "step": 2466 }, { "epoch": 1.4610601125259106, "grad_norm": 1.2448970057704796, "learning_rate": 1.0900579011779622e-05, "loss": 0.7431, "step": 2467 }, { "epoch": 1.4616523541604975, "grad_norm": 1.3860151218112158, "learning_rate": 1.0894209066435746e-05, "loss": 0.715, "step": 2468 }, { "epoch": 1.4622445957950845, "grad_norm": 16.17995815293019, "learning_rate": 1.0887838755310072e-05, "loss": 0.7018, "step": 2469 }, { "epoch": 1.4628368374296712, "grad_norm": 1.7009020003527942, "learning_rate": 1.0881468081008428e-05, "loss": 0.7178, "step": 2470 }, { "epoch": 1.4634290790642583, "grad_norm": 3.5437807058153696, "learning_rate": 1.0875097046136764e-05, "loss": 0.7219, "step": 2471 }, { "epoch": 1.464021320698845, "grad_norm": 1.3944824507804543, "learning_rate": 1.0868725653301206e-05, "loss": 0.7535, "step": 2472 }, { "epoch": 1.464613562333432, "grad_norm": 1.3126484653094563, "learning_rate": 1.0862353905108002e-05, "loss": 0.7229, "step": 2473 }, { "epoch": 1.465205803968019, "grad_norm": 2.458706343995586, "learning_rate": 1.085598180416356e-05, "loss": 0.7186, "step": 2474 }, { "epoch": 1.4657980456026058, "grad_norm": 4.852881691527256, "learning_rate": 1.0849609353074423e-05, "loss": 0.7252, "step": 2475 }, { "epoch": 1.4663902872371928, "grad_norm": 2.036540439548759, "learning_rate": 1.0843236554447288e-05, "loss": 0.7141, "step": 2476 }, { "epoch": 1.4669825288717797, "grad_norm": 2.066706696048189, "learning_rate": 1.0836863410888983e-05, "loss": 0.7087, "step": 2477 }, { "epoch": 1.4675747705063666, "grad_norm": 1.7357600707145933, "learning_rate": 1.0830489925006485e-05, "loss": 0.7375, "step": 2478 }, { "epoch": 1.4681670121409536, "grad_norm": 2.55498272013775, "learning_rate": 1.0824116099406905e-05, "loss": 0.7577, "step": 2479 }, { "epoch": 1.4687592537755405, "grad_norm": 1.3261417160385482, "learning_rate": 1.0817741936697499e-05, "loss": 0.7181, "step": 2480 }, { "epoch": 1.4693514954101272, "grad_norm": 2.4364379575015587, "learning_rate": 1.0811367439485658e-05, "loss": 0.7276, "step": 2481 }, { "epoch": 1.4699437370447144, "grad_norm": 4.800149895264068, "learning_rate": 1.0804992610378907e-05, "loss": 0.7436, "step": 2482 }, { "epoch": 1.470535978679301, "grad_norm": 5.046483999796505, "learning_rate": 1.0798617451984912e-05, "loss": 0.7433, "step": 2483 }, { "epoch": 1.471128220313888, "grad_norm": 1.9692087823077262, "learning_rate": 1.0792241966911472e-05, "loss": 0.6953, "step": 2484 }, { "epoch": 1.471720461948475, "grad_norm": 2.128256332160169, "learning_rate": 1.0785866157766515e-05, "loss": 0.7532, "step": 2485 }, { "epoch": 1.4723127035830619, "grad_norm": 1.6170247391820483, "learning_rate": 1.077949002715811e-05, "loss": 0.747, "step": 2486 }, { "epoch": 1.4729049452176488, "grad_norm": 2.938634854754934, "learning_rate": 1.0773113577694452e-05, "loss": 0.7153, "step": 2487 }, { "epoch": 1.4734971868522357, "grad_norm": 4.000268288506294, "learning_rate": 1.0766736811983864e-05, "loss": 0.7287, "step": 2488 }, { "epoch": 1.4740894284868227, "grad_norm": 3.613730312963113, "learning_rate": 1.0760359732634806e-05, "loss": 0.7625, "step": 2489 }, { "epoch": 1.4746816701214096, "grad_norm": 5.433532028459535, "learning_rate": 1.0753982342255863e-05, "loss": 0.7634, "step": 2490 }, { "epoch": 1.4752739117559965, "grad_norm": 3.32345033735752, "learning_rate": 1.0747604643455735e-05, "loss": 0.7312, "step": 2491 }, { "epoch": 1.4758661533905832, "grad_norm": 1.5311040334516028, "learning_rate": 1.0741226638843276e-05, "loss": 0.7367, "step": 2492 }, { "epoch": 1.4764583950251704, "grad_norm": 2.7319357660771355, "learning_rate": 1.0734848331027437e-05, "loss": 0.7359, "step": 2493 }, { "epoch": 1.477050636659757, "grad_norm": 18.11777663466048, "learning_rate": 1.072846972261731e-05, "loss": 0.7704, "step": 2494 }, { "epoch": 1.477642878294344, "grad_norm": 2.4993251519063895, "learning_rate": 1.07220908162221e-05, "loss": 0.7497, "step": 2495 }, { "epoch": 1.478235119928931, "grad_norm": 1.8417261729994114, "learning_rate": 1.0715711614451146e-05, "loss": 0.7485, "step": 2496 }, { "epoch": 1.478827361563518, "grad_norm": 2.7762420879853558, "learning_rate": 1.0709332119913889e-05, "loss": 0.7371, "step": 2497 }, { "epoch": 1.4794196031981048, "grad_norm": 2.8156204757494394, "learning_rate": 1.0702952335219912e-05, "loss": 0.7295, "step": 2498 }, { "epoch": 1.4800118448326918, "grad_norm": 3.4764320543388645, "learning_rate": 1.0696572262978897e-05, "loss": 0.7117, "step": 2499 }, { "epoch": 1.4806040864672787, "grad_norm": 2.613928689321406, "learning_rate": 1.0690191905800659e-05, "loss": 0.7364, "step": 2500 }, { "epoch": 1.4811963281018656, "grad_norm": 3.310002223294027, "learning_rate": 1.0683811266295122e-05, "loss": 0.7071, "step": 2501 }, { "epoch": 1.4817885697364526, "grad_norm": 1.8031901176722984, "learning_rate": 1.067743034707232e-05, "loss": 0.7363, "step": 2502 }, { "epoch": 1.4823808113710393, "grad_norm": 2.736654320785856, "learning_rate": 1.0671049150742414e-05, "loss": 0.7292, "step": 2503 }, { "epoch": 1.4829730530056264, "grad_norm": 6.165916524526913, "learning_rate": 1.066466767991567e-05, "loss": 0.7259, "step": 2504 }, { "epoch": 1.4835652946402131, "grad_norm": 3.2740591856838286, "learning_rate": 1.065828593720247e-05, "loss": 0.7333, "step": 2505 }, { "epoch": 1.4841575362748, "grad_norm": 8.686085817094266, "learning_rate": 1.0651903925213304e-05, "loss": 0.694, "step": 2506 }, { "epoch": 1.484749777909387, "grad_norm": 3.2639051263293744, "learning_rate": 1.0645521646558774e-05, "loss": 0.7347, "step": 2507 }, { "epoch": 1.485342019543974, "grad_norm": 2.5554270332110334, "learning_rate": 1.0639139103849591e-05, "loss": 0.7331, "step": 2508 }, { "epoch": 1.4859342611785609, "grad_norm": 2.926347424292201, "learning_rate": 1.0632756299696576e-05, "loss": 0.7181, "step": 2509 }, { "epoch": 1.4865265028131478, "grad_norm": 2.3947137814149047, "learning_rate": 1.062637323671065e-05, "loss": 0.7079, "step": 2510 }, { "epoch": 1.4871187444477347, "grad_norm": 2.6108737168898264, "learning_rate": 1.061998991750285e-05, "loss": 0.7863, "step": 2511 }, { "epoch": 1.4877109860823217, "grad_norm": 2.7866821674655755, "learning_rate": 1.0613606344684309e-05, "loss": 0.7435, "step": 2512 }, { "epoch": 1.4883032277169086, "grad_norm": 2.1621024720841935, "learning_rate": 1.0607222520866268e-05, "loss": 0.7383, "step": 2513 }, { "epoch": 1.4888954693514953, "grad_norm": 1.4595498878836146, "learning_rate": 1.060083844866007e-05, "loss": 0.7215, "step": 2514 }, { "epoch": 1.4894877109860825, "grad_norm": 2.4323235608077267, "learning_rate": 1.0594454130677159e-05, "loss": 0.7737, "step": 2515 }, { "epoch": 1.4900799526206692, "grad_norm": 2.0951574345823167, "learning_rate": 1.0588069569529085e-05, "loss": 0.751, "step": 2516 }, { "epoch": 1.490672194255256, "grad_norm": 2.7057235512306264, "learning_rate": 1.0581684767827483e-05, "loss": 0.7164, "step": 2517 }, { "epoch": 1.491264435889843, "grad_norm": 2.3050182970407462, "learning_rate": 1.0575299728184105e-05, "loss": 0.7266, "step": 2518 }, { "epoch": 1.49185667752443, "grad_norm": 2.658571871840395, "learning_rate": 1.0568914453210784e-05, "loss": 0.729, "step": 2519 }, { "epoch": 1.492448919159017, "grad_norm": 2.6694118505347046, "learning_rate": 1.0562528945519463e-05, "loss": 0.7099, "step": 2520 }, { "epoch": 1.4930411607936038, "grad_norm": 2.7419506378224257, "learning_rate": 1.0556143207722167e-05, "loss": 0.7001, "step": 2521 }, { "epoch": 1.4936334024281908, "grad_norm": 1.5146904472068337, "learning_rate": 1.0549757242431032e-05, "loss": 0.7012, "step": 2522 }, { "epoch": 1.4942256440627777, "grad_norm": 2.187917848241182, "learning_rate": 1.0543371052258262e-05, "loss": 0.7006, "step": 2523 }, { "epoch": 1.4948178856973646, "grad_norm": 3.0173641599728307, "learning_rate": 1.0536984639816183e-05, "loss": 0.7009, "step": 2524 }, { "epoch": 1.4954101273319513, "grad_norm": 4.342321069678089, "learning_rate": 1.0530598007717188e-05, "loss": 0.7325, "step": 2525 }, { "epoch": 1.4960023689665383, "grad_norm": 2.168797900177454, "learning_rate": 1.0524211158573772e-05, "loss": 0.7077, "step": 2526 }, { "epoch": 1.4965946106011252, "grad_norm": 2.459366128626486, "learning_rate": 1.0517824094998514e-05, "loss": 0.7149, "step": 2527 }, { "epoch": 1.4971868522357121, "grad_norm": 3.5713298768798794, "learning_rate": 1.0511436819604082e-05, "loss": 0.734, "step": 2528 }, { "epoch": 1.497779093870299, "grad_norm": 1.9883923998857207, "learning_rate": 1.050504933500323e-05, "loss": 0.6835, "step": 2529 }, { "epoch": 1.498371335504886, "grad_norm": 2.939625720286527, "learning_rate": 1.0498661643808801e-05, "loss": 0.6833, "step": 2530 }, { "epoch": 1.498963577139473, "grad_norm": 3.2278416983726586, "learning_rate": 1.0492273748633718e-05, "loss": 0.7615, "step": 2531 }, { "epoch": 1.4995558187740599, "grad_norm": 2.134003837207207, "learning_rate": 1.0485885652090992e-05, "loss": 0.7285, "step": 2532 }, { "epoch": 1.5001480604086468, "grad_norm": 1.4289709457674888, "learning_rate": 1.0479497356793708e-05, "loss": 0.7301, "step": 2533 }, { "epoch": 1.5007403020432335, "grad_norm": 2.2718039122799936, "learning_rate": 1.0473108865355046e-05, "loss": 0.6829, "step": 2534 }, { "epoch": 1.5013325436778207, "grad_norm": 2.1362613649289837, "learning_rate": 1.046672018038825e-05, "loss": 0.7585, "step": 2535 }, { "epoch": 1.5019247853124074, "grad_norm": 4.9287065917580035, "learning_rate": 1.0460331304506658e-05, "loss": 0.6854, "step": 2536 }, { "epoch": 1.5025170269469945, "grad_norm": 2.71738703423648, "learning_rate": 1.0453942240323676e-05, "loss": 0.6972, "step": 2537 }, { "epoch": 1.5031092685815812, "grad_norm": 3.3571433608705967, "learning_rate": 1.044755299045279e-05, "loss": 0.7208, "step": 2538 }, { "epoch": 1.5037015102161682, "grad_norm": 2.2306864648628104, "learning_rate": 1.0441163557507565e-05, "loss": 0.7, "step": 2539 }, { "epoch": 1.504293751850755, "grad_norm": 2.127715558269432, "learning_rate": 1.0434773944101637e-05, "loss": 0.7028, "step": 2540 }, { "epoch": 1.504885993485342, "grad_norm": 3.283305061966627, "learning_rate": 1.0428384152848716e-05, "loss": 0.7194, "step": 2541 }, { "epoch": 1.505478235119929, "grad_norm": 1.7421560789370591, "learning_rate": 1.0421994186362591e-05, "loss": 0.7146, "step": 2542 }, { "epoch": 1.5060704767545159, "grad_norm": 2.1998312820966035, "learning_rate": 1.0415604047257108e-05, "loss": 0.7055, "step": 2543 }, { "epoch": 1.5066627183891028, "grad_norm": 2.8243735302280797, "learning_rate": 1.0409213738146207e-05, "loss": 0.7101, "step": 2544 }, { "epoch": 1.5072549600236895, "grad_norm": 2.064730143131007, "learning_rate": 1.0402823261643869e-05, "loss": 0.7449, "step": 2545 }, { "epoch": 1.5078472016582767, "grad_norm": 2.1534779272441016, "learning_rate": 1.039643262036417e-05, "loss": 0.7249, "step": 2546 }, { "epoch": 1.5084394432928634, "grad_norm": 1.9313484117848871, "learning_rate": 1.039004181692123e-05, "loss": 0.7402, "step": 2547 }, { "epoch": 1.5090316849274505, "grad_norm": 2.075887908145325, "learning_rate": 1.0383650853929261e-05, "loss": 0.7791, "step": 2548 }, { "epoch": 1.5096239265620373, "grad_norm": 3.2492748129540985, "learning_rate": 1.0377259734002514e-05, "loss": 0.7529, "step": 2549 }, { "epoch": 1.5102161681966242, "grad_norm": 1.900962739440968, "learning_rate": 1.0370868459755325e-05, "loss": 0.7034, "step": 2550 }, { "epoch": 1.5108084098312111, "grad_norm": 1.477785833608308, "learning_rate": 1.0364477033802079e-05, "loss": 0.6937, "step": 2551 }, { "epoch": 1.511400651465798, "grad_norm": 3.631747743457973, "learning_rate": 1.0358085458757233e-05, "loss": 0.7212, "step": 2552 }, { "epoch": 1.511992893100385, "grad_norm": 3.5138447414359173, "learning_rate": 1.0351693737235296e-05, "loss": 0.7277, "step": 2553 }, { "epoch": 1.512585134734972, "grad_norm": 1.3857492580523203, "learning_rate": 1.0345301871850843e-05, "loss": 0.7219, "step": 2554 }, { "epoch": 1.5131773763695588, "grad_norm": 2.6281307473566673, "learning_rate": 1.0338909865218509e-05, "loss": 0.7223, "step": 2555 }, { "epoch": 1.5137696180041456, "grad_norm": 11.099970381589284, "learning_rate": 1.0332517719952982e-05, "loss": 0.7396, "step": 2556 }, { "epoch": 1.5143618596387327, "grad_norm": 5.440070964115479, "learning_rate": 1.0326125438669008e-05, "loss": 0.7263, "step": 2557 }, { "epoch": 1.5149541012733194, "grad_norm": 3.966364535381863, "learning_rate": 1.0319733023981392e-05, "loss": 0.7212, "step": 2558 }, { "epoch": 1.5155463429079066, "grad_norm": 9.420534733675245, "learning_rate": 1.031334047850499e-05, "loss": 0.7458, "step": 2559 }, { "epoch": 1.5161385845424933, "grad_norm": 2.6027648620090997, "learning_rate": 1.030694780485471e-05, "loss": 0.7141, "step": 2560 }, { "epoch": 1.5167308261770802, "grad_norm": 1.4387286301673126, "learning_rate": 1.030055500564552e-05, "loss": 0.7644, "step": 2561 }, { "epoch": 1.5173230678116671, "grad_norm": 1.956401956469561, "learning_rate": 1.0294162083492429e-05, "loss": 0.7265, "step": 2562 }, { "epoch": 1.517915309446254, "grad_norm": 1.9974664355897158, "learning_rate": 1.0287769041010506e-05, "loss": 0.752, "step": 2563 }, { "epoch": 1.518507551080841, "grad_norm": 2.317095214652806, "learning_rate": 1.0281375880814864e-05, "loss": 0.7235, "step": 2564 }, { "epoch": 1.519099792715428, "grad_norm": 2.301255206281844, "learning_rate": 1.0274982605520662e-05, "loss": 0.7452, "step": 2565 }, { "epoch": 1.5196920343500149, "grad_norm": 3.1153435591558387, "learning_rate": 1.0268589217743114e-05, "loss": 0.7009, "step": 2566 }, { "epoch": 1.5202842759846016, "grad_norm": 8.104588610609271, "learning_rate": 1.0262195720097472e-05, "loss": 0.7581, "step": 2567 }, { "epoch": 1.5208765176191887, "grad_norm": 1.629901500067734, "learning_rate": 1.0255802115199034e-05, "loss": 0.746, "step": 2568 }, { "epoch": 1.5214687592537754, "grad_norm": 2.5849788803020193, "learning_rate": 1.0249408405663148e-05, "loss": 0.7137, "step": 2569 }, { "epoch": 1.5220610008883626, "grad_norm": 1.7706404764967858, "learning_rate": 1.0243014594105201e-05, "loss": 0.6867, "step": 2570 }, { "epoch": 1.5226532425229493, "grad_norm": 3.0042524859021564, "learning_rate": 1.0236620683140616e-05, "loss": 0.6803, "step": 2571 }, { "epoch": 1.5232454841575362, "grad_norm": 1.7908570112056044, "learning_rate": 1.0230226675384868e-05, "loss": 0.7065, "step": 2572 }, { "epoch": 1.5238377257921232, "grad_norm": 2.8915396186908393, "learning_rate": 1.0223832573453463e-05, "loss": 0.7225, "step": 2573 }, { "epoch": 1.52442996742671, "grad_norm": 10.14931172158288, "learning_rate": 1.021743837996195e-05, "loss": 0.7015, "step": 2574 }, { "epoch": 1.525022209061297, "grad_norm": 2.3225645983111307, "learning_rate": 1.0211044097525908e-05, "loss": 0.7365, "step": 2575 }, { "epoch": 1.525614450695884, "grad_norm": 1.8117439188164521, "learning_rate": 1.0204649728760969e-05, "loss": 0.7822, "step": 2576 }, { "epoch": 1.526206692330471, "grad_norm": 2.4639010150094647, "learning_rate": 1.0198255276282778e-05, "loss": 0.7265, "step": 2577 }, { "epoch": 1.5267989339650576, "grad_norm": 1.866652089335267, "learning_rate": 1.0191860742707034e-05, "loss": 0.7577, "step": 2578 }, { "epoch": 1.5273911755996448, "grad_norm": 2.434779909956098, "learning_rate": 1.0185466130649455e-05, "loss": 0.7516, "step": 2579 }, { "epoch": 1.5279834172342315, "grad_norm": 2.0991586945316416, "learning_rate": 1.0179071442725801e-05, "loss": 0.7407, "step": 2580 }, { "epoch": 1.5285756588688186, "grad_norm": 2.5982167009442847, "learning_rate": 1.0172676681551857e-05, "loss": 0.7136, "step": 2581 }, { "epoch": 1.5291679005034053, "grad_norm": 4.3978929309621835, "learning_rate": 1.0166281849743438e-05, "loss": 0.7124, "step": 2582 }, { "epoch": 1.5297601421379923, "grad_norm": 1.4422002498656223, "learning_rate": 1.0159886949916394e-05, "loss": 0.7052, "step": 2583 }, { "epoch": 1.5303523837725792, "grad_norm": 2.917865956798247, "learning_rate": 1.0153491984686595e-05, "loss": 0.7503, "step": 2584 }, { "epoch": 1.5309446254071661, "grad_norm": 1.5687207674041863, "learning_rate": 1.0147096956669945e-05, "loss": 0.7344, "step": 2585 }, { "epoch": 1.531536867041753, "grad_norm": 2.2546664576414828, "learning_rate": 1.0140701868482365e-05, "loss": 0.7531, "step": 2586 }, { "epoch": 1.53212910867634, "grad_norm": 1.4308336411744214, "learning_rate": 1.013430672273981e-05, "loss": 0.7343, "step": 2587 }, { "epoch": 1.532721350310927, "grad_norm": 2.396679670787782, "learning_rate": 1.0127911522058256e-05, "loss": 0.7603, "step": 2588 }, { "epoch": 1.5333135919455136, "grad_norm": 2.149803137842192, "learning_rate": 1.0121516269053693e-05, "loss": 0.715, "step": 2589 }, { "epoch": 1.5339058335801008, "grad_norm": 1.5562821807570975, "learning_rate": 1.0115120966342145e-05, "loss": 0.7023, "step": 2590 }, { "epoch": 1.5344980752146875, "grad_norm": 3.920771426341168, "learning_rate": 1.0108725616539648e-05, "loss": 0.7465, "step": 2591 }, { "epoch": 1.5350903168492747, "grad_norm": 2.6406023004555945, "learning_rate": 1.0102330222262257e-05, "loss": 0.6824, "step": 2592 }, { "epoch": 1.5356825584838614, "grad_norm": 1.999159078713688, "learning_rate": 1.0095934786126055e-05, "loss": 0.7184, "step": 2593 }, { "epoch": 1.5362748001184483, "grad_norm": 2.6618576577531012, "learning_rate": 1.0089539310747127e-05, "loss": 0.6993, "step": 2594 }, { "epoch": 1.5368670417530352, "grad_norm": 2.088264527488616, "learning_rate": 1.0083143798741587e-05, "loss": 0.7093, "step": 2595 }, { "epoch": 1.5374592833876222, "grad_norm": 1.2592517848012976, "learning_rate": 1.007674825272556e-05, "loss": 0.7187, "step": 2596 }, { "epoch": 1.538051525022209, "grad_norm": 2.4043437880802703, "learning_rate": 1.007035267531518e-05, "loss": 0.7162, "step": 2597 }, { "epoch": 1.5386437666567958, "grad_norm": 1.3595141476229315, "learning_rate": 1.0063957069126602e-05, "loss": 0.7122, "step": 2598 }, { "epoch": 1.539236008291383, "grad_norm": 1.5459899337725218, "learning_rate": 1.0057561436775982e-05, "loss": 0.7631, "step": 2599 }, { "epoch": 1.5398282499259697, "grad_norm": 1.6231837605639283, "learning_rate": 1.0051165780879503e-05, "loss": 0.7219, "step": 2600 }, { "epoch": 1.5404204915605568, "grad_norm": 2.0281678385057265, "learning_rate": 1.0044770104053336e-05, "loss": 0.7281, "step": 2601 }, { "epoch": 1.5410127331951435, "grad_norm": 1.761034640996413, "learning_rate": 1.0038374408913684e-05, "loss": 0.7147, "step": 2602 }, { "epoch": 1.5416049748297307, "grad_norm": 2.4171957792447216, "learning_rate": 1.0031978698076738e-05, "loss": 0.7625, "step": 2603 }, { "epoch": 1.5421972164643174, "grad_norm": 2.4766490230586955, "learning_rate": 1.002558297415871e-05, "loss": 0.7588, "step": 2604 }, { "epoch": 1.5427894580989043, "grad_norm": 1.6895743443704907, "learning_rate": 1.00191872397758e-05, "loss": 0.7303, "step": 2605 }, { "epoch": 1.5433816997334913, "grad_norm": 3.162304535278318, "learning_rate": 1.0012791497544238e-05, "loss": 0.7232, "step": 2606 }, { "epoch": 1.5439739413680782, "grad_norm": 1.6202855977215762, "learning_rate": 1.000639575008023e-05, "loss": 0.7778, "step": 2607 }, { "epoch": 1.5445661830026651, "grad_norm": 2.9346216243576317, "learning_rate": 1e-05, "loss": 0.7523, "step": 2608 }, { "epoch": 1.5451584246372518, "grad_norm": 2.5997819364592805, "learning_rate": 9.993604249919773e-06, "loss": 0.7149, "step": 2609 }, { "epoch": 1.545750666271839, "grad_norm": 2.2690927538275956, "learning_rate": 9.987208502455767e-06, "loss": 0.7492, "step": 2610 }, { "epoch": 1.5463429079064257, "grad_norm": 2.0044261798649456, "learning_rate": 9.980812760224202e-06, "loss": 0.7161, "step": 2611 }, { "epoch": 1.5469351495410129, "grad_norm": 1.7008339193116702, "learning_rate": 9.974417025841293e-06, "loss": 0.7075, "step": 2612 }, { "epoch": 1.5475273911755996, "grad_norm": 2.802313549002366, "learning_rate": 9.968021301923264e-06, "loss": 0.7673, "step": 2613 }, { "epoch": 1.5481196328101867, "grad_norm": 5.540492098493376, "learning_rate": 9.961625591086321e-06, "loss": 0.6843, "step": 2614 }, { "epoch": 1.5487118744447734, "grad_norm": 1.7826155088059201, "learning_rate": 9.955229895946666e-06, "loss": 0.6854, "step": 2615 }, { "epoch": 1.5493041160793604, "grad_norm": 3.214394133267478, "learning_rate": 9.9488342191205e-06, "loss": 0.7126, "step": 2616 }, { "epoch": 1.5498963577139473, "grad_norm": 3.977691609819524, "learning_rate": 9.942438563224018e-06, "loss": 0.7541, "step": 2617 }, { "epoch": 1.5504885993485342, "grad_norm": 2.7729164430172393, "learning_rate": 9.936042930873403e-06, "loss": 0.7343, "step": 2618 }, { "epoch": 1.5510808409831212, "grad_norm": 3.2382422721234945, "learning_rate": 9.929647324684823e-06, "loss": 0.6899, "step": 2619 }, { "epoch": 1.5516730826177079, "grad_norm": 1.712687804564103, "learning_rate": 9.923251747274441e-06, "loss": 0.7011, "step": 2620 }, { "epoch": 1.552265324252295, "grad_norm": 2.1866276642869127, "learning_rate": 9.916856201258413e-06, "loss": 0.7067, "step": 2621 }, { "epoch": 1.5528575658868817, "grad_norm": 1.4026718170049086, "learning_rate": 9.910460689252876e-06, "loss": 0.7055, "step": 2622 }, { "epoch": 1.5534498075214689, "grad_norm": 1.3836603781583152, "learning_rate": 9.904065213873949e-06, "loss": 0.7652, "step": 2623 }, { "epoch": 1.5540420491560556, "grad_norm": 2.193906980580802, "learning_rate": 9.897669777737745e-06, "loss": 0.7304, "step": 2624 }, { "epoch": 1.5546342907906427, "grad_norm": 7.203396030176273, "learning_rate": 9.891274383460354e-06, "loss": 0.7812, "step": 2625 }, { "epoch": 1.5552265324252295, "grad_norm": 1.8434299981876887, "learning_rate": 9.884879033657859e-06, "loss": 0.748, "step": 2626 }, { "epoch": 1.5558187740598164, "grad_norm": 1.821123721515522, "learning_rate": 9.878483730946308e-06, "loss": 0.6896, "step": 2627 }, { "epoch": 1.5564110156944033, "grad_norm": 1.7271066903414265, "learning_rate": 9.872088477941748e-06, "loss": 0.7252, "step": 2628 }, { "epoch": 1.5570032573289903, "grad_norm": 1.925482681465068, "learning_rate": 9.86569327726019e-06, "loss": 0.7099, "step": 2629 }, { "epoch": 1.5575954989635772, "grad_norm": 1.6134202565322096, "learning_rate": 9.859298131517639e-06, "loss": 0.7507, "step": 2630 }, { "epoch": 1.558187740598164, "grad_norm": 3.513343627530898, "learning_rate": 9.852903043330059e-06, "loss": 0.7638, "step": 2631 }, { "epoch": 1.558779982232751, "grad_norm": 1.442402404811214, "learning_rate": 9.846508015313407e-06, "loss": 0.7419, "step": 2632 }, { "epoch": 1.5593722238673378, "grad_norm": 2.2944058513698544, "learning_rate": 9.84011305008361e-06, "loss": 0.7012, "step": 2633 }, { "epoch": 1.559964465501925, "grad_norm": 4.277848662731923, "learning_rate": 9.833718150256567e-06, "loss": 0.7286, "step": 2634 }, { "epoch": 1.5605567071365116, "grad_norm": 2.2320066876585987, "learning_rate": 9.827323318448148e-06, "loss": 0.7014, "step": 2635 }, { "epoch": 1.5611489487710986, "grad_norm": 2.592881982866087, "learning_rate": 9.820928557274202e-06, "loss": 0.7711, "step": 2636 }, { "epoch": 1.5617411904056855, "grad_norm": 2.3317654184305923, "learning_rate": 9.814533869350547e-06, "loss": 0.6949, "step": 2637 }, { "epoch": 1.5623334320402724, "grad_norm": 3.3958407254332323, "learning_rate": 9.808139257292971e-06, "loss": 0.7517, "step": 2638 }, { "epoch": 1.5629256736748594, "grad_norm": 1.8868331278047943, "learning_rate": 9.801744723717225e-06, "loss": 0.7492, "step": 2639 }, { "epoch": 1.5635179153094463, "grad_norm": 1.899167075781751, "learning_rate": 9.795350271239034e-06, "loss": 0.7545, "step": 2640 }, { "epoch": 1.5641101569440332, "grad_norm": 1.6299887369278279, "learning_rate": 9.78895590247409e-06, "loss": 0.7296, "step": 2641 }, { "epoch": 1.56470239857862, "grad_norm": 86.37994019353313, "learning_rate": 9.782561620038055e-06, "loss": 0.7508, "step": 2642 }, { "epoch": 1.565294640213207, "grad_norm": 2.0626173331144932, "learning_rate": 9.77616742654654e-06, "loss": 0.7579, "step": 2643 }, { "epoch": 1.5658868818477938, "grad_norm": 3.3316806059009325, "learning_rate": 9.769773324615133e-06, "loss": 0.7028, "step": 2644 }, { "epoch": 1.566479123482381, "grad_norm": 1.7459776374665734, "learning_rate": 9.763379316859386e-06, "loss": 0.711, "step": 2645 }, { "epoch": 1.5670713651169677, "grad_norm": 4.838875211558403, "learning_rate": 9.756985405894802e-06, "loss": 0.7106, "step": 2646 }, { "epoch": 1.5676636067515546, "grad_norm": 1.3863030191233248, "learning_rate": 9.750591594336854e-06, "loss": 0.7431, "step": 2647 }, { "epoch": 1.5682558483861415, "grad_norm": 1.734491733463122, "learning_rate": 9.744197884800968e-06, "loss": 0.7221, "step": 2648 }, { "epoch": 1.5688480900207284, "grad_norm": 1.4554333705299076, "learning_rate": 9.73780427990253e-06, "loss": 0.7297, "step": 2649 }, { "epoch": 1.5694403316553154, "grad_norm": 3.2911736777923952, "learning_rate": 9.731410782256889e-06, "loss": 0.7361, "step": 2650 }, { "epoch": 1.5700325732899023, "grad_norm": 1.3472702183611696, "learning_rate": 9.72501739447934e-06, "loss": 0.7268, "step": 2651 }, { "epoch": 1.5706248149244892, "grad_norm": 1.7709537123753905, "learning_rate": 9.718624119185138e-06, "loss": 0.7226, "step": 2652 }, { "epoch": 1.571217056559076, "grad_norm": 2.0914175608541434, "learning_rate": 9.712230958989494e-06, "loss": 0.7413, "step": 2653 }, { "epoch": 1.571809298193663, "grad_norm": 1.1748662544040405, "learning_rate": 9.705837916507575e-06, "loss": 0.7256, "step": 2654 }, { "epoch": 1.5724015398282498, "grad_norm": 1.9165526544435088, "learning_rate": 9.699444994354483e-06, "loss": 0.7225, "step": 2655 }, { "epoch": 1.572993781462837, "grad_norm": 1.4887726373278507, "learning_rate": 9.693052195145292e-06, "loss": 0.7339, "step": 2656 }, { "epoch": 1.5735860230974237, "grad_norm": 2.8971799022670233, "learning_rate": 9.68665952149501e-06, "loss": 0.7242, "step": 2657 }, { "epoch": 1.5741782647320106, "grad_norm": 2.3325271582358895, "learning_rate": 9.680266976018613e-06, "loss": 0.7538, "step": 2658 }, { "epoch": 1.5747705063665975, "grad_norm": 1.4580873725563652, "learning_rate": 9.673874561330994e-06, "loss": 0.7313, "step": 2659 }, { "epoch": 1.5753627480011845, "grad_norm": 1.498906817025249, "learning_rate": 9.66748228004702e-06, "loss": 0.7552, "step": 2660 }, { "epoch": 1.5759549896357714, "grad_norm": 1.0553079729268877, "learning_rate": 9.661090134781493e-06, "loss": 0.7426, "step": 2661 }, { "epoch": 1.5765472312703583, "grad_norm": 1.3421402504246218, "learning_rate": 9.654698128149162e-06, "loss": 0.7538, "step": 2662 }, { "epoch": 1.5771394729049453, "grad_norm": 2.3722370387111322, "learning_rate": 9.648306262764708e-06, "loss": 0.7284, "step": 2663 }, { "epoch": 1.577731714539532, "grad_norm": 1.80136298232067, "learning_rate": 9.64191454124277e-06, "loss": 0.7216, "step": 2664 }, { "epoch": 1.5783239561741191, "grad_norm": 2.4789404827141723, "learning_rate": 9.635522966197923e-06, "loss": 0.6985, "step": 2665 }, { "epoch": 1.5789161978087058, "grad_norm": 1.6133875143981933, "learning_rate": 9.62913154024468e-06, "loss": 0.7065, "step": 2666 }, { "epoch": 1.579508439443293, "grad_norm": 1.878564085742203, "learning_rate": 9.622740265997488e-06, "loss": 0.7201, "step": 2667 }, { "epoch": 1.5801006810778797, "grad_norm": 1.561377075762865, "learning_rate": 9.61634914607074e-06, "loss": 0.7158, "step": 2668 }, { "epoch": 1.5806929227124666, "grad_norm": 1.7864895574938302, "learning_rate": 9.60995818307877e-06, "loss": 0.729, "step": 2669 }, { "epoch": 1.5812851643470536, "grad_norm": 1.661603731541404, "learning_rate": 9.603567379635836e-06, "loss": 0.7625, "step": 2670 }, { "epoch": 1.5818774059816405, "grad_norm": 1.299944010246059, "learning_rate": 9.597176738356134e-06, "loss": 0.7081, "step": 2671 }, { "epoch": 1.5824696476162274, "grad_norm": 2.1529097270049706, "learning_rate": 9.590786261853798e-06, "loss": 0.6694, "step": 2672 }, { "epoch": 1.5830618892508144, "grad_norm": 1.4703201941101165, "learning_rate": 9.584395952742892e-06, "loss": 0.696, "step": 2673 }, { "epoch": 1.5836541308854013, "grad_norm": 1.9214984260880197, "learning_rate": 9.578005813637414e-06, "loss": 0.7194, "step": 2674 }, { "epoch": 1.584246372519988, "grad_norm": 1.3154793348516993, "learning_rate": 9.571615847151287e-06, "loss": 0.7264, "step": 2675 }, { "epoch": 1.5848386141545752, "grad_norm": 2.0731383642895658, "learning_rate": 9.565226055898366e-06, "loss": 0.7398, "step": 2676 }, { "epoch": 1.5854308557891619, "grad_norm": 1.269718722070663, "learning_rate": 9.558836442492437e-06, "loss": 0.7618, "step": 2677 }, { "epoch": 1.586023097423749, "grad_norm": 2.326022738330298, "learning_rate": 9.552447009547214e-06, "loss": 0.7153, "step": 2678 }, { "epoch": 1.5866153390583357, "grad_norm": 2.108051114609614, "learning_rate": 9.546057759676328e-06, "loss": 0.6837, "step": 2679 }, { "epoch": 1.5872075806929227, "grad_norm": 1.4247839172166672, "learning_rate": 9.539668695493344e-06, "loss": 0.7276, "step": 2680 }, { "epoch": 1.5877998223275096, "grad_norm": 1.0542131540553648, "learning_rate": 9.53327981961175e-06, "loss": 0.7446, "step": 2681 }, { "epoch": 1.5883920639620965, "grad_norm": 1.8059660461442077, "learning_rate": 9.52689113464496e-06, "loss": 0.731, "step": 2682 }, { "epoch": 1.5889843055966835, "grad_norm": 1.568683168663255, "learning_rate": 9.520502643206293e-06, "loss": 0.712, "step": 2683 }, { "epoch": 1.5895765472312704, "grad_norm": 3.4501788436091254, "learning_rate": 9.514114347909011e-06, "loss": 0.7132, "step": 2684 }, { "epoch": 1.5901687888658573, "grad_norm": 2.0059923334239707, "learning_rate": 9.507726251366283e-06, "loss": 0.7292, "step": 2685 }, { "epoch": 1.590761030500444, "grad_norm": 3.8951776491133745, "learning_rate": 9.501338356191204e-06, "loss": 0.7396, "step": 2686 }, { "epoch": 1.5913532721350312, "grad_norm": 2.4788086356042216, "learning_rate": 9.494950664996771e-06, "loss": 0.7391, "step": 2687 }, { "epoch": 1.591945513769618, "grad_norm": 2.9671138222012954, "learning_rate": 9.488563180395922e-06, "loss": 0.7501, "step": 2688 }, { "epoch": 1.592537755404205, "grad_norm": 1.7325289372388857, "learning_rate": 9.482175905001489e-06, "loss": 0.7592, "step": 2689 }, { "epoch": 1.5931299970387918, "grad_norm": 2.5802186860418552, "learning_rate": 9.475788841426232e-06, "loss": 0.6955, "step": 2690 }, { "epoch": 1.5937222386733787, "grad_norm": 2.804837641485897, "learning_rate": 9.469401992282817e-06, "loss": 0.7185, "step": 2691 }, { "epoch": 1.5943144803079656, "grad_norm": 2.4503401391313893, "learning_rate": 9.463015360183819e-06, "loss": 0.7131, "step": 2692 }, { "epoch": 1.5949067219425526, "grad_norm": 2.1268807947669606, "learning_rate": 9.456628947741738e-06, "loss": 0.7144, "step": 2693 }, { "epoch": 1.5954989635771395, "grad_norm": 2.110094339237852, "learning_rate": 9.450242757568975e-06, "loss": 0.7282, "step": 2694 }, { "epoch": 1.5960912052117264, "grad_norm": 1.7700346930524404, "learning_rate": 9.443856792277836e-06, "loss": 0.7554, "step": 2695 }, { "epoch": 1.5966834468463134, "grad_norm": 1.0280242622620175, "learning_rate": 9.43747105448054e-06, "loss": 0.7444, "step": 2696 }, { "epoch": 1.5972756884809, "grad_norm": 1.3872664321744306, "learning_rate": 9.431085546789218e-06, "loss": 0.7354, "step": 2697 }, { "epoch": 1.5978679301154872, "grad_norm": 1.2479775404016484, "learning_rate": 9.424700271815901e-06, "loss": 0.7271, "step": 2698 }, { "epoch": 1.598460171750074, "grad_norm": 1.6017123862586502, "learning_rate": 9.41831523217252e-06, "loss": 0.7117, "step": 2699 }, { "epoch": 1.599052413384661, "grad_norm": 3.210802359158483, "learning_rate": 9.41193043047092e-06, "loss": 0.7564, "step": 2700 }, { "epoch": 1.5996446550192478, "grad_norm": 3.6114823786582515, "learning_rate": 9.405545869322843e-06, "loss": 0.7498, "step": 2701 }, { "epoch": 1.6002368966538347, "grad_norm": 1.6710996935533942, "learning_rate": 9.399161551339933e-06, "loss": 0.7275, "step": 2702 }, { "epoch": 1.6008291382884217, "grad_norm": 1.6984346113201354, "learning_rate": 9.392777479133736e-06, "loss": 0.7232, "step": 2703 }, { "epoch": 1.6014213799230086, "grad_norm": 1.9419725358546698, "learning_rate": 9.386393655315696e-06, "loss": 0.7397, "step": 2704 }, { "epoch": 1.6020136215575955, "grad_norm": 3.9234250963518043, "learning_rate": 9.380010082497152e-06, "loss": 0.7185, "step": 2705 }, { "epoch": 1.6026058631921825, "grad_norm": 1.6194229418389212, "learning_rate": 9.373626763289352e-06, "loss": 0.7287, "step": 2706 }, { "epoch": 1.6031981048267694, "grad_norm": 1.3815447699004129, "learning_rate": 9.367243700303427e-06, "loss": 0.7354, "step": 2707 }, { "epoch": 1.603790346461356, "grad_norm": 2.310394866629508, "learning_rate": 9.36086089615041e-06, "loss": 0.7429, "step": 2708 }, { "epoch": 1.6043825880959433, "grad_norm": 24.261527338301693, "learning_rate": 9.354478353441226e-06, "loss": 0.7592, "step": 2709 }, { "epoch": 1.60497482973053, "grad_norm": 2.170178314510858, "learning_rate": 9.3480960747867e-06, "loss": 0.7337, "step": 2710 }, { "epoch": 1.6055670713651171, "grad_norm": 3.053942328323577, "learning_rate": 9.341714062797533e-06, "loss": 0.7174, "step": 2711 }, { "epoch": 1.6061593129997038, "grad_norm": 1.3735266945797255, "learning_rate": 9.335332320084331e-06, "loss": 0.7538, "step": 2712 }, { "epoch": 1.6067515546342908, "grad_norm": 1.8525060260331747, "learning_rate": 9.32895084925759e-06, "loss": 0.7206, "step": 2713 }, { "epoch": 1.6073437962688777, "grad_norm": 1.6312829687512218, "learning_rate": 9.322569652927685e-06, "loss": 0.7047, "step": 2714 }, { "epoch": 1.6079360379034646, "grad_norm": 1.5960390029649218, "learning_rate": 9.316188733704883e-06, "loss": 0.7373, "step": 2715 }, { "epoch": 1.6085282795380516, "grad_norm": 1.6939174639156191, "learning_rate": 9.309808094199343e-06, "loss": 0.7343, "step": 2716 }, { "epoch": 1.6091205211726385, "grad_norm": 5.647495987042836, "learning_rate": 9.303427737021105e-06, "loss": 0.7441, "step": 2717 }, { "epoch": 1.6097127628072254, "grad_norm": 1.0834842055792864, "learning_rate": 9.297047664780093e-06, "loss": 0.7104, "step": 2718 }, { "epoch": 1.6103050044418121, "grad_norm": 1.3533184427520821, "learning_rate": 9.290667880086115e-06, "loss": 0.7428, "step": 2719 }, { "epoch": 1.6108972460763993, "grad_norm": 1.1679095552719452, "learning_rate": 9.284288385548858e-06, "loss": 0.7262, "step": 2720 }, { "epoch": 1.611489487710986, "grad_norm": 8.140610596435375, "learning_rate": 9.2779091837779e-06, "loss": 0.7327, "step": 2721 }, { "epoch": 1.6120817293455731, "grad_norm": 1.2293837136922272, "learning_rate": 9.271530277382695e-06, "loss": 0.7129, "step": 2722 }, { "epoch": 1.6126739709801599, "grad_norm": 1.1470847973636114, "learning_rate": 9.265151668972566e-06, "loss": 0.6852, "step": 2723 }, { "epoch": 1.6132662126147468, "grad_norm": 2.4405096525740038, "learning_rate": 9.258773361156725e-06, "loss": 0.725, "step": 2724 }, { "epoch": 1.6138584542493337, "grad_norm": 1.336420955664339, "learning_rate": 9.252395356544263e-06, "loss": 0.6653, "step": 2725 }, { "epoch": 1.6144506958839207, "grad_norm": 3.435337274510319, "learning_rate": 9.246017657744142e-06, "loss": 0.7157, "step": 2726 }, { "epoch": 1.6150429375185076, "grad_norm": 1.7205900312208158, "learning_rate": 9.239640267365197e-06, "loss": 0.6767, "step": 2727 }, { "epoch": 1.6156351791530945, "grad_norm": 0.9404428582680933, "learning_rate": 9.233263188016138e-06, "loss": 0.7388, "step": 2728 }, { "epoch": 1.6162274207876814, "grad_norm": 1.643384773023064, "learning_rate": 9.22688642230555e-06, "loss": 0.7729, "step": 2729 }, { "epoch": 1.6168196624222682, "grad_norm": 0.9226766736351931, "learning_rate": 9.220509972841893e-06, "loss": 0.7097, "step": 2730 }, { "epoch": 1.6174119040568553, "grad_norm": 1.292558562901911, "learning_rate": 9.214133842233486e-06, "loss": 0.7638, "step": 2731 }, { "epoch": 1.618004145691442, "grad_norm": 1.268236145418197, "learning_rate": 9.207758033088533e-06, "loss": 0.7443, "step": 2732 }, { "epoch": 1.6185963873260292, "grad_norm": 1.152636990406385, "learning_rate": 9.20138254801509e-06, "loss": 0.7477, "step": 2733 }, { "epoch": 1.6191886289606159, "grad_norm": 2.325992339134092, "learning_rate": 9.195007389621098e-06, "loss": 0.7597, "step": 2734 }, { "epoch": 1.6197808705952028, "grad_norm": 1.3096504733315142, "learning_rate": 9.188632560514345e-06, "loss": 0.7028, "step": 2735 }, { "epoch": 1.6203731122297897, "grad_norm": 1.0546949833315633, "learning_rate": 9.182258063302504e-06, "loss": 0.7131, "step": 2736 }, { "epoch": 1.6209653538643767, "grad_norm": 0.9210797626479261, "learning_rate": 9.175883900593095e-06, "loss": 0.7453, "step": 2737 }, { "epoch": 1.6215575954989636, "grad_norm": 1.5075058070737895, "learning_rate": 9.16951007499352e-06, "loss": 0.7248, "step": 2738 }, { "epoch": 1.6221498371335505, "grad_norm": 1.169767212334899, "learning_rate": 9.163136589111019e-06, "loss": 0.732, "step": 2739 }, { "epoch": 1.6227420787681375, "grad_norm": 1.615392214615148, "learning_rate": 9.156763445552714e-06, "loss": 0.7017, "step": 2740 }, { "epoch": 1.6233343204027242, "grad_norm": 1.0292405348020153, "learning_rate": 9.150390646925578e-06, "loss": 0.69, "step": 2741 }, { "epoch": 1.6239265620373113, "grad_norm": 1.8762128663128925, "learning_rate": 9.144018195836445e-06, "loss": 0.7546, "step": 2742 }, { "epoch": 1.624518803671898, "grad_norm": 0.8711466459720059, "learning_rate": 9.137646094892e-06, "loss": 0.7039, "step": 2743 }, { "epoch": 1.6251110453064852, "grad_norm": 0.9463455729109375, "learning_rate": 9.131274346698797e-06, "loss": 0.7569, "step": 2744 }, { "epoch": 1.625703286941072, "grad_norm": 0.9011458492231906, "learning_rate": 9.124902953863235e-06, "loss": 0.7567, "step": 2745 }, { "epoch": 1.6262955285756588, "grad_norm": 2.289091980573055, "learning_rate": 9.118531918991578e-06, "loss": 0.749, "step": 2746 }, { "epoch": 1.6268877702102458, "grad_norm": 1.2146535737974766, "learning_rate": 9.112161244689931e-06, "loss": 0.7254, "step": 2747 }, { "epoch": 1.6274800118448327, "grad_norm": 2.4627393306018646, "learning_rate": 9.105790933564259e-06, "loss": 0.7543, "step": 2748 }, { "epoch": 1.6280722534794196, "grad_norm": 1.002159085261613, "learning_rate": 9.09942098822038e-06, "loss": 0.7394, "step": 2749 }, { "epoch": 1.6286644951140063, "grad_norm": 1.639564743885887, "learning_rate": 9.09305141126396e-06, "loss": 0.7178, "step": 2750 }, { "epoch": 1.6292567367485935, "grad_norm": 0.9813191864395472, "learning_rate": 9.086682205300512e-06, "loss": 0.7354, "step": 2751 }, { "epoch": 1.6298489783831802, "grad_norm": 2.1747632589697155, "learning_rate": 9.080313372935399e-06, "loss": 0.7072, "step": 2752 }, { "epoch": 1.6304412200177674, "grad_norm": 0.8902643816753388, "learning_rate": 9.073944916773835e-06, "loss": 0.7272, "step": 2753 }, { "epoch": 1.631033461652354, "grad_norm": 0.883321085183472, "learning_rate": 9.067576839420876e-06, "loss": 0.7433, "step": 2754 }, { "epoch": 1.6316257032869412, "grad_norm": 1.191509270778969, "learning_rate": 9.06120914348142e-06, "loss": 0.7682, "step": 2755 }, { "epoch": 1.632217944921528, "grad_norm": 0.8931773470587896, "learning_rate": 9.054841831560216e-06, "loss": 0.7217, "step": 2756 }, { "epoch": 1.6328101865561149, "grad_norm": 0.9206149509553022, "learning_rate": 9.04847490626185e-06, "loss": 0.7244, "step": 2757 }, { "epoch": 1.6334024281907018, "grad_norm": 0.9602886650943193, "learning_rate": 9.042108370190757e-06, "loss": 0.7327, "step": 2758 }, { "epoch": 1.6339946698252887, "grad_norm": 1.0731812685624111, "learning_rate": 9.035742225951203e-06, "loss": 0.7128, "step": 2759 }, { "epoch": 1.6345869114598757, "grad_norm": 1.0001996418004009, "learning_rate": 9.029376476147303e-06, "loss": 0.7283, "step": 2760 }, { "epoch": 1.6351791530944624, "grad_norm": 1.0101272999135227, "learning_rate": 9.023011123383002e-06, "loss": 0.7698, "step": 2761 }, { "epoch": 1.6357713947290495, "grad_norm": 7.445183865475451, "learning_rate": 9.016646170262096e-06, "loss": 0.7248, "step": 2762 }, { "epoch": 1.6363636363636362, "grad_norm": 1.0354333262038447, "learning_rate": 9.010281619388198e-06, "loss": 0.7165, "step": 2763 }, { "epoch": 1.6369558779982234, "grad_norm": 1.0303790501337338, "learning_rate": 9.003917473364774e-06, "loss": 0.773, "step": 2764 }, { "epoch": 1.63754811963281, "grad_norm": 1.0048176835409444, "learning_rate": 8.997553734795115e-06, "loss": 0.7446, "step": 2765 }, { "epoch": 1.6381403612673973, "grad_norm": 0.8644074880461828, "learning_rate": 8.991190406282352e-06, "loss": 0.7565, "step": 2766 }, { "epoch": 1.638732602901984, "grad_norm": 1.484278848736965, "learning_rate": 8.984827490429437e-06, "loss": 0.7242, "step": 2767 }, { "epoch": 1.639324844536571, "grad_norm": 1.2307819145316636, "learning_rate": 8.978464989839165e-06, "loss": 0.7396, "step": 2768 }, { "epoch": 1.6399170861711578, "grad_norm": 1.2176803902884394, "learning_rate": 8.972102907114157e-06, "loss": 0.745, "step": 2769 }, { "epoch": 1.6405093278057448, "grad_norm": 1.0887898932988884, "learning_rate": 8.965741244856864e-06, "loss": 0.6943, "step": 2770 }, { "epoch": 1.6411015694403317, "grad_norm": 0.988196915871078, "learning_rate": 8.959380005669559e-06, "loss": 0.7085, "step": 2771 }, { "epoch": 1.6416938110749184, "grad_norm": 0.9375850317494164, "learning_rate": 8.953019192154344e-06, "loss": 0.7181, "step": 2772 }, { "epoch": 1.6422860527095056, "grad_norm": 1.0916414927857014, "learning_rate": 8.946658806913158e-06, "loss": 0.7182, "step": 2773 }, { "epoch": 1.6428782943440923, "grad_norm": 0.9559469293114459, "learning_rate": 8.940298852547753e-06, "loss": 0.6924, "step": 2774 }, { "epoch": 1.6434705359786794, "grad_norm": 0.9122282528732937, "learning_rate": 8.933939331659707e-06, "loss": 0.6802, "step": 2775 }, { "epoch": 1.6440627776132661, "grad_norm": 1.2827645783241866, "learning_rate": 8.927580246850418e-06, "loss": 0.7219, "step": 2776 }, { "epoch": 1.6446550192478533, "grad_norm": 1.0856897372943664, "learning_rate": 8.921221600721115e-06, "loss": 0.7749, "step": 2777 }, { "epoch": 1.64524726088244, "grad_norm": 0.9976872404160881, "learning_rate": 8.914863395872844e-06, "loss": 0.6998, "step": 2778 }, { "epoch": 1.645839502517027, "grad_norm": 0.9187575917239228, "learning_rate": 8.908505634906461e-06, "loss": 0.6714, "step": 2779 }, { "epoch": 1.6464317441516139, "grad_norm": 0.8887264946359307, "learning_rate": 8.90214832042265e-06, "loss": 0.7127, "step": 2780 }, { "epoch": 1.6470239857862008, "grad_norm": 0.8836649493280779, "learning_rate": 8.895791455021912e-06, "loss": 0.716, "step": 2781 }, { "epoch": 1.6476162274207877, "grad_norm": 0.9011624604992315, "learning_rate": 8.889435041304565e-06, "loss": 0.723, "step": 2782 }, { "epoch": 1.6482084690553744, "grad_norm": 1.2551214513482096, "learning_rate": 8.88307908187073e-06, "loss": 0.7301, "step": 2783 }, { "epoch": 1.6488007106899616, "grad_norm": 1.030561508128019, "learning_rate": 8.876723579320363e-06, "loss": 0.6991, "step": 2784 }, { "epoch": 1.6493929523245483, "grad_norm": 0.9450181046757289, "learning_rate": 8.870368536253213e-06, "loss": 0.7391, "step": 2785 }, { "epoch": 1.6499851939591355, "grad_norm": 1.0028902776005366, "learning_rate": 8.86401395526886e-06, "loss": 0.7461, "step": 2786 }, { "epoch": 1.6505774355937222, "grad_norm": 1.0849709154792493, "learning_rate": 8.857659838966672e-06, "loss": 0.7124, "step": 2787 }, { "epoch": 1.651169677228309, "grad_norm": 1.418277931700241, "learning_rate": 8.85130618994585e-06, "loss": 0.7166, "step": 2788 }, { "epoch": 1.651761918862896, "grad_norm": 1.0799657812191223, "learning_rate": 8.844953010805388e-06, "loss": 0.6659, "step": 2789 }, { "epoch": 1.652354160497483, "grad_norm": 0.9047344447663203, "learning_rate": 8.838600304144102e-06, "loss": 0.7237, "step": 2790 }, { "epoch": 1.65294640213207, "grad_norm": 0.8954991208705074, "learning_rate": 8.832248072560594e-06, "loss": 0.7404, "step": 2791 }, { "epoch": 1.6535386437666568, "grad_norm": 1.255225726733933, "learning_rate": 8.825896318653294e-06, "loss": 0.718, "step": 2792 }, { "epoch": 1.6541308854012438, "grad_norm": 1.8935384231370476, "learning_rate": 8.81954504502042e-06, "loss": 0.741, "step": 2793 }, { "epoch": 1.6547231270358305, "grad_norm": 0.8395781786009291, "learning_rate": 8.813194254260006e-06, "loss": 0.7077, "step": 2794 }, { "epoch": 1.6553153686704176, "grad_norm": 1.1680365597009021, "learning_rate": 8.806843948969875e-06, "loss": 0.7082, "step": 2795 }, { "epoch": 1.6559076103050043, "grad_norm": 0.912567977351368, "learning_rate": 8.800494131747667e-06, "loss": 0.7654, "step": 2796 }, { "epoch": 1.6564998519395915, "grad_norm": 2.4309897712239166, "learning_rate": 8.794144805190809e-06, "loss": 0.7536, "step": 2797 }, { "epoch": 1.6570920935741782, "grad_norm": 1.0403551057860285, "learning_rate": 8.787795971896536e-06, "loss": 0.7459, "step": 2798 }, { "epoch": 1.6576843352087651, "grad_norm": 1.5383317536648582, "learning_rate": 8.781447634461874e-06, "loss": 0.722, "step": 2799 }, { "epoch": 1.658276576843352, "grad_norm": 0.9088453416120332, "learning_rate": 8.775099795483651e-06, "loss": 0.7737, "step": 2800 }, { "epoch": 1.658868818477939, "grad_norm": 1.042835871852608, "learning_rate": 8.768752457558492e-06, "loss": 0.7334, "step": 2801 }, { "epoch": 1.659461060112526, "grad_norm": 1.0274279053461044, "learning_rate": 8.762405623282817e-06, "loss": 0.6733, "step": 2802 }, { "epoch": 1.6600533017471129, "grad_norm": 0.9918727284685397, "learning_rate": 8.756059295252833e-06, "loss": 0.7416, "step": 2803 }, { "epoch": 1.6606455433816998, "grad_norm": 1.306893253891884, "learning_rate": 8.749713476064547e-06, "loss": 0.7739, "step": 2804 }, { "epoch": 1.6612377850162865, "grad_norm": 1.2615372668097604, "learning_rate": 8.743368168313757e-06, "loss": 0.7619, "step": 2805 }, { "epoch": 1.6618300266508736, "grad_norm": 0.905731661272337, "learning_rate": 8.737023374596051e-06, "loss": 0.73, "step": 2806 }, { "epoch": 1.6624222682854604, "grad_norm": 1.609306011429277, "learning_rate": 8.730679097506804e-06, "loss": 0.7174, "step": 2807 }, { "epoch": 1.6630145099200475, "grad_norm": 1.294248981532454, "learning_rate": 8.724335339641185e-06, "loss": 0.7094, "step": 2808 }, { "epoch": 1.6636067515546342, "grad_norm": 0.939745188957189, "learning_rate": 8.717992103594142e-06, "loss": 0.6777, "step": 2809 }, { "epoch": 1.6641989931892212, "grad_norm": 1.3256782257737456, "learning_rate": 8.711649391960424e-06, "loss": 0.6926, "step": 2810 }, { "epoch": 1.664791234823808, "grad_norm": 0.8735845886685419, "learning_rate": 8.705307207334552e-06, "loss": 0.7247, "step": 2811 }, { "epoch": 1.665383476458395, "grad_norm": 1.0270651510259472, "learning_rate": 8.698965552310834e-06, "loss": 0.7213, "step": 2812 }, { "epoch": 1.665975718092982, "grad_norm": 0.8887392623106478, "learning_rate": 8.692624429483364e-06, "loss": 0.7645, "step": 2813 }, { "epoch": 1.6665679597275689, "grad_norm": 1.1018807997801454, "learning_rate": 8.686283841446027e-06, "loss": 0.7248, "step": 2814 }, { "epoch": 1.6671602013621558, "grad_norm": 0.9580382702616205, "learning_rate": 8.679943790792466e-06, "loss": 0.704, "step": 2815 }, { "epoch": 1.6677524429967425, "grad_norm": 1.775823863507453, "learning_rate": 8.673604280116127e-06, "loss": 0.7412, "step": 2816 }, { "epoch": 1.6683446846313297, "grad_norm": 0.8788118750792545, "learning_rate": 8.667265312010224e-06, "loss": 0.7352, "step": 2817 }, { "epoch": 1.6689369262659164, "grad_norm": 1.55508648257972, "learning_rate": 8.660926889067753e-06, "loss": 0.7357, "step": 2818 }, { "epoch": 1.6695291679005035, "grad_norm": 1.062045887571016, "learning_rate": 8.654589013881481e-06, "loss": 0.7618, "step": 2819 }, { "epoch": 1.6701214095350903, "grad_norm": 0.8983582875998769, "learning_rate": 8.648251689043961e-06, "loss": 0.7351, "step": 2820 }, { "epoch": 1.6707136511696772, "grad_norm": 1.03505281944072, "learning_rate": 8.641914917147512e-06, "loss": 0.7224, "step": 2821 }, { "epoch": 1.6713058928042641, "grad_norm": 0.9169880645190774, "learning_rate": 8.635578700784232e-06, "loss": 0.7079, "step": 2822 }, { "epoch": 1.671898134438851, "grad_norm": 1.2057396000296163, "learning_rate": 8.629243042545989e-06, "loss": 0.7403, "step": 2823 }, { "epoch": 1.672490376073438, "grad_norm": 1.0465851255608474, "learning_rate": 8.622907945024418e-06, "loss": 0.7507, "step": 2824 }, { "epoch": 1.673082617708025, "grad_norm": 0.9218356861943021, "learning_rate": 8.616573410810938e-06, "loss": 0.6975, "step": 2825 }, { "epoch": 1.6736748593426118, "grad_norm": 1.1722796718213668, "learning_rate": 8.61023944249673e-06, "loss": 0.6915, "step": 2826 }, { "epoch": 1.6742671009771986, "grad_norm": 1.0126608768047232, "learning_rate": 8.603906042672738e-06, "loss": 0.7349, "step": 2827 }, { "epoch": 1.6748593426117857, "grad_norm": 0.8864132486291652, "learning_rate": 8.597573213929677e-06, "loss": 0.7473, "step": 2828 }, { "epoch": 1.6754515842463724, "grad_norm": 0.8297726828669127, "learning_rate": 8.591240958858036e-06, "loss": 0.6998, "step": 2829 }, { "epoch": 1.6760438258809596, "grad_norm": 1.0399711698669063, "learning_rate": 8.584909280048064e-06, "loss": 0.6883, "step": 2830 }, { "epoch": 1.6766360675155463, "grad_norm": 1.4494905018858395, "learning_rate": 8.57857818008977e-06, "loss": 0.7302, "step": 2831 }, { "epoch": 1.6772283091501332, "grad_norm": 2.2622801471005656, "learning_rate": 8.572247661572926e-06, "loss": 0.7195, "step": 2832 }, { "epoch": 1.6778205507847201, "grad_norm": 0.9090049659373163, "learning_rate": 8.565917727087078e-06, "loss": 0.6993, "step": 2833 }, { "epoch": 1.678412792419307, "grad_norm": 1.0718271118863054, "learning_rate": 8.559588379221525e-06, "loss": 0.7332, "step": 2834 }, { "epoch": 1.679005034053894, "grad_norm": 1.3513550671215728, "learning_rate": 8.55325962056532e-06, "loss": 0.7081, "step": 2835 }, { "epoch": 1.679597275688481, "grad_norm": 1.2868537736190298, "learning_rate": 8.546931453707285e-06, "loss": 0.7261, "step": 2836 }, { "epoch": 1.6801895173230679, "grad_norm": 0.8991809072064495, "learning_rate": 8.540603881235993e-06, "loss": 0.7581, "step": 2837 }, { "epoch": 1.6807817589576546, "grad_norm": 0.9833047762425767, "learning_rate": 8.534276905739783e-06, "loss": 0.7371, "step": 2838 }, { "epoch": 1.6813740005922417, "grad_norm": 1.3090020706900172, "learning_rate": 8.527950529806739e-06, "loss": 0.7371, "step": 2839 }, { "epoch": 1.6819662422268284, "grad_norm": 1.3428401139365922, "learning_rate": 8.521624756024706e-06, "loss": 0.7124, "step": 2840 }, { "epoch": 1.6825584838614156, "grad_norm": 0.9822516800617229, "learning_rate": 8.515299586981278e-06, "loss": 0.7122, "step": 2841 }, { "epoch": 1.6831507254960023, "grad_norm": 1.5404692861197815, "learning_rate": 8.508975025263814e-06, "loss": 0.7303, "step": 2842 }, { "epoch": 1.6837429671305892, "grad_norm": 0.946622538768465, "learning_rate": 8.502651073459403e-06, "loss": 0.7374, "step": 2843 }, { "epoch": 1.6843352087651762, "grad_norm": 1.1084910982075946, "learning_rate": 8.496327734154905e-06, "loss": 0.6946, "step": 2844 }, { "epoch": 1.684927450399763, "grad_norm": 0.8603924254623376, "learning_rate": 8.490005009936918e-06, "loss": 0.7159, "step": 2845 }, { "epoch": 1.68551969203435, "grad_norm": 1.0846672515700364, "learning_rate": 8.483682903391796e-06, "loss": 0.7591, "step": 2846 }, { "epoch": 1.686111933668937, "grad_norm": 1.1750404325244188, "learning_rate": 8.477361417105631e-06, "loss": 0.7344, "step": 2847 }, { "epoch": 1.686704175303524, "grad_norm": 1.0501143321182151, "learning_rate": 8.471040553664269e-06, "loss": 0.7063, "step": 2848 }, { "epoch": 1.6872964169381106, "grad_norm": 1.2763094400193624, "learning_rate": 8.464720315653298e-06, "loss": 0.6708, "step": 2849 }, { "epoch": 1.6878886585726978, "grad_norm": 1.111124108113765, "learning_rate": 8.458400705658051e-06, "loss": 0.7201, "step": 2850 }, { "epoch": 1.6884809002072845, "grad_norm": 0.9062869224487132, "learning_rate": 8.452081726263604e-06, "loss": 0.7048, "step": 2851 }, { "epoch": 1.6890731418418716, "grad_norm": 1.2248497508498524, "learning_rate": 8.445763380054773e-06, "loss": 0.7171, "step": 2852 }, { "epoch": 1.6896653834764583, "grad_norm": 1.6317101564085743, "learning_rate": 8.43944566961612e-06, "loss": 0.6921, "step": 2853 }, { "epoch": 1.6902576251110453, "grad_norm": 0.8594789130404841, "learning_rate": 8.433128597531943e-06, "loss": 0.7166, "step": 2854 }, { "epoch": 1.6908498667456322, "grad_norm": 5.253886253022391, "learning_rate": 8.426812166386278e-06, "loss": 0.6991, "step": 2855 }, { "epoch": 1.6914421083802191, "grad_norm": 1.0371352611636877, "learning_rate": 8.420496378762901e-06, "loss": 0.7239, "step": 2856 }, { "epoch": 1.692034350014806, "grad_norm": 0.8825325351256947, "learning_rate": 8.414181237245324e-06, "loss": 0.7318, "step": 2857 }, { "epoch": 1.692626591649393, "grad_norm": 1.0387049681798395, "learning_rate": 8.407866744416801e-06, "loss": 0.7132, "step": 2858 }, { "epoch": 1.69321883328398, "grad_norm": 1.0193831157896063, "learning_rate": 8.401552902860306e-06, "loss": 0.7193, "step": 2859 }, { "epoch": 1.6938110749185666, "grad_norm": 1.2765032613192198, "learning_rate": 8.395239715158558e-06, "loss": 0.7425, "step": 2860 }, { "epoch": 1.6944033165531538, "grad_norm": 1.0423155817988132, "learning_rate": 8.388927183894005e-06, "loss": 0.7557, "step": 2861 }, { "epoch": 1.6949955581877405, "grad_norm": 1.2949288338272016, "learning_rate": 8.382615311648833e-06, "loss": 0.7196, "step": 2862 }, { "epoch": 1.6955877998223277, "grad_norm": 1.203764230621225, "learning_rate": 8.376304101004946e-06, "loss": 0.7589, "step": 2863 }, { "epoch": 1.6961800414569144, "grad_norm": 1.1439950575179505, "learning_rate": 8.369993554543987e-06, "loss": 0.7518, "step": 2864 }, { "epoch": 1.6967722830915013, "grad_norm": 1.768457656980784, "learning_rate": 8.363683674847323e-06, "loss": 0.7382, "step": 2865 }, { "epoch": 1.6973645247260882, "grad_norm": 1.0904038835740486, "learning_rate": 8.357374464496056e-06, "loss": 0.7482, "step": 2866 }, { "epoch": 1.6979567663606752, "grad_norm": 1.4275207295309171, "learning_rate": 8.351065926070994e-06, "loss": 0.7192, "step": 2867 }, { "epoch": 1.698549007995262, "grad_norm": 0.8975759750455716, "learning_rate": 8.344758062152696e-06, "loss": 0.7208, "step": 2868 }, { "epoch": 1.699141249629849, "grad_norm": 0.9962272566707975, "learning_rate": 8.338450875321428e-06, "loss": 0.7174, "step": 2869 }, { "epoch": 1.699733491264436, "grad_norm": 0.9458627764840206, "learning_rate": 8.332144368157192e-06, "loss": 0.7004, "step": 2870 }, { "epoch": 1.7003257328990227, "grad_norm": 0.9243786772003465, "learning_rate": 8.325838543239688e-06, "loss": 0.7212, "step": 2871 }, { "epoch": 1.7009179745336098, "grad_norm": 1.095152314018298, "learning_rate": 8.319533403148368e-06, "loss": 0.7413, "step": 2872 }, { "epoch": 1.7015102161681965, "grad_norm": 1.179387764922565, "learning_rate": 8.31322895046238e-06, "loss": 0.7317, "step": 2873 }, { "epoch": 1.7021024578027837, "grad_norm": 1.0825336416478861, "learning_rate": 8.306925187760608e-06, "loss": 0.7383, "step": 2874 }, { "epoch": 1.7026946994373704, "grad_norm": 1.1435368543161897, "learning_rate": 8.300622117621634e-06, "loss": 0.7225, "step": 2875 }, { "epoch": 1.7032869410719573, "grad_norm": 1.0508503526183521, "learning_rate": 8.29431974262378e-06, "loss": 0.7231, "step": 2876 }, { "epoch": 1.7038791827065443, "grad_norm": 2.6026780734301576, "learning_rate": 8.288018065345063e-06, "loss": 0.7578, "step": 2877 }, { "epoch": 1.7044714243411312, "grad_norm": 1.036737957257567, "learning_rate": 8.28171708836323e-06, "loss": 0.727, "step": 2878 }, { "epoch": 1.7050636659757181, "grad_norm": 1.2395033785247513, "learning_rate": 8.275416814255731e-06, "loss": 0.7323, "step": 2879 }, { "epoch": 1.705655907610305, "grad_norm": 5.3070838585377915, "learning_rate": 8.269117245599729e-06, "loss": 0.7276, "step": 2880 }, { "epoch": 1.706248149244892, "grad_norm": 1.9790319279693616, "learning_rate": 8.262818384972108e-06, "loss": 0.7232, "step": 2881 }, { "epoch": 1.7068403908794787, "grad_norm": 0.9027953973682145, "learning_rate": 8.256520234949456e-06, "loss": 0.7473, "step": 2882 }, { "epoch": 1.7074326325140659, "grad_norm": 1.0026460097696173, "learning_rate": 8.250222798108068e-06, "loss": 0.6998, "step": 2883 }, { "epoch": 1.7080248741486526, "grad_norm": 1.4657670953683113, "learning_rate": 8.243926077023945e-06, "loss": 0.663, "step": 2884 }, { "epoch": 1.7086171157832397, "grad_norm": 1.0969379355734514, "learning_rate": 8.23763007427281e-06, "loss": 0.7078, "step": 2885 }, { "epoch": 1.7092093574178264, "grad_norm": 0.967745297644781, "learning_rate": 8.23133479243008e-06, "loss": 0.7153, "step": 2886 }, { "epoch": 1.7098015990524134, "grad_norm": 1.2060481495888675, "learning_rate": 8.225040234070873e-06, "loss": 0.7624, "step": 2887 }, { "epoch": 1.7103938406870003, "grad_norm": 2.120123919682014, "learning_rate": 8.218746401770021e-06, "loss": 0.7574, "step": 2888 }, { "epoch": 1.7109860823215872, "grad_norm": 1.1153560787494186, "learning_rate": 8.212453298102054e-06, "loss": 0.7589, "step": 2889 }, { "epoch": 1.7115783239561742, "grad_norm": 1.5107438969118132, "learning_rate": 8.206160925641211e-06, "loss": 0.7536, "step": 2890 }, { "epoch": 1.712170565590761, "grad_norm": 1.054181917285995, "learning_rate": 8.19986928696142e-06, "loss": 0.7439, "step": 2891 }, { "epoch": 1.712762807225348, "grad_norm": 1.530532649227264, "learning_rate": 8.193578384636317e-06, "loss": 0.7304, "step": 2892 }, { "epoch": 1.7133550488599347, "grad_norm": 1.4100039607836896, "learning_rate": 8.187288221239232e-06, "loss": 0.7115, "step": 2893 }, { "epoch": 1.7139472904945219, "grad_norm": 1.3653865280730575, "learning_rate": 8.180998799343203e-06, "loss": 0.6963, "step": 2894 }, { "epoch": 1.7145395321291086, "grad_norm": 1.3761845598375588, "learning_rate": 8.17471012152095e-06, "loss": 0.7766, "step": 2895 }, { "epoch": 1.7151317737636957, "grad_norm": 0.9896735131841189, "learning_rate": 8.168422190344896e-06, "loss": 0.7104, "step": 2896 }, { "epoch": 1.7157240153982825, "grad_norm": 1.8180792395206247, "learning_rate": 8.162135008387164e-06, "loss": 0.7355, "step": 2897 }, { "epoch": 1.7163162570328694, "grad_norm": 1.2917397756929976, "learning_rate": 8.155848578219563e-06, "loss": 0.7736, "step": 2898 }, { "epoch": 1.7169084986674563, "grad_norm": 1.2049090200411698, "learning_rate": 8.14956290241359e-06, "loss": 0.7332, "step": 2899 }, { "epoch": 1.7175007403020432, "grad_norm": 2.413612490345052, "learning_rate": 8.14327798354045e-06, "loss": 0.7142, "step": 2900 }, { "epoch": 1.7180929819366302, "grad_norm": 1.0571402249668937, "learning_rate": 8.136993824171019e-06, "loss": 0.7277, "step": 2901 }, { "epoch": 1.718685223571217, "grad_norm": 1.2201789441380906, "learning_rate": 8.130710426875881e-06, "loss": 0.7176, "step": 2902 }, { "epoch": 1.719277465205804, "grad_norm": 1.041688994211538, "learning_rate": 8.12442779422529e-06, "loss": 0.6969, "step": 2903 }, { "epoch": 1.7198697068403908, "grad_norm": 1.1679307028654387, "learning_rate": 8.118145928789198e-06, "loss": 0.6998, "step": 2904 }, { "epoch": 1.720461948474978, "grad_norm": 2.7733038021139875, "learning_rate": 8.111864833137246e-06, "loss": 0.6915, "step": 2905 }, { "epoch": 1.7210541901095646, "grad_norm": 1.5759312229276083, "learning_rate": 8.105584509838754e-06, "loss": 0.7227, "step": 2906 }, { "epoch": 1.7216464317441518, "grad_norm": 1.3336882101988723, "learning_rate": 8.099304961462722e-06, "loss": 0.741, "step": 2907 }, { "epoch": 1.7222386733787385, "grad_norm": 1.2123410041999985, "learning_rate": 8.093026190577839e-06, "loss": 0.7105, "step": 2908 }, { "epoch": 1.7228309150133254, "grad_norm": 1.0373687336456456, "learning_rate": 8.086748199752483e-06, "loss": 0.6718, "step": 2909 }, { "epoch": 1.7234231566479123, "grad_norm": 2.863392250283223, "learning_rate": 8.080470991554703e-06, "loss": 0.7303, "step": 2910 }, { "epoch": 1.7240153982824993, "grad_norm": 29.798561463464726, "learning_rate": 8.074194568552224e-06, "loss": 0.7341, "step": 2911 }, { "epoch": 1.7246076399170862, "grad_norm": 1.3629065680833, "learning_rate": 8.067918933312459e-06, "loss": 0.7307, "step": 2912 }, { "epoch": 1.725199881551673, "grad_norm": 1.0966158414516722, "learning_rate": 8.061644088402499e-06, "loss": 0.7015, "step": 2913 }, { "epoch": 1.72579212318626, "grad_norm": 1.270462849175571, "learning_rate": 8.055370036389105e-06, "loss": 0.7317, "step": 2914 }, { "epoch": 1.7263843648208468, "grad_norm": 1.1763045807935812, "learning_rate": 8.04909677983872e-06, "loss": 0.7091, "step": 2915 }, { "epoch": 1.726976606455434, "grad_norm": 3.4592157510743173, "learning_rate": 8.042824321317453e-06, "loss": 0.7266, "step": 2916 }, { "epoch": 1.7275688480900206, "grad_norm": 1.1717886947291323, "learning_rate": 8.036552663391099e-06, "loss": 0.7113, "step": 2917 }, { "epoch": 1.7281610897246078, "grad_norm": 4.1475058496080965, "learning_rate": 8.030281808625114e-06, "loss": 0.7183, "step": 2918 }, { "epoch": 1.7287533313591945, "grad_norm": 1.5657993451062597, "learning_rate": 8.02401175958463e-06, "loss": 0.7218, "step": 2919 }, { "epoch": 1.7293455729937814, "grad_norm": 1.070325978428891, "learning_rate": 8.017742518834454e-06, "loss": 0.723, "step": 2920 }, { "epoch": 1.7299378146283684, "grad_norm": 1.1119220102928689, "learning_rate": 8.011474088939056e-06, "loss": 0.6992, "step": 2921 }, { "epoch": 1.7305300562629553, "grad_norm": 1.3076317154786299, "learning_rate": 8.005206472462576e-06, "loss": 0.7237, "step": 2922 }, { "epoch": 1.7311222978975422, "grad_norm": 1.2441146503285951, "learning_rate": 7.998939671968817e-06, "loss": 0.7263, "step": 2923 }, { "epoch": 1.731714539532129, "grad_norm": 1.3530915910623935, "learning_rate": 7.99267369002126e-06, "loss": 0.7264, "step": 2924 }, { "epoch": 1.732306781166716, "grad_norm": 1.1749639635147615, "learning_rate": 7.986408529183045e-06, "loss": 0.7354, "step": 2925 }, { "epoch": 1.7328990228013028, "grad_norm": 0.9443164977742067, "learning_rate": 7.980144192016967e-06, "loss": 0.7564, "step": 2926 }, { "epoch": 1.73349126443589, "grad_norm": 1.2969219620095116, "learning_rate": 7.973880681085495e-06, "loss": 0.7822, "step": 2927 }, { "epoch": 1.7340835060704767, "grad_norm": 1.3340663571052536, "learning_rate": 7.967617998950762e-06, "loss": 0.7495, "step": 2928 }, { "epoch": 1.7346757477050638, "grad_norm": 1.2273379932145483, "learning_rate": 7.961356148174554e-06, "loss": 0.7072, "step": 2929 }, { "epoch": 1.7352679893396505, "grad_norm": 1.3141531019330541, "learning_rate": 7.955095131318319e-06, "loss": 0.7258, "step": 2930 }, { "epoch": 1.7358602309742375, "grad_norm": 1.203363496166313, "learning_rate": 7.948834950943165e-06, "loss": 0.721, "step": 2931 }, { "epoch": 1.7364524726088244, "grad_norm": 1.1742529128764918, "learning_rate": 7.942575609609857e-06, "loss": 0.7185, "step": 2932 }, { "epoch": 1.7370447142434113, "grad_norm": 1.428016405572957, "learning_rate": 7.936317109878824e-06, "loss": 0.7647, "step": 2933 }, { "epoch": 1.7376369558779983, "grad_norm": 1.2790352849117015, "learning_rate": 7.930059454310138e-06, "loss": 0.7581, "step": 2934 }, { "epoch": 1.738229197512585, "grad_norm": 0.9761646608961596, "learning_rate": 7.923802645463532e-06, "loss": 0.7217, "step": 2935 }, { "epoch": 1.7388214391471721, "grad_norm": 1.5751448044306215, "learning_rate": 7.917546685898393e-06, "loss": 0.6937, "step": 2936 }, { "epoch": 1.7394136807817588, "grad_norm": 1.1640978244444167, "learning_rate": 7.911291578173767e-06, "loss": 0.6968, "step": 2937 }, { "epoch": 1.740005922416346, "grad_norm": 1.4304427410478942, "learning_rate": 7.905037324848334e-06, "loss": 0.7239, "step": 2938 }, { "epoch": 1.7405981640509327, "grad_norm": 0.9786975945632999, "learning_rate": 7.898783928480442e-06, "loss": 0.744, "step": 2939 }, { "epoch": 1.7411904056855196, "grad_norm": 1.206680722391421, "learning_rate": 7.89253139162808e-06, "loss": 0.732, "step": 2940 }, { "epoch": 1.7417826473201066, "grad_norm": 1.9134032463262014, "learning_rate": 7.88627971684889e-06, "loss": 0.7539, "step": 2941 }, { "epoch": 1.7423748889546935, "grad_norm": 1.7876317020070605, "learning_rate": 7.880028906700153e-06, "loss": 0.7261, "step": 2942 }, { "epoch": 1.7429671305892804, "grad_norm": 1.2450546265182965, "learning_rate": 7.873778963738806e-06, "loss": 0.692, "step": 2943 }, { "epoch": 1.7435593722238674, "grad_norm": 1.273671537217201, "learning_rate": 7.867529890521424e-06, "loss": 0.6972, "step": 2944 }, { "epoch": 1.7441516138584543, "grad_norm": 0.9478991818498798, "learning_rate": 7.861281689604237e-06, "loss": 0.7267, "step": 2945 }, { "epoch": 1.744743855493041, "grad_norm": 0.9418695622023932, "learning_rate": 7.8550343635431e-06, "loss": 0.7253, "step": 2946 }, { "epoch": 1.7453360971276282, "grad_norm": 1.5972561604224107, "learning_rate": 7.848787914893525e-06, "loss": 0.719, "step": 2947 }, { "epoch": 1.7459283387622149, "grad_norm": 1.6316651832184046, "learning_rate": 7.842542346210663e-06, "loss": 0.6981, "step": 2948 }, { "epoch": 1.746520580396802, "grad_norm": 1.1436403440246294, "learning_rate": 7.836297660049303e-06, "loss": 0.7392, "step": 2949 }, { "epoch": 1.7471128220313887, "grad_norm": 1.1760242256445133, "learning_rate": 7.83005385896387e-06, "loss": 0.7368, "step": 2950 }, { "epoch": 1.7477050636659757, "grad_norm": 1.0420980744058013, "learning_rate": 7.823810945508427e-06, "loss": 0.7202, "step": 2951 }, { "epoch": 1.7482973053005626, "grad_norm": 1.236600456703946, "learning_rate": 7.817568922236683e-06, "loss": 0.7232, "step": 2952 }, { "epoch": 1.7488895469351495, "grad_norm": 1.1503525583737613, "learning_rate": 7.811327791701977e-06, "loss": 0.7134, "step": 2953 }, { "epoch": 1.7494817885697365, "grad_norm": 2.7772710614120597, "learning_rate": 7.805087556457275e-06, "loss": 0.6838, "step": 2954 }, { "epoch": 1.7500740302043234, "grad_norm": 1.5536873224508294, "learning_rate": 7.798848219055189e-06, "loss": 0.7342, "step": 2955 }, { "epoch": 1.7506662718389103, "grad_norm": 1.3445549621095612, "learning_rate": 7.792609782047958e-06, "loss": 0.7372, "step": 2956 }, { "epoch": 1.751258513473497, "grad_norm": 1.0834091497432776, "learning_rate": 7.786372247987454e-06, "loss": 0.6987, "step": 2957 }, { "epoch": 1.7518507551080842, "grad_norm": 3.789898070049763, "learning_rate": 7.78013561942518e-06, "loss": 0.7085, "step": 2958 }, { "epoch": 1.752442996742671, "grad_norm": 1.2154346445934634, "learning_rate": 7.773899898912266e-06, "loss": 0.7192, "step": 2959 }, { "epoch": 1.753035238377258, "grad_norm": 1.3478249479015387, "learning_rate": 7.76766508899947e-06, "loss": 0.7343, "step": 2960 }, { "epoch": 1.7536274800118448, "grad_norm": 1.4976847760098018, "learning_rate": 7.761431192237192e-06, "loss": 0.7646, "step": 2961 }, { "epoch": 1.7542197216464317, "grad_norm": 1.81752309263868, "learning_rate": 7.755198211175428e-06, "loss": 0.7278, "step": 2962 }, { "epoch": 1.7548119632810186, "grad_norm": 1.1597990895135137, "learning_rate": 7.74896614836383e-06, "loss": 0.7343, "step": 2963 }, { "epoch": 1.7554042049156056, "grad_norm": 1.5225622446141434, "learning_rate": 7.742735006351656e-06, "loss": 0.7068, "step": 2964 }, { "epoch": 1.7559964465501925, "grad_norm": 1.1131198166318628, "learning_rate": 7.736504787687804e-06, "loss": 0.6979, "step": 2965 }, { "epoch": 1.7565886881847794, "grad_norm": 1.236323876399398, "learning_rate": 7.73027549492077e-06, "loss": 0.7224, "step": 2966 }, { "epoch": 1.7571809298193664, "grad_norm": 1.2400368319010493, "learning_rate": 7.724047130598692e-06, "loss": 0.6906, "step": 2967 }, { "epoch": 1.757773171453953, "grad_norm": 1.28829954217693, "learning_rate": 7.717819697269322e-06, "loss": 0.6975, "step": 2968 }, { "epoch": 1.7583654130885402, "grad_norm": 0.9178925212400618, "learning_rate": 7.711593197480031e-06, "loss": 0.7548, "step": 2969 }, { "epoch": 1.758957654723127, "grad_norm": 1.3227709395440883, "learning_rate": 7.7053676337778e-06, "loss": 0.7114, "step": 2970 }, { "epoch": 1.759549896357714, "grad_norm": 0.9702755831253254, "learning_rate": 7.699143008709245e-06, "loss": 0.7199, "step": 2971 }, { "epoch": 1.7601421379923008, "grad_norm": 0.9961523350834506, "learning_rate": 7.69291932482058e-06, "loss": 0.7949, "step": 2972 }, { "epoch": 1.7607343796268877, "grad_norm": 1.2207456078535526, "learning_rate": 7.686696584657649e-06, "loss": 0.743, "step": 2973 }, { "epoch": 1.7613266212614747, "grad_norm": 1.3380369368359777, "learning_rate": 7.680474790765895e-06, "loss": 0.6948, "step": 2974 }, { "epoch": 1.7619188628960616, "grad_norm": 0.8789763724986712, "learning_rate": 7.674253945690383e-06, "loss": 0.6813, "step": 2975 }, { "epoch": 1.7625111045306485, "grad_norm": 1.0544272650985724, "learning_rate": 7.668034051975793e-06, "loss": 0.6844, "step": 2976 }, { "epoch": 1.7631033461652355, "grad_norm": 1.257614243686341, "learning_rate": 7.661815112166408e-06, "loss": 0.7071, "step": 2977 }, { "epoch": 1.7636955877998224, "grad_norm": 1.0368071883700545, "learning_rate": 7.655597128806125e-06, "loss": 0.7018, "step": 2978 }, { "epoch": 1.764287829434409, "grad_norm": 1.5446506502619204, "learning_rate": 7.649380104438446e-06, "loss": 0.7544, "step": 2979 }, { "epoch": 1.7648800710689962, "grad_norm": 0.990885528622176, "learning_rate": 7.643164041606489e-06, "loss": 0.7179, "step": 2980 }, { "epoch": 1.765472312703583, "grad_norm": 0.9953972053282248, "learning_rate": 7.63694894285297e-06, "loss": 0.7065, "step": 2981 }, { "epoch": 1.7660645543381701, "grad_norm": 6.360092353723965, "learning_rate": 7.630734810720212e-06, "loss": 0.708, "step": 2982 }, { "epoch": 1.7666567959727568, "grad_norm": 2.422095125571542, "learning_rate": 7.624521647750149e-06, "loss": 0.7443, "step": 2983 }, { "epoch": 1.7672490376073438, "grad_norm": 1.0870002901322164, "learning_rate": 7.618309456484309e-06, "loss": 0.7297, "step": 2984 }, { "epoch": 1.7678412792419307, "grad_norm": 2.3370356729061803, "learning_rate": 7.612098239463833e-06, "loss": 0.718, "step": 2985 }, { "epoch": 1.7684335208765176, "grad_norm": 1.2080348447951503, "learning_rate": 7.605887999229454e-06, "loss": 0.72, "step": 2986 }, { "epoch": 1.7690257625111045, "grad_norm": 1.0232344898785648, "learning_rate": 7.599678738321512e-06, "loss": 0.7236, "step": 2987 }, { "epoch": 1.7696180041456915, "grad_norm": 1.347583694196258, "learning_rate": 7.593470459279939e-06, "loss": 0.7651, "step": 2988 }, { "epoch": 1.7702102457802784, "grad_norm": 1.4355786367789864, "learning_rate": 7.58726316464428e-06, "loss": 0.7125, "step": 2989 }, { "epoch": 1.7708024874148651, "grad_norm": 1.0074909388188276, "learning_rate": 7.581056856953656e-06, "loss": 0.7373, "step": 2990 }, { "epoch": 1.7713947290494523, "grad_norm": 1.06347341429192, "learning_rate": 7.574851538746802e-06, "loss": 0.7273, "step": 2991 }, { "epoch": 1.771986970684039, "grad_norm": 1.4128916933234186, "learning_rate": 7.568647212562043e-06, "loss": 0.7169, "step": 2992 }, { "epoch": 1.7725792123186261, "grad_norm": 0.9450378936333973, "learning_rate": 7.562443880937297e-06, "loss": 0.7388, "step": 2993 }, { "epoch": 1.7731714539532129, "grad_norm": 0.9163136638470174, "learning_rate": 7.55624154641007e-06, "loss": 0.7219, "step": 2994 }, { "epoch": 1.7737636955877998, "grad_norm": 1.324578116509981, "learning_rate": 7.550040211517472e-06, "loss": 0.7602, "step": 2995 }, { "epoch": 1.7743559372223867, "grad_norm": 1.1787072674151278, "learning_rate": 7.543839878796195e-06, "loss": 0.7183, "step": 2996 }, { "epoch": 1.7749481788569736, "grad_norm": 3.8914286195953993, "learning_rate": 7.537640550782527e-06, "loss": 0.7204, "step": 2997 }, { "epoch": 1.7755404204915606, "grad_norm": 1.0219901640955456, "learning_rate": 7.531442230012336e-06, "loss": 0.7438, "step": 2998 }, { "epoch": 1.7761326621261475, "grad_norm": 1.2855857922477254, "learning_rate": 7.525244919021084e-06, "loss": 0.738, "step": 2999 }, { "epoch": 1.7767249037607344, "grad_norm": 2.344182476915904, "learning_rate": 7.519048620343825e-06, "loss": 0.7248, "step": 3000 }, { "epoch": 1.7773171453953212, "grad_norm": 1.1756563315618664, "learning_rate": 7.512853336515193e-06, "loss": 0.7267, "step": 3001 }, { "epoch": 1.7779093870299083, "grad_norm": 0.9661265454991339, "learning_rate": 7.506659070069404e-06, "loss": 0.6421, "step": 3002 }, { "epoch": 1.778501628664495, "grad_norm": 2.2188327107977934, "learning_rate": 7.5004658235402594e-06, "loss": 0.7092, "step": 3003 }, { "epoch": 1.7790938702990822, "grad_norm": 2.792499182936962, "learning_rate": 7.494273599461153e-06, "loss": 0.7787, "step": 3004 }, { "epoch": 1.7796861119336689, "grad_norm": 1.4377298749803848, "learning_rate": 7.4880824003650475e-06, "loss": 0.7323, "step": 3005 }, { "epoch": 1.7802783535682558, "grad_norm": 1.1084690396732224, "learning_rate": 7.481892228784491e-06, "loss": 0.728, "step": 3006 }, { "epoch": 1.7808705952028427, "grad_norm": 1.375063278971446, "learning_rate": 7.475703087251611e-06, "loss": 0.7102, "step": 3007 }, { "epoch": 1.7814628368374297, "grad_norm": 1.251032911053362, "learning_rate": 7.469514978298119e-06, "loss": 0.7353, "step": 3008 }, { "epoch": 1.7820550784720166, "grad_norm": 0.8237693730719295, "learning_rate": 7.463327904455299e-06, "loss": 0.6709, "step": 3009 }, { "epoch": 1.7826473201066035, "grad_norm": 1.3285542970533006, "learning_rate": 7.457141868254007e-06, "loss": 0.7046, "step": 3010 }, { "epoch": 1.7832395617411905, "grad_norm": 2.4110688518280003, "learning_rate": 7.450956872224684e-06, "loss": 0.6963, "step": 3011 }, { "epoch": 1.7838318033757772, "grad_norm": 1.0311760681825095, "learning_rate": 7.444772918897336e-06, "loss": 0.7458, "step": 3012 }, { "epoch": 1.7844240450103643, "grad_norm": 3.042245299524146, "learning_rate": 7.438590010801558e-06, "loss": 0.7007, "step": 3013 }, { "epoch": 1.785016286644951, "grad_norm": 1.1638757581822774, "learning_rate": 7.432408150466497e-06, "loss": 0.727, "step": 3014 }, { "epoch": 1.7856085282795382, "grad_norm": 1.8489857858233996, "learning_rate": 7.426227340420886e-06, "loss": 0.7187, "step": 3015 }, { "epoch": 1.786200769914125, "grad_norm": 1.8469830538002985, "learning_rate": 7.42004758319302e-06, "loss": 0.7172, "step": 3016 }, { "epoch": 1.7867930115487118, "grad_norm": 1.411130583258253, "learning_rate": 7.413868881310778e-06, "loss": 0.7044, "step": 3017 }, { "epoch": 1.7873852531832988, "grad_norm": 1.2920931978075625, "learning_rate": 7.40769123730158e-06, "loss": 0.7222, "step": 3018 }, { "epoch": 1.7879774948178857, "grad_norm": 1.3026695316857428, "learning_rate": 7.401514653692442e-06, "loss": 0.7283, "step": 3019 }, { "epoch": 1.7885697364524726, "grad_norm": 1.002137271655342, "learning_rate": 7.395339133009931e-06, "loss": 0.7016, "step": 3020 }, { "epoch": 1.7891619780870596, "grad_norm": 1.755038606150744, "learning_rate": 7.3891646777801826e-06, "loss": 0.7114, "step": 3021 }, { "epoch": 1.7897542197216465, "grad_norm": 1.0069551996461956, "learning_rate": 7.382991290528892e-06, "loss": 0.7261, "step": 3022 }, { "epoch": 1.7903464613562332, "grad_norm": 1.144265082942235, "learning_rate": 7.376818973781328e-06, "loss": 0.6922, "step": 3023 }, { "epoch": 1.7909387029908204, "grad_norm": 0.9697054263146545, "learning_rate": 7.370647730062311e-06, "loss": 0.7872, "step": 3024 }, { "epoch": 1.791530944625407, "grad_norm": 1.0689276075884193, "learning_rate": 7.364477561896231e-06, "loss": 0.6867, "step": 3025 }, { "epoch": 1.7921231862599942, "grad_norm": 0.9895555328791956, "learning_rate": 7.358308471807028e-06, "loss": 0.7047, "step": 3026 }, { "epoch": 1.792715427894581, "grad_norm": 1.426959406213989, "learning_rate": 7.3521404623182065e-06, "loss": 0.6768, "step": 3027 }, { "epoch": 1.7933076695291679, "grad_norm": 1.6038922032905296, "learning_rate": 7.3459735359528366e-06, "loss": 0.7169, "step": 3028 }, { "epoch": 1.7938999111637548, "grad_norm": 1.2100286028193288, "learning_rate": 7.339807695233534e-06, "loss": 0.732, "step": 3029 }, { "epoch": 1.7944921527983417, "grad_norm": 2.091644826171607, "learning_rate": 7.333642942682473e-06, "loss": 0.7597, "step": 3030 }, { "epoch": 1.7950843944329287, "grad_norm": 0.9734538843147006, "learning_rate": 7.327479280821381e-06, "loss": 0.6885, "step": 3031 }, { "epoch": 1.7956766360675156, "grad_norm": 0.9935775103575984, "learning_rate": 7.3213167121715514e-06, "loss": 0.7143, "step": 3032 }, { "epoch": 1.7962688777021025, "grad_norm": 1.2604409769783473, "learning_rate": 7.315155239253815e-06, "loss": 0.7302, "step": 3033 }, { "epoch": 1.7968611193366892, "grad_norm": 1.1543012632612109, "learning_rate": 7.308994864588562e-06, "loss": 0.7388, "step": 3034 }, { "epoch": 1.7974533609712764, "grad_norm": 1.242912338801917, "learning_rate": 7.302835590695731e-06, "loss": 0.732, "step": 3035 }, { "epoch": 1.798045602605863, "grad_norm": 0.9047047765521495, "learning_rate": 7.296677420094811e-06, "loss": 0.6763, "step": 3036 }, { "epoch": 1.7986378442404503, "grad_norm": 1.0900389528678076, "learning_rate": 7.290520355304844e-06, "loss": 0.7187, "step": 3037 }, { "epoch": 1.799230085875037, "grad_norm": 1.1766868265659325, "learning_rate": 7.284364398844412e-06, "loss": 0.7759, "step": 3038 }, { "epoch": 1.799822327509624, "grad_norm": 0.969905582098005, "learning_rate": 7.2782095532316486e-06, "loss": 0.6934, "step": 3039 }, { "epoch": 1.8004145691442108, "grad_norm": 1.0483421180595986, "learning_rate": 7.27205582098423e-06, "loss": 0.6879, "step": 3040 }, { "epoch": 1.8010068107787978, "grad_norm": 1.4279823666510263, "learning_rate": 7.265903204619386e-06, "loss": 0.6903, "step": 3041 }, { "epoch": 1.8015990524133847, "grad_norm": 1.036318973747682, "learning_rate": 7.25975170665387e-06, "loss": 0.7208, "step": 3042 }, { "epoch": 1.8021912940479716, "grad_norm": 1.1321513164832115, "learning_rate": 7.253601329604001e-06, "loss": 0.7162, "step": 3043 }, { "epoch": 1.8027835356825586, "grad_norm": 7.227215013383004, "learning_rate": 7.247452075985622e-06, "loss": 0.7006, "step": 3044 }, { "epoch": 1.8033757773171453, "grad_norm": 0.9690712025929449, "learning_rate": 7.241303948314135e-06, "loss": 0.7178, "step": 3045 }, { "epoch": 1.8039680189517324, "grad_norm": 2.051690522200872, "learning_rate": 7.235156949104455e-06, "loss": 0.6948, "step": 3046 }, { "epoch": 1.8045602605863191, "grad_norm": 0.975264804113868, "learning_rate": 7.22901108087106e-06, "loss": 0.7484, "step": 3047 }, { "epoch": 1.8051525022209063, "grad_norm": 1.286263931105427, "learning_rate": 7.222866346127952e-06, "loss": 0.7124, "step": 3048 }, { "epoch": 1.805744743855493, "grad_norm": 1.5309931798364385, "learning_rate": 7.216722747388678e-06, "loss": 0.7074, "step": 3049 }, { "epoch": 1.80633698549008, "grad_norm": 1.7076934436600835, "learning_rate": 7.210580287166307e-06, "loss": 0.7149, "step": 3050 }, { "epoch": 1.8069292271246669, "grad_norm": 1.1300533995630113, "learning_rate": 7.2044389679734564e-06, "loss": 0.7234, "step": 3051 }, { "epoch": 1.8075214687592538, "grad_norm": 1.0540182120458212, "learning_rate": 7.198298792322271e-06, "loss": 0.7289, "step": 3052 }, { "epoch": 1.8081137103938407, "grad_norm": 1.4987232839307902, "learning_rate": 7.192159762724427e-06, "loss": 0.7248, "step": 3053 }, { "epoch": 1.8087059520284274, "grad_norm": 1.0497678988574621, "learning_rate": 7.186021881691132e-06, "loss": 0.6714, "step": 3054 }, { "epoch": 1.8092981936630146, "grad_norm": 1.0903848686159592, "learning_rate": 7.179885151733124e-06, "loss": 0.6885, "step": 3055 }, { "epoch": 1.8098904352976013, "grad_norm": 1.0133138934590773, "learning_rate": 7.173749575360671e-06, "loss": 0.7397, "step": 3056 }, { "epoch": 1.8104826769321885, "grad_norm": 1.2805524494775062, "learning_rate": 7.167615155083574e-06, "loss": 0.6995, "step": 3057 }, { "epoch": 1.8110749185667752, "grad_norm": 1.353348949222249, "learning_rate": 7.1614818934111475e-06, "loss": 0.704, "step": 3058 }, { "epoch": 1.8116671602013623, "grad_norm": 1.7884004913807032, "learning_rate": 7.155349792852242e-06, "loss": 0.7066, "step": 3059 }, { "epoch": 1.812259401835949, "grad_norm": 1.1372875659844779, "learning_rate": 7.1492188559152364e-06, "loss": 0.7382, "step": 3060 }, { "epoch": 1.812851643470536, "grad_norm": 7.427167503376271, "learning_rate": 7.143089085108028e-06, "loss": 0.7224, "step": 3061 }, { "epoch": 1.8134438851051229, "grad_norm": 1.602444693750489, "learning_rate": 7.136960482938035e-06, "loss": 0.7293, "step": 3062 }, { "epoch": 1.8140361267397098, "grad_norm": 4.355339095528307, "learning_rate": 7.130833051912198e-06, "loss": 0.7455, "step": 3063 }, { "epoch": 1.8146283683742968, "grad_norm": 1.2740183492816513, "learning_rate": 7.124706794536984e-06, "loss": 0.7043, "step": 3064 }, { "epoch": 1.8152206100088835, "grad_norm": 1.0624879073095552, "learning_rate": 7.11858171331838e-06, "loss": 0.7437, "step": 3065 }, { "epoch": 1.8158128516434706, "grad_norm": 1.1249819254474553, "learning_rate": 7.112457810761883e-06, "loss": 0.7151, "step": 3066 }, { "epoch": 1.8164050932780573, "grad_norm": 2.4801786068161737, "learning_rate": 7.106335089372517e-06, "loss": 0.7494, "step": 3067 }, { "epoch": 1.8169973349126445, "grad_norm": 1.7960579111312531, "learning_rate": 7.100213551654816e-06, "loss": 0.6956, "step": 3068 }, { "epoch": 1.8175895765472312, "grad_norm": 1.206786649404624, "learning_rate": 7.0940932001128395e-06, "loss": 0.7599, "step": 3069 }, { "epoch": 1.8181818181818183, "grad_norm": 1.3862079077433727, "learning_rate": 7.087974037250146e-06, "loss": 0.7371, "step": 3070 }, { "epoch": 1.818774059816405, "grad_norm": 1.9789719604629894, "learning_rate": 7.0818560655698246e-06, "loss": 0.7472, "step": 3071 }, { "epoch": 1.819366301450992, "grad_norm": 1.4121044579900446, "learning_rate": 7.075739287574467e-06, "loss": 0.7147, "step": 3072 }, { "epoch": 1.819958543085579, "grad_norm": 2.1510778746002455, "learning_rate": 7.069623705766182e-06, "loss": 0.7472, "step": 3073 }, { "epoch": 1.8205507847201658, "grad_norm": 1.0077829214521765, "learning_rate": 7.063509322646581e-06, "loss": 0.6905, "step": 3074 }, { "epoch": 1.8211430263547528, "grad_norm": 1.3469407014130284, "learning_rate": 7.057396140716796e-06, "loss": 0.7024, "step": 3075 }, { "epoch": 1.8217352679893395, "grad_norm": 1.876760028468839, "learning_rate": 7.051284162477459e-06, "loss": 0.7303, "step": 3076 }, { "epoch": 1.8223275096239266, "grad_norm": 1.049747547083592, "learning_rate": 7.0451733904287166e-06, "loss": 0.7227, "step": 3077 }, { "epoch": 1.8229197512585134, "grad_norm": 1.0058819206164897, "learning_rate": 7.039063827070214e-06, "loss": 0.713, "step": 3078 }, { "epoch": 1.8235119928931005, "grad_norm": 3.6076196015946236, "learning_rate": 7.0329554749011045e-06, "loss": 0.7295, "step": 3079 }, { "epoch": 1.8241042345276872, "grad_norm": 1.0057618249589486, "learning_rate": 7.026848336420053e-06, "loss": 0.7129, "step": 3080 }, { "epoch": 1.8246964761622744, "grad_norm": 1.7386425971569412, "learning_rate": 7.020742414125223e-06, "loss": 0.6903, "step": 3081 }, { "epoch": 1.825288717796861, "grad_norm": 2.473617947321098, "learning_rate": 7.014637710514274e-06, "loss": 0.7285, "step": 3082 }, { "epoch": 1.825880959431448, "grad_norm": 0.9327751534341746, "learning_rate": 7.008534228084376e-06, "loss": 0.7183, "step": 3083 }, { "epoch": 1.826473201066035, "grad_norm": 1.0584729282570624, "learning_rate": 7.002431969332197e-06, "loss": 0.7485, "step": 3084 }, { "epoch": 1.8270654427006219, "grad_norm": 0.9797461090483691, "learning_rate": 6.996330936753907e-06, "loss": 0.7756, "step": 3085 }, { "epoch": 1.8276576843352088, "grad_norm": 1.1221799073488412, "learning_rate": 6.990231132845169e-06, "loss": 0.7499, "step": 3086 }, { "epoch": 1.8282499259697955, "grad_norm": 0.9306758935450702, "learning_rate": 6.984132560101143e-06, "loss": 0.7304, "step": 3087 }, { "epoch": 1.8288421676043827, "grad_norm": 1.0887511525648792, "learning_rate": 6.978035221016487e-06, "loss": 0.7267, "step": 3088 }, { "epoch": 1.8294344092389694, "grad_norm": 1.1273648529558058, "learning_rate": 6.971939118085365e-06, "loss": 0.6752, "step": 3089 }, { "epoch": 1.8300266508735565, "grad_norm": 1.0281268202300238, "learning_rate": 6.965844253801416e-06, "loss": 0.723, "step": 3090 }, { "epoch": 1.8306188925081432, "grad_norm": 1.0970082995953543, "learning_rate": 6.959750630657787e-06, "loss": 0.7901, "step": 3091 }, { "epoch": 1.8312111341427302, "grad_norm": 1.0998649102940627, "learning_rate": 6.953658251147109e-06, "loss": 0.7464, "step": 3092 }, { "epoch": 1.8318033757773171, "grad_norm": 1.6332086008816058, "learning_rate": 6.947567117761517e-06, "loss": 0.7037, "step": 3093 }, { "epoch": 1.832395617411904, "grad_norm": 2.4583445925569114, "learning_rate": 6.941477232992614e-06, "loss": 0.698, "step": 3094 }, { "epoch": 1.832987859046491, "grad_norm": 1.1349251674267553, "learning_rate": 6.935388599331514e-06, "loss": 0.6615, "step": 3095 }, { "epoch": 1.833580100681078, "grad_norm": 1.0921356725030107, "learning_rate": 6.929301219268806e-06, "loss": 0.7015, "step": 3096 }, { "epoch": 1.8341723423156648, "grad_norm": 1.534601523832624, "learning_rate": 6.92321509529458e-06, "loss": 0.6909, "step": 3097 }, { "epoch": 1.8347645839502515, "grad_norm": 1.2237585847130814, "learning_rate": 6.917130229898387e-06, "loss": 0.6804, "step": 3098 }, { "epoch": 1.8353568255848387, "grad_norm": 1.6141007678469457, "learning_rate": 6.911046625569293e-06, "loss": 0.7228, "step": 3099 }, { "epoch": 1.8359490672194254, "grad_norm": 0.9997505945449734, "learning_rate": 6.90496428479583e-06, "loss": 0.7101, "step": 3100 }, { "epoch": 1.8365413088540126, "grad_norm": 1.0339643235502096, "learning_rate": 6.898883210066018e-06, "loss": 0.7141, "step": 3101 }, { "epoch": 1.8371335504885993, "grad_norm": 0.9214598557673855, "learning_rate": 6.892803403867352e-06, "loss": 0.7178, "step": 3102 }, { "epoch": 1.8377257921231862, "grad_norm": 1.2899691433939153, "learning_rate": 6.886724868686823e-06, "loss": 0.7322, "step": 3103 }, { "epoch": 1.8383180337577731, "grad_norm": 1.0878616869188478, "learning_rate": 6.8806476070108905e-06, "loss": 0.7229, "step": 3104 }, { "epoch": 1.83891027539236, "grad_norm": 2.1890307846418455, "learning_rate": 6.874571621325498e-06, "loss": 0.7179, "step": 3105 }, { "epoch": 1.839502517026947, "grad_norm": 1.2042122977243805, "learning_rate": 6.868496914116063e-06, "loss": 0.7208, "step": 3106 }, { "epoch": 1.840094758661534, "grad_norm": 1.1067009260330778, "learning_rate": 6.86242348786748e-06, "loss": 0.7104, "step": 3107 }, { "epoch": 1.8406870002961209, "grad_norm": 1.977612629506757, "learning_rate": 6.856351345064127e-06, "loss": 0.7083, "step": 3108 }, { "epoch": 1.8412792419307076, "grad_norm": 1.183996644590801, "learning_rate": 6.850280488189851e-06, "loss": 0.7136, "step": 3109 }, { "epoch": 1.8418714835652947, "grad_norm": 1.1336099389139767, "learning_rate": 6.844210919727971e-06, "loss": 0.7121, "step": 3110 }, { "epoch": 1.8424637251998814, "grad_norm": 0.9915840493395013, "learning_rate": 6.838142642161283e-06, "loss": 0.7146, "step": 3111 }, { "epoch": 1.8430559668344686, "grad_norm": 1.0672864776594517, "learning_rate": 6.8320756579720545e-06, "loss": 0.7185, "step": 3112 }, { "epoch": 1.8436482084690553, "grad_norm": 1.2611537685575909, "learning_rate": 6.826009969642027e-06, "loss": 0.7399, "step": 3113 }, { "epoch": 1.8442404501036422, "grad_norm": 3.728635542061314, "learning_rate": 6.819945579652401e-06, "loss": 0.7557, "step": 3114 }, { "epoch": 1.8448326917382292, "grad_norm": 1.3284266412732086, "learning_rate": 6.813882490483854e-06, "loss": 0.6961, "step": 3115 }, { "epoch": 1.845424933372816, "grad_norm": 19.468979938411216, "learning_rate": 6.807820704616532e-06, "loss": 0.7411, "step": 3116 }, { "epoch": 1.846017175007403, "grad_norm": 0.8689154585001878, "learning_rate": 6.801760224530052e-06, "loss": 0.7244, "step": 3117 }, { "epoch": 1.84660941664199, "grad_norm": 1.068439800318271, "learning_rate": 6.795701052703482e-06, "loss": 0.7105, "step": 3118 }, { "epoch": 1.847201658276577, "grad_norm": 0.9526703958307156, "learning_rate": 6.7896431916153684e-06, "loss": 0.7437, "step": 3119 }, { "epoch": 1.8477938999111636, "grad_norm": 1.0630490772363779, "learning_rate": 6.783586643743714e-06, "loss": 0.7221, "step": 3120 }, { "epoch": 1.8483861415457508, "grad_norm": 3.5896361583850567, "learning_rate": 6.777531411565996e-06, "loss": 0.7202, "step": 3121 }, { "epoch": 1.8489783831803375, "grad_norm": 1.7343409372636471, "learning_rate": 6.7714774975591335e-06, "loss": 0.7145, "step": 3122 }, { "epoch": 1.8495706248149246, "grad_norm": 1.0396615169001493, "learning_rate": 6.7654249041995256e-06, "loss": 0.7005, "step": 3123 }, { "epoch": 1.8501628664495113, "grad_norm": 1.642370057671462, "learning_rate": 6.75937363396302e-06, "loss": 0.7372, "step": 3124 }, { "epoch": 1.8507551080840983, "grad_norm": 1.1074855634634668, "learning_rate": 6.753323689324931e-06, "loss": 0.7431, "step": 3125 }, { "epoch": 1.8513473497186852, "grad_norm": 0.9324152655129814, "learning_rate": 6.7472750727600155e-06, "loss": 0.7135, "step": 3126 }, { "epoch": 1.8519395913532721, "grad_norm": 0.8914241503209785, "learning_rate": 6.741227786742509e-06, "loss": 0.7181, "step": 3127 }, { "epoch": 1.852531832987859, "grad_norm": 1.1891158801295643, "learning_rate": 6.735181833746087e-06, "loss": 0.6945, "step": 3128 }, { "epoch": 1.853124074622446, "grad_norm": 1.0727481880906515, "learning_rate": 6.729137216243886e-06, "loss": 0.6876, "step": 3129 }, { "epoch": 1.853716316257033, "grad_norm": 1.0797517345200798, "learning_rate": 6.7230939367084915e-06, "loss": 0.7165, "step": 3130 }, { "epoch": 1.8543085578916196, "grad_norm": 1.4893641209790685, "learning_rate": 6.717051997611944e-06, "loss": 0.7433, "step": 3131 }, { "epoch": 1.8549007995262068, "grad_norm": 1.0413984547576909, "learning_rate": 6.711011401425741e-06, "loss": 0.6766, "step": 3132 }, { "epoch": 1.8554930411607935, "grad_norm": 1.1532788341257012, "learning_rate": 6.704972150620825e-06, "loss": 0.6584, "step": 3133 }, { "epoch": 1.8560852827953807, "grad_norm": 1.680364023670173, "learning_rate": 6.698934247667587e-06, "loss": 0.7095, "step": 3134 }, { "epoch": 1.8566775244299674, "grad_norm": 1.5274601112160506, "learning_rate": 6.692897695035866e-06, "loss": 0.6719, "step": 3135 }, { "epoch": 1.8572697660645543, "grad_norm": 1.4344751525084347, "learning_rate": 6.686862495194958e-06, "loss": 0.7164, "step": 3136 }, { "epoch": 1.8578620076991412, "grad_norm": 1.305338228972161, "learning_rate": 6.6808286506135975e-06, "loss": 0.7633, "step": 3137 }, { "epoch": 1.8584542493337282, "grad_norm": 1.1939810781229667, "learning_rate": 6.6747961637599645e-06, "loss": 0.7098, "step": 3138 }, { "epoch": 1.859046490968315, "grad_norm": 1.454501033733436, "learning_rate": 6.668765037101682e-06, "loss": 0.7129, "step": 3139 }, { "epoch": 1.859638732602902, "grad_norm": 0.9270487154278511, "learning_rate": 6.662735273105827e-06, "loss": 0.7298, "step": 3140 }, { "epoch": 1.860230974237489, "grad_norm": 0.887328217763018, "learning_rate": 6.656706874238909e-06, "loss": 0.6998, "step": 3141 }, { "epoch": 1.8608232158720757, "grad_norm": 0.9240842979460729, "learning_rate": 6.650679842966881e-06, "loss": 0.7242, "step": 3142 }, { "epoch": 1.8614154575066628, "grad_norm": 1.4560811731823002, "learning_rate": 6.644654181755139e-06, "loss": 0.7467, "step": 3143 }, { "epoch": 1.8620076991412495, "grad_norm": 1.8642002876762704, "learning_rate": 6.638629893068516e-06, "loss": 0.7094, "step": 3144 }, { "epoch": 1.8625999407758367, "grad_norm": 1.3637576327896082, "learning_rate": 6.632606979371287e-06, "loss": 0.7453, "step": 3145 }, { "epoch": 1.8631921824104234, "grad_norm": 1.318364652725098, "learning_rate": 6.62658544312716e-06, "loss": 0.6683, "step": 3146 }, { "epoch": 1.8637844240450103, "grad_norm": 1.457651252213059, "learning_rate": 6.620565286799283e-06, "loss": 0.7037, "step": 3147 }, { "epoch": 1.8643766656795973, "grad_norm": 1.7273054218815638, "learning_rate": 6.614546512850237e-06, "loss": 0.7301, "step": 3148 }, { "epoch": 1.8649689073141842, "grad_norm": 1.2049671686395615, "learning_rate": 6.608529123742047e-06, "loss": 0.728, "step": 3149 }, { "epoch": 1.8655611489487711, "grad_norm": 1.17602916283294, "learning_rate": 6.6025131219361505e-06, "loss": 0.7274, "step": 3150 }, { "epoch": 1.866153390583358, "grad_norm": 2.03746704987476, "learning_rate": 6.596498509893438e-06, "loss": 0.7063, "step": 3151 }, { "epoch": 1.866745632217945, "grad_norm": 1.1900210213393627, "learning_rate": 6.590485290074224e-06, "loss": 0.721, "step": 3152 }, { "epoch": 1.8673378738525317, "grad_norm": 2.1534721717971452, "learning_rate": 6.584473464938257e-06, "loss": 0.7324, "step": 3153 }, { "epoch": 1.8679301154871188, "grad_norm": 2.9498552630407606, "learning_rate": 6.5784630369447e-06, "loss": 0.7615, "step": 3154 }, { "epoch": 1.8685223571217056, "grad_norm": 1.2185800168618288, "learning_rate": 6.572454008552166e-06, "loss": 0.7466, "step": 3155 }, { "epoch": 1.8691145987562927, "grad_norm": 1.1826866438599948, "learning_rate": 6.566446382218683e-06, "loss": 0.745, "step": 3156 }, { "epoch": 1.8697068403908794, "grad_norm": 1.6999039368124431, "learning_rate": 6.5604401604017095e-06, "loss": 0.6985, "step": 3157 }, { "epoch": 1.8702990820254664, "grad_norm": 1.244053260256039, "learning_rate": 6.5544353455581245e-06, "loss": 0.6739, "step": 3158 }, { "epoch": 1.8708913236600533, "grad_norm": 1.718673210214095, "learning_rate": 6.5484319401442346e-06, "loss": 0.7268, "step": 3159 }, { "epoch": 1.8714835652946402, "grad_norm": 1.9410211268230035, "learning_rate": 6.542429946615774e-06, "loss": 0.7235, "step": 3160 }, { "epoch": 1.8720758069292271, "grad_norm": 1.1807820843110362, "learning_rate": 6.536429367427896e-06, "loss": 0.7566, "step": 3161 }, { "epoch": 1.872668048563814, "grad_norm": 1.4651639003519918, "learning_rate": 6.53043020503517e-06, "loss": 0.7173, "step": 3162 }, { "epoch": 1.873260290198401, "grad_norm": 2.002968615540737, "learning_rate": 6.5244324618915925e-06, "loss": 0.7393, "step": 3163 }, { "epoch": 1.8738525318329877, "grad_norm": 1.1224074176105243, "learning_rate": 6.5184361404505795e-06, "loss": 0.7074, "step": 3164 }, { "epoch": 1.8744447734675749, "grad_norm": 1.5396485670035198, "learning_rate": 6.512441243164967e-06, "loss": 0.7426, "step": 3165 }, { "epoch": 1.8750370151021616, "grad_norm": 1.1447433665685038, "learning_rate": 6.506447772486997e-06, "loss": 0.7526, "step": 3166 }, { "epoch": 1.8756292567367487, "grad_norm": 1.943211358510783, "learning_rate": 6.50045573086834e-06, "loss": 0.7328, "step": 3167 }, { "epoch": 1.8762214983713354, "grad_norm": 1.8487347809196877, "learning_rate": 6.4944651207600765e-06, "loss": 0.7435, "step": 3168 }, { "epoch": 1.8768137400059224, "grad_norm": 0.9270621419455923, "learning_rate": 6.488475944612709e-06, "loss": 0.7267, "step": 3169 }, { "epoch": 1.8774059816405093, "grad_norm": 1.4497042883892517, "learning_rate": 6.4824882048761406e-06, "loss": 0.7384, "step": 3170 }, { "epoch": 1.8779982232750962, "grad_norm": 1.7243801348273786, "learning_rate": 6.476501903999695e-06, "loss": 0.7276, "step": 3171 }, { "epoch": 1.8785904649096832, "grad_norm": 2.212944195283541, "learning_rate": 6.470517044432104e-06, "loss": 0.7246, "step": 3172 }, { "epoch": 1.87918270654427, "grad_norm": 2.9708159494514095, "learning_rate": 6.464533628621521e-06, "loss": 0.7134, "step": 3173 }, { "epoch": 1.879774948178857, "grad_norm": 1.1598634102463192, "learning_rate": 6.458551659015486e-06, "loss": 0.6919, "step": 3174 }, { "epoch": 1.8803671898134438, "grad_norm": 1.0582572327884274, "learning_rate": 6.452571138060971e-06, "loss": 0.7249, "step": 3175 }, { "epoch": 1.880959431448031, "grad_norm": 1.2040138425826092, "learning_rate": 6.446592068204341e-06, "loss": 0.7295, "step": 3176 }, { "epoch": 1.8815516730826176, "grad_norm": 2.157200215176374, "learning_rate": 6.440614451891376e-06, "loss": 0.7359, "step": 3177 }, { "epoch": 1.8821439147172048, "grad_norm": 1.6977745304387328, "learning_rate": 6.43463829156725e-06, "loss": 0.7341, "step": 3178 }, { "epoch": 1.8827361563517915, "grad_norm": 1.5684451007006281, "learning_rate": 6.428663589676554e-06, "loss": 0.72, "step": 3179 }, { "epoch": 1.8833283979863784, "grad_norm": 1.1271225922581531, "learning_rate": 6.422690348663276e-06, "loss": 0.7196, "step": 3180 }, { "epoch": 1.8839206396209653, "grad_norm": 1.8620701322408784, "learning_rate": 6.41671857097081e-06, "loss": 0.6995, "step": 3181 }, { "epoch": 1.8845128812555523, "grad_norm": 1.7096834779094625, "learning_rate": 6.410748259041941e-06, "loss": 0.7097, "step": 3182 }, { "epoch": 1.8851051228901392, "grad_norm": 1.4823000392501897, "learning_rate": 6.40477941531887e-06, "loss": 0.7037, "step": 3183 }, { "epoch": 1.8856973645247261, "grad_norm": 1.226315830076985, "learning_rate": 6.398812042243187e-06, "loss": 0.728, "step": 3184 }, { "epoch": 1.886289606159313, "grad_norm": 0.9904003037669937, "learning_rate": 6.392846142255886e-06, "loss": 0.6943, "step": 3185 }, { "epoch": 1.8868818477938998, "grad_norm": 1.2611406463421255, "learning_rate": 6.3868817177973505e-06, "loss": 0.7045, "step": 3186 }, { "epoch": 1.887474089428487, "grad_norm": 1.4663012894424847, "learning_rate": 6.380918771307365e-06, "loss": 0.6833, "step": 3187 }, { "epoch": 1.8880663310630736, "grad_norm": 1.3127393848518931, "learning_rate": 6.3749573052251155e-06, "loss": 0.7176, "step": 3188 }, { "epoch": 1.8886585726976608, "grad_norm": 1.4212019430587715, "learning_rate": 6.368997321989176e-06, "loss": 0.7226, "step": 3189 }, { "epoch": 1.8892508143322475, "grad_norm": 1.2265922099963948, "learning_rate": 6.363038824037511e-06, "loss": 0.7691, "step": 3190 }, { "epoch": 1.8898430559668344, "grad_norm": 2.4233051026500196, "learning_rate": 6.357081813807482e-06, "loss": 0.736, "step": 3191 }, { "epoch": 1.8904352976014214, "grad_norm": 1.3155312645080448, "learning_rate": 6.351126293735843e-06, "loss": 0.7628, "step": 3192 }, { "epoch": 1.8910275392360083, "grad_norm": 1.411204552132623, "learning_rate": 6.345172266258739e-06, "loss": 0.7293, "step": 3193 }, { "epoch": 1.8916197808705952, "grad_norm": 1.5791438885255882, "learning_rate": 6.339219733811697e-06, "loss": 0.6975, "step": 3194 }, { "epoch": 1.8922120225051822, "grad_norm": 1.4220630376908199, "learning_rate": 6.333268698829639e-06, "loss": 0.7482, "step": 3195 }, { "epoch": 1.892804264139769, "grad_norm": 1.3250303025534889, "learning_rate": 6.327319163746871e-06, "loss": 0.7284, "step": 3196 }, { "epoch": 1.8933965057743558, "grad_norm": 1.0527611253118723, "learning_rate": 6.321371130997095e-06, "loss": 0.6912, "step": 3197 }, { "epoch": 1.893988747408943, "grad_norm": 1.1033965769900478, "learning_rate": 6.315424603013382e-06, "loss": 0.7213, "step": 3198 }, { "epoch": 1.8945809890435297, "grad_norm": 1.2063474598713093, "learning_rate": 6.309479582228202e-06, "loss": 0.7157, "step": 3199 }, { "epoch": 1.8951732306781168, "grad_norm": 1.269742008185876, "learning_rate": 6.303536071073397e-06, "loss": 0.7374, "step": 3200 }, { "epoch": 1.8957654723127035, "grad_norm": 1.8184812717508338, "learning_rate": 6.297594071980208e-06, "loss": 0.6878, "step": 3201 }, { "epoch": 1.8963577139472905, "grad_norm": 1.511960235444543, "learning_rate": 6.291653587379236e-06, "loss": 0.7505, "step": 3202 }, { "epoch": 1.8969499555818774, "grad_norm": 1.2751220645522783, "learning_rate": 6.2857146197004755e-06, "loss": 0.8061, "step": 3203 }, { "epoch": 1.8975421972164643, "grad_norm": 1.1058079431721495, "learning_rate": 6.2797771713733025e-06, "loss": 0.7353, "step": 3204 }, { "epoch": 1.8981344388510513, "grad_norm": 0.9715354260971804, "learning_rate": 6.273841244826466e-06, "loss": 0.7067, "step": 3205 }, { "epoch": 1.898726680485638, "grad_norm": 1.3513816230607942, "learning_rate": 6.267906842488088e-06, "loss": 0.7182, "step": 3206 }, { "epoch": 1.8993189221202251, "grad_norm": 2.4611133660776714, "learning_rate": 6.261973966785679e-06, "loss": 0.7578, "step": 3207 }, { "epoch": 1.8999111637548118, "grad_norm": 3.4200966306498968, "learning_rate": 6.256042620146119e-06, "loss": 0.7041, "step": 3208 }, { "epoch": 1.900503405389399, "grad_norm": 1.0636075328664951, "learning_rate": 6.250112804995661e-06, "loss": 0.7919, "step": 3209 }, { "epoch": 1.9010956470239857, "grad_norm": 1.1252808954511786, "learning_rate": 6.2441845237599285e-06, "loss": 0.7849, "step": 3210 }, { "epoch": 1.9016878886585729, "grad_norm": 0.9806450437275593, "learning_rate": 6.238257778863925e-06, "loss": 0.7062, "step": 3211 }, { "epoch": 1.9022801302931596, "grad_norm": 1.6392036224674011, "learning_rate": 6.232332572732025e-06, "loss": 0.7196, "step": 3212 }, { "epoch": 1.9028723719277465, "grad_norm": 1.5302573409478557, "learning_rate": 6.226408907787971e-06, "loss": 0.7276, "step": 3213 }, { "epoch": 1.9034646135623334, "grad_norm": 3.974467334315002, "learning_rate": 6.22048678645487e-06, "loss": 0.6877, "step": 3214 }, { "epoch": 1.9040568551969204, "grad_norm": 1.2222534462798487, "learning_rate": 6.2145662111552045e-06, "loss": 0.7145, "step": 3215 }, { "epoch": 1.9046490968315073, "grad_norm": 0.8412923141136364, "learning_rate": 6.208647184310826e-06, "loss": 0.7067, "step": 3216 }, { "epoch": 1.905241338466094, "grad_norm": 1.8567585845983157, "learning_rate": 6.202729708342948e-06, "loss": 0.7041, "step": 3217 }, { "epoch": 1.9058335801006812, "grad_norm": 1.3812265601103704, "learning_rate": 6.196813785672149e-06, "loss": 0.7327, "step": 3218 }, { "epoch": 1.9064258217352679, "grad_norm": 1.490042310350291, "learning_rate": 6.1908994187183726e-06, "loss": 0.7277, "step": 3219 }, { "epoch": 1.907018063369855, "grad_norm": 1.954642417359045, "learning_rate": 6.184986609900934e-06, "loss": 0.6789, "step": 3220 }, { "epoch": 1.9076103050044417, "grad_norm": 1.837812310692102, "learning_rate": 6.1790753616385e-06, "loss": 0.7116, "step": 3221 }, { "epoch": 1.9082025466390289, "grad_norm": 1.352234369694832, "learning_rate": 6.173165676349103e-06, "loss": 0.6998, "step": 3222 }, { "epoch": 1.9087947882736156, "grad_norm": 1.353210285362207, "learning_rate": 6.167257556450139e-06, "loss": 0.7536, "step": 3223 }, { "epoch": 1.9093870299082025, "grad_norm": 2.00212211868531, "learning_rate": 6.16135100435836e-06, "loss": 0.6977, "step": 3224 }, { "epoch": 1.9099792715427895, "grad_norm": 1.5184121891582234, "learning_rate": 6.155446022489877e-06, "loss": 0.7228, "step": 3225 }, { "epoch": 1.9105715131773764, "grad_norm": 1.7192454857712378, "learning_rate": 6.149542613260157e-06, "loss": 0.7029, "step": 3226 }, { "epoch": 1.9111637548119633, "grad_norm": 0.948075216205065, "learning_rate": 6.143640779084035e-06, "loss": 0.6953, "step": 3227 }, { "epoch": 1.91175599644655, "grad_norm": 1.6132188424013671, "learning_rate": 6.137740522375687e-06, "loss": 0.7213, "step": 3228 }, { "epoch": 1.9123482380811372, "grad_norm": 3.072529086497967, "learning_rate": 6.13184184554865e-06, "loss": 0.7483, "step": 3229 }, { "epoch": 1.912940479715724, "grad_norm": 2.1449484779319925, "learning_rate": 6.1259447510158136e-06, "loss": 0.7394, "step": 3230 }, { "epoch": 1.913532721350311, "grad_norm": 1.2064970456317576, "learning_rate": 6.120049241189423e-06, "loss": 0.7599, "step": 3231 }, { "epoch": 1.9141249629848978, "grad_norm": 1.5178757032575456, "learning_rate": 6.114155318481076e-06, "loss": 0.7253, "step": 3232 }, { "epoch": 1.914717204619485, "grad_norm": 1.7616627788897157, "learning_rate": 6.108262985301714e-06, "loss": 0.713, "step": 3233 }, { "epoch": 1.9153094462540716, "grad_norm": 1.5277812357867129, "learning_rate": 6.102372244061631e-06, "loss": 0.7322, "step": 3234 }, { "epoch": 1.9159016878886586, "grad_norm": 1.241728693166342, "learning_rate": 6.0964830971704755e-06, "loss": 0.7061, "step": 3235 }, { "epoch": 1.9164939295232455, "grad_norm": 8.016340143006273, "learning_rate": 6.090595547037242e-06, "loss": 0.7629, "step": 3236 }, { "epoch": 1.9170861711578324, "grad_norm": 3.098312029179306, "learning_rate": 6.084709596070264e-06, "loss": 0.7099, "step": 3237 }, { "epoch": 1.9176784127924194, "grad_norm": 1.570777816333163, "learning_rate": 6.078825246677229e-06, "loss": 0.6777, "step": 3238 }, { "epoch": 1.918270654427006, "grad_norm": 1.1666760383543127, "learning_rate": 6.072942501265164e-06, "loss": 0.7347, "step": 3239 }, { "epoch": 1.9188628960615932, "grad_norm": 1.1072653690074732, "learning_rate": 6.06706136224045e-06, "loss": 0.7289, "step": 3240 }, { "epoch": 1.91945513769618, "grad_norm": 1.534029814565199, "learning_rate": 6.061181832008795e-06, "loss": 0.7361, "step": 3241 }, { "epoch": 1.920047379330767, "grad_norm": 11.474053636259848, "learning_rate": 6.055303912975261e-06, "loss": 0.7246, "step": 3242 }, { "epoch": 1.9206396209653538, "grad_norm": 1.1182633924025185, "learning_rate": 6.049427607544247e-06, "loss": 0.716, "step": 3243 }, { "epoch": 1.9212318625999407, "grad_norm": 1.7977091796689737, "learning_rate": 6.0435529181195e-06, "loss": 0.691, "step": 3244 }, { "epoch": 1.9218241042345277, "grad_norm": 1.2523074350306502, "learning_rate": 6.0376798471040835e-06, "loss": 0.7398, "step": 3245 }, { "epoch": 1.9224163458691146, "grad_norm": 1.379470202336427, "learning_rate": 6.031808396900422e-06, "loss": 0.7037, "step": 3246 }, { "epoch": 1.9230085875037015, "grad_norm": 1.1064953536693023, "learning_rate": 6.025938569910271e-06, "loss": 0.7371, "step": 3247 }, { "epoch": 1.9236008291382884, "grad_norm": 1.3352312831811772, "learning_rate": 6.020070368534719e-06, "loss": 0.7153, "step": 3248 }, { "epoch": 1.9241930707728754, "grad_norm": 6.84844144864244, "learning_rate": 6.0142037951741824e-06, "loss": 0.6729, "step": 3249 }, { "epoch": 1.924785312407462, "grad_norm": 1.2959405118553768, "learning_rate": 6.00833885222843e-06, "loss": 0.6972, "step": 3250 }, { "epoch": 1.9253775540420492, "grad_norm": 1.6921129797399264, "learning_rate": 6.002475542096548e-06, "loss": 0.7132, "step": 3251 }, { "epoch": 1.925969795676636, "grad_norm": 1.899108534796794, "learning_rate": 5.996613867176964e-06, "loss": 0.7072, "step": 3252 }, { "epoch": 1.926562037311223, "grad_norm": 1.6781426823646384, "learning_rate": 5.9907538298674265e-06, "loss": 0.7478, "step": 3253 }, { "epoch": 1.9271542789458098, "grad_norm": 1.3313919012515292, "learning_rate": 5.984895432565022e-06, "loss": 0.7394, "step": 3254 }, { "epoch": 1.9277465205803967, "grad_norm": 1.556101241994347, "learning_rate": 5.979038677666167e-06, "loss": 0.7304, "step": 3255 }, { "epoch": 1.9283387622149837, "grad_norm": 1.434032373329083, "learning_rate": 5.973183567566605e-06, "loss": 0.7178, "step": 3256 }, { "epoch": 1.9289310038495706, "grad_norm": 2.078218024535206, "learning_rate": 5.967330104661402e-06, "loss": 0.6953, "step": 3257 }, { "epoch": 1.9295232454841575, "grad_norm": 1.9496636146502733, "learning_rate": 5.96147829134495e-06, "loss": 0.6895, "step": 3258 }, { "epoch": 1.9301154871187445, "grad_norm": 1.7924248358414332, "learning_rate": 5.955628130010977e-06, "loss": 0.7637, "step": 3259 }, { "epoch": 1.9307077287533314, "grad_norm": 1.6764468299545592, "learning_rate": 5.949779623052526e-06, "loss": 0.6733, "step": 3260 }, { "epoch": 1.9312999703879181, "grad_norm": 1.3373116114928854, "learning_rate": 5.9439327728619634e-06, "loss": 0.6799, "step": 3261 }, { "epoch": 1.9318922120225053, "grad_norm": 1.6119800507160023, "learning_rate": 5.9380875818309805e-06, "loss": 0.6576, "step": 3262 }, { "epoch": 1.932484453657092, "grad_norm": 2.318030452874941, "learning_rate": 5.932244052350585e-06, "loss": 0.6927, "step": 3263 }, { "epoch": 1.9330766952916791, "grad_norm": 1.498808583130641, "learning_rate": 5.926402186811118e-06, "loss": 0.6914, "step": 3264 }, { "epoch": 1.9336689369262658, "grad_norm": 1.1247024199131759, "learning_rate": 5.920561987602224e-06, "loss": 0.721, "step": 3265 }, { "epoch": 1.9342611785608528, "grad_norm": 2.2083931482923993, "learning_rate": 5.914723457112877e-06, "loss": 0.6962, "step": 3266 }, { "epoch": 1.9348534201954397, "grad_norm": 2.703573235446084, "learning_rate": 5.908886597731358e-06, "loss": 0.6997, "step": 3267 }, { "epoch": 1.9354456618300266, "grad_norm": 1.4305912748643144, "learning_rate": 5.903051411845282e-06, "loss": 0.7246, "step": 3268 }, { "epoch": 1.9360379034646136, "grad_norm": 1.4530135063532168, "learning_rate": 5.897217901841554e-06, "loss": 0.6948, "step": 3269 }, { "epoch": 1.9366301450992005, "grad_norm": 1.4638908118569964, "learning_rate": 5.8913860701064175e-06, "loss": 0.7293, "step": 3270 }, { "epoch": 1.9372223867337874, "grad_norm": 1.3397680960176233, "learning_rate": 5.885555919025414e-06, "loss": 0.7114, "step": 3271 }, { "epoch": 1.9378146283683741, "grad_norm": 1.4925051038532378, "learning_rate": 5.879727450983412e-06, "loss": 0.6961, "step": 3272 }, { "epoch": 1.9384068700029613, "grad_norm": 1.2079008891186787, "learning_rate": 5.873900668364572e-06, "loss": 0.7337, "step": 3273 }, { "epoch": 1.938999111637548, "grad_norm": 1.1990174795782194, "learning_rate": 5.868075573552383e-06, "loss": 0.7555, "step": 3274 }, { "epoch": 1.9395913532721352, "grad_norm": 1.473905606977259, "learning_rate": 5.862252168929632e-06, "loss": 0.7142, "step": 3275 }, { "epoch": 1.9401835949067219, "grad_norm": 2.1885900149709694, "learning_rate": 5.856430456878424e-06, "loss": 0.7396, "step": 3276 }, { "epoch": 1.9407758365413088, "grad_norm": 1.9165088813619322, "learning_rate": 5.850610439780158e-06, "loss": 0.7055, "step": 3277 }, { "epoch": 1.9413680781758957, "grad_norm": 1.1598467306680342, "learning_rate": 5.844792120015556e-06, "loss": 0.7183, "step": 3278 }, { "epoch": 1.9419603198104827, "grad_norm": 1.4065104988302588, "learning_rate": 5.838975499964636e-06, "loss": 0.6838, "step": 3279 }, { "epoch": 1.9425525614450696, "grad_norm": 1.3241034015935482, "learning_rate": 5.833160582006722e-06, "loss": 0.7029, "step": 3280 }, { "epoch": 1.9431448030796565, "grad_norm": 1.5431286389019339, "learning_rate": 5.827347368520444e-06, "loss": 0.7218, "step": 3281 }, { "epoch": 1.9437370447142435, "grad_norm": 1.4239447287632976, "learning_rate": 5.821535861883729e-06, "loss": 0.7142, "step": 3282 }, { "epoch": 1.9443292863488302, "grad_norm": 1.4592705812164701, "learning_rate": 5.815726064473812e-06, "loss": 0.7701, "step": 3283 }, { "epoch": 1.9449215279834173, "grad_norm": 1.449449394771617, "learning_rate": 5.8099179786672365e-06, "loss": 0.7341, "step": 3284 }, { "epoch": 1.945513769618004, "grad_norm": 1.131051802200009, "learning_rate": 5.80411160683982e-06, "loss": 0.7324, "step": 3285 }, { "epoch": 1.9461060112525912, "grad_norm": 1.2255859547243508, "learning_rate": 5.798306951366701e-06, "loss": 0.7226, "step": 3286 }, { "epoch": 1.946698252887178, "grad_norm": 1.8162184207193348, "learning_rate": 5.7925040146223155e-06, "loss": 0.6954, "step": 3287 }, { "epoch": 1.9472904945217648, "grad_norm": 1.2603648716863414, "learning_rate": 5.786702798980388e-06, "loss": 0.7035, "step": 3288 }, { "epoch": 1.9478827361563518, "grad_norm": 2.0203946682183487, "learning_rate": 5.780903306813937e-06, "loss": 0.696, "step": 3289 }, { "epoch": 1.9484749777909387, "grad_norm": 2.638924315398722, "learning_rate": 5.775105540495284e-06, "loss": 0.6752, "step": 3290 }, { "epoch": 1.9490672194255256, "grad_norm": 4.143046528582363, "learning_rate": 5.769309502396046e-06, "loss": 0.7165, "step": 3291 }, { "epoch": 1.9496594610601126, "grad_norm": 1.9906732930148696, "learning_rate": 5.763515194887126e-06, "loss": 0.7242, "step": 3292 }, { "epoch": 1.9502517026946995, "grad_norm": 1.5769279032518237, "learning_rate": 5.757722620338715e-06, "loss": 0.7076, "step": 3293 }, { "epoch": 1.9508439443292862, "grad_norm": 1.4341838760864, "learning_rate": 5.751931781120308e-06, "loss": 0.7051, "step": 3294 }, { "epoch": 1.9514361859638734, "grad_norm": 1.4434054947134414, "learning_rate": 5.746142679600687e-06, "loss": 0.7102, "step": 3295 }, { "epoch": 1.95202842759846, "grad_norm": 2.0084791627967924, "learning_rate": 5.740355318147916e-06, "loss": 0.7052, "step": 3296 }, { "epoch": 1.9526206692330472, "grad_norm": 1.3474318765956996, "learning_rate": 5.734569699129347e-06, "loss": 0.7527, "step": 3297 }, { "epoch": 1.953212910867634, "grad_norm": 1.9311183919130495, "learning_rate": 5.728785824911627e-06, "loss": 0.7138, "step": 3298 }, { "epoch": 1.9538051525022209, "grad_norm": 1.8418123316642743, "learning_rate": 5.723003697860692e-06, "loss": 0.6989, "step": 3299 }, { "epoch": 1.9543973941368078, "grad_norm": 1.8187532795120556, "learning_rate": 5.717223320341751e-06, "loss": 0.6974, "step": 3300 }, { "epoch": 1.9549896357713947, "grad_norm": 1.355389837624177, "learning_rate": 5.711444694719299e-06, "loss": 0.7415, "step": 3301 }, { "epoch": 1.9555818774059817, "grad_norm": 1.4523243460694153, "learning_rate": 5.70566782335713e-06, "loss": 0.7046, "step": 3302 }, { "epoch": 1.9561741190405686, "grad_norm": 1.3091809995414065, "learning_rate": 5.699892708618297e-06, "loss": 0.7168, "step": 3303 }, { "epoch": 1.9567663606751555, "grad_norm": 2.142139867997893, "learning_rate": 5.69411935286516e-06, "loss": 0.7549, "step": 3304 }, { "epoch": 1.9573586023097422, "grad_norm": 2.5024110696824455, "learning_rate": 5.6883477584593325e-06, "loss": 0.7202, "step": 3305 }, { "epoch": 1.9579508439443294, "grad_norm": 2.2513293693234813, "learning_rate": 5.682577927761732e-06, "loss": 0.7172, "step": 3306 }, { "epoch": 1.958543085578916, "grad_norm": 1.2507660474168645, "learning_rate": 5.676809863132537e-06, "loss": 0.711, "step": 3307 }, { "epoch": 1.9591353272135033, "grad_norm": 1.2539660203271004, "learning_rate": 5.671043566931216e-06, "loss": 0.7007, "step": 3308 }, { "epoch": 1.95972756884809, "grad_norm": 1.9143243645347823, "learning_rate": 5.665279041516501e-06, "loss": 0.7193, "step": 3309 }, { "epoch": 1.960319810482677, "grad_norm": 6.37124555621677, "learning_rate": 5.659516289246414e-06, "loss": 0.7501, "step": 3310 }, { "epoch": 1.9609120521172638, "grad_norm": 1.4767774435352106, "learning_rate": 5.6537553124782395e-06, "loss": 0.7479, "step": 3311 }, { "epoch": 1.9615042937518508, "grad_norm": 1.6017773415211547, "learning_rate": 5.647996113568547e-06, "loss": 0.7181, "step": 3312 }, { "epoch": 1.9620965353864377, "grad_norm": 2.1133632644301747, "learning_rate": 5.642238694873165e-06, "loss": 0.7301, "step": 3313 }, { "epoch": 1.9626887770210246, "grad_norm": 1.4351193282078467, "learning_rate": 5.636483058747209e-06, "loss": 0.6863, "step": 3314 }, { "epoch": 1.9632810186556116, "grad_norm": 1.7463182715444514, "learning_rate": 5.63072920754505e-06, "loss": 0.6823, "step": 3315 }, { "epoch": 1.9638732602901983, "grad_norm": 1.9641501791574498, "learning_rate": 5.624977143620347e-06, "loss": 0.7183, "step": 3316 }, { "epoch": 1.9644655019247854, "grad_norm": 1.5984745729773937, "learning_rate": 5.619226869326006e-06, "loss": 0.7001, "step": 3317 }, { "epoch": 1.9650577435593721, "grad_norm": 1.504200180863183, "learning_rate": 5.613478387014223e-06, "loss": 0.7457, "step": 3318 }, { "epoch": 1.9656499851939593, "grad_norm": 1.165932682338627, "learning_rate": 5.6077316990364415e-06, "loss": 0.7008, "step": 3319 }, { "epoch": 1.966242226828546, "grad_norm": 1.2741156158971194, "learning_rate": 5.601986807743388e-06, "loss": 0.724, "step": 3320 }, { "epoch": 1.966834468463133, "grad_norm": 1.376216878070863, "learning_rate": 5.5962437154850434e-06, "loss": 0.738, "step": 3321 }, { "epoch": 1.9674267100977199, "grad_norm": 2.0904006798570567, "learning_rate": 5.5905024246106485e-06, "loss": 0.7218, "step": 3322 }, { "epoch": 1.9680189517323068, "grad_norm": 1.0737572438567344, "learning_rate": 5.584762937468722e-06, "loss": 0.7327, "step": 3323 }, { "epoch": 1.9686111933668937, "grad_norm": 2.387906025872129, "learning_rate": 5.579025256407038e-06, "loss": 0.7739, "step": 3324 }, { "epoch": 1.9692034350014807, "grad_norm": 4.887193226453433, "learning_rate": 5.573289383772628e-06, "loss": 0.7045, "step": 3325 }, { "epoch": 1.9697956766360676, "grad_norm": 1.2124274832546234, "learning_rate": 5.567555321911782e-06, "loss": 0.6857, "step": 3326 }, { "epoch": 1.9703879182706543, "grad_norm": 1.2868902005451346, "learning_rate": 5.561823073170056e-06, "loss": 0.7062, "step": 3327 }, { "epoch": 1.9709801599052414, "grad_norm": 1.4155299558301717, "learning_rate": 5.55609263989227e-06, "loss": 0.6972, "step": 3328 }, { "epoch": 1.9715724015398282, "grad_norm": 1.990767811294212, "learning_rate": 5.550364024422486e-06, "loss": 0.742, "step": 3329 }, { "epoch": 1.9721646431744153, "grad_norm": 1.2244919516098602, "learning_rate": 5.544637229104027e-06, "loss": 0.7393, "step": 3330 }, { "epoch": 1.972756884809002, "grad_norm": 1.5478990157497663, "learning_rate": 5.538912256279479e-06, "loss": 0.7008, "step": 3331 }, { "epoch": 1.973349126443589, "grad_norm": 1.248386021207724, "learning_rate": 5.533189108290682e-06, "loss": 0.708, "step": 3332 }, { "epoch": 1.9739413680781759, "grad_norm": 1.1494240153147253, "learning_rate": 5.5274677874787195e-06, "loss": 0.7127, "step": 3333 }, { "epoch": 1.9745336097127628, "grad_norm": 2.0642350332580097, "learning_rate": 5.5217482961839305e-06, "loss": 0.7059, "step": 3334 }, { "epoch": 1.9751258513473497, "grad_norm": 1.4089725977182308, "learning_rate": 5.516030636745914e-06, "loss": 0.6773, "step": 3335 }, { "epoch": 1.9757180929819367, "grad_norm": 1.9085909255822204, "learning_rate": 5.51031481150352e-06, "loss": 0.7134, "step": 3336 }, { "epoch": 1.9763103346165236, "grad_norm": 2.0113642440749877, "learning_rate": 5.504600822794827e-06, "loss": 0.7131, "step": 3337 }, { "epoch": 1.9769025762511103, "grad_norm": 1.0613716617183546, "learning_rate": 5.498888672957187e-06, "loss": 0.7234, "step": 3338 }, { "epoch": 1.9774948178856975, "grad_norm": 1.9231732798281154, "learning_rate": 5.493178364327192e-06, "loss": 0.7543, "step": 3339 }, { "epoch": 1.9780870595202842, "grad_norm": 1.2516392211577612, "learning_rate": 5.487469899240678e-06, "loss": 0.6811, "step": 3340 }, { "epoch": 1.9786793011548713, "grad_norm": 1.767242904928232, "learning_rate": 5.481763280032722e-06, "loss": 0.7349, "step": 3341 }, { "epoch": 1.979271542789458, "grad_norm": 1.6839167459983475, "learning_rate": 5.476058509037658e-06, "loss": 0.7343, "step": 3342 }, { "epoch": 1.979863784424045, "grad_norm": 2.2075665620689353, "learning_rate": 5.470355588589063e-06, "loss": 0.7268, "step": 3343 }, { "epoch": 1.980456026058632, "grad_norm": 2.8495458199508636, "learning_rate": 5.4646545210197435e-06, "loss": 0.761, "step": 3344 }, { "epoch": 1.9810482676932188, "grad_norm": 1.7526127261012925, "learning_rate": 5.458955308661758e-06, "loss": 0.7317, "step": 3345 }, { "epoch": 1.9816405093278058, "grad_norm": 1.953204597527354, "learning_rate": 5.453257953846405e-06, "loss": 0.7254, "step": 3346 }, { "epoch": 1.9822327509623927, "grad_norm": 2.6955617878160893, "learning_rate": 5.447562458904227e-06, "loss": 0.7285, "step": 3347 }, { "epoch": 1.9828249925969796, "grad_norm": 1.6118188469147372, "learning_rate": 5.441868826165002e-06, "loss": 0.7359, "step": 3348 }, { "epoch": 1.9834172342315664, "grad_norm": 1.9770522714718386, "learning_rate": 5.436177057957739e-06, "loss": 0.7308, "step": 3349 }, { "epoch": 1.9840094758661535, "grad_norm": 1.629247494410455, "learning_rate": 5.430487156610695e-06, "loss": 0.7142, "step": 3350 }, { "epoch": 1.9846017175007402, "grad_norm": 1.2839049398259734, "learning_rate": 5.4247991244513635e-06, "loss": 0.7203, "step": 3351 }, { "epoch": 1.9851939591353274, "grad_norm": 1.864717196615187, "learning_rate": 5.419112963806468e-06, "loss": 0.6983, "step": 3352 }, { "epoch": 1.985786200769914, "grad_norm": 1.4608471929423679, "learning_rate": 5.4134286770019595e-06, "loss": 0.6954, "step": 3353 }, { "epoch": 1.986378442404501, "grad_norm": 2.368513304613487, "learning_rate": 5.407746266363039e-06, "loss": 0.7199, "step": 3354 }, { "epoch": 1.986970684039088, "grad_norm": 2.3133080313204375, "learning_rate": 5.402065734214135e-06, "loss": 0.7527, "step": 3355 }, { "epoch": 1.9875629256736749, "grad_norm": 1.236304503675414, "learning_rate": 5.3963870828788975e-06, "loss": 0.6784, "step": 3356 }, { "epoch": 1.9881551673082618, "grad_norm": 1.6483424083328333, "learning_rate": 5.390710314680214e-06, "loss": 0.7044, "step": 3357 }, { "epoch": 1.9887474089428485, "grad_norm": 6.5521490398349735, "learning_rate": 5.3850354319402095e-06, "loss": 0.6524, "step": 3358 }, { "epoch": 1.9893396505774357, "grad_norm": 1.2554308783934172, "learning_rate": 5.379362436980222e-06, "loss": 0.7141, "step": 3359 }, { "epoch": 1.9899318922120224, "grad_norm": 1.8973904162055975, "learning_rate": 5.373691332120832e-06, "loss": 0.71, "step": 3360 }, { "epoch": 1.9905241338466095, "grad_norm": 1.3579617035026215, "learning_rate": 5.368022119681833e-06, "loss": 0.7219, "step": 3361 }, { "epoch": 1.9911163754811962, "grad_norm": 3.758540120537383, "learning_rate": 5.362354801982259e-06, "loss": 0.7086, "step": 3362 }, { "epoch": 1.9917086171157834, "grad_norm": 4.086844855136015, "learning_rate": 5.356689381340354e-06, "loss": 0.7156, "step": 3363 }, { "epoch": 1.99230085875037, "grad_norm": 2.7000807650636185, "learning_rate": 5.351025860073604e-06, "loss": 0.7186, "step": 3364 }, { "epoch": 1.992893100384957, "grad_norm": 1.302878544865362, "learning_rate": 5.345364240498696e-06, "loss": 0.7177, "step": 3365 }, { "epoch": 1.993485342019544, "grad_norm": 1.341163287967209, "learning_rate": 5.3397045249315615e-06, "loss": 0.747, "step": 3366 }, { "epoch": 1.994077583654131, "grad_norm": 1.8874536605468482, "learning_rate": 5.334046715687334e-06, "loss": 0.736, "step": 3367 }, { "epoch": 1.9946698252887178, "grad_norm": 1.1757321216705952, "learning_rate": 5.328390815080381e-06, "loss": 0.707, "step": 3368 }, { "epoch": 1.9952620669233045, "grad_norm": 2.40844448714098, "learning_rate": 5.32273682542428e-06, "loss": 0.7085, "step": 3369 }, { "epoch": 1.9958543085578917, "grad_norm": 1.7338073370473768, "learning_rate": 5.317084749031835e-06, "loss": 0.6737, "step": 3370 }, { "epoch": 1.9964465501924784, "grad_norm": 1.602486533397963, "learning_rate": 5.311434588215057e-06, "loss": 0.7334, "step": 3371 }, { "epoch": 1.9970387918270656, "grad_norm": 1.8450498725558269, "learning_rate": 5.3057863452851875e-06, "loss": 0.7309, "step": 3372 }, { "epoch": 1.9976310334616523, "grad_norm": 1.1497170574169295, "learning_rate": 5.300140022552671e-06, "loss": 0.7375, "step": 3373 }, { "epoch": 1.9982232750962394, "grad_norm": 1.5753751548871004, "learning_rate": 5.294495622327167e-06, "loss": 0.7498, "step": 3374 }, { "epoch": 1.9988155167308261, "grad_norm": 7.073970313653954, "learning_rate": 5.288853146917557e-06, "loss": 0.7538, "step": 3375 }, { "epoch": 1.999407758365413, "grad_norm": 1.2888380404329518, "learning_rate": 5.283212598631935e-06, "loss": 0.7392, "step": 3376 }, { "epoch": 2.0, "grad_norm": 1.1340654280485976, "learning_rate": 5.277573979777597e-06, "loss": 0.6702, "step": 3377 }, { "epoch": 2.0005922416345867, "grad_norm": 1.1704057850103424, "learning_rate": 5.271937292661054e-06, "loss": 0.6327, "step": 3378 }, { "epoch": 2.001184483269174, "grad_norm": 1.2928344272015833, "learning_rate": 5.266302539588029e-06, "loss": 0.651, "step": 3379 }, { "epoch": 2.0017767249037606, "grad_norm": 1.8183856023739275, "learning_rate": 5.260669722863457e-06, "loss": 0.6276, "step": 3380 }, { "epoch": 2.0023689665383477, "grad_norm": 1.232591682636013, "learning_rate": 5.255038844791475e-06, "loss": 0.655, "step": 3381 }, { "epoch": 2.0029612081729344, "grad_norm": 1.439614040820058, "learning_rate": 5.249409907675422e-06, "loss": 0.6478, "step": 3382 }, { "epoch": 2.0035534498075216, "grad_norm": 1.2378098354139504, "learning_rate": 5.243782913817858e-06, "loss": 0.5961, "step": 3383 }, { "epoch": 2.0041456914421083, "grad_norm": 1.3329846166899064, "learning_rate": 5.238157865520539e-06, "loss": 0.6206, "step": 3384 }, { "epoch": 2.0047379330766955, "grad_norm": 2.1513887798109037, "learning_rate": 5.232534765084425e-06, "loss": 0.6225, "step": 3385 }, { "epoch": 2.005330174711282, "grad_norm": 1.4341250498776674, "learning_rate": 5.226913614809677e-06, "loss": 0.6466, "step": 3386 }, { "epoch": 2.0059224163458693, "grad_norm": 2.65394252956428, "learning_rate": 5.221294416995661e-06, "loss": 0.6219, "step": 3387 }, { "epoch": 2.006514657980456, "grad_norm": 1.939125905672404, "learning_rate": 5.215677173940959e-06, "loss": 0.6105, "step": 3388 }, { "epoch": 2.0071068996150427, "grad_norm": 1.6103895073133687, "learning_rate": 5.210061887943318e-06, "loss": 0.5788, "step": 3389 }, { "epoch": 2.00769914124963, "grad_norm": 5.940476923941293, "learning_rate": 5.204448561299718e-06, "loss": 0.608, "step": 3390 }, { "epoch": 2.0082913828842166, "grad_norm": 3.4355880637707736, "learning_rate": 5.1988371963063235e-06, "loss": 0.6455, "step": 3391 }, { "epoch": 2.0088836245188038, "grad_norm": 1.709718605173615, "learning_rate": 5.193227795258505e-06, "loss": 0.6509, "step": 3392 }, { "epoch": 2.0094758661533905, "grad_norm": 1.521862412293069, "learning_rate": 5.187620360450809e-06, "loss": 0.6062, "step": 3393 }, { "epoch": 2.0100681077879776, "grad_norm": 2.304842968308541, "learning_rate": 5.182014894176999e-06, "loss": 0.6268, "step": 3394 }, { "epoch": 2.0106603494225643, "grad_norm": 1.6981834404747536, "learning_rate": 5.176411398730028e-06, "loss": 0.6083, "step": 3395 }, { "epoch": 2.0112525910571515, "grad_norm": 1.5414486726862469, "learning_rate": 5.170809876402039e-06, "loss": 0.5742, "step": 3396 }, { "epoch": 2.011844832691738, "grad_norm": 1.8324966309812314, "learning_rate": 5.165210329484366e-06, "loss": 0.6401, "step": 3397 }, { "epoch": 2.0124370743263253, "grad_norm": 1.5280075849514587, "learning_rate": 5.159612760267541e-06, "loss": 0.612, "step": 3398 }, { "epoch": 2.013029315960912, "grad_norm": 1.7053593517597887, "learning_rate": 5.154017171041289e-06, "loss": 0.6234, "step": 3399 }, { "epoch": 2.0136215575954988, "grad_norm": 2.384672543167793, "learning_rate": 5.148423564094517e-06, "loss": 0.5944, "step": 3400 }, { "epoch": 2.014213799230086, "grad_norm": 1.7192733690837867, "learning_rate": 5.142831941715321e-06, "loss": 0.6438, "step": 3401 }, { "epoch": 2.0148060408646726, "grad_norm": 1.2298645317583594, "learning_rate": 5.137242306190991e-06, "loss": 0.6059, "step": 3402 }, { "epoch": 2.01539828249926, "grad_norm": 1.421035277118102, "learning_rate": 5.13165465980801e-06, "loss": 0.6251, "step": 3403 }, { "epoch": 2.0159905241338465, "grad_norm": 1.3393515437383583, "learning_rate": 5.126069004852033e-06, "loss": 0.6224, "step": 3404 }, { "epoch": 2.0165827657684336, "grad_norm": 1.7027652434897638, "learning_rate": 5.120485343607901e-06, "loss": 0.6055, "step": 3405 }, { "epoch": 2.0171750074030204, "grad_norm": 1.1354965871126625, "learning_rate": 5.114903678359655e-06, "loss": 0.6418, "step": 3406 }, { "epoch": 2.0177672490376075, "grad_norm": 2.17653820273378, "learning_rate": 5.10932401139051e-06, "loss": 0.6267, "step": 3407 }, { "epoch": 2.0183594906721942, "grad_norm": 1.1887528683987758, "learning_rate": 5.103746344982859e-06, "loss": 0.603, "step": 3408 }, { "epoch": 2.0189517323067814, "grad_norm": 1.2166339620141267, "learning_rate": 5.09817068141828e-06, "loss": 0.6271, "step": 3409 }, { "epoch": 2.019543973941368, "grad_norm": 1.4946678433457767, "learning_rate": 5.092597022977539e-06, "loss": 0.6135, "step": 3410 }, { "epoch": 2.020136215575955, "grad_norm": 2.181670296560735, "learning_rate": 5.087025371940568e-06, "loss": 0.5295, "step": 3411 }, { "epoch": 2.020728457210542, "grad_norm": 1.1109403860055909, "learning_rate": 5.081455730586495e-06, "loss": 0.6452, "step": 3412 }, { "epoch": 2.0213206988451287, "grad_norm": 1.5294096669192148, "learning_rate": 5.075888101193605e-06, "loss": 0.6353, "step": 3413 }, { "epoch": 2.021912940479716, "grad_norm": 1.2375562922087688, "learning_rate": 5.070322486039383e-06, "loss": 0.6426, "step": 3414 }, { "epoch": 2.0225051821143025, "grad_norm": 1.1958259044194008, "learning_rate": 5.06475888740047e-06, "loss": 0.6161, "step": 3415 }, { "epoch": 2.0230974237488897, "grad_norm": 1.3364125839027878, "learning_rate": 5.059197307552698e-06, "loss": 0.6045, "step": 3416 }, { "epoch": 2.0236896653834764, "grad_norm": 1.471484617701428, "learning_rate": 5.053637748771058e-06, "loss": 0.6244, "step": 3417 }, { "epoch": 2.0242819070180635, "grad_norm": 1.9697646130426174, "learning_rate": 5.048080213329729e-06, "loss": 0.6303, "step": 3418 }, { "epoch": 2.0248741486526503, "grad_norm": 1.2533508619798654, "learning_rate": 5.04252470350205e-06, "loss": 0.5929, "step": 3419 }, { "epoch": 2.0254663902872374, "grad_norm": 1.249684000829904, "learning_rate": 5.036971221560543e-06, "loss": 0.6563, "step": 3420 }, { "epoch": 2.026058631921824, "grad_norm": 1.7695633483618758, "learning_rate": 5.031419769776887e-06, "loss": 0.6099, "step": 3421 }, { "epoch": 2.026650873556411, "grad_norm": 1.5156337584508315, "learning_rate": 5.025870350421945e-06, "loss": 0.6385, "step": 3422 }, { "epoch": 2.027243115190998, "grad_norm": 1.1787473479254456, "learning_rate": 5.020322965765736e-06, "loss": 0.6373, "step": 3423 }, { "epoch": 2.0278353568255847, "grad_norm": 1.2307377200251604, "learning_rate": 5.0147776180774575e-06, "loss": 0.6469, "step": 3424 }, { "epoch": 2.028427598460172, "grad_norm": 1.2005680095700468, "learning_rate": 5.009234309625467e-06, "loss": 0.5903, "step": 3425 }, { "epoch": 2.0290198400947586, "grad_norm": 1.8848824259084624, "learning_rate": 5.003693042677283e-06, "loss": 0.5888, "step": 3426 }, { "epoch": 2.0296120817293457, "grad_norm": 1.4702739853724482, "learning_rate": 4.998153819499601e-06, "loss": 0.5864, "step": 3427 }, { "epoch": 2.0302043233639324, "grad_norm": 1.3620518239621033, "learning_rate": 4.992616642358279e-06, "loss": 0.617, "step": 3428 }, { "epoch": 2.0307965649985196, "grad_norm": 1.1287181683000989, "learning_rate": 4.9870815135183285e-06, "loss": 0.605, "step": 3429 }, { "epoch": 2.0313888066331063, "grad_norm": 1.8477536160925232, "learning_rate": 4.9815484352439244e-06, "loss": 0.6376, "step": 3430 }, { "epoch": 2.0319810482676934, "grad_norm": 1.2689842522920158, "learning_rate": 4.976017409798413e-06, "loss": 0.5849, "step": 3431 }, { "epoch": 2.03257328990228, "grad_norm": 2.2267139790954498, "learning_rate": 4.970488439444296e-06, "loss": 0.6276, "step": 3432 }, { "epoch": 2.033165531536867, "grad_norm": 2.263989349996733, "learning_rate": 4.964961526443231e-06, "loss": 0.6187, "step": 3433 }, { "epoch": 2.033757773171454, "grad_norm": 1.1002565279108922, "learning_rate": 4.95943667305603e-06, "loss": 0.6165, "step": 3434 }, { "epoch": 2.0343500148060407, "grad_norm": 1.1437907896314625, "learning_rate": 4.953913881542677e-06, "loss": 0.6687, "step": 3435 }, { "epoch": 2.034942256440628, "grad_norm": 1.4291661707165033, "learning_rate": 4.948393154162303e-06, "loss": 0.6324, "step": 3436 }, { "epoch": 2.0355344980752146, "grad_norm": 1.2447703838850679, "learning_rate": 4.9428744931731965e-06, "loss": 0.5916, "step": 3437 }, { "epoch": 2.0361267397098017, "grad_norm": 1.3396172149707037, "learning_rate": 4.937357900832793e-06, "loss": 0.6117, "step": 3438 }, { "epoch": 2.0367189813443884, "grad_norm": 1.327487438241329, "learning_rate": 4.931843379397695e-06, "loss": 0.6275, "step": 3439 }, { "epoch": 2.0373112229789756, "grad_norm": 5.634667309355351, "learning_rate": 4.926330931123659e-06, "loss": 0.5808, "step": 3440 }, { "epoch": 2.0379034646135623, "grad_norm": 1.712596745761093, "learning_rate": 4.920820558265569e-06, "loss": 0.6298, "step": 3441 }, { "epoch": 2.038495706248149, "grad_norm": 1.6350123351542276, "learning_rate": 4.915312263077488e-06, "loss": 0.6246, "step": 3442 }, { "epoch": 2.039087947882736, "grad_norm": 1.4377006668550185, "learning_rate": 4.909806047812617e-06, "loss": 0.6212, "step": 3443 }, { "epoch": 2.039680189517323, "grad_norm": 1.3626893907998683, "learning_rate": 4.904301914723315e-06, "loss": 0.5958, "step": 3444 }, { "epoch": 2.04027243115191, "grad_norm": 1.881609654716602, "learning_rate": 4.898799866061068e-06, "loss": 0.6389, "step": 3445 }, { "epoch": 2.0408646727864967, "grad_norm": 1.0881012049516798, "learning_rate": 4.89329990407653e-06, "loss": 0.6571, "step": 3446 }, { "epoch": 2.041456914421084, "grad_norm": 1.4033919456100592, "learning_rate": 4.887802031019498e-06, "loss": 0.631, "step": 3447 }, { "epoch": 2.0420491560556706, "grad_norm": 3.113208111035224, "learning_rate": 4.882306249138909e-06, "loss": 0.6045, "step": 3448 }, { "epoch": 2.0426413976902578, "grad_norm": 1.2724517686667787, "learning_rate": 4.876812560682842e-06, "loss": 0.6156, "step": 3449 }, { "epoch": 2.0432336393248445, "grad_norm": 1.472100349180877, "learning_rate": 4.871320967898528e-06, "loss": 0.6171, "step": 3450 }, { "epoch": 2.0438258809594316, "grad_norm": 1.7554861202098981, "learning_rate": 4.865831473032342e-06, "loss": 0.6485, "step": 3451 }, { "epoch": 2.0444181225940183, "grad_norm": 0.892915766556675, "learning_rate": 4.860344078329791e-06, "loss": 0.5951, "step": 3452 }, { "epoch": 2.045010364228605, "grad_norm": 1.2477613762328605, "learning_rate": 4.8548587860355255e-06, "loss": 0.6422, "step": 3453 }, { "epoch": 2.045602605863192, "grad_norm": 1.3476862158178415, "learning_rate": 4.849375598393342e-06, "loss": 0.6421, "step": 3454 }, { "epoch": 2.046194847497779, "grad_norm": 1.3151784488318263, "learning_rate": 4.843894517646176e-06, "loss": 0.6417, "step": 3455 }, { "epoch": 2.046787089132366, "grad_norm": 2.0867879393200934, "learning_rate": 4.838415546036095e-06, "loss": 0.6006, "step": 3456 }, { "epoch": 2.0473793307669528, "grad_norm": 1.528214198655404, "learning_rate": 4.8329386858043005e-06, "loss": 0.6131, "step": 3457 }, { "epoch": 2.04797157240154, "grad_norm": 1.6703970326441546, "learning_rate": 4.827463939191141e-06, "loss": 0.6009, "step": 3458 }, { "epoch": 2.0485638140361266, "grad_norm": 1.084600846636688, "learning_rate": 4.821991308436102e-06, "loss": 0.5974, "step": 3459 }, { "epoch": 2.049156055670714, "grad_norm": 1.4736165984007512, "learning_rate": 4.816520795777789e-06, "loss": 0.5891, "step": 3460 }, { "epoch": 2.0497482973053005, "grad_norm": 1.4108193712127584, "learning_rate": 4.811052403453949e-06, "loss": 0.5992, "step": 3461 }, { "epoch": 2.0503405389398877, "grad_norm": 1.2902478597441098, "learning_rate": 4.805586133701468e-06, "loss": 0.652, "step": 3462 }, { "epoch": 2.0509327805744744, "grad_norm": 1.4732401085829676, "learning_rate": 4.800121988756352e-06, "loss": 0.6209, "step": 3463 }, { "epoch": 2.051525022209061, "grad_norm": 1.7492854759727523, "learning_rate": 4.7946599708537485e-06, "loss": 0.5899, "step": 3464 }, { "epoch": 2.0521172638436482, "grad_norm": 1.3793707141008569, "learning_rate": 4.789200082227924e-06, "loss": 0.6281, "step": 3465 }, { "epoch": 2.052709505478235, "grad_norm": 1.5203850237041459, "learning_rate": 4.783742325112286e-06, "loss": 0.6517, "step": 3466 }, { "epoch": 2.053301747112822, "grad_norm": 1.5248222664948767, "learning_rate": 4.7782867017393585e-06, "loss": 0.6476, "step": 3467 }, { "epoch": 2.053893988747409, "grad_norm": 1.2625856377787512, "learning_rate": 4.772833214340805e-06, "loss": 0.6413, "step": 3468 }, { "epoch": 2.054486230381996, "grad_norm": 1.2851858362812827, "learning_rate": 4.7673818651474e-06, "loss": 0.629, "step": 3469 }, { "epoch": 2.0550784720165827, "grad_norm": 1.296157068335424, "learning_rate": 4.761932656389061e-06, "loss": 0.6189, "step": 3470 }, { "epoch": 2.05567071365117, "grad_norm": 1.4886765308566954, "learning_rate": 4.756485590294813e-06, "loss": 0.6472, "step": 3471 }, { "epoch": 2.0562629552857565, "grad_norm": 1.1373661307548197, "learning_rate": 4.751040669092819e-06, "loss": 0.5558, "step": 3472 }, { "epoch": 2.0568551969203437, "grad_norm": 1.3633758311544337, "learning_rate": 4.745597895010351e-06, "loss": 0.604, "step": 3473 }, { "epoch": 2.0574474385549304, "grad_norm": 1.1929108488372469, "learning_rate": 4.740157270273816e-06, "loss": 0.6514, "step": 3474 }, { "epoch": 2.058039680189517, "grad_norm": 3.1263654625674975, "learning_rate": 4.7347187971087294e-06, "loss": 0.6156, "step": 3475 }, { "epoch": 2.0586319218241043, "grad_norm": 2.327298864991031, "learning_rate": 4.729282477739741e-06, "loss": 0.6476, "step": 3476 }, { "epoch": 2.059224163458691, "grad_norm": 1.5846789608099434, "learning_rate": 4.723848314390604e-06, "loss": 0.6529, "step": 3477 }, { "epoch": 2.059816405093278, "grad_norm": 1.4670029209129003, "learning_rate": 4.718416309284196e-06, "loss": 0.6004, "step": 3478 }, { "epoch": 2.060408646727865, "grad_norm": 4.684220098207152, "learning_rate": 4.712986464642515e-06, "loss": 0.5856, "step": 3479 }, { "epoch": 2.061000888362452, "grad_norm": 1.0163949128750598, "learning_rate": 4.707558782686677e-06, "loss": 0.5652, "step": 3480 }, { "epoch": 2.0615931299970387, "grad_norm": 1.9902642913838888, "learning_rate": 4.702133265636905e-06, "loss": 0.595, "step": 3481 }, { "epoch": 2.062185371631626, "grad_norm": 0.976503070725885, "learning_rate": 4.6967099157125384e-06, "loss": 0.5816, "step": 3482 }, { "epoch": 2.0627776132662126, "grad_norm": 1.1674854697438484, "learning_rate": 4.6912887351320336e-06, "loss": 0.6192, "step": 3483 }, { "epoch": 2.0633698549007997, "grad_norm": 1.080121092579032, "learning_rate": 4.685869726112963e-06, "loss": 0.5851, "step": 3484 }, { "epoch": 2.0639620965353864, "grad_norm": 1.1715927470368515, "learning_rate": 4.680452890872003e-06, "loss": 0.6208, "step": 3485 }, { "epoch": 2.064554338169973, "grad_norm": 2.3721843450455093, "learning_rate": 4.675038231624939e-06, "loss": 0.6333, "step": 3486 }, { "epoch": 2.0651465798045603, "grad_norm": 1.4695028389780564, "learning_rate": 4.669625750586675e-06, "loss": 0.5885, "step": 3487 }, { "epoch": 2.065738821439147, "grad_norm": 6.3301735183432, "learning_rate": 4.664215449971225e-06, "loss": 0.6118, "step": 3488 }, { "epoch": 2.066331063073734, "grad_norm": 0.9900749638912005, "learning_rate": 4.658807331991702e-06, "loss": 0.5987, "step": 3489 }, { "epoch": 2.066923304708321, "grad_norm": 1.1137652898298571, "learning_rate": 4.653401398860324e-06, "loss": 0.6017, "step": 3490 }, { "epoch": 2.067515546342908, "grad_norm": 1.2870621869899241, "learning_rate": 4.64799765278843e-06, "loss": 0.6256, "step": 3491 }, { "epoch": 2.0681077879774947, "grad_norm": 1.8663809342938509, "learning_rate": 4.6425960959864556e-06, "loss": 0.6282, "step": 3492 }, { "epoch": 2.068700029612082, "grad_norm": 1.1936625573372333, "learning_rate": 4.637196730663941e-06, "loss": 0.6056, "step": 3493 }, { "epoch": 2.0692922712466686, "grad_norm": 1.1774196803926553, "learning_rate": 4.631799559029524e-06, "loss": 0.6547, "step": 3494 }, { "epoch": 2.0698845128812557, "grad_norm": 1.0847314619442951, "learning_rate": 4.626404583290956e-06, "loss": 0.6047, "step": 3495 }, { "epoch": 2.0704767545158425, "grad_norm": 1.8732417061022584, "learning_rate": 4.621011805655093e-06, "loss": 0.6184, "step": 3496 }, { "epoch": 2.071068996150429, "grad_norm": 1.0566441622558125, "learning_rate": 4.615621228327869e-06, "loss": 0.6037, "step": 3497 }, { "epoch": 2.0716612377850163, "grad_norm": 1.4098753153815882, "learning_rate": 4.61023285351434e-06, "loss": 0.6616, "step": 3498 }, { "epoch": 2.072253479419603, "grad_norm": 1.0888925606822704, "learning_rate": 4.60484668341866e-06, "loss": 0.6057, "step": 3499 }, { "epoch": 2.07284572105419, "grad_norm": 1.6235022191865385, "learning_rate": 4.599462720244071e-06, "loss": 0.615, "step": 3500 }, { "epoch": 2.073437962688777, "grad_norm": 1.268826339976644, "learning_rate": 4.594080966192912e-06, "loss": 0.6819, "step": 3501 }, { "epoch": 2.074030204323364, "grad_norm": 1.188326581207744, "learning_rate": 4.5887014234666275e-06, "loss": 0.5907, "step": 3502 }, { "epoch": 2.0746224459579508, "grad_norm": 1.6688453348930554, "learning_rate": 4.583324094265757e-06, "loss": 0.6598, "step": 3503 }, { "epoch": 2.075214687592538, "grad_norm": 1.3153998492810253, "learning_rate": 4.577948980789924e-06, "loss": 0.6265, "step": 3504 }, { "epoch": 2.0758069292271246, "grad_norm": 1.2858231765513355, "learning_rate": 4.572576085237853e-06, "loss": 0.5932, "step": 3505 }, { "epoch": 2.0763991708617118, "grad_norm": 0.9882491735932337, "learning_rate": 4.567205409807362e-06, "loss": 0.5705, "step": 3506 }, { "epoch": 2.0769914124962985, "grad_norm": 1.731064255571183, "learning_rate": 4.561836956695362e-06, "loss": 0.6653, "step": 3507 }, { "epoch": 2.077583654130885, "grad_norm": 2.457969135297415, "learning_rate": 4.556470728097849e-06, "loss": 0.6452, "step": 3508 }, { "epoch": 2.0781758957654723, "grad_norm": 1.9842081906939593, "learning_rate": 4.55110672620991e-06, "loss": 0.6041, "step": 3509 }, { "epoch": 2.078768137400059, "grad_norm": 1.9749587351865325, "learning_rate": 4.545744953225726e-06, "loss": 0.5994, "step": 3510 }, { "epoch": 2.079360379034646, "grad_norm": 3.1505013963136412, "learning_rate": 4.540385411338567e-06, "loss": 0.6332, "step": 3511 }, { "epoch": 2.079952620669233, "grad_norm": 1.275314172819857, "learning_rate": 4.535028102740785e-06, "loss": 0.6609, "step": 3512 }, { "epoch": 2.08054486230382, "grad_norm": 1.4137589146456349, "learning_rate": 4.529673029623815e-06, "loss": 0.626, "step": 3513 }, { "epoch": 2.081137103938407, "grad_norm": 3.083605720052062, "learning_rate": 4.524320194178189e-06, "loss": 0.607, "step": 3514 }, { "epoch": 2.081729345572994, "grad_norm": 4.411371237327307, "learning_rate": 4.518969598593515e-06, "loss": 0.6471, "step": 3515 }, { "epoch": 2.0823215872075806, "grad_norm": 1.3266195802580654, "learning_rate": 4.5136212450584895e-06, "loss": 0.6135, "step": 3516 }, { "epoch": 2.082913828842168, "grad_norm": 1.4653968395659105, "learning_rate": 4.508275135760887e-06, "loss": 0.6175, "step": 3517 }, { "epoch": 2.0835060704767545, "grad_norm": 0.990190635939963, "learning_rate": 4.502931272887572e-06, "loss": 0.6221, "step": 3518 }, { "epoch": 2.0840983121113412, "grad_norm": 1.8306688688850197, "learning_rate": 4.497589658624477e-06, "loss": 0.5972, "step": 3519 }, { "epoch": 2.0846905537459284, "grad_norm": 1.2748246161237164, "learning_rate": 4.492250295156632e-06, "loss": 0.6064, "step": 3520 }, { "epoch": 2.085282795380515, "grad_norm": 3.1717060924349134, "learning_rate": 4.486913184668127e-06, "loss": 0.6155, "step": 3521 }, { "epoch": 2.0858750370151022, "grad_norm": 1.6215550053410732, "learning_rate": 4.481578329342149e-06, "loss": 0.6155, "step": 3522 }, { "epoch": 2.086467278649689, "grad_norm": 1.8412996687036585, "learning_rate": 4.476245731360947e-06, "loss": 0.6175, "step": 3523 }, { "epoch": 2.087059520284276, "grad_norm": 1.6036947552013425, "learning_rate": 4.470915392905862e-06, "loss": 0.6359, "step": 3524 }, { "epoch": 2.087651761918863, "grad_norm": 1.2185010704003587, "learning_rate": 4.465587316157291e-06, "loss": 0.648, "step": 3525 }, { "epoch": 2.08824400355345, "grad_norm": 2.117675570470455, "learning_rate": 4.460261503294726e-06, "loss": 0.5918, "step": 3526 }, { "epoch": 2.0888362451880367, "grad_norm": 1.26985997913265, "learning_rate": 4.4549379564967174e-06, "loss": 0.6935, "step": 3527 }, { "epoch": 2.089428486822624, "grad_norm": 1.421007208158808, "learning_rate": 4.449616677940904e-06, "loss": 0.5882, "step": 3528 }, { "epoch": 2.0900207284572105, "grad_norm": 2.5687635296741846, "learning_rate": 4.444297669803981e-06, "loss": 0.5908, "step": 3529 }, { "epoch": 2.0906129700917973, "grad_norm": 1.6681381199853806, "learning_rate": 4.4389809342617195e-06, "loss": 0.6284, "step": 3530 }, { "epoch": 2.0912052117263844, "grad_norm": 1.6690216078536573, "learning_rate": 4.433666473488971e-06, "loss": 0.6287, "step": 3531 }, { "epoch": 2.091797453360971, "grad_norm": 1.231182818609419, "learning_rate": 4.428354289659641e-06, "loss": 0.5921, "step": 3532 }, { "epoch": 2.0923896949955583, "grad_norm": 1.1422701274620846, "learning_rate": 4.423044384946719e-06, "loss": 0.5949, "step": 3533 }, { "epoch": 2.092981936630145, "grad_norm": 1.5358988678809373, "learning_rate": 4.417736761522249e-06, "loss": 0.5869, "step": 3534 }, { "epoch": 2.093574178264732, "grad_norm": 1.8994263747170306, "learning_rate": 4.412431421557351e-06, "loss": 0.6305, "step": 3535 }, { "epoch": 2.094166419899319, "grad_norm": 1.0574749107241654, "learning_rate": 4.407128367222203e-06, "loss": 0.5842, "step": 3536 }, { "epoch": 2.094758661533906, "grad_norm": 1.756684625648516, "learning_rate": 4.401827600686059e-06, "loss": 0.6044, "step": 3537 }, { "epoch": 2.0953509031684927, "grad_norm": 1.1516953678691502, "learning_rate": 4.396529124117223e-06, "loss": 0.6213, "step": 3538 }, { "epoch": 2.09594314480308, "grad_norm": 1.208360469786924, "learning_rate": 4.391232939683077e-06, "loss": 0.6195, "step": 3539 }, { "epoch": 2.0965353864376666, "grad_norm": 1.7581006133683124, "learning_rate": 4.38593904955005e-06, "loss": 0.609, "step": 3540 }, { "epoch": 2.0971276280722533, "grad_norm": 1.370855782157487, "learning_rate": 4.380647455883651e-06, "loss": 0.5826, "step": 3541 }, { "epoch": 2.0977198697068404, "grad_norm": 2.388671548513241, "learning_rate": 4.37535816084843e-06, "loss": 0.6563, "step": 3542 }, { "epoch": 2.098312111341427, "grad_norm": 1.6360347895846583, "learning_rate": 4.3700711666080135e-06, "loss": 0.6808, "step": 3543 }, { "epoch": 2.0989043529760143, "grad_norm": 1.4588269440103798, "learning_rate": 4.364786475325072e-06, "loss": 0.6258, "step": 3544 }, { "epoch": 2.099496594610601, "grad_norm": 1.4736283479225063, "learning_rate": 4.359504089161351e-06, "loss": 0.6395, "step": 3545 }, { "epoch": 2.100088836245188, "grad_norm": 1.1782359902227395, "learning_rate": 4.354224010277632e-06, "loss": 0.6292, "step": 3546 }, { "epoch": 2.100681077879775, "grad_norm": 1.1179803013332563, "learning_rate": 4.348946240833774e-06, "loss": 0.5898, "step": 3547 }, { "epoch": 2.101273319514362, "grad_norm": 1.4815157595037098, "learning_rate": 4.343670782988679e-06, "loss": 0.6271, "step": 3548 }, { "epoch": 2.1018655611489487, "grad_norm": 1.291616988218737, "learning_rate": 4.338397638900301e-06, "loss": 0.5947, "step": 3549 }, { "epoch": 2.102457802783536, "grad_norm": 1.2765987298770318, "learning_rate": 4.333126810725655e-06, "loss": 0.5707, "step": 3550 }, { "epoch": 2.1030500444181226, "grad_norm": 1.375312819341615, "learning_rate": 4.32785830062081e-06, "loss": 0.6335, "step": 3551 }, { "epoch": 2.1036422860527093, "grad_norm": 1.6256621838008019, "learning_rate": 4.322592110740882e-06, "loss": 0.6424, "step": 3552 }, { "epoch": 2.1042345276872965, "grad_norm": 1.289927765078695, "learning_rate": 4.317328243240032e-06, "loss": 0.5974, "step": 3553 }, { "epoch": 2.104826769321883, "grad_norm": 1.2729987598808552, "learning_rate": 4.312066700271483e-06, "loss": 0.6065, "step": 3554 }, { "epoch": 2.1054190109564703, "grad_norm": 1.5025651892089609, "learning_rate": 4.306807483987505e-06, "loss": 0.6541, "step": 3555 }, { "epoch": 2.106011252591057, "grad_norm": 4.027247679133015, "learning_rate": 4.30155059653941e-06, "loss": 0.6414, "step": 3556 }, { "epoch": 2.106603494225644, "grad_norm": 1.3103868133396175, "learning_rate": 4.296296040077557e-06, "loss": 0.6134, "step": 3557 }, { "epoch": 2.107195735860231, "grad_norm": 1.6626367225184855, "learning_rate": 4.291043816751357e-06, "loss": 0.5832, "step": 3558 }, { "epoch": 2.107787977494818, "grad_norm": 1.8842126011304945, "learning_rate": 4.285793928709271e-06, "loss": 0.6293, "step": 3559 }, { "epoch": 2.1083802191294048, "grad_norm": 1.5670020537319092, "learning_rate": 4.280546378098792e-06, "loss": 0.6197, "step": 3560 }, { "epoch": 2.108972460763992, "grad_norm": 1.4217647311568198, "learning_rate": 4.2753011670664625e-06, "loss": 0.6005, "step": 3561 }, { "epoch": 2.1095647023985786, "grad_norm": 1.4382610743287825, "learning_rate": 4.270058297757871e-06, "loss": 0.6199, "step": 3562 }, { "epoch": 2.1101569440331653, "grad_norm": 1.5200269742920969, "learning_rate": 4.264817772317653e-06, "loss": 0.6163, "step": 3563 }, { "epoch": 2.1107491856677525, "grad_norm": 2.061967748457973, "learning_rate": 4.259579592889464e-06, "loss": 0.6036, "step": 3564 }, { "epoch": 2.111341427302339, "grad_norm": 1.193306955598674, "learning_rate": 4.25434376161602e-06, "loss": 0.5957, "step": 3565 }, { "epoch": 2.1119336689369264, "grad_norm": 1.15551634693182, "learning_rate": 4.249110280639076e-06, "loss": 0.5846, "step": 3566 }, { "epoch": 2.112525910571513, "grad_norm": 1.9325467571821633, "learning_rate": 4.243879152099415e-06, "loss": 0.5902, "step": 3567 }, { "epoch": 2.1131181522061, "grad_norm": 1.4819027125682924, "learning_rate": 4.238650378136859e-06, "loss": 0.5652, "step": 3568 }, { "epoch": 2.113710393840687, "grad_norm": 1.221063734006737, "learning_rate": 4.233423960890275e-06, "loss": 0.6213, "step": 3569 }, { "epoch": 2.114302635475274, "grad_norm": 1.2311749892527528, "learning_rate": 4.228199902497565e-06, "loss": 0.621, "step": 3570 }, { "epoch": 2.114894877109861, "grad_norm": 1.6157414496724527, "learning_rate": 4.222978205095659e-06, "loss": 0.6328, "step": 3571 }, { "epoch": 2.115487118744448, "grad_norm": 2.1238378827778654, "learning_rate": 4.217758870820522e-06, "loss": 0.6247, "step": 3572 }, { "epoch": 2.1160793603790347, "grad_norm": 1.6576077793873196, "learning_rate": 4.2125419018071565e-06, "loss": 0.5893, "step": 3573 }, { "epoch": 2.1166716020136214, "grad_norm": 7.289033310054799, "learning_rate": 4.207327300189602e-06, "loss": 0.6229, "step": 3574 }, { "epoch": 2.1172638436482085, "grad_norm": 1.9096206920777807, "learning_rate": 4.202115068100919e-06, "loss": 0.6134, "step": 3575 }, { "epoch": 2.1178560852827952, "grad_norm": 1.3294555847995595, "learning_rate": 4.196905207673201e-06, "loss": 0.6541, "step": 3576 }, { "epoch": 2.1184483269173824, "grad_norm": 2.478570928826208, "learning_rate": 4.191697721037577e-06, "loss": 0.64, "step": 3577 }, { "epoch": 2.119040568551969, "grad_norm": 1.3384108708761757, "learning_rate": 4.186492610324204e-06, "loss": 0.6313, "step": 3578 }, { "epoch": 2.1196328101865562, "grad_norm": 2.059122205094186, "learning_rate": 4.181289877662263e-06, "loss": 0.6483, "step": 3579 }, { "epoch": 2.120225051821143, "grad_norm": 2.210349418550527, "learning_rate": 4.176089525179961e-06, "loss": 0.6482, "step": 3580 }, { "epoch": 2.12081729345573, "grad_norm": 1.2899545185125494, "learning_rate": 4.1708915550045385e-06, "loss": 0.6031, "step": 3581 }, { "epoch": 2.121409535090317, "grad_norm": 1.0636519968386704, "learning_rate": 4.165695969262259e-06, "loss": 0.6238, "step": 3582 }, { "epoch": 2.122001776724904, "grad_norm": 1.8256192982659298, "learning_rate": 4.160502770078407e-06, "loss": 0.6507, "step": 3583 }, { "epoch": 2.1225940183594907, "grad_norm": 1.3887604621578677, "learning_rate": 4.155311959577289e-06, "loss": 0.5671, "step": 3584 }, { "epoch": 2.1231862599940774, "grad_norm": 1.0622378454165742, "learning_rate": 4.150123539882246e-06, "loss": 0.5976, "step": 3585 }, { "epoch": 2.1237785016286646, "grad_norm": 1.3574063696461265, "learning_rate": 4.144937513115627e-06, "loss": 0.6602, "step": 3586 }, { "epoch": 2.1243707432632513, "grad_norm": 1.4538940676417582, "learning_rate": 4.139753881398813e-06, "loss": 0.5957, "step": 3587 }, { "epoch": 2.1249629848978384, "grad_norm": 2.264238665767258, "learning_rate": 4.134572646852196e-06, "loss": 0.5999, "step": 3588 }, { "epoch": 2.125555226532425, "grad_norm": 1.2966266388996097, "learning_rate": 4.1293938115952e-06, "loss": 0.6102, "step": 3589 }, { "epoch": 2.1261474681670123, "grad_norm": 33.071460074035244, "learning_rate": 4.124217377746251e-06, "loss": 0.566, "step": 3590 }, { "epoch": 2.126739709801599, "grad_norm": 1.1430460336126136, "learning_rate": 4.11904334742281e-06, "loss": 0.6487, "step": 3591 }, { "epoch": 2.127331951436186, "grad_norm": 1.340637625601947, "learning_rate": 4.113871722741337e-06, "loss": 0.614, "step": 3592 }, { "epoch": 2.127924193070773, "grad_norm": 4.321691288846663, "learning_rate": 4.108702505817327e-06, "loss": 0.6521, "step": 3593 }, { "epoch": 2.12851643470536, "grad_norm": 1.3840804956759036, "learning_rate": 4.103535698765272e-06, "loss": 0.6095, "step": 3594 }, { "epoch": 2.1291086763399467, "grad_norm": 1.5085339460848268, "learning_rate": 4.098371303698694e-06, "loss": 0.5992, "step": 3595 }, { "epoch": 2.1297009179745334, "grad_norm": 2.0820647527427854, "learning_rate": 4.093209322730114e-06, "loss": 0.5808, "step": 3596 }, { "epoch": 2.1302931596091206, "grad_norm": 1.2113506268447882, "learning_rate": 4.08804975797108e-06, "loss": 0.6079, "step": 3597 }, { "epoch": 2.1308854012437073, "grad_norm": 1.450986037398985, "learning_rate": 4.082892611532136e-06, "loss": 0.634, "step": 3598 }, { "epoch": 2.1314776428782944, "grad_norm": 1.8781499277404807, "learning_rate": 4.077737885522852e-06, "loss": 0.6374, "step": 3599 }, { "epoch": 2.132069884512881, "grad_norm": 1.7864320630965398, "learning_rate": 4.072585582051798e-06, "loss": 0.627, "step": 3600 }, { "epoch": 2.1326621261474683, "grad_norm": 1.4956135203030114, "learning_rate": 4.067435703226552e-06, "loss": 0.6108, "step": 3601 }, { "epoch": 2.133254367782055, "grad_norm": 2.2180019781234, "learning_rate": 4.0622882511537076e-06, "loss": 0.5863, "step": 3602 }, { "epoch": 2.133846609416642, "grad_norm": 4.119143056595805, "learning_rate": 4.057143227938866e-06, "loss": 0.6186, "step": 3603 }, { "epoch": 2.134438851051229, "grad_norm": 1.5204561646279764, "learning_rate": 4.052000635686627e-06, "loss": 0.5892, "step": 3604 }, { "epoch": 2.135031092685816, "grad_norm": 4.47672704923902, "learning_rate": 4.046860476500596e-06, "loss": 0.6405, "step": 3605 }, { "epoch": 2.1356233343204027, "grad_norm": 1.7257594440244672, "learning_rate": 4.0417227524833925e-06, "loss": 0.6228, "step": 3606 }, { "epoch": 2.1362155759549895, "grad_norm": 1.0415334302691686, "learning_rate": 4.036587465736635e-06, "loss": 0.6577, "step": 3607 }, { "epoch": 2.1368078175895766, "grad_norm": 1.265219398940147, "learning_rate": 4.031454618360945e-06, "loss": 0.6531, "step": 3608 }, { "epoch": 2.1374000592241633, "grad_norm": 1.2617611265687647, "learning_rate": 4.02632421245594e-06, "loss": 0.5512, "step": 3609 }, { "epoch": 2.1379923008587505, "grad_norm": 1.241378144123858, "learning_rate": 4.021196250120248e-06, "loss": 0.6223, "step": 3610 }, { "epoch": 2.138584542493337, "grad_norm": 1.8843558424320774, "learning_rate": 4.016070733451496e-06, "loss": 0.6307, "step": 3611 }, { "epoch": 2.1391767841279243, "grad_norm": 1.152700464562051, "learning_rate": 4.0109476645463076e-06, "loss": 0.6149, "step": 3612 }, { "epoch": 2.139769025762511, "grad_norm": 1.646886331579089, "learning_rate": 4.005827045500301e-06, "loss": 0.6022, "step": 3613 }, { "epoch": 2.140361267397098, "grad_norm": 1.4285955128879986, "learning_rate": 4.000708878408103e-06, "loss": 0.5811, "step": 3614 }, { "epoch": 2.140953509031685, "grad_norm": 1.8299679014912458, "learning_rate": 3.9955931653633365e-06, "loss": 0.6211, "step": 3615 }, { "epoch": 2.141545750666272, "grad_norm": 1.4096002284217504, "learning_rate": 3.990479908458602e-06, "loss": 0.6149, "step": 3616 }, { "epoch": 2.1421379923008588, "grad_norm": 2.69120669503384, "learning_rate": 3.985369109785516e-06, "loss": 0.6343, "step": 3617 }, { "epoch": 2.1427302339354455, "grad_norm": 1.1754488546881272, "learning_rate": 3.980260771434685e-06, "loss": 0.6071, "step": 3618 }, { "epoch": 2.1433224755700326, "grad_norm": 1.6113726977096838, "learning_rate": 3.975154895495711e-06, "loss": 0.5882, "step": 3619 }, { "epoch": 2.1439147172046193, "grad_norm": 2.791107478639297, "learning_rate": 3.970051484057171e-06, "loss": 0.635, "step": 3620 }, { "epoch": 2.1445069588392065, "grad_norm": 1.5913773207532422, "learning_rate": 3.9649505392066544e-06, "loss": 0.5979, "step": 3621 }, { "epoch": 2.145099200473793, "grad_norm": 1.5408491987361748, "learning_rate": 3.959852063030738e-06, "loss": 0.6264, "step": 3622 }, { "epoch": 2.1456914421083804, "grad_norm": 1.4228715868052781, "learning_rate": 3.9547560576149815e-06, "loss": 0.6112, "step": 3623 }, { "epoch": 2.146283683742967, "grad_norm": 1.894084960567437, "learning_rate": 3.949662525043935e-06, "loss": 0.6981, "step": 3624 }, { "epoch": 2.1468759253775542, "grad_norm": 29.083821486159245, "learning_rate": 3.944571467401142e-06, "loss": 0.5861, "step": 3625 }, { "epoch": 2.147468167012141, "grad_norm": 1.778409125554096, "learning_rate": 3.939482886769136e-06, "loss": 0.6135, "step": 3626 }, { "epoch": 2.148060408646728, "grad_norm": 1.7787742540030977, "learning_rate": 3.934396785229429e-06, "loss": 0.5599, "step": 3627 }, { "epoch": 2.148652650281315, "grad_norm": 1.9679765606795838, "learning_rate": 3.929313164862518e-06, "loss": 0.637, "step": 3628 }, { "epoch": 2.1492448919159015, "grad_norm": 1.5563281004545866, "learning_rate": 3.924232027747894e-06, "loss": 0.6389, "step": 3629 }, { "epoch": 2.1498371335504887, "grad_norm": 1.2422466217603876, "learning_rate": 3.919153375964032e-06, "loss": 0.567, "step": 3630 }, { "epoch": 2.1504293751850754, "grad_norm": 2.00976111403471, "learning_rate": 3.91407721158838e-06, "loss": 0.5458, "step": 3631 }, { "epoch": 2.1510216168196625, "grad_norm": 1.6951508776864774, "learning_rate": 3.909003536697374e-06, "loss": 0.605, "step": 3632 }, { "epoch": 2.1516138584542492, "grad_norm": 1.558169994124345, "learning_rate": 3.903932353366435e-06, "loss": 0.595, "step": 3633 }, { "epoch": 2.1522061000888364, "grad_norm": 1.6276022870031455, "learning_rate": 3.898863663669965e-06, "loss": 0.6334, "step": 3634 }, { "epoch": 2.152798341723423, "grad_norm": 1.6198572914267186, "learning_rate": 3.8937974696813405e-06, "loss": 0.5621, "step": 3635 }, { "epoch": 2.1533905833580103, "grad_norm": 1.9472727059260369, "learning_rate": 3.888733773472916e-06, "loss": 0.5736, "step": 3636 }, { "epoch": 2.153982824992597, "grad_norm": 2.552984173348855, "learning_rate": 3.883672577116035e-06, "loss": 0.6166, "step": 3637 }, { "epoch": 2.154575066627184, "grad_norm": 1.9217704150949895, "learning_rate": 3.878613882681002e-06, "loss": 0.6168, "step": 3638 }, { "epoch": 2.155167308261771, "grad_norm": 1.5327012862258527, "learning_rate": 3.873557692237119e-06, "loss": 0.5939, "step": 3639 }, { "epoch": 2.1557595498963575, "grad_norm": 1.4830748099663695, "learning_rate": 3.868504007852641e-06, "loss": 0.636, "step": 3640 }, { "epoch": 2.1563517915309447, "grad_norm": 1.4934300428699974, "learning_rate": 3.86345283159482e-06, "loss": 0.5907, "step": 3641 }, { "epoch": 2.1569440331655314, "grad_norm": 1.8303353542056642, "learning_rate": 3.8584041655298606e-06, "loss": 0.6217, "step": 3642 }, { "epoch": 2.1575362748001186, "grad_norm": 2.6188924474056203, "learning_rate": 3.853358011722961e-06, "loss": 0.6247, "step": 3643 }, { "epoch": 2.1581285164347053, "grad_norm": 1.9829717288285205, "learning_rate": 3.848314372238272e-06, "loss": 0.5971, "step": 3644 }, { "epoch": 2.1587207580692924, "grad_norm": 2.9821607547301348, "learning_rate": 3.8432732491389345e-06, "loss": 0.6207, "step": 3645 }, { "epoch": 2.159312999703879, "grad_norm": 1.4864857735219268, "learning_rate": 3.838234644487045e-06, "loss": 0.6185, "step": 3646 }, { "epoch": 2.1599052413384663, "grad_norm": 2.2631386629045287, "learning_rate": 3.833198560343682e-06, "loss": 0.5928, "step": 3647 }, { "epoch": 2.160497482973053, "grad_norm": 1.2595848974397195, "learning_rate": 3.828164998768879e-06, "loss": 0.6202, "step": 3648 }, { "epoch": 2.16108972460764, "grad_norm": 1.2855787943939143, "learning_rate": 3.8231339618216556e-06, "loss": 0.5912, "step": 3649 }, { "epoch": 2.161681966242227, "grad_norm": 3.733946616282587, "learning_rate": 3.8181054515599806e-06, "loss": 0.642, "step": 3650 }, { "epoch": 2.1622742078768136, "grad_norm": 1.9716595141173552, "learning_rate": 3.8130794700408027e-06, "loss": 0.576, "step": 3651 }, { "epoch": 2.1628664495114007, "grad_norm": 10.436223336411803, "learning_rate": 3.8080560193200288e-06, "loss": 0.59, "step": 3652 }, { "epoch": 2.1634586911459874, "grad_norm": 1.6433990265618101, "learning_rate": 3.803035101452531e-06, "loss": 0.6323, "step": 3653 }, { "epoch": 2.1640509327805746, "grad_norm": 1.4190321545816333, "learning_rate": 3.798016718492148e-06, "loss": 0.6114, "step": 3654 }, { "epoch": 2.1646431744151613, "grad_norm": 1.559115691980612, "learning_rate": 3.7930008724916846e-06, "loss": 0.583, "step": 3655 }, { "epoch": 2.1652354160497485, "grad_norm": 2.1204592019182655, "learning_rate": 3.7879875655029018e-06, "loss": 0.6153, "step": 3656 }, { "epoch": 2.165827657684335, "grad_norm": 1.9790725686824444, "learning_rate": 3.782976799576519e-06, "loss": 0.6118, "step": 3657 }, { "epoch": 2.1664198993189223, "grad_norm": 1.7029886439459894, "learning_rate": 3.7779685767622255e-06, "loss": 0.6315, "step": 3658 }, { "epoch": 2.167012140953509, "grad_norm": 1.3790840194704055, "learning_rate": 3.7729628991086687e-06, "loss": 0.6036, "step": 3659 }, { "epoch": 2.167604382588096, "grad_norm": 1.7802518189464025, "learning_rate": 3.7679597686634495e-06, "loss": 0.5877, "step": 3660 }, { "epoch": 2.168196624222683, "grad_norm": 1.746849203801897, "learning_rate": 3.7629591874731264e-06, "loss": 0.6504, "step": 3661 }, { "epoch": 2.1687888658572696, "grad_norm": 1.6998904736750884, "learning_rate": 3.757961157583221e-06, "loss": 0.6304, "step": 3662 }, { "epoch": 2.1693811074918568, "grad_norm": 1.3810219749838843, "learning_rate": 3.7529656810382133e-06, "loss": 0.6296, "step": 3663 }, { "epoch": 2.1699733491264435, "grad_norm": 1.298383302796027, "learning_rate": 3.7479727598815287e-06, "loss": 0.6187, "step": 3664 }, { "epoch": 2.1705655907610306, "grad_norm": 2.1800686642840987, "learning_rate": 3.7429823961555513e-06, "loss": 0.6048, "step": 3665 }, { "epoch": 2.1711578323956173, "grad_norm": 1.138658124740238, "learning_rate": 3.7379945919016225e-06, "loss": 0.659, "step": 3666 }, { "epoch": 2.1717500740302045, "grad_norm": 1.3434860594717237, "learning_rate": 3.733009349160042e-06, "loss": 0.6048, "step": 3667 }, { "epoch": 2.172342315664791, "grad_norm": 1.764600534002413, "learning_rate": 3.7280266699700406e-06, "loss": 0.5861, "step": 3668 }, { "epoch": 2.1729345572993783, "grad_norm": 1.1095526363681014, "learning_rate": 3.7230465563698214e-06, "loss": 0.6066, "step": 3669 }, { "epoch": 2.173526798933965, "grad_norm": 3.064677575014838, "learning_rate": 3.7180690103965313e-06, "loss": 0.6762, "step": 3670 }, { "epoch": 2.1741190405685518, "grad_norm": 1.0892290245854241, "learning_rate": 3.713094034086273e-06, "loss": 0.6142, "step": 3671 }, { "epoch": 2.174711282203139, "grad_norm": 2.014740928840878, "learning_rate": 3.7081216294740773e-06, "loss": 0.6006, "step": 3672 }, { "epoch": 2.1753035238377256, "grad_norm": 1.6840077237428435, "learning_rate": 3.703151798593945e-06, "loss": 0.6111, "step": 3673 }, { "epoch": 2.175895765472313, "grad_norm": 3.147435725567642, "learning_rate": 3.6981845434788188e-06, "loss": 0.59, "step": 3674 }, { "epoch": 2.1764880071068995, "grad_norm": 2.3668044579752547, "learning_rate": 3.693219866160582e-06, "loss": 0.6486, "step": 3675 }, { "epoch": 2.1770802487414866, "grad_norm": 1.7822979150577114, "learning_rate": 3.688257768670065e-06, "loss": 0.617, "step": 3676 }, { "epoch": 2.1776724903760734, "grad_norm": 1.5174818432951496, "learning_rate": 3.6832982530370465e-06, "loss": 0.62, "step": 3677 }, { "epoch": 2.1782647320106605, "grad_norm": 1.4448335060389763, "learning_rate": 3.678341321290252e-06, "loss": 0.6469, "step": 3678 }, { "epoch": 2.178856973645247, "grad_norm": 1.55193829165598, "learning_rate": 3.6733869754573403e-06, "loss": 0.602, "step": 3679 }, { "epoch": 2.1794492152798344, "grad_norm": 1.1427395519868442, "learning_rate": 3.668435217564915e-06, "loss": 0.6552, "step": 3680 }, { "epoch": 2.180041456914421, "grad_norm": 1.3343442976110556, "learning_rate": 3.663486049638527e-06, "loss": 0.6427, "step": 3681 }, { "epoch": 2.180633698549008, "grad_norm": 1.6318631166682127, "learning_rate": 3.658539473702667e-06, "loss": 0.6476, "step": 3682 }, { "epoch": 2.181225940183595, "grad_norm": 1.4447436327409222, "learning_rate": 3.65359549178076e-06, "loss": 0.595, "step": 3683 }, { "epoch": 2.1818181818181817, "grad_norm": 1.1309723434982337, "learning_rate": 3.6486541058951696e-06, "loss": 0.6008, "step": 3684 }, { "epoch": 2.182410423452769, "grad_norm": 1.7100247943717297, "learning_rate": 3.6437153180672034e-06, "loss": 0.6291, "step": 3685 }, { "epoch": 2.1830026650873555, "grad_norm": 1.6181277098556224, "learning_rate": 3.638779130317106e-06, "loss": 0.6039, "step": 3686 }, { "epoch": 2.1835949067219427, "grad_norm": 1.5754065682938263, "learning_rate": 3.633845544664053e-06, "loss": 0.6282, "step": 3687 }, { "epoch": 2.1841871483565294, "grad_norm": 2.344647272098969, "learning_rate": 3.628914563126156e-06, "loss": 0.6411, "step": 3688 }, { "epoch": 2.1847793899911165, "grad_norm": 6.915326366107702, "learning_rate": 3.6239861877204684e-06, "loss": 0.5722, "step": 3689 }, { "epoch": 2.1853716316257032, "grad_norm": 1.3363840965456972, "learning_rate": 3.6190604204629685e-06, "loss": 0.6376, "step": 3690 }, { "epoch": 2.1859638732602904, "grad_norm": 1.5541366844193774, "learning_rate": 3.6141372633685767e-06, "loss": 0.6474, "step": 3691 }, { "epoch": 2.186556114894877, "grad_norm": 2.254316560958191, "learning_rate": 3.6092167184511352e-06, "loss": 0.6008, "step": 3692 }, { "epoch": 2.187148356529464, "grad_norm": 2.255989122727738, "learning_rate": 3.6042987877234304e-06, "loss": 0.6062, "step": 3693 }, { "epoch": 2.187740598164051, "grad_norm": 1.320086179706794, "learning_rate": 3.5993834731971654e-06, "loss": 0.5777, "step": 3694 }, { "epoch": 2.1883328397986377, "grad_norm": 1.8280860055033141, "learning_rate": 3.594470776882989e-06, "loss": 0.5831, "step": 3695 }, { "epoch": 2.188925081433225, "grad_norm": 1.9043201529453062, "learning_rate": 3.5895607007904597e-06, "loss": 0.5996, "step": 3696 }, { "epoch": 2.1895173230678115, "grad_norm": 2.358121027410781, "learning_rate": 3.584653246928085e-06, "loss": 0.6427, "step": 3697 }, { "epoch": 2.1901095647023987, "grad_norm": 1.3402210675480135, "learning_rate": 3.5797484173032806e-06, "loss": 0.6018, "step": 3698 }, { "epoch": 2.1907018063369854, "grad_norm": 2.2835653872932706, "learning_rate": 3.5748462139224048e-06, "loss": 0.656, "step": 3699 }, { "epoch": 2.1912940479715726, "grad_norm": 1.1925267592647002, "learning_rate": 3.569946638790729e-06, "loss": 0.5937, "step": 3700 }, { "epoch": 2.1918862896061593, "grad_norm": 6.950862477600291, "learning_rate": 3.5650496939124602e-06, "loss": 0.5998, "step": 3701 }, { "epoch": 2.1924785312407464, "grad_norm": 1.5302796583951903, "learning_rate": 3.5601553812907174e-06, "loss": 0.6095, "step": 3702 }, { "epoch": 2.193070772875333, "grad_norm": 1.6634344074466962, "learning_rate": 3.555263702927558e-06, "loss": 0.6575, "step": 3703 }, { "epoch": 2.19366301450992, "grad_norm": 1.3910377831755019, "learning_rate": 3.5503746608239487e-06, "loss": 0.6121, "step": 3704 }, { "epoch": 2.194255256144507, "grad_norm": 1.6713226754844968, "learning_rate": 3.54548825697978e-06, "loss": 0.641, "step": 3705 }, { "epoch": 2.1948474977790937, "grad_norm": 1.5747338103446598, "learning_rate": 3.5406044933938688e-06, "loss": 0.6064, "step": 3706 }, { "epoch": 2.195439739413681, "grad_norm": 1.5768868641879534, "learning_rate": 3.535723372063952e-06, "loss": 0.6197, "step": 3707 }, { "epoch": 2.1960319810482676, "grad_norm": 1.6152013346052505, "learning_rate": 3.5308448949866805e-06, "loss": 0.6304, "step": 3708 }, { "epoch": 2.1966242226828547, "grad_norm": 1.3489870236047408, "learning_rate": 3.5259690641576216e-06, "loss": 0.6066, "step": 3709 }, { "epoch": 2.1972164643174414, "grad_norm": 2.0855784992139825, "learning_rate": 3.5210958815712672e-06, "loss": 0.5858, "step": 3710 }, { "epoch": 2.1978087059520286, "grad_norm": 2.1655140503921855, "learning_rate": 3.5162253492210276e-06, "loss": 0.6431, "step": 3711 }, { "epoch": 2.1984009475866153, "grad_norm": 1.3963981694086718, "learning_rate": 3.5113574690992203e-06, "loss": 0.6041, "step": 3712 }, { "epoch": 2.198993189221202, "grad_norm": 1.9394182753615543, "learning_rate": 3.5064922431970793e-06, "loss": 0.5795, "step": 3713 }, { "epoch": 2.199585430855789, "grad_norm": 1.3857185510161663, "learning_rate": 3.5016296735047584e-06, "loss": 0.6244, "step": 3714 }, { "epoch": 2.200177672490376, "grad_norm": 1.6061122144253255, "learning_rate": 3.496769762011325e-06, "loss": 0.6293, "step": 3715 }, { "epoch": 2.200769914124963, "grad_norm": 5.40581536454228, "learning_rate": 3.4919125107047537e-06, "loss": 0.6259, "step": 3716 }, { "epoch": 2.2013621557595497, "grad_norm": 1.2657815177175666, "learning_rate": 3.487057921571929e-06, "loss": 0.5988, "step": 3717 }, { "epoch": 2.201954397394137, "grad_norm": 1.7259059897471152, "learning_rate": 3.482205996598654e-06, "loss": 0.6134, "step": 3718 }, { "epoch": 2.2025466390287236, "grad_norm": 1.184129395946116, "learning_rate": 3.477356737769645e-06, "loss": 0.6274, "step": 3719 }, { "epoch": 2.2031388806633108, "grad_norm": 1.7443680463119289, "learning_rate": 3.472510147068515e-06, "loss": 0.6375, "step": 3720 }, { "epoch": 2.2037311222978975, "grad_norm": 1.4046681220312973, "learning_rate": 3.4676662264777905e-06, "loss": 0.6456, "step": 3721 }, { "epoch": 2.2043233639324846, "grad_norm": 2.5357065102499994, "learning_rate": 3.4628249779789105e-06, "loss": 0.6201, "step": 3722 }, { "epoch": 2.2049156055670713, "grad_norm": 2.357882593644408, "learning_rate": 3.4579864035522236e-06, "loss": 0.6352, "step": 3723 }, { "epoch": 2.205507847201658, "grad_norm": 1.2137268888170112, "learning_rate": 3.4531505051769665e-06, "loss": 0.6398, "step": 3724 }, { "epoch": 2.206100088836245, "grad_norm": 2.239315397553286, "learning_rate": 3.4483172848312994e-06, "loss": 0.6236, "step": 3725 }, { "epoch": 2.206692330470832, "grad_norm": 1.557932862394711, "learning_rate": 3.4434867444922857e-06, "loss": 0.5774, "step": 3726 }, { "epoch": 2.207284572105419, "grad_norm": 1.5777287070135115, "learning_rate": 3.438658886135884e-06, "loss": 0.5508, "step": 3727 }, { "epoch": 2.2078768137400058, "grad_norm": 1.413920945713966, "learning_rate": 3.433833711736957e-06, "loss": 0.6097, "step": 3728 }, { "epoch": 2.208469055374593, "grad_norm": 1.258492807065135, "learning_rate": 3.429011223269274e-06, "loss": 0.6256, "step": 3729 }, { "epoch": 2.2090612970091796, "grad_norm": 3.75247152278292, "learning_rate": 3.4241914227055096e-06, "loss": 0.5718, "step": 3730 }, { "epoch": 2.209653538643767, "grad_norm": 3.7102331792806478, "learning_rate": 3.4193743120172297e-06, "loss": 0.5975, "step": 3731 }, { "epoch": 2.2102457802783535, "grad_norm": 1.7080087366642982, "learning_rate": 3.414559893174898e-06, "loss": 0.5949, "step": 3732 }, { "epoch": 2.2108380219129407, "grad_norm": 2.0252022919779153, "learning_rate": 3.4097481681478873e-06, "loss": 0.6596, "step": 3733 }, { "epoch": 2.2114302635475274, "grad_norm": 1.7291383197002848, "learning_rate": 3.4049391389044674e-06, "loss": 0.581, "step": 3734 }, { "epoch": 2.212022505182114, "grad_norm": 2.2935670684623286, "learning_rate": 3.4001328074117977e-06, "loss": 0.6159, "step": 3735 }, { "epoch": 2.2126147468167012, "grad_norm": 1.5768762657428204, "learning_rate": 3.3953291756359354e-06, "loss": 0.6298, "step": 3736 }, { "epoch": 2.213206988451288, "grad_norm": 2.260996804956774, "learning_rate": 3.3905282455418375e-06, "loss": 0.6006, "step": 3737 }, { "epoch": 2.213799230085875, "grad_norm": 1.4582667950449473, "learning_rate": 3.3857300190933606e-06, "loss": 0.6607, "step": 3738 }, { "epoch": 2.214391471720462, "grad_norm": 1.7196961502523231, "learning_rate": 3.3809344982532435e-06, "loss": 0.6154, "step": 3739 }, { "epoch": 2.214983713355049, "grad_norm": 1.4224301706484694, "learning_rate": 3.376141684983121e-06, "loss": 0.6257, "step": 3740 }, { "epoch": 2.2155759549896357, "grad_norm": 1.5242446120056001, "learning_rate": 3.3713515812435305e-06, "loss": 0.6395, "step": 3741 }, { "epoch": 2.216168196624223, "grad_norm": 3.082276038619601, "learning_rate": 3.366564188993887e-06, "loss": 0.607, "step": 3742 }, { "epoch": 2.2167604382588095, "grad_norm": 1.1091952397891178, "learning_rate": 3.36177951019251e-06, "loss": 0.5673, "step": 3743 }, { "epoch": 2.2173526798933967, "grad_norm": 1.0623328174773825, "learning_rate": 3.3569975467965955e-06, "loss": 0.6197, "step": 3744 }, { "epoch": 2.2179449215279834, "grad_norm": 1.9139569358167408, "learning_rate": 3.352218300762241e-06, "loss": 0.6325, "step": 3745 }, { "epoch": 2.21853716316257, "grad_norm": 1.754753069962319, "learning_rate": 3.347441774044421e-06, "loss": 0.6194, "step": 3746 }, { "epoch": 2.2191294047971573, "grad_norm": 1.311850230899881, "learning_rate": 3.3426679685970096e-06, "loss": 0.6365, "step": 3747 }, { "epoch": 2.219721646431744, "grad_norm": 2.3171585866726243, "learning_rate": 3.337896886372757e-06, "loss": 0.6015, "step": 3748 }, { "epoch": 2.220313888066331, "grad_norm": 3.610142905808541, "learning_rate": 3.3331285293233086e-06, "loss": 0.5806, "step": 3749 }, { "epoch": 2.220906129700918, "grad_norm": 1.4393742110166086, "learning_rate": 3.3283628993991846e-06, "loss": 0.5956, "step": 3750 }, { "epoch": 2.221498371335505, "grad_norm": 1.5219021843139517, "learning_rate": 3.3235999985498036e-06, "loss": 0.6268, "step": 3751 }, { "epoch": 2.2220906129700917, "grad_norm": 8.505541739782977, "learning_rate": 3.3188398287234504e-06, "loss": 0.602, "step": 3752 }, { "epoch": 2.222682854604679, "grad_norm": 1.4777906472192541, "learning_rate": 3.3140823918673117e-06, "loss": 0.5723, "step": 3753 }, { "epoch": 2.2232750962392656, "grad_norm": 3.427821946647888, "learning_rate": 3.3093276899274373e-06, "loss": 0.6404, "step": 3754 }, { "epoch": 2.2238673378738527, "grad_norm": 1.5440256787529059, "learning_rate": 3.3045757248487763e-06, "loss": 0.5923, "step": 3755 }, { "epoch": 2.2244595795084394, "grad_norm": 1.0121502394479982, "learning_rate": 3.2998264985751425e-06, "loss": 0.6091, "step": 3756 }, { "epoch": 2.225051821143026, "grad_norm": 1.435124556629406, "learning_rate": 3.2950800130492434e-06, "loss": 0.5865, "step": 3757 }, { "epoch": 2.2256440627776133, "grad_norm": 1.400676794978446, "learning_rate": 3.2903362702126516e-06, "loss": 0.5851, "step": 3758 }, { "epoch": 2.2262363044122, "grad_norm": 2.776885503195463, "learning_rate": 3.2855952720058303e-06, "loss": 0.6104, "step": 3759 }, { "epoch": 2.226828546046787, "grad_norm": 1.2754379174271564, "learning_rate": 3.2808570203681135e-06, "loss": 0.6243, "step": 3760 }, { "epoch": 2.227420787681374, "grad_norm": 2.047090232279367, "learning_rate": 3.2761215172377057e-06, "loss": 0.6244, "step": 3761 }, { "epoch": 2.228013029315961, "grad_norm": 1.9009466375062916, "learning_rate": 3.271388764551702e-06, "loss": 0.6162, "step": 3762 }, { "epoch": 2.2286052709505477, "grad_norm": 1.7400174906099655, "learning_rate": 3.266658764246062e-06, "loss": 0.5828, "step": 3763 }, { "epoch": 2.229197512585135, "grad_norm": 1.4745943692336556, "learning_rate": 3.2619315182556234e-06, "loss": 0.6394, "step": 3764 }, { "epoch": 2.2297897542197216, "grad_norm": 2.7463760563137667, "learning_rate": 3.25720702851409e-06, "loss": 0.5372, "step": 3765 }, { "epoch": 2.2303819958543087, "grad_norm": 1.291326363777691, "learning_rate": 3.2524852969540477e-06, "loss": 0.6244, "step": 3766 }, { "epoch": 2.2309742374888955, "grad_norm": 3.0151646713135456, "learning_rate": 3.2477663255069536e-06, "loss": 0.5868, "step": 3767 }, { "epoch": 2.231566479123482, "grad_norm": 1.3901935328359527, "learning_rate": 3.243050116103128e-06, "loss": 0.6149, "step": 3768 }, { "epoch": 2.2321587207580693, "grad_norm": 2.8729426761941332, "learning_rate": 3.2383366706717647e-06, "loss": 0.6374, "step": 3769 }, { "epoch": 2.232750962392656, "grad_norm": 1.3514288298661428, "learning_rate": 3.2336259911409283e-06, "loss": 0.6504, "step": 3770 }, { "epoch": 2.233343204027243, "grad_norm": 1.2214092762638862, "learning_rate": 3.228918079437556e-06, "loss": 0.6331, "step": 3771 }, { "epoch": 2.23393544566183, "grad_norm": 1.9604923888716306, "learning_rate": 3.2242129374874478e-06, "loss": 0.5907, "step": 3772 }, { "epoch": 2.234527687296417, "grad_norm": 1.726601509131056, "learning_rate": 3.219510567215264e-06, "loss": 0.6266, "step": 3773 }, { "epoch": 2.2351199289310038, "grad_norm": 1.518643995060012, "learning_rate": 3.2148109705445442e-06, "loss": 0.6445, "step": 3774 }, { "epoch": 2.235712170565591, "grad_norm": 1.386321333561844, "learning_rate": 3.2101141493976938e-06, "loss": 0.6196, "step": 3775 }, { "epoch": 2.2363044122001776, "grad_norm": 1.5786767296496107, "learning_rate": 3.205420105695963e-06, "loss": 0.6102, "step": 3776 }, { "epoch": 2.2368966538347648, "grad_norm": 1.2013453929038438, "learning_rate": 3.200728841359487e-06, "loss": 0.6102, "step": 3777 }, { "epoch": 2.2374888954693515, "grad_norm": 1.4238732131144223, "learning_rate": 3.1960403583072596e-06, "loss": 0.5799, "step": 3778 }, { "epoch": 2.238081137103938, "grad_norm": 1.2261171846122096, "learning_rate": 3.191354658457131e-06, "loss": 0.6501, "step": 3779 }, { "epoch": 2.2386733787385253, "grad_norm": 1.5582931846027446, "learning_rate": 3.186671743725812e-06, "loss": 0.6227, "step": 3780 }, { "epoch": 2.239265620373112, "grad_norm": 1.4151694573036904, "learning_rate": 3.181991616028882e-06, "loss": 0.6248, "step": 3781 }, { "epoch": 2.239857862007699, "grad_norm": 1.2588101337192434, "learning_rate": 3.1773142772807796e-06, "loss": 0.5951, "step": 3782 }, { "epoch": 2.240450103642286, "grad_norm": 1.4993382133597462, "learning_rate": 3.172639729394795e-06, "loss": 0.6159, "step": 3783 }, { "epoch": 2.241042345276873, "grad_norm": 1.8917475122461924, "learning_rate": 3.1679679742830806e-06, "loss": 0.5925, "step": 3784 }, { "epoch": 2.24163458691146, "grad_norm": 2.418861532155358, "learning_rate": 3.1632990138566467e-06, "loss": 0.614, "step": 3785 }, { "epoch": 2.242226828546047, "grad_norm": 1.705180026300204, "learning_rate": 3.158632850025367e-06, "loss": 0.5872, "step": 3786 }, { "epoch": 2.2428190701806336, "grad_norm": 1.377966079629573, "learning_rate": 3.1539694846979594e-06, "loss": 0.6421, "step": 3787 }, { "epoch": 2.243411311815221, "grad_norm": 1.58665211047525, "learning_rate": 3.1493089197820015e-06, "loss": 0.6665, "step": 3788 }, { "epoch": 2.2440035534498075, "grad_norm": 1.338148955510755, "learning_rate": 3.1446511571839267e-06, "loss": 0.6043, "step": 3789 }, { "epoch": 2.244595795084394, "grad_norm": 1.1705176333063276, "learning_rate": 3.139996198809028e-06, "loss": 0.6098, "step": 3790 }, { "epoch": 2.2451880367189814, "grad_norm": 1.8169627762543634, "learning_rate": 3.1353440465614403e-06, "loss": 0.6236, "step": 3791 }, { "epoch": 2.245780278353568, "grad_norm": 1.291671583977258, "learning_rate": 3.1306947023441524e-06, "loss": 0.616, "step": 3792 }, { "epoch": 2.2463725199881552, "grad_norm": 3.0359361674296963, "learning_rate": 3.1260481680590116e-06, "loss": 0.6486, "step": 3793 }, { "epoch": 2.246964761622742, "grad_norm": 1.4138943360346141, "learning_rate": 3.121404445606714e-06, "loss": 0.654, "step": 3794 }, { "epoch": 2.247557003257329, "grad_norm": 1.5503123140519945, "learning_rate": 3.1167635368867997e-06, "loss": 0.6364, "step": 3795 }, { "epoch": 2.248149244891916, "grad_norm": 1.4287633207856516, "learning_rate": 3.112125443797659e-06, "loss": 0.5875, "step": 3796 }, { "epoch": 2.248741486526503, "grad_norm": 2.002912860104456, "learning_rate": 3.107490168236539e-06, "loss": 0.6282, "step": 3797 }, { "epoch": 2.2493337281610897, "grad_norm": 1.4230050402558123, "learning_rate": 3.1028577120995216e-06, "loss": 0.6012, "step": 3798 }, { "epoch": 2.249925969795677, "grad_norm": 1.877917813203428, "learning_rate": 3.0982280772815478e-06, "loss": 0.6339, "step": 3799 }, { "epoch": 2.2505182114302635, "grad_norm": 1.6288711989281626, "learning_rate": 3.0936012656763937e-06, "loss": 0.5788, "step": 3800 }, { "epoch": 2.2511104530648502, "grad_norm": 1.6985228890250441, "learning_rate": 3.0889772791766892e-06, "loss": 0.6384, "step": 3801 }, { "epoch": 2.2517026946994374, "grad_norm": 1.2149735698196262, "learning_rate": 3.0843561196739013e-06, "loss": 0.6121, "step": 3802 }, { "epoch": 2.252294936334024, "grad_norm": 1.8781077925643568, "learning_rate": 3.079737789058348e-06, "loss": 0.6069, "step": 3803 }, { "epoch": 2.2528871779686113, "grad_norm": 2.685977214125362, "learning_rate": 3.075122289219181e-06, "loss": 0.5838, "step": 3804 }, { "epoch": 2.253479419603198, "grad_norm": 1.951312531114018, "learning_rate": 3.0705096220444066e-06, "loss": 0.5911, "step": 3805 }, { "epoch": 2.254071661237785, "grad_norm": 1.9964640855025602, "learning_rate": 3.0658997894208573e-06, "loss": 0.5825, "step": 3806 }, { "epoch": 2.254663902872372, "grad_norm": 1.4301827317146103, "learning_rate": 3.0612927932342205e-06, "loss": 0.5751, "step": 3807 }, { "epoch": 2.255256144506959, "grad_norm": 1.3725748768219552, "learning_rate": 3.0566886353690106e-06, "loss": 0.6418, "step": 3808 }, { "epoch": 2.2558483861415457, "grad_norm": 1.7520734317776463, "learning_rate": 3.052087317708593e-06, "loss": 0.6551, "step": 3809 }, { "epoch": 2.256440627776133, "grad_norm": 1.6914593574417534, "learning_rate": 3.047488842135159e-06, "loss": 0.6149, "step": 3810 }, { "epoch": 2.2570328694107196, "grad_norm": 1.15940591732544, "learning_rate": 3.0428932105297516e-06, "loss": 0.5848, "step": 3811 }, { "epoch": 2.2576251110453063, "grad_norm": 1.2240564347576872, "learning_rate": 3.038300424772237e-06, "loss": 0.6324, "step": 3812 }, { "epoch": 2.2582173526798934, "grad_norm": 1.4337226917688144, "learning_rate": 3.0337104867413215e-06, "loss": 0.678, "step": 3813 }, { "epoch": 2.25880959431448, "grad_norm": 1.691348892207366, "learning_rate": 3.0291233983145494e-06, "loss": 0.6343, "step": 3814 }, { "epoch": 2.2594018359490673, "grad_norm": 1.5404862204108754, "learning_rate": 3.0245391613683027e-06, "loss": 0.5813, "step": 3815 }, { "epoch": 2.259994077583654, "grad_norm": 3.706170880625637, "learning_rate": 3.019957777777788e-06, "loss": 0.5958, "step": 3816 }, { "epoch": 2.260586319218241, "grad_norm": 1.4585279326793774, "learning_rate": 3.015379249417045e-06, "loss": 0.6097, "step": 3817 }, { "epoch": 2.261178560852828, "grad_norm": 1.9982994091231006, "learning_rate": 3.010803578158954e-06, "loss": 0.6247, "step": 3818 }, { "epoch": 2.261770802487415, "grad_norm": 17.925157107499732, "learning_rate": 3.006230765875224e-06, "loss": 0.6337, "step": 3819 }, { "epoch": 2.2623630441220017, "grad_norm": 2.42901036591496, "learning_rate": 3.00166081443639e-06, "loss": 0.6131, "step": 3820 }, { "epoch": 2.262955285756589, "grad_norm": 1.4874686031510487, "learning_rate": 2.997093725711815e-06, "loss": 0.6154, "step": 3821 }, { "epoch": 2.2635475273911756, "grad_norm": 1.5630584463741388, "learning_rate": 2.9925295015696978e-06, "loss": 0.6295, "step": 3822 }, { "epoch": 2.2641397690257623, "grad_norm": 2.901494892160737, "learning_rate": 2.987968143877068e-06, "loss": 0.6562, "step": 3823 }, { "epoch": 2.2647320106603495, "grad_norm": 1.97840113224956, "learning_rate": 2.9834096544997725e-06, "loss": 0.633, "step": 3824 }, { "epoch": 2.265324252294936, "grad_norm": 2.1415089601168034, "learning_rate": 2.9788540353024863e-06, "loss": 0.5627, "step": 3825 }, { "epoch": 2.2659164939295233, "grad_norm": 1.8670402712788894, "learning_rate": 2.9743012881487187e-06, "loss": 0.579, "step": 3826 }, { "epoch": 2.26650873556411, "grad_norm": 2.79632330229638, "learning_rate": 2.9697514149008044e-06, "loss": 0.6382, "step": 3827 }, { "epoch": 2.267100977198697, "grad_norm": 2.1965219327895813, "learning_rate": 2.965204417419886e-06, "loss": 0.595, "step": 3828 }, { "epoch": 2.267693218833284, "grad_norm": 1.456269981743204, "learning_rate": 2.960660297565945e-06, "loss": 0.6127, "step": 3829 }, { "epoch": 2.268285460467871, "grad_norm": 1.5711432419219262, "learning_rate": 2.956119057197785e-06, "loss": 0.6269, "step": 3830 }, { "epoch": 2.2688777021024578, "grad_norm": 2.2290008657723965, "learning_rate": 2.9515806981730322e-06, "loss": 0.6162, "step": 3831 }, { "epoch": 2.269469943737045, "grad_norm": 1.8539455895230874, "learning_rate": 2.9470452223481206e-06, "loss": 0.6062, "step": 3832 }, { "epoch": 2.2700621853716316, "grad_norm": 1.7914148986066236, "learning_rate": 2.942512631578318e-06, "loss": 0.6494, "step": 3833 }, { "epoch": 2.2706544270062183, "grad_norm": 1.579902714082181, "learning_rate": 2.9379829277177152e-06, "loss": 0.6123, "step": 3834 }, { "epoch": 2.2712466686408055, "grad_norm": 1.8966069617893526, "learning_rate": 2.933456112619212e-06, "loss": 0.6139, "step": 3835 }, { "epoch": 2.271838910275392, "grad_norm": 3.372987103884868, "learning_rate": 2.9289321881345257e-06, "loss": 0.6225, "step": 3836 }, { "epoch": 2.2724311519099794, "grad_norm": 2.358810973987772, "learning_rate": 2.9244111561141997e-06, "loss": 0.6399, "step": 3837 }, { "epoch": 2.273023393544566, "grad_norm": 1.660137040911655, "learning_rate": 2.9198930184075944e-06, "loss": 0.5812, "step": 3838 }, { "epoch": 2.273615635179153, "grad_norm": 2.200855422546957, "learning_rate": 2.915377776862878e-06, "loss": 0.6089, "step": 3839 }, { "epoch": 2.27420787681374, "grad_norm": 1.6572553582906615, "learning_rate": 2.9108654333270346e-06, "loss": 0.5949, "step": 3840 }, { "epoch": 2.274800118448327, "grad_norm": 1.4498432049468442, "learning_rate": 2.9063559896458704e-06, "loss": 0.6329, "step": 3841 }, { "epoch": 2.275392360082914, "grad_norm": 1.1737273223504179, "learning_rate": 2.901849447664008e-06, "loss": 0.6561, "step": 3842 }, { "epoch": 2.275984601717501, "grad_norm": 2.66610971272177, "learning_rate": 2.897345809224864e-06, "loss": 0.5836, "step": 3843 }, { "epoch": 2.2765768433520877, "grad_norm": 1.839143446701005, "learning_rate": 2.892845076170685e-06, "loss": 0.6346, "step": 3844 }, { "epoch": 2.2771690849866744, "grad_norm": 1.60827795865389, "learning_rate": 2.8883472503425236e-06, "loss": 0.6318, "step": 3845 }, { "epoch": 2.2777613266212615, "grad_norm": 1.5301557763688713, "learning_rate": 2.8838523335802525e-06, "loss": 0.6759, "step": 3846 }, { "epoch": 2.2783535682558482, "grad_norm": 1.5562091375300113, "learning_rate": 2.8793603277225302e-06, "loss": 0.6203, "step": 3847 }, { "epoch": 2.2789458098904354, "grad_norm": 1.588136398742716, "learning_rate": 2.8748712346068464e-06, "loss": 0.5912, "step": 3848 }, { "epoch": 2.279538051525022, "grad_norm": 1.841240319682772, "learning_rate": 2.8703850560694966e-06, "loss": 0.6484, "step": 3849 }, { "epoch": 2.2801302931596092, "grad_norm": 8.661704543502498, "learning_rate": 2.865901793945576e-06, "loss": 0.6019, "step": 3850 }, { "epoch": 2.280722534794196, "grad_norm": 1.9390476098819909, "learning_rate": 2.8614214500689886e-06, "loss": 0.6344, "step": 3851 }, { "epoch": 2.281314776428783, "grad_norm": 1.4608667959226727, "learning_rate": 2.8569440262724502e-06, "loss": 0.608, "step": 3852 }, { "epoch": 2.28190701806337, "grad_norm": 2.253638236763826, "learning_rate": 2.8524695243874814e-06, "loss": 0.6115, "step": 3853 }, { "epoch": 2.282499259697957, "grad_norm": 1.5747389470810782, "learning_rate": 2.8479979462444017e-06, "loss": 0.6291, "step": 3854 }, { "epoch": 2.2830915013325437, "grad_norm": 1.216195294388743, "learning_rate": 2.8435292936723356e-06, "loss": 0.6106, "step": 3855 }, { "epoch": 2.2836837429671304, "grad_norm": 2.7276242343011745, "learning_rate": 2.8390635684992163e-06, "loss": 0.5509, "step": 3856 }, { "epoch": 2.2842759846017175, "grad_norm": 2.323606613377955, "learning_rate": 2.83460077255178e-06, "loss": 0.5926, "step": 3857 }, { "epoch": 2.2848682262363043, "grad_norm": 2.38216532290336, "learning_rate": 2.8301409076555574e-06, "loss": 0.6453, "step": 3858 }, { "epoch": 2.2854604678708914, "grad_norm": 1.3563075087072833, "learning_rate": 2.8256839756348807e-06, "loss": 0.6039, "step": 3859 }, { "epoch": 2.286052709505478, "grad_norm": 1.2668051387431, "learning_rate": 2.821229978312889e-06, "loss": 0.5929, "step": 3860 }, { "epoch": 2.2866449511400653, "grad_norm": 1.2956377312836582, "learning_rate": 2.8167789175115223e-06, "loss": 0.6503, "step": 3861 }, { "epoch": 2.287237192774652, "grad_norm": 1.9302456323206396, "learning_rate": 2.8123307950515087e-06, "loss": 0.6626, "step": 3862 }, { "epoch": 2.287829434409239, "grad_norm": 1.6044535773095645, "learning_rate": 2.80788561275238e-06, "loss": 0.5979, "step": 3863 }, { "epoch": 2.288421676043826, "grad_norm": 2.2939252755635975, "learning_rate": 2.8034433724324716e-06, "loss": 0.6646, "step": 3864 }, { "epoch": 2.289013917678413, "grad_norm": 2.471817823244044, "learning_rate": 2.7990040759089022e-06, "loss": 0.5829, "step": 3865 }, { "epoch": 2.2896061593129997, "grad_norm": 3.3806573958288566, "learning_rate": 2.7945677249976e-06, "loss": 0.6553, "step": 3866 }, { "epoch": 2.2901984009475864, "grad_norm": 2.737369171178317, "learning_rate": 2.7901343215132758e-06, "loss": 0.6349, "step": 3867 }, { "epoch": 2.2907906425821736, "grad_norm": 2.298571285772045, "learning_rate": 2.7857038672694492e-06, "loss": 0.5691, "step": 3868 }, { "epoch": 2.2913828842167603, "grad_norm": 1.5520258709926762, "learning_rate": 2.7812763640784155e-06, "loss": 0.6444, "step": 3869 }, { "epoch": 2.2919751258513474, "grad_norm": 1.7364947241082105, "learning_rate": 2.776851813751281e-06, "loss": 0.6432, "step": 3870 }, { "epoch": 2.292567367485934, "grad_norm": 1.5769815666091875, "learning_rate": 2.77243021809793e-06, "loss": 0.6317, "step": 3871 }, { "epoch": 2.2931596091205213, "grad_norm": 2.137316840538586, "learning_rate": 2.7680115789270478e-06, "loss": 0.6133, "step": 3872 }, { "epoch": 2.293751850755108, "grad_norm": 2.816072721070139, "learning_rate": 2.763595898046101e-06, "loss": 0.562, "step": 3873 }, { "epoch": 2.294344092389695, "grad_norm": 1.3276083688656188, "learning_rate": 2.7591831772613576e-06, "loss": 0.5834, "step": 3874 }, { "epoch": 2.294936334024282, "grad_norm": 1.2689499619456701, "learning_rate": 2.754773418377863e-06, "loss": 0.6007, "step": 3875 }, { "epoch": 2.295528575658869, "grad_norm": 3.840896322229496, "learning_rate": 2.750366623199462e-06, "loss": 0.6271, "step": 3876 }, { "epoch": 2.2961208172934557, "grad_norm": 1.1950201080078542, "learning_rate": 2.745962793528775e-06, "loss": 0.585, "step": 3877 }, { "epoch": 2.2967130589280425, "grad_norm": 2.0462234251989484, "learning_rate": 2.7415619311672236e-06, "loss": 0.639, "step": 3878 }, { "epoch": 2.2973053005626296, "grad_norm": 1.853413330196118, "learning_rate": 2.7371640379150032e-06, "loss": 0.6103, "step": 3879 }, { "epoch": 2.2978975421972163, "grad_norm": 2.1374274208984647, "learning_rate": 2.7327691155710978e-06, "loss": 0.5992, "step": 3880 }, { "epoch": 2.2984897838318035, "grad_norm": 1.9972945709906602, "learning_rate": 2.7283771659332805e-06, "loss": 0.62, "step": 3881 }, { "epoch": 2.29908202546639, "grad_norm": 1.0982870260711513, "learning_rate": 2.723988190798108e-06, "loss": 0.6119, "step": 3882 }, { "epoch": 2.2996742671009773, "grad_norm": 1.2876134638144012, "learning_rate": 2.7196021919609163e-06, "loss": 0.6366, "step": 3883 }, { "epoch": 2.300266508735564, "grad_norm": 1.8016509667404628, "learning_rate": 2.7152191712158207e-06, "loss": 0.6041, "step": 3884 }, { "epoch": 2.300858750370151, "grad_norm": 1.2850926199615083, "learning_rate": 2.710839130355727e-06, "loss": 0.6157, "step": 3885 }, { "epoch": 2.301450992004738, "grad_norm": 1.4731799841992599, "learning_rate": 2.706462071172322e-06, "loss": 0.5742, "step": 3886 }, { "epoch": 2.302043233639325, "grad_norm": 1.7403825798627295, "learning_rate": 2.7020879954560642e-06, "loss": 0.6451, "step": 3887 }, { "epoch": 2.3026354752739118, "grad_norm": 3.5575301409939653, "learning_rate": 2.697716904996196e-06, "loss": 0.6125, "step": 3888 }, { "epoch": 2.3032277169084985, "grad_norm": 1.283131812355814, "learning_rate": 2.6933488015807406e-06, "loss": 0.6057, "step": 3889 }, { "epoch": 2.3038199585430856, "grad_norm": 3.1172645215689783, "learning_rate": 2.6889836869965016e-06, "loss": 0.6172, "step": 3890 }, { "epoch": 2.3044122001776723, "grad_norm": 2.218448251870392, "learning_rate": 2.6846215630290516e-06, "loss": 0.6496, "step": 3891 }, { "epoch": 2.3050044418122595, "grad_norm": 2.3254463534891796, "learning_rate": 2.6802624314627436e-06, "loss": 0.6223, "step": 3892 }, { "epoch": 2.305596683446846, "grad_norm": 1.2275387722818925, "learning_rate": 2.67590629408071e-06, "loss": 0.5998, "step": 3893 }, { "epoch": 2.3061889250814334, "grad_norm": 3.0555438618968584, "learning_rate": 2.6715531526648585e-06, "loss": 0.6273, "step": 3894 }, { "epoch": 2.30678116671602, "grad_norm": 1.5752564674463383, "learning_rate": 2.6672030089958668e-06, "loss": 0.6596, "step": 3895 }, { "epoch": 2.3073734083506072, "grad_norm": 1.7511275969987508, "learning_rate": 2.6628558648531845e-06, "loss": 0.5788, "step": 3896 }, { "epoch": 2.307965649985194, "grad_norm": 0.9655204456900459, "learning_rate": 2.6585117220150403e-06, "loss": 0.639, "step": 3897 }, { "epoch": 2.308557891619781, "grad_norm": 1.1990519232172137, "learning_rate": 2.654170582258441e-06, "loss": 0.5954, "step": 3898 }, { "epoch": 2.309150133254368, "grad_norm": 3.208547550346025, "learning_rate": 2.649832447359142e-06, "loss": 0.6034, "step": 3899 }, { "epoch": 2.3097423748889545, "grad_norm": 2.2300260612204577, "learning_rate": 2.645497319091692e-06, "loss": 0.5987, "step": 3900 }, { "epoch": 2.3103346165235417, "grad_norm": 2.0946840308163543, "learning_rate": 2.6411651992294065e-06, "loss": 0.6319, "step": 3901 }, { "epoch": 2.3109268581581284, "grad_norm": 2.079061501961239, "learning_rate": 2.63683608954436e-06, "loss": 0.636, "step": 3902 }, { "epoch": 2.3115190997927155, "grad_norm": 1.2445711582719796, "learning_rate": 2.6325099918074017e-06, "loss": 0.6395, "step": 3903 }, { "epoch": 2.3121113414273022, "grad_norm": 1.864032684312602, "learning_rate": 2.6281869077881507e-06, "loss": 0.5782, "step": 3904 }, { "epoch": 2.3127035830618894, "grad_norm": 1.3316080049678458, "learning_rate": 2.6238668392549947e-06, "loss": 0.5901, "step": 3905 }, { "epoch": 2.313295824696476, "grad_norm": 1.6276785587062519, "learning_rate": 2.619549787975081e-06, "loss": 0.6372, "step": 3906 }, { "epoch": 2.3138880663310633, "grad_norm": 5.7918361147135045, "learning_rate": 2.615235755714324e-06, "loss": 0.6331, "step": 3907 }, { "epoch": 2.31448030796565, "grad_norm": 2.2663927095141925, "learning_rate": 2.6109247442374088e-06, "loss": 0.6412, "step": 3908 }, { "epoch": 2.315072549600237, "grad_norm": 7.124389289252508, "learning_rate": 2.6066167553077826e-06, "loss": 0.5689, "step": 3909 }, { "epoch": 2.315664791234824, "grad_norm": 1.363067404396117, "learning_rate": 2.602311790687655e-06, "loss": 0.6523, "step": 3910 }, { "epoch": 2.3162570328694105, "grad_norm": 1.5813779524915303, "learning_rate": 2.5980098521379936e-06, "loss": 0.6186, "step": 3911 }, { "epoch": 2.3168492745039977, "grad_norm": 1.481882250550122, "learning_rate": 2.593710941418537e-06, "loss": 0.5903, "step": 3912 }, { "epoch": 2.3174415161385844, "grad_norm": 1.9025600054836969, "learning_rate": 2.5894150602877834e-06, "loss": 0.6135, "step": 3913 }, { "epoch": 2.3180337577731716, "grad_norm": 1.2895740270749483, "learning_rate": 2.585122210502987e-06, "loss": 0.5903, "step": 3914 }, { "epoch": 2.3186259994077583, "grad_norm": 1.647388330853826, "learning_rate": 2.5808323938201642e-06, "loss": 0.5979, "step": 3915 }, { "epoch": 2.3192182410423454, "grad_norm": 1.28519854345509, "learning_rate": 2.5765456119940933e-06, "loss": 0.6295, "step": 3916 }, { "epoch": 2.319810482676932, "grad_norm": 1.2525366592486644, "learning_rate": 2.5722618667783063e-06, "loss": 0.5877, "step": 3917 }, { "epoch": 2.3204027243115193, "grad_norm": 1.3317926678190848, "learning_rate": 2.5679811599251003e-06, "loss": 0.5926, "step": 3918 }, { "epoch": 2.320994965946106, "grad_norm": 2.428793102384277, "learning_rate": 2.5637034931855197e-06, "loss": 0.5911, "step": 3919 }, { "epoch": 2.321587207580693, "grad_norm": 1.4721974585666002, "learning_rate": 2.559428868309377e-06, "loss": 0.579, "step": 3920 }, { "epoch": 2.32217944921528, "grad_norm": 3.8645920617049465, "learning_rate": 2.5551572870452268e-06, "loss": 0.6181, "step": 3921 }, { "epoch": 2.3227716908498666, "grad_norm": 1.5226219139675943, "learning_rate": 2.5508887511403936e-06, "loss": 0.5791, "step": 3922 }, { "epoch": 2.3233639324844537, "grad_norm": 2.2075587892085116, "learning_rate": 2.5466232623409416e-06, "loss": 0.634, "step": 3923 }, { "epoch": 2.3239561741190404, "grad_norm": 1.3599172487421667, "learning_rate": 2.542360822391702e-06, "loss": 0.6161, "step": 3924 }, { "epoch": 2.3245484157536276, "grad_norm": 2.3033687836929557, "learning_rate": 2.538101433036246e-06, "loss": 0.6239, "step": 3925 }, { "epoch": 2.3251406573882143, "grad_norm": 1.9111808167290096, "learning_rate": 2.5338450960169105e-06, "loss": 0.5993, "step": 3926 }, { "epoch": 2.3257328990228014, "grad_norm": 3.326963232812606, "learning_rate": 2.52959181307477e-06, "loss": 0.6112, "step": 3927 }, { "epoch": 2.326325140657388, "grad_norm": 1.796700647763413, "learning_rate": 2.525341585949662e-06, "loss": 0.6028, "step": 3928 }, { "epoch": 2.3269173822919753, "grad_norm": 2.312629837062101, "learning_rate": 2.521094416380162e-06, "loss": 0.6497, "step": 3929 }, { "epoch": 2.327509623926562, "grad_norm": 1.2457487465053927, "learning_rate": 2.5168503061036086e-06, "loss": 0.6098, "step": 3930 }, { "epoch": 2.328101865561149, "grad_norm": 3.791923080168204, "learning_rate": 2.5126092568560754e-06, "loss": 0.5886, "step": 3931 }, { "epoch": 2.328694107195736, "grad_norm": 4.853398587684118, "learning_rate": 2.5083712703723952e-06, "loss": 0.6286, "step": 3932 }, { "epoch": 2.3292863488303226, "grad_norm": 2.2593203003697915, "learning_rate": 2.5041363483861357e-06, "loss": 0.6453, "step": 3933 }, { "epoch": 2.3298785904649097, "grad_norm": 1.6615315018284669, "learning_rate": 2.499904492629627e-06, "loss": 0.5813, "step": 3934 }, { "epoch": 2.3304708320994965, "grad_norm": 1.6816787030820344, "learning_rate": 2.4956757048339307e-06, "loss": 0.5839, "step": 3935 }, { "epoch": 2.3310630737340836, "grad_norm": 2.769364650023261, "learning_rate": 2.4914499867288577e-06, "loss": 0.6956, "step": 3936 }, { "epoch": 2.3316553153686703, "grad_norm": 3.642860545017731, "learning_rate": 2.487227340042966e-06, "loss": 0.6321, "step": 3937 }, { "epoch": 2.3322475570032575, "grad_norm": 1.3381965332512193, "learning_rate": 2.483007766503558e-06, "loss": 0.5996, "step": 3938 }, { "epoch": 2.332839798637844, "grad_norm": 1.7008693234557766, "learning_rate": 2.4787912678366755e-06, "loss": 0.593, "step": 3939 }, { "epoch": 2.3334320402724313, "grad_norm": 1.3507684647991987, "learning_rate": 2.474577845767099e-06, "loss": 0.6144, "step": 3940 }, { "epoch": 2.334024281907018, "grad_norm": 2.0794417313783233, "learning_rate": 2.4703675020183583e-06, "loss": 0.5919, "step": 3941 }, { "epoch": 2.334616523541605, "grad_norm": 2.4036165381923915, "learning_rate": 2.4661602383127235e-06, "loss": 0.6163, "step": 3942 }, { "epoch": 2.335208765176192, "grad_norm": 2.006173079525012, "learning_rate": 2.461956056371201e-06, "loss": 0.6006, "step": 3943 }, { "epoch": 2.3358010068107786, "grad_norm": 2.5021972165514677, "learning_rate": 2.4577549579135318e-06, "loss": 0.6524, "step": 3944 }, { "epoch": 2.3363932484453658, "grad_norm": 2.2918385752916772, "learning_rate": 2.453556944658206e-06, "loss": 0.6192, "step": 3945 }, { "epoch": 2.3369854900799525, "grad_norm": 2.0313012400244546, "learning_rate": 2.449362018322451e-06, "loss": 0.5882, "step": 3946 }, { "epoch": 2.3375777317145396, "grad_norm": 2.1865753540293245, "learning_rate": 2.445170180622223e-06, "loss": 0.5776, "step": 3947 }, { "epoch": 2.3381699733491264, "grad_norm": 1.6788725756694558, "learning_rate": 2.440981433272216e-06, "loss": 0.6286, "step": 3948 }, { "epoch": 2.3387622149837135, "grad_norm": 1.3993996393149903, "learning_rate": 2.4367957779858675e-06, "loss": 0.6201, "step": 3949 }, { "epoch": 2.3393544566183, "grad_norm": 2.5163778162713877, "learning_rate": 2.43261321647535e-06, "loss": 0.6333, "step": 3950 }, { "epoch": 2.3399466982528874, "grad_norm": 2.1183056118895647, "learning_rate": 2.4284337504515577e-06, "loss": 0.6251, "step": 3951 }, { "epoch": 2.340538939887474, "grad_norm": 2.9285242355007957, "learning_rate": 2.42425738162413e-06, "loss": 0.6026, "step": 3952 }, { "epoch": 2.3411311815220612, "grad_norm": 2.1726586914478, "learning_rate": 2.420084111701442e-06, "loss": 0.6219, "step": 3953 }, { "epoch": 2.341723423156648, "grad_norm": 1.7766065564148414, "learning_rate": 2.4159139423905898e-06, "loss": 0.6071, "step": 3954 }, { "epoch": 2.3423156647912347, "grad_norm": 1.798438650040322, "learning_rate": 2.411746875397407e-06, "loss": 0.5952, "step": 3955 }, { "epoch": 2.342907906425822, "grad_norm": 1.8115265686116828, "learning_rate": 2.4075829124264606e-06, "loss": 0.6037, "step": 3956 }, { "epoch": 2.3435001480604085, "grad_norm": 1.2886622921652453, "learning_rate": 2.4034220551810484e-06, "loss": 0.6178, "step": 3957 }, { "epoch": 2.3440923896949957, "grad_norm": 1.2555330420863857, "learning_rate": 2.3992643053631904e-06, "loss": 0.617, "step": 3958 }, { "epoch": 2.3446846313295824, "grad_norm": 1.6387727645080599, "learning_rate": 2.3951096646736403e-06, "loss": 0.5982, "step": 3959 }, { "epoch": 2.3452768729641695, "grad_norm": 1.2682217733893184, "learning_rate": 2.3909581348118803e-06, "loss": 0.6339, "step": 3960 }, { "epoch": 2.3458691145987562, "grad_norm": 1.3586475278862904, "learning_rate": 2.386809717476123e-06, "loss": 0.6429, "step": 3961 }, { "epoch": 2.3464613562333434, "grad_norm": 1.3204559918998942, "learning_rate": 2.3826644143633017e-06, "loss": 0.5899, "step": 3962 }, { "epoch": 2.34705359786793, "grad_norm": 1.3928626985319164, "learning_rate": 2.3785222271690754e-06, "loss": 0.6113, "step": 3963 }, { "epoch": 2.3476458395025173, "grad_norm": 9.312328145389811, "learning_rate": 2.3743831575878352e-06, "loss": 0.6527, "step": 3964 }, { "epoch": 2.348238081137104, "grad_norm": 3.170276661797359, "learning_rate": 2.370247207312695e-06, "loss": 0.6355, "step": 3965 }, { "epoch": 2.3488303227716907, "grad_norm": 2.153769559929862, "learning_rate": 2.366114378035489e-06, "loss": 0.5854, "step": 3966 }, { "epoch": 2.349422564406278, "grad_norm": 2.010875700018389, "learning_rate": 2.361984671446773e-06, "loss": 0.6136, "step": 3967 }, { "epoch": 2.3500148060408645, "grad_norm": 1.8883203647412998, "learning_rate": 2.3578580892358337e-06, "loss": 0.6464, "step": 3968 }, { "epoch": 2.3506070476754517, "grad_norm": 1.3913502661830128, "learning_rate": 2.3537346330906776e-06, "loss": 0.6297, "step": 3969 }, { "epoch": 2.3511992893100384, "grad_norm": 1.5332485123384902, "learning_rate": 2.3496143046980256e-06, "loss": 0.5928, "step": 3970 }, { "epoch": 2.3517915309446256, "grad_norm": 1.3001291166457274, "learning_rate": 2.345497105743323e-06, "loss": 0.6043, "step": 3971 }, { "epoch": 2.3523837725792123, "grad_norm": 1.4754086076781734, "learning_rate": 2.3413830379107395e-06, "loss": 0.6199, "step": 3972 }, { "epoch": 2.352976014213799, "grad_norm": 2.409278659416967, "learning_rate": 2.337272102883157e-06, "loss": 0.6453, "step": 3973 }, { "epoch": 2.353568255848386, "grad_norm": 2.3466875892630683, "learning_rate": 2.3331643023421813e-06, "loss": 0.6564, "step": 3974 }, { "epoch": 2.3541604974829733, "grad_norm": 2.714570742335837, "learning_rate": 2.329059637968132e-06, "loss": 0.6404, "step": 3975 }, { "epoch": 2.35475273911756, "grad_norm": 2.3929716670625356, "learning_rate": 2.324958111440051e-06, "loss": 0.6362, "step": 3976 }, { "epoch": 2.3553449807521467, "grad_norm": 1.6228125008698313, "learning_rate": 2.3208597244356867e-06, "loss": 0.5842, "step": 3977 }, { "epoch": 2.355937222386734, "grad_norm": 3.384644411190184, "learning_rate": 2.316764478631518e-06, "loss": 0.627, "step": 3978 }, { "epoch": 2.3565294640213206, "grad_norm": 1.5260570773569566, "learning_rate": 2.3126723757027245e-06, "loss": 0.6295, "step": 3979 }, { "epoch": 2.3571217056559077, "grad_norm": 1.8203121426278104, "learning_rate": 2.30858341732321e-06, "loss": 0.6369, "step": 3980 }, { "epoch": 2.3577139472904944, "grad_norm": 1.1006202274680998, "learning_rate": 2.3044976051655854e-06, "loss": 0.6267, "step": 3981 }, { "epoch": 2.3583061889250816, "grad_norm": 1.8643488805973478, "learning_rate": 2.300414940901182e-06, "loss": 0.6002, "step": 3982 }, { "epoch": 2.3588984305596683, "grad_norm": 1.4796650275447034, "learning_rate": 2.2963354262000335e-06, "loss": 0.6101, "step": 3983 }, { "epoch": 2.359490672194255, "grad_norm": 3.064100269638554, "learning_rate": 2.292259062730897e-06, "loss": 0.5567, "step": 3984 }, { "epoch": 2.360082913828842, "grad_norm": 1.5811392863582345, "learning_rate": 2.2881858521612275e-06, "loss": 0.6186, "step": 3985 }, { "epoch": 2.3606751554634293, "grad_norm": 2.1586453620244597, "learning_rate": 2.2841157961572034e-06, "loss": 0.6014, "step": 3986 }, { "epoch": 2.361267397098016, "grad_norm": 2.641507002266892, "learning_rate": 2.2800488963837043e-06, "loss": 0.6133, "step": 3987 }, { "epoch": 2.3618596387326027, "grad_norm": 1.1730513592343241, "learning_rate": 2.2759851545043175e-06, "loss": 0.6223, "step": 3988 }, { "epoch": 2.36245188036719, "grad_norm": 1.4092715716805935, "learning_rate": 2.2719245721813455e-06, "loss": 0.6399, "step": 3989 }, { "epoch": 2.3630441220017766, "grad_norm": 1.8498348981776163, "learning_rate": 2.2678671510757953e-06, "loss": 0.6188, "step": 3990 }, { "epoch": 2.3636363636363638, "grad_norm": 2.143878596825647, "learning_rate": 2.263812892847381e-06, "loss": 0.6321, "step": 3991 }, { "epoch": 2.3642286052709505, "grad_norm": 1.2933879029907467, "learning_rate": 2.259761799154516e-06, "loss": 0.6037, "step": 3992 }, { "epoch": 2.3648208469055376, "grad_norm": 1.5882145664047522, "learning_rate": 2.2557138716543316e-06, "loss": 0.5955, "step": 3993 }, { "epoch": 2.3654130885401243, "grad_norm": 1.686459234669479, "learning_rate": 2.251669112002657e-06, "loss": 0.6064, "step": 3994 }, { "epoch": 2.366005330174711, "grad_norm": 1.7498942595889782, "learning_rate": 2.2476275218540266e-06, "loss": 0.5981, "step": 3995 }, { "epoch": 2.366597571809298, "grad_norm": 2.8193121891643997, "learning_rate": 2.243589102861673e-06, "loss": 0.6234, "step": 3996 }, { "epoch": 2.3671898134438853, "grad_norm": 1.4594782573094889, "learning_rate": 2.239553856677541e-06, "loss": 0.6007, "step": 3997 }, { "epoch": 2.367782055078472, "grad_norm": 1.485059207419929, "learning_rate": 2.235521784952275e-06, "loss": 0.6011, "step": 3998 }, { "epoch": 2.3683742967130588, "grad_norm": 1.3309379388804508, "learning_rate": 2.231492889335217e-06, "loss": 0.5943, "step": 3999 }, { "epoch": 2.368966538347646, "grad_norm": 2.6267695132329454, "learning_rate": 2.227467171474409e-06, "loss": 0.6136, "step": 4000 }, { "epoch": 2.3695587799822326, "grad_norm": 4.446529481550082, "learning_rate": 2.223444633016597e-06, "loss": 0.6042, "step": 4001 }, { "epoch": 2.37015102161682, "grad_norm": 1.9069444415000925, "learning_rate": 2.2194252756072343e-06, "loss": 0.6064, "step": 4002 }, { "epoch": 2.3707432632514065, "grad_norm": 1.8238334546399355, "learning_rate": 2.2154091008904497e-06, "loss": 0.6344, "step": 4003 }, { "epoch": 2.3713355048859937, "grad_norm": 2.5613993978909693, "learning_rate": 2.2113961105090933e-06, "loss": 0.5878, "step": 4004 }, { "epoch": 2.3719277465205804, "grad_norm": 1.2783779891008764, "learning_rate": 2.207386306104701e-06, "loss": 0.6141, "step": 4005 }, { "epoch": 2.372519988155167, "grad_norm": 1.4416435791183768, "learning_rate": 2.2033796893175152e-06, "loss": 0.6002, "step": 4006 }, { "epoch": 2.3731122297897542, "grad_norm": 1.6055773889827405, "learning_rate": 2.1993762617864555e-06, "loss": 0.6106, "step": 4007 }, { "epoch": 2.3737044714243414, "grad_norm": 1.33887538582045, "learning_rate": 2.195376025149156e-06, "loss": 0.6547, "step": 4008 }, { "epoch": 2.374296713058928, "grad_norm": 1.7979827304398968, "learning_rate": 2.1913789810419393e-06, "loss": 0.5853, "step": 4009 }, { "epoch": 2.374888954693515, "grad_norm": 1.678514747907677, "learning_rate": 2.1873851310998194e-06, "loss": 0.6229, "step": 4010 }, { "epoch": 2.375481196328102, "grad_norm": 12.535062449115955, "learning_rate": 2.183394476956504e-06, "loss": 0.636, "step": 4011 }, { "epoch": 2.3760734379626887, "grad_norm": 3.6917293032690623, "learning_rate": 2.179407020244395e-06, "loss": 0.6194, "step": 4012 }, { "epoch": 2.376665679597276, "grad_norm": 1.3674840173653329, "learning_rate": 2.175422762594591e-06, "loss": 0.6274, "step": 4013 }, { "epoch": 2.3772579212318625, "grad_norm": 1.7064037619998782, "learning_rate": 2.1714417056368752e-06, "loss": 0.6281, "step": 4014 }, { "epoch": 2.3778501628664497, "grad_norm": 4.498067534083709, "learning_rate": 2.167463850999719e-06, "loss": 0.6391, "step": 4015 }, { "epoch": 2.3784424045010364, "grad_norm": 3.0010927370986615, "learning_rate": 2.1634892003102935e-06, "loss": 0.6198, "step": 4016 }, { "epoch": 2.379034646135623, "grad_norm": 2.2759248099052205, "learning_rate": 2.159517755194456e-06, "loss": 0.6536, "step": 4017 }, { "epoch": 2.3796268877702103, "grad_norm": 1.5564439174661013, "learning_rate": 2.155549517276747e-06, "loss": 0.6276, "step": 4018 }, { "epoch": 2.3802191294047974, "grad_norm": 1.6225752200568844, "learning_rate": 2.1515844881803993e-06, "loss": 0.6166, "step": 4019 }, { "epoch": 2.380811371039384, "grad_norm": 1.9066398001458083, "learning_rate": 2.1476226695273326e-06, "loss": 0.6092, "step": 4020 }, { "epoch": 2.381403612673971, "grad_norm": 2.76829177620669, "learning_rate": 2.143664062938158e-06, "loss": 0.6401, "step": 4021 }, { "epoch": 2.381995854308558, "grad_norm": 3.0015528560284026, "learning_rate": 2.1397086700321635e-06, "loss": 0.6389, "step": 4022 }, { "epoch": 2.3825880959431447, "grad_norm": 1.4409882465741872, "learning_rate": 2.1357564924273265e-06, "loss": 0.6066, "step": 4023 }, { "epoch": 2.383180337577732, "grad_norm": 1.6002150244136195, "learning_rate": 2.1318075317403152e-06, "loss": 0.6025, "step": 4024 }, { "epoch": 2.3837725792123186, "grad_norm": 1.9985808815147001, "learning_rate": 2.1278617895864706e-06, "loss": 0.6046, "step": 4025 }, { "epoch": 2.3843648208469057, "grad_norm": 1.694290847273592, "learning_rate": 2.123919267579828e-06, "loss": 0.6045, "step": 4026 }, { "epoch": 2.3849570624814924, "grad_norm": 2.032855773037069, "learning_rate": 2.1199799673330956e-06, "loss": 0.6228, "step": 4027 }, { "epoch": 2.385549304116079, "grad_norm": 1.4539445622297456, "learning_rate": 2.1160438904576743e-06, "loss": 0.6275, "step": 4028 }, { "epoch": 2.3861415457506663, "grad_norm": 2.5084329606273528, "learning_rate": 2.1121110385636357e-06, "loss": 0.6437, "step": 4029 }, { "epoch": 2.386733787385253, "grad_norm": 1.5794313508804994, "learning_rate": 2.108181413259741e-06, "loss": 0.5742, "step": 4030 }, { "epoch": 2.38732602901984, "grad_norm": 1.3385924339428654, "learning_rate": 2.104255016153426e-06, "loss": 0.6265, "step": 4031 }, { "epoch": 2.387918270654427, "grad_norm": 2.2669122640842505, "learning_rate": 2.1003318488508107e-06, "loss": 0.6635, "step": 4032 }, { "epoch": 2.388510512289014, "grad_norm": 1.7123022659492335, "learning_rate": 2.0964119129566864e-06, "loss": 0.634, "step": 4033 }, { "epoch": 2.3891027539236007, "grad_norm": 2.074583285155401, "learning_rate": 2.092495210074532e-06, "loss": 0.6422, "step": 4034 }, { "epoch": 2.389694995558188, "grad_norm": 1.9435860623142942, "learning_rate": 2.0885817418064947e-06, "loss": 0.6352, "step": 4035 }, { "epoch": 2.3902872371927746, "grad_norm": 1.7101502157361959, "learning_rate": 2.0846715097534087e-06, "loss": 0.6161, "step": 4036 }, { "epoch": 2.3908794788273617, "grad_norm": 1.7335618347196917, "learning_rate": 2.0807645155147726e-06, "loss": 0.6222, "step": 4037 }, { "epoch": 2.3914717204619484, "grad_norm": 1.5176551517398615, "learning_rate": 2.0768607606887724e-06, "loss": 0.6145, "step": 4038 }, { "epoch": 2.392063962096535, "grad_norm": 1.091782921163788, "learning_rate": 2.072960246872261e-06, "loss": 0.5978, "step": 4039 }, { "epoch": 2.3926562037311223, "grad_norm": 2.07357282739027, "learning_rate": 2.069062975660765e-06, "loss": 0.6421, "step": 4040 }, { "epoch": 2.393248445365709, "grad_norm": 1.5424667085190742, "learning_rate": 2.0651689486484894e-06, "loss": 0.6532, "step": 4041 }, { "epoch": 2.393840687000296, "grad_norm": 1.6880108409323265, "learning_rate": 2.0612781674283142e-06, "loss": 0.5966, "step": 4042 }, { "epoch": 2.394432928634883, "grad_norm": 1.3807759801970425, "learning_rate": 2.057390633591785e-06, "loss": 0.5957, "step": 4043 }, { "epoch": 2.39502517026947, "grad_norm": 1.3765906802348291, "learning_rate": 2.0535063487291176e-06, "loss": 0.636, "step": 4044 }, { "epoch": 2.3956174119040567, "grad_norm": 3.0061313109171595, "learning_rate": 2.049625314429207e-06, "loss": 0.6586, "step": 4045 }, { "epoch": 2.396209653538644, "grad_norm": 1.1890562111369984, "learning_rate": 2.045747532279616e-06, "loss": 0.5867, "step": 4046 }, { "epoch": 2.3968018951732306, "grad_norm": 1.4910777330850993, "learning_rate": 2.0418730038665747e-06, "loss": 0.654, "step": 4047 }, { "epoch": 2.3973941368078178, "grad_norm": 2.874685506178504, "learning_rate": 2.038001730774978e-06, "loss": 0.6087, "step": 4048 }, { "epoch": 2.3979863784424045, "grad_norm": 1.3061726816651926, "learning_rate": 2.034133714588399e-06, "loss": 0.5734, "step": 4049 }, { "epoch": 2.398578620076991, "grad_norm": 1.7785239459876332, "learning_rate": 2.0302689568890753e-06, "loss": 0.656, "step": 4050 }, { "epoch": 2.3991708617115783, "grad_norm": 1.6332928430746765, "learning_rate": 2.0264074592579087e-06, "loss": 0.6034, "step": 4051 }, { "epoch": 2.399763103346165, "grad_norm": 1.5176551764079678, "learning_rate": 2.022549223274465e-06, "loss": 0.608, "step": 4052 }, { "epoch": 2.400355344980752, "grad_norm": 1.2607503040846897, "learning_rate": 2.0186942505169827e-06, "loss": 0.6439, "step": 4053 }, { "epoch": 2.400947586615339, "grad_norm": 2.158511661493091, "learning_rate": 2.0148425425623673e-06, "loss": 0.6046, "step": 4054 }, { "epoch": 2.401539828249926, "grad_norm": 2.465164565539923, "learning_rate": 2.0109941009861743e-06, "loss": 0.6046, "step": 4055 }, { "epoch": 2.4021320698845128, "grad_norm": 1.3196892824682709, "learning_rate": 2.0071489273626376e-06, "loss": 0.6952, "step": 4056 }, { "epoch": 2.4027243115191, "grad_norm": 1.6021832014855415, "learning_rate": 2.0033070232646488e-06, "loss": 0.6369, "step": 4057 }, { "epoch": 2.4033165531536866, "grad_norm": 6.273666490794738, "learning_rate": 1.999468390263769e-06, "loss": 0.5949, "step": 4058 }, { "epoch": 2.403908794788274, "grad_norm": 1.3248267529829056, "learning_rate": 1.995633029930204e-06, "loss": 0.6476, "step": 4059 }, { "epoch": 2.4045010364228605, "grad_norm": 4.130727246499741, "learning_rate": 1.9918009438328365e-06, "loss": 0.6206, "step": 4060 }, { "epoch": 2.405093278057447, "grad_norm": 1.8044158856252108, "learning_rate": 1.9879721335392088e-06, "loss": 0.6502, "step": 4061 }, { "epoch": 2.4056855196920344, "grad_norm": 1.2394089890201714, "learning_rate": 1.9841466006155162e-06, "loss": 0.62, "step": 4062 }, { "epoch": 2.406277761326621, "grad_norm": 1.3020178733946122, "learning_rate": 1.9803243466266154e-06, "loss": 0.604, "step": 4063 }, { "epoch": 2.4068700029612082, "grad_norm": 2.3665692021469855, "learning_rate": 1.976505373136025e-06, "loss": 0.5555, "step": 4064 }, { "epoch": 2.407462244595795, "grad_norm": 1.8076468168639461, "learning_rate": 1.9726896817059214e-06, "loss": 0.5771, "step": 4065 }, { "epoch": 2.408054486230382, "grad_norm": 2.2441477724689425, "learning_rate": 1.968877273897136e-06, "loss": 0.6202, "step": 4066 }, { "epoch": 2.408646727864969, "grad_norm": 1.2331684527553564, "learning_rate": 1.965068151269156e-06, "loss": 0.6105, "step": 4067 }, { "epoch": 2.409238969499556, "grad_norm": 3.5419262263798377, "learning_rate": 1.9612623153801267e-06, "loss": 0.6276, "step": 4068 }, { "epoch": 2.4098312111341427, "grad_norm": 1.2135255066194424, "learning_rate": 1.9574597677868535e-06, "loss": 0.5914, "step": 4069 }, { "epoch": 2.41042345276873, "grad_norm": 1.1257223069045508, "learning_rate": 1.953660510044789e-06, "loss": 0.6071, "step": 4070 }, { "epoch": 2.4110156944033165, "grad_norm": 1.7369991319720672, "learning_rate": 1.949864543708042e-06, "loss": 0.6332, "step": 4071 }, { "epoch": 2.4116079360379032, "grad_norm": 1.42930483040224, "learning_rate": 1.946071870329377e-06, "loss": 0.6303, "step": 4072 }, { "epoch": 2.4122001776724904, "grad_norm": 1.5717751665701418, "learning_rate": 1.9422824914602135e-06, "loss": 0.6262, "step": 4073 }, { "epoch": 2.412792419307077, "grad_norm": 1.6832189402748274, "learning_rate": 1.9384964086506185e-06, "loss": 0.5809, "step": 4074 }, { "epoch": 2.4133846609416643, "grad_norm": 1.7647206463793592, "learning_rate": 1.9347136234493093e-06, "loss": 0.6337, "step": 4075 }, { "epoch": 2.413976902576251, "grad_norm": 1.9697614615701116, "learning_rate": 1.930934137403665e-06, "loss": 0.5884, "step": 4076 }, { "epoch": 2.414569144210838, "grad_norm": 2.1805592052438523, "learning_rate": 1.9271579520597005e-06, "loss": 0.5893, "step": 4077 }, { "epoch": 2.415161385845425, "grad_norm": 2.016248237439067, "learning_rate": 1.923385068962095e-06, "loss": 0.6267, "step": 4078 }, { "epoch": 2.415753627480012, "grad_norm": 2.363174925154651, "learning_rate": 1.919615489654163e-06, "loss": 0.6469, "step": 4079 }, { "epoch": 2.4163458691145987, "grad_norm": 1.3925989121360791, "learning_rate": 1.9158492156778807e-06, "loss": 0.6612, "step": 4080 }, { "epoch": 2.416938110749186, "grad_norm": 1.273060654648486, "learning_rate": 1.91208624857386e-06, "loss": 0.6226, "step": 4081 }, { "epoch": 2.4175303523837726, "grad_norm": 1.4601623756383526, "learning_rate": 1.908326589881372e-06, "loss": 0.6188, "step": 4082 }, { "epoch": 2.4181225940183593, "grad_norm": 1.5467664483267456, "learning_rate": 1.9045702411383227e-06, "loss": 0.5895, "step": 4083 }, { "epoch": 2.4187148356529464, "grad_norm": 2.82373984208031, "learning_rate": 1.9008172038812744e-06, "loss": 0.6258, "step": 4084 }, { "epoch": 2.419307077287533, "grad_norm": 1.9417680433070386, "learning_rate": 1.897067479645428e-06, "loss": 0.6163, "step": 4085 }, { "epoch": 2.4198993189221203, "grad_norm": 2.2916627310601374, "learning_rate": 1.8933210699646342e-06, "loss": 0.6046, "step": 4086 }, { "epoch": 2.420491560556707, "grad_norm": 1.5024563307793963, "learning_rate": 1.8895779763713806e-06, "loss": 0.5865, "step": 4087 }, { "epoch": 2.421083802191294, "grad_norm": 1.446643763862051, "learning_rate": 1.885838200396808e-06, "loss": 0.6188, "step": 4088 }, { "epoch": 2.421676043825881, "grad_norm": 1.0593789622862213, "learning_rate": 1.8821017435706912e-06, "loss": 0.6205, "step": 4089 }, { "epoch": 2.422268285460468, "grad_norm": 1.5753893128495062, "learning_rate": 1.8783686074214546e-06, "loss": 0.6676, "step": 4090 }, { "epoch": 2.4228605270950547, "grad_norm": 1.5478395153422457, "learning_rate": 1.874638793476159e-06, "loss": 0.6609, "step": 4091 }, { "epoch": 2.423452768729642, "grad_norm": 2.513139396256183, "learning_rate": 1.8709123032605058e-06, "loss": 0.6139, "step": 4092 }, { "epoch": 2.4240450103642286, "grad_norm": 4.903771647987582, "learning_rate": 1.8671891382988416e-06, "loss": 0.6128, "step": 4093 }, { "epoch": 2.4246372519988153, "grad_norm": 1.0479089231627525, "learning_rate": 1.8634693001141513e-06, "loss": 0.6405, "step": 4094 }, { "epoch": 2.4252294936334025, "grad_norm": 4.228476276270857, "learning_rate": 1.8597527902280577e-06, "loss": 0.6362, "step": 4095 }, { "epoch": 2.425821735267989, "grad_norm": 1.5814852714772147, "learning_rate": 1.856039610160818e-06, "loss": 0.6704, "step": 4096 }, { "epoch": 2.4264139769025763, "grad_norm": 2.1012557519164776, "learning_rate": 1.8523297614313351e-06, "loss": 0.5896, "step": 4097 }, { "epoch": 2.427006218537163, "grad_norm": 1.463430055540698, "learning_rate": 1.8486232455571473e-06, "loss": 0.5984, "step": 4098 }, { "epoch": 2.42759846017175, "grad_norm": 1.6658819397730513, "learning_rate": 1.8449200640544274e-06, "loss": 0.5641, "step": 4099 }, { "epoch": 2.428190701806337, "grad_norm": 2.26128039729551, "learning_rate": 1.8412202184379801e-06, "loss": 0.6307, "step": 4100 }, { "epoch": 2.428782943440924, "grad_norm": 1.4559178783717623, "learning_rate": 1.837523710221254e-06, "loss": 0.6036, "step": 4101 }, { "epoch": 2.4293751850755108, "grad_norm": 1.7791634905991207, "learning_rate": 1.8338305409163314e-06, "loss": 0.6759, "step": 4102 }, { "epoch": 2.429967426710098, "grad_norm": 1.697277492617549, "learning_rate": 1.8301407120339232e-06, "loss": 0.6221, "step": 4103 }, { "epoch": 2.4305596683446846, "grad_norm": 2.2136330956036696, "learning_rate": 1.826454225083375e-06, "loss": 0.6591, "step": 4104 }, { "epoch": 2.4311519099792713, "grad_norm": 1.3508855440744538, "learning_rate": 1.8227710815726686e-06, "loss": 0.6392, "step": 4105 }, { "epoch": 2.4317441516138585, "grad_norm": 1.8566181281306422, "learning_rate": 1.8190912830084207e-06, "loss": 0.5795, "step": 4106 }, { "epoch": 2.432336393248445, "grad_norm": 3.144464503693746, "learning_rate": 1.815414830895873e-06, "loss": 0.593, "step": 4107 }, { "epoch": 2.4329286348830323, "grad_norm": 2.7242904341679846, "learning_rate": 1.811741726738898e-06, "loss": 0.6229, "step": 4108 }, { "epoch": 2.433520876517619, "grad_norm": 3.3833912508202517, "learning_rate": 1.8080719720400052e-06, "loss": 0.6076, "step": 4109 }, { "epoch": 2.434113118152206, "grad_norm": 3.0225222759099992, "learning_rate": 1.8044055683003358e-06, "loss": 0.6193, "step": 4110 }, { "epoch": 2.434705359786793, "grad_norm": 1.4370596639040112, "learning_rate": 1.8007425170196435e-06, "loss": 0.6325, "step": 4111 }, { "epoch": 2.43529760142138, "grad_norm": 1.5211255655218439, "learning_rate": 1.7970828196963286e-06, "loss": 0.556, "step": 4112 }, { "epoch": 2.435889843055967, "grad_norm": 1.3392790413616675, "learning_rate": 1.7934264778274157e-06, "loss": 0.5869, "step": 4113 }, { "epoch": 2.436482084690554, "grad_norm": 2.2652578152092957, "learning_rate": 1.7897734929085508e-06, "loss": 0.6022, "step": 4114 }, { "epoch": 2.4370743263251406, "grad_norm": 1.5991779329388756, "learning_rate": 1.7861238664340075e-06, "loss": 0.6318, "step": 4115 }, { "epoch": 2.4376665679597274, "grad_norm": 1.6856573090275266, "learning_rate": 1.7824775998966926e-06, "loss": 0.6195, "step": 4116 }, { "epoch": 2.4382588095943145, "grad_norm": 1.872131972586157, "learning_rate": 1.7788346947881352e-06, "loss": 0.6015, "step": 4117 }, { "epoch": 2.4388510512289012, "grad_norm": 1.3474243564708543, "learning_rate": 1.7751951525984857e-06, "loss": 0.5991, "step": 4118 }, { "epoch": 2.4394432928634884, "grad_norm": 1.44125755213156, "learning_rate": 1.7715589748165196e-06, "loss": 0.6336, "step": 4119 }, { "epoch": 2.440035534498075, "grad_norm": 1.628282300330072, "learning_rate": 1.7679261629296408e-06, "loss": 0.628, "step": 4120 }, { "epoch": 2.4406277761326622, "grad_norm": 1.5525192264857999, "learning_rate": 1.7642967184238758e-06, "loss": 0.5914, "step": 4121 }, { "epoch": 2.441220017767249, "grad_norm": 1.3773547139505093, "learning_rate": 1.7606706427838682e-06, "loss": 0.6638, "step": 4122 }, { "epoch": 2.441812259401836, "grad_norm": 2.496487937041218, "learning_rate": 1.7570479374928862e-06, "loss": 0.6213, "step": 4123 }, { "epoch": 2.442404501036423, "grad_norm": 3.2817990638971257, "learning_rate": 1.7534286040328208e-06, "loss": 0.6255, "step": 4124 }, { "epoch": 2.44299674267101, "grad_norm": 3.6057888078556712, "learning_rate": 1.7498126438841857e-06, "loss": 0.6445, "step": 4125 }, { "epoch": 2.4435889843055967, "grad_norm": 1.5818140283182256, "learning_rate": 1.7462000585261096e-06, "loss": 0.6281, "step": 4126 }, { "epoch": 2.4441812259401834, "grad_norm": 1.9261028578432795, "learning_rate": 1.7425908494363408e-06, "loss": 0.6367, "step": 4127 }, { "epoch": 2.4447734675747705, "grad_norm": 2.984646605350875, "learning_rate": 1.7389850180912537e-06, "loss": 0.6526, "step": 4128 }, { "epoch": 2.4453657092093573, "grad_norm": 1.361931780545937, "learning_rate": 1.735382565965832e-06, "loss": 0.5942, "step": 4129 }, { "epoch": 2.4459579508439444, "grad_norm": 1.230243762561541, "learning_rate": 1.7317834945336843e-06, "loss": 0.6713, "step": 4130 }, { "epoch": 2.446550192478531, "grad_norm": 2.9100521565655173, "learning_rate": 1.7281878052670288e-06, "loss": 0.641, "step": 4131 }, { "epoch": 2.4471424341131183, "grad_norm": 1.3667964273082245, "learning_rate": 1.724595499636711e-06, "loss": 0.5686, "step": 4132 }, { "epoch": 2.447734675747705, "grad_norm": 1.2327143660385451, "learning_rate": 1.7210065791121789e-06, "loss": 0.6167, "step": 4133 }, { "epoch": 2.448326917382292, "grad_norm": 2.0388204744932894, "learning_rate": 1.7174210451615091e-06, "loss": 0.6042, "step": 4134 }, { "epoch": 2.448919159016879, "grad_norm": 1.5150026413148958, "learning_rate": 1.713838899251381e-06, "loss": 0.627, "step": 4135 }, { "epoch": 2.449511400651466, "grad_norm": 1.1466180408826923, "learning_rate": 1.7102601428470988e-06, "loss": 0.6109, "step": 4136 }, { "epoch": 2.4501036422860527, "grad_norm": 2.0780281123685422, "learning_rate": 1.7066847774125716e-06, "loss": 0.6083, "step": 4137 }, { "epoch": 2.4506958839206394, "grad_norm": 1.3954878819749057, "learning_rate": 1.7031128044103272e-06, "loss": 0.5951, "step": 4138 }, { "epoch": 2.4512881255552266, "grad_norm": 1.2417256271708208, "learning_rate": 1.6995442253015003e-06, "loss": 0.5816, "step": 4139 }, { "epoch": 2.4518803671898133, "grad_norm": 2.1291302266514998, "learning_rate": 1.6959790415458454e-06, "loss": 0.6318, "step": 4140 }, { "epoch": 2.4524726088244004, "grad_norm": 1.294464705411972, "learning_rate": 1.692417254601717e-06, "loss": 0.6079, "step": 4141 }, { "epoch": 2.453064850458987, "grad_norm": 1.723843313270676, "learning_rate": 1.6888588659260929e-06, "loss": 0.6003, "step": 4142 }, { "epoch": 2.4536570920935743, "grad_norm": 2.01346115839394, "learning_rate": 1.6853038769745466e-06, "loss": 0.6387, "step": 4143 }, { "epoch": 2.454249333728161, "grad_norm": 1.7697041782949663, "learning_rate": 1.6817522892012762e-06, "loss": 0.6412, "step": 4144 }, { "epoch": 2.454841575362748, "grad_norm": 1.1993784853907747, "learning_rate": 1.6782041040590769e-06, "loss": 0.595, "step": 4145 }, { "epoch": 2.455433816997335, "grad_norm": 1.5669145113783172, "learning_rate": 1.6746593229993545e-06, "loss": 0.5877, "step": 4146 }, { "epoch": 2.456026058631922, "grad_norm": 1.1370169190841048, "learning_rate": 1.6711179474721272e-06, "loss": 0.6322, "step": 4147 }, { "epoch": 2.4566183002665087, "grad_norm": 3.5535580287557127, "learning_rate": 1.6675799789260128e-06, "loss": 0.631, "step": 4148 }, { "epoch": 2.4572105419010954, "grad_norm": 1.85149070130156, "learning_rate": 1.6640454188082444e-06, "loss": 0.6152, "step": 4149 }, { "epoch": 2.4578027835356826, "grad_norm": 2.303568347956373, "learning_rate": 1.6605142685646503e-06, "loss": 0.6196, "step": 4150 }, { "epoch": 2.4583950251702693, "grad_norm": 2.3689995504227372, "learning_rate": 1.6569865296396748e-06, "loss": 0.5813, "step": 4151 }, { "epoch": 2.4589872668048565, "grad_norm": 1.894218094477947, "learning_rate": 1.6534622034763558e-06, "loss": 0.6069, "step": 4152 }, { "epoch": 2.459579508439443, "grad_norm": 4.169690853566014, "learning_rate": 1.6499412915163481e-06, "loss": 0.6636, "step": 4153 }, { "epoch": 2.4601717500740303, "grad_norm": 1.6611317565107804, "learning_rate": 1.6464237951998952e-06, "loss": 0.561, "step": 4154 }, { "epoch": 2.460763991708617, "grad_norm": 2.215191682910294, "learning_rate": 1.642909715965857e-06, "loss": 0.63, "step": 4155 }, { "epoch": 2.461356233343204, "grad_norm": 1.6918194298709026, "learning_rate": 1.6393990552516848e-06, "loss": 0.6134, "step": 4156 }, { "epoch": 2.461948474977791, "grad_norm": 1.5262087510396043, "learning_rate": 1.63589181449344e-06, "loss": 0.6413, "step": 4157 }, { "epoch": 2.462540716612378, "grad_norm": 1.2968558382847615, "learning_rate": 1.6323879951257783e-06, "loss": 0.6197, "step": 4158 }, { "epoch": 2.4631329582469648, "grad_norm": 1.6679364569860284, "learning_rate": 1.628887598581962e-06, "loss": 0.5777, "step": 4159 }, { "epoch": 2.4637251998815515, "grad_norm": 1.0369459266334156, "learning_rate": 1.6253906262938457e-06, "loss": 0.6209, "step": 4160 }, { "epoch": 2.4643174415161386, "grad_norm": 1.5112263549928397, "learning_rate": 1.6218970796918931e-06, "loss": 0.6424, "step": 4161 }, { "epoch": 2.4649096831507253, "grad_norm": 1.2179638572938345, "learning_rate": 1.6184069602051578e-06, "loss": 0.6032, "step": 4162 }, { "epoch": 2.4655019247853125, "grad_norm": 1.2210989669346337, "learning_rate": 1.614920269261293e-06, "loss": 0.6041, "step": 4163 }, { "epoch": 2.466094166419899, "grad_norm": 1.5638850592616937, "learning_rate": 1.611437008286555e-06, "loss": 0.6265, "step": 4164 }, { "epoch": 2.4666864080544864, "grad_norm": 1.9348031966836825, "learning_rate": 1.6079571787057946e-06, "loss": 0.588, "step": 4165 }, { "epoch": 2.467278649689073, "grad_norm": 1.4655585715236081, "learning_rate": 1.6044807819424545e-06, "loss": 0.6662, "step": 4166 }, { "epoch": 2.46787089132366, "grad_norm": 1.35471687098539, "learning_rate": 1.6010078194185752e-06, "loss": 0.6332, "step": 4167 }, { "epoch": 2.468463132958247, "grad_norm": 1.4021986342558357, "learning_rate": 1.5975382925547966e-06, "loss": 0.6344, "step": 4168 }, { "epoch": 2.469055374592834, "grad_norm": 1.5104775229268375, "learning_rate": 1.5940722027703516e-06, "loss": 0.595, "step": 4169 }, { "epoch": 2.469647616227421, "grad_norm": 1.5242057096864499, "learning_rate": 1.5906095514830645e-06, "loss": 0.6058, "step": 4170 }, { "epoch": 2.4702398578620075, "grad_norm": 1.446713003214256, "learning_rate": 1.5871503401093501e-06, "loss": 0.6319, "step": 4171 }, { "epoch": 2.4708320994965947, "grad_norm": 2.343073886207364, "learning_rate": 1.5836945700642248e-06, "loss": 0.6254, "step": 4172 }, { "epoch": 2.4714243411311814, "grad_norm": 2.399540643140653, "learning_rate": 1.580242242761295e-06, "loss": 0.5846, "step": 4173 }, { "epoch": 2.4720165827657685, "grad_norm": 1.928520767924082, "learning_rate": 1.5767933596127528e-06, "loss": 0.6639, "step": 4174 }, { "epoch": 2.4726088244003552, "grad_norm": 1.1218273751756729, "learning_rate": 1.5733479220293847e-06, "loss": 0.5455, "step": 4175 }, { "epoch": 2.4732010660349424, "grad_norm": 1.379515727004053, "learning_rate": 1.56990593142057e-06, "loss": 0.6029, "step": 4176 }, { "epoch": 2.473793307669529, "grad_norm": 2.093979049163968, "learning_rate": 1.5664673891942805e-06, "loss": 0.5524, "step": 4177 }, { "epoch": 2.4743855493041162, "grad_norm": 1.4867542033675507, "learning_rate": 1.5630322967570655e-06, "loss": 0.6112, "step": 4178 }, { "epoch": 2.474977790938703, "grad_norm": 1.5435512842922596, "learning_rate": 1.559600655514074e-06, "loss": 0.609, "step": 4179 }, { "epoch": 2.47557003257329, "grad_norm": 1.5223906435727905, "learning_rate": 1.5561724668690436e-06, "loss": 0.5927, "step": 4180 }, { "epoch": 2.476162274207877, "grad_norm": 1.7939175275171766, "learning_rate": 1.5527477322242934e-06, "loss": 0.5754, "step": 4181 }, { "epoch": 2.4767545158424635, "grad_norm": 2.322851003634435, "learning_rate": 1.5493264529807305e-06, "loss": 0.6463, "step": 4182 }, { "epoch": 2.4773467574770507, "grad_norm": 1.4341109913608379, "learning_rate": 1.5459086305378524e-06, "loss": 0.6512, "step": 4183 }, { "epoch": 2.4779389991116374, "grad_norm": 2.000472244942328, "learning_rate": 1.5424942662937436e-06, "loss": 0.6175, "step": 4184 }, { "epoch": 2.4785312407462246, "grad_norm": 1.1364991284051775, "learning_rate": 1.5390833616450684e-06, "loss": 0.5778, "step": 4185 }, { "epoch": 2.4791234823808113, "grad_norm": 2.5864290370042187, "learning_rate": 1.5356759179870762e-06, "loss": 0.6695, "step": 4186 }, { "epoch": 2.4797157240153984, "grad_norm": 1.4569921907313288, "learning_rate": 1.5322719367136064e-06, "loss": 0.5809, "step": 4187 }, { "epoch": 2.480307965649985, "grad_norm": 1.3478923100787537, "learning_rate": 1.5288714192170796e-06, "loss": 0.5834, "step": 4188 }, { "epoch": 2.4809002072845723, "grad_norm": 3.810585927154581, "learning_rate": 1.5254743668884963e-06, "loss": 0.6608, "step": 4189 }, { "epoch": 2.481492448919159, "grad_norm": 1.3909152535067673, "learning_rate": 1.522080781117441e-06, "loss": 0.6216, "step": 4190 }, { "epoch": 2.482084690553746, "grad_norm": 1.7514258321845537, "learning_rate": 1.5186906632920816e-06, "loss": 0.6203, "step": 4191 }, { "epoch": 2.482676932188333, "grad_norm": 1.731744905638932, "learning_rate": 1.5153040147991716e-06, "loss": 0.613, "step": 4192 }, { "epoch": 2.4832691738229196, "grad_norm": 2.3774079021997765, "learning_rate": 1.5119208370240369e-06, "loss": 0.6039, "step": 4193 }, { "epoch": 2.4838614154575067, "grad_norm": 1.9933956673478872, "learning_rate": 1.5085411313505849e-06, "loss": 0.6405, "step": 4194 }, { "epoch": 2.4844536570920934, "grad_norm": 2.8685180286755187, "learning_rate": 1.5051648991613077e-06, "loss": 0.621, "step": 4195 }, { "epoch": 2.4850458987266806, "grad_norm": 1.2386381396531745, "learning_rate": 1.5017921418372772e-06, "loss": 0.6757, "step": 4196 }, { "epoch": 2.4856381403612673, "grad_norm": 1.2887006546671949, "learning_rate": 1.4984228607581386e-06, "loss": 0.599, "step": 4197 }, { "epoch": 2.4862303819958544, "grad_norm": 1.3284199306461797, "learning_rate": 1.4950570573021138e-06, "loss": 0.5998, "step": 4198 }, { "epoch": 2.486822623630441, "grad_norm": 1.505776114908257, "learning_rate": 1.4916947328460108e-06, "loss": 0.6197, "step": 4199 }, { "epoch": 2.4874148652650283, "grad_norm": 1.4391666401175998, "learning_rate": 1.4883358887652044e-06, "loss": 0.5938, "step": 4200 }, { "epoch": 2.488007106899615, "grad_norm": 1.7810197494911437, "learning_rate": 1.484980526433657e-06, "loss": 0.6247, "step": 4201 }, { "epoch": 2.488599348534202, "grad_norm": 1.8202240648965637, "learning_rate": 1.4816286472238939e-06, "loss": 0.5803, "step": 4202 }, { "epoch": 2.489191590168789, "grad_norm": 1.5697032510584463, "learning_rate": 1.4782802525070282e-06, "loss": 0.6114, "step": 4203 }, { "epoch": 2.4897838318033756, "grad_norm": 1.3273698484858674, "learning_rate": 1.474935343652736e-06, "loss": 0.6103, "step": 4204 }, { "epoch": 2.4903760734379627, "grad_norm": 1.825999416280271, "learning_rate": 1.4715939220292775e-06, "loss": 0.6109, "step": 4205 }, { "epoch": 2.4909683150725495, "grad_norm": 6.697769541146492, "learning_rate": 1.4682559890034787e-06, "loss": 0.5785, "step": 4206 }, { "epoch": 2.4915605567071366, "grad_norm": 4.951124117950075, "learning_rate": 1.4649215459407462e-06, "loss": 0.6181, "step": 4207 }, { "epoch": 2.4921527983417233, "grad_norm": 2.2685218356339996, "learning_rate": 1.461590594205049e-06, "loss": 0.5906, "step": 4208 }, { "epoch": 2.4927450399763105, "grad_norm": 1.187772811816334, "learning_rate": 1.4582631351589405e-06, "loss": 0.621, "step": 4209 }, { "epoch": 2.493337281610897, "grad_norm": 1.568313648134276, "learning_rate": 1.4549391701635308e-06, "loss": 0.6407, "step": 4210 }, { "epoch": 2.4939295232454843, "grad_norm": 2.056945291130729, "learning_rate": 1.4516187005785153e-06, "loss": 0.6526, "step": 4211 }, { "epoch": 2.494521764880071, "grad_norm": 2.5897828120036372, "learning_rate": 1.4483017277621482e-06, "loss": 0.5857, "step": 4212 }, { "epoch": 2.495114006514658, "grad_norm": 2.003851685407157, "learning_rate": 1.4449882530712621e-06, "loss": 0.5891, "step": 4213 }, { "epoch": 2.495706248149245, "grad_norm": 2.031397305896863, "learning_rate": 1.4416782778612514e-06, "loss": 0.6374, "step": 4214 }, { "epoch": 2.4962984897838316, "grad_norm": 2.460742739390165, "learning_rate": 1.4383718034860806e-06, "loss": 0.6282, "step": 4215 }, { "epoch": 2.4968907314184188, "grad_norm": 1.8883652271349463, "learning_rate": 1.4350688312982864e-06, "loss": 0.607, "step": 4216 }, { "epoch": 2.4974829730530055, "grad_norm": 1.6383575358198943, "learning_rate": 1.4317693626489715e-06, "loss": 0.6155, "step": 4217 }, { "epoch": 2.4980752146875926, "grad_norm": 1.5789440462021105, "learning_rate": 1.428473398887802e-06, "loss": 0.6475, "step": 4218 }, { "epoch": 2.4986674563221793, "grad_norm": 1.7872165908651498, "learning_rate": 1.4251809413630103e-06, "loss": 0.6318, "step": 4219 }, { "epoch": 2.4992596979567665, "grad_norm": 1.2177122630184891, "learning_rate": 1.421891991421399e-06, "loss": 0.6186, "step": 4220 }, { "epoch": 2.499851939591353, "grad_norm": 5.5110163664079765, "learning_rate": 1.4186065504083356e-06, "loss": 0.6125, "step": 4221 }, { "epoch": 2.50044418122594, "grad_norm": 1.3204263434006236, "learning_rate": 1.4153246196677483e-06, "loss": 0.5692, "step": 4222 }, { "epoch": 2.501036422860527, "grad_norm": 2.040538027329011, "learning_rate": 1.4120462005421287e-06, "loss": 0.6165, "step": 4223 }, { "epoch": 2.5016286644951142, "grad_norm": 4.142369576433715, "learning_rate": 1.4087712943725384e-06, "loss": 0.6178, "step": 4224 }, { "epoch": 2.502220906129701, "grad_norm": 1.1861793588300895, "learning_rate": 1.405499902498597e-06, "loss": 0.6187, "step": 4225 }, { "epoch": 2.5028131477642876, "grad_norm": 1.2674089845867869, "learning_rate": 1.40223202625849e-06, "loss": 0.6052, "step": 4226 }, { "epoch": 2.503405389398875, "grad_norm": 1.6386149471462386, "learning_rate": 1.3989676669889562e-06, "loss": 0.5914, "step": 4227 }, { "epoch": 2.5039976310334615, "grad_norm": 1.5069585186135463, "learning_rate": 1.395706826025306e-06, "loss": 0.5885, "step": 4228 }, { "epoch": 2.5045898726680487, "grad_norm": 1.5769533827442805, "learning_rate": 1.392449504701412e-06, "loss": 0.578, "step": 4229 }, { "epoch": 2.5051821143026354, "grad_norm": 1.9577342067869337, "learning_rate": 1.3891957043496917e-06, "loss": 0.5767, "step": 4230 }, { "epoch": 2.5057743559372225, "grad_norm": 1.820348565915433, "learning_rate": 1.3859454263011373e-06, "loss": 0.6608, "step": 4231 }, { "epoch": 2.5063665975718092, "grad_norm": 1.4196495980962014, "learning_rate": 1.3826986718852952e-06, "loss": 0.6256, "step": 4232 }, { "epoch": 2.506958839206396, "grad_norm": 2.614783295311572, "learning_rate": 1.3794554424302752e-06, "loss": 0.6216, "step": 4233 }, { "epoch": 2.507551080840983, "grad_norm": 2.0862712740764726, "learning_rate": 1.3762157392627317e-06, "loss": 0.6103, "step": 4234 }, { "epoch": 2.5081433224755703, "grad_norm": 1.4923395580122512, "learning_rate": 1.372979563707889e-06, "loss": 0.6247, "step": 4235 }, { "epoch": 2.508735564110157, "grad_norm": 1.6903494282840945, "learning_rate": 1.3697469170895282e-06, "loss": 0.6138, "step": 4236 }, { "epoch": 2.5093278057447437, "grad_norm": 1.7287805590209488, "learning_rate": 1.3665178007299818e-06, "loss": 0.5927, "step": 4237 }, { "epoch": 2.509920047379331, "grad_norm": 2.0975391419269314, "learning_rate": 1.363292215950135e-06, "loss": 0.5833, "step": 4238 }, { "epoch": 2.5105122890139175, "grad_norm": 1.7555964609395533, "learning_rate": 1.3600701640694392e-06, "loss": 0.6121, "step": 4239 }, { "epoch": 2.5111045306485047, "grad_norm": 1.6961673222728129, "learning_rate": 1.3568516464058946e-06, "loss": 0.6195, "step": 4240 }, { "epoch": 2.5116967722830914, "grad_norm": 2.079432102930987, "learning_rate": 1.3536366642760534e-06, "loss": 0.6057, "step": 4241 }, { "epoch": 2.5122890139176786, "grad_norm": 1.1628789903318941, "learning_rate": 1.350425218995024e-06, "loss": 0.5979, "step": 4242 }, { "epoch": 2.5128812555522653, "grad_norm": 1.454884994824475, "learning_rate": 1.3472173118764686e-06, "loss": 0.5959, "step": 4243 }, { "epoch": 2.513473497186852, "grad_norm": 1.977230587423518, "learning_rate": 1.3440129442326045e-06, "loss": 0.604, "step": 4244 }, { "epoch": 2.514065738821439, "grad_norm": 1.4424961332700483, "learning_rate": 1.3408121173741962e-06, "loss": 0.609, "step": 4245 }, { "epoch": 2.5146579804560263, "grad_norm": 1.178276635818679, "learning_rate": 1.3376148326105586e-06, "loss": 0.6249, "step": 4246 }, { "epoch": 2.515250222090613, "grad_norm": 1.5026051517578134, "learning_rate": 1.3344210912495648e-06, "loss": 0.642, "step": 4247 }, { "epoch": 2.5158424637251997, "grad_norm": 1.8145268871556535, "learning_rate": 1.3312308945976348e-06, "loss": 0.6375, "step": 4248 }, { "epoch": 2.516434705359787, "grad_norm": 1.7801182768357935, "learning_rate": 1.3280442439597384e-06, "loss": 0.6133, "step": 4249 }, { "epoch": 2.5170269469943736, "grad_norm": 2.2296683859183224, "learning_rate": 1.3248611406393918e-06, "loss": 0.6702, "step": 4250 }, { "epoch": 2.5176191886289607, "grad_norm": 1.8998316765515955, "learning_rate": 1.3216815859386667e-06, "loss": 0.5944, "step": 4251 }, { "epoch": 2.5182114302635474, "grad_norm": 2.0543176892307833, "learning_rate": 1.318505581158177e-06, "loss": 0.6315, "step": 4252 }, { "epoch": 2.5188036718981346, "grad_norm": 1.5059166609031631, "learning_rate": 1.3153331275970904e-06, "loss": 0.5729, "step": 4253 }, { "epoch": 2.5193959135327213, "grad_norm": 1.6358415801788577, "learning_rate": 1.3121642265531154e-06, "loss": 0.6296, "step": 4254 }, { "epoch": 2.519988155167308, "grad_norm": 1.36873712039331, "learning_rate": 1.3089988793225139e-06, "loss": 0.615, "step": 4255 }, { "epoch": 2.520580396801895, "grad_norm": 1.2889128303477049, "learning_rate": 1.305837087200087e-06, "loss": 0.6231, "step": 4256 }, { "epoch": 2.5211726384364823, "grad_norm": 4.424445990535661, "learning_rate": 1.30267885147919e-06, "loss": 0.6012, "step": 4257 }, { "epoch": 2.521764880071069, "grad_norm": 1.5525798274801588, "learning_rate": 1.299524173451715e-06, "loss": 0.5881, "step": 4258 }, { "epoch": 2.5223571217056557, "grad_norm": 2.316949095558541, "learning_rate": 1.2963730544081065e-06, "loss": 0.6059, "step": 4259 }, { "epoch": 2.522949363340243, "grad_norm": 1.6494823394802665, "learning_rate": 1.2932254956373457e-06, "loss": 0.6213, "step": 4260 }, { "epoch": 2.5235416049748296, "grad_norm": 1.5280751943361606, "learning_rate": 1.290081498426965e-06, "loss": 0.5645, "step": 4261 }, { "epoch": 2.5241338466094168, "grad_norm": 1.9376208589060975, "learning_rate": 1.286941064063031e-06, "loss": 0.5926, "step": 4262 }, { "epoch": 2.5247260882440035, "grad_norm": 1.429613590939648, "learning_rate": 1.2838041938301638e-06, "loss": 0.6164, "step": 4263 }, { "epoch": 2.5253183298785906, "grad_norm": 2.5583596054226247, "learning_rate": 1.2806708890115138e-06, "loss": 0.5951, "step": 4264 }, { "epoch": 2.5259105715131773, "grad_norm": 2.2738048703872513, "learning_rate": 1.2775411508887837e-06, "loss": 0.6259, "step": 4265 }, { "epoch": 2.526502813147764, "grad_norm": 2.872265401198682, "learning_rate": 1.2744149807422113e-06, "loss": 0.5911, "step": 4266 }, { "epoch": 2.527095054782351, "grad_norm": 1.6508438193817, "learning_rate": 1.2712923798505727e-06, "loss": 0.6179, "step": 4267 }, { "epoch": 2.5276872964169383, "grad_norm": 2.2705730263320825, "learning_rate": 1.2681733494911897e-06, "loss": 0.6396, "step": 4268 }, { "epoch": 2.528279538051525, "grad_norm": 1.7932950964539862, "learning_rate": 1.2650578909399225e-06, "loss": 0.5945, "step": 4269 }, { "epoch": 2.5288717796861118, "grad_norm": 2.039463332472355, "learning_rate": 1.2619460054711685e-06, "loss": 0.598, "step": 4270 }, { "epoch": 2.529464021320699, "grad_norm": 2.1940382419730446, "learning_rate": 1.2588376943578594e-06, "loss": 0.6173, "step": 4271 }, { "epoch": 2.5300562629552856, "grad_norm": 1.334149394220382, "learning_rate": 1.2557329588714739e-06, "loss": 0.6212, "step": 4272 }, { "epoch": 2.530648504589873, "grad_norm": 1.153175614903197, "learning_rate": 1.2526318002820237e-06, "loss": 0.5776, "step": 4273 }, { "epoch": 2.5312407462244595, "grad_norm": 2.7701095890179848, "learning_rate": 1.2495342198580562e-06, "loss": 0.6507, "step": 4274 }, { "epoch": 2.5318329878590466, "grad_norm": 1.9084746730843827, "learning_rate": 1.2464402188666524e-06, "loss": 0.5635, "step": 4275 }, { "epoch": 2.5324252294936334, "grad_norm": 3.3261345903916824, "learning_rate": 1.2433497985734356e-06, "loss": 0.6515, "step": 4276 }, { "epoch": 2.53301747112822, "grad_norm": 1.3772062277063388, "learning_rate": 1.2402629602425643e-06, "loss": 0.6683, "step": 4277 }, { "epoch": 2.533609712762807, "grad_norm": 1.5163170294865773, "learning_rate": 1.237179705136725e-06, "loss": 0.6679, "step": 4278 }, { "epoch": 2.5342019543973944, "grad_norm": 1.5235149346531982, "learning_rate": 1.2341000345171417e-06, "loss": 0.5914, "step": 4279 }, { "epoch": 2.534794196031981, "grad_norm": 1.4537614359095834, "learning_rate": 1.2310239496435749e-06, "loss": 0.5615, "step": 4280 }, { "epoch": 2.535386437666568, "grad_norm": 3.3433167384515285, "learning_rate": 1.2279514517743208e-06, "loss": 0.5961, "step": 4281 }, { "epoch": 2.535978679301155, "grad_norm": 1.759299393387278, "learning_rate": 1.2248825421661937e-06, "loss": 0.5994, "step": 4282 }, { "epoch": 2.5365709209357417, "grad_norm": 5.824067607433355, "learning_rate": 1.2218172220745562e-06, "loss": 0.6236, "step": 4283 }, { "epoch": 2.537163162570329, "grad_norm": 4.849146537686694, "learning_rate": 1.2187554927532963e-06, "loss": 0.6548, "step": 4284 }, { "epoch": 2.5377554042049155, "grad_norm": 1.8728352219215554, "learning_rate": 1.2156973554548369e-06, "loss": 0.6009, "step": 4285 }, { "epoch": 2.5383476458395027, "grad_norm": 2.628941079406281, "learning_rate": 1.2126428114301204e-06, "loss": 0.5838, "step": 4286 }, { "epoch": 2.5389398874740894, "grad_norm": 1.7988812066523692, "learning_rate": 1.2095918619286306e-06, "loss": 0.611, "step": 4287 }, { "epoch": 2.539532129108676, "grad_norm": 5.152642550826023, "learning_rate": 1.2065445081983795e-06, "loss": 0.627, "step": 4288 }, { "epoch": 2.5401243707432632, "grad_norm": 2.539557904372852, "learning_rate": 1.2035007514859054e-06, "loss": 0.5975, "step": 4289 }, { "epoch": 2.5407166123778504, "grad_norm": 1.4457429738287353, "learning_rate": 1.2004605930362724e-06, "loss": 0.566, "step": 4290 }, { "epoch": 2.541308854012437, "grad_norm": 3.4785094614740677, "learning_rate": 1.19742403409308e-06, "loss": 0.5893, "step": 4291 }, { "epoch": 2.541901095647024, "grad_norm": 2.4252390321481703, "learning_rate": 1.194391075898451e-06, "loss": 0.5936, "step": 4292 }, { "epoch": 2.542493337281611, "grad_norm": 1.483539871741351, "learning_rate": 1.191361719693036e-06, "loss": 0.6091, "step": 4293 }, { "epoch": 2.5430855789161977, "grad_norm": 1.7394003556175763, "learning_rate": 1.1883359667160087e-06, "loss": 0.6362, "step": 4294 }, { "epoch": 2.543677820550785, "grad_norm": 1.2312450282067198, "learning_rate": 1.185313818205076e-06, "loss": 0.6277, "step": 4295 }, { "epoch": 2.5442700621853716, "grad_norm": 1.2235952845823084, "learning_rate": 1.1822952753964667e-06, "loss": 0.5776, "step": 4296 }, { "epoch": 2.5448623038199587, "grad_norm": 1.3836979953577477, "learning_rate": 1.179280339524933e-06, "loss": 0.6458, "step": 4297 }, { "epoch": 2.5454545454545454, "grad_norm": 1.550091891298104, "learning_rate": 1.1762690118237518e-06, "loss": 0.6219, "step": 4298 }, { "epoch": 2.546046787089132, "grad_norm": 2.6309104401555286, "learning_rate": 1.1732612935247267e-06, "loss": 0.6267, "step": 4299 }, { "epoch": 2.5466390287237193, "grad_norm": 1.7871342600964928, "learning_rate": 1.1702571858581867e-06, "loss": 0.6586, "step": 4300 }, { "epoch": 2.5472312703583064, "grad_norm": 3.2118108366654257, "learning_rate": 1.167256690052978e-06, "loss": 0.6428, "step": 4301 }, { "epoch": 2.547823511992893, "grad_norm": 1.8557141426320194, "learning_rate": 1.1642598073364707e-06, "loss": 0.5865, "step": 4302 }, { "epoch": 2.54841575362748, "grad_norm": 1.7419326195322122, "learning_rate": 1.1612665389345613e-06, "loss": 0.586, "step": 4303 }, { "epoch": 2.549007995262067, "grad_norm": 4.789685134855233, "learning_rate": 1.158276886071662e-06, "loss": 0.6206, "step": 4304 }, { "epoch": 2.5496002368966537, "grad_norm": 1.681521269697521, "learning_rate": 1.1552908499707115e-06, "loss": 0.5966, "step": 4305 }, { "epoch": 2.550192478531241, "grad_norm": 1.7353432303503473, "learning_rate": 1.1523084318531641e-06, "loss": 0.6347, "step": 4306 }, { "epoch": 2.5507847201658276, "grad_norm": 1.4255690439020448, "learning_rate": 1.1493296329390003e-06, "loss": 0.6146, "step": 4307 }, { "epoch": 2.5513769618004147, "grad_norm": 2.180356591475331, "learning_rate": 1.1463544544467109e-06, "loss": 0.6158, "step": 4308 }, { "epoch": 2.5519692034350014, "grad_norm": 1.6710087648778493, "learning_rate": 1.143382897593316e-06, "loss": 0.6162, "step": 4309 }, { "epoch": 2.552561445069588, "grad_norm": 1.5114876553797243, "learning_rate": 1.1404149635943462e-06, "loss": 0.5982, "step": 4310 }, { "epoch": 2.5531536867041753, "grad_norm": 1.9274949790613265, "learning_rate": 1.1374506536638574e-06, "loss": 0.5799, "step": 4311 }, { "epoch": 2.5537459283387625, "grad_norm": 1.1942475585564856, "learning_rate": 1.134489969014414e-06, "loss": 0.5993, "step": 4312 }, { "epoch": 2.554338169973349, "grad_norm": 2.7949402099250795, "learning_rate": 1.1315329108571072e-06, "loss": 0.5959, "step": 4313 }, { "epoch": 2.554930411607936, "grad_norm": 2.1002894683071345, "learning_rate": 1.1285794804015349e-06, "loss": 0.6032, "step": 4314 }, { "epoch": 2.555522653242523, "grad_norm": 1.332872243171736, "learning_rate": 1.125629678855822e-06, "loss": 0.6241, "step": 4315 }, { "epoch": 2.5561148948771097, "grad_norm": 2.3220966071152294, "learning_rate": 1.1226835074265985e-06, "loss": 0.6073, "step": 4316 }, { "epoch": 2.556707136511697, "grad_norm": 1.166136350433321, "learning_rate": 1.1197409673190186e-06, "loss": 0.5955, "step": 4317 }, { "epoch": 2.5572993781462836, "grad_norm": 1.4802954541867506, "learning_rate": 1.1168020597367435e-06, "loss": 0.6378, "step": 4318 }, { "epoch": 2.5578916197808708, "grad_norm": 1.4086127534519697, "learning_rate": 1.1138667858819497e-06, "loss": 0.5804, "step": 4319 }, { "epoch": 2.5584838614154575, "grad_norm": 1.6694041074811221, "learning_rate": 1.1109351469553331e-06, "loss": 0.6109, "step": 4320 }, { "epoch": 2.559076103050044, "grad_norm": 1.81052273937072, "learning_rate": 1.1080071441560992e-06, "loss": 0.6108, "step": 4321 }, { "epoch": 2.5596683446846313, "grad_norm": 2.0222094639871147, "learning_rate": 1.105082778681964e-06, "loss": 0.5742, "step": 4322 }, { "epoch": 2.5602605863192185, "grad_norm": 2.3303481014699003, "learning_rate": 1.1021620517291566e-06, "loss": 0.6679, "step": 4323 }, { "epoch": 2.560852827953805, "grad_norm": 1.2189662484050676, "learning_rate": 1.0992449644924186e-06, "loss": 0.5603, "step": 4324 }, { "epoch": 2.561445069588392, "grad_norm": 1.6090711124893189, "learning_rate": 1.0963315181650058e-06, "loss": 0.5734, "step": 4325 }, { "epoch": 2.562037311222979, "grad_norm": 1.4371331051564935, "learning_rate": 1.0934217139386805e-06, "loss": 0.6292, "step": 4326 }, { "epoch": 2.5626295528575658, "grad_norm": 1.637674628017765, "learning_rate": 1.0905155530037125e-06, "loss": 0.6119, "step": 4327 }, { "epoch": 2.563221794492153, "grad_norm": 2.25569504293535, "learning_rate": 1.087613036548888e-06, "loss": 0.5816, "step": 4328 }, { "epoch": 2.5638140361267396, "grad_norm": 2.2331219302066136, "learning_rate": 1.0847141657615023e-06, "loss": 0.6207, "step": 4329 }, { "epoch": 2.564406277761327, "grad_norm": 1.0824128878079005, "learning_rate": 1.0818189418273527e-06, "loss": 0.6287, "step": 4330 }, { "epoch": 2.5649985193959135, "grad_norm": 1.7593659910084987, "learning_rate": 1.0789273659307476e-06, "loss": 0.6439, "step": 4331 }, { "epoch": 2.5655907610305, "grad_norm": 1.761466082062497, "learning_rate": 1.0760394392545058e-06, "loss": 0.6039, "step": 4332 }, { "epoch": 2.5661830026650874, "grad_norm": 1.7622676780184343, "learning_rate": 1.0731551629799542e-06, "loss": 0.5764, "step": 4333 }, { "epoch": 2.5667752442996745, "grad_norm": 3.3213032917052576, "learning_rate": 1.0702745382869207e-06, "loss": 0.6205, "step": 4334 }, { "epoch": 2.5673674859342612, "grad_norm": 1.14995546121375, "learning_rate": 1.0673975663537428e-06, "loss": 0.5977, "step": 4335 }, { "epoch": 2.567959727568848, "grad_norm": 1.225755679706984, "learning_rate": 1.064524248357265e-06, "loss": 0.6445, "step": 4336 }, { "epoch": 2.568551969203435, "grad_norm": 1.9102011927596232, "learning_rate": 1.0616545854728388e-06, "loss": 0.6223, "step": 4337 }, { "epoch": 2.569144210838022, "grad_norm": 1.644287131836029, "learning_rate": 1.0587885788743112e-06, "loss": 0.6169, "step": 4338 }, { "epoch": 2.569736452472609, "grad_norm": 2.229153860357448, "learning_rate": 1.0559262297340433e-06, "loss": 0.6087, "step": 4339 }, { "epoch": 2.5703286941071957, "grad_norm": 1.8865674189894543, "learning_rate": 1.0530675392228995e-06, "loss": 0.6173, "step": 4340 }, { "epoch": 2.570920935741783, "grad_norm": 6.002385683935582, "learning_rate": 1.0502125085102422e-06, "loss": 0.6208, "step": 4341 }, { "epoch": 2.5715131773763695, "grad_norm": 2.24501027749272, "learning_rate": 1.047361138763937e-06, "loss": 0.6327, "step": 4342 }, { "epoch": 2.5721054190109562, "grad_norm": 1.488334337447085, "learning_rate": 1.0445134311503592e-06, "loss": 0.6553, "step": 4343 }, { "epoch": 2.5726976606455434, "grad_norm": 5.179170676032384, "learning_rate": 1.0416693868343796e-06, "loss": 0.5995, "step": 4344 }, { "epoch": 2.5732899022801305, "grad_norm": 2.911488695193342, "learning_rate": 1.0388290069793726e-06, "loss": 0.6315, "step": 4345 }, { "epoch": 2.5738821439147173, "grad_norm": 1.257777620049107, "learning_rate": 1.035992292747211e-06, "loss": 0.5844, "step": 4346 }, { "epoch": 2.574474385549304, "grad_norm": 1.0731010623057406, "learning_rate": 1.0331592452982718e-06, "loss": 0.5987, "step": 4347 }, { "epoch": 2.575066627183891, "grad_norm": 3.003901689507578, "learning_rate": 1.030329865791434e-06, "loss": 0.6051, "step": 4348 }, { "epoch": 2.575658868818478, "grad_norm": 1.7372715719465501, "learning_rate": 1.0275041553840691e-06, "loss": 0.6226, "step": 4349 }, { "epoch": 2.576251110453065, "grad_norm": 1.7371414977000834, "learning_rate": 1.0246821152320507e-06, "loss": 0.5935, "step": 4350 }, { "epoch": 2.5768433520876517, "grad_norm": 1.408153611176849, "learning_rate": 1.0218637464897541e-06, "loss": 0.5643, "step": 4351 }, { "epoch": 2.577435593722239, "grad_norm": 1.780798414186877, "learning_rate": 1.0190490503100515e-06, "loss": 0.6226, "step": 4352 }, { "epoch": 2.5780278353568256, "grad_norm": 2.2724733020724424, "learning_rate": 1.0162380278443107e-06, "loss": 0.6212, "step": 4353 }, { "epoch": 2.5786200769914123, "grad_norm": 2.3417587051881874, "learning_rate": 1.0134306802423965e-06, "loss": 0.5835, "step": 4354 }, { "epoch": 2.5792123186259994, "grad_norm": 2.1212455863514625, "learning_rate": 1.010627008652675e-06, "loss": 0.6133, "step": 4355 }, { "epoch": 2.5798045602605866, "grad_norm": 2.605778840709358, "learning_rate": 1.0078270142220015e-06, "loss": 0.6165, "step": 4356 }, { "epoch": 2.5803968018951733, "grad_norm": 1.30684333675587, "learning_rate": 1.0050306980957358e-06, "loss": 0.6219, "step": 4357 }, { "epoch": 2.58098904352976, "grad_norm": 1.3407378615869856, "learning_rate": 1.0022380614177251e-06, "loss": 0.6391, "step": 4358 }, { "epoch": 2.581581285164347, "grad_norm": 1.673176460809278, "learning_rate": 9.994491053303169e-07, "loss": 0.6031, "step": 4359 }, { "epoch": 2.582173526798934, "grad_norm": 1.3507011115002057, "learning_rate": 9.966638309743481e-07, "loss": 0.6097, "step": 4360 }, { "epoch": 2.582765768433521, "grad_norm": 1.31675965356572, "learning_rate": 9.938822394891568e-07, "loss": 0.613, "step": 4361 }, { "epoch": 2.5833580100681077, "grad_norm": 1.3590258887169568, "learning_rate": 9.911043320125657e-07, "loss": 0.6662, "step": 4362 }, { "epoch": 2.583950251702695, "grad_norm": 1.8770135822453762, "learning_rate": 9.883301096808995e-07, "loss": 0.6422, "step": 4363 }, { "epoch": 2.5845424933372816, "grad_norm": 2.121220810517652, "learning_rate": 9.85559573628967e-07, "loss": 0.6463, "step": 4364 }, { "epoch": 2.5851347349718683, "grad_norm": 1.939410319473363, "learning_rate": 9.827927249900782e-07, "loss": 0.6276, "step": 4365 }, { "epoch": 2.5857269766064555, "grad_norm": 1.5896698304385093, "learning_rate": 9.800295648960245e-07, "loss": 0.5878, "step": 4366 }, { "epoch": 2.5863192182410426, "grad_norm": 2.349609733093395, "learning_rate": 9.772700944770973e-07, "loss": 0.639, "step": 4367 }, { "epoch": 2.5869114598756293, "grad_norm": 1.502375187876737, "learning_rate": 9.74514314862074e-07, "loss": 0.623, "step": 4368 }, { "epoch": 2.587503701510216, "grad_norm": 1.2592816904026312, "learning_rate": 9.717622271782234e-07, "loss": 0.6151, "step": 4369 }, { "epoch": 2.588095943144803, "grad_norm": 1.5838796869432388, "learning_rate": 9.690138325513043e-07, "loss": 0.6271, "step": 4370 }, { "epoch": 2.58868818477939, "grad_norm": 3.9815136759012213, "learning_rate": 9.66269132105565e-07, "loss": 0.6234, "step": 4371 }, { "epoch": 2.589280426413977, "grad_norm": 1.332493171381054, "learning_rate": 9.635281269637409e-07, "loss": 0.6035, "step": 4372 }, { "epoch": 2.5898726680485638, "grad_norm": 1.7683470516926967, "learning_rate": 9.607908182470593e-07, "loss": 0.6349, "step": 4373 }, { "epoch": 2.590464909683151, "grad_norm": 2.0170932132048334, "learning_rate": 9.580572070752335e-07, "loss": 0.6105, "step": 4374 }, { "epoch": 2.5910571513177376, "grad_norm": 2.740791397745886, "learning_rate": 9.553272945664604e-07, "loss": 0.5933, "step": 4375 }, { "epoch": 2.5916493929523243, "grad_norm": 1.4563051854293856, "learning_rate": 9.52601081837431e-07, "loss": 0.6438, "step": 4376 }, { "epoch": 2.5922416345869115, "grad_norm": 2.7013335054171974, "learning_rate": 9.498785700033197e-07, "loss": 0.6372, "step": 4377 }, { "epoch": 2.5928338762214986, "grad_norm": 2.0736824122584423, "learning_rate": 9.471597601777871e-07, "loss": 0.6126, "step": 4378 }, { "epoch": 2.5934261178560853, "grad_norm": 1.3687787675542844, "learning_rate": 9.444446534729767e-07, "loss": 0.6138, "step": 4379 }, { "epoch": 2.594018359490672, "grad_norm": 1.4601120548746158, "learning_rate": 9.417332509995203e-07, "loss": 0.5814, "step": 4380 }, { "epoch": 2.594610601125259, "grad_norm": 1.6919905995540607, "learning_rate": 9.390255538665383e-07, "loss": 0.6357, "step": 4381 }, { "epoch": 2.595202842759846, "grad_norm": 1.5823616402702225, "learning_rate": 9.36321563181628e-07, "loss": 0.585, "step": 4382 }, { "epoch": 2.595795084394433, "grad_norm": 1.960553817746592, "learning_rate": 9.336212800508715e-07, "loss": 0.6303, "step": 4383 }, { "epoch": 2.59638732602902, "grad_norm": 17.328103997100726, "learning_rate": 9.309247055788384e-07, "loss": 0.6071, "step": 4384 }, { "epoch": 2.596979567663607, "grad_norm": 2.096167704908563, "learning_rate": 9.282318408685809e-07, "loss": 0.6205, "step": 4385 }, { "epoch": 2.5975718092981936, "grad_norm": 3.5082685509985216, "learning_rate": 9.255426870216311e-07, "loss": 0.6005, "step": 4386 }, { "epoch": 2.5981640509327804, "grad_norm": 1.6098721047621571, "learning_rate": 9.228572451380024e-07, "loss": 0.6077, "step": 4387 }, { "epoch": 2.5987562925673675, "grad_norm": 2.5927416189734593, "learning_rate": 9.201755163161918e-07, "loss": 0.597, "step": 4388 }, { "epoch": 2.5993485342019547, "grad_norm": 1.9744478641135261, "learning_rate": 9.174975016531828e-07, "loss": 0.5956, "step": 4389 }, { "epoch": 2.5999407758365414, "grad_norm": 1.587520907903784, "learning_rate": 9.148232022444259e-07, "loss": 0.5978, "step": 4390 }, { "epoch": 2.600533017471128, "grad_norm": 1.7030721112666247, "learning_rate": 9.121526191838626e-07, "loss": 0.6412, "step": 4391 }, { "epoch": 2.6011252591057152, "grad_norm": 1.6855774573137718, "learning_rate": 9.094857535639157e-07, "loss": 0.6673, "step": 4392 }, { "epoch": 2.601717500740302, "grad_norm": 1.670526552319908, "learning_rate": 9.068226064754781e-07, "loss": 0.6057, "step": 4393 }, { "epoch": 2.602309742374889, "grad_norm": 3.2516145639118346, "learning_rate": 9.041631790079275e-07, "loss": 0.607, "step": 4394 }, { "epoch": 2.602901984009476, "grad_norm": 2.0502691399541777, "learning_rate": 9.015074722491212e-07, "loss": 0.6516, "step": 4395 }, { "epoch": 2.603494225644063, "grad_norm": 1.9295772880960242, "learning_rate": 8.988554872853927e-07, "loss": 0.6131, "step": 4396 }, { "epoch": 2.6040864672786497, "grad_norm": 2.8204499089521318, "learning_rate": 8.962072252015519e-07, "loss": 0.6764, "step": 4397 }, { "epoch": 2.6046787089132364, "grad_norm": 1.449343492660855, "learning_rate": 8.935626870808856e-07, "loss": 0.5797, "step": 4398 }, { "epoch": 2.6052709505478235, "grad_norm": 1.4023824555418185, "learning_rate": 8.909218740051596e-07, "loss": 0.6024, "step": 4399 }, { "epoch": 2.6058631921824107, "grad_norm": 1.9603813068399214, "learning_rate": 8.882847870546174e-07, "loss": 0.5941, "step": 4400 }, { "epoch": 2.6064554338169974, "grad_norm": 2.1424478675974252, "learning_rate": 8.856514273079741e-07, "loss": 0.6012, "step": 4401 }, { "epoch": 2.607047675451584, "grad_norm": 5.123535144408223, "learning_rate": 8.830217958424192e-07, "loss": 0.5819, "step": 4402 }, { "epoch": 2.6076399170861713, "grad_norm": 2.2597985178839117, "learning_rate": 8.803958937336221e-07, "loss": 0.6186, "step": 4403 }, { "epoch": 2.608232158720758, "grad_norm": 2.218872286361537, "learning_rate": 8.77773722055727e-07, "loss": 0.6509, "step": 4404 }, { "epoch": 2.608824400355345, "grad_norm": 1.4707962923229974, "learning_rate": 8.751552818813469e-07, "loss": 0.6754, "step": 4405 }, { "epoch": 2.609416641989932, "grad_norm": 1.45480818288004, "learning_rate": 8.725405742815695e-07, "loss": 0.5915, "step": 4406 }, { "epoch": 2.610008883624519, "grad_norm": 1.3986566797454838, "learning_rate": 8.699296003259594e-07, "loss": 0.5885, "step": 4407 }, { "epoch": 2.6106011252591057, "grad_norm": 5.235464145421576, "learning_rate": 8.673223610825532e-07, "loss": 0.6181, "step": 4408 }, { "epoch": 2.6111933668936924, "grad_norm": 1.2189308602666966, "learning_rate": 8.647188576178567e-07, "loss": 0.606, "step": 4409 }, { "epoch": 2.6117856085282796, "grad_norm": 1.4608812222029703, "learning_rate": 8.62119090996848e-07, "loss": 0.6097, "step": 4410 }, { "epoch": 2.6123778501628667, "grad_norm": 1.7812634754687648, "learning_rate": 8.595230622829797e-07, "loss": 0.6253, "step": 4411 }, { "epoch": 2.6129700917974534, "grad_norm": 1.3064781743089184, "learning_rate": 8.569307725381715e-07, "loss": 0.6165, "step": 4412 }, { "epoch": 2.61356233343204, "grad_norm": 1.335671673105394, "learning_rate": 8.543422228228182e-07, "loss": 0.6033, "step": 4413 }, { "epoch": 2.6141545750666273, "grad_norm": 1.5391838884745948, "learning_rate": 8.517574141957796e-07, "loss": 0.6247, "step": 4414 }, { "epoch": 2.614746816701214, "grad_norm": 1.620678307778965, "learning_rate": 8.491763477143911e-07, "loss": 0.5832, "step": 4415 }, { "epoch": 2.615339058335801, "grad_norm": 1.9416245565474328, "learning_rate": 8.46599024434449e-07, "loss": 0.6067, "step": 4416 }, { "epoch": 2.615931299970388, "grad_norm": 3.5183251486251983, "learning_rate": 8.440254454102303e-07, "loss": 0.5849, "step": 4417 }, { "epoch": 2.616523541604975, "grad_norm": 1.735719411627244, "learning_rate": 8.414556116944672e-07, "loss": 0.637, "step": 4418 }, { "epoch": 2.6171157832395617, "grad_norm": 1.2608882302510347, "learning_rate": 8.388895243383699e-07, "loss": 0.6155, "step": 4419 }, { "epoch": 2.6177080248741484, "grad_norm": 2.0273357584791865, "learning_rate": 8.363271843916099e-07, "loss": 0.6128, "step": 4420 }, { "epoch": 2.6183002665087356, "grad_norm": 1.125910415428338, "learning_rate": 8.33768592902332e-07, "loss": 0.6086, "step": 4421 }, { "epoch": 2.6188925081433228, "grad_norm": 2.818588547251622, "learning_rate": 8.312137509171392e-07, "loss": 0.5505, "step": 4422 }, { "epoch": 2.6194847497779095, "grad_norm": 2.4748929370800457, "learning_rate": 8.286626594811098e-07, "loss": 0.5675, "step": 4423 }, { "epoch": 2.620076991412496, "grad_norm": 2.5020858763222456, "learning_rate": 8.261153196377814e-07, "loss": 0.5973, "step": 4424 }, { "epoch": 2.6206692330470833, "grad_norm": 5.677134946889842, "learning_rate": 8.235717324291604e-07, "loss": 0.622, "step": 4425 }, { "epoch": 2.62126147468167, "grad_norm": 3.5621389375037813, "learning_rate": 8.210318988957166e-07, "loss": 0.584, "step": 4426 }, { "epoch": 2.621853716316257, "grad_norm": 1.8691129401457045, "learning_rate": 8.184958200763826e-07, "loss": 0.5916, "step": 4427 }, { "epoch": 2.622445957950844, "grad_norm": 4.897419852333168, "learning_rate": 8.159634970085595e-07, "loss": 0.5899, "step": 4428 }, { "epoch": 2.623038199585431, "grad_norm": 1.4035158403121284, "learning_rate": 8.134349307281109e-07, "loss": 0.5928, "step": 4429 }, { "epoch": 2.6236304412200178, "grad_norm": 2.473039251325127, "learning_rate": 8.109101222693616e-07, "loss": 0.5929, "step": 4430 }, { "epoch": 2.6242226828546045, "grad_norm": 2.0204745202248353, "learning_rate": 8.083890726650978e-07, "loss": 0.6055, "step": 4431 }, { "epoch": 2.6248149244891916, "grad_norm": 1.4459314914665022, "learning_rate": 8.058717829465723e-07, "loss": 0.6025, "step": 4432 }, { "epoch": 2.6254071661237783, "grad_norm": 2.1952381195541553, "learning_rate": 8.033582541435003e-07, "loss": 0.6302, "step": 4433 }, { "epoch": 2.6259994077583655, "grad_norm": 1.7238053969572114, "learning_rate": 8.008484872840538e-07, "loss": 0.6178, "step": 4434 }, { "epoch": 2.626591649392952, "grad_norm": 1.4421860814594145, "learning_rate": 7.983424833948694e-07, "loss": 0.6132, "step": 4435 }, { "epoch": 2.6271838910275394, "grad_norm": 33.61594311760086, "learning_rate": 7.958402435010415e-07, "loss": 0.6302, "step": 4436 }, { "epoch": 2.627776132662126, "grad_norm": 1.3890112121581686, "learning_rate": 7.933417686261325e-07, "loss": 0.6571, "step": 4437 }, { "epoch": 2.628368374296713, "grad_norm": 1.5916247723609014, "learning_rate": 7.908470597921547e-07, "loss": 0.6012, "step": 4438 }, { "epoch": 2.6289606159313, "grad_norm": 1.1892053559966738, "learning_rate": 7.883561180195831e-07, "loss": 0.6098, "step": 4439 }, { "epoch": 2.629552857565887, "grad_norm": 3.0816929839131504, "learning_rate": 7.858689443273548e-07, "loss": 0.6013, "step": 4440 }, { "epoch": 2.630145099200474, "grad_norm": 3.903155006740842, "learning_rate": 7.833855397328682e-07, "loss": 0.5941, "step": 4441 }, { "epoch": 2.6307373408350605, "grad_norm": 2.618889450319004, "learning_rate": 7.809059052519674e-07, "loss": 0.644, "step": 4442 }, { "epoch": 2.6313295824696477, "grad_norm": 1.993232060605033, "learning_rate": 7.784300418989665e-07, "loss": 0.591, "step": 4443 }, { "epoch": 2.6319218241042344, "grad_norm": 1.2377163825617075, "learning_rate": 7.759579506866311e-07, "loss": 0.5866, "step": 4444 }, { "epoch": 2.6325140657388215, "grad_norm": 1.843881731614733, "learning_rate": 7.73489632626192e-07, "loss": 0.642, "step": 4445 }, { "epoch": 2.6331063073734082, "grad_norm": 2.4131522683099518, "learning_rate": 7.710250887273196e-07, "loss": 0.6004, "step": 4446 }, { "epoch": 2.6336985490079954, "grad_norm": 1.1424501229679849, "learning_rate": 7.685643199981574e-07, "loss": 0.6154, "step": 4447 }, { "epoch": 2.634290790642582, "grad_norm": 1.353394407111203, "learning_rate": 7.66107327445299e-07, "loss": 0.6123, "step": 4448 }, { "epoch": 2.6348830322771692, "grad_norm": 1.4623696640133033, "learning_rate": 7.636541120737906e-07, "loss": 0.5979, "step": 4449 }, { "epoch": 2.635475273911756, "grad_norm": 1.6100152183594005, "learning_rate": 7.612046748871327e-07, "loss": 0.6087, "step": 4450 }, { "epoch": 2.636067515546343, "grad_norm": 1.5347034575841871, "learning_rate": 7.58759016887286e-07, "loss": 0.6232, "step": 4451 }, { "epoch": 2.63665975718093, "grad_norm": 1.487392673062803, "learning_rate": 7.563171390746627e-07, "loss": 0.5846, "step": 4452 }, { "epoch": 2.6372519988155165, "grad_norm": 1.9959820986132857, "learning_rate": 7.53879042448128e-07, "loss": 0.6068, "step": 4453 }, { "epoch": 2.6378442404501037, "grad_norm": 1.3610216722187833, "learning_rate": 7.514447280049964e-07, "loss": 0.5942, "step": 4454 }, { "epoch": 2.6384364820846904, "grad_norm": 2.2754665990675136, "learning_rate": 7.490141967410436e-07, "loss": 0.6712, "step": 4455 }, { "epoch": 2.6390287237192775, "grad_norm": 1.3895620105466362, "learning_rate": 7.465874496504944e-07, "loss": 0.6119, "step": 4456 }, { "epoch": 2.6396209653538643, "grad_norm": 1.1470130133503953, "learning_rate": 7.441644877260212e-07, "loss": 0.5964, "step": 4457 }, { "epoch": 2.6402132069884514, "grad_norm": 2.521466762407184, "learning_rate": 7.417453119587525e-07, "loss": 0.5729, "step": 4458 }, { "epoch": 2.640805448623038, "grad_norm": 2.4309719657662376, "learning_rate": 7.393299233382678e-07, "loss": 0.6652, "step": 4459 }, { "epoch": 2.6413976902576253, "grad_norm": 3.395055166928025, "learning_rate": 7.369183228526e-07, "loss": 0.6361, "step": 4460 }, { "epoch": 2.641989931892212, "grad_norm": 2.006109738193744, "learning_rate": 7.345105114882245e-07, "loss": 0.5852, "step": 4461 }, { "epoch": 2.642582173526799, "grad_norm": 1.4964269607335803, "learning_rate": 7.321064902300723e-07, "loss": 0.6232, "step": 4462 }, { "epoch": 2.643174415161386, "grad_norm": 3.2714212450603393, "learning_rate": 7.297062600615268e-07, "loss": 0.6469, "step": 4463 }, { "epoch": 2.6437666567959726, "grad_norm": 1.8897563765286818, "learning_rate": 7.273098219644137e-07, "loss": 0.6354, "step": 4464 }, { "epoch": 2.6443588984305597, "grad_norm": 1.528350989418592, "learning_rate": 7.249171769190111e-07, "loss": 0.6233, "step": 4465 }, { "epoch": 2.6449511400651464, "grad_norm": 3.275762358290711, "learning_rate": 7.225283259040472e-07, "loss": 0.578, "step": 4466 }, { "epoch": 2.6455433816997336, "grad_norm": 1.5673100456277456, "learning_rate": 7.201432698966959e-07, "loss": 0.6478, "step": 4467 }, { "epoch": 2.6461356233343203, "grad_norm": 1.447196429106465, "learning_rate": 7.17762009872579e-07, "loss": 0.5887, "step": 4468 }, { "epoch": 2.6467278649689074, "grad_norm": 1.4964135311485778, "learning_rate": 7.15384546805764e-07, "loss": 0.6, "step": 4469 }, { "epoch": 2.647320106603494, "grad_norm": 1.9472910578091667, "learning_rate": 7.130108816687687e-07, "loss": 0.5853, "step": 4470 }, { "epoch": 2.647912348238081, "grad_norm": 1.2610856530380568, "learning_rate": 7.106410154325571e-07, "loss": 0.6081, "step": 4471 }, { "epoch": 2.648504589872668, "grad_norm": 1.3825561685529, "learning_rate": 7.082749490665353e-07, "loss": 0.6435, "step": 4472 }, { "epoch": 2.649096831507255, "grad_norm": 1.5107349206863576, "learning_rate": 7.059126835385577e-07, "loss": 0.6626, "step": 4473 }, { "epoch": 2.649689073141842, "grad_norm": 1.9273475424947197, "learning_rate": 7.035542198149237e-07, "loss": 0.5902, "step": 4474 }, { "epoch": 2.6502813147764286, "grad_norm": 2.5130850136023075, "learning_rate": 7.011995588603804e-07, "loss": 0.6471, "step": 4475 }, { "epoch": 2.6508735564110157, "grad_norm": 1.5585005044488192, "learning_rate": 6.98848701638114e-07, "loss": 0.6371, "step": 4476 }, { "epoch": 2.6514657980456025, "grad_norm": 1.3238990292587902, "learning_rate": 6.965016491097553e-07, "loss": 0.6118, "step": 4477 }, { "epoch": 2.6520580396801896, "grad_norm": 1.3046884838922062, "learning_rate": 6.941584022353865e-07, "loss": 0.6345, "step": 4478 }, { "epoch": 2.6526502813147763, "grad_norm": 5.787734168290906, "learning_rate": 6.918189619735205e-07, "loss": 0.6323, "step": 4479 }, { "epoch": 2.6532425229493635, "grad_norm": 1.9229458354471165, "learning_rate": 6.894833292811265e-07, "loss": 0.6234, "step": 4480 }, { "epoch": 2.65383476458395, "grad_norm": 1.4760976120663631, "learning_rate": 6.871515051136046e-07, "loss": 0.6093, "step": 4481 }, { "epoch": 2.654427006218537, "grad_norm": 1.6700900299285095, "learning_rate": 6.848234904248041e-07, "loss": 0.6118, "step": 4482 }, { "epoch": 2.655019247853124, "grad_norm": 2.3039568980390683, "learning_rate": 6.824992861670132e-07, "loss": 0.654, "step": 4483 }, { "epoch": 2.655611489487711, "grad_norm": 1.779803366967843, "learning_rate": 6.801788932909648e-07, "loss": 0.6177, "step": 4484 }, { "epoch": 2.656203731122298, "grad_norm": 1.4560279559121052, "learning_rate": 6.778623127458261e-07, "loss": 0.6, "step": 4485 }, { "epoch": 2.6567959727568846, "grad_norm": 1.6569231878529236, "learning_rate": 6.755495454792116e-07, "loss": 0.6316, "step": 4486 }, { "epoch": 2.6573882143914718, "grad_norm": 1.4672246843225127, "learning_rate": 6.73240592437171e-07, "loss": 0.5814, "step": 4487 }, { "epoch": 2.6579804560260585, "grad_norm": 1.929651108949212, "learning_rate": 6.709354545641989e-07, "loss": 0.6056, "step": 4488 }, { "epoch": 2.6585726976606456, "grad_norm": 1.1762025778237375, "learning_rate": 6.68634132803222e-07, "loss": 0.5838, "step": 4489 }, { "epoch": 2.6591649392952323, "grad_norm": 1.2260691704307056, "learning_rate": 6.663366280956152e-07, "loss": 0.5867, "step": 4490 }, { "epoch": 2.6597571809298195, "grad_norm": 1.5821020306273104, "learning_rate": 6.640429413811833e-07, "loss": 0.6146, "step": 4491 }, { "epoch": 2.660349422564406, "grad_norm": 3.7438552072655042, "learning_rate": 6.617530735981758e-07, "loss": 0.6057, "step": 4492 }, { "epoch": 2.660941664198993, "grad_norm": 1.4861734342086816, "learning_rate": 6.594670256832769e-07, "loss": 0.5755, "step": 4493 }, { "epoch": 2.66153390583358, "grad_norm": 1.3176811467599943, "learning_rate": 6.571847985716063e-07, "loss": 0.6114, "step": 4494 }, { "epoch": 2.6621261474681672, "grad_norm": 2.5936137237565986, "learning_rate": 6.549063931967247e-07, "loss": 0.6174, "step": 4495 }, { "epoch": 2.662718389102754, "grad_norm": 1.5641301693338778, "learning_rate": 6.526318104906293e-07, "loss": 0.6262, "step": 4496 }, { "epoch": 2.6633106307373406, "grad_norm": 1.7900396459995547, "learning_rate": 6.503610513837522e-07, "loss": 0.557, "step": 4497 }, { "epoch": 2.663902872371928, "grad_norm": 1.3448660046809024, "learning_rate": 6.480941168049593e-07, "loss": 0.5565, "step": 4498 }, { "epoch": 2.6644951140065145, "grad_norm": 1.5352709594541674, "learning_rate": 6.458310076815544e-07, "loss": 0.6334, "step": 4499 }, { "epoch": 2.6650873556411017, "grad_norm": 1.5829649420114509, "learning_rate": 6.435717249392803e-07, "loss": 0.6498, "step": 4500 }, { "epoch": 2.6656795972756884, "grad_norm": 1.8146453565537977, "learning_rate": 6.413162695023078e-07, "loss": 0.6241, "step": 4501 }, { "epoch": 2.6662718389102755, "grad_norm": 1.6561074771734265, "learning_rate": 6.390646422932445e-07, "loss": 0.6356, "step": 4502 }, { "epoch": 2.6668640805448622, "grad_norm": 1.4096379062668936, "learning_rate": 6.368168442331324e-07, "loss": 0.6571, "step": 4503 }, { "epoch": 2.667456322179449, "grad_norm": 1.568445220559726, "learning_rate": 6.345728762414504e-07, "loss": 0.5933, "step": 4504 }, { "epoch": 2.668048563814036, "grad_norm": 1.2456099463597277, "learning_rate": 6.323327392361056e-07, "loss": 0.604, "step": 4505 }, { "epoch": 2.6686408054486233, "grad_norm": 2.1193064430307804, "learning_rate": 6.300964341334382e-07, "loss": 0.6107, "step": 4506 }, { "epoch": 2.66923304708321, "grad_norm": 1.981391634375128, "learning_rate": 6.278639618482241e-07, "loss": 0.6398, "step": 4507 }, { "epoch": 2.6698252887177967, "grad_norm": 2.04267323424306, "learning_rate": 6.256353232936718e-07, "loss": 0.5972, "step": 4508 }, { "epoch": 2.670417530352384, "grad_norm": 1.8887831971144367, "learning_rate": 6.234105193814177e-07, "loss": 0.6266, "step": 4509 }, { "epoch": 2.6710097719869705, "grad_norm": 1.7381054901434658, "learning_rate": 6.211895510215316e-07, "loss": 0.6574, "step": 4510 }, { "epoch": 2.6716020136215577, "grad_norm": 1.443595604409538, "learning_rate": 6.189724191225155e-07, "loss": 0.6292, "step": 4511 }, { "epoch": 2.6721942552561444, "grad_norm": 1.6341383418243982, "learning_rate": 6.167591245913029e-07, "loss": 0.6265, "step": 4512 }, { "epoch": 2.6727864968907316, "grad_norm": 1.468790273283455, "learning_rate": 6.145496683332508e-07, "loss": 0.6301, "step": 4513 }, { "epoch": 2.6733787385253183, "grad_norm": 1.3435540901590721, "learning_rate": 6.123440512521539e-07, "loss": 0.6249, "step": 4514 }, { "epoch": 2.673970980159905, "grad_norm": 2.159903078044722, "learning_rate": 6.101422742502349e-07, "loss": 0.6205, "step": 4515 }, { "epoch": 2.674563221794492, "grad_norm": 1.5612885890524655, "learning_rate": 6.079443382281424e-07, "loss": 0.6187, "step": 4516 }, { "epoch": 2.6751554634290793, "grad_norm": 1.9002762382063694, "learning_rate": 6.05750244084956e-07, "loss": 0.6108, "step": 4517 }, { "epoch": 2.675747705063666, "grad_norm": 3.9695947033078127, "learning_rate": 6.035599927181834e-07, "loss": 0.6114, "step": 4518 }, { "epoch": 2.6763399466982527, "grad_norm": 1.3099993291001881, "learning_rate": 6.013735850237623e-07, "loss": 0.603, "step": 4519 }, { "epoch": 2.67693218833284, "grad_norm": 1.2559274936986125, "learning_rate": 5.99191021896055e-07, "loss": 0.591, "step": 4520 }, { "epoch": 2.6775244299674266, "grad_norm": 1.494733461660258, "learning_rate": 5.97012304227852e-07, "loss": 0.5815, "step": 4521 }, { "epoch": 2.6781166716020137, "grad_norm": 3.1926894336773475, "learning_rate": 5.948374329103723e-07, "loss": 0.5866, "step": 4522 }, { "epoch": 2.6787089132366004, "grad_norm": 1.4852694724957944, "learning_rate": 5.926664088332612e-07, "loss": 0.5798, "step": 4523 }, { "epoch": 2.6793011548711876, "grad_norm": 2.351294923764927, "learning_rate": 5.904992328845893e-07, "loss": 0.6286, "step": 4524 }, { "epoch": 2.6798933965057743, "grad_norm": 1.243806142521426, "learning_rate": 5.88335905950852e-07, "loss": 0.6393, "step": 4525 }, { "epoch": 2.680485638140361, "grad_norm": 1.4595958476913502, "learning_rate": 5.861764289169713e-07, "loss": 0.6362, "step": 4526 }, { "epoch": 2.681077879774948, "grad_norm": 2.5372193151492066, "learning_rate": 5.840208026662986e-07, "loss": 0.6335, "step": 4527 }, { "epoch": 2.6816701214095353, "grad_norm": 1.9022729218122443, "learning_rate": 5.818690280806038e-07, "loss": 0.6046, "step": 4528 }, { "epoch": 2.682262363044122, "grad_norm": 1.300761914523257, "learning_rate": 5.797211060400809e-07, "loss": 0.6347, "step": 4529 }, { "epoch": 2.6828546046787087, "grad_norm": 1.545954882819192, "learning_rate": 5.775770374233558e-07, "loss": 0.6014, "step": 4530 }, { "epoch": 2.683446846313296, "grad_norm": 1.3134870233911644, "learning_rate": 5.754368231074703e-07, "loss": 0.624, "step": 4531 }, { "epoch": 2.6840390879478826, "grad_norm": 1.257399217998619, "learning_rate": 5.73300463967893e-07, "loss": 0.6017, "step": 4532 }, { "epoch": 2.6846313295824697, "grad_norm": 2.0429701939538267, "learning_rate": 5.711679608785136e-07, "loss": 0.6038, "step": 4533 }, { "epoch": 2.6852235712170565, "grad_norm": 2.112515949275844, "learning_rate": 5.690393147116491e-07, "loss": 0.595, "step": 4534 }, { "epoch": 2.6858158128516436, "grad_norm": 1.4181848302497644, "learning_rate": 5.669145263380316e-07, "loss": 0.618, "step": 4535 }, { "epoch": 2.6864080544862303, "grad_norm": 1.817228663528393, "learning_rate": 5.647935966268225e-07, "loss": 0.6222, "step": 4536 }, { "epoch": 2.687000296120817, "grad_norm": 2.6043085112867095, "learning_rate": 5.626765264455992e-07, "loss": 0.6255, "step": 4537 }, { "epoch": 2.687592537755404, "grad_norm": 1.3403299480462842, "learning_rate": 5.60563316660363e-07, "loss": 0.6334, "step": 4538 }, { "epoch": 2.6881847793899913, "grad_norm": 1.5039519498314093, "learning_rate": 5.58453968135535e-07, "loss": 0.5825, "step": 4539 }, { "epoch": 2.688777021024578, "grad_norm": 1.2364951870402103, "learning_rate": 5.563484817339581e-07, "loss": 0.6074, "step": 4540 }, { "epoch": 2.6893692626591648, "grad_norm": 2.509772691752077, "learning_rate": 5.542468583168936e-07, "loss": 0.6249, "step": 4541 }, { "epoch": 2.689961504293752, "grad_norm": 1.1947349763864274, "learning_rate": 5.521490987440259e-07, "loss": 0.6369, "step": 4542 }, { "epoch": 2.6905537459283386, "grad_norm": 1.541811330073855, "learning_rate": 5.500552038734541e-07, "loss": 0.5862, "step": 4543 }, { "epoch": 2.691145987562926, "grad_norm": 1.287641189747786, "learning_rate": 5.47965174561701e-07, "loss": 0.5996, "step": 4544 }, { "epoch": 2.6917382291975125, "grad_norm": 1.7408712569531013, "learning_rate": 5.458790116637036e-07, "loss": 0.639, "step": 4545 }, { "epoch": 2.6923304708320996, "grad_norm": 1.2121113431420716, "learning_rate": 5.437967160328228e-07, "loss": 0.5543, "step": 4546 }, { "epoch": 2.6929227124666864, "grad_norm": 3.4295681334615797, "learning_rate": 5.417182885208317e-07, "loss": 0.6128, "step": 4547 }, { "epoch": 2.693514954101273, "grad_norm": 2.73332565881467, "learning_rate": 5.396437299779278e-07, "loss": 0.6197, "step": 4548 }, { "epoch": 2.69410719573586, "grad_norm": 1.7659745103441442, "learning_rate": 5.375730412527191e-07, "loss": 0.6616, "step": 4549 }, { "epoch": 2.6946994373704474, "grad_norm": 6.0658236906308955, "learning_rate": 5.355062231922326e-07, "loss": 0.6502, "step": 4550 }, { "epoch": 2.695291679005034, "grad_norm": 1.6358319710857092, "learning_rate": 5.334432766419162e-07, "loss": 0.6126, "step": 4551 }, { "epoch": 2.695883920639621, "grad_norm": 1.8693759376554604, "learning_rate": 5.313842024456306e-07, "loss": 0.639, "step": 4552 }, { "epoch": 2.696476162274208, "grad_norm": 1.2740182256023065, "learning_rate": 5.29329001445652e-07, "loss": 0.648, "step": 4553 }, { "epoch": 2.6970684039087947, "grad_norm": 2.009984038445754, "learning_rate": 5.272776744826724e-07, "loss": 0.6539, "step": 4554 }, { "epoch": 2.697660645543382, "grad_norm": 1.7618233562419785, "learning_rate": 5.252302223958006e-07, "loss": 0.5866, "step": 4555 }, { "epoch": 2.6982528871779685, "grad_norm": 2.567467853853205, "learning_rate": 5.231866460225621e-07, "loss": 0.603, "step": 4556 }, { "epoch": 2.6988451288125557, "grad_norm": 1.9843404125639688, "learning_rate": 5.211469461988916e-07, "loss": 0.6404, "step": 4557 }, { "epoch": 2.6994373704471424, "grad_norm": 1.87203615558653, "learning_rate": 5.191111237591406e-07, "loss": 0.6232, "step": 4558 }, { "epoch": 2.700029612081729, "grad_norm": 1.8160459925973307, "learning_rate": 5.170791795360752e-07, "loss": 0.6328, "step": 4559 }, { "epoch": 2.7006218537163162, "grad_norm": 1.776112360381722, "learning_rate": 5.150511143608782e-07, "loss": 0.5592, "step": 4560 }, { "epoch": 2.7012140953509034, "grad_norm": 1.7788576070929014, "learning_rate": 5.130269290631407e-07, "loss": 0.6206, "step": 4561 }, { "epoch": 2.70180633698549, "grad_norm": 1.3844584078213082, "learning_rate": 5.110066244708645e-07, "loss": 0.5974, "step": 4562 }, { "epoch": 2.702398578620077, "grad_norm": 2.280195440016197, "learning_rate": 5.08990201410472e-07, "loss": 0.6314, "step": 4563 }, { "epoch": 2.702990820254664, "grad_norm": 1.5039208783289522, "learning_rate": 5.069776607067944e-07, "loss": 0.5886, "step": 4564 }, { "epoch": 2.7035830618892507, "grad_norm": 2.515470221955119, "learning_rate": 5.04969003183069e-07, "loss": 0.5987, "step": 4565 }, { "epoch": 2.704175303523838, "grad_norm": 1.4253523242169355, "learning_rate": 5.029642296609538e-07, "loss": 0.63, "step": 4566 }, { "epoch": 2.7047675451584245, "grad_norm": 1.0838622160262505, "learning_rate": 5.009633409605131e-07, "loss": 0.5878, "step": 4567 }, { "epoch": 2.7053597867930117, "grad_norm": 2.5944428791188345, "learning_rate": 4.98966337900224e-07, "loss": 0.577, "step": 4568 }, { "epoch": 2.7059520284275984, "grad_norm": 1.6365174307980475, "learning_rate": 4.969732212969691e-07, "loss": 0.6307, "step": 4569 }, { "epoch": 2.706544270062185, "grad_norm": 1.2985396532324271, "learning_rate": 4.949839919660481e-07, "loss": 0.5999, "step": 4570 }, { "epoch": 2.7071365116967723, "grad_norm": 1.6192926604384, "learning_rate": 4.929986507211681e-07, "loss": 0.6267, "step": 4571 }, { "epoch": 2.7077287533313594, "grad_norm": 1.550515843078195, "learning_rate": 4.91017198374445e-07, "loss": 0.6368, "step": 4572 }, { "epoch": 2.708320994965946, "grad_norm": 3.445551249960511, "learning_rate": 4.890396357364025e-07, "loss": 0.6394, "step": 4573 }, { "epoch": 2.708913236600533, "grad_norm": 2.0889306837221677, "learning_rate": 4.870659636159758e-07, "loss": 0.6194, "step": 4574 }, { "epoch": 2.70950547823512, "grad_norm": 1.512212913808209, "learning_rate": 4.850961828205103e-07, "loss": 0.64, "step": 4575 }, { "epoch": 2.7100977198697067, "grad_norm": 1.4199048585388745, "learning_rate": 4.831302941557537e-07, "loss": 0.6473, "step": 4576 }, { "epoch": 2.710689961504294, "grad_norm": 1.785317344909293, "learning_rate": 4.811682984258659e-07, "loss": 0.6042, "step": 4577 }, { "epoch": 2.7112822031388806, "grad_norm": 1.3142205419516673, "learning_rate": 4.79210196433414e-07, "loss": 0.6127, "step": 4578 }, { "epoch": 2.7118744447734677, "grad_norm": 2.677760144755771, "learning_rate": 4.772559889793716e-07, "loss": 0.571, "step": 4579 }, { "epoch": 2.7124666864080544, "grad_norm": 1.6507834481358612, "learning_rate": 4.7530567686312035e-07, "loss": 0.65, "step": 4580 }, { "epoch": 2.713058928042641, "grad_norm": 1.8447154534722792, "learning_rate": 4.7335926088244556e-07, "loss": 0.5999, "step": 4581 }, { "epoch": 2.7136511696772283, "grad_norm": 1.488823816754773, "learning_rate": 4.7141674183354247e-07, "loss": 0.6284, "step": 4582 }, { "epoch": 2.7142434113118155, "grad_norm": 1.7787433279339757, "learning_rate": 4.6947812051100995e-07, "loss": 0.5897, "step": 4583 }, { "epoch": 2.714835652946402, "grad_norm": 1.2062389769136208, "learning_rate": 4.6754339770785474e-07, "loss": 0.5984, "step": 4584 }, { "epoch": 2.715427894580989, "grad_norm": 1.48700055953898, "learning_rate": 4.6561257421548377e-07, "loss": 0.5989, "step": 4585 }, { "epoch": 2.716020136215576, "grad_norm": 1.7381187253725767, "learning_rate": 4.636856508237164e-07, "loss": 0.5941, "step": 4586 }, { "epoch": 2.7166123778501627, "grad_norm": 8.014229956967728, "learning_rate": 4.617626283207688e-07, "loss": 0.6453, "step": 4587 }, { "epoch": 2.71720461948475, "grad_norm": 1.655651410898122, "learning_rate": 4.5984350749326835e-07, "loss": 0.6157, "step": 4588 }, { "epoch": 2.7177968611193366, "grad_norm": 1.4064515132654336, "learning_rate": 4.5792828912624154e-07, "loss": 0.5953, "step": 4589 }, { "epoch": 2.7183891027539238, "grad_norm": 2.5112416819185404, "learning_rate": 4.5601697400312175e-07, "loss": 0.609, "step": 4590 }, { "epoch": 2.7189813443885105, "grad_norm": 1.484009351861582, "learning_rate": 4.541095629057435e-07, "loss": 0.6504, "step": 4591 }, { "epoch": 2.719573586023097, "grad_norm": 1.4442625515083019, "learning_rate": 4.5220605661434605e-07, "loss": 0.5921, "step": 4592 }, { "epoch": 2.7201658276576843, "grad_norm": 1.2531427351279538, "learning_rate": 4.503064559075687e-07, "loss": 0.622, "step": 4593 }, { "epoch": 2.7207580692922715, "grad_norm": 1.772483151004884, "learning_rate": 4.4841076156245665e-07, "loss": 0.5807, "step": 4594 }, { "epoch": 2.721350310926858, "grad_norm": 1.01520707328506, "learning_rate": 4.465189743544551e-07, "loss": 0.6132, "step": 4595 }, { "epoch": 2.721942552561445, "grad_norm": 1.8850344959256942, "learning_rate": 4.4463109505741177e-07, "loss": 0.62, "step": 4596 }, { "epoch": 2.722534794196032, "grad_norm": 1.742313192790708, "learning_rate": 4.427471244435733e-07, "loss": 0.6011, "step": 4597 }, { "epoch": 2.7231270358306188, "grad_norm": 1.4656681253439428, "learning_rate": 4.408670632835932e-07, "loss": 0.5727, "step": 4598 }, { "epoch": 2.723719277465206, "grad_norm": 1.7292315439029244, "learning_rate": 4.389909123465186e-07, "loss": 0.5966, "step": 4599 }, { "epoch": 2.7243115190997926, "grad_norm": 1.6450950995513567, "learning_rate": 4.3711867239980335e-07, "loss": 0.6282, "step": 4600 }, { "epoch": 2.72490376073438, "grad_norm": 1.5986011485085827, "learning_rate": 4.3525034420929815e-07, "loss": 0.5897, "step": 4601 }, { "epoch": 2.7254960023689665, "grad_norm": 1.668306756549204, "learning_rate": 4.3338592853925277e-07, "loss": 0.6291, "step": 4602 }, { "epoch": 2.726088244003553, "grad_norm": 1.3048391452095505, "learning_rate": 4.315254261523194e-07, "loss": 0.6368, "step": 4603 }, { "epoch": 2.7266804856381404, "grad_norm": 1.4097561245163255, "learning_rate": 4.296688378095493e-07, "loss": 0.659, "step": 4604 }, { "epoch": 2.7272727272727275, "grad_norm": 4.424354272711558, "learning_rate": 4.278161642703904e-07, "loss": 0.6217, "step": 4605 }, { "epoch": 2.7278649689073142, "grad_norm": 1.5665435392205018, "learning_rate": 4.2596740629268997e-07, "loss": 0.6582, "step": 4606 }, { "epoch": 2.728457210541901, "grad_norm": 1.2828718477664653, "learning_rate": 4.2412256463269295e-07, "loss": 0.6032, "step": 4607 }, { "epoch": 2.729049452176488, "grad_norm": 1.6646289965329508, "learning_rate": 4.222816400450458e-07, "loss": 0.6442, "step": 4608 }, { "epoch": 2.729641693811075, "grad_norm": 1.2239860350713117, "learning_rate": 4.204446332827894e-07, "loss": 0.5899, "step": 4609 }, { "epoch": 2.730233935445662, "grad_norm": 1.1809420431718864, "learning_rate": 4.186115450973616e-07, "loss": 0.5778, "step": 4610 }, { "epoch": 2.7308261770802487, "grad_norm": 3.218531272958351, "learning_rate": 4.1678237623859917e-07, "loss": 0.604, "step": 4611 }, { "epoch": 2.731418418714836, "grad_norm": 4.9639832700100195, "learning_rate": 4.1495712745473595e-07, "loss": 0.5754, "step": 4612 }, { "epoch": 2.7320106603494225, "grad_norm": 1.6291943122524715, "learning_rate": 4.1313579949240123e-07, "loss": 0.6341, "step": 4613 }, { "epoch": 2.7326029019840092, "grad_norm": 1.2955725976386623, "learning_rate": 4.1131839309661803e-07, "loss": 0.6163, "step": 4614 }, { "epoch": 2.7331951436185964, "grad_norm": 1.6474895593726941, "learning_rate": 4.0950490901081055e-07, "loss": 0.5947, "step": 4615 }, { "epoch": 2.7337873852531835, "grad_norm": 1.1944008670704511, "learning_rate": 4.076953479767964e-07, "loss": 0.6119, "step": 4616 }, { "epoch": 2.7343796268877703, "grad_norm": 1.8527741398776432, "learning_rate": 4.0588971073478477e-07, "loss": 0.597, "step": 4617 }, { "epoch": 2.734971868522357, "grad_norm": 1.6717926231193392, "learning_rate": 4.040879980233836e-07, "loss": 0.5978, "step": 4618 }, { "epoch": 2.735564110156944, "grad_norm": 1.351512519792787, "learning_rate": 4.022902105795956e-07, "loss": 0.5928, "step": 4619 }, { "epoch": 2.736156351791531, "grad_norm": 2.021922048948864, "learning_rate": 4.004963491388203e-07, "loss": 0.5927, "step": 4620 }, { "epoch": 2.736748593426118, "grad_norm": 2.259608972990153, "learning_rate": 3.987064144348407e-07, "loss": 0.5992, "step": 4621 }, { "epoch": 2.7373408350607047, "grad_norm": 2.5222440416768124, "learning_rate": 3.969204071998445e-07, "loss": 0.6311, "step": 4622 }, { "epoch": 2.737933076695292, "grad_norm": 2.0706914867727875, "learning_rate": 3.951383281644106e-07, "loss": 0.6518, "step": 4623 }, { "epoch": 2.7385253183298786, "grad_norm": 1.0448933909161093, "learning_rate": 3.93360178057508e-07, "loss": 0.6289, "step": 4624 }, { "epoch": 2.7391175599644653, "grad_norm": 1.7416780397549962, "learning_rate": 3.9158595760649954e-07, "loss": 0.6319, "step": 4625 }, { "epoch": 2.7397098015990524, "grad_norm": 7.362023134660734, "learning_rate": 3.8981566753714116e-07, "loss": 0.6252, "step": 4626 }, { "epoch": 2.7403020432336396, "grad_norm": 1.4619722411701648, "learning_rate": 3.8804930857358256e-07, "loss": 0.5815, "step": 4627 }, { "epoch": 2.7408942848682263, "grad_norm": 2.5074140857959697, "learning_rate": 3.8628688143836244e-07, "loss": 0.656, "step": 4628 }, { "epoch": 2.741486526502813, "grad_norm": 1.7710644494678003, "learning_rate": 3.845283868524119e-07, "loss": 0.6125, "step": 4629 }, { "epoch": 2.7420787681374, "grad_norm": 1.302185304449621, "learning_rate": 3.827738255350555e-07, "loss": 0.6145, "step": 4630 }, { "epoch": 2.742671009771987, "grad_norm": 1.488799773221385, "learning_rate": 3.810231982040091e-07, "loss": 0.6006, "step": 4631 }, { "epoch": 2.743263251406574, "grad_norm": 1.1890826196658626, "learning_rate": 3.792765055753755e-07, "loss": 0.5658, "step": 4632 }, { "epoch": 2.7438554930411607, "grad_norm": 1.6943845095513352, "learning_rate": 3.775337483636488e-07, "loss": 0.6312, "step": 4633 }, { "epoch": 2.744447734675748, "grad_norm": 2.162351020574408, "learning_rate": 3.757949272817174e-07, "loss": 0.6157, "step": 4634 }, { "epoch": 2.7450399763103346, "grad_norm": 1.8172507844780956, "learning_rate": 3.7406004304085584e-07, "loss": 0.5986, "step": 4635 }, { "epoch": 2.7456322179449213, "grad_norm": 2.857384440493848, "learning_rate": 3.723290963507309e-07, "loss": 0.6173, "step": 4636 }, { "epoch": 2.7462244595795084, "grad_norm": 2.336254189546982, "learning_rate": 3.706020879193939e-07, "loss": 0.6436, "step": 4637 }, { "epoch": 2.7468167012140956, "grad_norm": 2.576256451190125, "learning_rate": 3.688790184532909e-07, "loss": 0.61, "step": 4638 }, { "epoch": 2.7474089428486823, "grad_norm": 2.029285692911393, "learning_rate": 3.671598886572525e-07, "loss": 0.6187, "step": 4639 }, { "epoch": 2.748001184483269, "grad_norm": 1.3899689164399947, "learning_rate": 3.654446992345018e-07, "loss": 0.6091, "step": 4640 }, { "epoch": 2.748593426117856, "grad_norm": 1.3664074951874416, "learning_rate": 3.6373345088664525e-07, "loss": 0.5813, "step": 4641 }, { "epoch": 2.749185667752443, "grad_norm": 2.891864884426199, "learning_rate": 3.620261443136819e-07, "loss": 0.5623, "step": 4642 }, { "epoch": 2.74977790938703, "grad_norm": 2.0669641721163403, "learning_rate": 3.6032278021399415e-07, "loss": 0.5992, "step": 4643 }, { "epoch": 2.7503701510216167, "grad_norm": 1.6857686046412492, "learning_rate": 3.5862335928435465e-07, "loss": 0.6651, "step": 4644 }, { "epoch": 2.750962392656204, "grad_norm": 1.6062047172916332, "learning_rate": 3.569278822199218e-07, "loss": 0.5714, "step": 4645 }, { "epoch": 2.7515546342907906, "grad_norm": 1.4375077948074273, "learning_rate": 3.5523634971424194e-07, "loss": 0.624, "step": 4646 }, { "epoch": 2.7521468759253773, "grad_norm": 2.233044908101775, "learning_rate": 3.5354876245924596e-07, "loss": 0.5747, "step": 4647 }, { "epoch": 2.7527391175599645, "grad_norm": 6.883080250715156, "learning_rate": 3.5186512114525283e-07, "loss": 0.5813, "step": 4648 }, { "epoch": 2.7533313591945516, "grad_norm": 2.2999746530713394, "learning_rate": 3.50185426460965e-07, "loss": 0.6157, "step": 4649 }, { "epoch": 2.7539236008291383, "grad_norm": 1.8968933961570913, "learning_rate": 3.485096790934739e-07, "loss": 0.6322, "step": 4650 }, { "epoch": 2.754515842463725, "grad_norm": 1.4387652445210517, "learning_rate": 3.4683787972825345e-07, "loss": 0.6494, "step": 4651 }, { "epoch": 2.755108084098312, "grad_norm": 1.7110296382242969, "learning_rate": 3.451700290491633e-07, "loss": 0.5789, "step": 4652 }, { "epoch": 2.755700325732899, "grad_norm": 1.781421507136215, "learning_rate": 3.4350612773844996e-07, "loss": 0.5754, "step": 4653 }, { "epoch": 2.756292567367486, "grad_norm": 1.6547622885370215, "learning_rate": 3.4184617647674e-07, "loss": 0.5839, "step": 4654 }, { "epoch": 2.756884809002073, "grad_norm": 1.5141595029370973, "learning_rate": 3.40190175943047e-07, "loss": 0.6275, "step": 4655 }, { "epoch": 2.75747705063666, "grad_norm": 1.4347172437913116, "learning_rate": 3.3853812681477136e-07, "loss": 0.6384, "step": 4656 }, { "epoch": 2.7580692922712466, "grad_norm": 2.134280092349403, "learning_rate": 3.368900297676925e-07, "loss": 0.6151, "step": 4657 }, { "epoch": 2.7586615339058334, "grad_norm": 1.6195000996671558, "learning_rate": 3.3524588547597327e-07, "loss": 0.5915, "step": 4658 }, { "epoch": 2.7592537755404205, "grad_norm": 1.4226797326345686, "learning_rate": 3.336056946121613e-07, "loss": 0.6438, "step": 4659 }, { "epoch": 2.7598460171750077, "grad_norm": 2.076976223041084, "learning_rate": 3.3196945784718993e-07, "loss": 0.6008, "step": 4660 }, { "epoch": 2.7604382588095944, "grad_norm": 1.4596510371769844, "learning_rate": 3.303371758503693e-07, "loss": 0.6462, "step": 4661 }, { "epoch": 2.761030500444181, "grad_norm": 1.596455551281381, "learning_rate": 3.287088492893942e-07, "loss": 0.5848, "step": 4662 }, { "epoch": 2.7616227420787682, "grad_norm": 1.3584900013265102, "learning_rate": 3.270844788303429e-07, "loss": 0.6033, "step": 4663 }, { "epoch": 2.762214983713355, "grad_norm": 1.4668378316726418, "learning_rate": 3.25464065137675e-07, "loss": 0.5542, "step": 4664 }, { "epoch": 2.762807225347942, "grad_norm": 2.1567477871091936, "learning_rate": 3.2384760887423036e-07, "loss": 0.6195, "step": 4665 }, { "epoch": 2.763399466982529, "grad_norm": 1.7423110133051698, "learning_rate": 3.2223511070122893e-07, "loss": 0.6229, "step": 4666 }, { "epoch": 2.763991708617116, "grad_norm": 5.691959866524479, "learning_rate": 3.2062657127827413e-07, "loss": 0.6046, "step": 4667 }, { "epoch": 2.7645839502517027, "grad_norm": 2.0605725048101124, "learning_rate": 3.190219912633519e-07, "loss": 0.5933, "step": 4668 }, { "epoch": 2.7651761918862894, "grad_norm": 1.6855632316948044, "learning_rate": 3.1742137131281937e-07, "loss": 0.5716, "step": 4669 }, { "epoch": 2.7657684335208765, "grad_norm": 1.4437261940020687, "learning_rate": 3.158247120814251e-07, "loss": 0.6427, "step": 4670 }, { "epoch": 2.7663606751554637, "grad_norm": 1.9620940489950571, "learning_rate": 3.142320142222899e-07, "loss": 0.6286, "step": 4671 }, { "epoch": 2.7669529167900504, "grad_norm": 1.5525721497968263, "learning_rate": 3.1264327838692153e-07, "loss": 0.6121, "step": 4672 }, { "epoch": 2.767545158424637, "grad_norm": 1.864998629614895, "learning_rate": 3.1105850522519574e-07, "loss": 0.6336, "step": 4673 }, { "epoch": 2.7681374000592243, "grad_norm": 2.834526941324686, "learning_rate": 3.094776953853762e-07, "loss": 0.5941, "step": 4674 }, { "epoch": 2.768729641693811, "grad_norm": 2.1256789045491966, "learning_rate": 3.079008495141056e-07, "loss": 0.6248, "step": 4675 }, { "epoch": 2.769321883328398, "grad_norm": 1.8764504253445144, "learning_rate": 3.063279682564002e-07, "loss": 0.6022, "step": 4676 }, { "epoch": 2.769914124962985, "grad_norm": 2.287234147163577, "learning_rate": 3.047590522556565e-07, "loss": 0.6267, "step": 4677 }, { "epoch": 2.770506366597572, "grad_norm": 2.975356179185165, "learning_rate": 3.0319410215365e-07, "loss": 0.6251, "step": 4678 }, { "epoch": 2.7710986082321587, "grad_norm": 2.5144656587013507, "learning_rate": 3.0163311859053524e-07, "loss": 0.585, "step": 4679 }, { "epoch": 2.7716908498667454, "grad_norm": 1.9765530263280324, "learning_rate": 3.0007610220483927e-07, "loss": 0.5958, "step": 4680 }, { "epoch": 2.7722830915013326, "grad_norm": 1.8855944082384668, "learning_rate": 2.9852305363347044e-07, "loss": 0.6075, "step": 4681 }, { "epoch": 2.7728753331359197, "grad_norm": 1.4481580120122306, "learning_rate": 2.969739735117128e-07, "loss": 0.631, "step": 4682 }, { "epoch": 2.7734675747705064, "grad_norm": 1.4994595976642986, "learning_rate": 2.954288624732293e-07, "loss": 0.6311, "step": 4683 }, { "epoch": 2.774059816405093, "grad_norm": 2.9375587558348415, "learning_rate": 2.9388772115005457e-07, "loss": 0.6206, "step": 4684 }, { "epoch": 2.7746520580396803, "grad_norm": 1.3813361341030252, "learning_rate": 2.9235055017260205e-07, "loss": 0.6059, "step": 4685 }, { "epoch": 2.775244299674267, "grad_norm": 1.2073925168982438, "learning_rate": 2.9081735016966205e-07, "loss": 0.6153, "step": 4686 }, { "epoch": 2.775836541308854, "grad_norm": 1.6373735275001204, "learning_rate": 2.892881217684007e-07, "loss": 0.6376, "step": 4687 }, { "epoch": 2.776428782943441, "grad_norm": 1.3737783710680564, "learning_rate": 2.877628655943576e-07, "loss": 0.5936, "step": 4688 }, { "epoch": 2.777021024578028, "grad_norm": 1.4776319579183617, "learning_rate": 2.8624158227144703e-07, "loss": 0.6292, "step": 4689 }, { "epoch": 2.7776132662126147, "grad_norm": 1.4187427316372916, "learning_rate": 2.847242724219612e-07, "loss": 0.631, "step": 4690 }, { "epoch": 2.7782055078472014, "grad_norm": 1.5610769267747608, "learning_rate": 2.8321093666656253e-07, "loss": 0.6175, "step": 4691 }, { "epoch": 2.7787977494817886, "grad_norm": 1.3150995164435961, "learning_rate": 2.8170157562429466e-07, "loss": 0.6882, "step": 4692 }, { "epoch": 2.7793899911163757, "grad_norm": 1.3910028681163722, "learning_rate": 2.801961899125671e-07, "loss": 0.6123, "step": 4693 }, { "epoch": 2.7799822327509625, "grad_norm": 1.8024828509396476, "learning_rate": 2.7869478014716953e-07, "loss": 0.6624, "step": 4694 }, { "epoch": 2.780574474385549, "grad_norm": 1.2492833932312706, "learning_rate": 2.7719734694226065e-07, "loss": 0.6252, "step": 4695 }, { "epoch": 2.7811667160201363, "grad_norm": 1.8896802955465037, "learning_rate": 2.757038909103793e-07, "loss": 0.5868, "step": 4696 }, { "epoch": 2.781758957654723, "grad_norm": 2.317063717995178, "learning_rate": 2.74214412662428e-07, "loss": 0.6063, "step": 4697 }, { "epoch": 2.78235119928931, "grad_norm": 1.2618887822595275, "learning_rate": 2.7272891280769044e-07, "loss": 0.6231, "step": 4698 }, { "epoch": 2.782943440923897, "grad_norm": 2.519622445595849, "learning_rate": 2.7124739195381724e-07, "loss": 0.6046, "step": 4699 }, { "epoch": 2.783535682558484, "grad_norm": 1.3397833913049007, "learning_rate": 2.697698507068358e-07, "loss": 0.6201, "step": 4700 }, { "epoch": 2.7841279241930708, "grad_norm": 1.6268323399434212, "learning_rate": 2.682962896711427e-07, "loss": 0.5909, "step": 4701 }, { "epoch": 2.7847201658276575, "grad_norm": 1.6990969743760185, "learning_rate": 2.6682670944950804e-07, "loss": 0.6215, "step": 4702 }, { "epoch": 2.7853124074622446, "grad_norm": 1.8833596532734562, "learning_rate": 2.653611106430698e-07, "loss": 0.6729, "step": 4703 }, { "epoch": 2.7859046490968318, "grad_norm": 1.6243542801393849, "learning_rate": 2.638994938513451e-07, "loss": 0.5851, "step": 4704 }, { "epoch": 2.7864968907314185, "grad_norm": 1.2192739780041595, "learning_rate": 2.624418596722134e-07, "loss": 0.6167, "step": 4705 }, { "epoch": 2.787089132366005, "grad_norm": 1.8467947724763827, "learning_rate": 2.609882087019311e-07, "loss": 0.5764, "step": 4706 }, { "epoch": 2.7876813740005923, "grad_norm": 1.4594864341656142, "learning_rate": 2.595385415351215e-07, "loss": 0.612, "step": 4707 }, { "epoch": 2.788273615635179, "grad_norm": 1.854617047861561, "learning_rate": 2.580928587647824e-07, "loss": 0.6462, "step": 4708 }, { "epoch": 2.788865857269766, "grad_norm": 1.4487640953791494, "learning_rate": 2.566511609822775e-07, "loss": 0.6045, "step": 4709 }, { "epoch": 2.789458098904353, "grad_norm": 5.36212138647543, "learning_rate": 2.5521344877734165e-07, "loss": 0.6486, "step": 4710 }, { "epoch": 2.79005034053894, "grad_norm": 2.5695670114046285, "learning_rate": 2.5377972273808115e-07, "loss": 0.6292, "step": 4711 }, { "epoch": 2.790642582173527, "grad_norm": 1.674042925609699, "learning_rate": 2.523499834509724e-07, "loss": 0.6021, "step": 4712 }, { "epoch": 2.7912348238081135, "grad_norm": 1.7422315504699424, "learning_rate": 2.5092423150085643e-07, "loss": 0.6147, "step": 4713 }, { "epoch": 2.7918270654427007, "grad_norm": 1.518935615818262, "learning_rate": 2.495024674709468e-07, "loss": 0.6175, "step": 4714 }, { "epoch": 2.792419307077288, "grad_norm": 3.9189187326117443, "learning_rate": 2.480846919428237e-07, "loss": 0.6206, "step": 4715 }, { "epoch": 2.7930115487118745, "grad_norm": 1.5020841924960988, "learning_rate": 2.4667090549644e-07, "loss": 0.6688, "step": 4716 }, { "epoch": 2.7936037903464612, "grad_norm": 2.035594629956855, "learning_rate": 2.45261108710112e-07, "loss": 0.556, "step": 4717 }, { "epoch": 2.7941960319810484, "grad_norm": 1.7405526485195462, "learning_rate": 2.43855302160525e-07, "loss": 0.6238, "step": 4718 }, { "epoch": 2.794788273615635, "grad_norm": 4.375187374245842, "learning_rate": 2.424534864227346e-07, "loss": 0.6078, "step": 4719 }, { "epoch": 2.7953805152502222, "grad_norm": 12.710028326671614, "learning_rate": 2.4105566207016207e-07, "loss": 0.6527, "step": 4720 }, { "epoch": 2.795972756884809, "grad_norm": 1.3120633605538479, "learning_rate": 2.396618296745956e-07, "loss": 0.6098, "step": 4721 }, { "epoch": 2.796564998519396, "grad_norm": 1.6308815559170025, "learning_rate": 2.3827198980619025e-07, "loss": 0.6458, "step": 4722 }, { "epoch": 2.797157240153983, "grad_norm": 1.3307922640687166, "learning_rate": 2.3688614303347012e-07, "loss": 0.59, "step": 4723 }, { "epoch": 2.7977494817885695, "grad_norm": 1.3953095149862986, "learning_rate": 2.3550428992332508e-07, "loss": 0.5971, "step": 4724 }, { "epoch": 2.7983417234231567, "grad_norm": 1.241408618714826, "learning_rate": 2.341264310410085e-07, "loss": 0.6275, "step": 4725 }, { "epoch": 2.798933965057744, "grad_norm": 2.7908340168986254, "learning_rate": 2.327525669501418e-07, "loss": 0.5977, "step": 4726 }, { "epoch": 2.7995262066923305, "grad_norm": 1.46971406756966, "learning_rate": 2.3138269821271654e-07, "loss": 0.6049, "step": 4727 }, { "epoch": 2.8001184483269173, "grad_norm": 1.23275893833151, "learning_rate": 2.3001682538908333e-07, "loss": 0.5917, "step": 4728 }, { "epoch": 2.8007106899615044, "grad_norm": 2.1840457881629582, "learning_rate": 2.286549490379597e-07, "loss": 0.5872, "step": 4729 }, { "epoch": 2.801302931596091, "grad_norm": 1.9787660250351953, "learning_rate": 2.2729706971643117e-07, "loss": 0.6328, "step": 4730 }, { "epoch": 2.8018951732306783, "grad_norm": 1.6062910758904774, "learning_rate": 2.2594318797994895e-07, "loss": 0.5954, "step": 4731 }, { "epoch": 2.802487414865265, "grad_norm": 1.7630976777586607, "learning_rate": 2.245933043823234e-07, "loss": 0.5963, "step": 4732 }, { "epoch": 2.803079656499852, "grad_norm": 1.5256593573589792, "learning_rate": 2.232474194757339e-07, "loss": 0.6046, "step": 4733 }, { "epoch": 2.803671898134439, "grad_norm": 1.811287510313242, "learning_rate": 2.2190553381072234e-07, "loss": 0.6255, "step": 4734 }, { "epoch": 2.8042641397690256, "grad_norm": 1.6696578027302151, "learning_rate": 2.2056764793619845e-07, "loss": 0.6031, "step": 4735 }, { "epoch": 2.8048563814036127, "grad_norm": 2.2390362799933348, "learning_rate": 2.1923376239942895e-07, "loss": 0.6192, "step": 4736 }, { "epoch": 2.8054486230382, "grad_norm": 1.9676446261540672, "learning_rate": 2.179038777460507e-07, "loss": 0.5974, "step": 4737 }, { "epoch": 2.8060408646727866, "grad_norm": 1.3850897301178713, "learning_rate": 2.1657799452005856e-07, "loss": 0.58, "step": 4738 }, { "epoch": 2.8066331063073733, "grad_norm": 1.5591433252070857, "learning_rate": 2.1525611326381756e-07, "loss": 0.6151, "step": 4739 }, { "epoch": 2.8072253479419604, "grad_norm": 1.190679482654428, "learning_rate": 2.139382345180474e-07, "loss": 0.6089, "step": 4740 }, { "epoch": 2.807817589576547, "grad_norm": 1.4443287017918227, "learning_rate": 2.1262435882183685e-07, "loss": 0.649, "step": 4741 }, { "epoch": 2.8084098312111343, "grad_norm": 2.138041941041724, "learning_rate": 2.1131448671263378e-07, "loss": 0.602, "step": 4742 }, { "epoch": 2.809002072845721, "grad_norm": 2.021125849306582, "learning_rate": 2.1000861872625066e-07, "loss": 0.6106, "step": 4743 }, { "epoch": 2.809594314480308, "grad_norm": 1.1718395287718038, "learning_rate": 2.0870675539686024e-07, "loss": 0.6226, "step": 4744 }, { "epoch": 2.810186556114895, "grad_norm": 2.4236065030797453, "learning_rate": 2.0740889725699654e-07, "loss": 0.5738, "step": 4745 }, { "epoch": 2.8107787977494816, "grad_norm": 2.134568247280834, "learning_rate": 2.0611504483756038e-07, "loss": 0.6524, "step": 4746 }, { "epoch": 2.8113710393840687, "grad_norm": 1.2716048058881149, "learning_rate": 2.0482519866780516e-07, "loss": 0.5964, "step": 4747 }, { "epoch": 2.8119632810186554, "grad_norm": 1.2026027684878298, "learning_rate": 2.0353935927535428e-07, "loss": 0.5937, "step": 4748 }, { "epoch": 2.8125555226532426, "grad_norm": 1.7177800039620705, "learning_rate": 2.0225752718618707e-07, "loss": 0.6003, "step": 4749 }, { "epoch": 2.8131477642878293, "grad_norm": 2.3836143414025974, "learning_rate": 2.009797029246452e-07, "loss": 0.568, "step": 4750 }, { "epoch": 2.8137400059224165, "grad_norm": 1.5700728206598682, "learning_rate": 1.997058870134294e-07, "loss": 0.5908, "step": 4751 }, { "epoch": 2.814332247557003, "grad_norm": 1.4091090434483953, "learning_rate": 1.9843607997360403e-07, "loss": 0.5847, "step": 4752 }, { "epoch": 2.8149244891915903, "grad_norm": 3.983754518350977, "learning_rate": 1.9717028232458912e-07, "loss": 0.6242, "step": 4753 }, { "epoch": 2.815516730826177, "grad_norm": 3.552480776764616, "learning_rate": 1.959084945841705e-07, "loss": 0.6299, "step": 4754 }, { "epoch": 2.816108972460764, "grad_norm": 1.1920497610991834, "learning_rate": 1.9465071726848638e-07, "loss": 0.6314, "step": 4755 }, { "epoch": 2.816701214095351, "grad_norm": 1.0342228137761196, "learning_rate": 1.9339695089204192e-07, "loss": 0.5991, "step": 4756 }, { "epoch": 2.8172934557299376, "grad_norm": 1.5464771974705371, "learning_rate": 1.921471959676957e-07, "loss": 0.5717, "step": 4757 }, { "epoch": 2.8178856973645248, "grad_norm": 9.400365344050027, "learning_rate": 1.9090145300666885e-07, "loss": 0.5665, "step": 4758 }, { "epoch": 2.8184779389991115, "grad_norm": 1.451507907352805, "learning_rate": 1.8965972251854038e-07, "loss": 0.6476, "step": 4759 }, { "epoch": 2.8190701806336986, "grad_norm": 1.467526846495732, "learning_rate": 1.884220050112462e-07, "loss": 0.5874, "step": 4760 }, { "epoch": 2.8196624222682853, "grad_norm": 1.2322692339357366, "learning_rate": 1.8718830099108464e-07, "loss": 0.5639, "step": 4761 }, { "epoch": 2.8202546639028725, "grad_norm": 1.9431671531415993, "learning_rate": 1.8595861096270874e-07, "loss": 0.6446, "step": 4762 }, { "epoch": 2.820846905537459, "grad_norm": 1.6843027054345925, "learning_rate": 1.8473293542913163e-07, "loss": 0.6301, "step": 4763 }, { "epoch": 2.8214391471720464, "grad_norm": 2.487244252300873, "learning_rate": 1.8351127489172227e-07, "loss": 0.646, "step": 4764 }, { "epoch": 2.822031388806633, "grad_norm": 3.497230980934624, "learning_rate": 1.8229362985021092e-07, "loss": 0.5818, "step": 4765 }, { "epoch": 2.82262363044122, "grad_norm": 2.308374655735379, "learning_rate": 1.8108000080267918e-07, "loss": 0.6312, "step": 4766 }, { "epoch": 2.823215872075807, "grad_norm": 2.0176260134625874, "learning_rate": 1.7987038824557323e-07, "loss": 0.5929, "step": 4767 }, { "epoch": 2.8238081137103936, "grad_norm": 1.7213851889188727, "learning_rate": 1.7866479267369062e-07, "loss": 0.6419, "step": 4768 }, { "epoch": 2.824400355344981, "grad_norm": 1.510546311903195, "learning_rate": 1.7746321458018802e-07, "loss": 0.5843, "step": 4769 }, { "epoch": 2.8249925969795675, "grad_norm": 1.9187963373200847, "learning_rate": 1.7626565445657883e-07, "loss": 0.5998, "step": 4770 }, { "epoch": 2.8255848386141547, "grad_norm": 1.3445654001439231, "learning_rate": 1.750721127927324e-07, "loss": 0.6266, "step": 4771 }, { "epoch": 2.8261770802487414, "grad_norm": 1.4215769707003076, "learning_rate": 1.7388259007687368e-07, "loss": 0.6382, "step": 4772 }, { "epoch": 2.8267693218833285, "grad_norm": 2.129246036856138, "learning_rate": 1.7269708679558572e-07, "loss": 0.6008, "step": 4773 }, { "epoch": 2.8273615635179152, "grad_norm": 1.4627320973002913, "learning_rate": 1.715156034338039e-07, "loss": 0.6084, "step": 4774 }, { "epoch": 2.8279538051525024, "grad_norm": 2.8153006747845875, "learning_rate": 1.7033814047482388e-07, "loss": 0.6042, "step": 4775 }, { "epoch": 2.828546046787089, "grad_norm": 1.507076721738337, "learning_rate": 1.691646984002937e-07, "loss": 0.6466, "step": 4776 }, { "epoch": 2.8291382884216763, "grad_norm": 1.2884117944639297, "learning_rate": 1.6799527769021495e-07, "loss": 0.6195, "step": 4777 }, { "epoch": 2.829730530056263, "grad_norm": 1.6946887003219235, "learning_rate": 1.6682987882294722e-07, "loss": 0.6082, "step": 4778 }, { "epoch": 2.8303227716908497, "grad_norm": 1.5567973673661333, "learning_rate": 1.6566850227520693e-07, "loss": 0.5863, "step": 4779 }, { "epoch": 2.830915013325437, "grad_norm": 1.7210984837021315, "learning_rate": 1.6451114852206073e-07, "loss": 0.633, "step": 4780 }, { "epoch": 2.8315072549600235, "grad_norm": 1.5582674489351216, "learning_rate": 1.6335781803692884e-07, "loss": 0.5535, "step": 4781 }, { "epoch": 2.8320994965946107, "grad_norm": 3.604937316955796, "learning_rate": 1.6220851129159164e-07, "loss": 0.6094, "step": 4782 }, { "epoch": 2.8326917382291974, "grad_norm": 1.468440659774134, "learning_rate": 1.6106322875617974e-07, "loss": 0.6098, "step": 4783 }, { "epoch": 2.8332839798637846, "grad_norm": 1.5367592010242979, "learning_rate": 1.5992197089917727e-07, "loss": 0.5844, "step": 4784 }, { "epoch": 2.8338762214983713, "grad_norm": 1.7789542552715796, "learning_rate": 1.587847381874219e-07, "loss": 0.6287, "step": 4785 }, { "epoch": 2.834468463132958, "grad_norm": 5.853973890713451, "learning_rate": 1.576515310861071e-07, "loss": 0.5924, "step": 4786 }, { "epoch": 2.835060704767545, "grad_norm": 1.7798910211182182, "learning_rate": 1.565223500587798e-07, "loss": 0.6681, "step": 4787 }, { "epoch": 2.8356529464021323, "grad_norm": 2.8553443898913935, "learning_rate": 1.55397195567335e-07, "loss": 0.6537, "step": 4788 }, { "epoch": 2.836245188036719, "grad_norm": 1.1857098480480426, "learning_rate": 1.5427606807202676e-07, "loss": 0.6129, "step": 4789 }, { "epoch": 2.8368374296713057, "grad_norm": 1.5704580950283231, "learning_rate": 1.5315896803145824e-07, "loss": 0.6026, "step": 4790 }, { "epoch": 2.837429671305893, "grad_norm": 4.451172736833375, "learning_rate": 1.5204589590258722e-07, "loss": 0.6233, "step": 4791 }, { "epoch": 2.8380219129404796, "grad_norm": 1.6353470217624213, "learning_rate": 1.5093685214072173e-07, "loss": 0.6129, "step": 4792 }, { "epoch": 2.8386141545750667, "grad_norm": 1.660643984933811, "learning_rate": 1.4983183719952222e-07, "loss": 0.6326, "step": 4793 }, { "epoch": 2.8392063962096534, "grad_norm": 1.456440719487532, "learning_rate": 1.4873085153100485e-07, "loss": 0.6038, "step": 4794 }, { "epoch": 2.8397986378442406, "grad_norm": 1.2676177701134983, "learning_rate": 1.4763389558553164e-07, "loss": 0.5914, "step": 4795 }, { "epoch": 2.8403908794788273, "grad_norm": 1.4386662121643279, "learning_rate": 1.4654096981182031e-07, "loss": 0.617, "step": 4796 }, { "epoch": 2.840983121113414, "grad_norm": 1.3829547994752223, "learning_rate": 1.4545207465693877e-07, "loss": 0.6172, "step": 4797 }, { "epoch": 2.841575362748001, "grad_norm": 2.9258003797550542, "learning_rate": 1.4436721056630853e-07, "loss": 0.6146, "step": 4798 }, { "epoch": 2.8421676043825883, "grad_norm": 3.4493369680899795, "learning_rate": 1.432863779836968e-07, "loss": 0.6168, "step": 4799 }, { "epoch": 2.842759846017175, "grad_norm": 1.7932986975776593, "learning_rate": 1.4220957735122663e-07, "loss": 0.624, "step": 4800 }, { "epoch": 2.8433520876517617, "grad_norm": 2.6126448363470867, "learning_rate": 1.41136809109369e-07, "loss": 0.5994, "step": 4801 }, { "epoch": 2.843944329286349, "grad_norm": 1.9157604337658336, "learning_rate": 1.400680736969484e-07, "loss": 0.6276, "step": 4802 }, { "epoch": 2.8445365709209356, "grad_norm": 1.7072043505201628, "learning_rate": 1.390033715511363e-07, "loss": 0.6496, "step": 4803 }, { "epoch": 2.8451288125555227, "grad_norm": 1.6570154960641135, "learning_rate": 1.3794270310745538e-07, "loss": 0.6308, "step": 4804 }, { "epoch": 2.8457210541901095, "grad_norm": 1.344278584688019, "learning_rate": 1.3688606879977863e-07, "loss": 0.58, "step": 4805 }, { "epoch": 2.8463132958246966, "grad_norm": 1.4565547659975684, "learning_rate": 1.3583346906033024e-07, "loss": 0.6234, "step": 4806 }, { "epoch": 2.8469055374592833, "grad_norm": 1.5371160933434378, "learning_rate": 1.347849043196814e-07, "loss": 0.6329, "step": 4807 }, { "epoch": 2.84749777909387, "grad_norm": 1.5015920663462485, "learning_rate": 1.3374037500675452e-07, "loss": 0.6034, "step": 4808 }, { "epoch": 2.848090020728457, "grad_norm": 1.2589321162204534, "learning_rate": 1.326998815488212e-07, "loss": 0.5696, "step": 4809 }, { "epoch": 2.8486822623630443, "grad_norm": 1.5267234806346195, "learning_rate": 1.3166342437150204e-07, "loss": 0.6538, "step": 4810 }, { "epoch": 2.849274503997631, "grad_norm": 3.0702462243014934, "learning_rate": 1.306310038987657e-07, "loss": 0.63, "step": 4811 }, { "epoch": 2.8498667456322178, "grad_norm": 1.6574653955539411, "learning_rate": 1.2960262055292884e-07, "loss": 0.6405, "step": 4812 }, { "epoch": 2.850458987266805, "grad_norm": 1.55721639877075, "learning_rate": 1.2857827475466045e-07, "loss": 0.6251, "step": 4813 }, { "epoch": 2.8510512289013916, "grad_norm": 1.5458939977128197, "learning_rate": 1.275579669229743e-07, "loss": 0.6109, "step": 4814 }, { "epoch": 2.8516434705359788, "grad_norm": 2.5596201627651243, "learning_rate": 1.2654169747523425e-07, "loss": 0.632, "step": 4815 }, { "epoch": 2.8522357121705655, "grad_norm": 1.4732233414875766, "learning_rate": 1.2552946682715116e-07, "loss": 0.6403, "step": 4816 }, { "epoch": 2.8528279538051526, "grad_norm": 1.900721516109027, "learning_rate": 1.2452127539278493e-07, "loss": 0.5844, "step": 4817 }, { "epoch": 2.8534201954397393, "grad_norm": 1.634171972189307, "learning_rate": 1.2351712358454115e-07, "loss": 0.6421, "step": 4818 }, { "epoch": 2.854012437074326, "grad_norm": 1.4903972592743484, "learning_rate": 1.2251701181317577e-07, "loss": 0.5858, "step": 4819 }, { "epoch": 2.854604678708913, "grad_norm": 5.834292688749914, "learning_rate": 1.215209404877904e-07, "loss": 0.6248, "step": 4820 }, { "epoch": 2.8551969203435004, "grad_norm": 2.127865932703041, "learning_rate": 1.2052891001583356e-07, "loss": 0.5898, "step": 4821 }, { "epoch": 2.855789161978087, "grad_norm": 1.2644572972552057, "learning_rate": 1.1954092080310288e-07, "loss": 0.6052, "step": 4822 }, { "epoch": 2.856381403612674, "grad_norm": 3.002356796290768, "learning_rate": 1.185569732537406e-07, "loss": 0.6027, "step": 4823 }, { "epoch": 2.856973645247261, "grad_norm": 2.3834117533925405, "learning_rate": 1.1757706777023592e-07, "loss": 0.6043, "step": 4824 }, { "epoch": 2.8575658868818476, "grad_norm": 1.9852462249046476, "learning_rate": 1.1660120475342707e-07, "loss": 0.5817, "step": 4825 }, { "epoch": 2.858158128516435, "grad_norm": 3.440130845092385, "learning_rate": 1.1562938460249473e-07, "loss": 0.647, "step": 4826 }, { "epoch": 2.8587503701510215, "grad_norm": 1.5159101594522972, "learning_rate": 1.1466160771496982e-07, "loss": 0.6234, "step": 4827 }, { "epoch": 2.8593426117856087, "grad_norm": 1.7623292425646084, "learning_rate": 1.1369787448672675e-07, "loss": 0.6387, "step": 4828 }, { "epoch": 2.8599348534201954, "grad_norm": 1.1069616291653879, "learning_rate": 1.1273818531198689e-07, "loss": 0.651, "step": 4829 }, { "epoch": 2.860527095054782, "grad_norm": 1.5908965577567722, "learning_rate": 1.1178254058331616e-07, "loss": 0.5953, "step": 4830 }, { "epoch": 2.8611193366893692, "grad_norm": 1.1175423505897313, "learning_rate": 1.1083094069162747e-07, "loss": 0.5764, "step": 4831 }, { "epoch": 2.8617115783239564, "grad_norm": 2.6373508177619223, "learning_rate": 1.0988338602618053e-07, "loss": 0.5694, "step": 4832 }, { "epoch": 2.862303819958543, "grad_norm": 1.5234468544649382, "learning_rate": 1.0893987697457531e-07, "loss": 0.602, "step": 4833 }, { "epoch": 2.86289606159313, "grad_norm": 2.2250099868509743, "learning_rate": 1.0800041392276194e-07, "loss": 0.6286, "step": 4834 }, { "epoch": 2.863488303227717, "grad_norm": 2.59536596351284, "learning_rate": 1.0706499725503306e-07, "loss": 0.6136, "step": 4835 }, { "epoch": 2.8640805448623037, "grad_norm": 1.4278623489444429, "learning_rate": 1.061336273540281e-07, "loss": 0.616, "step": 4836 }, { "epoch": 2.864672786496891, "grad_norm": 4.753893258101949, "learning_rate": 1.0520630460072789e-07, "loss": 0.6092, "step": 4837 }, { "epoch": 2.8652650281314775, "grad_norm": 5.378554417045982, "learning_rate": 1.0428302937445899e-07, "loss": 0.6194, "step": 4838 }, { "epoch": 2.8658572697660647, "grad_norm": 1.8174164225079488, "learning_rate": 1.0336380205289598e-07, "loss": 0.6245, "step": 4839 }, { "epoch": 2.8664495114006514, "grad_norm": 1.8785119619391066, "learning_rate": 1.024486230120525e-07, "loss": 0.6206, "step": 4840 }, { "epoch": 2.867041753035238, "grad_norm": 1.4437126735611092, "learning_rate": 1.0153749262628798e-07, "loss": 0.604, "step": 4841 }, { "epoch": 2.8676339946698253, "grad_norm": 2.6007194776549225, "learning_rate": 1.0063041126830542e-07, "loss": 0.6137, "step": 4842 }, { "epoch": 2.8682262363044124, "grad_norm": 1.7004655793931742, "learning_rate": 9.972737930915576e-08, "loss": 0.6176, "step": 4843 }, { "epoch": 2.868818477938999, "grad_norm": 1.7229767787502699, "learning_rate": 9.882839711822468e-08, "loss": 0.6027, "step": 4844 }, { "epoch": 2.869410719573586, "grad_norm": 2.90523507649429, "learning_rate": 9.793346506325019e-08, "loss": 0.6238, "step": 4845 }, { "epoch": 2.870002961208173, "grad_norm": 2.42243276703994, "learning_rate": 9.704258351030838e-08, "loss": 0.6451, "step": 4846 }, { "epoch": 2.8705952028427597, "grad_norm": 1.7458977764762762, "learning_rate": 9.615575282381995e-08, "loss": 0.5998, "step": 4847 }, { "epoch": 2.871187444477347, "grad_norm": 1.3809462087198652, "learning_rate": 9.527297336654917e-08, "loss": 0.6044, "step": 4848 }, { "epoch": 2.8717796861119336, "grad_norm": 2.8002775238184037, "learning_rate": 9.439424549960164e-08, "loss": 0.5919, "step": 4849 }, { "epoch": 2.8723719277465207, "grad_norm": 1.6723003456861165, "learning_rate": 9.351956958242648e-08, "loss": 0.6054, "step": 4850 }, { "epoch": 2.8729641693811074, "grad_norm": 1.2925306658163453, "learning_rate": 9.264894597281637e-08, "loss": 0.5964, "step": 4851 }, { "epoch": 2.873556411015694, "grad_norm": 1.5873219385850321, "learning_rate": 9.178237502690423e-08, "loss": 0.6179, "step": 4852 }, { "epoch": 2.8741486526502813, "grad_norm": 1.839675287935247, "learning_rate": 9.091985709916651e-08, "loss": 0.6329, "step": 4853 }, { "epoch": 2.8747408942848685, "grad_norm": 1.1428648528723437, "learning_rate": 9.006139254242319e-08, "loss": 0.5924, "step": 4854 }, { "epoch": 2.875333135919455, "grad_norm": 1.5256355451305668, "learning_rate": 8.920698170783226e-08, "loss": 0.572, "step": 4855 }, { "epoch": 2.875925377554042, "grad_norm": 1.4817600093717371, "learning_rate": 8.835662494489638e-08, "loss": 0.653, "step": 4856 }, { "epoch": 2.876517619188629, "grad_norm": 1.9011431000162187, "learning_rate": 8.751032260145841e-08, "loss": 0.6457, "step": 4857 }, { "epoch": 2.8771098608232157, "grad_norm": 1.4632309745360057, "learning_rate": 8.6668075023707e-08, "loss": 0.6112, "step": 4858 }, { "epoch": 2.877702102457803, "grad_norm": 2.231165669864582, "learning_rate": 8.582988255616542e-08, "loss": 0.6457, "step": 4859 }, { "epoch": 2.8782943440923896, "grad_norm": 1.6820428525262583, "learning_rate": 8.499574554170276e-08, "loss": 0.6196, "step": 4860 }, { "epoch": 2.8788865857269768, "grad_norm": 2.545677098827465, "learning_rate": 8.416566432152717e-08, "loss": 0.6142, "step": 4861 }, { "epoch": 2.8794788273615635, "grad_norm": 7.836633886663809, "learning_rate": 8.333963923519039e-08, "loss": 0.6608, "step": 4862 }, { "epoch": 2.88007106899615, "grad_norm": 2.1612339210989977, "learning_rate": 8.251767062058102e-08, "loss": 0.5832, "step": 4863 }, { "epoch": 2.8806633106307373, "grad_norm": 4.073221900832398, "learning_rate": 8.169975881393122e-08, "loss": 0.6748, "step": 4864 }, { "epoch": 2.8812555522653245, "grad_norm": 1.332971531493207, "learning_rate": 8.088590414981445e-08, "loss": 0.6198, "step": 4865 }, { "epoch": 2.881847793899911, "grad_norm": 1.322813916282647, "learning_rate": 8.007610696114e-08, "loss": 0.6119, "step": 4866 }, { "epoch": 2.882440035534498, "grad_norm": 1.4855030858592297, "learning_rate": 7.927036757916284e-08, "loss": 0.556, "step": 4867 }, { "epoch": 2.883032277169085, "grad_norm": 1.3953005756472212, "learning_rate": 7.846868633347492e-08, "loss": 0.5986, "step": 4868 }, { "epoch": 2.8836245188036718, "grad_norm": 2.9292427514375365, "learning_rate": 7.767106355200949e-08, "loss": 0.637, "step": 4869 }, { "epoch": 2.884216760438259, "grad_norm": 1.5163064833083102, "learning_rate": 7.687749956103774e-08, "loss": 0.6435, "step": 4870 }, { "epoch": 2.8848090020728456, "grad_norm": 2.053869781792157, "learning_rate": 7.608799468517336e-08, "loss": 0.5933, "step": 4871 }, { "epoch": 2.885401243707433, "grad_norm": 1.1888149397737269, "learning_rate": 7.530254924736691e-08, "loss": 0.6247, "step": 4872 }, { "epoch": 2.8859934853420195, "grad_norm": 1.6691352976148741, "learning_rate": 7.452116356891136e-08, "loss": 0.5899, "step": 4873 }, { "epoch": 2.886585726976606, "grad_norm": 1.9588081253912122, "learning_rate": 7.374383796943663e-08, "loss": 0.6164, "step": 4874 }, { "epoch": 2.8871779686111934, "grad_norm": 1.983783309455853, "learning_rate": 7.297057276691277e-08, "loss": 0.5889, "step": 4875 }, { "epoch": 2.8877702102457805, "grad_norm": 1.2950722515129418, "learning_rate": 7.22013682776479e-08, "loss": 0.5975, "step": 4876 }, { "epoch": 2.888362451880367, "grad_norm": 1.469392578281443, "learning_rate": 7.143622481629031e-08, "loss": 0.5583, "step": 4877 }, { "epoch": 2.888954693514954, "grad_norm": 1.7592018298109806, "learning_rate": 7.067514269582743e-08, "loss": 0.5739, "step": 4878 }, { "epoch": 2.889546935149541, "grad_norm": 1.3423603999327254, "learning_rate": 6.991812222758354e-08, "loss": 0.5982, "step": 4879 }, { "epoch": 2.890139176784128, "grad_norm": 1.6619904015701097, "learning_rate": 6.916516372122429e-08, "loss": 0.65, "step": 4880 }, { "epoch": 2.890731418418715, "grad_norm": 1.287530941188681, "learning_rate": 6.841626748474994e-08, "loss": 0.6254, "step": 4881 }, { "epoch": 2.8913236600533017, "grad_norm": 1.6059010609864288, "learning_rate": 6.767143382450214e-08, "loss": 0.609, "step": 4882 }, { "epoch": 2.891915901687889, "grad_norm": 1.4171569214064548, "learning_rate": 6.693066304516049e-08, "loss": 0.6497, "step": 4883 }, { "epoch": 2.8925081433224755, "grad_norm": 2.499973096967963, "learning_rate": 6.619395544974039e-08, "loss": 0.6378, "step": 4884 }, { "epoch": 2.8931003849570622, "grad_norm": 2.2303635681374527, "learning_rate": 6.546131133959743e-08, "loss": 0.5796, "step": 4885 }, { "epoch": 2.8936926265916494, "grad_norm": 2.4145279843725187, "learning_rate": 6.473273101442412e-08, "loss": 0.6072, "step": 4886 }, { "epoch": 2.8942848682262365, "grad_norm": 1.9679250585944843, "learning_rate": 6.400821477225206e-08, "loss": 0.5963, "step": 4887 }, { "epoch": 2.8948771098608232, "grad_norm": 2.3343239037159074, "learning_rate": 6.32877629094475e-08, "loss": 0.5846, "step": 4888 }, { "epoch": 2.89546935149541, "grad_norm": 1.5461453160570697, "learning_rate": 6.25713757207158e-08, "loss": 0.5769, "step": 4889 }, { "epoch": 2.896061593129997, "grad_norm": 1.0705482530363706, "learning_rate": 6.185905349910038e-08, "loss": 0.6512, "step": 4890 }, { "epoch": 2.896653834764584, "grad_norm": 1.2649633299278897, "learning_rate": 6.115079653598032e-08, "loss": 0.6649, "step": 4891 }, { "epoch": 2.897246076399171, "grad_norm": 1.8470616975035683, "learning_rate": 6.044660512107392e-08, "loss": 0.6089, "step": 4892 }, { "epoch": 2.8978383180337577, "grad_norm": 2.1565586383593662, "learning_rate": 5.974647954243295e-08, "loss": 0.5952, "step": 4893 }, { "epoch": 2.898430559668345, "grad_norm": 1.854936286527562, "learning_rate": 5.905042008645057e-08, "loss": 0.5861, "step": 4894 }, { "epoch": 2.8990228013029316, "grad_norm": 1.935575054571336, "learning_rate": 5.835842703785233e-08, "loss": 0.6239, "step": 4895 }, { "epoch": 2.8996150429375183, "grad_norm": 1.7416078769823777, "learning_rate": 5.7670500679702925e-08, "loss": 0.6335, "step": 4896 }, { "epoch": 2.9002072845721054, "grad_norm": 1.1470576618099952, "learning_rate": 5.698664129340281e-08, "loss": 0.5932, "step": 4897 }, { "epoch": 2.9007995262066926, "grad_norm": 1.2955327547356683, "learning_rate": 5.630684915868934e-08, "loss": 0.5998, "step": 4898 }, { "epoch": 2.9013917678412793, "grad_norm": 3.332322229387898, "learning_rate": 5.5631124553636726e-08, "loss": 0.6397, "step": 4899 }, { "epoch": 2.901984009475866, "grad_norm": 1.3915052794106848, "learning_rate": 5.4959467754651665e-08, "loss": 0.6373, "step": 4900 }, { "epoch": 2.902576251110453, "grad_norm": 1.8217424896661099, "learning_rate": 5.429187903647992e-08, "loss": 0.6303, "step": 4901 }, { "epoch": 2.90316849274504, "grad_norm": 1.5269216925197477, "learning_rate": 5.3628358672205285e-08, "loss": 0.6163, "step": 4902 }, { "epoch": 2.903760734379627, "grad_norm": 2.2464586629579517, "learning_rate": 5.2968906933243966e-08, "loss": 0.6387, "step": 4903 }, { "epoch": 2.9043529760142137, "grad_norm": 1.4028855974264578, "learning_rate": 5.231352408934687e-08, "loss": 0.6465, "step": 4904 }, { "epoch": 2.904945217648801, "grad_norm": 2.456044814128531, "learning_rate": 5.1662210408605084e-08, "loss": 0.5744, "step": 4905 }, { "epoch": 2.9055374592833876, "grad_norm": 1.8315857516887082, "learning_rate": 5.101496615744106e-08, "loss": 0.5918, "step": 4906 }, { "epoch": 2.9061297009179743, "grad_norm": 1.3272661044838177, "learning_rate": 5.0371791600614115e-08, "loss": 0.6366, "step": 4907 }, { "epoch": 2.9067219425525614, "grad_norm": 2.857393405458701, "learning_rate": 4.973268700121936e-08, "loss": 0.5693, "step": 4908 }, { "epoch": 2.9073141841871486, "grad_norm": 2.217648509730232, "learning_rate": 4.9097652620685444e-08, "loss": 0.628, "step": 4909 }, { "epoch": 2.9079064258217353, "grad_norm": 1.286864564902065, "learning_rate": 4.846668871877902e-08, "loss": 0.641, "step": 4910 }, { "epoch": 2.908498667456322, "grad_norm": 1.297605169345718, "learning_rate": 4.783979555359808e-08, "loss": 0.6, "step": 4911 }, { "epoch": 2.909090909090909, "grad_norm": 1.2656130698182047, "learning_rate": 4.721697338157749e-08, "loss": 0.5856, "step": 4912 }, { "epoch": 2.909683150725496, "grad_norm": 1.1801287041757442, "learning_rate": 4.65982224574868e-08, "loss": 0.5315, "step": 4913 }, { "epoch": 2.910275392360083, "grad_norm": 1.7243061122338232, "learning_rate": 4.59835430344302e-08, "loss": 0.5509, "step": 4914 }, { "epoch": 2.9108676339946697, "grad_norm": 3.045143418965923, "learning_rate": 4.537293536384657e-08, "loss": 0.5648, "step": 4915 }, { "epoch": 2.911459875629257, "grad_norm": 1.4709546764859878, "learning_rate": 4.476639969550722e-08, "loss": 0.5994, "step": 4916 }, { "epoch": 2.9120521172638436, "grad_norm": 2.2971830899861776, "learning_rate": 4.416393627752147e-08, "loss": 0.6151, "step": 4917 }, { "epoch": 2.9126443588984303, "grad_norm": 1.832372143366767, "learning_rate": 4.3565545356327734e-08, "loss": 0.6323, "step": 4918 }, { "epoch": 2.9132366005330175, "grad_norm": 2.17503203723051, "learning_rate": 4.2971227176704656e-08, "loss": 0.6108, "step": 4919 }, { "epoch": 2.9138288421676046, "grad_norm": 2.3265306058858797, "learning_rate": 4.2380981981759994e-08, "loss": 0.613, "step": 4920 }, { "epoch": 2.9144210838021913, "grad_norm": 1.367396635079341, "learning_rate": 4.179481001293839e-08, "loss": 0.5899, "step": 4921 }, { "epoch": 2.915013325436778, "grad_norm": 2.8847118428481857, "learning_rate": 4.1212711510015826e-08, "loss": 0.6114, "step": 4922 }, { "epoch": 2.915605567071365, "grad_norm": 1.3502886513621417, "learning_rate": 4.0634686711104043e-08, "loss": 0.6113, "step": 4923 }, { "epoch": 2.916197808705952, "grad_norm": 1.2799009361253797, "learning_rate": 4.006073585264725e-08, "loss": 0.611, "step": 4924 }, { "epoch": 2.916790050340539, "grad_norm": 42.704793158144476, "learning_rate": 3.94908591694243e-08, "loss": 0.6016, "step": 4925 }, { "epoch": 2.9173822919751258, "grad_norm": 3.2359359053380894, "learning_rate": 3.89250568945454e-08, "loss": 0.6014, "step": 4926 }, { "epoch": 2.917974533609713, "grad_norm": 1.5404185217083588, "learning_rate": 3.836332925945874e-08, "loss": 0.6352, "step": 4927 }, { "epoch": 2.9185667752442996, "grad_norm": 1.193895601439141, "learning_rate": 3.7805676493938294e-08, "loss": 0.6499, "step": 4928 }, { "epoch": 2.9191590168788863, "grad_norm": 1.7488780731222586, "learning_rate": 3.7252098826098256e-08, "loss": 0.6066, "step": 4929 }, { "epoch": 2.9197512585134735, "grad_norm": 1.3649816816969873, "learning_rate": 3.6702596482381946e-08, "loss": 0.5781, "step": 4930 }, { "epoch": 2.9203435001480607, "grad_norm": 1.4996743472643042, "learning_rate": 3.615716968756733e-08, "loss": 0.6191, "step": 4931 }, { "epoch": 2.9209357417826474, "grad_norm": 2.174241162693474, "learning_rate": 3.5615818664764825e-08, "loss": 0.6638, "step": 4932 }, { "epoch": 2.921527983417234, "grad_norm": 1.0378189780160854, "learning_rate": 3.507854363541619e-08, "loss": 0.6222, "step": 4933 }, { "epoch": 2.9221202250518212, "grad_norm": 1.273845744800928, "learning_rate": 3.454534481929783e-08, "loss": 0.6489, "step": 4934 }, { "epoch": 2.922712466686408, "grad_norm": 1.665710013846029, "learning_rate": 3.4016222434518634e-08, "loss": 0.5685, "step": 4935 }, { "epoch": 2.923304708320995, "grad_norm": 1.59905029910758, "learning_rate": 3.349117669751767e-08, "loss": 0.581, "step": 4936 }, { "epoch": 2.923896949955582, "grad_norm": 1.6375386411169495, "learning_rate": 3.297020782307092e-08, "loss": 0.648, "step": 4937 }, { "epoch": 2.924489191590169, "grad_norm": 2.9794337991351627, "learning_rate": 3.245331602428126e-08, "loss": 0.6716, "step": 4938 }, { "epoch": 2.9250814332247557, "grad_norm": 1.1832076972764827, "learning_rate": 3.194050151258732e-08, "loss": 0.6242, "step": 4939 }, { "epoch": 2.9256736748593424, "grad_norm": 1.4696232417709514, "learning_rate": 3.14317644977602e-08, "loss": 0.6092, "step": 4940 }, { "epoch": 2.9262659164939295, "grad_norm": 1.448707576808793, "learning_rate": 3.09271051879001e-08, "loss": 0.6279, "step": 4941 }, { "epoch": 2.9268581581285167, "grad_norm": 1.6727775055688563, "learning_rate": 3.0426523789442994e-08, "loss": 0.6081, "step": 4942 }, { "epoch": 2.9274503997631034, "grad_norm": 1.3971103869765082, "learning_rate": 2.9930020507153986e-08, "loss": 0.6083, "step": 4943 }, { "epoch": 2.92804264139769, "grad_norm": 2.2833557926974057, "learning_rate": 2.9437595544130615e-08, "loss": 0.5705, "step": 4944 }, { "epoch": 2.9286348830322773, "grad_norm": 3.0762266827362024, "learning_rate": 2.8949249101801747e-08, "loss": 0.575, "step": 4945 }, { "epoch": 2.929227124666864, "grad_norm": 1.5877663565907878, "learning_rate": 2.8464981379929814e-08, "loss": 0.5788, "step": 4946 }, { "epoch": 2.929819366301451, "grad_norm": 1.87679323242289, "learning_rate": 2.7984792576606355e-08, "loss": 0.6044, "step": 4947 }, { "epoch": 2.930411607936038, "grad_norm": 1.579589442857244, "learning_rate": 2.7508682888257587e-08, "loss": 0.6532, "step": 4948 }, { "epoch": 2.931003849570625, "grad_norm": 2.2731032093347125, "learning_rate": 2.7036652509636607e-08, "loss": 0.6169, "step": 4949 }, { "epoch": 2.9315960912052117, "grad_norm": 1.6629613159148906, "learning_rate": 2.6568701633832295e-08, "loss": 0.6204, "step": 4950 }, { "epoch": 2.9321883328397984, "grad_norm": 1.539079572245288, "learning_rate": 2.610483045226264e-08, "loss": 0.6334, "step": 4951 }, { "epoch": 2.9327805744743856, "grad_norm": 3.602880436876509, "learning_rate": 2.5645039154675867e-08, "loss": 0.639, "step": 4952 }, { "epoch": 2.9333728161089727, "grad_norm": 1.987049639752013, "learning_rate": 2.518932792915263e-08, "loss": 0.5984, "step": 4953 }, { "epoch": 2.9339650577435594, "grad_norm": 1.4505610755352987, "learning_rate": 2.4737696962106038e-08, "loss": 0.5792, "step": 4954 }, { "epoch": 2.934557299378146, "grad_norm": 1.5647319892731515, "learning_rate": 2.4290146438277205e-08, "loss": 0.6523, "step": 4955 }, { "epoch": 2.9351495410127333, "grad_norm": 1.6632250660344898, "learning_rate": 2.3846676540739687e-08, "loss": 0.6055, "step": 4956 }, { "epoch": 2.93574178264732, "grad_norm": 6.646406313636889, "learning_rate": 2.3407287450897265e-08, "loss": 0.553, "step": 4957 }, { "epoch": 2.936334024281907, "grad_norm": 1.5348933781298375, "learning_rate": 2.2971979348485053e-08, "loss": 0.5773, "step": 4958 }, { "epoch": 2.936926265916494, "grad_norm": 7.44752974245422, "learning_rate": 2.25407524115695e-08, "loss": 0.6163, "step": 4959 }, { "epoch": 2.937518507551081, "grad_norm": 1.6171911575067186, "learning_rate": 2.2113606816546172e-08, "loss": 0.6417, "step": 4960 }, { "epoch": 2.9381107491856677, "grad_norm": 3.9860985428266287, "learning_rate": 2.169054273814086e-08, "loss": 0.6266, "step": 4961 }, { "epoch": 2.9387029908202544, "grad_norm": 2.1282877407533936, "learning_rate": 2.127156034941069e-08, "loss": 0.6488, "step": 4962 }, { "epoch": 2.9392952324548416, "grad_norm": 1.54117884175081, "learning_rate": 2.085665982174412e-08, "loss": 0.5842, "step": 4963 }, { "epoch": 2.9398874740894287, "grad_norm": 1.6319922889959433, "learning_rate": 2.0445841324856497e-08, "loss": 0.5998, "step": 4964 }, { "epoch": 2.9404797157240155, "grad_norm": 1.9689793022752626, "learning_rate": 2.0039105026798956e-08, "loss": 0.5922, "step": 4965 }, { "epoch": 2.941071957358602, "grad_norm": 1.9910821968525951, "learning_rate": 1.9636451093947296e-08, "loss": 0.6316, "step": 4966 }, { "epoch": 2.9416641989931893, "grad_norm": 1.1742386957010071, "learning_rate": 1.9237879691009764e-08, "loss": 0.615, "step": 4967 }, { "epoch": 2.942256440627776, "grad_norm": 1.9540025838105881, "learning_rate": 1.8843390981024835e-08, "loss": 0.6529, "step": 4968 }, { "epoch": 2.942848682262363, "grad_norm": 1.426910256416881, "learning_rate": 1.84529851253612e-08, "loss": 0.6153, "step": 4969 }, { "epoch": 2.94344092389695, "grad_norm": 1.3438358517937634, "learning_rate": 1.8066662283715562e-08, "loss": 0.628, "step": 4970 }, { "epoch": 2.944033165531537, "grad_norm": 1.3977430066085448, "learning_rate": 1.768442261411707e-08, "loss": 0.5951, "step": 4971 }, { "epoch": 2.9446254071661238, "grad_norm": 1.205901999673902, "learning_rate": 1.7306266272921756e-08, "loss": 0.63, "step": 4972 }, { "epoch": 2.9452176488007105, "grad_norm": 1.4995147029787064, "learning_rate": 1.6932193414817e-08, "loss": 0.6355, "step": 4973 }, { "epoch": 2.9458098904352976, "grad_norm": 1.2679556957348548, "learning_rate": 1.6562204192821507e-08, "loss": 0.6433, "step": 4974 }, { "epoch": 2.9464021320698848, "grad_norm": 10.158096968146605, "learning_rate": 1.619629875827977e-08, "loss": 0.5938, "step": 4975 }, { "epoch": 2.9469943737044715, "grad_norm": 2.776393491326393, "learning_rate": 1.583447726086762e-08, "loss": 0.6448, "step": 4976 }, { "epoch": 2.947586615339058, "grad_norm": 2.679025395136911, "learning_rate": 1.5476739848592216e-08, "loss": 0.6748, "step": 4977 }, { "epoch": 2.9481788569736453, "grad_norm": 1.4748116377281646, "learning_rate": 1.5123086667786502e-08, "loss": 0.598, "step": 4978 }, { "epoch": 2.948771098608232, "grad_norm": 1.4962671971975132, "learning_rate": 1.4773517863114761e-08, "loss": 0.5961, "step": 4979 }, { "epoch": 2.949363340242819, "grad_norm": 1.6298275886188416, "learning_rate": 1.4428033577571498e-08, "loss": 0.6241, "step": 4980 }, { "epoch": 2.949955581877406, "grad_norm": 1.314024889434035, "learning_rate": 1.4086633952478113e-08, "loss": 0.6465, "step": 4981 }, { "epoch": 2.950547823511993, "grad_norm": 1.0907588428254045, "learning_rate": 1.3749319127486228e-08, "loss": 0.5865, "step": 4982 }, { "epoch": 2.95114006514658, "grad_norm": 1.3791495256977004, "learning_rate": 1.341608924057658e-08, "loss": 0.6294, "step": 4983 }, { "epoch": 2.9517323067811665, "grad_norm": 1.42971017695065, "learning_rate": 1.3086944428060132e-08, "loss": 0.5849, "step": 4984 }, { "epoch": 2.9523245484157536, "grad_norm": 1.9580880132072813, "learning_rate": 1.2761884824573634e-08, "loss": 0.6101, "step": 4985 }, { "epoch": 2.952916790050341, "grad_norm": 1.678533797027778, "learning_rate": 1.2440910563086273e-08, "loss": 0.6036, "step": 4986 }, { "epoch": 2.9535090316849275, "grad_norm": 1.8624668233784294, "learning_rate": 1.2124021774894134e-08, "loss": 0.625, "step": 4987 }, { "epoch": 2.954101273319514, "grad_norm": 1.7622666153251665, "learning_rate": 1.181121858962353e-08, "loss": 0.5638, "step": 4988 }, { "epoch": 2.9546935149541014, "grad_norm": 1.4944115685837704, "learning_rate": 1.1502501135225442e-08, "loss": 0.5821, "step": 4989 }, { "epoch": 2.955285756588688, "grad_norm": 2.772196851809732, "learning_rate": 1.1197869537986627e-08, "loss": 0.6126, "step": 4990 }, { "epoch": 2.9558779982232752, "grad_norm": 2.6820552266235147, "learning_rate": 1.089732392251519e-08, "loss": 0.5898, "step": 4991 }, { "epoch": 2.956470239857862, "grad_norm": 1.26193680911539, "learning_rate": 1.0600864411753897e-08, "loss": 0.6345, "step": 4992 }, { "epoch": 2.957062481492449, "grad_norm": 3.7109469716570436, "learning_rate": 1.0308491126969077e-08, "loss": 0.5613, "step": 4993 }, { "epoch": 2.957654723127036, "grad_norm": 1.426244864019141, "learning_rate": 1.0020204187759507e-08, "loss": 0.625, "step": 4994 }, { "epoch": 2.9582469647616225, "grad_norm": 1.3313942225304733, "learning_rate": 9.736003712050857e-09, "loss": 0.5922, "step": 4995 }, { "epoch": 2.9588392063962097, "grad_norm": 3.724539629662786, "learning_rate": 9.455889816095687e-09, "loss": 0.585, "step": 4996 }, { "epoch": 2.959431448030797, "grad_norm": 1.8116214951176344, "learning_rate": 9.179862614476787e-09, "loss": 0.646, "step": 4997 }, { "epoch": 2.9600236896653835, "grad_norm": 1.4870564082539035, "learning_rate": 8.907922220104947e-09, "loss": 0.6339, "step": 4998 }, { "epoch": 2.9606159312999702, "grad_norm": 2.5117803113313917, "learning_rate": 8.640068744220077e-09, "loss": 0.5936, "step": 4999 }, { "epoch": 2.9612081729345574, "grad_norm": 2.643791847377505, "learning_rate": 8.376302296387862e-09, "loss": 0.6284, "step": 5000 }, { "epoch": 2.961800414569144, "grad_norm": 1.5248955556340484, "learning_rate": 8.116622984504219e-09, "loss": 0.608, "step": 5001 }, { "epoch": 2.9623926562037313, "grad_norm": 1.4206625140578077, "learning_rate": 7.861030914791956e-09, "loss": 0.6016, "step": 5002 }, { "epoch": 2.962984897838318, "grad_norm": 2.7467705894682704, "learning_rate": 7.609526191804107e-09, "loss": 0.6142, "step": 5003 }, { "epoch": 2.963577139472905, "grad_norm": 2.227943416338275, "learning_rate": 7.362108918418376e-09, "loss": 0.6039, "step": 5004 }, { "epoch": 2.964169381107492, "grad_norm": 2.2376624114324635, "learning_rate": 7.118779195843806e-09, "loss": 0.6396, "step": 5005 }, { "epoch": 2.9647616227420786, "grad_norm": 2.2974484985519372, "learning_rate": 6.8795371236163315e-09, "loss": 0.6084, "step": 5006 }, { "epoch": 2.9653538643766657, "grad_norm": 2.445011281093722, "learning_rate": 6.64438279959767e-09, "loss": 0.6647, "step": 5007 }, { "epoch": 2.965946106011253, "grad_norm": 2.5322699217659825, "learning_rate": 6.413316319979768e-09, "loss": 0.6158, "step": 5008 }, { "epoch": 2.9665383476458396, "grad_norm": 2.264489636975484, "learning_rate": 6.186337779282569e-09, "loss": 0.6319, "step": 5009 }, { "epoch": 2.9671305892804263, "grad_norm": 1.276152934848022, "learning_rate": 5.9634472703518075e-09, "loss": 0.5986, "step": 5010 }, { "epoch": 2.9677228309150134, "grad_norm": 1.7722200702077937, "learning_rate": 5.744644884363437e-09, "loss": 0.6465, "step": 5011 }, { "epoch": 2.9683150725496, "grad_norm": 2.8327566548651344, "learning_rate": 5.529930710820308e-09, "loss": 0.5873, "step": 5012 }, { "epoch": 2.9689073141841873, "grad_norm": 15.271569350154476, "learning_rate": 5.319304837549943e-09, "loss": 0.5923, "step": 5013 }, { "epoch": 2.969499555818774, "grad_norm": 3.7239511417804705, "learning_rate": 5.112767350713421e-09, "loss": 0.6025, "step": 5014 }, { "epoch": 2.970091797453361, "grad_norm": 1.4648198398137937, "learning_rate": 4.9103183347942725e-09, "loss": 0.6499, "step": 5015 }, { "epoch": 2.970684039087948, "grad_norm": 1.6998540857276536, "learning_rate": 4.711957872606254e-09, "loss": 0.6304, "step": 5016 }, { "epoch": 2.9712762807225346, "grad_norm": 1.6467406630255363, "learning_rate": 4.517686045288905e-09, "loss": 0.6548, "step": 5017 }, { "epoch": 2.9718685223571217, "grad_norm": 1.6033639446909023, "learning_rate": 4.327502932311989e-09, "loss": 0.5407, "step": 5018 }, { "epoch": 2.972460763991709, "grad_norm": 2.013297951719648, "learning_rate": 4.141408611469944e-09, "loss": 0.6036, "step": 5019 }, { "epoch": 2.9730530056262956, "grad_norm": 1.5184280052213182, "learning_rate": 3.959403158885211e-09, "loss": 0.655, "step": 5020 }, { "epoch": 2.9736452472608823, "grad_norm": 1.2805590152228392, "learning_rate": 3.781486649010458e-09, "loss": 0.6361, "step": 5021 }, { "epoch": 2.9742374888954695, "grad_norm": 1.4488055335719843, "learning_rate": 3.607659154621912e-09, "loss": 0.5857, "step": 5022 }, { "epoch": 2.974829730530056, "grad_norm": 1.5067233321604163, "learning_rate": 3.437920746824919e-09, "loss": 0.6314, "step": 5023 }, { "epoch": 2.9754219721646433, "grad_norm": 2.505217366867062, "learning_rate": 3.2722714950517154e-09, "loss": 0.593, "step": 5024 }, { "epoch": 2.97601421379923, "grad_norm": 2.030933501052515, "learning_rate": 3.110711467063654e-09, "loss": 0.6655, "step": 5025 }, { "epoch": 2.976606455433817, "grad_norm": 1.2552202263134016, "learning_rate": 2.95324072894565e-09, "loss": 0.6237, "step": 5026 }, { "epoch": 2.977198697068404, "grad_norm": 1.3223714382480334, "learning_rate": 2.7998593451139534e-09, "loss": 0.5913, "step": 5027 }, { "epoch": 2.9777909387029906, "grad_norm": 1.3538171647456099, "learning_rate": 2.6505673783094875e-09, "loss": 0.6199, "step": 5028 }, { "epoch": 2.9783831803375778, "grad_norm": 1.3318176094069416, "learning_rate": 2.5053648896011804e-09, "loss": 0.5841, "step": 5029 }, { "epoch": 2.978975421972165, "grad_norm": 1.589489686996947, "learning_rate": 2.364251938384854e-09, "loss": 0.6094, "step": 5030 }, { "epoch": 2.9795676636067516, "grad_norm": 1.7400278108274327, "learning_rate": 2.227228582384333e-09, "loss": 0.6143, "step": 5031 }, { "epoch": 2.9801599052413383, "grad_norm": 2.585378676689252, "learning_rate": 2.0942948776481175e-09, "loss": 0.5828, "step": 5032 }, { "epoch": 2.9807521468759255, "grad_norm": 1.7280844207518962, "learning_rate": 1.965450878556041e-09, "loss": 0.6314, "step": 5033 }, { "epoch": 2.981344388510512, "grad_norm": 5.265908624204134, "learning_rate": 1.8406966378103909e-09, "loss": 0.5925, "step": 5034 }, { "epoch": 2.9819366301450994, "grad_norm": 1.4033183600587051, "learning_rate": 1.720032206443678e-09, "loss": 0.6348, "step": 5035 }, { "epoch": 2.982528871779686, "grad_norm": 1.8992093800736356, "learning_rate": 1.6034576338141982e-09, "loss": 0.655, "step": 5036 }, { "epoch": 2.983121113414273, "grad_norm": 1.6151253256582707, "learning_rate": 1.4909729676071405e-09, "loss": 0.5662, "step": 5037 }, { "epoch": 2.98371335504886, "grad_norm": 1.2292587721661237, "learning_rate": 1.3825782538368083e-09, "loss": 0.6521, "step": 5038 }, { "epoch": 2.9843055966834466, "grad_norm": 1.4529622161539926, "learning_rate": 1.278273536839958e-09, "loss": 0.6323, "step": 5039 }, { "epoch": 2.984897838318034, "grad_norm": 1.498906553325035, "learning_rate": 1.178058859285791e-09, "loss": 0.5986, "step": 5040 }, { "epoch": 2.985490079952621, "grad_norm": 1.2660500957601097, "learning_rate": 1.0819342621648522e-09, "loss": 0.5796, "step": 5041 }, { "epoch": 2.9860823215872077, "grad_norm": 1.336021937389783, "learning_rate": 9.898997848001302e-10, "loss": 0.5771, "step": 5042 }, { "epoch": 2.9866745632217944, "grad_norm": 2.936187746887494, "learning_rate": 9.019554648381778e-10, "loss": 0.5746, "step": 5043 }, { "epoch": 2.9872668048563815, "grad_norm": 1.974248988367071, "learning_rate": 8.181013382524416e-10, "loss": 0.596, "step": 5044 }, { "epoch": 2.9878590464909682, "grad_norm": 2.076337021390945, "learning_rate": 7.383374393454823e-10, "loss": 0.6021, "step": 5045 }, { "epoch": 2.9884512881255554, "grad_norm": 1.2554188823022927, "learning_rate": 6.626638007434239e-10, "loss": 0.5964, "step": 5046 }, { "epoch": 2.989043529760142, "grad_norm": 1.4522435596098013, "learning_rate": 5.910804534015046e-10, "loss": 0.6149, "step": 5047 }, { "epoch": 2.9896357713947292, "grad_norm": 1.785638501758462, "learning_rate": 5.23587426601857e-10, "loss": 0.6229, "step": 5048 }, { "epoch": 2.990228013029316, "grad_norm": 1.5511181558630578, "learning_rate": 4.601847479523969e-10, "loss": 0.5953, "step": 5049 }, { "epoch": 2.9908202546639027, "grad_norm": 1.5809877291449523, "learning_rate": 4.008724433890443e-10, "loss": 0.6185, "step": 5050 }, { "epoch": 2.99141249629849, "grad_norm": 1.25867971827954, "learning_rate": 3.4565053717350303e-10, "loss": 0.6286, "step": 5051 }, { "epoch": 2.9920047379330765, "grad_norm": 1.6791689479616938, "learning_rate": 2.94519051895481e-10, "loss": 0.6264, "step": 5052 }, { "epoch": 2.9925969795676637, "grad_norm": 2.2845001847932447, "learning_rate": 2.474780084682493e-10, "loss": 0.6321, "step": 5053 }, { "epoch": 2.9931892212022504, "grad_norm": 2.632181608604146, "learning_rate": 2.0452742613641386e-10, "loss": 0.6344, "step": 5054 }, { "epoch": 2.9937814628368375, "grad_norm": 1.8651242937205437, "learning_rate": 1.6566732246925398e-10, "loss": 0.6184, "step": 5055 }, { "epoch": 2.9943737044714243, "grad_norm": 1.375082869113834, "learning_rate": 1.3089771336072256e-10, "loss": 0.6076, "step": 5056 }, { "epoch": 2.9949659461060114, "grad_norm": 1.3906730573347121, "learning_rate": 1.0021861303610714e-10, "loss": 0.5459, "step": 5057 }, { "epoch": 2.995558187740598, "grad_norm": 1.4138902354298646, "learning_rate": 7.363003404314839e-11, "loss": 0.5922, "step": 5058 }, { "epoch": 2.9961504293751853, "grad_norm": 4.735285490670945, "learning_rate": 5.113198725870128e-11, "loss": 0.643, "step": 5059 }, { "epoch": 2.996742671009772, "grad_norm": 4.0056402967283296, "learning_rate": 3.272448188429422e-11, "loss": 0.5758, "step": 5060 }, { "epoch": 2.9973349126443587, "grad_norm": 1.6787822740596676, "learning_rate": 1.8407525452790454e-11, "loss": 0.5937, "step": 5061 }, { "epoch": 2.997927154278946, "grad_norm": 1.5279759168116307, "learning_rate": 8.181123817285752e-12, "loss": 0.6155, "step": 5062 }, { "epoch": 2.9985193959135326, "grad_norm": 1.977332811161579, "learning_rate": 2.0452811633209224e-12, "loss": 0.6343, "step": 5063 }, { "epoch": 2.9991116375481197, "grad_norm": 1.904074374443596, "learning_rate": 0.0, "loss": 0.595, "step": 5064 }, { "epoch": 2.9991116375481197, "step": 5064, "total_flos": 3794121211510784.0, "train_loss": 0.7367976606951505, "train_runtime": 36668.0035, "train_samples_per_second": 17.677, "train_steps_per_second": 0.138 } ], "logging_steps": 1.0, "max_steps": 5064, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 6000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3794121211510784.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }