diff --git "a/checkpoint-3108/trainer_state.json" "b/checkpoint-3108/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-3108/trainer_state.json" @@ -0,0 +1,21817 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 622, + "global_step": 3108, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 1.2006484270095825, + "learning_rate": 2.5e-06, + "loss": 5.0998, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 5.15006685256958, + "eval_runtime": 28.119, + "eval_samples_per_second": 3.023, + "eval_steps_per_second": 1.529, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 1.6314904689788818, + "learning_rate": 5e-06, + "loss": 5.6374, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 1.3538857698440552, + "learning_rate": 7.5e-06, + "loss": 5.0656, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 0.9769891500473022, + "learning_rate": 1e-05, + "loss": 5.1906, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 1.4604706764221191, + "learning_rate": 1.25e-05, + "loss": 5.1107, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 1.2502201795578003, + "learning_rate": 1.5e-05, + "loss": 4.8447, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 1.1797655820846558, + "learning_rate": 1.7500000000000002e-05, + "loss": 5.0042, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 1.3177151679992676, + "learning_rate": 2e-05, + "loss": 4.878, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 1.1382358074188232, + "learning_rate": 2.2499999999999998e-05, + "loss": 5.106, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 1.1523857116699219, + "learning_rate": 2.5e-05, + "loss": 5.241, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 0.994879961013794, + "learning_rate": 2.75e-05, + "loss": 4.4475, + "step": 11 + }, + { + "epoch": 0.01, + "grad_norm": 1.0009862184524536, + "learning_rate": 3e-05, + "loss": 4.615, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 1.1249079704284668, + "learning_rate": 3.2500000000000004e-05, + "loss": 4.3469, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 1.1036263704299927, + "learning_rate": 3.5000000000000004e-05, + "loss": 4.7748, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 1.1240339279174805, + "learning_rate": 3.75e-05, + "loss": 4.8696, + "step": 15 + }, + { + "epoch": 0.02, + "grad_norm": 0.9816920757293701, + "learning_rate": 4e-05, + "loss": 4.5595, + "step": 16 + }, + { + "epoch": 0.02, + "grad_norm": 1.1084593534469604, + "learning_rate": 4.25e-05, + "loss": 4.2922, + "step": 17 + }, + { + "epoch": 0.02, + "grad_norm": 0.997968316078186, + "learning_rate": 4.4999999999999996e-05, + "loss": 4.3057, + "step": 18 + }, + { + "epoch": 0.02, + "grad_norm": 1.0401185750961304, + "learning_rate": 4.75e-05, + "loss": 4.364, + "step": 19 + }, + { + "epoch": 0.02, + "grad_norm": 1.0804249048233032, + "learning_rate": 5e-05, + "loss": 4.8663, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 1.2384392023086548, + "learning_rate": 5.25e-05, + "loss": 4.2992, + "step": 21 + }, + { + "epoch": 0.02, + "grad_norm": 1.231845736503601, + "learning_rate": 5.5e-05, + "loss": 4.8637, + "step": 22 + }, + { + "epoch": 0.02, + "grad_norm": 1.242187261581421, + "learning_rate": 5.75e-05, + "loss": 3.865, + "step": 23 + }, + { + "epoch": 0.02, + "grad_norm": 1.2873363494873047, + "learning_rate": 6e-05, + "loss": 3.8585, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 1.1170119047164917, + "learning_rate": 6.25e-05, + "loss": 3.8948, + "step": 25 + }, + { + "epoch": 0.03, + "grad_norm": 1.1938223838806152, + "learning_rate": 6.500000000000001e-05, + "loss": 3.8009, + "step": 26 + }, + { + "epoch": 0.03, + "grad_norm": 1.3068759441375732, + "learning_rate": 6.75e-05, + "loss": 3.6811, + "step": 27 + }, + { + "epoch": 0.03, + "grad_norm": 1.280858039855957, + "learning_rate": 7.000000000000001e-05, + "loss": 3.9975, + "step": 28 + }, + { + "epoch": 0.03, + "grad_norm": 1.3869880437850952, + "learning_rate": 7.25e-05, + "loss": 3.5562, + "step": 29 + }, + { + "epoch": 0.03, + "grad_norm": 1.3908005952835083, + "learning_rate": 7.5e-05, + "loss": 3.296, + "step": 30 + }, + { + "epoch": 0.03, + "grad_norm": 1.622360348701477, + "learning_rate": 7.75e-05, + "loss": 3.8233, + "step": 31 + }, + { + "epoch": 0.03, + "grad_norm": 1.5797035694122314, + "learning_rate": 8e-05, + "loss": 3.6274, + "step": 32 + }, + { + "epoch": 0.03, + "grad_norm": 1.816627025604248, + "learning_rate": 8.25e-05, + "loss": 3.3872, + "step": 33 + }, + { + "epoch": 0.03, + "grad_norm": 2.1191844940185547, + "learning_rate": 8.5e-05, + "loss": 3.5982, + "step": 34 + }, + { + "epoch": 0.03, + "grad_norm": 1.9559025764465332, + "learning_rate": 8.75e-05, + "loss": 3.5972, + "step": 35 + }, + { + "epoch": 0.03, + "grad_norm": 1.6593563556671143, + "learning_rate": 8.999999999999999e-05, + "loss": 3.3982, + "step": 36 + }, + { + "epoch": 0.04, + "grad_norm": 2.0360360145568848, + "learning_rate": 9.25e-05, + "loss": 3.5185, + "step": 37 + }, + { + "epoch": 0.04, + "grad_norm": 1.9218908548355103, + "learning_rate": 9.5e-05, + "loss": 3.5998, + "step": 38 + }, + { + "epoch": 0.04, + "grad_norm": 1.6969695091247559, + "learning_rate": 9.750000000000001e-05, + "loss": 3.2894, + "step": 39 + }, + { + "epoch": 0.04, + "grad_norm": 1.6107378005981445, + "learning_rate": 0.0001, + "loss": 3.3074, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 1.6574798822402954, + "learning_rate": 0.0001025, + "loss": 3.4148, + "step": 41 + }, + { + "epoch": 0.04, + "grad_norm": 1.8693658113479614, + "learning_rate": 0.000105, + "loss": 3.4726, + "step": 42 + }, + { + "epoch": 0.04, + "grad_norm": 1.4956166744232178, + "learning_rate": 0.0001075, + "loss": 2.9462, + "step": 43 + }, + { + "epoch": 0.04, + "grad_norm": 1.6204662322998047, + "learning_rate": 0.00011, + "loss": 3.1303, + "step": 44 + }, + { + "epoch": 0.04, + "grad_norm": 1.8228565454483032, + "learning_rate": 0.00011250000000000001, + "loss": 3.9787, + "step": 45 + }, + { + "epoch": 0.04, + "grad_norm": 1.6097328662872314, + "learning_rate": 0.000115, + "loss": 2.9556, + "step": 46 + }, + { + "epoch": 0.05, + "grad_norm": 1.843204140663147, + "learning_rate": 0.0001175, + "loss": 3.0469, + "step": 47 + }, + { + "epoch": 0.05, + "grad_norm": 1.8468220233917236, + "learning_rate": 0.00012, + "loss": 3.1549, + "step": 48 + }, + { + "epoch": 0.05, + "grad_norm": 1.8635084629058838, + "learning_rate": 0.0001225, + "loss": 3.2309, + "step": 49 + }, + { + "epoch": 0.05, + "grad_norm": 1.9674320220947266, + "learning_rate": 0.000125, + "loss": 3.264, + "step": 50 + }, + { + "epoch": 0.05, + "grad_norm": 1.514456868171692, + "learning_rate": 0.0001275, + "loss": 2.895, + "step": 51 + }, + { + "epoch": 0.05, + "grad_norm": 1.8026971817016602, + "learning_rate": 0.00013000000000000002, + "loss": 2.9692, + "step": 52 + }, + { + "epoch": 0.05, + "grad_norm": 1.872318983078003, + "learning_rate": 0.00013250000000000002, + "loss": 3.7424, + "step": 53 + }, + { + "epoch": 0.05, + "grad_norm": 1.7249802350997925, + "learning_rate": 0.000135, + "loss": 2.941, + "step": 54 + }, + { + "epoch": 0.05, + "grad_norm": 1.677011489868164, + "learning_rate": 0.0001375, + "loss": 3.3312, + "step": 55 + }, + { + "epoch": 0.05, + "grad_norm": 1.9033194780349731, + "learning_rate": 0.00014000000000000001, + "loss": 2.5066, + "step": 56 + }, + { + "epoch": 0.06, + "grad_norm": 1.9987455606460571, + "learning_rate": 0.0001425, + "loss": 3.4693, + "step": 57 + }, + { + "epoch": 0.06, + "grad_norm": 1.9788786172866821, + "learning_rate": 0.000145, + "loss": 2.8805, + "step": 58 + }, + { + "epoch": 0.06, + "grad_norm": 1.7737230062484741, + "learning_rate": 0.0001475, + "loss": 2.8758, + "step": 59 + }, + { + "epoch": 0.06, + "grad_norm": 1.7360577583312988, + "learning_rate": 0.00015, + "loss": 3.4, + "step": 60 + }, + { + "epoch": 0.06, + "grad_norm": 1.982911229133606, + "learning_rate": 0.0001525, + "loss": 2.779, + "step": 61 + }, + { + "epoch": 0.06, + "grad_norm": 1.9741348028182983, + "learning_rate": 0.000155, + "loss": 3.3381, + "step": 62 + }, + { + "epoch": 0.06, + "grad_norm": 2.060384750366211, + "learning_rate": 0.0001575, + "loss": 3.6285, + "step": 63 + }, + { + "epoch": 0.06, + "grad_norm": 1.6591861248016357, + "learning_rate": 0.00016, + "loss": 3.1691, + "step": 64 + }, + { + "epoch": 0.06, + "grad_norm": 1.8308738470077515, + "learning_rate": 0.00016250000000000002, + "loss": 3.3354, + "step": 65 + }, + { + "epoch": 0.06, + "grad_norm": 1.8891240358352661, + "learning_rate": 0.000165, + "loss": 3.2649, + "step": 66 + }, + { + "epoch": 0.06, + "grad_norm": 1.697945237159729, + "learning_rate": 0.0001675, + "loss": 2.8464, + "step": 67 + }, + { + "epoch": 0.07, + "grad_norm": 1.6797360181808472, + "learning_rate": 0.00017, + "loss": 3.3645, + "step": 68 + }, + { + "epoch": 0.07, + "grad_norm": 1.964936375617981, + "learning_rate": 0.0001725, + "loss": 2.6052, + "step": 69 + }, + { + "epoch": 0.07, + "grad_norm": 2.081937313079834, + "learning_rate": 0.000175, + "loss": 3.3901, + "step": 70 + }, + { + "epoch": 0.07, + "grad_norm": 1.8318378925323486, + "learning_rate": 0.0001775, + "loss": 3.2655, + "step": 71 + }, + { + "epoch": 0.07, + "grad_norm": 1.757249116897583, + "learning_rate": 0.00017999999999999998, + "loss": 3.254, + "step": 72 + }, + { + "epoch": 0.07, + "grad_norm": 1.8252121210098267, + "learning_rate": 0.0001825, + "loss": 2.5852, + "step": 73 + }, + { + "epoch": 0.07, + "grad_norm": 2.0867161750793457, + "learning_rate": 0.000185, + "loss": 2.8474, + "step": 74 + }, + { + "epoch": 0.07, + "grad_norm": 2.1990573406219482, + "learning_rate": 0.0001875, + "loss": 2.5813, + "step": 75 + }, + { + "epoch": 0.07, + "grad_norm": 2.1143388748168945, + "learning_rate": 0.00019, + "loss": 3.1759, + "step": 76 + }, + { + "epoch": 0.07, + "grad_norm": 2.1437313556671143, + "learning_rate": 0.00019250000000000002, + "loss": 2.7499, + "step": 77 + }, + { + "epoch": 0.08, + "grad_norm": 2.0984110832214355, + "learning_rate": 0.00019500000000000002, + "loss": 2.6248, + "step": 78 + }, + { + "epoch": 0.08, + "grad_norm": 2.28216290473938, + "learning_rate": 0.0001975, + "loss": 2.5725, + "step": 79 + }, + { + "epoch": 0.08, + "grad_norm": 1.966034173965454, + "learning_rate": 0.0002, + "loss": 3.0547, + "step": 80 + }, + { + "epoch": 0.08, + "grad_norm": 2.117560625076294, + "learning_rate": 0.00020250000000000002, + "loss": 3.1316, + "step": 81 + }, + { + "epoch": 0.08, + "grad_norm": 1.8826152086257935, + "learning_rate": 0.000205, + "loss": 2.8674, + "step": 82 + }, + { + "epoch": 0.08, + "grad_norm": 1.997240424156189, + "learning_rate": 0.0002075, + "loss": 2.4458, + "step": 83 + }, + { + "epoch": 0.08, + "grad_norm": 1.6871142387390137, + "learning_rate": 0.00021, + "loss": 2.814, + "step": 84 + }, + { + "epoch": 0.08, + "grad_norm": 2.066634178161621, + "learning_rate": 0.0002125, + "loss": 3.2066, + "step": 85 + }, + { + "epoch": 0.08, + "grad_norm": 1.9600077867507935, + "learning_rate": 0.000215, + "loss": 3.2272, + "step": 86 + }, + { + "epoch": 0.08, + "grad_norm": 2.1469485759735107, + "learning_rate": 0.0002175, + "loss": 3.4447, + "step": 87 + }, + { + "epoch": 0.08, + "grad_norm": 1.944505214691162, + "learning_rate": 0.00022, + "loss": 2.5898, + "step": 88 + }, + { + "epoch": 0.09, + "grad_norm": 2.3109183311462402, + "learning_rate": 0.00022250000000000001, + "loss": 2.771, + "step": 89 + }, + { + "epoch": 0.09, + "grad_norm": 2.174560070037842, + "learning_rate": 0.00022500000000000002, + "loss": 3.0631, + "step": 90 + }, + { + "epoch": 0.09, + "grad_norm": 1.8415343761444092, + "learning_rate": 0.0002275, + "loss": 3.123, + "step": 91 + }, + { + "epoch": 0.09, + "grad_norm": 2.104952096939087, + "learning_rate": 0.00023, + "loss": 3.1166, + "step": 92 + }, + { + "epoch": 0.09, + "grad_norm": 2.3860461711883545, + "learning_rate": 0.0002325, + "loss": 3.048, + "step": 93 + }, + { + "epoch": 0.09, + "grad_norm": 2.132197856903076, + "learning_rate": 0.000235, + "loss": 2.7315, + "step": 94 + }, + { + "epoch": 0.09, + "grad_norm": 1.7788770198822021, + "learning_rate": 0.0002375, + "loss": 3.127, + "step": 95 + }, + { + "epoch": 0.09, + "grad_norm": 1.8100048303604126, + "learning_rate": 0.00024, + "loss": 2.3229, + "step": 96 + }, + { + "epoch": 0.09, + "grad_norm": 1.7176936864852905, + "learning_rate": 0.00024249999999999999, + "loss": 2.8595, + "step": 97 + }, + { + "epoch": 0.09, + "grad_norm": 1.920607089996338, + "learning_rate": 0.000245, + "loss": 2.6285, + "step": 98 + }, + { + "epoch": 0.1, + "grad_norm": 1.8976895809173584, + "learning_rate": 0.0002475, + "loss": 2.3511, + "step": 99 + }, + { + "epoch": 0.1, + "grad_norm": 2.0189640522003174, + "learning_rate": 0.00025, + "loss": 2.7418, + "step": 100 + }, + { + "epoch": 0.1, + "grad_norm": 1.9701827764511108, + "learning_rate": 0.00024999993182517067, + "loss": 2.0604, + "step": 101 + }, + { + "epoch": 0.1, + "grad_norm": 3.252211570739746, + "learning_rate": 0.00024999972730075704, + "loss": 2.6496, + "step": 102 + }, + { + "epoch": 0.1, + "grad_norm": 2.1773853302001953, + "learning_rate": 0.0002499993864269822, + "loss": 2.8563, + "step": 103 + }, + { + "epoch": 0.1, + "grad_norm": 2.173621654510498, + "learning_rate": 0.00024999890920421796, + "loss": 2.5704, + "step": 104 + }, + { + "epoch": 0.1, + "grad_norm": 1.8827345371246338, + "learning_rate": 0.0002499982956329849, + "loss": 2.7194, + "step": 105 + }, + { + "epoch": 0.1, + "grad_norm": 2.4898743629455566, + "learning_rate": 0.00024999754571395234, + "loss": 3.4996, + "step": 106 + }, + { + "epoch": 0.1, + "grad_norm": 2.006110191345215, + "learning_rate": 0.00024999665944793815, + "loss": 2.7179, + "step": 107 + }, + { + "epoch": 0.1, + "grad_norm": 2.0414044857025146, + "learning_rate": 0.00024999563683590924, + "loss": 2.9323, + "step": 108 + }, + { + "epoch": 0.11, + "grad_norm": 2.1107375621795654, + "learning_rate": 0.0002499944778789809, + "loss": 2.4901, + "step": 109 + }, + { + "epoch": 0.11, + "grad_norm": 1.8949179649353027, + "learning_rate": 0.0002499931825784175, + "loss": 2.6771, + "step": 110 + }, + { + "epoch": 0.11, + "grad_norm": 1.852339267730713, + "learning_rate": 0.00024999175093563185, + "loss": 2.5123, + "step": 111 + }, + { + "epoch": 0.11, + "grad_norm": 1.7306336164474487, + "learning_rate": 0.0002499901829521856, + "loss": 2.7333, + "step": 112 + }, + { + "epoch": 0.11, + "grad_norm": 2.1602630615234375, + "learning_rate": 0.000249988478629789, + "loss": 3.2803, + "step": 113 + }, + { + "epoch": 0.11, + "grad_norm": 1.8918505907058716, + "learning_rate": 0.0002499866379703013, + "loss": 2.7467, + "step": 114 + }, + { + "epoch": 0.11, + "grad_norm": 1.9145008325576782, + "learning_rate": 0.0002499846609757302, + "loss": 2.7413, + "step": 115 + }, + { + "epoch": 0.11, + "grad_norm": 1.8250600099563599, + "learning_rate": 0.00024998254764823215, + "loss": 2.5852, + "step": 116 + }, + { + "epoch": 0.11, + "grad_norm": 2.1568727493286133, + "learning_rate": 0.0002499802979901124, + "loss": 2.7673, + "step": 117 + }, + { + "epoch": 0.11, + "grad_norm": 1.7749338150024414, + "learning_rate": 0.0002499779120038249, + "loss": 2.4479, + "step": 118 + }, + { + "epoch": 0.11, + "grad_norm": 1.7486765384674072, + "learning_rate": 0.0002499753896919723, + "loss": 1.8766, + "step": 119 + }, + { + "epoch": 0.12, + "grad_norm": 2.076768159866333, + "learning_rate": 0.0002499727310573059, + "loss": 1.8791, + "step": 120 + }, + { + "epoch": 0.12, + "grad_norm": 1.604339599609375, + "learning_rate": 0.00024996993610272567, + "loss": 2.0573, + "step": 121 + }, + { + "epoch": 0.12, + "grad_norm": 2.116293430328369, + "learning_rate": 0.00024996700483128037, + "loss": 2.7077, + "step": 122 + }, + { + "epoch": 0.12, + "grad_norm": 2.233560800552368, + "learning_rate": 0.0002499639372461675, + "loss": 3.1111, + "step": 123 + }, + { + "epoch": 0.12, + "grad_norm": 1.750420093536377, + "learning_rate": 0.0002499607333507331, + "loss": 2.4861, + "step": 124 + }, + { + "epoch": 0.12, + "grad_norm": 1.8790860176086426, + "learning_rate": 0.000249957393148472, + "loss": 2.5195, + "step": 125 + }, + { + "epoch": 0.12, + "grad_norm": 1.7548282146453857, + "learning_rate": 0.0002499539166430276, + "loss": 2.5072, + "step": 126 + }, + { + "epoch": 0.12, + "grad_norm": 1.8878798484802246, + "learning_rate": 0.0002499503038381922, + "loss": 2.3164, + "step": 127 + }, + { + "epoch": 0.12, + "grad_norm": 2.040093421936035, + "learning_rate": 0.00024994655473790657, + "loss": 2.0806, + "step": 128 + }, + { + "epoch": 0.12, + "grad_norm": 1.9570492506027222, + "learning_rate": 0.0002499426693462602, + "loss": 2.7483, + "step": 129 + }, + { + "epoch": 0.13, + "grad_norm": 1.9189451932907104, + "learning_rate": 0.0002499386476674913, + "loss": 2.4365, + "step": 130 + }, + { + "epoch": 0.13, + "grad_norm": 1.840690016746521, + "learning_rate": 0.0002499344897059867, + "loss": 2.2675, + "step": 131 + }, + { + "epoch": 0.13, + "grad_norm": 2.0497002601623535, + "learning_rate": 0.0002499301954662818, + "loss": 2.3665, + "step": 132 + }, + { + "epoch": 0.13, + "grad_norm": 2.057999849319458, + "learning_rate": 0.0002499257649530609, + "loss": 2.2258, + "step": 133 + }, + { + "epoch": 0.13, + "grad_norm": 2.1040830612182617, + "learning_rate": 0.00024992119817115674, + "loss": 2.9013, + "step": 134 + }, + { + "epoch": 0.13, + "grad_norm": 2.1239287853240967, + "learning_rate": 0.0002499164951255507, + "loss": 2.4111, + "step": 135 + }, + { + "epoch": 0.13, + "grad_norm": 2.246864080429077, + "learning_rate": 0.00024991165582137286, + "loss": 2.1288, + "step": 136 + }, + { + "epoch": 0.13, + "grad_norm": 1.854050874710083, + "learning_rate": 0.000249906680263902, + "loss": 2.308, + "step": 137 + }, + { + "epoch": 0.13, + "grad_norm": 1.7639249563217163, + "learning_rate": 0.0002499015684585654, + "loss": 2.1624, + "step": 138 + }, + { + "epoch": 0.13, + "grad_norm": 1.9622340202331543, + "learning_rate": 0.0002498963204109389, + "loss": 1.7293, + "step": 139 + }, + { + "epoch": 0.14, + "grad_norm": 1.9182345867156982, + "learning_rate": 0.0002498909361267472, + "loss": 2.5676, + "step": 140 + }, + { + "epoch": 0.14, + "grad_norm": 1.546036720275879, + "learning_rate": 0.0002498854156118634, + "loss": 2.0416, + "step": 141 + }, + { + "epoch": 0.14, + "grad_norm": 1.7901947498321533, + "learning_rate": 0.0002498797588723093, + "loss": 2.3867, + "step": 142 + }, + { + "epoch": 0.14, + "grad_norm": 1.682002305984497, + "learning_rate": 0.00024987396591425517, + "loss": 2.2, + "step": 143 + }, + { + "epoch": 0.14, + "grad_norm": 2.1231274604797363, + "learning_rate": 0.00024986803674402003, + "loss": 2.7393, + "step": 144 + }, + { + "epoch": 0.14, + "grad_norm": 1.7583919763565063, + "learning_rate": 0.0002498619713680714, + "loss": 2.2841, + "step": 145 + }, + { + "epoch": 0.14, + "grad_norm": 1.6599968671798706, + "learning_rate": 0.00024985576979302533, + "loss": 1.9753, + "step": 146 + }, + { + "epoch": 0.14, + "grad_norm": 1.5316509008407593, + "learning_rate": 0.00024984943202564655, + "loss": 1.9288, + "step": 147 + }, + { + "epoch": 0.14, + "grad_norm": 1.8190314769744873, + "learning_rate": 0.0002498429580728482, + "loss": 2.2112, + "step": 148 + }, + { + "epoch": 0.14, + "grad_norm": 1.7357500791549683, + "learning_rate": 0.00024983634794169214, + "loss": 2.2501, + "step": 149 + }, + { + "epoch": 0.14, + "grad_norm": 1.8484951257705688, + "learning_rate": 0.0002498296016393886, + "loss": 1.7479, + "step": 150 + }, + { + "epoch": 0.15, + "grad_norm": 1.765267252922058, + "learning_rate": 0.00024982271917329646, + "loss": 2.5703, + "step": 151 + }, + { + "epoch": 0.15, + "grad_norm": 1.6502829790115356, + "learning_rate": 0.0002498157005509231, + "loss": 2.7001, + "step": 152 + }, + { + "epoch": 0.15, + "grad_norm": 2.010906934738159, + "learning_rate": 0.0002498085457799244, + "loss": 2.8633, + "step": 153 + }, + { + "epoch": 0.15, + "grad_norm": 1.850074052810669, + "learning_rate": 0.00024980125486810477, + "loss": 2.2063, + "step": 154 + }, + { + "epoch": 0.15, + "grad_norm": 1.9126758575439453, + "learning_rate": 0.00024979382782341713, + "loss": 2.1976, + "step": 155 + }, + { + "epoch": 0.15, + "grad_norm": 2.0436511039733887, + "learning_rate": 0.00024978626465396286, + "loss": 2.3515, + "step": 156 + }, + { + "epoch": 0.15, + "grad_norm": 2.3156819343566895, + "learning_rate": 0.0002497785653679919, + "loss": 2.7545, + "step": 157 + }, + { + "epoch": 0.15, + "grad_norm": 1.9115028381347656, + "learning_rate": 0.00024977072997390247, + "loss": 2.206, + "step": 158 + }, + { + "epoch": 0.15, + "grad_norm": 1.8286609649658203, + "learning_rate": 0.00024976275848024156, + "loss": 2.371, + "step": 159 + }, + { + "epoch": 0.15, + "grad_norm": 1.6878124475479126, + "learning_rate": 0.0002497546508957044, + "loss": 2.1747, + "step": 160 + }, + { + "epoch": 0.16, + "grad_norm": 1.893005132675171, + "learning_rate": 0.00024974640722913465, + "loss": 1.7254, + "step": 161 + }, + { + "epoch": 0.16, + "grad_norm": 2.11098313331604, + "learning_rate": 0.0002497380274895246, + "loss": 2.5951, + "step": 162 + }, + { + "epoch": 0.16, + "grad_norm": 1.7469336986541748, + "learning_rate": 0.00024972951168601476, + "loss": 2.2754, + "step": 163 + }, + { + "epoch": 0.16, + "grad_norm": 1.835694670677185, + "learning_rate": 0.00024972085982789415, + "loss": 2.1267, + "step": 164 + }, + { + "epoch": 0.16, + "grad_norm": 1.6754062175750732, + "learning_rate": 0.0002497120719246002, + "loss": 1.9974, + "step": 165 + }, + { + "epoch": 0.16, + "grad_norm": 1.8209201097488403, + "learning_rate": 0.0002497031479857188, + "loss": 2.5032, + "step": 166 + }, + { + "epoch": 0.16, + "grad_norm": 1.76840341091156, + "learning_rate": 0.000249694088020984, + "loss": 1.7389, + "step": 167 + }, + { + "epoch": 0.16, + "grad_norm": 1.552156925201416, + "learning_rate": 0.00024968489204027863, + "loss": 1.8085, + "step": 168 + }, + { + "epoch": 0.16, + "grad_norm": 1.761948585510254, + "learning_rate": 0.0002496755600536334, + "loss": 2.0805, + "step": 169 + }, + { + "epoch": 0.16, + "grad_norm": 2.200204372406006, + "learning_rate": 0.0002496660920712277, + "loss": 2.0036, + "step": 170 + }, + { + "epoch": 0.17, + "grad_norm": 1.8660489320755005, + "learning_rate": 0.0002496564881033892, + "loss": 2.1293, + "step": 171 + }, + { + "epoch": 0.17, + "grad_norm": 2.33247709274292, + "learning_rate": 0.00024964674816059393, + "loss": 2.3811, + "step": 172 + }, + { + "epoch": 0.17, + "grad_norm": 1.8773770332336426, + "learning_rate": 0.00024963687225346604, + "loss": 1.9161, + "step": 173 + }, + { + "epoch": 0.17, + "grad_norm": 2.1089746952056885, + "learning_rate": 0.0002496268603927783, + "loss": 2.5818, + "step": 174 + }, + { + "epoch": 0.17, + "grad_norm": 2.0784616470336914, + "learning_rate": 0.00024961671258945156, + "loss": 2.0916, + "step": 175 + }, + { + "epoch": 0.17, + "grad_norm": 1.8207252025604248, + "learning_rate": 0.000249606428854555, + "loss": 2.3204, + "step": 176 + }, + { + "epoch": 0.17, + "grad_norm": 1.9220213890075684, + "learning_rate": 0.00024959600919930607, + "loss": 2.5432, + "step": 177 + }, + { + "epoch": 0.17, + "grad_norm": 1.8817466497421265, + "learning_rate": 0.0002495854536350706, + "loss": 2.4326, + "step": 178 + }, + { + "epoch": 0.17, + "grad_norm": 1.8520444631576538, + "learning_rate": 0.0002495747621733625, + "loss": 1.9673, + "step": 179 + }, + { + "epoch": 0.17, + "grad_norm": 1.7097537517547607, + "learning_rate": 0.00024956393482584397, + "loss": 2.0189, + "step": 180 + }, + { + "epoch": 0.17, + "grad_norm": 1.8573436737060547, + "learning_rate": 0.0002495529716043254, + "loss": 2.0942, + "step": 181 + }, + { + "epoch": 0.18, + "grad_norm": 1.9761507511138916, + "learning_rate": 0.00024954187252076564, + "loss": 2.189, + "step": 182 + }, + { + "epoch": 0.18, + "grad_norm": 1.8549177646636963, + "learning_rate": 0.00024953063758727137, + "loss": 2.0821, + "step": 183 + }, + { + "epoch": 0.18, + "grad_norm": 1.7199348211288452, + "learning_rate": 0.00024951926681609767, + "loss": 1.9875, + "step": 184 + }, + { + "epoch": 0.18, + "grad_norm": 1.6276706457138062, + "learning_rate": 0.00024950776021964775, + "loss": 2.0707, + "step": 185 + }, + { + "epoch": 0.18, + "grad_norm": 1.5845235586166382, + "learning_rate": 0.000249496117810473, + "loss": 1.5413, + "step": 186 + }, + { + "epoch": 0.18, + "grad_norm": 1.7384014129638672, + "learning_rate": 0.00024948433960127284, + "loss": 2.1632, + "step": 187 + }, + { + "epoch": 0.18, + "grad_norm": 2.0509490966796875, + "learning_rate": 0.000249472425604895, + "loss": 1.758, + "step": 188 + }, + { + "epoch": 0.18, + "grad_norm": 1.9983670711517334, + "learning_rate": 0.0002494603758343352, + "loss": 2.5845, + "step": 189 + }, + { + "epoch": 0.18, + "grad_norm": 2.0809295177459717, + "learning_rate": 0.0002494481903027373, + "loss": 2.6146, + "step": 190 + }, + { + "epoch": 0.18, + "grad_norm": 1.7669973373413086, + "learning_rate": 0.0002494358690233933, + "loss": 2.0003, + "step": 191 + }, + { + "epoch": 0.19, + "grad_norm": 1.7728503942489624, + "learning_rate": 0.0002494234120097431, + "loss": 2.0072, + "step": 192 + }, + { + "epoch": 0.19, + "grad_norm": 1.9957607984542847, + "learning_rate": 0.0002494108192753748, + "loss": 2.2449, + "step": 193 + }, + { + "epoch": 0.19, + "grad_norm": 2.0449464321136475, + "learning_rate": 0.0002493980908340246, + "loss": 2.0996, + "step": 194 + }, + { + "epoch": 0.19, + "grad_norm": 1.8009361028671265, + "learning_rate": 0.0002493852266995766, + "loss": 2.0876, + "step": 195 + }, + { + "epoch": 0.19, + "grad_norm": 1.9847387075424194, + "learning_rate": 0.00024937222688606303, + "loss": 1.8041, + "step": 196 + }, + { + "epoch": 0.19, + "grad_norm": 2.0888640880584717, + "learning_rate": 0.00024935909140766393, + "loss": 2.4709, + "step": 197 + }, + { + "epoch": 0.19, + "grad_norm": 1.6796460151672363, + "learning_rate": 0.0002493458202787075, + "loss": 1.9681, + "step": 198 + }, + { + "epoch": 0.19, + "grad_norm": 1.6660256385803223, + "learning_rate": 0.0002493324135136699, + "loss": 2.5484, + "step": 199 + }, + { + "epoch": 0.19, + "grad_norm": 1.8919657468795776, + "learning_rate": 0.0002493188711271751, + "loss": 2.5249, + "step": 200 + }, + { + "epoch": 0.19, + "grad_norm": 1.8183636665344238, + "learning_rate": 0.0002493051931339952, + "loss": 1.8468, + "step": 201 + }, + { + "epoch": 0.19, + "grad_norm": 1.6800963878631592, + "learning_rate": 0.0002492913795490501, + "loss": 1.9648, + "step": 202 + }, + { + "epoch": 0.2, + "grad_norm": 1.8262184858322144, + "learning_rate": 0.00024927743038740747, + "loss": 2.0492, + "step": 203 + }, + { + "epoch": 0.2, + "grad_norm": 2.1556880474090576, + "learning_rate": 0.0002492633456642832, + "loss": 2.4633, + "step": 204 + }, + { + "epoch": 0.2, + "grad_norm": 1.7076188325881958, + "learning_rate": 0.0002492491253950408, + "loss": 2.1612, + "step": 205 + }, + { + "epoch": 0.2, + "grad_norm": 1.8892126083374023, + "learning_rate": 0.0002492347695951917, + "loss": 1.9447, + "step": 206 + }, + { + "epoch": 0.2, + "grad_norm": 1.873687744140625, + "learning_rate": 0.0002492202782803952, + "loss": 2.7744, + "step": 207 + }, + { + "epoch": 0.2, + "grad_norm": 1.7962186336517334, + "learning_rate": 0.0002492056514664583, + "loss": 2.0622, + "step": 208 + }, + { + "epoch": 0.2, + "grad_norm": 1.9481604099273682, + "learning_rate": 0.00024919088916933597, + "loss": 1.9443, + "step": 209 + }, + { + "epoch": 0.2, + "grad_norm": 1.7974705696105957, + "learning_rate": 0.0002491759914051308, + "loss": 2.3544, + "step": 210 + }, + { + "epoch": 0.2, + "grad_norm": 1.5557405948638916, + "learning_rate": 0.00024916095819009336, + "loss": 2.2321, + "step": 211 + }, + { + "epoch": 0.2, + "grad_norm": 1.729710340499878, + "learning_rate": 0.0002491457895406218, + "loss": 1.9179, + "step": 212 + }, + { + "epoch": 0.21, + "grad_norm": 1.6658893823623657, + "learning_rate": 0.00024913048547326193, + "loss": 2.088, + "step": 213 + }, + { + "epoch": 0.21, + "grad_norm": 1.6439865827560425, + "learning_rate": 0.0002491150460047075, + "loss": 1.8413, + "step": 214 + }, + { + "epoch": 0.21, + "grad_norm": 1.595862865447998, + "learning_rate": 0.00024909947115179983, + "loss": 2.0446, + "step": 215 + }, + { + "epoch": 0.21, + "grad_norm": 2.21343994140625, + "learning_rate": 0.00024908376093152784, + "loss": 2.841, + "step": 216 + }, + { + "epoch": 0.21, + "grad_norm": 1.7837032079696655, + "learning_rate": 0.0002490679153610283, + "loss": 1.8496, + "step": 217 + }, + { + "epoch": 0.21, + "grad_norm": 1.6079756021499634, + "learning_rate": 0.00024905193445758545, + "loss": 1.9118, + "step": 218 + }, + { + "epoch": 0.21, + "grad_norm": 1.9322482347488403, + "learning_rate": 0.00024903581823863125, + "loss": 2.2768, + "step": 219 + }, + { + "epoch": 0.21, + "grad_norm": 1.9454126358032227, + "learning_rate": 0.0002490195667217452, + "loss": 2.0224, + "step": 220 + }, + { + "epoch": 0.21, + "grad_norm": 1.8890690803527832, + "learning_rate": 0.00024900317992465447, + "loss": 1.9491, + "step": 221 + }, + { + "epoch": 0.21, + "grad_norm": 1.844327449798584, + "learning_rate": 0.0002489866578652337, + "loss": 2.3316, + "step": 222 + }, + { + "epoch": 0.22, + "grad_norm": 1.9561625719070435, + "learning_rate": 0.00024897000056150505, + "loss": 1.8929, + "step": 223 + }, + { + "epoch": 0.22, + "grad_norm": 1.812215805053711, + "learning_rate": 0.0002489532080316383, + "loss": 1.5681, + "step": 224 + }, + { + "epoch": 0.22, + "grad_norm": 1.7232577800750732, + "learning_rate": 0.0002489362802939507, + "loss": 2.2427, + "step": 225 + }, + { + "epoch": 0.22, + "grad_norm": 1.5524508953094482, + "learning_rate": 0.00024891921736690703, + "loss": 2.1043, + "step": 226 + }, + { + "epoch": 0.22, + "grad_norm": 1.5697345733642578, + "learning_rate": 0.0002489020192691194, + "loss": 1.9664, + "step": 227 + }, + { + "epoch": 0.22, + "grad_norm": 1.5127205848693848, + "learning_rate": 0.0002488846860193475, + "loss": 1.5816, + "step": 228 + }, + { + "epoch": 0.22, + "grad_norm": 2.036536693572998, + "learning_rate": 0.0002488672176364984, + "loss": 2.7449, + "step": 229 + }, + { + "epoch": 0.22, + "grad_norm": 1.9850804805755615, + "learning_rate": 0.0002488496141396265, + "loss": 1.8061, + "step": 230 + }, + { + "epoch": 0.22, + "grad_norm": 2.029428720474243, + "learning_rate": 0.0002488318755479337, + "loss": 1.7861, + "step": 231 + }, + { + "epoch": 0.22, + "grad_norm": 1.6909549236297607, + "learning_rate": 0.00024881400188076923, + "loss": 2.1058, + "step": 232 + }, + { + "epoch": 0.22, + "grad_norm": 1.7147257328033447, + "learning_rate": 0.0002487959931576296, + "loss": 1.6736, + "step": 233 + }, + { + "epoch": 0.23, + "grad_norm": 1.6619194746017456, + "learning_rate": 0.00024877784939815863, + "loss": 1.7423, + "step": 234 + }, + { + "epoch": 0.23, + "grad_norm": 1.7721213102340698, + "learning_rate": 0.0002487595706221476, + "loss": 1.4057, + "step": 235 + }, + { + "epoch": 0.23, + "grad_norm": 1.6098127365112305, + "learning_rate": 0.00024874115684953485, + "loss": 1.6113, + "step": 236 + }, + { + "epoch": 0.23, + "grad_norm": 2.2047505378723145, + "learning_rate": 0.00024872260810040607, + "loss": 2.5387, + "step": 237 + }, + { + "epoch": 0.23, + "grad_norm": 1.6279314756393433, + "learning_rate": 0.0002487039243949943, + "loss": 1.9872, + "step": 238 + }, + { + "epoch": 0.23, + "grad_norm": 1.795262098312378, + "learning_rate": 0.0002486851057536795, + "loss": 1.4461, + "step": 239 + }, + { + "epoch": 0.23, + "grad_norm": 1.9134374856948853, + "learning_rate": 0.00024866615219698915, + "loss": 2.3969, + "step": 240 + }, + { + "epoch": 0.23, + "grad_norm": 1.6883046627044678, + "learning_rate": 0.0002486470637455976, + "loss": 2.2354, + "step": 241 + }, + { + "epoch": 0.23, + "grad_norm": 1.7430942058563232, + "learning_rate": 0.00024862784042032666, + "loss": 1.5288, + "step": 242 + }, + { + "epoch": 0.23, + "grad_norm": 1.6902698278427124, + "learning_rate": 0.00024860848224214486, + "loss": 2.102, + "step": 243 + }, + { + "epoch": 0.24, + "grad_norm": 1.5325380563735962, + "learning_rate": 0.0002485889892321683, + "loss": 1.3993, + "step": 244 + }, + { + "epoch": 0.24, + "grad_norm": 1.5996594429016113, + "learning_rate": 0.00024856936141165963, + "loss": 1.586, + "step": 245 + }, + { + "epoch": 0.24, + "grad_norm": 1.9675129652023315, + "learning_rate": 0.00024854959880202905, + "loss": 2.6326, + "step": 246 + }, + { + "epoch": 0.24, + "grad_norm": 1.9281418323516846, + "learning_rate": 0.00024852970142483346, + "loss": 2.3056, + "step": 247 + }, + { + "epoch": 0.24, + "grad_norm": 2.0125837326049805, + "learning_rate": 0.00024850966930177687, + "loss": 2.2092, + "step": 248 + }, + { + "epoch": 0.24, + "grad_norm": 1.904648780822754, + "learning_rate": 0.00024848950245471023, + "loss": 2.1627, + "step": 249 + }, + { + "epoch": 0.24, + "grad_norm": 1.7093470096588135, + "learning_rate": 0.00024846920090563156, + "loss": 1.9851, + "step": 250 + }, + { + "epoch": 0.24, + "grad_norm": 1.7834309339523315, + "learning_rate": 0.0002484487646766857, + "loss": 2.112, + "step": 251 + }, + { + "epoch": 0.24, + "grad_norm": 1.5248178243637085, + "learning_rate": 0.0002484281937901644, + "loss": 1.426, + "step": 252 + }, + { + "epoch": 0.24, + "grad_norm": 1.4602184295654297, + "learning_rate": 0.0002484074882685063, + "loss": 1.3696, + "step": 253 + }, + { + "epoch": 0.25, + "grad_norm": 1.773917555809021, + "learning_rate": 0.0002483866481342971, + "loss": 2.1261, + "step": 254 + }, + { + "epoch": 0.25, + "grad_norm": 1.692901611328125, + "learning_rate": 0.000248365673410269, + "loss": 2.3204, + "step": 255 + }, + { + "epoch": 0.25, + "grad_norm": 1.8946104049682617, + "learning_rate": 0.0002483445641193012, + "loss": 1.8685, + "step": 256 + }, + { + "epoch": 0.25, + "grad_norm": 1.919169306755066, + "learning_rate": 0.0002483233202844197, + "loss": 2.0433, + "step": 257 + }, + { + "epoch": 0.25, + "grad_norm": 1.84755277633667, + "learning_rate": 0.00024830194192879715, + "loss": 2.2363, + "step": 258 + }, + { + "epoch": 0.25, + "grad_norm": 2.112445116043091, + "learning_rate": 0.00024828042907575304, + "loss": 2.6892, + "step": 259 + }, + { + "epoch": 0.25, + "grad_norm": 1.6115814447402954, + "learning_rate": 0.0002482587817487536, + "loss": 1.8159, + "step": 260 + }, + { + "epoch": 0.25, + "grad_norm": 1.6731853485107422, + "learning_rate": 0.00024823699997141154, + "loss": 2.1162, + "step": 261 + }, + { + "epoch": 0.25, + "grad_norm": 1.5666356086730957, + "learning_rate": 0.0002482150837674864, + "loss": 1.8052, + "step": 262 + }, + { + "epoch": 0.25, + "grad_norm": 1.5038379430770874, + "learning_rate": 0.0002481930331608844, + "loss": 1.8459, + "step": 263 + }, + { + "epoch": 0.25, + "grad_norm": 1.5254186391830444, + "learning_rate": 0.00024817084817565827, + "loss": 1.9475, + "step": 264 + }, + { + "epoch": 0.26, + "grad_norm": 1.7081129550933838, + "learning_rate": 0.0002481485288360072, + "loss": 2.4121, + "step": 265 + }, + { + "epoch": 0.26, + "grad_norm": 1.9189531803131104, + "learning_rate": 0.0002481260751662772, + "loss": 2.4579, + "step": 266 + }, + { + "epoch": 0.26, + "grad_norm": 1.6706724166870117, + "learning_rate": 0.00024810348719096065, + "loss": 1.6386, + "step": 267 + }, + { + "epoch": 0.26, + "grad_norm": 1.539415717124939, + "learning_rate": 0.0002480807649346964, + "loss": 1.6492, + "step": 268 + }, + { + "epoch": 0.26, + "grad_norm": 1.747955560684204, + "learning_rate": 0.00024805790842226985, + "loss": 1.6512, + "step": 269 + }, + { + "epoch": 0.26, + "grad_norm": 1.6738513708114624, + "learning_rate": 0.0002480349176786128, + "loss": 1.7555, + "step": 270 + }, + { + "epoch": 0.26, + "grad_norm": 1.9724032878875732, + "learning_rate": 0.0002480117927288035, + "loss": 1.6291, + "step": 271 + }, + { + "epoch": 0.26, + "grad_norm": 1.6531236171722412, + "learning_rate": 0.00024798853359806665, + "loss": 1.6063, + "step": 272 + }, + { + "epoch": 0.26, + "grad_norm": 1.9296516180038452, + "learning_rate": 0.0002479651403117732, + "loss": 1.4847, + "step": 273 + }, + { + "epoch": 0.26, + "grad_norm": 1.7940136194229126, + "learning_rate": 0.0002479416128954404, + "loss": 2.4839, + "step": 274 + }, + { + "epoch": 0.27, + "grad_norm": 2.010266065597534, + "learning_rate": 0.00024791795137473204, + "loss": 1.7243, + "step": 275 + }, + { + "epoch": 0.27, + "grad_norm": 1.8504762649536133, + "learning_rate": 0.00024789415577545793, + "loss": 1.9652, + "step": 276 + }, + { + "epoch": 0.27, + "grad_norm": 1.4306342601776123, + "learning_rate": 0.0002478702261235743, + "loss": 1.5664, + "step": 277 + }, + { + "epoch": 0.27, + "grad_norm": 1.6604520082473755, + "learning_rate": 0.0002478461624451835, + "loss": 2.0659, + "step": 278 + }, + { + "epoch": 0.27, + "grad_norm": 2.0522496700286865, + "learning_rate": 0.0002478219647665342, + "loss": 1.9088, + "step": 279 + }, + { + "epoch": 0.27, + "grad_norm": 1.6429219245910645, + "learning_rate": 0.0002477976331140211, + "loss": 1.7491, + "step": 280 + }, + { + "epoch": 0.27, + "grad_norm": 1.397255301475525, + "learning_rate": 0.00024777316751418515, + "loss": 1.3155, + "step": 281 + }, + { + "epoch": 0.27, + "grad_norm": 1.5616679191589355, + "learning_rate": 0.00024774856799371326, + "loss": 1.8068, + "step": 282 + }, + { + "epoch": 0.27, + "grad_norm": 1.7590296268463135, + "learning_rate": 0.00024772383457943864, + "loss": 1.7305, + "step": 283 + }, + { + "epoch": 0.27, + "grad_norm": 1.6376055479049683, + "learning_rate": 0.00024769896729834036, + "loss": 1.7105, + "step": 284 + }, + { + "epoch": 0.28, + "grad_norm": 1.385542392730713, + "learning_rate": 0.00024767396617754364, + "loss": 1.605, + "step": 285 + }, + { + "epoch": 0.28, + "grad_norm": 1.827626347541809, + "learning_rate": 0.0002476488312443195, + "loss": 2.032, + "step": 286 + }, + { + "epoch": 0.28, + "grad_norm": 2.1502676010131836, + "learning_rate": 0.00024762356252608527, + "loss": 1.9228, + "step": 287 + }, + { + "epoch": 0.28, + "grad_norm": 1.5369927883148193, + "learning_rate": 0.00024759816005040384, + "loss": 1.3886, + "step": 288 + }, + { + "epoch": 0.28, + "grad_norm": 1.7193357944488525, + "learning_rate": 0.0002475726238449842, + "loss": 2.0007, + "step": 289 + }, + { + "epoch": 0.28, + "grad_norm": 2.561558246612549, + "learning_rate": 0.00024754695393768114, + "loss": 2.6965, + "step": 290 + }, + { + "epoch": 0.28, + "grad_norm": 2.2155981063842773, + "learning_rate": 0.0002475211503564954, + "loss": 1.678, + "step": 291 + }, + { + "epoch": 0.28, + "grad_norm": 1.59506356716156, + "learning_rate": 0.00024749521312957337, + "loss": 1.7036, + "step": 292 + }, + { + "epoch": 0.28, + "grad_norm": 1.5700795650482178, + "learning_rate": 0.0002474691422852074, + "loss": 1.5835, + "step": 293 + }, + { + "epoch": 0.28, + "grad_norm": 1.8655942678451538, + "learning_rate": 0.00024744293785183537, + "loss": 1.6957, + "step": 294 + }, + { + "epoch": 0.28, + "grad_norm": 1.434198260307312, + "learning_rate": 0.00024741659985804115, + "loss": 1.5576, + "step": 295 + }, + { + "epoch": 0.29, + "grad_norm": 1.4937015771865845, + "learning_rate": 0.0002473901283325541, + "loss": 1.7204, + "step": 296 + }, + { + "epoch": 0.29, + "grad_norm": 1.9438872337341309, + "learning_rate": 0.00024736352330424923, + "loss": 2.2574, + "step": 297 + }, + { + "epoch": 0.29, + "grad_norm": 1.8500808477401733, + "learning_rate": 0.0002473367848021473, + "loss": 2.2356, + "step": 298 + }, + { + "epoch": 0.29, + "grad_norm": 1.6820452213287354, + "learning_rate": 0.00024730991285541455, + "loss": 2.7996, + "step": 299 + }, + { + "epoch": 0.29, + "grad_norm": 1.6098867654800415, + "learning_rate": 0.0002472829074933628, + "loss": 2.0764, + "step": 300 + }, + { + "epoch": 0.29, + "grad_norm": 1.763376235961914, + "learning_rate": 0.00024725576874544956, + "loss": 2.1674, + "step": 301 + }, + { + "epoch": 0.29, + "grad_norm": 1.7412651777267456, + "learning_rate": 0.0002472284966412776, + "loss": 2.2091, + "step": 302 + }, + { + "epoch": 0.29, + "grad_norm": 1.690738320350647, + "learning_rate": 0.00024720109121059524, + "loss": 2.1094, + "step": 303 + }, + { + "epoch": 0.29, + "grad_norm": 1.3498783111572266, + "learning_rate": 0.00024717355248329625, + "loss": 1.8, + "step": 304 + }, + { + "epoch": 0.29, + "grad_norm": 1.6631073951721191, + "learning_rate": 0.00024714588048941987, + "loss": 2.251, + "step": 305 + }, + { + "epoch": 0.3, + "grad_norm": 1.7227246761322021, + "learning_rate": 0.0002471180752591506, + "loss": 1.8822, + "step": 306 + }, + { + "epoch": 0.3, + "grad_norm": 1.7856996059417725, + "learning_rate": 0.00024709013682281826, + "loss": 1.914, + "step": 307 + }, + { + "epoch": 0.3, + "grad_norm": 2.0128395557403564, + "learning_rate": 0.0002470620652108981, + "loss": 2.0113, + "step": 308 + }, + { + "epoch": 0.3, + "grad_norm": 1.4962693452835083, + "learning_rate": 0.00024703386045401047, + "loss": 1.7985, + "step": 309 + }, + { + "epoch": 0.3, + "grad_norm": 1.5838319063186646, + "learning_rate": 0.0002470055225829211, + "loss": 1.5443, + "step": 310 + }, + { + "epoch": 0.3, + "grad_norm": 1.6290487051010132, + "learning_rate": 0.0002469770516285409, + "loss": 1.6863, + "step": 311 + }, + { + "epoch": 0.3, + "grad_norm": 1.46754789352417, + "learning_rate": 0.0002469484476219259, + "loss": 1.6632, + "step": 312 + }, + { + "epoch": 0.3, + "grad_norm": 1.8494882583618164, + "learning_rate": 0.00024691971059427717, + "loss": 1.7439, + "step": 313 + }, + { + "epoch": 0.3, + "grad_norm": 1.868640661239624, + "learning_rate": 0.000246890840576941, + "loss": 2.0376, + "step": 314 + }, + { + "epoch": 0.3, + "grad_norm": 1.553097128868103, + "learning_rate": 0.0002468618376014088, + "loss": 1.9508, + "step": 315 + }, + { + "epoch": 0.31, + "grad_norm": 1.6028902530670166, + "learning_rate": 0.000246832701699317, + "loss": 1.5764, + "step": 316 + }, + { + "epoch": 0.31, + "grad_norm": 2.1267387866973877, + "learning_rate": 0.0002468034329024468, + "loss": 1.8886, + "step": 317 + }, + { + "epoch": 0.31, + "grad_norm": 1.5905228853225708, + "learning_rate": 0.00024677403124272456, + "loss": 1.8134, + "step": 318 + }, + { + "epoch": 0.31, + "grad_norm": 1.4769141674041748, + "learning_rate": 0.0002467444967522216, + "loss": 1.2115, + "step": 319 + }, + { + "epoch": 0.31, + "grad_norm": 1.4753649234771729, + "learning_rate": 0.000246714829463154, + "loss": 1.653, + "step": 320 + }, + { + "epoch": 0.31, + "grad_norm": 1.6038315296173096, + "learning_rate": 0.0002466850294078828, + "loss": 2.1011, + "step": 321 + }, + { + "epoch": 0.31, + "grad_norm": 1.7277235984802246, + "learning_rate": 0.00024665509661891385, + "loss": 2.1318, + "step": 322 + }, + { + "epoch": 0.31, + "grad_norm": 1.6639180183410645, + "learning_rate": 0.0002466250311288977, + "loss": 1.6277, + "step": 323 + }, + { + "epoch": 0.31, + "grad_norm": 1.660370111465454, + "learning_rate": 0.00024659483297062964, + "loss": 2.112, + "step": 324 + }, + { + "epoch": 0.31, + "grad_norm": 1.7722567319869995, + "learning_rate": 0.0002465645021770499, + "loss": 2.0158, + "step": 325 + }, + { + "epoch": 0.31, + "grad_norm": 1.559173583984375, + "learning_rate": 0.00024653403878124305, + "loss": 1.8675, + "step": 326 + }, + { + "epoch": 0.32, + "grad_norm": 1.679623007774353, + "learning_rate": 0.0002465034428164386, + "loss": 1.844, + "step": 327 + }, + { + "epoch": 0.32, + "grad_norm": 1.72882878780365, + "learning_rate": 0.00024647271431601055, + "loss": 2.2323, + "step": 328 + }, + { + "epoch": 0.32, + "grad_norm": 1.4677948951721191, + "learning_rate": 0.00024644185331347735, + "loss": 1.8718, + "step": 329 + }, + { + "epoch": 0.32, + "grad_norm": 1.5333250761032104, + "learning_rate": 0.00024641085984250223, + "loss": 2.0585, + "step": 330 + }, + { + "epoch": 0.32, + "grad_norm": 1.307824730873108, + "learning_rate": 0.0002463797339368927, + "loss": 1.8447, + "step": 331 + }, + { + "epoch": 0.32, + "grad_norm": 1.6973696947097778, + "learning_rate": 0.0002463484756306009, + "loss": 1.5235, + "step": 332 + }, + { + "epoch": 0.32, + "grad_norm": 1.3276690244674683, + "learning_rate": 0.0002463170849577232, + "loss": 1.9472, + "step": 333 + }, + { + "epoch": 0.32, + "grad_norm": 1.557236671447754, + "learning_rate": 0.0002462855619525005, + "loss": 1.6783, + "step": 334 + }, + { + "epoch": 0.32, + "grad_norm": 1.6383414268493652, + "learning_rate": 0.000246253906649318, + "loss": 1.4636, + "step": 335 + }, + { + "epoch": 0.32, + "grad_norm": 1.4756358861923218, + "learning_rate": 0.0002462221190827053, + "loss": 1.4215, + "step": 336 + }, + { + "epoch": 0.33, + "grad_norm": 1.6746768951416016, + "learning_rate": 0.000246190199287336, + "loss": 1.7268, + "step": 337 + }, + { + "epoch": 0.33, + "grad_norm": 1.8532724380493164, + "learning_rate": 0.00024615814729802833, + "loss": 2.2173, + "step": 338 + }, + { + "epoch": 0.33, + "grad_norm": 1.8261704444885254, + "learning_rate": 0.0002461259631497444, + "loss": 1.6643, + "step": 339 + }, + { + "epoch": 0.33, + "grad_norm": 1.6665236949920654, + "learning_rate": 0.0002460936468775907, + "loss": 1.9251, + "step": 340 + }, + { + "epoch": 0.33, + "grad_norm": 1.5993882417678833, + "learning_rate": 0.00024606119851681757, + "loss": 1.4908, + "step": 341 + }, + { + "epoch": 0.33, + "grad_norm": 1.8725510835647583, + "learning_rate": 0.00024602861810281966, + "loss": 2.1784, + "step": 342 + }, + { + "epoch": 0.33, + "grad_norm": 1.803350567817688, + "learning_rate": 0.0002459959056711357, + "loss": 2.11, + "step": 343 + }, + { + "epoch": 0.33, + "grad_norm": 1.6295034885406494, + "learning_rate": 0.00024596306125744815, + "loss": 1.777, + "step": 344 + }, + { + "epoch": 0.33, + "grad_norm": 2.189607858657837, + "learning_rate": 0.00024593008489758375, + "loss": 2.4662, + "step": 345 + }, + { + "epoch": 0.33, + "grad_norm": 1.835728645324707, + "learning_rate": 0.0002458969766275129, + "loss": 1.8764, + "step": 346 + }, + { + "epoch": 0.33, + "grad_norm": 1.6741111278533936, + "learning_rate": 0.00024586373648335014, + "loss": 1.8256, + "step": 347 + }, + { + "epoch": 0.34, + "grad_norm": 1.5066823959350586, + "learning_rate": 0.0002458303645013536, + "loss": 1.4491, + "step": 348 + }, + { + "epoch": 0.34, + "grad_norm": 1.439063549041748, + "learning_rate": 0.00024579686071792543, + "loss": 1.8896, + "step": 349 + }, + { + "epoch": 0.34, + "grad_norm": 1.2669867277145386, + "learning_rate": 0.0002457632251696115, + "loss": 1.7784, + "step": 350 + }, + { + "epoch": 0.34, + "grad_norm": 1.4590708017349243, + "learning_rate": 0.00024572945789310123, + "loss": 1.5708, + "step": 351 + }, + { + "epoch": 0.34, + "grad_norm": 1.7421554327011108, + "learning_rate": 0.000245695558925228, + "loss": 2.0778, + "step": 352 + }, + { + "epoch": 0.34, + "grad_norm": 1.5311520099639893, + "learning_rate": 0.00024566152830296875, + "loss": 1.4703, + "step": 353 + }, + { + "epoch": 0.34, + "grad_norm": 1.9303866624832153, + "learning_rate": 0.0002456273660634438, + "loss": 2.0816, + "step": 354 + }, + { + "epoch": 0.34, + "grad_norm": 1.5437086820602417, + "learning_rate": 0.0002455930722439174, + "loss": 1.4711, + "step": 355 + }, + { + "epoch": 0.34, + "grad_norm": 1.50086510181427, + "learning_rate": 0.0002455586468817971, + "loss": 1.8523, + "step": 356 + }, + { + "epoch": 0.34, + "grad_norm": 1.5427656173706055, + "learning_rate": 0.00024552409001463393, + "loss": 1.6157, + "step": 357 + }, + { + "epoch": 0.35, + "grad_norm": 1.7002308368682861, + "learning_rate": 0.00024548940168012253, + "loss": 1.724, + "step": 358 + }, + { + "epoch": 0.35, + "grad_norm": 1.5757559537887573, + "learning_rate": 0.0002454545819161008, + "loss": 1.663, + "step": 359 + }, + { + "epoch": 0.35, + "grad_norm": 1.4780895709991455, + "learning_rate": 0.00024541963076055, + "loss": 1.488, + "step": 360 + }, + { + "epoch": 0.35, + "grad_norm": 1.7047303915023804, + "learning_rate": 0.00024538454825159486, + "loss": 2.0954, + "step": 361 + }, + { + "epoch": 0.35, + "grad_norm": 1.9181485176086426, + "learning_rate": 0.00024534933442750317, + "loss": 1.5492, + "step": 362 + }, + { + "epoch": 0.35, + "grad_norm": 2.150815010070801, + "learning_rate": 0.0002453139893266861, + "loss": 2.1316, + "step": 363 + }, + { + "epoch": 0.35, + "grad_norm": 1.663496732711792, + "learning_rate": 0.00024527851298769803, + "loss": 1.9913, + "step": 364 + }, + { + "epoch": 0.35, + "grad_norm": 1.6102313995361328, + "learning_rate": 0.00024524290544923643, + "loss": 1.8153, + "step": 365 + }, + { + "epoch": 0.35, + "grad_norm": 2.003488302230835, + "learning_rate": 0.0002452071667501419, + "loss": 2.0315, + "step": 366 + }, + { + "epoch": 0.35, + "grad_norm": 1.4711098670959473, + "learning_rate": 0.0002451712969293982, + "loss": 1.4064, + "step": 367 + }, + { + "epoch": 0.36, + "grad_norm": 1.4077496528625488, + "learning_rate": 0.0002451352960261319, + "loss": 1.8027, + "step": 368 + }, + { + "epoch": 0.36, + "grad_norm": 1.4813429117202759, + "learning_rate": 0.0002450991640796127, + "loss": 1.9595, + "step": 369 + }, + { + "epoch": 0.36, + "grad_norm": 1.755618929862976, + "learning_rate": 0.00024506290112925335, + "loss": 1.8781, + "step": 370 + }, + { + "epoch": 0.36, + "grad_norm": 1.6948919296264648, + "learning_rate": 0.00024502650721460926, + "loss": 1.3217, + "step": 371 + }, + { + "epoch": 0.36, + "grad_norm": 1.6204948425292969, + "learning_rate": 0.00024498998237537883, + "loss": 1.7845, + "step": 372 + }, + { + "epoch": 0.36, + "grad_norm": 1.4599529504776, + "learning_rate": 0.00024495332665140337, + "loss": 1.8832, + "step": 373 + }, + { + "epoch": 0.36, + "grad_norm": 1.5336644649505615, + "learning_rate": 0.00024491654008266666, + "loss": 1.7582, + "step": 374 + }, + { + "epoch": 0.36, + "grad_norm": 1.6207972764968872, + "learning_rate": 0.00024487962270929546, + "loss": 1.6177, + "step": 375 + }, + { + "epoch": 0.36, + "grad_norm": 1.5989536046981812, + "learning_rate": 0.0002448425745715592, + "loss": 2.1292, + "step": 376 + }, + { + "epoch": 0.36, + "grad_norm": 1.7636903524398804, + "learning_rate": 0.0002448053957098699, + "loss": 1.8165, + "step": 377 + }, + { + "epoch": 0.36, + "grad_norm": 1.6132402420043945, + "learning_rate": 0.0002447680861647821, + "loss": 1.8227, + "step": 378 + }, + { + "epoch": 0.37, + "grad_norm": 1.8587868213653564, + "learning_rate": 0.0002447306459769929, + "loss": 2.365, + "step": 379 + }, + { + "epoch": 0.37, + "grad_norm": 1.4449355602264404, + "learning_rate": 0.000244693075187342, + "loss": 1.8724, + "step": 380 + }, + { + "epoch": 0.37, + "grad_norm": 1.2807527780532837, + "learning_rate": 0.0002446553738368116, + "loss": 1.7084, + "step": 381 + }, + { + "epoch": 0.37, + "grad_norm": 1.2705549001693726, + "learning_rate": 0.0002446175419665261, + "loss": 1.5953, + "step": 382 + }, + { + "epoch": 0.37, + "grad_norm": 1.444528341293335, + "learning_rate": 0.00024457957961775253, + "loss": 1.7351, + "step": 383 + }, + { + "epoch": 0.37, + "grad_norm": 1.6186937093734741, + "learning_rate": 0.00024454148683189996, + "loss": 2.1234, + "step": 384 + }, + { + "epoch": 0.37, + "grad_norm": 1.2942836284637451, + "learning_rate": 0.00024450326365052, + "loss": 1.5367, + "step": 385 + }, + { + "epoch": 0.37, + "grad_norm": 1.3657950162887573, + "learning_rate": 0.0002444649101153064, + "loss": 1.417, + "step": 386 + }, + { + "epoch": 0.37, + "grad_norm": 1.2977917194366455, + "learning_rate": 0.0002444264262680951, + "loss": 1.6442, + "step": 387 + }, + { + "epoch": 0.37, + "grad_norm": 1.5234405994415283, + "learning_rate": 0.0002443878121508641, + "loss": 1.7331, + "step": 388 + }, + { + "epoch": 0.38, + "grad_norm": 1.580356478691101, + "learning_rate": 0.00024434906780573356, + "loss": 1.5812, + "step": 389 + }, + { + "epoch": 0.38, + "grad_norm": 1.5635511875152588, + "learning_rate": 0.0002443101932749658, + "loss": 1.7743, + "step": 390 + }, + { + "epoch": 0.38, + "grad_norm": 1.4234613180160522, + "learning_rate": 0.00024427118860096504, + "loss": 1.9202, + "step": 391 + }, + { + "epoch": 0.38, + "grad_norm": 1.8113735914230347, + "learning_rate": 0.00024423205382627746, + "loss": 2.3395, + "step": 392 + }, + { + "epoch": 0.38, + "grad_norm": 1.5529452562332153, + "learning_rate": 0.0002441927889935911, + "loss": 1.6039, + "step": 393 + }, + { + "epoch": 0.38, + "grad_norm": 1.3958035707473755, + "learning_rate": 0.000244153394145736, + "loss": 1.5429, + "step": 394 + }, + { + "epoch": 0.38, + "grad_norm": 1.723780632019043, + "learning_rate": 0.0002441138693256839, + "loss": 1.7384, + "step": 395 + }, + { + "epoch": 0.38, + "grad_norm": 1.403164029121399, + "learning_rate": 0.00024407421457654845, + "loss": 0.9538, + "step": 396 + }, + { + "epoch": 0.38, + "grad_norm": 1.4130743741989136, + "learning_rate": 0.00024403442994158487, + "loss": 1.9526, + "step": 397 + }, + { + "epoch": 0.38, + "grad_norm": 1.4831724166870117, + "learning_rate": 0.00024399451546419017, + "loss": 1.6449, + "step": 398 + }, + { + "epoch": 0.39, + "grad_norm": 1.6771824359893799, + "learning_rate": 0.00024395447118790293, + "loss": 1.7758, + "step": 399 + }, + { + "epoch": 0.39, + "grad_norm": 1.5150790214538574, + "learning_rate": 0.00024391429715640335, + "loss": 1.9961, + "step": 400 + }, + { + "epoch": 0.39, + "grad_norm": 1.5657678842544556, + "learning_rate": 0.0002438739934135131, + "loss": 1.5219, + "step": 401 + }, + { + "epoch": 0.39, + "grad_norm": 1.4939355850219727, + "learning_rate": 0.00024383356000319548, + "loss": 1.5112, + "step": 402 + }, + { + "epoch": 0.39, + "grad_norm": 1.6075009107589722, + "learning_rate": 0.0002437929969695551, + "loss": 1.4834, + "step": 403 + }, + { + "epoch": 0.39, + "grad_norm": 1.6066113710403442, + "learning_rate": 0.000243752304356838, + "loss": 1.655, + "step": 404 + }, + { + "epoch": 0.39, + "grad_norm": 1.713575005531311, + "learning_rate": 0.0002437114822094316, + "loss": 1.8589, + "step": 405 + }, + { + "epoch": 0.39, + "grad_norm": 1.6600104570388794, + "learning_rate": 0.00024367053057186455, + "loss": 1.9319, + "step": 406 + }, + { + "epoch": 0.39, + "grad_norm": 1.8421694040298462, + "learning_rate": 0.0002436294494888068, + "loss": 2.1056, + "step": 407 + }, + { + "epoch": 0.39, + "grad_norm": 1.7146579027175903, + "learning_rate": 0.0002435882390050695, + "loss": 2.3835, + "step": 408 + }, + { + "epoch": 0.39, + "grad_norm": 1.558237075805664, + "learning_rate": 0.0002435468991656049, + "loss": 1.972, + "step": 409 + }, + { + "epoch": 0.4, + "grad_norm": 1.5981314182281494, + "learning_rate": 0.0002435054300155064, + "loss": 1.8716, + "step": 410 + }, + { + "epoch": 0.4, + "grad_norm": 1.376088261604309, + "learning_rate": 0.00024346383160000847, + "loss": 1.6165, + "step": 411 + }, + { + "epoch": 0.4, + "grad_norm": 1.5061323642730713, + "learning_rate": 0.0002434221039644865, + "loss": 1.6584, + "step": 412 + }, + { + "epoch": 0.4, + "grad_norm": 1.7581522464752197, + "learning_rate": 0.00024338024715445688, + "loss": 1.9282, + "step": 413 + }, + { + "epoch": 0.4, + "grad_norm": 1.3032132387161255, + "learning_rate": 0.00024333826121557695, + "loss": 1.1937, + "step": 414 + }, + { + "epoch": 0.4, + "grad_norm": 1.632356882095337, + "learning_rate": 0.0002432961461936448, + "loss": 2.075, + "step": 415 + }, + { + "epoch": 0.4, + "grad_norm": 1.2259700298309326, + "learning_rate": 0.00024325390213459944, + "loss": 1.1513, + "step": 416 + }, + { + "epoch": 0.4, + "grad_norm": 1.721431016921997, + "learning_rate": 0.00024321152908452054, + "loss": 1.5305, + "step": 417 + }, + { + "epoch": 0.4, + "grad_norm": 1.5428179502487183, + "learning_rate": 0.00024316902708962848, + "loss": 1.7885, + "step": 418 + }, + { + "epoch": 0.4, + "grad_norm": 1.6045695543289185, + "learning_rate": 0.0002431263961962844, + "loss": 1.6599, + "step": 419 + }, + { + "epoch": 0.41, + "grad_norm": 1.6771968603134155, + "learning_rate": 0.00024308363645098984, + "loss": 1.6666, + "step": 420 + }, + { + "epoch": 0.41, + "grad_norm": 1.5328516960144043, + "learning_rate": 0.00024304074790038716, + "loss": 1.7956, + "step": 421 + }, + { + "epoch": 0.41, + "grad_norm": 1.6807280778884888, + "learning_rate": 0.00024299773059125896, + "loss": 1.7099, + "step": 422 + }, + { + "epoch": 0.41, + "grad_norm": 1.528563380241394, + "learning_rate": 0.00024295458457052844, + "loss": 1.5751, + "step": 423 + }, + { + "epoch": 0.41, + "grad_norm": 1.2582366466522217, + "learning_rate": 0.00024291130988525917, + "loss": 1.5967, + "step": 424 + }, + { + "epoch": 0.41, + "grad_norm": 1.5818768739700317, + "learning_rate": 0.00024286790658265507, + "loss": 2.0187, + "step": 425 + }, + { + "epoch": 0.41, + "grad_norm": 1.647545576095581, + "learning_rate": 0.00024282437471006033, + "loss": 2.2781, + "step": 426 + }, + { + "epoch": 0.41, + "grad_norm": 1.1883927583694458, + "learning_rate": 0.00024278071431495937, + "loss": 1.7889, + "step": 427 + }, + { + "epoch": 0.41, + "grad_norm": 1.8100999593734741, + "learning_rate": 0.00024273692544497688, + "loss": 1.6325, + "step": 428 + }, + { + "epoch": 0.41, + "grad_norm": 1.4264518022537231, + "learning_rate": 0.0002426930081478776, + "loss": 1.8201, + "step": 429 + }, + { + "epoch": 0.42, + "grad_norm": 1.2940385341644287, + "learning_rate": 0.00024264896247156643, + "loss": 1.508, + "step": 430 + }, + { + "epoch": 0.42, + "grad_norm": 1.3940926790237427, + "learning_rate": 0.00024260478846408823, + "loss": 1.317, + "step": 431 + }, + { + "epoch": 0.42, + "grad_norm": 1.3203697204589844, + "learning_rate": 0.00024256048617362792, + "loss": 1.5154, + "step": 432 + }, + { + "epoch": 0.42, + "grad_norm": 1.5709110498428345, + "learning_rate": 0.00024251605564851032, + "loss": 1.4274, + "step": 433 + }, + { + "epoch": 0.42, + "grad_norm": 1.4669488668441772, + "learning_rate": 0.00024247149693720012, + "loss": 1.5945, + "step": 434 + }, + { + "epoch": 0.42, + "grad_norm": 1.9155356884002686, + "learning_rate": 0.00024242681008830184, + "loss": 1.727, + "step": 435 + }, + { + "epoch": 0.42, + "grad_norm": 1.4303950071334839, + "learning_rate": 0.00024238199515055976, + "loss": 1.215, + "step": 436 + }, + { + "epoch": 0.42, + "grad_norm": 1.5403741598129272, + "learning_rate": 0.0002423370521728579, + "loss": 1.2804, + "step": 437 + }, + { + "epoch": 0.42, + "grad_norm": 2.0623433589935303, + "learning_rate": 0.00024229198120421993, + "loss": 1.5453, + "step": 438 + }, + { + "epoch": 0.42, + "grad_norm": 1.7625672817230225, + "learning_rate": 0.00024224678229380913, + "loss": 1.923, + "step": 439 + }, + { + "epoch": 0.42, + "grad_norm": 1.7964345216751099, + "learning_rate": 0.00024220145549092842, + "loss": 1.68, + "step": 440 + }, + { + "epoch": 0.43, + "grad_norm": 1.8445159196853638, + "learning_rate": 0.00024215600084502006, + "loss": 1.8517, + "step": 441 + }, + { + "epoch": 0.43, + "grad_norm": 1.7241743803024292, + "learning_rate": 0.0002421104184056659, + "loss": 1.8859, + "step": 442 + }, + { + "epoch": 0.43, + "grad_norm": 1.3813362121582031, + "learning_rate": 0.00024206470822258713, + "loss": 1.3435, + "step": 443 + }, + { + "epoch": 0.43, + "grad_norm": 1.6187509298324585, + "learning_rate": 0.0002420188703456443, + "loss": 1.9744, + "step": 444 + }, + { + "epoch": 0.43, + "grad_norm": 1.5142652988433838, + "learning_rate": 0.00024197290482483725, + "loss": 1.768, + "step": 445 + }, + { + "epoch": 0.43, + "grad_norm": 1.3997564315795898, + "learning_rate": 0.000241926811710305, + "loss": 1.6986, + "step": 446 + }, + { + "epoch": 0.43, + "grad_norm": 1.248969554901123, + "learning_rate": 0.00024188059105232585, + "loss": 1.2985, + "step": 447 + }, + { + "epoch": 0.43, + "grad_norm": 1.3714370727539062, + "learning_rate": 0.00024183424290131716, + "loss": 1.626, + "step": 448 + }, + { + "epoch": 0.43, + "grad_norm": 1.3766262531280518, + "learning_rate": 0.0002417877673078353, + "loss": 1.3913, + "step": 449 + }, + { + "epoch": 0.43, + "grad_norm": 1.621702790260315, + "learning_rate": 0.00024174116432257583, + "loss": 1.7584, + "step": 450 + }, + { + "epoch": 0.44, + "grad_norm": 1.6640499830245972, + "learning_rate": 0.00024169443399637303, + "loss": 1.7288, + "step": 451 + }, + { + "epoch": 0.44, + "grad_norm": 1.7537161111831665, + "learning_rate": 0.00024164757638020032, + "loss": 1.8502, + "step": 452 + }, + { + "epoch": 0.44, + "grad_norm": 1.614371657371521, + "learning_rate": 0.00024160059152516982, + "loss": 1.8407, + "step": 453 + }, + { + "epoch": 0.44, + "grad_norm": 1.3671376705169678, + "learning_rate": 0.0002415534794825325, + "loss": 1.7035, + "step": 454 + }, + { + "epoch": 0.44, + "grad_norm": 1.1287087202072144, + "learning_rate": 0.00024150624030367797, + "loss": 0.8822, + "step": 455 + }, + { + "epoch": 0.44, + "grad_norm": 2.104029893875122, + "learning_rate": 0.0002414588740401347, + "loss": 2.4046, + "step": 456 + }, + { + "epoch": 0.44, + "grad_norm": 1.5074779987335205, + "learning_rate": 0.00024141138074356962, + "loss": 1.4848, + "step": 457 + }, + { + "epoch": 0.44, + "grad_norm": 1.8815460205078125, + "learning_rate": 0.00024136376046578835, + "loss": 1.759, + "step": 458 + }, + { + "epoch": 0.44, + "grad_norm": 1.5019757747650146, + "learning_rate": 0.00024131601325873487, + "loss": 1.4991, + "step": 459 + }, + { + "epoch": 0.44, + "grad_norm": 1.6193299293518066, + "learning_rate": 0.00024126813917449175, + "loss": 1.818, + "step": 460 + }, + { + "epoch": 0.44, + "grad_norm": 1.4965165853500366, + "learning_rate": 0.0002412201382652799, + "loss": 1.4364, + "step": 461 + }, + { + "epoch": 0.45, + "grad_norm": 1.6111191511154175, + "learning_rate": 0.00024117201058345862, + "loss": 1.7969, + "step": 462 + }, + { + "epoch": 0.45, + "grad_norm": 1.6606050729751587, + "learning_rate": 0.00024112375618152537, + "loss": 1.8442, + "step": 463 + }, + { + "epoch": 0.45, + "grad_norm": 1.7113983631134033, + "learning_rate": 0.00024107537511211603, + "loss": 1.6806, + "step": 464 + }, + { + "epoch": 0.45, + "grad_norm": 1.5785653591156006, + "learning_rate": 0.00024102686742800446, + "loss": 1.6221, + "step": 465 + }, + { + "epoch": 0.45, + "grad_norm": 1.715955138206482, + "learning_rate": 0.0002409782331821027, + "loss": 1.8423, + "step": 466 + }, + { + "epoch": 0.45, + "grad_norm": 1.7139517068862915, + "learning_rate": 0.00024092947242746094, + "loss": 1.9045, + "step": 467 + }, + { + "epoch": 0.45, + "grad_norm": 1.2441296577453613, + "learning_rate": 0.00024088058521726718, + "loss": 1.809, + "step": 468 + }, + { + "epoch": 0.45, + "grad_norm": 1.487850546836853, + "learning_rate": 0.00024083157160484752, + "loss": 1.3382, + "step": 469 + }, + { + "epoch": 0.45, + "grad_norm": 1.3797458410263062, + "learning_rate": 0.00024078243164366586, + "loss": 1.3186, + "step": 470 + }, + { + "epoch": 0.45, + "grad_norm": 1.5861365795135498, + "learning_rate": 0.0002407331653873239, + "loss": 2.0695, + "step": 471 + }, + { + "epoch": 0.46, + "grad_norm": 1.4714101552963257, + "learning_rate": 0.00024068377288956116, + "loss": 1.8704, + "step": 472 + }, + { + "epoch": 0.46, + "grad_norm": 1.6222436428070068, + "learning_rate": 0.00024063425420425485, + "loss": 2.0155, + "step": 473 + }, + { + "epoch": 0.46, + "grad_norm": 1.7147741317749023, + "learning_rate": 0.00024058460938541982, + "loss": 2.2635, + "step": 474 + }, + { + "epoch": 0.46, + "grad_norm": 1.7251660823822021, + "learning_rate": 0.0002405348384872085, + "loss": 2.3124, + "step": 475 + }, + { + "epoch": 0.46, + "grad_norm": 1.4105515480041504, + "learning_rate": 0.00024048494156391086, + "loss": 1.5038, + "step": 476 + }, + { + "epoch": 0.46, + "grad_norm": 1.4747495651245117, + "learning_rate": 0.0002404349186699543, + "loss": 1.8607, + "step": 477 + }, + { + "epoch": 0.46, + "grad_norm": 1.4962126016616821, + "learning_rate": 0.00024038476985990364, + "loss": 1.9751, + "step": 478 + }, + { + "epoch": 0.46, + "grad_norm": 1.5216854810714722, + "learning_rate": 0.00024033449518846112, + "loss": 1.6452, + "step": 479 + }, + { + "epoch": 0.46, + "grad_norm": 1.7051867246627808, + "learning_rate": 0.0002402840947104662, + "loss": 1.5337, + "step": 480 + }, + { + "epoch": 0.46, + "grad_norm": 1.6586822271347046, + "learning_rate": 0.00024023356848089557, + "loss": 1.5478, + "step": 481 + }, + { + "epoch": 0.47, + "grad_norm": 1.2107691764831543, + "learning_rate": 0.00024018291655486306, + "loss": 1.278, + "step": 482 + }, + { + "epoch": 0.47, + "grad_norm": 1.4238454103469849, + "learning_rate": 0.00024013213898761975, + "loss": 1.6255, + "step": 483 + }, + { + "epoch": 0.47, + "grad_norm": 1.3502719402313232, + "learning_rate": 0.0002400812358345536, + "loss": 1.3436, + "step": 484 + }, + { + "epoch": 0.47, + "grad_norm": 2.2329463958740234, + "learning_rate": 0.00024003020715118967, + "loss": 2.3051, + "step": 485 + }, + { + "epoch": 0.47, + "grad_norm": 1.6358450651168823, + "learning_rate": 0.00023997905299318983, + "loss": 2.2955, + "step": 486 + }, + { + "epoch": 0.47, + "grad_norm": 1.5185836553573608, + "learning_rate": 0.000239927773416353, + "loss": 1.5445, + "step": 487 + }, + { + "epoch": 0.47, + "grad_norm": 1.5232542753219604, + "learning_rate": 0.00023987636847661476, + "loss": 1.6744, + "step": 488 + }, + { + "epoch": 0.47, + "grad_norm": 1.4239147901535034, + "learning_rate": 0.00023982483823004747, + "loss": 1.666, + "step": 489 + }, + { + "epoch": 0.47, + "grad_norm": 1.3807685375213623, + "learning_rate": 0.00023977318273286018, + "loss": 1.5856, + "step": 490 + }, + { + "epoch": 0.47, + "grad_norm": 1.573913812637329, + "learning_rate": 0.00023972140204139858, + "loss": 1.9832, + "step": 491 + }, + { + "epoch": 0.47, + "grad_norm": 1.556381344795227, + "learning_rate": 0.0002396694962121449, + "loss": 1.4315, + "step": 492 + }, + { + "epoch": 0.48, + "grad_norm": 1.7060798406600952, + "learning_rate": 0.00023961746530171788, + "loss": 1.7671, + "step": 493 + }, + { + "epoch": 0.48, + "grad_norm": 1.5967423915863037, + "learning_rate": 0.0002395653093668727, + "loss": 1.9734, + "step": 494 + }, + { + "epoch": 0.48, + "grad_norm": 1.390377402305603, + "learning_rate": 0.00023951302846450087, + "loss": 1.4402, + "step": 495 + }, + { + "epoch": 0.48, + "grad_norm": 1.330527663230896, + "learning_rate": 0.00023946062265163033, + "loss": 1.591, + "step": 496 + }, + { + "epoch": 0.48, + "grad_norm": 1.6965402364730835, + "learning_rate": 0.00023940809198542512, + "loss": 2.3242, + "step": 497 + }, + { + "epoch": 0.48, + "grad_norm": 1.51449716091156, + "learning_rate": 0.0002393554365231856, + "loss": 1.619, + "step": 498 + }, + { + "epoch": 0.48, + "grad_norm": 1.6005566120147705, + "learning_rate": 0.00023930265632234816, + "loss": 2.0559, + "step": 499 + }, + { + "epoch": 0.48, + "grad_norm": 1.4369415044784546, + "learning_rate": 0.00023924975144048533, + "loss": 1.4451, + "step": 500 + }, + { + "epoch": 0.48, + "grad_norm": 1.4336689710617065, + "learning_rate": 0.00023919672193530565, + "loss": 1.7534, + "step": 501 + }, + { + "epoch": 0.48, + "grad_norm": 1.449487566947937, + "learning_rate": 0.00023914356786465348, + "loss": 1.8109, + "step": 502 + }, + { + "epoch": 0.49, + "grad_norm": 1.439719319343567, + "learning_rate": 0.00023909028928650918, + "loss": 1.6258, + "step": 503 + }, + { + "epoch": 0.49, + "grad_norm": 1.702451229095459, + "learning_rate": 0.00023903688625898887, + "loss": 1.9753, + "step": 504 + }, + { + "epoch": 0.49, + "grad_norm": 1.1217689514160156, + "learning_rate": 0.00023898335884034444, + "loss": 1.0806, + "step": 505 + }, + { + "epoch": 0.49, + "grad_norm": 1.43269944190979, + "learning_rate": 0.00023892970708896342, + "loss": 2.2551, + "step": 506 + }, + { + "epoch": 0.49, + "grad_norm": 1.3758916854858398, + "learning_rate": 0.00023887593106336903, + "loss": 1.6455, + "step": 507 + }, + { + "epoch": 0.49, + "grad_norm": 1.348081350326538, + "learning_rate": 0.00023882203082221996, + "loss": 1.6481, + "step": 508 + }, + { + "epoch": 0.49, + "grad_norm": 1.44756019115448, + "learning_rate": 0.00023876800642431048, + "loss": 1.6608, + "step": 509 + }, + { + "epoch": 0.49, + "grad_norm": 1.5149917602539062, + "learning_rate": 0.00023871385792857027, + "loss": 1.6434, + "step": 510 + }, + { + "epoch": 0.49, + "grad_norm": 1.436800241470337, + "learning_rate": 0.00023865958539406434, + "loss": 1.9541, + "step": 511 + }, + { + "epoch": 0.49, + "grad_norm": 1.563488483428955, + "learning_rate": 0.00023860518887999305, + "loss": 1.921, + "step": 512 + }, + { + "epoch": 0.5, + "grad_norm": 1.2544896602630615, + "learning_rate": 0.00023855066844569194, + "loss": 1.285, + "step": 513 + }, + { + "epoch": 0.5, + "grad_norm": 1.1815474033355713, + "learning_rate": 0.00023849602415063176, + "loss": 1.5958, + "step": 514 + }, + { + "epoch": 0.5, + "grad_norm": 1.3179055452346802, + "learning_rate": 0.00023844125605441835, + "loss": 1.5887, + "step": 515 + }, + { + "epoch": 0.5, + "grad_norm": 1.3224471807479858, + "learning_rate": 0.0002383863642167926, + "loss": 1.3453, + "step": 516 + }, + { + "epoch": 0.5, + "grad_norm": 1.3743826150894165, + "learning_rate": 0.00023833134869763041, + "loss": 1.6145, + "step": 517 + }, + { + "epoch": 0.5, + "grad_norm": 1.7813752889633179, + "learning_rate": 0.00023827620955694248, + "loss": 1.5723, + "step": 518 + }, + { + "epoch": 0.5, + "grad_norm": 1.457780361175537, + "learning_rate": 0.00023822094685487451, + "loss": 1.5773, + "step": 519 + }, + { + "epoch": 0.5, + "grad_norm": 1.4680794477462769, + "learning_rate": 0.00023816556065170693, + "loss": 1.7291, + "step": 520 + }, + { + "epoch": 0.5, + "grad_norm": 1.5272331237792969, + "learning_rate": 0.00023811005100785471, + "loss": 1.6012, + "step": 521 + }, + { + "epoch": 0.5, + "grad_norm": 1.6784194707870483, + "learning_rate": 0.00023805441798386778, + "loss": 1.6109, + "step": 522 + }, + { + "epoch": 0.5, + "grad_norm": 1.5598907470703125, + "learning_rate": 0.00023799866164043044, + "loss": 2.0627, + "step": 523 + }, + { + "epoch": 0.51, + "grad_norm": 1.470335841178894, + "learning_rate": 0.0002379427820383615, + "loss": 1.8267, + "step": 524 + }, + { + "epoch": 0.51, + "grad_norm": 1.677306890487671, + "learning_rate": 0.00023788677923861434, + "loss": 2.2845, + "step": 525 + }, + { + "epoch": 0.51, + "grad_norm": 1.2650090456008911, + "learning_rate": 0.00023783065330227666, + "loss": 1.6407, + "step": 526 + }, + { + "epoch": 0.51, + "grad_norm": 1.6766515970230103, + "learning_rate": 0.00023777440429057043, + "loss": 1.7039, + "step": 527 + }, + { + "epoch": 0.51, + "grad_norm": 1.3705694675445557, + "learning_rate": 0.00023771803226485197, + "loss": 1.8251, + "step": 528 + }, + { + "epoch": 0.51, + "grad_norm": 1.3645520210266113, + "learning_rate": 0.00023766153728661173, + "loss": 1.0754, + "step": 529 + }, + { + "epoch": 0.51, + "grad_norm": 1.5679587125778198, + "learning_rate": 0.00023760491941747419, + "loss": 1.7131, + "step": 530 + }, + { + "epoch": 0.51, + "grad_norm": 1.4037436246871948, + "learning_rate": 0.0002375481787191981, + "loss": 1.7182, + "step": 531 + }, + { + "epoch": 0.51, + "grad_norm": 1.4753473997116089, + "learning_rate": 0.000237491315253676, + "loss": 1.9983, + "step": 532 + }, + { + "epoch": 0.51, + "grad_norm": 1.307569980621338, + "learning_rate": 0.00023743432908293437, + "loss": 1.4774, + "step": 533 + }, + { + "epoch": 0.52, + "grad_norm": 1.5248976945877075, + "learning_rate": 0.00023737722026913357, + "loss": 1.678, + "step": 534 + }, + { + "epoch": 0.52, + "grad_norm": 1.6276731491088867, + "learning_rate": 0.00023731998887456778, + "loss": 1.9258, + "step": 535 + }, + { + "epoch": 0.52, + "grad_norm": 1.5609790086746216, + "learning_rate": 0.0002372626349616649, + "loss": 1.7404, + "step": 536 + }, + { + "epoch": 0.52, + "grad_norm": 1.3915233612060547, + "learning_rate": 0.0002372051585929863, + "loss": 1.5489, + "step": 537 + }, + { + "epoch": 0.52, + "grad_norm": 1.6098966598510742, + "learning_rate": 0.0002371475598312271, + "loss": 1.7612, + "step": 538 + }, + { + "epoch": 0.52, + "grad_norm": 1.3052818775177002, + "learning_rate": 0.00023708983873921586, + "loss": 1.2804, + "step": 539 + }, + { + "epoch": 0.52, + "grad_norm": 1.3293620347976685, + "learning_rate": 0.00023703199537991466, + "loss": 1.4353, + "step": 540 + }, + { + "epoch": 0.52, + "grad_norm": 1.4364746809005737, + "learning_rate": 0.00023697402981641877, + "loss": 1.4233, + "step": 541 + }, + { + "epoch": 0.52, + "grad_norm": 1.7351473569869995, + "learning_rate": 0.00023691594211195695, + "loss": 1.9575, + "step": 542 + }, + { + "epoch": 0.52, + "grad_norm": 1.3488695621490479, + "learning_rate": 0.00023685773232989105, + "loss": 2.0196, + "step": 543 + }, + { + "epoch": 0.53, + "grad_norm": 1.455739140510559, + "learning_rate": 0.0002367994005337162, + "loss": 2.1026, + "step": 544 + }, + { + "epoch": 0.53, + "grad_norm": 1.2512130737304688, + "learning_rate": 0.00023674094678706052, + "loss": 1.3881, + "step": 545 + }, + { + "epoch": 0.53, + "grad_norm": 1.6788513660430908, + "learning_rate": 0.0002366823711536852, + "loss": 1.5795, + "step": 546 + }, + { + "epoch": 0.53, + "grad_norm": 1.0756875276565552, + "learning_rate": 0.00023662367369748442, + "loss": 1.3518, + "step": 547 + }, + { + "epoch": 0.53, + "grad_norm": 1.5534878969192505, + "learning_rate": 0.00023656485448248516, + "loss": 1.4272, + "step": 548 + }, + { + "epoch": 0.53, + "grad_norm": 1.276198387145996, + "learning_rate": 0.00023650591357284728, + "loss": 1.3384, + "step": 549 + }, + { + "epoch": 0.53, + "grad_norm": 1.4616369009017944, + "learning_rate": 0.00023644685103286337, + "loss": 1.6083, + "step": 550 + }, + { + "epoch": 0.53, + "grad_norm": 1.487240195274353, + "learning_rate": 0.00023638766692695869, + "loss": 1.2336, + "step": 551 + }, + { + "epoch": 0.53, + "grad_norm": 1.3095462322235107, + "learning_rate": 0.00023632836131969105, + "loss": 1.3381, + "step": 552 + }, + { + "epoch": 0.53, + "grad_norm": 2.327471971511841, + "learning_rate": 0.0002362689342757509, + "loss": 2.192, + "step": 553 + }, + { + "epoch": 0.53, + "grad_norm": 1.9238693714141846, + "learning_rate": 0.00023620938585996106, + "loss": 1.6526, + "step": 554 + }, + { + "epoch": 0.54, + "grad_norm": 1.6897307634353638, + "learning_rate": 0.00023614971613727684, + "loss": 1.6713, + "step": 555 + }, + { + "epoch": 0.54, + "grad_norm": 2.526841163635254, + "learning_rate": 0.00023608992517278578, + "loss": 2.0541, + "step": 556 + }, + { + "epoch": 0.54, + "grad_norm": 1.5254669189453125, + "learning_rate": 0.00023603001303170763, + "loss": 1.3508, + "step": 557 + }, + { + "epoch": 0.54, + "grad_norm": 1.57151198387146, + "learning_rate": 0.0002359699797793945, + "loss": 1.3197, + "step": 558 + }, + { + "epoch": 0.54, + "grad_norm": 1.5244841575622559, + "learning_rate": 0.00023590982548133045, + "loss": 1.4338, + "step": 559 + }, + { + "epoch": 0.54, + "grad_norm": 1.4043896198272705, + "learning_rate": 0.0002358495502031316, + "loss": 1.9264, + "step": 560 + }, + { + "epoch": 0.54, + "grad_norm": 1.4733903408050537, + "learning_rate": 0.00023578915401054607, + "loss": 1.59, + "step": 561 + }, + { + "epoch": 0.54, + "grad_norm": 1.7210224866867065, + "learning_rate": 0.0002357286369694539, + "loss": 1.7421, + "step": 562 + }, + { + "epoch": 0.54, + "grad_norm": 1.291479468345642, + "learning_rate": 0.00023566799914586688, + "loss": 0.9787, + "step": 563 + }, + { + "epoch": 0.54, + "grad_norm": 1.6497067213058472, + "learning_rate": 0.00023560724060592858, + "loss": 1.5709, + "step": 564 + }, + { + "epoch": 0.55, + "grad_norm": 1.6274982690811157, + "learning_rate": 0.00023554636141591426, + "loss": 1.5853, + "step": 565 + }, + { + "epoch": 0.55, + "grad_norm": 1.4311622381210327, + "learning_rate": 0.00023548536164223077, + "loss": 1.7003, + "step": 566 + }, + { + "epoch": 0.55, + "grad_norm": 1.6563549041748047, + "learning_rate": 0.00023542424135141656, + "loss": 1.4655, + "step": 567 + }, + { + "epoch": 0.55, + "grad_norm": 1.6957557201385498, + "learning_rate": 0.00023536300061014132, + "loss": 1.8792, + "step": 568 + }, + { + "epoch": 0.55, + "grad_norm": 1.2087243795394897, + "learning_rate": 0.00023530163948520645, + "loss": 1.262, + "step": 569 + }, + { + "epoch": 0.55, + "grad_norm": 1.3456618785858154, + "learning_rate": 0.00023524015804354437, + "loss": 1.6861, + "step": 570 + }, + { + "epoch": 0.55, + "grad_norm": 1.5029468536376953, + "learning_rate": 0.00023517855635221896, + "loss": 1.7885, + "step": 571 + }, + { + "epoch": 0.55, + "grad_norm": 1.4606847763061523, + "learning_rate": 0.00023511683447842514, + "loss": 1.4353, + "step": 572 + }, + { + "epoch": 0.55, + "grad_norm": 1.4862562417984009, + "learning_rate": 0.00023505499248948898, + "loss": 1.7917, + "step": 573 + }, + { + "epoch": 0.55, + "grad_norm": 1.3696837425231934, + "learning_rate": 0.00023499303045286751, + "loss": 1.2735, + "step": 574 + }, + { + "epoch": 0.56, + "grad_norm": 1.4719946384429932, + "learning_rate": 0.00023493094843614877, + "loss": 1.867, + "step": 575 + }, + { + "epoch": 0.56, + "grad_norm": 1.2641010284423828, + "learning_rate": 0.0002348687465070517, + "loss": 1.3491, + "step": 576 + }, + { + "epoch": 0.56, + "grad_norm": 1.2275723218917847, + "learning_rate": 0.00023480642473342592, + "loss": 1.5747, + "step": 577 + }, + { + "epoch": 0.56, + "grad_norm": 1.214805245399475, + "learning_rate": 0.00023474398318325192, + "loss": 1.5349, + "step": 578 + }, + { + "epoch": 0.56, + "grad_norm": 1.4233876466751099, + "learning_rate": 0.0002346814219246407, + "loss": 2.0182, + "step": 579 + }, + { + "epoch": 0.56, + "grad_norm": 1.363861322402954, + "learning_rate": 0.00023461874102583395, + "loss": 1.8423, + "step": 580 + }, + { + "epoch": 0.56, + "grad_norm": 1.2210170030593872, + "learning_rate": 0.00023455594055520385, + "loss": 1.1782, + "step": 581 + }, + { + "epoch": 0.56, + "grad_norm": 1.3371609449386597, + "learning_rate": 0.0002344930205812529, + "loss": 1.7994, + "step": 582 + }, + { + "epoch": 0.56, + "grad_norm": 1.424874186515808, + "learning_rate": 0.00023442998117261416, + "loss": 1.637, + "step": 583 + }, + { + "epoch": 0.56, + "grad_norm": 1.1814637184143066, + "learning_rate": 0.00023436682239805074, + "loss": 1.334, + "step": 584 + }, + { + "epoch": 0.56, + "grad_norm": 1.3004404306411743, + "learning_rate": 0.00023430354432645606, + "loss": 1.1703, + "step": 585 + }, + { + "epoch": 0.57, + "grad_norm": 1.332945704460144, + "learning_rate": 0.00023424014702685375, + "loss": 1.3147, + "step": 586 + }, + { + "epoch": 0.57, + "grad_norm": 1.5192474126815796, + "learning_rate": 0.00023417663056839733, + "loss": 1.2995, + "step": 587 + }, + { + "epoch": 0.57, + "grad_norm": 1.5328348875045776, + "learning_rate": 0.00023411299502037043, + "loss": 1.4266, + "step": 588 + }, + { + "epoch": 0.57, + "grad_norm": 1.7186061143875122, + "learning_rate": 0.00023404924045218652, + "loss": 1.2097, + "step": 589 + }, + { + "epoch": 0.57, + "grad_norm": 1.892555832862854, + "learning_rate": 0.0002339853669333889, + "loss": 1.9445, + "step": 590 + }, + { + "epoch": 0.57, + "grad_norm": 1.4742870330810547, + "learning_rate": 0.0002339213745336506, + "loss": 1.4, + "step": 591 + }, + { + "epoch": 0.57, + "grad_norm": 1.3957617282867432, + "learning_rate": 0.00023385726332277441, + "loss": 1.0397, + "step": 592 + }, + { + "epoch": 0.57, + "grad_norm": 1.274652123451233, + "learning_rate": 0.00023379303337069262, + "loss": 1.5659, + "step": 593 + }, + { + "epoch": 0.57, + "grad_norm": 2.0992603302001953, + "learning_rate": 0.00023372868474746714, + "loss": 1.8078, + "step": 594 + }, + { + "epoch": 0.57, + "grad_norm": 1.4869073629379272, + "learning_rate": 0.00023366421752328922, + "loss": 1.3452, + "step": 595 + }, + { + "epoch": 0.58, + "grad_norm": 1.4044032096862793, + "learning_rate": 0.00023359963176847957, + "loss": 1.3954, + "step": 596 + }, + { + "epoch": 0.58, + "grad_norm": 1.3737423419952393, + "learning_rate": 0.0002335349275534881, + "loss": 1.2106, + "step": 597 + }, + { + "epoch": 0.58, + "grad_norm": 2.022343873977661, + "learning_rate": 0.00023347010494889406, + "loss": 1.8948, + "step": 598 + }, + { + "epoch": 0.58, + "grad_norm": 1.3906071186065674, + "learning_rate": 0.00023340516402540573, + "loss": 1.5172, + "step": 599 + }, + { + "epoch": 0.58, + "grad_norm": 1.825240135192871, + "learning_rate": 0.0002333401048538605, + "loss": 1.0034, + "step": 600 + }, + { + "epoch": 0.58, + "grad_norm": 1.4067100286483765, + "learning_rate": 0.00023327492750522475, + "loss": 1.7208, + "step": 601 + }, + { + "epoch": 0.58, + "grad_norm": 1.222434401512146, + "learning_rate": 0.0002332096320505937, + "loss": 1.1459, + "step": 602 + }, + { + "epoch": 0.58, + "grad_norm": 1.7354257106781006, + "learning_rate": 0.00023314421856119154, + "loss": 1.5793, + "step": 603 + }, + { + "epoch": 0.58, + "grad_norm": 1.384118914604187, + "learning_rate": 0.00023307868710837105, + "loss": 1.0095, + "step": 604 + }, + { + "epoch": 0.58, + "grad_norm": 1.459686040878296, + "learning_rate": 0.00023301303776361378, + "loss": 1.528, + "step": 605 + }, + { + "epoch": 0.58, + "grad_norm": 1.4759612083435059, + "learning_rate": 0.0002329472705985299, + "loss": 1.378, + "step": 606 + }, + { + "epoch": 0.59, + "grad_norm": 1.5465447902679443, + "learning_rate": 0.00023288138568485802, + "loss": 1.5053, + "step": 607 + }, + { + "epoch": 0.59, + "grad_norm": 1.2388064861297607, + "learning_rate": 0.0002328153830944652, + "loss": 1.2588, + "step": 608 + }, + { + "epoch": 0.59, + "grad_norm": 1.4053964614868164, + "learning_rate": 0.00023274926289934688, + "loss": 1.014, + "step": 609 + }, + { + "epoch": 0.59, + "grad_norm": 1.5368363857269287, + "learning_rate": 0.00023268302517162688, + "loss": 1.6706, + "step": 610 + }, + { + "epoch": 0.59, + "grad_norm": 1.3434786796569824, + "learning_rate": 0.00023261666998355702, + "loss": 1.1686, + "step": 611 + }, + { + "epoch": 0.59, + "grad_norm": 1.3576295375823975, + "learning_rate": 0.00023255019740751743, + "loss": 1.5393, + "step": 612 + }, + { + "epoch": 0.59, + "grad_norm": 1.285962700843811, + "learning_rate": 0.0002324836075160162, + "loss": 1.7462, + "step": 613 + }, + { + "epoch": 0.59, + "grad_norm": 2.13918399810791, + "learning_rate": 0.0002324169003816894, + "loss": 1.4822, + "step": 614 + }, + { + "epoch": 0.59, + "grad_norm": 1.7046306133270264, + "learning_rate": 0.0002323500760773009, + "loss": 1.1588, + "step": 615 + }, + { + "epoch": 0.59, + "grad_norm": 1.6164623498916626, + "learning_rate": 0.00023228313467574263, + "loss": 1.9175, + "step": 616 + }, + { + "epoch": 0.6, + "grad_norm": 1.4854605197906494, + "learning_rate": 0.00023221607625003402, + "loss": 1.7528, + "step": 617 + }, + { + "epoch": 0.6, + "grad_norm": 1.6400645971298218, + "learning_rate": 0.00023214890087332218, + "loss": 1.9289, + "step": 618 + }, + { + "epoch": 0.6, + "grad_norm": 1.5701171159744263, + "learning_rate": 0.00023208160861888186, + "loss": 1.3207, + "step": 619 + }, + { + "epoch": 0.6, + "grad_norm": 1.5051968097686768, + "learning_rate": 0.00023201419956011526, + "loss": 1.4028, + "step": 620 + }, + { + "epoch": 0.6, + "grad_norm": 1.1874357461929321, + "learning_rate": 0.000231946673770552, + "loss": 1.3708, + "step": 621 + }, + { + "epoch": 0.6, + "grad_norm": 1.3110204935073853, + "learning_rate": 0.00023187903132384903, + "loss": 1.3477, + "step": 622 + }, + { + "epoch": 0.6, + "eval_loss": 1.6304495334625244, + "eval_runtime": 28.33, + "eval_samples_per_second": 3.0, + "eval_steps_per_second": 1.518, + "step": 622 + }, + { + "epoch": 0.6, + "grad_norm": 1.3583319187164307, + "learning_rate": 0.00023181127229379058, + "loss": 1.6669, + "step": 623 + }, + { + "epoch": 0.6, + "grad_norm": 1.400608777999878, + "learning_rate": 0.00023174339675428792, + "loss": 1.1634, + "step": 624 + }, + { + "epoch": 0.6, + "grad_norm": 1.403623342514038, + "learning_rate": 0.0002316754047793796, + "loss": 1.4847, + "step": 625 + }, + { + "epoch": 0.6, + "grad_norm": 1.511208176612854, + "learning_rate": 0.00023160729644323099, + "loss": 1.3538, + "step": 626 + }, + { + "epoch": 0.61, + "grad_norm": 1.1673988103866577, + "learning_rate": 0.00023153907182013456, + "loss": 1.1773, + "step": 627 + }, + { + "epoch": 0.61, + "grad_norm": 1.5797700881958008, + "learning_rate": 0.00023147073098450953, + "loss": 1.6715, + "step": 628 + }, + { + "epoch": 0.61, + "grad_norm": 1.7258118391036987, + "learning_rate": 0.00023140227401090185, + "loss": 1.9853, + "step": 629 + }, + { + "epoch": 0.61, + "grad_norm": 1.3571829795837402, + "learning_rate": 0.00023133370097398425, + "loss": 1.624, + "step": 630 + }, + { + "epoch": 0.61, + "grad_norm": 1.462410807609558, + "learning_rate": 0.00023126501194855597, + "loss": 1.0793, + "step": 631 + }, + { + "epoch": 0.61, + "grad_norm": 1.2532066106796265, + "learning_rate": 0.00023119620700954286, + "loss": 1.3601, + "step": 632 + }, + { + "epoch": 0.61, + "grad_norm": 2.077949285507202, + "learning_rate": 0.0002311272862319971, + "loss": 1.9824, + "step": 633 + }, + { + "epoch": 0.61, + "grad_norm": 1.5285255908966064, + "learning_rate": 0.00023105824969109736, + "loss": 1.3991, + "step": 634 + }, + { + "epoch": 0.61, + "grad_norm": 1.544295310974121, + "learning_rate": 0.0002309890974621484, + "loss": 1.8974, + "step": 635 + }, + { + "epoch": 0.61, + "grad_norm": 1.4555768966674805, + "learning_rate": 0.00023091982962058138, + "loss": 1.4607, + "step": 636 + }, + { + "epoch": 0.61, + "grad_norm": 1.2902973890304565, + "learning_rate": 0.00023085044624195344, + "loss": 1.7837, + "step": 637 + }, + { + "epoch": 0.62, + "grad_norm": 1.6322664022445679, + "learning_rate": 0.0002307809474019478, + "loss": 1.3603, + "step": 638 + }, + { + "epoch": 0.62, + "grad_norm": 1.295804738998413, + "learning_rate": 0.00023071133317637357, + "loss": 1.3386, + "step": 639 + }, + { + "epoch": 0.62, + "grad_norm": 1.3429510593414307, + "learning_rate": 0.0002306416036411658, + "loss": 1.2507, + "step": 640 + }, + { + "epoch": 0.62, + "grad_norm": 1.6754704713821411, + "learning_rate": 0.0002305717588723852, + "loss": 1.4455, + "step": 641 + }, + { + "epoch": 0.62, + "grad_norm": 1.1602115631103516, + "learning_rate": 0.00023050179894621835, + "loss": 1.6829, + "step": 642 + }, + { + "epoch": 0.62, + "grad_norm": 1.412318468093872, + "learning_rate": 0.00023043172393897728, + "loss": 1.5495, + "step": 643 + }, + { + "epoch": 0.62, + "grad_norm": 1.4912530183792114, + "learning_rate": 0.00023036153392709963, + "loss": 1.6909, + "step": 644 + }, + { + "epoch": 0.62, + "grad_norm": 1.4303503036499023, + "learning_rate": 0.0002302912289871485, + "loss": 1.5794, + "step": 645 + }, + { + "epoch": 0.62, + "grad_norm": 1.3841469287872314, + "learning_rate": 0.00023022080919581227, + "loss": 1.9355, + "step": 646 + }, + { + "epoch": 0.62, + "grad_norm": 1.5403145551681519, + "learning_rate": 0.00023015027462990474, + "loss": 1.6106, + "step": 647 + }, + { + "epoch": 0.63, + "grad_norm": 1.370869517326355, + "learning_rate": 0.00023007962536636474, + "loss": 1.4042, + "step": 648 + }, + { + "epoch": 0.63, + "grad_norm": 1.6991840600967407, + "learning_rate": 0.00023000886148225635, + "loss": 1.5327, + "step": 649 + }, + { + "epoch": 0.63, + "grad_norm": 1.7101843357086182, + "learning_rate": 0.0002299379830547686, + "loss": 1.4779, + "step": 650 + }, + { + "epoch": 0.63, + "grad_norm": 1.3180490732192993, + "learning_rate": 0.00022986699016121545, + "loss": 1.5081, + "step": 651 + }, + { + "epoch": 0.63, + "grad_norm": 1.180501937866211, + "learning_rate": 0.0002297958828790358, + "loss": 1.0716, + "step": 652 + }, + { + "epoch": 0.63, + "grad_norm": 1.322432279586792, + "learning_rate": 0.00022972466128579327, + "loss": 1.277, + "step": 653 + }, + { + "epoch": 0.63, + "grad_norm": 1.6292730569839478, + "learning_rate": 0.00022965332545917623, + "loss": 1.631, + "step": 654 + }, + { + "epoch": 0.63, + "grad_norm": 1.7111241817474365, + "learning_rate": 0.00022958187547699748, + "loss": 1.3783, + "step": 655 + }, + { + "epoch": 0.63, + "grad_norm": 1.3672375679016113, + "learning_rate": 0.0002295103114171946, + "loss": 1.0966, + "step": 656 + }, + { + "epoch": 0.63, + "grad_norm": 1.34515380859375, + "learning_rate": 0.00022943863335782937, + "loss": 1.1398, + "step": 657 + }, + { + "epoch": 0.64, + "grad_norm": 1.2631828784942627, + "learning_rate": 0.00022936684137708806, + "loss": 1.3126, + "step": 658 + }, + { + "epoch": 0.64, + "grad_norm": 1.538662314414978, + "learning_rate": 0.00022929493555328117, + "loss": 1.7226, + "step": 659 + }, + { + "epoch": 0.64, + "grad_norm": 1.458886981010437, + "learning_rate": 0.00022922291596484337, + "loss": 1.5284, + "step": 660 + }, + { + "epoch": 0.64, + "grad_norm": 1.525400996208191, + "learning_rate": 0.00022915078269033346, + "loss": 1.3691, + "step": 661 + }, + { + "epoch": 0.64, + "grad_norm": 1.4852648973464966, + "learning_rate": 0.00022907853580843415, + "loss": 1.6369, + "step": 662 + }, + { + "epoch": 0.64, + "grad_norm": 1.523781418800354, + "learning_rate": 0.0002290061753979522, + "loss": 1.4643, + "step": 663 + }, + { + "epoch": 0.64, + "grad_norm": 1.3667558431625366, + "learning_rate": 0.00022893370153781817, + "loss": 1.0672, + "step": 664 + }, + { + "epoch": 0.64, + "grad_norm": 1.8203182220458984, + "learning_rate": 0.00022886111430708627, + "loss": 2.1031, + "step": 665 + }, + { + "epoch": 0.64, + "grad_norm": 1.458509922027588, + "learning_rate": 0.00022878841378493452, + "loss": 1.5407, + "step": 666 + }, + { + "epoch": 0.64, + "grad_norm": 1.1977659463882446, + "learning_rate": 0.0002287156000506644, + "loss": 1.5075, + "step": 667 + }, + { + "epoch": 0.64, + "grad_norm": 1.1007845401763916, + "learning_rate": 0.00022864267318370096, + "loss": 0.9576, + "step": 668 + }, + { + "epoch": 0.65, + "grad_norm": 1.6353801488876343, + "learning_rate": 0.00022856963326359266, + "loss": 1.7101, + "step": 669 + }, + { + "epoch": 0.65, + "grad_norm": 1.335712194442749, + "learning_rate": 0.00022849648037001115, + "loss": 1.0678, + "step": 670 + }, + { + "epoch": 0.65, + "grad_norm": 1.375508189201355, + "learning_rate": 0.0002284232145827515, + "loss": 1.4913, + "step": 671 + }, + { + "epoch": 0.65, + "grad_norm": 1.211701512336731, + "learning_rate": 0.00022834983598173182, + "loss": 1.0438, + "step": 672 + }, + { + "epoch": 0.65, + "grad_norm": 1.7795249223709106, + "learning_rate": 0.00022827634464699323, + "loss": 1.9137, + "step": 673 + }, + { + "epoch": 0.65, + "grad_norm": 1.4882405996322632, + "learning_rate": 0.00022820274065869992, + "loss": 1.5704, + "step": 674 + }, + { + "epoch": 0.65, + "grad_norm": 1.3354179859161377, + "learning_rate": 0.00022812902409713893, + "loss": 1.3485, + "step": 675 + }, + { + "epoch": 0.65, + "grad_norm": 1.5397255420684814, + "learning_rate": 0.00022805519504272005, + "loss": 2.0343, + "step": 676 + }, + { + "epoch": 0.65, + "grad_norm": 1.4044562578201294, + "learning_rate": 0.00022798125357597582, + "loss": 1.7446, + "step": 677 + }, + { + "epoch": 0.65, + "grad_norm": 1.284012794494629, + "learning_rate": 0.00022790719977756142, + "loss": 0.9859, + "step": 678 + }, + { + "epoch": 0.66, + "grad_norm": 1.879234790802002, + "learning_rate": 0.00022783303372825447, + "loss": 1.7374, + "step": 679 + }, + { + "epoch": 0.66, + "grad_norm": 1.278829574584961, + "learning_rate": 0.00022775875550895517, + "loss": 1.4937, + "step": 680 + }, + { + "epoch": 0.66, + "grad_norm": 1.3762723207473755, + "learning_rate": 0.00022768436520068596, + "loss": 1.1091, + "step": 681 + }, + { + "epoch": 0.66, + "grad_norm": 1.4809486865997314, + "learning_rate": 0.00022760986288459156, + "loss": 1.6401, + "step": 682 + }, + { + "epoch": 0.66, + "grad_norm": 1.4666237831115723, + "learning_rate": 0.00022753524864193892, + "loss": 1.7877, + "step": 683 + }, + { + "epoch": 0.66, + "grad_norm": 1.3258100748062134, + "learning_rate": 0.00022746052255411708, + "loss": 1.2407, + "step": 684 + }, + { + "epoch": 0.66, + "grad_norm": 1.178125023841858, + "learning_rate": 0.000227385684702637, + "loss": 1.4674, + "step": 685 + }, + { + "epoch": 0.66, + "grad_norm": 1.3732388019561768, + "learning_rate": 0.00022731073516913164, + "loss": 1.4729, + "step": 686 + }, + { + "epoch": 0.66, + "grad_norm": 1.357108235359192, + "learning_rate": 0.00022723567403535572, + "loss": 1.428, + "step": 687 + }, + { + "epoch": 0.66, + "grad_norm": 1.293455719947815, + "learning_rate": 0.00022716050138318574, + "loss": 1.3115, + "step": 688 + }, + { + "epoch": 0.67, + "grad_norm": 1.1836376190185547, + "learning_rate": 0.00022708521729461985, + "loss": 1.2031, + "step": 689 + }, + { + "epoch": 0.67, + "grad_norm": 1.3905551433563232, + "learning_rate": 0.00022700982185177765, + "loss": 1.8465, + "step": 690 + }, + { + "epoch": 0.67, + "grad_norm": 1.2469663619995117, + "learning_rate": 0.00022693431513690033, + "loss": 1.5076, + "step": 691 + }, + { + "epoch": 0.67, + "grad_norm": 1.2081923484802246, + "learning_rate": 0.00022685869723235042, + "loss": 1.5242, + "step": 692 + }, + { + "epoch": 0.67, + "grad_norm": 1.2785173654556274, + "learning_rate": 0.00022678296822061173, + "loss": 1.3025, + "step": 693 + }, + { + "epoch": 0.67, + "grad_norm": 1.3872441053390503, + "learning_rate": 0.00022670712818428923, + "loss": 1.7145, + "step": 694 + }, + { + "epoch": 0.67, + "grad_norm": 1.1730290651321411, + "learning_rate": 0.00022663117720610903, + "loss": 1.1832, + "step": 695 + }, + { + "epoch": 0.67, + "grad_norm": 1.0906507968902588, + "learning_rate": 0.00022655511536891825, + "loss": 1.378, + "step": 696 + }, + { + "epoch": 0.67, + "grad_norm": 1.3207216262817383, + "learning_rate": 0.00022647894275568492, + "loss": 1.2161, + "step": 697 + }, + { + "epoch": 0.67, + "grad_norm": 1.4555737972259521, + "learning_rate": 0.00022640265944949797, + "loss": 1.4499, + "step": 698 + }, + { + "epoch": 0.67, + "grad_norm": 1.3717491626739502, + "learning_rate": 0.000226326265533567, + "loss": 1.7293, + "step": 699 + }, + { + "epoch": 0.68, + "grad_norm": 1.2536697387695312, + "learning_rate": 0.00022624976109122225, + "loss": 1.4178, + "step": 700 + }, + { + "epoch": 0.68, + "grad_norm": 1.550447940826416, + "learning_rate": 0.0002261731462059146, + "loss": 1.4674, + "step": 701 + }, + { + "epoch": 0.68, + "grad_norm": 1.2245453596115112, + "learning_rate": 0.0002260964209612153, + "loss": 1.2625, + "step": 702 + }, + { + "epoch": 0.68, + "grad_norm": 1.2388466596603394, + "learning_rate": 0.0002260195854408161, + "loss": 1.6174, + "step": 703 + }, + { + "epoch": 0.68, + "grad_norm": 1.4022188186645508, + "learning_rate": 0.00022594263972852897, + "loss": 1.0605, + "step": 704 + }, + { + "epoch": 0.68, + "grad_norm": 1.275978922843933, + "learning_rate": 0.00022586558390828604, + "loss": 1.3885, + "step": 705 + }, + { + "epoch": 0.68, + "grad_norm": 1.493241786956787, + "learning_rate": 0.0002257884180641396, + "loss": 1.5634, + "step": 706 + }, + { + "epoch": 0.68, + "grad_norm": 1.1711384057998657, + "learning_rate": 0.00022571114228026198, + "loss": 1.2732, + "step": 707 + }, + { + "epoch": 0.68, + "grad_norm": 1.2927380800247192, + "learning_rate": 0.0002256337566409454, + "loss": 1.4771, + "step": 708 + }, + { + "epoch": 0.68, + "grad_norm": 1.2375295162200928, + "learning_rate": 0.0002255562612306018, + "loss": 0.9908, + "step": 709 + }, + { + "epoch": 0.69, + "grad_norm": 1.1654285192489624, + "learning_rate": 0.00022547865613376308, + "loss": 0.9158, + "step": 710 + }, + { + "epoch": 0.69, + "grad_norm": 1.694615125656128, + "learning_rate": 0.0002254009414350806, + "loss": 1.517, + "step": 711 + }, + { + "epoch": 0.69, + "grad_norm": 1.110138177871704, + "learning_rate": 0.0002253231172193254, + "loss": 1.2596, + "step": 712 + }, + { + "epoch": 0.69, + "grad_norm": 1.5078182220458984, + "learning_rate": 0.00022524518357138788, + "loss": 1.6952, + "step": 713 + }, + { + "epoch": 0.69, + "grad_norm": 1.7366492748260498, + "learning_rate": 0.00022516714057627783, + "loss": 1.8556, + "step": 714 + }, + { + "epoch": 0.69, + "grad_norm": 1.4952809810638428, + "learning_rate": 0.0002250889883191244, + "loss": 1.5153, + "step": 715 + }, + { + "epoch": 0.69, + "grad_norm": 1.2906392812728882, + "learning_rate": 0.00022501072688517576, + "loss": 1.0992, + "step": 716 + }, + { + "epoch": 0.69, + "grad_norm": 1.26261568069458, + "learning_rate": 0.0002249323563597994, + "loss": 1.1001, + "step": 717 + }, + { + "epoch": 0.69, + "grad_norm": 1.437803030014038, + "learning_rate": 0.00022485387682848155, + "loss": 1.519, + "step": 718 + }, + { + "epoch": 0.69, + "grad_norm": 1.454239845275879, + "learning_rate": 0.00022477528837682755, + "loss": 1.258, + "step": 719 + }, + { + "epoch": 0.69, + "grad_norm": 1.403212547302246, + "learning_rate": 0.00022469659109056143, + "loss": 1.6505, + "step": 720 + }, + { + "epoch": 0.7, + "grad_norm": 1.2380965948104858, + "learning_rate": 0.000224617785055526, + "loss": 1.3611, + "step": 721 + }, + { + "epoch": 0.7, + "grad_norm": 1.41748046875, + "learning_rate": 0.00022453887035768266, + "loss": 1.8791, + "step": 722 + }, + { + "epoch": 0.7, + "grad_norm": 1.4731191396713257, + "learning_rate": 0.00022445984708311132, + "loss": 1.3579, + "step": 723 + }, + { + "epoch": 0.7, + "grad_norm": 1.688100814819336, + "learning_rate": 0.00022438071531801038, + "loss": 1.5332, + "step": 724 + }, + { + "epoch": 0.7, + "grad_norm": 1.7024562358856201, + "learning_rate": 0.00022430147514869653, + "loss": 1.7091, + "step": 725 + }, + { + "epoch": 0.7, + "grad_norm": 1.3762348890304565, + "learning_rate": 0.00022422212666160478, + "loss": 1.7395, + "step": 726 + }, + { + "epoch": 0.7, + "grad_norm": 1.4019901752471924, + "learning_rate": 0.0002241426699432882, + "loss": 1.0866, + "step": 727 + }, + { + "epoch": 0.7, + "grad_norm": 1.861675500869751, + "learning_rate": 0.00022406310508041796, + "loss": 2.1158, + "step": 728 + }, + { + "epoch": 0.7, + "grad_norm": 1.3458441495895386, + "learning_rate": 0.00022398343215978322, + "loss": 1.4403, + "step": 729 + }, + { + "epoch": 0.7, + "grad_norm": 1.5659409761428833, + "learning_rate": 0.000223903651268291, + "loss": 1.7634, + "step": 730 + }, + { + "epoch": 0.71, + "grad_norm": 1.2884780168533325, + "learning_rate": 0.00022382376249296598, + "loss": 1.5543, + "step": 731 + }, + { + "epoch": 0.71, + "grad_norm": 1.0735148191452026, + "learning_rate": 0.00022374376592095076, + "loss": 1.2122, + "step": 732 + }, + { + "epoch": 0.71, + "grad_norm": 1.3894761800765991, + "learning_rate": 0.00022366366163950523, + "loss": 1.7313, + "step": 733 + }, + { + "epoch": 0.71, + "grad_norm": 1.186671257019043, + "learning_rate": 0.00022358344973600705, + "loss": 0.9415, + "step": 734 + }, + { + "epoch": 0.71, + "grad_norm": 1.2778778076171875, + "learning_rate": 0.00022350313029795112, + "loss": 1.871, + "step": 735 + }, + { + "epoch": 0.71, + "grad_norm": 1.7073333263397217, + "learning_rate": 0.0002234227034129496, + "loss": 1.7262, + "step": 736 + }, + { + "epoch": 0.71, + "grad_norm": 1.0448979139328003, + "learning_rate": 0.00022334216916873196, + "loss": 0.8838, + "step": 737 + }, + { + "epoch": 0.71, + "grad_norm": 2.0367178916931152, + "learning_rate": 0.00022326152765314472, + "loss": 1.5082, + "step": 738 + }, + { + "epoch": 0.71, + "grad_norm": 1.5965039730072021, + "learning_rate": 0.0002231807789541515, + "loss": 1.6947, + "step": 739 + }, + { + "epoch": 0.71, + "grad_norm": 1.2586615085601807, + "learning_rate": 0.00022309992315983263, + "loss": 1.2724, + "step": 740 + }, + { + "epoch": 0.72, + "grad_norm": 1.498313546180725, + "learning_rate": 0.0002230189603583855, + "loss": 1.6083, + "step": 741 + }, + { + "epoch": 0.72, + "grad_norm": 1.2010595798492432, + "learning_rate": 0.00022293789063812403, + "loss": 1.0541, + "step": 742 + }, + { + "epoch": 0.72, + "grad_norm": 1.2845265865325928, + "learning_rate": 0.00022285671408747888, + "loss": 0.9787, + "step": 743 + }, + { + "epoch": 0.72, + "grad_norm": 1.5691664218902588, + "learning_rate": 0.00022277543079499723, + "loss": 1.9072, + "step": 744 + }, + { + "epoch": 0.72, + "grad_norm": 1.8554484844207764, + "learning_rate": 0.00022269404084934269, + "loss": 1.6881, + "step": 745 + }, + { + "epoch": 0.72, + "grad_norm": 1.4981218576431274, + "learning_rate": 0.00022261254433929514, + "loss": 1.2664, + "step": 746 + }, + { + "epoch": 0.72, + "grad_norm": 1.5050944089889526, + "learning_rate": 0.00022253094135375075, + "loss": 1.3197, + "step": 747 + }, + { + "epoch": 0.72, + "grad_norm": 1.4656778573989868, + "learning_rate": 0.0002224492319817219, + "loss": 1.5641, + "step": 748 + }, + { + "epoch": 0.72, + "grad_norm": 1.433406114578247, + "learning_rate": 0.0002223674163123369, + "loss": 1.6133, + "step": 749 + }, + { + "epoch": 0.72, + "grad_norm": 1.4999152421951294, + "learning_rate": 0.00022228549443484004, + "loss": 1.1804, + "step": 750 + }, + { + "epoch": 0.72, + "grad_norm": 1.386148452758789, + "learning_rate": 0.00022220346643859153, + "loss": 1.4288, + "step": 751 + }, + { + "epoch": 0.73, + "grad_norm": 1.305163025856018, + "learning_rate": 0.00022212133241306722, + "loss": 0.9577, + "step": 752 + }, + { + "epoch": 0.73, + "grad_norm": 1.3146147727966309, + "learning_rate": 0.00022203909244785874, + "loss": 1.4475, + "step": 753 + }, + { + "epoch": 0.73, + "grad_norm": 1.5211604833602905, + "learning_rate": 0.00022195674663267324, + "loss": 1.7606, + "step": 754 + }, + { + "epoch": 0.73, + "grad_norm": 1.4717007875442505, + "learning_rate": 0.0002218742950573332, + "loss": 1.638, + "step": 755 + }, + { + "epoch": 0.73, + "grad_norm": 1.1527776718139648, + "learning_rate": 0.0002217917378117767, + "loss": 1.3322, + "step": 756 + }, + { + "epoch": 0.73, + "grad_norm": 1.599092721939087, + "learning_rate": 0.00022170907498605686, + "loss": 1.6299, + "step": 757 + }, + { + "epoch": 0.73, + "grad_norm": 1.2002819776535034, + "learning_rate": 0.0002216263066703421, + "loss": 1.3424, + "step": 758 + }, + { + "epoch": 0.73, + "grad_norm": 1.3727798461914062, + "learning_rate": 0.0002215434329549159, + "loss": 1.7718, + "step": 759 + }, + { + "epoch": 0.73, + "grad_norm": 1.1956920623779297, + "learning_rate": 0.0002214604539301767, + "loss": 0.7773, + "step": 760 + }, + { + "epoch": 0.73, + "grad_norm": 1.3804597854614258, + "learning_rate": 0.0002213773696866377, + "loss": 1.3221, + "step": 761 + }, + { + "epoch": 0.74, + "grad_norm": 1.134540319442749, + "learning_rate": 0.00022129418031492705, + "loss": 1.1406, + "step": 762 + }, + { + "epoch": 0.74, + "grad_norm": 1.599867582321167, + "learning_rate": 0.00022121088590578743, + "loss": 1.5812, + "step": 763 + }, + { + "epoch": 0.74, + "grad_norm": 1.1916415691375732, + "learning_rate": 0.0002211274865500762, + "loss": 1.1639, + "step": 764 + }, + { + "epoch": 0.74, + "grad_norm": 1.4034931659698486, + "learning_rate": 0.00022104398233876516, + "loss": 1.6485, + "step": 765 + }, + { + "epoch": 0.74, + "grad_norm": 1.217153549194336, + "learning_rate": 0.0002209603733629404, + "loss": 1.2622, + "step": 766 + }, + { + "epoch": 0.74, + "grad_norm": 1.2423417568206787, + "learning_rate": 0.00022087665971380244, + "loss": 1.3384, + "step": 767 + }, + { + "epoch": 0.74, + "grad_norm": 1.2787408828735352, + "learning_rate": 0.00022079284148266587, + "loss": 1.3278, + "step": 768 + }, + { + "epoch": 0.74, + "grad_norm": 1.0672158002853394, + "learning_rate": 0.0002207089187609594, + "loss": 1.4183, + "step": 769 + }, + { + "epoch": 0.74, + "grad_norm": 1.4464133977890015, + "learning_rate": 0.00022062489164022565, + "loss": 2.0707, + "step": 770 + }, + { + "epoch": 0.74, + "grad_norm": 1.4786409139633179, + "learning_rate": 0.00022054076021212128, + "loss": 1.3817, + "step": 771 + }, + { + "epoch": 0.75, + "grad_norm": 1.325783133506775, + "learning_rate": 0.00022045652456841651, + "loss": 1.2509, + "step": 772 + }, + { + "epoch": 0.75, + "grad_norm": 1.3716827630996704, + "learning_rate": 0.0002203721848009954, + "loss": 1.5183, + "step": 773 + }, + { + "epoch": 0.75, + "grad_norm": 1.4279142618179321, + "learning_rate": 0.00022028774100185555, + "loss": 2.0254, + "step": 774 + }, + { + "epoch": 0.75, + "grad_norm": 1.3607648611068726, + "learning_rate": 0.00022020319326310805, + "loss": 1.5698, + "step": 775 + }, + { + "epoch": 0.75, + "grad_norm": 1.3437821865081787, + "learning_rate": 0.00022011854167697725, + "loss": 1.5756, + "step": 776 + }, + { + "epoch": 0.75, + "grad_norm": 1.1076477766036987, + "learning_rate": 0.00022003378633580098, + "loss": 1.0518, + "step": 777 + }, + { + "epoch": 0.75, + "grad_norm": 1.1801143884658813, + "learning_rate": 0.00021994892733203007, + "loss": 1.3424, + "step": 778 + }, + { + "epoch": 0.75, + "grad_norm": 1.5795953273773193, + "learning_rate": 0.00021986396475822848, + "loss": 1.5052, + "step": 779 + }, + { + "epoch": 0.75, + "grad_norm": 1.3943787813186646, + "learning_rate": 0.00021977889870707318, + "loss": 1.4324, + "step": 780 + }, + { + "epoch": 0.75, + "grad_norm": 1.5282405614852905, + "learning_rate": 0.00021969372927135397, + "loss": 1.3772, + "step": 781 + }, + { + "epoch": 0.75, + "grad_norm": 1.6440399885177612, + "learning_rate": 0.00021960845654397349, + "loss": 1.1698, + "step": 782 + }, + { + "epoch": 0.76, + "grad_norm": 1.5308705568313599, + "learning_rate": 0.0002195230806179469, + "loss": 1.4512, + "step": 783 + }, + { + "epoch": 0.76, + "grad_norm": 1.3149093389511108, + "learning_rate": 0.00021943760158640212, + "loss": 1.1414, + "step": 784 + }, + { + "epoch": 0.76, + "grad_norm": 2.2626311779022217, + "learning_rate": 0.00021935201954257938, + "loss": 2.0695, + "step": 785 + }, + { + "epoch": 0.76, + "grad_norm": 1.4505892992019653, + "learning_rate": 0.0002192663345798314, + "loss": 1.6039, + "step": 786 + }, + { + "epoch": 0.76, + "grad_norm": 1.3184144496917725, + "learning_rate": 0.00021918054679162302, + "loss": 1.2249, + "step": 787 + }, + { + "epoch": 0.76, + "grad_norm": 1.313861608505249, + "learning_rate": 0.00021909465627153142, + "loss": 1.5637, + "step": 788 + }, + { + "epoch": 0.76, + "grad_norm": 1.7990531921386719, + "learning_rate": 0.00021900866311324568, + "loss": 1.7634, + "step": 789 + }, + { + "epoch": 0.76, + "grad_norm": 1.6349966526031494, + "learning_rate": 0.00021892256741056694, + "loss": 1.9673, + "step": 790 + }, + { + "epoch": 0.76, + "grad_norm": 1.3929771184921265, + "learning_rate": 0.00021883636925740813, + "loss": 1.3263, + "step": 791 + }, + { + "epoch": 0.76, + "grad_norm": 1.237173318862915, + "learning_rate": 0.00021875006874779397, + "loss": 1.5712, + "step": 792 + }, + { + "epoch": 0.77, + "grad_norm": 1.4733974933624268, + "learning_rate": 0.00021866366597586082, + "loss": 1.3899, + "step": 793 + }, + { + "epoch": 0.77, + "grad_norm": 1.4081130027770996, + "learning_rate": 0.0002185771610358566, + "loss": 1.6359, + "step": 794 + }, + { + "epoch": 0.77, + "grad_norm": 1.3193763494491577, + "learning_rate": 0.00021849055402214064, + "loss": 1.3589, + "step": 795 + }, + { + "epoch": 0.77, + "grad_norm": 1.4265916347503662, + "learning_rate": 0.00021840384502918363, + "loss": 1.4326, + "step": 796 + }, + { + "epoch": 0.77, + "grad_norm": 1.3795539140701294, + "learning_rate": 0.00021831703415156752, + "loss": 1.2139, + "step": 797 + }, + { + "epoch": 0.77, + "grad_norm": 1.7029786109924316, + "learning_rate": 0.00021823012148398542, + "loss": 1.5998, + "step": 798 + }, + { + "epoch": 0.77, + "grad_norm": 1.5365678071975708, + "learning_rate": 0.00021814310712124134, + "loss": 1.9567, + "step": 799 + }, + { + "epoch": 0.77, + "grad_norm": 1.8077536821365356, + "learning_rate": 0.00021805599115825038, + "loss": 1.8023, + "step": 800 + }, + { + "epoch": 0.77, + "grad_norm": 1.4349457025527954, + "learning_rate": 0.0002179687736900383, + "loss": 1.7043, + "step": 801 + }, + { + "epoch": 0.77, + "grad_norm": 1.450710654258728, + "learning_rate": 0.00021788145481174183, + "loss": 1.2796, + "step": 802 + }, + { + "epoch": 0.78, + "grad_norm": 1.2185072898864746, + "learning_rate": 0.00021779403461860805, + "loss": 1.1272, + "step": 803 + }, + { + "epoch": 0.78, + "grad_norm": 1.6366219520568848, + "learning_rate": 0.00021770651320599473, + "loss": 1.5251, + "step": 804 + }, + { + "epoch": 0.78, + "grad_norm": 1.445094347000122, + "learning_rate": 0.0002176188906693699, + "loss": 1.3265, + "step": 805 + }, + { + "epoch": 0.78, + "grad_norm": 1.5272247791290283, + "learning_rate": 0.00021753116710431205, + "loss": 0.9833, + "step": 806 + }, + { + "epoch": 0.78, + "grad_norm": 1.4574639797210693, + "learning_rate": 0.00021744334260650984, + "loss": 1.6412, + "step": 807 + }, + { + "epoch": 0.78, + "grad_norm": 1.1877381801605225, + "learning_rate": 0.0002173554172717619, + "loss": 1.3086, + "step": 808 + }, + { + "epoch": 0.78, + "grad_norm": 1.4031157493591309, + "learning_rate": 0.000217267391195977, + "loss": 1.587, + "step": 809 + }, + { + "epoch": 0.78, + "grad_norm": 1.2422850131988525, + "learning_rate": 0.0002171792644751737, + "loss": 1.334, + "step": 810 + }, + { + "epoch": 0.78, + "grad_norm": 1.3261579275131226, + "learning_rate": 0.00021709103720548052, + "loss": 1.2248, + "step": 811 + }, + { + "epoch": 0.78, + "grad_norm": 1.0835561752319336, + "learning_rate": 0.00021700270948313532, + "loss": 0.9224, + "step": 812 + }, + { + "epoch": 0.78, + "grad_norm": 1.3430688381195068, + "learning_rate": 0.00021691428140448594, + "loss": 1.2989, + "step": 813 + }, + { + "epoch": 0.79, + "grad_norm": 1.2712783813476562, + "learning_rate": 0.00021682575306598934, + "loss": 1.4027, + "step": 814 + }, + { + "epoch": 0.79, + "grad_norm": 1.2724041938781738, + "learning_rate": 0.00021673712456421202, + "loss": 1.2837, + "step": 815 + }, + { + "epoch": 0.79, + "grad_norm": 1.2153583765029907, + "learning_rate": 0.00021664839599582974, + "loss": 1.3666, + "step": 816 + }, + { + "epoch": 0.79, + "grad_norm": 1.2074302434921265, + "learning_rate": 0.00021655956745762742, + "loss": 1.4386, + "step": 817 + }, + { + "epoch": 0.79, + "grad_norm": 1.372488021850586, + "learning_rate": 0.0002164706390464989, + "loss": 1.626, + "step": 818 + }, + { + "epoch": 0.79, + "grad_norm": 1.1615793704986572, + "learning_rate": 0.00021638161085944706, + "loss": 1.1384, + "step": 819 + }, + { + "epoch": 0.79, + "grad_norm": 1.1077085733413696, + "learning_rate": 0.00021629248299358367, + "loss": 0.9159, + "step": 820 + }, + { + "epoch": 0.79, + "grad_norm": 1.2498594522476196, + "learning_rate": 0.00021620325554612912, + "loss": 1.2139, + "step": 821 + }, + { + "epoch": 0.79, + "grad_norm": 1.6090998649597168, + "learning_rate": 0.00021611392861441243, + "loss": 1.9134, + "step": 822 + }, + { + "epoch": 0.79, + "grad_norm": 1.2484623193740845, + "learning_rate": 0.00021602450229587123, + "loss": 1.2969, + "step": 823 + }, + { + "epoch": 0.8, + "grad_norm": 1.4213321208953857, + "learning_rate": 0.0002159349766880515, + "loss": 1.717, + "step": 824 + }, + { + "epoch": 0.8, + "grad_norm": 1.0547410249710083, + "learning_rate": 0.0002158453518886075, + "loss": 0.9849, + "step": 825 + }, + { + "epoch": 0.8, + "grad_norm": 1.2111304998397827, + "learning_rate": 0.00021575562799530169, + "loss": 1.2678, + "step": 826 + }, + { + "epoch": 0.8, + "grad_norm": 1.66452956199646, + "learning_rate": 0.0002156658051060047, + "loss": 1.4332, + "step": 827 + }, + { + "epoch": 0.8, + "grad_norm": 1.3838409185409546, + "learning_rate": 0.0002155758833186951, + "loss": 1.1844, + "step": 828 + }, + { + "epoch": 0.8, + "grad_norm": 1.5047062635421753, + "learning_rate": 0.0002154858627314593, + "loss": 1.7889, + "step": 829 + }, + { + "epoch": 0.8, + "grad_norm": 1.286152720451355, + "learning_rate": 0.00021539574344249152, + "loss": 1.1978, + "step": 830 + }, + { + "epoch": 0.8, + "grad_norm": 1.1742347478866577, + "learning_rate": 0.00021530552555009365, + "loss": 1.0202, + "step": 831 + }, + { + "epoch": 0.8, + "grad_norm": 1.7456132173538208, + "learning_rate": 0.0002152152091526751, + "loss": 1.0362, + "step": 832 + }, + { + "epoch": 0.8, + "grad_norm": 1.381914496421814, + "learning_rate": 0.00021512479434875276, + "loss": 1.4382, + "step": 833 + }, + { + "epoch": 0.81, + "grad_norm": 1.2699590921401978, + "learning_rate": 0.00021503428123695082, + "loss": 1.512, + "step": 834 + }, + { + "epoch": 0.81, + "grad_norm": 1.1738026142120361, + "learning_rate": 0.00021494366991600076, + "loss": 1.2607, + "step": 835 + }, + { + "epoch": 0.81, + "grad_norm": 1.9550883769989014, + "learning_rate": 0.0002148529604847412, + "loss": 1.8124, + "step": 836 + }, + { + "epoch": 0.81, + "grad_norm": 1.504042148590088, + "learning_rate": 0.00021476215304211765, + "loss": 1.4629, + "step": 837 + }, + { + "epoch": 0.81, + "grad_norm": 1.0610342025756836, + "learning_rate": 0.00021467124768718269, + "loss": 0.9717, + "step": 838 + }, + { + "epoch": 0.81, + "grad_norm": 1.478193759918213, + "learning_rate": 0.00021458024451909564, + "loss": 1.802, + "step": 839 + }, + { + "epoch": 0.81, + "grad_norm": 1.218469262123108, + "learning_rate": 0.00021448914363712246, + "loss": 1.3854, + "step": 840 + }, + { + "epoch": 0.81, + "grad_norm": 1.2758036851882935, + "learning_rate": 0.00021439794514063577, + "loss": 1.2681, + "step": 841 + }, + { + "epoch": 0.81, + "grad_norm": 1.3164936304092407, + "learning_rate": 0.00021430664912911463, + "loss": 1.4467, + "step": 842 + }, + { + "epoch": 0.81, + "grad_norm": 1.414792537689209, + "learning_rate": 0.0002142152557021445, + "loss": 1.5323, + "step": 843 + }, + { + "epoch": 0.81, + "grad_norm": 1.1610770225524902, + "learning_rate": 0.00021412376495941705, + "loss": 1.4057, + "step": 844 + }, + { + "epoch": 0.82, + "grad_norm": 1.2606096267700195, + "learning_rate": 0.00021403217700073017, + "loss": 1.19, + "step": 845 + }, + { + "epoch": 0.82, + "grad_norm": 1.1834993362426758, + "learning_rate": 0.0002139404919259877, + "loss": 1.5106, + "step": 846 + }, + { + "epoch": 0.82, + "grad_norm": 1.2146964073181152, + "learning_rate": 0.00021384870983519956, + "loss": 1.4995, + "step": 847 + }, + { + "epoch": 0.82, + "grad_norm": 1.208569049835205, + "learning_rate": 0.0002137568308284813, + "loss": 1.0893, + "step": 848 + }, + { + "epoch": 0.82, + "grad_norm": 1.429599642753601, + "learning_rate": 0.00021366485500605437, + "loss": 1.1302, + "step": 849 + }, + { + "epoch": 0.82, + "grad_norm": 1.868918538093567, + "learning_rate": 0.0002135727824682457, + "loss": 2.0047, + "step": 850 + }, + { + "epoch": 0.82, + "grad_norm": 1.1158243417739868, + "learning_rate": 0.00021348061331548777, + "loss": 1.1251, + "step": 851 + }, + { + "epoch": 0.82, + "grad_norm": 1.1816633939743042, + "learning_rate": 0.00021338834764831843, + "loss": 1.0859, + "step": 852 + }, + { + "epoch": 0.82, + "grad_norm": 1.3078497648239136, + "learning_rate": 0.0002132959855673809, + "loss": 1.3654, + "step": 853 + }, + { + "epoch": 0.82, + "grad_norm": 1.5226352214813232, + "learning_rate": 0.00021320352717342335, + "loss": 1.3744, + "step": 854 + }, + { + "epoch": 0.83, + "grad_norm": 1.194767713546753, + "learning_rate": 0.0002131109725672992, + "loss": 1.1332, + "step": 855 + }, + { + "epoch": 0.83, + "grad_norm": 1.292400598526001, + "learning_rate": 0.00021301832184996686, + "loss": 1.1676, + "step": 856 + }, + { + "epoch": 0.83, + "grad_norm": 1.307769536972046, + "learning_rate": 0.00021292557512248933, + "loss": 1.3764, + "step": 857 + }, + { + "epoch": 0.83, + "grad_norm": 1.5784074068069458, + "learning_rate": 0.00021283273248603457, + "loss": 1.8682, + "step": 858 + }, + { + "epoch": 0.83, + "grad_norm": 1.4524656534194946, + "learning_rate": 0.00021273979404187507, + "loss": 1.355, + "step": 859 + }, + { + "epoch": 0.83, + "grad_norm": 1.2774903774261475, + "learning_rate": 0.0002126467598913878, + "loss": 1.1275, + "step": 860 + }, + { + "epoch": 0.83, + "grad_norm": 1.364617943763733, + "learning_rate": 0.00021255363013605419, + "loss": 1.7862, + "step": 861 + }, + { + "epoch": 0.83, + "grad_norm": 1.259527325630188, + "learning_rate": 0.0002124604048774599, + "loss": 1.3798, + "step": 862 + }, + { + "epoch": 0.83, + "grad_norm": 1.6640812158584595, + "learning_rate": 0.0002123670842172948, + "loss": 1.8239, + "step": 863 + }, + { + "epoch": 0.83, + "grad_norm": 1.2190797328948975, + "learning_rate": 0.0002122736682573528, + "loss": 1.0124, + "step": 864 + }, + { + "epoch": 0.83, + "grad_norm": 1.341090202331543, + "learning_rate": 0.00021218015709953175, + "loss": 0.985, + "step": 865 + }, + { + "epoch": 0.84, + "grad_norm": 1.3625142574310303, + "learning_rate": 0.00021208655084583344, + "loss": 1.0811, + "step": 866 + }, + { + "epoch": 0.84, + "grad_norm": 1.861794352531433, + "learning_rate": 0.0002119928495983633, + "loss": 1.5199, + "step": 867 + }, + { + "epoch": 0.84, + "grad_norm": 1.5730104446411133, + "learning_rate": 0.00021189905345933032, + "loss": 1.4189, + "step": 868 + }, + { + "epoch": 0.84, + "grad_norm": 1.224785566329956, + "learning_rate": 0.0002118051625310471, + "loss": 1.2948, + "step": 869 + }, + { + "epoch": 0.84, + "grad_norm": 1.4107848405838013, + "learning_rate": 0.00021171117691592964, + "loss": 1.3929, + "step": 870 + }, + { + "epoch": 0.84, + "grad_norm": 1.16511070728302, + "learning_rate": 0.00021161709671649721, + "loss": 1.0999, + "step": 871 + }, + { + "epoch": 0.84, + "grad_norm": 2.30039119720459, + "learning_rate": 0.00021152292203537216, + "loss": 1.6013, + "step": 872 + }, + { + "epoch": 0.84, + "grad_norm": 1.5170789957046509, + "learning_rate": 0.00021142865297528004, + "loss": 1.8839, + "step": 873 + }, + { + "epoch": 0.84, + "grad_norm": 1.2411909103393555, + "learning_rate": 0.00021133428963904927, + "loss": 1.4265, + "step": 874 + }, + { + "epoch": 0.84, + "grad_norm": 1.0655996799468994, + "learning_rate": 0.00021123983212961112, + "loss": 1.2643, + "step": 875 + }, + { + "epoch": 0.85, + "grad_norm": 1.1014069318771362, + "learning_rate": 0.00021114528054999953, + "loss": 1.0326, + "step": 876 + }, + { + "epoch": 0.85, + "grad_norm": 1.3042875528335571, + "learning_rate": 0.0002110506350033512, + "loss": 1.5919, + "step": 877 + }, + { + "epoch": 0.85, + "grad_norm": 0.9417317509651184, + "learning_rate": 0.00021095589559290518, + "loss": 0.8426, + "step": 878 + }, + { + "epoch": 0.85, + "grad_norm": 1.341861605644226, + "learning_rate": 0.00021086106242200296, + "loss": 1.2377, + "step": 879 + }, + { + "epoch": 0.85, + "grad_norm": 1.2200474739074707, + "learning_rate": 0.0002107661355940883, + "loss": 1.4793, + "step": 880 + }, + { + "epoch": 0.85, + "grad_norm": 1.4158042669296265, + "learning_rate": 0.00021067111521270713, + "loss": 1.752, + "step": 881 + }, + { + "epoch": 0.85, + "grad_norm": 1.4061779975891113, + "learning_rate": 0.0002105760013815074, + "loss": 1.089, + "step": 882 + }, + { + "epoch": 0.85, + "grad_norm": 1.3693805932998657, + "learning_rate": 0.00021048079420423908, + "loss": 1.4861, + "step": 883 + }, + { + "epoch": 0.85, + "grad_norm": 1.4483033418655396, + "learning_rate": 0.00021038549378475386, + "loss": 1.4553, + "step": 884 + }, + { + "epoch": 0.85, + "grad_norm": 1.283494234085083, + "learning_rate": 0.00021029010022700518, + "loss": 1.2053, + "step": 885 + }, + { + "epoch": 0.86, + "grad_norm": 1.5050575733184814, + "learning_rate": 0.00021019461363504805, + "loss": 1.2142, + "step": 886 + }, + { + "epoch": 0.86, + "grad_norm": 1.2599867582321167, + "learning_rate": 0.00021009903411303902, + "loss": 1.3242, + "step": 887 + }, + { + "epoch": 0.86, + "grad_norm": 1.2808127403259277, + "learning_rate": 0.00021000336176523593, + "loss": 1.1527, + "step": 888 + }, + { + "epoch": 0.86, + "grad_norm": 1.145213007926941, + "learning_rate": 0.00020990759669599799, + "loss": 1.1483, + "step": 889 + }, + { + "epoch": 0.86, + "grad_norm": 1.1295148134231567, + "learning_rate": 0.0002098117390097854, + "loss": 1.0159, + "step": 890 + }, + { + "epoch": 0.86, + "grad_norm": 1.159382939338684, + "learning_rate": 0.00020971578881115948, + "loss": 1.1626, + "step": 891 + }, + { + "epoch": 0.86, + "grad_norm": 1.6490403413772583, + "learning_rate": 0.00020961974620478248, + "loss": 1.7539, + "step": 892 + }, + { + "epoch": 0.86, + "grad_norm": 1.517959475517273, + "learning_rate": 0.0002095236112954174, + "loss": 1.6324, + "step": 893 + }, + { + "epoch": 0.86, + "grad_norm": 1.4953640699386597, + "learning_rate": 0.0002094273841879279, + "loss": 1.4384, + "step": 894 + }, + { + "epoch": 0.86, + "grad_norm": 1.3297251462936401, + "learning_rate": 0.00020933106498727825, + "loss": 1.1462, + "step": 895 + }, + { + "epoch": 0.86, + "grad_norm": 1.2726671695709229, + "learning_rate": 0.00020923465379853323, + "loss": 1.0989, + "step": 896 + }, + { + "epoch": 0.87, + "grad_norm": 1.1799503564834595, + "learning_rate": 0.00020913815072685785, + "loss": 0.9603, + "step": 897 + }, + { + "epoch": 0.87, + "grad_norm": 1.2199748754501343, + "learning_rate": 0.00020904155587751742, + "loss": 1.149, + "step": 898 + }, + { + "epoch": 0.87, + "grad_norm": 1.8773846626281738, + "learning_rate": 0.00020894486935587732, + "loss": 0.8965, + "step": 899 + }, + { + "epoch": 0.87, + "grad_norm": 1.669113039970398, + "learning_rate": 0.00020884809126740295, + "loss": 1.7323, + "step": 900 + }, + { + "epoch": 0.87, + "grad_norm": 1.3780359029769897, + "learning_rate": 0.00020875122171765958, + "loss": 1.47, + "step": 901 + }, + { + "epoch": 0.87, + "grad_norm": 1.1259912252426147, + "learning_rate": 0.00020865426081231227, + "loss": 1.3505, + "step": 902 + }, + { + "epoch": 0.87, + "grad_norm": 1.4640130996704102, + "learning_rate": 0.00020855720865712568, + "loss": 1.6051, + "step": 903 + }, + { + "epoch": 0.87, + "grad_norm": 1.0640982389450073, + "learning_rate": 0.00020846006535796407, + "loss": 1.109, + "step": 904 + }, + { + "epoch": 0.87, + "grad_norm": 1.1131609678268433, + "learning_rate": 0.00020836283102079108, + "loss": 1.1407, + "step": 905 + }, + { + "epoch": 0.87, + "grad_norm": 1.1612396240234375, + "learning_rate": 0.00020826550575166964, + "loss": 1.2, + "step": 906 + }, + { + "epoch": 0.88, + "grad_norm": 1.6626241207122803, + "learning_rate": 0.00020816808965676186, + "loss": 1.8475, + "step": 907 + }, + { + "epoch": 0.88, + "grad_norm": 1.438838005065918, + "learning_rate": 0.000208070582842329, + "loss": 1.4186, + "step": 908 + }, + { + "epoch": 0.88, + "grad_norm": 1.4505665302276611, + "learning_rate": 0.00020797298541473127, + "loss": 1.6378, + "step": 909 + }, + { + "epoch": 0.88, + "grad_norm": 1.3546316623687744, + "learning_rate": 0.00020787529748042758, + "loss": 1.3801, + "step": 910 + }, + { + "epoch": 0.88, + "grad_norm": 1.5958383083343506, + "learning_rate": 0.0002077775191459757, + "loss": 1.7361, + "step": 911 + }, + { + "epoch": 0.88, + "grad_norm": 1.6373250484466553, + "learning_rate": 0.000207679650518032, + "loss": 1.8638, + "step": 912 + }, + { + "epoch": 0.88, + "grad_norm": 1.126513957977295, + "learning_rate": 0.00020758169170335123, + "loss": 1.2306, + "step": 913 + }, + { + "epoch": 0.88, + "grad_norm": 1.2469696998596191, + "learning_rate": 0.0002074836428087867, + "loss": 1.4286, + "step": 914 + }, + { + "epoch": 0.88, + "grad_norm": 1.3433685302734375, + "learning_rate": 0.00020738550394128983, + "loss": 1.1645, + "step": 915 + }, + { + "epoch": 0.88, + "grad_norm": 1.274887204170227, + "learning_rate": 0.0002072872752079102, + "loss": 1.4313, + "step": 916 + }, + { + "epoch": 0.89, + "grad_norm": 1.2697557210922241, + "learning_rate": 0.0002071889567157955, + "loss": 1.1121, + "step": 917 + }, + { + "epoch": 0.89, + "grad_norm": 1.6256011724472046, + "learning_rate": 0.00020709054857219122, + "loss": 2.3053, + "step": 918 + }, + { + "epoch": 0.89, + "grad_norm": 1.4982554912567139, + "learning_rate": 0.00020699205088444073, + "loss": 1.5737, + "step": 919 + }, + { + "epoch": 0.89, + "grad_norm": 1.3380775451660156, + "learning_rate": 0.00020689346375998504, + "loss": 1.3681, + "step": 920 + }, + { + "epoch": 0.89, + "grad_norm": 1.1807115077972412, + "learning_rate": 0.0002067947873063627, + "loss": 1.121, + "step": 921 + }, + { + "epoch": 0.89, + "grad_norm": 1.6416897773742676, + "learning_rate": 0.0002066960216312097, + "loss": 1.585, + "step": 922 + }, + { + "epoch": 0.89, + "grad_norm": 1.310311198234558, + "learning_rate": 0.00020659716684225942, + "loss": 1.0295, + "step": 923 + }, + { + "epoch": 0.89, + "grad_norm": 1.283509373664856, + "learning_rate": 0.00020649822304734234, + "loss": 0.8625, + "step": 924 + }, + { + "epoch": 0.89, + "grad_norm": 1.6311004161834717, + "learning_rate": 0.00020639919035438616, + "loss": 1.5964, + "step": 925 + }, + { + "epoch": 0.89, + "grad_norm": 1.4839829206466675, + "learning_rate": 0.00020630006887141533, + "loss": 1.3802, + "step": 926 + }, + { + "epoch": 0.89, + "grad_norm": 1.6016814708709717, + "learning_rate": 0.00020620085870655142, + "loss": 1.3962, + "step": 927 + }, + { + "epoch": 0.9, + "grad_norm": 1.3555843830108643, + "learning_rate": 0.00020610155996801256, + "loss": 1.3931, + "step": 928 + }, + { + "epoch": 0.9, + "grad_norm": 1.5287679433822632, + "learning_rate": 0.00020600217276411353, + "loss": 1.0829, + "step": 929 + }, + { + "epoch": 0.9, + "grad_norm": 1.1010750532150269, + "learning_rate": 0.00020590269720326568, + "loss": 0.9906, + "step": 930 + }, + { + "epoch": 0.9, + "grad_norm": 1.2286770343780518, + "learning_rate": 0.00020580313339397662, + "loss": 1.099, + "step": 931 + }, + { + "epoch": 0.9, + "grad_norm": 1.4633378982543945, + "learning_rate": 0.0002057034814448503, + "loss": 1.2768, + "step": 932 + }, + { + "epoch": 0.9, + "grad_norm": 2.0793206691741943, + "learning_rate": 0.0002056037414645868, + "loss": 1.5472, + "step": 933 + }, + { + "epoch": 0.9, + "grad_norm": 1.041421890258789, + "learning_rate": 0.00020550391356198222, + "loss": 1.0693, + "step": 934 + }, + { + "epoch": 0.9, + "grad_norm": 1.7149021625518799, + "learning_rate": 0.00020540399784592852, + "loss": 1.6992, + "step": 935 + }, + { + "epoch": 0.9, + "grad_norm": 1.1274030208587646, + "learning_rate": 0.00020530399442541356, + "loss": 1.1349, + "step": 936 + }, + { + "epoch": 0.9, + "grad_norm": 1.6209064722061157, + "learning_rate": 0.00020520390340952074, + "loss": 1.6395, + "step": 937 + }, + { + "epoch": 0.91, + "grad_norm": 1.0727425813674927, + "learning_rate": 0.00020510372490742913, + "loss": 1.0555, + "step": 938 + }, + { + "epoch": 0.91, + "grad_norm": 1.1689037084579468, + "learning_rate": 0.00020500345902841311, + "loss": 1.4163, + "step": 939 + }, + { + "epoch": 0.91, + "grad_norm": 1.1197943687438965, + "learning_rate": 0.00020490310588184242, + "loss": 1.669, + "step": 940 + }, + { + "epoch": 0.91, + "grad_norm": 1.0973910093307495, + "learning_rate": 0.00020480266557718201, + "loss": 1.2043, + "step": 941 + }, + { + "epoch": 0.91, + "grad_norm": 1.459172010421753, + "learning_rate": 0.00020470213822399198, + "loss": 1.6599, + "step": 942 + }, + { + "epoch": 0.91, + "grad_norm": 1.4092894792556763, + "learning_rate": 0.00020460152393192717, + "loss": 1.5244, + "step": 943 + }, + { + "epoch": 0.91, + "grad_norm": 1.0109457969665527, + "learning_rate": 0.0002045008228107374, + "loss": 1.0698, + "step": 944 + }, + { + "epoch": 0.91, + "grad_norm": 1.1727893352508545, + "learning_rate": 0.00020440003497026722, + "loss": 0.8874, + "step": 945 + }, + { + "epoch": 0.91, + "grad_norm": 1.4926542043685913, + "learning_rate": 0.0002042991605204557, + "loss": 1.1908, + "step": 946 + }, + { + "epoch": 0.91, + "grad_norm": 1.2154324054718018, + "learning_rate": 0.00020419819957133642, + "loss": 1.0055, + "step": 947 + }, + { + "epoch": 0.92, + "grad_norm": 1.1640907526016235, + "learning_rate": 0.00020409715223303733, + "loss": 1.0244, + "step": 948 + }, + { + "epoch": 0.92, + "grad_norm": 1.094460129737854, + "learning_rate": 0.00020399601861578054, + "loss": 1.086, + "step": 949 + }, + { + "epoch": 0.92, + "grad_norm": 1.0807240009307861, + "learning_rate": 0.0002038947988298824, + "loss": 1.0952, + "step": 950 + }, + { + "epoch": 0.92, + "grad_norm": 1.2182865142822266, + "learning_rate": 0.00020379349298575312, + "loss": 0.8286, + "step": 951 + }, + { + "epoch": 0.92, + "grad_norm": 1.2438262701034546, + "learning_rate": 0.0002036921011938968, + "loss": 1.449, + "step": 952 + }, + { + "epoch": 0.92, + "grad_norm": 1.0871261358261108, + "learning_rate": 0.00020359062356491142, + "loss": 1.005, + "step": 953 + }, + { + "epoch": 0.92, + "grad_norm": 1.3494447469711304, + "learning_rate": 0.0002034890602094885, + "loss": 1.599, + "step": 954 + }, + { + "epoch": 0.92, + "grad_norm": 1.3108654022216797, + "learning_rate": 0.00020338741123841297, + "loss": 1.3039, + "step": 955 + }, + { + "epoch": 0.92, + "grad_norm": 1.4159644842147827, + "learning_rate": 0.00020328567676256335, + "loss": 1.3015, + "step": 956 + }, + { + "epoch": 0.92, + "grad_norm": 1.267722249031067, + "learning_rate": 0.00020318385689291126, + "loss": 1.1788, + "step": 957 + }, + { + "epoch": 0.92, + "grad_norm": 1.2124974727630615, + "learning_rate": 0.00020308195174052157, + "loss": 1.3477, + "step": 958 + }, + { + "epoch": 0.93, + "grad_norm": 1.7106714248657227, + "learning_rate": 0.00020297996141655216, + "loss": 1.7461, + "step": 959 + }, + { + "epoch": 0.93, + "grad_norm": 1.3902013301849365, + "learning_rate": 0.00020287788603225377, + "loss": 1.3323, + "step": 960 + }, + { + "epoch": 0.93, + "grad_norm": 0.9589385390281677, + "learning_rate": 0.00020277572569896995, + "loss": 0.7713, + "step": 961 + }, + { + "epoch": 0.93, + "grad_norm": 1.223151683807373, + "learning_rate": 0.00020267348052813692, + "loss": 1.2357, + "step": 962 + }, + { + "epoch": 0.93, + "grad_norm": 0.9997661709785461, + "learning_rate": 0.0002025711506312834, + "loss": 0.9083, + "step": 963 + }, + { + "epoch": 0.93, + "grad_norm": 1.2039114236831665, + "learning_rate": 0.0002024687361200306, + "loss": 1.0277, + "step": 964 + }, + { + "epoch": 0.93, + "grad_norm": 1.2620604038238525, + "learning_rate": 0.00020236623710609202, + "loss": 0.9147, + "step": 965 + }, + { + "epoch": 0.93, + "grad_norm": 1.603683352470398, + "learning_rate": 0.00020226365370127323, + "loss": 1.3692, + "step": 966 + }, + { + "epoch": 0.93, + "grad_norm": 1.1889077425003052, + "learning_rate": 0.00020216098601747197, + "loss": 1.3586, + "step": 967 + }, + { + "epoch": 0.93, + "grad_norm": 0.9763696193695068, + "learning_rate": 0.00020205823416667788, + "loss": 1.0737, + "step": 968 + }, + { + "epoch": 0.94, + "grad_norm": 1.9170948266983032, + "learning_rate": 0.00020195539826097238, + "loss": 1.7105, + "step": 969 + }, + { + "epoch": 0.94, + "grad_norm": 1.3122986555099487, + "learning_rate": 0.00020185247841252856, + "loss": 1.6161, + "step": 970 + }, + { + "epoch": 0.94, + "grad_norm": 1.4224249124526978, + "learning_rate": 0.00020174947473361118, + "loss": 1.5831, + "step": 971 + }, + { + "epoch": 0.94, + "grad_norm": 1.293426752090454, + "learning_rate": 0.00020164638733657637, + "loss": 1.3102, + "step": 972 + }, + { + "epoch": 0.94, + "grad_norm": 1.07170569896698, + "learning_rate": 0.0002015432163338715, + "loss": 0.8819, + "step": 973 + }, + { + "epoch": 0.94, + "grad_norm": 1.181535005569458, + "learning_rate": 0.00020143996183803525, + "loss": 1.1775, + "step": 974 + }, + { + "epoch": 0.94, + "grad_norm": 1.3837966918945312, + "learning_rate": 0.00020133662396169743, + "loss": 1.3791, + "step": 975 + }, + { + "epoch": 0.94, + "grad_norm": 1.1325147151947021, + "learning_rate": 0.00020123320281757856, + "loss": 0.9244, + "step": 976 + }, + { + "epoch": 0.94, + "grad_norm": 1.8120399713516235, + "learning_rate": 0.0002011296985184903, + "loss": 1.3342, + "step": 977 + }, + { + "epoch": 0.94, + "grad_norm": 1.260125994682312, + "learning_rate": 0.0002010261111773347, + "loss": 1.3665, + "step": 978 + }, + { + "epoch": 0.94, + "grad_norm": 1.342794418334961, + "learning_rate": 0.00020092244090710472, + "loss": 1.2215, + "step": 979 + }, + { + "epoch": 0.95, + "grad_norm": 1.5904815196990967, + "learning_rate": 0.0002008186878208834, + "loss": 1.071, + "step": 980 + }, + { + "epoch": 0.95, + "grad_norm": 1.2493515014648438, + "learning_rate": 0.00020071485203184455, + "loss": 1.5185, + "step": 981 + }, + { + "epoch": 0.95, + "grad_norm": 1.2379603385925293, + "learning_rate": 0.00020061093365325184, + "loss": 1.239, + "step": 982 + }, + { + "epoch": 0.95, + "grad_norm": 1.4626051187515259, + "learning_rate": 0.00020050693279845914, + "loss": 1.5534, + "step": 983 + }, + { + "epoch": 0.95, + "grad_norm": 1.3357350826263428, + "learning_rate": 0.0002004028495809103, + "loss": 1.8324, + "step": 984 + }, + { + "epoch": 0.95, + "grad_norm": 1.4206643104553223, + "learning_rate": 0.0002002986841141391, + "loss": 1.251, + "step": 985 + }, + { + "epoch": 0.95, + "grad_norm": 1.436003565788269, + "learning_rate": 0.00020019443651176884, + "loss": 1.7747, + "step": 986 + }, + { + "epoch": 0.95, + "grad_norm": 1.350991129875183, + "learning_rate": 0.0002000901068875126, + "loss": 1.5258, + "step": 987 + }, + { + "epoch": 0.95, + "grad_norm": 1.1044528484344482, + "learning_rate": 0.00019998569535517276, + "loss": 1.269, + "step": 988 + }, + { + "epoch": 0.95, + "grad_norm": 1.4368743896484375, + "learning_rate": 0.00019988120202864125, + "loss": 1.7985, + "step": 989 + }, + { + "epoch": 0.96, + "grad_norm": 1.080803394317627, + "learning_rate": 0.00019977662702189904, + "loss": 0.8469, + "step": 990 + }, + { + "epoch": 0.96, + "grad_norm": 1.4379607439041138, + "learning_rate": 0.00019967197044901618, + "loss": 1.48, + "step": 991 + }, + { + "epoch": 0.96, + "grad_norm": 1.1783312559127808, + "learning_rate": 0.00019956723242415194, + "loss": 1.4203, + "step": 992 + }, + { + "epoch": 0.96, + "grad_norm": 1.2120602130889893, + "learning_rate": 0.0001994624130615542, + "loss": 1.2925, + "step": 993 + }, + { + "epoch": 0.96, + "grad_norm": 1.0752936601638794, + "learning_rate": 0.00019935751247555954, + "loss": 1.4643, + "step": 994 + }, + { + "epoch": 0.96, + "grad_norm": 1.1408238410949707, + "learning_rate": 0.0001992525307805934, + "loss": 1.3049, + "step": 995 + }, + { + "epoch": 0.96, + "grad_norm": 1.1272218227386475, + "learning_rate": 0.0001991474680911694, + "loss": 1.0908, + "step": 996 + }, + { + "epoch": 0.96, + "grad_norm": 1.525564432144165, + "learning_rate": 0.0001990423245218897, + "loss": 1.2493, + "step": 997 + }, + { + "epoch": 0.96, + "grad_norm": 1.3659435510635376, + "learning_rate": 0.0001989371001874446, + "loss": 1.402, + "step": 998 + }, + { + "epoch": 0.96, + "grad_norm": 1.344510793685913, + "learning_rate": 0.00019883179520261245, + "loss": 1.519, + "step": 999 + }, + { + "epoch": 0.97, + "grad_norm": 1.455416202545166, + "learning_rate": 0.0001987264096822598, + "loss": 1.6428, + "step": 1000 + }, + { + "epoch": 0.97, + "grad_norm": 1.206539273262024, + "learning_rate": 0.00019862094374134074, + "loss": 1.2198, + "step": 1001 + }, + { + "epoch": 0.97, + "grad_norm": 1.2967417240142822, + "learning_rate": 0.0001985153974948973, + "loss": 1.4467, + "step": 1002 + }, + { + "epoch": 0.97, + "grad_norm": 1.3688677549362183, + "learning_rate": 0.00019840977105805896, + "loss": 1.5175, + "step": 1003 + }, + { + "epoch": 0.97, + "grad_norm": 1.4224023818969727, + "learning_rate": 0.00019830406454604289, + "loss": 1.3388, + "step": 1004 + }, + { + "epoch": 0.97, + "grad_norm": 1.1016898155212402, + "learning_rate": 0.00019819827807415332, + "loss": 1.256, + "step": 1005 + }, + { + "epoch": 0.97, + "grad_norm": 0.9830572605133057, + "learning_rate": 0.00019809241175778195, + "loss": 1.0293, + "step": 1006 + }, + { + "epoch": 0.97, + "grad_norm": 1.3320682048797607, + "learning_rate": 0.00019798646571240742, + "loss": 1.6151, + "step": 1007 + }, + { + "epoch": 0.97, + "grad_norm": 1.203979730606079, + "learning_rate": 0.00019788044005359534, + "loss": 1.0101, + "step": 1008 + }, + { + "epoch": 0.97, + "grad_norm": 1.121233344078064, + "learning_rate": 0.0001977743348969983, + "loss": 0.9686, + "step": 1009 + }, + { + "epoch": 0.97, + "grad_norm": 1.2444918155670166, + "learning_rate": 0.0001976681503583555, + "loss": 1.5227, + "step": 1010 + }, + { + "epoch": 0.98, + "grad_norm": 1.322102665901184, + "learning_rate": 0.0001975618865534927, + "loss": 1.6012, + "step": 1011 + }, + { + "epoch": 0.98, + "grad_norm": 1.2791893482208252, + "learning_rate": 0.0001974555435983222, + "loss": 1.3951, + "step": 1012 + }, + { + "epoch": 0.98, + "grad_norm": 1.2599291801452637, + "learning_rate": 0.00019734912160884255, + "loss": 1.5789, + "step": 1013 + }, + { + "epoch": 0.98, + "grad_norm": 1.2614623308181763, + "learning_rate": 0.0001972426207011387, + "loss": 1.2646, + "step": 1014 + }, + { + "epoch": 0.98, + "grad_norm": 0.9194993376731873, + "learning_rate": 0.0001971360409913814, + "loss": 1.1464, + "step": 1015 + }, + { + "epoch": 0.98, + "grad_norm": 1.739387035369873, + "learning_rate": 0.00019702938259582754, + "loss": 1.6735, + "step": 1016 + }, + { + "epoch": 0.98, + "grad_norm": 1.1382012367248535, + "learning_rate": 0.00019692264563081984, + "loss": 1.2768, + "step": 1017 + }, + { + "epoch": 0.98, + "grad_norm": 0.9404625296592712, + "learning_rate": 0.00019681583021278673, + "loss": 1.1835, + "step": 1018 + }, + { + "epoch": 0.98, + "grad_norm": 1.2209988832473755, + "learning_rate": 0.00019670893645824212, + "loss": 0.9123, + "step": 1019 + }, + { + "epoch": 0.98, + "grad_norm": 1.1911715269088745, + "learning_rate": 0.00019660196448378542, + "loss": 1.4704, + "step": 1020 + }, + { + "epoch": 0.99, + "grad_norm": 1.1453022956848145, + "learning_rate": 0.00019649491440610137, + "loss": 1.1527, + "step": 1021 + }, + { + "epoch": 0.99, + "grad_norm": 0.9455304145812988, + "learning_rate": 0.0001963877863419599, + "loss": 0.9554, + "step": 1022 + }, + { + "epoch": 0.99, + "grad_norm": 1.5460333824157715, + "learning_rate": 0.00019628058040821607, + "loss": 1.5023, + "step": 1023 + }, + { + "epoch": 0.99, + "grad_norm": 1.0515046119689941, + "learning_rate": 0.00019617329672180975, + "loss": 1.3552, + "step": 1024 + }, + { + "epoch": 0.99, + "grad_norm": 1.4096556901931763, + "learning_rate": 0.00019606593539976574, + "loss": 1.3264, + "step": 1025 + }, + { + "epoch": 0.99, + "grad_norm": 1.2442529201507568, + "learning_rate": 0.00019595849655919343, + "loss": 0.8898, + "step": 1026 + }, + { + "epoch": 0.99, + "grad_norm": 0.8789858818054199, + "learning_rate": 0.00019585098031728684, + "loss": 0.9477, + "step": 1027 + }, + { + "epoch": 0.99, + "grad_norm": 1.2422945499420166, + "learning_rate": 0.0001957433867913244, + "loss": 1.1831, + "step": 1028 + }, + { + "epoch": 0.99, + "grad_norm": 1.2705669403076172, + "learning_rate": 0.00019563571609866883, + "loss": 1.4243, + "step": 1029 + }, + { + "epoch": 0.99, + "grad_norm": 1.4680469036102295, + "learning_rate": 0.000195527968356767, + "loss": 1.2629, + "step": 1030 + }, + { + "epoch": 1.0, + "grad_norm": 1.369444727897644, + "learning_rate": 0.0001954201436831499, + "loss": 1.6357, + "step": 1031 + }, + { + "epoch": 1.0, + "grad_norm": 1.4526840448379517, + "learning_rate": 0.00019531224219543233, + "loss": 1.3395, + "step": 1032 + }, + { + "epoch": 1.0, + "grad_norm": 1.1711682081222534, + "learning_rate": 0.00019520426401131304, + "loss": 1.0378, + "step": 1033 + }, + { + "epoch": 1.0, + "grad_norm": 1.085681676864624, + "learning_rate": 0.0001950962092485742, + "loss": 1.3059, + "step": 1034 + }, + { + "epoch": 1.0, + "grad_norm": 1.2222062349319458, + "learning_rate": 0.00019498807802508173, + "loss": 1.1306, + "step": 1035 + }, + { + "epoch": 1.0, + "grad_norm": 0.879671573638916, + "learning_rate": 0.00019487987045878485, + "loss": 0.7041, + "step": 1036 + }, + { + "epoch": 1.0, + "grad_norm": 1.091941475868225, + "learning_rate": 0.00019477158666771617, + "loss": 0.9351, + "step": 1037 + }, + { + "epoch": 1.0, + "grad_norm": 1.0203659534454346, + "learning_rate": 0.00019466322676999122, + "loss": 1.1873, + "step": 1038 + }, + { + "epoch": 1.0, + "grad_norm": 1.1126773357391357, + "learning_rate": 0.00019455479088380874, + "loss": 0.9875, + "step": 1039 + }, + { + "epoch": 1.0, + "grad_norm": 1.186734676361084, + "learning_rate": 0.00019444627912745027, + "loss": 1.146, + "step": 1040 + }, + { + "epoch": 1.0, + "grad_norm": 1.2858699560165405, + "learning_rate": 0.00019433769161928023, + "loss": 0.932, + "step": 1041 + }, + { + "epoch": 1.01, + "grad_norm": 1.0115717649459839, + "learning_rate": 0.00019422902847774544, + "loss": 0.9152, + "step": 1042 + }, + { + "epoch": 1.01, + "grad_norm": 1.0166630744934082, + "learning_rate": 0.00019412028982137547, + "loss": 0.981, + "step": 1043 + }, + { + "epoch": 1.01, + "grad_norm": 1.0794763565063477, + "learning_rate": 0.00019401147576878207, + "loss": 0.8717, + "step": 1044 + }, + { + "epoch": 1.01, + "grad_norm": 1.314604640007019, + "learning_rate": 0.0001939025864386594, + "loss": 0.8732, + "step": 1045 + }, + { + "epoch": 1.01, + "grad_norm": 1.1519172191619873, + "learning_rate": 0.00019379362194978355, + "loss": 0.9723, + "step": 1046 + }, + { + "epoch": 1.01, + "grad_norm": 1.5442712306976318, + "learning_rate": 0.00019368458242101274, + "loss": 1.3932, + "step": 1047 + }, + { + "epoch": 1.01, + "grad_norm": 1.2915006875991821, + "learning_rate": 0.000193575467971287, + "loss": 0.9708, + "step": 1048 + }, + { + "epoch": 1.01, + "grad_norm": 1.2446788549423218, + "learning_rate": 0.00019346627871962805, + "loss": 0.969, + "step": 1049 + }, + { + "epoch": 1.01, + "grad_norm": 1.3067604303359985, + "learning_rate": 0.00019335701478513922, + "loss": 1.1397, + "step": 1050 + }, + { + "epoch": 1.01, + "grad_norm": 1.340515375137329, + "learning_rate": 0.00019324767628700533, + "loss": 1.2468, + "step": 1051 + }, + { + "epoch": 1.02, + "grad_norm": 1.191666841506958, + "learning_rate": 0.0001931382633444925, + "loss": 0.9814, + "step": 1052 + }, + { + "epoch": 1.02, + "grad_norm": 1.2348589897155762, + "learning_rate": 0.00019302877607694808, + "loss": 0.8009, + "step": 1053 + }, + { + "epoch": 1.02, + "grad_norm": 1.4749141931533813, + "learning_rate": 0.00019291921460380047, + "loss": 0.9804, + "step": 1054 + }, + { + "epoch": 1.02, + "grad_norm": 1.1957335472106934, + "learning_rate": 0.00019280957904455908, + "loss": 0.7987, + "step": 1055 + }, + { + "epoch": 1.02, + "grad_norm": 1.232792854309082, + "learning_rate": 0.00019269986951881397, + "loss": 0.7048, + "step": 1056 + }, + { + "epoch": 1.02, + "grad_norm": 1.184746503829956, + "learning_rate": 0.00019259008614623608, + "loss": 1.0233, + "step": 1057 + }, + { + "epoch": 1.02, + "grad_norm": 1.4626327753067017, + "learning_rate": 0.00019248022904657682, + "loss": 1.2599, + "step": 1058 + }, + { + "epoch": 1.02, + "grad_norm": 1.4971569776535034, + "learning_rate": 0.00019237029833966791, + "loss": 1.1236, + "step": 1059 + }, + { + "epoch": 1.02, + "grad_norm": 1.2860374450683594, + "learning_rate": 0.0001922602941454216, + "loss": 0.9537, + "step": 1060 + }, + { + "epoch": 1.02, + "grad_norm": 1.3197345733642578, + "learning_rate": 0.0001921502165838301, + "loss": 0.6415, + "step": 1061 + }, + { + "epoch": 1.03, + "grad_norm": 1.3860671520233154, + "learning_rate": 0.0001920400657749657, + "loss": 1.11, + "step": 1062 + }, + { + "epoch": 1.03, + "grad_norm": 1.2164676189422607, + "learning_rate": 0.0001919298418389806, + "loss": 0.9324, + "step": 1063 + }, + { + "epoch": 1.03, + "grad_norm": 1.0971100330352783, + "learning_rate": 0.00019181954489610675, + "loss": 0.7691, + "step": 1064 + }, + { + "epoch": 1.03, + "grad_norm": 1.2012615203857422, + "learning_rate": 0.00019170917506665586, + "loss": 1.0179, + "step": 1065 + }, + { + "epoch": 1.03, + "grad_norm": 1.4872874021530151, + "learning_rate": 0.00019159873247101894, + "loss": 1.2214, + "step": 1066 + }, + { + "epoch": 1.03, + "grad_norm": 1.1903997659683228, + "learning_rate": 0.00019148821722966654, + "loss": 0.7667, + "step": 1067 + }, + { + "epoch": 1.03, + "grad_norm": 1.1436363458633423, + "learning_rate": 0.0001913776294631483, + "loss": 0.9839, + "step": 1068 + }, + { + "epoch": 1.03, + "grad_norm": 1.2749754190444946, + "learning_rate": 0.0001912669692920931, + "loss": 0.7828, + "step": 1069 + }, + { + "epoch": 1.03, + "grad_norm": 1.1588584184646606, + "learning_rate": 0.00019115623683720872, + "loss": 0.8214, + "step": 1070 + }, + { + "epoch": 1.03, + "grad_norm": 1.3065457344055176, + "learning_rate": 0.0001910454322192819, + "loss": 0.7492, + "step": 1071 + }, + { + "epoch": 1.03, + "grad_norm": 1.2145508527755737, + "learning_rate": 0.00019093455555917792, + "loss": 1.2346, + "step": 1072 + }, + { + "epoch": 1.04, + "grad_norm": 1.2115508317947388, + "learning_rate": 0.00019082360697784086, + "loss": 1.1687, + "step": 1073 + }, + { + "epoch": 1.04, + "grad_norm": 1.1368943452835083, + "learning_rate": 0.00019071258659629297, + "loss": 0.9316, + "step": 1074 + }, + { + "epoch": 1.04, + "grad_norm": 1.0516806840896606, + "learning_rate": 0.00019060149453563509, + "loss": 0.6583, + "step": 1075 + }, + { + "epoch": 1.04, + "grad_norm": 1.2155439853668213, + "learning_rate": 0.0001904903309170461, + "loss": 1.109, + "step": 1076 + }, + { + "epoch": 1.04, + "grad_norm": 1.0046262741088867, + "learning_rate": 0.000190379095861783, + "loss": 0.5603, + "step": 1077 + }, + { + "epoch": 1.04, + "grad_norm": 1.075215458869934, + "learning_rate": 0.00019026778949118061, + "loss": 0.7349, + "step": 1078 + }, + { + "epoch": 1.04, + "grad_norm": 1.7739472389221191, + "learning_rate": 0.00019015641192665172, + "loss": 1.2746, + "step": 1079 + }, + { + "epoch": 1.04, + "grad_norm": 1.1507693529129028, + "learning_rate": 0.00019004496328968662, + "loss": 0.9482, + "step": 1080 + }, + { + "epoch": 1.04, + "grad_norm": 1.2549113035202026, + "learning_rate": 0.00018993344370185316, + "loss": 1.0353, + "step": 1081 + }, + { + "epoch": 1.04, + "grad_norm": 1.3268182277679443, + "learning_rate": 0.00018982185328479662, + "loss": 1.2067, + "step": 1082 + }, + { + "epoch": 1.05, + "grad_norm": 1.0886125564575195, + "learning_rate": 0.00018971019216023954, + "loss": 0.6659, + "step": 1083 + }, + { + "epoch": 1.05, + "grad_norm": 1.0336356163024902, + "learning_rate": 0.00018959846044998152, + "loss": 0.7435, + "step": 1084 + }, + { + "epoch": 1.05, + "grad_norm": 1.3987981081008911, + "learning_rate": 0.00018948665827589928, + "loss": 1.3236, + "step": 1085 + }, + { + "epoch": 1.05, + "grad_norm": 1.208726167678833, + "learning_rate": 0.00018937478575994628, + "loss": 0.8633, + "step": 1086 + }, + { + "epoch": 1.05, + "grad_norm": 0.9976479411125183, + "learning_rate": 0.0001892628430241527, + "loss": 0.71, + "step": 1087 + }, + { + "epoch": 1.05, + "grad_norm": 1.2253488302230835, + "learning_rate": 0.00018915083019062552, + "loss": 0.8444, + "step": 1088 + }, + { + "epoch": 1.05, + "grad_norm": 1.0337365865707397, + "learning_rate": 0.00018903874738154786, + "loss": 0.8667, + "step": 1089 + }, + { + "epoch": 1.05, + "grad_norm": 0.9623100757598877, + "learning_rate": 0.00018892659471917946, + "loss": 0.631, + "step": 1090 + }, + { + "epoch": 1.05, + "grad_norm": 1.048235297203064, + "learning_rate": 0.00018881437232585614, + "loss": 0.8731, + "step": 1091 + }, + { + "epoch": 1.05, + "grad_norm": 1.1531683206558228, + "learning_rate": 0.00018870208032398968, + "loss": 0.965, + "step": 1092 + }, + { + "epoch": 1.06, + "grad_norm": 1.4350179433822632, + "learning_rate": 0.00018858971883606795, + "loss": 1.3628, + "step": 1093 + }, + { + "epoch": 1.06, + "grad_norm": 1.2467992305755615, + "learning_rate": 0.00018847728798465462, + "loss": 1.0464, + "step": 1094 + }, + { + "epoch": 1.06, + "grad_norm": 1.1029807329177856, + "learning_rate": 0.00018836478789238887, + "loss": 0.5425, + "step": 1095 + }, + { + "epoch": 1.06, + "grad_norm": 1.1991395950317383, + "learning_rate": 0.0001882522186819855, + "loss": 0.7859, + "step": 1096 + }, + { + "epoch": 1.06, + "grad_norm": 1.2336832284927368, + "learning_rate": 0.0001881395804762347, + "loss": 0.9693, + "step": 1097 + }, + { + "epoch": 1.06, + "grad_norm": 1.3640801906585693, + "learning_rate": 0.00018802687339800194, + "loss": 1.1183, + "step": 1098 + }, + { + "epoch": 1.06, + "grad_norm": 0.9197866320610046, + "learning_rate": 0.0001879140975702278, + "loss": 0.5181, + "step": 1099 + }, + { + "epoch": 1.06, + "grad_norm": 1.2878599166870117, + "learning_rate": 0.00018780125311592782, + "loss": 1.1359, + "step": 1100 + }, + { + "epoch": 1.06, + "grad_norm": 1.1678839921951294, + "learning_rate": 0.00018768834015819242, + "loss": 0.8813, + "step": 1101 + }, + { + "epoch": 1.06, + "grad_norm": 1.182902216911316, + "learning_rate": 0.00018757535882018677, + "loss": 0.9381, + "step": 1102 + }, + { + "epoch": 1.06, + "grad_norm": 1.4679259061813354, + "learning_rate": 0.00018746230922515056, + "loss": 1.3104, + "step": 1103 + }, + { + "epoch": 1.07, + "grad_norm": 1.3574234247207642, + "learning_rate": 0.00018734919149639804, + "loss": 1.3659, + "step": 1104 + }, + { + "epoch": 1.07, + "grad_norm": 1.0465551614761353, + "learning_rate": 0.00018723600575731766, + "loss": 0.7132, + "step": 1105 + }, + { + "epoch": 1.07, + "grad_norm": 1.2153966426849365, + "learning_rate": 0.0001871227521313722, + "loss": 1.1371, + "step": 1106 + }, + { + "epoch": 1.07, + "grad_norm": 1.1352930068969727, + "learning_rate": 0.00018700943074209832, + "loss": 0.8152, + "step": 1107 + }, + { + "epoch": 1.07, + "grad_norm": 1.13296639919281, + "learning_rate": 0.0001868960417131067, + "loss": 0.7719, + "step": 1108 + }, + { + "epoch": 1.07, + "grad_norm": 0.8700066804885864, + "learning_rate": 0.00018678258516808183, + "loss": 0.4736, + "step": 1109 + }, + { + "epoch": 1.07, + "grad_norm": 1.1155449151992798, + "learning_rate": 0.00018666906123078177, + "loss": 0.8818, + "step": 1110 + }, + { + "epoch": 1.07, + "grad_norm": 1.0150140523910522, + "learning_rate": 0.00018655547002503807, + "loss": 0.6064, + "step": 1111 + }, + { + "epoch": 1.07, + "grad_norm": 1.755456805229187, + "learning_rate": 0.00018644181167475578, + "loss": 1.3616, + "step": 1112 + }, + { + "epoch": 1.07, + "grad_norm": 1.0811755657196045, + "learning_rate": 0.0001863280863039131, + "loss": 0.8551, + "step": 1113 + }, + { + "epoch": 1.08, + "grad_norm": 1.2699004411697388, + "learning_rate": 0.00018621429403656135, + "loss": 0.9028, + "step": 1114 + }, + { + "epoch": 1.08, + "grad_norm": 1.1369472742080688, + "learning_rate": 0.0001861004349968248, + "loss": 0.8264, + "step": 1115 + }, + { + "epoch": 1.08, + "grad_norm": 0.8735272884368896, + "learning_rate": 0.00018598650930890063, + "loss": 0.4994, + "step": 1116 + }, + { + "epoch": 1.08, + "grad_norm": 1.5912067890167236, + "learning_rate": 0.00018587251709705857, + "loss": 1.245, + "step": 1117 + }, + { + "epoch": 1.08, + "grad_norm": 1.310185194015503, + "learning_rate": 0.00018575845848564117, + "loss": 1.0203, + "step": 1118 + }, + { + "epoch": 1.08, + "grad_norm": 1.2570037841796875, + "learning_rate": 0.00018564433359906311, + "loss": 0.8932, + "step": 1119 + }, + { + "epoch": 1.08, + "grad_norm": 1.3394275903701782, + "learning_rate": 0.00018553014256181158, + "loss": 1.4023, + "step": 1120 + }, + { + "epoch": 1.08, + "grad_norm": 1.382513403892517, + "learning_rate": 0.00018541588549844584, + "loss": 0.6844, + "step": 1121 + }, + { + "epoch": 1.08, + "grad_norm": 1.244809865951538, + "learning_rate": 0.00018530156253359714, + "loss": 0.9417, + "step": 1122 + }, + { + "epoch": 1.08, + "grad_norm": 1.0259147882461548, + "learning_rate": 0.00018518717379196873, + "loss": 0.8997, + "step": 1123 + }, + { + "epoch": 1.08, + "grad_norm": 0.8853318691253662, + "learning_rate": 0.00018507271939833546, + "loss": 0.4958, + "step": 1124 + }, + { + "epoch": 1.09, + "grad_norm": 1.3527164459228516, + "learning_rate": 0.00018495819947754398, + "loss": 1.1756, + "step": 1125 + }, + { + "epoch": 1.09, + "grad_norm": 1.355082392692566, + "learning_rate": 0.00018484361415451218, + "loss": 1.2114, + "step": 1126 + }, + { + "epoch": 1.09, + "grad_norm": 1.2699202299118042, + "learning_rate": 0.00018472896355422946, + "loss": 0.6697, + "step": 1127 + }, + { + "epoch": 1.09, + "grad_norm": 1.4729478359222412, + "learning_rate": 0.00018461424780175645, + "loss": 1.4576, + "step": 1128 + }, + { + "epoch": 1.09, + "grad_norm": 1.1743948459625244, + "learning_rate": 0.00018449946702222465, + "loss": 0.8025, + "step": 1129 + }, + { + "epoch": 1.09, + "grad_norm": 1.4571646451950073, + "learning_rate": 0.00018438462134083675, + "loss": 1.0594, + "step": 1130 + }, + { + "epoch": 1.09, + "grad_norm": 1.3907976150512695, + "learning_rate": 0.00018426971088286604, + "loss": 1.0993, + "step": 1131 + }, + { + "epoch": 1.09, + "grad_norm": 1.4448930025100708, + "learning_rate": 0.00018415473577365653, + "loss": 1.171, + "step": 1132 + }, + { + "epoch": 1.09, + "grad_norm": 1.0253756046295166, + "learning_rate": 0.00018403969613862278, + "loss": 0.592, + "step": 1133 + }, + { + "epoch": 1.09, + "grad_norm": 1.2286980152130127, + "learning_rate": 0.00018392459210324975, + "loss": 1.043, + "step": 1134 + }, + { + "epoch": 1.1, + "grad_norm": 1.1266531944274902, + "learning_rate": 0.00018380942379309247, + "loss": 0.6981, + "step": 1135 + }, + { + "epoch": 1.1, + "grad_norm": 1.3554151058197021, + "learning_rate": 0.00018369419133377636, + "loss": 0.6397, + "step": 1136 + }, + { + "epoch": 1.1, + "grad_norm": 1.3931509256362915, + "learning_rate": 0.0001835788948509966, + "loss": 0.9987, + "step": 1137 + }, + { + "epoch": 1.1, + "grad_norm": 1.070232629776001, + "learning_rate": 0.00018346353447051827, + "loss": 0.8607, + "step": 1138 + }, + { + "epoch": 1.1, + "grad_norm": 1.1246999502182007, + "learning_rate": 0.00018334811031817626, + "loss": 0.8323, + "step": 1139 + }, + { + "epoch": 1.1, + "grad_norm": 1.3503284454345703, + "learning_rate": 0.00018323262251987476, + "loss": 0.9194, + "step": 1140 + }, + { + "epoch": 1.1, + "grad_norm": 1.0961241722106934, + "learning_rate": 0.00018311707120158768, + "loss": 0.7868, + "step": 1141 + }, + { + "epoch": 1.1, + "grad_norm": 1.3863087892532349, + "learning_rate": 0.000183001456489358, + "loss": 1.2253, + "step": 1142 + }, + { + "epoch": 1.1, + "grad_norm": 1.0947837829589844, + "learning_rate": 0.000182885778509298, + "loss": 0.932, + "step": 1143 + }, + { + "epoch": 1.1, + "grad_norm": 1.5243417024612427, + "learning_rate": 0.00018277003738758886, + "loss": 1.1453, + "step": 1144 + }, + { + "epoch": 1.11, + "grad_norm": 1.2207045555114746, + "learning_rate": 0.00018265423325048067, + "loss": 1.0409, + "step": 1145 + }, + { + "epoch": 1.11, + "grad_norm": 1.0962190628051758, + "learning_rate": 0.00018253836622429229, + "loss": 0.981, + "step": 1146 + }, + { + "epoch": 1.11, + "grad_norm": 0.893530547618866, + "learning_rate": 0.00018242243643541115, + "loss": 0.6334, + "step": 1147 + }, + { + "epoch": 1.11, + "grad_norm": 1.153269648551941, + "learning_rate": 0.00018230644401029313, + "loss": 0.9353, + "step": 1148 + }, + { + "epoch": 1.11, + "grad_norm": 1.3481030464172363, + "learning_rate": 0.00018219038907546254, + "loss": 1.11, + "step": 1149 + }, + { + "epoch": 1.11, + "grad_norm": 1.2734599113464355, + "learning_rate": 0.00018207427175751165, + "loss": 1.037, + "step": 1150 + }, + { + "epoch": 1.11, + "grad_norm": 0.8931764364242554, + "learning_rate": 0.000181958092183101, + "loss": 0.5017, + "step": 1151 + }, + { + "epoch": 1.11, + "grad_norm": 1.3497036695480347, + "learning_rate": 0.0001818418504789589, + "loss": 0.7842, + "step": 1152 + }, + { + "epoch": 1.11, + "grad_norm": 1.232225775718689, + "learning_rate": 0.00018172554677188155, + "loss": 0.9868, + "step": 1153 + }, + { + "epoch": 1.11, + "grad_norm": 1.5071431398391724, + "learning_rate": 0.00018160918118873264, + "loss": 1.1483, + "step": 1154 + }, + { + "epoch": 1.11, + "grad_norm": 1.155139684677124, + "learning_rate": 0.00018149275385644353, + "loss": 0.9804, + "step": 1155 + }, + { + "epoch": 1.12, + "grad_norm": 1.130774736404419, + "learning_rate": 0.00018137626490201273, + "loss": 0.9467, + "step": 1156 + }, + { + "epoch": 1.12, + "grad_norm": 1.2794990539550781, + "learning_rate": 0.00018125971445250613, + "loss": 1.1415, + "step": 1157 + }, + { + "epoch": 1.12, + "grad_norm": 1.308632731437683, + "learning_rate": 0.00018114310263505666, + "loss": 0.9147, + "step": 1158 + }, + { + "epoch": 1.12, + "grad_norm": 1.013698935508728, + "learning_rate": 0.0001810264295768641, + "loss": 0.5198, + "step": 1159 + }, + { + "epoch": 1.12, + "grad_norm": 1.1336946487426758, + "learning_rate": 0.0001809096954051952, + "loss": 0.7719, + "step": 1160 + }, + { + "epoch": 1.12, + "grad_norm": 1.2344475984573364, + "learning_rate": 0.00018079290024738316, + "loss": 0.9903, + "step": 1161 + }, + { + "epoch": 1.12, + "grad_norm": 1.2032207250595093, + "learning_rate": 0.0001806760442308279, + "loss": 0.8217, + "step": 1162 + }, + { + "epoch": 1.12, + "grad_norm": 1.0712535381317139, + "learning_rate": 0.00018055912748299565, + "loss": 0.9274, + "step": 1163 + }, + { + "epoch": 1.12, + "grad_norm": 1.2632014751434326, + "learning_rate": 0.00018044215013141886, + "loss": 0.842, + "step": 1164 + }, + { + "epoch": 1.12, + "grad_norm": 1.4885759353637695, + "learning_rate": 0.00018032511230369605, + "loss": 0.9768, + "step": 1165 + }, + { + "epoch": 1.13, + "grad_norm": 1.3400936126708984, + "learning_rate": 0.00018020801412749185, + "loss": 0.7842, + "step": 1166 + }, + { + "epoch": 1.13, + "grad_norm": 1.155679702758789, + "learning_rate": 0.0001800908557305366, + "loss": 0.4815, + "step": 1167 + }, + { + "epoch": 1.13, + "grad_norm": 1.4821432828903198, + "learning_rate": 0.00017997363724062633, + "loss": 0.9798, + "step": 1168 + }, + { + "epoch": 1.13, + "grad_norm": 1.346875786781311, + "learning_rate": 0.0001798563587856227, + "loss": 0.9786, + "step": 1169 + }, + { + "epoch": 1.13, + "grad_norm": 1.5733568668365479, + "learning_rate": 0.0001797390204934527, + "loss": 1.1719, + "step": 1170 + }, + { + "epoch": 1.13, + "grad_norm": 0.9875895380973816, + "learning_rate": 0.0001796216224921086, + "loss": 0.4865, + "step": 1171 + }, + { + "epoch": 1.13, + "grad_norm": 1.3760789632797241, + "learning_rate": 0.00017950416490964785, + "loss": 0.9958, + "step": 1172 + }, + { + "epoch": 1.13, + "grad_norm": 1.535853624343872, + "learning_rate": 0.00017938664787419285, + "loss": 1.6119, + "step": 1173 + }, + { + "epoch": 1.13, + "grad_norm": 1.409841537475586, + "learning_rate": 0.00017926907151393087, + "loss": 1.1741, + "step": 1174 + }, + { + "epoch": 1.13, + "grad_norm": 1.3689299821853638, + "learning_rate": 0.00017915143595711387, + "loss": 0.7744, + "step": 1175 + }, + { + "epoch": 1.14, + "grad_norm": 1.3054869174957275, + "learning_rate": 0.00017903374133205834, + "loss": 0.901, + "step": 1176 + }, + { + "epoch": 1.14, + "grad_norm": 1.3351161479949951, + "learning_rate": 0.00017891598776714537, + "loss": 1.3365, + "step": 1177 + }, + { + "epoch": 1.14, + "grad_norm": 1.4273672103881836, + "learning_rate": 0.0001787981753908202, + "loss": 1.0755, + "step": 1178 + }, + { + "epoch": 1.14, + "grad_norm": 1.6055691242218018, + "learning_rate": 0.00017868030433159217, + "loss": 1.131, + "step": 1179 + }, + { + "epoch": 1.14, + "grad_norm": 0.9369842410087585, + "learning_rate": 0.00017856237471803473, + "loss": 0.6455, + "step": 1180 + }, + { + "epoch": 1.14, + "grad_norm": 1.1839696168899536, + "learning_rate": 0.00017844438667878525, + "loss": 1.3519, + "step": 1181 + }, + { + "epoch": 1.14, + "grad_norm": 0.9531559348106384, + "learning_rate": 0.00017832634034254476, + "loss": 0.6091, + "step": 1182 + }, + { + "epoch": 1.14, + "grad_norm": 1.396750569343567, + "learning_rate": 0.00017820823583807778, + "loss": 0.6756, + "step": 1183 + }, + { + "epoch": 1.14, + "grad_norm": 1.2895655632019043, + "learning_rate": 0.00017809007329421244, + "loss": 1.3657, + "step": 1184 + }, + { + "epoch": 1.14, + "grad_norm": 1.5052480697631836, + "learning_rate": 0.00017797185283984017, + "loss": 1.3727, + "step": 1185 + }, + { + "epoch": 1.14, + "grad_norm": 1.4311729669570923, + "learning_rate": 0.00017785357460391543, + "loss": 1.0337, + "step": 1186 + }, + { + "epoch": 1.15, + "grad_norm": 1.0396034717559814, + "learning_rate": 0.00017773523871545581, + "loss": 0.6779, + "step": 1187 + }, + { + "epoch": 1.15, + "grad_norm": 1.4612584114074707, + "learning_rate": 0.0001776168453035419, + "loss": 1.2791, + "step": 1188 + }, + { + "epoch": 1.15, + "grad_norm": 1.1547785997390747, + "learning_rate": 0.00017749839449731673, + "loss": 0.8077, + "step": 1189 + }, + { + "epoch": 1.15, + "grad_norm": 1.1935621500015259, + "learning_rate": 0.00017737988642598624, + "loss": 0.6824, + "step": 1190 + }, + { + "epoch": 1.15, + "grad_norm": 1.043798565864563, + "learning_rate": 0.00017726132121881864, + "loss": 0.7545, + "step": 1191 + }, + { + "epoch": 1.15, + "grad_norm": 1.3549226522445679, + "learning_rate": 0.00017714269900514457, + "loss": 1.1167, + "step": 1192 + }, + { + "epoch": 1.15, + "grad_norm": 1.1858012676239014, + "learning_rate": 0.00017702401991435683, + "loss": 0.9483, + "step": 1193 + }, + { + "epoch": 1.15, + "grad_norm": 1.0237493515014648, + "learning_rate": 0.0001769052840759102, + "loss": 0.7064, + "step": 1194 + }, + { + "epoch": 1.15, + "grad_norm": 1.4379832744598389, + "learning_rate": 0.00017678649161932146, + "loss": 1.1197, + "step": 1195 + }, + { + "epoch": 1.15, + "grad_norm": 0.8990068435668945, + "learning_rate": 0.00017666764267416904, + "loss": 0.4801, + "step": 1196 + }, + { + "epoch": 1.16, + "grad_norm": 1.2909594774246216, + "learning_rate": 0.0001765487373700931, + "loss": 0.9925, + "step": 1197 + }, + { + "epoch": 1.16, + "grad_norm": 1.4378669261932373, + "learning_rate": 0.0001764297758367952, + "loss": 1.1316, + "step": 1198 + }, + { + "epoch": 1.16, + "grad_norm": 1.493698239326477, + "learning_rate": 0.00017631075820403823, + "loss": 0.9835, + "step": 1199 + }, + { + "epoch": 1.16, + "grad_norm": 1.2861640453338623, + "learning_rate": 0.00017619168460164636, + "loss": 1.2319, + "step": 1200 + }, + { + "epoch": 1.16, + "grad_norm": 1.1576422452926636, + "learning_rate": 0.00017607255515950468, + "loss": 0.75, + "step": 1201 + }, + { + "epoch": 1.16, + "grad_norm": 1.2239806652069092, + "learning_rate": 0.00017595337000755936, + "loss": 0.9199, + "step": 1202 + }, + { + "epoch": 1.16, + "grad_norm": 1.2435945272445679, + "learning_rate": 0.0001758341292758171, + "loss": 1.1669, + "step": 1203 + }, + { + "epoch": 1.16, + "grad_norm": 1.130832552909851, + "learning_rate": 0.00017571483309434545, + "loss": 0.5273, + "step": 1204 + }, + { + "epoch": 1.16, + "grad_norm": 1.2331628799438477, + "learning_rate": 0.00017559548159327238, + "loss": 0.8775, + "step": 1205 + }, + { + "epoch": 1.16, + "grad_norm": 1.422377586364746, + "learning_rate": 0.0001754760749027861, + "loss": 0.9985, + "step": 1206 + }, + { + "epoch": 1.17, + "grad_norm": 1.3476437330245972, + "learning_rate": 0.0001753566131531352, + "loss": 0.9553, + "step": 1207 + }, + { + "epoch": 1.17, + "grad_norm": 1.1213027238845825, + "learning_rate": 0.00017523709647462816, + "loss": 0.9864, + "step": 1208 + }, + { + "epoch": 1.17, + "grad_norm": 1.28190016746521, + "learning_rate": 0.00017511752499763347, + "loss": 0.9803, + "step": 1209 + }, + { + "epoch": 1.17, + "grad_norm": 1.8669500350952148, + "learning_rate": 0.00017499789885257937, + "loss": 1.5848, + "step": 1210 + }, + { + "epoch": 1.17, + "grad_norm": 1.5373106002807617, + "learning_rate": 0.0001748782181699537, + "loss": 1.3516, + "step": 1211 + }, + { + "epoch": 1.17, + "grad_norm": 1.0895917415618896, + "learning_rate": 0.00017475848308030385, + "loss": 0.8032, + "step": 1212 + }, + { + "epoch": 1.17, + "grad_norm": 1.0846320390701294, + "learning_rate": 0.00017463869371423657, + "loss": 0.7299, + "step": 1213 + }, + { + "epoch": 1.17, + "grad_norm": 1.2971018552780151, + "learning_rate": 0.0001745188502024177, + "loss": 0.8337, + "step": 1214 + }, + { + "epoch": 1.17, + "grad_norm": 1.5091488361358643, + "learning_rate": 0.00017439895267557225, + "loss": 1.2648, + "step": 1215 + }, + { + "epoch": 1.17, + "grad_norm": 1.170003056526184, + "learning_rate": 0.00017427900126448415, + "loss": 0.8008, + "step": 1216 + }, + { + "epoch": 1.17, + "grad_norm": 1.247881293296814, + "learning_rate": 0.000174158996099996, + "loss": 0.8748, + "step": 1217 + }, + { + "epoch": 1.18, + "grad_norm": 1.2198271751403809, + "learning_rate": 0.00017403893731300917, + "loss": 0.8583, + "step": 1218 + }, + { + "epoch": 1.18, + "grad_norm": 1.173490285873413, + "learning_rate": 0.0001739188250344834, + "loss": 0.8296, + "step": 1219 + }, + { + "epoch": 1.18, + "grad_norm": 1.0720738172531128, + "learning_rate": 0.00017379865939543684, + "loss": 0.56, + "step": 1220 + }, + { + "epoch": 1.18, + "grad_norm": 1.481160044670105, + "learning_rate": 0.00017367844052694592, + "loss": 0.8178, + "step": 1221 + }, + { + "epoch": 1.18, + "grad_norm": 1.0757296085357666, + "learning_rate": 0.00017355816856014496, + "loss": 0.7001, + "step": 1222 + }, + { + "epoch": 1.18, + "grad_norm": 0.9257085919380188, + "learning_rate": 0.00017343784362622636, + "loss": 0.6719, + "step": 1223 + }, + { + "epoch": 1.18, + "grad_norm": 1.2127162218093872, + "learning_rate": 0.00017331746585644018, + "loss": 0.8813, + "step": 1224 + }, + { + "epoch": 1.18, + "grad_norm": 1.4545499086380005, + "learning_rate": 0.0001731970353820942, + "loss": 0.7659, + "step": 1225 + }, + { + "epoch": 1.18, + "grad_norm": 1.1807411909103394, + "learning_rate": 0.00017307655233455362, + "loss": 0.632, + "step": 1226 + }, + { + "epoch": 1.18, + "grad_norm": 1.146098256111145, + "learning_rate": 0.00017295601684524104, + "loss": 0.9484, + "step": 1227 + }, + { + "epoch": 1.19, + "grad_norm": 1.102598786354065, + "learning_rate": 0.00017283542904563624, + "loss": 0.6946, + "step": 1228 + }, + { + "epoch": 1.19, + "grad_norm": 1.276293396949768, + "learning_rate": 0.00017271478906727604, + "loss": 1.104, + "step": 1229 + }, + { + "epoch": 1.19, + "grad_norm": 1.6087700128555298, + "learning_rate": 0.00017259409704175425, + "loss": 0.9789, + "step": 1230 + }, + { + "epoch": 1.19, + "grad_norm": 1.503319263458252, + "learning_rate": 0.00017247335310072135, + "loss": 1.3899, + "step": 1231 + }, + { + "epoch": 1.19, + "grad_norm": 1.1361923217773438, + "learning_rate": 0.00017235255737588453, + "loss": 0.663, + "step": 1232 + }, + { + "epoch": 1.19, + "grad_norm": 1.0255531072616577, + "learning_rate": 0.0001722317099990074, + "loss": 0.8311, + "step": 1233 + }, + { + "epoch": 1.19, + "grad_norm": 1.3245635032653809, + "learning_rate": 0.00017211081110191, + "loss": 0.8968, + "step": 1234 + }, + { + "epoch": 1.19, + "grad_norm": 1.347150444984436, + "learning_rate": 0.0001719898608164685, + "loss": 0.99, + "step": 1235 + }, + { + "epoch": 1.19, + "grad_norm": 1.0764960050582886, + "learning_rate": 0.00017186885927461515, + "loss": 0.8349, + "step": 1236 + }, + { + "epoch": 1.19, + "grad_norm": 1.0420310497283936, + "learning_rate": 0.00017174780660833805, + "loss": 0.8254, + "step": 1237 + }, + { + "epoch": 1.19, + "grad_norm": 1.7763484716415405, + "learning_rate": 0.00017162670294968116, + "loss": 1.8731, + "step": 1238 + }, + { + "epoch": 1.2, + "grad_norm": 0.8756821751594543, + "learning_rate": 0.00017150554843074406, + "loss": 0.4927, + "step": 1239 + }, + { + "epoch": 1.2, + "grad_norm": 1.204406499862671, + "learning_rate": 0.00017138434318368168, + "loss": 0.955, + "step": 1240 + }, + { + "epoch": 1.2, + "grad_norm": 1.2387770414352417, + "learning_rate": 0.00017126308734070444, + "loss": 1.0619, + "step": 1241 + }, + { + "epoch": 1.2, + "grad_norm": 1.0382620096206665, + "learning_rate": 0.00017114178103407783, + "loss": 0.7705, + "step": 1242 + }, + { + "epoch": 1.2, + "grad_norm": 0.9521413445472717, + "learning_rate": 0.00017102042439612246, + "loss": 0.5972, + "step": 1243 + }, + { + "epoch": 1.2, + "grad_norm": 1.4533095359802246, + "learning_rate": 0.00017089901755921387, + "loss": 1.268, + "step": 1244 + }, + { + "epoch": 1.2, + "eval_loss": 1.4754925966262817, + "eval_runtime": 28.2048, + "eval_samples_per_second": 3.014, + "eval_steps_per_second": 1.525, + "step": 1244 + }, + { + "epoch": 1.2, + "grad_norm": 1.1730684041976929, + "learning_rate": 0.00017077756065578224, + "loss": 0.838, + "step": 1245 + }, + { + "epoch": 1.2, + "grad_norm": 1.598128080368042, + "learning_rate": 0.00017065605381831242, + "loss": 0.9254, + "step": 1246 + }, + { + "epoch": 1.2, + "grad_norm": 1.2155981063842773, + "learning_rate": 0.00017053449717934377, + "loss": 0.9429, + "step": 1247 + }, + { + "epoch": 1.2, + "grad_norm": 1.1935361623764038, + "learning_rate": 0.00017041289087146999, + "loss": 0.8267, + "step": 1248 + }, + { + "epoch": 1.21, + "grad_norm": 1.027895450592041, + "learning_rate": 0.0001702912350273388, + "loss": 0.8369, + "step": 1249 + }, + { + "epoch": 1.21, + "grad_norm": 1.0867793560028076, + "learning_rate": 0.00017016952977965216, + "loss": 0.701, + "step": 1250 + }, + { + "epoch": 1.21, + "grad_norm": 1.261580467224121, + "learning_rate": 0.00017004777526116578, + "loss": 1.3502, + "step": 1251 + }, + { + "epoch": 1.21, + "grad_norm": 1.3365131616592407, + "learning_rate": 0.0001699259716046891, + "loss": 1.1648, + "step": 1252 + }, + { + "epoch": 1.21, + "grad_norm": 1.0136491060256958, + "learning_rate": 0.00016980411894308535, + "loss": 0.7513, + "step": 1253 + }, + { + "epoch": 1.21, + "grad_norm": 1.0561655759811401, + "learning_rate": 0.00016968221740927092, + "loss": 0.6481, + "step": 1254 + }, + { + "epoch": 1.21, + "grad_norm": 1.2038730382919312, + "learning_rate": 0.0001695602671362158, + "loss": 1.1284, + "step": 1255 + }, + { + "epoch": 1.21, + "grad_norm": 1.089046835899353, + "learning_rate": 0.00016943826825694297, + "loss": 0.6818, + "step": 1256 + }, + { + "epoch": 1.21, + "grad_norm": 1.2403234243392944, + "learning_rate": 0.00016931622090452845, + "loss": 1.0335, + "step": 1257 + }, + { + "epoch": 1.21, + "grad_norm": 1.31325101852417, + "learning_rate": 0.00016919412521210117, + "loss": 0.8025, + "step": 1258 + }, + { + "epoch": 1.22, + "grad_norm": 0.9705163836479187, + "learning_rate": 0.0001690719813128428, + "loss": 0.5008, + "step": 1259 + }, + { + "epoch": 1.22, + "grad_norm": 1.4634290933609009, + "learning_rate": 0.00016894978933998758, + "loss": 1.2704, + "step": 1260 + }, + { + "epoch": 1.22, + "grad_norm": 1.4494082927703857, + "learning_rate": 0.00016882754942682216, + "loss": 0.9463, + "step": 1261 + }, + { + "epoch": 1.22, + "grad_norm": 1.2695739269256592, + "learning_rate": 0.0001687052617066855, + "loss": 0.8996, + "step": 1262 + }, + { + "epoch": 1.22, + "grad_norm": 0.9623706936836243, + "learning_rate": 0.0001685829263129687, + "loss": 0.8756, + "step": 1263 + }, + { + "epoch": 1.22, + "grad_norm": 0.99055016040802, + "learning_rate": 0.00016846054337911487, + "loss": 0.6137, + "step": 1264 + }, + { + "epoch": 1.22, + "grad_norm": 1.3150849342346191, + "learning_rate": 0.00016833811303861907, + "loss": 0.7671, + "step": 1265 + }, + { + "epoch": 1.22, + "grad_norm": 1.3409266471862793, + "learning_rate": 0.00016821563542502782, + "loss": 0.9181, + "step": 1266 + }, + { + "epoch": 1.22, + "grad_norm": 1.1556544303894043, + "learning_rate": 0.0001680931106719395, + "loss": 0.8652, + "step": 1267 + }, + { + "epoch": 1.22, + "grad_norm": 1.262967824935913, + "learning_rate": 0.00016797053891300377, + "loss": 0.9559, + "step": 1268 + }, + { + "epoch": 1.22, + "grad_norm": 1.3712326288223267, + "learning_rate": 0.0001678479202819215, + "loss": 1.1403, + "step": 1269 + }, + { + "epoch": 1.23, + "grad_norm": 1.3038856983184814, + "learning_rate": 0.00016772525491244472, + "loss": 1.1641, + "step": 1270 + }, + { + "epoch": 1.23, + "grad_norm": 1.2619274854660034, + "learning_rate": 0.00016760254293837664, + "loss": 0.8464, + "step": 1271 + }, + { + "epoch": 1.23, + "grad_norm": 1.2340606451034546, + "learning_rate": 0.00016747978449357104, + "loss": 1.0419, + "step": 1272 + }, + { + "epoch": 1.23, + "grad_norm": 0.9198154211044312, + "learning_rate": 0.00016735697971193247, + "loss": 0.7761, + "step": 1273 + }, + { + "epoch": 1.23, + "grad_norm": 1.0390851497650146, + "learning_rate": 0.00016723412872741617, + "loss": 0.989, + "step": 1274 + }, + { + "epoch": 1.23, + "grad_norm": 1.4333105087280273, + "learning_rate": 0.00016711123167402752, + "loss": 1.2331, + "step": 1275 + }, + { + "epoch": 1.23, + "grad_norm": 1.4277148246765137, + "learning_rate": 0.0001669882886858224, + "loss": 0.7869, + "step": 1276 + }, + { + "epoch": 1.23, + "grad_norm": 1.2195308208465576, + "learning_rate": 0.00016686529989690658, + "loss": 1.0324, + "step": 1277 + }, + { + "epoch": 1.23, + "grad_norm": 1.05024254322052, + "learning_rate": 0.00016674226544143595, + "loss": 0.7656, + "step": 1278 + }, + { + "epoch": 1.23, + "grad_norm": 1.4788092374801636, + "learning_rate": 0.0001666191854536162, + "loss": 1.0497, + "step": 1279 + }, + { + "epoch": 1.24, + "grad_norm": 1.185929536819458, + "learning_rate": 0.0001664960600677026, + "loss": 0.8825, + "step": 1280 + }, + { + "epoch": 1.24, + "grad_norm": 1.2931797504425049, + "learning_rate": 0.00016637288941799997, + "loss": 1.2804, + "step": 1281 + }, + { + "epoch": 1.24, + "grad_norm": 1.1615878343582153, + "learning_rate": 0.00016624967363886253, + "loss": 0.7602, + "step": 1282 + }, + { + "epoch": 1.24, + "grad_norm": 1.217430830001831, + "learning_rate": 0.00016612641286469377, + "loss": 0.9886, + "step": 1283 + }, + { + "epoch": 1.24, + "grad_norm": 1.4690346717834473, + "learning_rate": 0.00016600310722994613, + "loss": 1.1719, + "step": 1284 + }, + { + "epoch": 1.24, + "grad_norm": 1.2001279592514038, + "learning_rate": 0.00016587975686912113, + "loss": 0.877, + "step": 1285 + }, + { + "epoch": 1.24, + "grad_norm": 0.923620879650116, + "learning_rate": 0.0001657563619167689, + "loss": 0.6299, + "step": 1286 + }, + { + "epoch": 1.24, + "grad_norm": 1.2251518964767456, + "learning_rate": 0.0001656329225074884, + "loss": 0.7887, + "step": 1287 + }, + { + "epoch": 1.24, + "grad_norm": 1.6504628658294678, + "learning_rate": 0.00016550943877592704, + "loss": 0.9195, + "step": 1288 + }, + { + "epoch": 1.24, + "grad_norm": 1.341532826423645, + "learning_rate": 0.00016538591085678043, + "loss": 0.8872, + "step": 1289 + }, + { + "epoch": 1.25, + "grad_norm": 1.5258677005767822, + "learning_rate": 0.00016526233888479255, + "loss": 1.1348, + "step": 1290 + }, + { + "epoch": 1.25, + "grad_norm": 1.1335400342941284, + "learning_rate": 0.00016513872299475535, + "loss": 0.7941, + "step": 1291 + }, + { + "epoch": 1.25, + "grad_norm": 1.2744227647781372, + "learning_rate": 0.00016501506332150873, + "loss": 0.9789, + "step": 1292 + }, + { + "epoch": 1.25, + "grad_norm": 1.2145026922225952, + "learning_rate": 0.00016489135999994025, + "loss": 0.9446, + "step": 1293 + }, + { + "epoch": 1.25, + "grad_norm": 0.9690343141555786, + "learning_rate": 0.00016476761316498527, + "loss": 0.8523, + "step": 1294 + }, + { + "epoch": 1.25, + "grad_norm": 1.2858216762542725, + "learning_rate": 0.00016464382295162643, + "loss": 1.2376, + "step": 1295 + }, + { + "epoch": 1.25, + "grad_norm": 1.5635852813720703, + "learning_rate": 0.00016451998949489378, + "loss": 1.3802, + "step": 1296 + }, + { + "epoch": 1.25, + "grad_norm": 1.374159812927246, + "learning_rate": 0.00016439611292986452, + "loss": 1.1782, + "step": 1297 + }, + { + "epoch": 1.25, + "grad_norm": 1.4581077098846436, + "learning_rate": 0.0001642721933916628, + "loss": 1.1469, + "step": 1298 + }, + { + "epoch": 1.25, + "grad_norm": 1.4149705171585083, + "learning_rate": 0.0001641482310154598, + "loss": 1.1342, + "step": 1299 + }, + { + "epoch": 1.25, + "grad_norm": 0.981275737285614, + "learning_rate": 0.0001640242259364733, + "loss": 0.6698, + "step": 1300 + }, + { + "epoch": 1.26, + "grad_norm": 1.568472146987915, + "learning_rate": 0.0001639001782899677, + "loss": 0.8356, + "step": 1301 + }, + { + "epoch": 1.26, + "grad_norm": 1.5767405033111572, + "learning_rate": 0.0001637760882112539, + "loss": 1.1833, + "step": 1302 + }, + { + "epoch": 1.26, + "grad_norm": 1.1946420669555664, + "learning_rate": 0.0001636519558356889, + "loss": 0.938, + "step": 1303 + }, + { + "epoch": 1.26, + "grad_norm": 1.0555033683776855, + "learning_rate": 0.00016352778129867607, + "loss": 0.7367, + "step": 1304 + }, + { + "epoch": 1.26, + "grad_norm": 1.7251577377319336, + "learning_rate": 0.00016340356473566454, + "loss": 1.2856, + "step": 1305 + }, + { + "epoch": 1.26, + "grad_norm": 1.1670684814453125, + "learning_rate": 0.00016327930628214952, + "loss": 1.1319, + "step": 1306 + }, + { + "epoch": 1.26, + "grad_norm": 1.2817931175231934, + "learning_rate": 0.0001631550060736717, + "loss": 1.0645, + "step": 1307 + }, + { + "epoch": 1.26, + "grad_norm": 1.2379206418991089, + "learning_rate": 0.00016303066424581746, + "loss": 0.6547, + "step": 1308 + }, + { + "epoch": 1.26, + "grad_norm": 1.0804026126861572, + "learning_rate": 0.0001629062809342185, + "loss": 0.7313, + "step": 1309 + }, + { + "epoch": 1.26, + "grad_norm": 1.3285449743270874, + "learning_rate": 0.00016278185627455183, + "loss": 0.9509, + "step": 1310 + }, + { + "epoch": 1.27, + "grad_norm": 1.3058103322982788, + "learning_rate": 0.00016265739040253948, + "loss": 0.974, + "step": 1311 + }, + { + "epoch": 1.27, + "grad_norm": 1.261012077331543, + "learning_rate": 0.00016253288345394854, + "loss": 0.9932, + "step": 1312 + }, + { + "epoch": 1.27, + "grad_norm": 1.5090525150299072, + "learning_rate": 0.0001624083355645908, + "loss": 1.2195, + "step": 1313 + }, + { + "epoch": 1.27, + "grad_norm": 1.3207299709320068, + "learning_rate": 0.00016228374687032277, + "loss": 0.9553, + "step": 1314 + }, + { + "epoch": 1.27, + "grad_norm": 1.3063914775848389, + "learning_rate": 0.00016215911750704545, + "loss": 0.9287, + "step": 1315 + }, + { + "epoch": 1.27, + "grad_norm": 1.2555960416793823, + "learning_rate": 0.00016203444761070425, + "loss": 0.9512, + "step": 1316 + }, + { + "epoch": 1.27, + "grad_norm": 1.2886016368865967, + "learning_rate": 0.0001619097373172887, + "loss": 1.2255, + "step": 1317 + }, + { + "epoch": 1.27, + "grad_norm": 1.507990837097168, + "learning_rate": 0.00016178498676283252, + "loss": 1.2742, + "step": 1318 + }, + { + "epoch": 1.27, + "grad_norm": 1.4528714418411255, + "learning_rate": 0.00016166019608341317, + "loss": 1.0882, + "step": 1319 + }, + { + "epoch": 1.27, + "grad_norm": 1.2334988117218018, + "learning_rate": 0.00016153536541515208, + "loss": 1.0011, + "step": 1320 + }, + { + "epoch": 1.28, + "grad_norm": 1.2907617092132568, + "learning_rate": 0.00016141049489421414, + "loss": 1.0559, + "step": 1321 + }, + { + "epoch": 1.28, + "grad_norm": 1.0541855096817017, + "learning_rate": 0.00016128558465680777, + "loss": 0.996, + "step": 1322 + }, + { + "epoch": 1.28, + "grad_norm": 0.9502456188201904, + "learning_rate": 0.0001611606348391848, + "loss": 0.6858, + "step": 1323 + }, + { + "epoch": 1.28, + "grad_norm": 1.081721305847168, + "learning_rate": 0.00016103564557764006, + "loss": 0.9702, + "step": 1324 + }, + { + "epoch": 1.28, + "grad_norm": 1.2104740142822266, + "learning_rate": 0.00016091061700851153, + "loss": 0.8464, + "step": 1325 + }, + { + "epoch": 1.28, + "grad_norm": 1.061679720878601, + "learning_rate": 0.00016078554926818, + "loss": 0.7161, + "step": 1326 + }, + { + "epoch": 1.28, + "grad_norm": 1.1888184547424316, + "learning_rate": 0.0001606604424930691, + "loss": 0.9908, + "step": 1327 + }, + { + "epoch": 1.28, + "grad_norm": 1.3492275476455688, + "learning_rate": 0.00016053529681964485, + "loss": 1.1636, + "step": 1328 + }, + { + "epoch": 1.28, + "grad_norm": 1.8432413339614868, + "learning_rate": 0.00016041011238441592, + "loss": 1.0478, + "step": 1329 + }, + { + "epoch": 1.28, + "grad_norm": 1.2208960056304932, + "learning_rate": 0.00016028488932393303, + "loss": 0.9052, + "step": 1330 + }, + { + "epoch": 1.28, + "grad_norm": 1.5409330129623413, + "learning_rate": 0.00016015962777478926, + "loss": 0.8948, + "step": 1331 + }, + { + "epoch": 1.29, + "grad_norm": 1.4419323205947876, + "learning_rate": 0.0001600343278736195, + "loss": 1.148, + "step": 1332 + }, + { + "epoch": 1.29, + "grad_norm": 1.1415199041366577, + "learning_rate": 0.00015990898975710058, + "loss": 1.129, + "step": 1333 + }, + { + "epoch": 1.29, + "grad_norm": 1.625235676765442, + "learning_rate": 0.000159783613561951, + "loss": 1.2489, + "step": 1334 + }, + { + "epoch": 1.29, + "grad_norm": 1.1700189113616943, + "learning_rate": 0.00015965819942493064, + "loss": 0.7232, + "step": 1335 + }, + { + "epoch": 1.29, + "grad_norm": 1.1410647630691528, + "learning_rate": 0.00015953274748284102, + "loss": 0.8594, + "step": 1336 + }, + { + "epoch": 1.29, + "grad_norm": 1.0741961002349854, + "learning_rate": 0.00015940725787252477, + "loss": 0.8474, + "step": 1337 + }, + { + "epoch": 1.29, + "grad_norm": 1.0182451009750366, + "learning_rate": 0.00015928173073086557, + "loss": 0.7573, + "step": 1338 + }, + { + "epoch": 1.29, + "grad_norm": 1.0768117904663086, + "learning_rate": 0.00015915616619478813, + "loss": 1.2029, + "step": 1339 + }, + { + "epoch": 1.29, + "grad_norm": 1.025019884109497, + "learning_rate": 0.0001590305644012578, + "loss": 0.9151, + "step": 1340 + }, + { + "epoch": 1.29, + "grad_norm": 1.3672230243682861, + "learning_rate": 0.0001589049254872808, + "loss": 1.3459, + "step": 1341 + }, + { + "epoch": 1.3, + "grad_norm": 1.2047271728515625, + "learning_rate": 0.00015877924958990363, + "loss": 1.0726, + "step": 1342 + }, + { + "epoch": 1.3, + "grad_norm": 4.216643810272217, + "learning_rate": 0.00015865353684621332, + "loss": 0.902, + "step": 1343 + }, + { + "epoch": 1.3, + "grad_norm": 1.0406891107559204, + "learning_rate": 0.00015852778739333683, + "loss": 0.7834, + "step": 1344 + }, + { + "epoch": 1.3, + "grad_norm": 1.3921639919281006, + "learning_rate": 0.00015840200136844144, + "loss": 1.0515, + "step": 1345 + }, + { + "epoch": 1.3, + "grad_norm": 1.4456580877304077, + "learning_rate": 0.00015827617890873418, + "loss": 0.9935, + "step": 1346 + }, + { + "epoch": 1.3, + "grad_norm": 1.5223323106765747, + "learning_rate": 0.00015815032015146181, + "loss": 1.2268, + "step": 1347 + }, + { + "epoch": 1.3, + "grad_norm": 1.01329505443573, + "learning_rate": 0.0001580244252339108, + "loss": 0.6671, + "step": 1348 + }, + { + "epoch": 1.3, + "grad_norm": 1.2426472902297974, + "learning_rate": 0.00015789849429340684, + "loss": 0.8958, + "step": 1349 + }, + { + "epoch": 1.3, + "grad_norm": 1.4329215288162231, + "learning_rate": 0.00015777252746731517, + "loss": 0.8278, + "step": 1350 + }, + { + "epoch": 1.3, + "grad_norm": 1.1606448888778687, + "learning_rate": 0.00015764652489304003, + "loss": 0.9097, + "step": 1351 + }, + { + "epoch": 1.31, + "grad_norm": 1.0638396739959717, + "learning_rate": 0.00015752048670802468, + "loss": 0.8128, + "step": 1352 + }, + { + "epoch": 1.31, + "grad_norm": 1.1829980611801147, + "learning_rate": 0.00015739441304975124, + "loss": 0.7878, + "step": 1353 + }, + { + "epoch": 1.31, + "grad_norm": 1.1757057905197144, + "learning_rate": 0.0001572683040557405, + "loss": 0.6479, + "step": 1354 + }, + { + "epoch": 1.31, + "grad_norm": 1.0985661745071411, + "learning_rate": 0.00015714215986355182, + "loss": 0.8554, + "step": 1355 + }, + { + "epoch": 1.31, + "grad_norm": 1.2881649732589722, + "learning_rate": 0.00015701598061078294, + "loss": 1.0739, + "step": 1356 + }, + { + "epoch": 1.31, + "grad_norm": 1.6648950576782227, + "learning_rate": 0.0001568897664350698, + "loss": 1.4414, + "step": 1357 + }, + { + "epoch": 1.31, + "grad_norm": 1.171886682510376, + "learning_rate": 0.00015676351747408653, + "loss": 0.6933, + "step": 1358 + }, + { + "epoch": 1.31, + "grad_norm": 1.1470980644226074, + "learning_rate": 0.00015663723386554512, + "loss": 0.8455, + "step": 1359 + }, + { + "epoch": 1.31, + "grad_norm": 1.2526499032974243, + "learning_rate": 0.00015651091574719544, + "loss": 1.1787, + "step": 1360 + }, + { + "epoch": 1.31, + "grad_norm": 1.0531766414642334, + "learning_rate": 0.00015638456325682486, + "loss": 0.615, + "step": 1361 + }, + { + "epoch": 1.31, + "grad_norm": 0.813473641872406, + "learning_rate": 0.00015625817653225842, + "loss": 0.4417, + "step": 1362 + }, + { + "epoch": 1.32, + "grad_norm": 1.1922723054885864, + "learning_rate": 0.00015613175571135837, + "loss": 1.0914, + "step": 1363 + }, + { + "epoch": 1.32, + "grad_norm": 1.3487348556518555, + "learning_rate": 0.00015600530093202417, + "loss": 1.0338, + "step": 1364 + }, + { + "epoch": 1.32, + "grad_norm": 1.015734076499939, + "learning_rate": 0.0001558788123321924, + "loss": 0.6342, + "step": 1365 + }, + { + "epoch": 1.32, + "grad_norm": 1.4268220663070679, + "learning_rate": 0.00015575229004983647, + "loss": 1.3462, + "step": 1366 + }, + { + "epoch": 1.32, + "grad_norm": 1.2125145196914673, + "learning_rate": 0.0001556257342229665, + "loss": 1.0775, + "step": 1367 + }, + { + "epoch": 1.32, + "grad_norm": 1.6474987268447876, + "learning_rate": 0.00015549914498962927, + "loss": 0.8819, + "step": 1368 + }, + { + "epoch": 1.32, + "grad_norm": 1.220054030418396, + "learning_rate": 0.000155372522487908, + "loss": 1.0804, + "step": 1369 + }, + { + "epoch": 1.32, + "grad_norm": 1.1821403503417969, + "learning_rate": 0.00015524586685592212, + "loss": 0.7528, + "step": 1370 + }, + { + "epoch": 1.32, + "grad_norm": 1.4041728973388672, + "learning_rate": 0.00015511917823182728, + "loss": 1.0064, + "step": 1371 + }, + { + "epoch": 1.32, + "grad_norm": 1.2141079902648926, + "learning_rate": 0.00015499245675381504, + "loss": 0.8372, + "step": 1372 + }, + { + "epoch": 1.33, + "grad_norm": 1.4008363485336304, + "learning_rate": 0.00015486570256011286, + "loss": 0.8686, + "step": 1373 + }, + { + "epoch": 1.33, + "grad_norm": 1.357132911682129, + "learning_rate": 0.00015473891578898395, + "loss": 1.2691, + "step": 1374 + }, + { + "epoch": 1.33, + "grad_norm": 1.5801461935043335, + "learning_rate": 0.0001546120965787268, + "loss": 1.0234, + "step": 1375 + }, + { + "epoch": 1.33, + "grad_norm": 1.1867705583572388, + "learning_rate": 0.00015448524506767565, + "loss": 0.9288, + "step": 1376 + }, + { + "epoch": 1.33, + "grad_norm": 1.2222601175308228, + "learning_rate": 0.00015435836139419964, + "loss": 1.1417, + "step": 1377 + }, + { + "epoch": 1.33, + "grad_norm": 1.1129976511001587, + "learning_rate": 0.00015423144569670324, + "loss": 0.994, + "step": 1378 + }, + { + "epoch": 1.33, + "grad_norm": 1.2277920246124268, + "learning_rate": 0.00015410449811362566, + "loss": 0.6984, + "step": 1379 + }, + { + "epoch": 1.33, + "grad_norm": 1.030815601348877, + "learning_rate": 0.00015397751878344106, + "loss": 0.7813, + "step": 1380 + }, + { + "epoch": 1.33, + "grad_norm": 0.9685385227203369, + "learning_rate": 0.00015385050784465804, + "loss": 0.5461, + "step": 1381 + }, + { + "epoch": 1.33, + "grad_norm": 1.3186579942703247, + "learning_rate": 0.00015372346543581985, + "loss": 0.7531, + "step": 1382 + }, + { + "epoch": 1.33, + "grad_norm": 1.6090222597122192, + "learning_rate": 0.00015359639169550403, + "loss": 1.5783, + "step": 1383 + }, + { + "epoch": 1.34, + "grad_norm": 1.7105401754379272, + "learning_rate": 0.0001534692867623222, + "loss": 1.302, + "step": 1384 + }, + { + "epoch": 1.34, + "grad_norm": 1.6060900688171387, + "learning_rate": 0.00015334215077492016, + "loss": 1.0983, + "step": 1385 + }, + { + "epoch": 1.34, + "grad_norm": 1.1998249292373657, + "learning_rate": 0.00015321498387197745, + "loss": 0.892, + "step": 1386 + }, + { + "epoch": 1.34, + "grad_norm": 1.340060830116272, + "learning_rate": 0.0001530877861922073, + "loss": 0.8928, + "step": 1387 + }, + { + "epoch": 1.34, + "grad_norm": 1.3283123970031738, + "learning_rate": 0.00015296055787435674, + "loss": 0.8075, + "step": 1388 + }, + { + "epoch": 1.34, + "grad_norm": 1.1961750984191895, + "learning_rate": 0.00015283329905720595, + "loss": 0.6859, + "step": 1389 + }, + { + "epoch": 1.34, + "grad_norm": 1.2345889806747437, + "learning_rate": 0.00015270600987956858, + "loss": 1.0289, + "step": 1390 + }, + { + "epoch": 1.34, + "grad_norm": 1.4048421382904053, + "learning_rate": 0.0001525786904802913, + "loss": 1.1179, + "step": 1391 + }, + { + "epoch": 1.34, + "grad_norm": 1.215480089187622, + "learning_rate": 0.00015245134099825376, + "loss": 0.9839, + "step": 1392 + }, + { + "epoch": 1.34, + "grad_norm": 1.142815113067627, + "learning_rate": 0.00015232396157236839, + "loss": 0.9429, + "step": 1393 + }, + { + "epoch": 1.35, + "grad_norm": 1.5882869958877563, + "learning_rate": 0.00015219655234158033, + "loss": 1.4284, + "step": 1394 + }, + { + "epoch": 1.35, + "grad_norm": 1.3742319345474243, + "learning_rate": 0.00015206911344486728, + "loss": 1.5371, + "step": 1395 + }, + { + "epoch": 1.35, + "grad_norm": 0.8513232469558716, + "learning_rate": 0.0001519416450212392, + "loss": 0.5392, + "step": 1396 + }, + { + "epoch": 1.35, + "grad_norm": 1.068568468093872, + "learning_rate": 0.00015181414720973833, + "loss": 0.7804, + "step": 1397 + }, + { + "epoch": 1.35, + "grad_norm": 0.9229900240898132, + "learning_rate": 0.00015168662014943885, + "loss": 0.6506, + "step": 1398 + }, + { + "epoch": 1.35, + "grad_norm": 1.099913477897644, + "learning_rate": 0.000151559063979447, + "loss": 1.0733, + "step": 1399 + }, + { + "epoch": 1.35, + "grad_norm": 1.2708427906036377, + "learning_rate": 0.00015143147883890065, + "loss": 0.89, + "step": 1400 + }, + { + "epoch": 1.35, + "grad_norm": 1.1063934564590454, + "learning_rate": 0.00015130386486696942, + "loss": 0.8854, + "step": 1401 + }, + { + "epoch": 1.35, + "grad_norm": 1.1168344020843506, + "learning_rate": 0.00015117622220285417, + "loss": 0.885, + "step": 1402 + }, + { + "epoch": 1.35, + "grad_norm": 1.444875955581665, + "learning_rate": 0.00015104855098578725, + "loss": 0.9915, + "step": 1403 + }, + { + "epoch": 1.36, + "grad_norm": 1.1813578605651855, + "learning_rate": 0.00015092085135503207, + "loss": 0.837, + "step": 1404 + }, + { + "epoch": 1.36, + "grad_norm": 0.827250599861145, + "learning_rate": 0.00015079312344988294, + "loss": 0.6524, + "step": 1405 + }, + { + "epoch": 1.36, + "grad_norm": 1.2115777730941772, + "learning_rate": 0.00015066536740966524, + "loss": 1.1392, + "step": 1406 + }, + { + "epoch": 1.36, + "grad_norm": 1.4499708414077759, + "learning_rate": 0.00015053758337373483, + "loss": 1.3624, + "step": 1407 + }, + { + "epoch": 1.36, + "grad_norm": 1.160974383354187, + "learning_rate": 0.00015040977148147823, + "loss": 0.8182, + "step": 1408 + }, + { + "epoch": 1.36, + "grad_norm": 1.2181323766708374, + "learning_rate": 0.00015028193187231227, + "loss": 1.2563, + "step": 1409 + }, + { + "epoch": 1.36, + "grad_norm": 1.2282092571258545, + "learning_rate": 0.00015015406468568405, + "loss": 0.7993, + "step": 1410 + }, + { + "epoch": 1.36, + "grad_norm": 1.5285307168960571, + "learning_rate": 0.00015002617006107074, + "loss": 1.247, + "step": 1411 + }, + { + "epoch": 1.36, + "grad_norm": 1.6421436071395874, + "learning_rate": 0.0001498982481379795, + "loss": 1.4423, + "step": 1412 + }, + { + "epoch": 1.36, + "grad_norm": 1.2297186851501465, + "learning_rate": 0.00014977029905594719, + "loss": 1.1478, + "step": 1413 + }, + { + "epoch": 1.36, + "grad_norm": 1.445056676864624, + "learning_rate": 0.00014964232295454027, + "loss": 0.9675, + "step": 1414 + }, + { + "epoch": 1.37, + "grad_norm": 1.2162317037582397, + "learning_rate": 0.0001495143199733548, + "loss": 0.7747, + "step": 1415 + }, + { + "epoch": 1.37, + "grad_norm": 1.1276546716690063, + "learning_rate": 0.00014938629025201605, + "loss": 0.7539, + "step": 1416 + }, + { + "epoch": 1.37, + "grad_norm": 1.2960880994796753, + "learning_rate": 0.00014925823393017846, + "loss": 1.0838, + "step": 1417 + }, + { + "epoch": 1.37, + "grad_norm": 1.4467681646347046, + "learning_rate": 0.00014913015114752554, + "loss": 1.2902, + "step": 1418 + }, + { + "epoch": 1.37, + "grad_norm": 1.0221946239471436, + "learning_rate": 0.00014900204204376964, + "loss": 0.7521, + "step": 1419 + }, + { + "epoch": 1.37, + "grad_norm": 1.7419352531433105, + "learning_rate": 0.00014887390675865183, + "loss": 1.3662, + "step": 1420 + }, + { + "epoch": 1.37, + "grad_norm": 1.406962275505066, + "learning_rate": 0.00014874574543194172, + "loss": 0.8323, + "step": 1421 + }, + { + "epoch": 1.37, + "grad_norm": 1.3675340414047241, + "learning_rate": 0.0001486175582034373, + "loss": 1.1238, + "step": 1422 + }, + { + "epoch": 1.37, + "grad_norm": 1.2011470794677734, + "learning_rate": 0.00014848934521296492, + "loss": 0.6788, + "step": 1423 + }, + { + "epoch": 1.37, + "grad_norm": 1.2760342359542847, + "learning_rate": 0.0001483611066003789, + "loss": 1.1523, + "step": 1424 + }, + { + "epoch": 1.38, + "grad_norm": 1.3156168460845947, + "learning_rate": 0.00014823284250556164, + "loss": 1.3448, + "step": 1425 + }, + { + "epoch": 1.38, + "grad_norm": 1.3862630128860474, + "learning_rate": 0.00014810455306842312, + "loss": 0.928, + "step": 1426 + }, + { + "epoch": 1.38, + "grad_norm": 1.1149640083312988, + "learning_rate": 0.00014797623842890132, + "loss": 1.0698, + "step": 1427 + }, + { + "epoch": 1.38, + "grad_norm": 1.6064804792404175, + "learning_rate": 0.0001478478987269613, + "loss": 1.1212, + "step": 1428 + }, + { + "epoch": 1.38, + "grad_norm": 1.2255253791809082, + "learning_rate": 0.00014771953410259577, + "loss": 1.0053, + "step": 1429 + }, + { + "epoch": 1.38, + "grad_norm": 1.1613675355911255, + "learning_rate": 0.0001475911446958245, + "loss": 0.8965, + "step": 1430 + }, + { + "epoch": 1.38, + "grad_norm": 1.42286217212677, + "learning_rate": 0.00014746273064669425, + "loss": 1.2165, + "step": 1431 + }, + { + "epoch": 1.38, + "grad_norm": 1.0494049787521362, + "learning_rate": 0.00014733429209527876, + "loss": 0.7192, + "step": 1432 + }, + { + "epoch": 1.38, + "grad_norm": 1.020506739616394, + "learning_rate": 0.00014720582918167845, + "loss": 0.7034, + "step": 1433 + }, + { + "epoch": 1.38, + "grad_norm": 1.3031187057495117, + "learning_rate": 0.00014707734204602027, + "loss": 0.7535, + "step": 1434 + }, + { + "epoch": 1.39, + "grad_norm": 1.0941170454025269, + "learning_rate": 0.0001469488308284577, + "loss": 0.8199, + "step": 1435 + }, + { + "epoch": 1.39, + "grad_norm": 1.1094365119934082, + "learning_rate": 0.00014682029566917042, + "loss": 0.6207, + "step": 1436 + }, + { + "epoch": 1.39, + "grad_norm": 1.0460867881774902, + "learning_rate": 0.00014669173670836416, + "loss": 0.8711, + "step": 1437 + }, + { + "epoch": 1.39, + "grad_norm": 1.3944430351257324, + "learning_rate": 0.00014656315408627076, + "loss": 0.9551, + "step": 1438 + }, + { + "epoch": 1.39, + "grad_norm": 1.3652623891830444, + "learning_rate": 0.00014643454794314775, + "loss": 0.9262, + "step": 1439 + }, + { + "epoch": 1.39, + "grad_norm": 1.2555691003799438, + "learning_rate": 0.00014630591841927838, + "loss": 1.0118, + "step": 1440 + }, + { + "epoch": 1.39, + "grad_norm": 1.467557668685913, + "learning_rate": 0.00014617726565497138, + "loss": 1.202, + "step": 1441 + }, + { + "epoch": 1.39, + "grad_norm": 0.7499139308929443, + "learning_rate": 0.00014604858979056084, + "loss": 0.3857, + "step": 1442 + }, + { + "epoch": 1.39, + "grad_norm": 1.6192408800125122, + "learning_rate": 0.00014591989096640604, + "loss": 1.1809, + "step": 1443 + }, + { + "epoch": 1.39, + "grad_norm": 1.4283156394958496, + "learning_rate": 0.00014579116932289128, + "loss": 0.9565, + "step": 1444 + }, + { + "epoch": 1.39, + "grad_norm": 1.0685770511627197, + "learning_rate": 0.0001456624250004258, + "loss": 0.7835, + "step": 1445 + }, + { + "epoch": 1.4, + "grad_norm": 1.3966482877731323, + "learning_rate": 0.00014553365813944351, + "loss": 1.1632, + "step": 1446 + }, + { + "epoch": 1.4, + "grad_norm": 1.2985618114471436, + "learning_rate": 0.00014540486888040306, + "loss": 0.8087, + "step": 1447 + }, + { + "epoch": 1.4, + "grad_norm": 1.616455316543579, + "learning_rate": 0.00014527605736378731, + "loss": 0.9968, + "step": 1448 + }, + { + "epoch": 1.4, + "grad_norm": 1.1916112899780273, + "learning_rate": 0.00014514722373010357, + "loss": 1.1061, + "step": 1449 + }, + { + "epoch": 1.4, + "grad_norm": 1.2969380617141724, + "learning_rate": 0.00014501836811988322, + "loss": 0.825, + "step": 1450 + }, + { + "epoch": 1.4, + "grad_norm": 1.3342739343643188, + "learning_rate": 0.00014488949067368158, + "loss": 1.0068, + "step": 1451 + }, + { + "epoch": 1.4, + "grad_norm": 1.2201863527297974, + "learning_rate": 0.00014476059153207779, + "loss": 0.8792, + "step": 1452 + }, + { + "epoch": 1.4, + "grad_norm": 1.213496446609497, + "learning_rate": 0.00014463167083567476, + "loss": 0.843, + "step": 1453 + }, + { + "epoch": 1.4, + "grad_norm": 1.3563382625579834, + "learning_rate": 0.00014450272872509878, + "loss": 1.0632, + "step": 1454 + }, + { + "epoch": 1.4, + "grad_norm": 1.2063534259796143, + "learning_rate": 0.00014437376534099958, + "loss": 0.7342, + "step": 1455 + }, + { + "epoch": 1.41, + "grad_norm": 0.9122269749641418, + "learning_rate": 0.00014424478082405003, + "loss": 0.4485, + "step": 1456 + }, + { + "epoch": 1.41, + "grad_norm": 1.608677625656128, + "learning_rate": 0.0001441157753149461, + "loss": 1.0279, + "step": 1457 + }, + { + "epoch": 1.41, + "grad_norm": 1.0568768978118896, + "learning_rate": 0.0001439867489544066, + "loss": 0.6611, + "step": 1458 + }, + { + "epoch": 1.41, + "grad_norm": 1.5482009649276733, + "learning_rate": 0.0001438577018831733, + "loss": 1.3599, + "step": 1459 + }, + { + "epoch": 1.41, + "grad_norm": 1.0343140363693237, + "learning_rate": 0.00014372863424201018, + "loss": 1.0214, + "step": 1460 + }, + { + "epoch": 1.41, + "grad_norm": 1.2698018550872803, + "learning_rate": 0.000143599546171704, + "loss": 0.9969, + "step": 1461 + }, + { + "epoch": 1.41, + "grad_norm": 1.0547878742218018, + "learning_rate": 0.0001434704378130637, + "loss": 0.9606, + "step": 1462 + }, + { + "epoch": 1.41, + "grad_norm": 1.2461342811584473, + "learning_rate": 0.00014334130930692023, + "loss": 1.1568, + "step": 1463 + }, + { + "epoch": 1.41, + "grad_norm": 1.1943764686584473, + "learning_rate": 0.00014321216079412667, + "loss": 0.8852, + "step": 1464 + }, + { + "epoch": 1.41, + "grad_norm": 1.0265756845474243, + "learning_rate": 0.0001430829924155578, + "loss": 0.8125, + "step": 1465 + }, + { + "epoch": 1.42, + "grad_norm": 1.0927410125732422, + "learning_rate": 0.0001429538043121102, + "loss": 0.8627, + "step": 1466 + }, + { + "epoch": 1.42, + "grad_norm": 0.9594815969467163, + "learning_rate": 0.00014282459662470193, + "loss": 0.7131, + "step": 1467 + }, + { + "epoch": 1.42, + "grad_norm": 1.190256118774414, + "learning_rate": 0.00014269536949427228, + "loss": 0.8461, + "step": 1468 + }, + { + "epoch": 1.42, + "grad_norm": 1.3381212949752808, + "learning_rate": 0.00014256612306178193, + "loss": 1.261, + "step": 1469 + }, + { + "epoch": 1.42, + "grad_norm": 1.52683687210083, + "learning_rate": 0.0001424368574682125, + "loss": 1.0221, + "step": 1470 + }, + { + "epoch": 1.42, + "grad_norm": 1.3034223318099976, + "learning_rate": 0.0001423075728545666, + "loss": 0.9496, + "step": 1471 + }, + { + "epoch": 1.42, + "grad_norm": 1.6940810680389404, + "learning_rate": 0.0001421782693618674, + "loss": 1.3889, + "step": 1472 + }, + { + "epoch": 1.42, + "grad_norm": 0.9740185737609863, + "learning_rate": 0.000142048947131159, + "loss": 0.8512, + "step": 1473 + }, + { + "epoch": 1.42, + "grad_norm": 1.2997483015060425, + "learning_rate": 0.00014191960630350554, + "loss": 0.9987, + "step": 1474 + }, + { + "epoch": 1.42, + "grad_norm": 1.3484057188034058, + "learning_rate": 0.00014179024701999174, + "loss": 0.814, + "step": 1475 + }, + { + "epoch": 1.42, + "grad_norm": 1.665818691253662, + "learning_rate": 0.00014166086942172238, + "loss": 0.9987, + "step": 1476 + }, + { + "epoch": 1.43, + "grad_norm": 0.8933560252189636, + "learning_rate": 0.0001415314736498221, + "loss": 0.6541, + "step": 1477 + }, + { + "epoch": 1.43, + "grad_norm": 1.2219523191452026, + "learning_rate": 0.00014140205984543555, + "loss": 0.8712, + "step": 1478 + }, + { + "epoch": 1.43, + "grad_norm": 1.3110696077346802, + "learning_rate": 0.00014127262814972688, + "loss": 0.7635, + "step": 1479 + }, + { + "epoch": 1.43, + "grad_norm": 1.2418899536132812, + "learning_rate": 0.00014114317870387986, + "loss": 0.7825, + "step": 1480 + }, + { + "epoch": 1.43, + "grad_norm": 1.1317052841186523, + "learning_rate": 0.0001410137116490976, + "loss": 0.9781, + "step": 1481 + }, + { + "epoch": 1.43, + "grad_norm": 1.1634619235992432, + "learning_rate": 0.0001408842271266024, + "loss": 0.706, + "step": 1482 + }, + { + "epoch": 1.43, + "grad_norm": 1.0847632884979248, + "learning_rate": 0.0001407547252776356, + "loss": 0.9616, + "step": 1483 + }, + { + "epoch": 1.43, + "grad_norm": 1.4543782472610474, + "learning_rate": 0.00014062520624345752, + "loss": 0.8445, + "step": 1484 + }, + { + "epoch": 1.43, + "grad_norm": 1.081655502319336, + "learning_rate": 0.00014049567016534716, + "loss": 0.7163, + "step": 1485 + }, + { + "epoch": 1.43, + "grad_norm": 0.9920649528503418, + "learning_rate": 0.00014036611718460203, + "loss": 0.5961, + "step": 1486 + }, + { + "epoch": 1.44, + "grad_norm": 1.379042387008667, + "learning_rate": 0.0001402365474425383, + "loss": 1.1791, + "step": 1487 + }, + { + "epoch": 1.44, + "grad_norm": 1.5765132904052734, + "learning_rate": 0.0001401069610804902, + "loss": 1.2701, + "step": 1488 + }, + { + "epoch": 1.44, + "grad_norm": 1.1928505897521973, + "learning_rate": 0.00013997735823981016, + "loss": 0.8722, + "step": 1489 + }, + { + "epoch": 1.44, + "grad_norm": 1.1438642740249634, + "learning_rate": 0.00013984773906186874, + "loss": 0.711, + "step": 1490 + }, + { + "epoch": 1.44, + "grad_norm": 1.0099387168884277, + "learning_rate": 0.00013971810368805404, + "loss": 0.6834, + "step": 1491 + }, + { + "epoch": 1.44, + "grad_norm": 1.1511831283569336, + "learning_rate": 0.00013958845225977204, + "loss": 0.8102, + "step": 1492 + }, + { + "epoch": 1.44, + "grad_norm": 1.2723909616470337, + "learning_rate": 0.0001394587849184461, + "loss": 0.8983, + "step": 1493 + }, + { + "epoch": 1.44, + "grad_norm": 1.0919370651245117, + "learning_rate": 0.00013932910180551714, + "loss": 0.5822, + "step": 1494 + }, + { + "epoch": 1.44, + "grad_norm": 1.42845618724823, + "learning_rate": 0.000139199403062443, + "loss": 0.8466, + "step": 1495 + }, + { + "epoch": 1.44, + "grad_norm": 1.8288830518722534, + "learning_rate": 0.00013906968883069883, + "loss": 1.0043, + "step": 1496 + }, + { + "epoch": 1.44, + "grad_norm": 1.3897336721420288, + "learning_rate": 0.00013893995925177643, + "loss": 0.8318, + "step": 1497 + }, + { + "epoch": 1.45, + "grad_norm": 1.1646203994750977, + "learning_rate": 0.00013881021446718458, + "loss": 0.8964, + "step": 1498 + }, + { + "epoch": 1.45, + "grad_norm": 1.2308577299118042, + "learning_rate": 0.0001386804546184485, + "loss": 0.7555, + "step": 1499 + }, + { + "epoch": 1.45, + "grad_norm": 1.551567554473877, + "learning_rate": 0.00013855067984710988, + "loss": 1.4192, + "step": 1500 + }, + { + "epoch": 1.45, + "grad_norm": 1.1755435466766357, + "learning_rate": 0.00013842089029472671, + "loss": 0.8243, + "step": 1501 + }, + { + "epoch": 1.45, + "grad_norm": 1.1090277433395386, + "learning_rate": 0.000138291086102873, + "loss": 0.7283, + "step": 1502 + }, + { + "epoch": 1.45, + "grad_norm": 1.368507981300354, + "learning_rate": 0.0001381612674131389, + "loss": 1.298, + "step": 1503 + }, + { + "epoch": 1.45, + "grad_norm": 1.3204469680786133, + "learning_rate": 0.00013803143436713023, + "loss": 0.9995, + "step": 1504 + }, + { + "epoch": 1.45, + "grad_norm": 1.4416377544403076, + "learning_rate": 0.0001379015871064685, + "loss": 0.9646, + "step": 1505 + }, + { + "epoch": 1.45, + "grad_norm": 1.3736774921417236, + "learning_rate": 0.00013777172577279084, + "loss": 1.0291, + "step": 1506 + }, + { + "epoch": 1.45, + "grad_norm": 1.1656876802444458, + "learning_rate": 0.00013764185050774957, + "loss": 0.9058, + "step": 1507 + }, + { + "epoch": 1.46, + "grad_norm": 0.9170559644699097, + "learning_rate": 0.0001375119614530123, + "loss": 0.686, + "step": 1508 + }, + { + "epoch": 1.46, + "grad_norm": 1.2503794431686401, + "learning_rate": 0.00013738205875026165, + "loss": 0.6152, + "step": 1509 + }, + { + "epoch": 1.46, + "grad_norm": 0.9299453496932983, + "learning_rate": 0.0001372521425411951, + "loss": 0.5456, + "step": 1510 + }, + { + "epoch": 1.46, + "grad_norm": 1.043879747390747, + "learning_rate": 0.00013712221296752493, + "loss": 0.6329, + "step": 1511 + }, + { + "epoch": 1.46, + "grad_norm": 0.9172692894935608, + "learning_rate": 0.00013699227017097793, + "loss": 0.6599, + "step": 1512 + }, + { + "epoch": 1.46, + "grad_norm": 1.0393551588058472, + "learning_rate": 0.00013686231429329543, + "loss": 0.7241, + "step": 1513 + }, + { + "epoch": 1.46, + "grad_norm": 1.1035585403442383, + "learning_rate": 0.00013673234547623283, + "loss": 0.7771, + "step": 1514 + }, + { + "epoch": 1.46, + "grad_norm": 1.099898099899292, + "learning_rate": 0.00013660236386155988, + "loss": 0.7152, + "step": 1515 + }, + { + "epoch": 1.46, + "grad_norm": 1.1429855823516846, + "learning_rate": 0.00013647236959106004, + "loss": 0.8427, + "step": 1516 + }, + { + "epoch": 1.46, + "grad_norm": 0.9771751761436462, + "learning_rate": 0.00013634236280653085, + "loss": 0.6092, + "step": 1517 + }, + { + "epoch": 1.47, + "grad_norm": 1.1621931791305542, + "learning_rate": 0.00013621234364978325, + "loss": 0.9588, + "step": 1518 + }, + { + "epoch": 1.47, + "grad_norm": 1.0901761054992676, + "learning_rate": 0.00013608231226264179, + "loss": 0.8274, + "step": 1519 + }, + { + "epoch": 1.47, + "grad_norm": 1.3528239727020264, + "learning_rate": 0.00013595226878694442, + "loss": 1.2063, + "step": 1520 + }, + { + "epoch": 1.47, + "grad_norm": 1.6294797658920288, + "learning_rate": 0.00013582221336454214, + "loss": 1.2005, + "step": 1521 + }, + { + "epoch": 1.47, + "grad_norm": 1.2546576261520386, + "learning_rate": 0.0001356921461372991, + "loss": 0.8308, + "step": 1522 + }, + { + "epoch": 1.47, + "grad_norm": 1.2436532974243164, + "learning_rate": 0.0001355620672470922, + "loss": 0.9026, + "step": 1523 + }, + { + "epoch": 1.47, + "grad_norm": 1.2715680599212646, + "learning_rate": 0.00013543197683581123, + "loss": 0.8447, + "step": 1524 + }, + { + "epoch": 1.47, + "grad_norm": 1.0670286417007446, + "learning_rate": 0.00013530187504535844, + "loss": 0.7272, + "step": 1525 + }, + { + "epoch": 1.47, + "grad_norm": 0.9957221746444702, + "learning_rate": 0.0001351717620176484, + "loss": 0.726, + "step": 1526 + }, + { + "epoch": 1.47, + "grad_norm": 1.3824573755264282, + "learning_rate": 0.00013504163789460823, + "loss": 0.9593, + "step": 1527 + }, + { + "epoch": 1.47, + "grad_norm": 1.2613593339920044, + "learning_rate": 0.00013491150281817675, + "loss": 1.1716, + "step": 1528 + }, + { + "epoch": 1.48, + "grad_norm": 1.1571170091629028, + "learning_rate": 0.00013478135693030513, + "loss": 0.8439, + "step": 1529 + }, + { + "epoch": 1.48, + "grad_norm": 1.3491780757904053, + "learning_rate": 0.00013465120037295606, + "loss": 1.152, + "step": 1530 + }, + { + "epoch": 1.48, + "grad_norm": 1.3383604288101196, + "learning_rate": 0.000134521033288104, + "loss": 0.9002, + "step": 1531 + }, + { + "epoch": 1.48, + "grad_norm": 1.0674033164978027, + "learning_rate": 0.00013439085581773481, + "loss": 0.7625, + "step": 1532 + }, + { + "epoch": 1.48, + "grad_norm": 1.0441478490829468, + "learning_rate": 0.00013426066810384572, + "loss": 0.6468, + "step": 1533 + }, + { + "epoch": 1.48, + "grad_norm": 1.3126263618469238, + "learning_rate": 0.0001341304702884452, + "loss": 1.0375, + "step": 1534 + }, + { + "epoch": 1.48, + "grad_norm": 1.1851307153701782, + "learning_rate": 0.00013400026251355257, + "loss": 1.0054, + "step": 1535 + }, + { + "epoch": 1.48, + "grad_norm": 1.0638484954833984, + "learning_rate": 0.0001338700449211982, + "loss": 0.6203, + "step": 1536 + }, + { + "epoch": 1.48, + "grad_norm": 1.396504521369934, + "learning_rate": 0.00013373981765342304, + "loss": 1.4193, + "step": 1537 + }, + { + "epoch": 1.48, + "grad_norm": 1.1101491451263428, + "learning_rate": 0.00013360958085227866, + "loss": 0.6708, + "step": 1538 + }, + { + "epoch": 1.49, + "grad_norm": 0.9452186226844788, + "learning_rate": 0.000133479334659827, + "loss": 0.8028, + "step": 1539 + }, + { + "epoch": 1.49, + "grad_norm": 0.9814640283584595, + "learning_rate": 0.0001333490792181402, + "loss": 0.8605, + "step": 1540 + }, + { + "epoch": 1.49, + "grad_norm": 1.123543620109558, + "learning_rate": 0.0001332188146693006, + "loss": 0.738, + "step": 1541 + }, + { + "epoch": 1.49, + "grad_norm": 1.3086490631103516, + "learning_rate": 0.00013308854115540042, + "loss": 1.0046, + "step": 1542 + }, + { + "epoch": 1.49, + "grad_norm": 0.9039118885993958, + "learning_rate": 0.00013295825881854168, + "loss": 0.7189, + "step": 1543 + }, + { + "epoch": 1.49, + "grad_norm": 1.2540236711502075, + "learning_rate": 0.00013282796780083588, + "loss": 1.0421, + "step": 1544 + }, + { + "epoch": 1.49, + "grad_norm": 1.2566171884536743, + "learning_rate": 0.00013269766824440424, + "loss": 0.9669, + "step": 1545 + }, + { + "epoch": 1.49, + "grad_norm": 1.304291844367981, + "learning_rate": 0.00013256736029137705, + "loss": 0.8689, + "step": 1546 + }, + { + "epoch": 1.49, + "grad_norm": 1.2309794425964355, + "learning_rate": 0.00013243704408389393, + "loss": 1.1371, + "step": 1547 + }, + { + "epoch": 1.49, + "grad_norm": 1.1405993700027466, + "learning_rate": 0.00013230671976410344, + "loss": 1.0609, + "step": 1548 + }, + { + "epoch": 1.5, + "grad_norm": 1.1470849514007568, + "learning_rate": 0.00013217638747416296, + "loss": 0.949, + "step": 1549 + }, + { + "epoch": 1.5, + "grad_norm": 1.3759626150131226, + "learning_rate": 0.00013204604735623867, + "loss": 1.3161, + "step": 1550 + }, + { + "epoch": 1.5, + "grad_norm": 1.4023290872573853, + "learning_rate": 0.00013191569955250512, + "loss": 1.1112, + "step": 1551 + }, + { + "epoch": 1.5, + "grad_norm": 1.2357980012893677, + "learning_rate": 0.0001317853442051454, + "loss": 1.0148, + "step": 1552 + }, + { + "epoch": 1.5, + "grad_norm": 1.2000268697738647, + "learning_rate": 0.00013165498145635072, + "loss": 0.8196, + "step": 1553 + }, + { + "epoch": 1.5, + "grad_norm": 0.956527829170227, + "learning_rate": 0.0001315246114483205, + "loss": 0.5612, + "step": 1554 + }, + { + "epoch": 1.5, + "grad_norm": 1.0706440210342407, + "learning_rate": 0.00013139423432326187, + "loss": 1.0052, + "step": 1555 + }, + { + "epoch": 1.5, + "grad_norm": 0.8552688956260681, + "learning_rate": 0.0001312638502233899, + "loss": 0.6562, + "step": 1556 + }, + { + "epoch": 1.5, + "grad_norm": 1.3808249235153198, + "learning_rate": 0.00013113345929092728, + "loss": 0.9808, + "step": 1557 + }, + { + "epoch": 1.5, + "grad_norm": 1.2328780889511108, + "learning_rate": 0.00013100306166810395, + "loss": 0.996, + "step": 1558 + }, + { + "epoch": 1.5, + "grad_norm": 1.611670732498169, + "learning_rate": 0.0001308726574971574, + "loss": 1.3974, + "step": 1559 + }, + { + "epoch": 1.51, + "grad_norm": 1.2272011041641235, + "learning_rate": 0.00013074224692033202, + "loss": 0.7273, + "step": 1560 + }, + { + "epoch": 1.51, + "grad_norm": 1.3213979005813599, + "learning_rate": 0.00013061183007987939, + "loss": 1.0876, + "step": 1561 + }, + { + "epoch": 1.51, + "grad_norm": 0.980205237865448, + "learning_rate": 0.00013048140711805788, + "loss": 0.6217, + "step": 1562 + }, + { + "epoch": 1.51, + "grad_norm": 1.5037871599197388, + "learning_rate": 0.0001303509781771324, + "loss": 1.4245, + "step": 1563 + }, + { + "epoch": 1.51, + "grad_norm": 1.2028981447219849, + "learning_rate": 0.00013022054339937458, + "loss": 0.9513, + "step": 1564 + }, + { + "epoch": 1.51, + "grad_norm": 1.123295545578003, + "learning_rate": 0.00013009010292706224, + "loss": 0.8559, + "step": 1565 + }, + { + "epoch": 1.51, + "grad_norm": 1.2569561004638672, + "learning_rate": 0.00012995965690247956, + "loss": 1.1427, + "step": 1566 + }, + { + "epoch": 1.51, + "grad_norm": 1.2987000942230225, + "learning_rate": 0.00012982920546791667, + "loss": 0.8334, + "step": 1567 + }, + { + "epoch": 1.51, + "grad_norm": 1.2294447422027588, + "learning_rate": 0.00012969874876566964, + "loss": 1.0278, + "step": 1568 + }, + { + "epoch": 1.51, + "grad_norm": 1.5399640798568726, + "learning_rate": 0.00012956828693804027, + "loss": 1.4469, + "step": 1569 + }, + { + "epoch": 1.52, + "grad_norm": 1.1422955989837646, + "learning_rate": 0.00012943782012733598, + "loss": 0.682, + "step": 1570 + }, + { + "epoch": 1.52, + "grad_norm": 1.2182711362838745, + "learning_rate": 0.00012930734847586963, + "loss": 1.0153, + "step": 1571 + }, + { + "epoch": 1.52, + "grad_norm": 1.2179765701293945, + "learning_rate": 0.00012917687212595934, + "loss": 0.5329, + "step": 1572 + }, + { + "epoch": 1.52, + "grad_norm": 1.2386904954910278, + "learning_rate": 0.00012904639121992834, + "loss": 0.9261, + "step": 1573 + }, + { + "epoch": 1.52, + "grad_norm": 1.0264219045639038, + "learning_rate": 0.00012891590590010483, + "loss": 0.7033, + "step": 1574 + }, + { + "epoch": 1.52, + "grad_norm": 1.187770962715149, + "learning_rate": 0.00012878541630882183, + "loss": 1.0614, + "step": 1575 + }, + { + "epoch": 1.52, + "grad_norm": 0.918658971786499, + "learning_rate": 0.00012865492258841709, + "loss": 0.6751, + "step": 1576 + }, + { + "epoch": 1.52, + "grad_norm": 0.899031400680542, + "learning_rate": 0.0001285244248812327, + "loss": 0.7438, + "step": 1577 + }, + { + "epoch": 1.52, + "grad_norm": 1.3633794784545898, + "learning_rate": 0.00012839392332961538, + "loss": 0.8423, + "step": 1578 + }, + { + "epoch": 1.52, + "grad_norm": 1.436004400253296, + "learning_rate": 0.0001282634180759157, + "loss": 0.9681, + "step": 1579 + }, + { + "epoch": 1.53, + "grad_norm": 1.1468559503555298, + "learning_rate": 0.00012813290926248856, + "loss": 0.6503, + "step": 1580 + }, + { + "epoch": 1.53, + "grad_norm": 1.2126736640930176, + "learning_rate": 0.0001280023970316925, + "loss": 0.9377, + "step": 1581 + }, + { + "epoch": 1.53, + "grad_norm": 1.6733336448669434, + "learning_rate": 0.0001278718815258899, + "loss": 0.9995, + "step": 1582 + }, + { + "epoch": 1.53, + "grad_norm": 1.2680801153182983, + "learning_rate": 0.00012774136288744685, + "loss": 0.9284, + "step": 1583 + }, + { + "epoch": 1.53, + "grad_norm": 1.2771387100219727, + "learning_rate": 0.00012761084125873266, + "loss": 0.9957, + "step": 1584 + }, + { + "epoch": 1.53, + "grad_norm": 1.2861063480377197, + "learning_rate": 0.0001274803167821199, + "loss": 0.9192, + "step": 1585 + }, + { + "epoch": 1.53, + "grad_norm": 1.2119808197021484, + "learning_rate": 0.0001273497895999844, + "loss": 0.8964, + "step": 1586 + }, + { + "epoch": 1.53, + "grad_norm": 0.9807702302932739, + "learning_rate": 0.00012721925985470481, + "loss": 0.6691, + "step": 1587 + }, + { + "epoch": 1.53, + "grad_norm": 1.4058620929718018, + "learning_rate": 0.0001270887276886626, + "loss": 1.1064, + "step": 1588 + }, + { + "epoch": 1.53, + "grad_norm": 1.1498188972473145, + "learning_rate": 0.000126958193244242, + "loss": 0.9539, + "step": 1589 + }, + { + "epoch": 1.53, + "grad_norm": 0.8641445636749268, + "learning_rate": 0.0001268276566638295, + "loss": 0.7446, + "step": 1590 + }, + { + "epoch": 1.54, + "grad_norm": 0.9338680505752563, + "learning_rate": 0.00012669711808981413, + "loss": 0.6265, + "step": 1591 + }, + { + "epoch": 1.54, + "grad_norm": 1.3480240106582642, + "learning_rate": 0.00012656657766458698, + "loss": 1.1703, + "step": 1592 + }, + { + "epoch": 1.54, + "grad_norm": 1.0655312538146973, + "learning_rate": 0.00012643603553054116, + "loss": 1.2219, + "step": 1593 + }, + { + "epoch": 1.54, + "grad_norm": 1.325788974761963, + "learning_rate": 0.00012630549183007176, + "loss": 0.8702, + "step": 1594 + }, + { + "epoch": 1.54, + "grad_norm": 1.2315642833709717, + "learning_rate": 0.0001261749467055754, + "loss": 0.6193, + "step": 1595 + }, + { + "epoch": 1.54, + "grad_norm": 1.9755103588104248, + "learning_rate": 0.0001260444002994504, + "loss": 0.9851, + "step": 1596 + }, + { + "epoch": 1.54, + "grad_norm": 1.2712702751159668, + "learning_rate": 0.00012591385275409638, + "loss": 1.2029, + "step": 1597 + }, + { + "epoch": 1.54, + "grad_norm": 1.4285528659820557, + "learning_rate": 0.00012578330421191428, + "loss": 1.1168, + "step": 1598 + }, + { + "epoch": 1.54, + "grad_norm": 1.7838413715362549, + "learning_rate": 0.0001256527548153061, + "loss": 1.12, + "step": 1599 + }, + { + "epoch": 1.54, + "grad_norm": 0.9507400393486023, + "learning_rate": 0.00012552220470667472, + "loss": 0.6351, + "step": 1600 + }, + { + "epoch": 1.55, + "grad_norm": 0.8768706917762756, + "learning_rate": 0.0001253916540284239, + "loss": 0.5172, + "step": 1601 + }, + { + "epoch": 1.55, + "grad_norm": 1.1305594444274902, + "learning_rate": 0.0001252611029229579, + "loss": 0.7985, + "step": 1602 + }, + { + "epoch": 1.55, + "grad_norm": 1.086591124534607, + "learning_rate": 0.0001251305515326816, + "loss": 0.7133, + "step": 1603 + }, + { + "epoch": 1.55, + "grad_norm": 1.212422251701355, + "learning_rate": 0.000125, + "loss": 0.8705, + "step": 1604 + }, + { + "epoch": 1.55, + "grad_norm": 1.0888148546218872, + "learning_rate": 0.00012486944846731843, + "loss": 0.8609, + "step": 1605 + }, + { + "epoch": 1.55, + "grad_norm": 1.300306797027588, + "learning_rate": 0.0001247388970770421, + "loss": 0.8719, + "step": 1606 + }, + { + "epoch": 1.55, + "grad_norm": 1.213152289390564, + "learning_rate": 0.00012460834597157613, + "loss": 0.7651, + "step": 1607 + }, + { + "epoch": 1.55, + "grad_norm": 1.1947771310806274, + "learning_rate": 0.00012447779529332532, + "loss": 0.7965, + "step": 1608 + }, + { + "epoch": 1.55, + "grad_norm": 0.9396220445632935, + "learning_rate": 0.00012434724518469394, + "loss": 0.7223, + "step": 1609 + }, + { + "epoch": 1.55, + "grad_norm": 1.363637924194336, + "learning_rate": 0.00012421669578808575, + "loss": 0.8611, + "step": 1610 + }, + { + "epoch": 1.56, + "grad_norm": 1.1343002319335938, + "learning_rate": 0.00012408614724590365, + "loss": 0.7998, + "step": 1611 + }, + { + "epoch": 1.56, + "grad_norm": 1.739519476890564, + "learning_rate": 0.00012395559970054964, + "loss": 1.5514, + "step": 1612 + }, + { + "epoch": 1.56, + "grad_norm": 1.0173429250717163, + "learning_rate": 0.00012382505329442463, + "loss": 0.6983, + "step": 1613 + }, + { + "epoch": 1.56, + "grad_norm": 1.0030152797698975, + "learning_rate": 0.0001236945081699283, + "loss": 0.9811, + "step": 1614 + }, + { + "epoch": 1.56, + "grad_norm": 1.199392318725586, + "learning_rate": 0.00012356396446945887, + "loss": 1.2252, + "step": 1615 + }, + { + "epoch": 1.56, + "grad_norm": 0.9313424229621887, + "learning_rate": 0.000123433422335413, + "loss": 0.6646, + "step": 1616 + }, + { + "epoch": 1.56, + "grad_norm": 1.1196904182434082, + "learning_rate": 0.00012330288191018588, + "loss": 0.8581, + "step": 1617 + }, + { + "epoch": 1.56, + "grad_norm": 1.0649549961090088, + "learning_rate": 0.00012317234333617048, + "loss": 0.5758, + "step": 1618 + }, + { + "epoch": 1.56, + "grad_norm": 0.8960950970649719, + "learning_rate": 0.000123041806755758, + "loss": 0.6896, + "step": 1619 + }, + { + "epoch": 1.56, + "grad_norm": 1.4193485975265503, + "learning_rate": 0.00012291127231133737, + "loss": 1.1335, + "step": 1620 + }, + { + "epoch": 1.56, + "grad_norm": 1.1770983934402466, + "learning_rate": 0.0001227807401452952, + "loss": 0.9083, + "step": 1621 + }, + { + "epoch": 1.57, + "grad_norm": 1.001149296760559, + "learning_rate": 0.0001226502104000156, + "loss": 0.8143, + "step": 1622 + }, + { + "epoch": 1.57, + "grad_norm": 1.3807704448699951, + "learning_rate": 0.0001225196832178801, + "loss": 1.4339, + "step": 1623 + }, + { + "epoch": 1.57, + "grad_norm": 1.19136381149292, + "learning_rate": 0.00012238915874126737, + "loss": 0.8244, + "step": 1624 + }, + { + "epoch": 1.57, + "grad_norm": 1.25921630859375, + "learning_rate": 0.00012225863711255315, + "loss": 1.0063, + "step": 1625 + }, + { + "epoch": 1.57, + "grad_norm": 0.991645097732544, + "learning_rate": 0.0001221281184741101, + "loss": 0.7513, + "step": 1626 + }, + { + "epoch": 1.57, + "grad_norm": 1.4019756317138672, + "learning_rate": 0.00012199760296830754, + "loss": 1.0369, + "step": 1627 + }, + { + "epoch": 1.57, + "grad_norm": 1.0501247644424438, + "learning_rate": 0.00012186709073751149, + "loss": 0.9839, + "step": 1628 + }, + { + "epoch": 1.57, + "grad_norm": 0.8534177541732788, + "learning_rate": 0.0001217365819240843, + "loss": 0.5141, + "step": 1629 + }, + { + "epoch": 1.57, + "grad_norm": 1.4940394163131714, + "learning_rate": 0.00012160607667038463, + "loss": 1.0276, + "step": 1630 + }, + { + "epoch": 1.57, + "grad_norm": 1.013058066368103, + "learning_rate": 0.00012147557511876729, + "loss": 0.985, + "step": 1631 + }, + { + "epoch": 1.58, + "grad_norm": 1.1882599592208862, + "learning_rate": 0.00012134507741158296, + "loss": 1.0577, + "step": 1632 + }, + { + "epoch": 1.58, + "grad_norm": 1.571408748626709, + "learning_rate": 0.0001212145836911782, + "loss": 1.0311, + "step": 1633 + }, + { + "epoch": 1.58, + "grad_norm": 1.175654411315918, + "learning_rate": 0.00012108409409989524, + "loss": 0.7299, + "step": 1634 + }, + { + "epoch": 1.58, + "grad_norm": 1.0675251483917236, + "learning_rate": 0.00012095360878007173, + "loss": 0.7819, + "step": 1635 + }, + { + "epoch": 1.58, + "grad_norm": 1.181757926940918, + "learning_rate": 0.00012082312787404072, + "loss": 0.8871, + "step": 1636 + }, + { + "epoch": 1.58, + "grad_norm": 1.2116355895996094, + "learning_rate": 0.0001206926515241304, + "loss": 0.9785, + "step": 1637 + }, + { + "epoch": 1.58, + "grad_norm": 1.3060861825942993, + "learning_rate": 0.00012056217987266403, + "loss": 0.9901, + "step": 1638 + }, + { + "epoch": 1.58, + "grad_norm": 1.3970521688461304, + "learning_rate": 0.00012043171306195977, + "loss": 0.8015, + "step": 1639 + }, + { + "epoch": 1.58, + "grad_norm": 1.1799612045288086, + "learning_rate": 0.00012030125123433038, + "loss": 1.1066, + "step": 1640 + }, + { + "epoch": 1.58, + "grad_norm": 1.2180298566818237, + "learning_rate": 0.00012017079453208332, + "loss": 0.8753, + "step": 1641 + }, + { + "epoch": 1.58, + "grad_norm": 1.4600939750671387, + "learning_rate": 0.00012004034309752044, + "loss": 0.8927, + "step": 1642 + }, + { + "epoch": 1.59, + "grad_norm": 1.1986831426620483, + "learning_rate": 0.00011990989707293774, + "loss": 1.1189, + "step": 1643 + }, + { + "epoch": 1.59, + "grad_norm": 1.06000816822052, + "learning_rate": 0.00011977945660062543, + "loss": 0.7865, + "step": 1644 + }, + { + "epoch": 1.59, + "grad_norm": 1.4312909841537476, + "learning_rate": 0.00011964902182286759, + "loss": 1.0943, + "step": 1645 + }, + { + "epoch": 1.59, + "grad_norm": 1.35890531539917, + "learning_rate": 0.00011951859288194214, + "loss": 0.8092, + "step": 1646 + }, + { + "epoch": 1.59, + "grad_norm": 1.47928786277771, + "learning_rate": 0.0001193881699201206, + "loss": 1.1237, + "step": 1647 + }, + { + "epoch": 1.59, + "grad_norm": 1.0819437503814697, + "learning_rate": 0.00011925775307966799, + "loss": 0.6155, + "step": 1648 + }, + { + "epoch": 1.59, + "grad_norm": 1.1378880739212036, + "learning_rate": 0.00011912734250284264, + "loss": 0.9366, + "step": 1649 + }, + { + "epoch": 1.59, + "grad_norm": 1.1234723329544067, + "learning_rate": 0.00011899693833189607, + "loss": 0.7372, + "step": 1650 + }, + { + "epoch": 1.59, + "grad_norm": 1.1745365858078003, + "learning_rate": 0.00011886654070907275, + "loss": 0.626, + "step": 1651 + }, + { + "epoch": 1.59, + "grad_norm": 1.4009250402450562, + "learning_rate": 0.00011873614977661011, + "loss": 1.2027, + "step": 1652 + }, + { + "epoch": 1.6, + "grad_norm": 1.167280673980713, + "learning_rate": 0.00011860576567673814, + "loss": 0.8992, + "step": 1653 + }, + { + "epoch": 1.6, + "grad_norm": 1.1178878545761108, + "learning_rate": 0.00011847538855167954, + "loss": 0.6062, + "step": 1654 + }, + { + "epoch": 1.6, + "grad_norm": 1.3119362592697144, + "learning_rate": 0.00011834501854364929, + "loss": 1.0858, + "step": 1655 + }, + { + "epoch": 1.6, + "grad_norm": 1.1016749143600464, + "learning_rate": 0.00011821465579485464, + "loss": 0.8645, + "step": 1656 + }, + { + "epoch": 1.6, + "grad_norm": 1.315070390701294, + "learning_rate": 0.00011808430044749492, + "loss": 0.9478, + "step": 1657 + }, + { + "epoch": 1.6, + "grad_norm": 1.3839716911315918, + "learning_rate": 0.00011795395264376138, + "loss": 0.8104, + "step": 1658 + }, + { + "epoch": 1.6, + "grad_norm": 1.1264443397521973, + "learning_rate": 0.00011782361252583709, + "loss": 0.6846, + "step": 1659 + }, + { + "epoch": 1.6, + "grad_norm": 1.3495508432388306, + "learning_rate": 0.00011769328023589661, + "loss": 1.0685, + "step": 1660 + }, + { + "epoch": 1.6, + "grad_norm": 1.1462335586547852, + "learning_rate": 0.0001175629559161061, + "loss": 0.7131, + "step": 1661 + }, + { + "epoch": 1.6, + "grad_norm": 1.0215023756027222, + "learning_rate": 0.000117432639708623, + "loss": 0.8112, + "step": 1662 + }, + { + "epoch": 1.61, + "grad_norm": 1.1370227336883545, + "learning_rate": 0.0001173023317555958, + "loss": 0.7725, + "step": 1663 + }, + { + "epoch": 1.61, + "grad_norm": 1.4763069152832031, + "learning_rate": 0.00011717203219916411, + "loss": 0.9204, + "step": 1664 + }, + { + "epoch": 1.61, + "grad_norm": 1.1759175062179565, + "learning_rate": 0.00011704174118145832, + "loss": 1.1234, + "step": 1665 + }, + { + "epoch": 1.61, + "grad_norm": 1.1167455911636353, + "learning_rate": 0.00011691145884459956, + "loss": 0.7173, + "step": 1666 + }, + { + "epoch": 1.61, + "grad_norm": 1.054011583328247, + "learning_rate": 0.00011678118533069937, + "loss": 0.6072, + "step": 1667 + }, + { + "epoch": 1.61, + "grad_norm": 1.2380262613296509, + "learning_rate": 0.0001166509207818598, + "loss": 0.8174, + "step": 1668 + }, + { + "epoch": 1.61, + "grad_norm": 1.1825151443481445, + "learning_rate": 0.00011652066534017304, + "loss": 0.5941, + "step": 1669 + }, + { + "epoch": 1.61, + "grad_norm": 1.345412015914917, + "learning_rate": 0.00011639041914772138, + "loss": 0.8084, + "step": 1670 + }, + { + "epoch": 1.61, + "grad_norm": 1.1241743564605713, + "learning_rate": 0.00011626018234657698, + "loss": 0.8581, + "step": 1671 + }, + { + "epoch": 1.61, + "grad_norm": 1.2227263450622559, + "learning_rate": 0.00011612995507880181, + "loss": 0.9555, + "step": 1672 + }, + { + "epoch": 1.61, + "grad_norm": 1.9729316234588623, + "learning_rate": 0.00011599973748644745, + "loss": 1.0784, + "step": 1673 + }, + { + "epoch": 1.62, + "grad_norm": 1.5396397113800049, + "learning_rate": 0.00011586952971155484, + "loss": 1.3129, + "step": 1674 + }, + { + "epoch": 1.62, + "grad_norm": 1.2963906526565552, + "learning_rate": 0.00011573933189615429, + "loss": 0.9409, + "step": 1675 + }, + { + "epoch": 1.62, + "grad_norm": 1.539739966392517, + "learning_rate": 0.00011560914418226522, + "loss": 1.3643, + "step": 1676 + }, + { + "epoch": 1.62, + "grad_norm": 1.143496036529541, + "learning_rate": 0.00011547896671189602, + "loss": 0.8545, + "step": 1677 + }, + { + "epoch": 1.62, + "grad_norm": 1.3482871055603027, + "learning_rate": 0.00011534879962704396, + "loss": 0.7935, + "step": 1678 + }, + { + "epoch": 1.62, + "grad_norm": 1.1831111907958984, + "learning_rate": 0.00011521864306969489, + "loss": 0.8544, + "step": 1679 + }, + { + "epoch": 1.62, + "grad_norm": 1.034184217453003, + "learning_rate": 0.00011508849718182328, + "loss": 0.831, + "step": 1680 + }, + { + "epoch": 1.62, + "grad_norm": 1.2171016931533813, + "learning_rate": 0.00011495836210539185, + "loss": 0.9327, + "step": 1681 + }, + { + "epoch": 1.62, + "grad_norm": 1.1705622673034668, + "learning_rate": 0.0001148282379823516, + "loss": 0.7978, + "step": 1682 + }, + { + "epoch": 1.62, + "grad_norm": 1.2037105560302734, + "learning_rate": 0.00011469812495464164, + "loss": 0.9249, + "step": 1683 + }, + { + "epoch": 1.63, + "grad_norm": 1.4301562309265137, + "learning_rate": 0.00011456802316418879, + "loss": 1.2376, + "step": 1684 + }, + { + "epoch": 1.63, + "grad_norm": 0.8832158446311951, + "learning_rate": 0.00011443793275290783, + "loss": 0.5311, + "step": 1685 + }, + { + "epoch": 1.63, + "grad_norm": 1.06523597240448, + "learning_rate": 0.00011430785386270095, + "loss": 0.74, + "step": 1686 + }, + { + "epoch": 1.63, + "grad_norm": 1.0592454671859741, + "learning_rate": 0.00011417778663545786, + "loss": 0.9425, + "step": 1687 + }, + { + "epoch": 1.63, + "grad_norm": 1.1511181592941284, + "learning_rate": 0.00011404773121305557, + "loss": 0.991, + "step": 1688 + }, + { + "epoch": 1.63, + "grad_norm": 1.0210840702056885, + "learning_rate": 0.0001139176877373582, + "loss": 0.7791, + "step": 1689 + }, + { + "epoch": 1.63, + "grad_norm": 1.1962254047393799, + "learning_rate": 0.00011378765635021676, + "loss": 1.0616, + "step": 1690 + }, + { + "epoch": 1.63, + "grad_norm": 1.127206563949585, + "learning_rate": 0.00011365763719346918, + "loss": 0.9061, + "step": 1691 + }, + { + "epoch": 1.63, + "grad_norm": 0.847680926322937, + "learning_rate": 0.00011352763040893996, + "loss": 0.5002, + "step": 1692 + }, + { + "epoch": 1.63, + "grad_norm": 1.2024614810943604, + "learning_rate": 0.00011339763613844014, + "loss": 0.7685, + "step": 1693 + }, + { + "epoch": 1.64, + "grad_norm": 1.1078310012817383, + "learning_rate": 0.00011326765452376717, + "loss": 0.8315, + "step": 1694 + }, + { + "epoch": 1.64, + "grad_norm": 1.2252180576324463, + "learning_rate": 0.0001131376857067046, + "loss": 0.8286, + "step": 1695 + }, + { + "epoch": 1.64, + "grad_norm": 0.832125186920166, + "learning_rate": 0.00011300772982902208, + "loss": 0.4677, + "step": 1696 + }, + { + "epoch": 1.64, + "grad_norm": 1.0959038734436035, + "learning_rate": 0.0001128777870324751, + "loss": 0.8417, + "step": 1697 + }, + { + "epoch": 1.64, + "grad_norm": 1.1298578977584839, + "learning_rate": 0.00011274785745880492, + "loss": 0.927, + "step": 1698 + }, + { + "epoch": 1.64, + "grad_norm": 1.3126767873764038, + "learning_rate": 0.0001126179412497384, + "loss": 0.9112, + "step": 1699 + }, + { + "epoch": 1.64, + "grad_norm": 1.009222388267517, + "learning_rate": 0.0001124880385469877, + "loss": 1.0107, + "step": 1700 + }, + { + "epoch": 1.64, + "grad_norm": 1.049607515335083, + "learning_rate": 0.00011235814949225044, + "loss": 0.7388, + "step": 1701 + }, + { + "epoch": 1.64, + "grad_norm": 1.1917518377304077, + "learning_rate": 0.00011222827422720916, + "loss": 0.8073, + "step": 1702 + }, + { + "epoch": 1.64, + "grad_norm": 1.2827389240264893, + "learning_rate": 0.00011209841289353151, + "loss": 0.7094, + "step": 1703 + }, + { + "epoch": 1.64, + "grad_norm": 0.8789745569229126, + "learning_rate": 0.0001119685656328698, + "loss": 0.5815, + "step": 1704 + }, + { + "epoch": 1.65, + "grad_norm": 0.9101778268814087, + "learning_rate": 0.00011183873258686112, + "loss": 0.5959, + "step": 1705 + }, + { + "epoch": 1.65, + "grad_norm": 0.9749009013175964, + "learning_rate": 0.00011170891389712704, + "loss": 0.636, + "step": 1706 + }, + { + "epoch": 1.65, + "grad_norm": 1.2081713676452637, + "learning_rate": 0.00011157910970527335, + "loss": 0.9361, + "step": 1707 + }, + { + "epoch": 1.65, + "grad_norm": 1.3563265800476074, + "learning_rate": 0.00011144932015289017, + "loss": 0.9394, + "step": 1708 + }, + { + "epoch": 1.65, + "grad_norm": 1.036575436592102, + "learning_rate": 0.00011131954538155153, + "loss": 0.8256, + "step": 1709 + }, + { + "epoch": 1.65, + "grad_norm": 1.026749610900879, + "learning_rate": 0.00011118978553281541, + "loss": 0.7384, + "step": 1710 + }, + { + "epoch": 1.65, + "grad_norm": 0.8870358467102051, + "learning_rate": 0.00011106004074822356, + "loss": 0.5476, + "step": 1711 + }, + { + "epoch": 1.65, + "grad_norm": 0.9484975337982178, + "learning_rate": 0.00011093031116930122, + "loss": 0.655, + "step": 1712 + }, + { + "epoch": 1.65, + "grad_norm": 1.34621000289917, + "learning_rate": 0.000110800596937557, + "loss": 1.0488, + "step": 1713 + }, + { + "epoch": 1.65, + "grad_norm": 1.0034739971160889, + "learning_rate": 0.0001106708981944829, + "loss": 0.6783, + "step": 1714 + }, + { + "epoch": 1.66, + "grad_norm": 1.5905283689498901, + "learning_rate": 0.00011054121508155387, + "loss": 1.4052, + "step": 1715 + }, + { + "epoch": 1.66, + "grad_norm": 1.4879212379455566, + "learning_rate": 0.00011041154774022798, + "loss": 1.1154, + "step": 1716 + }, + { + "epoch": 1.66, + "grad_norm": 1.1741530895233154, + "learning_rate": 0.00011028189631194599, + "loss": 0.8851, + "step": 1717 + }, + { + "epoch": 1.66, + "grad_norm": 1.1095561981201172, + "learning_rate": 0.00011015226093813129, + "loss": 0.8269, + "step": 1718 + }, + { + "epoch": 1.66, + "grad_norm": 1.4847856760025024, + "learning_rate": 0.00011002264176018983, + "loss": 1.2673, + "step": 1719 + }, + { + "epoch": 1.66, + "grad_norm": 1.4990216493606567, + "learning_rate": 0.00010989303891950983, + "loss": 1.5933, + "step": 1720 + }, + { + "epoch": 1.66, + "grad_norm": 1.3621212244033813, + "learning_rate": 0.00010976345255746172, + "loss": 0.7648, + "step": 1721 + }, + { + "epoch": 1.66, + "grad_norm": 1.175217866897583, + "learning_rate": 0.00010963388281539797, + "loss": 0.9908, + "step": 1722 + }, + { + "epoch": 1.66, + "grad_norm": 1.1372090578079224, + "learning_rate": 0.00010950432983465288, + "loss": 0.887, + "step": 1723 + }, + { + "epoch": 1.66, + "grad_norm": 1.3557263612747192, + "learning_rate": 0.0001093747937565425, + "loss": 1.0648, + "step": 1724 + }, + { + "epoch": 1.67, + "grad_norm": 1.1040289402008057, + "learning_rate": 0.00010924527472236439, + "loss": 0.5231, + "step": 1725 + }, + { + "epoch": 1.67, + "grad_norm": 1.4421457052230835, + "learning_rate": 0.00010911577287339762, + "loss": 1.5833, + "step": 1726 + }, + { + "epoch": 1.67, + "grad_norm": 1.1788628101348877, + "learning_rate": 0.00010898628835090243, + "loss": 0.8498, + "step": 1727 + }, + { + "epoch": 1.67, + "grad_norm": 1.0729405879974365, + "learning_rate": 0.00010885682129612016, + "loss": 0.857, + "step": 1728 + }, + { + "epoch": 1.67, + "grad_norm": 0.8381689786911011, + "learning_rate": 0.00010872737185027318, + "loss": 0.5624, + "step": 1729 + }, + { + "epoch": 1.67, + "grad_norm": 1.0547059774398804, + "learning_rate": 0.00010859794015456449, + "loss": 0.9163, + "step": 1730 + }, + { + "epoch": 1.67, + "grad_norm": 1.547420859336853, + "learning_rate": 0.00010846852635017794, + "loss": 1.1406, + "step": 1731 + }, + { + "epoch": 1.67, + "grad_norm": 1.2853301763534546, + "learning_rate": 0.00010833913057827769, + "loss": 0.9673, + "step": 1732 + }, + { + "epoch": 1.67, + "grad_norm": 1.172054648399353, + "learning_rate": 0.00010820975298000828, + "loss": 0.8101, + "step": 1733 + }, + { + "epoch": 1.67, + "grad_norm": 1.3405346870422363, + "learning_rate": 0.00010808039369649445, + "loss": 0.8742, + "step": 1734 + }, + { + "epoch": 1.67, + "grad_norm": 0.9736365675926208, + "learning_rate": 0.00010795105286884104, + "loss": 0.6063, + "step": 1735 + }, + { + "epoch": 1.68, + "grad_norm": 1.025546908378601, + "learning_rate": 0.00010782173063813258, + "loss": 0.7712, + "step": 1736 + }, + { + "epoch": 1.68, + "grad_norm": 1.3535629510879517, + "learning_rate": 0.00010769242714543343, + "loss": 1.039, + "step": 1737 + }, + { + "epoch": 1.68, + "grad_norm": 1.2922720909118652, + "learning_rate": 0.0001075631425317875, + "loss": 1.0313, + "step": 1738 + }, + { + "epoch": 1.68, + "grad_norm": 1.1379432678222656, + "learning_rate": 0.00010743387693821807, + "loss": 1.0168, + "step": 1739 + }, + { + "epoch": 1.68, + "grad_norm": 1.063762903213501, + "learning_rate": 0.00010730463050572773, + "loss": 0.6007, + "step": 1740 + }, + { + "epoch": 1.68, + "grad_norm": 1.3603583574295044, + "learning_rate": 0.00010717540337529809, + "loss": 0.9233, + "step": 1741 + }, + { + "epoch": 1.68, + "grad_norm": 1.1882977485656738, + "learning_rate": 0.00010704619568788979, + "loss": 0.8773, + "step": 1742 + }, + { + "epoch": 1.68, + "grad_norm": 0.9956715106964111, + "learning_rate": 0.0001069170075844422, + "loss": 0.6026, + "step": 1743 + }, + { + "epoch": 1.68, + "grad_norm": 1.4287210702896118, + "learning_rate": 0.00010678783920587335, + "loss": 0.8701, + "step": 1744 + }, + { + "epoch": 1.68, + "grad_norm": 1.173557996749878, + "learning_rate": 0.00010665869069307979, + "loss": 0.7308, + "step": 1745 + }, + { + "epoch": 1.69, + "grad_norm": 1.12995183467865, + "learning_rate": 0.00010652956218693632, + "loss": 0.9228, + "step": 1746 + }, + { + "epoch": 1.69, + "grad_norm": 1.1359423398971558, + "learning_rate": 0.000106400453828296, + "loss": 0.9147, + "step": 1747 + }, + { + "epoch": 1.69, + "grad_norm": 1.2205817699432373, + "learning_rate": 0.00010627136575798981, + "loss": 0.9102, + "step": 1748 + }, + { + "epoch": 1.69, + "grad_norm": 1.2005549669265747, + "learning_rate": 0.00010614229811682674, + "loss": 0.9014, + "step": 1749 + }, + { + "epoch": 1.69, + "grad_norm": 1.3272981643676758, + "learning_rate": 0.0001060132510455934, + "loss": 0.992, + "step": 1750 + }, + { + "epoch": 1.69, + "grad_norm": 1.143885612487793, + "learning_rate": 0.00010588422468505396, + "loss": 0.7217, + "step": 1751 + }, + { + "epoch": 1.69, + "grad_norm": 1.626641869544983, + "learning_rate": 0.00010575521917595005, + "loss": 1.2493, + "step": 1752 + }, + { + "epoch": 1.69, + "grad_norm": 1.1745210886001587, + "learning_rate": 0.00010562623465900048, + "loss": 1.0751, + "step": 1753 + }, + { + "epoch": 1.69, + "grad_norm": 1.2065612077713013, + "learning_rate": 0.00010549727127490125, + "loss": 0.7826, + "step": 1754 + }, + { + "epoch": 1.69, + "grad_norm": 1.4140163660049438, + "learning_rate": 0.00010536832916432527, + "loss": 0.9993, + "step": 1755 + }, + { + "epoch": 1.69, + "grad_norm": 1.2656209468841553, + "learning_rate": 0.00010523940846792222, + "loss": 0.7522, + "step": 1756 + }, + { + "epoch": 1.7, + "grad_norm": 1.4236201047897339, + "learning_rate": 0.00010511050932631841, + "loss": 1.0854, + "step": 1757 + }, + { + "epoch": 1.7, + "grad_norm": 0.815237820148468, + "learning_rate": 0.0001049816318801168, + "loss": 0.3725, + "step": 1758 + }, + { + "epoch": 1.7, + "grad_norm": 1.0349665880203247, + "learning_rate": 0.0001048527762698964, + "loss": 0.8778, + "step": 1759 + }, + { + "epoch": 1.7, + "grad_norm": 1.2187834978103638, + "learning_rate": 0.00010472394263621266, + "loss": 0.9184, + "step": 1760 + }, + { + "epoch": 1.7, + "grad_norm": 1.215148687362671, + "learning_rate": 0.00010459513111959695, + "loss": 1.087, + "step": 1761 + }, + { + "epoch": 1.7, + "grad_norm": 1.1747918128967285, + "learning_rate": 0.00010446634186055646, + "loss": 0.7544, + "step": 1762 + }, + { + "epoch": 1.7, + "grad_norm": 1.4393612146377563, + "learning_rate": 0.00010433757499957423, + "loss": 1.2608, + "step": 1763 + }, + { + "epoch": 1.7, + "grad_norm": 1.0888879299163818, + "learning_rate": 0.00010420883067710875, + "loss": 0.8795, + "step": 1764 + }, + { + "epoch": 1.7, + "grad_norm": 1.2865713834762573, + "learning_rate": 0.00010408010903359398, + "loss": 0.9863, + "step": 1765 + }, + { + "epoch": 1.7, + "grad_norm": 1.033387303352356, + "learning_rate": 0.00010395141020943918, + "loss": 0.6285, + "step": 1766 + }, + { + "epoch": 1.71, + "grad_norm": 1.0320261716842651, + "learning_rate": 0.00010382273434502862, + "loss": 0.6868, + "step": 1767 + }, + { + "epoch": 1.71, + "grad_norm": 1.0416369438171387, + "learning_rate": 0.00010369408158072164, + "loss": 0.8719, + "step": 1768 + }, + { + "epoch": 1.71, + "grad_norm": 1.0624701976776123, + "learning_rate": 0.00010356545205685227, + "loss": 0.8254, + "step": 1769 + }, + { + "epoch": 1.71, + "grad_norm": 1.2965495586395264, + "learning_rate": 0.00010343684591372926, + "loss": 0.9601, + "step": 1770 + }, + { + "epoch": 1.71, + "grad_norm": 1.0764302015304565, + "learning_rate": 0.00010330826329163587, + "loss": 0.5743, + "step": 1771 + }, + { + "epoch": 1.71, + "grad_norm": 1.0314794778823853, + "learning_rate": 0.00010317970433082963, + "loss": 0.5785, + "step": 1772 + }, + { + "epoch": 1.71, + "grad_norm": 1.2629597187042236, + "learning_rate": 0.00010305116917154233, + "loss": 0.5249, + "step": 1773 + }, + { + "epoch": 1.71, + "grad_norm": 1.4034559726715088, + "learning_rate": 0.00010292265795397974, + "loss": 0.8213, + "step": 1774 + }, + { + "epoch": 1.71, + "grad_norm": 1.548949122428894, + "learning_rate": 0.00010279417081832161, + "loss": 1.2624, + "step": 1775 + }, + { + "epoch": 1.71, + "grad_norm": 1.2316782474517822, + "learning_rate": 0.00010266570790472129, + "loss": 0.8921, + "step": 1776 + }, + { + "epoch": 1.72, + "grad_norm": 1.1409668922424316, + "learning_rate": 0.00010253726935330578, + "loss": 0.8301, + "step": 1777 + }, + { + "epoch": 1.72, + "grad_norm": 1.2139928340911865, + "learning_rate": 0.00010240885530417557, + "loss": 0.842, + "step": 1778 + }, + { + "epoch": 1.72, + "grad_norm": 1.1755350828170776, + "learning_rate": 0.00010228046589740425, + "loss": 1.2071, + "step": 1779 + }, + { + "epoch": 1.72, + "grad_norm": 1.0287277698516846, + "learning_rate": 0.00010215210127303874, + "loss": 0.6262, + "step": 1780 + }, + { + "epoch": 1.72, + "grad_norm": 1.7859232425689697, + "learning_rate": 0.00010202376157109869, + "loss": 1.2228, + "step": 1781 + }, + { + "epoch": 1.72, + "grad_norm": 1.0035349130630493, + "learning_rate": 0.00010189544693157684, + "loss": 0.7594, + "step": 1782 + }, + { + "epoch": 1.72, + "grad_norm": 1.4260820150375366, + "learning_rate": 0.00010176715749443838, + "loss": 0.8775, + "step": 1783 + }, + { + "epoch": 1.72, + "grad_norm": 1.3202534914016724, + "learning_rate": 0.00010163889339962109, + "loss": 0.9169, + "step": 1784 + }, + { + "epoch": 1.72, + "grad_norm": 1.4712368249893188, + "learning_rate": 0.00010151065478703508, + "loss": 1.197, + "step": 1785 + }, + { + "epoch": 1.72, + "grad_norm": 1.1128965616226196, + "learning_rate": 0.00010138244179656271, + "loss": 0.7421, + "step": 1786 + }, + { + "epoch": 1.72, + "grad_norm": 0.981045663356781, + "learning_rate": 0.00010125425456805831, + "loss": 0.5326, + "step": 1787 + }, + { + "epoch": 1.73, + "grad_norm": 1.068075180053711, + "learning_rate": 0.00010112609324134818, + "loss": 0.8425, + "step": 1788 + }, + { + "epoch": 1.73, + "grad_norm": 0.9965289235115051, + "learning_rate": 0.00010099795795623037, + "loss": 0.6533, + "step": 1789 + }, + { + "epoch": 1.73, + "grad_norm": 1.4239498376846313, + "learning_rate": 0.00010086984885247446, + "loss": 0.9943, + "step": 1790 + }, + { + "epoch": 1.73, + "grad_norm": 1.2689576148986816, + "learning_rate": 0.00010074176606982158, + "loss": 0.9002, + "step": 1791 + }, + { + "epoch": 1.73, + "grad_norm": 0.8137116432189941, + "learning_rate": 0.00010061370974798398, + "loss": 0.6084, + "step": 1792 + }, + { + "epoch": 1.73, + "grad_norm": 1.1245088577270508, + "learning_rate": 0.0001004856800266452, + "loss": 0.6368, + "step": 1793 + }, + { + "epoch": 1.73, + "grad_norm": 1.4730548858642578, + "learning_rate": 0.00010035767704545972, + "loss": 1.2741, + "step": 1794 + }, + { + "epoch": 1.73, + "grad_norm": 1.6720154285430908, + "learning_rate": 0.00010022970094405282, + "loss": 1.2503, + "step": 1795 + }, + { + "epoch": 1.73, + "grad_norm": 1.2065510749816895, + "learning_rate": 0.00010010175186202051, + "loss": 0.8932, + "step": 1796 + }, + { + "epoch": 1.73, + "grad_norm": 1.3618618249893188, + "learning_rate": 9.997382993892925e-05, + "loss": 0.8034, + "step": 1797 + }, + { + "epoch": 1.74, + "grad_norm": 0.9418081641197205, + "learning_rate": 9.984593531431596e-05, + "loss": 0.6693, + "step": 1798 + }, + { + "epoch": 1.74, + "grad_norm": 1.377590298652649, + "learning_rate": 9.971806812768777e-05, + "loss": 1.045, + "step": 1799 + }, + { + "epoch": 1.74, + "grad_norm": 1.1018116474151611, + "learning_rate": 9.959022851852181e-05, + "loss": 0.9014, + "step": 1800 + }, + { + "epoch": 1.74, + "grad_norm": 1.0155657529830933, + "learning_rate": 9.94624166262652e-05, + "loss": 0.8305, + "step": 1801 + }, + { + "epoch": 1.74, + "grad_norm": 0.9664143323898315, + "learning_rate": 9.93346325903348e-05, + "loss": 0.9144, + "step": 1802 + }, + { + "epoch": 1.74, + "grad_norm": 1.164772868156433, + "learning_rate": 9.92068765501171e-05, + "loss": 0.6573, + "step": 1803 + }, + { + "epoch": 1.74, + "grad_norm": 1.186354160308838, + "learning_rate": 9.907914864496795e-05, + "loss": 0.795, + "step": 1804 + }, + { + "epoch": 1.74, + "grad_norm": 2.0216922760009766, + "learning_rate": 9.895144901421276e-05, + "loss": 0.9515, + "step": 1805 + }, + { + "epoch": 1.74, + "grad_norm": 1.3452105522155762, + "learning_rate": 9.882377779714581e-05, + "loss": 0.9363, + "step": 1806 + }, + { + "epoch": 1.74, + "grad_norm": 1.1607165336608887, + "learning_rate": 9.869613513303061e-05, + "loss": 0.8842, + "step": 1807 + }, + { + "epoch": 1.75, + "grad_norm": 1.6268951892852783, + "learning_rate": 9.856852116109934e-05, + "loss": 0.9655, + "step": 1808 + }, + { + "epoch": 1.75, + "grad_norm": 1.1395448446273804, + "learning_rate": 9.844093602055302e-05, + "loss": 0.7502, + "step": 1809 + }, + { + "epoch": 1.75, + "grad_norm": 0.886443018913269, + "learning_rate": 9.831337985056118e-05, + "loss": 0.5854, + "step": 1810 + }, + { + "epoch": 1.75, + "grad_norm": 1.210634469985962, + "learning_rate": 9.818585279026172e-05, + "loss": 0.5795, + "step": 1811 + }, + { + "epoch": 1.75, + "grad_norm": 1.1434961557388306, + "learning_rate": 9.805835497876081e-05, + "loss": 0.8192, + "step": 1812 + }, + { + "epoch": 1.75, + "grad_norm": 1.2203586101531982, + "learning_rate": 9.793088655513271e-05, + "loss": 0.9838, + "step": 1813 + }, + { + "epoch": 1.75, + "grad_norm": 1.5141627788543701, + "learning_rate": 9.780344765841966e-05, + "loss": 0.9707, + "step": 1814 + }, + { + "epoch": 1.75, + "grad_norm": 1.3851544857025146, + "learning_rate": 9.767603842763163e-05, + "loss": 1.0544, + "step": 1815 + }, + { + "epoch": 1.75, + "grad_norm": 1.5114184617996216, + "learning_rate": 9.754865900174626e-05, + "loss": 1.0471, + "step": 1816 + }, + { + "epoch": 1.75, + "grad_norm": 1.1894009113311768, + "learning_rate": 9.742130951970872e-05, + "loss": 0.8344, + "step": 1817 + }, + { + "epoch": 1.75, + "grad_norm": 1.153277039527893, + "learning_rate": 9.729399012043142e-05, + "loss": 0.8441, + "step": 1818 + }, + { + "epoch": 1.76, + "grad_norm": 1.0140668153762817, + "learning_rate": 9.716670094279408e-05, + "loss": 0.5894, + "step": 1819 + }, + { + "epoch": 1.76, + "grad_norm": 1.1728570461273193, + "learning_rate": 9.703944212564331e-05, + "loss": 0.9653, + "step": 1820 + }, + { + "epoch": 1.76, + "grad_norm": 1.7548686265945435, + "learning_rate": 9.69122138077927e-05, + "loss": 1.028, + "step": 1821 + }, + { + "epoch": 1.76, + "grad_norm": 1.3905855417251587, + "learning_rate": 9.678501612802263e-05, + "loss": 1.0491, + "step": 1822 + }, + { + "epoch": 1.76, + "grad_norm": 1.0206429958343506, + "learning_rate": 9.665784922507986e-05, + "loss": 0.7978, + "step": 1823 + }, + { + "epoch": 1.76, + "grad_norm": 1.292028546333313, + "learning_rate": 9.653071323767782e-05, + "loss": 1.0386, + "step": 1824 + }, + { + "epoch": 1.76, + "grad_norm": 1.0786666870117188, + "learning_rate": 9.6403608304496e-05, + "loss": 0.9896, + "step": 1825 + }, + { + "epoch": 1.76, + "grad_norm": 1.233822226524353, + "learning_rate": 9.627653456418015e-05, + "loss": 0.8662, + "step": 1826 + }, + { + "epoch": 1.76, + "grad_norm": 1.1821707487106323, + "learning_rate": 9.6149492155342e-05, + "loss": 0.9316, + "step": 1827 + }, + { + "epoch": 1.76, + "grad_norm": 1.213242769241333, + "learning_rate": 9.602248121655896e-05, + "loss": 0.8224, + "step": 1828 + }, + { + "epoch": 1.77, + "grad_norm": 1.3918932676315308, + "learning_rate": 9.589550188637431e-05, + "loss": 1.3013, + "step": 1829 + }, + { + "epoch": 1.77, + "grad_norm": 0.8927656412124634, + "learning_rate": 9.576855430329677e-05, + "loss": 0.4922, + "step": 1830 + }, + { + "epoch": 1.77, + "grad_norm": 1.371404767036438, + "learning_rate": 9.564163860580034e-05, + "loss": 1.4036, + "step": 1831 + }, + { + "epoch": 1.77, + "grad_norm": 1.2355639934539795, + "learning_rate": 9.551475493232434e-05, + "loss": 1.0875, + "step": 1832 + }, + { + "epoch": 1.77, + "grad_norm": 1.2743234634399414, + "learning_rate": 9.538790342127317e-05, + "loss": 0.8392, + "step": 1833 + }, + { + "epoch": 1.77, + "grad_norm": 1.3681198358535767, + "learning_rate": 9.526108421101608e-05, + "loss": 1.1149, + "step": 1834 + }, + { + "epoch": 1.77, + "grad_norm": 0.9238698482513428, + "learning_rate": 9.513429743988715e-05, + "loss": 0.7203, + "step": 1835 + }, + { + "epoch": 1.77, + "grad_norm": 1.1493499279022217, + "learning_rate": 9.5007543246185e-05, + "loss": 0.6815, + "step": 1836 + }, + { + "epoch": 1.77, + "grad_norm": 1.1091593503952026, + "learning_rate": 9.488082176817276e-05, + "loss": 1.0553, + "step": 1837 + }, + { + "epoch": 1.77, + "grad_norm": 1.177230954170227, + "learning_rate": 9.475413314407791e-05, + "loss": 1.1332, + "step": 1838 + }, + { + "epoch": 1.78, + "grad_norm": 1.0102418661117554, + "learning_rate": 9.462747751209203e-05, + "loss": 0.6488, + "step": 1839 + }, + { + "epoch": 1.78, + "grad_norm": 0.9865624308586121, + "learning_rate": 9.450085501037074e-05, + "loss": 0.7, + "step": 1840 + }, + { + "epoch": 1.78, + "grad_norm": 1.4693363904953003, + "learning_rate": 9.437426577703352e-05, + "loss": 0.8631, + "step": 1841 + }, + { + "epoch": 1.78, + "grad_norm": 1.2326639890670776, + "learning_rate": 9.424770995016355e-05, + "loss": 0.9311, + "step": 1842 + }, + { + "epoch": 1.78, + "grad_norm": 1.074099063873291, + "learning_rate": 9.412118766780762e-05, + "loss": 0.9159, + "step": 1843 + }, + { + "epoch": 1.78, + "grad_norm": 1.105237603187561, + "learning_rate": 9.399469906797584e-05, + "loss": 0.9116, + "step": 1844 + }, + { + "epoch": 1.78, + "grad_norm": 0.9910746812820435, + "learning_rate": 9.386824428864169e-05, + "loss": 0.6469, + "step": 1845 + }, + { + "epoch": 1.78, + "grad_norm": 1.4138925075531006, + "learning_rate": 9.37418234677416e-05, + "loss": 1.0355, + "step": 1846 + }, + { + "epoch": 1.78, + "grad_norm": 1.0311132669448853, + "learning_rate": 9.361543674317517e-05, + "loss": 0.6162, + "step": 1847 + }, + { + "epoch": 1.78, + "grad_norm": 1.1188533306121826, + "learning_rate": 9.348908425280462e-05, + "loss": 1.0292, + "step": 1848 + }, + { + "epoch": 1.78, + "grad_norm": 0.9886927604675293, + "learning_rate": 9.33627661344549e-05, + "loss": 0.5008, + "step": 1849 + }, + { + "epoch": 1.79, + "grad_norm": 1.3715204000473022, + "learning_rate": 9.323648252591351e-05, + "loss": 0.8163, + "step": 1850 + }, + { + "epoch": 1.79, + "grad_norm": 0.8927873373031616, + "learning_rate": 9.311023356493021e-05, + "loss": 0.4377, + "step": 1851 + }, + { + "epoch": 1.79, + "grad_norm": 1.2950105667114258, + "learning_rate": 9.298401938921708e-05, + "loss": 0.8877, + "step": 1852 + }, + { + "epoch": 1.79, + "grad_norm": 1.3683269023895264, + "learning_rate": 9.285784013644817e-05, + "loss": 1.2326, + "step": 1853 + }, + { + "epoch": 1.79, + "grad_norm": 1.0928535461425781, + "learning_rate": 9.273169594425951e-05, + "loss": 0.7462, + "step": 1854 + }, + { + "epoch": 1.79, + "grad_norm": 1.2238028049468994, + "learning_rate": 9.260558695024877e-05, + "loss": 0.7335, + "step": 1855 + }, + { + "epoch": 1.79, + "grad_norm": 1.110521674156189, + "learning_rate": 9.247951329197532e-05, + "loss": 0.7661, + "step": 1856 + }, + { + "epoch": 1.79, + "grad_norm": 1.1721173524856567, + "learning_rate": 9.235347510695997e-05, + "loss": 0.9289, + "step": 1857 + }, + { + "epoch": 1.79, + "grad_norm": 1.3499724864959717, + "learning_rate": 9.222747253268485e-05, + "loss": 1.1241, + "step": 1858 + }, + { + "epoch": 1.79, + "grad_norm": 1.076082706451416, + "learning_rate": 9.210150570659317e-05, + "loss": 0.7515, + "step": 1859 + }, + { + "epoch": 1.8, + "grad_norm": 1.311646819114685, + "learning_rate": 9.197557476608926e-05, + "loss": 0.9019, + "step": 1860 + }, + { + "epoch": 1.8, + "grad_norm": 1.0273613929748535, + "learning_rate": 9.18496798485382e-05, + "loss": 0.8496, + "step": 1861 + }, + { + "epoch": 1.8, + "grad_norm": 1.1790790557861328, + "learning_rate": 9.172382109126584e-05, + "loss": 0.8454, + "step": 1862 + }, + { + "epoch": 1.8, + "grad_norm": 1.3008348941802979, + "learning_rate": 9.159799863155857e-05, + "loss": 1.0305, + "step": 1863 + }, + { + "epoch": 1.8, + "grad_norm": 1.2865149974822998, + "learning_rate": 9.147221260666317e-05, + "loss": 0.7998, + "step": 1864 + }, + { + "epoch": 1.8, + "grad_norm": 1.3089659214019775, + "learning_rate": 9.134646315378673e-05, + "loss": 0.7147, + "step": 1865 + }, + { + "epoch": 1.8, + "grad_norm": 1.3149784803390503, + "learning_rate": 9.122075041009636e-05, + "loss": 0.8714, + "step": 1866 + }, + { + "epoch": 1.8, + "eval_loss": 1.2798618078231812, + "eval_runtime": 28.3827, + "eval_samples_per_second": 2.995, + "eval_steps_per_second": 1.515, + "step": 1866 + }, + { + "epoch": 1.8, + "grad_norm": 1.132293462753296, + "learning_rate": 9.109507451271922e-05, + "loss": 0.7688, + "step": 1867 + }, + { + "epoch": 1.8, + "grad_norm": 1.052720308303833, + "learning_rate": 9.096943559874222e-05, + "loss": 0.8002, + "step": 1868 + }, + { + "epoch": 1.8, + "grad_norm": 1.0711005926132202, + "learning_rate": 9.084383380521194e-05, + "loss": 0.6079, + "step": 1869 + }, + { + "epoch": 1.81, + "grad_norm": 1.3810640573501587, + "learning_rate": 9.071826926913446e-05, + "loss": 1.1552, + "step": 1870 + }, + { + "epoch": 1.81, + "grad_norm": 1.2143968343734741, + "learning_rate": 9.059274212747525e-05, + "loss": 0.8241, + "step": 1871 + }, + { + "epoch": 1.81, + "grad_norm": 1.5594843626022339, + "learning_rate": 9.046725251715898e-05, + "loss": 1.6285, + "step": 1872 + }, + { + "epoch": 1.81, + "grad_norm": 0.9207326173782349, + "learning_rate": 9.03418005750694e-05, + "loss": 0.6927, + "step": 1873 + }, + { + "epoch": 1.81, + "grad_norm": 0.8353189826011658, + "learning_rate": 9.021638643804907e-05, + "loss": 0.4263, + "step": 1874 + }, + { + "epoch": 1.81, + "grad_norm": 1.267177939414978, + "learning_rate": 9.009101024289941e-05, + "loss": 0.948, + "step": 1875 + }, + { + "epoch": 1.81, + "grad_norm": 1.1151788234710693, + "learning_rate": 8.996567212638047e-05, + "loss": 0.9628, + "step": 1876 + }, + { + "epoch": 1.81, + "grad_norm": 1.1289645433425903, + "learning_rate": 8.984037222521074e-05, + "loss": 1.014, + "step": 1877 + }, + { + "epoch": 1.81, + "grad_norm": 1.169303297996521, + "learning_rate": 8.971511067606696e-05, + "loss": 0.8052, + "step": 1878 + }, + { + "epoch": 1.81, + "grad_norm": 1.068725347518921, + "learning_rate": 8.958988761558411e-05, + "loss": 0.734, + "step": 1879 + }, + { + "epoch": 1.81, + "grad_norm": 1.1281707286834717, + "learning_rate": 8.946470318035514e-05, + "loss": 0.9301, + "step": 1880 + }, + { + "epoch": 1.82, + "grad_norm": 1.137937068939209, + "learning_rate": 8.93395575069309e-05, + "loss": 0.638, + "step": 1881 + }, + { + "epoch": 1.82, + "grad_norm": 1.3555268049240112, + "learning_rate": 8.921445073182e-05, + "loss": 0.9903, + "step": 1882 + }, + { + "epoch": 1.82, + "grad_norm": 1.433186411857605, + "learning_rate": 8.908938299148847e-05, + "loss": 0.9861, + "step": 1883 + }, + { + "epoch": 1.82, + "grad_norm": 0.9451619982719421, + "learning_rate": 8.896435442235995e-05, + "loss": 0.5024, + "step": 1884 + }, + { + "epoch": 1.82, + "grad_norm": 1.4170985221862793, + "learning_rate": 8.883936516081521e-05, + "loss": 1.1437, + "step": 1885 + }, + { + "epoch": 1.82, + "grad_norm": 1.1750236749649048, + "learning_rate": 8.871441534319223e-05, + "loss": 0.7278, + "step": 1886 + }, + { + "epoch": 1.82, + "grad_norm": 0.9916924238204956, + "learning_rate": 8.858950510578588e-05, + "loss": 0.5394, + "step": 1887 + }, + { + "epoch": 1.82, + "grad_norm": 1.1440494060516357, + "learning_rate": 8.846463458484794e-05, + "loss": 0.7198, + "step": 1888 + }, + { + "epoch": 1.82, + "grad_norm": 1.0721584558486938, + "learning_rate": 8.833980391658685e-05, + "loss": 0.7725, + "step": 1889 + }, + { + "epoch": 1.82, + "grad_norm": 1.2798032760620117, + "learning_rate": 8.821501323716751e-05, + "loss": 1.0844, + "step": 1890 + }, + { + "epoch": 1.83, + "grad_norm": 0.939553439617157, + "learning_rate": 8.809026268271131e-05, + "loss": 0.6705, + "step": 1891 + }, + { + "epoch": 1.83, + "grad_norm": 1.2908331155776978, + "learning_rate": 8.796555238929577e-05, + "loss": 1.0146, + "step": 1892 + }, + { + "epoch": 1.83, + "grad_norm": 1.5145471096038818, + "learning_rate": 8.784088249295455e-05, + "loss": 1.2522, + "step": 1893 + }, + { + "epoch": 1.83, + "grad_norm": 0.8970005512237549, + "learning_rate": 8.771625312967727e-05, + "loss": 0.6222, + "step": 1894 + }, + { + "epoch": 1.83, + "grad_norm": 1.233276605606079, + "learning_rate": 8.759166443540923e-05, + "loss": 0.8866, + "step": 1895 + }, + { + "epoch": 1.83, + "grad_norm": 1.0272518396377563, + "learning_rate": 8.746711654605152e-05, + "loss": 0.8453, + "step": 1896 + }, + { + "epoch": 1.83, + "grad_norm": 1.503488302230835, + "learning_rate": 8.734260959746054e-05, + "loss": 1.0577, + "step": 1897 + }, + { + "epoch": 1.83, + "grad_norm": 1.0901001691818237, + "learning_rate": 8.721814372544817e-05, + "loss": 0.8395, + "step": 1898 + }, + { + "epoch": 1.83, + "grad_norm": 1.0978087186813354, + "learning_rate": 8.709371906578147e-05, + "loss": 0.598, + "step": 1899 + }, + { + "epoch": 1.83, + "grad_norm": 1.0039117336273193, + "learning_rate": 8.696933575418254e-05, + "loss": 0.8613, + "step": 1900 + }, + { + "epoch": 1.83, + "grad_norm": 1.0104575157165527, + "learning_rate": 8.684499392632831e-05, + "loss": 0.7842, + "step": 1901 + }, + { + "epoch": 1.84, + "grad_norm": 1.2603685855865479, + "learning_rate": 8.672069371785052e-05, + "loss": 0.8023, + "step": 1902 + }, + { + "epoch": 1.84, + "grad_norm": 1.1488606929779053, + "learning_rate": 8.659643526433547e-05, + "loss": 1.0167, + "step": 1903 + }, + { + "epoch": 1.84, + "grad_norm": 1.049129843711853, + "learning_rate": 8.647221870132396e-05, + "loss": 0.7355, + "step": 1904 + }, + { + "epoch": 1.84, + "grad_norm": 1.4575903415679932, + "learning_rate": 8.634804416431113e-05, + "loss": 0.9889, + "step": 1905 + }, + { + "epoch": 1.84, + "grad_norm": 0.9238115549087524, + "learning_rate": 8.622391178874614e-05, + "loss": 0.5083, + "step": 1906 + }, + { + "epoch": 1.84, + "grad_norm": 1.1920543909072876, + "learning_rate": 8.609982171003231e-05, + "loss": 0.9315, + "step": 1907 + }, + { + "epoch": 1.84, + "grad_norm": 2.0908291339874268, + "learning_rate": 8.597577406352671e-05, + "loss": 0.9874, + "step": 1908 + }, + { + "epoch": 1.84, + "grad_norm": 1.1943837404251099, + "learning_rate": 8.58517689845402e-05, + "loss": 0.8991, + "step": 1909 + }, + { + "epoch": 1.84, + "grad_norm": 1.6717904806137085, + "learning_rate": 8.572780660833723e-05, + "loss": 1.4169, + "step": 1910 + }, + { + "epoch": 1.84, + "grad_norm": 1.2120851278305054, + "learning_rate": 8.560388707013553e-05, + "loss": 0.8864, + "step": 1911 + }, + { + "epoch": 1.85, + "grad_norm": 1.237656593322754, + "learning_rate": 8.548001050510624e-05, + "loss": 0.7664, + "step": 1912 + }, + { + "epoch": 1.85, + "grad_norm": 0.891211986541748, + "learning_rate": 8.535617704837358e-05, + "loss": 0.4961, + "step": 1913 + }, + { + "epoch": 1.85, + "grad_norm": 1.9243524074554443, + "learning_rate": 8.523238683501472e-05, + "loss": 1.4698, + "step": 1914 + }, + { + "epoch": 1.85, + "grad_norm": 1.1669132709503174, + "learning_rate": 8.510864000005974e-05, + "loss": 0.858, + "step": 1915 + }, + { + "epoch": 1.85, + "grad_norm": 1.3465845584869385, + "learning_rate": 8.49849366784913e-05, + "loss": 1.1071, + "step": 1916 + }, + { + "epoch": 1.85, + "grad_norm": 1.277543306350708, + "learning_rate": 8.486127700524468e-05, + "loss": 0.9655, + "step": 1917 + }, + { + "epoch": 1.85, + "grad_norm": 1.2550066709518433, + "learning_rate": 8.473766111520747e-05, + "loss": 0.8256, + "step": 1918 + }, + { + "epoch": 1.85, + "grad_norm": 1.0959876775741577, + "learning_rate": 8.461408914321962e-05, + "loss": 0.8549, + "step": 1919 + }, + { + "epoch": 1.85, + "grad_norm": 1.254042387008667, + "learning_rate": 8.4490561224073e-05, + "loss": 0.9881, + "step": 1920 + }, + { + "epoch": 1.85, + "grad_norm": 1.036967396736145, + "learning_rate": 8.436707749251161e-05, + "loss": 0.8791, + "step": 1921 + }, + { + "epoch": 1.86, + "grad_norm": 1.089248538017273, + "learning_rate": 8.424363808323107e-05, + "loss": 0.7721, + "step": 1922 + }, + { + "epoch": 1.86, + "grad_norm": 1.3928107023239136, + "learning_rate": 8.41202431308789e-05, + "loss": 1.0117, + "step": 1923 + }, + { + "epoch": 1.86, + "grad_norm": 1.0236631631851196, + "learning_rate": 8.399689277005386e-05, + "loss": 0.8051, + "step": 1924 + }, + { + "epoch": 1.86, + "grad_norm": 1.1423760652542114, + "learning_rate": 8.387358713530621e-05, + "loss": 0.7716, + "step": 1925 + }, + { + "epoch": 1.86, + "grad_norm": 1.547163724899292, + "learning_rate": 8.375032636113744e-05, + "loss": 0.8395, + "step": 1926 + }, + { + "epoch": 1.86, + "grad_norm": 1.176893949508667, + "learning_rate": 8.362711058200001e-05, + "loss": 1.0601, + "step": 1927 + }, + { + "epoch": 1.86, + "grad_norm": 1.064943552017212, + "learning_rate": 8.350393993229742e-05, + "loss": 0.8291, + "step": 1928 + }, + { + "epoch": 1.86, + "grad_norm": 0.9675112962722778, + "learning_rate": 8.33808145463838e-05, + "loss": 0.6613, + "step": 1929 + }, + { + "epoch": 1.86, + "grad_norm": 1.179880142211914, + "learning_rate": 8.325773455856406e-05, + "loss": 0.6113, + "step": 1930 + }, + { + "epoch": 1.86, + "grad_norm": 1.3865076303482056, + "learning_rate": 8.313470010309345e-05, + "loss": 1.164, + "step": 1931 + }, + { + "epoch": 1.86, + "grad_norm": 1.0845417976379395, + "learning_rate": 8.301171131417764e-05, + "loss": 0.8402, + "step": 1932 + }, + { + "epoch": 1.87, + "grad_norm": 1.3481731414794922, + "learning_rate": 8.28887683259725e-05, + "loss": 1.0072, + "step": 1933 + }, + { + "epoch": 1.87, + "grad_norm": 1.0890945196151733, + "learning_rate": 8.276587127258386e-05, + "loss": 0.715, + "step": 1934 + }, + { + "epoch": 1.87, + "grad_norm": 1.2235881090164185, + "learning_rate": 8.264302028806755e-05, + "loss": 0.7171, + "step": 1935 + }, + { + "epoch": 1.87, + "grad_norm": 1.18882155418396, + "learning_rate": 8.252021550642899e-05, + "loss": 0.8737, + "step": 1936 + }, + { + "epoch": 1.87, + "grad_norm": 1.4608304500579834, + "learning_rate": 8.239745706162336e-05, + "loss": 1.1279, + "step": 1937 + }, + { + "epoch": 1.87, + "grad_norm": 1.1702600717544556, + "learning_rate": 8.227474508755529e-05, + "loss": 0.8266, + "step": 1938 + }, + { + "epoch": 1.87, + "grad_norm": 1.3556264638900757, + "learning_rate": 8.215207971807856e-05, + "loss": 0.9647, + "step": 1939 + }, + { + "epoch": 1.87, + "grad_norm": 1.1378785371780396, + "learning_rate": 8.20294610869963e-05, + "loss": 0.734, + "step": 1940 + }, + { + "epoch": 1.87, + "grad_norm": 1.2239617109298706, + "learning_rate": 8.19068893280605e-05, + "loss": 0.9285, + "step": 1941 + }, + { + "epoch": 1.87, + "grad_norm": 1.1962335109710693, + "learning_rate": 8.178436457497218e-05, + "loss": 0.9248, + "step": 1942 + }, + { + "epoch": 1.88, + "grad_norm": 1.020392656326294, + "learning_rate": 8.1661886961381e-05, + "loss": 0.8781, + "step": 1943 + }, + { + "epoch": 1.88, + "grad_norm": 1.4522514343261719, + "learning_rate": 8.153945662088514e-05, + "loss": 1.4785, + "step": 1944 + }, + { + "epoch": 1.88, + "grad_norm": 1.1410571336746216, + "learning_rate": 8.14170736870313e-05, + "loss": 1.0401, + "step": 1945 + }, + { + "epoch": 1.88, + "grad_norm": 1.2002620697021484, + "learning_rate": 8.129473829331452e-05, + "loss": 0.9184, + "step": 1946 + }, + { + "epoch": 1.88, + "grad_norm": 1.2167078256607056, + "learning_rate": 8.117245057317785e-05, + "loss": 0.8945, + "step": 1947 + }, + { + "epoch": 1.88, + "grad_norm": 1.2816885709762573, + "learning_rate": 8.10502106600124e-05, + "loss": 0.9105, + "step": 1948 + }, + { + "epoch": 1.88, + "grad_norm": 1.3111622333526611, + "learning_rate": 8.092801868715719e-05, + "loss": 0.8802, + "step": 1949 + }, + { + "epoch": 1.88, + "grad_norm": 2.199481964111328, + "learning_rate": 8.080587478789881e-05, + "loss": 1.4535, + "step": 1950 + }, + { + "epoch": 1.88, + "grad_norm": 1.4825800657272339, + "learning_rate": 8.068377909547157e-05, + "loss": 1.0266, + "step": 1951 + }, + { + "epoch": 1.88, + "grad_norm": 1.1725435256958008, + "learning_rate": 8.056173174305706e-05, + "loss": 1.0433, + "step": 1952 + }, + { + "epoch": 1.89, + "grad_norm": 1.0390863418579102, + "learning_rate": 8.043973286378419e-05, + "loss": 0.6835, + "step": 1953 + }, + { + "epoch": 1.89, + "grad_norm": 1.0793219804763794, + "learning_rate": 8.031778259072909e-05, + "loss": 0.8351, + "step": 1954 + }, + { + "epoch": 1.89, + "grad_norm": 1.5510258674621582, + "learning_rate": 8.01958810569147e-05, + "loss": 1.6831, + "step": 1955 + }, + { + "epoch": 1.89, + "grad_norm": 1.1949468851089478, + "learning_rate": 8.007402839531092e-05, + "loss": 0.8632, + "step": 1956 + }, + { + "epoch": 1.89, + "grad_norm": 1.6904135942459106, + "learning_rate": 7.995222473883426e-05, + "loss": 1.221, + "step": 1957 + }, + { + "epoch": 1.89, + "grad_norm": 1.4048141241073608, + "learning_rate": 7.983047022034785e-05, + "loss": 1.3236, + "step": 1958 + }, + { + "epoch": 1.89, + "grad_norm": 0.9245966672897339, + "learning_rate": 7.97087649726612e-05, + "loss": 0.7279, + "step": 1959 + }, + { + "epoch": 1.89, + "grad_norm": 0.9320887327194214, + "learning_rate": 7.958710912853003e-05, + "loss": 0.6237, + "step": 1960 + }, + { + "epoch": 1.89, + "grad_norm": 0.9442594647407532, + "learning_rate": 7.946550282065623e-05, + "loss": 0.5785, + "step": 1961 + }, + { + "epoch": 1.89, + "grad_norm": 0.9472630620002747, + "learning_rate": 7.93439461816876e-05, + "loss": 0.7583, + "step": 1962 + }, + { + "epoch": 1.89, + "grad_norm": 1.3176424503326416, + "learning_rate": 7.922243934421783e-05, + "loss": 1.0933, + "step": 1963 + }, + { + "epoch": 1.9, + "grad_norm": 1.2619996070861816, + "learning_rate": 7.910098244078618e-05, + "loss": 1.1577, + "step": 1964 + }, + { + "epoch": 1.9, + "grad_norm": 1.3114757537841797, + "learning_rate": 7.897957560387755e-05, + "loss": 1.1675, + "step": 1965 + }, + { + "epoch": 1.9, + "grad_norm": 0.9982705116271973, + "learning_rate": 7.885821896592221e-05, + "loss": 1.0683, + "step": 1966 + }, + { + "epoch": 1.9, + "grad_norm": 1.1973057985305786, + "learning_rate": 7.873691265929562e-05, + "loss": 0.7786, + "step": 1967 + }, + { + "epoch": 1.9, + "grad_norm": 1.3525385856628418, + "learning_rate": 7.861565681631838e-05, + "loss": 0.9712, + "step": 1968 + }, + { + "epoch": 1.9, + "grad_norm": 1.4053092002868652, + "learning_rate": 7.849445156925594e-05, + "loss": 0.9474, + "step": 1969 + }, + { + "epoch": 1.9, + "grad_norm": 1.096291184425354, + "learning_rate": 7.837329705031882e-05, + "loss": 0.6538, + "step": 1970 + }, + { + "epoch": 1.9, + "grad_norm": 1.0880426168441772, + "learning_rate": 7.825219339166193e-05, + "loss": 0.7716, + "step": 1971 + }, + { + "epoch": 1.9, + "grad_norm": 1.013417363166809, + "learning_rate": 7.813114072538488e-05, + "loss": 0.7396, + "step": 1972 + }, + { + "epoch": 1.9, + "grad_norm": 0.8553948402404785, + "learning_rate": 7.801013918353149e-05, + "loss": 0.412, + "step": 1973 + }, + { + "epoch": 1.91, + "grad_norm": 1.061767578125, + "learning_rate": 7.788918889809e-05, + "loss": 0.6492, + "step": 1974 + }, + { + "epoch": 1.91, + "grad_norm": 1.1720582246780396, + "learning_rate": 7.77682900009926e-05, + "loss": 0.9722, + "step": 1975 + }, + { + "epoch": 1.91, + "grad_norm": 1.6846498250961304, + "learning_rate": 7.764744262411548e-05, + "loss": 1.0323, + "step": 1976 + }, + { + "epoch": 1.91, + "grad_norm": 1.277524709701538, + "learning_rate": 7.752664689927868e-05, + "loss": 0.9301, + "step": 1977 + }, + { + "epoch": 1.91, + "grad_norm": 1.1959736347198486, + "learning_rate": 7.740590295824578e-05, + "loss": 0.9538, + "step": 1978 + }, + { + "epoch": 1.91, + "grad_norm": 1.221312403678894, + "learning_rate": 7.728521093272398e-05, + "loss": 0.8935, + "step": 1979 + }, + { + "epoch": 1.91, + "grad_norm": 1.4521623849868774, + "learning_rate": 7.716457095436378e-05, + "loss": 1.4089, + "step": 1980 + }, + { + "epoch": 1.91, + "grad_norm": 1.021066665649414, + "learning_rate": 7.704398315475897e-05, + "loss": 0.6881, + "step": 1981 + }, + { + "epoch": 1.91, + "grad_norm": 1.46727454662323, + "learning_rate": 7.692344766544642e-05, + "loss": 0.9396, + "step": 1982 + }, + { + "epoch": 1.91, + "grad_norm": 0.8718202114105225, + "learning_rate": 7.680296461790583e-05, + "loss": 0.673, + "step": 1983 + }, + { + "epoch": 1.92, + "grad_norm": 1.5483649969100952, + "learning_rate": 7.668253414355986e-05, + "loss": 1.0771, + "step": 1984 + }, + { + "epoch": 1.92, + "grad_norm": 0.7343392968177795, + "learning_rate": 7.656215637377367e-05, + "loss": 0.5231, + "step": 1985 + }, + { + "epoch": 1.92, + "grad_norm": 1.5243200063705444, + "learning_rate": 7.644183143985504e-05, + "loss": 1.1263, + "step": 1986 + }, + { + "epoch": 1.92, + "grad_norm": 1.0501224994659424, + "learning_rate": 7.632155947305411e-05, + "loss": 0.7451, + "step": 1987 + }, + { + "epoch": 1.92, + "grad_norm": 0.9759675860404968, + "learning_rate": 7.620134060456316e-05, + "loss": 0.5377, + "step": 1988 + }, + { + "epoch": 1.92, + "grad_norm": 1.44871187210083, + "learning_rate": 7.608117496551665e-05, + "loss": 1.0826, + "step": 1989 + }, + { + "epoch": 1.92, + "grad_norm": 1.0177377462387085, + "learning_rate": 7.596106268699088e-05, + "loss": 0.8488, + "step": 1990 + }, + { + "epoch": 1.92, + "grad_norm": 1.3080190420150757, + "learning_rate": 7.584100390000405e-05, + "loss": 0.8976, + "step": 1991 + }, + { + "epoch": 1.92, + "grad_norm": 1.238023042678833, + "learning_rate": 7.572099873551585e-05, + "loss": 0.863, + "step": 1992 + }, + { + "epoch": 1.92, + "grad_norm": 0.9784834980964661, + "learning_rate": 7.560104732442774e-05, + "loss": 0.5743, + "step": 1993 + }, + { + "epoch": 1.92, + "grad_norm": 1.2834821939468384, + "learning_rate": 7.54811497975823e-05, + "loss": 1.0481, + "step": 1994 + }, + { + "epoch": 1.93, + "grad_norm": 0.9290242195129395, + "learning_rate": 7.536130628576346e-05, + "loss": 0.5789, + "step": 1995 + }, + { + "epoch": 1.93, + "grad_norm": 1.137740969657898, + "learning_rate": 7.524151691969615e-05, + "loss": 0.74, + "step": 1996 + }, + { + "epoch": 1.93, + "grad_norm": 0.9530397653579712, + "learning_rate": 7.512178183004632e-05, + "loss": 0.3774, + "step": 1997 + }, + { + "epoch": 1.93, + "grad_norm": 0.8863974809646606, + "learning_rate": 7.500210114742068e-05, + "loss": 0.7082, + "step": 1998 + }, + { + "epoch": 1.93, + "grad_norm": 1.524322748184204, + "learning_rate": 7.488247500236656e-05, + "loss": 0.891, + "step": 1999 + }, + { + "epoch": 1.93, + "grad_norm": 0.9274705648422241, + "learning_rate": 7.476290352537188e-05, + "loss": 0.5485, + "step": 2000 + }, + { + "epoch": 1.93, + "grad_norm": 1.0648436546325684, + "learning_rate": 7.464338684686481e-05, + "loss": 0.6977, + "step": 2001 + }, + { + "epoch": 1.93, + "grad_norm": 1.333897590637207, + "learning_rate": 7.452392509721391e-05, + "loss": 0.7772, + "step": 2002 + }, + { + "epoch": 1.93, + "grad_norm": 1.1420466899871826, + "learning_rate": 7.440451840672766e-05, + "loss": 0.8032, + "step": 2003 + }, + { + "epoch": 1.93, + "grad_norm": 1.3571137189865112, + "learning_rate": 7.428516690565455e-05, + "loss": 0.8893, + "step": 2004 + }, + { + "epoch": 1.94, + "grad_norm": 1.165768027305603, + "learning_rate": 7.416587072418293e-05, + "loss": 0.647, + "step": 2005 + }, + { + "epoch": 1.94, + "grad_norm": 1.4117252826690674, + "learning_rate": 7.404662999244069e-05, + "loss": 0.9203, + "step": 2006 + }, + { + "epoch": 1.94, + "grad_norm": 1.206149935722351, + "learning_rate": 7.392744484049533e-05, + "loss": 0.8442, + "step": 2007 + }, + { + "epoch": 1.94, + "grad_norm": 1.2842216491699219, + "learning_rate": 7.380831539835367e-05, + "loss": 1.046, + "step": 2008 + }, + { + "epoch": 1.94, + "grad_norm": 1.2140345573425293, + "learning_rate": 7.368924179596177e-05, + "loss": 0.5501, + "step": 2009 + }, + { + "epoch": 1.94, + "grad_norm": 1.0220657587051392, + "learning_rate": 7.357022416320485e-05, + "loss": 0.7256, + "step": 2010 + }, + { + "epoch": 1.94, + "grad_norm": 1.4720944166183472, + "learning_rate": 7.345126262990693e-05, + "loss": 0.977, + "step": 2011 + }, + { + "epoch": 1.94, + "grad_norm": 1.208706021308899, + "learning_rate": 7.3332357325831e-05, + "loss": 0.7604, + "step": 2012 + }, + { + "epoch": 1.94, + "grad_norm": 1.3358442783355713, + "learning_rate": 7.32135083806786e-05, + "loss": 1.0311, + "step": 2013 + }, + { + "epoch": 1.94, + "grad_norm": 1.1216422319412231, + "learning_rate": 7.309471592408984e-05, + "loss": 0.5732, + "step": 2014 + }, + { + "epoch": 1.94, + "grad_norm": 1.0371495485305786, + "learning_rate": 7.297598008564324e-05, + "loss": 0.5215, + "step": 2015 + }, + { + "epoch": 1.95, + "grad_norm": 2.6368014812469482, + "learning_rate": 7.285730099485543e-05, + "loss": 1.1245, + "step": 2016 + }, + { + "epoch": 1.95, + "grad_norm": 1.0725258588790894, + "learning_rate": 7.273867878118139e-05, + "loss": 0.5464, + "step": 2017 + }, + { + "epoch": 1.95, + "grad_norm": 1.368296504020691, + "learning_rate": 7.262011357401377e-05, + "loss": 1.0568, + "step": 2018 + }, + { + "epoch": 1.95, + "grad_norm": 1.1201132535934448, + "learning_rate": 7.250160550268329e-05, + "loss": 0.8696, + "step": 2019 + }, + { + "epoch": 1.95, + "grad_norm": 1.248964548110962, + "learning_rate": 7.238315469645815e-05, + "loss": 0.8216, + "step": 2020 + }, + { + "epoch": 1.95, + "grad_norm": 1.132161259651184, + "learning_rate": 7.226476128454415e-05, + "loss": 0.6487, + "step": 2021 + }, + { + "epoch": 1.95, + "grad_norm": 1.1532809734344482, + "learning_rate": 7.214642539608457e-05, + "loss": 0.7051, + "step": 2022 + }, + { + "epoch": 1.95, + "grad_norm": 1.2578155994415283, + "learning_rate": 7.202814716015987e-05, + "loss": 0.8329, + "step": 2023 + }, + { + "epoch": 1.95, + "grad_norm": 1.1598081588745117, + "learning_rate": 7.190992670578758e-05, + "loss": 0.8972, + "step": 2024 + }, + { + "epoch": 1.95, + "grad_norm": 1.4254798889160156, + "learning_rate": 7.179176416192223e-05, + "loss": 1.0065, + "step": 2025 + }, + { + "epoch": 1.96, + "grad_norm": 1.1195855140686035, + "learning_rate": 7.167365965745529e-05, + "loss": 0.6957, + "step": 2026 + }, + { + "epoch": 1.96, + "grad_norm": 1.5119630098342896, + "learning_rate": 7.155561332121477e-05, + "loss": 1.0989, + "step": 2027 + }, + { + "epoch": 1.96, + "grad_norm": 0.9728872776031494, + "learning_rate": 7.143762528196524e-05, + "loss": 0.696, + "step": 2028 + }, + { + "epoch": 1.96, + "grad_norm": 1.5496985912322998, + "learning_rate": 7.131969566840788e-05, + "loss": 0.9291, + "step": 2029 + }, + { + "epoch": 1.96, + "grad_norm": 1.1622270345687866, + "learning_rate": 7.120182460917987e-05, + "loss": 0.6708, + "step": 2030 + }, + { + "epoch": 1.96, + "grad_norm": 1.5910062789916992, + "learning_rate": 7.108401223285463e-05, + "loss": 1.271, + "step": 2031 + }, + { + "epoch": 1.96, + "grad_norm": 1.4135732650756836, + "learning_rate": 7.096625866794165e-05, + "loss": 0.7867, + "step": 2032 + }, + { + "epoch": 1.96, + "grad_norm": 1.1207356452941895, + "learning_rate": 7.084856404288619e-05, + "loss": 0.8527, + "step": 2033 + }, + { + "epoch": 1.96, + "grad_norm": 1.189270257949829, + "learning_rate": 7.07309284860692e-05, + "loss": 1.0127, + "step": 2034 + }, + { + "epoch": 1.96, + "grad_norm": 0.9003916382789612, + "learning_rate": 7.061335212580717e-05, + "loss": 0.6056, + "step": 2035 + }, + { + "epoch": 1.97, + "grad_norm": 1.007911205291748, + "learning_rate": 7.049583509035218e-05, + "loss": 0.8295, + "step": 2036 + }, + { + "epoch": 1.97, + "grad_norm": 1.3060964345932007, + "learning_rate": 7.037837750789144e-05, + "loss": 0.7436, + "step": 2037 + }, + { + "epoch": 1.97, + "grad_norm": 1.2931039333343506, + "learning_rate": 7.026097950654733e-05, + "loss": 0.8215, + "step": 2038 + }, + { + "epoch": 1.97, + "grad_norm": 1.0952670574188232, + "learning_rate": 7.014364121437727e-05, + "loss": 0.5497, + "step": 2039 + }, + { + "epoch": 1.97, + "grad_norm": 1.260249376296997, + "learning_rate": 7.002636275937366e-05, + "loss": 1.1938, + "step": 2040 + }, + { + "epoch": 1.97, + "grad_norm": 1.329917311668396, + "learning_rate": 6.990914426946341e-05, + "loss": 0.9008, + "step": 2041 + }, + { + "epoch": 1.97, + "grad_norm": 0.9461268782615662, + "learning_rate": 6.979198587250812e-05, + "loss": 0.6912, + "step": 2042 + }, + { + "epoch": 1.97, + "grad_norm": 1.3266767263412476, + "learning_rate": 6.967488769630394e-05, + "loss": 0.6323, + "step": 2043 + }, + { + "epoch": 1.97, + "grad_norm": 1.455551028251648, + "learning_rate": 6.955784986858117e-05, + "loss": 1.2073, + "step": 2044 + }, + { + "epoch": 1.97, + "grad_norm": 1.1189351081848145, + "learning_rate": 6.944087251700437e-05, + "loss": 0.8706, + "step": 2045 + }, + { + "epoch": 1.97, + "grad_norm": 1.1993297338485718, + "learning_rate": 6.932395576917209e-05, + "loss": 0.8263, + "step": 2046 + }, + { + "epoch": 1.98, + "grad_norm": 1.2151789665222168, + "learning_rate": 6.920709975261686e-05, + "loss": 1.04, + "step": 2047 + }, + { + "epoch": 1.98, + "grad_norm": 1.1951751708984375, + "learning_rate": 6.909030459480487e-05, + "loss": 0.804, + "step": 2048 + }, + { + "epoch": 1.98, + "grad_norm": 0.9988811612129211, + "learning_rate": 6.89735704231359e-05, + "loss": 0.7224, + "step": 2049 + }, + { + "epoch": 1.98, + "grad_norm": 0.8926522731781006, + "learning_rate": 6.885689736494337e-05, + "loss": 0.4905, + "step": 2050 + }, + { + "epoch": 1.98, + "grad_norm": 1.233476996421814, + "learning_rate": 6.87402855474939e-05, + "loss": 1.0149, + "step": 2051 + }, + { + "epoch": 1.98, + "grad_norm": 1.099674940109253, + "learning_rate": 6.862373509798725e-05, + "loss": 0.8594, + "step": 2052 + }, + { + "epoch": 1.98, + "grad_norm": 0.9110369682312012, + "learning_rate": 6.850724614355649e-05, + "loss": 0.6211, + "step": 2053 + }, + { + "epoch": 1.98, + "grad_norm": 1.037454605102539, + "learning_rate": 6.839081881126735e-05, + "loss": 0.6983, + "step": 2054 + }, + { + "epoch": 1.98, + "grad_norm": 1.5326181650161743, + "learning_rate": 6.827445322811849e-05, + "loss": 1.3385, + "step": 2055 + }, + { + "epoch": 1.98, + "grad_norm": 1.00337815284729, + "learning_rate": 6.81581495210411e-05, + "loss": 0.7778, + "step": 2056 + }, + { + "epoch": 1.99, + "grad_norm": 0.9707116484642029, + "learning_rate": 6.804190781689904e-05, + "loss": 0.493, + "step": 2057 + }, + { + "epoch": 1.99, + "grad_norm": 1.5447719097137451, + "learning_rate": 6.792572824248839e-05, + "loss": 1.3885, + "step": 2058 + }, + { + "epoch": 1.99, + "grad_norm": 1.3587324619293213, + "learning_rate": 6.78096109245375e-05, + "loss": 1.0177, + "step": 2059 + }, + { + "epoch": 1.99, + "grad_norm": 1.4563665390014648, + "learning_rate": 6.769355598970686e-05, + "loss": 1.0932, + "step": 2060 + }, + { + "epoch": 1.99, + "grad_norm": 1.1776785850524902, + "learning_rate": 6.757756356458888e-05, + "loss": 1.0123, + "step": 2061 + }, + { + "epoch": 1.99, + "grad_norm": 1.409449577331543, + "learning_rate": 6.746163377570777e-05, + "loss": 0.9802, + "step": 2062 + }, + { + "epoch": 1.99, + "grad_norm": 1.5689934492111206, + "learning_rate": 6.73457667495193e-05, + "loss": 0.939, + "step": 2063 + }, + { + "epoch": 1.99, + "grad_norm": 1.2428174018859863, + "learning_rate": 6.722996261241116e-05, + "loss": 1.004, + "step": 2064 + }, + { + "epoch": 1.99, + "grad_norm": 1.0462515354156494, + "learning_rate": 6.711422149070201e-05, + "loss": 0.809, + "step": 2065 + }, + { + "epoch": 1.99, + "grad_norm": 0.9034873843193054, + "learning_rate": 6.699854351064201e-05, + "loss": 0.6096, + "step": 2066 + }, + { + "epoch": 2.0, + "grad_norm": 1.0596833229064941, + "learning_rate": 6.688292879841231e-05, + "loss": 0.7736, + "step": 2067 + }, + { + "epoch": 2.0, + "grad_norm": 1.184257984161377, + "learning_rate": 6.676737748012523e-05, + "loss": 0.9299, + "step": 2068 + }, + { + "epoch": 2.0, + "grad_norm": 1.2924439907073975, + "learning_rate": 6.665188968182379e-05, + "loss": 1.0715, + "step": 2069 + }, + { + "epoch": 2.0, + "grad_norm": 1.367877721786499, + "learning_rate": 6.65364655294817e-05, + "loss": 1.1742, + "step": 2070 + }, + { + "epoch": 2.0, + "grad_norm": 1.4632872343063354, + "learning_rate": 6.642110514900342e-05, + "loss": 1.1471, + "step": 2071 + }, + { + "epoch": 2.0, + "grad_norm": 1.3757456541061401, + "learning_rate": 6.630580866622369e-05, + "loss": 1.0932, + "step": 2072 + }, + { + "epoch": 2.0, + "grad_norm": 0.8886003494262695, + "learning_rate": 6.619057620690756e-05, + "loss": 0.6071, + "step": 2073 + }, + { + "epoch": 2.0, + "grad_norm": 0.5674740672111511, + "learning_rate": 6.607540789675029e-05, + "loss": 0.2693, + "step": 2074 + }, + { + "epoch": 2.0, + "grad_norm": 0.8528835773468018, + "learning_rate": 6.596030386137723e-05, + "loss": 0.4279, + "step": 2075 + }, + { + "epoch": 2.0, + "grad_norm": 0.6943017244338989, + "learning_rate": 6.58452642263435e-05, + "loss": 0.3272, + "step": 2076 + }, + { + "epoch": 2.0, + "grad_norm": 1.0943104028701782, + "learning_rate": 6.573028911713397e-05, + "loss": 0.7113, + "step": 2077 + }, + { + "epoch": 2.01, + "grad_norm": 0.8859809041023254, + "learning_rate": 6.561537865916328e-05, + "loss": 0.7564, + "step": 2078 + }, + { + "epoch": 2.01, + "grad_norm": 0.8713234066963196, + "learning_rate": 6.550053297777538e-05, + "loss": 0.4953, + "step": 2079 + }, + { + "epoch": 2.01, + "grad_norm": 0.8634008765220642, + "learning_rate": 6.538575219824358e-05, + "loss": 0.4017, + "step": 2080 + }, + { + "epoch": 2.01, + "grad_norm": 0.8085753321647644, + "learning_rate": 6.527103644577056e-05, + "loss": 0.395, + "step": 2081 + }, + { + "epoch": 2.01, + "grad_norm": 0.8304145336151123, + "learning_rate": 6.515638584548787e-05, + "loss": 0.541, + "step": 2082 + }, + { + "epoch": 2.01, + "grad_norm": 0.8201612830162048, + "learning_rate": 6.504180052245609e-05, + "loss": 0.5326, + "step": 2083 + }, + { + "epoch": 2.01, + "grad_norm": 0.7562471032142639, + "learning_rate": 6.492728060166453e-05, + "loss": 0.4054, + "step": 2084 + }, + { + "epoch": 2.01, + "grad_norm": 0.9142252206802368, + "learning_rate": 6.48128262080313e-05, + "loss": 0.5096, + "step": 2085 + }, + { + "epoch": 2.01, + "grad_norm": 0.7330839037895203, + "learning_rate": 6.469843746640286e-05, + "loss": 0.4187, + "step": 2086 + }, + { + "epoch": 2.01, + "grad_norm": 0.9274792075157166, + "learning_rate": 6.458411450155418e-05, + "loss": 0.5977, + "step": 2087 + }, + { + "epoch": 2.02, + "grad_norm": 0.8541355729103088, + "learning_rate": 6.446985743818841e-05, + "loss": 0.4998, + "step": 2088 + }, + { + "epoch": 2.02, + "grad_norm": 0.8299204111099243, + "learning_rate": 6.435566640093689e-05, + "loss": 0.4044, + "step": 2089 + }, + { + "epoch": 2.02, + "grad_norm": 1.0775256156921387, + "learning_rate": 6.424154151435886e-05, + "loss": 0.4907, + "step": 2090 + }, + { + "epoch": 2.02, + "grad_norm": 1.031470775604248, + "learning_rate": 6.412748290294141e-05, + "loss": 0.5825, + "step": 2091 + }, + { + "epoch": 2.02, + "grad_norm": 1.0457535982131958, + "learning_rate": 6.40134906910994e-05, + "loss": 0.5561, + "step": 2092 + }, + { + "epoch": 2.02, + "grad_norm": 0.9657010436058044, + "learning_rate": 6.389956500317523e-05, + "loss": 0.3679, + "step": 2093 + }, + { + "epoch": 2.02, + "grad_norm": 0.9064496755599976, + "learning_rate": 6.378570596343871e-05, + "loss": 0.4382, + "step": 2094 + }, + { + "epoch": 2.02, + "grad_norm": 1.033677577972412, + "learning_rate": 6.367191369608691e-05, + "loss": 0.4445, + "step": 2095 + }, + { + "epoch": 2.02, + "grad_norm": 0.8130964040756226, + "learning_rate": 6.355818832524424e-05, + "loss": 0.2655, + "step": 2096 + }, + { + "epoch": 2.02, + "grad_norm": 0.8505420088768005, + "learning_rate": 6.344452997496196e-05, + "loss": 0.3172, + "step": 2097 + }, + { + "epoch": 2.03, + "grad_norm": 1.4362200498580933, + "learning_rate": 6.333093876921825e-05, + "loss": 0.5314, + "step": 2098 + }, + { + "epoch": 2.03, + "grad_norm": 1.2488155364990234, + "learning_rate": 6.32174148319182e-05, + "loss": 0.4211, + "step": 2099 + }, + { + "epoch": 2.03, + "grad_norm": 1.3689916133880615, + "learning_rate": 6.310395828689331e-05, + "loss": 0.6113, + "step": 2100 + }, + { + "epoch": 2.03, + "grad_norm": 1.188043236732483, + "learning_rate": 6.299056925790173e-05, + "loss": 0.459, + "step": 2101 + }, + { + "epoch": 2.03, + "grad_norm": 1.1546956300735474, + "learning_rate": 6.287724786862782e-05, + "loss": 0.4895, + "step": 2102 + }, + { + "epoch": 2.03, + "grad_norm": 1.079819679260254, + "learning_rate": 6.276399424268233e-05, + "loss": 0.3741, + "step": 2103 + }, + { + "epoch": 2.03, + "grad_norm": 1.0333255529403687, + "learning_rate": 6.265080850360199e-05, + "loss": 0.3441, + "step": 2104 + }, + { + "epoch": 2.03, + "grad_norm": 1.281787395477295, + "learning_rate": 6.253769077484945e-05, + "loss": 0.4487, + "step": 2105 + }, + { + "epoch": 2.03, + "grad_norm": 1.5447965860366821, + "learning_rate": 6.242464117981327e-05, + "loss": 0.7834, + "step": 2106 + }, + { + "epoch": 2.03, + "grad_norm": 0.9776402711868286, + "learning_rate": 6.231165984180762e-05, + "loss": 0.4425, + "step": 2107 + }, + { + "epoch": 2.03, + "grad_norm": 1.036454677581787, + "learning_rate": 6.219874688407221e-05, + "loss": 0.4159, + "step": 2108 + }, + { + "epoch": 2.04, + "grad_norm": 1.1195133924484253, + "learning_rate": 6.208590242977223e-05, + "loss": 0.3327, + "step": 2109 + }, + { + "epoch": 2.04, + "grad_norm": 1.0579090118408203, + "learning_rate": 6.197312660199805e-05, + "loss": 0.5053, + "step": 2110 + }, + { + "epoch": 2.04, + "grad_norm": 1.0781733989715576, + "learning_rate": 6.186041952376532e-05, + "loss": 0.3166, + "step": 2111 + }, + { + "epoch": 2.04, + "grad_norm": 1.1364599466323853, + "learning_rate": 6.174778131801454e-05, + "loss": 0.4686, + "step": 2112 + }, + { + "epoch": 2.04, + "grad_norm": 1.6187351942062378, + "learning_rate": 6.163521210761114e-05, + "loss": 0.8032, + "step": 2113 + }, + { + "epoch": 2.04, + "grad_norm": 1.0550283193588257, + "learning_rate": 6.152271201534539e-05, + "loss": 0.4735, + "step": 2114 + }, + { + "epoch": 2.04, + "grad_norm": 1.0076221227645874, + "learning_rate": 6.141028116393204e-05, + "loss": 0.5, + "step": 2115 + }, + { + "epoch": 2.04, + "grad_norm": 1.0741852521896362, + "learning_rate": 6.129791967601031e-05, + "loss": 0.5462, + "step": 2116 + }, + { + "epoch": 2.04, + "grad_norm": 1.0995590686798096, + "learning_rate": 6.11856276741439e-05, + "loss": 0.3663, + "step": 2117 + }, + { + "epoch": 2.04, + "grad_norm": 1.0732592344284058, + "learning_rate": 6.107340528082054e-05, + "loss": 0.4206, + "step": 2118 + }, + { + "epoch": 2.05, + "grad_norm": 1.1388449668884277, + "learning_rate": 6.096125261845212e-05, + "loss": 0.4032, + "step": 2119 + }, + { + "epoch": 2.05, + "grad_norm": 1.468422770500183, + "learning_rate": 6.084916980937451e-05, + "loss": 0.3697, + "step": 2120 + }, + { + "epoch": 2.05, + "grad_norm": 1.2283095121383667, + "learning_rate": 6.07371569758473e-05, + "loss": 0.7188, + "step": 2121 + }, + { + "epoch": 2.05, + "grad_norm": 1.4419937133789062, + "learning_rate": 6.062521424005378e-05, + "loss": 0.7527, + "step": 2122 + }, + { + "epoch": 2.05, + "grad_norm": 1.368731141090393, + "learning_rate": 6.051334172410074e-05, + "loss": 0.4648, + "step": 2123 + }, + { + "epoch": 2.05, + "grad_norm": 1.0447849035263062, + "learning_rate": 6.0401539550018494e-05, + "loss": 0.3418, + "step": 2124 + }, + { + "epoch": 2.05, + "grad_norm": 1.1050643920898438, + "learning_rate": 6.028980783976051e-05, + "loss": 0.4347, + "step": 2125 + }, + { + "epoch": 2.05, + "grad_norm": 1.1442632675170898, + "learning_rate": 6.017814671520339e-05, + "loss": 0.4525, + "step": 2126 + }, + { + "epoch": 2.05, + "grad_norm": 1.100126028060913, + "learning_rate": 6.006655629814686e-05, + "loss": 0.3395, + "step": 2127 + }, + { + "epoch": 2.05, + "grad_norm": 1.2252593040466309, + "learning_rate": 5.995503671031342e-05, + "loss": 0.6215, + "step": 2128 + }, + { + "epoch": 2.06, + "grad_norm": 0.6980013251304626, + "learning_rate": 5.9843588073348315e-05, + "loss": 0.2842, + "step": 2129 + }, + { + "epoch": 2.06, + "grad_norm": 1.2332143783569336, + "learning_rate": 5.9732210508819384e-05, + "loss": 0.4217, + "step": 2130 + }, + { + "epoch": 2.06, + "grad_norm": 0.9792746305465698, + "learning_rate": 5.962090413821704e-05, + "loss": 0.4573, + "step": 2131 + }, + { + "epoch": 2.06, + "grad_norm": 1.107351541519165, + "learning_rate": 5.950966908295394e-05, + "loss": 0.3899, + "step": 2132 + }, + { + "epoch": 2.06, + "grad_norm": 0.6578118205070496, + "learning_rate": 5.9398505464364926e-05, + "loss": 0.2116, + "step": 2133 + }, + { + "epoch": 2.06, + "grad_norm": 1.0342390537261963, + "learning_rate": 5.928741340370701e-05, + "loss": 0.3738, + "step": 2134 + }, + { + "epoch": 2.06, + "grad_norm": 1.2607660293579102, + "learning_rate": 5.9176393022159174e-05, + "loss": 0.4923, + "step": 2135 + }, + { + "epoch": 2.06, + "grad_norm": 1.0737662315368652, + "learning_rate": 5.906544444082207e-05, + "loss": 0.4356, + "step": 2136 + }, + { + "epoch": 2.06, + "grad_norm": 1.0083526372909546, + "learning_rate": 5.895456778071809e-05, + "loss": 0.3503, + "step": 2137 + }, + { + "epoch": 2.06, + "grad_norm": 1.2637425661087036, + "learning_rate": 5.884376316279126e-05, + "loss": 0.6458, + "step": 2138 + }, + { + "epoch": 2.06, + "grad_norm": 1.065369725227356, + "learning_rate": 5.8733030707906936e-05, + "loss": 0.5274, + "step": 2139 + }, + { + "epoch": 2.07, + "grad_norm": 1.150827169418335, + "learning_rate": 5.862237053685175e-05, + "loss": 0.3604, + "step": 2140 + }, + { + "epoch": 2.07, + "grad_norm": 1.1921899318695068, + "learning_rate": 5.851178277033349e-05, + "loss": 0.5229, + "step": 2141 + }, + { + "epoch": 2.07, + "grad_norm": 1.0592162609100342, + "learning_rate": 5.840126752898106e-05, + "loss": 0.4295, + "step": 2142 + }, + { + "epoch": 2.07, + "grad_norm": 1.2355237007141113, + "learning_rate": 5.8290824933344175e-05, + "loss": 0.7435, + "step": 2143 + }, + { + "epoch": 2.07, + "grad_norm": 0.973853349685669, + "learning_rate": 5.818045510389322e-05, + "loss": 0.2713, + "step": 2144 + }, + { + "epoch": 2.07, + "grad_norm": 1.0709655284881592, + "learning_rate": 5.807015816101943e-05, + "loss": 0.2912, + "step": 2145 + }, + { + "epoch": 2.07, + "grad_norm": 1.268304705619812, + "learning_rate": 5.795993422503436e-05, + "loss": 0.5308, + "step": 2146 + }, + { + "epoch": 2.07, + "grad_norm": 0.886254608631134, + "learning_rate": 5.784978341616992e-05, + "loss": 0.2374, + "step": 2147 + }, + { + "epoch": 2.07, + "grad_norm": 0.9648481011390686, + "learning_rate": 5.7739705854578424e-05, + "loss": 0.4132, + "step": 2148 + }, + { + "epoch": 2.07, + "grad_norm": 1.0699951648712158, + "learning_rate": 5.7629701660332104e-05, + "loss": 0.418, + "step": 2149 + }, + { + "epoch": 2.08, + "grad_norm": 1.2619209289550781, + "learning_rate": 5.7519770953423256e-05, + "loss": 0.5038, + "step": 2150 + }, + { + "epoch": 2.08, + "grad_norm": 0.9739295840263367, + "learning_rate": 5.740991385376393e-05, + "loss": 0.5273, + "step": 2151 + }, + { + "epoch": 2.08, + "grad_norm": 0.7552351951599121, + "learning_rate": 5.7300130481186054e-05, + "loss": 0.2951, + "step": 2152 + }, + { + "epoch": 2.08, + "grad_norm": 1.2434828281402588, + "learning_rate": 5.7190420955440996e-05, + "loss": 0.581, + "step": 2153 + }, + { + "epoch": 2.08, + "grad_norm": 1.1353987455368042, + "learning_rate": 5.708078539619954e-05, + "loss": 0.4426, + "step": 2154 + }, + { + "epoch": 2.08, + "grad_norm": 1.0610759258270264, + "learning_rate": 5.697122392305194e-05, + "loss": 0.4811, + "step": 2155 + }, + { + "epoch": 2.08, + "grad_norm": 2.2417352199554443, + "learning_rate": 5.6861736655507546e-05, + "loss": 0.3413, + "step": 2156 + }, + { + "epoch": 2.08, + "grad_norm": 1.0961496829986572, + "learning_rate": 5.6752323712994684e-05, + "loss": 0.3918, + "step": 2157 + }, + { + "epoch": 2.08, + "grad_norm": 0.965447187423706, + "learning_rate": 5.664298521486076e-05, + "loss": 0.3328, + "step": 2158 + }, + { + "epoch": 2.08, + "grad_norm": 1.3192046880722046, + "learning_rate": 5.6533721280371945e-05, + "loss": 0.5889, + "step": 2159 + }, + { + "epoch": 2.08, + "grad_norm": 1.0044875144958496, + "learning_rate": 5.6424532028713e-05, + "loss": 0.3278, + "step": 2160 + }, + { + "epoch": 2.09, + "grad_norm": 0.924491286277771, + "learning_rate": 5.631541757898727e-05, + "loss": 0.2975, + "step": 2161 + }, + { + "epoch": 2.09, + "grad_norm": 1.2042051553726196, + "learning_rate": 5.6206378050216434e-05, + "loss": 0.5941, + "step": 2162 + }, + { + "epoch": 2.09, + "grad_norm": 1.0397312641143799, + "learning_rate": 5.609741356134061e-05, + "loss": 0.312, + "step": 2163 + }, + { + "epoch": 2.09, + "grad_norm": 1.298201322555542, + "learning_rate": 5.598852423121793e-05, + "loss": 0.5786, + "step": 2164 + }, + { + "epoch": 2.09, + "grad_norm": 1.0598686933517456, + "learning_rate": 5.587971017862453e-05, + "loss": 0.2725, + "step": 2165 + }, + { + "epoch": 2.09, + "grad_norm": 1.0505437850952148, + "learning_rate": 5.577097152225456e-05, + "loss": 0.3511, + "step": 2166 + }, + { + "epoch": 2.09, + "grad_norm": 1.3280149698257446, + "learning_rate": 5.566230838071983e-05, + "loss": 0.6197, + "step": 2167 + }, + { + "epoch": 2.09, + "grad_norm": 1.2126686573028564, + "learning_rate": 5.555372087254976e-05, + "loss": 0.4556, + "step": 2168 + }, + { + "epoch": 2.09, + "grad_norm": 0.9505837559700012, + "learning_rate": 5.544520911619128e-05, + "loss": 0.3118, + "step": 2169 + }, + { + "epoch": 2.09, + "grad_norm": 1.4259228706359863, + "learning_rate": 5.533677323000882e-05, + "loss": 0.5703, + "step": 2170 + }, + { + "epoch": 2.1, + "grad_norm": 1.0361673831939697, + "learning_rate": 5.522841333228389e-05, + "loss": 0.3863, + "step": 2171 + }, + { + "epoch": 2.1, + "grad_norm": 1.2724729776382446, + "learning_rate": 5.512012954121513e-05, + "loss": 0.6094, + "step": 2172 + }, + { + "epoch": 2.1, + "grad_norm": 1.155977725982666, + "learning_rate": 5.501192197491829e-05, + "loss": 0.3416, + "step": 2173 + }, + { + "epoch": 2.1, + "grad_norm": 2.008774757385254, + "learning_rate": 5.490379075142586e-05, + "loss": 0.8598, + "step": 2174 + }, + { + "epoch": 2.1, + "grad_norm": 1.293354868888855, + "learning_rate": 5.479573598868701e-05, + "loss": 0.5247, + "step": 2175 + }, + { + "epoch": 2.1, + "grad_norm": 1.1473639011383057, + "learning_rate": 5.468775780456768e-05, + "loss": 0.5186, + "step": 2176 + }, + { + "epoch": 2.1, + "grad_norm": 1.0745112895965576, + "learning_rate": 5.457985631685015e-05, + "loss": 0.5412, + "step": 2177 + }, + { + "epoch": 2.1, + "grad_norm": 1.0768483877182007, + "learning_rate": 5.4472031643233054e-05, + "loss": 0.6722, + "step": 2178 + }, + { + "epoch": 2.1, + "grad_norm": 1.7210749387741089, + "learning_rate": 5.4364283901331206e-05, + "loss": 0.9224, + "step": 2179 + }, + { + "epoch": 2.1, + "grad_norm": 1.2692290544509888, + "learning_rate": 5.4256613208675586e-05, + "loss": 0.6347, + "step": 2180 + }, + { + "epoch": 2.11, + "grad_norm": 0.937003493309021, + "learning_rate": 5.4149019682713155e-05, + "loss": 0.3242, + "step": 2181 + }, + { + "epoch": 2.11, + "grad_norm": 1.2912002801895142, + "learning_rate": 5.4041503440806594e-05, + "loss": 0.4896, + "step": 2182 + }, + { + "epoch": 2.11, + "grad_norm": 1.1830394268035889, + "learning_rate": 5.3934064600234256e-05, + "loss": 0.5103, + "step": 2183 + }, + { + "epoch": 2.11, + "grad_norm": 1.0488359928131104, + "learning_rate": 5.382670327819024e-05, + "loss": 0.3721, + "step": 2184 + }, + { + "epoch": 2.11, + "grad_norm": 0.9774886965751648, + "learning_rate": 5.371941959178393e-05, + "loss": 0.3123, + "step": 2185 + }, + { + "epoch": 2.11, + "grad_norm": 0.9702259302139282, + "learning_rate": 5.361221365804006e-05, + "loss": 0.3097, + "step": 2186 + }, + { + "epoch": 2.11, + "grad_norm": 0.9657133221626282, + "learning_rate": 5.350508559389863e-05, + "loss": 0.3055, + "step": 2187 + }, + { + "epoch": 2.11, + "grad_norm": 1.1606115102767944, + "learning_rate": 5.339803551621461e-05, + "loss": 0.474, + "step": 2188 + }, + { + "epoch": 2.11, + "grad_norm": 1.0346829891204834, + "learning_rate": 5.329106354175792e-05, + "loss": 0.4327, + "step": 2189 + }, + { + "epoch": 2.11, + "grad_norm": 1.5432517528533936, + "learning_rate": 5.318416978721327e-05, + "loss": 0.7782, + "step": 2190 + }, + { + "epoch": 2.11, + "grad_norm": 0.9665737152099609, + "learning_rate": 5.307735436918015e-05, + "loss": 0.3279, + "step": 2191 + }, + { + "epoch": 2.12, + "grad_norm": 1.0604126453399658, + "learning_rate": 5.297061740417249e-05, + "loss": 0.3749, + "step": 2192 + }, + { + "epoch": 2.12, + "grad_norm": 1.3815175294876099, + "learning_rate": 5.286395900861862e-05, + "loss": 0.6345, + "step": 2193 + }, + { + "epoch": 2.12, + "grad_norm": 1.2199488878250122, + "learning_rate": 5.2757379298861345e-05, + "loss": 0.3725, + "step": 2194 + }, + { + "epoch": 2.12, + "grad_norm": 1.2785658836364746, + "learning_rate": 5.265087839115745e-05, + "loss": 0.5089, + "step": 2195 + }, + { + "epoch": 2.12, + "grad_norm": 1.1950833797454834, + "learning_rate": 5.254445640167781e-05, + "loss": 0.4159, + "step": 2196 + }, + { + "epoch": 2.12, + "grad_norm": 1.044345736503601, + "learning_rate": 5.243811344650731e-05, + "loss": 0.3404, + "step": 2197 + }, + { + "epoch": 2.12, + "grad_norm": 1.1644901037216187, + "learning_rate": 5.233184964164453e-05, + "loss": 0.4536, + "step": 2198 + }, + { + "epoch": 2.12, + "grad_norm": 1.763175368309021, + "learning_rate": 5.222566510300172e-05, + "loss": 0.8698, + "step": 2199 + }, + { + "epoch": 2.12, + "grad_norm": 1.374753713607788, + "learning_rate": 5.211955994640466e-05, + "loss": 0.5751, + "step": 2200 + }, + { + "epoch": 2.12, + "grad_norm": 1.0620756149291992, + "learning_rate": 5.2013534287592635e-05, + "loss": 0.4053, + "step": 2201 + }, + { + "epoch": 2.13, + "grad_norm": 1.3876912593841553, + "learning_rate": 5.1907588242218095e-05, + "loss": 0.5241, + "step": 2202 + }, + { + "epoch": 2.13, + "grad_norm": 1.2697367668151855, + "learning_rate": 5.1801721925846686e-05, + "loss": 0.4191, + "step": 2203 + }, + { + "epoch": 2.13, + "grad_norm": 1.1614696979522705, + "learning_rate": 5.1695935453957106e-05, + "loss": 0.4245, + "step": 2204 + }, + { + "epoch": 2.13, + "grad_norm": 1.3249655961990356, + "learning_rate": 5.159022894194101e-05, + "loss": 0.3672, + "step": 2205 + }, + { + "epoch": 2.13, + "grad_norm": 1.215848684310913, + "learning_rate": 5.148460250510273e-05, + "loss": 0.3222, + "step": 2206 + }, + { + "epoch": 2.13, + "grad_norm": 1.129189133644104, + "learning_rate": 5.1379056258659254e-05, + "loss": 0.3361, + "step": 2207 + }, + { + "epoch": 2.13, + "grad_norm": 1.1304503679275513, + "learning_rate": 5.12735903177402e-05, + "loss": 0.3433, + "step": 2208 + }, + { + "epoch": 2.13, + "grad_norm": 1.2168861627578735, + "learning_rate": 5.1168204797387526e-05, + "loss": 0.6113, + "step": 2209 + }, + { + "epoch": 2.13, + "grad_norm": 1.3831435441970825, + "learning_rate": 5.106289981255543e-05, + "loss": 0.6427, + "step": 2210 + }, + { + "epoch": 2.13, + "grad_norm": 1.728682518005371, + "learning_rate": 5.0957675478110295e-05, + "loss": 0.7229, + "step": 2211 + }, + { + "epoch": 2.14, + "grad_norm": 1.1218372583389282, + "learning_rate": 5.08525319088306e-05, + "loss": 0.4039, + "step": 2212 + }, + { + "epoch": 2.14, + "grad_norm": 1.3881877660751343, + "learning_rate": 5.0747469219406636e-05, + "loss": 0.5552, + "step": 2213 + }, + { + "epoch": 2.14, + "grad_norm": 0.7441228628158569, + "learning_rate": 5.0642487524440435e-05, + "loss": 0.2112, + "step": 2214 + }, + { + "epoch": 2.14, + "grad_norm": 0.9063644409179688, + "learning_rate": 5.053758693844583e-05, + "loss": 0.2105, + "step": 2215 + }, + { + "epoch": 2.14, + "grad_norm": 1.262115240097046, + "learning_rate": 5.0432767575848084e-05, + "loss": 0.6285, + "step": 2216 + }, + { + "epoch": 2.14, + "grad_norm": 1.1467372179031372, + "learning_rate": 5.032802955098383e-05, + "loss": 0.5605, + "step": 2217 + }, + { + "epoch": 2.14, + "grad_norm": 1.2581697702407837, + "learning_rate": 5.022337297810099e-05, + "loss": 0.7714, + "step": 2218 + }, + { + "epoch": 2.14, + "grad_norm": 1.472180724143982, + "learning_rate": 5.011879797135878e-05, + "loss": 0.6244, + "step": 2219 + }, + { + "epoch": 2.14, + "grad_norm": 1.2381073236465454, + "learning_rate": 5.001430464482726e-05, + "loss": 0.3459, + "step": 2220 + }, + { + "epoch": 2.14, + "grad_norm": 1.65796959400177, + "learning_rate": 4.990989311248742e-05, + "loss": 0.6309, + "step": 2221 + }, + { + "epoch": 2.14, + "grad_norm": 1.3571516275405884, + "learning_rate": 4.9805563488231175e-05, + "loss": 0.4847, + "step": 2222 + }, + { + "epoch": 2.15, + "grad_norm": 1.4138978719711304, + "learning_rate": 4.9701315885860937e-05, + "loss": 0.5352, + "step": 2223 + }, + { + "epoch": 2.15, + "grad_norm": 1.268650770187378, + "learning_rate": 4.9597150419089686e-05, + "loss": 0.3221, + "step": 2224 + }, + { + "epoch": 2.15, + "grad_norm": 1.0456558465957642, + "learning_rate": 4.94930672015409e-05, + "loss": 0.5437, + "step": 2225 + }, + { + "epoch": 2.15, + "grad_norm": 1.2303794622421265, + "learning_rate": 4.938906634674821e-05, + "loss": 0.4286, + "step": 2226 + }, + { + "epoch": 2.15, + "grad_norm": 1.1339678764343262, + "learning_rate": 4.928514796815545e-05, + "loss": 0.519, + "step": 2227 + }, + { + "epoch": 2.15, + "grad_norm": 1.1388310194015503, + "learning_rate": 4.918131217911658e-05, + "loss": 0.3987, + "step": 2228 + }, + { + "epoch": 2.15, + "grad_norm": 1.1729490756988525, + "learning_rate": 4.9077559092895296e-05, + "loss": 0.3761, + "step": 2229 + }, + { + "epoch": 2.15, + "grad_norm": 1.2407212257385254, + "learning_rate": 4.897388882266529e-05, + "loss": 0.3952, + "step": 2230 + }, + { + "epoch": 2.15, + "grad_norm": 1.1271698474884033, + "learning_rate": 4.887030148150975e-05, + "loss": 0.3412, + "step": 2231 + }, + { + "epoch": 2.15, + "grad_norm": 1.3033124208450317, + "learning_rate": 4.876679718242143e-05, + "loss": 0.5853, + "step": 2232 + }, + { + "epoch": 2.16, + "grad_norm": 1.2884212732315063, + "learning_rate": 4.8663376038302606e-05, + "loss": 0.5988, + "step": 2233 + }, + { + "epoch": 2.16, + "grad_norm": 1.3859330415725708, + "learning_rate": 4.856003816196476e-05, + "loss": 0.5825, + "step": 2234 + }, + { + "epoch": 2.16, + "grad_norm": 1.530526876449585, + "learning_rate": 4.84567836661285e-05, + "loss": 0.6353, + "step": 2235 + }, + { + "epoch": 2.16, + "grad_norm": 1.39138662815094, + "learning_rate": 4.835361266342365e-05, + "loss": 0.5961, + "step": 2236 + }, + { + "epoch": 2.16, + "grad_norm": 1.0767650604248047, + "learning_rate": 4.825052526638883e-05, + "loss": 0.4035, + "step": 2237 + }, + { + "epoch": 2.16, + "grad_norm": 0.9491590857505798, + "learning_rate": 4.814752158747146e-05, + "loss": 0.4804, + "step": 2238 + }, + { + "epoch": 2.16, + "grad_norm": 1.1009563207626343, + "learning_rate": 4.804460173902764e-05, + "loss": 0.3648, + "step": 2239 + }, + { + "epoch": 2.16, + "grad_norm": 1.2546765804290771, + "learning_rate": 4.794176583332213e-05, + "loss": 0.4318, + "step": 2240 + }, + { + "epoch": 2.16, + "grad_norm": 1.1268149614334106, + "learning_rate": 4.783901398252805e-05, + "loss": 0.367, + "step": 2241 + }, + { + "epoch": 2.16, + "grad_norm": 0.9629319906234741, + "learning_rate": 4.773634629872677e-05, + "loss": 0.3666, + "step": 2242 + }, + { + "epoch": 2.17, + "grad_norm": 1.2154501676559448, + "learning_rate": 4.7633762893908e-05, + "loss": 0.4603, + "step": 2243 + }, + { + "epoch": 2.17, + "grad_norm": 1.1271207332611084, + "learning_rate": 4.7531263879969406e-05, + "loss": 0.7909, + "step": 2244 + }, + { + "epoch": 2.17, + "grad_norm": 0.8619101643562317, + "learning_rate": 4.742884936871664e-05, + "loss": 0.2442, + "step": 2245 + }, + { + "epoch": 2.17, + "grad_norm": 0.8308319449424744, + "learning_rate": 4.7326519471863116e-05, + "loss": 0.3029, + "step": 2246 + }, + { + "epoch": 2.17, + "grad_norm": 0.9519083499908447, + "learning_rate": 4.7224274301030085e-05, + "loss": 0.2644, + "step": 2247 + }, + { + "epoch": 2.17, + "grad_norm": 1.3647533655166626, + "learning_rate": 4.712211396774628e-05, + "loss": 0.5271, + "step": 2248 + }, + { + "epoch": 2.17, + "grad_norm": 1.305850625038147, + "learning_rate": 4.7020038583447845e-05, + "loss": 0.635, + "step": 2249 + }, + { + "epoch": 2.17, + "grad_norm": 0.9908753037452698, + "learning_rate": 4.6918048259478445e-05, + "loss": 0.2768, + "step": 2250 + }, + { + "epoch": 2.17, + "grad_norm": 1.2427970170974731, + "learning_rate": 4.6816143107088734e-05, + "loss": 0.5618, + "step": 2251 + }, + { + "epoch": 2.17, + "grad_norm": 1.0422190427780151, + "learning_rate": 4.671432323743667e-05, + "loss": 0.3206, + "step": 2252 + }, + { + "epoch": 2.17, + "grad_norm": 0.9672759771347046, + "learning_rate": 4.6612588761586996e-05, + "loss": 0.3396, + "step": 2253 + }, + { + "epoch": 2.18, + "grad_norm": 0.9199987053871155, + "learning_rate": 4.6510939790511505e-05, + "loss": 0.3626, + "step": 2254 + }, + { + "epoch": 2.18, + "grad_norm": 1.0544426441192627, + "learning_rate": 4.640937643508857e-05, + "loss": 0.3362, + "step": 2255 + }, + { + "epoch": 2.18, + "grad_norm": 1.022659182548523, + "learning_rate": 4.6307898806103196e-05, + "loss": 0.4554, + "step": 2256 + }, + { + "epoch": 2.18, + "grad_norm": 1.1253160238265991, + "learning_rate": 4.620650701424689e-05, + "loss": 0.4351, + "step": 2257 + }, + { + "epoch": 2.18, + "grad_norm": 1.3265272378921509, + "learning_rate": 4.610520117011761e-05, + "loss": 0.56, + "step": 2258 + }, + { + "epoch": 2.18, + "grad_norm": 1.306456208229065, + "learning_rate": 4.600398138421946e-05, + "loss": 0.4775, + "step": 2259 + }, + { + "epoch": 2.18, + "grad_norm": 1.0992588996887207, + "learning_rate": 4.590284776696267e-05, + "loss": 0.4471, + "step": 2260 + }, + { + "epoch": 2.18, + "grad_norm": 1.5208834409713745, + "learning_rate": 4.5801800428663594e-05, + "loss": 0.7269, + "step": 2261 + }, + { + "epoch": 2.18, + "grad_norm": 1.2537895441055298, + "learning_rate": 4.570083947954433e-05, + "loss": 0.407, + "step": 2262 + }, + { + "epoch": 2.18, + "grad_norm": 1.3734372854232788, + "learning_rate": 4.5599965029732796e-05, + "loss": 0.7895, + "step": 2263 + }, + { + "epoch": 2.19, + "grad_norm": 1.1329195499420166, + "learning_rate": 4.5499177189262624e-05, + "loss": 0.4148, + "step": 2264 + }, + { + "epoch": 2.19, + "grad_norm": 1.6518480777740479, + "learning_rate": 4.539847606807289e-05, + "loss": 0.5311, + "step": 2265 + }, + { + "epoch": 2.19, + "grad_norm": 0.8638691902160645, + "learning_rate": 4.529786177600809e-05, + "loss": 0.3688, + "step": 2266 + }, + { + "epoch": 2.19, + "grad_norm": 1.7491494417190552, + "learning_rate": 4.5197334422817985e-05, + "loss": 0.4297, + "step": 2267 + }, + { + "epoch": 2.19, + "grad_norm": 1.2566337585449219, + "learning_rate": 4.509689411815761e-05, + "loss": 0.5839, + "step": 2268 + }, + { + "epoch": 2.19, + "grad_norm": 1.0148571729660034, + "learning_rate": 4.499654097158695e-05, + "loss": 0.2864, + "step": 2269 + }, + { + "epoch": 2.19, + "grad_norm": 0.9205653667449951, + "learning_rate": 4.489627509257089e-05, + "loss": 0.2769, + "step": 2270 + }, + { + "epoch": 2.19, + "grad_norm": 1.584425687789917, + "learning_rate": 4.479609659047927e-05, + "loss": 0.4954, + "step": 2271 + }, + { + "epoch": 2.19, + "grad_norm": 1.1181970834732056, + "learning_rate": 4.4696005574586474e-05, + "loss": 0.4403, + "step": 2272 + }, + { + "epoch": 2.19, + "grad_norm": 0.8639723658561707, + "learning_rate": 4.4596002154071515e-05, + "loss": 0.2166, + "step": 2273 + }, + { + "epoch": 2.19, + "grad_norm": 1.1667298078536987, + "learning_rate": 4.449608643801778e-05, + "loss": 0.3535, + "step": 2274 + }, + { + "epoch": 2.2, + "grad_norm": 1.1740378141403198, + "learning_rate": 4.43962585354132e-05, + "loss": 0.4346, + "step": 2275 + }, + { + "epoch": 2.2, + "grad_norm": 1.2857762575149536, + "learning_rate": 4.42965185551497e-05, + "loss": 0.6013, + "step": 2276 + }, + { + "epoch": 2.2, + "grad_norm": 1.3287543058395386, + "learning_rate": 4.41968666060234e-05, + "loss": 0.5719, + "step": 2277 + }, + { + "epoch": 2.2, + "grad_norm": 1.3587270975112915, + "learning_rate": 4.409730279673431e-05, + "loss": 0.5678, + "step": 2278 + }, + { + "epoch": 2.2, + "grad_norm": 1.4869335889816284, + "learning_rate": 4.399782723588645e-05, + "loss": 0.4488, + "step": 2279 + }, + { + "epoch": 2.2, + "grad_norm": 1.63266921043396, + "learning_rate": 4.389844003198745e-05, + "loss": 0.5575, + "step": 2280 + }, + { + "epoch": 2.2, + "grad_norm": 1.1633329391479492, + "learning_rate": 4.379914129344857e-05, + "loss": 0.3981, + "step": 2281 + }, + { + "epoch": 2.2, + "grad_norm": 1.0538408756256104, + "learning_rate": 4.3699931128584666e-05, + "loss": 0.3228, + "step": 2282 + }, + { + "epoch": 2.2, + "grad_norm": 1.7124279737472534, + "learning_rate": 4.36008096456139e-05, + "loss": 0.8432, + "step": 2283 + }, + { + "epoch": 2.2, + "grad_norm": 1.0280526876449585, + "learning_rate": 4.350177695265768e-05, + "loss": 0.3095, + "step": 2284 + }, + { + "epoch": 2.21, + "grad_norm": 0.974923849105835, + "learning_rate": 4.340283315774059e-05, + "loss": 0.2465, + "step": 2285 + }, + { + "epoch": 2.21, + "grad_norm": 1.1634050607681274, + "learning_rate": 4.3303978368790296e-05, + "loss": 0.3219, + "step": 2286 + }, + { + "epoch": 2.21, + "grad_norm": 1.2221295833587646, + "learning_rate": 4.320521269363734e-05, + "loss": 0.6253, + "step": 2287 + }, + { + "epoch": 2.21, + "grad_norm": 1.442355751991272, + "learning_rate": 4.310653624001497e-05, + "loss": 0.5688, + "step": 2288 + }, + { + "epoch": 2.21, + "grad_norm": 0.9941724538803101, + "learning_rate": 4.3007949115559275e-05, + "loss": 0.3502, + "step": 2289 + }, + { + "epoch": 2.21, + "grad_norm": 1.345016598701477, + "learning_rate": 4.29094514278088e-05, + "loss": 0.506, + "step": 2290 + }, + { + "epoch": 2.21, + "grad_norm": 1.1910314559936523, + "learning_rate": 4.2811043284204516e-05, + "loss": 0.4473, + "step": 2291 + }, + { + "epoch": 2.21, + "grad_norm": 1.248070240020752, + "learning_rate": 4.2712724792089804e-05, + "loss": 0.3997, + "step": 2292 + }, + { + "epoch": 2.21, + "grad_norm": 0.9537377953529358, + "learning_rate": 4.2614496058710204e-05, + "loss": 0.2997, + "step": 2293 + }, + { + "epoch": 2.21, + "grad_norm": 1.08860445022583, + "learning_rate": 4.2516357191213345e-05, + "loss": 0.3847, + "step": 2294 + }, + { + "epoch": 2.22, + "grad_norm": 1.0493781566619873, + "learning_rate": 4.2418308296648775e-05, + "loss": 0.2967, + "step": 2295 + }, + { + "epoch": 2.22, + "grad_norm": 0.9866517782211304, + "learning_rate": 4.232034948196806e-05, + "loss": 0.3186, + "step": 2296 + }, + { + "epoch": 2.22, + "grad_norm": 1.015709638595581, + "learning_rate": 4.222248085402435e-05, + "loss": 0.2899, + "step": 2297 + }, + { + "epoch": 2.22, + "grad_norm": 1.0806543827056885, + "learning_rate": 4.212470251957244e-05, + "loss": 0.4528, + "step": 2298 + }, + { + "epoch": 2.22, + "grad_norm": 1.2434687614440918, + "learning_rate": 4.2027014585268725e-05, + "loss": 0.6677, + "step": 2299 + }, + { + "epoch": 2.22, + "grad_norm": 1.008055567741394, + "learning_rate": 4.1929417157670974e-05, + "loss": 0.299, + "step": 2300 + }, + { + "epoch": 2.22, + "grad_norm": 1.2269865274429321, + "learning_rate": 4.183191034323815e-05, + "loss": 0.4347, + "step": 2301 + }, + { + "epoch": 2.22, + "grad_norm": 0.9386024475097656, + "learning_rate": 4.173449424833037e-05, + "loss": 0.3465, + "step": 2302 + }, + { + "epoch": 2.22, + "grad_norm": 1.2053933143615723, + "learning_rate": 4.1637168979208936e-05, + "loss": 0.4694, + "step": 2303 + }, + { + "epoch": 2.22, + "grad_norm": 0.9446846842765808, + "learning_rate": 4.153993464203594e-05, + "loss": 0.2619, + "step": 2304 + }, + { + "epoch": 2.22, + "grad_norm": 1.3338463306427002, + "learning_rate": 4.144279134287435e-05, + "loss": 0.4563, + "step": 2305 + }, + { + "epoch": 2.23, + "grad_norm": 1.197728157043457, + "learning_rate": 4.1345739187687735e-05, + "loss": 0.4348, + "step": 2306 + }, + { + "epoch": 2.23, + "grad_norm": 0.8967550992965698, + "learning_rate": 4.1248778282340446e-05, + "loss": 0.1982, + "step": 2307 + }, + { + "epoch": 2.23, + "grad_norm": 0.874433696269989, + "learning_rate": 4.11519087325971e-05, + "loss": 0.2406, + "step": 2308 + }, + { + "epoch": 2.23, + "grad_norm": 1.0845016241073608, + "learning_rate": 4.1055130644122696e-05, + "loss": 0.3382, + "step": 2309 + }, + { + "epoch": 2.23, + "grad_norm": 1.096617579460144, + "learning_rate": 4.09584441224826e-05, + "loss": 0.385, + "step": 2310 + }, + { + "epoch": 2.23, + "grad_norm": 1.0413081645965576, + "learning_rate": 4.086184927314218e-05, + "loss": 0.2901, + "step": 2311 + }, + { + "epoch": 2.23, + "grad_norm": 1.065138578414917, + "learning_rate": 4.0765346201466766e-05, + "loss": 0.3099, + "step": 2312 + }, + { + "epoch": 2.23, + "grad_norm": 1.1384869813919067, + "learning_rate": 4.066893501272176e-05, + "loss": 0.4254, + "step": 2313 + }, + { + "epoch": 2.23, + "grad_norm": 1.032501459121704, + "learning_rate": 4.0572615812072136e-05, + "loss": 0.3063, + "step": 2314 + }, + { + "epoch": 2.23, + "grad_norm": 1.5010746717453003, + "learning_rate": 4.047638870458266e-05, + "loss": 0.5251, + "step": 2315 + }, + { + "epoch": 2.24, + "grad_norm": 1.0425777435302734, + "learning_rate": 4.0380253795217524e-05, + "loss": 0.287, + "step": 2316 + }, + { + "epoch": 2.24, + "grad_norm": 0.8620198369026184, + "learning_rate": 4.028421118884053e-05, + "loss": 0.2375, + "step": 2317 + }, + { + "epoch": 2.24, + "grad_norm": 1.0534603595733643, + "learning_rate": 4.018826099021464e-05, + "loss": 0.3245, + "step": 2318 + }, + { + "epoch": 2.24, + "grad_norm": 0.8742746114730835, + "learning_rate": 4.009240330400203e-05, + "loss": 0.3547, + "step": 2319 + }, + { + "epoch": 2.24, + "grad_norm": 1.9023158550262451, + "learning_rate": 3.9996638234764085e-05, + "loss": 0.5178, + "step": 2320 + }, + { + "epoch": 2.24, + "grad_norm": 1.1507781744003296, + "learning_rate": 3.990096588696099e-05, + "loss": 0.37, + "step": 2321 + }, + { + "epoch": 2.24, + "grad_norm": 1.4944819211959839, + "learning_rate": 3.980538636495197e-05, + "loss": 0.4217, + "step": 2322 + }, + { + "epoch": 2.24, + "grad_norm": 1.1764553785324097, + "learning_rate": 3.9709899772994815e-05, + "loss": 0.3115, + "step": 2323 + }, + { + "epoch": 2.24, + "grad_norm": 1.1834471225738525, + "learning_rate": 3.961450621524615e-05, + "loss": 0.4248, + "step": 2324 + }, + { + "epoch": 2.24, + "grad_norm": 1.237874984741211, + "learning_rate": 3.951920579576093e-05, + "loss": 0.6359, + "step": 2325 + }, + { + "epoch": 2.25, + "grad_norm": 1.123441457748413, + "learning_rate": 3.9423998618492604e-05, + "loss": 0.3109, + "step": 2326 + }, + { + "epoch": 2.25, + "grad_norm": 1.0116443634033203, + "learning_rate": 3.932888478729288e-05, + "loss": 0.3113, + "step": 2327 + }, + { + "epoch": 2.25, + "grad_norm": 1.0924867391586304, + "learning_rate": 3.9233864405911726e-05, + "loss": 0.2679, + "step": 2328 + }, + { + "epoch": 2.25, + "grad_norm": 1.3951201438903809, + "learning_rate": 3.913893757799708e-05, + "loss": 0.4859, + "step": 2329 + }, + { + "epoch": 2.25, + "grad_norm": 1.3379281759262085, + "learning_rate": 3.904410440709483e-05, + "loss": 0.6546, + "step": 2330 + }, + { + "epoch": 2.25, + "grad_norm": 0.9945024847984314, + "learning_rate": 3.894936499664881e-05, + "loss": 0.2794, + "step": 2331 + }, + { + "epoch": 2.25, + "grad_norm": 1.0659005641937256, + "learning_rate": 3.885471945000046e-05, + "loss": 0.4962, + "step": 2332 + }, + { + "epoch": 2.25, + "grad_norm": 1.343820333480835, + "learning_rate": 3.8760167870388924e-05, + "loss": 0.7368, + "step": 2333 + }, + { + "epoch": 2.25, + "grad_norm": 0.9904674291610718, + "learning_rate": 3.8665710360950736e-05, + "loss": 0.285, + "step": 2334 + }, + { + "epoch": 2.25, + "grad_norm": 1.2321860790252686, + "learning_rate": 3.857134702471997e-05, + "loss": 0.3921, + "step": 2335 + }, + { + "epoch": 2.25, + "grad_norm": 1.082987904548645, + "learning_rate": 3.847707796462786e-05, + "loss": 0.422, + "step": 2336 + }, + { + "epoch": 2.26, + "grad_norm": 1.3982007503509521, + "learning_rate": 3.838290328350282e-05, + "loss": 0.4305, + "step": 2337 + }, + { + "epoch": 2.26, + "grad_norm": 0.945181667804718, + "learning_rate": 3.828882308407037e-05, + "loss": 0.2157, + "step": 2338 + }, + { + "epoch": 2.26, + "grad_norm": 1.1719783544540405, + "learning_rate": 3.8194837468952944e-05, + "loss": 0.2699, + "step": 2339 + }, + { + "epoch": 2.26, + "grad_norm": 0.8167923092842102, + "learning_rate": 3.8100946540669715e-05, + "loss": 0.3927, + "step": 2340 + }, + { + "epoch": 2.26, + "grad_norm": 1.3482931852340698, + "learning_rate": 3.800715040163675e-05, + "loss": 0.5332, + "step": 2341 + }, + { + "epoch": 2.26, + "grad_norm": 0.9033116102218628, + "learning_rate": 3.7913449154166586e-05, + "loss": 0.2163, + "step": 2342 + }, + { + "epoch": 2.26, + "grad_norm": 1.194532036781311, + "learning_rate": 3.781984290046828e-05, + "loss": 0.4082, + "step": 2343 + }, + { + "epoch": 2.26, + "grad_norm": 1.3558499813079834, + "learning_rate": 3.7726331742647226e-05, + "loss": 0.4672, + "step": 2344 + }, + { + "epoch": 2.26, + "grad_norm": 1.1556100845336914, + "learning_rate": 3.76329157827052e-05, + "loss": 0.3475, + "step": 2345 + }, + { + "epoch": 2.26, + "grad_norm": 1.085553526878357, + "learning_rate": 3.7539595122540096e-05, + "loss": 0.347, + "step": 2346 + }, + { + "epoch": 2.27, + "grad_norm": 0.914618968963623, + "learning_rate": 3.7446369863945827e-05, + "loss": 0.2416, + "step": 2347 + }, + { + "epoch": 2.27, + "grad_norm": 1.248558759689331, + "learning_rate": 3.735324010861217e-05, + "loss": 0.4279, + "step": 2348 + }, + { + "epoch": 2.27, + "grad_norm": 1.3649533987045288, + "learning_rate": 3.726020595812493e-05, + "loss": 0.3499, + "step": 2349 + }, + { + "epoch": 2.27, + "grad_norm": 1.2154896259307861, + "learning_rate": 3.716726751396543e-05, + "loss": 0.5063, + "step": 2350 + }, + { + "epoch": 2.27, + "grad_norm": 1.166061520576477, + "learning_rate": 3.707442487751064e-05, + "loss": 0.3876, + "step": 2351 + }, + { + "epoch": 2.27, + "grad_norm": 1.124474048614502, + "learning_rate": 3.698167815003314e-05, + "loss": 0.4285, + "step": 2352 + }, + { + "epoch": 2.27, + "grad_norm": 1.103945016860962, + "learning_rate": 3.688902743270077e-05, + "loss": 0.2512, + "step": 2353 + }, + { + "epoch": 2.27, + "grad_norm": 0.8782851099967957, + "learning_rate": 3.679647282657668e-05, + "loss": 0.2574, + "step": 2354 + }, + { + "epoch": 2.27, + "grad_norm": 1.2587329149246216, + "learning_rate": 3.670401443261913e-05, + "loss": 0.4832, + "step": 2355 + }, + { + "epoch": 2.27, + "grad_norm": 0.9656249284744263, + "learning_rate": 3.661165235168157e-05, + "loss": 0.3533, + "step": 2356 + }, + { + "epoch": 2.28, + "grad_norm": 1.1450453996658325, + "learning_rate": 3.651938668451227e-05, + "loss": 0.4874, + "step": 2357 + }, + { + "epoch": 2.28, + "grad_norm": 0.9184198379516602, + "learning_rate": 3.64272175317543e-05, + "loss": 0.374, + "step": 2358 + }, + { + "epoch": 2.28, + "grad_norm": 1.2353371381759644, + "learning_rate": 3.6335144993945646e-05, + "loss": 0.4365, + "step": 2359 + }, + { + "epoch": 2.28, + "grad_norm": 1.098734736442566, + "learning_rate": 3.6243169171518724e-05, + "loss": 0.3502, + "step": 2360 + }, + { + "epoch": 2.28, + "grad_norm": 1.0059494972229004, + "learning_rate": 3.61512901648005e-05, + "loss": 0.3741, + "step": 2361 + }, + { + "epoch": 2.28, + "grad_norm": 1.4236122369766235, + "learning_rate": 3.60595080740123e-05, + "loss": 0.6132, + "step": 2362 + }, + { + "epoch": 2.28, + "grad_norm": 1.201125144958496, + "learning_rate": 3.596782299926986e-05, + "loss": 0.4928, + "step": 2363 + }, + { + "epoch": 2.28, + "grad_norm": 1.2504150867462158, + "learning_rate": 3.587623504058299e-05, + "loss": 0.4114, + "step": 2364 + }, + { + "epoch": 2.28, + "grad_norm": 1.357285976409912, + "learning_rate": 3.578474429785553e-05, + "loss": 0.4101, + "step": 2365 + }, + { + "epoch": 2.28, + "grad_norm": 1.2139153480529785, + "learning_rate": 3.569335087088539e-05, + "loss": 0.441, + "step": 2366 + }, + { + "epoch": 2.28, + "grad_norm": 1.486601710319519, + "learning_rate": 3.560205485936428e-05, + "loss": 0.4436, + "step": 2367 + }, + { + "epoch": 2.29, + "grad_norm": 1.061099886894226, + "learning_rate": 3.551085636287755e-05, + "loss": 0.3589, + "step": 2368 + }, + { + "epoch": 2.29, + "grad_norm": 1.5647423267364502, + "learning_rate": 3.541975548090436e-05, + "loss": 0.5031, + "step": 2369 + }, + { + "epoch": 2.29, + "grad_norm": 1.248528003692627, + "learning_rate": 3.5328752312817306e-05, + "loss": 0.6558, + "step": 2370 + }, + { + "epoch": 2.29, + "grad_norm": 1.2119498252868652, + "learning_rate": 3.523784695788236e-05, + "loss": 0.509, + "step": 2371 + }, + { + "epoch": 2.29, + "grad_norm": 1.6099773645401, + "learning_rate": 3.514703951525886e-05, + "loss": 0.4712, + "step": 2372 + }, + { + "epoch": 2.29, + "grad_norm": 0.9579163193702698, + "learning_rate": 3.5056330083999235e-05, + "loss": 0.4728, + "step": 2373 + }, + { + "epoch": 2.29, + "grad_norm": 0.9517368078231812, + "learning_rate": 3.49657187630492e-05, + "loss": 0.3334, + "step": 2374 + }, + { + "epoch": 2.29, + "grad_norm": 1.0279760360717773, + "learning_rate": 3.487520565124729e-05, + "loss": 0.3993, + "step": 2375 + }, + { + "epoch": 2.29, + "grad_norm": 1.0859653949737549, + "learning_rate": 3.47847908473249e-05, + "loss": 0.4396, + "step": 2376 + }, + { + "epoch": 2.29, + "grad_norm": 1.1414768695831299, + "learning_rate": 3.4694474449906354e-05, + "loss": 0.4042, + "step": 2377 + }, + { + "epoch": 2.3, + "grad_norm": 1.0698778629302979, + "learning_rate": 3.460425655750848e-05, + "loss": 0.4052, + "step": 2378 + }, + { + "epoch": 2.3, + "grad_norm": 1.0497967004776, + "learning_rate": 3.451413726854069e-05, + "loss": 0.4626, + "step": 2379 + }, + { + "epoch": 2.3, + "grad_norm": 1.1601060628890991, + "learning_rate": 3.4424116681304905e-05, + "loss": 0.4332, + "step": 2380 + }, + { + "epoch": 2.3, + "grad_norm": 1.2115287780761719, + "learning_rate": 3.4334194893995304e-05, + "loss": 0.4783, + "step": 2381 + }, + { + "epoch": 2.3, + "grad_norm": 1.2976157665252686, + "learning_rate": 3.4244372004698346e-05, + "loss": 0.2723, + "step": 2382 + }, + { + "epoch": 2.3, + "grad_norm": 1.005487322807312, + "learning_rate": 3.415464811139254e-05, + "loss": 0.5934, + "step": 2383 + }, + { + "epoch": 2.3, + "grad_norm": 0.8844375014305115, + "learning_rate": 3.406502331194855e-05, + "loss": 0.3645, + "step": 2384 + }, + { + "epoch": 2.3, + "grad_norm": 0.8819464445114136, + "learning_rate": 3.3975497704128804e-05, + "loss": 0.3613, + "step": 2385 + }, + { + "epoch": 2.3, + "grad_norm": 0.8191189169883728, + "learning_rate": 3.388607138558758e-05, + "loss": 0.299, + "step": 2386 + }, + { + "epoch": 2.3, + "grad_norm": 1.0504474639892578, + "learning_rate": 3.379674445387092e-05, + "loss": 0.4023, + "step": 2387 + }, + { + "epoch": 2.31, + "grad_norm": 1.0183119773864746, + "learning_rate": 3.370751700641636e-05, + "loss": 0.3868, + "step": 2388 + }, + { + "epoch": 2.31, + "grad_norm": 1.4171295166015625, + "learning_rate": 3.3618389140552974e-05, + "loss": 0.4165, + "step": 2389 + }, + { + "epoch": 2.31, + "grad_norm": 1.227512001991272, + "learning_rate": 3.3529360953501135e-05, + "loss": 0.491, + "step": 2390 + }, + { + "epoch": 2.31, + "grad_norm": 1.2160135507583618, + "learning_rate": 3.3440432542372625e-05, + "loss": 0.4508, + "step": 2391 + }, + { + "epoch": 2.31, + "grad_norm": 1.185299038887024, + "learning_rate": 3.335160400417024e-05, + "loss": 0.4429, + "step": 2392 + }, + { + "epoch": 2.31, + "grad_norm": 1.095727562904358, + "learning_rate": 3.326287543578799e-05, + "loss": 0.3752, + "step": 2393 + }, + { + "epoch": 2.31, + "grad_norm": 1.0111972093582153, + "learning_rate": 3.3174246934010665e-05, + "loss": 0.2842, + "step": 2394 + }, + { + "epoch": 2.31, + "grad_norm": 1.4653085470199585, + "learning_rate": 3.3085718595514086e-05, + "loss": 0.6001, + "step": 2395 + }, + { + "epoch": 2.31, + "grad_norm": 1.3188362121582031, + "learning_rate": 3.299729051686468e-05, + "loss": 0.3737, + "step": 2396 + }, + { + "epoch": 2.31, + "grad_norm": 1.0367945432662964, + "learning_rate": 3.2908962794519495e-05, + "loss": 0.4076, + "step": 2397 + }, + { + "epoch": 2.31, + "grad_norm": 1.2478039264678955, + "learning_rate": 3.282073552482627e-05, + "loss": 0.4904, + "step": 2398 + }, + { + "epoch": 2.32, + "grad_norm": 1.0667824745178223, + "learning_rate": 3.273260880402303e-05, + "loss": 0.3117, + "step": 2399 + }, + { + "epoch": 2.32, + "grad_norm": 0.9630646705627441, + "learning_rate": 3.264458272823814e-05, + "loss": 0.3421, + "step": 2400 + }, + { + "epoch": 2.32, + "grad_norm": 1.0218043327331543, + "learning_rate": 3.255665739349019e-05, + "loss": 0.4979, + "step": 2401 + }, + { + "epoch": 2.32, + "grad_norm": 1.0328983068466187, + "learning_rate": 3.246883289568796e-05, + "loss": 0.3001, + "step": 2402 + }, + { + "epoch": 2.32, + "grad_norm": 0.900199830532074, + "learning_rate": 3.238110933063014e-05, + "loss": 0.2591, + "step": 2403 + }, + { + "epoch": 2.32, + "grad_norm": 1.41981840133667, + "learning_rate": 3.2293486794005303e-05, + "loss": 0.6724, + "step": 2404 + }, + { + "epoch": 2.32, + "grad_norm": 1.0938422679901123, + "learning_rate": 3.220596538139196e-05, + "loss": 0.5295, + "step": 2405 + }, + { + "epoch": 2.32, + "grad_norm": 1.074320673942566, + "learning_rate": 3.21185451882582e-05, + "loss": 0.2767, + "step": 2406 + }, + { + "epoch": 2.32, + "grad_norm": 0.9862778186798096, + "learning_rate": 3.2031226309961675e-05, + "loss": 0.2652, + "step": 2407 + }, + { + "epoch": 2.32, + "grad_norm": 1.1148425340652466, + "learning_rate": 3.1944008841749664e-05, + "loss": 0.2569, + "step": 2408 + }, + { + "epoch": 2.33, + "grad_norm": 1.2387659549713135, + "learning_rate": 3.18568928787587e-05, + "loss": 0.4366, + "step": 2409 + }, + { + "epoch": 2.33, + "grad_norm": 1.2367215156555176, + "learning_rate": 3.1769878516014635e-05, + "loss": 0.5176, + "step": 2410 + }, + { + "epoch": 2.33, + "grad_norm": 1.0763795375823975, + "learning_rate": 3.168296584843248e-05, + "loss": 0.3899, + "step": 2411 + }, + { + "epoch": 2.33, + "grad_norm": 1.3850998878479004, + "learning_rate": 3.1596154970816385e-05, + "loss": 0.5462, + "step": 2412 + }, + { + "epoch": 2.33, + "grad_norm": 1.4004285335540771, + "learning_rate": 3.1509445977859405e-05, + "loss": 0.365, + "step": 2413 + }, + { + "epoch": 2.33, + "grad_norm": 0.9528489708900452, + "learning_rate": 3.142283896414343e-05, + "loss": 0.2813, + "step": 2414 + }, + { + "epoch": 2.33, + "grad_norm": 1.764185905456543, + "learning_rate": 3.133633402413917e-05, + "loss": 0.6395, + "step": 2415 + }, + { + "epoch": 2.33, + "grad_norm": 1.0002202987670898, + "learning_rate": 3.124993125220603e-05, + "loss": 0.2165, + "step": 2416 + }, + { + "epoch": 2.33, + "grad_norm": 1.2747881412506104, + "learning_rate": 3.116363074259188e-05, + "loss": 0.47, + "step": 2417 + }, + { + "epoch": 2.33, + "grad_norm": 1.329888939857483, + "learning_rate": 3.1077432589433046e-05, + "loss": 0.4552, + "step": 2418 + }, + { + "epoch": 2.33, + "grad_norm": 1.0620276927947998, + "learning_rate": 3.09913368867543e-05, + "loss": 0.3858, + "step": 2419 + }, + { + "epoch": 2.34, + "grad_norm": 1.7584563493728638, + "learning_rate": 3.090534372846858e-05, + "loss": 0.7671, + "step": 2420 + }, + { + "epoch": 2.34, + "grad_norm": 1.1476322412490845, + "learning_rate": 3.081945320837698e-05, + "loss": 0.4855, + "step": 2421 + }, + { + "epoch": 2.34, + "grad_norm": 1.1755834817886353, + "learning_rate": 3.073366542016862e-05, + "loss": 0.335, + "step": 2422 + }, + { + "epoch": 2.34, + "grad_norm": 1.312078595161438, + "learning_rate": 3.064798045742062e-05, + "loss": 0.5109, + "step": 2423 + }, + { + "epoch": 2.34, + "grad_norm": 1.0246888399124146, + "learning_rate": 3.056239841359791e-05, + "loss": 0.2884, + "step": 2424 + }, + { + "epoch": 2.34, + "grad_norm": 1.1685364246368408, + "learning_rate": 3.047691938205309e-05, + "loss": 0.4957, + "step": 2425 + }, + { + "epoch": 2.34, + "grad_norm": 0.8868763446807861, + "learning_rate": 3.0391543456026543e-05, + "loss": 0.2575, + "step": 2426 + }, + { + "epoch": 2.34, + "grad_norm": 1.6953266859054565, + "learning_rate": 3.0306270728646042e-05, + "loss": 0.5522, + "step": 2427 + }, + { + "epoch": 2.34, + "grad_norm": 1.304674506187439, + "learning_rate": 3.0221101292926823e-05, + "loss": 0.216, + "step": 2428 + }, + { + "epoch": 2.34, + "grad_norm": 1.3259423971176147, + "learning_rate": 3.0136035241771524e-05, + "loss": 0.4338, + "step": 2429 + }, + { + "epoch": 2.35, + "grad_norm": 1.0263630151748657, + "learning_rate": 3.0051072667969956e-05, + "loss": 0.4382, + "step": 2430 + }, + { + "epoch": 2.35, + "grad_norm": 1.4216090440750122, + "learning_rate": 2.996621366419905e-05, + "loss": 0.4007, + "step": 2431 + }, + { + "epoch": 2.35, + "grad_norm": 1.4432778358459473, + "learning_rate": 2.9881458323022733e-05, + "loss": 0.5448, + "step": 2432 + }, + { + "epoch": 2.35, + "grad_norm": 0.9634547233581543, + "learning_rate": 2.9796806736891975e-05, + "loss": 0.3259, + "step": 2433 + }, + { + "epoch": 2.35, + "grad_norm": 1.2892754077911377, + "learning_rate": 2.971225899814446e-05, + "loss": 0.5287, + "step": 2434 + }, + { + "epoch": 2.35, + "grad_norm": 1.2004163265228271, + "learning_rate": 2.9627815199004593e-05, + "loss": 0.4014, + "step": 2435 + }, + { + "epoch": 2.35, + "grad_norm": 1.484840750694275, + "learning_rate": 2.9543475431583505e-05, + "loss": 0.4791, + "step": 2436 + }, + { + "epoch": 2.35, + "grad_norm": 1.1654618978500366, + "learning_rate": 2.9459239787878776e-05, + "loss": 0.3014, + "step": 2437 + }, + { + "epoch": 2.35, + "grad_norm": 1.3821611404418945, + "learning_rate": 2.937510835977439e-05, + "loss": 0.5759, + "step": 2438 + }, + { + "epoch": 2.35, + "grad_norm": 1.085803508758545, + "learning_rate": 2.9291081239040605e-05, + "loss": 0.3286, + "step": 2439 + }, + { + "epoch": 2.36, + "grad_norm": 0.9716854691505432, + "learning_rate": 2.920715851733413e-05, + "loss": 0.2903, + "step": 2440 + }, + { + "epoch": 2.36, + "grad_norm": 1.0556275844573975, + "learning_rate": 2.9123340286197565e-05, + "loss": 0.4209, + "step": 2441 + }, + { + "epoch": 2.36, + "grad_norm": 1.300984263420105, + "learning_rate": 2.9039626637059612e-05, + "loss": 0.3497, + "step": 2442 + }, + { + "epoch": 2.36, + "grad_norm": 1.464999794960022, + "learning_rate": 2.8956017661234846e-05, + "loss": 0.5126, + "step": 2443 + }, + { + "epoch": 2.36, + "grad_norm": 0.9424381852149963, + "learning_rate": 2.8872513449923797e-05, + "loss": 0.3025, + "step": 2444 + }, + { + "epoch": 2.36, + "grad_norm": 1.313447117805481, + "learning_rate": 2.8789114094212575e-05, + "loss": 0.4539, + "step": 2445 + }, + { + "epoch": 2.36, + "grad_norm": 1.538489580154419, + "learning_rate": 2.870581968507295e-05, + "loss": 0.6295, + "step": 2446 + }, + { + "epoch": 2.36, + "grad_norm": 1.1631628274917603, + "learning_rate": 2.8622630313362303e-05, + "loss": 0.3899, + "step": 2447 + }, + { + "epoch": 2.36, + "grad_norm": 1.0515528917312622, + "learning_rate": 2.8539546069823327e-05, + "loss": 0.33, + "step": 2448 + }, + { + "epoch": 2.36, + "grad_norm": 1.3940054178237915, + "learning_rate": 2.8456567045084105e-05, + "loss": 0.4198, + "step": 2449 + }, + { + "epoch": 2.36, + "grad_norm": 1.207826018333435, + "learning_rate": 2.8373693329657878e-05, + "loss": 0.5596, + "step": 2450 + }, + { + "epoch": 2.37, + "grad_norm": 1.2761030197143555, + "learning_rate": 2.8290925013943147e-05, + "loss": 0.496, + "step": 2451 + }, + { + "epoch": 2.37, + "grad_norm": 1.7271332740783691, + "learning_rate": 2.8208262188223337e-05, + "loss": 0.7644, + "step": 2452 + }, + { + "epoch": 2.37, + "grad_norm": 1.387328028678894, + "learning_rate": 2.8125704942666794e-05, + "loss": 0.5055, + "step": 2453 + }, + { + "epoch": 2.37, + "grad_norm": 1.200985312461853, + "learning_rate": 2.80432533673268e-05, + "loss": 0.4813, + "step": 2454 + }, + { + "epoch": 2.37, + "grad_norm": 1.1662079095840454, + "learning_rate": 2.796090755214127e-05, + "loss": 0.4065, + "step": 2455 + }, + { + "epoch": 2.37, + "grad_norm": 1.2971113920211792, + "learning_rate": 2.7878667586932776e-05, + "loss": 0.4729, + "step": 2456 + }, + { + "epoch": 2.37, + "grad_norm": 1.2580766677856445, + "learning_rate": 2.7796533561408502e-05, + "loss": 0.3174, + "step": 2457 + }, + { + "epoch": 2.37, + "grad_norm": 1.2634315490722656, + "learning_rate": 2.771450556515999e-05, + "loss": 0.3987, + "step": 2458 + }, + { + "epoch": 2.37, + "grad_norm": 1.1510578393936157, + "learning_rate": 2.763258368766315e-05, + "loss": 0.3606, + "step": 2459 + }, + { + "epoch": 2.37, + "grad_norm": 0.9381775856018066, + "learning_rate": 2.7550768018278107e-05, + "loss": 0.2334, + "step": 2460 + }, + { + "epoch": 2.38, + "grad_norm": 1.036440134048462, + "learning_rate": 2.7469058646249255e-05, + "loss": 0.3621, + "step": 2461 + }, + { + "epoch": 2.38, + "grad_norm": 1.370670199394226, + "learning_rate": 2.7387455660704857e-05, + "loss": 0.7745, + "step": 2462 + }, + { + "epoch": 2.38, + "grad_norm": 1.025439977645874, + "learning_rate": 2.7305959150657326e-05, + "loss": 0.3705, + "step": 2463 + }, + { + "epoch": 2.38, + "grad_norm": 1.114072322845459, + "learning_rate": 2.722456920500274e-05, + "loss": 0.4108, + "step": 2464 + }, + { + "epoch": 2.38, + "grad_norm": 0.997879683971405, + "learning_rate": 2.7143285912521117e-05, + "loss": 0.2627, + "step": 2465 + }, + { + "epoch": 2.38, + "grad_norm": 1.13973069190979, + "learning_rate": 2.7062109361876003e-05, + "loss": 0.4388, + "step": 2466 + }, + { + "epoch": 2.38, + "grad_norm": 1.333604335784912, + "learning_rate": 2.6981039641614523e-05, + "loss": 0.4221, + "step": 2467 + }, + { + "epoch": 2.38, + "grad_norm": 1.3350666761398315, + "learning_rate": 2.6900076840167377e-05, + "loss": 0.5773, + "step": 2468 + }, + { + "epoch": 2.38, + "grad_norm": 1.058940052986145, + "learning_rate": 2.6819221045848538e-05, + "loss": 0.3743, + "step": 2469 + }, + { + "epoch": 2.38, + "grad_norm": 1.0323387384414673, + "learning_rate": 2.673847234685528e-05, + "loss": 0.2662, + "step": 2470 + }, + { + "epoch": 2.39, + "grad_norm": 1.5275335311889648, + "learning_rate": 2.6657830831268034e-05, + "loss": 0.6337, + "step": 2471 + }, + { + "epoch": 2.39, + "grad_norm": 1.729491949081421, + "learning_rate": 2.6577296587050424e-05, + "loss": 0.9006, + "step": 2472 + }, + { + "epoch": 2.39, + "grad_norm": 1.2076356410980225, + "learning_rate": 2.6496869702048925e-05, + "loss": 0.4235, + "step": 2473 + }, + { + "epoch": 2.39, + "grad_norm": 0.8329384326934814, + "learning_rate": 2.6416550263992936e-05, + "loss": 0.2444, + "step": 2474 + }, + { + "epoch": 2.39, + "grad_norm": 1.3011808395385742, + "learning_rate": 2.633633836049476e-05, + "loss": 0.295, + "step": 2475 + }, + { + "epoch": 2.39, + "grad_norm": 1.2912174463272095, + "learning_rate": 2.6256234079049286e-05, + "loss": 0.4929, + "step": 2476 + }, + { + "epoch": 2.39, + "grad_norm": 1.389458179473877, + "learning_rate": 2.6176237507034044e-05, + "loss": 0.3581, + "step": 2477 + }, + { + "epoch": 2.39, + "grad_norm": 1.1142305135726929, + "learning_rate": 2.6096348731709033e-05, + "loss": 0.4159, + "step": 2478 + }, + { + "epoch": 2.39, + "grad_norm": 0.8560962677001953, + "learning_rate": 2.601656784021679e-05, + "loss": 0.2975, + "step": 2479 + }, + { + "epoch": 2.39, + "grad_norm": 1.4160083532333374, + "learning_rate": 2.5936894919582067e-05, + "loss": 0.385, + "step": 2480 + }, + { + "epoch": 2.39, + "grad_norm": 1.0558053255081177, + "learning_rate": 2.58573300567118e-05, + "loss": 0.4253, + "step": 2481 + }, + { + "epoch": 2.4, + "grad_norm": 0.969610333442688, + "learning_rate": 2.5777873338395235e-05, + "loss": 0.2576, + "step": 2482 + }, + { + "epoch": 2.4, + "grad_norm": 1.3804552555084229, + "learning_rate": 2.5698524851303488e-05, + "loss": 0.5133, + "step": 2483 + }, + { + "epoch": 2.4, + "grad_norm": 1.0985430479049683, + "learning_rate": 2.5619284681989624e-05, + "loss": 0.3691, + "step": 2484 + }, + { + "epoch": 2.4, + "grad_norm": 1.1674649715423584, + "learning_rate": 2.55401529168887e-05, + "loss": 0.329, + "step": 2485 + }, + { + "epoch": 2.4, + "grad_norm": 1.197788119316101, + "learning_rate": 2.546112964231735e-05, + "loss": 0.3808, + "step": 2486 + }, + { + "epoch": 2.4, + "grad_norm": 1.2476664781570435, + "learning_rate": 2.5382214944474012e-05, + "loss": 0.4382, + "step": 2487 + }, + { + "epoch": 2.4, + "grad_norm": 1.19683039188385, + "learning_rate": 2.5303408909438578e-05, + "loss": 0.4408, + "step": 2488 + }, + { + "epoch": 2.4, + "eval_loss": 1.3875722885131836, + "eval_runtime": 28.1984, + "eval_samples_per_second": 3.014, + "eval_steps_per_second": 1.525, + "step": 2488 + }, + { + "epoch": 2.4, + "grad_norm": 1.302209734916687, + "learning_rate": 2.5224711623172442e-05, + "loss": 0.4414, + "step": 2489 + }, + { + "epoch": 2.4, + "grad_norm": 1.1015681028366089, + "learning_rate": 2.514612317151846e-05, + "loss": 0.3905, + "step": 2490 + }, + { + "epoch": 2.4, + "grad_norm": 1.0991705656051636, + "learning_rate": 2.5067643640200642e-05, + "loss": 0.4307, + "step": 2491 + }, + { + "epoch": 2.41, + "grad_norm": 0.901372492313385, + "learning_rate": 2.4989273114824228e-05, + "loss": 0.2361, + "step": 2492 + }, + { + "epoch": 2.41, + "grad_norm": 0.8259288668632507, + "learning_rate": 2.491101168087563e-05, + "loss": 0.2363, + "step": 2493 + }, + { + "epoch": 2.41, + "grad_norm": 1.2005654573440552, + "learning_rate": 2.4832859423722183e-05, + "loss": 0.4297, + "step": 2494 + }, + { + "epoch": 2.41, + "grad_norm": 1.0432602167129517, + "learning_rate": 2.475481642861213e-05, + "loss": 0.2909, + "step": 2495 + }, + { + "epoch": 2.41, + "grad_norm": 1.3437654972076416, + "learning_rate": 2.4676882780674606e-05, + "loss": 0.4414, + "step": 2496 + }, + { + "epoch": 2.41, + "grad_norm": 1.3620154857635498, + "learning_rate": 2.4599058564919396e-05, + "loss": 0.4478, + "step": 2497 + }, + { + "epoch": 2.41, + "grad_norm": 1.377049207687378, + "learning_rate": 2.4521343866236945e-05, + "loss": 0.4468, + "step": 2498 + }, + { + "epoch": 2.41, + "grad_norm": 1.2696644067764282, + "learning_rate": 2.444373876939819e-05, + "loss": 0.4608, + "step": 2499 + }, + { + "epoch": 2.41, + "grad_norm": 1.1020128726959229, + "learning_rate": 2.4366243359054642e-05, + "loss": 0.4635, + "step": 2500 + }, + { + "epoch": 2.41, + "grad_norm": 0.926050066947937, + "learning_rate": 2.4288857719738044e-05, + "loss": 0.2092, + "step": 2501 + }, + { + "epoch": 2.42, + "grad_norm": 1.5648494958877563, + "learning_rate": 2.42115819358604e-05, + "loss": 0.7375, + "step": 2502 + }, + { + "epoch": 2.42, + "grad_norm": 1.2817730903625488, + "learning_rate": 2.413441609171399e-05, + "loss": 0.5618, + "step": 2503 + }, + { + "epoch": 2.42, + "grad_norm": 1.5735052824020386, + "learning_rate": 2.405736027147107e-05, + "loss": 0.5577, + "step": 2504 + }, + { + "epoch": 2.42, + "grad_norm": 1.031183123588562, + "learning_rate": 2.398041455918394e-05, + "loss": 0.4408, + "step": 2505 + }, + { + "epoch": 2.42, + "grad_norm": 1.3467059135437012, + "learning_rate": 2.390357903878472e-05, + "loss": 0.4451, + "step": 2506 + }, + { + "epoch": 2.42, + "grad_norm": 1.2030715942382812, + "learning_rate": 2.3826853794085445e-05, + "loss": 0.4461, + "step": 2507 + }, + { + "epoch": 2.42, + "grad_norm": 1.080093502998352, + "learning_rate": 2.3750238908777796e-05, + "loss": 0.3372, + "step": 2508 + }, + { + "epoch": 2.42, + "grad_norm": 1.9159194231033325, + "learning_rate": 2.367373446643302e-05, + "loss": 0.3811, + "step": 2509 + }, + { + "epoch": 2.42, + "grad_norm": 1.255796194076538, + "learning_rate": 2.3597340550502015e-05, + "loss": 0.5714, + "step": 2510 + }, + { + "epoch": 2.42, + "grad_norm": 1.7954977750778198, + "learning_rate": 2.3521057244315066e-05, + "loss": 0.739, + "step": 2511 + }, + { + "epoch": 2.42, + "grad_norm": 1.65315842628479, + "learning_rate": 2.344488463108177e-05, + "loss": 0.5431, + "step": 2512 + }, + { + "epoch": 2.43, + "grad_norm": 0.9481430053710938, + "learning_rate": 2.3368822793890963e-05, + "loss": 0.344, + "step": 2513 + }, + { + "epoch": 2.43, + "grad_norm": 0.8809416890144348, + "learning_rate": 2.3292871815710783e-05, + "loss": 0.1632, + "step": 2514 + }, + { + "epoch": 2.43, + "grad_norm": 0.9627254605293274, + "learning_rate": 2.3217031779388276e-05, + "loss": 0.2672, + "step": 2515 + }, + { + "epoch": 2.43, + "grad_norm": 1.5389256477355957, + "learning_rate": 2.3141302767649586e-05, + "loss": 0.8815, + "step": 2516 + }, + { + "epoch": 2.43, + "grad_norm": 1.2893158197402954, + "learning_rate": 2.306568486309965e-05, + "loss": 0.3986, + "step": 2517 + }, + { + "epoch": 2.43, + "grad_norm": 1.3341116905212402, + "learning_rate": 2.2990178148222365e-05, + "loss": 0.3183, + "step": 2518 + }, + { + "epoch": 2.43, + "grad_norm": 1.7593715190887451, + "learning_rate": 2.2914782705380192e-05, + "loss": 0.5908, + "step": 2519 + }, + { + "epoch": 2.43, + "grad_norm": 1.4277440309524536, + "learning_rate": 2.2839498616814252e-05, + "loss": 0.6308, + "step": 2520 + }, + { + "epoch": 2.43, + "grad_norm": 1.268128752708435, + "learning_rate": 2.2764325964644285e-05, + "loss": 0.5316, + "step": 2521 + }, + { + "epoch": 2.43, + "grad_norm": 1.3595266342163086, + "learning_rate": 2.2689264830868375e-05, + "loss": 0.4631, + "step": 2522 + }, + { + "epoch": 2.44, + "grad_norm": 1.1918679475784302, + "learning_rate": 2.261431529736299e-05, + "loss": 0.3397, + "step": 2523 + }, + { + "epoch": 2.44, + "grad_norm": 1.1698386669158936, + "learning_rate": 2.253947744588293e-05, + "loss": 0.5497, + "step": 2524 + }, + { + "epoch": 2.44, + "grad_norm": 0.8681797385215759, + "learning_rate": 2.2464751358061086e-05, + "loss": 0.309, + "step": 2525 + }, + { + "epoch": 2.44, + "grad_norm": 1.7204582691192627, + "learning_rate": 2.2390137115408465e-05, + "loss": 0.8977, + "step": 2526 + }, + { + "epoch": 2.44, + "grad_norm": 1.3766608238220215, + "learning_rate": 2.2315634799314062e-05, + "loss": 0.5383, + "step": 2527 + }, + { + "epoch": 2.44, + "grad_norm": 1.0767016410827637, + "learning_rate": 2.224124449104485e-05, + "loss": 0.4095, + "step": 2528 + }, + { + "epoch": 2.44, + "grad_norm": 1.1819124221801758, + "learning_rate": 2.216696627174554e-05, + "loss": 0.4, + "step": 2529 + }, + { + "epoch": 2.44, + "grad_norm": 1.377302646636963, + "learning_rate": 2.2092800222438607e-05, + "loss": 0.5399, + "step": 2530 + }, + { + "epoch": 2.44, + "grad_norm": 1.3226739168167114, + "learning_rate": 2.2018746424024198e-05, + "loss": 0.5938, + "step": 2531 + }, + { + "epoch": 2.44, + "grad_norm": 1.3821479082107544, + "learning_rate": 2.194480495727999e-05, + "loss": 0.2878, + "step": 2532 + }, + { + "epoch": 2.44, + "grad_norm": 1.3803555965423584, + "learning_rate": 2.187097590286109e-05, + "loss": 0.4424, + "step": 2533 + }, + { + "epoch": 2.45, + "grad_norm": 1.3531436920166016, + "learning_rate": 2.1797259341300057e-05, + "loss": 0.4328, + "step": 2534 + }, + { + "epoch": 2.45, + "grad_norm": 1.4491779804229736, + "learning_rate": 2.172365535300677e-05, + "loss": 0.5579, + "step": 2535 + }, + { + "epoch": 2.45, + "grad_norm": 1.1792582273483276, + "learning_rate": 2.1650164018268207e-05, + "loss": 0.4401, + "step": 2536 + }, + { + "epoch": 2.45, + "grad_norm": 1.2620794773101807, + "learning_rate": 2.1576785417248517e-05, + "loss": 0.4252, + "step": 2537 + }, + { + "epoch": 2.45, + "grad_norm": 1.075289011001587, + "learning_rate": 2.150351962998884e-05, + "loss": 0.4413, + "step": 2538 + }, + { + "epoch": 2.45, + "grad_norm": 1.0587613582611084, + "learning_rate": 2.1430366736407372e-05, + "loss": 0.3882, + "step": 2539 + }, + { + "epoch": 2.45, + "grad_norm": 1.1711111068725586, + "learning_rate": 2.1357326816299052e-05, + "loss": 0.3951, + "step": 2540 + }, + { + "epoch": 2.45, + "grad_norm": 0.9997272491455078, + "learning_rate": 2.1284399949335608e-05, + "loss": 0.2251, + "step": 2541 + }, + { + "epoch": 2.45, + "grad_norm": 1.2561250925064087, + "learning_rate": 2.1211586215065498e-05, + "loss": 0.4576, + "step": 2542 + }, + { + "epoch": 2.45, + "grad_norm": 1.2198114395141602, + "learning_rate": 2.113888569291375e-05, + "loss": 0.5197, + "step": 2543 + }, + { + "epoch": 2.46, + "grad_norm": 1.087884783744812, + "learning_rate": 2.1066298462181833e-05, + "loss": 0.4415, + "step": 2544 + }, + { + "epoch": 2.46, + "grad_norm": 0.8363281488418579, + "learning_rate": 2.099382460204778e-05, + "loss": 0.2089, + "step": 2545 + }, + { + "epoch": 2.46, + "grad_norm": 1.0022271871566772, + "learning_rate": 2.092146419156586e-05, + "loss": 0.2795, + "step": 2546 + }, + { + "epoch": 2.46, + "grad_norm": 1.191450834274292, + "learning_rate": 2.0849217309666575e-05, + "loss": 0.3494, + "step": 2547 + }, + { + "epoch": 2.46, + "grad_norm": 1.3831026554107666, + "learning_rate": 2.077708403515663e-05, + "loss": 0.488, + "step": 2548 + }, + { + "epoch": 2.46, + "grad_norm": 1.0185648202896118, + "learning_rate": 2.070506444671885e-05, + "loss": 0.2596, + "step": 2549 + }, + { + "epoch": 2.46, + "grad_norm": 1.1526098251342773, + "learning_rate": 2.0633158622911973e-05, + "loss": 0.3748, + "step": 2550 + }, + { + "epoch": 2.46, + "grad_norm": 1.4703809022903442, + "learning_rate": 2.056136664217065e-05, + "loss": 0.4452, + "step": 2551 + }, + { + "epoch": 2.46, + "grad_norm": 1.032928466796875, + "learning_rate": 2.0489688582805436e-05, + "loss": 0.3714, + "step": 2552 + }, + { + "epoch": 2.46, + "grad_norm": 1.4959150552749634, + "learning_rate": 2.041812452300254e-05, + "loss": 0.526, + "step": 2553 + }, + { + "epoch": 2.47, + "grad_norm": 1.0824545621871948, + "learning_rate": 2.0346674540823827e-05, + "loss": 0.456, + "step": 2554 + }, + { + "epoch": 2.47, + "grad_norm": 0.9160414934158325, + "learning_rate": 2.0275338714206725e-05, + "loss": 0.3043, + "step": 2555 + }, + { + "epoch": 2.47, + "grad_norm": 1.0382851362228394, + "learning_rate": 2.0204117120964177e-05, + "loss": 0.2935, + "step": 2556 + }, + { + "epoch": 2.47, + "grad_norm": 1.2792209386825562, + "learning_rate": 2.013300983878455e-05, + "loss": 0.4012, + "step": 2557 + }, + { + "epoch": 2.47, + "grad_norm": 0.9995631575584412, + "learning_rate": 2.0062016945231438e-05, + "loss": 0.2868, + "step": 2558 + }, + { + "epoch": 2.47, + "grad_norm": 1.1381494998931885, + "learning_rate": 1.9991138517743644e-05, + "loss": 0.3517, + "step": 2559 + }, + { + "epoch": 2.47, + "grad_norm": 1.2145975828170776, + "learning_rate": 1.992037463363526e-05, + "loss": 0.4534, + "step": 2560 + }, + { + "epoch": 2.47, + "grad_norm": 1.2908990383148193, + "learning_rate": 1.9849725370095274e-05, + "loss": 0.4325, + "step": 2561 + }, + { + "epoch": 2.47, + "grad_norm": 1.0607693195343018, + "learning_rate": 1.9779190804187705e-05, + "loss": 0.318, + "step": 2562 + }, + { + "epoch": 2.47, + "grad_norm": 0.806749165058136, + "learning_rate": 1.9708771012851513e-05, + "loss": 0.1497, + "step": 2563 + }, + { + "epoch": 2.47, + "grad_norm": 1.0389070510864258, + "learning_rate": 1.9638466072900374e-05, + "loss": 0.3352, + "step": 2564 + }, + { + "epoch": 2.48, + "grad_norm": 1.8952654600143433, + "learning_rate": 1.956827606102274e-05, + "loss": 0.6747, + "step": 2565 + }, + { + "epoch": 2.48, + "grad_norm": 1.2378493547439575, + "learning_rate": 1.9498201053781654e-05, + "loss": 0.2968, + "step": 2566 + }, + { + "epoch": 2.48, + "grad_norm": 1.103607177734375, + "learning_rate": 1.9428241127614794e-05, + "loss": 0.341, + "step": 2567 + }, + { + "epoch": 2.48, + "grad_norm": 1.4015883207321167, + "learning_rate": 1.935839635883424e-05, + "loss": 0.5368, + "step": 2568 + }, + { + "epoch": 2.48, + "grad_norm": 1.289624571800232, + "learning_rate": 1.9288666823626426e-05, + "loss": 0.3724, + "step": 2569 + }, + { + "epoch": 2.48, + "grad_norm": 0.9046264886856079, + "learning_rate": 1.9219052598052206e-05, + "loss": 0.3472, + "step": 2570 + }, + { + "epoch": 2.48, + "grad_norm": 1.072669506072998, + "learning_rate": 1.914955375804657e-05, + "loss": 0.3162, + "step": 2571 + }, + { + "epoch": 2.48, + "grad_norm": 1.3176350593566895, + "learning_rate": 1.9080170379418616e-05, + "loss": 0.2784, + "step": 2572 + }, + { + "epoch": 2.48, + "grad_norm": 1.032930612564087, + "learning_rate": 1.9010902537851607e-05, + "loss": 0.2979, + "step": 2573 + }, + { + "epoch": 2.48, + "grad_norm": 1.0955973863601685, + "learning_rate": 1.894175030890269e-05, + "loss": 0.3136, + "step": 2574 + }, + { + "epoch": 2.49, + "grad_norm": 1.3473103046417236, + "learning_rate": 1.8872713768002925e-05, + "loss": 0.3991, + "step": 2575 + }, + { + "epoch": 2.49, + "grad_norm": 0.9937989711761475, + "learning_rate": 1.880379299045716e-05, + "loss": 0.32, + "step": 2576 + }, + { + "epoch": 2.49, + "grad_norm": 1.153117299079895, + "learning_rate": 1.8734988051444038e-05, + "loss": 0.3852, + "step": 2577 + }, + { + "epoch": 2.49, + "grad_norm": 1.0335619449615479, + "learning_rate": 1.8666299026015785e-05, + "loss": 0.2585, + "step": 2578 + }, + { + "epoch": 2.49, + "grad_norm": 1.1094386577606201, + "learning_rate": 1.8597725989098145e-05, + "loss": 0.3719, + "step": 2579 + }, + { + "epoch": 2.49, + "grad_norm": 1.0681630373001099, + "learning_rate": 1.852926901549047e-05, + "loss": 0.3646, + "step": 2580 + }, + { + "epoch": 2.49, + "grad_norm": 1.2787219285964966, + "learning_rate": 1.846092817986543e-05, + "loss": 0.418, + "step": 2581 + }, + { + "epoch": 2.49, + "grad_norm": 1.0077821016311646, + "learning_rate": 1.8392703556769016e-05, + "loss": 0.5207, + "step": 2582 + }, + { + "epoch": 2.49, + "grad_norm": 0.9804518222808838, + "learning_rate": 1.832459522062041e-05, + "loss": 0.2823, + "step": 2583 + }, + { + "epoch": 2.49, + "grad_norm": 1.2154254913330078, + "learning_rate": 1.8256603245712087e-05, + "loss": 0.4744, + "step": 2584 + }, + { + "epoch": 2.5, + "grad_norm": 0.9940858483314514, + "learning_rate": 1.8188727706209456e-05, + "loss": 0.4446, + "step": 2585 + }, + { + "epoch": 2.5, + "grad_norm": 1.0796043872833252, + "learning_rate": 1.8120968676150983e-05, + "loss": 0.3029, + "step": 2586 + }, + { + "epoch": 2.5, + "grad_norm": 1.2031382322311401, + "learning_rate": 1.8053326229447998e-05, + "loss": 0.4059, + "step": 2587 + }, + { + "epoch": 2.5, + "grad_norm": 1.7458419799804688, + "learning_rate": 1.798580043988475e-05, + "loss": 0.4546, + "step": 2588 + }, + { + "epoch": 2.5, + "grad_norm": 0.965041995048523, + "learning_rate": 1.7918391381118158e-05, + "loss": 0.2469, + "step": 2589 + }, + { + "epoch": 2.5, + "grad_norm": 0.9897942543029785, + "learning_rate": 1.7851099126677816e-05, + "loss": 0.2851, + "step": 2590 + }, + { + "epoch": 2.5, + "grad_norm": 1.304334282875061, + "learning_rate": 1.7783923749966e-05, + "loss": 0.4326, + "step": 2591 + }, + { + "epoch": 2.5, + "grad_norm": 1.433759331703186, + "learning_rate": 1.771686532425737e-05, + "loss": 0.4978, + "step": 2592 + }, + { + "epoch": 2.5, + "grad_norm": 1.1951675415039062, + "learning_rate": 1.7649923922699096e-05, + "loss": 0.4889, + "step": 2593 + }, + { + "epoch": 2.5, + "grad_norm": 1.2474220991134644, + "learning_rate": 1.7583099618310645e-05, + "loss": 0.5135, + "step": 2594 + }, + { + "epoch": 2.5, + "grad_norm": 1.3049352169036865, + "learning_rate": 1.751639248398383e-05, + "loss": 0.29, + "step": 2595 + }, + { + "epoch": 2.51, + "grad_norm": 1.994936466217041, + "learning_rate": 1.7449802592482605e-05, + "loss": 0.751, + "step": 2596 + }, + { + "epoch": 2.51, + "grad_norm": 1.0624207258224487, + "learning_rate": 1.738333001644299e-05, + "loss": 0.3416, + "step": 2597 + }, + { + "epoch": 2.51, + "grad_norm": 1.1713080406188965, + "learning_rate": 1.731697482837316e-05, + "loss": 0.4251, + "step": 2598 + }, + { + "epoch": 2.51, + "grad_norm": 1.1803196668624878, + "learning_rate": 1.7250737100653127e-05, + "loss": 0.3597, + "step": 2599 + }, + { + "epoch": 2.51, + "grad_norm": 1.2651060819625854, + "learning_rate": 1.7184616905534828e-05, + "loss": 0.4793, + "step": 2600 + }, + { + "epoch": 2.51, + "grad_norm": 1.1736713647842407, + "learning_rate": 1.7118614315142014e-05, + "loss": 0.3246, + "step": 2601 + }, + { + "epoch": 2.51, + "grad_norm": 1.0619823932647705, + "learning_rate": 1.7052729401470123e-05, + "loss": 0.2607, + "step": 2602 + }, + { + "epoch": 2.51, + "grad_norm": 1.2877070903778076, + "learning_rate": 1.6986962236386212e-05, + "loss": 0.3589, + "step": 2603 + }, + { + "epoch": 2.51, + "grad_norm": 1.2267651557922363, + "learning_rate": 1.6921312891628974e-05, + "loss": 0.4943, + "step": 2604 + }, + { + "epoch": 2.51, + "grad_norm": 1.2303855419158936, + "learning_rate": 1.685578143880846e-05, + "loss": 0.5195, + "step": 2605 + }, + { + "epoch": 2.52, + "grad_norm": 1.031379222869873, + "learning_rate": 1.6790367949406292e-05, + "loss": 0.4127, + "step": 2606 + }, + { + "epoch": 2.52, + "grad_norm": 1.1098753213882446, + "learning_rate": 1.672507249477527e-05, + "loss": 0.333, + "step": 2607 + }, + { + "epoch": 2.52, + "grad_norm": 1.3014777898788452, + "learning_rate": 1.6659895146139498e-05, + "loss": 0.4767, + "step": 2608 + }, + { + "epoch": 2.52, + "grad_norm": 1.0662829875946045, + "learning_rate": 1.6594835974594267e-05, + "loss": 0.442, + "step": 2609 + }, + { + "epoch": 2.52, + "grad_norm": 1.6387438774108887, + "learning_rate": 1.652989505110594e-05, + "loss": 0.489, + "step": 2610 + }, + { + "epoch": 2.52, + "grad_norm": 0.8665334582328796, + "learning_rate": 1.646507244651188e-05, + "loss": 0.2739, + "step": 2611 + }, + { + "epoch": 2.52, + "grad_norm": 0.9079167246818542, + "learning_rate": 1.640036823152044e-05, + "loss": 0.177, + "step": 2612 + }, + { + "epoch": 2.52, + "grad_norm": 1.1278053522109985, + "learning_rate": 1.633578247671079e-05, + "loss": 0.1977, + "step": 2613 + }, + { + "epoch": 2.52, + "grad_norm": 1.1719645261764526, + "learning_rate": 1.6271315252532877e-05, + "loss": 0.4519, + "step": 2614 + }, + { + "epoch": 2.52, + "grad_norm": 1.4588279724121094, + "learning_rate": 1.6206966629307373e-05, + "loss": 0.4433, + "step": 2615 + }, + { + "epoch": 2.53, + "grad_norm": 0.8502780199050903, + "learning_rate": 1.6142736677225602e-05, + "loss": 0.1827, + "step": 2616 + }, + { + "epoch": 2.53, + "grad_norm": 1.2789843082427979, + "learning_rate": 1.6078625466349416e-05, + "loss": 0.4402, + "step": 2617 + }, + { + "epoch": 2.53, + "grad_norm": 1.2401293516159058, + "learning_rate": 1.601463306661112e-05, + "loss": 0.5667, + "step": 2618 + }, + { + "epoch": 2.53, + "grad_norm": 1.4416359663009644, + "learning_rate": 1.5950759547813498e-05, + "loss": 0.4659, + "step": 2619 + }, + { + "epoch": 2.53, + "grad_norm": 1.694312572479248, + "learning_rate": 1.5887004979629595e-05, + "loss": 0.6871, + "step": 2620 + }, + { + "epoch": 2.53, + "grad_norm": 1.3438770771026611, + "learning_rate": 1.5823369431602687e-05, + "loss": 0.4474, + "step": 2621 + }, + { + "epoch": 2.53, + "grad_norm": 1.0863066911697388, + "learning_rate": 1.5759852973146263e-05, + "loss": 0.3095, + "step": 2622 + }, + { + "epoch": 2.53, + "grad_norm": 0.8890448212623596, + "learning_rate": 1.5696455673543955e-05, + "loss": 0.2267, + "step": 2623 + }, + { + "epoch": 2.53, + "grad_norm": 1.167129397392273, + "learning_rate": 1.563317760194931e-05, + "loss": 0.3832, + "step": 2624 + }, + { + "epoch": 2.53, + "grad_norm": 1.2073942422866821, + "learning_rate": 1.5570018827385873e-05, + "loss": 0.3542, + "step": 2625 + }, + { + "epoch": 2.53, + "grad_norm": 1.1186821460723877, + "learning_rate": 1.550697941874711e-05, + "loss": 0.323, + "step": 2626 + }, + { + "epoch": 2.54, + "grad_norm": 1.2039204835891724, + "learning_rate": 1.544405944479617e-05, + "loss": 0.4761, + "step": 2627 + }, + { + "epoch": 2.54, + "grad_norm": 1.3113536834716797, + "learning_rate": 1.5381258974166063e-05, + "loss": 0.5072, + "step": 2628 + }, + { + "epoch": 2.54, + "grad_norm": 0.8303695321083069, + "learning_rate": 1.5318578075359314e-05, + "loss": 0.2384, + "step": 2629 + }, + { + "epoch": 2.54, + "grad_norm": 1.0342708826065063, + "learning_rate": 1.525601681674811e-05, + "loss": 0.4, + "step": 2630 + }, + { + "epoch": 2.54, + "grad_norm": 1.3412367105484009, + "learning_rate": 1.5193575266574097e-05, + "loss": 0.423, + "step": 2631 + }, + { + "epoch": 2.54, + "grad_norm": 1.3921438455581665, + "learning_rate": 1.5131253492948332e-05, + "loss": 0.4646, + "step": 2632 + }, + { + "epoch": 2.54, + "grad_norm": 1.6994187831878662, + "learning_rate": 1.5069051563851216e-05, + "loss": 0.7081, + "step": 2633 + }, + { + "epoch": 2.54, + "grad_norm": 1.210211157798767, + "learning_rate": 1.5006969547132502e-05, + "loss": 0.3938, + "step": 2634 + }, + { + "epoch": 2.54, + "grad_norm": 0.8103139400482178, + "learning_rate": 1.4945007510511046e-05, + "loss": 0.2196, + "step": 2635 + }, + { + "epoch": 2.54, + "grad_norm": 1.6641013622283936, + "learning_rate": 1.4883165521574857e-05, + "loss": 0.5646, + "step": 2636 + }, + { + "epoch": 2.55, + "grad_norm": 1.088254451751709, + "learning_rate": 1.4821443647781041e-05, + "loss": 0.3477, + "step": 2637 + }, + { + "epoch": 2.55, + "grad_norm": 1.6455141305923462, + "learning_rate": 1.4759841956455632e-05, + "loss": 0.5446, + "step": 2638 + }, + { + "epoch": 2.55, + "grad_norm": 1.1219127178192139, + "learning_rate": 1.4698360514793563e-05, + "loss": 0.331, + "step": 2639 + }, + { + "epoch": 2.55, + "grad_norm": 1.228132963180542, + "learning_rate": 1.4636999389858686e-05, + "loss": 0.4265, + "step": 2640 + }, + { + "epoch": 2.55, + "grad_norm": 1.4756618738174438, + "learning_rate": 1.4575758648583487e-05, + "loss": 0.7019, + "step": 2641 + }, + { + "epoch": 2.55, + "grad_norm": 0.95322585105896, + "learning_rate": 1.4514638357769241e-05, + "loss": 0.301, + "step": 2642 + }, + { + "epoch": 2.55, + "grad_norm": 1.2768433094024658, + "learning_rate": 1.4453638584085744e-05, + "loss": 0.4116, + "step": 2643 + }, + { + "epoch": 2.55, + "grad_norm": 1.3799253702163696, + "learning_rate": 1.4392759394071433e-05, + "loss": 0.4148, + "step": 2644 + }, + { + "epoch": 2.55, + "grad_norm": 1.175272822380066, + "learning_rate": 1.4332000854133151e-05, + "loss": 0.3961, + "step": 2645 + }, + { + "epoch": 2.55, + "grad_norm": 1.1589256525039673, + "learning_rate": 1.4271363030546116e-05, + "loss": 0.4175, + "step": 2646 + }, + { + "epoch": 2.56, + "grad_norm": 1.0186060667037964, + "learning_rate": 1.4210845989453949e-05, + "loss": 0.3345, + "step": 2647 + }, + { + "epoch": 2.56, + "grad_norm": 1.1230732202529907, + "learning_rate": 1.4150449796868442e-05, + "loss": 0.36, + "step": 2648 + }, + { + "epoch": 2.56, + "grad_norm": 1.1087164878845215, + "learning_rate": 1.4090174518669605e-05, + "loss": 0.3748, + "step": 2649 + }, + { + "epoch": 2.56, + "grad_norm": 1.1244251728057861, + "learning_rate": 1.4030020220605497e-05, + "loss": 0.2803, + "step": 2650 + }, + { + "epoch": 2.56, + "grad_norm": 1.1560827493667603, + "learning_rate": 1.3969986968292366e-05, + "loss": 0.4778, + "step": 2651 + }, + { + "epoch": 2.56, + "grad_norm": 0.9187933206558228, + "learning_rate": 1.391007482721425e-05, + "loss": 0.3128, + "step": 2652 + }, + { + "epoch": 2.56, + "grad_norm": 1.490533471107483, + "learning_rate": 1.3850283862723176e-05, + "loss": 0.4867, + "step": 2653 + }, + { + "epoch": 2.56, + "grad_norm": 1.209583044052124, + "learning_rate": 1.3790614140038918e-05, + "loss": 0.3516, + "step": 2654 + }, + { + "epoch": 2.56, + "grad_norm": 1.234925389289856, + "learning_rate": 1.3731065724249107e-05, + "loss": 0.5393, + "step": 2655 + }, + { + "epoch": 2.56, + "grad_norm": 1.2917462587356567, + "learning_rate": 1.3671638680308976e-05, + "loss": 0.4702, + "step": 2656 + }, + { + "epoch": 2.56, + "grad_norm": 1.0113184452056885, + "learning_rate": 1.3612333073041342e-05, + "loss": 0.2885, + "step": 2657 + }, + { + "epoch": 2.57, + "grad_norm": 1.164046049118042, + "learning_rate": 1.355314896713665e-05, + "loss": 0.4355, + "step": 2658 + }, + { + "epoch": 2.57, + "grad_norm": 1.1011210680007935, + "learning_rate": 1.3494086427152732e-05, + "loss": 0.3958, + "step": 2659 + }, + { + "epoch": 2.57, + "grad_norm": 1.180242896080017, + "learning_rate": 1.343514551751486e-05, + "loss": 0.3329, + "step": 2660 + }, + { + "epoch": 2.57, + "grad_norm": 1.20024836063385, + "learning_rate": 1.337632630251559e-05, + "loss": 0.4231, + "step": 2661 + }, + { + "epoch": 2.57, + "grad_norm": 1.0782047510147095, + "learning_rate": 1.3317628846314808e-05, + "loss": 0.344, + "step": 2662 + }, + { + "epoch": 2.57, + "grad_norm": 1.1018481254577637, + "learning_rate": 1.3259053212939507e-05, + "loss": 0.3958, + "step": 2663 + }, + { + "epoch": 2.57, + "grad_norm": 1.0811465978622437, + "learning_rate": 1.320059946628381e-05, + "loss": 0.3304, + "step": 2664 + }, + { + "epoch": 2.57, + "grad_norm": 1.1098639965057373, + "learning_rate": 1.3142267670108953e-05, + "loss": 0.3149, + "step": 2665 + }, + { + "epoch": 2.57, + "grad_norm": 1.4178462028503418, + "learning_rate": 1.308405788804308e-05, + "loss": 0.3847, + "step": 2666 + }, + { + "epoch": 2.57, + "grad_norm": 0.8337834477424622, + "learning_rate": 1.3025970183581232e-05, + "loss": 0.1917, + "step": 2667 + }, + { + "epoch": 2.58, + "grad_norm": 0.899633526802063, + "learning_rate": 1.2968004620085363e-05, + "loss": 0.2307, + "step": 2668 + }, + { + "epoch": 2.58, + "grad_norm": 1.339258074760437, + "learning_rate": 1.2910161260784149e-05, + "loss": 0.362, + "step": 2669 + }, + { + "epoch": 2.58, + "grad_norm": 1.5382481813430786, + "learning_rate": 1.2852440168772933e-05, + "loss": 0.4168, + "step": 2670 + }, + { + "epoch": 2.58, + "grad_norm": 1.2240447998046875, + "learning_rate": 1.279484140701373e-05, + "loss": 0.3134, + "step": 2671 + }, + { + "epoch": 2.58, + "grad_norm": 1.4382638931274414, + "learning_rate": 1.2737365038335141e-05, + "loss": 0.4588, + "step": 2672 + }, + { + "epoch": 2.58, + "grad_norm": 1.4672664403915405, + "learning_rate": 1.2680011125432212e-05, + "loss": 0.804, + "step": 2673 + }, + { + "epoch": 2.58, + "grad_norm": 1.018543004989624, + "learning_rate": 1.2622779730866424e-05, + "loss": 0.3097, + "step": 2674 + }, + { + "epoch": 2.58, + "grad_norm": 1.3507752418518066, + "learning_rate": 1.2565670917065647e-05, + "loss": 0.3549, + "step": 2675 + }, + { + "epoch": 2.58, + "grad_norm": 1.420772910118103, + "learning_rate": 1.2508684746324022e-05, + "loss": 0.3168, + "step": 2676 + }, + { + "epoch": 2.58, + "grad_norm": 1.2173686027526855, + "learning_rate": 1.2451821280801912e-05, + "loss": 0.3887, + "step": 2677 + }, + { + "epoch": 2.58, + "grad_norm": 1.013791561126709, + "learning_rate": 1.239508058252578e-05, + "loss": 0.416, + "step": 2678 + }, + { + "epoch": 2.59, + "grad_norm": 1.1254581212997437, + "learning_rate": 1.2338462713388302e-05, + "loss": 0.3371, + "step": 2679 + }, + { + "epoch": 2.59, + "grad_norm": 1.3764967918395996, + "learning_rate": 1.2281967735148045e-05, + "loss": 0.3323, + "step": 2680 + }, + { + "epoch": 2.59, + "grad_norm": 1.2175017595291138, + "learning_rate": 1.2225595709429577e-05, + "loss": 0.4322, + "step": 2681 + }, + { + "epoch": 2.59, + "grad_norm": 0.9489290118217468, + "learning_rate": 1.216934669772335e-05, + "loss": 0.1919, + "step": 2682 + }, + { + "epoch": 2.59, + "grad_norm": 1.3265076875686646, + "learning_rate": 1.211322076138567e-05, + "loss": 0.436, + "step": 2683 + }, + { + "epoch": 2.59, + "grad_norm": 1.6516838073730469, + "learning_rate": 1.2057217961638514e-05, + "loss": 0.7439, + "step": 2684 + }, + { + "epoch": 2.59, + "grad_norm": 1.1286579370498657, + "learning_rate": 1.2001338359569583e-05, + "loss": 0.5829, + "step": 2685 + }, + { + "epoch": 2.59, + "grad_norm": 1.074607491493225, + "learning_rate": 1.194558201613223e-05, + "loss": 0.3329, + "step": 2686 + }, + { + "epoch": 2.59, + "grad_norm": 0.897049605846405, + "learning_rate": 1.1889948992145294e-05, + "loss": 0.221, + "step": 2687 + }, + { + "epoch": 2.59, + "grad_norm": 0.8550513982772827, + "learning_rate": 1.1834439348293105e-05, + "loss": 0.2026, + "step": 2688 + }, + { + "epoch": 2.6, + "grad_norm": 1.5562115907669067, + "learning_rate": 1.1779053145125492e-05, + "loss": 0.8372, + "step": 2689 + }, + { + "epoch": 2.6, + "grad_norm": 1.3165887594223022, + "learning_rate": 1.1723790443057533e-05, + "loss": 0.5669, + "step": 2690 + }, + { + "epoch": 2.6, + "grad_norm": 1.4699093103408813, + "learning_rate": 1.1668651302369632e-05, + "loss": 0.4754, + "step": 2691 + }, + { + "epoch": 2.6, + "grad_norm": 1.1091874837875366, + "learning_rate": 1.1613635783207413e-05, + "loss": 0.4171, + "step": 2692 + }, + { + "epoch": 2.6, + "grad_norm": 1.1977604627609253, + "learning_rate": 1.155874394558168e-05, + "loss": 0.388, + "step": 2693 + }, + { + "epoch": 2.6, + "grad_norm": 1.3541066646575928, + "learning_rate": 1.150397584936827e-05, + "loss": 0.5098, + "step": 2694 + }, + { + "epoch": 2.6, + "grad_norm": 1.4909467697143555, + "learning_rate": 1.1449331554308065e-05, + "loss": 0.6722, + "step": 2695 + }, + { + "epoch": 2.6, + "grad_norm": 1.1262574195861816, + "learning_rate": 1.1394811120006967e-05, + "loss": 0.3256, + "step": 2696 + }, + { + "epoch": 2.6, + "grad_norm": 1.2156447172164917, + "learning_rate": 1.1340414605935643e-05, + "loss": 0.4615, + "step": 2697 + }, + { + "epoch": 2.6, + "grad_norm": 1.238400936126709, + "learning_rate": 1.1286142071429723e-05, + "loss": 0.285, + "step": 2698 + }, + { + "epoch": 2.61, + "grad_norm": 1.21889328956604, + "learning_rate": 1.1231993575689506e-05, + "loss": 0.3242, + "step": 2699 + }, + { + "epoch": 2.61, + "grad_norm": 1.147277593612671, + "learning_rate": 1.1177969177780051e-05, + "loss": 0.5024, + "step": 2700 + }, + { + "epoch": 2.61, + "grad_norm": 0.8607897162437439, + "learning_rate": 1.112406893663101e-05, + "loss": 0.2267, + "step": 2701 + }, + { + "epoch": 2.61, + "grad_norm": 1.7340608835220337, + "learning_rate": 1.1070292911036603e-05, + "loss": 0.6051, + "step": 2702 + }, + { + "epoch": 2.61, + "grad_norm": 0.8612796068191528, + "learning_rate": 1.1016641159655574e-05, + "loss": 0.3697, + "step": 2703 + }, + { + "epoch": 2.61, + "grad_norm": 0.6801596879959106, + "learning_rate": 1.0963113741011138e-05, + "loss": 0.1928, + "step": 2704 + }, + { + "epoch": 2.61, + "grad_norm": 1.342772364616394, + "learning_rate": 1.0909710713490824e-05, + "loss": 0.4311, + "step": 2705 + }, + { + "epoch": 2.61, + "grad_norm": 1.0663646459579468, + "learning_rate": 1.0856432135346511e-05, + "loss": 0.4227, + "step": 2706 + }, + { + "epoch": 2.61, + "grad_norm": 1.1165860891342163, + "learning_rate": 1.080327806469436e-05, + "loss": 0.2888, + "step": 2707 + }, + { + "epoch": 2.61, + "grad_norm": 1.1469182968139648, + "learning_rate": 1.0750248559514661e-05, + "loss": 0.4555, + "step": 2708 + }, + { + "epoch": 2.61, + "grad_norm": 1.4056341648101807, + "learning_rate": 1.0697343677651863e-05, + "loss": 0.5328, + "step": 2709 + }, + { + "epoch": 2.62, + "grad_norm": 0.9922555685043335, + "learning_rate": 1.0644563476814418e-05, + "loss": 0.2429, + "step": 2710 + }, + { + "epoch": 2.62, + "grad_norm": 1.1998769044876099, + "learning_rate": 1.0591908014574907e-05, + "loss": 0.3518, + "step": 2711 + }, + { + "epoch": 2.62, + "grad_norm": 1.6901216506958008, + "learning_rate": 1.0539377348369711e-05, + "loss": 0.7558, + "step": 2712 + }, + { + "epoch": 2.62, + "grad_norm": 1.06063711643219, + "learning_rate": 1.0486971535499129e-05, + "loss": 0.3632, + "step": 2713 + }, + { + "epoch": 2.62, + "grad_norm": 1.7559407949447632, + "learning_rate": 1.0434690633127326e-05, + "loss": 0.6834, + "step": 2714 + }, + { + "epoch": 2.62, + "grad_norm": 1.5990970134735107, + "learning_rate": 1.038253469828214e-05, + "loss": 0.7073, + "step": 2715 + }, + { + "epoch": 2.62, + "grad_norm": 1.2649844884872437, + "learning_rate": 1.0330503787855092e-05, + "loss": 0.4412, + "step": 2716 + }, + { + "epoch": 2.62, + "grad_norm": 1.3318960666656494, + "learning_rate": 1.0278597958601422e-05, + "loss": 0.354, + "step": 2717 + }, + { + "epoch": 2.62, + "grad_norm": 1.6753989458084106, + "learning_rate": 1.0226817267139824e-05, + "loss": 0.54, + "step": 2718 + }, + { + "epoch": 2.62, + "grad_norm": 1.4355957508087158, + "learning_rate": 1.0175161769952548e-05, + "loss": 0.5194, + "step": 2719 + }, + { + "epoch": 2.63, + "grad_norm": 1.2790511846542358, + "learning_rate": 1.0123631523385231e-05, + "loss": 0.6185, + "step": 2720 + }, + { + "epoch": 2.63, + "grad_norm": 1.0246258974075317, + "learning_rate": 1.0072226583646981e-05, + "loss": 0.1929, + "step": 2721 + }, + { + "epoch": 2.63, + "grad_norm": 1.2745766639709473, + "learning_rate": 1.0020947006810153e-05, + "loss": 0.3861, + "step": 2722 + }, + { + "epoch": 2.63, + "grad_norm": 1.1661189794540405, + "learning_rate": 9.96979284881036e-06, + "loss": 0.3073, + "step": 2723 + }, + { + "epoch": 2.63, + "grad_norm": 1.1576132774353027, + "learning_rate": 9.918764165446389e-06, + "loss": 0.4697, + "step": 2724 + }, + { + "epoch": 2.63, + "grad_norm": 0.875788688659668, + "learning_rate": 9.867861012380255e-06, + "loss": 0.2326, + "step": 2725 + }, + { + "epoch": 2.63, + "grad_norm": 1.3949896097183228, + "learning_rate": 9.817083445136932e-06, + "loss": 0.5425, + "step": 2726 + }, + { + "epoch": 2.63, + "grad_norm": 1.2175787687301636, + "learning_rate": 9.766431519104447e-06, + "loss": 0.3765, + "step": 2727 + }, + { + "epoch": 2.63, + "grad_norm": 1.4833611249923706, + "learning_rate": 9.71590528953381e-06, + "loss": 0.5869, + "step": 2728 + }, + { + "epoch": 2.63, + "grad_norm": 1.0458709001541138, + "learning_rate": 9.665504811538884e-06, + "loss": 0.4591, + "step": 2729 + }, + { + "epoch": 2.64, + "grad_norm": 1.4730372428894043, + "learning_rate": 9.615230140096367e-06, + "loss": 0.5924, + "step": 2730 + }, + { + "epoch": 2.64, + "grad_norm": 1.1391056776046753, + "learning_rate": 9.565081330045716e-06, + "loss": 0.3789, + "step": 2731 + }, + { + "epoch": 2.64, + "grad_norm": 1.3682113885879517, + "learning_rate": 9.515058436089158e-06, + "loss": 0.2783, + "step": 2732 + }, + { + "epoch": 2.64, + "grad_norm": 1.0194765329360962, + "learning_rate": 9.465161512791504e-06, + "loss": 0.2986, + "step": 2733 + }, + { + "epoch": 2.64, + "grad_norm": 1.2771401405334473, + "learning_rate": 9.415390614580175e-06, + "loss": 0.3452, + "step": 2734 + }, + { + "epoch": 2.64, + "grad_norm": 1.1327400207519531, + "learning_rate": 9.365745795745145e-06, + "loss": 0.4536, + "step": 2735 + }, + { + "epoch": 2.64, + "grad_norm": 1.046260952949524, + "learning_rate": 9.316227110438849e-06, + "loss": 0.4057, + "step": 2736 + }, + { + "epoch": 2.64, + "grad_norm": 1.086682677268982, + "learning_rate": 9.266834612676134e-06, + "loss": 0.4568, + "step": 2737 + }, + { + "epoch": 2.64, + "grad_norm": 1.1781115531921387, + "learning_rate": 9.217568356334166e-06, + "loss": 0.3688, + "step": 2738 + }, + { + "epoch": 2.64, + "grad_norm": 1.3109606504440308, + "learning_rate": 9.168428395152486e-06, + "loss": 0.3833, + "step": 2739 + }, + { + "epoch": 2.64, + "grad_norm": 1.3341201543807983, + "learning_rate": 9.119414782732829e-06, + "loss": 0.4116, + "step": 2740 + }, + { + "epoch": 2.65, + "grad_norm": 1.1230562925338745, + "learning_rate": 9.070527572539067e-06, + "loss": 0.3619, + "step": 2741 + }, + { + "epoch": 2.65, + "grad_norm": 1.5410712957382202, + "learning_rate": 9.021766817897292e-06, + "loss": 0.6043, + "step": 2742 + }, + { + "epoch": 2.65, + "grad_norm": 0.9172453880310059, + "learning_rate": 8.97313257199557e-06, + "loss": 0.3465, + "step": 2743 + }, + { + "epoch": 2.65, + "grad_norm": 1.0832079648971558, + "learning_rate": 8.92462488788398e-06, + "loss": 0.2514, + "step": 2744 + }, + { + "epoch": 2.65, + "grad_norm": 1.1828948259353638, + "learning_rate": 8.8762438184746e-06, + "loss": 0.3498, + "step": 2745 + }, + { + "epoch": 2.65, + "grad_norm": 1.2682899236679077, + "learning_rate": 8.827989416541394e-06, + "loss": 0.452, + "step": 2746 + }, + { + "epoch": 2.65, + "grad_norm": 1.2498657703399658, + "learning_rate": 8.779861734720104e-06, + "loss": 0.5518, + "step": 2747 + }, + { + "epoch": 2.65, + "grad_norm": 1.219550371170044, + "learning_rate": 8.73186082550828e-06, + "loss": 0.4601, + "step": 2748 + }, + { + "epoch": 2.65, + "grad_norm": 1.1973583698272705, + "learning_rate": 8.683986741265151e-06, + "loss": 0.4653, + "step": 2749 + }, + { + "epoch": 2.65, + "grad_norm": 1.2459843158721924, + "learning_rate": 8.63623953421168e-06, + "loss": 0.4605, + "step": 2750 + }, + { + "epoch": 2.66, + "grad_norm": 1.2489614486694336, + "learning_rate": 8.588619256430383e-06, + "loss": 0.3642, + "step": 2751 + }, + { + "epoch": 2.66, + "grad_norm": 1.1608028411865234, + "learning_rate": 8.541125959865295e-06, + "loss": 0.3983, + "step": 2752 + }, + { + "epoch": 2.66, + "grad_norm": 1.3494408130645752, + "learning_rate": 8.493759696322031e-06, + "loss": 0.4337, + "step": 2753 + }, + { + "epoch": 2.66, + "grad_norm": 1.4363130331039429, + "learning_rate": 8.44652051746754e-06, + "loss": 0.5642, + "step": 2754 + }, + { + "epoch": 2.66, + "grad_norm": 1.4055970907211304, + "learning_rate": 8.399408474830187e-06, + "loss": 0.3069, + "step": 2755 + }, + { + "epoch": 2.66, + "grad_norm": 1.1449294090270996, + "learning_rate": 8.352423619799682e-06, + "loss": 0.4492, + "step": 2756 + }, + { + "epoch": 2.66, + "grad_norm": 1.1602027416229248, + "learning_rate": 8.305566003626974e-06, + "loss": 0.3871, + "step": 2757 + }, + { + "epoch": 2.66, + "grad_norm": 1.279659390449524, + "learning_rate": 8.258835677424207e-06, + "loss": 0.4457, + "step": 2758 + }, + { + "epoch": 2.66, + "grad_norm": 1.2665632963180542, + "learning_rate": 8.212232692164687e-06, + "loss": 0.4153, + "step": 2759 + }, + { + "epoch": 2.66, + "grad_norm": 0.9248252511024475, + "learning_rate": 8.165757098682863e-06, + "loss": 0.1849, + "step": 2760 + }, + { + "epoch": 2.67, + "grad_norm": 1.4571242332458496, + "learning_rate": 8.119408947674157e-06, + "loss": 0.5424, + "step": 2761 + }, + { + "epoch": 2.67, + "grad_norm": 1.1283844709396362, + "learning_rate": 8.073188289694985e-06, + "loss": 0.4197, + "step": 2762 + }, + { + "epoch": 2.67, + "grad_norm": 1.0835353136062622, + "learning_rate": 8.027095175162757e-06, + "loss": 0.3708, + "step": 2763 + }, + { + "epoch": 2.67, + "grad_norm": 1.2077913284301758, + "learning_rate": 7.981129654355698e-06, + "loss": 0.3373, + "step": 2764 + }, + { + "epoch": 2.67, + "grad_norm": 1.2850297689437866, + "learning_rate": 7.93529177741288e-06, + "loss": 0.4046, + "step": 2765 + }, + { + "epoch": 2.67, + "grad_norm": 0.9630417823791504, + "learning_rate": 7.889581594334103e-06, + "loss": 0.302, + "step": 2766 + }, + { + "epoch": 2.67, + "grad_norm": 1.1772398948669434, + "learning_rate": 7.84399915497995e-06, + "loss": 0.3867, + "step": 2767 + }, + { + "epoch": 2.67, + "grad_norm": 1.3503518104553223, + "learning_rate": 7.798544509071596e-06, + "loss": 0.3149, + "step": 2768 + }, + { + "epoch": 2.67, + "grad_norm": 1.1282005310058594, + "learning_rate": 7.753217706190862e-06, + "loss": 0.3866, + "step": 2769 + }, + { + "epoch": 2.67, + "grad_norm": 0.9566932916641235, + "learning_rate": 7.708018795780079e-06, + "loss": 0.428, + "step": 2770 + }, + { + "epoch": 2.67, + "grad_norm": 1.6949591636657715, + "learning_rate": 7.662947827142123e-06, + "loss": 1.0602, + "step": 2771 + }, + { + "epoch": 2.68, + "grad_norm": 1.5378934144973755, + "learning_rate": 7.6180048494402656e-06, + "loss": 0.6878, + "step": 2772 + }, + { + "epoch": 2.68, + "grad_norm": 1.2534940242767334, + "learning_rate": 7.5731899116981696e-06, + "loss": 0.3789, + "step": 2773 + }, + { + "epoch": 2.68, + "grad_norm": 1.306345820426941, + "learning_rate": 7.528503062799882e-06, + "loss": 0.4367, + "step": 2774 + }, + { + "epoch": 2.68, + "grad_norm": 1.062415361404419, + "learning_rate": 7.4839443514896775e-06, + "loss": 0.4448, + "step": 2775 + }, + { + "epoch": 2.68, + "grad_norm": 1.4112834930419922, + "learning_rate": 7.439513826372091e-06, + "loss": 0.5658, + "step": 2776 + }, + { + "epoch": 2.68, + "grad_norm": 1.1098510026931763, + "learning_rate": 7.3952115359117725e-06, + "loss": 0.3965, + "step": 2777 + }, + { + "epoch": 2.68, + "grad_norm": 1.332319974899292, + "learning_rate": 7.351037528433594e-06, + "loss": 0.4484, + "step": 2778 + }, + { + "epoch": 2.68, + "grad_norm": 1.1495813131332397, + "learning_rate": 7.306991852122413e-06, + "loss": 0.3646, + "step": 2779 + }, + { + "epoch": 2.68, + "grad_norm": 1.1314551830291748, + "learning_rate": 7.263074555023133e-06, + "loss": 0.3927, + "step": 2780 + }, + { + "epoch": 2.68, + "grad_norm": 1.2844690084457397, + "learning_rate": 7.219285685040627e-06, + "loss": 0.2945, + "step": 2781 + }, + { + "epoch": 2.69, + "grad_norm": 0.8933795690536499, + "learning_rate": 7.175625289939688e-06, + "loss": 0.2147, + "step": 2782 + }, + { + "epoch": 2.69, + "grad_norm": 1.2195204496383667, + "learning_rate": 7.132093417344929e-06, + "loss": 0.4342, + "step": 2783 + }, + { + "epoch": 2.69, + "grad_norm": 1.2162507772445679, + "learning_rate": 7.0886901147408226e-06, + "loss": 0.3175, + "step": 2784 + }, + { + "epoch": 2.69, + "grad_norm": 0.8239806294441223, + "learning_rate": 7.045415429471569e-06, + "loss": 0.2895, + "step": 2785 + }, + { + "epoch": 2.69, + "grad_norm": 1.3585073947906494, + "learning_rate": 7.002269408741074e-06, + "loss": 0.4936, + "step": 2786 + }, + { + "epoch": 2.69, + "grad_norm": 1.044557809829712, + "learning_rate": 6.959252099612873e-06, + "loss": 0.3316, + "step": 2787 + }, + { + "epoch": 2.69, + "grad_norm": 0.9965437054634094, + "learning_rate": 6.9163635490101654e-06, + "loss": 0.4026, + "step": 2788 + }, + { + "epoch": 2.69, + "grad_norm": 1.2901641130447388, + "learning_rate": 6.873603803715642e-06, + "loss": 0.4783, + "step": 2789 + }, + { + "epoch": 2.69, + "grad_norm": 1.0988811254501343, + "learning_rate": 6.830972910371519e-06, + "loss": 0.4749, + "step": 2790 + }, + { + "epoch": 2.69, + "grad_norm": 1.1592463254928589, + "learning_rate": 6.788470915479475e-06, + "loss": 0.3396, + "step": 2791 + }, + { + "epoch": 2.69, + "grad_norm": 1.192961573600769, + "learning_rate": 6.7460978654005634e-06, + "loss": 0.2703, + "step": 2792 + }, + { + "epoch": 2.7, + "grad_norm": 1.6996821165084839, + "learning_rate": 6.703853806355187e-06, + "loss": 0.6261, + "step": 2793 + }, + { + "epoch": 2.7, + "grad_norm": 1.1305897235870361, + "learning_rate": 6.661738784423041e-06, + "loss": 0.4905, + "step": 2794 + }, + { + "epoch": 2.7, + "grad_norm": 1.7390271425247192, + "learning_rate": 6.6197528455431e-06, + "loss": 0.6344, + "step": 2795 + }, + { + "epoch": 2.7, + "grad_norm": 0.9697789549827576, + "learning_rate": 6.577896035513509e-06, + "loss": 0.3131, + "step": 2796 + }, + { + "epoch": 2.7, + "grad_norm": 1.1802033185958862, + "learning_rate": 6.536168399991535e-06, + "loss": 0.3639, + "step": 2797 + }, + { + "epoch": 2.7, + "grad_norm": 1.0719521045684814, + "learning_rate": 6.494569984493575e-06, + "loss": 0.2925, + "step": 2798 + }, + { + "epoch": 2.7, + "grad_norm": 1.4077153205871582, + "learning_rate": 6.453100834395098e-06, + "loss": 0.4654, + "step": 2799 + }, + { + "epoch": 2.7, + "grad_norm": 1.2906742095947266, + "learning_rate": 6.411760994930516e-06, + "loss": 0.4707, + "step": 2800 + }, + { + "epoch": 2.7, + "grad_norm": 1.1949652433395386, + "learning_rate": 6.370550511193202e-06, + "loss": 0.4016, + "step": 2801 + }, + { + "epoch": 2.7, + "grad_norm": 1.407058835029602, + "learning_rate": 6.329469428135462e-06, + "loss": 0.5245, + "step": 2802 + }, + { + "epoch": 2.71, + "grad_norm": 0.9813088178634644, + "learning_rate": 6.288517790568424e-06, + "loss": 0.3017, + "step": 2803 + }, + { + "epoch": 2.71, + "grad_norm": 1.1702017784118652, + "learning_rate": 6.247695643161993e-06, + "loss": 0.4415, + "step": 2804 + }, + { + "epoch": 2.71, + "grad_norm": 1.032965064048767, + "learning_rate": 6.207003030444914e-06, + "loss": 0.3122, + "step": 2805 + }, + { + "epoch": 2.71, + "grad_norm": 0.9007138013839722, + "learning_rate": 6.166439996804538e-06, + "loss": 0.3185, + "step": 2806 + }, + { + "epoch": 2.71, + "grad_norm": 1.3804324865341187, + "learning_rate": 6.126006586486918e-06, + "loss": 0.4441, + "step": 2807 + }, + { + "epoch": 2.71, + "grad_norm": 1.2858846187591553, + "learning_rate": 6.085702843596691e-06, + "loss": 0.4465, + "step": 2808 + }, + { + "epoch": 2.71, + "grad_norm": 1.1197959184646606, + "learning_rate": 6.045528812097093e-06, + "loss": 0.3603, + "step": 2809 + }, + { + "epoch": 2.71, + "grad_norm": 1.3286681175231934, + "learning_rate": 6.00548453580986e-06, + "loss": 0.5375, + "step": 2810 + }, + { + "epoch": 2.71, + "grad_norm": 1.308885097503662, + "learning_rate": 5.965570058415137e-06, + "loss": 0.5407, + "step": 2811 + }, + { + "epoch": 2.71, + "grad_norm": 1.5666403770446777, + "learning_rate": 5.925785423451569e-06, + "loss": 0.482, + "step": 2812 + }, + { + "epoch": 2.72, + "grad_norm": 1.2590450048446655, + "learning_rate": 5.886130674316106e-06, + "loss": 0.5029, + "step": 2813 + }, + { + "epoch": 2.72, + "grad_norm": 0.8867388963699341, + "learning_rate": 5.846605854264039e-06, + "loss": 0.2502, + "step": 2814 + }, + { + "epoch": 2.72, + "grad_norm": 1.2063246965408325, + "learning_rate": 5.807211006408908e-06, + "loss": 0.3824, + "step": 2815 + }, + { + "epoch": 2.72, + "grad_norm": 1.8002722263336182, + "learning_rate": 5.767946173722574e-06, + "loss": 0.5485, + "step": 2816 + }, + { + "epoch": 2.72, + "grad_norm": 1.7626540660858154, + "learning_rate": 5.728811399034958e-06, + "loss": 0.6995, + "step": 2817 + }, + { + "epoch": 2.72, + "grad_norm": 0.8790903091430664, + "learning_rate": 5.689806725034191e-06, + "loss": 0.2667, + "step": 2818 + }, + { + "epoch": 2.72, + "grad_norm": 1.3238037824630737, + "learning_rate": 5.6509321942664334e-06, + "loss": 0.5077, + "step": 2819 + }, + { + "epoch": 2.72, + "grad_norm": 1.1437560319900513, + "learning_rate": 5.612187849135947e-06, + "loss": 0.3441, + "step": 2820 + }, + { + "epoch": 2.72, + "grad_norm": 1.3718996047973633, + "learning_rate": 5.573573731904951e-06, + "loss": 0.5211, + "step": 2821 + }, + { + "epoch": 2.72, + "grad_norm": 1.0614670515060425, + "learning_rate": 5.5350898846935894e-06, + "loss": 0.2664, + "step": 2822 + }, + { + "epoch": 2.72, + "grad_norm": 1.1064701080322266, + "learning_rate": 5.496736349479989e-06, + "loss": 0.4569, + "step": 2823 + }, + { + "epoch": 2.73, + "grad_norm": 1.287601113319397, + "learning_rate": 5.458513168100046e-06, + "loss": 0.5515, + "step": 2824 + }, + { + "epoch": 2.73, + "grad_norm": 1.123247504234314, + "learning_rate": 5.420420382247521e-06, + "loss": 0.3065, + "step": 2825 + }, + { + "epoch": 2.73, + "grad_norm": 0.889412522315979, + "learning_rate": 5.382458033473897e-06, + "loss": 0.2599, + "step": 2826 + }, + { + "epoch": 2.73, + "grad_norm": 1.0928308963775635, + "learning_rate": 5.344626163188412e-06, + "loss": 0.2895, + "step": 2827 + }, + { + "epoch": 2.73, + "grad_norm": 1.2472971677780151, + "learning_rate": 5.306924812657987e-06, + "loss": 0.3615, + "step": 2828 + }, + { + "epoch": 2.73, + "grad_norm": 1.1193463802337646, + "learning_rate": 5.269354023007114e-06, + "loss": 0.438, + "step": 2829 + }, + { + "epoch": 2.73, + "grad_norm": 0.9217707514762878, + "learning_rate": 5.231913835217941e-06, + "loss": 0.3097, + "step": 2830 + }, + { + "epoch": 2.73, + "grad_norm": 1.294617772102356, + "learning_rate": 5.194604290130106e-06, + "loss": 0.5423, + "step": 2831 + }, + { + "epoch": 2.73, + "grad_norm": 1.2628179788589478, + "learning_rate": 5.157425428440762e-06, + "loss": 0.4024, + "step": 2832 + }, + { + "epoch": 2.73, + "grad_norm": 0.9742478728294373, + "learning_rate": 5.120377290704512e-06, + "loss": 0.3548, + "step": 2833 + }, + { + "epoch": 2.74, + "grad_norm": 1.4343669414520264, + "learning_rate": 5.083459917333361e-06, + "loss": 0.319, + "step": 2834 + }, + { + "epoch": 2.74, + "grad_norm": 1.464012861251831, + "learning_rate": 5.046673348596681e-06, + "loss": 0.5287, + "step": 2835 + }, + { + "epoch": 2.74, + "grad_norm": 1.1931774616241455, + "learning_rate": 5.010017624621152e-06, + "loss": 0.3692, + "step": 2836 + }, + { + "epoch": 2.74, + "grad_norm": 1.4124280214309692, + "learning_rate": 4.973492785390763e-06, + "loss": 0.4301, + "step": 2837 + }, + { + "epoch": 2.74, + "grad_norm": 0.9147828817367554, + "learning_rate": 4.937098870746671e-06, + "loss": 0.2231, + "step": 2838 + }, + { + "epoch": 2.74, + "grad_norm": 1.17789626121521, + "learning_rate": 4.900835920387287e-06, + "loss": 0.3442, + "step": 2839 + }, + { + "epoch": 2.74, + "grad_norm": 1.1786454916000366, + "learning_rate": 4.864703973868123e-06, + "loss": 0.3725, + "step": 2840 + }, + { + "epoch": 2.74, + "grad_norm": 0.9402766227722168, + "learning_rate": 4.82870307060182e-06, + "loss": 0.2451, + "step": 2841 + }, + { + "epoch": 2.74, + "grad_norm": 1.3032366037368774, + "learning_rate": 4.792833249858075e-06, + "loss": 0.4427, + "step": 2842 + }, + { + "epoch": 2.74, + "grad_norm": 1.2752867937088013, + "learning_rate": 4.757094550763549e-06, + "loss": 0.5675, + "step": 2843 + }, + { + "epoch": 2.75, + "grad_norm": 1.0933231115341187, + "learning_rate": 4.721487012301962e-06, + "loss": 0.3532, + "step": 2844 + }, + { + "epoch": 2.75, + "grad_norm": 1.2620882987976074, + "learning_rate": 4.686010673313909e-06, + "loss": 0.4495, + "step": 2845 + }, + { + "epoch": 2.75, + "grad_norm": 1.1065340042114258, + "learning_rate": 4.650665572496868e-06, + "loss": 0.464, + "step": 2846 + }, + { + "epoch": 2.75, + "grad_norm": 0.9144507050514221, + "learning_rate": 4.615451748405164e-06, + "loss": 0.2445, + "step": 2847 + }, + { + "epoch": 2.75, + "grad_norm": 1.1331830024719238, + "learning_rate": 4.580369239449989e-06, + "loss": 0.3741, + "step": 2848 + }, + { + "epoch": 2.75, + "grad_norm": 1.3149724006652832, + "learning_rate": 4.545418083899216e-06, + "loss": 0.4282, + "step": 2849 + }, + { + "epoch": 2.75, + "grad_norm": 1.846543550491333, + "learning_rate": 4.510598319877463e-06, + "loss": 0.5719, + "step": 2850 + }, + { + "epoch": 2.75, + "grad_norm": 1.2957957983016968, + "learning_rate": 4.475909985366066e-06, + "loss": 0.5492, + "step": 2851 + }, + { + "epoch": 2.75, + "grad_norm": 1.146454095840454, + "learning_rate": 4.441353118202942e-06, + "loss": 0.4014, + "step": 2852 + }, + { + "epoch": 2.75, + "grad_norm": 1.318638563156128, + "learning_rate": 4.406927756082618e-06, + "loss": 0.5174, + "step": 2853 + }, + { + "epoch": 2.75, + "grad_norm": 0.845371425151825, + "learning_rate": 4.372633936556195e-06, + "loss": 0.2613, + "step": 2854 + }, + { + "epoch": 2.76, + "grad_norm": 1.4363237619400024, + "learning_rate": 4.338471697031304e-06, + "loss": 0.4672, + "step": 2855 + }, + { + "epoch": 2.76, + "grad_norm": 0.9613076448440552, + "learning_rate": 4.304441074771986e-06, + "loss": 0.3455, + "step": 2856 + }, + { + "epoch": 2.76, + "grad_norm": 1.239383339881897, + "learning_rate": 4.270542106898756e-06, + "loss": 0.4925, + "step": 2857 + }, + { + "epoch": 2.76, + "grad_norm": 1.1034066677093506, + "learning_rate": 4.236774830388534e-06, + "loss": 0.3413, + "step": 2858 + }, + { + "epoch": 2.76, + "grad_norm": 1.3597811460494995, + "learning_rate": 4.2031392820745665e-06, + "loss": 0.347, + "step": 2859 + }, + { + "epoch": 2.76, + "grad_norm": 1.3148061037063599, + "learning_rate": 4.169635498646407e-06, + "loss": 0.4692, + "step": 2860 + }, + { + "epoch": 2.76, + "grad_norm": 1.3651525974273682, + "learning_rate": 4.136263516649891e-06, + "loss": 0.5351, + "step": 2861 + }, + { + "epoch": 2.76, + "grad_norm": 1.1956440210342407, + "learning_rate": 4.103023372487094e-06, + "loss": 0.4367, + "step": 2862 + }, + { + "epoch": 2.76, + "grad_norm": 1.114611268043518, + "learning_rate": 4.069915102416291e-06, + "loss": 0.4394, + "step": 2863 + }, + { + "epoch": 2.76, + "grad_norm": 1.038771390914917, + "learning_rate": 4.036938742551871e-06, + "loss": 0.2957, + "step": 2864 + }, + { + "epoch": 2.77, + "grad_norm": 1.0876121520996094, + "learning_rate": 4.004094328864338e-06, + "loss": 0.3007, + "step": 2865 + }, + { + "epoch": 2.77, + "grad_norm": 1.1242945194244385, + "learning_rate": 3.971381897180326e-06, + "loss": 0.2948, + "step": 2866 + }, + { + "epoch": 2.77, + "grad_norm": 1.0400104522705078, + "learning_rate": 3.938801483182458e-06, + "loss": 0.4133, + "step": 2867 + }, + { + "epoch": 2.77, + "grad_norm": 1.0308600664138794, + "learning_rate": 3.906353122409334e-06, + "loss": 0.2428, + "step": 2868 + }, + { + "epoch": 2.77, + "grad_norm": 1.1233997344970703, + "learning_rate": 3.8740368502555856e-06, + "loss": 0.3727, + "step": 2869 + }, + { + "epoch": 2.77, + "grad_norm": 1.2486021518707275, + "learning_rate": 3.841852701971668e-06, + "loss": 0.3912, + "step": 2870 + }, + { + "epoch": 2.77, + "grad_norm": 1.3520262241363525, + "learning_rate": 3.8098007126639857e-06, + "loss": 0.6819, + "step": 2871 + }, + { + "epoch": 2.77, + "grad_norm": 1.4625575542449951, + "learning_rate": 3.7778809172947514e-06, + "loss": 0.6033, + "step": 2872 + }, + { + "epoch": 2.77, + "grad_norm": 0.9293382167816162, + "learning_rate": 3.7460933506820032e-06, + "loss": 0.3032, + "step": 2873 + }, + { + "epoch": 2.77, + "grad_norm": 1.2324880361557007, + "learning_rate": 3.7144380474995326e-06, + "loss": 0.3844, + "step": 2874 + }, + { + "epoch": 2.78, + "grad_norm": 0.906033992767334, + "learning_rate": 3.682915042276816e-06, + "loss": 0.2879, + "step": 2875 + }, + { + "epoch": 2.78, + "grad_norm": 1.577123999595642, + "learning_rate": 3.6515243693991256e-06, + "loss": 0.4891, + "step": 2876 + }, + { + "epoch": 2.78, + "grad_norm": 1.0596559047698975, + "learning_rate": 3.6202660631072802e-06, + "loss": 0.2943, + "step": 2877 + }, + { + "epoch": 2.78, + "grad_norm": 0.8984016180038452, + "learning_rate": 3.5891401574977555e-06, + "loss": 0.1917, + "step": 2878 + }, + { + "epoch": 2.78, + "grad_norm": 1.0884068012237549, + "learning_rate": 3.5581466865226287e-06, + "loss": 0.3737, + "step": 2879 + }, + { + "epoch": 2.78, + "grad_norm": 1.3667714595794678, + "learning_rate": 3.5272856839894814e-06, + "loss": 0.5069, + "step": 2880 + }, + { + "epoch": 2.78, + "grad_norm": 1.6226465702056885, + "learning_rate": 3.4965571835614e-06, + "loss": 0.6759, + "step": 2881 + }, + { + "epoch": 2.78, + "grad_norm": 1.6317212581634521, + "learning_rate": 3.4659612187569604e-06, + "loss": 0.8197, + "step": 2882 + }, + { + "epoch": 2.78, + "grad_norm": 1.2995193004608154, + "learning_rate": 3.435497822950148e-06, + "loss": 0.4017, + "step": 2883 + }, + { + "epoch": 2.78, + "grad_norm": 1.2911146879196167, + "learning_rate": 3.405167029370368e-06, + "loss": 0.522, + "step": 2884 + }, + { + "epoch": 2.78, + "grad_norm": 1.1538434028625488, + "learning_rate": 3.3749688711023492e-06, + "loss": 0.2955, + "step": 2885 + }, + { + "epoch": 2.79, + "grad_norm": 1.2104440927505493, + "learning_rate": 3.344903381086159e-06, + "loss": 0.3577, + "step": 2886 + }, + { + "epoch": 2.79, + "grad_norm": 1.281571388244629, + "learning_rate": 3.3149705921171752e-06, + "loss": 0.4891, + "step": 2887 + }, + { + "epoch": 2.79, + "grad_norm": 1.0648771524429321, + "learning_rate": 3.2851705368459883e-06, + "loss": 0.3683, + "step": 2888 + }, + { + "epoch": 2.79, + "grad_norm": 1.3280951976776123, + "learning_rate": 3.2555032477784017e-06, + "loss": 0.4345, + "step": 2889 + }, + { + "epoch": 2.79, + "grad_norm": 1.4385279417037964, + "learning_rate": 3.2259687572754464e-06, + "loss": 0.7291, + "step": 2890 + }, + { + "epoch": 2.79, + "grad_norm": 0.8171462416648865, + "learning_rate": 3.1965670975532544e-06, + "loss": 0.2052, + "step": 2891 + }, + { + "epoch": 2.79, + "grad_norm": 1.12943434715271, + "learning_rate": 3.167298300683047e-06, + "loss": 0.4011, + "step": 2892 + }, + { + "epoch": 2.79, + "grad_norm": 1.2015761137008667, + "learning_rate": 3.138162398591174e-06, + "loss": 0.4803, + "step": 2893 + }, + { + "epoch": 2.79, + "grad_norm": 0.9145457148551941, + "learning_rate": 3.109159423058991e-06, + "loss": 0.2703, + "step": 2894 + }, + { + "epoch": 2.79, + "grad_norm": 1.4654762744903564, + "learning_rate": 3.080289405722886e-06, + "loss": 0.5258, + "step": 2895 + }, + { + "epoch": 2.8, + "grad_norm": 1.0232746601104736, + "learning_rate": 3.0515523780741403e-06, + "loss": 0.2846, + "step": 2896 + }, + { + "epoch": 2.8, + "grad_norm": 1.163627028465271, + "learning_rate": 3.0229483714590835e-06, + "loss": 0.2929, + "step": 2897 + }, + { + "epoch": 2.8, + "grad_norm": 1.0843936204910278, + "learning_rate": 2.994477417078867e-06, + "loss": 0.3374, + "step": 2898 + }, + { + "epoch": 2.8, + "grad_norm": 1.0600237846374512, + "learning_rate": 2.9661395459895103e-06, + "loss": 0.3439, + "step": 2899 + }, + { + "epoch": 2.8, + "grad_norm": 1.2277902364730835, + "learning_rate": 2.937934789101912e-06, + "loss": 0.4068, + "step": 2900 + }, + { + "epoch": 2.8, + "grad_norm": 1.0997445583343506, + "learning_rate": 2.9098631771817403e-06, + "loss": 0.3053, + "step": 2901 + }, + { + "epoch": 2.8, + "grad_norm": 1.4788862466812134, + "learning_rate": 2.8819247408494316e-06, + "loss": 0.5985, + "step": 2902 + }, + { + "epoch": 2.8, + "grad_norm": 1.0369575023651123, + "learning_rate": 2.854119510580136e-06, + "loss": 0.2808, + "step": 2903 + }, + { + "epoch": 2.8, + "grad_norm": 1.1584135293960571, + "learning_rate": 2.826447516703745e-06, + "loss": 0.4053, + "step": 2904 + }, + { + "epoch": 2.8, + "grad_norm": 1.258483648300171, + "learning_rate": 2.7989087894047945e-06, + "loss": 0.3445, + "step": 2905 + }, + { + "epoch": 2.81, + "grad_norm": 1.4125854969024658, + "learning_rate": 2.7715033587224214e-06, + "loss": 0.4582, + "step": 2906 + }, + { + "epoch": 2.81, + "grad_norm": 1.1342895030975342, + "learning_rate": 2.744231254550436e-06, + "loss": 0.3992, + "step": 2907 + }, + { + "epoch": 2.81, + "grad_norm": 1.0188727378845215, + "learning_rate": 2.7170925066371673e-06, + "loss": 0.3321, + "step": 2908 + }, + { + "epoch": 2.81, + "grad_norm": 0.8607746362686157, + "learning_rate": 2.6900871445854626e-06, + "loss": 0.3204, + "step": 2909 + }, + { + "epoch": 2.81, + "grad_norm": 1.1790703535079956, + "learning_rate": 2.663215197852717e-06, + "loss": 0.3829, + "step": 2910 + }, + { + "epoch": 2.81, + "grad_norm": 1.5910663604736328, + "learning_rate": 2.636476695750775e-06, + "loss": 0.4193, + "step": 2911 + }, + { + "epoch": 2.81, + "grad_norm": 1.5405675172805786, + "learning_rate": 2.6098716674459176e-06, + "loss": 0.5446, + "step": 2912 + }, + { + "epoch": 2.81, + "grad_norm": 0.7244126200675964, + "learning_rate": 2.583400141958847e-06, + "loss": 0.2043, + "step": 2913 + }, + { + "epoch": 2.81, + "grad_norm": 1.202313780784607, + "learning_rate": 2.5570621481646043e-06, + "loss": 0.3616, + "step": 2914 + }, + { + "epoch": 2.81, + "grad_norm": 1.1371735334396362, + "learning_rate": 2.5308577147926394e-06, + "loss": 0.3795, + "step": 2915 + }, + { + "epoch": 2.81, + "grad_norm": 1.038470983505249, + "learning_rate": 2.504786870426656e-06, + "loss": 0.2639, + "step": 2916 + }, + { + "epoch": 2.82, + "grad_norm": 1.292428970336914, + "learning_rate": 2.4788496435046437e-06, + "loss": 0.4548, + "step": 2917 + }, + { + "epoch": 2.82, + "grad_norm": 1.237728238105774, + "learning_rate": 2.453046062318887e-06, + "loss": 0.3921, + "step": 2918 + }, + { + "epoch": 2.82, + "grad_norm": 1.202520489692688, + "learning_rate": 2.4273761550158563e-06, + "loss": 0.5007, + "step": 2919 + }, + { + "epoch": 2.82, + "grad_norm": 1.2154775857925415, + "learning_rate": 2.4018399495961964e-06, + "loss": 0.4211, + "step": 2920 + }, + { + "epoch": 2.82, + "grad_norm": 1.4609144926071167, + "learning_rate": 2.37643747391475e-06, + "loss": 0.6236, + "step": 2921 + }, + { + "epoch": 2.82, + "grad_norm": 1.2069356441497803, + "learning_rate": 2.3511687556804634e-06, + "loss": 0.3363, + "step": 2922 + }, + { + "epoch": 2.82, + "grad_norm": 0.7999249696731567, + "learning_rate": 2.326033822456386e-06, + "loss": 0.2567, + "step": 2923 + }, + { + "epoch": 2.82, + "grad_norm": 1.2546871900558472, + "learning_rate": 2.3010327016596278e-06, + "loss": 0.3224, + "step": 2924 + }, + { + "epoch": 2.82, + "grad_norm": 1.238844394683838, + "learning_rate": 2.2761654205613603e-06, + "loss": 0.4932, + "step": 2925 + }, + { + "epoch": 2.82, + "grad_norm": 1.4584134817123413, + "learning_rate": 2.2514320062867325e-06, + "loss": 0.6393, + "step": 2926 + }, + { + "epoch": 2.83, + "grad_norm": 1.3758424520492554, + "learning_rate": 2.2268324858148724e-06, + "loss": 0.5076, + "step": 2927 + }, + { + "epoch": 2.83, + "grad_norm": 1.5602376461029053, + "learning_rate": 2.2023668859788985e-06, + "loss": 0.5522, + "step": 2928 + }, + { + "epoch": 2.83, + "grad_norm": 1.5447279214859009, + "learning_rate": 2.178035233465797e-06, + "loss": 0.6943, + "step": 2929 + }, + { + "epoch": 2.83, + "grad_norm": 1.119056224822998, + "learning_rate": 2.1538375548164903e-06, + "loss": 0.4048, + "step": 2930 + }, + { + "epoch": 2.83, + "grad_norm": 1.579376220703125, + "learning_rate": 2.129773876425711e-06, + "loss": 0.5035, + "step": 2931 + }, + { + "epoch": 2.83, + "grad_norm": 1.2969238758087158, + "learning_rate": 2.105844224542061e-06, + "loss": 0.4769, + "step": 2932 + }, + { + "epoch": 2.83, + "grad_norm": 0.8729587197303772, + "learning_rate": 2.0820486252679655e-06, + "loss": 0.2227, + "step": 2933 + }, + { + "epoch": 2.83, + "grad_norm": 1.4230064153671265, + "learning_rate": 2.058387104559578e-06, + "loss": 0.468, + "step": 2934 + }, + { + "epoch": 2.83, + "grad_norm": 1.446077585220337, + "learning_rate": 2.034859688226823e-06, + "loss": 0.3544, + "step": 2935 + }, + { + "epoch": 2.83, + "grad_norm": 1.2802209854125977, + "learning_rate": 2.0114664019333512e-06, + "loss": 0.4822, + "step": 2936 + }, + { + "epoch": 2.83, + "grad_norm": 1.3602778911590576, + "learning_rate": 1.9882072711964876e-06, + "loss": 0.4064, + "step": 2937 + }, + { + "epoch": 2.84, + "grad_norm": 1.9845119714736938, + "learning_rate": 1.9650823213872014e-06, + "loss": 0.6687, + "step": 2938 + }, + { + "epoch": 2.84, + "grad_norm": 1.3791077136993408, + "learning_rate": 1.942091577730176e-06, + "loss": 0.4893, + "step": 2939 + }, + { + "epoch": 2.84, + "grad_norm": 0.945283055305481, + "learning_rate": 1.919235065303629e-06, + "loss": 0.2887, + "step": 2940 + }, + { + "epoch": 2.84, + "grad_norm": 1.4401484727859497, + "learning_rate": 1.8965128090393802e-06, + "loss": 0.3711, + "step": 2941 + }, + { + "epoch": 2.84, + "grad_norm": 1.3931560516357422, + "learning_rate": 1.8739248337227844e-06, + "loss": 0.5732, + "step": 2942 + }, + { + "epoch": 2.84, + "grad_norm": 1.1365431547164917, + "learning_rate": 1.851471163992785e-06, + "loss": 0.3554, + "step": 2943 + }, + { + "epoch": 2.84, + "grad_norm": 1.4375975131988525, + "learning_rate": 1.8291518243417627e-06, + "loss": 0.4748, + "step": 2944 + }, + { + "epoch": 2.84, + "grad_norm": 1.0275496244430542, + "learning_rate": 1.8069668391155759e-06, + "loss": 0.2599, + "step": 2945 + }, + { + "epoch": 2.84, + "grad_norm": 0.8617614507675171, + "learning_rate": 1.7849162325135754e-06, + "loss": 0.1928, + "step": 2946 + }, + { + "epoch": 2.84, + "grad_norm": 1.23322594165802, + "learning_rate": 1.7630000285884795e-06, + "loss": 0.305, + "step": 2947 + }, + { + "epoch": 2.85, + "grad_norm": 1.6121059656143188, + "learning_rate": 1.7412182512464292e-06, + "loss": 0.4491, + "step": 2948 + }, + { + "epoch": 2.85, + "grad_norm": 1.5377715826034546, + "learning_rate": 1.7195709242469465e-06, + "loss": 0.49, + "step": 2949 + }, + { + "epoch": 2.85, + "grad_norm": 1.2269455194473267, + "learning_rate": 1.6980580712028514e-06, + "loss": 0.3071, + "step": 2950 + }, + { + "epoch": 2.85, + "grad_norm": 1.3769416809082031, + "learning_rate": 1.6766797155803315e-06, + "loss": 0.4086, + "step": 2951 + }, + { + "epoch": 2.85, + "grad_norm": 1.137343168258667, + "learning_rate": 1.6554358806988024e-06, + "loss": 0.3242, + "step": 2952 + }, + { + "epoch": 2.85, + "grad_norm": 0.8603381514549255, + "learning_rate": 1.6343265897310055e-06, + "loss": 0.2855, + "step": 2953 + }, + { + "epoch": 2.85, + "grad_norm": 1.6535147428512573, + "learning_rate": 1.6133518657028972e-06, + "loss": 0.7309, + "step": 2954 + }, + { + "epoch": 2.85, + "grad_norm": 1.2747342586517334, + "learning_rate": 1.5925117314936484e-06, + "loss": 0.4667, + "step": 2955 + }, + { + "epoch": 2.85, + "grad_norm": 1.1546356678009033, + "learning_rate": 1.5718062098356023e-06, + "loss": 0.4735, + "step": 2956 + }, + { + "epoch": 2.85, + "grad_norm": 1.2965714931488037, + "learning_rate": 1.5512353233143178e-06, + "loss": 0.3944, + "step": 2957 + }, + { + "epoch": 2.86, + "grad_norm": 1.7952131032943726, + "learning_rate": 1.530799094368443e-06, + "loss": 0.5763, + "step": 2958 + }, + { + "epoch": 2.86, + "grad_norm": 1.420607566833496, + "learning_rate": 1.5104975452897573e-06, + "loss": 0.4716, + "step": 2959 + }, + { + "epoch": 2.86, + "grad_norm": 1.3332853317260742, + "learning_rate": 1.490330698223158e-06, + "loss": 0.4557, + "step": 2960 + }, + { + "epoch": 2.86, + "grad_norm": 1.3416107892990112, + "learning_rate": 1.4702985751665482e-06, + "loss": 0.4973, + "step": 2961 + }, + { + "epoch": 2.86, + "grad_norm": 0.9640762209892273, + "learning_rate": 1.4504011979709491e-06, + "loss": 0.3745, + "step": 2962 + }, + { + "epoch": 2.86, + "grad_norm": 1.2113425731658936, + "learning_rate": 1.4306385883403329e-06, + "loss": 0.4207, + "step": 2963 + }, + { + "epoch": 2.86, + "grad_norm": 1.397342562675476, + "learning_rate": 1.411010767831733e-06, + "loss": 0.5736, + "step": 2964 + }, + { + "epoch": 2.86, + "grad_norm": 1.103853464126587, + "learning_rate": 1.391517757855107e-06, + "loss": 0.3447, + "step": 2965 + }, + { + "epoch": 2.86, + "grad_norm": 1.2431985139846802, + "learning_rate": 1.3721595796733626e-06, + "loss": 0.4443, + "step": 2966 + }, + { + "epoch": 2.86, + "grad_norm": 1.4864449501037598, + "learning_rate": 1.3529362544023733e-06, + "loss": 0.314, + "step": 2967 + }, + { + "epoch": 2.86, + "grad_norm": 1.0626156330108643, + "learning_rate": 1.3338478030108792e-06, + "loss": 0.4855, + "step": 2968 + }, + { + "epoch": 2.87, + "grad_norm": 1.3877254724502563, + "learning_rate": 1.3148942463205166e-06, + "loss": 0.4789, + "step": 2969 + }, + { + "epoch": 2.87, + "grad_norm": 0.9821919202804565, + "learning_rate": 1.2960756050057476e-06, + "loss": 0.2581, + "step": 2970 + }, + { + "epoch": 2.87, + "grad_norm": 1.2562929391860962, + "learning_rate": 1.27739189959393e-06, + "loss": 0.5034, + "step": 2971 + }, + { + "epoch": 2.87, + "grad_norm": 1.0050729513168335, + "learning_rate": 1.2588431504651915e-06, + "loss": 0.3125, + "step": 2972 + }, + { + "epoch": 2.87, + "grad_norm": 1.1035274267196655, + "learning_rate": 1.240429377852431e-06, + "loss": 0.4549, + "step": 2973 + }, + { + "epoch": 2.87, + "grad_norm": 1.6296160221099854, + "learning_rate": 1.2221506018413597e-06, + "loss": 0.7023, + "step": 2974 + }, + { + "epoch": 2.87, + "grad_norm": 1.1510668992996216, + "learning_rate": 1.2040068423704171e-06, + "loss": 0.3249, + "step": 2975 + }, + { + "epoch": 2.87, + "grad_norm": 1.4502555131912231, + "learning_rate": 1.1859981192307718e-06, + "loss": 0.41, + "step": 2976 + }, + { + "epoch": 2.87, + "grad_norm": 1.4568229913711548, + "learning_rate": 1.1681244520662798e-06, + "loss": 0.6871, + "step": 2977 + }, + { + "epoch": 2.87, + "grad_norm": 1.3338775634765625, + "learning_rate": 1.1503858603734846e-06, + "loss": 0.5211, + "step": 2978 + }, + { + "epoch": 2.88, + "grad_norm": 1.2410080432891846, + "learning_rate": 1.1327823635016026e-06, + "loss": 0.3128, + "step": 2979 + }, + { + "epoch": 2.88, + "grad_norm": 1.0830005407333374, + "learning_rate": 1.1153139806524964e-06, + "loss": 0.272, + "step": 2980 + }, + { + "epoch": 2.88, + "grad_norm": 1.1216962337493896, + "learning_rate": 1.0979807308805906e-06, + "loss": 0.4014, + "step": 2981 + }, + { + "epoch": 2.88, + "grad_norm": 1.6853399276733398, + "learning_rate": 1.0807826330929694e-06, + "loss": 0.6638, + "step": 2982 + }, + { + "epoch": 2.88, + "grad_norm": 1.0854101181030273, + "learning_rate": 1.0637197060492793e-06, + "loss": 0.2654, + "step": 2983 + }, + { + "epoch": 2.88, + "grad_norm": 1.1459499597549438, + "learning_rate": 1.0467919683617017e-06, + "loss": 0.3811, + "step": 2984 + }, + { + "epoch": 2.88, + "grad_norm": 1.1812524795532227, + "learning_rate": 1.0299994384949802e-06, + "loss": 0.3847, + "step": 2985 + }, + { + "epoch": 2.88, + "grad_norm": 1.2767400741577148, + "learning_rate": 1.0133421347663374e-06, + "loss": 0.5387, + "step": 2986 + }, + { + "epoch": 2.88, + "grad_norm": 1.6375187635421753, + "learning_rate": 9.968200753455308e-07, + "loss": 0.6675, + "step": 2987 + }, + { + "epoch": 2.88, + "grad_norm": 1.0926377773284912, + "learning_rate": 9.804332782547693e-07, + "loss": 0.3265, + "step": 2988 + }, + { + "epoch": 2.89, + "grad_norm": 1.2261865139007568, + "learning_rate": 9.641817613687404e-07, + "loss": 0.4266, + "step": 2989 + }, + { + "epoch": 2.89, + "grad_norm": 1.200217604637146, + "learning_rate": 9.480655424145418e-07, + "loss": 0.3329, + "step": 2990 + }, + { + "epoch": 2.89, + "grad_norm": 1.0978448390960693, + "learning_rate": 9.320846389717086e-07, + "loss": 0.3821, + "step": 2991 + }, + { + "epoch": 2.89, + "grad_norm": 1.2996379137039185, + "learning_rate": 9.162390684721577e-07, + "loss": 0.428, + "step": 2992 + }, + { + "epoch": 2.89, + "grad_norm": 1.2850451469421387, + "learning_rate": 9.005288482002022e-07, + "loss": 0.4459, + "step": 2993 + }, + { + "epoch": 2.89, + "grad_norm": 0.824524462223053, + "learning_rate": 8.849539952925089e-07, + "loss": 0.2747, + "step": 2994 + }, + { + "epoch": 2.89, + "grad_norm": 1.188064694404602, + "learning_rate": 8.695145267380717e-07, + "loss": 0.4026, + "step": 2995 + }, + { + "epoch": 2.89, + "grad_norm": 1.0427355766296387, + "learning_rate": 8.54210459378238e-07, + "loss": 0.4607, + "step": 2996 + }, + { + "epoch": 2.89, + "grad_norm": 0.7268507480621338, + "learning_rate": 8.390418099066266e-07, + "loss": 0.2667, + "step": 2997 + }, + { + "epoch": 2.89, + "grad_norm": 1.2438222169876099, + "learning_rate": 8.240085948691689e-07, + "loss": 0.3948, + "step": 2998 + }, + { + "epoch": 2.89, + "grad_norm": 1.2525782585144043, + "learning_rate": 8.091108306640532e-07, + "loss": 0.5778, + "step": 2999 + }, + { + "epoch": 2.9, + "grad_norm": 1.011322259902954, + "learning_rate": 7.943485335417249e-07, + "loss": 0.4481, + "step": 3000 + }, + { + "epoch": 2.9, + "grad_norm": 1.1529054641723633, + "learning_rate": 7.797217196048451e-07, + "loss": 0.3791, + "step": 3001 + }, + { + "epoch": 2.9, + "grad_norm": 1.1260803937911987, + "learning_rate": 7.652304048083175e-07, + "loss": 0.4783, + "step": 3002 + }, + { + "epoch": 2.9, + "grad_norm": 1.1295585632324219, + "learning_rate": 7.508746049592063e-07, + "loss": 0.3297, + "step": 3003 + }, + { + "epoch": 2.9, + "grad_norm": 1.0275527238845825, + "learning_rate": 7.366543357168048e-07, + "loss": 0.2958, + "step": 3004 + }, + { + "epoch": 2.9, + "grad_norm": 1.4418631792068481, + "learning_rate": 7.225696125925246e-07, + "loss": 0.5509, + "step": 3005 + }, + { + "epoch": 2.9, + "grad_norm": 1.5385364294052124, + "learning_rate": 7.086204509499511e-07, + "loss": 0.3432, + "step": 3006 + }, + { + "epoch": 2.9, + "grad_norm": 0.9590983986854553, + "learning_rate": 6.948068660048019e-07, + "loss": 0.1927, + "step": 3007 + }, + { + "epoch": 2.9, + "grad_norm": 1.226137399673462, + "learning_rate": 6.81128872824871e-07, + "loss": 0.5011, + "step": 3008 + }, + { + "epoch": 2.9, + "grad_norm": 1.3301578760147095, + "learning_rate": 6.67586486330099e-07, + "loss": 0.5187, + "step": 3009 + }, + { + "epoch": 2.91, + "grad_norm": 1.4750556945800781, + "learning_rate": 6.541797212924888e-07, + "loss": 0.4413, + "step": 3010 + }, + { + "epoch": 2.91, + "grad_norm": 1.5725219249725342, + "learning_rate": 6.409085923360785e-07, + "loss": 0.4741, + "step": 3011 + }, + { + "epoch": 2.91, + "grad_norm": 1.067269206047058, + "learning_rate": 6.27773113936983e-07, + "loss": 0.2581, + "step": 3012 + }, + { + "epoch": 2.91, + "grad_norm": 1.217340350151062, + "learning_rate": 6.147733004233796e-07, + "loss": 0.3935, + "step": 3013 + }, + { + "epoch": 2.91, + "grad_norm": 1.3149571418762207, + "learning_rate": 6.019091659753978e-07, + "loss": 0.4206, + "step": 3014 + }, + { + "epoch": 2.91, + "grad_norm": 1.2064661979675293, + "learning_rate": 5.891807246251879e-07, + "loss": 0.4474, + "step": 3015 + }, + { + "epoch": 2.91, + "grad_norm": 1.103737473487854, + "learning_rate": 5.765879902569493e-07, + "loss": 0.3483, + "step": 3016 + }, + { + "epoch": 2.91, + "grad_norm": 1.2660019397735596, + "learning_rate": 5.641309766067499e-07, + "loss": 0.3404, + "step": 3017 + }, + { + "epoch": 2.91, + "grad_norm": 1.3228251934051514, + "learning_rate": 5.518096972627063e-07, + "loss": 0.414, + "step": 3018 + }, + { + "epoch": 2.91, + "grad_norm": 0.9580221772193909, + "learning_rate": 5.396241656648037e-07, + "loss": 0.2349, + "step": 3019 + }, + { + "epoch": 2.92, + "grad_norm": 1.2516742944717407, + "learning_rate": 5.275743951050071e-07, + "loss": 0.5049, + "step": 3020 + }, + { + "epoch": 2.92, + "grad_norm": 0.7590013742446899, + "learning_rate": 5.156603987271635e-07, + "loss": 0.1885, + "step": 3021 + }, + { + "epoch": 2.92, + "grad_norm": 1.0176297426223755, + "learning_rate": 5.038821895270302e-07, + "loss": 0.462, + "step": 3022 + }, + { + "epoch": 2.92, + "grad_norm": 1.2553941011428833, + "learning_rate": 4.922397803522466e-07, + "loss": 0.5014, + "step": 3023 + }, + { + "epoch": 2.92, + "grad_norm": 1.3685301542282104, + "learning_rate": 4.807331839023349e-07, + "loss": 0.6243, + "step": 3024 + }, + { + "epoch": 2.92, + "grad_norm": 1.0733470916748047, + "learning_rate": 4.6936241272863e-07, + "loss": 0.425, + "step": 3025 + }, + { + "epoch": 2.92, + "grad_norm": 1.3270857334136963, + "learning_rate": 4.581274792343493e-07, + "loss": 0.551, + "step": 3026 + }, + { + "epoch": 2.92, + "grad_norm": 1.0223822593688965, + "learning_rate": 4.470283956745508e-07, + "loss": 0.3436, + "step": 3027 + }, + { + "epoch": 2.92, + "grad_norm": 1.0676158666610718, + "learning_rate": 4.360651741560501e-07, + "loss": 0.3039, + "step": 3028 + }, + { + "epoch": 2.92, + "grad_norm": 0.9071052670478821, + "learning_rate": 4.2523782663753117e-07, + "loss": 0.2872, + "step": 3029 + }, + { + "epoch": 2.92, + "grad_norm": 1.0709900856018066, + "learning_rate": 4.145463649294079e-07, + "loss": 0.3877, + "step": 3030 + }, + { + "epoch": 2.93, + "grad_norm": 1.3956645727157593, + "learning_rate": 4.039908006939069e-07, + "loss": 0.4614, + "step": 3031 + }, + { + "epoch": 2.93, + "grad_norm": 1.042420506477356, + "learning_rate": 3.935711454450125e-07, + "loss": 0.3412, + "step": 3032 + }, + { + "epoch": 2.93, + "grad_norm": 0.9218428134918213, + "learning_rate": 3.832874105484524e-07, + "loss": 0.1879, + "step": 3033 + }, + { + "epoch": 2.93, + "grad_norm": 1.0542484521865845, + "learning_rate": 3.7313960722169803e-07, + "loss": 0.3305, + "step": 3034 + }, + { + "epoch": 2.93, + "grad_norm": 0.9976268410682678, + "learning_rate": 3.6312774653395054e-07, + "loss": 0.2927, + "step": 3035 + }, + { + "epoch": 2.93, + "grad_norm": 1.625490665435791, + "learning_rate": 3.5325183940611293e-07, + "loss": 0.452, + "step": 3036 + }, + { + "epoch": 2.93, + "grad_norm": 1.3954522609710693, + "learning_rate": 3.435118966107903e-07, + "loss": 0.4176, + "step": 3037 + }, + { + "epoch": 2.93, + "grad_norm": 1.1922634840011597, + "learning_rate": 3.3390792877230347e-07, + "loss": 0.3824, + "step": 3038 + }, + { + "epoch": 2.93, + "grad_norm": 1.2640684843063354, + "learning_rate": 3.2443994636663363e-07, + "loss": 0.3669, + "step": 3039 + }, + { + "epoch": 2.93, + "grad_norm": 1.0344746112823486, + "learning_rate": 3.151079597214085e-07, + "loss": 0.3111, + "step": 3040 + }, + { + "epoch": 2.94, + "grad_norm": 1.2467656135559082, + "learning_rate": 3.0591197901595757e-07, + "loss": 0.6657, + "step": 3041 + }, + { + "epoch": 2.94, + "grad_norm": 1.0825926065444946, + "learning_rate": 2.968520142812292e-07, + "loss": 0.3323, + "step": 3042 + }, + { + "epoch": 2.94, + "grad_norm": 1.1362985372543335, + "learning_rate": 2.8792807539979036e-07, + "loss": 0.4313, + "step": 3043 + }, + { + "epoch": 2.94, + "grad_norm": 1.332108974456787, + "learning_rate": 2.7914017210585454e-07, + "loss": 0.6444, + "step": 3044 + }, + { + "epoch": 2.94, + "grad_norm": 1.0163555145263672, + "learning_rate": 2.7048831398525375e-07, + "loss": 0.2917, + "step": 3045 + }, + { + "epoch": 2.94, + "grad_norm": 1.1551223993301392, + "learning_rate": 2.6197251047541117e-07, + "loss": 0.3461, + "step": 3046 + }, + { + "epoch": 2.94, + "grad_norm": 1.4475940465927124, + "learning_rate": 2.535927708653268e-07, + "loss": 0.5594, + "step": 3047 + }, + { + "epoch": 2.94, + "grad_norm": 1.1431162357330322, + "learning_rate": 2.4534910429560575e-07, + "loss": 0.3737, + "step": 3048 + }, + { + "epoch": 2.94, + "grad_norm": 1.167028546333313, + "learning_rate": 2.372415197584299e-07, + "loss": 0.327, + "step": 3049 + }, + { + "epoch": 2.94, + "grad_norm": 1.110597014427185, + "learning_rate": 2.292700260975028e-07, + "loss": 0.315, + "step": 3050 + }, + { + "epoch": 2.94, + "grad_norm": 1.1866278648376465, + "learning_rate": 2.2143463200813285e-07, + "loss": 0.3174, + "step": 3051 + }, + { + "epoch": 2.95, + "grad_norm": 1.0540951490402222, + "learning_rate": 2.1373534603713608e-07, + "loss": 0.4002, + "step": 3052 + }, + { + "epoch": 2.95, + "grad_norm": 1.6657365560531616, + "learning_rate": 2.0617217658287779e-07, + "loss": 0.6559, + "step": 3053 + }, + { + "epoch": 2.95, + "grad_norm": 0.9559653401374817, + "learning_rate": 1.9874513189523102e-07, + "loss": 0.2344, + "step": 3054 + }, + { + "epoch": 2.95, + "grad_norm": 0.9411253333091736, + "learning_rate": 1.914542200756181e-07, + "loss": 0.2153, + "step": 3055 + }, + { + "epoch": 2.95, + "grad_norm": 0.9949620366096497, + "learning_rate": 1.842994490769273e-07, + "loss": 0.2434, + "step": 3056 + }, + { + "epoch": 2.95, + "grad_norm": 1.1056835651397705, + "learning_rate": 1.772808267035547e-07, + "loss": 0.2845, + "step": 3057 + }, + { + "epoch": 2.95, + "grad_norm": 1.0158149003982544, + "learning_rate": 1.703983606114179e-07, + "loss": 0.2857, + "step": 3058 + }, + { + "epoch": 2.95, + "grad_norm": 1.2848867177963257, + "learning_rate": 1.6365205830787267e-07, + "loss": 0.5692, + "step": 3059 + }, + { + "epoch": 2.95, + "grad_norm": 1.018404483795166, + "learning_rate": 1.5704192715178257e-07, + "loss": 0.3901, + "step": 3060 + }, + { + "epoch": 2.95, + "grad_norm": 1.9516348838806152, + "learning_rate": 1.5056797435344937e-07, + "loss": 0.6798, + "step": 3061 + }, + { + "epoch": 2.96, + "grad_norm": 1.1646894216537476, + "learning_rate": 1.4423020697465484e-07, + "loss": 0.3379, + "step": 3062 + }, + { + "epoch": 2.96, + "grad_norm": 1.3057594299316406, + "learning_rate": 1.3802863192860504e-07, + "loss": 0.4302, + "step": 3063 + }, + { + "epoch": 2.96, + "grad_norm": 1.0812674760818481, + "learning_rate": 1.3196325597995828e-07, + "loss": 0.3526, + "step": 3064 + }, + { + "epoch": 2.96, + "grad_norm": 1.0315954685211182, + "learning_rate": 1.2603408574483887e-07, + "loss": 0.325, + "step": 3065 + }, + { + "epoch": 2.96, + "grad_norm": 0.7634711265563965, + "learning_rate": 1.2024112769074002e-07, + "loss": 0.1287, + "step": 3066 + }, + { + "epoch": 2.96, + "grad_norm": 1.4026670455932617, + "learning_rate": 1.145843881366071e-07, + "loss": 0.5154, + "step": 3067 + }, + { + "epoch": 2.96, + "grad_norm": 1.2732499837875366, + "learning_rate": 1.0906387325280987e-07, + "loss": 0.4641, + "step": 3068 + }, + { + "epoch": 2.96, + "grad_norm": 1.0098817348480225, + "learning_rate": 1.0367958906110087e-07, + "loss": 0.3296, + "step": 3069 + }, + { + "epoch": 2.96, + "grad_norm": 1.400363564491272, + "learning_rate": 9.843154143465704e-08, + "loss": 0.4768, + "step": 3070 + }, + { + "epoch": 2.96, + "grad_norm": 1.1068822145462036, + "learning_rate": 9.331973609801037e-08, + "loss": 0.3435, + "step": 3071 + }, + { + "epoch": 2.97, + "grad_norm": 1.1789405345916748, + "learning_rate": 8.834417862711718e-08, + "loss": 0.2829, + "step": 3072 + }, + { + "epoch": 2.97, + "grad_norm": 1.0974359512329102, + "learning_rate": 8.350487444930278e-08, + "loss": 0.3189, + "step": 3073 + }, + { + "epoch": 2.97, + "grad_norm": 1.0662319660186768, + "learning_rate": 7.880182884327513e-08, + "loss": 0.1984, + "step": 3074 + }, + { + "epoch": 2.97, + "grad_norm": 0.9128117561340332, + "learning_rate": 7.423504693908346e-08, + "loss": 0.3309, + "step": 3075 + }, + { + "epoch": 2.97, + "grad_norm": 1.2563143968582153, + "learning_rate": 6.980453371817353e-08, + "loss": 0.3568, + "step": 3076 + }, + { + "epoch": 2.97, + "grad_norm": 0.6046451330184937, + "learning_rate": 6.55102940133323e-08, + "loss": 0.1443, + "step": 3077 + }, + { + "epoch": 2.97, + "grad_norm": 1.0894300937652588, + "learning_rate": 6.135233250871563e-08, + "loss": 0.2837, + "step": 3078 + }, + { + "epoch": 2.97, + "grad_norm": 1.0161566734313965, + "learning_rate": 5.7330653739806614e-08, + "loss": 0.3274, + "step": 3079 + }, + { + "epoch": 2.97, + "grad_norm": 1.2504090070724487, + "learning_rate": 5.344526209344336e-08, + "loss": 0.5186, + "step": 3080 + }, + { + "epoch": 2.97, + "grad_norm": 0.9541034698486328, + "learning_rate": 4.9696161807805117e-08, + "loss": 0.2524, + "step": 3081 + }, + { + "epoch": 2.97, + "grad_norm": 1.2277942895889282, + "learning_rate": 4.608335697238453e-08, + "loss": 0.3188, + "step": 3082 + }, + { + "epoch": 2.98, + "grad_norm": 1.3345773220062256, + "learning_rate": 4.260685152804311e-08, + "loss": 0.5299, + "step": 3083 + }, + { + "epoch": 2.98, + "grad_norm": 1.1143313646316528, + "learning_rate": 3.926664926692802e-08, + "loss": 0.2181, + "step": 3084 + }, + { + "epoch": 2.98, + "grad_norm": 1.5047612190246582, + "learning_rate": 3.606275383251367e-08, + "loss": 0.5564, + "step": 3085 + }, + { + "epoch": 2.98, + "grad_norm": 1.266991376876831, + "learning_rate": 3.299516871962949e-08, + "loss": 0.4578, + "step": 3086 + }, + { + "epoch": 2.98, + "grad_norm": 1.2709894180297852, + "learning_rate": 3.006389727436276e-08, + "loss": 0.5225, + "step": 3087 + }, + { + "epoch": 2.98, + "grad_norm": 1.071608543395996, + "learning_rate": 2.726894269414193e-08, + "loss": 0.4342, + "step": 3088 + }, + { + "epoch": 2.98, + "grad_norm": 1.1185423135757446, + "learning_rate": 2.4610308027708806e-08, + "loss": 0.4581, + "step": 3089 + }, + { + "epoch": 2.98, + "grad_norm": 1.1018141508102417, + "learning_rate": 2.2087996175076953e-08, + "loss": 0.4682, + "step": 3090 + }, + { + "epoch": 2.98, + "grad_norm": 1.2371251583099365, + "learning_rate": 1.970200988758719e-08, + "loss": 0.2975, + "step": 3091 + }, + { + "epoch": 2.98, + "grad_norm": 1.2653716802597046, + "learning_rate": 1.745235176786597e-08, + "loss": 0.4085, + "step": 3092 + }, + { + "epoch": 2.99, + "grad_norm": 1.0708848237991333, + "learning_rate": 1.533902426983924e-08, + "loss": 0.4154, + "step": 3093 + }, + { + "epoch": 2.99, + "grad_norm": 1.3194761276245117, + "learning_rate": 1.336202969871858e-08, + "loss": 0.3781, + "step": 3094 + }, + { + "epoch": 2.99, + "grad_norm": 1.083683967590332, + "learning_rate": 1.1521370210987314e-08, + "loss": 0.2848, + "step": 3095 + }, + { + "epoch": 2.99, + "grad_norm": 1.0737828016281128, + "learning_rate": 9.817047814442148e-09, + "loss": 0.2543, + "step": 3096 + }, + { + "epoch": 2.99, + "grad_norm": 0.9343252182006836, + "learning_rate": 8.249064368151537e-09, + "loss": 0.247, + "step": 3097 + }, + { + "epoch": 2.99, + "grad_norm": 1.1907275915145874, + "learning_rate": 6.817421582483441e-09, + "loss": 0.3894, + "step": 3098 + }, + { + "epoch": 2.99, + "grad_norm": 1.0009660720825195, + "learning_rate": 5.5221210190498085e-09, + "loss": 0.2432, + "step": 3099 + }, + { + "epoch": 2.99, + "grad_norm": 1.1556274890899658, + "learning_rate": 4.363164090775973e-09, + "loss": 0.3347, + "step": 3100 + }, + { + "epoch": 2.99, + "grad_norm": 1.2074859142303467, + "learning_rate": 3.340552061831259e-09, + "loss": 0.3179, + "step": 3101 + }, + { + "epoch": 2.99, + "grad_norm": 1.529038429260254, + "learning_rate": 2.4542860476844954e-09, + "loss": 0.8287, + "step": 3102 + }, + { + "epoch": 3.0, + "grad_norm": 1.0030752420425415, + "learning_rate": 1.7043670150901358e-09, + "loss": 0.2722, + "step": 3103 + }, + { + "epoch": 3.0, + "grad_norm": 1.2378499507904053, + "learning_rate": 1.0907957820327496e-09, + "loss": 0.3671, + "step": 3104 + }, + { + "epoch": 3.0, + "grad_norm": 1.0954062938690186, + "learning_rate": 6.135730178102872e-10, + "loss": 0.3245, + "step": 3105 + }, + { + "epoch": 3.0, + "grad_norm": 1.0416110754013062, + "learning_rate": 2.726992429646913e-10, + "loss": 0.3151, + "step": 3106 + }, + { + "epoch": 3.0, + "grad_norm": 1.0947151184082031, + "learning_rate": 6.817482933740848e-11, + "loss": 0.3989, + "step": 3107 + }, + { + "epoch": 3.0, + "grad_norm": 1.2456382513046265, + "learning_rate": 0.0, + "loss": 0.4042, + "step": 3108 + } + ], + "logging_steps": 1, + "max_steps": 3108, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 4.416706180131324e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}