diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13693 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.16778523489932887, + "eval_steps": 199, + "global_step": 1700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 380.0, + "learning_rate": 8.403361344537815e-08, + "loss": 10.0168, + "num_input_tokens_seen": 2097152, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 348.0, + "learning_rate": 1.680672268907563e-07, + "loss": 9.9578, + "num_input_tokens_seen": 4194304, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 372.0, + "learning_rate": 2.5210084033613445e-07, + "loss": 9.9307, + "num_input_tokens_seen": 6291456, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 356.0, + "learning_rate": 3.361344537815126e-07, + "loss": 10.0096, + "num_input_tokens_seen": 8388608, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 422.0, + "learning_rate": 4.201680672268908e-07, + "loss": 10.0019, + "num_input_tokens_seen": 10485760, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 344.0, + "learning_rate": 5.042016806722689e-07, + "loss": 9.8981, + "num_input_tokens_seen": 12582912, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 346.0, + "learning_rate": 5.882352941176471e-07, + "loss": 9.9229, + "num_input_tokens_seen": 14680064, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 304.0, + "learning_rate": 6.722689075630252e-07, + "loss": 9.8722, + "num_input_tokens_seen": 16777216, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 264.0, + "learning_rate": 7.563025210084034e-07, + "loss": 9.84, + "num_input_tokens_seen": 18874368, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 252.0, + "learning_rate": 8.403361344537816e-07, + "loss": 9.7397, + "num_input_tokens_seen": 20971520, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 248.0, + "learning_rate": 9.243697478991598e-07, + "loss": 9.7562, + "num_input_tokens_seen": 23068672, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 214.0, + "learning_rate": 1.0084033613445378e-06, + "loss": 9.6955, + "num_input_tokens_seen": 25165824, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 196.0, + "learning_rate": 1.092436974789916e-06, + "loss": 9.6317, + "num_input_tokens_seen": 27262976, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 157.0, + "learning_rate": 1.1764705882352942e-06, + "loss": 9.522, + "num_input_tokens_seen": 29360128, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 133.0, + "learning_rate": 1.2605042016806724e-06, + "loss": 9.4459, + "num_input_tokens_seen": 31457280, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 111.5, + "learning_rate": 1.3445378151260504e-06, + "loss": 9.3929, + "num_input_tokens_seen": 33554432, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 96.5, + "learning_rate": 1.4285714285714286e-06, + "loss": 9.3054, + "num_input_tokens_seen": 35651584, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 75.5, + "learning_rate": 1.5126050420168068e-06, + "loss": 9.2098, + "num_input_tokens_seen": 37748736, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 65.0, + "learning_rate": 1.5966386554621848e-06, + "loss": 9.1634, + "num_input_tokens_seen": 39845888, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 54.0, + "learning_rate": 1.6806722689075632e-06, + "loss": 9.1167, + "num_input_tokens_seen": 41943040, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 44.25, + "learning_rate": 1.7647058823529414e-06, + "loss": 9.0489, + "num_input_tokens_seen": 44040192, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 38.0, + "learning_rate": 1.8487394957983196e-06, + "loss": 8.9928, + "num_input_tokens_seen": 46137344, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 35.25, + "learning_rate": 1.932773109243698e-06, + "loss": 8.8916, + "num_input_tokens_seen": 48234496, + "step": 23 + }, + { + "epoch": 0.01, + "grad_norm": 30.0, + "learning_rate": 2.0168067226890756e-06, + "loss": 8.8398, + "num_input_tokens_seen": 50331648, + "step": 24 + }, + { + "epoch": 0.01, + "grad_norm": 26.125, + "learning_rate": 2.100840336134454e-06, + "loss": 8.75, + "num_input_tokens_seen": 52428800, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 24.5, + "learning_rate": 2.184873949579832e-06, + "loss": 8.6363, + "num_input_tokens_seen": 54525952, + "step": 26 + }, + { + "epoch": 0.01, + "grad_norm": 24.625, + "learning_rate": 2.2689075630252102e-06, + "loss": 8.6346, + "num_input_tokens_seen": 56623104, + "step": 27 + }, + { + "epoch": 0.01, + "grad_norm": 26.75, + "learning_rate": 2.3529411764705885e-06, + "loss": 8.5871, + "num_input_tokens_seen": 58720256, + "step": 28 + }, + { + "epoch": 0.01, + "grad_norm": 26.75, + "learning_rate": 2.4369747899159667e-06, + "loss": 8.5476, + "num_input_tokens_seen": 60817408, + "step": 29 + }, + { + "epoch": 0.01, + "grad_norm": 25.25, + "learning_rate": 2.521008403361345e-06, + "loss": 8.5004, + "num_input_tokens_seen": 62914560, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 22.75, + "learning_rate": 2.605042016806723e-06, + "loss": 8.4537, + "num_input_tokens_seen": 65011712, + "step": 31 + }, + { + "epoch": 0.01, + "grad_norm": 18.75, + "learning_rate": 2.689075630252101e-06, + "loss": 8.4088, + "num_input_tokens_seen": 67108864, + "step": 32 + }, + { + "epoch": 0.01, + "grad_norm": 18.5, + "learning_rate": 2.7731092436974795e-06, + "loss": 8.389, + "num_input_tokens_seen": 69206016, + "step": 33 + }, + { + "epoch": 0.01, + "grad_norm": 15.5625, + "learning_rate": 2.8571428571428573e-06, + "loss": 8.3228, + "num_input_tokens_seen": 71303168, + "step": 34 + }, + { + "epoch": 0.01, + "grad_norm": 13.5625, + "learning_rate": 2.9411764705882355e-06, + "loss": 8.2781, + "num_input_tokens_seen": 73400320, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 13.0625, + "learning_rate": 3.0252100840336137e-06, + "loss": 8.2525, + "num_input_tokens_seen": 75497472, + "step": 36 + }, + { + "epoch": 0.01, + "grad_norm": 13.25, + "learning_rate": 3.109243697478992e-06, + "loss": 8.1735, + "num_input_tokens_seen": 77594624, + "step": 37 + }, + { + "epoch": 0.01, + "grad_norm": 12.625, + "learning_rate": 3.1932773109243696e-06, + "loss": 8.1301, + "num_input_tokens_seen": 79691776, + "step": 38 + }, + { + "epoch": 0.01, + "grad_norm": 12.5625, + "learning_rate": 3.2773109243697483e-06, + "loss": 8.0726, + "num_input_tokens_seen": 81788928, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 11.0, + "learning_rate": 3.3613445378151265e-06, + "loss": 8.0339, + "num_input_tokens_seen": 83886080, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 10.1875, + "learning_rate": 3.4453781512605043e-06, + "loss": 8.0077, + "num_input_tokens_seen": 85983232, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 9.5625, + "learning_rate": 3.529411764705883e-06, + "loss": 7.9286, + "num_input_tokens_seen": 88080384, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 8.4375, + "learning_rate": 3.6134453781512607e-06, + "loss": 7.9055, + "num_input_tokens_seen": 90177536, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 7.53125, + "learning_rate": 3.6974789915966393e-06, + "loss": 7.8555, + "num_input_tokens_seen": 92274688, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 7.15625, + "learning_rate": 3.781512605042017e-06, + "loss": 7.7934, + "num_input_tokens_seen": 94371840, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 7.125, + "learning_rate": 3.865546218487396e-06, + "loss": 7.7522, + "num_input_tokens_seen": 96468992, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 7.09375, + "learning_rate": 3.9495798319327735e-06, + "loss": 7.7007, + "num_input_tokens_seen": 98566144, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 7.3125, + "learning_rate": 4.033613445378151e-06, + "loss": 7.6809, + "num_input_tokens_seen": 100663296, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 7.59375, + "learning_rate": 4.11764705882353e-06, + "loss": 7.623, + "num_input_tokens_seen": 102760448, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 6.625, + "learning_rate": 4.201680672268908e-06, + "loss": 7.6064, + "num_input_tokens_seen": 104857600, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 6.1875, + "learning_rate": 4.2857142857142855e-06, + "loss": 7.5352, + "num_input_tokens_seen": 106954752, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 6.28125, + "learning_rate": 4.369747899159664e-06, + "loss": 7.4911, + "num_input_tokens_seen": 109051904, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 6.15625, + "learning_rate": 4.453781512605043e-06, + "loss": 7.4393, + "num_input_tokens_seen": 111149056, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 5.78125, + "learning_rate": 4.5378151260504205e-06, + "loss": 7.4134, + "num_input_tokens_seen": 113246208, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 5.8125, + "learning_rate": 4.621848739495799e-06, + "loss": 7.3876, + "num_input_tokens_seen": 115343360, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 5.375, + "learning_rate": 4.705882352941177e-06, + "loss": 7.3283, + "num_input_tokens_seen": 117440512, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 5.5625, + "learning_rate": 4.7899159663865555e-06, + "loss": 7.2959, + "num_input_tokens_seen": 119537664, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 5.21875, + "learning_rate": 4.873949579831933e-06, + "loss": 7.2391, + "num_input_tokens_seen": 121634816, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 5.1875, + "learning_rate": 4.957983193277311e-06, + "loss": 7.1795, + "num_input_tokens_seen": 123731968, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 5.0625, + "learning_rate": 5.04201680672269e-06, + "loss": 7.1513, + "num_input_tokens_seen": 125829120, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 5.34375, + "learning_rate": 5.1260504201680675e-06, + "loss": 7.0947, + "num_input_tokens_seen": 127926272, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 4.90625, + "learning_rate": 5.210084033613446e-06, + "loss": 7.051, + "num_input_tokens_seen": 130023424, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 4.90625, + "learning_rate": 5.294117647058824e-06, + "loss": 7.0014, + "num_input_tokens_seen": 132120576, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 4.625, + "learning_rate": 5.378151260504202e-06, + "loss": 6.9464, + "num_input_tokens_seen": 134217728, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 6.0625, + "learning_rate": 5.4621848739495795e-06, + "loss": 6.881, + "num_input_tokens_seen": 136314880, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 5.03125, + "learning_rate": 5.546218487394959e-06, + "loss": 6.8252, + "num_input_tokens_seen": 138412032, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 4.96875, + "learning_rate": 5.630252100840337e-06, + "loss": 6.8061, + "num_input_tokens_seen": 140509184, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 5.34375, + "learning_rate": 5.7142857142857145e-06, + "loss": 6.7608, + "num_input_tokens_seen": 142606336, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 4.375, + "learning_rate": 5.798319327731093e-06, + "loss": 6.7459, + "num_input_tokens_seen": 144703488, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 4.65625, + "learning_rate": 5.882352941176471e-06, + "loss": 6.679, + "num_input_tokens_seen": 146800640, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 5.625, + "learning_rate": 5.9663865546218495e-06, + "loss": 6.6458, + "num_input_tokens_seen": 148897792, + "step": 71 + }, + { + "epoch": 0.02, + "grad_norm": 6.25, + "learning_rate": 6.050420168067227e-06, + "loss": 6.6044, + "num_input_tokens_seen": 150994944, + "step": 72 + }, + { + "epoch": 0.02, + "grad_norm": 4.90625, + "learning_rate": 6.134453781512606e-06, + "loss": 6.5125, + "num_input_tokens_seen": 153092096, + "step": 73 + }, + { + "epoch": 0.02, + "grad_norm": 4.6875, + "learning_rate": 6.218487394957984e-06, + "loss": 6.5516, + "num_input_tokens_seen": 155189248, + "step": 74 + }, + { + "epoch": 0.02, + "grad_norm": 4.6875, + "learning_rate": 6.3025210084033615e-06, + "loss": 6.4681, + "num_input_tokens_seen": 157286400, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 6.1875, + "learning_rate": 6.386554621848739e-06, + "loss": 6.4484, + "num_input_tokens_seen": 159383552, + "step": 76 + }, + { + "epoch": 0.02, + "grad_norm": 4.625, + "learning_rate": 6.470588235294119e-06, + "loss": 6.4324, + "num_input_tokens_seen": 161480704, + "step": 77 + }, + { + "epoch": 0.02, + "grad_norm": 4.84375, + "learning_rate": 6.5546218487394966e-06, + "loss": 6.3783, + "num_input_tokens_seen": 163577856, + "step": 78 + }, + { + "epoch": 0.02, + "grad_norm": 6.34375, + "learning_rate": 6.638655462184874e-06, + "loss": 6.3552, + "num_input_tokens_seen": 165675008, + "step": 79 + }, + { + "epoch": 0.02, + "grad_norm": 5.8125, + "learning_rate": 6.722689075630253e-06, + "loss": 6.3333, + "num_input_tokens_seen": 167772160, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 6.21875, + "learning_rate": 6.806722689075631e-06, + "loss": 6.2396, + "num_input_tokens_seen": 169869312, + "step": 81 + }, + { + "epoch": 0.02, + "grad_norm": 9.0, + "learning_rate": 6.8907563025210085e-06, + "loss": 6.2413, + "num_input_tokens_seen": 171966464, + "step": 82 + }, + { + "epoch": 0.02, + "grad_norm": 7.53125, + "learning_rate": 6.974789915966387e-06, + "loss": 6.2187, + "num_input_tokens_seen": 174063616, + "step": 83 + }, + { + "epoch": 0.02, + "grad_norm": 8.4375, + "learning_rate": 7.058823529411766e-06, + "loss": 6.199, + "num_input_tokens_seen": 176160768, + "step": 84 + }, + { + "epoch": 0.02, + "grad_norm": 8.375, + "learning_rate": 7.1428571428571436e-06, + "loss": 6.1808, + "num_input_tokens_seen": 178257920, + "step": 85 + }, + { + "epoch": 0.02, + "grad_norm": 6.75, + "learning_rate": 7.226890756302521e-06, + "loss": 6.152, + "num_input_tokens_seen": 180355072, + "step": 86 + }, + { + "epoch": 0.02, + "grad_norm": 7.6875, + "learning_rate": 7.310924369747899e-06, + "loss": 6.0847, + "num_input_tokens_seen": 182452224, + "step": 87 + }, + { + "epoch": 0.02, + "grad_norm": 7.125, + "learning_rate": 7.394957983193279e-06, + "loss": 6.1132, + "num_input_tokens_seen": 184549376, + "step": 88 + }, + { + "epoch": 0.02, + "grad_norm": 7.46875, + "learning_rate": 7.478991596638656e-06, + "loss": 6.1028, + "num_input_tokens_seen": 186646528, + "step": 89 + }, + { + "epoch": 0.02, + "grad_norm": 6.6875, + "learning_rate": 7.563025210084034e-06, + "loss": 6.0179, + "num_input_tokens_seen": 188743680, + "step": 90 + }, + { + "epoch": 0.02, + "grad_norm": 6.6875, + "learning_rate": 7.647058823529411e-06, + "loss": 6.0006, + "num_input_tokens_seen": 190840832, + "step": 91 + }, + { + "epoch": 0.02, + "grad_norm": 7.40625, + "learning_rate": 7.731092436974791e-06, + "loss": 5.9917, + "num_input_tokens_seen": 192937984, + "step": 92 + }, + { + "epoch": 0.02, + "grad_norm": 5.5, + "learning_rate": 7.815126050420168e-06, + "loss": 5.9925, + "num_input_tokens_seen": 195035136, + "step": 93 + }, + { + "epoch": 0.02, + "grad_norm": 7.90625, + "learning_rate": 7.899159663865547e-06, + "loss": 5.9592, + "num_input_tokens_seen": 197132288, + "step": 94 + }, + { + "epoch": 0.02, + "grad_norm": 7.625, + "learning_rate": 7.983193277310926e-06, + "loss": 5.8628, + "num_input_tokens_seen": 199229440, + "step": 95 + }, + { + "epoch": 0.02, + "grad_norm": 7.96875, + "learning_rate": 8.067226890756303e-06, + "loss": 5.8931, + "num_input_tokens_seen": 201326592, + "step": 96 + }, + { + "epoch": 0.02, + "grad_norm": 8.0625, + "learning_rate": 8.151260504201681e-06, + "loss": 5.8708, + "num_input_tokens_seen": 203423744, + "step": 97 + }, + { + "epoch": 0.02, + "grad_norm": 7.3125, + "learning_rate": 8.23529411764706e-06, + "loss": 5.8157, + "num_input_tokens_seen": 205520896, + "step": 98 + }, + { + "epoch": 0.02, + "grad_norm": 7.46875, + "learning_rate": 8.319327731092438e-06, + "loss": 5.8302, + "num_input_tokens_seen": 207618048, + "step": 99 + }, + { + "epoch": 0.02, + "grad_norm": 8.875, + "learning_rate": 8.403361344537815e-06, + "loss": 5.777, + "num_input_tokens_seen": 209715200, + "step": 100 + }, + { + "epoch": 0.02, + "grad_norm": 7.0625, + "learning_rate": 8.487394957983194e-06, + "loss": 5.7678, + "num_input_tokens_seen": 211812352, + "step": 101 + }, + { + "epoch": 0.02, + "grad_norm": 10.5625, + "learning_rate": 8.571428571428571e-06, + "loss": 5.7426, + "num_input_tokens_seen": 213909504, + "step": 102 + }, + { + "epoch": 0.02, + "grad_norm": 11.1875, + "learning_rate": 8.655462184873951e-06, + "loss": 5.7215, + "num_input_tokens_seen": 216006656, + "step": 103 + }, + { + "epoch": 0.02, + "grad_norm": 5.34375, + "learning_rate": 8.739495798319328e-06, + "loss": 5.7174, + "num_input_tokens_seen": 218103808, + "step": 104 + }, + { + "epoch": 0.02, + "grad_norm": 12.9375, + "learning_rate": 8.823529411764707e-06, + "loss": 5.7294, + "num_input_tokens_seen": 220200960, + "step": 105 + }, + { + "epoch": 0.02, + "grad_norm": 12.125, + "learning_rate": 8.907563025210085e-06, + "loss": 5.7037, + "num_input_tokens_seen": 222298112, + "step": 106 + }, + { + "epoch": 0.02, + "grad_norm": 7.78125, + "learning_rate": 8.991596638655462e-06, + "loss": 5.6525, + "num_input_tokens_seen": 224395264, + "step": 107 + }, + { + "epoch": 0.02, + "grad_norm": 9.0625, + "learning_rate": 9.075630252100841e-06, + "loss": 5.6111, + "num_input_tokens_seen": 226492416, + "step": 108 + }, + { + "epoch": 0.02, + "grad_norm": 10.125, + "learning_rate": 9.15966386554622e-06, + "loss": 5.5881, + "num_input_tokens_seen": 228589568, + "step": 109 + }, + { + "epoch": 0.02, + "grad_norm": 9.625, + "learning_rate": 9.243697478991598e-06, + "loss": 5.6082, + "num_input_tokens_seen": 230686720, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 8.1875, + "learning_rate": 9.327731092436975e-06, + "loss": 5.5969, + "num_input_tokens_seen": 232783872, + "step": 111 + }, + { + "epoch": 0.02, + "grad_norm": 10.375, + "learning_rate": 9.411764705882354e-06, + "loss": 5.5476, + "num_input_tokens_seen": 234881024, + "step": 112 + }, + { + "epoch": 0.02, + "grad_norm": 7.90625, + "learning_rate": 9.49579831932773e-06, + "loss": 5.5416, + "num_input_tokens_seen": 236978176, + "step": 113 + }, + { + "epoch": 0.02, + "grad_norm": 10.3125, + "learning_rate": 9.579831932773111e-06, + "loss": 5.4936, + "num_input_tokens_seen": 239075328, + "step": 114 + }, + { + "epoch": 0.02, + "grad_norm": 9.1875, + "learning_rate": 9.663865546218488e-06, + "loss": 5.4903, + "num_input_tokens_seen": 241172480, + "step": 115 + }, + { + "epoch": 0.02, + "grad_norm": 9.4375, + "learning_rate": 9.747899159663867e-06, + "loss": 5.4852, + "num_input_tokens_seen": 243269632, + "step": 116 + }, + { + "epoch": 0.02, + "grad_norm": 8.75, + "learning_rate": 9.831932773109244e-06, + "loss": 5.4291, + "num_input_tokens_seen": 245366784, + "step": 117 + }, + { + "epoch": 0.02, + "grad_norm": 6.0, + "learning_rate": 9.915966386554622e-06, + "loss": 5.4299, + "num_input_tokens_seen": 247463936, + "step": 118 + }, + { + "epoch": 0.02, + "grad_norm": 8.125, + "learning_rate": 1e-05, + "loss": 5.4144, + "num_input_tokens_seen": 249561088, + "step": 119 + }, + { + "epoch": 0.03, + "grad_norm": 6.96875, + "learning_rate": 1.008403361344538e-05, + "loss": 5.3902, + "num_input_tokens_seen": 251658240, + "step": 120 + }, + { + "epoch": 0.03, + "grad_norm": 8.1875, + "learning_rate": 1.0168067226890756e-05, + "loss": 5.3525, + "num_input_tokens_seen": 253755392, + "step": 121 + }, + { + "epoch": 0.03, + "grad_norm": 7.6875, + "learning_rate": 1.0252100840336135e-05, + "loss": 5.3348, + "num_input_tokens_seen": 255852544, + "step": 122 + }, + { + "epoch": 0.03, + "grad_norm": 9.5625, + "learning_rate": 1.0336134453781514e-05, + "loss": 5.3413, + "num_input_tokens_seen": 257949696, + "step": 123 + }, + { + "epoch": 0.03, + "grad_norm": 7.21875, + "learning_rate": 1.0420168067226892e-05, + "loss": 5.3003, + "num_input_tokens_seen": 260046848, + "step": 124 + }, + { + "epoch": 0.03, + "grad_norm": 11.1875, + "learning_rate": 1.0504201680672271e-05, + "loss": 5.2606, + "num_input_tokens_seen": 262144000, + "step": 125 + }, + { + "epoch": 0.03, + "grad_norm": 8.5625, + "learning_rate": 1.0588235294117648e-05, + "loss": 5.2561, + "num_input_tokens_seen": 264241152, + "step": 126 + }, + { + "epoch": 0.03, + "grad_norm": 10.8125, + "learning_rate": 1.0672268907563026e-05, + "loss": 5.2177, + "num_input_tokens_seen": 266338304, + "step": 127 + }, + { + "epoch": 0.03, + "grad_norm": 11.375, + "learning_rate": 1.0756302521008403e-05, + "loss": 5.2021, + "num_input_tokens_seen": 268435456, + "step": 128 + }, + { + "epoch": 0.03, + "grad_norm": 7.84375, + "learning_rate": 1.0840336134453782e-05, + "loss": 5.1699, + "num_input_tokens_seen": 270532608, + "step": 129 + }, + { + "epoch": 0.03, + "grad_norm": 11.5, + "learning_rate": 1.0924369747899159e-05, + "loss": 5.1648, + "num_input_tokens_seen": 272629760, + "step": 130 + }, + { + "epoch": 0.03, + "grad_norm": 8.875, + "learning_rate": 1.100840336134454e-05, + "loss": 5.1014, + "num_input_tokens_seen": 274726912, + "step": 131 + }, + { + "epoch": 0.03, + "grad_norm": 10.5, + "learning_rate": 1.1092436974789918e-05, + "loss": 5.106, + "num_input_tokens_seen": 276824064, + "step": 132 + }, + { + "epoch": 0.03, + "grad_norm": 11.125, + "learning_rate": 1.1176470588235295e-05, + "loss": 5.1104, + "num_input_tokens_seen": 278921216, + "step": 133 + }, + { + "epoch": 0.03, + "grad_norm": 8.0625, + "learning_rate": 1.1260504201680673e-05, + "loss": 5.1012, + "num_input_tokens_seen": 281018368, + "step": 134 + }, + { + "epoch": 0.03, + "grad_norm": 9.3125, + "learning_rate": 1.134453781512605e-05, + "loss": 5.0835, + "num_input_tokens_seen": 283115520, + "step": 135 + }, + { + "epoch": 0.03, + "grad_norm": 8.875, + "learning_rate": 1.1428571428571429e-05, + "loss": 5.0666, + "num_input_tokens_seen": 285212672, + "step": 136 + }, + { + "epoch": 0.03, + "grad_norm": 7.15625, + "learning_rate": 1.1512605042016806e-05, + "loss": 5.043, + "num_input_tokens_seen": 287309824, + "step": 137 + }, + { + "epoch": 0.03, + "grad_norm": 7.46875, + "learning_rate": 1.1596638655462186e-05, + "loss": 4.98, + "num_input_tokens_seen": 289406976, + "step": 138 + }, + { + "epoch": 0.03, + "grad_norm": 8.125, + "learning_rate": 1.1680672268907565e-05, + "loss": 4.9469, + "num_input_tokens_seen": 291504128, + "step": 139 + }, + { + "epoch": 0.03, + "grad_norm": 8.375, + "learning_rate": 1.1764705882352942e-05, + "loss": 4.9545, + "num_input_tokens_seen": 293601280, + "step": 140 + }, + { + "epoch": 0.03, + "grad_norm": 7.90625, + "learning_rate": 1.184873949579832e-05, + "loss": 4.9428, + "num_input_tokens_seen": 295698432, + "step": 141 + }, + { + "epoch": 0.03, + "grad_norm": 8.1875, + "learning_rate": 1.1932773109243699e-05, + "loss": 4.9189, + "num_input_tokens_seen": 297795584, + "step": 142 + }, + { + "epoch": 0.03, + "grad_norm": 8.4375, + "learning_rate": 1.2016806722689076e-05, + "loss": 4.901, + "num_input_tokens_seen": 299892736, + "step": 143 + }, + { + "epoch": 0.03, + "grad_norm": 7.78125, + "learning_rate": 1.2100840336134455e-05, + "loss": 4.8716, + "num_input_tokens_seen": 301989888, + "step": 144 + }, + { + "epoch": 0.03, + "grad_norm": 8.5625, + "learning_rate": 1.2184873949579832e-05, + "loss": 4.8398, + "num_input_tokens_seen": 304087040, + "step": 145 + }, + { + "epoch": 0.03, + "grad_norm": 6.96875, + "learning_rate": 1.2268907563025212e-05, + "loss": 4.8527, + "num_input_tokens_seen": 306184192, + "step": 146 + }, + { + "epoch": 0.03, + "grad_norm": 10.25, + "learning_rate": 1.235294117647059e-05, + "loss": 4.8271, + "num_input_tokens_seen": 308281344, + "step": 147 + }, + { + "epoch": 0.03, + "grad_norm": 9.9375, + "learning_rate": 1.2436974789915967e-05, + "loss": 4.8113, + "num_input_tokens_seen": 310378496, + "step": 148 + }, + { + "epoch": 0.03, + "grad_norm": 5.5, + "learning_rate": 1.2521008403361346e-05, + "loss": 4.7797, + "num_input_tokens_seen": 312475648, + "step": 149 + }, + { + "epoch": 0.03, + "grad_norm": 10.25, + "learning_rate": 1.2605042016806723e-05, + "loss": 4.773, + "num_input_tokens_seen": 314572800, + "step": 150 + }, + { + "epoch": 0.03, + "grad_norm": 9.875, + "learning_rate": 1.2689075630252102e-05, + "loss": 4.7342, + "num_input_tokens_seen": 316669952, + "step": 151 + }, + { + "epoch": 0.03, + "grad_norm": 7.375, + "learning_rate": 1.2773109243697479e-05, + "loss": 4.7429, + "num_input_tokens_seen": 318767104, + "step": 152 + }, + { + "epoch": 0.03, + "grad_norm": 8.25, + "learning_rate": 1.2857142857142859e-05, + "loss": 4.6898, + "num_input_tokens_seen": 320864256, + "step": 153 + }, + { + "epoch": 0.03, + "grad_norm": 7.65625, + "learning_rate": 1.2941176470588238e-05, + "loss": 4.7092, + "num_input_tokens_seen": 322961408, + "step": 154 + }, + { + "epoch": 0.03, + "grad_norm": 8.4375, + "learning_rate": 1.3025210084033614e-05, + "loss": 4.6652, + "num_input_tokens_seen": 325058560, + "step": 155 + }, + { + "epoch": 0.03, + "grad_norm": 7.0625, + "learning_rate": 1.3109243697478993e-05, + "loss": 4.6469, + "num_input_tokens_seen": 327155712, + "step": 156 + }, + { + "epoch": 0.03, + "grad_norm": 9.4375, + "learning_rate": 1.319327731092437e-05, + "loss": 4.6116, + "num_input_tokens_seen": 329252864, + "step": 157 + }, + { + "epoch": 0.03, + "grad_norm": 7.6875, + "learning_rate": 1.3277310924369749e-05, + "loss": 4.6151, + "num_input_tokens_seen": 331350016, + "step": 158 + }, + { + "epoch": 0.03, + "grad_norm": 6.90625, + "learning_rate": 1.3361344537815126e-05, + "loss": 4.589, + "num_input_tokens_seen": 333447168, + "step": 159 + }, + { + "epoch": 0.03, + "grad_norm": 8.9375, + "learning_rate": 1.3445378151260506e-05, + "loss": 4.554, + "num_input_tokens_seen": 335544320, + "step": 160 + }, + { + "epoch": 0.03, + "grad_norm": 6.3125, + "learning_rate": 1.3529411764705885e-05, + "loss": 4.5447, + "num_input_tokens_seen": 337641472, + "step": 161 + }, + { + "epoch": 0.03, + "grad_norm": 8.0, + "learning_rate": 1.3613445378151261e-05, + "loss": 4.5427, + "num_input_tokens_seen": 339738624, + "step": 162 + }, + { + "epoch": 0.03, + "grad_norm": 7.46875, + "learning_rate": 1.369747899159664e-05, + "loss": 4.5064, + "num_input_tokens_seen": 341835776, + "step": 163 + }, + { + "epoch": 0.03, + "grad_norm": 6.75, + "learning_rate": 1.3781512605042017e-05, + "loss": 4.5028, + "num_input_tokens_seen": 343932928, + "step": 164 + }, + { + "epoch": 0.03, + "grad_norm": 6.625, + "learning_rate": 1.3865546218487396e-05, + "loss": 4.4906, + "num_input_tokens_seen": 346030080, + "step": 165 + }, + { + "epoch": 0.03, + "grad_norm": 6.15625, + "learning_rate": 1.3949579831932774e-05, + "loss": 4.4578, + "num_input_tokens_seen": 348127232, + "step": 166 + }, + { + "epoch": 0.04, + "grad_norm": 5.09375, + "learning_rate": 1.4033613445378151e-05, + "loss": 4.4623, + "num_input_tokens_seen": 350224384, + "step": 167 + }, + { + "epoch": 0.04, + "grad_norm": 7.25, + "learning_rate": 1.4117647058823532e-05, + "loss": 4.4293, + "num_input_tokens_seen": 352321536, + "step": 168 + }, + { + "epoch": 0.04, + "grad_norm": 7.375, + "learning_rate": 1.4201680672268908e-05, + "loss": 4.4135, + "num_input_tokens_seen": 354418688, + "step": 169 + }, + { + "epoch": 0.04, + "grad_norm": 6.5625, + "learning_rate": 1.4285714285714287e-05, + "loss": 4.367, + "num_input_tokens_seen": 356515840, + "step": 170 + }, + { + "epoch": 0.04, + "grad_norm": 7.75, + "learning_rate": 1.4369747899159666e-05, + "loss": 4.3954, + "num_input_tokens_seen": 358612992, + "step": 171 + }, + { + "epoch": 0.04, + "grad_norm": 5.0, + "learning_rate": 1.4453781512605043e-05, + "loss": 4.3388, + "num_input_tokens_seen": 360710144, + "step": 172 + }, + { + "epoch": 0.04, + "grad_norm": 8.875, + "learning_rate": 1.4537815126050421e-05, + "loss": 4.3209, + "num_input_tokens_seen": 362807296, + "step": 173 + }, + { + "epoch": 0.04, + "grad_norm": 5.9375, + "learning_rate": 1.4621848739495798e-05, + "loss": 4.3165, + "num_input_tokens_seen": 364904448, + "step": 174 + }, + { + "epoch": 0.04, + "grad_norm": 6.0625, + "learning_rate": 1.4705882352941179e-05, + "loss": 4.3025, + "num_input_tokens_seen": 367001600, + "step": 175 + }, + { + "epoch": 0.04, + "grad_norm": 8.6875, + "learning_rate": 1.4789915966386557e-05, + "loss": 4.2631, + "num_input_tokens_seen": 369098752, + "step": 176 + }, + { + "epoch": 0.04, + "grad_norm": 6.75, + "learning_rate": 1.4873949579831934e-05, + "loss": 4.2474, + "num_input_tokens_seen": 371195904, + "step": 177 + }, + { + "epoch": 0.04, + "grad_norm": 9.5625, + "learning_rate": 1.4957983193277313e-05, + "loss": 4.2473, + "num_input_tokens_seen": 373293056, + "step": 178 + }, + { + "epoch": 0.04, + "grad_norm": 8.1875, + "learning_rate": 1.504201680672269e-05, + "loss": 4.194, + "num_input_tokens_seen": 375390208, + "step": 179 + }, + { + "epoch": 0.04, + "grad_norm": 9.4375, + "learning_rate": 1.5126050420168068e-05, + "loss": 4.1842, + "num_input_tokens_seen": 377487360, + "step": 180 + }, + { + "epoch": 0.04, + "grad_norm": 7.8125, + "learning_rate": 1.5210084033613445e-05, + "loss": 4.1562, + "num_input_tokens_seen": 379584512, + "step": 181 + }, + { + "epoch": 0.04, + "grad_norm": 10.375, + "learning_rate": 1.5294117647058822e-05, + "loss": 4.1952, + "num_input_tokens_seen": 381681664, + "step": 182 + }, + { + "epoch": 0.04, + "grad_norm": 10.0, + "learning_rate": 1.5378151260504204e-05, + "loss": 4.1421, + "num_input_tokens_seen": 383778816, + "step": 183 + }, + { + "epoch": 0.04, + "grad_norm": 6.5625, + "learning_rate": 1.5462184873949583e-05, + "loss": 4.158, + "num_input_tokens_seen": 385875968, + "step": 184 + }, + { + "epoch": 0.04, + "grad_norm": 8.3125, + "learning_rate": 1.5546218487394958e-05, + "loss": 4.1286, + "num_input_tokens_seen": 387973120, + "step": 185 + }, + { + "epoch": 0.04, + "grad_norm": 4.8125, + "learning_rate": 1.5630252100840337e-05, + "loss": 4.11, + "num_input_tokens_seen": 390070272, + "step": 186 + }, + { + "epoch": 0.04, + "grad_norm": 8.0625, + "learning_rate": 1.5714285714285715e-05, + "loss": 4.0771, + "num_input_tokens_seen": 392167424, + "step": 187 + }, + { + "epoch": 0.04, + "grad_norm": 6.8125, + "learning_rate": 1.5798319327731094e-05, + "loss": 4.07, + "num_input_tokens_seen": 394264576, + "step": 188 + }, + { + "epoch": 0.04, + "grad_norm": 7.90625, + "learning_rate": 1.5882352941176473e-05, + "loss": 4.0397, + "num_input_tokens_seen": 396361728, + "step": 189 + }, + { + "epoch": 0.04, + "grad_norm": 6.53125, + "learning_rate": 1.596638655462185e-05, + "loss": 4.0532, + "num_input_tokens_seen": 398458880, + "step": 190 + }, + { + "epoch": 0.04, + "grad_norm": 10.75, + "learning_rate": 1.605042016806723e-05, + "loss": 4.0002, + "num_input_tokens_seen": 400556032, + "step": 191 + }, + { + "epoch": 0.04, + "grad_norm": 11.125, + "learning_rate": 1.6134453781512605e-05, + "loss": 3.9925, + "num_input_tokens_seen": 402653184, + "step": 192 + }, + { + "epoch": 0.04, + "grad_norm": 6.25, + "learning_rate": 1.6218487394957984e-05, + "loss": 3.9996, + "num_input_tokens_seen": 404750336, + "step": 193 + }, + { + "epoch": 0.04, + "grad_norm": 8.5625, + "learning_rate": 1.6302521008403362e-05, + "loss": 3.9756, + "num_input_tokens_seen": 406847488, + "step": 194 + }, + { + "epoch": 0.04, + "grad_norm": 8.0625, + "learning_rate": 1.638655462184874e-05, + "loss": 3.9886, + "num_input_tokens_seen": 408944640, + "step": 195 + }, + { + "epoch": 0.04, + "grad_norm": 5.34375, + "learning_rate": 1.647058823529412e-05, + "loss": 3.9601, + "num_input_tokens_seen": 411041792, + "step": 196 + }, + { + "epoch": 0.04, + "grad_norm": 7.375, + "learning_rate": 1.6554621848739495e-05, + "loss": 3.953, + "num_input_tokens_seen": 413138944, + "step": 197 + }, + { + "epoch": 0.04, + "grad_norm": 6.125, + "learning_rate": 1.6638655462184877e-05, + "loss": 3.9374, + "num_input_tokens_seen": 415236096, + "step": 198 + }, + { + "epoch": 0.04, + "grad_norm": 7.75, + "learning_rate": 1.6722689075630255e-05, + "loss": 3.9469, + "num_input_tokens_seen": 417333248, + "step": 199 + }, + { + "epoch": 0.04, + "eval_loss": 3.913985252380371, + "eval_runtime": 2006.3376, + "eval_samples_per_second": 1.965, + "eval_steps_per_second": 0.491, + "num_input_tokens_seen": 417333248, + "step": 199 + }, + { + "epoch": 0.04, + "grad_norm": 7.25, + "learning_rate": 1.680672268907563e-05, + "loss": 3.9549, + "num_input_tokens_seen": 419430400, + "step": 200 + }, + { + "epoch": 0.04, + "grad_norm": 7.40625, + "learning_rate": 1.689075630252101e-05, + "loss": 3.923, + "num_input_tokens_seen": 421527552, + "step": 201 + }, + { + "epoch": 0.04, + "grad_norm": 8.625, + "learning_rate": 1.6974789915966388e-05, + "loss": 3.9051, + "num_input_tokens_seen": 423624704, + "step": 202 + }, + { + "epoch": 0.04, + "grad_norm": 7.4375, + "learning_rate": 1.7058823529411767e-05, + "loss": 3.8699, + "num_input_tokens_seen": 425721856, + "step": 203 + }, + { + "epoch": 0.04, + "grad_norm": 5.75, + "learning_rate": 1.7142857142857142e-05, + "loss": 3.8571, + "num_input_tokens_seen": 427819008, + "step": 204 + }, + { + "epoch": 0.04, + "grad_norm": 7.125, + "learning_rate": 1.7226890756302524e-05, + "loss": 3.8535, + "num_input_tokens_seen": 429916160, + "step": 205 + }, + { + "epoch": 0.04, + "grad_norm": 6.15625, + "learning_rate": 1.7310924369747902e-05, + "loss": 3.7989, + "num_input_tokens_seen": 432013312, + "step": 206 + }, + { + "epoch": 0.04, + "grad_norm": 5.75, + "learning_rate": 1.7394957983193278e-05, + "loss": 3.8366, + "num_input_tokens_seen": 434110464, + "step": 207 + }, + { + "epoch": 0.04, + "grad_norm": 7.78125, + "learning_rate": 1.7478991596638656e-05, + "loss": 3.8179, + "num_input_tokens_seen": 436207616, + "step": 208 + }, + { + "epoch": 0.04, + "grad_norm": 7.5625, + "learning_rate": 1.7563025210084035e-05, + "loss": 3.7902, + "num_input_tokens_seen": 438304768, + "step": 209 + }, + { + "epoch": 0.04, + "grad_norm": 5.15625, + "learning_rate": 1.7647058823529414e-05, + "loss": 3.7911, + "num_input_tokens_seen": 440401920, + "step": 210 + }, + { + "epoch": 0.04, + "grad_norm": 10.0, + "learning_rate": 1.7731092436974792e-05, + "loss": 3.7648, + "num_input_tokens_seen": 442499072, + "step": 211 + }, + { + "epoch": 0.04, + "grad_norm": 10.0625, + "learning_rate": 1.781512605042017e-05, + "loss": 3.7809, + "num_input_tokens_seen": 444596224, + "step": 212 + }, + { + "epoch": 0.04, + "grad_norm": 6.375, + "learning_rate": 1.789915966386555e-05, + "loss": 3.7486, + "num_input_tokens_seen": 446693376, + "step": 213 + }, + { + "epoch": 0.04, + "grad_norm": 6.125, + "learning_rate": 1.7983193277310925e-05, + "loss": 3.7506, + "num_input_tokens_seen": 448790528, + "step": 214 + }, + { + "epoch": 0.05, + "grad_norm": 6.3125, + "learning_rate": 1.8067226890756303e-05, + "loss": 3.7224, + "num_input_tokens_seen": 450887680, + "step": 215 + }, + { + "epoch": 0.05, + "grad_norm": 6.96875, + "learning_rate": 1.8151260504201682e-05, + "loss": 3.6858, + "num_input_tokens_seen": 452984832, + "step": 216 + }, + { + "epoch": 0.05, + "grad_norm": 6.0625, + "learning_rate": 1.823529411764706e-05, + "loss": 3.7196, + "num_input_tokens_seen": 455081984, + "step": 217 + }, + { + "epoch": 0.05, + "grad_norm": 4.71875, + "learning_rate": 1.831932773109244e-05, + "loss": 3.7189, + "num_input_tokens_seen": 457179136, + "step": 218 + }, + { + "epoch": 0.05, + "grad_norm": 4.28125, + "learning_rate": 1.8403361344537814e-05, + "loss": 3.6689, + "num_input_tokens_seen": 459276288, + "step": 219 + }, + { + "epoch": 0.05, + "grad_norm": 5.1875, + "learning_rate": 1.8487394957983196e-05, + "loss": 3.6684, + "num_input_tokens_seen": 461373440, + "step": 220 + }, + { + "epoch": 0.05, + "grad_norm": 4.40625, + "learning_rate": 1.8571428571428575e-05, + "loss": 3.6715, + "num_input_tokens_seen": 463470592, + "step": 221 + }, + { + "epoch": 0.05, + "grad_norm": 5.78125, + "learning_rate": 1.865546218487395e-05, + "loss": 3.6402, + "num_input_tokens_seen": 465567744, + "step": 222 + }, + { + "epoch": 0.05, + "grad_norm": 5.4375, + "learning_rate": 1.873949579831933e-05, + "loss": 3.6363, + "num_input_tokens_seen": 467664896, + "step": 223 + }, + { + "epoch": 0.05, + "grad_norm": 5.40625, + "learning_rate": 1.8823529411764708e-05, + "loss": 3.6408, + "num_input_tokens_seen": 469762048, + "step": 224 + }, + { + "epoch": 0.05, + "grad_norm": 4.90625, + "learning_rate": 1.8907563025210086e-05, + "loss": 3.6181, + "num_input_tokens_seen": 471859200, + "step": 225 + }, + { + "epoch": 0.05, + "grad_norm": 4.78125, + "learning_rate": 1.899159663865546e-05, + "loss": 3.6198, + "num_input_tokens_seen": 473956352, + "step": 226 + }, + { + "epoch": 0.05, + "grad_norm": 4.71875, + "learning_rate": 1.9075630252100844e-05, + "loss": 3.5722, + "num_input_tokens_seen": 476053504, + "step": 227 + }, + { + "epoch": 0.05, + "grad_norm": 5.09375, + "learning_rate": 1.9159663865546222e-05, + "loss": 3.5493, + "num_input_tokens_seen": 478150656, + "step": 228 + }, + { + "epoch": 0.05, + "grad_norm": 5.84375, + "learning_rate": 1.9243697478991597e-05, + "loss": 3.5328, + "num_input_tokens_seen": 480247808, + "step": 229 + }, + { + "epoch": 0.05, + "grad_norm": 6.0625, + "learning_rate": 1.9327731092436976e-05, + "loss": 3.569, + "num_input_tokens_seen": 482344960, + "step": 230 + }, + { + "epoch": 0.05, + "grad_norm": 3.890625, + "learning_rate": 1.9411764705882355e-05, + "loss": 3.459, + "num_input_tokens_seen": 484442112, + "step": 231 + }, + { + "epoch": 0.05, + "grad_norm": 8.4375, + "learning_rate": 1.9495798319327733e-05, + "loss": 3.5222, + "num_input_tokens_seen": 486539264, + "step": 232 + }, + { + "epoch": 0.05, + "grad_norm": 7.375, + "learning_rate": 1.957983193277311e-05, + "loss": 3.5104, + "num_input_tokens_seen": 488636416, + "step": 233 + }, + { + "epoch": 0.05, + "grad_norm": 4.875, + "learning_rate": 1.9663865546218487e-05, + "loss": 3.5062, + "num_input_tokens_seen": 490733568, + "step": 234 + }, + { + "epoch": 0.05, + "grad_norm": 6.34375, + "learning_rate": 1.974789915966387e-05, + "loss": 3.4506, + "num_input_tokens_seen": 492830720, + "step": 235 + }, + { + "epoch": 0.05, + "grad_norm": 4.8125, + "learning_rate": 1.9831932773109244e-05, + "loss": 3.4939, + "num_input_tokens_seen": 494927872, + "step": 236 + }, + { + "epoch": 0.05, + "grad_norm": 5.8125, + "learning_rate": 1.9915966386554623e-05, + "loss": 3.4827, + "num_input_tokens_seen": 497025024, + "step": 237 + }, + { + "epoch": 0.05, + "grad_norm": 5.375, + "learning_rate": 2e-05, + "loss": 3.4589, + "num_input_tokens_seen": 499122176, + "step": 238 + }, + { + "epoch": 0.05, + "grad_norm": 4.09375, + "learning_rate": 2e-05, + "loss": 3.4193, + "num_input_tokens_seen": 501219328, + "step": 239 + }, + { + "epoch": 0.05, + "grad_norm": 4.28125, + "learning_rate": 2e-05, + "loss": 3.4441, + "num_input_tokens_seen": 503316480, + "step": 240 + }, + { + "epoch": 0.05, + "grad_norm": 3.59375, + "learning_rate": 2e-05, + "loss": 3.4409, + "num_input_tokens_seen": 505413632, + "step": 241 + }, + { + "epoch": 0.05, + "grad_norm": 4.65625, + "learning_rate": 2e-05, + "loss": 3.4129, + "num_input_tokens_seen": 507510784, + "step": 242 + }, + { + "epoch": 0.05, + "grad_norm": 3.90625, + "learning_rate": 2e-05, + "loss": 3.4003, + "num_input_tokens_seen": 509607936, + "step": 243 + }, + { + "epoch": 0.05, + "grad_norm": 3.015625, + "learning_rate": 2e-05, + "loss": 3.4113, + "num_input_tokens_seen": 511705088, + "step": 244 + }, + { + "epoch": 0.05, + "grad_norm": 4.25, + "learning_rate": 2e-05, + "loss": 3.38, + "num_input_tokens_seen": 513802240, + "step": 245 + }, + { + "epoch": 0.05, + "grad_norm": 3.40625, + "learning_rate": 2e-05, + "loss": 3.3491, + "num_input_tokens_seen": 515899392, + "step": 246 + }, + { + "epoch": 0.05, + "grad_norm": 4.09375, + "learning_rate": 2e-05, + "loss": 3.3739, + "num_input_tokens_seen": 517996544, + "step": 247 + }, + { + "epoch": 0.05, + "grad_norm": 3.25, + "learning_rate": 2e-05, + "loss": 3.3593, + "num_input_tokens_seen": 520093696, + "step": 248 + }, + { + "epoch": 0.05, + "grad_norm": 4.5625, + "learning_rate": 2e-05, + "loss": 3.3046, + "num_input_tokens_seen": 522190848, + "step": 249 + }, + { + "epoch": 0.05, + "grad_norm": 3.984375, + "learning_rate": 2e-05, + "loss": 3.344, + "num_input_tokens_seen": 524288000, + "step": 250 + }, + { + "epoch": 0.05, + "grad_norm": 4.125, + "learning_rate": 2e-05, + "loss": 3.3125, + "num_input_tokens_seen": 526385152, + "step": 251 + }, + { + "epoch": 0.05, + "grad_norm": 4.0625, + "learning_rate": 2e-05, + "loss": 3.2951, + "num_input_tokens_seen": 528482304, + "step": 252 + }, + { + "epoch": 0.05, + "grad_norm": 4.46875, + "learning_rate": 2e-05, + "loss": 3.3293, + "num_input_tokens_seen": 530579456, + "step": 253 + }, + { + "epoch": 0.05, + "grad_norm": 4.21875, + "learning_rate": 2e-05, + "loss": 3.2859, + "num_input_tokens_seen": 532676608, + "step": 254 + }, + { + "epoch": 0.05, + "grad_norm": 3.9375, + "learning_rate": 2e-05, + "loss": 3.295, + "num_input_tokens_seen": 534773760, + "step": 255 + }, + { + "epoch": 0.05, + "grad_norm": 3.84375, + "learning_rate": 2e-05, + "loss": 3.3064, + "num_input_tokens_seen": 536870912, + "step": 256 + }, + { + "epoch": 0.05, + "grad_norm": 3.296875, + "learning_rate": 2e-05, + "loss": 3.2556, + "num_input_tokens_seen": 538968064, + "step": 257 + }, + { + "epoch": 0.05, + "grad_norm": 3.515625, + "learning_rate": 2e-05, + "loss": 3.2735, + "num_input_tokens_seen": 541065216, + "step": 258 + }, + { + "epoch": 0.05, + "grad_norm": 2.890625, + "learning_rate": 2e-05, + "loss": 3.2571, + "num_input_tokens_seen": 543162368, + "step": 259 + }, + { + "epoch": 0.05, + "grad_norm": 3.265625, + "learning_rate": 2e-05, + "loss": 3.2361, + "num_input_tokens_seen": 545259520, + "step": 260 + }, + { + "epoch": 0.05, + "grad_norm": 3.359375, + "learning_rate": 2e-05, + "loss": 3.2487, + "num_input_tokens_seen": 547356672, + "step": 261 + }, + { + "epoch": 0.05, + "grad_norm": 4.28125, + "learning_rate": 2e-05, + "loss": 3.1921, + "num_input_tokens_seen": 549453824, + "step": 262 + }, + { + "epoch": 0.06, + "grad_norm": 3.078125, + "learning_rate": 2e-05, + "loss": 3.1962, + "num_input_tokens_seen": 551550976, + "step": 263 + }, + { + "epoch": 0.06, + "grad_norm": 4.59375, + "learning_rate": 2e-05, + "loss": 3.2288, + "num_input_tokens_seen": 553648128, + "step": 264 + }, + { + "epoch": 0.06, + "grad_norm": 3.765625, + "learning_rate": 2e-05, + "loss": 3.1997, + "num_input_tokens_seen": 555745280, + "step": 265 + }, + { + "epoch": 0.06, + "grad_norm": 3.9375, + "learning_rate": 2e-05, + "loss": 3.2058, + "num_input_tokens_seen": 557842432, + "step": 266 + }, + { + "epoch": 0.06, + "grad_norm": 4.5625, + "learning_rate": 2e-05, + "loss": 3.1865, + "num_input_tokens_seen": 559939584, + "step": 267 + }, + { + "epoch": 0.06, + "grad_norm": 4.3125, + "learning_rate": 2e-05, + "loss": 3.1915, + "num_input_tokens_seen": 562036736, + "step": 268 + }, + { + "epoch": 0.06, + "grad_norm": 4.375, + "learning_rate": 2e-05, + "loss": 3.1936, + "num_input_tokens_seen": 564133888, + "step": 269 + }, + { + "epoch": 0.06, + "grad_norm": 4.375, + "learning_rate": 2e-05, + "loss": 3.2147, + "num_input_tokens_seen": 566231040, + "step": 270 + }, + { + "epoch": 0.06, + "grad_norm": 3.484375, + "learning_rate": 2e-05, + "loss": 3.1502, + "num_input_tokens_seen": 568328192, + "step": 271 + }, + { + "epoch": 0.06, + "grad_norm": 6.125, + "learning_rate": 2e-05, + "loss": 3.212, + "num_input_tokens_seen": 570425344, + "step": 272 + }, + { + "epoch": 0.06, + "grad_norm": 5.9375, + "learning_rate": 2e-05, + "loss": 3.1481, + "num_input_tokens_seen": 572522496, + "step": 273 + }, + { + "epoch": 0.06, + "grad_norm": 3.078125, + "learning_rate": 2e-05, + "loss": 3.155, + "num_input_tokens_seen": 574619648, + "step": 274 + }, + { + "epoch": 0.06, + "grad_norm": 4.34375, + "learning_rate": 2e-05, + "loss": 3.1369, + "num_input_tokens_seen": 576716800, + "step": 275 + }, + { + "epoch": 0.06, + "grad_norm": 4.1875, + "learning_rate": 2e-05, + "loss": 3.1274, + "num_input_tokens_seen": 578813952, + "step": 276 + }, + { + "epoch": 0.06, + "grad_norm": 3.28125, + "learning_rate": 2e-05, + "loss": 3.1304, + "num_input_tokens_seen": 580911104, + "step": 277 + }, + { + "epoch": 0.06, + "grad_norm": 5.34375, + "learning_rate": 2e-05, + "loss": 3.1417, + "num_input_tokens_seen": 583008256, + "step": 278 + }, + { + "epoch": 0.06, + "grad_norm": 5.21875, + "learning_rate": 2e-05, + "loss": 3.1535, + "num_input_tokens_seen": 585105408, + "step": 279 + }, + { + "epoch": 0.06, + "grad_norm": 2.703125, + "learning_rate": 2e-05, + "loss": 3.0986, + "num_input_tokens_seen": 587202560, + "step": 280 + }, + { + "epoch": 0.06, + "grad_norm": 6.03125, + "learning_rate": 2e-05, + "loss": 3.1155, + "num_input_tokens_seen": 589299712, + "step": 281 + }, + { + "epoch": 0.06, + "grad_norm": 5.5625, + "learning_rate": 2e-05, + "loss": 3.0778, + "num_input_tokens_seen": 591396864, + "step": 282 + }, + { + "epoch": 0.06, + "grad_norm": 3.5, + "learning_rate": 2e-05, + "loss": 3.0915, + "num_input_tokens_seen": 593494016, + "step": 283 + }, + { + "epoch": 0.06, + "grad_norm": 3.46875, + "learning_rate": 2e-05, + "loss": 3.1342, + "num_input_tokens_seen": 595591168, + "step": 284 + }, + { + "epoch": 0.06, + "grad_norm": 3.65625, + "learning_rate": 2e-05, + "loss": 3.0678, + "num_input_tokens_seen": 597688320, + "step": 285 + }, + { + "epoch": 0.06, + "grad_norm": 4.15625, + "learning_rate": 2e-05, + "loss": 3.1034, + "num_input_tokens_seen": 599785472, + "step": 286 + }, + { + "epoch": 0.06, + "grad_norm": 2.90625, + "learning_rate": 2e-05, + "loss": 3.097, + "num_input_tokens_seen": 601882624, + "step": 287 + }, + { + "epoch": 0.06, + "grad_norm": 3.34375, + "learning_rate": 2e-05, + "loss": 3.1068, + "num_input_tokens_seen": 603979776, + "step": 288 + }, + { + "epoch": 0.06, + "grad_norm": 2.6875, + "learning_rate": 2e-05, + "loss": 3.0652, + "num_input_tokens_seen": 606076928, + "step": 289 + }, + { + "epoch": 0.06, + "grad_norm": 3.0, + "learning_rate": 2e-05, + "loss": 3.0617, + "num_input_tokens_seen": 608174080, + "step": 290 + }, + { + "epoch": 0.06, + "grad_norm": 2.515625, + "learning_rate": 2e-05, + "loss": 3.0646, + "num_input_tokens_seen": 610271232, + "step": 291 + }, + { + "epoch": 0.06, + "grad_norm": 3.546875, + "learning_rate": 2e-05, + "loss": 3.0943, + "num_input_tokens_seen": 612368384, + "step": 292 + }, + { + "epoch": 0.06, + "grad_norm": 2.375, + "learning_rate": 2e-05, + "loss": 3.0165, + "num_input_tokens_seen": 614465536, + "step": 293 + }, + { + "epoch": 0.06, + "grad_norm": 3.515625, + "learning_rate": 2e-05, + "loss": 3.0296, + "num_input_tokens_seen": 616562688, + "step": 294 + }, + { + "epoch": 0.06, + "grad_norm": 3.046875, + "learning_rate": 2e-05, + "loss": 3.0338, + "num_input_tokens_seen": 618659840, + "step": 295 + }, + { + "epoch": 0.06, + "grad_norm": 3.375, + "learning_rate": 2e-05, + "loss": 3.044, + "num_input_tokens_seen": 620756992, + "step": 296 + }, + { + "epoch": 0.06, + "grad_norm": 2.734375, + "learning_rate": 2e-05, + "loss": 3.0633, + "num_input_tokens_seen": 622854144, + "step": 297 + }, + { + "epoch": 0.06, + "grad_norm": 3.609375, + "learning_rate": 2e-05, + "loss": 3.0242, + "num_input_tokens_seen": 624951296, + "step": 298 + }, + { + "epoch": 0.06, + "grad_norm": 2.890625, + "learning_rate": 2e-05, + "loss": 3.0351, + "num_input_tokens_seen": 627048448, + "step": 299 + }, + { + "epoch": 0.06, + "grad_norm": 3.484375, + "learning_rate": 2e-05, + "loss": 2.9913, + "num_input_tokens_seen": 629145600, + "step": 300 + }, + { + "epoch": 0.06, + "grad_norm": 3.46875, + "learning_rate": 2e-05, + "loss": 3.0039, + "num_input_tokens_seen": 631242752, + "step": 301 + }, + { + "epoch": 0.06, + "grad_norm": 2.75, + "learning_rate": 2e-05, + "loss": 3.0468, + "num_input_tokens_seen": 633339904, + "step": 302 + }, + { + "epoch": 0.06, + "grad_norm": 3.640625, + "learning_rate": 2e-05, + "loss": 3.032, + "num_input_tokens_seen": 635437056, + "step": 303 + }, + { + "epoch": 0.06, + "grad_norm": 3.25, + "learning_rate": 2e-05, + "loss": 3.0188, + "num_input_tokens_seen": 637534208, + "step": 304 + }, + { + "epoch": 0.06, + "grad_norm": 2.96875, + "learning_rate": 2e-05, + "loss": 3.0026, + "num_input_tokens_seen": 639631360, + "step": 305 + }, + { + "epoch": 0.06, + "grad_norm": 4.03125, + "learning_rate": 2e-05, + "loss": 2.9779, + "num_input_tokens_seen": 641728512, + "step": 306 + }, + { + "epoch": 0.06, + "grad_norm": 3.765625, + "learning_rate": 2e-05, + "loss": 3.0523, + "num_input_tokens_seen": 643825664, + "step": 307 + }, + { + "epoch": 0.06, + "grad_norm": 3.015625, + "learning_rate": 2e-05, + "loss": 3.0251, + "num_input_tokens_seen": 645922816, + "step": 308 + }, + { + "epoch": 0.06, + "grad_norm": 3.203125, + "learning_rate": 2e-05, + "loss": 3.0103, + "num_input_tokens_seen": 648019968, + "step": 309 + }, + { + "epoch": 0.07, + "grad_norm": 2.625, + "learning_rate": 2e-05, + "loss": 3.0095, + "num_input_tokens_seen": 650117120, + "step": 310 + }, + { + "epoch": 0.07, + "grad_norm": 3.125, + "learning_rate": 2e-05, + "loss": 2.9667, + "num_input_tokens_seen": 652214272, + "step": 311 + }, + { + "epoch": 0.07, + "grad_norm": 2.46875, + "learning_rate": 2e-05, + "loss": 2.9823, + "num_input_tokens_seen": 654311424, + "step": 312 + }, + { + "epoch": 0.07, + "grad_norm": 2.90625, + "learning_rate": 2e-05, + "loss": 2.9807, + "num_input_tokens_seen": 656408576, + "step": 313 + }, + { + "epoch": 0.07, + "grad_norm": 3.609375, + "learning_rate": 2e-05, + "loss": 2.9975, + "num_input_tokens_seen": 658505728, + "step": 314 + }, + { + "epoch": 0.07, + "grad_norm": 2.890625, + "learning_rate": 2e-05, + "loss": 2.9686, + "num_input_tokens_seen": 660602880, + "step": 315 + }, + { + "epoch": 0.07, + "grad_norm": 3.109375, + "learning_rate": 2e-05, + "loss": 2.9864, + "num_input_tokens_seen": 662700032, + "step": 316 + }, + { + "epoch": 0.07, + "grad_norm": 3.0625, + "learning_rate": 2e-05, + "loss": 2.9523, + "num_input_tokens_seen": 664797184, + "step": 317 + }, + { + "epoch": 0.07, + "grad_norm": 2.828125, + "learning_rate": 2e-05, + "loss": 2.9933, + "num_input_tokens_seen": 666894336, + "step": 318 + }, + { + "epoch": 0.07, + "grad_norm": 2.90625, + "learning_rate": 2e-05, + "loss": 2.9572, + "num_input_tokens_seen": 668991488, + "step": 319 + }, + { + "epoch": 0.07, + "grad_norm": 2.796875, + "learning_rate": 2e-05, + "loss": 2.9504, + "num_input_tokens_seen": 671088640, + "step": 320 + }, + { + "epoch": 0.07, + "grad_norm": 2.671875, + "learning_rate": 2e-05, + "loss": 2.9586, + "num_input_tokens_seen": 673185792, + "step": 321 + }, + { + "epoch": 0.07, + "grad_norm": 2.484375, + "learning_rate": 2e-05, + "loss": 2.9605, + "num_input_tokens_seen": 675282944, + "step": 322 + }, + { + "epoch": 0.07, + "grad_norm": 2.5, + "learning_rate": 2e-05, + "loss": 2.9396, + "num_input_tokens_seen": 677380096, + "step": 323 + }, + { + "epoch": 0.07, + "grad_norm": 3.140625, + "learning_rate": 2e-05, + "loss": 2.9407, + "num_input_tokens_seen": 679477248, + "step": 324 + }, + { + "epoch": 0.07, + "grad_norm": 3.15625, + "learning_rate": 2e-05, + "loss": 2.9254, + "num_input_tokens_seen": 681574400, + "step": 325 + }, + { + "epoch": 0.07, + "grad_norm": 3.015625, + "learning_rate": 2e-05, + "loss": 2.9158, + "num_input_tokens_seen": 683671552, + "step": 326 + }, + { + "epoch": 0.07, + "grad_norm": 3.203125, + "learning_rate": 2e-05, + "loss": 2.9317, + "num_input_tokens_seen": 685768704, + "step": 327 + }, + { + "epoch": 0.07, + "grad_norm": 3.0625, + "learning_rate": 2e-05, + "loss": 2.9183, + "num_input_tokens_seen": 687865856, + "step": 328 + }, + { + "epoch": 0.07, + "grad_norm": 2.640625, + "learning_rate": 2e-05, + "loss": 2.9378, + "num_input_tokens_seen": 689963008, + "step": 329 + }, + { + "epoch": 0.07, + "grad_norm": 2.65625, + "learning_rate": 2e-05, + "loss": 2.9237, + "num_input_tokens_seen": 692060160, + "step": 330 + }, + { + "epoch": 0.07, + "grad_norm": 2.578125, + "learning_rate": 2e-05, + "loss": 2.9259, + "num_input_tokens_seen": 694157312, + "step": 331 + }, + { + "epoch": 0.07, + "grad_norm": 2.34375, + "learning_rate": 2e-05, + "loss": 2.9404, + "num_input_tokens_seen": 696254464, + "step": 332 + }, + { + "epoch": 0.07, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 2.9242, + "num_input_tokens_seen": 698351616, + "step": 333 + }, + { + "epoch": 0.07, + "grad_norm": 3.15625, + "learning_rate": 2e-05, + "loss": 2.8951, + "num_input_tokens_seen": 700448768, + "step": 334 + }, + { + "epoch": 0.07, + "grad_norm": 2.078125, + "learning_rate": 2e-05, + "loss": 2.8961, + "num_input_tokens_seen": 702545920, + "step": 335 + }, + { + "epoch": 0.07, + "grad_norm": 3.609375, + "learning_rate": 2e-05, + "loss": 2.8834, + "num_input_tokens_seen": 704643072, + "step": 336 + }, + { + "epoch": 0.07, + "grad_norm": 3.046875, + "learning_rate": 2e-05, + "loss": 2.8938, + "num_input_tokens_seen": 706740224, + "step": 337 + }, + { + "epoch": 0.07, + "grad_norm": 3.328125, + "learning_rate": 2e-05, + "loss": 2.9106, + "num_input_tokens_seen": 708837376, + "step": 338 + }, + { + "epoch": 0.07, + "grad_norm": 3.09375, + "learning_rate": 2e-05, + "loss": 2.8693, + "num_input_tokens_seen": 710934528, + "step": 339 + }, + { + "epoch": 0.07, + "grad_norm": 2.734375, + "learning_rate": 2e-05, + "loss": 2.927, + "num_input_tokens_seen": 713031680, + "step": 340 + }, + { + "epoch": 0.07, + "grad_norm": 3.296875, + "learning_rate": 2e-05, + "loss": 2.8972, + "num_input_tokens_seen": 715128832, + "step": 341 + }, + { + "epoch": 0.07, + "grad_norm": 2.234375, + "learning_rate": 2e-05, + "loss": 2.8604, + "num_input_tokens_seen": 717225984, + "step": 342 + }, + { + "epoch": 0.07, + "grad_norm": 4.28125, + "learning_rate": 2e-05, + "loss": 2.8847, + "num_input_tokens_seen": 719323136, + "step": 343 + }, + { + "epoch": 0.07, + "grad_norm": 3.3125, + "learning_rate": 2e-05, + "loss": 2.8893, + "num_input_tokens_seen": 721420288, + "step": 344 + }, + { + "epoch": 0.07, + "grad_norm": 3.8125, + "learning_rate": 2e-05, + "loss": 2.8779, + "num_input_tokens_seen": 723517440, + "step": 345 + }, + { + "epoch": 0.07, + "grad_norm": 4.4375, + "learning_rate": 2e-05, + "loss": 2.8721, + "num_input_tokens_seen": 725614592, + "step": 346 + }, + { + "epoch": 0.07, + "grad_norm": 2.265625, + "learning_rate": 2e-05, + "loss": 2.8788, + "num_input_tokens_seen": 727711744, + "step": 347 + }, + { + "epoch": 0.07, + "grad_norm": 4.59375, + "learning_rate": 2e-05, + "loss": 2.8812, + "num_input_tokens_seen": 729808896, + "step": 348 + }, + { + "epoch": 0.07, + "grad_norm": 5.59375, + "learning_rate": 2e-05, + "loss": 2.8586, + "num_input_tokens_seen": 731906048, + "step": 349 + }, + { + "epoch": 0.07, + "grad_norm": 2.84375, + "learning_rate": 2e-05, + "loss": 2.8703, + "num_input_tokens_seen": 734003200, + "step": 350 + }, + { + "epoch": 0.07, + "grad_norm": 4.09375, + "learning_rate": 2e-05, + "loss": 2.8958, + "num_input_tokens_seen": 736100352, + "step": 351 + }, + { + "epoch": 0.07, + "grad_norm": 4.125, + "learning_rate": 2e-05, + "loss": 2.8572, + "num_input_tokens_seen": 738197504, + "step": 352 + }, + { + "epoch": 0.07, + "grad_norm": 2.765625, + "learning_rate": 2e-05, + "loss": 2.8491, + "num_input_tokens_seen": 740294656, + "step": 353 + }, + { + "epoch": 0.07, + "grad_norm": 3.078125, + "learning_rate": 2e-05, + "loss": 2.8936, + "num_input_tokens_seen": 742391808, + "step": 354 + }, + { + "epoch": 0.07, + "grad_norm": 4.1875, + "learning_rate": 2e-05, + "loss": 2.8708, + "num_input_tokens_seen": 744488960, + "step": 355 + }, + { + "epoch": 0.07, + "grad_norm": 3.671875, + "learning_rate": 2e-05, + "loss": 2.8501, + "num_input_tokens_seen": 746586112, + "step": 356 + }, + { + "epoch": 0.07, + "grad_norm": 2.53125, + "learning_rate": 2e-05, + "loss": 2.8226, + "num_input_tokens_seen": 748683264, + "step": 357 + }, + { + "epoch": 0.08, + "grad_norm": 3.28125, + "learning_rate": 2e-05, + "loss": 2.8612, + "num_input_tokens_seen": 750780416, + "step": 358 + }, + { + "epoch": 0.08, + "grad_norm": 3.921875, + "learning_rate": 2e-05, + "loss": 2.8409, + "num_input_tokens_seen": 752877568, + "step": 359 + }, + { + "epoch": 0.08, + "grad_norm": 2.703125, + "learning_rate": 2e-05, + "loss": 2.8417, + "num_input_tokens_seen": 754974720, + "step": 360 + }, + { + "epoch": 0.08, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 2.8713, + "num_input_tokens_seen": 757071872, + "step": 361 + }, + { + "epoch": 0.08, + "grad_norm": 2.875, + "learning_rate": 2e-05, + "loss": 2.8447, + "num_input_tokens_seen": 759169024, + "step": 362 + }, + { + "epoch": 0.08, + "grad_norm": 2.765625, + "learning_rate": 2e-05, + "loss": 2.8592, + "num_input_tokens_seen": 761266176, + "step": 363 + }, + { + "epoch": 0.08, + "grad_norm": 3.328125, + "learning_rate": 2e-05, + "loss": 2.8172, + "num_input_tokens_seen": 763363328, + "step": 364 + }, + { + "epoch": 0.08, + "grad_norm": 3.78125, + "learning_rate": 2e-05, + "loss": 2.8316, + "num_input_tokens_seen": 765460480, + "step": 365 + }, + { + "epoch": 0.08, + "grad_norm": 2.109375, + "learning_rate": 2e-05, + "loss": 2.8401, + "num_input_tokens_seen": 767557632, + "step": 366 + }, + { + "epoch": 0.08, + "grad_norm": 2.71875, + "learning_rate": 2e-05, + "loss": 2.8352, + "num_input_tokens_seen": 769654784, + "step": 367 + }, + { + "epoch": 0.08, + "grad_norm": 2.5625, + "learning_rate": 2e-05, + "loss": 2.8366, + "num_input_tokens_seen": 771751936, + "step": 368 + }, + { + "epoch": 0.08, + "grad_norm": 2.453125, + "learning_rate": 2e-05, + "loss": 2.8411, + "num_input_tokens_seen": 773849088, + "step": 369 + }, + { + "epoch": 0.08, + "grad_norm": 2.421875, + "learning_rate": 2e-05, + "loss": 2.8475, + "num_input_tokens_seen": 775946240, + "step": 370 + }, + { + "epoch": 0.08, + "grad_norm": 2.015625, + "learning_rate": 2e-05, + "loss": 2.8005, + "num_input_tokens_seen": 778043392, + "step": 371 + }, + { + "epoch": 0.08, + "grad_norm": 2.046875, + "learning_rate": 2e-05, + "loss": 2.8355, + "num_input_tokens_seen": 780140544, + "step": 372 + }, + { + "epoch": 0.08, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 2.8547, + "num_input_tokens_seen": 782237696, + "step": 373 + }, + { + "epoch": 0.08, + "grad_norm": 2.515625, + "learning_rate": 2e-05, + "loss": 2.8163, + "num_input_tokens_seen": 784334848, + "step": 374 + }, + { + "epoch": 0.08, + "grad_norm": 2.171875, + "learning_rate": 2e-05, + "loss": 2.7912, + "num_input_tokens_seen": 786432000, + "step": 375 + }, + { + "epoch": 0.08, + "grad_norm": 2.546875, + "learning_rate": 2e-05, + "loss": 2.8027, + "num_input_tokens_seen": 788529152, + "step": 376 + }, + { + "epoch": 0.08, + "grad_norm": 2.140625, + "learning_rate": 2e-05, + "loss": 2.8182, + "num_input_tokens_seen": 790626304, + "step": 377 + }, + { + "epoch": 0.08, + "grad_norm": 2.71875, + "learning_rate": 2e-05, + "loss": 2.7678, + "num_input_tokens_seen": 792723456, + "step": 378 + }, + { + "epoch": 0.08, + "grad_norm": 2.9375, + "learning_rate": 2e-05, + "loss": 2.8225, + "num_input_tokens_seen": 794820608, + "step": 379 + }, + { + "epoch": 0.08, + "grad_norm": 2.6875, + "learning_rate": 2e-05, + "loss": 2.8011, + "num_input_tokens_seen": 796917760, + "step": 380 + }, + { + "epoch": 0.08, + "grad_norm": 2.359375, + "learning_rate": 2e-05, + "loss": 2.7975, + "num_input_tokens_seen": 799014912, + "step": 381 + }, + { + "epoch": 0.08, + "grad_norm": 3.203125, + "learning_rate": 2e-05, + "loss": 2.7733, + "num_input_tokens_seen": 801112064, + "step": 382 + }, + { + "epoch": 0.08, + "grad_norm": 2.84375, + "learning_rate": 2e-05, + "loss": 2.8112, + "num_input_tokens_seen": 803209216, + "step": 383 + }, + { + "epoch": 0.08, + "grad_norm": 2.453125, + "learning_rate": 2e-05, + "loss": 2.7977, + "num_input_tokens_seen": 805306368, + "step": 384 + }, + { + "epoch": 0.08, + "grad_norm": 2.625, + "learning_rate": 2e-05, + "loss": 2.8178, + "num_input_tokens_seen": 807403520, + "step": 385 + }, + { + "epoch": 0.08, + "grad_norm": 2.1875, + "learning_rate": 2e-05, + "loss": 2.7852, + "num_input_tokens_seen": 809500672, + "step": 386 + }, + { + "epoch": 0.08, + "grad_norm": 2.078125, + "learning_rate": 2e-05, + "loss": 2.8056, + "num_input_tokens_seen": 811597824, + "step": 387 + }, + { + "epoch": 0.08, + "grad_norm": 2.4375, + "learning_rate": 2e-05, + "loss": 2.7934, + "num_input_tokens_seen": 813694976, + "step": 388 + }, + { + "epoch": 0.08, + "grad_norm": 2.390625, + "learning_rate": 2e-05, + "loss": 2.806, + "num_input_tokens_seen": 815792128, + "step": 389 + }, + { + "epoch": 0.08, + "grad_norm": 2.109375, + "learning_rate": 2e-05, + "loss": 2.7833, + "num_input_tokens_seen": 817889280, + "step": 390 + }, + { + "epoch": 0.08, + "grad_norm": 2.375, + "learning_rate": 2e-05, + "loss": 2.7811, + "num_input_tokens_seen": 819986432, + "step": 391 + }, + { + "epoch": 0.08, + "grad_norm": 2.234375, + "learning_rate": 2e-05, + "loss": 2.7697, + "num_input_tokens_seen": 822083584, + "step": 392 + }, + { + "epoch": 0.08, + "grad_norm": 3.078125, + "learning_rate": 2e-05, + "loss": 2.7823, + "num_input_tokens_seen": 824180736, + "step": 393 + }, + { + "epoch": 0.08, + "grad_norm": 2.640625, + "learning_rate": 2e-05, + "loss": 2.8006, + "num_input_tokens_seen": 826277888, + "step": 394 + }, + { + "epoch": 0.08, + "grad_norm": 2.5625, + "learning_rate": 2e-05, + "loss": 2.7893, + "num_input_tokens_seen": 828375040, + "step": 395 + }, + { + "epoch": 0.08, + "grad_norm": 3.078125, + "learning_rate": 2e-05, + "loss": 2.7442, + "num_input_tokens_seen": 830472192, + "step": 396 + }, + { + "epoch": 0.08, + "grad_norm": 2.4375, + "learning_rate": 2e-05, + "loss": 2.7833, + "num_input_tokens_seen": 832569344, + "step": 397 + }, + { + "epoch": 0.08, + "grad_norm": 2.734375, + "learning_rate": 2e-05, + "loss": 2.8004, + "num_input_tokens_seen": 834666496, + "step": 398 + }, + { + "epoch": 0.08, + "eval_loss": 2.7730445861816406, + "eval_runtime": 1661.813, + "eval_samples_per_second": 2.372, + "eval_steps_per_second": 0.593, + "num_input_tokens_seen": 834666496, + "step": 398 + }, + { + "epoch": 0.08, + "grad_norm": 3.78125, + "learning_rate": 2e-05, + "loss": 2.776, + "num_input_tokens_seen": 836763648, + "step": 399 + }, + { + "epoch": 0.08, + "grad_norm": 2.375, + "learning_rate": 2e-05, + "loss": 2.8008, + "num_input_tokens_seen": 838860800, + "step": 400 + }, + { + "epoch": 0.08, + "grad_norm": 2.25, + "learning_rate": 2e-05, + "loss": 2.7404, + "num_input_tokens_seen": 840957952, + "step": 401 + }, + { + "epoch": 0.08, + "grad_norm": 2.84375, + "learning_rate": 2e-05, + "loss": 2.737, + "num_input_tokens_seen": 843055104, + "step": 402 + }, + { + "epoch": 0.08, + "grad_norm": 1.8203125, + "learning_rate": 2e-05, + "loss": 2.7816, + "num_input_tokens_seen": 845152256, + "step": 403 + }, + { + "epoch": 0.08, + "grad_norm": 2.09375, + "learning_rate": 2e-05, + "loss": 2.7375, + "num_input_tokens_seen": 847249408, + "step": 404 + }, + { + "epoch": 0.08, + "grad_norm": 2.15625, + "learning_rate": 2e-05, + "loss": 2.7568, + "num_input_tokens_seen": 849346560, + "step": 405 + }, + { + "epoch": 0.09, + "grad_norm": 2.28125, + "learning_rate": 2e-05, + "loss": 2.7541, + "num_input_tokens_seen": 851443712, + "step": 406 + }, + { + "epoch": 0.09, + "grad_norm": 2.5625, + "learning_rate": 2e-05, + "loss": 2.7865, + "num_input_tokens_seen": 853540864, + "step": 407 + }, + { + "epoch": 0.09, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 2.7722, + "num_input_tokens_seen": 855638016, + "step": 408 + }, + { + "epoch": 0.09, + "grad_norm": 2.59375, + "learning_rate": 2e-05, + "loss": 2.7532, + "num_input_tokens_seen": 857735168, + "step": 409 + }, + { + "epoch": 0.09, + "grad_norm": 2.5625, + "learning_rate": 2e-05, + "loss": 2.7556, + "num_input_tokens_seen": 859832320, + "step": 410 + }, + { + "epoch": 0.09, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 2.735, + "num_input_tokens_seen": 861929472, + "step": 411 + }, + { + "epoch": 0.09, + "grad_norm": 2.75, + "learning_rate": 2e-05, + "loss": 2.7688, + "num_input_tokens_seen": 864026624, + "step": 412 + }, + { + "epoch": 0.09, + "grad_norm": 2.5625, + "learning_rate": 2e-05, + "loss": 2.7644, + "num_input_tokens_seen": 866123776, + "step": 413 + }, + { + "epoch": 0.09, + "grad_norm": 1.9453125, + "learning_rate": 2e-05, + "loss": 2.7741, + "num_input_tokens_seen": 868220928, + "step": 414 + }, + { + "epoch": 0.09, + "grad_norm": 2.859375, + "learning_rate": 2e-05, + "loss": 2.7287, + "num_input_tokens_seen": 870318080, + "step": 415 + }, + { + "epoch": 0.09, + "grad_norm": 3.1875, + "learning_rate": 2e-05, + "loss": 2.757, + "num_input_tokens_seen": 872415232, + "step": 416 + }, + { + "epoch": 0.09, + "grad_norm": 2.078125, + "learning_rate": 2e-05, + "loss": 2.7484, + "num_input_tokens_seen": 874512384, + "step": 417 + }, + { + "epoch": 0.09, + "grad_norm": 2.5625, + "learning_rate": 2e-05, + "loss": 2.7537, + "num_input_tokens_seen": 876609536, + "step": 418 + }, + { + "epoch": 0.09, + "grad_norm": 3.265625, + "learning_rate": 2e-05, + "loss": 2.7671, + "num_input_tokens_seen": 878706688, + "step": 419 + }, + { + "epoch": 0.09, + "grad_norm": 2.53125, + "learning_rate": 2e-05, + "loss": 2.7466, + "num_input_tokens_seen": 880803840, + "step": 420 + }, + { + "epoch": 0.09, + "grad_norm": 4.59375, + "learning_rate": 2e-05, + "loss": 2.7435, + "num_input_tokens_seen": 882900992, + "step": 421 + }, + { + "epoch": 0.09, + "grad_norm": 3.078125, + "learning_rate": 2e-05, + "loss": 2.7226, + "num_input_tokens_seen": 884998144, + "step": 422 + }, + { + "epoch": 0.09, + "grad_norm": 5.0625, + "learning_rate": 2e-05, + "loss": 2.7318, + "num_input_tokens_seen": 887095296, + "step": 423 + }, + { + "epoch": 0.09, + "grad_norm": 3.546875, + "learning_rate": 2e-05, + "loss": 2.7617, + "num_input_tokens_seen": 889192448, + "step": 424 + }, + { + "epoch": 0.09, + "grad_norm": 11.0625, + "learning_rate": 2e-05, + "loss": 2.7973, + "num_input_tokens_seen": 891289600, + "step": 425 + }, + { + "epoch": 0.09, + "grad_norm": 10.4375, + "learning_rate": 2e-05, + "loss": 2.7629, + "num_input_tokens_seen": 893386752, + "step": 426 + }, + { + "epoch": 0.09, + "grad_norm": 3.203125, + "learning_rate": 2e-05, + "loss": 2.7409, + "num_input_tokens_seen": 895483904, + "step": 427 + }, + { + "epoch": 0.09, + "grad_norm": 7.8125, + "learning_rate": 2e-05, + "loss": 2.792, + "num_input_tokens_seen": 897581056, + "step": 428 + }, + { + "epoch": 0.09, + "grad_norm": 7.75, + "learning_rate": 2e-05, + "loss": 2.769, + "num_input_tokens_seen": 899678208, + "step": 429 + }, + { + "epoch": 0.09, + "grad_norm": 4.03125, + "learning_rate": 2e-05, + "loss": 2.7214, + "num_input_tokens_seen": 901775360, + "step": 430 + }, + { + "epoch": 0.09, + "grad_norm": 10.5625, + "learning_rate": 2e-05, + "loss": 2.7974, + "num_input_tokens_seen": 903872512, + "step": 431 + }, + { + "epoch": 0.09, + "grad_norm": 11.625, + "learning_rate": 2e-05, + "loss": 2.8303, + "num_input_tokens_seen": 905969664, + "step": 432 + }, + { + "epoch": 0.09, + "grad_norm": 7.78125, + "learning_rate": 2e-05, + "loss": 2.8003, + "num_input_tokens_seen": 908066816, + "step": 433 + }, + { + "epoch": 0.09, + "grad_norm": 3.859375, + "learning_rate": 2e-05, + "loss": 2.7659, + "num_input_tokens_seen": 910163968, + "step": 434 + }, + { + "epoch": 0.09, + "grad_norm": 5.4375, + "learning_rate": 2e-05, + "loss": 2.7698, + "num_input_tokens_seen": 912261120, + "step": 435 + }, + { + "epoch": 0.09, + "grad_norm": 4.71875, + "learning_rate": 2e-05, + "loss": 2.7832, + "num_input_tokens_seen": 914358272, + "step": 436 + }, + { + "epoch": 0.09, + "grad_norm": 2.703125, + "learning_rate": 2e-05, + "loss": 2.7357, + "num_input_tokens_seen": 916455424, + "step": 437 + }, + { + "epoch": 0.09, + "grad_norm": 4.21875, + "learning_rate": 2e-05, + "loss": 2.7697, + "num_input_tokens_seen": 918552576, + "step": 438 + }, + { + "epoch": 0.09, + "grad_norm": 3.703125, + "learning_rate": 2e-05, + "loss": 2.7555, + "num_input_tokens_seen": 920649728, + "step": 439 + }, + { + "epoch": 0.09, + "grad_norm": 2.4375, + "learning_rate": 2e-05, + "loss": 2.7363, + "num_input_tokens_seen": 922746880, + "step": 440 + }, + { + "epoch": 0.09, + "grad_norm": 3.8125, + "learning_rate": 2e-05, + "loss": 2.7422, + "num_input_tokens_seen": 924844032, + "step": 441 + }, + { + "epoch": 0.09, + "grad_norm": 3.515625, + "learning_rate": 2e-05, + "loss": 2.737, + "num_input_tokens_seen": 926941184, + "step": 442 + }, + { + "epoch": 0.09, + "grad_norm": 2.140625, + "learning_rate": 2e-05, + "loss": 2.7024, + "num_input_tokens_seen": 929038336, + "step": 443 + }, + { + "epoch": 0.09, + "grad_norm": 3.78125, + "learning_rate": 2e-05, + "loss": 2.7377, + "num_input_tokens_seen": 931135488, + "step": 444 + }, + { + "epoch": 0.09, + "grad_norm": 4.0625, + "learning_rate": 2e-05, + "loss": 2.7484, + "num_input_tokens_seen": 933232640, + "step": 445 + }, + { + "epoch": 0.09, + "grad_norm": 2.5625, + "learning_rate": 2e-05, + "loss": 2.7241, + "num_input_tokens_seen": 935329792, + "step": 446 + }, + { + "epoch": 0.09, + "grad_norm": 2.796875, + "learning_rate": 2e-05, + "loss": 2.6985, + "num_input_tokens_seen": 937426944, + "step": 447 + }, + { + "epoch": 0.09, + "grad_norm": 3.53125, + "learning_rate": 2e-05, + "loss": 2.723, + "num_input_tokens_seen": 939524096, + "step": 448 + }, + { + "epoch": 0.09, + "grad_norm": 2.75, + "learning_rate": 2e-05, + "loss": 2.7293, + "num_input_tokens_seen": 941621248, + "step": 449 + }, + { + "epoch": 0.09, + "grad_norm": 1.8828125, + "learning_rate": 2e-05, + "loss": 2.7126, + "num_input_tokens_seen": 943718400, + "step": 450 + }, + { + "epoch": 0.09, + "grad_norm": 2.578125, + "learning_rate": 2e-05, + "loss": 2.7239, + "num_input_tokens_seen": 945815552, + "step": 451 + }, + { + "epoch": 0.09, + "grad_norm": 2.140625, + "learning_rate": 2e-05, + "loss": 2.7052, + "num_input_tokens_seen": 947912704, + "step": 452 + }, + { + "epoch": 0.1, + "grad_norm": 1.8359375, + "learning_rate": 2e-05, + "loss": 2.7283, + "num_input_tokens_seen": 950009856, + "step": 453 + }, + { + "epoch": 0.1, + "grad_norm": 2.375, + "learning_rate": 2e-05, + "loss": 2.7342, + "num_input_tokens_seen": 952107008, + "step": 454 + }, + { + "epoch": 0.1, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 2.7049, + "num_input_tokens_seen": 954204160, + "step": 455 + }, + { + "epoch": 0.1, + "grad_norm": 2.25, + "learning_rate": 2e-05, + "loss": 2.7269, + "num_input_tokens_seen": 956301312, + "step": 456 + }, + { + "epoch": 0.1, + "grad_norm": 2.390625, + "learning_rate": 2e-05, + "loss": 2.7227, + "num_input_tokens_seen": 958398464, + "step": 457 + }, + { + "epoch": 0.1, + "grad_norm": 1.8046875, + "learning_rate": 2e-05, + "loss": 2.7137, + "num_input_tokens_seen": 960495616, + "step": 458 + }, + { + "epoch": 0.1, + "grad_norm": 2.359375, + "learning_rate": 2e-05, + "loss": 2.6715, + "num_input_tokens_seen": 962592768, + "step": 459 + }, + { + "epoch": 0.1, + "grad_norm": 2.5, + "learning_rate": 2e-05, + "loss": 2.7087, + "num_input_tokens_seen": 964689920, + "step": 460 + }, + { + "epoch": 0.1, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 2.7266, + "num_input_tokens_seen": 966787072, + "step": 461 + }, + { + "epoch": 0.1, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 2.6943, + "num_input_tokens_seen": 968884224, + "step": 462 + }, + { + "epoch": 0.1, + "grad_norm": 2.15625, + "learning_rate": 2e-05, + "loss": 2.6758, + "num_input_tokens_seen": 970981376, + "step": 463 + }, + { + "epoch": 0.1, + "grad_norm": 2.15625, + "learning_rate": 2e-05, + "loss": 2.6947, + "num_input_tokens_seen": 973078528, + "step": 464 + }, + { + "epoch": 0.1, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 2.7042, + "num_input_tokens_seen": 975175680, + "step": 465 + }, + { + "epoch": 0.1, + "grad_norm": 1.890625, + "learning_rate": 2e-05, + "loss": 2.7282, + "num_input_tokens_seen": 977272832, + "step": 466 + }, + { + "epoch": 0.1, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 2.6719, + "num_input_tokens_seen": 979369984, + "step": 467 + }, + { + "epoch": 0.1, + "grad_norm": 1.828125, + "learning_rate": 2e-05, + "loss": 2.7167, + "num_input_tokens_seen": 981467136, + "step": 468 + }, + { + "epoch": 0.1, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 2.6965, + "num_input_tokens_seen": 983564288, + "step": 469 + }, + { + "epoch": 0.1, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 2.7068, + "num_input_tokens_seen": 985661440, + "step": 470 + }, + { + "epoch": 0.1, + "grad_norm": 1.7578125, + "learning_rate": 2e-05, + "loss": 2.6865, + "num_input_tokens_seen": 987758592, + "step": 471 + }, + { + "epoch": 0.1, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 2.7114, + "num_input_tokens_seen": 989855744, + "step": 472 + }, + { + "epoch": 0.1, + "grad_norm": 1.40625, + "learning_rate": 2e-05, + "loss": 2.6863, + "num_input_tokens_seen": 991952896, + "step": 473 + }, + { + "epoch": 0.1, + "grad_norm": 1.8203125, + "learning_rate": 2e-05, + "loss": 2.6849, + "num_input_tokens_seen": 994050048, + "step": 474 + }, + { + "epoch": 0.1, + "grad_norm": 1.5546875, + "learning_rate": 2e-05, + "loss": 2.6971, + "num_input_tokens_seen": 996147200, + "step": 475 + }, + { + "epoch": 0.1, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 2.6676, + "num_input_tokens_seen": 998244352, + "step": 476 + }, + { + "epoch": 0.1, + "grad_norm": 1.9765625, + "learning_rate": 2e-05, + "loss": 2.6912, + "num_input_tokens_seen": 1000341504, + "step": 477 + }, + { + "epoch": 0.1, + "grad_norm": 1.75, + "learning_rate": 2e-05, + "loss": 2.7104, + "num_input_tokens_seen": 1002438656, + "step": 478 + }, + { + "epoch": 0.1, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 2.6734, + "num_input_tokens_seen": 1004535808, + "step": 479 + }, + { + "epoch": 0.1, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 2.7041, + "num_input_tokens_seen": 1006632960, + "step": 480 + }, + { + "epoch": 0.1, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 2.6588, + "num_input_tokens_seen": 1008730112, + "step": 481 + }, + { + "epoch": 0.1, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 2.6661, + "num_input_tokens_seen": 1010827264, + "step": 482 + }, + { + "epoch": 0.1, + "grad_norm": 1.7578125, + "learning_rate": 2e-05, + "loss": 2.6809, + "num_input_tokens_seen": 1012924416, + "step": 483 + }, + { + "epoch": 0.1, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 2.6699, + "num_input_tokens_seen": 1015021568, + "step": 484 + }, + { + "epoch": 0.1, + "grad_norm": 1.40625, + "learning_rate": 2e-05, + "loss": 2.6395, + "num_input_tokens_seen": 1017118720, + "step": 485 + }, + { + "epoch": 0.1, + "grad_norm": 1.296875, + "learning_rate": 2e-05, + "loss": 2.6861, + "num_input_tokens_seen": 1019215872, + "step": 486 + }, + { + "epoch": 0.1, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 2.6745, + "num_input_tokens_seen": 1021313024, + "step": 487 + }, + { + "epoch": 0.1, + "grad_norm": 2.125, + "learning_rate": 2e-05, + "loss": 2.6688, + "num_input_tokens_seen": 1023410176, + "step": 488 + }, + { + "epoch": 0.1, + "grad_norm": 1.9296875, + "learning_rate": 2e-05, + "loss": 2.684, + "num_input_tokens_seen": 1025507328, + "step": 489 + }, + { + "epoch": 0.1, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 2.681, + "num_input_tokens_seen": 1027604480, + "step": 490 + }, + { + "epoch": 0.1, + "grad_norm": 1.7734375, + "learning_rate": 2e-05, + "loss": 2.6767, + "num_input_tokens_seen": 1029701632, + "step": 491 + }, + { + "epoch": 0.1, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 2.6788, + "num_input_tokens_seen": 1031798784, + "step": 492 + }, + { + "epoch": 0.1, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 2.6646, + "num_input_tokens_seen": 1033895936, + "step": 493 + }, + { + "epoch": 0.1, + "grad_norm": 1.671875, + "learning_rate": 2e-05, + "loss": 2.6781, + "num_input_tokens_seen": 1035993088, + "step": 494 + }, + { + "epoch": 0.1, + "grad_norm": 1.9921875, + "learning_rate": 2e-05, + "loss": 2.6442, + "num_input_tokens_seen": 1038090240, + "step": 495 + }, + { + "epoch": 0.1, + "grad_norm": 1.78125, + "learning_rate": 2e-05, + "loss": 2.675, + "num_input_tokens_seen": 1040187392, + "step": 496 + }, + { + "epoch": 0.1, + "grad_norm": 1.3828125, + "learning_rate": 2e-05, + "loss": 2.6755, + "num_input_tokens_seen": 1042284544, + "step": 497 + }, + { + "epoch": 0.1, + "grad_norm": 1.84375, + "learning_rate": 2e-05, + "loss": 2.6628, + "num_input_tokens_seen": 1044381696, + "step": 498 + }, + { + "epoch": 0.1, + "grad_norm": 1.28125, + "learning_rate": 2e-05, + "loss": 2.641, + "num_input_tokens_seen": 1046478848, + "step": 499 + }, + { + "epoch": 0.1, + "grad_norm": 2.078125, + "learning_rate": 2e-05, + "loss": 2.6776, + "num_input_tokens_seen": 1048576000, + "step": 500 + }, + { + "epoch": 0.11, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 2.6597, + "num_input_tokens_seen": 1050673152, + "step": 501 + }, + { + "epoch": 0.11, + "grad_norm": 2.234375, + "learning_rate": 2e-05, + "loss": 2.6489, + "num_input_tokens_seen": 1052770304, + "step": 502 + }, + { + "epoch": 0.11, + "grad_norm": 2.359375, + "learning_rate": 2e-05, + "loss": 2.6456, + "num_input_tokens_seen": 1054867456, + "step": 503 + }, + { + "epoch": 0.11, + "grad_norm": 1.34375, + "learning_rate": 2e-05, + "loss": 2.6376, + "num_input_tokens_seen": 1056964608, + "step": 504 + }, + { + "epoch": 0.11, + "grad_norm": 1.75, + "learning_rate": 2e-05, + "loss": 2.639, + "num_input_tokens_seen": 1059061760, + "step": 505 + }, + { + "epoch": 0.11, + "grad_norm": 1.5703125, + "learning_rate": 2e-05, + "loss": 2.648, + "num_input_tokens_seen": 1061158912, + "step": 506 + }, + { + "epoch": 0.11, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 2.6524, + "num_input_tokens_seen": 1063256064, + "step": 507 + }, + { + "epoch": 0.11, + "grad_norm": 1.5234375, + "learning_rate": 2e-05, + "loss": 2.6346, + "num_input_tokens_seen": 1065353216, + "step": 508 + }, + { + "epoch": 0.11, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 2.6405, + "num_input_tokens_seen": 1067450368, + "step": 509 + }, + { + "epoch": 0.11, + "grad_norm": 1.5546875, + "learning_rate": 2e-05, + "loss": 2.6305, + "num_input_tokens_seen": 1069547520, + "step": 510 + }, + { + "epoch": 0.11, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 2.6503, + "num_input_tokens_seen": 1071644672, + "step": 511 + }, + { + "epoch": 0.11, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 2.6229, + "num_input_tokens_seen": 1073741824, + "step": 512 + }, + { + "epoch": 0.11, + "grad_norm": 1.90625, + "learning_rate": 2e-05, + "loss": 2.6231, + "num_input_tokens_seen": 1075838976, + "step": 513 + }, + { + "epoch": 0.11, + "grad_norm": 1.359375, + "learning_rate": 2e-05, + "loss": 2.6399, + "num_input_tokens_seen": 1077936128, + "step": 514 + }, + { + "epoch": 0.11, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 2.62, + "num_input_tokens_seen": 1080033280, + "step": 515 + }, + { + "epoch": 0.11, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 2.6226, + "num_input_tokens_seen": 1082130432, + "step": 516 + }, + { + "epoch": 0.11, + "grad_norm": 1.3515625, + "learning_rate": 2e-05, + "loss": 2.6314, + "num_input_tokens_seen": 1084227584, + "step": 517 + }, + { + "epoch": 0.11, + "grad_norm": 1.5234375, + "learning_rate": 2e-05, + "loss": 2.6009, + "num_input_tokens_seen": 1086324736, + "step": 518 + }, + { + "epoch": 0.11, + "grad_norm": 1.3359375, + "learning_rate": 2e-05, + "loss": 2.6375, + "num_input_tokens_seen": 1088421888, + "step": 519 + }, + { + "epoch": 0.11, + "grad_norm": 1.2734375, + "learning_rate": 2e-05, + "loss": 2.6241, + "num_input_tokens_seen": 1090519040, + "step": 520 + }, + { + "epoch": 0.11, + "grad_norm": 1.3671875, + "learning_rate": 2e-05, + "loss": 2.6293, + "num_input_tokens_seen": 1092616192, + "step": 521 + }, + { + "epoch": 0.11, + "grad_norm": 1.2421875, + "learning_rate": 2e-05, + "loss": 2.6184, + "num_input_tokens_seen": 1094713344, + "step": 522 + }, + { + "epoch": 0.11, + "grad_norm": 1.3671875, + "learning_rate": 2e-05, + "loss": 2.6116, + "num_input_tokens_seen": 1096810496, + "step": 523 + }, + { + "epoch": 0.11, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 2.5915, + "num_input_tokens_seen": 1098907648, + "step": 524 + }, + { + "epoch": 0.11, + "grad_norm": 1.203125, + "learning_rate": 2e-05, + "loss": 2.6169, + "num_input_tokens_seen": 1101004800, + "step": 525 + }, + { + "epoch": 0.11, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 2.596, + "num_input_tokens_seen": 1103101952, + "step": 526 + }, + { + "epoch": 0.11, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 2.6431, + "num_input_tokens_seen": 1105199104, + "step": 527 + }, + { + "epoch": 0.11, + "grad_norm": 1.7890625, + "learning_rate": 2e-05, + "loss": 2.6304, + "num_input_tokens_seen": 1107296256, + "step": 528 + }, + { + "epoch": 0.11, + "grad_norm": 1.359375, + "learning_rate": 2e-05, + "loss": 2.6342, + "num_input_tokens_seen": 1109393408, + "step": 529 + }, + { + "epoch": 0.11, + "grad_norm": 1.96875, + "learning_rate": 2e-05, + "loss": 2.6268, + "num_input_tokens_seen": 1111490560, + "step": 530 + }, + { + "epoch": 0.11, + "grad_norm": 1.9140625, + "learning_rate": 2e-05, + "loss": 2.6557, + "num_input_tokens_seen": 1113587712, + "step": 531 + }, + { + "epoch": 0.11, + "grad_norm": 1.703125, + "learning_rate": 2e-05, + "loss": 2.609, + "num_input_tokens_seen": 1115684864, + "step": 532 + }, + { + "epoch": 0.11, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 2.5872, + "num_input_tokens_seen": 1117782016, + "step": 533 + }, + { + "epoch": 0.11, + "grad_norm": 1.9453125, + "learning_rate": 2e-05, + "loss": 2.6239, + "num_input_tokens_seen": 1119879168, + "step": 534 + }, + { + "epoch": 0.11, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 2.5891, + "num_input_tokens_seen": 1121976320, + "step": 535 + }, + { + "epoch": 0.11, + "grad_norm": 1.7734375, + "learning_rate": 2e-05, + "loss": 2.6165, + "num_input_tokens_seen": 1124073472, + "step": 536 + }, + { + "epoch": 0.11, + "grad_norm": 2.046875, + "learning_rate": 2e-05, + "loss": 2.6474, + "num_input_tokens_seen": 1126170624, + "step": 537 + }, + { + "epoch": 0.11, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 2.6127, + "num_input_tokens_seen": 1128267776, + "step": 538 + }, + { + "epoch": 0.11, + "grad_norm": 1.90625, + "learning_rate": 2e-05, + "loss": 2.6028, + "num_input_tokens_seen": 1130364928, + "step": 539 + }, + { + "epoch": 0.11, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 2.5819, + "num_input_tokens_seen": 1132462080, + "step": 540 + }, + { + "epoch": 0.11, + "grad_norm": 1.7734375, + "learning_rate": 2e-05, + "loss": 2.5896, + "num_input_tokens_seen": 1134559232, + "step": 541 + }, + { + "epoch": 0.11, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 2.5851, + "num_input_tokens_seen": 1136656384, + "step": 542 + }, + { + "epoch": 0.11, + "grad_norm": 1.484375, + "learning_rate": 2e-05, + "loss": 2.5615, + "num_input_tokens_seen": 1138753536, + "step": 543 + }, + { + "epoch": 0.11, + "grad_norm": 1.5703125, + "learning_rate": 2e-05, + "loss": 2.601, + "num_input_tokens_seen": 1140850688, + "step": 544 + }, + { + "epoch": 0.11, + "grad_norm": 1.3125, + "learning_rate": 2e-05, + "loss": 2.6026, + "num_input_tokens_seen": 1142947840, + "step": 545 + }, + { + "epoch": 0.11, + "grad_norm": 1.75, + "learning_rate": 2e-05, + "loss": 2.6076, + "num_input_tokens_seen": 1145044992, + "step": 546 + }, + { + "epoch": 0.11, + "grad_norm": 1.265625, + "learning_rate": 2e-05, + "loss": 2.6204, + "num_input_tokens_seen": 1147142144, + "step": 547 + }, + { + "epoch": 0.11, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 2.6222, + "num_input_tokens_seen": 1149239296, + "step": 548 + }, + { + "epoch": 0.12, + "grad_norm": 1.328125, + "learning_rate": 2e-05, + "loss": 2.62, + "num_input_tokens_seen": 1151336448, + "step": 549 + }, + { + "epoch": 0.12, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 2.6029, + "num_input_tokens_seen": 1153433600, + "step": 550 + }, + { + "epoch": 0.12, + "grad_norm": 1.7421875, + "learning_rate": 2e-05, + "loss": 2.6162, + "num_input_tokens_seen": 1155530752, + "step": 551 + }, + { + "epoch": 0.12, + "grad_norm": 1.3046875, + "learning_rate": 2e-05, + "loss": 2.5789, + "num_input_tokens_seen": 1157627904, + "step": 552 + }, + { + "epoch": 0.12, + "grad_norm": 1.9453125, + "learning_rate": 2e-05, + "loss": 2.5987, + "num_input_tokens_seen": 1159725056, + "step": 553 + }, + { + "epoch": 0.12, + "grad_norm": 1.671875, + "learning_rate": 2e-05, + "loss": 2.5675, + "num_input_tokens_seen": 1161822208, + "step": 554 + }, + { + "epoch": 0.12, + "grad_norm": 1.40625, + "learning_rate": 2e-05, + "loss": 2.5748, + "num_input_tokens_seen": 1163919360, + "step": 555 + }, + { + "epoch": 0.12, + "grad_norm": 1.703125, + "learning_rate": 2e-05, + "loss": 2.626, + "num_input_tokens_seen": 1166016512, + "step": 556 + }, + { + "epoch": 0.12, + "grad_norm": 1.40625, + "learning_rate": 2e-05, + "loss": 2.6002, + "num_input_tokens_seen": 1168113664, + "step": 557 + }, + { + "epoch": 0.12, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 2.6136, + "num_input_tokens_seen": 1170210816, + "step": 558 + }, + { + "epoch": 0.12, + "grad_norm": 1.1953125, + "learning_rate": 2e-05, + "loss": 2.5847, + "num_input_tokens_seen": 1172307968, + "step": 559 + }, + { + "epoch": 0.12, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 2.5762, + "num_input_tokens_seen": 1174405120, + "step": 560 + }, + { + "epoch": 0.12, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 2.5625, + "num_input_tokens_seen": 1176502272, + "step": 561 + }, + { + "epoch": 0.12, + "grad_norm": 1.4296875, + "learning_rate": 2e-05, + "loss": 2.5663, + "num_input_tokens_seen": 1178599424, + "step": 562 + }, + { + "epoch": 0.12, + "grad_norm": 1.703125, + "learning_rate": 2e-05, + "loss": 2.5882, + "num_input_tokens_seen": 1180696576, + "step": 563 + }, + { + "epoch": 0.12, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 2.5758, + "num_input_tokens_seen": 1182793728, + "step": 564 + }, + { + "epoch": 0.12, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 2.5826, + "num_input_tokens_seen": 1184890880, + "step": 565 + }, + { + "epoch": 0.12, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 2.5991, + "num_input_tokens_seen": 1186988032, + "step": 566 + }, + { + "epoch": 0.12, + "grad_norm": 1.203125, + "learning_rate": 2e-05, + "loss": 2.5852, + "num_input_tokens_seen": 1189085184, + "step": 567 + }, + { + "epoch": 0.12, + "grad_norm": 1.890625, + "learning_rate": 2e-05, + "loss": 2.6272, + "num_input_tokens_seen": 1191182336, + "step": 568 + }, + { + "epoch": 0.12, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 2.5717, + "num_input_tokens_seen": 1193279488, + "step": 569 + }, + { + "epoch": 0.12, + "grad_norm": 1.28125, + "learning_rate": 2e-05, + "loss": 2.5766, + "num_input_tokens_seen": 1195376640, + "step": 570 + }, + { + "epoch": 0.12, + "grad_norm": 1.8203125, + "learning_rate": 2e-05, + "loss": 2.5873, + "num_input_tokens_seen": 1197473792, + "step": 571 + }, + { + "epoch": 0.12, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 2.5767, + "num_input_tokens_seen": 1199570944, + "step": 572 + }, + { + "epoch": 0.12, + "grad_norm": 1.2890625, + "learning_rate": 2e-05, + "loss": 2.5672, + "num_input_tokens_seen": 1201668096, + "step": 573 + }, + { + "epoch": 0.12, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 2.6013, + "num_input_tokens_seen": 1203765248, + "step": 574 + }, + { + "epoch": 0.12, + "grad_norm": 1.828125, + "learning_rate": 2e-05, + "loss": 2.5735, + "num_input_tokens_seen": 1205862400, + "step": 575 + }, + { + "epoch": 0.12, + "grad_norm": 1.484375, + "learning_rate": 2e-05, + "loss": 2.5683, + "num_input_tokens_seen": 1207959552, + "step": 576 + }, + { + "epoch": 0.12, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 2.571, + "num_input_tokens_seen": 1210056704, + "step": 577 + }, + { + "epoch": 0.12, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 2.5195, + "num_input_tokens_seen": 1212153856, + "step": 578 + }, + { + "epoch": 0.12, + "grad_norm": 1.203125, + "learning_rate": 2e-05, + "loss": 2.6242, + "num_input_tokens_seen": 1214251008, + "step": 579 + }, + { + "epoch": 0.12, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 2.5576, + "num_input_tokens_seen": 1216348160, + "step": 580 + }, + { + "epoch": 0.12, + "grad_norm": 1.328125, + "learning_rate": 2e-05, + "loss": 2.5746, + "num_input_tokens_seen": 1218445312, + "step": 581 + }, + { + "epoch": 0.12, + "grad_norm": 1.3671875, + "learning_rate": 2e-05, + "loss": 2.5561, + "num_input_tokens_seen": 1220542464, + "step": 582 + }, + { + "epoch": 0.12, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 2.6001, + "num_input_tokens_seen": 1222639616, + "step": 583 + }, + { + "epoch": 0.12, + "grad_norm": 1.375, + "learning_rate": 2e-05, + "loss": 2.5807, + "num_input_tokens_seen": 1224736768, + "step": 584 + }, + { + "epoch": 0.12, + "grad_norm": 1.234375, + "learning_rate": 2e-05, + "loss": 2.5861, + "num_input_tokens_seen": 1226833920, + "step": 585 + }, + { + "epoch": 0.12, + "grad_norm": 1.484375, + "learning_rate": 2e-05, + "loss": 2.5884, + "num_input_tokens_seen": 1228931072, + "step": 586 + }, + { + "epoch": 0.12, + "grad_norm": 1.2421875, + "learning_rate": 2e-05, + "loss": 2.56, + "num_input_tokens_seen": 1231028224, + "step": 587 + }, + { + "epoch": 0.12, + "grad_norm": 1.3046875, + "learning_rate": 2e-05, + "loss": 2.5724, + "num_input_tokens_seen": 1233125376, + "step": 588 + }, + { + "epoch": 0.12, + "grad_norm": 1.375, + "learning_rate": 2e-05, + "loss": 2.5797, + "num_input_tokens_seen": 1235222528, + "step": 589 + }, + { + "epoch": 0.12, + "grad_norm": 1.7578125, + "learning_rate": 2e-05, + "loss": 2.5742, + "num_input_tokens_seen": 1237319680, + "step": 590 + }, + { + "epoch": 0.12, + "grad_norm": 1.265625, + "learning_rate": 2e-05, + "loss": 2.5676, + "num_input_tokens_seen": 1239416832, + "step": 591 + }, + { + "epoch": 0.12, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 2.5713, + "num_input_tokens_seen": 1241513984, + "step": 592 + }, + { + "epoch": 0.12, + "grad_norm": 1.7421875, + "learning_rate": 2e-05, + "loss": 2.5664, + "num_input_tokens_seen": 1243611136, + "step": 593 + }, + { + "epoch": 0.12, + "grad_norm": 1.1640625, + "learning_rate": 2e-05, + "loss": 2.5582, + "num_input_tokens_seen": 1245708288, + "step": 594 + }, + { + "epoch": 0.12, + "grad_norm": 1.296875, + "learning_rate": 2e-05, + "loss": 2.5555, + "num_input_tokens_seen": 1247805440, + "step": 595 + }, + { + "epoch": 0.12, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 2.5855, + "num_input_tokens_seen": 1249902592, + "step": 596 + }, + { + "epoch": 0.13, + "grad_norm": 1.390625, + "learning_rate": 2e-05, + "loss": 2.5575, + "num_input_tokens_seen": 1251999744, + "step": 597 + }, + { + "epoch": 0.13, + "eval_loss": 2.5605051517486572, + "eval_runtime": 1935.8541, + "eval_samples_per_second": 2.036, + "eval_steps_per_second": 0.509, + "num_input_tokens_seen": 1251999744, + "step": 597 + }, + { + "epoch": 0.13, + "grad_norm": 1.3515625, + "learning_rate": 2e-05, + "loss": 2.5658, + "num_input_tokens_seen": 1254096896, + "step": 598 + }, + { + "epoch": 0.13, + "grad_norm": 1.3046875, + "learning_rate": 2e-05, + "loss": 2.5309, + "num_input_tokens_seen": 1256194048, + "step": 599 + }, + { + "epoch": 0.13, + "grad_norm": 1.1953125, + "learning_rate": 2e-05, + "loss": 2.5477, + "num_input_tokens_seen": 1258291200, + "step": 600 + }, + { + "epoch": 0.13, + "grad_norm": 1.5234375, + "learning_rate": 2e-05, + "loss": 2.592, + "num_input_tokens_seen": 1260388352, + "step": 601 + }, + { + "epoch": 0.13, + "grad_norm": 1.828125, + "learning_rate": 2e-05, + "loss": 2.5534, + "num_input_tokens_seen": 1262485504, + "step": 602 + }, + { + "epoch": 0.13, + "grad_norm": 1.390625, + "learning_rate": 2e-05, + "loss": 2.5679, + "num_input_tokens_seen": 1264582656, + "step": 603 + }, + { + "epoch": 0.13, + "grad_norm": 1.7734375, + "learning_rate": 2e-05, + "loss": 2.5482, + "num_input_tokens_seen": 1266679808, + "step": 604 + }, + { + "epoch": 0.13, + "grad_norm": 1.75, + "learning_rate": 2e-05, + "loss": 2.5591, + "num_input_tokens_seen": 1268776960, + "step": 605 + }, + { + "epoch": 0.13, + "grad_norm": 5.21875, + "learning_rate": 2e-05, + "loss": 2.5742, + "num_input_tokens_seen": 1270874112, + "step": 606 + }, + { + "epoch": 0.13, + "grad_norm": 1.9375, + "learning_rate": 2e-05, + "loss": 2.5941, + "num_input_tokens_seen": 1272971264, + "step": 607 + }, + { + "epoch": 0.13, + "grad_norm": 1.6640625, + "learning_rate": 2e-05, + "loss": 2.5539, + "num_input_tokens_seen": 1275068416, + "step": 608 + }, + { + "epoch": 0.13, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 2.5596, + "num_input_tokens_seen": 1277165568, + "step": 609 + }, + { + "epoch": 0.13, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 2.5698, + "num_input_tokens_seen": 1279262720, + "step": 610 + }, + { + "epoch": 0.13, + "grad_norm": 1.125, + "learning_rate": 2e-05, + "loss": 2.5453, + "num_input_tokens_seen": 1281359872, + "step": 611 + }, + { + "epoch": 0.13, + "grad_norm": 1.3984375, + "learning_rate": 2e-05, + "loss": 2.5818, + "num_input_tokens_seen": 1283457024, + "step": 612 + }, + { + "epoch": 0.13, + "grad_norm": 1.359375, + "learning_rate": 2e-05, + "loss": 2.5525, + "num_input_tokens_seen": 1285554176, + "step": 613 + }, + { + "epoch": 0.13, + "grad_norm": 1.3515625, + "learning_rate": 2e-05, + "loss": 2.5555, + "num_input_tokens_seen": 1287651328, + "step": 614 + }, + { + "epoch": 0.13, + "grad_norm": 1.0859375, + "learning_rate": 2e-05, + "loss": 2.5297, + "num_input_tokens_seen": 1289748480, + "step": 615 + }, + { + "epoch": 0.13, + "grad_norm": 1.28125, + "learning_rate": 2e-05, + "loss": 2.5414, + "num_input_tokens_seen": 1291845632, + "step": 616 + }, + { + "epoch": 0.13, + "grad_norm": 1.2265625, + "learning_rate": 2e-05, + "loss": 2.5455, + "num_input_tokens_seen": 1293942784, + "step": 617 + }, + { + "epoch": 0.13, + "grad_norm": 1.484375, + "learning_rate": 2e-05, + "loss": 2.5228, + "num_input_tokens_seen": 1296039936, + "step": 618 + }, + { + "epoch": 0.13, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 2.5407, + "num_input_tokens_seen": 1298137088, + "step": 619 + }, + { + "epoch": 0.13, + "grad_norm": 1.203125, + "learning_rate": 2e-05, + "loss": 2.5332, + "num_input_tokens_seen": 1300234240, + "step": 620 + }, + { + "epoch": 0.13, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 2.5389, + "num_input_tokens_seen": 1302331392, + "step": 621 + }, + { + "epoch": 0.13, + "grad_norm": 1.171875, + "learning_rate": 2e-05, + "loss": 2.5826, + "num_input_tokens_seen": 1304428544, + "step": 622 + }, + { + "epoch": 0.13, + "grad_norm": 1.3359375, + "learning_rate": 2e-05, + "loss": 2.5491, + "num_input_tokens_seen": 1306525696, + "step": 623 + }, + { + "epoch": 0.13, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 2.5525, + "num_input_tokens_seen": 1308622848, + "step": 624 + }, + { + "epoch": 0.13, + "grad_norm": 1.328125, + "learning_rate": 2e-05, + "loss": 2.5601, + "num_input_tokens_seen": 1310720000, + "step": 625 + }, + { + "epoch": 0.13, + "grad_norm": 1.8515625, + "learning_rate": 2e-05, + "loss": 2.5455, + "num_input_tokens_seen": 1312817152, + "step": 626 + }, + { + "epoch": 0.13, + "grad_norm": 1.8203125, + "learning_rate": 2e-05, + "loss": 2.5564, + "num_input_tokens_seen": 1314914304, + "step": 627 + }, + { + "epoch": 0.13, + "grad_norm": 1.34375, + "learning_rate": 2e-05, + "loss": 2.5425, + "num_input_tokens_seen": 1317011456, + "step": 628 + }, + { + "epoch": 0.13, + "grad_norm": 1.9609375, + "learning_rate": 2e-05, + "loss": 2.5185, + "num_input_tokens_seen": 1319108608, + "step": 629 + }, + { + "epoch": 0.13, + "grad_norm": 1.609375, + "learning_rate": 2e-05, + "loss": 2.5366, + "num_input_tokens_seen": 1321205760, + "step": 630 + }, + { + "epoch": 0.13, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 2.5214, + "num_input_tokens_seen": 1323302912, + "step": 631 + }, + { + "epoch": 0.13, + "grad_norm": 1.296875, + "learning_rate": 2e-05, + "loss": 2.551, + "num_input_tokens_seen": 1325400064, + "step": 632 + }, + { + "epoch": 0.13, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 2.556, + "num_input_tokens_seen": 1327497216, + "step": 633 + }, + { + "epoch": 0.13, + "grad_norm": 1.3125, + "learning_rate": 2e-05, + "loss": 2.5366, + "num_input_tokens_seen": 1329594368, + "step": 634 + }, + { + "epoch": 0.13, + "grad_norm": 1.28125, + "learning_rate": 2e-05, + "loss": 2.5448, + "num_input_tokens_seen": 1331691520, + "step": 635 + }, + { + "epoch": 0.13, + "grad_norm": 1.359375, + "learning_rate": 2e-05, + "loss": 2.5318, + "num_input_tokens_seen": 1333788672, + "step": 636 + }, + { + "epoch": 0.13, + "grad_norm": 1.171875, + "learning_rate": 2e-05, + "loss": 2.5578, + "num_input_tokens_seen": 1335885824, + "step": 637 + }, + { + "epoch": 0.13, + "grad_norm": 1.28125, + "learning_rate": 2e-05, + "loss": 2.5275, + "num_input_tokens_seen": 1337982976, + "step": 638 + }, + { + "epoch": 0.13, + "grad_norm": 0.99609375, + "learning_rate": 2e-05, + "loss": 2.5075, + "num_input_tokens_seen": 1340080128, + "step": 639 + }, + { + "epoch": 0.13, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 2.5553, + "num_input_tokens_seen": 1342177280, + "step": 640 + }, + { + "epoch": 0.13, + "grad_norm": 1.3671875, + "learning_rate": 2e-05, + "loss": 2.5386, + "num_input_tokens_seen": 1344274432, + "step": 641 + }, + { + "epoch": 0.13, + "grad_norm": 1.1796875, + "learning_rate": 2e-05, + "loss": 2.5444, + "num_input_tokens_seen": 1346371584, + "step": 642 + }, + { + "epoch": 0.13, + "grad_norm": 1.390625, + "learning_rate": 2e-05, + "loss": 2.5225, + "num_input_tokens_seen": 1348468736, + "step": 643 + }, + { + "epoch": 0.14, + "grad_norm": 1.3203125, + "learning_rate": 2e-05, + "loss": 2.4973, + "num_input_tokens_seen": 1350565888, + "step": 644 + }, + { + "epoch": 0.14, + "grad_norm": 1.3046875, + "learning_rate": 2e-05, + "loss": 2.5117, + "num_input_tokens_seen": 1352663040, + "step": 645 + }, + { + "epoch": 0.14, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 2.5352, + "num_input_tokens_seen": 1354760192, + "step": 646 + }, + { + "epoch": 0.14, + "grad_norm": 1.25, + "learning_rate": 2e-05, + "loss": 2.5621, + "num_input_tokens_seen": 1356857344, + "step": 647 + }, + { + "epoch": 0.14, + "grad_norm": 1.203125, + "learning_rate": 2e-05, + "loss": 2.5334, + "num_input_tokens_seen": 1358954496, + "step": 648 + }, + { + "epoch": 0.14, + "grad_norm": 1.234375, + "learning_rate": 2e-05, + "loss": 2.5368, + "num_input_tokens_seen": 1361051648, + "step": 649 + }, + { + "epoch": 0.14, + "grad_norm": 0.94921875, + "learning_rate": 2e-05, + "loss": 2.4947, + "num_input_tokens_seen": 1363148800, + "step": 650 + }, + { + "epoch": 0.14, + "grad_norm": 1.21875, + "learning_rate": 2e-05, + "loss": 2.5294, + "num_input_tokens_seen": 1365245952, + "step": 651 + }, + { + "epoch": 0.14, + "grad_norm": 1.125, + "learning_rate": 2e-05, + "loss": 2.5132, + "num_input_tokens_seen": 1367343104, + "step": 652 + }, + { + "epoch": 0.14, + "grad_norm": 1.0546875, + "learning_rate": 2e-05, + "loss": 2.5071, + "num_input_tokens_seen": 1369440256, + "step": 653 + }, + { + "epoch": 0.14, + "grad_norm": 1.2109375, + "learning_rate": 2e-05, + "loss": 2.5225, + "num_input_tokens_seen": 1371537408, + "step": 654 + }, + { + "epoch": 0.14, + "grad_norm": 1.2421875, + "learning_rate": 2e-05, + "loss": 2.5289, + "num_input_tokens_seen": 1373634560, + "step": 655 + }, + { + "epoch": 0.14, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.5269, + "num_input_tokens_seen": 1375731712, + "step": 656 + }, + { + "epoch": 0.14, + "grad_norm": 1.7734375, + "learning_rate": 2e-05, + "loss": 2.5342, + "num_input_tokens_seen": 1377828864, + "step": 657 + }, + { + "epoch": 0.14, + "grad_norm": 1.6640625, + "learning_rate": 2e-05, + "loss": 2.5428, + "num_input_tokens_seen": 1379926016, + "step": 658 + }, + { + "epoch": 0.14, + "grad_norm": 1.3046875, + "learning_rate": 2e-05, + "loss": 2.5184, + "num_input_tokens_seen": 1382023168, + "step": 659 + }, + { + "epoch": 0.14, + "grad_norm": 1.125, + "learning_rate": 2e-05, + "loss": 2.5133, + "num_input_tokens_seen": 1384120320, + "step": 660 + }, + { + "epoch": 0.14, + "grad_norm": 1.3125, + "learning_rate": 2e-05, + "loss": 2.5176, + "num_input_tokens_seen": 1386217472, + "step": 661 + }, + { + "epoch": 0.14, + "grad_norm": 1.28125, + "learning_rate": 2e-05, + "loss": 2.5149, + "num_input_tokens_seen": 1388314624, + "step": 662 + }, + { + "epoch": 0.14, + "grad_norm": 1.09375, + "learning_rate": 2e-05, + "loss": 2.4967, + "num_input_tokens_seen": 1390411776, + "step": 663 + }, + { + "epoch": 0.14, + "grad_norm": 1.2578125, + "learning_rate": 2e-05, + "loss": 2.5214, + "num_input_tokens_seen": 1392508928, + "step": 664 + }, + { + "epoch": 0.14, + "grad_norm": 1.25, + "learning_rate": 2e-05, + "loss": 2.5204, + "num_input_tokens_seen": 1394606080, + "step": 665 + }, + { + "epoch": 0.14, + "grad_norm": 1.359375, + "learning_rate": 2e-05, + "loss": 2.5478, + "num_input_tokens_seen": 1396703232, + "step": 666 + }, + { + "epoch": 0.14, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.5185, + "num_input_tokens_seen": 1398800384, + "step": 667 + }, + { + "epoch": 0.14, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 2.5132, + "num_input_tokens_seen": 1400897536, + "step": 668 + }, + { + "epoch": 0.14, + "grad_norm": 1.1875, + "learning_rate": 2e-05, + "loss": 2.5008, + "num_input_tokens_seen": 1402994688, + "step": 669 + }, + { + "epoch": 0.14, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 2.5182, + "num_input_tokens_seen": 1405091840, + "step": 670 + }, + { + "epoch": 0.14, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 2.5165, + "num_input_tokens_seen": 1407188992, + "step": 671 + }, + { + "epoch": 0.14, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 2.4936, + "num_input_tokens_seen": 1409286144, + "step": 672 + }, + { + "epoch": 0.14, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 2.5164, + "num_input_tokens_seen": 1411383296, + "step": 673 + }, + { + "epoch": 0.14, + "grad_norm": 1.2890625, + "learning_rate": 2e-05, + "loss": 2.4825, + "num_input_tokens_seen": 1413480448, + "step": 674 + }, + { + "epoch": 0.14, + "grad_norm": 1.609375, + "learning_rate": 2e-05, + "loss": 2.5515, + "num_input_tokens_seen": 1415577600, + "step": 675 + }, + { + "epoch": 0.14, + "grad_norm": 1.25, + "learning_rate": 2e-05, + "loss": 2.5405, + "num_input_tokens_seen": 1417674752, + "step": 676 + }, + { + "epoch": 0.14, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 2.5031, + "num_input_tokens_seen": 1419771904, + "step": 677 + }, + { + "epoch": 0.14, + "grad_norm": 1.3203125, + "learning_rate": 2e-05, + "loss": 2.5156, + "num_input_tokens_seen": 1421869056, + "step": 678 + }, + { + "epoch": 0.14, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 2.5381, + "num_input_tokens_seen": 1423966208, + "step": 679 + }, + { + "epoch": 0.14, + "grad_norm": 1.234375, + "learning_rate": 2e-05, + "loss": 2.5342, + "num_input_tokens_seen": 1426063360, + "step": 680 + }, + { + "epoch": 0.14, + "grad_norm": 1.1640625, + "learning_rate": 2e-05, + "loss": 2.51, + "num_input_tokens_seen": 1428160512, + "step": 681 + }, + { + "epoch": 0.14, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 2.5108, + "num_input_tokens_seen": 1430257664, + "step": 682 + }, + { + "epoch": 0.14, + "grad_norm": 1.2109375, + "learning_rate": 2e-05, + "loss": 2.5245, + "num_input_tokens_seen": 1432354816, + "step": 683 + }, + { + "epoch": 0.14, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 2.5298, + "num_input_tokens_seen": 1434451968, + "step": 684 + }, + { + "epoch": 0.14, + "grad_norm": 1.7890625, + "learning_rate": 2e-05, + "loss": 2.4931, + "num_input_tokens_seen": 1436549120, + "step": 685 + }, + { + "epoch": 0.14, + "grad_norm": 1.2265625, + "learning_rate": 2e-05, + "loss": 2.5164, + "num_input_tokens_seen": 1438646272, + "step": 686 + }, + { + "epoch": 0.14, + "grad_norm": 1.921875, + "learning_rate": 2e-05, + "loss": 2.512, + "num_input_tokens_seen": 1440743424, + "step": 687 + }, + { + "epoch": 0.14, + "grad_norm": 2.15625, + "learning_rate": 2e-05, + "loss": 2.5037, + "num_input_tokens_seen": 1442840576, + "step": 688 + }, + { + "epoch": 0.14, + "grad_norm": 1.3046875, + "learning_rate": 2e-05, + "loss": 2.5065, + "num_input_tokens_seen": 1444937728, + "step": 689 + }, + { + "epoch": 0.14, + "grad_norm": 1.9375, + "learning_rate": 2e-05, + "loss": 2.5398, + "num_input_tokens_seen": 1447034880, + "step": 690 + }, + { + "epoch": 0.14, + "grad_norm": 2.25, + "learning_rate": 2e-05, + "loss": 2.5232, + "num_input_tokens_seen": 1449132032, + "step": 691 + }, + { + "epoch": 0.15, + "grad_norm": 1.375, + "learning_rate": 2e-05, + "loss": 2.4913, + "num_input_tokens_seen": 1451229184, + "step": 692 + }, + { + "epoch": 0.15, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 2.493, + "num_input_tokens_seen": 1453326336, + "step": 693 + }, + { + "epoch": 0.15, + "grad_norm": 2.109375, + "learning_rate": 2e-05, + "loss": 2.4812, + "num_input_tokens_seen": 1455423488, + "step": 694 + }, + { + "epoch": 0.15, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 2.4951, + "num_input_tokens_seen": 1457520640, + "step": 695 + }, + { + "epoch": 0.15, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 2.4912, + "num_input_tokens_seen": 1459617792, + "step": 696 + }, + { + "epoch": 0.15, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 2.4773, + "num_input_tokens_seen": 1461714944, + "step": 697 + }, + { + "epoch": 0.15, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 2.4976, + "num_input_tokens_seen": 1463812096, + "step": 698 + }, + { + "epoch": 0.15, + "grad_norm": 1.2421875, + "learning_rate": 2e-05, + "loss": 2.5132, + "num_input_tokens_seen": 1465909248, + "step": 699 + }, + { + "epoch": 0.15, + "grad_norm": 1.2890625, + "learning_rate": 2e-05, + "loss": 2.5073, + "num_input_tokens_seen": 1468006400, + "step": 700 + }, + { + "epoch": 0.15, + "grad_norm": 1.234375, + "learning_rate": 2e-05, + "loss": 2.4998, + "num_input_tokens_seen": 1470103552, + "step": 701 + }, + { + "epoch": 0.15, + "grad_norm": 1.359375, + "learning_rate": 2e-05, + "loss": 2.4995, + "num_input_tokens_seen": 1472200704, + "step": 702 + }, + { + "epoch": 0.15, + "grad_norm": 1.234375, + "learning_rate": 2e-05, + "loss": 2.5251, + "num_input_tokens_seen": 1474297856, + "step": 703 + }, + { + "epoch": 0.15, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.4824, + "num_input_tokens_seen": 1476395008, + "step": 704 + }, + { + "epoch": 0.15, + "grad_norm": 1.1328125, + "learning_rate": 2e-05, + "loss": 2.5041, + "num_input_tokens_seen": 1478492160, + "step": 705 + }, + { + "epoch": 0.15, + "grad_norm": 0.9921875, + "learning_rate": 2e-05, + "loss": 2.4976, + "num_input_tokens_seen": 1480589312, + "step": 706 + }, + { + "epoch": 0.15, + "grad_norm": 1.2265625, + "learning_rate": 2e-05, + "loss": 2.4816, + "num_input_tokens_seen": 1482686464, + "step": 707 + }, + { + "epoch": 0.15, + "grad_norm": 1.1796875, + "learning_rate": 2e-05, + "loss": 2.4875, + "num_input_tokens_seen": 1484783616, + "step": 708 + }, + { + "epoch": 0.15, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.4723, + "num_input_tokens_seen": 1486880768, + "step": 709 + }, + { + "epoch": 0.15, + "grad_norm": 1.1171875, + "learning_rate": 2e-05, + "loss": 2.5009, + "num_input_tokens_seen": 1488977920, + "step": 710 + }, + { + "epoch": 0.15, + "grad_norm": 0.93359375, + "learning_rate": 2e-05, + "loss": 2.4659, + "num_input_tokens_seen": 1491075072, + "step": 711 + }, + { + "epoch": 0.15, + "grad_norm": 1.125, + "learning_rate": 2e-05, + "loss": 2.5063, + "num_input_tokens_seen": 1493172224, + "step": 712 + }, + { + "epoch": 0.15, + "grad_norm": 0.90625, + "learning_rate": 2e-05, + "loss": 2.4913, + "num_input_tokens_seen": 1495269376, + "step": 713 + }, + { + "epoch": 0.15, + "grad_norm": 0.95703125, + "learning_rate": 2e-05, + "loss": 2.5255, + "num_input_tokens_seen": 1497366528, + "step": 714 + }, + { + "epoch": 0.15, + "grad_norm": 0.9453125, + "learning_rate": 2e-05, + "loss": 2.4863, + "num_input_tokens_seen": 1499463680, + "step": 715 + }, + { + "epoch": 0.15, + "grad_norm": 1.0546875, + "learning_rate": 2e-05, + "loss": 2.5022, + "num_input_tokens_seen": 1501560832, + "step": 716 + }, + { + "epoch": 0.15, + "grad_norm": 0.98046875, + "learning_rate": 2e-05, + "loss": 2.4878, + "num_input_tokens_seen": 1503657984, + "step": 717 + }, + { + "epoch": 0.15, + "grad_norm": 1.9921875, + "learning_rate": 2e-05, + "loss": 2.4977, + "num_input_tokens_seen": 1505755136, + "step": 718 + }, + { + "epoch": 0.15, + "grad_norm": 1.3046875, + "learning_rate": 2e-05, + "loss": 2.4998, + "num_input_tokens_seen": 1507852288, + "step": 719 + }, + { + "epoch": 0.15, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 2.4922, + "num_input_tokens_seen": 1509949440, + "step": 720 + }, + { + "epoch": 0.15, + "grad_norm": 1.7421875, + "learning_rate": 2e-05, + "loss": 2.4744, + "num_input_tokens_seen": 1512046592, + "step": 721 + }, + { + "epoch": 0.15, + "grad_norm": 1.5546875, + "learning_rate": 2e-05, + "loss": 2.5217, + "num_input_tokens_seen": 1514143744, + "step": 722 + }, + { + "epoch": 0.15, + "grad_norm": 1.8359375, + "learning_rate": 2e-05, + "loss": 2.4968, + "num_input_tokens_seen": 1516240896, + "step": 723 + }, + { + "epoch": 0.15, + "grad_norm": 1.1875, + "learning_rate": 2e-05, + "loss": 2.5143, + "num_input_tokens_seen": 1518338048, + "step": 724 + }, + { + "epoch": 0.15, + "grad_norm": 1.3359375, + "learning_rate": 2e-05, + "loss": 2.4557, + "num_input_tokens_seen": 1520435200, + "step": 725 + }, + { + "epoch": 0.15, + "grad_norm": 2.0, + "learning_rate": 2e-05, + "loss": 2.4728, + "num_input_tokens_seen": 1522532352, + "step": 726 + }, + { + "epoch": 0.15, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 2.5286, + "num_input_tokens_seen": 1524629504, + "step": 727 + }, + { + "epoch": 0.15, + "grad_norm": 3.9375, + "learning_rate": 2e-05, + "loss": 2.5188, + "num_input_tokens_seen": 1526726656, + "step": 728 + }, + { + "epoch": 0.15, + "grad_norm": 3.265625, + "learning_rate": 2e-05, + "loss": 2.5145, + "num_input_tokens_seen": 1528823808, + "step": 729 + }, + { + "epoch": 0.15, + "grad_norm": 4.0625, + "learning_rate": 2e-05, + "loss": 2.4789, + "num_input_tokens_seen": 1530920960, + "step": 730 + }, + { + "epoch": 0.15, + "grad_norm": 3.9375, + "learning_rate": 2e-05, + "loss": 2.513, + "num_input_tokens_seen": 1533018112, + "step": 731 + }, + { + "epoch": 0.15, + "grad_norm": 1.375, + "learning_rate": 2e-05, + "loss": 2.503, + "num_input_tokens_seen": 1535115264, + "step": 732 + }, + { + "epoch": 0.15, + "grad_norm": 3.171875, + "learning_rate": 2e-05, + "loss": 2.493, + "num_input_tokens_seen": 1537212416, + "step": 733 + }, + { + "epoch": 0.15, + "grad_norm": 2.71875, + "learning_rate": 2e-05, + "loss": 2.5021, + "num_input_tokens_seen": 1539309568, + "step": 734 + }, + { + "epoch": 0.15, + "grad_norm": 3.25, + "learning_rate": 2e-05, + "loss": 2.4947, + "num_input_tokens_seen": 1541406720, + "step": 735 + }, + { + "epoch": 0.15, + "grad_norm": 3.0, + "learning_rate": 2e-05, + "loss": 2.4681, + "num_input_tokens_seen": 1543503872, + "step": 736 + }, + { + "epoch": 0.15, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 2.451, + "num_input_tokens_seen": 1545601024, + "step": 737 + }, + { + "epoch": 0.15, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 2.4803, + "num_input_tokens_seen": 1547698176, + "step": 738 + }, + { + "epoch": 0.15, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 2.4772, + "num_input_tokens_seen": 1549795328, + "step": 739 + }, + { + "epoch": 0.16, + "grad_norm": 1.234375, + "learning_rate": 2e-05, + "loss": 2.4969, + "num_input_tokens_seen": 1551892480, + "step": 740 + }, + { + "epoch": 0.16, + "grad_norm": 1.8984375, + "learning_rate": 2e-05, + "loss": 2.5116, + "num_input_tokens_seen": 1553989632, + "step": 741 + }, + { + "epoch": 0.16, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 2.5076, + "num_input_tokens_seen": 1556086784, + "step": 742 + }, + { + "epoch": 0.16, + "grad_norm": 1.9140625, + "learning_rate": 2e-05, + "loss": 2.4929, + "num_input_tokens_seen": 1558183936, + "step": 743 + }, + { + "epoch": 0.16, + "grad_norm": 1.6484375, + "learning_rate": 2e-05, + "loss": 2.4747, + "num_input_tokens_seen": 1560281088, + "step": 744 + }, + { + "epoch": 0.16, + "grad_norm": 2.65625, + "learning_rate": 2e-05, + "loss": 2.516, + "num_input_tokens_seen": 1562378240, + "step": 745 + }, + { + "epoch": 0.16, + "grad_norm": 2.46875, + "learning_rate": 2e-05, + "loss": 2.4961, + "num_input_tokens_seen": 1564475392, + "step": 746 + }, + { + "epoch": 0.16, + "grad_norm": 1.8515625, + "learning_rate": 2e-05, + "loss": 2.4857, + "num_input_tokens_seen": 1566572544, + "step": 747 + }, + { + "epoch": 0.16, + "grad_norm": 1.8828125, + "learning_rate": 2e-05, + "loss": 2.5108, + "num_input_tokens_seen": 1568669696, + "step": 748 + }, + { + "epoch": 0.16, + "grad_norm": 1.3359375, + "learning_rate": 2e-05, + "loss": 2.4894, + "num_input_tokens_seen": 1570766848, + "step": 749 + }, + { + "epoch": 0.16, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 2.478, + "num_input_tokens_seen": 1572864000, + "step": 750 + }, + { + "epoch": 0.16, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 2.4634, + "num_input_tokens_seen": 1574961152, + "step": 751 + }, + { + "epoch": 0.16, + "grad_norm": 1.125, + "learning_rate": 2e-05, + "loss": 2.4478, + "num_input_tokens_seen": 1577058304, + "step": 752 + }, + { + "epoch": 0.16, + "grad_norm": 1.8828125, + "learning_rate": 2e-05, + "loss": 2.4704, + "num_input_tokens_seen": 1579155456, + "step": 753 + }, + { + "epoch": 0.16, + "grad_norm": 1.859375, + "learning_rate": 2e-05, + "loss": 2.4485, + "num_input_tokens_seen": 1581252608, + "step": 754 + }, + { + "epoch": 0.16, + "grad_norm": 1.9140625, + "learning_rate": 2e-05, + "loss": 2.4761, + "num_input_tokens_seen": 1583349760, + "step": 755 + }, + { + "epoch": 0.16, + "grad_norm": 1.6484375, + "learning_rate": 2e-05, + "loss": 2.4574, + "num_input_tokens_seen": 1585446912, + "step": 756 + }, + { + "epoch": 0.16, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 2.4602, + "num_input_tokens_seen": 1587544064, + "step": 757 + }, + { + "epoch": 0.16, + "grad_norm": 2.109375, + "learning_rate": 2e-05, + "loss": 2.4939, + "num_input_tokens_seen": 1589641216, + "step": 758 + }, + { + "epoch": 0.16, + "grad_norm": 1.1328125, + "learning_rate": 2e-05, + "loss": 2.4781, + "num_input_tokens_seen": 1591738368, + "step": 759 + }, + { + "epoch": 0.16, + "grad_norm": 2.75, + "learning_rate": 2e-05, + "loss": 2.4875, + "num_input_tokens_seen": 1593835520, + "step": 760 + }, + { + "epoch": 0.16, + "grad_norm": 2.75, + "learning_rate": 2e-05, + "loss": 2.4751, + "num_input_tokens_seen": 1595932672, + "step": 761 + }, + { + "epoch": 0.16, + "grad_norm": 1.171875, + "learning_rate": 2e-05, + "loss": 2.474, + "num_input_tokens_seen": 1598029824, + "step": 762 + }, + { + "epoch": 0.16, + "grad_norm": 2.640625, + "learning_rate": 2e-05, + "loss": 2.5156, + "num_input_tokens_seen": 1600126976, + "step": 763 + }, + { + "epoch": 0.16, + "grad_norm": 2.765625, + "learning_rate": 2e-05, + "loss": 2.508, + "num_input_tokens_seen": 1602224128, + "step": 764 + }, + { + "epoch": 0.16, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 2.4859, + "num_input_tokens_seen": 1604321280, + "step": 765 + }, + { + "epoch": 0.16, + "grad_norm": 1.9609375, + "learning_rate": 2e-05, + "loss": 2.4775, + "num_input_tokens_seen": 1606418432, + "step": 766 + }, + { + "epoch": 0.16, + "grad_norm": 2.25, + "learning_rate": 2e-05, + "loss": 2.529, + "num_input_tokens_seen": 1608515584, + "step": 767 + }, + { + "epoch": 0.16, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 2.4666, + "num_input_tokens_seen": 1610612736, + "step": 768 + }, + { + "epoch": 0.16, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 2.4742, + "num_input_tokens_seen": 1612709888, + "step": 769 + }, + { + "epoch": 0.16, + "grad_norm": 1.703125, + "learning_rate": 2e-05, + "loss": 2.4831, + "num_input_tokens_seen": 1614807040, + "step": 770 + }, + { + "epoch": 0.16, + "grad_norm": 1.3671875, + "learning_rate": 2e-05, + "loss": 2.4959, + "num_input_tokens_seen": 1616904192, + "step": 771 + }, + { + "epoch": 0.16, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 2.4368, + "num_input_tokens_seen": 1619001344, + "step": 772 + }, + { + "epoch": 0.16, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 2.5079, + "num_input_tokens_seen": 1621098496, + "step": 773 + }, + { + "epoch": 0.16, + "grad_norm": 1.2578125, + "learning_rate": 2e-05, + "loss": 2.4824, + "num_input_tokens_seen": 1623195648, + "step": 774 + }, + { + "epoch": 0.16, + "grad_norm": 1.5234375, + "learning_rate": 2e-05, + "loss": 2.5034, + "num_input_tokens_seen": 1625292800, + "step": 775 + }, + { + "epoch": 0.16, + "grad_norm": 1.5234375, + "learning_rate": 2e-05, + "loss": 2.4613, + "num_input_tokens_seen": 1627389952, + "step": 776 + }, + { + "epoch": 0.16, + "grad_norm": 1.1640625, + "learning_rate": 2e-05, + "loss": 2.4565, + "num_input_tokens_seen": 1629487104, + "step": 777 + }, + { + "epoch": 0.16, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 2.4804, + "num_input_tokens_seen": 1631584256, + "step": 778 + }, + { + "epoch": 0.16, + "grad_norm": 1.5234375, + "learning_rate": 2e-05, + "loss": 2.494, + "num_input_tokens_seen": 1633681408, + "step": 779 + }, + { + "epoch": 0.16, + "grad_norm": 0.97265625, + "learning_rate": 2e-05, + "loss": 2.5035, + "num_input_tokens_seen": 1635778560, + "step": 780 + }, + { + "epoch": 0.16, + "grad_norm": 1.375, + "learning_rate": 2e-05, + "loss": 2.4932, + "num_input_tokens_seen": 1637875712, + "step": 781 + }, + { + "epoch": 0.16, + "grad_norm": 1.234375, + "learning_rate": 2e-05, + "loss": 2.4587, + "num_input_tokens_seen": 1639972864, + "step": 782 + }, + { + "epoch": 0.16, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 2.4703, + "num_input_tokens_seen": 1642070016, + "step": 783 + }, + { + "epoch": 0.16, + "grad_norm": 1.3203125, + "learning_rate": 2e-05, + "loss": 2.4613, + "num_input_tokens_seen": 1644167168, + "step": 784 + }, + { + "epoch": 0.16, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 2.4795, + "num_input_tokens_seen": 1646264320, + "step": 785 + }, + { + "epoch": 0.16, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 2.448, + "num_input_tokens_seen": 1648361472, + "step": 786 + }, + { + "epoch": 0.17, + "grad_norm": 1.0859375, + "learning_rate": 2e-05, + "loss": 2.4778, + "num_input_tokens_seen": 1650458624, + "step": 787 + }, + { + "epoch": 0.17, + "grad_norm": 1.3828125, + "learning_rate": 2e-05, + "loss": 2.421, + "num_input_tokens_seen": 1652555776, + "step": 788 + }, + { + "epoch": 0.17, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 2.4363, + "num_input_tokens_seen": 1654652928, + "step": 789 + }, + { + "epoch": 0.17, + "grad_norm": 0.9609375, + "learning_rate": 2e-05, + "loss": 2.4879, + "num_input_tokens_seen": 1656750080, + "step": 790 + }, + { + "epoch": 0.17, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 2.4929, + "num_input_tokens_seen": 1658847232, + "step": 791 + }, + { + "epoch": 0.17, + "grad_norm": 1.90625, + "learning_rate": 2e-05, + "loss": 2.4515, + "num_input_tokens_seen": 1660944384, + "step": 792 + }, + { + "epoch": 0.17, + "grad_norm": 1.078125, + "learning_rate": 2e-05, + "loss": 2.4379, + "num_input_tokens_seen": 1663041536, + "step": 793 + }, + { + "epoch": 0.17, + "grad_norm": 1.25, + "learning_rate": 2e-05, + "loss": 2.4681, + "num_input_tokens_seen": 1665138688, + "step": 794 + }, + { + "epoch": 0.17, + "grad_norm": 1.234375, + "learning_rate": 2e-05, + "loss": 2.4484, + "num_input_tokens_seen": 1667235840, + "step": 795 + }, + { + "epoch": 0.17, + "grad_norm": 0.96875, + "learning_rate": 2e-05, + "loss": 2.4251, + "num_input_tokens_seen": 1669332992, + "step": 796 + }, + { + "epoch": 0.17, + "eval_loss": 2.4695074558258057, + "eval_runtime": 2061.0517, + "eval_samples_per_second": 1.913, + "eval_steps_per_second": 0.478, + "num_input_tokens_seen": 1669332992, + "step": 796 + }, + { + "epoch": 0.17, + "grad_norm": 1.0078125, + "learning_rate": 2e-05, + "loss": 2.4705, + "num_input_tokens_seen": 1671430144, + "step": 797 + }, + { + "epoch": 0.17, + "grad_norm": 0.94921875, + "learning_rate": 2e-05, + "loss": 2.4855, + "num_input_tokens_seen": 1673527296, + "step": 798 + }, + { + "epoch": 0.17, + "grad_norm": 1.0703125, + "learning_rate": 2e-05, + "loss": 2.4348, + "num_input_tokens_seen": 1675624448, + "step": 799 + }, + { + "epoch": 0.17, + "grad_norm": 1.1484375, + "learning_rate": 2e-05, + "loss": 2.4627, + "num_input_tokens_seen": 1677721600, + "step": 800 + }, + { + "epoch": 0.17, + "grad_norm": 0.95703125, + "learning_rate": 2e-05, + "loss": 2.4761, + "num_input_tokens_seen": 1679818752, + "step": 801 + }, + { + "epoch": 0.17, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.4697, + "num_input_tokens_seen": 1681915904, + "step": 802 + }, + { + "epoch": 0.17, + "grad_norm": 1.2109375, + "learning_rate": 2e-05, + "loss": 2.4944, + "num_input_tokens_seen": 1684013056, + "step": 803 + }, + { + "epoch": 0.17, + "grad_norm": 1.0078125, + "learning_rate": 2e-05, + "loss": 2.4437, + "num_input_tokens_seen": 1686110208, + "step": 804 + }, + { + "epoch": 0.17, + "grad_norm": 1.1328125, + "learning_rate": 2e-05, + "loss": 2.4706, + "num_input_tokens_seen": 1688207360, + "step": 805 + }, + { + "epoch": 0.17, + "grad_norm": 0.98828125, + "learning_rate": 2e-05, + "loss": 2.4807, + "num_input_tokens_seen": 1690304512, + "step": 806 + }, + { + "epoch": 0.17, + "grad_norm": 1.25, + "learning_rate": 2e-05, + "loss": 2.453, + "num_input_tokens_seen": 1692401664, + "step": 807 + }, + { + "epoch": 0.17, + "grad_norm": 1.0703125, + "learning_rate": 2e-05, + "loss": 2.4425, + "num_input_tokens_seen": 1694498816, + "step": 808 + }, + { + "epoch": 0.17, + "grad_norm": 1.140625, + "learning_rate": 2e-05, + "loss": 2.4742, + "num_input_tokens_seen": 1696595968, + "step": 809 + }, + { + "epoch": 0.17, + "grad_norm": 1.078125, + "learning_rate": 2e-05, + "loss": 2.4811, + "num_input_tokens_seen": 1698693120, + "step": 810 + }, + { + "epoch": 0.17, + "grad_norm": 1.0546875, + "learning_rate": 2e-05, + "loss": 2.4935, + "num_input_tokens_seen": 1700790272, + "step": 811 + }, + { + "epoch": 0.17, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.4464, + "num_input_tokens_seen": 1702887424, + "step": 812 + }, + { + "epoch": 0.17, + "grad_norm": 0.9375, + "learning_rate": 2e-05, + "loss": 2.4123, + "num_input_tokens_seen": 1704984576, + "step": 813 + }, + { + "epoch": 0.17, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.4494, + "num_input_tokens_seen": 1707081728, + "step": 814 + }, + { + "epoch": 0.17, + "grad_norm": 1.078125, + "learning_rate": 2e-05, + "loss": 2.4583, + "num_input_tokens_seen": 1709178880, + "step": 815 + }, + { + "epoch": 0.17, + "grad_norm": 0.87890625, + "learning_rate": 2e-05, + "loss": 2.4519, + "num_input_tokens_seen": 1711276032, + "step": 816 + }, + { + "epoch": 0.17, + "grad_norm": 1.1875, + "learning_rate": 2e-05, + "loss": 2.4436, + "num_input_tokens_seen": 1713373184, + "step": 817 + }, + { + "epoch": 0.17, + "grad_norm": 0.9921875, + "learning_rate": 2e-05, + "loss": 2.4592, + "num_input_tokens_seen": 1715470336, + "step": 818 + }, + { + "epoch": 0.17, + "grad_norm": 1.0546875, + "learning_rate": 2e-05, + "loss": 2.4919, + "num_input_tokens_seen": 1717567488, + "step": 819 + }, + { + "epoch": 0.17, + "grad_norm": 1.109375, + "learning_rate": 2e-05, + "loss": 2.4772, + "num_input_tokens_seen": 1719664640, + "step": 820 + }, + { + "epoch": 0.17, + "grad_norm": 0.94921875, + "learning_rate": 2e-05, + "loss": 2.4854, + "num_input_tokens_seen": 1721761792, + "step": 821 + }, + { + "epoch": 0.17, + "grad_norm": 0.95703125, + "learning_rate": 2e-05, + "loss": 2.4528, + "num_input_tokens_seen": 1723858944, + "step": 822 + }, + { + "epoch": 0.17, + "grad_norm": 0.9921875, + "learning_rate": 2e-05, + "loss": 2.4591, + "num_input_tokens_seen": 1725956096, + "step": 823 + }, + { + "epoch": 0.17, + "grad_norm": 0.9609375, + "learning_rate": 2e-05, + "loss": 2.4214, + "num_input_tokens_seen": 1728053248, + "step": 824 + }, + { + "epoch": 0.17, + "grad_norm": 1.140625, + "learning_rate": 2e-05, + "loss": 2.4467, + "num_input_tokens_seen": 1730150400, + "step": 825 + }, + { + "epoch": 0.17, + "grad_norm": 0.94921875, + "learning_rate": 2e-05, + "loss": 2.4653, + "num_input_tokens_seen": 1732247552, + "step": 826 + }, + { + "epoch": 0.17, + "grad_norm": 1.1875, + "learning_rate": 2e-05, + "loss": 2.4252, + "num_input_tokens_seen": 1734344704, + "step": 827 + }, + { + "epoch": 0.17, + "grad_norm": 1.1015625, + "learning_rate": 2e-05, + "loss": 2.4554, + "num_input_tokens_seen": 1736441856, + "step": 828 + }, + { + "epoch": 0.17, + "grad_norm": 1.0546875, + "learning_rate": 2e-05, + "loss": 2.4544, + "num_input_tokens_seen": 1738539008, + "step": 829 + }, + { + "epoch": 0.17, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.4676, + "num_input_tokens_seen": 1740636160, + "step": 830 + }, + { + "epoch": 0.17, + "grad_norm": 1.0234375, + "learning_rate": 2e-05, + "loss": 2.4643, + "num_input_tokens_seen": 1742733312, + "step": 831 + }, + { + "epoch": 0.17, + "grad_norm": 0.98046875, + "learning_rate": 2e-05, + "loss": 2.4789, + "num_input_tokens_seen": 1744830464, + "step": 832 + }, + { + "epoch": 0.17, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.4576, + "num_input_tokens_seen": 1746927616, + "step": 833 + }, + { + "epoch": 0.17, + "grad_norm": 0.96484375, + "learning_rate": 2e-05, + "loss": 2.4416, + "num_input_tokens_seen": 1749024768, + "step": 834 + }, + { + "epoch": 0.18, + "grad_norm": 1.09375, + "learning_rate": 2e-05, + "loss": 2.48, + "num_input_tokens_seen": 1751121920, + "step": 835 + }, + { + "epoch": 0.18, + "grad_norm": 0.92578125, + "learning_rate": 2e-05, + "loss": 2.4802, + "num_input_tokens_seen": 1753219072, + "step": 836 + }, + { + "epoch": 0.18, + "grad_norm": 1.25, + "learning_rate": 2e-05, + "loss": 2.4611, + "num_input_tokens_seen": 1755316224, + "step": 837 + }, + { + "epoch": 0.18, + "grad_norm": 0.98046875, + "learning_rate": 2e-05, + "loss": 2.472, + "num_input_tokens_seen": 1757413376, + "step": 838 + }, + { + "epoch": 0.18, + "grad_norm": 1.1640625, + "learning_rate": 2e-05, + "loss": 2.492, + "num_input_tokens_seen": 1759510528, + "step": 839 + }, + { + "epoch": 0.18, + "grad_norm": 0.9140625, + "learning_rate": 2e-05, + "loss": 2.4486, + "num_input_tokens_seen": 1761607680, + "step": 840 + }, + { + "epoch": 0.18, + "grad_norm": 0.9609375, + "learning_rate": 2e-05, + "loss": 2.452, + "num_input_tokens_seen": 1763704832, + "step": 841 + }, + { + "epoch": 0.18, + "grad_norm": 0.953125, + "learning_rate": 2e-05, + "loss": 2.4545, + "num_input_tokens_seen": 1765801984, + "step": 842 + }, + { + "epoch": 0.18, + "grad_norm": 1.046875, + "learning_rate": 2e-05, + "loss": 2.4087, + "num_input_tokens_seen": 1767899136, + "step": 843 + }, + { + "epoch": 0.18, + "grad_norm": 0.87890625, + "learning_rate": 2e-05, + "loss": 2.4595, + "num_input_tokens_seen": 1769996288, + "step": 844 + }, + { + "epoch": 0.18, + "grad_norm": 1.1015625, + "learning_rate": 2e-05, + "loss": 2.452, + "num_input_tokens_seen": 1772093440, + "step": 845 + }, + { + "epoch": 0.18, + "grad_norm": 1.0546875, + "learning_rate": 2e-05, + "loss": 2.4693, + "num_input_tokens_seen": 1774190592, + "step": 846 + }, + { + "epoch": 0.18, + "grad_norm": 0.91015625, + "learning_rate": 2e-05, + "loss": 2.4573, + "num_input_tokens_seen": 1776287744, + "step": 847 + }, + { + "epoch": 0.18, + "grad_norm": 0.95703125, + "learning_rate": 2e-05, + "loss": 2.4381, + "num_input_tokens_seen": 1778384896, + "step": 848 + }, + { + "epoch": 0.18, + "grad_norm": 0.859375, + "learning_rate": 2e-05, + "loss": 2.4512, + "num_input_tokens_seen": 1780482048, + "step": 849 + }, + { + "epoch": 0.18, + "grad_norm": 1.0625, + "learning_rate": 2e-05, + "loss": 2.4829, + "num_input_tokens_seen": 1782579200, + "step": 850 + }, + { + "epoch": 0.18, + "grad_norm": 0.7890625, + "learning_rate": 2e-05, + "loss": 2.453, + "num_input_tokens_seen": 1784676352, + "step": 851 + }, + { + "epoch": 0.18, + "grad_norm": 1.3125, + "learning_rate": 2e-05, + "loss": 2.4334, + "num_input_tokens_seen": 1786773504, + "step": 852 + }, + { + "epoch": 0.18, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 2.4499, + "num_input_tokens_seen": 1788870656, + "step": 853 + }, + { + "epoch": 0.18, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.46, + "num_input_tokens_seen": 1790967808, + "step": 854 + }, + { + "epoch": 0.18, + "grad_norm": 1.2890625, + "learning_rate": 2e-05, + "loss": 2.4843, + "num_input_tokens_seen": 1793064960, + "step": 855 + }, + { + "epoch": 0.18, + "grad_norm": 1.1015625, + "learning_rate": 2e-05, + "loss": 2.4248, + "num_input_tokens_seen": 1795162112, + "step": 856 + }, + { + "epoch": 0.18, + "grad_norm": 1.03125, + "learning_rate": 2e-05, + "loss": 2.4235, + "num_input_tokens_seen": 1797259264, + "step": 857 + }, + { + "epoch": 0.18, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.4795, + "num_input_tokens_seen": 1799356416, + "step": 858 + }, + { + "epoch": 0.18, + "grad_norm": 0.8515625, + "learning_rate": 2e-05, + "loss": 2.4658, + "num_input_tokens_seen": 1801453568, + "step": 859 + }, + { + "epoch": 0.18, + "grad_norm": 1.1796875, + "learning_rate": 2e-05, + "loss": 2.4507, + "num_input_tokens_seen": 1803550720, + "step": 860 + }, + { + "epoch": 0.18, + "grad_norm": 1.0703125, + "learning_rate": 2e-05, + "loss": 2.4535, + "num_input_tokens_seen": 1805647872, + "step": 861 + }, + { + "epoch": 0.18, + "grad_norm": 0.98828125, + "learning_rate": 2e-05, + "loss": 2.457, + "num_input_tokens_seen": 1807745024, + "step": 862 + }, + { + "epoch": 0.18, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 2.4409, + "num_input_tokens_seen": 1809842176, + "step": 863 + }, + { + "epoch": 0.18, + "grad_norm": 1.3125, + "learning_rate": 2e-05, + "loss": 2.4444, + "num_input_tokens_seen": 1811939328, + "step": 864 + }, + { + "epoch": 0.18, + "grad_norm": 1.046875, + "learning_rate": 2e-05, + "loss": 2.4728, + "num_input_tokens_seen": 1814036480, + "step": 865 + }, + { + "epoch": 0.18, + "grad_norm": 1.2421875, + "learning_rate": 2e-05, + "loss": 2.4656, + "num_input_tokens_seen": 1816133632, + "step": 866 + }, + { + "epoch": 0.18, + "grad_norm": 1.234375, + "learning_rate": 2e-05, + "loss": 2.4547, + "num_input_tokens_seen": 1818230784, + "step": 867 + }, + { + "epoch": 0.18, + "grad_norm": 0.77734375, + "learning_rate": 2e-05, + "loss": 2.4452, + "num_input_tokens_seen": 1820327936, + "step": 868 + }, + { + "epoch": 0.18, + "grad_norm": 1.2578125, + "learning_rate": 2e-05, + "loss": 2.4749, + "num_input_tokens_seen": 1822425088, + "step": 869 + }, + { + "epoch": 0.18, + "grad_norm": 0.9609375, + "learning_rate": 2e-05, + "loss": 2.4474, + "num_input_tokens_seen": 1824522240, + "step": 870 + }, + { + "epoch": 0.18, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.4634, + "num_input_tokens_seen": 1826619392, + "step": 871 + }, + { + "epoch": 0.18, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.4355, + "num_input_tokens_seen": 1828716544, + "step": 872 + }, + { + "epoch": 0.18, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.4704, + "num_input_tokens_seen": 1830813696, + "step": 873 + }, + { + "epoch": 0.18, + "grad_norm": 1.0703125, + "learning_rate": 2e-05, + "loss": 2.4355, + "num_input_tokens_seen": 1832910848, + "step": 874 + }, + { + "epoch": 0.18, + "grad_norm": 1.21875, + "learning_rate": 2e-05, + "loss": 2.4125, + "num_input_tokens_seen": 1835008000, + "step": 875 + }, + { + "epoch": 0.18, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.4341, + "num_input_tokens_seen": 1837105152, + "step": 876 + }, + { + "epoch": 0.18, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 2.4112, + "num_input_tokens_seen": 1839202304, + "step": 877 + }, + { + "epoch": 0.18, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 2.442, + "num_input_tokens_seen": 1841299456, + "step": 878 + }, + { + "epoch": 0.18, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.4334, + "num_input_tokens_seen": 1843396608, + "step": 879 + }, + { + "epoch": 0.18, + "grad_norm": 0.9765625, + "learning_rate": 2e-05, + "loss": 2.4545, + "num_input_tokens_seen": 1845493760, + "step": 880 + }, + { + "epoch": 0.18, + "grad_norm": 1.1796875, + "learning_rate": 2e-05, + "loss": 2.4182, + "num_input_tokens_seen": 1847590912, + "step": 881 + }, + { + "epoch": 0.18, + "grad_norm": 1.171875, + "learning_rate": 2e-05, + "loss": 2.4012, + "num_input_tokens_seen": 1849688064, + "step": 882 + }, + { + "epoch": 0.19, + "grad_norm": 0.9453125, + "learning_rate": 2e-05, + "loss": 2.4413, + "num_input_tokens_seen": 1851785216, + "step": 883 + }, + { + "epoch": 0.19, + "grad_norm": 1.171875, + "learning_rate": 2e-05, + "loss": 2.4417, + "num_input_tokens_seen": 1853882368, + "step": 884 + }, + { + "epoch": 0.19, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.4323, + "num_input_tokens_seen": 1855979520, + "step": 885 + }, + { + "epoch": 0.19, + "grad_norm": 1.1328125, + "learning_rate": 2e-05, + "loss": 2.4185, + "num_input_tokens_seen": 1858076672, + "step": 886 + }, + { + "epoch": 0.19, + "grad_norm": 0.9296875, + "learning_rate": 2e-05, + "loss": 2.4407, + "num_input_tokens_seen": 1860173824, + "step": 887 + }, + { + "epoch": 0.19, + "grad_norm": 0.87109375, + "learning_rate": 2e-05, + "loss": 2.4701, + "num_input_tokens_seen": 1862270976, + "step": 888 + }, + { + "epoch": 0.19, + "grad_norm": 1.1015625, + "learning_rate": 2e-05, + "loss": 2.4231, + "num_input_tokens_seen": 1864368128, + "step": 889 + }, + { + "epoch": 0.19, + "grad_norm": 0.90234375, + "learning_rate": 2e-05, + "loss": 2.4478, + "num_input_tokens_seen": 1866465280, + "step": 890 + }, + { + "epoch": 0.19, + "grad_norm": 0.94140625, + "learning_rate": 2e-05, + "loss": 2.4009, + "num_input_tokens_seen": 1868562432, + "step": 891 + }, + { + "epoch": 0.19, + "grad_norm": 1.2421875, + "learning_rate": 2e-05, + "loss": 2.4281, + "num_input_tokens_seen": 1870659584, + "step": 892 + }, + { + "epoch": 0.19, + "grad_norm": 1.09375, + "learning_rate": 2e-05, + "loss": 2.4311, + "num_input_tokens_seen": 1872756736, + "step": 893 + }, + { + "epoch": 0.19, + "grad_norm": 1.1328125, + "learning_rate": 2e-05, + "loss": 2.4418, + "num_input_tokens_seen": 1874853888, + "step": 894 + }, + { + "epoch": 0.19, + "grad_norm": 1.1953125, + "learning_rate": 2e-05, + "loss": 2.4156, + "num_input_tokens_seen": 1876951040, + "step": 895 + }, + { + "epoch": 0.19, + "grad_norm": 1.1484375, + "learning_rate": 2e-05, + "loss": 2.4254, + "num_input_tokens_seen": 1879048192, + "step": 896 + }, + { + "epoch": 0.19, + "grad_norm": 0.86328125, + "learning_rate": 2e-05, + "loss": 2.4424, + "num_input_tokens_seen": 1881145344, + "step": 897 + }, + { + "epoch": 0.19, + "grad_norm": 1.140625, + "learning_rate": 2e-05, + "loss": 2.4737, + "num_input_tokens_seen": 1883242496, + "step": 898 + }, + { + "epoch": 0.19, + "grad_norm": 0.90625, + "learning_rate": 2e-05, + "loss": 2.4374, + "num_input_tokens_seen": 1885339648, + "step": 899 + }, + { + "epoch": 0.19, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.409, + "num_input_tokens_seen": 1887436800, + "step": 900 + }, + { + "epoch": 0.0, + "grad_norm": 0.953125, + "learning_rate": 2e-05, + "loss": 2.4298, + "num_input_tokens_seen": 1889533952, + "step": 901 + }, + { + "epoch": 0.0, + "grad_norm": 0.875, + "learning_rate": 2e-05, + "loss": 2.4122, + "num_input_tokens_seen": 1891631104, + "step": 902 + }, + { + "epoch": 0.0, + "grad_norm": 0.9296875, + "learning_rate": 2e-05, + "loss": 2.4203, + "num_input_tokens_seen": 1893728256, + "step": 903 + }, + { + "epoch": 0.0, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.4853, + "num_input_tokens_seen": 1895825408, + "step": 904 + }, + { + "epoch": 0.0, + "grad_norm": 0.8515625, + "learning_rate": 2e-05, + "loss": 2.4303, + "num_input_tokens_seen": 1897922560, + "step": 905 + }, + { + "epoch": 0.0, + "grad_norm": 0.96875, + "learning_rate": 2e-05, + "loss": 2.4303, + "num_input_tokens_seen": 1900019712, + "step": 906 + }, + { + "epoch": 0.0, + "grad_norm": 1.296875, + "learning_rate": 2e-05, + "loss": 2.4605, + "num_input_tokens_seen": 1902116864, + "step": 907 + }, + { + "epoch": 0.0, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.4464, + "num_input_tokens_seen": 1904214016, + "step": 908 + }, + { + "epoch": 0.0, + "grad_norm": 1.2421875, + "learning_rate": 2e-05, + "loss": 2.4673, + "num_input_tokens_seen": 1906311168, + "step": 909 + }, + { + "epoch": 0.0, + "grad_norm": 1.0625, + "learning_rate": 2e-05, + "loss": 2.4465, + "num_input_tokens_seen": 1908408320, + "step": 910 + }, + { + "epoch": 0.0, + "grad_norm": 0.91796875, + "learning_rate": 2e-05, + "loss": 2.4285, + "num_input_tokens_seen": 1910505472, + "step": 911 + }, + { + "epoch": 0.0, + "grad_norm": 0.9609375, + "learning_rate": 2e-05, + "loss": 2.4267, + "num_input_tokens_seen": 1912602624, + "step": 912 + }, + { + "epoch": 0.0, + "grad_norm": 0.7890625, + "learning_rate": 2e-05, + "loss": 2.4029, + "num_input_tokens_seen": 1914699776, + "step": 913 + }, + { + "epoch": 0.0, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.4503, + "num_input_tokens_seen": 1916796928, + "step": 914 + }, + { + "epoch": 0.0, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.4261, + "num_input_tokens_seen": 1918894080, + "step": 915 + }, + { + "epoch": 0.0, + "grad_norm": 0.94921875, + "learning_rate": 2e-05, + "loss": 2.4751, + "num_input_tokens_seen": 1920991232, + "step": 916 + }, + { + "epoch": 0.0, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.4031, + "num_input_tokens_seen": 1923088384, + "step": 917 + }, + { + "epoch": 0.0, + "grad_norm": 0.890625, + "learning_rate": 2e-05, + "loss": 2.4036, + "num_input_tokens_seen": 1925185536, + "step": 918 + }, + { + "epoch": 0.0, + "grad_norm": 1.0625, + "learning_rate": 2e-05, + "loss": 2.4391, + "num_input_tokens_seen": 1927282688, + "step": 919 + }, + { + "epoch": 0.0, + "grad_norm": 0.9609375, + "learning_rate": 2e-05, + "loss": 2.4295, + "num_input_tokens_seen": 1929379840, + "step": 920 + }, + { + "epoch": 0.0, + "grad_norm": 0.94140625, + "learning_rate": 2e-05, + "loss": 2.4197, + "num_input_tokens_seen": 1931476992, + "step": 921 + }, + { + "epoch": 0.0, + "grad_norm": 0.92578125, + "learning_rate": 2e-05, + "loss": 2.4067, + "num_input_tokens_seen": 1933574144, + "step": 922 + }, + { + "epoch": 0.0, + "grad_norm": 0.875, + "learning_rate": 2e-05, + "loss": 2.4255, + "num_input_tokens_seen": 1935671296, + "step": 923 + }, + { + "epoch": 0.01, + "grad_norm": 0.8125, + "learning_rate": 2e-05, + "loss": 2.421, + "num_input_tokens_seen": 1937768448, + "step": 924 + }, + { + "epoch": 0.01, + "grad_norm": 0.96484375, + "learning_rate": 2e-05, + "loss": 2.4383, + "num_input_tokens_seen": 1939865600, + "step": 925 + }, + { + "epoch": 0.01, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.3856, + "num_input_tokens_seen": 1941962752, + "step": 926 + }, + { + "epoch": 0.01, + "grad_norm": 0.875, + "learning_rate": 2e-05, + "loss": 2.4388, + "num_input_tokens_seen": 1944059904, + "step": 927 + }, + { + "epoch": 0.01, + "grad_norm": 1.296875, + "learning_rate": 2e-05, + "loss": 2.4253, + "num_input_tokens_seen": 1946157056, + "step": 928 + }, + { + "epoch": 0.01, + "grad_norm": 1.171875, + "learning_rate": 2e-05, + "loss": 2.4118, + "num_input_tokens_seen": 1948254208, + "step": 929 + }, + { + "epoch": 0.01, + "grad_norm": 0.875, + "learning_rate": 2e-05, + "loss": 2.4621, + "num_input_tokens_seen": 1950351360, + "step": 930 + }, + { + "epoch": 0.01, + "grad_norm": 1.09375, + "learning_rate": 2e-05, + "loss": 2.4353, + "num_input_tokens_seen": 1952448512, + "step": 931 + }, + { + "epoch": 0.01, + "grad_norm": 0.83984375, + "learning_rate": 2e-05, + "loss": 2.4226, + "num_input_tokens_seen": 1954545664, + "step": 932 + }, + { + "epoch": 0.01, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.4451, + "num_input_tokens_seen": 1956642816, + "step": 933 + }, + { + "epoch": 0.01, + "grad_norm": 0.87109375, + "learning_rate": 2e-05, + "loss": 2.4228, + "num_input_tokens_seen": 1958739968, + "step": 934 + }, + { + "epoch": 0.01, + "grad_norm": 0.75390625, + "learning_rate": 2e-05, + "loss": 2.433, + "num_input_tokens_seen": 1960837120, + "step": 935 + }, + { + "epoch": 0.01, + "grad_norm": 0.77734375, + "learning_rate": 2e-05, + "loss": 2.4262, + "num_input_tokens_seen": 1962934272, + "step": 936 + }, + { + "epoch": 0.01, + "grad_norm": 0.93359375, + "learning_rate": 2e-05, + "loss": 2.4121, + "num_input_tokens_seen": 1965031424, + "step": 937 + }, + { + "epoch": 0.01, + "grad_norm": 0.7890625, + "learning_rate": 2e-05, + "loss": 2.449, + "num_input_tokens_seen": 1967128576, + "step": 938 + }, + { + "epoch": 0.01, + "grad_norm": 0.89453125, + "learning_rate": 2e-05, + "loss": 2.4098, + "num_input_tokens_seen": 1969225728, + "step": 939 + }, + { + "epoch": 0.01, + "grad_norm": 0.7578125, + "learning_rate": 2e-05, + "loss": 2.4141, + "num_input_tokens_seen": 1971322880, + "step": 940 + }, + { + "epoch": 0.01, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.4295, + "num_input_tokens_seen": 1973420032, + "step": 941 + }, + { + "epoch": 0.01, + "grad_norm": 0.92578125, + "learning_rate": 2e-05, + "loss": 2.4309, + "num_input_tokens_seen": 1975517184, + "step": 942 + }, + { + "epoch": 0.01, + "grad_norm": 0.9296875, + "learning_rate": 2e-05, + "loss": 2.4348, + "num_input_tokens_seen": 1977614336, + "step": 943 + }, + { + "epoch": 0.01, + "grad_norm": 0.9453125, + "learning_rate": 2e-05, + "loss": 2.4063, + "num_input_tokens_seen": 1979711488, + "step": 944 + }, + { + "epoch": 0.01, + "grad_norm": 1.09375, + "learning_rate": 2e-05, + "loss": 2.4389, + "num_input_tokens_seen": 1981808640, + "step": 945 + }, + { + "epoch": 0.01, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.404, + "num_input_tokens_seen": 1983905792, + "step": 946 + }, + { + "epoch": 0.01, + "grad_norm": 1.25, + "learning_rate": 2e-05, + "loss": 2.4343, + "num_input_tokens_seen": 1986002944, + "step": 947 + }, + { + "epoch": 0.01, + "grad_norm": 0.96484375, + "learning_rate": 2e-05, + "loss": 2.4231, + "num_input_tokens_seen": 1988100096, + "step": 948 + }, + { + "epoch": 0.01, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.4037, + "num_input_tokens_seen": 1990197248, + "step": 949 + }, + { + "epoch": 0.01, + "grad_norm": 1.109375, + "learning_rate": 2e-05, + "loss": 2.4282, + "num_input_tokens_seen": 1992294400, + "step": 950 + }, + { + "epoch": 0.01, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.3873, + "num_input_tokens_seen": 1994391552, + "step": 951 + }, + { + "epoch": 0.01, + "grad_norm": 1.0546875, + "learning_rate": 2e-05, + "loss": 2.4168, + "num_input_tokens_seen": 1996488704, + "step": 952 + }, + { + "epoch": 0.01, + "grad_norm": 0.95703125, + "learning_rate": 2e-05, + "loss": 2.379, + "num_input_tokens_seen": 1998585856, + "step": 953 + }, + { + "epoch": 0.01, + "grad_norm": 0.91015625, + "learning_rate": 2e-05, + "loss": 2.4274, + "num_input_tokens_seen": 2000683008, + "step": 954 + }, + { + "epoch": 0.01, + "grad_norm": 1.1015625, + "learning_rate": 2e-05, + "loss": 2.4256, + "num_input_tokens_seen": 2002780160, + "step": 955 + }, + { + "epoch": 0.01, + "grad_norm": 0.8671875, + "learning_rate": 2e-05, + "loss": 2.3978, + "num_input_tokens_seen": 2004877312, + "step": 956 + }, + { + "epoch": 0.01, + "grad_norm": 1.1796875, + "learning_rate": 2e-05, + "loss": 2.436, + "num_input_tokens_seen": 2006974464, + "step": 957 + }, + { + "epoch": 0.01, + "grad_norm": 0.93359375, + "learning_rate": 2e-05, + "loss": 2.4034, + "num_input_tokens_seen": 2009071616, + "step": 958 + }, + { + "epoch": 0.01, + "grad_norm": 0.8203125, + "learning_rate": 2e-05, + "loss": 2.4084, + "num_input_tokens_seen": 2011168768, + "step": 959 + }, + { + "epoch": 0.01, + "grad_norm": 0.8671875, + "learning_rate": 2e-05, + "loss": 2.4185, + "num_input_tokens_seen": 2013265920, + "step": 960 + }, + { + "epoch": 0.01, + "grad_norm": 0.875, + "learning_rate": 2e-05, + "loss": 2.4243, + "num_input_tokens_seen": 2015363072, + "step": 961 + }, + { + "epoch": 0.01, + "grad_norm": 0.89453125, + "learning_rate": 2e-05, + "loss": 2.4175, + "num_input_tokens_seen": 2017460224, + "step": 962 + }, + { + "epoch": 0.01, + "grad_norm": 0.85546875, + "learning_rate": 2e-05, + "loss": 2.4205, + "num_input_tokens_seen": 2019557376, + "step": 963 + }, + { + "epoch": 0.01, + "grad_norm": 0.8984375, + "learning_rate": 2e-05, + "loss": 2.4379, + "num_input_tokens_seen": 2021654528, + "step": 964 + }, + { + "epoch": 0.01, + "grad_norm": 0.8984375, + "learning_rate": 2e-05, + "loss": 2.3875, + "num_input_tokens_seen": 2023751680, + "step": 965 + }, + { + "epoch": 0.01, + "grad_norm": 0.91015625, + "learning_rate": 2e-05, + "loss": 2.3922, + "num_input_tokens_seen": 2025848832, + "step": 966 + }, + { + "epoch": 0.01, + "grad_norm": 0.9140625, + "learning_rate": 2e-05, + "loss": 2.3979, + "num_input_tokens_seen": 2027945984, + "step": 967 + }, + { + "epoch": 0.01, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.4118, + "num_input_tokens_seen": 2030043136, + "step": 968 + }, + { + "epoch": 0.01, + "grad_norm": 0.890625, + "learning_rate": 2e-05, + "loss": 2.4504, + "num_input_tokens_seen": 2032140288, + "step": 969 + }, + { + "epoch": 0.01, + "grad_norm": 0.8671875, + "learning_rate": 2e-05, + "loss": 2.4206, + "num_input_tokens_seen": 2034237440, + "step": 970 + }, + { + "epoch": 0.01, + "grad_norm": 0.93359375, + "learning_rate": 2e-05, + "loss": 2.4255, + "num_input_tokens_seen": 2036334592, + "step": 971 + }, + { + "epoch": 0.02, + "grad_norm": 0.77734375, + "learning_rate": 2e-05, + "loss": 2.4186, + "num_input_tokens_seen": 2038431744, + "step": 972 + }, + { + "epoch": 0.02, + "grad_norm": 0.890625, + "learning_rate": 2e-05, + "loss": 2.4145, + "num_input_tokens_seen": 2040528896, + "step": 973 + }, + { + "epoch": 0.02, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.4348, + "num_input_tokens_seen": 2042626048, + "step": 974 + }, + { + "epoch": 0.02, + "grad_norm": 0.83984375, + "learning_rate": 2e-05, + "loss": 2.4064, + "num_input_tokens_seen": 2044723200, + "step": 975 + }, + { + "epoch": 0.02, + "grad_norm": 0.828125, + "learning_rate": 2e-05, + "loss": 2.4116, + "num_input_tokens_seen": 2046820352, + "step": 976 + }, + { + "epoch": 0.02, + "grad_norm": 0.7890625, + "learning_rate": 2e-05, + "loss": 2.4249, + "num_input_tokens_seen": 2048917504, + "step": 977 + }, + { + "epoch": 0.02, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.4131, + "num_input_tokens_seen": 2051014656, + "step": 978 + }, + { + "epoch": 0.02, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.4183, + "num_input_tokens_seen": 2053111808, + "step": 979 + }, + { + "epoch": 0.02, + "grad_norm": 0.87890625, + "learning_rate": 2e-05, + "loss": 2.4368, + "num_input_tokens_seen": 2055208960, + "step": 980 + }, + { + "epoch": 0.02, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.3713, + "num_input_tokens_seen": 2057306112, + "step": 981 + }, + { + "epoch": 0.02, + "grad_norm": 0.828125, + "learning_rate": 2e-05, + "loss": 2.4042, + "num_input_tokens_seen": 2059403264, + "step": 982 + }, + { + "epoch": 0.02, + "grad_norm": 0.921875, + "learning_rate": 2e-05, + "loss": 2.4134, + "num_input_tokens_seen": 2061500416, + "step": 983 + }, + { + "epoch": 0.02, + "grad_norm": 0.8515625, + "learning_rate": 2e-05, + "loss": 2.4265, + "num_input_tokens_seen": 2063597568, + "step": 984 + }, + { + "epoch": 0.02, + "grad_norm": 1.1328125, + "learning_rate": 2e-05, + "loss": 2.4184, + "num_input_tokens_seen": 2065694720, + "step": 985 + }, + { + "epoch": 0.02, + "grad_norm": 0.94140625, + "learning_rate": 2e-05, + "loss": 2.4253, + "num_input_tokens_seen": 2067791872, + "step": 986 + }, + { + "epoch": 0.02, + "grad_norm": 1.0703125, + "learning_rate": 2e-05, + "loss": 2.3763, + "num_input_tokens_seen": 2069889024, + "step": 987 + }, + { + "epoch": 0.02, + "grad_norm": 1.1171875, + "learning_rate": 2e-05, + "loss": 2.4447, + "num_input_tokens_seen": 2071986176, + "step": 988 + }, + { + "epoch": 0.02, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.4417, + "num_input_tokens_seen": 2074083328, + "step": 989 + }, + { + "epoch": 0.02, + "grad_norm": 1.203125, + "learning_rate": 2e-05, + "loss": 2.3975, + "num_input_tokens_seen": 2076180480, + "step": 990 + }, + { + "epoch": 0.02, + "grad_norm": 1.0703125, + "learning_rate": 2e-05, + "loss": 2.4077, + "num_input_tokens_seen": 2078277632, + "step": 991 + }, + { + "epoch": 0.02, + "grad_norm": 0.875, + "learning_rate": 2e-05, + "loss": 2.4241, + "num_input_tokens_seen": 2080374784, + "step": 992 + }, + { + "epoch": 0.02, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 2.4241, + "num_input_tokens_seen": 2082471936, + "step": 993 + }, + { + "epoch": 0.02, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 2.4238, + "num_input_tokens_seen": 2084569088, + "step": 994 + }, + { + "epoch": 0.02, + "grad_norm": 0.953125, + "learning_rate": 2e-05, + "loss": 2.3792, + "num_input_tokens_seen": 2086666240, + "step": 995 + }, + { + "epoch": 0.02, + "eval_loss": 2.4099812507629395, + "eval_runtime": 2026.4024, + "eval_samples_per_second": 1.945, + "eval_steps_per_second": 0.487, + "num_input_tokens_seen": 2086666240, + "step": 995 + }, + { + "epoch": 0.02, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 2.3921, + "num_input_tokens_seen": 2088763392, + "step": 996 + }, + { + "epoch": 0.02, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 2.3957, + "num_input_tokens_seen": 2090860544, + "step": 997 + }, + { + "epoch": 0.02, + "grad_norm": 1.078125, + "learning_rate": 2e-05, + "loss": 2.3755, + "num_input_tokens_seen": 2092957696, + "step": 998 + }, + { + "epoch": 0.02, + "grad_norm": 1.1328125, + "learning_rate": 2e-05, + "loss": 2.406, + "num_input_tokens_seen": 2095054848, + "step": 999 + }, + { + "epoch": 0.02, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.3788, + "num_input_tokens_seen": 2097152000, + "step": 1000 + }, + { + "epoch": 0.02, + "grad_norm": 0.8515625, + "learning_rate": 2e-05, + "loss": 2.395, + "num_input_tokens_seen": 2099249152, + "step": 1001 + }, + { + "epoch": 0.02, + "grad_norm": 0.83984375, + "learning_rate": 2e-05, + "loss": 2.3892, + "num_input_tokens_seen": 2101346304, + "step": 1002 + }, + { + "epoch": 0.02, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.3921, + "num_input_tokens_seen": 2103443456, + "step": 1003 + }, + { + "epoch": 0.02, + "grad_norm": 0.8984375, + "learning_rate": 2e-05, + "loss": 2.4217, + "num_input_tokens_seen": 2105540608, + "step": 1004 + }, + { + "epoch": 0.02, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.418, + "num_input_tokens_seen": 2107637760, + "step": 1005 + }, + { + "epoch": 0.02, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.4269, + "num_input_tokens_seen": 2109734912, + "step": 1006 + }, + { + "epoch": 0.02, + "grad_norm": 0.94140625, + "learning_rate": 2e-05, + "loss": 2.4026, + "num_input_tokens_seen": 2111832064, + "step": 1007 + }, + { + "epoch": 0.02, + "grad_norm": 0.9375, + "learning_rate": 2e-05, + "loss": 2.3926, + "num_input_tokens_seen": 2113929216, + "step": 1008 + }, + { + "epoch": 0.02, + "grad_norm": 1.2109375, + "learning_rate": 2e-05, + "loss": 2.4017, + "num_input_tokens_seen": 2116026368, + "step": 1009 + }, + { + "epoch": 0.02, + "grad_norm": 0.98046875, + "learning_rate": 2e-05, + "loss": 2.4085, + "num_input_tokens_seen": 2118123520, + "step": 1010 + }, + { + "epoch": 0.02, + "grad_norm": 1.0078125, + "learning_rate": 2e-05, + "loss": 2.4296, + "num_input_tokens_seen": 2120220672, + "step": 1011 + }, + { + "epoch": 0.02, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 2.3999, + "num_input_tokens_seen": 2122317824, + "step": 1012 + }, + { + "epoch": 0.02, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.4103, + "num_input_tokens_seen": 2124414976, + "step": 1013 + }, + { + "epoch": 0.02, + "grad_norm": 1.25, + "learning_rate": 2e-05, + "loss": 2.379, + "num_input_tokens_seen": 2126512128, + "step": 1014 + }, + { + "epoch": 0.02, + "grad_norm": 1.78125, + "learning_rate": 2e-05, + "loss": 2.4024, + "num_input_tokens_seen": 2128609280, + "step": 1015 + }, + { + "epoch": 0.02, + "grad_norm": 1.9453125, + "learning_rate": 2e-05, + "loss": 2.4079, + "num_input_tokens_seen": 2130706432, + "step": 1016 + }, + { + "epoch": 0.02, + "grad_norm": 1.078125, + "learning_rate": 2e-05, + "loss": 2.396, + "num_input_tokens_seen": 2132803584, + "step": 1017 + }, + { + "epoch": 0.02, + "grad_norm": 1.125, + "learning_rate": 2e-05, + "loss": 2.4098, + "num_input_tokens_seen": 2134900736, + "step": 1018 + }, + { + "epoch": 0.02, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 2.4249, + "num_input_tokens_seen": 2136997888, + "step": 1019 + }, + { + "epoch": 0.03, + "grad_norm": 1.1796875, + "learning_rate": 2e-05, + "loss": 2.4094, + "num_input_tokens_seen": 2139095040, + "step": 1020 + }, + { + "epoch": 0.03, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 2.4284, + "num_input_tokens_seen": 2141192192, + "step": 1021 + }, + { + "epoch": 0.03, + "grad_norm": 1.1953125, + "learning_rate": 2e-05, + "loss": 2.3914, + "num_input_tokens_seen": 2143289344, + "step": 1022 + }, + { + "epoch": 0.03, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.4305, + "num_input_tokens_seen": 2145386496, + "step": 1023 + }, + { + "epoch": 0.03, + "grad_norm": 7.34375, + "learning_rate": 2e-05, + "loss": 2.4282, + "num_input_tokens_seen": 2147483648, + "step": 1024 + }, + { + "epoch": 0.03, + "grad_norm": 2.203125, + "learning_rate": 2e-05, + "loss": 2.3861, + "num_input_tokens_seen": 2149580800, + "step": 1025 + }, + { + "epoch": 0.03, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 2.3983, + "num_input_tokens_seen": 2151677952, + "step": 1026 + }, + { + "epoch": 0.03, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 2.4085, + "num_input_tokens_seen": 2153775104, + "step": 1027 + }, + { + "epoch": 0.03, + "grad_norm": 1.140625, + "learning_rate": 2e-05, + "loss": 2.3947, + "num_input_tokens_seen": 2155872256, + "step": 1028 + }, + { + "epoch": 0.03, + "grad_norm": 2.0625, + "learning_rate": 2e-05, + "loss": 2.3926, + "num_input_tokens_seen": 2157969408, + "step": 1029 + }, + { + "epoch": 0.03, + "grad_norm": 1.3671875, + "learning_rate": 2e-05, + "loss": 2.407, + "num_input_tokens_seen": 2160066560, + "step": 1030 + }, + { + "epoch": 0.03, + "grad_norm": 2.8125, + "learning_rate": 2e-05, + "loss": 2.3723, + "num_input_tokens_seen": 2162163712, + "step": 1031 + }, + { + "epoch": 0.03, + "grad_norm": 2.75, + "learning_rate": 2e-05, + "loss": 2.394, + "num_input_tokens_seen": 2164260864, + "step": 1032 + }, + { + "epoch": 0.03, + "grad_norm": 2.8125, + "learning_rate": 2e-05, + "loss": 2.4265, + "num_input_tokens_seen": 2166358016, + "step": 1033 + }, + { + "epoch": 0.03, + "grad_norm": 2.765625, + "learning_rate": 2e-05, + "loss": 2.4274, + "num_input_tokens_seen": 2168455168, + "step": 1034 + }, + { + "epoch": 0.03, + "grad_norm": 1.078125, + "learning_rate": 2e-05, + "loss": 2.4115, + "num_input_tokens_seen": 2170552320, + "step": 1035 + }, + { + "epoch": 0.03, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 2.409, + "num_input_tokens_seen": 2172649472, + "step": 1036 + }, + { + "epoch": 0.03, + "grad_norm": 1.484375, + "learning_rate": 2e-05, + "loss": 2.4012, + "num_input_tokens_seen": 2174746624, + "step": 1037 + }, + { + "epoch": 0.03, + "grad_norm": 1.2421875, + "learning_rate": 2e-05, + "loss": 2.3897, + "num_input_tokens_seen": 2176843776, + "step": 1038 + }, + { + "epoch": 0.03, + "grad_norm": 1.1953125, + "learning_rate": 2e-05, + "loss": 2.3675, + "num_input_tokens_seen": 2178940928, + "step": 1039 + }, + { + "epoch": 0.03, + "grad_norm": 1.3203125, + "learning_rate": 2e-05, + "loss": 2.3848, + "num_input_tokens_seen": 2181038080, + "step": 1040 + }, + { + "epoch": 0.03, + "grad_norm": 1.234375, + "learning_rate": 2e-05, + "loss": 2.4051, + "num_input_tokens_seen": 2183135232, + "step": 1041 + }, + { + "epoch": 0.03, + "grad_norm": 1.1640625, + "learning_rate": 2e-05, + "loss": 2.3864, + "num_input_tokens_seen": 2185232384, + "step": 1042 + }, + { + "epoch": 0.03, + "grad_norm": 1.2890625, + "learning_rate": 2e-05, + "loss": 2.4005, + "num_input_tokens_seen": 2187329536, + "step": 1043 + }, + { + "epoch": 0.03, + "grad_norm": 1.03125, + "learning_rate": 2e-05, + "loss": 2.4049, + "num_input_tokens_seen": 2189426688, + "step": 1044 + }, + { + "epoch": 0.03, + "grad_norm": 1.3046875, + "learning_rate": 2e-05, + "loss": 2.378, + "num_input_tokens_seen": 2191523840, + "step": 1045 + }, + { + "epoch": 0.03, + "grad_norm": 0.9765625, + "learning_rate": 2e-05, + "loss": 2.3973, + "num_input_tokens_seen": 2193620992, + "step": 1046 + }, + { + "epoch": 0.03, + "grad_norm": 1.3125, + "learning_rate": 2e-05, + "loss": 2.3868, + "num_input_tokens_seen": 2195718144, + "step": 1047 + }, + { + "epoch": 0.03, + "grad_norm": 0.9375, + "learning_rate": 2e-05, + "loss": 2.4234, + "num_input_tokens_seen": 2197815296, + "step": 1048 + }, + { + "epoch": 0.03, + "grad_norm": 1.234375, + "learning_rate": 2e-05, + "loss": 2.3843, + "num_input_tokens_seen": 2199912448, + "step": 1049 + }, + { + "epoch": 0.03, + "grad_norm": 1.046875, + "learning_rate": 2e-05, + "loss": 2.4011, + "num_input_tokens_seen": 2202009600, + "step": 1050 + }, + { + "epoch": 0.03, + "grad_norm": 1.1328125, + "learning_rate": 2e-05, + "loss": 2.3929, + "num_input_tokens_seen": 2204106752, + "step": 1051 + }, + { + "epoch": 0.03, + "grad_norm": 1.0703125, + "learning_rate": 2e-05, + "loss": 2.411, + "num_input_tokens_seen": 2206203904, + "step": 1052 + }, + { + "epoch": 0.03, + "grad_norm": 1.078125, + "learning_rate": 2e-05, + "loss": 2.3944, + "num_input_tokens_seen": 2208301056, + "step": 1053 + }, + { + "epoch": 0.03, + "grad_norm": 0.9296875, + "learning_rate": 2e-05, + "loss": 2.3961, + "num_input_tokens_seen": 2210398208, + "step": 1054 + }, + { + "epoch": 0.03, + "grad_norm": 1.1953125, + "learning_rate": 2e-05, + "loss": 2.4035, + "num_input_tokens_seen": 2212495360, + "step": 1055 + }, + { + "epoch": 0.03, + "grad_norm": 0.9140625, + "learning_rate": 2e-05, + "loss": 2.3996, + "num_input_tokens_seen": 2214592512, + "step": 1056 + }, + { + "epoch": 0.03, + "grad_norm": 1.4296875, + "learning_rate": 2e-05, + "loss": 2.3795, + "num_input_tokens_seen": 2216689664, + "step": 1057 + }, + { + "epoch": 0.03, + "grad_norm": 1.2421875, + "learning_rate": 2e-05, + "loss": 2.4011, + "num_input_tokens_seen": 2218786816, + "step": 1058 + }, + { + "epoch": 0.03, + "grad_norm": 0.97265625, + "learning_rate": 2e-05, + "loss": 2.4035, + "num_input_tokens_seen": 2220883968, + "step": 1059 + }, + { + "epoch": 0.03, + "grad_norm": 1.109375, + "learning_rate": 2e-05, + "loss": 2.3736, + "num_input_tokens_seen": 2222981120, + "step": 1060 + }, + { + "epoch": 0.03, + "grad_norm": 0.9453125, + "learning_rate": 2e-05, + "loss": 2.3902, + "num_input_tokens_seen": 2225078272, + "step": 1061 + }, + { + "epoch": 0.03, + "grad_norm": 1.0859375, + "learning_rate": 2e-05, + "loss": 2.4029, + "num_input_tokens_seen": 2227175424, + "step": 1062 + }, + { + "epoch": 0.03, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.3827, + "num_input_tokens_seen": 2229272576, + "step": 1063 + }, + { + "epoch": 0.03, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.3872, + "num_input_tokens_seen": 2231369728, + "step": 1064 + }, + { + "epoch": 0.03, + "grad_norm": 1.046875, + "learning_rate": 2e-05, + "loss": 2.4143, + "num_input_tokens_seen": 2233466880, + "step": 1065 + }, + { + "epoch": 0.03, + "grad_norm": 0.91796875, + "learning_rate": 2e-05, + "loss": 2.4009, + "num_input_tokens_seen": 2235564032, + "step": 1066 + }, + { + "epoch": 0.04, + "grad_norm": 0.94921875, + "learning_rate": 2e-05, + "loss": 2.4237, + "num_input_tokens_seen": 2237661184, + "step": 1067 + }, + { + "epoch": 0.04, + "grad_norm": 1.0546875, + "learning_rate": 2e-05, + "loss": 2.4082, + "num_input_tokens_seen": 2239758336, + "step": 1068 + }, + { + "epoch": 0.04, + "grad_norm": 0.93359375, + "learning_rate": 2e-05, + "loss": 2.3918, + "num_input_tokens_seen": 2241855488, + "step": 1069 + }, + { + "epoch": 0.04, + "grad_norm": 0.90234375, + "learning_rate": 2e-05, + "loss": 2.3951, + "num_input_tokens_seen": 2243952640, + "step": 1070 + }, + { + "epoch": 0.04, + "grad_norm": 0.87890625, + "learning_rate": 2e-05, + "loss": 2.4266, + "num_input_tokens_seen": 2246049792, + "step": 1071 + }, + { + "epoch": 0.04, + "grad_norm": 1.03125, + "learning_rate": 2e-05, + "loss": 2.3887, + "num_input_tokens_seen": 2248146944, + "step": 1072 + }, + { + "epoch": 0.04, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.407, + "num_input_tokens_seen": 2250244096, + "step": 1073 + }, + { + "epoch": 0.04, + "grad_norm": 0.7734375, + "learning_rate": 2e-05, + "loss": 2.4165, + "num_input_tokens_seen": 2252341248, + "step": 1074 + }, + { + "epoch": 0.04, + "grad_norm": 1.3203125, + "learning_rate": 2e-05, + "loss": 2.4115, + "num_input_tokens_seen": 2254438400, + "step": 1075 + }, + { + "epoch": 0.04, + "grad_norm": 1.1015625, + "learning_rate": 2e-05, + "loss": 2.4064, + "num_input_tokens_seen": 2256535552, + "step": 1076 + }, + { + "epoch": 0.04, + "grad_norm": 0.98046875, + "learning_rate": 2e-05, + "loss": 2.4055, + "num_input_tokens_seen": 2258632704, + "step": 1077 + }, + { + "epoch": 0.04, + "grad_norm": 1.0234375, + "learning_rate": 2e-05, + "loss": 2.3839, + "num_input_tokens_seen": 2260729856, + "step": 1078 + }, + { + "epoch": 0.04, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 2.3689, + "num_input_tokens_seen": 2262827008, + "step": 1079 + }, + { + "epoch": 0.04, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.3904, + "num_input_tokens_seen": 2264924160, + "step": 1080 + }, + { + "epoch": 0.04, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.3937, + "num_input_tokens_seen": 2267021312, + "step": 1081 + }, + { + "epoch": 0.04, + "grad_norm": 0.8125, + "learning_rate": 2e-05, + "loss": 2.394, + "num_input_tokens_seen": 2269118464, + "step": 1082 + }, + { + "epoch": 0.04, + "grad_norm": 0.83984375, + "learning_rate": 2e-05, + "loss": 2.3519, + "num_input_tokens_seen": 2271215616, + "step": 1083 + }, + { + "epoch": 0.04, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.4015, + "num_input_tokens_seen": 2273312768, + "step": 1084 + }, + { + "epoch": 0.04, + "grad_norm": 0.8515625, + "learning_rate": 2e-05, + "loss": 2.4102, + "num_input_tokens_seen": 2275409920, + "step": 1085 + }, + { + "epoch": 0.04, + "grad_norm": 0.8515625, + "learning_rate": 2e-05, + "loss": 2.3982, + "num_input_tokens_seen": 2277507072, + "step": 1086 + }, + { + "epoch": 0.04, + "grad_norm": 0.98828125, + "learning_rate": 2e-05, + "loss": 2.3755, + "num_input_tokens_seen": 2279604224, + "step": 1087 + }, + { + "epoch": 0.04, + "grad_norm": 0.703125, + "learning_rate": 2e-05, + "loss": 2.3891, + "num_input_tokens_seen": 2281701376, + "step": 1088 + }, + { + "epoch": 0.04, + "grad_norm": 0.8671875, + "learning_rate": 2e-05, + "loss": 2.3752, + "num_input_tokens_seen": 2283798528, + "step": 1089 + }, + { + "epoch": 0.04, + "grad_norm": 0.8203125, + "learning_rate": 2e-05, + "loss": 2.4111, + "num_input_tokens_seen": 2285895680, + "step": 1090 + }, + { + "epoch": 0.04, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.3601, + "num_input_tokens_seen": 2287992832, + "step": 1091 + }, + { + "epoch": 0.04, + "grad_norm": 0.7734375, + "learning_rate": 2e-05, + "loss": 2.3638, + "num_input_tokens_seen": 2290089984, + "step": 1092 + }, + { + "epoch": 0.04, + "grad_norm": 0.68359375, + "learning_rate": 2e-05, + "loss": 2.3908, + "num_input_tokens_seen": 2292187136, + "step": 1093 + }, + { + "epoch": 0.04, + "grad_norm": 0.75390625, + "learning_rate": 2e-05, + "loss": 2.3776, + "num_input_tokens_seen": 2294284288, + "step": 1094 + }, + { + "epoch": 0.04, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.3821, + "num_input_tokens_seen": 2296381440, + "step": 1095 + }, + { + "epoch": 0.04, + "grad_norm": 0.6953125, + "learning_rate": 2e-05, + "loss": 2.3815, + "num_input_tokens_seen": 2298478592, + "step": 1096 + }, + { + "epoch": 0.04, + "grad_norm": 0.7109375, + "learning_rate": 2e-05, + "loss": 2.3876, + "num_input_tokens_seen": 2300575744, + "step": 1097 + }, + { + "epoch": 0.04, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.365, + "num_input_tokens_seen": 2302672896, + "step": 1098 + }, + { + "epoch": 0.04, + "grad_norm": 0.6953125, + "learning_rate": 2e-05, + "loss": 2.3973, + "num_input_tokens_seen": 2304770048, + "step": 1099 + }, + { + "epoch": 0.04, + "grad_norm": 0.73046875, + "learning_rate": 2e-05, + "loss": 2.4167, + "num_input_tokens_seen": 2306867200, + "step": 1100 + }, + { + "epoch": 0.04, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.3971, + "num_input_tokens_seen": 2308964352, + "step": 1101 + }, + { + "epoch": 0.04, + "grad_norm": 0.8125, + "learning_rate": 2e-05, + "loss": 2.3951, + "num_input_tokens_seen": 2311061504, + "step": 1102 + }, + { + "epoch": 0.04, + "grad_norm": 0.6953125, + "learning_rate": 2e-05, + "loss": 2.3836, + "num_input_tokens_seen": 2313158656, + "step": 1103 + }, + { + "epoch": 0.04, + "grad_norm": 0.87890625, + "learning_rate": 2e-05, + "loss": 2.4029, + "num_input_tokens_seen": 2315255808, + "step": 1104 + }, + { + "epoch": 0.04, + "grad_norm": 0.7265625, + "learning_rate": 2e-05, + "loss": 2.3726, + "num_input_tokens_seen": 2317352960, + "step": 1105 + }, + { + "epoch": 0.04, + "grad_norm": 0.8671875, + "learning_rate": 2e-05, + "loss": 2.3544, + "num_input_tokens_seen": 2319450112, + "step": 1106 + }, + { + "epoch": 0.04, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.4069, + "num_input_tokens_seen": 2321547264, + "step": 1107 + }, + { + "epoch": 0.04, + "grad_norm": 0.8515625, + "learning_rate": 2e-05, + "loss": 2.3749, + "num_input_tokens_seen": 2323644416, + "step": 1108 + }, + { + "epoch": 0.04, + "grad_norm": 0.91015625, + "learning_rate": 2e-05, + "loss": 2.372, + "num_input_tokens_seen": 2325741568, + "step": 1109 + }, + { + "epoch": 0.04, + "grad_norm": 0.7890625, + "learning_rate": 2e-05, + "loss": 2.381, + "num_input_tokens_seen": 2327838720, + "step": 1110 + }, + { + "epoch": 0.04, + "grad_norm": 0.9140625, + "learning_rate": 2e-05, + "loss": 2.3766, + "num_input_tokens_seen": 2329935872, + "step": 1111 + }, + { + "epoch": 0.04, + "grad_norm": 0.74609375, + "learning_rate": 2e-05, + "loss": 2.4038, + "num_input_tokens_seen": 2332033024, + "step": 1112 + }, + { + "epoch": 0.04, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.3697, + "num_input_tokens_seen": 2334130176, + "step": 1113 + }, + { + "epoch": 0.04, + "grad_norm": 0.796875, + "learning_rate": 2e-05, + "loss": 2.3886, + "num_input_tokens_seen": 2336227328, + "step": 1114 + }, + { + "epoch": 0.05, + "grad_norm": 0.89453125, + "learning_rate": 2e-05, + "loss": 2.405, + "num_input_tokens_seen": 2338324480, + "step": 1115 + }, + { + "epoch": 0.05, + "grad_norm": 0.921875, + "learning_rate": 2e-05, + "loss": 2.364, + "num_input_tokens_seen": 2340421632, + "step": 1116 + }, + { + "epoch": 0.05, + "grad_norm": 0.69921875, + "learning_rate": 2e-05, + "loss": 2.3912, + "num_input_tokens_seen": 2342518784, + "step": 1117 + }, + { + "epoch": 0.05, + "grad_norm": 1.3046875, + "learning_rate": 2e-05, + "loss": 2.4048, + "num_input_tokens_seen": 2344615936, + "step": 1118 + }, + { + "epoch": 0.05, + "grad_norm": 0.96484375, + "learning_rate": 2e-05, + "loss": 2.382, + "num_input_tokens_seen": 2346713088, + "step": 1119 + }, + { + "epoch": 0.05, + "grad_norm": 0.90625, + "learning_rate": 2e-05, + "loss": 2.3883, + "num_input_tokens_seen": 2348810240, + "step": 1120 + }, + { + "epoch": 0.05, + "grad_norm": 1.2109375, + "learning_rate": 2e-05, + "loss": 2.3781, + "num_input_tokens_seen": 2350907392, + "step": 1121 + }, + { + "epoch": 0.05, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.3828, + "num_input_tokens_seen": 2353004544, + "step": 1122 + }, + { + "epoch": 0.05, + "grad_norm": 1.2421875, + "learning_rate": 2e-05, + "loss": 2.3987, + "num_input_tokens_seen": 2355101696, + "step": 1123 + }, + { + "epoch": 0.05, + "grad_norm": 1.078125, + "learning_rate": 2e-05, + "loss": 2.3944, + "num_input_tokens_seen": 2357198848, + "step": 1124 + }, + { + "epoch": 0.05, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.4032, + "num_input_tokens_seen": 2359296000, + "step": 1125 + }, + { + "epoch": 0.05, + "grad_norm": 1.0625, + "learning_rate": 2e-05, + "loss": 2.3996, + "num_input_tokens_seen": 2361393152, + "step": 1126 + }, + { + "epoch": 0.05, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.3727, + "num_input_tokens_seen": 2363490304, + "step": 1127 + }, + { + "epoch": 0.05, + "grad_norm": 0.97265625, + "learning_rate": 2e-05, + "loss": 2.3563, + "num_input_tokens_seen": 2365587456, + "step": 1128 + }, + { + "epoch": 0.05, + "grad_norm": 0.87890625, + "learning_rate": 2e-05, + "loss": 2.3712, + "num_input_tokens_seen": 2367684608, + "step": 1129 + }, + { + "epoch": 0.05, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.3919, + "num_input_tokens_seen": 2369781760, + "step": 1130 + }, + { + "epoch": 0.05, + "grad_norm": 0.7890625, + "learning_rate": 2e-05, + "loss": 2.3306, + "num_input_tokens_seen": 2371878912, + "step": 1131 + }, + { + "epoch": 0.05, + "grad_norm": 0.8671875, + "learning_rate": 2e-05, + "loss": 2.3813, + "num_input_tokens_seen": 2373976064, + "step": 1132 + }, + { + "epoch": 0.05, + "grad_norm": 0.67578125, + "learning_rate": 2e-05, + "loss": 2.3796, + "num_input_tokens_seen": 2376073216, + "step": 1133 + }, + { + "epoch": 0.05, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 2.3817, + "num_input_tokens_seen": 2378170368, + "step": 1134 + }, + { + "epoch": 0.05, + "grad_norm": 0.73046875, + "learning_rate": 2e-05, + "loss": 2.3592, + "num_input_tokens_seen": 2380267520, + "step": 1135 + }, + { + "epoch": 0.05, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.3894, + "num_input_tokens_seen": 2382364672, + "step": 1136 + }, + { + "epoch": 0.05, + "grad_norm": 0.73828125, + "learning_rate": 2e-05, + "loss": 2.3929, + "num_input_tokens_seen": 2384461824, + "step": 1137 + }, + { + "epoch": 0.05, + "grad_norm": 0.734375, + "learning_rate": 2e-05, + "loss": 2.37, + "num_input_tokens_seen": 2386558976, + "step": 1138 + }, + { + "epoch": 0.05, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.355, + "num_input_tokens_seen": 2388656128, + "step": 1139 + }, + { + "epoch": 0.05, + "grad_norm": 0.72265625, + "learning_rate": 2e-05, + "loss": 2.389, + "num_input_tokens_seen": 2390753280, + "step": 1140 + }, + { + "epoch": 0.05, + "grad_norm": 0.70703125, + "learning_rate": 2e-05, + "loss": 2.378, + "num_input_tokens_seen": 2392850432, + "step": 1141 + }, + { + "epoch": 0.05, + "grad_norm": 0.6875, + "learning_rate": 2e-05, + "loss": 2.3726, + "num_input_tokens_seen": 2394947584, + "step": 1142 + }, + { + "epoch": 0.05, + "grad_norm": 0.6875, + "learning_rate": 2e-05, + "loss": 2.3594, + "num_input_tokens_seen": 2397044736, + "step": 1143 + }, + { + "epoch": 0.05, + "grad_norm": 0.7109375, + "learning_rate": 2e-05, + "loss": 2.3947, + "num_input_tokens_seen": 2399141888, + "step": 1144 + }, + { + "epoch": 0.05, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.3654, + "num_input_tokens_seen": 2401239040, + "step": 1145 + }, + { + "epoch": 0.05, + "grad_norm": 0.828125, + "learning_rate": 2e-05, + "loss": 2.3599, + "num_input_tokens_seen": 2403336192, + "step": 1146 + }, + { + "epoch": 0.05, + "grad_norm": 0.73046875, + "learning_rate": 2e-05, + "loss": 2.3896, + "num_input_tokens_seen": 2405433344, + "step": 1147 + }, + { + "epoch": 0.05, + "grad_norm": 1.2578125, + "learning_rate": 2e-05, + "loss": 2.3827, + "num_input_tokens_seen": 2407530496, + "step": 1148 + }, + { + "epoch": 0.05, + "grad_norm": 0.859375, + "learning_rate": 2e-05, + "loss": 2.3571, + "num_input_tokens_seen": 2409627648, + "step": 1149 + }, + { + "epoch": 0.05, + "grad_norm": 0.69140625, + "learning_rate": 2e-05, + "loss": 2.3918, + "num_input_tokens_seen": 2411724800, + "step": 1150 + }, + { + "epoch": 0.05, + "grad_norm": 0.84375, + "learning_rate": 2e-05, + "loss": 2.3673, + "num_input_tokens_seen": 2413821952, + "step": 1151 + }, + { + "epoch": 0.05, + "grad_norm": 0.68359375, + "learning_rate": 2e-05, + "loss": 2.3667, + "num_input_tokens_seen": 2415919104, + "step": 1152 + }, + { + "epoch": 0.05, + "grad_norm": 0.71484375, + "learning_rate": 2e-05, + "loss": 2.3982, + "num_input_tokens_seen": 2418016256, + "step": 1153 + }, + { + "epoch": 0.05, + "grad_norm": 0.70703125, + "learning_rate": 2e-05, + "loss": 2.375, + "num_input_tokens_seen": 2420113408, + "step": 1154 + }, + { + "epoch": 0.05, + "grad_norm": 0.7109375, + "learning_rate": 2e-05, + "loss": 2.3779, + "num_input_tokens_seen": 2422210560, + "step": 1155 + }, + { + "epoch": 0.05, + "grad_norm": 0.74609375, + "learning_rate": 2e-05, + "loss": 2.3876, + "num_input_tokens_seen": 2424307712, + "step": 1156 + }, + { + "epoch": 0.05, + "grad_norm": 0.6640625, + "learning_rate": 2e-05, + "loss": 2.3711, + "num_input_tokens_seen": 2426404864, + "step": 1157 + }, + { + "epoch": 0.05, + "grad_norm": 0.6953125, + "learning_rate": 2e-05, + "loss": 2.3751, + "num_input_tokens_seen": 2428502016, + "step": 1158 + }, + { + "epoch": 0.05, + "grad_norm": 0.671875, + "learning_rate": 2e-05, + "loss": 2.3844, + "num_input_tokens_seen": 2430599168, + "step": 1159 + }, + { + "epoch": 0.05, + "grad_norm": 0.734375, + "learning_rate": 2e-05, + "loss": 2.3802, + "num_input_tokens_seen": 2432696320, + "step": 1160 + }, + { + "epoch": 0.05, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.3819, + "num_input_tokens_seen": 2434793472, + "step": 1161 + }, + { + "epoch": 0.05, + "grad_norm": 0.7734375, + "learning_rate": 2e-05, + "loss": 2.3521, + "num_input_tokens_seen": 2436890624, + "step": 1162 + }, + { + "epoch": 0.06, + "grad_norm": 0.75, + "learning_rate": 2e-05, + "loss": 2.3467, + "num_input_tokens_seen": 2438987776, + "step": 1163 + }, + { + "epoch": 0.06, + "grad_norm": 0.65625, + "learning_rate": 2e-05, + "loss": 2.3812, + "num_input_tokens_seen": 2441084928, + "step": 1164 + }, + { + "epoch": 0.06, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.3569, + "num_input_tokens_seen": 2443182080, + "step": 1165 + }, + { + "epoch": 0.06, + "grad_norm": 0.6796875, + "learning_rate": 2e-05, + "loss": 2.37, + "num_input_tokens_seen": 2445279232, + "step": 1166 + }, + { + "epoch": 0.06, + "grad_norm": 0.6171875, + "learning_rate": 2e-05, + "loss": 2.3787, + "num_input_tokens_seen": 2447376384, + "step": 1167 + }, + { + "epoch": 0.06, + "grad_norm": 0.70703125, + "learning_rate": 2e-05, + "loss": 2.3899, + "num_input_tokens_seen": 2449473536, + "step": 1168 + }, + { + "epoch": 0.06, + "grad_norm": 0.63671875, + "learning_rate": 2e-05, + "loss": 2.3714, + "num_input_tokens_seen": 2451570688, + "step": 1169 + }, + { + "epoch": 0.06, + "grad_norm": 0.7109375, + "learning_rate": 2e-05, + "loss": 2.4156, + "num_input_tokens_seen": 2453667840, + "step": 1170 + }, + { + "epoch": 0.06, + "grad_norm": 0.73828125, + "learning_rate": 2e-05, + "loss": 2.3558, + "num_input_tokens_seen": 2455764992, + "step": 1171 + }, + { + "epoch": 0.06, + "grad_norm": 0.703125, + "learning_rate": 2e-05, + "loss": 2.4273, + "num_input_tokens_seen": 2457862144, + "step": 1172 + }, + { + "epoch": 0.06, + "grad_norm": 0.71875, + "learning_rate": 2e-05, + "loss": 2.3684, + "num_input_tokens_seen": 2459959296, + "step": 1173 + }, + { + "epoch": 0.06, + "grad_norm": 0.7734375, + "learning_rate": 2e-05, + "loss": 2.3827, + "num_input_tokens_seen": 2462056448, + "step": 1174 + }, + { + "epoch": 0.06, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.362, + "num_input_tokens_seen": 2464153600, + "step": 1175 + }, + { + "epoch": 0.06, + "grad_norm": 0.74609375, + "learning_rate": 2e-05, + "loss": 2.3605, + "num_input_tokens_seen": 2466250752, + "step": 1176 + }, + { + "epoch": 0.06, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.3563, + "num_input_tokens_seen": 2468347904, + "step": 1177 + }, + { + "epoch": 0.06, + "grad_norm": 0.72265625, + "learning_rate": 2e-05, + "loss": 2.3817, + "num_input_tokens_seen": 2470445056, + "step": 1178 + }, + { + "epoch": 0.06, + "grad_norm": 0.796875, + "learning_rate": 2e-05, + "loss": 2.4126, + "num_input_tokens_seen": 2472542208, + "step": 1179 + }, + { + "epoch": 0.06, + "grad_norm": 0.6953125, + "learning_rate": 2e-05, + "loss": 2.3548, + "num_input_tokens_seen": 2474639360, + "step": 1180 + }, + { + "epoch": 0.06, + "grad_norm": 0.73828125, + "learning_rate": 2e-05, + "loss": 2.3785, + "num_input_tokens_seen": 2476736512, + "step": 1181 + }, + { + "epoch": 0.06, + "grad_norm": 0.859375, + "learning_rate": 2e-05, + "loss": 2.3426, + "num_input_tokens_seen": 2478833664, + "step": 1182 + }, + { + "epoch": 0.06, + "grad_norm": 0.796875, + "learning_rate": 2e-05, + "loss": 2.3601, + "num_input_tokens_seen": 2480930816, + "step": 1183 + }, + { + "epoch": 0.06, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.4139, + "num_input_tokens_seen": 2483027968, + "step": 1184 + }, + { + "epoch": 0.06, + "grad_norm": 0.8203125, + "learning_rate": 2e-05, + "loss": 2.35, + "num_input_tokens_seen": 2485125120, + "step": 1185 + }, + { + "epoch": 0.06, + "grad_norm": 0.73046875, + "learning_rate": 2e-05, + "loss": 2.3875, + "num_input_tokens_seen": 2487222272, + "step": 1186 + }, + { + "epoch": 0.06, + "grad_norm": 1.046875, + "learning_rate": 2e-05, + "loss": 2.3787, + "num_input_tokens_seen": 2489319424, + "step": 1187 + }, + { + "epoch": 0.06, + "grad_norm": 0.75, + "learning_rate": 2e-05, + "loss": 2.3997, + "num_input_tokens_seen": 2491416576, + "step": 1188 + }, + { + "epoch": 0.06, + "grad_norm": 0.921875, + "learning_rate": 2e-05, + "loss": 2.3565, + "num_input_tokens_seen": 2493513728, + "step": 1189 + }, + { + "epoch": 0.06, + "grad_norm": 0.7890625, + "learning_rate": 2e-05, + "loss": 2.3515, + "num_input_tokens_seen": 2495610880, + "step": 1190 + }, + { + "epoch": 0.06, + "grad_norm": 0.74609375, + "learning_rate": 2e-05, + "loss": 2.3808, + "num_input_tokens_seen": 2497708032, + "step": 1191 + }, + { + "epoch": 0.06, + "grad_norm": 0.87890625, + "learning_rate": 2e-05, + "loss": 2.3939, + "num_input_tokens_seen": 2499805184, + "step": 1192 + }, + { + "epoch": 0.06, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.3304, + "num_input_tokens_seen": 2501902336, + "step": 1193 + }, + { + "epoch": 0.06, + "grad_norm": 0.77734375, + "learning_rate": 2e-05, + "loss": 2.3546, + "num_input_tokens_seen": 2503999488, + "step": 1194 + }, + { + "epoch": 0.06, + "eval_loss": 2.372859001159668, + "eval_runtime": 1897.0017, + "eval_samples_per_second": 2.078, + "eval_steps_per_second": 0.52, + "num_input_tokens_seen": 2503999488, + "step": 1194 + }, + { + "epoch": 0.06, + "grad_norm": 0.70703125, + "learning_rate": 2e-05, + "loss": 2.3559, + "num_input_tokens_seen": 2506096640, + "step": 1195 + }, + { + "epoch": 0.06, + "grad_norm": 0.76953125, + "learning_rate": 2e-05, + "loss": 2.3569, + "num_input_tokens_seen": 2508193792, + "step": 1196 + }, + { + "epoch": 0.06, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 2.3959, + "num_input_tokens_seen": 2510290944, + "step": 1197 + }, + { + "epoch": 0.06, + "grad_norm": 0.734375, + "learning_rate": 2e-05, + "loss": 2.3615, + "num_input_tokens_seen": 2512388096, + "step": 1198 + }, + { + "epoch": 0.06, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.3735, + "num_input_tokens_seen": 2514485248, + "step": 1199 + }, + { + "epoch": 0.06, + "grad_norm": 0.671875, + "learning_rate": 2e-05, + "loss": 2.3342, + "num_input_tokens_seen": 2516582400, + "step": 1200 + }, + { + "epoch": 0.06, + "grad_norm": 0.85546875, + "learning_rate": 2e-05, + "loss": 2.3555, + "num_input_tokens_seen": 2518679552, + "step": 1201 + }, + { + "epoch": 0.06, + "grad_norm": 0.765625, + "learning_rate": 2e-05, + "loss": 2.3955, + "num_input_tokens_seen": 2520776704, + "step": 1202 + }, + { + "epoch": 0.06, + "grad_norm": 0.72265625, + "learning_rate": 2e-05, + "loss": 2.3726, + "num_input_tokens_seen": 2522873856, + "step": 1203 + }, + { + "epoch": 0.06, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.3716, + "num_input_tokens_seen": 2524971008, + "step": 1204 + }, + { + "epoch": 0.06, + "grad_norm": 0.7734375, + "learning_rate": 2e-05, + "loss": 2.3548, + "num_input_tokens_seen": 2527068160, + "step": 1205 + }, + { + "epoch": 0.06, + "grad_norm": 0.69140625, + "learning_rate": 2e-05, + "loss": 2.3516, + "num_input_tokens_seen": 2529165312, + "step": 1206 + }, + { + "epoch": 0.06, + "grad_norm": 0.71875, + "learning_rate": 2e-05, + "loss": 2.3967, + "num_input_tokens_seen": 2531262464, + "step": 1207 + }, + { + "epoch": 0.06, + "grad_norm": 0.75390625, + "learning_rate": 2e-05, + "loss": 2.3888, + "num_input_tokens_seen": 2533359616, + "step": 1208 + }, + { + "epoch": 0.06, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.3843, + "num_input_tokens_seen": 2535456768, + "step": 1209 + }, + { + "epoch": 0.07, + "grad_norm": 0.74609375, + "learning_rate": 2e-05, + "loss": 2.3755, + "num_input_tokens_seen": 2537553920, + "step": 1210 + }, + { + "epoch": 0.07, + "grad_norm": 0.6640625, + "learning_rate": 2e-05, + "loss": 2.3513, + "num_input_tokens_seen": 2539651072, + "step": 1211 + }, + { + "epoch": 0.07, + "grad_norm": 0.75390625, + "learning_rate": 2e-05, + "loss": 2.3526, + "num_input_tokens_seen": 2541748224, + "step": 1212 + }, + { + "epoch": 0.07, + "grad_norm": 0.671875, + "learning_rate": 2e-05, + "loss": 2.367, + "num_input_tokens_seen": 2543845376, + "step": 1213 + }, + { + "epoch": 0.07, + "grad_norm": 0.73046875, + "learning_rate": 2e-05, + "loss": 2.3739, + "num_input_tokens_seen": 2545942528, + "step": 1214 + }, + { + "epoch": 0.07, + "grad_norm": 0.66015625, + "learning_rate": 2e-05, + "loss": 2.3719, + "num_input_tokens_seen": 2548039680, + "step": 1215 + }, + { + "epoch": 0.07, + "grad_norm": 0.6796875, + "learning_rate": 2e-05, + "loss": 2.3799, + "num_input_tokens_seen": 2550136832, + "step": 1216 + }, + { + "epoch": 0.07, + "grad_norm": 0.72265625, + "learning_rate": 2e-05, + "loss": 2.3465, + "num_input_tokens_seen": 2552233984, + "step": 1217 + }, + { + "epoch": 0.07, + "grad_norm": 0.83984375, + "learning_rate": 2e-05, + "loss": 2.3854, + "num_input_tokens_seen": 2554331136, + "step": 1218 + }, + { + "epoch": 0.07, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.363, + "num_input_tokens_seen": 2556428288, + "step": 1219 + }, + { + "epoch": 0.07, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.3576, + "num_input_tokens_seen": 2558525440, + "step": 1220 + }, + { + "epoch": 0.07, + "grad_norm": 0.71875, + "learning_rate": 2e-05, + "loss": 2.3671, + "num_input_tokens_seen": 2560622592, + "step": 1221 + }, + { + "epoch": 0.07, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.3709, + "num_input_tokens_seen": 2562719744, + "step": 1222 + }, + { + "epoch": 0.07, + "grad_norm": 0.7109375, + "learning_rate": 2e-05, + "loss": 2.3506, + "num_input_tokens_seen": 2564816896, + "step": 1223 + }, + { + "epoch": 0.07, + "grad_norm": 0.72265625, + "learning_rate": 2e-05, + "loss": 2.3674, + "num_input_tokens_seen": 2566914048, + "step": 1224 + }, + { + "epoch": 0.07, + "grad_norm": 0.640625, + "learning_rate": 2e-05, + "loss": 2.3599, + "num_input_tokens_seen": 2569011200, + "step": 1225 + }, + { + "epoch": 0.07, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.3493, + "num_input_tokens_seen": 2571108352, + "step": 1226 + }, + { + "epoch": 0.07, + "grad_norm": 0.73828125, + "learning_rate": 2e-05, + "loss": 2.369, + "num_input_tokens_seen": 2573205504, + "step": 1227 + }, + { + "epoch": 0.07, + "grad_norm": 0.7109375, + "learning_rate": 2e-05, + "loss": 2.3542, + "num_input_tokens_seen": 2575302656, + "step": 1228 + }, + { + "epoch": 0.07, + "grad_norm": 0.76953125, + "learning_rate": 2e-05, + "loss": 2.3671, + "num_input_tokens_seen": 2577399808, + "step": 1229 + }, + { + "epoch": 0.07, + "grad_norm": 0.73046875, + "learning_rate": 2e-05, + "loss": 2.3604, + "num_input_tokens_seen": 2579496960, + "step": 1230 + }, + { + "epoch": 0.07, + "grad_norm": 0.7265625, + "learning_rate": 2e-05, + "loss": 2.3795, + "num_input_tokens_seen": 2581594112, + "step": 1231 + }, + { + "epoch": 0.07, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.3827, + "num_input_tokens_seen": 2583691264, + "step": 1232 + }, + { + "epoch": 0.07, + "grad_norm": 0.6875, + "learning_rate": 2e-05, + "loss": 2.3625, + "num_input_tokens_seen": 2585788416, + "step": 1233 + }, + { + "epoch": 0.07, + "grad_norm": 0.734375, + "learning_rate": 2e-05, + "loss": 2.3476, + "num_input_tokens_seen": 2587885568, + "step": 1234 + }, + { + "epoch": 0.07, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.3495, + "num_input_tokens_seen": 2589982720, + "step": 1235 + }, + { + "epoch": 0.07, + "grad_norm": 0.69921875, + "learning_rate": 2e-05, + "loss": 2.3431, + "num_input_tokens_seen": 2592079872, + "step": 1236 + }, + { + "epoch": 0.07, + "grad_norm": 0.69140625, + "learning_rate": 2e-05, + "loss": 2.3571, + "num_input_tokens_seen": 2594177024, + "step": 1237 + }, + { + "epoch": 0.07, + "grad_norm": 0.86328125, + "learning_rate": 2e-05, + "loss": 2.3629, + "num_input_tokens_seen": 2596274176, + "step": 1238 + }, + { + "epoch": 0.07, + "grad_norm": 0.73046875, + "learning_rate": 2e-05, + "loss": 2.338, + "num_input_tokens_seen": 2598371328, + "step": 1239 + }, + { + "epoch": 0.07, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.3846, + "num_input_tokens_seen": 2600468480, + "step": 1240 + }, + { + "epoch": 0.07, + "grad_norm": 0.74609375, + "learning_rate": 2e-05, + "loss": 2.3641, + "num_input_tokens_seen": 2602565632, + "step": 1241 + }, + { + "epoch": 0.07, + "grad_norm": 0.69140625, + "learning_rate": 2e-05, + "loss": 2.3304, + "num_input_tokens_seen": 2604662784, + "step": 1242 + }, + { + "epoch": 0.07, + "grad_norm": 0.7734375, + "learning_rate": 2e-05, + "loss": 2.3442, + "num_input_tokens_seen": 2606759936, + "step": 1243 + }, + { + "epoch": 0.07, + "grad_norm": 0.71875, + "learning_rate": 2e-05, + "loss": 2.3699, + "num_input_tokens_seen": 2608857088, + "step": 1244 + }, + { + "epoch": 0.07, + "grad_norm": 0.6484375, + "learning_rate": 2e-05, + "loss": 2.3471, + "num_input_tokens_seen": 2610954240, + "step": 1245 + }, + { + "epoch": 0.07, + "grad_norm": 0.7265625, + "learning_rate": 2e-05, + "loss": 2.353, + "num_input_tokens_seen": 2613051392, + "step": 1246 + }, + { + "epoch": 0.07, + "grad_norm": 0.78515625, + "learning_rate": 2e-05, + "loss": 2.3584, + "num_input_tokens_seen": 2615148544, + "step": 1247 + }, + { + "epoch": 0.07, + "grad_norm": 0.6796875, + "learning_rate": 2e-05, + "loss": 2.368, + "num_input_tokens_seen": 2617245696, + "step": 1248 + }, + { + "epoch": 0.07, + "grad_norm": 0.7734375, + "learning_rate": 2e-05, + "loss": 2.3467, + "num_input_tokens_seen": 2619342848, + "step": 1249 + }, + { + "epoch": 0.07, + "grad_norm": 0.69921875, + "learning_rate": 2e-05, + "loss": 2.357, + "num_input_tokens_seen": 2621440000, + "step": 1250 + }, + { + "epoch": 0.07, + "grad_norm": 1.7890625, + "learning_rate": 2e-05, + "loss": 2.3929, + "num_input_tokens_seen": 2623537152, + "step": 1251 + }, + { + "epoch": 0.07, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.3437, + "num_input_tokens_seen": 2625634304, + "step": 1252 + }, + { + "epoch": 0.07, + "grad_norm": 0.73046875, + "learning_rate": 2e-05, + "loss": 2.3393, + "num_input_tokens_seen": 2627731456, + "step": 1253 + }, + { + "epoch": 0.07, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.3768, + "num_input_tokens_seen": 2629828608, + "step": 1254 + }, + { + "epoch": 0.07, + "grad_norm": 0.73046875, + "learning_rate": 2e-05, + "loss": 2.3709, + "num_input_tokens_seen": 2631925760, + "step": 1255 + }, + { + "epoch": 0.07, + "grad_norm": 0.91015625, + "learning_rate": 2e-05, + "loss": 2.3585, + "num_input_tokens_seen": 2634022912, + "step": 1256 + }, + { + "epoch": 0.07, + "grad_norm": 0.69140625, + "learning_rate": 2e-05, + "loss": 2.3238, + "num_input_tokens_seen": 2636120064, + "step": 1257 + }, + { + "epoch": 0.08, + "grad_norm": 0.8671875, + "learning_rate": 2e-05, + "loss": 2.3714, + "num_input_tokens_seen": 2638217216, + "step": 1258 + }, + { + "epoch": 0.08, + "grad_norm": 0.74609375, + "learning_rate": 2e-05, + "loss": 2.3459, + "num_input_tokens_seen": 2640314368, + "step": 1259 + }, + { + "epoch": 0.08, + "grad_norm": 0.76171875, + "learning_rate": 2e-05, + "loss": 2.3369, + "num_input_tokens_seen": 2642411520, + "step": 1260 + }, + { + "epoch": 0.08, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.3652, + "num_input_tokens_seen": 2644508672, + "step": 1261 + }, + { + "epoch": 0.08, + "grad_norm": 0.8671875, + "learning_rate": 2e-05, + "loss": 2.3519, + "num_input_tokens_seen": 2646605824, + "step": 1262 + }, + { + "epoch": 0.08, + "grad_norm": 0.63671875, + "learning_rate": 2e-05, + "loss": 2.3634, + "num_input_tokens_seen": 2648702976, + "step": 1263 + }, + { + "epoch": 0.08, + "grad_norm": 0.99609375, + "learning_rate": 2e-05, + "loss": 2.3311, + "num_input_tokens_seen": 2650800128, + "step": 1264 + }, + { + "epoch": 0.08, + "grad_norm": 1.2578125, + "learning_rate": 2e-05, + "loss": 2.346, + "num_input_tokens_seen": 2652897280, + "step": 1265 + }, + { + "epoch": 0.08, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.3565, + "num_input_tokens_seen": 2654994432, + "step": 1266 + }, + { + "epoch": 0.08, + "grad_norm": 1.140625, + "learning_rate": 2e-05, + "loss": 2.3524, + "num_input_tokens_seen": 2657091584, + "step": 1267 + }, + { + "epoch": 0.08, + "grad_norm": 0.9609375, + "learning_rate": 2e-05, + "loss": 2.3525, + "num_input_tokens_seen": 2659188736, + "step": 1268 + }, + { + "epoch": 0.08, + "grad_norm": 1.03125, + "learning_rate": 2e-05, + "loss": 2.3705, + "num_input_tokens_seen": 2661285888, + "step": 1269 + }, + { + "epoch": 0.08, + "grad_norm": 0.98828125, + "learning_rate": 2e-05, + "loss": 2.3608, + "num_input_tokens_seen": 2663383040, + "step": 1270 + }, + { + "epoch": 0.08, + "grad_norm": 0.74609375, + "learning_rate": 2e-05, + "loss": 2.3262, + "num_input_tokens_seen": 2665480192, + "step": 1271 + }, + { + "epoch": 0.08, + "grad_norm": 1.046875, + "learning_rate": 2e-05, + "loss": 2.3614, + "num_input_tokens_seen": 2667577344, + "step": 1272 + }, + { + "epoch": 0.08, + "grad_norm": 0.7265625, + "learning_rate": 2e-05, + "loss": 2.3722, + "num_input_tokens_seen": 2669674496, + "step": 1273 + }, + { + "epoch": 0.08, + "grad_norm": 0.9375, + "learning_rate": 2e-05, + "loss": 2.3397, + "num_input_tokens_seen": 2671771648, + "step": 1274 + }, + { + "epoch": 0.08, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.3184, + "num_input_tokens_seen": 2673868800, + "step": 1275 + }, + { + "epoch": 0.08, + "grad_norm": 0.9375, + "learning_rate": 2e-05, + "loss": 2.3394, + "num_input_tokens_seen": 2675965952, + "step": 1276 + }, + { + "epoch": 0.08, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.3482, + "num_input_tokens_seen": 2678063104, + "step": 1277 + }, + { + "epoch": 0.08, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.3027, + "num_input_tokens_seen": 2680160256, + "step": 1278 + }, + { + "epoch": 0.08, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.363, + "num_input_tokens_seen": 2682257408, + "step": 1279 + }, + { + "epoch": 0.08, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.3426, + "num_input_tokens_seen": 2684354560, + "step": 1280 + }, + { + "epoch": 0.08, + "grad_norm": 0.7890625, + "learning_rate": 2e-05, + "loss": 2.3406, + "num_input_tokens_seen": 2686451712, + "step": 1281 + }, + { + "epoch": 0.08, + "grad_norm": 0.953125, + "learning_rate": 2e-05, + "loss": 2.3247, + "num_input_tokens_seen": 2688548864, + "step": 1282 + }, + { + "epoch": 0.08, + "grad_norm": 0.7265625, + "learning_rate": 2e-05, + "loss": 2.3495, + "num_input_tokens_seen": 2690646016, + "step": 1283 + }, + { + "epoch": 0.08, + "grad_norm": 0.73828125, + "learning_rate": 2e-05, + "loss": 2.352, + "num_input_tokens_seen": 2692743168, + "step": 1284 + }, + { + "epoch": 0.08, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.3591, + "num_input_tokens_seen": 2694840320, + "step": 1285 + }, + { + "epoch": 0.08, + "grad_norm": 0.765625, + "learning_rate": 2e-05, + "loss": 2.342, + "num_input_tokens_seen": 2696937472, + "step": 1286 + }, + { + "epoch": 0.08, + "grad_norm": 0.65234375, + "learning_rate": 2e-05, + "loss": 2.3596, + "num_input_tokens_seen": 2699034624, + "step": 1287 + }, + { + "epoch": 0.08, + "grad_norm": 0.70703125, + "learning_rate": 2e-05, + "loss": 2.3537, + "num_input_tokens_seen": 2701131776, + "step": 1288 + }, + { + "epoch": 0.08, + "grad_norm": 0.86328125, + "learning_rate": 2e-05, + "loss": 2.3567, + "num_input_tokens_seen": 2703228928, + "step": 1289 + }, + { + "epoch": 0.08, + "grad_norm": 0.73828125, + "learning_rate": 2e-05, + "loss": 2.3327, + "num_input_tokens_seen": 2705326080, + "step": 1290 + }, + { + "epoch": 0.08, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.3391, + "num_input_tokens_seen": 2707423232, + "step": 1291 + }, + { + "epoch": 0.08, + "grad_norm": 1.1015625, + "learning_rate": 2e-05, + "loss": 2.3264, + "num_input_tokens_seen": 2709520384, + "step": 1292 + }, + { + "epoch": 0.08, + "grad_norm": 0.63671875, + "learning_rate": 2e-05, + "loss": 2.3398, + "num_input_tokens_seen": 2711617536, + "step": 1293 + }, + { + "epoch": 0.08, + "grad_norm": 1.203125, + "learning_rate": 2e-05, + "loss": 2.3584, + "num_input_tokens_seen": 2713714688, + "step": 1294 + }, + { + "epoch": 0.08, + "grad_norm": 0.93359375, + "learning_rate": 2e-05, + "loss": 2.347, + "num_input_tokens_seen": 2715811840, + "step": 1295 + }, + { + "epoch": 0.08, + "grad_norm": 0.75, + "learning_rate": 2e-05, + "loss": 2.3231, + "num_input_tokens_seen": 2717908992, + "step": 1296 + }, + { + "epoch": 0.08, + "grad_norm": 0.9609375, + "learning_rate": 2e-05, + "loss": 2.3543, + "num_input_tokens_seen": 2720006144, + "step": 1297 + }, + { + "epoch": 0.08, + "grad_norm": 0.76171875, + "learning_rate": 2e-05, + "loss": 2.3675, + "num_input_tokens_seen": 2722103296, + "step": 1298 + }, + { + "epoch": 0.08, + "grad_norm": 0.765625, + "learning_rate": 2e-05, + "loss": 2.3499, + "num_input_tokens_seen": 2724200448, + "step": 1299 + }, + { + "epoch": 0.08, + "grad_norm": 0.98046875, + "learning_rate": 2e-05, + "loss": 2.3662, + "num_input_tokens_seen": 2726297600, + "step": 1300 + }, + { + "epoch": 0.08, + "grad_norm": 0.6640625, + "learning_rate": 2e-05, + "loss": 2.3167, + "num_input_tokens_seen": 2728394752, + "step": 1301 + }, + { + "epoch": 0.08, + "grad_norm": 0.9921875, + "learning_rate": 2e-05, + "loss": 2.3177, + "num_input_tokens_seen": 2730491904, + "step": 1302 + }, + { + "epoch": 0.08, + "grad_norm": 0.75, + "learning_rate": 2e-05, + "loss": 2.3457, + "num_input_tokens_seen": 2732589056, + "step": 1303 + }, + { + "epoch": 0.08, + "grad_norm": 0.67578125, + "learning_rate": 2e-05, + "loss": 2.3247, + "num_input_tokens_seen": 2734686208, + "step": 1304 + }, + { + "epoch": 0.08, + "grad_norm": 0.69140625, + "learning_rate": 2e-05, + "loss": 2.3278, + "num_input_tokens_seen": 2736783360, + "step": 1305 + }, + { + "epoch": 0.09, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.3253, + "num_input_tokens_seen": 2738880512, + "step": 1306 + }, + { + "epoch": 0.09, + "grad_norm": 0.7578125, + "learning_rate": 2e-05, + "loss": 2.3687, + "num_input_tokens_seen": 2740977664, + "step": 1307 + }, + { + "epoch": 0.09, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.3549, + "num_input_tokens_seen": 2743074816, + "step": 1308 + }, + { + "epoch": 0.09, + "grad_norm": 0.71484375, + "learning_rate": 2e-05, + "loss": 2.3305, + "num_input_tokens_seen": 2745171968, + "step": 1309 + }, + { + "epoch": 0.09, + "grad_norm": 0.76953125, + "learning_rate": 2e-05, + "loss": 2.3443, + "num_input_tokens_seen": 2747269120, + "step": 1310 + }, + { + "epoch": 0.09, + "grad_norm": 0.8515625, + "learning_rate": 2e-05, + "loss": 2.3134, + "num_input_tokens_seen": 2749366272, + "step": 1311 + }, + { + "epoch": 0.09, + "grad_norm": 0.703125, + "learning_rate": 2e-05, + "loss": 2.3547, + "num_input_tokens_seen": 2751463424, + "step": 1312 + }, + { + "epoch": 0.09, + "grad_norm": 0.98828125, + "learning_rate": 2e-05, + "loss": 2.3549, + "num_input_tokens_seen": 2753560576, + "step": 1313 + }, + { + "epoch": 0.09, + "grad_norm": 0.66796875, + "learning_rate": 2e-05, + "loss": 2.3566, + "num_input_tokens_seen": 2755657728, + "step": 1314 + }, + { + "epoch": 0.09, + "grad_norm": 0.703125, + "learning_rate": 2e-05, + "loss": 2.3337, + "num_input_tokens_seen": 2757754880, + "step": 1315 + }, + { + "epoch": 0.09, + "grad_norm": 0.984375, + "learning_rate": 2e-05, + "loss": 2.3448, + "num_input_tokens_seen": 2759852032, + "step": 1316 + }, + { + "epoch": 0.09, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.3522, + "num_input_tokens_seen": 2761949184, + "step": 1317 + }, + { + "epoch": 0.09, + "grad_norm": 0.93359375, + "learning_rate": 2e-05, + "loss": 2.3557, + "num_input_tokens_seen": 2764046336, + "step": 1318 + }, + { + "epoch": 0.09, + "grad_norm": 0.796875, + "learning_rate": 2e-05, + "loss": 2.36, + "num_input_tokens_seen": 2766143488, + "step": 1319 + }, + { + "epoch": 0.09, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.3518, + "num_input_tokens_seen": 2768240640, + "step": 1320 + }, + { + "epoch": 0.09, + "grad_norm": 0.76953125, + "learning_rate": 2e-05, + "loss": 2.3473, + "num_input_tokens_seen": 2770337792, + "step": 1321 + }, + { + "epoch": 0.09, + "grad_norm": 0.8515625, + "learning_rate": 2e-05, + "loss": 2.3245, + "num_input_tokens_seen": 2772434944, + "step": 1322 + }, + { + "epoch": 0.09, + "grad_norm": 0.70703125, + "learning_rate": 2e-05, + "loss": 2.3347, + "num_input_tokens_seen": 2774532096, + "step": 1323 + }, + { + "epoch": 0.09, + "grad_norm": 0.8515625, + "learning_rate": 2e-05, + "loss": 2.3589, + "num_input_tokens_seen": 2776629248, + "step": 1324 + }, + { + "epoch": 0.09, + "grad_norm": 0.69921875, + "learning_rate": 2e-05, + "loss": 2.3474, + "num_input_tokens_seen": 2778726400, + "step": 1325 + }, + { + "epoch": 0.09, + "grad_norm": 1.125, + "learning_rate": 2e-05, + "loss": 2.3154, + "num_input_tokens_seen": 2780823552, + "step": 1326 + }, + { + "epoch": 0.09, + "grad_norm": 0.74609375, + "learning_rate": 2e-05, + "loss": 2.342, + "num_input_tokens_seen": 2782920704, + "step": 1327 + }, + { + "epoch": 0.09, + "grad_norm": 0.8984375, + "learning_rate": 2e-05, + "loss": 2.3405, + "num_input_tokens_seen": 2785017856, + "step": 1328 + }, + { + "epoch": 0.09, + "grad_norm": 1.28125, + "learning_rate": 2e-05, + "loss": 2.325, + "num_input_tokens_seen": 2787115008, + "step": 1329 + }, + { + "epoch": 0.09, + "grad_norm": 0.71875, + "learning_rate": 2e-05, + "loss": 2.3214, + "num_input_tokens_seen": 2789212160, + "step": 1330 + }, + { + "epoch": 0.09, + "grad_norm": 0.90234375, + "learning_rate": 2e-05, + "loss": 2.3296, + "num_input_tokens_seen": 2791309312, + "step": 1331 + }, + { + "epoch": 0.09, + "grad_norm": 1.09375, + "learning_rate": 2e-05, + "loss": 2.3388, + "num_input_tokens_seen": 2793406464, + "step": 1332 + }, + { + "epoch": 0.09, + "grad_norm": 0.78515625, + "learning_rate": 2e-05, + "loss": 2.3579, + "num_input_tokens_seen": 2795503616, + "step": 1333 + }, + { + "epoch": 0.09, + "grad_norm": 0.98828125, + "learning_rate": 2e-05, + "loss": 2.3578, + "num_input_tokens_seen": 2797600768, + "step": 1334 + }, + { + "epoch": 0.09, + "grad_norm": 1.0625, + "learning_rate": 2e-05, + "loss": 2.3503, + "num_input_tokens_seen": 2799697920, + "step": 1335 + }, + { + "epoch": 0.09, + "grad_norm": 0.71875, + "learning_rate": 2e-05, + "loss": 2.3739, + "num_input_tokens_seen": 2801795072, + "step": 1336 + }, + { + "epoch": 0.09, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.3405, + "num_input_tokens_seen": 2803892224, + "step": 1337 + }, + { + "epoch": 0.09, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.3737, + "num_input_tokens_seen": 2805989376, + "step": 1338 + }, + { + "epoch": 0.09, + "grad_norm": 0.73828125, + "learning_rate": 2e-05, + "loss": 2.3609, + "num_input_tokens_seen": 2808086528, + "step": 1339 + }, + { + "epoch": 0.09, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 2.349, + "num_input_tokens_seen": 2810183680, + "step": 1340 + }, + { + "epoch": 0.09, + "grad_norm": 0.69921875, + "learning_rate": 2e-05, + "loss": 2.3427, + "num_input_tokens_seen": 2812280832, + "step": 1341 + }, + { + "epoch": 0.09, + "grad_norm": 0.6953125, + "learning_rate": 2e-05, + "loss": 2.3392, + "num_input_tokens_seen": 2814377984, + "step": 1342 + }, + { + "epoch": 0.09, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.3159, + "num_input_tokens_seen": 2816475136, + "step": 1343 + }, + { + "epoch": 0.09, + "grad_norm": 0.66015625, + "learning_rate": 2e-05, + "loss": 2.357, + "num_input_tokens_seen": 2818572288, + "step": 1344 + }, + { + "epoch": 0.09, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.352, + "num_input_tokens_seen": 2820669440, + "step": 1345 + }, + { + "epoch": 0.09, + "grad_norm": 0.7890625, + "learning_rate": 2e-05, + "loss": 2.3447, + "num_input_tokens_seen": 2822766592, + "step": 1346 + }, + { + "epoch": 0.09, + "grad_norm": 0.6796875, + "learning_rate": 2e-05, + "loss": 2.3139, + "num_input_tokens_seen": 2824863744, + "step": 1347 + }, + { + "epoch": 0.09, + "grad_norm": 0.7265625, + "learning_rate": 2e-05, + "loss": 2.3333, + "num_input_tokens_seen": 2826960896, + "step": 1348 + }, + { + "epoch": 0.09, + "grad_norm": 0.8984375, + "learning_rate": 2e-05, + "loss": 2.3452, + "num_input_tokens_seen": 2829058048, + "step": 1349 + }, + { + "epoch": 0.09, + "grad_norm": 0.640625, + "learning_rate": 2e-05, + "loss": 2.3266, + "num_input_tokens_seen": 2831155200, + "step": 1350 + }, + { + "epoch": 0.09, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.3383, + "num_input_tokens_seen": 2833252352, + "step": 1351 + }, + { + "epoch": 0.09, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.3234, + "num_input_tokens_seen": 2835349504, + "step": 1352 + }, + { + "epoch": 0.1, + "grad_norm": 0.69140625, + "learning_rate": 2e-05, + "loss": 2.3516, + "num_input_tokens_seen": 2837446656, + "step": 1353 + }, + { + "epoch": 0.1, + "grad_norm": 0.7578125, + "learning_rate": 2e-05, + "loss": 2.3414, + "num_input_tokens_seen": 2839543808, + "step": 1354 + }, + { + "epoch": 0.1, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.3235, + "num_input_tokens_seen": 2841640960, + "step": 1355 + }, + { + "epoch": 0.1, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 2.3593, + "num_input_tokens_seen": 2843738112, + "step": 1356 + }, + { + "epoch": 0.1, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.3439, + "num_input_tokens_seen": 2845835264, + "step": 1357 + }, + { + "epoch": 0.1, + "grad_norm": 0.6875, + "learning_rate": 2e-05, + "loss": 2.3227, + "num_input_tokens_seen": 2847932416, + "step": 1358 + }, + { + "epoch": 0.1, + "grad_norm": 0.62109375, + "learning_rate": 2e-05, + "loss": 2.3124, + "num_input_tokens_seen": 2850029568, + "step": 1359 + }, + { + "epoch": 0.1, + "grad_norm": 0.73828125, + "learning_rate": 2e-05, + "loss": 2.3324, + "num_input_tokens_seen": 2852126720, + "step": 1360 + }, + { + "epoch": 0.1, + "grad_norm": 0.72265625, + "learning_rate": 2e-05, + "loss": 2.359, + "num_input_tokens_seen": 2854223872, + "step": 1361 + }, + { + "epoch": 0.1, + "grad_norm": 0.64453125, + "learning_rate": 2e-05, + "loss": 2.3182, + "num_input_tokens_seen": 2856321024, + "step": 1362 + }, + { + "epoch": 0.1, + "grad_norm": 0.6796875, + "learning_rate": 2e-05, + "loss": 2.2981, + "num_input_tokens_seen": 2858418176, + "step": 1363 + }, + { + "epoch": 0.1, + "grad_norm": 0.74609375, + "learning_rate": 2e-05, + "loss": 2.3326, + "num_input_tokens_seen": 2860515328, + "step": 1364 + }, + { + "epoch": 0.1, + "grad_norm": 0.6796875, + "learning_rate": 2e-05, + "loss": 2.3344, + "num_input_tokens_seen": 2862612480, + "step": 1365 + }, + { + "epoch": 0.1, + "grad_norm": 0.7109375, + "learning_rate": 2e-05, + "loss": 2.3564, + "num_input_tokens_seen": 2864709632, + "step": 1366 + }, + { + "epoch": 0.1, + "grad_norm": 0.828125, + "learning_rate": 2e-05, + "loss": 2.301, + "num_input_tokens_seen": 2866806784, + "step": 1367 + }, + { + "epoch": 0.1, + "grad_norm": 1.03125, + "learning_rate": 2e-05, + "loss": 2.3407, + "num_input_tokens_seen": 2868903936, + "step": 1368 + }, + { + "epoch": 0.1, + "grad_norm": 1.03125, + "learning_rate": 2e-05, + "loss": 2.3235, + "num_input_tokens_seen": 2871001088, + "step": 1369 + }, + { + "epoch": 0.1, + "grad_norm": 0.9921875, + "learning_rate": 2e-05, + "loss": 2.3369, + "num_input_tokens_seen": 2873098240, + "step": 1370 + }, + { + "epoch": 0.1, + "grad_norm": 0.9375, + "learning_rate": 2e-05, + "loss": 2.321, + "num_input_tokens_seen": 2875195392, + "step": 1371 + }, + { + "epoch": 0.1, + "grad_norm": 0.73046875, + "learning_rate": 2e-05, + "loss": 2.3392, + "num_input_tokens_seen": 2877292544, + "step": 1372 + }, + { + "epoch": 0.1, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 2.3167, + "num_input_tokens_seen": 2879389696, + "step": 1373 + }, + { + "epoch": 0.1, + "grad_norm": 0.93359375, + "learning_rate": 2e-05, + "loss": 2.3246, + "num_input_tokens_seen": 2881486848, + "step": 1374 + }, + { + "epoch": 0.1, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 2.3355, + "num_input_tokens_seen": 2883584000, + "step": 1375 + }, + { + "epoch": 0.1, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.305, + "num_input_tokens_seen": 2885681152, + "step": 1376 + }, + { + "epoch": 0.1, + "grad_norm": 0.8515625, + "learning_rate": 2e-05, + "loss": 2.3311, + "num_input_tokens_seen": 2887778304, + "step": 1377 + }, + { + "epoch": 0.1, + "grad_norm": 0.78515625, + "learning_rate": 2e-05, + "loss": 2.3486, + "num_input_tokens_seen": 2889875456, + "step": 1378 + }, + { + "epoch": 0.1, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.3106, + "num_input_tokens_seen": 2891972608, + "step": 1379 + }, + { + "epoch": 0.1, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.3444, + "num_input_tokens_seen": 2894069760, + "step": 1380 + }, + { + "epoch": 0.1, + "grad_norm": 0.76953125, + "learning_rate": 2e-05, + "loss": 2.3102, + "num_input_tokens_seen": 2896166912, + "step": 1381 + }, + { + "epoch": 0.1, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.3087, + "num_input_tokens_seen": 2898264064, + "step": 1382 + }, + { + "epoch": 0.1, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.3304, + "num_input_tokens_seen": 2900361216, + "step": 1383 + }, + { + "epoch": 0.1, + "grad_norm": 0.91015625, + "learning_rate": 2e-05, + "loss": 2.3188, + "num_input_tokens_seen": 2902458368, + "step": 1384 + }, + { + "epoch": 0.1, + "grad_norm": 0.69140625, + "learning_rate": 2e-05, + "loss": 2.285, + "num_input_tokens_seen": 2904555520, + "step": 1385 + }, + { + "epoch": 0.1, + "grad_norm": 1.03125, + "learning_rate": 2e-05, + "loss": 2.3244, + "num_input_tokens_seen": 2906652672, + "step": 1386 + }, + { + "epoch": 0.1, + "grad_norm": 0.875, + "learning_rate": 2e-05, + "loss": 2.3279, + "num_input_tokens_seen": 2908749824, + "step": 1387 + }, + { + "epoch": 0.1, + "grad_norm": 0.76953125, + "learning_rate": 2e-05, + "loss": 2.3265, + "num_input_tokens_seen": 2910846976, + "step": 1388 + }, + { + "epoch": 0.1, + "grad_norm": 0.9296875, + "learning_rate": 2e-05, + "loss": 2.3447, + "num_input_tokens_seen": 2912944128, + "step": 1389 + }, + { + "epoch": 0.1, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.335, + "num_input_tokens_seen": 2915041280, + "step": 1390 + }, + { + "epoch": 0.1, + "grad_norm": 0.67578125, + "learning_rate": 2e-05, + "loss": 2.3385, + "num_input_tokens_seen": 2917138432, + "step": 1391 + }, + { + "epoch": 0.1, + "grad_norm": 0.7578125, + "learning_rate": 2e-05, + "loss": 2.339, + "num_input_tokens_seen": 2919235584, + "step": 1392 + }, + { + "epoch": 0.1, + "grad_norm": 0.83984375, + "learning_rate": 2e-05, + "loss": 2.3238, + "num_input_tokens_seen": 2921332736, + "step": 1393 + }, + { + "epoch": 0.1, + "eval_loss": 2.345611572265625, + "eval_runtime": 2602.686, + "eval_samples_per_second": 1.515, + "eval_steps_per_second": 0.379, + "num_input_tokens_seen": 2921332736, + "step": 1393 + }, + { + "epoch": 0.1, + "grad_norm": 0.6875, + "learning_rate": 2e-05, + "loss": 2.3365, + "num_input_tokens_seen": 2923429888, + "step": 1394 + }, + { + "epoch": 0.1, + "grad_norm": 1.0859375, + "learning_rate": 2e-05, + "loss": 2.308, + "num_input_tokens_seen": 2925527040, + "step": 1395 + }, + { + "epoch": 0.1, + "grad_norm": 0.7578125, + "learning_rate": 2e-05, + "loss": 2.3388, + "num_input_tokens_seen": 2927624192, + "step": 1396 + }, + { + "epoch": 0.1, + "grad_norm": 0.9296875, + "learning_rate": 2e-05, + "loss": 2.3425, + "num_input_tokens_seen": 2929721344, + "step": 1397 + }, + { + "epoch": 0.1, + "grad_norm": 0.96484375, + "learning_rate": 2e-05, + "loss": 2.3268, + "num_input_tokens_seen": 2931818496, + "step": 1398 + }, + { + "epoch": 0.1, + "grad_norm": 0.71484375, + "learning_rate": 2e-05, + "loss": 2.3037, + "num_input_tokens_seen": 2933915648, + "step": 1399 + }, + { + "epoch": 0.1, + "grad_norm": 0.94921875, + "learning_rate": 2e-05, + "loss": 2.345, + "num_input_tokens_seen": 2936012800, + "step": 1400 + }, + { + "epoch": 0.11, + "grad_norm": 1.0, + "learning_rate": 2e-05, + "loss": 2.3231, + "num_input_tokens_seen": 2938109952, + "step": 1401 + }, + { + "epoch": 0.11, + "grad_norm": 0.96484375, + "learning_rate": 2e-05, + "loss": 2.3219, + "num_input_tokens_seen": 2940207104, + "step": 1402 + }, + { + "epoch": 0.11, + "grad_norm": 0.8203125, + "learning_rate": 2e-05, + "loss": 2.3231, + "num_input_tokens_seen": 2942304256, + "step": 1403 + }, + { + "epoch": 0.11, + "grad_norm": 1.203125, + "learning_rate": 2e-05, + "loss": 2.3063, + "num_input_tokens_seen": 2944401408, + "step": 1404 + }, + { + "epoch": 0.11, + "grad_norm": 1.09375, + "learning_rate": 2e-05, + "loss": 2.3101, + "num_input_tokens_seen": 2946498560, + "step": 1405 + }, + { + "epoch": 0.11, + "grad_norm": 0.64453125, + "learning_rate": 2e-05, + "loss": 2.323, + "num_input_tokens_seen": 2948595712, + "step": 1406 + }, + { + "epoch": 0.11, + "grad_norm": 1.21875, + "learning_rate": 2e-05, + "loss": 2.321, + "num_input_tokens_seen": 2950692864, + "step": 1407 + }, + { + "epoch": 0.11, + "grad_norm": 1.0, + "learning_rate": 2e-05, + "loss": 2.3055, + "num_input_tokens_seen": 2952790016, + "step": 1408 + }, + { + "epoch": 0.11, + "grad_norm": 0.890625, + "learning_rate": 2e-05, + "loss": 2.3156, + "num_input_tokens_seen": 2954887168, + "step": 1409 + }, + { + "epoch": 0.11, + "grad_norm": 0.85546875, + "learning_rate": 2e-05, + "loss": 2.3065, + "num_input_tokens_seen": 2956984320, + "step": 1410 + }, + { + "epoch": 0.11, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.3132, + "num_input_tokens_seen": 2959081472, + "step": 1411 + }, + { + "epoch": 0.11, + "grad_norm": 0.84375, + "learning_rate": 2e-05, + "loss": 2.2997, + "num_input_tokens_seen": 2961178624, + "step": 1412 + }, + { + "epoch": 0.11, + "grad_norm": 0.92578125, + "learning_rate": 2e-05, + "loss": 2.3062, + "num_input_tokens_seen": 2963275776, + "step": 1413 + }, + { + "epoch": 0.11, + "grad_norm": 0.83984375, + "learning_rate": 2e-05, + "loss": 2.3117, + "num_input_tokens_seen": 2965372928, + "step": 1414 + }, + { + "epoch": 0.11, + "grad_norm": 0.69921875, + "learning_rate": 2e-05, + "loss": 2.2983, + "num_input_tokens_seen": 2967470080, + "step": 1415 + }, + { + "epoch": 0.11, + "grad_norm": 0.86328125, + "learning_rate": 2e-05, + "loss": 2.3026, + "num_input_tokens_seen": 2969567232, + "step": 1416 + }, + { + "epoch": 0.11, + "grad_norm": 0.78515625, + "learning_rate": 2e-05, + "loss": 2.3114, + "num_input_tokens_seen": 2971664384, + "step": 1417 + }, + { + "epoch": 0.11, + "grad_norm": 1.7734375, + "learning_rate": 2e-05, + "loss": 2.2923, + "num_input_tokens_seen": 2973761536, + "step": 1418 + }, + { + "epoch": 0.11, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.3226, + "num_input_tokens_seen": 2975858688, + "step": 1419 + }, + { + "epoch": 0.11, + "grad_norm": 0.796875, + "learning_rate": 2e-05, + "loss": 2.308, + "num_input_tokens_seen": 2977955840, + "step": 1420 + }, + { + "epoch": 0.11, + "grad_norm": 0.953125, + "learning_rate": 2e-05, + "loss": 2.3125, + "num_input_tokens_seen": 2980052992, + "step": 1421 + }, + { + "epoch": 0.11, + "grad_norm": 0.7890625, + "learning_rate": 2e-05, + "loss": 2.3037, + "num_input_tokens_seen": 2982150144, + "step": 1422 + }, + { + "epoch": 0.11, + "grad_norm": 1.0, + "learning_rate": 2e-05, + "loss": 2.2974, + "num_input_tokens_seen": 2984247296, + "step": 1423 + }, + { + "epoch": 0.11, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.2781, + "num_input_tokens_seen": 2986344448, + "step": 1424 + }, + { + "epoch": 0.11, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.3029, + "num_input_tokens_seen": 2988441600, + "step": 1425 + }, + { + "epoch": 0.11, + "grad_norm": 0.76953125, + "learning_rate": 2e-05, + "loss": 2.2919, + "num_input_tokens_seen": 2990538752, + "step": 1426 + }, + { + "epoch": 0.11, + "grad_norm": 0.96875, + "learning_rate": 2e-05, + "loss": 2.3393, + "num_input_tokens_seen": 2992635904, + "step": 1427 + }, + { + "epoch": 0.11, + "grad_norm": 1.0078125, + "learning_rate": 2e-05, + "loss": 2.3319, + "num_input_tokens_seen": 2994733056, + "step": 1428 + }, + { + "epoch": 0.11, + "grad_norm": 0.765625, + "learning_rate": 2e-05, + "loss": 2.3246, + "num_input_tokens_seen": 2996830208, + "step": 1429 + }, + { + "epoch": 0.11, + "grad_norm": 0.99609375, + "learning_rate": 2e-05, + "loss": 2.3309, + "num_input_tokens_seen": 2998927360, + "step": 1430 + }, + { + "epoch": 0.11, + "grad_norm": 0.96875, + "learning_rate": 2e-05, + "loss": 2.3551, + "num_input_tokens_seen": 3001024512, + "step": 1431 + }, + { + "epoch": 0.11, + "grad_norm": 0.703125, + "learning_rate": 2e-05, + "loss": 2.3123, + "num_input_tokens_seen": 3003121664, + "step": 1432 + }, + { + "epoch": 0.11, + "grad_norm": 1.0234375, + "learning_rate": 2e-05, + "loss": 2.2897, + "num_input_tokens_seen": 3005218816, + "step": 1433 + }, + { + "epoch": 0.11, + "grad_norm": 0.78515625, + "learning_rate": 2e-05, + "loss": 2.3345, + "num_input_tokens_seen": 3007315968, + "step": 1434 + }, + { + "epoch": 0.11, + "grad_norm": 0.73828125, + "learning_rate": 2e-05, + "loss": 2.2838, + "num_input_tokens_seen": 3009413120, + "step": 1435 + }, + { + "epoch": 0.11, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.3246, + "num_input_tokens_seen": 3011510272, + "step": 1436 + }, + { + "epoch": 0.11, + "grad_norm": 0.8203125, + "learning_rate": 2e-05, + "loss": 2.3546, + "num_input_tokens_seen": 3013607424, + "step": 1437 + }, + { + "epoch": 0.11, + "grad_norm": 0.91796875, + "learning_rate": 2e-05, + "loss": 2.3171, + "num_input_tokens_seen": 3015704576, + "step": 1438 + }, + { + "epoch": 0.11, + "grad_norm": 0.921875, + "learning_rate": 2e-05, + "loss": 2.3081, + "num_input_tokens_seen": 3017801728, + "step": 1439 + }, + { + "epoch": 0.11, + "grad_norm": 0.6875, + "learning_rate": 2e-05, + "loss": 2.2926, + "num_input_tokens_seen": 3019898880, + "step": 1440 + }, + { + "epoch": 0.11, + "grad_norm": 0.78515625, + "learning_rate": 2e-05, + "loss": 2.3037, + "num_input_tokens_seen": 3021996032, + "step": 1441 + }, + { + "epoch": 0.11, + "grad_norm": 0.796875, + "learning_rate": 2e-05, + "loss": 2.2944, + "num_input_tokens_seen": 3024093184, + "step": 1442 + }, + { + "epoch": 0.11, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.2713, + "num_input_tokens_seen": 3026190336, + "step": 1443 + }, + { + "epoch": 0.11, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.3103, + "num_input_tokens_seen": 3028287488, + "step": 1444 + }, + { + "epoch": 0.11, + "grad_norm": 1.0234375, + "learning_rate": 2e-05, + "loss": 2.3076, + "num_input_tokens_seen": 3030384640, + "step": 1445 + }, + { + "epoch": 0.11, + "grad_norm": 0.7890625, + "learning_rate": 2e-05, + "loss": 2.3239, + "num_input_tokens_seen": 3032481792, + "step": 1446 + }, + { + "epoch": 0.11, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.3302, + "num_input_tokens_seen": 3034578944, + "step": 1447 + }, + { + "epoch": 0.11, + "grad_norm": 0.96484375, + "learning_rate": 2e-05, + "loss": 2.3397, + "num_input_tokens_seen": 3036676096, + "step": 1448 + }, + { + "epoch": 0.12, + "grad_norm": 0.85546875, + "learning_rate": 2e-05, + "loss": 2.3288, + "num_input_tokens_seen": 3038773248, + "step": 1449 + }, + { + "epoch": 0.12, + "grad_norm": 0.90234375, + "learning_rate": 2e-05, + "loss": 2.314, + "num_input_tokens_seen": 3040870400, + "step": 1450 + }, + { + "epoch": 0.12, + "grad_norm": 1.1328125, + "learning_rate": 2e-05, + "loss": 2.3288, + "num_input_tokens_seen": 3042967552, + "step": 1451 + }, + { + "epoch": 0.12, + "grad_norm": 1.2265625, + "learning_rate": 2e-05, + "loss": 2.2871, + "num_input_tokens_seen": 3045064704, + "step": 1452 + }, + { + "epoch": 0.12, + "grad_norm": 0.8515625, + "learning_rate": 2e-05, + "loss": 2.3282, + "num_input_tokens_seen": 3047161856, + "step": 1453 + }, + { + "epoch": 0.12, + "grad_norm": 1.0546875, + "learning_rate": 2e-05, + "loss": 2.2942, + "num_input_tokens_seen": 3049259008, + "step": 1454 + }, + { + "epoch": 0.12, + "grad_norm": 0.93359375, + "learning_rate": 2e-05, + "loss": 2.2913, + "num_input_tokens_seen": 3051356160, + "step": 1455 + }, + { + "epoch": 0.12, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.3423, + "num_input_tokens_seen": 3053453312, + "step": 1456 + }, + { + "epoch": 0.12, + "grad_norm": 0.90234375, + "learning_rate": 2e-05, + "loss": 2.3195, + "num_input_tokens_seen": 3055550464, + "step": 1457 + }, + { + "epoch": 0.12, + "grad_norm": 0.94921875, + "learning_rate": 2e-05, + "loss": 2.3354, + "num_input_tokens_seen": 3057647616, + "step": 1458 + }, + { + "epoch": 0.12, + "grad_norm": 0.87109375, + "learning_rate": 2e-05, + "loss": 2.2998, + "num_input_tokens_seen": 3059744768, + "step": 1459 + }, + { + "epoch": 0.12, + "grad_norm": 0.765625, + "learning_rate": 2e-05, + "loss": 2.2993, + "num_input_tokens_seen": 3061841920, + "step": 1460 + }, + { + "epoch": 0.12, + "grad_norm": 0.95703125, + "learning_rate": 2e-05, + "loss": 2.2753, + "num_input_tokens_seen": 3063939072, + "step": 1461 + }, + { + "epoch": 0.12, + "grad_norm": 0.87890625, + "learning_rate": 2e-05, + "loss": 2.2919, + "num_input_tokens_seen": 3066036224, + "step": 1462 + }, + { + "epoch": 0.12, + "grad_norm": 0.73046875, + "learning_rate": 2e-05, + "loss": 2.3217, + "num_input_tokens_seen": 3068133376, + "step": 1463 + }, + { + "epoch": 0.12, + "grad_norm": 0.93359375, + "learning_rate": 2e-05, + "loss": 2.3034, + "num_input_tokens_seen": 3070230528, + "step": 1464 + }, + { + "epoch": 0.12, + "grad_norm": 0.96875, + "learning_rate": 2e-05, + "loss": 2.307, + "num_input_tokens_seen": 3072327680, + "step": 1465 + }, + { + "epoch": 0.12, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.3311, + "num_input_tokens_seen": 3074424832, + "step": 1466 + }, + { + "epoch": 0.12, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.3075, + "num_input_tokens_seen": 3076521984, + "step": 1467 + }, + { + "epoch": 0.12, + "grad_norm": 0.96484375, + "learning_rate": 2e-05, + "loss": 2.3534, + "num_input_tokens_seen": 3078619136, + "step": 1468 + }, + { + "epoch": 0.12, + "grad_norm": 0.8515625, + "learning_rate": 2e-05, + "loss": 2.2958, + "num_input_tokens_seen": 3080716288, + "step": 1469 + }, + { + "epoch": 0.12, + "grad_norm": 0.90234375, + "learning_rate": 2e-05, + "loss": 2.2973, + "num_input_tokens_seen": 3082813440, + "step": 1470 + }, + { + "epoch": 0.12, + "grad_norm": 0.69921875, + "learning_rate": 2e-05, + "loss": 2.3137, + "num_input_tokens_seen": 3084910592, + "step": 1471 + }, + { + "epoch": 0.12, + "grad_norm": 0.94140625, + "learning_rate": 2e-05, + "loss": 2.3125, + "num_input_tokens_seen": 3087007744, + "step": 1472 + }, + { + "epoch": 0.12, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.2951, + "num_input_tokens_seen": 3089104896, + "step": 1473 + }, + { + "epoch": 0.12, + "grad_norm": 0.875, + "learning_rate": 2e-05, + "loss": 2.3418, + "num_input_tokens_seen": 3091202048, + "step": 1474 + }, + { + "epoch": 0.12, + "grad_norm": 0.87890625, + "learning_rate": 2e-05, + "loss": 2.3099, + "num_input_tokens_seen": 3093299200, + "step": 1475 + }, + { + "epoch": 0.12, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.2985, + "num_input_tokens_seen": 3095396352, + "step": 1476 + }, + { + "epoch": 0.12, + "grad_norm": 0.765625, + "learning_rate": 2e-05, + "loss": 2.3031, + "num_input_tokens_seen": 3097493504, + "step": 1477 + }, + { + "epoch": 0.12, + "grad_norm": 0.92578125, + "learning_rate": 2e-05, + "loss": 2.2491, + "num_input_tokens_seen": 3099590656, + "step": 1478 + }, + { + "epoch": 0.12, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.3481, + "num_input_tokens_seen": 3101687808, + "step": 1479 + }, + { + "epoch": 0.12, + "grad_norm": 0.8671875, + "learning_rate": 2e-05, + "loss": 2.2968, + "num_input_tokens_seen": 3103784960, + "step": 1480 + }, + { + "epoch": 0.12, + "grad_norm": 0.984375, + "learning_rate": 2e-05, + "loss": 2.3091, + "num_input_tokens_seen": 3105882112, + "step": 1481 + }, + { + "epoch": 0.12, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.2881, + "num_input_tokens_seen": 3107979264, + "step": 1482 + }, + { + "epoch": 0.12, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.3319, + "num_input_tokens_seen": 3110076416, + "step": 1483 + }, + { + "epoch": 0.12, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.3146, + "num_input_tokens_seen": 3112173568, + "step": 1484 + }, + { + "epoch": 0.12, + "grad_norm": 0.85546875, + "learning_rate": 2e-05, + "loss": 2.317, + "num_input_tokens_seen": 3114270720, + "step": 1485 + }, + { + "epoch": 0.12, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.3253, + "num_input_tokens_seen": 3116367872, + "step": 1486 + }, + { + "epoch": 0.12, + "grad_norm": 0.7265625, + "learning_rate": 2e-05, + "loss": 2.2923, + "num_input_tokens_seen": 3118465024, + "step": 1487 + }, + { + "epoch": 0.12, + "grad_norm": 0.76171875, + "learning_rate": 2e-05, + "loss": 2.3056, + "num_input_tokens_seen": 3120562176, + "step": 1488 + }, + { + "epoch": 0.12, + "grad_norm": 0.76171875, + "learning_rate": 2e-05, + "loss": 2.3201, + "num_input_tokens_seen": 3122659328, + "step": 1489 + }, + { + "epoch": 0.12, + "grad_norm": 0.76953125, + "learning_rate": 2e-05, + "loss": 2.3172, + "num_input_tokens_seen": 3124756480, + "step": 1490 + }, + { + "epoch": 0.12, + "grad_norm": 0.69921875, + "learning_rate": 2e-05, + "loss": 2.3064, + "num_input_tokens_seen": 3126853632, + "step": 1491 + }, + { + "epoch": 0.12, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.3084, + "num_input_tokens_seen": 3128950784, + "step": 1492 + }, + { + "epoch": 0.12, + "grad_norm": 0.8125, + "learning_rate": 2e-05, + "loss": 2.3146, + "num_input_tokens_seen": 3131047936, + "step": 1493 + }, + { + "epoch": 0.12, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.2993, + "num_input_tokens_seen": 3133145088, + "step": 1494 + }, + { + "epoch": 0.12, + "grad_norm": 0.765625, + "learning_rate": 2e-05, + "loss": 2.2999, + "num_input_tokens_seen": 3135242240, + "step": 1495 + }, + { + "epoch": 0.12, + "grad_norm": 1.140625, + "learning_rate": 2e-05, + "loss": 2.3379, + "num_input_tokens_seen": 3137339392, + "step": 1496 + }, + { + "epoch": 0.13, + "grad_norm": 0.75, + "learning_rate": 2e-05, + "loss": 2.3023, + "num_input_tokens_seen": 3139436544, + "step": 1497 + }, + { + "epoch": 0.13, + "grad_norm": 0.71875, + "learning_rate": 2e-05, + "loss": 2.3057, + "num_input_tokens_seen": 3141533696, + "step": 1498 + }, + { + "epoch": 0.13, + "grad_norm": 0.87109375, + "learning_rate": 2e-05, + "loss": 2.2778, + "num_input_tokens_seen": 3143630848, + "step": 1499 + }, + { + "epoch": 0.13, + "grad_norm": 0.6875, + "learning_rate": 2e-05, + "loss": 2.3014, + "num_input_tokens_seen": 3145728000, + "step": 1500 + }, + { + "epoch": 0.13, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.3445, + "num_input_tokens_seen": 3147825152, + "step": 1501 + }, + { + "epoch": 0.13, + "grad_norm": 0.70703125, + "learning_rate": 2e-05, + "loss": 2.3035, + "num_input_tokens_seen": 3149922304, + "step": 1502 + }, + { + "epoch": 0.13, + "grad_norm": 0.75, + "learning_rate": 2e-05, + "loss": 2.3127, + "num_input_tokens_seen": 3152019456, + "step": 1503 + }, + { + "epoch": 0.13, + "grad_norm": 0.70703125, + "learning_rate": 2e-05, + "loss": 2.3074, + "num_input_tokens_seen": 3154116608, + "step": 1504 + }, + { + "epoch": 0.13, + "grad_norm": 0.72265625, + "learning_rate": 2e-05, + "loss": 2.3211, + "num_input_tokens_seen": 3156213760, + "step": 1505 + }, + { + "epoch": 0.13, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.3333, + "num_input_tokens_seen": 3158310912, + "step": 1506 + }, + { + "epoch": 0.13, + "grad_norm": 0.71484375, + "learning_rate": 2e-05, + "loss": 2.347, + "num_input_tokens_seen": 3160408064, + "step": 1507 + }, + { + "epoch": 0.13, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.3159, + "num_input_tokens_seen": 3162505216, + "step": 1508 + }, + { + "epoch": 0.13, + "grad_norm": 0.66796875, + "learning_rate": 2e-05, + "loss": 2.3148, + "num_input_tokens_seen": 3164602368, + "step": 1509 + }, + { + "epoch": 0.13, + "grad_norm": 0.875, + "learning_rate": 2e-05, + "loss": 2.3245, + "num_input_tokens_seen": 3166699520, + "step": 1510 + }, + { + "epoch": 0.13, + "grad_norm": 0.71484375, + "learning_rate": 2e-05, + "loss": 2.2918, + "num_input_tokens_seen": 3168796672, + "step": 1511 + }, + { + "epoch": 0.13, + "grad_norm": 0.67578125, + "learning_rate": 2e-05, + "loss": 2.3361, + "num_input_tokens_seen": 3170893824, + "step": 1512 + }, + { + "epoch": 0.13, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.2995, + "num_input_tokens_seen": 3172990976, + "step": 1513 + }, + { + "epoch": 0.13, + "grad_norm": 0.671875, + "learning_rate": 2e-05, + "loss": 2.3034, + "num_input_tokens_seen": 3175088128, + "step": 1514 + }, + { + "epoch": 0.13, + "grad_norm": 0.64453125, + "learning_rate": 2e-05, + "loss": 2.2793, + "num_input_tokens_seen": 3177185280, + "step": 1515 + }, + { + "epoch": 0.13, + "grad_norm": 0.73046875, + "learning_rate": 2e-05, + "loss": 2.2936, + "num_input_tokens_seen": 3179282432, + "step": 1516 + }, + { + "epoch": 0.13, + "grad_norm": 0.6875, + "learning_rate": 2e-05, + "loss": 2.2948, + "num_input_tokens_seen": 3181379584, + "step": 1517 + }, + { + "epoch": 0.13, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.2864, + "num_input_tokens_seen": 3183476736, + "step": 1518 + }, + { + "epoch": 0.13, + "grad_norm": 0.67578125, + "learning_rate": 2e-05, + "loss": 2.2957, + "num_input_tokens_seen": 3185573888, + "step": 1519 + }, + { + "epoch": 0.13, + "grad_norm": 0.65625, + "learning_rate": 2e-05, + "loss": 2.2876, + "num_input_tokens_seen": 3187671040, + "step": 1520 + }, + { + "epoch": 0.13, + "grad_norm": 0.9140625, + "learning_rate": 2e-05, + "loss": 2.3028, + "num_input_tokens_seen": 3189768192, + "step": 1521 + }, + { + "epoch": 0.13, + "grad_norm": 0.7734375, + "learning_rate": 2e-05, + "loss": 2.3318, + "num_input_tokens_seen": 3191865344, + "step": 1522 + }, + { + "epoch": 0.13, + "grad_norm": 0.671875, + "learning_rate": 2e-05, + "loss": 2.305, + "num_input_tokens_seen": 3193962496, + "step": 1523 + }, + { + "epoch": 0.13, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.3164, + "num_input_tokens_seen": 3196059648, + "step": 1524 + }, + { + "epoch": 0.13, + "grad_norm": 0.62109375, + "learning_rate": 2e-05, + "loss": 2.3211, + "num_input_tokens_seen": 3198156800, + "step": 1525 + }, + { + "epoch": 0.13, + "grad_norm": 0.75390625, + "learning_rate": 2e-05, + "loss": 2.3152, + "num_input_tokens_seen": 3200253952, + "step": 1526 + }, + { + "epoch": 0.13, + "grad_norm": 0.68359375, + "learning_rate": 2e-05, + "loss": 2.3243, + "num_input_tokens_seen": 3202351104, + "step": 1527 + }, + { + "epoch": 0.13, + "grad_norm": 0.6875, + "learning_rate": 2e-05, + "loss": 2.3001, + "num_input_tokens_seen": 3204448256, + "step": 1528 + }, + { + "epoch": 0.13, + "grad_norm": 0.7265625, + "learning_rate": 2e-05, + "loss": 2.292, + "num_input_tokens_seen": 3206545408, + "step": 1529 + }, + { + "epoch": 0.13, + "grad_norm": 0.6953125, + "learning_rate": 2e-05, + "loss": 2.3027, + "num_input_tokens_seen": 3208642560, + "step": 1530 + }, + { + "epoch": 0.13, + "grad_norm": 0.953125, + "learning_rate": 2e-05, + "loss": 2.2806, + "num_input_tokens_seen": 3210739712, + "step": 1531 + }, + { + "epoch": 0.13, + "grad_norm": 0.66796875, + "learning_rate": 2e-05, + "loss": 2.3103, + "num_input_tokens_seen": 3212836864, + "step": 1532 + }, + { + "epoch": 0.13, + "grad_norm": 0.77734375, + "learning_rate": 2e-05, + "loss": 2.3184, + "num_input_tokens_seen": 3214934016, + "step": 1533 + }, + { + "epoch": 0.13, + "grad_norm": 0.90234375, + "learning_rate": 2e-05, + "loss": 2.2944, + "num_input_tokens_seen": 3217031168, + "step": 1534 + }, + { + "epoch": 0.13, + "grad_norm": 0.6875, + "learning_rate": 2e-05, + "loss": 2.3094, + "num_input_tokens_seen": 3219128320, + "step": 1535 + }, + { + "epoch": 0.13, + "grad_norm": 0.90234375, + "learning_rate": 2e-05, + "loss": 2.2967, + "num_input_tokens_seen": 3221225472, + "step": 1536 + }, + { + "epoch": 0.13, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.3103, + "num_input_tokens_seen": 3223322624, + "step": 1537 + }, + { + "epoch": 0.13, + "grad_norm": 0.76171875, + "learning_rate": 2e-05, + "loss": 2.2884, + "num_input_tokens_seen": 3225419776, + "step": 1538 + }, + { + "epoch": 0.13, + "grad_norm": 0.74609375, + "learning_rate": 2e-05, + "loss": 2.2545, + "num_input_tokens_seen": 3227516928, + "step": 1539 + }, + { + "epoch": 0.13, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.3273, + "num_input_tokens_seen": 3229614080, + "step": 1540 + }, + { + "epoch": 0.13, + "grad_norm": 0.6875, + "learning_rate": 2e-05, + "loss": 2.302, + "num_input_tokens_seen": 3231711232, + "step": 1541 + }, + { + "epoch": 0.13, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.2974, + "num_input_tokens_seen": 3233808384, + "step": 1542 + }, + { + "epoch": 0.13, + "grad_norm": 0.78515625, + "learning_rate": 2e-05, + "loss": 2.2889, + "num_input_tokens_seen": 3235905536, + "step": 1543 + }, + { + "epoch": 0.14, + "grad_norm": 0.74609375, + "learning_rate": 2e-05, + "loss": 2.2623, + "num_input_tokens_seen": 3238002688, + "step": 1544 + }, + { + "epoch": 0.14, + "grad_norm": 0.66015625, + "learning_rate": 2e-05, + "loss": 2.2779, + "num_input_tokens_seen": 3240099840, + "step": 1545 + }, + { + "epoch": 0.14, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.3033, + "num_input_tokens_seen": 3242196992, + "step": 1546 + }, + { + "epoch": 0.14, + "grad_norm": 0.69140625, + "learning_rate": 2e-05, + "loss": 2.33, + "num_input_tokens_seen": 3244294144, + "step": 1547 + }, + { + "epoch": 0.14, + "grad_norm": 0.6640625, + "learning_rate": 2e-05, + "loss": 2.2953, + "num_input_tokens_seen": 3246391296, + "step": 1548 + }, + { + "epoch": 0.14, + "grad_norm": 0.68359375, + "learning_rate": 2e-05, + "loss": 2.3035, + "num_input_tokens_seen": 3248488448, + "step": 1549 + }, + { + "epoch": 0.14, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.2579, + "num_input_tokens_seen": 3250585600, + "step": 1550 + }, + { + "epoch": 0.14, + "grad_norm": 0.73046875, + "learning_rate": 2e-05, + "loss": 2.3027, + "num_input_tokens_seen": 3252682752, + "step": 1551 + }, + { + "epoch": 0.14, + "grad_norm": 0.7265625, + "learning_rate": 2e-05, + "loss": 2.2756, + "num_input_tokens_seen": 3254779904, + "step": 1552 + }, + { + "epoch": 0.14, + "grad_norm": 0.796875, + "learning_rate": 2e-05, + "loss": 2.2685, + "num_input_tokens_seen": 3256877056, + "step": 1553 + }, + { + "epoch": 0.14, + "grad_norm": 0.6015625, + "learning_rate": 2e-05, + "loss": 2.2914, + "num_input_tokens_seen": 3258974208, + "step": 1554 + }, + { + "epoch": 0.14, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.296, + "num_input_tokens_seen": 3261071360, + "step": 1555 + }, + { + "epoch": 0.14, + "grad_norm": 0.85546875, + "learning_rate": 2e-05, + "loss": 2.2912, + "num_input_tokens_seen": 3263168512, + "step": 1556 + }, + { + "epoch": 0.14, + "grad_norm": 0.61328125, + "learning_rate": 2e-05, + "loss": 2.3095, + "num_input_tokens_seen": 3265265664, + "step": 1557 + }, + { + "epoch": 0.14, + "grad_norm": 0.95703125, + "learning_rate": 2e-05, + "loss": 2.3127, + "num_input_tokens_seen": 3267362816, + "step": 1558 + }, + { + "epoch": 0.14, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.2919, + "num_input_tokens_seen": 3269459968, + "step": 1559 + }, + { + "epoch": 0.14, + "grad_norm": 0.63671875, + "learning_rate": 2e-05, + "loss": 2.2808, + "num_input_tokens_seen": 3271557120, + "step": 1560 + }, + { + "epoch": 0.14, + "grad_norm": 0.9765625, + "learning_rate": 2e-05, + "loss": 2.2931, + "num_input_tokens_seen": 3273654272, + "step": 1561 + }, + { + "epoch": 0.14, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.2928, + "num_input_tokens_seen": 3275751424, + "step": 1562 + }, + { + "epoch": 0.14, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 2.2614, + "num_input_tokens_seen": 3277848576, + "step": 1563 + }, + { + "epoch": 0.14, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.2983, + "num_input_tokens_seen": 3279945728, + "step": 1564 + }, + { + "epoch": 0.14, + "grad_norm": 0.71875, + "learning_rate": 2e-05, + "loss": 2.2946, + "num_input_tokens_seen": 3282042880, + "step": 1565 + }, + { + "epoch": 0.14, + "grad_norm": 1.1484375, + "learning_rate": 2e-05, + "loss": 2.3209, + "num_input_tokens_seen": 3284140032, + "step": 1566 + }, + { + "epoch": 0.14, + "grad_norm": 0.828125, + "learning_rate": 2e-05, + "loss": 2.291, + "num_input_tokens_seen": 3286237184, + "step": 1567 + }, + { + "epoch": 0.14, + "grad_norm": 0.8203125, + "learning_rate": 2e-05, + "loss": 2.3051, + "num_input_tokens_seen": 3288334336, + "step": 1568 + }, + { + "epoch": 0.14, + "grad_norm": 0.70703125, + "learning_rate": 2e-05, + "loss": 2.2759, + "num_input_tokens_seen": 3290431488, + "step": 1569 + }, + { + "epoch": 0.14, + "grad_norm": 0.984375, + "learning_rate": 2e-05, + "loss": 2.3034, + "num_input_tokens_seen": 3292528640, + "step": 1570 + }, + { + "epoch": 0.14, + "grad_norm": 0.85546875, + "learning_rate": 2e-05, + "loss": 2.2953, + "num_input_tokens_seen": 3294625792, + "step": 1571 + }, + { + "epoch": 0.14, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.2751, + "num_input_tokens_seen": 3296722944, + "step": 1572 + }, + { + "epoch": 0.14, + "grad_norm": 0.85546875, + "learning_rate": 2e-05, + "loss": 2.3009, + "num_input_tokens_seen": 3298820096, + "step": 1573 + }, + { + "epoch": 0.14, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.2618, + "num_input_tokens_seen": 3300917248, + "step": 1574 + }, + { + "epoch": 0.14, + "grad_norm": 0.890625, + "learning_rate": 2e-05, + "loss": 2.3361, + "num_input_tokens_seen": 3303014400, + "step": 1575 + }, + { + "epoch": 0.14, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.3172, + "num_input_tokens_seen": 3305111552, + "step": 1576 + }, + { + "epoch": 0.14, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.2948, + "num_input_tokens_seen": 3307208704, + "step": 1577 + }, + { + "epoch": 0.14, + "grad_norm": 0.85546875, + "learning_rate": 2e-05, + "loss": 2.3025, + "num_input_tokens_seen": 3309305856, + "step": 1578 + }, + { + "epoch": 0.14, + "grad_norm": 0.921875, + "learning_rate": 2e-05, + "loss": 2.3234, + "num_input_tokens_seen": 3311403008, + "step": 1579 + }, + { + "epoch": 0.14, + "grad_norm": 0.671875, + "learning_rate": 2e-05, + "loss": 2.3175, + "num_input_tokens_seen": 3313500160, + "step": 1580 + }, + { + "epoch": 0.14, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.291, + "num_input_tokens_seen": 3315597312, + "step": 1581 + }, + { + "epoch": 0.14, + "grad_norm": 0.8125, + "learning_rate": 2e-05, + "loss": 2.3051, + "num_input_tokens_seen": 3317694464, + "step": 1582 + }, + { + "epoch": 0.14, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.3157, + "num_input_tokens_seen": 3319791616, + "step": 1583 + }, + { + "epoch": 0.14, + "grad_norm": 0.78515625, + "learning_rate": 2e-05, + "loss": 2.3184, + "num_input_tokens_seen": 3321888768, + "step": 1584 + }, + { + "epoch": 0.14, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.2849, + "num_input_tokens_seen": 3323985920, + "step": 1585 + }, + { + "epoch": 0.14, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.3032, + "num_input_tokens_seen": 3326083072, + "step": 1586 + }, + { + "epoch": 0.14, + "grad_norm": 0.86328125, + "learning_rate": 2e-05, + "loss": 2.3154, + "num_input_tokens_seen": 3328180224, + "step": 1587 + }, + { + "epoch": 0.14, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.3037, + "num_input_tokens_seen": 3330277376, + "step": 1588 + }, + { + "epoch": 0.14, + "grad_norm": 0.71484375, + "learning_rate": 2e-05, + "loss": 2.2993, + "num_input_tokens_seen": 3332374528, + "step": 1589 + }, + { + "epoch": 0.14, + "grad_norm": 0.99609375, + "learning_rate": 2e-05, + "loss": 2.3348, + "num_input_tokens_seen": 3334471680, + "step": 1590 + }, + { + "epoch": 0.14, + "grad_norm": 0.953125, + "learning_rate": 2e-05, + "loss": 2.3247, + "num_input_tokens_seen": 3336568832, + "step": 1591 + }, + { + "epoch": 0.15, + "grad_norm": 0.72265625, + "learning_rate": 2e-05, + "loss": 2.2836, + "num_input_tokens_seen": 3338665984, + "step": 1592 + }, + { + "epoch": 0.15, + "eval_loss": 2.326249837875366, + "eval_runtime": 3014.9343, + "eval_samples_per_second": 1.307, + "eval_steps_per_second": 0.327, + "num_input_tokens_seen": 3338665984, + "step": 1592 + }, + { + "epoch": 0.15, + "grad_norm": 0.984375, + "learning_rate": 2e-05, + "loss": 2.2936, + "num_input_tokens_seen": 3340763136, + "step": 1593 + }, + { + "epoch": 0.15, + "grad_norm": 1.03125, + "learning_rate": 2e-05, + "loss": 2.2818, + "num_input_tokens_seen": 3342860288, + "step": 1594 + }, + { + "epoch": 0.15, + "grad_norm": 0.75390625, + "learning_rate": 2e-05, + "loss": 2.2946, + "num_input_tokens_seen": 3344957440, + "step": 1595 + }, + { + "epoch": 0.15, + "grad_norm": 1.078125, + "learning_rate": 2e-05, + "loss": 2.2852, + "num_input_tokens_seen": 3347054592, + "step": 1596 + }, + { + "epoch": 0.15, + "grad_norm": 1.203125, + "learning_rate": 2e-05, + "loss": 2.2745, + "num_input_tokens_seen": 3349151744, + "step": 1597 + }, + { + "epoch": 0.15, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.2982, + "num_input_tokens_seen": 3351248896, + "step": 1598 + }, + { + "epoch": 0.15, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.3064, + "num_input_tokens_seen": 3353346048, + "step": 1599 + }, + { + "epoch": 0.15, + "grad_norm": 0.94921875, + "learning_rate": 2e-05, + "loss": 2.3027, + "num_input_tokens_seen": 3355443200, + "step": 1600 + }, + { + "epoch": 0.15, + "grad_norm": 0.9140625, + "learning_rate": 2e-05, + "loss": 2.2891, + "num_input_tokens_seen": 3357540352, + "step": 1601 + }, + { + "epoch": 0.15, + "grad_norm": 0.94140625, + "learning_rate": 2e-05, + "loss": 2.2901, + "num_input_tokens_seen": 3359637504, + "step": 1602 + }, + { + "epoch": 0.15, + "grad_norm": 0.9140625, + "learning_rate": 2e-05, + "loss": 2.3202, + "num_input_tokens_seen": 3361734656, + "step": 1603 + }, + { + "epoch": 0.15, + "grad_norm": 1.0625, + "learning_rate": 2e-05, + "loss": 2.2695, + "num_input_tokens_seen": 3363831808, + "step": 1604 + }, + { + "epoch": 0.15, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.2895, + "num_input_tokens_seen": 3365928960, + "step": 1605 + }, + { + "epoch": 0.15, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.2827, + "num_input_tokens_seen": 3368026112, + "step": 1606 + }, + { + "epoch": 0.15, + "grad_norm": 1.09375, + "learning_rate": 2e-05, + "loss": 2.2657, + "num_input_tokens_seen": 3370123264, + "step": 1607 + }, + { + "epoch": 0.15, + "grad_norm": 0.734375, + "learning_rate": 2e-05, + "loss": 2.2842, + "num_input_tokens_seen": 3372220416, + "step": 1608 + }, + { + "epoch": 0.15, + "grad_norm": 1.0234375, + "learning_rate": 2e-05, + "loss": 2.257, + "num_input_tokens_seen": 3374317568, + "step": 1609 + }, + { + "epoch": 0.15, + "grad_norm": 0.9453125, + "learning_rate": 2e-05, + "loss": 2.2934, + "num_input_tokens_seen": 3376414720, + "step": 1610 + }, + { + "epoch": 0.15, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.2541, + "num_input_tokens_seen": 3378511872, + "step": 1611 + }, + { + "epoch": 0.15, + "grad_norm": 0.7578125, + "learning_rate": 2e-05, + "loss": 2.297, + "num_input_tokens_seen": 3380609024, + "step": 1612 + }, + { + "epoch": 0.15, + "grad_norm": 0.96875, + "learning_rate": 2e-05, + "loss": 2.2778, + "num_input_tokens_seen": 3382706176, + "step": 1613 + }, + { + "epoch": 0.15, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.3148, + "num_input_tokens_seen": 3384803328, + "step": 1614 + }, + { + "epoch": 0.15, + "grad_norm": 0.95703125, + "learning_rate": 2e-05, + "loss": 2.2765, + "num_input_tokens_seen": 3386900480, + "step": 1615 + }, + { + "epoch": 0.15, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.2946, + "num_input_tokens_seen": 3388997632, + "step": 1616 + }, + { + "epoch": 0.15, + "grad_norm": 0.71484375, + "learning_rate": 2e-05, + "loss": 2.2784, + "num_input_tokens_seen": 3391094784, + "step": 1617 + }, + { + "epoch": 0.15, + "grad_norm": 0.74609375, + "learning_rate": 2e-05, + "loss": 2.306, + "num_input_tokens_seen": 3393191936, + "step": 1618 + }, + { + "epoch": 0.15, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.2996, + "num_input_tokens_seen": 3395289088, + "step": 1619 + }, + { + "epoch": 0.15, + "grad_norm": 0.72265625, + "learning_rate": 2e-05, + "loss": 2.2958, + "num_input_tokens_seen": 3397386240, + "step": 1620 + }, + { + "epoch": 0.15, + "grad_norm": 0.609375, + "learning_rate": 2e-05, + "loss": 2.2875, + "num_input_tokens_seen": 3399483392, + "step": 1621 + }, + { + "epoch": 0.15, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.3253, + "num_input_tokens_seen": 3401580544, + "step": 1622 + }, + { + "epoch": 0.15, + "grad_norm": 0.8515625, + "learning_rate": 2e-05, + "loss": 2.3098, + "num_input_tokens_seen": 3403677696, + "step": 1623 + }, + { + "epoch": 0.15, + "grad_norm": 0.7109375, + "learning_rate": 2e-05, + "loss": 2.3131, + "num_input_tokens_seen": 3405774848, + "step": 1624 + }, + { + "epoch": 0.15, + "grad_norm": 0.7578125, + "learning_rate": 2e-05, + "loss": 2.2679, + "num_input_tokens_seen": 3407872000, + "step": 1625 + }, + { + "epoch": 0.15, + "grad_norm": 0.85546875, + "learning_rate": 2e-05, + "loss": 2.2878, + "num_input_tokens_seen": 3409969152, + "step": 1626 + }, + { + "epoch": 0.15, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.3364, + "num_input_tokens_seen": 3412066304, + "step": 1627 + }, + { + "epoch": 0.15, + "grad_norm": 0.71484375, + "learning_rate": 2e-05, + "loss": 2.3332, + "num_input_tokens_seen": 3414163456, + "step": 1628 + }, + { + "epoch": 0.15, + "grad_norm": 0.703125, + "learning_rate": 2e-05, + "loss": 2.3267, + "num_input_tokens_seen": 3416260608, + "step": 1629 + }, + { + "epoch": 0.15, + "grad_norm": 0.75390625, + "learning_rate": 2e-05, + "loss": 2.2794, + "num_input_tokens_seen": 3418357760, + "step": 1630 + }, + { + "epoch": 0.15, + "grad_norm": 0.73828125, + "learning_rate": 2e-05, + "loss": 2.3222, + "num_input_tokens_seen": 3420454912, + "step": 1631 + }, + { + "epoch": 0.15, + "grad_norm": 0.6484375, + "learning_rate": 2e-05, + "loss": 2.3013, + "num_input_tokens_seen": 3422552064, + "step": 1632 + }, + { + "epoch": 0.15, + "grad_norm": 0.74609375, + "learning_rate": 2e-05, + "loss": 2.3088, + "num_input_tokens_seen": 3424649216, + "step": 1633 + }, + { + "epoch": 0.15, + "grad_norm": 0.765625, + "learning_rate": 2e-05, + "loss": 2.3134, + "num_input_tokens_seen": 3426746368, + "step": 1634 + }, + { + "epoch": 0.15, + "grad_norm": 0.60546875, + "learning_rate": 2e-05, + "loss": 2.3117, + "num_input_tokens_seen": 3428843520, + "step": 1635 + }, + { + "epoch": 0.15, + "grad_norm": 0.609375, + "learning_rate": 2e-05, + "loss": 2.2822, + "num_input_tokens_seen": 3430940672, + "step": 1636 + }, + { + "epoch": 0.15, + "grad_norm": 0.640625, + "learning_rate": 2e-05, + "loss": 2.2607, + "num_input_tokens_seen": 3433037824, + "step": 1637 + }, + { + "epoch": 0.15, + "grad_norm": 0.59765625, + "learning_rate": 2e-05, + "loss": 2.2904, + "num_input_tokens_seen": 3435134976, + "step": 1638 + }, + { + "epoch": 0.15, + "grad_norm": 0.62109375, + "learning_rate": 2e-05, + "loss": 2.2887, + "num_input_tokens_seen": 3437232128, + "step": 1639 + }, + { + "epoch": 0.16, + "grad_norm": 0.5859375, + "learning_rate": 2e-05, + "loss": 2.2991, + "num_input_tokens_seen": 3439329280, + "step": 1640 + }, + { + "epoch": 0.16, + "grad_norm": 0.67578125, + "learning_rate": 2e-05, + "loss": 2.3296, + "num_input_tokens_seen": 3441426432, + "step": 1641 + }, + { + "epoch": 0.16, + "grad_norm": 0.640625, + "learning_rate": 2e-05, + "loss": 2.3204, + "num_input_tokens_seen": 3443523584, + "step": 1642 + }, + { + "epoch": 0.16, + "grad_norm": 0.63671875, + "learning_rate": 2e-05, + "loss": 2.307, + "num_input_tokens_seen": 3445620736, + "step": 1643 + }, + { + "epoch": 0.16, + "grad_norm": 0.65625, + "learning_rate": 2e-05, + "loss": 2.2909, + "num_input_tokens_seen": 3447717888, + "step": 1644 + }, + { + "epoch": 0.16, + "grad_norm": 0.6640625, + "learning_rate": 2e-05, + "loss": 2.331, + "num_input_tokens_seen": 3449815040, + "step": 1645 + }, + { + "epoch": 0.16, + "grad_norm": 0.59375, + "learning_rate": 2e-05, + "loss": 2.3177, + "num_input_tokens_seen": 3451912192, + "step": 1646 + }, + { + "epoch": 0.16, + "grad_norm": 0.6796875, + "learning_rate": 2e-05, + "loss": 2.3045, + "num_input_tokens_seen": 3454009344, + "step": 1647 + }, + { + "epoch": 0.16, + "grad_norm": 0.69921875, + "learning_rate": 2e-05, + "loss": 2.3235, + "num_input_tokens_seen": 3456106496, + "step": 1648 + }, + { + "epoch": 0.16, + "grad_norm": 0.63671875, + "learning_rate": 2e-05, + "loss": 2.3073, + "num_input_tokens_seen": 3458203648, + "step": 1649 + }, + { + "epoch": 0.16, + "grad_norm": 0.6015625, + "learning_rate": 2e-05, + "loss": 2.2951, + "num_input_tokens_seen": 3460300800, + "step": 1650 + }, + { + "epoch": 0.16, + "grad_norm": 0.75390625, + "learning_rate": 2e-05, + "loss": 2.2826, + "num_input_tokens_seen": 3462397952, + "step": 1651 + }, + { + "epoch": 0.16, + "grad_norm": 0.93359375, + "learning_rate": 2e-05, + "loss": 2.2537, + "num_input_tokens_seen": 3464495104, + "step": 1652 + }, + { + "epoch": 0.16, + "grad_norm": 0.6171875, + "learning_rate": 2e-05, + "loss": 2.294, + "num_input_tokens_seen": 3466592256, + "step": 1653 + }, + { + "epoch": 0.16, + "grad_norm": 0.9375, + "learning_rate": 2e-05, + "loss": 2.273, + "num_input_tokens_seen": 3468689408, + "step": 1654 + }, + { + "epoch": 0.16, + "grad_norm": 0.90234375, + "learning_rate": 2e-05, + "loss": 2.3012, + "num_input_tokens_seen": 3470786560, + "step": 1655 + }, + { + "epoch": 0.16, + "grad_norm": 0.671875, + "learning_rate": 2e-05, + "loss": 2.2721, + "num_input_tokens_seen": 3472883712, + "step": 1656 + }, + { + "epoch": 0.16, + "grad_norm": 0.76953125, + "learning_rate": 2e-05, + "loss": 2.2877, + "num_input_tokens_seen": 3474980864, + "step": 1657 + }, + { + "epoch": 0.16, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.3078, + "num_input_tokens_seen": 3477078016, + "step": 1658 + }, + { + "epoch": 0.16, + "grad_norm": 0.71484375, + "learning_rate": 2e-05, + "loss": 2.2888, + "num_input_tokens_seen": 3479175168, + "step": 1659 + }, + { + "epoch": 0.16, + "grad_norm": 0.8203125, + "learning_rate": 2e-05, + "loss": 2.316, + "num_input_tokens_seen": 3481272320, + "step": 1660 + }, + { + "epoch": 0.16, + "grad_norm": 0.66796875, + "learning_rate": 2e-05, + "loss": 2.2982, + "num_input_tokens_seen": 3483369472, + "step": 1661 + }, + { + "epoch": 0.16, + "grad_norm": 0.8671875, + "learning_rate": 2e-05, + "loss": 2.2853, + "num_input_tokens_seen": 3485466624, + "step": 1662 + }, + { + "epoch": 0.16, + "grad_norm": 0.734375, + "learning_rate": 2e-05, + "loss": 2.3363, + "num_input_tokens_seen": 3487563776, + "step": 1663 + }, + { + "epoch": 0.16, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.3287, + "num_input_tokens_seen": 3489660928, + "step": 1664 + }, + { + "epoch": 0.16, + "grad_norm": 0.703125, + "learning_rate": 2e-05, + "loss": 2.3049, + "num_input_tokens_seen": 3491758080, + "step": 1665 + }, + { + "epoch": 0.16, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.2986, + "num_input_tokens_seen": 3493855232, + "step": 1666 + }, + { + "epoch": 0.16, + "grad_norm": 0.71875, + "learning_rate": 2e-05, + "loss": 2.3511, + "num_input_tokens_seen": 3495952384, + "step": 1667 + }, + { + "epoch": 0.16, + "grad_norm": 0.87109375, + "learning_rate": 2e-05, + "loss": 2.2855, + "num_input_tokens_seen": 3498049536, + "step": 1668 + }, + { + "epoch": 0.16, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.2881, + "num_input_tokens_seen": 3500146688, + "step": 1669 + }, + { + "epoch": 0.16, + "grad_norm": 0.7109375, + "learning_rate": 2e-05, + "loss": 2.3014, + "num_input_tokens_seen": 3502243840, + "step": 1670 + }, + { + "epoch": 0.16, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.3102, + "num_input_tokens_seen": 3504340992, + "step": 1671 + }, + { + "epoch": 0.16, + "grad_norm": 0.859375, + "learning_rate": 2e-05, + "loss": 2.2604, + "num_input_tokens_seen": 3506438144, + "step": 1672 + }, + { + "epoch": 0.16, + "grad_norm": 0.6328125, + "learning_rate": 2e-05, + "loss": 2.3257, + "num_input_tokens_seen": 3508535296, + "step": 1673 + }, + { + "epoch": 0.16, + "grad_norm": 0.765625, + "learning_rate": 2e-05, + "loss": 2.2873, + "num_input_tokens_seen": 3510632448, + "step": 1674 + }, + { + "epoch": 0.16, + "grad_norm": 0.75390625, + "learning_rate": 2e-05, + "loss": 2.3142, + "num_input_tokens_seen": 3512729600, + "step": 1675 + }, + { + "epoch": 0.16, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.2739, + "num_input_tokens_seen": 3514826752, + "step": 1676 + }, + { + "epoch": 0.16, + "grad_norm": 0.75, + "learning_rate": 2e-05, + "loss": 2.2643, + "num_input_tokens_seen": 3516923904, + "step": 1677 + }, + { + "epoch": 0.16, + "grad_norm": 0.94921875, + "learning_rate": 2e-05, + "loss": 2.3014, + "num_input_tokens_seen": 3519021056, + "step": 1678 + }, + { + "epoch": 0.16, + "grad_norm": 0.87109375, + "learning_rate": 2e-05, + "loss": 2.308, + "num_input_tokens_seen": 3521118208, + "step": 1679 + }, + { + "epoch": 0.16, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.3124, + "num_input_tokens_seen": 3523215360, + "step": 1680 + }, + { + "epoch": 0.16, + "grad_norm": 1.03125, + "learning_rate": 2e-05, + "loss": 2.3048, + "num_input_tokens_seen": 3525312512, + "step": 1681 + }, + { + "epoch": 0.16, + "grad_norm": 0.984375, + "learning_rate": 2e-05, + "loss": 2.275, + "num_input_tokens_seen": 3527409664, + "step": 1682 + }, + { + "epoch": 0.16, + "grad_norm": 0.71484375, + "learning_rate": 2e-05, + "loss": 2.2858, + "num_input_tokens_seen": 3529506816, + "step": 1683 + }, + { + "epoch": 0.16, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 2.2759, + "num_input_tokens_seen": 3531603968, + "step": 1684 + }, + { + "epoch": 0.16, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 2.2927, + "num_input_tokens_seen": 3533701120, + "step": 1685 + }, + { + "epoch": 0.16, + "grad_norm": 0.74609375, + "learning_rate": 2e-05, + "loss": 2.2666, + "num_input_tokens_seen": 3535798272, + "step": 1686 + }, + { + "epoch": 0.17, + "grad_norm": 1.171875, + "learning_rate": 2e-05, + "loss": 2.2874, + "num_input_tokens_seen": 3537895424, + "step": 1687 + }, + { + "epoch": 0.17, + "grad_norm": 1.3125, + "learning_rate": 2e-05, + "loss": 2.2401, + "num_input_tokens_seen": 3539992576, + "step": 1688 + }, + { + "epoch": 0.17, + "grad_norm": 0.84375, + "learning_rate": 2e-05, + "loss": 2.2555, + "num_input_tokens_seen": 3542089728, + "step": 1689 + }, + { + "epoch": 0.17, + "grad_norm": 0.95703125, + "learning_rate": 2e-05, + "loss": 2.2911, + "num_input_tokens_seen": 3544186880, + "step": 1690 + }, + { + "epoch": 0.17, + "grad_norm": 1.234375, + "learning_rate": 2e-05, + "loss": 2.3082, + "num_input_tokens_seen": 3546284032, + "step": 1691 + }, + { + "epoch": 0.17, + "grad_norm": 0.94140625, + "learning_rate": 2e-05, + "loss": 2.2766, + "num_input_tokens_seen": 3548381184, + "step": 1692 + }, + { + "epoch": 0.17, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.2442, + "num_input_tokens_seen": 3550478336, + "step": 1693 + }, + { + "epoch": 0.17, + "grad_norm": 0.9765625, + "learning_rate": 2e-05, + "loss": 2.2798, + "num_input_tokens_seen": 3552575488, + "step": 1694 + }, + { + "epoch": 0.17, + "grad_norm": 0.9296875, + "learning_rate": 2e-05, + "loss": 2.2557, + "num_input_tokens_seen": 3554672640, + "step": 1695 + }, + { + "epoch": 0.17, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.2351, + "num_input_tokens_seen": 3556769792, + "step": 1696 + }, + { + "epoch": 0.17, + "grad_norm": 0.86328125, + "learning_rate": 2e-05, + "loss": 2.2715, + "num_input_tokens_seen": 3558866944, + "step": 1697 + }, + { + "epoch": 0.17, + "grad_norm": 1.0234375, + "learning_rate": 2e-05, + "loss": 2.2869, + "num_input_tokens_seen": 3560964096, + "step": 1698 + }, + { + "epoch": 0.17, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 2.2346, + "num_input_tokens_seen": 3563061248, + "step": 1699 + }, + { + "epoch": 0.17, + "grad_norm": 1.046875, + "learning_rate": 2e-05, + "loss": 2.2722, + "num_input_tokens_seen": 3565158400, + "step": 1700 + } + ], + "logging_steps": 1, + "max_steps": 4768, + "num_input_tokens_seen": 3565158400, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "total_flos": 1.5210377742479524e+20, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}