{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.4, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.032, "grad_norm": 0.3297976851463318, "learning_rate": 0.0002990322580645161, "loss": 1.0389, "step": 1 }, { "epoch": 0.064, "grad_norm": 0.4069916307926178, "learning_rate": 0.0002980645161290322, "loss": 1.3377, "step": 2 }, { "epoch": 0.096, "grad_norm": 0.42084500193595886, "learning_rate": 0.00029709677419354836, "loss": 0.9366, "step": 3 }, { "epoch": 0.128, "grad_norm": 0.4641948938369751, "learning_rate": 0.0002961290322580645, "loss": 1.0086, "step": 4 }, { "epoch": 0.16, "grad_norm": 0.3840750455856323, "learning_rate": 0.00029516129032258065, "loss": 0.8333, "step": 5 }, { "epoch": 0.192, "grad_norm": 0.4263865053653717, "learning_rate": 0.00029419354838709674, "loss": 0.854, "step": 6 }, { "epoch": 0.224, "grad_norm": 0.48615148663520813, "learning_rate": 0.0002932258064516129, "loss": 0.9548, "step": 7 }, { "epoch": 0.256, "grad_norm": 0.44419369101524353, "learning_rate": 0.00029225806451612903, "loss": 0.8482, "step": 8 }, { "epoch": 0.288, "grad_norm": 0.5317733883857727, "learning_rate": 0.0002912903225806451, "loss": 0.9426, "step": 9 }, { "epoch": 0.32, "grad_norm": 0.47260937094688416, "learning_rate": 0.00029032258064516127, "loss": 0.9816, "step": 10 }, { "epoch": 0.352, "grad_norm": 0.39063283801078796, "learning_rate": 0.00028935483870967736, "loss": 0.84, "step": 11 }, { "epoch": 0.384, "grad_norm": 0.39234670996665955, "learning_rate": 0.0002883870967741935, "loss": 0.7476, "step": 12 }, { "epoch": 0.416, "grad_norm": 0.40661805868148804, "learning_rate": 0.00028741935483870965, "loss": 0.9282, "step": 13 }, { "epoch": 0.448, "grad_norm": 0.42970865964889526, "learning_rate": 0.0002864516129032258, "loss": 0.7858, "step": 14 }, { "epoch": 0.48, "grad_norm": 0.3780193626880646, "learning_rate": 0.00028548387096774194, "loss": 0.7968, "step": 15 }, { "epoch": 0.512, "grad_norm": 0.37006014585494995, "learning_rate": 0.00028451612903225803, "loss": 0.6801, "step": 16 }, { "epoch": 0.544, "grad_norm": 0.3660840392112732, "learning_rate": 0.0002835483870967742, "loss": 0.5914, "step": 17 }, { "epoch": 0.576, "grad_norm": 0.3270975351333618, "learning_rate": 0.00028258064516129027, "loss": 0.6449, "step": 18 }, { "epoch": 0.608, "grad_norm": 0.3859024941921234, "learning_rate": 0.0002816129032258064, "loss": 0.8144, "step": 19 }, { "epoch": 0.64, "grad_norm": 0.37092071771621704, "learning_rate": 0.00028064516129032256, "loss": 0.7667, "step": 20 }, { "epoch": 0.672, "grad_norm": 0.37667015194892883, "learning_rate": 0.0002796774193548387, "loss": 0.7751, "step": 21 }, { "epoch": 0.704, "grad_norm": 0.3832458555698395, "learning_rate": 0.0002787096774193548, "loss": 0.755, "step": 22 }, { "epoch": 0.736, "grad_norm": 0.327288419008255, "learning_rate": 0.00027774193548387095, "loss": 0.7178, "step": 23 }, { "epoch": 0.768, "grad_norm": 0.34552687406539917, "learning_rate": 0.0002767741935483871, "loss": 0.7057, "step": 24 }, { "epoch": 0.8, "grad_norm": 0.3611259460449219, "learning_rate": 0.0002758064516129032, "loss": 0.8159, "step": 25 }, { "epoch": 0.832, "grad_norm": 0.3345054090023041, "learning_rate": 0.00027483870967741933, "loss": 0.7208, "step": 26 }, { "epoch": 0.864, "grad_norm": 0.3697254955768585, "learning_rate": 0.0002738709677419355, "loss": 0.8964, "step": 27 }, { "epoch": 0.896, "grad_norm": 0.3905017375946045, "learning_rate": 0.00027290322580645157, "loss": 0.7794, "step": 28 }, { "epoch": 0.928, "grad_norm": 0.3715725243091583, "learning_rate": 0.0002719354838709677, "loss": 0.6966, "step": 29 }, { "epoch": 0.96, "grad_norm": 0.3650343120098114, "learning_rate": 0.00027096774193548386, "loss": 0.5761, "step": 30 }, { "epoch": 0.992, "grad_norm": 0.33932459354400635, "learning_rate": 0.00027, "loss": 0.556, "step": 31 }, { "epoch": 1.024, "grad_norm": 0.6371742486953735, "learning_rate": 0.0002690322580645161, "loss": 0.847, "step": 32 }, { "epoch": 1.056, "grad_norm": 0.37499895691871643, "learning_rate": 0.00026806451612903224, "loss": 0.8419, "step": 33 }, { "epoch": 1.088, "grad_norm": 0.33221954107284546, "learning_rate": 0.0002670967741935484, "loss": 0.6011, "step": 34 }, { "epoch": 1.12, "grad_norm": 0.344096839427948, "learning_rate": 0.0002661290322580645, "loss": 0.6501, "step": 35 }, { "epoch": 1.152, "grad_norm": 0.38429391384124756, "learning_rate": 0.0002651612903225806, "loss": 0.8091, "step": 36 }, { "epoch": 1.184, "grad_norm": 0.38014867901802063, "learning_rate": 0.00026419354838709677, "loss": 0.7668, "step": 37 }, { "epoch": 1.216, "grad_norm": 0.3352573812007904, "learning_rate": 0.00026322580645161286, "loss": 0.5444, "step": 38 }, { "epoch": 1.248, "grad_norm": 0.33811062574386597, "learning_rate": 0.000262258064516129, "loss": 0.512, "step": 39 }, { "epoch": 1.28, "grad_norm": 0.3998416066169739, "learning_rate": 0.00026129032258064515, "loss": 0.6315, "step": 40 }, { "epoch": 1.312, "grad_norm": 0.3983341157436371, "learning_rate": 0.0002603225806451613, "loss": 0.5882, "step": 41 }, { "epoch": 1.3439999999999999, "grad_norm": 0.4585898816585541, "learning_rate": 0.0002593548387096774, "loss": 0.761, "step": 42 }, { "epoch": 1.376, "grad_norm": 0.4080730080604553, "learning_rate": 0.00025838709677419354, "loss": 0.6716, "step": 43 }, { "epoch": 1.408, "grad_norm": 0.4068273901939392, "learning_rate": 0.0002574193548387096, "loss": 0.6376, "step": 44 }, { "epoch": 1.44, "grad_norm": 0.4406949579715729, "learning_rate": 0.00025645161290322577, "loss": 0.4594, "step": 45 }, { "epoch": 1.472, "grad_norm": 0.34500986337661743, "learning_rate": 0.0002554838709677419, "loss": 0.3672, "step": 46 }, { "epoch": 1.504, "grad_norm": 0.4760681390762329, "learning_rate": 0.00025451612903225806, "loss": 0.6331, "step": 47 }, { "epoch": 1.536, "grad_norm": 0.39281558990478516, "learning_rate": 0.0002535483870967742, "loss": 0.5845, "step": 48 }, { "epoch": 1.568, "grad_norm": 0.4265002906322479, "learning_rate": 0.0002525806451612903, "loss": 0.4461, "step": 49 }, { "epoch": 1.6, "grad_norm": 0.40967294573783875, "learning_rate": 0.00025161290322580645, "loss": 0.7011, "step": 50 }, { "epoch": 1.6320000000000001, "grad_norm": 0.4288088381290436, "learning_rate": 0.00025064516129032254, "loss": 0.6928, "step": 51 }, { "epoch": 1.6640000000000001, "grad_norm": 0.4356289803981781, "learning_rate": 0.0002496774193548387, "loss": 0.7972, "step": 52 }, { "epoch": 1.696, "grad_norm": 0.3827487826347351, "learning_rate": 0.0002487096774193548, "loss": 0.2991, "step": 53 }, { "epoch": 1.728, "grad_norm": 0.40093398094177246, "learning_rate": 0.0002477419354838709, "loss": 0.416, "step": 54 }, { "epoch": 1.76, "grad_norm": 0.41548973321914673, "learning_rate": 0.00024677419354838707, "loss": 0.5501, "step": 55 }, { "epoch": 1.792, "grad_norm": 0.4093388617038727, "learning_rate": 0.0002458064516129032, "loss": 0.5557, "step": 56 }, { "epoch": 1.8239999999999998, "grad_norm": 0.3934040665626526, "learning_rate": 0.00024483870967741936, "loss": 0.602, "step": 57 }, { "epoch": 1.8559999999999999, "grad_norm": 0.42221033573150635, "learning_rate": 0.00024387096774193545, "loss": 0.6421, "step": 58 }, { "epoch": 1.888, "grad_norm": 0.4351339340209961, "learning_rate": 0.0002429032258064516, "loss": 0.5615, "step": 59 }, { "epoch": 1.92, "grad_norm": 0.4319838881492615, "learning_rate": 0.00024193548387096771, "loss": 0.6804, "step": 60 }, { "epoch": 1.952, "grad_norm": 0.40016525983810425, "learning_rate": 0.00024096774193548386, "loss": 0.5432, "step": 61 }, { "epoch": 1.984, "grad_norm": 0.3905942440032959, "learning_rate": 0.00023999999999999998, "loss": 0.4187, "step": 62 }, { "epoch": 2.016, "grad_norm": 0.8056382536888123, "learning_rate": 0.0002390322580645161, "loss": 1.0174, "step": 63 }, { "epoch": 2.048, "grad_norm": 0.3835236430168152, "learning_rate": 0.00023806451612903224, "loss": 0.5992, "step": 64 }, { "epoch": 2.08, "grad_norm": 0.41092216968536377, "learning_rate": 0.00023709677419354836, "loss": 0.4746, "step": 65 }, { "epoch": 2.112, "grad_norm": 0.39536622166633606, "learning_rate": 0.0002361290322580645, "loss": 0.3946, "step": 66 }, { "epoch": 2.144, "grad_norm": 0.3927665948867798, "learning_rate": 0.0002351612903225806, "loss": 0.5187, "step": 67 }, { "epoch": 2.176, "grad_norm": 0.39792704582214355, "learning_rate": 0.00023419354838709674, "loss": 0.4568, "step": 68 }, { "epoch": 2.208, "grad_norm": 0.5023652911186218, "learning_rate": 0.0002332258064516129, "loss": 0.6166, "step": 69 }, { "epoch": 2.24, "grad_norm": 0.425017774105072, "learning_rate": 0.000232258064516129, "loss": 0.42, "step": 70 }, { "epoch": 2.2720000000000002, "grad_norm": 0.46458110213279724, "learning_rate": 0.00023129032258064516, "loss": 0.4613, "step": 71 }, { "epoch": 2.304, "grad_norm": 0.49037960171699524, "learning_rate": 0.00023032258064516125, "loss": 0.5509, "step": 72 }, { "epoch": 2.336, "grad_norm": 0.5233697891235352, "learning_rate": 0.0002293548387096774, "loss": 0.6396, "step": 73 }, { "epoch": 2.368, "grad_norm": 0.4720582962036133, "learning_rate": 0.0002283870967741935, "loss": 0.5076, "step": 74 }, { "epoch": 2.4, "grad_norm": 0.4900650382041931, "learning_rate": 0.00022741935483870966, "loss": 0.4794, "step": 75 }, { "epoch": 2.432, "grad_norm": 0.6321704983711243, "learning_rate": 0.0002264516129032258, "loss": 0.6677, "step": 76 }, { "epoch": 2.464, "grad_norm": 0.5305324792861938, "learning_rate": 0.00022548387096774192, "loss": 0.5102, "step": 77 }, { "epoch": 2.496, "grad_norm": 0.5799248218536377, "learning_rate": 0.00022451612903225804, "loss": 0.5274, "step": 78 }, { "epoch": 2.528, "grad_norm": 0.4990101456642151, "learning_rate": 0.00022354838709677416, "loss": 0.5407, "step": 79 }, { "epoch": 2.56, "grad_norm": 0.4779827296733856, "learning_rate": 0.0002225806451612903, "loss": 0.5166, "step": 80 }, { "epoch": 2.592, "grad_norm": 0.5140111446380615, "learning_rate": 0.00022161290322580645, "loss": 0.3288, "step": 81 }, { "epoch": 2.624, "grad_norm": 0.5674853920936584, "learning_rate": 0.00022064516129032257, "loss": 0.666, "step": 82 }, { "epoch": 2.656, "grad_norm": 0.5277597308158875, "learning_rate": 0.00021967741935483871, "loss": 0.5335, "step": 83 }, { "epoch": 2.6879999999999997, "grad_norm": 0.6029439568519592, "learning_rate": 0.0002187096774193548, "loss": 0.693, "step": 84 }, { "epoch": 2.7199999999999998, "grad_norm": 0.5039327144622803, "learning_rate": 0.00021774193548387095, "loss": 0.5728, "step": 85 }, { "epoch": 2.752, "grad_norm": 0.5564692616462708, "learning_rate": 0.00021677419354838707, "loss": 0.4734, "step": 86 }, { "epoch": 2.784, "grad_norm": 0.5278319120407104, "learning_rate": 0.00021580645161290322, "loss": 0.5834, "step": 87 }, { "epoch": 2.816, "grad_norm": 0.5445135831832886, "learning_rate": 0.00021483870967741936, "loss": 0.4642, "step": 88 }, { "epoch": 2.848, "grad_norm": 0.5394749045372009, "learning_rate": 0.00021387096774193545, "loss": 0.4779, "step": 89 }, { "epoch": 2.88, "grad_norm": 0.5756134390830994, "learning_rate": 0.0002129032258064516, "loss": 0.5607, "step": 90 }, { "epoch": 2.912, "grad_norm": 0.48361241817474365, "learning_rate": 0.00021193548387096772, "loss": 0.4278, "step": 91 }, { "epoch": 2.944, "grad_norm": 0.5017121434211731, "learning_rate": 0.00021096774193548386, "loss": 0.4834, "step": 92 }, { "epoch": 2.976, "grad_norm": 0.4741989076137543, "learning_rate": 0.00020999999999999998, "loss": 0.468, "step": 93 }, { "epoch": 3.008, "grad_norm": 1.003368854522705, "learning_rate": 0.0002090322580645161, "loss": 0.8614, "step": 94 }, { "epoch": 3.04, "grad_norm": 0.4782228469848633, "learning_rate": 0.00020806451612903225, "loss": 0.4111, "step": 95 }, { "epoch": 3.072, "grad_norm": 0.4558674395084381, "learning_rate": 0.00020709677419354836, "loss": 0.3463, "step": 96 }, { "epoch": 3.104, "grad_norm": 0.4409371316432953, "learning_rate": 0.0002061290322580645, "loss": 0.2571, "step": 97 }, { "epoch": 3.136, "grad_norm": 0.5415034890174866, "learning_rate": 0.00020516129032258063, "loss": 0.5707, "step": 98 }, { "epoch": 3.168, "grad_norm": 0.6157724857330322, "learning_rate": 0.00020419354838709677, "loss": 0.5692, "step": 99 }, { "epoch": 3.2, "grad_norm": 0.4855688810348511, "learning_rate": 0.00020322580645161287, "loss": 0.3311, "step": 100 }, { "epoch": 3.232, "grad_norm": 0.569878101348877, "learning_rate": 0.000202258064516129, "loss": 0.4707, "step": 101 }, { "epoch": 3.2640000000000002, "grad_norm": 0.645232081413269, "learning_rate": 0.00020129032258064516, "loss": 0.5504, "step": 102 }, { "epoch": 3.296, "grad_norm": 0.5775763392448425, "learning_rate": 0.00020032258064516128, "loss": 0.3651, "step": 103 }, { "epoch": 3.328, "grad_norm": 0.5808250904083252, "learning_rate": 0.00019935483870967742, "loss": 0.5068, "step": 104 }, { "epoch": 3.36, "grad_norm": 0.689313530921936, "learning_rate": 0.0001983870967741935, "loss": 0.4936, "step": 105 }, { "epoch": 3.392, "grad_norm": 0.6571519374847412, "learning_rate": 0.00019741935483870966, "loss": 0.3671, "step": 106 }, { "epoch": 3.424, "grad_norm": 0.6340517401695251, "learning_rate": 0.00019645161290322578, "loss": 0.4783, "step": 107 }, { "epoch": 3.456, "grad_norm": 0.7031407952308655, "learning_rate": 0.00019548387096774192, "loss": 0.427, "step": 108 }, { "epoch": 3.488, "grad_norm": 0.728496789932251, "learning_rate": 0.00019451612903225807, "loss": 0.5497, "step": 109 }, { "epoch": 3.52, "grad_norm": 0.6106727719306946, "learning_rate": 0.00019354838709677416, "loss": 0.392, "step": 110 }, { "epoch": 3.552, "grad_norm": 0.5296047329902649, "learning_rate": 0.0001925806451612903, "loss": 0.3412, "step": 111 }, { "epoch": 3.584, "grad_norm": 0.6282025575637817, "learning_rate": 0.00019161290322580643, "loss": 0.4081, "step": 112 }, { "epoch": 3.616, "grad_norm": 0.6166461110115051, "learning_rate": 0.00019064516129032257, "loss": 0.4771, "step": 113 }, { "epoch": 3.648, "grad_norm": 0.5448863506317139, "learning_rate": 0.0001896774193548387, "loss": 0.404, "step": 114 }, { "epoch": 3.68, "grad_norm": 0.6598389148712158, "learning_rate": 0.0001887096774193548, "loss": 0.3915, "step": 115 }, { "epoch": 3.7119999999999997, "grad_norm": 0.5567564368247986, "learning_rate": 0.00018774193548387095, "loss": 0.3862, "step": 116 }, { "epoch": 3.7439999999999998, "grad_norm": 0.6524521708488464, "learning_rate": 0.00018677419354838707, "loss": 0.5315, "step": 117 }, { "epoch": 3.776, "grad_norm": 0.7040128707885742, "learning_rate": 0.00018580645161290322, "loss": 0.5387, "step": 118 }, { "epoch": 3.808, "grad_norm": 0.690262496471405, "learning_rate": 0.00018483870967741934, "loss": 0.4877, "step": 119 }, { "epoch": 3.84, "grad_norm": 0.6928034424781799, "learning_rate": 0.00018387096774193548, "loss": 0.4895, "step": 120 }, { "epoch": 3.872, "grad_norm": 0.7148469686508179, "learning_rate": 0.00018290322580645157, "loss": 0.4814, "step": 121 }, { "epoch": 3.904, "grad_norm": 0.6096572875976562, "learning_rate": 0.00018193548387096772, "loss": 0.3403, "step": 122 }, { "epoch": 3.936, "grad_norm": 0.7132399678230286, "learning_rate": 0.00018096774193548387, "loss": 0.4258, "step": 123 }, { "epoch": 3.968, "grad_norm": 0.7302684187889099, "learning_rate": 0.00017999999999999998, "loss": 0.7215, "step": 124 }, { "epoch": 4.0, "grad_norm": 1.5244004726409912, "learning_rate": 0.00017903225806451613, "loss": 0.8544, "step": 125 }, { "epoch": 4.032, "grad_norm": 0.6032777428627014, "learning_rate": 0.00017806451612903222, "loss": 0.4183, "step": 126 }, { "epoch": 4.064, "grad_norm": 0.6349691152572632, "learning_rate": 0.00017709677419354837, "loss": 0.5871, "step": 127 }, { "epoch": 4.096, "grad_norm": 0.5730060935020447, "learning_rate": 0.00017612903225806449, "loss": 0.3786, "step": 128 }, { "epoch": 4.128, "grad_norm": 0.6988044381141663, "learning_rate": 0.00017516129032258063, "loss": 0.3216, "step": 129 }, { "epoch": 4.16, "grad_norm": 0.7379153370857239, "learning_rate": 0.00017419354838709678, "loss": 0.4026, "step": 130 }, { "epoch": 4.192, "grad_norm": 0.7058238983154297, "learning_rate": 0.00017322580645161287, "loss": 0.4328, "step": 131 }, { "epoch": 4.224, "grad_norm": 0.80663001537323, "learning_rate": 0.00017225806451612901, "loss": 0.3849, "step": 132 }, { "epoch": 4.256, "grad_norm": 0.899818480014801, "learning_rate": 0.00017129032258064513, "loss": 0.4191, "step": 133 }, { "epoch": 4.288, "grad_norm": 0.8538224697113037, "learning_rate": 0.00017032258064516128, "loss": 0.3587, "step": 134 }, { "epoch": 4.32, "grad_norm": 0.8948169350624084, "learning_rate": 0.00016935483870967742, "loss": 0.3957, "step": 135 }, { "epoch": 4.352, "grad_norm": 0.7195591926574707, "learning_rate": 0.00016838709677419354, "loss": 0.3361, "step": 136 }, { "epoch": 4.384, "grad_norm": 0.7769681215286255, "learning_rate": 0.00016741935483870966, "loss": 0.3519, "step": 137 }, { "epoch": 4.416, "grad_norm": 0.9509867429733276, "learning_rate": 0.00016645161290322578, "loss": 0.4216, "step": 138 }, { "epoch": 4.448, "grad_norm": 0.7923309206962585, "learning_rate": 0.00016548387096774193, "loss": 0.3999, "step": 139 }, { "epoch": 4.48, "grad_norm": 0.8961685299873352, "learning_rate": 0.00016451612903225804, "loss": 0.5385, "step": 140 }, { "epoch": 4.5120000000000005, "grad_norm": 0.7496562004089355, "learning_rate": 0.0001635483870967742, "loss": 0.341, "step": 141 }, { "epoch": 4.5440000000000005, "grad_norm": 0.8512839674949646, "learning_rate": 0.00016258064516129034, "loss": 0.3847, "step": 142 }, { "epoch": 4.576, "grad_norm": 0.7487362027168274, "learning_rate": 0.00016161290322580643, "loss": 0.3694, "step": 143 }, { "epoch": 4.608, "grad_norm": 0.7957774996757507, "learning_rate": 0.00016064516129032257, "loss": 0.3379, "step": 144 }, { "epoch": 4.64, "grad_norm": 0.7299221754074097, "learning_rate": 0.0001596774193548387, "loss": 0.2989, "step": 145 }, { "epoch": 4.672, "grad_norm": 0.7909884452819824, "learning_rate": 0.00015870967741935484, "loss": 0.3675, "step": 146 }, { "epoch": 4.704, "grad_norm": 0.7321597933769226, "learning_rate": 0.00015774193548387093, "loss": 0.3243, "step": 147 }, { "epoch": 4.736, "grad_norm": 0.7196181416511536, "learning_rate": 0.00015677419354838708, "loss": 0.2709, "step": 148 }, { "epoch": 4.768, "grad_norm": 0.7918142676353455, "learning_rate": 0.00015580645161290322, "loss": 0.3934, "step": 149 }, { "epoch": 4.8, "grad_norm": 0.8657622337341309, "learning_rate": 0.00015483870967741934, "loss": 0.3583, "step": 150 }, { "epoch": 4.832, "grad_norm": 0.8207722306251526, "learning_rate": 0.00015387096774193549, "loss": 0.412, "step": 151 }, { "epoch": 4.864, "grad_norm": 0.7206109166145325, "learning_rate": 0.00015290322580645158, "loss": 0.3594, "step": 152 }, { "epoch": 4.896, "grad_norm": 0.8529183864593506, "learning_rate": 0.00015193548387096772, "loss": 0.512, "step": 153 }, { "epoch": 4.928, "grad_norm": 0.6895930171012878, "learning_rate": 0.00015096774193548384, "loss": 0.333, "step": 154 }, { "epoch": 4.96, "grad_norm": 0.7422910332679749, "learning_rate": 0.00015, "loss": 0.2872, "step": 155 }, { "epoch": 4.992, "grad_norm": 0.7366386651992798, "learning_rate": 0.0001490322580645161, "loss": 0.3415, "step": 156 }, { "epoch": 5.024, "grad_norm": 2.1416280269622803, "learning_rate": 0.00014806451612903225, "loss": 0.9961, "step": 157 }, { "epoch": 5.056, "grad_norm": 0.7944900393486023, "learning_rate": 0.00014709677419354837, "loss": 0.3372, "step": 158 }, { "epoch": 5.088, "grad_norm": 0.7071006298065186, "learning_rate": 0.00014612903225806452, "loss": 0.2732, "step": 159 }, { "epoch": 5.12, "grad_norm": 0.7874396443367004, "learning_rate": 0.00014516129032258063, "loss": 0.2861, "step": 160 }, { "epoch": 5.152, "grad_norm": 0.8244249224662781, "learning_rate": 0.00014419354838709675, "loss": 0.3428, "step": 161 }, { "epoch": 5.184, "grad_norm": 0.81637042760849, "learning_rate": 0.0001432258064516129, "loss": 0.3037, "step": 162 }, { "epoch": 5.216, "grad_norm": 0.9916559457778931, "learning_rate": 0.00014225806451612902, "loss": 0.3337, "step": 163 }, { "epoch": 5.248, "grad_norm": 0.9077599048614502, "learning_rate": 0.00014129032258064514, "loss": 0.287, "step": 164 }, { "epoch": 5.28, "grad_norm": 0.9824132919311523, "learning_rate": 0.00014032258064516128, "loss": 0.3852, "step": 165 }, { "epoch": 5.312, "grad_norm": 1.0016467571258545, "learning_rate": 0.0001393548387096774, "loss": 0.3234, "step": 166 }, { "epoch": 5.344, "grad_norm": 0.8697543144226074, "learning_rate": 0.00013838709677419355, "loss": 0.2848, "step": 167 }, { "epoch": 5.376, "grad_norm": 0.8214029669761658, "learning_rate": 0.00013741935483870966, "loss": 0.3377, "step": 168 }, { "epoch": 5.408, "grad_norm": 0.9105691313743591, "learning_rate": 0.00013645161290322578, "loss": 0.2944, "step": 169 }, { "epoch": 5.44, "grad_norm": 0.9642040133476257, "learning_rate": 0.00013548387096774193, "loss": 0.3624, "step": 170 }, { "epoch": 5.4719999999999995, "grad_norm": 0.9218887686729431, "learning_rate": 0.00013451612903225805, "loss": 0.3938, "step": 171 }, { "epoch": 5.504, "grad_norm": 0.8704710006713867, "learning_rate": 0.0001335483870967742, "loss": 0.3629, "step": 172 }, { "epoch": 5.536, "grad_norm": 0.8207693099975586, "learning_rate": 0.0001325806451612903, "loss": 0.3169, "step": 173 }, { "epoch": 5.568, "grad_norm": 0.9315701127052307, "learning_rate": 0.00013161290322580643, "loss": 0.429, "step": 174 }, { "epoch": 5.6, "grad_norm": 0.860234260559082, "learning_rate": 0.00013064516129032258, "loss": 0.3842, "step": 175 }, { "epoch": 5.632, "grad_norm": 0.8927604556083679, "learning_rate": 0.0001296774193548387, "loss": 0.3405, "step": 176 }, { "epoch": 5.664, "grad_norm": 0.8084587454795837, "learning_rate": 0.0001287096774193548, "loss": 0.306, "step": 177 }, { "epoch": 5.696, "grad_norm": 0.9102941155433655, "learning_rate": 0.00012774193548387096, "loss": 0.3285, "step": 178 }, { "epoch": 5.728, "grad_norm": 0.763113796710968, "learning_rate": 0.0001267741935483871, "loss": 0.2729, "step": 179 }, { "epoch": 5.76, "grad_norm": 0.8704251646995544, "learning_rate": 0.00012580645161290322, "loss": 0.3164, "step": 180 }, { "epoch": 5.792, "grad_norm": 0.9634932279586792, "learning_rate": 0.00012483870967741934, "loss": 0.2939, "step": 181 }, { "epoch": 5.824, "grad_norm": 1.1567790508270264, "learning_rate": 0.00012387096774193546, "loss": 0.3076, "step": 182 }, { "epoch": 5.856, "grad_norm": 0.9096764922142029, "learning_rate": 0.0001229032258064516, "loss": 0.3289, "step": 183 }, { "epoch": 5.888, "grad_norm": 0.9840425848960876, "learning_rate": 0.00012193548387096773, "loss": 0.2772, "step": 184 }, { "epoch": 5.92, "grad_norm": 0.725844144821167, "learning_rate": 0.00012096774193548386, "loss": 0.2151, "step": 185 }, { "epoch": 5.952, "grad_norm": 0.8343638181686401, "learning_rate": 0.00011999999999999999, "loss": 0.3825, "step": 186 }, { "epoch": 5.984, "grad_norm": 0.8040199279785156, "learning_rate": 0.00011903225806451612, "loss": 0.2571, "step": 187 }, { "epoch": 6.016, "grad_norm": 1.6932090520858765, "learning_rate": 0.00011806451612903225, "loss": 0.5538, "step": 188 }, { "epoch": 6.048, "grad_norm": 0.744048535823822, "learning_rate": 0.00011709677419354837, "loss": 0.2335, "step": 189 }, { "epoch": 6.08, "grad_norm": 0.6974924206733704, "learning_rate": 0.0001161290322580645, "loss": 0.2891, "step": 190 }, { "epoch": 6.112, "grad_norm": 0.7202953696250916, "learning_rate": 0.00011516129032258062, "loss": 0.2017, "step": 191 }, { "epoch": 6.144, "grad_norm": 0.8437547087669373, "learning_rate": 0.00011419354838709676, "loss": 0.2175, "step": 192 }, { "epoch": 6.176, "grad_norm": 1.0741796493530273, "learning_rate": 0.0001132258064516129, "loss": 0.3913, "step": 193 }, { "epoch": 6.208, "grad_norm": 1.031754493713379, "learning_rate": 0.00011225806451612902, "loss": 0.298, "step": 194 }, { "epoch": 6.24, "grad_norm": 0.9575178027153015, "learning_rate": 0.00011129032258064515, "loss": 0.3201, "step": 195 }, { "epoch": 6.272, "grad_norm": 0.9503082633018494, "learning_rate": 0.00011032258064516128, "loss": 0.2005, "step": 196 }, { "epoch": 6.304, "grad_norm": 1.2572892904281616, "learning_rate": 0.0001093548387096774, "loss": 0.3045, "step": 197 }, { "epoch": 6.336, "grad_norm": 1.5667368173599243, "learning_rate": 0.00010838709677419353, "loss": 0.4053, "step": 198 }, { "epoch": 6.368, "grad_norm": 0.9439151883125305, "learning_rate": 0.00010741935483870968, "loss": 0.2721, "step": 199 }, { "epoch": 6.4, "grad_norm": 1.0985567569732666, "learning_rate": 0.0001064516129032258, "loss": 0.2543, "step": 200 } ], "logging_steps": 1, "max_steps": 310, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.7401861644288e+16, "train_batch_size": 3, "trial_name": null, "trial_params": null }