{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0015, "eval_steps": 500, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1e-05, "grad_norm": 8.875, "learning_rate": 9.999999997532599e-06, "loss": 1.6459, "step": 1 }, { "epoch": 2e-05, "grad_norm": 4.40625, "learning_rate": 9.999999990130395e-06, "loss": 1.6742, "step": 2 }, { "epoch": 3e-05, "grad_norm": 4.3125, "learning_rate": 9.99999997779339e-06, "loss": 1.6223, "step": 3 }, { "epoch": 4e-05, "grad_norm": 4.0625, "learning_rate": 9.999999960521582e-06, "loss": 1.5398, "step": 4 }, { "epoch": 5e-05, "grad_norm": 3.375, "learning_rate": 9.999999938314972e-06, "loss": 1.5666, "step": 5 }, { "epoch": 6e-05, "grad_norm": 1.859375, "learning_rate": 9.999999911173561e-06, "loss": 1.5981, "step": 6 }, { "epoch": 7e-05, "grad_norm": 1.734375, "learning_rate": 9.999999879097347e-06, "loss": 1.644, "step": 7 }, { "epoch": 8e-05, "grad_norm": 1.796875, "learning_rate": 9.999999842086332e-06, "loss": 1.6331, "step": 8 }, { "epoch": 9e-05, "grad_norm": 1.3125, "learning_rate": 9.999999800140514e-06, "loss": 1.626, "step": 9 }, { "epoch": 0.0001, "grad_norm": 1.9296875, "learning_rate": 9.999999753259893e-06, "loss": 1.5778, "step": 10 }, { "epoch": 0.00011, "grad_norm": 1.34375, "learning_rate": 9.99999970144447e-06, "loss": 1.6286, "step": 11 }, { "epoch": 0.00012, "grad_norm": 1.203125, "learning_rate": 9.999999644694247e-06, "loss": 1.5614, "step": 12 }, { "epoch": 0.00013, "grad_norm": 1.015625, "learning_rate": 9.999999583009221e-06, "loss": 1.6447, "step": 13 }, { "epoch": 0.00014, "grad_norm": 1.3359375, "learning_rate": 9.999999516389394e-06, "loss": 1.5258, "step": 14 }, { "epoch": 0.00015, "grad_norm": 1.25, "learning_rate": 9.999999444834763e-06, "loss": 1.6336, "step": 15 }, { "epoch": 0.00016, "grad_norm": 1.5546875, "learning_rate": 9.999999368345333e-06, "loss": 1.6073, "step": 16 }, { "epoch": 0.00017, "grad_norm": 1.34375, "learning_rate": 9.999999286921101e-06, "loss": 1.5919, "step": 17 }, { "epoch": 0.00018, "grad_norm": 0.96875, "learning_rate": 9.999999200562065e-06, "loss": 1.543, "step": 18 }, { "epoch": 0.00019, "grad_norm": 1.6875, "learning_rate": 9.99999910926823e-06, "loss": 1.6101, "step": 19 }, { "epoch": 0.0002, "grad_norm": 1.578125, "learning_rate": 9.999999013039593e-06, "loss": 1.5796, "step": 20 }, { "epoch": 0.00021, "grad_norm": 2.578125, "learning_rate": 9.999998911876154e-06, "loss": 1.5748, "step": 21 }, { "epoch": 0.00022, "grad_norm": 1.203125, "learning_rate": 9.999998805777915e-06, "loss": 1.5479, "step": 22 }, { "epoch": 0.00023, "grad_norm": 1.4921875, "learning_rate": 9.999998694744875e-06, "loss": 1.5318, "step": 23 }, { "epoch": 0.00024, "grad_norm": 1.125, "learning_rate": 9.999998578777036e-06, "loss": 1.6259, "step": 24 }, { "epoch": 0.00025, "grad_norm": 2.21875, "learning_rate": 9.999998457874392e-06, "loss": 1.5525, "step": 25 }, { "epoch": 0.00026, "grad_norm": 3.234375, "learning_rate": 9.99999833203695e-06, "loss": 1.5576, "step": 26 }, { "epoch": 0.00027, "grad_norm": 2.046875, "learning_rate": 9.999998201264707e-06, "loss": 1.3934, "step": 27 }, { "epoch": 0.00028, "grad_norm": 3.15625, "learning_rate": 9.999998065557664e-06, "loss": 1.5423, "step": 28 }, { "epoch": 0.00029, "grad_norm": 1.2734375, "learning_rate": 9.999997924915818e-06, "loss": 1.5679, "step": 29 }, { "epoch": 0.0003, "grad_norm": 1.625, "learning_rate": 9.999997779339175e-06, "loss": 1.5329, "step": 30 }, { "epoch": 0.00031, "grad_norm": 1.2421875, "learning_rate": 9.999997628827732e-06, "loss": 1.4603, "step": 31 }, { "epoch": 0.00032, "grad_norm": 1.46875, "learning_rate": 9.999997473381487e-06, "loss": 1.5774, "step": 32 }, { "epoch": 0.00033, "grad_norm": 2.0625, "learning_rate": 9.999997313000444e-06, "loss": 1.5522, "step": 33 }, { "epoch": 0.00034, "grad_norm": 1.421875, "learning_rate": 9.9999971476846e-06, "loss": 1.5964, "step": 34 }, { "epoch": 0.00035, "grad_norm": 1.59375, "learning_rate": 9.999996977433957e-06, "loss": 1.6129, "step": 35 }, { "epoch": 0.00036, "grad_norm": 1.78125, "learning_rate": 9.999996802248514e-06, "loss": 1.548, "step": 36 }, { "epoch": 0.00037, "grad_norm": 0.94140625, "learning_rate": 9.999996622128274e-06, "loss": 1.5662, "step": 37 }, { "epoch": 0.00038, "grad_norm": 4.84375, "learning_rate": 9.999996437073236e-06, "loss": 1.6197, "step": 38 }, { "epoch": 0.00039, "grad_norm": 3.234375, "learning_rate": 9.999996247083397e-06, "loss": 1.5308, "step": 39 }, { "epoch": 0.0004, "grad_norm": 1.375, "learning_rate": 9.99999605215876e-06, "loss": 1.5846, "step": 40 }, { "epoch": 0.00041, "grad_norm": 1.34375, "learning_rate": 9.999995852299324e-06, "loss": 1.4274, "step": 41 }, { "epoch": 0.00042, "grad_norm": 1.15625, "learning_rate": 9.999995647505092e-06, "loss": 1.4986, "step": 42 }, { "epoch": 0.00043, "grad_norm": 0.94140625, "learning_rate": 9.99999543777606e-06, "loss": 1.5135, "step": 43 }, { "epoch": 0.00044, "grad_norm": 0.8125, "learning_rate": 9.999995223112231e-06, "loss": 1.5472, "step": 44 }, { "epoch": 0.00045, "grad_norm": 1.5703125, "learning_rate": 9.999995003513605e-06, "loss": 1.5635, "step": 45 }, { "epoch": 0.00046, "grad_norm": 3.34375, "learning_rate": 9.999994778980182e-06, "loss": 1.5506, "step": 46 }, { "epoch": 0.00047, "grad_norm": 5.4375, "learning_rate": 9.99999454951196e-06, "loss": 1.5071, "step": 47 }, { "epoch": 0.00048, "grad_norm": 1.578125, "learning_rate": 9.999994315108943e-06, "loss": 1.5532, "step": 48 }, { "epoch": 0.00049, "grad_norm": 1.828125, "learning_rate": 9.999994075771128e-06, "loss": 1.6061, "step": 49 }, { "epoch": 0.0005, "grad_norm": 1.4453125, "learning_rate": 9.999993831498517e-06, "loss": 1.5629, "step": 50 }, { "epoch": 0.00051, "grad_norm": 1.109375, "learning_rate": 9.999993582291112e-06, "loss": 1.5536, "step": 51 }, { "epoch": 0.00052, "grad_norm": 1.0078125, "learning_rate": 9.999993328148909e-06, "loss": 1.5013, "step": 52 }, { "epoch": 0.00053, "grad_norm": 1.171875, "learning_rate": 9.999993069071912e-06, "loss": 1.4943, "step": 53 }, { "epoch": 0.00054, "grad_norm": 1.8203125, "learning_rate": 9.999992805060117e-06, "loss": 1.5057, "step": 54 }, { "epoch": 0.00055, "grad_norm": 0.97265625, "learning_rate": 9.99999253611353e-06, "loss": 1.6486, "step": 55 }, { "epoch": 0.00056, "grad_norm": 3.765625, "learning_rate": 9.999992262232145e-06, "loss": 1.4421, "step": 56 }, { "epoch": 0.00057, "grad_norm": 2.21875, "learning_rate": 9.999991983415968e-06, "loss": 1.5346, "step": 57 }, { "epoch": 0.00058, "grad_norm": 0.953125, "learning_rate": 9.999991699664996e-06, "loss": 1.5433, "step": 58 }, { "epoch": 0.00059, "grad_norm": 3.171875, "learning_rate": 9.99999141097923e-06, "loss": 1.5306, "step": 59 }, { "epoch": 0.0006, "grad_norm": 1.0546875, "learning_rate": 9.99999111735867e-06, "loss": 1.4453, "step": 60 }, { "epoch": 0.00061, "grad_norm": 0.796875, "learning_rate": 9.999990818803316e-06, "loss": 1.6219, "step": 61 }, { "epoch": 0.00062, "grad_norm": 1.078125, "learning_rate": 9.99999051531317e-06, "loss": 1.5296, "step": 62 }, { "epoch": 0.00063, "grad_norm": 1.2109375, "learning_rate": 9.999990206888231e-06, "loss": 1.5165, "step": 63 }, { "epoch": 0.00064, "grad_norm": 1.0234375, "learning_rate": 9.999989893528499e-06, "loss": 1.5697, "step": 64 }, { "epoch": 0.00065, "grad_norm": 3.0625, "learning_rate": 9.999989575233975e-06, "loss": 1.5121, "step": 65 }, { "epoch": 0.00066, "grad_norm": 0.9296875, "learning_rate": 9.999989252004657e-06, "loss": 1.4815, "step": 66 }, { "epoch": 0.00067, "grad_norm": 1.53125, "learning_rate": 9.999988923840551e-06, "loss": 1.4624, "step": 67 }, { "epoch": 0.00068, "grad_norm": 0.80859375, "learning_rate": 9.999988590741651e-06, "loss": 1.5089, "step": 68 }, { "epoch": 0.00069, "grad_norm": 1.4453125, "learning_rate": 9.999988252707961e-06, "loss": 1.4864, "step": 69 }, { "epoch": 0.0007, "grad_norm": 2.328125, "learning_rate": 9.999987909739481e-06, "loss": 1.5284, "step": 70 }, { "epoch": 0.00071, "grad_norm": 1.4296875, "learning_rate": 9.99998756183621e-06, "loss": 1.4759, "step": 71 }, { "epoch": 0.00072, "grad_norm": 1.2890625, "learning_rate": 9.999987208998151e-06, "loss": 1.5659, "step": 72 }, { "epoch": 0.00073, "grad_norm": 3.1875, "learning_rate": 9.9999868512253e-06, "loss": 1.6112, "step": 73 }, { "epoch": 0.00074, "grad_norm": 1.09375, "learning_rate": 9.999986488517661e-06, "loss": 1.503, "step": 74 }, { "epoch": 0.00075, "grad_norm": 0.80078125, "learning_rate": 9.999986120875233e-06, "loss": 1.4688, "step": 75 }, { "epoch": 0.00076, "grad_norm": 2.09375, "learning_rate": 9.999985748298016e-06, "loss": 1.543, "step": 76 }, { "epoch": 0.00077, "grad_norm": 0.89453125, "learning_rate": 9.999985370786011e-06, "loss": 1.5166, "step": 77 }, { "epoch": 0.00078, "grad_norm": 1.859375, "learning_rate": 9.999984988339219e-06, "loss": 1.5451, "step": 78 }, { "epoch": 0.00079, "grad_norm": 1.6640625, "learning_rate": 9.999984600957639e-06, "loss": 1.5026, "step": 79 }, { "epoch": 0.0008, "grad_norm": 1.5234375, "learning_rate": 9.999984208641271e-06, "loss": 1.4629, "step": 80 }, { "epoch": 0.00081, "grad_norm": 1.0, "learning_rate": 9.999983811390117e-06, "loss": 1.5866, "step": 81 }, { "epoch": 0.00082, "grad_norm": 1.2734375, "learning_rate": 9.999983409204178e-06, "loss": 1.4361, "step": 82 }, { "epoch": 0.00083, "grad_norm": 1.9609375, "learning_rate": 9.999983002083451e-06, "loss": 1.5498, "step": 83 }, { "epoch": 0.00084, "grad_norm": 3.125, "learning_rate": 9.999982590027942e-06, "loss": 1.5787, "step": 84 }, { "epoch": 0.00085, "grad_norm": 0.9453125, "learning_rate": 9.999982173037645e-06, "loss": 1.5674, "step": 85 }, { "epoch": 0.00086, "grad_norm": 0.91015625, "learning_rate": 9.999981751112563e-06, "loss": 1.5676, "step": 86 }, { "epoch": 0.00087, "grad_norm": 1.3203125, "learning_rate": 9.999981324252698e-06, "loss": 1.5031, "step": 87 }, { "epoch": 0.00088, "grad_norm": 1.40625, "learning_rate": 9.99998089245805e-06, "loss": 1.5127, "step": 88 }, { "epoch": 0.00089, "grad_norm": 0.92578125, "learning_rate": 9.999980455728618e-06, "loss": 1.5867, "step": 89 }, { "epoch": 0.0009, "grad_norm": 1.0078125, "learning_rate": 9.999980014064404e-06, "loss": 1.4413, "step": 90 }, { "epoch": 0.00091, "grad_norm": 0.93359375, "learning_rate": 9.999979567465405e-06, "loss": 1.3676, "step": 91 }, { "epoch": 0.00092, "grad_norm": 0.9609375, "learning_rate": 9.999979115931626e-06, "loss": 1.5553, "step": 92 }, { "epoch": 0.00093, "grad_norm": 0.75, "learning_rate": 9.999978659463065e-06, "loss": 1.4763, "step": 93 }, { "epoch": 0.00094, "grad_norm": 0.6796875, "learning_rate": 9.999978198059722e-06, "loss": 1.4483, "step": 94 }, { "epoch": 0.00095, "grad_norm": 0.8046875, "learning_rate": 9.9999777317216e-06, "loss": 1.5449, "step": 95 }, { "epoch": 0.00096, "grad_norm": 1.0859375, "learning_rate": 9.999977260448697e-06, "loss": 1.4965, "step": 96 }, { "epoch": 0.00097, "grad_norm": 1.0, "learning_rate": 9.999976784241014e-06, "loss": 1.596, "step": 97 }, { "epoch": 0.00098, "grad_norm": 0.94921875, "learning_rate": 9.999976303098552e-06, "loss": 1.5585, "step": 98 }, { "epoch": 0.00099, "grad_norm": 0.921875, "learning_rate": 9.99997581702131e-06, "loss": 1.501, "step": 99 }, { "epoch": 0.001, "grad_norm": 1.8984375, "learning_rate": 9.999975326009292e-06, "loss": 1.4553, "step": 100 }, { "epoch": 0.00101, "grad_norm": 0.7890625, "learning_rate": 9.999974830062494e-06, "loss": 1.5695, "step": 101 }, { "epoch": 0.00102, "grad_norm": 0.73046875, "learning_rate": 9.99997432918092e-06, "loss": 1.4606, "step": 102 }, { "epoch": 0.00103, "grad_norm": 0.98828125, "learning_rate": 9.999973823364568e-06, "loss": 1.5428, "step": 103 }, { "epoch": 0.00104, "grad_norm": 0.90234375, "learning_rate": 9.99997331261344e-06, "loss": 1.5081, "step": 104 }, { "epoch": 0.00105, "grad_norm": 1.875, "learning_rate": 9.999972796927537e-06, "loss": 1.5, "step": 105 }, { "epoch": 0.00106, "grad_norm": 0.95703125, "learning_rate": 9.999972276306858e-06, "loss": 1.5322, "step": 106 }, { "epoch": 0.00107, "grad_norm": 1.6015625, "learning_rate": 9.999971750751405e-06, "loss": 1.5786, "step": 107 }, { "epoch": 0.00108, "grad_norm": 1.4375, "learning_rate": 9.999971220261177e-06, "loss": 1.4891, "step": 108 }, { "epoch": 0.00109, "grad_norm": 1.34375, "learning_rate": 9.999970684836174e-06, "loss": 1.4762, "step": 109 }, { "epoch": 0.0011, "grad_norm": 0.80078125, "learning_rate": 9.9999701444764e-06, "loss": 1.5361, "step": 110 }, { "epoch": 0.00111, "grad_norm": 0.88671875, "learning_rate": 9.999969599181852e-06, "loss": 1.4876, "step": 111 }, { "epoch": 0.00112, "grad_norm": 0.80078125, "learning_rate": 9.999969048952532e-06, "loss": 1.4864, "step": 112 }, { "epoch": 0.00113, "grad_norm": 0.875, "learning_rate": 9.99996849378844e-06, "loss": 1.5043, "step": 113 }, { "epoch": 0.00114, "grad_norm": 0.8515625, "learning_rate": 9.999967933689577e-06, "loss": 1.4646, "step": 114 }, { "epoch": 0.00115, "grad_norm": 0.83203125, "learning_rate": 9.999967368655942e-06, "loss": 1.5111, "step": 115 }, { "epoch": 0.00116, "grad_norm": 1.125, "learning_rate": 9.99996679868754e-06, "loss": 1.4959, "step": 116 }, { "epoch": 0.00117, "grad_norm": 0.8984375, "learning_rate": 9.999966223784368e-06, "loss": 1.5595, "step": 117 }, { "epoch": 0.00118, "grad_norm": 3.703125, "learning_rate": 9.999965643946425e-06, "loss": 1.1576, "step": 118 }, { "epoch": 0.00119, "grad_norm": 0.96484375, "learning_rate": 9.999965059173715e-06, "loss": 1.5184, "step": 119 }, { "epoch": 0.0012, "grad_norm": 0.96484375, "learning_rate": 9.999964469466236e-06, "loss": 1.4559, "step": 120 }, { "epoch": 0.00121, "grad_norm": 1.03125, "learning_rate": 9.999963874823993e-06, "loss": 1.5143, "step": 121 }, { "epoch": 0.00122, "grad_norm": 0.6640625, "learning_rate": 9.999963275246983e-06, "loss": 1.5526, "step": 122 }, { "epoch": 0.00123, "grad_norm": 0.6640625, "learning_rate": 9.999962670735205e-06, "loss": 1.5328, "step": 123 }, { "epoch": 0.00124, "grad_norm": 0.65625, "learning_rate": 9.999962061288662e-06, "loss": 1.5333, "step": 124 }, { "epoch": 0.00125, "grad_norm": 0.69140625, "learning_rate": 9.999961446907354e-06, "loss": 1.5423, "step": 125 }, { "epoch": 0.00126, "grad_norm": 0.69140625, "learning_rate": 9.999960827591283e-06, "loss": 1.5687, "step": 126 }, { "epoch": 0.00127, "grad_norm": 0.671875, "learning_rate": 9.999960203340447e-06, "loss": 1.5186, "step": 127 }, { "epoch": 0.00128, "grad_norm": 0.609375, "learning_rate": 9.99995957415485e-06, "loss": 1.5319, "step": 128 }, { "epoch": 0.00129, "grad_norm": 0.6484375, "learning_rate": 9.99995894003449e-06, "loss": 1.4797, "step": 129 }, { "epoch": 0.0013, "grad_norm": 0.734375, "learning_rate": 9.999958300979367e-06, "loss": 1.5232, "step": 130 }, { "epoch": 0.00131, "grad_norm": 0.6953125, "learning_rate": 9.999957656989482e-06, "loss": 1.4463, "step": 131 }, { "epoch": 0.00132, "grad_norm": 0.65234375, "learning_rate": 9.99995700806484e-06, "loss": 1.4935, "step": 132 }, { "epoch": 0.00133, "grad_norm": 0.6484375, "learning_rate": 9.999956354205437e-06, "loss": 1.6034, "step": 133 }, { "epoch": 0.00134, "grad_norm": 0.6171875, "learning_rate": 9.999955695411274e-06, "loss": 1.5081, "step": 134 }, { "epoch": 0.00135, "grad_norm": 0.6640625, "learning_rate": 9.999955031682354e-06, "loss": 1.4503, "step": 135 }, { "epoch": 0.00136, "grad_norm": 0.5625, "learning_rate": 9.999954363018675e-06, "loss": 1.3321, "step": 136 }, { "epoch": 0.00137, "grad_norm": 0.83203125, "learning_rate": 9.999953689420238e-06, "loss": 1.5462, "step": 137 }, { "epoch": 0.00138, "grad_norm": 0.70703125, "learning_rate": 9.999953010887047e-06, "loss": 1.5052, "step": 138 }, { "epoch": 0.00139, "grad_norm": 0.609375, "learning_rate": 9.999952327419098e-06, "loss": 1.3766, "step": 139 }, { "epoch": 0.0014, "grad_norm": 0.66796875, "learning_rate": 9.999951639016396e-06, "loss": 1.6074, "step": 140 }, { "epoch": 0.00141, "grad_norm": 0.75390625, "learning_rate": 9.999950945678939e-06, "loss": 1.5381, "step": 141 }, { "epoch": 0.00142, "grad_norm": 0.61328125, "learning_rate": 9.999950247406725e-06, "loss": 1.5463, "step": 142 }, { "epoch": 0.00143, "grad_norm": 0.625, "learning_rate": 9.99994954419976e-06, "loss": 1.5013, "step": 143 }, { "epoch": 0.00144, "grad_norm": 0.61328125, "learning_rate": 9.999948836058045e-06, "loss": 1.4789, "step": 144 }, { "epoch": 0.00145, "grad_norm": 0.6171875, "learning_rate": 9.999948122981576e-06, "loss": 1.4872, "step": 145 }, { "epoch": 0.00146, "grad_norm": 0.609375, "learning_rate": 9.999947404970356e-06, "loss": 1.5426, "step": 146 }, { "epoch": 0.00147, "grad_norm": 0.68359375, "learning_rate": 9.999946682024386e-06, "loss": 1.5073, "step": 147 }, { "epoch": 0.00148, "grad_norm": 0.6015625, "learning_rate": 9.999945954143665e-06, "loss": 1.5537, "step": 148 }, { "epoch": 0.00149, "grad_norm": 0.89453125, "learning_rate": 9.999945221328198e-06, "loss": 1.5759, "step": 149 }, { "epoch": 0.0015, "grad_norm": 0.61328125, "learning_rate": 9.999944483577982e-06, "loss": 1.5019, "step": 150 } ], "logging_steps": 1.0, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.4582692888576e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }