{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.001, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1e-05, "grad_norm": 8.875, "learning_rate": 9.999999997532599e-06, "loss": 1.6459, "step": 1 }, { "epoch": 2e-05, "grad_norm": 4.40625, "learning_rate": 9.999999990130395e-06, "loss": 1.6742, "step": 2 }, { "epoch": 3e-05, "grad_norm": 4.3125, "learning_rate": 9.99999997779339e-06, "loss": 1.6223, "step": 3 }, { "epoch": 4e-05, "grad_norm": 4.0625, "learning_rate": 9.999999960521582e-06, "loss": 1.5398, "step": 4 }, { "epoch": 5e-05, "grad_norm": 3.375, "learning_rate": 9.999999938314972e-06, "loss": 1.5666, "step": 5 }, { "epoch": 6e-05, "grad_norm": 1.859375, "learning_rate": 9.999999911173561e-06, "loss": 1.5981, "step": 6 }, { "epoch": 7e-05, "grad_norm": 1.734375, "learning_rate": 9.999999879097347e-06, "loss": 1.644, "step": 7 }, { "epoch": 8e-05, "grad_norm": 1.796875, "learning_rate": 9.999999842086332e-06, "loss": 1.6331, "step": 8 }, { "epoch": 9e-05, "grad_norm": 1.3125, "learning_rate": 9.999999800140514e-06, "loss": 1.626, "step": 9 }, { "epoch": 0.0001, "grad_norm": 1.9296875, "learning_rate": 9.999999753259893e-06, "loss": 1.5778, "step": 10 }, { "epoch": 0.00011, "grad_norm": 1.34375, "learning_rate": 9.99999970144447e-06, "loss": 1.6286, "step": 11 }, { "epoch": 0.00012, "grad_norm": 1.203125, "learning_rate": 9.999999644694247e-06, "loss": 1.5614, "step": 12 }, { "epoch": 0.00013, "grad_norm": 1.015625, "learning_rate": 9.999999583009221e-06, "loss": 1.6447, "step": 13 }, { "epoch": 0.00014, "grad_norm": 1.3359375, "learning_rate": 9.999999516389394e-06, "loss": 1.5258, "step": 14 }, { "epoch": 0.00015, "grad_norm": 1.25, "learning_rate": 9.999999444834763e-06, "loss": 1.6336, "step": 15 }, { "epoch": 0.00016, "grad_norm": 1.5546875, "learning_rate": 9.999999368345333e-06, "loss": 1.6073, "step": 16 }, { "epoch": 0.00017, "grad_norm": 1.34375, "learning_rate": 9.999999286921101e-06, "loss": 1.5919, "step": 17 }, { "epoch": 0.00018, "grad_norm": 0.96875, "learning_rate": 9.999999200562065e-06, "loss": 1.543, "step": 18 }, { "epoch": 0.00019, "grad_norm": 1.6875, "learning_rate": 9.99999910926823e-06, "loss": 1.6101, "step": 19 }, { "epoch": 0.0002, "grad_norm": 1.578125, "learning_rate": 9.999999013039593e-06, "loss": 1.5796, "step": 20 }, { "epoch": 0.00021, "grad_norm": 2.578125, "learning_rate": 9.999998911876154e-06, "loss": 1.5748, "step": 21 }, { "epoch": 0.00022, "grad_norm": 1.203125, "learning_rate": 9.999998805777915e-06, "loss": 1.5479, "step": 22 }, { "epoch": 0.00023, "grad_norm": 1.4921875, "learning_rate": 9.999998694744875e-06, "loss": 1.5318, "step": 23 }, { "epoch": 0.00024, "grad_norm": 1.125, "learning_rate": 9.999998578777036e-06, "loss": 1.6259, "step": 24 }, { "epoch": 0.00025, "grad_norm": 2.21875, "learning_rate": 9.999998457874392e-06, "loss": 1.5525, "step": 25 }, { "epoch": 0.00026, "grad_norm": 3.234375, "learning_rate": 9.99999833203695e-06, "loss": 1.5576, "step": 26 }, { "epoch": 0.00027, "grad_norm": 2.046875, "learning_rate": 9.999998201264707e-06, "loss": 1.3934, "step": 27 }, { "epoch": 0.00028, "grad_norm": 3.15625, "learning_rate": 9.999998065557664e-06, "loss": 1.5423, "step": 28 }, { "epoch": 0.00029, "grad_norm": 1.2734375, "learning_rate": 9.999997924915818e-06, "loss": 1.5679, "step": 29 }, { "epoch": 0.0003, "grad_norm": 1.625, "learning_rate": 9.999997779339175e-06, "loss": 1.5329, "step": 30 }, { "epoch": 0.00031, "grad_norm": 1.2421875, "learning_rate": 9.999997628827732e-06, "loss": 1.4603, "step": 31 }, { "epoch": 0.00032, "grad_norm": 1.46875, "learning_rate": 9.999997473381487e-06, "loss": 1.5774, "step": 32 }, { "epoch": 0.00033, "grad_norm": 2.0625, "learning_rate": 9.999997313000444e-06, "loss": 1.5522, "step": 33 }, { "epoch": 0.00034, "grad_norm": 1.421875, "learning_rate": 9.9999971476846e-06, "loss": 1.5964, "step": 34 }, { "epoch": 0.00035, "grad_norm": 1.59375, "learning_rate": 9.999996977433957e-06, "loss": 1.6129, "step": 35 }, { "epoch": 0.00036, "grad_norm": 1.78125, "learning_rate": 9.999996802248514e-06, "loss": 1.548, "step": 36 }, { "epoch": 0.00037, "grad_norm": 0.94140625, "learning_rate": 9.999996622128274e-06, "loss": 1.5662, "step": 37 }, { "epoch": 0.00038, "grad_norm": 4.84375, "learning_rate": 9.999996437073236e-06, "loss": 1.6197, "step": 38 }, { "epoch": 0.00039, "grad_norm": 3.234375, "learning_rate": 9.999996247083397e-06, "loss": 1.5308, "step": 39 }, { "epoch": 0.0004, "grad_norm": 1.375, "learning_rate": 9.99999605215876e-06, "loss": 1.5846, "step": 40 }, { "epoch": 0.00041, "grad_norm": 1.34375, "learning_rate": 9.999995852299324e-06, "loss": 1.4274, "step": 41 }, { "epoch": 0.00042, "grad_norm": 1.15625, "learning_rate": 9.999995647505092e-06, "loss": 1.4986, "step": 42 }, { "epoch": 0.00043, "grad_norm": 0.94140625, "learning_rate": 9.99999543777606e-06, "loss": 1.5135, "step": 43 }, { "epoch": 0.00044, "grad_norm": 0.8125, "learning_rate": 9.999995223112231e-06, "loss": 1.5472, "step": 44 }, { "epoch": 0.00045, "grad_norm": 1.5703125, "learning_rate": 9.999995003513605e-06, "loss": 1.5635, "step": 45 }, { "epoch": 0.00046, "grad_norm": 3.34375, "learning_rate": 9.999994778980182e-06, "loss": 1.5506, "step": 46 }, { "epoch": 0.00047, "grad_norm": 5.4375, "learning_rate": 9.99999454951196e-06, "loss": 1.5071, "step": 47 }, { "epoch": 0.00048, "grad_norm": 1.578125, "learning_rate": 9.999994315108943e-06, "loss": 1.5532, "step": 48 }, { "epoch": 0.00049, "grad_norm": 1.828125, "learning_rate": 9.999994075771128e-06, "loss": 1.6061, "step": 49 }, { "epoch": 0.0005, "grad_norm": 1.4453125, "learning_rate": 9.999993831498517e-06, "loss": 1.5629, "step": 50 }, { "epoch": 0.00051, "grad_norm": 1.109375, "learning_rate": 9.999993582291112e-06, "loss": 1.5536, "step": 51 }, { "epoch": 0.00052, "grad_norm": 1.0078125, "learning_rate": 9.999993328148909e-06, "loss": 1.5013, "step": 52 }, { "epoch": 0.00053, "grad_norm": 1.171875, "learning_rate": 9.999993069071912e-06, "loss": 1.4943, "step": 53 }, { "epoch": 0.00054, "grad_norm": 1.8203125, "learning_rate": 9.999992805060117e-06, "loss": 1.5057, "step": 54 }, { "epoch": 0.00055, "grad_norm": 0.97265625, "learning_rate": 9.99999253611353e-06, "loss": 1.6486, "step": 55 }, { "epoch": 0.00056, "grad_norm": 3.765625, "learning_rate": 9.999992262232145e-06, "loss": 1.4421, "step": 56 }, { "epoch": 0.00057, "grad_norm": 2.21875, "learning_rate": 9.999991983415968e-06, "loss": 1.5346, "step": 57 }, { "epoch": 0.00058, "grad_norm": 0.953125, "learning_rate": 9.999991699664996e-06, "loss": 1.5433, "step": 58 }, { "epoch": 0.00059, "grad_norm": 3.171875, "learning_rate": 9.99999141097923e-06, "loss": 1.5306, "step": 59 }, { "epoch": 0.0006, "grad_norm": 1.0546875, "learning_rate": 9.99999111735867e-06, "loss": 1.4453, "step": 60 }, { "epoch": 0.00061, "grad_norm": 0.796875, "learning_rate": 9.999990818803316e-06, "loss": 1.6219, "step": 61 }, { "epoch": 0.00062, "grad_norm": 1.078125, "learning_rate": 9.99999051531317e-06, "loss": 1.5296, "step": 62 }, { "epoch": 0.00063, "grad_norm": 1.2109375, "learning_rate": 9.999990206888231e-06, "loss": 1.5165, "step": 63 }, { "epoch": 0.00064, "grad_norm": 1.0234375, "learning_rate": 9.999989893528499e-06, "loss": 1.5697, "step": 64 }, { "epoch": 0.00065, "grad_norm": 3.0625, "learning_rate": 9.999989575233975e-06, "loss": 1.5121, "step": 65 }, { "epoch": 0.00066, "grad_norm": 0.9296875, "learning_rate": 9.999989252004657e-06, "loss": 1.4815, "step": 66 }, { "epoch": 0.00067, "grad_norm": 1.53125, "learning_rate": 9.999988923840551e-06, "loss": 1.4624, "step": 67 }, { "epoch": 0.00068, "grad_norm": 0.80859375, "learning_rate": 9.999988590741651e-06, "loss": 1.5089, "step": 68 }, { "epoch": 0.00069, "grad_norm": 1.4453125, "learning_rate": 9.999988252707961e-06, "loss": 1.4864, "step": 69 }, { "epoch": 0.0007, "grad_norm": 2.328125, "learning_rate": 9.999987909739481e-06, "loss": 1.5284, "step": 70 }, { "epoch": 0.00071, "grad_norm": 1.4296875, "learning_rate": 9.99998756183621e-06, "loss": 1.4759, "step": 71 }, { "epoch": 0.00072, "grad_norm": 1.2890625, "learning_rate": 9.999987208998151e-06, "loss": 1.5659, "step": 72 }, { "epoch": 0.00073, "grad_norm": 3.1875, "learning_rate": 9.9999868512253e-06, "loss": 1.6112, "step": 73 }, { "epoch": 0.00074, "grad_norm": 1.09375, "learning_rate": 9.999986488517661e-06, "loss": 1.503, "step": 74 }, { "epoch": 0.00075, "grad_norm": 0.80078125, "learning_rate": 9.999986120875233e-06, "loss": 1.4688, "step": 75 }, { "epoch": 0.00076, "grad_norm": 2.09375, "learning_rate": 9.999985748298016e-06, "loss": 1.543, "step": 76 }, { "epoch": 0.00077, "grad_norm": 0.89453125, "learning_rate": 9.999985370786011e-06, "loss": 1.5166, "step": 77 }, { "epoch": 0.00078, "grad_norm": 1.859375, "learning_rate": 9.999984988339219e-06, "loss": 1.5451, "step": 78 }, { "epoch": 0.00079, "grad_norm": 1.6640625, "learning_rate": 9.999984600957639e-06, "loss": 1.5026, "step": 79 }, { "epoch": 0.0008, "grad_norm": 1.5234375, "learning_rate": 9.999984208641271e-06, "loss": 1.4629, "step": 80 }, { "epoch": 0.00081, "grad_norm": 1.0, "learning_rate": 9.999983811390117e-06, "loss": 1.5866, "step": 81 }, { "epoch": 0.00082, "grad_norm": 1.2734375, "learning_rate": 9.999983409204178e-06, "loss": 1.4361, "step": 82 }, { "epoch": 0.00083, "grad_norm": 1.9609375, "learning_rate": 9.999983002083451e-06, "loss": 1.5498, "step": 83 }, { "epoch": 0.00084, "grad_norm": 3.125, "learning_rate": 9.999982590027942e-06, "loss": 1.5787, "step": 84 }, { "epoch": 0.00085, "grad_norm": 0.9453125, "learning_rate": 9.999982173037645e-06, "loss": 1.5674, "step": 85 }, { "epoch": 0.00086, "grad_norm": 0.91015625, "learning_rate": 9.999981751112563e-06, "loss": 1.5676, "step": 86 }, { "epoch": 0.00087, "grad_norm": 1.3203125, "learning_rate": 9.999981324252698e-06, "loss": 1.5031, "step": 87 }, { "epoch": 0.00088, "grad_norm": 1.40625, "learning_rate": 9.99998089245805e-06, "loss": 1.5127, "step": 88 }, { "epoch": 0.00089, "grad_norm": 0.92578125, "learning_rate": 9.999980455728618e-06, "loss": 1.5867, "step": 89 }, { "epoch": 0.0009, "grad_norm": 1.0078125, "learning_rate": 9.999980014064404e-06, "loss": 1.4413, "step": 90 }, { "epoch": 0.00091, "grad_norm": 0.93359375, "learning_rate": 9.999979567465405e-06, "loss": 1.3676, "step": 91 }, { "epoch": 0.00092, "grad_norm": 0.9609375, "learning_rate": 9.999979115931626e-06, "loss": 1.5553, "step": 92 }, { "epoch": 0.00093, "grad_norm": 0.75, "learning_rate": 9.999978659463065e-06, "loss": 1.4763, "step": 93 }, { "epoch": 0.00094, "grad_norm": 0.6796875, "learning_rate": 9.999978198059722e-06, "loss": 1.4483, "step": 94 }, { "epoch": 0.00095, "grad_norm": 0.8046875, "learning_rate": 9.9999777317216e-06, "loss": 1.5449, "step": 95 }, { "epoch": 0.00096, "grad_norm": 1.0859375, "learning_rate": 9.999977260448697e-06, "loss": 1.4965, "step": 96 }, { "epoch": 0.00097, "grad_norm": 1.0, "learning_rate": 9.999976784241014e-06, "loss": 1.596, "step": 97 }, { "epoch": 0.00098, "grad_norm": 0.94921875, "learning_rate": 9.999976303098552e-06, "loss": 1.5585, "step": 98 }, { "epoch": 0.00099, "grad_norm": 0.921875, "learning_rate": 9.99997581702131e-06, "loss": 1.501, "step": 99 }, { "epoch": 0.001, "grad_norm": 1.8984375, "learning_rate": 9.999975326009292e-06, "loss": 1.4553, "step": 100 } ], "logging_steps": 1.0, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.3055128592384e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }