{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0005, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1e-05, "grad_norm": 8.875, "learning_rate": 9.999999997532599e-06, "loss": 1.6459, "step": 1 }, { "epoch": 2e-05, "grad_norm": 4.40625, "learning_rate": 9.999999990130395e-06, "loss": 1.6742, "step": 2 }, { "epoch": 3e-05, "grad_norm": 4.3125, "learning_rate": 9.99999997779339e-06, "loss": 1.6223, "step": 3 }, { "epoch": 4e-05, "grad_norm": 4.0625, "learning_rate": 9.999999960521582e-06, "loss": 1.5398, "step": 4 }, { "epoch": 5e-05, "grad_norm": 3.375, "learning_rate": 9.999999938314972e-06, "loss": 1.5666, "step": 5 }, { "epoch": 6e-05, "grad_norm": 1.859375, "learning_rate": 9.999999911173561e-06, "loss": 1.5981, "step": 6 }, { "epoch": 7e-05, "grad_norm": 1.734375, "learning_rate": 9.999999879097347e-06, "loss": 1.644, "step": 7 }, { "epoch": 8e-05, "grad_norm": 1.796875, "learning_rate": 9.999999842086332e-06, "loss": 1.6331, "step": 8 }, { "epoch": 9e-05, "grad_norm": 1.3125, "learning_rate": 9.999999800140514e-06, "loss": 1.626, "step": 9 }, { "epoch": 0.0001, "grad_norm": 1.9296875, "learning_rate": 9.999999753259893e-06, "loss": 1.5778, "step": 10 }, { "epoch": 0.00011, "grad_norm": 1.34375, "learning_rate": 9.99999970144447e-06, "loss": 1.6286, "step": 11 }, { "epoch": 0.00012, "grad_norm": 1.203125, "learning_rate": 9.999999644694247e-06, "loss": 1.5614, "step": 12 }, { "epoch": 0.00013, "grad_norm": 1.015625, "learning_rate": 9.999999583009221e-06, "loss": 1.6447, "step": 13 }, { "epoch": 0.00014, "grad_norm": 1.3359375, "learning_rate": 9.999999516389394e-06, "loss": 1.5258, "step": 14 }, { "epoch": 0.00015, "grad_norm": 1.25, "learning_rate": 9.999999444834763e-06, "loss": 1.6336, "step": 15 }, { "epoch": 0.00016, "grad_norm": 1.5546875, "learning_rate": 9.999999368345333e-06, "loss": 1.6073, "step": 16 }, { "epoch": 0.00017, "grad_norm": 1.34375, "learning_rate": 9.999999286921101e-06, "loss": 1.5919, "step": 17 }, { "epoch": 0.00018, "grad_norm": 0.96875, "learning_rate": 9.999999200562065e-06, "loss": 1.543, "step": 18 }, { "epoch": 0.00019, "grad_norm": 1.6875, "learning_rate": 9.99999910926823e-06, "loss": 1.6101, "step": 19 }, { "epoch": 0.0002, "grad_norm": 1.578125, "learning_rate": 9.999999013039593e-06, "loss": 1.5796, "step": 20 }, { "epoch": 0.00021, "grad_norm": 2.578125, "learning_rate": 9.999998911876154e-06, "loss": 1.5748, "step": 21 }, { "epoch": 0.00022, "grad_norm": 1.203125, "learning_rate": 9.999998805777915e-06, "loss": 1.5479, "step": 22 }, { "epoch": 0.00023, "grad_norm": 1.4921875, "learning_rate": 9.999998694744875e-06, "loss": 1.5318, "step": 23 }, { "epoch": 0.00024, "grad_norm": 1.125, "learning_rate": 9.999998578777036e-06, "loss": 1.6259, "step": 24 }, { "epoch": 0.00025, "grad_norm": 2.21875, "learning_rate": 9.999998457874392e-06, "loss": 1.5525, "step": 25 }, { "epoch": 0.00026, "grad_norm": 3.234375, "learning_rate": 9.99999833203695e-06, "loss": 1.5576, "step": 26 }, { "epoch": 0.00027, "grad_norm": 2.046875, "learning_rate": 9.999998201264707e-06, "loss": 1.3934, "step": 27 }, { "epoch": 0.00028, "grad_norm": 3.15625, "learning_rate": 9.999998065557664e-06, "loss": 1.5423, "step": 28 }, { "epoch": 0.00029, "grad_norm": 1.2734375, "learning_rate": 9.999997924915818e-06, "loss": 1.5679, "step": 29 }, { "epoch": 0.0003, "grad_norm": 1.625, "learning_rate": 9.999997779339175e-06, "loss": 1.5329, "step": 30 }, { "epoch": 0.00031, "grad_norm": 1.2421875, "learning_rate": 9.999997628827732e-06, "loss": 1.4603, "step": 31 }, { "epoch": 0.00032, "grad_norm": 1.46875, "learning_rate": 9.999997473381487e-06, "loss": 1.5774, "step": 32 }, { "epoch": 0.00033, "grad_norm": 2.0625, "learning_rate": 9.999997313000444e-06, "loss": 1.5522, "step": 33 }, { "epoch": 0.00034, "grad_norm": 1.421875, "learning_rate": 9.9999971476846e-06, "loss": 1.5964, "step": 34 }, { "epoch": 0.00035, "grad_norm": 1.59375, "learning_rate": 9.999996977433957e-06, "loss": 1.6129, "step": 35 }, { "epoch": 0.00036, "grad_norm": 1.78125, "learning_rate": 9.999996802248514e-06, "loss": 1.548, "step": 36 }, { "epoch": 0.00037, "grad_norm": 0.94140625, "learning_rate": 9.999996622128274e-06, "loss": 1.5662, "step": 37 }, { "epoch": 0.00038, "grad_norm": 4.84375, "learning_rate": 9.999996437073236e-06, "loss": 1.6197, "step": 38 }, { "epoch": 0.00039, "grad_norm": 3.234375, "learning_rate": 9.999996247083397e-06, "loss": 1.5308, "step": 39 }, { "epoch": 0.0004, "grad_norm": 1.375, "learning_rate": 9.99999605215876e-06, "loss": 1.5846, "step": 40 }, { "epoch": 0.00041, "grad_norm": 1.34375, "learning_rate": 9.999995852299324e-06, "loss": 1.4274, "step": 41 }, { "epoch": 0.00042, "grad_norm": 1.15625, "learning_rate": 9.999995647505092e-06, "loss": 1.4986, "step": 42 }, { "epoch": 0.00043, "grad_norm": 0.94140625, "learning_rate": 9.99999543777606e-06, "loss": 1.5135, "step": 43 }, { "epoch": 0.00044, "grad_norm": 0.8125, "learning_rate": 9.999995223112231e-06, "loss": 1.5472, "step": 44 }, { "epoch": 0.00045, "grad_norm": 1.5703125, "learning_rate": 9.999995003513605e-06, "loss": 1.5635, "step": 45 }, { "epoch": 0.00046, "grad_norm": 3.34375, "learning_rate": 9.999994778980182e-06, "loss": 1.5506, "step": 46 }, { "epoch": 0.00047, "grad_norm": 5.4375, "learning_rate": 9.99999454951196e-06, "loss": 1.5071, "step": 47 }, { "epoch": 0.00048, "grad_norm": 1.578125, "learning_rate": 9.999994315108943e-06, "loss": 1.5532, "step": 48 }, { "epoch": 0.00049, "grad_norm": 1.828125, "learning_rate": 9.999994075771128e-06, "loss": 1.6061, "step": 49 }, { "epoch": 0.0005, "grad_norm": 1.4453125, "learning_rate": 9.999993831498517e-06, "loss": 1.5629, "step": 50 } ], "logging_steps": 1.0, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1527564296192e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }