{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.2, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.032, "grad_norm": 0.3297976851463318, "learning_rate": 0.0002990322580645161, "loss": 1.0389, "step": 1 }, { "epoch": 0.064, "grad_norm": 0.4069916307926178, "learning_rate": 0.0002980645161290322, "loss": 1.3377, "step": 2 }, { "epoch": 0.096, "grad_norm": 0.42084500193595886, "learning_rate": 0.00029709677419354836, "loss": 0.9366, "step": 3 }, { "epoch": 0.128, "grad_norm": 0.4641948938369751, "learning_rate": 0.0002961290322580645, "loss": 1.0086, "step": 4 }, { "epoch": 0.16, "grad_norm": 0.3840750455856323, "learning_rate": 0.00029516129032258065, "loss": 0.8333, "step": 5 }, { "epoch": 0.192, "grad_norm": 0.4263865053653717, "learning_rate": 0.00029419354838709674, "loss": 0.854, "step": 6 }, { "epoch": 0.224, "grad_norm": 0.48615148663520813, "learning_rate": 0.0002932258064516129, "loss": 0.9548, "step": 7 }, { "epoch": 0.256, "grad_norm": 0.44419369101524353, "learning_rate": 0.00029225806451612903, "loss": 0.8482, "step": 8 }, { "epoch": 0.288, "grad_norm": 0.5317733883857727, "learning_rate": 0.0002912903225806451, "loss": 0.9426, "step": 9 }, { "epoch": 0.32, "grad_norm": 0.47260937094688416, "learning_rate": 0.00029032258064516127, "loss": 0.9816, "step": 10 }, { "epoch": 0.352, "grad_norm": 0.39063283801078796, "learning_rate": 0.00028935483870967736, "loss": 0.84, "step": 11 }, { "epoch": 0.384, "grad_norm": 0.39234670996665955, "learning_rate": 0.0002883870967741935, "loss": 0.7476, "step": 12 }, { "epoch": 0.416, "grad_norm": 0.40661805868148804, "learning_rate": 0.00028741935483870965, "loss": 0.9282, "step": 13 }, { "epoch": 0.448, "grad_norm": 0.42970865964889526, "learning_rate": 0.0002864516129032258, "loss": 0.7858, "step": 14 }, { "epoch": 0.48, "grad_norm": 0.3780193626880646, "learning_rate": 0.00028548387096774194, "loss": 0.7968, "step": 15 }, { "epoch": 0.512, "grad_norm": 0.37006014585494995, "learning_rate": 0.00028451612903225803, "loss": 0.6801, "step": 16 }, { "epoch": 0.544, "grad_norm": 0.3660840392112732, "learning_rate": 0.0002835483870967742, "loss": 0.5914, "step": 17 }, { "epoch": 0.576, "grad_norm": 0.3270975351333618, "learning_rate": 0.00028258064516129027, "loss": 0.6449, "step": 18 }, { "epoch": 0.608, "grad_norm": 0.3859024941921234, "learning_rate": 0.0002816129032258064, "loss": 0.8144, "step": 19 }, { "epoch": 0.64, "grad_norm": 0.37092071771621704, "learning_rate": 0.00028064516129032256, "loss": 0.7667, "step": 20 }, { "epoch": 0.672, "grad_norm": 0.37667015194892883, "learning_rate": 0.0002796774193548387, "loss": 0.7751, "step": 21 }, { "epoch": 0.704, "grad_norm": 0.3832458555698395, "learning_rate": 0.0002787096774193548, "loss": 0.755, "step": 22 }, { "epoch": 0.736, "grad_norm": 0.327288419008255, "learning_rate": 0.00027774193548387095, "loss": 0.7178, "step": 23 }, { "epoch": 0.768, "grad_norm": 0.34552687406539917, "learning_rate": 0.0002767741935483871, "loss": 0.7057, "step": 24 }, { "epoch": 0.8, "grad_norm": 0.3611259460449219, "learning_rate": 0.0002758064516129032, "loss": 0.8159, "step": 25 }, { "epoch": 0.832, "grad_norm": 0.3345054090023041, "learning_rate": 0.00027483870967741933, "loss": 0.7208, "step": 26 }, { "epoch": 0.864, "grad_norm": 0.3697254955768585, "learning_rate": 0.0002738709677419355, "loss": 0.8964, "step": 27 }, { "epoch": 0.896, "grad_norm": 0.3905017375946045, "learning_rate": 0.00027290322580645157, "loss": 0.7794, "step": 28 }, { "epoch": 0.928, "grad_norm": 0.3715725243091583, "learning_rate": 0.0002719354838709677, "loss": 0.6966, "step": 29 }, { "epoch": 0.96, "grad_norm": 0.3650343120098114, "learning_rate": 0.00027096774193548386, "loss": 0.5761, "step": 30 }, { "epoch": 0.992, "grad_norm": 0.33932459354400635, "learning_rate": 0.00027, "loss": 0.556, "step": 31 }, { "epoch": 1.024, "grad_norm": 0.6371742486953735, "learning_rate": 0.0002690322580645161, "loss": 0.847, "step": 32 }, { "epoch": 1.056, "grad_norm": 0.37499895691871643, "learning_rate": 0.00026806451612903224, "loss": 0.8419, "step": 33 }, { "epoch": 1.088, "grad_norm": 0.33221954107284546, "learning_rate": 0.0002670967741935484, "loss": 0.6011, "step": 34 }, { "epoch": 1.12, "grad_norm": 0.344096839427948, "learning_rate": 0.0002661290322580645, "loss": 0.6501, "step": 35 }, { "epoch": 1.152, "grad_norm": 0.38429391384124756, "learning_rate": 0.0002651612903225806, "loss": 0.8091, "step": 36 }, { "epoch": 1.184, "grad_norm": 0.38014867901802063, "learning_rate": 0.00026419354838709677, "loss": 0.7668, "step": 37 }, { "epoch": 1.216, "grad_norm": 0.3352573812007904, "learning_rate": 0.00026322580645161286, "loss": 0.5444, "step": 38 }, { "epoch": 1.248, "grad_norm": 0.33811062574386597, "learning_rate": 0.000262258064516129, "loss": 0.512, "step": 39 }, { "epoch": 1.28, "grad_norm": 0.3998416066169739, "learning_rate": 0.00026129032258064515, "loss": 0.6315, "step": 40 }, { "epoch": 1.312, "grad_norm": 0.3983341157436371, "learning_rate": 0.0002603225806451613, "loss": 0.5882, "step": 41 }, { "epoch": 1.3439999999999999, "grad_norm": 0.4585898816585541, "learning_rate": 0.0002593548387096774, "loss": 0.761, "step": 42 }, { "epoch": 1.376, "grad_norm": 0.4080730080604553, "learning_rate": 0.00025838709677419354, "loss": 0.6716, "step": 43 }, { "epoch": 1.408, "grad_norm": 0.4068273901939392, "learning_rate": 0.0002574193548387096, "loss": 0.6376, "step": 44 }, { "epoch": 1.44, "grad_norm": 0.4406949579715729, "learning_rate": 0.00025645161290322577, "loss": 0.4594, "step": 45 }, { "epoch": 1.472, "grad_norm": 0.34500986337661743, "learning_rate": 0.0002554838709677419, "loss": 0.3672, "step": 46 }, { "epoch": 1.504, "grad_norm": 0.4760681390762329, "learning_rate": 0.00025451612903225806, "loss": 0.6331, "step": 47 }, { "epoch": 1.536, "grad_norm": 0.39281558990478516, "learning_rate": 0.0002535483870967742, "loss": 0.5845, "step": 48 }, { "epoch": 1.568, "grad_norm": 0.4265002906322479, "learning_rate": 0.0002525806451612903, "loss": 0.4461, "step": 49 }, { "epoch": 1.6, "grad_norm": 0.40967294573783875, "learning_rate": 0.00025161290322580645, "loss": 0.7011, "step": 50 }, { "epoch": 1.6320000000000001, "grad_norm": 0.4288088381290436, "learning_rate": 0.00025064516129032254, "loss": 0.6928, "step": 51 }, { "epoch": 1.6640000000000001, "grad_norm": 0.4356289803981781, "learning_rate": 0.0002496774193548387, "loss": 0.7972, "step": 52 }, { "epoch": 1.696, "grad_norm": 0.3827487826347351, "learning_rate": 0.0002487096774193548, "loss": 0.2991, "step": 53 }, { "epoch": 1.728, "grad_norm": 0.40093398094177246, "learning_rate": 0.0002477419354838709, "loss": 0.416, "step": 54 }, { "epoch": 1.76, "grad_norm": 0.41548973321914673, "learning_rate": 0.00024677419354838707, "loss": 0.5501, "step": 55 }, { "epoch": 1.792, "grad_norm": 0.4093388617038727, "learning_rate": 0.0002458064516129032, "loss": 0.5557, "step": 56 }, { "epoch": 1.8239999999999998, "grad_norm": 0.3934040665626526, "learning_rate": 0.00024483870967741936, "loss": 0.602, "step": 57 }, { "epoch": 1.8559999999999999, "grad_norm": 0.42221033573150635, "learning_rate": 0.00024387096774193545, "loss": 0.6421, "step": 58 }, { "epoch": 1.888, "grad_norm": 0.4351339340209961, "learning_rate": 0.0002429032258064516, "loss": 0.5615, "step": 59 }, { "epoch": 1.92, "grad_norm": 0.4319838881492615, "learning_rate": 0.00024193548387096771, "loss": 0.6804, "step": 60 }, { "epoch": 1.952, "grad_norm": 0.40016525983810425, "learning_rate": 0.00024096774193548386, "loss": 0.5432, "step": 61 }, { "epoch": 1.984, "grad_norm": 0.3905942440032959, "learning_rate": 0.00023999999999999998, "loss": 0.4187, "step": 62 }, { "epoch": 2.016, "grad_norm": 0.8056382536888123, "learning_rate": 0.0002390322580645161, "loss": 1.0174, "step": 63 }, { "epoch": 2.048, "grad_norm": 0.3835236430168152, "learning_rate": 0.00023806451612903224, "loss": 0.5992, "step": 64 }, { "epoch": 2.08, "grad_norm": 0.41092216968536377, "learning_rate": 0.00023709677419354836, "loss": 0.4746, "step": 65 }, { "epoch": 2.112, "grad_norm": 0.39536622166633606, "learning_rate": 0.0002361290322580645, "loss": 0.3946, "step": 66 }, { "epoch": 2.144, "grad_norm": 0.3927665948867798, "learning_rate": 0.0002351612903225806, "loss": 0.5187, "step": 67 }, { "epoch": 2.176, "grad_norm": 0.39792704582214355, "learning_rate": 0.00023419354838709674, "loss": 0.4568, "step": 68 }, { "epoch": 2.208, "grad_norm": 0.5023652911186218, "learning_rate": 0.0002332258064516129, "loss": 0.6166, "step": 69 }, { "epoch": 2.24, "grad_norm": 0.425017774105072, "learning_rate": 0.000232258064516129, "loss": 0.42, "step": 70 }, { "epoch": 2.2720000000000002, "grad_norm": 0.46458110213279724, "learning_rate": 0.00023129032258064516, "loss": 0.4613, "step": 71 }, { "epoch": 2.304, "grad_norm": 0.49037960171699524, "learning_rate": 0.00023032258064516125, "loss": 0.5509, "step": 72 }, { "epoch": 2.336, "grad_norm": 0.5233697891235352, "learning_rate": 0.0002293548387096774, "loss": 0.6396, "step": 73 }, { "epoch": 2.368, "grad_norm": 0.4720582962036133, "learning_rate": 0.0002283870967741935, "loss": 0.5076, "step": 74 }, { "epoch": 2.4, "grad_norm": 0.4900650382041931, "learning_rate": 0.00022741935483870966, "loss": 0.4794, "step": 75 }, { "epoch": 2.432, "grad_norm": 0.6321704983711243, "learning_rate": 0.0002264516129032258, "loss": 0.6677, "step": 76 }, { "epoch": 2.464, "grad_norm": 0.5305324792861938, "learning_rate": 0.00022548387096774192, "loss": 0.5102, "step": 77 }, { "epoch": 2.496, "grad_norm": 0.5799248218536377, "learning_rate": 0.00022451612903225804, "loss": 0.5274, "step": 78 }, { "epoch": 2.528, "grad_norm": 0.4990101456642151, "learning_rate": 0.00022354838709677416, "loss": 0.5407, "step": 79 }, { "epoch": 2.56, "grad_norm": 0.4779827296733856, "learning_rate": 0.0002225806451612903, "loss": 0.5166, "step": 80 }, { "epoch": 2.592, "grad_norm": 0.5140111446380615, "learning_rate": 0.00022161290322580645, "loss": 0.3288, "step": 81 }, { "epoch": 2.624, "grad_norm": 0.5674853920936584, "learning_rate": 0.00022064516129032257, "loss": 0.666, "step": 82 }, { "epoch": 2.656, "grad_norm": 0.5277597308158875, "learning_rate": 0.00021967741935483871, "loss": 0.5335, "step": 83 }, { "epoch": 2.6879999999999997, "grad_norm": 0.6029439568519592, "learning_rate": 0.0002187096774193548, "loss": 0.693, "step": 84 }, { "epoch": 2.7199999999999998, "grad_norm": 0.5039327144622803, "learning_rate": 0.00021774193548387095, "loss": 0.5728, "step": 85 }, { "epoch": 2.752, "grad_norm": 0.5564692616462708, "learning_rate": 0.00021677419354838707, "loss": 0.4734, "step": 86 }, { "epoch": 2.784, "grad_norm": 0.5278319120407104, "learning_rate": 0.00021580645161290322, "loss": 0.5834, "step": 87 }, { "epoch": 2.816, "grad_norm": 0.5445135831832886, "learning_rate": 0.00021483870967741936, "loss": 0.4642, "step": 88 }, { "epoch": 2.848, "grad_norm": 0.5394749045372009, "learning_rate": 0.00021387096774193545, "loss": 0.4779, "step": 89 }, { "epoch": 2.88, "grad_norm": 0.5756134390830994, "learning_rate": 0.0002129032258064516, "loss": 0.5607, "step": 90 }, { "epoch": 2.912, "grad_norm": 0.48361241817474365, "learning_rate": 0.00021193548387096772, "loss": 0.4278, "step": 91 }, { "epoch": 2.944, "grad_norm": 0.5017121434211731, "learning_rate": 0.00021096774193548386, "loss": 0.4834, "step": 92 }, { "epoch": 2.976, "grad_norm": 0.4741989076137543, "learning_rate": 0.00020999999999999998, "loss": 0.468, "step": 93 }, { "epoch": 3.008, "grad_norm": 1.003368854522705, "learning_rate": 0.0002090322580645161, "loss": 0.8614, "step": 94 }, { "epoch": 3.04, "grad_norm": 0.4782228469848633, "learning_rate": 0.00020806451612903225, "loss": 0.4111, "step": 95 }, { "epoch": 3.072, "grad_norm": 0.4558674395084381, "learning_rate": 0.00020709677419354836, "loss": 0.3463, "step": 96 }, { "epoch": 3.104, "grad_norm": 0.4409371316432953, "learning_rate": 0.0002061290322580645, "loss": 0.2571, "step": 97 }, { "epoch": 3.136, "grad_norm": 0.5415034890174866, "learning_rate": 0.00020516129032258063, "loss": 0.5707, "step": 98 }, { "epoch": 3.168, "grad_norm": 0.6157724857330322, "learning_rate": 0.00020419354838709677, "loss": 0.5692, "step": 99 }, { "epoch": 3.2, "grad_norm": 0.4855688810348511, "learning_rate": 0.00020322580645161287, "loss": 0.3311, "step": 100 } ], "logging_steps": 1, "max_steps": 310, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.3700930822144e+16, "train_batch_size": 3, "trial_name": null, "trial_params": null }