{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.2,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.032,
      "grad_norm": 0.3297976851463318,
      "learning_rate": 0.0002990322580645161,
      "loss": 1.0389,
      "step": 1
    },
    {
      "epoch": 0.064,
      "grad_norm": 0.4069916307926178,
      "learning_rate": 0.0002980645161290322,
      "loss": 1.3377,
      "step": 2
    },
    {
      "epoch": 0.096,
      "grad_norm": 0.42084500193595886,
      "learning_rate": 0.00029709677419354836,
      "loss": 0.9366,
      "step": 3
    },
    {
      "epoch": 0.128,
      "grad_norm": 0.4641948938369751,
      "learning_rate": 0.0002961290322580645,
      "loss": 1.0086,
      "step": 4
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.3840750455856323,
      "learning_rate": 0.00029516129032258065,
      "loss": 0.8333,
      "step": 5
    },
    {
      "epoch": 0.192,
      "grad_norm": 0.4263865053653717,
      "learning_rate": 0.00029419354838709674,
      "loss": 0.854,
      "step": 6
    },
    {
      "epoch": 0.224,
      "grad_norm": 0.48615148663520813,
      "learning_rate": 0.0002932258064516129,
      "loss": 0.9548,
      "step": 7
    },
    {
      "epoch": 0.256,
      "grad_norm": 0.44419369101524353,
      "learning_rate": 0.00029225806451612903,
      "loss": 0.8482,
      "step": 8
    },
    {
      "epoch": 0.288,
      "grad_norm": 0.5317733883857727,
      "learning_rate": 0.0002912903225806451,
      "loss": 0.9426,
      "step": 9
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.47260937094688416,
      "learning_rate": 0.00029032258064516127,
      "loss": 0.9816,
      "step": 10
    },
    {
      "epoch": 0.352,
      "grad_norm": 0.39063283801078796,
      "learning_rate": 0.00028935483870967736,
      "loss": 0.84,
      "step": 11
    },
    {
      "epoch": 0.384,
      "grad_norm": 0.39234670996665955,
      "learning_rate": 0.0002883870967741935,
      "loss": 0.7476,
      "step": 12
    },
    {
      "epoch": 0.416,
      "grad_norm": 0.40661805868148804,
      "learning_rate": 0.00028741935483870965,
      "loss": 0.9282,
      "step": 13
    },
    {
      "epoch": 0.448,
      "grad_norm": 0.42970865964889526,
      "learning_rate": 0.0002864516129032258,
      "loss": 0.7858,
      "step": 14
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.3780193626880646,
      "learning_rate": 0.00028548387096774194,
      "loss": 0.7968,
      "step": 15
    },
    {
      "epoch": 0.512,
      "grad_norm": 0.37006014585494995,
      "learning_rate": 0.00028451612903225803,
      "loss": 0.6801,
      "step": 16
    },
    {
      "epoch": 0.544,
      "grad_norm": 0.3660840392112732,
      "learning_rate": 0.0002835483870967742,
      "loss": 0.5914,
      "step": 17
    },
    {
      "epoch": 0.576,
      "grad_norm": 0.3270975351333618,
      "learning_rate": 0.00028258064516129027,
      "loss": 0.6449,
      "step": 18
    },
    {
      "epoch": 0.608,
      "grad_norm": 0.3859024941921234,
      "learning_rate": 0.0002816129032258064,
      "loss": 0.8144,
      "step": 19
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.37092071771621704,
      "learning_rate": 0.00028064516129032256,
      "loss": 0.7667,
      "step": 20
    },
    {
      "epoch": 0.672,
      "grad_norm": 0.37667015194892883,
      "learning_rate": 0.0002796774193548387,
      "loss": 0.7751,
      "step": 21
    },
    {
      "epoch": 0.704,
      "grad_norm": 0.3832458555698395,
      "learning_rate": 0.0002787096774193548,
      "loss": 0.755,
      "step": 22
    },
    {
      "epoch": 0.736,
      "grad_norm": 0.327288419008255,
      "learning_rate": 0.00027774193548387095,
      "loss": 0.7178,
      "step": 23
    },
    {
      "epoch": 0.768,
      "grad_norm": 0.34552687406539917,
      "learning_rate": 0.0002767741935483871,
      "loss": 0.7057,
      "step": 24
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.3611259460449219,
      "learning_rate": 0.0002758064516129032,
      "loss": 0.8159,
      "step": 25
    },
    {
      "epoch": 0.832,
      "grad_norm": 0.3345054090023041,
      "learning_rate": 0.00027483870967741933,
      "loss": 0.7208,
      "step": 26
    },
    {
      "epoch": 0.864,
      "grad_norm": 0.3697254955768585,
      "learning_rate": 0.0002738709677419355,
      "loss": 0.8964,
      "step": 27
    },
    {
      "epoch": 0.896,
      "grad_norm": 0.3905017375946045,
      "learning_rate": 0.00027290322580645157,
      "loss": 0.7794,
      "step": 28
    },
    {
      "epoch": 0.928,
      "grad_norm": 0.3715725243091583,
      "learning_rate": 0.0002719354838709677,
      "loss": 0.6966,
      "step": 29
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.3650343120098114,
      "learning_rate": 0.00027096774193548386,
      "loss": 0.5761,
      "step": 30
    },
    {
      "epoch": 0.992,
      "grad_norm": 0.33932459354400635,
      "learning_rate": 0.00027,
      "loss": 0.556,
      "step": 31
    },
    {
      "epoch": 1.024,
      "grad_norm": 0.6371742486953735,
      "learning_rate": 0.0002690322580645161,
      "loss": 0.847,
      "step": 32
    },
    {
      "epoch": 1.056,
      "grad_norm": 0.37499895691871643,
      "learning_rate": 0.00026806451612903224,
      "loss": 0.8419,
      "step": 33
    },
    {
      "epoch": 1.088,
      "grad_norm": 0.33221954107284546,
      "learning_rate": 0.0002670967741935484,
      "loss": 0.6011,
      "step": 34
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.344096839427948,
      "learning_rate": 0.0002661290322580645,
      "loss": 0.6501,
      "step": 35
    },
    {
      "epoch": 1.152,
      "grad_norm": 0.38429391384124756,
      "learning_rate": 0.0002651612903225806,
      "loss": 0.8091,
      "step": 36
    },
    {
      "epoch": 1.184,
      "grad_norm": 0.38014867901802063,
      "learning_rate": 0.00026419354838709677,
      "loss": 0.7668,
      "step": 37
    },
    {
      "epoch": 1.216,
      "grad_norm": 0.3352573812007904,
      "learning_rate": 0.00026322580645161286,
      "loss": 0.5444,
      "step": 38
    },
    {
      "epoch": 1.248,
      "grad_norm": 0.33811062574386597,
      "learning_rate": 0.000262258064516129,
      "loss": 0.512,
      "step": 39
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.3998416066169739,
      "learning_rate": 0.00026129032258064515,
      "loss": 0.6315,
      "step": 40
    },
    {
      "epoch": 1.312,
      "grad_norm": 0.3983341157436371,
      "learning_rate": 0.0002603225806451613,
      "loss": 0.5882,
      "step": 41
    },
    {
      "epoch": 1.3439999999999999,
      "grad_norm": 0.4585898816585541,
      "learning_rate": 0.0002593548387096774,
      "loss": 0.761,
      "step": 42
    },
    {
      "epoch": 1.376,
      "grad_norm": 0.4080730080604553,
      "learning_rate": 0.00025838709677419354,
      "loss": 0.6716,
      "step": 43
    },
    {
      "epoch": 1.408,
      "grad_norm": 0.4068273901939392,
      "learning_rate": 0.0002574193548387096,
      "loss": 0.6376,
      "step": 44
    },
    {
      "epoch": 1.44,
      "grad_norm": 0.4406949579715729,
      "learning_rate": 0.00025645161290322577,
      "loss": 0.4594,
      "step": 45
    },
    {
      "epoch": 1.472,
      "grad_norm": 0.34500986337661743,
      "learning_rate": 0.0002554838709677419,
      "loss": 0.3672,
      "step": 46
    },
    {
      "epoch": 1.504,
      "grad_norm": 0.4760681390762329,
      "learning_rate": 0.00025451612903225806,
      "loss": 0.6331,
      "step": 47
    },
    {
      "epoch": 1.536,
      "grad_norm": 0.39281558990478516,
      "learning_rate": 0.0002535483870967742,
      "loss": 0.5845,
      "step": 48
    },
    {
      "epoch": 1.568,
      "grad_norm": 0.4265002906322479,
      "learning_rate": 0.0002525806451612903,
      "loss": 0.4461,
      "step": 49
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.40967294573783875,
      "learning_rate": 0.00025161290322580645,
      "loss": 0.7011,
      "step": 50
    },
    {
      "epoch": 1.6320000000000001,
      "grad_norm": 0.4288088381290436,
      "learning_rate": 0.00025064516129032254,
      "loss": 0.6928,
      "step": 51
    },
    {
      "epoch": 1.6640000000000001,
      "grad_norm": 0.4356289803981781,
      "learning_rate": 0.0002496774193548387,
      "loss": 0.7972,
      "step": 52
    },
    {
      "epoch": 1.696,
      "grad_norm": 0.3827487826347351,
      "learning_rate": 0.0002487096774193548,
      "loss": 0.2991,
      "step": 53
    },
    {
      "epoch": 1.728,
      "grad_norm": 0.40093398094177246,
      "learning_rate": 0.0002477419354838709,
      "loss": 0.416,
      "step": 54
    },
    {
      "epoch": 1.76,
      "grad_norm": 0.41548973321914673,
      "learning_rate": 0.00024677419354838707,
      "loss": 0.5501,
      "step": 55
    },
    {
      "epoch": 1.792,
      "grad_norm": 0.4093388617038727,
      "learning_rate": 0.0002458064516129032,
      "loss": 0.5557,
      "step": 56
    },
    {
      "epoch": 1.8239999999999998,
      "grad_norm": 0.3934040665626526,
      "learning_rate": 0.00024483870967741936,
      "loss": 0.602,
      "step": 57
    },
    {
      "epoch": 1.8559999999999999,
      "grad_norm": 0.42221033573150635,
      "learning_rate": 0.00024387096774193545,
      "loss": 0.6421,
      "step": 58
    },
    {
      "epoch": 1.888,
      "grad_norm": 0.4351339340209961,
      "learning_rate": 0.0002429032258064516,
      "loss": 0.5615,
      "step": 59
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.4319838881492615,
      "learning_rate": 0.00024193548387096771,
      "loss": 0.6804,
      "step": 60
    },
    {
      "epoch": 1.952,
      "grad_norm": 0.40016525983810425,
      "learning_rate": 0.00024096774193548386,
      "loss": 0.5432,
      "step": 61
    },
    {
      "epoch": 1.984,
      "grad_norm": 0.3905942440032959,
      "learning_rate": 0.00023999999999999998,
      "loss": 0.4187,
      "step": 62
    },
    {
      "epoch": 2.016,
      "grad_norm": 0.8056382536888123,
      "learning_rate": 0.0002390322580645161,
      "loss": 1.0174,
      "step": 63
    },
    {
      "epoch": 2.048,
      "grad_norm": 0.3835236430168152,
      "learning_rate": 0.00023806451612903224,
      "loss": 0.5992,
      "step": 64
    },
    {
      "epoch": 2.08,
      "grad_norm": 0.41092216968536377,
      "learning_rate": 0.00023709677419354836,
      "loss": 0.4746,
      "step": 65
    },
    {
      "epoch": 2.112,
      "grad_norm": 0.39536622166633606,
      "learning_rate": 0.0002361290322580645,
      "loss": 0.3946,
      "step": 66
    },
    {
      "epoch": 2.144,
      "grad_norm": 0.3927665948867798,
      "learning_rate": 0.0002351612903225806,
      "loss": 0.5187,
      "step": 67
    },
    {
      "epoch": 2.176,
      "grad_norm": 0.39792704582214355,
      "learning_rate": 0.00023419354838709674,
      "loss": 0.4568,
      "step": 68
    },
    {
      "epoch": 2.208,
      "grad_norm": 0.5023652911186218,
      "learning_rate": 0.0002332258064516129,
      "loss": 0.6166,
      "step": 69
    },
    {
      "epoch": 2.24,
      "grad_norm": 0.425017774105072,
      "learning_rate": 0.000232258064516129,
      "loss": 0.42,
      "step": 70
    },
    {
      "epoch": 2.2720000000000002,
      "grad_norm": 0.46458110213279724,
      "learning_rate": 0.00023129032258064516,
      "loss": 0.4613,
      "step": 71
    },
    {
      "epoch": 2.304,
      "grad_norm": 0.49037960171699524,
      "learning_rate": 0.00023032258064516125,
      "loss": 0.5509,
      "step": 72
    },
    {
      "epoch": 2.336,
      "grad_norm": 0.5233697891235352,
      "learning_rate": 0.0002293548387096774,
      "loss": 0.6396,
      "step": 73
    },
    {
      "epoch": 2.368,
      "grad_norm": 0.4720582962036133,
      "learning_rate": 0.0002283870967741935,
      "loss": 0.5076,
      "step": 74
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.4900650382041931,
      "learning_rate": 0.00022741935483870966,
      "loss": 0.4794,
      "step": 75
    },
    {
      "epoch": 2.432,
      "grad_norm": 0.6321704983711243,
      "learning_rate": 0.0002264516129032258,
      "loss": 0.6677,
      "step": 76
    },
    {
      "epoch": 2.464,
      "grad_norm": 0.5305324792861938,
      "learning_rate": 0.00022548387096774192,
      "loss": 0.5102,
      "step": 77
    },
    {
      "epoch": 2.496,
      "grad_norm": 0.5799248218536377,
      "learning_rate": 0.00022451612903225804,
      "loss": 0.5274,
      "step": 78
    },
    {
      "epoch": 2.528,
      "grad_norm": 0.4990101456642151,
      "learning_rate": 0.00022354838709677416,
      "loss": 0.5407,
      "step": 79
    },
    {
      "epoch": 2.56,
      "grad_norm": 0.4779827296733856,
      "learning_rate": 0.0002225806451612903,
      "loss": 0.5166,
      "step": 80
    },
    {
      "epoch": 2.592,
      "grad_norm": 0.5140111446380615,
      "learning_rate": 0.00022161290322580645,
      "loss": 0.3288,
      "step": 81
    },
    {
      "epoch": 2.624,
      "grad_norm": 0.5674853920936584,
      "learning_rate": 0.00022064516129032257,
      "loss": 0.666,
      "step": 82
    },
    {
      "epoch": 2.656,
      "grad_norm": 0.5277597308158875,
      "learning_rate": 0.00021967741935483871,
      "loss": 0.5335,
      "step": 83
    },
    {
      "epoch": 2.6879999999999997,
      "grad_norm": 0.6029439568519592,
      "learning_rate": 0.0002187096774193548,
      "loss": 0.693,
      "step": 84
    },
    {
      "epoch": 2.7199999999999998,
      "grad_norm": 0.5039327144622803,
      "learning_rate": 0.00021774193548387095,
      "loss": 0.5728,
      "step": 85
    },
    {
      "epoch": 2.752,
      "grad_norm": 0.5564692616462708,
      "learning_rate": 0.00021677419354838707,
      "loss": 0.4734,
      "step": 86
    },
    {
      "epoch": 2.784,
      "grad_norm": 0.5278319120407104,
      "learning_rate": 0.00021580645161290322,
      "loss": 0.5834,
      "step": 87
    },
    {
      "epoch": 2.816,
      "grad_norm": 0.5445135831832886,
      "learning_rate": 0.00021483870967741936,
      "loss": 0.4642,
      "step": 88
    },
    {
      "epoch": 2.848,
      "grad_norm": 0.5394749045372009,
      "learning_rate": 0.00021387096774193545,
      "loss": 0.4779,
      "step": 89
    },
    {
      "epoch": 2.88,
      "grad_norm": 0.5756134390830994,
      "learning_rate": 0.0002129032258064516,
      "loss": 0.5607,
      "step": 90
    },
    {
      "epoch": 2.912,
      "grad_norm": 0.48361241817474365,
      "learning_rate": 0.00021193548387096772,
      "loss": 0.4278,
      "step": 91
    },
    {
      "epoch": 2.944,
      "grad_norm": 0.5017121434211731,
      "learning_rate": 0.00021096774193548386,
      "loss": 0.4834,
      "step": 92
    },
    {
      "epoch": 2.976,
      "grad_norm": 0.4741989076137543,
      "learning_rate": 0.00020999999999999998,
      "loss": 0.468,
      "step": 93
    },
    {
      "epoch": 3.008,
      "grad_norm": 1.003368854522705,
      "learning_rate": 0.0002090322580645161,
      "loss": 0.8614,
      "step": 94
    },
    {
      "epoch": 3.04,
      "grad_norm": 0.4782228469848633,
      "learning_rate": 0.00020806451612903225,
      "loss": 0.4111,
      "step": 95
    },
    {
      "epoch": 3.072,
      "grad_norm": 0.4558674395084381,
      "learning_rate": 0.00020709677419354836,
      "loss": 0.3463,
      "step": 96
    },
    {
      "epoch": 3.104,
      "grad_norm": 0.4409371316432953,
      "learning_rate": 0.0002061290322580645,
      "loss": 0.2571,
      "step": 97
    },
    {
      "epoch": 3.136,
      "grad_norm": 0.5415034890174866,
      "learning_rate": 0.00020516129032258063,
      "loss": 0.5707,
      "step": 98
    },
    {
      "epoch": 3.168,
      "grad_norm": 0.6157724857330322,
      "learning_rate": 0.00020419354838709677,
      "loss": 0.5692,
      "step": 99
    },
    {
      "epoch": 3.2,
      "grad_norm": 0.4855688810348511,
      "learning_rate": 0.00020322580645161287,
      "loss": 0.3311,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 310,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.3700930822144e+16,
  "train_batch_size": 3,
  "trial_name": null,
  "trial_params": null
}