{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.02,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0002,
      "grad_norm": 1.1219478845596313,
      "learning_rate": 6.666666666666667e-05,
      "loss": 2.2204,
      "step": 1
    },
    {
      "epoch": 0.0004,
      "grad_norm": 1.0918514728546143,
      "learning_rate": 0.00013333333333333334,
      "loss": 2.7879,
      "step": 2
    },
    {
      "epoch": 0.0006,
      "grad_norm": 0.5491233468055725,
      "learning_rate": 0.0002,
      "loss": 2.6259,
      "step": 3
    },
    {
      "epoch": 0.0008,
      "grad_norm": 0.5795004963874817,
      "learning_rate": 0.00019793814432989693,
      "loss": 2.7188,
      "step": 4
    },
    {
      "epoch": 0.001,
      "grad_norm": 0.4776715934276581,
      "learning_rate": 0.00019587628865979381,
      "loss": 2.3886,
      "step": 5
    },
    {
      "epoch": 0.0012,
      "grad_norm": 0.5436866283416748,
      "learning_rate": 0.00019381443298969073,
      "loss": 2.2441,
      "step": 6
    },
    {
      "epoch": 0.0014,
      "grad_norm": 0.5175201296806335,
      "learning_rate": 0.00019175257731958765,
      "loss": 2.5653,
      "step": 7
    },
    {
      "epoch": 0.0016,
      "grad_norm": 0.4766971170902252,
      "learning_rate": 0.00018969072164948454,
      "loss": 2.1695,
      "step": 8
    },
    {
      "epoch": 0.0018,
      "grad_norm": 0.534247875213623,
      "learning_rate": 0.00018762886597938145,
      "loss": 1.9714,
      "step": 9
    },
    {
      "epoch": 0.002,
      "grad_norm": 0.47341188788414,
      "learning_rate": 0.00018556701030927837,
      "loss": 2.01,
      "step": 10
    },
    {
      "epoch": 0.0022,
      "grad_norm": 0.5537518262863159,
      "learning_rate": 0.00018350515463917526,
      "loss": 2.3351,
      "step": 11
    },
    {
      "epoch": 0.0024,
      "grad_norm": 0.6112836003303528,
      "learning_rate": 0.00018144329896907217,
      "loss": 1.8101,
      "step": 12
    },
    {
      "epoch": 0.0026,
      "grad_norm": 0.4965449869632721,
      "learning_rate": 0.0001793814432989691,
      "loss": 1.7917,
      "step": 13
    },
    {
      "epoch": 0.0028,
      "grad_norm": 0.515200138092041,
      "learning_rate": 0.00017731958762886598,
      "loss": 1.9197,
      "step": 14
    },
    {
      "epoch": 0.003,
      "grad_norm": 0.5032430291175842,
      "learning_rate": 0.0001752577319587629,
      "loss": 2.0435,
      "step": 15
    },
    {
      "epoch": 0.0032,
      "grad_norm": 0.4674118459224701,
      "learning_rate": 0.0001731958762886598,
      "loss": 1.7773,
      "step": 16
    },
    {
      "epoch": 0.0034,
      "grad_norm": 0.5223268866539001,
      "learning_rate": 0.0001711340206185567,
      "loss": 1.9218,
      "step": 17
    },
    {
      "epoch": 0.0036,
      "grad_norm": 1.04414963722229,
      "learning_rate": 0.00016907216494845361,
      "loss": 2.0631,
      "step": 18
    },
    {
      "epoch": 0.0038,
      "grad_norm": 0.5213132500648499,
      "learning_rate": 0.00016701030927835053,
      "loss": 2.1922,
      "step": 19
    },
    {
      "epoch": 0.004,
      "grad_norm": 0.5390024781227112,
      "learning_rate": 0.00016494845360824742,
      "loss": 2.2059,
      "step": 20
    },
    {
      "epoch": 0.0042,
      "grad_norm": 0.4950883388519287,
      "learning_rate": 0.00016288659793814434,
      "loss": 1.9657,
      "step": 21
    },
    {
      "epoch": 0.0044,
      "grad_norm": 0.4840719699859619,
      "learning_rate": 0.00016082474226804125,
      "loss": 1.9423,
      "step": 22
    },
    {
      "epoch": 0.0046,
      "grad_norm": 0.48367372155189514,
      "learning_rate": 0.00015876288659793814,
      "loss": 1.8103,
      "step": 23
    },
    {
      "epoch": 0.0048,
      "grad_norm": 0.704979419708252,
      "learning_rate": 0.00015670103092783506,
      "loss": 2.0108,
      "step": 24
    },
    {
      "epoch": 0.005,
      "grad_norm": 0.5910339951515198,
      "learning_rate": 0.00015463917525773197,
      "loss": 1.9767,
      "step": 25
    },
    {
      "epoch": 0.0052,
      "grad_norm": 0.7773292660713196,
      "learning_rate": 0.00015257731958762886,
      "loss": 2.1536,
      "step": 26
    },
    {
      "epoch": 0.0054,
      "grad_norm": 0.5155345797538757,
      "learning_rate": 0.00015051546391752578,
      "loss": 2.1462,
      "step": 27
    },
    {
      "epoch": 0.0056,
      "grad_norm": 0.510045051574707,
      "learning_rate": 0.0001484536082474227,
      "loss": 1.9734,
      "step": 28
    },
    {
      "epoch": 0.0058,
      "grad_norm": 0.527519166469574,
      "learning_rate": 0.00014639175257731958,
      "loss": 2.0963,
      "step": 29
    },
    {
      "epoch": 0.006,
      "grad_norm": 0.4819324016571045,
      "learning_rate": 0.0001443298969072165,
      "loss": 1.969,
      "step": 30
    },
    {
      "epoch": 0.0062,
      "grad_norm": 0.44363436102867126,
      "learning_rate": 0.00014226804123711342,
      "loss": 1.8159,
      "step": 31
    },
    {
      "epoch": 0.0064,
      "grad_norm": 0.4479809105396271,
      "learning_rate": 0.0001402061855670103,
      "loss": 1.6981,
      "step": 32
    },
    {
      "epoch": 0.0066,
      "grad_norm": 0.42961710691452026,
      "learning_rate": 0.00013814432989690722,
      "loss": 1.6154,
      "step": 33
    },
    {
      "epoch": 0.0068,
      "grad_norm": 0.47089216113090515,
      "learning_rate": 0.00013608247422680414,
      "loss": 1.8148,
      "step": 34
    },
    {
      "epoch": 0.007,
      "grad_norm": 0.5263468623161316,
      "learning_rate": 0.00013402061855670103,
      "loss": 2.1879,
      "step": 35
    },
    {
      "epoch": 0.0072,
      "grad_norm": 0.4599800109863281,
      "learning_rate": 0.00013195876288659794,
      "loss": 1.6743,
      "step": 36
    },
    {
      "epoch": 0.0074,
      "grad_norm": 0.505596399307251,
      "learning_rate": 0.00012989690721649486,
      "loss": 1.6814,
      "step": 37
    },
    {
      "epoch": 0.0076,
      "grad_norm": 0.5242177248001099,
      "learning_rate": 0.00012783505154639175,
      "loss": 1.9174,
      "step": 38
    },
    {
      "epoch": 0.0078,
      "grad_norm": 0.5053271651268005,
      "learning_rate": 0.00012577319587628866,
      "loss": 1.7547,
      "step": 39
    },
    {
      "epoch": 0.008,
      "grad_norm": 0.5670127868652344,
      "learning_rate": 0.00012371134020618558,
      "loss": 2.0127,
      "step": 40
    },
    {
      "epoch": 0.0082,
      "grad_norm": 0.608784019947052,
      "learning_rate": 0.00012164948453608247,
      "loss": 1.401,
      "step": 41
    },
    {
      "epoch": 0.0084,
      "grad_norm": 0.48027142882347107,
      "learning_rate": 0.00011958762886597938,
      "loss": 1.5524,
      "step": 42
    },
    {
      "epoch": 0.0086,
      "grad_norm": 0.4793825149536133,
      "learning_rate": 0.0001175257731958763,
      "loss": 1.4128,
      "step": 43
    },
    {
      "epoch": 0.0088,
      "grad_norm": 0.5311229825019836,
      "learning_rate": 0.00011546391752577319,
      "loss": 1.5239,
      "step": 44
    },
    {
      "epoch": 0.009,
      "grad_norm": 0.4911600351333618,
      "learning_rate": 0.0001134020618556701,
      "loss": 1.4944,
      "step": 45
    },
    {
      "epoch": 0.0092,
      "grad_norm": 0.5163685083389282,
      "learning_rate": 0.00011134020618556702,
      "loss": 1.5343,
      "step": 46
    },
    {
      "epoch": 0.0094,
      "grad_norm": 0.47748181223869324,
      "learning_rate": 0.00010927835051546391,
      "loss": 1.2547,
      "step": 47
    },
    {
      "epoch": 0.0096,
      "grad_norm": 0.6499949097633362,
      "learning_rate": 0.00010721649484536083,
      "loss": 1.8584,
      "step": 48
    },
    {
      "epoch": 0.0098,
      "grad_norm": 0.6305411458015442,
      "learning_rate": 0.00010515463917525774,
      "loss": 1.2266,
      "step": 49
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.48604005575180054,
      "learning_rate": 0.00010309278350515463,
      "loss": 1.0111,
      "step": 50
    },
    {
      "epoch": 0.0102,
      "grad_norm": 0.42038169503211975,
      "learning_rate": 0.00010103092783505155,
      "loss": 2.0069,
      "step": 51
    },
    {
      "epoch": 0.0104,
      "grad_norm": 0.41026800870895386,
      "learning_rate": 9.896907216494846e-05,
      "loss": 1.9782,
      "step": 52
    },
    {
      "epoch": 0.0106,
      "grad_norm": 0.42623117566108704,
      "learning_rate": 9.690721649484537e-05,
      "loss": 2.1141,
      "step": 53
    },
    {
      "epoch": 0.0108,
      "grad_norm": 0.4193218946456909,
      "learning_rate": 9.484536082474227e-05,
      "loss": 1.8971,
      "step": 54
    },
    {
      "epoch": 0.011,
      "grad_norm": 0.43847334384918213,
      "learning_rate": 9.278350515463918e-05,
      "loss": 1.7533,
      "step": 55
    },
    {
      "epoch": 0.0112,
      "grad_norm": 0.46704810857772827,
      "learning_rate": 9.072164948453609e-05,
      "loss": 2.0783,
      "step": 56
    },
    {
      "epoch": 0.0114,
      "grad_norm": 0.47083616256713867,
      "learning_rate": 8.865979381443299e-05,
      "loss": 2.2859,
      "step": 57
    },
    {
      "epoch": 0.0116,
      "grad_norm": 0.3852657079696655,
      "learning_rate": 8.65979381443299e-05,
      "loss": 1.6736,
      "step": 58
    },
    {
      "epoch": 0.0118,
      "grad_norm": 0.4302668273448944,
      "learning_rate": 8.453608247422681e-05,
      "loss": 2.144,
      "step": 59
    },
    {
      "epoch": 0.012,
      "grad_norm": 0.4399866759777069,
      "learning_rate": 8.247422680412371e-05,
      "loss": 2.2605,
      "step": 60
    },
    {
      "epoch": 0.0122,
      "grad_norm": 0.4025079905986786,
      "learning_rate": 8.041237113402063e-05,
      "loss": 1.9658,
      "step": 61
    },
    {
      "epoch": 0.0124,
      "grad_norm": 0.4048742949962616,
      "learning_rate": 7.835051546391753e-05,
      "loss": 2.0939,
      "step": 62
    },
    {
      "epoch": 0.0126,
      "grad_norm": 0.38197118043899536,
      "learning_rate": 7.628865979381443e-05,
      "loss": 1.974,
      "step": 63
    },
    {
      "epoch": 0.0128,
      "grad_norm": 0.423852801322937,
      "learning_rate": 7.422680412371135e-05,
      "loss": 1.8456,
      "step": 64
    },
    {
      "epoch": 0.013,
      "grad_norm": 0.3875732123851776,
      "learning_rate": 7.216494845360825e-05,
      "loss": 1.8775,
      "step": 65
    },
    {
      "epoch": 0.0132,
      "grad_norm": 0.400006502866745,
      "learning_rate": 7.010309278350515e-05,
      "loss": 1.9451,
      "step": 66
    },
    {
      "epoch": 0.0134,
      "grad_norm": 0.4780627191066742,
      "learning_rate": 6.804123711340207e-05,
      "loss": 2.1757,
      "step": 67
    },
    {
      "epoch": 0.0136,
      "grad_norm": 0.4084922969341278,
      "learning_rate": 6.597938144329897e-05,
      "loss": 2.0411,
      "step": 68
    },
    {
      "epoch": 0.0138,
      "grad_norm": 0.42807289958000183,
      "learning_rate": 6.391752577319587e-05,
      "loss": 2.3942,
      "step": 69
    },
    {
      "epoch": 0.014,
      "grad_norm": 0.411408394575119,
      "learning_rate": 6.185567010309279e-05,
      "loss": 2.0569,
      "step": 70
    },
    {
      "epoch": 0.0142,
      "grad_norm": 0.3994385004043579,
      "learning_rate": 5.979381443298969e-05,
      "loss": 1.8811,
      "step": 71
    },
    {
      "epoch": 0.0144,
      "grad_norm": 0.4302988648414612,
      "learning_rate": 5.7731958762886594e-05,
      "loss": 1.8287,
      "step": 72
    },
    {
      "epoch": 0.0146,
      "grad_norm": 0.43813201785087585,
      "learning_rate": 5.567010309278351e-05,
      "loss": 2.0936,
      "step": 73
    },
    {
      "epoch": 0.0148,
      "grad_norm": 0.41323378682136536,
      "learning_rate": 5.360824742268041e-05,
      "loss": 1.8877,
      "step": 74
    },
    {
      "epoch": 0.015,
      "grad_norm": 0.43512141704559326,
      "learning_rate": 5.1546391752577315e-05,
      "loss": 1.8419,
      "step": 75
    },
    {
      "epoch": 0.0152,
      "grad_norm": 0.46052488684654236,
      "learning_rate": 4.948453608247423e-05,
      "loss": 1.9927,
      "step": 76
    },
    {
      "epoch": 0.0154,
      "grad_norm": 0.3917546272277832,
      "learning_rate": 4.7422680412371134e-05,
      "loss": 1.6892,
      "step": 77
    },
    {
      "epoch": 0.0156,
      "grad_norm": 0.4031961262226105,
      "learning_rate": 4.536082474226804e-05,
      "loss": 1.7624,
      "step": 78
    },
    {
      "epoch": 0.0158,
      "grad_norm": 0.4103787839412689,
      "learning_rate": 4.329896907216495e-05,
      "loss": 1.8082,
      "step": 79
    },
    {
      "epoch": 0.016,
      "grad_norm": 0.41262394189834595,
      "learning_rate": 4.1237113402061855e-05,
      "loss": 1.8105,
      "step": 80
    },
    {
      "epoch": 0.0162,
      "grad_norm": 0.41158345341682434,
      "learning_rate": 3.9175257731958764e-05,
      "loss": 1.8007,
      "step": 81
    },
    {
      "epoch": 0.0164,
      "grad_norm": 0.4416099786758423,
      "learning_rate": 3.7113402061855674e-05,
      "loss": 1.7926,
      "step": 82
    },
    {
      "epoch": 0.0166,
      "grad_norm": 0.45428088307380676,
      "learning_rate": 3.5051546391752576e-05,
      "loss": 1.9448,
      "step": 83
    },
    {
      "epoch": 0.0168,
      "grad_norm": 0.48291468620300293,
      "learning_rate": 3.2989690721649485e-05,
      "loss": 2.0836,
      "step": 84
    },
    {
      "epoch": 0.017,
      "grad_norm": 0.46375176310539246,
      "learning_rate": 3.0927835051546395e-05,
      "loss": 1.9663,
      "step": 85
    },
    {
      "epoch": 0.0172,
      "grad_norm": 0.4698142409324646,
      "learning_rate": 2.8865979381443297e-05,
      "loss": 1.8841,
      "step": 86
    },
    {
      "epoch": 0.0174,
      "grad_norm": 0.48520392179489136,
      "learning_rate": 2.6804123711340206e-05,
      "loss": 1.9849,
      "step": 87
    },
    {
      "epoch": 0.0176,
      "grad_norm": 0.49598732590675354,
      "learning_rate": 2.4742268041237116e-05,
      "loss": 1.9452,
      "step": 88
    },
    {
      "epoch": 0.0178,
      "grad_norm": 0.4195699691772461,
      "learning_rate": 2.268041237113402e-05,
      "loss": 1.3229,
      "step": 89
    },
    {
      "epoch": 0.018,
      "grad_norm": 0.6012292504310608,
      "learning_rate": 2.0618556701030927e-05,
      "loss": 1.7777,
      "step": 90
    },
    {
      "epoch": 0.0182,
      "grad_norm": 0.44116055965423584,
      "learning_rate": 1.8556701030927837e-05,
      "loss": 1.5625,
      "step": 91
    },
    {
      "epoch": 0.0184,
      "grad_norm": 0.5609015226364136,
      "learning_rate": 1.6494845360824743e-05,
      "loss": 1.9485,
      "step": 92
    },
    {
      "epoch": 0.0186,
      "grad_norm": 0.5063101053237915,
      "learning_rate": 1.4432989690721649e-05,
      "loss": 1.5872,
      "step": 93
    },
    {
      "epoch": 0.0188,
      "grad_norm": 0.45711398124694824,
      "learning_rate": 1.2371134020618558e-05,
      "loss": 1.4548,
      "step": 94
    },
    {
      "epoch": 0.019,
      "grad_norm": 0.5572103261947632,
      "learning_rate": 1.0309278350515464e-05,
      "loss": 1.8041,
      "step": 95
    },
    {
      "epoch": 0.0192,
      "grad_norm": 0.5026547908782959,
      "learning_rate": 8.247422680412371e-06,
      "loss": 1.4015,
      "step": 96
    },
    {
      "epoch": 0.0194,
      "grad_norm": 0.5131281018257141,
      "learning_rate": 6.185567010309279e-06,
      "loss": 1.3779,
      "step": 97
    },
    {
      "epoch": 0.0196,
      "grad_norm": 0.5810936689376831,
      "learning_rate": 4.123711340206186e-06,
      "loss": 1.4948,
      "step": 98
    },
    {
      "epoch": 0.0198,
      "grad_norm": 0.5234789848327637,
      "learning_rate": 2.061855670103093e-06,
      "loss": 1.4101,
      "step": 99
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.5946288704872131,
      "learning_rate": 0.0,
      "loss": 1.1919,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 529353635839488.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}