Training in progress, step 100, checkpoint (commit 0d96350)
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7380073800738007,
"eval_steps": 9,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007380073800738007,
"grad_norm": 0.06256721913814545,
"learning_rate": 1e-05,
"loss": 10.3611,
"step": 1
},
{
"epoch": 0.007380073800738007,
"eval_loss": 10.364021301269531,
"eval_runtime": 0.2134,
"eval_samples_per_second": 534.314,
"eval_steps_per_second": 70.304,
"step": 1
},
{
"epoch": 0.014760147601476014,
"grad_norm": 0.07218925654888153,
"learning_rate": 2e-05,
"loss": 10.3569,
"step": 2
},
{
"epoch": 0.02214022140221402,
"grad_norm": 0.05393102392554283,
"learning_rate": 3e-05,
"loss": 10.368,
"step": 3
},
{
"epoch": 0.02952029520295203,
"grad_norm": 0.05452526733279228,
"learning_rate": 4e-05,
"loss": 10.3642,
"step": 4
},
{
"epoch": 0.03690036900369004,
"grad_norm": 0.054869394749403,
"learning_rate": 5e-05,
"loss": 10.3578,
"step": 5
},
{
"epoch": 0.04428044280442804,
"grad_norm": 0.05008178576827049,
"learning_rate": 6e-05,
"loss": 10.3658,
"step": 6
},
{
"epoch": 0.05166051660516605,
"grad_norm": 0.05989466607570648,
"learning_rate": 7e-05,
"loss": 10.3585,
"step": 7
},
{
"epoch": 0.05904059040590406,
"grad_norm": 0.04574362933635712,
"learning_rate": 8e-05,
"loss": 10.356,
"step": 8
},
{
"epoch": 0.06642066420664207,
"grad_norm": 0.06692593544721603,
"learning_rate": 9e-05,
"loss": 10.3609,
"step": 9
},
{
"epoch": 0.06642066420664207,
"eval_loss": 10.363554000854492,
"eval_runtime": 0.2538,
"eval_samples_per_second": 449.195,
"eval_steps_per_second": 59.105,
"step": 9
},
{
"epoch": 0.07380073800738007,
"grad_norm": 0.064938485622406,
"learning_rate": 0.0001,
"loss": 10.3572,
"step": 10
},
{
"epoch": 0.08118081180811808,
"grad_norm": 0.057259056717157364,
"learning_rate": 9.99695413509548e-05,
"loss": 10.3593,
"step": 11
},
{
"epoch": 0.08856088560885608,
"grad_norm": 0.057903651148080826,
"learning_rate": 9.987820251299122e-05,
"loss": 10.3648,
"step": 12
},
{
"epoch": 0.0959409594095941,
"grad_norm": 0.06259284913539886,
"learning_rate": 9.972609476841367e-05,
"loss": 10.3627,
"step": 13
},
{
"epoch": 0.1033210332103321,
"grad_norm": 0.04629024863243103,
"learning_rate": 9.951340343707852e-05,
"loss": 10.3572,
"step": 14
},
{
"epoch": 0.11070110701107011,
"grad_norm": 0.04599935561418533,
"learning_rate": 9.924038765061042e-05,
"loss": 10.3596,
"step": 15
},
{
"epoch": 0.11808118081180811,
"grad_norm": 0.05176709592342377,
"learning_rate": 9.890738003669029e-05,
"loss": 10.36,
"step": 16
},
{
"epoch": 0.12546125461254612,
"grad_norm": 0.05611785873770714,
"learning_rate": 9.851478631379982e-05,
"loss": 10.3604,
"step": 17
},
{
"epoch": 0.13284132841328414,
"grad_norm": 0.04586149379611015,
"learning_rate": 9.806308479691595e-05,
"loss": 10.3655,
"step": 18
},
{
"epoch": 0.13284132841328414,
"eval_loss": 10.362375259399414,
"eval_runtime": 0.2517,
"eval_samples_per_second": 452.974,
"eval_steps_per_second": 59.602,
"step": 18
},
{
"epoch": 0.14022140221402213,
"grad_norm": 0.0533270500600338,
"learning_rate": 9.755282581475769e-05,
"loss": 10.3591,
"step": 19
},
{
"epoch": 0.14760147601476015,
"grad_norm": 0.0650496557354927,
"learning_rate": 9.698463103929542e-05,
"loss": 10.3644,
"step": 20
},
{
"epoch": 0.15498154981549817,
"grad_norm": 0.05515008047223091,
"learning_rate": 9.635919272833938e-05,
"loss": 10.3548,
"step": 21
},
{
"epoch": 0.16236162361623616,
"grad_norm": 0.05346156656742096,
"learning_rate": 9.567727288213005e-05,
"loss": 10.3573,
"step": 22
},
{
"epoch": 0.16974169741697417,
"grad_norm": 0.05062566325068474,
"learning_rate": 9.493970231495835e-05,
"loss": 10.3632,
"step": 23
},
{
"epoch": 0.17712177121771217,
"grad_norm": 0.06177437677979469,
"learning_rate": 9.414737964294636e-05,
"loss": 10.3565,
"step": 24
},
{
"epoch": 0.18450184501845018,
"grad_norm": 0.05376908555626869,
"learning_rate": 9.330127018922194e-05,
"loss": 10.3621,
"step": 25
},
{
"epoch": 0.1918819188191882,
"grad_norm": 0.05945334956049919,
"learning_rate": 9.24024048078213e-05,
"loss": 10.3584,
"step": 26
},
{
"epoch": 0.1992619926199262,
"grad_norm": 0.049115318804979324,
"learning_rate": 9.145187862775209e-05,
"loss": 10.3698,
"step": 27
},
{
"epoch": 0.1992619926199262,
"eval_loss": 10.36121654510498,
"eval_runtime": 0.2541,
"eval_samples_per_second": 448.64,
"eval_steps_per_second": 59.032,
"step": 27
},
{
"epoch": 0.2066420664206642,
"grad_norm": 0.0591144897043705,
"learning_rate": 9.045084971874738e-05,
"loss": 10.3573,
"step": 28
},
{
"epoch": 0.2140221402214022,
"grad_norm": 0.04607441648840904,
"learning_rate": 8.940053768033609e-05,
"loss": 10.3615,
"step": 29
},
{
"epoch": 0.22140221402214022,
"grad_norm": 0.05733785033226013,
"learning_rate": 8.83022221559489e-05,
"loss": 10.365,
"step": 30
},
{
"epoch": 0.22878228782287824,
"grad_norm": 0.05551564693450928,
"learning_rate": 8.715724127386972e-05,
"loss": 10.3667,
"step": 31
},
{
"epoch": 0.23616236162361623,
"grad_norm": 0.06119786947965622,
"learning_rate": 8.596699001693255e-05,
"loss": 10.3568,
"step": 32
},
{
"epoch": 0.24354243542435425,
"grad_norm": 0.06199464574456215,
"learning_rate": 8.473291852294987e-05,
"loss": 10.3634,
"step": 33
},
{
"epoch": 0.25092250922509224,
"grad_norm": 0.05202634260058403,
"learning_rate": 8.345653031794292e-05,
"loss": 10.35,
"step": 34
},
{
"epoch": 0.25830258302583026,
"grad_norm": 0.06763176620006561,
"learning_rate": 8.213938048432697e-05,
"loss": 10.3585,
"step": 35
},
{
"epoch": 0.2656826568265683,
"grad_norm": 0.05460091680288315,
"learning_rate": 8.07830737662829e-05,
"loss": 10.3556,
"step": 36
},
{
"epoch": 0.2656826568265683,
"eval_loss": 10.36009693145752,
"eval_runtime": 0.2643,
"eval_samples_per_second": 431.338,
"eval_steps_per_second": 56.755,
"step": 36
},
{
"epoch": 0.2730627306273063,
"grad_norm": 0.055408477783203125,
"learning_rate": 7.938926261462366e-05,
"loss": 10.3602,
"step": 37
},
{
"epoch": 0.28044280442804426,
"grad_norm": 0.06199764087796211,
"learning_rate": 7.795964517353735e-05,
"loss": 10.3503,
"step": 38
},
{
"epoch": 0.2878228782287823,
"grad_norm": 0.06058415398001671,
"learning_rate": 7.649596321166024e-05,
"loss": 10.3625,
"step": 39
},
{
"epoch": 0.2952029520295203,
"grad_norm": 0.05479194596409798,
"learning_rate": 7.500000000000001e-05,
"loss": 10.3597,
"step": 40
},
{
"epoch": 0.3025830258302583,
"grad_norm": 0.06328034400939941,
"learning_rate": 7.347357813929454e-05,
"loss": 10.3501,
"step": 41
},
{
"epoch": 0.30996309963099633,
"grad_norm": 0.06398607045412064,
"learning_rate": 7.191855733945387e-05,
"loss": 10.3631,
"step": 42
},
{
"epoch": 0.3173431734317343,
"grad_norm": 0.050947874784469604,
"learning_rate": 7.033683215379002e-05,
"loss": 10.3514,
"step": 43
},
{
"epoch": 0.3247232472324723,
"grad_norm": 0.059516243636608124,
"learning_rate": 6.873032967079561e-05,
"loss": 10.3551,
"step": 44
},
{
"epoch": 0.33210332103321033,
"grad_norm": 0.060571130365133286,
"learning_rate": 6.710100716628344e-05,
"loss": 10.3614,
"step": 45
},
{
"epoch": 0.33210332103321033,
"eval_loss": 10.359009742736816,
"eval_runtime": 0.2603,
"eval_samples_per_second": 438.013,
"eval_steps_per_second": 57.633,
"step": 45
},
{
"epoch": 0.33948339483394835,
"grad_norm": 0.06294915080070496,
"learning_rate": 6.545084971874738e-05,
"loss": 10.3544,
"step": 46
},
{
"epoch": 0.34686346863468637,
"grad_norm": 0.06438196450471878,
"learning_rate": 6.378186779084995e-05,
"loss": 10.3484,
"step": 47
},
{
"epoch": 0.35424354243542433,
"grad_norm": 0.0660022422671318,
"learning_rate": 6.209609477998338e-05,
"loss": 10.358,
"step": 48
},
{
"epoch": 0.36162361623616235,
"grad_norm": 0.054936427623033524,
"learning_rate": 6.0395584540887963e-05,
"loss": 10.3584,
"step": 49
},
{
"epoch": 0.36900369003690037,
"grad_norm": 0.05536165460944176,
"learning_rate": 5.868240888334653e-05,
"loss": 10.3482,
"step": 50
},
{
"epoch": 0.3763837638376384,
"grad_norm": 0.06291848421096802,
"learning_rate": 5.695865504800327e-05,
"loss": 10.3517,
"step": 51
},
{
"epoch": 0.3837638376383764,
"grad_norm": 0.0627998635172844,
"learning_rate": 5.522642316338268e-05,
"loss": 10.3638,
"step": 52
},
{
"epoch": 0.39114391143911437,
"grad_norm": 0.07972683012485504,
"learning_rate": 5.348782368720626e-05,
"loss": 10.3552,
"step": 53
},
{
"epoch": 0.3985239852398524,
"grad_norm": 0.06060464307665825,
"learning_rate": 5.174497483512506e-05,
"loss": 10.3564,
"step": 54
},
{
"epoch": 0.3985239852398524,
"eval_loss": 10.3579740524292,
"eval_runtime": 0.2553,
"eval_samples_per_second": 446.596,
"eval_steps_per_second": 58.763,
"step": 54
},
{
"epoch": 0.4059040590405904,
"grad_norm": 0.05204768851399422,
"learning_rate": 5e-05,
"loss": 10.3626,
"step": 55
},
{
"epoch": 0.4132841328413284,
"grad_norm": 0.07554280757904053,
"learning_rate": 4.825502516487497e-05,
"loss": 10.3532,
"step": 56
},
{
"epoch": 0.42066420664206644,
"grad_norm": 0.06683514267206192,
"learning_rate": 4.6512176312793736e-05,
"loss": 10.355,
"step": 57
},
{
"epoch": 0.4280442804428044,
"grad_norm": 0.08101712167263031,
"learning_rate": 4.477357683661734e-05,
"loss": 10.3584,
"step": 58
},
{
"epoch": 0.4354243542435424,
"grad_norm": 0.05437375232577324,
"learning_rate": 4.3041344951996746e-05,
"loss": 10.3642,
"step": 59
},
{
"epoch": 0.44280442804428044,
"grad_norm": 0.07148314267396927,
"learning_rate": 4.131759111665349e-05,
"loss": 10.3536,
"step": 60
},
{
"epoch": 0.45018450184501846,
"grad_norm": 0.06779924035072327,
"learning_rate": 3.960441545911204e-05,
"loss": 10.3531,
"step": 61
},
{
"epoch": 0.4575645756457565,
"grad_norm": 0.06029814854264259,
"learning_rate": 3.790390522001662e-05,
"loss": 10.3641,
"step": 62
},
{
"epoch": 0.46494464944649444,
"grad_norm": 0.06448680907487869,
"learning_rate": 3.6218132209150045e-05,
"loss": 10.3531,
"step": 63
},
{
"epoch": 0.46494464944649444,
"eval_loss": 10.357136726379395,
"eval_runtime": 0.2545,
"eval_samples_per_second": 447.928,
"eval_steps_per_second": 58.938,
"step": 63
},
{
"epoch": 0.47232472324723246,
"grad_norm": 0.05686628445982933,
"learning_rate": 3.4549150281252636e-05,
"loss": 10.3655,
"step": 64
},
{
"epoch": 0.4797047970479705,
"grad_norm": 0.06419065594673157,
"learning_rate": 3.289899283371657e-05,
"loss": 10.3568,
"step": 65
},
{
"epoch": 0.4870848708487085,
"grad_norm": 0.06342583149671555,
"learning_rate": 3.12696703292044e-05,
"loss": 10.3573,
"step": 66
},
{
"epoch": 0.4944649446494465,
"grad_norm": 0.056026969105005264,
"learning_rate": 2.9663167846209998e-05,
"loss": 10.36,
"step": 67
},
{
"epoch": 0.5018450184501845,
"grad_norm": 0.07585887610912323,
"learning_rate": 2.8081442660546125e-05,
"loss": 10.3511,
"step": 68
},
{
"epoch": 0.5092250922509225,
"grad_norm": 0.0630531907081604,
"learning_rate": 2.6526421860705473e-05,
"loss": 10.355,
"step": 69
},
{
"epoch": 0.5166051660516605,
"grad_norm": 0.06669856607913971,
"learning_rate": 2.500000000000001e-05,
"loss": 10.3538,
"step": 70
},
{
"epoch": 0.5239852398523985,
"grad_norm": 0.07067961245775223,
"learning_rate": 2.350403678833976e-05,
"loss": 10.3578,
"step": 71
},
{
"epoch": 0.5313653136531366,
"grad_norm": 0.06368213891983032,
"learning_rate": 2.2040354826462668e-05,
"loss": 10.3611,
"step": 72
},
{
"epoch": 0.5313653136531366,
"eval_loss": 10.356532096862793,
"eval_runtime": 0.2507,
"eval_samples_per_second": 454.656,
"eval_steps_per_second": 59.823,
"step": 72
},
{
"epoch": 0.5387453874538746,
"grad_norm": 0.07432413101196289,
"learning_rate": 2.061073738537635e-05,
"loss": 10.3526,
"step": 73
},
{
"epoch": 0.5461254612546126,
"grad_norm": 0.06468094140291214,
"learning_rate": 1.9216926233717085e-05,
"loss": 10.3627,
"step": 74
},
{
"epoch": 0.5535055350553506,
"grad_norm": 0.0772942453622818,
"learning_rate": 1.7860619515673033e-05,
"loss": 10.355,
"step": 75
},
{
"epoch": 0.5608856088560885,
"grad_norm": 0.08569208532571793,
"learning_rate": 1.6543469682057106e-05,
"loss": 10.3477,
"step": 76
},
{
"epoch": 0.5682656826568265,
"grad_norm": 0.06884515285491943,
"learning_rate": 1.526708147705013e-05,
"loss": 10.3559,
"step": 77
},
{
"epoch": 0.5756457564575646,
"grad_norm": 0.08712355047464371,
"learning_rate": 1.4033009983067452e-05,
"loss": 10.3498,
"step": 78
},
{
"epoch": 0.5830258302583026,
"grad_norm": 0.0719294399023056,
"learning_rate": 1.2842758726130283e-05,
"loss": 10.3497,
"step": 79
},
{
"epoch": 0.5904059040590406,
"grad_norm": 0.07984092831611633,
"learning_rate": 1.1697777844051105e-05,
"loss": 10.3511,
"step": 80
},
{
"epoch": 0.5977859778597786,
"grad_norm": 0.06634163111448288,
"learning_rate": 1.0599462319663905e-05,
"loss": 10.3598,
"step": 81
},
{
"epoch": 0.5977859778597786,
"eval_loss": 10.356156349182129,
"eval_runtime": 0.3485,
"eval_samples_per_second": 327.078,
"eval_steps_per_second": 43.037,
"step": 81
},
{
"epoch": 0.6051660516605166,
"grad_norm": 0.06071637198328972,
"learning_rate": 9.549150281252633e-06,
"loss": 10.3581,
"step": 82
},
{
"epoch": 0.6125461254612546,
"grad_norm": 0.0657428652048111,
"learning_rate": 8.548121372247918e-06,
"loss": 10.3592,
"step": 83
},
{
"epoch": 0.6199261992619927,
"grad_norm": 0.06935244798660278,
"learning_rate": 7.597595192178702e-06,
"loss": 10.3532,
"step": 84
},
{
"epoch": 0.6273062730627307,
"grad_norm": 0.07533907890319824,
"learning_rate": 6.698729810778065e-06,
"loss": 10.3595,
"step": 85
},
{
"epoch": 0.6346863468634686,
"grad_norm": 0.0672995075583458,
"learning_rate": 5.852620357053651e-06,
"loss": 10.3543,
"step": 86
},
{
"epoch": 0.6420664206642066,
"grad_norm": 0.07711373269557953,
"learning_rate": 5.060297685041659e-06,
"loss": 10.3447,
"step": 87
},
{
"epoch": 0.6494464944649446,
"grad_norm": 0.05943749099969864,
"learning_rate": 4.322727117869951e-06,
"loss": 10.3582,
"step": 88
},
{
"epoch": 0.6568265682656826,
"grad_norm": 0.06703980267047882,
"learning_rate": 3.6408072716606346e-06,
"loss": 10.3553,
"step": 89
},
{
"epoch": 0.6642066420664207,
"grad_norm": 0.07490800321102142,
"learning_rate": 3.0153689607045845e-06,
"loss": 10.3604,
"step": 90
},
{
"epoch": 0.6642066420664207,
"eval_loss": 10.356014251708984,
"eval_runtime": 0.263,
"eval_samples_per_second": 433.463,
"eval_steps_per_second": 57.035,
"step": 90
},
{
"epoch": 0.6715867158671587,
"grad_norm": 0.06388845294713974,
"learning_rate": 2.4471741852423237e-06,
"loss": 10.358,
"step": 91
},
{
"epoch": 0.6789667896678967,
"grad_norm": 0.07117879390716553,
"learning_rate": 1.9369152030840556e-06,
"loss": 10.3492,
"step": 92
},
{
"epoch": 0.6863468634686347,
"grad_norm": 0.08810685575008392,
"learning_rate": 1.4852136862001764e-06,
"loss": 10.3545,
"step": 93
},
{
"epoch": 0.6937269372693727,
"grad_norm": 0.0710313469171524,
"learning_rate": 1.0926199633097157e-06,
"loss": 10.3588,
"step": 94
},
{
"epoch": 0.7011070110701108,
"grad_norm": 0.06251402199268341,
"learning_rate": 7.596123493895991e-07,
"loss": 10.3527,
"step": 95
},
{
"epoch": 0.7084870848708487,
"grad_norm": 0.07826722413301468,
"learning_rate": 4.865965629214819e-07,
"loss": 10.3565,
"step": 96
},
{
"epoch": 0.7158671586715867,
"grad_norm": 0.08423243463039398,
"learning_rate": 2.7390523158633554e-07,
"loss": 10.351,
"step": 97
},
{
"epoch": 0.7232472324723247,
"grad_norm": 0.06485340744256973,
"learning_rate": 1.2179748700879012e-07,
"loss": 10.3546,
"step": 98
},
{
"epoch": 0.7306273062730627,
"grad_norm": 0.06696449220180511,
"learning_rate": 3.04586490452119e-08,
"loss": 10.3548,
"step": 99
},
{
"epoch": 0.7306273062730627,
"eval_loss": 10.355989456176758,
"eval_runtime": 0.2507,
"eval_samples_per_second": 454.706,
"eval_steps_per_second": 59.83,
"step": 99
},
{
"epoch": 0.7380073800738007,
"grad_norm": 0.06878489255905151,
"learning_rate": 0.0,
"loss": 10.3647,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5230244659200.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
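
The state above is the standard structure written by the Hugging Face Trainer into a checkpoint directory: `log_history` interleaves training steps (entries with "loss", "grad_norm", "learning_rate") and evaluation runs (entries with "eval_loss" and throughput fields), while the top-level keys record the schedule ("max_steps", "eval_steps", "save_steps") and where training stopped. A minimal sketch for inspecting it is below, assuming the JSON is saved under its conventional filename "trainer_state.json"; the path and the summary printed are illustrative, not part of the checkpoint itself.

# Minimal sketch: parse the trainer state above and summarize the loss curves.
# Assumes the JSON is saved as "trainer_state.json" (the conventional filename
# inside a Trainer checkpoint directory); adjust the path as needed.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry is either a training step (has "loss") or an
# evaluation run (has "eval_loss"); both carry "step" and "epoch".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"global_step={state['global_step']}  epoch={state['epoch']:.4f}")
print(f"train loss: {train_logs[0]['loss']:.4f} -> {train_logs[-1]['loss']:.4f}")
print(f"eval  loss: {eval_logs[0]['eval_loss']:.4f} -> {eval_logs[-1]['eval_loss']:.4f}")

# Final learning rate of the schedule (0.0 here, since max_steps was reached).
print("final lr:", train_logs[-1]["learning_rate"])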