{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.984025559105431,
"eval_steps": 500,
"global_step": 780,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06389776357827476,
"grad_norm": 130.31602916853151,
"learning_rate": 1.282051282051282e-06,
"loss": 3.6603,
"step": 10
},
{
"epoch": 0.12779552715654952,
"grad_norm": 14.781977919866863,
"learning_rate": 2.564102564102564e-06,
"loss": 0.3776,
"step": 20
},
{
"epoch": 0.19169329073482427,
"grad_norm": 4.3629613510157546,
"learning_rate": 3.846153846153847e-06,
"loss": 0.1595,
"step": 30
},
{
"epoch": 0.25559105431309903,
"grad_norm": 3.870157217981069,
"learning_rate": 5.128205128205128e-06,
"loss": 0.1568,
"step": 40
},
{
"epoch": 0.3194888178913738,
"grad_norm": 5.134748713124099,
"learning_rate": 6.410256410256412e-06,
"loss": 0.1566,
"step": 50
},
{
"epoch": 0.38338658146964855,
"grad_norm": 2.9173590197272645,
"learning_rate": 7.692307692307694e-06,
"loss": 0.146,
"step": 60
},
{
"epoch": 0.4472843450479233,
"grad_norm": 7.85067184702089,
"learning_rate": 8.974358974358976e-06,
"loss": 0.1465,
"step": 70
},
{
"epoch": 0.5111821086261981,
"grad_norm": 4.0072039812523155,
"learning_rate": 9.999799726899261e-06,
"loss": 0.152,
"step": 80
},
{
"epoch": 0.5750798722044729,
"grad_norm": 4.781263018607648,
"learning_rate": 9.992791852820709e-06,
"loss": 0.1531,
"step": 90
},
{
"epoch": 0.6389776357827476,
"grad_norm": 2.108926556744404,
"learning_rate": 9.975786361654959e-06,
"loss": 0.1298,
"step": 100
},
{
"epoch": 0.7028753993610224,
"grad_norm": 2.8153254735688513,
"learning_rate": 9.948817305370145e-06,
"loss": 0.1161,
"step": 110
},
{
"epoch": 0.7667731629392971,
"grad_norm": 4.0418418716369535,
"learning_rate": 9.911938687078324e-06,
"loss": 0.1011,
"step": 120
},
{
"epoch": 0.8306709265175719,
"grad_norm": 3.9843806569017146,
"learning_rate": 9.86522435289912e-06,
"loss": 0.0841,
"step": 130
},
{
"epoch": 0.8945686900958466,
"grad_norm": 1.3291937913520984,
"learning_rate": 9.80876784408948e-06,
"loss": 0.0698,
"step": 140
},
{
"epoch": 0.9584664536741214,
"grad_norm": 2.7308366191283513,
"learning_rate": 9.742682209735727e-06,
"loss": 0.0553,
"step": 150
},
{
"epoch": 1.0223642172523961,
"grad_norm": 1.1100404502138488,
"learning_rate": 9.66709978038292e-06,
"loss": 0.0516,
"step": 160
},
{
"epoch": 1.0862619808306708,
"grad_norm": 1.2597692857731515,
"learning_rate": 9.582171903054815e-06,
"loss": 0.0454,
"step": 170
},
{
"epoch": 1.1501597444089458,
"grad_norm": 1.1022185938066056,
"learning_rate": 9.488068638195072e-06,
"loss": 0.0387,
"step": 180
},
{
"epoch": 1.2140575079872205,
"grad_norm": 1.7172177840149363,
"learning_rate": 9.384978419136469e-06,
"loss": 0.0302,
"step": 190
},
{
"epoch": 1.2779552715654952,
"grad_norm": 1.5637027754240362,
"learning_rate": 9.273107674780102e-06,
"loss": 0.0325,
"step": 200
},
{
"epoch": 1.34185303514377,
"grad_norm": 2.783634090632637,
"learning_rate": 9.152680416240059e-06,
"loss": 0.0223,
"step": 210
},
{
"epoch": 1.4057507987220448,
"grad_norm": 1.3608054132422451,
"learning_rate": 9.023937788281278e-06,
"loss": 0.0195,
"step": 220
},
{
"epoch": 1.4696485623003195,
"grad_norm": 0.9992709605849684,
"learning_rate": 8.88713758644883e-06,
"loss": 0.0129,
"step": 230
},
{
"epoch": 1.5335463258785942,
"grad_norm": 1.3478860777735155,
"learning_rate": 8.742553740855507e-06,
"loss": 0.0143,
"step": 240
},
{
"epoch": 1.5974440894568689,
"grad_norm": 0.5160241577185327,
"learning_rate": 8.590475767661371e-06,
"loss": 0.012,
"step": 250
},
{
"epoch": 1.6613418530351438,
"grad_norm": 1.0951030534716653,
"learning_rate": 8.43120818934367e-06,
"loss": 0.0072,
"step": 260
},
{
"epoch": 1.7252396166134185,
"grad_norm": 1.087985002445579,
"learning_rate": 8.265069924917925e-06,
"loss": 0.0095,
"step": 270
},
{
"epoch": 1.7891373801916934,
"grad_norm": 1.7998564363536036,
"learning_rate": 8.092393651331275e-06,
"loss": 0.006,
"step": 280
},
{
"epoch": 1.8530351437699681,
"grad_norm": 0.18457425441572486,
"learning_rate": 7.913525137306756e-06,
"loss": 0.0062,
"step": 290
},
{
"epoch": 1.9169329073482428,
"grad_norm": 0.6609117232245165,
"learning_rate": 7.728822550972523e-06,
"loss": 0.0058,
"step": 300
},
{
"epoch": 1.9808306709265175,
"grad_norm": 0.4879357476412688,
"learning_rate": 7.53865574266234e-06,
"loss": 0.0034,
"step": 310
},
{
"epoch": 2.0447284345047922,
"grad_norm": 0.5473744973987851,
"learning_rate": 7.343405504323519e-06,
"loss": 0.0055,
"step": 320
},
{
"epoch": 2.108626198083067,
"grad_norm": 0.20699630966461133,
"learning_rate": 7.143462807015271e-06,
"loss": 0.0023,
"step": 330
},
{
"epoch": 2.1725239616613417,
"grad_norm": 0.5079647163308747,
"learning_rate": 6.939228018024275e-06,
"loss": 0.0016,
"step": 340
},
{
"epoch": 2.236421725239617,
"grad_norm": 0.06394720110875801,
"learning_rate": 6.731110099165165e-06,
"loss": 0.0006,
"step": 350
},
{
"epoch": 2.3003194888178915,
"grad_norm": 0.043984983795711925,
"learning_rate": 6.519525787871235e-06,
"loss": 0.0006,
"step": 360
},
{
"epoch": 2.364217252396166,
"grad_norm": 0.08347825436904602,
"learning_rate": 6.304898762715187e-06,
"loss": 0.0002,
"step": 370
},
{
"epoch": 2.428115015974441,
"grad_norm": 0.00554983020767827,
"learning_rate": 6.087658795030838e-06,
"loss": 0.0001,
"step": 380
},
{
"epoch": 2.4920127795527156,
"grad_norm": 0.002074949463970475,
"learning_rate": 5.8682408883346535e-06,
"loss": 0.0001,
"step": 390
},
{
"epoch": 2.5559105431309903,
"grad_norm": 0.00091993210892952,
"learning_rate": 5.647084407270277e-06,
"loss": 0.0,
"step": 400
},
{
"epoch": 2.619808306709265,
"grad_norm": 0.001187914956817107,
"learning_rate": 5.424632197820325e-06,
"loss": 0.0004,
"step": 410
},
{
"epoch": 2.68370607028754,
"grad_norm": 0.04613840276160312,
"learning_rate": 5.201329700547077e-06,
"loss": 0.0006,
"step": 420
},
{
"epoch": 2.747603833865815,
"grad_norm": 0.07217118291405861,
"learning_rate": 4.977624058637783e-06,
"loss": 0.0004,
"step": 430
},
{
"epoch": 2.8115015974440896,
"grad_norm": 0.019680043938271063,
"learning_rate": 4.75396322254061e-06,
"loss": 0.0015,
"step": 440
},
{
"epoch": 2.8753993610223643,
"grad_norm": 0.17125091829931163,
"learning_rate": 4.530795052984104e-06,
"loss": 0.0012,
"step": 450
},
{
"epoch": 2.939297124600639,
"grad_norm": 0.011882926064553328,
"learning_rate": 4.308566424176336e-06,
"loss": 0.0005,
"step": 460
},
{
"epoch": 3.0031948881789137,
"grad_norm": 0.020395494744052244,
"learning_rate": 4.087722328979437e-06,
"loss": 0.0001,
"step": 470
},
{
"epoch": 3.0670926517571884,
"grad_norm": 0.47336792227463537,
"learning_rate": 3.86870498785139e-06,
"loss": 0.0002,
"step": 480
},
{
"epoch": 3.130990415335463,
"grad_norm": 0.04112389438112822,
"learning_rate": 3.6519529633392825e-06,
"loss": 0.0006,
"step": 490
},
{
"epoch": 3.194888178913738,
"grad_norm": 0.9883623259675256,
"learning_rate": 3.4379002818972122e-06,
"loss": 0.0002,
"step": 500
},
{
"epoch": 3.258785942492013,
"grad_norm": 0.11942001202877613,
"learning_rate": 3.226975564787322e-06,
"loss": 0.0002,
"step": 510
},
{
"epoch": 3.3226837060702876,
"grad_norm": 0.014912857922381487,
"learning_rate": 3.019601169804216e-06,
"loss": 0.0005,
"step": 520
},
{
"epoch": 3.3865814696485623,
"grad_norm": 0.005218117954201568,
"learning_rate": 2.816192345541437e-06,
"loss": 0.0001,
"step": 530
},
{
"epoch": 3.450479233226837,
"grad_norm": 0.003478383092221035,
"learning_rate": 2.6171563998934605e-06,
"loss": 0.0,
"step": 540
},
{
"epoch": 3.5143769968051117,
"grad_norm": 0.002282251966100028,
"learning_rate": 2.422891884458241e-06,
"loss": 0.0,
"step": 550
},
{
"epoch": 3.5782747603833864,
"grad_norm": 0.0013298086133440442,
"learning_rate": 2.2337877964734324e-06,
"loss": 0.0,
"step": 560
},
{
"epoch": 3.642172523961661,
"grad_norm": 0.001191917339054205,
"learning_rate": 2.050222799884387e-06,
"loss": 0.0,
"step": 570
},
{
"epoch": 3.7060702875399363,
"grad_norm": 0.0019405632722846752,
"learning_rate": 1.8725644671036125e-06,
"loss": 0.0,
"step": 580
},
{
"epoch": 3.769968051118211,
"grad_norm": 0.001688089251520516,
"learning_rate": 1.7011685429800596e-06,
"loss": 0.0,
"step": 590
},
{
"epoch": 3.8338658146964857,
"grad_norm": 0.0013767767393313205,
"learning_rate": 1.5363782324520033e-06,
"loss": 0.0,
"step": 600
},
{
"epoch": 3.8977635782747604,
"grad_norm": 0.0015121179776807685,
"learning_rate": 1.3785235133100088e-06,
"loss": 0.0,
"step": 610
},
{
"epoch": 3.961661341853035,
"grad_norm": 0.0010094220393594992,
"learning_rate": 1.2279204754460494e-06,
"loss": 0.0,
"step": 620
},
{
"epoch": 4.02555910543131,
"grad_norm": 0.0008270190129985694,
"learning_rate": 1.0848706879118893e-06,
"loss": 0.0,
"step": 630
},
{
"epoch": 4.0894568690095845,
"grad_norm": 0.0009842840206759395,
"learning_rate": 9.496605950541676e-07,
"loss": 0.0,
"step": 640
},
{
"epoch": 4.15335463258786,
"grad_norm": 0.0011455384204312185,
"learning_rate": 8.225609429353187e-07,
"loss": 0.0,
"step": 650
},
{
"epoch": 4.217252396166134,
"grad_norm": 0.0009744990691985237,
"learning_rate": 7.03826237188916e-07,
"loss": 0.0,
"step": 660
},
{
"epoch": 4.281150159744409,
"grad_norm": 0.0010987588457946568,
"learning_rate": 5.936942333950063e-07,
"loss": 0.0,
"step": 670
},
{
"epoch": 4.345047923322683,
"grad_norm": 0.0007905637486817264,
"learning_rate": 4.9238546099592e-07,
"loss": 0.0,
"step": 680
},
{
"epoch": 4.4089456869009584,
"grad_norm": 0.0009142256811990887,
"learning_rate": 4.001027817058789e-07,
"loss": 0.0,
"step": 690
},
{
"epoch": 4.472843450479234,
"grad_norm": 0.0011330787800452912,
"learning_rate": 3.1703098329864237e-07,
"loss": 0.0,
"step": 700
},
{
"epoch": 4.536741214057508,
"grad_norm": 0.0007900970323302085,
"learning_rate": 2.4333640958659144e-07,
"loss": 0.0,
"step": 710
},
{
"epoch": 4.600638977635783,
"grad_norm": 0.0005185271373596653,
"learning_rate": 1.7916662733218848e-07,
"loss": 0.0,
"step": 720
},
{
"epoch": 4.664536741214057,
"grad_norm": 0.0007954554045334525,
"learning_rate": 1.2465013075879884e-07,
"loss": 0.0,
"step": 730
},
{
"epoch": 4.728434504792332,
"grad_norm": 0.001375306774010583,
"learning_rate": 7.989608425254924e-08,
"loss": 0.0,
"step": 740
},
{
"epoch": 4.792332268370607,
"grad_norm": 0.0009216072149331596,
"learning_rate": 4.499410377045765e-08,
"loss": 0.0,
"step": 750
},
{
"epoch": 4.856230031948882,
"grad_norm": 0.0008157911744637327,
"learning_rate": 2.0014077392525035e-08,
"loss": 0.0,
"step": 760
},
{
"epoch": 4.920127795527156,
"grad_norm": 0.0009134259574951338,
"learning_rate": 5.006025377138901e-09,
"loss": 0.0,
"step": 770
},
{
"epoch": 4.984025559105431,
"grad_norm": 0.0008560713988891386,
"learning_rate": 0.0,
"loss": 0.0,
"step": 780
},
{
"epoch": 4.984025559105431,
"step": 780,
"total_flos": 216765195878400.0,
"train_loss": 0.07692194245522935,
"train_runtime": 12037.1174,
"train_samples_per_second": 16.615,
"train_steps_per_second": 0.065
}
],
"logging_steps": 10,
"max_steps": 780,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 216765195878400.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}