|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.984025559105431, |
|
"eval_steps": 500, |
|
"global_step": 780, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06389776357827476, |
|
"grad_norm": 130.31602916853151, |
|
"learning_rate": 1.282051282051282e-06, |
|
"loss": 3.6603, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.12779552715654952, |
|
"grad_norm": 14.781977919866863, |
|
"learning_rate": 2.564102564102564e-06, |
|
"loss": 0.3776, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.19169329073482427, |
|
"grad_norm": 4.3629613510157546, |
|
"learning_rate": 3.846153846153847e-06, |
|
"loss": 0.1595, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.25559105431309903, |
|
"grad_norm": 3.870157217981069, |
|
"learning_rate": 5.128205128205128e-06, |
|
"loss": 0.1568, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.3194888178913738, |
|
"grad_norm": 5.134748713124099, |
|
"learning_rate": 6.410256410256412e-06, |
|
"loss": 0.1566, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.38338658146964855, |
|
"grad_norm": 2.9173590197272645, |
|
"learning_rate": 7.692307692307694e-06, |
|
"loss": 0.146, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4472843450479233, |
|
"grad_norm": 7.85067184702089, |
|
"learning_rate": 8.974358974358976e-06, |
|
"loss": 0.1465, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5111821086261981, |
|
"grad_norm": 4.0072039812523155, |
|
"learning_rate": 9.999799726899261e-06, |
|
"loss": 0.152, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5750798722044729, |
|
"grad_norm": 4.781263018607648, |
|
"learning_rate": 9.992791852820709e-06, |
|
"loss": 0.1531, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6389776357827476, |
|
"grad_norm": 2.108926556744404, |
|
"learning_rate": 9.975786361654959e-06, |
|
"loss": 0.1298, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7028753993610224, |
|
"grad_norm": 2.8153254735688513, |
|
"learning_rate": 9.948817305370145e-06, |
|
"loss": 0.1161, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7667731629392971, |
|
"grad_norm": 4.0418418716369535, |
|
"learning_rate": 9.911938687078324e-06, |
|
"loss": 0.1011, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.8306709265175719, |
|
"grad_norm": 3.9843806569017146, |
|
"learning_rate": 9.86522435289912e-06, |
|
"loss": 0.0841, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8945686900958466, |
|
"grad_norm": 1.3291937913520984, |
|
"learning_rate": 9.80876784408948e-06, |
|
"loss": 0.0698, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.9584664536741214, |
|
"grad_norm": 2.7308366191283513, |
|
"learning_rate": 9.742682209735727e-06, |
|
"loss": 0.0553, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.0223642172523961, |
|
"grad_norm": 1.1100404502138488, |
|
"learning_rate": 9.66709978038292e-06, |
|
"loss": 0.0516, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.0862619808306708, |
|
"grad_norm": 1.2597692857731515, |
|
"learning_rate": 9.582171903054815e-06, |
|
"loss": 0.0454, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.1501597444089458, |
|
"grad_norm": 1.1022185938066056, |
|
"learning_rate": 9.488068638195072e-06, |
|
"loss": 0.0387, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.2140575079872205, |
|
"grad_norm": 1.7172177840149363, |
|
"learning_rate": 9.384978419136469e-06, |
|
"loss": 0.0302, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.2779552715654952, |
|
"grad_norm": 1.5637027754240362, |
|
"learning_rate": 9.273107674780102e-06, |
|
"loss": 0.0325, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.34185303514377, |
|
"grad_norm": 2.783634090632637, |
|
"learning_rate": 9.152680416240059e-06, |
|
"loss": 0.0223, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.4057507987220448, |
|
"grad_norm": 1.3608054132422451, |
|
"learning_rate": 9.023937788281278e-06, |
|
"loss": 0.0195, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.4696485623003195, |
|
"grad_norm": 0.9992709605849684, |
|
"learning_rate": 8.88713758644883e-06, |
|
"loss": 0.0129, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.5335463258785942, |
|
"grad_norm": 1.3478860777735155, |
|
"learning_rate": 8.742553740855507e-06, |
|
"loss": 0.0143, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.5974440894568689, |
|
"grad_norm": 0.5160241577185327, |
|
"learning_rate": 8.590475767661371e-06, |
|
"loss": 0.012, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.6613418530351438, |
|
"grad_norm": 1.0951030534716653, |
|
"learning_rate": 8.43120818934367e-06, |
|
"loss": 0.0072, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.7252396166134185, |
|
"grad_norm": 1.087985002445579, |
|
"learning_rate": 8.265069924917925e-06, |
|
"loss": 0.0095, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.7891373801916934, |
|
"grad_norm": 1.7998564363536036, |
|
"learning_rate": 8.092393651331275e-06, |
|
"loss": 0.006, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.8530351437699681, |
|
"grad_norm": 0.18457425441572486, |
|
"learning_rate": 7.913525137306756e-06, |
|
"loss": 0.0062, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.9169329073482428, |
|
"grad_norm": 0.6609117232245165, |
|
"learning_rate": 7.728822550972523e-06, |
|
"loss": 0.0058, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.9808306709265175, |
|
"grad_norm": 0.4879357476412688, |
|
"learning_rate": 7.53865574266234e-06, |
|
"loss": 0.0034, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.0447284345047922, |
|
"grad_norm": 0.5473744973987851, |
|
"learning_rate": 7.343405504323519e-06, |
|
"loss": 0.0055, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.108626198083067, |
|
"grad_norm": 0.20699630966461133, |
|
"learning_rate": 7.143462807015271e-06, |
|
"loss": 0.0023, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.1725239616613417, |
|
"grad_norm": 0.5079647163308747, |
|
"learning_rate": 6.939228018024275e-06, |
|
"loss": 0.0016, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.236421725239617, |
|
"grad_norm": 0.06394720110875801, |
|
"learning_rate": 6.731110099165165e-06, |
|
"loss": 0.0006, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.3003194888178915, |
|
"grad_norm": 0.043984983795711925, |
|
"learning_rate": 6.519525787871235e-06, |
|
"loss": 0.0006, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.364217252396166, |
|
"grad_norm": 0.08347825436904602, |
|
"learning_rate": 6.304898762715187e-06, |
|
"loss": 0.0002, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.428115015974441, |
|
"grad_norm": 0.00554983020767827, |
|
"learning_rate": 6.087658795030838e-06, |
|
"loss": 0.0001, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.4920127795527156, |
|
"grad_norm": 0.002074949463970475, |
|
"learning_rate": 5.8682408883346535e-06, |
|
"loss": 0.0001, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.5559105431309903, |
|
"grad_norm": 0.00091993210892952, |
|
"learning_rate": 5.647084407270277e-06, |
|
"loss": 0.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.619808306709265, |
|
"grad_norm": 0.001187914956817107, |
|
"learning_rate": 5.424632197820325e-06, |
|
"loss": 0.0004, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.68370607028754, |
|
"grad_norm": 0.04613840276160312, |
|
"learning_rate": 5.201329700547077e-06, |
|
"loss": 0.0006, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.747603833865815, |
|
"grad_norm": 0.07217118291405861, |
|
"learning_rate": 4.977624058637783e-06, |
|
"loss": 0.0004, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.8115015974440896, |
|
"grad_norm": 0.019680043938271063, |
|
"learning_rate": 4.75396322254061e-06, |
|
"loss": 0.0015, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.8753993610223643, |
|
"grad_norm": 0.17125091829931163, |
|
"learning_rate": 4.530795052984104e-06, |
|
"loss": 0.0012, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.939297124600639, |
|
"grad_norm": 0.011882926064553328, |
|
"learning_rate": 4.308566424176336e-06, |
|
"loss": 0.0005, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.0031948881789137, |
|
"grad_norm": 0.020395494744052244, |
|
"learning_rate": 4.087722328979437e-06, |
|
"loss": 0.0001, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.0670926517571884, |
|
"grad_norm": 0.47336792227463537, |
|
"learning_rate": 3.86870498785139e-06, |
|
"loss": 0.0002, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.130990415335463, |
|
"grad_norm": 0.04112389438112822, |
|
"learning_rate": 3.6519529633392825e-06, |
|
"loss": 0.0006, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.194888178913738, |
|
"grad_norm": 0.9883623259675256, |
|
"learning_rate": 3.4379002818972122e-06, |
|
"loss": 0.0002, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.258785942492013, |
|
"grad_norm": 0.11942001202877613, |
|
"learning_rate": 3.226975564787322e-06, |
|
"loss": 0.0002, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.3226837060702876, |
|
"grad_norm": 0.014912857922381487, |
|
"learning_rate": 3.019601169804216e-06, |
|
"loss": 0.0005, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.3865814696485623, |
|
"grad_norm": 0.005218117954201568, |
|
"learning_rate": 2.816192345541437e-06, |
|
"loss": 0.0001, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.450479233226837, |
|
"grad_norm": 0.003478383092221035, |
|
"learning_rate": 2.6171563998934605e-06, |
|
"loss": 0.0, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.5143769968051117, |
|
"grad_norm": 0.002282251966100028, |
|
"learning_rate": 2.422891884458241e-06, |
|
"loss": 0.0, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.5782747603833864, |
|
"grad_norm": 0.0013298086133440442, |
|
"learning_rate": 2.2337877964734324e-06, |
|
"loss": 0.0, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.642172523961661, |
|
"grad_norm": 0.001191917339054205, |
|
"learning_rate": 2.050222799884387e-06, |
|
"loss": 0.0, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 3.7060702875399363, |
|
"grad_norm": 0.0019405632722846752, |
|
"learning_rate": 1.8725644671036125e-06, |
|
"loss": 0.0, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 3.769968051118211, |
|
"grad_norm": 0.001688089251520516, |
|
"learning_rate": 1.7011685429800596e-06, |
|
"loss": 0.0, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.8338658146964857, |
|
"grad_norm": 0.0013767767393313205, |
|
"learning_rate": 1.5363782324520033e-06, |
|
"loss": 0.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.8977635782747604, |
|
"grad_norm": 0.0015121179776807685, |
|
"learning_rate": 1.3785235133100088e-06, |
|
"loss": 0.0, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 3.961661341853035, |
|
"grad_norm": 0.0010094220393594992, |
|
"learning_rate": 1.2279204754460494e-06, |
|
"loss": 0.0, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 4.02555910543131, |
|
"grad_norm": 0.0008270190129985694, |
|
"learning_rate": 1.0848706879118893e-06, |
|
"loss": 0.0, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 4.0894568690095845, |
|
"grad_norm": 0.0009842840206759395, |
|
"learning_rate": 9.496605950541676e-07, |
|
"loss": 0.0, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 4.15335463258786, |
|
"grad_norm": 0.0011455384204312185, |
|
"learning_rate": 8.225609429353187e-07, |
|
"loss": 0.0, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.217252396166134, |
|
"grad_norm": 0.0009744990691985237, |
|
"learning_rate": 7.03826237188916e-07, |
|
"loss": 0.0, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 4.281150159744409, |
|
"grad_norm": 0.0010987588457946568, |
|
"learning_rate": 5.936942333950063e-07, |
|
"loss": 0.0, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 4.345047923322683, |
|
"grad_norm": 0.0007905637486817264, |
|
"learning_rate": 4.9238546099592e-07, |
|
"loss": 0.0, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 4.4089456869009584, |
|
"grad_norm": 0.0009142256811990887, |
|
"learning_rate": 4.001027817058789e-07, |
|
"loss": 0.0, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 4.472843450479234, |
|
"grad_norm": 0.0011330787800452912, |
|
"learning_rate": 3.1703098329864237e-07, |
|
"loss": 0.0, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.536741214057508, |
|
"grad_norm": 0.0007900970323302085, |
|
"learning_rate": 2.4333640958659144e-07, |
|
"loss": 0.0, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 4.600638977635783, |
|
"grad_norm": 0.0005185271373596653, |
|
"learning_rate": 1.7916662733218848e-07, |
|
"loss": 0.0, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 4.664536741214057, |
|
"grad_norm": 0.0007954554045334525, |
|
"learning_rate": 1.2465013075879884e-07, |
|
"loss": 0.0, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 4.728434504792332, |
|
"grad_norm": 0.001375306774010583, |
|
"learning_rate": 7.989608425254924e-08, |
|
"loss": 0.0, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 4.792332268370607, |
|
"grad_norm": 0.0009216072149331596, |
|
"learning_rate": 4.499410377045765e-08, |
|
"loss": 0.0, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 4.856230031948882, |
|
"grad_norm": 0.0008157911744637327, |
|
"learning_rate": 2.0014077392525035e-08, |
|
"loss": 0.0, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 4.920127795527156, |
|
"grad_norm": 0.0009134259574951338, |
|
"learning_rate": 5.006025377138901e-09, |
|
"loss": 0.0, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 4.984025559105431, |
|
"grad_norm": 0.0008560713988891386, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 4.984025559105431, |
|
"step": 780, |
|
"total_flos": 216765195878400.0, |
|
"train_loss": 0.07692194245522935, |
|
"train_runtime": 12037.1174, |
|
"train_samples_per_second": 16.615, |
|
"train_steps_per_second": 0.065 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 780, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 216765195878400.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|