{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9961671837926627,
  "eval_steps": 500,
  "global_step": 2052,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.014601204599379448,
      "grad_norm": 31.200588193935076,
      "learning_rate": 5e-06,
      "loss": 1.0665,
      "step": 10
    },
    {
      "epoch": 0.029202409198758897,
      "grad_norm": 2.1030174817289726,
      "learning_rate": 5e-06,
      "loss": 0.9851,
      "step": 20
    },
    {
      "epoch": 0.043803613798138345,
      "grad_norm": 1.7889745137305164,
      "learning_rate": 5e-06,
      "loss": 0.9413,
      "step": 30
    },
    {
      "epoch": 0.058404818397517794,
      "grad_norm": 0.7798640642359425,
      "learning_rate": 5e-06,
      "loss": 0.9234,
      "step": 40
    },
    {
      "epoch": 0.07300602299689725,
      "grad_norm": 2.7003950764835163,
      "learning_rate": 5e-06,
      "loss": 0.9077,
      "step": 50
    },
    {
      "epoch": 0.08760722759627669,
      "grad_norm": 6.611494407685939,
      "learning_rate": 5e-06,
      "loss": 0.9045,
      "step": 60
    },
    {
      "epoch": 0.10220843219565615,
      "grad_norm": 1.081246334652253,
      "learning_rate": 5e-06,
      "loss": 0.8965,
      "step": 70
    },
    {
      "epoch": 0.11680963679503559,
      "grad_norm": 0.8101716177472186,
      "learning_rate": 5e-06,
      "loss": 0.8855,
      "step": 80
    },
    {
      "epoch": 0.13141084139441503,
      "grad_norm": 0.8801376249396373,
      "learning_rate": 5e-06,
      "loss": 0.8827,
      "step": 90
    },
    {
      "epoch": 0.1460120459937945,
      "grad_norm": 0.6385442821205225,
      "learning_rate": 5e-06,
      "loss": 0.8714,
      "step": 100
    },
    {
      "epoch": 0.16061325059317394,
      "grad_norm": 0.6872556086848158,
      "learning_rate": 5e-06,
      "loss": 0.8735,
      "step": 110
    },
    {
      "epoch": 0.17521445519255338,
      "grad_norm": 0.6992499417358933,
      "learning_rate": 5e-06,
      "loss": 0.8682,
      "step": 120
    },
    {
      "epoch": 0.18981565979193282,
      "grad_norm": 0.6013577080845172,
      "learning_rate": 5e-06,
      "loss": 0.8662,
      "step": 130
    },
    {
      "epoch": 0.2044168643913123,
      "grad_norm": 0.5988600270173734,
      "learning_rate": 5e-06,
      "loss": 0.8649,
      "step": 140
    },
    {
      "epoch": 0.21901806899069173,
      "grad_norm": 0.6460225651419232,
      "learning_rate": 5e-06,
      "loss": 0.8647,
      "step": 150
    },
    {
      "epoch": 0.23361927359007117,
      "grad_norm": 0.5603551860244527,
      "learning_rate": 5e-06,
      "loss": 0.8533,
      "step": 160
    },
    {
      "epoch": 0.24822047818945062,
      "grad_norm": 0.5553710866489995,
      "learning_rate": 5e-06,
      "loss": 0.8544,
      "step": 170
    },
    {
      "epoch": 0.26282168278883006,
      "grad_norm": 0.6554387689230537,
      "learning_rate": 5e-06,
      "loss": 0.8592,
      "step": 180
    },
    {
      "epoch": 0.2774228873882095,
      "grad_norm": 0.6786037864304745,
      "learning_rate": 5e-06,
      "loss": 0.8518,
      "step": 190
    },
    {
      "epoch": 0.292024091987589,
      "grad_norm": 0.5561181889312125,
      "learning_rate": 5e-06,
      "loss": 0.8505,
      "step": 200
    },
    {
      "epoch": 0.3066252965869684,
      "grad_norm": 0.7917259340608006,
      "learning_rate": 5e-06,
      "loss": 0.8475,
      "step": 210
    },
    {
      "epoch": 0.3212265011863479,
      "grad_norm": 0.5838972916992158,
      "learning_rate": 5e-06,
      "loss": 0.8468,
      "step": 220
    },
    {
      "epoch": 0.33582770578572735,
      "grad_norm": 0.5415756077794452,
      "learning_rate": 5e-06,
      "loss": 0.8478,
      "step": 230
    },
    {
      "epoch": 0.35042891038510676,
      "grad_norm": 0.561460816685303,
      "learning_rate": 5e-06,
      "loss": 0.8471,
      "step": 240
    },
    {
      "epoch": 0.36503011498448623,
      "grad_norm": 0.5431016015146285,
      "learning_rate": 5e-06,
      "loss": 0.845,
      "step": 250
    },
    {
      "epoch": 0.37963131958386565,
      "grad_norm": 0.5777928639036234,
      "learning_rate": 5e-06,
      "loss": 0.8427,
      "step": 260
    },
    {
      "epoch": 0.3942325241832451,
      "grad_norm": 0.6855206088461627,
      "learning_rate": 5e-06,
      "loss": 0.8406,
      "step": 270
    },
    {
      "epoch": 0.4088337287826246,
      "grad_norm": 0.5959537867792327,
      "learning_rate": 5e-06,
      "loss": 0.843,
      "step": 280
    },
    {
      "epoch": 0.423434933382004,
      "grad_norm": 0.5787987185587301,
      "learning_rate": 5e-06,
      "loss": 0.842,
      "step": 290
    },
    {
      "epoch": 0.43803613798138347,
      "grad_norm": 0.9829016985861171,
      "learning_rate": 5e-06,
      "loss": 0.841,
      "step": 300
    },
    {
      "epoch": 0.45263734258076294,
      "grad_norm": 0.7109664833342627,
      "learning_rate": 5e-06,
      "loss": 0.8376,
      "step": 310
    },
    {
      "epoch": 0.46723854718014235,
      "grad_norm": 0.5953929615896101,
      "learning_rate": 5e-06,
      "loss": 0.8352,
      "step": 320
    },
    {
      "epoch": 0.4818397517795218,
      "grad_norm": 0.6459745420821242,
      "learning_rate": 5e-06,
      "loss": 0.8322,
      "step": 330
    },
    {
      "epoch": 0.49644095637890123,
      "grad_norm": 0.7286780710714444,
      "learning_rate": 5e-06,
      "loss": 0.833,
      "step": 340
    },
    {
      "epoch": 0.5110421609782807,
      "grad_norm": 0.8624769767543123,
      "learning_rate": 5e-06,
      "loss": 0.8326,
      "step": 350
    },
    {
      "epoch": 0.5256433655776601,
      "grad_norm": 0.7489286697832975,
      "learning_rate": 5e-06,
      "loss": 0.8344,
      "step": 360
    },
    {
      "epoch": 0.5402445701770396,
      "grad_norm": 0.6965027768353624,
      "learning_rate": 5e-06,
      "loss": 0.8329,
      "step": 370
    },
    {
      "epoch": 0.554845774776419,
      "grad_norm": 0.5898605613508874,
      "learning_rate": 5e-06,
      "loss": 0.8396,
      "step": 380
    },
    {
      "epoch": 0.5694469793757985,
      "grad_norm": 0.669429389652064,
      "learning_rate": 5e-06,
      "loss": 0.8318,
      "step": 390
    },
    {
      "epoch": 0.584048183975178,
      "grad_norm": 0.6580798841963941,
      "learning_rate": 5e-06,
      "loss": 0.8314,
      "step": 400
    },
    {
      "epoch": 0.5986493885745574,
      "grad_norm": 0.624128356604639,
      "learning_rate": 5e-06,
      "loss": 0.8282,
      "step": 410
    },
    {
      "epoch": 0.6132505931739368,
      "grad_norm": 0.5755522646670556,
      "learning_rate": 5e-06,
      "loss": 0.8321,
      "step": 420
    },
    {
      "epoch": 0.6278517977733163,
      "grad_norm": 0.8196980265974857,
      "learning_rate": 5e-06,
      "loss": 0.8313,
      "step": 430
    },
    {
      "epoch": 0.6424530023726958,
      "grad_norm": 0.48261475886925087,
      "learning_rate": 5e-06,
      "loss": 0.8238,
      "step": 440
    },
    {
      "epoch": 0.6570542069720752,
      "grad_norm": 0.5343014097762563,
      "learning_rate": 5e-06,
      "loss": 0.8296,
      "step": 450
    },
    {
      "epoch": 0.6716554115714547,
      "grad_norm": 0.8585815714707374,
      "learning_rate": 5e-06,
      "loss": 0.823,
      "step": 460
    },
    {
      "epoch": 0.6862566161708341,
      "grad_norm": 0.7315620836524508,
      "learning_rate": 5e-06,
      "loss": 0.8331,
      "step": 470
    },
    {
      "epoch": 0.7008578207702135,
      "grad_norm": 0.4711661790189355,
      "learning_rate": 5e-06,
      "loss": 0.8245,
      "step": 480
    },
    {
      "epoch": 0.7154590253695929,
      "grad_norm": 0.546263482109446,
      "learning_rate": 5e-06,
      "loss": 0.8212,
      "step": 490
    },
    {
      "epoch": 0.7300602299689725,
      "grad_norm": 0.5757304431326317,
      "learning_rate": 5e-06,
      "loss": 0.8252,
      "step": 500
    },
    {
      "epoch": 0.7446614345683519,
      "grad_norm": 0.5563752904399338,
      "learning_rate": 5e-06,
      "loss": 0.8251,
      "step": 510
    },
    {
      "epoch": 0.7592626391677313,
      "grad_norm": 0.48890029763799747,
      "learning_rate": 5e-06,
      "loss": 0.8244,
      "step": 520
    },
    {
      "epoch": 0.7738638437671108,
      "grad_norm": 0.6121148728397559,
      "learning_rate": 5e-06,
      "loss": 0.8219,
      "step": 530
    },
    {
      "epoch": 0.7884650483664902,
      "grad_norm": 0.651565586948898,
      "learning_rate": 5e-06,
      "loss": 0.8203,
      "step": 540
    },
    {
      "epoch": 0.8030662529658696,
      "grad_norm": 0.5365587518038645,
      "learning_rate": 5e-06,
      "loss": 0.8244,
      "step": 550
    },
    {
      "epoch": 0.8176674575652492,
      "grad_norm": 0.5585874614674294,
      "learning_rate": 5e-06,
      "loss": 0.8261,
      "step": 560
    },
    {
      "epoch": 0.8322686621646286,
      "grad_norm": 0.48225482309598716,
      "learning_rate": 5e-06,
      "loss": 0.828,
      "step": 570
    },
    {
      "epoch": 0.846869866764008,
      "grad_norm": 0.6379018399000604,
      "learning_rate": 5e-06,
      "loss": 0.8187,
      "step": 580
    },
    {
      "epoch": 0.8614710713633875,
      "grad_norm": 0.8248757003628987,
      "learning_rate": 5e-06,
      "loss": 0.8245,
      "step": 590
    },
    {
      "epoch": 0.8760722759627669,
      "grad_norm": 0.7072642911500023,
      "learning_rate": 5e-06,
      "loss": 0.8199,
      "step": 600
    },
    {
      "epoch": 0.8906734805621463,
      "grad_norm": 0.6066965111128374,
      "learning_rate": 5e-06,
      "loss": 0.821,
      "step": 610
    },
    {
      "epoch": 0.9052746851615259,
      "grad_norm": 0.49608072224953953,
      "learning_rate": 5e-06,
      "loss": 0.8263,
      "step": 620
    },
    {
      "epoch": 0.9198758897609053,
      "grad_norm": 0.6053461220096085,
      "learning_rate": 5e-06,
      "loss": 0.8225,
      "step": 630
    },
    {
      "epoch": 0.9344770943602847,
      "grad_norm": 0.5575666035835788,
      "learning_rate": 5e-06,
      "loss": 0.8211,
      "step": 640
    },
    {
      "epoch": 0.9490782989596642,
      "grad_norm": 0.5170427420902555,
      "learning_rate": 5e-06,
      "loss": 0.8202,
      "step": 650
    },
    {
      "epoch": 0.9636795035590436,
      "grad_norm": 0.5652214016440857,
      "learning_rate": 5e-06,
      "loss": 0.8219,
      "step": 660
    },
    {
      "epoch": 0.978280708158423,
      "grad_norm": 0.5065476265832586,
      "learning_rate": 5e-06,
      "loss": 0.8121,
      "step": 670
    },
    {
      "epoch": 0.9928819127578025,
      "grad_norm": 0.5713479763199619,
      "learning_rate": 5e-06,
      "loss": 0.8154,
      "step": 680
    },
    {
      "epoch": 0.9987223945975543,
      "eval_loss": 0.8185040950775146,
      "eval_runtime": 729.2812,
      "eval_samples_per_second": 25.306,
      "eval_steps_per_second": 0.396,
      "step": 684
    },
    {
      "epoch": 1.0074831173571819,
      "grad_norm": 0.5659100587324225,
      "learning_rate": 5e-06,
      "loss": 0.8068,
      "step": 690
    },
    {
      "epoch": 1.0220843219565614,
      "grad_norm": 0.5725390345160268,
      "learning_rate": 5e-06,
      "loss": 0.7796,
      "step": 700
    },
    {
      "epoch": 1.036685526555941,
      "grad_norm": 0.5067331567131128,
      "learning_rate": 5e-06,
      "loss": 0.7784,
      "step": 710
    },
    {
      "epoch": 1.0512867311553202,
      "grad_norm": 0.5633875492368658,
      "learning_rate": 5e-06,
      "loss": 0.7789,
      "step": 720
    },
    {
      "epoch": 1.0658879357546998,
      "grad_norm": 0.6503391798526155,
      "learning_rate": 5e-06,
      "loss": 0.7785,
      "step": 730
    },
    {
      "epoch": 1.0804891403540793,
      "grad_norm": 0.6157238494098765,
      "learning_rate": 5e-06,
      "loss": 0.7788,
      "step": 740
    },
    {
      "epoch": 1.0950903449534586,
      "grad_norm": 0.6692246242398756,
      "learning_rate": 5e-06,
      "loss": 0.7791,
      "step": 750
    },
    {
      "epoch": 1.109691549552838,
      "grad_norm": 0.772994893504376,
      "learning_rate": 5e-06,
      "loss": 0.7772,
      "step": 760
    },
    {
      "epoch": 1.1242927541522176,
      "grad_norm": 0.5654007018077802,
      "learning_rate": 5e-06,
      "loss": 0.778,
      "step": 770
    },
    {
      "epoch": 1.138893958751597,
      "grad_norm": 0.5871850769943243,
      "learning_rate": 5e-06,
      "loss": 0.7797,
      "step": 780
    },
    {
      "epoch": 1.1534951633509765,
      "grad_norm": 0.6081431556291285,
      "learning_rate": 5e-06,
      "loss": 0.7776,
      "step": 790
    },
    {
      "epoch": 1.168096367950356,
      "grad_norm": 0.5943447291419969,
      "learning_rate": 5e-06,
      "loss": 0.7812,
      "step": 800
    },
    {
      "epoch": 1.1826975725497353,
      "grad_norm": 0.5174382592861106,
      "learning_rate": 5e-06,
      "loss": 0.7742,
      "step": 810
    },
    {
      "epoch": 1.1972987771491148,
      "grad_norm": 0.5335467085784507,
      "learning_rate": 5e-06,
      "loss": 0.7821,
      "step": 820
    },
    {
      "epoch": 1.2118999817484943,
      "grad_norm": 0.5424184832410203,
      "learning_rate": 5e-06,
      "loss": 0.7832,
      "step": 830
    },
    {
      "epoch": 1.2265011863478736,
      "grad_norm": 0.5401853269685924,
      "learning_rate": 5e-06,
      "loss": 0.7764,
      "step": 840
    },
    {
      "epoch": 1.2411023909472532,
      "grad_norm": 0.5532297607385643,
      "learning_rate": 5e-06,
      "loss": 0.776,
      "step": 850
    },
    {
      "epoch": 1.2557035955466325,
      "grad_norm": 0.4600563956098031,
      "learning_rate": 5e-06,
      "loss": 0.7746,
      "step": 860
    },
    {
      "epoch": 1.270304800146012,
      "grad_norm": 0.5135474289282321,
      "learning_rate": 5e-06,
      "loss": 0.7725,
      "step": 870
    },
    {
      "epoch": 1.2849060047453915,
      "grad_norm": 0.6354802982105713,
      "learning_rate": 5e-06,
      "loss": 0.7787,
      "step": 880
    },
    {
      "epoch": 1.299507209344771,
      "grad_norm": 0.5869839476501474,
      "learning_rate": 5e-06,
      "loss": 0.7712,
      "step": 890
    },
    {
      "epoch": 1.3141084139441503,
      "grad_norm": 0.49495760536344496,
      "learning_rate": 5e-06,
      "loss": 0.777,
      "step": 900
    },
    {
      "epoch": 1.3287096185435299,
      "grad_norm": 0.5322628773610525,
      "learning_rate": 5e-06,
      "loss": 0.7791,
      "step": 910
    },
    {
      "epoch": 1.3433108231429092,
      "grad_norm": 0.6394355119269733,
      "learning_rate": 5e-06,
      "loss": 0.7813,
      "step": 920
    },
    {
      "epoch": 1.3579120277422887,
      "grad_norm": 0.6150475948115007,
      "learning_rate": 5e-06,
      "loss": 0.7718,
      "step": 930
    },
    {
      "epoch": 1.3725132323416682,
      "grad_norm": 0.6284466998832495,
      "learning_rate": 5e-06,
      "loss": 0.7716,
      "step": 940
    },
    {
      "epoch": 1.3871144369410477,
      "grad_norm": 0.4995594773156744,
      "learning_rate": 5e-06,
      "loss": 0.7801,
      "step": 950
    },
    {
      "epoch": 1.401715641540427,
      "grad_norm": 0.5533231758658743,
      "learning_rate": 5e-06,
      "loss": 0.7749,
      "step": 960
    },
    {
      "epoch": 1.4163168461398066,
      "grad_norm": 0.5566318311264558,
      "learning_rate": 5e-06,
      "loss": 0.7809,
      "step": 970
    },
    {
      "epoch": 1.4309180507391859,
      "grad_norm": 0.5996092713965696,
      "learning_rate": 5e-06,
      "loss": 0.7769,
      "step": 980
    },
    {
      "epoch": 1.4455192553385654,
      "grad_norm": 0.4923370749506076,
      "learning_rate": 5e-06,
      "loss": 0.7733,
      "step": 990
    },
    {
      "epoch": 1.460120459937945,
      "grad_norm": 0.5718051545730899,
      "learning_rate": 5e-06,
      "loss": 0.778,
      "step": 1000
    },
    {
      "epoch": 1.4747216645373245,
      "grad_norm": 0.4966605100244046,
      "learning_rate": 5e-06,
      "loss": 0.7755,
      "step": 1010
    },
    {
      "epoch": 1.4893228691367038,
      "grad_norm": 0.5104108866561695,
      "learning_rate": 5e-06,
      "loss": 0.7762,
      "step": 1020
    },
    {
      "epoch": 1.5039240737360833,
      "grad_norm": 0.5790841364965528,
      "learning_rate": 5e-06,
      "loss": 0.775,
      "step": 1030
    },
    {
      "epoch": 1.5185252783354626,
      "grad_norm": 0.5079205962955746,
      "learning_rate": 5e-06,
      "loss": 0.7791,
      "step": 1040
    },
    {
      "epoch": 1.533126482934842,
      "grad_norm": 0.4897829483446737,
      "learning_rate": 5e-06,
      "loss": 0.7732,
      "step": 1050
    },
    {
      "epoch": 1.5477276875342216,
      "grad_norm": 0.5375326427308407,
      "learning_rate": 5e-06,
      "loss": 0.7734,
      "step": 1060
    },
    {
      "epoch": 1.5623288921336012,
      "grad_norm": 0.4714533263773857,
      "learning_rate": 5e-06,
      "loss": 0.7786,
      "step": 1070
    },
    {
      "epoch": 1.5769300967329805,
      "grad_norm": 0.5170403858384673,
      "learning_rate": 5e-06,
      "loss": 0.772,
      "step": 1080
    },
    {
      "epoch": 1.5915313013323598,
      "grad_norm": 0.5584745095875884,
      "learning_rate": 5e-06,
      "loss": 0.7788,
      "step": 1090
    },
    {
      "epoch": 1.6061325059317393,
      "grad_norm": 0.5632792125524021,
      "learning_rate": 5e-06,
      "loss": 0.7764,
      "step": 1100
    },
    {
      "epoch": 1.6207337105311188,
      "grad_norm": 0.5303585273369582,
      "learning_rate": 5e-06,
      "loss": 0.7698,
      "step": 1110
    },
    {
      "epoch": 1.6353349151304983,
      "grad_norm": 0.5292556194617752,
      "learning_rate": 5e-06,
      "loss": 0.7754,
      "step": 1120
    },
    {
      "epoch": 1.6499361197298779,
      "grad_norm": 0.5319736770394399,
      "learning_rate": 5e-06,
      "loss": 0.7754,
      "step": 1130
    },
    {
      "epoch": 1.6645373243292572,
      "grad_norm": 0.5409862397072692,
      "learning_rate": 5e-06,
      "loss": 0.7732,
      "step": 1140
    },
    {
      "epoch": 1.6791385289286365,
      "grad_norm": 0.5347398767131228,
      "learning_rate": 5e-06,
      "loss": 0.775,
      "step": 1150
    },
    {
      "epoch": 1.693739733528016,
      "grad_norm": 0.5887598823053857,
      "learning_rate": 5e-06,
      "loss": 0.7734,
      "step": 1160
    },
    {
      "epoch": 1.7083409381273955,
      "grad_norm": 0.588980481311897,
      "learning_rate": 5e-06,
      "loss": 0.7776,
      "step": 1170
    },
    {
      "epoch": 1.722942142726775,
      "grad_norm": 0.5476017973657227,
      "learning_rate": 5e-06,
      "loss": 0.7718,
      "step": 1180
    },
    {
      "epoch": 1.7375433473261546,
      "grad_norm": 0.5548638443373327,
      "learning_rate": 5e-06,
      "loss": 0.778,
      "step": 1190
    },
    {
      "epoch": 1.7521445519255339,
      "grad_norm": 0.5443995408512653,
      "learning_rate": 5e-06,
      "loss": 0.7731,
      "step": 1200
    },
    {
      "epoch": 1.7667457565249132,
      "grad_norm": 0.5134399032378028,
      "learning_rate": 5e-06,
      "loss": 0.7762,
      "step": 1210
    },
    {
      "epoch": 1.7813469611242927,
      "grad_norm": 0.5143443520325698,
      "learning_rate": 5e-06,
      "loss": 0.7737,
      "step": 1220
    },
    {
      "epoch": 1.7959481657236722,
      "grad_norm": 0.5712512301925389,
      "learning_rate": 5e-06,
      "loss": 0.7752,
      "step": 1230
    },
    {
      "epoch": 1.8105493703230517,
      "grad_norm": 0.5022436155237929,
      "learning_rate": 5e-06,
      "loss": 0.7746,
      "step": 1240
    },
    {
      "epoch": 1.825150574922431,
      "grad_norm": 0.5183607046169039,
      "learning_rate": 5e-06,
      "loss": 0.7758,
      "step": 1250
    },
    {
      "epoch": 1.8397517795218106,
      "grad_norm": 0.5327048894936923,
      "learning_rate": 5e-06,
      "loss": 0.7737,
      "step": 1260
    },
    {
      "epoch": 1.8543529841211899,
      "grad_norm": 0.4828373941208032,
      "learning_rate": 5e-06,
      "loss": 0.7734,
      "step": 1270
    },
    {
      "epoch": 1.8689541887205694,
      "grad_norm": 0.4692396568766125,
      "learning_rate": 5e-06,
      "loss": 0.7702,
      "step": 1280
    },
    {
      "epoch": 1.883555393319949,
      "grad_norm": 0.5272353821412613,
      "learning_rate": 5e-06,
      "loss": 0.7738,
      "step": 1290
    },
    {
      "epoch": 1.8981565979193284,
      "grad_norm": 0.5974378803453756,
      "learning_rate": 5e-06,
      "loss": 0.7744,
      "step": 1300
    },
    {
      "epoch": 1.9127578025187078,
      "grad_norm": 0.45897414900404526,
      "learning_rate": 5e-06,
      "loss": 0.7707,
      "step": 1310
    },
    {
      "epoch": 1.9273590071180873,
      "grad_norm": 0.5661797189222842,
      "learning_rate": 5e-06,
      "loss": 0.7729,
      "step": 1320
    },
    {
      "epoch": 1.9419602117174666,
      "grad_norm": 0.5291159788169262,
      "learning_rate": 5e-06,
      "loss": 0.7704,
      "step": 1330
    },
    {
      "epoch": 1.956561416316846,
      "grad_norm": 0.5803039839795054,
      "learning_rate": 5e-06,
      "loss": 0.7711,
      "step": 1340
    },
    {
      "epoch": 1.9711626209162256,
      "grad_norm": 0.5668430658536179,
      "learning_rate": 5e-06,
      "loss": 0.7714,
      "step": 1350
    },
    {
      "epoch": 1.9857638255156052,
      "grad_norm": 0.5292288364377395,
      "learning_rate": 5e-06,
      "loss": 0.7681,
      "step": 1360
    },
    {
      "epoch": 1.9989049096550464,
      "eval_loss": 0.8045554161071777,
      "eval_runtime": 732.5889,
      "eval_samples_per_second": 25.191,
      "eval_steps_per_second": 0.394,
      "step": 1369
    },
    {
      "epoch": 2.0003650301149847,
      "grad_norm": 0.5917011918049785,
      "learning_rate": 5e-06,
      "loss": 0.7813,
      "step": 1370
    },
    {
      "epoch": 2.0149662347143638,
      "grad_norm": 0.6437888983866474,
      "learning_rate": 5e-06,
      "loss": 0.7275,
      "step": 1380
    },
    {
      "epoch": 2.0295674393137433,
      "grad_norm": 0.5993159674827806,
      "learning_rate": 5e-06,
      "loss": 0.7301,
      "step": 1390
    },
    {
      "epoch": 2.044168643913123,
      "grad_norm": 0.6004688564094799,
      "learning_rate": 5e-06,
      "loss": 0.7279,
      "step": 1400
    },
    {
      "epoch": 2.0587698485125023,
      "grad_norm": 0.5968765010977406,
      "learning_rate": 5e-06,
      "loss": 0.732,
      "step": 1410
    },
    {
      "epoch": 2.073371053111882,
      "grad_norm": 0.6368286520923802,
      "learning_rate": 5e-06,
      "loss": 0.7327,
      "step": 1420
    },
    {
      "epoch": 2.0879722577112614,
      "grad_norm": 0.5121224799191383,
      "learning_rate": 5e-06,
      "loss": 0.7289,
      "step": 1430
    },
    {
      "epoch": 2.1025734623106405,
      "grad_norm": 0.5426488835411897,
      "learning_rate": 5e-06,
      "loss": 0.729,
      "step": 1440
    },
    {
      "epoch": 2.11717466691002,
      "grad_norm": 0.5360711433495,
      "learning_rate": 5e-06,
      "loss": 0.7321,
      "step": 1450
    },
    {
      "epoch": 2.1317758715093995,
      "grad_norm": 0.570345866307846,
      "learning_rate": 5e-06,
      "loss": 0.7283,
      "step": 1460
    },
    {
      "epoch": 2.146377076108779,
      "grad_norm": 0.5646482286111343,
      "learning_rate": 5e-06,
      "loss": 0.7341,
      "step": 1470
    },
    {
      "epoch": 2.1609782807081586,
      "grad_norm": 0.6170916412089019,
      "learning_rate": 5e-06,
      "loss": 0.7273,
      "step": 1480
    },
    {
      "epoch": 2.175579485307538,
      "grad_norm": 0.5669820051659463,
      "learning_rate": 5e-06,
      "loss": 0.7327,
      "step": 1490
    },
    {
      "epoch": 2.190180689906917,
      "grad_norm": 0.5032603903192315,
      "learning_rate": 5e-06,
      "loss": 0.7259,
      "step": 1500
    },
    {
      "epoch": 2.2047818945062967,
      "grad_norm": 0.536250519862031,
      "learning_rate": 5e-06,
      "loss": 0.7309,
      "step": 1510
    },
    {
      "epoch": 2.219383099105676,
      "grad_norm": 0.4989195000116721,
      "learning_rate": 5e-06,
      "loss": 0.7351,
      "step": 1520
    },
    {
      "epoch": 2.2339843037050557,
      "grad_norm": 0.48084465295135953,
      "learning_rate": 5e-06,
      "loss": 0.7319,
      "step": 1530
    },
    {
      "epoch": 2.2485855083044353,
      "grad_norm": 0.6041492880249871,
      "learning_rate": 5e-06,
      "loss": 0.7331,
      "step": 1540
    },
    {
      "epoch": 2.2631867129038143,
      "grad_norm": 0.48501706414438883,
      "learning_rate": 5e-06,
      "loss": 0.7364,
      "step": 1550
    },
    {
      "epoch": 2.277787917503194,
      "grad_norm": 0.486621431249399,
      "learning_rate": 5e-06,
      "loss": 0.7336,
      "step": 1560
    },
    {
      "epoch": 2.2923891221025734,
      "grad_norm": 0.6249224122437264,
      "learning_rate": 5e-06,
      "loss": 0.7335,
      "step": 1570
    },
    {
      "epoch": 2.306990326701953,
      "grad_norm": 0.6255968466832591,
      "learning_rate": 5e-06,
      "loss": 0.7315,
      "step": 1580
    },
    {
      "epoch": 2.3215915313013324,
      "grad_norm": 0.5271636183137467,
      "learning_rate": 5e-06,
      "loss": 0.7322,
      "step": 1590
    },
    {
      "epoch": 2.336192735900712,
      "grad_norm": 0.5924207573496902,
      "learning_rate": 5e-06,
      "loss": 0.7345,
      "step": 1600
    },
    {
      "epoch": 2.3507939405000915,
      "grad_norm": 0.5587622345452513,
      "learning_rate": 5e-06,
      "loss": 0.7344,
      "step": 1610
    },
    {
      "epoch": 2.3653951450994706,
      "grad_norm": 0.6269568570302153,
      "learning_rate": 5e-06,
      "loss": 0.7359,
      "step": 1620
    },
    {
      "epoch": 2.37999634969885,
      "grad_norm": 0.6533009549527986,
      "learning_rate": 5e-06,
      "loss": 0.7368,
      "step": 1630
    },
    {
      "epoch": 2.3945975542982296,
      "grad_norm": 0.5195452792704209,
      "learning_rate": 5e-06,
      "loss": 0.7355,
      "step": 1640
    },
    {
      "epoch": 2.409198758897609,
      "grad_norm": 0.5528537289554203,
      "learning_rate": 5e-06,
      "loss": 0.7355,
      "step": 1650
    },
    {
      "epoch": 2.4237999634969887,
      "grad_norm": 0.5434124551027695,
      "learning_rate": 5e-06,
      "loss": 0.7343,
      "step": 1660
    },
    {
      "epoch": 2.4384011680963678,
      "grad_norm": 0.5166289013156083,
      "learning_rate": 5e-06,
      "loss": 0.7348,
      "step": 1670
    },
    {
      "epoch": 2.4530023726957473,
      "grad_norm": 0.5085481998475664,
      "learning_rate": 5e-06,
      "loss": 0.7384,
      "step": 1680
    },
    {
      "epoch": 2.467603577295127,
      "grad_norm": 0.47895714940004425,
      "learning_rate": 5e-06,
      "loss": 0.7319,
      "step": 1690
    },
    {
      "epoch": 2.4822047818945063,
      "grad_norm": 0.6790087073589551,
      "learning_rate": 5e-06,
      "loss": 0.7344,
      "step": 1700
    },
    {
      "epoch": 2.496805986493886,
      "grad_norm": 0.5337750367727961,
      "learning_rate": 5e-06,
      "loss": 0.7348,
      "step": 1710
    },
    {
      "epoch": 2.511407191093265,
      "grad_norm": 0.578024943637955,
      "learning_rate": 5e-06,
      "loss": 0.7335,
      "step": 1720
    },
    {
      "epoch": 2.526008395692645,
      "grad_norm": 0.5321237652992599,
      "learning_rate": 5e-06,
      "loss": 0.7341,
      "step": 1730
    },
    {
      "epoch": 2.540609600292024,
      "grad_norm": 0.552250362014024,
      "learning_rate": 5e-06,
      "loss": 0.7345,
      "step": 1740
    },
    {
      "epoch": 2.5552108048914035,
      "grad_norm": 0.47716769494866995,
      "learning_rate": 5e-06,
      "loss": 0.7364,
      "step": 1750
    },
    {
      "epoch": 2.569812009490783,
      "grad_norm": 0.5382810891628738,
      "learning_rate": 5e-06,
      "loss": 0.7327,
      "step": 1760
    },
    {
      "epoch": 2.5844132140901626,
      "grad_norm": 0.5291652309846363,
      "learning_rate": 5e-06,
      "loss": 0.734,
      "step": 1770
    },
    {
      "epoch": 2.599014418689542,
      "grad_norm": 0.583404809051004,
      "learning_rate": 5e-06,
      "loss": 0.7309,
      "step": 1780
    },
    {
      "epoch": 2.613615623288921,
      "grad_norm": 0.607832211058636,
      "learning_rate": 5e-06,
      "loss": 0.7352,
      "step": 1790
    },
    {
      "epoch": 2.6282168278883007,
      "grad_norm": 0.5598928601891838,
      "learning_rate": 5e-06,
      "loss": 0.7356,
      "step": 1800
    },
    {
      "epoch": 2.64281803248768,
      "grad_norm": 0.5949750898099526,
      "learning_rate": 5e-06,
      "loss": 0.7353,
      "step": 1810
    },
    {
      "epoch": 2.6574192370870597,
      "grad_norm": 0.6066820856022053,
      "learning_rate": 5e-06,
      "loss": 0.7347,
      "step": 1820
    },
    {
      "epoch": 2.6720204416864393,
      "grad_norm": 0.5198033507111653,
      "learning_rate": 5e-06,
      "loss": 0.7354,
      "step": 1830
    },
    {
      "epoch": 2.6866216462858183,
      "grad_norm": 0.5040452692645214,
      "learning_rate": 5e-06,
      "loss": 0.7391,
      "step": 1840
    },
    {
      "epoch": 2.701222850885198,
      "grad_norm": 0.5156854247429866,
      "learning_rate": 5e-06,
      "loss": 0.7388,
      "step": 1850
    },
    {
      "epoch": 2.7158240554845774,
      "grad_norm": 0.5617334329140413,
      "learning_rate": 5e-06,
      "loss": 0.737,
      "step": 1860
    },
    {
      "epoch": 2.730425260083957,
      "grad_norm": 0.4970472716656489,
      "learning_rate": 5e-06,
      "loss": 0.7359,
      "step": 1870
    },
    {
      "epoch": 2.7450264646833364,
      "grad_norm": 0.6666729572656519,
      "learning_rate": 5e-06,
      "loss": 0.7349,
      "step": 1880
    },
    {
      "epoch": 2.759627669282716,
      "grad_norm": 0.660456603270783,
      "learning_rate": 5e-06,
      "loss": 0.7363,
      "step": 1890
    },
    {
      "epoch": 2.7742288738820955,
      "grad_norm": 0.5479397279932245,
      "learning_rate": 5e-06,
      "loss": 0.7359,
      "step": 1900
    },
    {
      "epoch": 2.7888300784814746,
      "grad_norm": 0.5184737073351016,
      "learning_rate": 5e-06,
      "loss": 0.7383,
      "step": 1910
    },
    {
      "epoch": 2.803431283080854,
      "grad_norm": 0.501451603194624,
      "learning_rate": 5e-06,
      "loss": 0.7344,
      "step": 1920
    },
    {
      "epoch": 2.8180324876802336,
      "grad_norm": 0.5543991291124852,
      "learning_rate": 5e-06,
      "loss": 0.7382,
      "step": 1930
    },
    {
      "epoch": 2.832633692279613,
      "grad_norm": 0.6053239113120223,
      "learning_rate": 5e-06,
      "loss": 0.7356,
      "step": 1940
    },
    {
      "epoch": 2.8472348968789927,
      "grad_norm": 0.5618006505391813,
      "learning_rate": 5e-06,
      "loss": 0.7377,
      "step": 1950
    },
    {
      "epoch": 2.8618361014783718,
      "grad_norm": 0.5815392261505143,
      "learning_rate": 5e-06,
      "loss": 0.7337,
      "step": 1960
    },
    {
      "epoch": 2.8764373060777513,
      "grad_norm": 0.7488694605510656,
      "learning_rate": 5e-06,
      "loss": 0.7362,
      "step": 1970
    },
    {
      "epoch": 2.891038510677131,
      "grad_norm": 0.5769073126410138,
      "learning_rate": 5e-06,
      "loss": 0.7359,
      "step": 1980
    },
    {
      "epoch": 2.9056397152765103,
      "grad_norm": 0.5750570915989177,
      "learning_rate": 5e-06,
      "loss": 0.7331,
      "step": 1990
    },
    {
      "epoch": 2.92024091987589,
      "grad_norm": 0.5354199731148004,
      "learning_rate": 5e-06,
      "loss": 0.7341,
      "step": 2000
    },
    {
      "epoch": 2.9348421244752694,
      "grad_norm": 0.5855570342179945,
      "learning_rate": 5e-06,
      "loss": 0.7404,
      "step": 2010
    },
    {
      "epoch": 2.949443329074649,
      "grad_norm": 0.6261526281235102,
      "learning_rate": 5e-06,
      "loss": 0.7337,
      "step": 2020
    },
    {
      "epoch": 2.964044533674028,
      "grad_norm": 0.5504549828167312,
      "learning_rate": 5e-06,
      "loss": 0.7348,
      "step": 2030
    },
    {
      "epoch": 2.9786457382734075,
      "grad_norm": 0.529021801831048,
      "learning_rate": 5e-06,
      "loss": 0.7354,
      "step": 2040
    },
    {
      "epoch": 2.993246942872787,
      "grad_norm": 0.5245972765419218,
      "learning_rate": 5e-06,
      "loss": 0.7372,
      "step": 2050
    },
    {
      "epoch": 2.9961671837926627,
      "eval_loss": 0.8026307821273804,
      "eval_runtime": 732.5471,
      "eval_samples_per_second": 25.193,
      "eval_steps_per_second": 0.395,
      "step": 2052
    },
    {
      "epoch": 2.9961671837926627,
      "step": 2052,
      "total_flos": 3436967047987200.0,
      "train_loss": 0.7862561077286161,
      "train_runtime": 121149.8135,
      "train_samples_per_second": 8.683,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 2052,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3436967047987200.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
|