|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.998204667863555,
  "eval_steps": 500,
  "global_step": 2505,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.011968880909634948, "grad_norm": 26.500904248426494, "learning_rate": 5e-06, "loss": 1.0918, "step": 10 },
    { "epoch": 0.023937761819269897, "grad_norm": 2.050351070135692, "learning_rate": 5e-06, "loss": 0.9643, "step": 20 },
    { "epoch": 0.03590664272890485, "grad_norm": 1.5804902490894754, "learning_rate": 5e-06, "loss": 0.8994, "step": 30 },
    { "epoch": 0.047875523638539794, "grad_norm": 1.3296407219003534, "learning_rate": 5e-06, "loss": 0.8624, "step": 40 },
    { "epoch": 0.059844404548174746, "grad_norm": 1.2478037629085867, "learning_rate": 5e-06, "loss": 0.8386, "step": 50 },
    { "epoch": 0.0718132854578097, "grad_norm": 1.5244344651863047, "learning_rate": 5e-06, "loss": 0.8163, "step": 60 },
    { "epoch": 0.08378216636744465, "grad_norm": 1.4588200289873845, "learning_rate": 5e-06, "loss": 0.8072, "step": 70 },
    { "epoch": 0.09575104727707959, "grad_norm": 1.0982027372762801, "learning_rate": 5e-06, "loss": 0.793, "step": 80 },
    { "epoch": 0.10771992818671454, "grad_norm": 0.8816222607041236, "learning_rate": 5e-06, "loss": 0.7803, "step": 90 },
    { "epoch": 0.11968880909634949, "grad_norm": 0.756880071100982, "learning_rate": 5e-06, "loss": 0.7751, "step": 100 },
    { "epoch": 0.13165769000598443, "grad_norm": 1.055426572397012, "learning_rate": 5e-06, "loss": 0.7709, "step": 110 },
    { "epoch": 0.1436265709156194, "grad_norm": 0.6572909169517717, "learning_rate": 5e-06, "loss": 0.7696, "step": 120 },
    { "epoch": 0.15559545182525433, "grad_norm": 0.9133460033726442, "learning_rate": 5e-06, "loss": 0.7674, "step": 130 },
    { "epoch": 0.1675643327348893, "grad_norm": 1.3148870123285326, "learning_rate": 5e-06, "loss": 0.7689, "step": 140 },
    { "epoch": 0.17953321364452424, "grad_norm": 1.2147160089970106, "learning_rate": 5e-06, "loss": 0.7629, "step": 150 },
    { "epoch": 0.19150209455415917, "grad_norm": 1.0090571814802656, "learning_rate": 5e-06, "loss": 0.7539, "step": 160 },
    { "epoch": 0.20347097546379414, "grad_norm": 0.6786427727932486, "learning_rate": 5e-06, "loss": 0.7581, "step": 170 },
    { "epoch": 0.21543985637342908, "grad_norm": 0.6292227320745046, "learning_rate": 5e-06, "loss": 0.7586, "step": 180 },
    { "epoch": 0.22740873728306404, "grad_norm": 0.7713241812517966, "learning_rate": 5e-06, "loss": 0.7502, "step": 190 },
    { "epoch": 0.23937761819269898, "grad_norm": 0.7205115952689147, "learning_rate": 5e-06, "loss": 0.7547, "step": 200 },
    { "epoch": 0.2513464991023339, "grad_norm": 0.6517310095502118, "learning_rate": 5e-06, "loss": 0.7483, "step": 210 },
    { "epoch": 0.26331538001196886, "grad_norm": 0.9369409311428439, "learning_rate": 5e-06, "loss": 0.746, "step": 220 },
    { "epoch": 0.27528426092160385, "grad_norm": 0.6858210048955122, "learning_rate": 5e-06, "loss": 0.7441, "step": 230 },
    { "epoch": 0.2872531418312388, "grad_norm": 0.7081095639219623, "learning_rate": 5e-06, "loss": 0.7445, "step": 240 },
    { "epoch": 0.2992220227408737, "grad_norm": 0.8305808826463342, "learning_rate": 5e-06, "loss": 0.7422, "step": 250 },
    { "epoch": 0.31119090365050867, "grad_norm": 0.641119207471586, "learning_rate": 5e-06, "loss": 0.7436, "step": 260 },
    { "epoch": 0.3231597845601436, "grad_norm": 0.6092200095597452, "learning_rate": 5e-06, "loss": 0.742, "step": 270 },
    { "epoch": 0.3351286654697786, "grad_norm": 0.7329815044595642, "learning_rate": 5e-06, "loss": 0.7446, "step": 280 },
    { "epoch": 0.34709754637941354, "grad_norm": 0.837913540141014, "learning_rate": 5e-06, "loss": 0.7405, "step": 290 },
    { "epoch": 0.3590664272890485, "grad_norm": 0.7796180622052511, "learning_rate": 5e-06, "loss": 0.7361, "step": 300 },
    { "epoch": 0.3710353081986834, "grad_norm": 0.6206194415830822, "learning_rate": 5e-06, "loss": 0.7385, "step": 310 },
    { "epoch": 0.38300418910831835, "grad_norm": 0.807676253948667, "learning_rate": 5e-06, "loss": 0.7354, "step": 320 },
    { "epoch": 0.39497307001795334, "grad_norm": 0.6508042723489613, "learning_rate": 5e-06, "loss": 0.7342, "step": 330 },
    { "epoch": 0.4069419509275883, "grad_norm": 1.127544531436106, "learning_rate": 5e-06, "loss": 0.7393, "step": 340 },
    { "epoch": 0.4189108318372232, "grad_norm": 0.7264448945571097, "learning_rate": 5e-06, "loss": 0.7335, "step": 350 },
    { "epoch": 0.43087971274685816, "grad_norm": 0.7993530026874973, "learning_rate": 5e-06, "loss": 0.7367, "step": 360 },
    { "epoch": 0.4428485936564931, "grad_norm": 0.9608462791926143, "learning_rate": 5e-06, "loss": 0.7402, "step": 370 },
    { "epoch": 0.4548174745661281, "grad_norm": 0.7292313869577105, "learning_rate": 5e-06, "loss": 0.7346, "step": 380 },
    { "epoch": 0.466786355475763, "grad_norm": 1.047700429826994, "learning_rate": 5e-06, "loss": 0.7306, "step": 390 },
    { "epoch": 0.47875523638539796, "grad_norm": 0.7162891952015032, "learning_rate": 5e-06, "loss": 0.7298, "step": 400 },
    { "epoch": 0.4907241172950329, "grad_norm": 0.8782360080252781, "learning_rate": 5e-06, "loss": 0.7313, "step": 410 },
    { "epoch": 0.5026929982046678, "grad_norm": 0.6666343216734302, "learning_rate": 5e-06, "loss": 0.7314, "step": 420 },
    { "epoch": 0.5146618791143028, "grad_norm": 0.8384501076164775, "learning_rate": 5e-06, "loss": 0.7355, "step": 430 },
    { "epoch": 0.5266307600239377, "grad_norm": 0.6150369560469455, "learning_rate": 5e-06, "loss": 0.7298, "step": 440 },
    { "epoch": 0.5385996409335727, "grad_norm": 0.6124319945799728, "learning_rate": 5e-06, "loss": 0.7292, "step": 450 },
    { "epoch": 0.5505685218432077, "grad_norm": 0.6189565248445754, "learning_rate": 5e-06, "loss": 0.7263, "step": 460 },
    { "epoch": 0.5625374027528426, "grad_norm": 0.6421927415859262, "learning_rate": 5e-06, "loss": 0.7299, "step": 470 },
    { "epoch": 0.5745062836624776, "grad_norm": 0.7163639686041191, "learning_rate": 5e-06, "loss": 0.7288, "step": 480 },
    { "epoch": 0.5864751645721125, "grad_norm": 0.7901780871296374, "learning_rate": 5e-06, "loss": 0.7233, "step": 490 },
    { "epoch": 0.5984440454817475, "grad_norm": 0.6075768057357427, "learning_rate": 5e-06, "loss": 0.7285, "step": 500 },
    { "epoch": 0.6104129263913824, "grad_norm": 0.7432513250953633, "learning_rate": 5e-06, "loss": 0.7255, "step": 510 },
    { "epoch": 0.6223818073010173, "grad_norm": 0.6882126061650138, "learning_rate": 5e-06, "loss": 0.7228, "step": 520 },
    { "epoch": 0.6343506882106523, "grad_norm": 0.6444331452668448, "learning_rate": 5e-06, "loss": 0.7208, "step": 530 },
    { "epoch": 0.6463195691202872, "grad_norm": 0.6113206044775283, "learning_rate": 5e-06, "loss": 0.725, "step": 540 },
    { "epoch": 0.6582884500299222, "grad_norm": 0.6186132585168679, "learning_rate": 5e-06, "loss": 0.7237, "step": 550 },
    { "epoch": 0.6702573309395572, "grad_norm": 0.8008035313826654, "learning_rate": 5e-06, "loss": 0.7214, "step": 560 },
    { "epoch": 0.6822262118491921, "grad_norm": 0.7712979616981473, "learning_rate": 5e-06, "loss": 0.7201, "step": 570 },
    { "epoch": 0.6941950927588271, "grad_norm": 0.771784813198081, "learning_rate": 5e-06, "loss": 0.712, "step": 580 },
    { "epoch": 0.706163973668462, "grad_norm": 0.7490151127607958, "learning_rate": 5e-06, "loss": 0.7186, "step": 590 },
    { "epoch": 0.718132854578097, "grad_norm": 0.7472145149920045, "learning_rate": 5e-06, "loss": 0.7215, "step": 600 },
    { "epoch": 0.7301017354877319, "grad_norm": 0.669766192904856, "learning_rate": 5e-06, "loss": 0.7203, "step": 610 },
    { "epoch": 0.7420706163973668, "grad_norm": 0.7074773906209623, "learning_rate": 5e-06, "loss": 0.7206, "step": 620 },
    { "epoch": 0.7540394973070018, "grad_norm": 0.6359019991344024, "learning_rate": 5e-06, "loss": 0.7173, "step": 630 },
    { "epoch": 0.7660083782166367, "grad_norm": 0.7248185956514057, "learning_rate": 5e-06, "loss": 0.715, "step": 640 },
    { "epoch": 0.7779772591262717, "grad_norm": 0.5916566575834068, "learning_rate": 5e-06, "loss": 0.7153, "step": 650 },
    { "epoch": 0.7899461400359067, "grad_norm": 0.9435696828561574, "learning_rate": 5e-06, "loss": 0.7132, "step": 660 },
    { "epoch": 0.8019150209455416, "grad_norm": 1.043671294723693, "learning_rate": 5e-06, "loss": 0.7208, "step": 670 },
    { "epoch": 0.8138839018551766, "grad_norm": 0.6012906477211288, "learning_rate": 5e-06, "loss": 0.7169, "step": 680 },
    { "epoch": 0.8258527827648114, "grad_norm": 0.6760825364942948, "learning_rate": 5e-06, "loss": 0.7099, "step": 690 },
    { "epoch": 0.8378216636744464, "grad_norm": 0.6518956331777058, "learning_rate": 5e-06, "loss": 0.7148, "step": 700 },
    { "epoch": 0.8497905445840814, "grad_norm": 0.7071013655191297, "learning_rate": 5e-06, "loss": 0.714, "step": 710 },
    { "epoch": 0.8617594254937163, "grad_norm": 0.6247282830782587, "learning_rate": 5e-06, "loss": 0.7147, "step": 720 },
    { "epoch": 0.8737283064033513, "grad_norm": 0.6919222170502861, "learning_rate": 5e-06, "loss": 0.7128, "step": 730 },
    { "epoch": 0.8856971873129862, "grad_norm": 0.6537145672155599, "learning_rate": 5e-06, "loss": 0.7162, "step": 740 },
    { "epoch": 0.8976660682226212, "grad_norm": 0.68375157978155, "learning_rate": 5e-06, "loss": 0.7147, "step": 750 },
    { "epoch": 0.9096349491322562, "grad_norm": 0.6647354124963691, "learning_rate": 5e-06, "loss": 0.7126, "step": 760 },
    { "epoch": 0.9216038300418911, "grad_norm": 0.5977043002608496, "learning_rate": 5e-06, "loss": 0.7149, "step": 770 },
    { "epoch": 0.933572710951526, "grad_norm": 0.5976562834062398, "learning_rate": 5e-06, "loss": 0.715, "step": 780 },
    { "epoch": 0.9455415918611609, "grad_norm": 0.6047572754156897, "learning_rate": 5e-06, "loss": 0.7098, "step": 790 },
    { "epoch": 0.9575104727707959, "grad_norm": 0.7904694447754029, "learning_rate": 5e-06, "loss": 0.7144, "step": 800 },
    { "epoch": 0.9694793536804309, "grad_norm": 0.5807742093881076, "learning_rate": 5e-06, "loss": 0.7101, "step": 810 },
    { "epoch": 0.9814482345900658, "grad_norm": 0.7039850599689602, "learning_rate": 5e-06, "loss": 0.7148, "step": 820 },
    { "epoch": 0.9934171154997008, "grad_norm": 0.6174865022883058, "learning_rate": 5e-06, "loss": 0.7062, "step": 830 },
    { "epoch": 0.9994015559545183, "eval_loss": 0.7114372849464417, "eval_runtime": 147.5298, "eval_samples_per_second": 152.606, "eval_steps_per_second": 0.596, "step": 835 },
    { "epoch": 1.0053859964093357, "grad_norm": 0.8962593548100513, "learning_rate": 5e-06, "loss": 0.728, "step": 840 },
    { "epoch": 1.0173548773189707, "grad_norm": 0.7772228818285795, "learning_rate": 5e-06, "loss": 0.6687, "step": 850 },
    { "epoch": 1.0293237582286057, "grad_norm": 0.7166716512779154, "learning_rate": 5e-06, "loss": 0.6638, "step": 860 },
    { "epoch": 1.0412926391382407, "grad_norm": 0.6621044482167691, "learning_rate": 5e-06, "loss": 0.6701, "step": 870 },
    { "epoch": 1.0532615200478754, "grad_norm": 0.6053908490954104, "learning_rate": 5e-06, "loss": 0.6647, "step": 880 },
    { "epoch": 1.0652304009575104, "grad_norm": 0.6887973939100516, "learning_rate": 5e-06, "loss": 0.6638, "step": 890 },
    { "epoch": 1.0771992818671454, "grad_norm": 0.702826603937809, "learning_rate": 5e-06, "loss": 0.6683, "step": 900 },
    { "epoch": 1.0891681627767804, "grad_norm": 0.7995617189411401, "learning_rate": 5e-06, "loss": 0.6661, "step": 910 },
    { "epoch": 1.1011370436864154, "grad_norm": 0.6519665847110662, "learning_rate": 5e-06, "loss": 0.6656, "step": 920 },
    { "epoch": 1.1131059245960502, "grad_norm": 0.6273625611261167, "learning_rate": 5e-06, "loss": 0.6642, "step": 930 },
    { "epoch": 1.1250748055056852, "grad_norm": 0.7185612601640737, "learning_rate": 5e-06, "loss": 0.6655, "step": 940 },
    { "epoch": 1.1370436864153202, "grad_norm": 0.6572849791275432, "learning_rate": 5e-06, "loss": 0.6694, "step": 950 },
    { "epoch": 1.1490125673249552, "grad_norm": 0.6670384809386003, "learning_rate": 5e-06, "loss": 0.6702, "step": 960 },
    { "epoch": 1.1609814482345902, "grad_norm": 0.6742291672674092, "learning_rate": 5e-06, "loss": 0.6694, "step": 970 },
    { "epoch": 1.172950329144225, "grad_norm": 0.7341117675868765, "learning_rate": 5e-06, "loss": 0.6661, "step": 980 },
    { "epoch": 1.18491921005386, "grad_norm": 0.647754764205428, "learning_rate": 5e-06, "loss": 0.6626, "step": 990 },
    { "epoch": 1.196888090963495, "grad_norm": 0.6349210444739344, "learning_rate": 5e-06, "loss": 0.673, "step": 1000 },
    { "epoch": 1.20885697187313, "grad_norm": 0.7055438799763827, "learning_rate": 5e-06, "loss": 0.6678, "step": 1010 },
    { "epoch": 1.220825852782765, "grad_norm": 0.5932598557482555, "learning_rate": 5e-06, "loss": 0.6636, "step": 1020 },
    { "epoch": 1.2327947336923997, "grad_norm": 0.5945872005782129, "learning_rate": 5e-06, "loss": 0.6734, "step": 1030 },
    { "epoch": 1.2447636146020347, "grad_norm": 0.6634460553155971, "learning_rate": 5e-06, "loss": 0.6686, "step": 1040 },
    { "epoch": 1.2567324955116697, "grad_norm": 0.6119696190494924, "learning_rate": 5e-06, "loss": 0.6632, "step": 1050 },
    { "epoch": 1.2687013764213046, "grad_norm": 0.5680998374934575, "learning_rate": 5e-06, "loss": 0.6639, "step": 1060 },
    { "epoch": 1.2806702573309394, "grad_norm": 0.7632716086431686, "learning_rate": 5e-06, "loss": 0.6707, "step": 1070 },
    { "epoch": 1.2926391382405744, "grad_norm": 0.5773596573256996, "learning_rate": 5e-06, "loss": 0.667, "step": 1080 },
    { "epoch": 1.3046080191502094, "grad_norm": 0.7249658131418688, "learning_rate": 5e-06, "loss": 0.6627, "step": 1090 },
    { "epoch": 1.3165769000598444, "grad_norm": 0.6627381756172364, "learning_rate": 5e-06, "loss": 0.67, "step": 1100 },
    { "epoch": 1.3285457809694794, "grad_norm": 0.5614593187441069, "learning_rate": 5e-06, "loss": 0.6647, "step": 1110 },
    { "epoch": 1.3405146618791144, "grad_norm": 0.582853382816816, "learning_rate": 5e-06, "loss": 0.6669, "step": 1120 },
    { "epoch": 1.3524835427887494, "grad_norm": 0.7100796913546954, "learning_rate": 5e-06, "loss": 0.6726, "step": 1130 },
    { "epoch": 1.3644524236983842, "grad_norm": 0.6440445846779764, "learning_rate": 5e-06, "loss": 0.6645, "step": 1140 },
    { "epoch": 1.3764213046080191, "grad_norm": 0.6414609210769443, "learning_rate": 5e-06, "loss": 0.6649, "step": 1150 },
    { "epoch": 1.3883901855176541, "grad_norm": 0.6066442073205667, "learning_rate": 5e-06, "loss": 0.6671, "step": 1160 },
    { "epoch": 1.400359066427289, "grad_norm": 0.5517122631957174, "learning_rate": 5e-06, "loss": 0.6676, "step": 1170 },
    { "epoch": 1.412327947336924, "grad_norm": 0.6431261043778982, "learning_rate": 5e-06, "loss": 0.6639, "step": 1180 },
    { "epoch": 1.424296828246559, "grad_norm": 0.7260721545398744, "learning_rate": 5e-06, "loss": 0.6701, "step": 1190 },
    { "epoch": 1.436265709156194, "grad_norm": 0.7161457181189081, "learning_rate": 5e-06, "loss": 0.6559, "step": 1200 },
    { "epoch": 1.4482345900658289, "grad_norm": 0.6352035083119371, "learning_rate": 5e-06, "loss": 0.6681, "step": 1210 },
    { "epoch": 1.4602034709754639, "grad_norm": 0.7063524982241797, "learning_rate": 5e-06, "loss": 0.6654, "step": 1220 },
    { "epoch": 1.4721723518850989, "grad_norm": 0.6879403956072693, "learning_rate": 5e-06, "loss": 0.6694, "step": 1230 },
    { "epoch": 1.4841412327947336, "grad_norm": 0.6783332447999513, "learning_rate": 5e-06, "loss": 0.6666, "step": 1240 },
    { "epoch": 1.4961101137043686, "grad_norm": 0.6478235600361606, "learning_rate": 5e-06, "loss": 0.6675, "step": 1250 },
    { "epoch": 1.5080789946140036, "grad_norm": 0.5883450422091347, "learning_rate": 5e-06, "loss": 0.6645, "step": 1260 },
    { "epoch": 1.5200478755236384, "grad_norm": 0.6660437650036387, "learning_rate": 5e-06, "loss": 0.6611, "step": 1270 },
    { "epoch": 1.5320167564332734, "grad_norm": 0.5948419571007313, "learning_rate": 5e-06, "loss": 0.6661, "step": 1280 },
    { "epoch": 1.5439856373429084, "grad_norm": 0.5853041784577795, "learning_rate": 5e-06, "loss": 0.6681, "step": 1290 },
    { "epoch": 1.5559545182525434, "grad_norm": 0.7356480278739408, "learning_rate": 5e-06, "loss": 0.6678, "step": 1300 },
    { "epoch": 1.5679233991621784, "grad_norm": 0.6949205522779035, "learning_rate": 5e-06, "loss": 0.6669, "step": 1310 },
    { "epoch": 1.5798922800718134, "grad_norm": 0.6507079001414935, "learning_rate": 5e-06, "loss": 0.67, "step": 1320 },
    { "epoch": 1.5918611609814484, "grad_norm": 0.6100230292150697, "learning_rate": 5e-06, "loss": 0.6653, "step": 1330 },
    { "epoch": 1.6038300418910831, "grad_norm": 0.594443951679209, "learning_rate": 5e-06, "loss": 0.664, "step": 1340 },
    { "epoch": 1.6157989228007181, "grad_norm": 0.6096985220862555, "learning_rate": 5e-06, "loss": 0.6647, "step": 1350 },
    { "epoch": 1.6277678037103531, "grad_norm": 0.5760594804542581, "learning_rate": 5e-06, "loss": 0.6669, "step": 1360 },
    { "epoch": 1.639736684619988, "grad_norm": 0.691475316154413, "learning_rate": 5e-06, "loss": 0.6659, "step": 1370 },
    { "epoch": 1.6517055655296229, "grad_norm": 0.6527081133567483, "learning_rate": 5e-06, "loss": 0.6682, "step": 1380 },
    { "epoch": 1.6636744464392579, "grad_norm": 0.620764034153086, "learning_rate": 5e-06, "loss": 0.6658, "step": 1390 },
    { "epoch": 1.6756433273488929, "grad_norm": 0.5624863643686968, "learning_rate": 5e-06, "loss": 0.6629, "step": 1400 },
    { "epoch": 1.6876122082585279, "grad_norm": 0.5866917073665039, "learning_rate": 5e-06, "loss": 0.662, "step": 1410 },
    { "epoch": 1.6995810891681629, "grad_norm": 0.5725120938110386, "learning_rate": 5e-06, "loss": 0.6679, "step": 1420 },
    { "epoch": 1.7115499700777979, "grad_norm": 0.6259302322367788, "learning_rate": 5e-06, "loss": 0.6681, "step": 1430 },
    { "epoch": 1.7235188509874326, "grad_norm": 0.6855515829037336, "learning_rate": 5e-06, "loss": 0.6642, "step": 1440 },
    { "epoch": 1.7354877318970676, "grad_norm": 0.586477759363614, "learning_rate": 5e-06, "loss": 0.6645, "step": 1450 },
    { "epoch": 1.7474566128067026, "grad_norm": 0.7473190145452194, "learning_rate": 5e-06, "loss": 0.6657, "step": 1460 },
    { "epoch": 1.7594254937163374, "grad_norm": 0.6856637572630547, "learning_rate": 5e-06, "loss": 0.6666, "step": 1470 },
    { "epoch": 1.7713943746259724, "grad_norm": 0.6356257155539595, "learning_rate": 5e-06, "loss": 0.6632, "step": 1480 },
    { "epoch": 1.7833632555356074, "grad_norm": 0.5780948416794784, "learning_rate": 5e-06, "loss": 0.6651, "step": 1490 },
    { "epoch": 1.7953321364452424, "grad_norm": 0.6040049759436176, "learning_rate": 5e-06, "loss": 0.6687, "step": 1500 },
    { "epoch": 1.8073010173548774, "grad_norm": 0.6116754536140059, "learning_rate": 5e-06, "loss": 0.6611, "step": 1510 },
    { "epoch": 1.8192698982645124, "grad_norm": 0.6285933290005205, "learning_rate": 5e-06, "loss": 0.6685, "step": 1520 },
    { "epoch": 1.8312387791741473, "grad_norm": 0.6069230952248156, "learning_rate": 5e-06, "loss": 0.6633, "step": 1530 },
    { "epoch": 1.8432076600837821, "grad_norm": 0.5757493835177011, "learning_rate": 5e-06, "loss": 0.6595, "step": 1540 },
    { "epoch": 1.8551765409934171, "grad_norm": 0.6335783323377564, "learning_rate": 5e-06, "loss": 0.6667, "step": 1550 },
    { "epoch": 1.867145421903052, "grad_norm": 0.5676650286620293, "learning_rate": 5e-06, "loss": 0.662, "step": 1560 },
    { "epoch": 1.8791143028126869, "grad_norm": 0.6035969519376175, "learning_rate": 5e-06, "loss": 0.6628, "step": 1570 },
    { "epoch": 1.8910831837223219, "grad_norm": 0.5896338538296376, "learning_rate": 5e-06, "loss": 0.664, "step": 1580 },
    { "epoch": 1.9030520646319569, "grad_norm": 0.6039070875737724, "learning_rate": 5e-06, "loss": 0.6658, "step": 1590 },
    { "epoch": 1.9150209455415919, "grad_norm": 0.6567541390615723, "learning_rate": 5e-06, "loss": 0.6622, "step": 1600 },
    { "epoch": 1.9269898264512269, "grad_norm": 0.6276807530688414, "learning_rate": 5e-06, "loss": 0.6604, "step": 1610 },
    { "epoch": 1.9389587073608618, "grad_norm": 0.8336847726568148, "learning_rate": 5e-06, "loss": 0.6663, "step": 1620 },
    { "epoch": 1.9509275882704968, "grad_norm": 0.7013759476620051, "learning_rate": 5e-06, "loss": 0.6664, "step": 1630 },
    { "epoch": 1.9628964691801316, "grad_norm": 0.611731666630219, "learning_rate": 5e-06, "loss": 0.6663, "step": 1640 },
    { "epoch": 1.9748653500897666, "grad_norm": 0.7128314798758998, "learning_rate": 5e-06, "loss": 0.6692, "step": 1650 },
    { "epoch": 1.9868342309994016, "grad_norm": 0.6475123732829592, "learning_rate": 5e-06, "loss": 0.6627, "step": 1660 },
    { "epoch": 1.9988031119090364, "grad_norm": 0.6022136617663713, "learning_rate": 5e-06, "loss": 0.6629, "step": 1670 },
    { "epoch": 2.0, "eval_loss": 0.7005711197853088, "eval_runtime": 146.1272, "eval_samples_per_second": 154.071, "eval_steps_per_second": 0.602, "step": 1671 },
    { "epoch": 2.0107719928186714, "grad_norm": 0.7068198157977486, "learning_rate": 5e-06, "loss": 0.6508, "step": 1680 },
    { "epoch": 2.0227408737283064, "grad_norm": 0.7453932940551323, "learning_rate": 5e-06, "loss": 0.6156, "step": 1690 },
    { "epoch": 2.0347097546379413, "grad_norm": 0.8134275271355035, "learning_rate": 5e-06, "loss": 0.6119, "step": 1700 },
    { "epoch": 2.0466786355475763, "grad_norm": 0.7119278959971719, "learning_rate": 5e-06, "loss": 0.6121, "step": 1710 },
    { "epoch": 2.0586475164572113, "grad_norm": 0.581844000106152, "learning_rate": 5e-06, "loss": 0.614, "step": 1720 },
    { "epoch": 2.0706163973668463, "grad_norm": 0.6161063490938795, "learning_rate": 5e-06, "loss": 0.6179, "step": 1730 },
    { "epoch": 2.0825852782764813, "grad_norm": 0.6987854084656979, "learning_rate": 5e-06, "loss": 0.6166, "step": 1740 },
    { "epoch": 2.0945541591861163, "grad_norm": 0.6139695159601696, "learning_rate": 5e-06, "loss": 0.6181, "step": 1750 },
    { "epoch": 2.106523040095751, "grad_norm": 0.6608056746575605, "learning_rate": 5e-06, "loss": 0.6169, "step": 1760 },
    { "epoch": 2.118491921005386, "grad_norm": 0.8330792159736297, "learning_rate": 5e-06, "loss": 0.621, "step": 1770 },
    { "epoch": 2.130460801915021, "grad_norm": 0.6496515220438018, "learning_rate": 5e-06, "loss": 0.6168, "step": 1780 },
    { "epoch": 2.142429682824656, "grad_norm": 0.5764478857585879, "learning_rate": 5e-06, "loss": 0.6172, "step": 1790 },
    { "epoch": 2.154398563734291, "grad_norm": 0.5983155038303072, "learning_rate": 5e-06, "loss": 0.6165, "step": 1800 },
    { "epoch": 2.166367444643926, "grad_norm": 0.6670875041460964, "learning_rate": 5e-06, "loss": 0.6154, "step": 1810 },
    { "epoch": 2.178336325553561, "grad_norm": 0.5905406904657257, "learning_rate": 5e-06, "loss": 0.6136, "step": 1820 },
    { "epoch": 2.190305206463196, "grad_norm": 0.7480577496080604, "learning_rate": 5e-06, "loss": 0.6174, "step": 1830 },
    { "epoch": 2.202274087372831, "grad_norm": 0.6481682954787796, "learning_rate": 5e-06, "loss": 0.6193, "step": 1840 },
    { "epoch": 2.2142429682824654, "grad_norm": 0.697692521344505, "learning_rate": 5e-06, "loss": 0.6151, "step": 1850 },
    { "epoch": 2.2262118491921004, "grad_norm": 0.6518002225384566, "learning_rate": 5e-06, "loss": 0.62, "step": 1860 },
    { "epoch": 2.2381807301017353, "grad_norm": 0.7977413337144527, "learning_rate": 5e-06, "loss": 0.6174, "step": 1870 },
    { "epoch": 2.2501496110113703, "grad_norm": 0.6920554891456565, "learning_rate": 5e-06, "loss": 0.6206, "step": 1880 },
    { "epoch": 2.2621184919210053, "grad_norm": 1.0045237891952645, "learning_rate": 5e-06, "loss": 0.6174, "step": 1890 },
    { "epoch": 2.2740873728306403, "grad_norm": 0.7405972846803555, "learning_rate": 5e-06, "loss": 0.6218, "step": 1900 },
    { "epoch": 2.2860562537402753, "grad_norm": 0.7565616201062902, "learning_rate": 5e-06, "loss": 0.6212, "step": 1910 },
    { "epoch": 2.2980251346499103, "grad_norm": 0.6602100152398376, "learning_rate": 5e-06, "loss": 0.6165, "step": 1920 },
    { "epoch": 2.3099940155595453, "grad_norm": 0.6690244571651464, "learning_rate": 5e-06, "loss": 0.6213, "step": 1930 },
    { "epoch": 2.3219628964691803, "grad_norm": 0.7205634813241224, "learning_rate": 5e-06, "loss": 0.6199, "step": 1940 },
    { "epoch": 2.3339317773788153, "grad_norm": 0.6533808767235285, "learning_rate": 5e-06, "loss": 0.6197, "step": 1950 },
    { "epoch": 2.34590065828845, "grad_norm": 0.6671459242816263, "learning_rate": 5e-06, "loss": 0.6248, "step": 1960 },
    { "epoch": 2.357869539198085, "grad_norm": 0.5765039507445094, "learning_rate": 5e-06, "loss": 0.6198, "step": 1970 },
    { "epoch": 2.36983842010772, "grad_norm": 0.6003833286471706, "learning_rate": 5e-06, "loss": 0.6211, "step": 1980 },
    { "epoch": 2.381807301017355, "grad_norm": 0.6310887330444114, "learning_rate": 5e-06, "loss": 0.6181, "step": 1990 },
    { "epoch": 2.39377618192699, "grad_norm": 0.6533436296134522, "learning_rate": 5e-06, "loss": 0.6223, "step": 2000 },
    { "epoch": 2.405745062836625, "grad_norm": 0.7613938683728527, "learning_rate": 5e-06, "loss": 0.6209, "step": 2010 },
    { "epoch": 2.41771394374626, "grad_norm": 0.7227768963538012, "learning_rate": 5e-06, "loss": 0.6203, "step": 2020 },
    { "epoch": 2.429682824655895, "grad_norm": 0.8304498753841795, "learning_rate": 5e-06, "loss": 0.6154, "step": 2030 },
    { "epoch": 2.44165170556553, "grad_norm": 0.7574321335984159, "learning_rate": 5e-06, "loss": 0.6221, "step": 2040 },
    { "epoch": 2.4536205864751643, "grad_norm": 0.6154411257768034, "learning_rate": 5e-06, "loss": 0.6213, "step": 2050 },
    { "epoch": 2.4655894673847993, "grad_norm": 0.6619336134055549, "learning_rate": 5e-06, "loss": 0.6229, "step": 2060 },
    { "epoch": 2.4775583482944343, "grad_norm": 0.5882549911232717, "learning_rate": 5e-06, "loss": 0.62, "step": 2070 },
    { "epoch": 2.4895272292040693, "grad_norm": 0.6446559545029906, "learning_rate": 5e-06, "loss": 0.6256, "step": 2080 },
    { "epoch": 2.5014961101137043, "grad_norm": 0.6769098922036072, "learning_rate": 5e-06, "loss": 0.6233, "step": 2090 },
    { "epoch": 2.5134649910233393, "grad_norm": 0.7292441062228914, "learning_rate": 5e-06, "loss": 0.6193, "step": 2100 },
    { "epoch": 2.5254338719329743, "grad_norm": 0.8318160977446282, "learning_rate": 5e-06, "loss": 0.6187, "step": 2110 },
    { "epoch": 2.5374027528426093, "grad_norm": 0.6089671262894073, "learning_rate": 5e-06, "loss": 0.624, "step": 2120 },
    { "epoch": 2.5493716337522443, "grad_norm": 0.7615201829189324, "learning_rate": 5e-06, "loss": 0.6171, "step": 2130 },
    { "epoch": 2.561340514661879, "grad_norm": 0.724596159991658, "learning_rate": 5e-06, "loss": 0.6217, "step": 2140 },
    { "epoch": 2.5733093955715143, "grad_norm": 0.612004698756559, "learning_rate": 5e-06, "loss": 0.6226, "step": 2150 },
    { "epoch": 2.585278276481149, "grad_norm": 0.596628658148184, "learning_rate": 5e-06, "loss": 0.6271, "step": 2160 },
    { "epoch": 2.597247157390784, "grad_norm": 0.6069344442414866, "learning_rate": 5e-06, "loss": 0.6224, "step": 2170 },
    { "epoch": 2.609216038300419, "grad_norm": 0.6286980131185356, "learning_rate": 5e-06, "loss": 0.6212, "step": 2180 },
    { "epoch": 2.621184919210054, "grad_norm": 0.6326929087930854, "learning_rate": 5e-06, "loss": 0.6235, "step": 2190 },
    { "epoch": 2.633153800119689, "grad_norm": 0.7985721977529328, "learning_rate": 5e-06, "loss": 0.6237, "step": 2200 },
    { "epoch": 2.645122681029324, "grad_norm": 0.644881510300382, "learning_rate": 5e-06, "loss": 0.6269, "step": 2210 },
    { "epoch": 2.657091561938959, "grad_norm": 0.6012955887624781, "learning_rate": 5e-06, "loss": 0.6213, "step": 2220 },
    { "epoch": 2.669060442848594, "grad_norm": 0.7267444291377996, "learning_rate": 5e-06, "loss": 0.6261, "step": 2230 },
    { "epoch": 2.6810293237582288, "grad_norm": 0.7195410501187053, "learning_rate": 5e-06, "loss": 0.6238, "step": 2240 },
    { "epoch": 2.6929982046678633, "grad_norm": 0.6633140798395725, "learning_rate": 5e-06, "loss": 0.6224, "step": 2250 },
    { "epoch": 2.7049670855774988, "grad_norm": 0.6423861744102355, "learning_rate": 5e-06, "loss": 0.6245, "step": 2260 },
    { "epoch": 2.7169359664871333, "grad_norm": 0.6091203181297667, "learning_rate": 5e-06, "loss": 0.626, "step": 2270 },
    { "epoch": 2.7289048473967683, "grad_norm": 0.5794538334059404, "learning_rate": 5e-06, "loss": 0.6213, "step": 2280 },
    { "epoch": 2.7408737283064033, "grad_norm": 0.6730274420363707, "learning_rate": 5e-06, "loss": 0.6257, "step": 2290 },
    { "epoch": 2.7528426092160383, "grad_norm": 0.7260668423987153, "learning_rate": 5e-06, "loss": 0.6209, "step": 2300 },
    { "epoch": 2.7648114901256733, "grad_norm": 0.6505297721856697, "learning_rate": 5e-06, "loss": 0.627, "step": 2310 },
    { "epoch": 2.7767803710353083, "grad_norm": 0.6701846540950109, "learning_rate": 5e-06, "loss": 0.626, "step": 2320 },
    { "epoch": 2.7887492519449433, "grad_norm": 0.6651727549955714, "learning_rate": 5e-06, "loss": 0.6284, "step": 2330 },
    { "epoch": 2.800718132854578, "grad_norm": 0.5926739432014144, "learning_rate": 5e-06, "loss": 0.6277, "step": 2340 },
    { "epoch": 2.8126870137642133, "grad_norm": 0.5816608379432976, "learning_rate": 5e-06, "loss": 0.6214, "step": 2350 },
    { "epoch": 2.824655894673848, "grad_norm": 0.6395212133787649, "learning_rate": 5e-06, "loss": 0.6176, "step": 2360 },
    { "epoch": 2.836624775583483, "grad_norm": 0.6634354922942822, "learning_rate": 5e-06, "loss": 0.6228, "step": 2370 },
    { "epoch": 2.848593656493118, "grad_norm": 0.608805000460622, "learning_rate": 5e-06, "loss": 0.6236, "step": 2380 },
    { "epoch": 2.860562537402753, "grad_norm": 0.6392304210790648, "learning_rate": 5e-06, "loss": 0.6247, "step": 2390 },
    { "epoch": 2.872531418312388, "grad_norm": 0.6074342205287091, "learning_rate": 5e-06, "loss": 0.6227, "step": 2400 },
    { "epoch": 2.884500299222023, "grad_norm": 0.5852140611153283, "learning_rate": 5e-06, "loss": 0.625, "step": 2410 },
    { "epoch": 2.8964691801316578, "grad_norm": 0.7091297497420493, "learning_rate": 5e-06, "loss": 0.6195, "step": 2420 },
    { "epoch": 2.9084380610412928, "grad_norm": 0.5525054193264387, "learning_rate": 5e-06, "loss": 0.6234, "step": 2430 },
    { "epoch": 2.9204069419509278, "grad_norm": 0.7138183691173519, "learning_rate": 5e-06, "loss": 0.6247, "step": 2440 },
    { "epoch": 2.9323758228605623, "grad_norm": 0.5805881225718816, "learning_rate": 5e-06, "loss": 0.623, "step": 2450 },
    { "epoch": 2.9443447037701977, "grad_norm": 0.6015585624593602, "learning_rate": 5e-06, "loss": 0.6258, "step": 2460 },
    { "epoch": 2.9563135846798323, "grad_norm": 0.5689711805490192, "learning_rate": 5e-06, "loss": 0.6196, "step": 2470 },
    { "epoch": 2.9682824655894673, "grad_norm": 0.5963313497836371, "learning_rate": 5e-06, "loss": 0.6265, "step": 2480 },
    { "epoch": 2.9802513464991023, "grad_norm": 0.6519931383722206, "learning_rate": 5e-06, "loss": 0.6217, "step": 2490 },
    { "epoch": 2.9922202274087373, "grad_norm": 0.5587383883911408, "learning_rate": 5e-06, "loss": 0.6193, "step": 2500 },
    { "epoch": 2.998204667863555, "eval_loss": 0.7030414938926697, "eval_runtime": 143.5127, "eval_samples_per_second": 156.878, "eval_steps_per_second": 0.613, "step": 2505 },
    { "epoch": 2.998204667863555, "step": 2505, "total_flos": 4195130781204480.0, "train_loss": 0.6780577095206864, "train_runtime": 21666.0023, "train_samples_per_second": 59.229, "train_steps_per_second": 0.116 }
  ],
  "logging_steps": 10,
  "max_steps": 2505,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4195130781204480.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
|
|