{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.998204667863555, "eval_steps": 500, "global_step": 2505, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011968880909634948, "grad_norm": 26.500904248426494, "learning_rate": 5e-06, "loss": 1.0918, "step": 10 }, { "epoch": 0.023937761819269897, "grad_norm": 2.050351070135692, "learning_rate": 5e-06, "loss": 0.9643, "step": 20 }, { "epoch": 0.03590664272890485, "grad_norm": 1.5804902490894754, "learning_rate": 5e-06, "loss": 0.8994, "step": 30 }, { "epoch": 0.047875523638539794, "grad_norm": 1.3296407219003534, "learning_rate": 5e-06, "loss": 0.8624, "step": 40 }, { "epoch": 0.059844404548174746, "grad_norm": 1.2478037629085867, "learning_rate": 5e-06, "loss": 0.8386, "step": 50 }, { "epoch": 0.0718132854578097, "grad_norm": 1.5244344651863047, "learning_rate": 5e-06, "loss": 0.8163, "step": 60 }, { "epoch": 0.08378216636744465, "grad_norm": 1.4588200289873845, "learning_rate": 5e-06, "loss": 0.8072, "step": 70 }, { "epoch": 0.09575104727707959, "grad_norm": 1.0982027372762801, "learning_rate": 5e-06, "loss": 0.793, "step": 80 }, { "epoch": 0.10771992818671454, "grad_norm": 0.8816222607041236, "learning_rate": 5e-06, "loss": 0.7803, "step": 90 }, { "epoch": 0.11968880909634949, "grad_norm": 0.756880071100982, "learning_rate": 5e-06, "loss": 0.7751, "step": 100 }, { "epoch": 0.13165769000598443, "grad_norm": 1.055426572397012, "learning_rate": 5e-06, "loss": 0.7709, "step": 110 }, { "epoch": 0.1436265709156194, "grad_norm": 0.6572909169517717, "learning_rate": 5e-06, "loss": 0.7696, "step": 120 }, { "epoch": 0.15559545182525433, "grad_norm": 0.9133460033726442, "learning_rate": 5e-06, "loss": 0.7674, "step": 130 }, { "epoch": 0.1675643327348893, "grad_norm": 1.3148870123285326, "learning_rate": 5e-06, "loss": 0.7689, "step": 140 }, { "epoch": 0.17953321364452424, "grad_norm": 1.2147160089970106, "learning_rate": 5e-06, "loss": 0.7629, "step": 150 }, { "epoch": 0.19150209455415917, "grad_norm": 1.0090571814802656, "learning_rate": 5e-06, "loss": 0.7539, "step": 160 }, { "epoch": 0.20347097546379414, "grad_norm": 0.6786427727932486, "learning_rate": 5e-06, "loss": 0.7581, "step": 170 }, { "epoch": 0.21543985637342908, "grad_norm": 0.6292227320745046, "learning_rate": 5e-06, "loss": 0.7586, "step": 180 }, { "epoch": 0.22740873728306404, "grad_norm": 0.7713241812517966, "learning_rate": 5e-06, "loss": 0.7502, "step": 190 }, { "epoch": 0.23937761819269898, "grad_norm": 0.7205115952689147, "learning_rate": 5e-06, "loss": 0.7547, "step": 200 }, { "epoch": 0.2513464991023339, "grad_norm": 0.6517310095502118, "learning_rate": 5e-06, "loss": 0.7483, "step": 210 }, { "epoch": 0.26331538001196886, "grad_norm": 0.9369409311428439, "learning_rate": 5e-06, "loss": 0.746, "step": 220 }, { "epoch": 0.27528426092160385, "grad_norm": 0.6858210048955122, "learning_rate": 5e-06, "loss": 0.7441, "step": 230 }, { "epoch": 0.2872531418312388, "grad_norm": 0.7081095639219623, "learning_rate": 5e-06, "loss": 0.7445, "step": 240 }, { "epoch": 0.2992220227408737, "grad_norm": 0.8305808826463342, "learning_rate": 5e-06, "loss": 0.7422, "step": 250 }, { "epoch": 0.31119090365050867, "grad_norm": 0.641119207471586, "learning_rate": 5e-06, "loss": 0.7436, "step": 260 }, { "epoch": 0.3231597845601436, "grad_norm": 0.6092200095597452, "learning_rate": 5e-06, "loss": 0.742, "step": 270 }, { "epoch": 0.3351286654697786, "grad_norm": 0.7329815044595642, "learning_rate": 5e-06, "loss": 0.7446, "step": 280 }, { "epoch": 0.34709754637941354, "grad_norm": 0.837913540141014, "learning_rate": 5e-06, "loss": 0.7405, "step": 290 }, { "epoch": 0.3590664272890485, "grad_norm": 0.7796180622052511, "learning_rate": 5e-06, "loss": 0.7361, "step": 300 }, { "epoch": 0.3710353081986834, "grad_norm": 0.6206194415830822, "learning_rate": 5e-06, "loss": 0.7385, "step": 310 }, { "epoch": 0.38300418910831835, "grad_norm": 0.807676253948667, "learning_rate": 5e-06, "loss": 0.7354, "step": 320 }, { "epoch": 0.39497307001795334, "grad_norm": 0.6508042723489613, "learning_rate": 5e-06, "loss": 0.7342, "step": 330 }, { "epoch": 0.4069419509275883, "grad_norm": 1.127544531436106, "learning_rate": 5e-06, "loss": 0.7393, "step": 340 }, { "epoch": 0.4189108318372232, "grad_norm": 0.7264448945571097, "learning_rate": 5e-06, "loss": 0.7335, "step": 350 }, { "epoch": 0.43087971274685816, "grad_norm": 0.7993530026874973, "learning_rate": 5e-06, "loss": 0.7367, "step": 360 }, { "epoch": 0.4428485936564931, "grad_norm": 0.9608462791926143, "learning_rate": 5e-06, "loss": 0.7402, "step": 370 }, { "epoch": 0.4548174745661281, "grad_norm": 0.7292313869577105, "learning_rate": 5e-06, "loss": 0.7346, "step": 380 }, { "epoch": 0.466786355475763, "grad_norm": 1.047700429826994, "learning_rate": 5e-06, "loss": 0.7306, "step": 390 }, { "epoch": 0.47875523638539796, "grad_norm": 0.7162891952015032, "learning_rate": 5e-06, "loss": 0.7298, "step": 400 }, { "epoch": 0.4907241172950329, "grad_norm": 0.8782360080252781, "learning_rate": 5e-06, "loss": 0.7313, "step": 410 }, { "epoch": 0.5026929982046678, "grad_norm": 0.6666343216734302, "learning_rate": 5e-06, "loss": 0.7314, "step": 420 }, { "epoch": 0.5146618791143028, "grad_norm": 0.8384501076164775, "learning_rate": 5e-06, "loss": 0.7355, "step": 430 }, { "epoch": 0.5266307600239377, "grad_norm": 0.6150369560469455, "learning_rate": 5e-06, "loss": 0.7298, "step": 440 }, { "epoch": 0.5385996409335727, "grad_norm": 0.6124319945799728, "learning_rate": 5e-06, "loss": 0.7292, "step": 450 }, { "epoch": 0.5505685218432077, "grad_norm": 0.6189565248445754, "learning_rate": 5e-06, "loss": 0.7263, "step": 460 }, { "epoch": 0.5625374027528426, "grad_norm": 0.6421927415859262, "learning_rate": 5e-06, "loss": 0.7299, "step": 470 }, { "epoch": 0.5745062836624776, "grad_norm": 0.7163639686041191, "learning_rate": 5e-06, "loss": 0.7288, "step": 480 }, { "epoch": 0.5864751645721125, "grad_norm": 0.7901780871296374, "learning_rate": 5e-06, "loss": 0.7233, "step": 490 }, { "epoch": 0.5984440454817475, "grad_norm": 0.6075768057357427, "learning_rate": 5e-06, "loss": 0.7285, "step": 500 }, { "epoch": 0.6104129263913824, "grad_norm": 0.7432513250953633, "learning_rate": 5e-06, "loss": 0.7255, "step": 510 }, { "epoch": 0.6223818073010173, "grad_norm": 0.6882126061650138, "learning_rate": 5e-06, "loss": 0.7228, "step": 520 }, { "epoch": 0.6343506882106523, "grad_norm": 0.6444331452668448, "learning_rate": 5e-06, "loss": 0.7208, "step": 530 }, { "epoch": 0.6463195691202872, "grad_norm": 0.6113206044775283, "learning_rate": 5e-06, "loss": 0.725, "step": 540 }, { "epoch": 0.6582884500299222, "grad_norm": 0.6186132585168679, "learning_rate": 5e-06, "loss": 0.7237, "step": 550 }, { "epoch": 0.6702573309395572, "grad_norm": 0.8008035313826654, "learning_rate": 5e-06, "loss": 0.7214, "step": 560 }, { "epoch": 0.6822262118491921, "grad_norm": 0.7712979616981473, "learning_rate": 5e-06, "loss": 0.7201, "step": 570 }, { "epoch": 0.6941950927588271, "grad_norm": 0.771784813198081, "learning_rate": 5e-06, "loss": 0.712, "step": 580 }, { "epoch": 0.706163973668462, "grad_norm": 0.7490151127607958, "learning_rate": 5e-06, "loss": 0.7186, "step": 590 }, { "epoch": 0.718132854578097, "grad_norm": 0.7472145149920045, "learning_rate": 5e-06, "loss": 0.7215, "step": 600 }, { "epoch": 0.7301017354877319, "grad_norm": 0.669766192904856, "learning_rate": 5e-06, "loss": 0.7203, "step": 610 }, { "epoch": 0.7420706163973668, "grad_norm": 0.7074773906209623, "learning_rate": 5e-06, "loss": 0.7206, "step": 620 }, { "epoch": 0.7540394973070018, "grad_norm": 0.6359019991344024, "learning_rate": 5e-06, "loss": 0.7173, "step": 630 }, { "epoch": 0.7660083782166367, "grad_norm": 0.7248185956514057, "learning_rate": 5e-06, "loss": 0.715, "step": 640 }, { "epoch": 0.7779772591262717, "grad_norm": 0.5916566575834068, "learning_rate": 5e-06, "loss": 0.7153, "step": 650 }, { "epoch": 0.7899461400359067, "grad_norm": 0.9435696828561574, "learning_rate": 5e-06, "loss": 0.7132, "step": 660 }, { "epoch": 0.8019150209455416, "grad_norm": 1.043671294723693, "learning_rate": 5e-06, "loss": 0.7208, "step": 670 }, { "epoch": 0.8138839018551766, "grad_norm": 0.6012906477211288, "learning_rate": 5e-06, "loss": 0.7169, "step": 680 }, { "epoch": 0.8258527827648114, "grad_norm": 0.6760825364942948, "learning_rate": 5e-06, "loss": 0.7099, "step": 690 }, { "epoch": 0.8378216636744464, "grad_norm": 0.6518956331777058, "learning_rate": 5e-06, "loss": 0.7148, "step": 700 }, { "epoch": 0.8497905445840814, "grad_norm": 0.7071013655191297, "learning_rate": 5e-06, "loss": 0.714, "step": 710 }, { "epoch": 0.8617594254937163, "grad_norm": 0.6247282830782587, "learning_rate": 5e-06, "loss": 0.7147, "step": 720 }, { "epoch": 0.8737283064033513, "grad_norm": 0.6919222170502861, "learning_rate": 5e-06, "loss": 0.7128, "step": 730 }, { "epoch": 0.8856971873129862, "grad_norm": 0.6537145672155599, "learning_rate": 5e-06, "loss": 0.7162, "step": 740 }, { "epoch": 0.8976660682226212, "grad_norm": 0.68375157978155, "learning_rate": 5e-06, "loss": 0.7147, "step": 750 }, { "epoch": 0.9096349491322562, "grad_norm": 0.6647354124963691, "learning_rate": 5e-06, "loss": 0.7126, "step": 760 }, { "epoch": 0.9216038300418911, "grad_norm": 0.5977043002608496, "learning_rate": 5e-06, "loss": 0.7149, "step": 770 }, { "epoch": 0.933572710951526, "grad_norm": 0.5976562834062398, "learning_rate": 5e-06, "loss": 0.715, "step": 780 }, { "epoch": 0.9455415918611609, "grad_norm": 0.6047572754156897, "learning_rate": 5e-06, "loss": 0.7098, "step": 790 }, { "epoch": 0.9575104727707959, "grad_norm": 0.7904694447754029, "learning_rate": 5e-06, "loss": 0.7144, "step": 800 }, { "epoch": 0.9694793536804309, "grad_norm": 0.5807742093881076, "learning_rate": 5e-06, "loss": 0.7101, "step": 810 }, { "epoch": 0.9814482345900658, "grad_norm": 0.7039850599689602, "learning_rate": 5e-06, "loss": 0.7148, "step": 820 }, { "epoch": 0.9934171154997008, "grad_norm": 0.6174865022883058, "learning_rate": 5e-06, "loss": 0.7062, "step": 830 }, { "epoch": 0.9994015559545183, "eval_loss": 0.7114372849464417, "eval_runtime": 147.5298, "eval_samples_per_second": 152.606, "eval_steps_per_second": 0.596, "step": 835 }, { "epoch": 1.0053859964093357, "grad_norm": 0.8962593548100513, "learning_rate": 5e-06, "loss": 0.728, "step": 840 }, { "epoch": 1.0173548773189707, "grad_norm": 0.7772228818285795, "learning_rate": 5e-06, "loss": 0.6687, "step": 850 }, { "epoch": 1.0293237582286057, "grad_norm": 0.7166716512779154, "learning_rate": 5e-06, "loss": 0.6638, "step": 860 }, { "epoch": 1.0412926391382407, "grad_norm": 0.6621044482167691, "learning_rate": 5e-06, "loss": 0.6701, "step": 870 }, { "epoch": 1.0532615200478754, "grad_norm": 0.6053908490954104, "learning_rate": 5e-06, "loss": 0.6647, "step": 880 }, { "epoch": 1.0652304009575104, "grad_norm": 0.6887973939100516, "learning_rate": 5e-06, "loss": 0.6638, "step": 890 }, { "epoch": 1.0771992818671454, "grad_norm": 0.702826603937809, "learning_rate": 5e-06, "loss": 0.6683, "step": 900 }, { "epoch": 1.0891681627767804, "grad_norm": 0.7995617189411401, "learning_rate": 5e-06, "loss": 0.6661, "step": 910 }, { "epoch": 1.1011370436864154, "grad_norm": 0.6519665847110662, "learning_rate": 5e-06, "loss": 0.6656, "step": 920 }, { "epoch": 1.1131059245960502, "grad_norm": 0.6273625611261167, "learning_rate": 5e-06, "loss": 0.6642, "step": 930 }, { "epoch": 1.1250748055056852, "grad_norm": 0.7185612601640737, "learning_rate": 5e-06, "loss": 0.6655, "step": 940 }, { "epoch": 1.1370436864153202, "grad_norm": 0.6572849791275432, "learning_rate": 5e-06, "loss": 0.6694, "step": 950 }, { "epoch": 1.1490125673249552, "grad_norm": 0.6670384809386003, "learning_rate": 5e-06, "loss": 0.6702, "step": 960 }, { "epoch": 1.1609814482345902, "grad_norm": 0.6742291672674092, "learning_rate": 5e-06, "loss": 0.6694, "step": 970 }, { "epoch": 1.172950329144225, "grad_norm": 0.7341117675868765, "learning_rate": 5e-06, "loss": 0.6661, "step": 980 }, { "epoch": 1.18491921005386, "grad_norm": 0.647754764205428, "learning_rate": 5e-06, "loss": 0.6626, "step": 990 }, { "epoch": 1.196888090963495, "grad_norm": 0.6349210444739344, "learning_rate": 5e-06, "loss": 0.673, "step": 1000 }, { "epoch": 1.20885697187313, "grad_norm": 0.7055438799763827, "learning_rate": 5e-06, "loss": 0.6678, "step": 1010 }, { "epoch": 1.220825852782765, "grad_norm": 0.5932598557482555, "learning_rate": 5e-06, "loss": 0.6636, "step": 1020 }, { "epoch": 1.2327947336923997, "grad_norm": 0.5945872005782129, "learning_rate": 5e-06, "loss": 0.6734, "step": 1030 }, { "epoch": 1.2447636146020347, "grad_norm": 0.6634460553155971, "learning_rate": 5e-06, "loss": 0.6686, "step": 1040 }, { "epoch": 1.2567324955116697, "grad_norm": 0.6119696190494924, "learning_rate": 5e-06, "loss": 0.6632, "step": 1050 }, { "epoch": 1.2687013764213046, "grad_norm": 0.5680998374934575, "learning_rate": 5e-06, "loss": 0.6639, "step": 1060 }, { "epoch": 1.2806702573309394, "grad_norm": 0.7632716086431686, "learning_rate": 5e-06, "loss": 0.6707, "step": 1070 }, { "epoch": 1.2926391382405744, "grad_norm": 0.5773596573256996, "learning_rate": 5e-06, "loss": 0.667, "step": 1080 }, { "epoch": 1.3046080191502094, "grad_norm": 0.7249658131418688, "learning_rate": 5e-06, "loss": 0.6627, "step": 1090 }, { "epoch": 1.3165769000598444, "grad_norm": 0.6627381756172364, "learning_rate": 5e-06, "loss": 0.67, "step": 1100 }, { "epoch": 1.3285457809694794, "grad_norm": 0.5614593187441069, "learning_rate": 5e-06, "loss": 0.6647, "step": 1110 }, { "epoch": 1.3405146618791144, "grad_norm": 0.582853382816816, "learning_rate": 5e-06, "loss": 0.6669, "step": 1120 }, { "epoch": 1.3524835427887494, "grad_norm": 0.7100796913546954, "learning_rate": 5e-06, "loss": 0.6726, "step": 1130 }, { "epoch": 1.3644524236983842, "grad_norm": 0.6440445846779764, "learning_rate": 5e-06, "loss": 0.6645, "step": 1140 }, { "epoch": 1.3764213046080191, "grad_norm": 0.6414609210769443, "learning_rate": 5e-06, "loss": 0.6649, "step": 1150 }, { "epoch": 1.3883901855176541, "grad_norm": 0.6066442073205667, "learning_rate": 5e-06, "loss": 0.6671, "step": 1160 }, { "epoch": 1.400359066427289, "grad_norm": 0.5517122631957174, "learning_rate": 5e-06, "loss": 0.6676, "step": 1170 }, { "epoch": 1.412327947336924, "grad_norm": 0.6431261043778982, "learning_rate": 5e-06, "loss": 0.6639, "step": 1180 }, { "epoch": 1.424296828246559, "grad_norm": 0.7260721545398744, "learning_rate": 5e-06, "loss": 0.6701, "step": 1190 }, { "epoch": 1.436265709156194, "grad_norm": 0.7161457181189081, "learning_rate": 5e-06, "loss": 0.6559, "step": 1200 }, { "epoch": 1.4482345900658289, "grad_norm": 0.6352035083119371, "learning_rate": 5e-06, "loss": 0.6681, "step": 1210 }, { "epoch": 1.4602034709754639, "grad_norm": 0.7063524982241797, "learning_rate": 5e-06, "loss": 0.6654, "step": 1220 }, { "epoch": 1.4721723518850989, "grad_norm": 0.6879403956072693, "learning_rate": 5e-06, "loss": 0.6694, "step": 1230 }, { "epoch": 1.4841412327947336, "grad_norm": 0.6783332447999513, "learning_rate": 5e-06, "loss": 0.6666, "step": 1240 }, { "epoch": 1.4961101137043686, "grad_norm": 0.6478235600361606, "learning_rate": 5e-06, "loss": 0.6675, "step": 1250 }, { "epoch": 1.5080789946140036, "grad_norm": 0.5883450422091347, "learning_rate": 5e-06, "loss": 0.6645, "step": 1260 }, { "epoch": 1.5200478755236384, "grad_norm": 0.6660437650036387, "learning_rate": 5e-06, "loss": 0.6611, "step": 1270 }, { "epoch": 1.5320167564332734, "grad_norm": 0.5948419571007313, "learning_rate": 5e-06, "loss": 0.6661, "step": 1280 }, { "epoch": 1.5439856373429084, "grad_norm": 0.5853041784577795, "learning_rate": 5e-06, "loss": 0.6681, "step": 1290 }, { "epoch": 1.5559545182525434, "grad_norm": 0.7356480278739408, "learning_rate": 5e-06, "loss": 0.6678, "step": 1300 }, { "epoch": 1.5679233991621784, "grad_norm": 0.6949205522779035, "learning_rate": 5e-06, "loss": 0.6669, "step": 1310 }, { "epoch": 1.5798922800718134, "grad_norm": 0.6507079001414935, "learning_rate": 5e-06, "loss": 0.67, "step": 1320 }, { "epoch": 1.5918611609814484, "grad_norm": 0.6100230292150697, "learning_rate": 5e-06, "loss": 0.6653, "step": 1330 }, { "epoch": 1.6038300418910831, "grad_norm": 0.594443951679209, "learning_rate": 5e-06, "loss": 0.664, "step": 1340 }, { "epoch": 1.6157989228007181, "grad_norm": 0.6096985220862555, "learning_rate": 5e-06, "loss": 0.6647, "step": 1350 }, { "epoch": 1.6277678037103531, "grad_norm": 0.5760594804542581, "learning_rate": 5e-06, "loss": 0.6669, "step": 1360 }, { "epoch": 1.639736684619988, "grad_norm": 0.691475316154413, "learning_rate": 5e-06, "loss": 0.6659, "step": 1370 }, { "epoch": 1.6517055655296229, "grad_norm": 0.6527081133567483, "learning_rate": 5e-06, "loss": 0.6682, "step": 1380 }, { "epoch": 1.6636744464392579, "grad_norm": 0.620764034153086, "learning_rate": 5e-06, "loss": 0.6658, "step": 1390 }, { "epoch": 1.6756433273488929, "grad_norm": 0.5624863643686968, "learning_rate": 5e-06, "loss": 0.6629, "step": 1400 }, { "epoch": 1.6876122082585279, "grad_norm": 0.5866917073665039, "learning_rate": 5e-06, "loss": 0.662, "step": 1410 }, { "epoch": 1.6995810891681629, "grad_norm": 0.5725120938110386, "learning_rate": 5e-06, "loss": 0.6679, "step": 1420 }, { "epoch": 1.7115499700777979, "grad_norm": 0.6259302322367788, "learning_rate": 5e-06, "loss": 0.6681, "step": 1430 }, { "epoch": 1.7235188509874326, "grad_norm": 0.6855515829037336, "learning_rate": 5e-06, "loss": 0.6642, "step": 1440 }, { "epoch": 1.7354877318970676, "grad_norm": 0.586477759363614, "learning_rate": 5e-06, "loss": 0.6645, "step": 1450 }, { "epoch": 1.7474566128067026, "grad_norm": 0.7473190145452194, "learning_rate": 5e-06, "loss": 0.6657, "step": 1460 }, { "epoch": 1.7594254937163374, "grad_norm": 0.6856637572630547, "learning_rate": 5e-06, "loss": 0.6666, "step": 1470 }, { "epoch": 1.7713943746259724, "grad_norm": 0.6356257155539595, "learning_rate": 5e-06, "loss": 0.6632, "step": 1480 }, { "epoch": 1.7833632555356074, "grad_norm": 0.5780948416794784, "learning_rate": 5e-06, "loss": 0.6651, "step": 1490 }, { "epoch": 1.7953321364452424, "grad_norm": 0.6040049759436176, "learning_rate": 5e-06, "loss": 0.6687, "step": 1500 }, { "epoch": 1.8073010173548774, "grad_norm": 0.6116754536140059, "learning_rate": 5e-06, "loss": 0.6611, "step": 1510 }, { "epoch": 1.8192698982645124, "grad_norm": 0.6285933290005205, "learning_rate": 5e-06, "loss": 0.6685, "step": 1520 }, { "epoch": 1.8312387791741473, "grad_norm": 0.6069230952248156, "learning_rate": 5e-06, "loss": 0.6633, "step": 1530 }, { "epoch": 1.8432076600837821, "grad_norm": 0.5757493835177011, "learning_rate": 5e-06, "loss": 0.6595, "step": 1540 }, { "epoch": 1.8551765409934171, "grad_norm": 0.6335783323377564, "learning_rate": 5e-06, "loss": 0.6667, "step": 1550 }, { "epoch": 1.867145421903052, "grad_norm": 0.5676650286620293, "learning_rate": 5e-06, "loss": 0.662, "step": 1560 }, { "epoch": 1.8791143028126869, "grad_norm": 0.6035969519376175, "learning_rate": 5e-06, "loss": 0.6628, "step": 1570 }, { "epoch": 1.8910831837223219, "grad_norm": 0.5896338538296376, "learning_rate": 5e-06, "loss": 0.664, "step": 1580 }, { "epoch": 1.9030520646319569, "grad_norm": 0.6039070875737724, "learning_rate": 5e-06, "loss": 0.6658, "step": 1590 }, { "epoch": 1.9150209455415919, "grad_norm": 0.6567541390615723, "learning_rate": 5e-06, "loss": 0.6622, "step": 1600 }, { "epoch": 1.9269898264512269, "grad_norm": 0.6276807530688414, "learning_rate": 5e-06, "loss": 0.6604, "step": 1610 }, { "epoch": 1.9389587073608618, "grad_norm": 0.8336847726568148, "learning_rate": 5e-06, "loss": 0.6663, "step": 1620 }, { "epoch": 1.9509275882704968, "grad_norm": 0.7013759476620051, "learning_rate": 5e-06, "loss": 0.6664, "step": 1630 }, { "epoch": 1.9628964691801316, "grad_norm": 0.611731666630219, "learning_rate": 5e-06, "loss": 0.6663, "step": 1640 }, { "epoch": 1.9748653500897666, "grad_norm": 0.7128314798758998, "learning_rate": 5e-06, "loss": 0.6692, "step": 1650 }, { "epoch": 1.9868342309994016, "grad_norm": 0.6475123732829592, "learning_rate": 5e-06, "loss": 0.6627, "step": 1660 }, { "epoch": 1.9988031119090364, "grad_norm": 0.6022136617663713, "learning_rate": 5e-06, "loss": 0.6629, "step": 1670 }, { "epoch": 2.0, "eval_loss": 0.7005711197853088, "eval_runtime": 146.1272, "eval_samples_per_second": 154.071, "eval_steps_per_second": 0.602, "step": 1671 }, { "epoch": 2.0107719928186714, "grad_norm": 0.7068198157977486, "learning_rate": 5e-06, "loss": 0.6508, "step": 1680 }, { "epoch": 2.0227408737283064, "grad_norm": 0.7453932940551323, "learning_rate": 5e-06, "loss": 0.6156, "step": 1690 }, { "epoch": 2.0347097546379413, "grad_norm": 0.8134275271355035, "learning_rate": 5e-06, "loss": 0.6119, "step": 1700 }, { "epoch": 2.0466786355475763, "grad_norm": 0.7119278959971719, "learning_rate": 5e-06, "loss": 0.6121, "step": 1710 }, { "epoch": 2.0586475164572113, "grad_norm": 0.581844000106152, "learning_rate": 5e-06, "loss": 0.614, "step": 1720 }, { "epoch": 2.0706163973668463, "grad_norm": 0.6161063490938795, "learning_rate": 5e-06, "loss": 0.6179, "step": 1730 }, { "epoch": 2.0825852782764813, "grad_norm": 0.6987854084656979, "learning_rate": 5e-06, "loss": 0.6166, "step": 1740 }, { "epoch": 2.0945541591861163, "grad_norm": 0.6139695159601696, "learning_rate": 5e-06, "loss": 0.6181, "step": 1750 }, { "epoch": 2.106523040095751, "grad_norm": 0.6608056746575605, "learning_rate": 5e-06, "loss": 0.6169, "step": 1760 }, { "epoch": 2.118491921005386, "grad_norm": 0.8330792159736297, "learning_rate": 5e-06, "loss": 0.621, "step": 1770 }, { "epoch": 2.130460801915021, "grad_norm": 0.6496515220438018, "learning_rate": 5e-06, "loss": 0.6168, "step": 1780 }, { "epoch": 2.142429682824656, "grad_norm": 0.5764478857585879, "learning_rate": 5e-06, "loss": 0.6172, "step": 1790 }, { "epoch": 2.154398563734291, "grad_norm": 0.5983155038303072, "learning_rate": 5e-06, "loss": 0.6165, "step": 1800 }, { "epoch": 2.166367444643926, "grad_norm": 0.6670875041460964, "learning_rate": 5e-06, "loss": 0.6154, "step": 1810 }, { "epoch": 2.178336325553561, "grad_norm": 0.5905406904657257, "learning_rate": 5e-06, "loss": 0.6136, "step": 1820 }, { "epoch": 2.190305206463196, "grad_norm": 0.7480577496080604, "learning_rate": 5e-06, "loss": 0.6174, "step": 1830 }, { "epoch": 2.202274087372831, "grad_norm": 0.6481682954787796, "learning_rate": 5e-06, "loss": 0.6193, "step": 1840 }, { "epoch": 2.2142429682824654, "grad_norm": 0.697692521344505, "learning_rate": 5e-06, "loss": 0.6151, "step": 1850 }, { "epoch": 2.2262118491921004, "grad_norm": 0.6518002225384566, "learning_rate": 5e-06, "loss": 0.62, "step": 1860 }, { "epoch": 2.2381807301017353, "grad_norm": 0.7977413337144527, "learning_rate": 5e-06, "loss": 0.6174, "step": 1870 }, { "epoch": 2.2501496110113703, "grad_norm": 0.6920554891456565, "learning_rate": 5e-06, "loss": 0.6206, "step": 1880 }, { "epoch": 2.2621184919210053, "grad_norm": 1.0045237891952645, "learning_rate": 5e-06, "loss": 0.6174, "step": 1890 }, { "epoch": 2.2740873728306403, "grad_norm": 0.7405972846803555, "learning_rate": 5e-06, "loss": 0.6218, "step": 1900 }, { "epoch": 2.2860562537402753, "grad_norm": 0.7565616201062902, "learning_rate": 5e-06, "loss": 0.6212, "step": 1910 }, { "epoch": 2.2980251346499103, "grad_norm": 0.6602100152398376, "learning_rate": 5e-06, "loss": 0.6165, "step": 1920 }, { "epoch": 2.3099940155595453, "grad_norm": 0.6690244571651464, "learning_rate": 5e-06, "loss": 0.6213, "step": 1930 }, { "epoch": 2.3219628964691803, "grad_norm": 0.7205634813241224, "learning_rate": 5e-06, "loss": 0.6199, "step": 1940 }, { "epoch": 2.3339317773788153, "grad_norm": 0.6533808767235285, "learning_rate": 5e-06, "loss": 0.6197, "step": 1950 }, { "epoch": 2.34590065828845, "grad_norm": 0.6671459242816263, "learning_rate": 5e-06, "loss": 0.6248, "step": 1960 }, { "epoch": 2.357869539198085, "grad_norm": 0.5765039507445094, "learning_rate": 5e-06, "loss": 0.6198, "step": 1970 }, { "epoch": 2.36983842010772, "grad_norm": 0.6003833286471706, "learning_rate": 5e-06, "loss": 0.6211, "step": 1980 }, { "epoch": 2.381807301017355, "grad_norm": 0.6310887330444114, "learning_rate": 5e-06, "loss": 0.6181, "step": 1990 }, { "epoch": 2.39377618192699, "grad_norm": 0.6533436296134522, "learning_rate": 5e-06, "loss": 0.6223, "step": 2000 }, { "epoch": 2.405745062836625, "grad_norm": 0.7613938683728527, "learning_rate": 5e-06, "loss": 0.6209, "step": 2010 }, { "epoch": 2.41771394374626, "grad_norm": 0.7227768963538012, "learning_rate": 5e-06, "loss": 0.6203, "step": 2020 }, { "epoch": 2.429682824655895, "grad_norm": 0.8304498753841795, "learning_rate": 5e-06, "loss": 0.6154, "step": 2030 }, { "epoch": 2.44165170556553, "grad_norm": 0.7574321335984159, "learning_rate": 5e-06, "loss": 0.6221, "step": 2040 }, { "epoch": 2.4536205864751643, "grad_norm": 0.6154411257768034, "learning_rate": 5e-06, "loss": 0.6213, "step": 2050 }, { "epoch": 2.4655894673847993, "grad_norm": 0.6619336134055549, "learning_rate": 5e-06, "loss": 0.6229, "step": 2060 }, { "epoch": 2.4775583482944343, "grad_norm": 0.5882549911232717, "learning_rate": 5e-06, "loss": 0.62, "step": 2070 }, { "epoch": 2.4895272292040693, "grad_norm": 0.6446559545029906, "learning_rate": 5e-06, "loss": 0.6256, "step": 2080 }, { "epoch": 2.5014961101137043, "grad_norm": 0.6769098922036072, "learning_rate": 5e-06, "loss": 0.6233, "step": 2090 }, { "epoch": 2.5134649910233393, "grad_norm": 0.7292441062228914, "learning_rate": 5e-06, "loss": 0.6193, "step": 2100 }, { "epoch": 2.5254338719329743, "grad_norm": 0.8318160977446282, "learning_rate": 5e-06, "loss": 0.6187, "step": 2110 }, { "epoch": 2.5374027528426093, "grad_norm": 0.6089671262894073, "learning_rate": 5e-06, "loss": 0.624, "step": 2120 }, { "epoch": 2.5493716337522443, "grad_norm": 0.7615201829189324, "learning_rate": 5e-06, "loss": 0.6171, "step": 2130 }, { "epoch": 2.561340514661879, "grad_norm": 0.724596159991658, "learning_rate": 5e-06, "loss": 0.6217, "step": 2140 }, { "epoch": 2.5733093955715143, "grad_norm": 0.612004698756559, "learning_rate": 5e-06, "loss": 0.6226, "step": 2150 }, { "epoch": 2.585278276481149, "grad_norm": 0.596628658148184, "learning_rate": 5e-06, "loss": 0.6271, "step": 2160 }, { "epoch": 2.597247157390784, "grad_norm": 0.6069344442414866, "learning_rate": 5e-06, "loss": 0.6224, "step": 2170 }, { "epoch": 2.609216038300419, "grad_norm": 0.6286980131185356, "learning_rate": 5e-06, "loss": 0.6212, "step": 2180 }, { "epoch": 2.621184919210054, "grad_norm": 0.6326929087930854, "learning_rate": 5e-06, "loss": 0.6235, "step": 2190 }, { "epoch": 2.633153800119689, "grad_norm": 0.7985721977529328, "learning_rate": 5e-06, "loss": 0.6237, "step": 2200 }, { "epoch": 2.645122681029324, "grad_norm": 0.644881510300382, "learning_rate": 5e-06, "loss": 0.6269, "step": 2210 }, { "epoch": 2.657091561938959, "grad_norm": 0.6012955887624781, "learning_rate": 5e-06, "loss": 0.6213, "step": 2220 }, { "epoch": 2.669060442848594, "grad_norm": 0.7267444291377996, "learning_rate": 5e-06, "loss": 0.6261, "step": 2230 }, { "epoch": 2.6810293237582288, "grad_norm": 0.7195410501187053, "learning_rate": 5e-06, "loss": 0.6238, "step": 2240 }, { "epoch": 2.6929982046678633, "grad_norm": 0.6633140798395725, "learning_rate": 5e-06, "loss": 0.6224, "step": 2250 }, { "epoch": 2.7049670855774988, "grad_norm": 0.6423861744102355, "learning_rate": 5e-06, "loss": 0.6245, "step": 2260 }, { "epoch": 2.7169359664871333, "grad_norm": 0.6091203181297667, "learning_rate": 5e-06, "loss": 0.626, "step": 2270 }, { "epoch": 2.7289048473967683, "grad_norm": 0.5794538334059404, "learning_rate": 5e-06, "loss": 0.6213, "step": 2280 }, { "epoch": 2.7408737283064033, "grad_norm": 0.6730274420363707, "learning_rate": 5e-06, "loss": 0.6257, "step": 2290 }, { "epoch": 2.7528426092160383, "grad_norm": 0.7260668423987153, "learning_rate": 5e-06, "loss": 0.6209, "step": 2300 }, { "epoch": 2.7648114901256733, "grad_norm": 0.6505297721856697, "learning_rate": 5e-06, "loss": 0.627, "step": 2310 }, { "epoch": 2.7767803710353083, "grad_norm": 0.6701846540950109, "learning_rate": 5e-06, "loss": 0.626, "step": 2320 }, { "epoch": 2.7887492519449433, "grad_norm": 0.6651727549955714, "learning_rate": 5e-06, "loss": 0.6284, "step": 2330 }, { "epoch": 2.800718132854578, "grad_norm": 0.5926739432014144, "learning_rate": 5e-06, "loss": 0.6277, "step": 2340 }, { "epoch": 2.8126870137642133, "grad_norm": 0.5816608379432976, "learning_rate": 5e-06, "loss": 0.6214, "step": 2350 }, { "epoch": 2.824655894673848, "grad_norm": 0.6395212133787649, "learning_rate": 5e-06, "loss": 0.6176, "step": 2360 }, { "epoch": 2.836624775583483, "grad_norm": 0.6634354922942822, "learning_rate": 5e-06, "loss": 0.6228, "step": 2370 }, { "epoch": 2.848593656493118, "grad_norm": 0.608805000460622, "learning_rate": 5e-06, "loss": 0.6236, "step": 2380 }, { "epoch": 2.860562537402753, "grad_norm": 0.6392304210790648, "learning_rate": 5e-06, "loss": 0.6247, "step": 2390 }, { "epoch": 2.872531418312388, "grad_norm": 0.6074342205287091, "learning_rate": 5e-06, "loss": 0.6227, "step": 2400 }, { "epoch": 2.884500299222023, "grad_norm": 0.5852140611153283, "learning_rate": 5e-06, "loss": 0.625, "step": 2410 }, { "epoch": 2.8964691801316578, "grad_norm": 0.7091297497420493, "learning_rate": 5e-06, "loss": 0.6195, "step": 2420 }, { "epoch": 2.9084380610412928, "grad_norm": 0.5525054193264387, "learning_rate": 5e-06, "loss": 0.6234, "step": 2430 }, { "epoch": 2.9204069419509278, "grad_norm": 0.7138183691173519, "learning_rate": 5e-06, "loss": 0.6247, "step": 2440 }, { "epoch": 2.9323758228605623, "grad_norm": 0.5805881225718816, "learning_rate": 5e-06, "loss": 0.623, "step": 2450 }, { "epoch": 2.9443447037701977, "grad_norm": 0.6015585624593602, "learning_rate": 5e-06, "loss": 0.6258, "step": 2460 }, { "epoch": 2.9563135846798323, "grad_norm": 0.5689711805490192, "learning_rate": 5e-06, "loss": 0.6196, "step": 2470 }, { "epoch": 2.9682824655894673, "grad_norm": 0.5963313497836371, "learning_rate": 5e-06, "loss": 0.6265, "step": 2480 }, { "epoch": 2.9802513464991023, "grad_norm": 0.6519931383722206, "learning_rate": 5e-06, "loss": 0.6217, "step": 2490 }, { "epoch": 2.9922202274087373, "grad_norm": 0.5587383883911408, "learning_rate": 5e-06, "loss": 0.6193, "step": 2500 }, { "epoch": 2.998204667863555, "eval_loss": 0.7030414938926697, "eval_runtime": 143.5127, "eval_samples_per_second": 156.878, "eval_steps_per_second": 0.613, "step": 2505 }, { "epoch": 2.998204667863555, "step": 2505, "total_flos": 4195130781204480.0, "train_loss": 0.6780577095206864, "train_runtime": 21666.0023, "train_samples_per_second": 59.229, "train_steps_per_second": 0.116 } ], "logging_steps": 10, "max_steps": 2505, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4195130781204480.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }