|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0002243326104838, |
|
"eval_steps": 300, |
|
"global_step": 3344, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00029911014731174755, |
|
"eval_loss": 2.900763511657715, |
|
"eval_runtime": 20.57, |
|
"eval_samples_per_second": 21.39, |
|
"eval_steps_per_second": 21.39, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.007477753682793689, |
|
"grad_norm": 9.5, |
|
"learning_rate": 5e-06, |
|
"loss": 2.3534, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.014955507365587378, |
|
"grad_norm": 9.0, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.022433261048381066, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 1.5749, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.029911014731174756, |
|
"grad_norm": 7.0625, |
|
"learning_rate": 2e-05, |
|
"loss": 1.3312, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.037388768413968446, |
|
"grad_norm": 6.84375, |
|
"learning_rate": 1.9999824904190002e-05, |
|
"loss": 1.2434, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.04486652209676213, |
|
"grad_norm": 6.375, |
|
"learning_rate": 1.999929962289171e-05, |
|
"loss": 1.1479, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.05234427577955582, |
|
"grad_norm": 6.625, |
|
"learning_rate": 1.9998424174500043e-05, |
|
"loss": 1.103, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.05982202946234951, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 1.9997198589672462e-05, |
|
"loss": 1.0688, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0672997831451432, |
|
"grad_norm": 6.34375, |
|
"learning_rate": 1.9995622911327924e-05, |
|
"loss": 1.0343, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.07477753682793689, |
|
"grad_norm": 6.0, |
|
"learning_rate": 1.9993697194645362e-05, |
|
"loss": 0.9735, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.08225529051073058, |
|
"grad_norm": 6.40625, |
|
"learning_rate": 1.9991421507061763e-05, |
|
"loss": 0.975, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.08973304419352426, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 1.99887959282698e-05, |
|
"loss": 0.9508, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.08973304419352426, |
|
"eval_loss": 1.0376813411712646, |
|
"eval_runtime": 19.5994, |
|
"eval_samples_per_second": 22.45, |
|
"eval_steps_per_second": 22.45, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.09721079787631795, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.9985820550215044e-05, |
|
"loss": 0.9618, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.10468855155911164, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.998249547709273e-05, |
|
"loss": 0.9224, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.11216630524190534, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.9978820825344147e-05, |
|
"loss": 0.8982, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.11964405892469902, |
|
"grad_norm": 5.125, |
|
"learning_rate": 1.9974796723652508e-05, |
|
"loss": 0.9116, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1271218126074927, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 1.9970423312938488e-05, |
|
"loss": 0.8981, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.1345995662902864, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.996570074635527e-05, |
|
"loss": 0.8831, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.1420773199730801, |
|
"grad_norm": 5.125, |
|
"learning_rate": 1.9960629189283165e-05, |
|
"loss": 0.8393, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.14955507365587378, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.9955208819323864e-05, |
|
"loss": 0.851, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.15703282733866747, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 1.9949439826294178e-05, |
|
"loss": 0.8164, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.16451058102146116, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 1.9943322412219398e-05, |
|
"loss": 0.8388, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.17198833470425484, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 1.9936856791326255e-05, |
|
"loss": 0.8275, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.17946608838704853, |
|
"grad_norm": 5.125, |
|
"learning_rate": 1.9930043190035364e-05, |
|
"loss": 0.8071, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.17946608838704853, |
|
"eval_loss": 0.913748562335968, |
|
"eval_runtime": 20.7611, |
|
"eval_samples_per_second": 21.193, |
|
"eval_steps_per_second": 21.193, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.18694384206984221, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.9922881846953333e-05, |
|
"loss": 0.8231, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.1944215957526359, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.99153730128644e-05, |
|
"loss": 0.8186, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.2018993494354296, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 1.9907516950721638e-05, |
|
"loss": 0.7854, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.20937710311822327, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 1.9899313935637764e-05, |
|
"loss": 0.7851, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.216854856801017, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 1.9890764254875488e-05, |
|
"loss": 0.7724, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.22433261048381067, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.9881868207837466e-05, |
|
"loss": 0.7909, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.23181036416660436, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.987262610605581e-05, |
|
"loss": 0.7608, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.23928811784939805, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.9863038273181187e-05, |
|
"loss": 0.765, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.24676587153219173, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.985310504497146e-05, |
|
"loss": 0.7695, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.2542436252149854, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 1.9842826769279965e-05, |
|
"loss": 0.7542, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.26172137889777913, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.9832203806043296e-05, |
|
"loss": 0.7329, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.2691991325805728, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 1.9821236527268727e-05, |
|
"loss": 0.7359, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2691991325805728, |
|
"eval_loss": 0.8687529563903809, |
|
"eval_runtime": 19.5776, |
|
"eval_samples_per_second": 22.475, |
|
"eval_steps_per_second": 22.475, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2766768862633665, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.980992531702117e-05, |
|
"loss": 0.731, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.2841546399461602, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.979827057140973e-05, |
|
"loss": 0.7352, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.2916323936289539, |
|
"grad_norm": 4.875, |
|
"learning_rate": 1.978627269857383e-05, |
|
"loss": 0.72, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.29911014731174757, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.9773932118668924e-05, |
|
"loss": 0.7262, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.30658790099454125, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 1.9761249263851777e-05, |
|
"loss": 0.7234, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.31406565467733494, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 1.9748224578265338e-05, |
|
"loss": 0.7164, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.3215434083601286, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.973485851802318e-05, |
|
"loss": 0.7072, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.3290211620429223, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 1.9721151551193534e-05, |
|
"loss": 0.7091, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.336498915725716, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.970710415778289e-05, |
|
"loss": 0.715, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.3439766694085097, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.9692716829719197e-05, |
|
"loss": 0.691, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.35145442309130337, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.967799007083462e-05, |
|
"loss": 0.7097, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.35893217677409706, |
|
"grad_norm": 3.96875, |
|
"learning_rate": 1.9662924396847923e-05, |
|
"loss": 0.7124, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.35893217677409706, |
|
"eval_loss": 0.8409842848777771, |
|
"eval_runtime": 20.5718, |
|
"eval_samples_per_second": 21.389, |
|
"eval_steps_per_second": 21.389, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.36640993045689074, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.9647520335346377e-05, |
|
"loss": 0.6843, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.37388768413968443, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 1.963177842576731e-05, |
|
"loss": 0.6928, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.3813654378224781, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 1.961569921937921e-05, |
|
"loss": 0.6937, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.3888431915052718, |
|
"grad_norm": 4.0, |
|
"learning_rate": 1.9599283279262393e-05, |
|
"loss": 0.6796, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.3963209451880655, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 1.9582531180289342e-05, |
|
"loss": 0.6784, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.4037986988708592, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.956544350910452e-05, |
|
"loss": 0.6916, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.41127645255365286, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.954802086410385e-05, |
|
"loss": 0.6677, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.41875420623644655, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.9530263855413763e-05, |
|
"loss": 0.6672, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.42623195991924023, |
|
"grad_norm": 3.96875, |
|
"learning_rate": 1.951217310486982e-05, |
|
"loss": 0.6689, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.433709713602034, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 1.9493749245994946e-05, |
|
"loss": 0.6642, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.44118746728482766, |
|
"grad_norm": 3.953125, |
|
"learning_rate": 1.947499292397724e-05, |
|
"loss": 0.652, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.44866522096762135, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 1.945590479564738e-05, |
|
"loss": 0.648, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.44866522096762135, |
|
"eval_loss": 0.8143633008003235, |
|
"eval_runtime": 20.3858, |
|
"eval_samples_per_second": 21.584, |
|
"eval_steps_per_second": 21.584, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.45614297465041503, |
|
"grad_norm": 4.125, |
|
"learning_rate": 1.9436485529455628e-05, |
|
"loss": 0.6777, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.4636207283332087, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.941673580544841e-05, |
|
"loss": 0.6505, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.4710984820160024, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 1.9396656315244507e-05, |
|
"loss": 0.6435, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.4785762356987961, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 1.9376247762010844e-05, |
|
"loss": 0.6271, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.4860539893815898, |
|
"grad_norm": 3.953125, |
|
"learning_rate": 1.9355510860437852e-05, |
|
"loss": 0.6428, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.49353174306438347, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 1.9334446336714446e-05, |
|
"loss": 0.6559, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.5010094967471771, |
|
"grad_norm": 3.96875, |
|
"learning_rate": 1.9313054928502596e-05, |
|
"loss": 0.6709, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.5084872504299708, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 1.929133738491149e-05, |
|
"loss": 0.6513, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.5159650041127646, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 1.9269294466471306e-05, |
|
"loss": 0.6478, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.5234427577955583, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 1.9246926945106574e-05, |
|
"loss": 0.6424, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.530920511478352, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.9224235604109153e-05, |
|
"loss": 0.6362, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.5383982651611456, |
|
"grad_norm": 3.9375, |
|
"learning_rate": 1.9201221238110783e-05, |
|
"loss": 0.6278, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5383982651611456, |
|
"eval_loss": 0.785617470741272, |
|
"eval_runtime": 20.5638, |
|
"eval_samples_per_second": 21.397, |
|
"eval_steps_per_second": 21.397, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5458760188439393, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 1.917788465305528e-05, |
|
"loss": 0.6042, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.553353772526733, |
|
"grad_norm": 4.0, |
|
"learning_rate": 1.9154226666170296e-05, |
|
"loss": 0.6188, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.5608315262095267, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 1.9130248105938705e-05, |
|
"loss": 0.6206, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.5683092798923204, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 1.9105949812069592e-05, |
|
"loss": 0.644, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.5757870335751141, |
|
"grad_norm": 3.890625, |
|
"learning_rate": 1.9081332635468844e-05, |
|
"loss": 0.6196, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.5832647872579078, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.9056397438209366e-05, |
|
"loss": 0.6196, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.5907425409407014, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 1.9031145093500855e-05, |
|
"loss": 0.6264, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.5982202946234951, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 1.9005576485659274e-05, |
|
"loss": 0.5989, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.6056980483062888, |
|
"grad_norm": 3.796875, |
|
"learning_rate": 1.897969251007584e-05, |
|
"loss": 0.6121, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.6131758019890825, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 1.8953494073185684e-05, |
|
"loss": 0.6285, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.6206535556718762, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 1.8926982092436117e-05, |
|
"loss": 0.6158, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.6281313093546699, |
|
"grad_norm": 3.375, |
|
"learning_rate": 1.890015749625448e-05, |
|
"loss": 0.6174, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.6281313093546699, |
|
"eval_loss": 0.7781485915184021, |
|
"eval_runtime": 21.1506, |
|
"eval_samples_per_second": 20.803, |
|
"eval_steps_per_second": 20.803, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.6356090630374636, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 1.8873021224015662e-05, |
|
"loss": 0.5945, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.6430868167202572, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 1.884557422600917e-05, |
|
"loss": 0.6124, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.6505645704030509, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 1.8817817463405872e-05, |
|
"loss": 0.6007, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.6580423240858446, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 1.878975190822434e-05, |
|
"loss": 0.6045, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.6655200777686383, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 1.8761378543296795e-05, |
|
"loss": 0.5837, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.672997831451432, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.8732698362234696e-05, |
|
"loss": 0.5839, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.6804755851342257, |
|
"grad_norm": 3.875, |
|
"learning_rate": 1.8703712369393953e-05, |
|
"loss": 0.5932, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.6879533388170194, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 1.867442157983975e-05, |
|
"loss": 0.5795, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.695431092499813, |
|
"grad_norm": 4.125, |
|
"learning_rate": 1.8644827019310984e-05, |
|
"loss": 0.5887, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.7029088461826067, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 1.861492972418437e-05, |
|
"loss": 0.6041, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.7103865998654004, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 1.8584730741438128e-05, |
|
"loss": 0.5676, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.7178643535481941, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 1.855423112861532e-05, |
|
"loss": 0.5752, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.7178643535481941, |
|
"eval_loss": 0.7577213644981384, |
|
"eval_runtime": 19.9112, |
|
"eval_samples_per_second": 22.098, |
|
"eval_steps_per_second": 22.098, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.7253421072309878, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 1.8523431953786838e-05, |
|
"loss": 0.5731, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.7328198609137815, |
|
"grad_norm": 3.890625, |
|
"learning_rate": 1.8492334295513968e-05, |
|
"loss": 0.5611, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.7402976145965752, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 1.846093924281065e-05, |
|
"loss": 0.5787, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.7477753682793689, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 1.8429247895105314e-05, |
|
"loss": 0.5611, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.7552531219621625, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 1.8397261362202402e-05, |
|
"loss": 0.5805, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.7627308756449562, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 1.836498076424349e-05, |
|
"loss": 0.5648, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.7702086293277499, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 1.833240723166807e-05, |
|
"loss": 0.5809, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.7776863830105436, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 1.8299541905173955e-05, |
|
"loss": 0.5789, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.7851641366933373, |
|
"grad_norm": 3.75, |
|
"learning_rate": 1.8266385935677338e-05, |
|
"loss": 0.5672, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.792641890376131, |
|
"grad_norm": 3.796875, |
|
"learning_rate": 1.8232940484272482e-05, |
|
"loss": 0.5657, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.8001196440589247, |
|
"grad_norm": 3.875, |
|
"learning_rate": 1.819920672219108e-05, |
|
"loss": 0.5514, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.8075973977417183, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 1.8165185830761193e-05, |
|
"loss": 0.5625, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.8075973977417183, |
|
"eval_loss": 0.736329972743988, |
|
"eval_runtime": 19.4873, |
|
"eval_samples_per_second": 22.579, |
|
"eval_steps_per_second": 22.579, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.815075151424512, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 1.8130879001365944e-05, |
|
"loss": 0.5391, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.8225529051073057, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 1.8096287435401744e-05, |
|
"loss": 0.5438, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.8300306587900994, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 1.8061412344236245e-05, |
|
"loss": 0.5504, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.8375084124728931, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 1.8026254949165915e-05, |
|
"loss": 0.5569, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.8449861661556868, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 1.7990816481373267e-05, |
|
"loss": 0.5397, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.8524639198384805, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 1.795509818188375e-05, |
|
"loss": 0.5519, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.8599416735212743, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 1.791910130152227e-05, |
|
"loss": 0.539, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.867419427204068, |
|
"grad_norm": 4.125, |
|
"learning_rate": 1.788282710086942e-05, |
|
"loss": 0.5517, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.8748971808868616, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 1.784627685021731e-05, |
|
"loss": 0.5283, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.8823749345696553, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 1.7809451829525083e-05, |
|
"loss": 0.5439, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.889852688252449, |
|
"grad_norm": 3.75, |
|
"learning_rate": 1.777235332837411e-05, |
|
"loss": 0.5311, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.8973304419352427, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 1.773498264592281e-05, |
|
"loss": 0.5298, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.8973304419352427, |
|
"eval_loss": 0.7269648313522339, |
|
"eval_runtime": 20.5732, |
|
"eval_samples_per_second": 21.387, |
|
"eval_steps_per_second": 21.387, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.9048081956180364, |
|
"grad_norm": 4.125, |
|
"learning_rate": 1.7697341090861163e-05, |
|
"loss": 0.5296, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.9122859493008301, |
|
"grad_norm": 3.375, |
|
"learning_rate": 1.7659429981364887e-05, |
|
"loss": 0.5318, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.9197637029836238, |
|
"grad_norm": 4.125, |
|
"learning_rate": 1.7621250645049267e-05, |
|
"loss": 0.5406, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.9272414566664174, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 1.7582804418922666e-05, |
|
"loss": 0.5264, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.9347192103492111, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.7544092649339704e-05, |
|
"loss": 0.5119, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.9421969640320048, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 1.7505116691954117e-05, |
|
"loss": 0.5283, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.9496747177147985, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 1.746587791167126e-05, |
|
"loss": 0.5297, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.9571524713975922, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 1.7426377682600345e-05, |
|
"loss": 0.52, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.9646302250803859, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 1.738661738800629e-05, |
|
"loss": 0.515, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.9721079787631796, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 1.7346598420261294e-05, |
|
"loss": 0.5171, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.9795857324459732, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 1.730632218079607e-05, |
|
"loss": 0.5142, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 0.9870634861287669, |
|
"grad_norm": 3.375, |
|
"learning_rate": 1.7265790080050772e-05, |
|
"loss": 0.5355, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.9870634861287669, |
|
"eval_loss": 0.706967294216156, |
|
"eval_runtime": 20.875, |
|
"eval_samples_per_second": 21.078, |
|
"eval_steps_per_second": 21.078, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.9945412398115606, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.7225003537425603e-05, |
|
"loss": 0.5135, |
|
"step": 3325 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 13372, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 1672, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.2890860801163264e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|