llama3.1-8b-translate-ro-rup-en / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0002243326104838,
"eval_steps": 300,
"global_step": 3344,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00029911014731174755,
"eval_loss": 2.900763511657715,
"eval_runtime": 20.57,
"eval_samples_per_second": 21.39,
"eval_steps_per_second": 21.39,
"step": 1
},
{
"epoch": 0.007477753682793689,
"grad_norm": 9.5,
"learning_rate": 5e-06,
"loss": 2.3534,
"step": 25
},
{
"epoch": 0.014955507365587378,
"grad_norm": 9.0,
"learning_rate": 1e-05,
"loss": 1.8,
"step": 50
},
{
"epoch": 0.022433261048381066,
"grad_norm": 8.1875,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.5749,
"step": 75
},
{
"epoch": 0.029911014731174756,
"grad_norm": 7.0625,
"learning_rate": 2e-05,
"loss": 1.3312,
"step": 100
},
{
"epoch": 0.037388768413968446,
"grad_norm": 6.84375,
"learning_rate": 1.9999824904190002e-05,
"loss": 1.2434,
"step": 125
},
{
"epoch": 0.04486652209676213,
"grad_norm": 6.375,
"learning_rate": 1.999929962289171e-05,
"loss": 1.1479,
"step": 150
},
{
"epoch": 0.05234427577955582,
"grad_norm": 6.625,
"learning_rate": 1.9998424174500043e-05,
"loss": 1.103,
"step": 175
},
{
"epoch": 0.05982202946234951,
"grad_norm": 6.96875,
"learning_rate": 1.9997198589672462e-05,
"loss": 1.0688,
"step": 200
},
{
"epoch": 0.0672997831451432,
"grad_norm": 6.34375,
"learning_rate": 1.9995622911327924e-05,
"loss": 1.0343,
"step": 225
},
{
"epoch": 0.07477753682793689,
"grad_norm": 6.0,
"learning_rate": 1.9993697194645362e-05,
"loss": 0.9735,
"step": 250
},
{
"epoch": 0.08225529051073058,
"grad_norm": 6.40625,
"learning_rate": 1.9991421507061763e-05,
"loss": 0.975,
"step": 275
},
{
"epoch": 0.08973304419352426,
"grad_norm": 6.03125,
"learning_rate": 1.99887959282698e-05,
"loss": 0.9508,
"step": 300
},
{
"epoch": 0.08973304419352426,
"eval_loss": 1.0376813411712646,
"eval_runtime": 19.5994,
"eval_samples_per_second": 22.45,
"eval_steps_per_second": 22.45,
"step": 300
},
{
"epoch": 0.09721079787631795,
"grad_norm": 5.3125,
"learning_rate": 1.9985820550215044e-05,
"loss": 0.9618,
"step": 325
},
{
"epoch": 0.10468855155911164,
"grad_norm": 5.3125,
"learning_rate": 1.998249547709273e-05,
"loss": 0.9224,
"step": 350
},
{
"epoch": 0.11216630524190534,
"grad_norm": 5.3125,
"learning_rate": 1.9978820825344147e-05,
"loss": 0.8982,
"step": 375
},
{
"epoch": 0.11964405892469902,
"grad_norm": 5.125,
"learning_rate": 1.9974796723652508e-05,
"loss": 0.9116,
"step": 400
},
{
"epoch": 0.1271218126074927,
"grad_norm": 5.5625,
"learning_rate": 1.9970423312938488e-05,
"loss": 0.8981,
"step": 425
},
{
"epoch": 0.1345995662902864,
"grad_norm": 5.21875,
"learning_rate": 1.996570074635527e-05,
"loss": 0.8831,
"step": 450
},
{
"epoch": 0.1420773199730801,
"grad_norm": 5.125,
"learning_rate": 1.9960629189283165e-05,
"loss": 0.8393,
"step": 475
},
{
"epoch": 0.14955507365587378,
"grad_norm": 5.21875,
"learning_rate": 1.9955208819323864e-05,
"loss": 0.851,
"step": 500
},
{
"epoch": 0.15703282733866747,
"grad_norm": 4.9375,
"learning_rate": 1.9949439826294178e-05,
"loss": 0.8164,
"step": 525
},
{
"epoch": 0.16451058102146116,
"grad_norm": 5.28125,
"learning_rate": 1.9943322412219398e-05,
"loss": 0.8388,
"step": 550
},
{
"epoch": 0.17198833470425484,
"grad_norm": 4.90625,
"learning_rate": 1.9936856791326255e-05,
"loss": 0.8275,
"step": 575
},
{
"epoch": 0.17946608838704853,
"grad_norm": 5.125,
"learning_rate": 1.9930043190035364e-05,
"loss": 0.8071,
"step": 600
},
{
"epoch": 0.17946608838704853,
"eval_loss": 0.913748562335968,
"eval_runtime": 20.7611,
"eval_samples_per_second": 21.193,
"eval_steps_per_second": 21.193,
"step": 600
},
{
"epoch": 0.18694384206984221,
"grad_norm": 4.625,
"learning_rate": 1.9922881846953333e-05,
"loss": 0.8231,
"step": 625
},
{
"epoch": 0.1944215957526359,
"grad_norm": 4.96875,
"learning_rate": 1.99153730128644e-05,
"loss": 0.8186,
"step": 650
},
{
"epoch": 0.2018993494354296,
"grad_norm": 4.90625,
"learning_rate": 1.9907516950721638e-05,
"loss": 0.7854,
"step": 675
},
{
"epoch": 0.20937710311822327,
"grad_norm": 4.65625,
"learning_rate": 1.9899313935637764e-05,
"loss": 0.7851,
"step": 700
},
{
"epoch": 0.216854856801017,
"grad_norm": 4.4375,
"learning_rate": 1.9890764254875488e-05,
"loss": 0.7724,
"step": 725
},
{
"epoch": 0.22433261048381067,
"grad_norm": 4.5,
"learning_rate": 1.9881868207837466e-05,
"loss": 0.7909,
"step": 750
},
{
"epoch": 0.23181036416660436,
"grad_norm": 4.5625,
"learning_rate": 1.987262610605581e-05,
"loss": 0.7608,
"step": 775
},
{
"epoch": 0.23928811784939805,
"grad_norm": 4.78125,
"learning_rate": 1.9863038273181187e-05,
"loss": 0.765,
"step": 800
},
{
"epoch": 0.24676587153219173,
"grad_norm": 4.6875,
"learning_rate": 1.985310504497146e-05,
"loss": 0.7695,
"step": 825
},
{
"epoch": 0.2542436252149854,
"grad_norm": 4.3125,
"learning_rate": 1.9842826769279965e-05,
"loss": 0.7542,
"step": 850
},
{
"epoch": 0.26172137889777913,
"grad_norm": 4.53125,
"learning_rate": 1.9832203806043296e-05,
"loss": 0.7329,
"step": 875
},
{
"epoch": 0.2691991325805728,
"grad_norm": 4.4375,
"learning_rate": 1.9821236527268727e-05,
"loss": 0.7359,
"step": 900
},
{
"epoch": 0.2691991325805728,
"eval_loss": 0.8687529563903809,
"eval_runtime": 19.5776,
"eval_samples_per_second": 22.475,
"eval_steps_per_second": 22.475,
"step": 900
},
{
"epoch": 0.2766768862633665,
"grad_norm": 4.375,
"learning_rate": 1.980992531702117e-05,
"loss": 0.731,
"step": 925
},
{
"epoch": 0.2841546399461602,
"grad_norm": 4.34375,
"learning_rate": 1.979827057140973e-05,
"loss": 0.7352,
"step": 950
},
{
"epoch": 0.2916323936289539,
"grad_norm": 4.875,
"learning_rate": 1.978627269857383e-05,
"loss": 0.72,
"step": 975
},
{
"epoch": 0.29911014731174757,
"grad_norm": 4.375,
"learning_rate": 1.9773932118668924e-05,
"loss": 0.7262,
"step": 1000
},
{
"epoch": 0.30658790099454125,
"grad_norm": 4.40625,
"learning_rate": 1.9761249263851777e-05,
"loss": 0.7234,
"step": 1025
},
{
"epoch": 0.31406565467733494,
"grad_norm": 4.40625,
"learning_rate": 1.9748224578265338e-05,
"loss": 0.7164,
"step": 1050
},
{
"epoch": 0.3215434083601286,
"grad_norm": 5.1875,
"learning_rate": 1.973485851802318e-05,
"loss": 0.7072,
"step": 1075
},
{
"epoch": 0.3290211620429223,
"grad_norm": 4.9375,
"learning_rate": 1.9721151551193534e-05,
"loss": 0.7091,
"step": 1100
},
{
"epoch": 0.336498915725716,
"grad_norm": 4.25,
"learning_rate": 1.970710415778289e-05,
"loss": 0.715,
"step": 1125
},
{
"epoch": 0.3439766694085097,
"grad_norm": 4.59375,
"learning_rate": 1.9692716829719197e-05,
"loss": 0.691,
"step": 1150
},
{
"epoch": 0.35145442309130337,
"grad_norm": 4.5625,
"learning_rate": 1.967799007083462e-05,
"loss": 0.7097,
"step": 1175
},
{
"epoch": 0.35893217677409706,
"grad_norm": 3.96875,
"learning_rate": 1.9662924396847923e-05,
"loss": 0.7124,
"step": 1200
},
{
"epoch": 0.35893217677409706,
"eval_loss": 0.8409842848777771,
"eval_runtime": 20.5718,
"eval_samples_per_second": 21.389,
"eval_steps_per_second": 21.389,
"step": 1200
},
{
"epoch": 0.36640993045689074,
"grad_norm": 4.25,
"learning_rate": 1.9647520335346377e-05,
"loss": 0.6843,
"step": 1225
},
{
"epoch": 0.37388768413968443,
"grad_norm": 4.03125,
"learning_rate": 1.963177842576731e-05,
"loss": 0.6928,
"step": 1250
},
{
"epoch": 0.3813654378224781,
"grad_norm": 4.15625,
"learning_rate": 1.961569921937921e-05,
"loss": 0.6937,
"step": 1275
},
{
"epoch": 0.3888431915052718,
"grad_norm": 4.0,
"learning_rate": 1.9599283279262393e-05,
"loss": 0.6796,
"step": 1300
},
{
"epoch": 0.3963209451880655,
"grad_norm": 4.21875,
"learning_rate": 1.9582531180289342e-05,
"loss": 0.6784,
"step": 1325
},
{
"epoch": 0.4037986988708592,
"grad_norm": 4.34375,
"learning_rate": 1.956544350910452e-05,
"loss": 0.6916,
"step": 1350
},
{
"epoch": 0.41127645255365286,
"grad_norm": 4.25,
"learning_rate": 1.954802086410385e-05,
"loss": 0.6677,
"step": 1375
},
{
"epoch": 0.41875420623644655,
"grad_norm": 4.5,
"learning_rate": 1.9530263855413763e-05,
"loss": 0.6672,
"step": 1400
},
{
"epoch": 0.42623195991924023,
"grad_norm": 3.96875,
"learning_rate": 1.951217310486982e-05,
"loss": 0.6689,
"step": 1425
},
{
"epoch": 0.433709713602034,
"grad_norm": 4.15625,
"learning_rate": 1.9493749245994946e-05,
"loss": 0.6642,
"step": 1450
},
{
"epoch": 0.44118746728482766,
"grad_norm": 3.953125,
"learning_rate": 1.947499292397724e-05,
"loss": 0.652,
"step": 1475
},
{
"epoch": 0.44866522096762135,
"grad_norm": 3.84375,
"learning_rate": 1.945590479564738e-05,
"loss": 0.648,
"step": 1500
},
{
"epoch": 0.44866522096762135,
"eval_loss": 0.8143633008003235,
"eval_runtime": 20.3858,
"eval_samples_per_second": 21.584,
"eval_steps_per_second": 21.584,
"step": 1500
},
{
"epoch": 0.45614297465041503,
"grad_norm": 4.125,
"learning_rate": 1.9436485529455628e-05,
"loss": 0.6777,
"step": 1525
},
{
"epoch": 0.4636207283332087,
"grad_norm": 4.375,
"learning_rate": 1.941673580544841e-05,
"loss": 0.6505,
"step": 1550
},
{
"epoch": 0.4710984820160024,
"grad_norm": 4.09375,
"learning_rate": 1.9396656315244507e-05,
"loss": 0.6435,
"step": 1575
},
{
"epoch": 0.4785762356987961,
"grad_norm": 3.734375,
"learning_rate": 1.9376247762010844e-05,
"loss": 0.6271,
"step": 1600
},
{
"epoch": 0.4860539893815898,
"grad_norm": 3.953125,
"learning_rate": 1.9355510860437852e-05,
"loss": 0.6428,
"step": 1625
},
{
"epoch": 0.49353174306438347,
"grad_norm": 3.984375,
"learning_rate": 1.9334446336714446e-05,
"loss": 0.6559,
"step": 1650
},
{
"epoch": 0.5010094967471771,
"grad_norm": 3.96875,
"learning_rate": 1.9313054928502596e-05,
"loss": 0.6709,
"step": 1675
},
{
"epoch": 0.5084872504299708,
"grad_norm": 4.21875,
"learning_rate": 1.929133738491149e-05,
"loss": 0.6513,
"step": 1700
},
{
"epoch": 0.5159650041127646,
"grad_norm": 4.21875,
"learning_rate": 1.9269294466471306e-05,
"loss": 0.6478,
"step": 1725
},
{
"epoch": 0.5234427577955583,
"grad_norm": 4.21875,
"learning_rate": 1.9246926945106574e-05,
"loss": 0.6424,
"step": 1750
},
{
"epoch": 0.530920511478352,
"grad_norm": 4.53125,
"learning_rate": 1.9224235604109153e-05,
"loss": 0.6362,
"step": 1775
},
{
"epoch": 0.5383982651611456,
"grad_norm": 3.9375,
"learning_rate": 1.9201221238110783e-05,
"loss": 0.6278,
"step": 1800
},
{
"epoch": 0.5383982651611456,
"eval_loss": 0.785617470741272,
"eval_runtime": 20.5638,
"eval_samples_per_second": 21.397,
"eval_steps_per_second": 21.397,
"step": 1800
},
{
"epoch": 0.5458760188439393,
"grad_norm": 3.78125,
"learning_rate": 1.917788465305528e-05,
"loss": 0.6042,
"step": 1825
},
{
"epoch": 0.553353772526733,
"grad_norm": 4.0,
"learning_rate": 1.9154226666170296e-05,
"loss": 0.6188,
"step": 1850
},
{
"epoch": 0.5608315262095267,
"grad_norm": 5.84375,
"learning_rate": 1.9130248105938705e-05,
"loss": 0.6206,
"step": 1875
},
{
"epoch": 0.5683092798923204,
"grad_norm": 3.8125,
"learning_rate": 1.9105949812069592e-05,
"loss": 0.644,
"step": 1900
},
{
"epoch": 0.5757870335751141,
"grad_norm": 3.890625,
"learning_rate": 1.9081332635468844e-05,
"loss": 0.6196,
"step": 1925
},
{
"epoch": 0.5832647872579078,
"grad_norm": 4.59375,
"learning_rate": 1.9056397438209366e-05,
"loss": 0.6196,
"step": 1950
},
{
"epoch": 0.5907425409407014,
"grad_norm": 3.859375,
"learning_rate": 1.9031145093500855e-05,
"loss": 0.6264,
"step": 1975
},
{
"epoch": 0.5982202946234951,
"grad_norm": 4.09375,
"learning_rate": 1.9005576485659274e-05,
"loss": 0.5989,
"step": 2000
},
{
"epoch": 0.6056980483062888,
"grad_norm": 3.796875,
"learning_rate": 1.897969251007584e-05,
"loss": 0.6121,
"step": 2025
},
{
"epoch": 0.6131758019890825,
"grad_norm": 4.1875,
"learning_rate": 1.8953494073185684e-05,
"loss": 0.6285,
"step": 2050
},
{
"epoch": 0.6206535556718762,
"grad_norm": 4.28125,
"learning_rate": 1.8926982092436117e-05,
"loss": 0.6158,
"step": 2075
},
{
"epoch": 0.6281313093546699,
"grad_norm": 3.375,
"learning_rate": 1.890015749625448e-05,
"loss": 0.6174,
"step": 2100
},
{
"epoch": 0.6281313093546699,
"eval_loss": 0.7781485915184021,
"eval_runtime": 21.1506,
"eval_samples_per_second": 20.803,
"eval_steps_per_second": 20.803,
"step": 2100
},
{
"epoch": 0.6356090630374636,
"grad_norm": 3.921875,
"learning_rate": 1.8873021224015662e-05,
"loss": 0.5945,
"step": 2125
},
{
"epoch": 0.6430868167202572,
"grad_norm": 3.921875,
"learning_rate": 1.884557422600917e-05,
"loss": 0.6124,
"step": 2150
},
{
"epoch": 0.6505645704030509,
"grad_norm": 3.71875,
"learning_rate": 1.8817817463405872e-05,
"loss": 0.6007,
"step": 2175
},
{
"epoch": 0.6580423240858446,
"grad_norm": 4.3125,
"learning_rate": 1.878975190822434e-05,
"loss": 0.6045,
"step": 2200
},
{
"epoch": 0.6655200777686383,
"grad_norm": 3.84375,
"learning_rate": 1.8761378543296795e-05,
"loss": 0.5837,
"step": 2225
},
{
"epoch": 0.672997831451432,
"grad_norm": 3.515625,
"learning_rate": 1.8732698362234696e-05,
"loss": 0.5839,
"step": 2250
},
{
"epoch": 0.6804755851342257,
"grad_norm": 3.875,
"learning_rate": 1.8703712369393953e-05,
"loss": 0.5932,
"step": 2275
},
{
"epoch": 0.6879533388170194,
"grad_norm": 4.0625,
"learning_rate": 1.867442157983975e-05,
"loss": 0.5795,
"step": 2300
},
{
"epoch": 0.695431092499813,
"grad_norm": 4.125,
"learning_rate": 1.8644827019310984e-05,
"loss": 0.5887,
"step": 2325
},
{
"epoch": 0.7029088461826067,
"grad_norm": 4.21875,
"learning_rate": 1.861492972418437e-05,
"loss": 0.6041,
"step": 2350
},
{
"epoch": 0.7103865998654004,
"grad_norm": 3.53125,
"learning_rate": 1.8584730741438128e-05,
"loss": 0.5676,
"step": 2375
},
{
"epoch": 0.7178643535481941,
"grad_norm": 4.3125,
"learning_rate": 1.855423112861532e-05,
"loss": 0.5752,
"step": 2400
},
{
"epoch": 0.7178643535481941,
"eval_loss": 0.7577213644981384,
"eval_runtime": 19.9112,
"eval_samples_per_second": 22.098,
"eval_steps_per_second": 22.098,
"step": 2400
},
{
"epoch": 0.7253421072309878,
"grad_norm": 3.6875,
"learning_rate": 1.8523431953786838e-05,
"loss": 0.5731,
"step": 2425
},
{
"epoch": 0.7328198609137815,
"grad_norm": 3.890625,
"learning_rate": 1.8492334295513968e-05,
"loss": 0.5611,
"step": 2450
},
{
"epoch": 0.7402976145965752,
"grad_norm": 3.859375,
"learning_rate": 1.846093924281065e-05,
"loss": 0.5787,
"step": 2475
},
{
"epoch": 0.7477753682793689,
"grad_norm": 3.984375,
"learning_rate": 1.8429247895105314e-05,
"loss": 0.5611,
"step": 2500
},
{
"epoch": 0.7552531219621625,
"grad_norm": 4.15625,
"learning_rate": 1.8397261362202402e-05,
"loss": 0.5805,
"step": 2525
},
{
"epoch": 0.7627308756449562,
"grad_norm": 4.03125,
"learning_rate": 1.836498076424349e-05,
"loss": 0.5648,
"step": 2550
},
{
"epoch": 0.7702086293277499,
"grad_norm": 3.78125,
"learning_rate": 1.833240723166807e-05,
"loss": 0.5809,
"step": 2575
},
{
"epoch": 0.7776863830105436,
"grad_norm": 3.484375,
"learning_rate": 1.8299541905173955e-05,
"loss": 0.5789,
"step": 2600
},
{
"epoch": 0.7851641366933373,
"grad_norm": 3.75,
"learning_rate": 1.8266385935677338e-05,
"loss": 0.5672,
"step": 2625
},
{
"epoch": 0.792641890376131,
"grad_norm": 3.796875,
"learning_rate": 1.8232940484272482e-05,
"loss": 0.5657,
"step": 2650
},
{
"epoch": 0.8001196440589247,
"grad_norm": 3.875,
"learning_rate": 1.819920672219108e-05,
"loss": 0.5514,
"step": 2675
},
{
"epoch": 0.8075973977417183,
"grad_norm": 4.09375,
"learning_rate": 1.8165185830761193e-05,
"loss": 0.5625,
"step": 2700
},
{
"epoch": 0.8075973977417183,
"eval_loss": 0.736329972743988,
"eval_runtime": 19.4873,
"eval_samples_per_second": 22.579,
"eval_steps_per_second": 22.579,
"step": 2700
},
{
"epoch": 0.815075151424512,
"grad_norm": 3.703125,
"learning_rate": 1.8130879001365944e-05,
"loss": 0.5391,
"step": 2725
},
{
"epoch": 0.8225529051073057,
"grad_norm": 3.71875,
"learning_rate": 1.8096287435401744e-05,
"loss": 0.5438,
"step": 2750
},
{
"epoch": 0.8300306587900994,
"grad_norm": 3.859375,
"learning_rate": 1.8061412344236245e-05,
"loss": 0.5504,
"step": 2775
},
{
"epoch": 0.8375084124728931,
"grad_norm": 3.828125,
"learning_rate": 1.8026254949165915e-05,
"loss": 0.5569,
"step": 2800
},
{
"epoch": 0.8449861661556868,
"grad_norm": 3.46875,
"learning_rate": 1.7990816481373267e-05,
"loss": 0.5397,
"step": 2825
},
{
"epoch": 0.8524639198384805,
"grad_norm": 3.609375,
"learning_rate": 1.795509818188375e-05,
"loss": 0.5519,
"step": 2850
},
{
"epoch": 0.8599416735212743,
"grad_norm": 3.609375,
"learning_rate": 1.791910130152227e-05,
"loss": 0.539,
"step": 2875
},
{
"epoch": 0.867419427204068,
"grad_norm": 4.125,
"learning_rate": 1.788282710086942e-05,
"loss": 0.5517,
"step": 2900
},
{
"epoch": 0.8748971808868616,
"grad_norm": 3.421875,
"learning_rate": 1.784627685021731e-05,
"loss": 0.5283,
"step": 2925
},
{
"epoch": 0.8823749345696553,
"grad_norm": 3.6875,
"learning_rate": 1.7809451829525083e-05,
"loss": 0.5439,
"step": 2950
},
{
"epoch": 0.889852688252449,
"grad_norm": 3.75,
"learning_rate": 1.777235332837411e-05,
"loss": 0.5311,
"step": 2975
},
{
"epoch": 0.8973304419352427,
"grad_norm": 3.984375,
"learning_rate": 1.773498264592281e-05,
"loss": 0.5298,
"step": 3000
},
{
"epoch": 0.8973304419352427,
"eval_loss": 0.7269648313522339,
"eval_runtime": 20.5732,
"eval_samples_per_second": 21.387,
"eval_steps_per_second": 21.387,
"step": 3000
},
{
"epoch": 0.9048081956180364,
"grad_norm": 4.125,
"learning_rate": 1.7697341090861163e-05,
"loss": 0.5296,
"step": 3025
},
{
"epoch": 0.9122859493008301,
"grad_norm": 3.375,
"learning_rate": 1.7659429981364887e-05,
"loss": 0.5318,
"step": 3050
},
{
"epoch": 0.9197637029836238,
"grad_norm": 4.125,
"learning_rate": 1.7621250645049267e-05,
"loss": 0.5406,
"step": 3075
},
{
"epoch": 0.9272414566664174,
"grad_norm": 3.6875,
"learning_rate": 1.7582804418922666e-05,
"loss": 0.5264,
"step": 3100
},
{
"epoch": 0.9347192103492111,
"grad_norm": 4.25,
"learning_rate": 1.7544092649339704e-05,
"loss": 0.5119,
"step": 3125
},
{
"epoch": 0.9421969640320048,
"grad_norm": 3.546875,
"learning_rate": 1.7505116691954117e-05,
"loss": 0.5283,
"step": 3150
},
{
"epoch": 0.9496747177147985,
"grad_norm": 3.8125,
"learning_rate": 1.746587791167126e-05,
"loss": 0.5297,
"step": 3175
},
{
"epoch": 0.9571524713975922,
"grad_norm": 3.609375,
"learning_rate": 1.7426377682600345e-05,
"loss": 0.52,
"step": 3200
},
{
"epoch": 0.9646302250803859,
"grad_norm": 3.828125,
"learning_rate": 1.738661738800629e-05,
"loss": 0.515,
"step": 3225
},
{
"epoch": 0.9721079787631796,
"grad_norm": 3.671875,
"learning_rate": 1.7346598420261294e-05,
"loss": 0.5171,
"step": 3250
},
{
"epoch": 0.9795857324459732,
"grad_norm": 3.65625,
"learning_rate": 1.730632218079607e-05,
"loss": 0.5142,
"step": 3275
},
{
"epoch": 0.9870634861287669,
"grad_norm": 3.375,
"learning_rate": 1.7265790080050772e-05,
"loss": 0.5355,
"step": 3300
},
{
"epoch": 0.9870634861287669,
"eval_loss": 0.706967294216156,
"eval_runtime": 20.875,
"eval_samples_per_second": 21.078,
"eval_steps_per_second": 21.078,
"step": 3300
},
{
"epoch": 0.9945412398115606,
"grad_norm": 3.5625,
"learning_rate": 1.7225003537425603e-05,
"loss": 0.5135,
"step": 3325
}
],
"logging_steps": 25,
"max_steps": 13372,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 1672,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.2890860801163264e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}
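
A minimal sketch (not part of the checkpoint itself) of one way to read this file and separate the training-loss records from the periodic eval records in "log_history". The field names mirror the JSON above; the local file path and the print format are illustrative assumptions.

    import json

    # assumed local path to the file shown above
    with open("trainer_state.json") as f:
        state = json.load(f)

    # training entries carry "loss" (logged every logging_steps = 25 steps),
    # evaluation entries carry "eval_loss" (logged every eval_steps = 300 steps)
    train_logs = [e for e in state["log_history"] if "loss" in e]
    eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

    for e in eval_logs:
        print(f'step {e["step"]:>5}: eval_loss={e["eval_loss"]:.4f}')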