{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.47914153807761095,
"eval_steps": 3000,
"global_step": 10800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002218247861470421,
"grad_norm": 0.15810145437717438,
"learning_rate": 2e-05,
"loss": 2.8627,
"step": 50
},
{
"epoch": 0.004436495722940842,
"grad_norm": 0.1590433567762375,
"learning_rate": 4e-05,
"loss": 2.8607,
"step": 100
},
{
"epoch": 0.006654743584411263,
"grad_norm": 0.15798641741275787,
"learning_rate": 6e-05,
"loss": 2.8623,
"step": 150
},
{
"epoch": 0.008872991445881684,
"grad_norm": 0.16127805411815643,
"learning_rate": 8e-05,
"loss": 2.8608,
"step": 200
},
{
"epoch": 0.011091239307352105,
"grad_norm": 0.1587396264076233,
"learning_rate": 0.0001,
"loss": 2.8608,
"step": 250
},
{
"epoch": 0.013309487168822525,
"grad_norm": 0.160736083984375,
"learning_rate": 0.00012,
"loss": 2.8563,
"step": 300
},
{
"epoch": 0.015527735030292948,
"grad_norm": 0.16256989538669586,
"learning_rate": 0.00014,
"loss": 2.8549,
"step": 350
},
{
"epoch": 0.01774598289176337,
"grad_norm": 0.16194568574428558,
"learning_rate": 0.00016,
"loss": 2.8557,
"step": 400
},
{
"epoch": 0.01996423075323379,
"grad_norm": 0.15836463868618011,
"learning_rate": 0.00018,
"loss": 2.8545,
"step": 450
},
{
"epoch": 0.02218247861470421,
"grad_norm": 0.16059577465057373,
"learning_rate": 0.0002,
"loss": 2.8522,
"step": 500
},
{
"epoch": 0.024400726476174632,
"grad_norm": 0.16031378507614136,
"learning_rate": 0.00022000000000000003,
"loss": 2.8481,
"step": 550
},
{
"epoch": 0.02661897433764505,
"grad_norm": 0.16000501811504364,
"learning_rate": 0.00024,
"loss": 2.8431,
"step": 600
},
{
"epoch": 0.028837222199115473,
"grad_norm": 0.15952646732330322,
"learning_rate": 0.00026000000000000003,
"loss": 2.8475,
"step": 650
},
{
"epoch": 0.031055470060585896,
"grad_norm": 0.16443726420402527,
"learning_rate": 0.00028,
"loss": 2.8452,
"step": 700
},
{
"epoch": 0.033273717922056315,
"grad_norm": 0.1644088476896286,
"learning_rate": 0.00030000000000000003,
"loss": 2.8458,
"step": 750
},
{
"epoch": 0.03549196578352674,
"grad_norm": 0.16272033751010895,
"learning_rate": 0.00032,
"loss": 2.8435,
"step": 800
},
{
"epoch": 0.03771021364499716,
"grad_norm": 0.16485804319381714,
"learning_rate": 0.00034,
"loss": 2.8481,
"step": 850
},
{
"epoch": 0.03992846150646758,
"grad_norm": 0.1669188290834427,
"learning_rate": 0.00036,
"loss": 2.8555,
"step": 900
},
{
"epoch": 0.042146709367938,
"grad_norm": 0.16288943588733673,
"learning_rate": 0.00038,
"loss": 2.851,
"step": 950
},
{
"epoch": 0.04436495722940842,
"grad_norm": 0.1651136726140976,
"learning_rate": 0.0004,
"loss": 2.8443,
"step": 1000
},
{
"epoch": 0.04658320509087884,
"grad_norm": 0.16190673410892487,
"learning_rate": 0.00039999468202328424,
"loss": 2.8398,
"step": 1050
},
{
"epoch": 0.048801452952349264,
"grad_norm": 0.1649934947490692,
"learning_rate": 0.00039997872837594555,
"loss": 2.8371,
"step": 1100
},
{
"epoch": 0.051019700813819686,
"grad_norm": 0.16184477508068085,
"learning_rate": 0.00039995213990639536,
"loss": 2.8347,
"step": 1150
},
{
"epoch": 0.0532379486752901,
"grad_norm": 0.1629864126443863,
"learning_rate": 0.0003999149180286022,
"loss": 2.834,
"step": 1200
},
{
"epoch": 0.055456196536760524,
"grad_norm": 0.1627526730298996,
"learning_rate": 0.00039986706472201685,
"loss": 2.8309,
"step": 1250
},
{
"epoch": 0.05767444439823095,
"grad_norm": 0.1642647087574005,
"learning_rate": 0.000399808582531467,
"loss": 2.8352,
"step": 1300
},
{
"epoch": 0.05989269225970137,
"grad_norm": 0.16397783160209656,
"learning_rate": 0.000399739474567022,
"loss": 2.8317,
"step": 1350
},
{
"epoch": 0.06211094012117179,
"grad_norm": 0.16319701075553894,
"learning_rate": 0.00039965974450382726,
"loss": 2.8322,
"step": 1400
},
{
"epoch": 0.06432918798264221,
"grad_norm": 0.16067005693912506,
"learning_rate": 0.000399569396581909,
"loss": 2.8279,
"step": 1450
},
{
"epoch": 0.06654743584411263,
"grad_norm": 0.16118553280830383,
"learning_rate": 0.00039946843560594866,
"loss": 2.8323,
"step": 1500
},
{
"epoch": 0.06876568370558306,
"grad_norm": 0.16291728615760803,
"learning_rate": 0.0003993568669450274,
"loss": 2.8301,
"step": 1550
},
{
"epoch": 0.07098393156705347,
"grad_norm": 0.1590035855770111,
"learning_rate": 0.0003992346965323407,
"loss": 2.8214,
"step": 1600
},
{
"epoch": 0.07320217942852389,
"grad_norm": 0.16236472129821777,
"learning_rate": 0.00039910193086488253,
"loss": 2.8242,
"step": 1650
},
{
"epoch": 0.07542042728999432,
"grad_norm": 0.1617489606142044,
"learning_rate": 0.0003989585770031003,
"loss": 2.8231,
"step": 1700
},
{
"epoch": 0.07763867515146473,
"grad_norm": 0.15960238873958588,
"learning_rate": 0.000398804642570519,
"loss": 2.8248,
"step": 1750
},
{
"epoch": 0.07985692301293516,
"grad_norm": 0.16391754150390625,
"learning_rate": 0.0003986401357533358,
"loss": 2.8222,
"step": 1800
},
{
"epoch": 0.08207517087440558,
"grad_norm": 0.16161847114562988,
"learning_rate": 0.000398465065299985,
"loss": 2.8153,
"step": 1850
},
{
"epoch": 0.084293418735876,
"grad_norm": 0.16447125375270844,
"learning_rate": 0.00039827944052067265,
"loss": 2.818,
"step": 1900
},
{
"epoch": 0.08651166659734642,
"grad_norm": 0.16384591162204742,
"learning_rate": 0.0003980832712868812,
"loss": 2.8093,
"step": 1950
},
{
"epoch": 0.08872991445881684,
"grad_norm": 0.16317427158355713,
"learning_rate": 0.0003978765680308447,
"loss": 2.8113,
"step": 2000
},
{
"epoch": 0.09094816232028727,
"grad_norm": 0.16197824478149414,
"learning_rate": 0.00039765934174499436,
"loss": 2.8134,
"step": 2050
},
{
"epoch": 0.09316641018175768,
"grad_norm": 0.16196754574775696,
"learning_rate": 0.00039743160398137344,
"loss": 2.8147,
"step": 2100
},
{
"epoch": 0.0953846580432281,
"grad_norm": 0.16696424782276154,
"learning_rate": 0.00039719336685102314,
"loss": 2.811,
"step": 2150
},
{
"epoch": 0.09760290590469853,
"grad_norm": 0.16266262531280518,
"learning_rate": 0.0003969446430233386,
"loss": 2.8103,
"step": 2200
},
{
"epoch": 0.09982115376616894,
"grad_norm": 0.16161397099494934,
"learning_rate": 0.0003966854457253951,
"loss": 2.8017,
"step": 2250
},
{
"epoch": 0.10203940162763937,
"grad_norm": 0.1631053388118744,
"learning_rate": 0.0003964157887412445,
"loss": 2.8034,
"step": 2300
},
{
"epoch": 0.10425764948910979,
"grad_norm": 0.16185788810253143,
"learning_rate": 0.00039613568641118255,
"loss": 2.8027,
"step": 2350
},
{
"epoch": 0.1064758973505802,
"grad_norm": 0.16428661346435547,
"learning_rate": 0.00039584515363098584,
"loss": 2.8031,
"step": 2400
},
{
"epoch": 0.10869414521205063,
"grad_norm": 0.1625480055809021,
"learning_rate": 0.00039554420585112,
"loss": 2.7968,
"step": 2450
},
{
"epoch": 0.11091239307352105,
"grad_norm": 0.1638619303703308,
"learning_rate": 0.0003952328590759179,
"loss": 2.8007,
"step": 2500
},
{
"epoch": 0.11313064093499148,
"grad_norm": 0.16504357755184174,
"learning_rate": 0.0003949111298627286,
"loss": 2.7921,
"step": 2550
},
{
"epoch": 0.1153488887964619,
"grad_norm": 0.16429375112056732,
"learning_rate": 0.0003945790353210367,
"loss": 2.7951,
"step": 2600
},
{
"epoch": 0.11756713665793232,
"grad_norm": 0.166097030043602,
"learning_rate": 0.0003942365931115526,
"loss": 2.7948,
"step": 2650
},
{
"epoch": 0.11978538451940274,
"grad_norm": 0.16275139153003693,
"learning_rate": 0.0003938838214452733,
"loss": 2.79,
"step": 2700
},
{
"epoch": 0.12200363238087315,
"grad_norm": 0.16379590332508087,
"learning_rate": 0.0003935207390825137,
"loss": 2.7896,
"step": 2750
},
{
"epoch": 0.12422188024234358,
"grad_norm": 0.16332408785820007,
"learning_rate": 0.0003931473653319095,
"loss": 2.7848,
"step": 2800
},
{
"epoch": 0.126440128103814,
"grad_norm": 0.16235879063606262,
"learning_rate": 0.00039276372004938987,
"loss": 2.7836,
"step": 2850
},
{
"epoch": 0.12865837596528443,
"grad_norm": 0.1654053032398224,
"learning_rate": 0.00039236982363712145,
"loss": 2.7845,
"step": 2900
},
{
"epoch": 0.13087662382675483,
"grad_norm": 0.16393068432807922,
"learning_rate": 0.00039196569704242376,
"loss": 2.7796,
"step": 2950
},
{
"epoch": 0.13309487168822526,
"grad_norm": 0.16517628729343414,
"learning_rate": 0.0003915513617566551,
"loss": 2.7738,
"step": 3000
},
{
"epoch": 0.13309487168822526,
"eval_accuracy": 0.4247227650219834,
"eval_loss": 2.8996665477752686,
"eval_runtime": 243.2366,
"eval_samples_per_second": 8.222,
"eval_steps_per_second": 1.028,
"step": 3000
},
{
"epoch": 0.1353131195496957,
"grad_norm": 0.16540038585662842,
"learning_rate": 0.00039112683981406936,
"loss": 2.7708,
"step": 3050
},
{
"epoch": 0.13753136741116612,
"grad_norm": 0.16403205692768097,
"learning_rate": 0.00039069215379064465,
"loss": 2.7709,
"step": 3100
},
{
"epoch": 0.13974961527263652,
"grad_norm": 0.16498889029026031,
"learning_rate": 0.0003902473268028826,
"loss": 2.7683,
"step": 3150
},
{
"epoch": 0.14196786313410695,
"grad_norm": 0.16713927686214447,
"learning_rate": 0.00038979238250657863,
"loss": 2.7578,
"step": 3200
},
{
"epoch": 0.14418611099557738,
"grad_norm": 0.16905058920383453,
"learning_rate": 0.00038932734509556467,
"loss": 2.7602,
"step": 3250
},
{
"epoch": 0.14640435885704778,
"grad_norm": 0.16431044042110443,
"learning_rate": 0.0003888522393004219,
"loss": 2.7685,
"step": 3300
},
{
"epoch": 0.1486226067185182,
"grad_norm": 0.163705512881279,
"learning_rate": 0.00038836709038716583,
"loss": 2.8434,
"step": 3350
},
{
"epoch": 0.15084085457998864,
"grad_norm": 0.1622520387172699,
"learning_rate": 0.0003878719241559027,
"loss": 2.8349,
"step": 3400
},
{
"epoch": 0.15305910244145907,
"grad_norm": 0.16072827577590942,
"learning_rate": 0.00038736676693945746,
"loss": 2.8369,
"step": 3450
},
{
"epoch": 0.15527735030292947,
"grad_norm": 0.16206832230091095,
"learning_rate": 0.0003868516456019733,
"loss": 2.8404,
"step": 3500
},
{
"epoch": 0.1574955981643999,
"grad_norm": 0.16148249804973602,
"learning_rate": 0.0003863265875374829,
"loss": 2.836,
"step": 3550
},
{
"epoch": 0.15971384602587033,
"grad_norm": 0.16401226818561554,
"learning_rate": 0.0003857916206684519,
"loss": 2.8369,
"step": 3600
},
{
"epoch": 0.16193209388734073,
"grad_norm": 0.15987250208854675,
"learning_rate": 0.00038524677344429386,
"loss": 2.8363,
"step": 3650
},
{
"epoch": 0.16415034174881116,
"grad_norm": 0.16117645800113678,
"learning_rate": 0.00038469207483985725,
"loss": 2.8426,
"step": 3700
},
{
"epoch": 0.1663685896102816,
"grad_norm": 0.16374363005161285,
"learning_rate": 0.00038412755435388474,
"loss": 2.8416,
"step": 3750
},
{
"epoch": 0.168586837471752,
"grad_norm": 0.16495129466056824,
"learning_rate": 0.0003835532420074444,
"loss": 2.8396,
"step": 3800
},
{
"epoch": 0.17080508533322242,
"grad_norm": 0.16315814852714539,
"learning_rate": 0.0003829691683423329,
"loss": 2.8358,
"step": 3850
},
{
"epoch": 0.17302333319469285,
"grad_norm": 0.16098596155643463,
"learning_rate": 0.00038237536441945193,
"loss": 2.8354,
"step": 3900
},
{
"epoch": 0.17524158105616328,
"grad_norm": 0.16226187348365784,
"learning_rate": 0.00038177186181715577,
"loss": 2.8352,
"step": 3950
},
{
"epoch": 0.17745982891763368,
"grad_norm": 0.15939714014530182,
"learning_rate": 0.00038115869262957233,
"loss": 2.835,
"step": 4000
},
{
"epoch": 0.1796780767791041,
"grad_norm": 0.1622256189584732,
"learning_rate": 0.00038053588946489615,
"loss": 2.8391,
"step": 4050
},
{
"epoch": 0.18189632464057454,
"grad_norm": 0.16176052391529083,
"learning_rate": 0.0003799034854436545,
"loss": 2.8371,
"step": 4100
},
{
"epoch": 0.18411457250204494,
"grad_norm": 0.16316720843315125,
"learning_rate": 0.0003792615141969462,
"loss": 2.8365,
"step": 4150
},
{
"epoch": 0.18633282036351537,
"grad_norm": 0.16207610070705414,
"learning_rate": 0.0003786100098646524,
"loss": 2.8346,
"step": 4200
},
{
"epoch": 0.1885510682249858,
"grad_norm": 0.1638234406709671,
"learning_rate": 0.000377949007093622,
"loss": 2.8319,
"step": 4250
},
{
"epoch": 0.1907693160864562,
"grad_norm": 0.1628599315881729,
"learning_rate": 0.0003772785410358283,
"loss": 2.8369,
"step": 4300
},
{
"epoch": 0.19298756394792663,
"grad_norm": 0.1653933823108673,
"learning_rate": 0.00037659864734650026,
"loss": 2.8304,
"step": 4350
},
{
"epoch": 0.19520581180939706,
"grad_norm": 0.16370812058448792,
"learning_rate": 0.0003759093621822259,
"loss": 2.8369,
"step": 4400
},
{
"epoch": 0.19742405967086749,
"grad_norm": 0.1629050225019455,
"learning_rate": 0.0003752107221990298,
"loss": 2.8339,
"step": 4450
},
{
"epoch": 0.1996423075323379,
"grad_norm": 0.1610899269580841,
"learning_rate": 0.00037450276455042354,
"loss": 2.829,
"step": 4500
},
{
"epoch": 0.20186055539380832,
"grad_norm": 0.1629941165447235,
"learning_rate": 0.00037378552688543005,
"loss": 2.8351,
"step": 4550
},
{
"epoch": 0.20407880325527875,
"grad_norm": 0.16439735889434814,
"learning_rate": 0.0003730590473465814,
"loss": 2.8316,
"step": 4600
},
{
"epoch": 0.20629705111674915,
"grad_norm": 0.16572241485118866,
"learning_rate": 0.00037232336456789023,
"loss": 2.8335,
"step": 4650
},
{
"epoch": 0.20851529897821958,
"grad_norm": 0.16380038857460022,
"learning_rate": 0.00037157851767279543,
"loss": 2.8286,
"step": 4700
},
{
"epoch": 0.21073354683969,
"grad_norm": 0.16284549236297607,
"learning_rate": 0.00037082454627208156,
"loss": 2.8301,
"step": 4750
},
{
"epoch": 0.2129517947011604,
"grad_norm": 0.16597482562065125,
"learning_rate": 0.0003700614904617721,
"loss": 2.8323,
"step": 4800
},
{
"epoch": 0.21517004256263084,
"grad_norm": 0.16378666460514069,
"learning_rate": 0.0003692893908209973,
"loss": 2.8299,
"step": 4850
},
{
"epoch": 0.21738829042410127,
"grad_norm": 0.1630343496799469,
"learning_rate": 0.0003685082884098363,
"loss": 2.8333,
"step": 4900
},
{
"epoch": 0.2196065382855717,
"grad_norm": 0.16490814089775085,
"learning_rate": 0.00036771822476713346,
"loss": 2.8307,
"step": 4950
},
{
"epoch": 0.2218247861470421,
"grad_norm": 0.1655721366405487,
"learning_rate": 0.00036691924190828935,
"loss": 2.8301,
"step": 5000
},
{
"epoch": 0.22404303400851253,
"grad_norm": 0.16776245832443237,
"learning_rate": 0.0003661113823230264,
"loss": 2.8228,
"step": 5050
},
{
"epoch": 0.22626128186998296,
"grad_norm": 0.1626349687576294,
"learning_rate": 0.00036529468897312926,
"loss": 2.8262,
"step": 5100
},
{
"epoch": 0.22847952973145336,
"grad_norm": 0.16331753134727478,
"learning_rate": 0.00036446920529016,
"loss": 2.8282,
"step": 5150
},
{
"epoch": 0.2306977775929238,
"grad_norm": 0.1676001250743866,
"learning_rate": 0.00036363497517314877,
"loss": 2.8313,
"step": 5200
},
{
"epoch": 0.23291602545439422,
"grad_norm": 0.16441357135772705,
"learning_rate": 0.000362792042986259,
"loss": 2.8278,
"step": 5250
},
{
"epoch": 0.23513427331586464,
"grad_norm": 0.16601622104644775,
"learning_rate": 0.000361940453556428,
"loss": 2.8303,
"step": 5300
},
{
"epoch": 0.23735252117733505,
"grad_norm": 0.1679011583328247,
"learning_rate": 0.0003610802521709833,
"loss": 2.8252,
"step": 5350
},
{
"epoch": 0.23957076903880548,
"grad_norm": 0.1650955229997635,
"learning_rate": 0.0003602114845752345,
"loss": 2.8299,
"step": 5400
},
{
"epoch": 0.2417890169002759,
"grad_norm": 0.16651777923107147,
"learning_rate": 0.00035933419697004,
"loss": 2.832,
"step": 5450
},
{
"epoch": 0.2440072647617463,
"grad_norm": 0.166709303855896,
"learning_rate": 0.00035844843600935024,
"loss": 2.8262,
"step": 5500
},
{
"epoch": 0.24622551262321674,
"grad_norm": 0.16586416959762573,
"learning_rate": 0.000357554248797727,
"loss": 2.8255,
"step": 5550
},
{
"epoch": 0.24844376048468717,
"grad_norm": 0.1647614985704422,
"learning_rate": 0.00035665168288783795,
"loss": 2.8298,
"step": 5600
},
{
"epoch": 0.2506620083461576,
"grad_norm": 0.16310204565525055,
"learning_rate": 0.000355740786277928,
"loss": 2.8273,
"step": 5650
},
{
"epoch": 0.252880256207628,
"grad_norm": 0.1629767119884491,
"learning_rate": 0.00035482160740926683,
"loss": 2.8231,
"step": 5700
},
{
"epoch": 0.2550985040690984,
"grad_norm": 0.16427451372146606,
"learning_rate": 0.00035389419516357253,
"loss": 2.8188,
"step": 5750
},
{
"epoch": 0.25731675193056885,
"grad_norm": 0.1655891388654709,
"learning_rate": 0.0003529585988604125,
"loss": 2.8258,
"step": 5800
},
{
"epoch": 0.25953499979203926,
"grad_norm": 0.16402335464954376,
"learning_rate": 0.0003520148682545803,
"loss": 2.8254,
"step": 5850
},
{
"epoch": 0.26175324765350966,
"grad_norm": 0.1638861894607544,
"learning_rate": 0.0003510630535334497,
"loss": 2.8298,
"step": 5900
},
{
"epoch": 0.2639714955149801,
"grad_norm": 0.16864845156669617,
"learning_rate": 0.0003501032053143061,
"loss": 2.8238,
"step": 5950
},
{
"epoch": 0.2661897433764505,
"grad_norm": 0.16578635573387146,
"learning_rate": 0.0003491353746416541,
"loss": 2.8225,
"step": 6000
},
{
"epoch": 0.2661897433764505,
"eval_accuracy": 0.4264626282364436,
"eval_loss": 2.8843319416046143,
"eval_runtime": 242.3694,
"eval_samples_per_second": 8.252,
"eval_steps_per_second": 1.031,
"step": 6000
},
{
"epoch": 0.268407991237921,
"grad_norm": 0.16673897206783295,
"learning_rate": 0.00034815961298450377,
"loss": 2.823,
"step": 6050
},
{
"epoch": 0.2706262390993914,
"grad_norm": 0.16588376462459564,
"learning_rate": 0.0003471759722336326,
"loss": 2.8193,
"step": 6100
},
{
"epoch": 0.2728444869608618,
"grad_norm": 0.16813361644744873,
"learning_rate": 0.00034618450469882687,
"loss": 2.8267,
"step": 6150
},
{
"epoch": 0.27506273482233223,
"grad_norm": 0.16656942665576935,
"learning_rate": 0.0003451852631060991,
"loss": 2.8219,
"step": 6200
},
{
"epoch": 0.27728098268380263,
"grad_norm": 0.1666443794965744,
"learning_rate": 0.0003441783005948846,
"loss": 2.8233,
"step": 6250
},
{
"epoch": 0.27949923054527304,
"grad_norm": 0.1673704832792282,
"learning_rate": 0.0003431636707152152,
"loss": 2.824,
"step": 6300
},
{
"epoch": 0.2817174784067435,
"grad_norm": 0.16707104444503784,
"learning_rate": 0.00034214142742487177,
"loss": 2.8221,
"step": 6350
},
{
"epoch": 0.2839357262682139,
"grad_norm": 0.16775397956371307,
"learning_rate": 0.0003411116250865143,
"loss": 2.8234,
"step": 6400
},
{
"epoch": 0.2861539741296843,
"grad_norm": 0.16813720762729645,
"learning_rate": 0.0003400743184647915,
"loss": 2.8258,
"step": 6450
},
{
"epoch": 0.28837222199115475,
"grad_norm": 0.16362161934375763,
"learning_rate": 0.00033902956272342783,
"loss": 2.8232,
"step": 6500
},
{
"epoch": 0.29059046985262516,
"grad_norm": 0.16950780153274536,
"learning_rate": 0.00033797741342229054,
"loss": 2.821,
"step": 6550
},
{
"epoch": 0.29280871771409556,
"grad_norm": 0.1657160073518753,
"learning_rate": 0.00033691792651443435,
"loss": 2.8181,
"step": 6600
},
{
"epoch": 0.295026965575566,
"grad_norm": 0.1689310073852539,
"learning_rate": 0.0003358511583431264,
"loss": 2.8257,
"step": 6650
},
{
"epoch": 0.2972452134370364,
"grad_norm": 0.16674135625362396,
"learning_rate": 0.00033477716563884956,
"loss": 2.8209,
"step": 6700
},
{
"epoch": 0.2994634612985068,
"grad_norm": 0.16600748896598816,
"learning_rate": 0.00033369600551628586,
"loss": 2.8227,
"step": 6750
},
{
"epoch": 0.3016817091599773,
"grad_norm": 0.16666853427886963,
"learning_rate": 0.0003326077354712789,
"loss": 2.8199,
"step": 6800
},
{
"epoch": 0.3038999570214477,
"grad_norm": 0.1671936959028244,
"learning_rate": 0.00033151241337777624,
"loss": 2.82,
"step": 6850
},
{
"epoch": 0.30611820488291813,
"grad_norm": 0.1675061583518982,
"learning_rate": 0.00033041009748475166,
"loss": 2.8246,
"step": 6900
},
{
"epoch": 0.30833645274438853,
"grad_norm": 0.16512750089168549,
"learning_rate": 0.0003293008464131079,
"loss": 2.8178,
"step": 6950
},
{
"epoch": 0.31055470060585894,
"grad_norm": 0.1670486181974411,
"learning_rate": 0.0003281847191525585,
"loss": 2.8185,
"step": 7000
},
{
"epoch": 0.3127729484673294,
"grad_norm": 0.1692744940519333,
"learning_rate": 0.0003270617750584913,
"loss": 2.8184,
"step": 7050
},
{
"epoch": 0.3149911963287998,
"grad_norm": 0.16573506593704224,
"learning_rate": 0.0003259320738488119,
"loss": 2.823,
"step": 7100
},
{
"epoch": 0.3172094441902702,
"grad_norm": 0.17004618048667908,
"learning_rate": 0.00032479567560076745,
"loss": 2.8174,
"step": 7150
},
{
"epoch": 0.31942769205174065,
"grad_norm": 0.16867642104625702,
"learning_rate": 0.00032365264074775223,
"loss": 2.8183,
"step": 7200
},
{
"epoch": 0.32164593991321105,
"grad_norm": 0.16543437540531158,
"learning_rate": 0.00032250303007609366,
"loss": 2.8178,
"step": 7250
},
{
"epoch": 0.32386418777468146,
"grad_norm": 0.16606374084949493,
"learning_rate": 0.0003213469047218194,
"loss": 2.8182,
"step": 7300
},
{
"epoch": 0.3260824356361519,
"grad_norm": 0.1708928942680359,
"learning_rate": 0.0003201843261674067,
"loss": 2.8194,
"step": 7350
},
{
"epoch": 0.3283006834976223,
"grad_norm": 0.16661237180233002,
"learning_rate": 0.00031901535623851245,
"loss": 2.8226,
"step": 7400
},
{
"epoch": 0.3305189313590927,
"grad_norm": 0.16710756719112396,
"learning_rate": 0.0003178400571006852,
"loss": 2.8187,
"step": 7450
},
{
"epoch": 0.3327371792205632,
"grad_norm": 0.16679760813713074,
"learning_rate": 0.00031665849125605937,
"loss": 2.8163,
"step": 7500
},
{
"epoch": 0.3349554270820336,
"grad_norm": 0.16872857511043549,
"learning_rate": 0.00031547072154003154,
"loss": 2.8147,
"step": 7550
},
{
"epoch": 0.337173674943504,
"grad_norm": 0.1672954261302948,
"learning_rate": 0.0003142768111179187,
"loss": 2.8167,
"step": 7600
},
{
"epoch": 0.33939192280497443,
"grad_norm": 0.16654394567012787,
"learning_rate": 0.00031307682348159907,
"loss": 2.816,
"step": 7650
},
{
"epoch": 0.34161017066644483,
"grad_norm": 0.16810841858386993,
"learning_rate": 0.00031187082244613567,
"loss": 2.8139,
"step": 7700
},
{
"epoch": 0.34382841852791524,
"grad_norm": 0.1682497262954712,
"learning_rate": 0.00031065887214638284,
"loss": 2.8157,
"step": 7750
},
{
"epoch": 0.3460466663893857,
"grad_norm": 0.17154847085475922,
"learning_rate": 0.00030944103703357524,
"loss": 2.8143,
"step": 7800
},
{
"epoch": 0.3482649142508561,
"grad_norm": 0.16658836603164673,
"learning_rate": 0.00030821738187190075,
"loss": 2.8143,
"step": 7850
},
{
"epoch": 0.35048316211232655,
"grad_norm": 0.16820305585861206,
"learning_rate": 0.00030698797173505586,
"loss": 2.8157,
"step": 7900
},
{
"epoch": 0.35270140997379695,
"grad_norm": 0.16843385994434357,
"learning_rate": 0.0003057528720027853,
"loss": 2.8103,
"step": 7950
},
{
"epoch": 0.35491965783526735,
"grad_norm": 0.17145898938179016,
"learning_rate": 0.0003045121483574054,
"loss": 2.8161,
"step": 8000
},
{
"epoch": 0.3571379056967378,
"grad_norm": 0.1709701269865036,
"learning_rate": 0.00030326586678031066,
"loss": 2.8134,
"step": 8050
},
{
"epoch": 0.3593561535582082,
"grad_norm": 0.16859866678714752,
"learning_rate": 0.0003020140935484653,
"loss": 2.818,
"step": 8100
},
{
"epoch": 0.3615744014196786,
"grad_norm": 0.16738031804561615,
"learning_rate": 0.00030075689523087804,
"loss": 2.8128,
"step": 8150
},
{
"epoch": 0.36379264928114907,
"grad_norm": 0.1693500131368637,
"learning_rate": 0.00029949433868506293,
"loss": 2.8138,
"step": 8200
},
{
"epoch": 0.3660108971426195,
"grad_norm": 0.16915106773376465,
"learning_rate": 0.00029822649105348294,
"loss": 2.8209,
"step": 8250
},
{
"epoch": 0.3682291450040899,
"grad_norm": 0.17108069360256195,
"learning_rate": 0.00029695341975998006,
"loss": 2.8174,
"step": 8300
},
{
"epoch": 0.37044739286556033,
"grad_norm": 0.16659317910671234,
"learning_rate": 0.00029567519250618907,
"loss": 2.8153,
"step": 8350
},
{
"epoch": 0.37266564072703073,
"grad_norm": 0.16678906977176666,
"learning_rate": 0.0002943918772679379,
"loss": 2.8163,
"step": 8400
},
{
"epoch": 0.37488388858850114,
"grad_norm": 0.16928167641162872,
"learning_rate": 0.00029310354229163197,
"loss": 2.8165,
"step": 8450
},
{
"epoch": 0.3771021364499716,
"grad_norm": 0.1695391833782196,
"learning_rate": 0.0002918102560906254,
"loss": 2.8197,
"step": 8500
},
{
"epoch": 0.379320384311442,
"grad_norm": 0.17006346583366394,
"learning_rate": 0.0002905120874415772,
"loss": 2.8172,
"step": 8550
},
{
"epoch": 0.3815386321729124,
"grad_norm": 0.16821132600307465,
"learning_rate": 0.0002892091053807939,
"loss": 2.8137,
"step": 8600
},
{
"epoch": 0.38375688003438285,
"grad_norm": 0.17077401280403137,
"learning_rate": 0.000287901379200558,
"loss": 2.8174,
"step": 8650
},
{
"epoch": 0.38597512789585325,
"grad_norm": 0.17006562650203705,
"learning_rate": 0.0002865889784454435,
"loss": 2.813,
"step": 8700
},
{
"epoch": 0.3881933757573237,
"grad_norm": 0.16847462952136993,
"learning_rate": 0.0002852719729086167,
"loss": 2.8158,
"step": 8750
},
{
"epoch": 0.3904116236187941,
"grad_norm": 0.16790613532066345,
"learning_rate": 0.0002839504326281256,
"loss": 2.816,
"step": 8800
},
{
"epoch": 0.3926298714802645,
"grad_norm": 0.16898341476917267,
"learning_rate": 0.00028262442788317446,
"loss": 2.8143,
"step": 8850
},
{
"epoch": 0.39484811934173497,
"grad_norm": 0.17099575698375702,
"learning_rate": 0.00028129402919038695,
"loss": 2.812,
"step": 8900
},
{
"epoch": 0.3970663672032054,
"grad_norm": 0.17063932120800018,
"learning_rate": 0.00027995930730005577,
"loss": 2.815,
"step": 8950
},
{
"epoch": 0.3992846150646758,
"grad_norm": 0.1704034060239792,
"learning_rate": 0.00027862033319238025,
"loss": 2.8144,
"step": 9000
},
{
"epoch": 0.3992846150646758,
"eval_accuracy": 0.42786541279921836,
"eval_loss": 2.8759515285491943,
"eval_runtime": 250.6732,
"eval_samples_per_second": 7.979,
"eval_steps_per_second": 0.997,
"step": 9000
},
{
"epoch": 0.40150286292614623,
"grad_norm": 0.1675969511270523,
"learning_rate": 0.0002772771780736917,
"loss": 2.8128,
"step": 9050
},
{
"epoch": 0.40372111078761663,
"grad_norm": 0.1697956621646881,
"learning_rate": 0.0002759299133726665,
"loss": 2.8121,
"step": 9100
},
{
"epoch": 0.40593935864908703,
"grad_norm": 0.1710100620985031,
"learning_rate": 0.00027457861073652785,
"loss": 2.8156,
"step": 9150
},
{
"epoch": 0.4081576065105575,
"grad_norm": 0.16877809166908264,
"learning_rate": 0.00027322334202723527,
"loss": 2.815,
"step": 9200
},
{
"epoch": 0.4103758543720279,
"grad_norm": 0.17122440040111542,
"learning_rate": 0.0002718641793176631,
"loss": 2.8119,
"step": 9250
},
{
"epoch": 0.4125941022334983,
"grad_norm": 0.16771045327186584,
"learning_rate": 0.0002705011948877679,
"loss": 2.808,
"step": 9300
},
{
"epoch": 0.41481235009496875,
"grad_norm": 0.16941729187965393,
"learning_rate": 0.0002691344612207442,
"loss": 2.8121,
"step": 9350
},
{
"epoch": 0.41703059795643915,
"grad_norm": 0.1719992607831955,
"learning_rate": 0.00026776405099917014,
"loss": 2.8094,
"step": 9400
},
{
"epoch": 0.41924884581790955,
"grad_norm": 0.1693243533372879,
"learning_rate": 0.00026639003710114223,
"loss": 2.8103,
"step": 9450
},
{
"epoch": 0.42146709367938,
"grad_norm": 0.17014500498771667,
"learning_rate": 0.0002650124925963998,
"loss": 2.8129,
"step": 9500
},
{
"epoch": 0.4236853415408504,
"grad_norm": 0.1709510087966919,
"learning_rate": 0.00026363149074243867,
"loss": 2.8084,
"step": 9550
},
{
"epoch": 0.4259035894023208,
"grad_norm": 0.16937118768692017,
"learning_rate": 0.0002622471049806159,
"loss": 2.814,
"step": 9600
},
{
"epoch": 0.42812183726379127,
"grad_norm": 0.1713036149740219,
"learning_rate": 0.00026085940893224403,
"loss": 2.8162,
"step": 9650
},
{
"epoch": 0.4303400851252617,
"grad_norm": 0.17020347714424133,
"learning_rate": 0.0002594684763946758,
"loss": 2.8116,
"step": 9700
},
{
"epoch": 0.43255833298673213,
"grad_norm": 0.16786696016788483,
"learning_rate": 0.0002580743813373796,
"loss": 2.8111,
"step": 9750
},
{
"epoch": 0.43477658084820253,
"grad_norm": 0.17273075878620148,
"learning_rate": 0.00025667719789800606,
"loss": 2.8131,
"step": 9800
},
{
"epoch": 0.43699482870967293,
"grad_norm": 0.16986466944217682,
"learning_rate": 0.00025527700037844515,
"loss": 2.8139,
"step": 9850
},
{
"epoch": 0.4392130765711434,
"grad_norm": 0.17129731178283691,
"learning_rate": 0.00025387386324087494,
"loss": 2.8125,
"step": 9900
},
{
"epoch": 0.4414313244326138,
"grad_norm": 0.16890868544578552,
"learning_rate": 0.00025246786110380163,
"loss": 2.8142,
"step": 9950
},
{
"epoch": 0.4436495722940842,
"grad_norm": 0.17167522013187408,
"learning_rate": 0.00025105906873809154,
"loss": 2.8142,
"step": 10000
},
{
"epoch": 0.44586782015555465,
"grad_norm": 0.17136669158935547,
"learning_rate": 0.0002496475610629947,
"loss": 2.8112,
"step": 10050
},
{
"epoch": 0.44808606801702505,
"grad_norm": 0.16926760971546173,
"learning_rate": 0.00024823341314216056,
"loss": 2.8156,
"step": 10100
},
{
"epoch": 0.45030431587849545,
"grad_norm": 0.16898435354232788,
"learning_rate": 0.00024681670017964627,
"loss": 2.8079,
"step": 10150
},
{
"epoch": 0.4525225637399659,
"grad_norm": 0.17237040400505066,
"learning_rate": 0.0002453974975159173,
"loss": 2.813,
"step": 10200
},
{
"epoch": 0.4547408116014363,
"grad_norm": 0.16995486617088318,
"learning_rate": 0.00024397588062384095,
"loss": 2.8117,
"step": 10250
},
{
"epoch": 0.4569590594629067,
"grad_norm": 0.17290563881397247,
"learning_rate": 0.00024255192510467245,
"loss": 2.8121,
"step": 10300
},
{
"epoch": 0.45917730732437717,
"grad_norm": 0.17059782147407532,
"learning_rate": 0.00024112570668403472,
"loss": 2.8138,
"step": 10350
},
{
"epoch": 0.4613955551858476,
"grad_norm": 0.17196382582187653,
"learning_rate": 0.00023969730120789132,
"loss": 2.8095,
"step": 10400
},
{
"epoch": 0.463613803047318,
"grad_norm": 0.16942380368709564,
"learning_rate": 0.00023826678463851285,
"loss": 2.8124,
"step": 10450
},
{
"epoch": 0.46583205090878843,
"grad_norm": 0.17288681864738464,
"learning_rate": 0.00023683423305043749,
"loss": 2.813,
"step": 10500
},
{
"epoch": 0.46805029877025883,
"grad_norm": 0.17040428519248962,
"learning_rate": 0.00023539972262642502,
"loss": 2.8141,
"step": 10550
},
{
"epoch": 0.4702685466317293,
"grad_norm": 0.17321184277534485,
"learning_rate": 0.00023396332965340585,
"loss": 2.8146,
"step": 10600
},
{
"epoch": 0.4724867944931997,
"grad_norm": 0.17026926577091217,
"learning_rate": 0.00023252513051842373,
"loss": 2.8086,
"step": 10650
},
{
"epoch": 0.4747050423546701,
"grad_norm": 0.1710352748632431,
"learning_rate": 0.00023108520170457398,
"loss": 2.8099,
"step": 10700
},
{
"epoch": 0.47692329021614055,
"grad_norm": 0.17067080736160278,
"learning_rate": 0.00022964361978693542,
"loss": 2.8099,
"step": 10750
},
{
"epoch": 0.47914153807761095,
"grad_norm": 0.17244164645671844,
"learning_rate": 0.0002282004614284989,
"loss": 2.8054,
"step": 10800
}
],
"logging_steps": 50,
"max_steps": 22540,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.5864483085358727e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}