{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9961671837926627,
"eval_steps": 500,
"global_step": 2052,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014601204599379448,
"grad_norm": 31.200588193935076,
"learning_rate": 5e-06,
"loss": 1.0665,
"step": 10
},
{
"epoch": 0.029202409198758897,
"grad_norm": 2.1030174817289726,
"learning_rate": 5e-06,
"loss": 0.9851,
"step": 20
},
{
"epoch": 0.043803613798138345,
"grad_norm": 1.7889745137305164,
"learning_rate": 5e-06,
"loss": 0.9413,
"step": 30
},
{
"epoch": 0.058404818397517794,
"grad_norm": 0.7798640642359425,
"learning_rate": 5e-06,
"loss": 0.9234,
"step": 40
},
{
"epoch": 0.07300602299689725,
"grad_norm": 2.7003950764835163,
"learning_rate": 5e-06,
"loss": 0.9077,
"step": 50
},
{
"epoch": 0.08760722759627669,
"grad_norm": 6.611494407685939,
"learning_rate": 5e-06,
"loss": 0.9045,
"step": 60
},
{
"epoch": 0.10220843219565615,
"grad_norm": 1.081246334652253,
"learning_rate": 5e-06,
"loss": 0.8965,
"step": 70
},
{
"epoch": 0.11680963679503559,
"grad_norm": 0.8101716177472186,
"learning_rate": 5e-06,
"loss": 0.8855,
"step": 80
},
{
"epoch": 0.13141084139441503,
"grad_norm": 0.8801376249396373,
"learning_rate": 5e-06,
"loss": 0.8827,
"step": 90
},
{
"epoch": 0.1460120459937945,
"grad_norm": 0.6385442821205225,
"learning_rate": 5e-06,
"loss": 0.8714,
"step": 100
},
{
"epoch": 0.16061325059317394,
"grad_norm": 0.6872556086848158,
"learning_rate": 5e-06,
"loss": 0.8735,
"step": 110
},
{
"epoch": 0.17521445519255338,
"grad_norm": 0.6992499417358933,
"learning_rate": 5e-06,
"loss": 0.8682,
"step": 120
},
{
"epoch": 0.18981565979193282,
"grad_norm": 0.6013577080845172,
"learning_rate": 5e-06,
"loss": 0.8662,
"step": 130
},
{
"epoch": 0.2044168643913123,
"grad_norm": 0.5988600270173734,
"learning_rate": 5e-06,
"loss": 0.8649,
"step": 140
},
{
"epoch": 0.21901806899069173,
"grad_norm": 0.6460225651419232,
"learning_rate": 5e-06,
"loss": 0.8647,
"step": 150
},
{
"epoch": 0.23361927359007117,
"grad_norm": 0.5603551860244527,
"learning_rate": 5e-06,
"loss": 0.8533,
"step": 160
},
{
"epoch": 0.24822047818945062,
"grad_norm": 0.5553710866489995,
"learning_rate": 5e-06,
"loss": 0.8544,
"step": 170
},
{
"epoch": 0.26282168278883006,
"grad_norm": 0.6554387689230537,
"learning_rate": 5e-06,
"loss": 0.8592,
"step": 180
},
{
"epoch": 0.2774228873882095,
"grad_norm": 0.6786037864304745,
"learning_rate": 5e-06,
"loss": 0.8518,
"step": 190
},
{
"epoch": 0.292024091987589,
"grad_norm": 0.5561181889312125,
"learning_rate": 5e-06,
"loss": 0.8505,
"step": 200
},
{
"epoch": 0.3066252965869684,
"grad_norm": 0.7917259340608006,
"learning_rate": 5e-06,
"loss": 0.8475,
"step": 210
},
{
"epoch": 0.3212265011863479,
"grad_norm": 0.5838972916992158,
"learning_rate": 5e-06,
"loss": 0.8468,
"step": 220
},
{
"epoch": 0.33582770578572735,
"grad_norm": 0.5415756077794452,
"learning_rate": 5e-06,
"loss": 0.8478,
"step": 230
},
{
"epoch": 0.35042891038510676,
"grad_norm": 0.561460816685303,
"learning_rate": 5e-06,
"loss": 0.8471,
"step": 240
},
{
"epoch": 0.36503011498448623,
"grad_norm": 0.5431016015146285,
"learning_rate": 5e-06,
"loss": 0.845,
"step": 250
},
{
"epoch": 0.37963131958386565,
"grad_norm": 0.5777928639036234,
"learning_rate": 5e-06,
"loss": 0.8427,
"step": 260
},
{
"epoch": 0.3942325241832451,
"grad_norm": 0.6855206088461627,
"learning_rate": 5e-06,
"loss": 0.8406,
"step": 270
},
{
"epoch": 0.4088337287826246,
"grad_norm": 0.5959537867792327,
"learning_rate": 5e-06,
"loss": 0.843,
"step": 280
},
{
"epoch": 0.423434933382004,
"grad_norm": 0.5787987185587301,
"learning_rate": 5e-06,
"loss": 0.842,
"step": 290
},
{
"epoch": 0.43803613798138347,
"grad_norm": 0.9829016985861171,
"learning_rate": 5e-06,
"loss": 0.841,
"step": 300
},
{
"epoch": 0.45263734258076294,
"grad_norm": 0.7109664833342627,
"learning_rate": 5e-06,
"loss": 0.8376,
"step": 310
},
{
"epoch": 0.46723854718014235,
"grad_norm": 0.5953929615896101,
"learning_rate": 5e-06,
"loss": 0.8352,
"step": 320
},
{
"epoch": 0.4818397517795218,
"grad_norm": 0.6459745420821242,
"learning_rate": 5e-06,
"loss": 0.8322,
"step": 330
},
{
"epoch": 0.49644095637890123,
"grad_norm": 0.7286780710714444,
"learning_rate": 5e-06,
"loss": 0.833,
"step": 340
},
{
"epoch": 0.5110421609782807,
"grad_norm": 0.8624769767543123,
"learning_rate": 5e-06,
"loss": 0.8326,
"step": 350
},
{
"epoch": 0.5256433655776601,
"grad_norm": 0.7489286697832975,
"learning_rate": 5e-06,
"loss": 0.8344,
"step": 360
},
{
"epoch": 0.5402445701770396,
"grad_norm": 0.6965027768353624,
"learning_rate": 5e-06,
"loss": 0.8329,
"step": 370
},
{
"epoch": 0.554845774776419,
"grad_norm": 0.5898605613508874,
"learning_rate": 5e-06,
"loss": 0.8396,
"step": 380
},
{
"epoch": 0.5694469793757985,
"grad_norm": 0.669429389652064,
"learning_rate": 5e-06,
"loss": 0.8318,
"step": 390
},
{
"epoch": 0.584048183975178,
"grad_norm": 0.6580798841963941,
"learning_rate": 5e-06,
"loss": 0.8314,
"step": 400
},
{
"epoch": 0.5986493885745574,
"grad_norm": 0.624128356604639,
"learning_rate": 5e-06,
"loss": 0.8282,
"step": 410
},
{
"epoch": 0.6132505931739368,
"grad_norm": 0.5755522646670556,
"learning_rate": 5e-06,
"loss": 0.8321,
"step": 420
},
{
"epoch": 0.6278517977733163,
"grad_norm": 0.8196980265974857,
"learning_rate": 5e-06,
"loss": 0.8313,
"step": 430
},
{
"epoch": 0.6424530023726958,
"grad_norm": 0.48261475886925087,
"learning_rate": 5e-06,
"loss": 0.8238,
"step": 440
},
{
"epoch": 0.6570542069720752,
"grad_norm": 0.5343014097762563,
"learning_rate": 5e-06,
"loss": 0.8296,
"step": 450
},
{
"epoch": 0.6716554115714547,
"grad_norm": 0.8585815714707374,
"learning_rate": 5e-06,
"loss": 0.823,
"step": 460
},
{
"epoch": 0.6862566161708341,
"grad_norm": 0.7315620836524508,
"learning_rate": 5e-06,
"loss": 0.8331,
"step": 470
},
{
"epoch": 0.7008578207702135,
"grad_norm": 0.4711661790189355,
"learning_rate": 5e-06,
"loss": 0.8245,
"step": 480
},
{
"epoch": 0.7154590253695929,
"grad_norm": 0.546263482109446,
"learning_rate": 5e-06,
"loss": 0.8212,
"step": 490
},
{
"epoch": 0.7300602299689725,
"grad_norm": 0.5757304431326317,
"learning_rate": 5e-06,
"loss": 0.8252,
"step": 500
},
{
"epoch": 0.7446614345683519,
"grad_norm": 0.5563752904399338,
"learning_rate": 5e-06,
"loss": 0.8251,
"step": 510
},
{
"epoch": 0.7592626391677313,
"grad_norm": 0.48890029763799747,
"learning_rate": 5e-06,
"loss": 0.8244,
"step": 520
},
{
"epoch": 0.7738638437671108,
"grad_norm": 0.6121148728397559,
"learning_rate": 5e-06,
"loss": 0.8219,
"step": 530
},
{
"epoch": 0.7884650483664902,
"grad_norm": 0.651565586948898,
"learning_rate": 5e-06,
"loss": 0.8203,
"step": 540
},
{
"epoch": 0.8030662529658696,
"grad_norm": 0.5365587518038645,
"learning_rate": 5e-06,
"loss": 0.8244,
"step": 550
},
{
"epoch": 0.8176674575652492,
"grad_norm": 0.5585874614674294,
"learning_rate": 5e-06,
"loss": 0.8261,
"step": 560
},
{
"epoch": 0.8322686621646286,
"grad_norm": 0.48225482309598716,
"learning_rate": 5e-06,
"loss": 0.828,
"step": 570
},
{
"epoch": 0.846869866764008,
"grad_norm": 0.6379018399000604,
"learning_rate": 5e-06,
"loss": 0.8187,
"step": 580
},
{
"epoch": 0.8614710713633875,
"grad_norm": 0.8248757003628987,
"learning_rate": 5e-06,
"loss": 0.8245,
"step": 590
},
{
"epoch": 0.8760722759627669,
"grad_norm": 0.7072642911500023,
"learning_rate": 5e-06,
"loss": 0.8199,
"step": 600
},
{
"epoch": 0.8906734805621463,
"grad_norm": 0.6066965111128374,
"learning_rate": 5e-06,
"loss": 0.821,
"step": 610
},
{
"epoch": 0.9052746851615259,
"grad_norm": 0.49608072224953953,
"learning_rate": 5e-06,
"loss": 0.8263,
"step": 620
},
{
"epoch": 0.9198758897609053,
"grad_norm": 0.6053461220096085,
"learning_rate": 5e-06,
"loss": 0.8225,
"step": 630
},
{
"epoch": 0.9344770943602847,
"grad_norm": 0.5575666035835788,
"learning_rate": 5e-06,
"loss": 0.8211,
"step": 640
},
{
"epoch": 0.9490782989596642,
"grad_norm": 0.5170427420902555,
"learning_rate": 5e-06,
"loss": 0.8202,
"step": 650
},
{
"epoch": 0.9636795035590436,
"grad_norm": 0.5652214016440857,
"learning_rate": 5e-06,
"loss": 0.8219,
"step": 660
},
{
"epoch": 0.978280708158423,
"grad_norm": 0.5065476265832586,
"learning_rate": 5e-06,
"loss": 0.8121,
"step": 670
},
{
"epoch": 0.9928819127578025,
"grad_norm": 0.5713479763199619,
"learning_rate": 5e-06,
"loss": 0.8154,
"step": 680
},
{
"epoch": 0.9987223945975543,
"eval_loss": 0.8185040950775146,
"eval_runtime": 729.2812,
"eval_samples_per_second": 25.306,
"eval_steps_per_second": 0.396,
"step": 684
},
{
"epoch": 1.0074831173571819,
"grad_norm": 0.5659100587324225,
"learning_rate": 5e-06,
"loss": 0.8068,
"step": 690
},
{
"epoch": 1.0220843219565614,
"grad_norm": 0.5725390345160268,
"learning_rate": 5e-06,
"loss": 0.7796,
"step": 700
},
{
"epoch": 1.036685526555941,
"grad_norm": 0.5067331567131128,
"learning_rate": 5e-06,
"loss": 0.7784,
"step": 710
},
{
"epoch": 1.0512867311553202,
"grad_norm": 0.5633875492368658,
"learning_rate": 5e-06,
"loss": 0.7789,
"step": 720
},
{
"epoch": 1.0658879357546998,
"grad_norm": 0.6503391798526155,
"learning_rate": 5e-06,
"loss": 0.7785,
"step": 730
},
{
"epoch": 1.0804891403540793,
"grad_norm": 0.6157238494098765,
"learning_rate": 5e-06,
"loss": 0.7788,
"step": 740
},
{
"epoch": 1.0950903449534586,
"grad_norm": 0.6692246242398756,
"learning_rate": 5e-06,
"loss": 0.7791,
"step": 750
},
{
"epoch": 1.109691549552838,
"grad_norm": 0.772994893504376,
"learning_rate": 5e-06,
"loss": 0.7772,
"step": 760
},
{
"epoch": 1.1242927541522176,
"grad_norm": 0.5654007018077802,
"learning_rate": 5e-06,
"loss": 0.778,
"step": 770
},
{
"epoch": 1.138893958751597,
"grad_norm": 0.5871850769943243,
"learning_rate": 5e-06,
"loss": 0.7797,
"step": 780
},
{
"epoch": 1.1534951633509765,
"grad_norm": 0.6081431556291285,
"learning_rate": 5e-06,
"loss": 0.7776,
"step": 790
},
{
"epoch": 1.168096367950356,
"grad_norm": 0.5943447291419969,
"learning_rate": 5e-06,
"loss": 0.7812,
"step": 800
},
{
"epoch": 1.1826975725497353,
"grad_norm": 0.5174382592861106,
"learning_rate": 5e-06,
"loss": 0.7742,
"step": 810
},
{
"epoch": 1.1972987771491148,
"grad_norm": 0.5335467085784507,
"learning_rate": 5e-06,
"loss": 0.7821,
"step": 820
},
{
"epoch": 1.2118999817484943,
"grad_norm": 0.5424184832410203,
"learning_rate": 5e-06,
"loss": 0.7832,
"step": 830
},
{
"epoch": 1.2265011863478736,
"grad_norm": 0.5401853269685924,
"learning_rate": 5e-06,
"loss": 0.7764,
"step": 840
},
{
"epoch": 1.2411023909472532,
"grad_norm": 0.5532297607385643,
"learning_rate": 5e-06,
"loss": 0.776,
"step": 850
},
{
"epoch": 1.2557035955466325,
"grad_norm": 0.4600563956098031,
"learning_rate": 5e-06,
"loss": 0.7746,
"step": 860
},
{
"epoch": 1.270304800146012,
"grad_norm": 0.5135474289282321,
"learning_rate": 5e-06,
"loss": 0.7725,
"step": 870
},
{
"epoch": 1.2849060047453915,
"grad_norm": 0.6354802982105713,
"learning_rate": 5e-06,
"loss": 0.7787,
"step": 880
},
{
"epoch": 1.299507209344771,
"grad_norm": 0.5869839476501474,
"learning_rate": 5e-06,
"loss": 0.7712,
"step": 890
},
{
"epoch": 1.3141084139441503,
"grad_norm": 0.49495760536344496,
"learning_rate": 5e-06,
"loss": 0.777,
"step": 900
},
{
"epoch": 1.3287096185435299,
"grad_norm": 0.5322628773610525,
"learning_rate": 5e-06,
"loss": 0.7791,
"step": 910
},
{
"epoch": 1.3433108231429092,
"grad_norm": 0.6394355119269733,
"learning_rate": 5e-06,
"loss": 0.7813,
"step": 920
},
{
"epoch": 1.3579120277422887,
"grad_norm": 0.6150475948115007,
"learning_rate": 5e-06,
"loss": 0.7718,
"step": 930
},
{
"epoch": 1.3725132323416682,
"grad_norm": 0.6284466998832495,
"learning_rate": 5e-06,
"loss": 0.7716,
"step": 940
},
{
"epoch": 1.3871144369410477,
"grad_norm": 0.4995594773156744,
"learning_rate": 5e-06,
"loss": 0.7801,
"step": 950
},
{
"epoch": 1.401715641540427,
"grad_norm": 0.5533231758658743,
"learning_rate": 5e-06,
"loss": 0.7749,
"step": 960
},
{
"epoch": 1.4163168461398066,
"grad_norm": 0.5566318311264558,
"learning_rate": 5e-06,
"loss": 0.7809,
"step": 970
},
{
"epoch": 1.4309180507391859,
"grad_norm": 0.5996092713965696,
"learning_rate": 5e-06,
"loss": 0.7769,
"step": 980
},
{
"epoch": 1.4455192553385654,
"grad_norm": 0.4923370749506076,
"learning_rate": 5e-06,
"loss": 0.7733,
"step": 990
},
{
"epoch": 1.460120459937945,
"grad_norm": 0.5718051545730899,
"learning_rate": 5e-06,
"loss": 0.778,
"step": 1000
},
{
"epoch": 1.4747216645373245,
"grad_norm": 0.4966605100244046,
"learning_rate": 5e-06,
"loss": 0.7755,
"step": 1010
},
{
"epoch": 1.4893228691367038,
"grad_norm": 0.5104108866561695,
"learning_rate": 5e-06,
"loss": 0.7762,
"step": 1020
},
{
"epoch": 1.5039240737360833,
"grad_norm": 0.5790841364965528,
"learning_rate": 5e-06,
"loss": 0.775,
"step": 1030
},
{
"epoch": 1.5185252783354626,
"grad_norm": 0.5079205962955746,
"learning_rate": 5e-06,
"loss": 0.7791,
"step": 1040
},
{
"epoch": 1.533126482934842,
"grad_norm": 0.4897829483446737,
"learning_rate": 5e-06,
"loss": 0.7732,
"step": 1050
},
{
"epoch": 1.5477276875342216,
"grad_norm": 0.5375326427308407,
"learning_rate": 5e-06,
"loss": 0.7734,
"step": 1060
},
{
"epoch": 1.5623288921336012,
"grad_norm": 0.4714533263773857,
"learning_rate": 5e-06,
"loss": 0.7786,
"step": 1070
},
{
"epoch": 1.5769300967329805,
"grad_norm": 0.5170403858384673,
"learning_rate": 5e-06,
"loss": 0.772,
"step": 1080
},
{
"epoch": 1.5915313013323598,
"grad_norm": 0.5584745095875884,
"learning_rate": 5e-06,
"loss": 0.7788,
"step": 1090
},
{
"epoch": 1.6061325059317393,
"grad_norm": 0.5632792125524021,
"learning_rate": 5e-06,
"loss": 0.7764,
"step": 1100
},
{
"epoch": 1.6207337105311188,
"grad_norm": 0.5303585273369582,
"learning_rate": 5e-06,
"loss": 0.7698,
"step": 1110
},
{
"epoch": 1.6353349151304983,
"grad_norm": 0.5292556194617752,
"learning_rate": 5e-06,
"loss": 0.7754,
"step": 1120
},
{
"epoch": 1.6499361197298779,
"grad_norm": 0.5319736770394399,
"learning_rate": 5e-06,
"loss": 0.7754,
"step": 1130
},
{
"epoch": 1.6645373243292572,
"grad_norm": 0.5409862397072692,
"learning_rate": 5e-06,
"loss": 0.7732,
"step": 1140
},
{
"epoch": 1.6791385289286365,
"grad_norm": 0.5347398767131228,
"learning_rate": 5e-06,
"loss": 0.775,
"step": 1150
},
{
"epoch": 1.693739733528016,
"grad_norm": 0.5887598823053857,
"learning_rate": 5e-06,
"loss": 0.7734,
"step": 1160
},
{
"epoch": 1.7083409381273955,
"grad_norm": 0.588980481311897,
"learning_rate": 5e-06,
"loss": 0.7776,
"step": 1170
},
{
"epoch": 1.722942142726775,
"grad_norm": 0.5476017973657227,
"learning_rate": 5e-06,
"loss": 0.7718,
"step": 1180
},
{
"epoch": 1.7375433473261546,
"grad_norm": 0.5548638443373327,
"learning_rate": 5e-06,
"loss": 0.778,
"step": 1190
},
{
"epoch": 1.7521445519255339,
"grad_norm": 0.5443995408512653,
"learning_rate": 5e-06,
"loss": 0.7731,
"step": 1200
},
{
"epoch": 1.7667457565249132,
"grad_norm": 0.5134399032378028,
"learning_rate": 5e-06,
"loss": 0.7762,
"step": 1210
},
{
"epoch": 1.7813469611242927,
"grad_norm": 0.5143443520325698,
"learning_rate": 5e-06,
"loss": 0.7737,
"step": 1220
},
{
"epoch": 1.7959481657236722,
"grad_norm": 0.5712512301925389,
"learning_rate": 5e-06,
"loss": 0.7752,
"step": 1230
},
{
"epoch": 1.8105493703230517,
"grad_norm": 0.5022436155237929,
"learning_rate": 5e-06,
"loss": 0.7746,
"step": 1240
},
{
"epoch": 1.825150574922431,
"grad_norm": 0.5183607046169039,
"learning_rate": 5e-06,
"loss": 0.7758,
"step": 1250
},
{
"epoch": 1.8397517795218106,
"grad_norm": 0.5327048894936923,
"learning_rate": 5e-06,
"loss": 0.7737,
"step": 1260
},
{
"epoch": 1.8543529841211899,
"grad_norm": 0.4828373941208032,
"learning_rate": 5e-06,
"loss": 0.7734,
"step": 1270
},
{
"epoch": 1.8689541887205694,
"grad_norm": 0.4692396568766125,
"learning_rate": 5e-06,
"loss": 0.7702,
"step": 1280
},
{
"epoch": 1.883555393319949,
"grad_norm": 0.5272353821412613,
"learning_rate": 5e-06,
"loss": 0.7738,
"step": 1290
},
{
"epoch": 1.8981565979193284,
"grad_norm": 0.5974378803453756,
"learning_rate": 5e-06,
"loss": 0.7744,
"step": 1300
},
{
"epoch": 1.9127578025187078,
"grad_norm": 0.45897414900404526,
"learning_rate": 5e-06,
"loss": 0.7707,
"step": 1310
},
{
"epoch": 1.9273590071180873,
"grad_norm": 0.5661797189222842,
"learning_rate": 5e-06,
"loss": 0.7729,
"step": 1320
},
{
"epoch": 1.9419602117174666,
"grad_norm": 0.5291159788169262,
"learning_rate": 5e-06,
"loss": 0.7704,
"step": 1330
},
{
"epoch": 1.956561416316846,
"grad_norm": 0.5803039839795054,
"learning_rate": 5e-06,
"loss": 0.7711,
"step": 1340
},
{
"epoch": 1.9711626209162256,
"grad_norm": 0.5668430658536179,
"learning_rate": 5e-06,
"loss": 0.7714,
"step": 1350
},
{
"epoch": 1.9857638255156052,
"grad_norm": 0.5292288364377395,
"learning_rate": 5e-06,
"loss": 0.7681,
"step": 1360
},
{
"epoch": 1.9989049096550464,
"eval_loss": 0.8045554161071777,
"eval_runtime": 732.5889,
"eval_samples_per_second": 25.191,
"eval_steps_per_second": 0.394,
"step": 1369
},
{
"epoch": 2.0003650301149847,
"grad_norm": 0.5917011918049785,
"learning_rate": 5e-06,
"loss": 0.7813,
"step": 1370
},
{
"epoch": 2.0149662347143638,
"grad_norm": 0.6437888983866474,
"learning_rate": 5e-06,
"loss": 0.7275,
"step": 1380
},
{
"epoch": 2.0295674393137433,
"grad_norm": 0.5993159674827806,
"learning_rate": 5e-06,
"loss": 0.7301,
"step": 1390
},
{
"epoch": 2.044168643913123,
"grad_norm": 0.6004688564094799,
"learning_rate": 5e-06,
"loss": 0.7279,
"step": 1400
},
{
"epoch": 2.0587698485125023,
"grad_norm": 0.5968765010977406,
"learning_rate": 5e-06,
"loss": 0.732,
"step": 1410
},
{
"epoch": 2.073371053111882,
"grad_norm": 0.6368286520923802,
"learning_rate": 5e-06,
"loss": 0.7327,
"step": 1420
},
{
"epoch": 2.0879722577112614,
"grad_norm": 0.5121224799191383,
"learning_rate": 5e-06,
"loss": 0.7289,
"step": 1430
},
{
"epoch": 2.1025734623106405,
"grad_norm": 0.5426488835411897,
"learning_rate": 5e-06,
"loss": 0.729,
"step": 1440
},
{
"epoch": 2.11717466691002,
"grad_norm": 0.5360711433495,
"learning_rate": 5e-06,
"loss": 0.7321,
"step": 1450
},
{
"epoch": 2.1317758715093995,
"grad_norm": 0.570345866307846,
"learning_rate": 5e-06,
"loss": 0.7283,
"step": 1460
},
{
"epoch": 2.146377076108779,
"grad_norm": 0.5646482286111343,
"learning_rate": 5e-06,
"loss": 0.7341,
"step": 1470
},
{
"epoch": 2.1609782807081586,
"grad_norm": 0.6170916412089019,
"learning_rate": 5e-06,
"loss": 0.7273,
"step": 1480
},
{
"epoch": 2.175579485307538,
"grad_norm": 0.5669820051659463,
"learning_rate": 5e-06,
"loss": 0.7327,
"step": 1490
},
{
"epoch": 2.190180689906917,
"grad_norm": 0.5032603903192315,
"learning_rate": 5e-06,
"loss": 0.7259,
"step": 1500
},
{
"epoch": 2.2047818945062967,
"grad_norm": 0.536250519862031,
"learning_rate": 5e-06,
"loss": 0.7309,
"step": 1510
},
{
"epoch": 2.219383099105676,
"grad_norm": 0.4989195000116721,
"learning_rate": 5e-06,
"loss": 0.7351,
"step": 1520
},
{
"epoch": 2.2339843037050557,
"grad_norm": 0.48084465295135953,
"learning_rate": 5e-06,
"loss": 0.7319,
"step": 1530
},
{
"epoch": 2.2485855083044353,
"grad_norm": 0.6041492880249871,
"learning_rate": 5e-06,
"loss": 0.7331,
"step": 1540
},
{
"epoch": 2.2631867129038143,
"grad_norm": 0.48501706414438883,
"learning_rate": 5e-06,
"loss": 0.7364,
"step": 1550
},
{
"epoch": 2.277787917503194,
"grad_norm": 0.486621431249399,
"learning_rate": 5e-06,
"loss": 0.7336,
"step": 1560
},
{
"epoch": 2.2923891221025734,
"grad_norm": 0.6249224122437264,
"learning_rate": 5e-06,
"loss": 0.7335,
"step": 1570
},
{
"epoch": 2.306990326701953,
"grad_norm": 0.6255968466832591,
"learning_rate": 5e-06,
"loss": 0.7315,
"step": 1580
},
{
"epoch": 2.3215915313013324,
"grad_norm": 0.5271636183137467,
"learning_rate": 5e-06,
"loss": 0.7322,
"step": 1590
},
{
"epoch": 2.336192735900712,
"grad_norm": 0.5924207573496902,
"learning_rate": 5e-06,
"loss": 0.7345,
"step": 1600
},
{
"epoch": 2.3507939405000915,
"grad_norm": 0.5587622345452513,
"learning_rate": 5e-06,
"loss": 0.7344,
"step": 1610
},
{
"epoch": 2.3653951450994706,
"grad_norm": 0.6269568570302153,
"learning_rate": 5e-06,
"loss": 0.7359,
"step": 1620
},
{
"epoch": 2.37999634969885,
"grad_norm": 0.6533009549527986,
"learning_rate": 5e-06,
"loss": 0.7368,
"step": 1630
},
{
"epoch": 2.3945975542982296,
"grad_norm": 0.5195452792704209,
"learning_rate": 5e-06,
"loss": 0.7355,
"step": 1640
},
{
"epoch": 2.409198758897609,
"grad_norm": 0.5528537289554203,
"learning_rate": 5e-06,
"loss": 0.7355,
"step": 1650
},
{
"epoch": 2.4237999634969887,
"grad_norm": 0.5434124551027695,
"learning_rate": 5e-06,
"loss": 0.7343,
"step": 1660
},
{
"epoch": 2.4384011680963678,
"grad_norm": 0.5166289013156083,
"learning_rate": 5e-06,
"loss": 0.7348,
"step": 1670
},
{
"epoch": 2.4530023726957473,
"grad_norm": 0.5085481998475664,
"learning_rate": 5e-06,
"loss": 0.7384,
"step": 1680
},
{
"epoch": 2.467603577295127,
"grad_norm": 0.47895714940004425,
"learning_rate": 5e-06,
"loss": 0.7319,
"step": 1690
},
{
"epoch": 2.4822047818945063,
"grad_norm": 0.6790087073589551,
"learning_rate": 5e-06,
"loss": 0.7344,
"step": 1700
},
{
"epoch": 2.496805986493886,
"grad_norm": 0.5337750367727961,
"learning_rate": 5e-06,
"loss": 0.7348,
"step": 1710
},
{
"epoch": 2.511407191093265,
"grad_norm": 0.578024943637955,
"learning_rate": 5e-06,
"loss": 0.7335,
"step": 1720
},
{
"epoch": 2.526008395692645,
"grad_norm": 0.5321237652992599,
"learning_rate": 5e-06,
"loss": 0.7341,
"step": 1730
},
{
"epoch": 2.540609600292024,
"grad_norm": 0.552250362014024,
"learning_rate": 5e-06,
"loss": 0.7345,
"step": 1740
},
{
"epoch": 2.5552108048914035,
"grad_norm": 0.47716769494866995,
"learning_rate": 5e-06,
"loss": 0.7364,
"step": 1750
},
{
"epoch": 2.569812009490783,
"grad_norm": 0.5382810891628738,
"learning_rate": 5e-06,
"loss": 0.7327,
"step": 1760
},
{
"epoch": 2.5844132140901626,
"grad_norm": 0.5291652309846363,
"learning_rate": 5e-06,
"loss": 0.734,
"step": 1770
},
{
"epoch": 2.599014418689542,
"grad_norm": 0.583404809051004,
"learning_rate": 5e-06,
"loss": 0.7309,
"step": 1780
},
{
"epoch": 2.613615623288921,
"grad_norm": 0.607832211058636,
"learning_rate": 5e-06,
"loss": 0.7352,
"step": 1790
},
{
"epoch": 2.6282168278883007,
"grad_norm": 0.5598928601891838,
"learning_rate": 5e-06,
"loss": 0.7356,
"step": 1800
},
{
"epoch": 2.64281803248768,
"grad_norm": 0.5949750898099526,
"learning_rate": 5e-06,
"loss": 0.7353,
"step": 1810
},
{
"epoch": 2.6574192370870597,
"grad_norm": 0.6066820856022053,
"learning_rate": 5e-06,
"loss": 0.7347,
"step": 1820
},
{
"epoch": 2.6720204416864393,
"grad_norm": 0.5198033507111653,
"learning_rate": 5e-06,
"loss": 0.7354,
"step": 1830
},
{
"epoch": 2.6866216462858183,
"grad_norm": 0.5040452692645214,
"learning_rate": 5e-06,
"loss": 0.7391,
"step": 1840
},
{
"epoch": 2.701222850885198,
"grad_norm": 0.5156854247429866,
"learning_rate": 5e-06,
"loss": 0.7388,
"step": 1850
},
{
"epoch": 2.7158240554845774,
"grad_norm": 0.5617334329140413,
"learning_rate": 5e-06,
"loss": 0.737,
"step": 1860
},
{
"epoch": 2.730425260083957,
"grad_norm": 0.4970472716656489,
"learning_rate": 5e-06,
"loss": 0.7359,
"step": 1870
},
{
"epoch": 2.7450264646833364,
"grad_norm": 0.6666729572656519,
"learning_rate": 5e-06,
"loss": 0.7349,
"step": 1880
},
{
"epoch": 2.759627669282716,
"grad_norm": 0.660456603270783,
"learning_rate": 5e-06,
"loss": 0.7363,
"step": 1890
},
{
"epoch": 2.7742288738820955,
"grad_norm": 0.5479397279932245,
"learning_rate": 5e-06,
"loss": 0.7359,
"step": 1900
},
{
"epoch": 2.7888300784814746,
"grad_norm": 0.5184737073351016,
"learning_rate": 5e-06,
"loss": 0.7383,
"step": 1910
},
{
"epoch": 2.803431283080854,
"grad_norm": 0.501451603194624,
"learning_rate": 5e-06,
"loss": 0.7344,
"step": 1920
},
{
"epoch": 2.8180324876802336,
"grad_norm": 0.5543991291124852,
"learning_rate": 5e-06,
"loss": 0.7382,
"step": 1930
},
{
"epoch": 2.832633692279613,
"grad_norm": 0.6053239113120223,
"learning_rate": 5e-06,
"loss": 0.7356,
"step": 1940
},
{
"epoch": 2.8472348968789927,
"grad_norm": 0.5618006505391813,
"learning_rate": 5e-06,
"loss": 0.7377,
"step": 1950
},
{
"epoch": 2.8618361014783718,
"grad_norm": 0.5815392261505143,
"learning_rate": 5e-06,
"loss": 0.7337,
"step": 1960
},
{
"epoch": 2.8764373060777513,
"grad_norm": 0.7488694605510656,
"learning_rate": 5e-06,
"loss": 0.7362,
"step": 1970
},
{
"epoch": 2.891038510677131,
"grad_norm": 0.5769073126410138,
"learning_rate": 5e-06,
"loss": 0.7359,
"step": 1980
},
{
"epoch": 2.9056397152765103,
"grad_norm": 0.5750570915989177,
"learning_rate": 5e-06,
"loss": 0.7331,
"step": 1990
},
{
"epoch": 2.92024091987589,
"grad_norm": 0.5354199731148004,
"learning_rate": 5e-06,
"loss": 0.7341,
"step": 2000
},
{
"epoch": 2.9348421244752694,
"grad_norm": 0.5855570342179945,
"learning_rate": 5e-06,
"loss": 0.7404,
"step": 2010
},
{
"epoch": 2.949443329074649,
"grad_norm": 0.6261526281235102,
"learning_rate": 5e-06,
"loss": 0.7337,
"step": 2020
},
{
"epoch": 2.964044533674028,
"grad_norm": 0.5504549828167312,
"learning_rate": 5e-06,
"loss": 0.7348,
"step": 2030
},
{
"epoch": 2.9786457382734075,
"grad_norm": 0.529021801831048,
"learning_rate": 5e-06,
"loss": 0.7354,
"step": 2040
},
{
"epoch": 2.993246942872787,
"grad_norm": 0.5245972765419218,
"learning_rate": 5e-06,
"loss": 0.7372,
"step": 2050
},
{
"epoch": 2.9961671837926627,
"eval_loss": 0.8026307821273804,
"eval_runtime": 732.5471,
"eval_samples_per_second": 25.193,
"eval_steps_per_second": 0.395,
"step": 2052
},
{
"epoch": 2.9961671837926627,
"step": 2052,
"total_flos": 3436967047987200.0,
"train_loss": 0.7862561077286161,
"train_runtime": 121149.8135,
"train_samples_per_second": 8.683,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 2052,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3436967047987200.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}