{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.996510275300504,
"eval_steps": 500,
"global_step": 966,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.031019775106630478,
"grad_norm": 4.631979942321777,
"learning_rate": 5.517241379310345e-06,
"loss": 3.6051,
"step": 10
},
{
"epoch": 0.062039550213260956,
"grad_norm": 1.5514965057373047,
"learning_rate": 1.2413793103448277e-05,
"loss": 2.545,
"step": 20
},
{
"epoch": 0.09305932531989143,
"grad_norm": 1.3494954109191895,
"learning_rate": 1.9310344827586207e-05,
"loss": 2.1342,
"step": 30
},
{
"epoch": 0.12407910042652191,
"grad_norm": 1.3906075954437256,
"learning_rate": 2.620689655172414e-05,
"loss": 2.0661,
"step": 40
},
{
"epoch": 0.15509887553315238,
"grad_norm": 1.3953957557678223,
"learning_rate": 3.310344827586207e-05,
"loss": 2.0186,
"step": 50
},
{
"epoch": 0.18611865063978286,
"grad_norm": 1.6760238409042358,
"learning_rate": 4e-05,
"loss": 1.7399,
"step": 60
},
{
"epoch": 0.21713842574641334,
"grad_norm": 1.727597951889038,
"learning_rate": 4.689655172413793e-05,
"loss": 1.6954,
"step": 70
},
{
"epoch": 0.24815820085304383,
"grad_norm": 1.604373812675476,
"learning_rate": 5.379310344827586e-05,
"loss": 1.6684,
"step": 80
},
{
"epoch": 0.2791779759596743,
"grad_norm": 1.6111727952957153,
"learning_rate": 6.068965517241379e-05,
"loss": 1.6644,
"step": 90
},
{
"epoch": 0.31019775106630476,
"grad_norm": 1.4779224395751953,
"learning_rate": 6.758620689655173e-05,
"loss": 1.562,
"step": 100
},
{
"epoch": 0.34121752617293527,
"grad_norm": 1.940397024154663,
"learning_rate": 7.448275862068966e-05,
"loss": 1.5314,
"step": 110
},
{
"epoch": 0.3722373012795657,
"grad_norm": 1.7813657522201538,
"learning_rate": 8.137931034482759e-05,
"loss": 1.5245,
"step": 120
},
{
"epoch": 0.4032570763861962,
"grad_norm": 2.0770983695983887,
"learning_rate": 8.827586206896552e-05,
"loss": 1.5824,
"step": 130
},
{
"epoch": 0.4342768514928267,
"grad_norm": 1.6462304592132568,
"learning_rate": 9.517241379310345e-05,
"loss": 1.4695,
"step": 140
},
{
"epoch": 0.46529662659945714,
"grad_norm": 2.1206908226013184,
"learning_rate": 9.999670548734657e-05,
"loss": 1.5062,
"step": 150
},
{
"epoch": 0.49631640170608765,
"grad_norm": 2.2103354930877686,
"learning_rate": 9.993814845038307e-05,
"loss": 1.3786,
"step": 160
},
{
"epoch": 0.5273361768127182,
"grad_norm": 2.300985336303711,
"learning_rate": 9.980647870476639e-05,
"loss": 1.4175,
"step": 170
},
{
"epoch": 0.5583559519193486,
"grad_norm": 2.7707035541534424,
"learning_rate": 9.960188902359786e-05,
"loss": 1.4232,
"step": 180
},
{
"epoch": 0.5893757270259791,
"grad_norm": 2.5883567333221436,
"learning_rate": 9.932467893952367e-05,
"loss": 1.3551,
"step": 190
},
{
"epoch": 0.6203955021326095,
"grad_norm": 2.2564728260040283,
"learning_rate": 9.897525430619965e-05,
"loss": 1.3123,
"step": 200
},
{
"epoch": 0.65141527723924,
"grad_norm": 2.3438875675201416,
"learning_rate": 9.855412670409493e-05,
"loss": 1.265,
"step": 210
},
{
"epoch": 0.6824350523458705,
"grad_norm": 2.057579755783081,
"learning_rate": 9.806191269150479e-05,
"loss": 1.2062,
"step": 220
},
{
"epoch": 0.713454827452501,
"grad_norm": 2.852278470993042,
"learning_rate": 9.749933290186913e-05,
"loss": 1.2265,
"step": 230
},
{
"epoch": 0.7444746025591314,
"grad_norm": 2.790354013442993,
"learning_rate": 9.686721098871789e-05,
"loss": 1.2365,
"step": 240
},
{
"epoch": 0.7754943776657619,
"grad_norm": 2.6249144077301025,
"learning_rate": 9.61664724197888e-05,
"loss": 1.1627,
"step": 250
},
{
"epoch": 0.8065141527723924,
"grad_norm": 2.421135663986206,
"learning_rate": 9.539814312208195e-05,
"loss": 1.2203,
"step": 260
},
{
"epoch": 0.8375339278790229,
"grad_norm": 2.2487196922302246,
"learning_rate": 9.45633479798359e-05,
"loss": 1.1503,
"step": 270
},
{
"epoch": 0.8685537029856534,
"grad_norm": 3.0988595485687256,
"learning_rate": 9.366330918762361e-05,
"loss": 1.1575,
"step": 280
},
{
"epoch": 0.8995734780922838,
"grad_norm": 2.629244327545166,
"learning_rate": 9.269934446097986e-05,
"loss": 1.1328,
"step": 290
},
{
"epoch": 0.9305932531989143,
"grad_norm": 3.2107417583465576,
"learning_rate": 9.16728651071797e-05,
"loss": 1.1052,
"step": 300
},
{
"epoch": 0.9616130283055447,
"grad_norm": 2.520418405532837,
"learning_rate": 9.058537395899252e-05,
"loss": 1.0805,
"step": 310
},
{
"epoch": 0.9926328034121753,
"grad_norm": 2.8015685081481934,
"learning_rate": 8.943846317443673e-05,
"loss": 1.0565,
"step": 320
},
{
"epoch": 1.0236525785188058,
"grad_norm": 3.1201586723327637,
"learning_rate": 8.823381190575654e-05,
"loss": 0.8105,
"step": 330
},
{
"epoch": 1.0546723536254363,
"grad_norm": 2.659348487854004,
"learning_rate": 8.69731838410335e-05,
"loss": 0.7056,
"step": 340
},
{
"epoch": 1.0856921287320667,
"grad_norm": 2.9386487007141113,
"learning_rate": 8.565842462203197e-05,
"loss": 0.7194,
"step": 350
},
{
"epoch": 1.1167119038386972,
"grad_norm": 3.119506359100342,
"learning_rate": 8.42914591420592e-05,
"loss": 0.6653,
"step": 360
},
{
"epoch": 1.1477316789453276,
"grad_norm": 3.151764154434204,
"learning_rate": 8.287428872779583e-05,
"loss": 0.6434,
"step": 370
},
{
"epoch": 1.1787514540519581,
"grad_norm": 2.7570505142211914,
"learning_rate": 8.140898820922307e-05,
"loss": 0.6401,
"step": 380
},
{
"epoch": 1.2097712291585885,
"grad_norm": 3.0509796142578125,
"learning_rate": 7.989770288193614e-05,
"loss": 0.6535,
"step": 390
},
{
"epoch": 1.240791004265219,
"grad_norm": 2.9341208934783936,
"learning_rate": 7.834264536629148e-05,
"loss": 0.6892,
"step": 400
},
{
"epoch": 1.2718107793718496,
"grad_norm": 4.456368446350098,
"learning_rate": 7.674609236798621e-05,
"loss": 0.6351,
"step": 410
},
{
"epoch": 1.30283055447848,
"grad_norm": 3.2823486328125,
"learning_rate": 7.511038134481237e-05,
"loss": 0.6424,
"step": 420
},
{
"epoch": 1.3338503295851105,
"grad_norm": 3.2044315338134766,
"learning_rate": 7.343790708446609e-05,
"loss": 0.6297,
"step": 430
},
{
"epoch": 1.3648701046917409,
"grad_norm": 2.9563002586364746,
"learning_rate": 7.173111819842222e-05,
"loss": 0.55,
"step": 440
},
{
"epoch": 1.3958898797983714,
"grad_norm": 2.863718271255493,
"learning_rate": 6.999251353700718e-05,
"loss": 0.564,
"step": 450
},
{
"epoch": 1.426909654905002,
"grad_norm": 2.617347240447998,
"learning_rate": 6.822463853091911e-05,
"loss": 0.5635,
"step": 460
},
{
"epoch": 1.4579294300116323,
"grad_norm": 2.8376879692077637,
"learning_rate": 6.643008146455114e-05,
"loss": 0.5589,
"step": 470
},
{
"epoch": 1.488949205118263,
"grad_norm": 2.8635995388031006,
"learning_rate": 6.46114696865741e-05,
"loss": 0.5668,
"step": 480
},
{
"epoch": 1.5199689802248932,
"grad_norm": 2.637256622314453,
"learning_rate": 6.277146576332657e-05,
"loss": 0.5273,
"step": 490
},
{
"epoch": 1.5509887553315238,
"grad_norm": 3.0905144214630127,
"learning_rate": 6.091276358064408e-05,
"loss": 0.5076,
"step": 500
},
{
"epoch": 1.5820085304381544,
"grad_norm": 3.2317402362823486,
"learning_rate": 5.903808439983428e-05,
"loss": 0.4969,
"step": 510
},
{
"epoch": 1.613028305544785,
"grad_norm": 3.3929502964019775,
"learning_rate": 5.7150172873572906e-05,
"loss": 0.4832,
"step": 520
},
{
"epoch": 1.6440480806514153,
"grad_norm": 3.057800531387329,
"learning_rate": 5.525179302755303e-05,
"loss": 0.4986,
"step": 530
},
{
"epoch": 1.6750678557580456,
"grad_norm": 3.7113149166107178,
"learning_rate": 5.3345724213771145e-05,
"loss": 0.492,
"step": 540
},
{
"epoch": 1.7060876308646762,
"grad_norm": 3.076077699661255,
"learning_rate": 5.143475704137433e-05,
"loss": 0.4645,
"step": 550
},
{
"epoch": 1.7371074059713068,
"grad_norm": 3.2166106700897217,
"learning_rate": 4.9521689291026406e-05,
"loss": 0.4516,
"step": 560
},
{
"epoch": 1.7681271810779373,
"grad_norm": 2.9462897777557373,
"learning_rate": 4.760932181877439e-05,
"loss": 0.4372,
"step": 570
},
{
"epoch": 1.7991469561845677,
"grad_norm": 2.8754382133483887,
"learning_rate": 4.570045445541253e-05,
"loss": 0.4232,
"step": 580
},
{
"epoch": 1.830166731291198,
"grad_norm": 3.5116498470306396,
"learning_rate": 4.379788190734712e-05,
"loss": 0.4268,
"step": 590
},
{
"epoch": 1.8611865063978286,
"grad_norm": 3.0945634841918945,
"learning_rate": 4.190438966496407e-05,
"loss": 0.4093,
"step": 600
},
{
"epoch": 1.8922062815044591,
"grad_norm": 2.920431613922119,
"learning_rate": 4.002274992448911e-05,
"loss": 0.3945,
"step": 610
},
{
"epoch": 1.9232260566110897,
"grad_norm": 2.7673826217651367,
"learning_rate": 3.815571752931162e-05,
"loss": 0.3948,
"step": 620
},
{
"epoch": 1.95424583171772,
"grad_norm": 2.909179925918579,
"learning_rate": 3.630602593671405e-05,
"loss": 0.3731,
"step": 630
},
{
"epoch": 1.9852656068243504,
"grad_norm": 2.851840019226074,
"learning_rate": 3.4476383215912114e-05,
"loss": 0.3476,
"step": 640
},
{
"epoch": 2.016285381930981,
"grad_norm": 2.012725353240967,
"learning_rate": 3.266946808326466e-05,
"loss": 0.2479,
"step": 650
},
{
"epoch": 2.0473051570376115,
"grad_norm": 2.0801613330841064,
"learning_rate": 3.0887925980458154e-05,
"loss": 0.1408,
"step": 660
},
{
"epoch": 2.078324932144242,
"grad_norm": 1.9638011455535889,
"learning_rate": 2.913436520140731e-05,
"loss": 0.1383,
"step": 670
},
{
"epoch": 2.1093447072508726,
"grad_norm": 2.354614019393921,
"learning_rate": 2.74113530735426e-05,
"loss": 0.1431,
"step": 680
},
{
"epoch": 2.1403644823575028,
"grad_norm": 1.8083250522613525,
"learning_rate": 2.5721412199075372e-05,
"loss": 0.1253,
"step": 690
},
{
"epoch": 2.1713842574641333,
"grad_norm": 2.4336416721343994,
"learning_rate": 2.4067016761743515e-05,
"loss": 0.1316,
"step": 700
},
{
"epoch": 2.202404032570764,
"grad_norm": 2.073843240737915,
"learning_rate": 2.2450588904444968e-05,
"loss": 0.1251,
"step": 710
},
{
"epoch": 2.2334238076773945,
"grad_norm": 1.7229952812194824,
"learning_rate": 2.0874495183062503e-05,
"loss": 0.1151,
"step": 720
},
{
"epoch": 2.2644435827840246,
"grad_norm": 1.7413322925567627,
"learning_rate": 1.9341043101671412e-05,
"loss": 0.1176,
"step": 730
},
{
"epoch": 2.295463357890655,
"grad_norm": 1.87288236618042,
"learning_rate": 1.7852477734202954e-05,
"loss": 0.1099,
"step": 740
},
{
"epoch": 2.3264831329972857,
"grad_norm": 1.6436331272125244,
"learning_rate": 1.641097843750952e-05,
"loss": 0.1061,
"step": 750
},
{
"epoch": 2.3575029081039163,
"grad_norm": 2.2512550354003906,
"learning_rate": 1.5018655660644055e-05,
"loss": 0.0849,
"step": 760
},
{
"epoch": 2.388522683210547,
"grad_norm": 1.8482258319854736,
"learning_rate": 1.3677547855024907e-05,
"loss": 0.1089,
"step": 770
},
{
"epoch": 2.419542458317177,
"grad_norm": 1.5719361305236816,
"learning_rate": 1.2389618490009775e-05,
"loss": 0.0837,
"step": 780
},
{
"epoch": 2.4505622334238075,
"grad_norm": 1.8144450187683105,
"learning_rate": 1.1156753178248564e-05,
"loss": 0.1133,
"step": 790
},
{
"epoch": 2.481582008530438,
"grad_norm": 2.3764026165008545,
"learning_rate": 9.980756915023332e-06,
"loss": 0.09,
"step": 800
},
{
"epoch": 2.5126017836370687,
"grad_norm": 1.60674250125885,
"learning_rate": 8.863351435617395e-06,
"loss": 0.0831,
"step": 810
},
{
"epoch": 2.543621558743699,
"grad_norm": 1.7593117952346802,
"learning_rate": 7.806172694582487e-06,
"loss": 0.0951,
"step": 820
},
{
"epoch": 2.5746413338503293,
"grad_norm": 2.17861008644104,
"learning_rate": 6.810768470594553e-06,
"loss": 0.0938,
"step": 830
},
{
"epoch": 2.60566110895696,
"grad_norm": 1.4292868375778198,
"learning_rate": 5.878596100404743e-06,
"loss": 0.0738,
"step": 840
},
{
"epoch": 2.6366808840635905,
"grad_norm": 1.06781005859375,
"learning_rate": 5.01102034520326e-06,
"loss": 0.0893,
"step": 850
},
{
"epoch": 2.667700659170221,
"grad_norm": 1.4993703365325928,
"learning_rate": 4.209311392519955e-06,
"loss": 0.0706,
"step": 860
},
{
"epoch": 2.6987204342768516,
"grad_norm": 1.5031639337539673,
"learning_rate": 3.4746429965867967e-06,
"loss": 0.0752,
"step": 870
},
{
"epoch": 2.7297402093834817,
"grad_norm": 1.7316648960113525,
"learning_rate": 2.808090759885207e-06,
"loss": 0.075,
"step": 880
},
{
"epoch": 2.7607599844901123,
"grad_norm": 1.4538629055023193,
"learning_rate": 2.2106305583936617e-06,
"loss": 0.0768,
"step": 890
},
{
"epoch": 2.791779759596743,
"grad_norm": 0.8957967162132263,
"learning_rate": 1.6831371128416983e-06,
"loss": 0.0726,
"step": 900
},
{
"epoch": 2.8227995347033734,
"grad_norm": 1.7930282354354858,
"learning_rate": 1.2263827080616074e-06,
"loss": 0.0655,
"step": 910
},
{
"epoch": 2.853819309810004,
"grad_norm": 1.678768277168274,
"learning_rate": 8.410360623130554e-07,
"loss": 0.0743,
"step": 920
},
{
"epoch": 2.884839084916634,
"grad_norm": 1.4549870491027832,
"learning_rate": 5.276613482359138e-07,
"loss": 0.0773,
"step": 930
},
{
"epoch": 2.9158588600232647,
"grad_norm": 1.54071044921875,
"learning_rate": 2.867173668646583e-07,
"loss": 0.0804,
"step": 940
},
{
"epoch": 2.9468786351298952,
"grad_norm": 1.3247941732406616,
"learning_rate": 1.1855687591376675e-07,
"loss": 0.0756,
"step": 950
},
{
"epoch": 2.977898410236526,
"grad_norm": 1.7488839626312256,
"learning_rate": 2.342607331733415e-08,
"loss": 0.0805,
"step": 960
}
],
"logging_steps": 10,
"max_steps": 966,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.822238727303332e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}