{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8635578583765112,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008635578583765112,
"grad_norm": 0.6623008251190186,
"learning_rate": 0.0002,
"loss": 1.7783,
"step": 10
},
{
"epoch": 0.017271157167530225,
"grad_norm": 0.5188003182411194,
"learning_rate": 0.00019994965423831854,
"loss": 1.3036,
"step": 20
},
{
"epoch": 0.025906735751295335,
"grad_norm": 0.5021234154701233,
"learning_rate": 0.00019979866764718843,
"loss": 0.9562,
"step": 30
},
{
"epoch": 0.03454231433506045,
"grad_norm": 0.4302387833595276,
"learning_rate": 0.00019954719225730847,
"loss": 0.9219,
"step": 40
},
{
"epoch": 0.04317789291882556,
"grad_norm": 0.5690401792526245,
"learning_rate": 0.00019919548128307954,
"loss": 0.9422,
"step": 50
},
{
"epoch": 0.05181347150259067,
"grad_norm": 0.5356280207633972,
"learning_rate": 0.00019874388886763944,
"loss": 0.8915,
"step": 60
},
{
"epoch": 0.06044905008635579,
"grad_norm": 0.4261893332004547,
"learning_rate": 0.00019819286972627066,
"loss": 0.8435,
"step": 70
},
{
"epoch": 0.0690846286701209,
"grad_norm": 0.5050117373466492,
"learning_rate": 0.00019754297868854073,
"loss": 0.8846,
"step": 80
},
{
"epoch": 0.07772020725388601,
"grad_norm": 0.39581066370010376,
"learning_rate": 0.00019679487013963564,
"loss": 0.8297,
"step": 90
},
{
"epoch": 0.08635578583765112,
"grad_norm": 0.45784541964530945,
"learning_rate": 0.00019594929736144976,
"loss": 0.8334,
"step": 100
},
{
"epoch": 0.09499136442141623,
"grad_norm": 0.43146270513534546,
"learning_rate": 0.00019500711177409454,
"loss": 0.8614,
"step": 110
},
{
"epoch": 0.10362694300518134,
"grad_norm": 0.38489606976509094,
"learning_rate": 0.00019396926207859084,
"loss": 0.7923,
"step": 120
},
{
"epoch": 0.11226252158894647,
"grad_norm": 0.4087812006473541,
"learning_rate": 0.00019283679330160726,
"loss": 0.8122,
"step": 130
},
{
"epoch": 0.12089810017271158,
"grad_norm": 0.579022228717804,
"learning_rate": 0.00019161084574320696,
"loss": 0.7754,
"step": 140
},
{
"epoch": 0.12953367875647667,
"grad_norm": 0.4284251928329468,
"learning_rate": 0.00019029265382866214,
"loss": 0.8205,
"step": 150
},
{
"epoch": 0.1381692573402418,
"grad_norm": 0.4063330590724945,
"learning_rate": 0.00018888354486549237,
"loss": 0.8301,
"step": 160
},
{
"epoch": 0.14680483592400692,
"grad_norm": 0.42471617460250854,
"learning_rate": 0.00018738493770697852,
"loss": 0.8253,
"step": 170
},
{
"epoch": 0.15544041450777202,
"grad_norm": 0.44914358854293823,
"learning_rate": 0.00018579834132349772,
"loss": 0.8211,
"step": 180
},
{
"epoch": 0.16407599309153714,
"grad_norm": 0.35791847109794617,
"learning_rate": 0.00018412535328311814,
"loss": 0.8241,
"step": 190
},
{
"epoch": 0.17271157167530224,
"grad_norm": 0.41551169753074646,
"learning_rate": 0.0001823676581429833,
"loss": 0.8187,
"step": 200
},
{
"epoch": 0.18134715025906736,
"grad_norm": 0.3767964243888855,
"learning_rate": 0.00018052702575310588,
"loss": 0.7815,
"step": 210
},
{
"epoch": 0.18998272884283246,
"grad_norm": 0.3870415687561035,
"learning_rate": 0.00017860530947427875,
"loss": 0.7907,
"step": 220
},
{
"epoch": 0.19861830742659758,
"grad_norm": 0.4934289753437042,
"learning_rate": 0.0001766044443118978,
"loss": 0.7788,
"step": 230
},
{
"epoch": 0.20725388601036268,
"grad_norm": 0.40702807903289795,
"learning_rate": 0.0001745264449675755,
"loss": 0.7932,
"step": 240
},
{
"epoch": 0.2158894645941278,
"grad_norm": 0.40032950043678284,
"learning_rate": 0.00017237340381050703,
"loss": 0.7682,
"step": 250
},
{
"epoch": 0.22452504317789293,
"grad_norm": 0.4420917332172394,
"learning_rate": 0.00017014748877063214,
"loss": 0.7663,
"step": 260
},
{
"epoch": 0.23316062176165803,
"grad_norm": 0.3428107500076294,
"learning_rate": 0.00016785094115571322,
"loss": 0.7693,
"step": 270
},
{
"epoch": 0.24179620034542315,
"grad_norm": 0.7044374942779541,
"learning_rate": 0.00016548607339452853,
"loss": 0.7809,
"step": 280
},
{
"epoch": 0.2504317789291883,
"grad_norm": 0.37226754426956177,
"learning_rate": 0.00016305526670845226,
"loss": 0.8263,
"step": 290
},
{
"epoch": 0.25906735751295334,
"grad_norm": 0.40080901980400085,
"learning_rate": 0.00016056096871376667,
"loss": 0.8182,
"step": 300
},
{
"epoch": 0.26770293609671847,
"grad_norm": 0.37441566586494446,
"learning_rate": 0.00015800569095711982,
"loss": 0.7597,
"step": 310
},
{
"epoch": 0.2763385146804836,
"grad_norm": 0.4305630624294281,
"learning_rate": 0.00015539200638661104,
"loss": 0.801,
"step": 320
},
{
"epoch": 0.2849740932642487,
"grad_norm": 0.482138991355896,
"learning_rate": 0.00015272254676105025,
"loss": 0.7541,
"step": 330
},
{
"epoch": 0.29360967184801384,
"grad_norm": 0.40154337882995605,
"learning_rate": 0.00015000000000000001,
"loss": 0.8057,
"step": 340
},
{
"epoch": 0.3022452504317789,
"grad_norm": 0.3598850667476654,
"learning_rate": 0.0001472271074772683,
"loss": 0.756,
"step": 350
},
{
"epoch": 0.31088082901554404,
"grad_norm": 0.46537917852401733,
"learning_rate": 0.00014440666126057744,
"loss": 0.7563,
"step": 360
},
{
"epoch": 0.31951640759930916,
"grad_norm": 0.44803386926651,
"learning_rate": 0.00014154150130018866,
"loss": 0.7908,
"step": 370
},
{
"epoch": 0.3281519861830743,
"grad_norm": 0.4054509401321411,
"learning_rate": 0.00013863451256931287,
"loss": 0.7768,
"step": 380
},
{
"epoch": 0.33678756476683935,
"grad_norm": 0.3694852292537689,
"learning_rate": 0.00013568862215918717,
"loss": 0.7508,
"step": 390
},
{
"epoch": 0.3454231433506045,
"grad_norm": 0.44693148136138916,
"learning_rate": 0.00013270679633174218,
"loss": 0.7723,
"step": 400
},
{
"epoch": 0.3540587219343696,
"grad_norm": 0.47046658396720886,
"learning_rate": 0.0001296920375328275,
"loss": 0.7662,
"step": 410
},
{
"epoch": 0.3626943005181347,
"grad_norm": 0.4129233658313751,
"learning_rate": 0.00012664738136900348,
"loss": 0.7546,
"step": 420
},
{
"epoch": 0.37132987910189985,
"grad_norm": 0.34191545844078064,
"learning_rate": 0.00012357589355094275,
"loss": 0.7586,
"step": 430
},
{
"epoch": 0.3799654576856649,
"grad_norm": 0.37853655219078064,
"learning_rate": 0.00012048066680651908,
"loss": 0.7964,
"step": 440
},
{
"epoch": 0.38860103626943004,
"grad_norm": 0.4268665313720703,
"learning_rate": 0.00011736481776669306,
"loss": 0.758,
"step": 450
},
{
"epoch": 0.39723661485319517,
"grad_norm": 0.3980403542518616,
"learning_rate": 0.00011423148382732853,
"loss": 0.7618,
"step": 460
},
{
"epoch": 0.4058721934369603,
"grad_norm": 0.3414579927921295,
"learning_rate": 0.00011108381999010111,
"loss": 0.7867,
"step": 470
},
{
"epoch": 0.41450777202072536,
"grad_norm": 0.3817692697048187,
"learning_rate": 0.00010792499568567884,
"loss": 0.738,
"step": 480
},
{
"epoch": 0.4231433506044905,
"grad_norm": 0.43348562717437744,
"learning_rate": 0.00010475819158237425,
"loss": 0.8304,
"step": 490
},
{
"epoch": 0.4317789291882556,
"grad_norm": 0.31839361786842346,
"learning_rate": 0.00010158659638348081,
"loss": 0.7905,
"step": 500
},
{
"epoch": 0.44041450777202074,
"grad_norm": 0.3816024661064148,
"learning_rate": 9.84134036165192e-05,
"loss": 0.7286,
"step": 510
},
{
"epoch": 0.44905008635578586,
"grad_norm": 0.39195144176483154,
"learning_rate": 9.524180841762577e-05,
"loss": 0.7773,
"step": 520
},
{
"epoch": 0.45768566493955093,
"grad_norm": 0.4253169298171997,
"learning_rate": 9.207500431432115e-05,
"loss": 0.7905,
"step": 530
},
{
"epoch": 0.46632124352331605,
"grad_norm": 0.4840407967567444,
"learning_rate": 8.891618000989891e-05,
"loss": 0.7924,
"step": 540
},
{
"epoch": 0.4749568221070812,
"grad_norm": 0.36509501934051514,
"learning_rate": 8.57685161726715e-05,
"loss": 0.772,
"step": 550
},
{
"epoch": 0.4835924006908463,
"grad_norm": 0.3916187584400177,
"learning_rate": 8.263518223330697e-05,
"loss": 0.7499,
"step": 560
},
{
"epoch": 0.49222797927461137,
"grad_norm": 0.39139890670776367,
"learning_rate": 7.951933319348095e-05,
"loss": 0.7694,
"step": 570
},
{
"epoch": 0.5008635578583766,
"grad_norm": 0.4077214300632477,
"learning_rate": 7.642410644905726e-05,
"loss": 0.7335,
"step": 580
},
{
"epoch": 0.5094991364421416,
"grad_norm": 0.39956483244895935,
"learning_rate": 7.335261863099651e-05,
"loss": 0.7627,
"step": 590
},
{
"epoch": 0.5181347150259067,
"grad_norm": 0.5192585587501526,
"learning_rate": 7.030796246717255e-05,
"loss": 0.7224,
"step": 600
},
{
"epoch": 0.5267702936096719,
"grad_norm": 0.3494277000427246,
"learning_rate": 6.729320366825784e-05,
"loss": 0.7576,
"step": 610
},
{
"epoch": 0.5354058721934369,
"grad_norm": 0.3863460123538971,
"learning_rate": 6.431137784081282e-05,
"loss": 0.7628,
"step": 620
},
{
"epoch": 0.5440414507772021,
"grad_norm": 0.3836175501346588,
"learning_rate": 6.136548743068713e-05,
"loss": 0.7852,
"step": 630
},
{
"epoch": 0.5526770293609672,
"grad_norm": 0.3461022973060608,
"learning_rate": 5.845849869981137e-05,
"loss": 0.7745,
"step": 640
},
{
"epoch": 0.5613126079447323,
"grad_norm": 0.4282170534133911,
"learning_rate": 5.559333873942259e-05,
"loss": 0.7755,
"step": 650
},
{
"epoch": 0.5699481865284974,
"grad_norm": 0.457685261964798,
"learning_rate": 5.277289252273174e-05,
"loss": 0.7358,
"step": 660
},
{
"epoch": 0.5785837651122625,
"grad_norm": 0.37797123193740845,
"learning_rate": 5.000000000000002e-05,
"loss": 0.7377,
"step": 670
},
{
"epoch": 0.5872193436960277,
"grad_norm": 0.3950762450695038,
"learning_rate": 4.727745323894976e-05,
"loss": 0.7795,
"step": 680
},
{
"epoch": 0.5958549222797928,
"grad_norm": 0.3642215132713318,
"learning_rate": 4.4607993613388976e-05,
"loss": 0.7442,
"step": 690
},
{
"epoch": 0.6044905008635578,
"grad_norm": 0.3860316574573517,
"learning_rate": 4.19943090428802e-05,
"loss": 0.7364,
"step": 700
},
{
"epoch": 0.613126079447323,
"grad_norm": 0.36184850335121155,
"learning_rate": 3.943903128623335e-05,
"loss": 0.7424,
"step": 710
},
{
"epoch": 0.6217616580310881,
"grad_norm": 0.3922266662120819,
"learning_rate": 3.694473329154778e-05,
"loss": 0.7364,
"step": 720
},
{
"epoch": 0.6303972366148531,
"grad_norm": 0.38554486632347107,
"learning_rate": 3.45139266054715e-05,
"loss": 0.7684,
"step": 730
},
{
"epoch": 0.6390328151986183,
"grad_norm": 0.3428330421447754,
"learning_rate": 3.21490588442868e-05,
"loss": 0.7346,
"step": 740
},
{
"epoch": 0.6476683937823834,
"grad_norm": 0.41248151659965515,
"learning_rate": 2.9852511229367865e-05,
"loss": 0.7606,
"step": 750
},
{
"epoch": 0.6563039723661486,
"grad_norm": 0.36212557554244995,
"learning_rate": 2.7626596189492983e-05,
"loss": 0.7511,
"step": 760
},
{
"epoch": 0.6649395509499136,
"grad_norm": 0.36411234736442566,
"learning_rate": 2.5473555032424533e-05,
"loss": 0.7316,
"step": 770
},
{
"epoch": 0.6735751295336787,
"grad_norm": 0.34422898292541504,
"learning_rate": 2.339555568810221e-05,
"loss": 0.7304,
"step": 780
},
{
"epoch": 0.6822107081174439,
"grad_norm": 0.4062047302722931,
"learning_rate": 2.139469052572127e-05,
"loss": 0.7369,
"step": 790
},
{
"epoch": 0.690846286701209,
"grad_norm": 0.3605830669403076,
"learning_rate": 1.947297424689414e-05,
"loss": 0.7435,
"step": 800
},
{
"epoch": 0.6994818652849741,
"grad_norm": 0.39452221989631653,
"learning_rate": 1.763234185701673e-05,
"loss": 0.733,
"step": 810
},
{
"epoch": 0.7081174438687392,
"grad_norm": 0.5000078082084656,
"learning_rate": 1.587464671688187e-05,
"loss": 0.7747,
"step": 820
},
{
"epoch": 0.7167530224525043,
"grad_norm": 0.35579192638397217,
"learning_rate": 1.4201658676502294e-05,
"loss": 0.7468,
"step": 830
},
{
"epoch": 0.7253886010362695,
"grad_norm": 0.36388713121414185,
"learning_rate": 1.2615062293021507e-05,
"loss": 0.74,
"step": 840
},
{
"epoch": 0.7340241796200345,
"grad_norm": 0.352222740650177,
"learning_rate": 1.1116455134507664e-05,
"loss": 0.735,
"step": 850
},
{
"epoch": 0.7426597582037997,
"grad_norm": 0.42218002676963806,
"learning_rate": 9.707346171337894e-06,
"loss": 0.7086,
"step": 860
},
{
"epoch": 0.7512953367875648,
"grad_norm": 0.38099607825279236,
"learning_rate": 8.38915425679304e-06,
"loss": 0.7615,
"step": 870
},
{
"epoch": 0.7599309153713298,
"grad_norm": 0.35596323013305664,
"learning_rate": 7.163206698392744e-06,
"loss": 0.7398,
"step": 880
},
{
"epoch": 0.768566493955095,
"grad_norm": 0.38860464096069336,
"learning_rate": 6.030737921409169e-06,
"loss": 0.7362,
"step": 890
},
{
"epoch": 0.7772020725388601,
"grad_norm": 0.4052976965904236,
"learning_rate": 4.992888225905468e-06,
"loss": 0.7543,
"step": 900
},
{
"epoch": 0.7858376511226253,
"grad_norm": 0.3824405372142792,
"learning_rate": 4.050702638550275e-06,
"loss": 0.724,
"step": 910
},
{
"epoch": 0.7944732297063903,
"grad_norm": 0.40496277809143066,
"learning_rate": 3.2051298603643753e-06,
"loss": 0.706,
"step": 920
},
{
"epoch": 0.8031088082901554,
"grad_norm": 0.3866259455680847,
"learning_rate": 2.4570213114592954e-06,
"loss": 0.741,
"step": 930
},
{
"epoch": 0.8117443868739206,
"grad_norm": 0.35403546690940857,
"learning_rate": 1.8071302737293295e-06,
"loss": 0.7306,
"step": 940
},
{
"epoch": 0.8203799654576857,
"grad_norm": 0.4546634554862976,
"learning_rate": 1.2561111323605712e-06,
"loss": 0.7597,
"step": 950
},
{
"epoch": 0.8290155440414507,
"grad_norm": 0.3442727029323578,
"learning_rate": 8.04518716920466e-07,
"loss": 0.7845,
"step": 960
},
{
"epoch": 0.8376511226252159,
"grad_norm": 0.37662404775619507,
"learning_rate": 4.5280774269154115e-07,
"loss": 0.7295,
"step": 970
},
{
"epoch": 0.846286701208981,
"grad_norm": 0.34127819538116455,
"learning_rate": 2.0133235281156736e-07,
"loss": 0.7542,
"step": 980
},
{
"epoch": 0.8549222797927462,
"grad_norm": 0.38656285405158997,
"learning_rate": 5.0345761681491746e-08,
"loss": 0.7498,
"step": 990
},
{
"epoch": 0.8635578583765112,
"grad_norm": 0.3828294575214386,
"learning_rate": 0.0,
"loss": 0.74,
"step": 1000
}
],
"logging_steps": 10,
"max_steps": 1000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.3420339184869376e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}