{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.994334277620396,
"eval_steps": 500,
"global_step": 3174,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05,
"grad_norm": 1.3795244693756104,
"learning_rate": 2.358490566037736e-06,
"loss": 2.6953,
"step": 25
},
{
"epoch": 0.09,
"grad_norm": 1.2704118490219116,
"learning_rate": 4.716981132075472e-06,
"loss": 2.7089,
"step": 50
},
{
"epoch": 0.14,
"grad_norm": 1.1683423519134521,
"learning_rate": 7.0754716981132075e-06,
"loss": 2.6511,
"step": 75
},
{
"epoch": 0.19,
"grad_norm": 1.2082363367080688,
"learning_rate": 9.433962264150944e-06,
"loss": 2.5193,
"step": 100
},
{
"epoch": 0.24,
"grad_norm": 1.1215194463729858,
"learning_rate": 1.179245283018868e-05,
"loss": 2.306,
"step": 125
},
{
"epoch": 0.28,
"grad_norm": 0.6498327255249023,
"learning_rate": 1.4150943396226415e-05,
"loss": 2.1279,
"step": 150
},
{
"epoch": 0.33,
"grad_norm": 0.5128926038742065,
"learning_rate": 1.650943396226415e-05,
"loss": 1.979,
"step": 175
},
{
"epoch": 0.38,
"grad_norm": 0.4196425676345825,
"learning_rate": 1.8867924528301888e-05,
"loss": 1.857,
"step": 200
},
{
"epoch": 0.42,
"grad_norm": 0.39977598190307617,
"learning_rate": 2.122641509433962e-05,
"loss": 1.7905,
"step": 225
},
{
"epoch": 0.47,
"grad_norm": 0.3468642830848694,
"learning_rate": 2.358490566037736e-05,
"loss": 1.7396,
"step": 250
},
{
"epoch": 0.52,
"grad_norm": 0.3703348934650421,
"learning_rate": 2.5943396226415097e-05,
"loss": 1.6826,
"step": 275
},
{
"epoch": 0.57,
"grad_norm": 0.3471335470676422,
"learning_rate": 2.830188679245283e-05,
"loss": 1.6333,
"step": 300
},
{
"epoch": 0.61,
"grad_norm": 0.30928292870521545,
"learning_rate": 2.9926470588235295e-05,
"loss": 1.5934,
"step": 325
},
{
"epoch": 0.66,
"grad_norm": 0.33998557925224304,
"learning_rate": 2.966386554621849e-05,
"loss": 1.5503,
"step": 350
},
{
"epoch": 0.71,
"grad_norm": 0.3642776906490326,
"learning_rate": 2.940126050420168e-05,
"loss": 1.5243,
"step": 375
},
{
"epoch": 0.76,
"grad_norm": 0.31012433767318726,
"learning_rate": 2.9138655462184876e-05,
"loss": 1.4618,
"step": 400
},
{
"epoch": 0.8,
"grad_norm": 0.4258916974067688,
"learning_rate": 2.8876050420168067e-05,
"loss": 1.4161,
"step": 425
},
{
"epoch": 0.85,
"grad_norm": 0.3025980591773987,
"learning_rate": 2.8613445378151262e-05,
"loss": 1.419,
"step": 450
},
{
"epoch": 0.9,
"grad_norm": 0.3354116678237915,
"learning_rate": 2.8350840336134453e-05,
"loss": 1.3576,
"step": 475
},
{
"epoch": 0.94,
"grad_norm": 0.3400489091873169,
"learning_rate": 2.8088235294117648e-05,
"loss": 1.3323,
"step": 500
},
{
"epoch": 0.99,
"grad_norm": 0.315164715051651,
"learning_rate": 2.7825630252100843e-05,
"loss": 1.344,
"step": 525
},
{
"epoch": 1.04,
"grad_norm": 0.3593141734600067,
"learning_rate": 2.7563025210084034e-05,
"loss": 1.3023,
"step": 550
},
{
"epoch": 1.09,
"grad_norm": 0.4317137598991394,
"learning_rate": 2.730042016806723e-05,
"loss": 1.3028,
"step": 575
},
{
"epoch": 1.13,
"grad_norm": 0.3506380617618561,
"learning_rate": 2.703781512605042e-05,
"loss": 1.3026,
"step": 600
},
{
"epoch": 1.18,
"grad_norm": 0.33726122975349426,
"learning_rate": 2.6775210084033615e-05,
"loss": 1.319,
"step": 625
},
{
"epoch": 1.23,
"grad_norm": 0.37094271183013916,
"learning_rate": 2.6512605042016806e-05,
"loss": 1.2583,
"step": 650
},
{
"epoch": 1.27,
"grad_norm": 0.41374334692955017,
"learning_rate": 2.625e-05,
"loss": 1.2486,
"step": 675
},
{
"epoch": 1.32,
"grad_norm": 0.46000728011131287,
"learning_rate": 2.5987394957983196e-05,
"loss": 1.2463,
"step": 700
},
{
"epoch": 1.37,
"grad_norm": 0.3955087661743164,
"learning_rate": 2.5724789915966387e-05,
"loss": 1.2397,
"step": 725
},
{
"epoch": 1.42,
"grad_norm": 0.4096736013889313,
"learning_rate": 2.546218487394958e-05,
"loss": 1.229,
"step": 750
},
{
"epoch": 1.46,
"grad_norm": 0.3845139741897583,
"learning_rate": 2.5199579831932773e-05,
"loss": 1.2314,
"step": 775
},
{
"epoch": 1.51,
"grad_norm": 0.4077882170677185,
"learning_rate": 2.4936974789915968e-05,
"loss": 1.2219,
"step": 800
},
{
"epoch": 1.56,
"grad_norm": 0.36021721363067627,
"learning_rate": 2.467436974789916e-05,
"loss": 1.234,
"step": 825
},
{
"epoch": 1.61,
"grad_norm": 0.36913222074508667,
"learning_rate": 2.4411764705882354e-05,
"loss": 1.1998,
"step": 850
},
{
"epoch": 1.65,
"grad_norm": 0.35471582412719727,
"learning_rate": 2.414915966386555e-05,
"loss": 1.1988,
"step": 875
},
{
"epoch": 1.7,
"grad_norm": 0.3558790683746338,
"learning_rate": 2.3886554621848737e-05,
"loss": 1.2106,
"step": 900
},
{
"epoch": 1.75,
"grad_norm": 0.36467084288597107,
"learning_rate": 2.362394957983193e-05,
"loss": 1.1717,
"step": 925
},
{
"epoch": 1.79,
"grad_norm": 0.381874680519104,
"learning_rate": 2.3361344537815126e-05,
"loss": 1.1896,
"step": 950
},
{
"epoch": 1.84,
"grad_norm": 0.3758748769760132,
"learning_rate": 2.309873949579832e-05,
"loss": 1.1712,
"step": 975
},
{
"epoch": 1.89,
"grad_norm": 0.35793235898017883,
"learning_rate": 2.2836134453781513e-05,
"loss": 1.1389,
"step": 1000
},
{
"epoch": 1.94,
"grad_norm": 0.44111478328704834,
"learning_rate": 2.2573529411764707e-05,
"loss": 1.1726,
"step": 1025
},
{
"epoch": 1.98,
"grad_norm": 0.3741939663887024,
"learning_rate": 2.2310924369747902e-05,
"loss": 1.1607,
"step": 1050
},
{
"epoch": 2.03,
"grad_norm": 0.3894720673561096,
"learning_rate": 2.2048319327731093e-05,
"loss": 1.2186,
"step": 1075
},
{
"epoch": 2.08,
"grad_norm": 0.3636987805366516,
"learning_rate": 2.1785714285714285e-05,
"loss": 1.1376,
"step": 1100
},
{
"epoch": 2.12,
"grad_norm": 0.42893752455711365,
"learning_rate": 2.152310924369748e-05,
"loss": 1.158,
"step": 1125
},
{
"epoch": 2.17,
"grad_norm": 0.3795158863067627,
"learning_rate": 2.1260504201680674e-05,
"loss": 1.1574,
"step": 1150
},
{
"epoch": 2.22,
"grad_norm": 0.36902275681495667,
"learning_rate": 2.0997899159663866e-05,
"loss": 1.1523,
"step": 1175
},
{
"epoch": 2.27,
"grad_norm": 0.431219220161438,
"learning_rate": 2.073529411764706e-05,
"loss": 1.1433,
"step": 1200
},
{
"epoch": 2.31,
"grad_norm": 0.4199659824371338,
"learning_rate": 2.0472689075630252e-05,
"loss": 1.1481,
"step": 1225
},
{
"epoch": 2.36,
"grad_norm": 0.6324878334999084,
"learning_rate": 2.0210084033613447e-05,
"loss": 1.1526,
"step": 1250
},
{
"epoch": 2.41,
"grad_norm": 0.523536205291748,
"learning_rate": 1.9947478991596638e-05,
"loss": 1.1216,
"step": 1275
},
{
"epoch": 2.46,
"grad_norm": 0.5140235424041748,
"learning_rate": 1.9684873949579833e-05,
"loss": 1.1539,
"step": 1300
},
{
"epoch": 2.5,
"grad_norm": 0.3695720136165619,
"learning_rate": 1.9422268907563027e-05,
"loss": 1.1666,
"step": 1325
},
{
"epoch": 2.55,
"grad_norm": 0.4080689251422882,
"learning_rate": 1.915966386554622e-05,
"loss": 1.1037,
"step": 1350
},
{
"epoch": 2.6,
"grad_norm": 0.35790908336639404,
"learning_rate": 1.889705882352941e-05,
"loss": 1.136,
"step": 1375
},
{
"epoch": 2.64,
"grad_norm": 0.42846861481666565,
"learning_rate": 1.8634453781512605e-05,
"loss": 1.1325,
"step": 1400
},
{
"epoch": 2.69,
"grad_norm": 0.37662366032600403,
"learning_rate": 1.83718487394958e-05,
"loss": 1.1439,
"step": 1425
},
{
"epoch": 2.74,
"grad_norm": 0.4963545501232147,
"learning_rate": 1.810924369747899e-05,
"loss": 1.1701,
"step": 1450
},
{
"epoch": 2.79,
"grad_norm": 0.4511197507381439,
"learning_rate": 1.7846638655462186e-05,
"loss": 1.1338,
"step": 1475
},
{
"epoch": 2.83,
"grad_norm": 0.44771987199783325,
"learning_rate": 1.758403361344538e-05,
"loss": 1.1021,
"step": 1500
},
{
"epoch": 2.88,
"grad_norm": 0.4158724248409271,
"learning_rate": 1.7321428571428572e-05,
"loss": 1.094,
"step": 1525
},
{
"epoch": 2.93,
"grad_norm": 0.43490564823150635,
"learning_rate": 1.7058823529411763e-05,
"loss": 1.1154,
"step": 1550
},
{
"epoch": 2.97,
"grad_norm": 0.4746383726596832,
"learning_rate": 1.6796218487394958e-05,
"loss": 1.1311,
"step": 1575
},
{
"epoch": 3.02,
"grad_norm": 0.4157463312149048,
"learning_rate": 1.6533613445378153e-05,
"loss": 1.1202,
"step": 1600
},
{
"epoch": 3.07,
"grad_norm": 0.38272300362586975,
"learning_rate": 1.6271008403361344e-05,
"loss": 1.1173,
"step": 1625
},
{
"epoch": 3.12,
"grad_norm": 0.5032052397727966,
"learning_rate": 1.600840336134454e-05,
"loss": 1.1313,
"step": 1650
},
{
"epoch": 3.16,
"grad_norm": 0.3842039704322815,
"learning_rate": 1.5745798319327734e-05,
"loss": 1.0984,
"step": 1675
},
{
"epoch": 3.21,
"grad_norm": 0.43160513043403625,
"learning_rate": 1.5483193277310925e-05,
"loss": 1.1108,
"step": 1700
},
{
"epoch": 3.26,
"grad_norm": 0.420173704624176,
"learning_rate": 1.5220588235294118e-05,
"loss": 1.144,
"step": 1725
},
{
"epoch": 3.31,
"grad_norm": 0.43490853905677795,
"learning_rate": 1.4957983193277311e-05,
"loss": 1.0752,
"step": 1750
},
{
"epoch": 3.35,
"grad_norm": 0.45708540081977844,
"learning_rate": 1.4695378151260504e-05,
"loss": 1.1447,
"step": 1775
},
{
"epoch": 3.4,
"grad_norm": 0.417322039604187,
"learning_rate": 1.4432773109243699e-05,
"loss": 1.102,
"step": 1800
},
{
"epoch": 3.45,
"grad_norm": 0.4371644854545593,
"learning_rate": 1.417016806722689e-05,
"loss": 1.1473,
"step": 1825
},
{
"epoch": 3.49,
"grad_norm": 0.4273310899734497,
"learning_rate": 1.3907563025210085e-05,
"loss": 1.0967,
"step": 1850
},
{
"epoch": 3.54,
"grad_norm": 0.5089781880378723,
"learning_rate": 1.3644957983193278e-05,
"loss": 1.1297,
"step": 1875
},
{
"epoch": 3.59,
"grad_norm": 0.48617228865623474,
"learning_rate": 1.3382352941176471e-05,
"loss": 1.0955,
"step": 1900
},
{
"epoch": 3.64,
"grad_norm": 0.4370473623275757,
"learning_rate": 1.3119747899159664e-05,
"loss": 1.0791,
"step": 1925
},
{
"epoch": 3.68,
"grad_norm": 0.4495941400527954,
"learning_rate": 1.2857142857142857e-05,
"loss": 1.0648,
"step": 1950
},
{
"epoch": 3.73,
"grad_norm": 0.4138700067996979,
"learning_rate": 1.259453781512605e-05,
"loss": 1.0948,
"step": 1975
},
{
"epoch": 3.78,
"grad_norm": 0.4161551296710968,
"learning_rate": 1.2331932773109243e-05,
"loss": 1.0947,
"step": 2000
},
{
"epoch": 3.82,
"grad_norm": 0.3938988745212555,
"learning_rate": 1.2069327731092438e-05,
"loss": 1.0863,
"step": 2025
},
{
"epoch": 3.87,
"grad_norm": 0.44733569025993347,
"learning_rate": 1.180672268907563e-05,
"loss": 1.1015,
"step": 2050
},
{
"epoch": 3.92,
"grad_norm": 0.4151917099952698,
"learning_rate": 1.1544117647058824e-05,
"loss": 1.0817,
"step": 2075
},
{
"epoch": 3.97,
"grad_norm": 0.45207536220550537,
"learning_rate": 1.1281512605042017e-05,
"loss": 1.0935,
"step": 2100
},
{
"epoch": 4.01,
"grad_norm": 0.43334582448005676,
"learning_rate": 1.1018907563025212e-05,
"loss": 1.0843,
"step": 2125
},
{
"epoch": 4.06,
"grad_norm": 0.44301116466522217,
"learning_rate": 1.0756302521008403e-05,
"loss": 1.0617,
"step": 2150
},
{
"epoch": 4.11,
"grad_norm": 0.42584851384162903,
"learning_rate": 1.0493697478991596e-05,
"loss": 1.102,
"step": 2175
},
{
"epoch": 4.15,
"grad_norm": 0.46070751547813416,
"learning_rate": 1.0231092436974791e-05,
"loss": 1.0943,
"step": 2200
},
{
"epoch": 4.2,
"grad_norm": 0.43757393956184387,
"learning_rate": 9.968487394957983e-06,
"loss": 1.082,
"step": 2225
},
{
"epoch": 4.25,
"grad_norm": 0.43552663922309875,
"learning_rate": 9.705882352941177e-06,
"loss": 1.1033,
"step": 2250
},
{
"epoch": 4.3,
"grad_norm": 0.44868725538253784,
"learning_rate": 9.44327731092437e-06,
"loss": 1.0912,
"step": 2275
},
{
"epoch": 4.34,
"grad_norm": 0.43542513251304626,
"learning_rate": 9.180672268907563e-06,
"loss": 1.1113,
"step": 2300
},
{
"epoch": 4.39,
"grad_norm": 0.47481635212898254,
"learning_rate": 8.918067226890756e-06,
"loss": 1.0455,
"step": 2325
},
{
"epoch": 4.44,
"grad_norm": 0.46137455105781555,
"learning_rate": 8.65546218487395e-06,
"loss": 1.0898,
"step": 2350
},
{
"epoch": 4.49,
"grad_norm": 0.4473894536495209,
"learning_rate": 8.392857142857143e-06,
"loss": 1.0836,
"step": 2375
},
{
"epoch": 4.53,
"grad_norm": 0.39784467220306396,
"learning_rate": 8.130252100840336e-06,
"loss": 1.0629,
"step": 2400
},
{
"epoch": 4.58,
"grad_norm": 0.48481184244155884,
"learning_rate": 7.86764705882353e-06,
"loss": 1.1173,
"step": 2425
},
{
"epoch": 4.63,
"grad_norm": 0.485196590423584,
"learning_rate": 7.605042016806723e-06,
"loss": 1.0673,
"step": 2450
},
{
"epoch": 4.67,
"grad_norm": 0.5114961266517639,
"learning_rate": 7.342436974789916e-06,
"loss": 1.0877,
"step": 2475
},
{
"epoch": 4.72,
"grad_norm": 0.4506637752056122,
"learning_rate": 7.07983193277311e-06,
"loss": 1.0995,
"step": 2500
},
{
"epoch": 4.77,
"grad_norm": 0.45109784603118896,
"learning_rate": 6.817226890756303e-06,
"loss": 1.0819,
"step": 2525
},
{
"epoch": 4.82,
"grad_norm": 0.4272564947605133,
"learning_rate": 6.554621848739496e-06,
"loss": 1.1109,
"step": 2550
},
{
"epoch": 4.86,
"grad_norm": 0.4301404058933258,
"learning_rate": 6.29201680672269e-06,
"loss": 1.0738,
"step": 2575
},
{
"epoch": 4.91,
"grad_norm": 0.49940961599349976,
"learning_rate": 6.029411764705883e-06,
"loss": 1.0865,
"step": 2600
},
{
"epoch": 4.96,
"grad_norm": 0.41319113969802856,
"learning_rate": 5.7773109243697485e-06,
"loss": 1.0535,
"step": 2625
},
{
"epoch": 5.0,
"grad_norm": 0.4326096773147583,
"learning_rate": 5.5147058823529415e-06,
"loss": 1.0745,
"step": 2650
},
{
"epoch": 5.05,
"grad_norm": 0.4360290765762329,
"learning_rate": 5.252100840336135e-06,
"loss": 1.0745,
"step": 2675
},
{
"epoch": 5.1,
"grad_norm": 0.42354682087898254,
"learning_rate": 4.989495798319328e-06,
"loss": 1.0685,
"step": 2700
},
{
"epoch": 5.15,
"grad_norm": 0.49250248074531555,
"learning_rate": 4.726890756302521e-06,
"loss": 1.0841,
"step": 2725
},
{
"epoch": 5.19,
"grad_norm": 0.4505230784416199,
"learning_rate": 4.464285714285715e-06,
"loss": 1.0935,
"step": 2750
},
{
"epoch": 5.24,
"grad_norm": 0.41872066259384155,
"learning_rate": 4.201680672268908e-06,
"loss": 1.0827,
"step": 2775
},
{
"epoch": 5.29,
"grad_norm": 0.45635831356048584,
"learning_rate": 3.939075630252101e-06,
"loss": 1.0973,
"step": 2800
},
{
"epoch": 5.34,
"grad_norm": 0.49893826246261597,
"learning_rate": 3.6764705882352942e-06,
"loss": 1.0859,
"step": 2825
},
{
"epoch": 5.38,
"grad_norm": 0.5377572774887085,
"learning_rate": 3.4138655462184873e-06,
"loss": 1.088,
"step": 2850
},
{
"epoch": 5.43,
"grad_norm": 0.45102909207344055,
"learning_rate": 3.1512605042016808e-06,
"loss": 1.0875,
"step": 2875
},
{
"epoch": 5.48,
"grad_norm": 0.3922051191329956,
"learning_rate": 2.8886554621848742e-06,
"loss": 1.0708,
"step": 2900
},
{
"epoch": 5.52,
"grad_norm": 0.4416084289550781,
"learning_rate": 2.6260504201680673e-06,
"loss": 1.0816,
"step": 2925
},
{
"epoch": 5.57,
"grad_norm": 0.5171985626220703,
"learning_rate": 2.3634453781512604e-06,
"loss": 1.0859,
"step": 2950
},
{
"epoch": 5.62,
"grad_norm": 0.4239521920681,
"learning_rate": 2.100840336134454e-06,
"loss": 1.0387,
"step": 2975
},
{
"epoch": 5.67,
"grad_norm": 0.5627429485321045,
"learning_rate": 1.8382352941176471e-06,
"loss": 1.0818,
"step": 3000
},
{
"epoch": 5.71,
"grad_norm": 0.4605351686477661,
"learning_rate": 1.5756302521008404e-06,
"loss": 1.0637,
"step": 3025
},
{
"epoch": 5.76,
"grad_norm": 0.40121838450431824,
"learning_rate": 1.3130252100840336e-06,
"loss": 1.039,
"step": 3050
},
{
"epoch": 5.81,
"grad_norm": 0.45940887928009033,
"learning_rate": 1.050420168067227e-06,
"loss": 1.0434,
"step": 3075
},
{
"epoch": 5.85,
"grad_norm": 0.4496408998966217,
"learning_rate": 7.878151260504202e-07,
"loss": 1.1024,
"step": 3100
},
{
"epoch": 5.9,
"grad_norm": 0.4458378553390503,
"learning_rate": 5.252100840336135e-07,
"loss": 1.0948,
"step": 3125
},
{
"epoch": 5.95,
"grad_norm": 0.49208617210388184,
"learning_rate": 2.6260504201680673e-07,
"loss": 1.0673,
"step": 3150
}
],
"logging_steps": 25,
"max_steps": 3174,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 500,
"total_flos": 1.3634839262527488e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}