{
"best_metric": 0.6470396518707275,
"best_model_checkpoint": "checkpoints/instrucode/with_input/1a_52k/checkpoint-3000",
"epoch": 1.9198464122870171,
"eval_steps": 200,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"learning_rate": 2.6999999999999996e-05,
"loss": 1.8998,
"step": 10
},
{
"epoch": 0.01,
"learning_rate": 5.6999999999999996e-05,
"loss": 1.8477,
"step": 20
},
{
"epoch": 0.02,
"learning_rate": 8.4e-05,
"loss": 1.5682,
"step": 30
},
{
"epoch": 0.03,
"learning_rate": 0.00011399999999999999,
"loss": 1.1025,
"step": 40
},
{
"epoch": 0.03,
"learning_rate": 0.00014399999999999998,
"loss": 0.9534,
"step": 50
},
{
"epoch": 0.04,
"learning_rate": 0.00017399999999999997,
"loss": 0.9024,
"step": 60
},
{
"epoch": 0.04,
"learning_rate": 0.000204,
"loss": 0.7691,
"step": 70
},
{
"epoch": 0.05,
"learning_rate": 0.000234,
"loss": 0.712,
"step": 80
},
{
"epoch": 0.06,
"learning_rate": 0.00026399999999999997,
"loss": 0.7302,
"step": 90
},
{
"epoch": 0.06,
"learning_rate": 0.000294,
"loss": 0.6993,
"step": 100
},
{
"epoch": 0.07,
"learning_rate": 0.0002996887159533074,
"loss": 0.6977,
"step": 110
},
{
"epoch": 0.08,
"learning_rate": 0.0002992996108949416,
"loss": 0.6908,
"step": 120
},
{
"epoch": 0.08,
"learning_rate": 0.00029891050583657586,
"loss": 0.6971,
"step": 130
},
{
"epoch": 0.09,
"learning_rate": 0.0002985214007782101,
"loss": 0.6838,
"step": 140
},
{
"epoch": 0.1,
"learning_rate": 0.00029813229571984433,
"loss": 0.6588,
"step": 150
},
{
"epoch": 0.1,
"learning_rate": 0.0002977431906614786,
"loss": 0.672,
"step": 160
},
{
"epoch": 0.11,
"learning_rate": 0.0002973540856031128,
"loss": 0.6933,
"step": 170
},
{
"epoch": 0.12,
"learning_rate": 0.00029696498054474705,
"loss": 0.6926,
"step": 180
},
{
"epoch": 0.12,
"learning_rate": 0.0002965758754863813,
"loss": 0.6862,
"step": 190
},
{
"epoch": 0.13,
"learning_rate": 0.00029618677042801553,
"loss": 0.672,
"step": 200
},
{
"epoch": 0.13,
"eval_loss": 0.6792568564414978,
"eval_runtime": 292.2523,
"eval_samples_per_second": 6.843,
"eval_steps_per_second": 0.855,
"step": 200
},
{
"epoch": 0.13,
"learning_rate": 0.00029579766536964977,
"loss": 0.6773,
"step": 210
},
{
"epoch": 0.14,
"learning_rate": 0.000295408560311284,
"loss": 0.652,
"step": 220
},
{
"epoch": 0.15,
"learning_rate": 0.00029501945525291825,
"loss": 0.662,
"step": 230
},
{
"epoch": 0.15,
"learning_rate": 0.0002946303501945525,
"loss": 0.6919,
"step": 240
},
{
"epoch": 0.16,
"learning_rate": 0.00029424124513618673,
"loss": 0.6828,
"step": 250
},
{
"epoch": 0.17,
"learning_rate": 0.00029385214007782097,
"loss": 0.6734,
"step": 260
},
{
"epoch": 0.17,
"learning_rate": 0.0002934630350194552,
"loss": 0.6605,
"step": 270
},
{
"epoch": 0.18,
"learning_rate": 0.00029307392996108945,
"loss": 0.6779,
"step": 280
},
{
"epoch": 0.19,
"learning_rate": 0.0002926848249027237,
"loss": 0.6751,
"step": 290
},
{
"epoch": 0.19,
"learning_rate": 0.00029229571984435793,
"loss": 0.6514,
"step": 300
},
{
"epoch": 0.2,
"learning_rate": 0.00029190661478599217,
"loss": 0.6764,
"step": 310
},
{
"epoch": 0.2,
"learning_rate": 0.0002915175097276264,
"loss": 0.6946,
"step": 320
},
{
"epoch": 0.21,
"learning_rate": 0.00029112840466926065,
"loss": 0.6687,
"step": 330
},
{
"epoch": 0.22,
"learning_rate": 0.00029073929961089494,
"loss": 0.6504,
"step": 340
},
{
"epoch": 0.22,
"learning_rate": 0.0002903501945525292,
"loss": 0.6598,
"step": 350
},
{
"epoch": 0.23,
"learning_rate": 0.0002899610894941634,
"loss": 0.6719,
"step": 360
},
{
"epoch": 0.24,
"learning_rate": 0.00028957198443579766,
"loss": 0.6514,
"step": 370
},
{
"epoch": 0.24,
"learning_rate": 0.0002891828793774319,
"loss": 0.6696,
"step": 380
},
{
"epoch": 0.25,
"learning_rate": 0.00028879377431906614,
"loss": 0.6726,
"step": 390
},
{
"epoch": 0.26,
"learning_rate": 0.0002884046692607004,
"loss": 0.6505,
"step": 400
},
{
"epoch": 0.26,
"eval_loss": 0.6674855351448059,
"eval_runtime": 303.0692,
"eval_samples_per_second": 6.599,
"eval_steps_per_second": 0.825,
"step": 400
},
{
"epoch": 0.26,
"learning_rate": 0.0002880155642023346,
"loss": 0.6722,
"step": 410
},
{
"epoch": 0.27,
"learning_rate": 0.00028762645914396886,
"loss": 0.6618,
"step": 420
},
{
"epoch": 0.28,
"learning_rate": 0.0002872373540856031,
"loss": 0.6679,
"step": 430
},
{
"epoch": 0.28,
"learning_rate": 0.00028684824902723734,
"loss": 0.6371,
"step": 440
},
{
"epoch": 0.29,
"learning_rate": 0.0002864591439688716,
"loss": 0.6574,
"step": 450
},
{
"epoch": 0.29,
"learning_rate": 0.0002860700389105058,
"loss": 0.6779,
"step": 460
},
{
"epoch": 0.3,
"learning_rate": 0.00028568093385214006,
"loss": 0.674,
"step": 470
},
{
"epoch": 0.31,
"learning_rate": 0.0002852918287937743,
"loss": 0.6661,
"step": 480
},
{
"epoch": 0.31,
"learning_rate": 0.00028490272373540853,
"loss": 0.6515,
"step": 490
},
{
"epoch": 0.32,
"learning_rate": 0.0002845136186770428,
"loss": 0.6719,
"step": 500
},
{
"epoch": 0.33,
"learning_rate": 0.000284124513618677,
"loss": 0.6875,
"step": 510
},
{
"epoch": 0.33,
"learning_rate": 0.00028373540856031125,
"loss": 0.6976,
"step": 520
},
{
"epoch": 0.34,
"learning_rate": 0.0002833463035019455,
"loss": 0.6636,
"step": 530
},
{
"epoch": 0.35,
"learning_rate": 0.00028295719844357973,
"loss": 0.6481,
"step": 540
},
{
"epoch": 0.35,
"learning_rate": 0.00028256809338521397,
"loss": 0.6504,
"step": 550
},
{
"epoch": 0.36,
"learning_rate": 0.0002821789883268482,
"loss": 0.6776,
"step": 560
},
{
"epoch": 0.36,
"learning_rate": 0.00028178988326848245,
"loss": 0.6906,
"step": 570
},
{
"epoch": 0.37,
"learning_rate": 0.0002814007782101167,
"loss": 0.6714,
"step": 580
},
{
"epoch": 0.38,
"learning_rate": 0.00028101167315175093,
"loss": 0.6495,
"step": 590
},
{
"epoch": 0.38,
"learning_rate": 0.00028062256809338517,
"loss": 0.6508,
"step": 600
},
{
"epoch": 0.38,
"eval_loss": 0.6634581089019775,
"eval_runtime": 294.3504,
"eval_samples_per_second": 6.795,
"eval_steps_per_second": 0.849,
"step": 600
},
{
"epoch": 0.39,
"learning_rate": 0.0002802334630350194,
"loss": 0.6613,
"step": 610
},
{
"epoch": 0.4,
"learning_rate": 0.0002798443579766537,
"loss": 0.6702,
"step": 620
},
{
"epoch": 0.4,
"learning_rate": 0.00027945525291828794,
"loss": 0.649,
"step": 630
},
{
"epoch": 0.41,
"learning_rate": 0.0002790661478599222,
"loss": 0.6718,
"step": 640
},
{
"epoch": 0.42,
"learning_rate": 0.0002786770428015564,
"loss": 0.6647,
"step": 650
},
{
"epoch": 0.42,
"learning_rate": 0.00027828793774319066,
"loss": 0.6441,
"step": 660
},
{
"epoch": 0.43,
"learning_rate": 0.0002778988326848249,
"loss": 0.6628,
"step": 670
},
{
"epoch": 0.44,
"learning_rate": 0.00027750972762645914,
"loss": 0.6358,
"step": 680
},
{
"epoch": 0.44,
"learning_rate": 0.0002771206225680934,
"loss": 0.6568,
"step": 690
},
{
"epoch": 0.45,
"learning_rate": 0.0002767315175097276,
"loss": 0.6428,
"step": 700
},
{
"epoch": 0.45,
"learning_rate": 0.0002763424124513618,
"loss": 0.6644,
"step": 710
},
{
"epoch": 0.46,
"learning_rate": 0.00027595330739299605,
"loss": 0.6401,
"step": 720
},
{
"epoch": 0.47,
"learning_rate": 0.00027556420233463034,
"loss": 0.6733,
"step": 730
},
{
"epoch": 0.47,
"learning_rate": 0.0002751750972762646,
"loss": 0.6773,
"step": 740
},
{
"epoch": 0.48,
"learning_rate": 0.0002747859922178988,
"loss": 0.667,
"step": 750
},
{
"epoch": 0.49,
"learning_rate": 0.00027439688715953306,
"loss": 0.6729,
"step": 760
},
{
"epoch": 0.49,
"learning_rate": 0.0002740077821011673,
"loss": 0.6609,
"step": 770
},
{
"epoch": 0.5,
"learning_rate": 0.00027361867704280154,
"loss": 0.6349,
"step": 780
},
{
"epoch": 0.51,
"learning_rate": 0.0002732295719844358,
"loss": 0.6647,
"step": 790
},
{
"epoch": 0.51,
"learning_rate": 0.00027284046692607,
"loss": 0.6557,
"step": 800
},
{
"epoch": 0.51,
"eval_loss": 0.6586535573005676,
"eval_runtime": 307.0004,
"eval_samples_per_second": 6.515,
"eval_steps_per_second": 0.814,
"step": 800
},
{
"epoch": 0.52,
"learning_rate": 0.00027245136186770426,
"loss": 0.6461,
"step": 810
},
{
"epoch": 0.52,
"learning_rate": 0.0002720622568093385,
"loss": 0.6451,
"step": 820
},
{
"epoch": 0.53,
"learning_rate": 0.00027167315175097274,
"loss": 0.6556,
"step": 830
},
{
"epoch": 0.54,
"learning_rate": 0.000271284046692607,
"loss": 0.6176,
"step": 840
},
{
"epoch": 0.54,
"learning_rate": 0.0002708949416342412,
"loss": 0.6425,
"step": 850
},
{
"epoch": 0.55,
"learning_rate": 0.00027050583657587545,
"loss": 0.658,
"step": 860
},
{
"epoch": 0.56,
"learning_rate": 0.0002701167315175097,
"loss": 0.6498,
"step": 870
},
{
"epoch": 0.56,
"learning_rate": 0.00026972762645914393,
"loss": 0.6337,
"step": 880
},
{
"epoch": 0.57,
"learning_rate": 0.00026933852140077817,
"loss": 0.6435,
"step": 890
},
{
"epoch": 0.58,
"learning_rate": 0.0002689494163424124,
"loss": 0.6735,
"step": 900
},
{
"epoch": 0.58,
"learning_rate": 0.0002685603112840467,
"loss": 0.6674,
"step": 910
},
{
"epoch": 0.59,
"learning_rate": 0.00026817120622568095,
"loss": 0.6515,
"step": 920
},
{
"epoch": 0.6,
"learning_rate": 0.0002677821011673152,
"loss": 0.6725,
"step": 930
},
{
"epoch": 0.6,
"learning_rate": 0.0002673929961089494,
"loss": 0.6484,
"step": 940
},
{
"epoch": 0.61,
"learning_rate": 0.0002670038910505836,
"loss": 0.6595,
"step": 950
},
{
"epoch": 0.61,
"learning_rate": 0.00026661478599221785,
"loss": 0.6581,
"step": 960
},
{
"epoch": 0.62,
"learning_rate": 0.0002662256809338521,
"loss": 0.7032,
"step": 970
},
{
"epoch": 0.63,
"learning_rate": 0.00026583657587548633,
"loss": 0.6614,
"step": 980
},
{
"epoch": 0.63,
"learning_rate": 0.00026544747081712057,
"loss": 0.6715,
"step": 990
},
{
"epoch": 0.64,
"learning_rate": 0.0002650583657587548,
"loss": 0.6224,
"step": 1000
},
{
"epoch": 0.64,
"eval_loss": 0.6567265391349792,
"eval_runtime": 296.8164,
"eval_samples_per_second": 6.738,
"eval_steps_per_second": 0.842,
"step": 1000
},
{
"epoch": 0.65,
"learning_rate": 0.00026466926070038905,
"loss": 0.6769,
"step": 1010
},
{
"epoch": 0.65,
"learning_rate": 0.00026428015564202334,
"loss": 0.6349,
"step": 1020
},
{
"epoch": 0.66,
"learning_rate": 0.0002638910505836576,
"loss": 0.6293,
"step": 1030
},
{
"epoch": 0.67,
"learning_rate": 0.0002635019455252918,
"loss": 0.6704,
"step": 1040
},
{
"epoch": 0.67,
"learning_rate": 0.00026311284046692606,
"loss": 0.6562,
"step": 1050
},
{
"epoch": 0.68,
"learning_rate": 0.0002627237354085603,
"loss": 0.6572,
"step": 1060
},
{
"epoch": 0.68,
"learning_rate": 0.00026233463035019454,
"loss": 0.6496,
"step": 1070
},
{
"epoch": 0.69,
"learning_rate": 0.0002619455252918288,
"loss": 0.6933,
"step": 1080
},
{
"epoch": 0.7,
"learning_rate": 0.000261556420233463,
"loss": 0.6777,
"step": 1090
},
{
"epoch": 0.7,
"learning_rate": 0.00026116731517509726,
"loss": 0.6612,
"step": 1100
},
{
"epoch": 0.71,
"learning_rate": 0.0002607782101167315,
"loss": 0.6434,
"step": 1110
},
{
"epoch": 0.72,
"learning_rate": 0.00026038910505836574,
"loss": 0.6605,
"step": 1120
},
{
"epoch": 0.72,
"learning_rate": 0.00026,
"loss": 0.6629,
"step": 1130
},
{
"epoch": 0.73,
"learning_rate": 0.0002596108949416342,
"loss": 0.6511,
"step": 1140
},
{
"epoch": 0.74,
"learning_rate": 0.00025922178988326846,
"loss": 0.6427,
"step": 1150
},
{
"epoch": 0.74,
"learning_rate": 0.0002588326848249027,
"loss": 0.6788,
"step": 1160
},
{
"epoch": 0.75,
"learning_rate": 0.00025844357976653694,
"loss": 0.6766,
"step": 1170
},
{
"epoch": 0.76,
"learning_rate": 0.0002580544747081712,
"loss": 0.644,
"step": 1180
},
{
"epoch": 0.76,
"learning_rate": 0.0002576653696498054,
"loss": 0.6465,
"step": 1190
},
{
"epoch": 0.77,
"learning_rate": 0.00025727626459143965,
"loss": 0.6685,
"step": 1200
},
{
"epoch": 0.77,
"eval_loss": 0.6553727984428406,
"eval_runtime": 303.8445,
"eval_samples_per_second": 6.582,
"eval_steps_per_second": 0.823,
"step": 1200
},
{
"epoch": 0.77,
"learning_rate": 0.0002568871595330739,
"loss": 0.6359,
"step": 1210
},
{
"epoch": 0.78,
"learning_rate": 0.00025649805447470813,
"loss": 0.639,
"step": 1220
},
{
"epoch": 0.79,
"learning_rate": 0.0002561089494163424,
"loss": 0.6284,
"step": 1230
},
{
"epoch": 0.79,
"learning_rate": 0.0002557198443579766,
"loss": 0.6625,
"step": 1240
},
{
"epoch": 0.8,
"learning_rate": 0.00025533073929961085,
"loss": 0.6389,
"step": 1250
},
{
"epoch": 0.81,
"learning_rate": 0.0002549416342412451,
"loss": 0.6784,
"step": 1260
},
{
"epoch": 0.81,
"learning_rate": 0.00025455252918287933,
"loss": 0.6379,
"step": 1270
},
{
"epoch": 0.82,
"learning_rate": 0.00025416342412451357,
"loss": 0.6553,
"step": 1280
},
{
"epoch": 0.83,
"learning_rate": 0.0002537743190661478,
"loss": 0.6721,
"step": 1290
},
{
"epoch": 0.83,
"learning_rate": 0.0002533852140077821,
"loss": 0.6442,
"step": 1300
},
{
"epoch": 0.84,
"learning_rate": 0.00025299610894941634,
"loss": 0.642,
"step": 1310
},
{
"epoch": 0.84,
"learning_rate": 0.0002526070038910506,
"loss": 0.6109,
"step": 1320
},
{
"epoch": 0.85,
"learning_rate": 0.0002522178988326848,
"loss": 0.6829,
"step": 1330
},
{
"epoch": 0.86,
"learning_rate": 0.00025182879377431906,
"loss": 0.634,
"step": 1340
},
{
"epoch": 0.86,
"learning_rate": 0.0002514396887159533,
"loss": 0.6483,
"step": 1350
},
{
"epoch": 0.87,
"learning_rate": 0.00025105058365758754,
"loss": 0.6571,
"step": 1360
},
{
"epoch": 0.88,
"learning_rate": 0.0002506614785992218,
"loss": 0.6739,
"step": 1370
},
{
"epoch": 0.88,
"learning_rate": 0.000250272373540856,
"loss": 0.6575,
"step": 1380
},
{
"epoch": 0.89,
"learning_rate": 0.00024988326848249026,
"loss": 0.6446,
"step": 1390
},
{
"epoch": 0.9,
"learning_rate": 0.0002494941634241245,
"loss": 0.6264,
"step": 1400
},
{
"epoch": 0.9,
"eval_loss": 0.6528891921043396,
"eval_runtime": 304.6435,
"eval_samples_per_second": 6.565,
"eval_steps_per_second": 0.821,
"step": 1400
},
{
"epoch": 0.9,
"learning_rate": 0.00024910505836575874,
"loss": 0.6295,
"step": 1410
},
{
"epoch": 0.91,
"learning_rate": 0.000248715953307393,
"loss": 0.6534,
"step": 1420
},
{
"epoch": 0.92,
"learning_rate": 0.0002483268482490272,
"loss": 0.6448,
"step": 1430
},
{
"epoch": 0.92,
"learning_rate": 0.00024793774319066146,
"loss": 0.6337,
"step": 1440
},
{
"epoch": 0.93,
"learning_rate": 0.0002475486381322957,
"loss": 0.6344,
"step": 1450
},
{
"epoch": 0.93,
"learning_rate": 0.00024715953307392994,
"loss": 0.6451,
"step": 1460
},
{
"epoch": 0.94,
"learning_rate": 0.0002467704280155642,
"loss": 0.6562,
"step": 1470
},
{
"epoch": 0.95,
"learning_rate": 0.0002463813229571984,
"loss": 0.6478,
"step": 1480
},
{
"epoch": 0.95,
"learning_rate": 0.00024599221789883266,
"loss": 0.6532,
"step": 1490
},
{
"epoch": 0.96,
"learning_rate": 0.0002456031128404669,
"loss": 0.6384,
"step": 1500
},
{
"epoch": 0.97,
"learning_rate": 0.00024521400778210114,
"loss": 0.6422,
"step": 1510
},
{
"epoch": 0.97,
"learning_rate": 0.0002448249027237354,
"loss": 0.6557,
"step": 1520
},
{
"epoch": 0.98,
"learning_rate": 0.0002444357976653696,
"loss": 0.6405,
"step": 1530
},
{
"epoch": 0.99,
"learning_rate": 0.00024404669260700388,
"loss": 0.6654,
"step": 1540
},
{
"epoch": 0.99,
"learning_rate": 0.00024365758754863812,
"loss": 0.6676,
"step": 1550
},
{
"epoch": 1.0,
"learning_rate": 0.00024326848249027236,
"loss": 0.6544,
"step": 1560
},
{
"epoch": 1.0,
"learning_rate": 0.0002428793774319066,
"loss": 0.6426,
"step": 1570
},
{
"epoch": 1.01,
"learning_rate": 0.00024249027237354084,
"loss": 0.6309,
"step": 1580
},
{
"epoch": 1.02,
"learning_rate": 0.00024210116731517508,
"loss": 0.6543,
"step": 1590
},
{
"epoch": 1.02,
"learning_rate": 0.00024171206225680932,
"loss": 0.6244,
"step": 1600
},
{
"epoch": 1.02,
"eval_loss": 0.652025043964386,
"eval_runtime": 302.2996,
"eval_samples_per_second": 6.616,
"eval_steps_per_second": 0.827,
"step": 1600
},
{
"epoch": 1.03,
"learning_rate": 0.00024132295719844356,
"loss": 0.6328,
"step": 1610
},
{
"epoch": 1.04,
"learning_rate": 0.0002409338521400778,
"loss": 0.6316,
"step": 1620
},
{
"epoch": 1.04,
"learning_rate": 0.00024054474708171207,
"loss": 0.6558,
"step": 1630
},
{
"epoch": 1.05,
"learning_rate": 0.0002401556420233463,
"loss": 0.6512,
"step": 1640
},
{
"epoch": 1.06,
"learning_rate": 0.00023976653696498054,
"loss": 0.6193,
"step": 1650
},
{
"epoch": 1.06,
"learning_rate": 0.00023937743190661478,
"loss": 0.6447,
"step": 1660
},
{
"epoch": 1.07,
"learning_rate": 0.000238988326848249,
"loss": 0.6507,
"step": 1670
},
{
"epoch": 1.08,
"learning_rate": 0.00023859922178988324,
"loss": 0.6281,
"step": 1680
},
{
"epoch": 1.08,
"learning_rate": 0.00023821011673151748,
"loss": 0.6331,
"step": 1690
},
{
"epoch": 1.09,
"learning_rate": 0.00023782101167315172,
"loss": 0.6306,
"step": 1700
},
{
"epoch": 1.09,
"learning_rate": 0.00023743190661478595,
"loss": 0.6233,
"step": 1710
},
{
"epoch": 1.1,
"learning_rate": 0.0002370428015564202,
"loss": 0.627,
"step": 1720
},
{
"epoch": 1.11,
"learning_rate": 0.00023665369649805443,
"loss": 0.6494,
"step": 1730
},
{
"epoch": 1.11,
"learning_rate": 0.0002362645914396887,
"loss": 0.6319,
"step": 1740
},
{
"epoch": 1.12,
"learning_rate": 0.00023587548638132294,
"loss": 0.6666,
"step": 1750
},
{
"epoch": 1.13,
"learning_rate": 0.00023548638132295718,
"loss": 0.6097,
"step": 1760
},
{
"epoch": 1.13,
"learning_rate": 0.00023509727626459142,
"loss": 0.6222,
"step": 1770
},
{
"epoch": 1.14,
"learning_rate": 0.00023470817120622566,
"loss": 0.6345,
"step": 1780
},
{
"epoch": 1.15,
"learning_rate": 0.0002343190661478599,
"loss": 0.6506,
"step": 1790
},
{
"epoch": 1.15,
"learning_rate": 0.00023392996108949414,
"loss": 0.6451,
"step": 1800
},
{
"epoch": 1.15,
"eval_loss": 0.6515942215919495,
"eval_runtime": 301.7791,
"eval_samples_per_second": 6.627,
"eval_steps_per_second": 0.828,
"step": 1800
},
{
"epoch": 1.16,
"learning_rate": 0.00023354085603112838,
"loss": 0.664,
"step": 1810
},
{
"epoch": 1.16,
"learning_rate": 0.00023315175097276262,
"loss": 0.6299,
"step": 1820
},
{
"epoch": 1.17,
"learning_rate": 0.00023276264591439688,
"loss": 0.6286,
"step": 1830
},
{
"epoch": 1.18,
"learning_rate": 0.00023237354085603112,
"loss": 0.6216,
"step": 1840
},
{
"epoch": 1.18,
"learning_rate": 0.00023198443579766536,
"loss": 0.652,
"step": 1850
},
{
"epoch": 1.19,
"learning_rate": 0.0002315953307392996,
"loss": 0.6092,
"step": 1860
},
{
"epoch": 1.2,
"learning_rate": 0.00023120622568093384,
"loss": 0.6369,
"step": 1870
},
{
"epoch": 1.2,
"learning_rate": 0.00023081712062256808,
"loss": 0.6229,
"step": 1880
},
{
"epoch": 1.21,
"learning_rate": 0.00023042801556420232,
"loss": 0.6274,
"step": 1890
},
{
"epoch": 1.22,
"learning_rate": 0.00023003891050583656,
"loss": 0.6472,
"step": 1900
},
{
"epoch": 1.22,
"learning_rate": 0.00022964980544747077,
"loss": 0.6351,
"step": 1910
},
{
"epoch": 1.23,
"learning_rate": 0.000229260700389105,
"loss": 0.6154,
"step": 1920
},
{
"epoch": 1.24,
"learning_rate": 0.00022887159533073925,
"loss": 0.6322,
"step": 1930
},
{
"epoch": 1.24,
"learning_rate": 0.00022848249027237352,
"loss": 0.6323,
"step": 1940
},
{
"epoch": 1.25,
"learning_rate": 0.00022809338521400776,
"loss": 0.6222,
"step": 1950
},
{
"epoch": 1.25,
"learning_rate": 0.000227704280155642,
"loss": 0.6119,
"step": 1960
},
{
"epoch": 1.26,
"learning_rate": 0.00022731517509727624,
"loss": 0.621,
"step": 1970
},
{
"epoch": 1.27,
"learning_rate": 0.00022692607003891048,
"loss": 0.6653,
"step": 1980
},
{
"epoch": 1.27,
"learning_rate": 0.00022653696498054472,
"loss": 0.6739,
"step": 1990
},
{
"epoch": 1.28,
"learning_rate": 0.00022614785992217896,
"loss": 0.6398,
"step": 2000
},
{
"epoch": 1.28,
"eval_loss": 0.6505751013755798,
"eval_runtime": 333.7471,
"eval_samples_per_second": 5.993,
"eval_steps_per_second": 0.749,
"step": 2000
},
{
"epoch": 1.29,
"learning_rate": 0.0002257587548638132,
"loss": 0.6455,
"step": 2010
},
{
"epoch": 1.29,
"learning_rate": 0.00022536964980544744,
"loss": 0.64,
"step": 2020
},
{
"epoch": 1.3,
"learning_rate": 0.0002249805447470817,
"loss": 0.6313,
"step": 2030
},
{
"epoch": 1.31,
"learning_rate": 0.00022459143968871594,
"loss": 0.64,
"step": 2040
},
{
"epoch": 1.31,
"learning_rate": 0.00022420233463035018,
"loss": 0.5932,
"step": 2050
},
{
"epoch": 1.32,
"learning_rate": 0.00022381322957198442,
"loss": 0.6273,
"step": 2060
},
{
"epoch": 1.32,
"learning_rate": 0.00022342412451361866,
"loss": 0.6347,
"step": 2070
},
{
"epoch": 1.33,
"learning_rate": 0.0002230350194552529,
"loss": 0.623,
"step": 2080
},
{
"epoch": 1.34,
"learning_rate": 0.00022264591439688714,
"loss": 0.6247,
"step": 2090
},
{
"epoch": 1.34,
"learning_rate": 0.00022225680933852138,
"loss": 0.6411,
"step": 2100
},
{
"epoch": 1.35,
"learning_rate": 0.00022186770428015565,
"loss": 0.6342,
"step": 2110
},
{
"epoch": 1.36,
"learning_rate": 0.0002214785992217899,
"loss": 0.6416,
"step": 2120
},
{
"epoch": 1.36,
"learning_rate": 0.00022108949416342413,
"loss": 0.6068,
"step": 2130
},
{
"epoch": 1.37,
"learning_rate": 0.00022070038910505837,
"loss": 0.6255,
"step": 2140
},
{
"epoch": 1.38,
"learning_rate": 0.00022031128404669258,
"loss": 0.6522,
"step": 2150
},
{
"epoch": 1.38,
"learning_rate": 0.00021992217898832682,
"loss": 0.6289,
"step": 2160
},
{
"epoch": 1.39,
"learning_rate": 0.00021953307392996106,
"loss": 0.651,
"step": 2170
},
{
"epoch": 1.4,
"learning_rate": 0.0002191439688715953,
"loss": 0.6257,
"step": 2180
},
{
"epoch": 1.4,
"learning_rate": 0.00021875486381322954,
"loss": 0.6186,
"step": 2190
},
{
"epoch": 1.41,
"learning_rate": 0.00021836575875486378,
"loss": 0.6141,
"step": 2200
},
{
"epoch": 1.41,
"eval_loss": 0.6505178213119507,
"eval_runtime": 305.7766,
"eval_samples_per_second": 6.541,
"eval_steps_per_second": 0.818,
"step": 2200
},
{
"epoch": 1.41,
"learning_rate": 0.00021797665369649802,
"loss": 0.6214,
"step": 2210
},
{
"epoch": 1.42,
"learning_rate": 0.00021758754863813228,
"loss": 0.618,
"step": 2220
},
{
"epoch": 1.43,
"learning_rate": 0.00021719844357976652,
"loss": 0.612,
"step": 2230
},
{
"epoch": 1.43,
"learning_rate": 0.00021680933852140076,
"loss": 0.6406,
"step": 2240
},
{
"epoch": 1.44,
"learning_rate": 0.000216420233463035,
"loss": 0.6184,
"step": 2250
},
{
"epoch": 1.45,
"learning_rate": 0.00021603112840466924,
"loss": 0.6354,
"step": 2260
},
{
"epoch": 1.45,
"learning_rate": 0.00021564202334630348,
"loss": 0.6315,
"step": 2270
},
{
"epoch": 1.46,
"learning_rate": 0.00021525291828793772,
"loss": 0.6254,
"step": 2280
},
{
"epoch": 1.47,
"learning_rate": 0.00021486381322957196,
"loss": 0.6283,
"step": 2290
},
{
"epoch": 1.47,
"learning_rate": 0.0002144747081712062,
"loss": 0.6302,
"step": 2300
},
{
"epoch": 1.48,
"learning_rate": 0.00021408560311284047,
"loss": 0.6234,
"step": 2310
},
{
"epoch": 1.48,
"learning_rate": 0.0002136964980544747,
"loss": 0.6592,
"step": 2320
},
{
"epoch": 1.49,
"learning_rate": 0.00021330739299610895,
"loss": 0.6247,
"step": 2330
},
{
"epoch": 1.5,
"learning_rate": 0.00021291828793774318,
"loss": 0.6383,
"step": 2340
},
{
"epoch": 1.5,
"learning_rate": 0.00021252918287937742,
"loss": 0.6451,
"step": 2350
},
{
"epoch": 1.51,
"learning_rate": 0.00021214007782101166,
"loss": 0.6311,
"step": 2360
},
{
"epoch": 1.52,
"learning_rate": 0.0002117509727626459,
"loss": 0.6441,
"step": 2370
},
{
"epoch": 1.52,
"learning_rate": 0.00021136186770428014,
"loss": 0.6405,
"step": 2380
},
{
"epoch": 1.53,
"learning_rate": 0.00021097276264591438,
"loss": 0.6447,
"step": 2390
},
{
"epoch": 1.54,
"learning_rate": 0.0002105836575875486,
"loss": 0.6311,
"step": 2400
},
{
"epoch": 1.54,
"eval_loss": 0.6494212746620178,
"eval_runtime": 305.7108,
"eval_samples_per_second": 6.542,
"eval_steps_per_second": 0.818,
"step": 2400
},
{
"epoch": 1.54,
"learning_rate": 0.00021019455252918283,
"loss": 0.6296,
"step": 2410
},
{
"epoch": 1.55,
"learning_rate": 0.0002098054474708171,
"loss": 0.6444,
"step": 2420
},
{
"epoch": 1.56,
"learning_rate": 0.00020941634241245134,
"loss": 0.6366,
"step": 2430
},
{
"epoch": 1.56,
"learning_rate": 0.00020902723735408558,
"loss": 0.6433,
"step": 2440
},
{
"epoch": 1.57,
"learning_rate": 0.00020863813229571982,
"loss": 0.6302,
"step": 2450
},
{
"epoch": 1.57,
"learning_rate": 0.00020824902723735406,
"loss": 0.6134,
"step": 2460
},
{
"epoch": 1.58,
"learning_rate": 0.0002078599221789883,
"loss": 0.617,
"step": 2470
},
{
"epoch": 1.59,
"learning_rate": 0.00020747081712062254,
"loss": 0.6574,
"step": 2480
},
{
"epoch": 1.59,
"learning_rate": 0.00020708171206225678,
"loss": 0.6267,
"step": 2490
},
{
"epoch": 1.6,
"learning_rate": 0.00020669260700389102,
"loss": 0.6151,
"step": 2500
},
{
"epoch": 1.61,
"learning_rate": 0.00020630350194552528,
"loss": 0.66,
"step": 2510
},
{
"epoch": 1.61,
"learning_rate": 0.00020591439688715952,
"loss": 0.6372,
"step": 2520
},
{
"epoch": 1.62,
"learning_rate": 0.00020552529182879376,
"loss": 0.6184,
"step": 2530
},
{
"epoch": 1.63,
"learning_rate": 0.000205136186770428,
"loss": 0.6115,
"step": 2540
},
{
"epoch": 1.63,
"learning_rate": 0.00020474708171206224,
"loss": 0.6616,
"step": 2550
},
{
"epoch": 1.64,
"learning_rate": 0.00020435797665369648,
"loss": 0.6367,
"step": 2560
},
{
"epoch": 1.64,
"learning_rate": 0.00020396887159533072,
"loss": 0.6464,
"step": 2570
},
{
"epoch": 1.65,
"learning_rate": 0.00020357976653696496,
"loss": 0.6304,
"step": 2580
},
{
"epoch": 1.66,
"learning_rate": 0.0002031906614785992,
"loss": 0.6015,
"step": 2590
},
{
"epoch": 1.66,
"learning_rate": 0.00020280155642023347,
"loss": 0.6353,
"step": 2600
},
{
"epoch": 1.66,
"eval_loss": 0.6482927203178406,
"eval_runtime": 309.3391,
"eval_samples_per_second": 6.465,
"eval_steps_per_second": 0.808,
"step": 2600
},
{
"epoch": 1.67,
"learning_rate": 0.0002024124513618677,
"loss": 0.6311,
"step": 2610
},
{
"epoch": 1.68,
"learning_rate": 0.00020202334630350195,
"loss": 0.6221,
"step": 2620
},
{
"epoch": 1.68,
"learning_rate": 0.0002016342412451362,
"loss": 0.6439,
"step": 2630
},
{
"epoch": 1.69,
"learning_rate": 0.0002012451361867704,
"loss": 0.6284,
"step": 2640
},
{
"epoch": 1.7,
"learning_rate": 0.00020085603112840464,
"loss": 0.6402,
"step": 2650
},
{
"epoch": 1.7,
"learning_rate": 0.00020046692607003888,
"loss": 0.6373,
"step": 2660
},
{
"epoch": 1.71,
"learning_rate": 0.00020007782101167312,
"loss": 0.6205,
"step": 2670
},
{
"epoch": 1.72,
"learning_rate": 0.00019968871595330736,
"loss": 0.6343,
"step": 2680
},
{
"epoch": 1.72,
"learning_rate": 0.0001992996108949416,
"loss": 0.6272,
"step": 2690
},
{
"epoch": 1.73,
"learning_rate": 0.00019891050583657586,
"loss": 0.6262,
"step": 2700
},
{
"epoch": 1.73,
"learning_rate": 0.0001985214007782101,
"loss": 0.6295,
"step": 2710
},
{
"epoch": 1.74,
"learning_rate": 0.00019813229571984434,
"loss": 0.6256,
"step": 2720
},
{
"epoch": 1.75,
"learning_rate": 0.00019774319066147858,
"loss": 0.6485,
"step": 2730
},
{
"epoch": 1.75,
"learning_rate": 0.00019735408560311282,
"loss": 0.6518,
"step": 2740
},
{
"epoch": 1.76,
"learning_rate": 0.00019696498054474706,
"loss": 0.6627,
"step": 2750
},
{
"epoch": 1.77,
"learning_rate": 0.0001965758754863813,
"loss": 0.594,
"step": 2760
},
{
"epoch": 1.77,
"learning_rate": 0.00019618677042801554,
"loss": 0.6289,
"step": 2770
},
{
"epoch": 1.78,
"learning_rate": 0.00019579766536964978,
"loss": 0.6393,
"step": 2780
},
{
"epoch": 1.79,
"learning_rate": 0.00019540856031128405,
"loss": 0.6073,
"step": 2790
},
{
"epoch": 1.79,
"learning_rate": 0.0001950194552529183,
"loss": 0.6331,
"step": 2800
},
{
"epoch": 1.79,
"eval_loss": 0.647841215133667,
"eval_runtime": 307.5546,
"eval_samples_per_second": 6.503,
"eval_steps_per_second": 0.813,
"step": 2800
},
{
"epoch": 1.8,
"learning_rate": 0.00019463035019455253,
"loss": 0.6353,
"step": 2810
},
{
"epoch": 1.8,
"learning_rate": 0.00019424124513618677,
"loss": 0.6444,
"step": 2820
},
{
"epoch": 1.81,
"learning_rate": 0.000193852140077821,
"loss": 0.6227,
"step": 2830
},
{
"epoch": 1.82,
"learning_rate": 0.00019346303501945525,
"loss": 0.6505,
"step": 2840
},
{
"epoch": 1.82,
"learning_rate": 0.00019307392996108949,
"loss": 0.6545,
"step": 2850
},
{
"epoch": 1.83,
"learning_rate": 0.00019268482490272372,
"loss": 0.6354,
"step": 2860
},
{
"epoch": 1.84,
"learning_rate": 0.00019229571984435796,
"loss": 0.5997,
"step": 2870
},
{
"epoch": 1.84,
"learning_rate": 0.00019190661478599218,
"loss": 0.6465,
"step": 2880
},
{
"epoch": 1.85,
"learning_rate": 0.00019151750972762642,
"loss": 0.6198,
"step": 2890
},
{
"epoch": 1.86,
"learning_rate": 0.00019112840466926068,
"loss": 0.6407,
"step": 2900
},
{
"epoch": 1.86,
"learning_rate": 0.00019073929961089492,
"loss": 0.6413,
"step": 2910
},
{
"epoch": 1.87,
"learning_rate": 0.00019035019455252916,
"loss": 0.6468,
"step": 2920
},
{
"epoch": 1.88,
"learning_rate": 0.0001899610894941634,
"loss": 0.6133,
"step": 2930
},
{
"epoch": 1.88,
"learning_rate": 0.00018957198443579764,
"loss": 0.6055,
"step": 2940
},
{
"epoch": 1.89,
"learning_rate": 0.00018918287937743188,
"loss": 0.6155,
"step": 2950
},
{
"epoch": 1.89,
"learning_rate": 0.00018879377431906612,
"loss": 0.6253,
"step": 2960
},
{
"epoch": 1.9,
"learning_rate": 0.00018840466926070036,
"loss": 0.631,
"step": 2970
},
{
"epoch": 1.91,
"learning_rate": 0.0001880155642023346,
"loss": 0.6332,
"step": 2980
},
{
"epoch": 1.91,
"learning_rate": 0.00018762645914396887,
"loss": 0.6229,
"step": 2990
},
{
"epoch": 1.92,
"learning_rate": 0.0001872373540856031,
"loss": 0.6291,
"step": 3000
},
{
"epoch": 1.92,
"eval_loss": 0.6470396518707275,
"eval_runtime": 303.5236,
"eval_samples_per_second": 6.589,
"eval_steps_per_second": 0.824,
"step": 3000
}
],
"logging_steps": 10,
"max_steps": 7810,
"num_train_epochs": 5,
"save_steps": 200,
"total_flos": 9.12950959125037e+17,
"trial_name": null,
"trial_params": null
}