{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.996510275300504,
  "eval_steps": 500,
  "global_step": 966,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.031019775106630478,
      "grad_norm": 4.631979942321777,
      "learning_rate": 5.517241379310345e-06,
      "loss": 3.6051,
      "step": 10
    },
    {
      "epoch": 0.062039550213260956,
      "grad_norm": 1.5514965057373047,
      "learning_rate": 1.2413793103448277e-05,
      "loss": 2.545,
      "step": 20
    },
    {
      "epoch": 0.09305932531989143,
      "grad_norm": 1.3494954109191895,
      "learning_rate": 1.9310344827586207e-05,
      "loss": 2.1342,
      "step": 30
    },
    {
      "epoch": 0.12407910042652191,
      "grad_norm": 1.3906075954437256,
      "learning_rate": 2.620689655172414e-05,
      "loss": 2.0661,
      "step": 40
    },
    {
      "epoch": 0.15509887553315238,
      "grad_norm": 1.3953957557678223,
      "learning_rate": 3.310344827586207e-05,
      "loss": 2.0186,
      "step": 50
    },
    {
      "epoch": 0.18611865063978286,
      "grad_norm": 1.6760238409042358,
      "learning_rate": 4e-05,
      "loss": 1.7399,
      "step": 60
    },
    {
      "epoch": 0.21713842574641334,
      "grad_norm": 1.727597951889038,
      "learning_rate": 4.689655172413793e-05,
      "loss": 1.6954,
      "step": 70
    },
    {
      "epoch": 0.24815820085304383,
      "grad_norm": 1.604373812675476,
      "learning_rate": 5.379310344827586e-05,
      "loss": 1.6684,
      "step": 80
    },
    {
      "epoch": 0.2791779759596743,
      "grad_norm": 1.6111727952957153,
      "learning_rate": 6.068965517241379e-05,
      "loss": 1.6644,
      "step": 90
    },
    {
      "epoch": 0.31019775106630476,
      "grad_norm": 1.4779224395751953,
      "learning_rate": 6.758620689655173e-05,
      "loss": 1.562,
      "step": 100
    },
    {
      "epoch": 0.34121752617293527,
      "grad_norm": 1.940397024154663,
      "learning_rate": 7.448275862068966e-05,
      "loss": 1.5314,
      "step": 110
    },
    {
      "epoch": 0.3722373012795657,
      "grad_norm": 1.7813657522201538,
      "learning_rate": 8.137931034482759e-05,
      "loss": 1.5245,
      "step": 120
    },
    {
      "epoch": 0.4032570763861962,
      "grad_norm": 2.0770983695983887,
      "learning_rate": 8.827586206896552e-05,
      "loss": 1.5824,
      "step": 130
    },
    {
      "epoch": 0.4342768514928267,
      "grad_norm": 1.6462304592132568,
      "learning_rate": 9.517241379310345e-05,
      "loss": 1.4695,
      "step": 140
    },
    {
      "epoch": 0.46529662659945714,
      "grad_norm": 2.1206908226013184,
      "learning_rate": 9.999670548734657e-05,
      "loss": 1.5062,
      "step": 150
    },
    {
      "epoch": 0.49631640170608765,
      "grad_norm": 2.2103354930877686,
      "learning_rate": 9.993814845038307e-05,
      "loss": 1.3786,
      "step": 160
    },
    {
      "epoch": 0.5273361768127182,
      "grad_norm": 2.300985336303711,
      "learning_rate": 9.980647870476639e-05,
      "loss": 1.4175,
      "step": 170
    },
    {
      "epoch": 0.5583559519193486,
      "grad_norm": 2.7707035541534424,
      "learning_rate": 9.960188902359786e-05,
      "loss": 1.4232,
      "step": 180
    },
    {
      "epoch": 0.5893757270259791,
      "grad_norm": 2.5883567333221436,
      "learning_rate": 9.932467893952367e-05,
      "loss": 1.3551,
      "step": 190
    },
    {
      "epoch": 0.6203955021326095,
      "grad_norm": 2.2564728260040283,
      "learning_rate": 9.897525430619965e-05,
      "loss": 1.3123,
      "step": 200
    },
    {
      "epoch": 0.65141527723924,
      "grad_norm": 2.3438875675201416,
      "learning_rate": 9.855412670409493e-05,
      "loss": 1.265,
      "step": 210
    },
    {
      "epoch": 0.6824350523458705,
      "grad_norm": 2.057579755783081,
      "learning_rate": 9.806191269150479e-05,
      "loss": 1.2062,
      "step": 220
    },
    {
      "epoch": 0.713454827452501,
      "grad_norm": 2.852278470993042,
      "learning_rate": 9.749933290186913e-05,
      "loss": 1.2265,
      "step": 230
    },
    {
      "epoch": 0.7444746025591314,
      "grad_norm": 2.790354013442993,
      "learning_rate": 9.686721098871789e-05,
      "loss": 1.2365,
      "step": 240
    },
    {
      "epoch": 0.7754943776657619,
      "grad_norm": 2.6249144077301025,
      "learning_rate": 9.61664724197888e-05,
      "loss": 1.1627,
      "step": 250
    },
    {
      "epoch": 0.8065141527723924,
      "grad_norm": 2.421135663986206,
      "learning_rate": 9.539814312208195e-05,
      "loss": 1.2203,
      "step": 260
    },
    {
      "epoch": 0.8375339278790229,
      "grad_norm": 2.2487196922302246,
      "learning_rate": 9.45633479798359e-05,
      "loss": 1.1503,
      "step": 270
    },
    {
      "epoch": 0.8685537029856534,
      "grad_norm": 3.0988595485687256,
      "learning_rate": 9.366330918762361e-05,
      "loss": 1.1575,
      "step": 280
    },
    {
      "epoch": 0.8995734780922838,
      "grad_norm": 2.629244327545166,
      "learning_rate": 9.269934446097986e-05,
      "loss": 1.1328,
      "step": 290
    },
    {
      "epoch": 0.9305932531989143,
      "grad_norm": 3.2107417583465576,
      "learning_rate": 9.16728651071797e-05,
      "loss": 1.1052,
      "step": 300
    },
    {
      "epoch": 0.9616130283055447,
      "grad_norm": 2.520418405532837,
      "learning_rate": 9.058537395899252e-05,
      "loss": 1.0805,
      "step": 310
    },
    {
      "epoch": 0.9926328034121753,
      "grad_norm": 2.8015685081481934,
      "learning_rate": 8.943846317443673e-05,
      "loss": 1.0565,
      "step": 320
    },
    {
      "epoch": 1.0236525785188058,
      "grad_norm": 3.1201586723327637,
      "learning_rate": 8.823381190575654e-05,
      "loss": 0.8105,
      "step": 330
    },
    {
      "epoch": 1.0546723536254363,
      "grad_norm": 2.659348487854004,
      "learning_rate": 8.69731838410335e-05,
      "loss": 0.7056,
      "step": 340
    },
    {
      "epoch": 1.0856921287320667,
      "grad_norm": 2.9386487007141113,
      "learning_rate": 8.565842462203197e-05,
      "loss": 0.7194,
      "step": 350
    },
    {
      "epoch": 1.1167119038386972,
      "grad_norm": 3.119506359100342,
      "learning_rate": 8.42914591420592e-05,
      "loss": 0.6653,
      "step": 360
    },
    {
      "epoch": 1.1477316789453276,
      "grad_norm": 3.151764154434204,
      "learning_rate": 8.287428872779583e-05,
      "loss": 0.6434,
      "step": 370
    },
    {
      "epoch": 1.1787514540519581,
      "grad_norm": 2.7570505142211914,
      "learning_rate": 8.140898820922307e-05,
      "loss": 0.6401,
      "step": 380
    },
    {
      "epoch": 1.2097712291585885,
      "grad_norm": 3.0509796142578125,
      "learning_rate": 7.989770288193614e-05,
      "loss": 0.6535,
      "step": 390
    },
    {
      "epoch": 1.240791004265219,
      "grad_norm": 2.9341208934783936,
      "learning_rate": 7.834264536629148e-05,
      "loss": 0.6892,
      "step": 400
    },
    {
      "epoch": 1.2718107793718496,
      "grad_norm": 4.456368446350098,
      "learning_rate": 7.674609236798621e-05,
      "loss": 0.6351,
      "step": 410
    },
    {
      "epoch": 1.30283055447848,
      "grad_norm": 3.2823486328125,
      "learning_rate": 7.511038134481237e-05,
      "loss": 0.6424,
      "step": 420
    },
    {
      "epoch": 1.3338503295851105,
      "grad_norm": 3.2044315338134766,
      "learning_rate": 7.343790708446609e-05,
      "loss": 0.6297,
      "step": 430
    },
    {
      "epoch": 1.3648701046917409,
      "grad_norm": 2.9563002586364746,
      "learning_rate": 7.173111819842222e-05,
      "loss": 0.55,
      "step": 440
    },
    {
      "epoch": 1.3958898797983714,
      "grad_norm": 2.863718271255493,
      "learning_rate": 6.999251353700718e-05,
      "loss": 0.564,
      "step": 450
    },
    {
      "epoch": 1.426909654905002,
      "grad_norm": 2.617347240447998,
      "learning_rate": 6.822463853091911e-05,
      "loss": 0.5635,
      "step": 460
    },
    {
      "epoch": 1.4579294300116323,
      "grad_norm": 2.8376879692077637,
      "learning_rate": 6.643008146455114e-05,
      "loss": 0.5589,
      "step": 470
    },
    {
      "epoch": 1.488949205118263,
      "grad_norm": 2.8635995388031006,
      "learning_rate": 6.46114696865741e-05,
      "loss": 0.5668,
      "step": 480
    },
    {
      "epoch": 1.5199689802248932,
      "grad_norm": 2.637256622314453,
      "learning_rate": 6.277146576332657e-05,
      "loss": 0.5273,
      "step": 490
    },
    {
      "epoch": 1.5509887553315238,
      "grad_norm": 3.0905144214630127,
      "learning_rate": 6.091276358064408e-05,
      "loss": 0.5076,
      "step": 500
    },
    {
      "epoch": 1.5820085304381544,
      "grad_norm": 3.2317402362823486,
      "learning_rate": 5.903808439983428e-05,
      "loss": 0.4969,
      "step": 510
    },
    {
      "epoch": 1.613028305544785,
      "grad_norm": 3.3929502964019775,
      "learning_rate": 5.7150172873572906e-05,
      "loss": 0.4832,
      "step": 520
    },
    {
      "epoch": 1.6440480806514153,
      "grad_norm": 3.057800531387329,
      "learning_rate": 5.525179302755303e-05,
      "loss": 0.4986,
      "step": 530
    },
    {
      "epoch": 1.6750678557580456,
      "grad_norm": 3.7113149166107178,
      "learning_rate": 5.3345724213771145e-05,
      "loss": 0.492,
      "step": 540
    },
    {
      "epoch": 1.7060876308646762,
      "grad_norm": 3.076077699661255,
      "learning_rate": 5.143475704137433e-05,
      "loss": 0.4645,
      "step": 550
    },
    {
      "epoch": 1.7371074059713068,
      "grad_norm": 3.2166106700897217,
      "learning_rate": 4.9521689291026406e-05,
      "loss": 0.4516,
      "step": 560
    },
    {
      "epoch": 1.7681271810779373,
      "grad_norm": 2.9462897777557373,
      "learning_rate": 4.760932181877439e-05,
      "loss": 0.4372,
      "step": 570
    },
    {
      "epoch": 1.7991469561845677,
      "grad_norm": 2.8754382133483887,
      "learning_rate": 4.570045445541253e-05,
      "loss": 0.4232,
      "step": 580
    },
    {
      "epoch": 1.830166731291198,
      "grad_norm": 3.5116498470306396,
      "learning_rate": 4.379788190734712e-05,
      "loss": 0.4268,
      "step": 590
    },
    {
      "epoch": 1.8611865063978286,
      "grad_norm": 3.0945634841918945,
      "learning_rate": 4.190438966496407e-05,
      "loss": 0.4093,
      "step": 600
    },
    {
      "epoch": 1.8922062815044591,
      "grad_norm": 2.920431613922119,
      "learning_rate": 4.002274992448911e-05,
      "loss": 0.3945,
      "step": 610
    },
    {
      "epoch": 1.9232260566110897,
      "grad_norm": 2.7673826217651367,
      "learning_rate": 3.815571752931162e-05,
      "loss": 0.3948,
      "step": 620
    },
    {
      "epoch": 1.95424583171772,
      "grad_norm": 2.909179925918579,
      "learning_rate": 3.630602593671405e-05,
      "loss": 0.3731,
      "step": 630
    },
    {
      "epoch": 1.9852656068243504,
      "grad_norm": 2.851840019226074,
      "learning_rate": 3.4476383215912114e-05,
      "loss": 0.3476,
      "step": 640
    },
    {
      "epoch": 2.016285381930981,
      "grad_norm": 2.012725353240967,
      "learning_rate": 3.266946808326466e-05,
      "loss": 0.2479,
      "step": 650
    },
    {
      "epoch": 2.0473051570376115,
      "grad_norm": 2.0801613330841064,
      "learning_rate": 3.0887925980458154e-05,
      "loss": 0.1408,
      "step": 660
    },
    {
      "epoch": 2.078324932144242,
      "grad_norm": 1.9638011455535889,
      "learning_rate": 2.913436520140731e-05,
      "loss": 0.1383,
      "step": 670
    },
    {
      "epoch": 2.1093447072508726,
      "grad_norm": 2.354614019393921,
      "learning_rate": 2.74113530735426e-05,
      "loss": 0.1431,
      "step": 680
    },
    {
      "epoch": 2.1403644823575028,
      "grad_norm": 1.8083250522613525,
      "learning_rate": 2.5721412199075372e-05,
      "loss": 0.1253,
      "step": 690
    },
    {
      "epoch": 2.1713842574641333,
      "grad_norm": 2.4336416721343994,
      "learning_rate": 2.4067016761743515e-05,
      "loss": 0.1316,
      "step": 700
    },
    {
      "epoch": 2.202404032570764,
      "grad_norm": 2.073843240737915,
      "learning_rate": 2.2450588904444968e-05,
      "loss": 0.1251,
      "step": 710
    },
    {
      "epoch": 2.2334238076773945,
      "grad_norm": 1.7229952812194824,
      "learning_rate": 2.0874495183062503e-05,
      "loss": 0.1151,
      "step": 720
    },
    {
      "epoch": 2.2644435827840246,
      "grad_norm": 1.7413322925567627,
      "learning_rate": 1.9341043101671412e-05,
      "loss": 0.1176,
      "step": 730
    },
    {
      "epoch": 2.295463357890655,
      "grad_norm": 1.87288236618042,
      "learning_rate": 1.7852477734202954e-05,
      "loss": 0.1099,
      "step": 740
    },
    {
      "epoch": 2.3264831329972857,
      "grad_norm": 1.6436331272125244,
      "learning_rate": 1.641097843750952e-05,
      "loss": 0.1061,
      "step": 750
    },
    {
      "epoch": 2.3575029081039163,
      "grad_norm": 2.2512550354003906,
      "learning_rate": 1.5018655660644055e-05,
      "loss": 0.0849,
      "step": 760
    },
    {
      "epoch": 2.388522683210547,
      "grad_norm": 1.8482258319854736,
      "learning_rate": 1.3677547855024907e-05,
      "loss": 0.1089,
      "step": 770
    },
    {
      "epoch": 2.419542458317177,
      "grad_norm": 1.5719361305236816,
      "learning_rate": 1.2389618490009775e-05,
      "loss": 0.0837,
      "step": 780
    },
    {
      "epoch": 2.4505622334238075,
      "grad_norm": 1.8144450187683105,
      "learning_rate": 1.1156753178248564e-05,
      "loss": 0.1133,
      "step": 790
    },
    {
      "epoch": 2.481582008530438,
      "grad_norm": 2.3764026165008545,
      "learning_rate": 9.980756915023332e-06,
      "loss": 0.09,
      "step": 800
    },
    {
      "epoch": 2.5126017836370687,
      "grad_norm": 1.60674250125885,
      "learning_rate": 8.863351435617395e-06,
      "loss": 0.0831,
      "step": 810
    },
    {
      "epoch": 2.543621558743699,
      "grad_norm": 1.7593117952346802,
      "learning_rate": 7.806172694582487e-06,
      "loss": 0.0951,
      "step": 820
    },
    {
      "epoch": 2.5746413338503293,
      "grad_norm": 2.17861008644104,
      "learning_rate": 6.810768470594553e-06,
      "loss": 0.0938,
      "step": 830
    },
    {
      "epoch": 2.60566110895696,
      "grad_norm": 1.4292868375778198,
      "learning_rate": 5.878596100404743e-06,
      "loss": 0.0738,
      "step": 840
    },
    {
      "epoch": 2.6366808840635905,
      "grad_norm": 1.06781005859375,
      "learning_rate": 5.01102034520326e-06,
      "loss": 0.0893,
      "step": 850
    },
    {
      "epoch": 2.667700659170221,
      "grad_norm": 1.4993703365325928,
      "learning_rate": 4.209311392519955e-06,
      "loss": 0.0706,
      "step": 860
    },
    {
      "epoch": 2.6987204342768516,
      "grad_norm": 1.5031639337539673,
      "learning_rate": 3.4746429965867967e-06,
      "loss": 0.0752,
      "step": 870
    },
    {
      "epoch": 2.7297402093834817,
      "grad_norm": 1.7316648960113525,
      "learning_rate": 2.808090759885207e-06,
      "loss": 0.075,
      "step": 880
    },
    {
      "epoch": 2.7607599844901123,
      "grad_norm": 1.4538629055023193,
      "learning_rate": 2.2106305583936617e-06,
      "loss": 0.0768,
      "step": 890
    },
    {
      "epoch": 2.791779759596743,
      "grad_norm": 0.8957967162132263,
      "learning_rate": 1.6831371128416983e-06,
      "loss": 0.0726,
      "step": 900
    },
    {
      "epoch": 2.8227995347033734,
      "grad_norm": 1.7930282354354858,
      "learning_rate": 1.2263827080616074e-06,
      "loss": 0.0655,
      "step": 910
    },
    {
      "epoch": 2.853819309810004,
      "grad_norm": 1.678768277168274,
      "learning_rate": 8.410360623130554e-07,
      "loss": 0.0743,
      "step": 920
    },
    {
      "epoch": 2.884839084916634,
      "grad_norm": 1.4549870491027832,
      "learning_rate": 5.276613482359138e-07,
      "loss": 0.0773,
      "step": 930
    },
    {
      "epoch": 2.9158588600232647,
      "grad_norm": 1.54071044921875,
      "learning_rate": 2.867173668646583e-07,
      "loss": 0.0804,
      "step": 940
    },
    {
      "epoch": 2.9468786351298952,
      "grad_norm": 1.3247941732406616,
      "learning_rate": 1.1855687591376675e-07,
      "loss": 0.0756,
      "step": 950
    },
    {
      "epoch": 2.977898410236526,
      "grad_norm": 1.7488839626312256,
      "learning_rate": 2.342607331733415e-08,
      "loss": 0.0805,
      "step": 960
    }
  ],
  "logging_steps": 10,
  "max_steps": 966,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.822238727303332e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}