|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.47914153807761095, |
|
"eval_steps": 3000, |
|
"global_step": 10800, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002218247861470421, |
|
"grad_norm": 0.15810145437717438, |
|
"learning_rate": 2e-05, |
|
"loss": 2.8627, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.004436495722940842, |
|
"grad_norm": 0.1590433567762375, |
|
"learning_rate": 4e-05, |
|
"loss": 2.8607, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.006654743584411263, |
|
"grad_norm": 0.15798641741275787, |
|
"learning_rate": 6e-05, |
|
"loss": 2.8623, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.008872991445881684, |
|
"grad_norm": 0.16127805411815643, |
|
"learning_rate": 8e-05, |
|
"loss": 2.8608, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.011091239307352105, |
|
"grad_norm": 0.1587396264076233, |
|
"learning_rate": 0.0001, |
|
"loss": 2.8608, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.013309487168822525, |
|
"grad_norm": 0.160736083984375, |
|
"learning_rate": 0.00012, |
|
"loss": 2.8563, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.015527735030292948, |
|
"grad_norm": 0.16256989538669586, |
|
"learning_rate": 0.00014, |
|
"loss": 2.8549, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.01774598289176337, |
|
"grad_norm": 0.16194568574428558, |
|
"learning_rate": 0.00016, |
|
"loss": 2.8557, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.01996423075323379, |
|
"grad_norm": 0.15836463868618011, |
|
"learning_rate": 0.00018, |
|
"loss": 2.8545, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.02218247861470421, |
|
"grad_norm": 0.16059577465057373, |
|
"learning_rate": 0.0002, |
|
"loss": 2.8522, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.024400726476174632, |
|
"grad_norm": 0.16031378507614136, |
|
"learning_rate": 0.00022000000000000003, |
|
"loss": 2.8481, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.02661897433764505, |
|
"grad_norm": 0.16000501811504364, |
|
"learning_rate": 0.00024, |
|
"loss": 2.8431, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.028837222199115473, |
|
"grad_norm": 0.15952646732330322, |
|
"learning_rate": 0.00026000000000000003, |
|
"loss": 2.8475, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.031055470060585896, |
|
"grad_norm": 0.16443726420402527, |
|
"learning_rate": 0.00028, |
|
"loss": 2.8452, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.033273717922056315, |
|
"grad_norm": 0.1644088476896286, |
|
"learning_rate": 0.00030000000000000003, |
|
"loss": 2.8458, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.03549196578352674, |
|
"grad_norm": 0.16272033751010895, |
|
"learning_rate": 0.00032, |
|
"loss": 2.8435, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.03771021364499716, |
|
"grad_norm": 0.16485804319381714, |
|
"learning_rate": 0.00034, |
|
"loss": 2.8481, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.03992846150646758, |
|
"grad_norm": 0.1669188290834427, |
|
"learning_rate": 0.00036, |
|
"loss": 2.8555, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.042146709367938, |
|
"grad_norm": 0.16288943588733673, |
|
"learning_rate": 0.00038, |
|
"loss": 2.851, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.04436495722940842, |
|
"grad_norm": 0.1651136726140976, |
|
"learning_rate": 0.0004, |
|
"loss": 2.8443, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.04658320509087884, |
|
"grad_norm": 0.16190673410892487, |
|
"learning_rate": 0.00039999468202328424, |
|
"loss": 2.8398, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.048801452952349264, |
|
"grad_norm": 0.1649934947490692, |
|
"learning_rate": 0.00039997872837594555, |
|
"loss": 2.8371, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.051019700813819686, |
|
"grad_norm": 0.16184477508068085, |
|
"learning_rate": 0.00039995213990639536, |
|
"loss": 2.8347, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.0532379486752901, |
|
"grad_norm": 0.1629864126443863, |
|
"learning_rate": 0.0003999149180286022, |
|
"loss": 2.834, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.055456196536760524, |
|
"grad_norm": 0.1627526730298996, |
|
"learning_rate": 0.00039986706472201685, |
|
"loss": 2.8309, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.05767444439823095, |
|
"grad_norm": 0.1642647087574005, |
|
"learning_rate": 0.000399808582531467, |
|
"loss": 2.8352, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.05989269225970137, |
|
"grad_norm": 0.16397783160209656, |
|
"learning_rate": 0.000399739474567022, |
|
"loss": 2.8317, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.06211094012117179, |
|
"grad_norm": 0.16319701075553894, |
|
"learning_rate": 0.00039965974450382726, |
|
"loss": 2.8322, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.06432918798264221, |
|
"grad_norm": 0.16067005693912506, |
|
"learning_rate": 0.000399569396581909, |
|
"loss": 2.8279, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.06654743584411263, |
|
"grad_norm": 0.16118553280830383, |
|
"learning_rate": 0.00039946843560594866, |
|
"loss": 2.8323, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.06876568370558306, |
|
"grad_norm": 0.16291728615760803, |
|
"learning_rate": 0.0003993568669450274, |
|
"loss": 2.8301, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.07098393156705347, |
|
"grad_norm": 0.1590035855770111, |
|
"learning_rate": 0.0003992346965323407, |
|
"loss": 2.8214, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.07320217942852389, |
|
"grad_norm": 0.16236472129821777, |
|
"learning_rate": 0.00039910193086488253, |
|
"loss": 2.8242, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.07542042728999432, |
|
"grad_norm": 0.1617489606142044, |
|
"learning_rate": 0.0003989585770031003, |
|
"loss": 2.8231, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.07763867515146473, |
|
"grad_norm": 0.15960238873958588, |
|
"learning_rate": 0.000398804642570519, |
|
"loss": 2.8248, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.07985692301293516, |
|
"grad_norm": 0.16391754150390625, |
|
"learning_rate": 0.0003986401357533358, |
|
"loss": 2.8222, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.08207517087440558, |
|
"grad_norm": 0.16161847114562988, |
|
"learning_rate": 0.000398465065299985, |
|
"loss": 2.8153, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.084293418735876, |
|
"grad_norm": 0.16447125375270844, |
|
"learning_rate": 0.00039827944052067265, |
|
"loss": 2.818, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.08651166659734642, |
|
"grad_norm": 0.16384591162204742, |
|
"learning_rate": 0.0003980832712868812, |
|
"loss": 2.8093, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.08872991445881684, |
|
"grad_norm": 0.16317427158355713, |
|
"learning_rate": 0.0003978765680308447, |
|
"loss": 2.8113, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.09094816232028727, |
|
"grad_norm": 0.16197824478149414, |
|
"learning_rate": 0.00039765934174499436, |
|
"loss": 2.8134, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.09316641018175768, |
|
"grad_norm": 0.16196754574775696, |
|
"learning_rate": 0.00039743160398137344, |
|
"loss": 2.8147, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.0953846580432281, |
|
"grad_norm": 0.16696424782276154, |
|
"learning_rate": 0.00039719336685102314, |
|
"loss": 2.811, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.09760290590469853, |
|
"grad_norm": 0.16266262531280518, |
|
"learning_rate": 0.0003969446430233386, |
|
"loss": 2.8103, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.09982115376616894, |
|
"grad_norm": 0.16161397099494934, |
|
"learning_rate": 0.0003966854457253951, |
|
"loss": 2.8017, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.10203940162763937, |
|
"grad_norm": 0.1631053388118744, |
|
"learning_rate": 0.0003964157887412445, |
|
"loss": 2.8034, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.10425764948910979, |
|
"grad_norm": 0.16185788810253143, |
|
"learning_rate": 0.00039613568641118255, |
|
"loss": 2.8027, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.1064758973505802, |
|
"grad_norm": 0.16428661346435547, |
|
"learning_rate": 0.00039584515363098584, |
|
"loss": 2.8031, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.10869414521205063, |
|
"grad_norm": 0.1625480055809021, |
|
"learning_rate": 0.00039554420585112, |
|
"loss": 2.7968, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.11091239307352105, |
|
"grad_norm": 0.1638619303703308, |
|
"learning_rate": 0.0003952328590759179, |
|
"loss": 2.8007, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.11313064093499148, |
|
"grad_norm": 0.16504357755184174, |
|
"learning_rate": 0.0003949111298627286, |
|
"loss": 2.7921, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.1153488887964619, |
|
"grad_norm": 0.16429375112056732, |
|
"learning_rate": 0.0003945790353210367, |
|
"loss": 2.7951, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.11756713665793232, |
|
"grad_norm": 0.166097030043602, |
|
"learning_rate": 0.0003942365931115526, |
|
"loss": 2.7948, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.11978538451940274, |
|
"grad_norm": 0.16275139153003693, |
|
"learning_rate": 0.0003938838214452733, |
|
"loss": 2.79, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.12200363238087315, |
|
"grad_norm": 0.16379590332508087, |
|
"learning_rate": 0.0003935207390825137, |
|
"loss": 2.7896, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.12422188024234358, |
|
"grad_norm": 0.16332408785820007, |
|
"learning_rate": 0.0003931473653319095, |
|
"loss": 2.7848, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.126440128103814, |
|
"grad_norm": 0.16235879063606262, |
|
"learning_rate": 0.00039276372004938987, |
|
"loss": 2.7836, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.12865837596528443, |
|
"grad_norm": 0.1654053032398224, |
|
"learning_rate": 0.00039236982363712145, |
|
"loss": 2.7845, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.13087662382675483, |
|
"grad_norm": 0.16393068432807922, |
|
"learning_rate": 0.00039196569704242376, |
|
"loss": 2.7796, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.13309487168822526, |
|
"grad_norm": 0.16517628729343414, |
|
"learning_rate": 0.0003915513617566551, |
|
"loss": 2.7738, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.13309487168822526, |
|
"eval_accuracy": 0.4247227650219834, |
|
"eval_loss": 2.8996665477752686, |
|
"eval_runtime": 243.2366, |
|
"eval_samples_per_second": 8.222, |
|
"eval_steps_per_second": 1.028, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.1353131195496957, |
|
"grad_norm": 0.16540038585662842, |
|
"learning_rate": 0.00039112683981406936, |
|
"loss": 2.7708, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.13753136741116612, |
|
"grad_norm": 0.16403205692768097, |
|
"learning_rate": 0.00039069215379064465, |
|
"loss": 2.7709, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.13974961527263652, |
|
"grad_norm": 0.16498889029026031, |
|
"learning_rate": 0.0003902473268028826, |
|
"loss": 2.7683, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.14196786313410695, |
|
"grad_norm": 0.16713927686214447, |
|
"learning_rate": 0.00038979238250657863, |
|
"loss": 2.7578, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.14418611099557738, |
|
"grad_norm": 0.16905058920383453, |
|
"learning_rate": 0.00038932734509556467, |
|
"loss": 2.7602, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.14640435885704778, |
|
"grad_norm": 0.16431044042110443, |
|
"learning_rate": 0.0003888522393004219, |
|
"loss": 2.7685, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.1486226067185182, |
|
"grad_norm": 0.163705512881279, |
|
"learning_rate": 0.00038836709038716583, |
|
"loss": 2.8434, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.15084085457998864, |
|
"grad_norm": 0.1622520387172699, |
|
"learning_rate": 0.0003878719241559027, |
|
"loss": 2.8349, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.15305910244145907, |
|
"grad_norm": 0.16072827577590942, |
|
"learning_rate": 0.00038736676693945746, |
|
"loss": 2.8369, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.15527735030292947, |
|
"grad_norm": 0.16206832230091095, |
|
"learning_rate": 0.0003868516456019733, |
|
"loss": 2.8404, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.1574955981643999, |
|
"grad_norm": 0.16148249804973602, |
|
"learning_rate": 0.0003863265875374829, |
|
"loss": 2.836, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.15971384602587033, |
|
"grad_norm": 0.16401226818561554, |
|
"learning_rate": 0.0003857916206684519, |
|
"loss": 2.8369, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.16193209388734073, |
|
"grad_norm": 0.15987250208854675, |
|
"learning_rate": 0.00038524677344429386, |
|
"loss": 2.8363, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.16415034174881116, |
|
"grad_norm": 0.16117645800113678, |
|
"learning_rate": 0.00038469207483985725, |
|
"loss": 2.8426, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.1663685896102816, |
|
"grad_norm": 0.16374363005161285, |
|
"learning_rate": 0.00038412755435388474, |
|
"loss": 2.8416, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.168586837471752, |
|
"grad_norm": 0.16495129466056824, |
|
"learning_rate": 0.0003835532420074444, |
|
"loss": 2.8396, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.17080508533322242, |
|
"grad_norm": 0.16315814852714539, |
|
"learning_rate": 0.0003829691683423329, |
|
"loss": 2.8358, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.17302333319469285, |
|
"grad_norm": 0.16098596155643463, |
|
"learning_rate": 0.00038237536441945193, |
|
"loss": 2.8354, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.17524158105616328, |
|
"grad_norm": 0.16226187348365784, |
|
"learning_rate": 0.00038177186181715577, |
|
"loss": 2.8352, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.17745982891763368, |
|
"grad_norm": 0.15939714014530182, |
|
"learning_rate": 0.00038115869262957233, |
|
"loss": 2.835, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.1796780767791041, |
|
"grad_norm": 0.1622256189584732, |
|
"learning_rate": 0.00038053588946489615, |
|
"loss": 2.8391, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.18189632464057454, |
|
"grad_norm": 0.16176052391529083, |
|
"learning_rate": 0.0003799034854436545, |
|
"loss": 2.8371, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.18411457250204494, |
|
"grad_norm": 0.16316720843315125, |
|
"learning_rate": 0.0003792615141969462, |
|
"loss": 2.8365, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.18633282036351537, |
|
"grad_norm": 0.16207610070705414, |
|
"learning_rate": 0.0003786100098646524, |
|
"loss": 2.8346, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.1885510682249858, |
|
"grad_norm": 0.1638234406709671, |
|
"learning_rate": 0.000377949007093622, |
|
"loss": 2.8319, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.1907693160864562, |
|
"grad_norm": 0.1628599315881729, |
|
"learning_rate": 0.0003772785410358283, |
|
"loss": 2.8369, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.19298756394792663, |
|
"grad_norm": 0.1653933823108673, |
|
"learning_rate": 0.00037659864734650026, |
|
"loss": 2.8304, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.19520581180939706, |
|
"grad_norm": 0.16370812058448792, |
|
"learning_rate": 0.0003759093621822259, |
|
"loss": 2.8369, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.19742405967086749, |
|
"grad_norm": 0.1629050225019455, |
|
"learning_rate": 0.0003752107221990298, |
|
"loss": 2.8339, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.1996423075323379, |
|
"grad_norm": 0.1610899269580841, |
|
"learning_rate": 0.00037450276455042354, |
|
"loss": 2.829, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.20186055539380832, |
|
"grad_norm": 0.1629941165447235, |
|
"learning_rate": 0.00037378552688543005, |
|
"loss": 2.8351, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.20407880325527875, |
|
"grad_norm": 0.16439735889434814, |
|
"learning_rate": 0.0003730590473465814, |
|
"loss": 2.8316, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.20629705111674915, |
|
"grad_norm": 0.16572241485118866, |
|
"learning_rate": 0.00037232336456789023, |
|
"loss": 2.8335, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.20851529897821958, |
|
"grad_norm": 0.16380038857460022, |
|
"learning_rate": 0.00037157851767279543, |
|
"loss": 2.8286, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.21073354683969, |
|
"grad_norm": 0.16284549236297607, |
|
"learning_rate": 0.00037082454627208156, |
|
"loss": 2.8301, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.2129517947011604, |
|
"grad_norm": 0.16597482562065125, |
|
"learning_rate": 0.0003700614904617721, |
|
"loss": 2.8323, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.21517004256263084, |
|
"grad_norm": 0.16378666460514069, |
|
"learning_rate": 0.0003692893908209973, |
|
"loss": 2.8299, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.21738829042410127, |
|
"grad_norm": 0.1630343496799469, |
|
"learning_rate": 0.0003685082884098363, |
|
"loss": 2.8333, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.2196065382855717, |
|
"grad_norm": 0.16490814089775085, |
|
"learning_rate": 0.00036771822476713346, |
|
"loss": 2.8307, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.2218247861470421, |
|
"grad_norm": 0.1655721366405487, |
|
"learning_rate": 0.00036691924190828935, |
|
"loss": 2.8301, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.22404303400851253, |
|
"grad_norm": 0.16776245832443237, |
|
"learning_rate": 0.0003661113823230264, |
|
"loss": 2.8228, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.22626128186998296, |
|
"grad_norm": 0.1626349687576294, |
|
"learning_rate": 0.00036529468897312926, |
|
"loss": 2.8262, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.22847952973145336, |
|
"grad_norm": 0.16331753134727478, |
|
"learning_rate": 0.00036446920529016, |
|
"loss": 2.8282, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.2306977775929238, |
|
"grad_norm": 0.1676001250743866, |
|
"learning_rate": 0.00036363497517314877, |
|
"loss": 2.8313, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.23291602545439422, |
|
"grad_norm": 0.16441357135772705, |
|
"learning_rate": 0.000362792042986259, |
|
"loss": 2.8278, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.23513427331586464, |
|
"grad_norm": 0.16601622104644775, |
|
"learning_rate": 0.000361940453556428, |
|
"loss": 2.8303, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.23735252117733505, |
|
"grad_norm": 0.1679011583328247, |
|
"learning_rate": 0.0003610802521709833, |
|
"loss": 2.8252, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.23957076903880548, |
|
"grad_norm": 0.1650955229997635, |
|
"learning_rate": 0.0003602114845752345, |
|
"loss": 2.8299, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.2417890169002759, |
|
"grad_norm": 0.16651777923107147, |
|
"learning_rate": 0.00035933419697004, |
|
"loss": 2.832, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.2440072647617463, |
|
"grad_norm": 0.166709303855896, |
|
"learning_rate": 0.00035844843600935024, |
|
"loss": 2.8262, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.24622551262321674, |
|
"grad_norm": 0.16586416959762573, |
|
"learning_rate": 0.000357554248797727, |
|
"loss": 2.8255, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.24844376048468717, |
|
"grad_norm": 0.1647614985704422, |
|
"learning_rate": 0.00035665168288783795, |
|
"loss": 2.8298, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.2506620083461576, |
|
"grad_norm": 0.16310204565525055, |
|
"learning_rate": 0.000355740786277928, |
|
"loss": 2.8273, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.252880256207628, |
|
"grad_norm": 0.1629767119884491, |
|
"learning_rate": 0.00035482160740926683, |
|
"loss": 2.8231, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.2550985040690984, |
|
"grad_norm": 0.16427451372146606, |
|
"learning_rate": 0.00035389419516357253, |
|
"loss": 2.8188, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.25731675193056885, |
|
"grad_norm": 0.1655891388654709, |
|
"learning_rate": 0.0003529585988604125, |
|
"loss": 2.8258, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.25953499979203926, |
|
"grad_norm": 0.16402335464954376, |
|
"learning_rate": 0.0003520148682545803, |
|
"loss": 2.8254, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.26175324765350966, |
|
"grad_norm": 0.1638861894607544, |
|
"learning_rate": 0.0003510630535334497, |
|
"loss": 2.8298, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.2639714955149801, |
|
"grad_norm": 0.16864845156669617, |
|
"learning_rate": 0.0003501032053143061, |
|
"loss": 2.8238, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.2661897433764505, |
|
"grad_norm": 0.16578635573387146, |
|
"learning_rate": 0.0003491353746416541, |
|
"loss": 2.8225, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.2661897433764505, |
|
"eval_accuracy": 0.4264626282364436, |
|
"eval_loss": 2.8843319416046143, |
|
"eval_runtime": 242.3694, |
|
"eval_samples_per_second": 8.252, |
|
"eval_steps_per_second": 1.031, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.268407991237921, |
|
"grad_norm": 0.16673897206783295, |
|
"learning_rate": 0.00034815961298450377, |
|
"loss": 2.823, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.2706262390993914, |
|
"grad_norm": 0.16588376462459564, |
|
"learning_rate": 0.0003471759722336326, |
|
"loss": 2.8193, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.2728444869608618, |
|
"grad_norm": 0.16813361644744873, |
|
"learning_rate": 0.00034618450469882687, |
|
"loss": 2.8267, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.27506273482233223, |
|
"grad_norm": 0.16656942665576935, |
|
"learning_rate": 0.0003451852631060991, |
|
"loss": 2.8219, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.27728098268380263, |
|
"grad_norm": 0.1666443794965744, |
|
"learning_rate": 0.0003441783005948846, |
|
"loss": 2.8233, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.27949923054527304, |
|
"grad_norm": 0.1673704832792282, |
|
"learning_rate": 0.0003431636707152152, |
|
"loss": 2.824, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.2817174784067435, |
|
"grad_norm": 0.16707104444503784, |
|
"learning_rate": 0.00034214142742487177, |
|
"loss": 2.8221, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.2839357262682139, |
|
"grad_norm": 0.16775397956371307, |
|
"learning_rate": 0.0003411116250865143, |
|
"loss": 2.8234, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.2861539741296843, |
|
"grad_norm": 0.16813720762729645, |
|
"learning_rate": 0.0003400743184647915, |
|
"loss": 2.8258, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.28837222199115475, |
|
"grad_norm": 0.16362161934375763, |
|
"learning_rate": 0.00033902956272342783, |
|
"loss": 2.8232, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.29059046985262516, |
|
"grad_norm": 0.16950780153274536, |
|
"learning_rate": 0.00033797741342229054, |
|
"loss": 2.821, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 0.29280871771409556, |
|
"grad_norm": 0.1657160073518753, |
|
"learning_rate": 0.00033691792651443435, |
|
"loss": 2.8181, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.295026965575566, |
|
"grad_norm": 0.1689310073852539, |
|
"learning_rate": 0.0003358511583431264, |
|
"loss": 2.8257, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 0.2972452134370364, |
|
"grad_norm": 0.16674135625362396, |
|
"learning_rate": 0.00033477716563884956, |
|
"loss": 2.8209, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.2994634612985068, |
|
"grad_norm": 0.16600748896598816, |
|
"learning_rate": 0.00033369600551628586, |
|
"loss": 2.8227, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.3016817091599773, |
|
"grad_norm": 0.16666853427886963, |
|
"learning_rate": 0.0003326077354712789, |
|
"loss": 2.8199, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.3038999570214477, |
|
"grad_norm": 0.1671936959028244, |
|
"learning_rate": 0.00033151241337777624, |
|
"loss": 2.82, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 0.30611820488291813, |
|
"grad_norm": 0.1675061583518982, |
|
"learning_rate": 0.00033041009748475166, |
|
"loss": 2.8246, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.30833645274438853, |
|
"grad_norm": 0.16512750089168549, |
|
"learning_rate": 0.0003293008464131079, |
|
"loss": 2.8178, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 0.31055470060585894, |
|
"grad_norm": 0.1670486181974411, |
|
"learning_rate": 0.0003281847191525585, |
|
"loss": 2.8185, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.3127729484673294, |
|
"grad_norm": 0.1692744940519333, |
|
"learning_rate": 0.0003270617750584913, |
|
"loss": 2.8184, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 0.3149911963287998, |
|
"grad_norm": 0.16573506593704224, |
|
"learning_rate": 0.0003259320738488119, |
|
"loss": 2.823, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.3172094441902702, |
|
"grad_norm": 0.17004618048667908, |
|
"learning_rate": 0.00032479567560076745, |
|
"loss": 2.8174, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 0.31942769205174065, |
|
"grad_norm": 0.16867642104625702, |
|
"learning_rate": 0.00032365264074775223, |
|
"loss": 2.8183, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.32164593991321105, |
|
"grad_norm": 0.16543437540531158, |
|
"learning_rate": 0.00032250303007609366, |
|
"loss": 2.8178, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.32386418777468146, |
|
"grad_norm": 0.16606374084949493, |
|
"learning_rate": 0.0003213469047218194, |
|
"loss": 2.8182, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.3260824356361519, |
|
"grad_norm": 0.1708928942680359, |
|
"learning_rate": 0.0003201843261674067, |
|
"loss": 2.8194, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 0.3283006834976223, |
|
"grad_norm": 0.16661237180233002, |
|
"learning_rate": 0.00031901535623851245, |
|
"loss": 2.8226, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.3305189313590927, |
|
"grad_norm": 0.16710756719112396, |
|
"learning_rate": 0.0003178400571006852, |
|
"loss": 2.8187, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 0.3327371792205632, |
|
"grad_norm": 0.16679760813713074, |
|
"learning_rate": 0.00031665849125605937, |
|
"loss": 2.8163, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.3349554270820336, |
|
"grad_norm": 0.16872857511043549, |
|
"learning_rate": 0.00031547072154003154, |
|
"loss": 2.8147, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 0.337173674943504, |
|
"grad_norm": 0.1672954261302948, |
|
"learning_rate": 0.0003142768111179187, |
|
"loss": 2.8167, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.33939192280497443, |
|
"grad_norm": 0.16654394567012787, |
|
"learning_rate": 0.00031307682348159907, |
|
"loss": 2.816, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 0.34161017066644483, |
|
"grad_norm": 0.16810841858386993, |
|
"learning_rate": 0.00031187082244613567, |
|
"loss": 2.8139, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.34382841852791524, |
|
"grad_norm": 0.1682497262954712, |
|
"learning_rate": 0.00031065887214638284, |
|
"loss": 2.8157, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.3460466663893857, |
|
"grad_norm": 0.17154847085475922, |
|
"learning_rate": 0.00030944103703357524, |
|
"loss": 2.8143, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.3482649142508561, |
|
"grad_norm": 0.16658836603164673, |
|
"learning_rate": 0.00030821738187190075, |
|
"loss": 2.8143, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 0.35048316211232655, |
|
"grad_norm": 0.16820305585861206, |
|
"learning_rate": 0.00030698797173505586, |
|
"loss": 2.8157, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.35270140997379695, |
|
"grad_norm": 0.16843385994434357, |
|
"learning_rate": 0.0003057528720027853, |
|
"loss": 2.8103, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.35491965783526735, |
|
"grad_norm": 0.17145898938179016, |
|
"learning_rate": 0.0003045121483574054, |
|
"loss": 2.8161, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.3571379056967378, |
|
"grad_norm": 0.1709701269865036, |
|
"learning_rate": 0.00030326586678031066, |
|
"loss": 2.8134, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 0.3593561535582082, |
|
"grad_norm": 0.16859866678714752, |
|
"learning_rate": 0.0003020140935484653, |
|
"loss": 2.818, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.3615744014196786, |
|
"grad_norm": 0.16738031804561615, |
|
"learning_rate": 0.00030075689523087804, |
|
"loss": 2.8128, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 0.36379264928114907, |
|
"grad_norm": 0.1693500131368637, |
|
"learning_rate": 0.00029949433868506293, |
|
"loss": 2.8138, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.3660108971426195, |
|
"grad_norm": 0.16915106773376465, |
|
"learning_rate": 0.00029822649105348294, |
|
"loss": 2.8209, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.3682291450040899, |
|
"grad_norm": 0.17108069360256195, |
|
"learning_rate": 0.00029695341975998006, |
|
"loss": 2.8174, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.37044739286556033, |
|
"grad_norm": 0.16659317910671234, |
|
"learning_rate": 0.00029567519250618907, |
|
"loss": 2.8153, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 0.37266564072703073, |
|
"grad_norm": 0.16678906977176666, |
|
"learning_rate": 0.0002943918772679379, |
|
"loss": 2.8163, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.37488388858850114, |
|
"grad_norm": 0.16928167641162872, |
|
"learning_rate": 0.00029310354229163197, |
|
"loss": 2.8165, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 0.3771021364499716, |
|
"grad_norm": 0.1695391833782196, |
|
"learning_rate": 0.0002918102560906254, |
|
"loss": 2.8197, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.379320384311442, |
|
"grad_norm": 0.17006346583366394, |
|
"learning_rate": 0.0002905120874415772, |
|
"loss": 2.8172, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 0.3815386321729124, |
|
"grad_norm": 0.16821132600307465, |
|
"learning_rate": 0.0002892091053807939, |
|
"loss": 2.8137, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.38375688003438285, |
|
"grad_norm": 0.17077401280403137, |
|
"learning_rate": 0.000287901379200558, |
|
"loss": 2.8174, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 0.38597512789585325, |
|
"grad_norm": 0.17006562650203705, |
|
"learning_rate": 0.0002865889784454435, |
|
"loss": 2.813, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.3881933757573237, |
|
"grad_norm": 0.16847462952136993, |
|
"learning_rate": 0.0002852719729086167, |
|
"loss": 2.8158, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 0.3904116236187941, |
|
"grad_norm": 0.16790613532066345, |
|
"learning_rate": 0.0002839504326281256, |
|
"loss": 2.816, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.3926298714802645, |
|
"grad_norm": 0.16898341476917267, |
|
"learning_rate": 0.00028262442788317446, |
|
"loss": 2.8143, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 0.39484811934173497, |
|
"grad_norm": 0.17099575698375702, |
|
"learning_rate": 0.00028129402919038695, |
|
"loss": 2.812, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.3970663672032054, |
|
"grad_norm": 0.17063932120800018, |
|
"learning_rate": 0.00027995930730005577, |
|
"loss": 2.815, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 0.3992846150646758, |
|
"grad_norm": 0.1704034060239792, |
|
"learning_rate": 0.00027862033319238025, |
|
"loss": 2.8144, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.3992846150646758, |
|
"eval_accuracy": 0.42786541279921836, |
|
"eval_loss": 2.8759515285491943, |
|
"eval_runtime": 250.6732, |
|
"eval_samples_per_second": 7.979, |
|
"eval_steps_per_second": 0.997, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.40150286292614623, |
|
"grad_norm": 0.1675969511270523, |
|
"learning_rate": 0.0002772771780736917, |
|
"loss": 2.8128, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 0.40372111078761663, |
|
"grad_norm": 0.1697956621646881, |
|
"learning_rate": 0.0002759299133726665, |
|
"loss": 2.8121, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.40593935864908703, |
|
"grad_norm": 0.1710100620985031, |
|
"learning_rate": 0.00027457861073652785, |
|
"loss": 2.8156, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 0.4081576065105575, |
|
"grad_norm": 0.16877809166908264, |
|
"learning_rate": 0.00027322334202723527, |
|
"loss": 2.815, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.4103758543720279, |
|
"grad_norm": 0.17122440040111542, |
|
"learning_rate": 0.0002718641793176631, |
|
"loss": 2.8119, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 0.4125941022334983, |
|
"grad_norm": 0.16771045327186584, |
|
"learning_rate": 0.0002705011948877679, |
|
"loss": 2.808, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.41481235009496875, |
|
"grad_norm": 0.16941729187965393, |
|
"learning_rate": 0.0002691344612207442, |
|
"loss": 2.8121, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 0.41703059795643915, |
|
"grad_norm": 0.1719992607831955, |
|
"learning_rate": 0.00026776405099917014, |
|
"loss": 2.8094, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.41924884581790955, |
|
"grad_norm": 0.1693243533372879, |
|
"learning_rate": 0.00026639003710114223, |
|
"loss": 2.8103, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 0.42146709367938, |
|
"grad_norm": 0.17014500498771667, |
|
"learning_rate": 0.0002650124925963998, |
|
"loss": 2.8129, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.4236853415408504, |
|
"grad_norm": 0.1709510087966919, |
|
"learning_rate": 0.00026363149074243867, |
|
"loss": 2.8084, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 0.4259035894023208, |
|
"grad_norm": 0.16937118768692017, |
|
"learning_rate": 0.0002622471049806159, |
|
"loss": 2.814, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.42812183726379127, |
|
"grad_norm": 0.1713036149740219, |
|
"learning_rate": 0.00026085940893224403, |
|
"loss": 2.8162, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 0.4303400851252617, |
|
"grad_norm": 0.17020347714424133, |
|
"learning_rate": 0.0002594684763946758, |
|
"loss": 2.8116, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.43255833298673213, |
|
"grad_norm": 0.16786696016788483, |
|
"learning_rate": 0.0002580743813373796, |
|
"loss": 2.8111, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.43477658084820253, |
|
"grad_norm": 0.17273075878620148, |
|
"learning_rate": 0.00025667719789800606, |
|
"loss": 2.8131, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.43699482870967293, |
|
"grad_norm": 0.16986466944217682, |
|
"learning_rate": 0.00025527700037844515, |
|
"loss": 2.8139, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 0.4392130765711434, |
|
"grad_norm": 0.17129731178283691, |
|
"learning_rate": 0.00025387386324087494, |
|
"loss": 2.8125, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.4414313244326138, |
|
"grad_norm": 0.16890868544578552, |
|
"learning_rate": 0.00025246786110380163, |
|
"loss": 2.8142, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 0.4436495722940842, |
|
"grad_norm": 0.17167522013187408, |
|
"learning_rate": 0.00025105906873809154, |
|
"loss": 2.8142, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.44586782015555465, |
|
"grad_norm": 0.17136669158935547, |
|
"learning_rate": 0.0002496475610629947, |
|
"loss": 2.8112, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 0.44808606801702505, |
|
"grad_norm": 0.16926760971546173, |
|
"learning_rate": 0.00024823341314216056, |
|
"loss": 2.8156, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.45030431587849545, |
|
"grad_norm": 0.16898435354232788, |
|
"learning_rate": 0.00024681670017964627, |
|
"loss": 2.8079, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 0.4525225637399659, |
|
"grad_norm": 0.17237040400505066, |
|
"learning_rate": 0.0002453974975159173, |
|
"loss": 2.813, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.4547408116014363, |
|
"grad_norm": 0.16995486617088318, |
|
"learning_rate": 0.00024397588062384095, |
|
"loss": 2.8117, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 0.4569590594629067, |
|
"grad_norm": 0.17290563881397247, |
|
"learning_rate": 0.00024255192510467245, |
|
"loss": 2.8121, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.45917730732437717, |
|
"grad_norm": 0.17059782147407532, |
|
"learning_rate": 0.00024112570668403472, |
|
"loss": 2.8138, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 0.4613955551858476, |
|
"grad_norm": 0.17196382582187653, |
|
"learning_rate": 0.00023969730120789132, |
|
"loss": 2.8095, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.463613803047318, |
|
"grad_norm": 0.16942380368709564, |
|
"learning_rate": 0.00023826678463851285, |
|
"loss": 2.8124, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 0.46583205090878843, |
|
"grad_norm": 0.17288681864738464, |
|
"learning_rate": 0.00023683423305043749, |
|
"loss": 2.813, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.46805029877025883, |
|
"grad_norm": 0.17040428519248962, |
|
"learning_rate": 0.00023539972262642502, |
|
"loss": 2.8141, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 0.4702685466317293, |
|
"grad_norm": 0.17321184277534485, |
|
"learning_rate": 0.00023396332965340585, |
|
"loss": 2.8146, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.4724867944931997, |
|
"grad_norm": 0.17026926577091217, |
|
"learning_rate": 0.00023252513051842373, |
|
"loss": 2.8086, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 0.4747050423546701, |
|
"grad_norm": 0.1710352748632431, |
|
"learning_rate": 0.00023108520170457398, |
|
"loss": 2.8099, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.47692329021614055, |
|
"grad_norm": 0.17067080736160278, |
|
"learning_rate": 0.00022964361978693542, |
|
"loss": 2.8099, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 0.47914153807761095, |
|
"grad_norm": 0.17244164645671844, |
|
"learning_rate": 0.0002282004614284989, |
|
"loss": 2.8054, |
|
"step": 10800 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 22540, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.5864483085358727e+19, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|