|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 939, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03194888178913738, |
|
"grad_norm": 2.244253158569336, |
|
"learning_rate": 9.997201868901463e-06, |
|
"loss": 0.6628, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06389776357827476, |
|
"grad_norm": 2.3703079223632812, |
|
"learning_rate": 9.988810607420912e-06, |
|
"loss": 0.5322, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09584664536741214, |
|
"grad_norm": 2.0787246227264404, |
|
"learning_rate": 9.974835607498224e-06, |
|
"loss": 0.5513, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.12779552715654952, |
|
"grad_norm": 1.8989689350128174, |
|
"learning_rate": 9.955292510686156e-06, |
|
"loss": 0.5212, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1597444089456869, |
|
"grad_norm": 2.0805258750915527, |
|
"learning_rate": 9.930203190643491e-06, |
|
"loss": 0.5511, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.19169329073482427, |
|
"grad_norm": 2.0137438774108887, |
|
"learning_rate": 9.899595728652883e-06, |
|
"loss": 0.5134, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.22364217252396165, |
|
"grad_norm": 2.0841853618621826, |
|
"learning_rate": 9.863504382190838e-06, |
|
"loss": 0.4956, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.25559105431309903, |
|
"grad_norm": 1.5514801740646362, |
|
"learning_rate": 9.821969546584922e-06, |
|
"loss": 0.5168, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.28753993610223644, |
|
"grad_norm": 2.0024027824401855, |
|
"learning_rate": 9.775037709801206e-06, |
|
"loss": 0.4878, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3194888178913738, |
|
"grad_norm": 1.877454161643982, |
|
"learning_rate": 9.722761400412496e-06, |
|
"loss": 0.5156, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3514376996805112, |
|
"grad_norm": 1.4342358112335205, |
|
"learning_rate": 9.6651991288056e-06, |
|
"loss": 0.5025, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.38338658146964855, |
|
"grad_norm": 1.7300364971160889, |
|
"learning_rate": 9.602415321693434e-06, |
|
"loss": 0.5102, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.41533546325878595, |
|
"grad_norm": 1.5914117097854614, |
|
"learning_rate": 9.534480250005263e-06, |
|
"loss": 0.5074, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4472843450479233, |
|
"grad_norm": 1.7477545738220215, |
|
"learning_rate": 9.461469950235795e-06, |
|
"loss": 0.5025, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4792332268370607, |
|
"grad_norm": 1.8615833520889282, |
|
"learning_rate": 9.38346613934115e-06, |
|
"loss": 0.5066, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5111821086261981, |
|
"grad_norm": 1.7704814672470093, |
|
"learning_rate": 9.300556123276955e-06, |
|
"loss": 0.4952, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5431309904153354, |
|
"grad_norm": 1.7545205354690552, |
|
"learning_rate": 9.212832699280942e-06, |
|
"loss": 0.5093, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5750798722044729, |
|
"grad_norm": 2.059382438659668, |
|
"learning_rate": 9.120394052009412e-06, |
|
"loss": 0.4938, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6070287539936102, |
|
"grad_norm": 1.4858242273330688, |
|
"learning_rate": 9.023343643643821e-06, |
|
"loss": 0.4984, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6389776357827476, |
|
"grad_norm": 1.5544804334640503, |
|
"learning_rate": 8.921790098090477e-06, |
|
"loss": 0.4969, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.670926517571885, |
|
"grad_norm": 1.8466405868530273, |
|
"learning_rate": 8.815847079402972e-06, |
|
"loss": 0.5071, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.7028753993610224, |
|
"grad_norm": 1.684263825416565, |
|
"learning_rate": 8.705633164563413e-06, |
|
"loss": 0.4731, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.7348242811501597, |
|
"grad_norm": 1.578497052192688, |
|
"learning_rate": 8.591271710764839e-06, |
|
"loss": 0.4741, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7667731629392971, |
|
"grad_norm": 1.6205824613571167, |
|
"learning_rate": 8.472890717343391e-06, |
|
"loss": 0.4648, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7987220447284346, |
|
"grad_norm": 1.6458181142807007, |
|
"learning_rate": 8.350622682514735e-06, |
|
"loss": 0.5115, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8306709265175719, |
|
"grad_norm": 1.6952828168869019, |
|
"learning_rate": 8.224604455075115e-06, |
|
"loss": 0.4802, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8626198083067093, |
|
"grad_norm": 1.5683045387268066, |
|
"learning_rate": 8.094977081233006e-06, |
|
"loss": 0.5013, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8945686900958466, |
|
"grad_norm": 1.5408000946044922, |
|
"learning_rate": 7.961885646742793e-06, |
|
"loss": 0.4897, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.9265175718849841, |
|
"grad_norm": 1.5393075942993164, |
|
"learning_rate": 7.825479114517197e-06, |
|
"loss": 0.4759, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9584664536741214, |
|
"grad_norm": 1.638293743133545, |
|
"learning_rate": 7.685910157900158e-06, |
|
"loss": 0.4782, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9904153354632588, |
|
"grad_norm": 1.3112537860870361, |
|
"learning_rate": 7.5433349897868445e-06, |
|
"loss": 0.4874, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.0223642172523961, |
|
"grad_norm": 1.236051321029663, |
|
"learning_rate": 7.397913187781962e-06, |
|
"loss": 0.3528, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.0543130990415335, |
|
"grad_norm": 1.8970476388931274, |
|
"learning_rate": 7.249807515592149e-06, |
|
"loss": 0.2994, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.0862619808306708, |
|
"grad_norm": 1.694257378578186, |
|
"learning_rate": 7.099183740852296e-06, |
|
"loss": 0.2844, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.1182108626198084, |
|
"grad_norm": 1.5203874111175537, |
|
"learning_rate": 6.946210449589714e-06, |
|
"loss": 0.2881, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.1501597444089458, |
|
"grad_norm": 1.5055556297302246, |
|
"learning_rate": 6.791058857533814e-06, |
|
"loss": 0.2746, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.182108626198083, |
|
"grad_norm": 2.622091293334961, |
|
"learning_rate": 6.633902618482484e-06, |
|
"loss": 0.283, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.2140575079872205, |
|
"grad_norm": 1.4760558605194092, |
|
"learning_rate": 6.474917629939652e-06, |
|
"loss": 0.2775, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.2460063897763578, |
|
"grad_norm": 1.5865486860275269, |
|
"learning_rate": 6.314281836241573e-06, |
|
"loss": 0.2881, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.2779552715654952, |
|
"grad_norm": 1.7005813121795654, |
|
"learning_rate": 6.1521750293922035e-06, |
|
"loss": 0.2846, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.3099041533546325, |
|
"grad_norm": 1.4311730861663818, |
|
"learning_rate": 5.988778647830554e-06, |
|
"loss": 0.2871, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.34185303514377, |
|
"grad_norm": 1.6767691373825073, |
|
"learning_rate": 5.824275573355278e-06, |
|
"loss": 0.281, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.3738019169329074, |
|
"grad_norm": 1.725629448890686, |
|
"learning_rate": 5.658849926433774e-06, |
|
"loss": 0.2917, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.4057507987220448, |
|
"grad_norm": 1.8225221633911133, |
|
"learning_rate": 5.4926868601249e-06, |
|
"loss": 0.2755, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.4376996805111821, |
|
"grad_norm": 1.9555097818374634, |
|
"learning_rate": 5.325972352845965e-06, |
|
"loss": 0.2813, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.4696485623003195, |
|
"grad_norm": 1.4284628629684448, |
|
"learning_rate": 5.1588930002159255e-06, |
|
"loss": 0.2819, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.5015974440894568, |
|
"grad_norm": 1.607223391532898, |
|
"learning_rate": 4.991635806207788e-06, |
|
"loss": 0.2734, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.5335463258785942, |
|
"grad_norm": 1.7128931283950806, |
|
"learning_rate": 4.824387973843957e-06, |
|
"loss": 0.2712, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.5654952076677318, |
|
"grad_norm": 1.7128102779388428, |
|
"learning_rate": 4.6573366956687885e-06, |
|
"loss": 0.2638, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.5974440894568689, |
|
"grad_norm": 1.6286168098449707, |
|
"learning_rate": 4.4906689442328935e-06, |
|
"loss": 0.2761, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.6293929712460065, |
|
"grad_norm": 1.7579160928726196, |
|
"learning_rate": 4.3245712628236356e-06, |
|
"loss": 0.2772, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.6613418530351438, |
|
"grad_norm": 2.3854265213012695, |
|
"learning_rate": 4.159229556676111e-06, |
|
"loss": 0.2604, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.6932907348242812, |
|
"grad_norm": 1.5023325681686401, |
|
"learning_rate": 3.994828884898267e-06, |
|
"loss": 0.2842, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.7252396166134185, |
|
"grad_norm": 1.7911590337753296, |
|
"learning_rate": 3.8315532533430285e-06, |
|
"loss": 0.2718, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.7571884984025559, |
|
"grad_norm": 1.4628454446792603, |
|
"learning_rate": 3.6695854086593126e-06, |
|
"loss": 0.28, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.7891373801916934, |
|
"grad_norm": 1.6940367221832275, |
|
"learning_rate": 3.509106633752387e-06, |
|
"loss": 0.285, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.8210862619808306, |
|
"grad_norm": 1.588935375213623, |
|
"learning_rate": 3.350296544882543e-06, |
|
"loss": 0.28, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.8530351437699681, |
|
"grad_norm": 1.9102530479431152, |
|
"learning_rate": 3.19333289062915e-06, |
|
"loss": 0.2878, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.8849840255591053, |
|
"grad_norm": 1.792137622833252, |
|
"learning_rate": 3.0383913529451286e-06, |
|
"loss": 0.2796, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.9169329073482428, |
|
"grad_norm": 1.5013296604156494, |
|
"learning_rate": 2.8856453505245018e-06, |
|
"loss": 0.2599, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.9488817891373802, |
|
"grad_norm": 1.6300023794174194, |
|
"learning_rate": 2.7352658447030882e-06, |
|
"loss": 0.2654, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.9808306709265175, |
|
"grad_norm": 1.6086463928222656, |
|
"learning_rate": 2.587421148109619e-06, |
|
"loss": 0.2619, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.012779552715655, |
|
"grad_norm": 1.1864526271820068, |
|
"learning_rate": 2.4422767362814045e-06, |
|
"loss": 0.2232, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.0447284345047922, |
|
"grad_norm": 1.4812664985656738, |
|
"learning_rate": 2.299995062455459e-06, |
|
"loss": 0.1507, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.07667731629393, |
|
"grad_norm": 1.800999402999878, |
|
"learning_rate": 2.16073537574229e-06, |
|
"loss": 0.1401, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.108626198083067, |
|
"grad_norm": 1.3780609369277954, |
|
"learning_rate": 2.0246535428859652e-06, |
|
"loss": 0.1365, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.1405750798722045, |
|
"grad_norm": 1.6491374969482422, |
|
"learning_rate": 1.8919018738098704e-06, |
|
"loss": 0.1325, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.1725239616613417, |
|
"grad_norm": 1.7934684753417969, |
|
"learning_rate": 1.762628951143454e-06, |
|
"loss": 0.1374, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.2044728434504792, |
|
"grad_norm": 1.9819718599319458, |
|
"learning_rate": 1.6369794639207626e-06, |
|
"loss": 0.1281, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.236421725239617, |
|
"grad_norm": 1.7123699188232422, |
|
"learning_rate": 1.5150940456368784e-06, |
|
"loss": 0.1306, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.268370607028754, |
|
"grad_norm": 1.4628161191940308, |
|
"learning_rate": 1.3971091168435463e-06, |
|
"loss": 0.1312, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.3003194888178915, |
|
"grad_norm": 1.4270434379577637, |
|
"learning_rate": 1.2831567324601325e-06, |
|
"loss": 0.1376, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.3322683706070286, |
|
"grad_norm": 1.568036675453186, |
|
"learning_rate": 1.173364433970835e-06, |
|
"loss": 0.1381, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.364217252396166, |
|
"grad_norm": 1.4632725715637207, |
|
"learning_rate": 1.0678551066735671e-06, |
|
"loss": 0.1442, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.3961661341853033, |
|
"grad_norm": 1.6986589431762695, |
|
"learning_rate": 9.66746842140287e-07, |
|
"loss": 0.1309, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.428115015974441, |
|
"grad_norm": 1.4087769985198975, |
|
"learning_rate": 8.701528060427194e-07, |
|
"loss": 0.1366, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.460063897763578, |
|
"grad_norm": 1.3985058069229126, |
|
"learning_rate": 7.781811114913995e-07, |
|
"loss": 0.1315, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.4920127795527156, |
|
"grad_norm": 2.15179443359375, |
|
"learning_rate": 6.909346980298093e-07, |
|
"loss": 0.1402, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.523961661341853, |
|
"grad_norm": 1.854887843132019, |
|
"learning_rate": 6.085112164190466e-07, |
|
"loss": 0.1409, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.5559105431309903, |
|
"grad_norm": 1.7915081977844238, |
|
"learning_rate": 5.310029193419697e-07, |
|
"loss": 0.1208, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.587859424920128, |
|
"grad_norm": 1.8391714096069336, |
|
"learning_rate": 4.5849655814915683e-07, |
|
"loss": 0.13, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.619808306709265, |
|
"grad_norm": 1.8333430290222168, |
|
"learning_rate": 3.9107328576224736e-07, |
|
"loss": 0.1346, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.6517571884984026, |
|
"grad_norm": 1.4059584140777588, |
|
"learning_rate": 3.2880856584333043e-07, |
|
"loss": 0.1255, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.68370607028754, |
|
"grad_norm": 1.555797815322876, |
|
"learning_rate": 2.717720883320685e-07, |
|
"loss": 0.1314, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.7156549520766773, |
|
"grad_norm": 1.521854281425476, |
|
"learning_rate": 2.2002769144504943e-07, |
|
"loss": 0.1293, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.747603833865815, |
|
"grad_norm": 1.4914255142211914, |
|
"learning_rate": 1.7363329022471564e-07, |
|
"loss": 0.1269, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.779552715654952, |
|
"grad_norm": 1.473074197769165, |
|
"learning_rate": 1.3264081171780797e-07, |
|
"loss": 0.1291, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.8115015974440896, |
|
"grad_norm": 1.8382492065429688, |
|
"learning_rate": 9.709613685589314e-08, |
|
"loss": 0.1232, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.8434504792332267, |
|
"grad_norm": 2.253413200378418, |
|
"learning_rate": 6.703904910301929e-08, |
|
"loss": 0.1278, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.8753993610223643, |
|
"grad_norm": 1.3528082370758057, |
|
"learning_rate": 4.250318992797375e-08, |
|
"loss": 0.1279, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.9073482428115014, |
|
"grad_norm": 1.2942605018615723, |
|
"learning_rate": 2.351602115099272e-08, |
|
"loss": 0.1259, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.939297124600639, |
|
"grad_norm": 1.6663892269134521, |
|
"learning_rate": 1.0098794207047402e-08, |
|
"loss": 0.1332, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.9712460063897765, |
|
"grad_norm": 1.7137141227722168, |
|
"learning_rate": 2.2665263601240328e-09, |
|
"loss": 0.1277, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 939, |
|
"total_flos": 9.762156657969725e+17, |
|
"train_loss": 0.3061042113806874, |
|
"train_runtime": 22579.097, |
|
"train_samples_per_second": 2.66, |
|
"train_steps_per_second": 0.042 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 939, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 50.0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.762156657969725e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|