{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 939, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03194888178913738, "grad_norm": 2.244253158569336, "learning_rate": 9.997201868901463e-06, "loss": 0.6628, "step": 10 }, { "epoch": 0.06389776357827476, "grad_norm": 2.3703079223632812, "learning_rate": 9.988810607420912e-06, "loss": 0.5322, "step": 20 }, { "epoch": 0.09584664536741214, "grad_norm": 2.0787246227264404, "learning_rate": 9.974835607498224e-06, "loss": 0.5513, "step": 30 }, { "epoch": 0.12779552715654952, "grad_norm": 1.8989689350128174, "learning_rate": 9.955292510686156e-06, "loss": 0.5212, "step": 40 }, { "epoch": 0.1597444089456869, "grad_norm": 2.0805258750915527, "learning_rate": 9.930203190643491e-06, "loss": 0.5511, "step": 50 }, { "epoch": 0.19169329073482427, "grad_norm": 2.0137438774108887, "learning_rate": 9.899595728652883e-06, "loss": 0.5134, "step": 60 }, { "epoch": 0.22364217252396165, "grad_norm": 2.0841853618621826, "learning_rate": 9.863504382190838e-06, "loss": 0.4956, "step": 70 }, { "epoch": 0.25559105431309903, "grad_norm": 1.5514801740646362, "learning_rate": 9.821969546584922e-06, "loss": 0.5168, "step": 80 }, { "epoch": 0.28753993610223644, "grad_norm": 2.0024027824401855, "learning_rate": 9.775037709801206e-06, "loss": 0.4878, "step": 90 }, { "epoch": 0.3194888178913738, "grad_norm": 1.877454161643982, "learning_rate": 9.722761400412496e-06, "loss": 0.5156, "step": 100 }, { "epoch": 0.3514376996805112, "grad_norm": 1.4342358112335205, "learning_rate": 9.6651991288056e-06, "loss": 0.5025, "step": 110 }, { "epoch": 0.38338658146964855, "grad_norm": 1.7300364971160889, "learning_rate": 9.602415321693434e-06, "loss": 0.5102, "step": 120 }, { "epoch": 0.41533546325878595, "grad_norm": 1.5914117097854614, "learning_rate": 9.534480250005263e-06, "loss": 0.5074, "step": 130 }, { "epoch": 0.4472843450479233, "grad_norm": 1.7477545738220215, "learning_rate": 9.461469950235795e-06, "loss": 0.5025, "step": 140 }, { "epoch": 0.4792332268370607, "grad_norm": 1.8615833520889282, "learning_rate": 9.38346613934115e-06, "loss": 0.5066, "step": 150 }, { "epoch": 0.5111821086261981, "grad_norm": 1.7704814672470093, "learning_rate": 9.300556123276955e-06, "loss": 0.4952, "step": 160 }, { "epoch": 0.5431309904153354, "grad_norm": 1.7545205354690552, "learning_rate": 9.212832699280942e-06, "loss": 0.5093, "step": 170 }, { "epoch": 0.5750798722044729, "grad_norm": 2.059382438659668, "learning_rate": 9.120394052009412e-06, "loss": 0.4938, "step": 180 }, { "epoch": 0.6070287539936102, "grad_norm": 1.4858242273330688, "learning_rate": 9.023343643643821e-06, "loss": 0.4984, "step": 190 }, { "epoch": 0.6389776357827476, "grad_norm": 1.5544804334640503, "learning_rate": 8.921790098090477e-06, "loss": 0.4969, "step": 200 }, { "epoch": 0.670926517571885, "grad_norm": 1.8466405868530273, "learning_rate": 8.815847079402972e-06, "loss": 0.5071, "step": 210 }, { "epoch": 0.7028753993610224, "grad_norm": 1.684263825416565, "learning_rate": 8.705633164563413e-06, "loss": 0.4731, "step": 220 }, { "epoch": 0.7348242811501597, "grad_norm": 1.578497052192688, "learning_rate": 8.591271710764839e-06, "loss": 0.4741, "step": 230 }, { "epoch": 0.7667731629392971, "grad_norm": 1.6205824613571167, "learning_rate": 8.472890717343391e-06, "loss": 0.4648, "step": 240 }, { "epoch": 0.7987220447284346, "grad_norm": 1.6458181142807007, "learning_rate": 8.350622682514735e-06, "loss": 0.5115, "step": 250 }, { "epoch": 0.8306709265175719, "grad_norm": 1.6952828168869019, "learning_rate": 8.224604455075115e-06, "loss": 0.4802, "step": 260 }, { "epoch": 0.8626198083067093, "grad_norm": 1.5683045387268066, "learning_rate": 8.094977081233006e-06, "loss": 0.5013, "step": 270 }, { "epoch": 0.8945686900958466, "grad_norm": 1.5408000946044922, "learning_rate": 7.961885646742793e-06, "loss": 0.4897, "step": 280 }, { "epoch": 0.9265175718849841, "grad_norm": 1.5393075942993164, "learning_rate": 7.825479114517197e-06, "loss": 0.4759, "step": 290 }, { "epoch": 0.9584664536741214, "grad_norm": 1.638293743133545, "learning_rate": 7.685910157900158e-06, "loss": 0.4782, "step": 300 }, { "epoch": 0.9904153354632588, "grad_norm": 1.3112537860870361, "learning_rate": 7.5433349897868445e-06, "loss": 0.4874, "step": 310 }, { "epoch": 1.0223642172523961, "grad_norm": 1.236051321029663, "learning_rate": 7.397913187781962e-06, "loss": 0.3528, "step": 320 }, { "epoch": 1.0543130990415335, "grad_norm": 1.8970476388931274, "learning_rate": 7.249807515592149e-06, "loss": 0.2994, "step": 330 }, { "epoch": 1.0862619808306708, "grad_norm": 1.694257378578186, "learning_rate": 7.099183740852296e-06, "loss": 0.2844, "step": 340 }, { "epoch": 1.1182108626198084, "grad_norm": 1.5203874111175537, "learning_rate": 6.946210449589714e-06, "loss": 0.2881, "step": 350 }, { "epoch": 1.1501597444089458, "grad_norm": 1.5055556297302246, "learning_rate": 6.791058857533814e-06, "loss": 0.2746, "step": 360 }, { "epoch": 1.182108626198083, "grad_norm": 2.622091293334961, "learning_rate": 6.633902618482484e-06, "loss": 0.283, "step": 370 }, { "epoch": 1.2140575079872205, "grad_norm": 1.4760558605194092, "learning_rate": 6.474917629939652e-06, "loss": 0.2775, "step": 380 }, { "epoch": 1.2460063897763578, "grad_norm": 1.5865486860275269, "learning_rate": 6.314281836241573e-06, "loss": 0.2881, "step": 390 }, { "epoch": 1.2779552715654952, "grad_norm": 1.7005813121795654, "learning_rate": 6.1521750293922035e-06, "loss": 0.2846, "step": 400 }, { "epoch": 1.3099041533546325, "grad_norm": 1.4311730861663818, "learning_rate": 5.988778647830554e-06, "loss": 0.2871, "step": 410 }, { "epoch": 1.34185303514377, "grad_norm": 1.6767691373825073, "learning_rate": 5.824275573355278e-06, "loss": 0.281, "step": 420 }, { "epoch": 1.3738019169329074, "grad_norm": 1.725629448890686, "learning_rate": 5.658849926433774e-06, "loss": 0.2917, "step": 430 }, { "epoch": 1.4057507987220448, "grad_norm": 1.8225221633911133, "learning_rate": 5.4926868601249e-06, "loss": 0.2755, "step": 440 }, { "epoch": 1.4376996805111821, "grad_norm": 1.9555097818374634, "learning_rate": 5.325972352845965e-06, "loss": 0.2813, "step": 450 }, { "epoch": 1.4696485623003195, "grad_norm": 1.4284628629684448, "learning_rate": 5.1588930002159255e-06, "loss": 0.2819, "step": 460 }, { "epoch": 1.5015974440894568, "grad_norm": 1.607223391532898, "learning_rate": 4.991635806207788e-06, "loss": 0.2734, "step": 470 }, { "epoch": 1.5335463258785942, "grad_norm": 1.7128931283950806, "learning_rate": 4.824387973843957e-06, "loss": 0.2712, "step": 480 }, { "epoch": 1.5654952076677318, "grad_norm": 1.7128102779388428, "learning_rate": 4.6573366956687885e-06, "loss": 0.2638, "step": 490 }, { "epoch": 1.5974440894568689, "grad_norm": 1.6286168098449707, "learning_rate": 4.4906689442328935e-06, "loss": 0.2761, "step": 500 }, { "epoch": 1.6293929712460065, "grad_norm": 1.7579160928726196, "learning_rate": 4.3245712628236356e-06, "loss": 0.2772, "step": 510 }, { "epoch": 1.6613418530351438, "grad_norm": 2.3854265213012695, "learning_rate": 4.159229556676111e-06, "loss": 0.2604, "step": 520 }, { "epoch": 1.6932907348242812, "grad_norm": 1.5023325681686401, "learning_rate": 3.994828884898267e-06, "loss": 0.2842, "step": 530 }, { "epoch": 1.7252396166134185, "grad_norm": 1.7911590337753296, "learning_rate": 3.8315532533430285e-06, "loss": 0.2718, "step": 540 }, { "epoch": 1.7571884984025559, "grad_norm": 1.4628454446792603, "learning_rate": 3.6695854086593126e-06, "loss": 0.28, "step": 550 }, { "epoch": 1.7891373801916934, "grad_norm": 1.6940367221832275, "learning_rate": 3.509106633752387e-06, "loss": 0.285, "step": 560 }, { "epoch": 1.8210862619808306, "grad_norm": 1.588935375213623, "learning_rate": 3.350296544882543e-06, "loss": 0.28, "step": 570 }, { "epoch": 1.8530351437699681, "grad_norm": 1.9102530479431152, "learning_rate": 3.19333289062915e-06, "loss": 0.2878, "step": 580 }, { "epoch": 1.8849840255591053, "grad_norm": 1.792137622833252, "learning_rate": 3.0383913529451286e-06, "loss": 0.2796, "step": 590 }, { "epoch": 1.9169329073482428, "grad_norm": 1.5013296604156494, "learning_rate": 2.8856453505245018e-06, "loss": 0.2599, "step": 600 }, { "epoch": 1.9488817891373802, "grad_norm": 1.6300023794174194, "learning_rate": 2.7352658447030882e-06, "loss": 0.2654, "step": 610 }, { "epoch": 1.9808306709265175, "grad_norm": 1.6086463928222656, "learning_rate": 2.587421148109619e-06, "loss": 0.2619, "step": 620 }, { "epoch": 2.012779552715655, "grad_norm": 1.1864526271820068, "learning_rate": 2.4422767362814045e-06, "loss": 0.2232, "step": 630 }, { "epoch": 2.0447284345047922, "grad_norm": 1.4812664985656738, "learning_rate": 2.299995062455459e-06, "loss": 0.1507, "step": 640 }, { "epoch": 2.07667731629393, "grad_norm": 1.800999402999878, "learning_rate": 2.16073537574229e-06, "loss": 0.1401, "step": 650 }, { "epoch": 2.108626198083067, "grad_norm": 1.3780609369277954, "learning_rate": 2.0246535428859652e-06, "loss": 0.1365, "step": 660 }, { "epoch": 2.1405750798722045, "grad_norm": 1.6491374969482422, "learning_rate": 1.8919018738098704e-06, "loss": 0.1325, "step": 670 }, { "epoch": 2.1725239616613417, "grad_norm": 1.7934684753417969, "learning_rate": 1.762628951143454e-06, "loss": 0.1374, "step": 680 }, { "epoch": 2.2044728434504792, "grad_norm": 1.9819718599319458, "learning_rate": 1.6369794639207626e-06, "loss": 0.1281, "step": 690 }, { "epoch": 2.236421725239617, "grad_norm": 1.7123699188232422, "learning_rate": 1.5150940456368784e-06, "loss": 0.1306, "step": 700 }, { "epoch": 2.268370607028754, "grad_norm": 1.4628161191940308, "learning_rate": 1.3971091168435463e-06, "loss": 0.1312, "step": 710 }, { "epoch": 2.3003194888178915, "grad_norm": 1.4270434379577637, "learning_rate": 1.2831567324601325e-06, "loss": 0.1376, "step": 720 }, { "epoch": 2.3322683706070286, "grad_norm": 1.568036675453186, "learning_rate": 1.173364433970835e-06, "loss": 0.1381, "step": 730 }, { "epoch": 2.364217252396166, "grad_norm": 1.4632725715637207, "learning_rate": 1.0678551066735671e-06, "loss": 0.1442, "step": 740 }, { "epoch": 2.3961661341853033, "grad_norm": 1.6986589431762695, "learning_rate": 9.66746842140287e-07, "loss": 0.1309, "step": 750 }, { "epoch": 2.428115015974441, "grad_norm": 1.4087769985198975, "learning_rate": 8.701528060427194e-07, "loss": 0.1366, "step": 760 }, { "epoch": 2.460063897763578, "grad_norm": 1.3985058069229126, "learning_rate": 7.781811114913995e-07, "loss": 0.1315, "step": 770 }, { "epoch": 2.4920127795527156, "grad_norm": 2.15179443359375, "learning_rate": 6.909346980298093e-07, "loss": 0.1402, "step": 780 }, { "epoch": 2.523961661341853, "grad_norm": 1.854887843132019, "learning_rate": 6.085112164190466e-07, "loss": 0.1409, "step": 790 }, { "epoch": 2.5559105431309903, "grad_norm": 1.7915081977844238, "learning_rate": 5.310029193419697e-07, "loss": 0.1208, "step": 800 }, { "epoch": 2.587859424920128, "grad_norm": 1.8391714096069336, "learning_rate": 4.5849655814915683e-07, "loss": 0.13, "step": 810 }, { "epoch": 2.619808306709265, "grad_norm": 1.8333430290222168, "learning_rate": 3.9107328576224736e-07, "loss": 0.1346, "step": 820 }, { "epoch": 2.6517571884984026, "grad_norm": 1.4059584140777588, "learning_rate": 3.2880856584333043e-07, "loss": 0.1255, "step": 830 }, { "epoch": 2.68370607028754, "grad_norm": 1.555797815322876, "learning_rate": 2.717720883320685e-07, "loss": 0.1314, "step": 840 }, { "epoch": 2.7156549520766773, "grad_norm": 1.521854281425476, "learning_rate": 2.2002769144504943e-07, "loss": 0.1293, "step": 850 }, { "epoch": 2.747603833865815, "grad_norm": 1.4914255142211914, "learning_rate": 1.7363329022471564e-07, "loss": 0.1269, "step": 860 }, { "epoch": 2.779552715654952, "grad_norm": 1.473074197769165, "learning_rate": 1.3264081171780797e-07, "loss": 0.1291, "step": 870 }, { "epoch": 2.8115015974440896, "grad_norm": 1.8382492065429688, "learning_rate": 9.709613685589314e-08, "loss": 0.1232, "step": 880 }, { "epoch": 2.8434504792332267, "grad_norm": 2.253413200378418, "learning_rate": 6.703904910301929e-08, "loss": 0.1278, "step": 890 }, { "epoch": 2.8753993610223643, "grad_norm": 1.3528082370758057, "learning_rate": 4.250318992797375e-08, "loss": 0.1279, "step": 900 }, { "epoch": 2.9073482428115014, "grad_norm": 1.2942605018615723, "learning_rate": 2.351602115099272e-08, "loss": 0.1259, "step": 910 }, { "epoch": 2.939297124600639, "grad_norm": 1.6663892269134521, "learning_rate": 1.0098794207047402e-08, "loss": 0.1332, "step": 920 }, { "epoch": 2.9712460063897765, "grad_norm": 1.7137141227722168, "learning_rate": 2.2665263601240328e-09, "loss": 0.1277, "step": 930 }, { "epoch": 3.0, "step": 939, "total_flos": 9.762156657969725e+17, "train_loss": 0.3061042113806874, "train_runtime": 22579.097, "train_samples_per_second": 2.66, "train_steps_per_second": 0.042 } ], "logging_steps": 10, "max_steps": 939, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.762156657969725e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }