{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9963285024154587,
  "eval_steps": 500,
  "global_step": 1938,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.015458937198067632, "grad_norm": 16.53294779967864, "learning_rate": 5e-06, "loss": 1.0792, "step": 10},
    {"epoch": 0.030917874396135265, "grad_norm": 2.649325227711683, "learning_rate": 5e-06, "loss": 0.9837, "step": 20},
    {"epoch": 0.0463768115942029, "grad_norm": 8.284136228811613, "learning_rate": 5e-06, "loss": 0.9474, "step": 30},
    {"epoch": 0.06183574879227053, "grad_norm": 2.524509268818302, "learning_rate": 5e-06, "loss": 0.9301, "step": 40},
    {"epoch": 0.07729468599033816, "grad_norm": 1.5326850741391629, "learning_rate": 5e-06, "loss": 0.9155, "step": 50},
    {"epoch": 0.0927536231884058, "grad_norm": 1.0781869595697948, "learning_rate": 5e-06, "loss": 0.9037, "step": 60},
    {"epoch": 0.10821256038647344, "grad_norm": 0.8943372564864077, "learning_rate": 5e-06, "loss": 0.8967, "step": 70},
    {"epoch": 0.12367149758454106, "grad_norm": 0.8868808075595438, "learning_rate": 5e-06, "loss": 0.8867, "step": 80},
    {"epoch": 0.1391304347826087, "grad_norm": 0.6755190824568067, "learning_rate": 5e-06, "loss": 0.8792, "step": 90},
    {"epoch": 0.15458937198067632, "grad_norm": 0.6359486667796315, "learning_rate": 5e-06, "loss": 0.8769, "step": 100},
    {"epoch": 0.17004830917874397, "grad_norm": 0.6299873717513695, "learning_rate": 5e-06, "loss": 0.8717, "step": 110},
    {"epoch": 0.1855072463768116, "grad_norm": 0.5832655638295026, "learning_rate": 5e-06, "loss": 0.8671, "step": 120},
    {"epoch": 0.20096618357487922, "grad_norm": 0.5586732401198556, "learning_rate": 5e-06, "loss": 0.8642, "step": 130},
    {"epoch": 0.21642512077294687, "grad_norm": 0.6759403203697444, "learning_rate": 5e-06, "loss": 0.863, "step": 140},
    {"epoch": 0.2318840579710145, "grad_norm": 0.5234393167054423, "learning_rate": 5e-06, "loss": 0.861, "step": 150},
    {"epoch": 0.24734299516908212, "grad_norm": 0.5405697501688478, "learning_rate": 5e-06, "loss": 0.864, "step": 160},
    {"epoch": 0.26280193236714977, "grad_norm": 0.5609477995758988, "learning_rate": 5e-06, "loss": 0.8528, "step": 170},
    {"epoch": 0.2782608695652174, "grad_norm": 0.648827500892738, "learning_rate": 5e-06, "loss": 0.857, "step": 180},
    {"epoch": 0.293719806763285, "grad_norm": 0.6627079853918527, "learning_rate": 5e-06, "loss": 0.8511, "step": 190},
    {"epoch": 0.30917874396135264, "grad_norm": 0.6915034637639949, "learning_rate": 5e-06, "loss": 0.8485, "step": 200},
    {"epoch": 0.32463768115942027, "grad_norm": 0.6366893171242987, "learning_rate": 5e-06, "loss": 0.8462, "step": 210},
    {"epoch": 0.34009661835748795, "grad_norm": 0.5070708715215638, "learning_rate": 5e-06, "loss": 0.8446, "step": 220},
    {"epoch": 0.35555555555555557, "grad_norm": 0.5871716320705027, "learning_rate": 5e-06, "loss": 0.8445, "step": 230},
    {"epoch": 0.3710144927536232, "grad_norm": 0.7730510671832644, "learning_rate": 5e-06, "loss": 0.8497, "step": 240},
    {"epoch": 0.3864734299516908, "grad_norm": 0.7142598054734569, "learning_rate": 5e-06, "loss": 0.8438, "step": 250},
    {"epoch": 0.40193236714975844, "grad_norm": 0.5417070693738956, "learning_rate": 5e-06, "loss": 0.8476, "step": 260},
    {"epoch": 0.41739130434782606, "grad_norm": 0.6163533652451042, "learning_rate": 5e-06, "loss": 0.8442, "step": 270},
    {"epoch": 0.43285024154589374, "grad_norm": 0.5863440757370957, "learning_rate": 5e-06, "loss": 0.8384, "step": 280},
    {"epoch": 0.44830917874396137, "grad_norm": 0.6841438022567938, "learning_rate": 5e-06, "loss": 0.8416, "step": 290},
    {"epoch": 0.463768115942029, "grad_norm": 0.5830887688966261, "learning_rate": 5e-06, "loss": 0.8363, "step": 300},
    {"epoch": 0.4792270531400966, "grad_norm": 0.7565147804535631, "learning_rate": 5e-06, "loss": 0.8391, "step": 310},
    {"epoch": 0.49468599033816424, "grad_norm": 0.5377296055296723, "learning_rate": 5e-06, "loss": 0.8384, "step": 320},
    {"epoch": 0.5101449275362319, "grad_norm": 0.6313225350990711, "learning_rate": 5e-06, "loss": 0.8344, "step": 330},
    {"epoch": 0.5256038647342995, "grad_norm": 0.6011085474374273, "learning_rate": 5e-06, "loss": 0.8352, "step": 340},
    {"epoch": 0.5410628019323671, "grad_norm": 0.6313718464958992, "learning_rate": 5e-06, "loss": 0.8319, "step": 350},
    {"epoch": 0.5565217391304348, "grad_norm": 0.5602186929251594, "learning_rate": 5e-06, "loss": 0.8305, "step": 360},
    {"epoch": 0.5719806763285025, "grad_norm": 0.5682159814727703, "learning_rate": 5e-06, "loss": 0.8283, "step": 370},
    {"epoch": 0.58743961352657, "grad_norm": 0.5718556832144389, "learning_rate": 5e-06, "loss": 0.8341, "step": 380},
    {"epoch": 0.6028985507246377, "grad_norm": 0.546768373760242, "learning_rate": 5e-06, "loss": 0.8291, "step": 390},
    {"epoch": 0.6183574879227053, "grad_norm": 0.5993423126379529, "learning_rate": 5e-06, "loss": 0.8361, "step": 400},
    {"epoch": 0.633816425120773, "grad_norm": 0.5491459229199431, "learning_rate": 5e-06, "loss": 0.8291, "step": 410},
    {"epoch": 0.6492753623188405, "grad_norm": 0.5168339143544802, "learning_rate": 5e-06, "loss": 0.83, "step": 420},
    {"epoch": 0.6647342995169082, "grad_norm": 0.5210184948415354, "learning_rate": 5e-06, "loss": 0.8237, "step": 430},
    {"epoch": 0.6801932367149759, "grad_norm": 0.5424122263787127, "learning_rate": 5e-06, "loss": 0.8228, "step": 440},
    {"epoch": 0.6956521739130435, "grad_norm": 0.5637417843194678, "learning_rate": 5e-06, "loss": 0.829, "step": 450},
    {"epoch": 0.7111111111111111, "grad_norm": 0.4743888435043112, "learning_rate": 5e-06, "loss": 0.8221, "step": 460},
    {"epoch": 0.7265700483091787, "grad_norm": 0.5224626427829437, "learning_rate": 5e-06, "loss": 0.8254, "step": 470},
    {"epoch": 0.7420289855072464, "grad_norm": 0.6508696496425336, "learning_rate": 5e-06, "loss": 0.8264, "step": 480},
    {"epoch": 0.7574879227053141, "grad_norm": 0.5904629481154552, "learning_rate": 5e-06, "loss": 0.8295, "step": 490},
    {"epoch": 0.7729468599033816, "grad_norm": 0.574388340778744, "learning_rate": 5e-06, "loss": 0.8223, "step": 500},
    {"epoch": 0.7884057971014493, "grad_norm": 0.5353158279457252, "learning_rate": 5e-06, "loss": 0.8257, "step": 510},
    {"epoch": 0.8038647342995169, "grad_norm": 0.5821234180105461, "learning_rate": 5e-06, "loss": 0.8265, "step": 520},
    {"epoch": 0.8193236714975846, "grad_norm": 0.5068894400573091, "learning_rate": 5e-06, "loss": 0.8224, "step": 530},
    {"epoch": 0.8347826086956521, "grad_norm": 0.5375828449207095, "learning_rate": 5e-06, "loss": 0.821, "step": 540},
    {"epoch": 0.8502415458937198, "grad_norm": 0.5410980089439529, "learning_rate": 5e-06, "loss": 0.8271, "step": 550},
    {"epoch": 0.8657004830917875, "grad_norm": 0.5953566167479901, "learning_rate": 5e-06, "loss": 0.8168, "step": 560},
    {"epoch": 0.881159420289855, "grad_norm": 0.6599677690788177, "learning_rate": 5e-06, "loss": 0.8234, "step": 570},
    {"epoch": 0.8966183574879227, "grad_norm": 0.6548750016255851, "learning_rate": 5e-06, "loss": 0.8198, "step": 580},
    {"epoch": 0.9120772946859903, "grad_norm": 0.6314610392042965, "learning_rate": 5e-06, "loss": 0.8169, "step": 590},
    {"epoch": 0.927536231884058, "grad_norm": 0.4920135279723095, "learning_rate": 5e-06, "loss": 0.8225, "step": 600},
    {"epoch": 0.9429951690821256, "grad_norm": 0.5279136309564921, "learning_rate": 5e-06, "loss": 0.8188, "step": 610},
    {"epoch": 0.9584541062801932, "grad_norm": 0.5531816471285108, "learning_rate": 5e-06, "loss": 0.8217, "step": 620},
    {"epoch": 0.9739130434782609, "grad_norm": 0.5924801838934433, "learning_rate": 5e-06, "loss": 0.8221, "step": 630},
    {"epoch": 0.9893719806763285, "grad_norm": 0.5117042192273262, "learning_rate": 5e-06, "loss": 0.8188, "step": 640},
    {"epoch": 0.9986473429951691, "eval_loss": 0.8183467984199524, "eval_runtime": 686.568, "eval_samples_per_second": 25.389, "eval_steps_per_second": 0.398, "step": 646},
    {"epoch": 1.0050241545893719, "grad_norm": 0.6925909025947767, "learning_rate": 5e-06, "loss": 0.8235, "step": 650},
    {"epoch": 1.0204830917874397, "grad_norm": 0.6132973767615294, "learning_rate": 5e-06, "loss": 0.7771, "step": 660},
    {"epoch": 1.0359420289855072, "grad_norm": 0.6797848846411009, "learning_rate": 5e-06, "loss": 0.7802, "step": 670},
    {"epoch": 1.0514009661835748, "grad_norm": 0.5295735808202817, "learning_rate": 5e-06, "loss": 0.7777, "step": 680},
    {"epoch": 1.0668599033816426, "grad_norm": 0.5271721002677758, "learning_rate": 5e-06, "loss": 0.7751, "step": 690},
    {"epoch": 1.0823188405797102, "grad_norm": 0.47521281338293253, "learning_rate": 5e-06, "loss": 0.7808, "step": 700},
    {"epoch": 1.0977777777777777, "grad_norm": 0.5201403409762577, "learning_rate": 5e-06, "loss": 0.7769, "step": 710},
    {"epoch": 1.1132367149758453, "grad_norm": 0.5374055398678584, "learning_rate": 5e-06, "loss": 0.7775, "step": 720},
    {"epoch": 1.128695652173913, "grad_norm": 0.520683864449963, "learning_rate": 5e-06, "loss": 0.7787, "step": 730},
    {"epoch": 1.1441545893719807, "grad_norm": 0.5406489528118505, "learning_rate": 5e-06, "loss": 0.7816, "step": 740},
    {"epoch": 1.1596135265700482, "grad_norm": 0.585881797178412, "learning_rate": 5e-06, "loss": 0.7811, "step": 750},
    {"epoch": 1.175072463768116, "grad_norm": 0.5490222258224376, "learning_rate": 5e-06, "loss": 0.7763, "step": 760},
    {"epoch": 1.1905314009661836, "grad_norm": 0.6049557272461074, "learning_rate": 5e-06, "loss": 0.7821, "step": 770},
    {"epoch": 1.2059903381642512, "grad_norm": 0.6287813068938076, "learning_rate": 5e-06, "loss": 0.7771, "step": 780},
    {"epoch": 1.221449275362319, "grad_norm": 0.5791771698431348, "learning_rate": 5e-06, "loss": 0.7832, "step": 790},
    {"epoch": 1.2369082125603865, "grad_norm": 0.552647068072239, "learning_rate": 5e-06, "loss": 0.7795, "step": 800},
    {"epoch": 1.252367149758454, "grad_norm": 0.48953182542010515, "learning_rate": 5e-06, "loss": 0.7767, "step": 810},
    {"epoch": 1.2678260869565217, "grad_norm": 0.5809037976182655, "learning_rate": 5e-06, "loss": 0.7784, "step": 820},
    {"epoch": 1.2832850241545894, "grad_norm": 0.49664609280994976, "learning_rate": 5e-06, "loss": 0.7765, "step": 830},
    {"epoch": 1.298743961352657, "grad_norm": 0.5514267021897065, "learning_rate": 5e-06, "loss": 0.7791, "step": 840},
    {"epoch": 1.3142028985507246, "grad_norm": 0.6174163379347436, "learning_rate": 5e-06, "loss": 0.7775, "step": 850},
    {"epoch": 1.3296618357487922, "grad_norm": 0.5893029009867757, "learning_rate": 5e-06, "loss": 0.7743, "step": 860},
    {"epoch": 1.34512077294686, "grad_norm": 0.5884521899466931, "learning_rate": 5e-06, "loss": 0.7768, "step": 870},
    {"epoch": 1.3605797101449275, "grad_norm": 0.526781782612563, "learning_rate": 5e-06, "loss": 0.773, "step": 880},
    {"epoch": 1.376038647342995, "grad_norm": 0.5133541303046, "learning_rate": 5e-06, "loss": 0.774, "step": 890},
    {"epoch": 1.3914975845410629, "grad_norm": 0.5514217537787884, "learning_rate": 5e-06, "loss": 0.7802, "step": 900},
    {"epoch": 1.4069565217391304, "grad_norm": 0.5829849669974898, "learning_rate": 5e-06, "loss": 0.7787, "step": 910},
    {"epoch": 1.422415458937198, "grad_norm": 0.6099035981973764, "learning_rate": 5e-06, "loss": 0.7738, "step": 920},
    {"epoch": 1.4378743961352658, "grad_norm": 0.4767884324426242, "learning_rate": 5e-06, "loss": 0.7773, "step": 930},
    {"epoch": 1.4533333333333334, "grad_norm": 0.5611337081061908, "learning_rate": 5e-06, "loss": 0.7767, "step": 940},
    {"epoch": 1.468792270531401, "grad_norm": 0.47930773858272085, "learning_rate": 5e-06, "loss": 0.7765, "step": 950},
    {"epoch": 1.4842512077294687, "grad_norm": 0.498168257215718, "learning_rate": 5e-06, "loss": 0.7728, "step": 960},
    {"epoch": 1.4997101449275363, "grad_norm": 0.5576989172567428, "learning_rate": 5e-06, "loss": 0.7777, "step": 970},
    {"epoch": 1.5151690821256039, "grad_norm": 0.5873903650866855, "learning_rate": 5e-06, "loss": 0.7747, "step": 980},
    {"epoch": 1.5306280193236717, "grad_norm": 0.5564865473674926, "learning_rate": 5e-06, "loss": 0.7786, "step": 990},
    {"epoch": 1.546086956521739, "grad_norm": 0.6746662280932265, "learning_rate": 5e-06, "loss": 0.7823, "step": 1000},
    {"epoch": 1.5615458937198068, "grad_norm": 0.550553366091711, "learning_rate": 5e-06, "loss": 0.7704, "step": 1010},
    {"epoch": 1.5770048309178744, "grad_norm": 0.555996816403915, "learning_rate": 5e-06, "loss": 0.7758, "step": 1020},
    {"epoch": 1.592463768115942, "grad_norm": 0.5621088990135378, "learning_rate": 5e-06, "loss": 0.7751, "step": 1030},
    {"epoch": 1.6079227053140097, "grad_norm": 0.4672348676970037, "learning_rate": 5e-06, "loss": 0.7742, "step": 1040},
    {"epoch": 1.6233816425120773, "grad_norm": 0.49112359521062937, "learning_rate": 5e-06, "loss": 0.777, "step": 1050},
    {"epoch": 1.6388405797101449, "grad_norm": 0.5517626252028611, "learning_rate": 5e-06, "loss": 0.7757, "step": 1060},
    {"epoch": 1.6542995169082126, "grad_norm": 0.5518129870744243, "learning_rate": 5e-06, "loss": 0.7744, "step": 1070},
    {"epoch": 1.6697584541062802, "grad_norm": 0.685405898117341, "learning_rate": 5e-06, "loss": 0.7753, "step": 1080},
    {"epoch": 1.6852173913043478, "grad_norm": 0.5720673042328214, "learning_rate": 5e-06, "loss": 0.7753, "step": 1090},
    {"epoch": 1.7006763285024156, "grad_norm": 0.4690028175072265, "learning_rate": 5e-06, "loss": 0.774, "step": 1100},
    {"epoch": 1.7161352657004831, "grad_norm": 0.55568178811657, "learning_rate": 5e-06, "loss": 0.7772, "step": 1110},
    {"epoch": 1.7315942028985507, "grad_norm": 0.5185570493500619, "learning_rate": 5e-06, "loss": 0.781, "step": 1120},
    {"epoch": 1.7470531400966185, "grad_norm": 0.5292299708932318, "learning_rate": 5e-06, "loss": 0.7749, "step": 1130},
    {"epoch": 1.7625120772946858, "grad_norm": 0.5884058161213621, "learning_rate": 5e-06, "loss": 0.7719, "step": 1140},
    {"epoch": 1.7779710144927536, "grad_norm": 0.5072506431239099, "learning_rate": 5e-06, "loss": 0.7753, "step": 1150},
    {"epoch": 1.7934299516908214, "grad_norm": 0.5551938392960334, "learning_rate": 5e-06, "loss": 0.7777, "step": 1160},
    {"epoch": 1.8088888888888888, "grad_norm": 0.5566487078925049, "learning_rate": 5e-06, "loss": 0.7774, "step": 1170},
    {"epoch": 1.8243478260869566, "grad_norm": 0.4749917546235466, "learning_rate": 5e-06, "loss": 0.7734, "step": 1180},
    {"epoch": 1.8398067632850241, "grad_norm": 0.5022635709311233, "learning_rate": 5e-06, "loss": 0.7743, "step": 1190},
    {"epoch": 1.8552657004830917, "grad_norm": 0.5442982810099344, "learning_rate": 5e-06, "loss": 0.7728, "step": 1200},
    {"epoch": 1.8707246376811595, "grad_norm": 0.5155014433123901, "learning_rate": 5e-06, "loss": 0.774, "step": 1210},
    {"epoch": 1.886183574879227, "grad_norm": 0.5934285413681538, "learning_rate": 5e-06, "loss": 0.7746, "step": 1220},
    {"epoch": 1.9016425120772946, "grad_norm": 0.5260175972638601, "learning_rate": 5e-06, "loss": 0.7693, "step": 1230},
    {"epoch": 1.9171014492753624, "grad_norm": 0.515080447493818, "learning_rate": 5e-06, "loss": 0.7717, "step": 1240},
    {"epoch": 1.93256038647343, "grad_norm": 0.6011160845737209, "learning_rate": 5e-06, "loss": 0.7754, "step": 1250},
    {"epoch": 1.9480193236714975, "grad_norm": 0.46061302659355685, "learning_rate": 5e-06, "loss": 0.7755, "step": 1260},
    {"epoch": 1.9634782608695653, "grad_norm": 0.46039484020056154, "learning_rate": 5e-06, "loss": 0.7722, "step": 1270},
    {"epoch": 1.9789371980676327, "grad_norm": 0.5658493454639554, "learning_rate": 5e-06, "loss": 0.7755, "step": 1280},
    {"epoch": 1.9943961352657005, "grad_norm": 0.5908199178180503, "learning_rate": 5e-06, "loss": 0.7709, "step": 1290},
    {"epoch": 1.9990338164251207, "eval_loss": 0.8051349520683289, "eval_runtime": 690.0891, "eval_samples_per_second": 25.259, "eval_steps_per_second": 0.396, "step": 1293},
    {"epoch": 2.0100483091787438, "grad_norm": 0.6943407863572525, "learning_rate": 5e-06, "loss": 0.7604, "step": 1300},
    {"epoch": 2.0255072463768116, "grad_norm": 0.5840764739328596, "learning_rate": 5e-06, "loss": 0.7299, "step": 1310},
    {"epoch": 2.0409661835748794, "grad_norm": 0.645835170219903, "learning_rate": 5e-06, "loss": 0.7254, "step": 1320},
    {"epoch": 2.0564251207729467, "grad_norm": 0.6967100498978297, "learning_rate": 5e-06, "loss": 0.7312, "step": 1330},
    {"epoch": 2.0718840579710145, "grad_norm": 0.5424795093750152, "learning_rate": 5e-06, "loss": 0.7283, "step": 1340},
    {"epoch": 2.0873429951690823, "grad_norm": 0.5651081335517218, "learning_rate": 5e-06, "loss": 0.7322, "step": 1350},
    {"epoch": 2.1028019323671496, "grad_norm": 0.5793019251125064, "learning_rate": 5e-06, "loss": 0.7317, "step": 1360},
    {"epoch": 2.1182608695652174, "grad_norm": 0.5653295937261641, "learning_rate": 5e-06, "loss": 0.7331, "step": 1370},
    {"epoch": 2.133719806763285, "grad_norm": 0.6945092784765529, "learning_rate": 5e-06, "loss": 0.7346, "step": 1380},
    {"epoch": 2.1491787439613526, "grad_norm": 0.5795163218543443, "learning_rate": 5e-06, "loss": 0.7336, "step": 1390},
    {"epoch": 2.1646376811594203, "grad_norm": 0.5922357321216497, "learning_rate": 5e-06, "loss": 0.7299, "step": 1400},
    {"epoch": 2.1800966183574877, "grad_norm": 0.5570557796263025, "learning_rate": 5e-06, "loss": 0.7333, "step": 1410},
    {"epoch": 2.1955555555555555, "grad_norm": 0.5392312450784695, "learning_rate": 5e-06, "loss": 0.7371, "step": 1420},
    {"epoch": 2.2110144927536233, "grad_norm": 0.569063560563541, "learning_rate": 5e-06, "loss": 0.7314, "step": 1430},
    {"epoch": 2.2264734299516906, "grad_norm": 0.6107660118171969, "learning_rate": 5e-06, "loss": 0.7322, "step": 1440},
    {"epoch": 2.2419323671497584, "grad_norm": 0.6566517138097786, "learning_rate": 5e-06, "loss": 0.7356, "step": 1450},
    {"epoch": 2.257391304347826, "grad_norm": 0.5806353609910259, "learning_rate": 5e-06, "loss": 0.7418, "step": 1460},
    {"epoch": 2.2728502415458935, "grad_norm": 0.544246667709765, "learning_rate": 5e-06, "loss": 0.7319, "step": 1470},
    {"epoch": 2.2883091787439613, "grad_norm": 0.5424208252581, "learning_rate": 5e-06, "loss": 0.7332, "step": 1480},
    {"epoch": 2.303768115942029, "grad_norm": 0.5380434193955503, "learning_rate": 5e-06, "loss": 0.7342, "step": 1490},
    {"epoch": 2.3192270531400965, "grad_norm": 0.5919093406358342, "learning_rate": 5e-06, "loss": 0.7345, "step": 1500},
    {"epoch": 2.3346859903381643, "grad_norm": 0.5815232359700448, "learning_rate": 5e-06, "loss": 0.7357, "step": 1510},
    {"epoch": 2.350144927536232, "grad_norm": 0.6561512544812266, "learning_rate": 5e-06, "loss": 0.7339, "step": 1520},
    {"epoch": 2.3656038647342994, "grad_norm": 0.5328952220385875, "learning_rate": 5e-06, "loss": 0.7297, "step": 1530},
    {"epoch": 2.381062801932367, "grad_norm": 0.5216733576185124, "learning_rate": 5e-06, "loss": 0.7298, "step": 1540},
    {"epoch": 2.396521739130435, "grad_norm": 0.6063067814678768, "learning_rate": 5e-06, "loss": 0.7368, "step": 1550},
    {"epoch": 2.4119806763285023, "grad_norm": 0.5818602690123681, "learning_rate": 5e-06, "loss": 0.7353, "step": 1560},
    {"epoch": 2.42743961352657, "grad_norm": 0.5913577701518534, "learning_rate": 5e-06, "loss": 0.7338, "step": 1570},
    {"epoch": 2.442898550724638, "grad_norm": 0.5527497540961946, "learning_rate": 5e-06, "loss": 0.7329, "step": 1580},
    {"epoch": 2.4583574879227053, "grad_norm": 0.6737570445790982, "learning_rate": 5e-06, "loss": 0.7367, "step": 1590},
    {"epoch": 2.473816425120773, "grad_norm": 0.6619470586787684, "learning_rate": 5e-06, "loss": 0.733, "step": 1600},
    {"epoch": 2.4892753623188404, "grad_norm": 0.4750068577638755, "learning_rate": 5e-06, "loss": 0.7375, "step": 1610},
    {"epoch": 2.504734299516908, "grad_norm": 0.6847743909506772, "learning_rate": 5e-06, "loss": 0.7374, "step": 1620},
    {"epoch": 2.520193236714976, "grad_norm": 0.5239840846624293, "learning_rate": 5e-06, "loss": 0.7311, "step": 1630},
    {"epoch": 2.5356521739130433, "grad_norm": 0.4721718835375596, "learning_rate": 5e-06, "loss": 0.7308, "step": 1640},
    {"epoch": 2.551111111111111, "grad_norm": 0.51093602092176, "learning_rate": 5e-06, "loss": 0.7337, "step": 1650},
    {"epoch": 2.566570048309179, "grad_norm": 0.5517386015611798, "learning_rate": 5e-06, "loss": 0.7318, "step": 1660},
    {"epoch": 2.5820289855072462, "grad_norm": 0.6326674619813268, "learning_rate": 5e-06, "loss": 0.736, "step": 1670},
    {"epoch": 2.597487922705314, "grad_norm": 0.5232840712675151, "learning_rate": 5e-06, "loss": 0.7325, "step": 1680},
    {"epoch": 2.6129468599033814, "grad_norm": 0.4969751533645812, "learning_rate": 5e-06, "loss": 0.7375, "step": 1690},
    {"epoch": 2.628405797101449, "grad_norm": 0.49538430512331766, "learning_rate": 5e-06, "loss": 0.7366, "step": 1700},
    {"epoch": 2.643864734299517, "grad_norm": 0.6208865012276192, "learning_rate": 5e-06, "loss": 0.7372, "step": 1710},
    {"epoch": 2.6593236714975843, "grad_norm": 0.5276942120485377, "learning_rate": 5e-06, "loss": 0.7351, "step": 1720},
    {"epoch": 2.674782608695652, "grad_norm": 0.570808842039396, "learning_rate": 5e-06, "loss": 0.7384, "step": 1730},
    {"epoch": 2.69024154589372, "grad_norm": 0.5214638213365278, "learning_rate": 5e-06, "loss": 0.7361, "step": 1740},
    {"epoch": 2.7057004830917872, "grad_norm": 0.5190586781651014, "learning_rate": 5e-06, "loss": 0.7309, "step": 1750},
    {"epoch": 2.721159420289855, "grad_norm": 0.5317230869170978, "learning_rate": 5e-06, "loss": 0.7296, "step": 1760},
    {"epoch": 2.736618357487923, "grad_norm": 0.5917255596181432, "learning_rate": 5e-06, "loss": 0.7406, "step": 1770},
    {"epoch": 2.75207729468599, "grad_norm": 0.49202576322983876, "learning_rate": 5e-06, "loss": 0.7324, "step": 1780},
    {"epoch": 2.767536231884058, "grad_norm": 0.5594574654106287, "learning_rate": 5e-06, "loss": 0.7331, "step": 1790},
    {"epoch": 2.7829951690821257, "grad_norm": 0.6198580773466541, "learning_rate": 5e-06, "loss": 0.7372, "step": 1800},
    {"epoch": 2.798454106280193, "grad_norm": 0.5740394274550438, "learning_rate": 5e-06, "loss": 0.7359, "step": 1810},
    {"epoch": 2.813913043478261, "grad_norm": 0.5501912428656768, "learning_rate": 5e-06, "loss": 0.7384, "step": 1820},
    {"epoch": 2.8293719806763287, "grad_norm": 0.5104778986757859, "learning_rate": 5e-06, "loss": 0.7324, "step": 1830},
    {"epoch": 2.844830917874396, "grad_norm": 0.5395220598812313, "learning_rate": 5e-06, "loss": 0.736, "step": 1840},
    {"epoch": 2.860289855072464, "grad_norm": 0.6030104859258091, "learning_rate": 5e-06, "loss": 0.7327, "step": 1850},
    {"epoch": 2.8757487922705316, "grad_norm": 0.556906171705928, "learning_rate": 5e-06, "loss": 0.7374, "step": 1860},
    {"epoch": 2.891207729468599, "grad_norm": 0.6174821846225631, "learning_rate": 5e-06, "loss": 0.7351, "step": 1870},
    {"epoch": 2.9066666666666667, "grad_norm": 0.5078906232420815, "learning_rate": 5e-06, "loss": 0.7326, "step": 1880},
    {"epoch": 2.9221256038647345, "grad_norm": 0.6177111487230912, "learning_rate": 5e-06, "loss": 0.7321, "step": 1890},
    {"epoch": 2.937584541062802, "grad_norm": 0.5520929737500946, "learning_rate": 5e-06, "loss": 0.7395, "step": 1900},
    {"epoch": 2.9530434782608697, "grad_norm": 0.5185834378400617, "learning_rate": 5e-06, "loss": 0.7368, "step": 1910},
    {"epoch": 2.9685024154589374, "grad_norm": 0.5204851978024219, "learning_rate": 5e-06, "loss": 0.7339, "step": 1920},
    {"epoch": 2.983961352657005, "grad_norm": 0.5807949438616106, "learning_rate": 5e-06, "loss": 0.7386, "step": 1930},
    {"epoch": 2.9963285024154587, "eval_loss": 0.8025317192077637, "eval_runtime": 693.9451, "eval_samples_per_second": 25.119, "eval_steps_per_second": 0.393, "step": 1938},
    {"epoch": 2.9963285024154587, "step": 1938, "total_flos": 3246012802007040.0, "train_loss": 0.7871413270263357, "train_runtime": 114590.6996, "train_samples_per_second": 8.67, "train_steps_per_second": 0.017}
  ],
"logging_steps": 10, |
|
"max_steps": 1938, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3246012802007040.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|