|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 1548, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01937984496124031, |
|
"grad_norm": 3.1349940241372636, |
|
"learning_rate": 2e-06, |
|
"loss": 0.7225, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03875968992248062, |
|
"grad_norm": 1.478554813389481, |
|
"learning_rate": 2e-06, |
|
"loss": 0.6518, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05813953488372093, |
|
"grad_norm": 1.40892259720196, |
|
"learning_rate": 2e-06, |
|
"loss": 0.6267, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07751937984496124, |
|
"grad_norm": 1.6253951402557405, |
|
"learning_rate": 2e-06, |
|
"loss": 0.6193, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.09689922480620156, |
|
"grad_norm": 1.474392955252126, |
|
"learning_rate": 2e-06, |
|
"loss": 0.6139, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.11627906976744186, |
|
"grad_norm": 3.0751907911279477, |
|
"learning_rate": 2e-06, |
|
"loss": 0.6069, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.13565891472868216, |
|
"grad_norm": 2.9223888412970873, |
|
"learning_rate": 2e-06, |
|
"loss": 0.6024, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.15503875968992248, |
|
"grad_norm": 2.357095375310306, |
|
"learning_rate": 2e-06, |
|
"loss": 0.6052, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.1744186046511628, |
|
"grad_norm": 1.3933025656894271, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5906, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1937984496124031, |
|
"grad_norm": 1.820645926185548, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5929, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2131782945736434, |
|
"grad_norm": 1.7924399526821087, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5958, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.23255813953488372, |
|
"grad_norm": 1.7736991192509453, |
|
"learning_rate": 2e-06, |
|
"loss": 0.587, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.25193798449612403, |
|
"grad_norm": 1.9201526538063434, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5972, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.2713178294573643, |
|
"grad_norm": 1.9904730096438479, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5837, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.29069767441860467, |
|
"grad_norm": 1.706489075036192, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5837, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.31007751937984496, |
|
"grad_norm": 1.6008690300286739, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5824, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.32945736434108525, |
|
"grad_norm": 1.7784490780851796, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5776, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.3488372093023256, |
|
"grad_norm": 1.4254130054140608, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5817, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.3682170542635659, |
|
"grad_norm": 1.4517537061348775, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5745, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3875968992248062, |
|
"grad_norm": 1.421098096153975, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5822, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4069767441860465, |
|
"grad_norm": 1.4340247949454419, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5803, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.4263565891472868, |
|
"grad_norm": 1.2607995600168784, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5683, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.44573643410852715, |
|
"grad_norm": 1.4088258448752224, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5669, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.46511627906976744, |
|
"grad_norm": 1.4547922751942315, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5711, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.4844961240310077, |
|
"grad_norm": 1.3166116102995473, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5715, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5038759689922481, |
|
"grad_norm": 1.2865076146935452, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5706, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.5232558139534884, |
|
"grad_norm": 1.9514850118526776, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5711, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.5426356589147286, |
|
"grad_norm": 1.3656201312041214, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5665, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.562015503875969, |
|
"grad_norm": 1.2117280702790532, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5685, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5813953488372093, |
|
"grad_norm": 1.5936964457861214, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5767, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6007751937984496, |
|
"grad_norm": 1.262836540024248, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5639, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.6201550387596899, |
|
"grad_norm": 1.5253983364542758, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5613, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.6395348837209303, |
|
"grad_norm": 1.695119554349292, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5629, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6589147286821705, |
|
"grad_norm": 1.2779875033648296, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5591, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6782945736434108, |
|
"grad_norm": 1.6166948739589506, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5561, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6976744186046512, |
|
"grad_norm": 1.3469984241923647, |
|
"learning_rate": 2e-06, |
|
"loss": 0.561, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.7170542635658915, |
|
"grad_norm": 1.2982439077151307, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5662, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.7364341085271318, |
|
"grad_norm": 1.3681013360427425, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5671, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.7558139534883721, |
|
"grad_norm": 1.2116770625468472, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5568, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.7751937984496124, |
|
"grad_norm": 1.433614262268133, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5581, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7945736434108527, |
|
"grad_norm": 1.1848821794193687, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5637, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.813953488372093, |
|
"grad_norm": 1.3158386135055107, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5582, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 2.3927036985893504, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5612, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.8527131782945736, |
|
"grad_norm": 1.1806090504426345, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5588, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.872093023255814, |
|
"grad_norm": 1.2223208556627352, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5621, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.8914728682170543, |
|
"grad_norm": 1.1512657163820463, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5513, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.9108527131782945, |
|
"grad_norm": 1.1826527683444463, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5577, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.9302325581395349, |
|
"grad_norm": 1.578741070129489, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5584, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.9496124031007752, |
|
"grad_norm": 1.364010417258464, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5517, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.9689922480620154, |
|
"grad_norm": 1.564561079833014, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5528, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9883720930232558, |
|
"grad_norm": 1.1934813597924194, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5573, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.0077519379844961, |
|
"grad_norm": 2.2183985963794735, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5359, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.0271317829457365, |
|
"grad_norm": 1.444929607797708, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5096, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.0465116279069768, |
|
"grad_norm": 1.3333146357476189, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5072, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.0658914728682172, |
|
"grad_norm": 1.1259644665340547, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5052, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.0852713178294573, |
|
"grad_norm": 1.4403172490091567, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5169, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.1046511627906976, |
|
"grad_norm": 1.157835174296272, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5102, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.124031007751938, |
|
"grad_norm": 1.1960407146223624, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5038, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.1434108527131783, |
|
"grad_norm": 1.253507090792846, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5034, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.1627906976744187, |
|
"grad_norm": 1.2979047842512628, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5038, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.1821705426356588, |
|
"grad_norm": 1.2576281990523717, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5137, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.2015503875968991, |
|
"grad_norm": 1.2748802075166439, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5076, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.2209302325581395, |
|
"grad_norm": 1.5223812042187916, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5077, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.2403100775193798, |
|
"grad_norm": 1.3632540604107077, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5083, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.2596899224806202, |
|
"grad_norm": 1.2321831811826818, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5096, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.2790697674418605, |
|
"grad_norm": 1.3323190632353188, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5063, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.2984496124031009, |
|
"grad_norm": 1.2066604050023704, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5122, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.3178294573643412, |
|
"grad_norm": 1.186727847270962, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5121, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.3372093023255813, |
|
"grad_norm": 1.397263671569467, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5089, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.3565891472868217, |
|
"grad_norm": 1.573664700028339, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5099, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.375968992248062, |
|
"grad_norm": 1.4467914138897073, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5134, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.3953488372093024, |
|
"grad_norm": 1.3706902755992394, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5123, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.4147286821705427, |
|
"grad_norm": 1.2411084475779852, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5078, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.4341085271317828, |
|
"grad_norm": 1.5902510539142722, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5137, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.4534883720930232, |
|
"grad_norm": 1.7787034613442634, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5094, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.4728682170542635, |
|
"grad_norm": 1.2787619752439543, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5063, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.4922480620155039, |
|
"grad_norm": 1.2898677794347344, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5097, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.5116279069767442, |
|
"grad_norm": 1.2320340010541546, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5127, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.5310077519379846, |
|
"grad_norm": 1.1998558863263413, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5097, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.550387596899225, |
|
"grad_norm": 1.2491228211939762, |
|
"learning_rate": 2e-06, |
|
"loss": 0.507, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.5697674418604652, |
|
"grad_norm": 1.2235970378609549, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5052, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.5891472868217056, |
|
"grad_norm": 1.4176990973616905, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5039, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.6085271317829457, |
|
"grad_norm": 1.3149977315253063, |
|
"learning_rate": 2e-06, |
|
"loss": 0.512, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.627906976744186, |
|
"grad_norm": 1.2333272112594988, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5066, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.6472868217054264, |
|
"grad_norm": 1.1574106550887124, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5113, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 1.418576038122765, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5137, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.6860465116279069, |
|
"grad_norm": 1.187391106828372, |
|
"learning_rate": 2e-06, |
|
"loss": 0.506, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.7054263565891472, |
|
"grad_norm": 1.170649147011855, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5107, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.7248062015503876, |
|
"grad_norm": 1.3798145976951228, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5099, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.744186046511628, |
|
"grad_norm": 1.725987629268818, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5139, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.7635658914728682, |
|
"grad_norm": 1.8813904948358928, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5059, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.7829457364341086, |
|
"grad_norm": 1.8516754910417244, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5088, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.802325581395349, |
|
"grad_norm": 1.2095051182172416, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5096, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.8217054263565893, |
|
"grad_norm": 1.2435069711568396, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5077, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.8410852713178296, |
|
"grad_norm": 1.3586556706802664, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5102, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.8604651162790697, |
|
"grad_norm": 1.1979327298252027, |
|
"learning_rate": 2e-06, |
|
"loss": 0.512, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.87984496124031, |
|
"grad_norm": 1.7904993765105046, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5096, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.8992248062015504, |
|
"grad_norm": 1.377601184927356, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5057, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.9186046511627906, |
|
"grad_norm": 1.1816932854326225, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5118, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.937984496124031, |
|
"grad_norm": 1.235638000745403, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5092, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.9573643410852712, |
|
"grad_norm": 1.2730535171048605, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5104, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.9767441860465116, |
|
"grad_norm": 1.2382473568182877, |
|
"learning_rate": 2e-06, |
|
"loss": 0.5059, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.996124031007752, |
|
"grad_norm": 1.2856863827108878, |
|
"learning_rate": 2e-06, |
|
"loss": 0.51, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.0155038759689923, |
|
"grad_norm": 1.6836529854231972, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4603, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.0348837209302326, |
|
"grad_norm": 1.4922089047807388, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4585, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.054263565891473, |
|
"grad_norm": 1.3990249605527378, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4571, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.0736434108527133, |
|
"grad_norm": 1.5500783972286114, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4591, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.0930232558139537, |
|
"grad_norm": 1.4063766317234851, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4537, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.112403100775194, |
|
"grad_norm": 1.4498154054826256, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4567, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.1317829457364343, |
|
"grad_norm": 1.4659238804284036, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4564, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.1511627906976742, |
|
"grad_norm": 1.4331071288445956, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4597, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.1705426356589146, |
|
"grad_norm": 1.2265580968692957, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4613, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.189922480620155, |
|
"grad_norm": 1.430430093258902, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4582, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.2093023255813953, |
|
"grad_norm": 1.3191685780949374, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4567, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.2286821705426356, |
|
"grad_norm": 1.4143557981830728, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4593, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.248062015503876, |
|
"grad_norm": 1.2038927572265354, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4571, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.2674418604651163, |
|
"grad_norm": 1.5101700084835745, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4576, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.2868217054263567, |
|
"grad_norm": 1.3238249446175274, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4648, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.306201550387597, |
|
"grad_norm": 1.4453440150118313, |
|
"learning_rate": 2e-06, |
|
"loss": 0.46, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.3255813953488373, |
|
"grad_norm": 1.291302766406732, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4564, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.3449612403100777, |
|
"grad_norm": 1.3371668345858843, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4605, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.3643410852713176, |
|
"grad_norm": 1.4583613955952517, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4639, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.383720930232558, |
|
"grad_norm": 1.3603243092911215, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4591, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.4031007751937983, |
|
"grad_norm": 1.4381588797038276, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4654, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.4224806201550386, |
|
"grad_norm": 1.2897514442264095, |
|
"learning_rate": 2e-06, |
|
"loss": 0.46, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.441860465116279, |
|
"grad_norm": 1.4149740934308317, |
|
"learning_rate": 2e-06, |
|
"loss": 0.462, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.4612403100775193, |
|
"grad_norm": 1.2334708007117, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4636, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.4806201550387597, |
|
"grad_norm": 1.3277525646372448, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4637, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.341129908785728, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4572, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.5193798449612403, |
|
"grad_norm": 1.2028868010871578, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4643, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.5387596899224807, |
|
"grad_norm": 1.2788628805207698, |
|
"learning_rate": 2e-06, |
|
"loss": 0.458, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.558139534883721, |
|
"grad_norm": 1.1964555955603347, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4702, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.5775193798449614, |
|
"grad_norm": 1.2630513178641603, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4622, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.5968992248062017, |
|
"grad_norm": 1.4987867600057845, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4708, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.616279069767442, |
|
"grad_norm": 1.328908367081974, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4622, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.6356589147286824, |
|
"grad_norm": 1.572958887182858, |
|
"learning_rate": 2e-06, |
|
"loss": 0.459, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.6550387596899228, |
|
"grad_norm": 1.2305373948782317, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4597, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.6744186046511627, |
|
"grad_norm": 1.231294042248163, |
|
"learning_rate": 2e-06, |
|
"loss": 0.468, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.693798449612403, |
|
"grad_norm": 1.2696874963913714, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4598, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.7131782945736433, |
|
"grad_norm": 1.2272319783021322, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4678, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.7325581395348837, |
|
"grad_norm": 1.3352293934792803, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4594, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.751937984496124, |
|
"grad_norm": 1.306891032131746, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4613, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.7713178294573644, |
|
"grad_norm": 1.2665315216624808, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4667, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.7906976744186047, |
|
"grad_norm": 1.3240308251203166, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4656, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.810077519379845, |
|
"grad_norm": 1.4793024854316217, |
|
"learning_rate": 2e-06, |
|
"loss": 0.463, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.8294573643410854, |
|
"grad_norm": 1.4217782757634918, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4703, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.8488372093023253, |
|
"grad_norm": 1.2290713413876615, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4661, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.8682170542635657, |
|
"grad_norm": 1.4408298205910421, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4667, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.887596899224806, |
|
"grad_norm": 1.4463509582697884, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4692, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.9069767441860463, |
|
"grad_norm": 1.3757313989698736, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4585, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.9263565891472867, |
|
"grad_norm": 1.3950705796217753, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4627, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.945736434108527, |
|
"grad_norm": 1.269632815277089, |
|
"learning_rate": 2e-06, |
|
"loss": 0.4679, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.9651162790697674, |
|
"grad_norm": 1.3792098795510677, |
|
"learning_rate": 2e-06, |
|
"loss": 0.463, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.9844961240310077, |
|
"grad_norm": 1.2723166198917764, |
|
"learning_rate": 2e-06, |
|
"loss": 0.468, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 1548, |
|
"total_flos": 2591282618695680.0, |
|
"train_loss": 0.1541580106552873, |
|
"train_runtime": 7454.5076, |
|
"train_samples_per_second": 106.232, |
|
"train_steps_per_second": 0.208 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1548, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2591282618695680.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|