{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.993258426966292,
  "eval_steps": 500,
  "global_step": 999,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0299625468164794,
      "grad_norm": 2.159330801984932,
      "learning_rate": 5e-06,
      "loss": 0.7997,
      "step": 10
    },
    {
      "epoch": 0.0599250936329588,
      "grad_norm": 2.575219453584873,
      "learning_rate": 5e-06,
      "loss": 0.7216,
      "step": 20
    },
    {
      "epoch": 0.0898876404494382,
      "grad_norm": 0.9472938732683952,
      "learning_rate": 5e-06,
      "loss": 0.7072,
      "step": 30
    },
    {
      "epoch": 0.1198501872659176,
      "grad_norm": 0.9037652307729707,
      "learning_rate": 5e-06,
      "loss": 0.686,
      "step": 40
    },
    {
      "epoch": 0.149812734082397,
      "grad_norm": 0.9332908648588308,
      "learning_rate": 5e-06,
      "loss": 0.6654,
      "step": 50
    },
    {
      "epoch": 0.1797752808988764,
      "grad_norm": 2.546264321590982,
      "learning_rate": 5e-06,
      "loss": 0.6592,
      "step": 60
    },
    {
      "epoch": 0.20973782771535582,
      "grad_norm": 0.8963518726180082,
      "learning_rate": 5e-06,
      "loss": 0.6539,
      "step": 70
    },
    {
      "epoch": 0.2397003745318352,
      "grad_norm": 0.6474151088056208,
      "learning_rate": 5e-06,
      "loss": 0.6484,
      "step": 80
    },
    {
      "epoch": 0.2696629213483146,
      "grad_norm": 0.7184102062533572,
      "learning_rate": 5e-06,
      "loss": 0.6455,
      "step": 90
    },
    {
      "epoch": 0.299625468164794,
      "grad_norm": 0.7912271314589597,
      "learning_rate": 5e-06,
      "loss": 0.6422,
      "step": 100
    },
    {
      "epoch": 0.3295880149812734,
      "grad_norm": 0.5702593280439339,
      "learning_rate": 5e-06,
      "loss": 0.6335,
      "step": 110
    },
    {
      "epoch": 0.3595505617977528,
      "grad_norm": 0.6067788640506643,
      "learning_rate": 5e-06,
      "loss": 0.6252,
      "step": 120
    },
    {
      "epoch": 0.3895131086142322,
      "grad_norm": 0.7543519184412004,
      "learning_rate": 5e-06,
      "loss": 0.6278,
      "step": 130
    },
    {
      "epoch": 0.41947565543071164,
      "grad_norm": 1.3158399586116993,
      "learning_rate": 5e-06,
      "loss": 0.6344,
      "step": 140
    },
    {
      "epoch": 0.449438202247191,
      "grad_norm": 0.9280984142139298,
      "learning_rate": 5e-06,
      "loss": 0.6261,
      "step": 150
    },
    {
      "epoch": 0.4794007490636704,
      "grad_norm": 0.8050698415939164,
      "learning_rate": 5e-06,
      "loss": 0.6177,
      "step": 160
    },
    {
      "epoch": 0.5093632958801498,
      "grad_norm": 0.6632585021759287,
      "learning_rate": 5e-06,
      "loss": 0.622,
      "step": 170
    },
    {
      "epoch": 0.5393258426966292,
      "grad_norm": 0.7452935828094429,
      "learning_rate": 5e-06,
      "loss": 0.6223,
      "step": 180
    },
    {
      "epoch": 0.5692883895131086,
      "grad_norm": 0.5259568899134429,
      "learning_rate": 5e-06,
      "loss": 0.6183,
      "step": 190
    },
    {
      "epoch": 0.599250936329588,
      "grad_norm": 1.0908063668428212,
      "learning_rate": 5e-06,
      "loss": 0.6164,
      "step": 200
    },
    {
      "epoch": 0.6292134831460674,
      "grad_norm": 0.5904142262846527,
      "learning_rate": 5e-06,
      "loss": 0.6247,
      "step": 210
    },
    {
      "epoch": 0.6591760299625468,
      "grad_norm": 0.48963729877546575,
      "learning_rate": 5e-06,
      "loss": 0.6097,
      "step": 220
    },
    {
      "epoch": 0.6891385767790262,
      "grad_norm": 0.5647584677724115,
      "learning_rate": 5e-06,
      "loss": 0.6193,
      "step": 230
    },
    {
      "epoch": 0.7191011235955056,
      "grad_norm": 0.6352290021696486,
      "learning_rate": 5e-06,
      "loss": 0.6183,
      "step": 240
    },
    {
      "epoch": 0.7490636704119851,
      "grad_norm": 0.5114633845114385,
      "learning_rate": 5e-06,
      "loss": 0.6163,
      "step": 250
    },
    {
      "epoch": 0.7790262172284644,
      "grad_norm": 0.7063368471913241,
      "learning_rate": 5e-06,
      "loss": 0.6093,
      "step": 260
    },
    {
      "epoch": 0.8089887640449438,
      "grad_norm": 0.9028219544074879,
      "learning_rate": 5e-06,
      "loss": 0.6135,
      "step": 270
    },
    {
      "epoch": 0.8389513108614233,
      "grad_norm": 0.6542303935292434,
      "learning_rate": 5e-06,
      "loss": 0.6146,
      "step": 280
    },
    {
      "epoch": 0.8689138576779026,
      "grad_norm": 0.9490011650791124,
      "learning_rate": 5e-06,
      "loss": 0.6258,
      "step": 290
    },
    {
      "epoch": 0.898876404494382,
      "grad_norm": 0.5141275957416789,
      "learning_rate": 5e-06,
      "loss": 0.608,
      "step": 300
    },
    {
      "epoch": 0.9288389513108615,
      "grad_norm": 0.4695093914592938,
      "learning_rate": 5e-06,
      "loss": 0.6086,
      "step": 310
    },
    {
      "epoch": 0.9588014981273408,
      "grad_norm": 0.4886269533641591,
      "learning_rate": 5e-06,
      "loss": 0.6055,
      "step": 320
    },
    {
      "epoch": 0.9887640449438202,
      "grad_norm": 0.4785326651206929,
      "learning_rate": 5e-06,
      "loss": 0.6122,
      "step": 330
    },
    {
      "epoch": 0.9977528089887641,
      "eval_loss": 0.6095167398452759,
      "eval_runtime": 180.8435,
      "eval_samples_per_second": 49.723,
      "eval_steps_per_second": 0.393,
      "step": 333
    },
    {
      "epoch": 1.0187265917602997,
      "grad_norm": 0.8136150743659245,
      "learning_rate": 5e-06,
      "loss": 0.5787,
      "step": 340
    },
    {
      "epoch": 1.048689138576779,
      "grad_norm": 0.5667433740151928,
      "learning_rate": 5e-06,
      "loss": 0.5582,
      "step": 350
    },
    {
      "epoch": 1.0786516853932584,
      "grad_norm": 0.5036359850721361,
      "learning_rate": 5e-06,
      "loss": 0.5638,
      "step": 360
    },
    {
      "epoch": 1.1086142322097379,
      "grad_norm": 0.5623087149261949,
      "learning_rate": 5e-06,
      "loss": 0.5599,
      "step": 370
    },
    {
      "epoch": 1.1385767790262173,
      "grad_norm": 0.488809840644991,
      "learning_rate": 5e-06,
      "loss": 0.5619,
      "step": 380
    },
    {
      "epoch": 1.1685393258426966,
      "grad_norm": 0.7591237017789003,
      "learning_rate": 5e-06,
      "loss": 0.5617,
      "step": 390
    },
    {
      "epoch": 1.198501872659176,
      "grad_norm": 0.7394067985885456,
      "learning_rate": 5e-06,
      "loss": 0.554,
      "step": 400
    },
    {
      "epoch": 1.2284644194756553,
      "grad_norm": 0.6131933237418792,
      "learning_rate": 5e-06,
      "loss": 0.5646,
      "step": 410
    },
    {
      "epoch": 1.2584269662921348,
      "grad_norm": 0.554150130587659,
      "learning_rate": 5e-06,
      "loss": 0.5653,
      "step": 420
    },
    {
      "epoch": 1.2883895131086143,
      "grad_norm": 0.5751376390924479,
      "learning_rate": 5e-06,
      "loss": 0.5581,
      "step": 430
    },
    {
      "epoch": 1.3183520599250937,
      "grad_norm": 0.5120666058669939,
      "learning_rate": 5e-06,
      "loss": 0.5651,
      "step": 440
    },
    {
      "epoch": 1.348314606741573,
      "grad_norm": 0.8368555832046994,
      "learning_rate": 5e-06,
      "loss": 0.5669,
      "step": 450
    },
    {
      "epoch": 1.3782771535580525,
      "grad_norm": 0.7407992028160174,
      "learning_rate": 5e-06,
      "loss": 0.554,
      "step": 460
    },
    {
      "epoch": 1.4082397003745317,
      "grad_norm": 0.5156759371159569,
      "learning_rate": 5e-06,
      "loss": 0.5588,
      "step": 470
    },
    {
      "epoch": 1.4382022471910112,
      "grad_norm": 0.49080996761818096,
      "learning_rate": 5e-06,
      "loss": 0.5632,
      "step": 480
    },
    {
      "epoch": 1.4681647940074907,
      "grad_norm": 0.4910042182094872,
      "learning_rate": 5e-06,
      "loss": 0.561,
      "step": 490
    },
    {
      "epoch": 1.4981273408239701,
      "grad_norm": 0.5255273950611783,
      "learning_rate": 5e-06,
      "loss": 0.5605,
      "step": 500
    },
    {
      "epoch": 1.5280898876404494,
      "grad_norm": 0.46327981869558943,
      "learning_rate": 5e-06,
      "loss": 0.5618,
      "step": 510
    },
    {
      "epoch": 1.5580524344569289,
      "grad_norm": 0.55012055750815,
      "learning_rate": 5e-06,
      "loss": 0.5568,
      "step": 520
    },
    {
      "epoch": 1.5880149812734081,
      "grad_norm": 0.5105338531939311,
      "learning_rate": 5e-06,
      "loss": 0.5672,
      "step": 530
    },
    {
      "epoch": 1.6179775280898876,
      "grad_norm": 0.5336419797454037,
      "learning_rate": 5e-06,
      "loss": 0.5593,
      "step": 540
    },
    {
      "epoch": 1.647940074906367,
      "grad_norm": 0.9436464610139725,
      "learning_rate": 5e-06,
      "loss": 0.5608,
      "step": 550
    },
    {
      "epoch": 1.6779026217228465,
      "grad_norm": 0.6016864999378152,
      "learning_rate": 5e-06,
      "loss": 0.5588,
      "step": 560
    },
    {
      "epoch": 1.7078651685393258,
      "grad_norm": 0.5010354324689145,
      "learning_rate": 5e-06,
      "loss": 0.5548,
      "step": 570
    },
    {
      "epoch": 1.7378277153558053,
      "grad_norm": 0.4589724783243399,
      "learning_rate": 5e-06,
      "loss": 0.5665,
      "step": 580
    },
    {
      "epoch": 1.7677902621722845,
      "grad_norm": 0.5015630178996147,
      "learning_rate": 5e-06,
      "loss": 0.5636,
      "step": 590
    },
    {
      "epoch": 1.797752808988764,
      "grad_norm": 0.5495613719565868,
      "learning_rate": 5e-06,
      "loss": 0.565,
      "step": 600
    },
    {
      "epoch": 1.8277153558052435,
      "grad_norm": 0.5539975776071888,
      "learning_rate": 5e-06,
      "loss": 0.562,
      "step": 610
    },
    {
      "epoch": 1.857677902621723,
      "grad_norm": 0.5053725200868951,
      "learning_rate": 5e-06,
      "loss": 0.551,
      "step": 620
    },
    {
      "epoch": 1.8876404494382022,
      "grad_norm": 0.4543290751635621,
      "learning_rate": 5e-06,
      "loss": 0.556,
      "step": 630
    },
    {
      "epoch": 1.9176029962546817,
      "grad_norm": 0.47157192643041534,
      "learning_rate": 5e-06,
      "loss": 0.5602,
      "step": 640
    },
    {
      "epoch": 1.947565543071161,
      "grad_norm": 0.4784340330073252,
      "learning_rate": 5e-06,
      "loss": 0.557,
      "step": 650
    },
    {
      "epoch": 1.9775280898876404,
      "grad_norm": 0.44605574199738396,
      "learning_rate": 5e-06,
      "loss": 0.562,
      "step": 660
    },
    {
      "epoch": 1.9985018726591761,
      "eval_loss": 0.6012518405914307,
      "eval_runtime": 181.0833,
      "eval_samples_per_second": 49.657,
      "eval_steps_per_second": 0.392,
      "step": 667
    },
    {
      "epoch": 2.00749063670412,
      "grad_norm": 0.8623975331438202,
      "learning_rate": 5e-06,
      "loss": 0.5432,
      "step": 670
    },
    {
      "epoch": 2.0374531835205993,
      "grad_norm": 0.6324789071436193,
      "learning_rate": 5e-06,
      "loss": 0.5019,
      "step": 680
    },
    {
      "epoch": 2.067415730337079,
      "grad_norm": 0.5758535175615167,
      "learning_rate": 5e-06,
      "loss": 0.5099,
      "step": 690
    },
    {
      "epoch": 2.097378277153558,
      "grad_norm": 0.6234430093296897,
      "learning_rate": 5e-06,
      "loss": 0.5112,
      "step": 700
    },
    {
      "epoch": 2.1273408239700373,
      "grad_norm": 0.5881227652440947,
      "learning_rate": 5e-06,
      "loss": 0.5022,
      "step": 710
    },
    {
      "epoch": 2.157303370786517,
      "grad_norm": 0.6192814926150049,
      "learning_rate": 5e-06,
      "loss": 0.5066,
      "step": 720
    },
    {
      "epoch": 2.1872659176029963,
      "grad_norm": 0.5117435754957025,
      "learning_rate": 5e-06,
      "loss": 0.5064,
      "step": 730
    },
    {
      "epoch": 2.2172284644194757,
      "grad_norm": 0.5619958642740768,
      "learning_rate": 5e-06,
      "loss": 0.5046,
      "step": 740
    },
    {
      "epoch": 2.247191011235955,
      "grad_norm": 0.8267859788370541,
      "learning_rate": 5e-06,
      "loss": 0.511,
      "step": 750
    },
    {
      "epoch": 2.2771535580524347,
      "grad_norm": 0.7317931221994743,
      "learning_rate": 5e-06,
      "loss": 0.5092,
      "step": 760
    },
    {
      "epoch": 2.3071161048689137,
      "grad_norm": 0.6283385492658163,
      "learning_rate": 5e-06,
      "loss": 0.5133,
      "step": 770
    },
    {
      "epoch": 2.337078651685393,
      "grad_norm": 0.5246018099952993,
      "learning_rate": 5e-06,
      "loss": 0.5131,
      "step": 780
    },
    {
      "epoch": 2.3670411985018727,
      "grad_norm": 0.5366582911908819,
      "learning_rate": 5e-06,
      "loss": 0.5163,
      "step": 790
    },
    {
      "epoch": 2.397003745318352,
      "grad_norm": 0.5455507417027214,
      "learning_rate": 5e-06,
      "loss": 0.5174,
      "step": 800
    },
    {
      "epoch": 2.4269662921348316,
      "grad_norm": 0.47939087317987383,
      "learning_rate": 5e-06,
      "loss": 0.5117,
      "step": 810
    },
    {
      "epoch": 2.4569288389513106,
      "grad_norm": 0.5848286494949987,
      "learning_rate": 5e-06,
      "loss": 0.5101,
      "step": 820
    },
    {
      "epoch": 2.48689138576779,
      "grad_norm": 0.5885231295942317,
      "learning_rate": 5e-06,
      "loss": 0.5098,
      "step": 830
    },
    {
      "epoch": 2.5168539325842696,
      "grad_norm": 0.49238621301397356,
      "learning_rate": 5e-06,
      "loss": 0.5128,
      "step": 840
    },
    {
      "epoch": 2.546816479400749,
      "grad_norm": 0.5319632499579365,
      "learning_rate": 5e-06,
      "loss": 0.5129,
      "step": 850
    },
    {
      "epoch": 2.5767790262172285,
      "grad_norm": 0.5235625882031714,
      "learning_rate": 5e-06,
      "loss": 0.5111,
      "step": 860
    },
    {
      "epoch": 2.606741573033708,
      "grad_norm": 0.5063693847116817,
      "learning_rate": 5e-06,
      "loss": 0.5125,
      "step": 870
    },
    {
      "epoch": 2.6367041198501875,
      "grad_norm": 0.5425085022408588,
      "learning_rate": 5e-06,
      "loss": 0.5067,
      "step": 880
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.49758835042040306,
      "learning_rate": 5e-06,
      "loss": 0.5112,
      "step": 890
    },
    {
      "epoch": 2.696629213483146,
      "grad_norm": 0.5128503684055458,
      "learning_rate": 5e-06,
      "loss": 0.5163,
      "step": 900
    },
    {
      "epoch": 2.7265917602996255,
      "grad_norm": 0.5751048660393648,
      "learning_rate": 5e-06,
      "loss": 0.5104,
      "step": 910
    },
    {
      "epoch": 2.756554307116105,
      "grad_norm": 0.6107618204066423,
      "learning_rate": 5e-06,
      "loss": 0.5124,
      "step": 920
    },
    {
      "epoch": 2.7865168539325844,
      "grad_norm": 0.4918065441260285,
      "learning_rate": 5e-06,
      "loss": 0.5062,
      "step": 930
    },
    {
      "epoch": 2.8164794007490634,
      "grad_norm": 0.49772220806864265,
      "learning_rate": 5e-06,
      "loss": 0.5119,
      "step": 940
    },
    {
      "epoch": 2.846441947565543,
      "grad_norm": 0.560475095793381,
      "learning_rate": 5e-06,
      "loss": 0.5134,
      "step": 950
    },
    {
      "epoch": 2.8764044943820224,
      "grad_norm": 0.5127175297281757,
      "learning_rate": 5e-06,
      "loss": 0.5117,
      "step": 960
    },
    {
      "epoch": 2.906367041198502,
      "grad_norm": 0.6130240398398701,
      "learning_rate": 5e-06,
      "loss": 0.5188,
      "step": 970
    },
    {
      "epoch": 2.9363295880149813,
      "grad_norm": 0.6513015524907453,
      "learning_rate": 5e-06,
      "loss": 0.5102,
      "step": 980
    },
    {
      "epoch": 2.966292134831461,
      "grad_norm": 0.5631449617834976,
      "learning_rate": 5e-06,
      "loss": 0.517,
      "step": 990
    },
    {
      "epoch": 2.993258426966292,
      "eval_loss": 0.605403482913971,
      "eval_runtime": 181.5085,
      "eval_samples_per_second": 49.54,
      "eval_steps_per_second": 0.391,
      "step": 999
    },
    {
      "epoch": 2.993258426966292,
      "step": 999,
      "total_flos": 1672943448883200.0,
      "train_loss": 0.5696583624716636,
      "train_runtime": 30032.5257,
      "train_samples_per_second": 17.065,
      "train_steps_per_second": 0.033
    }
  ],
  "logging_steps": 10,
  "max_steps": 999,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1672943448883200.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}