{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.993258426966292, "eval_steps": 500, "global_step": 999, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0299625468164794, "grad_norm": 2.159330801984932, "learning_rate": 5e-06, "loss": 0.7997, "step": 10 }, { "epoch": 0.0599250936329588, "grad_norm": 2.575219453584873, "learning_rate": 5e-06, "loss": 0.7216, "step": 20 }, { "epoch": 0.0898876404494382, "grad_norm": 0.9472938732683952, "learning_rate": 5e-06, "loss": 0.7072, "step": 30 }, { "epoch": 0.1198501872659176, "grad_norm": 0.9037652307729707, "learning_rate": 5e-06, "loss": 0.686, "step": 40 }, { "epoch": 0.149812734082397, "grad_norm": 0.9332908648588308, "learning_rate": 5e-06, "loss": 0.6654, "step": 50 }, { "epoch": 0.1797752808988764, "grad_norm": 2.546264321590982, "learning_rate": 5e-06, "loss": 0.6592, "step": 60 }, { "epoch": 0.20973782771535582, "grad_norm": 0.8963518726180082, "learning_rate": 5e-06, "loss": 0.6539, "step": 70 }, { "epoch": 0.2397003745318352, "grad_norm": 0.6474151088056208, "learning_rate": 5e-06, "loss": 0.6484, "step": 80 }, { "epoch": 0.2696629213483146, "grad_norm": 0.7184102062533572, "learning_rate": 5e-06, "loss": 0.6455, "step": 90 }, { "epoch": 0.299625468164794, "grad_norm": 0.7912271314589597, "learning_rate": 5e-06, "loss": 0.6422, "step": 100 }, { "epoch": 0.3295880149812734, "grad_norm": 0.5702593280439339, "learning_rate": 5e-06, "loss": 0.6335, "step": 110 }, { "epoch": 0.3595505617977528, "grad_norm": 0.6067788640506643, "learning_rate": 5e-06, "loss": 0.6252, "step": 120 }, { "epoch": 0.3895131086142322, "grad_norm": 0.7543519184412004, "learning_rate": 5e-06, "loss": 0.6278, "step": 130 }, { "epoch": 0.41947565543071164, "grad_norm": 1.3158399586116993, "learning_rate": 5e-06, "loss": 0.6344, "step": 140 }, { "epoch": 0.449438202247191, "grad_norm": 0.9280984142139298, "learning_rate": 5e-06, "loss": 0.6261, "step": 150 }, { "epoch": 0.4794007490636704, "grad_norm": 0.8050698415939164, "learning_rate": 5e-06, "loss": 0.6177, "step": 160 }, { "epoch": 0.5093632958801498, "grad_norm": 0.6632585021759287, "learning_rate": 5e-06, "loss": 0.622, "step": 170 }, { "epoch": 0.5393258426966292, "grad_norm": 0.7452935828094429, "learning_rate": 5e-06, "loss": 0.6223, "step": 180 }, { "epoch": 0.5692883895131086, "grad_norm": 0.5259568899134429, "learning_rate": 5e-06, "loss": 0.6183, "step": 190 }, { "epoch": 0.599250936329588, "grad_norm": 1.0908063668428212, "learning_rate": 5e-06, "loss": 0.6164, "step": 200 }, { "epoch": 0.6292134831460674, "grad_norm": 0.5904142262846527, "learning_rate": 5e-06, "loss": 0.6247, "step": 210 }, { "epoch": 0.6591760299625468, "grad_norm": 0.48963729877546575, "learning_rate": 5e-06, "loss": 0.6097, "step": 220 }, { "epoch": 0.6891385767790262, "grad_norm": 0.5647584677724115, "learning_rate": 5e-06, "loss": 0.6193, "step": 230 }, { "epoch": 0.7191011235955056, "grad_norm": 0.6352290021696486, "learning_rate": 5e-06, "loss": 0.6183, "step": 240 }, { "epoch": 0.7490636704119851, "grad_norm": 0.5114633845114385, "learning_rate": 5e-06, "loss": 0.6163, "step": 250 }, { "epoch": 0.7790262172284644, "grad_norm": 0.7063368471913241, "learning_rate": 5e-06, "loss": 0.6093, "step": 260 }, { "epoch": 0.8089887640449438, "grad_norm": 0.9028219544074879, "learning_rate": 5e-06, "loss": 0.6135, "step": 270 }, { "epoch": 0.8389513108614233, "grad_norm": 0.6542303935292434, "learning_rate": 5e-06, "loss": 0.6146, "step": 280 }, { "epoch": 0.8689138576779026, "grad_norm": 0.9490011650791124, "learning_rate": 5e-06, "loss": 0.6258, "step": 290 }, { "epoch": 0.898876404494382, "grad_norm": 0.5141275957416789, "learning_rate": 5e-06, "loss": 0.608, "step": 300 }, { "epoch": 0.9288389513108615, "grad_norm": 0.4695093914592938, "learning_rate": 5e-06, "loss": 0.6086, "step": 310 }, { "epoch": 0.9588014981273408, "grad_norm": 0.4886269533641591, "learning_rate": 5e-06, "loss": 0.6055, "step": 320 }, { "epoch": 0.9887640449438202, "grad_norm": 0.4785326651206929, "learning_rate": 5e-06, "loss": 0.6122, "step": 330 }, { "epoch": 0.9977528089887641, "eval_loss": 0.6095167398452759, "eval_runtime": 180.8435, "eval_samples_per_second": 49.723, "eval_steps_per_second": 0.393, "step": 333 }, { "epoch": 1.0187265917602997, "grad_norm": 0.8136150743659245, "learning_rate": 5e-06, "loss": 0.5787, "step": 340 }, { "epoch": 1.048689138576779, "grad_norm": 0.5667433740151928, "learning_rate": 5e-06, "loss": 0.5582, "step": 350 }, { "epoch": 1.0786516853932584, "grad_norm": 0.5036359850721361, "learning_rate": 5e-06, "loss": 0.5638, "step": 360 }, { "epoch": 1.1086142322097379, "grad_norm": 0.5623087149261949, "learning_rate": 5e-06, "loss": 0.5599, "step": 370 }, { "epoch": 1.1385767790262173, "grad_norm": 0.488809840644991, "learning_rate": 5e-06, "loss": 0.5619, "step": 380 }, { "epoch": 1.1685393258426966, "grad_norm": 0.7591237017789003, "learning_rate": 5e-06, "loss": 0.5617, "step": 390 }, { "epoch": 1.198501872659176, "grad_norm": 0.7394067985885456, "learning_rate": 5e-06, "loss": 0.554, "step": 400 }, { "epoch": 1.2284644194756553, "grad_norm": 0.6131933237418792, "learning_rate": 5e-06, "loss": 0.5646, "step": 410 }, { "epoch": 1.2584269662921348, "grad_norm": 0.554150130587659, "learning_rate": 5e-06, "loss": 0.5653, "step": 420 }, { "epoch": 1.2883895131086143, "grad_norm": 0.5751376390924479, "learning_rate": 5e-06, "loss": 0.5581, "step": 430 }, { "epoch": 1.3183520599250937, "grad_norm": 0.5120666058669939, "learning_rate": 5e-06, "loss": 0.5651, "step": 440 }, { "epoch": 1.348314606741573, "grad_norm": 0.8368555832046994, "learning_rate": 5e-06, "loss": 0.5669, "step": 450 }, { "epoch": 1.3782771535580525, "grad_norm": 0.7407992028160174, "learning_rate": 5e-06, "loss": 0.554, "step": 460 }, { "epoch": 1.4082397003745317, "grad_norm": 0.5156759371159569, "learning_rate": 5e-06, "loss": 0.5588, "step": 470 }, { "epoch": 1.4382022471910112, "grad_norm": 0.49080996761818096, "learning_rate": 5e-06, "loss": 0.5632, "step": 480 }, { "epoch": 1.4681647940074907, "grad_norm": 0.4910042182094872, "learning_rate": 5e-06, "loss": 0.561, "step": 490 }, { "epoch": 1.4981273408239701, "grad_norm": 0.5255273950611783, "learning_rate": 5e-06, "loss": 0.5605, "step": 500 }, { "epoch": 1.5280898876404494, "grad_norm": 0.46327981869558943, "learning_rate": 5e-06, "loss": 0.5618, "step": 510 }, { "epoch": 1.5580524344569289, "grad_norm": 0.55012055750815, "learning_rate": 5e-06, "loss": 0.5568, "step": 520 }, { "epoch": 1.5880149812734081, "grad_norm": 0.5105338531939311, "learning_rate": 5e-06, "loss": 0.5672, "step": 530 }, { "epoch": 1.6179775280898876, "grad_norm": 0.5336419797454037, "learning_rate": 5e-06, "loss": 0.5593, "step": 540 }, { "epoch": 1.647940074906367, "grad_norm": 0.9436464610139725, "learning_rate": 5e-06, "loss": 0.5608, "step": 550 }, { "epoch": 1.6779026217228465, "grad_norm": 0.6016864999378152, "learning_rate": 5e-06, "loss": 0.5588, "step": 560 }, { "epoch": 1.7078651685393258, "grad_norm": 0.5010354324689145, "learning_rate": 5e-06, "loss": 0.5548, "step": 570 }, { "epoch": 1.7378277153558053, "grad_norm": 0.4589724783243399, "learning_rate": 5e-06, "loss": 0.5665, "step": 580 }, { "epoch": 1.7677902621722845, "grad_norm": 0.5015630178996147, "learning_rate": 5e-06, "loss": 0.5636, "step": 590 }, { "epoch": 1.797752808988764, "grad_norm": 0.5495613719565868, "learning_rate": 5e-06, "loss": 0.565, "step": 600 }, { "epoch": 1.8277153558052435, "grad_norm": 0.5539975776071888, "learning_rate": 5e-06, "loss": 0.562, "step": 610 }, { "epoch": 1.857677902621723, "grad_norm": 0.5053725200868951, "learning_rate": 5e-06, "loss": 0.551, "step": 620 }, { "epoch": 1.8876404494382022, "grad_norm": 0.4543290751635621, "learning_rate": 5e-06, "loss": 0.556, "step": 630 }, { "epoch": 1.9176029962546817, "grad_norm": 0.47157192643041534, "learning_rate": 5e-06, "loss": 0.5602, "step": 640 }, { "epoch": 1.947565543071161, "grad_norm": 0.4784340330073252, "learning_rate": 5e-06, "loss": 0.557, "step": 650 }, { "epoch": 1.9775280898876404, "grad_norm": 0.44605574199738396, "learning_rate": 5e-06, "loss": 0.562, "step": 660 }, { "epoch": 1.9985018726591761, "eval_loss": 0.6012518405914307, "eval_runtime": 181.0833, "eval_samples_per_second": 49.657, "eval_steps_per_second": 0.392, "step": 667 }, { "epoch": 2.00749063670412, "grad_norm": 0.8623975331438202, "learning_rate": 5e-06, "loss": 0.5432, "step": 670 }, { "epoch": 2.0374531835205993, "grad_norm": 0.6324789071436193, "learning_rate": 5e-06, "loss": 0.5019, "step": 680 }, { "epoch": 2.067415730337079, "grad_norm": 0.5758535175615167, "learning_rate": 5e-06, "loss": 0.5099, "step": 690 }, { "epoch": 2.097378277153558, "grad_norm": 0.6234430093296897, "learning_rate": 5e-06, "loss": 0.5112, "step": 700 }, { "epoch": 2.1273408239700373, "grad_norm": 0.5881227652440947, "learning_rate": 5e-06, "loss": 0.5022, "step": 710 }, { "epoch": 2.157303370786517, "grad_norm": 0.6192814926150049, "learning_rate": 5e-06, "loss": 0.5066, "step": 720 }, { "epoch": 2.1872659176029963, "grad_norm": 0.5117435754957025, "learning_rate": 5e-06, "loss": 0.5064, "step": 730 }, { "epoch": 2.2172284644194757, "grad_norm": 0.5619958642740768, "learning_rate": 5e-06, "loss": 0.5046, "step": 740 }, { "epoch": 2.247191011235955, "grad_norm": 0.8267859788370541, "learning_rate": 5e-06, "loss": 0.511, "step": 750 }, { "epoch": 2.2771535580524347, "grad_norm": 0.7317931221994743, "learning_rate": 5e-06, "loss": 0.5092, "step": 760 }, { "epoch": 2.3071161048689137, "grad_norm": 0.6283385492658163, "learning_rate": 5e-06, "loss": 0.5133, "step": 770 }, { "epoch": 2.337078651685393, "grad_norm": 0.5246018099952993, "learning_rate": 5e-06, "loss": 0.5131, "step": 780 }, { "epoch": 2.3670411985018727, "grad_norm": 0.5366582911908819, "learning_rate": 5e-06, "loss": 0.5163, "step": 790 }, { "epoch": 2.397003745318352, "grad_norm": 0.5455507417027214, "learning_rate": 5e-06, "loss": 0.5174, "step": 800 }, { "epoch": 2.4269662921348316, "grad_norm": 0.47939087317987383, "learning_rate": 5e-06, "loss": 0.5117, "step": 810 }, { "epoch": 2.4569288389513106, "grad_norm": 0.5848286494949987, "learning_rate": 5e-06, "loss": 0.5101, "step": 820 }, { "epoch": 2.48689138576779, "grad_norm": 0.5885231295942317, "learning_rate": 5e-06, "loss": 0.5098, "step": 830 }, { "epoch": 2.5168539325842696, "grad_norm": 0.49238621301397356, "learning_rate": 5e-06, "loss": 0.5128, "step": 840 }, { "epoch": 2.546816479400749, "grad_norm": 0.5319632499579365, "learning_rate": 5e-06, "loss": 0.5129, "step": 850 }, { "epoch": 2.5767790262172285, "grad_norm": 0.5235625882031714, "learning_rate": 5e-06, "loss": 0.5111, "step": 860 }, { "epoch": 2.606741573033708, "grad_norm": 0.5063693847116817, "learning_rate": 5e-06, "loss": 0.5125, "step": 870 }, { "epoch": 2.6367041198501875, "grad_norm": 0.5425085022408588, "learning_rate": 5e-06, "loss": 0.5067, "step": 880 }, { "epoch": 2.6666666666666665, "grad_norm": 0.49758835042040306, "learning_rate": 5e-06, "loss": 0.5112, "step": 890 }, { "epoch": 2.696629213483146, "grad_norm": 0.5128503684055458, "learning_rate": 5e-06, "loss": 0.5163, "step": 900 }, { "epoch": 2.7265917602996255, "grad_norm": 0.5751048660393648, "learning_rate": 5e-06, "loss": 0.5104, "step": 910 }, { "epoch": 2.756554307116105, "grad_norm": 0.6107618204066423, "learning_rate": 5e-06, "loss": 0.5124, "step": 920 }, { "epoch": 2.7865168539325844, "grad_norm": 0.4918065441260285, "learning_rate": 5e-06, "loss": 0.5062, "step": 930 }, { "epoch": 2.8164794007490634, "grad_norm": 0.49772220806864265, "learning_rate": 5e-06, "loss": 0.5119, "step": 940 }, { "epoch": 2.846441947565543, "grad_norm": 0.560475095793381, "learning_rate": 5e-06, "loss": 0.5134, "step": 950 }, { "epoch": 2.8764044943820224, "grad_norm": 0.5127175297281757, "learning_rate": 5e-06, "loss": 0.5117, "step": 960 }, { "epoch": 2.906367041198502, "grad_norm": 0.6130240398398701, "learning_rate": 5e-06, "loss": 0.5188, "step": 970 }, { "epoch": 2.9363295880149813, "grad_norm": 0.6513015524907453, "learning_rate": 5e-06, "loss": 0.5102, "step": 980 }, { "epoch": 2.966292134831461, "grad_norm": 0.5631449617834976, "learning_rate": 5e-06, "loss": 0.517, "step": 990 }, { "epoch": 2.993258426966292, "eval_loss": 0.605403482913971, "eval_runtime": 181.5085, "eval_samples_per_second": 49.54, "eval_steps_per_second": 0.391, "step": 999 }, { "epoch": 2.993258426966292, "step": 999, "total_flos": 1672943448883200.0, "train_loss": 0.5696583624716636, "train_runtime": 30032.5257, "train_samples_per_second": 17.065, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 999, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1672943448883200.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }