{ "best_metric": null, "best_model_checkpoint": null, "epoch": 97.44590163934426, "eval_steps": 500, "global_step": 3800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.2885245901639344, "grad_norm": 49.25, "learning_rate": 0.00019747235387045816, "loss": 6.9218, "step": 50 }, { "epoch": 2.577049180327869, "grad_norm": 73.5, "learning_rate": 0.0001948393891521854, "loss": 3.5446, "step": 100 }, { "epoch": 3.865573770491803, "grad_norm": 58.25, "learning_rate": 0.0001922064244339126, "loss": 3.191, "step": 150 }, { "epoch": 5.131147540983607, "grad_norm": 49.0, "learning_rate": 0.00018957345971563983, "loss": 2.9104, "step": 200 }, { "epoch": 6.419672131147541, "grad_norm": 66.5, "learning_rate": 0.00018694049499736707, "loss": 2.0795, "step": 250 }, { "epoch": 7.7081967213114755, "grad_norm": 45.75, "learning_rate": 0.00018430753027909427, "loss": 2.3055, "step": 300 }, { "epoch": 8.99672131147541, "grad_norm": 56.25, "learning_rate": 0.0001816745655608215, "loss": 1.8394, "step": 350 }, { "epoch": 10.262295081967213, "grad_norm": 50.75, "learning_rate": 0.00017904160084254874, "loss": 1.5723, "step": 400 }, { "epoch": 11.550819672131148, "grad_norm": 48.5, "learning_rate": 0.00017640863612427594, "loss": 1.4006, "step": 450 }, { "epoch": 12.839344262295082, "grad_norm": 39.75, "learning_rate": 0.00017377567140600318, "loss": 1.363, "step": 500 }, { "epoch": 14.104918032786886, "grad_norm": 38.5, "learning_rate": 0.0001711427066877304, "loss": 1.3352, "step": 550 }, { "epoch": 15.39344262295082, "grad_norm": 45.0, "learning_rate": 0.00016850974196945762, "loss": 1.1165, "step": 600 }, { "epoch": 16.681967213114753, "grad_norm": 44.0, "learning_rate": 0.00016587677725118485, "loss": 0.8736, "step": 650 }, { "epoch": 17.970491803278687, "grad_norm": 43.5, "learning_rate": 0.00016324381253291208, "loss": 1.0635, "step": 700 }, { "epoch": 19.236065573770492, "grad_norm": 34.25, "learning_rate": 0.0001606108478146393, "loss": 0.7858, "step": 750 }, { "epoch": 20.524590163934427, "grad_norm": 37.25, "learning_rate": 0.00015797788309636652, "loss": 0.8236, "step": 800 }, { "epoch": 21.81311475409836, "grad_norm": 35.5, "learning_rate": 0.00015534491837809376, "loss": 0.7766, "step": 850 }, { "epoch": 23.078688524590163, "grad_norm": 33.0, "learning_rate": 0.00015271195365982096, "loss": 0.6612, "step": 900 }, { "epoch": 24.367213114754097, "grad_norm": 33.75, "learning_rate": 0.0001500789889415482, "loss": 0.6364, "step": 950 }, { "epoch": 25.65573770491803, "grad_norm": 38.25, "learning_rate": 0.00014744602422327543, "loss": 0.6553, "step": 1000 }, { "epoch": 26.944262295081966, "grad_norm": 29.25, "learning_rate": 0.00014481305950500263, "loss": 0.5468, "step": 1050 }, { "epoch": 28.20983606557377, "grad_norm": 35.25, "learning_rate": 0.00014218009478672987, "loss": 0.5311, "step": 1100 }, { "epoch": 29.498360655737706, "grad_norm": 27.75, "learning_rate": 0.0001395471300684571, "loss": 0.5019, "step": 1150 }, { "epoch": 30.78688524590164, "grad_norm": 28.125, "learning_rate": 0.0001369141653501843, "loss": 0.6387, "step": 1200 }, { "epoch": 32.05245901639344, "grad_norm": 38.75, "learning_rate": 0.00013428120063191154, "loss": 0.5054, "step": 1250 }, { "epoch": 33.34098360655738, "grad_norm": 21.875, "learning_rate": 0.00013164823591363877, "loss": 0.4805, "step": 1300 }, { "epoch": 34.62950819672131, "grad_norm": 29.375, "learning_rate": 0.00012901527119536598, "loss": 0.5118, "step": 1350 }, { "epoch": 
35.91803278688525, "grad_norm": 36.0, "learning_rate": 0.0001263823064770932, "loss": 0.447, "step": 1400 }, { "epoch": 37.18360655737705, "grad_norm": 24.125, "learning_rate": 0.00012374934175882045, "loss": 0.3921, "step": 1450 }, { "epoch": 38.472131147540985, "grad_norm": 21.875, "learning_rate": 0.00012111637704054765, "loss": 0.4268, "step": 1500 }, { "epoch": 39.760655737704916, "grad_norm": 22.25, "learning_rate": 0.00011848341232227489, "loss": 0.3317, "step": 1550 }, { "epoch": 41.02622950819672, "grad_norm": 15.0625, "learning_rate": 0.00011585044760400212, "loss": 0.387, "step": 1600 }, { "epoch": 42.31475409836066, "grad_norm": 20.875, "learning_rate": 0.00011321748288572934, "loss": 0.3285, "step": 1650 }, { "epoch": 43.60327868852459, "grad_norm": 21.375, "learning_rate": 0.00011058451816745656, "loss": 0.3281, "step": 1700 }, { "epoch": 44.89180327868853, "grad_norm": 22.75, "learning_rate": 0.00010795155344918379, "loss": 0.3148, "step": 1750 }, { "epoch": 46.157377049180326, "grad_norm": 18.75, "learning_rate": 0.00010531858873091101, "loss": 0.2567, "step": 1800 }, { "epoch": 47.445901639344264, "grad_norm": 23.75, "learning_rate": 0.00010268562401263824, "loss": 0.2609, "step": 1850 }, { "epoch": 48.734426229508195, "grad_norm": 18.75, "learning_rate": 0.00010005265929436546, "loss": 0.2365, "step": 1900 }, { "epoch": 50.0, "grad_norm": 5.09375, "learning_rate": 9.74196945760927e-05, "loss": 0.2555, "step": 1950 }, { "epoch": 51.28852459016394, "grad_norm": 11.9375, "learning_rate": 9.478672985781992e-05, "loss": 0.2184, "step": 2000 }, { "epoch": 52.57704918032787, "grad_norm": 12.8125, "learning_rate": 9.215376513954714e-05, "loss": 0.2279, "step": 2050 }, { "epoch": 53.86557377049181, "grad_norm": 13.8125, "learning_rate": 8.952080042127437e-05, "loss": 0.202, "step": 2100 }, { "epoch": 55.131147540983605, "grad_norm": 13.0625, "learning_rate": 8.688783570300159e-05, "loss": 0.1651, "step": 2150 }, { "epoch": 56.41967213114754, "grad_norm": 11.4375, "learning_rate": 8.425487098472881e-05, "loss": 0.2015, "step": 2200 }, { "epoch": 57.708196721311474, "grad_norm": 16.375, "learning_rate": 8.162190626645604e-05, "loss": 0.1504, "step": 2250 }, { "epoch": 58.99672131147541, "grad_norm": 13.0625, "learning_rate": 7.898894154818326e-05, "loss": 0.1725, "step": 2300 }, { "epoch": 60.26229508196721, "grad_norm": 13.6875, "learning_rate": 7.635597682991048e-05, "loss": 0.1499, "step": 2350 }, { "epoch": 61.55081967213115, "grad_norm": 7.59375, "learning_rate": 7.372301211163771e-05, "loss": 0.145, "step": 2400 }, { "epoch": 62.83934426229508, "grad_norm": 7.0625, "learning_rate": 7.109004739336493e-05, "loss": 0.1379, "step": 2450 }, { "epoch": 64.10491803278688, "grad_norm": 4.15625, "learning_rate": 6.845708267509215e-05, "loss": 0.1244, "step": 2500 }, { "epoch": 65.39344262295081, "grad_norm": 7.0, "learning_rate": 6.582411795681939e-05, "loss": 0.1214, "step": 2550 }, { "epoch": 66.68196721311476, "grad_norm": 9.3125, "learning_rate": 6.31911532385466e-05, "loss": 0.1341, "step": 2600 }, { "epoch": 67.97049180327869, "grad_norm": 7.09375, "learning_rate": 6.0558188520273826e-05, "loss": 0.1201, "step": 2650 }, { "epoch": 69.23606557377049, "grad_norm": 20.5, "learning_rate": 5.792522380200106e-05, "loss": 0.1049, "step": 2700 }, { "epoch": 70.52459016393442, "grad_norm": 5.90625, "learning_rate": 5.529225908372828e-05, "loss": 0.1033, "step": 2750 }, { "epoch": 71.81311475409836, "grad_norm": 3.25, "learning_rate": 5.2659294365455505e-05, "loss": 0.1028, "step": 2800 }, 
{ "epoch": 73.07868852459016, "grad_norm": 8.0625, "learning_rate": 5.002632964718273e-05, "loss": 0.1003, "step": 2850 }, { "epoch": 74.3672131147541, "grad_norm": 6.9375, "learning_rate": 4.739336492890996e-05, "loss": 0.0993, "step": 2900 }, { "epoch": 75.65573770491804, "grad_norm": 3.734375, "learning_rate": 4.4760400210637185e-05, "loss": 0.0988, "step": 2950 }, { "epoch": 76.94426229508197, "grad_norm": 4.84375, "learning_rate": 4.2127435492364404e-05, "loss": 0.0885, "step": 3000 }, { "epoch": 78.20983606557377, "grad_norm": 2.71875, "learning_rate": 3.949447077409163e-05, "loss": 0.0816, "step": 3050 }, { "epoch": 79.4983606557377, "grad_norm": 1.59375, "learning_rate": 3.686150605581886e-05, "loss": 0.0969, "step": 3100 }, { "epoch": 80.78688524590164, "grad_norm": 2.5625, "learning_rate": 3.422854133754608e-05, "loss": 0.0886, "step": 3150 }, { "epoch": 82.05245901639344, "grad_norm": 4.6875, "learning_rate": 3.15955766192733e-05, "loss": 0.0801, "step": 3200 }, { "epoch": 83.34098360655737, "grad_norm": 2.53125, "learning_rate": 2.896261190100053e-05, "loss": 0.0888, "step": 3250 }, { "epoch": 84.62950819672132, "grad_norm": 4.1875, "learning_rate": 2.6329647182727753e-05, "loss": 0.0872, "step": 3300 }, { "epoch": 85.91803278688525, "grad_norm": 2.9375, "learning_rate": 2.369668246445498e-05, "loss": 0.0807, "step": 3350 }, { "epoch": 87.18360655737705, "grad_norm": 2.84375, "learning_rate": 2.1063717746182202e-05, "loss": 0.0779, "step": 3400 }, { "epoch": 88.47213114754098, "grad_norm": 1.8125, "learning_rate": 1.843075302790943e-05, "loss": 0.0741, "step": 3450 }, { "epoch": 89.76065573770492, "grad_norm": 2.171875, "learning_rate": 1.579778830963665e-05, "loss": 0.0833, "step": 3500 }, { "epoch": 91.02622950819672, "grad_norm": 2.484375, "learning_rate": 1.3164823591363876e-05, "loss": 0.0861, "step": 3550 }, { "epoch": 92.31475409836065, "grad_norm": 2.046875, "learning_rate": 1.0531858873091101e-05, "loss": 0.08, "step": 3600 }, { "epoch": 93.6032786885246, "grad_norm": 2.84375, "learning_rate": 7.898894154818326e-06, "loss": 0.0785, "step": 3650 }, { "epoch": 94.89180327868853, "grad_norm": 2.28125, "learning_rate": 5.2659294365455505e-06, "loss": 0.0936, "step": 3700 }, { "epoch": 96.15737704918033, "grad_norm": 1.7734375, "learning_rate": 2.6329647182727753e-06, "loss": 0.0741, "step": 3750 }, { "epoch": 97.44590163934426, "grad_norm": 2.21875, "learning_rate": 0.0, "loss": 0.0927, "step": 3800 } ], "logging_steps": 50, "max_steps": 3800, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.470967617037125e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }