{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 8500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09411764705882353, "grad_norm": 0.30087053775787354, "learning_rate": 0.0002, "loss": 1.4482, "step": 200 }, { "epoch": 0.18823529411764706, "grad_norm": 0.26450973749160767, "learning_rate": 0.0002, "loss": 1.3122, "step": 400 }, { "epoch": 0.2823529411764706, "grad_norm": 0.2747337520122528, "learning_rate": 0.0002, "loss": 1.2564, "step": 600 }, { "epoch": 0.3764705882352941, "grad_norm": 0.291003555059433, "learning_rate": 0.0002, "loss": 1.2267, "step": 800 }, { "epoch": 0.47058823529411764, "grad_norm": 0.29030027985572815, "learning_rate": 0.0002, "loss": 1.2034, "step": 1000 }, { "epoch": 0.5647058823529412, "grad_norm": 0.41770946979522705, "learning_rate": 0.0002, "loss": 1.1856, "step": 1200 }, { "epoch": 0.6588235294117647, "grad_norm": 0.29782745242118835, "learning_rate": 0.0002, "loss": 1.1751, "step": 1400 }, { "epoch": 0.7529411764705882, "grad_norm": 0.28214672207832336, "learning_rate": 0.0002, "loss": 1.1574, "step": 1600 }, { "epoch": 0.8470588235294118, "grad_norm": 0.29813048243522644, "learning_rate": 0.0002, "loss": 1.1432, "step": 1800 }, { "epoch": 0.9411764705882353, "grad_norm": 0.3031373918056488, "learning_rate": 0.0002, "loss": 1.1341, "step": 2000 }, { "epoch": 1.0, "eval_loss": 1.1254174709320068, "eval_runtime": 443.4321, "eval_samples_per_second": 4.792, "eval_steps_per_second": 0.6, "step": 2125 }, { "epoch": 1.035294117647059, "grad_norm": 0.30628809332847595, "learning_rate": 0.0002, "loss": 1.111, "step": 2200 }, { "epoch": 1.1294117647058823, "grad_norm": 0.3264883756637573, "learning_rate": 0.0002, "loss": 1.0876, "step": 2400 }, { "epoch": 1.223529411764706, "grad_norm": 0.3304358720779419, "learning_rate": 0.0002, "loss": 1.0777, "step": 2600 }, { "epoch": 1.3176470588235294, "grad_norm": 0.3507118821144104, "learning_rate": 0.0002, "loss": 1.075, "step": 2800 }, { "epoch": 1.4117647058823528, "grad_norm": 0.34798240661621094, "learning_rate": 0.0002, "loss": 1.0705, "step": 3000 }, { "epoch": 1.5058823529411764, "grad_norm": 0.33348146080970764, "learning_rate": 0.0002, "loss": 1.0616, "step": 3200 }, { "epoch": 1.6, "grad_norm": 0.3142307698726654, "learning_rate": 0.0002, "loss": 1.0592, "step": 3400 }, { "epoch": 1.6941176470588235, "grad_norm": 0.33189332485198975, "learning_rate": 0.0002, "loss": 1.0563, "step": 3600 }, { "epoch": 1.788235294117647, "grad_norm": 0.31737592816352844, "learning_rate": 0.0002, "loss": 1.0508, "step": 3800 }, { "epoch": 1.8823529411764706, "grad_norm": 0.2998281717300415, "learning_rate": 0.0002, "loss": 1.0468, "step": 4000 }, { "epoch": 1.9764705882352942, "grad_norm": 0.34619805216789246, "learning_rate": 0.0002, "loss": 1.0422, "step": 4200 }, { "epoch": 2.0, "eval_loss": 1.061407208442688, "eval_runtime": 443.4749, "eval_samples_per_second": 4.792, "eval_steps_per_second": 0.6, "step": 4250 }, { "epoch": 2.070588235294118, "grad_norm": 0.35598576068878174, "learning_rate": 0.0002, "loss": 1.0008, "step": 4400 }, { "epoch": 2.164705882352941, "grad_norm": 0.3873290419578552, "learning_rate": 0.0002, "loss": 0.9917, "step": 4600 }, { "epoch": 2.2588235294117647, "grad_norm": 0.3637497127056122, "learning_rate": 0.0002, "loss": 0.9911, "step": 4800 }, { "epoch": 2.3529411764705883, "grad_norm": 0.35753560066223145, "learning_rate": 0.0002, "loss": 0.9929, "step": 5000 }, { "epoch": 2.447058823529412, "grad_norm": 0.3278402090072632, "learning_rate": 0.0002, "loss": 0.9876, "step": 5200 }, { "epoch": 2.541176470588235, "grad_norm": 0.3679386377334595, "learning_rate": 0.0002, "loss": 0.9842, "step": 5400 }, { "epoch": 2.635294117647059, "grad_norm": 0.3931664526462555, "learning_rate": 0.0002, "loss": 0.9885, "step": 5600 }, { "epoch": 2.7294117647058824, "grad_norm": 0.3553083539009094, "learning_rate": 0.0002, "loss": 0.9806, "step": 5800 }, { "epoch": 2.8235294117647056, "grad_norm": 0.37587428092956543, "learning_rate": 0.0002, "loss": 0.9796, "step": 6000 }, { "epoch": 2.9176470588235293, "grad_norm": 0.3934173285961151, "learning_rate": 0.0002, "loss": 0.9786, "step": 6200 }, { "epoch": 3.0, "eval_loss": 1.0245047807693481, "eval_runtime": 443.1629, "eval_samples_per_second": 4.795, "eval_steps_per_second": 0.6, "step": 6375 }, { "epoch": 3.011764705882353, "grad_norm": 0.4304977059364319, "learning_rate": 0.0002, "loss": 0.9719, "step": 6400 }, { "epoch": 3.1058823529411765, "grad_norm": 0.39775729179382324, "learning_rate": 0.0002, "loss": 0.9174, "step": 6600 }, { "epoch": 3.2, "grad_norm": 0.40233707427978516, "learning_rate": 0.0002, "loss": 0.9271, "step": 6800 }, { "epoch": 3.2941176470588234, "grad_norm": 0.39777445793151855, "learning_rate": 0.0002, "loss": 0.9155, "step": 7000 }, { "epoch": 3.388235294117647, "grad_norm": 0.4547841548919678, "learning_rate": 0.0002, "loss": 0.9265, "step": 7200 }, { "epoch": 3.4823529411764707, "grad_norm": 0.3900696039199829, "learning_rate": 0.0002, "loss": 0.9314, "step": 7400 }, { "epoch": 3.576470588235294, "grad_norm": 0.5135142207145691, "learning_rate": 0.0002, "loss": 0.9302, "step": 7600 }, { "epoch": 3.6705882352941175, "grad_norm": 0.40233081579208374, "learning_rate": 0.0002, "loss": 0.9227, "step": 7800 }, { "epoch": 3.764705882352941, "grad_norm": 0.40172523260116577, "learning_rate": 0.0002, "loss": 0.9268, "step": 8000 }, { "epoch": 3.8588235294117648, "grad_norm": 0.38751304149627686, "learning_rate": 0.0002, "loss": 0.9243, "step": 8200 }, { "epoch": 3.9529411764705884, "grad_norm": 0.39530816674232483, "learning_rate": 0.0002, "loss": 0.9293, "step": 8400 }, { "epoch": 4.0, "eval_loss": 1.0030452013015747, "eval_runtime": 443.7686, "eval_samples_per_second": 4.789, "eval_steps_per_second": 0.599, "step": 8500 } ], "logging_steps": 200, "max_steps": 8500, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.981816312902189e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }