{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.4705882352941178, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014705882352941176, "eval_loss": 0.851502537727356, "eval_runtime": 17.1345, "eval_samples_per_second": 6.712, "eval_steps_per_second": 0.875, "step": 1 }, { "epoch": 0.04411764705882353, "grad_norm": 0.49747511744499207, "learning_rate": 1.5e-05, "loss": 0.8488, "step": 3 }, { "epoch": 0.08823529411764706, "grad_norm": 0.5120547413825989, "learning_rate": 3e-05, "loss": 0.8632, "step": 6 }, { "epoch": 0.1323529411764706, "grad_norm": 0.43988457322120667, "learning_rate": 4.5e-05, "loss": 0.8061, "step": 9 }, { "epoch": 0.1323529411764706, "eval_loss": 0.7254419326782227, "eval_runtime": 17.3867, "eval_samples_per_second": 6.614, "eval_steps_per_second": 0.863, "step": 9 }, { "epoch": 0.17647058823529413, "grad_norm": 0.4664570689201355, "learning_rate": 4.993910125649561e-05, "loss": 0.6967, "step": 12 }, { "epoch": 0.22058823529411764, "grad_norm": 0.6955499649047852, "learning_rate": 4.962019382530521e-05, "loss": 0.5685, "step": 15 }, { "epoch": 0.2647058823529412, "grad_norm": 0.8682049512863159, "learning_rate": 4.9031542398457974e-05, "loss": 0.4343, "step": 18 }, { "epoch": 0.2647058823529412, "eval_loss": 0.3259250223636627, "eval_runtime": 17.3828, "eval_samples_per_second": 6.616, "eval_steps_per_second": 0.863, "step": 18 }, { "epoch": 0.3088235294117647, "grad_norm": 0.4803406298160553, "learning_rate": 4.817959636416969e-05, "loss": 0.3142, "step": 21 }, { "epoch": 0.35294117647058826, "grad_norm": 0.4086674749851227, "learning_rate": 4.707368982147318e-05, "loss": 0.2366, "step": 24 }, { "epoch": 0.39705882352941174, "grad_norm": 0.29409703612327576, "learning_rate": 4.572593931387604e-05, "loss": 0.2209, "step": 27 }, { "epoch": 0.39705882352941174, "eval_loss": 0.1954878568649292, "eval_runtime": 17.3758, "eval_samples_per_second": 6.618, "eval_steps_per_second": 0.863, "step": 27 }, { "epoch": 0.4411764705882353, "grad_norm": 0.2741072475910187, "learning_rate": 4.415111107797445e-05, "loss": 0.1942, "step": 30 }, { "epoch": 0.4852941176470588, "grad_norm": 0.22216534614562988, "learning_rate": 4.2366459261474933e-05, "loss": 0.1973, "step": 33 }, { "epoch": 0.5294117647058824, "grad_norm": 0.204613596200943, "learning_rate": 4.039153688314145e-05, "loss": 0.1892, "step": 36 }, { "epoch": 0.5294117647058824, "eval_loss": 0.167263001203537, "eval_runtime": 17.4022, "eval_samples_per_second": 6.608, "eval_steps_per_second": 0.862, "step": 36 }, { "epoch": 0.5735294117647058, "grad_norm": 0.19633035361766815, "learning_rate": 3.824798160583012e-05, "loss": 0.1902, "step": 39 }, { "epoch": 0.6176470588235294, "grad_norm": 0.19374072551727295, "learning_rate": 3.5959278669726935e-05, "loss": 0.1596, "step": 42 }, { "epoch": 0.6617647058823529, "grad_norm": 0.138106107711792, "learning_rate": 3.355050358314172e-05, "loss": 0.1574, "step": 45 }, { "epoch": 0.6617647058823529, "eval_loss": 0.1535487323999405, "eval_runtime": 17.4439, "eval_samples_per_second": 6.593, "eval_steps_per_second": 0.86, "step": 45 }, { "epoch": 0.7058823529411765, "grad_norm": 0.14004965126514435, "learning_rate": 3.104804738999169e-05, "loss": 0.1638, "step": 48 }, { "epoch": 0.75, "grad_norm": 0.13989396393299103, "learning_rate": 2.8479327524001636e-05, "loss": 0.1545, "step": 51 }, { "epoch": 0.7941176470588235, "grad_norm": 0.18269434571266174, "learning_rate": 2.587248741756253e-05, "loss": 0.1738, "step": 54 }, { "epoch": 0.7941176470588235, "eval_loss": 0.14321838319301605, "eval_runtime": 17.406, "eval_samples_per_second": 6.607, "eval_steps_per_second": 0.862, "step": 54 }, { "epoch": 0.8382352941176471, "grad_norm": 0.16118720173835754, "learning_rate": 2.3256088156396868e-05, "loss": 0.1492, "step": 57 }, { "epoch": 0.8823529411764706, "grad_norm": 0.14809435606002808, "learning_rate": 2.0658795558326743e-05, "loss": 0.1662, "step": 60 }, { "epoch": 0.9264705882352942, "grad_norm": 0.14554592967033386, "learning_rate": 1.8109066104575023e-05, "loss": 0.1641, "step": 63 }, { "epoch": 0.9264705882352942, "eval_loss": 0.13681022822856903, "eval_runtime": 17.3622, "eval_samples_per_second": 6.624, "eval_steps_per_second": 0.864, "step": 63 }, { "epoch": 0.9705882352941176, "grad_norm": 0.13518203794956207, "learning_rate": 1.56348351646022e-05, "loss": 0.1611, "step": 66 }, { "epoch": 1.0147058823529411, "grad_norm": 0.13351187109947205, "learning_rate": 1.3263210930352737e-05, "loss": 0.1525, "step": 69 }, { "epoch": 1.0588235294117647, "grad_norm": 0.1250705122947693, "learning_rate": 1.1020177413231334e-05, "loss": 0.1333, "step": 72 }, { "epoch": 1.0588235294117647, "eval_loss": 0.13294555246829987, "eval_runtime": 17.428, "eval_samples_per_second": 6.599, "eval_steps_per_second": 0.861, "step": 72 }, { "epoch": 1.1029411764705883, "grad_norm": 0.12626980245113373, "learning_rate": 8.930309757836517e-06, "loss": 0.1452, "step": 75 }, { "epoch": 1.1470588235294117, "grad_norm": 0.19401168823242188, "learning_rate": 7.016504991533726e-06, "loss": 0.1409, "step": 78 }, { "epoch": 1.1911764705882353, "grad_norm": 0.13765017688274384, "learning_rate": 5.299731159831953e-06, "loss": 0.1361, "step": 81 }, { "epoch": 1.1911764705882353, "eval_loss": 0.1305573731660843, "eval_runtime": 17.413, "eval_samples_per_second": 6.604, "eval_steps_per_second": 0.861, "step": 81 }, { "epoch": 1.2352941176470589, "grad_norm": 0.15578369796276093, "learning_rate": 3.798797596089351e-06, "loss": 0.125, "step": 84 }, { "epoch": 1.2794117647058822, "grad_norm": 0.14650852978229523, "learning_rate": 2.5301488425208296e-06, "loss": 0.1454, "step": 87 }, { "epoch": 1.3235294117647058, "grad_norm": 0.16093911230564117, "learning_rate": 1.5076844803522922e-06, "loss": 0.1642, "step": 90 }, { "epoch": 1.3235294117647058, "eval_loss": 0.1295652836561203, "eval_runtime": 17.4313, "eval_samples_per_second": 6.597, "eval_steps_per_second": 0.861, "step": 90 }, { "epoch": 1.3676470588235294, "grad_norm": 0.14731736481189728, "learning_rate": 7.426068431000882e-07, "loss": 0.1479, "step": 93 }, { "epoch": 1.4117647058823528, "grad_norm": 0.1577821522951126, "learning_rate": 2.4329828146074095e-07, "loss": 0.1197, "step": 96 }, { "epoch": 1.4558823529411764, "grad_norm": 0.15286415815353394, "learning_rate": 1.522932452260595e-08, "loss": 0.1588, "step": 99 }, { "epoch": 1.4558823529411764, "eval_loss": 0.12933845818042755, "eval_runtime": 17.4499, "eval_samples_per_second": 6.59, "eval_steps_per_second": 0.86, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.961057620510638e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }