{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.6, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.16, "grad_norm": 2.2721662521362305, "learning_rate": 4.999643406399275e-05, "loss": 4.0376, "num_input_tokens_seen": 6208, "step": 5 }, { "epoch": 0.32, "grad_norm": 2.1794309616088867, "learning_rate": 4.998573727324295e-05, "loss": 3.9633, "num_input_tokens_seen": 11968, "step": 10 }, { "epoch": 0.48, "grad_norm": 1.3466308116912842, "learning_rate": 4.9967912679276316e-05, "loss": 3.7175, "num_input_tokens_seen": 18016, "step": 15 }, { "epoch": 0.64, "grad_norm": 1.3052715063095093, "learning_rate": 4.994296536700177e-05, "loss": 3.6465, "num_input_tokens_seen": 23760, "step": 20 }, { "epoch": 0.8, "grad_norm": 1.289433479309082, "learning_rate": 4.9910902453260824e-05, "loss": 3.6384, "num_input_tokens_seen": 29648, "step": 25 }, { "epoch": 0.96, "grad_norm": 1.0926741361618042, "learning_rate": 4.987173308479738e-05, "loss": 3.5703, "num_input_tokens_seen": 35968, "step": 30 }, { "epoch": 1.12, "grad_norm": 1.3285070657730103, "learning_rate": 4.982546843564834e-05, "loss": 3.5096, "num_input_tokens_seen": 42112, "step": 35 }, { "epoch": 1.28, "grad_norm": 1.691308856010437, "learning_rate": 4.977212170395598e-05, "loss": 3.2806, "num_input_tokens_seen": 48352, "step": 40 }, { "epoch": 1.44, "grad_norm": 1.3265893459320068, "learning_rate": 4.971170810820279e-05, "loss": 3.4431, "num_input_tokens_seen": 54144, "step": 45 }, { "epoch": 1.6, "grad_norm": 1.2742599248886108, "learning_rate": 4.964424488287009e-05, "loss": 3.2597, "num_input_tokens_seen": 60224, "step": 50 }, { "epoch": 1.76, "grad_norm": 1.2702339887619019, "learning_rate": 4.9569751273521454e-05, "loss": 3.275, "num_input_tokens_seen": 66512, "step": 55 }, { "epoch": 1.92, "grad_norm": 1.6329398155212402, "learning_rate": 4.948824853131236e-05, "loss": 3.1563, "num_input_tokens_seen": 72208, "step": 60 }, { "epoch": 2.08, "grad_norm": 1.348823070526123, "learning_rate": 4.939975990692789e-05, "loss": 3.1883, "num_input_tokens_seen": 78368, "step": 65 }, { "epoch": 2.24, "grad_norm": 1.441171646118164, "learning_rate": 4.930431064394977e-05, "loss": 3.2454, "num_input_tokens_seen": 84288, "step": 70 }, { "epoch": 2.4, "grad_norm": 1.3091288805007935, "learning_rate": 4.920192797165511e-05, "loss": 3.1905, "num_input_tokens_seen": 90464, "step": 75 }, { "epoch": 2.56, "grad_norm": 1.6830319166183472, "learning_rate": 4.909264109724853e-05, "loss": 3.0087, "num_input_tokens_seen": 96704, "step": 80 }, { "epoch": 2.7199999999999998, "grad_norm": 1.941502332687378, "learning_rate": 4.897648119753006e-05, "loss": 3.0016, "num_input_tokens_seen": 102352, "step": 85 }, { "epoch": 2.88, "grad_norm": 1.8244129419326782, "learning_rate": 4.885348141000122e-05, "loss": 3.1813, "num_input_tokens_seen": 108112, "step": 90 }, { "epoch": 3.04, "grad_norm": 1.6675082445144653, "learning_rate": 4.872367682341173e-05, "loss": 3.1158, "num_input_tokens_seen": 114240, "step": 95 }, { "epoch": 3.2, "grad_norm": 1.8476835489273071, "learning_rate": 4.858710446774951e-05, "loss": 2.9404, "num_input_tokens_seen": 119936, "step": 100 }, { "epoch": 3.36, "grad_norm": 1.8608956336975098, "learning_rate": 4.844380330367701e-05, "loss": 2.9749, "num_input_tokens_seen": 125984, "step": 105 }, { "epoch": 3.52, "grad_norm": 1.9425894021987915, "learning_rate": 4.829381421141671e-05, "loss": 2.9456, "num_input_tokens_seen": 131808, "step": 110 }, { "epoch": 3.68, "grad_norm": 2.114993095397949, "learning_rate": 4.8137179979088995e-05, "loss": 2.8505, "num_input_tokens_seen": 137792, "step": 115 }, { "epoch": 3.84, "grad_norm": 1.7963783740997314, "learning_rate": 4.7973945290505766e-05, "loss": 3.0044, "num_input_tokens_seen": 144336, "step": 120 }, { "epoch": 4.0, "grad_norm": 1.7597969770431519, "learning_rate": 4.780415671242334e-05, "loss": 3.0709, "num_input_tokens_seen": 150336, "step": 125 }, { "epoch": 4.16, "grad_norm": 1.8621456623077393, "learning_rate": 4.7627862681258037e-05, "loss": 2.912, "num_input_tokens_seen": 156768, "step": 130 }, { "epoch": 4.32, "grad_norm": 2.021226167678833, "learning_rate": 4.7445113489268544e-05, "loss": 2.8063, "num_input_tokens_seen": 163232, "step": 135 }, { "epoch": 4.48, "grad_norm": 2.2448067665100098, "learning_rate": 4.725596127020879e-05, "loss": 2.7714, "num_input_tokens_seen": 169616, "step": 140 }, { "epoch": 4.64, "grad_norm": 5.535823822021484, "learning_rate": 4.706045998445548e-05, "loss": 2.8277, "num_input_tokens_seen": 175664, "step": 145 }, { "epoch": 4.8, "grad_norm": 2.4096429347991943, "learning_rate": 4.685866540361456e-05, "loss": 2.7456, "num_input_tokens_seen": 181232, "step": 150 }, { "epoch": 4.96, "grad_norm": 2.699846029281616, "learning_rate": 4.665063509461097e-05, "loss": 2.7187, "num_input_tokens_seen": 186960, "step": 155 }, { "epoch": 5.12, "grad_norm": 2.661548376083374, "learning_rate": 4.643642840326627e-05, "loss": 2.7918, "num_input_tokens_seen": 192640, "step": 160 }, { "epoch": 5.28, "grad_norm": 2.395092725753784, "learning_rate": 4.621610643736878e-05, "loss": 2.6223, "num_input_tokens_seen": 198672, "step": 165 }, { "epoch": 5.44, "grad_norm": 2.329503059387207, "learning_rate": 4.598973204924097e-05, "loss": 2.654, "num_input_tokens_seen": 204976, "step": 170 }, { "epoch": 5.6, "grad_norm": 2.3466107845306396, "learning_rate": 4.5757369817809415e-05, "loss": 2.7321, "num_input_tokens_seen": 211168, "step": 175 }, { "epoch": 5.76, "grad_norm": 2.9179327487945557, "learning_rate": 4.551908603018191e-05, "loss": 2.71, "num_input_tokens_seen": 217072, "step": 180 }, { "epoch": 5.92, "grad_norm": 2.7011966705322266, "learning_rate": 4.527494866273753e-05, "loss": 2.7663, "num_input_tokens_seen": 223136, "step": 185 }, { "epoch": 6.08, "grad_norm": 2.610811948776245, "learning_rate": 4.502502736173462e-05, "loss": 2.4595, "num_input_tokens_seen": 229088, "step": 190 }, { "epoch": 6.24, "grad_norm": 2.6016838550567627, "learning_rate": 4.476939342344246e-05, "loss": 2.5734, "num_input_tokens_seen": 235184, "step": 195 }, { "epoch": 6.4, "grad_norm": 3.063948392868042, "learning_rate": 4.45081197738023e-05, "loss": 2.3804, "num_input_tokens_seen": 241152, "step": 200 }, { "epoch": 6.5600000000000005, "grad_norm": 3.9621613025665283, "learning_rate": 4.424128094762331e-05, "loss": 2.4395, "num_input_tokens_seen": 247136, "step": 205 }, { "epoch": 6.72, "grad_norm": 2.887089967727661, "learning_rate": 4.3968953067319777e-05, "loss": 2.3477, "num_input_tokens_seen": 253232, "step": 210 }, { "epoch": 6.88, "grad_norm": 2.8491384983062744, "learning_rate": 4.369121382119523e-05, "loss": 2.5377, "num_input_tokens_seen": 258992, "step": 215 }, { "epoch": 7.04, "grad_norm": 3.3222053050994873, "learning_rate": 4.340814244127993e-05, "loss": 2.5576, "num_input_tokens_seen": 265392, "step": 220 }, { "epoch": 7.2, "grad_norm": 3.4029548168182373, "learning_rate": 4.3119819680728e-05, "loss": 2.2721, "num_input_tokens_seen": 271344, "step": 225 }, { "epoch": 7.36, "grad_norm": 3.4295144081115723, "learning_rate": 4.282632779078051e-05, "loss": 2.4025, "num_input_tokens_seen": 277120, "step": 230 }, { "epoch": 7.52, "grad_norm": 3.2438597679138184, "learning_rate": 4.2527750497301323e-05, "loss": 2.2175, "num_input_tokens_seen": 283152, "step": 235 }, { "epoch": 7.68, "grad_norm": 3.3982954025268555, "learning_rate": 4.222417297689217e-05, "loss": 2.1803, "num_input_tokens_seen": 289136, "step": 240 }, { "epoch": 7.84, "grad_norm": 2.70150089263916, "learning_rate": 4.191568183259394e-05, "loss": 2.4197, "num_input_tokens_seen": 295968, "step": 245 }, { "epoch": 8.0, "grad_norm": 3.426187515258789, "learning_rate": 4.160236506918098e-05, "loss": 2.2351, "num_input_tokens_seen": 301472, "step": 250 }, { "epoch": 8.16, "grad_norm": 4.023019313812256, "learning_rate": 4.128431206805557e-05, "loss": 2.1748, "num_input_tokens_seen": 307216, "step": 255 }, { "epoch": 8.32, "grad_norm": 3.1614391803741455, "learning_rate": 4.096161356174959e-05, "loss": 2.1173, "num_input_tokens_seen": 313424, "step": 260 }, { "epoch": 8.48, "grad_norm": 3.4006361961364746, "learning_rate": 4.063436160804092e-05, "loss": 2.0966, "num_input_tokens_seen": 319472, "step": 265 }, { "epoch": 8.64, "grad_norm": 4.167815208435059, "learning_rate": 4.030264956369157e-05, "loss": 2.0825, "num_input_tokens_seen": 325504, "step": 270 }, { "epoch": 8.8, "grad_norm": 4.040388584136963, "learning_rate": 3.9966572057815373e-05, "loss": 2.0189, "num_input_tokens_seen": 331520, "step": 275 }, { "epoch": 8.96, "grad_norm": 5.056305885314941, "learning_rate": 3.962622496488269e-05, "loss": 2.0671, "num_input_tokens_seen": 337680, "step": 280 }, { "epoch": 9.12, "grad_norm": 4.466280937194824, "learning_rate": 3.928170537736981e-05, "loss": 2.0337, "num_input_tokens_seen": 343936, "step": 285 }, { "epoch": 9.28, "grad_norm": 4.8046698570251465, "learning_rate": 3.893311157806091e-05, "loss": 1.7892, "num_input_tokens_seen": 349632, "step": 290 }, { "epoch": 9.44, "grad_norm": 5.066511631011963, "learning_rate": 3.858054301201047e-05, "loss": 1.8558, "num_input_tokens_seen": 356080, "step": 295 }, { "epoch": 9.6, "grad_norm": 4.678648948669434, "learning_rate": 3.822410025817406e-05, "loss": 1.9685, "num_input_tokens_seen": 362048, "step": 300 } ], "logging_steps": 5, "max_steps": 930, "num_input_tokens_seen": 362048, "num_train_epochs": 30, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2867893962080256.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }