{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 6120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016339869281045753, "grad_norm": 0.05770206078886986, "learning_rate": 4.085801838610828e-07, "loss": 2.6907, "step": 100 }, { "epoch": 0.032679738562091505, "grad_norm": 0.05021829903125763, "learning_rate": 8.171603677221656e-07, "loss": 2.6813, "step": 200 }, { "epoch": 0.049019607843137254, "grad_norm": 0.06600035727024078, "learning_rate": 1.2257405515832485e-06, "loss": 2.699, "step": 300 }, { "epoch": 0.06535947712418301, "grad_norm": 0.07294648885726929, "learning_rate": 1.6343207354443311e-06, "loss": 2.6897, "step": 400 }, { "epoch": 0.08169934640522876, "grad_norm": 0.08237992972135544, "learning_rate": 2.042900919305414e-06, "loss": 2.6484, "step": 500 }, { "epoch": 0.09803921568627451, "grad_norm": 0.0935598686337471, "learning_rate": 2.451481103166497e-06, "loss": 2.6607, "step": 600 }, { "epoch": 0.11437908496732026, "grad_norm": 0.10522934049367905, "learning_rate": 2.8600612870275796e-06, "loss": 2.6464, "step": 700 }, { "epoch": 0.13071895424836602, "grad_norm": 0.12197130918502808, "learning_rate": 3.2686414708886623e-06, "loss": 2.6393, "step": 800 }, { "epoch": 0.14705882352941177, "grad_norm": 0.12897726893424988, "learning_rate": 3.6772216547497446e-06, "loss": 2.6262, "step": 900 }, { "epoch": 0.16339869281045752, "grad_norm": 0.1309213489294052, "learning_rate": 4.085801838610828e-06, "loss": 2.617, "step": 1000 }, { "epoch": 0.17973856209150327, "grad_norm": 0.1498122215270996, "learning_rate": 4.494382022471911e-06, "loss": 2.6183, "step": 1100 }, { "epoch": 0.19607843137254902, "grad_norm": 0.1503172665834427, "learning_rate": 4.902962206332994e-06, "loss": 2.6103, "step": 1200 }, { "epoch": 0.21241830065359477, "grad_norm": 0.16900745034217834, "learning_rate": 5.311542390194075e-06, "loss": 2.5884, "step": 1300 }, { "epoch": 0.22875816993464052, "grad_norm": 0.16944456100463867, "learning_rate": 5.720122574055159e-06, "loss": 2.5994, "step": 1400 }, { "epoch": 0.24509803921568626, "grad_norm": 0.18548929691314697, "learning_rate": 6.1287027579162415e-06, "loss": 2.5932, "step": 1500 }, { "epoch": 0.26143790849673204, "grad_norm": 0.2077314257621765, "learning_rate": 6.537282941777325e-06, "loss": 2.5718, "step": 1600 }, { "epoch": 0.2777777777777778, "grad_norm": 0.2167440503835678, "learning_rate": 6.945863125638407e-06, "loss": 2.5758, "step": 1700 }, { "epoch": 0.29411764705882354, "grad_norm": 0.21927382051944733, "learning_rate": 7.354443309499489e-06, "loss": 2.5607, "step": 1800 }, { "epoch": 0.3104575163398693, "grad_norm": 0.21177807450294495, "learning_rate": 7.763023493360572e-06, "loss": 2.5597, "step": 1900 }, { "epoch": 0.32679738562091504, "grad_norm": 0.2335982471704483, "learning_rate": 8.171603677221655e-06, "loss": 2.5518, "step": 2000 }, { "epoch": 0.3431372549019608, "grad_norm": 0.23431697487831116, "learning_rate": 8.580183861082738e-06, "loss": 2.5276, "step": 2100 }, { "epoch": 0.35947712418300654, "grad_norm": 0.26647108793258667, "learning_rate": 8.988764044943822e-06, "loss": 2.5402, "step": 2200 }, { "epoch": 0.3758169934640523, "grad_norm": 0.27895891666412354, "learning_rate": 9.397344228804903e-06, "loss": 2.5168, "step": 2300 }, { "epoch": 0.39215686274509803, "grad_norm": 0.2913816273212433, "learning_rate": 9.805924412665988e-06, "loss": 2.5188, "step": 2400 }, { "epoch": 0.4084967320261438, "grad_norm": 0.26464876532554626, "learning_rate": 1.0214504596527069e-05, "loss": 2.509, "step": 2500 }, { "epoch": 0.42483660130718953, "grad_norm": 0.3019787669181824, "learning_rate": 1.062308478038815e-05, "loss": 2.5055, "step": 2600 }, { "epoch": 0.4411764705882353, "grad_norm": 0.3169211447238922, "learning_rate": 1.1031664964249235e-05, "loss": 2.498, "step": 2700 }, { "epoch": 0.45751633986928103, "grad_norm": 0.2861559987068176, "learning_rate": 1.1440245148110318e-05, "loss": 2.4814, "step": 2800 }, { "epoch": 0.4738562091503268, "grad_norm": 0.30439117550849915, "learning_rate": 1.1848825331971402e-05, "loss": 2.4866, "step": 2900 }, { "epoch": 0.49019607843137253, "grad_norm": 0.3125614821910858, "learning_rate": 1.2257405515832483e-05, "loss": 2.4667, "step": 3000 }, { "epoch": 0.5065359477124183, "grad_norm": 0.33123213052749634, "learning_rate": 1.2665985699693566e-05, "loss": 2.4812, "step": 3100 }, { "epoch": 0.5228758169934641, "grad_norm": 0.37793877720832825, "learning_rate": 1.307456588355465e-05, "loss": 2.4739, "step": 3200 }, { "epoch": 0.5392156862745098, "grad_norm": 0.39265045523643494, "learning_rate": 1.348314606741573e-05, "loss": 2.4524, "step": 3300 }, { "epoch": 0.5555555555555556, "grad_norm": 0.32753610610961914, "learning_rate": 1.3891726251276814e-05, "loss": 2.4668, "step": 3400 }, { "epoch": 0.5718954248366013, "grad_norm": 0.34903815388679504, "learning_rate": 1.4300306435137897e-05, "loss": 2.4633, "step": 3500 }, { "epoch": 0.5882352941176471, "grad_norm": 0.41583487391471863, "learning_rate": 1.4708886618998978e-05, "loss": 2.4481, "step": 3600 }, { "epoch": 0.6045751633986928, "grad_norm": 0.36139875650405884, "learning_rate": 1.5117466802860063e-05, "loss": 2.4422, "step": 3700 }, { "epoch": 0.6209150326797386, "grad_norm": 0.39948606491088867, "learning_rate": 1.5526046986721144e-05, "loss": 2.443, "step": 3800 }, { "epoch": 0.6372549019607843, "grad_norm": 0.393020898103714, "learning_rate": 1.593462717058223e-05, "loss": 2.4375, "step": 3900 }, { "epoch": 0.6535947712418301, "grad_norm": 0.39577871561050415, "learning_rate": 1.634320735444331e-05, "loss": 2.4391, "step": 4000 }, { "epoch": 0.6699346405228758, "grad_norm": 0.38020357489585876, "learning_rate": 1.6751787538304395e-05, "loss": 2.4203, "step": 4100 }, { "epoch": 0.6862745098039216, "grad_norm": 0.38298335671424866, "learning_rate": 1.7160367722165477e-05, "loss": 2.4129, "step": 4200 }, { "epoch": 0.7026143790849673, "grad_norm": 0.4053588807582855, "learning_rate": 1.7568947906026558e-05, "loss": 2.4055, "step": 4300 }, { "epoch": 0.7189542483660131, "grad_norm": 0.40465113520622253, "learning_rate": 1.7977528089887643e-05, "loss": 2.3975, "step": 4400 }, { "epoch": 0.7352941176470589, "grad_norm": 0.4370296895503998, "learning_rate": 1.8386108273748724e-05, "loss": 2.3822, "step": 4500 }, { "epoch": 0.7516339869281046, "grad_norm": 0.44034600257873535, "learning_rate": 1.8794688457609806e-05, "loss": 2.4174, "step": 4600 }, { "epoch": 0.7679738562091504, "grad_norm": 0.45608291029930115, "learning_rate": 1.920326864147089e-05, "loss": 2.4016, "step": 4700 }, { "epoch": 0.7843137254901961, "grad_norm": 0.4033947288990021, "learning_rate": 1.9611848825331975e-05, "loss": 2.3975, "step": 4800 }, { "epoch": 0.8006535947712419, "grad_norm": 0.4388486444950104, "learning_rate": 1.9999177886783194e-05, "loss": 2.4068, "step": 4900 }, { "epoch": 0.8169934640522876, "grad_norm": 0.4106648564338684, "learning_rate": 1.9639628606958535e-05, "loss": 2.3926, "step": 5000 }, { "epoch": 0.8333333333333334, "grad_norm": 0.41734156012535095, "learning_rate": 1.8649548579446938e-05, "loss": 2.3915, "step": 5100 }, { "epoch": 0.8496732026143791, "grad_norm": 0.41231608390808105, "learning_rate": 1.709369921005258e-05, "loss": 2.3906, "step": 5200 }, { "epoch": 0.8660130718954249, "grad_norm": 0.4466901421546936, "learning_rate": 1.5073849032208823e-05, "loss": 2.383, "step": 5300 }, { "epoch": 0.8823529411764706, "grad_norm": 0.44348859786987305, "learning_rate": 1.2722116999329712e-05, "loss": 2.369, "step": 5400 }, { "epoch": 0.8986928104575164, "grad_norm": 0.4730405807495117, "learning_rate": 1.0192330547876871e-05, "loss": 2.3652, "step": 5500 }, { "epoch": 0.9150326797385621, "grad_norm": 0.48740801215171814, "learning_rate": 7.649963702603848e-06, "loss": 2.3792, "step": 5600 }, { "epoch": 0.9313725490196079, "grad_norm": 0.39891138672828674, "learning_rate": 5.2613133752700145e-06, "loss": 2.3746, "step": 5700 }, { "epoch": 0.9477124183006536, "grad_norm": 0.45519766211509705, "learning_rate": 3.1826218380900066e-06, "loss": 2.3756, "step": 5800 }, { "epoch": 0.9640522875816994, "grad_norm": 0.4656579792499542, "learning_rate": 1.5498568739099907e-06, "loss": 2.3776, "step": 5900 }, { "epoch": 0.9803921568627451, "grad_norm": 0.4382289946079254, "learning_rate": 4.6981808628712823e-07, "loss": 2.3749, "step": 6000 }, { "epoch": 0.9967320261437909, "grad_norm": 0.49260008335113525, "learning_rate": 1.3151108166156168e-08, "loss": 2.3821, "step": 6100 }, { "epoch": 1.0, "step": 6120, "total_flos": 8.917940750474281e+17, "train_loss": 2.4952426230985356, "train_runtime": 1729.9843, "train_samples_per_second": 56.6, "train_steps_per_second": 3.538 } ], "logging_steps": 100, "max_steps": 6120, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.917940750474281e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }