{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5, "grad_norm": 14.416919708251953, "learning_rate": 9.75e-05, "loss": 0.3808, "step": 10 }, { "epoch": 1.0, "grad_norm": 14.942909240722656, "learning_rate": 9.5e-05, "loss": 0.1636, "step": 20 }, { "epoch": 1.0, "eval_loss": 0.1972920000553131, "eval_mse": 0.1972920149564743, "eval_runtime": 1.8069, "eval_samples_per_second": 21.584, "eval_steps_per_second": 2.767, "step": 20 }, { "epoch": 1.5, "grad_norm": 2.4087517261505127, "learning_rate": 9.250000000000001e-05, "loss": 0.2302, "step": 30 }, { "epoch": 2.0, "grad_norm": 21.778547286987305, "learning_rate": 9e-05, "loss": 0.1331, "step": 40 }, { "epoch": 2.0, "eval_loss": 0.046439673751592636, "eval_mse": 0.046439677476882935, "eval_runtime": 1.622, "eval_samples_per_second": 24.044, "eval_steps_per_second": 3.083, "step": 40 }, { "epoch": 2.5, "grad_norm": 1.1269134283065796, "learning_rate": 8.75e-05, "loss": 0.05, "step": 50 }, { "epoch": 3.0, "grad_norm": 4.305675983428955, "learning_rate": 8.5e-05, "loss": 0.0289, "step": 60 }, { "epoch": 3.0, "eval_loss": 0.03575298935174942, "eval_mse": 0.03575298190116882, "eval_runtime": 1.6208, "eval_samples_per_second": 24.063, "eval_steps_per_second": 3.085, "step": 60 }, { "epoch": 3.5, "grad_norm": 1.7014061212539673, "learning_rate": 8.25e-05, "loss": 0.0246, "step": 70 }, { "epoch": 4.0, "grad_norm": 0.39174097776412964, "learning_rate": 8e-05, "loss": 0.0221, "step": 80 }, { "epoch": 4.0, "eval_loss": 0.03326353803277016, "eval_mse": 0.03326353803277016, "eval_runtime": 1.6114, "eval_samples_per_second": 24.203, "eval_steps_per_second": 3.103, "step": 80 }, { "epoch": 4.5, "grad_norm": 2.6145267486572266, "learning_rate": 7.75e-05, "loss": 0.021, "step": 90 }, { "epoch": 5.0, "grad_norm": 13.847771644592285, "learning_rate": 7.500000000000001e-05, "loss": 0.0223, "step": 100 }, { "epoch": 5.0, "eval_loss": 0.03398064896464348, "eval_mse": 0.033980656415224075, "eval_runtime": 1.6515, "eval_samples_per_second": 23.615, "eval_steps_per_second": 3.028, "step": 100 }, { "epoch": 5.5, "grad_norm": 0.43660324811935425, "learning_rate": 7.25e-05, "loss": 0.0131, "step": 110 }, { "epoch": 6.0, "grad_norm": 2.1774399280548096, "learning_rate": 7e-05, "loss": 0.0117, "step": 120 }, { "epoch": 6.0, "eval_loss": 0.04457540437579155, "eval_mse": 0.04457540065050125, "eval_runtime": 1.6081, "eval_samples_per_second": 24.252, "eval_steps_per_second": 3.109, "step": 120 }, { "epoch": 6.5, "grad_norm": 2.61739182472229, "learning_rate": 6.750000000000001e-05, "loss": 0.0168, "step": 130 }, { "epoch": 7.0, "grad_norm": 2.7107529640197754, "learning_rate": 6.500000000000001e-05, "loss": 0.0107, "step": 140 }, { "epoch": 7.0, "eval_loss": 0.03702976927161217, "eval_mse": 0.037029776722192764, "eval_runtime": 1.6027, "eval_samples_per_second": 24.334, "eval_steps_per_second": 3.12, "step": 140 }, { "epoch": 7.5, "grad_norm": 4.807140350341797, "learning_rate": 6.25e-05, "loss": 0.0112, "step": 150 }, { "epoch": 8.0, "grad_norm": 1.6699814796447754, "learning_rate": 6e-05, "loss": 0.0096, "step": 160 }, { "epoch": 8.0, "eval_loss": 0.03073795698583126, "eval_mse": 0.030737943947315216, "eval_runtime": 1.678, "eval_samples_per_second": 23.242, "eval_steps_per_second": 2.98, "step": 160 }, { "epoch": 8.5, "grad_norm": 5.444133281707764, "learning_rate": 5.7499999999999995e-05, "loss": 0.0099, "step": 170 }, { "epoch": 9.0, "grad_norm": 1.5312561988830566, "learning_rate": 5.500000000000001e-05, "loss": 0.0142, "step": 180 }, { "epoch": 9.0, "eval_loss": 0.03504549711942673, "eval_mse": 0.03504551202058792, "eval_runtime": 1.6103, "eval_samples_per_second": 24.218, "eval_steps_per_second": 3.105, "step": 180 }, { "epoch": 9.5, "grad_norm": 1.527550220489502, "learning_rate": 5.25e-05, "loss": 0.0051, "step": 190 }, { "epoch": 10.0, "grad_norm": 1.0232219696044922, "learning_rate": 5e-05, "loss": 0.0069, "step": 200 }, { "epoch": 10.0, "eval_loss": 0.032399099320173264, "eval_mse": 0.03239908814430237, "eval_runtime": 1.61, "eval_samples_per_second": 24.224, "eval_steps_per_second": 3.106, "step": 200 }, { "epoch": 10.5, "grad_norm": 1.1013288497924805, "learning_rate": 4.75e-05, "loss": 0.0034, "step": 210 }, { "epoch": 11.0, "grad_norm": 0.35051777958869934, "learning_rate": 4.5e-05, "loss": 0.0028, "step": 220 }, { "epoch": 11.0, "eval_loss": 0.02933628484606743, "eval_mse": 0.029336294159293175, "eval_runtime": 1.7012, "eval_samples_per_second": 22.925, "eval_steps_per_second": 2.939, "step": 220 }, { "epoch": 11.5, "grad_norm": 1.1170843839645386, "learning_rate": 4.25e-05, "loss": 0.0019, "step": 230 }, { "epoch": 12.0, "grad_norm": 1.3299288749694824, "learning_rate": 4e-05, "loss": 0.0044, "step": 240 }, { "epoch": 12.0, "eval_loss": 0.028278259560465813, "eval_mse": 0.028278270736336708, "eval_runtime": 1.5914, "eval_samples_per_second": 24.506, "eval_steps_per_second": 3.142, "step": 240 }, { "epoch": 12.5, "grad_norm": 1.6604584455490112, "learning_rate": 3.7500000000000003e-05, "loss": 0.002, "step": 250 }, { "epoch": 13.0, "grad_norm": 1.2441127300262451, "learning_rate": 3.5e-05, "loss": 0.0011, "step": 260 }, { "epoch": 13.0, "eval_loss": 0.029920559376478195, "eval_mse": 0.029920564964413643, "eval_runtime": 1.6282, "eval_samples_per_second": 23.953, "eval_steps_per_second": 3.071, "step": 260 }, { "epoch": 13.5, "grad_norm": 0.7714802026748657, "learning_rate": 3.2500000000000004e-05, "loss": 0.0008, "step": 270 }, { "epoch": 14.0, "grad_norm": 0.5498138070106506, "learning_rate": 3e-05, "loss": 0.0005, "step": 280 }, { "epoch": 14.0, "eval_loss": 0.027942122891545296, "eval_mse": 0.027942117303609848, "eval_runtime": 1.5994, "eval_samples_per_second": 24.384, "eval_steps_per_second": 3.126, "step": 280 }, { "epoch": 14.5, "grad_norm": 0.5462870001792908, "learning_rate": 2.7500000000000004e-05, "loss": 0.0006, "step": 290 }, { "epoch": 15.0, "grad_norm": 0.32672354578971863, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 300 }, { "epoch": 15.0, "eval_loss": 0.029117202386260033, "eval_mse": 0.02911720797419548, "eval_runtime": 1.595, "eval_samples_per_second": 24.451, "eval_steps_per_second": 3.135, "step": 300 }, { "epoch": 15.5, "grad_norm": 0.7088171243667603, "learning_rate": 2.25e-05, "loss": 0.0012, "step": 310 }, { "epoch": 16.0, "grad_norm": 0.3224898874759674, "learning_rate": 2e-05, "loss": 0.0011, "step": 320 }, { "epoch": 16.0, "eval_loss": 0.028802577406167984, "eval_mse": 0.028802569955587387, "eval_runtime": 1.6242, "eval_samples_per_second": 24.012, "eval_steps_per_second": 3.078, "step": 320 }, { "epoch": 16.5, "grad_norm": 0.2536928951740265, "learning_rate": 1.75e-05, "loss": 0.0002, "step": 330 }, { "epoch": 17.0, "grad_norm": 0.2693057060241699, "learning_rate": 1.5e-05, "loss": 0.0003, "step": 340 }, { "epoch": 17.0, "eval_loss": 0.028974896296858788, "eval_mse": 0.02897489443421364, "eval_runtime": 1.5937, "eval_samples_per_second": 24.472, "eval_steps_per_second": 3.137, "step": 340 }, { "epoch": 17.5, "grad_norm": 0.22231905162334442, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 350 }, { "epoch": 18.0, "grad_norm": 0.44173210859298706, "learning_rate": 1e-05, "loss": 0.0001, "step": 360 }, { "epoch": 18.0, "eval_loss": 0.029911378398537636, "eval_mse": 0.029911383986473083, "eval_runtime": 1.5913, "eval_samples_per_second": 24.509, "eval_steps_per_second": 3.142, "step": 360 }, { "epoch": 18.5, "grad_norm": 0.2958744764328003, "learning_rate": 7.5e-06, "loss": 0.0001, "step": 370 }, { "epoch": 19.0, "grad_norm": 0.41316938400268555, "learning_rate": 5e-06, "loss": 0.0001, "step": 380 }, { "epoch": 19.0, "eval_loss": 0.029724078252911568, "eval_mse": 0.029724083840847015, "eval_runtime": 1.5907, "eval_samples_per_second": 24.518, "eval_steps_per_second": 3.143, "step": 380 }, { "epoch": 19.5, "grad_norm": 0.0391409695148468, "learning_rate": 2.5e-06, "loss": 0.0, "step": 390 }, { "epoch": 20.0, "grad_norm": 0.021498844027519226, "learning_rate": 0.0, "loss": 0.0, "step": 400 } ], "logging_steps": 10, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }