{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.987551867219917,
"eval_steps": 500,
"global_step": 360,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.08298755186721991,
"grad_norm": 0.4138890994781238,
"learning_rate": 5e-06,
"loss": 0.6819,
"step": 10
},
{
"epoch": 0.16597510373443983,
"grad_norm": 0.29279799441362253,
"learning_rate": 5e-06,
"loss": 0.6152,
"step": 20
},
{
"epoch": 0.24896265560165975,
"grad_norm": 0.22155490022279595,
"learning_rate": 5e-06,
"loss": 0.5911,
"step": 30
},
{
"epoch": 0.33195020746887965,
"grad_norm": 0.21703792973144043,
"learning_rate": 5e-06,
"loss": 0.5758,
"step": 40
},
{
"epoch": 0.4149377593360996,
"grad_norm": 0.19782125949582666,
"learning_rate": 5e-06,
"loss": 0.5658,
"step": 50
},
{
"epoch": 0.4979253112033195,
"grad_norm": 0.20429620764864578,
"learning_rate": 5e-06,
"loss": 0.5621,
"step": 60
},
{
"epoch": 0.5809128630705395,
"grad_norm": 0.1947179711144,
"learning_rate": 5e-06,
"loss": 0.5481,
"step": 70
},
{
"epoch": 0.6639004149377593,
"grad_norm": 0.20952965041956714,
"learning_rate": 5e-06,
"loss": 0.5491,
"step": 80
},
{
"epoch": 0.7468879668049793,
"grad_norm": 0.2092027679734135,
"learning_rate": 5e-06,
"loss": 0.5473,
"step": 90
},
{
"epoch": 0.8298755186721992,
"grad_norm": 0.1973704614234666,
"learning_rate": 5e-06,
"loss": 0.5412,
"step": 100
},
{
"epoch": 0.9128630705394191,
"grad_norm": 0.22215520376065145,
"learning_rate": 5e-06,
"loss": 0.5361,
"step": 110
},
{
"epoch": 0.995850622406639,
"grad_norm": 0.20002555613598916,
"learning_rate": 5e-06,
"loss": 0.5354,
"step": 120
},
{
"epoch": 0.995850622406639,
"eval_loss": 0.5283368229866028,
"eval_runtime": 121.5237,
"eval_samples_per_second": 26.703,
"eval_steps_per_second": 0.42,
"step": 120
},
{
"epoch": 1.0788381742738589,
"grad_norm": 0.23151869944663353,
"learning_rate": 5e-06,
"loss": 0.5334,
"step": 130
},
{
"epoch": 1.161825726141079,
"grad_norm": 0.20416069004838694,
"learning_rate": 5e-06,
"loss": 0.5122,
"step": 140
},
{
"epoch": 1.2448132780082988,
"grad_norm": 0.21256654137396935,
"learning_rate": 5e-06,
"loss": 0.509,
"step": 150
},
{
"epoch": 1.3278008298755186,
"grad_norm": 0.21018667523519946,
"learning_rate": 5e-06,
"loss": 0.5041,
"step": 160
},
{
"epoch": 1.4107883817427385,
"grad_norm": 0.219240042940767,
"learning_rate": 5e-06,
"loss": 0.4998,
"step": 170
},
{
"epoch": 1.4937759336099585,
"grad_norm": 0.22681455392212077,
"learning_rate": 5e-06,
"loss": 0.5037,
"step": 180
},
{
"epoch": 1.5767634854771784,
"grad_norm": 0.227133839723048,
"learning_rate": 5e-06,
"loss": 0.4977,
"step": 190
},
{
"epoch": 1.6597510373443982,
"grad_norm": 0.21040711904959797,
"learning_rate": 5e-06,
"loss": 0.4941,
"step": 200
},
{
"epoch": 1.7427385892116183,
"grad_norm": 0.23482785666403702,
"learning_rate": 5e-06,
"loss": 0.4945,
"step": 210
},
{
"epoch": 1.8257261410788381,
"grad_norm": 0.2035179907011211,
"learning_rate": 5e-06,
"loss": 0.4904,
"step": 220
},
{
"epoch": 1.908713692946058,
"grad_norm": 0.21720290177963564,
"learning_rate": 5e-06,
"loss": 0.491,
"step": 230
},
{
"epoch": 1.991701244813278,
"grad_norm": 0.2214820393037949,
"learning_rate": 5e-06,
"loss": 0.4901,
"step": 240
},
{
"epoch": 2.0,
"eval_loss": 0.5045989155769348,
"eval_runtime": 122.3281,
"eval_samples_per_second": 26.527,
"eval_steps_per_second": 0.417,
"step": 241
},
{
"epoch": 2.074688796680498,
"grad_norm": 0.25601226331965665,
"learning_rate": 5e-06,
"loss": 0.4945,
"step": 250
},
{
"epoch": 2.1576763485477177,
"grad_norm": 0.2537099080076595,
"learning_rate": 5e-06,
"loss": 0.4617,
"step": 260
},
{
"epoch": 2.240663900414938,
"grad_norm": 0.2445352596834903,
"learning_rate": 5e-06,
"loss": 0.4648,
"step": 270
},
{
"epoch": 2.323651452282158,
"grad_norm": 0.24195048816699535,
"learning_rate": 5e-06,
"loss": 0.4688,
"step": 280
},
{
"epoch": 2.4066390041493775,
"grad_norm": 0.3297443855710949,
"learning_rate": 5e-06,
"loss": 0.46,
"step": 290
},
{
"epoch": 2.4896265560165975,
"grad_norm": 0.2227067008121754,
"learning_rate": 5e-06,
"loss": 0.4679,
"step": 300
},
{
"epoch": 2.572614107883817,
"grad_norm": 0.24268677689146825,
"learning_rate": 5e-06,
"loss": 0.4642,
"step": 310
},
{
"epoch": 2.6556016597510372,
"grad_norm": 0.24131530500929413,
"learning_rate": 5e-06,
"loss": 0.4597,
"step": 320
},
{
"epoch": 2.7385892116182573,
"grad_norm": 0.22997089130920098,
"learning_rate": 5e-06,
"loss": 0.4617,
"step": 330
},
{
"epoch": 2.821576763485477,
"grad_norm": 0.23994756278793414,
"learning_rate": 5e-06,
"loss": 0.4597,
"step": 340
},
{
"epoch": 2.904564315352697,
"grad_norm": 0.23257285232469585,
"learning_rate": 5e-06,
"loss": 0.4545,
"step": 350
},
{
"epoch": 2.987551867219917,
"grad_norm": 0.22525776234601527,
"learning_rate": 5e-06,
"loss": 0.4618,
"step": 360
},
{
"epoch": 2.987551867219917,
"eval_loss": 0.49428611993789673,
"eval_runtime": 121.9138,
"eval_samples_per_second": 26.617,
"eval_steps_per_second": 0.418,
"step": 360
},
{
"epoch": 2.987551867219917,
"step": 360,
"total_flos": 602804028702720.0,
"train_loss": 0.5124640332327949,
"train_runtime": 20041.158,
"train_samples_per_second": 9.227,
"train_steps_per_second": 0.018
}
],
"logging_steps": 10,
"max_steps": 360,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 602804028702720.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}