{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 550,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.09090909090909091,
      "grad_norm": 2.9437830448150635,
      "learning_rate": 9.818181818181818e-05,
      "loss": 0.2953,
      "step": 10
    },
    {
      "epoch": 0.18181818181818182,
      "grad_norm": 5.8472676277160645,
      "learning_rate": 9.636363636363637e-05,
      "loss": 0.349,
      "step": 20
    },
    {
      "epoch": 0.2727272727272727,
      "grad_norm": 2.7091057300567627,
      "learning_rate": 9.454545454545455e-05,
      "loss": 0.2583,
      "step": 30
    },
    {
      "epoch": 0.36363636363636365,
      "grad_norm": 3.0798821449279785,
      "learning_rate": 9.272727272727273e-05,
      "loss": 0.2806,
      "step": 40
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 0.9187162518501282,
      "learning_rate": 9.090909090909092e-05,
      "loss": 0.3229,
      "step": 50
    },
    {
      "epoch": 0.5454545454545454,
      "grad_norm": 1.0290968418121338,
      "learning_rate": 8.90909090909091e-05,
      "loss": 0.271,
      "step": 60
    },
    {
      "epoch": 0.6363636363636364,
      "grad_norm": 1.4106591939926147,
      "learning_rate": 8.727272727272727e-05,
      "loss": 0.3005,
      "step": 70
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 2.5027103424072266,
      "learning_rate": 8.545454545454545e-05,
      "loss": 0.3279,
      "step": 80
    },
    {
      "epoch": 0.8181818181818182,
      "grad_norm": 1.4250562191009521,
      "learning_rate": 8.363636363636364e-05,
      "loss": 0.261,
      "step": 90
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 2.6990318298339844,
      "learning_rate": 8.181818181818183e-05,
      "loss": 0.3056,
      "step": 100
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.4321504831314087,
      "learning_rate": 8e-05,
      "loss": 0.3135,
      "step": 110
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.26422303915023804,
      "eval_mse": 0.26422300934791565,
      "eval_runtime": 146.8663,
      "eval_samples_per_second": 5.992,
      "eval_steps_per_second": 0.191,
      "step": 110
    },
    {
      "epoch": 1.0909090909090908,
      "grad_norm": 1.3414746522903442,
      "learning_rate": 7.818181818181818e-05,
      "loss": 0.2876,
      "step": 120
    },
    {
      "epoch": 1.1818181818181819,
      "grad_norm": 2.420074939727783,
      "learning_rate": 7.636363636363637e-05,
      "loss": 0.2263,
      "step": 130
    },
    {
      "epoch": 1.2727272727272727,
      "grad_norm": 1.7859795093536377,
      "learning_rate": 7.454545454545455e-05,
      "loss": 0.2656,
      "step": 140
    },
    {
      "epoch": 1.3636363636363638,
      "grad_norm": 1.711840033531189,
      "learning_rate": 7.272727272727273e-05,
      "loss": 0.3181,
      "step": 150
    },
    {
      "epoch": 1.4545454545454546,
      "grad_norm": 1.9030543565750122,
      "learning_rate": 7.090909090909092e-05,
      "loss": 0.2948,
      "step": 160
    },
    {
      "epoch": 1.5454545454545454,
      "grad_norm": 4.079201698303223,
      "learning_rate": 6.90909090909091e-05,
      "loss": 0.3092,
      "step": 170
    },
    {
      "epoch": 1.6363636363636362,
      "grad_norm": 1.5639630556106567,
      "learning_rate": 6.727272727272727e-05,
      "loss": 0.3237,
      "step": 180
    },
    {
      "epoch": 1.7272727272727273,
      "grad_norm": 1.0921226739883423,
      "learning_rate": 6.545454545454546e-05,
      "loss": 0.2761,
      "step": 190
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 1.2289607524871826,
      "learning_rate": 6.363636363636364e-05,
      "loss": 0.2839,
      "step": 200
    },
    {
      "epoch": 1.9090909090909092,
      "grad_norm": 7.732837677001953,
      "learning_rate": 6.181818181818182e-05,
      "loss": 0.3596,
      "step": 210
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.0822261571884155,
      "learning_rate": 6e-05,
      "loss": 0.2541,
      "step": 220
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.2631426453590393,
      "eval_mse": 0.2631426453590393,
      "eval_runtime": 144.7981,
      "eval_samples_per_second": 6.077,
      "eval_steps_per_second": 0.193,
      "step": 220
    },
    {
      "epoch": 2.090909090909091,
      "grad_norm": 7.311972618103027,
      "learning_rate": 5.818181818181818e-05,
      "loss": 0.2924,
      "step": 230
    },
    {
      "epoch": 2.1818181818181817,
      "grad_norm": 2.816499948501587,
      "learning_rate": 5.636363636363636e-05,
      "loss": 0.2654,
      "step": 240
    },
    {
      "epoch": 2.2727272727272725,
      "grad_norm": 2.1038618087768555,
      "learning_rate": 5.4545454545454546e-05,
      "loss": 0.3056,
      "step": 250
    },
    {
      "epoch": 2.3636363636363638,
      "grad_norm": 2.7272915840148926,
      "learning_rate": 5.272727272727272e-05,
      "loss": 0.2994,
      "step": 260
    },
    {
      "epoch": 2.4545454545454546,
      "grad_norm": 1.5483731031417847,
      "learning_rate": 5.090909090909091e-05,
      "loss": 0.2938,
      "step": 270
    },
    {
      "epoch": 2.5454545454545454,
      "grad_norm": 1.6636422872543335,
      "learning_rate": 4.909090909090909e-05,
      "loss": 0.2762,
      "step": 280
    },
    {
      "epoch": 2.6363636363636362,
      "grad_norm": 2.477865219116211,
      "learning_rate": 4.7272727272727275e-05,
      "loss": 0.3129,
      "step": 290
    },
    {
      "epoch": 2.7272727272727275,
      "grad_norm": 2.1476693153381348,
      "learning_rate": 4.545454545454546e-05,
      "loss": 0.2656,
      "step": 300
    },
    {
      "epoch": 2.8181818181818183,
      "grad_norm": 1.3794795274734497,
      "learning_rate": 4.3636363636363636e-05,
      "loss": 0.2629,
      "step": 310
    },
    {
      "epoch": 2.909090909090909,
      "grad_norm": 0.8552964925765991,
      "learning_rate": 4.181818181818182e-05,
      "loss": 0.2593,
      "step": 320
    },
    {
      "epoch": 3.0,
      "grad_norm": 1.4841196537017822,
      "learning_rate": 4e-05,
      "loss": 0.2589,
      "step": 330
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.2576321065425873,
      "eval_mse": 0.2576321065425873,
      "eval_runtime": 145.7562,
      "eval_samples_per_second": 6.037,
      "eval_steps_per_second": 0.192,
      "step": 330
    },
    {
      "epoch": 3.090909090909091,
      "grad_norm": 3.3481814861297607,
      "learning_rate": 3.818181818181819e-05,
      "loss": 0.267,
      "step": 340
    },
    {
      "epoch": 3.1818181818181817,
      "grad_norm": 1.704964280128479,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 0.2177,
      "step": 350
    },
    {
      "epoch": 3.2727272727272725,
      "grad_norm": 1.3325728178024292,
      "learning_rate": 3.454545454545455e-05,
      "loss": 0.2815,
      "step": 360
    },
    {
      "epoch": 3.3636363636363638,
      "grad_norm": 2.831594705581665,
      "learning_rate": 3.272727272727273e-05,
      "loss": 0.2813,
      "step": 370
    },
    {
      "epoch": 3.4545454545454546,
      "grad_norm": 1.7833585739135742,
      "learning_rate": 3.090909090909091e-05,
      "loss": 0.2745,
      "step": 380
    },
    {
      "epoch": 3.5454545454545454,
      "grad_norm": 2.4361367225646973,
      "learning_rate": 2.909090909090909e-05,
      "loss": 0.3067,
      "step": 390
    },
    {
      "epoch": 3.6363636363636362,
      "grad_norm": 1.7597070932388306,
      "learning_rate": 2.7272727272727273e-05,
      "loss": 0.2706,
      "step": 400
    },
    {
      "epoch": 3.7272727272727275,
      "grad_norm": 2.5651698112487793,
      "learning_rate": 2.5454545454545454e-05,
      "loss": 0.2742,
      "step": 410
    },
    {
      "epoch": 3.8181818181818183,
      "grad_norm": 1.827591896057129,
      "learning_rate": 2.3636363636363637e-05,
      "loss": 0.2732,
      "step": 420
    },
    {
      "epoch": 3.909090909090909,
      "grad_norm": 3.2626404762268066,
      "learning_rate": 2.1818181818181818e-05,
      "loss": 0.2653,
      "step": 430
    },
    {
      "epoch": 4.0,
      "grad_norm": 2.742932081222534,
      "learning_rate": 2e-05,
      "loss": 0.3396,
      "step": 440
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.2595486342906952,
      "eval_mse": 0.2595486640930176,
      "eval_runtime": 142.819,
      "eval_samples_per_second": 6.162,
      "eval_steps_per_second": 0.196,
      "step": 440
    },
    {
      "epoch": 4.090909090909091,
      "grad_norm": 1.4092158079147339,
      "learning_rate": 1.8181818181818182e-05,
      "loss": 0.2322,
      "step": 450
    },
    {
      "epoch": 4.181818181818182,
      "grad_norm": 4.581083297729492,
      "learning_rate": 1.6363636363636366e-05,
      "loss": 0.3015,
      "step": 460
    },
    {
      "epoch": 4.2727272727272725,
      "grad_norm": 1.8915051221847534,
      "learning_rate": 1.4545454545454545e-05,
      "loss": 0.2979,
      "step": 470
    },
    {
      "epoch": 4.363636363636363,
      "grad_norm": 2.127157688140869,
      "learning_rate": 1.2727272727272727e-05,
      "loss": 0.2525,
      "step": 480
    },
    {
      "epoch": 4.454545454545454,
      "grad_norm": 3.3702645301818848,
      "learning_rate": 1.0909090909090909e-05,
      "loss": 0.2603,
      "step": 490
    },
    {
      "epoch": 4.545454545454545,
      "grad_norm": 2.1041481494903564,
      "learning_rate": 9.090909090909091e-06,
      "loss": 0.2564,
      "step": 500
    },
    {
      "epoch": 4.636363636363637,
      "grad_norm": 2.5933034420013428,
      "learning_rate": 7.272727272727272e-06,
      "loss": 0.2677,
      "step": 510
    },
    {
      "epoch": 4.7272727272727275,
      "grad_norm": 2.349623918533325,
      "learning_rate": 5.4545454545454545e-06,
      "loss": 0.2752,
      "step": 520
    },
    {
      "epoch": 4.818181818181818,
      "grad_norm": 5.803585529327393,
      "learning_rate": 3.636363636363636e-06,
      "loss": 0.2943,
      "step": 530
    },
    {
      "epoch": 4.909090909090909,
      "grad_norm": 1.2074114084243774,
      "learning_rate": 1.818181818181818e-06,
      "loss": 0.2917,
      "step": 540
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.9745060801506042,
      "learning_rate": 0.0,
      "loss": 0.2587,
      "step": 550
    }
  ],
  "logging_steps": 10,
  "max_steps": 550,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}