|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.3337505214851898, |
|
"eval_steps": 500, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008343763037129746, |
|
"grad_norm": 0.7562159895896912, |
|
"learning_rate": 0.0001951951951951952, |
|
"loss": 1.9181, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.016687526074259492, |
|
"grad_norm": 0.8098325133323669, |
|
"learning_rate": 0.0001901901901901902, |
|
"loss": 1.1849, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.025031289111389236, |
|
"grad_norm": 0.3777216374874115, |
|
"learning_rate": 0.0001851851851851852, |
|
"loss": 1.6916, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.033375052148518984, |
|
"grad_norm": 0.5206697583198547, |
|
"learning_rate": 0.00018018018018018018, |
|
"loss": 1.1422, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.041718815185648725, |
|
"grad_norm": 0.38817787170410156, |
|
"learning_rate": 0.0001751751751751752, |
|
"loss": 1.6667, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.05006257822277847, |
|
"grad_norm": 0.6388385891914368, |
|
"learning_rate": 0.0001701701701701702, |
|
"loss": 1.2033, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.05840634125990822, |
|
"grad_norm": 0.4120958149433136, |
|
"learning_rate": 0.00016516516516516518, |
|
"loss": 1.725, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.06675010429703797, |
|
"grad_norm": 0.46837368607521057, |
|
"learning_rate": 0.00016016016016016018, |
|
"loss": 1.065, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.07509386733416772, |
|
"grad_norm": 0.4679395854473114, |
|
"learning_rate": 0.00015515515515515516, |
|
"loss": 1.7601, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.08343763037129745, |
|
"grad_norm": 0.380135178565979, |
|
"learning_rate": 0.00015015015015015014, |
|
"loss": 1.0251, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.0917813934084272, |
|
"grad_norm": 0.3675175905227661, |
|
"learning_rate": 0.00014514514514514515, |
|
"loss": 1.6459, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.10012515644555695, |
|
"grad_norm": 0.48725131154060364, |
|
"learning_rate": 0.00014014014014014013, |
|
"loss": 1.0373, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1084689194826867, |
|
"grad_norm": 0.42459967732429504, |
|
"learning_rate": 0.00013513513513513514, |
|
"loss": 1.6939, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.11681268251981644, |
|
"grad_norm": 0.44615626335144043, |
|
"learning_rate": 0.00013013013013013014, |
|
"loss": 1.1265, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.1251564455569462, |
|
"grad_norm": 0.3729555904865265, |
|
"learning_rate": 0.00012512512512512512, |
|
"loss": 1.7411, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.13350020859407594, |
|
"grad_norm": 0.42633363604545593, |
|
"learning_rate": 0.00012012012012012013, |
|
"loss": 1.1436, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.14184397163120568, |
|
"grad_norm": 0.4313684105873108, |
|
"learning_rate": 0.00011511511511511512, |
|
"loss": 1.6521, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.15018773466833543, |
|
"grad_norm": 0.3195270597934723, |
|
"learning_rate": 0.00011011011011011012, |
|
"loss": 0.984, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.15853149770546515, |
|
"grad_norm": 0.279301255941391, |
|
"learning_rate": 0.00010510510510510511, |
|
"loss": 1.6523, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.1668752607425949, |
|
"grad_norm": 0.32661929726600647, |
|
"learning_rate": 0.00010010010010010012, |
|
"loss": 1.0151, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1668752607425949, |
|
"eval_loss": 1.5524340867996216, |
|
"eval_runtime": 511.702, |
|
"eval_samples_per_second": 2.931, |
|
"eval_steps_per_second": 0.367, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.17521902377972465, |
|
"grad_norm": 0.27320751547813416, |
|
"learning_rate": 9.50950950950951e-05, |
|
"loss": 1.6524, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.1835627868168544, |
|
"grad_norm": 0.393206387758255, |
|
"learning_rate": 9.009009009009009e-05, |
|
"loss": 0.9658, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.19190654985398414, |
|
"grad_norm": 0.29037702083587646, |
|
"learning_rate": 8.50850850850851e-05, |
|
"loss": 1.6467, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.2002503128911139, |
|
"grad_norm": 0.3599018156528473, |
|
"learning_rate": 8.008008008008009e-05, |
|
"loss": 1.0368, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.20859407592824364, |
|
"grad_norm": 0.397064208984375, |
|
"learning_rate": 7.507507507507507e-05, |
|
"loss": 1.7412, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.2169378389653734, |
|
"grad_norm": 0.3856523334980011, |
|
"learning_rate": 7.007007007007007e-05, |
|
"loss": 1.1053, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.22528160200250313, |
|
"grad_norm": 0.30977582931518555, |
|
"learning_rate": 6.506506506506507e-05, |
|
"loss": 1.7623, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.23362536503963288, |
|
"grad_norm": 0.4029249846935272, |
|
"learning_rate": 6.0060060060060066e-05, |
|
"loss": 1.1681, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.24196912807676263, |
|
"grad_norm": 0.3093183636665344, |
|
"learning_rate": 5.505505505505506e-05, |
|
"loss": 1.7267, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.2503128911138924, |
|
"grad_norm": 0.45136523246765137, |
|
"learning_rate": 5.005005005005006e-05, |
|
"loss": 1.1256, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.2586566541510221, |
|
"grad_norm": 0.2787851393222809, |
|
"learning_rate": 4.5045045045045046e-05, |
|
"loss": 1.7174, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.2670004171881519, |
|
"grad_norm": 0.33277878165245056, |
|
"learning_rate": 4.0040040040040046e-05, |
|
"loss": 0.9848, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.2753441802252816, |
|
"grad_norm": 0.3014131784439087, |
|
"learning_rate": 3.503503503503503e-05, |
|
"loss": 1.6547, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.28368794326241137, |
|
"grad_norm": 0.27079567313194275, |
|
"learning_rate": 3.0030030030030033e-05, |
|
"loss": 1.1372, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.2920317062995411, |
|
"grad_norm": 0.2874036729335785, |
|
"learning_rate": 2.502502502502503e-05, |
|
"loss": 1.7384, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.30037546933667086, |
|
"grad_norm": 0.38741400837898254, |
|
"learning_rate": 2.0020020020020023e-05, |
|
"loss": 1.0731, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.3087192323738006, |
|
"grad_norm": 0.3522486090660095, |
|
"learning_rate": 1.5015015015015016e-05, |
|
"loss": 1.6537, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.3170629954109303, |
|
"grad_norm": 0.41225603222846985, |
|
"learning_rate": 1.0010010010010011e-05, |
|
"loss": 1.0646, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.32540675844806005, |
|
"grad_norm": 0.33286789059638977, |
|
"learning_rate": 5.005005005005006e-06, |
|
"loss": 1.6279, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.3337505214851898, |
|
"grad_norm": 0.405441552400589, |
|
"learning_rate": 0.0, |
|
"loss": 1.0594, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3337505214851898, |
|
"eval_loss": 1.5415141582489014, |
|
"eval_runtime": 511.169, |
|
"eval_samples_per_second": 2.934, |
|
"eval_steps_per_second": 0.368, |
|
"step": 1000 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 1000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 250, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.341797200653312e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|