|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 591, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.050761421319796954, |
|
"grad_norm": 2.884176452669992, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7058, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.10152284263959391, |
|
"grad_norm": 1.1031748057601416, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6396, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.15228426395939088, |
|
"grad_norm": 0.8763770464242356, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6263, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.20304568527918782, |
|
"grad_norm": 0.8637694735287882, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6163, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.25380710659898476, |
|
"grad_norm": 0.6443562464039193, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5983, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.30456852791878175, |
|
"grad_norm": 0.7181436435893084, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5841, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3553299492385787, |
|
"grad_norm": 0.5736245789251513, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5851, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.40609137055837563, |
|
"grad_norm": 0.5370831215793392, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5743, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.45685279187817257, |
|
"grad_norm": 0.7868112154827747, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5631, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5076142131979695, |
|
"grad_norm": 0.6384967672565564, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5621, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5583756345177665, |
|
"grad_norm": 0.6105632833081082, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5626, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6091370558375635, |
|
"grad_norm": 0.4980885433595067, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5576, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.6598984771573604, |
|
"grad_norm": 0.5567494710307067, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5634, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.7106598984771574, |
|
"grad_norm": 0.5747955173184647, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5556, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.7614213197969543, |
|
"grad_norm": 0.7422760458895346, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5485, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.8121827411167513, |
|
"grad_norm": 0.5519407925537647, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5555, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.8629441624365483, |
|
"grad_norm": 0.3938384364399605, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5534, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.9137055837563451, |
|
"grad_norm": 0.4274758585586587, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5504, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.9644670050761421, |
|
"grad_norm": 0.4623946226615239, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5428, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.545667827129364, |
|
"eval_runtime": 69.9223, |
|
"eval_samples_per_second": 75.899, |
|
"eval_steps_per_second": 0.601, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.015228426395939, |
|
"grad_norm": 0.4892463181375632, |
|
"learning_rate": 5e-06, |
|
"loss": 0.538, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.0659898477157361, |
|
"grad_norm": 0.5258103265854674, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5005, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.116751269035533, |
|
"grad_norm": 0.43953346933881265, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5116, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.16751269035533, |
|
"grad_norm": 0.555350254923011, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5039, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.218274111675127, |
|
"grad_norm": 0.4240258545882722, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4979, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.2690355329949239, |
|
"grad_norm": 0.6055626365057643, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4977, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.3197969543147208, |
|
"grad_norm": 0.5183814019968731, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5015, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.3705583756345177, |
|
"grad_norm": 0.44198994392173363, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5052, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.4213197969543148, |
|
"grad_norm": 0.4597615345223239, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4974, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.4720812182741116, |
|
"grad_norm": 0.47036560762175594, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5044, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.5228426395939088, |
|
"grad_norm": 0.4356873877920564, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5134, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.5736040609137056, |
|
"grad_norm": 0.4547653150275499, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5022, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.6243654822335025, |
|
"grad_norm": 0.5243750306692014, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5001, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.6751269035532994, |
|
"grad_norm": 0.5072947429791255, |
|
"learning_rate": 5e-06, |
|
"loss": 0.504, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.7258883248730963, |
|
"grad_norm": 0.4591167012346128, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5008, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.7766497461928934, |
|
"grad_norm": 0.559953933092174, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5058, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.8274111675126905, |
|
"grad_norm": 0.4615691289798925, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5013, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.8781725888324874, |
|
"grad_norm": 0.4673102096867509, |
|
"learning_rate": 5e-06, |
|
"loss": 0.505, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.9289340101522843, |
|
"grad_norm": 0.41382969409293313, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5029, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.9796954314720812, |
|
"grad_norm": 0.4786556881881671, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4991, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.5367357730865479, |
|
"eval_runtime": 69.7466, |
|
"eval_samples_per_second": 76.09, |
|
"eval_steps_per_second": 0.602, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 2.030456852791878, |
|
"grad_norm": 0.7058322019162009, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4785, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.081218274111675, |
|
"grad_norm": 0.45765334889032916, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4563, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.1319796954314723, |
|
"grad_norm": 0.4288566534513188, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4564, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.182741116751269, |
|
"grad_norm": 0.4543531173101596, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4559, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.233502538071066, |
|
"grad_norm": 0.5677467898296061, |
|
"learning_rate": 5e-06, |
|
"loss": 0.454, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.284263959390863, |
|
"grad_norm": 0.5213901371480834, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4584, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.33502538071066, |
|
"grad_norm": 0.5106205098926287, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4603, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.3857868020304567, |
|
"grad_norm": 0.48058806334750254, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4586, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.436548223350254, |
|
"grad_norm": 0.4462504100857437, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4547, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.487309644670051, |
|
"grad_norm": 0.47373720391622604, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4553, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.5380710659898478, |
|
"grad_norm": 0.44268127361884035, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4552, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.5888324873096447, |
|
"grad_norm": 0.5168789659092264, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4495, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.6395939086294415, |
|
"grad_norm": 0.4759213683260235, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4601, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.6903553299492384, |
|
"grad_norm": 0.5405150525609788, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4639, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.7411167512690353, |
|
"grad_norm": 0.48554448131156325, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4561, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.7918781725888326, |
|
"grad_norm": 0.5715450026884347, |
|
"learning_rate": 5e-06, |
|
"loss": 0.458, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.8426395939086295, |
|
"grad_norm": 0.45709891570844524, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4582, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.8934010152284264, |
|
"grad_norm": 0.4329230281361173, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4535, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.9441624365482233, |
|
"grad_norm": 0.4707895543051601, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4609, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.99492385786802, |
|
"grad_norm": 0.4717664390758382, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4636, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.5386124849319458, |
|
"eval_runtime": 67.8364, |
|
"eval_samples_per_second": 78.232, |
|
"eval_steps_per_second": 0.619, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 591, |
|
"total_flos": 989528252743680.0, |
|
"train_loss": 0.5134775242224563, |
|
"train_runtime": 10543.3353, |
|
"train_samples_per_second": 28.688, |
|
"train_steps_per_second": 0.056 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 591, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 989528252743680.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|