|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.7473598700243704, |
|
"eval_steps": 500, |
|
"global_step": 460, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016246953696181964, |
|
"grad_norm": 0.11087504774332047, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2851, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03249390739236393, |
|
"grad_norm": 0.14227746427059174, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1866, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.048740861088545896, |
|
"grad_norm": 0.15468546748161316, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3344, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.06498781478472786, |
|
"grad_norm": 0.1867746114730835, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4973, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.08123476848090982, |
|
"grad_norm": 0.6773258447647095, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7041, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09748172217709179, |
|
"grad_norm": 0.09346572309732437, |
|
"learning_rate": 0.0002, |
|
"loss": 1.239, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.11372867587327376, |
|
"grad_norm": 0.11293257027864456, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1884, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.12997562956945571, |
|
"grad_norm": 0.1317419856786728, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3503, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.1462225832656377, |
|
"grad_norm": 0.20818021893501282, |
|
"learning_rate": 0.0002, |
|
"loss": 1.505, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.16246953696181965, |
|
"grad_norm": 0.5995267629623413, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7264, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.17871649065800163, |
|
"grad_norm": 0.10515395551919937, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1812, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.19496344435418358, |
|
"grad_norm": 0.11150451004505157, |
|
"learning_rate": 0.0002, |
|
"loss": 1.287, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.21121039805036557, |
|
"grad_norm": 0.1395130306482315, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2411, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.22745735174654752, |
|
"grad_norm": 0.15016046166419983, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4182, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2437043054427295, |
|
"grad_norm": 0.4636495113372803, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7269, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.25995125913891143, |
|
"grad_norm": 0.0939592495560646, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2139, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.27619821283509344, |
|
"grad_norm": 0.09509933739900589, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1963, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.2924451665312754, |
|
"grad_norm": 0.1235380694270134, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2873, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.30869212022745735, |
|
"grad_norm": 0.1839320808649063, |
|
"learning_rate": 0.0002, |
|
"loss": 1.455, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3249390739236393, |
|
"grad_norm": 0.481478750705719, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7412, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3411860276198213, |
|
"grad_norm": 0.08681885898113251, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2474, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.35743298131600326, |
|
"grad_norm": 0.09558644890785217, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2158, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.3736799350121852, |
|
"grad_norm": 0.12771648168563843, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3005, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.38992688870836717, |
|
"grad_norm": 0.17630507051944733, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4772, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.4061738424045491, |
|
"grad_norm": 0.44942063093185425, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6645, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.42242079610073113, |
|
"grad_norm": 0.09558656066656113, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1758, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.4386677497969131, |
|
"grad_norm": 0.09703896939754486, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2423, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.45491470349309504, |
|
"grad_norm": 0.12717948853969574, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3671, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.471161657189277, |
|
"grad_norm": 0.1827155202627182, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5173, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.487408610885459, |
|
"grad_norm": 0.4099660813808441, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6514, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.503655564581641, |
|
"grad_norm": 0.12756651639938354, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1952, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5199025182778229, |
|
"grad_norm": 0.09647507965564728, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1511, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5361494719740049, |
|
"grad_norm": 0.12492221593856812, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1778, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5523964256701869, |
|
"grad_norm": 0.18637599050998688, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4947, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5686433793663688, |
|
"grad_norm": 0.5491933226585388, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6702, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5848903330625508, |
|
"grad_norm": 0.091646708548069, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1607, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6011372867587328, |
|
"grad_norm": 0.10480837523937225, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2112, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6173842404549147, |
|
"grad_norm": 0.12856683135032654, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2448, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.6336311941510967, |
|
"grad_norm": 0.221836119890213, |
|
"learning_rate": 0.0002, |
|
"loss": 1.471, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.6498781478472786, |
|
"grad_norm": 0.442389577627182, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7765, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6661251015434606, |
|
"grad_norm": 0.08752889186143875, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2821, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.6823720552396426, |
|
"grad_norm": 0.10227832943201065, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1901, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6986190089358245, |
|
"grad_norm": 0.12081281840801239, |
|
"learning_rate": 0.0002, |
|
"loss": 1.302, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.7148659626320065, |
|
"grad_norm": 0.1923910230398178, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5397, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.7311129163281884, |
|
"grad_norm": 0.5226483941078186, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6591, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.7473598700243704, |
|
"grad_norm": 0.09780491888523102, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1514, |
|
"step": 460 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0671426142086758e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|