|
{ |
|
"best_metric": 6.792733192443848, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-450", |
|
"epoch": 0.11763321962122103, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00023526643924244207, |
|
"eval_loss": 6.933060169219971, |
|
"eval_runtime": 3.2084, |
|
"eval_samples_per_second": 557.906, |
|
"eval_steps_per_second": 139.632, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0023526643924244206, |
|
"grad_norm": 0.6924540996551514, |
|
"learning_rate": 4.2800000000000004e-05, |
|
"loss": 6.9289, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.004705328784848841, |
|
"grad_norm": 0.6077510118484497, |
|
"learning_rate": 8.560000000000001e-05, |
|
"loss": 6.9322, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.007057993177273262, |
|
"grad_norm": 0.5621480941772461, |
|
"learning_rate": 0.0001284, |
|
"loss": 6.9184, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.009410657569697682, |
|
"grad_norm": 0.8597236275672913, |
|
"learning_rate": 0.00017120000000000001, |
|
"loss": 6.8977, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.011763321962122103, |
|
"grad_norm": 0.7145811319351196, |
|
"learning_rate": 0.000214, |
|
"loss": 6.8562, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.011763321962122103, |
|
"eval_loss": 6.864169597625732, |
|
"eval_runtime": 3.1888, |
|
"eval_samples_per_second": 561.335, |
|
"eval_steps_per_second": 140.491, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.014115986354546525, |
|
"grad_norm": 0.49429962038993835, |
|
"learning_rate": 0.00021373935337780118, |
|
"loss": 6.8976, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.016468650746970945, |
|
"grad_norm": 0.4391353726387024, |
|
"learning_rate": 0.00021295868335534802, |
|
"loss": 6.8643, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.018821315139395365, |
|
"grad_norm": 0.4785984754562378, |
|
"learning_rate": 0.0002116617932785172, |
|
"loss": 6.8459, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.021173979531819785, |
|
"grad_norm": 0.5224839448928833, |
|
"learning_rate": 0.00020985500146540012, |
|
"loss": 6.8126, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.023526643924244205, |
|
"grad_norm": 0.7273280024528503, |
|
"learning_rate": 0.0002075471104240922, |
|
"loss": 6.8084, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.023526643924244205, |
|
"eval_loss": 6.832787036895752, |
|
"eval_runtime": 3.2348, |
|
"eval_samples_per_second": 553.365, |
|
"eval_steps_per_second": 138.496, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.025879308316668626, |
|
"grad_norm": 0.907507598400116, |
|
"learning_rate": 0.00020474936396775828, |
|
"loss": 6.8848, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.02823197270909305, |
|
"grad_norm": 0.47431233525276184, |
|
"learning_rate": 0.00020147539243590517, |
|
"loss": 6.8506, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.03058463710151747, |
|
"grad_norm": 0.5569249391555786, |
|
"learning_rate": 0.00019774114628873756, |
|
"loss": 6.8298, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.03293730149394189, |
|
"grad_norm": 0.5526637434959412, |
|
"learning_rate": 0.00019356481839811937, |
|
"loss": 6.7974, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.03528996588636631, |
|
"grad_norm": 0.7836293578147888, |
|
"learning_rate": 0.00018896675541373064, |
|
"loss": 6.7893, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.03528996588636631, |
|
"eval_loss": 6.818791389465332, |
|
"eval_runtime": 3.2485, |
|
"eval_samples_per_second": 551.016, |
|
"eval_steps_per_second": 137.908, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.03764263027879073, |
|
"grad_norm": 0.6871911883354187, |
|
"learning_rate": 0.00018396935863623567, |
|
"loss": 6.8675, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.039995294671215154, |
|
"grad_norm": 0.5481328964233398, |
|
"learning_rate": 0.00017859697488039784, |
|
"loss": 6.8491, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.04234795906363957, |
|
"grad_norm": 0.5421447157859802, |
|
"learning_rate": 0.00017287577785984542, |
|
"loss": 6.8076, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.044700623456063994, |
|
"grad_norm": 0.5816627740859985, |
|
"learning_rate": 0.0001668336406713699, |
|
"loss": 6.7903, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.04705328784848841, |
|
"grad_norm": 0.8540289402008057, |
|
"learning_rate": 0.0001605, |
|
"loss": 6.7782, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.04705328784848841, |
|
"eval_loss": 6.80977725982666, |
|
"eval_runtime": 3.211, |
|
"eval_samples_per_second": 557.461, |
|
"eval_steps_per_second": 139.521, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.049405952240912834, |
|
"grad_norm": 0.6835295557975769, |
|
"learning_rate": 0.00015390571270643128, |
|
"loss": 6.8697, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.05175861663333725, |
|
"grad_norm": 0.723054826259613, |
|
"learning_rate": 0.0001470829054955026, |
|
"loss": 6.8365, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.054111281025761675, |
|
"grad_norm": 0.6531696319580078, |
|
"learning_rate": 0.00014006481839811937, |
|
"loss": 6.8019, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.0564639454181861, |
|
"grad_norm": 0.5882543921470642, |
|
"learning_rate": 0.00013288564282916442, |
|
"loss": 6.7801, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.058816609810610515, |
|
"grad_norm": 0.7119730710983276, |
|
"learning_rate": 0.00012558035501036158, |
|
"loss": 6.7837, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.058816609810610515, |
|
"eval_loss": 6.803854465484619, |
|
"eval_runtime": 3.3022, |
|
"eval_samples_per_second": 542.059, |
|
"eval_steps_per_second": 135.666, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.06116927420303494, |
|
"grad_norm": 0.7334095239639282, |
|
"learning_rate": 0.00011818454556963892, |
|
"loss": 6.8623, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.06352193859545936, |
|
"grad_norm": 0.4964336156845093, |
|
"learning_rate": 0.00011073424614716762, |
|
"loss": 6.8151, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.06587460298788378, |
|
"grad_norm": 0.6035408973693848, |
|
"learning_rate": 0.00010326575385283242, |
|
"loss": 6.8015, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.0682272673803082, |
|
"grad_norm": 0.723812460899353, |
|
"learning_rate": 9.58154544303611e-05, |
|
"loss": 6.7626, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.07057993177273263, |
|
"grad_norm": 1.4514328241348267, |
|
"learning_rate": 8.841964498963846e-05, |
|
"loss": 6.7715, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.07057993177273263, |
|
"eval_loss": 6.798593044281006, |
|
"eval_runtime": 3.2084, |
|
"eval_samples_per_second": 557.906, |
|
"eval_steps_per_second": 139.632, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.07293259616515704, |
|
"grad_norm": 0.9387493133544922, |
|
"learning_rate": 8.111435717083556e-05, |
|
"loss": 6.8594, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.07528526055758146, |
|
"grad_norm": 0.4966665506362915, |
|
"learning_rate": 7.393518160188063e-05, |
|
"loss": 6.8124, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.07763792495000588, |
|
"grad_norm": 0.5690203905105591, |
|
"learning_rate": 6.69170945044974e-05, |
|
"loss": 6.7967, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.07999058934243031, |
|
"grad_norm": 0.5846540927886963, |
|
"learning_rate": 6.009428729356871e-05, |
|
"loss": 6.788, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.08234325373485472, |
|
"grad_norm": 0.8786140084266663, |
|
"learning_rate": 5.3500000000000026e-05, |
|
"loss": 6.7735, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.08234325373485472, |
|
"eval_loss": 6.795702934265137, |
|
"eval_runtime": 3.2583, |
|
"eval_samples_per_second": 549.37, |
|
"eval_steps_per_second": 137.496, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.08469591812727914, |
|
"grad_norm": 0.5708802342414856, |
|
"learning_rate": 4.7166359328630106e-05, |
|
"loss": 6.8333, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.08704858251970357, |
|
"grad_norm": 0.5529999136924744, |
|
"learning_rate": 4.112422214015456e-05, |
|
"loss": 6.8258, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.08940124691212799, |
|
"grad_norm": 0.6375375390052795, |
|
"learning_rate": 3.5403025119602206e-05, |
|
"loss": 6.7989, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.0917539113045524, |
|
"grad_norm": 0.974999189376831, |
|
"learning_rate": 3.0030641363764346e-05, |
|
"loss": 6.751, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.09410657569697682, |
|
"grad_norm": 1.1716803312301636, |
|
"learning_rate": 2.5033244586269365e-05, |
|
"loss": 6.7699, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.09410657569697682, |
|
"eval_loss": 6.793699741363525, |
|
"eval_runtime": 3.2263, |
|
"eval_samples_per_second": 554.821, |
|
"eval_steps_per_second": 138.86, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.09645924008940125, |
|
"grad_norm": 0.6257851719856262, |
|
"learning_rate": 2.0435181601880635e-05, |
|
"loss": 6.8415, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.09881190448182567, |
|
"grad_norm": 0.5798721313476562, |
|
"learning_rate": 1.625885371126242e-05, |
|
"loss": 6.825, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.10116456887425009, |
|
"grad_norm": 0.5897112488746643, |
|
"learning_rate": 1.2524607564094813e-05, |
|
"loss": 6.797, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.1035172332666745, |
|
"grad_norm": 0.7111787796020508, |
|
"learning_rate": 9.250636032241695e-06, |
|
"loss": 6.772, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.10586989765909893, |
|
"grad_norm": 1.2426642179489136, |
|
"learning_rate": 6.45288957590781e-06, |
|
"loss": 6.7647, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.10586989765909893, |
|
"eval_loss": 6.792733192443848, |
|
"eval_runtime": 3.2252, |
|
"eval_samples_per_second": 555.008, |
|
"eval_steps_per_second": 138.907, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.10822256205152335, |
|
"grad_norm": 0.5731855630874634, |
|
"learning_rate": 4.144998534599878e-06, |
|
"loss": 6.8432, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.11057522644394777, |
|
"grad_norm": 0.7041200399398804, |
|
"learning_rate": 2.3382067214827915e-06, |
|
"loss": 6.8216, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.1129278908363722, |
|
"grad_norm": 0.7549869418144226, |
|
"learning_rate": 1.0413166446519713e-06, |
|
"loss": 6.7779, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.11528055522879661, |
|
"grad_norm": 0.7202994227409363, |
|
"learning_rate": 2.6064662219881083e-07, |
|
"loss": 6.7666, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.11763321962122103, |
|
"grad_norm": 1.0722975730895996, |
|
"learning_rate": 0.0, |
|
"loss": 6.7527, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.11763321962122103, |
|
"eval_loss": 6.792914867401123, |
|
"eval_runtime": 3.2274, |
|
"eval_samples_per_second": 554.628, |
|
"eval_steps_per_second": 138.812, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 1 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1201410048000.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|