{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 97.44590163934426,
  "eval_steps": 500,
  "global_step": 3800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 1.2885245901639344, "grad_norm": 49.25, "learning_rate": 0.00019747235387045816, "loss": 6.9218, "step": 50 },
    { "epoch": 2.577049180327869, "grad_norm": 73.5, "learning_rate": 0.0001948393891521854, "loss": 3.5446, "step": 100 },
    { "epoch": 3.865573770491803, "grad_norm": 58.25, "learning_rate": 0.0001922064244339126, "loss": 3.191, "step": 150 },
    { "epoch": 5.131147540983607, "grad_norm": 49.0, "learning_rate": 0.00018957345971563983, "loss": 2.9104, "step": 200 },
    { "epoch": 6.419672131147541, "grad_norm": 66.5, "learning_rate": 0.00018694049499736707, "loss": 2.0795, "step": 250 },
    { "epoch": 7.7081967213114755, "grad_norm": 45.75, "learning_rate": 0.00018430753027909427, "loss": 2.3055, "step": 300 },
    { "epoch": 8.99672131147541, "grad_norm": 56.25, "learning_rate": 0.0001816745655608215, "loss": 1.8394, "step": 350 },
    { "epoch": 10.262295081967213, "grad_norm": 50.75, "learning_rate": 0.00017904160084254874, "loss": 1.5723, "step": 400 },
    { "epoch": 11.550819672131148, "grad_norm": 48.5, "learning_rate": 0.00017640863612427594, "loss": 1.4006, "step": 450 },
    { "epoch": 12.839344262295082, "grad_norm": 39.75, "learning_rate": 0.00017377567140600318, "loss": 1.363, "step": 500 },
    { "epoch": 14.104918032786886, "grad_norm": 38.5, "learning_rate": 0.0001711427066877304, "loss": 1.3352, "step": 550 },
    { "epoch": 15.39344262295082, "grad_norm": 45.0, "learning_rate": 0.00016850974196945762, "loss": 1.1165, "step": 600 },
    { "epoch": 16.681967213114753, "grad_norm": 44.0, "learning_rate": 0.00016587677725118485, "loss": 0.8736, "step": 650 },
    { "epoch": 17.970491803278687, "grad_norm": 43.5, "learning_rate": 0.00016324381253291208, "loss": 1.0635, "step": 700 },
    { "epoch": 19.236065573770492, "grad_norm": 34.25, "learning_rate": 0.0001606108478146393, "loss": 0.7858, "step": 750 },
    { "epoch": 20.524590163934427, "grad_norm": 37.25, "learning_rate": 0.00015797788309636652, "loss": 0.8236, "step": 800 },
    { "epoch": 21.81311475409836, "grad_norm": 35.5, "learning_rate": 0.00015534491837809376, "loss": 0.7766, "step": 850 },
    { "epoch": 23.078688524590163, "grad_norm": 33.0, "learning_rate": 0.00015271195365982096, "loss": 0.6612, "step": 900 },
    { "epoch": 24.367213114754097, "grad_norm": 33.75, "learning_rate": 0.0001500789889415482, "loss": 0.6364, "step": 950 },
    { "epoch": 25.65573770491803, "grad_norm": 38.25, "learning_rate": 0.00014744602422327543, "loss": 0.6553, "step": 1000 },
    { "epoch": 26.944262295081966, "grad_norm": 29.25, "learning_rate": 0.00014481305950500263, "loss": 0.5468, "step": 1050 },
    { "epoch": 28.20983606557377, "grad_norm": 35.25, "learning_rate": 0.00014218009478672987, "loss": 0.5311, "step": 1100 },
    { "epoch": 29.498360655737706, "grad_norm": 27.75, "learning_rate": 0.0001395471300684571, "loss": 0.5019, "step": 1150 },
    { "epoch": 30.78688524590164, "grad_norm": 28.125, "learning_rate": 0.0001369141653501843, "loss": 0.6387, "step": 1200 },
    { "epoch": 32.05245901639344, "grad_norm": 38.75, "learning_rate": 0.00013428120063191154, "loss": 0.5054, "step": 1250 },
    { "epoch": 33.34098360655738, "grad_norm": 21.875, "learning_rate": 0.00013164823591363877, "loss": 0.4805, "step": 1300 },
    { "epoch": 34.62950819672131, "grad_norm": 29.375, "learning_rate": 0.00012901527119536598, "loss": 0.5118, "step": 1350 },
    { "epoch": 35.91803278688525, "grad_norm": 36.0, "learning_rate": 0.0001263823064770932, "loss": 0.447, "step": 1400 },
    { "epoch": 37.18360655737705, "grad_norm": 24.125, "learning_rate": 0.00012374934175882045, "loss": 0.3921, "step": 1450 },
    { "epoch": 38.472131147540985, "grad_norm": 21.875, "learning_rate": 0.00012111637704054765, "loss": 0.4268, "step": 1500 },
    { "epoch": 39.760655737704916, "grad_norm": 22.25, "learning_rate": 0.00011848341232227489, "loss": 0.3317, "step": 1550 },
    { "epoch": 41.02622950819672, "grad_norm": 15.0625, "learning_rate": 0.00011585044760400212, "loss": 0.387, "step": 1600 },
    { "epoch": 42.31475409836066, "grad_norm": 20.875, "learning_rate": 0.00011321748288572934, "loss": 0.3285, "step": 1650 },
    { "epoch": 43.60327868852459, "grad_norm": 21.375, "learning_rate": 0.00011058451816745656, "loss": 0.3281, "step": 1700 },
    { "epoch": 44.89180327868853, "grad_norm": 22.75, "learning_rate": 0.00010795155344918379, "loss": 0.3148, "step": 1750 },
    { "epoch": 46.157377049180326, "grad_norm": 18.75, "learning_rate": 0.00010531858873091101, "loss": 0.2567, "step": 1800 },
    { "epoch": 47.445901639344264, "grad_norm": 23.75, "learning_rate": 0.00010268562401263824, "loss": 0.2609, "step": 1850 },
    { "epoch": 48.734426229508195, "grad_norm": 18.75, "learning_rate": 0.00010005265929436546, "loss": 0.2365, "step": 1900 },
    { "epoch": 50.0, "grad_norm": 5.09375, "learning_rate": 9.74196945760927e-05, "loss": 0.2555, "step": 1950 },
    { "epoch": 51.28852459016394, "grad_norm": 11.9375, "learning_rate": 9.478672985781992e-05, "loss": 0.2184, "step": 2000 },
    { "epoch": 52.57704918032787, "grad_norm": 12.8125, "learning_rate": 9.215376513954714e-05, "loss": 0.2279, "step": 2050 },
    { "epoch": 53.86557377049181, "grad_norm": 13.8125, "learning_rate": 8.952080042127437e-05, "loss": 0.202, "step": 2100 },
    { "epoch": 55.131147540983605, "grad_norm": 13.0625, "learning_rate": 8.688783570300159e-05, "loss": 0.1651, "step": 2150 },
    { "epoch": 56.41967213114754, "grad_norm": 11.4375, "learning_rate": 8.425487098472881e-05, "loss": 0.2015, "step": 2200 },
    { "epoch": 57.708196721311474, "grad_norm": 16.375, "learning_rate": 8.162190626645604e-05, "loss": 0.1504, "step": 2250 },
    { "epoch": 58.99672131147541, "grad_norm": 13.0625, "learning_rate": 7.898894154818326e-05, "loss": 0.1725, "step": 2300 },
    { "epoch": 60.26229508196721, "grad_norm": 13.6875, "learning_rate": 7.635597682991048e-05, "loss": 0.1499, "step": 2350 },
    { "epoch": 61.55081967213115, "grad_norm": 7.59375, "learning_rate": 7.372301211163771e-05, "loss": 0.145, "step": 2400 },
    { "epoch": 62.83934426229508, "grad_norm": 7.0625, "learning_rate": 7.109004739336493e-05, "loss": 0.1379, "step": 2450 },
    { "epoch": 64.10491803278688, "grad_norm": 4.15625, "learning_rate": 6.845708267509215e-05, "loss": 0.1244, "step": 2500 },
    { "epoch": 65.39344262295081, "grad_norm": 7.0, "learning_rate": 6.582411795681939e-05, "loss": 0.1214, "step": 2550 },
    { "epoch": 66.68196721311476, "grad_norm": 9.3125, "learning_rate": 6.31911532385466e-05, "loss": 0.1341, "step": 2600 },
    { "epoch": 67.97049180327869, "grad_norm": 7.09375, "learning_rate": 6.0558188520273826e-05, "loss": 0.1201, "step": 2650 },
    { "epoch": 69.23606557377049, "grad_norm": 20.5, "learning_rate": 5.792522380200106e-05, "loss": 0.1049, "step": 2700 },
    { "epoch": 70.52459016393442, "grad_norm": 5.90625, "learning_rate": 5.529225908372828e-05, "loss": 0.1033, "step": 2750 },
    { "epoch": 71.81311475409836, "grad_norm": 3.25, "learning_rate": 5.2659294365455505e-05, "loss": 0.1028, "step": 2800 },
    { "epoch": 73.07868852459016, "grad_norm": 8.0625, "learning_rate": 5.002632964718273e-05, "loss": 0.1003, "step": 2850 },
    { "epoch": 74.3672131147541, "grad_norm": 6.9375, "learning_rate": 4.739336492890996e-05, "loss": 0.0993, "step": 2900 },
    { "epoch": 75.65573770491804, "grad_norm": 3.734375, "learning_rate": 4.4760400210637185e-05, "loss": 0.0988, "step": 2950 },
    { "epoch": 76.94426229508197, "grad_norm": 4.84375, "learning_rate": 4.2127435492364404e-05, "loss": 0.0885, "step": 3000 },
    { "epoch": 78.20983606557377, "grad_norm": 2.71875, "learning_rate": 3.949447077409163e-05, "loss": 0.0816, "step": 3050 },
    { "epoch": 79.4983606557377, "grad_norm": 1.59375, "learning_rate": 3.686150605581886e-05, "loss": 0.0969, "step": 3100 },
    { "epoch": 80.78688524590164, "grad_norm": 2.5625, "learning_rate": 3.422854133754608e-05, "loss": 0.0886, "step": 3150 },
    { "epoch": 82.05245901639344, "grad_norm": 4.6875, "learning_rate": 3.15955766192733e-05, "loss": 0.0801, "step": 3200 },
    { "epoch": 83.34098360655737, "grad_norm": 2.53125, "learning_rate": 2.896261190100053e-05, "loss": 0.0888, "step": 3250 },
    { "epoch": 84.62950819672132, "grad_norm": 4.1875, "learning_rate": 2.6329647182727753e-05, "loss": 0.0872, "step": 3300 },
    { "epoch": 85.91803278688525, "grad_norm": 2.9375, "learning_rate": 2.369668246445498e-05, "loss": 0.0807, "step": 3350 },
    { "epoch": 87.18360655737705, "grad_norm": 2.84375, "learning_rate": 2.1063717746182202e-05, "loss": 0.0779, "step": 3400 },
    { "epoch": 88.47213114754098, "grad_norm": 1.8125, "learning_rate": 1.843075302790943e-05, "loss": 0.0741, "step": 3450 },
    { "epoch": 89.76065573770492, "grad_norm": 2.171875, "learning_rate": 1.579778830963665e-05, "loss": 0.0833, "step": 3500 },
    { "epoch": 91.02622950819672, "grad_norm": 2.484375, "learning_rate": 1.3164823591363876e-05, "loss": 0.0861, "step": 3550 },
    { "epoch": 92.31475409836065, "grad_norm": 2.046875, "learning_rate": 1.0531858873091101e-05, "loss": 0.08, "step": 3600 },
    { "epoch": 93.6032786885246, "grad_norm": 2.84375, "learning_rate": 7.898894154818326e-06, "loss": 0.0785, "step": 3650 },
    { "epoch": 94.89180327868853, "grad_norm": 2.28125, "learning_rate": 5.2659294365455505e-06, "loss": 0.0936, "step": 3700 },
    { "epoch": 96.15737704918033, "grad_norm": 1.7734375, "learning_rate": 2.6329647182727753e-06, "loss": 0.0741, "step": 3750 },
    { "epoch": 97.44590163934426, "grad_norm": 2.21875, "learning_rate": 0.0, "loss": 0.0927, "step": 3800 }
  ],
  "logging_steps": 50,
  "max_steps": 3800,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 100,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.470967617037125e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}