|
{ |
|
"best_metric": 1.8849581480026245, |
|
"best_model_checkpoint": "/content/drive/MyDrive/Hugh Mann/Qwen_SMS_Final/checkpoint-700", |
|
"epoch": 0.7261410788381742, |
|
"eval_steps": 50, |
|
"global_step": 700, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01037344398340249, |
|
"grad_norm": 4.638428688049316, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 4.6475, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02074688796680498, |
|
"grad_norm": 4.252462387084961, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 4.5383, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03112033195020747, |
|
"grad_norm": 3.85965895652771, |
|
"learning_rate": 2.4e-05, |
|
"loss": 4.3536, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04149377593360996, |
|
"grad_norm": 4.050565719604492, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 3.9934, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05186721991701245, |
|
"grad_norm": 3.7960705757141113, |
|
"learning_rate": 3.8400000000000005e-05, |
|
"loss": 3.3933, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05186721991701245, |
|
"eval_loss": 3.1315643787384033, |
|
"eval_runtime": 132.8003, |
|
"eval_samples_per_second": 25.813, |
|
"eval_steps_per_second": 12.907, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06224066390041494, |
|
"grad_norm": 2.2538864612579346, |
|
"learning_rate": 4.64e-05, |
|
"loss": 2.8554, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07261410788381743, |
|
"grad_norm": 0.8144139051437378, |
|
"learning_rate": 5.440000000000001e-05, |
|
"loss": 2.6473, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08298755186721991, |
|
"grad_norm": 1.0595426559448242, |
|
"learning_rate": 6.240000000000001e-05, |
|
"loss": 2.5251, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09336099585062241, |
|
"grad_norm": 1.4303592443466187, |
|
"learning_rate": 7.04e-05, |
|
"loss": 2.2689, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1037344398340249, |
|
"grad_norm": 1.6767040491104126, |
|
"learning_rate": 7.840000000000001e-05, |
|
"loss": 2.1918, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1037344398340249, |
|
"eval_loss": 2.1281208992004395, |
|
"eval_runtime": 133.7825, |
|
"eval_samples_per_second": 25.624, |
|
"eval_steps_per_second": 12.812, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11410788381742738, |
|
"grad_norm": 1.005283236503601, |
|
"learning_rate": 7.925925925925926e-05, |
|
"loss": 2.0838, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.12448132780082988, |
|
"grad_norm": 0.6838532090187073, |
|
"learning_rate": 7.833333333333333e-05, |
|
"loss": 1.978, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.13485477178423236, |
|
"grad_norm": 0.7325747609138489, |
|
"learning_rate": 7.740740740740741e-05, |
|
"loss": 2.0304, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.14522821576763487, |
|
"grad_norm": 0.7506985664367676, |
|
"learning_rate": 7.648148148148149e-05, |
|
"loss": 1.9942, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.15560165975103735, |
|
"grad_norm": 0.8646144270896912, |
|
"learning_rate": 7.555555555555556e-05, |
|
"loss": 1.9951, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.15560165975103735, |
|
"eval_loss": 1.9528735876083374, |
|
"eval_runtime": 133.6246, |
|
"eval_samples_per_second": 25.654, |
|
"eval_steps_per_second": 12.827, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.16597510373443983, |
|
"grad_norm": 0.6079633831977844, |
|
"learning_rate": 7.462962962962964e-05, |
|
"loss": 1.9617, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.17634854771784234, |
|
"grad_norm": 0.5766311883926392, |
|
"learning_rate": 7.37037037037037e-05, |
|
"loss": 1.9178, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.18672199170124482, |
|
"grad_norm": 0.6486707329750061, |
|
"learning_rate": 7.277777777777778e-05, |
|
"loss": 1.892, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.1970954356846473, |
|
"grad_norm": 0.7130193114280701, |
|
"learning_rate": 7.185185185185186e-05, |
|
"loss": 1.9972, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2074688796680498, |
|
"grad_norm": 0.6239674687385559, |
|
"learning_rate": 7.092592592592593e-05, |
|
"loss": 1.9559, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2074688796680498, |
|
"eval_loss": 1.9308879375457764, |
|
"eval_runtime": 133.959, |
|
"eval_samples_per_second": 25.59, |
|
"eval_steps_per_second": 12.795, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.21784232365145229, |
|
"grad_norm": 0.7013466954231262, |
|
"learning_rate": 7.000000000000001e-05, |
|
"loss": 2.056, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.22821576763485477, |
|
"grad_norm": 0.7093988656997681, |
|
"learning_rate": 6.907407407407407e-05, |
|
"loss": 1.881, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.23858921161825727, |
|
"grad_norm": 0.6386205554008484, |
|
"learning_rate": 6.814814814814815e-05, |
|
"loss": 2.01, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.24896265560165975, |
|
"grad_norm": 0.5995863080024719, |
|
"learning_rate": 6.722222222222223e-05, |
|
"loss": 1.9305, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.25933609958506226, |
|
"grad_norm": 0.640533447265625, |
|
"learning_rate": 6.62962962962963e-05, |
|
"loss": 2.0669, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.25933609958506226, |
|
"eval_loss": 1.9190937280654907, |
|
"eval_runtime": 133.9066, |
|
"eval_samples_per_second": 25.6, |
|
"eval_steps_per_second": 12.8, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.2697095435684647, |
|
"grad_norm": 0.5778368711471558, |
|
"learning_rate": 6.537037037037038e-05, |
|
"loss": 1.8447, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.2800829875518672, |
|
"grad_norm": 0.7321183681488037, |
|
"learning_rate": 6.444444444444446e-05, |
|
"loss": 1.9436, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.29045643153526973, |
|
"grad_norm": 0.7635217308998108, |
|
"learning_rate": 6.351851851851852e-05, |
|
"loss": 1.9401, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.3008298755186722, |
|
"grad_norm": 0.7025775909423828, |
|
"learning_rate": 6.25925925925926e-05, |
|
"loss": 1.9252, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.3112033195020747, |
|
"grad_norm": 0.7111702561378479, |
|
"learning_rate": 6.166666666666667e-05, |
|
"loss": 1.8944, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3112033195020747, |
|
"eval_loss": 1.9082934856414795, |
|
"eval_runtime": 133.3981, |
|
"eval_samples_per_second": 25.698, |
|
"eval_steps_per_second": 12.849, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3215767634854772, |
|
"grad_norm": 0.6737669110298157, |
|
"learning_rate": 6.074074074074075e-05, |
|
"loss": 1.9494, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.33195020746887965, |
|
"grad_norm": 0.6313813924789429, |
|
"learning_rate": 5.981481481481482e-05, |
|
"loss": 2.0403, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.34232365145228216, |
|
"grad_norm": 0.6727941632270813, |
|
"learning_rate": 5.8888888888888896e-05, |
|
"loss": 1.8966, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.35269709543568467, |
|
"grad_norm": 0.72395259141922, |
|
"learning_rate": 5.796296296296297e-05, |
|
"loss": 2.1252, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3630705394190871, |
|
"grad_norm": 0.5979896783828735, |
|
"learning_rate": 5.7037037037037035e-05, |
|
"loss": 1.9482, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3630705394190871, |
|
"eval_loss": 1.9038680791854858, |
|
"eval_runtime": 134.2066, |
|
"eval_samples_per_second": 25.543, |
|
"eval_steps_per_second": 12.771, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.37344398340248963, |
|
"grad_norm": 0.688392698764801, |
|
"learning_rate": 5.6111111111111114e-05, |
|
"loss": 1.9134, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.38381742738589214, |
|
"grad_norm": 0.6470796465873718, |
|
"learning_rate": 5.518518518518519e-05, |
|
"loss": 1.8787, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.3941908713692946, |
|
"grad_norm": 0.6241974830627441, |
|
"learning_rate": 5.425925925925926e-05, |
|
"loss": 1.9488, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.4045643153526971, |
|
"grad_norm": 0.6315338015556335, |
|
"learning_rate": 5.333333333333333e-05, |
|
"loss": 1.913, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.4149377593360996, |
|
"grad_norm": 0.6824229955673218, |
|
"learning_rate": 5.2407407407407406e-05, |
|
"loss": 1.9351, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4149377593360996, |
|
"eval_loss": 1.898223876953125, |
|
"eval_runtime": 133.7224, |
|
"eval_samples_per_second": 25.635, |
|
"eval_steps_per_second": 12.818, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.42531120331950206, |
|
"grad_norm": 0.7064498066902161, |
|
"learning_rate": 5.1481481481481486e-05, |
|
"loss": 2.0337, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.43568464730290457, |
|
"grad_norm": 0.5973237752914429, |
|
"learning_rate": 5.055555555555556e-05, |
|
"loss": 2.2631, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.4460580912863071, |
|
"grad_norm": 0.5477844476699829, |
|
"learning_rate": 4.962962962962963e-05, |
|
"loss": 1.9058, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.45643153526970953, |
|
"grad_norm": 0.772850513458252, |
|
"learning_rate": 4.8703703703703704e-05, |
|
"loss": 1.8676, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.46680497925311204, |
|
"grad_norm": 0.6943506598472595, |
|
"learning_rate": 4.777777777777778e-05, |
|
"loss": 1.9578, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.46680497925311204, |
|
"eval_loss": 1.8946939706802368, |
|
"eval_runtime": 133.6674, |
|
"eval_samples_per_second": 25.646, |
|
"eval_steps_per_second": 12.823, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.47717842323651455, |
|
"grad_norm": 0.6540839076042175, |
|
"learning_rate": 4.685185185185186e-05, |
|
"loss": 1.918, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.487551867219917, |
|
"grad_norm": 0.7142683863639832, |
|
"learning_rate": 4.592592592592593e-05, |
|
"loss": 1.9145, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.4979253112033195, |
|
"grad_norm": 0.7420536875724792, |
|
"learning_rate": 4.5e-05, |
|
"loss": 1.8697, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.508298755186722, |
|
"grad_norm": 0.6981884837150574, |
|
"learning_rate": 4.4074074074074076e-05, |
|
"loss": 1.9068, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5186721991701245, |
|
"grad_norm": 0.6794917583465576, |
|
"learning_rate": 4.3148148148148155e-05, |
|
"loss": 1.942, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5186721991701245, |
|
"eval_loss": 1.8924171924591064, |
|
"eval_runtime": 134.1209, |
|
"eval_samples_per_second": 25.559, |
|
"eval_steps_per_second": 12.78, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.529045643153527, |
|
"grad_norm": 0.6879429221153259, |
|
"learning_rate": 4.222222222222223e-05, |
|
"loss": 2.0168, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.5394190871369294, |
|
"grad_norm": 0.6709438562393188, |
|
"learning_rate": 4.12962962962963e-05, |
|
"loss": 1.9738, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.549792531120332, |
|
"grad_norm": 0.6758420467376709, |
|
"learning_rate": 4.0370370370370374e-05, |
|
"loss": 1.8662, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.5601659751037344, |
|
"grad_norm": 0.6657466888427734, |
|
"learning_rate": 3.944444444444445e-05, |
|
"loss": 1.8798, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.5705394190871369, |
|
"grad_norm": 0.6013324856758118, |
|
"learning_rate": 3.851851851851852e-05, |
|
"loss": 1.8723, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5705394190871369, |
|
"eval_loss": 1.8904341459274292, |
|
"eval_runtime": 133.769, |
|
"eval_samples_per_second": 25.626, |
|
"eval_steps_per_second": 12.813, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5809128630705395, |
|
"grad_norm": 0.6017671823501587, |
|
"learning_rate": 3.759259259259259e-05, |
|
"loss": 1.8163, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.5912863070539419, |
|
"grad_norm": 0.6171760559082031, |
|
"learning_rate": 3.6666666666666666e-05, |
|
"loss": 1.9758, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6016597510373444, |
|
"grad_norm": 0.6185418963432312, |
|
"learning_rate": 3.5740740740740745e-05, |
|
"loss": 1.9105, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.6120331950207469, |
|
"grad_norm": 0.7011654376983643, |
|
"learning_rate": 3.481481481481482e-05, |
|
"loss": 1.8835, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.6224066390041494, |
|
"grad_norm": 0.8195033669471741, |
|
"learning_rate": 3.388888888888889e-05, |
|
"loss": 1.9759, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6224066390041494, |
|
"eval_loss": 1.8884820938110352, |
|
"eval_runtime": 133.8726, |
|
"eval_samples_per_second": 25.606, |
|
"eval_steps_per_second": 12.803, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6327800829875518, |
|
"grad_norm": 0.5987865328788757, |
|
"learning_rate": 3.2962962962962964e-05, |
|
"loss": 2.0053, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.6431535269709544, |
|
"grad_norm": 0.6399624347686768, |
|
"learning_rate": 3.203703703703704e-05, |
|
"loss": 1.921, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.6535269709543569, |
|
"grad_norm": 0.7136725783348083, |
|
"learning_rate": 3.111111111111112e-05, |
|
"loss": 1.8195, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.6639004149377593, |
|
"grad_norm": 0.6902799010276794, |
|
"learning_rate": 3.018518518518519e-05, |
|
"loss": 1.8582, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.6742738589211619, |
|
"grad_norm": 0.6140012145042419, |
|
"learning_rate": 2.9259259259259262e-05, |
|
"loss": 1.9133, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.6742738589211619, |
|
"eval_loss": 1.8871186971664429, |
|
"eval_runtime": 133.4215, |
|
"eval_samples_per_second": 25.693, |
|
"eval_steps_per_second": 12.847, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.6846473029045643, |
|
"grad_norm": 0.6831647753715515, |
|
"learning_rate": 2.833333333333334e-05, |
|
"loss": 1.9191, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.6950207468879668, |
|
"grad_norm": 0.6378768682479858, |
|
"learning_rate": 2.740740740740741e-05, |
|
"loss": 1.9567, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.7053941908713693, |
|
"grad_norm": 0.5885735750198364, |
|
"learning_rate": 2.6481481481481485e-05, |
|
"loss": 1.8426, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.7157676348547718, |
|
"grad_norm": 0.6207602024078369, |
|
"learning_rate": 2.5555555555555554e-05, |
|
"loss": 1.8769, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.7261410788381742, |
|
"grad_norm": 0.6759030818939209, |
|
"learning_rate": 2.462962962962963e-05, |
|
"loss": 1.9621, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7261410788381742, |
|
"eval_loss": 1.8849581480026245, |
|
"eval_runtime": 133.2177, |
|
"eval_samples_per_second": 25.732, |
|
"eval_steps_per_second": 12.866, |
|
"step": 700 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 964, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.91193623298048e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|