{ "best_metric": 1.8849581480026245, "best_model_checkpoint": "/content/drive/MyDrive/Hugh Mann/Qwen_SMS_Final/checkpoint-700", "epoch": 0.7261410788381742, "eval_steps": 50, "global_step": 700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01037344398340249, "grad_norm": 4.638428688049316, "learning_rate": 8.000000000000001e-06, "loss": 4.6475, "step": 10 }, { "epoch": 0.02074688796680498, "grad_norm": 4.252462387084961, "learning_rate": 1.6000000000000003e-05, "loss": 4.5383, "step": 20 }, { "epoch": 0.03112033195020747, "grad_norm": 3.85965895652771, "learning_rate": 2.4e-05, "loss": 4.3536, "step": 30 }, { "epoch": 0.04149377593360996, "grad_norm": 4.050565719604492, "learning_rate": 3.2000000000000005e-05, "loss": 3.9934, "step": 40 }, { "epoch": 0.05186721991701245, "grad_norm": 3.7960705757141113, "learning_rate": 3.8400000000000005e-05, "loss": 3.3933, "step": 50 }, { "epoch": 0.05186721991701245, "eval_loss": 3.1315643787384033, "eval_runtime": 132.8003, "eval_samples_per_second": 25.813, "eval_steps_per_second": 12.907, "step": 50 }, { "epoch": 0.06224066390041494, "grad_norm": 2.2538864612579346, "learning_rate": 4.64e-05, "loss": 2.8554, "step": 60 }, { "epoch": 0.07261410788381743, "grad_norm": 0.8144139051437378, "learning_rate": 5.440000000000001e-05, "loss": 2.6473, "step": 70 }, { "epoch": 0.08298755186721991, "grad_norm": 1.0595426559448242, "learning_rate": 6.240000000000001e-05, "loss": 2.5251, "step": 80 }, { "epoch": 0.09336099585062241, "grad_norm": 1.4303592443466187, "learning_rate": 7.04e-05, "loss": 2.2689, "step": 90 }, { "epoch": 0.1037344398340249, "grad_norm": 1.6767040491104126, "learning_rate": 7.840000000000001e-05, "loss": 2.1918, "step": 100 }, { "epoch": 0.1037344398340249, "eval_loss": 2.1281208992004395, "eval_runtime": 133.7825, "eval_samples_per_second": 25.624, "eval_steps_per_second": 12.812, "step": 100 }, { "epoch": 0.11410788381742738, "grad_norm": 1.005283236503601, "learning_rate": 7.925925925925926e-05, "loss": 2.0838, "step": 110 }, { "epoch": 0.12448132780082988, "grad_norm": 0.6838532090187073, "learning_rate": 7.833333333333333e-05, "loss": 1.978, "step": 120 }, { "epoch": 0.13485477178423236, "grad_norm": 0.7325747609138489, "learning_rate": 7.740740740740741e-05, "loss": 2.0304, "step": 130 }, { "epoch": 0.14522821576763487, "grad_norm": 0.7506985664367676, "learning_rate": 7.648148148148149e-05, "loss": 1.9942, "step": 140 }, { "epoch": 0.15560165975103735, "grad_norm": 0.8646144270896912, "learning_rate": 7.555555555555556e-05, "loss": 1.9951, "step": 150 }, { "epoch": 0.15560165975103735, "eval_loss": 1.9528735876083374, "eval_runtime": 133.6246, "eval_samples_per_second": 25.654, "eval_steps_per_second": 12.827, "step": 150 }, { "epoch": 0.16597510373443983, "grad_norm": 0.6079633831977844, "learning_rate": 7.462962962962964e-05, "loss": 1.9617, "step": 160 }, { "epoch": 0.17634854771784234, "grad_norm": 0.5766311883926392, "learning_rate": 7.37037037037037e-05, "loss": 1.9178, "step": 170 }, { "epoch": 0.18672199170124482, "grad_norm": 0.6486707329750061, "learning_rate": 7.277777777777778e-05, "loss": 1.892, "step": 180 }, { "epoch": 0.1970954356846473, "grad_norm": 0.7130193114280701, "learning_rate": 7.185185185185186e-05, "loss": 1.9972, "step": 190 }, { "epoch": 0.2074688796680498, "grad_norm": 0.6239674687385559, "learning_rate": 7.092592592592593e-05, "loss": 1.9559, "step": 200 }, { "epoch": 0.2074688796680498, "eval_loss": 1.9308879375457764, 
"eval_runtime": 133.959, "eval_samples_per_second": 25.59, "eval_steps_per_second": 12.795, "step": 200 }, { "epoch": 0.21784232365145229, "grad_norm": 0.7013466954231262, "learning_rate": 7.000000000000001e-05, "loss": 2.056, "step": 210 }, { "epoch": 0.22821576763485477, "grad_norm": 0.7093988656997681, "learning_rate": 6.907407407407407e-05, "loss": 1.881, "step": 220 }, { "epoch": 0.23858921161825727, "grad_norm": 0.6386205554008484, "learning_rate": 6.814814814814815e-05, "loss": 2.01, "step": 230 }, { "epoch": 0.24896265560165975, "grad_norm": 0.5995863080024719, "learning_rate": 6.722222222222223e-05, "loss": 1.9305, "step": 240 }, { "epoch": 0.25933609958506226, "grad_norm": 0.640533447265625, "learning_rate": 6.62962962962963e-05, "loss": 2.0669, "step": 250 }, { "epoch": 0.25933609958506226, "eval_loss": 1.9190937280654907, "eval_runtime": 133.9066, "eval_samples_per_second": 25.6, "eval_steps_per_second": 12.8, "step": 250 }, { "epoch": 0.2697095435684647, "grad_norm": 0.5778368711471558, "learning_rate": 6.537037037037038e-05, "loss": 1.8447, "step": 260 }, { "epoch": 0.2800829875518672, "grad_norm": 0.7321183681488037, "learning_rate": 6.444444444444446e-05, "loss": 1.9436, "step": 270 }, { "epoch": 0.29045643153526973, "grad_norm": 0.7635217308998108, "learning_rate": 6.351851851851852e-05, "loss": 1.9401, "step": 280 }, { "epoch": 0.3008298755186722, "grad_norm": 0.7025775909423828, "learning_rate": 6.25925925925926e-05, "loss": 1.9252, "step": 290 }, { "epoch": 0.3112033195020747, "grad_norm": 0.7111702561378479, "learning_rate": 6.166666666666667e-05, "loss": 1.8944, "step": 300 }, { "epoch": 0.3112033195020747, "eval_loss": 1.9082934856414795, "eval_runtime": 133.3981, "eval_samples_per_second": 25.698, "eval_steps_per_second": 12.849, "step": 300 }, { "epoch": 0.3215767634854772, "grad_norm": 0.6737669110298157, "learning_rate": 6.074074074074075e-05, "loss": 1.9494, "step": 310 }, { "epoch": 0.33195020746887965, "grad_norm": 0.6313813924789429, "learning_rate": 5.981481481481482e-05, "loss": 2.0403, "step": 320 }, { "epoch": 0.34232365145228216, "grad_norm": 0.6727941632270813, "learning_rate": 5.8888888888888896e-05, "loss": 1.8966, "step": 330 }, { "epoch": 0.35269709543568467, "grad_norm": 0.72395259141922, "learning_rate": 5.796296296296297e-05, "loss": 2.1252, "step": 340 }, { "epoch": 0.3630705394190871, "grad_norm": 0.5979896783828735, "learning_rate": 5.7037037037037035e-05, "loss": 1.9482, "step": 350 }, { "epoch": 0.3630705394190871, "eval_loss": 1.9038680791854858, "eval_runtime": 134.2066, "eval_samples_per_second": 25.543, "eval_steps_per_second": 12.771, "step": 350 }, { "epoch": 0.37344398340248963, "grad_norm": 0.688392698764801, "learning_rate": 5.6111111111111114e-05, "loss": 1.9134, "step": 360 }, { "epoch": 0.38381742738589214, "grad_norm": 0.6470796465873718, "learning_rate": 5.518518518518519e-05, "loss": 1.8787, "step": 370 }, { "epoch": 0.3941908713692946, "grad_norm": 0.6241974830627441, "learning_rate": 5.425925925925926e-05, "loss": 1.9488, "step": 380 }, { "epoch": 0.4045643153526971, "grad_norm": 0.6315338015556335, "learning_rate": 5.333333333333333e-05, "loss": 1.913, "step": 390 }, { "epoch": 0.4149377593360996, "grad_norm": 0.6824229955673218, "learning_rate": 5.2407407407407406e-05, "loss": 1.9351, "step": 400 }, { "epoch": 0.4149377593360996, "eval_loss": 1.898223876953125, "eval_runtime": 133.7224, "eval_samples_per_second": 25.635, "eval_steps_per_second": 12.818, "step": 400 }, { "epoch": 0.42531120331950206, "grad_norm": 
0.7064498066902161, "learning_rate": 5.1481481481481486e-05, "loss": 2.0337, "step": 410 }, { "epoch": 0.43568464730290457, "grad_norm": 0.5973237752914429, "learning_rate": 5.055555555555556e-05, "loss": 2.2631, "step": 420 }, { "epoch": 0.4460580912863071, "grad_norm": 0.5477844476699829, "learning_rate": 4.962962962962963e-05, "loss": 1.9058, "step": 430 }, { "epoch": 0.45643153526970953, "grad_norm": 0.772850513458252, "learning_rate": 4.8703703703703704e-05, "loss": 1.8676, "step": 440 }, { "epoch": 0.46680497925311204, "grad_norm": 0.6943506598472595, "learning_rate": 4.777777777777778e-05, "loss": 1.9578, "step": 450 }, { "epoch": 0.46680497925311204, "eval_loss": 1.8946939706802368, "eval_runtime": 133.6674, "eval_samples_per_second": 25.646, "eval_steps_per_second": 12.823, "step": 450 }, { "epoch": 0.47717842323651455, "grad_norm": 0.6540839076042175, "learning_rate": 4.685185185185186e-05, "loss": 1.918, "step": 460 }, { "epoch": 0.487551867219917, "grad_norm": 0.7142683863639832, "learning_rate": 4.592592592592593e-05, "loss": 1.9145, "step": 470 }, { "epoch": 0.4979253112033195, "grad_norm": 0.7420536875724792, "learning_rate": 4.5e-05, "loss": 1.8697, "step": 480 }, { "epoch": 0.508298755186722, "grad_norm": 0.6981884837150574, "learning_rate": 4.4074074074074076e-05, "loss": 1.9068, "step": 490 }, { "epoch": 0.5186721991701245, "grad_norm": 0.6794917583465576, "learning_rate": 4.3148148148148155e-05, "loss": 1.942, "step": 500 }, { "epoch": 0.5186721991701245, "eval_loss": 1.8924171924591064, "eval_runtime": 134.1209, "eval_samples_per_second": 25.559, "eval_steps_per_second": 12.78, "step": 500 }, { "epoch": 0.529045643153527, "grad_norm": 0.6879429221153259, "learning_rate": 4.222222222222223e-05, "loss": 2.0168, "step": 510 }, { "epoch": 0.5394190871369294, "grad_norm": 0.6709438562393188, "learning_rate": 4.12962962962963e-05, "loss": 1.9738, "step": 520 }, { "epoch": 0.549792531120332, "grad_norm": 0.6758420467376709, "learning_rate": 4.0370370370370374e-05, "loss": 1.8662, "step": 530 }, { "epoch": 0.5601659751037344, "grad_norm": 0.6657466888427734, "learning_rate": 3.944444444444445e-05, "loss": 1.8798, "step": 540 }, { "epoch": 0.5705394190871369, "grad_norm": 0.6013324856758118, "learning_rate": 3.851851851851852e-05, "loss": 1.8723, "step": 550 }, { "epoch": 0.5705394190871369, "eval_loss": 1.8904341459274292, "eval_runtime": 133.769, "eval_samples_per_second": 25.626, "eval_steps_per_second": 12.813, "step": 550 }, { "epoch": 0.5809128630705395, "grad_norm": 0.6017671823501587, "learning_rate": 3.759259259259259e-05, "loss": 1.8163, "step": 560 }, { "epoch": 0.5912863070539419, "grad_norm": 0.6171760559082031, "learning_rate": 3.6666666666666666e-05, "loss": 1.9758, "step": 570 }, { "epoch": 0.6016597510373444, "grad_norm": 0.6185418963432312, "learning_rate": 3.5740740740740745e-05, "loss": 1.9105, "step": 580 }, { "epoch": 0.6120331950207469, "grad_norm": 0.7011654376983643, "learning_rate": 3.481481481481482e-05, "loss": 1.8835, "step": 590 }, { "epoch": 0.6224066390041494, "grad_norm": 0.8195033669471741, "learning_rate": 3.388888888888889e-05, "loss": 1.9759, "step": 600 }, { "epoch": 0.6224066390041494, "eval_loss": 1.8884820938110352, "eval_runtime": 133.8726, "eval_samples_per_second": 25.606, "eval_steps_per_second": 12.803, "step": 600 }, { "epoch": 0.6327800829875518, "grad_norm": 0.5987865328788757, "learning_rate": 3.2962962962962964e-05, "loss": 2.0053, "step": 610 }, { "epoch": 0.6431535269709544, "grad_norm": 0.6399624347686768, "learning_rate": 
3.203703703703704e-05, "loss": 1.921, "step": 620 }, { "epoch": 0.6535269709543569, "grad_norm": 0.7136725783348083, "learning_rate": 3.111111111111112e-05, "loss": 1.8195, "step": 630 }, { "epoch": 0.6639004149377593, "grad_norm": 0.6902799010276794, "learning_rate": 3.018518518518519e-05, "loss": 1.8582, "step": 640 }, { "epoch": 0.6742738589211619, "grad_norm": 0.6140012145042419, "learning_rate": 2.9259259259259262e-05, "loss": 1.9133, "step": 650 }, { "epoch": 0.6742738589211619, "eval_loss": 1.8871186971664429, "eval_runtime": 133.4215, "eval_samples_per_second": 25.693, "eval_steps_per_second": 12.847, "step": 650 }, { "epoch": 0.6846473029045643, "grad_norm": 0.6831647753715515, "learning_rate": 2.833333333333334e-05, "loss": 1.9191, "step": 660 }, { "epoch": 0.6950207468879668, "grad_norm": 0.6378768682479858, "learning_rate": 2.740740740740741e-05, "loss": 1.9567, "step": 670 }, { "epoch": 0.7053941908713693, "grad_norm": 0.5885735750198364, "learning_rate": 2.6481481481481485e-05, "loss": 1.8426, "step": 680 }, { "epoch": 0.7157676348547718, "grad_norm": 0.6207602024078369, "learning_rate": 2.5555555555555554e-05, "loss": 1.8769, "step": 690 }, { "epoch": 0.7261410788381742, "grad_norm": 0.6759030818939209, "learning_rate": 2.462962962962963e-05, "loss": 1.9621, "step": 700 }, { "epoch": 0.7261410788381742, "eval_loss": 1.8849581480026245, "eval_runtime": 133.2177, "eval_samples_per_second": 25.732, "eval_steps_per_second": 12.866, "step": 700 } ], "logging_steps": 10, "max_steps": 964, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.91193623298048e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }