qwen-SMS / checkpoint-700 / trainer_state.json
{
"best_metric": 1.8849581480026245,
"best_model_checkpoint": "/content/drive/MyDrive/Hugh Mann/Qwen_SMS_Final/checkpoint-700",
"epoch": 0.7261410788381742,
"eval_steps": 50,
"global_step": 700,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01037344398340249,
"grad_norm": 4.638428688049316,
"learning_rate": 8.000000000000001e-06,
"loss": 4.6475,
"step": 10
},
{
"epoch": 0.02074688796680498,
"grad_norm": 4.252462387084961,
"learning_rate": 1.6000000000000003e-05,
"loss": 4.5383,
"step": 20
},
{
"epoch": 0.03112033195020747,
"grad_norm": 3.85965895652771,
"learning_rate": 2.4e-05,
"loss": 4.3536,
"step": 30
},
{
"epoch": 0.04149377593360996,
"grad_norm": 4.050565719604492,
"learning_rate": 3.2000000000000005e-05,
"loss": 3.9934,
"step": 40
},
{
"epoch": 0.05186721991701245,
"grad_norm": 3.7960705757141113,
"learning_rate": 3.8400000000000005e-05,
"loss": 3.3933,
"step": 50
},
{
"epoch": 0.05186721991701245,
"eval_loss": 3.1315643787384033,
"eval_runtime": 132.8003,
"eval_samples_per_second": 25.813,
"eval_steps_per_second": 12.907,
"step": 50
},
{
"epoch": 0.06224066390041494,
"grad_norm": 2.2538864612579346,
"learning_rate": 4.64e-05,
"loss": 2.8554,
"step": 60
},
{
"epoch": 0.07261410788381743,
"grad_norm": 0.8144139051437378,
"learning_rate": 5.440000000000001e-05,
"loss": 2.6473,
"step": 70
},
{
"epoch": 0.08298755186721991,
"grad_norm": 1.0595426559448242,
"learning_rate": 6.240000000000001e-05,
"loss": 2.5251,
"step": 80
},
{
"epoch": 0.09336099585062241,
"grad_norm": 1.4303592443466187,
"learning_rate": 7.04e-05,
"loss": 2.2689,
"step": 90
},
{
"epoch": 0.1037344398340249,
"grad_norm": 1.6767040491104126,
"learning_rate": 7.840000000000001e-05,
"loss": 2.1918,
"step": 100
},
{
"epoch": 0.1037344398340249,
"eval_loss": 2.1281208992004395,
"eval_runtime": 133.7825,
"eval_samples_per_second": 25.624,
"eval_steps_per_second": 12.812,
"step": 100
},
{
"epoch": 0.11410788381742738,
"grad_norm": 1.005283236503601,
"learning_rate": 7.925925925925926e-05,
"loss": 2.0838,
"step": 110
},
{
"epoch": 0.12448132780082988,
"grad_norm": 0.6838532090187073,
"learning_rate": 7.833333333333333e-05,
"loss": 1.978,
"step": 120
},
{
"epoch": 0.13485477178423236,
"grad_norm": 0.7325747609138489,
"learning_rate": 7.740740740740741e-05,
"loss": 2.0304,
"step": 130
},
{
"epoch": 0.14522821576763487,
"grad_norm": 0.7506985664367676,
"learning_rate": 7.648148148148149e-05,
"loss": 1.9942,
"step": 140
},
{
"epoch": 0.15560165975103735,
"grad_norm": 0.8646144270896912,
"learning_rate": 7.555555555555556e-05,
"loss": 1.9951,
"step": 150
},
{
"epoch": 0.15560165975103735,
"eval_loss": 1.9528735876083374,
"eval_runtime": 133.6246,
"eval_samples_per_second": 25.654,
"eval_steps_per_second": 12.827,
"step": 150
},
{
"epoch": 0.16597510373443983,
"grad_norm": 0.6079633831977844,
"learning_rate": 7.462962962962964e-05,
"loss": 1.9617,
"step": 160
},
{
"epoch": 0.17634854771784234,
"grad_norm": 0.5766311883926392,
"learning_rate": 7.37037037037037e-05,
"loss": 1.9178,
"step": 170
},
{
"epoch": 0.18672199170124482,
"grad_norm": 0.6486707329750061,
"learning_rate": 7.277777777777778e-05,
"loss": 1.892,
"step": 180
},
{
"epoch": 0.1970954356846473,
"grad_norm": 0.7130193114280701,
"learning_rate": 7.185185185185186e-05,
"loss": 1.9972,
"step": 190
},
{
"epoch": 0.2074688796680498,
"grad_norm": 0.6239674687385559,
"learning_rate": 7.092592592592593e-05,
"loss": 1.9559,
"step": 200
},
{
"epoch": 0.2074688796680498,
"eval_loss": 1.9308879375457764,
"eval_runtime": 133.959,
"eval_samples_per_second": 25.59,
"eval_steps_per_second": 12.795,
"step": 200
},
{
"epoch": 0.21784232365145229,
"grad_norm": 0.7013466954231262,
"learning_rate": 7.000000000000001e-05,
"loss": 2.056,
"step": 210
},
{
"epoch": 0.22821576763485477,
"grad_norm": 0.7093988656997681,
"learning_rate": 6.907407407407407e-05,
"loss": 1.881,
"step": 220
},
{
"epoch": 0.23858921161825727,
"grad_norm": 0.6386205554008484,
"learning_rate": 6.814814814814815e-05,
"loss": 2.01,
"step": 230
},
{
"epoch": 0.24896265560165975,
"grad_norm": 0.5995863080024719,
"learning_rate": 6.722222222222223e-05,
"loss": 1.9305,
"step": 240
},
{
"epoch": 0.25933609958506226,
"grad_norm": 0.640533447265625,
"learning_rate": 6.62962962962963e-05,
"loss": 2.0669,
"step": 250
},
{
"epoch": 0.25933609958506226,
"eval_loss": 1.9190937280654907,
"eval_runtime": 133.9066,
"eval_samples_per_second": 25.6,
"eval_steps_per_second": 12.8,
"step": 250
},
{
"epoch": 0.2697095435684647,
"grad_norm": 0.5778368711471558,
"learning_rate": 6.537037037037038e-05,
"loss": 1.8447,
"step": 260
},
{
"epoch": 0.2800829875518672,
"grad_norm": 0.7321183681488037,
"learning_rate": 6.444444444444446e-05,
"loss": 1.9436,
"step": 270
},
{
"epoch": 0.29045643153526973,
"grad_norm": 0.7635217308998108,
"learning_rate": 6.351851851851852e-05,
"loss": 1.9401,
"step": 280
},
{
"epoch": 0.3008298755186722,
"grad_norm": 0.7025775909423828,
"learning_rate": 6.25925925925926e-05,
"loss": 1.9252,
"step": 290
},
{
"epoch": 0.3112033195020747,
"grad_norm": 0.7111702561378479,
"learning_rate": 6.166666666666667e-05,
"loss": 1.8944,
"step": 300
},
{
"epoch": 0.3112033195020747,
"eval_loss": 1.9082934856414795,
"eval_runtime": 133.3981,
"eval_samples_per_second": 25.698,
"eval_steps_per_second": 12.849,
"step": 300
},
{
"epoch": 0.3215767634854772,
"grad_norm": 0.6737669110298157,
"learning_rate": 6.074074074074075e-05,
"loss": 1.9494,
"step": 310
},
{
"epoch": 0.33195020746887965,
"grad_norm": 0.6313813924789429,
"learning_rate": 5.981481481481482e-05,
"loss": 2.0403,
"step": 320
},
{
"epoch": 0.34232365145228216,
"grad_norm": 0.6727941632270813,
"learning_rate": 5.8888888888888896e-05,
"loss": 1.8966,
"step": 330
},
{
"epoch": 0.35269709543568467,
"grad_norm": 0.72395259141922,
"learning_rate": 5.796296296296297e-05,
"loss": 2.1252,
"step": 340
},
{
"epoch": 0.3630705394190871,
"grad_norm": 0.5979896783828735,
"learning_rate": 5.7037037037037035e-05,
"loss": 1.9482,
"step": 350
},
{
"epoch": 0.3630705394190871,
"eval_loss": 1.9038680791854858,
"eval_runtime": 134.2066,
"eval_samples_per_second": 25.543,
"eval_steps_per_second": 12.771,
"step": 350
},
{
"epoch": 0.37344398340248963,
"grad_norm": 0.688392698764801,
"learning_rate": 5.6111111111111114e-05,
"loss": 1.9134,
"step": 360
},
{
"epoch": 0.38381742738589214,
"grad_norm": 0.6470796465873718,
"learning_rate": 5.518518518518519e-05,
"loss": 1.8787,
"step": 370
},
{
"epoch": 0.3941908713692946,
"grad_norm": 0.6241974830627441,
"learning_rate": 5.425925925925926e-05,
"loss": 1.9488,
"step": 380
},
{
"epoch": 0.4045643153526971,
"grad_norm": 0.6315338015556335,
"learning_rate": 5.333333333333333e-05,
"loss": 1.913,
"step": 390
},
{
"epoch": 0.4149377593360996,
"grad_norm": 0.6824229955673218,
"learning_rate": 5.2407407407407406e-05,
"loss": 1.9351,
"step": 400
},
{
"epoch": 0.4149377593360996,
"eval_loss": 1.898223876953125,
"eval_runtime": 133.7224,
"eval_samples_per_second": 25.635,
"eval_steps_per_second": 12.818,
"step": 400
},
{
"epoch": 0.42531120331950206,
"grad_norm": 0.7064498066902161,
"learning_rate": 5.1481481481481486e-05,
"loss": 2.0337,
"step": 410
},
{
"epoch": 0.43568464730290457,
"grad_norm": 0.5973237752914429,
"learning_rate": 5.055555555555556e-05,
"loss": 2.2631,
"step": 420
},
{
"epoch": 0.4460580912863071,
"grad_norm": 0.5477844476699829,
"learning_rate": 4.962962962962963e-05,
"loss": 1.9058,
"step": 430
},
{
"epoch": 0.45643153526970953,
"grad_norm": 0.772850513458252,
"learning_rate": 4.8703703703703704e-05,
"loss": 1.8676,
"step": 440
},
{
"epoch": 0.46680497925311204,
"grad_norm": 0.6943506598472595,
"learning_rate": 4.777777777777778e-05,
"loss": 1.9578,
"step": 450
},
{
"epoch": 0.46680497925311204,
"eval_loss": 1.8946939706802368,
"eval_runtime": 133.6674,
"eval_samples_per_second": 25.646,
"eval_steps_per_second": 12.823,
"step": 450
},
{
"epoch": 0.47717842323651455,
"grad_norm": 0.6540839076042175,
"learning_rate": 4.685185185185186e-05,
"loss": 1.918,
"step": 460
},
{
"epoch": 0.487551867219917,
"grad_norm": 0.7142683863639832,
"learning_rate": 4.592592592592593e-05,
"loss": 1.9145,
"step": 470
},
{
"epoch": 0.4979253112033195,
"grad_norm": 0.7420536875724792,
"learning_rate": 4.5e-05,
"loss": 1.8697,
"step": 480
},
{
"epoch": 0.508298755186722,
"grad_norm": 0.6981884837150574,
"learning_rate": 4.4074074074074076e-05,
"loss": 1.9068,
"step": 490
},
{
"epoch": 0.5186721991701245,
"grad_norm": 0.6794917583465576,
"learning_rate": 4.3148148148148155e-05,
"loss": 1.942,
"step": 500
},
{
"epoch": 0.5186721991701245,
"eval_loss": 1.8924171924591064,
"eval_runtime": 134.1209,
"eval_samples_per_second": 25.559,
"eval_steps_per_second": 12.78,
"step": 500
},
{
"epoch": 0.529045643153527,
"grad_norm": 0.6879429221153259,
"learning_rate": 4.222222222222223e-05,
"loss": 2.0168,
"step": 510
},
{
"epoch": 0.5394190871369294,
"grad_norm": 0.6709438562393188,
"learning_rate": 4.12962962962963e-05,
"loss": 1.9738,
"step": 520
},
{
"epoch": 0.549792531120332,
"grad_norm": 0.6758420467376709,
"learning_rate": 4.0370370370370374e-05,
"loss": 1.8662,
"step": 530
},
{
"epoch": 0.5601659751037344,
"grad_norm": 0.6657466888427734,
"learning_rate": 3.944444444444445e-05,
"loss": 1.8798,
"step": 540
},
{
"epoch": 0.5705394190871369,
"grad_norm": 0.6013324856758118,
"learning_rate": 3.851851851851852e-05,
"loss": 1.8723,
"step": 550
},
{
"epoch": 0.5705394190871369,
"eval_loss": 1.8904341459274292,
"eval_runtime": 133.769,
"eval_samples_per_second": 25.626,
"eval_steps_per_second": 12.813,
"step": 550
},
{
"epoch": 0.5809128630705395,
"grad_norm": 0.6017671823501587,
"learning_rate": 3.759259259259259e-05,
"loss": 1.8163,
"step": 560
},
{
"epoch": 0.5912863070539419,
"grad_norm": 0.6171760559082031,
"learning_rate": 3.6666666666666666e-05,
"loss": 1.9758,
"step": 570
},
{
"epoch": 0.6016597510373444,
"grad_norm": 0.6185418963432312,
"learning_rate": 3.5740740740740745e-05,
"loss": 1.9105,
"step": 580
},
{
"epoch": 0.6120331950207469,
"grad_norm": 0.7011654376983643,
"learning_rate": 3.481481481481482e-05,
"loss": 1.8835,
"step": 590
},
{
"epoch": 0.6224066390041494,
"grad_norm": 0.8195033669471741,
"learning_rate": 3.388888888888889e-05,
"loss": 1.9759,
"step": 600
},
{
"epoch": 0.6224066390041494,
"eval_loss": 1.8884820938110352,
"eval_runtime": 133.8726,
"eval_samples_per_second": 25.606,
"eval_steps_per_second": 12.803,
"step": 600
},
{
"epoch": 0.6327800829875518,
"grad_norm": 0.5987865328788757,
"learning_rate": 3.2962962962962964e-05,
"loss": 2.0053,
"step": 610
},
{
"epoch": 0.6431535269709544,
"grad_norm": 0.6399624347686768,
"learning_rate": 3.203703703703704e-05,
"loss": 1.921,
"step": 620
},
{
"epoch": 0.6535269709543569,
"grad_norm": 0.7136725783348083,
"learning_rate": 3.111111111111112e-05,
"loss": 1.8195,
"step": 630
},
{
"epoch": 0.6639004149377593,
"grad_norm": 0.6902799010276794,
"learning_rate": 3.018518518518519e-05,
"loss": 1.8582,
"step": 640
},
{
"epoch": 0.6742738589211619,
"grad_norm": 0.6140012145042419,
"learning_rate": 2.9259259259259262e-05,
"loss": 1.9133,
"step": 650
},
{
"epoch": 0.6742738589211619,
"eval_loss": 1.8871186971664429,
"eval_runtime": 133.4215,
"eval_samples_per_second": 25.693,
"eval_steps_per_second": 12.847,
"step": 650
},
{
"epoch": 0.6846473029045643,
"grad_norm": 0.6831647753715515,
"learning_rate": 2.833333333333334e-05,
"loss": 1.9191,
"step": 660
},
{
"epoch": 0.6950207468879668,
"grad_norm": 0.6378768682479858,
"learning_rate": 2.740740740740741e-05,
"loss": 1.9567,
"step": 670
},
{
"epoch": 0.7053941908713693,
"grad_norm": 0.5885735750198364,
"learning_rate": 2.6481481481481485e-05,
"loss": 1.8426,
"step": 680
},
{
"epoch": 0.7157676348547718,
"grad_norm": 0.6207602024078369,
"learning_rate": 2.5555555555555554e-05,
"loss": 1.8769,
"step": 690
},
{
"epoch": 0.7261410788381742,
"grad_norm": 0.6759030818939209,
"learning_rate": 2.462962962962963e-05,
"loss": 1.9621,
"step": 700
},
{
"epoch": 0.7261410788381742,
"eval_loss": 1.8849581480026245,
"eval_runtime": 133.2177,
"eval_samples_per_second": 25.732,
"eval_steps_per_second": 12.866,
"step": 700
}
],
"logging_steps": 10,
"max_steps": 964,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.91193623298048e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
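
A minimal sketch (not part of the checkpoint itself) of how this trainer_state.json might be inspected: it loads the state above with the Python standard library and prints the best metric plus the eval_loss recorded at each eval step (every 50 steps here). The local file path is an assumption for illustration.

import json

# Hypothetical local path to the file shown above.
with open("checkpoint-700/trainer_state.json") as f:
    state = json.load(f)

# best_metric / best_model_checkpoint come straight from the JSON keys above.
print(f"best eval_loss {state['best_metric']:.4f} at {state['best_model_checkpoint']}")

# log_history mixes training entries (with "loss") and evaluation entries
# (with "eval_loss"); keep only the latter to trace the eval curve.
for entry in state["log_history"]:
    if "eval_loss" in entry:
        print(f"step {entry['step']:>4}  eval_loss {entry['eval_loss']:.4f}")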