selm_ours_1_iter_2 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984,
"eval_steps": 500,
"global_step": 156,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"eta": 0.0010000000474974513,
"grad_norm": 18.85951805804989,
"learning_rate": 3.125e-08,
"logits/chosen": -2.2437264919281006,
"logits/rejected": -2.1319897174835205,
"logps/chosen": -136.11781311035156,
"logps/pi_response": -276.34149169921875,
"logps/ref_response": -276.34149169921875,
"logps/rejected": -134.32876586914062,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.06,
"eta": 0.0010000000474974513,
"grad_norm": 15.166671167458636,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -2.3832309246063232,
"logits/rejected": -2.3636457920074463,
"logps/chosen": -155.76785278320312,
"logps/pi_response": -274.42333984375,
"logps/ref_response": -272.425048828125,
"logps/rejected": -158.64793395996094,
"loss": 0.6926,
"rewards/accuracies": 0.4270833432674408,
"rewards/chosen": -0.00683738524094224,
"rewards/margins": -0.00022508477559313178,
"rewards/rejected": -0.006612300407141447,
"step": 10
},
{
"epoch": 0.13,
"eta": 0.0010000000474974513,
"grad_norm": 15.006144425101914,
"learning_rate": 4.989935734988097e-07,
"logits/chosen": -2.3142848014831543,
"logits/rejected": -2.337123394012451,
"logps/chosen": -169.91624450683594,
"logps/pi_response": -305.30267333984375,
"logps/ref_response": -275.4255065917969,
"logps/rejected": -177.8936767578125,
"loss": 0.692,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.154428631067276,
"rewards/margins": 0.013679690659046173,
"rewards/rejected": -0.16810832917690277,
"step": 20
},
{
"epoch": 0.19,
"eta": 0.0010000000474974513,
"grad_norm": 25.3042204309977,
"learning_rate": 4.877641290737883e-07,
"logits/chosen": -2.33975887298584,
"logits/rejected": -2.3151369094848633,
"logps/chosen": -189.64102172851562,
"logps/pi_response": -321.45294189453125,
"logps/ref_response": -261.0726013183594,
"logps/rejected": -194.24017333984375,
"loss": 0.6936,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": -0.33687421679496765,
"rewards/margins": 0.005444393027573824,
"rewards/rejected": -0.3423186242580414,
"step": 30
},
{
"epoch": 0.26,
"eta": 0.0010000000474974513,
"grad_norm": 14.197120755569808,
"learning_rate": 4.646121984004665e-07,
"logits/chosen": -2.4677834510803223,
"logits/rejected": -2.4844822883605957,
"logps/chosen": -176.27413940429688,
"logps/pi_response": -294.74114990234375,
"logps/ref_response": -256.48724365234375,
"logps/rejected": -179.17926025390625,
"loss": 0.6905,
"rewards/accuracies": 0.578125,
"rewards/chosen": -0.2434801161289215,
"rewards/margins": -0.00023287050134968013,
"rewards/rejected": -0.24324722588062286,
"step": 40
},
{
"epoch": 0.32,
"eta": 0.0010000000474974513,
"grad_norm": 15.243074464797877,
"learning_rate": 4.3069871595684787e-07,
"logits/chosen": -2.445664882659912,
"logits/rejected": -2.4546258449554443,
"logps/chosen": -189.74288940429688,
"logps/pi_response": -307.9224548339844,
"logps/ref_response": -267.48931884765625,
"logps/rejected": -200.6833038330078,
"loss": 0.6929,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.2953701615333557,
"rewards/margins": 0.01801210641860962,
"rewards/rejected": -0.3133822977542877,
"step": 50
},
{
"epoch": 0.38,
"eta": 0.0010000000474974513,
"grad_norm": 16.31963304577925,
"learning_rate": 3.877242453630256e-07,
"logits/chosen": -2.477487087249756,
"logits/rejected": -2.4785385131835938,
"logps/chosen": -185.6737518310547,
"logps/pi_response": -291.3546447753906,
"logps/ref_response": -254.33984375,
"logps/rejected": -188.54415893554688,
"loss": 0.6887,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.26406130194664,
"rewards/margins": 0.008015439845621586,
"rewards/rejected": -0.2720767557621002,
"step": 60
},
{
"epoch": 0.45,
"eta": 0.0010000000474974513,
"grad_norm": 16.071293511713314,
"learning_rate": 3.378437060203357e-07,
"logits/chosen": -2.359812021255493,
"logits/rejected": -2.355583429336548,
"logps/chosen": -209.6289520263672,
"logps/pi_response": -323.3484802246094,
"logps/ref_response": -260.3892517089844,
"logps/rejected": -210.88949584960938,
"loss": 0.6913,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.5319920778274536,
"rewards/margins": 0.012827059254050255,
"rewards/rejected": -0.5448191165924072,
"step": 70
},
{
"epoch": 0.51,
"eta": 0.0010000000474974513,
"grad_norm": 16.843259909148866,
"learning_rate": 2.8355831645441387e-07,
"logits/chosen": -2.4202404022216797,
"logits/rejected": -2.362644910812378,
"logps/chosen": -196.7368927001953,
"logps/pi_response": -324.1815490722656,
"logps/ref_response": -271.9532470703125,
"logps/rejected": -200.54006958007812,
"loss": 0.6866,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.402200847864151,
"rewards/margins": 0.04481234401464462,
"rewards/rejected": -0.4470131993293762,
"step": 80
},
{
"epoch": 0.58,
"eta": 0.0010000000474974513,
"grad_norm": 20.3973840448787,
"learning_rate": 2.2759017277414164e-07,
"logits/chosen": -2.416743278503418,
"logits/rejected": -2.436403751373291,
"logps/chosen": -202.49813842773438,
"logps/pi_response": -311.49896240234375,
"logps/ref_response": -254.697509765625,
"logps/rejected": -198.43460083007812,
"loss": 0.6881,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.3861594796180725,
"rewards/margins": 0.029975295066833496,
"rewards/rejected": -0.4161347448825836,
"step": 90
},
{
"epoch": 0.64,
"eta": 0.0010000000474974513,
"grad_norm": 19.031326931010696,
"learning_rate": 1.7274575140626315e-07,
"logits/chosen": -2.3818562030792236,
"logits/rejected": -2.393977403640747,
"logps/chosen": -200.36917114257812,
"logps/pi_response": -338.3829040527344,
"logps/ref_response": -261.9479064941406,
"logps/rejected": -211.229736328125,
"loss": 0.6832,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.5124812722206116,
"rewards/margins": 0.037171028554439545,
"rewards/rejected": -0.5496522188186646,
"step": 100
},
{
"epoch": 0.7,
"eta": 0.0010000000474974513,
"grad_norm": 15.411774484824894,
"learning_rate": 1.2177518064852348e-07,
"logits/chosen": -2.4000236988067627,
"logits/rejected": -2.2974681854248047,
"logps/chosen": -204.61512756347656,
"logps/pi_response": -330.70831298828125,
"logps/ref_response": -250.0836639404297,
"logps/rejected": -208.4372100830078,
"loss": 0.6858,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.5626230239868164,
"rewards/margins": 0.039384625852108,
"rewards/rejected": -0.6020076274871826,
"step": 110
},
{
"epoch": 0.77,
"eta": 0.0010000000474974513,
"grad_norm": 15.901638390226227,
"learning_rate": 7.723433775328384e-08,
"logits/chosen": -2.2554521560668945,
"logits/rejected": -2.3232483863830566,
"logps/chosen": -223.6305694580078,
"logps/pi_response": -362.30291748046875,
"logps/ref_response": -276.22747802734375,
"logps/rejected": -226.0797119140625,
"loss": 0.6871,
"rewards/accuracies": 0.5218750238418579,
"rewards/chosen": -0.6587863564491272,
"rewards/margins": 0.01464476902037859,
"rewards/rejected": -0.6734310984611511,
"step": 120
},
{
"epoch": 0.83,
"eta": 0.0010000000474974513,
"grad_norm": 15.415238088814565,
"learning_rate": 4.1356686569674335e-08,
"logits/chosen": -2.2723705768585205,
"logits/rejected": -2.223027229309082,
"logps/chosen": -215.4403839111328,
"logps/pi_response": -351.7759094238281,
"logps/ref_response": -266.7939453125,
"logps/rejected": -221.5651397705078,
"loss": 0.6829,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.6349462270736694,
"rewards/margins": 0.02343112602829933,
"rewards/rejected": -0.6583773493766785,
"step": 130
},
{
"epoch": 0.9,
"eta": 0.0010000000474974513,
"grad_norm": 17.415730450012774,
"learning_rate": 1.5941282340065697e-08,
"logits/chosen": -2.319291591644287,
"logits/rejected": -2.372563600540161,
"logps/chosen": -211.4011688232422,
"logps/pi_response": -335.5188903808594,
"logps/ref_response": -254.0056915283203,
"logps/rejected": -212.92123413085938,
"loss": 0.6817,
"rewards/accuracies": 0.5218750238418579,
"rewards/chosen": -0.5842172503471375,
"rewards/margins": 0.024492263793945312,
"rewards/rejected": -0.6087095141410828,
"step": 140
},
{
"epoch": 0.96,
"eta": 0.0010000000474974513,
"grad_norm": 16.173628365681967,
"learning_rate": 2.2625595580163247e-09,
"logits/chosen": -2.2677788734436035,
"logits/rejected": -2.2873096466064453,
"logps/chosen": -209.3054656982422,
"logps/pi_response": -347.63079833984375,
"logps/ref_response": -265.3609313964844,
"logps/rejected": -218.402587890625,
"loss": 0.6852,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.5842490196228027,
"rewards/margins": 0.023630866780877113,
"rewards/rejected": -0.6078798770904541,
"step": 150
},
{
"epoch": 1.0,
"step": 156,
"total_flos": 0.0,
"train_loss": 0.688222443828216,
"train_runtime": 31827.1935,
"train_samples_per_second": 0.628,
"train_steps_per_second": 0.005
}
],
"logging_steps": 10,
"max_steps": 156,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}