two_agent_dpo_iter_2 / trainer_state.json
YYYYYYibo's picture
Model save
a7902e5 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9945,
"eval_steps": 500,
"global_step": 153,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 17.99154727967293,
"learning_rate": 3.125e-08,
"logits/chosen": -2.152977705001831,
"logits/rejected": -2.3121213912963867,
"logps/chosen": -254.60496520996094,
"logps/rejected": -224.12643432617188,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.07,
"grad_norm": 15.888543656019479,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -2.4332120418548584,
"logits/rejected": -2.349087953567505,
"logps/chosen": -218.75491333007812,
"logps/rejected": -224.596923828125,
"loss": 0.6928,
"rewards/accuracies": 0.43589743971824646,
"rewards/chosen": -0.0051579843275249004,
"rewards/margins": 0.0010460736230015755,
"rewards/rejected": -0.006204057950526476,
"step": 10
},
{
"epoch": 0.13,
"grad_norm": 18.00599863061672,
"learning_rate": 4.989490450759331e-07,
"logits/chosen": -2.44968318939209,
"logits/rejected": -2.3714213371276855,
"logps/chosen": -227.60235595703125,
"logps/rejected": -245.01002502441406,
"loss": 0.6881,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.2174525260925293,
"rewards/margins": 0.025719773024320602,
"rewards/rejected": -0.243172287940979,
"step": 20
},
{
"epoch": 0.2,
"grad_norm": 17.361907335413566,
"learning_rate": 4.872270441827174e-07,
"logits/chosen": -2.3445615768432617,
"logits/rejected": -2.285738706588745,
"logps/chosen": -264.4424743652344,
"logps/rejected": -267.0367431640625,
"loss": 0.6952,
"rewards/accuracies": 0.4923076927661896,
"rewards/chosen": -0.6213539838790894,
"rewards/margins": 0.022094279527664185,
"rewards/rejected": -0.6434482336044312,
"step": 30
},
{
"epoch": 0.26,
"grad_norm": 17.40493821515386,
"learning_rate": 4.6308512113530063e-07,
"logits/chosen": -2.5398178100585938,
"logits/rejected": -2.436666488647461,
"logps/chosen": -252.19931030273438,
"logps/rejected": -264.8144226074219,
"loss": 0.6921,
"rewards/accuracies": 0.5730769038200378,
"rewards/chosen": -0.43709850311279297,
"rewards/margins": 0.043844155967235565,
"rewards/rejected": -0.4809426963329315,
"step": 40
},
{
"epoch": 0.33,
"grad_norm": 14.207926284792292,
"learning_rate": 4.277872161641681e-07,
"logits/chosen": -2.579521894454956,
"logits/rejected": -2.5611038208007812,
"logps/chosen": -251.29273986816406,
"logps/rejected": -248.70640563964844,
"loss": 0.6889,
"rewards/accuracies": 0.5269230604171753,
"rewards/chosen": -0.18656322360038757,
"rewards/margins": 0.01753987930715084,
"rewards/rejected": -0.20410311222076416,
"step": 50
},
{
"epoch": 0.39,
"grad_norm": 16.71668002097283,
"learning_rate": 3.8318133624280046e-07,
"logits/chosen": -2.5447630882263184,
"logits/rejected": -2.5085957050323486,
"logps/chosen": -254.7252960205078,
"logps/rejected": -250.2134552001953,
"loss": 0.6877,
"rewards/accuracies": 0.4769230782985687,
"rewards/chosen": -0.20926551520824432,
"rewards/margins": 0.024503052234649658,
"rewards/rejected": -0.23376856744289398,
"step": 60
},
{
"epoch": 0.46,
"grad_norm": 15.128093177880084,
"learning_rate": 3.316028034595861e-07,
"logits/chosen": -2.4117591381073,
"logits/rejected": -2.3856163024902344,
"logps/chosen": -240.95860290527344,
"logps/rejected": -251.49269104003906,
"loss": 0.6945,
"rewards/accuracies": 0.5692307949066162,
"rewards/chosen": -0.2835744321346283,
"rewards/margins": 0.03665730357170105,
"rewards/rejected": -0.32023176550865173,
"step": 70
},
{
"epoch": 0.52,
"grad_norm": 15.984185802661708,
"learning_rate": 2.7575199021178855e-07,
"logits/chosen": -2.502197742462158,
"logits/rejected": -2.4183943271636963,
"logps/chosen": -259.74285888671875,
"logps/rejected": -259.47991943359375,
"loss": 0.6796,
"rewards/accuracies": 0.557692289352417,
"rewards/chosen": -0.22889916598796844,
"rewards/margins": 0.03880741447210312,
"rewards/rejected": -0.26770660281181335,
"step": 80
},
{
"epoch": 0.58,
"grad_norm": 16.944533463136676,
"learning_rate": 2.1855294234408068e-07,
"logits/chosen": -2.4329493045806885,
"logits/rejected": -2.377147674560547,
"logps/chosen": -238.77903747558594,
"logps/rejected": -275.1826171875,
"loss": 0.6783,
"rewards/accuracies": 0.5692307949066162,
"rewards/chosen": -0.33898892998695374,
"rewards/margins": 0.07301792502403259,
"rewards/rejected": -0.41200685501098633,
"step": 90
},
{
"epoch": 0.65,
"grad_norm": 20.171571457821667,
"learning_rate": 1.6300029195778453e-07,
"logits/chosen": -2.404921770095825,
"logits/rejected": -2.3588438034057617,
"logps/chosen": -249.01068115234375,
"logps/rejected": -261.3023376464844,
"loss": 0.671,
"rewards/accuracies": 0.5807692408561707,
"rewards/chosen": -0.362224817276001,
"rewards/margins": 0.0861731767654419,
"rewards/rejected": -0.4483979642391205,
"step": 100
},
{
"epoch": 0.71,
"grad_norm": 22.208608685833642,
"learning_rate": 1.1200247470632392e-07,
"logits/chosen": -2.384084939956665,
"logits/rejected": -2.4500632286071777,
"logps/chosen": -260.6557312011719,
"logps/rejected": -259.0586853027344,
"loss": 0.6855,
"rewards/accuracies": 0.4653846025466919,
"rewards/chosen": -0.40450161695480347,
"rewards/margins": 0.021371768787503242,
"rewards/rejected": -0.42587342858314514,
"step": 110
},
{
"epoch": 0.78,
"grad_norm": 21.198358455810045,
"learning_rate": 6.822945986946385e-08,
"logits/chosen": -2.3326570987701416,
"logits/rejected": -2.2644848823547363,
"logps/chosen": -269.0829772949219,
"logps/rejected": -277.46099853515625,
"loss": 0.6829,
"rewards/accuracies": 0.5692307949066162,
"rewards/chosen": -0.45251235365867615,
"rewards/margins": 0.05605296790599823,
"rewards/rejected": -0.508565366268158,
"step": 120
},
{
"epoch": 0.84,
"grad_norm": 24.487908321616903,
"learning_rate": 3.397296523427806e-08,
"logits/chosen": -2.3290350437164307,
"logits/rejected": -2.368476629257202,
"logps/chosen": -264.3924255371094,
"logps/rejected": -283.0903625488281,
"loss": 0.6724,
"rewards/accuracies": 0.5346153974533081,
"rewards/chosen": -0.5441482663154602,
"rewards/margins": 0.08576709777116776,
"rewards/rejected": -0.6299152970314026,
"step": 130
},
{
"epoch": 0.91,
"grad_norm": 24.363492054663,
"learning_rate": 1.1026475173977978e-08,
"logits/chosen": -2.415417194366455,
"logits/rejected": -2.2751691341400146,
"logps/chosen": -280.5184326171875,
"logps/rejected": -270.88372802734375,
"loss": 0.683,
"rewards/accuracies": 0.5807692408561707,
"rewards/chosen": -0.5419987440109253,
"rewards/margins": 0.07570147514343262,
"rewards/rejected": -0.6177002787590027,
"step": 140
},
{
"epoch": 0.97,
"grad_norm": 24.646600457502558,
"learning_rate": 5.913435276374834e-10,
"logits/chosen": -2.4706640243530273,
"logits/rejected": -2.419950485229492,
"logps/chosen": -264.6922302246094,
"logps/rejected": -289.4066467285156,
"loss": 0.6716,
"rewards/accuracies": 0.6038461327552795,
"rewards/chosen": -0.4906882047653198,
"rewards/margins": 0.13459746539592743,
"rewards/rejected": -0.6252856850624084,
"step": 150
},
{
"epoch": 0.99,
"step": 153,
"total_flos": 0.0,
"train_loss": 0.6840333373718013,
"train_runtime": 39783.2449,
"train_samples_per_second": 0.503,
"train_steps_per_second": 0.004
}
],
"logging_steps": 10,
"max_steps": 153,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}