two_agent_rdpo_iter_2 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9945,
"eval_steps": 500,
"global_step": 153,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"eta": 0.0010000000474974513,
"grad_norm": 18.06976070111927,
"learning_rate": 3.125e-08,
"logits/chosen": -2.1194543838500977,
"logits/rejected": -2.2610020637512207,
"logps/chosen": -254.6973419189453,
"logps/pi_response": -318.5512390136719,
"logps/ref_response": -318.5512390136719,
"logps/rejected": -224.19918823242188,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.07,
"eta": 0.0010000000474974513,
"grad_norm": 16.01968550655723,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -2.385725975036621,
"logits/rejected": -2.3076765537261963,
"logps/chosen": -218.1858673095703,
"logps/pi_response": -268.26239013671875,
"logps/ref_response": -266.3502197265625,
"logps/rejected": -224.54605102539062,
"loss": 0.6927,
"rewards/accuracies": 0.44871795177459717,
"rewards/chosen": -0.006226478144526482,
"rewards/margins": 0.0014922022819519043,
"rewards/rejected": -0.007718680426478386,
"step": 10
},
{
"epoch": 0.13,
"eta": 0.0010000000474974513,
"grad_norm": 16.02312687332099,
"learning_rate": 4.989490450759331e-07,
"logits/chosen": -2.397501230239868,
"logits/rejected": -2.3145182132720947,
"logps/chosen": -228.54135131835938,
"logps/pi_response": -300.3511047363281,
"logps/ref_response": -260.5257873535156,
"logps/rejected": -246.08518981933594,
"loss": 0.6874,
"rewards/accuracies": 0.5230769515037537,
"rewards/chosen": -0.22692929208278656,
"rewards/margins": 0.027445880696177483,
"rewards/rejected": -0.254375159740448,
"step": 20
},
{
"epoch": 0.2,
"eta": 0.0010000000474974513,
"grad_norm": 18.496071851642174,
"learning_rate": 4.872270441827174e-07,
"logits/chosen": -2.2651801109313965,
"logits/rejected": -2.206923007965088,
"logps/chosen": -264.01336669921875,
"logps/pi_response": -342.78717041015625,
"logps/ref_response": -260.0426940917969,
"logps/rejected": -266.82000732421875,
"loss": 0.696,
"rewards/accuracies": 0.4730769097805023,
"rewards/chosen": -0.6173264980316162,
"rewards/margins": 0.023560278117656708,
"rewards/rejected": -0.6408867835998535,
"step": 30
},
{
"epoch": 0.26,
"eta": 0.0010000000474974513,
"grad_norm": 17.411127458960003,
"learning_rate": 4.6308512113530063e-07,
"logits/chosen": -2.4821219444274902,
"logits/rejected": -2.384908437728882,
"logps/chosen": -243.66514587402344,
"logps/pi_response": -308.2298583984375,
"logps/ref_response": -255.49522399902344,
"logps/rejected": -256.4287109375,
"loss": 0.6903,
"rewards/accuracies": 0.5538461804389954,
"rewards/chosen": -0.3519066572189331,
"rewards/margins": 0.04566844180226326,
"rewards/rejected": -0.3975750505924225,
"step": 40
},
{
"epoch": 0.33,
"eta": 0.0010000000474974513,
"grad_norm": 14.517352316678341,
"learning_rate": 4.277872161641681e-07,
"logits/chosen": -2.545145273208618,
"logits/rejected": -2.5242159366607666,
"logps/chosen": -247.29806518554688,
"logps/pi_response": -293.09326171875,
"logps/ref_response": -275.4074401855469,
"logps/rejected": -244.6110076904297,
"loss": 0.6891,
"rewards/accuracies": 0.557692289352417,
"rewards/chosen": -0.1450691968202591,
"rewards/margins": 0.017832614481449127,
"rewards/rejected": -0.16290180385112762,
"step": 50
},
{
"epoch": 0.39,
"eta": 0.0010000000474974513,
"grad_norm": 17.626546770334993,
"learning_rate": 3.8318133624280046e-07,
"logits/chosen": -2.511488676071167,
"logits/rejected": -2.480903148651123,
"logps/chosen": -257.10791015625,
"logps/pi_response": -302.22210693359375,
"logps/ref_response": -270.9198303222656,
"logps/rejected": -252.54173278808594,
"loss": 0.6912,
"rewards/accuracies": 0.48846152424812317,
"rewards/chosen": -0.23286302387714386,
"rewards/margins": 0.02372700721025467,
"rewards/rejected": -0.2565900385379791,
"step": 60
},
{
"epoch": 0.46,
"eta": 0.0010000000474974513,
"grad_norm": 15.787453453048895,
"learning_rate": 3.316028034595861e-07,
"logits/chosen": -2.3820903301239014,
"logits/rejected": -2.3555104732513428,
"logps/chosen": -250.5824432373047,
"logps/pi_response": -322.5419921875,
"logps/ref_response": -270.929931640625,
"logps/rejected": -260.3626403808594,
"loss": 0.695,
"rewards/accuracies": 0.557692289352417,
"rewards/chosen": -0.37925985455513,
"rewards/margins": 0.030029216781258583,
"rewards/rejected": -0.40928906202316284,
"step": 70
},
{
"epoch": 0.52,
"eta": 0.0010000000474974513,
"grad_norm": 14.876878099596093,
"learning_rate": 2.7575199021178855e-07,
"logits/chosen": -2.463655948638916,
"logits/rejected": -2.3770523071289062,
"logps/chosen": -270.34197998046875,
"logps/pi_response": -317.0186767578125,
"logps/ref_response": -278.0060119628906,
"logps/rejected": -269.89398193359375,
"loss": 0.682,
"rewards/accuracies": 0.5538461804389954,
"rewards/chosen": -0.3352661728858948,
"rewards/margins": 0.037235379219055176,
"rewards/rejected": -0.37250155210494995,
"step": 80
},
{
"epoch": 0.58,
"eta": 0.0010000000474974513,
"grad_norm": 18.475251229694116,
"learning_rate": 2.1855294234408068e-07,
"logits/chosen": -2.37835693359375,
"logits/rejected": -2.3291523456573486,
"logps/chosen": -244.36837768554688,
"logps/pi_response": -331.8450012207031,
"logps/ref_response": -267.61846923828125,
"logps/rejected": -280.1258544921875,
"loss": 0.6786,
"rewards/accuracies": 0.5692307949066162,
"rewards/chosen": -0.39472243189811707,
"rewards/margins": 0.06756081432104111,
"rewards/rejected": -0.46228325366973877,
"step": 90
},
{
"epoch": 0.65,
"eta": 0.0010000000474974513,
"grad_norm": 21.467183952290178,
"learning_rate": 1.6300029195778453e-07,
"logits/chosen": -2.353746175765991,
"logits/rejected": -2.2994236946105957,
"logps/chosen": -259.2685546875,
"logps/pi_response": -356.6775817871094,
"logps/ref_response": -270.3107604980469,
"logps/rejected": -271.2638854980469,
"loss": 0.6687,
"rewards/accuracies": 0.5769230723381042,
"rewards/chosen": -0.4701143503189087,
"rewards/margins": 0.08206918090581894,
"rewards/rejected": -0.5521835088729858,
"step": 100
},
{
"epoch": 0.71,
"eta": 0.0010000000474974513,
"grad_norm": 22.52617676998604,
"learning_rate": 1.1200247470632392e-07,
"logits/chosen": -2.292710542678833,
"logits/rejected": -2.357100486755371,
"logps/chosen": -274.7545471191406,
"logps/pi_response": -387.603515625,
"logps/ref_response": -285.7787780761719,
"logps/rejected": -272.741943359375,
"loss": 0.6847,
"rewards/accuracies": 0.4923076927661896,
"rewards/chosen": -0.5492157936096191,
"rewards/margins": 0.018456529825925827,
"rewards/rejected": -0.5676723718643188,
"step": 110
},
{
"epoch": 0.78,
"eta": 0.0010000000474974513,
"grad_norm": 23.08131985031319,
"learning_rate": 6.822945986946385e-08,
"logits/chosen": -2.2610812187194824,
"logits/rejected": -2.1860787868499756,
"logps/chosen": -282.33782958984375,
"logps/pi_response": -372.6535949707031,
"logps/ref_response": -265.4132080078125,
"logps/rejected": -290.5935363769531,
"loss": 0.6808,
"rewards/accuracies": 0.5461538434028625,
"rewards/chosen": -0.5871608257293701,
"rewards/margins": 0.05175128951668739,
"rewards/rejected": -0.6389120817184448,
"step": 120
},
{
"epoch": 0.84,
"eta": 0.0010000000474974513,
"grad_norm": 22.961865819374246,
"learning_rate": 3.397296523427806e-08,
"logits/chosen": -2.2791786193847656,
"logits/rejected": -2.31884503364563,
"logps/chosen": -277.06085205078125,
"logps/pi_response": -365.2495422363281,
"logps/ref_response": -260.3804626464844,
"logps/rejected": -295.7681884765625,
"loss": 0.672,
"rewards/accuracies": 0.5653846263885498,
"rewards/chosen": -0.6707223057746887,
"rewards/margins": 0.08594530820846558,
"rewards/rejected": -0.7566676139831543,
"step": 130
},
{
"epoch": 0.91,
"eta": 0.0010000000474974513,
"grad_norm": 22.470744853394315,
"learning_rate": 1.1026475173977978e-08,
"logits/chosen": -2.3486456871032715,
"logits/rejected": -2.201983690261841,
"logps/chosen": -294.6012268066406,
"logps/pi_response": -374.5638732910156,
"logps/ref_response": -269.6314697265625,
"logps/rejected": -284.3193359375,
"loss": 0.6824,
"rewards/accuracies": 0.5653846263885498,
"rewards/chosen": -0.683001697063446,
"rewards/margins": 0.06894499808549881,
"rewards/rejected": -0.751946747303009,
"step": 140
},
{
"epoch": 0.97,
"eta": 0.0010000000474974513,
"grad_norm": 26.50699233707175,
"learning_rate": 5.913435276374834e-10,
"logits/chosen": -2.3918538093566895,
"logits/rejected": -2.34391450881958,
"logps/chosen": -279.7143859863281,
"logps/pi_response": -387.8644104003906,
"logps/ref_response": -273.84423828125,
"logps/rejected": -304.1023254394531,
"loss": 0.6713,
"rewards/accuracies": 0.6153846383094788,
"rewards/chosen": -0.6463515758514404,
"rewards/margins": 0.1319323182106018,
"rewards/rejected": -0.7782838940620422,
"step": 150
},
{
"epoch": 0.99,
"step": 153,
"total_flos": 0.0,
"train_loss": 0.6840069897813734,
"train_runtime": 41065.6381,
"train_samples_per_second": 0.487,
"train_steps_per_second": 0.004
}
],
"logging_steps": 10,
"max_steps": 153,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}
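
A minimal sketch (not part of the original file) of how the log_history above could be read with the standard library, assuming the JSON is saved locally as trainer_state.json; the key names match the entries shown in this file.

import json

with open("trainer_state.json") as f:
    state = json.load(f)

# All but the last entry are per-logging-step records (every 10 steps here);
# the final entry is the run summary (train_loss, train_runtime, ...).
logs = [e for e in state["log_history"] if "loss" in e]
summary = state["log_history"][-1]

for e in logs:
    print(f'step {e["step"]:>3}  loss {e["loss"]:.4f}  '
          f'margin {e["rewards/margins"]:.4f}  '
          f'acc {e["rewards/accuracies"]:.3f}')

print(f'final train_loss: {summary["train_loss"]:.4f} over {summary["step"]} steps')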