zephyr-7b-dpo-full / trainer_state.json
fenguhao's picture
Model save
54e9029 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9990186457311089,
"eval_steps": 100,
"global_step": 509,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 9.803921568627451e-09,
"logits/chosen": -2.7483465671539307,
"logits/rejected": -2.739339828491211,
"logps/chosen": -287.5325927734375,
"logps/rejected": -235.635986328125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.02,
"learning_rate": 9.80392156862745e-08,
"logits/chosen": -2.709578037261963,
"logits/rejected": -2.7113540172576904,
"logps/chosen": -260.56292724609375,
"logps/rejected": -256.438232421875,
"loss": 0.6932,
"rewards/accuracies": 0.4194444417953491,
"rewards/chosen": 0.00014394157915376127,
"rewards/margins": 1.0432106591906631e-06,
"rewards/rejected": 0.00014289839600678533,
"step": 10
},
{
"epoch": 0.04,
"learning_rate": 1.96078431372549e-07,
"logits/chosen": -2.728665828704834,
"logits/rejected": -2.7061820030212402,
"logps/chosen": -280.0662536621094,
"logps/rejected": -254.76626586914062,
"loss": 0.6926,
"rewards/accuracies": 0.5724999904632568,
"rewards/chosen": -4.974007424607407e-06,
"rewards/margins": 0.0005589541979134083,
"rewards/rejected": -0.0005639282753691077,
"step": 20
},
{
"epoch": 0.06,
"learning_rate": 2.941176470588235e-07,
"logits/chosen": -2.7290821075439453,
"logits/rejected": -2.742999315261841,
"logps/chosen": -279.2391357421875,
"logps/rejected": -253.37265014648438,
"loss": 0.6895,
"rewards/accuracies": 0.6349999904632568,
"rewards/chosen": 0.0049138437025249004,
"rewards/margins": 0.007674422115087509,
"rewards/rejected": -0.002760578179731965,
"step": 30
},
{
"epoch": 0.08,
"learning_rate": 3.92156862745098e-07,
"logits/chosen": -2.7134017944335938,
"logits/rejected": -2.698641777038574,
"logps/chosen": -274.20147705078125,
"logps/rejected": -255.8253936767578,
"loss": 0.6782,
"rewards/accuracies": 0.6924999952316284,
"rewards/chosen": 0.0260241087526083,
"rewards/margins": 0.026919733732938766,
"rewards/rejected": -0.0008956241654232144,
"step": 40
},
{
"epoch": 0.1,
"learning_rate": 4.901960784313725e-07,
"logits/chosen": -2.6435346603393555,
"logits/rejected": -2.6110424995422363,
"logps/chosen": -302.06768798828125,
"logps/rejected": -261.10919189453125,
"loss": 0.6612,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.023571131750941277,
"rewards/margins": 0.07649616152048111,
"rewards/rejected": -0.05292503535747528,
"step": 50
},
{
"epoch": 0.12,
"learning_rate": 4.995237599803335e-07,
"logits/chosen": -2.6205055713653564,
"logits/rejected": -2.5843255519866943,
"logps/chosen": -300.914306640625,
"logps/rejected": -286.0216064453125,
"loss": 0.6451,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.05583832785487175,
"rewards/margins": 0.11994686722755432,
"rewards/rejected": -0.17578519880771637,
"step": 60
},
{
"epoch": 0.14,
"learning_rate": 4.978798275112142e-07,
"logits/chosen": -2.607668161392212,
"logits/rejected": -2.568187952041626,
"logps/chosen": -308.4685974121094,
"logps/rejected": -305.6259460449219,
"loss": 0.6212,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.1777888685464859,
"rewards/margins": 0.19118839502334595,
"rewards/rejected": -0.3689771890640259,
"step": 70
},
{
"epoch": 0.16,
"learning_rate": 4.950700530747689e-07,
"logits/chosen": -2.6067116260528564,
"logits/rejected": -2.5767879486083984,
"logps/chosen": -300.19488525390625,
"logps/rejected": -295.8065185546875,
"loss": 0.6196,
"rewards/accuracies": 0.6850000023841858,
"rewards/chosen": -0.13195012509822845,
"rewards/margins": 0.25833892822265625,
"rewards/rejected": -0.3902890384197235,
"step": 80
},
{
"epoch": 0.18,
"learning_rate": 4.911076517558622e-07,
"logits/chosen": -2.5809831619262695,
"logits/rejected": -2.555103302001953,
"logps/chosen": -325.28692626953125,
"logps/rejected": -330.8323974609375,
"loss": 0.5844,
"rewards/accuracies": 0.7300000190734863,
"rewards/chosen": -0.21861158311367035,
"rewards/margins": 0.3220059275627136,
"rewards/rejected": -0.5406175851821899,
"step": 90
},
{
"epoch": 0.2,
"learning_rate": 4.860112597371772e-07,
"logits/chosen": -2.5413742065429688,
"logits/rejected": -2.5363407135009766,
"logps/chosen": -295.8542175292969,
"logps/rejected": -310.6338195800781,
"loss": 0.5764,
"rewards/accuracies": 0.6675000190734863,
"rewards/chosen": -0.26630619168281555,
"rewards/margins": 0.3358945846557617,
"rewards/rejected": -0.6022006869316101,
"step": 100
},
{
"epoch": 0.2,
"eval_logits/chosen": -2.4791219234466553,
"eval_logits/rejected": -2.4360005855560303,
"eval_logps/chosen": -313.6502990722656,
"eval_logps/rejected": -340.86053466796875,
"eval_loss": 0.5828901529312134,
"eval_rewards/accuracies": 0.6931137442588806,
"eval_rewards/chosen": -0.3592246174812317,
"eval_rewards/margins": 0.40203189849853516,
"eval_rewards/rejected": -0.7612565159797668,
"eval_runtime": 494.2516,
"eval_samples_per_second": 4.047,
"eval_steps_per_second": 0.338,
"step": 100
},
{
"epoch": 0.22,
"learning_rate": 4.798048466485017e-07,
"logits/chosen": -2.0916123390197754,
"logits/rejected": -2.1291110515594482,
"logps/chosen": -337.0193786621094,
"logps/rejected": -372.4815368652344,
"loss": 0.5665,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.6119796633720398,
"rewards/margins": 0.5584384799003601,
"rewards/rejected": -1.1704181432724,
"step": 110
},
{
"epoch": 0.24,
"learning_rate": 4.725176028314541e-07,
"logits/chosen": -1.8370585441589355,
"logits/rejected": -1.7712280750274658,
"logps/chosen": -370.1864318847656,
"logps/rejected": -398.8289794921875,
"loss": 0.56,
"rewards/accuracies": 0.7350000143051147,
"rewards/chosen": -0.8116917610168457,
"rewards/margins": 0.6380540728569031,
"rewards/rejected": -1.449745774269104,
"step": 120
},
{
"epoch": 0.26,
"learning_rate": 4.641838020498713e-07,
"logits/chosen": -1.7485500574111938,
"logits/rejected": -1.5671393871307373,
"logps/chosen": -380.29913330078125,
"logps/rejected": -424.1035461425781,
"loss": 0.5461,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": -0.8717474937438965,
"rewards/margins": 0.6444628834724426,
"rewards/rejected": -1.5162103176116943,
"step": 130
},
{
"epoch": 0.27,
"learning_rate": 4.5484264029156733e-07,
"logits/chosen": -1.9667887687683105,
"logits/rejected": -1.6983026266098022,
"logps/chosen": -322.9972839355469,
"logps/rejected": -379.5963134765625,
"loss": 0.5416,
"rewards/accuracies": 0.7149999737739563,
"rewards/chosen": -0.6348860263824463,
"rewards/margins": 0.6040786504745483,
"rewards/rejected": -1.2389646768569946,
"step": 140
},
{
"epoch": 0.29,
"learning_rate": 4.445380514196192e-07,
"logits/chosen": -1.2058897018432617,
"logits/rejected": -0.9969528317451477,
"logps/chosen": -379.3441467285156,
"logps/rejected": -449.9009704589844,
"loss": 0.5485,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.9173200726509094,
"rewards/margins": 0.7758927941322327,
"rewards/rejected": -1.6932127475738525,
"step": 150
},
{
"epoch": 0.31,
"learning_rate": 4.33318500540218e-07,
"logits/chosen": -1.7521625757217407,
"logits/rejected": -1.4877443313598633,
"logps/chosen": -356.1580810546875,
"logps/rejected": -389.0058288574219,
"loss": 0.5183,
"rewards/accuracies": 0.7850000262260437,
"rewards/chosen": -0.6841800212860107,
"rewards/margins": 0.7851129174232483,
"rewards/rejected": -1.4692928791046143,
"step": 160
},
{
"epoch": 0.33,
"learning_rate": 4.2123675605892985e-07,
"logits/chosen": -1.6861900091171265,
"logits/rejected": -1.4684306383132935,
"logps/chosen": -379.7774658203125,
"logps/rejected": -437.3900451660156,
"loss": 0.5146,
"rewards/accuracies": 0.7300000190734863,
"rewards/chosen": -0.8159699440002441,
"rewards/margins": 0.7220683097839355,
"rewards/rejected": -1.5380383729934692,
"step": 170
},
{
"epoch": 0.35,
"learning_rate": 4.0834964149744333e-07,
"logits/chosen": -1.3343206644058228,
"logits/rejected": -1.0179518461227417,
"logps/chosen": -358.3331298828125,
"logps/rejected": -399.9204406738281,
"loss": 0.5536,
"rewards/accuracies": 0.7074999809265137,
"rewards/chosen": -0.8257815837860107,
"rewards/margins": 0.7000215649604797,
"rewards/rejected": -1.5258032083511353,
"step": 180
},
{
"epoch": 0.37,
"learning_rate": 3.947177682380738e-07,
"logits/chosen": -1.2010215520858765,
"logits/rejected": -0.8926857709884644,
"logps/chosen": -375.1010437011719,
"logps/rejected": -433.2417297363281,
"loss": 0.5309,
"rewards/accuracies": 0.7425000071525574,
"rewards/chosen": -0.7876387238502502,
"rewards/margins": 0.7681831121444702,
"rewards/rejected": -1.5558221340179443,
"step": 190
},
{
"epoch": 0.39,
"learning_rate": 3.804052504529933e-07,
"logits/chosen": -1.1186742782592773,
"logits/rejected": -0.7032889723777771,
"logps/chosen": -351.2778625488281,
"logps/rejected": -416.71820068359375,
"loss": 0.5169,
"rewards/accuracies": 0.7475000023841858,
"rewards/chosen": -0.7259469032287598,
"rewards/margins": 0.874809741973877,
"rewards/rejected": -1.6007568836212158,
"step": 200
},
{
"epoch": 0.39,
"eval_logits/chosen": -1.201006293296814,
"eval_logits/rejected": -0.8443379402160645,
"eval_logps/chosen": -366.2012023925781,
"eval_logps/rejected": -426.77203369140625,
"eval_loss": 0.531209409236908,
"eval_rewards/accuracies": 0.7065868377685547,
"eval_rewards/chosen": -0.8847335577011108,
"eval_rewards/margins": 0.7356376647949219,
"eval_rewards/rejected": -1.6203712224960327,
"eval_runtime": 494.1792,
"eval_samples_per_second": 4.047,
"eval_steps_per_second": 0.338,
"step": 200
},
{
"epoch": 0.41,
"learning_rate": 3.654794035589483e-07,
"logits/chosen": -0.9955520629882812,
"logits/rejected": -0.5436328649520874,
"logps/chosen": -402.7477722167969,
"logps/rejected": -444.9473876953125,
"loss": 0.5126,
"rewards/accuracies": 0.7225000262260437,
"rewards/chosen": -1.0243951082229614,
"rewards/margins": 0.7689486742019653,
"rewards/rejected": -1.7933436632156372,
"step": 210
},
{
"epoch": 0.43,
"learning_rate": 3.5001042761570826e-07,
"logits/chosen": -0.7878814935684204,
"logits/rejected": -0.33438754081726074,
"logps/chosen": -379.41448974609375,
"logps/rejected": -452.28009033203125,
"loss": 0.5159,
"rewards/accuracies": 0.7475000023841858,
"rewards/chosen": -1.0701900720596313,
"rewards/margins": 0.8491780161857605,
"rewards/rejected": -1.919368028640747,
"step": 220
},
{
"epoch": 0.45,
"learning_rate": 3.34071077157304e-07,
"logits/chosen": -0.6851831078529358,
"logits/rejected": -0.29147180914878845,
"logps/chosen": -360.47869873046875,
"logps/rejected": -406.3958740234375,
"loss": 0.5399,
"rewards/accuracies": 0.7149999737739563,
"rewards/chosen": -0.9100778698921204,
"rewards/margins": 0.7056692242622375,
"rewards/rejected": -1.6157469749450684,
"step": 230
},
{
"epoch": 0.47,
"learning_rate": 3.1773631900892204e-07,
"logits/chosen": -0.6293848752975464,
"logits/rejected": -0.2972988784313202,
"logps/chosen": -364.2557067871094,
"logps/rejected": -426.8414306640625,
"loss": 0.5184,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.945137083530426,
"rewards/margins": 0.7834777235984802,
"rewards/rejected": -1.7286149263381958,
"step": 240
},
{
"epoch": 0.49,
"learning_rate": 3.0108297969883103e-07,
"logits/chosen": -0.6830095052719116,
"logits/rejected": -0.20727473497390747,
"logps/chosen": -377.15960693359375,
"logps/rejected": -440.8514709472656,
"loss": 0.5199,
"rewards/accuracies": 0.7475000023841858,
"rewards/chosen": -0.9253360033035278,
"rewards/margins": 0.7137148380279541,
"rewards/rejected": -1.6390507221221924,
"step": 250
},
{
"epoch": 0.51,
"learning_rate": 2.8418938412365013e-07,
"logits/chosen": -0.595008909702301,
"logits/rejected": -0.22117982804775238,
"logps/chosen": -378.3102722167969,
"logps/rejected": -421.2056884765625,
"loss": 0.5259,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0280470848083496,
"rewards/margins": 0.6548060774803162,
"rewards/rejected": -1.682853102684021,
"step": 260
},
{
"epoch": 0.53,
"learning_rate": 2.671349871664101e-07,
"logits/chosen": -0.4738517105579376,
"logits/rejected": -0.06301561743021011,
"logps/chosen": -391.0889892578125,
"logps/rejected": -433.60174560546875,
"loss": 0.4996,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.925932765007019,
"rewards/margins": 0.8979344367980957,
"rewards/rejected": -1.8238672018051147,
"step": 270
},
{
"epoch": 0.55,
"learning_rate": 2.5e-07,
"logits/chosen": -0.29330724477767944,
"logits/rejected": 0.11182761192321777,
"logps/chosen": -400.1533203125,
"logps/rejected": -453.4571228027344,
"loss": 0.5108,
"rewards/accuracies": 0.7174999713897705,
"rewards/chosen": -1.1598564386367798,
"rewards/margins": 0.7635893821716309,
"rewards/rejected": -1.9234455823898315,
"step": 280
},
{
"epoch": 0.57,
"learning_rate": 2.3286501283358982e-07,
"logits/chosen": -0.049084682017564774,
"logits/rejected": 0.32071781158447266,
"logps/chosen": -421.474853515625,
"logps/rejected": -480.5507507324219,
"loss": 0.5107,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2823936939239502,
"rewards/margins": 0.920534610748291,
"rewards/rejected": -2.202928304672241,
"step": 290
},
{
"epoch": 0.59,
"learning_rate": 2.1581061587634987e-07,
"logits/chosen": -0.3210409879684448,
"logits/rejected": 0.13426151871681213,
"logps/chosen": -392.66351318359375,
"logps/rejected": -457.4385681152344,
"loss": 0.5133,
"rewards/accuracies": 0.7825000286102295,
"rewards/chosen": -1.2225959300994873,
"rewards/margins": 0.9219253659248352,
"rewards/rejected": -2.1445212364196777,
"step": 300
},
{
"epoch": 0.59,
"eval_logits/chosen": -0.38526856899261475,
"eval_logits/rejected": 0.0459565594792366,
"eval_logps/chosen": -396.590576171875,
"eval_logps/rejected": -460.7764892578125,
"eval_loss": 0.5159304141998291,
"eval_rewards/accuracies": 0.7245509028434753,
"eval_rewards/chosen": -1.1886271238327026,
"eval_rewards/margins": 0.7717891931533813,
"eval_rewards/rejected": -1.9604166746139526,
"eval_runtime": 494.4328,
"eval_samples_per_second": 4.045,
"eval_steps_per_second": 0.338,
"step": 300
},
{
"epoch": 0.61,
"learning_rate": 1.9891702030116897e-07,
"logits/chosen": -0.6406633257865906,
"logits/rejected": 0.15507885813713074,
"logps/chosen": -384.56219482421875,
"logps/rejected": -443.3284912109375,
"loss": 0.5192,
"rewards/accuracies": 0.7599999904632568,
"rewards/chosen": -1.066334843635559,
"rewards/margins": 0.8297566175460815,
"rewards/rejected": -1.8960914611816406,
"step": 310
},
{
"epoch": 0.63,
"learning_rate": 1.8226368099107792e-07,
"logits/chosen": -0.6926136016845703,
"logits/rejected": -0.09604160487651825,
"logps/chosen": -414.7826232910156,
"logps/rejected": -454.5480041503906,
"loss": 0.5065,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0457278490066528,
"rewards/margins": 0.7350744605064392,
"rewards/rejected": -1.7808022499084473,
"step": 320
},
{
"epoch": 0.65,
"learning_rate": 1.6592892284269594e-07,
"logits/chosen": -0.5141594409942627,
"logits/rejected": 0.11050853878259659,
"logps/chosen": -402.63348388671875,
"logps/rejected": -431.8319091796875,
"loss": 0.5093,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.0640606880187988,
"rewards/margins": 0.7925867438316345,
"rewards/rejected": -1.8566473722457886,
"step": 330
},
{
"epoch": 0.67,
"learning_rate": 1.4998957238429172e-07,
"logits/chosen": -0.08297364413738251,
"logits/rejected": 0.21859808266162872,
"logps/chosen": -390.8412170410156,
"logps/rejected": -461.3310546875,
"loss": 0.505,
"rewards/accuracies": 0.7275000214576721,
"rewards/chosen": -1.190333604812622,
"rewards/margins": 0.8922053575515747,
"rewards/rejected": -2.0825393199920654,
"step": 340
},
{
"epoch": 0.69,
"learning_rate": 1.345205964410517e-07,
"logits/chosen": -0.539190948009491,
"logits/rejected": -0.053236301988363266,
"logps/chosen": -392.14385986328125,
"logps/rejected": -447.09844970703125,
"loss": 0.5125,
"rewards/accuracies": 0.7774999737739563,
"rewards/chosen": -0.9940242767333984,
"rewards/margins": 0.9291434288024902,
"rewards/rejected": -1.9231675863265991,
"step": 350
},
{
"epoch": 0.71,
"learning_rate": 1.1959474954700665e-07,
"logits/chosen": -0.6150873303413391,
"logits/rejected": -0.08470536023378372,
"logps/chosen": -377.5425109863281,
"logps/rejected": -434.1069030761719,
"loss": 0.5266,
"rewards/accuracies": 0.7599999904632568,
"rewards/chosen": -1.0171641111373901,
"rewards/margins": 0.7864332795143127,
"rewards/rejected": -1.803597092628479,
"step": 360
},
{
"epoch": 0.73,
"learning_rate": 1.0528223176192615e-07,
"logits/chosen": -0.464309424161911,
"logits/rejected": 0.11655576527118683,
"logps/chosen": -397.9951477050781,
"logps/rejected": -446.141845703125,
"loss": 0.4885,
"rewards/accuracies": 0.7350000143051147,
"rewards/chosen": -1.1220192909240723,
"rewards/margins": 0.7690063714981079,
"rewards/rejected": -1.8910256624221802,
"step": 370
},
{
"epoch": 0.75,
"learning_rate": 9.16503585025567e-08,
"logits/chosen": -0.3131292462348938,
"logits/rejected": 0.1059599220752716,
"logps/chosen": -398.6189880371094,
"logps/rejected": -455.5489807128906,
"loss": 0.4785,
"rewards/accuracies": 0.7774999737739563,
"rewards/chosen": -1.180424451828003,
"rewards/margins": 0.9602058529853821,
"rewards/rejected": -2.1406302452087402,
"step": 380
},
{
"epoch": 0.77,
"learning_rate": 7.876324394107017e-08,
"logits/chosen": -0.06371825933456421,
"logits/rejected": 0.4222162663936615,
"logps/chosen": -408.15203857421875,
"logps/rejected": -469.3525085449219,
"loss": 0.4945,
"rewards/accuracies": 0.7774999737739563,
"rewards/chosen": -1.2744272947311401,
"rewards/margins": 0.8693990111351013,
"rewards/rejected": -2.1438262462615967,
"step": 390
},
{
"epoch": 0.79,
"learning_rate": 6.668149945978201e-08,
"logits/chosen": -0.4337286353111267,
"logits/rejected": 0.11450805515050888,
"logps/chosen": -406.1577453613281,
"logps/rejected": -468.1871337890625,
"loss": 0.4968,
"rewards/accuracies": 0.7574999928474426,
"rewards/chosen": -1.204884648323059,
"rewards/margins": 0.9240193367004395,
"rewards/rejected": -2.128904104232788,
"step": 400
},
{
"epoch": 0.79,
"eval_logits/chosen": -0.2552393972873688,
"eval_logits/rejected": 0.20138485729694366,
"eval_logps/chosen": -402.1766357421875,
"eval_logps/rejected": -475.3639221191406,
"eval_loss": 0.5057728290557861,
"eval_rewards/accuracies": 0.7140718698501587,
"eval_rewards/chosen": -1.2444883584976196,
"eval_rewards/margins": 0.8618020415306091,
"eval_rewards/rejected": -2.106290578842163,
"eval_runtime": 493.9837,
"eval_samples_per_second": 4.049,
"eval_steps_per_second": 0.338,
"step": 400
},
{
"epoch": 0.8,
"learning_rate": 5.546194858038072e-08,
"logits/chosen": -0.3444100618362427,
"logits/rejected": 0.08428356051445007,
"logps/chosen": -419.0089111328125,
"logps/rejected": -482.5577392578125,
"loss": 0.488,
"rewards/accuracies": 0.7325000166893005,
"rewards/chosen": -1.1570134162902832,
"rewards/margins": 0.9088660478591919,
"rewards/rejected": -2.0658795833587646,
"step": 410
},
{
"epoch": 0.82,
"learning_rate": 4.5157359708432626e-08,
"logits/chosen": -0.3363034129142761,
"logits/rejected": 0.1421819031238556,
"logps/chosen": -417.26116943359375,
"logps/rejected": -475.9188537597656,
"loss": 0.5012,
"rewards/accuracies": 0.7549999952316284,
"rewards/chosen": -1.1876376867294312,
"rewards/margins": 0.9119570255279541,
"rewards/rejected": -2.0995945930480957,
"step": 420
},
{
"epoch": 0.84,
"learning_rate": 3.581619795012874e-08,
"logits/chosen": -0.4450594186782837,
"logits/rejected": 0.03785795345902443,
"logps/chosen": -404.95281982421875,
"logps/rejected": -467.25531005859375,
"loss": 0.4861,
"rewards/accuracies": 0.7724999785423279,
"rewards/chosen": -1.1584584712982178,
"rewards/margins": 0.9622448086738586,
"rewards/rejected": -2.1207032203674316,
"step": 430
},
{
"epoch": 0.86,
"learning_rate": 2.748239716854589e-08,
"logits/chosen": -0.31011733412742615,
"logits/rejected": 0.310569167137146,
"logps/chosen": -389.67132568359375,
"logps/rejected": -470.01104736328125,
"loss": 0.5105,
"rewards/accuracies": 0.7350000143051147,
"rewards/chosen": -1.1304560899734497,
"rewards/margins": 0.8861461877822876,
"rewards/rejected": -2.016602039337158,
"step": 440
},
{
"epoch": 0.88,
"learning_rate": 2.0195153351498323e-08,
"logits/chosen": -0.3003827631473541,
"logits/rejected": 0.046957388520240784,
"logps/chosen": -412.5171203613281,
"logps/rejected": -481.26898193359375,
"loss": 0.5128,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1812173128128052,
"rewards/margins": 0.8305546641349792,
"rewards/rejected": -2.0117719173431396,
"step": 450
},
{
"epoch": 0.9,
"learning_rate": 1.3988740262822846e-08,
"logits/chosen": -0.47582343220710754,
"logits/rejected": -0.11152289062738419,
"logps/chosen": -410.2917175292969,
"logps/rejected": -457.774658203125,
"loss": 0.5044,
"rewards/accuracies": 0.7649999856948853,
"rewards/chosen": -1.1460288763046265,
"rewards/margins": 0.8546761870384216,
"rewards/rejected": -2.0007050037384033,
"step": 460
},
{
"epoch": 0.92,
"learning_rate": 8.892348244137788e-09,
"logits/chosen": -0.5770422220230103,
"logits/rejected": -0.025662722066044807,
"logps/chosen": -372.98187255859375,
"logps/rejected": -467.86199951171875,
"loss": 0.4973,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": -1.0886142253875732,
"rewards/margins": 0.8808639049530029,
"rewards/rejected": -1.9694780111312866,
"step": 470
},
{
"epoch": 0.94,
"learning_rate": 4.929946925231076e-09,
"logits/chosen": -0.5876446962356567,
"logits/rejected": -0.16365936398506165,
"logps/chosen": -400.3377685546875,
"logps/rejected": -455.9208068847656,
"loss": 0.5072,
"rewards/accuracies": 0.7024999856948853,
"rewards/chosen": -1.1451067924499512,
"rewards/margins": 0.7030719518661499,
"rewards/rejected": -1.848178744316101,
"step": 480
},
{
"epoch": 0.96,
"learning_rate": 2.1201724887858484e-09,
"logits/chosen": -0.4430970847606659,
"logits/rejected": 0.12594802677631378,
"logps/chosen": -409.6846008300781,
"logps/rejected": -458.5526428222656,
"loss": 0.4887,
"rewards/accuracies": 0.7574999928474426,
"rewards/chosen": -1.0775573253631592,
"rewards/margins": 0.9305427074432373,
"rewards/rejected": -2.0081000328063965,
"step": 490
},
{
"epoch": 0.98,
"learning_rate": 4.762400196664518e-10,
"logits/chosen": -0.41937455534935,
"logits/rejected": -0.08660510927438736,
"logps/chosen": -385.8563232421875,
"logps/rejected": -454.9473571777344,
"loss": 0.4833,
"rewards/accuracies": 0.7699999809265137,
"rewards/chosen": -1.093034267425537,
"rewards/margins": 0.9196186661720276,
"rewards/rejected": -2.012652635574341,
"step": 500
},
{
"epoch": 0.98,
"eval_logits/chosen": -0.4496035575866699,
"eval_logits/rejected": 0.04359949380159378,
"eval_logps/chosen": -395.9374084472656,
"eval_logps/rejected": -470.5448303222656,
"eval_loss": 0.5045374631881714,
"eval_rewards/accuracies": 0.726047933101654,
"eval_rewards/chosen": -1.182096004486084,
"eval_rewards/margins": 0.876003086566925,
"eval_rewards/rejected": -2.0580990314483643,
"eval_runtime": 494.2334,
"eval_samples_per_second": 4.047,
"eval_steps_per_second": 0.338,
"step": 500
},
{
"epoch": 1.0,
"step": 509,
"total_flos": 0.0,
"train_loss": 0.5401819272219315,
"train_runtime": 34352.758,
"train_samples_per_second": 1.78,
"train_steps_per_second": 0.015
}
],
"logging_steps": 10,
"max_steps": 509,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}